Grobid augmenter body sections, paragraphs, sentences (#275)

* add return... * body sections paragraphs sentences and test * versionne * meant for this to be True
allenai · Aug 15, 2023 · dcf5b2c · dcf5b2c
1 parent 36a538e
commit dcf5b2c
Show file tree

Hide file tree

Showing 4 changed files with 103 additions and 2 deletions.
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = 'mmda'
-version = '0.9.10'
+version = '0.9.11'
 description = 'MMDA - multimodal document analysis'
 authors = [
     {name = 'Allen Institute for Artificial Intelligence', email = '[email protected]'},

diff --git a/src/mmda/parsers/grobid_augment_existing_document_parser.py b/src/mmda/parsers/grobid_augment_existing_document_parser.py
@@ -99,6 +99,34 @@ def _parse_xml_onto_doc(self, xml: str, doc: Document) -> Document:
             )
         )
 
+        # sections
+        # Grobid provides coordinates and number attributes for section headers, and coordinates for
+        # sentences within the body text, also tagged by paragraphs.
+        # We use these to annotate the document in order to provide a hierarchical structure:
+        # e.g. doc.sections[0].headings[0], doc.sections[0].paragraphs[0].sentences[0]
+        section_box_groups, heading_box_groups, paragraph_box_groups, sentence_box_groups = \
+            self._get_structured_body_text_box_groups(xml_root)
+        doc.annotate(
+            sections=box_groups_to_span_groups(
+                section_box_groups, doc, center=True
+            )
+        )
+        doc.annotate(
+            headings=box_groups_to_span_groups(
+                heading_box_groups, doc, center=True
+            )
+        )
+        doc.annotate(
+            paragraphs=box_groups_to_span_groups(
+                paragraph_box_groups, doc, center=True
+            )
+        )
+        doc.annotate(
+            sentences=box_groups_to_span_groups(
+                sentence_box_groups, doc, center=True
+            )
+        )
+
         return doc
 
     def _xml_coords_to_boxes(self, coords_attribute: str):
@@ -172,3 +200,52 @@ def _get_box_groups(
             else:
                 box_groups.append(BoxGroup(boxes=boxes))
         return box_groups
+
+    def _get_heading_box_group(
+            self,
+            section_div: et.Element
+    ) -> Optional[BoxGroup]:
+        box_group = None
+        heading_element = section_div.find(f".//tei:head", NS)
+        if heading_element is not None:  # elements evaluate as False if no children
+            coords_string = heading_element.attrib["coords"]
+            boxes = self._xml_coords_to_boxes(coords_string)
+            number = heading_element.attrib["n"] if "n" in heading_element.keys() else None
+            section_title = heading_element.text
+            box_group = BoxGroup(
+                boxes=boxes,
+                metadata=Metadata(number=number, title=section_title),
+            )
+        return box_group
+
+    def _get_structured_body_text_box_groups(
+            self,
+            root: et.Element
+    ) -> (List[BoxGroup], List[BoxGroup], List[BoxGroup], List[BoxGroup]):
+        section_list_root = root.find(f".//tei:body", NS)
+
+        body_sections: List[BoxGroup] = []
+        body_headings: List[BoxGroup] = []
+        body_paragraphs: List[BoxGroup] = []
+        body_sentences: List[BoxGroup] = []
+
+        section_divs = section_list_root.findall(f"./tei:div", NS)
+        for div in section_divs:
+            section_boxes: List[Box] = []
+            heading_box_group = self._get_heading_box_group(div)
+            if heading_box_group:
+                body_headings.append(heading_box_group)
+                section_boxes.extend(heading_box_group.boxes)
+            for p in div.findall(f"./tei:p", NS):
+                paragraph_boxes: List[Box] = []
+                paragraph_sentences: List[BoxGroup] = []
+                for s in p.findall(f"./tei:s", NS):
+                    sentence_boxes = self._xml_coords_to_boxes(s.attrib["coords"])
+                    paragraph_sentences.append(BoxGroup(boxes=sentence_boxes))
+                    paragraph_boxes.extend(sentence_boxes)
+                body_paragraphs.append(BoxGroup(boxes=paragraph_boxes))
+                section_boxes.extend(paragraph_boxes)
+                body_sentences.extend(paragraph_sentences)
+            body_sections.append(BoxGroup(boxes=section_boxes))
+
+        return body_sections, body_headings, body_paragraphs, body_sentences