Skip to content

Commit

Permalink
Grobid augmenter body sections, paragraphs, sentences (#275)
Browse files Browse the repository at this point in the history
* add return...

* body sections paragraphs sentences and test

* versionne

* meant for this to be True
  • Loading branch information
geli-gel authored Aug 15, 2023
1 parent 36a538e commit dcf5b2c
Show file tree
Hide file tree
Showing 4 changed files with 103 additions and 2 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = 'mmda'
version = '0.9.10'
version = '0.9.11'
description = 'MMDA - multimodal document analysis'
authors = [
{name = 'Allen Institute for Artificial Intelligence', email = '[email protected]'},
Expand Down
77 changes: 77 additions & 0 deletions src/mmda/parsers/grobid_augment_existing_document_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,34 @@ def _parse_xml_onto_doc(self, xml: str, doc: Document) -> Document:
)
)

# sections
# Grobid provides coordinates and number attributes for section headers, and coordinates for
# sentences within the body text, also tagged by paragraphs.
# We use these to annotate the document in order to provide a hierarchical structure:
# e.g. doc.sections.header, doc.sections[0].paragraphs[0].sentences[0]
section_box_groups, heading_box_groups, paragraph_box_groups, sentence_box_groups = \
self._get_structured_body_text_box_groups(xml_root)
doc.annotate(
sections=box_groups_to_span_groups(
section_box_groups, doc, center=True
)
)
doc.annotate(
headings=box_groups_to_span_groups(
heading_box_groups, doc, center=True
)
)
doc.annotate(
paragraphs=box_groups_to_span_groups(
paragraph_box_groups, doc, center=True
)
)
doc.annotate(
sentences=box_groups_to_span_groups(
sentence_box_groups, doc, center=True
)
)

return doc

def _xml_coords_to_boxes(self, coords_attribute: str):
Expand Down Expand Up @@ -172,3 +200,52 @@ def _get_box_groups(
else:
box_groups.append(BoxGroup(boxes=boxes))
return box_groups

def _get_heading_box_group(
    self,
    section_div: et.Element
) -> Optional[BoxGroup]:
    """Build a BoxGroup for the heading of a section ``<div>``.

    Looks up the first ``<head>`` element anywhere beneath ``section_div``
    and converts its ``coords`` attribute into boxes.

    Args:
        section_div: The section element to search for a ``<head>``.

    Returns:
        A BoxGroup whose metadata carries ``number`` (the heading's ``n``
        attribute, or None when absent) and ``title`` (the heading's text),
        or None when there is no ``<head>`` or it has no ``coords``
        attribute.
    """
    heading_element = section_div.find(".//tei:head", NS)
    # Compare against None explicitly: elements evaluate as False if they
    # have no children, so a plain truthiness test would drop real headings.
    if heading_element is None:
        return None

    coords_string = heading_element.get("coords")
    if coords_string is None:
        # A heading without coordinates cannot be placed on the page;
        # treat it like a missing heading instead of raising KeyError.
        return None

    boxes = self._xml_coords_to_boxes(coords_string)
    return BoxGroup(
        boxes=boxes,
        metadata=Metadata(
            number=heading_element.get("n"),
            title=heading_element.text,
        ),
    )

def _get_structured_body_text_box_groups(
    self,
    root: et.Element
) -> (List[BoxGroup], List[BoxGroup], List[BoxGroup], List[BoxGroup]):
    """Collect box groups for body sections, headings, paragraphs and sentences.

    Walks each ``<div>`` that is a direct child of the first ``<body>``
    found under ``root``. For every div, the heading boxes (if any) and the
    boxes of all sentences (``<s>``) in its direct-child paragraphs
    (``<p>``) are gathered:

    * one section BoxGroup per div (heading boxes + all paragraph boxes),
    * one heading BoxGroup per div that yields a heading,
    * one paragraph BoxGroup per ``<p>`` (all of its sentence boxes),
    * one sentence BoxGroup per ``<s>`` that has a ``coords`` attribute.

    Args:
        root: Root element of the parsed XML.

    Returns:
        A 4-tuple ``(sections, headings, paragraphs, sentences)`` of
        BoxGroup lists, in document order. All four lists are empty when
        the document has no ``<body>`` element.
    """
    body_sections: List[BoxGroup] = []
    body_headings: List[BoxGroup] = []
    body_paragraphs: List[BoxGroup] = []
    body_sentences: List[BoxGroup] = []

    section_list_root = root.find(".//tei:body", NS)
    if section_list_root is None:
        # No body text at all: nothing to annotate, but do not crash.
        return body_sections, body_headings, body_paragraphs, body_sentences

    for div in section_list_root.findall("./tei:div", NS):
        section_boxes: List[Box] = []

        heading_box_group = self._get_heading_box_group(div)
        if heading_box_group:
            body_headings.append(heading_box_group)
            section_boxes.extend(heading_box_group.boxes)

        for p in div.findall("./tei:p", NS):
            paragraph_boxes: List[Box] = []
            for s in p.findall("./tei:s", NS):
                coords_string = s.get("coords")
                if coords_string is None:
                    # Sentence has no coordinates, so it cannot be located
                    # on the page; skip it rather than abort the parse.
                    continue
                sentence_boxes = self._xml_coords_to_boxes(coords_string)
                body_sentences.append(BoxGroup(boxes=sentence_boxes))
                paragraph_boxes.extend(sentence_boxes)
            body_paragraphs.append(BoxGroup(boxes=paragraph_boxes))
            section_boxes.extend(paragraph_boxes)

        body_sections.append(BoxGroup(boxes=section_boxes))

    return body_sections, body_headings, body_paragraphs, body_sentences
Loading

0 comments on commit dcf5b2c

Please sign in to comment.