From a39556d1da47ba27486163e9435316b719fb1a96 Mon Sep 17 00:00:00 2001 From: Angele Zamarron Date: Tue, 5 Dec 2023 16:44:05 -0800 Subject: [PATCH] grobid augmenter - fix Sections-Paragraphs bug (#282) * fix bug * version bump --- pyproject.toml | 2 +- .../grobid_augment_existing_document_parser.py | 12 +++++++----- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 847bd5d4..1b14e29d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = 'mmda' -version = '0.9.16' +version = '0.9.17' description = 'MMDA - multimodal document analysis' authors = [ {name = 'Allen Institute for Artificial Intelligence', email = 'contact@allenai.org'}, diff --git a/src/mmda/parsers/grobid_augment_existing_document_parser.py b/src/mmda/parsers/grobid_augment_existing_document_parser.py index 2f7406bf..81792bce 100644 --- a/src/mmda/parsers/grobid_augment_existing_document_parser.py +++ b/src/mmda/parsers/grobid_augment_existing_document_parser.py @@ -139,13 +139,15 @@ def _parse_xml_onto_doc(self, xml: str, doc: Document) -> Document: unallocated_tokens_dict=unallocated_section_tokens_dict, fix_overlaps=True, ) + # check that conversion to spangroups resulted in actual spans before adding them to the section if all([sg.spans for sg in this_paragraph_sentence_span_groups]): sentence_span_groups.extend(this_paragraph_sentence_span_groups) - paragraph_spans = [] - for sg in this_paragraph_sentence_span_groups: - paragraph_spans.extend(sg.spans) - # TODO add boxes to paragraph spangroups - this_section_paragraph_span_groups.append(SpanGroup(spans=paragraph_spans)) + paragraph_spans = [] + for sg in this_paragraph_sentence_span_groups: + paragraph_spans.extend(sg.spans) + # TODO add boxes to paragraph spangroups + this_section_paragraph_span_groups.append(SpanGroup(spans=paragraph_spans)) + paragraph_span_groups.extend(this_section_paragraph_span_groups) for sg in this_section_paragraph_span_groups: section_spans.extend(sg.spans)