From 47a4d314ead952109db03f8b0427033b52ac89e9 Mon Sep 17 00:00:00 2001 From: Maksym Lysak Date: Tue, 22 Oct 2024 14:38:37 +0200 Subject: [PATCH] Fixes for MD Backend, to avoid duplicated text inserts into docling doc Signed-off-by: Maksym Lysak --- docling/backend/md_backend.py | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/docling/backend/md_backend.py b/docling/backend/md_backend.py index 0f51b05..2a70dde 100644 --- a/docling/backend/md_backend.py +++ b/docling/backend/md_backend.py @@ -37,6 +37,7 @@ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path] self.in_table = False self.md_table_buffer: list[str] = [] + try: if isinstance(self.path_or_stream, BytesIO): @@ -123,7 +124,7 @@ def iterate_elements(self, element, depth=0, doc=None, parent_element=None): doc_label = DocItemLabel.TITLE else: doc_label = DocItemLabel.SECTION_HEADER - snippet_text = element.children[0].children + snippet_text = element.children[0].children.strip() parent_element = doc.add_text( label=doc_label, parent=parent_element, text=snippet_text @@ -142,6 +143,7 @@ def iterate_elements(self, element, depth=0, doc=None, parent_element=None): elif isinstance(element, marko.block.ListItem): self.close_table(doc) _log.debug(" - List item") + snippet_text = str(element.children[0].children[0].children) is_numbered = False if parent_element.label == GroupLabel.ORDERED_LIST: @@ -150,23 +152,15 @@ def iterate_elements(self, element, depth=0, doc=None, parent_element=None): enumerated=is_numbered, parent=parent_element, text=snippet_text ) - elif isinstance(element, marko.block.Paragraph): - self.close_table(doc) - _log.debug(f" - Paragraph: {element.children[0].children}") - snippet_text = str(element.children[0].children) - doc.add_text( - label=DocItemLabel.PARAGRAPH, parent=parent_element, text=snippet_text - ) - elif isinstance(element, marko.inline.Image): self.close_table(doc) + _log.debug(f" - Image with alt: {element.title}, url: {element.dest}") doc.add_picture(parent=parent_element, caption=element.title) elif isinstance(element, marko.inline.RawText): _log.debug(f" - Paragraph (raw text): {element.children}") - snippet_text = str(element.children) - + snippet_text = str(element.children).strip() # Detect start of the table: if "|" in snippet_text: # most likely part of the markdown table @@ -190,7 +184,7 @@ def iterate_elements(self, element, depth=0, doc=None, parent_element=None): elif isinstance(element, marko.inline.CodeSpan): self.close_table(doc) _log.debug(f" - Paragraph (code): {element.children}") - snippet_text = str(element.children) + snippet_text = str(element.children).strip() doc.add_text( label=DocItemLabel.CODE, parent=parent_element, text=snippet_text ) @@ -203,7 +197,7 @@ def iterate_elements(self, element, depth=0, doc=None, parent_element=None): elif isinstance(element, marko.block.HTMLBlock): self.close_table(doc) _log.debug("HTML Block: {}".format(element)) - snippet_text = str(element.children) + snippet_text = str(element.children).strip() doc.add_text( label=DocItemLabel.CODE, parent=parent_element, text=snippet_text ) @@ -214,9 +208,12 @@ def iterate_elements(self, element, depth=0, doc=None, parent_element=None): _log.debug("Some other element: {}".format(element)) # Iterate through the element's children (if any) - if hasattr(element, "children"): - for child in element.children: - self.iterate_elements(child, depth + 1, doc, parent_element) + if not isinstance(element, marko.block.ListItem): + if not isinstance(element, marko.block.Heading): + # if not isinstance(element, marko.block.Paragraph): + if hasattr(element, "children"): + for child in element.children: + self.iterate_elements(child, depth + 1, doc, parent_element) def is_valid(self) -> bool: return self.valid