Skip to content

Commit

Permalink
Fixes for the Markdown backend to avoid duplicate text insertions into the Docling document
Browse files (browse the repository at this point in the history)
Signed-off-by: Maksym Lysak <[email protected]>
  • Loading branch information
Maksym Lysak committed Oct 22, 2024
1 parent 578e30e commit 47a4d31
Showing 1 changed file with 13 additions and 16 deletions.
29 changes: 13 additions & 16 deletions docling/backend/md_backend.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -37,6 +37,7 @@ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]

self.in_table = False
self.md_table_buffer: list[str] = []


try:
if isinstance(self.path_or_stream, BytesIO):
Expand Down Expand Up @@ -123,7 +124,7 @@ def iterate_elements(self, element, depth=0, doc=None, parent_element=None):
doc_label = DocItemLabel.TITLE
else:
doc_label = DocItemLabel.SECTION_HEADER
snippet_text = element.children[0].children
snippet_text = element.children[0].children.strip()

parent_element = doc.add_text(
label=doc_label, parent=parent_element, text=snippet_text
Expand All @@ -142,6 +143,7 @@ def iterate_elements(self, element, depth=0, doc=None, parent_element=None):
elif isinstance(element, marko.block.ListItem):
self.close_table(doc)
_log.debug(" - List item")

snippet_text = str(element.children[0].children[0].children)
is_numbered = False
if parent_element.label == GroupLabel.ORDERED_LIST:
Expand All @@ -150,23 +152,15 @@ def iterate_elements(self, element, depth=0, doc=None, parent_element=None):
enumerated=is_numbered, parent=parent_element, text=snippet_text
)

elif isinstance(element, marko.block.Paragraph):
self.close_table(doc)
_log.debug(f" - Paragraph: {element.children[0].children}")
snippet_text = str(element.children[0].children)
doc.add_text(
label=DocItemLabel.PARAGRAPH, parent=parent_element, text=snippet_text
)

elif isinstance(element, marko.inline.Image):
self.close_table(doc)

_log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
doc.add_picture(parent=parent_element, caption=element.title)

elif isinstance(element, marko.inline.RawText):
_log.debug(f" - Paragraph (raw text): {element.children}")
snippet_text = str(element.children)

snippet_text = str(element.children).strip()
# Detect start of the table:
if "|" in snippet_text:
# most likely part of the markdown table
Expand All @@ -190,7 +184,7 @@ def iterate_elements(self, element, depth=0, doc=None, parent_element=None):
elif isinstance(element, marko.inline.CodeSpan):
self.close_table(doc)
_log.debug(f" - Paragraph (code): {element.children}")
snippet_text = str(element.children)
snippet_text = str(element.children).strip()
doc.add_text(
label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
)
Expand All @@ -203,7 +197,7 @@ def iterate_elements(self, element, depth=0, doc=None, parent_element=None):
elif isinstance(element, marko.block.HTMLBlock):
self.close_table(doc)
_log.debug("HTML Block: {}".format(element))
snippet_text = str(element.children)
snippet_text = str(element.children).strip()
doc.add_text(
label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
)
Expand All @@ -214,9 +208,12 @@ def iterate_elements(self, element, depth=0, doc=None, parent_element=None):
_log.debug("Some other element: {}".format(element))

# Iterate through the element's children (if any)
if hasattr(element, "children"):
for child in element.children:
self.iterate_elements(child, depth + 1, doc, parent_element)
if not isinstance(element, marko.block.ListItem):
if not isinstance(element, marko.block.Heading):
# if not isinstance(element, marko.block.Paragraph):
if hasattr(element, "children"):
for child in element.children:
self.iterate_elements(child, depth + 1, doc, parent_element)

def is_valid(self) -> bool:
return self.valid
Expand Down

0 comments on commit 47a4d31

Please sign in to comment.