Added support for code blocks and fenced code in MD

Signed-off-by: Maksym Lysak <[email protected]>
DS4SD · Oct 22, 2024 · 186d71a
1 parent 4fb803f
commit 186d71a
Show file tree

Hide file tree

Showing 2 changed files with 40 additions and 20 deletions.
diff --git a/docling/backend/md_backend.py b/docling/backend/md_backend.py
@@ -157,6 +157,11 @@ def iterate_elements(self, element, depth=0, doc=None, parent_element=None):
             _log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
             doc.add_picture(parent=parent_element, caption=element.title)
 
+        elif isinstance(element, marko.block.Paragraph):
+            print("Paragraph:")
+            print(element)
+            print("")
+
         elif isinstance(element, marko.inline.RawText):
             _log.debug(f" - Paragraph (raw text): {element.children}")
             snippet_text = str(element.children).strip()
@@ -182,12 +187,34 @@ def iterate_elements(self, element, depth=0, doc=None, parent_element=None):
 
         elif isinstance(element, marko.inline.CodeSpan):
             self.close_table(doc)
-            _log.debug(f" - Paragraph (code): {element.children}")
+            _log.debug(f" - Code Span: {element.children}")
             snippet_text = str(element.children).strip()
             doc.add_text(
                 label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
             )
 
+        elif isinstance(element, marko.block.CodeBlock):
+            self.close_table(doc)
+            print("CODE BLOCK")
+            print(element)
+            print("")
+            _log.debug(f" - Code Block: {element.children}")
+            snippet_text = str(element.children[0].children).strip()
+            doc.add_text(
+                label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
+            )
+
+        elif isinstance(element, marko.block.FencedCode):
+            self.close_table(doc)
+            print("FENCED CODE")
+            print(element)
+            print("")
+            _log.debug(f" - Code Block: {element.children}")
+            snippet_text = str(element.children[0].children).strip()
+            doc.add_text(
+                label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
+            )
+
         elif isinstance(element, marko.inline.LineBreak):
             if self.in_table:
                 _log.debug("Line break in a table")
@@ -205,14 +232,18 @@ def iterate_elements(self, element, depth=0, doc=None, parent_element=None):
             if not isinstance(element, str):
                 self.close_table(doc)
                 _log.debug("Some other element: {}".format(element))
+                print("SOMETHING ELSE")
+                print(element)
+                print("")
 
         # Iterate through the element's children (if any)
         if not isinstance(element, marko.block.ListItem):
             if not isinstance(element, marko.block.Heading):
-                # if not isinstance(element, marko.block.Paragraph):
-                if hasattr(element, "children"):
-                    for child in element.children:
-                        self.iterate_elements(child, depth + 1, doc, parent_element)
+                if not isinstance(element, marko.block.FencedCode):
+                    # if not isinstance(element, marko.block.Paragraph):
+                    if hasattr(element, "children"):
+                        for child in element.children:
+                            self.iterate_elements(child, depth + 1, doc, parent_element)
 
     def is_valid(self) -> bool:
         return self.valid

diff --git a/docs/examples/run_with_formats.py b/docs/examples/run_with_formats.py
@@ -1,13 +1,11 @@
 import json
 import logging
-from io import BytesIO
 from pathlib import Path
 
 import yaml
 
-from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
-from docling.datamodel.base_models import DocumentStream, InputFormat
+from docling.datamodel.base_models import InputFormat
 from docling.document_converter import (
     DocumentConverter,
     PdfFormatOption,
@@ -21,24 +19,17 @@
 
 def main():
     input_paths = [
+        Path("README.md"),
         Path("tests/data/wiki_duck.html"),
         Path("tests/data/word_sample.docx"),
-        Path("tests/data/word_nested.docx"),
         Path("tests/data/lorem_ipsum.docx"),
         Path("tests/data/powerpoint_sample.pptx"),
         Path("tests/data/2305.03393v1-pg9-img.png"),
         Path("tests/data/2206.01062.pdf"),
         Path("tests/data/test_01.asciidoc"),
-        Path("tests/data/test_02.asciidoc"),
-        Path("README.md"),
+        Path("tests/data/test_01.asciidoc"),
     ]
 
-    # To read from bytes instead:
-    # docs = [
-    #    DocumentStream(name=f.name, stream=BytesIO(f.open("rb").read()))
-    #    for f in input_paths
-    # ]
-
     ## for defaults use:
     # doc_converter = DocumentConverter()
 
@@ -57,8 +48,7 @@ def main():
             ],  # whitelist formats, non-matching files are ignored.
             format_options={
                 InputFormat.PDF: PdfFormatOption(
-                    pipeline_cls=StandardPdfPipeline,
-                    backend=DoclingParseDocumentBackend,
+                    pipeline_cls=StandardPdfPipeline, backend=PyPdfiumDocumentBackend
                 ),
                 InputFormat.DOCX: WordFormatOption(
                     pipeline_cls=SimplePipeline  # , backend=MsWordDocumentBackend
@@ -68,7 +58,6 @@ def main():
     )
 
     conv_results = doc_converter.convert_all(input_paths)
-    # conv_results = doc_converter.convert_all(docs)
 
     for res in conv_results:
         out_path = Path("scratch")