Skip to content

Commit

Permalink
Added support for code blocks and fenced code in MD
Browse files — browse the repository at this point in the history
Signed-off-by: Maksym Lysak <[email protected]>
  • Loading branch information
Maksym Lysak committed Oct 22, 2024
1 parent 4fb803f commit 186d71a
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 20 deletions.
41 changes: 36 additions & 5 deletions docling/backend/md_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,11 @@ def iterate_elements(self, element, depth=0, doc=None, parent_element=None):
_log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
doc.add_picture(parent=parent_element, caption=element.title)

elif isinstance(element, marko.block.Paragraph):
print("Paragraph:")
print(element)
print("")

elif isinstance(element, marko.inline.RawText):
_log.debug(f" - Paragraph (raw text): {element.children}")
snippet_text = str(element.children).strip()
Expand All @@ -182,12 +187,34 @@ def iterate_elements(self, element, depth=0, doc=None, parent_element=None):

elif isinstance(element, marko.inline.CodeSpan):
self.close_table(doc)
_log.debug(f" - Paragraph (code): {element.children}")
_log.debug(f" - Code Span: {element.children}")
snippet_text = str(element.children).strip()
doc.add_text(
label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
)

elif isinstance(element, marko.block.CodeBlock):
self.close_table(doc)
print("CODE BLOCK")
print(element)
print("")
_log.debug(f" - Code Block: {element.children}")
snippet_text = str(element.children[0].children).strip()
doc.add_text(
label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
)

elif isinstance(element, marko.block.FencedCode):
self.close_table(doc)
print("FENCED CODE")
print(element)
print("")
_log.debug(f" - Code Block: {element.children}")
snippet_text = str(element.children[0].children).strip()
doc.add_text(
label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
)

elif isinstance(element, marko.inline.LineBreak):
if self.in_table:
_log.debug("Line break in a table")
Expand All @@ -205,14 +232,18 @@ def iterate_elements(self, element, depth=0, doc=None, parent_element=None):
if not isinstance(element, str):
self.close_table(doc)
_log.debug("Some other element: {}".format(element))
print("SOMETHING ELSE")
print(element)
print("")

# Iterate through the element's children (if any)
if not isinstance(element, marko.block.ListItem):
if not isinstance(element, marko.block.Heading):
# if not isinstance(element, marko.block.Paragraph):
if hasattr(element, "children"):
for child in element.children:
self.iterate_elements(child, depth + 1, doc, parent_element)
if not isinstance(element, marko.block.FencedCode):
# if not isinstance(element, marko.block.Paragraph):
if hasattr(element, "children"):
for child in element.children:
self.iterate_elements(child, depth + 1, doc, parent_element)

def is_valid(self) -> bool:
return self.valid
Expand Down
19 changes: 4 additions & 15 deletions docs/examples/run_with_formats.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@
import json
import logging
from io import BytesIO
from pathlib import Path

import yaml

from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import DocumentStream, InputFormat
from docling.datamodel.base_models import InputFormat
from docling.document_converter import (
DocumentConverter,
PdfFormatOption,
Expand All @@ -21,24 +19,17 @@

def main():
input_paths = [
Path("README.md"),
Path("tests/data/wiki_duck.html"),
Path("tests/data/word_sample.docx"),
Path("tests/data/word_nested.docx"),
Path("tests/data/lorem_ipsum.docx"),
Path("tests/data/powerpoint_sample.pptx"),
Path("tests/data/2305.03393v1-pg9-img.png"),
Path("tests/data/2206.01062.pdf"),
Path("tests/data/test_01.asciidoc"),
Path("tests/data/test_02.asciidoc"),
Path("README.md"),
Path("tests/data/test_01.asciidoc"),
]

# To read from bytes instead:
# docs = [
# DocumentStream(name=f.name, stream=BytesIO(f.open("rb").read()))
# for f in input_paths
# ]

## for defaults use:
# doc_converter = DocumentConverter()

Expand All @@ -57,8 +48,7 @@ def main():
], # whitelist formats, non-matching files are ignored.
format_options={
InputFormat.PDF: PdfFormatOption(
pipeline_cls=StandardPdfPipeline,
backend=DoclingParseDocumentBackend,
pipeline_cls=StandardPdfPipeline, backend=PyPdfiumDocumentBackend
),
InputFormat.DOCX: WordFormatOption(
pipeline_cls=SimplePipeline # , backend=MsWordDocumentBackend
Expand All @@ -68,7 +58,6 @@ def main():
)

conv_results = doc_converter.convert_all(input_paths)
# conv_results = doc_converter.convert_all(docs)

for res in conv_results:
out_path = Path("scratch")
Expand Down

0 comments on commit 186d71a

Please sign in to comment.