From 12033537e39d5329e1963b9a20f0adf9efede637 Mon Sep 17 00:00:00 2001
From: Peter Staar
Date: Thu, 17 Oct 2024 19:58:07 +0200
Subject: [PATCH] updated the base-model and added the asciidoc_backend

Signed-off-by: Peter Staar
---
Review notes (this section is ignored when the patch is applied):
 * convert() now calls self.parse(); the class defines no parse_stream().
 * The ASCIIDOC extension list uses "asciidoc" without a leading dot, matching
   every other entry in FormatToExtensions.
 * Added-line counts are unchanged; the blob hashes on the "index" lines still
   refer to the pre-review content.

 docling/backend/asciidoc_backend.py | 72 +++++++++++++++++++++++++++++
 docling/datamodel/base_models.py    |  3 ++
 2 files changed, 75 insertions(+)
 create mode 100644 docling/backend/asciidoc_backend.py

diff --git a/docling/backend/asciidoc_backend.py b/docling/backend/asciidoc_backend.py
new file mode 100644
index 00000000..3aad25a7
--- /dev/null
+++ b/docling/backend/asciidoc_backend.py
@@ -0,0 +1,72 @@
+import logging
+from io import BytesIO
+from pathlib import Path
+from typing import Set, Union
+
+from docling_core.types.doc import (
+    DocItemLabel,
+    DoclingDocument,
+    DocumentOrigin,
+    GroupLabel,
+    TableCell,
+    TableData,
+)
+
+from docling.backend.abstract_backend import DeclarativeDocumentBackend
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.document import InputDocument
+
+_log = logging.getLogger(__name__)
+
+
+class ASCIIDocDocumentBackend(DeclarativeDocumentBackend):
+    """Declarative backend for AsciiDoc input; parsing is still a stub."""
+    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
+        super().__init__(in_doc, path_or_stream)
+
+        self.path_or_stream = path_or_stream
+
+        self.valid = True
+
+    def is_valid(self) -> bool:
+        return self.valid
+
+    @classmethod
+    def supports_pagination(cls) -> bool:
+        return False
+
+    def unload(self):
+        return
+
+    @classmethod
+    def supported_formats(cls) -> Set[InputFormat]:
+        return {InputFormat.ASCIIDOC}
+
+    def convert(self) -> DoclingDocument:
+        """
+        Build a DoclingDocument for the AsciiDoc input and pass it to parse().
+        """
+
+        fname = ""
+        if isinstance(self.path_or_stream, Path):
+            fname = self.path_or_stream.name
+
+        origin = DocumentOrigin(
+            filename=fname,
+            mimetype="asciidoc",
+            binary_hash=self.document_hash,
+        )
+        if len(fname) > 0:
+            docname = Path(fname).stem
+        else:
+            docname = "stream"
+
+        doc = DoclingDocument(name=docname, origin=origin)
+
+        doc = self.parse(doc)
+
+        return doc
+
+    def parse(self, doc: DoclingDocument):
+        """Placeholder: return *doc* unchanged (no AsciiDoc parsing yet)."""
+        return doc
diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py
index 9c8f14c8..0a424351 100644
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@@ -30,6 +30,7 @@ class InputFormat(str, Enum):
     HTML = "html"
     IMAGE = "image"
     PDF = "pdf"
+    ASCIIDOC = "asciidoc"
 
 
 class OutputFormat(str, Enum):
@@ -45,6 +46,7 @@ class OutputFormat(str, Enum):
     InputFormat.PDF: ["pdf"],
     InputFormat.HTML: ["html", "htm", "xhtml"],
     InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
+    InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
 }
 
 FormatToMimeType: Dict[InputFormat, Set[str]] = {
@@ -66,6 +68,7 @@ class OutputFormat(str, Enum):
         "image/bmp",
     },
     InputFormat.PDF: {"application/pdf"},
+    InputFormat.ASCIIDOC: {"application/asciidoc"},
 }
 MimeTypeToFormat = {
     mime: fmt for fmt, mimes in FormatToMimeType.items() for mime in mimes