Skip to content

Commit

Permalink
updated the base-model and added the asciidoc_backend
Browse files — browse the repository at this point in the history
Signed-off-by: Peter Staar <[email protected]>
  • Loading branch information
PeterStaar-IBM committed Oct 17, 2024
1 parent 034a411 commit 1203353
Show file tree
Hide file tree
Showing 2 changed files with 75 additions and 0 deletions.
72 changes: 72 additions & 0 deletions docling/backend/asciidoc_backend.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
import logging
from io import BytesIO
from pathlib import Path
from typing import Set, Union

from docling_core.types.doc import (
DocItemLabel,
DoclingDocument,
DocumentOrigin,
GroupLabel,
TableCell,
TableData,
)

from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument

# Module-level logger, named after this module via __name__.
_log = logging.getLogger(__name__)


class ASCIIDocDocumentBackend(DeclarativeDocumentBackend):
    """Declarative backend that turns an AsciiDoc input into a DoclingDocument.

    The backend currently only builds the document shell (name and origin);
    `parse` is a placeholder that returns the document unchanged.
    """

    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
        """Store the input handle and mark the backend as valid.

        Args:
            in_doc: The input document descriptor, forwarded to the base class.
            path_or_stream: Either a filesystem path or an in-memory byte
                stream holding the AsciiDoc source.
        """
        super().__init__(in_doc, path_or_stream)

        self.path_or_stream = path_or_stream

        # No validation of the input is performed here, so the backend always
        # reports itself as valid.
        self.valid = True

    def is_valid(self) -> bool:
        """Return the validity flag set in `__init__`."""
        return self.valid

    @classmethod
    def supports_pagination(cls) -> bool:
        """Return False: this backend does not expose pages."""
        return False

    def unload(self) -> None:
        """Do nothing; this backend holds no resources of its own to release."""
        return

    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        """Return the set of input formats handled by this backend."""
        return {InputFormat.ASCIIDOC}

    def convert(self) -> DoclingDocument:
        """Parse the AsciiDoc input into a structured document model.

        The document name is the file stem when the input is a path, and the
        literal "stream" when the input is an in-memory stream.

        Returns:
            The DoclingDocument produced by `parse`.
        """
        # Only a Path carries a file name; a BytesIO input leaves it empty.
        fname = ""
        if isinstance(self.path_or_stream, Path):
            fname = self.path_or_stream.name

        # NOTE(review): "asciidoc" is not in type/subtype form, while the
        # format table in base_models registers "application/asciidoc" --
        # confirm which value DocumentOrigin expects.
        # `document_hash` is presumably set by the base-class __init__ --
        # TODO confirm.
        origin = DocumentOrigin(
            filename=fname,
            mimetype="asciidoc",
            binary_hash=self.document_hash,
        )

        docname = Path(fname).stem if fname else "stream"

        doc = DoclingDocument(name=docname, origin=origin)

        # The parsing hook defined on this class is `parse`; the previous call
        # to `parse_stream` referenced a method this class does not define.
        return self.parse(doc)

    def parse(self, doc: DoclingDocument) -> DoclingDocument:
        """Populate `doc` from the AsciiDoc source.

        Placeholder implementation: the document is returned unchanged.

        Args:
            doc: The document shell to fill.

        Returns:
            The same `doc` object.
        """
        return doc
3 changes: 3 additions & 0 deletions docling/datamodel/base_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ class InputFormat(str, Enum):
HTML = "html"
IMAGE = "image"
PDF = "pdf"
ASCIIDOC = "asciidoc"


class OutputFormat(str, Enum):
Expand All @@ -45,6 +46,7 @@ class OutputFormat(str, Enum):
InputFormat.PDF: ["pdf"],
InputFormat.HTML: ["html", "htm", "xhtml"],
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
InputFormat.ASCIIDOC: ["adoc", ".asciidoc", "asc"],
}

FormatToMimeType: Dict[InputFormat, Set[str]] = {
Expand All @@ -66,6 +68,7 @@ class OutputFormat(str, Enum):
"image/bmp",
},
InputFormat.PDF: {"application/pdf"},
InputFormat.ASCIIDOC: {"application/asciidoc"},
}
MimeTypeToFormat = {
mime: fmt for fmt, mimes in FormatToMimeType.items() for mime in mimes
Expand Down

0 comments on commit 1203353

Please sign in to comment.