diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py
index e688af2..d21f6f9 100644
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@@ -143,11 +143,13 @@ def __init__(
                         self.valid = False
 
         except (FileNotFoundError, OSError) as e:
+            self.valid = False
             _log.exception(
                 f"File {self.file.name} not found or cannot be opened.", exc_info=e
             )
             # raise
         except RuntimeError as e:
+            self.valid = False
             _log.exception(
                 f"An unexpected error occurred while opening the document {self.file.name}",
                 exc_info=e,
@@ -166,6 +168,8 @@ def _init_doc(
             )
 
         self._backend = backend(self, path_or_stream=path_or_stream)
+        if not self._backend.is_valid():
+            self.valid = False
 
 
 class DocumentFormat(str, Enum):
diff --git a/tests/test_input_doc.py b/tests/test_input_doc.py
new file mode 100644
index 0000000..3f7dd0c
--- /dev/null
+++ b/tests/test_input_doc.py
@@ -0,0 +1,56 @@
+from io import BytesIO
+from pathlib import Path
+
+from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
+from docling.datamodel.base_models import DocumentStream, InputFormat
+from docling.datamodel.document import InputDocument
+
+
+def test_in_doc_from_valid_path():
+    """An existing PDF path yields a valid InputDocument."""
+    test_doc_path = Path("./tests/data/2206.01062.pdf")
+    doc = _make_input_doc(test_doc_path)
+    assert doc.valid is True
+
+
+def test_in_doc_from_invalid_path():
+    """A nonexistent path is flagged invalid instead of raising."""
+    test_doc_path = Path("./tests/does/not/exist.pdf")
+    doc = _make_input_doc(test_doc_path)
+    assert doc.valid is False
+
+
+def test_in_doc_from_valid_buf():
+    """An in-memory copy of a real PDF yields a valid InputDocument."""
+    # read_bytes() closes the file handle; open("rb").read() leaked it.
+    buf = BytesIO(Path("./tests/data/2206.01062.pdf").read_bytes())
+    stream = DocumentStream(name="my_doc.pdf", stream=buf)
+    doc = _make_input_doc_from_stream(stream)
+    assert doc.valid is True
+
+
+def test_in_doc_from_invalid_buf():
+    """An empty byte stream is flagged invalid instead of raising."""
+    buf = BytesIO(b"")
+    stream = DocumentStream(name="my_doc.pdf", stream=buf)
+    doc = _make_input_doc_from_stream(stream)
+    assert doc.valid is False
+
+
+def _make_input_doc(path):
+    """Build an InputDocument for a PDF at *path* using the pypdfium2 backend."""
+    return InputDocument(
+        path_or_stream=path,
+        format=InputFormat.PDF,
+        backend=PyPdfiumDocumentBackend,
+    )
+
+
+def _make_input_doc_from_stream(doc_stream):
+    """Build an InputDocument from a DocumentStream's buffer and name."""
+    return InputDocument(
+        path_or_stream=doc_stream.stream,
+        format=InputFormat.PDF,
+        filename=doc_stream.name,
+        backend=PyPdfiumDocumentBackend,
+    )