Skip to content

Commit

Permalink
Merge PR #1018 into 14.0
Browse files Browse the repository at this point in the history
Signed-off-by alexis-via
  • Loading branch information
OCA-git-bot committed Sep 24, 2024
2 parents b44373a + a5c733c commit bb4fe31
Showing 1 changed file with 56 additions and 39 deletions.
95 changes: 56 additions & 39 deletions account_invoice_import_simple_pdf/wizard/account_invoice_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ def fallback_parse_pdf_invoice(self, file_data):
@api.model
def _simple_pdf_text_extraction_pymupdf(self, fileobj, test_info):
res = False
version = None
try:
pages = []
doc = fitz.open(fileobj.name)
Expand All @@ -55,25 +56,34 @@ def _simple_pdf_text_extraction_pymupdf(self, fileobj, test_info):
"all": "\n\n".join(pages),
"first": pages and pages[0] or "",
}
logger.info("Text extraction made with PyMuPDF %s", fitz.__version__)
test_info["text_extraction"] = "pymupdf %s" % fitz.__version__
# PyMuPDF used to expose its version as __version__, but newer versions
# of the library no longer provide that attribute, so fall back to the
# first element of the `version` tuple when it is available.
if hasattr(fitz, "__version__"):
version = fitz.__version__
elif hasattr(fitz, "version") and isinstance(fitz.version, tuple):
version = fitz.version[0]
logger.info("Text extraction made with PyMuPDF %s", version)
test_info["text_extraction"] = "pymupdf %s" % version
except Exception as e:
logger.warning("Text extraction with PyMuPDF failed. Error: %s", e)
return res

@api.model
def _simple_pdf_text_extraction_pypdf(self, fileobj, test_info):
res = False
reader = pypdf.PdfReader(fileobj.name)
pages = []
for pdf_page in reader.pages:
pages.append(pdf_page.extract_text())
res = {
"all": "\n\n".join(pages),
"first": pages and pages[0] or "",
}
test_info["text_extraction"] = "pypdf %s" % pypdf.__version__
logger.info("Text extraction made with pypdf %s", pypdf.__version__)
try:
reader = pypdf.PdfReader(fileobj.name)
pages = []
for pdf_page in reader.pages:
pages.append(pdf_page.extract_text())
res = {
"all": "\n\n".join(pages),
"first": pages and pages[0] or "",
}
test_info["text_extraction"] = "pypdf %s" % pypdf.__version__
logger.info("Text extraction made with pypdf %s", pypdf.__version__)
except Exception as e:
logger.warning("Text extraction with pypdf failed. Error: %s", e)
return res

@api.model
Expand Down Expand Up @@ -164,12 +174,6 @@ def _simple_pdf_text_extraction_specific_tool(

@api.model
def simple_pdf_text_extraction(self, file_data, test_info):
fileobj = NamedTemporaryFile("wb", prefix="odoo-simple-pdf-", suffix=".pdf")
fileobj.write(file_data)
# Extract text from PDF
# Very interesting reading:
# https://dida.do/blog/how-to-extract-text-from-pdf
# https://github.com/erfelipe/PDFtextExtraction
specific_tool = (
self.env["ir.config_parameter"]
.sudo()
Expand All @@ -178,27 +182,41 @@ def simple_pdf_text_extraction(self, file_data, test_info):
if specific_tool:
specific_tool = specific_tool.strip().lower()
test_info["text_extraction_config"] = specific_tool
if specific_tool:
res = self._simple_pdf_text_extraction_specific_tool(
specific_tool, fileobj, test_info
)
else:
# From best tool to worst
res = self._simple_pdf_text_extraction_pymupdf(fileobj, test_info)
if not res:
res = self._simple_pdf_text_extraction_pdftotext_lib(fileobj, test_info)
if not res:
res = self._simple_pdf_text_extraction_pdftotext_cmd(fileobj, test_info)
if not res:
res = self._simple_pdf_text_extraction_pypdf(fileobj, test_info)
if not res:
raise UserError(
_(
"Odoo could not extract the text from the PDF invoice. "
"Refer to the Odoo server logs for more technical information "
"about the cause of the failure."
)

with NamedTemporaryFile(
"wb", prefix="odoo-simple-pdf-", suffix=".pdf"
) as fileobj:
fileobj.write(file_data)
fileobj.seek(0)
# Extract text from PDF
# Very interesting reading:
# https://dida.do/blog/how-to-extract-text-from-pdf
# https://github.com/erfelipe/PDFtextExtraction
if specific_tool:
res = self._simple_pdf_text_extraction_specific_tool(
specific_tool, fileobj, test_info
)
else:
# From best tool to worst
res = self._simple_pdf_text_extraction_pymupdf(fileobj, test_info)
if not res:
res = self._simple_pdf_text_extraction_pdftotext_lib(
fileobj, test_info
)
if not res:
res = self._simple_pdf_text_extraction_pdftotext_cmd(
fileobj, test_info
)
if not res:
res = self._simple_pdf_text_extraction_pypdf(fileobj, test_info)
if not res:
raise UserError(
_(
"Odoo could not extract the text from the PDF invoice. "
"Refer to the Odoo server logs for more technical information "
"about the cause of the failure."
)
)
for key, text in res.items():
if text:
# Remove lonely accents
Expand All @@ -213,7 +231,6 @@ def simple_pdf_text_extraction(self, file_data, test_info):
res["first_no_space"] = regex.sub(
"%s+" % test_info["space_pattern"], "", res["first"]
)
fileobj.close()
return res

@api.model
Expand Down

0 comments on commit bb4fe31

Please sign in to comment.