Merge PR #1018 into 14.0

Signed-off-by alexis-via
OCA · Sep 24, 2024 · bb4fe31 · bb4fe31
2 parents b44373a + a5c733c
commit bb4fe31
Showing 1 changed file with 56 additions and 39 deletions.
diff --git a/account_invoice_import_simple_pdf/wizard/account_invoice_import.py b/account_invoice_import_simple_pdf/wizard/account_invoice_import.py
@@ -46,6 +46,7 @@ def fallback_parse_pdf_invoice(self, file_data):
  @api.model
  def _simple_pdf_text_extraction_pymupdf(self, fileobj, test_info):
  res = False
+ version = None
  try:
  pages = []
  doc = fitz.open(fileobj.name)
@@ -55,25 +56,34 @@ def _simple_pdf_text_extraction_pymupdf(self, fileobj, test_info):
  "all": "\n\n".join(pages),
  "first": pages and pages[0] or "",
  }
- logger.info("Text extraction made with PyMuPDF %s", fitz.__version__)
- test_info["text_extraction"] = "pymupdf %s" % fitz.__version__
+ # For PyMuPDF, we used to get the version via __version__
+ # but it is not possible with newer version of the lib
+ if hasattr(fitz, "__version__"):
+ version = fitz.__version__
+ elif hasattr(fitz, "version") and isinstance(fitz.version, tuple):
+ version = fitz.version[0]
+ logger.info("Text extraction made with PyMuPDF %s", version)
+ test_info["text_extraction"] = "pymupdf %s" % version
  except Exception as e:
  logger.warning("Text extraction with PyMuPDF failed. Error: %s", e)
  return res
 
  @api.model
  def _simple_pdf_text_extraction_pypdf(self, fileobj, test_info):
  res = False
- reader = pypdf.PdfReader(fileobj.name)
- pages = []
- for pdf_page in reader.pages:
- pages.append(pdf_page.extract_text())
- res = {
- "all": "\n\n".join(pages),
- "first": pages and pages[0] or "",
- }
- test_info["text_extraction"] = "pypdf %s" % pypdf.__version__
- logger.info("Text extraction made with pypdf %s", pypdf.__version__)
+ try:
+ reader = pypdf.PdfReader(fileobj.name)
+ pages = []
+ for pdf_page in reader.pages:
+ pages.append(pdf_page.extract_text())
+ res = {
+ "all": "\n\n".join(pages),
+ "first": pages and pages[0] or "",
+ }
+ test_info["text_extraction"] = "pypdf %s" % pypdf.__version__
+ logger.info("Text extraction made with pypdf %s", pypdf.__version__)
+ except Exception as e:
+ logger.warning("Text extraction with pypdf failed. Error: %s", e)
  return res
 
  @api.model
@@ -164,12 +174,6 @@ def _simple_pdf_text_extraction_specific_tool(
 
  @api.model
  def simple_pdf_text_extraction(self, file_data, test_info):
- fileobj = NamedTemporaryFile("wb", prefix="odoo-simple-pdf-", suffix=".pdf")
- fileobj.write(file_data)
- # Extract text from PDF
- # Very interesting reading:
- # https://dida.do/blog/how-to-extract-text-from-pdf
- # https://github.com/erfelipe/PDFtextExtraction
  specific_tool = (
  self.env["ir.config_parameter"]
  .sudo()
@@ -178,27 +182,41 @@ def simple_pdf_text_extraction(self, file_data, test_info):
  if specific_tool:
  specific_tool = specific_tool.strip().lower()
  test_info["text_extraction_config"] = specific_tool
- if specific_tool:
- res = self._simple_pdf_text_extraction_specific_tool(
- specific_tool, fileobj, test_info
- )
- else:
- # From best tool to worst
- res = self._simple_pdf_text_extraction_pymupdf(fileobj, test_info)
- if not res:
- res = self._simple_pdf_text_extraction_pdftotext_lib(fileobj, test_info)
- if not res:
- res = self._simple_pdf_text_extraction_pdftotext_cmd(fileobj, test_info)
- if not res:
- res = self._simple_pdf_text_extraction_pypdf(fileobj, test_info)
- if not res:
- raise UserError(
- _(
- "Odoo could not extract the text from the PDF invoice. "
- "Refer to the Odoo server logs for more technical information "
- "about the cause of the failure."
- )
+
+ with NamedTemporaryFile(
+ "wb", prefix="odoo-simple-pdf-", suffix=".pdf"
+ ) as fileobj:
+ fileobj.write(file_data)
+ fileobj.seek(0)
+ # Extract text from PDF
+ # Very interesting reading:
+ # https://dida.do/blog/how-to-extract-text-from-pdf
+ # https://github.com/erfelipe/PDFtextExtraction
+ if specific_tool:
+ res = self._simple_pdf_text_extraction_specific_tool(
+ specific_tool, fileobj, test_info
  )
+ else:
+ # From best tool to worst
+ res = self._simple_pdf_text_extraction_pymupdf(fileobj, test_info)
+ if not res:
+ res = self._simple_pdf_text_extraction_pdftotext_lib(
+ fileobj, test_info
+ )
+ if not res:
+ res = self._simple_pdf_text_extraction_pdftotext_cmd(
+ fileobj, test_info
+ )
+ if not res:
+ res = self._simple_pdf_text_extraction_pypdf(fileobj, test_info)
+ if not res:
+ raise UserError(
+ _(
+ "Odoo could not extract the text from the PDF invoice. "
+ "Refer to the Odoo server logs for more technical information "
+ "about the cause of the failure."
+ )
+ )
  for key, text in res.items():
  if text:
  # Remove lonely accents
@@ -213,7 +231,6 @@ def simple_pdf_text_extraction(self, file_data, test_info):
  res["first_no_space"] = regex.sub(
  "%s+" % test_info["space_pattern"], "", res["first"]
  )
- fileobj.close()
  return res
 
  @api.model