Merge pull request #108 from MJedr/fix-incorrect-urls

fix regression in extract_texkeys_and_urls_from_pdf
inspirehep · Sep 18, 2023 · ef676a2
2 parents ee93261 + abd0e31
commit ef676a2
Show file tree

Hide file tree

Showing 8 changed files with 140 additions and 15 deletions.
diff --git a/refextract/references/pdf.py b/refextract/references/pdf.py
@@ -55,24 +55,30 @@ def extract_texkeys_and_urls_from_pdf(pdf_file):
  destinations = pdf.getNamedDestinations()
  urls = extract_urls(pdf)
  except Exception:
- LOGGER.debug(u"PDF: Internal PyPDF2 error, no TeXkeys returned.")
+ LOGGER.debug("PDF: Internal PyPDF2 error, no TeXkeys returned.")
  return []
  # not all named destinations point to references
  refs = []
  for destination in destinations.items():
- destination_key = destination[0].decode("utf-8") if isinstance(destination[0], ByteStringObject) else destination[0]
+ destination_key = (
+ destination[0].decode("utf-8")
+ if isinstance(destination[0], ByteStringObject)
+ else destination[0]
+ )
  match = re_reference_in_dest.match(destination_key)
  if match:
  refs.append(destination)
+ two_column_layout = False
  try:
  if _destinations_in_two_columns(pdf, refs):
- LOGGER.debug(u"PDF: Using two-column layout")
+ two_column_layout = True
+ LOGGER.debug("PDF: Using two-column layout")
 
  def sortfunc(dest_couple):
  return dest_couple[1]
 
  else:
- LOGGER.debug(u"PDF: Using single-column layout")
+ LOGGER.debug("PDF: Using single-column layout")
 
  def sortfunc(dest_couple):
  page, _, ypos, xpos = dest_couple[1]
@@ -91,34 +97,41 @@ def sortfunc(dest_couple):
  if nb < len(refs) - 1:
  next_reference_data = refs[nb + 1]
  matched_urls_for_reference, urls = _match_urls_with_reference(
- urls, ref, next_reference_data
+ urls, ref, next_reference_data, two_column_layout=two_column_layout
  )
  else:
  matched_urls_for_reference, urls = _match_urls_with_reference(
- urls, ref
+ urls, ref, two_column_layout=two_column_layout
  )
  if matched_urls_for_reference:
  current_texkey_urls_dict["urls"] = matched_urls_for_reference
  texkey_url_list.append(current_texkey_urls_dict)
  return texkey_url_list
  except Exception:
- LOGGER.debug(u"PDF: Impossible to determine layout, no TeXkeys returned")
+ LOGGER.debug("PDF: Impossible to determine layout, no TeXkeys returned")
  return []
 
 
-def _match_urls_with_reference(urls_to_match, reference, next_reference=None):
+def _match_urls_with_reference(
+ urls_to_match, reference, next_reference=None, two_column_layout=False
+):
  ref_page_number, ref_column, ref_y, _ = reference[1]
  if next_reference:
  next_ref_page_number, next_ref_col, next_ref_y, _ = next_reference[1]
  urls_for_reference = set()
  for (url_index, url) in enumerate(urls_to_match):
  url_page_number, url_col, url_y, _ = url[1]
  is_url_under_texkey = ref_y <= url_y
+ is_url_in_same_col = ref_column == url_col
+ is_url_in_next_col = url_col > ref_column
  is_reference_on_same_page_as_url = ref_page_number == url_page_number
  is_reference_on_previous_page_than_url = ref_page_number + 1 == url_page_number
  if not next_reference:
  if (
- is_reference_on_same_page_as_url or
+ (
+ is_reference_on_same_page_as_url and
+ (is_url_in_same_col or is_url_in_next_col)
+ ) or
  is_reference_on_previous_page_than_url
  ) and is_url_under_texkey:
  urls_for_reference.add(url[0])
@@ -137,7 +150,9 @@ def _match_urls_with_reference(urls_to_match, reference, next_reference=None):
  is_next_reference_on_the_same_page and
  is_url_under_texkey and
  (next_ref_col > url_col) and
- next_ref_y < url_y
+ next_ref_y < url_y and
+ ref_y <= url_y and
+ (is_url_in_same_col or is_url_in_next_col)
  )
  is_in_new_column = (
  is_reference_on_same_page_as_url and
@@ -155,8 +170,9 @@ def _match_urls_with_reference(urls_to_match, reference, next_reference=None):
  is_url_unrelated_to_references = ref_page_number > url_page_number
  is_url_for_next_reference = url_y >= next_ref_y
  if is_url_between_texkeys:
- urls_for_reference.add(url[0])
- continue
+ if not two_column_layout or (two_column_layout and url_col == ref_column):
+ urls_for_reference.add(url[0])
+ continue
  elif is_last_reference_in_page or is_last_reference_in_page_two_col_layout:
  urls_for_reference.add(url[0])
  continue

diff --git a/tests/data/2301.05883.pdf b/tests/data/2301.05883.pdf
diff --git a/tests/data/2303.03819.pdf b/tests/data/2303.03819.pdf
diff --git a/tests/data/2304.10117.pdf b/tests/data/2304.10117.pdf
diff --git a/tests/data/packed_pdf.pdf b/tests/data/packed_pdf.pdf
diff --git a/tests/test_api.py b/tests/test_api.py
@@ -162,7 +162,7 @@ def test_extract_references_from_url(pdf_files):
 
 def test_long_registrant_dois(pdf_files):
  """ DOIs with 5 digit registrant code """
- r = extract_references_from_file(pdf_files[8])
+ r = extract_references_from_file(pdf_files[11])
  assert len(r) == 6
  for ref in r[1:]:
  assert 'doi' in ref

diff --git a/tests/test_engine.py b/tests/test_engine.py
@@ -438,7 +438,7 @@ def test_reference_split_handles_semicolon():
 
 def test_clean_pdf_before_run(tmp_path, pdf_files):
  tmp_file_path = tmp_path / "packed.pdf"
- pdf = pdf_files[7]
+ pdf = pdf_files[10]
  with open(pdf, 'rb') as input, open(tmp_file_path, 'wb') as tmp_out:
  tmp_out.write(input.read())
 

diff --git a/tests/test_pdf.py b/tests/test_pdf.py
@@ -795,6 +795,115 @@ def test_extract_texkeys_and_urls_from_pdf(pdf_files):
 
  assert two_col_2 == expected_two_col_keys_2
 
+ two_col_with_one_url_only = extract_texkeys_and_urls_from_pdf(pdf_files[6])
+ expected_two_col_with_one_url_only = [
+ {"texkey": "Gr20"},
+ {"texkey": "Au18"},
+ {"texkey": "Ue35"},
+ {"texkey": "SM88"},
+ {"texkey": "Sh00"},
+ {"texkey": "Jaku21"},
+ {"texkey": "T60"},
+ {"texkey": "MT00"},
+ {"texkey": "Jaku21b", "urls": {"http://arxiv.org/abs/2102.08069"}},
+ {"texkey": "Mo64"},
+ {"texkey": "Ko21"},
+ {"texkey": "Lan"},
+ {"texkey": "Sch55"},
+ {"texkey": "Le56"},
+ {"texkey": "FR74"},
+ {"texkey": "BS19"},
+ {"texkey": "Va00"},
+ {"texkey": "T61"},
+ {"texkey": "BD64"},
+ {"texkey": "Ye61"},
+ {"texkey": "Kl77"},
+ {"texkey": "VJ"},
+ {"texkey": "Sal"},
+ {"texkey": "YRW"},
+ {"texkey": "Ma69"},
+ {"texkey": "MG64"},
+ {"texkey": "We65"},
+ {"texkey": "Lo58"},
+ {"texkey": "Reu82"},
+ {"texkey": "Off91"},
+ {"texkey": "Jaku22"},
+ {"texkey": "Fr72"},
+ {"texkey": "Jo62"},
+ ]
+ assert two_col_with_one_url_only == expected_two_col_with_one_url_only
+
+ two_col_with_one_url_only_1 = extract_texkeys_and_urls_from_pdf(pdf_files[7])
+ expected_two_col_with_one_url_only_1 = [
+ {"texkey": "Hees-Rapp"},
+ {"texkey": "He:2022ywp", "urls": {"http://arxiv.org/abs/2204.09299"}},
+ {"texkey": "Das-Alam-Mohanty"},
+ {"texkey": "Svetitsky:1987gq"},
+ {"texkey": "Tsallis"},
+ {"texkey": "Marques-Cleymans-Deppman-2015"},
+ {"texkey": "Marques-Andrade-Deppman-2013"},
+ {"texkey": "WilkWlodarkzyk-multiparticle"},
+ {"texkey": "TsallisBook"},
+ {"texkey": "PLASTINO1995347"},
+ {"texkey": "Muskat"},
+ {"texkey": "Schwammle"},
+ {"texkey": "Schwammle2009"},
+ {"texkey": "WaltonRafelski"},
+ {"texkey": "Wong:2015mba"},
+ {"texkey": "Deppman:2019yno"},
+ {"texkey": "PasechnikSumbera"},
+ {"texkey": "Adolfsson:2020dhm"},
+ {"texkey": "Qin:2015srf"},
+ {"texkey": "Apolinario:2015bfm"},
+ {"texkey": "Casalderrey-Solana:2018wrw"},
+ {"texkey": "CORADDU2003473"},
+ {"texkey": "Curilef"},
+ {"texkey": "Annala:2019puf"},
+ {"texkey": "Annala:2020rgx"},
+ {"texkey": "Cardoso2017"},
+ {"texkey": "Sen:2021tdu"},
+ ]
+ assert two_col_with_one_url_only_1 == expected_two_col_with_one_url_only_1
+
+ two_col_with_one_url_only_2 = extract_texkeys_and_urls_from_pdf(pdf_files[8])
+ two_col_with_one_url_only_2_expected = [
+ {"texkey": "Penrose:1964wq"},
+ {"texkey": "Penrose:1969pc"},
+ {"texkey": "Hawking:1976ra"},
+ {"texkey": "wald2001thermodynamics"},
+ {"texkey": "abbott2016observation"},
+ {"texkey": "isi2021testing"},
+ {"texkey": "isi2019testing"},
+ {"texkey": "PhysRevD.7.2333"},
+ {"texkey": "PhysRevLett.30.71"},
+ {"texkey": "bardeen1973four"},
+ {"texkey": "wald1994quantum"},
+ {"texkey": "abbott2021gwtc", "urls": {"http://arxiv.org/abs/2111.03606"}},
+ {"texkey": "cabero2018observational"},
+ {"texkey": "samples2"},
+ {"texkey": "scientific2016tests"},
+ {"texkey": "lalsimulation"},
+ {"texkey": "abbott2021gwtc1"},
+ {"texkey": "samples1"},
+ {"texkey": "schutz1999gravitational"},
+ {"texkey": "martynov2016sensitivity"},
+ {"texkey": "bersanetti2021advanced"},
+ {"texkey": "kagra2019kagra"},
+ {"texkey": "shaddock2008space"},
+ {"texkey": "hu2017taiji"},
+ {"texkey": "ruan2020taiji"},
+ {"texkey": "luo2020brief"},
+ {"texkey": "luo2016tianqin"},
+ {"texkey": "Gong:2021gvw"},
+ {"texkey": "amaro2018relativistic"},
+ {"texkey": "berry2013observing"},
+ {"texkey": "babak2017science"},
+ {"texkey": "berry2013expectations"},
+ {"texkey": "gwosc"},
+ ]
+
+ assert two_col_with_one_url_only_2 == two_col_with_one_url_only_2_expected
+
 
 def test_extract_texkeys_and_urls_from_pdf_no_crash_on_incomplete_dest_coordinates(
  pdf_files,
@@ -814,6 +923,6 @@ def test_extract_texkeys_from_pdf_no_crash_on_pydpf2_error(pdf_files):
 
 def test_extract_texkeys_from_pdf_no_crash_on_other_exceptions(pdf_files):
  expected = []
- result = extract_texkeys_and_urls_from_pdf(pdf_files[6])
+ result = extract_texkeys_and_urls_from_pdf(pdf_files[9])
 
  assert result == expected