Skip to content

Commit

Permalink
Merge pull request #108 from MJedr/fix-incorrect-urls
Browse files (browse the repository at this point in the history)
fix regression in extract_texkeys_and_urls_from_pdf
  • Loading branch information
MJedr authored Sep 18, 2023
2 parents: ee93261 + abd0e31; commit ef676a2
Show file tree
Hide file tree
Showing 8 changed files with 140 additions and 15 deletions.
40 changes: 28 additions & 12 deletions refextract/references/pdf.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -55,24 +55,30 @@ def extract_texkeys_and_urls_from_pdf(pdf_file):
destinations = pdf.getNamedDestinations()
urls = extract_urls(pdf)
except Exception:
LOGGER.debug(u"PDF: Internal PyPDF2 error, no TeXkeys returned.")
LOGGER.debug("PDF: Internal PyPDF2 error, no TeXkeys returned.")
return []
# not all named destinations point to references
refs = []
for destination in destinations.items():
destination_key = destination[0].decode("utf-8") if isinstance(destination[0], ByteStringObject) else destination[0]
destination_key = (
destination[0].decode("utf-8")
if isinstance(destination[0], ByteStringObject)
else destination[0]
)
match = re_reference_in_dest.match(destination_key)
if match:
refs.append(destination)
two_column_layout = False
try:
if _destinations_in_two_columns(pdf, refs):
LOGGER.debug(u"PDF: Using two-column layout")
two_column_layout = True
LOGGER.debug("PDF: Using two-column layout")

def sortfunc(dest_couple):
return dest_couple[1]

else:
LOGGER.debug(u"PDF: Using single-column layout")
LOGGER.debug("PDF: Using single-column layout")

def sortfunc(dest_couple):
page, _, ypos, xpos = dest_couple[1]
Expand All @@ -91,34 +97,41 @@ def sortfunc(dest_couple):
if nb < len(refs) - 1:
next_reference_data = refs[nb + 1]
matched_urls_for_reference, urls = _match_urls_with_reference(
urls, ref, next_reference_data
urls, ref, next_reference_data, two_column_layout=two_column_layout
)
else:
matched_urls_for_reference, urls = _match_urls_with_reference(
urls, ref
urls, ref, two_column_layout=two_column_layout
)
if matched_urls_for_reference:
current_texkey_urls_dict["urls"] = matched_urls_for_reference
texkey_url_list.append(current_texkey_urls_dict)
return texkey_url_list
except Exception:
LOGGER.debug(u"PDF: Impossible to determine layout, no TeXkeys returned")
LOGGER.debug("PDF: Impossible to determine layout, no TeXkeys returned")
return []


def _match_urls_with_reference(urls_to_match, reference, next_reference=None):
def _match_urls_with_reference(
urls_to_match, reference, next_reference=None, two_column_layout=False
):
ref_page_number, ref_column, ref_y, _ = reference[1]
if next_reference:
next_ref_page_number, next_ref_col, next_ref_y, _ = next_reference[1]
urls_for_reference = set()
for (url_index, url) in enumerate(urls_to_match):
url_page_number, url_col, url_y, _ = url[1]
is_url_under_texkey = ref_y <= url_y
is_url_in_same_col = ref_column == url_col
is_url_in_next_col = url_col > ref_column
is_reference_on_same_page_as_url = ref_page_number == url_page_number
is_reference_on_previous_page_than_url = ref_page_number + 1 == url_page_number
if not next_reference:
if (
is_reference_on_same_page_as_url or
(
is_reference_on_same_page_as_url and
(is_url_in_same_col or is_url_in_next_col)
) or
is_reference_on_previous_page_than_url
) and is_url_under_texkey:
urls_for_reference.add(url[0])
Expand All @@ -137,7 +150,9 @@ def _match_urls_with_reference(urls_to_match, reference, next_reference=None):
is_next_reference_on_the_same_page and
is_url_under_texkey and
(next_ref_col > url_col) and
next_ref_y < url_y
next_ref_y < url_y and
ref_y <= url_y and
(is_url_in_same_col or is_url_in_next_col)
)
is_in_new_column = (
is_reference_on_same_page_as_url and
Expand All @@ -155,8 +170,9 @@ def _match_urls_with_reference(urls_to_match, reference, next_reference=None):
is_url_unrelated_to_references = ref_page_number > url_page_number
is_url_for_next_reference = url_y >= next_ref_y
if is_url_between_texkeys:
urls_for_reference.add(url[0])
continue
if not two_column_layout or (two_column_layout and url_col == ref_column):
urls_for_reference.add(url[0])
continue
elif is_last_reference_in_page or is_last_reference_in_page_two_col_layout:
urls_for_reference.add(url[0])
continue
Expand Down
Binary file added tests/data/2301.05883.pdf
Binary file not shown.
Binary file added tests/data/2303.03819.pdf
Binary file not shown.
Binary file added tests/data/2304.10117.pdf
Binary file not shown.
Binary file modified tests/data/packed_pdf.pdf
Binary file not shown.
2 changes: 1 addition & 1 deletion tests/test_api.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -162,7 +162,7 @@ def test_extract_references_from_url(pdf_files):

def test_long_registrant_dois(pdf_files):
""" DOIs with 5 digit registrant code """
r = extract_references_from_file(pdf_files[8])
r = extract_references_from_file(pdf_files[11])
assert len(r) == 6
for ref in r[1:]:
assert 'doi' in ref
Expand Down
2 changes: 1 addition & 1 deletion tests/test_engine.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -438,7 +438,7 @@ def test_reference_split_handles_semicolon():

def test_clean_pdf_before_run(tmp_path, pdf_files):
tmp_file_path = tmp_path / "packed.pdf"
pdf = pdf_files[7]
pdf = pdf_files[10]
with open(pdf, 'rb') as input, open(tmp_file_path, 'wb') as tmp_out:
tmp_out.write(input.read())

Expand Down
111 changes: 110 additions & 1 deletion tests/test_pdf.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -795,6 +795,115 @@ def test_extract_texkeys_and_urls_from_pdf(pdf_files):

assert two_col_2 == expected_two_col_keys_2

two_col_with_one_url_only = extract_texkeys_and_urls_from_pdf(pdf_files[6])
expected_two_col_with_one_url_only = [
{"texkey": "Gr20"},
{"texkey": "Au18"},
{"texkey": "Ue35"},
{"texkey": "SM88"},
{"texkey": "Sh00"},
{"texkey": "Jaku21"},
{"texkey": "T60"},
{"texkey": "MT00"},
{"texkey": "Jaku21b", "urls": {"http://arxiv.org/abs/2102.08069"}},
{"texkey": "Mo64"},
{"texkey": "Ko21"},
{"texkey": "Lan"},
{"texkey": "Sch55"},
{"texkey": "Le56"},
{"texkey": "FR74"},
{"texkey": "BS19"},
{"texkey": "Va00"},
{"texkey": "T61"},
{"texkey": "BD64"},
{"texkey": "Ye61"},
{"texkey": "Kl77"},
{"texkey": "VJ"},
{"texkey": "Sal"},
{"texkey": "YRW"},
{"texkey": "Ma69"},
{"texkey": "MG64"},
{"texkey": "We65"},
{"texkey": "Lo58"},
{"texkey": "Reu82"},
{"texkey": "Off91"},
{"texkey": "Jaku22"},
{"texkey": "Fr72"},
{"texkey": "Jo62"},
]
assert two_col_with_one_url_only == expected_two_col_with_one_url_only

two_col_with_one_url_only_1 = extract_texkeys_and_urls_from_pdf(pdf_files[7])
expected_two_col_with_one_url_only_1 = [
{"texkey": "Hees-Rapp"},
{"texkey": "He:2022ywp", "urls": {"http://arxiv.org/abs/2204.09299"}},
{"texkey": "Das-Alam-Mohanty"},
{"texkey": "Svetitsky:1987gq"},
{"texkey": "Tsallis"},
{"texkey": "Marques-Cleymans-Deppman-2015"},
{"texkey": "Marques-Andrade-Deppman-2013"},
{"texkey": "WilkWlodarkzyk-multiparticle"},
{"texkey": "TsallisBook"},
{"texkey": "PLASTINO1995347"},
{"texkey": "Muskat"},
{"texkey": "Schwammle"},
{"texkey": "Schwammle2009"},
{"texkey": "WaltonRafelski"},
{"texkey": "Wong:2015mba"},
{"texkey": "Deppman:2019yno"},
{"texkey": "PasechnikSumbera"},
{"texkey": "Adolfsson:2020dhm"},
{"texkey": "Qin:2015srf"},
{"texkey": "Apolinario:2015bfm"},
{"texkey": "Casalderrey-Solana:2018wrw"},
{"texkey": "CORADDU2003473"},
{"texkey": "Curilef"},
{"texkey": "Annala:2019puf"},
{"texkey": "Annala:2020rgx"},
{"texkey": "Cardoso2017"},
{"texkey": "Sen:2021tdu"},
]
assert two_col_with_one_url_only_1 == expected_two_col_with_one_url_only_1

two_col_with_one_url_only_2 = extract_texkeys_and_urls_from_pdf(pdf_files[8])
two_col_with_one_url_only_2_expected = [
{"texkey": "Penrose:1964wq"},
{"texkey": "Penrose:1969pc"},
{"texkey": "Hawking:1976ra"},
{"texkey": "wald2001thermodynamics"},
{"texkey": "abbott2016observation"},
{"texkey": "isi2021testing"},
{"texkey": "isi2019testing"},
{"texkey": "PhysRevD.7.2333"},
{"texkey": "PhysRevLett.30.71"},
{"texkey": "bardeen1973four"},
{"texkey": "wald1994quantum"},
{"texkey": "abbott2021gwtc", "urls": {"http://arxiv.org/abs/2111.03606"}},
{"texkey": "cabero2018observational"},
{"texkey": "samples2"},
{"texkey": "scientific2016tests"},
{"texkey": "lalsimulation"},
{"texkey": "abbott2021gwtc1"},
{"texkey": "samples1"},
{"texkey": "schutz1999gravitational"},
{"texkey": "martynov2016sensitivity"},
{"texkey": "bersanetti2021advanced"},
{"texkey": "kagra2019kagra"},
{"texkey": "shaddock2008space"},
{"texkey": "hu2017taiji"},
{"texkey": "ruan2020taiji"},
{"texkey": "luo2020brief"},
{"texkey": "luo2016tianqin"},
{"texkey": "Gong:2021gvw"},
{"texkey": "amaro2018relativistic"},
{"texkey": "berry2013observing"},
{"texkey": "babak2017science"},
{"texkey": "berry2013expectations"},
{"texkey": "gwosc"},
]

assert two_col_with_one_url_only_2 == two_col_with_one_url_only_2_expected


def test_extract_texkeys_and_urls_from_pdf_no_crash_on_incomplete_dest_coordinates(
pdf_files,
Expand All @@ -814,6 +923,6 @@ def test_extract_texkeys_from_pdf_no_crash_on_pydpf2_error(pdf_files):

def test_extract_texkeys_from_pdf_no_crash_on_other_exceptions(pdf_files):
expected = []
result = extract_texkeys_and_urls_from_pdf(pdf_files[6])
result = extract_texkeys_and_urls_from_pdf(pdf_files[9])

assert result == expected

0 comments on commit ef676a2

Please sign in to comment.