Skip to content

Commit

Permalink
move record counting scripts to src/utils
Browse files Browse the repository at this point in the history
  • Loading branch information
karacolada committed Jul 3, 2024
1 parent 872eb5f commit 2306819
Show file tree
Hide file tree
Showing 5 changed files with 50 additions and 21 deletions.
3 changes: 3 additions & 0 deletions data/raw/eprints/cleaned_links/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Cleaned links

`joined.csv` also contains date information, which the individual files do not — the individual files may therefore be older.
11 changes: 0 additions & 11 deletions data/raw/eprints/cleaned_links/counts.py

This file was deleted.

10 changes: 0 additions & 10 deletions data/raw/eprints/eprints_pub/counts.py

This file was deleted.

24 changes: 24 additions & 0 deletions src/utils/count_entries_in_cleaned_links.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import argparse
import pandas as pd

def main(path):
    """For each ePrints repository in the joined CSV at *path*, print how many
    rows have a non-null value in each of the link columns, plus the number of
    distinct user-cleaned URLs.

    Args:
        path: Path to a CSV with at least the columns ``eprints_repo``,
            ``domain_url``, ``pattern_cleaned_url`` and
            ``github_user_cleaned_url``.
    """
    df = pd.read_csv(path)
    for e_r in sorted(df.eprints_repo.unique()):
        print("Repo:", e_r)
        df_temp = df[df.eprints_repo == e_r]
        # .notna().sum() counts the non-null entries; the previous
        # len(series.notna()) was a bug — the boolean mask has the same
        # length as the column, so it always printed the total row count.
        print("Links:", df_temp.domain_url.notna().sum())
        print("Pattern cleaned links:", df_temp.pattern_cleaned_url.notna().sum())
        print("User cleaned links:", df_temp.github_user_cleaned_url.notna().sum())
        # Unchanged: counts distinct values, including NaN if present.
        print("Unique links:", len(df_temp.github_user_cleaned_url.unique()))
        print("\n")

if __name__ == "__main__":
    # CLI entry point: locate the joined CSV and run the per-repository report.
    cli = argparse.ArgumentParser(
        prog="count_entries_in_cleaned_links",
        description="For each ePrints repository, report the number of non-null entries in the relevant columns."
    )
    cli.add_argument(
        "-f", "--file",
        type=str,
        default="../../data/raw/eprints/cleaned_links/joined.csv",
        help="CSV file containing data from all ePrints repositories",
    )
    main(cli.parse_args().file)
23 changes: 23 additions & 0 deletions src/utils/count_entries_in_extracted_pub_urls.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import os
import argparse
import pandas as pd

def main(dirpath):
    """For every ``.csv`` file in *dirpath*, print the total number of rows
    and the number of rows whose ``pdf_url`` ends in ``.pdf``.

    Args:
        dirpath: Directory containing one CSV per ePrints repository; each
            CSV must have a ``pdf_url`` column.
    """
    for f_path in sorted(os.listdir(dirpath)):
        if f_path.endswith(".csv"):
            print("File:", f_path)
            df = pd.read_csv(os.path.join(dirpath, f_path))
            print("Entries:", len(df))
            # na=False makes missing pdf_url values count as non-PDF; without
            # it the NaN results make the boolean mask invalid for indexing.
            print("PDFs:", df["pdf_url"].str.endswith(".pdf", na=False).sum())
            print("\n")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        prog="count_entries_in_extracted_pub_urls",
        # Corrected description: this script reports row counts and PDF-link
        # counts per file; the old text ("non-null entries in the relevant
        # columns") was copied from a sibling script and did not match.
        description="For each CSV file in the directory, report the number of entries and the number of PDF links."
    )
    parser.add_argument("-d", "--directory", type=str,
                        help="Directory containing CSV files with publication URLs, one for each ePrints repository",
                        default="../../data/raw/eprints/eprints_pub/")
    args = parser.parse_args()
    main(args.directory)

0 comments on commit 2306819

Please sign in to comment.