Skip to content

Commit

Permalink
move record counting scripts to src/utils
Browse files Browse the repository at this point in the history
  • Loading branch information
karacolada committed Jul 3, 2024
1 parent 872eb5f commit 2306819
Show file tree
Hide file tree
Showing 5 changed files with 50 additions and 21 deletions.
3 changes: 3 additions & 0 deletions data/raw/eprints/cleaned_links/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Cleaned links

`joined.csv` also contains date information, which the individual files do not — the individual files may therefore be older.
11 changes: 0 additions & 11 deletions data/raw/eprints/cleaned_links/counts.py

This file was deleted.

10 changes: 0 additions & 10 deletions data/raw/eprints/eprints_pub/counts.py

This file was deleted.

24 changes: 24 additions & 0 deletions src/utils/count_entries_in_cleaned_links.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import argparse
import pandas as pd

def main(path):
    """For each ePrints repository in the joined CSV at *path*, print how many
    rows have a non-null value in each of the link columns, plus the number of
    distinct user-cleaned URLs.

    Args:
        path: Path to a CSV with at least the columns ``eprints_repo``,
            ``domain_url``, ``pattern_cleaned_url`` and
            ``github_user_cleaned_url``.
    """
    df = pd.read_csv(path)
    for e_r in sorted(df.eprints_repo.unique()):
        print("Repo:", e_r)
        df_temp = df[df.eprints_repo == e_r]
        # .notna().sum() counts the non-null entries; the previous
        # len(series.notna()) was a bug — the boolean mask has the same
        # length as the column, so it always printed the total row count.
        print("Links:", df_temp.domain_url.notna().sum())
        print("Pattern cleaned links:", df_temp.pattern_cleaned_url.notna().sum())
        print("User cleaned links:", df_temp.github_user_cleaned_url.notna().sum())
        # Unchanged: counts distinct values, including NaN if present.
        print("Unique links:", len(df_temp.github_user_cleaned_url.unique()))
        print("\n")

if __name__ == "__main__":
    # CLI entry point: locate the joined CSV and run the per-repository report.
    cli = argparse.ArgumentParser(
        prog="count_entries_in_cleaned_links",
        description="For each ePrints repository, report the number of non-null entries in the relevant columns."
    )
    cli.add_argument(
        "-f", "--file",
        type=str,
        default="../../data/raw/eprints/cleaned_links/joined.csv",
        help="CSV file containing data from all ePrints repositories",
    )
    main(cli.parse_args().file)
23 changes: 23 additions & 0 deletions src/utils/count_entries_in_extracted_pub_urls.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import os
import argparse
import pandas as pd

def main(dirpath):
    """For every ``.csv`` file in *dirpath*, print the total number of rows
    and the number of rows whose ``pdf_url`` ends in ``.pdf``.

    Args:
        dirpath: Directory containing one CSV per ePrints repository; each
            CSV must have a ``pdf_url`` column.
    """
    for f_path in sorted(os.listdir(dirpath)):
        if f_path.endswith(".csv"):
            print("File:", f_path)
            df = pd.read_csv(os.path.join(dirpath, f_path))
            print("Entries:", len(df))
            # na=False makes missing pdf_url values count as non-PDF; without
            # it the NaN results make the boolean mask invalid for indexing.
            print("PDFs:", df["pdf_url"].str.endswith(".pdf", na=False).sum())
            print("\n")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        prog="count_entries_in_extracted_pub_urls",
        # Corrected description: this script reports row counts and PDF-link
        # counts per file; the old text ("non-null entries in the relevant
        # columns") was copied from a sibling script and did not match.
        description="For each CSV file in the directory, report the number of entries and the number of PDF links."
    )
    parser.add_argument("-d", "--directory", type=str,
                        help="Directory containing CSV files with publication URLs, one for each ePrints repository",
                        default="../../data/raw/eprints/eprints_pub/")
    args = parser.parse_args()
    main(args.directory)

0 comments on commit 2306819

Please sign in to comment.