-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
move record counting scripts to src/utils
- Loading branch information
1 parent
872eb5f
commit 2306819
Showing
5 changed files
with
50 additions
and
21 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
# Cleaned links

`joined.csv` contains date info as well, which the individual files don't — they might be older.
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
import argparse | ||
import pandas as pd | ||
|
||
def main(path): | ||
df = pd.read_csv(path) | ||
for e_r in sorted(df.eprints_repo.unique()): | ||
print("Repo:", e_r) | ||
df_temp = df[df.eprints_repo == e_r] | ||
print("Links:", len(df_temp.domain_url.notna())) | ||
print("Pattern cleaned links:", len(df_temp.pattern_cleaned_url.notna())) | ||
print("User cleaned links:", len(df_temp.github_user_cleaned_url.notna())) | ||
print("Unique links:", len(df_temp.github_user_cleaned_url.unique())) | ||
print("\n") | ||
|
||
if __name__ == "__main__": | ||
parser = argparse.ArgumentParser( | ||
prog="count_entries_in_cleaned_links", | ||
description="For each ePrints repository, report the number of non-null entries in the relevant columns." | ||
) | ||
parser.add_argument("-f", "--file", type=str, | ||
help="CSV file containing data from all ePrints repositories", | ||
default="../../data/raw/eprints/cleaned_links/joined.csv") | ||
args = parser.parse_args() | ||
main(args.file) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
import os | ||
import argparse | ||
import pandas as pd | ||
|
||
def main(dirpath): | ||
for f_path in sorted(os.listdir(dirpath)): | ||
if f_path.endswith(".csv"): | ||
print("File:", f_path) | ||
df = pd.read_csv(os.path.join(dirpath, f_path)) | ||
print("Entries:", len(df)) | ||
print("PDFs:", len(df[df['pdf_url'].str.endswith(".pdf")])) | ||
print("\n") | ||
|
||
if __name__ == "__main__": | ||
parser = argparse.ArgumentParser( | ||
prog="count_entries_in_extracted_pub_urls", | ||
description="For each CSV file in the directory, report the number of non-null entries in the relevant columns." | ||
) | ||
parser.add_argument("-d", "--directory", type=str, | ||
help="Directory containing CSV files with publication URLs, one for each ePrints repository", | ||
default="../../data/raw/eprints/eprints_pub/") | ||
args = parser.parse_args() | ||
main(args.directory) |