diff --git a/requirements.txt b/requirements.txt index 32a573e..df56a9f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,12 +7,15 @@ charset-normalizer==3.3.2 conda-inject==1.3.1 ConfigArgParse==1.7 connection_pool==0.0.3 +cramjam==2.8.1 datrie==0.8.2 dill==0.3.8 docutils==0.20.1 dpath==2.1.6 et-xmlfile==1.1.0 fastjsonschema==2.19.1 +fastparquet==2023.10.1 +fsspec==2024.2.0 gitdb==4.0.11 GitPython==3.1.41 humanfriendly==10.0 diff --git a/src/REF2021_processing/read_write.py b/src/REF2021_processing/read_write.py index 7faca7a..c4b4eaa 100644 --- a/src/REF2021_processing/read_write.py +++ b/src/REF2021_processing/read_write.py @@ -44,8 +44,8 @@ }, "groups": { "records": 2036, - } - } + }, + }, }, "environment_statements": { "unit": { @@ -54,6 +54,9 @@ "input_extension": ".txt", "name": "EnvironmentStatementsUnitLevel", "output_path": "data/processed/environment_statements/prepared/", + "tests": { + "records": 1874, + }, }, "institution": { "extracted_path": "data/processed/environment_statements/extracted/institution/", @@ -61,6 +64,9 @@ "input_extension": ".txt", "prefix": "Institution environment statement - ", "output_path": "data/processed/environment_statements/prepared/", + "tests": { + "records": 143, + }, }, }, "results": { @@ -71,7 +77,7 @@ "output_path": "data/processed/sheets/", "tests": { "records": 1888, - } + }, }, } diff --git a/tests/submissions_results_data_test.py b/tests/submissions_results_data_test.py index bc09682..29d903f 100644 --- a/tests/submissions_results_data_test.py +++ b/tests/submissions_results_data_test.py @@ -4,7 +4,7 @@ import REF2021_processing.read_write as rw -def test_processed_submissions_record_numbers(): +def test_processed_submissions_records(): """Test if the processed submissions files have the expected number of records.""" source = "submissions" @@ -21,7 +21,7 @@ def test_processed_submissions_record_numbers(): ), f"{sheet_name}: {records} records, expected {expected_records}" -def test_processed_results_record_numbers(): +def test_processed_results_records(): """Test if the processed results file has the expected number of records.""" source = "results" @@ -36,3 +36,43 @@ def test_processed_results_record_numbers(): assert ( records == expected_records ), f"{sheet_name}: {records} records, expected {expected_records}" + + +def test_processed_institution_environment_statements_records(): + """Test if the processed institution environment statements + file has the expected number of records. + """ + + source = "environment_statements" + level = "institution" + level_name = rw.SOURCES[source][level]["name"] + fpath = os.path.join( + rw.PROJECT_PATH, + f"{rw.SOURCES[source][level]['output_path']}{level_name}{rw.OUTPUT_EXTENSION}", + ) + pf = ParquetFile(fpath) + records = pf.count() + expected_records = rw.SOURCES[source][level]["tests"]["records"] + assert ( + records == expected_records + ), f"{level_name}: {records} records, expected {expected_records}" + + +def test_processed_unit_environment_statements_records(): + """Test if the processed unit environment statements + file has the expected number of records. + """ + + source = "environment_statements" + level = "unit" + level_name = rw.SOURCES[source][level]["name"] + fpath = os.path.join( + rw.PROJECT_PATH, + f"{rw.SOURCES[source][level]['output_path']}{level_name}{rw.OUTPUT_EXTENSION}", + ) + pf = ParquetFile(fpath) + records = pf.count() + expected_records = rw.SOURCES[source][level]["tests"]["records"] + assert ( + records == expected_records + ), f"{level_name}: {records} records, expected {expected_records}"