diff --git a/src/REF2021_processing/process_submissions_and_results.py b/src/REF2021_processing/process_submissions_and_results.py index 606a145..c4e0e40 100644 --- a/src/REF2021_processing/process_submissions_and_results.py +++ b/src/REF2021_processing/process_submissions_and_results.py @@ -238,11 +238,12 @@ def preprocess_impacts(dset): return dset -def preprocess_sheet(source): +def preprocess_sheet(source, output_path): """Preprocess a sheet from the raw data. Args: sname (str): Name of the sheet to preprocess. + output_path (str): Path to save the pre-processed data. """ # set the input excel file name and index @@ -304,7 +305,7 @@ def preprocess_sheet(source): # set the index name and save the pre-processed data dset.index.name = "Record" rw.export_dataframe( - dset, os.path.join(rw.SOURCES["submissions"]["output_path"], sname), sname + dset, os.path.join(output_path, sname), sname ) @@ -336,14 +337,16 @@ def preprocess_sheet(source): if source_name == "results": SHEET_NAME = rw.SOURCES["results"]["sheet"] + OUTPUT_PATH = rw.SOURCES["results"]["output_path"] else: SHEET_NAME = rw.SOURCES["submissions"]["sheets"][source_name] + OUTPUT_PATH = rw.SOURCES["submissions"]["output_path"] STATUS = utils.setup_logger(SHEET_NAME, verbose=args.verbose) # run pre-processing if STATUS: - preprocess_sheet(source_name) + preprocess_sheet(source_name, OUTPUT_PATH) else: print(f"{utils.FAILED_ICON} failed: setup logger") diff --git a/src/REF2021_processing/read_write.py b/src/REF2021_processing/read_write.py index 2825d74..9080fa7 100644 --- a/src/REF2021_processing/read_write.py +++ b/src/REF2021_processing/read_write.py @@ -48,6 +48,7 @@ "filename": "REF-2021-Results-All-2022-05-06.xlsx", "header_index": 6, "sheet": "Results", + "output_path": "data/processed/sheets/" }, } diff --git a/tests/folders_and_files_test.py b/tests/folders_and_files_test.py index bc1aae8..093ce31 100644 --- a/tests/folders_and_files_test.py +++ b/tests/folders_and_files_test.py @@ -3,16 +3,97 @@ import REF2021_processing.read_write as rw -def test_log_folder_exists(): - """Test if the log folder exists.""" +# Submissions +# =========== +def test_process_submissions_logs_exist(): + """Test if the logs files for submissions exist.""" - fpath = rw.LOGS["path"] - assert os.path.exists(fpath), f"{fpath} does not exist" + for sheet in rw.SOURCES["submissions"]["sheets"].values(): + fpath = f"{rw.LOGS['path']}{sheet}{rw.LOGS['extension']}" + assert os.path.exists(fpath), f"{fpath} does not exist" -def test_process_submissions_logs_exist(): - """Test if the logs files for submissions exist.""" +def test_process_submissions_logs_empty(): + """Test if the logs files for submissions are empty.""" for sheet in rw.SOURCES["submissions"]["sheets"].values(): fpath = f"{rw.LOGS['path']}{sheet}{rw.LOGS['extension']}" + assert not os.path.getsize(fpath) == 0, f"{fpath} is empty" + + +def test_process_submissions_output_exist(): + """Test if the processed files for submissions exist.""" + + for sheet in rw.SOURCES["submissions"]["sheets"].values(): + fpath = ( + f"{rw.SOURCES['submissions']['output_path']}{sheet}{rw.OUTPUT_EXTENSION}" + ) assert os.path.exists(fpath), f"{fpath} does not exist" + + +def test_process_submissions_output_empty(): + """Test if the processed files for submissions are empty.""" + + for sheet in rw.SOURCES["submissions"]["sheets"].values(): + fpath = ( + f"{rw.SOURCES['submissions']['output_path']}{sheet}{rw.OUTPUT_EXTENSION}" + ) + assert not os.path.getsize(fpath) == 0, f"{fpath} is empty" + + +# Results +# ======= +# Results logs +def test_process_results_logs_exist(): + """Test if the logs files for results exist.""" + + sheet = rw.SOURCES["results"]["sheet"] + fpath = f"{rw.LOGS['path']}{sheet}{rw.LOGS['extension']}" + assert os.path.exists(fpath), f"{fpath} does not exist" + + +def test_process_results_logs_empty(): + """Test if the logs files for results are empty.""" + + sheet = rw.SOURCES["results"]["sheet"] + fpath = f"{rw.LOGS['path']}{sheet}{rw.LOGS['extension']}" + assert not os.path.getsize(fpath) == 0, f"{fpath} is empty" + + +# Results output +def test_process_results_output_exist(): + """Test if the processed files for results exist.""" + + fpath = ( + f"{rw.SOURCES['results']['output_path']}" + f"{rw.SOURCES['results']['sheet']}{rw.OUTPUT_EXTENSION}" + ) + assert os.path.exists(fpath), f"{fpath} does not exist" + + +def test_process_results_output_empty(): + """Test if the processed files for results are empty.""" + + fpath = ( + f"{rw.SOURCES['results']['output_path']}" + f"{rw.SOURCES['results']['sheet']}{rw.OUTPUT_EXTENSION}" + ) + assert not os.path.getsize(fpath) == 0, f"{fpath} is empty" + + +# Environment statements +# ======================= +def test_process_environment_statements_logs_exist(): + """Test if the logs files for environment statements exist.""" + + for _, config in rw.SOURCES["environment_statements"].items(): + fpath = f"{rw.LOGS['path']}{config['name']}{rw.LOGS['extension']}" + assert os.path.exists(fpath), f"{fpath} does not exist" + + +def test_process_environment_statements_logs_empty(): + """Test if the logs files for environment statements are empty.""" + + for _, config in rw.SOURCES["environment_statements"].items(): + fpath = f"{rw.LOGS['path']}{config['name']}{rw.LOGS['extension']}" + assert not os.path.getsize(fpath) == 0, f"{fpath} is empty"