From a67be2c17a2dd4f5c980933471a3c75186b9d655 Mon Sep 17 00:00:00 2001 From: Andrei Rusu Date: Mon, 4 Dec 2023 08:30:51 +0100 Subject: [PATCH] update the file integrity checker (#148) * update the file integrity checker to load metadata from the raw file and some other refactoring * fixed code style --- .../L0_unprocessed_data/raw_handler.py | 166 +++++++++++------- oceanstream/L0_unprocessed_data/template.py | 61 ------- .../L2_calibrated_data/processed_data_io.py | 10 +- oceanstream/L2_calibrated_data/template.py | 61 ------- .../applying_masks_handler.py | 16 +- oceanstream/L3_regridded_data/template.py | 61 ------- oceanstream/utils.py | 8 +- setup.py | 8 +- tests/conftest.py | 6 +- tests/test_processed_reader.py | 18 +- tests/test_raw_handler.py | 37 ++-- 11 files changed, 166 insertions(+), 286 deletions(-) delete mode 100644 oceanstream/L0_unprocessed_data/template.py delete mode 100644 oceanstream/L2_calibrated_data/template.py delete mode 100644 oceanstream/L3_regridded_data/template.py diff --git a/oceanstream/L0_unprocessed_data/raw_handler.py b/oceanstream/L0_unprocessed_data/raw_handler.py index 0aaf5a2..984fb88 100644 --- a/oceanstream/L0_unprocessed_data/raw_handler.py +++ b/oceanstream/L0_unprocessed_data/raw_handler.py @@ -24,10 +24,12 @@ # Import necessary libraries import os import re +import xml.etree.ElementTree as ET from datetime import datetime, timedelta from typing import Dict, List, Union import echopype as ep +from echopype.convert.utils.ek_raw_io import RawSimradFile SUPPORTED_SONAR_MODELS = ["EK60", "ES70", "EK80", "EA640", "AZFP", "AD2CP"] TIME_BETWEEN_FILES = 30 # time in minutes between two consecutive files @@ -164,86 +166,81 @@ def file_integrity_checking( Returns: - dict: A dictionary containing the following keys: - 'file_path': Absolute path to the file. - 'campaign_id': Identifier for the measuring\ - campaign extracted from the file name. - 'date': Date and time when the measurement started,\ - extracted from the file name. - 'sonar_model': Type of sonar that produced the file. - 'file_integrity': Boolean indicating if the file is readable by echopype. - 'use_swap': Applicable only for raw files.\ + 'file_path': Absolute path to the file. + 'campaign_id': Identifier for the measuring campaign, + extracted from the file name. + 'date': Date and time when the measurement started, + extracted from the file name. Returns None if the date + and time cannot be determined. + 'file_integrity': Boolean indicating if the file is of a supported type. + 'use_swap': Applicable only for raw files.\ A Boolean indicating whether the option was used when reading raw files or not. Raises: - - - Exception: If the file type is not supported or - if there are issues reading the file. + - Exception: If the file type is not supported. Example: - file_integrity_checking("/path/to/JR161-D20230509-T100645.raw") { - 'file_path': '/path/to/JR161-D20230509-T100645.raw', - 'campaign_id': 'JR161', - 'date': datetime.datetime(2023, 5, 9, 10, 6, 45), - 'sonar_model': 'EK60', - 'file_integrity': True - 'use_swap': False + 'file_path': '/path/to/JR161-D20230509-T100645.raw', + 'campaign_id': 'JR161', + 'date': datetime.datetime(2023, 5, 9, 10, 6, 45), + 'file_integrity': True } """ return_dict = {} + # get file name from path - file_name = os.path.split(file_path)[-1] - # eliminate file type - file_name = file_name.split(".")[0] - campaign_id = file_name.split("-")[0] - no_date_from_file_name = False - date = datetime.now() - try: - pattern_date = r"D(\d{4})(\d{2})(\d{2})" - pattern_time = r"T(\d{2})(\d{2})(\d{2})" + file_path = os.path.abspath(file_path) + _, file_name = os.path.split(file_path) + _, file_extension = os.path.splitext(file_path) + file_extension = file_extension.lower() + + if file_extension not in [".raw", ".nc", ".zarr"]: + raise Exception("File type not supported for " + str(file_path)) - matches_date = re.findall(pattern_date, file_name)[0] - matches_time = re.findall(pattern_time, file_name)[0] + file_integrity = True + metadata = None + sonar_model = None + date = None + campaign_id = None - year, month, day = matches_date - hour, minute, second = matches_time + if ".raw" == file_extension: + metadata = parse_metadata(file_path) + if metadata is not None: + campaign_id = metadata.get("survey_name", None) + date = metadata.get("timestamp", None) + sonar_model = detect_sonar_model(file_path, metadata=metadata) + + if not metadata: + campaign_id = file_name.split("-")[0] - datetime_string = f"D{year}{month}{day}-T{hour}{minute}{second}" - date = datetime.strptime(datetime_string, "D%Y%m%d-T%H%M%S") - except Exception as e: - e += "!" - no_date_from_file_name = True - - if ".raw" in file_path: - for s_m in SUPPORTED_SONAR_MODELS: - try: - ed = ep.open_raw(file_path, sonar_model=s_m, use_swap=use_swap) # type: ignore - file_integrity = True - break - except Exception: - continue - if not file_integrity: - raise Exception("File type not supported for " + str(file_path)) - elif ".nc" in file_path or ".zarr" in file_path: try: - ed = ep.open_converted(file_path) - file_integrity = True - except ValueError: - raise Exception("File type not supported for " + str(file_path)) - else: - raise Exception("File type not supported for " + str(file_path)) - if no_date_from_file_name: - datetime_string = ed["Top-level"].date_created - date = datetime.strptime(datetime_string, "%Y-%m-%dT%H:%M:%SZ") + pattern_date = r"D(\d{4})(\d{2})(\d{2})" + pattern_time = r"T(\d{2})(\d{2})(\d{2})" + + matches_date = re.findall(pattern_date, file_name)[0] + matches_time = re.findall(pattern_time, file_name)[0] + + year, month, day = matches_date + hour, minute, second = matches_time + + datetime_string = f"D{year}{month}{day}-T{hour}{minute}{second}" + date = datetime.strptime(datetime_string, "D%Y%m%d-T%H%M%S") + except Exception as e: + e += "!" + file_integrity = False + date = None return_dict["file_path"] = file_path return_dict["campaign_id"] = campaign_id return_dict["date"] = date - return_dict["sonar_model"] = ed.sonar_model return_dict["file_integrity"] = file_integrity - if ".raw" in file_path: + return_dict["sonar_model"] = sonar_model + + if ".raw" == file_extension: return_dict["use_swap"] = use_swap + return return_dict @@ -271,7 +268,7 @@ def read_raw_files( """ ret_list = [] for f_i in file_dicts: - opened_file = _read_file(f_i["file_path"], f_i["sonar_model"], f_i["use_swap"]) + opened_file = _read_file(file_path=f_i["file_path"]) ret_list.append(opened_file) return ret_list @@ -301,9 +298,7 @@ def read_processed_files(file_paths: List[str]) -> List[ep.echodata.EchoData]: return ret_list -def _read_file( - file_path: str, sonar_model: str = "EK80", use_swap: bool = False -) -> ep.echodata.EchoData: +def _read_file(file_path: str, sonar_model: str = None) -> ep.echodata.EchoData: """ Reads an echo sounder file and returns the corresponding Dataset. @@ -334,7 +329,10 @@ def _read_file( """ file_name = os.path.split(file_path)[-1] if ".raw" in file_name: - ed = ep.open_raw(file_path, sonar_model=sonar_model, use_swap=use_swap) # type: ignore + if sonar_model is None: + sonar_model = detect_sonar_model(file_path) + + ed = ep.open_raw(file_path, sonar_model=sonar_model) # type: ignore elif ".nc" in file_name or ".zarr" in file_name: ed = ep.open_converted(file_path) # create an EchoData object else: @@ -373,7 +371,7 @@ def convert_raw_files( """ ret_list = [] for f_i in file_dicts: - opened_file = _read_file(f_i["file_path"], f_i["sonar_model"], f_i["use_swap"]) + opened_file = _read_file(file_path=f_i["file_path"]) _write_file(opened_file, save_path, save_file_type) file_name = os.path.split(f_i["file_path"])[-1] file_type = save_file_type @@ -506,3 +504,41 @@ def concatenate_files( list_of_datasets.append(_read_file(file_info["file_path"])) combined_dataset = ep.combine_echodata(list_of_datasets) return combined_dataset + + +def parse_metadata(file_path): + try: + with RawSimradFile(file_path, "r", storage_options={}) as fid: + config_datagram = fid.read(1) + return config_datagram + except Exception as e: + print(f"Error parsing metadata from {file_path}. Error: {e}") + return None + + +def detect_sonar_model(file_path: str, metadata=None) -> str: + if metadata is None: + metadata = parse_metadata(file_path) + + if metadata is None: + return None + + if "sounder_name" not in metadata: + try: + xml_string = metadata.get("xml", None) + root = ET.fromstring(xml_string) + header = root.find("Header") + application_name = header.attrib.get("ApplicationName") + + if application_name == "EK80": + return "EK80" + + except ET.ParseError: + return None + + return None + + if metadata["sounder_name"] == "EK60" or metadata["sounder_name"] == "ER60": + return "EK60" + + return metadata["sounder_name"] diff --git a/oceanstream/L0_unprocessed_data/template.py b/oceanstream/L0_unprocessed_data/template.py deleted file mode 100644 index df89ea9..0000000 --- a/oceanstream/L0_unprocessed_data/template.py +++ /dev/null @@ -1,61 +0,0 @@ -""" -module_name_here.py -------------------------- -Description: Brief description of what this module does. - -""" - -# Import necessary libraries -import os -import sys - -# Constants (if any) -CONSTANT_NAME = "Constant Value" - - -def function_name_1(arg1, arg2): - """ - Brief description of the function. - - Parameters: - - arg1 (type): Description of arg1. - - arg2 (type): Description of arg2. - - Returns: - - type: Description of the return value. - - Example: - >>> function_name_1(value1, value2) - Expected Output - """ - # Function implementation - path = os.path.join(["this", "and", "that"]) - current_path = sys.path - return path, current_path - - -def function_name_2(arg1): - """ - Brief description of the function. - - Parameters: - - arg1 (type): Description of arg1. - - Returns: - - type: Description of the return value. - - Example: - >>> function_name_2(value1) - Expected Output - """ - # Function implementation - pass - - -# Additional functions... - - -if __name__ == "__main__": - # Code to be executed if this module is run as a standalone script - # For example, for testing purposes - pass diff --git a/oceanstream/L2_calibrated_data/processed_data_io.py b/oceanstream/L2_calibrated_data/processed_data_io.py index 194a42e..3c276b1 100644 --- a/oceanstream/L2_calibrated_data/processed_data_io.py +++ b/oceanstream/L2_calibrated_data/processed_data_io.py @@ -111,8 +111,14 @@ def write_processed( raise TypeError("Expected a xarray Dataset") path = Path(file_path) - if not path.is_dir(): - raise FileNotFoundError(f"Invalid path provided: {path}") + if not path.exists(): + try: + path.mkdir(parents=True, exist_ok=True) + except Exception as e: + raise Exception(f"An error occurred while creating the directory: {e}") + else: + if not path.is_dir(): + raise NotADirectoryError(f"Path exists but is not a directory: {path}") if not file_name: file_name = Path(sv.source_filenames[0].values.item()).stem diff --git a/oceanstream/L2_calibrated_data/template.py b/oceanstream/L2_calibrated_data/template.py deleted file mode 100644 index df89ea9..0000000 --- a/oceanstream/L2_calibrated_data/template.py +++ /dev/null @@ -1,61 +0,0 @@ -""" -module_name_here.py -------------------------- -Description: Brief description of what this module does. - -""" - -# Import necessary libraries -import os -import sys - -# Constants (if any) -CONSTANT_NAME = "Constant Value" - - -def function_name_1(arg1, arg2): - """ - Brief description of the function. - - Parameters: - - arg1 (type): Description of arg1. - - arg2 (type): Description of arg2. - - Returns: - - type: Description of the return value. - - Example: - >>> function_name_1(value1, value2) - Expected Output - """ - # Function implementation - path = os.path.join(["this", "and", "that"]) - current_path = sys.path - return path, current_path - - -def function_name_2(arg1): - """ - Brief description of the function. - - Parameters: - - arg1 (type): Description of arg1. - - Returns: - - type: Description of the return value. - - Example: - >>> function_name_2(value1) - Expected Output - """ - # Function implementation - pass - - -# Additional functions... - - -if __name__ == "__main__": - # Code to be executed if this module is run as a standalone script - # For example, for testing purposes - pass diff --git a/oceanstream/L3_regridded_data/applying_masks_handler.py b/oceanstream/L3_regridded_data/applying_masks_handler.py index 3bbd3a9..c0d93f9 100644 --- a/oceanstream/L3_regridded_data/applying_masks_handler.py +++ b/oceanstream/L3_regridded_data/applying_masks_handler.py @@ -86,7 +86,7 @@ def apply_selected_noise_masks_and_or_noise_removal( """ - valid_processes = valid_processes = [ + valid_processes = [ "mask_impulse", "mask_attenuation", "mask_transient", @@ -95,6 +95,13 @@ def apply_selected_noise_masks_and_or_noise_removal( "mask_seabed", ] + # Check for unexpected masks/processes + for process in processes_to_apply: + if process not in valid_processes: + raise ValueError( + "Unexpected mask/process. Please refer to the function documentation for valid masks/processes." + ) + for process in valid_processes: if process in processes_to_apply: params = processes_to_apply[process] @@ -110,13 +117,6 @@ def apply_selected_noise_masks_and_or_noise_removal( elif process == "remove_background_noise": ds = background_noise_remover.apply_remove_background_noise(ds, **params) - # Check for unexpected masks/processes - for process in processes_to_apply: - if process not in valid_processes: - raise ValueError( - "Unexpected mask/process. Please refer to the function documentation for valid masks/processes." - ) - return ds diff --git a/oceanstream/L3_regridded_data/template.py b/oceanstream/L3_regridded_data/template.py deleted file mode 100644 index df89ea9..0000000 --- a/oceanstream/L3_regridded_data/template.py +++ /dev/null @@ -1,61 +0,0 @@ -""" -module_name_here.py -------------------------- -Description: Brief description of what this module does. - -""" - -# Import necessary libraries -import os -import sys - -# Constants (if any) -CONSTANT_NAME = "Constant Value" - - -def function_name_1(arg1, arg2): - """ - Brief description of the function. - - Parameters: - - arg1 (type): Description of arg1. - - arg2 (type): Description of arg2. - - Returns: - - type: Description of the return value. - - Example: - >>> function_name_1(value1, value2) - Expected Output - """ - # Function implementation - path = os.path.join(["this", "and", "that"]) - current_path = sys.path - return path, current_path - - -def function_name_2(arg1): - """ - Brief description of the function. - - Parameters: - - arg1 (type): Description of arg1. - - Returns: - - type: Description of the return value. - - Example: - >>> function_name_2(value1) - Expected Output - """ - # Function implementation - pass - - -# Additional functions... - - -if __name__ == "__main__": - # Code to be executed if this module is run as a standalone script - # For example, for testing purposes - pass diff --git a/oceanstream/utils.py b/oceanstream/utils.py index 42171d4..2490078 100644 --- a/oceanstream/utils.py +++ b/oceanstream/utils.py @@ -33,13 +33,14 @@ def add_metadata_to_mask(mask: xr.Dataset, metadata: Dict) -> xr.Dataset: return mask -def attach_mask_to_dataset(Sv: xr.Dataset, mask: xr.Dataset) -> xr.Dataset: +def attach_mask_to_dataset(Sv: xr.Dataset, mask: xr.Dataset, mask_type: str = None) -> xr.Dataset: """ Attaches a mask to an existing Sv dataset, allowing the mask to travel in one data structure to the next module Parameters: - Sv (xarray.Dataset): dataset to attach a mask to - mask (xarray.Dataset): mask to be attached, with a mask_type attribute explaining what sort of mask it is + - mask_type (str): type of mask to be attached (optional, if not specified, the mask_type attribute of the mask) Returns: - xarray.Dataset: dataset enriched with the mask @@ -49,11 +50,14 @@ def attach_mask_to_dataset(Sv: xr.Dataset, mask: xr.Dataset) -> xr.Dataset: Expected Output: Sv with an extra variable containing the mask, named mask_[mask_type] """ - mask_type = mask.attrs["mask_type"] + if mask_type is None: + mask_type = mask.attrs["mask_type"] + mask_name = "mask_" + mask_type Sv_mask = Sv.assign(mask=mask) Sv_mask["mask"].attrs = mask.attrs Sv_mask = Sv_mask.rename({"mask": mask_name}) + return Sv_mask diff --git a/setup.py b/setup.py index fa291ee..66a61cf 100644 --- a/setup.py +++ b/setup.py @@ -6,14 +6,14 @@ setup( name="oceanstream", - version="0.1", # You can adjust the version number as needed + version="0.1", packages=find_packages(), install_requires=requirements, # Optional metadata - author="PineView AS", + author="Pine View Software AS", author_email="hello@pineview.io", - description="A brief description of the oceanstream package", + description="", license="MIT", - keywords="ocean stream processing", + keywords="oceanstream echosounder", url="https://github.com/OceanStreamIO/oceanstream", ) diff --git a/tests/conftest.py b/tests/conftest.py index 2280321..7531e61 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -130,9 +130,9 @@ def denoise_dataset(dataset): @pytest.fixture(scope="session") def ftp_data(): test_data_folder = Path(TEST_DATA_FOLDER) / "ek60" - with FTP(FTP_MAIN) as ftp: - ftp.login() # Add credentials if needed: ftp.login(user="username", passwd="password") - download_ftp_directory(ftp, FTP_PARTIAL_PATH, test_data_folder) + # with FTP(FTP_MAIN) as ftp: + # ftp.login() + # download_ftp_directory(ftp, FTP_PARTIAL_PATH, test_data_folder) yield str(test_data_folder) # Optional: Cleanup after tests are done # shutil.rmtree(TEST_DATA_FOLDER) diff --git a/tests/test_processed_reader.py b/tests/test_processed_reader.py index 29503ce..bebe49b 100644 --- a/tests/test_processed_reader.py +++ b/tests/test_processed_reader.py @@ -85,15 +85,15 @@ def test_invalid_dataset_input(): write_processed(invalid_input, file_path, file_name) -def test_invalid_directory_path(enriched_ek60_Sv): - sv_echopype_ek60 = enriched_ek60_Sv - # Use pathlib to create the path - invalid_path = Path("/path/that/does/not/exist") - file_name = "invalid_path_test.nc" - - path_pattern = re.escape(str(invalid_path)) - with pytest.raises(FileNotFoundError, match=f"Invalid path provided: {path_pattern}"): - write_processed(sv_echopype_ek60, invalid_path, file_name) +# def test_invalid_directory_path(enriched_ek60_Sv): +# sv_echopype_ek60 = enriched_ek60_Sv +# # Use pathlib to create the path +# invalid_path = Path("/path/that/does/not/exist") +# file_name = "invalid_path_test.nc" +# +# path_pattern = re.escape(str(invalid_path)) +# with pytest.raises(FileNotFoundError, match=f"Invalid path provided: {path_pattern}"): +# write_processed(sv_echopype_ek60, invalid_path, file_name) def test_unsupported_file_type(enriched_ek60_Sv): diff --git a/tests/test_raw_handler.py b/tests/test_raw_handler.py index 391cd1b..c83a2c7 100644 --- a/tests/test_raw_handler.py +++ b/tests/test_raw_handler.py @@ -1,5 +1,5 @@ import os - +import urllib.request import pytest from oceanstream.L0_unprocessed_data.raw_handler import ( @@ -9,6 +9,7 @@ read_processed_files, read_raw_files, split_files, + detect_sonar_model, ) from tests.conftest import TEST_DATA_FOLDER @@ -40,14 +41,7 @@ def test_file_integrity_checking(ftp_data): # Test with a valid raw echo sounder file result_files = file_integrity_checking(found_files[0]) assert result_files["file_integrity"] == True - assert result_files["sonar_model"] in [ - "EK60", - "ES70", - "EK80", - "EA640", - "AZFP", - "AD2CP", - ] + assert result_files["campaign_id"] == 'SignytoP2' # Test with a valid netCDF file valid_netcdf_file = convert_raw_files( @@ -117,7 +111,7 @@ def test_convert_raw_files(ftp_data): # Test with an unsupported save file type with pytest.raises( - Exception + Exception ): # Assuming the function raises an exception for unsupported file types convert_raw_files(file_dicts, save_path=TEST_DATA_FOLDER, save_file_type="unsupported") @@ -149,3 +143,26 @@ def test_split_files(ftp_data): # Test with an empty list with pytest.raises(Exception): grouped_files = split_files([]) + + +def test_detect_sonar_model_ek60(ftp_data): + # Test with a valid raw echo sounder file + found_files = file_finder(ftp_data, "raw") + sonar_model = detect_sonar_model(found_files[0]) + + assert sonar_model == "EK60" + + +def test_detect_sonar_model_ek80(ftp_data): + base_url = "https://noaa-wcsd-pds.s3.amazonaws.com/" + path = "data/raw/Sally_Ride/SR1611/EK80/" + file_name = "D20161108-T214612.raw" + + local_path = os.path.join(TEST_DATA_FOLDER, file_name) + if not os.path.isfile(local_path): + raw_file_address = base_url + path + file_name + urllib.request.urlretrieve(raw_file_address, local_path) + + sonar_model = detect_sonar_model(local_path) + + assert sonar_model == "EK80"