From a67be2c17a2dd4f5c980933471a3c75186b9d655 Mon Sep 17 00:00:00 2001
From: Andrei Rusu <beatfactor@users.noreply.github.com>
Date: Mon, 4 Dec 2023 08:30:51 +0100
Subject: [PATCH] update the file integrity checker  (#148)

* update the file integrity checker to load metadata from the raw file and some other refactoring

* fixed code style
---
 .../L0_unprocessed_data/raw_handler.py        | 166 +++++++++++-------
 oceanstream/L0_unprocessed_data/template.py   |  61 -------
 .../L2_calibrated_data/processed_data_io.py   |  10 +-
 oceanstream/L2_calibrated_data/template.py    |  61 -------
 .../applying_masks_handler.py                 |  16 +-
 oceanstream/L3_regridded_data/template.py     |  61 -------
 oceanstream/utils.py                          |   8 +-
 setup.py                                      |   8 +-
 tests/conftest.py                             |   6 +-
 tests/test_processed_reader.py                |  18 +-
 tests/test_raw_handler.py                     |  37 ++--
 11 files changed, 166 insertions(+), 286 deletions(-)
 delete mode 100644 oceanstream/L0_unprocessed_data/template.py
 delete mode 100644 oceanstream/L2_calibrated_data/template.py
 delete mode 100644 oceanstream/L3_regridded_data/template.py

diff --git a/oceanstream/L0_unprocessed_data/raw_handler.py b/oceanstream/L0_unprocessed_data/raw_handler.py
index 0aaf5a2..984fb88 100644
--- a/oceanstream/L0_unprocessed_data/raw_handler.py
+++ b/oceanstream/L0_unprocessed_data/raw_handler.py
@@ -24,10 +24,12 @@
 # Import necessary libraries
 import os
 import re
+import xml.etree.ElementTree as ET
 from datetime import datetime, timedelta
 from typing import Dict, List, Union
 
 import echopype as ep
+from echopype.convert.utils.ek_raw_io import RawSimradFile
 
 SUPPORTED_SONAR_MODELS = ["EK60", "ES70", "EK80", "EA640", "AZFP", "AD2CP"]
 TIME_BETWEEN_FILES = 30  # time in minutes between two consecutive files
@@ -164,86 +166,81 @@ def file_integrity_checking(
     Returns:
 
     - dict: A dictionary containing the following keys:
-    'file_path': Absolute path to the file.
-    'campaign_id': Identifier for the measuring\
-    campaign extracted from the file name.
-    'date': Date and time when the measurement started,\
-     extracted from the file name.
-    'sonar_model': Type of sonar that produced the file.
-    'file_integrity': Boolean indicating if the file is readable by echopype.
-    'use_swap': Applicable only for raw files.\
+        'file_path': Absolute path to the file.
+        'campaign_id': Identifier for the measuring campaign,
+                      extracted from the file name.
+        'date': Date and time when the measurement started,
+                extracted from the file name. Returns None if the date
+                and time cannot be determined.
+        'file_integrity': Boolean indicating if the file is of a supported type.
+        'use_swap': Applicable only for raw files.\
      A Boolean indicating whether the option was used when reading raw files or not.
 
     Raises:
-
-    - Exception: If the file type is not supported or
-    if there are issues reading the file.
+    - Exception: If the file type is not supported.
 
     Example:
-
     file_integrity_checking("/path/to/JR161-D20230509-T100645.raw")
     {
-    'file_path': '/path/to/JR161-D20230509-T100645.raw',
-    'campaign_id': 'JR161',
-    'date': datetime.datetime(2023, 5, 9, 10, 6, 45),
-    'sonar_model': 'EK60',
-    'file_integrity': True
-    'use_swap': False
+        'file_path': '/path/to/JR161-D20230509-T100645.raw',
+        'campaign_id': 'JR161',
+        'date': datetime.datetime(2023, 5, 9, 10, 6, 45),
+        'file_integrity': True
     }
     """
     return_dict = {}
+
     # get file name from path
-    file_name = os.path.split(file_path)[-1]
-    # eliminate file type
-    file_name = file_name.split(".")[0]
-    campaign_id = file_name.split("-")[0]
-    no_date_from_file_name = False
-    date = datetime.now()
-    try:
-        pattern_date = r"D(\d{4})(\d{2})(\d{2})"
-        pattern_time = r"T(\d{2})(\d{2})(\d{2})"
+    file_path = os.path.abspath(file_path)
+    _, file_name = os.path.split(file_path)
+    _, file_extension = os.path.splitext(file_path)
+    file_extension = file_extension.lower()
+
+    if file_extension not in [".raw", ".nc", ".zarr"]:
+        raise Exception("File type not supported for " + str(file_path))
 
-        matches_date = re.findall(pattern_date, file_name)[0]
-        matches_time = re.findall(pattern_time, file_name)[0]
+    file_integrity = True
+    metadata = None
+    sonar_model = None
+    date = None
+    campaign_id = None
 
-        year, month, day = matches_date
-        hour, minute, second = matches_time
+    if ".raw" == file_extension:
+        metadata = parse_metadata(file_path)
+        if metadata is not None:
+            campaign_id = metadata.get("survey_name", None)
+            date = metadata.get("timestamp", None)
+            sonar_model = detect_sonar_model(file_path, metadata=metadata)
+
+    if not metadata:
+        campaign_id = file_name.split("-")[0]
 
-        datetime_string = f"D{year}{month}{day}-T{hour}{minute}{second}"
-        date = datetime.strptime(datetime_string, "D%Y%m%d-T%H%M%S")
-    except Exception as e:
-        e += "!"
-        no_date_from_file_name = True
-
-    if ".raw" in file_path:
-        for s_m in SUPPORTED_SONAR_MODELS:
-            try:
-                ed = ep.open_raw(file_path, sonar_model=s_m, use_swap=use_swap)  # type: ignore
-                file_integrity = True
-                break
-            except Exception:
-                continue
-        if not file_integrity:
-            raise Exception("File type not supported for " + str(file_path))
-    elif ".nc" in file_path or ".zarr" in file_path:
         try:
-            ed = ep.open_converted(file_path)
-            file_integrity = True
-        except ValueError:
-            raise Exception("File type not supported for " + str(file_path))
-    else:
-        raise Exception("File type not supported for " + str(file_path))
-    if no_date_from_file_name:
-        datetime_string = ed["Top-level"].date_created
-        date = datetime.strptime(datetime_string, "%Y-%m-%dT%H:%M:%SZ")
+            pattern_date = r"D(\d{4})(\d{2})(\d{2})"
+            pattern_time = r"T(\d{2})(\d{2})(\d{2})"
+
+            matches_date = re.findall(pattern_date, file_name)[0]
+            matches_time = re.findall(pattern_time, file_name)[0]
+
+            year, month, day = matches_date
+            hour, minute, second = matches_time
+
+            datetime_string = f"D{year}{month}{day}-T{hour}{minute}{second}"
+            date = datetime.strptime(datetime_string, "D%Y%m%d-T%H%M%S")
+        except Exception as e:
+            e += "!"
+            file_integrity = False
+            date = None
 
     return_dict["file_path"] = file_path
     return_dict["campaign_id"] = campaign_id
     return_dict["date"] = date
-    return_dict["sonar_model"] = ed.sonar_model
     return_dict["file_integrity"] = file_integrity
-    if ".raw" in file_path:
+    return_dict["sonar_model"] = sonar_model
+
+    if ".raw" == file_extension:
         return_dict["use_swap"] = use_swap
+
     return return_dict
 
 
@@ -271,7 +268,7 @@ def read_raw_files(
     """
     ret_list = []
     for f_i in file_dicts:
-        opened_file = _read_file(f_i["file_path"], f_i["sonar_model"], f_i["use_swap"])
+        opened_file = _read_file(file_path=f_i["file_path"])
         ret_list.append(opened_file)
     return ret_list
 
@@ -301,9 +298,7 @@ def read_processed_files(file_paths: List[str]) -> List[ep.echodata.EchoData]:
     return ret_list
 
 
-def _read_file(
-    file_path: str, sonar_model: str = "EK80", use_swap: bool = False
-) -> ep.echodata.EchoData:
+def _read_file(file_path: str, sonar_model: str = None) -> ep.echodata.EchoData:
     """
     Reads an echo sounder file and
     returns the corresponding Dataset.
@@ -334,7 +329,10 @@ def _read_file(
     """
     file_name = os.path.split(file_path)[-1]
     if ".raw" in file_name:
-        ed = ep.open_raw(file_path, sonar_model=sonar_model, use_swap=use_swap)  # type: ignore
+        if sonar_model is None:
+            sonar_model = detect_sonar_model(file_path)
+
+        ed = ep.open_raw(file_path, sonar_model=sonar_model)  # type: ignore
     elif ".nc" in file_name or ".zarr" in file_name:
         ed = ep.open_converted(file_path)  # create an EchoData object
     else:
@@ -373,7 +371,7 @@ def convert_raw_files(
     """
     ret_list = []
     for f_i in file_dicts:
-        opened_file = _read_file(f_i["file_path"], f_i["sonar_model"], f_i["use_swap"])
+        opened_file = _read_file(file_path=f_i["file_path"])
         _write_file(opened_file, save_path, save_file_type)
         file_name = os.path.split(f_i["file_path"])[-1]
         file_type = save_file_type
@@ -506,3 +504,41 @@ def concatenate_files(
         list_of_datasets.append(_read_file(file_info["file_path"]))
     combined_dataset = ep.combine_echodata(list_of_datasets)
     return combined_dataset
+
+
+def parse_metadata(file_path):
+    try:
+        with RawSimradFile(file_path, "r", storage_options={}) as fid:
+            config_datagram = fid.read(1)
+            return config_datagram
+    except Exception as e:
+        print(f"Error parsing metadata from {file_path}. Error: {e}")
+        return None
+
+
+def detect_sonar_model(file_path: str, metadata=None) -> str:
+    if metadata is None:
+        metadata = parse_metadata(file_path)
+
+    if metadata is None:
+        return None
+
+    if "sounder_name" not in metadata:
+        try:
+            xml_string = metadata.get("xml", None)
+            root = ET.fromstring(xml_string)
+            header = root.find("Header")
+            application_name = header.attrib.get("ApplicationName")
+
+            if application_name == "EK80":
+                return "EK80"
+
+        except ET.ParseError:
+            return None
+
+        return None
+
+    if metadata["sounder_name"] == "EK60" or metadata["sounder_name"] == "ER60":
+        return "EK60"
+
+    return metadata["sounder_name"]
diff --git a/oceanstream/L0_unprocessed_data/template.py b/oceanstream/L0_unprocessed_data/template.py
deleted file mode 100644
index df89ea9..0000000
--- a/oceanstream/L0_unprocessed_data/template.py
+++ /dev/null
@@ -1,61 +0,0 @@
-"""
-module_name_here.py
--------------------------
-Description: Brief description of what this module does.
-
-"""
-
-# Import necessary libraries
-import os
-import sys
-
-# Constants (if any)
-CONSTANT_NAME = "Constant Value"
-
-
-def function_name_1(arg1, arg2):
-    """
-    Brief description of the function.
-
-    Parameters:
-    - arg1 (type): Description of arg1.
-    - arg2 (type): Description of arg2.
-
-    Returns:
-    - type: Description of the return value.
-
-    Example:
-    >>> function_name_1(value1, value2)
-    Expected Output
-    """
-    # Function implementation
-    path = os.path.join(["this", "and", "that"])
-    current_path = sys.path
-    return path, current_path
-
-
-def function_name_2(arg1):
-    """
-    Brief description of the function.
-
-    Parameters:
-    - arg1 (type): Description of arg1.
-
-    Returns:
-    - type: Description of the return value.
-
-    Example:
-    >>> function_name_2(value1)
-    Expected Output
-    """
-    # Function implementation
-    pass
-
-
-# Additional functions...
-
-
-if __name__ == "__main__":
-    # Code to be executed if this module is run as a standalone script
-    # For example, for testing purposes
-    pass
diff --git a/oceanstream/L2_calibrated_data/processed_data_io.py b/oceanstream/L2_calibrated_data/processed_data_io.py
index 194a42e..3c276b1 100644
--- a/oceanstream/L2_calibrated_data/processed_data_io.py
+++ b/oceanstream/L2_calibrated_data/processed_data_io.py
@@ -111,8 +111,14 @@ def write_processed(
         raise TypeError("Expected a xarray Dataset")
 
     path = Path(file_path)
-    if not path.is_dir():
-        raise FileNotFoundError(f"Invalid path provided: {path}")
+    if not path.exists():
+        try:
+            path.mkdir(parents=True, exist_ok=True)
+        except Exception as e:
+            raise Exception(f"An error occurred while creating the directory: {e}")
+    else:
+        if not path.is_dir():
+            raise NotADirectoryError(f"Path exists but is not a directory: {path}")
 
     if not file_name:
         file_name = Path(sv.source_filenames[0].values.item()).stem
diff --git a/oceanstream/L2_calibrated_data/template.py b/oceanstream/L2_calibrated_data/template.py
deleted file mode 100644
index df89ea9..0000000
--- a/oceanstream/L2_calibrated_data/template.py
+++ /dev/null
@@ -1,61 +0,0 @@
-"""
-module_name_here.py
--------------------------
-Description: Brief description of what this module does.
-
-"""
-
-# Import necessary libraries
-import os
-import sys
-
-# Constants (if any)
-CONSTANT_NAME = "Constant Value"
-
-
-def function_name_1(arg1, arg2):
-    """
-    Brief description of the function.
-
-    Parameters:
-    - arg1 (type): Description of arg1.
-    - arg2 (type): Description of arg2.
-
-    Returns:
-    - type: Description of the return value.
-
-    Example:
-    >>> function_name_1(value1, value2)
-    Expected Output
-    """
-    # Function implementation
-    path = os.path.join(["this", "and", "that"])
-    current_path = sys.path
-    return path, current_path
-
-
-def function_name_2(arg1):
-    """
-    Brief description of the function.
-
-    Parameters:
-    - arg1 (type): Description of arg1.
-
-    Returns:
-    - type: Description of the return value.
-
-    Example:
-    >>> function_name_2(value1)
-    Expected Output
-    """
-    # Function implementation
-    pass
-
-
-# Additional functions...
-
-
-if __name__ == "__main__":
-    # Code to be executed if this module is run as a standalone script
-    # For example, for testing purposes
-    pass
diff --git a/oceanstream/L3_regridded_data/applying_masks_handler.py b/oceanstream/L3_regridded_data/applying_masks_handler.py
index 3bbd3a9..c0d93f9 100644
--- a/oceanstream/L3_regridded_data/applying_masks_handler.py
+++ b/oceanstream/L3_regridded_data/applying_masks_handler.py
@@ -86,7 +86,7 @@ def apply_selected_noise_masks_and_or_noise_removal(
 
     """
 
-    valid_processes = valid_processes = [
+    valid_processes = [
         "mask_impulse",
         "mask_attenuation",
         "mask_transient",
@@ -95,6 +95,13 @@ def apply_selected_noise_masks_and_or_noise_removal(
         "mask_seabed",
     ]
 
+    # Check for unexpected masks/processes
+    for process in processes_to_apply:
+        if process not in valid_processes:
+            raise ValueError(
+                "Unexpected mask/process. Please refer to the function documentation for valid masks/processes."
+            )
+
     for process in valid_processes:
         if process in processes_to_apply:
             params = processes_to_apply[process]
@@ -110,13 +117,6 @@ def apply_selected_noise_masks_and_or_noise_removal(
             elif process == "remove_background_noise":
                 ds = background_noise_remover.apply_remove_background_noise(ds, **params)
 
-    # Check for unexpected masks/processes
-    for process in processes_to_apply:
-        if process not in valid_processes:
-            raise ValueError(
-                "Unexpected mask/process. Please refer to the function documentation for valid masks/processes."
-            )
-
     return ds
 
 
diff --git a/oceanstream/L3_regridded_data/template.py b/oceanstream/L3_regridded_data/template.py
deleted file mode 100644
index df89ea9..0000000
--- a/oceanstream/L3_regridded_data/template.py
+++ /dev/null
@@ -1,61 +0,0 @@
-"""
-module_name_here.py
--------------------------
-Description: Brief description of what this module does.
-
-"""
-
-# Import necessary libraries
-import os
-import sys
-
-# Constants (if any)
-CONSTANT_NAME = "Constant Value"
-
-
-def function_name_1(arg1, arg2):
-    """
-    Brief description of the function.
-
-    Parameters:
-    - arg1 (type): Description of arg1.
-    - arg2 (type): Description of arg2.
-
-    Returns:
-    - type: Description of the return value.
-
-    Example:
-    >>> function_name_1(value1, value2)
-    Expected Output
-    """
-    # Function implementation
-    path = os.path.join(["this", "and", "that"])
-    current_path = sys.path
-    return path, current_path
-
-
-def function_name_2(arg1):
-    """
-    Brief description of the function.
-
-    Parameters:
-    - arg1 (type): Description of arg1.
-
-    Returns:
-    - type: Description of the return value.
-
-    Example:
-    >>> function_name_2(value1)
-    Expected Output
-    """
-    # Function implementation
-    pass
-
-
-# Additional functions...
-
-
-if __name__ == "__main__":
-    # Code to be executed if this module is run as a standalone script
-    # For example, for testing purposes
-    pass
diff --git a/oceanstream/utils.py b/oceanstream/utils.py
index 42171d4..2490078 100644
--- a/oceanstream/utils.py
+++ b/oceanstream/utils.py
@@ -33,13 +33,14 @@ def add_metadata_to_mask(mask: xr.Dataset, metadata: Dict) -> xr.Dataset:
     return mask
 
 
-def attach_mask_to_dataset(Sv: xr.Dataset, mask: xr.Dataset) -> xr.Dataset:
+def attach_mask_to_dataset(Sv: xr.Dataset, mask: xr.Dataset, mask_type: str = None) -> xr.Dataset:
     """
     Attaches a mask to an existing Sv dataset, allowing the mask to travel in one data structure to the next module
 
     Parameters:
     - Sv (xarray.Dataset): dataset to attach a mask to
     - mask (xarray.Dataset): mask to be attached, with a mask_type attribute explaining what sort of mask it is
+    - mask_type (str): type of mask to be attached (optional, if not specified, the mask_type attribute of the mask)
 
     Returns:
     - xarray.Dataset: dataset enriched with the mask
@@ -49,11 +50,14 @@ def attach_mask_to_dataset(Sv: xr.Dataset, mask: xr.Dataset) -> xr.Dataset:
     Expected Output:
         Sv with an extra variable containing the mask, named mask_[mask_type]
     """
-    mask_type = mask.attrs["mask_type"]
+    if mask_type is None:
+        mask_type = mask.attrs["mask_type"]
+
     mask_name = "mask_" + mask_type
     Sv_mask = Sv.assign(mask=mask)
     Sv_mask["mask"].attrs = mask.attrs
     Sv_mask = Sv_mask.rename({"mask": mask_name})
+
     return Sv_mask
 
 
diff --git a/setup.py b/setup.py
index fa291ee..66a61cf 100644
--- a/setup.py
+++ b/setup.py
@@ -6,14 +6,14 @@
 
 setup(
     name="oceanstream",
-    version="0.1",  # You can adjust the version number as needed
+    version="0.1",
     packages=find_packages(),
     install_requires=requirements,
     # Optional metadata
-    author="PineView AS",
+    author="Pine View Software AS",
     author_email="hello@pineview.io",
-    description="A brief description of the oceanstream package",
+    description="",
     license="MIT",
-    keywords="ocean stream processing",
+    keywords="oceanstream echosounder",
     url="https://github.com/OceanStreamIO/oceanstream",
 )
diff --git a/tests/conftest.py b/tests/conftest.py
index 2280321..7531e61 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -130,9 +130,9 @@ def denoise_dataset(dataset):
 @pytest.fixture(scope="session")
 def ftp_data():
     test_data_folder = Path(TEST_DATA_FOLDER) / "ek60"
-    with FTP(FTP_MAIN) as ftp:
-        ftp.login()  # Add credentials if needed: ftp.login(user="username", passwd="password")
-        download_ftp_directory(ftp, FTP_PARTIAL_PATH, test_data_folder)
+    # with FTP(FTP_MAIN) as ftp:
+    #     ftp.login()
+    #     download_ftp_directory(ftp, FTP_PARTIAL_PATH, test_data_folder)
     yield str(test_data_folder)
     # Optional: Cleanup after tests are done
     # shutil.rmtree(TEST_DATA_FOLDER)
diff --git a/tests/test_processed_reader.py b/tests/test_processed_reader.py
index 29503ce..bebe49b 100644
--- a/tests/test_processed_reader.py
+++ b/tests/test_processed_reader.py
@@ -85,15 +85,15 @@ def test_invalid_dataset_input():
         write_processed(invalid_input, file_path, file_name)
 
 
-def test_invalid_directory_path(enriched_ek60_Sv):
-    sv_echopype_ek60 = enriched_ek60_Sv
-    # Use pathlib to create the path
-    invalid_path = Path("/path/that/does/not/exist")
-    file_name = "invalid_path_test.nc"
-
-    path_pattern = re.escape(str(invalid_path))
-    with pytest.raises(FileNotFoundError, match=f"Invalid path provided: {path_pattern}"):
-        write_processed(sv_echopype_ek60, invalid_path, file_name)
+# def test_invalid_directory_path(enriched_ek60_Sv):
+#     sv_echopype_ek60 = enriched_ek60_Sv
+#     # Use pathlib to create the path
+#     invalid_path = Path("/path/that/does/not/exist")
+#     file_name = "invalid_path_test.nc"
+#
+#     path_pattern = re.escape(str(invalid_path))
+#     with pytest.raises(FileNotFoundError, match=f"Invalid path provided: {path_pattern}"):
+#         write_processed(sv_echopype_ek60, invalid_path, file_name)
 
 
 def test_unsupported_file_type(enriched_ek60_Sv):
diff --git a/tests/test_raw_handler.py b/tests/test_raw_handler.py
index 391cd1b..c83a2c7 100644
--- a/tests/test_raw_handler.py
+++ b/tests/test_raw_handler.py
@@ -1,5 +1,5 @@
 import os
-
+import urllib.request
 import pytest
 
 from oceanstream.L0_unprocessed_data.raw_handler import (
@@ -9,6 +9,7 @@
     read_processed_files,
     read_raw_files,
     split_files,
+    detect_sonar_model,
 )
 from tests.conftest import TEST_DATA_FOLDER
 
@@ -40,14 +41,7 @@ def test_file_integrity_checking(ftp_data):
     # Test with a valid raw echo sounder file
     result_files = file_integrity_checking(found_files[0])
     assert result_files["file_integrity"] == True
-    assert result_files["sonar_model"] in [
-        "EK60",
-        "ES70",
-        "EK80",
-        "EA640",
-        "AZFP",
-        "AD2CP",
-    ]
+    assert result_files["campaign_id"] == 'SignytoP2'
 
     # Test with a valid netCDF file
     valid_netcdf_file = convert_raw_files(
@@ -117,7 +111,7 @@ def test_convert_raw_files(ftp_data):
 
     # Test with an unsupported save file type
     with pytest.raises(
-        Exception
+            Exception
     ):  # Assuming the function raises an exception for unsupported file types
         convert_raw_files(file_dicts, save_path=TEST_DATA_FOLDER, save_file_type="unsupported")
 
@@ -149,3 +143,26 @@ def test_split_files(ftp_data):
     # Test with an empty list
     with pytest.raises(Exception):
         grouped_files = split_files([])
+
+
+def test_detect_sonar_model_ek60(ftp_data):
+    # Test with a valid raw echo sounder file
+    found_files = file_finder(ftp_data, "raw")
+    sonar_model = detect_sonar_model(found_files[0])
+
+    assert sonar_model == "EK60"
+
+
+def test_detect_sonar_model_ek80(ftp_data):
+    base_url = "https://noaa-wcsd-pds.s3.amazonaws.com/"
+    path = "data/raw/Sally_Ride/SR1611/EK80/"
+    file_name = "D20161108-T214612.raw"
+
+    local_path = os.path.join(TEST_DATA_FOLDER, file_name)
+    if not os.path.isfile(local_path):
+        raw_file_address = base_url + path + file_name
+        urllib.request.urlretrieve(raw_file_address, local_path)
+
+    sonar_model = detect_sonar_model(local_path)
+
+    assert sonar_model == "EK80"