Skip to content

Commit

Permalink
Don't separate temporally into multiple files by default (#190)
Browse files Browse the repository at this point in the history
* Make temporal grouping optional, defaulting to false

* Adapt tests to non-default grouping

* Separate plot and save methods tests

* Don't order parameters alphabetically in YAML files

* Linting

* Trigger dask progress bar internally

* Improve notebook text
  • Loading branch information
NoraLoose authored Nov 5, 2024
1 parent 2e62bc6 commit b5cc382
Show file tree
Hide file tree
Showing 14 changed files with 2,848 additions and 3,533 deletions.
3,338 changes: 1,839 additions & 1,499 deletions docs/boundary_forcing.ipynb

Large diffs are not rendered by default.

72 changes: 36 additions & 36 deletions docs/grid.ipynb

Large diffs are not rendered by default.

511 changes: 252 additions & 259 deletions docs/initial_conditions.ipynb

Large diffs are not rendered by default.

1,663 changes: 257 additions & 1,406 deletions docs/surface_forcing.ipynb

Large diffs are not rendered by default.

490 changes: 249 additions & 241 deletions docs/tides.ipynb

Large diffs are not rendered by default.

54 changes: 38 additions & 16 deletions roms_tools/setup/boundary_forcing.py
Original file line number Diff line number Diff line change
Expand Up @@ -649,7 +649,14 @@ def plot(
if var_name not in self.ds:
raise ValueError(f"Variable '{var_name}' is not found in dataset.")

field = self.ds[var_name].isel(bry_time=time).load()
field = self.ds[var_name].isel(bry_time=time)

if self.use_dask:
from dask.diagnostics import ProgressBar

with ProgressBar():
field = field.load()

title = field.long_name

if "s_rho" in field.dims:
Expand Down Expand Up @@ -699,24 +706,26 @@ def plot(
_line_plot(field, title=title)

def save(
self, filepath: Union[str, Path], np_eta: int = None, np_xi: int = None
self,
filepath: Union[str, Path],
np_eta: int = None,
np_xi: int = None,
group: bool = False,
) -> None:
"""Save the boundary forcing fields to netCDF4 files.
This method saves the dataset by grouping it into subsets based on the data frequency. The subsets are then written
to one or more netCDF4 files. The filenames of the output files reflect the temporal coverage of the data.
"""Save the boundary forcing fields to one or more netCDF4 files.
There are two modes of saving the dataset:
This method saves the dataset either as a single file or as multiple files depending on the partitioning and grouping options.
The dataset can be saved in two modes:
1. **Single File Mode (default)**:
1. **Single File Mode (default)**:
- If both `np_eta` and `np_xi` are `None`, the entire dataset is saved as a single netCDF4 file.
- The file is named based on the `filepath`, with `.nc` automatically appended.
If both `np_eta` and `np_xi` are `None`, the entire dataset, divided by temporal subsets, is saved as a single netCDF4 file
with the base filename specified by `filepath.nc`.
2. **Partitioned Mode**:
- If either `np_eta` or `np_xi` is specified, the dataset is partitioned into spatial tiles along the `eta` and `xi` axes.
- Each tile is saved as a separate netCDF4 file, and filenames are modified with an index (e.g., `"filepath_YYYYMM.0.nc"`, `"filepath_YYYYMM.1.nc"`).
2. **Partitioned Mode**:
- If either `np_eta` or `np_xi` is specified, the dataset is divided into spatial tiles along the eta-axis and xi-axis.
- Each spatial tile is saved as a separate netCDF4 file.
Additionally, if `group` is set to `True`, the dataset is first grouped into temporal subsets, resulting in multiple grouped files before partitioning and saving.
Parameters
----------
Expand All @@ -728,6 +737,8 @@ def save(
The number of partitions along the `eta` direction. If `None`, no spatial partitioning is performed.
np_xi : int, optional
The number of partitions along the `xi` direction. If `None`, no spatial partitioning is performed.
group: bool, optional
If `True`, groups the dataset into multiple files based on temporal data frequency. Defaults to `False`.
Returns
-------
Expand All @@ -742,7 +753,18 @@ def save(
if filepath.suffix == ".nc":
filepath = filepath.with_suffix("")

dataset_list, output_filenames = group_dataset(self.ds.load(), str(filepath))
if self.use_dask:
from dask.diagnostics import ProgressBar

with ProgressBar():
self.ds.load()

if group:
dataset_list, output_filenames = group_dataset(self.ds, str(filepath))
else:
dataset_list = [self.ds]
output_filenames = [str(filepath)]

saved_filenames = save_datasets(
dataset_list, output_filenames, np_eta=np_eta, np_xi=np_xi
)
Expand Down Expand Up @@ -796,7 +818,7 @@ def to_yaml(self, filepath: Union[str, Path]) -> None:
# Write header
file.write(header)
# Write YAML data
yaml.dump(yaml_data, file, default_flow_style=False)
yaml.dump(yaml_data, file, default_flow_style=False, sort_keys=False)

@classmethod
def from_yaml(
Expand Down
2 changes: 1 addition & 1 deletion roms_tools/setup/grid.py
Original file line number Diff line number Diff line change
Expand Up @@ -679,7 +679,7 @@ def to_yaml(self, filepath: Union[str, Path]) -> None:
# Write header
file.write(header)
# Write YAML data
yaml.dump(yaml_data, file, default_flow_style=False)
yaml.dump(yaml_data, file, default_flow_style=False, sort_keys=False)

@classmethod
def from_yaml(cls, filepath: Union[str, Path]) -> "Grid":
Expand Down
24 changes: 19 additions & 5 deletions roms_tools/setup/initial_conditions.py
Original file line number Diff line number Diff line change
Expand Up @@ -518,7 +518,12 @@ def plot(
):
raise ValueError("For 2D fields, specify either eta or xi, not both.")

self.ds[var_name].load()
if self.use_dask:
from dask.diagnostics import ProgressBar

with ProgressBar():
self.ds[var_name].load()

field = self.ds[var_name].squeeze()

if all(dim in field.dims for dim in ["eta_rho", "xi_rho"]):
Expand Down Expand Up @@ -681,7 +686,13 @@ def save(
if filepath.suffix == ".nc":
filepath = filepath.with_suffix("")

dataset_list = [self.ds.load()]
if self.use_dask:
from dask.diagnostics import ProgressBar

with ProgressBar():
self.ds.load()

dataset_list = [self.ds]
output_filenames = [str(filepath)]

saved_filenames = save_datasets(
Expand Down Expand Up @@ -719,15 +730,18 @@ def to_yaml(self, filepath: Union[str, Path]) -> None:

initial_conditions_data = {
"InitialConditions": {
"source": self.source,
"ini_time": self.ini_time.isoformat(),
"model_reference_date": self.model_reference_date.isoformat(),
"source": self.source,
}
}
# Include bgc_source if it's not None
if self.bgc_source is not None:
initial_conditions_data["InitialConditions"]["bgc_source"] = self.bgc_source

initial_conditions_data["InitialConditions"][
"model_reference_date"
] = self.model_reference_date.isoformat()

yaml_data = {
**grid_yaml_data,
**initial_conditions_data,
Expand All @@ -737,7 +751,7 @@ def to_yaml(self, filepath: Union[str, Path]) -> None:
# Write header
file.write(header)
# Write YAML data
yaml.dump(yaml_data, file, default_flow_style=False)
yaml.dump(yaml_data, file, default_flow_style=False, sort_keys=False)

@classmethod
def from_yaml(
Expand Down
53 changes: 37 additions & 16 deletions roms_tools/setup/surface_forcing.py
Original file line number Diff line number Diff line change
Expand Up @@ -462,7 +462,13 @@ def plot(self, var_name, time=0) -> None:
if var_name not in self.ds:
raise ValueError(f"Variable '{var_name}' is not found in dataset.")

field = self.ds[var_name].isel(time=time).load()
field = self.ds[var_name].isel(time=time)
if self.use_dask:
from dask.diagnostics import ProgressBar

with ProgressBar():
field = field.load()

title = field.long_name

# assign lat / lon
Expand Down Expand Up @@ -502,24 +508,26 @@ def plot(self, var_name, time=0) -> None:
)

def save(
self, filepath: Union[str, Path], np_eta: int = None, np_xi: int = None
self,
filepath: Union[str, Path],
np_eta: int = None,
np_xi: int = None,
group: bool = False,
) -> None:
"""Save the surface forcing fields to netCDF4 files.
This method saves the dataset by grouping it into subsets based on the data frequency. The subsets are then written
to one or more netCDF4 files. The filenames of the output files reflect the temporal coverage of the data.
There are two modes of saving the dataset:
"""Save the surface forcing fields to one or more netCDF4 files.
1. **Single File Mode (default)**:
This method saves the dataset either as a single file or as multiple files depending on the partitioning and grouping options.
The dataset can be saved in two modes:
If both `np_eta` and `np_xi` are `None`, the entire dataset, divided by temporal subsets, is saved as a single netCDF4 file
with the base filename specified by `filepath.nc`.
1. **Single File Mode (default)**:
- If both `np_eta` and `np_xi` are `None`, the entire dataset is saved as a single netCDF4 file.
- The file is named based on the `filepath`, with `.nc` automatically appended.
2. **Partitioned Mode**:
2. **Partitioned Mode**:
- If either `np_eta` or `np_xi` is specified, the dataset is partitioned into spatial tiles along the `eta` and `xi` axes.
- Each tile is saved as a separate netCDF4 file, and filenames are modified with an index (e.g., `"filepath_YYYYMM.0.nc"`, `"filepath_YYYYMM.1.nc"`).
- If either `np_eta` or `np_xi` is specified, the dataset is divided into spatial tiles along the eta-axis and xi-axis.
- Each spatial tile is saved as a separate netCDF4 file.
Additionally, if `group` is set to `True`, the dataset is first grouped into temporal subsets, resulting in multiple grouped files before partitioning and saving.
Parameters
----------
Expand All @@ -531,6 +539,8 @@ def save(
The number of partitions along the `eta` direction. If `None`, no spatial partitioning is performed.
np_xi : int, optional
The number of partitions along the `xi` direction. If `None`, no spatial partitioning is performed.
group: bool, optional
If `True`, groups the dataset into multiple files based on temporal data frequency. Defaults to `False`.
Returns
-------
Expand All @@ -545,7 +555,18 @@ def save(
if filepath.suffix == ".nc":
filepath = filepath.with_suffix("")

dataset_list, output_filenames = group_dataset(self.ds.load(), str(filepath))
if self.use_dask:
from dask.diagnostics import ProgressBar

with ProgressBar():
self.ds.load()

if group:
dataset_list, output_filenames = group_dataset(self.ds, str(filepath))
else:
dataset_list = [self.ds]
output_filenames = [str(filepath)]

saved_filenames = save_datasets(
dataset_list, output_filenames, np_eta=np_eta, np_xi=np_xi
)
Expand Down Expand Up @@ -603,7 +624,7 @@ def to_yaml(self, filepath: Union[str, Path]) -> None:
# Write header
file.write(header)
# Write YAML data
yaml.dump(yaml_data, file, default_flow_style=False)
yaml.dump(yaml_data, file, default_flow_style=False, sort_keys=False)

@classmethod
def from_yaml(
Expand Down
21 changes: 17 additions & 4 deletions roms_tools/setup/tides.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,7 +300,14 @@ def plot(self, var_name, ntides=0) -> None:
>>> tidal_forcing.plot("ssh_Re", nc=0)
"""

field = self.ds[var_name].isel(ntides=ntides).compute()
field = self.ds[var_name].isel(ntides=ntides)

if self.use_dask:
from dask.diagnostics import ProgressBar

with ProgressBar():
field = field.load()

if all(dim in field.dims for dim in ["eta_rho", "xi_rho"]):
field = field.where(self.grid.ds.mask_rho)
field = field.assign_coords(
Expand Down Expand Up @@ -378,7 +385,13 @@ def save(
if filepath.suffix == ".nc":
filepath = filepath.with_suffix("")

dataset_list = [self.ds.load()]
if self.use_dask:
from dask.diagnostics import ProgressBar

with ProgressBar():
self.ds.load()

dataset_list = [self.ds]
output_filenames = [str(filepath)]

saved_filenames = save_datasets(
Expand Down Expand Up @@ -419,8 +432,8 @@ def to_yaml(self, filepath: Union[str, Path]) -> None:
"TidalForcing": {
"source": self.source,
"ntides": self.ntides,
"model_reference_date": self.model_reference_date.isoformat(),
"allan_factor": self.allan_factor,
"model_reference_date": self.model_reference_date.isoformat(),
}
}

Expand All @@ -431,7 +444,7 @@ def to_yaml(self, filepath: Union[str, Path]) -> None:
# Write header
file.write(header)
# Write YAML data
yaml.dump(yaml_data, file, default_flow_style=False)
yaml.dump(yaml_data, file, default_flow_style=False, sort_keys=False)

@classmethod
def from_yaml(
Expand Down
Loading

0 comments on commit b5cc382

Please sign in to comment.