From 78a859fb107afbf1a4f678ba623923a31db6b9bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CRobHanna-NOAA=E2=80=9D?= <“Robert.Hanna@NOAA.gov”> Date: Fri, 10 May 2024 20:31:16 +0000 Subject: [PATCH 1/9] update code --- .../ble/ble_benchmark/create_ble_benchmark.py | 105 +++++++++++++++++- src/utils/shared_validators.py | 35 ++++++ 2 files changed, 135 insertions(+), 5 deletions(-) diff --git a/data/ble/ble_benchmark/create_ble_benchmark.py b/data/ble/ble_benchmark/create_ble_benchmark.py index 1bbd34bb0..f2caa6250 100644 --- a/data/ble/ble_benchmark/create_ble_benchmark.py +++ b/data/ble/ble_benchmark/create_ble_benchmark.py @@ -1,11 +1,14 @@ #!/usr/bin/env python3 import argparse +import datetime as dt +import logging import os from datetime import datetime from glob import glob from io import BytesIO from urllib.request import urlopen +from urllib.parse import urlsplit from zipfile import ZipFile import numpy as np @@ -13,6 +16,28 @@ from create_flow_forecast_file import create_flow_forecast_file from osgeo import gdal from preprocess_benchmark import preprocess_benchmark_static +from src.utils.shared_validators import is_valid_huc + + +def __setup_logger(output_folder_path): + start_time = datetime.now() + file_dt_string = start_time.strftime("%Y_%m_%d-%H_%M_%S") + log_file_name = f"create_ble_benchmark-{file_dt_string}.log" + + log_file_path = os.path.join(output_folder_path, log_file_name) + + file_handler = logging.FileHandler(log_file_path) + file_handler.setLevel(logging.INFO) + console_handler = logging.StreamHandler() + console_handler.setLevel(logging.DEBUG) + + logger = logging.getLogger() + logger.addHandler(file_handler) + logger.addHandler(console_handler) + logger.setLevel(logging.DEBUG) + + logging.info(f'Started : {start_time.strftime("%m/%d/%Y %H:%M:%S")}') + logging.info("----------------") def create_ble_benchmark( @@ -59,12 +84,22 @@ def create_ble_benchmark( if not os.path.exists(save_folder): os.makedirs(save_folder) + __setup_logger(save_folder) + + start_time = dt.datetime.now(dt.timezone.utc) + + print("==================================") + logging.info("Starting load of BLE benchmark data") + logging.info(f"Start time: {start_time.strftime('%m/%d/%Y %H:%M:%S')}") + print() + + # EBFE_urls_20230608.xlsx acquired from FEMA (fethomps@usgs.gov) ext = os.path.splitext(input_file)[1] if ext == '.xlsx': - data = pd.read_excel(input_file, header=None, names=['size', 'units', 'URL']) + data = pd.read_excel(input_file, header=None, names=['Size', 'Units', 'URL']) elif ext == '.csv': - data = pd.read_csv(input_file, header=None, names=['size', 'units', 'URL']) + data = pd.read_csv(input_file, header=None, names=['Size', 'Units', 'URL']) # Subset Spatial Data URLs spatial_df = data[data['URL'].str.contains('SpatialData')] @@ -78,8 +113,13 @@ def create_ble_benchmark( spatial_df, ble_geodatabase = download_and_extract_rasters(spatial_df, save_folder) + print() + logging.info ("=======================================") for i, row in spatial_df.iterrows(): huc = row['HUC'] + logging.info("++++++++++++++++++++++++++") + logging.info(f"HUC is {huc}") + # reference_raster is used to set the metadata for benchmark_raster reference_raster = os.path.join(reference_folder, f'{huc}/branches/0/rem_zeroed_masked_0.tif') for benchmark_raster in row['rasters']: @@ -92,8 +132,10 @@ def create_ble_benchmark( out_raster_path = os.path.join(out_raster_dir, f"ble_huc_{huc}_extent_{magnitude}.tif") # Make benchmark inundation raster + logging.info(f" {huc} : magnitude : {magnitude} : 
preprocessing stats") preprocess_benchmark_static(benchmark_raster, reference_raster, out_raster_path) + logging.info(f" {huc} : magnitude : {magnitude} : creating flow file") create_flow_forecast_file( huc, ble_geodatabase, @@ -104,6 +146,25 @@ def create_ble_benchmark( nwm_feature_id_field, ) + # Get time metrics + end_time = dt.datetime.now(dt.timezone.utc) + logging.info("") + print("==================================") + logging.info("Complete") + logging.info(f" End time: {end_time.strftime('%m/%d/%Y %H:%M:%S')}") + + time_delta = end_time - start_time + total_seconds = int(time_delta.total_seconds()) + + ___, rem_seconds = divmod(total_seconds, 60 * 60 * 24) + total_hours, rem_seconds = divmod(rem_seconds, 60 * 60) + total_mins, seconds = divmod(rem_seconds, 60) + + time_fmt = f"{total_hours:02d} hours {total_mins:02d} mins {seconds:02d} secs" + + logging.info(f"Duration: {time_fmt}") + print() + def download_and_extract_rasters(spatial_df: pd.DataFrame, save_folder: str): """ @@ -131,20 +192,47 @@ def download_and_extract_rasters(spatial_df: pd.DataFrame, save_folder: str): huc_names = [] out_files = [] out_list = [] + + logging.info ("=======================================") + logging.info ("Extracting and downloading spatial data") + print() for i, row in spatial_df.iterrows(): # Extract HUC and HUC Name from URL url = row['URL'] + + if url is None: + continue + + logging.info("++++++++++++++++++++++++++") + logging.info(f"URL is {url}") + huc, huc_name = os.path.basename(os.path.dirname(url)).split('_') + + # Check to ensure it is a valid HUC (some are not) + is_ble_huc_valid, ___ = is_valid_huc(huc) + + if not is_ble_huc_valid: + logging.info(f"Invalid huc") + continue + + # logging.info + logging.info(f"{huc} : Downloading and extracting for {huc_name}") hucs.append(huc) huc_names.append(huc_name) - # Download and unzip file + # Download file + logging.info(f" {huc} : Unzipping") save_file = os.path.join(save_folder, os.path.basename(url)) if not os.path.exists(save_file): http_response = urlopen(url) zipfile = ZipFile(BytesIO(http_response.read())) zipfile.extractall(path=save_file) + else: + logging.info(f" {huc} : zip file already exists: {save_file}") + + # Loading gdb + logging.info(f" {huc} : Loading gdb") gdb_list = glob(os.path.join(save_file, '**', '*.gdb'), recursive=True) if len(gdb_list) == 1: @@ -158,13 +246,16 @@ def download_and_extract_rasters(spatial_df: pd.DataFrame, save_folder: str): if not os.path.exists(out_file): # Read raster data from GDB - print(f'Reading {depth_raster} for {huc}') + print(f'.. 
{huc} : Reading {depth_raster}')

                depth_raster_path = [item[0] for item in subdatasets if depth_raster in item[1]][0]

                extract_raster(depth_raster_path, out_file)

                out_list.append(out_file)

+        else:
+            logging.info(f"  {huc} : gdb_list does not equal 1")
+
     if len(out_list) > 0:
         out_files.append(out_list)

@@ -172,6 +263,10 @@ def download_and_extract_rasters(spatial_df: pd.DataFrame, save_folder: str):
     spatial_df['HUC_Name'] = huc_names
     spatial_df['rasters'] = out_files

+    print()
+    logging.info ("Extracting and downloading spatial data - COMPLETE")
+    print()
+
     return spatial_df, ble_geodatabase

@@ -208,7 +303,7 @@ def extract_raster(in_raster: str, out_raster: str):
     parser = argparse.ArgumentParser(
         description='Create BLE benchmark files',
         usage='''python3 /foss_fim/data/ble/ble_benchmark/create_ble_benchmark.py
-            -i /data/inputs/ble/ble_benchmark/EBFE_urls_20230608.xlsx
+            -i /data/inputs/ble/ble_benchmark/EBFE_urls_20240416.xlsx
             -s /data/temp/ble_benchmark
             -r /data/outputs/my_run/
             -o /data/test_cases/ble_test_cases/validation_data_ble
             -n /data/inputs/nwm_hydrofabric/nwm_flows.gpkg
             -u 12030105
        ''',
    )
diff --git a/src/utils/shared_validators.py b/src/utils/shared_validators.py
index b51251283..4a5417464 100644
--- a/src/utils/shared_validators.py
+++ b/src/utils/shared_validators.py
@@ -67,3 +67,38 @@ def is_valid_crs(crs):
         return False, err_msg, ""

     return True, "", crs_number
+
+
+# -------------------------------------------------
+def is_valid_huc(huc_number):
+    """
+    Returns:
+      - First is a bool (True or False). True means it is a valid HUC.
+      - Second is a string, which may be empty; if validation failed, it holds the reason for the failure.
+
+    It is up to the calling code to decide if an exception should be raised.
+
+    Usage:
+        valid_huc, err_msg = is_valid_huc(huc_number)
+
+        (if you choose to handle it this way...)
+ if (valid_huc == False): + raise ValueError(err_msg) + + """ + + err_msg = "" + + if huc_number is None: + err_msg = "huc number not set" + return False, err_msg + + if len(str(huc_number)) != 8: + err_msg = f"huc number of {huc_number} is not eight characters in length" + return False, err_msg + + if huc_number.isnumeric() is False: + err_msg = f"huc number of {huc_number} is not a number" + return False, err_msg + + return True, "" \ No newline at end of file From 3ea81394ef348a44918f76ffd995b9558e74bb32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CRobHanna-NOAA=E2=80=9D?= <“Robert.Hanna@NOAA.gov”> Date: Fri, 10 May 2024 20:36:14 +0000 Subject: [PATCH 2/9] update benchmark code --- data/ble/ble_benchmark/create_ble_benchmark.py | 17 ++++++++--------- src/utils/shared_validators.py | 2 +- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/data/ble/ble_benchmark/create_ble_benchmark.py b/data/ble/ble_benchmark/create_ble_benchmark.py index f2caa6250..9e7db89e4 100644 --- a/data/ble/ble_benchmark/create_ble_benchmark.py +++ b/data/ble/ble_benchmark/create_ble_benchmark.py @@ -7,8 +7,8 @@ from datetime import datetime from glob import glob from io import BytesIO -from urllib.request import urlopen from urllib.parse import urlsplit +from urllib.request import urlopen from zipfile import ZipFile import numpy as np @@ -16,6 +16,7 @@ from create_flow_forecast_file import create_flow_forecast_file from osgeo import gdal from preprocess_benchmark import preprocess_benchmark_static + from src.utils.shared_validators import is_valid_huc @@ -93,7 +94,6 @@ def create_ble_benchmark( logging.info(f"Start time: {start_time.strftime('%m/%d/%Y %H:%M:%S')}") print() - # EBFE_urls_20230608.xlsx acquired from FEMA (fethomps@usgs.gov) ext = os.path.splitext(input_file)[1] if ext == '.xlsx': @@ -114,7 +114,7 @@ def create_ble_benchmark( spatial_df, ble_geodatabase = download_and_extract_rasters(spatial_df, save_folder) print() - logging.info ("=======================================") + logging.info("=======================================") for i, row in spatial_df.iterrows(): huc = row['HUC'] logging.info("++++++++++++++++++++++++++") @@ -193,8 +193,8 @@ def download_and_extract_rasters(spatial_df: pd.DataFrame, save_folder: str): out_files = [] out_list = [] - logging.info ("=======================================") - logging.info ("Extracting and downloading spatial data") + logging.info("=======================================") + logging.info("Extracting and downloading spatial data") print() for i, row in spatial_df.iterrows(): # Extract HUC and HUC Name from URL @@ -202,7 +202,7 @@ def download_and_extract_rasters(spatial_df: pd.DataFrame, save_folder: str): if url is None: continue - + logging.info("++++++++++++++++++++++++++") logging.info(f"URL is {url}") @@ -212,7 +212,7 @@ def download_and_extract_rasters(spatial_df: pd.DataFrame, save_folder: str): is_ble_huc_valid, ___ = is_valid_huc(huc) if not is_ble_huc_valid: - logging.info(f"Invalid huc") + logging.info("Invalid huc") continue # logging.info @@ -230,7 +230,6 @@ def download_and_extract_rasters(spatial_df: pd.DataFrame, save_folder: str): else: logging.info(f" {huc} : zip file already exists: {save_file}") - # Loading gdb logging.info(f" {huc} : Loading gdb") gdb_list = glob(os.path.join(save_file, '**', '*.gdb'), recursive=True) @@ -264,7 +263,7 @@ def download_and_extract_rasters(spatial_df: pd.DataFrame, save_folder: str): spatial_df['rasters'] = out_files print() - logging.info ("Extracting and downloading spatial data 
- COMPLETE") + logging.info("Extracting and downloading spatial data - COMPLETE") print() return spatial_df, ble_geodatabase diff --git a/src/utils/shared_validators.py b/src/utils/shared_validators.py index 4a5417464..5a0bace6a 100644 --- a/src/utils/shared_validators.py +++ b/src/utils/shared_validators.py @@ -101,4 +101,4 @@ def is_valid_huc(huc_number): err_msg = f"huc number of {huc_number} is not a number" return False, err_msg - return True, "" \ No newline at end of file + return True, "" From 4e431c4e087d5395978978b37aba2341b1729a45 Mon Sep 17 00:00:00 2001 From: Rob Hanna Date: Mon, 13 May 2024 23:05:47 +0000 Subject: [PATCH 3/9] fixes to run ble reload --- .../ble/ble_benchmark/create_ble_benchmark.py | 71 +++++++++++++------ .../create_flow_forecast_file.py | 23 +++++- .../ble/ble_benchmark/preprocess_benchmark.py | 17 ++++- 3 files changed, 88 insertions(+), 23 deletions(-) diff --git a/data/ble/ble_benchmark/create_ble_benchmark.py b/data/ble/ble_benchmark/create_ble_benchmark.py index 9e7db89e4..8a9d01c85 100644 --- a/data/ble/ble_benchmark/create_ble_benchmark.py +++ b/data/ble/ble_benchmark/create_ble_benchmark.py @@ -4,27 +4,29 @@ import datetime as dt import logging import os +import shutil from datetime import datetime from glob import glob from io import BytesIO +from pathlib import Path from urllib.parse import urlsplit from urllib.request import urlopen from zipfile import ZipFile import numpy as np import pandas as pd +import rasterio from create_flow_forecast_file import create_flow_forecast_file from osgeo import gdal from preprocess_benchmark import preprocess_benchmark_static -from src.utils.shared_validators import is_valid_huc +from utils.shared_validators import is_valid_huc def __setup_logger(output_folder_path): start_time = datetime.now() file_dt_string = start_time.strftime("%Y_%m_%d-%H_%M_%S") log_file_name = f"create_ble_benchmark-{file_dt_string}.log" - log_file_path = os.path.join(output_folder_path, log_file_name) file_handler = logging.FileHandler(log_file_path) @@ -94,15 +96,27 @@ def create_ble_benchmark( logging.info(f"Start time: {start_time.strftime('%m/%d/%Y %H:%M:%S')}") print() - # EBFE_urls_20230608.xlsx acquired from FEMA (fethomps@usgs.gov) + # set from GDAL options, mostly to shut off the verbosity + rasterio_gdal_path = rasterio._env.get_gdal_data() + os.environ['GDAL_DATA'] = rasterio_gdal_path + gdal_new_config = {'CPL_DEBUG': 'OFF', 'GDAL_CACHEMAX': '536870912', 'GDAL_DATA': rasterio_gdal_path} + + for key, val in gdal_new_config.items(): + gdal.SetConfigOption(key, val) + + # Latest is: + # EBFE_urls_20240426.xlsx acquired from FEMA (fethomps@usgs.gov) (Florence Thompson) + + # *** Note: with each new file we get from Florence, you might have to adjust the columns below. 
ext = os.path.splitext(input_file)[1] if ext == '.xlsx': - data = pd.read_excel(input_file, header=None, names=['Size', 'Units', 'URL']) + data = pd.read_excel(input_file, header=0, names=['date', 'size', 'units', 'URL']) elif ext == '.csv': - data = pd.read_csv(input_file, header=None, names=['Size', 'Units', 'URL']) + data = pd.read_csv(input_file, header=None, names=['size', 'units', 'URL']) # Subset Spatial Data URLs spatial_df = data[data['URL'].str.contains('SpatialData')] + if huc is not None: spatial_df = spatial_df[spatial_df['URL'].str.contains(huc)] @@ -115,10 +129,15 @@ def create_ble_benchmark( print() logging.info("=======================================") + logging.info("") + logging.info("Creating flow files") + logging.info("") + num_recs = len(spatial_df) for i, row in spatial_df.iterrows(): huc = row['HUC'] logging.info("++++++++++++++++++++++++++") logging.info(f"HUC is {huc}") + print(f" Flow file {i+1} of {num_recs}") # reference_raster is used to set the metadata for benchmark_raster reference_raster = os.path.join(reference_folder, f'{huc}/branches/0/rem_zeroed_masked_0.tif') @@ -146,6 +165,13 @@ def create_ble_benchmark( nwm_feature_id_field, ) + # cleaning up temp files. + for f in Path(save_folder).glob(f"{huc}*"): + if os.path.isfile(f): + os.remove(f) + else: + shutil.rmtree(f, ignore_errors=True) + # Get time metrics end_time = dt.datetime.now(dt.timezone.utc) logging.info("") @@ -195,10 +221,15 @@ def download_and_extract_rasters(spatial_df: pd.DataFrame, save_folder: str): logging.info("=======================================") logging.info("Extracting and downloading spatial data") + + num_recs = len(spatial_df) + logging.info(f" Number of HUC records to download is {num_recs}") print() for i, row in spatial_df.iterrows(): + # Extract HUC and HUC Name from URL url = row['URL'] + print(f" Downloading {i+1} of {num_recs}") if url is None: continue @@ -212,26 +243,32 @@ def download_and_extract_rasters(spatial_df: pd.DataFrame, save_folder: str): is_ble_huc_valid, ___ = is_valid_huc(huc) if not is_ble_huc_valid: - logging.info("Invalid huc") + logging.info(f"{huc} : Invalid huc") continue # logging.info logging.info(f"{huc} : Downloading and extracting for {huc_name}") + + # we don't want to add a huc record if it is aleady there + if huc in hucs: + logging.info(f"{huc} : HUC is already included in the list. Skipping {url}") + continue + hucs.append(huc) huc_names.append(huc_name) # Download file - logging.info(f" {huc} : Unzipping") save_file = os.path.join(save_folder, os.path.basename(url)) if not os.path.exists(save_file): + logging.info(f"Saving download file to {save_file}") http_response = urlopen(url) zipfile = ZipFile(BytesIO(http_response.read())) zipfile.extractall(path=save_file) else: - logging.info(f" {huc} : zip file already exists: {save_file}") + logging.info(f"{huc} : zip file already exists in file system: {save_file}") # Loading gdb - logging.info(f" {huc} : Loading gdb") + logging.info(f"{huc} : Loading gdb") gdb_list = glob(os.path.join(save_file, '**', '*.gdb'), recursive=True) if len(gdb_list) == 1: @@ -245,7 +282,7 @@ def download_and_extract_rasters(spatial_df: pd.DataFrame, save_folder: str): if not os.path.exists(out_file): # Read raster data from GDB - print(f'.. 
{huc} : Reading {depth_raster}') + print(f'{huc} : Reading {depth_raster}') depth_raster_path = [item[0] for item in subdatasets if depth_raster in item[1]][0] extract_raster(depth_raster_path, out_file) @@ -253,7 +290,7 @@ def download_and_extract_rasters(spatial_df: pd.DataFrame, save_folder: str): out_list.append(out_file) else: - logging.info(f" {huc} : gdb_list does not equal 1") + logging.info(f"{huc} : gdb_list does not equal 1") if len(out_list) > 0: out_files.append(out_list) @@ -304,10 +341,10 @@ def extract_raster(in_raster: str, out_raster: str): usage='''python3 /foss_fim/data/ble/ble_benchmark/create_ble_benchmark.py -i /data/inputs/ble/ble_benchmark/EBFE_urls_20240416.xlsx -s /data/temp/ble_benchmark - -r /data/outputs/my_run/ - -o /data/test_cases/ble_test_cases/validation_data_ble + -r /data/outputs/fim_test_BED/ + -o /data/test_cases/test/ble_test_cases/validation_data_ble -n /data/inputs/nwm_hydrofabric/nwm_flows.gpkg - -u 12030105 + -u 12040101 ''', ) parser.add_argument('-i', '--input-file', type=str, help='Input file', required=True) @@ -346,10 +383,4 @@ def extract_raster(in_raster: str, out_raster: str): args = vars(parser.parse_args()) - start_time = datetime.now() - print('Pulling BLE Benchmark data... \n') - create_ble_benchmark(**args) - - end_time = datetime.now() - print('\n Finished Pulling BLE Benchmark data \n', 'Total Time: ', end_time - start_time) diff --git a/data/ble/ble_benchmark/create_flow_forecast_file.py b/data/ble/ble_benchmark/create_flow_forecast_file.py index 65c22daf0..03bc18ff6 100755 --- a/data/ble/ble_benchmark/create_flow_forecast_file.py +++ b/data/ble/ble_benchmark/create_flow_forecast_file.py @@ -9,7 +9,7 @@ import pandas as pd -gpd.options.io_engine = "pyogrio" +# gpd.options.io_engine = "pyogrio" def create_flow_forecast_file( @@ -55,6 +55,9 @@ def create_flow_forecast_file( ''' + # print(locals()) + print(" ******************************************") + def fill_missing_flows(forecast: pd.DataFrame, nwm_river_layer: pd.DataFrame): """ This function will fill in missing flows in the forecast dataframe. @@ -135,15 +138,25 @@ def fill_missing_flows(forecast: pd.DataFrame, nwm_river_layer: pd.DataFrame): ble_xs_layer_name = 'XS_1D' xs_layer = gpd.read_file(ble_geodatabase, layer=ble_xs_layer_name) + # print("^^^^^^ xs_layer") + # print(xs_layer) + # print(xs_layer.crs) # Read in the NWM stream layer into a geopandas dataframe using the bounding box option based on the extents of the BLE XS layer. - nwm_river_layer = gpd.read_file(nwm_geodatabase, bbox=xs_layer, layer=nwm_stream_layer_name) + nwm_river_layer = gpd.read_file(nwm_geodatabase, mask=xs_layer, layer=nwm_stream_layer_name) + # print("^^^^^^ nwm_river_layer") + # print(nwm_river_layer) + # print(nwm_river_layer.crs) # Make sure xs_layer is in same projection as nwm_river_layer. xs_layer_proj = xs_layer.to_crs(nwm_river_layer.crs) + # print("^^^^^^ xs_layer_proj") + # print(xs_layer_proj) # Perform an intersection of the BLE layers and the NWM layers, using the keep_geom_type set to False produces a point output. 
intersection = gpd.overlay(xs_layer_proj, nwm_river_layer, how='intersection', keep_geom_type=False) + # print("^^^^^^ intersection") + # print(intersection) ## Create the flow forecast files # Define fields containing flow (typically these won't change for BLE) @@ -168,6 +181,9 @@ def fill_missing_flows(forecast: pd.DataFrame, nwm_river_layer: pd.DataFrame): forecast = forecast.groupby('feature_id').median() forecast = forecast.reset_index(level=0) + # print(f"forecast 2") + # print(len(forecast)) + # Convert CFS to CMS forecast['discharge'] = forecast['discharge'] * dischargeMultiplier @@ -175,6 +191,9 @@ def fill_missing_flows(forecast: pd.DataFrame, nwm_river_layer: pd.DataFrame): print(f'Filling in missing flows for {return_period[i]} flow.') forecast = fill_missing_flows(forecast, nwm_river_layer) + # print(f"forecast 3") + # print(len(forecast)) + # Set paths and write file output_dir = os.path.join(output_parent_dir, huc) dir_of_csv = os.path.join(output_dir, return_period[i]) diff --git a/data/ble/ble_benchmark/preprocess_benchmark.py b/data/ble/ble_benchmark/preprocess_benchmark.py index f9b1792cc..7084bcc39 100755 --- a/data/ble/ble_benchmark/preprocess_benchmark.py +++ b/data/ble/ble_benchmark/preprocess_benchmark.py @@ -1,9 +1,14 @@ #!/usr/bin/env python3 import argparse +import os +import sys import numpy as np import rasterio +import rasterio._env +import rasterio._path +import rasterio.enums import rasterio.mask from rasterio.warp import Resampling, calculate_default_transform, reproject @@ -33,6 +38,15 @@ def preprocess_benchmark_static(benchmark_raster, reference_raster, out_raster_p (required for writing to output dataset). ''' + + # rasterio_gdal_path = rasterio._env.get_gdal_data() + # os.environ['GDAL_DATA'] = rasterio_gdal_path + + # with warnings.catch_warnings(): + # warnings.simplefilter("ignore") + + # with rasterio.Env(CPL_DEBUG=OFF, GDAL_CACHEMAX=512000000): + # Open and read raster and benchmark rasters reference = rasterio.open(reference_raster) benchmark = rasterio.open(benchmark_raster) @@ -82,10 +96,11 @@ def preprocess_benchmark_static(benchmark_raster, reference_raster, out_raster_p # Write out preprocessed benchmark array to raster if path is supplied if out_raster_path is not None: - with rasterio.Env(): + with rasterio.Env(CPL_DEBUG=False): # Write out reassigned values to raster dataset with rasterio.open(out_raster_path, 'w', **profile) as dst: dst.write(boolean_benchmark.astype('int8'), 1) + return boolean_benchmark.astype('int8'), profile From 283d0827afcb75b621a8b0a56d486b15765746af Mon Sep 17 00:00:00 2001 From: Rob Hanna Date: Mon, 13 May 2024 23:23:30 +0000 Subject: [PATCH 4/9] cleanup debugging --- .../ble_benchmark/create_flow_forecast_file.py | 16 ---------------- data/ble/ble_benchmark/preprocess_benchmark.py | 10 ---------- 2 files changed, 26 deletions(-) diff --git a/data/ble/ble_benchmark/create_flow_forecast_file.py b/data/ble/ble_benchmark/create_flow_forecast_file.py index 03bc18ff6..647148157 100755 --- a/data/ble/ble_benchmark/create_flow_forecast_file.py +++ b/data/ble/ble_benchmark/create_flow_forecast_file.py @@ -138,25 +138,15 @@ def fill_missing_flows(forecast: pd.DataFrame, nwm_river_layer: pd.DataFrame): ble_xs_layer_name = 'XS_1D' xs_layer = gpd.read_file(ble_geodatabase, layer=ble_xs_layer_name) - # print("^^^^^^ xs_layer") - # print(xs_layer) - # print(xs_layer.crs) # Read in the NWM stream layer into a geopandas dataframe using the bounding box option based on the extents of the BLE XS layer. 
nwm_river_layer = gpd.read_file(nwm_geodatabase, mask=xs_layer, layer=nwm_stream_layer_name)
-    # print("^^^^^^ nwm_river_layer")
-    # print(nwm_river_layer)
-    # print(nwm_river_layer.crs)

     # Make sure xs_layer is in same projection as nwm_river_layer.
     xs_layer_proj = xs_layer.to_crs(nwm_river_layer.crs)
-    # print("^^^^^^ xs_layer_proj")
-    # print(xs_layer_proj)

     # Perform an intersection of the BLE layers and the NWM layers, using the keep_geom_type set to False produces a point output.
     intersection = gpd.overlay(xs_layer_proj, nwm_river_layer, how='intersection', keep_geom_type=False)
-    # print("^^^^^^ intersection")
-    # print(intersection)

     ## Create the flow forecast files
     # Define fields containing flow (typically these won't change for BLE)
@@ -181,9 +171,6 @@ def fill_missing_flows(forecast: pd.DataFrame, nwm_river_layer: pd.DataFrame):
     forecast = forecast.groupby('feature_id').median()
     forecast = forecast.reset_index(level=0)

-    # print(f"forecast 2")
-    # print(len(forecast))
-
     # Convert CFS to CMS
     forecast['discharge'] = forecast['discharge'] * dischargeMultiplier

@@ -191,9 +178,6 @@ def fill_missing_flows(forecast: pd.DataFrame, nwm_river_layer: pd.DataFrame):
     print(f'Filling in missing flows for {return_period[i]} flow.')
     forecast = fill_missing_flows(forecast, nwm_river_layer)

-    # print(f"forecast 3")
-    # print(len(forecast))
-
     # Set paths and write file
     output_dir = os.path.join(output_parent_dir, huc)
     dir_of_csv = os.path.join(output_dir, return_period[i])
diff --git a/data/ble/ble_benchmark/preprocess_benchmark.py b/data/ble/ble_benchmark/preprocess_benchmark.py
index 7084bcc39..b7f8a5a08 100755
--- a/data/ble/ble_benchmark/preprocess_benchmark.py
+++ b/data/ble/ble_benchmark/preprocess_benchmark.py
@@ -1,8 +1,6 @@
 #!/usr/bin/env python3

 import argparse
-import os
-import sys

 import numpy as np
 import rasterio
@@ -39,14 +37,6 @@ def preprocess_benchmark_static(benchmark_raster, reference_raster, out_raster_p

     '''

-    # rasterio_gdal_path = rasterio._env.get_gdal_data()
-    # os.environ['GDAL_DATA'] = rasterio_gdal_path
-
-    # with warnings.catch_warnings():
-    #     warnings.simplefilter("ignore")
-
-    # with rasterio.Env(CPL_DEBUG=OFF, GDAL_CACHEMAX=512000000):
-
     # Open and read raster and benchmark rasters
     reference = rasterio.open(reference_raster)
     benchmark = rasterio.open(benchmark_raster)

From 71c84192e278e0f163a255be56de7403dd5dde75 Mon Sep 17 00:00:00 2001
From: Rob Hanna - NOAA <90854818+RobHanna-NOAA@users.noreply.github.com>
Date: Mon, 13 May 2024 17:33:51 -0600
Subject: [PATCH 5/9] Update CHANGELOG.md

---
 docs/CHANGELOG.md | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
index c904fef02..474c4274a 100644
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@@ -1,6 +1,22 @@
 All notable changes to this project will be documented in this file.
 We follow the [Semantic Versioning 2.0.0](http://semver.org/) format.

+## v4.5.x.x - 2024-05-15 - [PR#1155](https://github.com/NOAA-OWP/inundation-mapping/pull/1155)
+
+Download and process the latest BLE benchmark data.
+
+### Changes
+- `data/ble/ble_benchmark`:
+    - `create_ble_benchmark.py`: Added a logging system, added GDAL config options to control output messages, and added cleanup of temporary working files.
+    - `create_flow_forecast_file.py`: Added a logging system.
+    - `preprocess_benchmark.py`: Adjusted GDAL settings to control excess output.
+
+- `src/utils`:
+    - `shared_validators.py`: Added a function to validate whether a HUC number is well formed.

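+A minimal usage sketch of the new validator, mirroring its docstring (the import path matches how `create_ble_benchmark.py` imports it):
+
+```python
+from utils.shared_validators import is_valid_huc
+
+valid_huc, err_msg = is_valid_huc("12030105")
+if not valid_huc:
+    raise ValueError(err_msg)
+```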
+ + ## v4.5.0.1 - 2024-05-09 - [PR#1150](https://github.com/NOAA-OWP/inundation-mapping/pull/1150) Fixes two bugs discovered in v4.5.0.0: From c2d01be446791e4e3c7760785f5aaf6caa8beae9 Mon Sep 17 00:00:00 2001 From: Rob Hanna Date: Tue, 14 May 2024 20:44:06 +0000 Subject: [PATCH 6/9] fix raster df --- .../ble/ble_benchmark/create_ble_benchmark.py | 28 +++++++------------ 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/data/ble/ble_benchmark/create_ble_benchmark.py b/data/ble/ble_benchmark/create_ble_benchmark.py index 8a9d01c85..95a02a597 100644 --- a/data/ble/ble_benchmark/create_ble_benchmark.py +++ b/data/ble/ble_benchmark/create_ble_benchmark.py @@ -132,6 +132,7 @@ def create_ble_benchmark( logging.info("") logging.info("Creating flow files") logging.info("") + num_recs = len(spatial_df) for i, row in spatial_df.iterrows(): huc = row['HUC'] @@ -139,9 +140,9 @@ def create_ble_benchmark( logging.info(f"HUC is {huc}") print(f" Flow file {i+1} of {num_recs}") - # reference_raster is used to set the metadata for benchmark_raster + # Reference_raster is used to set the metadata for benchmark_raster reference_raster = os.path.join(reference_folder, f'{huc}/branches/0/rem_zeroed_masked_0.tif') - for benchmark_raster in row['rasters']: + for benchmark_raster in row['raster_path']: magnitude = '100yr' if 'BLE_DEP01PCT' in benchmark_raster else '500yr' out_raster_dir = os.path.join(benchmark_folder, huc, magnitude) @@ -167,10 +168,10 @@ def create_ble_benchmark( # cleaning up temp files. for f in Path(save_folder).glob(f"{huc}*"): - if os.path.isfile(f): + if os.path.isfile(f) is True: os.remove(f) else: - shutil.rmtree(f, ignore_errors=True) + shutil.rmtree(f, ignore_errors=False) # Get time metrics end_time = dt.datetime.now(dt.timezone.utc) @@ -231,9 +232,6 @@ def download_and_extract_rasters(spatial_df: pd.DataFrame, save_folder: str): url = row['URL'] print(f" Downloading {i+1} of {num_recs}") - if url is None: - continue - logging.info("++++++++++++++++++++++++++") logging.info(f"URL is {url}") @@ -249,14 +247,6 @@ def download_and_extract_rasters(spatial_df: pd.DataFrame, save_folder: str): # logging.info logging.info(f"{huc} : Downloading and extracting for {huc_name}") - # we don't want to add a huc record if it is aleady there - if huc in hucs: - logging.info(f"{huc} : HUC is already included in the list. 
Skipping {url}") - continue - - hucs.append(huc) - huc_names.append(huc_name) - # Download file save_file = os.path.join(save_folder, os.path.basename(url)) if not os.path.exists(save_file): @@ -286,19 +276,21 @@ def download_and_extract_rasters(spatial_df: pd.DataFrame, save_folder: str): depth_raster_path = [item[0] for item in subdatasets if depth_raster in item[1]][0] extract_raster(depth_raster_path, out_file) - out_list.append(out_file) else: logging.info(f"{huc} : gdb_list does not equal 1") - if len(out_list) > 0: out_files.append(out_list) + print(f"Number of hucs for spatial df = {len(hucs)}") + print(f"Number of HUC_Name for spatial df = {len(huc_names)}") + print(f"Number of raster files for spatial df = {len(out_files)}") spatial_df['HUC'] = hucs spatial_df['HUC_Name'] = huc_names - spatial_df['rasters'] = out_files + spatial_df['raster_path'] = out_files + spatial_df.to_csv(os.path.join(save_folder, "spatial_db_w_rasters.csv"), header=True, index=False) print() logging.info("Extracting and downloading spatial data - COMPLETE") print() From 681bea244f8c35db8d839a194fb5a240496ff21f Mon Sep 17 00:00:00 2001 From: Rob Hanna Date: Thu, 16 May 2024 13:33:37 +0000 Subject: [PATCH 7/9] fundamentally starting over --- .../ble/ble_benchmark/create_ble_benchmark.py | 138 ++++++++++++++---- .../create_flow_forecast_file.py | 4 +- .../ble/ble_benchmark/preprocess_benchmark.py | 24 ++- 3 files changed, 128 insertions(+), 38 deletions(-) diff --git a/data/ble/ble_benchmark/create_ble_benchmark.py b/data/ble/ble_benchmark/create_ble_benchmark.py index 95a02a597..8490b1bf5 100644 --- a/data/ble/ble_benchmark/create_ble_benchmark.py +++ b/data/ble/ble_benchmark/create_ble_benchmark.py @@ -4,7 +4,7 @@ import datetime as dt import logging import os -import shutil +import sys from datetime import datetime from glob import glob from io import BytesIO @@ -15,7 +15,8 @@ import numpy as np import pandas as pd -import rasterio + +# import rasterio from create_flow_forecast_file import create_flow_forecast_file from osgeo import gdal from preprocess_benchmark import preprocess_benchmark_static @@ -97,12 +98,12 @@ def create_ble_benchmark( print() # set from GDAL options, mostly to shut off the verbosity - rasterio_gdal_path = rasterio._env.get_gdal_data() - os.environ['GDAL_DATA'] = rasterio_gdal_path - gdal_new_config = {'CPL_DEBUG': 'OFF', 'GDAL_CACHEMAX': '536870912', 'GDAL_DATA': rasterio_gdal_path} + # rasterio_gdal_path = rasterio._env.get_gdal_data() + # os.environ['GDAL_DATA'] = rasterio_gdal_path + # gdal_new_config = {'CPL_DEBUG': 'OFF', 'GDAL_CACHEMAX': '536870912', 'GDAL_DATA': rasterio_gdal_path} - for key, val in gdal_new_config.items(): - gdal.SetConfigOption(key, val) + # for key, val in gdal_new_config.items(): + # gdal.SetConfigOption(key, val) # Latest is: # EBFE_urls_20240426.xlsx acquired from FEMA (fethomps@usgs.gov) (Florence Thompson) @@ -125,8 +126,17 @@ def create_ble_benchmark( # Convert size to MiB spatial_df['MiB'] = np.where(spatial_df['units'] == 'GiB', spatial_df['size'] * 1000, spatial_df['size']) + # create some new empty columns + num_cols = len(spatial_df.columns) + spatial_df.insert(num_cols - 1, "HUC", "") + spatial_df.insert(num_cols, "HUC_Name", "") + spatial_df.insert(num_cols + 1, "raster_paths", "") + spatial_df = spatial_df.reset_index() + spatial_df, ble_geodatabase = download_and_extract_rasters(spatial_df, save_folder) + print(ble_geodatabase) + print() logging.info("=======================================") logging.info("") @@ -134,6 +144,7 @@ def 
create_ble_benchmark( logging.info("") num_recs = len(spatial_df) + print(spatial_df) for i, row in spatial_df.iterrows(): huc = row['HUC'] logging.info("++++++++++++++++++++++++++") @@ -142,7 +153,11 @@ def create_ble_benchmark( # Reference_raster is used to set the metadata for benchmark_raster reference_raster = os.path.join(reference_folder, f'{huc}/branches/0/rem_zeroed_masked_0.tif') - for benchmark_raster in row['raster_path']: + print(f"number of rasters is {len(row['raster_paths'])}") + print(f"full raster path value is {row['raster_paths']}") + for benchmark_raster in row['raster_paths']: + + print(f"benchmark raster is {benchmark_raster}") magnitude = '100yr' if 'BLE_DEP01PCT' in benchmark_raster else '500yr' out_raster_dir = os.path.join(benchmark_folder, huc, magnitude) @@ -165,13 +180,7 @@ def create_ble_benchmark( nwm_stream_layer_name, nwm_feature_id_field, ) - - # cleaning up temp files. - for f in Path(save_folder).glob(f"{huc}*"): - if os.path.isfile(f) is True: - os.remove(f) - else: - shutil.rmtree(f, ignore_errors=False) + logging.info(f"HUC {huc} flowfiles complete") # Get time metrics end_time = dt.datetime.now(dt.timezone.utc) @@ -191,6 +200,7 @@ def create_ble_benchmark( logging.info(f"Duration: {time_fmt}") print() + return def download_and_extract_rasters(spatial_df: pd.DataFrame, save_folder: str): @@ -226,11 +236,10 @@ def download_and_extract_rasters(spatial_df: pd.DataFrame, save_folder: str): num_recs = len(spatial_df) logging.info(f" Number of HUC records to download is {num_recs}") print() - for i, row in spatial_df.iterrows(): - + for idx, row in spatial_df.iterrows(): # Extract HUC and HUC Name from URL url = row['URL'] - print(f" Downloading {i+1} of {num_recs}") + print(f" Downloading {idx+1} of {num_recs}") logging.info("++++++++++++++++++++++++++") logging.info(f"URL is {url}") @@ -244,18 +253,19 @@ def download_and_extract_rasters(spatial_df: pd.DataFrame, save_folder: str): logging.info(f"{huc} : Invalid huc") continue + hucs.append(huc) + huc_names.append(huc_name) + # logging.info logging.info(f"{huc} : Downloading and extracting for {huc_name}") # Download file save_file = os.path.join(save_folder, os.path.basename(url)) + logging.info(f"Saving download file to {save_file}") if not os.path.exists(save_file): - logging.info(f"Saving download file to {save_file}") http_response = urlopen(url) zipfile = ZipFile(BytesIO(http_response.read())) zipfile.extractall(path=save_file) - else: - logging.info(f"{huc} : zip file already exists in file system: {save_file}") # Loading gdb logging.info(f"{huc} : Loading gdb") @@ -266,7 +276,10 @@ def download_and_extract_rasters(spatial_df: pd.DataFrame, save_folder: str): src_ds = gdal.Open(ble_geodatabase, gdal.GA_ReadOnly) subdatasets = src_ds.GetSubDatasets() + src_ds = None + # Find depth rasters + # huc_rasters = [] for depth_raster in depth_rasters: out_file = os.path.join(save_folder, f'{huc}_{depth_raster}.tif') @@ -275,20 +288,39 @@ def download_and_extract_rasters(spatial_df: pd.DataFrame, save_folder: str): print(f'{huc} : Reading {depth_raster}') depth_raster_path = [item[0] for item in subdatasets if depth_raster in item[1]][0] - extract_raster(depth_raster_path, out_file) + if extract_raster(huc, depth_raster_path, out_file) is False: + continue + + # huc_rasters.append(out_file) out_list.append(out_file) - else: - logging.info(f"{huc} : gdb_list does not equal 1") + # if len(huc_rasters) > 0: + + # spatial_df.at[idx, 'HUC'] = huc + # spatial_df.at[idx, 'HUC_Name'] = huc_name + # 
spatial_df.at[idx, 'raster_paths'] = huc_rasters + + # hucs.append(huc) + # huc_names.append(huc_name) + # out_files.append(str(huc_rasters)) + if len(out_list) > 0: out_files.append(out_list) - print(f"Number of hucs for spatial df = {len(hucs)}") - print(f"Number of HUC_Name for spatial df = {len(huc_names)}") - print(f"Number of raster files for spatial df = {len(out_files)}") + if len(hucs) == 0: + logging.info("No valid rasters remain. System aborted.") + sys.exit(1) + + # this is ugly + + # print("final hucs") + # print(hucs) + # print(huc_names) + # print(out_files) + spatial_df['HUC'] = hucs spatial_df['HUC_Name'] = huc_names - spatial_df['raster_path'] = out_files + spatial_df['raster_paths'] = out_files spatial_df.to_csv(os.path.join(save_folder, "spatial_db_w_rasters.csv"), header=True, index=False) print() @@ -298,7 +330,7 @@ def download_and_extract_rasters(spatial_df: pd.DataFrame, save_folder: str): return spatial_df, ble_geodatabase -def extract_raster(in_raster: str, out_raster: str): +def extract_raster(huc: str, in_raster: str, out_raster: str): """ Extract raster from GDB and save to out_raster @@ -311,21 +343,67 @@ def extract_raster(in_raster: str, out_raster: str): Returns ------- - None + True = raster saved """ data_ds = gdal.Open(in_raster, gdal.GA_ReadOnly) + gt = data_ds.GetGeoTransform() + + # Why are we using GDAL? becuase it is loading directliy from a gdb + # ie) in_raster is OpenFileGDB:"/data/test_cases/rob_ble_reload/ + # download_files_test_large/08040301_SpatialData.zip/Spatial/LA_LowerRed_BLE.gdb":BLE_DEP0_2PCT + + # TODO: May 15, 2024 + # Some of them come in as 5m resolution is too big and memory failes. It will load but ReadAsArray fails + # we tried some gdal warp and various things to get it work. + # For now, we will just a have to skip anyting that is not 10m. + + if gt[1] != 10.0: + # # resample + # layer_name = in_raster + # # rename the file + # resampled_file_name = file_name.replace(".tif", "_resampled.tif") + + # print(f"in : {in_raster} and resample : {resampled_file_name}") + + # print(f" --- gdal wrap : {dt.datetime.now(dt.timezone.utc).strftime('%H:%M:%S')}") + # #gdal.WarpOptions(geoloc=True, format='GTiff', xRes=10.0, yRes=10.0) + # # kwargs = {'format': 'GTiff', 'geoloc': True} + # #gdal.Warp(in_raster, resampled_file_name, geoloc=True, format='GTiff', xRes=10, yRes=10) + # gdal.Warp(in_raster, resampled_file_name, format='GTiff') + + # print(f" --- gdal re-open : {dt.datetime.now(dt.timezone.utc).strftime('%H:%M:%S')}") + # # reload the resampled version + # data_ds = gdal.Open(in_raster, gdal.GA_ReadOnly) + # print(f" --- gdal re-open : {dt.datetime.now(dt.timezone.utc).strftime('%H:%M:%S')}") + # gt = data_ds.GetGeoTransform() + # print(gt) + + logging.info( + f"Dropping {huc} raster not resolution of 10 m. 
It is {gt[1]} : raster input = {in_raster}" + ) + return False + + # print(f" --- read as array : {dt.datetime.now(dt.timezone.utc).strftime('%H:%M:%S')}") data = data_ds.ReadAsArray() + # print(f" --- GetRasterBand : {dt.datetime.now(dt.timezone.utc).strftime('%H:%M:%S')}") nodata = data_ds.GetRasterBand(1).GetNoDataValue() driver = gdal.GetDriverByName('GTiff') out_ds = driver.Create(out_raster, data.shape[1], data.shape[0], 1, gdal.GDT_Float32) + # print(f" --- SetGeoTransform : {dt.datetime.now(dt.timezone.utc).strftime('%H:%M:%S')}") out_ds.SetGeoTransform(data_ds.GetGeoTransform()) + # print(f" --- SetProjection : {dt.datetime.now(dt.timezone.utc).strftime('%H:%M:%S')}") out_ds.SetProjection(data_ds.GetProjection()) + # print(f" --- WriteArray : {dt.datetime.now(dt.timezone.utc).strftime('%H:%M:%S')}") out_ds.GetRasterBand(1).WriteArray(data) out_ds.GetRasterBand(1).SetNoDataValue(nodata) + # print(f" --- open raster done : {dt.datetime.now(dt.timezone.utc).strftime('%H:%M:%S')}") + data_ds = None out_ds = None + return True + if __name__ == '__main__': parser = argparse.ArgumentParser( diff --git a/data/ble/ble_benchmark/create_flow_forecast_file.py b/data/ble/ble_benchmark/create_flow_forecast_file.py index 647148157..0979c8b76 100755 --- a/data/ble/ble_benchmark/create_flow_forecast_file.py +++ b/data/ble/ble_benchmark/create_flow_forecast_file.py @@ -9,7 +9,7 @@ import pandas as pd -# gpd.options.io_engine = "pyogrio" +gpd.options.io_engine = "pyogrio" def create_flow_forecast_file( @@ -55,8 +55,8 @@ def create_flow_forecast_file( ''' - # print(locals()) print(" ******************************************") + # print(locals()) def fill_missing_flows(forecast: pd.DataFrame, nwm_river_layer: pd.DataFrame): """ diff --git a/data/ble/ble_benchmark/preprocess_benchmark.py b/data/ble/ble_benchmark/preprocess_benchmark.py index b7f8a5a08..999fef7e6 100755 --- a/data/ble/ble_benchmark/preprocess_benchmark.py +++ b/data/ble/ble_benchmark/preprocess_benchmark.py @@ -1,12 +1,14 @@ #!/usr/bin/env python3 import argparse +import logging import numpy as np import rasterio -import rasterio._env -import rasterio._path -import rasterio.enums + +# import rasterio._env +# import rasterio._path +# import rasterio.enums import rasterio.mask from rasterio.warp import Resampling, calculate_default_transform, reproject @@ -37,9 +39,19 @@ def preprocess_benchmark_static(benchmark_raster, reference_raster, out_raster_p ''' + # gdal.SetConfigOption('CPL_LOG', '/dev/null') + # gdal.SetConfigOption('CPL_DEBUG', 'False') + + # with rasterio.Env(CPL_DEBUG=False, + # GDAL_CACHEMAX=536870912, + # CPL_CURL_VERBOSE=False, + # GDAL_DATA=rasterio._env.get_gdal_data() ): + # Open and read raster and benchmark rasters - reference = rasterio.open(reference_raster) - benchmark = rasterio.open(benchmark_raster) + logging.info(f"--- Opening reference_raster: {reference_raster}") + reference = rasterio.open(reference_raster, "r") + logging.info(f"--- Opening benchmark_raster: {benchmark_raster}") + benchmark = rasterio.open(benchmark_raster, "r") benchmark_arr = benchmark.read(1) # Set arbitrary no data value that is not possible value of the benchmark dataset. It is reassigned later. 
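The 10 m resolution gate added in the patch above reads the pixel size from the GDAL geotransform: index 1 of the tuple returned by GetGeoTransform() is the pixel width in the raster's CRS units. A minimal standalone sketch of that check follows; the raster path is hypothetical, while the GDAL calls are the same ones the patch uses.

    from osgeo import gdal

    gdal.UseExceptions()

    # Geotransform layout: (origin_x, pixel_width, row_rotation,
    #                       origin_y, col_rotation, pixel_height)
    ds = gdal.Open("/data/example/BLE_DEP01PCT.tif", gdal.GA_ReadOnly)  # hypothetical path
    gt = ds.GetGeoTransform()

    if int(gt[1]) != 10:
        print(f"Skipping raster: pixel width is {gt[1]} m, expected 10 m")

    ds = None  # release the dataset handle
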
@@ -86,7 +98,7 @@ def preprocess_benchmark_static(benchmark_raster, reference_raster, out_raster_p # Write out preprocessed benchmark array to raster if path is supplied if out_raster_path is not None: - with rasterio.Env(CPL_DEBUG=False): + with rasterio.Env(): # Write out reassigned values to raster dataset with rasterio.open(out_raster_path, 'w', **profile) as dst: dst.write(boolean_benchmark.astype('int8'), 1) From e6873aff2812bd3a29aa6a169d6d20f5f4ba8849 Mon Sep 17 00:00:00 2001 From: Rob Hanna Date: Fri, 17 May 2024 01:15:13 +0000 Subject: [PATCH 8/9] working - pre MP --- .../ble/ble_benchmark/create_ble_benchmark.py | 163 ++++++++++-------- .../create_flow_forecast_file.py | 15 +- .../ble/ble_benchmark/preprocess_benchmark.py | 25 +-- 3 files changed, 121 insertions(+), 82 deletions(-) diff --git a/data/ble/ble_benchmark/create_ble_benchmark.py b/data/ble/ble_benchmark/create_ble_benchmark.py index 8490b1bf5..e256c2b32 100644 --- a/data/ble/ble_benchmark/create_ble_benchmark.py +++ b/data/ble/ble_benchmark/create_ble_benchmark.py @@ -5,6 +5,7 @@ import logging import os import sys +import traceback from datetime import datetime from glob import glob from io import BytesIO @@ -16,7 +17,8 @@ import numpy as np import pandas as pd -# import rasterio +# import fiona +import rasterio from create_flow_forecast_file import create_flow_forecast_file from osgeo import gdal from preprocess_benchmark import preprocess_benchmark_static @@ -24,24 +26,33 @@ from utils.shared_validators import is_valid_huc +gdal.UseExceptions() + + def __setup_logger(output_folder_path): start_time = datetime.now() file_dt_string = start_time.strftime("%Y_%m_%d-%H_%M_%S") log_file_name = f"create_ble_benchmark-{file_dt_string}.log" log_file_path = os.path.join(output_folder_path, log_file_name) - file_handler = logging.FileHandler(log_file_path) - file_handler.setLevel(logging.INFO) - console_handler = logging.StreamHandler() - console_handler.setLevel(logging.DEBUG) + # logging.basicConfig( + # format='%(asctime)s -- ', + # level=logging.INFO, + # datefmt='%Y-%m-%d %H:%M:%S') logger = logging.getLogger() - logger.addHandler(file_handler) - logger.addHandler(console_handler) logger.setLevel(logging.DEBUG) + fmt = logging.Formatter("%(asctime)s >> %(message)s", "%Y-%m-%d %H:%M:%S") - logging.info(f'Started : {start_time.strftime("%m/%d/%Y %H:%M:%S")}') - logging.info("----------------") + console_handler = logging.StreamHandler() + console_handler.setLevel(logging.DEBUG) + console_handler.setFormatter(fmt) + logger.addHandler(console_handler) + + file_handler = logging.FileHandler(log_file_path) + file_handler.setLevel(logging.INFO) + file_handler.setFormatter(fmt) + logger.addHandler(file_handler) def create_ble_benchmark( @@ -97,14 +108,6 @@ def create_ble_benchmark( logging.info(f"Start time: {start_time.strftime('%m/%d/%Y %H:%M:%S')}") print() - # set from GDAL options, mostly to shut off the verbosity - # rasterio_gdal_path = rasterio._env.get_gdal_data() - # os.environ['GDAL_DATA'] = rasterio_gdal_path - # gdal_new_config = {'CPL_DEBUG': 'OFF', 'GDAL_CACHEMAX': '536870912', 'GDAL_DATA': rasterio_gdal_path} - - # for key, val in gdal_new_config.items(): - # gdal.SetConfigOption(key, val) - # Latest is: # EBFE_urls_20240426.xlsx acquired from FEMA (fethomps@usgs.gov) (Florence Thompson) @@ -121,21 +124,22 @@ def create_ble_benchmark( if huc is not None: spatial_df = spatial_df[spatial_df['URL'].str.contains(huc)] - spatial_df = spatial_df.reset_index() + spatial_df.reset_index(inplace=True) # Convert size to 
MiB spatial_df['MiB'] = np.where(spatial_df['units'] == 'GiB', spatial_df['size'] * 1000, spatial_df['size']) # create some new empty columns num_cols = len(spatial_df.columns) - spatial_df.insert(num_cols - 1, "HUC", "") - spatial_df.insert(num_cols, "HUC_Name", "") - spatial_df.insert(num_cols + 1, "raster_paths", "") - spatial_df = spatial_df.reset_index() + spatial_df.insert(num_cols, "HUC", "") + spatial_df.insert(num_cols + 1, "HUC_Name", "") + spatial_df.insert(num_cols + 2, "raster_paths", "") spatial_df, ble_geodatabase = download_and_extract_rasters(spatial_df, save_folder) - print(ble_geodatabase) + spatial_df = spatial_df[spatial_df['HUC'] != ""] + spatial_df = spatial_df[spatial_df['raster_paths'] != ""] + spatial_df.reset_index(inplace=True) print() logging.info("=======================================") @@ -144,7 +148,6 @@ def create_ble_benchmark( logging.info("") num_recs = len(spatial_df) - print(spatial_df) for i, row in spatial_df.iterrows(): huc = row['HUC'] logging.info("++++++++++++++++++++++++++") @@ -153,11 +156,15 @@ def create_ble_benchmark( # Reference_raster is used to set the metadata for benchmark_raster reference_raster = os.path.join(reference_folder, f'{huc}/branches/0/rem_zeroed_masked_0.tif') - print(f"number of rasters is {len(row['raster_paths'])}") - print(f"full raster path value is {row['raster_paths']}") + if os.path.exists(reference_raster) is False: + logging.info( + f" {huc} : Raster path of {reference_raster} does not exist." " Might be a non-applicable HUC" + ) + continue + for benchmark_raster in row['raster_paths']: - print(f"benchmark raster is {benchmark_raster}") + print(f" {huc} : benchmark raster is {benchmark_raster}") magnitude = '100yr' if 'BLE_DEP01PCT' in benchmark_raster else '500yr' out_raster_dir = os.path.join(benchmark_folder, huc, magnitude) @@ -167,19 +174,31 @@ def create_ble_benchmark( out_raster_path = os.path.join(out_raster_dir, f"ble_huc_{huc}_extent_{magnitude}.tif") # Make benchmark inundation raster - logging.info(f" {huc} : magnitude : {magnitude} : preprocessing stats") - preprocess_benchmark_static(benchmark_raster, reference_raster, out_raster_path) - - logging.info(f" {huc} : magnitude : {magnitude} : creating flow file") - create_flow_forecast_file( - huc, - ble_geodatabase, - nwm_geopackage, - benchmark_folder, - ble_xs_layer_name, - nwm_stream_layer_name, - nwm_feature_id_field, - ) + logging.info(f" {huc} : magnitude : {magnitude} : preprocessing stats") + try: + preprocess_benchmark_static(benchmark_raster, reference_raster, out_raster_path) + + except Exception: + logging.info(f"HUC {huc} - An error with preprocessing benchmark stats") + logging.info(traceback.format_exc()) + continue + + logging.info(f" {huc} : magnitude : {magnitude} : creating flow file") + try: + create_flow_forecast_file( + huc, + ble_geodatabase, + nwm_geopackage, + benchmark_folder, + ble_xs_layer_name, + nwm_stream_layer_name, + nwm_feature_id_field, + ) + except Exception: + logging.info(f"HUC {huc} - An error with creating flow files has occured.") + logging.info(traceback.format_exc()) + continue + logging.info(f"HUC {huc} flowfiles complete") # Get time metrics @@ -226,9 +245,9 @@ def download_and_extract_rasters(spatial_df: pd.DataFrame, save_folder: str): # Download and unzip each file hucs = [] - huc_names = [] - out_files = [] - out_list = [] + # huc_names = [] + # out_files = [] + # out_list = [] logging.info("=======================================") logging.info("Extracting and downloading spatial data") @@ -237,35 
+256,45 @@ def download_and_extract_rasters(spatial_df: pd.DataFrame, save_folder: str): logging.info(f" Number of HUC records to download is {num_recs}") print() for idx, row in spatial_df.iterrows(): + # Extract HUC and HUC Name from URL url = row['URL'] - print(f" Downloading {idx+1} of {num_recs}") + print(f" Downloading {idx + 1} of {num_recs}") + logging.info("") logging.info("++++++++++++++++++++++++++") logging.info(f"URL is {url}") huc, huc_name = os.path.basename(os.path.dirname(url)).split('_') + # hucs.append(huc) + # huc_names.append(huc_name) # Check to ensure it is a valid HUC (some are not) is_ble_huc_valid, ___ = is_valid_huc(huc) if not is_ble_huc_valid: logging.info(f"{huc} : Invalid huc") + # out_files.append("") continue + spatial_df.at[idx, 'HUC'] = huc + spatial_df.at[idx, 'HUC_Name'] = huc_name + hucs.append(huc) - huc_names.append(huc_name) + # huc_names.append(huc_name) # logging.info logging.info(f"{huc} : Downloading and extracting for {huc_name}") # Download file save_file = os.path.join(save_folder, os.path.basename(url)) - logging.info(f"Saving download file to {save_file}") + logging.info(f"{huc} : Saving download file to {save_file}") if not os.path.exists(save_file): http_response = urlopen(url) zipfile = ZipFile(BytesIO(http_response.read())) zipfile.extractall(path=save_file) + else: + logging.info(f"{huc} : File already exists. Skipping download.") # Loading gdb logging.info(f"{huc} : Loading gdb") @@ -279,33 +308,30 @@ def download_and_extract_rasters(spatial_df: pd.DataFrame, save_folder: str): src_ds = None # Find depth rasters - # huc_rasters = [] + raster_paths = [] for depth_raster in depth_rasters: out_file = os.path.join(save_folder, f'{huc}_{depth_raster}.tif') if not os.path.exists(out_file): # Read raster data from GDB - print(f'{huc} : Reading {depth_raster}') + logging.info(f'{huc} : Reading {depth_raster}') depth_raster_path = [item[0] for item in subdatasets if depth_raster in item[1]][0] if extract_raster(huc, depth_raster_path, out_file) is False: continue - # huc_rasters.append(out_file) - out_list.append(out_file) + raster_paths.append(out_file) - # if len(huc_rasters) > 0: + # out_list.append(out_file) - # spatial_df.at[idx, 'HUC'] = huc - # spatial_df.at[idx, 'HUC_Name'] = huc_name - # spatial_df.at[idx, 'raster_paths'] = huc_rasters + if len(raster_paths) > 0: + spatial_df.at[idx, 'raster_paths'] = raster_paths - # hucs.append(huc) # huc_names.append(huc_name) # out_files.append(str(huc_rasters)) - if len(out_list) > 0: - out_files.append(out_list) + # if len(out_list) > 0: + # out_files.append(out_list) if len(hucs) == 0: logging.info("No valid rasters remain. System aborted.") @@ -313,14 +339,18 @@ def download_and_extract_rasters(spatial_df: pd.DataFrame, save_folder: str): # this is ugly + # print(spatial_df) + # print("final hucs") # print(hucs) # print(huc_names) # print(out_files) - spatial_df['HUC'] = hucs - spatial_df['HUC_Name'] = huc_names - spatial_df['raster_paths'] = out_files + # spatial_df['HUC'] = hucs + # spatial_df['HUC_Name'] = huc_names + # spatial_df['raster_paths'] = out_files + + # print(out_files) spatial_df.to_csv(os.path.join(save_folder, "spatial_db_w_rasters.csv"), header=True, index=False) print() @@ -358,7 +388,9 @@ def extract_raster(huc: str, in_raster: str, out_raster: str): # we tried some gdal warp and various things to get it work. # For now, we will just a have to skip anyting that is not 10m. 
- if gt[1] != 10.0: + logging.info(f"{huc} : Raster reesolution is {gt[1]}") + + if int(gt[1]) != 10: # # resample # layer_name = in_raster # # rename the file @@ -380,25 +412,18 @@ def extract_raster(huc: str, in_raster: str, out_raster: str): # print(gt) logging.info( - f"Dropping {huc} raster not resolution of 10 m. It is {gt[1]} : raster input = {in_raster}" + f"{huc} dropping raster not resolution of 10 m. It is {gt[1]} : raster input = {in_raster}" ) return False - # print(f" --- read as array : {dt.datetime.now(dt.timezone.utc).strftime('%H:%M:%S')}") data = data_ds.ReadAsArray() - # print(f" --- GetRasterBand : {dt.datetime.now(dt.timezone.utc).strftime('%H:%M:%S')}") nodata = data_ds.GetRasterBand(1).GetNoDataValue() - driver = gdal.GetDriverByName('GTiff') out_ds = driver.Create(out_raster, data.shape[1], data.shape[0], 1, gdal.GDT_Float32) - # print(f" --- SetGeoTransform : {dt.datetime.now(dt.timezone.utc).strftime('%H:%M:%S')}") out_ds.SetGeoTransform(data_ds.GetGeoTransform()) - # print(f" --- SetProjection : {dt.datetime.now(dt.timezone.utc).strftime('%H:%M:%S')}") out_ds.SetProjection(data_ds.GetProjection()) - # print(f" --- WriteArray : {dt.datetime.now(dt.timezone.utc).strftime('%H:%M:%S')}") out_ds.GetRasterBand(1).WriteArray(data) out_ds.GetRasterBand(1).SetNoDataValue(nodata) - # print(f" --- open raster done : {dt.datetime.now(dt.timezone.utc).strftime('%H:%M:%S')}") data_ds = None out_ds = None diff --git a/data/ble/ble_benchmark/create_flow_forecast_file.py b/data/ble/ble_benchmark/create_flow_forecast_file.py index 0979c8b76..72361919c 100755 --- a/data/ble/ble_benchmark/create_flow_forecast_file.py +++ b/data/ble/ble_benchmark/create_flow_forecast_file.py @@ -4,12 +4,16 @@ import os import fiona +import fiona._env +import fiona.env import geopandas as gpd import numpy as np import pandas as pd +import rasterio._env +from osgeo import gdal -gpd.options.io_engine = "pyogrio" +# gpd.options.io_engine = "pyogrio" def create_flow_forecast_file( @@ -58,6 +62,10 @@ def create_flow_forecast_file( print(" ******************************************") # print(locals()) + # gdal.SetConfigOption('GDAL_DATA', rasterio._env.get_gdal_data()) + # gdal.SetConfigOption('CPL_LOG', '/dev/null') + # gdal.SetConfigOption('CPL_DEBUG', 'OFF') + def fill_missing_flows(forecast: pd.DataFrame, nwm_river_layer: pd.DataFrame): """ This function will fill in missing flows in the forecast dataframe. @@ -84,7 +92,7 @@ def fill_missing_flows(forecast: pd.DataFrame, nwm_river_layer: pd.DataFrame): nonintersects = merged[merged['discharge'].isna()] - print(f'Iteration {n}. {len(nonintersects)} segments remaining.') + # print(f'Iteration {n}. 
{len(nonintersects)} segments remaining.') updated = False for i, row in nonintersects.iterrows(): @@ -175,7 +183,6 @@ def fill_missing_flows(forecast: pd.DataFrame, nwm_river_layer: pd.DataFrame): forecast['discharge'] = forecast['discharge'] * dischargeMultiplier # Assign flow to segments missing flows - print(f'Filling in missing flows for {return_period[i]} flow.') forecast = fill_missing_flows(forecast, nwm_river_layer) # Set paths and write file @@ -185,6 +192,8 @@ def fill_missing_flows(forecast: pd.DataFrame, nwm_river_layer: pd.DataFrame): path_to_csv = os.path.join(dir_of_csv, "ble_huc_{}_flows_{}.csv".format(huc, return_period[i])) forecast.to_csv(path_to_csv, index=False) + return + if __name__ == '__main__': # Parse arguments diff --git a/data/ble/ble_benchmark/preprocess_benchmark.py b/data/ble/ble_benchmark/preprocess_benchmark.py index 999fef7e6..438679b3d 100755 --- a/data/ble/ble_benchmark/preprocess_benchmark.py +++ b/data/ble/ble_benchmark/preprocess_benchmark.py @@ -5,14 +5,22 @@ import numpy as np import rasterio +import rasterio._env -# import rasterio._env # import rasterio._path # import rasterio.enums import rasterio.mask +from osgeo import gdal from rasterio.warp import Resampling, calculate_default_transform, reproject +# gdal.SetConfigOption('GDAL_DATA', rasterio._env.get_gdal_data()) +# gdal.SetConfigOption('CPL_LOG', 'NUL') +# gdal.SetConfigOption('CPL_DEBUG', 'OFF') +# gdal.SetConfigOption('OLC_FASTSETNEXTBYINDEX', '1') +# gdal.SetConfigOption('OLC_FASTFEATURECOUNT', '1') + + def preprocess_benchmark_static(benchmark_raster, reference_raster, out_raster_path=None): ''' This function will preprocess a benchmark dataset for purposes of evaluating FIM output. @@ -39,19 +47,16 @@ def preprocess_benchmark_static(benchmark_raster, reference_raster, out_raster_p ''' - # gdal.SetConfigOption('CPL_LOG', '/dev/null') - # gdal.SetConfigOption('CPL_DEBUG', 'False') - - # with rasterio.Env(CPL_DEBUG=False, + # with rasterio.Env(CPL_DEBUG="OFF"): # GDAL_CACHEMAX=536870912, - # CPL_CURL_VERBOSE=False, # GDAL_DATA=rasterio._env.get_gdal_data() ): # Open and read raster and benchmark rasters - logging.info(f"--- Opening reference_raster: {reference_raster}") - reference = rasterio.open(reference_raster, "r") - logging.info(f"--- Opening benchmark_raster: {benchmark_raster}") - benchmark = rasterio.open(benchmark_raster, "r") + # logging.info(f"--- Opening reference_raster: {reference_raster}") + # with rasterio.Env(): + reference = rasterio.open(reference_raster) + # logging.info(f"--- Opening benchmark_raster: {benchmark_raster}") + benchmark = rasterio.open(benchmark_raster) benchmark_arr = benchmark.read(1) # Set arbitrary no data value that is not possible value of the benchmark dataset. It is reassigned later. 
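The logger that `__setup_logger` arrives at in the patch above fans one root logger out to two handlers, DEBUG and up to the console and INFO and up to a file, with one shared formatter. Reduced to a self-contained sketch (the log file name here is hypothetical):

    import logging

    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)  # root must be at least as verbose as any handler
    fmt = logging.Formatter("%(asctime)s >> %(message)s", "%Y-%m-%d %H:%M:%S")

    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.DEBUG)
    console_handler.setFormatter(fmt)
    logger.addHandler(console_handler)

    file_handler = logging.FileHandler("example.log")  # hypothetical file name
    file_handler.setLevel(logging.INFO)  # the file keeps INFO and above only
    file_handler.setFormatter(fmt)
    logger.addHandler(file_handler)

    logging.debug("console only")
    logging.info("console and file")
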
From 6e393f90a7d046304d955727701c036eb34730f3 Mon Sep 17 00:00:00 2001
From: Rob Hanna
Date: Fri, 17 May 2024 12:47:32 +0000
Subject: [PATCH 9/9] running in full scale now

---
 .../ble/ble_benchmark/create_ble_benchmark.py | 101 ++++++++----------
 .../create_flow_forecast_file.py              |   5 -
 .../ble/ble_benchmark/preprocess_benchmark.py |  16 ---
 3 files changed, 43 insertions(+), 79 deletions(-)

diff --git a/data/ble/ble_benchmark/create_ble_benchmark.py b/data/ble/ble_benchmark/create_ble_benchmark.py
index e256c2b32..a8823fa72 100644
--- a/data/ble/ble_benchmark/create_ble_benchmark.py
+++ b/data/ble/ble_benchmark/create_ble_benchmark.py
@@ -16,8 +16,6 @@
 import numpy as np
 import pandas as pd
-
-# import fiona
 import rasterio
 from create_flow_forecast_file import create_flow_forecast_file
 from osgeo import gdal
@@ -35,11 +33,6 @@ def __setup_logger(output_folder_path):
     log_file_name = f"create_ble_benchmark-{file_dt_string}.log"

     log_file_path = os.path.join(output_folder_path, log_file_name)
-    # logging.basicConfig(
-    #     format='%(asctime)s -- ',
-    #     level=logging.INFO,
-    #     datefmt='%Y-%m-%d %H:%M:%S')
-
     logger = logging.getLogger()
     logger.setLevel(logging.DEBUG)
     fmt = logging.Formatter("%(asctime)s >> %(message)s", "%Y-%m-%d %H:%M:%S")
@@ -65,6 +58,7 @@ def create_ble_benchmark(
     nwm_stream_layer_name: str,
     nwm_feature_id_field: str,
     huc: str = None,
+    num_jobs: int = 1,
 ):
     """
     Downloads and preprocesses BLE benchmark datasets for purposes of evaluating FIM output.
@@ -148,6 +142,7 @@ def create_ble_benchmark(
     logging.info("")

     num_recs = len(spatial_df)
+
     for i, row in spatial_df.iterrows():
         huc = row['HUC']
         logging.info("++++++++++++++++++++++++++")
@@ -245,9 +240,6 @@ def download_and_extract_rasters(spatial_df: pd.DataFrame, save_folder: str):

     # Download and unzip each file
     hucs = []
-    # huc_names = []
-    # out_files = []
-    # out_list = []

     logging.info("=======================================")
     logging.info("Extracting and downloading spatial data")
@@ -266,8 +258,6 @@ def download_and_extract_rasters(spatial_df: pd.DataFrame, save_folder: str):
         logging.info(f"URL is {url}")

         huc, huc_name = os.path.basename(os.path.dirname(url)).split('_')
-        # hucs.append(huc)
-        # huc_names.append(huc_name)

         # Check to ensure it is a valid HUC (some are not)
         is_ble_huc_valid, ___ = is_valid_huc(huc)
@@ -281,77 +271,64 @@ def download_and_extract_rasters(spatial_df: pd.DataFrame, save_folder: str):
         spatial_df.at[idx, 'HUC_Name'] = huc_name

         hucs.append(huc)
-        # huc_names.append(huc_name)

         # logging.info
         logging.info(f"{huc} : Downloading and extracting for {huc_name}")

         # Download file
-        save_file = os.path.join(save_folder, os.path.basename(url))
-        logging.info(f"{huc} : Saving download file to {save_file}")
-        if not os.path.exists(save_file):
-            http_response = urlopen(url)
-            zipfile = ZipFile(BytesIO(http_response.read()))
-            zipfile.extractall(path=save_file)
-        else:
-            logging.info(f"{huc} : File already exists. Skipping download.")
+        try:
+            save_file = os.path.join(save_folder, os.path.basename(url))
+            logging.info(f"{huc} : Saving download file to {save_file}")
+            if not os.path.exists(save_file):
+                http_response = urlopen(url)
+                zipfile = ZipFile(BytesIO(http_response.read()))
+                zipfile.extractall(path=save_file)
+            else:
+                logging.info(f"{huc} : File already exists. Skipping download.")
+        except Exception:
+            logging.info(f"HUC {huc} - An error occurred while downloading and unzipping.")
+            logging.info(traceback.format_exc())
+            continue

         # Loading gdb
         logging.info(f"{huc} : Loading gdb")
         gdb_list = glob(os.path.join(save_file, '**', '*.gdb'), recursive=True)

         if len(gdb_list) == 1:
-            ble_geodatabase = gdb_list[0]
-            src_ds = gdal.Open(ble_geodatabase, gdal.GA_ReadOnly)
-            subdatasets = src_ds.GetSubDatasets()
+            raster_paths = []
+            try:
+                ble_geodatabase = gdb_list[0]
+                src_ds = gdal.Open(ble_geodatabase, gdal.GA_ReadOnly)
+                subdatasets = src_ds.GetSubDatasets()

-            src_ds = None
+                src_ds = None

-            # Find depth rasters
-            raster_paths = []
-            for depth_raster in depth_rasters:
-                out_file = os.path.join(save_folder, f'{huc}_{depth_raster}.tif')
+                # Find depth rasters
+                for depth_raster in depth_rasters:
+                    out_file = os.path.join(save_folder, f'{huc}_{depth_raster}.tif')

-                if not os.path.exists(out_file):
-                    # Read raster data from GDB
-                    logging.info(f'{huc} : Reading {depth_raster}')
-                    depth_raster_path = [item[0] for item in subdatasets if depth_raster in item[1]][0]
+                    if not os.path.exists(out_file):
+                        # Read raster data from GDB
+                        logging.info(f'{huc} : Reading {depth_raster}')
+                        depth_raster_path = [item[0] for item in subdatasets if depth_raster in item[1]][0]

-                    if extract_raster(huc, depth_raster_path, out_file) is False:
-                        continue
+                        if extract_raster(huc, depth_raster_path, out_file) is False:
+                            continue

-                raster_paths.append(out_file)
+                    raster_paths.append(out_file)

-                # out_list.append(out_file)
+            except Exception:
+                logging.info(f"HUC {huc} - An error occurred while getting rasters.")
+                logging.info(traceback.format_exc())
+                continue

             if len(raster_paths) > 0:
                 spatial_df.at[idx, 'raster_paths'] = raster_paths

-    # huc_names.append(huc_name)
-    # out_files.append(str(huc_rasters))
-
-    # if len(out_list) > 0:
-    #     out_files.append(out_list)

     if len(hucs) == 0:
         logging.info("No valid rasters remain. System aborted.")
         sys.exit(1)

-    # this is ugly
-
-    # print(spatial_df)
-
-    # print("final hucs")
-    # print(hucs)
-    # print(huc_names)
-    # print(out_files)
-
-    # spatial_df['HUC'] = hucs
-    # spatial_df['HUC_Name'] = huc_names
-    # spatial_df['raster_paths'] = out_files
-
-    # print(out_files)
-
     spatial_df.to_csv(os.path.join(save_folder, "spatial_db_w_rasters.csv"), header=True, index=False)

     print()
     logging.info("Extracting and downloading spatial data - COMPLETE")
@@ -475,6 +452,14 @@ def extract_raster(huc: str, in_raster: str, out_raster: str):
         required=False,
         default='ID',
     )
+    parser.add_argument(
+        '-j',
+        '--num_jobs',
+        help='Number of jobs, i.e. the number of processes allowed to run at one time',
+        required=False,
+        default=1,
+        type=int,
+    )

     args = vars(parser.parse_args())

diff --git a/data/ble/ble_benchmark/create_flow_forecast_file.py b/data/ble/ble_benchmark/create_flow_forecast_file.py
index 72361919c..40de89c2f 100755
--- a/data/ble/ble_benchmark/create_flow_forecast_file.py
+++ b/data/ble/ble_benchmark/create_flow_forecast_file.py
@@ -60,11 +60,6 @@ def create_flow_forecast_file(
     '''

     print(" ******************************************")
-    # print(locals())
-
-    # gdal.SetConfigOption('GDAL_DATA', rasterio._env.get_gdal_data())
-    # gdal.SetConfigOption('CPL_LOG', '/dev/null')
-    # gdal.SetConfigOption('CPL_DEBUG', 'OFF')

     def fill_missing_flows(forecast: pd.DataFrame, nwm_river_layer: pd.DataFrame):
         """
diff --git a/data/ble/ble_benchmark/preprocess_benchmark.py b/data/ble/ble_benchmark/preprocess_benchmark.py
index 438679b3d..cbf17b59e 100755
--- a/data/ble/ble_benchmark/preprocess_benchmark.py
+++ b/data/ble/ble_benchmark/preprocess_benchmark.py
@@ -6,21 +6,11 @@
 import numpy as np
 import rasterio
 import rasterio._env
-
-# import rasterio._path
-# import rasterio.enums
 import rasterio.mask
 from osgeo import gdal
 from rasterio.warp import Resampling, calculate_default_transform, reproject

-# gdal.SetConfigOption('GDAL_DATA', rasterio._env.get_gdal_data())
-# gdal.SetConfigOption('CPL_LOG', 'NUL')
-# gdal.SetConfigOption('CPL_DEBUG', 'OFF')
-# gdal.SetConfigOption('OLC_FASTSETNEXTBYINDEX', '1')
-# gdal.SetConfigOption('OLC_FASTFEATURECOUNT', '1')
-
-
 def preprocess_benchmark_static(benchmark_raster, reference_raster, out_raster_path=None):
     '''
     This function will preprocess a benchmark dataset for purposes of evaluating FIM output.
@@ -47,12 +37,6 @@ def preprocess_benchmark_static(benchmark_raster, reference_raster, out_raster_p

     '''

-    # with rasterio.Env(CPL_DEBUG="OFF"):
-    #                   GDAL_CACHEMAX=536870912,
-    #                   GDAL_DATA=rasterio._env.get_gdal_data() ):
-
-    # Open and read raster and benchmark rasters
-    # logging.info(f"--- Opening reference_raster: {reference_raster}")
     # with rasterio.Env():
     reference = rasterio.open(reference_raster)
     # logging.info(f"--- Opening benchmark_raster: {benchmark_raster}")
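A closing note on the resolution guard introduced in the first patch: extract_raster drops any BLE depth raster whose pixel size is not 10 m, reading that size from element 1 of the GDAL GeoTransform. The standalone sketch below shows just that check, assuming only a readable raster path and the stock GDAL Python bindings; pixel_size_is_10m is a hypothetical helper name, not a function in the repository.

from osgeo import gdal


def pixel_size_is_10m(raster_path: str) -> bool:
    # Open read-only, as extract_raster does.
    ds = gdal.Open(raster_path, gdal.GA_ReadOnly)
    if ds is None:
        return False  # unreadable raster: treat it like a failed check

    # GetGeoTransform() returns six affine terms:
    # (origin_x, pixel_width, row_rotation, origin_y, col_rotation, pixel_height)
    gt = ds.GetGeoTransform()
    ds = None  # release the dataset handle, mirroring the patch

    # pixel_width (gt[1]) is in CRS units; the BLE rasters here are metre-based,
    # so truncating to int matches the patch's `int(gt[1]) != 10` test.
    return int(gt[1]) == 10

Comparing int(gt[1]) against 10, as the final code does, tolerates near-10 values such as 10.0000001 that the earlier gt[1] != 10.0 float-equality test would have rejected, which appears to be the motivation for the change.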