From 9c1f3af41924a31fa1eddb65177f3e3a96a25064 Mon Sep 17 00:00:00 2001
From: MattLuck-NOAA <matt.luck@noaa.gov>
Date: Mon, 8 Jul 2024 10:56:06 -0700
Subject: [PATCH] v4.5.2.5 Snap crosswalk to NWM streams (#1205)

---
 docs/CHANGELOG.md    |  11 +++
 src/add_crosswalk.py | 189 +++++++++++++++----------------------------
 2 files changed, 74 insertions(+), 126 deletions(-)
diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
index 07f1ed850..35de89aed 100644
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@@ -1,6 +1,17 @@
 All notable changes to this project will be documented in this file.
 We follow the [Semantic Versioning 2.0.0](http://semver.org/) format.
 
+## v4.5.2.5 - 2024-07-08 - [PR#1205](https://github.com/NOAA-OWP/inundation-mapping/pull/1205)
+
+Snaps crosswalk from the midpoint of DEM-derived reaches to the nearest point on NWM streams within a threshold of 100 meters. DEM-derived streams that do not locate any NWM streams within 100 meters of their midpoints are removed from the FIM hydrofabric and their catchments are not inundated.
+
+### Changes
+
+- `src/add_crosswalk.py`: Locates nearest NWM stream to midpoint of DEM-derived reaches if within 100 meters. Also fixes a couple of minor bugs. 
+
+<br/><br/>
+
+
 ## v4.5.2.4 - 2024-07-08 - [PR#1204](https://github.com/NOAA-OWP/inundation-mapping/pull/1204)
 
 Bug fix for extending outlets in order to ensure proper flow direction in depression filling algorithm. This PR adds a distance criteria that in order for the end of an outlet stream to be snapped to the wbd_buffered boundary, the end point must be less than 100 meters from the WBD boundary.
diff --git a/src/add_crosswalk.py b/src/add_crosswalk.py
index ddc9444df..62789c9b3 100755
--- a/src/add_crosswalk.py
+++ b/src/add_crosswalk.py
@@ -40,130 +40,70 @@ def add_crosswalk(
     input_catchments = gpd.read_file(input_catchments_fileName, engine="pyogrio", use_arrow=True)
     input_flows = gpd.read_file(input_flows_fileName, engine="pyogrio", use_arrow=True)
     input_huc = gpd.read_file(input_huc_fileName, engine="pyogrio", use_arrow=True)
+    input_nwmcat = gpd.read_file(input_nwmcat_fileName, engine="pyogrio", use_arrow=True)
     input_nwmflows = gpd.read_file(input_nwmflows_fileName, engine="pyogrio", use_arrow=True)
     min_catchment_area = float(min_catchment_area)  # 0.25#
     min_stream_length = float(min_stream_length)  # 0.5#
 
-    if extent == 'FR':
-        ## crosswalk using majority catchment method
-
-        # calculate majority catchments
-        majority_calc = zonal_stats(
-            input_catchments, input_nwmcatras_fileName, stats=['majority'], geojson_out=True
-        )
-        input_majorities = gpd.GeoDataFrame.from_features(majority_calc)
-        input_majorities = input_majorities.rename(columns={'majority': 'feature_id'})
-
-        input_majorities = input_majorities[:][input_majorities['feature_id'].notna()]
-        if input_majorities.feature_id.dtype != 'int':
-            input_majorities.feature_id = input_majorities.feature_id.astype(int)
-        if input_majorities.HydroID.dtype != 'int':
-            input_majorities.HydroID = input_majorities.HydroID.astype(int)
-
-        input_nwmflows = input_nwmflows.rename(columns={'ID': 'feature_id'})
-        if input_nwmflows.feature_id.dtype != 'int':
-            input_nwmflows.feature_id = input_nwmflows.feature_id.astype(int)
-        relevant_input_nwmflows = input_nwmflows[
-            input_nwmflows['feature_id'].isin(input_majorities['feature_id'])
-        ]
-        relevant_input_nwmflows = relevant_input_nwmflows.filter(items=['feature_id', 'order_'])
-
-        if input_catchments.HydroID.dtype != 'int':
-            input_catchments.HydroID = input_catchments.HydroID.astype(int)
-        output_catchments = input_catchments.merge(input_majorities[['HydroID', 'feature_id']], on='HydroID')
-        output_catchments = output_catchments.merge(
-            relevant_input_nwmflows[['order_', 'feature_id']], on='feature_id'
-        )
-
-        if input_flows.HydroID.dtype != 'int':
-            input_flows.HydroID = input_flows.HydroID.astype(int)
-        output_flows = input_flows.merge(input_majorities[['HydroID', 'feature_id']], on='HydroID')
-        if output_flows.HydroID.dtype != 'int':
-            output_flows.HydroID = output_flows.HydroID.astype(int)
-        output_flows = output_flows.merge(relevant_input_nwmflows[['order_', 'feature_id']], on='feature_id')
-        output_flows = output_flows.merge(
-            output_catchments.filter(items=['HydroID', 'areasqkm']), on='HydroID'
-        )
-
-    elif (extent == 'MS') | (extent == 'GMS'):
-        ## crosswalk using stream segment midpoint method
-        input_nwmcat = gpd.read_file(input_nwmcat_fileName, mask=input_huc, engine="fiona")
-
-        # only reduce nwm catchments to mainstems if running mainstems
-        if extent == 'MS':
-            input_nwmcat = input_nwmcat.loc[input_nwmcat.mainstem == 1]
-
-        input_nwmcat = input_nwmcat.rename(columns={'ID': 'feature_id'})
-        if input_nwmcat.feature_id.dtype != 'int':
-            input_nwmcat.feature_id = input_nwmcat.feature_id.astype(int)
-        input_nwmcat = input_nwmcat.set_index('feature_id')
-
-        input_nwmflows = input_nwmflows.rename(columns={'ID': 'feature_id'})
-        if input_nwmflows.feature_id.dtype != 'int':
-            input_nwmflows.feature_id = input_nwmflows.feature_id.astype(int)
-
-        # Get stream midpoint
-        stream_midpoint = []
-        hydroID = []
-        for i, lineString in enumerate(input_flows.geometry):
-            hydroID = hydroID + [input_flows.loc[i, 'HydroID']]
-            stream_midpoint = stream_midpoint + [lineString.interpolate(0.5, normalized=True)]
-
-        input_flows_midpoint = gpd.GeoDataFrame(
-            {'HydroID': hydroID, 'geometry': stream_midpoint}, crs=input_flows.crs, geometry='geometry'
-        )
-        input_flows_midpoint = input_flows_midpoint.set_index('HydroID')
-
-        # Create crosswalk
-        crosswalk = gpd.sjoin(
-            input_flows_midpoint, input_nwmcat, how='left', predicate='within'
-        ).reset_index()
-        crosswalk = crosswalk.rename(columns={"index_right": "feature_id"})
-
-        # fill in missing ms
-        crosswalk_missing = crosswalk.loc[crosswalk.feature_id.isna()]
-        for index, stream in crosswalk_missing.iterrows():
-            # find closest nwm catchment by distance
-            distances = [stream.geometry.distance(poly) for poly in input_nwmcat.geometry]
-            min_dist = min(distances)
-            nwmcat_index = distances.index(min_dist)
-
-            # update crosswalk
-            crosswalk.loc[crosswalk.HydroID == stream.HydroID, 'feature_id'] = input_nwmcat.iloc[
-                nwmcat_index
-            ].name
-            crosswalk.loc[crosswalk.HydroID == stream.HydroID, 'AreaSqKM'] = input_nwmcat.iloc[
-                nwmcat_index
-            ].AreaSqKM
-            crosswalk.loc[crosswalk.HydroID == stream.HydroID, 'Shape_Length'] = input_nwmcat.iloc[
-                nwmcat_index
-            ].Shape_Length
-            crosswalk.loc[crosswalk.HydroID == stream.HydroID, 'Shape_Area'] = input_nwmcat.iloc[
-                nwmcat_index
-            ].Shape_Area
-
-        crosswalk = crosswalk.filter(items=['HydroID', 'feature_id'])
-        crosswalk = crosswalk.merge(input_nwmflows[['feature_id', 'order_']], on='feature_id')
-
-        if len(crosswalk) < 1:
-            print("No relevant streams within HUC boundaries.")
-            sys.exit(0)
-
-        if input_catchments.HydroID.dtype != 'int':
-            input_catchments.HydroID = input_catchments.HydroID.astype(int)
-        output_catchments = input_catchments.merge(crosswalk, on='HydroID')
-
-        if input_flows.HydroID.dtype != 'int':
-            input_flows.HydroID = input_flows.HydroID.astype(int)
-        output_flows = input_flows.merge(crosswalk, on='HydroID')
-
-        # added for GMS. Consider adding filter_catchments_and_add_attributes.py to run_by_branch.sh
-        if 'areasqkm' not in output_catchments.columns:
-            output_catchments['areasqkm'] = output_catchments.geometry.area / (1000**2)
-
-        output_flows = output_flows.merge(
-            output_catchments.filter(items=['HydroID', 'areasqkm']), on='HydroID'
-        )
+    input_catchments = input_catchments.dissolve(by='HydroID').reset_index()
+
+    ## crosswalk using stream segment midpoint method
+    input_nwmcat = gpd.read_file(input_nwmcat_fileName, mask=input_huc, engine="fiona")
+
+    # only reduce nwm catchments to mainstems if running mainstems
+    if extent == 'MS':
+        input_nwmcat = input_nwmcat.loc[input_nwmcat.mainstem == 1]
+
+    input_nwmcat = input_nwmcat.rename(columns={'ID': 'feature_id'})
+    if input_nwmcat.feature_id.dtype != 'int':
+        input_nwmcat.feature_id = input_nwmcat.feature_id.astype(int)
+    input_nwmcat = input_nwmcat.set_index('feature_id')
+
+    input_nwmflows = input_nwmflows.rename(columns={'ID': 'feature_id'})
+    if input_nwmflows.feature_id.dtype != 'int':
+        input_nwmflows.feature_id = input_nwmflows.feature_id.astype(int)
+    input_nwmflows = input_nwmflows.set_index('feature_id')
+
+    # Get stream midpoint
+    stream_midpoint = []
+    hydroID = []
+    for i, lineString in enumerate(input_flows.geometry):
+        hydroID = hydroID + [input_flows.loc[i, 'HydroID']]
+        stream_midpoint = stream_midpoint + [lineString.interpolate(0.5, normalized=True)]
+
+    input_flows_midpoint = gpd.GeoDataFrame(
+        {'HydroID': hydroID, 'geometry': stream_midpoint}, crs=input_flows.crs, geometry='geometry'
+    )
+    input_flows_midpoint = input_flows_midpoint.set_index('HydroID')
+
+    # Create crosswalk
+    crosswalk = gpd.sjoin_nearest(
+        input_flows_midpoint, input_nwmflows, how='left', distance_col='distance'
+    ).reset_index()
+    crosswalk = crosswalk.rename(columns={"index_right": "feature_id"})
+
+    crosswalk.loc[crosswalk['distance'] > 100.0, 'feature_id'] = pd.NA
+
+    crosswalk = crosswalk.filter(items=['HydroID', 'feature_id', 'distance'])
+    crosswalk = crosswalk.merge(input_nwmflows[['order_']], on='feature_id')
+
+    if len(crosswalk) < 1:
+        print("No relevant streams within HUC boundaries.")
+        sys.exit(0)
+
+    if input_catchments.HydroID.dtype != 'int':
+        input_catchments.HydroID = input_catchments.HydroID.astype(int)
+    output_catchments = input_catchments.merge(crosswalk, on='HydroID')
+
+    if input_flows.HydroID.dtype != 'int':
+        input_flows.HydroID = input_flows.HydroID.astype(int)
+    output_flows = input_flows.merge(crosswalk, on='HydroID')
+
+    # added for GMS. Consider adding filter_catchments_and_add_attributes.py to run_by_branch.sh
+    if 'areasqkm' not in output_catchments.columns:
+        output_catchments['areasqkm'] = output_catchments.geometry.area / (1000**2)
+
+    output_flows = output_flows.merge(output_catchments.filter(items=['HydroID', 'areasqkm']), on='HydroID')
 
     output_flows['ManningN'] = mannings_n
 
@@ -223,7 +163,7 @@ def add_crosswalk(
             elif len(output_flows.loc[output_flows.From_Node == to_node]['HydroID']) > 1:
                 try:
                     max_order = max(
-                        output_flows.loc[output_flows.From_Node == to_node]['HydroID']['order_']
+                        output_flows.loc[output_flows.From_Node == to_node]['order_']
                     )  # drainage area would be better than stream order but we would need to calculate
                 except Exception as e:
                     print(
@@ -276,7 +216,7 @@ def add_crosswalk(
                 update_id = output_flows.loc[output_flows.From_Node == to_node]['HydroID'].item()
 
             else:
-                update_id = output_flows.loc[output_flows.HydroID == short_id]['HydroID'].item()
+                update_id = output_flows[output_flows.HydroID == short_id]['HydroID'].iloc[0]
 
             output_order = output_flows.loc[output_flows.HydroID == short_id]['order_']
             if len(output_order) == 1:
@@ -347,10 +287,7 @@ def add_crosswalk(
                     ['Discharge (m3s-1)'],
                 ] = src_stage[1]
 
-    if extent == 'FR':
-        output_src = output_src.merge(input_majorities[['HydroID', 'feature_id']], on='HydroID')
-    elif (extent == 'MS') | (extent == 'GMS'):
-        output_src = output_src.merge(crosswalk[['HydroID', 'feature_id']], on='HydroID')
+    output_src = output_src.merge(crosswalk[['HydroID', 'feature_id']], on='HydroID')
 
     output_crosswalk = output_src[['HydroID', 'feature_id']]
     output_crosswalk = output_crosswalk.drop_duplicates(ignore_index=True)