Merge pull request #19 from lekoenig/add-nhd-met-data

Add nhd met data
USGS-R · Sep 6, 2022 · f4ab37a · f4ab37a
2 parents f328c58 + 0cb3652
commit f4ab37a
Show file tree

Hide file tree

Showing 6 changed files with 314 additions and 64 deletions.
diff --git a/1_fetch.R b/1_fetch.R
@@ -7,22 +7,18 @@ source("1_fetch/src/read_netcdf.R")
 
 p1_targets_list <- list(
 
-  # Read in the NHM - NHDv2 crosswalk file
+  # Important! This pipeline uses two versions of the NHM-NHDv2 crosswalk table
+  # for different purposes. All targets pertaining to the "dendritic" version
+  # of the crosswalk table will include the string "dendritic" in the name.
+  # Crosswalk #1: Read in the NHM - NHDv2 crosswalk file that contains all 
+  # NHDPlusv2 COMIDs in the DRB (including divergent reaches and reaches 
+  # without catchments)
   tar_target(
     p1_GFv1_NHDv2_xwalk,
     read_csv(GFv1_NHDv2_xwalk_url,col_types = cols(.default = "c"))
   ),
 
-  # Reshape crosswalk table to return all COMIDs that overlap each NHM segment
-  tar_target(
-    p1_drb_comids_segs, 
-    p1_GFv1_NHDv2_xwalk %>%
-      select(PRMS_segid, segidnat, comid_seg) %>% 
-      tidyr::separate_rows(comid_seg,sep=";") %>% 
-      rename(COMID = comid_seg)
-  ),
-
-  # Reshape crosswalk table to return all COMIDs that drain to each NHM segment
+  # Reshape crosswalk table to return all COMIDs that drain to each NHM segment.
   tar_target(
     p1_drb_comids_all_tribs, 
     p1_GFv1_NHDv2_xwalk %>%
@@ -31,25 +27,38 @@ p1_targets_list <- list(
       rename(COMID = comid_cat)
   ),
 
-  # Use crosswalk table to fetch just the NHDv2 reaches that overlap the NHM network
+  # Reshape crosswalk table to return all COMIDs that overlap each NHM segment.
   tar_target(
-    p1_nhd_reaches_along_NHM,
-    download_nhdplus_flowlines(p1_drb_comids_segs$COMID)
+    p1_drb_comids_segs, 
+    p1_GFv1_NHDv2_xwalk %>%
+      select(PRMS_segid, segidnat, comid_seg) %>% 
+      tidyr::separate_rows(comid_seg,sep=";") %>% 
+      rename(COMID = comid_seg)
   ),
 
-  # Use crosswalk table to fetch all NHDv2 reaches in the DRB 
+  # Use crosswalk table to fetch all NHDv2 reaches in the DRB. These COMIDs 
+  # should be used for preparing feature data, including aggregating feature 
+  # values from the NHD-scale to the NHM-scale and/or for deriving feature 
+  # values from raster data.
   tar_target(
     p1_nhd_reaches,
     download_nhdplus_flowlines(p1_drb_comids_all_tribs$COMID)
   ),
 
-  ## May take a while to run
+  # Use crosswalk table to fetch just the NHDv2 reaches that overlap the NHM network.
+  tar_target(
+    p1_nhd_reaches_along_NHM,
+    download_nhdplus_flowlines(p1_drb_comids_segs$COMID)
+  ),
+
+  # Download all NHDPlusv2 catchments in the DRB (may take awhile to run).
   tar_target(
     p1_nhd_catchments,
     get_nhdplusv2_catchments(comid = p1_nhd_reaches$comid)
   ),
 
-  ## polygons are analogous to HRU 
+  # Dissolve NHDPlusv2 catchments to create a single catchment polygon for
+  # each NHM segment (analogous to HRU).
   tar_target(
     p1_nhm_catchments_dissolved,
     {sf_use_s2(FALSE)
@@ -61,13 +70,32 @@ p1_targets_list <- list(
     }
   ),
 
-  # Track depth to bedrock raster dataset in 1_fetch/in
+  # Crosswalk #2: Read in the NHM - NHDv2 crosswalk file that corresponds to 
+  # the *dendritic* network only (i.e., divergent reaches have been omitted). 
+  # This version of the crosswalk table was used to build the network distance
+  # matrix in drb-network-prep, and so includes the COMIDs that we will make 
+  # predictions on in the NHD-downscaling set of experiments. 
   tar_target(
-    p1_depth_to_bedrock_tif,
-    Shangguan_dtb_cm_250m_clip_path,
-    format = "file"
+    p1_GFv1_NHDv2_xwalk_dendritic,
+    read_csv(GFv1_NHDv2_xwalk_dendritic_url,col_types = cols(.default = "c"))
   ),
 
+  # Reshape dendritic crosswalk table to return all COMIDs that overlap each 
+  # NHM segment
+  tar_target(
+    p1_drb_comids_dendritic_segs, 
+    p1_GFv1_NHDv2_xwalk_dendritic %>%
+      select(PRMS_segid, segidnat, comid_seg) %>% 
+      tidyr::separate_rows(comid_seg,sep=";") %>% 
+      rename(COMID = comid_seg)
+  ),
+
+  # Use crosswalk table to fetch just the dendritic NHDv2 reaches that overlap
+  # the NHM network
+  tar_target(
+    p1_dendritic_nhd_reaches_along_NHM,
+    download_nhdplus_flowlines(p1_drb_comids_dendritic_segs$COMID)
+  ),
 
   # Manually download temperature site locations from ScienceBase using the
   # commented-out code below and place the downloaded zip file in 1_fetch/in. 
@@ -78,7 +106,6 @@ p1_targets_list <- list(
   # download_sb_file(sb_id = "623e54c4d34e915b67d83580",
   #                 file_name = "study_monitoring_sites.zip",
   #                 out_dir = "1_fetch/in")
-
   tar_target(
     p1_drb_temp_sites_shp,
     {
@@ -147,6 +174,17 @@ p1_targets_list <- list(
     read_netcdf(p1_sntemp_input_output_nc)
   ),
 
+  # Read in meteorological data aggregated to NHDPlusV2 catchments for the 
+  # DRB (prepped in https://github.com/USGS-R/drb_gridmet_tools). Note that
+  # the DRB met data file must be stored in 1_fetch/in. If working outside
+  # of tallgrass/caldera, this file will need to be downloaded from the
+  # PGDL-DO project's S3 bucket and manually placed in 1_fetch/in.
+  tar_target(
+    p1_drb_nhd_gridmet,
+    "1_fetch/in/drb_climate_2022_06_14.nc",
+    format = "file"
+  ),
+
   # Download ref-gages v0.6 to help QC matching NWIS sites to NHDv2 flowlines
   tar_target(
     p1_ref_gages_geojson,
@@ -163,16 +201,16 @@ p1_targets_list <- list(
   ),
 
   # STATSGO SOIL Characteristics
-  ## get selected child items nhdv2 STATSGO Soil Characteristics
-  ## 1) Text attributes and 2) Layer attributes
+  # get selected child items nhdv2 STATSGO Soil Characteristics,
+  # including 1) texture and 2) layer attributes.
   tar_target(
     p1_selected_statsgo_sbid_children,
     sbtools::item_list_children(sb_id = nhd_statsgo_parent_sbid) %>% 
       Filter(function(x){str_detect(x[['title']],'Text|Layer')},
              .)
-    ),
+  ),
 
-  ## download selected CONUS STATSGO datasets from Science base
+  # Download selected CONUS STATSGO datasets from Science base
   tar_target(
     p1_download_statsgo_text_layer_zip,
     lapply(p1_selected_statsgo_sbid_children,
@@ -182,9 +220,10 @@ p1_targets_list <- list(
                                         overwrite_file = TRUE)}
            ) %>%
       unlist(),
-    format = 'file'),
+    format = 'file'
+  ),
 
-  ## Combine statsgo TEXT and Layer Attributes for CAT and TOT and filter to drb
+  # Combine statsgo TEXT and Layer Attributes for CAT and TOT and filter to drb
   tar_target(
     p1_statsgo_soil_df,
     sb_read_filter_by_comids(data_path = '1_fetch/out/statsgo',
@@ -194,5 +233,13 @@ p1_targets_list <- list(
                                                         "NO4AVE","SILTAVE","CLAYAVE",
                                                         "SANDAVE",'WTDEP'),
                              cbind = TRUE)
+  ),
+
+  # Track depth to bedrock raster dataset in 1_fetch/in
+  tar_target(
+    p1_depth_to_bedrock_tif,
+    Shangguan_dtb_cm_250m_clip_path,
+    format = "file"
   )
+
 )
diff --git a/2_process.R b/2_process.R
@@ -2,9 +2,18 @@ source("2_process/src/subset_closest_nhd.R")
 source('2_process/src/raster_in_polygon_weighted_mean.R')
 source("2_process/src/estimate_mean_width.R")
 source("2_process/src/write_data.R")
+source("2_process/src/combine_nhd_input_drivers.R")
 
 p2_targets_list <- list(
 
+  # Subset NHDv2 reaches that overlap the NHM network to only include those 
+  # that have a corresponding catchment (and meteorological data)
+  tar_target(
+    p2_dendritic_nhd_reaches_along_NHM_w_cats,
+    p1_dendritic_nhd_reaches_along_NHM %>%
+      filter(areasqkm > 0)
+  ),
+
   # Match temperature monitoring locations to "mainstem" NHDPlusv2 flowline
   # reaches, i.e. those that NHD reaches that intersect the NHM river network.
   # Note that this site-to-segment matching procedure emulates the process
@@ -13,7 +22,7 @@ p2_targets_list <- list(
   # for which the downstream vertex (endpoint) is close to the site point.
   tar_target(
     p2_drb_temp_sites_w_segs,
-    subset_closest_nhd(nhd_lines = p1_nhd_reaches_along_NHM,
+    subset_closest_nhd(nhd_lines = p2_dendritic_nhd_reaches_along_NHM_w_cats,
                        sites = p1_drb_temp_sites_sf)
   ),
 
@@ -60,15 +69,15 @@ p2_targets_list <- list(
   # Estimate mean width for each "mainstem" NHDv2 reach 
   tar_target(
     p2_nhd_mainstem_reaches_w_width,
-    estimate_mean_width(p1_nhd_reaches_along_NHM, 
+    estimate_mean_width(p2_dendritic_nhd_reaches_along_NHM_w_cats, 
                         estimation_method = 'nwis',
                         network_pos_variable = 'arbolate_sum',
                         ref_gages = p1_ref_gages_sf)
   ),
 
   # Pull static segment attributes from PRMS SNTemp model driver data
   tar_target(
-    p2_input_drivers_prms,
+    p2_static_input_drivers_prms,
     p1_sntemp_input_output %>%
       group_by(seg_id_nat) %>%
       summarize(seg_elev = unique(seg_elev),
@@ -78,42 +87,85 @@ p2_targets_list <- list(
       select(segidnat, seg_elev, seg_slope, seg_width)
   ),
 
-  # Compile river-dl input drivers at NHDv2 resolution, including river width
-  # (meters), slope (unitless), and min/max elevation (transformed to meters
-  # from cm). Note that these input drivers represent "mainstem" NHDv2 reaches 
-  # only (i.e., those reaches that intersect the NHM fabric).
+  # Pull dynamic segment attributes from PRMS SNTemp model driver data
+  tar_target(
+    p2_dynamic_input_drivers_prms,
+    p1_sntemp_input_output %>%
+      mutate(segidnat = as.character(seg_id_nat)) %>%
+      select(segidnat, date, seginc_potet)
+  ),
+
+  # Subset the DRB meteorological data to only include the NHDPlusv2 catchments 
+  # (COMID) that intersect the NHM segments. `subset_nc_to_comid()` originally
+  # developed by Jeff Sadler as part of the PGDL-DO project:
+  # https://github.com/USGS-R/drb-do-ml/blob/main/2_process/src/subset_nc_to_comid.py
+  # The resulting target is ~0.6 GB.
+  tar_target(
+    p2_met_data_nhd_mainstem_reaches,
+    {
+      reticulate::source_python("2_process/src/subset_nc_to_comid.py")
+      subset_nc_to_comids(p1_drb_nhd_gridmet, 
+                          p2_nhd_mainstem_reaches_w_width$comid) %>%
+        as_tibble() %>%
+        relocate(c(COMID,time), .before = "tmmx") %>%
+        # format dates
+        mutate(date = lubridate::as_date(time, tz = "UTC")) %>%
+        # convert gridmet precip units from inches to meters, and temperature
+        # units from degrees Farenheit to degrees Celsius. 
+        # Note that we average the daily minimum and maximum temperatures from
+        # gridmet and call that "seg_tave_air", which we assume corresponds 
+        # approximately to the PRMS-SNTemp variable with the same name. 
+        mutate(tmmean = rowMeans(select(., c(tmmn,tmmx))),
+               seg_tave_air = ((tmmean - 32) * (5/9)),
+               seg_rain = pr * 0.0254) %>%
+        # rename gridmet columns to conform to PRMS-SNTemp names used in river-dl
+        select(COMID, date, seg_tave_air, srad, seg_rain) %>%
+        rename(seginc_swrad = srad)
+    }
+  ),
+
+  # Compile river-dl static input drivers at NHDv2 resolution, including river 
+  # width (m) slope (unitless), and min/max elevation (m). Note that these input 
+  # drivers represent "mainstem" NHDv2 reaches only (i.e., those reaches that 
+  # intersect the NHM fabric). Some of the columns in p2_static_input_drivers_nhd 
+  # are meant to facilitate comparison/EDA between segment attributes at the NHM and 
+  # NHDPlusv2 scales (i.e., seg_slope ~ slope_len_wtd_mean; seg_elev ~ seg_elev_min; 
+  # seg_width ~ seg_width_max). 
+  tar_target(
+    p2_static_input_drivers_nhd,
+    prepare_nhd_static_inputs(nhd_flowlines = p2_nhd_mainstem_reaches_w_width,
+                              prms_inputs = p2_static_input_drivers_prms,
+                              nhd_nhm_xwalk = p1_drb_comids_all_tribs)
+  ),
+
+  # Combine NHD-scale static input drivers with dynamic climate drivers. 
+  # Note that there is currently no pot ET analog variable at the NHD-scale,
+  # so we are using seginc_potet from PRMS-SNTemp and assuming that there is 
+  # not much intra-segment variation in potential ET among NHD reaches that 
+  # contribute to a given NHM segment. The resulting target contains 15,800 
+  # days of climate data across each of 3,173 COMIDs = 50,133,400 total rows.
   tar_target(
     p2_input_drivers_nhd,
-    p2_nhd_mainstem_reaches_w_width %>%
-      sf::st_drop_geometry() %>%
-      mutate(COMID = as.character(comid),
-             min_elev_m = minelevsmo/100, 
-             max_elev_m = maxelevsmo/100,
-             slope = if_else(slope == -9998, NA_real_, slope)) %>%
-      # add NHM segment identifier segidnat
-      left_join(y = p1_drb_comids_all_tribs, by = "COMID") %>%
-      rename(subsegid = PRMS_segid) %>%
-      # calculate length-weighted average slope for NHDv2 reaches associated
-      # with each NHM reach. For simplicity, weight by the reach length rather
-      # than another value-added attribute, slopelenkm, which represents the
-      # length over which the NHDv2 attribute slope was computed.
-      group_by(subsegid) %>%
-      mutate(slope_len_wtd_mean = weighted.mean(x = slope, w = lengthkm, na.rm = TRUE),
-             seg_width_max = max(est_width_m, na.rm = TRUE), 
-             seg_elev_min = min(min_elev_m, na.rm = TRUE)) %>%
-      ungroup() %>%
-      # join select attributes from PRMS-SNTemp
-      left_join(y = p2_input_drivers_prms, by = "segidnat") %>%
-      select(COMID, segidnat, subsegid, est_width_m, slope, slopelenkm, slope_len_wtd_mean, 
-             lengthkm, min_elev_m, max_elev_m, seg_elev, seg_slope, seg_width, seg_width_max,
-             seg_elev_min) 
+    {
+      static_inputs <- p2_static_input_drivers_nhd %>%
+        select(COMID, segidnat, subsegid, 
+               est_width_m, min_elev_m, slope) %>%
+        rename(seg_width_mean = est_width_m,
+               seg_elev = min_elev_m,
+               seg_slope = slope)
+      combine_nhd_input_drivers(nhd_static_inputs = static_inputs, 
+                                climate_inputs = p2_met_data_nhd_mainstem_reaches,
+                                prms_dynamic_inputs = p2_dynamic_input_drivers_prms,
+                                earliest_date = "1979-01-01",
+                                latest_date = "2022-04-04")
+    }
   ),
 
-  # Save river-dl input drivers at NHDv2 resolution as a feather file
+  # Save river-dl input drivers at NHDv2 resolution as a zarr data store
   tar_target(
     p2_input_drivers_nhd_zarr,
     write_df_to_zarr(p2_input_drivers_nhd, 
-                     index_cols = c("COMID"), 
+                     index_cols = c("date", "COMID"), 
                      "2_process/out/nhdv2_inputs_io.zarr"),
     format = "file"
   )