diff --git a/.gitignore b/.gitignore index d5643ae0e..b49a75846 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,5 @@ config/*.swp .vscode/ **/.DS_Store .private/ +!tools/catfim/*.csv +!tools/catfim/catfim.env.template diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 01acf9e1b..83674c543 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,14 +2,13 @@ # Additional hooks: https://pre-commit.com/hooks.html repos: + - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.4.0 + rev: v4.0.1 hooks: - id: trailing-whitespace # Below is python regex to exclude all .md files exclude: .*md$ - - id: end-of-file-fixer - exclude: Pipfile.lock - id: check-added-large-files args: ['--maxkb=5000'] - id: check-json diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 08e634eee..a300cf957 100755 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -1,6 +1,14 @@ All notable changes to this project will be documented in this file. We follow the [Semantic Versioning 2.0.0](http://semver.org/) format. + +## v4.5.13.0 - 2024-12-10 - [PR#1285](https://github.com/NOAA-OWP/inundation-mapping/pull/1285) + +Major upgrades and bug fixes to the CatFIM product, informally called CatFIM 2.1. See the PR for all details + +

+ + ## v4.5.12.2 - 2024-12-10 - [PR#1346](https://github.com/NOAA-OWP/inundation-mapping/pull/1346) This PR updates deny lists to avoid saving unnecessary files. @@ -44,6 +52,8 @@ As part of this PR, small modification was applied to bridge_inundation.py.

+ + ## v4.5.11.3 - 2024-10-25 - [PR#1320](https://github.com/NOAA-OWP/inundation-mapping/pull/1320) The fix: During the post processing scan for the word "error" or "warning", it was only finding records which had either of those two words as stand alone words and not part of bigger phrases. ie); "error" was found, but not "fielderror". Added wildcards and it is now fixed. diff --git a/pyproject.toml b/pyproject.toml old mode 100644 new mode 100755 index c91682ff3..0cc98c52a --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,6 @@ maintainers = [ {name = "Hamideh Safa", email = "hamideh.safa@noaa.gov"}, {name = "James Coll", email = "james.coll@noaa.gov"}, {name = "Matt Luck", email = "matt.luck@noaa.gov"}, - {name = "Nick Chadwick", email = "nick.chadwick@noaa.gov"}, {name = "Riley McDermott", email = "riley.mcdermott@noaa.gov"}, {name = "Robert Hanna", email = "robert.hanna@noaa.gov"}, {name = "Ryan Spies", email = "ryan.spies@noaa.gov"} @@ -39,17 +38,24 @@ Wiki = "https://github.com/NOAA-OWP/inundation-mapping/wiki" # - Tools - [tool.black] -line_length = 110 skip-string-normalization = true skip-magic-trailing-comma = true - +line-length = 110 +exclude = ''' +/( + \.csv$ +)/ +''' [tool.isort] profile = 'black' line_length = 110 multi_line_output = 3 lines_after_imports = 2 +# extend-skip = [".csv"] +skip_glob = ["*.csv"] +# yes, the exclue format is different from black [tool.flake8] count = true doctests = true @@ -59,26 +65,21 @@ extend-ignore = """ E203, E266, E501, + E712, W503, - W391 - F403, + W391, F401, + F403, """ +exclude = ["*.csv"] + per-file-ignores = """ - src/subdiv_chan_obank_src.py: E712 - src/src_roughness_optimization.py: E712, F841 - src/agreedem.py: E712 + src/src_roughness_optimization.py: F841 src/build_stream_traversal.py: E722 - - tools/check_deep_flooding.py: E712 tools/eval_alt_catfim.py: F841 - tools/generate_categorical_fim.py: C901, E712 - tools/generate_categorical_fim_mapping.py: E712 tools/inundation.py: F821 tools/rating_curve_comparison.py: F821, F841 tools/run_test_case.py: E711 tools/tools_shared_functions.py: F821, F841, E711 - tools/vary_mannings_n_composite.py: E712 - data/usgs/rating_curve_get_usgs_curves.py: F841 """ diff --git a/src/utils/fim_logger.py b/src/utils/fim_logger.py index 5a79c7fb8..770761f3e 100644 --- a/src/utils/fim_logger.py +++ b/src/utils/fim_logger.py @@ -2,8 +2,6 @@ import datetime as dt import os -import random -import traceback from pathlib import Path @@ -147,7 +145,8 @@ def MP_Log_setup(self, parent_log_output_file, file_prefix): using a defined file path. 
As this is an MP file, the parent_log_output_file may have a date in it - The file name is calculated as such {file_prefix}-{random 12 digit key}.log() + The file name is calculated as such + {file_prefix}-{current datetime with milliseconds}.log ie) catfim_2024_07_09-16_30_02__012345678901.log The extra file portion is added as in MultiProc, you can have dozens of processes @@ -165,8 +164,10 @@ def MP_Log_setup(self, parent_log_output_file, file_prefix): # ----------------- log_folder = os.path.dirname(parent_log_output_file) - random_id = random.randrange(1000000000, 99999999999) - log_file_name = f"{file_prefix}___{random_id}.log" + # random_id = random.randrange(1000000000, 99999999999) + # this is the current time formatted as HHMMSS plus microseconds (not an epoch time) + dt_str = dt.datetime.now().strftime('%H%M%S%f') + log_file_name = f"{file_prefix}___{dt_str}.log" log_file_path = os.path.join(log_folder, log_file_name) self.setup(log_file_path) @@ -232,7 +233,7 @@ def merge_log_files(self, parent_log_output_file, file_prefix, remove_old_files= folder_path = os.path.dirname(parent_log_output_file) os.makedirs(folder_path, exist_ok=True) - log_file_list_paths = list(Path(folder_path).rglob(f"{file_prefix}*")) + log_file_list_paths = list(Path(folder_path).glob(f"*{file_prefix}*")) log_file_list = [str(x) for x in log_file_list_paths] if len(log_file_list) > 0: @@ -240,55 +241,60 @@ def merge_log_files(self, parent_log_output_file, file_prefix, remove_old_files= # we are merging them in order (reg files, then warnings, then errors) - # open and write to the parent log - # This will write all logs including errors and warning - with open(parent_log_output_file, 'a') as main_log: - # Iterate through list - for temp_log_file in log_file_list: - # Open each file in read mode - with open(temp_log_file) as infile: - main_log.write(infile.read()) - if "warning" not in temp_log_file and "error" not in temp_log_file: - if remove_old_files: - os.remove(temp_log_file) - - # now the warning files if there are any - log_warning_file_list = list(Path(folder_path).rglob(f"{file_prefix}*_warnings*")) - if len(log_warning_file_list) > 0: - log_warning_file_list.sort() - parent_warning_file = parent_log_output_file.replace(".log", "_warnings.log") - with open(parent_warning_file, 'a') as warning_log: + # It is ok if this fails with a file-not-found error. Sometimes during multi-proc + # merging, we get some anomalies. Rare, but it happens. 
+ try: + # open and write to the parent log + # This will write all logs including errors and warning + with open(parent_log_output_file, 'a') as main_log: # Iterate through list - for temp_log_file in log_warning_file_list: + for temp_log_file in log_file_list: # Open each file in read mode with open(temp_log_file) as infile: - warning_log.write(infile.read()) - - if remove_old_files: - os.remove(temp_log_file) - - # now the warning files if there are any - log_error_file_list = list(Path(folder_path).rglob(f"{file_prefix}*_errors*")) - if len(log_error_file_list) > 0: - log_error_file_list.sort() - parent_error_file = parent_log_output_file.replace(".log", "_errors.log") - # doesn't yet exist, then create a blank one - with open(parent_error_file, 'a') as error_log: - # Iterate through list - for temp_log_file in log_error_file_list: - # Open each file in read mode - with open(temp_log_file) as infile: - error_log.write(infile.read()) - - if remove_old_files: - os.remove(temp_log_file) + main_log.write(infile.read()) + if "warning" not in temp_log_file and "error" not in temp_log_file: + if remove_old_files: + os.remove(temp_log_file) + + # now the warning files if there are any + log_warning_file_list = list(Path(folder_path).rglob(f"{file_prefix}*_warnings*")) + if len(log_warning_file_list) > 0: + log_warning_file_list.sort() + parent_warning_file = parent_log_output_file.replace(".log", "_warnings.log") + with open(parent_warning_file, 'a') as warning_log: + # Iterate through list + for temp_log_file in log_warning_file_list: + # Open each file in read mode + with open(temp_log_file) as infile: + warning_log.write(infile.read()) + + if remove_old_files: + os.remove(temp_log_file) + + # now the warning files if there are any + log_error_file_list = list(Path(folder_path).rglob(f"{file_prefix}*_errors*")) + if len(log_error_file_list) > 0: + log_error_file_list.sort() + parent_error_file = parent_log_output_file.replace(".log", "_errors.log") + # doesn't yet exist, then create a blank one + with open(parent_error_file, 'a') as error_log: + # Iterate through list + for temp_log_file in log_error_file_list: + # Open each file in read mode + with open(temp_log_file) as infile: + error_log.write(infile.read()) + + if remove_old_files: + os.remove(temp_log_file) + except FileNotFoundError as ex: + print(f"Merge file not found. Details: {ex}. 
Program continuing") return # ------------------------------------------------- def trace(self, msg): # goes to file only, not console - level = "TRACE " # keeps spacing the same + level = "TRACE".ljust(9) # keeps spacing the same (9 chars wide) if self.LOG_FILE_PATH == "": print(self.LOG_SYS_NOT_SETUP_MSG) return @@ -301,7 +307,7 @@ def trace(self, msg): # ------------------------------------------------- def lprint(self, msg): # goes to console and log file - level = "LPRINT " # keeps spacing the same + level = "LPRINT".ljust(9) # keeps spacing the same (9 chars wide) print(f"{msg} ") if self.LOG_FILE_PATH == "": @@ -316,7 +322,7 @@ def lprint(self, msg): # ------------------------------------------------- def notice(self, msg): # goes to console and log file - level = "NOTICE " # keeps spacing the same + level = "NOTICE".ljust(9) # keeps spacing the same (9 chars wide) # print(f"{cl.fore.TURQUOISE_2}{msg}{cl.style.RESET}") print(f"{level}: {msg}") @@ -332,7 +338,7 @@ def notice(self, msg): # ------------------------------------------------- def success(self, msg): # goes to console and log file - level = "SUCCESS " # keeps spacing the same + level = "SUCCESS".ljust(9) # keeps spacing the same (9 chars wide) # c_msg_type = f"{cl.fore.SPRING_GREEN_2B}<{level}>{cl.style.RESET}" # print(f"{self.__get_clog_dt()} {c_msg_type} : {msg}") @@ -350,7 +356,7 @@ def success(self, msg): # ------------------------------------------------- def warning(self, msg): # goes to console and log file and warning log file - level = "WARNING " # keeps spacing the same + level = "WARNING".ljust(9) # keeps spacing the same (9 chars wide) # c_msg_type = f"{cl.fore.LIGHT_YELLOW}<{level}>{cl.style.RESET}" # print(f"{self.__get_clog_dt()} {c_msg_type} : {msg}") @@ -372,7 +378,7 @@ def warning(self, msg): # ------------------------------------------------- def error(self, msg): # goes to console and log file and error log file - level = "ERROR " # keeps spacing the same + level = "ERROR".ljust(9) # keeps spacing the same (9 chars wide) # c_msg_type = f"{cl.fore.RED_1}<{level}>{cl.style.RESET}" # print(f"{self.__get_clog_dt()} {c_msg_type} : {msg}") @@ -393,7 +399,7 @@ def error(self, msg): # ------------------------------------------------- def critical(self, msg): - level = "CRITICAL".ljust(9) + level = "CRITICAL".ljust(9) # keeps spacing the same (9 chars wide) # c_msg_type = f"{cl.style.BOLD}{cl.fore.RED_3A}{cl.back.WHITE}{self.__get_dt()}" # c_msg_type += f" <{level}>" diff --git a/tools/.env.template b/tools/.env.template deleted file mode 100644 index 048c2283c..000000000 --- a/tools/.env.template +++ /dev/null @@ -1,6 +0,0 @@ -API_BASE_URL= -EVALUATED_SITES_CSV= -WBD_LAYER= -NWM_FLOWS_MS= -USGS_METADATA_URL= -USGS_DOWNLOAD_URL= diff --git a/tools/.gitignore b/tools/.gitignore deleted file mode 100644 index 4c49bd78f..000000000 --- a/tools/.gitignore +++ /dev/null @@ -1 +0,0 @@ -.env diff --git a/tools/catfim/README.md b/tools/catfim/README.md new file mode 100644 index 000000000..d9ec70e85 --- /dev/null +++ b/tools/catfim/README.md @@ -0,0 +1,76 @@ +## About CatFIM +CatFIM is short for Categorical Flood Inundation Mapping. CatFIM is a tool that is run on HAND FIM outputs to inundate a bunch of AHPS sites to specific stages or flow levels. This is useful for quality controlling HAND FIM as well as for preparing for hypothetical flood events. CatFIM models floods at up to 5 different magnitudes: action, minor, moderate, major, and record. The flows or stages for these magnitudes is pulled from the WRDS API. 
+
+### Stage-based vs Flow-based
+There are two modes for running CatFIM: stage-based and flow-based. Stage-based CatFIM inundates the HAND FIM relative elevation model (REM) based on different magnitudes of stage, which is the height of the water surface. Flow-based CatFIM inundates the REM based on different magnitudes of flow, which is measured as the volume of water flowing past a certain point of the river over time. A rating curve is used to get the water height from the flow volume at a given location.
+
+### How are CatFIM sites selected?
+
+Our goal is to produce good CatFIM for as many AHPS sites as possible. In order to meet this goal, a site must meet a variety of acceptance criteria and data availability checks for us to produce CatFIM at the site.
+
+If you have a question about why a specific point is being excluded, you can check the "Status" attribute of the CatFIM point to see what issue might be occurring. If there is a question or concern about a specific point, feel free to reach out to GID via Slack or VLAB.
+
+
+CatFIM sites must:
+- have site-specific stage or flow thresholds available
+- be an NWM forecast point (for sites in CONUS*)
+- not be on the stage-based AHPS restricted sites list (for stage-based CatFIM)
+- have an accurate vertical datum (for stage-based CatFIM)
+- meet the USGS Gages Acceptance Criteria (detailed below)
+
+USGS Gages Acceptance Criteria:
+- [Lat/Long Coordinate Method](https://help.waterdata.usgs.gov/code/coord_meth_cd_query?fmt=html) must be one of the following: "C", "D", "W", "X", "Y", "Z", "N", "M", "L", "G", "R", "F", "S"
+- [Acceptable Altitude Accuracy Threshold](https://help.waterdata.usgs.gov/codes-and-parameters/codes#SI) must be 1 or lower
+- [Altitude Method Type](https://help.waterdata.usgs.gov/code/alt_meth_cd_query?fmt=html) must be one of the following: "A", "D", "F", "I", "J", "L", "N", "R", "W", "X", "Y", "Z"
+- [Site Type](https://help.waterdata.usgs.gov/code/site_tp_query?fmt=html) must be: "ST" (stream)
+
+*Note: Previous versions of CatFIM also restricted sites based on [Lat/Long Coordinate Accuracy](https://help.waterdata.usgs.gov/code/coord_acy_cd_query?fmt=html), but that criterion was removed in Fall 2024 to increase the availability of CatFIM sites.*
+
+*For sites outside of CONUS: As of 12/4/2024, these criteria are currently being workshopped to account for the unique challenges of producing CatFIM in non-CONUS locations. Check back for updates or reach out to GID via Slack if you have specific questions!*
+
+
+## Running CatFIM
+### Who can run CatFIM?
+CatFIM can only be run by systems that can access the WRDS API, which is restricted to computers on the NOAA network. If you are outside the NOAA network and would like to run code from inundation-mapping, see the README in NOAA-OWP/inundation-mapping.
+
+
+### Commands
+Stage-based example with step system and pre-downloaded metadata:
+
+`python /foss_fim/tools/generate_categorical_fim.py -f /outputs/Rob_catfim_test_1 -jh 1 -jn 10 -ji 8 -e /data/config/catfim.env -t /data/docker_test_1 -me '/data/nwm_metafile.pkl' -sb -step 2`
+
+Flow-based example with HUC list:
+
+`python /foss_fim/tools/catfim/generate_categorical_fim.py -f /data/previous_fim/fim_4_5_2_11/ -jh 4 -jn 2 -e /data/config/catfim.env -t /data/hand_4_5_11_1_catfim_datavis -o -lh '06010105 17110004 10300101 19020401 19020302'`
+
+
+### Arguments
+- `-f`, `--fim_run_dir`: Path to directory containing HAND outputs, e.g. /data/previous_fim/fim_4_5_2_11
+- `-e`, `--env_file`: Docker mount path to the catfim environment file. ie) /data/config/catfim.env
+- `-jh`, `--job_number_huc`: OPTIONAL: Number of processes to use for HUC scale operations. HUC and inundation job numbers should multiply to no more than one less than the CPU count of the machine (see the quick check after this list). CatFIM sites generally only have 2-3 branches overlapping a site, so this number can be kept low (2-4). Defaults to 1.
+- `-jn`, `--job_number_inundate`: OPTIONAL: Number of processes to use for inundating. HUC and inundation job numbers should multiply to no more than one less than the CPU count of the machine. Defaults to 1.
+- `-ji`, `--job_number_intervals`: OPTIONAL: Number of processes to use for inundating multiple intervals in stage-based CatFIM. Inundation and interval job numbers should multiply to no more than one less than the CPU count of the machine. Defaults to 1.
+- `-sb`, `--is_stage_based`: Run stage-based CatFIM instead of flow-based. Add the -sb param to run stage-based; leave it off for flow-based.
+- `-t`, `--output_folder`: OPTIONAL: Target location where the output folder will be created. Defaults to /data/catfim/
+- `-s`, `--search`: OPTIONAL: Upstream and downstream search distance in miles. Defaults to 5.
+- `-lh`, `--lst_hucs`: OPTIONAL: Space-delimited list of HUCs to produce CatFIM for. Defaults to all HUCs.
+- `-mc`, `--past_major_interval_cap`: OPTIONAL: Stage-based only. How many feet past major to go for the interval FIMs. Defaults to 5.
+- `-step`: OPTIONAL: By adding a number here, you may be able to skip levels of processing. The number you submit means it will start at that step, e.g. a step of 2 means start at step 2, which for flow-based is the creation of tifs and gpkgs. Note: This assumes the previous steps have already been processed and the files are present. Defaults to 0, which means all steps are processed.
+- `-me`, `--nwm_metafile`: OPTIONAL: If you have a pre-existing nwm metadata pickle file, you can provide the path to it here. NOTE: This parameter is for quick debugging only and should not be used in a production mode.
+- `-o`, `--overwrite`: OPTIONAL: Overwrite files.
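As a quick sanity check on the job-number guidance above, the short snippet below verifies that the requested process counts fit the machine. It is only a sketch; the flag values shown are hypothetical examples.

```python
# Verify that -jh * -jn (* -ji for stage-based intervals) stays at or below
# (CPU count - 1), per the argument descriptions above.
import os

job_number_huc = 2        # -jh (example)
job_number_inundate = 4   # -jn (example)
job_number_intervals = 1  # -ji (example, stage-based only)

requested = job_number_huc * job_number_inundate * job_number_intervals
available = os.cpu_count() - 1
print(f"Requested {requested} of {available} allowed processes:",
      "OK" if requested <= available else "too many")
```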
+
+## Visualization Tips & Tricks
+
+### Changing Symbol Drawing Order
+
+- **ArcGIS Pro:** Go to the Symbology pane and navigate to the Symbol layer drawing tab. Make sure the "Enable symbol layer drawing" option is switched "ON", and then adjust the Drawing Order of the magnitudes so they are in this order: action, minor, moderate, major, record.
+
+![screenshot of Symbology pane in ArcGIS Pro](https://github.com/NOAA-OWP/inundation-mapping/blob/b527e762478fef2c1ffc5f0ff4d494f1746663bb/tools/catfim/images/screenshot_vis_settings.JPG)
+
+- **QGIS**: To change the symbol drawing order in QGIS...
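Outside of ArcGIS Pro or QGIS, the same drawing-order idea can be previewed with a quick geopandas plot. This is only a sketch: the geopackage path and the `magnitude` column name are assumptions and may differ in your CatFIM output.

```python
# Sketch: draw the larger magnitudes first so the smaller ones stay visible on top
# (mirrors the action -> record ordering described above).
import geopandas as gpd
import matplotlib.pyplot as plt

gdf = gpd.read_file("/data/catfim/<run_name>/mapping/catfim_library.gpkg")  # hypothetical path

colors = {'record': 'navy', 'major': 'purple', 'moderate': 'red', 'minor': 'orange', 'action': 'yellow'}
fig, ax = plt.subplots(figsize=(8, 8))
for magnitude in ['record', 'major', 'moderate', 'minor', 'action']:  # bottom to top
    subset = gdf[gdf['magnitude'] == magnitude]
    if not subset.empty:
        subset.plot(ax=ax, color=colors[magnitude])
plt.show()
```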
+
+## Glossary
+- AHPS: Advanced Hydrologic Prediction Services (NOAA)
+- WRDS: Water Resources Data System (NOAA)
+- REM: relative elevation model
+- HAND: height above nearest drainage
diff --git a/tools/catfim/__init__.py b/tools/catfim/__init__.py new file mode 100755 index 000000000..e69de29bb diff --git a/config/catfim_template.env b/tools/catfim/catfim.env.template similarity index 79% rename from config/catfim_template.env rename to tools/catfim/catfim.env.template index f6e2cdd04..d50b7b1b0 100644 --- a/config/catfim_template.env +++ b/tools/catfim/catfim.env.template @@ -1,5 +1,4 @@ API_BASE_URL= Enter WRDS API path here -EVALUATED_SITES_CSV=/data/inputs/ahp_sites/evaluated_ahps_sites.csv WBD_LAYER=/data/inputs/wbd/WBD_National.gpkg NWM_FLOWS_MS=/data/inputs/nwm_hydrofabric/nwm_flows_ms_wrds.gpkg USGS_METADATA_URL=https://fimnew.wim.usgs.gov diff --git a/tools/catfim/catfim_sites_compare.py b/tools/catfim/catfim_sites_compare.py new file mode 100755 index 000000000..c636613c6 --- /dev/null +++ b/tools/catfim/catfim_sites_compare.py @@ -0,0 +1,219 @@ +#!/usr/bin/env python3 + +import argparse +import os +import sys +import traceback +from datetime import datetime, timezone + +import pandas as pd + +import utils.fim_logger as fl + + +# global RLOG +FLOG = fl.FIM_logger() # the non mp version + + +### NOTE: for now this is a bit ugly and you may need to comment out or adjust column names +# based on what you are loading. + +""" +This tool can compare two versions of the "sites" CSVs (not the poly library files at this time). +It is intended to compare a previous "sites" csv to a new one to see what changes have appeared +with sites. + +Rules: +- see if one of the two files has extra sites +- for now, it assumes both of those sets of columns exist in both files +- compares the following columns at this time (note: not geometry at this time): + - ahps_id + - nws_data_name + - HUC8 + - mapped + - status + +- It will display both sets of columns, plus a TRUE/FALSE column showing whether + each compared column matches. +- You can optionally add a flag to show differences only. +- It will auto overwrite output files already existing. + +""" + + +def compare_sites(prev_file, new_file, output_file): + + # Validation + if not os.path.exists(prev_file): + print(f"The -p (prev file) does not exist. {prev_file} ") + sys.exit(1) + + if not os.path.exists(new_file): + print(f"The -n (new file) does not exist. 
{new_file} ") + sys.exit(1) + + output_folder = os.path.dirname(output_file) + if not os.path.exists(output_folder): + print("While the output file does not need to exist, the output folder does.") + sys.exit(1) + + # Setup variables + overall_start_time = datetime.now(timezone.utc) + dt_string = overall_start_time.strftime("%m/%d/%Y %H:%M:%S") + + # setup logging system + log_file_name = f"compare_log_file_{overall_start_time.strftime('%Y_%m_%d__%H_%M_%S')}" + log_path = os.path.join(output_folder, log_file_name) + FLOG.setup(log_path) + + FLOG.lprint("================================") + FLOG.lprint(f"Start sites compare - (UTC): {dt_string}") + FLOG.lprint("") + # FLOG.lprint("Input arguments:") + # FLOG.lprint(locals()) + # FLOG.lprint("") + + # Load both files + print("loading previous file") + + # Column names to compare (for 4.4.0.0 and 4.5.11.1, the columns were named ahps_lid for stage) + # flow is / was always just ahps_lid + # col_names = ["nws_lid", "nws_data_name", "HUC8", "mapped", "status"] + + # Column names to compare (for 4.5.11.1, the columns were named ahps_lid for stage) + col_names = ["ahps_lid", "nws_data_name", "HUC8", "mapped", "status"] + + # If not, load the columns, the rename to match + prev_df = pd.read_csv(prev_file, usecols=col_names) + if "nws_lid" in prev_df.columns: + prev_df.rename(columns={'nws_lid': 'ahps_lid'}, inplace=True) + # change huc8 to string and order + prev_df.HUC8.astype(str) + prev_df.HUC8 = prev_df.HUC8.astype('str').str.zfill(8) + prev_df = prev_df.sort_values(by=['ahps_lid']) + prev_df = prev_df.add_prefix("prev_data_") # rename all columns to add the suffix of "prev_" + # prev_df.to_csv(output_file) + + print("loading new file") + # This has a bug. in 4.5.2.11 stage the column was named 'nws_lid' + # but for future versions, it will be named "ahps_lid" + + # 4.5.2.11 stage + # new_col_names = ["nws_lid", "nws_data_name", "HUC8", "mapped", "status"] + + # All flow ones are ahps_lid and 4.5.11.1 stage is ahps_lid + new_col_names = ["ahps_lid", "nws_data_name", "HUC8", "mapped", "status"] + new_df = pd.read_csv(new_file, usecols=new_col_names) + # to get it in sync with the prev column names + if "nws_lid" in new_df.columns: + new_df.rename(columns={'nws_lid': 'ahps_lid'}, inplace=True) + + new_df.HUC8.astype(str) + new_df.HUC8 = new_df.HUC8.astype('str').str.zfill(8) + new_df = new_df.sort_values(by=['ahps_lid']) + new_df = new_df.add_prefix("new_data_") # rename all columns to add the suffix of "new_" + + # print("Prev") + # print(prev_df) # prev_df.loc[0] + # print() + + # print("new") + # print(new_df) + # print() + + col_names = ["ahps_lid", "nws_data_name", "HUC8", "mapped", "status"] + compare_data(prev_df, new_df, output_file, col_names) + + # Wrap up + overall_end_time = datetime.now(timezone.utc) + FLOG.lprint("================================") + dt_string = overall_end_time.strftime("%m/%d/%Y %H:%M:%S") + FLOG.lprint(f"End sites compare - (UTC): {dt_string}") + + # calculate duration + time_duration = overall_end_time - overall_start_time + FLOG.lprint(f"Duration: {str(time_duration).split('.')[0]}") + + return # helps show the end of the def + + +def compare_data(prev_df, new_df, output_file, col_names): + + # find the matching recs based on aphs_id, and gets all of the prev_dfs recs + results_df = prev_df.merge( + new_df, left_on='prev_data_ahps_lid', right_on='new_data_ahps_lid', how='outer' + ) + + # strip out line breaks from the two status columns + results_df["prev_data_status"] = 
results_df["prev_data_status"].replace("\n", "") + results_df["new_data_status"] = results_df["new_data_status"].replace("\n", "") + + for col in col_names: + # add compare columns and sets defaults + results_df[f"does_{col}_match"] = results_df[f"prev_data_{col}"] == results_df[f"new_data_{col}"] + + # for 4.4.0.0, almost all stage status columns have the phrase "usgs_elev_table missing, " + # remove that from the front of prev_if applicable (the space on the end is critical) + + results_df["prev_data_status"] = results_df["prev_data_status"].str.replace( + "usgs_elev_table missing, ", "" + ) + # results_df["is_match_status_adj_status"] = results_df["prev_data_adj_status"] == results_df["new_status"] + + # 4.5.2.11. status was the word "OK" and 4.4.0.0, now in 4.5.11.1 it is the word "Good" + # results_df["does_status_match"] = results_df[f"does_{col}_match"] == False & results_df["prev_data_status"] == "OK" & results_df["new_data_status"] == "Good" + + # for col in col_names: + # results_df[f"has_{col}"] = False + + results_df.to_csv(output_file) + + return # helps show the end of the def + + +if __name__ == "__main__": + + """ + This tool can compare two version of the catfim sites csv's (not the poly libray files at this time). + It is intended to compare a previous sites csv to a new one to see what changes have appeared + with sites. + More details at the top of this file + + It will auto overwrite output files already existing. + """ + + """ + Sample + python /foss_fim/tools/catfim_sites_compare.py + -p /data/catfim/fim_4_4_0_0_stage_based/mapping/stage_based_catfim_sites.csv + -n /data/catfim/fim_4_5_2_11_stage_based/mapping/stage_based_catfim_sites.csv + -o /data/catfim/fim_4_5_2_11_stage_based/4_4_0_0__4_5_2_11_sites_comparison.csv + """ + + # Parse arguments + parser = argparse.ArgumentParser(description="Run CatFIM sites comparison") + + parser.add_argument( + "-p", + "--prev-file", + help="Path to the prev (or any one) of the CatFIM ahps sites csv's", + required=True, + ) + + parser.add_argument( + "-n", "--new-file", help="Path to the new (or any one) of the 'sites' csv", required=True + ) + + parser.add_argument( + "-o", "--output-file", help="Path to where the results file will be saved", required=True + ) + + args = vars(parser.parse_args()) + + try: + + # call main program + compare_sites(**args) + + except Exception: + FLOG.critical(traceback.format_exc()) diff --git a/tools/catfim/generate_categorical_fim.py b/tools/catfim/generate_categorical_fim.py new file mode 100755 index 000000000..239044f7e --- /dev/null +++ b/tools/catfim/generate_categorical_fim.py @@ -0,0 +1,1867 @@ +#!/usr/bin/env python3 + +import argparse +import glob +import math +import os +import shutil +import sys +import time +import traceback +from concurrent.futures import ProcessPoolExecutor, as_completed, wait +from datetime import datetime, timezone + +import geopandas as gpd +import numpy as np +import pandas as pd +from dotenv import load_dotenv +from generate_categorical_fim_flows import generate_flows +from generate_categorical_fim_mapping import ( + manage_catfim_mapping, + post_process_cat_fim_for_viz, + produce_stage_based_lid_tifs, +) +from tools_shared_functions import ( + filter_nwm_segments_by_stream_order, + get_datum, + get_nwm_segs, + get_thresholds, + ngvd_to_navd_ft, +) +from tools_shared_variables import ( + acceptable_alt_acc_thresh, + acceptable_alt_meth_code_list, + acceptable_coord_acc_code_list, + acceptable_coord_method_code_list, + acceptable_site_type_list, +) + +import 
utils.fim_logger as fl +from utils.shared_variables import VIZ_PROJECTION + + +# from itertools import repeat +# from pathlib import Path + + +# global RLOG +FLOG = fl.FIM_logger() # the non mp version +MP_LOG = fl.FIM_logger() # the Multi Proc version + +gpd.options.io_engine = "pyogrio" + + +# TODO: Aug 2024: This script was upgraded significantly with lots of misc TODO's embedded. +# Lots of inline documenation needs updating as well + + +""" +Jun 17, 2024 +This system is continuing to mature over time. It has a number of optimizations that can still +be applied in areas such as logic, performance and error handling. + +In the interium there is still a consider amount of debug lines and tools embedded in that can +be commented on/off as required. + + +NOTE: For now.. all logs roll up to the parent log file. ie) catfim_2024_07_09-22-20-12.log +This creates a VERY large final log file, but the warnings and errors file should be manageable. +Later: Let's split this to seperate log files per huc. Easy to do that for Stage Based it has +"iterate_through_stage_based" function. Flow based? we have to think that one out a bit + +""" + + +def process_generate_categorical_fim( + fim_run_dir, + env_file, + job_number_huc, + job_number_inundate, + is_stage_based, + output_folder, + overwrite, + search, + # lid_to_run, + lst_hucs, + job_number_intervals, + past_major_interval_cap, + step_num, + nwm_metafile, +): + + # ================================ + # Step System + # This system allows us to to skip steps. + # Steps that are skipped are assumed to have the valid files that are needed + # When a number is submitted, ie) 2, it means skip steps 1 and start at 2 + ''' + Step number usage: + 0 = cover all (it is changed to 999 so all steps are covered) + flow: + 1 = start at generate_flows + 2 = start at manage_catfim_mapping + 3 = start at update mapping status + stage: + 1 = start at generate_flows and tifs + 2 = start at creation of gpkgs + 3 = start at update mapping status + ''' + + # ================================ + # Validation and setup + + # Append option configuration (flow_based or stage_based) to output folder name. + if is_stage_based: + catfim_method = "stage_based" + else: + catfim_method = "flow_based" + + # Define output directories + if output_folder.endswith("/"): + output_folder = output_folder[:-1] + output_catfim_dir = output_folder + "_" + catfim_method + + local_vals = locals() + + output_flows_dir = os.path.join(output_catfim_dir, 'flows') + output_mapping_dir = os.path.join(output_catfim_dir, 'mapping') + attributes_dir = os.path.join(output_catfim_dir, 'attributes') + + # ================================ + set_start_files_folders( + step_num, output_catfim_dir, output_mapping_dir, output_flows_dir, attributes_dir, overwrite + ) + + FLOG.trace("locals...") + FLOG.trace(local_vals) + + # ================================ + if nwm_metafile != "": + if os.path.exists(nwm_metafile) == False: + raise Exception("The nwm_metadata (-me) file can not be found. Please remove or fix pathing.") + file_ext = os.path.splitext(nwm_metafile) + if file_ext.count == 0: + raise Exception("The nwm_metadata (-me) file appears to be invalid. It is missing an extension.") + if file_ext[1].lower() != ".pkl": + raise Exception("The nwm_metadata (-me) file appears to be invalid. The extention is not pkl.") + + # ================================ + # Define default arguments. 
Modify these if necessary + fim_version = os.path.split(fim_run_dir)[1] + + # ================================ + # TODO: Aug 2024: Job values are not well used. There are some times where not + # all three job values are not being used. This needs to be cleaned up. + # Check job numbers and raise error if necessary + # Considering how we are using each CPU very well at all, we could experiment + # with either overclocking or chagnign to threading. Of course, if we change + # to threading we ahve to be super careful about file and thread collisions (locking) + + # commented out for now for some small overclocking tests (carefully of course) + # total_cpus_requested = job_number_huc * job_number_inundate * job_number_intervals + # total_cpus_available = os.cpu_count() - 2 + # if total_cpus_requested > total_cpus_available: + # raise ValueError( + # f"The HUC job number (jh) [{job_number_huc}]" + # f" multiplied by the inundate job number (jn) [{job_number_inundate}]" + # f" multiplied by the job number intervals (ji) [{job_number_intervals}]" + # " exceeds your machine\'s available CPU count minus one." + # " Please lower one or more of those values accordingly." + # ) + + # we are getting too many folders and files. We want just huc folders. + # output_flow_dir_list = os.listdir(fim_run_dir) + + # ================================ + # Get HUCs from FIM run directory + valid_ahps_hucs = [ + x + for x in os.listdir(fim_run_dir) + if os.path.isdir(os.path.join(fim_run_dir, x)) and x[0] in ['0', '1', '2'] + ] + + # If a HUC list is specified, only keep the specified HUCs + lst_hucs = lst_hucs.split() + if 'all' not in lst_hucs: + valid_ahps_hucs = [x for x in valid_ahps_hucs if x in lst_hucs] + dropped_huc_lst = list((set(lst_hucs).difference(valid_ahps_hucs))) + + valid_ahps_hucs.sort() + + num_hucs = len(valid_ahps_hucs) + if num_hucs == 0: + raise ValueError( + f'The number of valid hucs compared to the output directory of {fim_run_dir} is zero.' + ' Verify that you have the correct input folder and if you used the -lh flag that it' + ' is a valid matching HUC.' + ) + # End of Validation and setup + # ================================ + + overall_start_time = datetime.now(timezone.utc) + dt_string = overall_start_time.strftime("%m/%d/%Y %H:%M:%S") + + FLOG.lprint("================================") + FLOG.lprint(f"Start generate categorical fim for {catfim_method} - (UTC): {dt_string}") + FLOG.lprint("") + + FLOG.lprint(f"Processing {num_hucs} huc(s)") + + # If HUCs are given as an input + if 'all' not in lst_hucs: + print(f'HUCs to use (from input list): {valid_ahps_hucs}') + + if len(dropped_huc_lst) > 0: + FLOG.warning('Listed HUCs not available in FIM run directory:') + FLOG.warning(dropped_huc_lst) + + load_dotenv(env_file) + API_BASE_URL = os.getenv('API_BASE_URL') + if API_BASE_URL is None: + raise ValueError( + 'API base url not found. ' + 'Ensure inundation_mapping/tools/ has an .env file with the following info: ' + 'API_BASE_URL, WBD_LAYER, NWM_FLOWS_MS, ' + 'USGS_METADATA_URL, USGS_DOWNLOAD_URL' + ) + + # TODO: lid_to_run functionality... remove? for now, just hard code lid_to_run as "all" + # single lid, not multiple + lid_to_run = "all" + + # Check that fim_inputs.csv exists and raise error if necessary + fim_inputs_csv_path = os.path.join(fim_run_dir, 'fim_inputs.csv') + if not os.path.exists(fim_inputs_csv_path): + raise ValueError(f"{fim_inputs_csv_path} not found. 
Verify that you have the correct input files.") + + # print() + + # FLOG.lprint("Filtering out HUCs that do not have related ahps site in them.") + # valid_ahps_hucs = __filter_hucs_to_ahps(lst_hucs) + + # num_valid_hucs = len(valid_ahps_hucs) + # if num_valid_hucs == 0: + # raise Exception("None of the HUCs supplied have ahps sites in them. Check your fim output folder") + # else: + # FLOG.lprint(f"Processing {num_valid_hucs} huc(s) with AHPS sites") + + # Define upstream and downstream search in miles + nwm_us_search, nwm_ds_search = search, search + catfim_sites_file_path = "" + + # STAGE-BASED + if is_stage_based: + # Generate Stage-Based CatFIM mapping + # does flows and inundation (mapping) + + catfim_sites_file_path = os.path.join(output_mapping_dir, 'stage_based_catfim_sites.gpkg') + + if step_num <= 1: + + df_restricted_sites = load_restricted_sites() + + generate_stage_based_categorical_fim( + output_catfim_dir, + fim_run_dir, + nwm_us_search, + nwm_ds_search, + lid_to_run, + env_file, + job_number_inundate, + job_number_huc, + valid_ahps_hucs, + job_number_intervals, + past_major_interval_cap, + nwm_metafile, + df_restricted_sites, + ) + else: + FLOG.lprint("generate_stage_based_categorical_fim step skipped") + + FLOG.lprint("") + if step_num <= 2: + # creates the gpkgs (tif's created above) + # TODO: Aug 2024, so we need to clean it up + # This step does not need a job_number_inundate as it can't really use use it. + # It processes primarily hucs and ahps in multiproc + # for now, we will manuall multiple the huc * 5 (max number of ahps types) + + ahps_jobs = job_number_huc * 5 + post_process_cat_fim_for_viz( + catfim_method, output_catfim_dir, ahps_jobs, fim_version, FLOG.LOG_FILE_PATH + ) + else: + FLOG.lprint("post_process_cat_fim_for_viz step skipped") + + # FLOW-BASED + else: + FLOG.lprint("") + FLOG.lprint('Start creating flow files using the ' + catfim_method + ' technique...') + FLOG.lprint("") + start = time.time() + + catfim_sites_file_path = os.path.join(output_mapping_dir, 'flow_based_catfim_sites.gpkg') + # generate flows is only using one of the incoming job number params + # so let's multiply -jh (huc) and -jn (inundate) + job_flows = job_number_huc * job_number_inundate + + if step_num <= 1: + generate_flows( + output_catfim_dir, + nwm_us_search, + nwm_ds_search, + lid_to_run, + env_file, + job_flows, + is_stage_based, + valid_ahps_hucs, + nwm_metafile, + FLOG.LOG_FILE_PATH, + ) + end = time.time() + elapsed_time = (end - start) / 60 + FLOG.lprint(f"Finished creating flow files in {str(elapsed_time).split('.')[0]} minutes \n") + else: + FLOG.lprint("Generate Flow step skipped") + + FLOG.lprint("") + if step_num <= 2: + # Generate CatFIM mapping (not used by stage) + manage_catfim_mapping( + fim_run_dir, + output_flows_dir, + output_catfim_dir, + catfim_method, + job_number_huc, + job_number_inundate, + FLOG.LOG_FILE_PATH, + ) + else: + FLOG.lprint("manage_catfim_mapping step skipped") + # end if else + + FLOG.lprint("") + if ( + step_num <= 3 + ): # can later be changed to is_flow_based and step_num > 3, so stage can have it's own numbers + # Updating mapping status + FLOG.lprint('Updating mapping status...') + update_flow_mapping_status(output_mapping_dir, catfim_sites_file_path) + FLOG.lprint('Updating mapping status complete') + else: + FLOG.lprint("Updating mapping status step skipped") + + FLOG.lprint("================================") + FLOG.lprint("End generate categorical fim") + + overall_end_time = datetime.now(timezone.utc) + dt_string = 
overall_end_time.strftime("%m/%d/%Y %H:%M:%S") + FLOG.lprint(f"Ended (UTC): {dt_string}") + + # calculate duration + time_duration = overall_end_time - overall_start_time + FLOG.lprint(f"Duration: {str(time_duration).split('.')[0]}") + return + + +def get_list_ahps_with_library_gpkgs(output_mapping_dir): + + # as it is a set it will dig out unique files + ahps_ids_with_gpkgs = [] + # gpkg_file_names = + file_pattern = os.path.join(output_mapping_dir, "gpkg") + '/*.gpkg' + # print(file_pattern) + for file_path in glob.glob(file_pattern): + file_name = os.path.basename(file_path) + file_name_segs = file_name.split("_") + if len(file_name_segs) <= 1: + continue + ahps_id = file_name_segs[1] + if len(ahps_id) == 5: # yes, we assume the ahps in the second arg + if ahps_id not in ahps_ids_with_gpkgs: + ahps_ids_with_gpkgs.append(ahps_id) + + return ahps_ids_with_gpkgs + + +def update_flow_mapping_status(output_mapping_dir, catfim_sites_file_path): + ''' + Overview: + - Gets a list of valid ahps that have at least one gkpg file. If we have at least one, then the site mapped something + - We use that list compared to the original sites gpkg (or csv) file name to update the rows for the mapped column + By this point, most shoudl have had status messages until something failed in inundation or creating the gpkg. + - We also use a convention where if a status messsage starts with ---, then remove the ---. It is reserved for showing + that some magnitudes exists and some failed. + ''' + + # Import geopackage output from flows creation + if not os.path.exists(catfim_sites_file_path): + FLOG.critical( + f"Primary library gpkg of {catfim_sites_file_path} does not exist." + " Check logs for possible errors. Program aborted." + ) + sys.exit(1) + + sites_gdf = gpd.read_file(catfim_sites_file_path, engine='fiona') + + if len(sites_gdf) == 0: + FLOG.critical(f"flows_gdf is empty. Path is {catfim_sites_file_path}. Program aborted.") + sys.exit(1) + + try: + + valid_ahps_ids = get_list_ahps_with_library_gpkgs(output_mapping_dir) + + if len(valid_ahps_ids) == 0: + FLOG.critical(f"No valid ahps gpkg files found in {output_mapping_dir}/gpkg") + sys.exit(1) + + # we could have used lambda but the if/else logic got messy and unstable + for ind, row in sites_gdf.iterrows(): + ahps_id = row['ahps_lid'] + status_val = row['status'] + + if ahps_id not in valid_ahps_ids: + sites_gdf.at[ind, 'mapped'] = 'no' + + if status_val is None or status_val == "" or status_val == "Good": + sites_gdf.at[ind, 'status'] = 'Site resulted with no valid inundated files' + FLOG.warning(f"Mapped status was changed to no for {ahps_id}. No inundation files exist") + continue + # It is safe to assume a status message for invalid ones already exist + + sites_gdf.at[ind, 'mapped'] = 'yes' + # Mapped should be "yes", and "Good", + if status_val == "": + sites_gdf.at[ind, 'status'] = 'Good' + elif status_val.startswith("---") == True: # warning not an error + sites_gdf.at[ind, 'mapped'] = 'yes' + # remove the "---" from the start, as it is a warning, and not an error + status_val = status_val[3:] + sites_gdf.at[ind, 'status'] = status_val + + # sites_gdf.reset_index(inplace=True, drop=True) + + # We are re-saving the sites files + sites_gdf.to_file(catfim_sites_file_path, driver='GPKG', crs=VIZ_PROJECTION, engine="fiona") + + # csv flow file name + nws_lid_csv_file_path = catfim_sites_file_path.replace(".gpkg", ".csv") + + # and we write a csv version at this time as well. 
+ # and this csv is good + sites_gdf.to_csv(nws_lid_csv_file_path) + + except Exception as e: + FLOG.critical(f"{output_mapping_dir} : No LIDs, \n Exception: \n {repr(e)} \n") + FLOG.critical(traceback.format_exc()) + return + + +# This is always called as part of Multi-processing so uses MP_LOG variable and +# creates it's own logging object. +# This does flow files and mapping in the same function by HUC +def iterate_through_huc_stage_based( + output_catfim_dir, + huc, + fim_dir, + huc_dictionary, + threshold_url, + all_lists, + past_major_interval_cap, + job_number_inundate, + job_number_intervals, + nwm_flows_region_df, + df_restricted_sites, + parent_log_output_file, + child_log_file_prefix, + progress_stmt, +): + """_summary_ + This and its children will create stage based tifs and catfim data based on a huc + """ + + try: + # This is setting up logging for this function to go up to the parent + # child_log_file_prefix is likely MP_iter_hucs + MP_LOG.MP_Log_setup(parent_log_output_file, child_log_file_prefix) + MP_LOG.lprint("**********************") + MP_LOG.lprint(f'Processing {huc} ...') + MP_LOG.lprint(f'... {progress_stmt} ...') + MP_LOG.lprint("") + + all_messages = [] + stage_based_att_dict = {} + + mapping_dir = os.path.join(output_catfim_dir, "mapping") + attributes_dir = os.path.join(output_catfim_dir, 'attributes') + huc_messages_dir = os.path.join(mapping_dir, 'huc_messages') + + # Make output directory for the particular huc in the mapping folder + mapping_huc_directory = os.path.join(mapping_dir, huc) + if not os.path.exists(mapping_huc_directory): + os.mkdir(mapping_huc_directory) + + # Define paths to necessary HAND and HAND-related files. + usgs_elev_table = os.path.join(fim_dir, huc, 'usgs_elev_table.csv') + branch_dir = os.path.join(fim_dir, huc, 'branches') + + # Loop through each lid in nws_lids list + huc_nws_lids = huc_dictionary[huc] + + nws_lids = [] + # sometimes we are getting duplicates but no idea how/why + [nws_lids.append(val) for val in huc_nws_lids if val not in nws_lids] + + MP_LOG.lprint(f"Lids to process for {huc} are {nws_lids}") + + skip_lid_process = False + # -- If necessary files exist, continue -- # + # Yes, each lid gets a record no matter what, so we need some of these messages duplicated + # per lid record + if not os.path.exists(usgs_elev_table): + msg = ":Internal Error: Missing key data from HUC record (usgs_elev_table missing)" + # all_messages.append(huc + msg) + MP_LOG.warning(huc + msg) + skip_lid_process = True + + if not os.path.exists(branch_dir): + msg = ":branch directory missing" + # all_messages.append(huc + msg) + MP_LOG.warning(huc + msg) + skip_lid_process = True + + categories = ['action', 'minor', 'moderate', 'major', 'record'] + + if skip_lid_process == False: # else skip to message processing + usgs_elev_df = pd.read_csv(usgs_elev_table) + + df_cols = { + "nws_lid": pd.Series(dtype='str'), + "name": pd.Series(dtype='str'), + "WFO": pd.Series(dtype='str'), + "rfc": pd.Series(dtype='str'), + "huc": pd.Series(dtype='str'), + "state": pd.Series(dtype='str'), + "county": pd.Series(dtype='str'), + "magnitude": pd.Series(dtype='str'), + "q": pd.Series(dtype='str'), + "q_uni": pd.Series(dtype='str'), + "q_src": pd.Series(dtype='str'), + "stage": pd.Series(dtype='float'), + "stage_uni": pd.Series(dtype='str'), + "s_src": pd.Series(dtype='str'), + "wrds_time": pd.Series(dtype='str'), + "nrldb_time": pd.Series(dtype='str'), + "nwis_time": pd.Series(dtype='str'), + "lat": pd.Series(dtype='float'), + "lon": 
pd.Series(dtype='float'), + "dtm_adj_ft": pd.Series(dtype='str'), + "dadj_w_ft": pd.Series(dtype='float'), + "dadj_w_m": pd.Series(dtype='float'), + "lid_alt_ft": pd.Series(dtype='float'), + "lid_alt_m": pd.Series(dtype='float'), + } + + for lid in nws_lids: + + # debugging + # if lid.upper() not in ['ALWP1','ILTP1', 'JRSP1']: + # continue + + # TODO: Oct 2024, yes. this is goofy but temporary + # Some lids will add a status message but are allowed to continue. + # When we want to keep a lid processing but have a message, we add :three dashes + # ":---"" in front of the message which will be stripped off the front. + # However, most other that pick up a status message are likely stopped + # being processed. Later the status message will go through some tests + # analyzing the status message as one factor to decide if the record + # is or should be mapped. + + lid = lid.lower() # Convert lid to lower case + + MP_LOG.lprint("-----------------------------------") + huc_lid_id = f"{huc} : {lid}" + MP_LOG.lprint(f"processing {huc_lid_id}") + + found_restrict_lid = df_restricted_sites.loc[df_restricted_sites['nws_lid'] == lid.upper()] + + # print(found_restrict_lid) + + # Assume only one rec for now, fix later + if len(found_restrict_lid) > 0: + reason = found_restrict_lid.iloc[ + 0, found_restrict_lid.columns.get_loc("restricted_reason") + ] + msg = ':' + reason + all_messages.append(lid + msg) + MP_LOG.warning(huc_lid_id + msg) + continue + + if len(lid) != 5: + msg = ":This lid value is invalid" + all_messages.append(lid + msg) + MP_LOG.warning(huc_lid_id + msg) + continue + + # Get stages and flows for each threshold from the WRDS API. Priority given to USGS calculated flows. + thresholds, flows = get_thresholds( + threshold_url=threshold_url, select_by='nws_lid', selector=lid, threshold='all' + ) + + # MP_LOG.lprint(f"thresholds are {thresholds}") + # MP_LOG.lprint(f"flows are {flows}") + + if thresholds is None or len(thresholds) == 0: + msg = ':Error getting thresholds from WRDS API' + all_messages.append(lid + msg) + MP_LOG.warning(huc_lid_id + msg) + continue + + # Check if stages are supplied, if not write message and exit. + if all(thresholds.get(category, None) is None for category in categories): + msg = ':Missing all threshold stage data' + all_messages.append(lid + msg) + MP_LOG.warning(huc_lid_id + msg) + continue + + # Read stage values and calculate thresholds + # The error and warning message is already formatted correctly if applicable + # Hold the warning_msg to the end + stage_values_df, valid_stage_names, stage_warning_msg, err_msg = __calc_stage_values( + categories, thresholds + ) + + MP_LOG.trace( + f"{huc_lid_id}:" f" stage values (pre-processed) are {stage_values_df.values.tolist()}" + ) + + if err_msg != "": + # The error message is already formatted correctly + all_messages.append(lid + err_msg) + MP_LOG.warning(huc_lid_id + msg) + continue + + # Look for acceptable elevs + acceptable_usgs_elev_df = __create_acceptable_usgs_elev_df(usgs_elev_df, huc_lid_id) + if acceptable_usgs_elev_df is None or len(acceptable_usgs_elev_df) == 0: + msg = ":Unable to find gage data" + all_messages.append(lid + msg) + MP_LOG.warning(huc_lid_id + msg) + continue + + # Get the dem_adj_elevation value from usgs_elev_table.csv. + # Prioritize the value that is not from branch 0. 
+ lid_usgs_elev, dem_eval_messages = __adj_dem_evalation_val( + acceptable_usgs_elev_df, lid, huc_lid_id + ) + all_messages = all_messages + dem_eval_messages + if len(dem_eval_messages) > 0: + continue + + # Initialize nested dict for lid attributes + stage_based_att_dict.update({lid: {}}) + + # Find lid metadata from master list of metadata dictionaries. + metadata = next( + (item for item in all_lists if item['identifiers']['nws_lid'] == lid.upper()), False + ) + lid_altitude = metadata['usgs_data']['altitude'] + if lid_altitude is None or lid_altitude == 0: + msg = ':AHPS site altitude value is invalid' + all_messages.append(lid + msg) + MP_LOG.warning(huc_lid_id + msg) + continue + + # Filter out sites that don't have "good" data + try: + ## Removed this part to relax coordinate accuracy requirements + # if not metadata['usgs_data']['coord_accuracy_code'] in acceptable_coord_acc_code_list: + # MP_LOG.warning( + # f"\t{huc_lid_id}: {metadata['usgs_data']['coord_accuracy_code']} " + # "Not in acceptable coord acc codes" + # ) + # continue + # if not metadata['usgs_data']['coord_method_code'] in acceptable_coord_method_code_list: + # MP_LOG.warning(f"\t{huc_lid_id}: Not in acceptable coord method codes") + # continue + if not metadata['usgs_data']['alt_method_code'] in acceptable_alt_meth_code_list: + MP_LOG.warning(f"{huc_lid_id}: Not in acceptable alt method codes") + continue + if not metadata['usgs_data']['site_type'] in acceptable_site_type_list: + MP_LOG.warning(f"{huc_lid_id}: Not in acceptable site type codes") + continue + if not float(metadata['usgs_data']['alt_accuracy_code']) <= acceptable_alt_acc_thresh: + MP_LOG.warning(f"{huc_lid_id}: Not in acceptable threshold range") + continue + except Exception: + MP_LOG.error(f"{huc_lid_id}: Filtering out 'bad' data in the usgs data") + MP_LOG.error(traceback.format_exc()) + continue + + datum_adj_ft, datum_messages = __adjust_datum_ft(flows, metadata, lid, huc_lid_id) + all_messages = all_messages + datum_messages + if datum_adj_ft is None: + continue + + # Get mainstem segments of LID by intersecting LID segments with known mainstem segments. + unfiltered_segments = list(set(get_nwm_segs(metadata))) + + # Filter segments to be of like stream order. + desired_order = metadata['nwm_feature_data']['stream_order'] + segments = filter_nwm_segments_by_stream_order( + unfiltered_segments, desired_order, nwm_flows_region_df + ) + + # If no segments, write message and exit out + if not segments or len(segments) == 0: + msg = ':missing nwm segments' + all_messages.append(lid + msg) + MP_LOG.warning(huc_lid_id + msg) + continue + + # Check for large discrepancies between the elevation values from WRDS and HAND. + # Otherwise this causes bad mapping. + elevation_diff = lid_usgs_elev - (lid_altitude * 0.3048) + if abs(elevation_diff) > 10: + msg = ':Large discrepancy in elevation estimates from gage and HAND' + all_messages.append(lid + msg) + MP_LOG.warning(huc_lid_id + msg) + continue + + # This function sometimes is called within a MP but sometimes not. + # So, we might have an MP inside an MP + # and we will need a new prefix for it. + + # For each flood category / magnitude + MP_LOG.lprint(f"{huc_lid_id}: About to process flood categories") + # child_log_file_prefix_tifs = MP_LOG.MP_calc_prefix_name( + # MP_LOG.LOG_FILE_PATH, child_log_file_prefix + "_MP_tifs" + # ) + + # Make mapping lid_directory. 
+ mapping_lid_directory = os.path.join(mapping_huc_directory, lid) + if not os.path.exists(mapping_lid_directory): + os.mkdir(mapping_lid_directory) + + # At this point we have at least one valid stage/category + # cyle through on the stages that are valid + # This are not interval values + for idx, stage_row in stage_values_df.iterrows(): + # MP_LOG.lprint(f"{huc_lid_id}: Magnitude is {category}") + # Pull stage value and confirm it's valid, then process + + category = stage_row['stage_name'] + stage_value = stage_row['stage_value'] + + # messages already included in the stage_warning_msg above + if stage_value == -1: + continue + + MP_LOG.trace(f"About to create tifs for {huc_lid_id} : {category} : {stage_value}") + + # datum_adj_ft should not be None at this point + # Call function to execute mapping of the TIFs. + + # Calcluate a portion of the file name which includes the category, + # a formatted stage value and a possible "i" to show it is an interval file + category_key = __calculate_category_key(category, stage_value, False) + + # These are the up to 5 magnitudes being inundated at their stage value + (messages, hand_stage, datum_adj_wse, datum_adj_wse_m) = produce_stage_based_lid_tifs( + stage_value, + False, + datum_adj_ft, + branch_dir, + lid_usgs_elev, + lid_altitude, + fim_dir, + segments, + lid, + huc, + mapping_lid_directory, + category, + category_key, + job_number_inundate, + MP_LOG.LOG_FILE_PATH, + child_log_file_prefix, + ) + + # If we get a message back, then something went wrong with the site adn we need to + # remove it as a valid site + + all_messages += messages + + # Extra metadata for alternative CatFIM technique. + # TODO Revisit because branches complicate things + stage_based_att_dict[lid].update( + { + category: { + 'datum_adj_wse_ft': datum_adj_wse, + 'datum_adj_wse_m': datum_adj_wse_m, + 'hand_stage': hand_stage, + 'datum_adj_ft': datum_adj_ft, + 'lid_alt_ft': lid_altitude, + 'lid_alt_m': lid_altitude * 0.3048, + } + } + ) + + # Let's see any tifs made it, if not.. change this to an invalid stage value + stage_file_name = os.path.join( + mapping_lid_directory, lid + '_' + category_key + '_extent.tif' + ) + if os.path.exists(stage_file_name) == False: + # somethign failed and we didn't get a rolled up extent file, so we need to reject the stage + stage_values_df.at[idx, 'stage_value'] = -1 + + # So, we might have an MP inside an MP + # let's merge what we have at this point, before we go into another MP + # MP_LOG.merge_log_files(MP_LOG.LOG_FILE_PATH, child_log_file_prefix_tifs, True) + + # we do intervals only on non-record and valid stages + non_rec_stage_values_df_unsorted = stage_values_df[ + (stage_values_df["stage_value"] != -1) & (stage_values_df["stage_name"] != 'record') + ] + + non_rec_stage_values_df = non_rec_stage_values_df_unsorted.sort_values( + by='stage_value' + ).reset_index() + + # MP_LOG.trace(f"non_rec_stage_values_df is {non_rec_stage_values_df}") + + # We already inundated and created files for the specific stages just not the intervals + # Make list of interval recs to be created + interval_list = [] # might stay empty + + num_non_rec_stages = len(non_rec_stage_values_df) + if num_non_rec_stages > 0: + + interval_list = __calc_stage_intervals( + non_rec_stage_values_df, past_major_interval_cap, huc_lid_id + ) + + tif_child_log_file_prefix = MP_LOG.MP_calc_prefix_name( + parent_log_output_file, "MP_sb_interval_tifs" + ) + + # Now we add the interval tifs but no interval tifs for the "record" stage if there is one. 
+ with ProcessPoolExecutor(max_workers=job_number_intervals) as executor: + try: + + for interval_rec in interval_list: # list of lists + + category = interval_rec[0] # stage name + interval_stage_value = interval_rec[1] + + # Calcluate a portion of the file name which includes the category, + # a formatted stage value and a possible "i" to show it is an interval file + category_key = __calculate_category_key(category, interval_stage_value, True) + + executor.submit( + produce_stage_based_lid_tifs, + interval_stage_value, + True, + datum_adj_ft, + branch_dir, + lid_usgs_elev, + lid_altitude, + fim_dir, + segments, + lid, + huc, + mapping_lid_directory, + category, + category_key, + job_number_inundate, + parent_log_output_file, + tif_child_log_file_prefix, + ) + except TypeError: # sometimes the thresholds are Nonetypes + MP_LOG.error( + f"{huc_lid_id}: ERROR: type error in ProcessPool," + " likely in the interval code" + ) + MP_LOG.error(traceback.format_exc()) + continue + + except Exception: + MP_LOG.critical(f"{huc_lid_id}: ERROR: ProcessPool has an error") + MP_LOG.critical(traceback.format_exc()) + # merge MP Logs (Yes) + MP_LOG.merge_log_files(parent_log_output_file, tif_child_log_file_prefix, True) + sys.exit(1) + + # merge MP Logs (merging MP into an MP (proc_pool in a proc_pool)) + MP_LOG.merge_log_files(parent_log_output_file, tif_child_log_file_prefix, True) + + else: + MP_LOG.lprint( + f"{huc_lid_id}: Skipping intervals as there are not any 'non-record' stages" + ) + + # end of skip_add_intervals == False + + # For each valid stage, and if all goes well, they should have files + # that end with "_extent.tif". If there is anything between _extent and .tif + # it is a branch file adn our test is it at least one rollup exists + inundate_lid_files = glob.glob(f"{mapping_lid_directory}/*_extent.tif") + if len(inundate_lid_files) == 0: + msg = ':All stages failed to inundate' + all_messages.append(lid + msg) + MP_LOG.warning(huc_lid_id + msg) + continue + + # Create a csv with same information as geopackage but with each threshold as new record. + # Probably a less verbose way. 
+ csv_df = pd.DataFrame(df_cols) # for first appending + + # for threshold in categories: (threshold and category are somewhat interchangeable) + # some may have failed inundation, which we will rectify later + MP_LOG.trace(f"{huc_lid_id}: updating threshhold values") + for threshold in valid_stage_names: + try: + + # we don't know if the magnitude/stage can be mapped yes it hasn't been inundated + line_df = pd.DataFrame( + { + 'nws_lid': [lid], + 'name': metadata['nws_data']['name'], + 'WFO': metadata['nws_data']['wfo'], + 'rfc': metadata['nws_data']['rfc'], + 'huc': [huc], + 'state': metadata['nws_data']['state'], + 'county': metadata['nws_data']['county'], + 'magnitude': threshold, + 'q': flows[threshold], + 'q_uni': flows['units'], + 'q_src': flows['source'], + 'stage': thresholds[threshold], + 'stage_uni': thresholds['units'], + 's_src': thresholds['source'], + 'wrds_time': thresholds['wrds_timestamp'], + 'nrldb_time': metadata['nrldb_timestamp'], + 'nwis_time': metadata['nwis_timestamp'], + 'lat': [float(metadata['nws_preferred']['latitude'])], + 'lon': [float(metadata['nws_preferred']['longitude'])], + 'dtm_adj_ft': stage_based_att_dict[lid][threshold]['datum_adj_ft'], + 'dadj_w_ft': stage_based_att_dict[lid][threshold]['datum_adj_wse_ft'], + 'dadj_w_m': stage_based_att_dict[lid][threshold]['datum_adj_wse_m'], + 'lid_alt_ft': stage_based_att_dict[lid][threshold]['lid_alt_ft'], + 'lid_alt_m': stage_based_att_dict[lid][threshold]['lid_alt_m'], + 'mapped': '', + 'status': '', + } + ) + csv_df = pd.concat([csv_df, line_df], ignore_index=True) + + except Exception: + # is this the text we want users to see + msg = f':Error with threshold {threshold}' + all_messages.append(lid + msg) + MP_LOG.error(huc_lid_id + msg) + MP_LOG.error(traceback.format_exc()) + continue + # sys.exit(1) + + # might be that none of the lids for this HUC passed + # If a site folder exists (ie a flow file was written) save files containing site attributes. + # if os.path.exists(mapping_lid_directory): + if len(csv_df) > 0: + # Round flow and stage columns to 2 decimal places. + csv_df = csv_df.round({'q': 2, 'stage': 2}) + + # Export DataFrame to csv containing attributes + attributes_filepath = os.path.join(attributes_dir, f'{lid}_attributes.csv') + csv_df.to_csv(attributes_filepath, index=False) + + # If it made it to this point (i.e. no continues), there were no major preventers of mapping + # well.. mostly. If it fails, we can change back to + + if stage_warning_msg == "": # does not mean the lid is good. + all_messages.append(lid + ':Good') + else: # we will leave the ":---" on it for now if it is does have a warnning message + all_messages.append(lid + stage_warning_msg) + MP_LOG.warning(huc_lid_id + stage_warning_msg) + else: + msg = ':Missing all calculated flows' + all_messages.append(lid + msg) + MP_LOG.error(huc_lid_id + msg) + + # MP_LOG.success(f'{huc_lid_id}: Complete') + # mark_complete(mapping_lid_directory) + # end of for loop + # end of if + + # Write all_messages by HUC to be scraped later. + if len(all_messages) > 0: + + # TODO: Aug 2024: This is now identical to the way flow handles messages + # but the system should probably be changed to somethign more elegant but good enough + # for now. At least is is MP safe. 
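Stepping back for a moment: the attribute rows assembled above pull from stage_based_att_dict; a rough illustration of its shape (the lid and numbers are made up, the keys match the update() call earlier in this function):

# stage_based_att_dict = {
#     'xyzp1': {                       # hypothetical lid
#         'action': {
#             'datum_adj_wse_ft': 1074.6,
#             'datum_adj_wse_m': 327.5,
#             'hand_stage': 4.1,
#             'datum_adj_ft': 1.5,
#             'lid_alt_ft': 1049.1,
#             'lid_alt_m': 319.8,
#         },
#         # 'minor': {...}, 'moderate': {...}, etc.
#     }
# }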
+ huc_messages_txt_file = os.path.join(huc_messages_dir, str(huc) + '_messages.txt')
+ with open(huc_messages_txt_file, 'w') as f:
+ for item in all_messages:
+ item = item.strip()
+ # f.write("%s\n" % item)
+ f.write(f"{item}\n")
+
+ except Exception:
+ MP_LOG.error(f"{huc} : {lid} : Error iterating through stage-based processing for this HUC")
+ MP_LOG.error(traceback.format_exc())
+
+ return
+
+
+def __calc_stage_values(categories, thresholds):
+ '''
+ Overview:
+ Calculates the stage value for each of the five categories. Any category that does not have a
+ threshold, or has an invalid value, is defaulted to -1.
+ Parameters:
+ - categories: all 5 of the names ("action", "moderate" etc)
+ - thresholds: values from WRDS for the stages (anywhere from 0 to 5 stages)
+
+ Returns:
+ - a dataframe with a row for each of the five stages, defaulted to -1.
+ - a list of category names with valid values
+ - A warning message if some stages are missing values (stages including record)
+ - An error message if one exists (including if all five stages are invalid)
+ '''
+
+ err_msg = ""
+ warning_msg = ""
+
+ # default values
+ default_stage_data = [['action', -1], ['minor', -1], ['moderate', -1], ['major', -1], ['record', -1]]
+
+ valid_stage_names = []
+
+ # Setting up a default df covering all five stages (including record)
+ stage_values_df = pd.DataFrame(default_stage_data, columns=['stage_name', 'stage_value'])
+
+ for stage in categories:
+
+ if stage in thresholds:
+ stage_val = thresholds[stage]
+ if stage_val is not None and stage_val != "" and stage_val > 0:
+ stage_values_df.loc[stage_values_df.stage_name == stage, 'stage_value'] = stage_val
+ valid_stage_names.append(stage)
+
+ invalid_stages_df = stage_values_df[stage_values_df["stage_value"] <= 0]
+
+ if len(invalid_stages_df) == 5:
+ err_msg = ':All threshold values are unavailable or invalid' # already formatted
+ return None, [], "", err_msg
+
+ # A bit unusual: we put three dashes in front of the message to show the site is still
+ # valid even though one or more stage values are missing. Any other record with a status
+ # value that is not "Good" and does not start with --- is assumed to be bad (not mapped).
+ warning_msg = ""
+
+ for ind, stage_row in invalid_stages_df.iterrows():
+ if warning_msg == "":
+ warning_msg = f":---Missing stage data for {stage_row['stage_name']}"
+ else:
+ warning_msg += f"; {stage_row['stage_name']}"
+
+ return stage_values_df, valid_stage_names, warning_msg, err_msg
+
+
+def __calc_stage_intervals(non_rec_stage_values_df, past_major_interval_cap, huc_lid_id):
+ '''
+ Return:
+ interval_recs is a list of lists, e.g. [["action", 21], ["action", 22], ....]
+ This represents stage names and depths to inundate against, covering all of the interval recs we need
+ '''
+
+ interval_recs = []
+
+ stage_values_claimed = []
+
+ MP_LOG.trace(
+ f"{huc_lid_id}: Calculating intervals for non_rec_stage_values_df is {non_rec_stage_values_df}"
+ )
+
+ num_stage_value_recs = len(non_rec_stage_values_df)
+ MP_LOG.trace(f"{huc_lid_id}: num_stage_value_recs is {num_stage_value_recs}")
+
+ # Recs will be in order. We process one stage at a time, so we keep track of the
+ # stage name associated with each interval.
+ for idx in non_rec_stage_values_df.index:
+
+ row = non_rec_stage_values_df.loc[idx]
+
+ # MP_LOG.trace(f"{huc_lid_id}: interval calcs - non_rec_stage_value is idx: {idx}; {row}")
+
+ cur_stage_name = row["stage_name"]
+ cur_stage_val = row["stage_value"]
+
+ # Calc the intervals between the current and the next stage. Stage values can be
+ # decimals, but the intervals themselves stay at whole numbers.
+ # e.g. if action is 2.4 and moderate is 4.6, we want whole-number intervals between them.
+ # The highest value of the interval_list is not included.
+
+ if float(cur_stage_val) % 1 == 0: # then we have a whole number
+ # we only put it on stage_values_claimed if it is whole because
+ # the intervals are whole numbers and we are looking for dups
+
+ cur_stage_val = int(cur_stage_val)
+ stage_values_claimed.append(cur_stage_val)
+ min_interval_val = int(cur_stage_val) + 1
+
+ else:
+ # not a whole number: round up, then start one whole number above that
+ min_interval_val = math.ceil(cur_stage_val) + 1
+
+ if idx < len(non_rec_stage_values_df) - 1: # not the last record
+ # get the next stage value
+ next_stage_val = non_rec_stage_values_df.iloc[idx + 1]["stage_value"]
+ max_interval_val = int(next_stage_val)
+ # MP_LOG.trace(f"{huc_lid_id}: Next stage value is {max_interval_val}")
+ else:
+ # last rec: just add past_major_interval_cap more (5 by default from the input args)
+ max_interval_val = int(min_interval_val) + past_major_interval_cap
+ # MP_LOG.trace(f"{huc_lid_id}: Last rec and max_in is {max_interval_val}")
+
+ # np.arange excludes the max value itself
+ # MP_LOG.lprint(f"{huc_lid_id}: {cur_stage_name} is {cur_stage_val} and"
+ # f" min_interval_val is {min_interval_val} ; max interval value is {max_interval_val}")
+ interval_list = np.arange(min_interval_val, max_interval_val)
+
+ # Duplicates can sometimes slip through, so filter them out here
+ for int_val in interval_list:
+ if int_val not in stage_values_claimed:
+ interval_recs.append([cur_stage_name, int_val])
+ # MP_LOG.trace(f"{huc_lid_id}: Added interval value of {int_val}")
+ stage_values_claimed.append(int_val)
+
+ MP_LOG.lprint(f"{huc_lid_id} interval recs are {interval_recs}")
+
+ return interval_recs
+
+
+def load_restricted_sites():
+ """
+ At this point, only stage based uses this. An arg such as "catfim_type" (stage or flow)
+ can be added later.
+ + Returns: a dataframe for the restricted lid and the reason why: + "nws_lid", "restricted_reason" + """ + + file_name = "stage_based_ahps_restricted_sites.csv" + current_script_folder = os.path.dirname(__file__) + file_path = os.path.join(current_script_folder, file_name) + + df_restricted_sites = pd.read_csv(file_path, dtype=str) + + df_restricted_sites['nws_lid'].fillna("", inplace=True) + df_restricted_sites['restricted_reason'].fillna("", inplace=True) + + # Need to drop the comment lines before doing any more processing + df_restricted_sites.drop( + df_restricted_sites[df_restricted_sites.nws_lid.str.startswith("#")].index, inplace=True + ) + + df_restricted_sites['nws_lid'] = df_restricted_sites['nws_lid'].str.upper() + + # There are enough conditions and a low number of rows that it is easier to + # test / change them via a for loop + indexs_for_recs_to_be_removed_from_list = [] + for ind, row in df_restricted_sites.iterrows(): + nws_lid = row['nws_lid'] + restricted_reason = row['restricted_reason'] + + if len(nws_lid) != 5: # could be just a blank row in the + FLOG.warning( + f"From the ahps_restricted_sites list, an invalid nws_lid value of '{nws_lid}'" + " and has dropped from processing" + ) + indexs_for_recs_to_be_removed_from_list.append(ind) + continue + + if restricted_reason == "": + restricted_reason = "From the ahps_restricted_sites," + " the site will not be mapped, but a reason has not be provided." + df_restricted_sites.at[ind, 'restricted_reason'] = restricted_reason + FLOG.warning(f"{restricted_reason}. Lid is '{nws_lid}'") + continue + # end for + + # Invalid records (not dropping, just completely invalid recs from the csv) + # Could be just blank rows from the csv + if len(indexs_for_recs_to_be_removed_from_list) > 0: + df_restricted_sites = df_restricted_sites.drop(indexs_for_recs_to_be_removed_from_list).reset_index() + + # print(df_restricted_sites.head(10)) + + return df_restricted_sites + + +def __adjust_datum_ft(flows, metadata, lid, huc_lid_id): + + # TODO: Aug 2024: This whole parts needs revisiting. Lots of lid data has changed and this + # is all likely very old. + + # Jul 2024: For now, we will duplicate messages via all_messsages and via the logging system. + all_messages = [] + + datum_adj_ft = None + ### --- Do Datum Offset --- ### + # determine source of interpolated threshold flows, this will be the rating curve that will be used. + rating_curve_source = flows.get('source') + + # MP_LOG.trace(f"{huc_lid_id} : rating_curve_source is {rating_curve_source}") + + if rating_curve_source is None: + msg = ':No source for rating curve' + all_messages.append(lid + msg) + MP_LOG.warning(huc_lid_id + msg) + return None, all_messages + + # Get the datum and adjust to NAVD if necessary. + nws_datum_info, usgs_datum_info = get_datum(metadata) + if rating_curve_source == 'USGS Rating Depot': + datum_data = usgs_datum_info + elif rating_curve_source == 'NRLDB': + datum_data = nws_datum_info + + # If datum not supplied, skip to new site + datum = datum_data.get('datum', None) + if datum is None: + msg = ':Datum info unavailable' + all_messages.append(lid + msg) + MP_LOG.warning(huc_lid_id + msg) + return None, all_messages + + # ___________________________________________________________________________________________________# + # NOTE: !!!! + # When appending to a all_message and we may not automatcially want the record dropped + # then add "---" in front of the message. 
Whenever the code finds a message that does not + # start with a ---, it assumes if it is a fail and drops it. We will make a better system later. + + # ___________________________________________________________________________________________________# + # SPECIAL CASE: Workaround for "bmbp1" where the only valid datum is from NRLDB (USGS datum is null). + # Modifying rating curve source will influence the rating curve and + # datum retrieved for benchmark determinations. + if lid == 'bmbp1': + rating_curve_source = 'NRLDB' + # ___________________________________________________________________________________________________# + + # SPECIAL CASE: Custom workaround these sites have faulty crs from WRDS. CRS needed for NGVD29 + # conversion to NAVD88 + # USGS info indicates NAD83 for site: bgwn7, fatw3, mnvn4, nhpp1, pinn4, rgln4, rssk1, sign4, smfn7, + # stkn4, wlln7 + # Assumed to be NAD83 (no info from USGS or NWS data): dlrt2, eagi1, eppt2, jffw3, ldot2, rgdt2 + if lid in [ + 'bgwn7', + 'dlrt2', + 'eagi1', + 'eppt2', + 'fatw3', + 'jffw3', + 'ldot2', + 'mnvn4', + 'nhpp1', + 'pinn4', + 'rgdt2', + 'rgln4', + 'rssk1', + 'sign4', + 'smfn7', + 'stkn4', + 'wlln7', + ]: + datum_data.update(crs='NAD83') + # ___________________________________________________________________________________________________# + + # SPECIAL CASE: Workaround for bmbp1; CRS supplied by NRLDB is mis-assigned (NAD29) and + # is actually NAD27. + # This was verified by converting USGS coordinates (in NAD83) for bmbp1 to NAD27 and + # it matches NRLDB coordinates. + if lid == 'bmbp1': + datum_data.update(crs='NAD27') + # ___________________________________________________________________________________________________# + + # SPECIAL CASE: Custom workaround these sites have poorly defined vcs from WRDS. VCS needed to ensure + # datum reported in NAVD88. + # If NGVD29 it is converted to NAVD88. + # bgwn7, eagi1 vertical datum unknown, assume navd88 + # fatw3 USGS data indicates vcs is NAVD88 (USGS and NWS info agree on datum value). + # wlln7 USGS data indicates vcs is NGVD29 (USGS and NWS info agree on datum value). + if lid in ['bgwn7', 'eagi1', 'fatw3']: + datum_data.update(vcs='NAVD88') + elif lid == 'wlln7': + datum_data.update(vcs='NGVD29') + # ___________________________________________________________________________________________________# + + # Adjust datum to NAVD88 if needed + # Default datum_adj_ft to 0.0 + datum_adj_ft = 0.0 + crs = datum_data.get('crs') + if datum_data.get('vcs') in ['NGVD29', 'NGVD 1929', 'NGVD,1929', 'NGVD OF 1929', 'NGVD']: + # Get the datum adjustment to convert NGVD to NAVD. Sites not in contiguous US are previously + # removed otherwise the region needs changed. 
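As a side note, the chain of lid-specific workarounds above could be expressed as a small lookup table; a sketch of that pattern (same overrides as listed above, purely illustrative):

# Sketch only, using a few of the overrides documented above:
# DATUM_OVERRIDES = {
#     'bmbp1': {'crs': 'NAD27'},                   # NRLDB crs mis-assigned, verified as NAD27
#     'fatw3': {'crs': 'NAD83', 'vcs': 'NAVD88'},
#     'wlln7': {'crs': 'NAD83', 'vcs': 'NGVD29'},
# }
# if lid in DATUM_OVERRIDES:
#     datum_data.update(DATUM_OVERRIDES[lid])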
+ try: + datum_adj_ft = ngvd_to_navd_ft(datum_info=datum_data, region='contiguous') + except Exception as ex: + MP_LOG.error(f"ERROR: {huc_lid_id}: ngvd_to_navd_ft") + MP_LOG.error(traceback.format_exc()) + ex = str(ex) + if crs is None: + msg = ':NOAA VDatum adjustment error, CRS is missing' + all_messages.append(lid + msg) + MP_LOG.error(huc_lid_id + msg) + if 'HTTPSConnectionPool' in ex: + time.sleep(10) # Maybe the API needs a break, so wait 10 seconds + try: + datum_adj_ft = ngvd_to_navd_ft(datum_info=datum_data, region='contiguous') + except Exception: + msg = ':NOAA VDatum adjustment error, possible API issue' + all_messages.append(lid + msg) + MP_LOG.error(huc_lid_id + msg) + if 'Invalid projection' in ex: + msg = f':NOAA VDatum adjustment error, invalid projection: crs={crs}' + all_messages.append(lid + msg) + MP_LOG.error(huc_lid_id + msg) + return None, all_messages + + return datum_adj_ft, all_messages + + +def __create_acceptable_usgs_elev_df(usgs_elev_df, huc_lid_id): + acceptable_usgs_elev_df = None + try: + # Drop columns that offend acceptance criteria + usgs_elev_df['acceptable_codes'] = ( + # usgs_elev_df['usgs_data_coord_accuracy_code'].isin(acceptable_coord_acc_code_list) + # & usgs_elev_df['usgs_data_coord_method_code'].isin(acceptable_coord_method_code_list) + usgs_elev_df['usgs_data_alt_method_code'].isin(acceptable_alt_meth_code_list) + & usgs_elev_df['usgs_data_site_type'].isin(acceptable_site_type_list) + ) + + usgs_elev_df = usgs_elev_df.astype({'usgs_data_alt_accuracy_code': float}) + usgs_elev_df['acceptable_alt_error'] = np.where( + usgs_elev_df['usgs_data_alt_accuracy_code'] <= acceptable_alt_acc_thresh, True, False + ) + + acceptable_usgs_elev_df = usgs_elev_df[ + (usgs_elev_df['acceptable_codes'] == True) & (usgs_elev_df['acceptable_alt_error'] == True) + ] + + # # TEMP DEBUG Record row difference and write it to a CSV or something + # label = 'Old code' ## TEMP DEBUG + # num_potential_rows = usgs_elev_df.shape[0] + # num_acceptable_rows = acceptable_usgs_elev_df.shape[0] + # out_message = f'{label}: kept {num_acceptable_rows} rows out of {num_potential_rows} available rows.' 
+ + except Exception: + # Not sure any of the sites actually have those USGS-related + # columns in this particular file, so just assume it's fine to use + + # print("(Various columns related to USGS probably not in this csv)") + # print(f"Exception: \n {repr(e)} \n") + MP_LOG.error(f"{huc_lid_id}: An error has occurred while working with the usgs_elev table") + MP_LOG.error(traceback.format_exc()) + acceptable_usgs_elev_df = usgs_elev_df + + return acceptable_usgs_elev_df + + +def __adj_dem_evalation_val(acceptable_usgs_elev_df, lid, huc_lid_id): + + # MP_LOG.trace(locals()) + + lid_usgs_elev = 0 + all_messages = [] + try: + matching_rows = acceptable_usgs_elev_df.loc[ + acceptable_usgs_elev_df['nws_lid'] == lid.upper(), 'dem_adj_elevation' + ] + + if len(matching_rows) == 0: + msg = ':Gage not in HAND usgs gage records' + all_messages.append(lid + msg) + MP_LOG.warning(huc_lid_id + msg) + return lid_usgs_elev, all_messages + + # It means there are two level paths, use the one that is not 0 + # There will never be more than two + if len(matching_rows) == 2: + lid_usgs_elev = acceptable_usgs_elev_df.loc[ + (acceptable_usgs_elev_df['nws_lid'] == lid.upper()) + & (acceptable_usgs_elev_df['levpa_id'] != 0), + 'dem_adj_elevation', + ].values[0] + else: + lid_usgs_elev = acceptable_usgs_elev_df.loc[ + acceptable_usgs_elev_df['nws_lid'] == lid.upper(), 'dem_adj_elevation' + ].values[0] + + if lid_usgs_elev == 0: + msg = ':DEM adjusted elevation is 0 or not set' + all_messages.append(lid + msg) + MP_LOG.warning(huc_lid_id + msg) + return lid_usgs_elev, all_messages + + except IndexError: # Occurs when LID is missing from table (yes. warning) + msg = ':Error when extracting dem adjusted elevation value' + all_messages.append(lid + msg) + MP_LOG.warning(f"{huc_lid_id}: adjusting dem_adj_elevation") + MP_LOG.warning(huc_lid_id + msg) + MP_LOG.warning(traceback.format_exc()) + + MP_LOG.trace(f"{huc_lid_id} : lid_usgs_elev is {lid_usgs_elev}") + + return lid_usgs_elev, all_messages + + +def __calculate_category_key(category, stage_value, is_interval_stage): + + # Calcuate the category key which becomes part of a file name + # Change to an int if whole number only. ie.. we don't want 22.00 but 22.0, but keep 22.15 + # category_key comes things like this: action, action_24.0ft, or action_24.6ft + # and yes... this needs a better answer. + + category_key = category + "_" # ie) action_ + + if float(stage_value) % 1 == 0: # then we have a whole number + # then we will turn it into a int and manually add ".0" on it + category_key += str(int(stage_value)) + ".0" + else: + category_key += "{:.2f}".format(stage_value) + + category_key += "ft" + + # The "i" in the end means it is an interval + # Now we are action_24.0ft or action_24.6ft or action_24.65ft or action_24.0fti + + if is_interval_stage == True: + category_key += "i" + + return category_key + + +# This creates a HUC iterator with each HUC creating its flow files and tifs +def generate_stage_based_categorical_fim( + output_catfim_dir, + fim_run_dir, + nwm_us_search, + nwm_ds_search, + lid_to_run, + env_file, + job_number_inundate, + job_number_huc, + lst_hucs, + job_number_intervals, + past_major_interval_cap, + nwm_metafile, + df_restricted_sites, +): + ''' + Sep 2024, + I believe this can be radically simplied, but just startign with a dataframe for each ahps and populate what we + can as we go. By the end of this, it will know it's mapped status and reasons why. It can save one per huc and + merged later. 
This would drop the whole huc_messages system and the need to update status later. It would
+ also make it much easier to read. If we write a bit carefully with functions where reasonable, flow based
+ can likely use most of them too.
+ '''
+
+ output_mapping_dir = os.path.join(output_catfim_dir, 'mapping')
+ attributes_dir = os.path.join(output_catfim_dir, 'attributes')
+
+ # Create HUC message directory to store messages that will be read and joined after multiprocessing
+ huc_messages_dir = os.path.join(output_mapping_dir, 'huc_messages')
+ os.makedirs(huc_messages_dir, exist_ok=True)
+
+ FLOG.lprint("Starting generate_flows (Stage Based)")
+
+ # If it is stage based, generate_flows returns all of these objects and stops there.
+ # If it is flow based, generate_flows continues on and creates the flow files itself.
+
+ # Generate flows is only using one of the incoming job number params
+ # so let's multiply -jh (huc) and -jn (inundate)
+ job_flows = job_number_huc * job_number_inundate
+ if job_flows > 90:
+ job_flows = 90
+
+ # For stage based, generate_flows mostly returns the values sent in with a few changes;
+ # stage based doesn't really need the generated flow data.
+ # But for flow based, it really does use it to generate flows.
+ #
+ (huc_dictionary, sites_gdf, ___, threshold_url, all_lists, nwm_flows_df, nwm_flows_alaska_df) = (
+ generate_flows(
+ output_catfim_dir,
+ nwm_us_search,
+ nwm_ds_search,
+ lid_to_run,
+ env_file,
+ job_flows,
+ True,
+ lst_hucs,
+ nwm_metafile,
+ str(FLOG.LOG_FILE_PATH),
+ )
+ )
+
+ # FLOG.trace("Huc dictionary is ...")
+ # FLOG.trace(huc_dictionary)
+
+ child_log_file_prefix = FLOG.MP_calc_prefix_name(FLOG.LOG_FILE_PATH, "MP_iter_hucs")
+
+ FLOG.lprint(">>>>>>>>>>>>>>>>>>>>>>>>>>>>")
+ FLOG.lprint("Start processing HUCs for Stage-Based CatFIM")
+ num_hucs = len(lst_hucs)
+ huc_index = 0
+ FLOG.lprint(f"Number of hucs to process is {num_hucs}")
+
+ with ProcessPoolExecutor(max_workers=job_number_huc) as executor:
+ try:
+ for huc in huc_dictionary:
+ if huc in lst_hucs:
+ # FLOG.lprint(f'Generating stage based catfim for : {huc}')
+
+ nwm_flows_region_df = nwm_flows_alaska_df if str(huc[:2]) == '19' else nwm_flows_df
+
+ progress_stmt = f"index {huc_index + 1} of {num_hucs}"
+ executor.submit(
+ iterate_through_huc_stage_based,
+ output_catfim_dir,
+ huc,
+ fim_run_dir,
+ huc_dictionary,
+ threshold_url,
+ all_lists,
+ past_major_interval_cap,
+ job_number_inundate,
+ job_number_intervals,
+ nwm_flows_region_df,
+ df_restricted_sites,
+ str(FLOG.LOG_FILE_PATH),
+ child_log_file_prefix,
+ progress_stmt,
+ )
+ huc_index += 1
+
+ except Exception:
+ FLOG.critical("ERROR: ProcessPool has an error")
+ FLOG.critical(traceback.format_exc())
+ sys.exit(1)
+
+ # Need to merge MP logs here, merged into the "master log file"
+ FLOG.merge_log_files(FLOG.LOG_FILE_PATH, child_log_file_prefix, True)
+
+ FLOG.lprint(">>>>>>>>>>>>>>>>>>>>>>>>>>>>")
+ FLOG.lprint('Wrapping up processing HUCs for Stage-Based CatFIM...')
+
+ attrib_csv_files = [x for x in os.listdir(attributes_dir) if x.endswith('_attributes.csv')]
+
+ # print(f"attrib_csv_files are {attrib_csv_files}")
+
+ all_csv_df = pd.DataFrame()
+ refined_csv_files_list = []
+ for csv_file in attrib_csv_files:
+
+ full_csv_path = os.path.join(attributes_dir, csv_file)
+ # HUC has to be read in as string to preserve leading zeros.
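A one-line illustration of why the dtype matters in the read below (HUC value hypothetical):

# Without dtype={'huc': str}, a HUC such as '02020005' would be parsed as the integer
# 2020005, dropping the leading zero and breaking later joins/filters on the huc field.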
+ try: + temp_df = pd.read_csv(full_csv_path, dtype={'huc': str}) + if len(temp_df) > 0: + all_csv_df = pd.concat([all_csv_df, temp_df], ignore_index=True) + refined_csv_files_list.append(csv_file) + except Exception: # Happens if a file is empty (i.e. no mapping) + FLOG.error(f"ERROR: loading csv {full_csv_path}") + FLOG.error(traceback.format_exc()) + pass + + # Write to file + if len(all_csv_df) == 0: + raise Exception("no csv files found") + + all_csv_df.to_csv(os.path.join(attributes_dir, 'nws_lid_attributes.csv'), index=False) + + # This section populates a geopackage of all potential sites and details + # whether it was mapped or not (mapped field) and if not, why (status field). + + # Preprocess the out_gdf GeoDataFrame. Reproject and reformat fields. + + # TODO: Accomodate AK projection? Yes.. and Alaska and CONUS should all end up as the same projection output + # epsg:5070, we really want 3857 out for all outputs + sites_gdf = sites_gdf.to_crs(VIZ_PROJECTION) + sites_gdf.rename( + columns={ + 'identifiers_nwm_feature_id': 'nwm_seg', + 'identifiers_nws_lid': 'nws_lid', + 'identifiers_usgs_site_code': 'usgs_gage', + }, + inplace=True, + ) + sites_gdf['nws_lid'] = sites_gdf['nws_lid'].str.lower() + + # Using list of csv_files, populate DataFrame of all nws_lids that had + # a flow file produced and denote with "mapped" column. + nws_lids = [] + for csv_file in attrib_csv_files: + nws_lids.append(csv_file.split('_attributes')[0]) + lids_df = pd.DataFrame(nws_lids, columns=['nws_lid']) + + # Identify what lids were mapped by merging with lids_df. Populate + # 'mapped' column with 'No' if sites did not map. + sites_gdf = sites_gdf.merge(lids_df, how='left', on='nws_lid') + + # Added here, but may be changed later if files don't inundate + sites_gdf['mapped'] = "no" + + # Read all messages for all HUCs + # This is basically identical to a chunk in flow based. At a min, consolidate + # or better yet, find a more elegant, yet still MP safe, system than .txt files + # but it works.. so maybe someday. + huc_message_list = [] + huc_messages_dir_list = os.listdir(huc_messages_dir) + for huc_message_file in huc_messages_dir_list: + full_path_file = os.path.join(huc_messages_dir, huc_message_file) + with open(full_path_file, 'r') as f: + if full_path_file.endswith('.txt'): + lines = f.readlines() + for line in lines: + line = line.strip() + huc_message_list.append(line) + + # Filter out columns and write out to file + # flow based doesn't make it here only stage + nws_lid_gpkg_file_path = os.path.join(output_mapping_dir, 'stage_based_catfim_sites.gpkg') + + # Write messages to DataFrame, split into columns, aggregate messages. + if len(huc_message_list) > 0: + + FLOG.lprint(f"nws_sites_layer ({nws_lid_gpkg_file_path}) : adding messages") + messages_df = pd.DataFrame(huc_message_list, columns=['message']) + + messages_df = ( + messages_df['message'] + .str.split(':', n=1, expand=True) + .rename(columns={0: 'nws_lid', 1: 'status'}) + ) + + # see if there are any duplicates. Their should not be in theory + # We pull them out so we can print dups, before removing them from the parent + duplicate_lids = messages_df[messages_df.duplicated(['nws_lid'])] + if len(duplicate_lids) > 0: + FLOG.warning("Duplicate ahps ids found...") + FLOG.warning(duplicate_lids) + # let's just pick the last one (gulp) + messages_df = messages_df.drop_duplicates(subset=['nws_lid'], keep='last').reset_index(drop=True) + + # Join messages to populate status field to candidate sites. Assign + # status for null fields. 
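To make the message plumbing above concrete, a small illustration of how the 'lid:status' strings split into columns (the lid/status pairings are hypothetical; the message texts come from this file):

# messages_df['message'].str.split(':', n=1, expand=True) uses only the first colon, so:
#   'xyzp1:Good'                              -> nws_lid='xyzp1', status='Good'
#   'xyzp1:---Missing stage data for record'  -> nws_lid='xyzp1', status='---Missing stage data for record'
#   'xyzp1:All stages failed to inundate'     -> nws_lid='xyzp1', status='All stages failed to inundate'
# Statuses starting with '---' are warnings that keep the site; anything else that is not
# 'Good' is treated as a failure.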
+ sites_gdf = sites_gdf.merge(messages_df, how='left', on='nws_lid') + + # TODO: This is ugly. It is possible that there are no inundation files for any given lid + # if that is true, we need to update this sites csv. We will figure that out in final + # library mapping and update the sites csv at the same time for those scenarios + + # (msg in flows) + # sites_gdf['status'] = sites_gdf['status'].fillna('Good') # hummmm + # Status could still be starting with --- at this point and leave it for now. + + # Add acceptance criteria to viz_out_gdf before writing + sites_gdf['acceptable_coord_acc_code_list'] = str(acceptable_coord_acc_code_list) + sites_gdf['acceptable_coord_method_code_list'] = str(acceptable_coord_method_code_list) + sites_gdf['acceptable_alt_acc_thresh'] = float(acceptable_alt_acc_thresh) + sites_gdf['acceptable_alt_meth_code_list'] = str(acceptable_alt_meth_code_list) + sites_gdf['acceptable_site_type_list'] = str(acceptable_site_type_list) + + # Rename the stage_based_catfim db column from nws_lid to ahps_lid to be + # consistant with all other CatFIM outputs + sites_gdf.rename(columns={"nws_lid": "ahps_lid"}, inplace=True) + + # Index = False as it already has a field called Index from the merge + sites_gdf.to_file(nws_lid_gpkg_file_path, driver='GPKG', index=False, engine='fiona') + + csv_file_path = nws_lid_gpkg_file_path.replace(".gpkg", ".csv") + sites_gdf.to_csv(csv_file_path, index=False) + else: + FLOG.warning(f"nws_sites_layer ({nws_lid_gpkg_file_path}) : has no messages and should have some") + + +def set_start_files_folders( + step_num, output_catfim_dir, output_mapping_dir, output_flows_dir, attributes_dir, overwrite +): + + # ================================ + # Folder cleaning based on step system + if step_num == 0: + # The override is not for the parent folder as we want to keep logs around with or without override + if os.path.exists(output_catfim_dir) == False: + os.mkdir(output_catfim_dir) + + # Create output directories (check against maping only as a proxy for all three) + if os.path.exists(output_mapping_dir) == True: + if overwrite == False: + raise Exception( + f"The output mapping folder of {output_catfim_dir} already exists." + " If you want to overwrite it, please add the -o flag. Note: When overwritten, " + " the three folders of mapping, flows and attributes wil be deleted and rebuilt" + ) + shutil.rmtree(output_flows_dir, ignore_errors=True) + shutil.rmtree(attributes_dir, ignore_errors=True) + shutil.rmtree(output_mapping_dir, ignore_errors=True) + + os.makedirs(output_flows_dir, exist_ok=True) + os.makedirs(output_mapping_dir, exist_ok=True) + os.makedirs(attributes_dir, exist_ok=True) + + # Always keeps the logs folder + log_dir = os.path.join(output_catfim_dir, "logs") + log_output_file = FLOG.calc_log_name_and_path(log_dir, "catfim") + FLOG.setup(log_output_file) + + +if __name__ == '__main__': + + ''' + Sample + python /foss_fim/tools/generate_categorical_fim.py -f /outputs/Rob_catfim_test_1 -jh 1 -jn 10 -ji 8 + -e /data/config/catfim.env -t /data/catfim/rob_test/docker_test_1 + -me '/data/catfim/rob_test/nwm_metafile.pkl' -sb -step 2 + ''' + + # Parse arguments + parser = argparse.ArgumentParser(description='Run Categorical FIM') + parser.add_argument( + '-f', + '--fim_run_dir', + help='Path to directory containing HAND outputs, e.g. /data/previous_fim/fim_4_5_2_11', + required=True, + ) + parser.add_argument( + '-e', + '--env_file', + help='Docker mount path to the catfim environment file. 
ie) data/config/catfim.env', + required=True, + ) + parser.add_argument( + '-jh', + '--job_number_huc', + help='OPTIONAL: Number of processes to use for HUC scale operations.' + ' HUC and inundation job numbers should multiply to no more than one less than the CPU count of the' + ' machine. CatFIM sites generally only have 2-3 branches overlapping a site, so this number can be ' + 'kept low (2-4). Defaults to 1.', + required=False, + default=1, + type=int, + ) + parser.add_argument( + '-jn', + '--job_number_inundate', + help='OPTIONAL: Number of processes to use for inundating' + ' HUC and inundation job numbers should multiply to no more than one less than the CPU count' + ' of the machine. Defaults to 1.', + required=False, + default=1, + type=int, + ) + + parser.add_argument( + '-ji', + '--job_number_intervals', + help='OPTIONAL: Number of processes to use for inundating multiple intervals in stage-based' + ' inundation and interval job numbers should multiply to no more than one less than the CPU count ' + 'of the machine. Defaults to 1.', + required=False, + default=1, + type=int, + ) + + parser.add_argument( + '-sb', + '--is_stage_based', + help='Run stage-based CatFIM instead of flow-based? Add this -sb param to make it stage based,' + ' leave it off for flow based', + required=False, + default=False, + action='store_true', + ) + parser.add_argument( + '-t', + '--output_folder', + help='OPTIONAL: Target location, Where the output folder will be. Defaults to /data/catfim/', + required=False, + default='/data/catfim/', + ) + parser.add_argument( + '-s', + '--search', + help='OPTIONAL: Upstream and downstream search in miles. How far up and downstream do you want to go? Defaults to 5.', + required=False, + default='5', + ) + + ## Deprecated, use lst_hucs instead + # TODO: lid_to_run functionality... remove? for now, just hard code lid_to_run as "all" + # parser.add_argument( + # '-l', + # '--lid_to_run', + # help='OPTIONAL: NWS LID, lowercase, to produce CatFIM for. Currently only accepts one. Defaults to all sites', + # required=False, + # default='all', + # ) + + # NOTE: The HUCs you put in this, MUST be a HUC that is valid in your -f/ --fim_run_dir (HAND output folder) + parser.add_argument( + '-lh', + '--lst_hucs', + help='OPTIONAL: Space-delimited list of HUCs to produce CatFIM for. Defaults to all HUCs', + required=False, + default='all', + ) + + parser.add_argument( + '-mc', + '--past_major_interval_cap', + help='OPTIONAL: Stage-Based Only. How many feet past major do you want to go for the interval FIMs?' + ' of the machine. Defaults to 5.', + required=False, + default=5.0, + type=float, + ) + + parser.add_argument( + '-step', + '--step-num', + help='OPTIONAL: By adding a number here, you may be able to skip levels of processing. The number' + ' you submit means it will start at that step. e.g. step of 2 means start at step 2 which for flow' + ' based is the creating of tifs and gpkgs. Note: This assumes' + ' those previous steps have already been processed and the files are present.' + ' Defaults to 0 which means all steps processed.', + required=False, + default=0, + type=int, + ) + + # NOTE: This params is for quick debugging only and should not be used in a production mode + parser.add_argument( + '-me', + '--nwm_metafile', + help='OPTIONAL: If you have a pre-existing nwm metadata pickle file, you can path to it here.' 
+ ' e.g.: /data/catfim/nwm_metafile.pkl', + required=False, + default="", + ) + + parser.add_argument( + '-o', '--overwrite', help='OPTIONAL: Overwrite files', required=False, action="store_true" + ) + + args = vars(parser.parse_args()) + + try: + + # call main program + process_generate_categorical_fim(**args) + + except Exception: + FLOG.critical(traceback.format_exc()) diff --git a/tools/generate_categorical_fim_flows.py b/tools/catfim/generate_categorical_fim_flows.py similarity index 62% rename from tools/generate_categorical_fim_flows.py rename to tools/catfim/generate_categorical_fim_flows.py index 7acfb4631..3646fd7c3 100755 --- a/tools/generate_categorical_fim_flows.py +++ b/tools/catfim/generate_categorical_fim_flows.py @@ -1,12 +1,12 @@ #!/usr/bin/env python3 -# import csv - import argparse import copy +import glob import os import pickle import random +import shutil import sys import time import traceback @@ -44,7 +44,7 @@ def get_env_paths(env_file): - if os.path.exists(env_file) is False: + if os.path.exists(env_file) == False: raise Exception(f"The environment file of {env_file} does not seem to exist") load_dotenv(env_file) @@ -80,13 +80,14 @@ def generate_flows_for_huc( # A bit of start staggering to help not overload the MP (20 sec) time_delay = random.randrange(0, 20) # MP_LOG.lprint(f" ... {huc} start time is {dt_string} and delay is {time_delay}") + MP_LOG.lprint("") MP_LOG.lprint(f" ... {huc} flow generation start time is {dt_string}") time.sleep(time_delay) - # Process each huc unit, first define message variable and flood categories. + # Process each huc unit, first define message variable and categories. all_messages = [] - flood_categories = ['action', 'minor', 'moderate', 'major', 'record'] + categories = ['action', 'minor', 'moderate', 'major', 'record'] nws_lids = huc_dictionary[huc] @@ -94,8 +95,35 @@ def generate_flows_for_huc( MP_LOG.lprint(f"huc {huc} has no applicable nws_lids") return + # less columns then stage cols + df_cols = { + "nws_lid": pd.Series(dtype='str'), + "name": pd.Series(dtype='str'), + "WFO": pd.Series(dtype='str'), + "rfc": pd.Series(dtype='str'), + "huc": pd.Series(dtype='str'), + "state": pd.Series(dtype='str'), + "county": pd.Series(dtype='str'), + "magnitude": pd.Series(dtype='str'), + "q": pd.Series(dtype='str'), + "q_uni": pd.Series(dtype='str'), + "q_src": pd.Series(dtype='str'), + "stage": pd.Series(dtype='float'), + "stage_uni": pd.Series(dtype='str'), + "s_src": pd.Series(dtype='str'), + "wrds_time": pd.Series(dtype='str'), + "nrldb_time": pd.Series(dtype='str'), + "nwis_time": pd.Series(dtype='str'), + "lat": pd.Series(dtype='float'), + "lon": pd.Series(dtype='float'), + } + # Loop through each lid in list to create flow file for lid in nws_lids: + MP_LOG.lprint("-----------------------------------") + huc_lid_id = f"{huc} : {lid}" + MP_LOG.lprint(f"processing {huc_lid_id}") + # Convert lid to lower case lid = lid.lower() @@ -105,36 +133,50 @@ def generate_flows_for_huc( # Careful, for "all_message.append" the syntax into it must be f'{lid}: (whever messages) # this is gets parsed and logic used against it. 
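In other words, every entry appended to all_messages is expected to look like f'{lid}:<status text>', because the text before the first colon is later split out as the lid and the remainder becomes the status column. A hypothetical example:

# e.g. all_messages.append(lid + ':Missing nwm stream segments')
#      -> later split on the first ':' into nws_lid and status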
- MP_LOG.trace(f'Getting thresholds for {lid}') + # for Stage based, is uses stage values from threshold data supplied by WRDS + # but here (for flow), it uses the values from the flows data from WRDS stages, flows = get_thresholds( threshold_url=threshold_url, select_by='nws_lid', selector=lid, threshold='all' ) - if len(stages) == 0 or len(flows) == 0: - message = f'{lid}:no stages or flows exist, likely WRDS error' - all_messages.append(message) - MP_LOG.warning(f"{huc} - {message}") + # Yes.. stages, we will handle missing flows later even though we don't use the stage here + if stages is None or len(stages) == 0: + msg = ':Error getting stages values from WRDS API' + all_messages.append(lid + msg) + MP_LOG.warning(huc_lid_id + msg) continue + # MP_LOG.lprint(f"Thresholds for {huc_lid_id} are : {thresholds}") + # Check if stages are supplied, if not write message and exit. - if all(stages.get(category, None) is None for category in flood_categories): - message = f'{lid}:missing threshold stages' + if all(stages.get(category, None) is None for category in categories): + message = f'{lid}:Missing all stage data' all_messages.append(message) MP_LOG.warning(f"{huc} - {message}") continue # Check if calculated flows are supplied, if not write message and exit. - if all(flows.get(category, None) is None for category in flood_categories): - message = f'{lid}:missing calculated flows' + if all(flows.get(category, None) is None for category in categories): + message = f'{lid}:Missing all calculated flows for all stages' all_messages.append(message) MP_LOG.warning(f"{huc} - {message}") continue + # Sept 2024: Can flows be missing a category, yes, but we jsut filter them later + # Find lid metadata from master list of metadata dictionaries (line 66). metadata = next( (item for item in all_meta_lists if item['identifiers']['nws_lid'] == lid.upper()), False ) + # Sept 2024: Should we skip these functions that are seen in stage based? Yes + # Flow doesn't need all of the evalation stuff + # acceptable_usgs_elev_df = __create_acceptable_usgs_elev_df(usgs_elev_df, huc_lid_id) + # lid_usgs_elev, dem_eval_messages = __adj_dem_evalation_val( + # lid_altitude = metadata['usgs_data']['altitude'] + # Filter out sites that don't have "good" data (coords) + # datum_adj_ft, datum_messages = __adjust_datum_ft + # Get mainstem segments of LID by intersecting LID segments with known mainstem segments. unfiltered_segments = list(set(get_nwm_segs(metadata))) desired_order = metadata['nwm_feature_data']['stream_order'] @@ -144,40 +186,58 @@ def generate_flows_for_huc( # If there are no segments, write message and exit out if not segments or len(segments) == 0: - message = f'{lid}:missing nwm segments' + message = f'{lid}:Missing nwm stream segments' all_messages.append(message) MP_LOG.warning(f"{huc} - {message}") continue + missing_wrds_data_msg = "" + invalid_flows = [] + # If we got here, we have at least one value stage/threshold # For each flood category - for category in flood_categories: - # Get the flow + for category in categories: + # In stage we use the threshold data here from WRDS, but here it is flows flow = flows[category] - if flow is not None and flow != 0: + MP_LOG.trace(f"{huc} : {lid} : {category} : flow value is {flow}") + + if flow is not None and flow != 0 and flow != "": # If there is a valid flow value, write a flow file. 
# if flow: # round flow to nearest hundredth flow = round(flow, 2) + # value of flow shows up in the {lid}_attributes.csv file as the "q" column + # Create the guts of the flow file. flow_info = flow_data(segments, flow) # Define destination path and create folders csv_output_folder = os.path.join(output_flows_dir, huc, lid, category) os.makedirs(csv_output_folder, exist_ok=True) - output_file = os.path.join( - csv_output_folder, f'ahps_{lid}_huc_{huc}_flows_{category}.csv' - ) + output_file = os.path.join(csv_output_folder, f'{lid}_huc_{huc}_flows_{category}.csv') # Write flow file to file flow_info.to_csv(output_file, index=False) else: - message = f'{lid}:{category} is missing calculated flow' - all_messages.append(message) - MP_LOG.warning(f"{huc} - {message}") + if missing_wrds_data_msg == "": + missing_wrds_data_msg = f":---Missing flow data for {category}" + else: + missing_wrds_data_msg += f"; {category}" + + invalid_flows.append(category) + + if len(invalid_flows) == 5: + msg = ':No valid flow values are available' + all_messages.append(lid + msg) + MP_LOG.warning(huc_lid_id + msg) + continue + + if missing_wrds_data_msg != "": + all_messages.append(lid + missing_wrds_data_msg) + MP_LOG.warning(huc_lid_id + missing_wrds_data_msg) # Get various attributes of the site. lat = float(metadata['nws_preferred']['latitude']) @@ -194,64 +254,94 @@ def generate_flows_for_huc( nwis_timestamp = metadata['nwis_timestamp'] # Create a csv with same information as shapefile but with each threshold as new record. - csv_df = pd.DataFrame() - for threshold in flood_categories: - line_df = pd.DataFrame( - { - 'nws_lid': [lid], - 'name': name, - 'WFO': wfo, - 'rfc': rfc, - 'huc': [huc], - 'state': state, - 'county': county, - 'magnitude': threshold, - 'q': flows[threshold], - 'q_uni': flows['units'], - 'q_src': flow_source, - 'stage': stages[threshold], - 'stage_uni': stages['units'], - 's_src': stage_source, - 'wrds_time': wrds_timestamp, - 'nrldb_time': nrldb_timestamp, - 'nwis_time': nwis_timestamp, - 'lat': [lat], - 'lon': [lon], - } - ) - csv_df = pd.concat([csv_df, line_df]) - - # Round flow and stage columns to 2 decimal places. - csv_df = csv_df.round({'q': 2, 'stage': 2}) - - # If a site folder exists (ie a flow file was written) save files containing site attributes. - huc_lid_flow_dir = os.path.join(output_flows_dir, huc, lid) - - if os.path.exists(huc_lid_flow_dir): + # if we got here, mapped is fine. + csv_df = pd.DataFrame(df_cols) # for first appending + for threshold in categories: + try: + line_df = pd.DataFrame( + { + 'nws_lid': [lid], + 'name': name, + 'WFO': wfo, + 'rfc': rfc, + 'huc': [huc], + 'state': state, + 'county': county, + 'magnitude': threshold, + 'q': flows[threshold], + 'q_uni': flows['units'], + 'q_src': flow_source, + 'stage': stages[threshold], + 'stage_uni': stages['units'], + 's_src': stage_source, + 'wrds_time': wrds_timestamp, + 'nrldb_time': nrldb_timestamp, + 'nwis_time': nwis_timestamp, + 'lat': [lat], + 'lon': [lon], + } + ) + csv_df = pd.concat([csv_df, line_df], ignore_index=True) + + except Exception: + # is this the text we want users to see + msg = f':Error with flow/threshold processing {threshold}' + all_messages.append(huc_lid_id + msg) + MP_LOG.error(huc_lid_id + msg) + MP_LOG.error(traceback.format_exc()) + continue + # sys.exit(1) + + # might be that none of the lids for this HUC passed + # MP_LOG.trace(f"len(csv_df) is {len(csv_df)}") + if len(csv_df) > 0: + # Round flow and stage columns to 2 decimal places. 
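For clarity, an illustration of how missing_wrds_data_msg accumulates when some categories have no flow (the category mix is hypothetical):

# If 'minor' and 'record' had no valid flow, the loop above produces:
#   ':---Missing flow data for minor; record'
# The leading '---' marks it as a warning so the site is not dropped outright.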
+ csv_df = csv_df.round({'q': 2, 'stage': 2}) + # Export DataFrame to csv containing attributes - csv_df.to_csv(os.path.join(attributes_dir, f'{lid}_attributes.csv'), index=False) - message = f'{lid}:flows available' - all_messages.append(message) + file_name = os.path.join(attributes_dir, f'{lid}_attributes.csv') + csv_df.to_csv(file_name, index=False) + + if missing_wrds_data_msg == "": + all_messages.append(lid + ':Good') else: - message = f'{lid}:missing all calculated flows' - all_messages.append(message) - MP_LOG.warning(f"Missing all calculated flows for {huc} - {lid}") - - # Write all_messages to huc-specific file. - # MP_LOG.lprint(f'Writing message file for {huc}') - huc_messages_txt_file = os.path.join(huc_messages_dir, str(huc) + '_messages.txt') - with open(huc_messages_txt_file, 'w') as f: - for item in all_messages: - item = item.strip() - # f.write("%s\n" % item) - f.write(f"{item}\n") - # MP_LOG.lprint(f'--- generate_flow_for_huc done for {huc}') - - end_time = datetime.now(timezone.utc) - dt_string = end_time.strftime("%m/%d/%Y %H:%M:%S") - time_duration = end_time - start_time - MP_LOG.lprint(f" ... {huc} end time is {dt_string} : Duration: {str(time_duration).split('.')[0]}") - print("") + msg = ':Missing all calculated flows' + all_messages.append(lid + msg) + MP_LOG.error(huc_lid_id + msg) + + MP_LOG.success(f'{huc_lid_id}: Complete') + + # Write all_messages by HUC to be scraped later. + if len(all_messages) > 0: + + # Check for duplicate sites in the messages + lids = [msg.split(':')[0] for msg in all_messages] + duplicate_lids = set([x for x in lids if lids.count(x) > 1]) + + # If there are duplicate sites and one of the lines has '---', drop that line + filtered_messages = [ + msg for msg in all_messages if not (msg.split(':')[0] in duplicate_lids and ':---' in msg) + ] + + # TODO: Aug 2024: This is now identical to the way flow handles messages + # but the system should probably be changed to somethign more elegant but good enough + # for now. At least is is MP safe. + + # Write filtered_messages to huc-specific file. + # MP_LOG.lprint(f'Writing message file for {huc}') + huc_messages_txt_file = os.path.join(huc_messages_dir, str(huc) + '_messages.txt') + with open(huc_messages_txt_file, 'w') as f: + for item in filtered_messages: + item = item.strip() + # f.write("%s\n" % item) + f.write(f"{item}\n") + # MP_LOG.lprint(f'--- generate_flow_for_huc done for {huc}') + + # end_time = datetime.now(timezone.utc) + # dt_string = end_time.strftime("%m/%d/%Y %H:%M:%S") + # time_duration = end_time - start_time + # MP_LOG.lprint(f" ... {huc} end time is {dt_string} : Duration: {str(time_duration).split('.')[0]}") + # print("") except Exception as ex: MP_LOG.error(f"An error occured while generating flows for huc {huc}") @@ -300,17 +390,17 @@ def generate_flows( wbd_path : STR Location of HUC geospatial data (geopackage). - Returns - ------- - nws_lid_gpkg_file_path. 
- Name and path of the nws_lid file ''' FLOG.setup(log_output_file) # reusing the parent logs + FLOG.lprint("Gettting flows") # FLOG.trace("args coming into generate flows") # FLOG.trace(locals()) # see all args coming in to the function attributes_dir = os.path.join(output_catfim_dir, 'attributes') + os.makedirs(attributes_dir, exist_ok=True) + mapping_dir = os.path.join(output_catfim_dir, "mapping") # create var but don't make folder yet all_start = datetime.now(timezone.utc) @@ -323,7 +413,8 @@ def generate_flows( # Create HUC message directory to store messages that will be read and joined after multiprocessing huc_messages_dir = os.path.join(mapping_dir, 'huc_messages') - os.makedirs(huc_messages_dir, exist_ok=True) + if not os.path.exists(huc_messages_dir): + os.mkdir(huc_messages_dir) FLOG.lprint("Loading nwm flow metadata") start_dt = datetime.now(timezone.utc) @@ -332,8 +423,8 @@ def generate_flows( nwm_flows_gpkg = r'/data/inputs/nwm_hydrofabric/nwm_flows.gpkg' nwm_flows_df = gpd.read_file(nwm_flows_gpkg) - # nwm_flows_alaska_gpkg = r'/data/inputs/nwm_hydrofabric/nwm_flows_alaska_nwmV3_ID.gpkg' # Uncomment to include Alaska - # nwm_flows_alaska_df = gpd.read_file(nwm_flows_alaska_gpkg) # Uncomment to include Alaska + nwm_flows_alaska_gpkg = r'/data/inputs/nwm_hydrofabric/nwm_flows_alaska_nwmV3_ID.gpkg' + nwm_flows_alaska_df = gpd.read_file(nwm_flows_alaska_gpkg) # nwm_metafile might be an empty string # maybe ensure all projections are changed to one standard output of 3857 (see shared_variables) as the come out @@ -359,6 +450,7 @@ def generate_flows( huc_dictionary, out_gdf = aggregate_wbd_hucs(all_meta_lists, WBD_LAYER, True, lst_hucs) + # FLOG.lprint(f"WBD LAYER USED: {WBD_LAYER}") # TEMP DEBUG # Drop list fields if invalid out_gdf = out_gdf.drop(['downstream_nwm_features'], axis=1, errors='ignore') out_gdf = out_gdf.drop(['upstream_nwm_features'], axis=1, errors='ignore') @@ -368,6 +460,7 @@ def generate_flows( time_duration = end_dt - start_dt FLOG.lprint(f"End aggregate_wbd_hucs - Duration: {str(time_duration).split('.')[0]}") + FLOG.lprint("") FLOG.lprint("Start Flow Generation") # It this is stage-based, it returns all of these objects here, but if it continues @@ -380,8 +473,8 @@ def generate_flows( threshold_url, all_meta_lists, nwm_flows_df, - ) # No Alaska - # return (huc_dictionary, out_gdf, metadata_url, threshold_url, all_meta_lists, nwm_flows_df, nwm_flows_alaska_df) # Alaska + nwm_flows_alaska_df, + ) # only flow based needs the "flow" dir output_flows_dir = os.path.join(output_catfim_dir, "flows") @@ -397,8 +490,10 @@ def generate_flows( with ProcessPoolExecutor(max_workers=job_number_huc) as executor: for huc in huc_dictionary: - nwm_flows_region_df = nwm_flows_df # To exclude Alaska - # nwm_flows_region_df = nwm_flows_alaska_df if huc[:2] == '19' else nwm_flows_df # To include Alaska + # nwm_flows_region_df = nwm_flows_df # To exclude Alaska + nwm_flows_region_df = ( + nwm_flows_alaska_df if huc[:2] == '19' else nwm_flows_df + ) # To include Alaska # Deep copy that speed up Multi-Proc a little as all_meta_lists # is a huge object. 
Need to figure out how to filter that down somehow @@ -429,14 +524,15 @@ def generate_flows( FLOG.lprint('Start merging and finalizing flows generation data') # Recursively find all *_attributes csv files and append - csv_files = [x for x in os.listdir(attributes_dir) if x.endswith('_attributes.csv')] + # attrib_csv_files = [x for x in os.listdir(attributes_dir) if x.endswith('_attributes.csv')] + attrib_csv_files = glob.glob(f"{attributes_dir}/*_attributes.csv") - if len(csv_files) == 0: - MP_LOG.critical(f"No new flow files exist in the {attributes_dir} folder (errors in creating them?)") + if len(attrib_csv_files) == 0: + FLOG.critical(f"No new flow files exist in the {attributes_dir} folder (errors in creating them?)") sys.exit(1) all_csv_df = pd.DataFrame() - for csv_file in csv_files: + for csv_file in attrib_csv_files: full_csv_path = os.path.join(attributes_dir, csv_file) # Huc has to be read in as string to preserve leading zeros. temp_df = pd.read_csv(full_csv_path, dtype={'huc': str}) @@ -462,15 +558,19 @@ def generate_flows( # Using list of csv_files, populate DataFrame of all nws_lids that had # a flow file produced and denote with "mapped" column. nws_lids = [] - for csv_file in csv_files: + for csv_file in attrib_csv_files: nws_lids.append(csv_file.split('_attributes')[0]) lids_df = pd.DataFrame(nws_lids, columns=['nws_lid']) - lids_df['mapped'] = 'yes' + # lids_df['mapped'] = 'yes' # Identify what lids were mapped by merging with lids_df. Populate # 'mapped' column with 'No' if sites did not map. viz_out_gdf = viz_out_gdf.merge(lids_df, how='left', on='nws_lid') - viz_out_gdf['mapped'] = viz_out_gdf['mapped'].fillna('no') + viz_out_gdf.reset_index(inplace=True, drop=True) + if 'mapped' not in viz_out_gdf.columns: + viz_out_gdf['mapped'] = 'no' + else: + viz_out_gdf['mapped'] = viz_out_gdf['mapped'].fillna('no') # Read all messages for all HUCs # this is basically identical to a stage based set. Seach for huc_message_list and see my notes @@ -495,9 +595,6 @@ def generate_flows( .rename(columns={0: 'nws_lid', 1: 'status'}) ) - # There could be duplicate message for one ahps (ie. missing nwm segments), so drop dups - messages_df.drop_duplicates(subset=["nws_lid", "status"], keep="first", inplace=True) - # We want one viz_out_gdf record per ahps and if there are more than one, contact the messages # status_df = messages_df.groupby(['nws_lid'])['status'].apply(', '.join).reset_index() @@ -511,20 +608,29 @@ def generate_flows( # status for null fields. viz_out_gdf = viz_out_gdf.merge(status_df, how='left', on='nws_lid') - viz_out_gdf['status'] = viz_out_gdf['status'].fillna('all calculated flows available') + viz_out_gdf['status'] = viz_out_gdf['status'].fillna('Good') + # viz_out_gdf['status'] = viz_out_gdf['status'].apply(lambda x: x[3:] if x.startswith("---") else x) + + # There could be duplicate message for one ahps (ie. 
missing nwm segments), so drop dups + messages_df.drop_duplicates(subset=["nws_lid", "status"], keep="first", inplace=True) # Filter out columns and write out to file # viz_out_gdf = viz_out_gdf.filter( # ['nws_lid', 'usgs_gage', 'nwm_seg', 'HUC8', 'mapped', 'status', 'geometry'] # ) + # rename the column from nws_lid to ahps_lid + viz_out_gdf.rename(columns={'nws_lid': 'ahps_lid'}, inplace=True) + # stage based doesn't get here # crs is 3857 - web mercator at this point + + # The csv will be updated later if something fails during inundation nws_lid_csv_file_path = os.path.join(mapping_dir, 'flow_based_catfim_sites.csv') viz_out_gdf.to_csv(nws_lid_csv_file_path) - nws_lid_gpkg_file_path = os.path.join(mapping_dir, 'flow_based_catfim_sites.gpkg') - viz_out_gdf.to_file(nws_lid_gpkg_file_path, driver='GPKG', index=False, engine='fiona') + catfim_sites_gpkg_file_path = os.path.join(mapping_dir, 'flow_based_catfim_sites.gpkg') + viz_out_gdf.to_file(catfim_sites_gpkg_file_path, driver='GPKG', crs=VIZ_PROJECTION, engine='fiona') # time operation all_end = datetime.now(timezone.utc) @@ -532,13 +638,12 @@ def generate_flows( FLOG.lprint(f"End Wrapping up flows generation Duration: {str(all_time_duration).split('.')[0]}") print() - return nws_lid_gpkg_file_path - # local script calls __load_nwm_metadata so FLOG is already setup def __load_nwm_metadata( output_catfim_dir, metadata_url, nwm_us_search, nwm_ds_search, lid_to_run, nwm_metafile ): + FLOG.trace(metadata_url) all_meta_lists = [] @@ -546,7 +651,7 @@ def __load_nwm_metadata( # This feature means we can copy the pickle file to another enviro (AWS?) as it won't need to call # WRDS unless we need a smaller or modified version. This one likely has all nws_lid data. - if os.path.isfile(nwm_metafile) is True: + if os.path.isfile(nwm_metafile) == True: FLOG.lprint(f"Meta file already downloaded and exists at {nwm_metafile}") with open(nwm_metafile, "rb") as p_handle: @@ -557,9 +662,11 @@ def __load_nwm_metadata( FLOG.lprint(f"Meta file will be downloaded and saved at {meta_file}") - # lid_to_run coudl be a single lid or the word "all" - if lid_to_run != "all": + # single lid for now + + # must_include_value variable not yet tested + # must_include_value = 'nws_data.rfc_forecast_point' if lid_to_run not in ['HI', 'PR', 'AK'] else None all_meta_lists, ___ = get_metadata( metadata_url, select_by='nws_lid', @@ -569,7 +676,19 @@ def __load_nwm_metadata( downstream_trace_distance=nwm_ds_search, ) else: - conus_list, ___ = get_metadata( + # This gets all records including AK, HI and PR, but only if they ahve forecast points + + # Note: Nov 2024: AK has 152 sites with forecast points, but after research + # non of the AK sites that nws_data.rfc_forecast_point = False so we can + # exclude them. + # Previously we allowed HI and PR sites to come in and most were failing. 
+ # So, we will include HI and PR as well here + + # We can not just filter out based on dup lids as depending on which + # metadata load they are on, dup lid records will have different data + + # orig_meta_lists, ___ = get_metadata( + all_meta_lists, ___ = get_metadata( metadata_url, select_by='nws_lid', selector=['all'], @@ -577,17 +696,37 @@ def __load_nwm_metadata( upstream_trace_distance=nwm_us_search, downstream_trace_distance=nwm_ds_search, ) - # Get metadata for Islands and Alaska - islands_list, ___ = get_metadata( - metadata_url, - select_by='state', - selector=['HI', 'PR', 'AK'], - must_include=None, - upstream_trace_distance=nwm_us_search, - downstream_trace_distance=nwm_ds_search, - ) - # Append the lists - all_meta_lists = conus_list + islands_list + + # If we decided to put HI and PR back in we can do two loads one + # with the flag and one without. Then iterate through the meta_lists + # results and filtered out based on the state full value. Then + # call WRDS again with those specific states and simply concat them. + + # filtered_meta_data = [] + # for metadata in orig_meta_lists: + # df = pd.json_normalize(metadata) + # state = df['nws_data.state'].item() + # lid = df['identifiers.nws_lid'].item() + # if state.lower() not in ["alaska", "hawaii", "puerto rico"]: + # filtered_meta_data.append(metadata) + + # must_include='nws_data.rfc_forecast_point', + + # Nov 2024: We used to call them site specific and may add them back in but ok to leave then + + # islands_list, ___ = get_metadata( + # metadata_url, + # select_by='state', + # selector=['HI', 'PR'], + # must_include=None, + # upstream_trace_distance=nwm_us_search, + # downstream_trace_distance=nwm_ds_search, + # ) + + # Append the lists + # all_meta_lists = filtered_all_meta_list + islands_list + + # print(f"len(all_meta_lists) is {len(all_meta_lists)}") with open(meta_file, "wb") as p_handle: pickle.dump(all_meta_lists, p_handle, protocol=pickle.HIGHEST_PROTOCOL) diff --git a/tools/generate_categorical_fim_mapping.py b/tools/catfim/generate_categorical_fim_mapping.py similarity index 69% rename from tools/generate_categorical_fim_mapping.py rename to tools/catfim/generate_categorical_fim_mapping.py index a09a5c65b..f6648f360 100755 --- a/tools/generate_categorical_fim_mapping.py +++ b/tools/catfim/generate_categorical_fim_mapping.py @@ -1,8 +1,8 @@ #!/usr/bin/env python3 -import argparse -# import glob +import argparse +import glob import os # import shutil @@ -41,8 +41,9 @@ # Technically, this is once called as a non MP, but also called in an MP pool # we will use an MP object either way -def produce_stage_based_catfim_tifs( - stage, +def produce_stage_based_lid_tifs( + stage_val, + is_interval_stage, datum_adj_ft, branch_dir, lid_usgs_elev, @@ -53,21 +54,21 @@ def produce_stage_based_catfim_tifs( huc, lid_directory, category, + category_key, number_of_jobs, - parent_log_output_file, + mp_parent_log_file, child_log_file_prefix, ): - MP_LOG.MP_Log_setup(parent_log_output_file, child_log_file_prefix) + MP_LOG.MP_Log_setup(mp_parent_log_file, child_log_file_prefix) messages = [] - MP_LOG.lprint("-----------------") - huc_lid_cat_id = f"{huc} : {lid} : {category}" - MP_LOG.lprint(f"{huc_lid_cat_id}: Starting to create tifs") + huc_lid_cat_id = f"{huc} : {lid} : {category_key}" + MP_LOG.trace(f"{huc_lid_cat_id}: Starting to create tifs") # Determine datum-offset water surface elevation (from above). 
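A small worked example of the water-surface-elevation arithmetic below (all numbers hypothetical):

# stage_val = 24.0 ft, datum_adj_ft = 1.5 ft, lid_altitude = 1049.1 ft
# datum_adj_wse   = 24.0 + 1.5 + 1049.1 = 1074.6 ft
# datum_adj_wse_m = 1074.6 * 0.3048     = approx 327.5 m
# hand_stage is then the HAND WSE minus the HAND gage elevation, per the comment just below.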
- datum_adj_wse = stage + datum_adj_ft + lid_altitude + datum_adj_wse = stage_val + datum_adj_ft + lid_altitude datum_adj_wse_m = datum_adj_wse * 0.3048 # Convert ft to m # Subtract HAND gage elevation from HAND WSE to get HAND stage. @@ -77,26 +78,27 @@ def produce_stage_based_catfim_tifs( # If no segments, write message and exit out if not segments or len(segments) == 0: - msg = ': missing nwm segments' + msg = ':missing nwm segments' messages.append(lid + msg) MP_LOG.warning(huc_lid_cat_id + msg) return messages, hand_stage, datum_adj_wse, datum_adj_wse_m # Produce extent tif hand_stage. Multiprocess across branches. # branches = os.listdir(branch_dir) - MP_LOG.lprint(f"{huc_lid_cat_id} branch_dir is {branch_dir}") + # MP_LOG.trace(f"{huc_lid_cat_id} branch_dir is {branch_dir}") branches = [x for x in os.listdir(branch_dir) if os.path.isdir(os.path.join(branch_dir, x))] branches.sort() + # MP_LOG.trace(f"{huc_lid_cat_id} branches are {branches}") - # We need to merge what we have up to this point. - # MP_LOG.merge_log_files(parent_log_output_file, child_log_file_prefix) - # child_log_file_prefix = MP_LOG.MP_calc_prefix_name(parent_log_output_file, "MP_prod_huc_mag_stage", huc) - child_log_file_prefix = MP_LOG.MP_calc_prefix_name(parent_log_output_file, "MP_prod_huc_mag_stage") + # This is an MP in an MP. We want this set of mp's to roll up to the + # parent MP file, and not the full catfim parent log. We roll this child MP into + # it's parent mp and later that parent MP will rollup to the catfim file. + child_log_file_prefix = MP_LOG.MP_calc_prefix_name(MP_LOG.LOG_FILE_PATH, "MP_branch") with ProcessPoolExecutor(max_workers=number_of_jobs) as executor: for branch in branches: - msg_id_w_branch = f"{huc} - {branch} - {lid} - {category}" - MP_LOG.trace(f"{msg_id_w_branch} : inundating branches") + msg_id_w_branch = f"{huc_lid_cat_id}: {branch}" + # MP_LOG.trace(f"{msg_id_w_branch} : inundating branch") # Define paths to necessary files to produce inundation grids. full_branch_path = os.path.join(branch_dir, branch) rem_path = os.path.join(fim_dir, huc, full_branch_path, 'rem_zeroed_masked_' + branch + '.tif') @@ -108,19 +110,20 @@ def produce_stage_based_catfim_tifs( ) hydrotable_path = os.path.join(fim_dir, huc, full_branch_path, 'hydroTable_' + branch + '.csv') + # sometimes, these can fail to exist if a branchf initial failed during HAND generation if not os.path.exists(rem_path): - msg = ": rem doesn't exist" - messages.append(lid + msg) + msg = ":rem doesn't exist (could be bad branch)" + # messages.append(lid + msg) MP_LOG.warning(msg_id_w_branch + msg) continue if not os.path.exists(catchments_path): - msg = ": catchments files don't exist" - messages.append(lid + msg) + msg = ":catchments files don't exist (could be bad branch)" + # messages.append(lid + msg) MP_LOG.warning(msg_id_w_branch + msg) continue if not os.path.exists(hydrotable_path): - msg = ": hydrotable doesn't exist" - messages.append(lid + msg) + msg = ":hydrotable doesn't exist (could be bad branch)" + # messages.append(lid + msg) MP_LOG.warning(msg_id_w_branch + msg) continue @@ -138,86 +141,87 @@ def produce_stage_based_catfim_tifs( hydroid_list += list(subset_hydrotable_df.HydroID.unique()) except IndexError: MP_LOG.trace( - f"Index Error for {huc} -- {branch} -- {category}. FeatureId is {feature_id} : Continuing on." + f"Index Error for {msg_id_w_branch}. FeatureId is {feature_id} : Continuing on." 
) pass # Create inundation maps with branch and stage data + # only sites /categories that got this far are valid and can be inundated try: - # print("Generating stage-based FIM for " + huc + " and branch " + branch) - # - # MP_LOG.lprint(f"{huc_lid_cat_id} : Generating stage-based FIM") + # MP_LOG.trace(f"{huc_lid_cat_id} : branch = {branch} : Generating stage-based FIM") executor.submit( - produce_tif_per_huc_per_mag_for_stage, + produce_inundated_branch_tif, rem_path, catchments_path, hydroid_list, hand_stage, lid_directory, - category, + category_key, huc, lid, branch, - parent_log_output_file, + MP_LOG.LOG_FILE_PATH, child_log_file_prefix, ) except Exception: - msg = f': inundation failed at {category}' + msg = f':inundation failed at {category}' messages.append(lid + msg) MP_LOG.warning(msg_id_w_branch + msg) MP_LOG.error(traceback.format_exc()) - MP_LOG.merge_log_files(parent_log_output_file, child_log_file_prefix, True) + # Nov 2024: Fix this. The logs get out of order as it is can be a MP in MP + # hold on this + # MP_LOG.merge_log_files(mp_parent_log_file, child_log_file_prefix, True) # -- MOSAIC -- # # Merge all rasters in lid_directory that have the same magnitude/category. path_list = [] - MP_LOG.trace(f"Merging files from {lid_directory}") - lid_dir_list = os.listdir(lid_directory) + # we are looking for the branch files for the category/stage + # or any given stage interval - MP_LOG.lprint(f"{huc}: Merging {category}") - # MP_LOG.trace("lid_dir_list is ") - # MP_LOG.trace(lid_dir_list) - # MP_LOG.lprint("") + lid_dir_list = [x for x in os.listdir(lid_directory) if category_key in x] + lid_dir_list.sort() # To force branch 0 first in list, sort - for f in lid_dir_list: - if category in f: - path_list.append(os.path.join(lid_directory, f)) - - # MP_LOG.error("???") - # MP_LOG.trace(f"path_list is (pre sort) is {path_list}") - # path_list.sort() # To force branch 0 first in list, sort it isn't branchs and we don't care the order for mosaiking - # MP_LOG.trace(f"path_list is (post sort) is {path_list}") + # MP_LOG.lprint(f"{huc}: {lid} : Merging branch files {huc_lid_cat_id}") + # MP_LOG.trace(f"{huc_lid_cat_id} : lid_dir_list is... 
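# Sketch of the multiprocessing/logging pattern used throughout this module: the
# parent derives a child log-file prefix, each worker process attaches its own
# FIM_logger to the parent log via that prefix, and the parent rolls the child log
# files back up when the pool finishes. The names do_branch_work, run_branches, and
# the "MP_example" prefix are illustrative only.
from concurrent.futures import ProcessPoolExecutor

import utils.fim_logger as fl


FLOG = fl.FIM_logger()
MP_LOG = fl.FIM_logger()


def do_branch_work(branch, parent_log_output_file, child_log_file_prefix):
    # Each child process points its logger at the parent log via the shared prefix
    MP_LOG.MP_Log_setup(parent_log_output_file, child_log_file_prefix)
    MP_LOG.lprint(f"processing branch {branch}")


def run_branches(branches, parent_log_output_file, number_of_jobs):
    child_log_file_prefix = FLOG.MP_calc_prefix_name(parent_log_output_file, "MP_example")
    with ProcessPoolExecutor(max_workers=number_of_jobs) as executor:
        for branch in branches:
            executor.submit(do_branch_work, branch, parent_log_output_file, child_log_file_prefix)
    # Roll the child MP logs back up into the parent log file
    FLOG.merge_log_files(parent_log_output_file, child_log_file_prefix)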
{lid_dir_list}") + # MP_LOG.trace("") - path_list.sort() # To force branch 0 first in list, sort - - # MP_LOG.trace(f"len of path_list is {len(path_list)}") + for f in lid_dir_list: + path_list.append(os.path.join(lid_directory, f)) - if len(path_list) > 0: + # Merging all of the branch tifs into one lid_category tif + if len(lid_dir_list) > 0: zero_branch_grid = path_list[0] zero_branch_src = rasterio.open(zero_branch_grid) zero_branch_array = zero_branch_src.read(1) + # zero_branch_array.nodata = 0 summed_array = zero_branch_array # Initialize it as the branch zero array + output_tif = os.path.join(lid_directory, lid + '_' + category_key + '_extent.tif') + MP_LOG.trace(f"{huc_lid_cat_id}: Merging all branches into output file to be saved as {output_tif}") + # Loop through remaining items in list and sum them with summed_array for remaining_raster in path_list[1:]: + remaining_raster_src = rasterio.open(remaining_raster) remaining_raster_array_original = remaining_raster_src.read(1) + # TODO: Nov 2024: We should need to reproject at all (Research if this works wihtout it) # Reproject non-branch-zero grids so I can sum them with the branch zero grid remaining_raster_array = np.empty(zero_branch_array.shape, dtype=np.int8) reproject( remaining_raster_array_original, destination=remaining_raster_array, src_transform=remaining_raster_src.transform, - src_crs=remaining_raster_src.crs, # TODO: Accomodate AK projection? + src_crs=remaining_raster_src.crs, src_nodata=remaining_raster_src.nodata, dst_transform=zero_branch_src.transform, dst_crs=zero_branch_src.crs, # TODO: Accomodate AK projection? - dst_nodata=-1, + # dst_nodata=-1, + dst_nodata=0, dst_resolution=zero_branch_src.res, resampling=Resampling.nearest, ) @@ -227,26 +231,43 @@ def produce_stage_based_catfim_tifs( del zero_branch_array # Clean up # Define path to merged file, in same format as expected by post_process_cat_fim_for_viz function - output_tif = os.path.join(lid_directory, lid + '_' + category + '_extent.tif') profile = zero_branch_src.profile summed_array = summed_array.astype('uint8') with rasterio.open(output_tif, 'w', **profile) as dst: dst.write(summed_array, 1) - MP_LOG.lprint(f"output_tif is {output_tif}") - del summed_array + MP_LOG.lprint(f"{huc_lid_cat_id}: branch rollup extent file saved at {output_tif}") + # del summed_array + + # For space reasons, we need to delete all of the intermediary files such as: + # Stage: grmn3_action_extent_0.tif, grmn3_action_extent_1933000003.tif. The give aways are a number before + # the .tif + # Flows: allm1_action_12p0ft_extent_01010002_0.tif, allm1_action_12p0ft_extent_01010002_7170000001.tif + # your give away is to just delete any file that has the HUC number in teh file name + # The intermediatary are all inundated branch tifs. 
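# Sketch of the branch roll-up performed above, under the same assumption as the code:
# every branch extent can be reprojected onto the branch-zero grid, summed, and written
# out as a single uint8 extent tif. mosaic_branch_extents is an illustrative helper
# name only.
import numpy as np
import rasterio
from rasterio.warp import Resampling, reproject


def mosaic_branch_extents(branch_tif_paths, output_tif):
    with rasterio.open(branch_tif_paths[0]) as base:
        summed_array = base.read(1)
        base_profile = base.profile
        base_transform, base_crs, base_res = base.transform, base.crs, base.res

    for tif_path in branch_tif_paths[1:]:
        with rasterio.open(tif_path) as src:
            # Align each remaining branch raster to the branch-zero grid before summing
            aligned = np.empty(summed_array.shape, dtype=np.int8)
            reproject(
                src.read(1),
                destination=aligned,
                src_transform=src.transform,
                src_crs=src.crs,
                src_nodata=src.nodata,
                dst_transform=base_transform,
                dst_crs=base_crs,
                dst_nodata=0,
                dst_resolution=base_res,
                resampling=Resampling.nearest,
            )
        summed_array = summed_array + aligned

    base_profile.update(dtype=rasterio.uint8)
    with rasterio.open(output_tif, 'w', **base_profile) as dst:
        dst.write(summed_array.astype('uint8'), 1)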
+ + # The ones we want to keep end at _extent.tif and remove ones that have _extent_*.tif + MP_LOG.lprint(f"{huc_lid_cat_id}: Removing interium inundated branch files") + branch_tifs = glob.glob(f"{lid_directory}/{lid}_{category_key}_extent_*.tif") + for tif_file in branch_tifs: + os.remove(tif_file) + + # else: + # MP_LOG.warning(f"{huc}: {lid}: Merging {category_key} : no valid inundated branches") return messages, hand_stage, datum_adj_wse, datum_adj_wse_m # This is part of an MP call and needs MP_LOG -# This does not actually inundate, it just uses the stage and the catchment to create a tif -def produce_tif_per_huc_per_mag_for_stage( +# This is a form of inundation which we are doing ourselves +# as we only have one flow value and our normal inundation tools +# are looking for files not single values +def produce_inundated_branch_tif( rem_path, catchments_path, hydroid_list, hand_stage, lid_directory, - category, + category_key, huc, lid, branch, @@ -255,17 +276,26 @@ def produce_tif_per_huc_per_mag_for_stage( ): """ # Open rem_path and catchment_path using rasterio. + + Note: category can have different formats, depending if it is an interval or not or int or float + If it has a stage number it, it is an interval value + ie) action, action_24ft, action_24.6, or action_24.6ft """ try: # This is setting up logging for this function to go up to the parent MP_LOG.MP_Log_setup(parent_log_output_file, child_log_file_prefix) + file_name = lid + '_' + category_key + '_extent_' + huc + '_' + branch + output_tif = os.path.join(lid_directory, file_name + '.tif') + # MP_LOG.lprint("+++++++++++++++++++++++") - # MP_LOG.lprint(f"At the start of producing a tif for {huc}") + # MP_LOG.lprint(f"... At the start of producing a tif for {file_name}") # MP_LOG.trace(locals()) + # MP_LOG.lprint(f"output_tif is {output_tif} (if it is valid)") # MP_LOG.trace("+++++++++++++++++++++++") + # both of these have a nodata value of 0 (well.. not by the image but by cell values) rem_src = rasterio.open(rem_path) catchments_src = rasterio.open(catchments_path) rem_array = rem_src.read(1) @@ -278,38 +308,55 @@ def produce_tif_per_huc_per_mag_for_stage( # Use numpy.where operation to reclassify rem_path on the condition that the pixel values # are <= to hand_stage and the catchments value is in the hydroid_list. + reclass_rem_array = np.where((rem_array <= hand_stage) & (rem_array != rem_src.nodata), 1, 0).astype( 'uint8' ) + hydroid_mask = np.isin(catchments_array, hydroid_list) + target_catchments_array = np.where( ((hydroid_mask == True) & (catchments_array != catchments_src.nodata)), 1, 0 ).astype('uint8') + masked_reclass_rem_array = np.where( - ((reclass_rem_array == 1) & (target_catchments_array == 1)), 1, 0 + ((reclass_rem_array >= 1) & (target_catchments_array >= 1)), 1, 0 ).astype('uint8') + # change it all to either 1 or 0 (one being inundated) + # masked_reclass_rem_array[np.where(masked_reclass_rem_array <= 0)] = 0 + # masked_reclass_rem_array[np.where(masked_reclass_rem_array > 0)] = 1 + # Save resulting array to new tif with appropriate name. ie) brdc1_record_extent_18060005.tif # to our mapping/huc/lid site - is_all_zero = np.all((masked_reclass_rem_array == 0)) + # No cells were inundated which is common. 
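# Sketch of the reclassification used in produce_inundated_branch_tif below: a cell is
# treated as inundated when its REM value is at or below the HAND stage and its
# catchment pixel belongs to one of the site's HydroIDs, ignoring nodata in both
# grids. The helper name is illustrative.
import numpy as np


def reclassify_inundation(rem_array, rem_nodata, catchments_array, catchments_nodata,
                          hand_stage, hydroid_list):
    wet = (rem_array <= hand_stage) & (rem_array != rem_nodata)
    in_site_catchments = np.isin(catchments_array, hydroid_list) & (
        catchments_array != catchments_nodata
    )
    return np.where(wet & in_site_catchments, 1, 0).astype('uint8')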
Lots of branches don't inundate as they are out of the extent area + is_all_zero = np.all(masked_reclass_rem_array == 0) # MP_LOG.lprint(f"{huc}: masked_reclass_rem_array, is_all_zero is {is_all_zero} for {rem_path}") # if not is_all_zero: - # if is_all_zero is False: # this logic didn't let ANY files get saved + # if is_all_zero == False: # this logic didn't let ANY files get saved # 'is False' means that the object does not exist and not that it really equals the value of False - if is_all_zero == False: # corrected logic - output_tif = os.path.join( - lid_directory, lid + '_' + category + '_extent_' + huc + '_' + branch + '.tif' - ) - # MP_LOG.lprint(f" +++ Output_Tif is {output_tif}") + if is_all_zero == False: + # output_tif = os.path.join( + # lid_directory, lid + '_' + category_key + '_extent_' + huc + '_' + branch + '.tif' + # ) + # # # File may or may not exist + # # if os.path.exists(output_tif): + MP_LOG.lprint(f" +++ Branch output_tif is {output_tif}") with rasterio.Env(): profile = rem_src.profile profile.update(dtype=rasterio.uint8) - profile.update(nodata=10) + profile.update(nodata=0) + + # Replace any existing nodata values with the new one + # masked_reclass_rem_array[masked_reclass_rem_array == profile["nodata"]] = 0 with rasterio.open(output_tif, 'w', **profile) as dst: + # dst.nodata = 0 dst.write(masked_reclass_rem_array, 1) + # else: + # MP_LOG.trace(f"{file_name} : inundation was all zero cells") except Exception: MP_LOG.error(f"{huc} : {lid} Error producing inundation maps with stage") @@ -328,6 +375,8 @@ def run_catfim_inundation( print() FLOG.lprint(">>> Start Inundating and Mosaicking") + # The output_flows_dir will only have HUCs that were valid which means + # it will not include necessarily all hucs from the output run. source_flow_huc_dir_list = [ x for x in os.listdir(output_flows_dir) @@ -339,61 +388,74 @@ def run_catfim_inundation( if os.path.isdir(os.path.join(fim_run_dir, x)) and x[0] in ['0', '1', '2'] ] # Log missing hucs + # Depending on filtering, errors, and the valid_ahps_huc list at the start of the program + # this list could have only a few matches missing_hucs = list(set(source_flow_huc_dir_list) - set(fim_source_huc_dir_list)) + missing_hucs = [huc for huc in missing_hucs if "." not in huc] # Loop through matching huc directories in the source_flow directory matching_hucs = list(set(fim_source_huc_dir_list) & set(source_flow_huc_dir_list)) matching_hucs.sort() + FLOG.trace(f"matching_hucs now are {matching_hucs}") + child_log_file_prefix = FLOG.MP_calc_prefix_name(log_output_file, "MP_run_ind") with ProcessPoolExecutor(max_workers=job_number_huc) as executor: - for huc in matching_hucs: - if "." 
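# Sketch of the HUC directory matching set up in run_catfim_inundation: only HUC
# folders present in both the flows output and the FIM run directory are inundated,
# and the remainder are reported as missing. The helper name and return shape are
# illustrative only.
import os


def match_huc_dirs(output_flows_dir, fim_run_dir):
    flow_hucs = {
        x
        for x in os.listdir(output_flows_dir)
        if os.path.isdir(os.path.join(output_flows_dir, x)) and x[0] in ('0', '1', '2')
    }
    fim_hucs = {
        x
        for x in os.listdir(fim_run_dir)
        if os.path.isdir(os.path.join(fim_run_dir, x)) and x[0] in ('0', '1', '2')
    }
    missing_hucs = sorted(h for h in flow_hucs - fim_hucs if "." not in h)
    matching_hucs = sorted(flow_hucs & fim_hucs)
    return matching_hucs, missing_hucs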
in huc: - continue - - # Get list of AHPS site directories - huc_flows_dir = os.path.join(output_flows_dir, huc) - - # ahps_site_dir_list = os.listdir(ahps_site_dir) - ahps_site_dir_list = [ - x for x in os.listdir(huc_flows_dir) if os.path.isdir(os.path.join(huc_flows_dir, x)) - ] - - # Map path to huc directory inside the mapping directory - huc_mapping_dir = os.path.join(output_mapping_dir, huc) - if not os.path.exists(huc_mapping_dir): - os.mkdir(huc_mapping_dir) - - # Loop through AHPS sites - for ahps_site in ahps_site_dir_list: - # map parent directory for AHPS source data dir and list AHPS thresholds (act, min, mod, maj) - ahps_site_parent = os.path.join(huc_flows_dir, ahps_site) - - # thresholds_dir_list = os.listdir(ahps_site_parent) - thresholds_dir_list = [ - x - for x in os.listdir(ahps_site_parent) - if os.path.isdir(os.path.join(ahps_site_parent, x)) - ] - - # Map parent directory for all inundation output files output files. - huc_site_mapping_dir = os.path.join(huc_mapping_dir, ahps_site) - if not os.path.exists(huc_site_mapping_dir): - os.mkdir(huc_site_mapping_dir) - - # Loop through thresholds/magnitudes and define inundation output files paths - for magnitude in thresholds_dir_list: - if "." in magnitude: - continue - magnitude_flows_csv = os.path.join( - ahps_site_parent, - magnitude, - 'ahps_' + ahps_site + '_huc_' + huc + '_flows_' + magnitude + '.csv', - ) - # print(f"magnitude_flows_csv is {magnitude_flows_csv}") - if os.path.exists(magnitude_flows_csv): - tif_name = ahps_site + '_' + magnitude + '_extent.tif' + try: + for huc in matching_hucs: + + # Get list of AHPS site directories + huc_flows_dir = os.path.join(output_flows_dir, huc) + + # Map path to huc directory inside the mapping directory + huc_mapping_dir = os.path.join(output_mapping_dir, huc) + if not os.path.exists(huc_mapping_dir): + os.makedirs(huc_mapping_dir, exist_ok=True) + + # ahps_site_dir_list = os.listdir(ahps_site_dir) + ahps_site_dir_list = [ + x for x in os.listdir(huc_flows_dir) if os.path.isdir(os.path.join(huc_flows_dir, x)) + ] # ahps folder names under the huc folder + + FLOG.trace(f"{huc} : ahps_site_dir_list is {ahps_site_dir_list}") + + # Loop through AHPS sites + for ahps_id in ahps_site_dir_list: + # map parent directory for AHPS source data dir and list AHPS thresholds (act, min, mod, maj) + ahps_site_parent = os.path.join(huc_flows_dir, ahps_id) + + # thresholds_dir_list = os.listdir(ahps_site_parent) + thresholds_dir_list = [ + x + for x in os.listdir(ahps_site_parent) + if os.path.isdir(os.path.join(ahps_site_parent, x)) + ] + + # but we can just extract it from the csv files names which are + # patterned as: 04130003/chrn6/moderate/chrn6_huc_04130003_flows_moderate.csv + + # Map parent directory for all inundation output files output files. + huc_site_mapping_dir = os.path.join(huc_mapping_dir, ahps_id) + if not os.path.exists(huc_site_mapping_dir): + os.makedirs(huc_site_mapping_dir, exist_ok=True) + + # Loop through thresholds/magnitudes and define inundation output files paths + + FLOG.trace(f"{huc}: {ahps_id} threshold dir list is {thresholds_dir_list}") + + for magnitude in thresholds_dir_list: + if "." 
in magnitude: + continue + + magnitude_flows_csv = os.path.join( + ahps_site_parent, + magnitude, + ahps_id + '_huc_' + huc + '_flows_' + magnitude + '.csv', + ) + # print(f"magnitude_flows_csv is {magnitude_flows_csv}") + tif_name = ahps_id + '_' + magnitude + '_extent.tif' output_extent_tif = os.path.join(huc_site_mapping_dir, tif_name) + FLOG.trace(f"Begin inundation for {tif_name}") try: executor.submit( @@ -402,7 +464,7 @@ def run_catfim_inundation( huc, huc_site_mapping_dir, output_extent_tif, - ahps_site, + ahps_id, magnitude, fim_run_dir, job_number_inundate, @@ -412,13 +474,19 @@ def run_catfim_inundation( except Exception: FLOG.critical( - "An critical error occured while attempting inundation" - f" for {huc} -- {ahps_site} -- {magnitude}" + "A critical error occured while attempting inundation" + f" for {huc} - {ahps_id}-- {magnitude}" ) FLOG.critical(traceback.format_exc()) FLOG.merge_log_files(log_output_file, child_log_file_prefix) sys.exit(1) + except Exception: + FLOG.critical("A critical error occured while attempting all hucs inundation") + FLOG.critical(traceback.format_exc()) + FLOG.merge_log_files(log_output_file, child_log_file_prefix) + sys.exit(1) + # end of ProcessPoolExecutor # rolls up logs from child MP processes into this parent_log_output_file @@ -433,6 +501,10 @@ def run_catfim_inundation( # This is part of an MP Pool +# It use used for flow-based +# It inundate each set based on the ahps/mangnitude list and for each segment in the +# the branch hydrotable +# Then each is inundated per branch and mosiaked for the ahps def run_inundation( magnitude_flows_csv, huc, @@ -503,18 +575,6 @@ def run_inundation( MP_LOG.error(f"FAILURE_huc_{huc} - {ahps_site} - {magnitude} map failed to create") return - # For space reasons, we need to delete all of the intermediary files such as: - # Stage: grmn3_action_extent_0.tif, grmn3_action_extent_1933000003.tif. The give aways are a number before - # the .tif - # Flows: allm1_action_12p0ft_extent_01010002_0.tif, allm1_action_12p0ft_extent_01010002_7170000001.tif - # your give away is to just delete any file that has the HUC number in teh file name - # The intermediatary are all inundated branch tifs. - - # The ones we want to keep stop at _extent.tif - # branch_tifs = glob.glob(os.path.join(output_huc_site_mapping_dir, '*_extent_*.tif')) - # for tif_file in branch_tifs: - # os.remove(tif_file) - return @@ -549,31 +609,19 @@ def post_process_huc( mapping_huc_lid_dir = os.path.join(huc_dir, ahps_lid) MP_LOG.trace(f"mapping_huc_lid_dir is {mapping_huc_lid_dir}") - # Append desired filenames to list. (notice.. no value after the word extent) - # tif_list = [x for x in os.listdir(mapping_huc_lid_dir) if x.endswith("extent.tif")] # doesn't match the old filenames - - # new logic actually finds the extent tifs - # It will find only the mag rollups and not its branches - - # if stage based, the file names looks like this: masm1_major_20p0ft_extent.tif - # but there also is masm1_major_extent.tif, so we want both - # if flow based, the file name looks like this: masm1_action_extent.tif - + # aka. 
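# Sketch of the naming convention used by the flow-based inundation loop above, e.g.
# 04130003/chrn6/moderate/chrn6_huc_04130003_flows_moderate.csv as the flows csv and
# chrn6_moderate_extent.tif as the mosaicked output. The helper is illustrative only.
import os


def build_flow_based_paths(huc_flows_dir, huc_site_mapping_dir, ahps_id, huc, magnitude):
    magnitude_flows_csv = os.path.join(
        huc_flows_dir, ahps_id, magnitude, f"{ahps_id}_huc_{huc}_flows_{magnitude}.csv"
    )
    output_extent_tif = os.path.join(huc_site_mapping_dir, f"{ahps_id}_{magnitude}_extent.tif")
    return magnitude_flows_csv, output_extent_tif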
ends with "extent.tif" which means it is a rolled up version up for there branches tif_list = [x for x in os.listdir(mapping_huc_lid_dir) if ('extent.tif') in x] if len(tif_list) == 0: - MP_LOG.warning(f">> no tifs found for {huc} {ahps_lid} at {mapping_huc_lid_dir}") + # This is perfectly fine for there to be none + # MP_LOG.warning(f">> no tifs found for {huc} {ahps_lid} at {mapping_huc_lid_dir}") continue for tif in tif_list: - if 'extent.tif' in tif: - # as we processing tifs for just one ahps at a time, we can further files that have that - # ahps_lid in it. - if ahps_lid in tif: - tifs_to_reformat_list.append(os.path.join(mapping_huc_lid_dir, tif)) + tifs_to_reformat_list.append(os.path.join(mapping_huc_lid_dir, tif)) if len(tifs_to_reformat_list) == 0: - MP_LOG.warning(f">> no tifs found for {huc} {ahps_lid} at {mapping_huc_lid_dir}") + # MP_LOG.warning(f">> no tifs found for {huc} {ahps_lid} at {mapping_huc_lid_dir}") continue # Stage-Based CatFIM uses attributes from individual CSVs instead of the master CSV. @@ -581,7 +629,7 @@ def post_process_huc( # There may not necessarily be an attributes.csv for this lid, depending on how flow processing went # lots of lids fall out in the attributes or flow steps. - if os.path.exists(nws_lid_attributes_filename) is False: + if os.path.exists(nws_lid_attributes_filename) == False: MP_LOG.warning(f"{ahps_lid} has no attributes file which may perfectly fine.") continue @@ -596,37 +644,38 @@ def post_process_huc( # for log_file in old_refomat_log_files: # os.remove(log_file) - # with ProcessPoolExecutor(max_workers=job_number_inundate) as executor: - # TODO: - # Aug 2024, Using MP (job number inundate has very little value, drop it) - # Clean up that ji MP - # with ProcessPoolExecutor(max_workers=job_number_inundate) as executor: + # we only have the rolled up, no branch versions by now for tif_to_process in tifs_to_reformat_list: # If not os.path.exists(tif_to_process): # continue - # If stage based, the file names looks like this: masm1_major_20p0ft_extent.tif - # but there also is masm1_major_extent.tif, so we want both + # If stage based, the file names looks like this: + # masm1_major_extent.tif (non-interval, whole number) + # masm1_major_20.6_extent.tif (non-interval, float) + # masm1_major_20.0ft_extent.tif (interval) # If flow based, the file name looks like this: masm1_action_extent.tif MP_LOG.trace(f".. Tif to Process = {tif_to_process}") try: tif_file_name = os.path.basename(tif_to_process) file_name_parts = tif_file_name.split("_") - magnitude = file_name_parts[1] + magnitude = file_name_parts[1] # part 0 is the lid + + # but if it doesn't have "fti" at the end it is not an interval - if "ft" in tif_file_name: # stage based, ie grnm1_action_11p0ft_extent.tif + # careful. ft can be part of the site name, so only check part 3 + interval_stage = None + if len(file_name_parts) >= 3 and "fti" in file_name_parts[2]: try: - interval_stage = float(file_name_parts[2].replace('p', '.').replace("ft", "")) + stage_val = file_name_parts[2].replace("fti", "") + interval_stage = float(stage_val) except ValueError: interval_stage = None MP_LOG.error( f"Value Error for {huc} - {ahps_lid} - magnitude {magnitude}" - " at {mapping_huc_lid_dir}" + f" at {mapping_huc_lid_dir}" ) MP_LOG.error(traceback.format_exc()) - else: # flow based. 
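# Sketch of the rolled-up tif filename parsing used below. Interval tifs are assumed
# here (for illustration) to carry the stage with an "fti" suffix in the third
# underscore-delimited part, e.g. masm1_action_12.0fti_extent.tif; non-interval and
# flow-based tifs have no such part and yield interval_stage = None.
def parse_extent_tif_name(tif_file_name):
    file_name_parts = tif_file_name.split("_")
    ahps_lid = file_name_parts[0]
    magnitude = file_name_parts[1]
    interval_stage = None
    if len(file_name_parts) >= 3 and "fti" in file_name_parts[2]:
        try:
            interval_stage = float(file_name_parts[2].replace("fti", ""))
        except ValueError:
            interval_stage = None
    return ahps_lid, magnitude, interval_stage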
ie) cfkt2_action_extent.tif - interval_stage = None reformat_inundation_maps( ahps_lid, @@ -649,6 +698,9 @@ def post_process_huc( # rolls up logs from child MP processes into this parent_log_output_file # MP_LOG.merge_log_files(parent_log_output_file, child_log_file_prefix, True) + # TODO: Roll up the independent related ahps gpkgs into a huc level gkpg, still in the gpkg dir + # all of the gkpgs we want will have the huc number in front of it + except Exception: MP_LOG.error(f"An error has occurred in post processing for {huc}") MP_LOG.error(traceback.format_exc()) @@ -668,26 +720,28 @@ def post_process_cat_fim_for_viz( FLOG.lprint("Start post processing TIFs (TIF extents into poly into gpkg)...") output_mapping_dir = os.path.join(output_catfim_dir, 'mapping') gpkg_dir = os.path.join(output_mapping_dir, 'gpkg') - os.mkdir(gpkg_dir) + os.makedirs(gpkg_dir, exist_ok=True) huc_ahps_dir_list = [ x for x in os.listdir(output_mapping_dir) - if os.path.isdir(os.path.join(output_mapping_dir, x)) and x[0] in ['0', '1', '2'] + if os.path.isdir(os.path.join(output_mapping_dir, x)) and x[0] in ['0', '1', '2', '9'] ] - skip_list = ['errors', 'logs', 'gpkg', 'missing_files.txt', 'messages'] + # if we don't have a huc_ahps_dir_list, something went catestrophically bad + if len(huc_ahps_dir_list) == 0: + raise Exception("Critical Error: Not possible to be here with no huc/ahps list") num_hucs = len(huc_ahps_dir_list) huc_index = 0 FLOG.lprint(f"Number of hucs to post process is {num_hucs}") + # TODO: Sep 2024: we need to remove the process pool here and put it post_process_huc (for tif inundation) + child_log_file_prefix = MP_LOG.MP_calc_prefix_name(log_output_file, "MP_post_process") with ProcessPoolExecutor(max_workers=job_huc_ahps) as huc_exector: for huc in huc_ahps_dir_list: FLOG.lprint(f"TIF post processing for {huc}") - if huc in skip_list: - continue huc_dir = os.path.join(output_mapping_dir, huc) progress_stmt = f"index {huc_index + 1} of {num_hucs}" @@ -728,28 +782,27 @@ def post_process_cat_fim_for_viz( gpkg_files = [x for x in os.listdir(gpkg_dir) if x.endswith('.gpkg')] FLOG.lprint(f"Merging {len(gpkg_files)} from layers in {gpkg_dir}") - # TODO: put a tqdm in here for visual only. 
- gpkg_files.sort() merged_layers_gdf = None ctr = 0 - for layer in tqdm( - gpkg_files, - total=len(gpkg_files), - desc="Merging gpkg layers", - bar_format="{desc}:({n_fmt}/{total_fmt})|{bar}| {percentage:.1f}% ", - ncols=80, - ): + num_gpkg_files = len(gpkg_files) + for gpkg_file in gpkg_files: # for ctr, layer in enumerate(gpkg_files): # FLOG.lprint(f"Merging gpkg ({ctr+1} of {len(gpkg_files)} - {}") - FLOG.trace(f"Merging gpkg ({ctr+1} of {len(gpkg_files)} : {layer}") + FLOG.trace(f"Merging gpkg ({ctr+1} of {num_gpkg_files} : {gpkg_file}") - # Concatenate each /gpkg/{aphs}_{magnitude}_extent_{huc}_dissolved.gkpg - diss_extent_filename = os.path.join(gpkg_dir, layer) + # Concatenate each /gpkg/{huc}_{aphs}_{magnitude}_extent.gpkg + diss_extent_filename = os.path.join(gpkg_dir, gpkg_file) diss_extent_gdf = gpd.read_file(diss_extent_filename, engine='fiona') - diss_extent_gdf['viz'] = 'yes' + + if 'interval_stage' in diss_extent_gdf.columns: + # Update the stage column value to be the interval value if an interval values exists + + diss_extent_gdf.loc[diss_extent_gdf["interval_stage"] > 0, "stage"] = diss_extent_gdf[ + "interval_stage" + ] if ctr == 0: merged_layers_gdf = diss_extent_gdf @@ -762,7 +815,7 @@ def post_process_cat_fim_for_viz( if merged_layers_gdf is None or len(merged_layers_gdf) == 0: raise Exception(f"No gpkgs found in {gpkg_dir}") - # TODO: July 9, 2024: Consider deleting all of the interium .gkpg files in the gkpg folder. + # TODO: July 9, 2024: Consider deleting all of the interium .gpkg files in the gpkg folder. # It will get very big quick. But not yet. # shutil.rmtree(gpkg_dir) @@ -774,14 +827,21 @@ def post_process_cat_fim_for_viz( FLOG.lprint("Dissolving flow based catfim_libary by ahps and magnitudes") merged_layers_gdf = merged_layers_gdf.dissolve(by=['ahps_lid', 'magnitude'], as_index=False) - merged_layers_gdf.reset_index(inplace=True) + if 'level_0' in merged_layers_gdf: + merged_layers_gdf = merged_layers_gdf.drop(['level_0'], axis=1) + + if 'status' in merged_layers_gdf: + merged_layers_gdf = merged_layers_gdf.drop(['status'], axis=1) + + if 'mapped' in merged_layers_gdf: + merged_layers_gdf = merged_layers_gdf.drop(['mapped'], axis=1) output_file_name = f"{catfim_method}_catfim_library" # TODO: Aug 2024: gpkg are not opening in qgis now? project, wkt, non defined geometry columns? - # gkpg_file_path = os.path.join(output_mapping_dir, f'{output_file_name}.gpkg') - # FLOG.lprint(f"Saving catfim library gpkg version to {gkpg_file_path}") - # merged_layers_gdf.to_file(gkpg_file_path, driver='GPKG', index=True, engine="fiona", crs=PREP_PROJECTION) + gpkg_file_path = os.path.join(output_mapping_dir, f'{output_file_name}.gpkg') + FLOG.lprint(f"Saving catfim library gpkg version to {gpkg_file_path}") + merged_layers_gdf.to_file(gpkg_file_path, driver='GPKG', engine="fiona") csv_file_path = os.path.join(output_mapping_dir, f'{output_file_name}.csv') FLOG.lprint(f"Saving catfim library csv version to {csv_file_path}") @@ -812,8 +872,7 @@ def reformat_inundation_maps( # interval stage might come in as null and that is ok # Note: child_log_file_prefix is "MP_reformat_tifs_{huc}", meaning all logs created by this - # function start with the phrase "MP_reformat_tifs_{huc}". 
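# Sketch of the library roll-up performed above: read each dissolved
# {huc}_{ahps}_{magnitude}_extent.gpkg, prefer the interval stage where one exists,
# concatenate, then dissolve by ahps_lid and magnitude. merge_catfim_library is an
# illustrative name; the real code also drops helper columns and writes both gpkg and
# csv outputs.
import os

import geopandas as gpd
import pandas as pd


def merge_catfim_library(gpkg_dir):
    merged_layers_gdf = None
    for gpkg_file in sorted(x for x in os.listdir(gpkg_dir) if x.endswith('.gpkg')):
        diss_extent_gdf = gpd.read_file(os.path.join(gpkg_dir, gpkg_file), engine='fiona')
        if 'interval_stage' in diss_extent_gdf.columns:
            # Use the interval value for the stage column when one exists
            diss_extent_gdf.loc[diss_extent_gdf["interval_stage"] > 0, "stage"] = diss_extent_gdf[
                "interval_stage"
            ]
        if merged_layers_gdf is None:
            merged_layers_gdf = diss_extent_gdf
        else:
            merged_layers_gdf = pd.concat([merged_layers_gdf, diss_extent_gdf], ignore_index=True)

    if merged_layers_gdf is None or len(merged_layers_gdf) == 0:
        raise Exception(f"No gpkgs found in {gpkg_dir}")

    return merged_layers_gdf.dissolve(by=['ahps_lid', 'magnitude'], as_index=False)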
This will rollup to the master - # catfim logs + # function start with the phrase will rollup to the master catfim logs # This is setting up logging for this function to go up to the parent MP_LOG.MP_Log_setup(parent_log_output_file, child_log_file_prefix) @@ -822,7 +881,7 @@ def reformat_inundation_maps( MP_LOG.trace( f"{huc} : {ahps_lid} : {magnitude} -- Start reformat_inundation_maps" " (tif extent to gpkg poly)" ) - MP_LOG.trace(F"Tif to process is {tif_to_process}") + MP_LOG.trace(F"tif to process is {tif_to_process}") # Convert raster to shapes with rasterio.open(tif_to_process) as src: @@ -837,7 +896,11 @@ def reformat_inundation_maps( # Convert list of shapes to polygon # lots of polys - extent_poly = gpd.GeoDataFrame.from_features(list(results), crs=PREP_PROJECTION) # Previous code + # extent_poly = gpd.GeoDataFrame.from_features(list(results), crs=PREP_PROJECTION) # Previous code + extent_poly = gpd.GeoDataFrame.from_features( + list(results), crs=src.crs + ) # Updating to fix AK proj issue, worked for CONUS and for AK! + # extent_poly = gpd.GeoDataFrame.from_features(list(results)) # Updated to accomodate AK projection # extent_poly = extent_poly.set_crs(src.crs) # Update to accomodate AK projection @@ -865,11 +928,12 @@ def reformat_inundation_maps( left_on=['ahps_lid', 'magnitude', 'huc'], right_on=['nws_lid', 'magnitude', 'huc'], ) + # already has an ahps_lid column which we want and not the nws_lid column extent_poly_diss = extent_poly_diss.drop(columns='nws_lid') # Save dissolved multipolygon handle = os.path.split(tif_to_process)[1].replace('.tif', '') - diss_extent_filename = os.path.join(gpkg_dir, f"{huc}_{handle}_dissolved.gpkg") + diss_extent_filename = os.path.join(gpkg_dir, f"{huc}_{handle}.gpkg") extent_poly_diss["geometry"] = [ MultiPolygon([feature]) if type(feature) is Polygon else feature for feature in extent_poly_diss["geometry"] @@ -877,7 +941,7 @@ def reformat_inundation_maps( if not extent_poly_diss.empty: extent_poly_diss.to_file( - diss_extent_filename, driver=getDriver(diss_extent_filename), index=False + diss_extent_filename, driver=getDriver(diss_extent_filename), index=False, engine='fiona' ) # MP_LOG.trace( # f"{huc} : {ahps_lid} : {magnitude} - Reformatted inundation map saved" @@ -914,7 +978,6 @@ def manage_catfim_mapping( catfim_method, job_number_huc, job_number_inundate, - depthtif, log_output_file, step_number=1, ): @@ -943,7 +1006,8 @@ def manage_catfim_mapping( # FLOG.lprint("Aggregating Categorical FIM") # Get fim_version. - fim_version = os.path.basename(os.path.normpath(fim_run_dir)).replace('fim_', '').replace('_', '.') + + fim_version = os.path.basename(os.path.normpath(fim_run_dir)) # Step 2 # TODO: Aug 2024, so we need to clean it up diff --git a/tools/catfim/images/screenshot_vis_settings.JPG b/tools/catfim/images/screenshot_vis_settings.JPG new file mode 100644 index 000000000..5b2e7e373 Binary files /dev/null and b/tools/catfim/images/screenshot_vis_settings.JPG differ diff --git a/tools/catfim/stage_based_ahps_restricted_sites.csv b/tools/catfim/stage_based_ahps_restricted_sites.csv new file mode 100644 index 000000000..db5cb88fa --- /dev/null +++ b/tools/catfim/stage_based_ahps_restricted_sites.csv @@ -0,0 +1,69 @@ +nws_lid,restricted_reason +# RULES: Ensure that the top line has no comments above the header line above: +# -- Careful not to add commas other then the one after the lid ID. +# -- Don't allow for any duplicate LID values in this list please. +# +# Comment lines can be added as any independent line in this csv. 
+# +AABDB,Test Site +ADLG1,Stage thresholds seem to be based on sea level and not channel thalweg +AUGG1,Stage thresholds seem to be based on sea level and not channel thalweg +BAXG1,Stage thresholds seem to be based on sea level and not channel thalweg +BRKTS,Test Site +CNNN6,Pool elevation thresholds. Disabled at request of MARFC +DMBT2,Test Site +DMSF1,Tidal Gauge +DVNIA,Test Site +GVDA1,bad data in API +HFMW4,gage relocated. check again later for review +HLZU1,bad data in API +HNXCA,Test Site +HRAG1,Stage thresholds seem to be based on sea level and not channel thalweg +HUSKR,Test Site +HZT00,Test Site +HZT02,Test Site +ILMNA,Test Site +JTEST,Test Site +JYNT2,Test Site +KLNQ9,Outside U.S. +LAMF1,Stage thresholds seem to be based on sea level and not channel thalweg. +MCVA3,historical gauge +NVRN6,Pool elevation thresholds. Disabled at request of MARFC. +PEPN6,Pool elevation thresholds. Disabled at request of MARFC. +PRXQ9,Outside U.S. +ONDN6,Inundation issues +QUTG1,Stage thresholds seem to be based on sea level and not channel thalweg +RCYF1,Out of Service +RWBW3, historical gauge +SMSV2,point not in AHPS +STNG1,Stage thresholds seem to be based on sea level and not channel thalweg +TESM8,Test Site +TEST,Test Site +TEST1,Test Site +TEST11,Test Site +TEST2,Test Site +TEST3,Test Site +TEST4,Test Site +TEST9,Test Site +TESTA,Test Site +TESTB,Test Site +TESTC,Test Site +TESTPT1,Test Site +TETM8,Test Site +TS2W3,Test Site +TS3W3,Test Site +TS4W3,Test Site +TSTMO,Test Site +TSTN6,Test Site +TSTSP,Test Site +TSTUP,Test Site +TTARX,Test Site +VEFNV,Test Site +WBYA1,Out of Service +WLSA1, bad data in API +WUNTS, not a real gauge +XXXN3,Test Site +YDAQ9,Outside U.S. +YLKA2,Discontinued Gage +YWRQ9,Outside U.S. +ZZZT2,Test Site diff --git a/tools/generate_categorical_fim.py b/tools/generate_categorical_fim.py deleted file mode 100755 index 5d2a3bcde..000000000 --- a/tools/generate_categorical_fim.py +++ /dev/null @@ -1,1332 +0,0 @@ -#!/usr/bin/env python3 - -import argparse -import csv -import glob -import os -import shutil -import sys -import time -import traceback -from concurrent.futures import ProcessPoolExecutor, as_completed, wait - -# from logging.handlers import QueueHandler, QueueListener -from datetime import datetime, timezone -from itertools import repeat -from pathlib import Path - -import geopandas as gpd -import numpy as np -import pandas as pd -import rasterio -from dotenv import load_dotenv -from generate_categorical_fim_flows import generate_flows -from generate_categorical_fim_mapping import ( - manage_catfim_mapping, - post_process_cat_fim_for_viz, - produce_stage_based_catfim_tifs, -) -from tools_shared_functions import ( - filter_nwm_segments_by_stream_order, - get_datum, - get_nwm_segs, - get_thresholds, - ngvd_to_navd_ft, -) -from tools_shared_variables import ( - acceptable_alt_acc_thresh, - acceptable_alt_meth_code_list, - acceptable_coord_acc_code_list, - acceptable_coord_method_code_list, - acceptable_site_type_list, -) - -import utils.fim_logger as fl -from utils.shared_variables import VIZ_PROJECTION - - -# global RLOG -FLOG = fl.FIM_logger() # the non mp version -MP_LOG = fl.FIM_logger() # the Multi Proc version - -gpd.options.io_engine = "pyogrio" - - -# TODO: Aug 2024: This script was upgraded significantly with lots of misc TODO's embedded. -# Lots of inline documenation needs updating as well - - -""" -Jun 17, 2024 -This system is continuing to mature over time. 
It has a number of optimizations that can still -be applied in areas such as logic, performance and error handling. - -In the interium there is still a consider amount of debug lines and tools embedded in that can -be commented on/off as required. - - -NOTE: For now.. all logs roll up to the parent log file. ie) catfim_2024_07_09-22-20-12.log -This creates a VERY large final log file, but the warnings and errors file should be manageable. -Later: Let's split this to seperate log files per huc. Easy to do that for Stage Based it has -"iterate_through_stage_based" function. Flow based? we have to think that one out a bit - -""" - - -def process_generate_categorical_fim( - fim_run_dir, - env_file, - job_number_huc, - job_number_inundate, - is_stage_based, - output_folder, - overwrite, - search, - lid_to_run, - job_number_intervals, - past_major_interval_cap, - nwm_metafile, -): - - # ================================ - # Validation and setup - - # Append option configuration (flow_based or stage_based) to output folder name. - if is_stage_based: - catfim_method = "stage_based" - else: - catfim_method = "flow_based" - - # Define output directories - if output_folder.endswith("/"): - output_folder = output_folder[:-1] - output_catfim_dir = output_folder + "_" + catfim_method - - output_flows_dir = os.path.join(output_catfim_dir, 'flows') - output_mapping_dir = os.path.join(output_catfim_dir, 'mapping') - attributes_dir = os.path.join(output_catfim_dir, 'attributes') - - # The override is not for the parent folder as we want to keep logs around with or without override - if os.path.exists(output_catfim_dir) is False: - os.mkdir(output_catfim_dir) - - # Create output directories (check against maping only as a proxy for all three) - if os.path.exists(output_mapping_dir) is True: - if overwrite is False: - raise Exception( - f"The output mapping folder of {output_catfim_dir} already exists." - " If you want to overwrite it, please add the -o flag. Note: When overwritten, " - " the three folders of mapping, flows and attributes wil be deleted and rebuilt" - ) - - gpkg_dir = os.path.join(output_mapping_dir, 'gpkg') - shutil.rmtree(gpkg_dir, ignore_errors=True) - - shutil.rmtree(output_mapping_dir, ignore_errors=True) - shutil.rmtree(output_flows_dir, ignore_errors=True) - shutil.rmtree(attributes_dir, ignore_errors=True) - - # Keeps the logs folder - - if nwm_metafile != "": - if os.path.exists(nwm_metafile) is False: - raise Exception("The nwm_metadata (-me) file can not be found. Please remove or fix pathing.") - file_ext = os.path.splitext(nwm_metafile) - if file_ext.count == 0: - raise Exception("The nwm_metadata (-me) file appears to be invalid. It is missing an extension.") - if file_ext[1].lower() != ".pkl": - raise Exception("The nwm_metadata (-me) file appears to be invalid. The extention is not pkl.") - - # Define default arguments. Modify these if necessary - fim_version = os.path.split(fim_run_dir)[1] - - # TODO: Aug 2024: Job values are not well used. There are some times where not - # all three job values are not being used. This needs to be cleaned up. - # Check job numbers and raise error if necessary - # Considering how we are using each CPU very well at all, we could experiment - # with either overclocking or chagnign to threading. 
Of course, if we change - # to threading we ahve to be super careful about file and thread collisions (locking) - - # commented out for now for some small overclocking tests (carefully of course) - # total_cpus_requested = job_number_huc * job_number_inundate * job_number_intervals - # total_cpus_available = os.cpu_count() - 2 - # if total_cpus_requested > total_cpus_available: - # raise ValueError( - # f"The HUC job number (jh) [{job_number_huc}]" - # f" multiplied by the inundate job number (jn) [{job_number_inundate}]" - # f" multiplied by the job number intervals (ji) [{job_number_intervals}]" - # " exceeds your machine\'s available CPU count minus one." - # " Please lower one or more of those values accordingly." - # ) - - # we are getting too many folders and files. We want just huc folders. - # output_flow_dir_list = os.listdir(fim_run_dir) - # looking for folders only starting with 0, 1, or 2 - # Code variation for dropping all Alaska HUCS: - valid_ahps_hucs = [ - x - for x in os.listdir(fim_run_dir) - if os.path.isdir(os.path.join(fim_run_dir, x)) and x[0] in ['0', '1', '2'] and x[:2] != "19" - ] - - # # Code variation for KEEPING Alaska HUCS: - # valid_ahps_hucs = [ - # x - # for x in os.listdir(fim_run_dir) - # if os.path.isdir(os.path.join(fim_run_dir, x)) and x[0] in ['0', '1', '2'] - # ] - - valid_ahps_hucs.sort() - - num_hucs = len(valid_ahps_hucs) - if num_hucs == 0: - raise ValueError( - f'Output directory {fim_run_dir} is empty. Verify that you have the correct input folder.' - ) - # End of Validation and setup - # ================================ - - log_dir = os.path.join(output_catfim_dir, "logs") - log_output_file = FLOG.calc_log_name_and_path(log_dir, "catfim") - FLOG.setup(log_output_file) - - overall_start_time = datetime.now(timezone.utc) - dt_string = overall_start_time.strftime("%m/%d/%Y %H:%M:%S") - - # os.makedirs(output_flows_dir, exist_ok=True) # Stage doesn't use it - os.makedirs(output_mapping_dir, exist_ok=True) - os.makedirs(attributes_dir, exist_ok=True) - - FLOG.lprint("================================") - FLOG.lprint(f"Start generate categorical fim for {catfim_method} - (UTC): {dt_string}") - FLOG.lprint("") - - FLOG.lprint( - f"Processing {num_hucs} huc(s) with Alaska temporarily removed" - ) # Code variation for DROPPING Alaska HUCs - # FLOG.lprint(f"Processing {num_hucs} huc(s)") # Code variation for KEEPING Alaska HUCs - - load_dotenv(env_file) - API_BASE_URL = os.getenv('API_BASE_URL') - if API_BASE_URL is None: - raise ValueError( - 'API base url not found. ' - 'Ensure inundation_mapping/tools/ has an .env file with the following info: ' - 'API_BASE_URL, EVALUATED_SITES_CSV, WBD_LAYER, NWM_FLOWS_MS, ' - 'USGS_METADATA_URL, USGS_DOWNLOAD_URL' - ) - - # TODO: Add check for if lid_to_run and lst_hucs parameters conflict - - # Check that fim_inputs.csv exists and raise error if necessary - fim_inputs_csv_path = os.path.join(fim_run_dir, 'fim_inputs.csv') - if not os.path.exists(fim_inputs_csv_path): - raise ValueError(f'{fim_inputs_csv_path} not found. Verify that you have the correct input files.') - - # print() - - # FLOG.lprint("Filtering out HUCs that do not have related ahps site in them.") - # valid_ahps_hucs = __filter_hucs_to_ahps(lst_hucs) - - # num_valid_hucs = len(valid_ahps_hucs) - # if num_valid_hucs == 0: - # raise Exception("None of the HUCs supplied have ahps sites in them. 
Check your fim output folder") - # else: - # FLOG.lprint(f"Processing {num_valid_hucs} huc(s) with AHPS sites") - - # Define upstream and downstream search in miles - nwm_us_search, nwm_ds_search = search, search - nws_lid_gpkg_file_path = "" - - # STAGE-BASED - if is_stage_based: - # Generate Stage-Based CatFIM mapping - # does flows and inundation (mapping) - nws_lid_gpkg_file_path = generate_stage_based_categorical_fim( - output_catfim_dir, - fim_run_dir, - nwm_us_search, - nwm_ds_search, - lid_to_run, - env_file, - job_number_inundate, - job_number_huc, - valid_ahps_hucs, - job_number_intervals, - past_major_interval_cap, - nwm_metafile, - ) - - # creates the gkpgs (tif's created above) - # TODO: Aug 2024, so we need to clean it up - # This step does not need a job_number_inundate as it can't really use use it. - # It processes primarily hucs and ahps in multiproc - # for now, we will manuall multiple the huc * 5 (max number of ahps types) - ahps_jobs = job_number_huc * 5 - post_process_cat_fim_for_viz( - catfim_method, output_catfim_dir, ahps_jobs, fim_version, log_output_file - ) - - # FLOW-BASED - else: - FLOG.lprint('Creating flow files using the ' + catfim_method + ' technique...') - start = time.time() - - # generate flows is only using one of the incoming job number params - # so let's multiply -jh (huc) and -jn (inundate) - job_flows = job_number_huc * job_number_inundate - nws_lid_gpkg_file_path = generate_flows( - output_catfim_dir, - nwm_us_search, - nwm_ds_search, - lid_to_run, - env_file, - job_flows, - is_stage_based, - valid_ahps_hucs, - nwm_metafile, - log_output_file, - ) - end = time.time() - elapsed_time = (end - start) / 60 - FLOG.lprint(f"Finished creating flow files in {str(elapsed_time).split('.')[0]} minutes") - - # Generate CatFIM mapping (both flow and stage based need it, but if different orders - manage_catfim_mapping( - fim_run_dir, - output_flows_dir, - output_catfim_dir, - catfim_method, - job_number_huc, - job_number_inundate, - False, - log_output_file, - ) - - # end if else - - # Updating mapping status - FLOG.lprint('Updating mapping status...') - update_flow_mapping_status(output_mapping_dir, nws_lid_gpkg_file_path) - - # Create CSV versions of the final geopackages. - # FLOG.lprint('Creating CSVs. This may take several minutes.') - # create_csvs(output_mapping_dir, is_stage_based) - - FLOG.lprint("================================") - FLOG.lprint("End generate categorical fim") - - overall_end_time = datetime.now(timezone.utc) - dt_string = overall_end_time.strftime("%m/%d/%Y %H:%M:%S") - FLOG.lprint(f"Ended (UTC): {dt_string}") - - # calculate duration - time_duration = overall_end_time - overall_start_time - FLOG.lprint(f"Duration: {str(time_duration).split('.')[0]}") - - return - - -def update_flow_mapping_status(output_mapping_dir, nws_lid_gpkg_file_path): - ''' - Updates the status for nws_lids from the flows subdirectory. Status - is updated for sites where the inundation.py routine was not able to - produce inundation for the supplied flow files. It is assumed that if - an error occured in inundation.py that all flow files for a given site - experienced the error as they all would have the same nwm segments. - - If there is no valid mapping files, update the nws_lids record - - Parameters - ---------- - output_mapping_dir : STR - Path to the output directory of all inundation maps. - nws_lid_gpkg_file_path : STR - - - Returns - ------- - None. 
- - ''' - # Find all LIDs with empty mapping output folders - subdirs = [str(i) for i in Path(output_mapping_dir).rglob('**/*') if i.is_dir()] - - print("") - - empty_nws_lids = [Path(directory).name for directory in subdirs if not list(Path(directory).iterdir())] - if len(empty_nws_lids) > 0: - FLOG.warning(f"Empty_nws_lids are.. {empty_nws_lids}") - - # Write list of empty nws_lids to DataFrame, these are sites that failed in inundation.py - mapping_df = pd.DataFrame({'nws_lid': empty_nws_lids}) - - mapping_df['did_it_map'] = 'no' - mapping_df['map_status'] = ' and all categories failed to map' - - # Import geopackage output from flows creation - flows_gdf = gpd.read_file(nws_lid_gpkg_file_path, engine='fiona') - - if len(flows_gdf) == 0: - FLOG.critical(f"flows_gdf is empty. Path is {nws_lid_gpkg_file_path}. Program aborted.") - sys.exit(1) - - try: - # Join failed sites to flows df - flows_gdf = flows_gdf.merge(mapping_df, how='left', on='nws_lid') - - # Switch mapped column to no for failed sites and update status - flows_gdf.loc[flows_gdf['did_it_map'] == 'no', 'mapped'] = 'no' - flows_gdf.loc[flows_gdf['did_it_map'] == 'no', 'status'] = ( - flows_gdf['status'] + flows_gdf['map_status'] - ) - - # Clean up GeoDataFrame and rename columns for consistency - flows_gdf = flows_gdf.drop(columns=['did_it_map', 'map_status']) - flows_gdf = flows_gdf.rename(columns={'nws_lid': 'ahps_lid'}) - - # Write out to file - # TODO: Aug 29, 204: Not 100% sure why, but the gpkg errors out... likely missing a projection - flows_gdf.to_file(nws_lid_gpkg_file_path, index=False, driver='GPKG', engine="fiona") - - # csv flow file name - nws_lid_csv_file_path = nws_lid_gpkg_file_path.replace(".gkpg", ".csv") - - # and we write a csv version at this time as well. - # and this csv is good - flows_gdf.to_csv(nws_lid_csv_file_path) - - except Exception as e: - FLOG.critical(f"{output_mapping_dir} : No LIDs, \n Exception: \n {repr(e)} \n") - FLOG.critical(traceback.format_exc()) - return - - -# This is always called as part of Multi-processing so uses MP_LOG variable and -# creates it's own logging object. -# This does flow files and mapping in the same function by HUC -def iterate_through_huc_stage_based( - output_catfim_dir, - huc, - fim_dir, - huc_dictionary, - threshold_url, - magnitudes, - all_lists, - past_major_interval_cap, - job_number_inundate, - job_number_intervals, - nwm_flows_region_df, - parent_log_output_file, - child_log_file_prefix, - progress_stmt, -): - """_summary_ - This and its children will create stage based tifs and catfim data based on a huc - """ - - try: - # This is setting up logging for this function to go up to the parent - # child_log_file_prefix is likely MP_iter_hucs - MP_LOG.MP_Log_setup(parent_log_output_file, child_log_file_prefix) - MP_LOG.lprint("\n**********************") - MP_LOG.lprint(f'Processing {huc} ...') - MP_LOG.lprint(f'... {progress_stmt} ...') - MP_LOG.lprint("") - - missing_huc_files = [] - all_messages = [] - stage_based_att_dict = {} - - mapping_dir = os.path.join(output_catfim_dir, "mapping") - attributes_dir = os.path.join(output_catfim_dir, 'attributes') - # output_flows_dir = os.path.join(output_catfim_dir, "flows") - huc_messages_dir = os.path.join(mapping_dir, 'huc_messages') - - # Make output directory for the particular huc in the mapping folder - mapping_huc_directory = os.path.join(mapping_dir, huc) - if not os.path.exists(mapping_huc_directory): - os.mkdir(mapping_huc_directory) - - # Define paths to necessary HAND and HAND-related files. 
- usgs_elev_table = os.path.join(fim_dir, huc, 'usgs_elev_table.csv') - branch_dir = os.path.join(fim_dir, huc, 'branches') - - # Loop through each lid in nws_lids list - nws_lids = huc_dictionary[huc] - - MP_LOG.lprint(f"Lids to process for {huc} are {nws_lids}") - - for lid in nws_lids: - MP_LOG.lprint("-----------------------------------") - huc_lid_id = f"{huc} : {lid}" - MP_LOG.lprint(huc_lid_id) - - lid = lid.lower() # Convert lid to lower case - # -- If necessary files exist, continue -- # - # Yes, each lid gets a record no matter what, so we need some of these messages duplicated - # per lid record - if not os.path.exists(usgs_elev_table): - msg = ":usgs_elev_table missing, likely unacceptable gage datum error -- more details to come in future release" - all_messages.append(lid + msg) - MP_LOG.warning(huc_lid_id + msg) - continue - if not os.path.exists(branch_dir): - msg = ":branch directory missing" - all_messages.append(lid + msg) - MP_LOG.warning(huc_lid_id + msg) - continue - usgs_elev_df = pd.read_csv(usgs_elev_table) - - # Make mapping lid_directory. - mapping_lid_directory = os.path.join(mapping_huc_directory, lid) - if not os.path.exists(mapping_lid_directory): - os.mkdir(mapping_lid_directory) - - # Get stages and flows for each threshold from the WRDS API. Priority given to USGS calculated flows. - stages, flows = get_thresholds( - threshold_url=threshold_url, select_by='nws_lid', selector=lid, threshold='all' - ) - - if stages is None: - msg = ':error getting thresholds from WRDS API' - all_messages.append(lid + msg) - MP_LOG.warning(huc_lid_id + msg) - continue - - # Check if stages are supplied, if not write message and exit. - if all(stages.get(category, None) is None for category in magnitudes): - msg = ':missing threshold stages' - all_messages.append(lid + msg) - MP_LOG.warning(huc_lid_id + msg) - continue - - acceptable_usgs_elev_df = __create_acceptable_usgs_elev_df(usgs_elev_df, huc_lid_id) - if acceptable_usgs_elev_df is None: - # This should only happen in a catastrophic code error. - # Exceptions inside the function, normally return usgs_elev_df or a variant of it - raise Exception("acceptable_usgs_elev_df failed to be created") - - # Get the dem_adj_elevation value from usgs_elev_table.csv. - # Prioritize the value that is not from branch 0. - lid_usgs_elev, dem_eval_messages = __adj_dem_evalation_val( - acceptable_usgs_elev_df, lid, huc_lid_id - ) - all_messages += dem_eval_messages - if lid_usgs_elev is None: - continue - - # Initialize nested dict for lid attributes - stage_based_att_dict.update({lid: {}}) - - # Find lid metadata from master list of metadata dictionaries. 
- metadata = next( - (item for item in all_lists if item['identifiers']['nws_lid'] == lid.upper()), False - ) - lid_altitude = metadata['usgs_data']['altitude'] - - # Filter out sites that don't have "good" data - try: - ## Removed this part to relax coordinate accuracy requirements - # if not metadata['usgs_data']['coord_accuracy_code'] in acceptable_coord_acc_code_list: - # MP_LOG.warning( - # f"\t{huc_lid_id}: {metadata['usgs_data']['coord_accuracy_code']} " - # "Not in acceptable coord acc codes" - # ) - # continue - # if not metadata['usgs_data']['coord_method_code'] in acceptable_coord_method_code_list: - # MP_LOG.warning(f"\t{huc_lid_id}: Not in acceptable coord method codes") - # continue - if not metadata['usgs_data']['alt_method_code'] in acceptable_alt_meth_code_list: - MP_LOG.warning(f"{huc_lid_id}: Not in acceptable alt method codes") - continue - if not metadata['usgs_data']['site_type'] in acceptable_site_type_list: - MP_LOG.warning(f"{huc_lid_id}: Not in acceptable site type codes") - continue - if not float(metadata['usgs_data']['alt_accuracy_code']) <= acceptable_alt_acc_thresh: - MP_LOG.warning(f"{huc_lid_id}: Not in acceptable threshold range") - continue - except Exception: - MP_LOG.error(f"{huc_lid_id}: filtering out 'bad' data in the usgs_data") - MP_LOG.error(traceback.format_exc()) - continue - - datum_adj_ft, datum_messages = __adjust_datum_ft(flows, metadata, lid, huc_lid_id) - - all_messages = all_messages + datum_messages - if datum_adj_ft is None: - continue - - ### -- Concluded Datum Offset --- ### - # Get mainstem segments of LID by intersecting LID segments with known mainstem segments. - unfiltered_segments = list(set(get_nwm_segs(metadata))) - - # Filter segments to be of like stream order. - desired_order = metadata['nwm_feature_data']['stream_order'] - segments = filter_nwm_segments_by_stream_order( - unfiltered_segments, desired_order, nwm_flows_region_df - ) - action_stage = stages['action'] - minor_stage = stages['minor'] - moderate_stage = stages['moderate'] - major_stage = stages['major'] - - stage_list = [ - i for i in [action_stage, minor_stage, moderate_stage, major_stage] if i is not None - ] - - if stage_list == []: - msg = ':no stage values available' - all_messages.append(lid + msg) - MP_LOG.warning(huc_lid_id + msg) - continue - - # TODO: Aug 2024, Is it really ok that record is missing? hummm - # Earlier code lower was doing comparisons to see if the interval - # value was been each of these 4 but sometimes one or more was None - missing_stages = "" - for stage_type in ["action", "minor", "moderate", "major"]: - stage_val = stages[stage_type] - if stage_val is None: - if missing_stages != "": - missing_stages += ", " - missing_stages += stage_type - - if missing_stages != "": - msg = f':Missing Stages of {missing_stages}' - all_messages.append(lid + msg) - MP_LOG.warning(huc_lid_id + msg) - continue - - interval_list = np.arange( - min(stage_list), max(stage_list) + past_major_interval_cap, 1.0 - ) # Go an extra 10 ft beyond the max stage, arbitrary - - # Check for large discrepancies between the elevation values from WRDS and HAND. - # Otherwise this causes bad mapping. - elevation_diff = lid_usgs_elev - (lid_altitude * 0.3048) - if abs(elevation_diff) > 10: - msg = ':large discrepancy in elevation estimates from gage and HAND' - all_messages.append(lid + msg) - MP_LOG.warning(huc_lid_id + msg) - continue - - # This function sometimes is called within a MP but sometimes not. 
- # So, we might have an MP inside an MP - # and we will need a new prefix for it. - - # Becuase we already are in an MP, lets merge up what we have at this point - # Before creating child MP files - MP_LOG.merge_log_files(parent_log_output_file, child_log_file_prefix) - - # For each flood category / magnitude - MP_LOG.lprint(f"{huc_lid_id}: About to process flood categories") - child_log_file_prefix = MP_LOG.MP_calc_prefix_name( - parent_log_output_file, "MP_produce_catfim_tifs" - ) - # print(f"child log name is {child_log_file_prefix}") - for category in magnitudes: - # MP_LOG.lprint(f"{huc_lid_id}: Magnitude is {category}") - # Pull stage value and confirm it's valid, then process - stage = stages[category] - - if stage is not None and datum_adj_ft is not None and lid_altitude is not None: - # Call function to execute mapping of the TIFs. - - # These are the 5 magnitudes being inundated at their stage value - (messages, hand_stage, datum_adj_wse, datum_adj_wse_m) = produce_stage_based_catfim_tifs( - stage, - datum_adj_ft, - branch_dir, - lid_usgs_elev, - lid_altitude, - fim_dir, - segments, - lid, - huc, - mapping_lid_directory, - category, - job_number_inundate, - parent_log_output_file, - child_log_file_prefix, - ) - all_messages += messages - - # Extra metadata for alternative CatFIM technique. - # TODO Revisit because branches complicate things - stage_based_att_dict[lid].update( - { - category: { - 'datum_adj_wse_ft': datum_adj_wse, - 'datum_adj_wse_m': datum_adj_wse_m, - 'hand_stage': hand_stage, - 'datum_adj_ft': datum_adj_ft, - 'lid_alt_ft': lid_altitude, - 'lid_alt_m': lid_altitude * 0.3048, - } - } - ) - - # If missing HUC file data, write message - if huc in missing_huc_files: - msg = ':missing some HUC data' - all_messages.append(lid + msg) - MP_LOG.error(huc_lid_id + msg) - - # So, we might have an MP inside an MP - # let's merge what we have at this point, before we go into another MP - MP_LOG.merge_log_files(parent_log_output_file, child_log_file_prefix, True) - - # Now we will do another set of inundations, but this one is based on - # not the stage flow but flow based on each interval - tif_child_log_file_prefix = MP_LOG.MP_calc_prefix_name(parent_log_output_file, "MP_prod_sb_tifs") - with ProcessPoolExecutor(max_workers=job_number_intervals) as executor: - try: - # MP_LOG.lprint(f"{huc_lid_id} : action_stage is {action_stage}") - # MP_LOG.lprint(f"{huc_lid_id} : minor_stage is {minor_stage}") - # MP_LOG.lprint(f"{huc_lid_id} : moderate_stage is {moderate_stage}") - # MP_LOG.lprint(f"{huc_lid_id} : major_stage is {major_stage}") - - for interval_stage in interval_list: - # MP_LOG.lprint(f"{huc_lid_id} : interval_stage is {interval_stage}") - - # Determine category the stage value belongs with. 
- if action_stage <= interval_stage < minor_stage: - category = 'action_' + str(interval_stage).replace('.', 'p') + 'ft' - if minor_stage <= interval_stage < moderate_stage: - category = 'minor_' + str(interval_stage).replace('.', 'p') + 'ft' - if moderate_stage <= interval_stage < major_stage: - category = 'moderate_' + str(interval_stage).replace('.', 'p') + 'ft' - if interval_stage >= major_stage: - category = 'major_' + str(interval_stage).replace('.', 'p') + 'ft' - executor.submit( - produce_stage_based_catfim_tifs, - interval_stage, - datum_adj_ft, - branch_dir, - lid_usgs_elev, - lid_altitude, - fim_dir, - segments, - lid, - huc, - mapping_lid_directory, - category, - job_number_inundate, - parent_log_output_file, - tif_child_log_file_prefix, - ) - except TypeError: # sometimes the thresholds are Nonetypes - MP_LOG.error("ERROR: type error in ProcessPool, likely in the interval tests") - MP_LOG.error(traceback.format_exc()) - continue - - except Exception: - MP_LOG.critical("ERROR: ProcessPool has an error") - MP_LOG.critical(traceback.format_exc()) - # merge MP Logs (Yes) - MP_LOG.merge_log_files(parent_log_output_file, tif_child_log_file_prefix, True) - sys.exit(1) - - # merge MP Logs (merging MP into an MP (proc_pool in a proc_pool)) - MP_LOG.merge_log_files(parent_log_output_file, tif_child_log_file_prefix, True) - - # Create a csv with same information as geopackage but with each threshold as new record. - # Probably a less verbose way. - csv_df = pd.DataFrame() - for threshold in magnitudes: - try: - line_df = pd.DataFrame( - { - 'nws_lid': [lid], - 'name': metadata['nws_data']['name'], - 'WFO': metadata['nws_data']['wfo'], - 'rfc': metadata['nws_data']['rfc'], - 'huc': [huc], - 'state': metadata['nws_data']['state'], - 'county': metadata['nws_data']['county'], - 'magnitude': threshold, - 'q': flows[threshold], - 'q_uni': flows['units'], - 'q_src': flows['source'], - 'stage': stages[threshold], - 'stage_uni': stages['units'], - 's_src': stages['source'], - 'wrds_time': stages['wrds_timestamp'], - 'nrldb_time': metadata['nrldb_timestamp'], - 'nwis_time': metadata['nwis_timestamp'], - 'lat': [float(metadata['nws_preferred']['latitude'])], - 'lon': [float(metadata['nws_preferred']['longitude'])], - 'dtm_adj_ft': stage_based_att_dict[lid][threshold]['datum_adj_ft'], - 'dadj_w_ft': stage_based_att_dict[lid][threshold]['datum_adj_wse_ft'], - 'dadj_w_m': stage_based_att_dict[lid][threshold]['datum_adj_wse_m'], - 'lid_alt_ft': stage_based_att_dict[lid][threshold]['lid_alt_ft'], - 'lid_alt_m': stage_based_att_dict[lid][threshold]['lid_alt_m'], - } - ) - csv_df = pd.concat([csv_df, line_df]) - - except Exception: - MP_LOG.error("ERROR: threshold has an error") - MP_LOG.error(traceback.format_exc()) - return - # sys.exit(1) - - # Round flow and stage columns to 2 decimal places. - csv_df = csv_df.round({'q': 2, 'stage': 2}) - - # If a site folder exists (ie a flow file was written) save files containing site attributes. - if os.path.exists(mapping_lid_directory): - # Export DataFrame to csv containing attributes - attributes_filepath = os.path.join(attributes_dir, f'{lid}_attributes.csv') - - csv_df.to_csv(attributes_filepath, index=False) - else: - msg = ':missing all calculated flows' - all_messages.append(lid + msg) - MP_LOG.error(huc_lid_id + msg) - - # If it made it to this point (i.e. 
no continues), there were no major preventers of mapping
- all_messages.append(lid + ':OK')
- MP_LOG.success(f'{huc_lid_id}: Complete')
- # mark_complete(mapping_lid_directory)
-
- # Write all_messages by HUC to be scraped later.
- if len(all_messages) > 0:
-
- # TODO: Aug 2024: This is now identical to the way the flow-based code handles messages,
- # but the system should probably be changed to something more elegant. It is good enough
- # for now, and at least it is MP safe.
- huc_messages_txt_file = os.path.join(huc_messages_dir, str(huc) + '_messages.txt')
- with open(huc_messages_txt_file, 'w') as f:
- for item in all_messages:
- item = item.strip()
- # f.write("%s\n" % item)
- f.write(f"{item}\n")
- else:
- # Something has likely gone very wrong.
- MP_LOG.error(f"No messages found for {huc}")
- sys.exit(1)
-
- except Exception:
- MP_LOG.error(f"{huc} : {lid} Error iterating through huc stage based")
- MP_LOG.error(traceback.format_exc())
-
- return
-
-
- def __adjust_datum_ft(flows, metadata, lid, huc_lid_id):
-
- # TODO: Aug 2024: This whole part needs revisiting. Lots of lid data has changed and this
- # is all likely very old.
-
- # Jul 2024: For now, we will duplicate messages via all_messages and via the logging system.
- all_messages = []
-
- datum_adj_ft = None
- ### --- Do Datum Offset --- ###
- # Determine the source of interpolated threshold flows; this will be the rating curve that is used.
- rating_curve_source = flows.get('source')
- if rating_curve_source is None:
- msg = f'{huc_lid_id}:No source for rating curve'
- all_messages.append(msg)
- MP_LOG.warning(msg)
- return None, all_messages
-
- # Get the datum and adjust to NAVD if necessary.
- nws_datum_info, usgs_datum_info = get_datum(metadata)
- if rating_curve_source == 'USGS Rating Depot':
- datum_data = usgs_datum_info
- elif rating_curve_source == 'NRLDB':
- datum_data = nws_datum_info
-
- # If the datum is not supplied, skip to the next site
- datum = datum_data.get('datum', None)
- if datum is None:
- msg = f'{huc_lid_id}:datum info unavailable'
- all_messages.append(msg)
- MP_LOG.warning(msg)
- return None, all_messages
-
- # ___________________________________________________________________________________________________#
- # SPECIAL CASE: Workaround for "bmbp1" where the only valid datum is from NRLDB (USGS datum is null).
- # Modifying the rating curve source will influence the rating curve and
- # datum retrieved for benchmark determinations.
- if lid == 'bmbp1':
- rating_curve_source = 'NRLDB'
- # ___________________________________________________________________________________________________#
-
- # SPECIAL CASE: Custom workaround: these sites have a faulty CRS from WRDS. The CRS is needed for
- # NGVD29 conversion to NAVD88.
- # USGS info indicates NAD83 for site: bgwn7, fatw3, mnvn4, nhpp1, pinn4, rgln4, rssk1, sign4, smfn7,
- # stkn4, wlln7
- # Assumed to be NAD83 (no info from USGS or NWS data): dlrt2, eagi1, eppt2, jffw3, ldot2, rgdt2
- if lid in [
- 'bgwn7',
- 'dlrt2',
- 'eagi1',
- 'eppt2',
- 'fatw3',
- 'jffw3',
- 'ldot2',
- 'mnvn4',
- 'nhpp1',
- 'pinn4',
- 'rgdt2',
- 'rgln4',
- 'rssk1',
- 'sign4',
- 'smfn7',
- 'stkn4',
- 'wlln7',
- ]:
- datum_data.update(crs='NAD83')
- # ___________________________________________________________________________________________________#
-
- # SPECIAL CASE: Workaround for bmbp1; CRS supplied by NRLDB is mis-assigned (NAD29) and
- # is actually NAD27.
- # This was verified by converting USGS coordinates (in NAD83) for bmbp1 to NAD27 and
- # it matches NRLDB coordinates.
- if lid == 'bmbp1': - datum_data.update(crs='NAD27') - # ___________________________________________________________________________________________________# - - # SPECIAL CASE: Custom workaround these sites have poorly defined vcs from WRDS. VCS needed to ensure - # datum reported in NAVD88. - # If NGVD29 it is converted to NAVD88. - # bgwn7, eagi1 vertical datum unknown, assume navd88 - # fatw3 USGS data indicates vcs is NAVD88 (USGS and NWS info agree on datum value). - # wlln7 USGS data indicates vcs is NGVD29 (USGS and NWS info agree on datum value). - if lid in ['bgwn7', 'eagi1', 'fatw3']: - datum_data.update(vcs='NAVD88') - elif lid == 'wlln7': - datum_data.update(vcs='NGVD29') - # ___________________________________________________________________________________________________# - - # Adjust datum to NAVD88 if needed - # Default datum_adj_ft to 0.0 - datum_adj_ft = 0.0 - crs = datum_data.get('crs') - if datum_data.get('vcs') in ['NGVD29', 'NGVD 1929', 'NGVD,1929', 'NGVD OF 1929', 'NGVD']: - # Get the datum adjustment to convert NGVD to NAVD. Sites not in contiguous US are previously - # removed otherwise the region needs changed. - try: - datum_adj_ft = ngvd_to_navd_ft(datum_info=datum_data, region='contiguous') - except Exception as ex: - MP_LOG.error(f"ERROR: {huc_lid_id}: ngvd_to_navd_ft") - MP_LOG.error(traceback.format_exc()) - ex = str(ex) - if crs is None: - msg = f'{huc_lid_id}:NOAA VDatum adjustment error, CRS is missing' - all_messages.append(msg) - MP_LOG.error(msg) - if 'HTTPSConnectionPool' in ex: - time.sleep(10) # Maybe the API needs a break, so wait 10 seconds - try: - datum_adj_ft = ngvd_to_navd_ft(datum_info=datum_data, region='contiguous') - except Exception: - msg = f'{huc_lid_id}:NOAA VDatum adjustment error, possible API issue' - all_messages.append(msg) - MP_LOG.error(msg) - if 'Invalid projection' in ex: - msg = f'{huc_lid_id}:NOAA VDatum adjustment error, invalid projection: crs={crs}' - all_messages.append(msg) - MP_LOG.error(msg) - return None, all_messages - - return datum_adj_ft, all_messages - - -def __create_acceptable_usgs_elev_df(usgs_elev_df, huc_lid_id): - acceptable_usgs_elev_df = None - try: - # Drop columns that offend acceptance criteria - usgs_elev_df['acceptable_codes'] = ( - # usgs_elev_df['usgs_data_coord_accuracy_code'].isin(acceptable_coord_acc_code_list) - # & usgs_elev_df['usgs_data_coord_method_code'].isin(acceptable_coord_method_code_list) - usgs_elev_df['usgs_data_alt_method_code'].isin(acceptable_alt_meth_code_list) - & usgs_elev_df['usgs_data_site_type'].isin(acceptable_site_type_list) - ) - - usgs_elev_df = usgs_elev_df.astype({'usgs_data_alt_accuracy_code': float}) - usgs_elev_df['acceptable_alt_error'] = np.where( - usgs_elev_df['usgs_data_alt_accuracy_code'] <= acceptable_alt_acc_thresh, True, False - ) - - acceptable_usgs_elev_df = usgs_elev_df[ - (usgs_elev_df['acceptable_codes'] == True) & (usgs_elev_df['acceptable_alt_error'] == True) - ] - - # # TEMP DEBUG Record row difference and write it to a CSV or something - # label = 'Old code' ## TEMP DEBUG - # num_potential_rows = usgs_elev_df.shape[0] - # num_acceptable_rows = acceptable_usgs_elev_df.shape[0] - # out_message = f'{label}: kept {num_acceptable_rows} rows out of {num_potential_rows} available rows.' 
- - except Exception: - # Not sure any of the sites actually have those USGS-related - # columns in this particular file, so just assume it's fine to use - - # print("(Various columns related to USGS probably not in this csv)") - # print(f"Exception: \n {repr(e)} \n") - MP_LOG.error(f"{huc_lid_id}: An error has occurred while working with the usgs_elev table") - MP_LOG.error(traceback.format_exc()) - acceptable_usgs_elev_df = usgs_elev_df - - return acceptable_usgs_elev_df - - -def __adj_dem_evalation_val(acceptable_usgs_elev_df, lid, huc_lid_id): - - lid_usgs_elev = None - all_messages = [] - try: - matching_rows = acceptable_usgs_elev_df.loc[ - acceptable_usgs_elev_df['nws_lid'] == lid.upper(), 'dem_adj_elevation' - ] - - if len(matching_rows) == 2: # It means there are two level paths, use the one that is not 0 - lid_usgs_elev = acceptable_usgs_elev_df.loc[ - (acceptable_usgs_elev_df['nws_lid'] == lid.upper()) - & (acceptable_usgs_elev_df['levpa_id'] != 0), - 'dem_adj_elevation', - ].values[0] - else: - lid_usgs_elev = acceptable_usgs_elev_df.loc[ - acceptable_usgs_elev_df['nws_lid'] == lid.upper(), 'dem_adj_elevation' - ].values[0] - - except IndexError: # Occurs when LID is missing from table (yes. warning) - MP_LOG.warning(f"{huc_lid_id}: adjusting dem_adj_elevation") - MP_LOG.warning(traceback.format_exc()) - msg = ':likely unacceptable gage datum error or accuracy code(s); please see acceptance criteria' - all_messages.append(lid + msg) - MP_LOG.warning(huc_lid_id + msg) - - return lid_usgs_elev, all_messages - - -# This creates a HUC iterator with each HUC creating its flow files and tifs -def generate_stage_based_categorical_fim( - output_catfim_dir, - fim_run_dir, - nwm_us_search, - nwm_ds_search, - lid_to_run, - env_file, - job_number_inundate, - job_number_huc, - lst_hucs, - job_number_intervals, - past_major_interval_cap, - nwm_metafile, -): - magnitudes = ['action', 'minor', 'moderate', 'major', 'record'] - - output_mapping_dir = os.path.join(output_catfim_dir, 'mapping') - attributes_dir = os.path.join(output_catfim_dir, 'attributes') - - # Create HUC message directory to store messages that will be read and joined after multiprocessing - huc_messages_dir = os.path.join(output_mapping_dir, 'huc_messages') - os.makedirs(huc_messages_dir, exist_ok=True) - - FLOG.lprint("Starting generate_flows (Stage Based)") - - # If it is stage based, generate flows returns all of these objects. 
- # If flow based, generate flows returns only - # (huc_dictionary, out_gdf, ___, threshold_url, all_lists, nwm_flows_df, nwm_flows_alaska_df) = generate_flows( # With Alaska - - # Generate flows is only using one of the incoming job number params - # so let's multiply -jh (huc) and -jn (inundate) - job_flows = job_number_huc * job_number_inundate - (huc_dictionary, out_gdf, ___, threshold_url, all_lists, all_nwm_flows_df) = generate_flows( # No Alaska - output_catfim_dir, - nwm_us_search, - nwm_ds_search, - lid_to_run, - env_file, - job_flows, - True, - lst_hucs, - nwm_metafile, - str(FLOG.LOG_FILE_PATH), - ) - FLOG.lprint("End generate_flows (Stage Based)") - - child_log_file_prefix = FLOG.MP_calc_prefix_name(FLOG.LOG_FILE_PATH, "MP_iter_hucs") - - FLOG.lprint(">>>>>>>>>>>>>>>>>>>>>>>>>>>>") - FLOG.lprint("Start processing HUCs for Stage-Based CatFIM") - num_hucs = len(lst_hucs) - huc_index = 0 - FLOG.lprint(f"Number of hucs to process is {num_hucs}") - - with ProcessPoolExecutor(max_workers=job_number_huc) as executor: - for huc in huc_dictionary: - if huc in lst_hucs: - # FLOG.lprint(f'Generating stage based catfim for : {huc}') - - # Code variation for DROPPING Alaska HUCs - nwm_flows_region_df = all_nwm_flows_df - - # # Code variation for keeping alaska HUCs - # nwm_flows_region_df = nwm_flows_alaska_df if str(huc[:2]) == '19' else nwm_flows_df - - progress_stmt = f"index {huc_index + 1} of {num_hucs}" - executor.submit( - iterate_through_huc_stage_based, - output_catfim_dir, - huc, - fim_run_dir, - huc_dictionary, - threshold_url, - magnitudes, - all_lists, - past_major_interval_cap, - job_number_inundate, - job_number_intervals, - nwm_flows_region_df, - str(FLOG.LOG_FILE_PATH), - child_log_file_prefix, - progress_stmt, - ) - huc_index += 1 - # Need to merge MP logs here, merged into the "master log file" - - FLOG.merge_log_files(FLOG.LOG_FILE_PATH, child_log_file_prefix, True) - - FLOG.lprint('Wrapping up processing HUCs for Stage-Based CatFIM...') - FLOG.lprint(">>>>>>>>>>>>>>>>>>>>>>>>>>>>") - - csv_files = [x for x in os.listdir(attributes_dir) if x.endswith('.csv')] - - all_csv_df = pd.DataFrame() - refined_csv_files_list = [] - for csv_file in csv_files: - - full_csv_path = os.path.join(attributes_dir, csv_file) - # HUC has to be read in as string to preserve leading zeros. - try: - temp_df = pd.read_csv(full_csv_path, dtype={'huc': str}) - all_csv_df = pd.concat([all_csv_df, temp_df], ignore_index=True) - refined_csv_files_list.append(csv_file) - except Exception: # Happens if a file is empty (i.e. no mapping) - FLOG.error(f"ERROR: loading csv {full_csv_path}") - FLOG.error(traceback.format_exc()) - pass - - # Write to file - all_csv_df.to_csv(os.path.join(attributes_dir, 'nws_lid_attributes.csv'), index=False) - - # This section populates a geopackage of all potential sites and details - # whether it was mapped or not (mapped field) and if not, why (status field). - - # Preprocess the out_gdf GeoDataFrame. Reproject and reformat fields. - - # TODO: Accomodate AK projection? Yes.. and Alaska and CONUS should all end up as the same projection output - # epsg:5070, we really want 3857 out for all outputs - viz_out_gdf = out_gdf.to_crs(VIZ_PROJECTION) # TODO: Accomodate AK projection? 
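# Illustrative example only (not part of the original file): the dtype={'huc': str} argument
# used above when reading the attribute CSVs matters because HUC codes carry leading zeros.
# A minimal demonstration with a made-up HUC and lid value:
import pandas as pd
from io import StringIO

csv_text = "huc,nws_lid\n02020005,abcd1\n"
print(pd.read_csv(StringIO(csv_text))["huc"].iloc[0])                      # 2020005 (int, leading zero lost)
print(pd.read_csv(StringIO(csv_text), dtype={"huc": str})["huc"].iloc[0])  # '02020005' (preserved as text)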
- viz_out_gdf.rename(
- columns={
- 'identifiers_nwm_feature_id': 'nwm_seg',
- 'identifiers_nws_lid': 'nws_lid',
- 'identifiers_usgs_site_code': 'usgs_gage',
- },
- inplace=True,
- )
- viz_out_gdf['nws_lid'] = viz_out_gdf['nws_lid'].str.lower()
-
- # Using the list of csv_files, populate a DataFrame of all nws_lids that had
- # a flow file produced and denote them with a "mapped" column.
- nws_lids = []
- for csv_file in csv_files:
- nws_lids.append(csv_file.split('_attributes')[0])
- lids_df = pd.DataFrame(nws_lids, columns=['nws_lid'])
- lids_df['mapped'] = 'yes'
-
- # Identify which lids were mapped by merging with lids_df. Populate the
- # 'mapped' column with 'no' if a site did not map.
- viz_out_gdf = viz_out_gdf.merge(lids_df, how='left', on='nws_lid')
- viz_out_gdf['mapped'] = viz_out_gdf['mapped'].fillna('no')
-
- # Read all messages for all HUCs
- # This is basically identical to a chunk in the flow-based code. At a minimum, consolidate it,
- # or better yet, find a more elegant, yet still MP-safe, system than .txt files.
- # But it works, so maybe someday.
- huc_message_list = []
- huc_messages_dir_list = os.listdir(huc_messages_dir)
- for huc_message_file in huc_messages_dir_list:
- full_path_file = os.path.join(huc_messages_dir, huc_message_file)
- with open(full_path_file, 'r') as f:
- if full_path_file.endswith('.txt'):
- lines = f.readlines()
- for line in lines:
- line = line.strip()
- huc_message_list.append(line)
-
- # Filter out columns and write out to file.
- # Flow-based doesn't make it here, only stage-based.
- nws_lid_gpkg_file_path = os.path.join(output_mapping_dir, 'stage_based_catfim_sites.gpkg')
-
- # Write messages to DataFrame, split into columns, aggregate messages.
- if len(huc_message_list) > 0:
-
- FLOG.lprint(f"nws_sites_layer ({nws_lid_gpkg_file_path}) : adding messages")
- messages_df = pd.DataFrame(huc_message_list, columns=['message'])
-
- messages_df = (
- messages_df['message']
- .str.split(':', n=1, expand=True)
- .rename(columns={0: 'nws_lid', 1: 'status'})
- )
-
- # We want one viz_out_gdf record per AHPS site and, if there is more than one message, concat the messages
- # status_df = messages_df.groupby(['nws_lid'])['status'].apply(', '.join).reset_index()
- status_df = messages_df.groupby(['nws_lid'])['status'].agg(lambda x: ',\n'.join(x)).reset_index()
-
- # Join messages to populate the status field for candidate sites. Assign
- # a status for null fields.
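# Illustrative example only (not part of the original file): a tiny worked version of the
# split/aggregate step above, assuming each message is formatted as 'lid:status' and using
# a made-up lid 'abcd1':
import pandas as pd

msgs = pd.DataFrame({'message': ['abcd1:missing all calculated flows', 'abcd1:OK']})
split_df = (
    msgs['message']
    .str.split(':', n=1, expand=True)
    .rename(columns={0: 'nws_lid', 1: 'status'})
)
agg_df = split_df.groupby(['nws_lid'])['status'].agg(lambda x: ',\n'.join(x)).reset_index()
# agg_df now has one row for 'abcd1' with status 'missing all calculated flows,\nOK'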
- viz_out_gdf = viz_out_gdf.merge(status_df, how='left', on='nws_lid') - - # viz_out_gdf.reset_index(inplace=True) - - viz_out_gdf['status'] = viz_out_gdf['status'].fillna('undetermined') - - # Add acceptance criteria to viz_out_gdf before writing - viz_out_gdf['acceptable_coord_acc_code_list'] = str(acceptable_coord_acc_code_list) - viz_out_gdf['acceptable_coord_method_code_list'] = str(acceptable_coord_method_code_list) - viz_out_gdf['acceptable_alt_acc_thresh'] = float(acceptable_alt_acc_thresh) - viz_out_gdf['acceptable_alt_meth_code_list'] = str(acceptable_alt_meth_code_list) - viz_out_gdf['acceptable_site_type_list'] = str(acceptable_site_type_list) - - viz_out_gdf.to_file(nws_lid_gpkg_file_path, driver='GPKG', index=True, engine='fiona') - - csv_file_path = nws_lid_gpkg_file_path.replace(".gpkg", ".csv") - viz_out_gdf.to_csv(csv_file_path) - else: - FLOG.lprint(f"nws_sites_layer ({nws_lid_gpkg_file_path}) : has no messages") - - return nws_lid_gpkg_file_path - - -if __name__ == '__main__': - # Parse arguments - parser = argparse.ArgumentParser(description='Run Categorical FIM') - parser.add_argument( - '-f', - '--fim_run_dir', - help='Path to directory containing HAND outputs, e.g. /data/previous_fim/fim_4_0_9_2', - required=True, - ) - parser.add_argument( - '-e', - '--env_file', - help='Docker mount path to the catfim environment file. ie) data/config/catfim.env', - required=True, - ) - parser.add_argument( - '-jh', - '--job_number_huc', - help='OPTIONAL: Number of processes to use for HUC scale operations.' - ' HUC and inundation job numbers should multiply to no more than one less than the CPU count of the' - ' machine. CatFIM sites generally only have 2-3 branches overlapping a site, so this number can be ' - 'kept low (2-4). Defaults to 1.', - required=False, - default=1, - type=int, - ) - parser.add_argument( - '-jn', - '--job_number_inundate', - help='OPTIONAL: Number of processes to use for inundating' - ' HUC and inundation job numbers should multiply to no more than one less than the CPU count' - ' of the machine. Defaults to 1.', - required=False, - default=1, - type=int, - ) - - parser.add_argument( - '-ji', - '--job_number_intervals', - help='OPTIONAL: Number of processes to use for inundating multiple intervals in stage-based' - ' inundation and interval job numbers should multiply to no more than one less than the CPU count ' - 'of the machine. Defaults to 1.', - required=False, - default=1, - type=int, - ) - - parser.add_argument( - '-a', - '--is_stage_based', - help='Run stage-based CatFIM instead of flow-based? Add this -a param to make it stage based,' - ' leave it off for flow based', - required=False, - default=False, - action='store_true', - ) - parser.add_argument( - '-t', - '--output_folder', - help='OPTIONAL: Target location, Where the output folder will be. Defaults to /data/catfim/', - required=False, - default='/data/catfim/', - ) - parser.add_argument( - '-o', '--overwrite', help='OPTIONAL: Overwrite files', required=False, action="store_true" - ) - parser.add_argument( - '-s', - '--search', - help='OPTIONAL: Upstream and downstream search in miles. How far up and downstream do you want to go? Defaults to 5.', - required=False, - default='5', - ) - - parser.add_argument( - '-l', - '--lid_to_run', - help='OPTIONAL: NWS LID, lowercase, to produce CatFIM for. Currently only accepts one. Defaults to all sites', - required=False, - default='all', - ) - - # lst_hucs temp disabled. 
All HUCs in the fim output directory will be used
- # parser.add_argument(
- # '-lh',
- # '--lst_hucs',
- # help='OPTIONAL: Space-delimited list of HUCs to produce CatFIM for. Defaults to all HUCs',
- # required=False,
- # default='all',
- # )
-
- parser.add_argument(
- '-mc',
- '--past_major_interval_cap',
- help='OPTIONAL: Stage-Based Only. How many feet past major do you want to go for the interval FIMs?'
- ' Defaults to 5.',
- required=False,
- default=5.0,
- type=float,
- )
-
- # NOTE: This param is for quick debugging only and should not be used in production mode
- parser.add_argument(
- '-me',
- '--nwm_metafile',
- help='OPTIONAL: If you have a pre-existing nwm metadata pickle file, you can provide the path to it here.'
- ' e.g.: /data/catfim/nwm_metafile.pkl',
- required=False,
- default="",
- )
-
- args = vars(parser.parse_args())
-
- try:
-
- # call main program
- process_generate_categorical_fim(**args)
-
- except Exception:
- FLOG.critical(traceback.format_exc())
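# Illustrative sketch only (not part of the diff above): one way the removed driver could be
# invoked directly with the same keyword arguments that argparse assembles; the paths are
# placeholders taken from the help text, not real data locations.
process_generate_categorical_fim(
    fim_run_dir='/data/previous_fim/fim_4_0_9_2',  # hypothetical HAND output directory
    env_file='/data/config/catfim.env',
    job_number_huc=1,
    job_number_inundate=1,
    job_number_intervals=1,
    is_stage_based=True,  # stage-based CatFIM; leave False for flow-based
    output_folder='/data/catfim/',
    overwrite=False,
    search='5',
    lid_to_run='all',
    past_major_interval_cap=5.0,
    nwm_metafile='',
)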