Merge pull request NCAR#88 from rmshkv/component-api
API updates to specify components
mnlevy1981 authored May 3, 2024
2 parents bf65cdd + f114b83 commit d14ee72
Showing 14 changed files with 195 additions and 261 deletions.
13 changes: 12 additions & 1 deletion README.md
@@ -82,9 +82,16 @@ Usage: cupid-run [OPTIONS] CONFIG_PATH
Options:
-s, --serial Do not use LocalCluster objects
-ts, --time-series Run time series generation scripts prior to diagnostics
-atm, --atmosphere Run atmosphere component diagnostics
-ocn, --ocean Run ocean component diagnostics
-lnd, --land Run land component diagnostics
-ice, --seaice Run sea ice component diagnostics
-glc, --landice Run land ice component diagnostics
-h, --help Show this message and exit.
```

#### Running in serial

By default, several of the example notebooks provided use a dask `LocalCluster` object to run in parallel.
However, the `--serial` option will pass a logical flag to each notebook that can be used to skip starting the cluster.

@@ -98,6 +105,10 @@ if not serial:
client
```
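
For reference, a typical guard looks like the following sketch; the exact cluster setup varies by notebook, and the default `LocalCluster()` arguments here are an assumption rather than CUPiD's actual settings:

```
# Sketch only: start a Dask cluster unless the notebook received serial=True
# (which cupid-run passes through when called with -s/--serial).
from dask.distributed import Client, LocalCluster

serial = False  # parameter injected into each notebook by cupid-run

if not serial:
    cluster = LocalCluster()  # assumed defaults; real notebooks may size workers explicitly
    client = Client(cluster)
    client
```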

#### Specifying components

If no component flags are provided, all component diagnostics listed in `config.yml` will be executed by default. Multiple flags can be used together to select a group of components, for example: `cupid-run -ocn -ice config.yml`.
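
Under the hood, `cupid/run.py` collects these flags into a per-component dictionary and falls back to enabling everything when no flag is set (see the run.py diff below). A minimal, self-contained sketch of that selection logic:

```
# Minimal sketch of the component-selection logic in cupid/run.py.
# The booleans stand in for the click flags; this example mimics
# `cupid-run -ocn -ice config.yml`.
atmosphere, ocean, land, seaice, landice = False, True, False, True, False

component_options = {
    "atm": atmosphere,
    "ocn": ocean,
    "lnd": land,
    "ice": seaice,
    "glc": landice,
}

# No component flags at all? Then run every component.
if not any(component_options.values()):
    component_options = {key: True for key in component_options}

print([comp for comp, run_it in component_options.items() if run_it])  # ['ocn', 'ice']
```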


### Timeseries File Generation
CUPiD can also generate single-variable timeseries files from history files for all components. To generate timeseries, edit the timeseries section of `config.yml` to fit your preferences, and then run `cupid-run config.yml -ts`.
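
Timeseries generation honors the same component flags: with `-ts`, files are only produced for the components selected on the command line (or for all of them when no flag is given). Below is a rough sketch of that gating; the real call to `cupid.timeseries.create_time_series` in `cupid/run.py` passes variable lists, history and output paths, year ranges, and more from `config.yml`.

```
# Rough sketch of the per-component gating used for -ts in cupid/run.py.
def generate_time_series(timeseries_params, component_options, serial=False):
    num_procs = timeseries_params["num_procs"]
    for component, enabled in component_options.items():
        if not enabled:
            continue
        # The real code calls cupid.timeseries.create_time_series(component, ...)
        # with the per-component settings from config.yml.
        print(f"would generate {component} timeseries using {num_procs} processes")
```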

180 changes: 100 additions & 80 deletions cupid/run.py
@@ -11,21 +11,24 @@
import dask
import time
import ploomber
import yaml
import warnings

CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"])


@click.command(context_settings=CONTEXT_SETTINGS)
@click.option("--serial", "-s", is_flag=True, help="Do not use LocalCluster objects")
@click.option(
"--time-series",
"-ts",
is_flag=True,
help="Run time series generation scripts prior to diagnostics",
)
@click.option("--time-series", "-ts", is_flag=True,
help="Run time series generation scripts prior to diagnostics")
# Options to turn components on or off
@click.option("--atmosphere", "-atm", is_flag=True, help="Run atmosphere component diagnostics")
@click.option("--ocean", "-ocn", is_flag=True, help="Run ocean component diagnostics")
@click.option("--land", "-lnd", is_flag=True, help="Run land component diagnostics")
@click.option("--seaice", "-ice", is_flag=True, help="Run sea ice component diagnostics")
@click.option("--landice", "-glc", is_flag=True, help="Run land ice component diagnostics")
@click.argument("config_path")
def run(config_path, serial=False, time_series=False):

def run(config_path, serial=False, time_series=False,
all=False, atmosphere=False, ocean=False, land=False, seaice=False, landice=False):
"""
Main engine to set up running all the notebooks.
"""
@@ -34,41 +37,56 @@ def run(config_path, serial=False, time_series=False):
control = cupid.util.get_control_dict(config_path)
cupid.util.setup_book(config_path)

#####################################################################
component_options = {"atm": atmosphere,
"ocn": ocean,
"lnd": land,
"ice": seaice,
"glc": landice}

# Automatically run all if no components specified

if True not in [atmosphere, ocean, land, seaice, landice]:
all = True
for key in component_options.keys():
component_options[key] = True

#####################################################################
# Managing global parameters

global_params = dict()

if "global_params" in control:
global_params = control["global_params"]

global_params['serial'] = serial

####################################################################

if time_series:
timeseries_params = control["timeseries"]

# general timeseries arguments for all components
num_procs = timeseries_params["num_procs"]



for component in ['atm', 'ocn', 'lnd', 'ice', 'glc']:
cupid.timeseries.create_time_series(
component,
timeseries_params[component]["vars"],
timeseries_params[component]["derive_vars"],
[timeseries_params["case_name"]], # could also grab from compute_notebooks section of config file
timeseries_params[component]["hist_str"],
[global_params["CESM_output_dir"] + "/" + timeseries_params["case_name"] + f"/{component}/hist/"], # could also grab from compute_notebooks section of config file
[global_params["CESM_output_dir"]+'/'+timeseries_params['case_name']+f'/{component}/proc/tseries/'],
# Note that timeseries output will eventually go in /glade/derecho/scratch/${USER}/archive/${CASE}/${component}/proc/tseries/
timeseries_params["ts_done"],
timeseries_params["overwrite_ts"],
timeseries_params[component]["start_years"], # could get from yaml file in adf_quick_run.parameter_groups.none.config_fil_str, or for other notebooks config files, eg ocean_surface.parameter_gropus.none.mom6_tools_config.start_date
timeseries_params[component]["end_years"], # could get from yaml file in adf_quick_run.parameter_groups.none.config_fil_str, or for other notebooks config files, eg ocean_surface.parameter_gropus.none.mom6_tools_config.end_date
timeseries_params[component]["level"],
num_procs,
serial,
)
for component, comp_bool in component_options.items():
if comp_bool:
cupid.timeseries.create_time_series(
component,
timeseries_params[component]["vars"],
timeseries_params[component]["derive_vars"],
[timeseries_params["case_name"]], # could also grab from compute_notebooks section of config file
timeseries_params[component]["hist_str"],
[global_params["CESM_output_dir"] + "/" + timeseries_params["case_name"] + f"/{component}/hist/"], # could also grab from compute_notebooks section of config file
[global_params["CESM_output_dir"]+'/'+timeseries_params['case_name']+f'/{component}/proc/tseries/'],
# Note that timeseries output will eventually go in /glade/derecho/scratch/${USER}/archive/${CASE}/${component}/proc/tseries/
timeseries_params["ts_done"],
timeseries_params["overwrite_ts"],
timeseries_params[component]["start_years"], # could get from yaml file in adf_quick_run.parameter_groups.none.config_fil_str, or for other notebooks config files, eg ocean_surface.parameter_gropus.none.mom6_tools_config.start_date
timeseries_params[component]["end_years"], # could get from yaml file in adf_quick_run.parameter_groups.none.config_fil_str, or for other notebooks config files, eg ocean_surface.parameter_gropus.none.mom6_tools_config.end_date
timeseries_params[component]["level"],
num_procs,
serial,
)

# Grab paths

@@ -113,35 +131,39 @@ def run(config_path, serial=False, time_series=False):
dag = ploomber.DAG(executor=ploomber.executors.Serial())

#####################################################################
# Organizing notebooks - holdover from manually managing dependencies before

all_nbs = dict()

for nb, info in control["compute_notebooks"].items():

all_nbs[nb] = info

# Setting up notebook tasks

for nb, info in all_nbs.items():

global_params["serial"] = serial
if "dependency" in info:
cupid.util.create_ploomber_nb_task(
nb,
info,
cat_path,
nb_path_root,
output_dir,
global_params,
dag,
dependency=info["dependency"],
)

else:
cupid.util.create_ploomber_nb_task(
nb, info, cat_path, nb_path_root, output_dir, global_params, dag
)
# Organizing notebooks to run

if 'compute_notebooks' in control:

all_nbs = dict()

for nb, info in control['compute_notebooks']['infrastructure'].items():
all_nbs[nb] = info
all_nbs[nb]['nb_path_root'] = nb_path_root + '/infrastructure'
all_nbs[nb]['output_dir'] = output_dir + '/infrastructure'

for comp_name, comp_bool in component_options.items():
if comp_name in control['compute_notebooks'] and comp_bool:
for nb, info in control['compute_notebooks'][comp_name].items():
all_nbs[nb] = info
all_nbs[nb]['nb_path_root'] = nb_path_root + '/' + comp_name
all_nbs[nb]['output_dir'] = output_dir + '/' + comp_name
elif comp_bool and not all:
warnings.warn(f"No notebooks for {comp_name} component specified in config file.")

# Checking for existence of environments

for nb, info in all_nbs.copy().items():
if not control["env_check"][info["kernel_name"]]:
bad_env = info["kernel_name"]
warnings.warn(f"Environment {bad_env} specified for {nb}.ipynb could not be found; {nb}.ipynb will not be run. See README.md for environment installation instructions.")
all_nbs.pop(nb)

# Setting up notebook tasks

for nb, info in all_nbs.items():
cupid.util.create_ploomber_nb_task(nb, info, cat_path, info["nb_path_root"],
info["output_dir"], global_params, dag, dependency=info.get("dependency"))

#####################################################################
# Organizing scripts
@@ -150,29 +172,27 @@

all_scripts = dict()

for script, info in control["compute_scripts"].items():

all_scripts[script] = info

for comp_name, comp_bool in component_options.items():
if comp_name in control['compute_scripts'] and comp_bool:
for script, info in control['compute_scripts'][comp_name].items():
all_scripts[script] = info
all_scripts[script]['nb_path_root'] = nb_path_root + '/' + comp_name
elif comp_bool and not all:
warnings.warn(f"No scripts for {comp_name} component specified in config file.")

# Checking for existence of environments

for script, info in all_scripts.copy().items():
if not control["env_check"][info["kernel_name"]]:
bad_env = info["kernel_name"]
warnings.warn(f"Environment {bad_env} specified for {script}.py could not be found; {script}.py will not be run.")
all_scripts.pop(script)

# Setting up script tasks

for script, info in all_scripts.items():

if "dependency" in info:
cupid.util.create_ploomber_script_task(
script,
info,
cat_path,
nb_path_root,
global_params,
dag,
dependency=info["dependency"],
)

else:
cupid.util.create_ploomber_script_task(
script, info, cat_path, nb_path_root, global_params, dag
)
cupid.util.create_ploomber_script_task(script, info, cat_path, info['nb_path_root'],
global_params, dag, dependency=info.get("dependency"))

# Run the full DAG

(Diff output for the remaining 12 changed files is not shown.)
