Commit
Showing 7 changed files with 363 additions and 97 deletions.
This file was deleted.
This file was deleted.
@@ -0,0 +1,10 @@
from dataclasses import dataclass


@dataclass
class BoundingBox:
    """A latitude-longitude bounding box."""

    lat_min: float
    lat_max: float
    long_min: float
    long_max: float
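Since BoundingBox is a plain dataclass, constructing one is direct. A minimal sketch for orientation (the coordinate values here are arbitrary assumptions, not taken from the commit):

from gz21_ocean_momentum.new.common.bounding_box import BoundingBox

# Arbitrary illustrative coordinates.
bbox = BoundingBox(lat_min=-40.0, lat_max=40.0, long_min=-100.0, long_max=-20.0)
print(bbox)  # the dataclass decorator provides a readable repr for free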
@@ -0,0 +1,39 @@
import gz21_ocean_momentum.new.data.step as step
from gz21_ocean_momentum.new.common.bounding_box import BoundingBox

import configargparse

DESCRIPTION = "Read data from the CM2.6 dataset and apply coarse graining. \
Stores the resulting forcings dataset to a Zarr store in the given output \
directory."

p = configargparse.ArgParser(description=DESCRIPTION)
p.add("--config-file", is_config_file=True, help="config file path")
p.add("--out-dir", type=str, required=True)
p.add("--lat-min", type=float, required=True)
p.add("--lat-max", type=float, required=True)
p.add("--long-min", type=float, required=True)
p.add("--long-max", type=float, required=True)
p.add("--cyclize", action="store_true", help="global data; make cyclic along longitude")
p.add("--ntimes", type=int, required=True, help="number of days (TODO)")
p.add("--co2-increase", action="store_true", help="use 1%% annual CO2 increase CM2.6 dataset. By default, uses control (no increase)")
p.add("--factor", type=int, required=True, help="resolution degradation factor")

options = p.parse_args()

# form bounding box from input arguments
bounding_box = BoundingBox(
    options.lat_min, options.lat_max,
    options.long_min, options.long_max)

CATALOG_URL = "https://raw.githubusercontent.com/\
pangeo-data/pangeo-datastore/\
master/\
intake-catalogs/master.yaml"

# download the (public) CM2.6 data via the Pangeo intake catalog
surface_fields, grid = step.download_cm2_6(CATALOG_URL, options.co2_increase)
# coarse-grain the surface velocities and compute the sub-grid forcings
forcings = step.preprocess_and_compute_forcings(
    grid, surface_fields, bounding_box, options.ntimes, options.cyclize,
    options.factor, "usurf", "vsurf")

forcings.to_zarr(options.out_dir)
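For reference, a hypothetical invocation of this script might look as follows; the script name and all option values here are illustrative assumptions, not taken from the commit:

python cli.py --out-dir /tmp/forcings \
    --lat-min=-40 --lat-max=40 --long-min=-100 --long-max=-20 \
    --ntimes 100 --factor 4 --co2-increase

The same options may instead be placed in a config file and passed via --config-file, since configargparse merges config-file values with command-line flags.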
@@ -0,0 +1,198 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Routines for coarsening a dataset."""

import logging

import numpy as np
import xarray as xr
from scipy.ndimage import gaussian_filter


def eddy_forcing(
    u_v_dataset: xr.Dataset,
    grid_data: xr.Dataset,
    scale: int,
    nan_or_zero: str = "zero",
) -> xr.Dataset:
    """
    Compute the sub-grid forcing terms using mean coarse-graining.

    Parameters
    ----------
    u_v_dataset : xarray Dataset
        High-resolution velocity field.
    grid_data : xarray Dataset
        High-resolution grid details.
    scale : int
        Coarse-graining factor (TODO).
    nan_or_zero : str, optional
        Either 'nan' or 'zero'. Determines whether NaN values in the initial
        surface velocities are kept, or replaced by zeros before applying the
        procedure. In the latter case, zeros remaining after the procedure are
        replaced by NaNs for consistency. The default is 'zero'.

    Returns
    -------
    forcing : xarray Dataset
        Dataset containing the low-resolution velocity field and forcing.

    TODO: this function also edits and extends u_v_dataset, which used to be
    returned as well, but callers silently ignored that value.
    """
    # Replace NaN values with zeros.
    if nan_or_zero == "zero":
        u_v_dataset = u_v_dataset.fillna(0.0)

    # Interpolate temperature
    # interp_coords = dict(xt_ocean=u_v_dataset.coords['xu_ocean'],
    #                      yt_ocean=u_v_dataset.coords['yu_ocean'])
    # u_v_dataset['temp'] = u_v_dataset['surface_temperature'].interp(
    #     interp_coords)

    scale_filter = scale / 2
    # High-resolution advection terms
    adv = advections(u_v_dataset, grid_data)
    # Filtered advections
    filtered_adv = spatial_filter_dataset(adv, grid_data, scale_filter)
    # Filtered u, v fields
    u_v_filtered = spatial_filter_dataset(u_v_dataset, grid_data, scale_filter)
    # Advection term computed from the filtered velocity field
    adv_filtered = advections(u_v_filtered, grid_data)
    # Forcing: advection of the filtered field minus the filtered advection
    forcing = adv_filtered - filtered_adv
    forcing = forcing.rename({"adv_x": "S_x", "adv_y": "S_y"})
    # Merge filtered u, v and forcing terms
    forcing = forcing.merge(u_v_filtered)
    # TODO logging
    # logging.debug(forcing)

    # Coarsen
    forcing_coarse = forcing.coarsen(
        {"xu_ocean": int(scale_filter), "yu_ocean": int(scale_filter)},
        boundary="trim",
    )
    forcing_coarse = forcing_coarse.mean()

    if nan_or_zero == "zero":
        # Replace zeros with NaNs for consistency
        forcing_coarse = forcing_coarse.where(forcing_coarse["usurf"] != 0)
    u_v_dataset = u_v_dataset.merge(adv)
    filtered_adv = filtered_adv.rename({"adv_x": "f_adv_x", "adv_y": "f_adv_y"})
    adv_filtered = adv_filtered.rename({"adv_x": "adv_f_x", "adv_y": "adv_f_y"})
    u_v_filtered = u_v_filtered.rename({"usurf": "f_usurf", "vsurf": "f_vsurf"})
    u_v_dataset = xr.merge(
        (
            u_v_dataset,
            u_v_filtered,
            adv,
            filtered_adv,
            adv_filtered,
            forcing[["S_x", "S_y"]],
        )
    )
    return forcing_coarse
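For orientation, here is a minimal synthetic-data sketch of calling eddy_forcing; it is editorial, not part of the commit, and the grid sizes, spacings, and scale value are arbitrary assumptions:

import numpy as np
import xarray as xr

# Arbitrary synthetic grid: 5 time steps on a 100x100 mesh of ~10 km cells.
nt, ny, nx = 5, 100, 100
coords = {
    "time": np.arange(nt),
    "yu_ocean": np.arange(ny, dtype=float),
    "xu_ocean": np.arange(nx, dtype=float),
}
rng = np.random.default_rng(0)
u_v = xr.Dataset(
    {
        "usurf": (("time", "yu_ocean", "xu_ocean"), rng.standard_normal((nt, ny, nx))),
        "vsurf": (("time", "yu_ocean", "xu_ocean"), rng.standard_normal((nt, ny, nx))),
    },
    coords=coords,
)
grid = xr.Dataset(
    {
        "dxu": (("yu_ocean", "xu_ocean"), np.full((ny, nx), 1e4)),
        "dyu": (("yu_ocean", "xu_ocean"), np.full((ny, nx), 1e4)),
    },
    coords={k: coords[k] for k in ("yu_ocean", "xu_ocean")},
)

# scale=4 gives a filter scale of 2 and coarsens each spatial dim by 2.
forcing_coarse = eddy_forcing(u_v, grid, scale=4)
print(sorted(forcing_coarse.data_vars))  # ['S_x', 'S_y', 'usurf', 'vsurf']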

def advections(u_v_field: xr.Dataset, grid_data: xr.Dataset):
    """
    Compute advection terms corresponding to the passed velocity field.

    Parameters
    ----------
    u_v_field : xarray Dataset
        Velocity field; must contain the variables "usurf" and "vsurf".
    grid_data : xarray Dataset
        Grid data; must contain the variables "dxu" and "dyu".

    Returns
    -------
    result : xarray Dataset
        Advection components, under the variable names "adv_x" and "adv_y".
    """
    dxu = grid_data["dxu"]
    dyu = grid_data["dyu"]
    # Finite-difference gradients of the velocity field
    gradient_x = u_v_field.diff(dim="xu_ocean") / dxu
    gradient_y = u_v_field.diff(dim="yu_ocean") / dyu
    # Interpolate the gradients back onto the original coordinates
    interp_coords = {
        "xu_ocean": u_v_field.coords["xu_ocean"],
        "yu_ocean": u_v_field.coords["yu_ocean"],
    }
    gradient_x = gradient_x.interp(interp_coords)
    gradient_y = gradient_y.interp(interp_coords)
    u, v = u_v_field["usurf"], u_v_field["vsurf"]
    adv_x = u * gradient_x["usurf"] + v * gradient_y["usurf"]
    adv_y = u * gradient_x["vsurf"] + v * gradient_y["vsurf"]
    result = xr.Dataset({"adv_x": adv_x, "adv_y": adv_y})
    # TODO check if we can simply prevent the previous operation from adding
    # chunks
    # result = result.chunk(dict(xu_ocean=-1, yu_ocean=-1))
    return result


def spatial_filter_dataset(
    dataset: xr.Dataset, grid_info: xr.Dataset, sigma: float
) -> xr.Dataset:
    """
    Apply spatial filtering to the dataset across the spatial dimensions.

    Parameters
    ----------
    dataset : xarray Dataset
        Dataset to filter. The first dimension must be time, followed by the
        spatial dimensions.
    grid_info : xarray Dataset
        Grid data; must include the variables "dxu" and "dyu".
    sigma : float
        Scale of the filtering, in the same units as the grid (often meters).

    Returns
    -------
    filt_dataset : xarray Dataset
        Filtered dataset.
    """
    # Cell areas, rescaled for numerical convenience
    area_u = grid_info["dxu"] * grid_info["dyu"] / 1e8
    # Area-weight the data before filtering
    dataset = dataset * area_u
    # Normalisation term, so that if the quantity we filter is constant
    # over the domain, the filtered quantity is constant with the same value
    norm = xr.apply_ufunc(
        lambda x: gaussian_filter(x, (sigma, sigma), mode="constant"),
        area_u,
        dask="parallelized",
        output_dtypes=[float],
    )
    filtered = xr.apply_ufunc(
        lambda x: spatial_filter(x, sigma),
        dataset,
        dask="parallelized",
        output_dtypes=[float],
    )
    return filtered / norm
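The point of the area weighting and the norm term is that filtering a constant field returns the same constant. A quick editorial check, on an assumed synthetic grid (sizes and spacings are illustrative):

import numpy as np
import xarray as xr

nt, ny, nx = 2, 32, 32
coords = {
    "time": np.arange(nt),
    "yu_ocean": np.arange(ny, dtype=float),
    "xu_ocean": np.arange(nx, dtype=float),
}
grid = xr.Dataset(
    {
        "dxu": (("yu_ocean", "xu_ocean"), np.full((ny, nx), 1e4)),
        "dyu": (("yu_ocean", "xu_ocean"), np.full((ny, nx), 1e4)),
    },
    coords={k: coords[k] for k in ("yu_ocean", "xu_ocean")},
)
const = xr.Dataset(
    {"usurf": (("time", "yu_ocean", "xu_ocean"), np.ones((nt, ny, nx)))},
    coords=coords,
)

filt = spatial_filter_dataset(const, grid, sigma=3.0)
# A constant field filters to (approximately) the same constant.
print(float(filt["usurf"].min()), float(filt["usurf"].max()))  # both ≈ 1.0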

def spatial_filter(data: np.ndarray, sigma: float):
    """
    Apply a Gaussian filter to spatial data.

    Applies a scipy Gaussian filter along all dimensions except the first,
    which corresponds to time.

    Parameters
    ----------
    data : ndarray
        Data to filter.
    sigma : float
        Unitless scale of the filter.

    Returns
    -------
    result : ndarray
        Filtered data.
    """
    result = np.zeros_like(data)
    # Filter each time slice independently.
    for t in range(data.shape[0]):
        data_t = data[t, ...]
        result_t = gaussian_filter(data_t, (sigma, sigma), mode="constant")
        result[t, ...] = result_t
    return result
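And a small standalone check of spatial_filter itself, on assumed random data; note that mode="constant" pads with zeros, so values near the boundary are damped:

import numpy as np

rng = np.random.default_rng(1)
data = rng.standard_normal((3, 64, 64))  # shape (time, y, x)
smoothed = spatial_filter(data, sigma=2.0)
# Each time slice is smoothed independently; smoothing reduces the variance.
print(round(float(data.var()), 3), round(float(smoothed.var()), 3))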