continue refactoring data step
raehik committed Aug 22, 2023
1 parent aa93cd9 commit fa9300c
Showing 7 changed files with 363 additions and 97 deletions.
79 changes: 0 additions & 79 deletions src-new/forcing_computation.py

This file was deleted.

18 changes: 0 additions & 18 deletions src-new/step_data.py

This file was deleted.

10 changes: 10 additions & 0 deletions src/gz21_ocean_momentum/new/common/bounding_box.py
@@ -0,0 +1,10 @@
from dataclasses import dataclass

@dataclass
class BoundingBox:
    """A latitude/longitude bounding box."""

    lat_min: float
    lat_max: float
    long_min: float
    long_max: float
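
As a usage sketch (not part of this commit), a `BoundingBox` might be constructed and used to subset an xarray dataset; the `select_bounding_box` helper and the coordinate names are assumptions, following the CM2.6 names used elsewhere in this commit:

import xarray as xr

from gz21_ocean_momentum.new.common.bounding_box import BoundingBox

def select_bounding_box(ds: xr.Dataset, bbox: BoundingBox) -> xr.Dataset:
    # Hypothetical helper: assumes "yu_ocean" holds latitude and
    # "xu_ocean" holds longitude, as in the CM2.6 data used here.
    return ds.sel(
        yu_ocean=slice(bbox.lat_min, bbox.lat_max),
        xu_ocean=slice(bbox.long_min, bbox.long_max),
    )

bbox = BoundingBox(lat_min=-80.0, lat_max=80.0, long_min=-280.0, long_max=80.0)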
39 changes: 39 additions & 0 deletions src/gz21_ocean_momentum/new/data/cli.py
@@ -0,0 +1,39 @@
import gz21_ocean_momentum.new.data.step as step
from gz21_ocean_momentum.new.common.bounding_box import BoundingBox

import configargparse

DESCRIPTION = (
    "Read data from the CM2.6 dataset and apply coarse graining. "
    "Stores the resulting forcings dataset to a Zarr store in the given "
    "output directory."
)

p = configargparse.ArgParser(description=DESCRIPTION)
p.add("--config-file", is_config_file=True, help="config file path")
p.add("--out-dir", type=str, required=True)
p.add("--lat-min", type=float, required=True)
p.add("--lat-max", type=float, required=True)
p.add("--long-min", type=float, required=True)
p.add("--long-max", type=float, required=True)
p.add("--cyclize", action="store_true", help="global data; make cyclic along longitude")
p.add("--ntimes", type=int, required=True, help="number of days (TODO)")
p.add("--co2-increase", action="store_true", help="use 1%% annual CO2 increase CM2.6 dataset. By default, uses control (no increase)")
p.add("--factor", type=int, help="resolution degradation factor")

options = p.parse_args()

# form bounding box from input arguments
bounding_box = BoundingBox(
    options.lat_min, options.lat_max,
    options.long_min, options.long_max)

CATALOG_URL = "https://raw.githubusercontent.com/\
pangeo-data/pangeo-datastore/\
master/\
intake-catalogs/master.yaml"

surface_fields, grid = step.download_cm2_6(CATALOG_URL, options.co2_increase)
forcings = step.preprocess_and_compute_forcings(
    grid, surface_fields, bounding_box, options.ntimes, options.cyclize,
    options.factor, "usurf", "vsurf")

forcings.to_zarr(options.out_dir)
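
Since the parser takes `--config-file`, a run can be driven from a file instead of flags. A sketch of such a config in configargparse's default `key = value` format; all values are illustrative, not taken from this commit:

out-dir = /tmp/forcings.zarr
lat-min = -80
lat-max = 80
long-min = -280
long-max = 80
ntimes = 100
factor = 4
co2-increase = true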
198 changes: 198 additions & 0 deletions src/gz21_ocean_momentum/new/data/coarsen.py
@@ -0,0 +1,198 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Routines for coarsening a dataset."""

import logging
import xarray as xr
from scipy.ndimage import gaussian_filter
import numpy as np

def eddy_forcing(
    u_v_dataset: xr.Dataset,
    grid_data: xr.Dataset,
    scale: int,
    nan_or_zero: str = "zero",
) -> xr.Dataset:
    """
    Compute the sub-grid forcing terms using mean coarse-graining.

    Parameters
    ----------
    u_v_dataset : xarray Dataset
        High-resolution velocity field.
    grid_data : xarray Dataset
        High-resolution grid details.
    scale : int
        Coarse-graining factor (TODO).
    nan_or_zero : str, optional
        Either 'nan' or 'zero'. Determines whether we keep the nan values
        in the initial surface velocities array or replace them with zeros
        before applying the procedure. In the latter case, zeros remaining
        after the procedure are replaced by nans for consistency.
        The default is 'zero'.

    Returns
    -------
    forcing : xarray Dataset
        Dataset containing the low-resolution velocity field and forcing.

    TODO: we edit u_v_dataset, and for some reason we were returning it (but
    calls were silently ignoring it, or something).
    """
    # Replace nan values with zeros.
    if nan_or_zero == "zero":
        u_v_dataset = u_v_dataset.fillna(0.0)

    # Interpolate temperature
    # interp_coords = dict(xt_ocean=u_v_dataset.coords['xu_ocean'],
    #                      yt_ocean=u_v_dataset.coords['yu_ocean'])
    # u_v_dataset['temp'] = u_v_dataset['surface_temperature'].interp(
    #     interp_coords)

    scale_filter = scale / 2
    # High-res advection terms
    adv = advections(u_v_dataset, grid_data)
    # Filtered advections
    filtered_adv = spatial_filter_dataset(adv, grid_data, scale_filter)
    # Filtered u, v field and temperature
    u_v_filtered = spatial_filter_dataset(u_v_dataset, grid_data, scale_filter)
    # Advection term from the filtered velocity field
    adv_filtered = advections(u_v_filtered, grid_data)
    # Forcing: advection of the filtered field minus the filtered advection
    forcing = adv_filtered - filtered_adv
    forcing = forcing.rename({"adv_x": "S_x", "adv_y": "S_y"})
    # Merge filtered u, v, temperature and forcing terms
    forcing = forcing.merge(u_v_filtered)
    # TODO logging
    # logging.debug(forcing)

    # Coarsen
    forcing_coarse = forcing.coarsen(
        {"xu_ocean": int(scale_filter), "yu_ocean": int(scale_filter)},
        boundary="trim",
    )
    forcing_coarse = forcing_coarse.mean()

    if nan_or_zero == "zero":
        # Replace zeros with nans for consistency
        forcing_coarse = forcing_coarse.where(forcing_coarse["usurf"] != 0)
    u_v_dataset = u_v_dataset.merge(adv)
    filtered_adv = filtered_adv.rename({"adv_x": "f_adv_x", "adv_y": "f_adv_y"})
    adv_filtered = adv_filtered.rename({"adv_x": "adv_f_x", "adv_y": "adv_f_y"})
    u_v_filtered = u_v_filtered.rename({"usurf": "f_usurf", "vsurf": "f_vsurf"})
    u_v_dataset = xr.merge(
        (
            u_v_dataset,
            u_v_filtered,
            adv,
            filtered_adv,
            adv_filtered,
            forcing[["S_x", "S_y"]],
        )
    )
    return forcing_coarse

def advections(u_v_field: xr.Dataset, grid_data: xr.Dataset) -> xr.Dataset:
    """
    Compute advection terms corresponding to the passed velocity field.

    Parameters
    ----------
    u_v_field : xarray Dataset
        Velocity field; must contain variables "usurf" and "vsurf".
    grid_data : xarray Dataset
        Grid data; must contain variables "dxu" and "dyu".

    Returns
    -------
    result : xarray Dataset
        Advection components, under variable names "adv_x" and "adv_y".
    """
    dxu = grid_data["dxu"]
    dyu = grid_data["dyu"]
    gradient_x = u_v_field.diff(dim="xu_ocean") / dxu
    gradient_y = u_v_field.diff(dim="yu_ocean") / dyu
    # Interpolate the gradients back onto the original coordinates
    interp_coords = {
        "xu_ocean": u_v_field.coords["xu_ocean"],
        "yu_ocean": u_v_field.coords["yu_ocean"],
    }
    gradient_x = gradient_x.interp(interp_coords)
    gradient_y = gradient_y.interp(interp_coords)
    u, v = u_v_field["usurf"], u_v_field["vsurf"]
    adv_x = u * gradient_x["usurf"] + v * gradient_y["usurf"]
    adv_y = u * gradient_x["vsurf"] + v * gradient_y["vsurf"]
    result = xr.Dataset({"adv_x": adv_x, "adv_y": adv_y})
    # TODO check if we can simply prevent the previous operation from adding
    # chunks
    # result = result.chunk(dict(xu_ocean=-1, yu_ocean=-1))
    return result

def spatial_filter_dataset(
    dataset: xr.Dataset, grid_info: xr.Dataset, sigma: float
) -> xr.Dataset:
    """
    Apply spatial filtering to the dataset across the spatial dimensions.

    Parameters
    ----------
    dataset : xarray Dataset
        Dataset to filter. First dimension must be time, followed by the
        spatial dimensions.
    grid_info : xarray Dataset
        Grid data; must include variables "dxu" and "dyu".
    sigma : float
        Scale of the filtering, same unit as the grid (often meters).

    Returns
    -------
    filt_dataset : xarray Dataset
        Filtered dataset.
    """
    area_u = grid_info["dxu"] * grid_info["dyu"] / 1e8
    dataset = dataset * area_u
    # Normalisation term, so that if the quantity we filter is constant
    # over the domain, the filtered quantity is constant with the same value
    norm = xr.apply_ufunc(
        lambda x: gaussian_filter(x, (sigma, sigma), mode="constant"),
        area_u,
        dask="parallelized",
        output_dtypes=[float],
    )
    filtered = xr.apply_ufunc(
        lambda x: spatial_filter(x, sigma),
        dataset,
        dask="parallelized",
        output_dtypes=[float],
    )
    return filtered / norm

def spatial_filter(data: np.ndarray, sigma: float) -> np.ndarray:
    """
    Apply a Gaussian filter to spatial data.

    Applies the scipy Gaussian filter along all dimensions except the first
    one, which corresponds to time.

    Parameters
    ----------
    data : ndarray
        Data to filter.
    sigma : float
        Unitless scale of the filter.

    Returns
    -------
    result : ndarray
        Filtered data.
    """
    result = np.zeros_like(data)
    for t in range(data.shape[0]):
        data_t = data[t, ...]
        result_t = gaussian_filter(data_t, (sigma, sigma), mode="constant")
        result[t, ...] = result_t
    return result
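
A self-contained smoke test for the routines above on synthetic data (not part of this commit); the variable and coordinate names (`usurf`, `vsurf`, `xu_ocean`, `yu_ocean`, `dxu`, `dyu`) are the ones the code expects, while the sizes, seed and 10 km grid spacing are made up for illustration:

import numpy as np
import xarray as xr

from gz21_ocean_momentum.new.data.coarsen import eddy_forcing

nt, ny, nx = 4, 64, 64
dims = ("time", "yu_ocean", "xu_ocean")
coords = {
    "yu_ocean": np.linspace(-30.0, 30.0, ny),
    "xu_ocean": np.linspace(-30.0, 30.0, nx),
}
rng = np.random.default_rng(0)
uv = xr.Dataset(
    {
        "usurf": (dims, rng.standard_normal((nt, ny, nx))),
        "vsurf": (dims, rng.standard_normal((nt, ny, nx))),
    },
    coords=coords,
)
# Uniform (illustrative) 10 km grid spacing
grid = xr.Dataset(
    {
        "dxu": (dims[1:], np.full((ny, nx), 1e4)),
        "dyu": (dims[1:], np.full((ny, nx), 1e4)),
    },
    coords=coords,
)
forcing = eddy_forcing(uv, grid, scale=8)
print(forcing[["S_x", "S_y"]])  # coarse-grained sub-grid forcing components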