Initial xarray support #35

Draft pull request; wants to merge 5 commits into base: master
21 changes: 21 additions & 0 deletions modules/xarray/README.md
@@ -0,0 +1,21 @@
# Installation (from VCS)

```sh
pip install "git+https://github.com/magnusuMET/fimex.git@feature/xarray#subdirectory=modules/xarray"
```

# Usage

The module is installed as an engine which `xarray` can use directly. You can open a dataset using
```python
import xarray

xarray.open_dataset("mydataset.nc", engine="fimex")
```

Pass `config` or `filetype` if necessary:
```python
import xarray

xarray.open_dataset("mydataset", filtype="grbml", config="myconfig.xml", engine="fimex")
```
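
Variables are read lazily, so you can select a subset before any values are loaded. A minimal sketch (the variable and dimension names are only examples and depend on your file):
```python
import xarray

ds = xarray.open_dataset("mydataset.nc", engine="fimex")
subset = ds["air_temperature"].isel(time=0)
values = subset.values  # data is read from the file only at this point
```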
1 change: 1 addition & 0 deletions modules/xarray/fimex_xarray/__init__.py
@@ -0,0 +1 @@
from .xr import FimexBackendEntrypoint, FimexDataVariable
180 changes: 180 additions & 0 deletions modules/xarray/fimex_xarray/xr.py
@@ -0,0 +1,180 @@
#! /usr/bin/env python3
import pyfimex0
import numpy as np
import xarray
from xarray.backends import BackendEntrypoint, BackendArray
import os


DTYPE_MAPPINGS = {
pyfimex0.CDMDataType.FLOAT: np.float32,
pyfimex0.CDMDataType.DOUBLE: np.float64,
pyfimex0.CDMDataType.INT: np.intc,
pyfimex0.CDMDataType.INT64: np.int64,
pyfimex0.CDMDataType.UINT: np.uint,
pyfimex0.CDMDataType.UINT64: np.uint64,
pyfimex0.CDMDataType.SHORT: np.short,
pyfimex0.CDMDataType.USHORT: np.ushort,
pyfimex0.CDMDataType.STRING: str,
pyfimex0.CDMDataType.CHAR: np.byte,
pyfimex0.CDMDataType.UCHAR: np.ubyte,
}


class FimexBackendEntrypoint(BackendEntrypoint):
def open_dataset(self, filename_or_obj, drop_variables, *, config=None, filetype=None, decode_times=True, scale_offset=True):
if filetype is None:
filetype = ""
if config is None:
config = ""
fh = pyfimex0.createFileReader(filetype, filename_or_obj, config)
cdm = fh.getCDM()

var_names = set(cdm.getVariableNames())
attr_names = cdm.getGlobalAttributeNames()

attrs = dict()
for aname in attr_names:
attr = cdm.getGlobalAttribute(aname)
attrv = attr.getData().values()
attrs[aname] = attrv

coord_names = cdm.getDimensionNames()
coords = dict()
for cname in set(coord_names).intersection(var_names):
var = cdm.getVariable(cname)
vardata = var.getData()
if vardata is not None:
values = vardata.values()
shape = var.getShape()
if len(shape) != 1:
raise ValueError(f"coordinate variable {cname!r} must be one-dimensional, got shape {shape}")
else:
values = fh.getData(cname).values()
shape = [len(values)]

if scale_offset:
try:
scale_factor = cdm.getAttribute(cname, "scale_factor").getData().values()
except RuntimeError:
scale_factor = 1
try:
add_offset = cdm.getAttribute(cname, "add_offset").getData().values()
except RuntimeError:
add_offset = 0
values = values*scale_factor + add_offset
Review comment (Member): use getScaledData
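
A rough sketch of that suggestion, assuming pyfimex0's reader exposes `getScaledData` as the comment implies (the helper name `read_coordinate_values` is made up for illustration):
```python
def read_coordinate_values(fh, cname, scale_offset):
    """Read 1-D coordinate values, optionally letting Fimex apply scale_factor/add_offset."""
    data = fh.getScaledData(cname) if scale_offset else fh.getData(cname)
    return data.values()
```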

coords[cname] = values
try:
var_names.remove(cname)
except KeyError:
pass

vars = dict()
for vname in var_names:
var = cdm.getVariable(vname)
dims = list(reversed(var.getShape()))
# Collect per-variable attributes in a separate dict so the global attrs gathered above are not overwritten
var_attrs = dict()
for aname in cdm.getAttributeNames(vname):
attr = cdm.getAttribute(vname, aname)
attrv = attr.getData().values()
if len(attrv) == 1:
attrv = attrv[0]
var_attrs[aname] = attrv

data = FimexDataVariable(fh, cdm, var, scale_offset=scale_offset)
data = xarray.core.indexing.LazilyIndexedArray(data)
var = xarray.Variable(data=data, dims=dims, attrs=var_attrs)
vars[vname] = var

if decode_times and "time" in coord_names:
# Assumes the "time" coordinate holds seconds since the Unix epoch
var = coords["time"]
var = np.array([np.datetime64(int(v), "s") for v in var])
coords["time"] = var

Review comment (Member): I don't really know what this should do in xarray, but the time variable in netCDF is any variable with a time unit, e.g. 'days since 2005-09-03 +15:00'. One cannot rely on the dimension name, nor on there being only one "time" axis per file. Fimex supports conversion of the time units, e.g. via getScaledDataSizeInUnit(...), so you can get unified handling (needs doubles or quads, I would say). Not really sure what you do here (expect time to be epoch seconds?).
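
One alternative to assuming epoch seconds: keep the raw time values together with their CF attributes (units, calendar) and let xarray's own CF decoding interpret them. A hedged sketch of how the dataset construction below could do this, assuming the attributes survive onto the coordinates (mask_and_scale is disabled because scale/offset is already applied above):
```python
ds = xarray.Dataset(data_vars=vars, attrs=attrs, coords=coords)
if decode_times:
    # Let xarray interpret CF time units such as "days since 2005-09-03 15:00"
    ds = xarray.decode_cf(ds, decode_times=True, mask_and_scale=False)
```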

ds = xarray.Dataset(data_vars=vars, attrs=attrs, coords=coords)

return ds

open_dataset_parameters = ["filename_or_obj", "drop_variables", "config", "filetype", "decode_times", "scale_offset"]

def guess_can_open(self, filename_or_obj):
try:
_, ext = os.path.splitext(filename_or_obj)
except TypeError:
return False

return ext in {".grbml", ".ncml", ".grb", ".nc"}

description = "Read using fimex"
url = "https://github.com/metno/fimex"


class FimexDataVariable(BackendArray):
def __init__(self, fh, cdm, var, scale_offset):
super().__init__()
self.fh = fh
self.var = var
self.cdm = cdm
self.data = var.getData()
self.scale_offset = scale_offset

shape = []
for s in var.getShape():
shape.append(cdm.getDimension(s).getLength())
self.shape = list(reversed(shape))
try:
dtype = self.data.getDataType()
self.dtype = np.dtype(DTYPE_MAPPINGS[dtype])
except AttributeError:
self.dtype = np.dtype(np.float64)

def __getitem__(
self, key: xarray.core.indexing.ExplicitIndexer
) -> np.typing.ArrayLike:
return xarray.core.indexing.explicit_indexing_adapter(
key,
self.shape,
xarray.core.indexing.IndexingSupport.BASIC,
self._raw_indexing_method,
)

def _raw_indexing_method(self, key: tuple) -> np.typing.ArrayLike:
vname = self.var.getName()

slicebuilder = pyfimex0.SliceBuilder(self.cdm, vname, False)
dimsizes = []
strides = []
for k, dim, dimname in zip(key, self.shape, reversed(self.var.getShape())):
if isinstance(k, int):
slicebuilder.setStartAndSize(dimname, k, 1)
elif isinstance(k, slice):
start = k.start if k.start is not None else 0
step = k.step if k.step is not None else 1
stop = k.stop if k.stop is not None else dim
# SliceBuilder only expresses a contiguous start/size range, so request the
# full extent here and apply any step in numpy after reading
size = max(stop - start, 0)
slicebuilder.setStartAndSize(dimname, start, size)
dimsizes.append(size)
strides.append(step)
else:
raise TypeError(f"Unknown type of {k}: {type(k)}")

data = self.fh.getDataSliceSB(vname, slicebuilder)
if len(dimsizes) == 0:
# Bug? Fimex can't read scalar data?
data_shaped = -1
else:
datav = data.values()
data_shaped = np.reshape(datav, dimsizes)
# Apply any slice steps that could not be expressed through the SliceBuilder
data_shaped = data_shaped[tuple(slice(None, None, s) for s in strides)]
if self.scale_offset:
Review comment (Member): consider using getScaledDataSliceSB and let Fimex do the scale/offset handling
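
A sketch of that suggestion, assuming the reader exposes `getScaledDataSliceSB` as named above; it would replace the manual `scale_factor`/`add_offset` lookup below (the helper name `read_slice_values` is illustrative):
```python
import numpy as np

def read_slice_values(fh, vname, slicebuilder, dimsizes, scale_offset):
    """Read one slice, optionally letting Fimex apply scale_factor/add_offset."""
    if scale_offset:
        data = fh.getScaledDataSliceSB(vname, slicebuilder)
    else:
        data = fh.getDataSliceSB(vname, slicebuilder)
    values = data.values()
    return np.reshape(values, dimsizes) if dimsizes else values
```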

try:
scale_factor = self.cdm.getAttribute(vname, "scale_factor").getData().values()
except RuntimeError:
scale_factor = 1
try:
add_offset = self.cdm.getAttribute(vname, "add_offset").getData().values()
except RuntimeError:
add_offset = 0
data_shaped = data_shaped*scale_factor + add_offset

return data_shaped
22 changes: 22 additions & 0 deletions modules/xarray/pyproject.toml
@@ -0,0 +1,22 @@
[build-system]
requires = ["setuptools", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "fimex_xarray"
version = "0.0.1"
readme = "README.md"

dependencies = [
# "pyfimex0",
"numpy",
"xarray",
]

[project.optional-dependencies]
dev = [
"pytest",
]

[project.entry-points."xarray.backends"]
fimex = "fimex_xarray.xr:FimexBackendEntrypoint"
35 changes: 35 additions & 0 deletions modules/xarray/tests/test_fimex.py
@@ -0,0 +1,35 @@
import xarray
import os
import numpy as np

test_srcdir = os.path.join(os.path.dirname(__file__), "..", "..", "..", "test")


def test_open_verticalOceanSG2():
path = os.path.join(test_srcdir, "verticalOceanSG2.nc")
ds = xarray.open_dataset(path, engine="fimex", decode_times=False)

temp = ds["temp"]
selection = temp.isel(ocean_time=3, s_rho=[4, 6, 9], xi_rho=range(4, 7))
# Accessing .values forces the lazily indexed selection to be read from the file
values = selection.values


def test_isel_slice():
path = os.path.join(test_srcdir, "verticalOceanSG2.nc")
ds = xarray.open_dataset(path, engine="fimex", decode_times=False)

zeta = ds["zeta"]

zeta_sub = ds.isel(eta_rho=slice(None, None, None))["zeta"]
assert np.all(zeta == zeta_sub)

zeta_sub = ds.isel(eta_rho=slice(2, None, None))["zeta"]
zeta_isel = zeta.isel(eta_rho=slice(2, None, None))
assert np.all(zeta_isel == zeta_sub)

zeta_sub = ds.isel(eta_rho=slice(2, 10, None))["zeta"]
zeta_isel = zeta.isel(eta_rho=slice(2, 10, None))
assert np.all(zeta_isel == zeta_sub)