Skip to content

Commit

Permalink
Merge pull request #250 from bird-house/chunk-performance
Browse files Browse the repository at this point in the history
Removed mandatory chunking for local files
  • Loading branch information
aulemahal authored Nov 4, 2022
2 parents 3feca0e + 75ab092 commit 72e72be
Show file tree
Hide file tree
Showing 4 changed files with 24 additions and 9 deletions.
1 change: 1 addition & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ Changes
- Input "rcp" has been renamed to "scenario".
- Input "dataset_name" has been fixed and renamed to "dataset".
* Update to xclim 0.38.0.
* Improved subset_grid_point_dataset & subset_bbox_dataset performance when using local files.

0.9.2 (2022-07-19)
==================
Expand Down
3 changes: 2 additions & 1 deletion finch/processes/subset.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,8 @@ def _subset(resource: ComplexInput):

# if not subsetting by time, it's not necessary to decode times
time_subset = start_date is not None or end_date is not None
dataset = try_opendap(resource, decode_times=time_subset)
# No chunking needed for a single gridpoint.
dataset = try_opendap(resource, chunks=False, decode_times=time_subset)

with lock:
count += 1
Expand Down
25 changes: 19 additions & 6 deletions finch/processes/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -356,36 +356,49 @@ def drs_filename(ds: xr.Dataset, variable: str = None):
def try_opendap(
input: ComplexInput,
*,
chunks=None,
chunks='auto',
decode_times=True,
chunk_dims=None,
logging_function=lambda message: None,
) -> xr.Dataset:
"""Try to open the file as an OPeNDAP url and chunk it.
If OPeNDAP fails, access the file directly.
By default, chunks are determined by xarray/dask.
If `chunks=None` or `chunk_dims` is given, finch rechunks the dataset according to
the logic of `chunk_dataset`.
Pass `chunks=False` to disable dask entirely on this dataset.
"""
url = input.url
logging_function(f"Try opening DAP link {url}")

if is_opendap_url(url):
ds = xr.open_dataset(url, chunks=chunks, decode_times=decode_times)
path = url
logging_function(f"Opened dataset as an OPeNDAP url: {url}")
else:
if url.startswith("http"):
# Accessing the file property writes it to disk if it's a url
logging_function(f"Downloading dataset for url: {url}")
else:
logging_function(f"Opening as local file: {input.file}")
path = input.file

ds = xr.open_dataset(input.file, chunks=chunks, decode_times=decode_times)
try:
# Try to open the dataset
ds = xr.open_dataset(path, chunks=chunks or None, decode_times=decode_times)
except NotImplementedError:
if chunks == 'auto':
# Some dtypes are not compatible with auto chunking (object, so unbounded strings)
logging_function("xarray auto-chunking failed, opening with no chunks and inferring chunks ourselves.")
chunks = None
ds = xr.open_dataset(path, chunks=None, decode_times=decode_times)
else:
raise

# To handle large number of grid cells (50+) in subsetted data
if "region" in ds.dims and "time" in ds.dims:
chunks = dict(time=-1, region=5)
ds = ds.chunk(chunks)

if not chunks:
elif chunks is None or chunk_dims is not None:
ds = ds.chunk(chunk_dataset(ds, max_size=1000000, chunk_dims=chunk_dims))
return ds

Expand Down
4 changes: 2 additions & 2 deletions finch/processes/wps_sdba.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,8 +162,8 @@ def _log(message, percentage):
ds = try_opendap(request.inputs[key][0])
name = variable or list(ds.data_vars)[0]

# Force calendar to noleap
res[key] = convert_calendar(ds[name], "noleap")
# Force calendar to noleap and rechunk
res[key] = convert_calendar(ds[name], "noleap").chunk({'time': -1})

elif key in group_args:
group[key] = single_input_or_none(request.inputs, key)
Expand Down

0 comments on commit 72e72be

Please sign in to comment.