From 76dd9b89d6a3845677f7f155b3c89cfbdbee3d6b Mon Sep 17 00:00:00 2001
From: Louis-David Perron <100434291+perronld@users.noreply.github.com>
Date: Wed, 20 Jul 2022 08:52:12 -0400
Subject: [PATCH 1/6] Removed mandatory chunking for local files

---
 finch/processes/utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/finch/processes/utils.py b/finch/processes/utils.py
index 5cb59988..fc301825 100644
--- a/finch/processes/utils.py
+++ b/finch/processes/utils.py
@@ -301,6 +301,8 @@ def try_opendap(
     if is_opendap_url(url):
         ds = xr.open_dataset(url, chunks=chunks, decode_times=decode_times)
         logging_function(f"Opened dataset as an OPeNDAP url: {url}")
+        if not chunks:
+            ds = ds.chunk(chunk_dataset(ds, max_size=1000000, chunk_dims=chunk_dims))
     else:
         if url.startswith("http"):
             # Accessing the file property writes it to disk if it's a url
@@ -315,8 +317,6 @@ def try_opendap(
         chunks = dict(time=-1, region=5)
         ds = ds.chunk(chunks)
 
-    if not chunks:
-        ds = ds.chunk(chunk_dataset(ds, max_size=1000000, chunk_dims=chunk_dims))
     return ds
 
 

From 36d57ecac7eeedca3193572b5ab8358d5564f400 Mon Sep 17 00:00:00 2001
From: Louis-David Perron <100434291+perronld@users.noreply.github.com>
Date: Wed, 20 Jul 2022 15:55:55 -0400
Subject: [PATCH 2/6] Added comment in CHANGES.rst

---
 CHANGES.rst | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/CHANGES.rst b/CHANGES.rst
index 417a9933..cd8c8576 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -1,6 +1,10 @@
 Changes
 *******
 
+0.9.3 (unreleased)
+==================
+* Improved subset_grid_point_dataset & subset_bbox_dataset performance when using local files
+
 0.9.2 (2022-07-19)
 ==================
 * Fix Finch unable to startup in the Docker image.

From c2610c73242c85adf0287c723076d2069c014bb3 Mon Sep 17 00:00:00 2001
From: Pascal Bourgault
Date: Fri, 22 Jul 2022 12:47:17 -0400
Subject: [PATCH 3/6] Switch to auto chunking - disable on gridpoint

---
 finch/processes/subset.py |  3 ++-
 finch/processes/utils.py  | 16 +++++++++-------
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/finch/processes/subset.py b/finch/processes/subset.py
index a3884b85..69c134c6 100644
--- a/finch/processes/subset.py
+++ b/finch/processes/subset.py
@@ -83,7 +83,8 @@ def _subset(resource: ComplexInput):
 
     # if not subsetting by time, it's not necessary to decode times
     time_subset = start_date is not None or end_date is not None
-    dataset = try_opendap(resource, decode_times=time_subset)
+    # No chunking needed for a single gridpoint.
+    dataset = try_opendap(resource, chunks=False, decode_times=time_subset)
 
     with lock:
         count += 1

diff --git a/finch/processes/utils.py b/finch/processes/utils.py
index fc301825..5a171587 100644
--- a/finch/processes/utils.py
+++ b/finch/processes/utils.py
@@ -286,23 +286,24 @@ def drs_filename(ds: xr.Dataset, variable: str = None):
 def try_opendap(
     input: ComplexInput,
     *,
-    chunks=None,
+    chunks='auto',
     decode_times=True,
     chunk_dims=None,
     logging_function=lambda message: None,
 ) -> xr.Dataset:
     """Try to open the file as an OPeNDAP url and chunk it.
 
-    If OPeNDAP fails, access the file directly.
+    By default, chunks are to be determined by xarray/dask.
+    If `chunks=None` or `chunk_dims` is given, finch rechunks the dataset according to 
+    the logic of `chunk_dataset`.
+    Pass `chunks=False` to disable dask entirely on this dataset.
""" url = input.url logging_function(f"Try opening DAP link {url}") if is_opendap_url(url): - ds = xr.open_dataset(url, chunks=chunks, decode_times=decode_times) + ds = xr.open_dataset(url, chunks=chunks or None, decode_times=decode_times) logging_function(f"Opened dataset as an OPeNDAP url: {url}") - if not chunks: - ds = ds.chunk(chunk_dataset(ds, max_size=1000000, chunk_dims=chunk_dims)) else: if url.startswith("http"): # Accessing the file property writes it to disk if it's a url @@ -310,13 +311,14 @@ def try_opendap( else: logging_function(f"Opening as local file: {input.file}") - ds = xr.open_dataset(input.file, chunks=chunks, decode_times=decode_times) + ds = xr.open_dataset(input.file, chunks=chunks or None, decode_times=decode_times) # To handle large number of grid cells (50+) in subsetted data if "region" in ds.dims and "time" in ds.dims: chunks = dict(time=-1, region=5) ds = ds.chunk(chunks) - + elif chunks is None or chunk_dims is not None: + ds = ds.chunk(chunk_dataset(ds, max_size=1000000, chunk_dims=chunk_dims)) return ds From 1dd4100e6dff31864f2b7f982620e87dccdb94d5 Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Fri, 22 Jul 2022 12:49:24 -0400 Subject: [PATCH 4/6] lint --- finch/processes/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/finch/processes/utils.py b/finch/processes/utils.py index 5a171587..576888d7 100644 --- a/finch/processes/utils.py +++ b/finch/processes/utils.py @@ -294,7 +294,7 @@ def try_opendap( """Try to open the file as an OPeNDAP url and chunk it. By default, chunks are to be determined by xarray/dask. - If `chunks=None` or `chunks_dims` is given, finch rechunks the dataset according to + If `chunks=None` or `chunks_dims` is given, finch rechunks the dataset according to the logic of `chunk_dataset`. Pass `chunks=False` to disable dask entirely on this dataset. 
""" From 9afd3a6d9446f289e2f9f919ff9cb590c6297906 Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Thu, 13 Oct 2022 14:36:02 -0400 Subject: [PATCH 5/6] Avoid failing with object vars - rechunk for sdba --- finch/processes/utils.py | 15 +++++++++++++-- finch/processes/wps_sdba.py | 4 ++-- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/finch/processes/utils.py b/finch/processes/utils.py index 576888d7..b51185ac 100644 --- a/finch/processes/utils.py +++ b/finch/processes/utils.py @@ -302,7 +302,7 @@ def try_opendap( logging_function(f"Try opening DAP link {url}") if is_opendap_url(url): - ds = xr.open_dataset(url, chunks=chunks or None, decode_times=decode_times) + path = url logging_function(f"Opened dataset as an OPeNDAP url: {url}") else: if url.startswith("http"): @@ -310,8 +310,19 @@ def try_opendap( logging_function(f"Downloading dataset for url: {url}") else: logging_function(f"Opening as local file: {input.file}") + path = input.file - ds = xr.open_dataset(input.file, chunks=chunks or None, decode_times=decode_times) + try: + # Try to open the dataset + ds = xr.open_dataset(path, chunks=chunks or None, decode_times=decode_times) + except NotImplementedError: + if chunks == 'auto': + # Some dtypes are not compatible with auto chunking (object, so unbounded strings) + logging_function(f"xarray auto-chunking failed, opening with no chunks and inferring chunks ourselves.") + chunks = None + ds = xr.open_dataset(path, chunks=None, decode_times=decode_times) + else: + raise # To handle large number of grid cells (50+) in subsetted data if "region" in ds.dims and "time" in ds.dims: diff --git a/finch/processes/wps_sdba.py b/finch/processes/wps_sdba.py index f8b5c215..9ebf47b8 100644 --- a/finch/processes/wps_sdba.py +++ b/finch/processes/wps_sdba.py @@ -163,8 +163,8 @@ def _log(message, percentage): ds = try_opendap(request.inputs[key][0]) name = variable or list(ds.data_vars)[0] - # Force calendar to noleap - res[key] = convert_calendar(ds[name], "noleap") + # Force calendar to noleap and rechunk + res[key] = convert_calendar(ds[name], "noleap").chunk({'time': -1}) elif key in group_args: group[key] = single_input_or_none(request.inputs, key) From 203fa93cba2266d6f942bee88fe57415b197a561 Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Thu, 13 Oct 2022 14:37:25 -0400 Subject: [PATCH 6/6] remove unused f --- finch/processes/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/finch/processes/utils.py b/finch/processes/utils.py index b51185ac..9fdaff9c 100644 --- a/finch/processes/utils.py +++ b/finch/processes/utils.py @@ -318,7 +318,7 @@ def try_opendap( except NotImplementedError: if chunks == 'auto': # Some dtypes are not compatible with auto chunking (object, so unbounded strings) - logging_function(f"xarray auto-chunking failed, opening with no chunks and inferring chunks ourselves.") + logging_function("xarray auto-chunking failed, opening with no chunks and inferring chunks ourselves.") chunks = None ds = xr.open_dataset(path, chunks=None, decode_times=decode_times) else: