diff --git a/rechunk.ipynb b/rechunk.ipynb
new file mode 100644
index 0000000..b89929c
--- /dev/null
+++ b/rechunk.ipynb
@@ -0,0 +1,183 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "75688ac6-879d-4449-b73e-74f03a5f991f",
+   "metadata": {
+    "tags": [],
+    "user_expressions": []
+   },
+   "source": [
+    "\n",
+    "\n",
+    "# Geospatial Dataset Rechunking\n",
+    "\n",
+    "This notebook rechunks data from the NOAA National Water Model (NWM) retrospective archive: https://registry.opendata.aws/nwm-archive/"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5dd71599-465f-4c97-baaa-19d900d2a070",
+   "metadata": {
+    "user_expressions": []
+   },
+   "source": [
+    "## Set up cluster"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "24beda07-03c8-4a23-8600-80dbe10298ce",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import dask\n",
+    "\n",
+    "dask.config.set({\n",
+    "    \"array.rechunk.method\": \"p2p\",  # rechunk with the peer-to-peer (P2P) implementation\n",
+    "    \"optimization.fuse.active\": False,  # turn off task fusion during graph optimization\n",
+    "    \"distributed.comm.retry.count\": 20,  # retry failed worker communications\n",
+    "    \"distributed.comm.timeouts.connect\": 120,  # be generous when establishing connections\n",
+    "});"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "60b08a1c-d042-40f2-aaaa-e7665ca85d64",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import coiled\n",
+    "\n",
+    "cluster = coiled.Cluster(\n",
+    "    n_workers=100,\n",
+    "    region=\"us-east-1\",\n",
+    ")\n",
+    "client = cluster.get_client()\n",
+    "client"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8185966d-6659-482b-bcbb-826b8f30b1e3",
+   "metadata": {},
+   "source": [
+    "## Load NWM data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e8b1749a-0d64-4278-823c-892120bf1a5b",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import xarray as xr\n",
+    "\n",
+    "ds = xr.open_zarr(\n",
+    "    \"s3://noaa-nwm-retrospective-2-1-zarr-pds/rtout.zarr\",\n",
+    "    consolidated=True,\n",
+    ").drop_encoding()\n",
+    "ds"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2147fc5c-60ee-4409-8c22-69c5e68a4c63",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "ds.nbytes / 1e12  # total size in TB (about half a petabyte)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0911fb96-7c08-4ca6-a35a-22e2a5a908cd",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "## Time-optimized rechunking\n",
+    "\n",
+    "Let's look at two months' worth of data (~1 TB) and rechunk it so that selections along the time dimension (e.g. a time series at one location) are fast."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2a6fb91d-6a02-4afc-8d8a-ec3529f805f4",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "data = ds.zwattablrt.sel(time=slice(\"2020-01-01\", \"2020-03-01\"))  # ~1 TB of data\n",
+    "data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8057c72c-7212-49fa-ad18-7aa346beb8cc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "result = data.chunk({\"time\": -1, \"x\": \"auto\", \"y\": \"auto\"})  # -1: one chunk spanning the whole time axis\n",
+    "result"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "68c7e99e-dec7-4201-9344-2738e5f8bca3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "result.to_zarr(\"s3://oss-scratch-space/nwm-time-optimized.zarr\", mode=\"w\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "57e3a741-ad69-4f54-9094-78586a59d29e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import fsspec\n",
+    "\n",
+    "fs = fsspec.filesystem(\"s3\")\n",
+    "fs.ls(\"s3://oss-scratch-space/nwm-time-optimized.zarr/\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}