Skip to content

Commit

Permalink
WIP attempt to implement tiff reader using zarr
Browse files Browse the repository at this point in the history
  • Loading branch information
TomNicholas committed Nov 14, 2024
1 parent ec9748c commit 29a407e
Show file tree
Hide file tree
Showing 3 changed files with 87 additions and 57 deletions.
82 changes: 33 additions & 49 deletions virtualizarr/readers/tiff.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,15 @@
import warnings
from typing import Iterable, Mapping, Optional

from xarray import Dataset, Index
from xarray import DataArray, Dataset, Index
import zarr

from virtualizarr.readers.common import (
VirtualBackend,
construct_virtual_dataset,
open_loadable_vars_and_indexes,
)
from virtualizarr.translators.kerchunk import (
virtual_vars_and_metadata_from_kerchunk_refs,
)
from virtualizarr.types.kerchunk import KerchunkStoreRefs
from virtualizarr.utils import check_for_collisions
from virtualizarr.readers.common import VirtualBackend
from virtualizarr.readers.zarr import virtual_variable_from_zarr_array


class TIFFVirtualBackend(VirtualBackend):
@staticmethod
def open_virtual_dataset(
def open_virtual_dataarray(
filepath: str,
group: str | None = None,
drop_variables: Iterable[str] | None = None,
Expand All @@ -26,48 +18,40 @@ def open_virtual_dataset(
indexes: Mapping[str, Index] | None = None,
reader_options: Optional[dict] = None,
) -> Dataset:
from kerchunk.tiff import tiff_to_zarr

from tifffile import imread

drop_variables, loadable_variables = check_for_collisions(
drop_variables=drop_variables, loadable_variables=loadable_variables
)
store = imread(filepath, aszarr=True)

if reader_options is None:
reader_options = {}
# TODO exception handling for TIFF files with multiple arrays
za = zarr.open_array(store=store, mode="r")

reader_options.pop("storage_options", {})
warnings.warn(
"storage_options have been dropped from reader_options as they are not supported by kerchunk.tiff.tiff_to_zarr",
UserWarning,
)
vv = virtual_variable_from_zarr_array(za)

# handle inconsistency in kerchunk, see GH issue https://github.com/zarr-developers/VirtualiZarr/issues/160
refs = KerchunkStoreRefs({"refs": tiff_to_zarr(filepath, **reader_options)})
# TODO should we generate any pixel coordinate arrays like kerchunk seems to do?

print(refs)
return DataArray(data=vv, dims=vv.dims, attrs=za.attrs)

# refs = extract_group(refs, group)
@staticmethod
def open_virtual_dataset(
filepath: str,
group: str | None = None,
drop_variables: Iterable[str] | None = None,
loadable_variables: Iterable[str] | None = None,
decode_times: bool | None = None,
indexes: Mapping[str, Index] | None = None,
reader_options: Optional[dict] = None,
) -> Dataset:

from tifffile import imread

virtual_vars, attrs, coord_names = virtual_vars_and_metadata_from_kerchunk_refs(
refs,
loadable_variables,
drop_variables,
)
store = imread(filepath, aszarr=True)

loadable_vars, indexes = open_loadable_vars_and_indexes(
filepath,
loadable_variables=loadable_variables,
reader_options=reader_options,
drop_variables=drop_variables,
indexes=indexes,
group=group,
decode_times=decode_times,
)
try:
zg = zarr.open_group(store, mode="r")
except zarr.errors.ContainsArrayError as err:
# TODO tidy this up
print("TIFF file contains only a single array, please use `open_virtual_dataarray` instead")
raise

return construct_virtual_dataset(
virtual_vars=virtual_vars,
loadable_vars=loadable_vars,
indexes=indexes,
coord_names=coord_names,
attrs=attrs,
)
raise NotImplementedError()
50 changes: 50 additions & 0 deletions virtualizarr/readers/zarr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import io

from xarray import Variable
import zarr

from virtualizarr.zarr import ZArray
from virtualizarr.manifests import ChunkManifest, ManifestArray


def virtual_variable_from_zarr_array(za: zarr.Array) -> Variable:
    """
    Wrap a single zarr.Array in a virtual xarray.Variable.

    The variable's data is a ManifestArray built from the array's chunk
    manifest and a ZArray describing its shape, chunks, dtype, fill value,
    order, compressor and filters. Dimension names are read from the
    ``_ARRAY_DIMENSIONS`` attribute, which is removed from the attrs handed to
    the Variable; a KeyError is raised if that attribute is absent.
    """
    # TODO this only works with zarr-python v2 for now

    # Work on a copy so that popping the dimension names leaves za.attrs alone.
    remaining_attrs = dict(za.attrs)

    # TODO handle v3 DIMENSION_NAMES too
    dim_names = remaining_attrs.pop("_ARRAY_DIMENSIONS")

    array_metadata = {
        "shape": za.shape,
        "chunks": za.chunks,
        "dtype": za.dtype,
        "fill_value": za.fill_value,
        "order": za.order,
        "compressor": za.compressor,
        "filters": za.filters,
        # zarr_format=za.zarr_format is not forwarded (yet)
    }
    zarray = ZArray(**array_metadata)

    virtual_data = ManifestArray(
        chunkmanifest=chunkmanifest_from_zarr_array(za),
        zarray=zarray,
    )

    return Variable(data=virtual_data, dims=dim_names, attrs=remaining_attrs)


def chunkmanifest_from_zarr_array(za: zarr.Array) -> ChunkManifest:
    """
    Dump the fsspec references of the store behind ``za`` and print them.

    The store's ``write_fsspec`` output is captured in an in-memory text
    buffer, parsed as JSON and printed.

    NOTE(review): there is no return statement, so this currently yields None
    despite the ChunkManifest annotation -- the conversion of the parsed
    references into a ChunkManifest still has to be written.
    """
    import ujson

    # TODO handle remote urls
    buffer = io.StringIO()
    za.store.write_fsspec(buffer)  # , url=url)

    parsed_refs = ujson.loads(buffer.getvalue())

    print(parsed_refs)
12 changes: 4 additions & 8 deletions virtualizarr/tests/test_readers/test_tiff.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,16 @@
import numpy as np
from xarray import Dataset
from xarray import DataArray

from virtualizarr import open_virtual_dataset
from virtualizarr import open_virtual_dataarray
from virtualizarr.manifests import ManifestArray
from virtualizarr.tests import requires_pillow


@requires_pillow
def test_random_tiff(random_tiff):
vds = open_virtual_dataset(random_tiff, indexes={})
vda = open_virtual_dataarray(random_tiff, indexes={})

assert isinstance(vds, Dataset)

# TODO what is the name of this array expected to be??
assert list(vds.variables) == ["foo"]
vda = vds["foo"]
assert isinstance(vda, DataArray)

assert vda.sizes == {"X": 128, "Y": 128}
assert vda.dtype == np.uint8
Expand Down

0 comments on commit 29a407e

Please sign in to comment.