From e3a305869557ddc6e9845affe93e7639afb1ac7a Mon Sep 17 00:00:00 2001 From: Robin Cole Date: Thu, 28 Nov 2024 09:41:11 +0000 Subject: [PATCH 01/11] add tiffile --- requirements.txt | 1 + src/litdata/streaming/serializers.py | 31 +++++++++++++++++++++++++++- 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 06a629a0..2021c8e6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,4 @@ filelock numpy boto3 requests +tifffile \ No newline at end of file diff --git a/src/litdata/streaming/serializers.py b/src/litdata/streaming/serializers.py index f453f538..3a50430c 100644 --- a/src/litdata/streaming/serializers.py +++ b/src/litdata/streaming/serializers.py @@ -20,6 +20,7 @@ from contextlib import suppress from copy import deepcopy from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union +import tifffile import numpy as np import torch @@ -387,13 +388,41 @@ def can_serialize(self, data: float) -> bool: return isinstance(data, float) +class TIFFSerializer(Serializer): + """Serializer for TIFF files using tifffile.""" + + def serialize(self, item: Any) -> Tuple[bytes, Optional[str]]: + if not isinstance(item, str) or not os.path.isfile(item): + raise ValueError(f"The item to serialize must be a valid file path. Received: {item}") + + # Read the TIFF file as bytes + with open(item, "rb") as f: + data = f.read() + + # Store metadata if needed + _, ext = os.path.splitext(item) + metadata = f"tifffile:{ext.lower()}" + return data, metadata + + def deserialize(self, data: bytes) -> Any: + image_array = tifffile.imread(io.BytesIO(data)) + return image_array # This is a NumPy array + + def can_serialize(self, item: Any) -> bool: + return ( + isinstance(item, str) + and os.path.isfile(item) + and item.lower().endswith(('.tif', '.tiff')) + ) + + _SERIALIZERS = OrderedDict( **{ "str": StringSerializer(), "int": IntegerSerializer(), "float": FloatSerializer(), "video": VideoSerializer(), - "tif": FileSerializer(), + "tifffile": TIFFSerializer(), "file": FileSerializer(), "pil": PILSerializer(), "jpeg": JPEGSerializer(), From 876f6d8dac3712d01e5e0339d1fb423a99bb7d19 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 28 Nov 2024 09:46:51 +0000 Subject: [PATCH 02/11] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- requirements.txt | 2 +- src/litdata/streaming/serializers.py | 8 ++------ 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/requirements.txt b/requirements.txt index 2021c8e6..76da06d2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,4 +4,4 @@ filelock numpy boto3 requests -tifffile \ No newline at end of file +tifffile diff --git a/src/litdata/streaming/serializers.py b/src/litdata/streaming/serializers.py index 3a50430c..6748de96 100644 --- a/src/litdata/streaming/serializers.py +++ b/src/litdata/streaming/serializers.py @@ -20,9 +20,9 @@ from contextlib import suppress from copy import deepcopy from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union -import tifffile import numpy as np +import tifffile import torch from lightning_utilities.core.imports import RequirementCache @@ -409,11 +409,7 @@ def deserialize(self, data: bytes) -> Any: return image_array # This is a NumPy array def can_serialize(self, item: Any) -> bool: - return ( - isinstance(item, str) - and os.path.isfile(item) - and item.lower().endswith(('.tif', '.tiff')) - ) + return isinstance(item, str) and os.path.isfile(item) and item.lower().endswith((".tif", ".tiff")) _SERIALIZERS = OrderedDict( From 7dd69c12db65717695d3ae8c746f04414cc1dba2 Mon Sep 17 00:00:00 2001 From: Robin Cole Date: Thu, 28 Nov 2024 10:15:48 +0000 Subject: [PATCH 03/11] Add test --- tests/streaming/test_serializer.py | 48 +++++++++++++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) diff --git a/tests/streaming/test_serializer.py b/tests/streaming/test_serializer.py index 70d10a41..a3db3a4d 100644 --- a/tests/streaming/test_serializer.py +++ b/tests/streaming/test_serializer.py @@ -34,6 +34,7 @@ NumpySerializer, PILSerializer, TensorSerializer, + TIFFSerializer, VideoSerializer, _get_serializers, ) @@ -46,6 +47,7 @@ def seed_everything(random_seed): _PIL_AVAILABLE = RequirementCache("PIL") +_TIFFFILE_AVAILABLE = RequirementCache("tifffile") def test_serializers(): @@ -55,7 +57,7 @@ def test_serializers(): "int", "float", "video", - "tif", + "tifffile", "file", "pil", "jpeg", @@ -265,3 +267,47 @@ def test_deserialize_empty_no_header_tensor(): serializer.setup(name) new_t = serializer.deserialize(data) assert torch.equal(t, new_t) + + +@pytest.mark.skipif(not _TIFFFILE_AVAILABLE, reason="Requires: ['tifffile']") +def test_tiff_serializer(): + serializer = TIFFSerializer() + + # Create a synthetic multispectral image + height, width, bands = 28, 28, 12 + np_data = np.random.randint(0, 65535, size=(height, width, bands), dtype=np.uint16) + + # Write to a temporary file + import tempfile + with tempfile.NamedTemporaryFile(suffix='.tif', delete=False) as tmp_file: + tifffile.imwrite(tmp_file.name, np_data) + file_path = tmp_file.name + + # Test can_serialize + assert serializer.can_serialize(file_path) + + # Serialize + data, metadata = serializer.serialize(file_path) + assert isinstance(data, bytes) + assert metadata == f"tiff:{os.path.splitext(file_path)[1].lower()}" + + # Deserialize + serializer.setup(metadata) + deserialized_data = serializer.deserialize(data) + assert isinstance(deserialized_data, np.ndarray) + assert deserialized_data.shape == (height, width, bands) + assert deserialized_data.dtype == np.uint16 + + # Validate data content + assert np.array_equal(np_data, deserialized_data) + + # Clean up + os.remove(file_path) + +def test_tiff_serializer_can_serialize(): + serializer = TIFFSerializer() + assert serializer.can_serialize('image.tif') + assert serializer.can_serialize('image.tiff') + assert not serializer.can_serialize('image.jpg') + assert not serializer.can_serialize(None) + assert not serializer.can_serialize(123) \ No newline at end of file From 9bc589cf015de745abcb7b922b6107c248375895 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 28 Nov 2024 10:21:49 +0000 Subject: [PATCH 04/11] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/streaming/test_serializer.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tests/streaming/test_serializer.py b/tests/streaming/test_serializer.py index a3db3a4d..b008f609 100644 --- a/tests/streaming/test_serializer.py +++ b/tests/streaming/test_serializer.py @@ -279,7 +279,8 @@ def test_tiff_serializer(): # Write to a temporary file import tempfile - with tempfile.NamedTemporaryFile(suffix='.tif', delete=False) as tmp_file: + + with tempfile.NamedTemporaryFile(suffix=".tif", delete=False) as tmp_file: tifffile.imwrite(tmp_file.name, np_data) file_path = tmp_file.name @@ -304,10 +305,11 @@ def test_tiff_serializer(): # Clean up os.remove(file_path) + def test_tiff_serializer_can_serialize(): serializer = TIFFSerializer() - assert serializer.can_serialize('image.tif') - assert serializer.can_serialize('image.tiff') - assert not serializer.can_serialize('image.jpg') + assert serializer.can_serialize("image.tif") + assert serializer.can_serialize("image.tiff") + assert not serializer.can_serialize("image.jpg") assert not serializer.can_serialize(None) - assert not serializer.can_serialize(123) \ No newline at end of file + assert not serializer.can_serialize(123) From 9eb578359e764af72c05cb9fa864930e774752a6 Mon Sep 17 00:00:00 2001 From: Robin Cole Date: Thu, 28 Nov 2024 10:30:50 +0000 Subject: [PATCH 05/11] Update import --- src/litdata/streaming/serializers.py | 3 +-- tests/streaming/test_serializer.py | 5 ++--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/src/litdata/streaming/serializers.py b/src/litdata/streaming/serializers.py index 6748de96..db957e18 100644 --- a/src/litdata/streaming/serializers.py +++ b/src/litdata/streaming/serializers.py @@ -405,8 +405,7 @@ def serialize(self, item: Any) -> Tuple[bytes, Optional[str]]: return data, metadata def deserialize(self, data: bytes) -> Any: - image_array = tifffile.imread(io.BytesIO(data)) - return image_array # This is a NumPy array + return tifffile.imread(io.BytesIO(data)) # This is a NumPy array def can_serialize(self, item: Any) -> bool: return isinstance(item, str) and os.path.isfile(item) and item.lower().endswith((".tif", ".tiff")) diff --git a/tests/streaming/test_serializer.py b/tests/streaming/test_serializer.py index b008f609..01390327 100644 --- a/tests/streaming/test_serializer.py +++ b/tests/streaming/test_serializer.py @@ -16,6 +16,8 @@ import random import sys from unittest import mock +import tempfile +import tifffile import numpy as np import pytest @@ -277,9 +279,6 @@ def test_tiff_serializer(): height, width, bands = 28, 28, 12 np_data = np.random.randint(0, 65535, size=(height, width, bands), dtype=np.uint16) - # Write to a temporary file - import tempfile - with tempfile.NamedTemporaryFile(suffix=".tif", delete=False) as tmp_file: tifffile.imwrite(tmp_file.name, np_data) file_path = tmp_file.name From 86fda0b08ea7170e265fd9d9ac224cd40891f2b9 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 28 Nov 2024 10:31:02 +0000 Subject: [PATCH 06/11] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/streaming/test_serializer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/streaming/test_serializer.py b/tests/streaming/test_serializer.py index 01390327..7ef829a6 100644 --- a/tests/streaming/test_serializer.py +++ b/tests/streaming/test_serializer.py @@ -15,12 +15,12 @@ import os import random import sys -from unittest import mock import tempfile -import tifffile +from unittest import mock import numpy as np import pytest +import tifffile import torch from lightning_utilities.core.imports import RequirementCache from litdata.streaming.serializers import ( From 3c8c3bd5372555dff4a357cc2e68e29a57a9afe5 Mon Sep 17 00:00:00 2001 From: Robin Cole Date: Thu, 28 Nov 2024 12:08:22 +0000 Subject: [PATCH 07/11] Remove metadata --- src/litdata/streaming/serializers.py | 5 +---- tests/streaming/test_serializer.py | 4 +--- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/src/litdata/streaming/serializers.py b/src/litdata/streaming/serializers.py index db957e18..01181b2e 100644 --- a/src/litdata/streaming/serializers.py +++ b/src/litdata/streaming/serializers.py @@ -399,10 +399,7 @@ def serialize(self, item: Any) -> Tuple[bytes, Optional[str]]: with open(item, "rb") as f: data = f.read() - # Store metadata if needed - _, ext = os.path.splitext(item) - metadata = f"tifffile:{ext.lower()}" - return data, metadata + return data def deserialize(self, data: bytes) -> Any: return tifffile.imread(io.BytesIO(data)) # This is a NumPy array diff --git a/tests/streaming/test_serializer.py b/tests/streaming/test_serializer.py index 7ef829a6..97be9ec9 100644 --- a/tests/streaming/test_serializer.py +++ b/tests/streaming/test_serializer.py @@ -287,12 +287,10 @@ def test_tiff_serializer(): assert serializer.can_serialize(file_path) # Serialize - data, metadata = serializer.serialize(file_path) + data = serializer.serialize(file_path) assert isinstance(data, bytes) - assert metadata == f"tiff:{os.path.splitext(file_path)[1].lower()}" # Deserialize - serializer.setup(metadata) deserialized_data = serializer.deserialize(data) assert isinstance(deserialized_data, np.ndarray) assert deserialized_data.shape == (height, width, bands) From 1e696a2dfce1ae4b89efa1ff3e384a5e6e525def Mon Sep 17 00:00:00 2001 From: Robin Cole Date: Thu, 28 Nov 2024 12:13:56 +0000 Subject: [PATCH 08/11] fix return --- src/litdata/streaming/serializers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/litdata/streaming/serializers.py b/src/litdata/streaming/serializers.py index 01181b2e..41c58179 100644 --- a/src/litdata/streaming/serializers.py +++ b/src/litdata/streaming/serializers.py @@ -399,7 +399,7 @@ def serialize(self, item: Any) -> Tuple[bytes, Optional[str]]: with open(item, "rb") as f: data = f.read() - return data + return data, None def deserialize(self, data: bytes) -> Any: return tifffile.imread(io.BytesIO(data)) # This is a NumPy array From ce08419d3793aeb6f12fed19446c0e030a6d8892 Mon Sep 17 00:00:00 2001 From: Robin Cole Date: Thu, 28 Nov 2024 12:47:00 +0000 Subject: [PATCH 09/11] fix test_tiff_serializer --- src/litdata/streaming/serializers.py | 1 + tests/streaming/test_serializer.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/litdata/streaming/serializers.py b/src/litdata/streaming/serializers.py index 41c58179..9e3b7665 100644 --- a/src/litdata/streaming/serializers.py +++ b/src/litdata/streaming/serializers.py @@ -405,6 +405,7 @@ def deserialize(self, data: bytes) -> Any: return tifffile.imread(io.BytesIO(data)) # This is a NumPy array def can_serialize(self, item: Any) -> bool: + print(f"Checking can_serialize for: {item}, exists: {os.path.isfile(item)}, suffix: {item.lower().endswith(('.tif', '.tiff'))}") return isinstance(item, str) and os.path.isfile(item) and item.lower().endswith((".tif", ".tiff")) diff --git a/tests/streaming/test_serializer.py b/tests/streaming/test_serializer.py index 97be9ec9..952014ee 100644 --- a/tests/streaming/test_serializer.py +++ b/tests/streaming/test_serializer.py @@ -287,9 +287,10 @@ def test_tiff_serializer(): assert serializer.can_serialize(file_path) # Serialize - data = serializer.serialize(file_path) + data, _ = serializer.serialize(file_path) assert isinstance(data, bytes) + # Deserialize deserialized_data = serializer.deserialize(data) assert isinstance(deserialized_data, np.ndarray) From 8c0366d863e898589652e9c8ea02e9dae5443d00 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 28 Nov 2024 12:47:29 +0000 Subject: [PATCH 10/11] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/litdata/streaming/serializers.py | 4 +++- tests/streaming/test_serializer.py | 1 - 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/litdata/streaming/serializers.py b/src/litdata/streaming/serializers.py index 9e3b7665..62e386ed 100644 --- a/src/litdata/streaming/serializers.py +++ b/src/litdata/streaming/serializers.py @@ -405,7 +405,9 @@ def deserialize(self, data: bytes) -> Any: return tifffile.imread(io.BytesIO(data)) # This is a NumPy array def can_serialize(self, item: Any) -> bool: - print(f"Checking can_serialize for: {item}, exists: {os.path.isfile(item)}, suffix: {item.lower().endswith(('.tif', '.tiff'))}") + print( + f"Checking can_serialize for: {item}, exists: {os.path.isfile(item)}, suffix: {item.lower().endswith(('.tif', '.tiff'))}" + ) return isinstance(item, str) and os.path.isfile(item) and item.lower().endswith((".tif", ".tiff")) diff --git a/tests/streaming/test_serializer.py b/tests/streaming/test_serializer.py index 952014ee..4d14b052 100644 --- a/tests/streaming/test_serializer.py +++ b/tests/streaming/test_serializer.py @@ -290,7 +290,6 @@ def test_tiff_serializer(): data, _ = serializer.serialize(file_path) assert isinstance(data, bytes) - # Deserialize deserialized_data = serializer.deserialize(data) assert isinstance(deserialized_data, np.ndarray) From 8c8dd789f1543c90c0e60232351b68d580d334fd Mon Sep 17 00:00:00 2001 From: Robin Cole Date: Thu, 28 Nov 2024 12:54:43 +0000 Subject: [PATCH 11/11] remove second test --- src/litdata/streaming/serializers.py | 3 --- tests/streaming/test_serializer.py | 9 --------- 2 files changed, 12 deletions(-) diff --git a/src/litdata/streaming/serializers.py b/src/litdata/streaming/serializers.py index 62e386ed..41c58179 100644 --- a/src/litdata/streaming/serializers.py +++ b/src/litdata/streaming/serializers.py @@ -405,9 +405,6 @@ def deserialize(self, data: bytes) -> Any: return tifffile.imread(io.BytesIO(data)) # This is a NumPy array def can_serialize(self, item: Any) -> bool: - print( - f"Checking can_serialize for: {item}, exists: {os.path.isfile(item)}, suffix: {item.lower().endswith(('.tif', '.tiff'))}" - ) return isinstance(item, str) and os.path.isfile(item) and item.lower().endswith((".tif", ".tiff")) diff --git a/tests/streaming/test_serializer.py b/tests/streaming/test_serializer.py index 4d14b052..c3998e90 100644 --- a/tests/streaming/test_serializer.py +++ b/tests/streaming/test_serializer.py @@ -301,12 +301,3 @@ def test_tiff_serializer(): # Clean up os.remove(file_path) - - -def test_tiff_serializer_can_serialize(): - serializer = TIFFSerializer() - assert serializer.can_serialize("image.tif") - assert serializer.can_serialize("image.tiff") - assert not serializer.can_serialize("image.jpg") - assert not serializer.can_serialize(None) - assert not serializer.can_serialize(123)