From e4329a01ad132cbc0b4b3fdbbe57a252d024f745 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 2 Jul 2024 00:03:07 +0000
Subject: [PATCH 1/2] [pre-commit.ci] pre-commit suggestions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

updates:
- [github.com/pre-commit/pre-commit-hooks: v4.5.0 → v4.6.0](https://github.com/pre-commit/pre-commit-hooks/compare/v4.5.0...v4.6.0)
- [github.com/asottile/pyupgrade: v3.15.1 → v3.16.0](https://github.com/asottile/pyupgrade/compare/v3.15.1...v3.16.0)
- [github.com/codespell-project/codespell: v2.2.6 → v2.3.0](https://github.com/codespell-project/codespell/compare/v2.2.6...v2.3.0)
- [github.com/astral-sh/ruff-pre-commit: v0.3.2 → v0.5.0](https://github.com/astral-sh/ruff-pre-commit/compare/v0.3.2...v0.5.0)
---
 .pre-commit-config.yaml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 1de0b1a5..327ddfce 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -23,7 +23,7 @@ ci:
 
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.5.0
+    rev: v4.6.0
     hooks:
       - id: end-of-file-fixer
       - id: trailing-whitespace
@@ -51,14 +51,14 @@ repos:
       - id: detect-private-key
 
   - repo: https://github.com/asottile/pyupgrade
-    rev: v3.15.1
+    rev: v3.16.0
     hooks:
       - id: pyupgrade
        args: ["--py38-plus"]
        name: Upgrade code
 
   - repo: https://github.com/codespell-project/codespell
-    rev: v2.2.6
+    rev: v2.3.0
     hooks:
       - id: codespell
        additional_dependencies: [tomli]
@@ -72,7 +72,7 @@ repos:
        args: ["--in-place"]
 
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.3.2
+    rev: v0.5.0
     hooks:
      - id: ruff-format
      - id: ruff

From d427a8c8e42529fee5a5113a9ae24343247e4859 Mon Sep 17 00:00:00 2001
From: jirka
Date: Thu, 18 Jul 2024 21:29:31 +0200
Subject: [PATCH 2/2] fixing lint issues flagged by the updated pre-commit hooks

Replace list comprehensions inside sum() calls with generator
expressions, collapse boolean-returning if/else chains into direct
returns (_equal_item in dataloader.py, _match_studio in resolver.py),
and mark the seeded random.Random(...) in streaming/combined.py with
`noqa: S311`, since it is not used for cryptographic purposes.
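The sum() change is mechanical; a representative before/after, taken
verbatim from src/litdata/processing/data_processor.py (comments added
here for illustration only):

    # before: materializes a temporary list just to reduce it
    num_items = sum([len(items) for items in workers_user_items])

    # after: sum() consumes the generator lazily, skipping the allocation
    num_items = sum(len(items) for items in workers_user_items)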
---
 src/litdata/processing/data_processor.py   |  6 +++---
 src/litdata/streaming/combined.py          |  4 ++--
 src/litdata/streaming/dataloader.py        |  4 +---
 src/litdata/streaming/resolver.py          |  7 ++-----
 src/litdata/streaming/sampler.py           |  6 +++---
 src/litdata/streaming/writer.py            |  4 ++--
 src/litdata/utilities/dataset_utilities.py |  2 +-
 src/litdata/utilities/shuffle.py           |  2 +-
 src/litdata/utilities/train_test_split.py  |  2 +-
 tests/streaming/test_writer.py             |  2 +-
 tests/utilities/test_shuffle.py            | 16 ++++++++--------
 tests/utilities/test_subsample.py          |  8 ++++----
 12 files changed, 29 insertions(+), 34 deletions(-)

diff --git a/src/litdata/processing/data_processor.py b/src/litdata/processing/data_processor.py
index 56480a9f..39182a51 100644
--- a/src/litdata/processing/data_processor.py
+++ b/src/litdata/processing/data_processor.py
@@ -785,8 +785,8 @@ def _done(self, size: int, delete_cached_files: bool, output_dir: Dir) -> _Resul
         if num_nodes == node_rank + 1:
             config = load_index_file(cache_dir)
-            size = sum([c["dim"] if c["dim"] is not None else c["chunk_size"] for c in config["chunks"]])
-            num_bytes = sum([c["chunk_bytes"] for c in config["chunks"]])
+            size = sum(c["dim"] if c["dim"] is not None else c["chunk_size"] for c in config["chunks"])
+            num_bytes = sum(c["chunk_bytes"] for c in config["chunks"])
             if config["config"] is not None:
                 data_format = tree_unflatten(
                     config["config"]["data_format"], treespec_loads(config["config"]["data_spec"])
                 )
@@ -1013,7 +1013,7 @@ def run(self, data_recipe: DataRecipe) -> None:
             workers_user_items = [w[:items_to_keep] for w in workers_user_items]
             print(f"Fast dev run is enabled. Limiting to {items_to_keep} items per process.")
 
-        num_items = sum([len(items) for items in workers_user_items])
+        num_items = sum(len(items) for items in workers_user_items)
 
         self._cleanup_cache()

diff --git a/src/litdata/streaming/combined.py b/src/litdata/streaming/combined.py
index 4741b75d..56e7ce19 100644
--- a/src/litdata/streaming/combined.py
+++ b/src/litdata/streaming/combined.py
@@ -194,7 +194,7 @@ def __init__(
         self._num_samples_yielded = num_samples_yielded or [0 for _ in range(len(datasets))]
         self._original_weights = deepcopy(weights)
         self._weights = deepcopy(weights)
-        self._rng = random.Random(seed)
+        self._rng = random.Random(seed)  # noqa: S311
         self._iterate_over_all = iterate_over_all
         self._is_done = False
 
@@ -226,7 +226,7 @@ def __next__(self) -> Any:
                 self._dataset_indexes[dataset_index] = None
                 self._weights[dataset_index] = None  # type: ignore
 
-                new_sum = sum([w for w in self._weights if w is not None])
+                new_sum = sum(w for w in self._weights if w is not None)
                 self._weights = [None if w is None else w / new_sum for w in self._weights]
 
         # stop on the first iteration

diff --git a/src/litdata/streaming/dataloader.py b/src/litdata/streaming/dataloader.py
index 05286ff1..5fa66119 100644
--- a/src/litdata/streaming/dataloader.py
+++ b/src/litdata/streaming/dataloader.py
@@ -64,9 +64,7 @@ def _equal_item(d1: Any, d2: Any) -> bool:
     equality = d1 == d2
     if isinstance(equality, torch.Tensor):
         return bool(equality.all().item())
-    if equality is True:
-        return True
-    return False
+    return bool(equality)
 
 
 class CacheDataset(Dataset):

diff --git a/src/litdata/streaming/resolver.py b/src/litdata/streaming/resolver.py
index 02052484..47b09e8c 100644
--- a/src/litdata/streaming/resolver.py
+++ b/src/litdata/streaming/resolver.py
@@ -87,14 +87,11 @@ def _match_studio(target_id: Optional[str], target_name: Optional[str], cloudspa
     if target_id is not None and cloudspace.id == target_id:
         return True
 
-    if (
+    return (
         cloudspace.display_name is not None
         and target_name is not None
         and cloudspace.display_name.lower() == target_name.lower()
-    ):
-        return True
-
-    return False
+    )
 
 
 def _resolve_studio(dir_path: str, target_name: Optional[str], target_id: Optional[str]) -> Dir:

diff --git a/src/litdata/streaming/sampler.py b/src/litdata/streaming/sampler.py
index 350dd3b9..0f1a29a0 100644
--- a/src/litdata/streaming/sampler.py
+++ b/src/litdata/streaming/sampler.py
@@ -112,7 +112,7 @@ def __iter_non_distributed__(self) -> Iterator[List[Union[int, ChunkedIndex]]]:
             end = self._dataset_size if is_last else (worker_idx + 1) * worker_size
             worker_indices.append(indices[start:end])
 
-        assert sum([len(s) for s in worker_indices]) == self._dataset_size
+        assert sum(len(s) for s in worker_indices) == self._dataset_size
 
         worker_indices_batches = [self._chunk_list(indices, self._batch_size) for indices in worker_indices]
@@ -140,7 +140,7 @@ def __iter_distributed__(self) -> Iterator[List[Union[int, ChunkedIndex]]]:
             end_worker = replica_size if is_last_worker else (worker_idx + 1) * worker_size
             worker_indices.append(replica_indices[start_worker:end_worker])
 
-        assert sum([len(s) for s in worker_indices]) == len(replica_indices)
+        assert sum(len(s) for s in worker_indices) == len(replica_indices)
 
         worker_indices_batches = [self._chunk_list(indices, self._batch_size) for indices in worker_indices]
@@ -202,7 +202,7 @@ def __iter_indices_per_workers__(
     ) -> Iterator[List[Union[int, ChunkedIndex]]]:
         batches: List[List[Union[int, ChunkedIndex]]] = []
         counter = 0
-        while sum([len(v) for v in indices_per_workers]) != 0:
+        while sum(len(v) for v in indices_per_workers) != 0:
             worker_indices = indices_per_workers[counter % self._num_workers]
             if len(worker_indices) == 0:
                 batches.append([])

diff --git a/src/litdata/streaming/writer.py b/src/litdata/streaming/writer.py
index 577fbbe3..548dcfc5 100644
--- a/src/litdata/streaming/writer.py
+++ b/src/litdata/streaming/writer.py
@@ -241,7 +241,7 @@ def _create_chunk(self, filename: str, on_done: bool = False) -> bytes:
         sample_data = b"".join([item.data for item in items])
         data = num_items.tobytes() + offsets.tobytes() + sample_data
 
-        current_chunk_bytes = sum([item.bytes for item in items])
+        current_chunk_bytes = sum(item.bytes for item in items)
 
         # Whether to encrypt the data at the chunk level
         if self._encryption and self._encryption.level == EncryptionLevel.CHUNK:
@@ -260,7 +260,7 @@ def _create_chunk(self, filename: str, on_done: bool = False) -> bytes:
 
         dim: Optional[int] = None
         if items[0].dim:
-            dim = sum([item.dim if item.dim is not None else 0 for item in items])
+            dim = sum(item.dim if item.dim is not None else 0 for item in items)
 
         chunk_info = {
             "chunk_bytes": current_chunk_bytes,

diff --git a/src/litdata/utilities/dataset_utilities.py b/src/litdata/utilities/dataset_utilities.py
index a45b2f51..8bc8636d 100644
--- a/src/litdata/utilities/dataset_utilities.py
+++ b/src/litdata/utilities/dataset_utilities.py
@@ -75,7 +75,7 @@ def subsample_streaming_dataset(
         # checking if subsample is 1, as if user wants complete data, then let shuffler and sampler do the work
         original_chunks, roi = shuffle_lists_together(original_chunks, roi, random_seed_sampler)
 
-        num_items_to_subsample = int(sum([roi[1] - roi[0] for roi in roi]) * subsample)
+        num_items_to_subsample = int(sum(roi[1] - roi[0] for roi in roi) * subsample)
 
        subsampled_files, roi, _, _ = subsample_filenames_and_roi(original_chunks, roi, num_items_to_subsample)

diff --git a/src/litdata/utilities/shuffle.py b/src/litdata/utilities/shuffle.py
index aa8b519b..a50454a8 100644
--- a/src/litdata/utilities/shuffle.py
+++ b/src/litdata/utilities/shuffle.py
@@ -50,7 +50,7 @@ def _associate_chunks_and_internals_to_ranks(
     num_workers: int = 1,
     batch_size: int = 1,
 ) -> Tuple[List[List[int]], List[Any]]:
-    num_items = sum([(interval[2] - interval[1]) for interval in chunk_intervals])
+    num_items = sum((interval[2] - interval[1]) for interval in chunk_intervals)
     num_items_per_ranks: List[int] = [
         num_items // distributed_env.world_size + num_items % distributed_env.world_size
         if rank == distributed_env.world_size - 1 and not drop_last

diff --git a/src/litdata/utilities/train_test_split.py b/src/litdata/utilities/train_test_split.py
index d31fb076..47d619f4 100644
--- a/src/litdata/utilities/train_test_split.py
+++ b/src/litdata/utilities/train_test_split.py
@@ -66,7 +66,7 @@ def train_test_split(
 
     new_datasets = [deepcopy(streaming_dataset) for _ in splits]
 
-    dataset_length = sum([my_roi[1] - my_roi[0] for my_roi in dummy_subsampled_roi])
+    dataset_length = sum(my_roi[1] - my_roi[0] for my_roi in dummy_subsampled_roi)
 
     subsampled_chunks, dummy_subsampled_roi = shuffle_lists_together(
         subsampled_chunks, dummy_subsampled_roi, np.random.RandomState([seed])

diff --git a/tests/streaming/test_writer.py b/tests/streaming/test_writer.py
index bb198528..8faae0a5 100644
--- a/tests/streaming/test_writer.py
+++ b/tests/streaming/test_writer.py
@@ -183,7 +183,7 @@ def test_binary_writer_with_jpeg_filepath_and_int(tmpdir):
     assert data["chunks"][0]["chunk_size"] == 7
     assert data["chunks"][1]["chunk_size"] == 7
     assert data["chunks"][-1]["chunk_size"] == 2
data["chunks"][-1]["chunk_size"] == 2 - assert sum([chunk["chunk_size"] for chunk in data["chunks"]]) == 100 + assert sum(chunk["chunk_size"] for chunk in data["chunks"]) == 100 reader = BinaryReader(cache_dir, max_cache_size=10 ^ 9) for i in range(100): diff --git a/tests/utilities/test_shuffle.py b/tests/utilities/test_shuffle.py index 2645f487..119db814 100644 --- a/tests/utilities/test_shuffle.py +++ b/tests/utilities/test_shuffle.py @@ -68,10 +68,10 @@ def test_associate_chunks_and_internals_to_ranks(): ) assert chunks_per_ranks == [[0, 1], [1, 2], [2, 3, 4, 5], [5, 6, 7]] - assert sum([interval[2] - interval[1] for interval in intervals_per_ranks[0]]) == 105 - assert sum([interval[2] - interval[1] for interval in intervals_per_ranks[1]]) == 105 - assert sum([interval[2] - interval[1] for interval in intervals_per_ranks[2]]) == 105 - assert sum([interval[2] - interval[1] for interval in intervals_per_ranks[3]]) == 105 + assert sum(interval[2] - interval[1] for interval in intervals_per_ranks[0]) == 105 + assert sum(interval[2] - interval[1] for interval in intervals_per_ranks[1]) == 105 + assert sum(interval[2] - interval[1] for interval in intervals_per_ranks[2]) == 105 + assert sum(interval[2] - interval[1] for interval in intervals_per_ranks[3]) == 105 assert intervals_per_ranks == [ [[0, 0, 50, 50], [0, 0, 55, 150]], @@ -99,10 +99,10 @@ def test_associate_chunks_and_internals_to_ranks(): ) assert chunks_per_ranks == [[0, 1], [1], [1, 2, 3, 4, 5], [5, 6, 7]] - assert sum([interval[2] - interval[1] for interval in intervals_per_ranks[0]]) == 64 - assert sum([interval[2] - interval[1] for interval in intervals_per_ranks[1]]) == 64 - assert sum([interval[2] - interval[1] for interval in intervals_per_ranks[2]]) == 64 - assert sum([interval[2] - interval[1] for interval in intervals_per_ranks[3]]) == 64 + assert sum(interval[2] - interval[1] for interval in intervals_per_ranks[0]) == 64 + assert sum(interval[2] - interval[1] for interval in intervals_per_ranks[1]) == 64 + assert sum(interval[2] - interval[1] for interval in intervals_per_ranks[2]) == 64 + assert sum(interval[2] - interval[1] for interval in intervals_per_ranks[3]) == 64 assert intervals_per_ranks == [ [[0, 0, 5, 5], [0, 0, 59, 150]], [[0, 59, 123, 150]], diff --git a/tests/utilities/test_subsample.py b/tests/utilities/test_subsample.py index 73d4a26c..79b758bf 100644 --- a/tests/utilities/test_subsample.py +++ b/tests/utilities/test_subsample.py @@ -39,7 +39,7 @@ def test_subsample_filenames_and_roi(): roi_list = [(0, 50), (0, 25), (0, 75), (0, 35), (0, 5)] - total_chunk_roi_length = sum([roi[1] - roi[0] for roi in roi_list]) + total_chunk_roi_length = sum(roi[1] - roi[0] for roi in roi_list) subsample = 0.42 @@ -47,7 +47,7 @@ def test_subsample_filenames_and_roi(): _, roi_list, _, left_roi = subsample_filenames_and_roi(my_chunks, roi_list, target) - assert target == sum([roi[1] - roi[0] for roi in roi_list]) + assert target == sum(roi[1] - roi[0] for roi in roi_list) assert (total_chunk_roi_length - target) == sum(_roi[1] - _roi[0] for _roi in left_roi) @@ -61,7 +61,7 @@ def test_subsample_filenames_and_roi_exact(): roi_list = [(0, 50), (0, 50), (0, 50), (0, 50)] - total_chunk_roi_length = sum([roi[1] - roi[0] for roi in roi_list]) + total_chunk_roi_length = sum(roi[1] - roi[0] for roi in roi_list) subsample = 0.5 @@ -71,5 +71,5 @@ def test_subsample_filenames_and_roi_exact(): assert len(roi_list) == 2 assert len(right_roi) == 2 - assert target == sum([roi[1] - roi[0] for roi in roi_list]) + assert target == 
     assert (total_chunk_roi_length - target) == np.sum(right_roi)