[pre-commit.ci] pre-commit suggestions #205

Closed · wants to merge 3 commits

Automated pre-commit.ci update: bumps the pinned pre-commit hook revisions and applies the fixes the updated tools suggest (generator expressions instead of list comprehensions inside sum(), simplified boolean returns, and one Bandit S311 suppression).
8 changes: 4 additions & 4 deletions .pre-commit-config.yaml
@@ -23,7 +23,7 @@ ci:

 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.5.0
+    rev: v4.6.0
     hooks:
       - id: end-of-file-fixer
       - id: trailing-whitespace
@@ -51,14 +51,14 @@ repos:
       - id: detect-private-key

   - repo: https://github.com/asottile/pyupgrade
-    rev: v3.15.1
+    rev: v3.16.0
     hooks:
       - id: pyupgrade
         args: ["--py38-plus"]
         name: Upgrade code

   - repo: https://github.com/codespell-project/codespell
-    rev: v2.2.6
+    rev: v2.3.0
     hooks:
       - id: codespell
         additional_dependencies: [tomli]
@@ -72,7 +72,7 @@ repos:
         args: ["--in-place"]

   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.3.2
+    rev: v0.5.0
     hooks:
       - id: ruff-format
       - id: ruff
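The version bumps above are routine pre-commit.ci autoupdates of the pinned hook revisions. As a reminder of what the pyupgrade hook configured with --py38-plus actually does to code, here is a small hedged sketch; the class and names are hypothetical and not taken from this repo:

# Hypothetical snippet showing typical `pyupgrade --py38-plus` rewrites
# (the exact rule set depends on the pyupgrade version).

# Before:
#   class Reader(object):
#       def __init__(self, paths):
#           super(Reader, self).__init__()
#           self.paths = set([p for p in paths])

# After pyupgrade has run:
class Reader:
    def __init__(self, paths):
        super().__init__()  # Python 2-style super(Reader, self) dropped
        self.paths = {p for p in paths}  # set([...]) becomes a set comprehension


reader = Reader(["a.bin", "b.bin"])
assert reader.paths == {"a.bin", "b.bin"}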
6 changes: 3 additions & 3 deletions src/litdata/processing/data_processor.py
@@ -785,8 +785,8 @@ def _done(self, size: int, delete_cached_files: bool, output_dir: Dir) -> _Resul
         if num_nodes == node_rank + 1:
             config = load_index_file(cache_dir)

-            size = sum([c["dim"] if c["dim"] is not None else c["chunk_size"] for c in config["chunks"]])
-            num_bytes = sum([c["chunk_bytes"] for c in config["chunks"]])
+            size = sum(c["dim"] if c["dim"] is not None else c["chunk_size"] for c in config["chunks"])
+            num_bytes = sum(c["chunk_bytes"] for c in config["chunks"])
             if config["config"] is not None:
                 data_format = tree_unflatten(
                     config["config"]["data_format"], treespec_loads(config["config"]["data_spec"])
@@ -1013,7 +1013,7 @@ def run(self, data_recipe: DataRecipe) -> None:
             workers_user_items = [w[:items_to_keep] for w in workers_user_items]
             print(f"Fast dev run is enabled. Limiting to {items_to_keep} items per process.")

-        num_items = sum([len(items) for items in workers_user_items])
+        num_items = sum(len(items) for items in workers_user_items)

         self._cleanup_cache()
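This is the first instance of the pattern repeated throughout the PR: a list comprehension inside sum() is replaced by a generator expression, so values are consumed lazily instead of being collected into a throwaway list first (likely flagged by the newer Ruff revision pinned above). A minimal sketch with illustrative data, not from the repo:

chunks = [{"chunk_size": n} for n in (50, 25, 75)]

# Before: builds a temporary list, then sums it.
total_list = sum([c["chunk_size"] for c in chunks])

# After: the generator feeds sum() one value at a time,
# avoiding the intermediate list allocation.
total_gen = sum(c["chunk_size"] for c in chunks)

assert total_list == total_gen == 150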
4 changes: 2 additions & 2 deletions src/litdata/streaming/combined.py
@@ -194,7 +194,7 @@ def __init__(
         self._num_samples_yielded = num_samples_yielded or [0 for _ in range(len(datasets))]
         self._original_weights = deepcopy(weights)
         self._weights = deepcopy(weights)
-        self._rng = random.Random(seed)
+        self._rng = random.Random(seed)  # noqa: S311
         self._iterate_over_all = iterate_over_all
         self._is_done = False
@@ -226,7 +226,7 @@ def __next__(self) -> Any:

         self._dataset_indexes[dataset_index] = None
         self._weights[dataset_index] = None  # type: ignore
-        new_sum = sum([w for w in self._weights if w is not None])
+        new_sum = sum(w for w in self._weights if w is not None)
         self._weights = [None if w is None else w / new_sum for w in self._weights]

         # stop on the first iteration
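The added `# noqa: S311` silences the flake8-bandit rule S311 (enforced by the newer Ruff), which warns that the `random` module is not cryptographically secure. That is acceptable here because the RNG merely selects which dataset yields the next sample, and seeding it keeps runs reproducible. A small sketch of the same pattern, with illustrative weights:

import random

# Non-cryptographic, seeded RNG: reproducible dataset selection.
# S311 warns against `random` for security-sensitive uses,
# which this is not, hence the suppression in the PR.
rng = random.Random(42)  # noqa: S311

weights = [0.7, 0.3]  # sampling weights for two datasets
picks = [rng.choices([0, 1], weights=weights)[0] for _ in range(5)]
print(picks)  # same sequence on every run with the same seed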
4 changes: 1 addition & 3 deletions src/litdata/streaming/dataloader.py
@@ -64,9 +64,7 @@ def _equal_item(d1: Any, d2: Any) -> bool:
     equality = d1 == d2
     if isinstance(equality, torch.Tensor):
         return bool(equality.all().item())
-    if equality is True:
-        return True
-    return False
+    return bool(equality)


 class CacheDataset(Dataset):
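The simplification above relies on `d1 == d2` returning a plain bool for non-tensor inputs, so `bool(equality)` matches the old `if equality is True` ladder in that case. Restating the helper with a quick sanity check (a sketch that assumes torch is installed):

import torch

def _equal_item(d1, d2) -> bool:
    equality = d1 == d2
    if isinstance(equality, torch.Tensor):
        # Tensor comparison yields an elementwise mask; reduce it to one bool.
        return bool(equality.all().item())
    # For plain Python values, `==` already returns a bool,
    # so bool() just normalizes the result.
    return bool(equality)

assert _equal_item(3, 3)
assert not _equal_item("a", "b")
assert _equal_item(torch.tensor([1, 2]), torch.tensor([1, 2]))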
7 changes: 2 additions & 5 deletions src/litdata/streaming/resolver.py
@@ -87,14 +87,11 @@ def _match_studio(target_id: Optional[str], target_name: Optional[str], cloudspa
     if target_id is not None and cloudspace.id == target_id:
         return True

-    if (
+    return (
         cloudspace.display_name is not None
         and target_name is not None
         and cloudspace.display_name.lower() == target_name.lower()
-    ):
-        return True
-
-    return False
+    )


 def _resolve_studio(dir_path: str, target_name: Optional[str], target_id: Optional[str]) -> Dir:
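Same family of cleanup as in dataloader.py: an `if <condition>: return True / return False` tail collapses into returning the condition, which is safe because an `and` chain of comparisons already evaluates to a bool. A generic before/after sketch with hypothetical names:

# Before: three statements for one boolean decision.
def name_matches_verbose(display_name, target_name):
    if (
        display_name is not None
        and target_name is not None
        and display_name.lower() == target_name.lower()
    ):
        return True
    return False

# After: the comparison chain is already a bool, so return it directly.
def name_matches(display_name, target_name):
    return (
        display_name is not None
        and target_name is not None
        and display_name.lower() == target_name.lower()
    )

assert name_matches("My Studio", "my studio") == name_matches_verbose("My Studio", "my studio")
assert name_matches(None, "x") is False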
6 changes: 3 additions & 3 deletions src/litdata/streaming/sampler.py
@@ -112,7 +112,7 @@ def __iter_non_distributed__(self) -> Iterator[List[Union[int, ChunkedIndex]]]:
             end = self._dataset_size if is_last else (worker_idx + 1) * worker_size
             worker_indices.append(indices[start:end])

-        assert sum([len(s) for s in worker_indices]) == self._dataset_size
+        assert sum(len(s) for s in worker_indices) == self._dataset_size

         worker_indices_batches = [self._chunk_list(indices, self._batch_size) for indices in worker_indices]
@@ -140,7 +140,7 @@ def __iter_distributed__(self) -> Iterator[List[Union[int, ChunkedIndex]]]:
             end_worker = replica_size if is_last_worker else (worker_idx + 1) * worker_size
             worker_indices.append(replica_indices[start_worker:end_worker])

-        assert sum([len(s) for s in worker_indices]) == len(replica_indices)
+        assert sum(len(s) for s in worker_indices) == len(replica_indices)

         worker_indices_batches = [self._chunk_list(indices, self._batch_size) for indices in worker_indices]
@@ -202,7 +202,7 @@ def __iter_indices_per_workers__(
     ) -> Iterator[List[Union[int, ChunkedIndex]]]:
         batches: List[List[Union[int, ChunkedIndex]]] = []
         counter = 0
-        while sum([len(v) for v in indices_per_workers]) != 0:
+        while sum(len(v) for v in indices_per_workers) != 0:
             worker_indices = indices_per_workers[counter % self._num_workers]
             if len(worker_indices) == 0:
                 batches.append([])
4 changes: 2 additions & 2 deletions src/litdata/streaming/writer.py
@@ -241,7 +241,7 @@ def _create_chunk(self, filename: str, on_done: bool = False) -> bytes:
         sample_data = b"".join([item.data for item in items])
         data = num_items.tobytes() + offsets.tobytes() + sample_data

-        current_chunk_bytes = sum([item.bytes for item in items])
+        current_chunk_bytes = sum(item.bytes for item in items)

         # Whether to encrypt the data at the chunk level
         if self._encryption and self._encryption.level == EncryptionLevel.CHUNK:
@@ -260,7 +260,7 @@ def _create_chunk(self, filename: str, on_done: bool = False) -> bytes:

         dim: Optional[int] = None
         if items[0].dim:
-            dim = sum([item.dim if item.dim is not None else 0 for item in items])
+            dim = sum(item.dim if item.dim is not None else 0 for item in items)

         chunk_info = {
             "chunk_bytes": current_chunk_bytes,
2 changes: 1 addition & 1 deletion src/litdata/utilities/dataset_utilities.py
@@ -75,7 +75,7 @@ def subsample_streaming_dataset(
         # checking if subsample is 1, as if user wants complete data, then let shuffler and sampler do the work
         original_chunks, roi = shuffle_lists_together(original_chunks, roi, random_seed_sampler)

-        num_items_to_subsample = int(sum([roi[1] - roi[0] for roi in roi]) * subsample)
+        num_items_to_subsample = int(sum(roi[1] - roi[0] for roi in roi) * subsample)

         subsampled_files, roi, _, _ = subsample_filenames_and_roi(original_chunks, roi, num_items_to_subsample)
2 changes: 1 addition & 1 deletion src/litdata/utilities/shuffle.py
@@ -50,7 +50,7 @@ def _associate_chunks_and_internals_to_ranks(
     num_workers: int = 1,
     batch_size: int = 1,
 ) -> Tuple[List[List[int]], List[Any]]:
-    num_items = sum([(interval[2] - interval[1]) for interval in chunk_intervals])
+    num_items = sum((interval[2] - interval[1]) for interval in chunk_intervals)
     num_items_per_ranks: List[int] = [
         num_items // distributed_env.world_size + num_items % distributed_env.world_size
         if rank == distributed_env.world_size - 1 and not drop_last
2 changes: 1 addition & 1 deletion src/litdata/utilities/train_test_split.py
@@ -66,7 +66,7 @@ def train_test_split(

     new_datasets = [deepcopy(streaming_dataset) for _ in splits]

-    dataset_length = sum([my_roi[1] - my_roi[0] for my_roi in dummy_subsampled_roi])
+    dataset_length = sum(my_roi[1] - my_roi[0] for my_roi in dummy_subsampled_roi)

     subsampled_chunks, dummy_subsampled_roi = shuffle_lists_together(
         subsampled_chunks, dummy_subsampled_roi, np.random.RandomState([seed])
2 changes: 1 addition & 1 deletion tests/streaming/test_writer.py
@@ -183,7 +183,7 @@ def test_binary_writer_with_jpeg_filepath_and_int(tmpdir):
     assert data["chunks"][0]["chunk_size"] == 7
     assert data["chunks"][1]["chunk_size"] == 7
     assert data["chunks"][-1]["chunk_size"] == 2
-    assert sum([chunk["chunk_size"] for chunk in data["chunks"]]) == 100
+    assert sum(chunk["chunk_size"] for chunk in data["chunks"]) == 100

     reader = BinaryReader(cache_dir, max_cache_size=10 ^ 9)
     for i in range(100):
16 changes: 8 additions & 8 deletions tests/utilities/test_shuffle.py
@@ -68,10 +68,10 @@ def test_associate_chunks_and_internals_to_ranks():
     )

     assert chunks_per_ranks == [[0, 1], [1, 2], [2, 3, 4, 5], [5, 6, 7]]
-    assert sum([interval[2] - interval[1] for interval in intervals_per_ranks[0]]) == 105
-    assert sum([interval[2] - interval[1] for interval in intervals_per_ranks[1]]) == 105
-    assert sum([interval[2] - interval[1] for interval in intervals_per_ranks[2]]) == 105
-    assert sum([interval[2] - interval[1] for interval in intervals_per_ranks[3]]) == 105
+    assert sum(interval[2] - interval[1] for interval in intervals_per_ranks[0]) == 105
+    assert sum(interval[2] - interval[1] for interval in intervals_per_ranks[1]) == 105
+    assert sum(interval[2] - interval[1] for interval in intervals_per_ranks[2]) == 105
+    assert sum(interval[2] - interval[1] for interval in intervals_per_ranks[3]) == 105

     assert intervals_per_ranks == [
         [[0, 0, 50, 50], [0, 0, 55, 150]],
@@ -99,10 +99,10 @@ def test_associate_chunks_and_internals_to_ranks()
     )

     assert chunks_per_ranks == [[0, 1], [1], [1, 2, 3, 4, 5], [5, 6, 7]]
-    assert sum([interval[2] - interval[1] for interval in intervals_per_ranks[0]]) == 64
-    assert sum([interval[2] - interval[1] for interval in intervals_per_ranks[1]]) == 64
-    assert sum([interval[2] - interval[1] for interval in intervals_per_ranks[2]]) == 64
-    assert sum([interval[2] - interval[1] for interval in intervals_per_ranks[3]]) == 64
+    assert sum(interval[2] - interval[1] for interval in intervals_per_ranks[0]) == 64
+    assert sum(interval[2] - interval[1] for interval in intervals_per_ranks[1]) == 64
+    assert sum(interval[2] - interval[1] for interval in intervals_per_ranks[2]) == 64
+    assert sum(interval[2] - interval[1] for interval in intervals_per_ranks[3]) == 64
     assert intervals_per_ranks == [
         [[0, 0, 5, 5], [0, 0, 59, 150]],
         [[0, 59, 123, 150]],
8 changes: 4 additions & 4 deletions tests/utilities/test_subsample.py
@@ -39,15 +39,15 @@ def test_subsample_filenames_and_roi():

     roi_list = [(0, 50), (0, 25), (0, 75), (0, 35), (0, 5)]

-    total_chunk_roi_length = sum([roi[1] - roi[0] for roi in roi_list])
+    total_chunk_roi_length = sum(roi[1] - roi[0] for roi in roi_list)

     subsample = 0.42

     target = int(total_chunk_roi_length * subsample)

     _, roi_list, _, left_roi = subsample_filenames_and_roi(my_chunks, roi_list, target)

-    assert target == sum([roi[1] - roi[0] for roi in roi_list])
+    assert target == sum(roi[1] - roi[0] for roi in roi_list)
     assert (total_chunk_roi_length - target) == sum(_roi[1] - _roi[0] for _roi in left_roi)
@@ -61,7 +61,7 @@ def test_subsample_filenames_and_roi_exact():

     roi_list = [(0, 50), (0, 50), (0, 50), (0, 50)]

-    total_chunk_roi_length = sum([roi[1] - roi[0] for roi in roi_list])
+    total_chunk_roi_length = sum(roi[1] - roi[0] for roi in roi_list)

     subsample = 0.5
@@ -71,5 +71,5 @@

     assert len(roi_list) == 2
     assert len(right_roi) == 2
-    assert target == sum([roi[1] - roi[0] for roi in roi_list])
+    assert target == sum(roi[1] - roi[0] for roi in roi_list)
     assert (total_chunk_roi_length - target) == np.sum(right_roi)