[pre-commit.ci] pre-commit suggestions #205

Closed · wants to merge 3 commits

Automated pre-commit.ci update: bumps the pinned pre-commit hook revisions and applies the fixes the updated tools suggest (generator expressions instead of list comprehensions inside sum(), simplified boolean returns, and one Bandit S311 suppression).
8 changes: 4 additions & 4 deletions .pre-commit-config.yaml
@@ -23,7 +23,7 @@ ci:

 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.5.0
+    rev: v4.6.0
     hooks:
       - id: end-of-file-fixer
       - id: trailing-whitespace
@@ -51,14 +51,14 @@ repos:
       - id: detect-private-key

   - repo: https://github.com/asottile/pyupgrade
-    rev: v3.15.1
+    rev: v3.16.0
     hooks:
       - id: pyupgrade
         args: ["--py38-plus"]
         name: Upgrade code

   - repo: https://github.com/codespell-project/codespell
-    rev: v2.2.6
+    rev: v2.3.0
     hooks:
       - id: codespell
         additional_dependencies: [tomli]
@@ -72,7 +72,7 @@ repos:
         args: ["--in-place"]

   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.3.2
+    rev: v0.5.0
     hooks:
       - id: ruff-format
       - id: ruff
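The version bumps above are routine pre-commit.ci autoupdates of the pinned hook revisions. As a reminder of what the pyupgrade hook configured with --py38-plus actually does to code, here is a small hedged sketch; the class and names are hypothetical and not taken from this repo:

# Hypothetical snippet showing typical `pyupgrade --py38-plus` rewrites
# (the exact rule set depends on the pyupgrade version).

# Before:
#   class Reader(object):
#       def __init__(self, paths):
#           super(Reader, self).__init__()
#           self.paths = set([p for p in paths])

# After pyupgrade has run:
class Reader:
    def __init__(self, paths):
        super().__init__()  # Python 2-style super(Reader, self) dropped
        self.paths = {p for p in paths}  # set([...]) becomes a set comprehension


reader = Reader(["a.bin", "b.bin"])
assert reader.paths == {"a.bin", "b.bin"}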
6 changes: 3 additions & 3 deletions src/litdata/processing/data_processor.py
@@ -785,8 +785,8 @@ def _done(self, size: int, delete_cached_files: bool, output_dir: Dir) -> _Resul
         if num_nodes == node_rank + 1:
             config = load_index_file(cache_dir)

-            size = sum([c["dim"] if c["dim"] is not None else c["chunk_size"] for c in config["chunks"]])
-            num_bytes = sum([c["chunk_bytes"] for c in config["chunks"]])
+            size = sum(c["dim"] if c["dim"] is not None else c["chunk_size"] for c in config["chunks"])
+            num_bytes = sum(c["chunk_bytes"] for c in config["chunks"])
             if config["config"] is not None:
                 data_format = tree_unflatten(
                     config["config"]["data_format"], treespec_loads(config["config"]["data_spec"])
@@ -1013,7 +1013,7 @@ def run(self, data_recipe: DataRecipe) -> None:
             workers_user_items = [w[:items_to_keep] for w in workers_user_items]
             print(f"Fast dev run is enabled. Limiting to {items_to_keep} items per process.")

-        num_items = sum([len(items) for items in workers_user_items])
+        num_items = sum(len(items) for items in workers_user_items)

         self._cleanup_cache()
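This is the first instance of the pattern repeated throughout the PR: a list comprehension inside sum() is replaced by a generator expression, so values are consumed lazily instead of being collected into a throwaway list first (likely flagged by the newer Ruff revision pinned above). A minimal sketch with illustrative data, not from the repo:

chunks = [{"chunk_size": n} for n in (50, 25, 75)]

# Before: builds a temporary list, then sums it.
total_list = sum([c["chunk_size"] for c in chunks])

# After: the generator feeds sum() one value at a time,
# avoiding the intermediate list allocation.
total_gen = sum(c["chunk_size"] for c in chunks)

assert total_list == total_gen == 150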
4 changes: 2 additions & 2 deletions src/litdata/streaming/combined.py
@@ -194,7 +194,7 @@ def __init__(
         self._num_samples_yielded = num_samples_yielded or [0 for _ in range(len(datasets))]
         self._original_weights = deepcopy(weights)
         self._weights = deepcopy(weights)
-        self._rng = random.Random(seed)
+        self._rng = random.Random(seed)  # noqa: S311
         self._iterate_over_all = iterate_over_all
         self._is_done = False
@@ -226,7 +226,7 @@ def __next__(self) -> Any:

         self._dataset_indexes[dataset_index] = None
         self._weights[dataset_index] = None  # type: ignore
-        new_sum = sum([w for w in self._weights if w is not None])
+        new_sum = sum(w for w in self._weights if w is not None)
         self._weights = [None if w is None else w / new_sum for w in self._weights]

         # stop on the first iteration
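The added `# noqa: S311` silences the flake8-bandit rule S311 (enforced by the newer Ruff), which warns that the `random` module is not cryptographically secure. That is acceptable here because the RNG merely selects which dataset yields the next sample, and seeding it keeps runs reproducible. A small sketch of the same pattern, with illustrative weights:

import random

# Non-cryptographic, seeded RNG: reproducible dataset selection.
# S311 warns against `random` for security-sensitive uses,
# which this is not, hence the suppression in the PR.
rng = random.Random(42)  # noqa: S311

weights = [0.7, 0.3]  # sampling weights for two datasets
picks = [rng.choices([0, 1], weights=weights)[0] for _ in range(5)]
print(picks)  # same sequence on every run with the same seed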
4 changes: 1 addition & 3 deletions src/litdata/streaming/dataloader.py
@@ -64,9 +64,7 @@ def _equal_item(d1: Any, d2: Any) -> bool:
     equality = d1 == d2
     if isinstance(equality, torch.Tensor):
         return bool(equality.all().item())
-    if equality is True:
-        return True
-    return False
+    return bool(equality)


 class CacheDataset(Dataset):
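The simplification above relies on `d1 == d2` returning a plain bool for non-tensor inputs, so `bool(equality)` matches the old `if equality is True` ladder in that case. Restating the helper with a quick sanity check (a sketch that assumes torch is installed):

import torch

def _equal_item(d1, d2) -> bool:
    equality = d1 == d2
    if isinstance(equality, torch.Tensor):
        # Tensor comparison yields an elementwise mask; reduce it to one bool.
        return bool(equality.all().item())
    # For plain Python values, `==` already returns a bool,
    # so bool() just normalizes the result.
    return bool(equality)

assert _equal_item(3, 3)
assert not _equal_item("a", "b")
assert _equal_item(torch.tensor([1, 2]), torch.tensor([1, 2]))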
7 changes: 2 additions & 5 deletions src/litdata/streaming/resolver.py
@@ -87,14 +87,11 @@ def _match_studio(target_id: Optional[str], target_name: Optional[str], cloudspa
     if target_id is not None and cloudspace.id == target_id:
         return True

-    if (
+    return (
         cloudspace.display_name is not None
         and target_name is not None
         and cloudspace.display_name.lower() == target_name.lower()
-    ):
-        return True
-
-    return False
+    )


 def _resolve_studio(dir_path: str, target_name: Optional[str], target_id: Optional[str]) -> Dir:
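Same family of cleanup as in dataloader.py: an `if <condition>: return True / return False` tail collapses into returning the condition, which is safe because an `and` chain of comparisons already evaluates to a bool. A generic before/after sketch with hypothetical names:

# Before: three statements for one boolean decision.
def name_matches_verbose(display_name, target_name):
    if (
        display_name is not None
        and target_name is not None
        and display_name.lower() == target_name.lower()
    ):
        return True
    return False

# After: the comparison chain is already a bool, so return it directly.
def name_matches(display_name, target_name):
    return (
        display_name is not None
        and target_name is not None
        and display_name.lower() == target_name.lower()
    )

assert name_matches("My Studio", "my studio") == name_matches_verbose("My Studio", "my studio")
assert name_matches(None, "x") is False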
6 changes: 3 additions & 3 deletions src/litdata/streaming/sampler.py
@@ -112,7 +112,7 @@ def __iter_non_distributed__(self) -> Iterator[List[Union[int, ChunkedIndex]]]:
             end = self._dataset_size if is_last else (worker_idx + 1) * worker_size
             worker_indices.append(indices[start:end])

-        assert sum([len(s) for s in worker_indices]) == self._dataset_size
+        assert sum(len(s) for s in worker_indices) == self._dataset_size

         worker_indices_batches = [self._chunk_list(indices, self._batch_size) for indices in worker_indices]
@@ -140,7 +140,7 @@ def __iter_distributed__(self) -> Iterator[List[Union[int, ChunkedIndex]]]:
             end_worker = replica_size if is_last_worker else (worker_idx + 1) * worker_size
             worker_indices.append(replica_indices[start_worker:end_worker])

-        assert sum([len(s) for s in worker_indices]) == len(replica_indices)
+        assert sum(len(s) for s in worker_indices) == len(replica_indices)

         worker_indices_batches = [self._chunk_list(indices, self._batch_size) for indices in worker_indices]
@@ -202,7 +202,7 @@ def __iter_indices_per_workers__(
     ) -> Iterator[List[Union[int, ChunkedIndex]]]:
         batches: List[List[Union[int, ChunkedIndex]]] = []
         counter = 0
-        while sum([len(v) for v in indices_per_workers]) != 0:
+        while sum(len(v) for v in indices_per_workers) != 0:
             worker_indices = indices_per_workers[counter % self._num_workers]
             if len(worker_indices) == 0:
                 batches.append([])
4 changes: 2 additions & 2 deletions src/litdata/streaming/writer.py
@@ -241,7 +241,7 @@ def _create_chunk(self, filename: str, on_done: bool = False) -> bytes:
         sample_data = b"".join([item.data for item in items])
         data = num_items.tobytes() + offsets.tobytes() + sample_data

-        current_chunk_bytes = sum([item.bytes for item in items])
+        current_chunk_bytes = sum(item.bytes for item in items)

         # Whether to encrypt the data at the chunk level
         if self._encryption and self._encryption.level == EncryptionLevel.CHUNK:
@@ -260,7 +260,7 @@ def _create_chunk(self, filename: str, on_done: bool = False) -> bytes:

         dim: Optional[int] = None
         if items[0].dim:
-            dim = sum([item.dim if item.dim is not None else 0 for item in items])
+            dim = sum(item.dim if item.dim is not None else 0 for item in items)

         chunk_info = {
             "chunk_bytes": current_chunk_bytes,
2 changes: 1 addition & 1 deletion src/litdata/utilities/dataset_utilities.py
@@ -75,7 +75,7 @@ def subsample_streaming_dataset(
         # checking if subsample is 1, as if user wants complete data, then let shuffler and sampler do the work
         original_chunks, roi = shuffle_lists_together(original_chunks, roi, random_seed_sampler)

-        num_items_to_subsample = int(sum([roi[1] - roi[0] for roi in roi]) * subsample)
+        num_items_to_subsample = int(sum(roi[1] - roi[0] for roi in roi) * subsample)

         subsampled_files, roi, _, _ = subsample_filenames_and_roi(original_chunks, roi, num_items_to_subsample)
2 changes: 1 addition & 1 deletion src/litdata/utilities/shuffle.py
@@ -50,7 +50,7 @@ def _associate_chunks_and_internals_to_ranks(
     num_workers: int = 1,
     batch_size: int = 1,
 ) -> Tuple[List[List[int]], List[Any]]:
-    num_items = sum([(interval[2] - interval[1]) for interval in chunk_intervals])
+    num_items = sum((interval[2] - interval[1]) for interval in chunk_intervals)
     num_items_per_ranks: List[int] = [
         num_items // distributed_env.world_size + num_items % distributed_env.world_size
         if rank == distributed_env.world_size - 1 and not drop_last
2 changes: 1 addition & 1 deletion src/litdata/utilities/train_test_split.py
@@ -66,7 +66,7 @@ def train_test_split(

     new_datasets = [deepcopy(streaming_dataset) for _ in splits]

-    dataset_length = sum([my_roi[1] - my_roi[0] for my_roi in dummy_subsampled_roi])
+    dataset_length = sum(my_roi[1] - my_roi[0] for my_roi in dummy_subsampled_roi)

     subsampled_chunks, dummy_subsampled_roi = shuffle_lists_together(
         subsampled_chunks, dummy_subsampled_roi, np.random.RandomState([seed])
2 changes: 1 addition & 1 deletion tests/streaming/test_writer.py
@@ -183,7 +183,7 @@ def test_binary_writer_with_jpeg_filepath_and_int(tmpdir):
     assert data["chunks"][0]["chunk_size"] == 7
     assert data["chunks"][1]["chunk_size"] == 7
     assert data["chunks"][-1]["chunk_size"] == 2
-    assert sum([chunk["chunk_size"] for chunk in data["chunks"]]) == 100
+    assert sum(chunk["chunk_size"] for chunk in data["chunks"]) == 100

     reader = BinaryReader(cache_dir, max_cache_size=10 ^ 9)
     for i in range(100):
16 changes: 8 additions & 8 deletions tests/utilities/test_shuffle.py
@@ -68,10 +68,10 @@ def test_associate_chunks_and_internals_to_ranks():
     )

     assert chunks_per_ranks == [[0, 1], [1, 2], [2, 3, 4, 5], [5, 6, 7]]
-    assert sum([interval[2] - interval[1] for interval in intervals_per_ranks[0]]) == 105
-    assert sum([interval[2] - interval[1] for interval in intervals_per_ranks[1]]) == 105
-    assert sum([interval[2] - interval[1] for interval in intervals_per_ranks[2]]) == 105
-    assert sum([interval[2] - interval[1] for interval in intervals_per_ranks[3]]) == 105
+    assert sum(interval[2] - interval[1] for interval in intervals_per_ranks[0]) == 105
+    assert sum(interval[2] - interval[1] for interval in intervals_per_ranks[1]) == 105
+    assert sum(interval[2] - interval[1] for interval in intervals_per_ranks[2]) == 105
+    assert sum(interval[2] - interval[1] for interval in intervals_per_ranks[3]) == 105

     assert intervals_per_ranks == [
         [[0, 0, 50, 50], [0, 0, 55, 150]],
@@ -99,10 +99,10 @@ def test_associate_chunks_and_internals_to_ranks()
     )

     assert chunks_per_ranks == [[0, 1], [1], [1, 2, 3, 4, 5], [5, 6, 7]]
-    assert sum([interval[2] - interval[1] for interval in intervals_per_ranks[0]]) == 64
-    assert sum([interval[2] - interval[1] for interval in intervals_per_ranks[1]]) == 64
-    assert sum([interval[2] - interval[1] for interval in intervals_per_ranks[2]]) == 64
-    assert sum([interval[2] - interval[1] for interval in intervals_per_ranks[3]]) == 64
+    assert sum(interval[2] - interval[1] for interval in intervals_per_ranks[0]) == 64
+    assert sum(interval[2] - interval[1] for interval in intervals_per_ranks[1]) == 64
+    assert sum(interval[2] - interval[1] for interval in intervals_per_ranks[2]) == 64
+    assert sum(interval[2] - interval[1] for interval in intervals_per_ranks[3]) == 64
     assert intervals_per_ranks == [
         [[0, 0, 5, 5], [0, 0, 59, 150]],
         [[0, 59, 123, 150]],
8 changes: 4 additions & 4 deletions tests/utilities/test_subsample.py
@@ -39,15 +39,15 @@ def test_subsample_filenames_and_roi():

     roi_list = [(0, 50), (0, 25), (0, 75), (0, 35), (0, 5)]

-    total_chunk_roi_length = sum([roi[1] - roi[0] for roi in roi_list])
+    total_chunk_roi_length = sum(roi[1] - roi[0] for roi in roi_list)

     subsample = 0.42

     target = int(total_chunk_roi_length * subsample)

     _, roi_list, _, left_roi = subsample_filenames_and_roi(my_chunks, roi_list, target)

-    assert target == sum([roi[1] - roi[0] for roi in roi_list])
+    assert target == sum(roi[1] - roi[0] for roi in roi_list)
     assert (total_chunk_roi_length - target) == sum(_roi[1] - _roi[0] for _roi in left_roi)
@@ -61,7 +61,7 @@ def test_subsample_filenames_and_roi_exact():

     roi_list = [(0, 50), (0, 50), (0, 50), (0, 50)]

-    total_chunk_roi_length = sum([roi[1] - roi[0] for roi in roi_list])
+    total_chunk_roi_length = sum(roi[1] - roi[0] for roi in roi_list)

     subsample = 0.5
@@ -71,5 +71,5 @@

     assert len(roi_list) == 2
     assert len(right_roi) == 2
-    assert target == sum([roi[1] - roi[0] for roi in roi_list])
+    assert target == sum(roi[1] - roi[0] for roi in roi_list)
     assert (total_chunk_roi_length - target) == np.sum(right_roi)