Commit d427a8c: "fixing"
Borda committed Jul 18, 2024 (1 parent: 4bdc06c)

Showing 12 changed files with 29 additions and 34 deletions.
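Nearly every hunk below makes the same mechanical change: a list comprehension passed to sum() becomes a generator expression. Both forms compute the same total, but the generator feeds sum() lazily instead of building a throwaway list first. A minimal sketch of the equivalence, using illustrative values rather than anything from the repository:

    chunk_sizes = [10, 20, 30]  # illustrative data

    total_from_list = sum([size for size in chunk_sizes])  # materializes a temporary list
    total_from_gen = sum(size for size in chunk_sizes)     # iterates lazily, no extra list

    assert total_from_list == total_from_gen == 60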
6 changes: 3 additions & 3 deletions src/litdata/processing/data_processor.py
@@ -785,8 +785,8 @@ def _done(self, size: int, delete_cached_files: bool, output_dir: Dir) -> _Resul
         if num_nodes == node_rank + 1:
             config = load_index_file(cache_dir)

-            size = sum([c["dim"] if c["dim"] is not None else c["chunk_size"] for c in config["chunks"]])
-            num_bytes = sum([c["chunk_bytes"] for c in config["chunks"]])
+            size = sum(c["dim"] if c["dim"] is not None else c["chunk_size"] for c in config["chunks"])
+            num_bytes = sum(c["chunk_bytes"] for c in config["chunks"])
             if config["config"] is not None:
                 data_format = tree_unflatten(
                     config["config"]["data_format"], treespec_loads(config["config"]["data_spec"])
@@ -1013,7 +1013,7 @@ def run(self, data_recipe: DataRecipe) -> None:
             workers_user_items = [w[:items_to_keep] for w in workers_user_items]
             print(f"Fast dev run is enabled. Limiting to {items_to_keep} items per process.")

-        num_items = sum([len(items) for items in workers_user_items])
+        num_items = sum(len(items) for items in workers_user_items)

         self._cleanup_cache()
4 changes: 2 additions & 2 deletions src/litdata/streaming/combined.py
@@ -194,7 +194,7 @@ def __init__(
         self._num_samples_yielded = num_samples_yielded or [0 for _ in range(len(datasets))]
         self._original_weights = deepcopy(weights)
         self._weights = deepcopy(weights)
-        self._rng = random.Random(seed)
+        self._rng = random.Random(seed)  # noqa: S311
         self._iterate_over_all = iterate_over_all
         self._is_done = False
@@ -226,7 +226,7 @@ def __next__(self) -> Any:
             self._dataset_indexes[dataset_index] = None
             self._weights[dataset_index] = None  # type: ignore
-            new_sum = sum([w for w in self._weights if w is not None])
+            new_sum = sum(w for w in self._weights if w is not None)
             self._weights = [None if w is None else w / new_sum for w in self._weights]

             # stop on the first iteration
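The one non-mechanical edit in this file is the new `# noqa: S311` marker. Rule S311 (inherited from Bandit) warns that the standard `random` module is not cryptographically secure; that concern does not apply here, where a seeded generator is deliberately used for reproducible weighted sampling across datasets. A standalone sketch of the idea, not the repository's class:

    import random

    # A seeded PRNG gives reproducible draws, which is the point for data sampling;
    # S311 only matters when randomness must be unpredictable (tokens, keys, etc.).
    rng = random.Random(42)  # noqa: S311
    names = ["dataset_a", "dataset_b"]
    draws = [rng.choices(names, weights=[0.7, 0.3])[0] for _ in range(5)]
    print(draws)  # identical output on every run with the same seed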
4 changes: 1 addition & 3 deletions src/litdata/streaming/dataloader.py
@@ -64,9 +64,7 @@ def _equal_item(d1: Any, d2: Any) -> bool:
     equality = d1 == d2
     if isinstance(equality, torch.Tensor):
         return bool(equality.all().item())
-    if equality is True:
-        return True
-    return False
+    return bool(equality)


 class CacheDataset(Dataset):
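Collapsing `if equality is True: return True` / `return False` into `return bool(equality)` is more than a style fix: `d1 == d2` can yield a truthy object that is not the `True` singleton (a NumPy scalar, for example), and such values fail the `is True` identity test. A small illustration, assuming NumPy is available (it is not used by the function above):

    import numpy as np

    equality = np.int64(1) == np.int64(1)  # numpy.bool_, not the built-in True
    print(equality is True)  # False: identity compares objects
    print(bool(equality))    # True: truthiness is what the comparison means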
7 changes: 2 additions & 5 deletions src/litdata/streaming/resolver.py
@@ -87,14 +87,11 @@ def _match_studio(target_id: Optional[str], target_name: Optional[str], cloudspa
     if target_id is not None and cloudspace.id == target_id:
         return True

-    if (
+    return (
         cloudspace.display_name is not None
         and target_name is not None
         and cloudspace.display_name.lower() == target_name.lower()
-    ):
-        return True
-
-    return False
+    )


 def _resolve_studio(dir_path: str, target_name: Optional[str], target_id: Optional[str]) -> Dir:
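Replacing the `if (...): return True` / `return False` ladder with a direct `return (...)` preserves behavior because an `and` chain that ends in a comparison always evaluates to a plain bool: each short-circuit exit yields False, and a full evaluation ends on `==`. A minimal sketch with hypothetical names, not the repository's function:

    from typing import Optional

    def names_match(display_name: Optional[str], target_name: Optional[str]) -> bool:
        # Short-circuits return False; the final `==` returns a real bool.
        return (
            display_name is not None
            and target_name is not None
            and display_name.lower() == target_name.lower()
        )

    assert names_match("My Studio", "my studio") is True
    assert names_match(None, "my studio") is False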
6 changes: 3 additions & 3 deletions src/litdata/streaming/sampler.py
@@ -112,7 +112,7 @@ def __iter_non_distributed__(self) -> Iterator[List[Union[int, ChunkedIndex]]]:
             end = self._dataset_size if is_last else (worker_idx + 1) * worker_size
             worker_indices.append(indices[start:end])

-        assert sum([len(s) for s in worker_indices]) == self._dataset_size
+        assert sum(len(s) for s in worker_indices) == self._dataset_size

         worker_indices_batches = [self._chunk_list(indices, self._batch_size) for indices in worker_indices]
@@ -140,7 +140,7 @@ def __iter_distributed__(self) -> Iterator[List[Union[int, ChunkedIndex]]]:
                 end_worker = replica_size if is_last_worker else (worker_idx + 1) * worker_size
                 worker_indices.append(replica_indices[start_worker:end_worker])

-            assert sum([len(s) for s in worker_indices]) == len(replica_indices)
+            assert sum(len(s) for s in worker_indices) == len(replica_indices)

             worker_indices_batches = [self._chunk_list(indices, self._batch_size) for indices in worker_indices]
@@ -202,7 +202,7 @@ def __iter_indices_per_workers__(
     ) -> Iterator[List[Union[int, ChunkedIndex]]]:
         batches: List[List[Union[int, ChunkedIndex]]] = []
         counter = 0
-        while sum([len(v) for v in indices_per_workers]) != 0:
+        while sum(len(v) for v in indices_per_workers) != 0:
             worker_indices = indices_per_workers[counter % self._num_workers]
             if len(worker_indices) == 0:
                 batches.append([])
4 changes: 2 additions & 2 deletions src/litdata/streaming/writer.py
@@ -241,7 +241,7 @@ def _create_chunk(self, filename: str, on_done: bool = False) -> bytes:
         sample_data = b"".join([item.data for item in items])
         data = num_items.tobytes() + offsets.tobytes() + sample_data

-        current_chunk_bytes = sum([item.bytes for item in items])
+        current_chunk_bytes = sum(item.bytes for item in items)

         # Whether to encrypt the data at the chunk level
         if self._encryption and self._encryption.level == EncryptionLevel.CHUNK:
@@ -260,7 +260,7 @@ def _create_chunk(self, filename: str, on_done: bool = False) -> bytes:
         dim: Optional[int] = None
         if items[0].dim:
-            dim = sum([item.dim if item.dim is not None else 0 for item in items])
+            dim = sum(item.dim if item.dim is not None else 0 for item in items)

         chunk_info = {
             "chunk_bytes": current_chunk_bytes,
2 changes: 1 addition & 1 deletion src/litdata/utilities/dataset_utilities.py
@@ -75,7 +75,7 @@ def subsample_streaming_dataset(
         # checking if subsample is 1, as if user wants complete data, then let shuffler and sampler do the work
         original_chunks, roi = shuffle_lists_together(original_chunks, roi, random_seed_sampler)

-        num_items_to_subsample = int(sum([roi[1] - roi[0] for roi in roi]) * subsample)
+        num_items_to_subsample = int(sum(roi[1] - roi[0] for roi in roi) * subsample)

         subsampled_files, roi, _, _ = subsample_filenames_and_roi(original_chunks, roi, num_items_to_subsample)
2 changes: 1 addition & 1 deletion src/litdata/utilities/shuffle.py
@@ -50,7 +50,7 @@ def _associate_chunks_and_internals_to_ranks(
     num_workers: int = 1,
     batch_size: int = 1,
 ) -> Tuple[List[List[int]], List[Any]]:
-    num_items = sum([(interval[2] - interval[1]) for interval in chunk_intervals])
+    num_items = sum((interval[2] - interval[1]) for interval in chunk_intervals)
     num_items_per_ranks: List[int] = [
         num_items // distributed_env.world_size + num_items % distributed_env.world_size
         if rank == distributed_env.world_size - 1 and not drop_last
2 changes: 1 addition & 1 deletion src/litdata/utilities/train_test_split.py
@@ -66,7 +66,7 @@ def train_test_split(

     new_datasets = [deepcopy(streaming_dataset) for _ in splits]

-    dataset_length = sum([my_roi[1] - my_roi[0] for my_roi in dummy_subsampled_roi])
+    dataset_length = sum(my_roi[1] - my_roi[0] for my_roi in dummy_subsampled_roi)

     subsampled_chunks, dummy_subsampled_roi = shuffle_lists_together(
         subsampled_chunks, dummy_subsampled_roi, np.random.RandomState([seed])
2 changes: 1 addition & 1 deletion tests/streaming/test_writer.py
@@ -183,7 +183,7 @@ def test_binary_writer_with_jpeg_filepath_and_int(tmpdir):
     assert data["chunks"][0]["chunk_size"] == 7
     assert data["chunks"][1]["chunk_size"] == 7
     assert data["chunks"][-1]["chunk_size"] == 2
-    assert sum([chunk["chunk_size"] for chunk in data["chunks"]]) == 100
+    assert sum(chunk["chunk_size"] for chunk in data["chunks"]) == 100

     reader = BinaryReader(cache_dir, max_cache_size=10 ^ 9)
     for i in range(100):
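One untouched context line here deserves a flag: `max_cache_size=10 ^ 9` uses `^`, which is bitwise XOR in Python, so the expression evaluates to 3 rather than the billion that `10 ** 9` would give. The commit leaves that line as it found it; the difference in isolation:

    print(10 ^ 9)   # 3, the XOR of 0b1010 and 0b1001
    print(10 ** 9)  # 1000000000, exponentiation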
16 changes: 8 additions & 8 deletions tests/utilities/test_shuffle.py
@@ -68,10 +68,10 @@ def test_associate_chunks_and_internals_to_ranks():
     )

     assert chunks_per_ranks == [[0, 1], [1, 2], [2, 3, 4, 5], [5, 6, 7]]
-    assert sum([interval[2] - interval[1] for interval in intervals_per_ranks[0]]) == 105
-    assert sum([interval[2] - interval[1] for interval in intervals_per_ranks[1]]) == 105
-    assert sum([interval[2] - interval[1] for interval in intervals_per_ranks[2]]) == 105
-    assert sum([interval[2] - interval[1] for interval in intervals_per_ranks[3]]) == 105
+    assert sum(interval[2] - interval[1] for interval in intervals_per_ranks[0]) == 105
+    assert sum(interval[2] - interval[1] for interval in intervals_per_ranks[1]) == 105
+    assert sum(interval[2] - interval[1] for interval in intervals_per_ranks[2]) == 105
+    assert sum(interval[2] - interval[1] for interval in intervals_per_ranks[3]) == 105

     assert intervals_per_ranks == [
         [[0, 0, 50, 50], [0, 0, 55, 150]],
@@ -99,10 +99,10 @@ def test_associate_chunks_and_internals_to_ranks():
     )

     assert chunks_per_ranks == [[0, 1], [1], [1, 2, 3, 4, 5], [5, 6, 7]]
-    assert sum([interval[2] - interval[1] for interval in intervals_per_ranks[0]]) == 64
-    assert sum([interval[2] - interval[1] for interval in intervals_per_ranks[1]]) == 64
-    assert sum([interval[2] - interval[1] for interval in intervals_per_ranks[2]]) == 64
-    assert sum([interval[2] - interval[1] for interval in intervals_per_ranks[3]]) == 64
+    assert sum(interval[2] - interval[1] for interval in intervals_per_ranks[0]) == 64
+    assert sum(interval[2] - interval[1] for interval in intervals_per_ranks[1]) == 64
+    assert sum(interval[2] - interval[1] for interval in intervals_per_ranks[2]) == 64
+    assert sum(interval[2] - interval[1] for interval in intervals_per_ranks[3]) == 64
     assert intervals_per_ranks == [
         [[0, 0, 5, 5], [0, 0, 59, 150]],
         [[0, 59, 123, 150]],
8 changes: 4 additions & 4 deletions tests/utilities/test_subsample.py
@@ -39,15 +39,15 @@ def test_subsample_filenames_and_roi():

     roi_list = [(0, 50), (0, 25), (0, 75), (0, 35), (0, 5)]

-    total_chunk_roi_length = sum([roi[1] - roi[0] for roi in roi_list])
+    total_chunk_roi_length = sum(roi[1] - roi[0] for roi in roi_list)

     subsample = 0.42

     target = int(total_chunk_roi_length * subsample)

     _, roi_list, _, left_roi = subsample_filenames_and_roi(my_chunks, roi_list, target)

-    assert target == sum([roi[1] - roi[0] for roi in roi_list])
+    assert target == sum(roi[1] - roi[0] for roi in roi_list)
     assert (total_chunk_roi_length - target) == sum(_roi[1] - _roi[0] for _roi in left_roi)
@@ -61,7 +61,7 @@ def test_subsample_filenames_and_roi_exact():

     roi_list = [(0, 50), (0, 50), (0, 50), (0, 50)]

-    total_chunk_roi_length = sum([roi[1] - roi[0] for roi in roi_list])
+    total_chunk_roi_length = sum(roi[1] - roi[0] for roi in roi_list)

     subsample = 0.5
@@ ... @@

     assert len(roi_list) == 2
     assert len(right_roi) == 2
-    assert target == sum([roi[1] - roi[0] for roi in roi_list])
+    assert target == sum(roi[1] - roi[0] for roi in roi_list)
     assert (total_chunk_roi_length - target) == np.sum(right_roi)
