Skip to content

Commit

Permalink
Speed up the search for chunks to skip deletion for (#312)
Browse files Browse the repository at this point in the history
  • Loading branch information
awaelchli authored Aug 7, 2024
1 parent c16d00e commit 428752d
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 2 deletions.
17 changes: 16 additions & 1 deletion src/litdata/utilities/shuffle.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,13 +161,28 @@ def _find_chunks_per_workers_on_which_to_skip_deletion(
counter = 0

while True:
# PART 1: Consume as many batches all at once for every worker and their respective current chunk
if num_of_samples_to_carry_to_next_chunk is None:
sizes = [size for size in workers_interval_sizes_for_this_rank if len(size)]
min_interval_size = min(size[0] for size in sizes)
# -1 here because we need the logic in PART 2 to .pop() the list for the last batch
num_batches = (min_interval_size // batch_size) - 1
num_batches = max(num_batches, 0)
for i in range(len(workers_interval_sizes_for_this_rank)):
if workers_interval_sizes_for_this_rank[i]:
workers_interval_sizes_for_this_rank[i][0] -= num_batches * batch_size
worker_tracker_idx += num_batches * len(sizes)
counter += num_batches * batch_size * len(sizes)

interval_size_of_current_worker = workers_interval_sizes_for_this_rank[worker_tracker_idx % num_workers]
if len(interval_size_of_current_worker) == 0:
worker_tracker_idx += 1
continue

# PART 2: We have leftover samples to consume
# We consume them one by one because we're at the end of a chunk and may have to handle
# a remainder from the previous iteration
num_samples_left_for_this_worker_chunk = interval_size_of_current_worker[0]

# To consume a batch, we want to subtract `batch_size` from the size we have left,
# unless we had a remainder (< batch size) from the previous iteration/chunk
remover = (
Expand Down
11 changes: 10 additions & 1 deletion tests/utilities/test_shuffle.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,15 @@ def test_find_chunks_per_workers_on_which_to_skip_deletion():
)
assert chunks_to_disable == {1: [1]}

# world size = 1, 2 workers sharing one chunk, different sizes with remainders to next chunk
chunks_to_disable = _find_chunks_per_workers_on_which_to_skip_deletion(
num_workers=2,
batch_size=25,
workers_chunks=[[0, 1], [1, 2]],
workers_intervals=[[(0, 0, 70, 100), (0, 0, 55, 100)], [(0, 0, 105, 50), (0, 0, 55, 100)]],
)
assert chunks_to_disable == {1: [0]}

# world size = 1, 4 workers sharing one chunk
chunks_to_disable = _find_chunks_per_workers_on_which_to_skip_deletion(
num_workers=4,
Expand All @@ -241,7 +250,7 @@ def test_find_chunks_per_workers_on_which_to_skip_deletion():
assert chunks_to_disable == {0: [0, 1, 3]}

# world size 2, 2 workers per rank, varying batch size
for batch_size in range(1, 6):
for batch_size in range(1, 7):
chunks_to_disable = _find_chunks_per_workers_on_which_to_skip_deletion(
num_workers=2,
batch_size=batch_size,
Expand Down

0 comments on commit 428752d

Please sign in to comment.