Skip to content

Commit

Permalink
correct the chunk size by adding header size
Browse files Browse the repository at this point in the history
  • Loading branch information
Hao Shen committed Oct 10, 2024
1 parent b9aa903 commit af5db07
Showing 1 changed file with 8 additions and 2 deletions.
10 changes: 8 additions & 2 deletions src/litdata/streaming/item_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,11 +141,17 @@ def load_item_from_chunk(
del self._chunk_filepaths[chunk_filepath]

if chunk_filepath not in self._chunk_filepaths:
exists = os.path.exists(chunk_filepath) and os.stat(chunk_filepath).st_size >= chunk_bytes
# Compute the size of the chunk header in bytes.
# The header holds one field for the number of items plus the item offsets
# (number of items in the chunk + 1), each encoded as np.uint32 (4 bytes).
chunk_header_bytes = (1 + self._chunks[chunk_index]["chunk_size"] + 1) * 4
filesize_bytes = chunk_bytes + chunk_header_bytes

exists = os.path.exists(chunk_filepath) and os.stat(chunk_filepath).st_size >= filesize_bytes

while not exists:
sleep(0.1)
exists = os.path.exists(chunk_filepath) and os.stat(chunk_filepath).st_size >= chunk_bytes
exists = os.path.exists(chunk_filepath) and os.stat(chunk_filepath).st_size >= filesize_bytes

self._chunk_filepaths[chunk_filepath] = True

Expand Down

0 comments on commit af5db07

Please sign in to comment.