Skip to content

Commit

Permalink
Merge branch 'main' into bugfix/resume-dataloader
Browse files: browse the repository at this point in the history
  • Loading branch information
awaelchli authored Jul 11, 2024
2 parents d4594fb + c4c9117 commit 44e73d3
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 2 deletions.
3 changes: 1 addition & 2 deletions src/litdata/processing/data_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -1096,8 +1096,7 @@ def run(self, data_recipe: DataRecipe) -> None:
print("Workers are finished.")
result = data_recipe._done(len(user_items), self.delete_cached_files, self.output_dir)

- if num_nodes == node_rank + 1 and self.output_dir.url and _IS_IN_STUDIO:
- assert self.output_dir.path
+ if num_nodes == node_rank + 1 and self.output_dir.url and self.output_dir.path is not None and _IS_IN_STUDIO:
_create_dataset(
input_dir=self.input_dir.path,
storage_dir=self.output_dir.path,
Expand Down
13 changes: 13 additions & 0 deletions src/litdata/processing/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
from litdata.processing.data_processor import DataChunkRecipe, DataProcessor, DataTransformRecipe
from litdata.processing.readers import BaseReader
from litdata.processing.utilities import (
_get_work_dir,
extract_rank_and_index_from_filename,
optimize_dns_context,
read_index_file_content,
Expand Down Expand Up @@ -372,8 +373,20 @@ def optimize(
)

if num_nodes is None or int(os.getenv("DATA_OPTIMIZER_NUM_NODES", 0)) > 0:
DATA_OPTIMIZER_NUM_NODES = int(os.getenv("DATA_OPTIMIZER_NUM_NODES", 0))
_output_dir: Dir = _resolve_dir(output_dir)

if (
_output_dir.url is None
and _output_dir.path
and _output_dir.path.startswith("/teamspace/studios/this_studio")
and DATA_OPTIMIZER_NUM_NODES > 0
):
assert _output_dir.path
output_dir = _output_dir.path.replace("/teamspace/studios/this_studio", "")
output_dir = _get_work_dir().lstrip("/").rstrip("/") + "/" + output_dir.lstrip("/").rstrip("/")
_output_dir = _resolve_dir(output_dir)

if _output_dir.url is not None and "cloudspaces" in _output_dir.url:
raise ValueError(
f"The provided `output_dir` isn't valid. Found {_output_dir.path}."
Expand Down

0 comments on commit 44e73d3

Please sign in to comment.