From 93a5a5449cb4ca11d03ad20de1f66802b2bb359e Mon Sep 17 00:00:00 2001 From: Jirka B Date: Wed, 18 Sep 2024 13:52:25 +0200 Subject: [PATCH 1/5] improve hint readability --- src/litdata/imports.py | 2 +- src/litdata/processing/functions.py | 4 ++-- src/litdata/processing/utilities.py | 4 ++-- src/litdata/streaming/dataloader.py | 4 ++-- src/litdata/streaming/dataset.py | 2 +- src/litdata/streaming/resolver.py | 28 +++++++++++----------- src/litdata/streaming/serializers.py | 2 +- src/litdata/utilities/dataset_utilities.py | 2 +- 8 files changed, 24 insertions(+), 24 deletions(-) diff --git a/src/litdata/imports.py b/src/litdata/imports.py index 3d415569..b4288ed9 100644 --- a/src/litdata/imports.py +++ b/src/litdata/imports.py @@ -97,7 +97,7 @@ def _check_requirement(self) -> None: self.message = f"Requirement {self.requirement!r} met" except Exception as ex: self.available = False - self.message = f"{ex.__class__.__name__}: {ex}. HINT: Try running `pip install -U {self.requirement!r}`" + self.message = f"{ex.__class__.__name__}: {ex}.\n HINT: Try running `pip install -U {self.requirement!r}`" requirement_contains_version_specifier = any(c in self.requirement for c in "=<>") if not requirement_contains_version_specifier or self.module is not None: module = self.requirement if self.module is None else self.module diff --git a/src/litdata/processing/functions.py b/src/litdata/processing/functions.py index 24b7564d..dd62909a 100644 --- a/src/litdata/processing/functions.py +++ b/src/litdata/processing/functions.py @@ -254,7 +254,7 @@ def map( if _output_dir.url and "cloudspaces" in _output_dir.url: raise ValueError( f"The provided `output_dir` isn't valid. Found {_output_dir.path if _output_dir else None}." - " HINT: You can either use `/teamspace/s3_connections/...` or `/teamspace/datasets/...`." + "\n HINT: You can either use `/teamspace/s3_connections/...` or `/teamspace/datasets/...`." 
) if error_when_not_empty: @@ -400,7 +400,7 @@ def optimize( if _output_dir.url is not None and "cloudspaces" in _output_dir.url: raise ValueError( f"The provided `output_dir` isn't valid. Found {_output_dir.path}." - " HINT: You can either use `/teamspace/s3_connections/...` or `/teamspace/datasets/...`." + "\n HINT: You can either use `/teamspace/s3_connections/...` or `/teamspace/datasets/...`." ) _assert_dir_has_index_file(_output_dir, mode=mode, use_checkpoint=use_checkpoint) diff --git a/src/litdata/processing/utilities.py b/src/litdata/processing/utilities.py index 9547ec66..a50b9652 100644 --- a/src/litdata/processing/utilities.py +++ b/src/litdata/processing/utilities.py @@ -46,7 +46,7 @@ def _create_dataset( project_id = os.getenv("LIGHTNING_CLOUD_PROJECT_ID", None) cluster_id = os.getenv("LIGHTNING_CLUSTER_ID", None) user_id = os.getenv("LIGHTNING_USER_ID", None) - cloud_space_id = os.getenv("LIGHTNING_CLOUD_SPACE_ID", None) + studio_id = os.getenv("LIGHTNING_CLOUD_SPACE_ID", None) lightning_app_id = os.getenv("LIGHTNING_CLOUD_APP_ID", None) if project_id is None: @@ -64,7 +64,7 @@ def _create_dataset( try: client.dataset_service_create_dataset( body=ProjectIdDatasetsBody( - cloud_space_id=cloud_space_id if lightning_app_id is None else None, + cloud_space_id=studio_id if lightning_app_id is None else None, cluster_id=cluster_id, creator_id=user_id, empty=empty, diff --git a/src/litdata/streaming/dataloader.py b/src/litdata/streaming/dataloader.py index 3f0bf584..1612f55d 100644 --- a/src/litdata/streaming/dataloader.py +++ b/src/litdata/streaming/dataloader.py @@ -103,7 +103,7 @@ def __getitem__(self, index: int) -> Any: if not _equal_items(data_1, data2): raise ValueError( f"Your dataset items aren't deterministic. Found {data_1} and {data2} for index {index}." - " HINT: Use the `litdata.cache.Cache` directly within your dataset." + "\n HINT: Use the `litdata.cache.Cache` directly within your dataset." 
) self._is_deterministic = True self._cache[index] = data_1 @@ -115,7 +115,7 @@ class CacheCollateFn: During the chunking phase, there is no need to return any data from the DataLoader reducing some time. - Additionally, if the user makes their __getitem__ asynchronous, the collate executes them in parallel. + Additionally, if the user makes their __getitem__ asynchronous, collate executes them in parallel. """ diff --git a/src/litdata/streaming/dataset.py b/src/litdata/streaming/dataset.py index df2228aa..ea82ce7a 100644 --- a/src/litdata/streaming/dataset.py +++ b/src/litdata/streaming/dataset.py @@ -177,7 +177,7 @@ def _create_cache(self, worker_env: _WorkerEnv) -> Cache: if not cache.filled: raise ValueError( f"The provided dataset `{self.input_dir}` doesn't contain any {_INDEX_FILENAME} file." - " HINT: Did you successfully optimize a dataset to the provided `input_dir`?" + "\n HINT: Did you successfully optimize a dataset to the provided `input_dir`?" ) return cache diff --git a/src/litdata/streaming/resolver.py b/src/litdata/streaming/resolver.py index 1e388745..6c8dc9d3 100644 --- a/src/litdata/streaming/resolver.py +++ b/src/litdata/streaming/resolver.py @@ -169,16 +169,16 @@ def _resolve_datasets(dir_path: str) -> Dir: # Get the ids from env variables cluster_id = os.getenv("LIGHTNING_CLUSTER_ID", None) project_id = os.getenv("LIGHTNING_CLOUD_PROJECT_ID", None) - cloud_space_id = os.getenv("LIGHTNING_CLOUD_SPACE_ID", None) + studio_id = os.getenv("LIGHTNING_CLOUD_SPACE_ID", None) if cluster_id is None: - raise RuntimeError("The `cluster_id` couldn't be found from the environment variables.") + raise RuntimeError("The `LIGHTNING_CLUSTER_ID` couldn't be found from the environment variables.") if project_id is None: - raise RuntimeError("The `project_id` couldn't be found from the environment variables.") + raise RuntimeError("The `LIGHTNING_CLOUD_PROJECT_ID` couldn't be found from the environment variables.") - if cloud_space_id is None: - raise 
RuntimeError("The `cloud_space_id` couldn't be found from the environment variables.") + if studio_id is None: + raise RuntimeError("The `LIGHTNING_CLOUD_SPACE_ID` couldn't be found from the environment variables.") clusters = client.cluster_service_list_project_clusters(project_id).clusters @@ -187,17 +187,17 @@ def _resolve_datasets(dir_path: str) -> Dir: for cloudspace in client.cloud_space_service_list_cloud_spaces( project_id=project_id, cluster_id=cluster_id ).cloudspaces - if cloudspace.id == cloud_space_id + if cloudspace.id == studio_id ] if not target_cloud_space: - raise ValueError(f"We didn't find any matching Studio for the provided id `{cloud_space_id}`.") + raise ValueError(f"We didn't find any matching Studio for the provided id `{studio_id}`.") target_cluster = [cluster for cluster in clusters if cluster.id == target_cloud_space[0].cluster_id] if not target_cluster: raise ValueError( - f"We didn't find a matching cluster associated with the id {target_cloud_space[0].cluster_id}." + f"We didn't find a matching cluster associated with the id `{target_cloud_space[0].cluster_id}`." ) return Dir( @@ -211,7 +211,7 @@ def _resolve_datasets(dir_path: str) -> Dir: def _assert_dir_is_empty(output_dir: Dir, append: bool = False, overwrite: bool = False) -> None: if not isinstance(output_dir, Dir): - raise ValueError("The provided output_dir isn't a Dir Object.") + raise ValueError("The provided output_dir isn't a `Dir` Object.") if output_dir.url is None: return @@ -234,7 +234,7 @@ def _assert_dir_is_empty(output_dir: Dir, append: bool = False, overwrite: bool if objects["KeyCount"] > 0: raise RuntimeError( f"The provided output_dir `{output_dir.path}` already contains data and datasets are meant to be immutable." - " HINT: Did you consider changing the `output_dir` with your own versioning as a suffix?" + "\n HINT: Did you consider changing the `output_dir` with your own versioning as a suffix?" 
) @@ -261,8 +261,8 @@ def _assert_dir_has_index_file( if os.path.exists(index_file) and mode is None: raise RuntimeError( f"The provided output_dir `{output_dir.path}` already contains an optimized immutable datasets." - " HINT: Did you consider changing the `output_dir` with your own versioning as a suffix?" - " HINT: If you want to append/overwrite to the existing dataset, use `mode='append | overwrite'`." + "\n HINT: Did you consider changing the `output_dir` with your own versioning as a suffix?" + "\n HINT: If you want to append/overwrite to the existing dataset, use `mode='append | overwrite'`." ) # delete index.json file and chunks @@ -310,8 +310,8 @@ def _assert_dir_has_index_file( if has_index_file and mode is None: raise RuntimeError( f"The provided output_dir `{output_dir.path}` already contains an optimized immutable datasets." - " HINT: Did you consider changing the `output_dir` with your own versioning as a suffix?" - " HINT: If you want to append/overwrite to the existing dataset, use `mode='append | overwrite'`." + "\n HINT: Did you consider changing the `output_dir` with your own versioning as a suffix?" + "\n HINT: If you want to append/overwrite to the existing dataset, use `mode='append | overwrite'`." ) # Delete all the files (including the index file in overwrite mode) diff --git a/src/litdata/streaming/serializers.py b/src/litdata/streaming/serializers.py index 57d3e85e..2b1baa79 100644 --- a/src/litdata/streaming/serializers.py +++ b/src/litdata/streaming/serializers.py @@ -103,7 +103,7 @@ def serialize(self, item: Any) -> Tuple[bytes, Optional[str]]: if isinstance(item, JpegImageFile): if not hasattr(item, "filename"): raise ValueError( - "The JPEG Image's filename isn't defined. HINT: Open the image in your Dataset __getitem__ method." + "The JPEG Image's filename isn't defined.\n HINT: Open the image in your Dataset __getitem__ method." 
) if item.filename and os.path.isfile(item.filename): # read the content of the file directly diff --git a/src/litdata/utilities/dataset_utilities.py b/src/litdata/utilities/dataset_utilities.py index 0c482074..a23d9e3f 100644 --- a/src/litdata/utilities/dataset_utilities.py +++ b/src/litdata/utilities/dataset_utilities.py @@ -64,7 +64,7 @@ def subsample_streaming_dataset( else: raise ValueError( f"The provided dataset `{input_dir.path}` doesn't contain any {_INDEX_FILENAME} file." - " HINT: Did you successfully optimize a dataset to the provided `input_dir`?" + "\n HINT: Did you successfully optimize a dataset to the provided `input_dir`?" ) assert len(original_chunks) > 0, f"No chunks found in the `{input_dir}/index.json` file" From 95d9c0284fdaabbf7763e105990fcaee9e67783c Mon Sep 17 00:00:00 2001 From: Jirka B Date: Wed, 18 Sep 2024 13:56:02 +0200 Subject: [PATCH 2/5] lint --- src/litdata/streaming/serializers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/litdata/streaming/serializers.py b/src/litdata/streaming/serializers.py index 2b1baa79..0624e739 100644 --- a/src/litdata/streaming/serializers.py +++ b/src/litdata/streaming/serializers.py @@ -103,7 +103,8 @@ def serialize(self, item: Any) -> Tuple[bytes, Optional[str]]: if isinstance(item, JpegImageFile): if not hasattr(item, "filename"): raise ValueError( - "The JPEG Image's filename isn't defined.\n HINT: Open the image in your Dataset __getitem__ method." + "The JPEG Image's filename isn't defined." + "\n HINT: Open the image in your Dataset `__getitem__` method." ) if item.filename and os.path.isfile(item.filename): # read the content of the file directly From 9a818e435ff4c3dbd61f4d8420dddc789988a633 Mon Sep 17 00:00:00 2001 From: Jirka B Date: Wed, 18 Sep 2024 14:13:07 +0200 Subject: [PATCH 3/5] more... 
--- src/litdata/streaming/resolver.py | 6 +++--- tests/streaming/test_resolver.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/litdata/streaming/resolver.py b/src/litdata/streaming/resolver.py index 6c8dc9d3..98ce5fef 100644 --- a/src/litdata/streaming/resolver.py +++ b/src/litdata/streaming/resolver.py @@ -108,10 +108,10 @@ def _resolve_studio(dir_path: str, target_name: Optional[str], target_id: Option project_id = os.getenv("LIGHTNING_CLOUD_PROJECT_ID", None) if cluster_id is None: - raise RuntimeError("The `cluster_id` couldn't be found from the environment variables.") + raise RuntimeError("The `LIGHTNING_CLUSTER_ID` couldn't be found from the environment variables.") if project_id is None: - raise RuntimeError("The `project_id` couldn't be found from the environment variables.") + raise RuntimeError("The `LIGHTNING_CLOUD_PROJECT_ID` couldn't be found from the environment variables.") clusters = client.cluster_service_list_project_clusters(project_id).clusters @@ -147,7 +147,7 @@ def _resolve_s3_connections(dir_path: str) -> Dir: # Get the ids from env variables project_id = os.getenv("LIGHTNING_CLOUD_PROJECT_ID", None) if project_id is None: - raise RuntimeError("The `project_id` couldn't be found from the environment variables.") + raise RuntimeError("The `LIGHTNING_CLOUD_PROJECT_ID` couldn't be found from the environment variables.") target_name = dir_path.split("/")[3] diff --git a/tests/streaming/test_resolver.py b/tests/streaming/test_resolver.py index 742f066d..164120f5 100644 --- a/tests/streaming/test_resolver.py +++ b/tests/streaming/test_resolver.py @@ -23,7 +23,7 @@ def test_src_resolver_s3_connections(monkeypatch, lightning_cloud_mock): auth = login.Auth() auth.save(user_id="7c8455e3-7c5f-4697-8a6d-105971d6b9bd", api_key="e63fae57-2b50-498b-bc46-d6204cbf330e") - with pytest.raises(RuntimeError, match="`project_id` couldn't be found from the environment variables."): + with pytest.raises(RuntimeError, 
match="`LIGHTNING_CLOUD_PROJECT_ID` couldn't be found from the environment variables."): resolver._resolve_dir("/teamspace/s3_connections/imagenet") monkeypatch.setenv("LIGHTNING_CLOUD_PROJECT_ID", "project_id") From 9e74f307a708372702598c7af22c138cde24273c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 18 Sep 2024 12:14:19 +0000 Subject: [PATCH 4/5] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/streaming/test_resolver.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/streaming/test_resolver.py b/tests/streaming/test_resolver.py index 164120f5..661c4bce 100644 --- a/tests/streaming/test_resolver.py +++ b/tests/streaming/test_resolver.py @@ -23,7 +23,9 @@ def test_src_resolver_s3_connections(monkeypatch, lightning_cloud_mock): auth = login.Auth() auth.save(user_id="7c8455e3-7c5f-4697-8a6d-105971d6b9bd", api_key="e63fae57-2b50-498b-bc46-d6204cbf330e") - with pytest.raises(RuntimeError, match="`LIGHTNING_CLOUD_PROJECT_ID` couldn't be found from the environment variables."): + with pytest.raises( + RuntimeError, match="`LIGHTNING_CLOUD_PROJECT_ID` couldn't be found from the environment variables." 
+ ): resolver._resolve_dir("/teamspace/s3_connections/imagenet") monkeypatch.setenv("LIGHTNING_CLOUD_PROJECT_ID", "project_id") From 1881c0e3f528c9b2725eca3b1a50636fa890adc7 Mon Sep 17 00:00:00 2001 From: Jirka B Date: Wed, 18 Sep 2024 14:34:02 +0200 Subject: [PATCH 5/5] tests --- tests/streaming/test_resolver.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/streaming/test_resolver.py b/tests/streaming/test_resolver.py index 164120f5..62b664dd 100644 --- a/tests/streaming/test_resolver.py +++ b/tests/streaming/test_resolver.py @@ -60,12 +60,12 @@ def test_src_resolver_studios(monkeypatch, lightning_cloud_mock): auth = login.Auth() auth.save(user_id="7c8455e3-7c5f-4697-8a6d-105971d6b9bd", api_key="e63fae57-2b50-498b-bc46-d6204cbf330e") - with pytest.raises(RuntimeError, match="`cluster_id`"): + with pytest.raises(RuntimeError, match="`LIGHTNING_CLUSTER_ID`"): resolver._resolve_dir("/teamspace/studios/other_studio") monkeypatch.setenv("LIGHTNING_CLUSTER_ID", "cluster_id") - with pytest.raises(RuntimeError, match="`project_id`"): + with pytest.raises(RuntimeError, match="`LIGHTNING_CLOUD_PROJECT_ID`"): resolver._resolve_dir("/teamspace/studios/other_studio") monkeypatch.setenv("LIGHTNING_CLOUD_PROJECT_ID", "project_id") @@ -138,17 +138,17 @@ def test_src_resolver_datasets(monkeypatch, lightning_cloud_mock): assert resolver._resolve_dir("s3://bucket_name").url == "s3://bucket_name" - with pytest.raises(RuntimeError, match="`cluster_id`"): + with pytest.raises(RuntimeError, match="`LIGHTNING_CLUSTER_ID`"): resolver._resolve_dir("/teamspace/datasets/imagenet") monkeypatch.setenv("LIGHTNING_CLUSTER_ID", "cluster_id") - with pytest.raises(RuntimeError, match="`project_id`"): + with pytest.raises(RuntimeError, match="`LIGHTNING_CLOUD_PROJECT_ID`"): resolver._resolve_dir("/teamspace/datasets/imagenet") monkeypatch.setenv("LIGHTNING_CLOUD_PROJECT_ID", "project_id") - with pytest.raises(RuntimeError, match="`cloud_space_id`"): + with 
pytest.raises(RuntimeError, match="`LIGHTNING_CLOUD_SPACE_ID`"): resolver._resolve_dir("/teamspace/datasets/imagenet") monkeypatch.setenv("LIGHTNING_CLOUD_SPACE_ID", "cloud_space_id")