ci/lint: fixing & update
Borda committed Mar 14, 2024
1 parent 4e1f9bc commit 3261e37
Showing 24 changed files with 28 additions and 70 deletions.
6 changes: 2 additions & 4 deletions .github/workflows/ci-checks.yml
@@ -11,8 +11,6 @@ concurrency:
cancel-in-progress: ${{ ! (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release/')) }}

jobs:
check-precommit:
uses: Lightning-AI/utilities/.github/workflows/check-precommit.yml@main

check-typing:
# TODO: switch to main after fix lands
@@ -21,12 +19,12 @@
actions-ref: main

check-schema:
uses: Lightning-AI/utilities/.github/workflows/check-schema.yml@main
uses: Lightning-AI/utilities/.github/workflows/check-schema.yml@v0.10.1
with:
azure-dir: ""

check-package:
uses: Lightning-AI/utilities/.github/workflows/check-package.yml@main
uses: Lightning-AI/utilities/.github/workflows/check-package.yml@v0.10.1
with:
actions-ref: v0.10.1
import-name: "litdata"
20 changes: 0 additions & 20 deletions pyproject.toml
@@ -79,29 +79,14 @@ lint.ignore-init-module-imports = true
[tool.ruff.lint.per-file-ignores]
".actions/*" = ["S101", "S310"]
"setup.py" = ["S101", "SIM115"]
"examples/**" = [
"S101", # Use of `assert` detected
"S113", # todo: Probable use of requests call without
"S104", # Possible binding to all interface
"F821", # Undefined name `...`
"S311", # Standard pseudo-random generators are not suitable for cryptographic purposes
"S501", # Probable use of `requests` call with `verify=False` disabling SSL certificate checks
"S108", # Probable insecure usage of temporary file or directory: "/tmp/data/MNIST"
]
"src/**" = [
"S101", # todo: Use of `assert` detected
"S105", "S106", "S107", # todo: Possible hardcoded password: ...
"S113", # todo: Probable use of requests call without timeout
"S301", # todo: `pickle` and modules that wrap it can be unsafe when used to deserialize untrusted data, possible security issue
"S324", # todo: Probable use of insecure hash functions in `hashlib`
"S403", # todo: `pickle`, `cPickle`, `dill`, and `shelve` modules are possibly insecure
"S404", # todo: `subprocess` module is possibly insecure
"S602", # todo: `subprocess` call with `shell=True` identified, security issue
"S603", # todo: `subprocess` call: check for execution of untrusted input
"S605", # todo: Starting a process with a shell: seems safe, but may be changed in the future; consider rewriting without `shell`
"S607", # todo: Starting a process with a partial executable path
"RET504", # todo:Unnecessary variable assignment before `return` statement
"RET503",
"S310", # todo: Audit URL open for permitted schemes. Allowing use of `file:` or custom schemes is often unexpected.
]
"tests/**" = [
@@ -118,11 +103,6 @@ lint.ignore-init-module-imports = true
"S603", # todo: `subprocess` call: check for execution of untrusted input
"S605", # todo: Starting a process with a shell: seems safe, but may be changed in the future; consider rewriting without `shell`
"S607", # todo: Starting a process with a partial executable path
"RET504", # todo:Unnecessary variable assignment before `return` statement
"PT004", # todo: Fixture `tmpdir_unittest_fixture` does not return anything, add leading underscore
"PT011", # todo: `pytest.raises(ValueError)` is too broad, set the `match` parameter or use a more specific exception
"PT012", # todo: `pytest.raises()` block should contain a single simple statement
"PT019", # todo: Fixture `_` without value is injected as parameter, use `@pytest.mark.usefixtures` instead
]

[tool.ruff.lint.mccabe]
4 changes: 1 addition & 3 deletions src/litdata/processing/functions.py
@@ -42,14 +42,12 @@
def _get_indexed_paths(data: Any) -> Dict[int, str]:
flattened_item, _ = tree_flatten(data)

indexed_paths = {
return {
index: element
for index, element in enumerate(flattened_item)
if isinstance(element, str) and os.path.exists(element)
}

return indexed_paths


def _get_input_dir(inputs: Sequence[Any]) -> Optional[str]:
indexed_paths = _get_indexed_paths(inputs[0])
5 changes: 2 additions & 3 deletions src/litdata/processing/utilities.py
@@ -95,7 +95,7 @@ def _wrapper(*args: Any, **kwargs: Any) -> Tuple[Any, Optional[Exception]]:
def make_request(
url: str,
timeout: int = 10,
user_agent_token: str = "pytorch-lightning",
user_agent_token: str = "pytorch-lightning", # noqa: S107
) -> io.BytesIO:
"""Download an image with urllib."""
user_agent_string = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0"
@@ -105,8 +105,7 @@ def make_request(
with urllib.request.urlopen(
urllib.request.Request(url, data=None, headers={"User-Agent": user_agent_string}), timeout=timeout
) as r:
img_stream = io.BytesIO(r.read())
return img_stream
return io.BytesIO(r.read())


@contextmanager
6 changes: 2 additions & 4 deletions src/litdata/streaming/dataset.py
@@ -319,7 +319,7 @@ def state_dict(self, num_samples_yielded: int, num_workers: int, batch_size: int
self._state_dict["num_samples_yielded"] = num_samples_yielded
return self._state_dict

state = {
return {
"num_samples_yielded": num_samples_yielded,
"num_workers": num_workers,
"batch_size": batch_size,
@@ -333,8 +333,6 @@ def state_dict(self, num_samples_yielded: int, num_workers: int, batch_size: int
"shuffle": self.shuffle,
}

return state

def load_state_dict(self, state_dict: Dict[str, Any]) -> None:
if state_dict:
# the state is restored within the workers
@@ -402,7 +400,7 @@ def _validate_state_dict(self) -> None:


def _try_create_cache_dir(input_dir: Optional[str]) -> Optional[str]:
hash_object = hashlib.md5((input_dir or "").encode())
hash_object = hashlib.md5((input_dir or "").encode()) # noqa: S324
if "LIGHTNING_CLUSTER_ID" not in os.environ or "LIGHTNING_CLOUD_PROJECT_ID" not in os.environ:
cache_dir = os.path.join(_DEFAULT_CACHE_DIR, hash_object.hexdigest())
os.makedirs(cache_dir, exist_ok=True)
2 changes: 1 addition & 1 deletion src/litdata/streaming/reader.py
@@ -36,7 +36,7 @@
logger = Logger(__name__)


_END_TOKEN = "END"
_END_TOKEN = "END" # noqa: S105

# Note: The timeout here should not be too short. We need to prevent the caller from aggressively
# querying the queue and consuming too many CPU cycles.
2 changes: 1 addition & 1 deletion src/litdata/streaming/serializers.py
@@ -272,7 +272,7 @@ def serialize(self, item: Any) -> Tuple[bytes, Optional[str]]:
return pickle.dumps(item), None

def deserialize(self, data: bytes) -> Any:
return pickle.loads(data)
return pickle.loads(data) # noqa: S301

def can_serialize(self, _: Any) -> bool:
return True
16 changes: 8 additions & 8 deletions src/litdata/streaming/writer.py
@@ -130,14 +130,13 @@ def rank(self) -> int:

def get_config(self) -> Dict[str, Any]:
"""Returns the config of the writer."""
out = {
return {
"compression": self._compression,
"chunk_size": self._chunk_size,
"chunk_bytes": self._chunk_bytes,
"data_format": self._data_format,
"data_spec": treespec_dumps(self._data_spec) if self._data_spec else None,
}
return out

def serialize(self, items: Any) -> Tuple[bytes, Optional[int]]:
"""Serialize a dictionary into its binary format."""
@@ -291,12 +290,13 @@ def add_item(self, index: int, items: Any) -> Optional[str]:
dim=dim,
)

if self._should_write():
filepath = os.path.join(self._cache_dir, self.get_chunk_filename())
self.write_chunk()
self._min_index = None
self._max_index = None
return filepath
if not self._should_write():
return None
filepath = os.path.join(self._cache_dir, self.get_chunk_filename())
self.write_chunk()
self._min_index = None
self._max_index = None
return filepath

def _should_write(self) -> bool:
# TODO: Misleading method name, it modifies `self._min_index` and `self._max_index`!
4 changes: 2 additions & 2 deletions src/litdata/utilities/broadcast.py
@@ -134,7 +134,7 @@ def set_and_get(self, key: str, value: Any) -> Any:

if resp.status_code != 200:
raise RuntimeError(f"Failed to broadcast the following {key=} {value=}.")
return pickle.loads(bytes(resp.json()["value"], "utf-8"))
return pickle.loads(bytes(resp.json()["value"], "utf-8")) # noqa: S301


def broadcast_object(key: str, obj: Any) -> Any:
@@ -151,7 +151,7 @@ def _get_token() -> Optional[str]:

payload = {"apiKey": os.getenv("LIGHTNING_API_KEY"), "username": os.getenv("LIGHTNING_USERNAME")}
url_login = os.getenv("LIGHTNING_CLOUD_URL", "") + "/v1/auth/login"
res = requests.post(url_login, data=json.dumps(payload))
res = requests.post(url_login, data=json.dumps(payload)) # noqa: S113
if "token" not in res.json():
raise RuntimeError(
f"You haven't properly setup your environment variables with {url_login} and data: \n{payload}"
2 changes: 1 addition & 1 deletion tests/conftest.py
@@ -3,7 +3,7 @@


@pytest.fixture(autouse=True)
def teardown_process_group():
def teardown_process_group(): # noqa: PT004
"""Ensures that the distributed process group gets closed before the next test runs."""
yield
if torch.distributed.is_available() and torch.distributed.is_initialized():
4 changes: 1 addition & 3 deletions tests/processing/test_data_processor.py
@@ -11,7 +11,6 @@
import torch
from lightning import seed_everything
from lightning_utilities.core.imports import RequirementCache

from litdata.constants import _TORCH_AUDIO_AVAILABLE, _ZSTD_AVAILABLE
from litdata.processing import data_processor as data_processor_module
from litdata.processing import functions
@@ -1077,8 +1076,7 @@ def create_synthetic_audio_bytes(index) -> dict:
torchaudio.save(f, data, 16000, format="wav")
data = f.getvalue()

data = {"content": data}
return data
return {"content": data}


@pytest.mark.skipif(condition=not _TORCH_AUDIO_AVAILABLE or not _ZSTD_AVAILABLE, reason="Requires: ['torchaudio']")
1 change: 0 additions & 1 deletion tests/processing/test_functions.py
@@ -3,7 +3,6 @@
from unittest import mock

import pytest

from litdata import walk
from litdata.processing.functions import _get_input_dir

1 change: 0 additions & 1 deletion tests/processing/test_readers.py
@@ -2,7 +2,6 @@
import sys

import pytest

from litdata import map
from litdata.processing.readers import _PYARROW_AVAILABLE, BaseReader, ParquetReader

7 changes: 3 additions & 4 deletions tests/streaming/test_cache.py
@@ -23,14 +23,13 @@
from lightning.pytorch.demos.boring_classes import RandomDataset
from lightning_utilities.core.imports import RequirementCache
from lightning_utilities.test.warning import no_warning_call
from torch.utils.data import Dataset

from litdata.streaming import Cache
from litdata.streaming.dataloader import CacheDataLoader
from litdata.streaming.dataset import StreamingDataset
from litdata.streaming.item_loader import TokensLoader
from litdata.streaming.serializers import Serializer
from litdata.utilities.env import _DistributedEnv
from torch.utils.data import Dataset

_PIL_AVAILABLE = RequirementCache("PIL")
_TORCH_VISION_AVAILABLE = RequirementCache("torchvision")
@@ -219,8 +218,8 @@ def __len__(self) -> int:
os.makedirs(os.path.join(tmpdir, "cache_2"), exist_ok=True)
dataset = RandomDatasetAtRuntime(64, 64)
dataloader = CacheDataLoader(dataset, cache_dir=os.path.join(tmpdir, "cache_2"), chunk_bytes=2 << 12)
with pytest.raises(ValueError, match="Your dataset items aren't deterministic"):
for batch in dataloader:
with pytest.raises(ValueError, match="Your dataset items aren't deterministic"): # noqa: PT012
for _ in dataloader:
pass


1 change: 0 additions & 1 deletion tests/streaming/test_client.py
@@ -3,7 +3,6 @@
from unittest import mock

import pytest

from litdata.streaming import client


5 changes: 2 additions & 3 deletions tests/streaming/test_combined.py
@@ -4,13 +4,12 @@

import pytest
import torch
from torch.utils.data import IterableDataset
from torch.utils.data.dataloader import DataLoader

from litdata.streaming.cache import Cache
from litdata.streaming.combined import CombinedStreamingDataset
from litdata.streaming.dataloader import StreamingDataLoader
from litdata.streaming.dataset import Dir, StreamingDataset
from torch.utils.data import IterableDataset
from torch.utils.data.dataloader import DataLoader


class TestCombinedStreamingDataset(CombinedStreamingDataset):
3 changes: 1 addition & 2 deletions tests/streaming/test_dataloader.py
@@ -2,10 +2,9 @@

import pytest
import torch
from torch import tensor

from litdata.streaming import CombinedStreamingDataset, StreamingDataLoader
from litdata.streaming import dataloader as streaming_dataloader_module
from torch import tensor


class TestStatefulDataset:
3 changes: 1 addition & 2 deletions tests/streaming/test_dataset.py
@@ -20,8 +20,6 @@
import pytest
import torch
from lightning import seed_everything
from torch.utils.data import DataLoader

from litdata.processing import functions
from litdata.streaming import Cache
from litdata.streaming import dataset as dataset_module
@@ -39,6 +37,7 @@
from litdata.streaming.item_loader import TokensLoader
from litdata.streaming.shuffle import FullShuffle, NoShuffle
from litdata.utilities.env import _DistributedEnv, _WorkerEnv
from torch.utils.data import DataLoader


def test_streaming_dataset(tmpdir, monkeypatch):
1 change: 0 additions & 1 deletion tests/streaming/test_reader.py
@@ -3,7 +3,6 @@
from time import sleep

import numpy as np

from litdata.streaming import reader
from litdata.streaming.cache import Cache
from litdata.streaming.config import ChunkedIndex
1 change: 0 additions & 1 deletion tests/streaming/test_resolver.py
@@ -15,7 +15,6 @@
V1ListClustersResponse,
V1ListDataConnectionsResponse,
)

from litdata.streaming import resolver


1 change: 0 additions & 1 deletion tests/streaming/test_sampler.py
@@ -2,7 +2,6 @@

import pytest
from lightning import seed_everything

from litdata.streaming.sampler import CacheBatchSampler


1 change: 0 additions & 1 deletion tests/streaming/test_serializer.py
@@ -21,7 +21,6 @@
import torch
from lightning import seed_everything
from lightning_utilities.core.imports import RequirementCache

from litdata.streaming.serializers import (
_AV_AVAILABLE,
_NUMPY_DTYPES_MAPPING,
1 change: 0 additions & 1 deletion tests/streaming/test_writer.py
@@ -19,7 +19,6 @@
import pytest
from lightning import seed_everything
from lightning_utilities.core.imports import RequirementCache

from litdata.streaming.compression import _ZSTD_AVAILABLE
from litdata.streaming.reader import BinaryReader
from litdata.streaming.sampler import ChunkedIndex
1 change: 0 additions & 1 deletion tests/utilities/test_packing.py
@@ -1,5 +1,4 @@
import pytest

from litdata.utilities.packing import _pack_greedily

