ci/lint: fixing & update
Borda committed Mar 14, 2024
1 parent 4e1f9bc commit 3261e37
Showing 24 changed files with 28 additions and 70 deletions.
6 changes: 2 additions & 4 deletions .github/workflows/ci-checks.yml
@@ -11,8 +11,6 @@ concurrency:
cancel-in-progress: ${{ ! (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release/')) }}

jobs:
check-precommit:
uses: Lightning-AI/utilities/.github/workflows/check-precommit.yml@main

check-typing:
# TODO: switch to main after fix lands
@@ -21,12 +19,12 @@
actions-ref: main

check-schema:
uses: Lightning-AI/utilities/.github/workflows/check-schema.yml@main
uses: Lightning-AI/utilities/.github/workflows/check-schema.yml@v0.10.1
with:
azure-dir: ""

check-package:
uses: Lightning-AI/utilities/.github/workflows/check-package.yml@main
uses: Lightning-AI/utilities/.github/workflows/check-package.yml@v0.10.1
with:
actions-ref: v0.10.1
import-name: "litdata"
20 changes: 0 additions & 20 deletions pyproject.toml
@@ -79,29 +79,14 @@ lint.ignore-init-module-imports = true
[tool.ruff.lint.per-file-ignores]
".actions/*" = ["S101", "S310"]
"setup.py" = ["S101", "SIM115"]
"examples/**" = [
"S101", # Use of `assert` detected
"S113", # todo: Probable use of requests call without
"S104", # Possible binding to all interface
"F821", # Undefined name `...`
"S311", # Standard pseudo-random generators are not suitable for cryptographic purposes
"S501", # Probable use of `requests` call with `verify=False` disabling SSL certificate checks
"S108", # Probable insecure usage of temporary file or directory: "/tmp/data/MNIST"
]
"src/**" = [
"S101", # todo: Use of `assert` detected
"S105", "S106", "S107", # todo: Possible hardcoded password: ...
"S113", # todo: Probable use of requests call without timeout
"S301", # todo: `pickle` and modules that wrap it can be unsafe when used to deserialize untrusted data, possible security issue
"S324", # todo: Probable use of insecure hash functions in `hashlib`
"S403", # todo: `pickle`, `cPickle`, `dill`, and `shelve` modules are possibly insecure
"S404", # todo: `subprocess` module is possibly insecure
"S602", # todo: `subprocess` call with `shell=True` identified, security issue
"S603", # todo: `subprocess` call: check for execution of untrusted input
"S605", # todo: Starting a process with a shell: seems safe, but may be changed in the future; consider rewriting without `shell`
"S607", # todo: Starting a process with a partial executable path
"RET504", # todo:Unnecessary variable assignment before `return` statement
"RET503",
"S310", # todo: Audit URL open for permitted schemes. Allowing use of `file:` or custom schemes is often unexpected.
]
"tests/**" = [
@@ -118,11 +103,6 @@ lint.ignore-init-module-imports = true
"S603", # todo: `subprocess` call: check for execution of untrusted input
"S605", # todo: Starting a process with a shell: seems safe, but may be changed in the future; consider rewriting without `shell`
"S607", # todo: Starting a process with a partial executable path
"RET504", # todo:Unnecessary variable assignment before `return` statement
"PT004", # todo: Fixture `tmpdir_unittest_fixture` does not return anything, add leading underscore
"PT011", # todo: `pytest.raises(ValueError)` is too broad, set the `match` parameter or use a more specific exception
"PT012", # todo: `pytest.raises()` block should contain a single simple statement
"PT019", # todo: Fixture `_` without value is injected as parameter, use `@pytest.mark.usefixtures` instead
]

[tool.ruff.lint.mccabe]
4 changes: 1 addition & 3 deletions src/litdata/processing/functions.py
@@ -42,14 +42,12 @@
def _get_indexed_paths(data: Any) -> Dict[int, str]:
flattened_item, _ = tree_flatten(data)

indexed_paths = {
return {
index: element
for index, element in enumerate(flattened_item)
if isinstance(element, str) and os.path.exists(element)
}

return indexed_paths


def _get_input_dir(inputs: Sequence[Any]) -> Optional[str]:
indexed_paths = _get_indexed_paths(inputs[0])
5 changes: 2 additions & 3 deletions src/litdata/processing/utilities.py
@@ -95,7 +95,7 @@ def _wrapper(*args: Any, **kwargs: Any) -> Tuple[Any, Optional[Exception]]:
def make_request(
url: str,
timeout: int = 10,
user_agent_token: str = "pytorch-lightning",
user_agent_token: str = "pytorch-lightning", # noqa: S107
) -> io.BytesIO:
"""Download an image with urllib."""
user_agent_string = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0"
@@ -105,8 +105,7 @@ def make_request(
with urllib.request.urlopen(
urllib.request.Request(url, data=None, headers={"User-Agent": user_agent_string}), timeout=timeout
) as r:
img_stream = io.BytesIO(r.read())
return img_stream
return io.BytesIO(r.read())


@contextmanager
6 changes: 2 additions & 4 deletions src/litdata/streaming/dataset.py
@@ -319,7 +319,7 @@ def state_dict(self, num_samples_yielded: int, num_workers: int, batch_size: int
self._state_dict["num_samples_yielded"] = num_samples_yielded
return self._state_dict

state = {
return {
"num_samples_yielded": num_samples_yielded,
"num_workers": num_workers,
"batch_size": batch_size,
@@ -333,8 +333,6 @@ def state_dict(self, num_samples_yielded: int, num_workers: int, batch_size: int
"shuffle": self.shuffle,
}

return state

def load_state_dict(self, state_dict: Dict[str, Any]) -> None:
if state_dict:
# the state is restored within the workers
@@ -402,7 +400,7 @@ def _validate_state_dict(self) -> None:


def _try_create_cache_dir(input_dir: Optional[str]) -> Optional[str]:
hash_object = hashlib.md5((input_dir or "").encode())
hash_object = hashlib.md5((input_dir or "").encode()) # noqa: S324
if "LIGHTNING_CLUSTER_ID" not in os.environ or "LIGHTNING_CLOUD_PROJECT_ID" not in os.environ:
cache_dir = os.path.join(_DEFAULT_CACHE_DIR, hash_object.hexdigest())
os.makedirs(cache_dir, exist_ok=True)
2 changes: 1 addition & 1 deletion src/litdata/streaming/reader.py
@@ -36,7 +36,7 @@
logger = Logger(__name__)


_END_TOKEN = "END"
_END_TOKEN = "END" # noqa: S105

# Note: The timeout here should not be too short. We need to prevent the caller from aggressively
# querying the queue and consuming too many CPU cycles.
2 changes: 1 addition & 1 deletion src/litdata/streaming/serializers.py
@@ -272,7 +272,7 @@ def serialize(self, item: Any) -> Tuple[bytes, Optional[str]]:
return pickle.dumps(item), None

def deserialize(self, data: bytes) -> Any:
return pickle.loads(data)
return pickle.loads(data) # noqa: S301

def can_serialize(self, _: Any) -> bool:
return True
16 changes: 8 additions & 8 deletions src/litdata/streaming/writer.py
@@ -130,14 +130,13 @@ def rank(self) -> int:

def get_config(self) -> Dict[str, Any]:
"""Returns the config of the writer."""
out = {
return {
"compression": self._compression,
"chunk_size": self._chunk_size,
"chunk_bytes": self._chunk_bytes,
"data_format": self._data_format,
"data_spec": treespec_dumps(self._data_spec) if self._data_spec else None,
}
return out

def serialize(self, items: Any) -> Tuple[bytes, Optional[int]]:
"""Serialize a dictionary into its binary format."""
@@ -291,12 +290,13 @@ def add_item(self, index: int, items: Any) -> Optional[str]:
dim=dim,
)

if self._should_write():
filepath = os.path.join(self._cache_dir, self.get_chunk_filename())
self.write_chunk()
self._min_index = None
self._max_index = None
return filepath
if not self._should_write():
return None
filepath = os.path.join(self._cache_dir, self.get_chunk_filename())
self.write_chunk()
self._min_index = None
self._max_index = None
return filepath

def _should_write(self) -> bool:
# TODO: Misleading method name, it modifies `self._min_index` and `self._max_index`!
4 changes: 2 additions & 2 deletions src/litdata/utilities/broadcast.py
@@ -134,7 +134,7 @@ def set_and_get(self, key: str, value: Any) -> Any:

if resp.status_code != 200:
raise RuntimeError(f"Failed to broadcast the following {key=} {value=}.")
return pickle.loads(bytes(resp.json()["value"], "utf-8"))
return pickle.loads(bytes(resp.json()["value"], "utf-8")) # noqa: S301


def broadcast_object(key: str, obj: Any) -> Any:
@@ -151,7 +151,7 @@ def _get_token() -> Optional[str]:

payload = {"apiKey": os.getenv("LIGHTNING_API_KEY"), "username": os.getenv("LIGHTNING_USERNAME")}
url_login = os.getenv("LIGHTNING_CLOUD_URL", "") + "/v1/auth/login"
res = requests.post(url_login, data=json.dumps(payload))
res = requests.post(url_login, data=json.dumps(payload)) # noqa: S113
if "token" not in res.json():
raise RuntimeError(
f"You haven't properly setup your environment variables with {url_login} and data: \n{payload}"
2 changes: 1 addition & 1 deletion tests/conftest.py
@@ -3,7 +3,7 @@


@pytest.fixture(autouse=True)
def teardown_process_group():
def teardown_process_group(): # noqa: PT004
"""Ensures that the distributed process group gets closed before the next test runs."""
yield
if torch.distributed.is_available() and torch.distributed.is_initialized():
4 changes: 1 addition & 3 deletions tests/processing/test_data_processor.py
@@ -11,7 +11,6 @@
import torch
from lightning import seed_everything
from lightning_utilities.core.imports import RequirementCache

from litdata.constants import _TORCH_AUDIO_AVAILABLE, _ZSTD_AVAILABLE
from litdata.processing import data_processor as data_processor_module
from litdata.processing import functions
@@ -1077,8 +1076,7 @@ def create_synthetic_audio_bytes(index) -> dict:
torchaudio.save(f, data, 16000, format="wav")
data = f.getvalue()

data = {"content": data}
return data
return {"content": data}


@pytest.mark.skipif(condition=not _TORCH_AUDIO_AVAILABLE or not _ZSTD_AVAILABLE, reason="Requires: ['torchaudio']")
1 change: 0 additions & 1 deletion tests/processing/test_functions.py
@@ -3,7 +3,6 @@
from unittest import mock

import pytest

from litdata import walk
from litdata.processing.functions import _get_input_dir

1 change: 0 additions & 1 deletion tests/processing/test_readers.py
@@ -2,7 +2,6 @@
import sys

import pytest

from litdata import map
from litdata.processing.readers import _PYARROW_AVAILABLE, BaseReader, ParquetReader

7 changes: 3 additions & 4 deletions tests/streaming/test_cache.py
@@ -23,14 +23,13 @@
from lightning.pytorch.demos.boring_classes import RandomDataset
from lightning_utilities.core.imports import RequirementCache
from lightning_utilities.test.warning import no_warning_call
from torch.utils.data import Dataset

from litdata.streaming import Cache
from litdata.streaming.dataloader import CacheDataLoader
from litdata.streaming.dataset import StreamingDataset
from litdata.streaming.item_loader import TokensLoader
from litdata.streaming.serializers import Serializer
from litdata.utilities.env import _DistributedEnv
from torch.utils.data import Dataset

_PIL_AVAILABLE = RequirementCache("PIL")
_TORCH_VISION_AVAILABLE = RequirementCache("torchvision")
@@ -219,8 +218,8 @@ def __len__(self) -> int:
os.makedirs(os.path.join(tmpdir, "cache_2"), exist_ok=True)
dataset = RandomDatasetAtRuntime(64, 64)
dataloader = CacheDataLoader(dataset, cache_dir=os.path.join(tmpdir, "cache_2"), chunk_bytes=2 << 12)
with pytest.raises(ValueError, match="Your dataset items aren't deterministic"):
for batch in dataloader:
with pytest.raises(ValueError, match="Your dataset items aren't deterministic"): # noqa: PT012
for _ in dataloader:
pass


1 change: 0 additions & 1 deletion tests/streaming/test_client.py
@@ -3,7 +3,6 @@
from unittest import mock

import pytest

from litdata.streaming import client


5 changes: 2 additions & 3 deletions tests/streaming/test_combined.py
@@ -4,13 +4,12 @@

import pytest
import torch
from torch.utils.data import IterableDataset
from torch.utils.data.dataloader import DataLoader

from litdata.streaming.cache import Cache
from litdata.streaming.combined import CombinedStreamingDataset
from litdata.streaming.dataloader import StreamingDataLoader
from litdata.streaming.dataset import Dir, StreamingDataset
from torch.utils.data import IterableDataset
from torch.utils.data.dataloader import DataLoader


class TestCombinedStreamingDataset(CombinedStreamingDataset):
3 changes: 1 addition & 2 deletions tests/streaming/test_dataloader.py
@@ -2,10 +2,9 @@

import pytest
import torch
from torch import tensor

from litdata.streaming import CombinedStreamingDataset, StreamingDataLoader
from litdata.streaming import dataloader as streaming_dataloader_module
from torch import tensor


class TestStatefulDataset:
3 changes: 1 addition & 2 deletions tests/streaming/test_dataset.py
@@ -20,8 +20,6 @@
import pytest
import torch
from lightning import seed_everything
from torch.utils.data import DataLoader

from litdata.processing import functions
from litdata.streaming import Cache
from litdata.streaming import dataset as dataset_module
@@ -39,6 +37,7 @@
from litdata.streaming.item_loader import TokensLoader
from litdata.streaming.shuffle import FullShuffle, NoShuffle
from litdata.utilities.env import _DistributedEnv, _WorkerEnv
from torch.utils.data import DataLoader


def test_streaming_dataset(tmpdir, monkeypatch):
1 change: 0 additions & 1 deletion tests/streaming/test_reader.py
@@ -3,7 +3,6 @@
from time import sleep

import numpy as np

from litdata.streaming import reader
from litdata.streaming.cache import Cache
from litdata.streaming.config import ChunkedIndex
1 change: 0 additions & 1 deletion tests/streaming/test_resolver.py
@@ -15,7 +15,6 @@
V1ListClustersResponse,
V1ListDataConnectionsResponse,
)

from litdata.streaming import resolver


1 change: 0 additions & 1 deletion tests/streaming/test_sampler.py
@@ -2,7 +2,6 @@

import pytest
from lightning import seed_everything

from litdata.streaming.sampler import CacheBatchSampler


1 change: 0 additions & 1 deletion tests/streaming/test_serializer.py
@@ -21,7 +21,6 @@
import torch
from lightning import seed_everything
from lightning_utilities.core.imports import RequirementCache

from litdata.streaming.serializers import (
_AV_AVAILABLE,
_NUMPY_DTYPES_MAPPING,
1 change: 0 additions & 1 deletion tests/streaming/test_writer.py
@@ -19,7 +19,6 @@
import pytest
from lightning import seed_everything
from lightning_utilities.core.imports import RequirementCache

from litdata.streaming.compression import _ZSTD_AVAILABLE
from litdata.streaming.reader import BinaryReader
from litdata.streaming.sampler import ChunkedIndex
1 change: 0 additions & 1 deletion tests/utilities/test_packing.py
@@ -1,5 +1,4 @@
import pytest

from litdata.utilities.packing import _pack_greedily

