diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 286a59b5..2f260695 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -2,7 +2,7 @@ Before submitting - [ ] Was this discussed/agreed via a Github issue? (no need for typos and docs improvements) -- [ ] Did you read the [contributor guideline](https://github.com/Lightning-AI/pytorch-lightning/blob/main/.github/CONTRIBUTING.md), Pull Request section? +- [ ] Did you read the [contributor guideline](https://github.com/Lightning-AI/lit-data/blob/main/.github/CONTRIBUTING.md), Pull Request section? - [ ] Did you make sure to update the docs? - [ ] Did you write any new necessary tests? diff --git a/.github/workflows/check-typing.yml b/.github/workflows/check-typing.yml new file mode 100644 index 00000000..a48baace --- /dev/null +++ b/.github/workflows/check-typing.yml @@ -0,0 +1,82 @@ +name: Check formatting flow + +on: + workflow_call: + inputs: + actions-ref: + description: "Version of actions, normally the same as workflow" + required: true + type: string + python-version: + description: "Python version to use" + default: "3.9" + required: false + type: string + extra-typing: + description: "Package extra to be installed for type checks + include mypy" + default: "test" + required: false + type: string + +defaults: + run: + shell: bash + +jobs: + mypy: + runs-on: ubuntu-20.04 + steps: + - name: Checkout 🛎️ + uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Set up Python 🐍 ${{ inputs.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ inputs.python-version }} + + - name: Install dependencies + timeout-minutes: 20 + run: | + # don't use --upgrade to respect the version installed via setup.py + pip install -e '.[${{ inputs.extra-typing }}]' mypy \ + --extra-index-url https://download.pytorch.org/whl/cpu/torch_stable.html + pip list + + - name: Pull reusable 🤖 actions️ + uses: actions/checkout@v4 + 
with: ref: ${{ inputs.actions-ref }} path: .cicd repository: Lightning-AI/utilities - name: Print 🖨️ dependencies uses: ./.cicd/.github/actions/pip-list with: unfold: true # see: https://github.com/python/mypy/issues/10600#issuecomment-857351152 - run: yes | mypy lit_data --install-types || true - name: Check typing # mypy uses the config file found in the following order: # 1. mypy.ini # 2. pyproject.toml # 3. setup.cfg # 4. $XDG_CONFIG_HOME/mypy/config # 5. ~/.config/mypy/config # 6. ~/.mypy.ini # https://mypy.readthedocs.io/en/stable/config_file.html run: mypy - name: suggest ignores if: failure() run: | mypy --no-error-summary 2>&1 \ | tr ':' ' ' \ | awk '{print $1}' \ | sort \ | uniq \ | sed 's/\.py//g; s|src/||g; s|\/__init__||g; s|\/|\.|g' \ | xargs -I {} echo '"{}",' \ || true diff --git a/.github/workflows/ci-checks.yml b/.github/workflows/ci-checks.yml index 6d962896..6cf189ff 100644 --- a/.github/workflows/ci-checks.yml +++ b/.github/workflows/ci-checks.yml @@ -25,10 +25,10 @@ jobs: azure-dir: "" check-package: - uses: Lightning-AI/utilities/.github/workflows/check-package.yml@main + uses: ./.github/workflows/check-typing.yml with: actions-ref: main - import-name: "pl_sandbox" + import-name: "lit_data" artifact-name: dist-packages-${{ github.sha }} testing-matrix: | { @@ -39,4 +39,4 @@ jobs: check-docs: uses: Lightning-AI/utilities/.github/workflows/check-docs.yml@main with: - requirements-file: "_requirements/docs.txt" + requirements-file: "requirements/docs.txt" diff --git a/.github/workflows/ci-testing.yml b/.github/workflows/ci-testing.yml index 6d5fa02b..b5bc815d 100644 --- a/.github/workflows/ci-testing.yml +++ b/.github/workflows/ci-testing.yml @@ -41,11 +41,16 @@ jobs: - name: Set min.
dependencies if: matrix.requires == 'oldest' run: | - for fpath in ('requirements.txt', '_requirements/test.txt'): + for fpath in ('requirements.txt', 'requirements/test.txt'): req = open(fpath).read().replace('>=', '==') open(fpath, 'w').write(req) shell: python + - name: Display dependencies + run: | + cat requirements.txt + cat requirements/test.txt + - name: Get pip cache dir id: pip-cache run: echo "dir=$(pip cache dir)" >> $GITHUB_OUTPUT @@ -65,7 +70,7 @@ jobs: - name: Tests run: | - coverage run --source pl_sandbox -m pytest src tests -v + coverage run --source lit_data -m pytest tests -v - name: Statistics if: success() diff --git a/.github/workflows/docs-build.yml b/.github/workflows/docs-build.yml index 29670d7b..341eb41c 100644 --- a/.github/workflows/docs-build.yml +++ b/.github/workflows/docs-build.yml @@ -10,7 +10,7 @@ jobs: build-docs: uses: Lightning-AI/utilities/.github/workflows/check-docs.yml@main with: - requirements-file: "_requirements/docs.txt" + requirements-file: "requirements/docs.txt" # https://github.com/marketplace/actions/deploy-to-github-pages docs-deploy: diff --git a/.github/workflows/label-conflicts.yml b/.github/workflows/label-conflicts.yml deleted file mode 100644 index f15f5845..00000000 --- a/.github/workflows/label-conflicts.yml +++ /dev/null @@ -1,22 +0,0 @@ -name: Label conflicts - -on: - push: - branches: ["main"] - pull_request_target: - types: ["synchronize", "reopened", "opened"] - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} - cancel-in-progress: true - -jobs: - triage-conflicts: - runs-on: ubuntu-latest - steps: - - uses: mschilde/auto-label-merge-conflicts@591722e97f3c4142df3eca156ed0dcf2bcd362bd # Oct 25, 2021 - with: - CONFLICT_LABEL_NAME: "has conflicts" - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - MAX_RETRIES: 3 - WAIT_MS: 5000 diff --git a/.github/workflows/release-pypi.yml b/.github/workflows/release-pypi.yml index f3d6ebfe..3b33f3e3 100644 --- 
a/.github/workflows/release-pypi.yml +++ b/.github/workflows/release-pypi.yml @@ -21,8 +21,10 @@ jobs: - name: Install dependencies run: pip install -U build twine + - name: Build package run: python -m build + - name: Check package run: twine check dist/* diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ea7b710c..295c2f1f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,89 +1,113 @@ +# Copyright The Lightning AI team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ default_language_version: python: python3 ci: autofix_prs: true autoupdate_commit_msg: "[pre-commit.ci] pre-commit suggestions" - autoupdate_schedule: "monthly" + autoupdate_schedule: quarterly # submodules: true repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.4.0 + rev: v4.5.0 hooks: - id: end-of-file-fixer - id: trailing-whitespace - - id: check-case-conflict + # keep formatting in README flexible + exclude: README.md + - id: check-json - id: check-yaml - id: check-toml - - id: check-json - - id: check-added-large-files - id: check-docstring-first + - id: check-executables-have-shebangs + - id: check-case-conflict + - id: check-added-large-files + args: ["--maxkb=350", "--enforce-all"] + exclude: | + (?x)^( + docs/source-pytorch/_static/images/general/fast_2.gif| + docs/source-pytorch/_static/images/mnist_imgs/pt_to_pl.jpg| + docs/source-pytorch/_static/images/lightning_module/pt_to_pl.png| + docs/source-pytorch/_static/images/general/pl_quick_start_full_compressed.gif| + docs/source-pytorch/_static/images/general/pl_overview_flat.jpg| + docs/source-pytorch/_static/images/general/pl_overview.gif| + src/lightning/fabric/CHANGELOG.md| + src/lightning/pytorch/CHANGELOG.md + )$ - id: detect-private-key - repo: https://github.com/asottile/pyupgrade - rev: v3.9.0 + rev: v3.15.0 hooks: - id: pyupgrade args: ["--py38-plus"] name: Upgrade code - - repo: https://github.com/pre-commit/pygrep-hooks - rev: v1.10.0 - hooks: - - id: python-use-type-annotations - - repo: https://github.com/codespell-project/codespell - rev: v2.2.5 + rev: v2.2.6 hooks: - id: codespell additional_dependencies: [tomli] - #args: ["--write-changes"] + #args: ["--write-changes"] # uncomment if you want to get automatic fixing - - repo: https://github.com/pre-commit/mirrors-prettier - rev: v3.0.3 - hooks: - - id: prettier - # https://prettier.io/docs/en/options.html#print-width - args: ["--print-width=120"] - - - repo: https://github.com/myint/docformatter + - repo: 
https://github.com/PyCQA/docformatter rev: v1.7.5 hooks: - id: docformatter additional_dependencies: [tomli] args: ["--in-place"] - - repo: https://github.com/psf/black - rev: 23.7.0 + - repo: https://github.com/asottile/yesqa + rev: v1.5.0 hooks: - - id: black - name: Black code + - id: yesqa + name: Unused noqa + additional_dependencies: + #- pep8-naming + - flake8-pytest-style + - flake8-bandit + - flake8-simplify + - flake8-return + + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: "v0.2.0" + hooks: + - id: ruff + args: ["--fix", "--preview"] + - id: ruff-format + args: ["--preview"] - repo: https://github.com/executablebooks/mdformat - rev: 0.7.16 + rev: 0.7.17 hooks: - id: mdformat additional_dependencies: - mdformat-gfm - - mdformat-black + #- mdformat-black - mdformat_frontmatter - exclude: CHANGELOG.md - - - repo: https://github.com/asottile/yesqa - rev: v1.5.0 - hooks: - - id: yesqa - additional_dependencies: - - pep8-naming - - pydocstyle - #- flake8-comprehensions - #- flake8-pytest-style - #- flake8-return - #- flake8-simplify + exclude: | + (?x)^( + src/data/CHANGELOG.md| + README.md + )$ - - repo: https://github.com/charliermarsh/ruff-pre-commit - rev: v0.0.277 + - repo: https://github.com/pre-commit/mirrors-prettier + rev: v3.1.0 hooks: - - id: ruff - args: ["--fix"] + - id: prettier + # https://prettier.io/docs/en/options.html#print-width + args: ["--print-width=120"] diff --git a/.readthedocs.yml b/.readthedocs.yml index f6bb36d5..e67b1544 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -17,4 +17,4 @@ formats: all python: version: 3.7 install: - - requirements: _requirements/docs.txt + - requirements: requirements/docs.txt diff --git a/CHANGELOG.md b/CHANGELOG.md index 28581466..0b6fbf91 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,7 @@ All notable changes to this project will be documented in this file. 
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [Unreleased] - YYYY-MM-DD +## \[Unreleased\] - YYYY-MM-DD ### Added diff --git a/MANIFEST.in b/MANIFEST.in index 4cd9bf95..cf416353 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -5,7 +5,7 @@ recursive-exclude __pycache__ *.py[cod] *.orig # Include the README and CHANGELOG include *.md -recursive-include src *.md +recursive-include lit_data *.md # Include the license file include LICENSE @@ -28,7 +28,7 @@ exclude docs # Include the Requirements include requirements.txt -recursive-include _requirements *.txt +recursive-include requirements *.txt # Exclude Makefile exclude Makefile diff --git a/Makefile b/Makefile index 8d1105ed..ee079558 100644 --- a/Makefile +++ b/Makefile @@ -7,14 +7,14 @@ export SPHINX_MOCK_REQUIREMENTS=0 test: clean pip install -q -r requirements.txt - pip install -q -r _requirements/test.txt + pip install -q -r requirements/test.txt # use this to run tests - python -m coverage run --source pl_sandbox -m pytest src tests -v --flake8 + python -m coverage run --source lit_data -m pytest tests -v --flake8 python -m coverage report docs: clean - pip install . --quiet -r _requirements/docs.txt + pip install . --quiet -r requirements/docs.txt python -m sphinx -b html -W --keep-going docs/source docs/build clean: diff --git a/README.md b/README.md index 5e7e9aa0..e59488be 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@
-Lightning +Lightning

@@ -32,7 +32,7 @@ Lightning Data can be installed with `pip`: ```bash -pip install --no-cache-dir git+https://github.com/Lightning-AI/pytorch-lightning.git@master +pip install --no-cache-dir git+https://github.com/Lightning-AI/lit-data.git@master ``` ## 🏁 Quick Start @@ -45,7 +45,7 @@ Convert your raw dataset into Lightning Streaming format using the `optimize` op ```python import numpy as np -from lightning.data import optimize +from lit_data import optimize from PIL import Image @@ -84,7 +84,7 @@ Here is an example with [AWS S3](https://aws.amazon.com/s3). ### 3. Use StreamingDataset and DataLoader ```python -from lightning.data import StreamingDataset +from lit_data import StreamingDataset from torch.utils.data import DataLoader # Remote path where full dataset is persistently stored @@ -135,7 +135,7 @@ for i in range(1000): ```python import os -from lightning.data import map +from lit_data import map from PIL import Image input_dir = "s3://my-bucket/my_images" @@ -174,7 +174,7 @@ We have end-to-end free [Studios](https://lightning.ai) showing all the steps to To scale data processing, create a free account on [lightning.ai](https://lightning.ai/) platform. With the platform, the `optimize` and `map` can start multiple machines to make data processing drastically faster as follows: ```python -from lightning.data import optimize, Machine +from lit_data import optimize, Machine optimize( ... @@ -186,7 +186,7 @@ optimize( OR ```python -from lightning.data import map, Machine +from lit_data import map, Machine map( ... @@ -216,8 +216,8 @@ The `StreamingDataset` and `StreamingDataLoader` takes care of everything for yo You can easily experiment with dataset mixtures using the CombinedStreamingDataset. 
```python -from lightning.data import StreamingDataset, CombinedStreamingDataset -from lightning.data.streaming.item_loader import TokensLoader +from lit_data import StreamingDataset, CombinedStreamingDataset +from lit_data.streaming.item_loader import TokensLoader from tqdm import tqdm import os from torch.utils.data import DataLoader @@ -257,7 +257,7 @@ Note: The `StreamingDataLoader` is used by [Lit-GPT](https://github.com/Lightnin ```python import os import torch -from lightning.data import StreamingDataset, StreamingDataLoader +from lit_data import StreamingDataset, StreamingDataLoader dataset = StreamingDataset("s3://my-bucket/my-data", shuffle=True) dataloader = StreamingDataLoader(dataset, num_workers=os.cpu_count(), batch_size=64) @@ -280,7 +280,7 @@ for batch_idx, batch in enumerate(dataloader): The `StreamingDataLoader` supports profiling your data loading. Simply use the `profile_batches` argument as follows: ```python -from lightning.data import StreamingDataset, StreamingDataLoader +from lit_data import StreamingDataset, StreamingDataLoader StreamingDataLoader(..., profile_batches=5) ``` @@ -292,7 +292,7 @@ This generates a Chrome trace called `result.json`. You can visualize this trace Access the data you need when you need it. ```python -from lightning.data import StreamingDataset +from lit_data import StreamingDataset dataset = StreamingDataset(...) @@ -304,7 +304,7 @@ print(dataset[42]) # show the 42th element of the dataset ## ✢ Use data transforms ```python -from lightning.data import StreamingDataset, StreamingDataLoader +from lit_data import StreamingDataset, StreamingDataLoader import torchvision.transforms.v2.functional as F class ImagenetStreamingDataset(StreamingDataset): @@ -326,7 +326,7 @@ for batch in dataloader: Limit the size of the cache holding the chunks. 
```python -from lightning.data import StreamingDataset +from lit_data import StreamingDataset dataset = StreamingDataset(..., max_cache_size="10GB") ``` @@ -338,7 +338,7 @@ When processing large files like compressed [parquet files](https://en.wikipedia ```python from pathlib import Path import pyarrow.parquet as pq -from lightning.data import optimize +from lit_data import optimize from tokenizer import Tokenizer from functools import partial diff --git a/_requirements/test.txt b/_requirements/test.txt deleted file mode 100644 index c3fbc921..00000000 --- a/_requirements/test.txt +++ /dev/null @@ -1,4 +0,0 @@ -coverage>=5.0 -pytest>=6.0 -pytest-cov -mypy==1.8.0 diff --git a/docs/source/_templates/theme_variables.jinja b/docs/source/_templates/theme_variables.jinja index 5d092539..28f05a5e 100644 --- a/docs/source/_templates/theme_variables.jinja +++ b/docs/source/_templates/theme_variables.jinja @@ -1,18 +1,18 @@ {%- set external_urls = { - 'github': 'https://github.com/Lightning-AI/lightning-Sandbox', - 'github_issues': 'https://github.com/Lightning-AI/lightning-Sandbox/issues', + 'github': 'https://github.com/Lightning-AI/lit-data', + 'github_issues': 'https://github.com/Lightning-AI/lit-data/issues', 'contributing': 'https://github.com/Lightning-AI/lightning/blob/master/CONTRIBUTING.md', 'governance': 'https://github.com/Lightning-AI/lightning/blob/master/governance.md', - 'docs': 'https://lightning-ai.github.io/lightning-Sandbox/', + 'docs': 'https://lightning-ai.github.io/lit-data/', 'twitter': 'https://twitter.com/LightningAI', 'discuss': 'https://discord.com/invite/tfXFetEZxv', 'tutorials': 'https://lightning.ai', - 'previous_pytorch_versions': 'https://lightning-ai.github.io/lightning-Sandbox/', - 'home': 'https://lightning-ai.github.io/lightning-Sandbox/', + 'previous_pytorch_versions': 'https://lightning-ai.github.io/lit-data/', + 'home': 'https://lightning-ai.github.io/lit-data/', 'get_started': 'https://lightning.ai', - 'features': 
'https://lightning-ai.github.io/lightning-Sandbox/', + 'features': 'https://lightning-ai.github.io/lit-data/', 'blog': 'https://www.Lightning.ai/blog', 'resources': 'https://lightning.ai', - 'support': 'https://lightning-ai.github.io/lightning-Sandbox/', + 'support': 'https://lightning-ai.github.io/lit-data/', } -%} diff --git a/docs/source/conf.py b/docs/source/conf.py index 0651f76b..a6da23d1 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -16,20 +16,19 @@ _PATH_HERE = os.path.abspath(os.path.dirname(__file__)) _PATH_ROOT = os.path.realpath(os.path.join(_PATH_HERE, "..", "..")) -_PATH_SOURCE = os.path.join(_PATH_ROOT, "src") sys.path.insert(0, os.path.abspath(_PATH_ROOT)) SPHINX_MOCK_REQUIREMENTS = int(os.environ.get("SPHINX_MOCK_REQUIREMENTS", True)) # alternative https://stackoverflow.com/a/67692/4521646 -spec = spec_from_file_location("pl_sandbox/__about__.py", os.path.join(_PATH_SOURCE, "pl_sandbox", "__about__.py")) +spec = spec_from_file_location("lit_data/__about__.py", os.path.join(_PATH_ROOT, "lit_data", "__about__.py")) about = module_from_spec(spec) spec.loader.exec_module(about) # -- Project information ----------------------------------------------------- # this name shall match the project name in Github as it is used for linking to code -project = "Lightning-Sandbox" +project = "lit-data" copyright = about.__copyright__ author = about.__author__ @@ -317,8 +316,8 @@ def find_source(): fname = inspect.getsourcefile(obj) # https://github.com/rtfd/readthedocs.org/issues/5735 if any(s in fname for s in ("readthedocs", "rtfd", "checkouts")): - # /home/docs/checkouts/readthedocs.org/user_builds/pytorch_lightning/checkouts/ - # devel/pytorch_lightning/utilities/cls_experiment.py#L26-L176 + # /home/docs/checkouts/readthedocs.org/user_builds/lit_data/checkouts/ + # devel/lit_data/utilities/cls_experiment.py#L26-L176 path_top = os.path.abspath(os.path.join("..", "..", "..")) fname = os.path.relpath(fname, start=path_top) else: @@ -381,8 +380,8 
@@ def find_source(): import os import torch -import pytorch_lightning as pl -from pytorch_lightning import Trainer, LightningModule +import lit_data +from lit_data import StreamingDataset """ coverage_skip_undoc_in_source = True diff --git a/docs/source/index.rst b/docs/source/index.rst index 2899f863..d5a27347 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -3,7 +3,7 @@ You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. -Lightning-Sandbox +lit-data ================= .. toctree:: diff --git a/lit_data/CHANGELOG.md b/lit_data/CHANGELOG.md new file mode 100644 index 00000000..e3655c93 --- /dev/null +++ b/lit_data/CHANGELOG.md @@ -0,0 +1,37 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). + +## \[unReleased\] - 2024-MM-DD + +### Added + +- +- +- + +### Changed + +- +- +- + +### Deprecated + +- +- +- + +### Removed + +- +- +- + +### Fixed + +- +- +- diff --git a/lit_data/__about__.py b/lit_data/__about__.py new file mode 100644 index 00000000..fdf610bb --- /dev/null +++ b/lit_data/__about__.py @@ -0,0 +1,43 @@ +# Copyright The Lightning AI team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import time + +__author__ = "Lightning AI et al." 
+__author_email__ = "pytorch@lightning.ai" +__license__ = "Apache-2.0" +__copyright__ = f"Copyright (c) 2018-{time.strftime('%Y')}, {__author__}." +__homepage__ = "https://github.com/Lightning-AI/lit-data" +__docs_url__ = "https://lightning.ai/docs/pytorch/stable/" +# this has to be simple string, see: https://github.com/pypa/twine/issues/522 +__docs__ = "The Deep Learning framework to train, deploy, and ship AI products Lightning fast." +__long_doc__ = """ +What is it? +----------- + +TBD @eden +""" # TODO + +__all__ = [ + "__author__", + "__author_email__", + "__copyright__", + "__docs__", + "__docs_url__", + "__homepage__", + "__license__", + "__version__", +] + +__version__ = "0.0.1" diff --git a/src/__init__.py b/lit_data/__init__.py similarity index 100% rename from src/__init__.py rename to lit_data/__init__.py diff --git a/src/constants.py b/lit_data/constants.py similarity index 100% rename from src/constants.py rename to lit_data/constants.py diff --git a/src/processing/__init__.py b/lit_data/processing/__init__.py similarity index 100% rename from src/processing/__init__.py rename to lit_data/processing/__init__.py diff --git a/src/processing/data_processor.py b/lit_data/processing/data_processor.py similarity index 99% rename from src/processing/data_processor.py rename to lit_data/processing/data_processor.py index 33602799..52afaea5 100644 --- a/src/processing/data_processor.py +++ b/lit_data/processing/data_processor.py @@ -17,9 +17,9 @@ from urllib import parse import numpy as np +from lightning import seed_everything from tqdm.auto import tqdm as _tqdm -from lightning import seed_everything from lit_data.constants import ( _BOTO3_AVAILABLE, _DEFAULT_FAST_DEV_RUN_ITEMS, @@ -427,7 +427,7 @@ def _loop(self) -> None: uploader.join() if self.remove: - assert self.remover + assert self.remover # noqa: S101 self.remove_queue.put(None) self.remover.join() @@ -487,7 +487,7 @@ def _try_upload(self, data: Optional[Union[str, Tuple[str, str]]]) -> None: if 
isinstance(data, str): assert os.path.exists(data), data else: - assert os.path.exists(data[-1]), data + assert os.path.exists(data[-1]), data # noqa: S101 self.to_upload_queues[self._counter % self.num_uploaders].put(data) @@ -772,7 +772,7 @@ def _upload_index(self, output_dir: Dir, cache_dir: str, num_nodes: int, node_ra # Get the index file locally for node_rank in range(num_nodes - 1): output_dir_path = output_dir.url if output_dir.url else output_dir.path - assert output_dir_path + assert output_dir_path # noqa: S101 remote_filepath = os.path.join(output_dir_path, f"{node_rank}-{_INDEX_FILENAME}") node_index_filepath = os.path.join(cache_dir, os.path.basename(remote_filepath)) if obj.scheme == "s3": @@ -941,7 +941,7 @@ def run(self, data_recipe: DataRecipe) -> None: error = self.error_queue.get(timeout=0.001) self._exit_on_error(error) except Empty: - assert self.progress_queue + assert self.progress_queue # noqa: S101 try: index, counter = self.progress_queue.get(timeout=0.001) except Empty: diff --git a/src/processing/functions.py b/lit_data/processing/functions.py similarity index 99% rename from src/processing/functions.py rename to lit_data/processing/functions.py index 83b632cf..afa516e6 100644 --- a/src/processing/functions.py +++ b/lit_data/processing/functions.py @@ -41,14 +41,12 @@ def _get_indexed_paths(data: Any) -> Dict[int, str]: flattened_item, _ = tree_flatten(data) - indexed_paths = { + return { index: element for index, element in enumerate(flattened_item) if isinstance(element, str) and os.path.exists(element) } - return indexed_paths - def _get_input_dir(inputs: Sequence[Any]) -> Optional[str]: indexed_paths = _get_indexed_paths(inputs[0]) diff --git a/src/processing/readers.py b/lit_data/processing/readers.py similarity index 100% rename from src/processing/readers.py rename to lit_data/processing/readers.py diff --git a/src/processing/utilities.py b/lit_data/processing/utilities.py similarity index 95% rename from 
src/processing/utilities.py rename to lit_data/processing/utilities.py index 051e25dd..1bdf9c71 100644 --- a/src/processing/utilities.py +++ b/lit_data/processing/utilities.py @@ -2,7 +2,7 @@ import os import urllib from contextlib import contextmanager -from subprocess import Popen +from subprocess import Popen # noqa: S404 from typing import Any, Callable, Optional, Tuple from lit_data.constants import _IS_IN_STUDIO @@ -29,12 +29,12 @@ def _wrapper(*args: Any, **kwargs: Any) -> Tuple[Any, Optional[Exception]]: def make_request( url: str, timeout: int = 10, - user_agent_token: str = "pytorch-lightning", + user_agent_token: str = "lit-data", ) -> io.BytesIO: """Download an image with urllib.""" user_agent_string = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0" if user_agent_token: - user_agent_string += f" (compatible; {user_agent_token}; +https://github.com/Lightning-AI/pytorch-lightning)" + user_agent_string += f" (compatible; {user_agent_token}; +https://github.com/Lightning-AI/lit-data)" with urllib.request.urlopen( # noqa: S310 urllib.request.Request(url, data=None, headers={"User-Agent": user_agent_string}), timeout=timeout diff --git a/src/streaming/__init__.py b/lit_data/streaming/__init__.py similarity index 100% rename from src/streaming/__init__.py rename to lit_data/streaming/__init__.py diff --git a/src/streaming/cache.py b/lit_data/streaming/cache.py similarity index 100% rename from src/streaming/cache.py rename to lit_data/streaming/cache.py diff --git a/src/streaming/client.py b/lit_data/streaming/client.py similarity index 100% rename from src/streaming/client.py rename to lit_data/streaming/client.py diff --git a/src/streaming/combined.py b/lit_data/streaming/combined.py similarity index 100% rename from src/streaming/combined.py rename to lit_data/streaming/combined.py diff --git a/src/streaming/compression.py b/lit_data/streaming/compression.py similarity index 100% rename from src/streaming/compression.py rename 
to lit_data/streaming/compression.py diff --git a/src/streaming/config.py b/lit_data/streaming/config.py similarity index 100% rename from src/streaming/config.py rename to lit_data/streaming/config.py diff --git a/src/streaming/dataloader.py b/lit_data/streaming/dataloader.py similarity index 100% rename from src/streaming/dataloader.py rename to lit_data/streaming/dataloader.py diff --git a/src/streaming/dataset.py b/lit_data/streaming/dataset.py similarity index 100% rename from src/streaming/dataset.py rename to lit_data/streaming/dataset.py diff --git a/src/streaming/downloader.py b/lit_data/streaming/downloader.py similarity index 100% rename from src/streaming/downloader.py rename to lit_data/streaming/downloader.py diff --git a/src/streaming/item_loader.py b/lit_data/streaming/item_loader.py similarity index 100% rename from src/streaming/item_loader.py rename to lit_data/streaming/item_loader.py diff --git a/src/streaming/reader.py b/lit_data/streaming/reader.py similarity index 100% rename from src/streaming/reader.py rename to lit_data/streaming/reader.py diff --git a/src/streaming/resolver.py b/lit_data/streaming/resolver.py similarity index 100% rename from src/streaming/resolver.py rename to lit_data/streaming/resolver.py diff --git a/src/streaming/sampler.py b/lit_data/streaming/sampler.py similarity index 100% rename from src/streaming/sampler.py rename to lit_data/streaming/sampler.py diff --git a/src/streaming/serializers.py b/lit_data/streaming/serializers.py similarity index 100% rename from src/streaming/serializers.py rename to lit_data/streaming/serializers.py diff --git a/src/streaming/shuffle.py b/lit_data/streaming/shuffle.py similarity index 100% rename from src/streaming/shuffle.py rename to lit_data/streaming/shuffle.py diff --git a/src/streaming/writer.py b/lit_data/streaming/writer.py similarity index 100% rename from src/streaming/writer.py rename to lit_data/streaming/writer.py diff --git a/src/utilities/__init__.py 
b/lit_data/utilities/__init__.py similarity index 100% rename from src/utilities/__init__.py rename to lit_data/utilities/__init__.py diff --git a/src/utilities/broadcast.py b/lit_data/utilities/broadcast.py similarity index 100% rename from src/utilities/broadcast.py rename to lit_data/utilities/broadcast.py diff --git a/src/utilities/env.py b/lit_data/utilities/env.py similarity index 100% rename from src/utilities/env.py rename to lit_data/utilities/env.py diff --git a/src/utilities/format.py b/lit_data/utilities/format.py similarity index 100% rename from src/utilities/format.py rename to lit_data/utilities/format.py diff --git a/src/utilities/packing.py b/lit_data/utilities/packing.py similarity index 100% rename from src/utilities/packing.py rename to lit_data/utilities/packing.py diff --git a/src/utilities/shuffle.py b/lit_data/utilities/shuffle.py similarity index 100% rename from src/utilities/shuffle.py rename to lit_data/utilities/shuffle.py diff --git a/pyproject.toml b/pyproject.toml index d3a55d64..fa123ec0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,21 @@ +# Copyright The Lightning AI team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + [metadata] -license_file = "LICENSE" -description-file = "README.md" +name = "lit_data" +author = "Lightning-AI et al." 
+url = "https://github.com/Lightning-AI/lit-data" [build-system] requires = [ @@ -9,125 +24,172 @@ requires = [ ] -[tool.check-manifest] -ignore = [ - "*.yml", - ".github", - ".github/*" -] - - -[tool.pytest.ini_options] -norecursedirs = [ - ".git", - ".github", - "dist", - "build", - "docs", -] -addopts = [ - "--strict-markers", - "--doctest-modules", - "--color=yes", - "--disable-pytest-warnings", -] -filterwarnings = [ - "error::FutureWarning", -] -xfail_strict = true -junit_duration_report = "call" - -[tool.coverage.report] -exclude_lines = [ - "pragma: no cover", - "pass", -] - -[tool.codespell] -#skip = '*.py' -quiet-level = 3 -# comma separated list of words; waiting for: -# https://github.com/codespell-project/codespell/issues/2839#issuecomment-1731601603 -# also adding links until they ignored by its: nature -# https://github.com/codespell-project/codespell/issues/2243#issuecomment-1732019960 -#ignore-words-list = "" - - [tool.black] -# https://github.com/psf/black line-length = 120 -exclude = "(.eggs|.git|.hg|.mypy_cache|.venv|_build|buck-out|build|dist)" +exclude = '(_notebooks/.*)' [tool.docformatter] recursive = true -wrap-summaries = 120 +# this need to be shorter as some docstings are r"""... 
+wrap-summaries = 119 wrap-descriptions = 120 blank = true - -[tool.mypy] -files = [ - "src", -] -install_types = true -non_interactive = true -disallow_untyped_defs = true -ignore_missing_imports = true -show_error_codes = true -warn_redundant_casts = true -warn_unused_configs = true -warn_unused_ignores = true -allow_redefinition = true -# disable this rule as the Trainer attributes are defined in the connectors, not in its __init__ -disable_error_code = "attr-defined" -# style choices -warn_no_return = false +[tool.codespell] +# Todo: enable also python files in a next step +skip = '*.py' +quiet-level = 3 +# comma separated list of words; waiting for: +# https://github.com/codespell-project/codespell/issues/2839#issuecomment-1731601603 +# also adding links until they ignored by its: nature +# https://github.com/codespell-project/codespell/issues/2243#issuecomment-1732019960 +ignore-words-list = "te, compiletime" [tool.ruff] line-length = 120 # Enable Pyflakes `E` and `F` codes by default. -select = [ +lint.select = [ "E", "W", # see: https://pypi.org/project/pycodestyle "F", # see: https://pypi.org/project/pyflakes - "D", # see: https://pypi.org/project/pydocstyle - "N", # see: https://pypi.org/project/pep8-naming + "S", # see: https://pypi.org/project/flake8-bandit + "RUF018", # see: https://docs.astral.sh/ruff/rules/assignment-in-assert ] -extend-select = [ +lint.extend-select = [ + "I", # see: isort "C4", # see: https://pypi.org/project/flake8-comprehensions - "PT", # see: https://pypi.org/project/flake8-pytest-style - "RET", # see: https://pypi.org/project/flake8-return "SIM", # see: https://pypi.org/project/flake8-simplify + "RET", # see: https://pypi.org/project/flake8-return + "PT", # see: https://pypi.org/project/flake8-pytest-style ] -ignore = [ +lint.ignore = [ "E731", # Do not assign a lambda expression, use a def + "S108", + "E203", # conflicts with black ] # Exclude a variety of commonly ignored directories. 
exclude = [ - ".eggs", ".git", - ".mypy_cache", - ".ruff_cache", - "__pypackages__", - "_build", - "build", - "dist", - "docs" + "docs", + "_notebooks" ] -ignore-init-module-imports = true +lint.ignore-init-module-imports = true + +[tool.ruff.lint.per-file-ignores] +".actions/*" = ["S101", "S310"] +"setup.py" = ["S101", "SIM115"] +"examples/**" = [ + "S101", # Use of `assert` detected + "S113", # todo: Probable use of requests call without + "S104", # Possible binding to all interface + "F821", # Undefined name `...` + "S311", # Standard pseudo-random generators are not suitable for cryptographic purposes + "S501", # Probable use of `requests` call with `verify=False` disabling SSL certificate checks + "S108", # Probable insecure usage of temporary file or directory: "/tmp/data/MNIST" +] +"lit_data/**" = [ + "S101", # todo: Use of `assert` detected + "S105", "S106", "S107", # todo: Possible hardcoded password: ... + "S113", # todo: Probable use of requests call without timeout + "S301", # todo: `pickle` and modules that wrap it can be unsafe when used to deserialize untrusted data, possible security issue + "S324", # todo: Probable use of insecure hash functions in `hashlib` + "S403", # todo: `pickle`, `cPickle`, `dill`, and `shelve` modules are possibly insecure + "S404", # todo: `subprocess` module is possibly insecure + "S602", # todo: `subprocess` call with `shell=True` identified, security issue + "S603", # todo: `subprocess` call: check for execution of untrusted input + "S605", # todo: Starting a process with a shell: seems safe, but may be changed in the future; consider rewriting without `shell` + "S607", # todo: Starting a process with a partial executable path + "RET504", # todo:Unnecessary variable assignment before `return` statement + "RET503", + "S310", # todo: Audit URL open for permitted schemes. Allowing use of `file:` or custom schemes is often unexpected. 
+] +"tests/**" = [ + "S101", # Use of `assert` detected + "S105", "S106", # todo: Possible hardcoded password: ... + "S301", # `pickle` and modules that wrap it can be unsafe when used to deserialize untrusted data, possible security issue + "S113", # todo: Probable use of requests call without timeout + "S311", # todo: Standard pseudo-random generators are not suitable for cryptographic purposes + "S108", # todo: Probable insecure usage of temporary file or directory: "/tmp/sys-customizations-sync" + "S202", # Uses of `tarfile.extractall()` + "S403", # `pickle`, `cPickle`, `dill`, and `shelve` modules are possibly insecure + "S404", # `subprocess` module is possibly insecure + "S602", # todo: `subprocess` call with `shell=True` identified, security issue + "S603", # todo: `subprocess` call: check for execution of untrusted input + "S605", # todo: Starting a process with a shell: seems safe, but may be changed in the future; consider rewriting without `shell` + "S607", # todo: Starting a process with a partial executable path + "RET504", # todo:Unnecessary variable assignment before `return` statement + "PT004", # todo: Fixture `tmpdir_unittest_fixture` does not return anything, add leading underscore + "PT011", # todo: `pytest.raises(ValueError)` is too broad, set the `match` parameter or use a more specific exception + "PT012", # todo: `pytest.raises()` block should contain a single simple statement + "PT019", # todo: Fixture `_` without value is injected as parameter, use `@pytest.mark.usefixtures` instead +] + +[tool.ruff.lint.mccabe] +# Unlike Flake8, default to a complexity level of 10. +max-complexity = 10 -[tool.ruff.per-file-ignores] -"setup.py" = ["D100", "SIM115"] -"__about__.py" = ["D100"] -"__init__.py" = ["D100"] -[tool.ruff.pydocstyle] -# Use Google-style docstrings. 
-convention = "google" +[tool.mypy] +files = [ + "src/lightning", +] +# This section is for folders with "-" as they are not valid python modules +exclude = [ +] +install_types = "True" +non_interactive = "True" +disallow_untyped_defs = "True" +ignore_missing_imports = "True" +show_error_codes = "True" +warn_redundant_casts = "True" +warn_unused_configs = "True" +warn_unused_ignores = "True" +allow_redefinition = "True" +# disable this rule as the Trainer attributes are defined in the connectors, not in its __init__ +disable_error_code = "attr-defined" +# style choices +warn_no_return = "False" + +# Ignore mypy errors for these files +# TODO: the goal is for this to be empty +[[tool.mypy.overrides]] +# the list can be generated with: +# mypy --no-error-summary 2>&1 | tr ':' ' ' | awk '{print $1}' | sort | uniq | sed 's/\.py//g; s|src/||g; s|\/|\.|g' | xargs -I {} echo '"{}",' +module = [ +] +ignore_errors = "True" -#[tool.ruff.pycodestyle] -#ignore-overlong-task-comments = true -[tool.ruff.mccabe] -# Unlike Flake8, default to a complexity level of 10. 
-max-complexity = 10 +[tool.coverage.report] +exclude_lines = [ + "pragma: no cover", + "warnings", + "pass", + "rank_zero_warn", + "raise NotImplementedError", +] + + +[tool.pytest.ini_options] +norecursedirs = [ + ".git", + ".github", + "dist", + "build", + "docs", +] +addopts = [ + "--strict-markers", + "--doctest-modules", + "--color=yes", + "--disable-pytest-warnings", + "--ignore=legacy/checkpoints", +] +markers = [ + "cloud: Run the cloud tests for example", +] +filterwarnings = [ + "error::FutureWarning", +] +xfail_strict = true +junit_duration_report = "call" diff --git a/requirements.txt b/requirements.txt index 9df9b07b..efc63699 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,9 @@ -torch >=2.0.0 -lightning >=2.0.0 +lightning-utilities >=0.8.0, <0.10.0 +lightning-cloud == 0.5.64 # Must be pinned to ensure compatibility +# to be able to include also PL 2.0 and preserve `>` needed for CI min version bypass +torch >=2.1.0, <=2.2.0 +filelock +tqdm +numpy +torchvision +pillow diff --git a/_requirements/docs.txt b/requirements/docs.txt similarity index 100% rename from _requirements/docs.txt rename to requirements/docs.txt diff --git a/requirements/test.txt b/requirements/test.txt new file mode 100644 index 00000000..90b502be --- /dev/null +++ b/requirements/test.txt @@ -0,0 +1,12 @@ +coverage ==7.3.1 +pytest ==7.4.0 +pytest-cov ==4.1.0 +pytest-timeout ==2.1.0 +pytest-rerunfailures ==12.0 +pytest-random-order ==1.1.0 +viztracer +pandas +pyarrow +pillow +lightning +mypy diff --git a/setup.py b/setup.py old mode 100755 new mode 100644 index fcadb15d..e8e19bc7 --- a/setup.py +++ b/setup.py @@ -8,12 +8,11 @@ from setuptools import find_packages, setup _PATH_ROOT = os.path.dirname(__file__) -_PATH_SOURCE = os.path.join(_PATH_ROOT, "src") -_PATH_REQUIRES = os.path.join(_PATH_ROOT, "_requirements") +_PATH_REQUIRES = os.path.join(_PATH_ROOT, "requirements") -def _load_py_module(fname, pkg="pl_sandbox"): - spec = 
spec_from_file_location(os.path.join(pkg, fname), os.path.join(_PATH_SOURCE, pkg, fname)) +def _load_py_module(fname, pkg="lit_data"): + spec = spec_from_file_location(os.path.join(pkg, fname), os.path.join(_PATH_ROOT, pkg, fname)) py = module_from_spec(spec) spec.loader.exec_module(py) return py @@ -29,10 +28,10 @@ def _load_requirements(path_dir: str = _PATH_ROOT, file_name: str = "requirement readme = fopen.read() -def _prepare_extras(requirements_dir: str = _PATH_REQUIRES, skip_files: tuple = ("devel.txt", "docs.txt")) -> dict: +def _prepare_extras(requirements_dir: str = _PATH_REQUIRES, skip_files: tuple = ()) -> dict: # https://setuptools.readthedocs.io/en/latest/setuptools.html#declaring-extras # Define package extras. These are only installed if you specify them. - # From remote, use like `pip install pytorch-lightning[dev, docs]` + # From remote, use like `pip install lit-data[dev, docs]` # From local copy of repo, use like `pip install ".[dev, docs]"` req_files = [Path(p) for p in glob.glob(os.path.join(requirements_dir, "*.txt"))] extras = { @@ -52,16 +51,16 @@ def _prepare_extras(requirements_dir: str = _PATH_REQUIRES, skip_files: tuple = # the goal of the project is simplicity for researchers, don't want to add too much # engineer specific practices setup( - name="lightning-sandbox", + name="lit_data", version=about.__version__, description=about.__docs__, author=about.__author__, author_email=about.__author_email__, url=about.__homepage__, - download_url="https://github.com/Lightning-AI/lightning-sandbox", + download_url="https://github.com/Lightning-AI/lit-data", license=about.__license__, - packages=find_packages(where="src"), - package_dir={"": "src"}, + packages=find_packages(where="lit_data"), + package_dir={"": "lit_data"}, long_description=readme, long_description_content_type="text/markdown", include_package_data=True, @@ -72,9 +71,9 @@ def _prepare_extras(requirements_dir: str = _PATH_REQUIRES, skip_files: tuple = 
install_requires=_load_requirements(), extras_require=_prepare_extras(), project_urls={ - "Bug Tracker": "https://github.com/Lightning-AI/lightning-sandbox/issues", - "Documentation": "https://lightning-ai.github.io/lightning-sandbox/", - "Source Code": "https://github.com/Lightning-AI/lightning-sandbox", + "Bug Tracker": "https://github.com/Lightning-AI/lit-data/issues", + "Documentation": "https://lightning-ai.github.io/lit-data/", + "Source Code": "https://github.com/Lightning-AI/lit-data", }, classifiers=[ "Environment :: Console", @@ -92,9 +91,8 @@ def _prepare_extras(requirements_dir: str = _PATH_REQUIRES, skip_files: tuple = # Specify the Python versions you support here. In particular, ensure # that you indicate whether you support Python 2, Python 3 or both. "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", ], ) diff --git a/tests/processing/test_data_processor.py b/tests/processing/test_data_processor.py index c410893a..0dcf3eff 100644 --- a/tests/processing/test_data_processor.py +++ b/tests/processing/test_data_processor.py @@ -10,6 +10,8 @@ import pytest import torch from lightning import seed_everything +from lightning_utilities.core.imports import RequirementCache + from lit_data.processing import data_processor as data_processor_module from lit_data.processing import functions from lit_data.processing.data_processor import ( @@ -28,7 +30,6 @@ from lit_data.processing.functions import LambdaDataTransformRecipe, map, optimize from lit_data.streaming import resolver from lit_data.streaming.cache import Cache, Dir -from lightning_utilities.core.imports import RequirementCache _PIL_AVAILABLE = RequirementCache("PIL") diff --git a/tests/processing/test_functions.py b/tests/processing/test_functions.py index edc041dc..b8d927ca 100644 --- 
a/tests/processing/test_functions.py +++ b/tests/processing/test_functions.py @@ -3,6 +3,7 @@ from unittest import mock import pytest + from lit_data import walk from lit_data.processing.functions import _get_input_dir diff --git a/tests/processing/test_readers.py b/tests/processing/test_readers.py index cf1d2ae4..f7e5adfd 100644 --- a/tests/processing/test_readers.py +++ b/tests/processing/test_readers.py @@ -2,6 +2,7 @@ import sys import pytest + from lit_data import map from lit_data.processing.readers import _PYARROW_AVAILABLE, BaseReader, ParquetReader diff --git a/tests/streaming/test_cache.py b/tests/streaming/test_cache.py index b8daece1..4ce8472f 100644 --- a/tests/streaming/test_cache.py +++ b/tests/streaming/test_cache.py @@ -19,17 +19,18 @@ import pytest import torch from lightning import seed_everything +from lightning.fabric import Fabric +from lightning.pytorch.demos.boring_classes import RandomDataset +from lightning_utilities.core.imports import RequirementCache +from lightning_utilities.test.warning import no_warning_call +from torch.utils.data import Dataset + from lit_data.streaming import Cache from lit_data.streaming.dataloader import CacheDataLoader from lit_data.streaming.dataset import StreamingDataset from lit_data.streaming.item_loader import TokensLoader from lit_data.streaming.serializers import Serializer from lit_data.utilities.env import _DistributedEnv -from lightning.fabric import Fabric -from lightning.pytorch.demos.boring_classes import RandomDataset -from lightning_utilities.core.imports import RequirementCache -from lightning_utilities.test.warning import no_warning_call -from torch.utils.data import Dataset _PIL_AVAILABLE = RequirementCache("PIL") _TORCH_VISION_AVAILABLE = RequirementCache("torchvision") diff --git a/tests/streaming/test_client.py b/tests/streaming/test_client.py index b9ca68ef..23f06c54 100644 --- a/tests/streaming/test_client.py +++ b/tests/streaming/test_client.py @@ -3,6 +3,7 @@ from unittest import mock 
import pytest + from lit_data.streaming import client diff --git a/tests/streaming/test_combined.py b/tests/streaming/test_combined.py index 76b5af84..077d3d05 100644 --- a/tests/streaming/test_combined.py +++ b/tests/streaming/test_combined.py @@ -4,12 +4,13 @@ import pytest import torch +from torch.utils.data import IterableDataset +from torch.utils.data.dataloader import DataLoader + from lit_data.streaming.cache import Cache from lit_data.streaming.combined import CombinedStreamingDataset from lit_data.streaming.dataloader import StreamingDataLoader from lit_data.streaming.dataset import Dir, StreamingDataset -from torch.utils.data import IterableDataset -from torch.utils.data.dataloader import DataLoader class TestCombinedStreamingDataset(CombinedStreamingDataset): diff --git a/tests/streaming/test_dataloader.py b/tests/streaming/test_dataloader.py index c74dec82..fc72ea4c 100644 --- a/tests/streaming/test_dataloader.py +++ b/tests/streaming/test_dataloader.py @@ -2,9 +2,10 @@ import pytest import torch +from torch import tensor + from lit_data.streaming import CombinedStreamingDataset, StreamingDataLoader from lit_data.streaming import dataloader as streaming_dataloader_module -from torch import tensor class TestStatefulDataset: diff --git a/tests/streaming/test_dataset.py b/tests/streaming/test_dataset.py index 752cf7e9..52ae8706 100644 --- a/tests/streaming/test_dataset.py +++ b/tests/streaming/test_dataset.py @@ -20,6 +20,8 @@ import pytest import torch from lightning import seed_everything +from torch.utils.data import DataLoader + from lit_data.processing import functions from lit_data.streaming import Cache from lit_data.streaming import dataset as dataset_module @@ -37,7 +39,6 @@ from lit_data.streaming.item_loader import TokensLoader from lit_data.streaming.shuffle import FullShuffle, NoShuffle from lit_data.utilities.env import _DistributedEnv, _WorkerEnv -from torch.utils.data import DataLoader def test_streaming_dataset(tmpdir, monkeypatch): diff 
--git a/tests/streaming/test_reader.py b/tests/streaming/test_reader.py index 5571a22e..6483b558 100644 --- a/tests/streaming/test_reader.py +++ b/tests/streaming/test_reader.py @@ -3,6 +3,7 @@ from time import sleep import numpy as np + from lit_data.streaming import reader from lit_data.streaming.cache import Cache from lit_data.streaming.config import ChunkedIndex @@ -44,7 +45,7 @@ def test_reader_chunk_removal(tmpdir): index = ChunkedIndex(i, cache._get_chunk_index_from_index(i), is_last_index=i == 24) assert cache[index] == i - assert len(os.listdir(cache_dir)) == 3 + assert len(os.listdir(cache_dir)) in [2, 3] def test_get_folder_size(tmpdir): diff --git a/tests/streaming/test_resolver.py b/tests/streaming/test_resolver.py index 6b1923e7..36a47ed3 100644 --- a/tests/streaming/test_resolver.py +++ b/tests/streaming/test_resolver.py @@ -3,7 +3,6 @@ from unittest import mock import pytest -from lit_data.streaming import resolver from lightning_cloud import login from lightning_cloud.openapi import ( Externalv1Cluster, @@ -17,6 +16,8 @@ V1ListDataConnectionsResponse, ) +from lit_data.streaming import resolver + @pytest.mark.skipif(sys.platform == "win32", reason="windows isn't supported") def test_src_resolver_s3_connections(monkeypatch): diff --git a/tests/streaming/test_sampler.py b/tests/streaming/test_sampler.py index 911ea278..8b1ee8c5 100644 --- a/tests/streaming/test_sampler.py +++ b/tests/streaming/test_sampler.py @@ -2,6 +2,7 @@ import pytest from lightning import seed_everything + from lit_data.streaming.sampler import CacheBatchSampler diff --git a/tests/streaming/test_serializer.py b/tests/streaming/test_serializer.py index 902f8aa8..d07e0629 100644 --- a/tests/streaming/test_serializer.py +++ b/tests/streaming/test_serializer.py @@ -20,6 +20,8 @@ import pytest import torch from lightning import seed_everything +from lightning_utilities.core.imports import RequirementCache + from lit_data.streaming.serializers import ( _AV_AVAILABLE, 
_NUMPY_DTYPES_MAPPING, @@ -36,7 +38,6 @@ TensorSerializer, VideoSerializer, ) -from lightning_utilities.core.imports import RequirementCache _PIL_AVAILABLE = RequirementCache("PIL") diff --git a/tests/streaming/test_writer.py b/tests/streaming/test_writer.py index dc00afa3..84e4ee91 100644 --- a/tests/streaming/test_writer.py +++ b/tests/streaming/test_writer.py @@ -18,12 +18,13 @@ import numpy as np import pytest from lightning import seed_everything +from lightning_utilities.core.imports import RequirementCache + from lit_data.streaming.compression import _ZSTD_AVAILABLE from lit_data.streaming.reader import BinaryReader from lit_data.streaming.sampler import ChunkedIndex from lit_data.streaming.writer import BinaryWriter from lit_data.utilities.format import _FORMAT_TO_RATIO -from lightning_utilities.core.imports import RequirementCache _PIL_AVAILABLE = RequirementCache("PIL") diff --git a/tests/utilities/test_packing.py b/tests/utilities/test_packing.py index cb7e7928..41af8591 100644 --- a/tests/utilities/test_packing.py +++ b/tests/utilities/test_packing.py @@ -1,4 +1,5 @@ import pytest + from lit_data.utilities.packing import _pack_greedily