diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
index 286a59b5..2f260695 100644
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -2,7 +2,7 @@
Before submitting
- [ ] Was this discussed/agreed via a Github issue? (no need for typos and docs improvements)
-- [ ] Did you read the [contributor guideline](https://github.com/Lightning-AI/pytorch-lightning/blob/main/.github/CONTRIBUTING.md), Pull Request section?
+- [ ] Did you read the [contributor guideline](https://github.com/Lightning-AI/lit-data/blob/main/.github/CONTRIBUTING.md), Pull Request section?
- [ ] Did you make sure to update the docs?
- [ ] Did you write any new necessary tests?
diff --git a/.github/workflows/check-typing.yml b/.github/workflows/check-typing.yml
new file mode 100644
index 00000000..a48baace
--- /dev/null
+++ b/.github/workflows/check-typing.yml
@@ -0,0 +1,82 @@
+name: Check formatting flow
+
+on:
+ workflow_call:
+ inputs:
+ actions-ref:
+ description: "Version of actions, normally the same as workflow"
+ required: true
+ type: string
+ python-version:
+ description: "Python version to use"
+ default: "3.9"
+ required: false
+ type: string
+ extra-typing:
+ description: "Package extra to be installed for type checks + include mypy"
+ default: "test"
+ required: false
+ type: string
+
+defaults:
+ run:
+ shell: bash
+
+jobs:
+ mypy:
+ runs-on: ubuntu-20.04
+ steps:
+ - name: Checkout 🛎️
+ uses: actions/checkout@v4
+ with:
+ submodules: recursive
+
+ - name: Set up Python 🐍 ${{ inputs.python-version }}
+ uses: actions/setup-python@v5
+ with:
+ python-version: ${{ inputs.python-version }}
+
+ - name: Install dependencies
+ timeout-minutes: 20
+ run: |
+ # don't use --upgrade to respect the version installed via setup.py
+ pip install -e '.[${{ inputs.extra-typing }}]' mypy \
+ --extra-index-url https://download.pytorch.org/whl/cpu/torch_stable.html
+ pip list
+
+ - name: Pull reusable 🤖 actions️
+ uses: actions/checkout@v4
+ with:
+ ref: ${{ inputs.actions-ref }}
+ path: .cicd
+ repository: Lightning-AI/utilities
+ - name: Print 🖨️ dependencies
+ uses: ./.cicd/.github/actions/pip-list
+ with:
+ unfold: true
+
+ # see: https://github.com/python/mypy/issues/10600#issuecomment-857351152
+ - run: yes | mypy src --install-types || true
+
+ - name: Check typing
+ # mypy uses the config file found in the following order:
+ # 1. mypy.ini
+ # 2. pyproject.toml
+ # 3. setup.cfg
+ # 4. $XDG_CONFIG_HOME/mypy/config
+ # 5. ~/.config/mypy/config
+ # 6. ~/.mypy.ini
+ # https://mypy.readthedocs.io/en/stable/config_file.html
+ run: mypy
+
+ - name: suggest ignores
+ if: failure()
+ run: |
+ mypy --no-error-summary 2>&1 \
+ | tr ':' ' ' \
+ | awk '{print $1}' \
+ | sort \
+ | uniq \
+ | sed 's/\.py//g; s|src/||g; s|\/__init__||g; s|\/|\.|g' \
+ | xargs -I {} echo '"{}",' \
+ || true
diff --git a/.github/workflows/ci-checks.yml b/.github/workflows/ci-checks.yml
index 6d962896..6cf189ff 100644
--- a/.github/workflows/ci-checks.yml
+++ b/.github/workflows/ci-checks.yml
@@ -25,10 +25,10 @@ jobs:
azure-dir: ""
check-package:
- uses: Lightning-AI/utilities/.github/workflows/check-package.yml@main
+ uses: ./check-typing.yml
with:
actions-ref: main
- import-name: "pl_sandbox"
+ import-name: "lit_data"
artifact-name: dist-packages-${{ github.sha }}
testing-matrix: |
{
@@ -39,4 +39,4 @@ jobs:
check-docs:
uses: Lightning-AI/utilities/.github/workflows/check-docs.yml@main
with:
- requirements-file: "_requirements/docs.txt"
+ requirements-file: "requirements/docs.txt"
diff --git a/.github/workflows/ci-testing.yml b/.github/workflows/ci-testing.yml
index 6d5fa02b..b5bc815d 100644
--- a/.github/workflows/ci-testing.yml
+++ b/.github/workflows/ci-testing.yml
@@ -41,11 +41,16 @@ jobs:
- name: Set min. dependencies
if: matrix.requires == 'oldest'
run: |
- for fpath in ('requirements.txt', '_requirements/test.txt'):
+ for fpath in ('requirements.txt', 'requirements/test.txt'):
req = open(fpath).read().replace('>=', '==')
open(fpath, 'w').write(req)
shell: python
+ - name: Display dependencies
+ run: |
+ cat requirements.txt
+ cat requirements/test.txt
+
- name: Get pip cache dir
id: pip-cache
run: echo "dir=$(pip cache dir)" >> $GITHUB_OUTPUT
@@ -65,7 +70,7 @@ jobs:
- name: Tests
run: |
- coverage run --source pl_sandbox -m pytest src tests -v
+ coverage run --source lit_data -m pytest tests -v
- name: Statistics
if: success()
diff --git a/.github/workflows/docs-build.yml b/.github/workflows/docs-build.yml
index 29670d7b..341eb41c 100644
--- a/.github/workflows/docs-build.yml
+++ b/.github/workflows/docs-build.yml
@@ -10,7 +10,7 @@ jobs:
build-docs:
uses: Lightning-AI/utilities/.github/workflows/check-docs.yml@main
with:
- requirements-file: "_requirements/docs.txt"
+ requirements-file: "requirements/docs.txt"
# https://github.com/marketplace/actions/deploy-to-github-pages
docs-deploy:
diff --git a/.github/workflows/label-conflicts.yml b/.github/workflows/label-conflicts.yml
deleted file mode 100644
index f15f5845..00000000
--- a/.github/workflows/label-conflicts.yml
+++ /dev/null
@@ -1,22 +0,0 @@
-name: Label conflicts
-
-on:
- push:
- branches: ["main"]
- pull_request_target:
- types: ["synchronize", "reopened", "opened"]
-
-concurrency:
- group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }}
- cancel-in-progress: true
-
-jobs:
- triage-conflicts:
- runs-on: ubuntu-latest
- steps:
- - uses: mschilde/auto-label-merge-conflicts@591722e97f3c4142df3eca156ed0dcf2bcd362bd # Oct 25, 2021
- with:
- CONFLICT_LABEL_NAME: "has conflicts"
- GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- MAX_RETRIES: 3
- WAIT_MS: 5000
diff --git a/.github/workflows/release-pypi.yml b/.github/workflows/release-pypi.yml
index f3d6ebfe..3b33f3e3 100644
--- a/.github/workflows/release-pypi.yml
+++ b/.github/workflows/release-pypi.yml
@@ -21,8 +21,10 @@ jobs:
- name: Install dependencies
run: pip install -U build twine
+
- name: Build package
run: python -m build
+
- name: Check package
run: twine check dist/*
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index ea7b710c..295c2f1f 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,89 +1,113 @@
+# Copyright The Lightning AI team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
default_language_version:
python: python3
ci:
autofix_prs: true
autoupdate_commit_msg: "[pre-commit.ci] pre-commit suggestions"
- autoupdate_schedule: "monthly"
+ autoupdate_schedule: quarterly
# submodules: true
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
- rev: v4.4.0
+ rev: v4.5.0
hooks:
- id: end-of-file-fixer
- id: trailing-whitespace
- - id: check-case-conflict
+ # keep formatting in README flexible
+ exclude: README.md
+ - id: check-json
- id: check-yaml
- id: check-toml
- - id: check-json
- - id: check-added-large-files
- id: check-docstring-first
+ - id: check-executables-have-shebangs
+ - id: check-case-conflict
+ - id: check-added-large-files
+ args: ["--maxkb=350", "--enforce-all"]
+ exclude: |
+ (?x)^(
+ docs/source-pytorch/_static/images/general/fast_2.gif|
+ docs/source-pytorch/_static/images/mnist_imgs/pt_to_pl.jpg|
+ docs/source-pytorch/_static/images/lightning_module/pt_to_pl.png|
+ docs/source-pytorch/_static/images/general/pl_quick_start_full_compressed.gif|
+ docs/source-pytorch/_static/images/general/pl_overview_flat.jpg|
+ docs/source-pytorch/_static/images/general/pl_overview.gif|
+ src/lightning/fabric/CHANGELOG.md|
+ src/lightning/pytorch/CHANGELOG.md
+ )$
- id: detect-private-key
- repo: https://github.com/asottile/pyupgrade
- rev: v3.9.0
+ rev: v3.15.0
hooks:
- id: pyupgrade
args: ["--py38-plus"]
name: Upgrade code
- - repo: https://github.com/pre-commit/pygrep-hooks
- rev: v1.10.0
- hooks:
- - id: python-use-type-annotations
-
- repo: https://github.com/codespell-project/codespell
- rev: v2.2.5
+ rev: v2.2.6
hooks:
- id: codespell
additional_dependencies: [tomli]
- #args: ["--write-changes"]
+ #args: ["--write-changes"] # uncomment if you want to get automatic fixing
- - repo: https://github.com/pre-commit/mirrors-prettier
- rev: v3.0.3
- hooks:
- - id: prettier
- # https://prettier.io/docs/en/options.html#print-width
- args: ["--print-width=120"]
-
- - repo: https://github.com/myint/docformatter
+ - repo: https://github.com/PyCQA/docformatter
rev: v1.7.5
hooks:
- id: docformatter
additional_dependencies: [tomli]
args: ["--in-place"]
- - repo: https://github.com/psf/black
- rev: 23.7.0
+ - repo: https://github.com/asottile/yesqa
+ rev: v1.5.0
hooks:
- - id: black
- name: Black code
+ - id: yesqa
+ name: Unused noqa
+ additional_dependencies:
+ #- pep8-naming
+ - flake8-pytest-style
+ - flake8-bandit
+ - flake8-simplify
+ - flake8-return
+
+ - repo: https://github.com/astral-sh/ruff-pre-commit
+ rev: "v0.2.0"
+ hooks:
+ - id: ruff
+ args: ["--fix", "--preview"]
+ - id: ruff-format
+ args: ["--preview"]
- repo: https://github.com/executablebooks/mdformat
- rev: 0.7.16
+ rev: 0.7.17
hooks:
- id: mdformat
additional_dependencies:
- mdformat-gfm
- - mdformat-black
+ #- mdformat-black
- mdformat_frontmatter
- exclude: CHANGELOG.md
-
- - repo: https://github.com/asottile/yesqa
- rev: v1.5.0
- hooks:
- - id: yesqa
- additional_dependencies:
- - pep8-naming
- - pydocstyle
- #- flake8-comprehensions
- #- flake8-pytest-style
- #- flake8-return
- #- flake8-simplify
+ exclude: |
+ (?x)^(
+ src/data/CHANGELOG.md|
+ README.md
+ )$
- - repo: https://github.com/charliermarsh/ruff-pre-commit
- rev: v0.0.277
+ - repo: https://github.com/pre-commit/mirrors-prettier
+ rev: v3.1.0
hooks:
- - id: ruff
- args: ["--fix"]
+ - id: prettier
+ # https://prettier.io/docs/en/options.html#print-width
+ args: ["--print-width=120"]
diff --git a/.readthedocs.yml b/.readthedocs.yml
index f6bb36d5..e67b1544 100644
--- a/.readthedocs.yml
+++ b/.readthedocs.yml
@@ -17,4 +17,4 @@ formats: all
python:
version: 3.7
install:
- - requirements: _requirements/docs.txt
+ - requirements: requirements/docs.txt
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 28581466..0b6fbf91 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,7 +5,7 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
-## [Unreleased] - YYYY-MM-DD
+## \[Unreleased\] - YYYY-MM-DD
### Added
diff --git a/MANIFEST.in b/MANIFEST.in
index 4cd9bf95..cf416353 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -5,7 +5,7 @@ recursive-exclude __pycache__ *.py[cod] *.orig
# Include the README and CHANGELOG
include *.md
-recursive-include src *.md
+recursive-include lit_data *.md
# Include the license file
include LICENSE
@@ -28,7 +28,7 @@ exclude docs
# Include the Requirements
include requirements.txt
-recursive-include _requirements *.tx;t
+recursive-include requirements *.tx;t
# Exclude Makefile
exclude Makefile
diff --git a/Makefile b/Makefile
index 8d1105ed..ee079558 100644
--- a/Makefile
+++ b/Makefile
@@ -7,14 +7,14 @@ export SPHINX_MOCK_REQUIREMENTS=0
test: clean
pip install -q -r requirements.txt
- pip install -q -r _requirements/test.txt
+ pip install -q -r requirements/test.txt
# use this to run tests
- python -m coverage run --source pl_sandbox -m pytest src tests -v --flake8
+ python -m coverage run --source lit_data -m pytest src -v --flake8
python -m coverage report
docs: clean
- pip install . --quiet -r _requirements/docs.txt
+ pip install . --quiet -r requirements/docs.txt
python -m sphinx -b html -W --keep-going docs/source docs/build
clean:
diff --git a/README.md b/README.md
index 5e7e9aa0..e59488be 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
-
+
@@ -32,7 +32,7 @@ Lightning Data can be installed with `pip`:
```bash
-pip install --no-cache-dir git+https://github.com/Lightning-AI/pytorch-lightning.git@master
+pip install --no-cache-dir git+https://github.com/Lightning-AI/lit-data.git@master
```
## 🏁 Quick Start
@@ -45,7 +45,7 @@ Convert your raw dataset into Lightning Streaming format using the `optimize` op
```python
import numpy as np
-from lightning.data import optimize
+from lit_data import optimize
from PIL import Image
@@ -84,7 +84,7 @@ Here is an example with [AWS S3](https://aws.amazon.com/s3).
### 3. Use StreamingDataset and DataLoader
```python
-from lightning.data import StreamingDataset
+from lit_data import StreamingDataset
from torch.utils.data import DataLoader
# Remote path where full dataset is persistently stored
@@ -135,7 +135,7 @@ for i in range(1000):
```python
import os
-from lightning.data import map
+from lit_data import map
from PIL import Image
input_dir = "s3://my-bucket/my_images"
@@ -174,7 +174,7 @@ We have end-to-end free [Studios](https://lightning.ai) showing all the steps to
To scale data processing, create a free account on [lightning.ai](https://lightning.ai/) platform. With the platform, the `optimize` and `map` can start multiple machines to make data processing drastically faster as follows:
```python
-from lightning.data import optimize, Machine
+from lit_data import optimize, Machine
optimize(
...
@@ -186,7 +186,7 @@ optimize(
OR
```python
-from lightning.data import map, Machine
+from lit_data import map, Machine
map(
...
@@ -216,8 +216,8 @@ The `StreamingDataset` and `StreamingDataLoader` takes care of everything for yo
You can easily experiment with dataset mixtures using the CombinedStreamingDataset.
```python
-from lightning.data import StreamingDataset, CombinedStreamingDataset
-from lightning.data.streaming.item_loader import TokensLoader
+from lit_data import StreamingDataset, CombinedStreamingDataset
+from lit_data.streaming.item_loader import TokensLoader
from tqdm import tqdm
import os
from torch.utils.data import DataLoader
@@ -257,7 +257,7 @@ Note: The `StreamingDataLoader` is used by [Lit-GPT](https://github.com/Lightnin
```python
import os
import torch
-from lightning.data import StreamingDataset, StreamingDataLoader
+from lit_data import StreamingDataset, StreamingDataLoader
dataset = StreamingDataset("s3://my-bucket/my-data", shuffle=True)
dataloader = StreamingDataLoader(dataset, num_workers=os.cpu_count(), batch_size=64)
@@ -280,7 +280,7 @@ for batch_idx, batch in enumerate(dataloader):
The `StreamingDataLoader` supports profiling your data loading. Simply use the `profile_batches` argument as follows:
```python
-from lightning.data import StreamingDataset, StreamingDataLoader
+from lit_data import StreamingDataset, StreamingDataLoader
StreamingDataLoader(..., profile_batches=5)
```
@@ -292,7 +292,7 @@ This generates a Chrome trace called `result.json`. You can visualize this trace
Access the data you need when you need it.
```python
-from lightning.data import StreamingDataset
+from lit_data import StreamingDataset
dataset = StreamingDataset(...)
@@ -304,7 +304,7 @@ print(dataset[42]) # show the 42th element of the dataset
## ✢ Use data transforms
```python
-from lightning.data import StreamingDataset, StreamingDataLoader
+from lit_data import StreamingDataset, StreamingDataLoader
import torchvision.transforms.v2.functional as F
class ImagenetStreamingDataset(StreamingDataset):
@@ -326,7 +326,7 @@ for batch in dataloader:
Limit the size of the cache holding the chunks.
```python
-from lightning.data import StreamingDataset
+from lit_data import StreamingDataset
dataset = StreamingDataset(..., max_cache_size="10GB")
```
@@ -338,7 +338,7 @@ When processing large files like compressed [parquet files](https://en.wikipedia
```python
from pathlib import Path
import pyarrow.parquet as pq
-from lightning.data import optimize
+from lit_data import optimize
from tokenizer import Tokenizer
from functools import partial
diff --git a/_requirements/test.txt b/_requirements/test.txt
deleted file mode 100644
index c3fbc921..00000000
--- a/_requirements/test.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-coverage>=5.0
-pytest>=6.0
-pytest-cov
-mypy==1.8.0
diff --git a/docs/source/_templates/theme_variables.jinja b/docs/source/_templates/theme_variables.jinja
index 5d092539..28f05a5e 100644
--- a/docs/source/_templates/theme_variables.jinja
+++ b/docs/source/_templates/theme_variables.jinja
@@ -1,18 +1,18 @@
{%- set external_urls = {
- 'github': 'https://github.com/Lightning-AI/lightning-Sandbox',
- 'github_issues': 'https://github.com/Lightning-AI/lightning-Sandbox/issues',
+ 'github': 'https://github.com/Lightning-AI/lit-data',
+ 'github_issues': 'https://github.com/Lightning-AI/lit-data/issues',
'contributing': 'https://github.com/Lightning-AI/lightning/blob/master/CONTRIBUTING.md',
'governance': 'https://github.com/Lightning-AI/lightning/blob/master/governance.md',
- 'docs': 'https://lightning-ai.github.io/lightning-Sandbox/',
+ 'docs': 'https://lightning-ai.github.io/lit-data/',
'twitter': 'https://twitter.com/LightningAI',
'discuss': 'https://discord.com/invite/tfXFetEZxv',
'tutorials': 'https://lightning.ai',
- 'previous_pytorch_versions': 'https://lightning-ai.github.io/lightning-Sandbox/',
- 'home': 'https://lightning-ai.github.io/lightning-Sandbox/',
+ 'previous_pytorch_versions': 'https://lightning-ai.github.io/lit-data/',
+ 'home': 'https://lightning-ai.github.io/lit-data/',
'get_started': 'https://lightning.ai',
- 'features': 'https://lightning-ai.github.io/lightning-Sandbox/',
+ 'features': 'https://lightning-ai.github.io/lit-data/',
'blog': 'https://www.Lightning.ai/blog',
'resources': 'https://lightning.ai',
- 'support': 'https://lightning-ai.github.io/lightning-Sandbox/',
+ 'support': 'https://lightning-ai.github.io/lit-data/',
}
-%}
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 0651f76b..a6da23d1 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -16,20 +16,19 @@
_PATH_HERE = os.path.abspath(os.path.dirname(__file__))
_PATH_ROOT = os.path.realpath(os.path.join(_PATH_HERE, "..", ".."))
-_PATH_SOURCE = os.path.join(_PATH_ROOT, "src")
sys.path.insert(0, os.path.abspath(_PATH_ROOT))
SPHINX_MOCK_REQUIREMENTS = int(os.environ.get("SPHINX_MOCK_REQUIREMENTS", True))
# alternative https://stackoverflow.com/a/67692/4521646
-spec = spec_from_file_location("pl_sandbox/__about__.py", os.path.join(_PATH_SOURCE, "pl_sandbox", "__about__.py"))
+spec = spec_from_file_location("lit_data/__about__.py", os.path.join(_PATH_ROOT, "lit_data", "__about__.py"))
about = module_from_spec(spec)
spec.loader.exec_module(about)
# -- Project information -----------------------------------------------------
# this name shall match the project name in Github as it is used for linking to code
-project = "Lightning-Sandbox"
+project = "lit-data"
copyright = about.__copyright__
author = about.__author__
@@ -317,8 +316,8 @@ def find_source():
fname = inspect.getsourcefile(obj)
# https://github.com/rtfd/readthedocs.org/issues/5735
if any(s in fname for s in ("readthedocs", "rtfd", "checkouts")):
- # /home/docs/checkouts/readthedocs.org/user_builds/pytorch_lightning/checkouts/
- # devel/pytorch_lightning/utilities/cls_experiment.py#L26-L176
+ # /home/docs/checkouts/readthedocs.org/user_builds/lit_data/checkouts/
+ # devel/lit_data/utilities/cls_experiment.py#L26-L176
path_top = os.path.abspath(os.path.join("..", "..", ".."))
fname = os.path.relpath(fname, start=path_top)
else:
@@ -381,8 +380,8 @@ def find_source():
import os
import torch
-import pytorch_lightning as pl
-from pytorch_lightning import Trainer, LightningModule
+import lit_data
+from lit_data import StreamingDataset
"""
coverage_skip_undoc_in_source = True
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 2899f863..d5a27347 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -3,7 +3,7 @@
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
-Lightning-Sandbox
+lit-data
=================
.. toctree::
diff --git a/lit_data/CHANGELOG.md b/lit_data/CHANGELOG.md
new file mode 100644
index 00000000..e3655c93
--- /dev/null
+++ b/lit_data/CHANGELOG.md
@@ -0,0 +1,37 @@
+# Changelog
+
+All notable changes to this project will be documented in this file.
+
+The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
+
+## \[unReleased\] - 2024-MM-DD
+
+### Added
+
+-
+-
+-
+
+### Changed
+
+-
+-
+-
+
+### Deprecated
+
+-
+-
+-
+
+### Removed
+
+-
+-
+-
+
+### Fixed
+
+-
+-
+-
diff --git a/lit_data/__about__.py b/lit_data/__about__.py
new file mode 100644
index 00000000..fdf610bb
--- /dev/null
+++ b/lit_data/__about__.py
@@ -0,0 +1,43 @@
+# Copyright The Lightning AI team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import time
+
+__author__ = "Lightning AI et al."
+__author_email__ = "pytorch@lightning.ai"
+__license__ = "Apache-2.0"
+__copyright__ = f"Copyright (c) 2018-{time.strftime('%Y')}, {__author__}."
+__homepage__ = "https://github.com/Lightning-AI/lit-data"
+__docs_url__ = "https://lightning.ai/docs/pytorch/stable/"
+# this has to be simple string, see: https://github.com/pypa/twine/issues/522
+__docs__ = "The Deep Learning framework to train, deploy, and ship AI products Lightning fast."
+__long_doc__ = """
+What is it?
+-----------
+
+TBD @eden
+""" # TODO
+
+__all__ = [
+ "__author__",
+ "__author_email__",
+ "__copyright__",
+ "__docs__",
+ "__docs_url__",
+ "__homepage__",
+ "__license__",
+ "__version__",
+]
+
+__version__ = "0.0.1"
diff --git a/src/__init__.py b/lit_data/__init__.py
similarity index 100%
rename from src/__init__.py
rename to lit_data/__init__.py
diff --git a/src/constants.py b/lit_data/constants.py
similarity index 100%
rename from src/constants.py
rename to lit_data/constants.py
diff --git a/src/processing/__init__.py b/lit_data/processing/__init__.py
similarity index 100%
rename from src/processing/__init__.py
rename to lit_data/processing/__init__.py
diff --git a/src/processing/data_processor.py b/lit_data/processing/data_processor.py
similarity index 99%
rename from src/processing/data_processor.py
rename to lit_data/processing/data_processor.py
index 33602799..52afaea5 100644
--- a/src/processing/data_processor.py
+++ b/lit_data/processing/data_processor.py
@@ -17,9 +17,9 @@
from urllib import parse
import numpy as np
+from lightning import seed_everything
from tqdm.auto import tqdm as _tqdm
-from lightning import seed_everything
from lit_data.constants import (
_BOTO3_AVAILABLE,
_DEFAULT_FAST_DEV_RUN_ITEMS,
@@ -427,7 +427,7 @@ def _loop(self) -> None:
uploader.join()
if self.remove:
- assert self.remover
+ assert self.remover # noqa: S101
self.remove_queue.put(None)
self.remover.join()
@@ -487,7 +487,7 @@ def _try_upload(self, data: Optional[Union[str, Tuple[str, str]]]) -> None:
if isinstance(data, str):
assert os.path.exists(data), data
else:
- assert os.path.exists(data[-1]), data
+ assert os.path.exists(data[-1]), data # noqa: S101
self.to_upload_queues[self._counter % self.num_uploaders].put(data)
@@ -772,7 +772,7 @@ def _upload_index(self, output_dir: Dir, cache_dir: str, num_nodes: int, node_ra
# Get the index file locally
for node_rank in range(num_nodes - 1):
output_dir_path = output_dir.url if output_dir.url else output_dir.path
- assert output_dir_path
+ assert output_dir_path # noqa: S101
remote_filepath = os.path.join(output_dir_path, f"{node_rank}-{_INDEX_FILENAME}")
node_index_filepath = os.path.join(cache_dir, os.path.basename(remote_filepath))
if obj.scheme == "s3":
@@ -941,7 +941,7 @@ def run(self, data_recipe: DataRecipe) -> None:
error = self.error_queue.get(timeout=0.001)
self._exit_on_error(error)
except Empty:
- assert self.progress_queue
+ assert self.progress_queue # noqa: S101
try:
index, counter = self.progress_queue.get(timeout=0.001)
except Empty:
diff --git a/src/processing/functions.py b/lit_data/processing/functions.py
similarity index 99%
rename from src/processing/functions.py
rename to lit_data/processing/functions.py
index 83b632cf..afa516e6 100644
--- a/src/processing/functions.py
+++ b/lit_data/processing/functions.py
@@ -41,14 +41,12 @@
def _get_indexed_paths(data: Any) -> Dict[int, str]:
flattened_item, _ = tree_flatten(data)
- indexed_paths = {
+ return {
index: element
for index, element in enumerate(flattened_item)
if isinstance(element, str) and os.path.exists(element)
}
- return indexed_paths
-
def _get_input_dir(inputs: Sequence[Any]) -> Optional[str]:
indexed_paths = _get_indexed_paths(inputs[0])
diff --git a/src/processing/readers.py b/lit_data/processing/readers.py
similarity index 100%
rename from src/processing/readers.py
rename to lit_data/processing/readers.py
diff --git a/src/processing/utilities.py b/lit_data/processing/utilities.py
similarity index 95%
rename from src/processing/utilities.py
rename to lit_data/processing/utilities.py
index 051e25dd..1bdf9c71 100644
--- a/src/processing/utilities.py
+++ b/lit_data/processing/utilities.py
@@ -2,7 +2,7 @@
import os
import urllib
from contextlib import contextmanager
-from subprocess import Popen
+from subprocess import Popen # noqa: S404
from typing import Any, Callable, Optional, Tuple
from lit_data.constants import _IS_IN_STUDIO
@@ -29,12 +29,12 @@ def _wrapper(*args: Any, **kwargs: Any) -> Tuple[Any, Optional[Exception]]:
def make_request(
url: str,
timeout: int = 10,
- user_agent_token: str = "pytorch-lightning",
+ user_agent_token: str = "lit-data",
) -> io.BytesIO:
"""Download an image with urllib."""
user_agent_string = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0"
if user_agent_token:
- user_agent_string += f" (compatible; {user_agent_token}; +https://github.com/Lightning-AI/pytorch-lightning)"
+ user_agent_string += f" (compatible; {user_agent_token}; +https://github.com/Lightning-AI/lit-data)"
with urllib.request.urlopen( # noqa: S310
urllib.request.Request(url, data=None, headers={"User-Agent": user_agent_string}), timeout=timeout
diff --git a/src/streaming/__init__.py b/lit_data/streaming/__init__.py
similarity index 100%
rename from src/streaming/__init__.py
rename to lit_data/streaming/__init__.py
diff --git a/src/streaming/cache.py b/lit_data/streaming/cache.py
similarity index 100%
rename from src/streaming/cache.py
rename to lit_data/streaming/cache.py
diff --git a/src/streaming/client.py b/lit_data/streaming/client.py
similarity index 100%
rename from src/streaming/client.py
rename to lit_data/streaming/client.py
diff --git a/src/streaming/combined.py b/lit_data/streaming/combined.py
similarity index 100%
rename from src/streaming/combined.py
rename to lit_data/streaming/combined.py
diff --git a/src/streaming/compression.py b/lit_data/streaming/compression.py
similarity index 100%
rename from src/streaming/compression.py
rename to lit_data/streaming/compression.py
diff --git a/src/streaming/config.py b/lit_data/streaming/config.py
similarity index 100%
rename from src/streaming/config.py
rename to lit_data/streaming/config.py
diff --git a/src/streaming/dataloader.py b/lit_data/streaming/dataloader.py
similarity index 100%
rename from src/streaming/dataloader.py
rename to lit_data/streaming/dataloader.py
diff --git a/src/streaming/dataset.py b/lit_data/streaming/dataset.py
similarity index 100%
rename from src/streaming/dataset.py
rename to lit_data/streaming/dataset.py
diff --git a/src/streaming/downloader.py b/lit_data/streaming/downloader.py
similarity index 100%
rename from src/streaming/downloader.py
rename to lit_data/streaming/downloader.py
diff --git a/src/streaming/item_loader.py b/lit_data/streaming/item_loader.py
similarity index 100%
rename from src/streaming/item_loader.py
rename to lit_data/streaming/item_loader.py
diff --git a/src/streaming/reader.py b/lit_data/streaming/reader.py
similarity index 100%
rename from src/streaming/reader.py
rename to lit_data/streaming/reader.py
diff --git a/src/streaming/resolver.py b/lit_data/streaming/resolver.py
similarity index 100%
rename from src/streaming/resolver.py
rename to lit_data/streaming/resolver.py
diff --git a/src/streaming/sampler.py b/lit_data/streaming/sampler.py
similarity index 100%
rename from src/streaming/sampler.py
rename to lit_data/streaming/sampler.py
diff --git a/src/streaming/serializers.py b/lit_data/streaming/serializers.py
similarity index 100%
rename from src/streaming/serializers.py
rename to lit_data/streaming/serializers.py
diff --git a/src/streaming/shuffle.py b/lit_data/streaming/shuffle.py
similarity index 100%
rename from src/streaming/shuffle.py
rename to lit_data/streaming/shuffle.py
diff --git a/src/streaming/writer.py b/lit_data/streaming/writer.py
similarity index 100%
rename from src/streaming/writer.py
rename to lit_data/streaming/writer.py
diff --git a/src/utilities/__init__.py b/lit_data/utilities/__init__.py
similarity index 100%
rename from src/utilities/__init__.py
rename to lit_data/utilities/__init__.py
diff --git a/src/utilities/broadcast.py b/lit_data/utilities/broadcast.py
similarity index 100%
rename from src/utilities/broadcast.py
rename to lit_data/utilities/broadcast.py
diff --git a/src/utilities/env.py b/lit_data/utilities/env.py
similarity index 100%
rename from src/utilities/env.py
rename to lit_data/utilities/env.py
diff --git a/src/utilities/format.py b/lit_data/utilities/format.py
similarity index 100%
rename from src/utilities/format.py
rename to lit_data/utilities/format.py
diff --git a/src/utilities/packing.py b/lit_data/utilities/packing.py
similarity index 100%
rename from src/utilities/packing.py
rename to lit_data/utilities/packing.py
diff --git a/src/utilities/shuffle.py b/lit_data/utilities/shuffle.py
similarity index 100%
rename from src/utilities/shuffle.py
rename to lit_data/utilities/shuffle.py
diff --git a/pyproject.toml b/pyproject.toml
index d3a55d64..fa123ec0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,21 @@
+# Copyright The Lightning AI team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
[metadata]
-license_file = "LICENSE"
-description-file = "README.md"
+name = "lit_data"
+author = "Lightning-AI et al."
+url = "https://github.com/Lightning-AI/lit-data"
[build-system]
requires = [
@@ -9,125 +24,172 @@ requires = [
]
-[tool.check-manifest]
-ignore = [
- "*.yml",
- ".github",
- ".github/*"
-]
-
-
-[tool.pytest.ini_options]
-norecursedirs = [
- ".git",
- ".github",
- "dist",
- "build",
- "docs",
-]
-addopts = [
- "--strict-markers",
- "--doctest-modules",
- "--color=yes",
- "--disable-pytest-warnings",
-]
-filterwarnings = [
- "error::FutureWarning",
-]
-xfail_strict = true
-junit_duration_report = "call"
-
-[tool.coverage.report]
-exclude_lines = [
- "pragma: no cover",
- "pass",
-]
-
-[tool.codespell]
-#skip = '*.py'
-quiet-level = 3
-# comma separated list of words; waiting for:
-# https://github.com/codespell-project/codespell/issues/2839#issuecomment-1731601603
-# also adding links until they ignored by its: nature
-# https://github.com/codespell-project/codespell/issues/2243#issuecomment-1732019960
-#ignore-words-list = ""
-
-
[tool.black]
-# https://github.com/psf/black
line-length = 120
-exclude = "(.eggs|.git|.hg|.mypy_cache|.venv|_build|buck-out|build|dist)"
+exclude = '(_notebooks/.*)'
[tool.docformatter]
recursive = true
-wrap-summaries = 120
+# this need to be shorter as some docstings are r"""...
+wrap-summaries = 119
wrap-descriptions = 120
blank = true
-
-[tool.mypy]
-files = [
- "src",
-]
-install_types = true
-non_interactive = true
-disallow_untyped_defs = true
-ignore_missing_imports = true
-show_error_codes = true
-warn_redundant_casts = true
-warn_unused_configs = true
-warn_unused_ignores = true
-allow_redefinition = true
-# disable this rule as the Trainer attributes are defined in the connectors, not in its __init__
-disable_error_code = "attr-defined"
-# style choices
-warn_no_return = false
+[tool.codespell]
+# Todo: enable also python files in a next step
+skip = '*.py'
+quiet-level = 3
+# comma separated list of words; waiting for:
+# https://github.com/codespell-project/codespell/issues/2839#issuecomment-1731601603
+# also adding links until they ignored by its: nature
+# https://github.com/codespell-project/codespell/issues/2243#issuecomment-1732019960
+ignore-words-list = "te, compiletime"
[tool.ruff]
line-length = 120
# Enable Pyflakes `E` and `F` codes by default.
-select = [
+lint.select = [
"E", "W", # see: https://pypi.org/project/pycodestyle
"F", # see: https://pypi.org/project/pyflakes
- "D", # see: https://pypi.org/project/pydocstyle
- "N", # see: https://pypi.org/project/pep8-naming
+ "S", # see: https://pypi.org/project/flake8-bandit
+ "RUF018", # see: https://docs.astral.sh/ruff/rules/assignment-in-assert
]
-extend-select = [
+lint.extend-select = [
+ "I", # see: isort
"C4", # see: https://pypi.org/project/flake8-comprehensions
- "PT", # see: https://pypi.org/project/flake8-pytest-style
- "RET", # see: https://pypi.org/project/flake8-return
"SIM", # see: https://pypi.org/project/flake8-simplify
+ "RET", # see: https://pypi.org/project/flake8-return
+ "PT", # see: https://pypi.org/project/flake8-pytest-style
]
-ignore = [
+lint.ignore = [
"E731", # Do not assign a lambda expression, use a def
+ "S108",
+ "E203", # conflicts with black
]
# Exclude a variety of commonly ignored directories.
exclude = [
- ".eggs",
".git",
- ".mypy_cache",
- ".ruff_cache",
- "__pypackages__",
- "_build",
- "build",
- "dist",
- "docs"
+ "docs",
+ "_notebooks"
]
-ignore-init-module-imports = true
+lint.ignore-init-module-imports = true
+
+[tool.ruff.lint.per-file-ignores]
+".actions/*" = ["S101", "S310"]
+"setup.py" = ["S101", "SIM115"]
+"examples/**" = [
+ "S101", # Use of `assert` detected
+ "S113", # todo: Probable use of requests call without
+ "S104", # Possible binding to all interface
+ "F821", # Undefined name `...`
+ "S311", # Standard pseudo-random generators are not suitable for cryptographic purposes
+ "S501", # Probable use of `requests` call with `verify=False` disabling SSL certificate checks
+ "S108", # Probable insecure usage of temporary file or directory: "/tmp/data/MNIST"
+]
+"lit_data/**" = [
+ "S101", # todo: Use of `assert` detected
+ "S105", "S106", "S107", # todo: Possible hardcoded password: ...
+ "S113", # todo: Probable use of requests call without timeout
+ "S301", # todo: `pickle` and modules that wrap it can be unsafe when used to deserialize untrusted data, possible security issue
+ "S324", # todo: Probable use of insecure hash functions in `hashlib`
+ "S403", # todo: `pickle`, `cPickle`, `dill`, and `shelve` modules are possibly insecure
+ "S404", # todo: `subprocess` module is possibly insecure
+ "S602", # todo: `subprocess` call with `shell=True` identified, security issue
+ "S603", # todo: `subprocess` call: check for execution of untrusted input
+ "S605", # todo: Starting a process with a shell: seems safe, but may be changed in the future; consider rewriting without `shell`
+ "S607", # todo: Starting a process with a partial executable path
+ "RET504", # todo:Unnecessary variable assignment before `return` statement
+ "RET503",
+ "S310", # todo: Audit URL open for permitted schemes. Allowing use of `file:` or custom schemes is often unexpected.
+]
+"tests/**" = [
+ "S101", # Use of `assert` detected
+ "S105", "S106", # todo: Possible hardcoded password: ...
+ "S301", # `pickle` and modules that wrap it can be unsafe when used to deserialize untrusted data, possible security issue
+ "S113", # todo: Probable use of requests call without timeout
+ "S311", # todo: Standard pseudo-random generators are not suitable for cryptographic purposes
+ "S108", # todo: Probable insecure usage of temporary file or directory: "/tmp/sys-customizations-sync"
+ "S202", # Uses of `tarfile.extractall()`
+ "S403", # `pickle`, `cPickle`, `dill`, and `shelve` modules are possibly insecure
+ "S404", # `subprocess` module is possibly insecure
+ "S602", # todo: `subprocess` call with `shell=True` identified, security issue
+ "S603", # todo: `subprocess` call: check for execution of untrusted input
+ "S605", # todo: Starting a process with a shell: seems safe, but may be changed in the future; consider rewriting without `shell`
+ "S607", # todo: Starting a process with a partial executable path
+ "RET504", # todo:Unnecessary variable assignment before `return` statement
+ "PT004", # todo: Fixture `tmpdir_unittest_fixture` does not return anything, add leading underscore
+ "PT011", # todo: `pytest.raises(ValueError)` is too broad, set the `match` parameter or use a more specific exception
+ "PT012", # todo: `pytest.raises()` block should contain a single simple statement
+ "PT019", # todo: Fixture `_` without value is injected as parameter, use `@pytest.mark.usefixtures` instead
+]
+
+[tool.ruff.lint.mccabe]
+# Unlike Flake8, default to a complexity level of 10.
+max-complexity = 10
-[tool.ruff.per-file-ignores]
-"setup.py" = ["D100", "SIM115"]
-"__about__.py" = ["D100"]
-"__init__.py" = ["D100"]
-[tool.ruff.pydocstyle]
-# Use Google-style docstrings.
-convention = "google"
+[tool.mypy]
+files = [
+ "src/lightning",
+]
+# This section is for folders with "-" as they are not valid python modules
+exclude = [
+]
+install_types = "True"
+non_interactive = "True"
+disallow_untyped_defs = "True"
+ignore_missing_imports = "True"
+show_error_codes = "True"
+warn_redundant_casts = "True"
+warn_unused_configs = "True"
+warn_unused_ignores = "True"
+allow_redefinition = "True"
+# disable this rule as the Trainer attributes are defined in the connectors, not in its __init__
+disable_error_code = "attr-defined"
+# style choices
+warn_no_return = "False"
+
+# Ignore mypy errors for these files
+# TODO: the goal is for this to be empty
+[[tool.mypy.overrides]]
+# the list can be generated with:
+# mypy --no-error-summary 2>&1 | tr ':' ' ' | awk '{print $1}' | sort | uniq | sed 's/\.py//g; s|src/||g; s|\/|\.|g' | xargs -I {} echo '"{}",'
+module = [
+]
+ignore_errors = "True"
-#[tool.ruff.pycodestyle]
-#ignore-overlong-task-comments = true
-[tool.ruff.mccabe]
-# Unlike Flake8, default to a complexity level of 10.
-max-complexity = 10
+[tool.coverage.report]
+exclude_lines = [
+ "pragma: no cover",
+ "warnings",
+ "pass",
+ "rank_zero_warn",
+ "raise NotImplementedError",
+]
+
+
+[tool.pytest.ini_options]
+norecursedirs = [
+ ".git",
+ ".github",
+ "dist",
+ "build",
+ "docs",
+]
+addopts = [
+ "--strict-markers",
+ "--doctest-modules",
+ "--color=yes",
+ "--disable-pytest-warnings",
+ "--ignore=legacy/checkpoints",
+]
+markers = [
+ "cloud: Run the cloud tests for example",
+]
+filterwarnings = [
+ "error::FutureWarning",
+]
+xfail_strict = true
+junit_duration_report = "call"
diff --git a/requirements.txt b/requirements.txt
index 9df9b07b..efc63699 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,9 @@
-torch >=2.0.0
-lightning >=2.0.0
+lightning-utilities >=0.8.0, <0.10.0
+lightning-cloud == 0.5.64 # Must be pinned to ensure compatibility
+# to be able to include also PL 2.0 and preserve `>` needed for CI min version bypass
+torch >=2.1.0, <=2.2.0
+filelock
+tqdm
+numpy
+torchvision
+pillow
diff --git a/_requirements/docs.txt b/requirements/docs.txt
similarity index 100%
rename from _requirements/docs.txt
rename to requirements/docs.txt
diff --git a/requirements/test.txt b/requirements/test.txt
new file mode 100644
index 00000000..90b502be
--- /dev/null
+++ b/requirements/test.txt
@@ -0,0 +1,12 @@
+coverage ==7.3.1
+pytest ==7.4.0
+pytest-cov ==4.1.0
+pytest-timeout ==2.1.0
+pytest-rerunfailures ==12.0
+pytest-random-order ==1.1.0
+viztracer
+pandas
+pyarrow
+pillow
+lightning
+mypy
diff --git a/setup.py b/setup.py
old mode 100755
new mode 100644
index fcadb15d..e8e19bc7
--- a/setup.py
+++ b/setup.py
@@ -8,12 +8,11 @@
from setuptools import find_packages, setup
_PATH_ROOT = os.path.dirname(__file__)
-_PATH_SOURCE = os.path.join(_PATH_ROOT, "src")
-_PATH_REQUIRES = os.path.join(_PATH_ROOT, "_requirements")
+_PATH_REQUIRES = os.path.join(_PATH_ROOT, "requirements")
-def _load_py_module(fname, pkg="pl_sandbox"):
- spec = spec_from_file_location(os.path.join(pkg, fname), os.path.join(_PATH_SOURCE, pkg, fname))
+def _load_py_module(fname, pkg="lit_data"):
+ spec = spec_from_file_location(os.path.join(pkg, fname), os.path.join(_PATH_ROOT, pkg, fname))
py = module_from_spec(spec)
spec.loader.exec_module(py)
return py
@@ -29,10 +28,10 @@ def _load_requirements(path_dir: str = _PATH_ROOT, file_name: str = "requirement
readme = fopen.read()
-def _prepare_extras(requirements_dir: str = _PATH_REQUIRES, skip_files: tuple = ("devel.txt", "docs.txt")) -> dict:
+def _prepare_extras(requirements_dir: str = _PATH_REQUIRES, skip_files: tuple = ()) -> dict:
# https://setuptools.readthedocs.io/en/latest/setuptools.html#declaring-extras
# Define package extras. These are only installed if you specify them.
- # From remote, use like `pip install pytorch-lightning[dev, docs]`
+ # From remote, use like `pip install lit-data[dev, docs]`
# From local copy of repo, use like `pip install ".[dev, docs]"`
req_files = [Path(p) for p in glob.glob(os.path.join(requirements_dir, "*.txt"))]
extras = {
@@ -52,16 +51,16 @@ def _prepare_extras(requirements_dir: str = _PATH_REQUIRES, skip_files: tuple =
# the goal of the project is simplicity for researchers, don't want to add too much
# engineer specific practices
setup(
- name="lightning-sandbox",
+ name="lit_data",
version=about.__version__,
description=about.__docs__,
author=about.__author__,
author_email=about.__author_email__,
url=about.__homepage__,
- download_url="https://github.com/Lightning-AI/lightning-sandbox",
+ download_url="https://github.com/Lightning-AI/lit-data",
license=about.__license__,
- packages=find_packages(where="src"),
- package_dir={"": "src"},
+ packages=find_packages(where="lit_data"),
+ package_dir={"": "lit_data"},
long_description=readme,
long_description_content_type="text/markdown",
include_package_data=True,
@@ -72,9 +71,9 @@ def _prepare_extras(requirements_dir: str = _PATH_REQUIRES, skip_files: tuple =
install_requires=_load_requirements(),
extras_require=_prepare_extras(),
project_urls={
- "Bug Tracker": "https://github.com/Lightning-AI/lightning-sandbox/issues",
- "Documentation": "https://lightning-ai.github.io/lightning-sandbox/",
- "Source Code": "https://github.com/Lightning-AI/lightning-sandbox",
+ "Bug Tracker": "https://github.com/Lightning-AI/lit-data/issues",
+ "Documentation": "https://lightning-ai.github.io/lit-data/",
+ "Source Code": "https://github.com/Lightning-AI/lit-data",
},
classifiers=[
"Environment :: Console",
@@ -92,9 +91,8 @@ def _prepare_extras(requirements_dir: str = _PATH_REQUIRES, skip_files: tuple =
# Specify the Python versions you support here. In particular, ensure
# that you indicate whether you support Python 2, Python 3 or both.
"Programming Language :: Python :: 3",
- "Programming Language :: Python :: 3.8",
- "Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
+ "Programming Language :: Python :: 3.12",
],
)
diff --git a/tests/processing/test_data_processor.py b/tests/processing/test_data_processor.py
index c410893a..0dcf3eff 100644
--- a/tests/processing/test_data_processor.py
+++ b/tests/processing/test_data_processor.py
@@ -10,6 +10,8 @@
import pytest
import torch
from lightning import seed_everything
+from lightning_utilities.core.imports import RequirementCache
+
from lit_data.processing import data_processor as data_processor_module
from lit_data.processing import functions
from lit_data.processing.data_processor import (
@@ -28,7 +30,6 @@
from lit_data.processing.functions import LambdaDataTransformRecipe, map, optimize
from lit_data.streaming import resolver
from lit_data.streaming.cache import Cache, Dir
-from lightning_utilities.core.imports import RequirementCache
_PIL_AVAILABLE = RequirementCache("PIL")
diff --git a/tests/processing/test_functions.py b/tests/processing/test_functions.py
index edc041dc..b8d927ca 100644
--- a/tests/processing/test_functions.py
+++ b/tests/processing/test_functions.py
@@ -3,6 +3,7 @@
from unittest import mock
import pytest
+
from lit_data import walk
from lit_data.processing.functions import _get_input_dir
diff --git a/tests/processing/test_readers.py b/tests/processing/test_readers.py
index cf1d2ae4..f7e5adfd 100644
--- a/tests/processing/test_readers.py
+++ b/tests/processing/test_readers.py
@@ -2,6 +2,7 @@
import sys
import pytest
+
from lit_data import map
from lit_data.processing.readers import _PYARROW_AVAILABLE, BaseReader, ParquetReader
diff --git a/tests/streaming/test_cache.py b/tests/streaming/test_cache.py
index b8daece1..4ce8472f 100644
--- a/tests/streaming/test_cache.py
+++ b/tests/streaming/test_cache.py
@@ -19,17 +19,18 @@
import pytest
import torch
from lightning import seed_everything
+from lightning.fabric import Fabric
+from lightning.pytorch.demos.boring_classes import RandomDataset
+from lightning_utilities.core.imports import RequirementCache
+from lightning_utilities.test.warning import no_warning_call
+from torch.utils.data import Dataset
+
from lit_data.streaming import Cache
from lit_data.streaming.dataloader import CacheDataLoader
from lit_data.streaming.dataset import StreamingDataset
from lit_data.streaming.item_loader import TokensLoader
from lit_data.streaming.serializers import Serializer
from lit_data.utilities.env import _DistributedEnv
-from lightning.fabric import Fabric
-from lightning.pytorch.demos.boring_classes import RandomDataset
-from lightning_utilities.core.imports import RequirementCache
-from lightning_utilities.test.warning import no_warning_call
-from torch.utils.data import Dataset
_PIL_AVAILABLE = RequirementCache("PIL")
_TORCH_VISION_AVAILABLE = RequirementCache("torchvision")
diff --git a/tests/streaming/test_client.py b/tests/streaming/test_client.py
index b9ca68ef..23f06c54 100644
--- a/tests/streaming/test_client.py
+++ b/tests/streaming/test_client.py
@@ -3,6 +3,7 @@
from unittest import mock
import pytest
+
from lit_data.streaming import client
diff --git a/tests/streaming/test_combined.py b/tests/streaming/test_combined.py
index 76b5af84..077d3d05 100644
--- a/tests/streaming/test_combined.py
+++ b/tests/streaming/test_combined.py
@@ -4,12 +4,13 @@
import pytest
import torch
+from torch.utils.data import IterableDataset
+from torch.utils.data.dataloader import DataLoader
+
from lit_data.streaming.cache import Cache
from lit_data.streaming.combined import CombinedStreamingDataset
from lit_data.streaming.dataloader import StreamingDataLoader
from lit_data.streaming.dataset import Dir, StreamingDataset
-from torch.utils.data import IterableDataset
-from torch.utils.data.dataloader import DataLoader
class TestCombinedStreamingDataset(CombinedStreamingDataset):
diff --git a/tests/streaming/test_dataloader.py b/tests/streaming/test_dataloader.py
index c74dec82..fc72ea4c 100644
--- a/tests/streaming/test_dataloader.py
+++ b/tests/streaming/test_dataloader.py
@@ -2,9 +2,10 @@
import pytest
import torch
+from torch import tensor
+
from lit_data.streaming import CombinedStreamingDataset, StreamingDataLoader
from lit_data.streaming import dataloader as streaming_dataloader_module
-from torch import tensor
class TestStatefulDataset:
diff --git a/tests/streaming/test_dataset.py b/tests/streaming/test_dataset.py
index 752cf7e9..52ae8706 100644
--- a/tests/streaming/test_dataset.py
+++ b/tests/streaming/test_dataset.py
@@ -20,6 +20,8 @@
import pytest
import torch
from lightning import seed_everything
+from torch.utils.data import DataLoader
+
from lit_data.processing import functions
from lit_data.streaming import Cache
from lit_data.streaming import dataset as dataset_module
@@ -37,7 +39,6 @@
from lit_data.streaming.item_loader import TokensLoader
from lit_data.streaming.shuffle import FullShuffle, NoShuffle
from lit_data.utilities.env import _DistributedEnv, _WorkerEnv
-from torch.utils.data import DataLoader
def test_streaming_dataset(tmpdir, monkeypatch):
diff --git a/tests/streaming/test_reader.py b/tests/streaming/test_reader.py
index 5571a22e..6483b558 100644
--- a/tests/streaming/test_reader.py
+++ b/tests/streaming/test_reader.py
@@ -3,6 +3,7 @@
from time import sleep
import numpy as np
+
from lit_data.streaming import reader
from lit_data.streaming.cache import Cache
from lit_data.streaming.config import ChunkedIndex
@@ -44,7 +45,7 @@ def test_reader_chunk_removal(tmpdir):
index = ChunkedIndex(i, cache._get_chunk_index_from_index(i), is_last_index=i == 24)
assert cache[index] == i
- assert len(os.listdir(cache_dir)) == 3
+ assert len(os.listdir(cache_dir)) in [2, 3]
def test_get_folder_size(tmpdir):
diff --git a/tests/streaming/test_resolver.py b/tests/streaming/test_resolver.py
index 6b1923e7..36a47ed3 100644
--- a/tests/streaming/test_resolver.py
+++ b/tests/streaming/test_resolver.py
@@ -3,7 +3,6 @@
from unittest import mock
import pytest
-from lit_data.streaming import resolver
from lightning_cloud import login
from lightning_cloud.openapi import (
Externalv1Cluster,
@@ -17,6 +16,8 @@
V1ListDataConnectionsResponse,
)
+from lit_data.streaming import resolver
+
@pytest.mark.skipif(sys.platform == "win32", reason="windows isn't supported")
def test_src_resolver_s3_connections(monkeypatch):
diff --git a/tests/streaming/test_sampler.py b/tests/streaming/test_sampler.py
index 911ea278..8b1ee8c5 100644
--- a/tests/streaming/test_sampler.py
+++ b/tests/streaming/test_sampler.py
@@ -2,6 +2,7 @@
import pytest
from lightning import seed_everything
+
from lit_data.streaming.sampler import CacheBatchSampler
diff --git a/tests/streaming/test_serializer.py b/tests/streaming/test_serializer.py
index 902f8aa8..d07e0629 100644
--- a/tests/streaming/test_serializer.py
+++ b/tests/streaming/test_serializer.py
@@ -20,6 +20,8 @@
import pytest
import torch
from lightning import seed_everything
+from lightning_utilities.core.imports import RequirementCache
+
from lit_data.streaming.serializers import (
_AV_AVAILABLE,
_NUMPY_DTYPES_MAPPING,
@@ -36,7 +38,6 @@
TensorSerializer,
VideoSerializer,
)
-from lightning_utilities.core.imports import RequirementCache
_PIL_AVAILABLE = RequirementCache("PIL")
diff --git a/tests/streaming/test_writer.py b/tests/streaming/test_writer.py
index dc00afa3..84e4ee91 100644
--- a/tests/streaming/test_writer.py
+++ b/tests/streaming/test_writer.py
@@ -18,12 +18,13 @@
import numpy as np
import pytest
from lightning import seed_everything
+from lightning_utilities.core.imports import RequirementCache
+
from lit_data.streaming.compression import _ZSTD_AVAILABLE
from lit_data.streaming.reader import BinaryReader
from lit_data.streaming.sampler import ChunkedIndex
from lit_data.streaming.writer import BinaryWriter
from lit_data.utilities.format import _FORMAT_TO_RATIO
-from lightning_utilities.core.imports import RequirementCache
_PIL_AVAILABLE = RequirementCache("PIL")
diff --git a/tests/utilities/test_packing.py b/tests/utilities/test_packing.py
index cb7e7928..41af8591 100644
--- a/tests/utilities/test_packing.py
+++ b/tests/utilities/test_packing.py
@@ -1,4 +1,5 @@
import pytest
+
from lit_data.utilities.packing import _pack_greedily