Commit: Merge branch 'main' into fix/combined-dataset-loading-states

bhimrazy authored Sep 19, 2024
2 parents a05e6f1 + 3e3c86b commit 3762b11
Showing 45 changed files with 723 additions and 990 deletions.
7 changes: 0 additions & 7 deletions .pre-commit-config.yaml
@@ -46,13 +46,6 @@ repos:
        additional_dependencies: [tomli]
        #args: ["--write-changes"]  # uncomment if you want to get automatic fixing

-  - repo: https://github.com/PyCQA/docformatter
-    rev: v1.7.5
-    hooks:
-      - id: docformatter
-        additional_dependencies: [tomli]
-        args: ["--in-place"]
-
  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.3.2
    hooks:
169 changes: 91 additions & 78 deletions README.md
@@ -217,9 +217,8 @@ Additionally, you can inject client connection settings for [S3](https://boto3.a
from litdata import StreamingDataset

storage_options = {
-    "endpoint_url": "your_endpoint_url",
-    "aws_access_key_id": "your_access_key_id",
-    "aws_secret_access_key": "your_secret_access_key",
+    "key": "your_access_key_id",
+    "secret": "your_secret_access_key",
}

dataset = StreamingDataset('s3://my-bucket/my-data', storage_options=storage_options)
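Once constructed, the dataset streams like any other. A minimal sketch of the downstream usage (bucket name and credentials are placeholders):

```python
from litdata import StreamingDataset, StreamingDataLoader

storage_options = {"key": "your_access_key_id", "secret": "your_secret_access_key"}
dataset = StreamingDataset('s3://my-bucket/my-data', storage_options=storage_options)

# Wrap the dataset in a dataloader to stream mini-batches during training
dataloader = StreamingDataLoader(dataset, batch_size=64)
for batch in dataloader:
    pass  # training step goes here
```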
@@ -264,33 +263,47 @@ for batch in val_dataloader:

 

The StreamingDataset supports reading optimized datasets from common cloud providers.
The StreamingDataset supports reading optimized datasets from common cloud providers.

```python
import os
import litdata as ld

# Read data from AWS S3
aws_storage_options={
-    "AWS_ACCESS_KEY_ID": os.environ['AWS_ACCESS_KEY_ID'],
-    "AWS_SECRET_ACCESS_KEY": os.environ['AWS_SECRET_ACCESS_KEY'],
+    "key": os.environ['AWS_ACCESS_KEY_ID'],
+    "secret": os.environ['AWS_SECRET_ACCESS_KEY'],
}
dataset = ld.StreamingDataset("s3://my-bucket/my-data", storage_options=aws_storage_options)

# Read data from GCS
gcp_storage_options={
    "project": os.environ['PROJECT_ID'],
+    "token": {
+        # dumped from cat ~/.config/gcloud/application_default_credentials.json
+        "account": "",
+        "client_id": "your_client_id",
+        "client_secret": "your_client_secret",
+        "quota_project_id": "your_quota_project_id",
+        "refresh_token": "your_refresh_token",
+        "type": "authorized_user",
+        "universe_domain": "googleapis.com",
+    }
}
dataset = ld.StreamingDataset("gs://my-bucket/my-data", storage_options=gcp_storage_options)

# Read data from Azure
azure_storage_options={
-    "account_url": f"https://{os.environ['AZURE_ACCOUNT_NAME']}.blob.core.windows.net",
-    "credential": os.environ['AZURE_ACCOUNT_ACCESS_KEY']
+    "account_name": "azure_account_name",
+    "account_key": os.environ['AZURE_ACCOUNT_ACCESS_KEY']
}
dataset = ld.StreamingDataset("azure://my-bucket/my-data", storage_options=azure_storage_options)
```

- For more details on which storage options are supported, please refer to:
  - [AWS S3 storage options](https://github.com/fsspec/s3fs/blob/main/s3fs/core.py#L176)
  - [GCS storage options](https://github.com/fsspec/gcsfs/blob/main/gcsfs/core.py#L154)
  - [Azure storage options](https://github.com/fsspec/adlfs/blob/main/adlfs/spec.py#L124)
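Because these options are forwarded to the corresponding fsspec filesystem, provider-specific settings pass through as well. A hedged sketch for an S3-compatible store such as MinIO (endpoint and credentials are placeholders):

```python
import litdata as ld

# s3fs forwards "client_kwargs" to the underlying botocore client,
# which is the usual way to point at a custom S3 endpoint.
minio_storage_options = {
    "key": "your_access_key_id",
    "secret": "your_secret_access_key",
    "client_kwargs": {"endpoint_url": "http://localhost:9000"},
}
dataset = ld.StreamingDataset("s3://my-bucket/my-data", storage_options=minio_storage_options)
```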

</details>

<details>
@@ -745,6 +758,75 @@ print(dataset[:])

</details>

<details>
<summary> ✅ Encrypt, decrypt data at chunk/sample level</summary>
&nbsp;

Secure data by applying encryption to individual samples or chunks, ensuring sensitive information is protected during storage.

This example shows how to use the `FernetEncryption` class for sample-level encryption with a data optimization function.

```python
from litdata import optimize
from litdata.utilities.encryption import FernetEncryption
import numpy as np
from PIL import Image

# Initialize FernetEncryption with a password for sample-level encryption
fernet = FernetEncryption(password="your_secure_password", level="sample")
data_dir = "s3://my-bucket/optimized_data"

def random_image(index):
"""Generate a random image for demonstration purposes."""
fake_img = Image.fromarray(np.random.randint(0, 255, (32, 32, 3), dtype=np.uint8))
return {"image": fake_img, "class": index}

# Optimize data while applying encryption
optimize(
fn=random_image,
inputs=list(range(5)), # Example inputs: [0, 1, 2, 3, 4]
num_workers=1,
output_dir=data_dir,
chunk_bytes="64MB",
encryption=fernet,
)

# Save the encryption key to a file for later use
fernet.save("fernet.pem")
```
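The same flow should work at chunk granularity; a sketch assuming `level="chunk"` is accepted, as the chunk/sample wording above suggests:

```python
# Encrypt whole chunks instead of individual samples; coarser granularity
# means fewer encrypt/decrypt calls at the cost of chunk-sized reads.
fernet_chunk = FernetEncryption(password="your_secure_password", level="chunk")
```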

Load the encrypted data using the `StreamingDataset` class as follows:

```python
from litdata import StreamingDataset
from litdata.utilities.encryption import FernetEncryption

# Load the encryption key
fernet = FernetEncryption(password="your_secure_password", level="sample")
fernet.load("fernet.pem")

# Create a streaming dataset for reading the encrypted samples
ds = StreamingDataset(input_dir=data_dir, encryption=fernet)
```
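Samples then decrypt transparently during iteration; a minimal sketch reusing the fields written by `random_image` above:

```python
for sample in ds:
    img, label = sample["image"], sample["class"]  # decrypted on access
```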

To implement your own encryption method, subclass the `Encryption` class and define the necessary methods:

```python
from litdata.utilities.encryption import Encryption

class CustomEncryption(Encryption):
def encrypt(self, data):
# Implement your custom encryption logic here
return data

def decrypt(self, data):
# Implement your custom decryption logic here
return data
```
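Wiring the custom scheme in should mirror the `FernetEncryption` flow above; a sketch assuming the base class requires only these two methods:

```python
from litdata import StreamingDataset

# Hypothetical usage: read back data that was optimized with the same scheme.
ds = StreamingDataset(input_dir="s3://my-bucket/optimized_data", encryption=CustomEncryption())
```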

This allows the data to remain secure while maintaining flexibility in the encryption method.
</details>

&nbsp;

## Features for transforming datasets
@@ -817,75 +899,6 @@ Explore an example setup of litdata with MinIO in the [LitData with MinIO](https

</details>

&nbsp;

----
6 changes: 1 addition & 5 deletions examples/multi_modal/create_labelencoder.py
@@ -3,11 +3,7 @@


def create_labelencoder():
-    """
-    Create a label encoder
-    Returns:
-    """
+    """Create a label encoder."""
data = ["Cancelation", "IBAN Change", "Damage Report"]
# Create an instance of LabelEncoder
label_encoder = LabelEncoder()
44 changes: 19 additions & 25 deletions examples/multi_modal/dataloader.py
@@ -29,15 +29,12 @@ def __init__(self):
self.hyperparameters = HYPERPARAMETERS

    def load_labelencoder(self):
-        """
-        Function to load the label encoder from s3
-        Returns:
-        """
+        """Function to load the label encoder from s3."""
return joblib.load(self.hyperparameters["label_encoder_name"])

    def load_tokenizer(self):
-        """
-        load the tokenizer files and the pre training model path from s3 spezified in the hyperparameters
+        """Loads the tokenizer files and the pre-training model path from s3 specified in the hyperparameters.
        Returns: tokenizer
        """
# Load Bert tokenizer
@@ -62,13 +59,10 @@ def __init__(self, input_dir: Union[str, Any], hyperparameters: Union[dict, Any]
self.labelencoder = EC.load_labelencoder()

    def tokenize_data(self, tokenizer, texts, max_length: int):
-        """
-        Tokenize the text
-        Args:
-            tokenizer:
-            texts:
-            max_length:
-        Returns: input_ids, attention_masks
+        """Tokenize the text.
+
+        Returns: input_ids, attention_masks.
        """
        encoded_text = tokenizer(
            texts,
@@ -98,11 +92,10 @@ class MixedDataModule(pl.LightningDataModule):
"""Own DataModule form the pytorch lightning DataModule."""

    def __init__(self, hyperparameters: dict):
-        """
-        Init if the Data Module
+        """Initialize if the Data Module.
        Args:
            data_path: dataframe with the data
-            hyperparameters: Hyperparameters
+            hyperparameters: Hyperparameters.
        """
super().__init__()
self.hyperparameters = hyperparameters
@@ -130,10 +123,11 @@ def __init__(self, hyperparameters: dict):
)

    def train_dataloader(self) -> DataLoader:
-        """
-        Define the training dataloader
+        """Define the training dataloader.
+
        Returns:
-            training dataloader
+            training dataloader.
        """
dataset_train = DocumentClassificationDataset(
hyperparameters=self.hyperparameters,
@@ -150,10 +144,10 @@ def train_dataloader(self) -> DataLoader:
)

    def val_dataloader(self) -> DataLoader:
-        """
-        Define the validation dataloader
+        """Defines the validation dataloader.
+
        Returns:
-            validation dataloader
+            validation dataloader.
        """
dataset_val = DocumentClassificationDataset(
hyperparameters=self.hyperparameters,
@@ -169,8 +163,8 @@ def val_dataloader(self) -> DataLoader:
)

    def test_dataloader(self) -> DataLoader:
-        """
-        Define the test dataloader
+        """Defines the test dataloader.
+
        Returns:
            test dataloader
        """