Skip to content

Commit

Permalink
Merge branch 'main' into fix/chunks-deletion-issue
Browse files Browse the repository at this point in the history
  • Loading branch information
deependujha authored Sep 19, 2024
2 parents c752bc5 + 3e3c86b commit a26881d
Show file tree
Hide file tree
Showing 42 changed files with 632 additions and 868 deletions.
7 changes: 0 additions & 7 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -46,13 +46,6 @@ repos:
additional_dependencies: [tomli]
#args: ["--write-changes"] # uncomment if you want to get automatic fixing

- repo: https://github.com/PyCQA/docformatter
rev: v1.7.5
hooks:
- id: docformatter
additional_dependencies: [tomli]
args: ["--in-place"]

- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.3.2
hooks:
Expand Down
31 changes: 22 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -217,9 +217,8 @@ Additionally, you can inject client connection settings for [S3](https://boto3.a
from litdata import StreamingDataset

storage_options = {
"endpoint_url": "your_endpoint_url",
"aws_access_key_id": "your_access_key_id",
"aws_secret_access_key": "your_secret_access_key",
"key": "your_access_key_id",
"secret": "your_secret_access_key",
}

dataset = StreamingDataset('s3://my-bucket/my-data', storage_options=storage_options)
Expand Down Expand Up @@ -264,33 +263,47 @@ for batch in val_dataloader:

 

The StreamingDataset supports reading optimized datasets from common cloud providers.
The StreamingDataset supports reading optimized datasets from common cloud providers.

```python
import os
import litdata as ld

# Read data from AWS S3
aws_storage_options={
"AWS_ACCESS_KEY_ID": os.environ['AWS_ACCESS_KEY_ID'],
"AWS_SECRET_ACCESS_KEY": os.environ['AWS_SECRET_ACCESS_KEY'],
"key": os.environ['AWS_ACCESS_KEY_ID'],
"secret": os.environ['AWS_SECRET_ACCESS_KEY'],
}
dataset = ld.StreamingDataset("s3://my-bucket/my-data", storage_options=aws_storage_options)

# Read data from GCS
gcp_storage_options={
"project": os.environ['PROJECT_ID'],
"token": {
# dumped from cat ~/.config/gcloud/application_default_credentials.json
"account": "",
"client_id": "your_client_id",
"client_secret": "your_client_secret",
"quota_project_id": "your_quota_project_id",
"refresh_token": "your_refresh_token",
"type": "authorized_user",
"universe_domain": "googleapis.com",
}
}
dataset = ld.StreamingDataset("gs://my-bucket/my-data", storage_options=gcp_storage_options)

# Read data from Azure
azure_storage_options={
"account_url": f"https://{os.environ['AZURE_ACCOUNT_NAME']}.blob.core.windows.net",
"credential": os.environ['AZURE_ACCOUNT_ACCESS_KEY']
"account_name": "azure_account_name",
"account_key": os.environ['AZURE_ACCOUNT_ACCESS_KEY']
}
dataset = ld.StreamingDataset("azure://my-bucket/my-data", storage_options=azure_storage_options)
```

- For more details on which storage options are supported, please refer to:
- [AWS S3 storage options](https://github.com/fsspec/s3fs/blob/main/s3fs/core.py#L176)
- [GCS storage options](https://github.com/fsspec/gcsfs/blob/main/gcsfs/core.py#L154)
- [Azure storage options](https://github.com/fsspec/adlfs/blob/main/adlfs/spec.py#L124)

</details>

<details>
Expand Down
6 changes: 1 addition & 5 deletions examples/multi_modal/create_labelencoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,7 @@


def create_labelencoder():
"""
Create a label encoder
Returns:
"""
"""Create a label encoder."""
data = ["Cancelation", "IBAN Change", "Damage Report"]
# Create an instance of LabelEncoder
label_encoder = LabelEncoder()
Expand Down
44 changes: 19 additions & 25 deletions examples/multi_modal/dataloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,15 +29,12 @@ def __init__(self):
self.hyperparameters = HYPERPARAMETERS

def load_labelencoder(self):
"""
Function to load the label encoder from s3
Returns:
"""
"""Function to load the label encoder from s3."""
return joblib.load(self.hyperparameters["label_encoder_name"])

def load_tokenizer(self):
"""
load the tokenizer files and the pre-training model path from s3 specified in the hyperparameters
"""Loads the tokenizer files and the pre-training model path from s3 specified in the hyperparameters.
Returns: tokenizer
"""
# Load Bert tokenizer
Expand All @@ -62,13 +59,10 @@ def __init__(self, input_dir: Union[str, Any], hyperparameters: Union[dict, Any]
self.labelencoder = EC.load_labelencoder()

def tokenize_data(self, tokenizer, texts, max_length: int):
"""
Tokenize the text
Args:
tokenizer:
texts:
max_length:
Returns: input_ids, attention_masks
"""Tokenize the text.
Returns: input_ids, attention_masks.
"""
encoded_text = tokenizer(
texts,
Expand Down Expand Up @@ -98,11 +92,10 @@ class MixedDataModule(pl.LightningDataModule):
"""Own DataModule form the pytorch lightning DataModule."""

def __init__(self, hyperparameters: dict):
"""
Init if the Data Module
"""Initialize if the Data Module.
Args:
data_path: dataframe with the data
hyperparameters: Hyperparameters
hyperparameters: Hyperparameters.
"""
super().__init__()
self.hyperparameters = hyperparameters
Expand Down Expand Up @@ -130,10 +123,11 @@ def __init__(self, hyperparameters: dict):
)

def train_dataloader(self) -> DataLoader:
"""
Define the training dataloader
"""Define the training dataloader.
Returns:
training dataloader
training dataloader.
"""
dataset_train = DocumentClassificationDataset(
hyperparameters=self.hyperparameters,
Expand All @@ -150,10 +144,10 @@ def train_dataloader(self) -> DataLoader:
)

def val_dataloader(self) -> DataLoader:
"""
Define the validation dataloader
"""Defines the validation dataloader.
Returns:
validation dataloader
validation dataloader.
"""
dataset_val = DocumentClassificationDataset(
hyperparameters=self.hyperparameters,
Expand All @@ -169,8 +163,8 @@ def val_dataloader(self) -> DataLoader:
)

def test_dataloader(self) -> DataLoader:
"""
Define the test dataloader
"""Defines the test dataloader.
Returns:
test dataloader
"""
Expand Down
104 changes: 27 additions & 77 deletions examples/multi_modal/loop.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,6 @@ def save_reports(self, model_dir, mode, report_confusion_matrix, report):
mode: train, test or val
report_confusion_matrix: sklearn confusion matrix
report: sklear classification report
Returns:
"""
df_cm = pd.DataFrame(report_confusion_matrix)
Expand All @@ -87,17 +86,7 @@ def save_reports(self, model_dir, mode, report_confusion_matrix, report):
logger.info("Confusion Matrix and Classification report are saved.")

def save_test_evaluations(self, model_dir, mode, y_pred, y_true, confis, numerical_id_):
"""
Save a pandas dataframe with prediction and ground truth and identifier (numerical id) of the test dataset
Args:
model_dir:
mode:
y_pred:
y_true:
confis:
numerical_id_:
Returns:
"""
"""Save pandas dataframe with prediction and ground truth and identifier (numerical id) of the test dataset."""
df_test = pd.DataFrame()
df_test["pred"] = y_pred
df_test["confidence"] = confis.max(axis=1)
Expand Down Expand Up @@ -151,43 +140,37 @@ def forward(
"""Forward path, calculate the computational graph in the forward direction.
Used for train, test and val.
Args:
y: tensor with text data as tokens
Returns:
computional graph
"""
return self.module(x, y, z)

def training_step(self, batch: Dict[str, torch.Tensor]) -> Dict:
"""
Call the eval share for training
Args:
batch: tensor
"""Call the eval share for training.
Returns:
dict with loss, outputs and ground_truth
dict with loss, outputs and ground_truth.
"""
return self._shared_eval_step(batch, "train")

def validation_step(self, batch: Dict[str, torch.Tensor], batch_idx: int) -> Dict:
"""
Call the eval share for validation
Args:
batch:
batch_idx:
"""Call the eval share for validation.
Returns:
dict with loss, outputs and ground_truth
dict with loss, outputs and ground_truth.
"""
return self._shared_eval_step(batch, "val")

def test_step(self, batch: Dict[str, torch.Tensor], batch_idx: int) -> Dict:
"""
Call the eval share for test
Args:
batch:
batch_idx:
"""Call the eval share for test.
Returns:
dict with loss, outputs and ground_truth
dict with loss, outputs and ground_truth.
"""
ret = self._shared_eval_step(batch, "test")
self.pred_list.append(ret)
Expand All @@ -199,6 +182,7 @@ def _shared_eval_step(self, batch: Dict[str, torch.Tensor], mode: str) -> Dict:
Args:
batch: tensor
mode: train, test or val
Returns:
dict with loss, outputs and ground_truth
Expand Down Expand Up @@ -227,14 +211,8 @@ def _shared_eval_step(self, batch: Dict[str, torch.Tensor], mode: str) -> Dict:

return {"outputs": out, "loss": loss, "ground_truth": ground_truth, "numerical_id": numerical_id}

def _epoch_end(self, mode: str):
"""
Calculate loss and metricies at end of epoch
Args:
mode:
Returns:
None
"""
def _epoch_end(self, mode: str) -> None:
"""Calculate loss and metrics at end of epoch."""
if mode == "val":
output = self.val_metrics.compute()
self.log_dict(output)
Expand All @@ -249,14 +227,7 @@ def _epoch_end(self, mode: str):
self.test_metrics.reset()

def predict(self, batch: Dict[str, torch.Tensor], batch_idx: int = 0, dataloader_idx: int = 0) -> torch.Tensor:
"""Model prediction without softmax and argmax to predict class label.
Args:
outputs:
Returns:
None
"""
"""Model prediction without softmax and argmax to predict class label."""
self.eval()
with torch.no_grad():
ids = batch["ID"]
Expand All @@ -265,51 +236,30 @@ def predict(self, batch: Dict[str, torch.Tensor], batch_idx: int = 0, dataloader
return self.forward(ids, atts, img)

def on_test_epoch_end(self) -> None:
"""
Calculate the metrics at the end of epoch for test step
Args:
outputs:
Returns:
None
"""
"""Calculate the metrics at the end of epoch for test step."""
self._epoch_end("test")

def on_validation_epoch_end(self):
"""
Calculate the metrics at the end of epoch for val step
Args:
outputs:
Returns:
None
"""
def on_validation_epoch_end(self) -> None:
"""Calculate the metrics at the end of epoch for val step."""
self._epoch_end("val")

def on_train_epoch_end(self):
"""
Calculate the metrics at the end of epoch for train step
Args:
outputs:
Returns:
None
"""
def on_train_epoch_end(self) -> None:
"""Calculate the metrics at the end of epoch for train step."""
self._epoch_end("train")

def configure_optimizers(self) -> Any:
"""
Configure the optimizer
"""Configure the optimizer.
Returns:
optimizer
"""
optimizer = AdamW(self.parameters(), lr=self.learning_rate, weight_decay=self.hyperparameters["weight_decay"])
scheduler = StepLR(optimizer, step_size=1, gamma=0.1)
return [optimizer], [{"scheduler": scheduler, "interval": "epoch"}]

def configure_callbacks(self) -> Union[Sequence[pl.pytorch.Callback], pl.pytorch.Callback]:
"""Configure Early stopping or Model Checkpointing.
Returns:
"""
"""Configure Early stopping or Model Checkpointing."""
early_stop = EarlyStopping(
monitor="val_MulticlassAccuracy", patience=self.hyperparameters["patience"], mode="max"
)
Expand Down
Loading

0 comments on commit a26881d

Please sign in to comment.