Merge branch 'main' into fix/chunks-deletion-issue

Lightning-AI · Sep 19, 2024 · a26881d · a26881d
2 parents c752bc5 + 3e3c86b
commit a26881d
Show file tree

Hide file tree

Showing 42 changed files with 632 additions and 868 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -46,13 +46,6 @@ repos:
         additional_dependencies: [tomli]
         #args: ["--write-changes"] # uncomment if you want to get automatic fixing
 
-  - repo: https://github.com/PyCQA/docformatter
-    rev: v1.7.5
-    hooks:
-      - id: docformatter
-        additional_dependencies: [tomli]
-        args: ["--in-place"]
-
   - repo: https://github.com/astral-sh/ruff-pre-commit
     rev: v0.3.2
     hooks:

diff --git a/README.md b/README.md
@@ -217,9 +217,8 @@ Additionally, you can inject client connection settings for [S3](https://boto3.a
 from litdata import StreamingDataset
 
 storage_options = {
-    "endpoint_url": "your_endpoint_url",
-    "aws_access_key_id": "your_access_key_id",
-    "aws_secret_access_key": "your_secret_access_key",
+    "key": "your_access_key_id",
+    "secret": "your_secret_access_key",
 }
 
 dataset = StreamingDataset('s3://my-bucket/my-data', storage_options=storage_options)
@@ -264,33 +263,47 @@ for batch in val_dataloader:
 
 &nbsp;
 
-The StreamingDataset supports reading optimized datasets from common cloud providers. 
+The StreamingDataset supports reading optimized datasets from common cloud providers.
 
 ```python
 import os
 import litdata as ld
 
 # Read data from AWS S3
 aws_storage_options={
-    "AWS_ACCESS_KEY_ID": os.environ['AWS_ACCESS_KEY_ID'],
-    "AWS_SECRET_ACCESS_KEY": os.environ['AWS_SECRET_ACCESS_KEY'],
+    "key": os.environ['AWS_ACCESS_KEY_ID'],
+    "secret": os.environ['AWS_SECRET_ACCESS_KEY'],
 }
 dataset = ld.StreamingDataset("s3://my-bucket/my-data", storage_options=aws_storage_options)
 
 # Read data from GCS
 gcp_storage_options={
-    "project": os.environ['PROJECT_ID'],
+    "token": {
+      # dumped from cat ~/.config/gcloud/application_default_credentials.json 
+        "account": "",
+        "client_id": "your_client_id",
+        "client_secret": "your_client_secret",
+        "quota_project_id": "your_quota_project_id",
+        "refresh_token": "your_refresh_token",
+        "type": "authorized_user",
+        "universe_domain": "googleapis.com",
+    }
 }
 dataset = ld.StreamingDataset("gs://my-bucket/my-data", storage_options=gcp_storage_options)
 
 # Read data from Azure
 azure_storage_options={
-    "account_url": f"https://{os.environ['AZURE_ACCOUNT_NAME']}.blob.core.windows.net",
-    "credential": os.environ['AZURE_ACCOUNT_ACCESS_KEY']
+    "account_name": "azure_account_name",
+    "account_key": os.environ['AZURE_ACCOUNT_ACCESS_KEY']
 }
 dataset = ld.StreamingDataset("azure://my-bucket/my-data", storage_options=azure_storage_options)
 ```
 
+- For more details on which storage options are supported, please refer to:
+  - [AWS S3 storage options](https://github.com/fsspec/s3fs/blob/main/s3fs/core.py#L176)
+  - [GCS storage options](https://github.com/fsspec/gcsfs/blob/main/gcsfs/core.py#L154)
+  - [Azure storage options](https://github.com/fsspec/adlfs/blob/main/adlfs/spec.py#L124)
+
 </details>  
 
 <details>

diff --git a/examples/multi_modal/create_labelencoder.py b/examples/multi_modal/create_labelencoder.py
@@ -3,11 +3,7 @@
 
 
 def create_labelencoder():
-    """
-    Create a label encoder
-    Returns:
-
-    """
+    """Create a label encoder."""
     data = ["Cancelation", "IBAN Change", "Damage Report"]
     # Create an instance of LabelEncoder
     label_encoder = LabelEncoder()

diff --git a/examples/multi_modal/dataloader.py b/examples/multi_modal/dataloader.py
@@ -29,15 +29,12 @@ def __init__(self):
         self.hyperparameters = HYPERPARAMETERS
 
     def load_labelencoder(self):
-        """
-        Function to load the label encoder from s3
-        Returns:
-        """
+        """Function to load the label encoder from s3."""
         return joblib.load(self.hyperparameters["label_encoder_name"])
 
     def load_tokenizer(self):
-        """
-        load the tokenizer files and the pre-training model path from s3 specified in the hyperparameters
+        """Loads the tokenizer files and the pre-training model path from s3 specified in the hyperparameters.
+
         Returns: tokenizer
         """
         # Load Bert tokenizer
@@ -62,13 +59,10 @@ def __init__(self, input_dir: Union[str, Any], hyperparameters: Union[dict, Any]
         self.labelencoder = EC.load_labelencoder()
 
     def tokenize_data(self, tokenizer, texts, max_length: int):
-        """
-        Tokenize the text
-        Args:
-            tokenizer:
-            texts:
-            max_length:
-        Returns: input_ids, attention_masks
+        """Tokenize the text.
+
+        Returns: input_ids, attention_masks.
+
         """
         encoded_text = tokenizer(
             texts,
@@ -98,11 +92,10 @@ class MixedDataModule(pl.LightningDataModule):
     """Own DataModule form the pytorch lightning DataModule."""
 
     def __init__(self, hyperparameters: dict):
-        """
-        Init if the Data Module
+        """Initialize if the Data Module.
+
         Args:
-            data_path: dataframe with the data
-            hyperparameters:  Hyperparameters
+            hyperparameters:  Hyperparameters.
         """
         super().__init__()
         self.hyperparameters = hyperparameters
@@ -130,10 +123,11 @@ def __init__(self, hyperparameters: dict):
         )
 
     def train_dataloader(self) -> DataLoader:
-        """
-        Define the training dataloader
+        """Define the training dataloader.
+
         Returns:
-            training dataloader
+            training dataloader.
+
         """
         dataset_train = DocumentClassificationDataset(
             hyperparameters=self.hyperparameters,
@@ -150,10 +144,10 @@ def train_dataloader(self) -> DataLoader:
         )
 
     def val_dataloader(self) -> DataLoader:
-        """
-        Define the validation dataloader
+        """Defines the validation dataloader.
+
         Returns:
-            validation dataloader
+            validation dataloader.
         """
         dataset_val = DocumentClassificationDataset(
             hyperparameters=self.hyperparameters,
@@ -169,8 +163,8 @@ def val_dataloader(self) -> DataLoader:
         )
 
     def test_dataloader(self) -> DataLoader:
-        """
-        Define the test dataloader
+        """Defines the test dataloader.
+
         Returns:
             test dataloader
         """

diff --git a/examples/multi_modal/loop.py b/examples/multi_modal/loop.py
@@ -77,7 +77,6 @@ def save_reports(self, model_dir, mode, report_confusion_matrix, report):
             mode: train, test or val
             report_confusion_matrix: sklearn confusion matrix
             report: sklear classification report
-        Returns:
 
         """
         df_cm = pd.DataFrame(report_confusion_matrix)
@@ -87,17 +86,7 @@ def save_reports(self, model_dir, mode, report_confusion_matrix, report):
         logger.info("Confusion Matrix and Classification report are saved.")
 
     def save_test_evaluations(self, model_dir, mode, y_pred, y_true, confis, numerical_id_):
-        """
-        Save a pandas dataframe with prediction and ground truth and identifier (numerical id) of the test dataset
-        Args:
-            model_dir:
-            mode:
-            y_pred:
-            y_true:
-            confis:
-            numerical_id_:
-        Returns:
-        """
+        """Save pandas dataframe with prediction and ground truth and identifier (numerical id) of the test dataset."""
         df_test = pd.DataFrame()
         df_test["pred"] = y_pred
         df_test["confidence"] = confis.max(axis=1)
@@ -151,43 +140,37 @@ def forward(
         """Forward path, calculate the computational graph in the forward direction.
 
         Used for train, test and val.
-        Args:
-            y: tensor with text data as tokens
+
         Returns:
             computional graph
 
         """
         return self.module(x, y, z)
 
     def training_step(self, batch: Dict[str, torch.Tensor]) -> Dict:
-        """
-        Call the eval share for training
-        Args:
-            batch: tensor
+        """Call the eval share for training.
+
         Returns:
-            dict with loss, outputs and ground_truth
+            dict with loss, outputs and ground_truth.
+
         """
         return self._shared_eval_step(batch, "train")
 
     def validation_step(self, batch: Dict[str, torch.Tensor], batch_idx: int) -> Dict:
-        """
-        Call the eval share for validation
-        Args:
-            batch:
-            batch_idx:
+        """Call the eval share for validation.
+
         Returns:
-            dict with loss, outputs and ground_truth
+            dict with loss, outputs and ground_truth.
+
         """
         return self._shared_eval_step(batch, "val")
 
     def test_step(self, batch: Dict[str, torch.Tensor], batch_idx: int) -> Dict:
-        """
-        Call the eval share for test
-        Args:
-            batch:
-            batch_idx:
+        """Call the eval share for test.
+
         Returns:
-            dict with loss, outputs and ground_truth
+            dict with loss, outputs and ground_truth.
+
         """
         ret = self._shared_eval_step(batch, "test")
         self.pred_list.append(ret)
@@ -199,6 +182,7 @@ def _shared_eval_step(self, batch: Dict[str, torch.Tensor], mode: str) -> Dict:
         Args:
             batch: tensor
             mode: train, test or val
+
         Returns:
             dict with loss, outputs and ground_truth
 
@@ -227,14 +211,8 @@ def _shared_eval_step(self, batch: Dict[str, torch.Tensor], mode: str) -> Dict:
 
         return {"outputs": out, "loss": loss, "ground_truth": ground_truth, "numerical_id": numerical_id}
 
-    def _epoch_end(self, mode: str):
-        """
-        Calculate loss and metricies at end of epoch
-        Args:
-            mode:
-        Returns:
-            None
-        """
+    def _epoch_end(self, mode: str) -> None:
+        """Calculate loss and metrics at end of epoch."""
         if mode == "val":
             output = self.val_metrics.compute()
             self.log_dict(output)
@@ -249,14 +227,7 @@ def _epoch_end(self, mode: str):
             self.test_metrics.reset()
 
     def predict(self, batch: Dict[str, torch.Tensor], batch_idx: int = 0, dataloader_idx: int = 0) -> torch.Tensor:
-        """Model prediction  without softmax and argmax to predict class label.
-
-        Args:
-            outputs:
-        Returns:
-            None
-
-        """
+        """Model prediction  without softmax and argmax to predict class label."""
         self.eval()
         with torch.no_grad():
             ids = batch["ID"]
@@ -265,51 +236,30 @@ def predict(self, batch: Dict[str, torch.Tensor], batch_idx: int = 0, dataloader
             return self.forward(ids, atts, img)
 
     def on_test_epoch_end(self) -> None:
-        """
-        Calculate the metrics at the end of epoch for test step
-        Args:
-            outputs:
-        Returns:
-            None
-        """
+        """Calculate the metrics at the end of epoch for test step."""
         self._epoch_end("test")
 
-    def on_validation_epoch_end(self):
-        """
-        Calculate the metrics at the end of epoch for val step
-        Args:
-            outputs:
-        Returns:
-            None
-        """
+    def on_validation_epoch_end(self) -> None:
+        """Calculate the metrics at the end of epoch for val step."""
         self._epoch_end("val")
 
-    def on_train_epoch_end(self):
-        """
-        Calculate the metrics at the end of epoch for train step
-        Args:
-            outputs:
-        Returns:
-            None
-        """
+    def on_train_epoch_end(self) -> None:
+        """Calculate the metrics at the end of epoch for train step."""
         self._epoch_end("train")
 
     def configure_optimizers(self) -> Any:
-        """
-        Configure the optimizer
+        """Configure the optimizer.
+
         Returns:
             optimizer
+
         """
         optimizer = AdamW(self.parameters(), lr=self.learning_rate, weight_decay=self.hyperparameters["weight_decay"])
         scheduler = StepLR(optimizer, step_size=1, gamma=0.1)
         return [optimizer], [{"scheduler": scheduler, "interval": "epoch"}]
 
     def configure_callbacks(self) -> Union[Sequence[pl.pytorch.Callback], pl.pytorch.Callback]:
-        """Configure Early stopping or Model Checkpointing.
-
-        Returns:
-
-        """
+        """Configure Early stopping or Model Checkpointing."""
         early_stop = EarlyStopping(
             monitor="val_MulticlassAccuracy", patience=self.hyperparameters["patience"], mode="max"
         )