support huggingface dataset
KeplerC committed Apr 10, 2024
1 parent 16d7a48 commit ef6db8b
Showing 2 changed files with 28 additions and 45 deletions.
15 changes: 15 additions & 0 deletions examples/dataloader/huggingface.py
@@ -0,0 +1,15 @@
import fog_rtx

dataset = fog_rtx.dataset.Dataset(
name="demo_ds",
path="~/test_dataset",
)

dataset.load_rtx_episodes(
name="berkeley_autolab_ur5",
split="train[:1]",
)

huggingface_ds = dataset.get_as_huggingface_dataset()

print(f"Hugging face dataset: {huggingface_ds}")
58 changes: 13 additions & 45 deletions fog_rtx/dataset.py
@@ -402,48 +402,16 @@ def __getitem__(self, idx):

        return pytorch_dataset

    def tensorflow_dataset_builder(
        self, metadata=None, batch_size=32, shuffle_buffer_size=None, **kwargs
    ):
        # TODO: doesn't work yet
        """
        Build a TensorFlow dataset.
        :param batch_size: The size of the batches of data.
        :param shuffle_buffer_size: The buffer size for shuffling the data. If None, no shuffling will be performed.
        :param kwargs: Additional arguments for TensorFlow data transformations.
        :return: A tf.data.Dataset object.
        """
        import tensorflow as tf

        if metadata is None:
            metadata_df = self.get_metadata_as_pandas_df()
        else:
            metadata_df = metadata

        episodes = self.read_by(metadata_df)
        print(episodes)
        # Convert data into tensors
        # Assuming episodes_data is a list of numpy arrays or a similar format that can be directly converted to tensors.
        # This might require additional processing depending on the data format and the features.
        episodes_tensors = [
            tf.convert_to_tensor(episode.drop(columns=["Timestamp", "index"]))
            for episode in episodes
        ]

        # Create a tf.data.Dataset from tensors
        dataset = tf.data.Dataset.from_tensor_slices(episodes_tensors)

        # Shuffle the dataset if shuffle_buffer_size is provided
        if shuffle_buffer_size:
            dataset = dataset.shuffle(buffer_size=shuffle_buffer_size)

        # Batch the dataset
        dataset = dataset.batch(batch_size)

        # Apply any additional transformations provided in kwargs
        for transformation, parameters in kwargs.items():
            if hasattr(dataset, transformation):
                dataset = getattr(dataset, transformation)(**parameters)

        return dataset
    def get_as_huggingface_dataset(self):
        import datasets

        # TODO: currently the support for Hugging Face datasets is limited;
        # this only demonstrates that a HF dataset can easily be returned
        # TODO #1: add features from the episode metadata
        # TODO #2: allow selecting episodes based on queries
        #          (doing so requires creating a new copy of the dataset on disk)
        dataset_path = self.path + "/" + self.name
        # assumes every file in the dataset directory is a parquet file
        parquet_files = [os.path.join(dataset_path, f) for f in os.listdir(dataset_path)]

        hf_dataset = datasets.load_dataset('parquet', data_files=parquet_files)
        return hf_dataset
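
Not part of the commit: a rough sketch of one way TODO #2 (selecting episodes based on queries) could be approached on top of the returned object, using the standard datasets filter API. The column name "episode_id" is hypothetical and used only for illustration.

# Hypothetical sketch, not in this commit. Assumes a fog_rtx Dataset named
# `dataset` (as in the example above) and an "episode_id" column, which is
# an illustrative assumption about the parquet schema.
hf_dataset = dataset.get_as_huggingface_dataset()
first_episode = hf_dataset["train"].filter(lambda row: row["episode_id"] == 0)
print(first_episode)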
