support huggingface dataset
KeplerC committed Apr 10, 2024
1 parent 16d7a48 commit ef6db8b
Showing 2 changed files with 28 additions and 45 deletions.
15 changes: 15 additions & 0 deletions examples/dataloader/huggingface.py
@@ -0,0 +1,15 @@
import fog_rtx

dataset = fog_rtx.dataset.Dataset(
name="demo_ds",
path="~/test_dataset",
)

dataset.load_rtx_episodes(
name="berkeley_autolab_ur5",
split="train[:1]",
)

huggingface_ds = dataset.get_as_huggingface_dataset()

print(f"Hugging face dataset: {huggingface_ds}")
58 changes: 13 additions & 45 deletions fog_rtx/dataset.py
@@ -402,48 +402,16 @@ def __getitem__(self, idx):

        return pytorch_dataset

    def tensorflow_dataset_builder(
        self, metadata=None, batch_size=32, shuffle_buffer_size=None, **kwargs
    ):
        # TODO: doesn't work yet
        """
        Build a TensorFlow dataset.
        :param batch_size: The size of the batches of data.
        :param shuffle_buffer_size: The buffer size for shuffling the data. If None, no shuffling will be performed.
        :param kwargs: Additional arguments for TensorFlow data transformations.
        :return: A tf.data.Dataset object.
        """
        import tensorflow as tf

        if metadata is None:
            metadata_df = self.get_metadata_as_pandas_df()
        else:
            metadata_df = metadata

        episodes = self.read_by(metadata_df)
        print(episodes)
        # Convert data into tensors
        # Assuming episodes_data is a list of numpy arrays or a similar format that can be directly converted to tensors.
        # This might require additional processing depending on the data format and the features.
        episodes_tensors = [
            tf.convert_to_tensor(episode.drop(columns=["Timestamp", "index"]))
            for episode in episodes
        ]

        # Create a tf.data.Dataset from tensors
        dataset = tf.data.Dataset.from_tensor_slices(episodes_tensors)

        # Shuffle the dataset if shuffle_buffer_size is provided
        if shuffle_buffer_size:
            dataset = dataset.shuffle(buffer_size=shuffle_buffer_size)

        # Batch the dataset
        dataset = dataset.batch(batch_size)

        # Apply any additional transformations provided in kwargs
        for transformation, parameters in kwargs.items():
            if hasattr(dataset, transformation):
                dataset = getattr(dataset, transformation)(**parameters)

        return dataset
    def get_as_huggingface_dataset(self):
        import datasets

        # TODO: currently the support for Hugging Face datasets is limited;
        # this only demonstrates that a HF dataset can easily be returned
        # TODO #1: add features from the episode metadata
        # TODO #2: allow selecting episodes based on queries
        #          (doing so requires creating a new copy of the dataset on disk)
        dataset_path = self.path + "/" + self.name
        # assumes every file in the dataset directory is a parquet file
        parquet_files = [os.path.join(dataset_path, f) for f in os.listdir(dataset_path)]

        hf_dataset = datasets.load_dataset('parquet', data_files=parquet_files)
        return hf_dataset
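
Not part of the commit: a rough sketch of one way TODO #2 (selecting episodes based on queries) could be approached on top of the returned object, using the standard datasets filter API. The column name "episode_id" is hypothetical and used only for illustration.

# Hypothetical sketch, not in this commit. Assumes a fog_rtx Dataset named
# `dataset` (as in the example above) and an "episode_id" column, which is
# an illustrative assumption about the parquet schema.
hf_dataset = dataset.get_as_huggingface_dataset()
first_episode = hf_dataset["train"].filter(lambda row: row["episode_id"] == 0)
print(first_episode)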
