Skip to content

Commit

Permalink
Porting jettjaniak/tinyevals#19
Browse files: browse the repository at this point in the history
  • Loading branch information
Jai committed Feb 1, 2024
1 parent 55a1c98 commit 6d4ec82
Showing 1 changed file with 4 additions and 3 deletions.
7 changes: 4 additions & 3 deletions src/delphi/dataset/dataset.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -6,10 +6,11 @@ def load_clean_dataset(split: str, tokenized: bool = False) -> list[str]:
# checking just startswith, because you can include slice like "train[:1000]"
assert split.startswith("train") or split.startswith("validation")
hf_ds = load_dataset(
f"jbrinkma/tinystories-v2-clean{'-tokenized' if tokenized else ''}"
f"jbrinkma/tinystories-v2-clean{'-tokenized' if tokenized else ''}",
split=split,
)
dataset = []
# hf_ds technically isn't guaranteed to be subscriptable, but it is in this case
for sample_txt in tqdm(hf_ds["tokens" if tokenized else "text"]): # type: ignore
dataset.append(sample_txt)
for sample in tqdm(hf_ds["tokens" if tokenized else "text"]): # type: ignore
dataset.append(sample)
return dataset

0 comments on commit 6d4ec82

Please sign in to comment.