Skip to content

Commit

Permalink
use util in tokenize_dataset
Browse files Browse the repository at this point in the history
  • Loading branch information
jettjaniak committed May 22, 2024
1 parent 4e4b969 commit 3e2af53
Showing 1 changed file with 2 additions and 1 deletion.
3 changes: 2 additions & 1 deletion scripts/tokenize_dataset.py
Original file line number Diff line number Diff line change
@@ -8,6 +8,7 @@
from huggingface_hub import HfApi
from transformers import AutoTokenizer

+from delphi import utils
from delphi.dataset.tokenization import get_tokenized_chunks

if __name__ == "__main__":
@@ -107,7 +108,7 @@
)

print(f"Tokenizing split='{args.split}'...")
-split_name = args.split.split("[")[0]
+split_name = utils.hf_split_to_split_name(args.split)
for chunk_idx, ds_chunk in enumerate(ds_chunks_it):
chunk_name = f"{split_name}-{chunk_idx:05}.parquet"
if args.out_dir:
Expand Down

0 comments on commit 3e2af53

Please sign in to comment.