From 0a5882574e3767d02a779e8cf58d6d7d75d1d168 Mon Sep 17 00:00:00 2001 From: Jett Date: Wed, 15 May 2024 13:40:29 +0200 Subject: [PATCH] tokenizer & tokenization improvements (#136) * add in tokenizer training * allow saving tokenizer locally * allow saving tokenized dataset locally * tokenize into seq_len instead of seq_len+1 * logging * update tokenization test (new tokenizer with pad token) --- configs/stories/llama2/base.json | 1 + configs/stories/mamba/base.json | 1 + scripts/tokenize_dataset.py | 59 +++++++++++++++++++++++------- scripts/train_tokenizer.py | 31 ++++++++++++---- src/delphi/dataset/tokenization.py | 53 ++++++--------------------- tests/dataset/test_tokeniation.py | 14 +++---- 6 files changed, 90 insertions(+), 69 deletions(-) diff --git a/configs/stories/llama2/base.json b/configs/stories/llama2/base.json index 427e124f..dceaea3c 100644 --- a/configs/stories/llama2/base.json +++ b/configs/stories/llama2/base.json @@ -8,6 +8,7 @@ "rms_norm_eps": 1e-06, "bos_token_id": 0, "eos_token_id": 1, + "pad_token_id": 2, "tie_word_embeddings": false, "rope_theta": 10000.0, "rope_scaling": null, diff --git a/configs/stories/mamba/base.json b/configs/stories/mamba/base.json index ede565ef..5f11ff9f 100644 --- a/configs/stories/mamba/base.json +++ b/configs/stories/mamba/base.json @@ -6,6 +6,7 @@ "layer_norm_epsilon": 1e-5, "bos_token_id": 0, "eos_token_id": 1, + "pad_token_id": 2, "expand": 2, "conv_kernel": 4, "use_bias": false, diff --git a/scripts/tokenize_dataset.py b/scripts/tokenize_dataset.py index a48bddbe..a74df312 100755 --- a/scripts/tokenize_dataset.py +++ b/scripts/tokenize_dataset.py @@ -1,11 +1,14 @@ #!/usr/bin/env python3 import argparse +import io +import os +from pathlib import Path from datasets import Dataset, Features, Value, load_dataset from huggingface_hub import HfApi from transformers import AutoTokenizer -from delphi.dataset.tokenization import tokenize_and_upload_split +from delphi.dataset.tokenization import get_tokenized_chunks if __name__ == "__main__": parser = argparse.ArgumentParser(description="", allow_abbrev=False) @@ -31,19 +34,24 @@ required=True, help="Split of the dataset to be tokenized, supports slicing like 'train[:10%%]'", ) + parser.add_argument( + "--out-dir", + type=str, + required=False, + help="Local directory to save the resulting dataset", + ) parser.add_argument( "--out-repo-id", - "-o", type=str, - required=True, - help="Name of the tokenized dataset to upload to huggingface", + required=False, + help="HF repo id to upload the resulting dataset", ) parser.add_argument( "--tokenizer", - "-r", + "-t", type=str, required=True, - help="Name of the tokenizer from huggingface", + help="HF repo id or local directory containing the tokenizer", ) parser.add_argument( "--seq-len", @@ -67,6 +75,9 @@ help="Size of the parquet chunks uploaded to HuggingFace", ) args = parser.parse_args() + assert ( + args.out_repo_id or args.out_dir + ), "You need to provide --out-repo-id or --out-dir" print(f"Loading dataset '{args.in_repo_id}'...") in_dataset_split = load_dataset( @@ -75,20 +86,42 @@ features=Features({args.feature: Value("string")}), ) assert isinstance(in_dataset_split, Dataset) - print(f"Loading tokenizer '{args.tokenizer}'...") + print(f"Loading tokenizer from '{args.tokenizer}'...") tokenizer = AutoTokenizer.from_pretrained(args.tokenizer) assert tokenizer.bos_token_id is not None, "Tokenizer must have a bos_token_id" assert tokenizer.eos_token_id is not None, "Tokenizer must have a eos_token_id" - api = HfApi() - api.create_repo(repo_id=args.out_repo_id, repo_type="dataset", exist_ok=True) - tokenize_and_upload_split( + api = None + if args.out_repo_id: + api = HfApi() + api.create_repo(repo_id=args.out_repo_id, repo_type="dataset", exist_ok=True) + if args.out_dir: + os.makedirs(args.out_dir, exist_ok=True) + + ds_chunks_it = get_tokenized_chunks( dataset_split=in_dataset_split, - split_name=args.split.split("[")[0], tokenizer=tokenizer, seq_len=args.seq_len, batch_size=args.batch_size, chunk_size=args.chunk_size, - out_repo_id=args.out_repo_id, - api=api, ) + + print(f"Tokenizing split='{args.split}'...") + split_name = args.split.split("[")[0] + for chunk_idx, ds_chunk in enumerate(ds_chunks_it): + chunk_name = f"{split_name}-{chunk_idx:05}.parquet" + if args.out_dir: + ds_parquet_chunk = Path(args.out_dir) / chunk_name + print(f"Saving '{ds_parquet_chunk}'...") + else: + ds_parquet_chunk = io.BytesIO() + ds_chunk.to_parquet(ds_parquet_chunk) + if api: + print(f"Uploading '{chunk_name}' to '{args.out_repo_id}'...") + api.upload_file( + path_or_fileobj=ds_parquet_chunk, + path_in_repo=f"data/{chunk_name}", + repo_id=args.out_repo_id, + repo_type="dataset", + ) + print(f"Done saving/uploading '{chunk_name}'") diff --git a/scripts/train_tokenizer.py b/scripts/train_tokenizer.py index 98308243..83e071ae 100755 --- a/scripts/train_tokenizer.py +++ b/scripts/train_tokenizer.py @@ -3,7 +3,6 @@ from datasets import Dataset, Features, Value, load_dataset from tokenizers import ByteLevelBPETokenizer # type: ignore -from tqdm.auto import tqdm from transformers import PreTrainedTokenizerFast @@ -15,7 +14,7 @@ def train_byte_level_bpe( tokenizer.train_from_iterator( text_generator, vocab_size=vocab_size, - special_tokens=["", ""], + special_tokens=["", "", ""], show_progress=True, length=len(dataset), ) @@ -23,6 +22,7 @@ def train_byte_level_bpe( tokenizer_object=tokenizer, bos_token="", eos_token="", + pad_token="", ) @@ -57,14 +57,22 @@ def train_byte_level_bpe( required=True, help="Vocabulary size of the tokenizer", ) + parser.add_argument( + "--out-dir", + type=str, + required=False, + help="Local directory to save the resulting tokenizer", + ) parser.add_argument( "--out-repo-id", - "-o", type=str, - required=True, - help="Where to push the resulting tokenizer", + required=False, + help="HF repo id to upload the resulting tokenizer", ) args = parser.parse_args() + assert ( + args.out_repo_id or args.out_dir + ), "You need to provide out_repo_id or out_dir" print(f"Loading dataset '{args.in_repo_id}'...") in_dataset_split = load_dataset( @@ -78,6 +86,13 @@ def train_byte_level_bpe( feature=args.feature, vocab_size=args.vocab_size, ) - tokenizer.push_to_hub( - repo_id=args.out_repo_id, - ) + if args.out_dir: + print(f"Saving tokenizer to '{args.out_dir}' directory...") + tokenizer.save_pretrained(args.out_dir) + print("Done.") + if args.out_repo_id: + print(f"Pushing tokenizer to HF repo '{args.out_repo_id}'...") + tokenizer.push_to_hub( + repo_id=args.out_repo_id, + ) + print("Done.") diff --git a/src/delphi/dataset/tokenization.py b/src/delphi/dataset/tokenization.py index 9447a4d8..e368864a 100644 --- a/src/delphi/dataset/tokenization.py +++ b/src/delphi/dataset/tokenization.py @@ -1,10 +1,8 @@ -import io +import itertools from collections import deque -from collections.abc import Generator +from collections.abc import Iterator from datasets import Dataset -from huggingface_hub import HfApi -from tqdm.auto import trange from transformers import PreTrainedTokenizerBase @@ -45,7 +43,7 @@ def extend_deque( return doc_idx -def make_new_sample(deq: deque[int], context_size: int, bos_token_id: int) -> list[int]: +def make_new_sample(deq: deque[int], seq_len: int, bos_token_id: int) -> list[int]: """ Generates new sample for training by creating sequence of tokens from the deque until the deque. @@ -62,10 +60,10 @@ def make_new_sample(deq: deque[int], context_size: int, bos_token_id: int) -> li list[int]: token sequence. """ sample = [bos_token_id] - # For the first (n-1) elements, pop from the left of the deque - # and add to the new sample, the n-th element will be retained + # For the first n-2 elements, pop from the left of the deque + # and add to the new sample, the (n-1)-th element will be retained # in the deque for making the next sample. - for _ in range(context_size - 1): + for _ in range(seq_len - 2): sample.append(deq.popleft()) sample.append(deq[0]) return sample @@ -76,7 +74,7 @@ def tokenize_dataset( tokenizer: PreTrainedTokenizerBase, seq_len: int, batch_size: int, -) -> Generator[list[int], None, None]: +) -> Iterator[list[int]]: """ Tokenizes the input text documents using the provided tokenizer and generates token sequences of the specified length. @@ -100,45 +98,18 @@ def tokenize_dataset( # We discard the last chunk, so no processing on the remainder of the deque here -def tokenize_and_upload_split( +def get_tokenized_chunks( dataset_split: Dataset, - split_name: str, tokenizer: PreTrainedTokenizerBase, seq_len: int, batch_size: int, chunk_size: int, - out_repo_id: str, - api: HfApi, -): - seq_gen = tokenize_dataset( +) -> Iterator[Dataset]: + seq_it = tokenize_dataset( dataset_split, tokenizer, seq_len=seq_len, batch_size=batch_size, ) - seq_it = iter(seq_gen) - print(f"Tokenizing {split_name=}...") - chunk_idx = 0 - done = False - while not done: - tokens = [] - print(f"Processing chunk {chunk_idx}...") - for _ in trange(chunk_size): - try: - tokens.append(next(seq_it)) - except StopIteration: - done = True - break - ds_chunk = Dataset.from_dict({"tokens": tokens}) - ds_parquet_chunk = io.BytesIO() - ds_chunk.to_parquet(ds_parquet_chunk) - chunk_name = f"{split_name}-{chunk_idx:05}.parquet" - print(f"Uploading {chunk_name}...") - api.upload_file( - path_or_fileobj=ds_parquet_chunk, - path_in_repo=f"data/{chunk_name}", - repo_id=out_repo_id, - repo_type="dataset", - ) - chunk_idx += 1 - print("Done.") + while tokens_chunk := tuple(itertools.islice(seq_it, chunk_size)): + yield Dataset.from_dict({"tokens": tokens_chunk}) diff --git a/tests/dataset/test_tokeniation.py b/tests/dataset/test_tokeniation.py index 4d765fbc..bb4180ba 100644 --- a/tests/dataset/test_tokeniation.py +++ b/tests/dataset/test_tokeniation.py @@ -73,7 +73,7 @@ def test_make_new_sample(tokenizer): def test_tokenize_dataset(tokenizer): - CTX_SIZE = 10 + SEQ_LEN = 11 BATCH_SIZE = 2 documents = [ @@ -86,11 +86,11 @@ def test_tokenize_dataset(tokenizer): feature_name = get_random_feature_name() dataset = Dataset.from_dict({feature_name: documents}) expected = [ - [0, 431, 440, 260, 1, 46, 499, 1945, 368, 3443, 15], - [0, 15, 340, 576, 355, 337, 1887, 1, 431, 440, 260], - [0, 260, 399, 13, 314, 260, 560, 1005, 13, 402, 284], - [0, 284, 260, 2606, 1, 431, 440, 260, 399, 13, 402], - [0, 402, 284, 260, 1, 1370, 268, 415, 484, 412, 15], + [0, 432, 441, 261, 1, 47, 500, 1946, 369, 3444, 16], + [0, 16, 341, 577, 356, 338, 1888, 1, 432, 441, 261], + [0, 261, 400, 14, 315, 261, 561, 1006, 14, 403, 285], + [0, 285, 261, 2607, 1, 432, 441, 261, 400, 14, 403], + [0, 403, 285, 261, 1, 1371, 269, 416, 485, 413, 16], ] - actual = [x for x in tokenize_dataset(dataset, tokenizer, CTX_SIZE, BATCH_SIZE)] + actual = [x for x in tokenize_dataset(dataset, tokenizer, SEQ_LEN, BATCH_SIZE)] assert actual == expected