From e7ab2e85af8f538b9767025c701bc437b4763b62 Mon Sep 17 00:00:00 2001 From: Siwei Li Date: Sat, 9 Mar 2024 21:03:15 -0800 Subject: [PATCH 1/8] Add function to tokenize text stories and split into batches --- src/delphi/train/dataset_tokenization.py | 39 ++++++++++++++++++++ tests/train/test_tokenizer.py | 47 ++++++++++++++++++++++++ 2 files changed, 86 insertions(+) create mode 100644 src/delphi/train/dataset_tokenization.py create mode 100644 tests/train/test_tokenizer.py diff --git a/src/delphi/train/dataset_tokenization.py b/src/delphi/train/dataset_tokenization.py new file mode 100644 index 00000000..5ca3371f --- /dev/null +++ b/src/delphi/train/dataset_tokenization.py @@ -0,0 +1,39 @@ +from collections import deque +from typing import Union + +from transformers import PreTrainedTokenizerBase + + +def get_tokenized_batches( + text_stories: Union[list[str], list[list[int]]], + tokenizer: PreTrainedTokenizerBase, + context_size: int, + input_tokenized=False, +) -> list[list[int]]: + dq = deque() + samples = [] + + prompt_idx = 0 + while prompt_idx < len(text_stories): + while len(dq) < context_size: + text_story = text_stories[prompt_idx] + if not input_tokenized: + dq.extend( + tokenizer.encode(text_story, add_special_tokens=False) + + [tokenizer.eos_token_id] + ) + else: + dq.extend(text_story) + dq.append(tokenizer.eos_token_id) + prompt_idx += 1 + + sample = [tokenizer.bos_token_id] + for i in range(context_size - 1): # peek at and not pop the last element + sample.append(dq.popleft()) + sample.append(dq[0]) + + samples.append(sample) + + if dq: + samples.append([tokenizer.bos_token_id] + list(dq)) + return samples diff --git a/tests/train/test_tokenizer.py b/tests/train/test_tokenizer.py new file mode 100644 index 00000000..ca0415f8 --- /dev/null +++ b/tests/train/test_tokenizer.py @@ -0,0 +1,47 @@ +from transformers import AutoTokenizer + +from delphi.train.dataset_tokenization import get_tokenized_batches + + +def test_get_tokenized_batches(): + CTX_SIZE = 10 + tokenizer = AutoTokenizer.from_pretrained("delphi-suite/v0-llama2-tokenizer") + + text_stories = [ + "Once upon a", + "Mother woke up alert. She put on her coat", + "Once upon a time, in a small town, there was a weird", + "Once upon a time, there was a", + "Sara and Tom are friends. 
They like to play in the park.", + ] + correct_batches = [ + [1, 432, 440, 261, 2, 367, 501, 1917, 372, 3398, 4037], + [1, 4037, 341, 577, 359, 342, 1854, 2, 432, 440, 261], + [1, 261, 403, 4045, 317, 261, 560, 1000, 4045, 406, 286], + [1, 286, 261, 2567, 2, 432, 440, 261, 403, 4045, 406], + [1, 406, 286, 261, 2, 787, 269, 396, 484, 415, 4037], + [1, 4037, 311, 519, 268, 326, 317, 264, 525, 4037, 2], + ] + assert get_tokenized_batches(text_stories, tokenizer, CTX_SIZE) == correct_batches + + tokenized_stories = [ + [1618, 3520, 2223, 3961, 853, 3376, 1820, 1442, 1573], + [46, 3515, 2941, 1637, 1377], + [1439, 3378, 3897, 3807, 343, 1140, 3843, 3848, 1343, 3812, 947, 2871, 1973], + [1163, 1358, 1930, 3590, 2216, 3659, 278], + [604, 2920, 1330, 2240, 786, 4088, 1416, 2122, 1556, 3501, 3159, 3427], + ] + correct_batches = [ + [1, 1618, 3520, 2223, 3961, 853, 3376, 1820, 1442, 1573, 2], + [1, 2, 46, 3515, 2941, 1637, 1377, 2, 1439, 3378, 3897], + [1, 3897, 3807, 343, 1140, 3843, 3848, 1343, 3812, 947, 2871], + [1, 2871, 1973, 2, 1163, 1358, 1930, 3590, 2216, 3659, 278], + [1, 278, 2, 604, 2920, 1330, 2240, 786, 4088, 1416, 2122], + [1, 2122, 1556, 3501, 3159, 3427, 2], + ] + assert ( + get_tokenized_batches( + tokenized_stories, tokenizer, CTX_SIZE, input_tokenized=True + ) + == correct_batches + ) From 2e69942e16168795ce5579669d70472ab81b65d6 Mon Sep 17 00:00:00 2001 From: Siwei Li Date: Tue, 12 Mar 2024 19:39:07 -0700 Subject: [PATCH 2/8] Split the tokenization function into two parts, fixing the while-loop issues --- src/delphi/train/dataset_tokenization.py | 59 +++++++++++-------- tests/train/test_tokenizer.py | 73 ++++++++++++++++-------- 2 files changed, 84 insertions(+), 48 deletions(-) diff --git a/src/delphi/train/dataset_tokenization.py b/src/delphi/train/dataset_tokenization.py index 5ca3371f..ffc0a985 100644 --- a/src/delphi/train/dataset_tokenization.py +++ b/src/delphi/train/dataset_tokenization.py @@ -1,39 +1,50 @@ from collections import deque -from typing import Union from transformers import PreTrainedTokenizerBase +def extend_deque( + dq: deque[int], + context_size: int, + text_stories: list[str], + prompt_idx: int, + tokenizer: PreTrainedTokenizerBase, +) -> int: + while len(dq) < context_size and prompt_idx < len(text_stories): + text_story = text_stories[prompt_idx] + dq.extend( + tokenizer.encode(text_story, add_special_tokens=False) + + [tokenizer.eos_token_id] + ) + prompt_idx += 1 + return prompt_idx + + +def make_new_samples( + dq: deque[int], context_size: int, tokenizer: PreTrainedTokenizerBase +) -> list[list[int]]: + samples = [] + while len(dq) >= context_size: + sample = [tokenizer.bos_token_id] + for _ in range(context_size - 1): # peek at and not pop the last element + sample.append(dq.popleft()) + sample.append(dq[0]) + samples.append(sample) + return samples + + def get_tokenized_batches( - text_stories: Union[list[str], list[list[int]]], + text_stories: list[str], tokenizer: PreTrainedTokenizerBase, context_size: int, - input_tokenized=False, ) -> list[list[int]]: dq = deque() + prompt_idx = 0 samples = [] - prompt_idx = 0 while prompt_idx < len(text_stories): - while len(dq) < context_size: - text_story = text_stories[prompt_idx] - if not input_tokenized: - dq.extend( - tokenizer.encode(text_story, add_special_tokens=False) - + [tokenizer.eos_token_id] - ) - else: - dq.extend(text_story) - dq.append(tokenizer.eos_token_id) - prompt_idx += 1 - - sample = [tokenizer.bos_token_id] - for i in range(context_size - 1): # peek at and not pop the last element - 
sample.append(dq.popleft()) - sample.append(dq[0]) - - samples.append(sample) + prompt_idx = extend_deque(dq, context_size, text_stories, prompt_idx, tokenizer) + samples.extend(make_new_samples(dq, context_size, tokenizer)) - if dq: - samples.append([tokenizer.bos_token_id] + list(dq)) + # We discard the last chunk, so no processing on the remainder of the deque here return samples diff --git a/tests/train/test_tokenizer.py b/tests/train/test_tokenizer.py index ca0415f8..9b3552de 100644 --- a/tests/train/test_tokenizer.py +++ b/tests/train/test_tokenizer.py @@ -1,11 +1,58 @@ +import collections +import random + from transformers import AutoTokenizer -from delphi.train.dataset_tokenization import get_tokenized_batches +from delphi.eval.utils import load_validation_dataset +from delphi.train.dataset_tokenization import ( + extend_deque, + get_tokenized_batches, + make_new_samples, +) + +tokenizer = AutoTokenizer.from_pretrained("delphi-suite/stories-tokenizer") + + +def test_extend_deque(): + CTX_SIZE = 10 + dataset = load_validation_dataset("delphi-suite/tinystories-v2-clean") + text_stories = dataset["story"][:100] + prompt_idx = 0 + dq = collections.deque() + + while prompt_idx < len(text_stories): + prompt_idx = extend_deque(dq, CTX_SIZE, text_stories, prompt_idx, tokenizer) + if prompt_idx < len(text_stories) - 1: + assert len(dq) >= CTX_SIZE + while len(dq) >= CTX_SIZE: + for _ in range(CTX_SIZE - 1): + dq.popleft() + + +def test_make_new_sample(): + for _ in range(100): + total_tokens = random.randint(100, 1000) + context_size = random.randint(5, total_tokens // 2) + dq = collections.deque([random.randint(3, 1000) for _ in range(total_tokens)]) + samples = make_new_samples(dq, context_size, tokenizer) + tokens_cnt = 0 + for i, sample in enumerate(samples): + assert sample[0] == tokenizer.bos_token_id + if i > 0: + assert sample[1] == samples[i - 1][-1] + tokens_cnt += len(sample) + + # We discard the last chunk so the following lines are only for testing + tokens_cnt += 1 + len(dq) # the last batch with BOS in the beginning + assert tokens_cnt == total_tokens + ( + 2 * len(samples) + 1 + ) # BOS for each batch + overlapping of the last tokens in the batches + assert len(dq) > 0 # always leaving at least one element in the deque def test_get_tokenized_batches(): CTX_SIZE = 10 - tokenizer = AutoTokenizer.from_pretrained("delphi-suite/v0-llama2-tokenizer") + tokenizer = AutoTokenizer.from_pretrained("delphi-suite/stories-tokenizer") text_stories = [ "Once upon a", @@ -23,25 +70,3 @@ def test_get_tokenized_batches(): [1, 4037, 311, 519, 268, 326, 317, 264, 525, 4037, 2], ] assert get_tokenized_batches(text_stories, tokenizer, CTX_SIZE) == correct_batches - - tokenized_stories = [ - [1618, 3520, 2223, 3961, 853, 3376, 1820, 1442, 1573], - [46, 3515, 2941, 1637, 1377], - [1439, 3378, 3897, 3807, 343, 1140, 3843, 3848, 1343, 3812, 947, 2871, 1973], - [1163, 1358, 1930, 3590, 2216, 3659, 278], - [604, 2920, 1330, 2240, 786, 4088, 1416, 2122, 1556, 3501, 3159, 3427], - ] - correct_batches = [ - [1, 1618, 3520, 2223, 3961, 853, 3376, 1820, 1442, 1573, 2], - [1, 2, 46, 3515, 2941, 1637, 1377, 2, 1439, 3378, 3897], - [1, 3897, 3807, 343, 1140, 3843, 3848, 1343, 3812, 947, 2871], - [1, 2871, 1973, 2, 1163, 1358, 1930, 3590, 2216, 3659, 278], - [1, 278, 2, 604, 2920, 1330, 2240, 786, 4088, 1416, 2122], - [1, 2122, 1556, 3501, 3159, 3427, 2], - ] - assert ( - get_tokenized_batches( - tokenized_stories, tokenizer, CTX_SIZE, input_tokenized=True - ) - == correct_batches - ) From 
7609e4f66253f6b1f3a48d6fd36da00298356df8 Mon Sep 17 00:00:00 2001 From: Siwei Li Date: Sun, 17 Mar 2024 18:35:53 -0700 Subject: [PATCH 3/8] Add docstrings to the functions --- src/delphi/train/dataset_tokenization.py | 46 ++++++++++++++++++++++++ tests/train/test_tokenizer.py | 13 +++++-- 2 files changed, 57 insertions(+), 2 deletions(-) diff --git a/src/delphi/train/dataset_tokenization.py b/src/delphi/train/dataset_tokenization.py index ffc0a985..23309e4d 100644 --- a/src/delphi/train/dataset_tokenization.py +++ b/src/delphi/train/dataset_tokenization.py @@ -10,6 +10,23 @@ def extend_deque( prompt_idx: int, tokenizer: PreTrainedTokenizerBase, ) -> int: + """ + Extends the deque with tokenized text stories until the deque grows large + enough to reach the context size, or until all text stories are processed. + + The usage of a deque here aims to save the memory as opposed to + load all the stories and tokenize them at once. + + Args: + dq (deque[int]): Deque to extend with tokenized tokens. + context_size (int): Size of the context(input sequences). + text_stories (list[str]): List of (untokenized) text stories to be tokenized. + prompt_idx (int): Index of the current text story. + tokenizer (PreTrainedTokenizerBase): Tokenizer to encode the text strings. + + Returns: + int: Updated index in the text stories dataset. + """ while len(dq) < context_size and prompt_idx < len(text_stories): text_story = text_stories[prompt_idx] dq.extend( @@ -23,6 +40,22 @@ def extend_deque( def make_new_samples( dq: deque[int], context_size: int, tokenizer: PreTrainedTokenizerBase ) -> list[list[int]]: + """ + Generates new samples for training by creating sequences of tokens + from the deque until the deque is empty. + + Note: the model is unable to use the last token in an input sequence, + so we repeat this token in the next input sequence. + + Args: + dq (deque[int]): Deque containing tokenized tokens. + context_size (int): Size of the context (input sequences). + tokenizer (PreTrainedTokenizerBase): Tokenizer to encode the text strings. + + Returns: + list[list[int]]: List of token sequences of the same length(context_size). + """ + samples = [] while len(dq) >= context_size: sample = [tokenizer.bos_token_id] @@ -38,6 +71,19 @@ def get_tokenized_batches( tokenizer: PreTrainedTokenizerBase, context_size: int, ) -> list[list[int]]: + """ + Tokenizes the input text stories using the provided tokenizer and + generates token sequences of the specified length. + + Args: + text_stories (list[str]): List of text stories to be tokenized. + tokenizer (PreTrainedTokenizerBase): Tokenizer to encode the text strings. + context_size (int): Size of the context (input sequences). + + Returns: + list[list[int]]: List of token sequences of length equal to context_size. 
+ """ + dq = deque() prompt_idx = 0 samples = [] diff --git a/tests/train/test_tokenizer.py b/tests/train/test_tokenizer.py index 9b3552de..a10f72a4 100644 --- a/tests/train/test_tokenizer.py +++ b/tests/train/test_tokenizer.py @@ -15,14 +15,21 @@ def test_extend_deque(): CTX_SIZE = 10 - dataset = load_validation_dataset("delphi-suite/tinystories-v2-clean") - text_stories = dataset["story"][:100] + # generate 100 random stories + text_stories = [ + [ + random.randint(3, tokenizer.vocab_size) + for _ in range(random.randint(100, 800)) + ] + for _ in range(100) + ] prompt_idx = 0 dq = collections.deque() while prompt_idx < len(text_stories): prompt_idx = extend_deque(dq, CTX_SIZE, text_stories, prompt_idx, tokenizer) if prompt_idx < len(text_stories) - 1: + # assert that the deque has grown large enough in each round assert len(dq) >= CTX_SIZE while len(dq) >= CTX_SIZE: for _ in range(CTX_SIZE - 1): @@ -39,6 +46,8 @@ def test_make_new_sample(): for i, sample in enumerate(samples): assert sample[0] == tokenizer.bos_token_id if i > 0: + # assert that there is an overlap of the last token in the previous sample + # and the first token in its following sample assert sample[1] == samples[i - 1][-1] tokens_cnt += len(sample) From b98f81de8de0d1bb8d44550fbd5ef6fd3e260ef5 Mon Sep 17 00:00:00 2001 From: Siwei Li Date: Tue, 19 Mar 2024 15:57:00 -0700 Subject: [PATCH 4/8] Minor edits in the code, fix the test --- src/delphi/dataset/tokenization.py | 102 +++++++++++++++++++++++++++++ tests/dataset/test_tokenizer.py | 83 +++++++++++++++++++++++ 2 files changed, 185 insertions(+) create mode 100644 src/delphi/dataset/tokenization.py create mode 100644 tests/dataset/test_tokenizer.py diff --git a/src/delphi/dataset/tokenization.py b/src/delphi/dataset/tokenization.py new file mode 100644 index 00000000..295ff938 --- /dev/null +++ b/src/delphi/dataset/tokenization.py @@ -0,0 +1,102 @@ +from collections import deque +from typing import Optional + +from transformers import PreTrainedTokenizerBase + + +def extend_deque( + dq: deque[int], + context_size: int, + text_documents: list[str], + doc_idx: int, + tokenizer: PreTrainedTokenizerBase, +) -> int: + """ + Extends the deque with tokenized text documents until the deque grows large + enough to reach the context size, or until all text documents are processed. + + The usage of a deque here aims to save the memory as opposed to + load all the documents and tokenize them at once. + + Args: + dq: Deque to extend with tokenized tokens. + context_size: Size of the context(input sequences). + text_documents: List of (untokenized) text documents to be tokenized. + doc_idx: Index of the current text story. + tokenizer: Tokenizer to encode the text strings. + Returns: + int: Updated index in the text documents dataset. + """ + while len(dq) < context_size and doc_idx < len(text_documents): + text_story = text_documents[doc_idx] + dq.extend( + tokenizer.encode(text_story, add_special_tokens=False) + + [tokenizer.eos_token_id] + ) + doc_idx += 1 + return doc_idx + + +def make_new_samples( + dq: deque[int], context_size: int, bos_token_id: Optional[int] +) -> list[list[int]]: + """ + Generates new samples for training by creating sequences of tokens + from the deque until the deque does not hold enough tokens to generate + another sample. + + Note: the model is unable to use the last token in an input sequence, + so we repeat this token in the next input sequence. + + Args: + dq: Deque containing tokenized tokens. + context_size: Size of the context (input sequences). 
+ bos_token_id: bos_token_id of the tokenizer used. + + Returns: + list[list[int]]: List of token sequences of the same length(context_size). + """ + + samples = [] + while len(dq) >= context_size: + sample = [bos_token_id] + + # For the first (n-1) elements, pop from the left of the deque + # and add to the new sample, the n-th element will be retained + # in the deque for making the next sample. + for _ in range(context_size - 1): + sample.append(dq.popleft()) + sample.append(dq[0]) + + samples.append(sample) + return samples + + +def get_tokenized_batches( + text_documents: list[str], + tokenizer: PreTrainedTokenizerBase, + context_size: int, +) -> list[list[int]]: + """ + Tokenizes the input text documents using the provided tokenizer and + generates token sequences of the specified length. + + Args: + text_documents: List[str], + tokenizer, + context_size, + + Returns: + list[list[int]]: List of token sequences of length equal to context_size. + """ + + dq = deque() + doc_idx = 0 + samples = [] + + while doc_idx < len(text_documents): + doc_idx = extend_deque(dq, context_size, text_documents, doc_idx, tokenizer) + samples.extend(make_new_samples(dq, context_size, tokenizer.bos_token_id)) + + # We discard the last chunk, so no processing on the remainder of the deque here + return samples diff --git a/tests/dataset/test_tokenizer.py b/tests/dataset/test_tokenizer.py new file mode 100644 index 00000000..43e558c2 --- /dev/null +++ b/tests/dataset/test_tokenizer.py @@ -0,0 +1,83 @@ +import collections +import random + +from transformers import AutoTokenizer + +from delphi.dataset.tokenization import ( + extend_deque, + get_tokenized_batches, + make_new_samples, +) +from delphi.eval.utils import load_validation_dataset + +tokenizer = AutoTokenizer.from_pretrained("delphi-suite/stories-tokenizer") + + +def test_extend_deque(): + CTX_SIZE = 10 + # generate 100 random stories + text_stories = [ + " ".join( + [ + tokenizer.decode(random.randint(3, tokenizer.vocab_size)) + for _ in range(random.randint(100, 800)) + ] + ) + for _ in range(100) + ] + prompt_idx = 0 + dq = collections.deque() + + while prompt_idx < len(text_stories): + prompt_idx = extend_deque(dq, CTX_SIZE, text_stories, prompt_idx, tokenizer) + if prompt_idx < len(text_stories) - 1: + # assert that the deque has grown large enough in each round + assert len(dq) >= CTX_SIZE + while len(dq) >= CTX_SIZE: + for _ in range(CTX_SIZE - 1): + dq.popleft() + + +def test_make_new_sample(): + for _ in range(100): + total_tokens = random.randint(100, 1000) + context_size = random.randint(5, total_tokens // 2) + dq = collections.deque(random.choices(range(3, 1000), k=total_tokens)) + samples = make_new_samples(dq, context_size, tokenizer.bos_token_id) + tokens_cnt = 0 + for i, sample in enumerate(samples): + assert sample[0] == tokenizer.bos_token_id + if i > 0: + # assert that there is an overlap of the last token in the previous sample + # and the first token in its following sample + assert sample[1] == samples[i - 1][-1] + tokens_cnt += len(sample) + + # We discard the last chunk so the following lines are only for testing + tokens_cnt += 1 + len(dq) # the last batch with BOS in the beginning + assert tokens_cnt == total_tokens + ( + 2 * len(samples) + 1 + ) # BOS for each batch + overlapping of the last tokens in the batches + assert len(dq) > 0 # always leaving at least one element in the deque + + +def test_get_tokenized_batches(): + CTX_SIZE = 10 + tokenizer = AutoTokenizer.from_pretrained("delphi-suite/stories-tokenizer") + + 
text_stories = [ + "Once upon a", + "Mother woke up alert. She put on her coat", + "Once upon a time, in a small town, there was a weird", + "Once upon a time, there was a", + "Sara and Tom are friends. They like to play in the park.", + ] + correct_batches = [ + [1, 432, 440, 261, 2, 367, 501, 1917, 372, 3398, 4037], + [1, 4037, 341, 577, 359, 342, 1854, 2, 432, 440, 261], + [1, 261, 403, 4045, 317, 261, 560, 1000, 4045, 406, 286], + [1, 286, 261, 2567, 2, 432, 440, 261, 403, 4045, 406], + [1, 406, 286, 261, 2, 787, 269, 396, 484, 415, 4037], + [1, 4037, 311, 519, 268, 326, 317, 264, 525, 4037, 2], + ] + assert get_tokenized_batches(text_stories, tokenizer, CTX_SIZE) == correct_batches From 9869df7bef942ef75f69f4f27682c483745bc551 Mon Sep 17 00:00:00 2001 From: Siwei Li Date: Wed, 20 Mar 2024 10:39:38 -0700 Subject: [PATCH 5/8] Uses batch_encode() method to save time --- src/delphi/dataset/tokenization.py | 21 +++++++++++++-------- tests/dataset/test_tokenizer.py | 18 ++++++++++++++---- 2 files changed, 27 insertions(+), 12 deletions(-) diff --git a/src/delphi/dataset/tokenization.py b/src/delphi/dataset/tokenization.py index 295ff938..f340b4fa 100644 --- a/src/delphi/dataset/tokenization.py +++ b/src/delphi/dataset/tokenization.py @@ -10,6 +10,7 @@ def extend_deque( text_documents: list[str], doc_idx: int, tokenizer: PreTrainedTokenizerBase, + batch_size: int, ) -> int: """ Extends the deque with tokenized text documents until the deque grows large @@ -28,17 +29,18 @@ def extend_deque( int: Updated index in the text documents dataset. """ while len(dq) < context_size and doc_idx < len(text_documents): - text_story = text_documents[doc_idx] - dq.extend( - tokenizer.encode(text_story, add_special_tokens=False) - + [tokenizer.eos_token_id] - ) - doc_idx += 1 + text_doc = text_documents[doc_idx : doc_idx + batch_size] + batch_input_ids = tokenizer( + text_doc, return_attention_mask=False, add_special_tokens=False + )["input_ids"] + for input_ids in batch_input_ids: + dq.extend(input_ids + [tokenizer.eos_token_id]) + doc_idx += batch_size return doc_idx def make_new_samples( - dq: deque[int], context_size: int, bos_token_id: Optional[int] + dq: deque[int], context_size: int, bos_token_id: int ) -> list[list[int]]: """ Generates new samples for training by creating sequences of tokens @@ -76,6 +78,7 @@ def get_tokenized_batches( text_documents: list[str], tokenizer: PreTrainedTokenizerBase, context_size: int, + batch_size: int, ) -> list[list[int]]: """ Tokenizes the input text documents using the provided tokenizer and @@ -95,7 +98,9 @@ def get_tokenized_batches( samples = [] while doc_idx < len(text_documents): - doc_idx = extend_deque(dq, context_size, text_documents, doc_idx, tokenizer) + doc_idx = extend_deque( + dq, context_size, text_documents, doc_idx, tokenizer, batch_size + ) samples.extend(make_new_samples(dq, context_size, tokenizer.bos_token_id)) # We discard the last chunk, so no processing on the remainder of the deque here diff --git a/tests/dataset/test_tokenizer.py b/tests/dataset/test_tokenizer.py index 43e558c2..a24d1d82 100644 --- a/tests/dataset/test_tokenizer.py +++ b/tests/dataset/test_tokenizer.py @@ -1,6 +1,7 @@ import collections import random +import pytest from transformers import AutoTokenizer from delphi.dataset.tokenization import ( @@ -10,11 +11,15 @@ ) from delphi.eval.utils import load_validation_dataset -tokenizer = AutoTokenizer.from_pretrained("delphi-suite/stories-tokenizer") + +@pytest.fixture +def tokenizer(): + return 
AutoTokenizer.from_pretrained("delphi-suite/stories-tokenizer") def test_extend_deque(): CTX_SIZE = 10 + BATCH_SIZE = 2 # generate 100 random stories text_stories = [ " ".join( @@ -29,7 +34,9 @@ def test_extend_deque(): dq = collections.deque() while prompt_idx < len(text_stories): - prompt_idx = extend_deque(dq, CTX_SIZE, text_stories, prompt_idx, tokenizer) + prompt_idx = extend_deque( + dq, CTX_SIZE, text_stories, prompt_idx, tokenizer, BATCH_SIZE + ) if prompt_idx < len(text_stories) - 1: # assert that the deque has grown large enough in each round assert len(dq) >= CTX_SIZE @@ -63,7 +70,7 @@ def test_make_new_sample(): def test_get_tokenized_batches(): CTX_SIZE = 10 - tokenizer = AutoTokenizer.from_pretrained("delphi-suite/stories-tokenizer") + BATCH_SIZE = 2 text_stories = [ "Once upon a", @@ -80,4 +87,7 @@ def test_get_tokenized_batches(): [1, 406, 286, 261, 2, 787, 269, 396, 484, 415, 4037], [1, 4037, 311, 519, 268, 326, 317, 264, 525, 4037, 2], ] - assert get_tokenized_batches(text_stories, tokenizer, CTX_SIZE) == correct_batches + assert ( + get_tokenized_batches(text_stories, tokenizer, CTX_SIZE, BATCH_SIZE) + == correct_batches + ) From 7bd9ef9039b037972d23a44a9925c002b72f0fac Mon Sep 17 00:00:00 2001 From: Siwei Li Date: Wed, 20 Mar 2024 11:23:13 -0700 Subject: [PATCH 6/8] Add script to upload to delphi-suite/batched-tokenized-stories --- scripts/tokenize_dataset.py | 78 +++++++++++++++++++ src/delphi/train/dataset_tokenization.py | 96 ------------------------ tests/dataset/test_tokenizer.py | 6 +- 3 files changed, 81 insertions(+), 99 deletions(-) create mode 100755 scripts/tokenize_dataset.py delete mode 100644 src/delphi/train/dataset_tokenization.py diff --git a/scripts/tokenize_dataset.py b/scripts/tokenize_dataset.py new file mode 100755 index 00000000..7bafff84 --- /dev/null +++ b/scripts/tokenize_dataset.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python3 + +import argparse + +from datasets import Dataset +from transformers import AutoTokenizer + +from delphi.dataset.tokenization import get_tokenized_batches +from delphi.eval.utils import load_validation_dataset + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="") + + parser.add_argument( + "--input-dataset-name", + type=str, + help="Text dataset from huggingface to tokenize", + ) + parser.add_argument( + "--output-dataset-name", + type=str, + help="Name of the tokenized dataset to upload to huggingface", + ) + parser.add_argument( + "--tokenizer-name", + type=str, + help="Name of the tokenizer from huggingface", + ) + parser.add_argument( + "--token", + type=str, + help="Hugging Face API token", + ) + parser.add_argument( + "--context-size", + type=int, + default=512, + help="Context size of the tokenized dataset as input of the model", + ) + parser.add_argument( + "--batch-size", + type=int, + default=50, + help="Batch size of text inputs into the tokenizer", + ) + parser.add_argument( + "--column-name", + type=str, + help="Name of the column containing text documents in the input dataset", + ) + args = parser.parse_args() + + input_dataset = load_validation_dataset(f"delphi-suite/{args.input_dataset_name}") + tokenizer = AutoTokenizer.from_pretrained(f"delphi-suite/{args.tokenizer_name}") + + if args.column_name: + text_docs = input_dataset[args.column_name] + else: + if len(input_dataset.column_names) > 1: + raise ValueError("There are more than one column in the specified dataset") + text_docs = input_dataset[input_dataset.column_names[0]] + + output_dataset = Dataset.from_dict( + { + "tokens": 
get_tokenized_batches( + text_docs, + tokenizer, + context_size=args.context_size, + batch_size=args.batch_size, + ) + } + ) + + output_dataset.push_to_hub( + repo_id=f"delphi-suite/{args.output_dataset_name}", + private=False, + token=args.token, + ) diff --git a/src/delphi/train/dataset_tokenization.py b/src/delphi/train/dataset_tokenization.py deleted file mode 100644 index 23309e4d..00000000 --- a/src/delphi/train/dataset_tokenization.py +++ /dev/null @@ -1,96 +0,0 @@ -from collections import deque - -from transformers import PreTrainedTokenizerBase - - -def extend_deque( - dq: deque[int], - context_size: int, - text_stories: list[str], - prompt_idx: int, - tokenizer: PreTrainedTokenizerBase, -) -> int: - """ - Extends the deque with tokenized text stories until the deque grows large - enough to reach the context size, or until all text stories are processed. - - The usage of a deque here aims to save the memory as opposed to - load all the stories and tokenize them at once. - - Args: - dq (deque[int]): Deque to extend with tokenized tokens. - context_size (int): Size of the context(input sequences). - text_stories (list[str]): List of (untokenized) text stories to be tokenized. - prompt_idx (int): Index of the current text story. - tokenizer (PreTrainedTokenizerBase): Tokenizer to encode the text strings. - - Returns: - int: Updated index in the text stories dataset. - """ - while len(dq) < context_size and prompt_idx < len(text_stories): - text_story = text_stories[prompt_idx] - dq.extend( - tokenizer.encode(text_story, add_special_tokens=False) - + [tokenizer.eos_token_id] - ) - prompt_idx += 1 - return prompt_idx - - -def make_new_samples( - dq: deque[int], context_size: int, tokenizer: PreTrainedTokenizerBase -) -> list[list[int]]: - """ - Generates new samples for training by creating sequences of tokens - from the deque until the deque is empty. - - Note: the model is unable to use the last token in an input sequence, - so we repeat this token in the next input sequence. - - Args: - dq (deque[int]): Deque containing tokenized tokens. - context_size (int): Size of the context (input sequences). - tokenizer (PreTrainedTokenizerBase): Tokenizer to encode the text strings. - - Returns: - list[list[int]]: List of token sequences of the same length(context_size). - """ - - samples = [] - while len(dq) >= context_size: - sample = [tokenizer.bos_token_id] - for _ in range(context_size - 1): # peek at and not pop the last element - sample.append(dq.popleft()) - sample.append(dq[0]) - samples.append(sample) - return samples - - -def get_tokenized_batches( - text_stories: list[str], - tokenizer: PreTrainedTokenizerBase, - context_size: int, -) -> list[list[int]]: - """ - Tokenizes the input text stories using the provided tokenizer and - generates token sequences of the specified length. - - Args: - text_stories (list[str]): List of text stories to be tokenized. - tokenizer (PreTrainedTokenizerBase): Tokenizer to encode the text strings. - context_size (int): Size of the context (input sequences). - - Returns: - list[list[int]]: List of token sequences of length equal to context_size. 
- """ - - dq = deque() - prompt_idx = 0 - samples = [] - - while prompt_idx < len(text_stories): - prompt_idx = extend_deque(dq, context_size, text_stories, prompt_idx, tokenizer) - samples.extend(make_new_samples(dq, context_size, tokenizer)) - - # We discard the last chunk, so no processing on the remainder of the deque here - return samples diff --git a/tests/dataset/test_tokenizer.py b/tests/dataset/test_tokenizer.py index a24d1d82..55f0fcbf 100644 --- a/tests/dataset/test_tokenizer.py +++ b/tests/dataset/test_tokenizer.py @@ -17,7 +17,7 @@ def tokenizer(): return AutoTokenizer.from_pretrained("delphi-suite/stories-tokenizer") -def test_extend_deque(): +def test_extend_deque(tokenizer): CTX_SIZE = 10 BATCH_SIZE = 2 # generate 100 random stories @@ -45,7 +45,7 @@ def test_extend_deque(): dq.popleft() -def test_make_new_sample(): +def test_make_new_sample(tokenizer): for _ in range(100): total_tokens = random.randint(100, 1000) context_size = random.randint(5, total_tokens // 2) @@ -68,7 +68,7 @@ def test_make_new_sample(): assert len(dq) > 0 # always leaving at least one element in the deque -def test_get_tokenized_batches(): +def test_get_tokenized_batches(tokenizer): CTX_SIZE = 10 BATCH_SIZE = 2 From c5c0e09f6fda457febf93bb5f4a49dab9d192abf Mon Sep 17 00:00:00 2001 From: Siwei Li Date: Wed, 20 Mar 2024 11:46:06 -0700 Subject: [PATCH 7/8] Remove the test file in tests/train to pass pytest --- tests/train/test_tokenizer.py | 81 ----------------------------------- 1 file changed, 81 deletions(-) delete mode 100644 tests/train/test_tokenizer.py diff --git a/tests/train/test_tokenizer.py b/tests/train/test_tokenizer.py deleted file mode 100644 index a10f72a4..00000000 --- a/tests/train/test_tokenizer.py +++ /dev/null @@ -1,81 +0,0 @@ -import collections -import random - -from transformers import AutoTokenizer - -from delphi.eval.utils import load_validation_dataset -from delphi.train.dataset_tokenization import ( - extend_deque, - get_tokenized_batches, - make_new_samples, -) - -tokenizer = AutoTokenizer.from_pretrained("delphi-suite/stories-tokenizer") - - -def test_extend_deque(): - CTX_SIZE = 10 - # generate 100 random stories - text_stories = [ - [ - random.randint(3, tokenizer.vocab_size) - for _ in range(random.randint(100, 800)) - ] - for _ in range(100) - ] - prompt_idx = 0 - dq = collections.deque() - - while prompt_idx < len(text_stories): - prompt_idx = extend_deque(dq, CTX_SIZE, text_stories, prompt_idx, tokenizer) - if prompt_idx < len(text_stories) - 1: - # assert that the deque has grown large enough in each round - assert len(dq) >= CTX_SIZE - while len(dq) >= CTX_SIZE: - for _ in range(CTX_SIZE - 1): - dq.popleft() - - -def test_make_new_sample(): - for _ in range(100): - total_tokens = random.randint(100, 1000) - context_size = random.randint(5, total_tokens // 2) - dq = collections.deque([random.randint(3, 1000) for _ in range(total_tokens)]) - samples = make_new_samples(dq, context_size, tokenizer) - tokens_cnt = 0 - for i, sample in enumerate(samples): - assert sample[0] == tokenizer.bos_token_id - if i > 0: - # assert that there is an overlap of the last token in the previous sample - # and the first token in its following sample - assert sample[1] == samples[i - 1][-1] - tokens_cnt += len(sample) - - # We discard the last chunk so the following lines are only for testing - tokens_cnt += 1 + len(dq) # the last batch with BOS in the beginning - assert tokens_cnt == total_tokens + ( - 2 * len(samples) + 1 - ) # BOS for each batch + overlapping of the last tokens in the 
batches - assert len(dq) > 0 # always leaving at least one element in the deque - - -def test_get_tokenized_batches(): - CTX_SIZE = 10 - tokenizer = AutoTokenizer.from_pretrained("delphi-suite/stories-tokenizer") - - text_stories = [ - "Once upon a", - "Mother woke up alert. She put on her coat", - "Once upon a time, in a small town, there was a weird", - "Once upon a time, there was a", - "Sara and Tom are friends. They like to play in the park.", - ] - correct_batches = [ - [1, 432, 440, 261, 2, 367, 501, 1917, 372, 3398, 4037], - [1, 4037, 341, 577, 359, 342, 1854, 2, 432, 440, 261], - [1, 261, 403, 4045, 317, 261, 560, 1000, 4045, 406, 286], - [1, 286, 261, 2567, 2, 432, 440, 261, 403, 4045, 406], - [1, 406, 286, 261, 2, 787, 269, 396, 484, 415, 4037], - [1, 4037, 311, 519, 268, 326, 317, 264, 525, 4037, 2], - ] - assert get_tokenized_batches(text_stories, tokenizer, CTX_SIZE) == correct_batches From ba1b10999b7722e5d26a06b6940ab2d1925f75fc Mon Sep 17 00:00:00 2001 From: Siwei Li Date: Sat, 23 Mar 2024 10:41:30 -0700 Subject: [PATCH 8/8] Update function name --- scripts/tokenize_dataset.py | 4 ++-- src/delphi/dataset/tokenization.py | 2 +- tests/dataset/test_tokenizer.py | 11 +++-------- 3 files changed, 6 insertions(+), 11 deletions(-) diff --git a/scripts/tokenize_dataset.py b/scripts/tokenize_dataset.py index 7bafff84..5a3d01d5 100755 --- a/scripts/tokenize_dataset.py +++ b/scripts/tokenize_dataset.py @@ -5,7 +5,7 @@ from datasets import Dataset from transformers import AutoTokenizer -from delphi.dataset.tokenization import get_tokenized_batches +from delphi.dataset.tokenization import tokenize_dataset from delphi.eval.utils import load_validation_dataset if __name__ == "__main__": @@ -62,7 +62,7 @@ output_dataset = Dataset.from_dict( { - "tokens": get_tokenized_batches( + "tokens": tokenize_dataset( text_docs, tokenizer, context_size=args.context_size, diff --git a/src/delphi/dataset/tokenization.py b/src/delphi/dataset/tokenization.py index f340b4fa..b800b64b 100644 --- a/src/delphi/dataset/tokenization.py +++ b/src/delphi/dataset/tokenization.py @@ -74,7 +74,7 @@ def make_new_samples( return samples -def get_tokenized_batches( +def tokenize_dataset( text_documents: list[str], tokenizer: PreTrainedTokenizerBase, context_size: int, diff --git a/tests/dataset/test_tokenizer.py b/tests/dataset/test_tokenizer.py index 55f0fcbf..99b2dcb3 100644 --- a/tests/dataset/test_tokenizer.py +++ b/tests/dataset/test_tokenizer.py @@ -4,12 +4,7 @@ import pytest from transformers import AutoTokenizer -from delphi.dataset.tokenization import ( - extend_deque, - get_tokenized_batches, - make_new_samples, -) -from delphi.eval.utils import load_validation_dataset +from delphi.dataset.tokenization import extend_deque, make_new_samples, tokenize_dataset @pytest.fixture @@ -68,7 +63,7 @@ def test_make_new_sample(tokenizer): assert len(dq) > 0 # always leaving at least one element in the deque -def test_get_tokenized_batches(tokenizer): +def test_tokenize_dataset(tokenizer): CTX_SIZE = 10 BATCH_SIZE = 2 @@ -88,6 +83,6 @@ def test_get_tokenized_batches(tokenizer): [1, 4037, 311, 519, 268, 326, 317, 264, 525, 4037, 2], ] assert ( - get_tokenized_batches(text_stories, tokenizer, CTX_SIZE, BATCH_SIZE) + tokenize_dataset(text_stories, tokenizer, CTX_SIZE, BATCH_SIZE) == correct_batches )
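
For reference, below is a minimal usage sketch of the tokenization API in its final form (after patch 8). It is illustrative only and not part of the patch series: it assumes the delphi package with delphi.dataset.tokenization is installed and that the delphi-suite/stories-tokenizer tokenizer referenced in the patches is reachable; the sample documents and the context/batch sizes are made up for demonstration.

# Illustrative sketch of the post-patch-8 API; the documents and the
# context_size/batch_size values below are invented for demonstration.
from transformers import AutoTokenizer

from delphi.dataset.tokenization import tokenize_dataset

tokenizer = AutoTokenizer.from_pretrained("delphi-suite/stories-tokenizer")

documents = [
    "Once upon a time, there was a little robot.",
    "The robot liked to count tokens all day long.",
]

# Each returned sample is context_size + 1 tokens long: a BOS token followed
# by context_size tokens drained from the shared deque. The last token of one
# sample is repeated as the first non-BOS token of the next sample, and any
# leftover tokens shorter than context_size are discarded.
samples = tokenize_dataset(documents, tokenizer, context_size=10, batch_size=2)

for sample in samples:
    assert sample[0] == tokenizer.bos_token_id
    print(sample)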