From e7ab2e85af8f538b9767025c701bc437b4763b62 Mon Sep 17 00:00:00 2001 From: Siwei Li Date: Sat, 9 Mar 2024 21:03:15 -0800 Subject: [PATCH 1/8] Add function to tokenize text stories and split into batches --- src/delphi/train/dataset_tokenization.py | 39 ++++++++++++++++++++ tests/train/test_tokenizer.py | 47 ++++++++++++++++++++++++ 2 files changed, 86 insertions(+) create mode 100644 src/delphi/train/dataset_tokenization.py create mode 100644 tests/train/test_tokenizer.py diff --git a/src/delphi/train/dataset_tokenization.py b/src/delphi/train/dataset_tokenization.py new file mode 100644 index 00000000..5ca3371f --- /dev/null +++ b/src/delphi/train/dataset_tokenization.py @@ -0,0 +1,39 @@ +from collections import deque +from typing import Union + +from transformers import PreTrainedTokenizerBase + + +def get_tokenized_batches( + text_stories: Union[list[str], list[list[int]]], + tokenizer: PreTrainedTokenizerBase, + context_size: int, + input_tokenized=False, +) -> list[list[int]]: + dq = deque() + samples = [] + + prompt_idx = 0 + while prompt_idx < len(text_stories): + while len(dq) < context_size: + text_story = text_stories[prompt_idx] + if not input_tokenized: + dq.extend( + tokenizer.encode(text_story, add_special_tokens=False) + + [tokenizer.eos_token_id] + ) + else: + dq.extend(text_story) + dq.append(tokenizer.eos_token_id) + prompt_idx += 1 + + sample = [tokenizer.bos_token_id] + for i in range(context_size - 1): # peek at and not pop the last element + sample.append(dq.popleft()) + sample.append(dq[0]) + + samples.append(sample) + + if dq: + samples.append([tokenizer.bos_token_id] + list(dq)) + return samples diff --git a/tests/train/test_tokenizer.py b/tests/train/test_tokenizer.py new file mode 100644 index 00000000..ca0415f8 --- /dev/null +++ b/tests/train/test_tokenizer.py @@ -0,0 +1,47 @@ +from transformers import AutoTokenizer + +from delphi.train.dataset_tokenization import get_tokenized_batches + + +def test_get_tokenized_batches(): + CTX_SIZE = 10 + tokenizer = AutoTokenizer.from_pretrained("delphi-suite/v0-llama2-tokenizer") + + text_stories = [ + "Once upon a", + "Mother woke up alert. She put on her coat", + "Once upon a time, in a small town, there was a weird", + "Once upon a time, there was a", + "Sara and Tom are friends. 
They like to play in the park.", + ] + correct_batches = [ + [1, 432, 440, 261, 2, 367, 501, 1917, 372, 3398, 4037], + [1, 4037, 341, 577, 359, 342, 1854, 2, 432, 440, 261], + [1, 261, 403, 4045, 317, 261, 560, 1000, 4045, 406, 286], + [1, 286, 261, 2567, 2, 432, 440, 261, 403, 4045, 406], + [1, 406, 286, 261, 2, 787, 269, 396, 484, 415, 4037], + [1, 4037, 311, 519, 268, 326, 317, 264, 525, 4037, 2], + ] + assert get_tokenized_batches(text_stories, tokenizer, CTX_SIZE) == correct_batches + + tokenized_stories = [ + [1618, 3520, 2223, 3961, 853, 3376, 1820, 1442, 1573], + [46, 3515, 2941, 1637, 1377], + [1439, 3378, 3897, 3807, 343, 1140, 3843, 3848, 1343, 3812, 947, 2871, 1973], + [1163, 1358, 1930, 3590, 2216, 3659, 278], + [604, 2920, 1330, 2240, 786, 4088, 1416, 2122, 1556, 3501, 3159, 3427], + ] + correct_batches = [ + [1, 1618, 3520, 2223, 3961, 853, 3376, 1820, 1442, 1573, 2], + [1, 2, 46, 3515, 2941, 1637, 1377, 2, 1439, 3378, 3897], + [1, 3897, 3807, 343, 1140, 3843, 3848, 1343, 3812, 947, 2871], + [1, 2871, 1973, 2, 1163, 1358, 1930, 3590, 2216, 3659, 278], + [1, 278, 2, 604, 2920, 1330, 2240, 786, 4088, 1416, 2122], + [1, 2122, 1556, 3501, 3159, 3427, 2], + ] + assert ( + get_tokenized_batches( + tokenized_stories, tokenizer, CTX_SIZE, input_tokenized=True + ) + == correct_batches + ) From 2e69942e16168795ce5579669d70472ab81b65d6 Mon Sep 17 00:00:00 2001 From: Siwei Li Date: Tue, 12 Mar 2024 19:39:07 -0700 Subject: [PATCH 2/8] Split the tokenization function into two parts, fixing the while-loop issues --- src/delphi/train/dataset_tokenization.py | 59 +++++++++++-------- tests/train/test_tokenizer.py | 73 ++++++++++++++++-------- 2 files changed, 84 insertions(+), 48 deletions(-) diff --git a/src/delphi/train/dataset_tokenization.py b/src/delphi/train/dataset_tokenization.py index 5ca3371f..ffc0a985 100644 --- a/src/delphi/train/dataset_tokenization.py +++ b/src/delphi/train/dataset_tokenization.py @@ -1,39 +1,50 @@ from collections import deque -from typing import Union from transformers import PreTrainedTokenizerBase +def extend_deque( + dq: deque[int], + context_size: int, + text_stories: list[str], + prompt_idx: int, + tokenizer: PreTrainedTokenizerBase, +) -> int: + while len(dq) < context_size and prompt_idx < len(text_stories): + text_story = text_stories[prompt_idx] + dq.extend( + tokenizer.encode(text_story, add_special_tokens=False) + + [tokenizer.eos_token_id] + ) + prompt_idx += 1 + return prompt_idx + + +def make_new_samples( + dq: deque[int], context_size: int, tokenizer: PreTrainedTokenizerBase +) -> list[list[int]]: + samples = [] + while len(dq) >= context_size: + sample = [tokenizer.bos_token_id] + for _ in range(context_size - 1): # peek at and not pop the last element + sample.append(dq.popleft()) + sample.append(dq[0]) + samples.append(sample) + return samples + + def get_tokenized_batches( - text_stories: Union[list[str], list[list[int]]], + text_stories: list[str], tokenizer: PreTrainedTokenizerBase, context_size: int, - input_tokenized=False, ) -> list[list[int]]: dq = deque() + prompt_idx = 0 samples = [] - prompt_idx = 0 while prompt_idx < len(text_stories): - while len(dq) < context_size: - text_story = text_stories[prompt_idx] - if not input_tokenized: - dq.extend( - tokenizer.encode(text_story, add_special_tokens=False) - + [tokenizer.eos_token_id] - ) - else: - dq.extend(text_story) - dq.append(tokenizer.eos_token_id) - prompt_idx += 1 - - sample = [tokenizer.bos_token_id] - for i in range(context_size - 1): # peek at and not pop the last element - 
sample.append(dq.popleft()) - sample.append(dq[0]) - - samples.append(sample) + prompt_idx = extend_deque(dq, context_size, text_stories, prompt_idx, tokenizer) + samples.extend(make_new_samples(dq, context_size, tokenizer)) - if dq: - samples.append([tokenizer.bos_token_id] + list(dq)) + # We discard the last chunk, so no processing on the remainder of the deque here return samples diff --git a/tests/train/test_tokenizer.py b/tests/train/test_tokenizer.py index ca0415f8..9b3552de 100644 --- a/tests/train/test_tokenizer.py +++ b/tests/train/test_tokenizer.py @@ -1,11 +1,58 @@ +import collections +import random + from transformers import AutoTokenizer -from delphi.train.dataset_tokenization import get_tokenized_batches +from delphi.eval.utils import load_validation_dataset +from delphi.train.dataset_tokenization import ( + extend_deque, + get_tokenized_batches, + make_new_samples, +) + +tokenizer = AutoTokenizer.from_pretrained("delphi-suite/stories-tokenizer") + + +def test_extend_deque(): + CTX_SIZE = 10 + dataset = load_validation_dataset("delphi-suite/tinystories-v2-clean") + text_stories = dataset["story"][:100] + prompt_idx = 0 + dq = collections.deque() + + while prompt_idx < len(text_stories): + prompt_idx = extend_deque(dq, CTX_SIZE, text_stories, prompt_idx, tokenizer) + if prompt_idx < len(text_stories) - 1: + assert len(dq) >= CTX_SIZE + while len(dq) >= CTX_SIZE: + for _ in range(CTX_SIZE - 1): + dq.popleft() + + +def test_make_new_sample(): + for _ in range(100): + total_tokens = random.randint(100, 1000) + context_size = random.randint(5, total_tokens // 2) + dq = collections.deque([random.randint(3, 1000) for _ in range(total_tokens)]) + samples = make_new_samples(dq, context_size, tokenizer) + tokens_cnt = 0 + for i, sample in enumerate(samples): + assert sample[0] == tokenizer.bos_token_id + if i > 0: + assert sample[1] == samples[i - 1][-1] + tokens_cnt += len(sample) + + # We discard the last chunk so the following lines are only for testing + tokens_cnt += 1 + len(dq) # the last batch with BOS in the beginning + assert tokens_cnt == total_tokens + ( + 2 * len(samples) + 1 + ) # BOS for each batch + overlapping of the last tokens in the batches + assert len(dq) > 0 # always leaving at least one element in the deque def test_get_tokenized_batches(): CTX_SIZE = 10 - tokenizer = AutoTokenizer.from_pretrained("delphi-suite/v0-llama2-tokenizer") + tokenizer = AutoTokenizer.from_pretrained("delphi-suite/stories-tokenizer") text_stories = [ "Once upon a", @@ -23,25 +70,3 @@ def test_get_tokenized_batches(): [1, 4037, 311, 519, 268, 326, 317, 264, 525, 4037, 2], ] assert get_tokenized_batches(text_stories, tokenizer, CTX_SIZE) == correct_batches - - tokenized_stories = [ - [1618, 3520, 2223, 3961, 853, 3376, 1820, 1442, 1573], - [46, 3515, 2941, 1637, 1377], - [1439, 3378, 3897, 3807, 343, 1140, 3843, 3848, 1343, 3812, 947, 2871, 1973], - [1163, 1358, 1930, 3590, 2216, 3659, 278], - [604, 2920, 1330, 2240, 786, 4088, 1416, 2122, 1556, 3501, 3159, 3427], - ] - correct_batches = [ - [1, 1618, 3520, 2223, 3961, 853, 3376, 1820, 1442, 1573, 2], - [1, 2, 46, 3515, 2941, 1637, 1377, 2, 1439, 3378, 3897], - [1, 3897, 3807, 343, 1140, 3843, 3848, 1343, 3812, 947, 2871], - [1, 2871, 1973, 2, 1163, 1358, 1930, 3590, 2216, 3659, 278], - [1, 278, 2, 604, 2920, 1330, 2240, 786, 4088, 1416, 2122], - [1, 2122, 1556, 3501, 3159, 3427, 2], - ] - assert ( - get_tokenized_batches( - tokenized_stories, tokenizer, CTX_SIZE, input_tokenized=True - ) - == correct_batches - ) From 
7609e4f66253f6b1f3a48d6fd36da00298356df8 Mon Sep 17 00:00:00 2001 From: Siwei Li Date: Sun, 17 Mar 2024 18:35:53 -0700 Subject: [PATCH 3/8] Add docstrings to the functions --- src/delphi/train/dataset_tokenization.py | 46 ++++++++++++++++++++++++ tests/train/test_tokenizer.py | 13 +++++-- 2 files changed, 57 insertions(+), 2 deletions(-) diff --git a/src/delphi/train/dataset_tokenization.py b/src/delphi/train/dataset_tokenization.py index ffc0a985..23309e4d 100644 --- a/src/delphi/train/dataset_tokenization.py +++ b/src/delphi/train/dataset_tokenization.py @@ -10,6 +10,23 @@ def extend_deque( prompt_idx: int, tokenizer: PreTrainedTokenizerBase, ) -> int: + """ + Extends the deque with tokenized text stories until the deque grows large + enough to reach the context size, or until all text stories are processed. + + The usage of a deque here aims to save the memory as opposed to + load all the stories and tokenize them at once. + + Args: + dq (deque[int]): Deque to extend with tokenized tokens. + context_size (int): Size of the context(input sequences). + text_stories (list[str]): List of (untokenized) text stories to be tokenized. + prompt_idx (int): Index of the current text story. + tokenizer (PreTrainedTokenizerBase): Tokenizer to encode the text strings. + + Returns: + int: Updated index in the text stories dataset. + """ while len(dq) < context_size and prompt_idx < len(text_stories): text_story = text_stories[prompt_idx] dq.extend( @@ -23,6 +40,22 @@ def extend_deque( def make_new_samples( dq: deque[int], context_size: int, tokenizer: PreTrainedTokenizerBase ) -> list[list[int]]: + """ + Generates new samples for training by creating sequences of tokens + from the deque until the deque is empty. + + Note: the model is unable to use the last token in an input sequence, + so we repeat this token in the next input sequence. + + Args: + dq (deque[int]): Deque containing tokenized tokens. + context_size (int): Size of the context (input sequences). + tokenizer (PreTrainedTokenizerBase): Tokenizer to encode the text strings. + + Returns: + list[list[int]]: List of token sequences of the same length(context_size). + """ + samples = [] while len(dq) >= context_size: sample = [tokenizer.bos_token_id] @@ -38,6 +71,19 @@ def get_tokenized_batches( tokenizer: PreTrainedTokenizerBase, context_size: int, ) -> list[list[int]]: + """ + Tokenizes the input text stories using the provided tokenizer and + generates token sequences of the specified length. + + Args: + text_stories (list[str]): List of text stories to be tokenized. + tokenizer (PreTrainedTokenizerBase): Tokenizer to encode the text strings. + context_size (int): Size of the context (input sequences). + + Returns: + list[list[int]]: List of token sequences of length equal to context_size. 
+ """ + dq = deque() prompt_idx = 0 samples = [] diff --git a/tests/train/test_tokenizer.py b/tests/train/test_tokenizer.py index 9b3552de..a10f72a4 100644 --- a/tests/train/test_tokenizer.py +++ b/tests/train/test_tokenizer.py @@ -15,14 +15,21 @@ def test_extend_deque(): CTX_SIZE = 10 - dataset = load_validation_dataset("delphi-suite/tinystories-v2-clean") - text_stories = dataset["story"][:100] + # generate 100 random stories + text_stories = [ + [ + random.randint(3, tokenizer.vocab_size) + for _ in range(random.randint(100, 800)) + ] + for _ in range(100) + ] prompt_idx = 0 dq = collections.deque() while prompt_idx < len(text_stories): prompt_idx = extend_deque(dq, CTX_SIZE, text_stories, prompt_idx, tokenizer) if prompt_idx < len(text_stories) - 1: + # assert that the deque has grown large enough in each round assert len(dq) >= CTX_SIZE while len(dq) >= CTX_SIZE: for _ in range(CTX_SIZE - 1): @@ -39,6 +46,8 @@ def test_make_new_sample(): for i, sample in enumerate(samples): assert sample[0] == tokenizer.bos_token_id if i > 0: + # assert that there is an overlap of the last token in the previous sample + # and the first token in its following sample assert sample[1] == samples[i - 1][-1] tokens_cnt += len(sample) From b98f81de8de0d1bb8d44550fbd5ef6fd3e260ef5 Mon Sep 17 00:00:00 2001 From: Siwei Li Date: Tue, 19 Mar 2024 15:57:00 -0700 Subject: [PATCH 4/8] Minor edits in the code, fix the test --- src/delphi/dataset/tokenization.py | 102 +++++++++++++++++++++++++++++ tests/dataset/test_tokenizer.py | 83 +++++++++++++++++++++++ 2 files changed, 185 insertions(+) create mode 100644 src/delphi/dataset/tokenization.py create mode 100644 tests/dataset/test_tokenizer.py diff --git a/src/delphi/dataset/tokenization.py b/src/delphi/dataset/tokenization.py new file mode 100644 index 00000000..295ff938 --- /dev/null +++ b/src/delphi/dataset/tokenization.py @@ -0,0 +1,102 @@ +from collections import deque +from typing import Optional + +from transformers import PreTrainedTokenizerBase + + +def extend_deque( + dq: deque[int], + context_size: int, + text_documents: list[str], + doc_idx: int, + tokenizer: PreTrainedTokenizerBase, +) -> int: + """ + Extends the deque with tokenized text documents until the deque grows large + enough to reach the context size, or until all text documents are processed. + + The usage of a deque here aims to save the memory as opposed to + load all the documents and tokenize them at once. + + Args: + dq: Deque to extend with tokenized tokens. + context_size: Size of the context(input sequences). + text_documents: List of (untokenized) text documents to be tokenized. + doc_idx: Index of the current text story. + tokenizer: Tokenizer to encode the text strings. + Returns: + int: Updated index in the text documents dataset. + """ + while len(dq) < context_size and doc_idx < len(text_documents): + text_story = text_documents[doc_idx] + dq.extend( + tokenizer.encode(text_story, add_special_tokens=False) + + [tokenizer.eos_token_id] + ) + doc_idx += 1 + return doc_idx + + +def make_new_samples( + dq: deque[int], context_size: int, bos_token_id: Optional[int] +) -> list[list[int]]: + """ + Generates new samples for training by creating sequences of tokens + from the deque until the deque does not hold enough tokens to generate + another sample. + + Note: the model is unable to use the last token in an input sequence, + so we repeat this token in the next input sequence. + + Args: + dq: Deque containing tokenized tokens. + context_size: Size of the context (input sequences). 
+ bos_token_id: bos_token_id of the tokenizer used. + + Returns: + list[list[int]]: List of token sequences of the same length(context_size). + """ + + samples = [] + while len(dq) >= context_size: + sample = [bos_token_id] + + # For the first (n-1) elements, pop from the left of the deque + # and add to the new sample, the n-th element will be retained + # in the deque for making the next sample. + for _ in range(context_size - 1): + sample.append(dq.popleft()) + sample.append(dq[0]) + + samples.append(sample) + return samples + + +def get_tokenized_batches( + text_documents: list[str], + tokenizer: PreTrainedTokenizerBase, + context_size: int, +) -> list[list[int]]: + """ + Tokenizes the input text documents using the provided tokenizer and + generates token sequences of the specified length. + + Args: + text_documents: List[str], + tokenizer, + context_size, + + Returns: + list[list[int]]: List of token sequences of length equal to context_size. + """ + + dq = deque() + doc_idx = 0 + samples = [] + + while doc_idx < len(text_documents): + doc_idx = extend_deque(dq, context_size, text_documents, doc_idx, tokenizer) + samples.extend(make_new_samples(dq, context_size, tokenizer.bos_token_id)) + + # We discard the last chunk, so no processing on the remainder of the deque here + return samples diff --git a/tests/dataset/test_tokenizer.py b/tests/dataset/test_tokenizer.py new file mode 100644 index 00000000..43e558c2 --- /dev/null +++ b/tests/dataset/test_tokenizer.py @@ -0,0 +1,83 @@ +import collections +import random + +from transformers import AutoTokenizer + +from delphi.dataset.tokenization import ( + extend_deque, + get_tokenized_batches, + make_new_samples, +) +from delphi.eval.utils import load_validation_dataset + +tokenizer = AutoTokenizer.from_pretrained("delphi-suite/stories-tokenizer") + + +def test_extend_deque(): + CTX_SIZE = 10 + # generate 100 random stories + text_stories = [ + " ".join( + [ + tokenizer.decode(random.randint(3, tokenizer.vocab_size)) + for _ in range(random.randint(100, 800)) + ] + ) + for _ in range(100) + ] + prompt_idx = 0 + dq = collections.deque() + + while prompt_idx < len(text_stories): + prompt_idx = extend_deque(dq, CTX_SIZE, text_stories, prompt_idx, tokenizer) + if prompt_idx < len(text_stories) - 1: + # assert that the deque has grown large enough in each round + assert len(dq) >= CTX_SIZE + while len(dq) >= CTX_SIZE: + for _ in range(CTX_SIZE - 1): + dq.popleft() + + +def test_make_new_sample(): + for _ in range(100): + total_tokens = random.randint(100, 1000) + context_size = random.randint(5, total_tokens // 2) + dq = collections.deque(random.choices(range(3, 1000), k=total_tokens)) + samples = make_new_samples(dq, context_size, tokenizer.bos_token_id) + tokens_cnt = 0 + for i, sample in enumerate(samples): + assert sample[0] == tokenizer.bos_token_id + if i > 0: + # assert that there is an overlap of the last token in the previous sample + # and the first token in its following sample + assert sample[1] == samples[i - 1][-1] + tokens_cnt += len(sample) + + # We discard the last chunk so the following lines are only for testing + tokens_cnt += 1 + len(dq) # the last batch with BOS in the beginning + assert tokens_cnt == total_tokens + ( + 2 * len(samples) + 1 + ) # BOS for each batch + overlapping of the last tokens in the batches + assert len(dq) > 0 # always leaving at least one element in the deque + + +def test_get_tokenized_batches(): + CTX_SIZE = 10 + tokenizer = AutoTokenizer.from_pretrained("delphi-suite/stories-tokenizer") + + 
text_stories = [ + "Once upon a", + "Mother woke up alert. She put on her coat", + "Once upon a time, in a small town, there was a weird", + "Once upon a time, there was a", + "Sara and Tom are friends. They like to play in the park.", + ] + correct_batches = [ + [1, 432, 440, 261, 2, 367, 501, 1917, 372, 3398, 4037], + [1, 4037, 341, 577, 359, 342, 1854, 2, 432, 440, 261], + [1, 261, 403, 4045, 317, 261, 560, 1000, 4045, 406, 286], + [1, 286, 261, 2567, 2, 432, 440, 261, 403, 4045, 406], + [1, 406, 286, 261, 2, 787, 269, 396, 484, 415, 4037], + [1, 4037, 311, 519, 268, 326, 317, 264, 525, 4037, 2], + ] + assert get_tokenized_batches(text_stories, tokenizer, CTX_SIZE) == correct_batches From 9869df7bef942ef75f69f4f27682c483745bc551 Mon Sep 17 00:00:00 2001 From: Siwei Li Date: Wed, 20 Mar 2024 10:39:38 -0700 Subject: [PATCH 5/8] Uses batch_encode() method to save time --- src/delphi/dataset/tokenization.py | 21 +++++++++++++-------- tests/dataset/test_tokenizer.py | 18 ++++++++++++++---- 2 files changed, 27 insertions(+), 12 deletions(-) diff --git a/src/delphi/dataset/tokenization.py b/src/delphi/dataset/tokenization.py index 295ff938..f340b4fa 100644 --- a/src/delphi/dataset/tokenization.py +++ b/src/delphi/dataset/tokenization.py @@ -10,6 +10,7 @@ def extend_deque( text_documents: list[str], doc_idx: int, tokenizer: PreTrainedTokenizerBase, + batch_size: int, ) -> int: """ Extends the deque with tokenized text documents until the deque grows large @@ -28,17 +29,18 @@ def extend_deque( int: Updated index in the text documents dataset. """ while len(dq) < context_size and doc_idx < len(text_documents): - text_story = text_documents[doc_idx] - dq.extend( - tokenizer.encode(text_story, add_special_tokens=False) - + [tokenizer.eos_token_id] - ) - doc_idx += 1 + text_doc = text_documents[doc_idx : doc_idx + batch_size] + batch_input_ids = tokenizer( + text_doc, return_attention_mask=False, add_special_tokens=False + )["input_ids"] + for input_ids in batch_input_ids: + dq.extend(input_ids + [tokenizer.eos_token_id]) + doc_idx += batch_size return doc_idx def make_new_samples( - dq: deque[int], context_size: int, bos_token_id: Optional[int] + dq: deque[int], context_size: int, bos_token_id: int ) -> list[list[int]]: """ Generates new samples for training by creating sequences of tokens @@ -76,6 +78,7 @@ def get_tokenized_batches( text_documents: list[str], tokenizer: PreTrainedTokenizerBase, context_size: int, + batch_size: int, ) -> list[list[int]]: """ Tokenizes the input text documents using the provided tokenizer and @@ -95,7 +98,9 @@ def get_tokenized_batches( samples = [] while doc_idx < len(text_documents): - doc_idx = extend_deque(dq, context_size, text_documents, doc_idx, tokenizer) + doc_idx = extend_deque( + dq, context_size, text_documents, doc_idx, tokenizer, batch_size + ) samples.extend(make_new_samples(dq, context_size, tokenizer.bos_token_id)) # We discard the last chunk, so no processing on the remainder of the deque here diff --git a/tests/dataset/test_tokenizer.py b/tests/dataset/test_tokenizer.py index 43e558c2..a24d1d82 100644 --- a/tests/dataset/test_tokenizer.py +++ b/tests/dataset/test_tokenizer.py @@ -1,6 +1,7 @@ import collections import random +import pytest from transformers import AutoTokenizer from delphi.dataset.tokenization import ( @@ -10,11 +11,15 @@ ) from delphi.eval.utils import load_validation_dataset -tokenizer = AutoTokenizer.from_pretrained("delphi-suite/stories-tokenizer") + +@pytest.fixture +def tokenizer(): + return 
AutoTokenizer.from_pretrained("delphi-suite/stories-tokenizer") def test_extend_deque(): CTX_SIZE = 10 + BATCH_SIZE = 2 # generate 100 random stories text_stories = [ " ".join( @@ -29,7 +34,9 @@ def test_extend_deque(): dq = collections.deque() while prompt_idx < len(text_stories): - prompt_idx = extend_deque(dq, CTX_SIZE, text_stories, prompt_idx, tokenizer) + prompt_idx = extend_deque( + dq, CTX_SIZE, text_stories, prompt_idx, tokenizer, BATCH_SIZE + ) if prompt_idx < len(text_stories) - 1: # assert that the deque has grown large enough in each round assert len(dq) >= CTX_SIZE @@ -63,7 +70,7 @@ def test_make_new_sample(): def test_get_tokenized_batches(): CTX_SIZE = 10 - tokenizer = AutoTokenizer.from_pretrained("delphi-suite/stories-tokenizer") + BATCH_SIZE = 2 text_stories = [ "Once upon a", @@ -80,4 +87,7 @@ def test_get_tokenized_batches(): [1, 406, 286, 261, 2, 787, 269, 396, 484, 415, 4037], [1, 4037, 311, 519, 268, 326, 317, 264, 525, 4037, 2], ] - assert get_tokenized_batches(text_stories, tokenizer, CTX_SIZE) == correct_batches + assert ( + get_tokenized_batches(text_stories, tokenizer, CTX_SIZE, BATCH_SIZE) + == correct_batches + ) From 7bd9ef9039b037972d23a44a9925c002b72f0fac Mon Sep 17 00:00:00 2001 From: Siwei Li Date: Wed, 20 Mar 2024 11:23:13 -0700 Subject: [PATCH 6/8] Add script to upload to delphi-suite/batched-tokenized-stories --- scripts/tokenize_dataset.py | 78 +++++++++++++++++++ src/delphi/train/dataset_tokenization.py | 96 ------------------------ tests/dataset/test_tokenizer.py | 6 +- 3 files changed, 81 insertions(+), 99 deletions(-) create mode 100755 scripts/tokenize_dataset.py delete mode 100644 src/delphi/train/dataset_tokenization.py diff --git a/scripts/tokenize_dataset.py b/scripts/tokenize_dataset.py new file mode 100755 index 00000000..7bafff84 --- /dev/null +++ b/scripts/tokenize_dataset.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python3 + +import argparse + +from datasets import Dataset +from transformers import AutoTokenizer + +from delphi.dataset.tokenization import get_tokenized_batches +from delphi.eval.utils import load_validation_dataset + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="") + + parser.add_argument( + "--input-dataset-name", + type=str, + help="Text dataset from huggingface to tokenize", + ) + parser.add_argument( + "--output-dataset-name", + type=str, + help="Name of the tokenized dataset to upload to huggingface", + ) + parser.add_argument( + "--tokenizer-name", + type=str, + help="Name of the tokenizer from huggingface", + ) + parser.add_argument( + "--token", + type=str, + help="Hugging Face API token", + ) + parser.add_argument( + "--context-size", + type=int, + default=512, + help="Context size of the tokenized dataset as input of the model", + ) + parser.add_argument( + "--batch-size", + type=int, + default=50, + help="Batch size of text inputs into the tokenizer", + ) + parser.add_argument( + "--column-name", + type=str, + help="Name of the column containing text documents in the input dataset", + ) + args = parser.parse_args() + + input_dataset = load_validation_dataset(f"delphi-suite/{args.input_dataset_name}") + tokenizer = AutoTokenizer.from_pretrained(f"delphi-suite/{args.tokenizer_name}") + + if args.column_name: + text_docs = input_dataset[args.column_name] + else: + if len(input_dataset.column_names) > 1: + raise ValueError("There are more than one column in the specified dataset") + text_docs = input_dataset[input_dataset.column_names[0]] + + output_dataset = Dataset.from_dict( + { + "tokens": 
get_tokenized_batches( + text_docs, + tokenizer, + context_size=args.context_size, + batch_size=args.batch_size, + ) + } + ) + + output_dataset.push_to_hub( + repo_id=f"delphi-suite/{args.output_dataset_name}", + private=False, + token=args.token, + ) diff --git a/src/delphi/train/dataset_tokenization.py b/src/delphi/train/dataset_tokenization.py deleted file mode 100644 index 23309e4d..00000000 --- a/src/delphi/train/dataset_tokenization.py +++ /dev/null @@ -1,96 +0,0 @@ -from collections import deque - -from transformers import PreTrainedTokenizerBase - - -def extend_deque( - dq: deque[int], - context_size: int, - text_stories: list[str], - prompt_idx: int, - tokenizer: PreTrainedTokenizerBase, -) -> int: - """ - Extends the deque with tokenized text stories until the deque grows large - enough to reach the context size, or until all text stories are processed. - - The usage of a deque here aims to save the memory as opposed to - load all the stories and tokenize them at once. - - Args: - dq (deque[int]): Deque to extend with tokenized tokens. - context_size (int): Size of the context(input sequences). - text_stories (list[str]): List of (untokenized) text stories to be tokenized. - prompt_idx (int): Index of the current text story. - tokenizer (PreTrainedTokenizerBase): Tokenizer to encode the text strings. - - Returns: - int: Updated index in the text stories dataset. - """ - while len(dq) < context_size and prompt_idx < len(text_stories): - text_story = text_stories[prompt_idx] - dq.extend( - tokenizer.encode(text_story, add_special_tokens=False) - + [tokenizer.eos_token_id] - ) - prompt_idx += 1 - return prompt_idx - - -def make_new_samples( - dq: deque[int], context_size: int, tokenizer: PreTrainedTokenizerBase -) -> list[list[int]]: - """ - Generates new samples for training by creating sequences of tokens - from the deque until the deque is empty. - - Note: the model is unable to use the last token in an input sequence, - so we repeat this token in the next input sequence. - - Args: - dq (deque[int]): Deque containing tokenized tokens. - context_size (int): Size of the context (input sequences). - tokenizer (PreTrainedTokenizerBase): Tokenizer to encode the text strings. - - Returns: - list[list[int]]: List of token sequences of the same length(context_size). - """ - - samples = [] - while len(dq) >= context_size: - sample = [tokenizer.bos_token_id] - for _ in range(context_size - 1): # peek at and not pop the last element - sample.append(dq.popleft()) - sample.append(dq[0]) - samples.append(sample) - return samples - - -def get_tokenized_batches( - text_stories: list[str], - tokenizer: PreTrainedTokenizerBase, - context_size: int, -) -> list[list[int]]: - """ - Tokenizes the input text stories using the provided tokenizer and - generates token sequences of the specified length. - - Args: - text_stories (list[str]): List of text stories to be tokenized. - tokenizer (PreTrainedTokenizerBase): Tokenizer to encode the text strings. - context_size (int): Size of the context (input sequences). - - Returns: - list[list[int]]: List of token sequences of length equal to context_size. 
- """ - - dq = deque() - prompt_idx = 0 - samples = [] - - while prompt_idx < len(text_stories): - prompt_idx = extend_deque(dq, context_size, text_stories, prompt_idx, tokenizer) - samples.extend(make_new_samples(dq, context_size, tokenizer)) - - # We discard the last chunk, so no processing on the remainder of the deque here - return samples diff --git a/tests/dataset/test_tokenizer.py b/tests/dataset/test_tokenizer.py index a24d1d82..55f0fcbf 100644 --- a/tests/dataset/test_tokenizer.py +++ b/tests/dataset/test_tokenizer.py @@ -17,7 +17,7 @@ def tokenizer(): return AutoTokenizer.from_pretrained("delphi-suite/stories-tokenizer") -def test_extend_deque(): +def test_extend_deque(tokenizer): CTX_SIZE = 10 BATCH_SIZE = 2 # generate 100 random stories @@ -45,7 +45,7 @@ def test_extend_deque(): dq.popleft() -def test_make_new_sample(): +def test_make_new_sample(tokenizer): for _ in range(100): total_tokens = random.randint(100, 1000) context_size = random.randint(5, total_tokens // 2) @@ -68,7 +68,7 @@ def test_make_new_sample(): assert len(dq) > 0 # always leaving at least one element in the deque -def test_get_tokenized_batches(): +def test_get_tokenized_batches(tokenizer): CTX_SIZE = 10 BATCH_SIZE = 2 From c5c0e09f6fda457febf93bb5f4a49dab9d192abf Mon Sep 17 00:00:00 2001 From: Siwei Li Date: Wed, 20 Mar 2024 11:46:06 -0700 Subject: [PATCH 7/8] Remove the test file in tests/train to pass pytest --- tests/train/test_tokenizer.py | 81 ----------------------------------- 1 file changed, 81 deletions(-) delete mode 100644 tests/train/test_tokenizer.py diff --git a/tests/train/test_tokenizer.py b/tests/train/test_tokenizer.py deleted file mode 100644 index a10f72a4..00000000 --- a/tests/train/test_tokenizer.py +++ /dev/null @@ -1,81 +0,0 @@ -import collections -import random - -from transformers import AutoTokenizer - -from delphi.eval.utils import load_validation_dataset -from delphi.train.dataset_tokenization import ( - extend_deque, - get_tokenized_batches, - make_new_samples, -) - -tokenizer = AutoTokenizer.from_pretrained("delphi-suite/stories-tokenizer") - - -def test_extend_deque(): - CTX_SIZE = 10 - # generate 100 random stories - text_stories = [ - [ - random.randint(3, tokenizer.vocab_size) - for _ in range(random.randint(100, 800)) - ] - for _ in range(100) - ] - prompt_idx = 0 - dq = collections.deque() - - while prompt_idx < len(text_stories): - prompt_idx = extend_deque(dq, CTX_SIZE, text_stories, prompt_idx, tokenizer) - if prompt_idx < len(text_stories) - 1: - # assert that the deque has grown large enough in each round - assert len(dq) >= CTX_SIZE - while len(dq) >= CTX_SIZE: - for _ in range(CTX_SIZE - 1): - dq.popleft() - - -def test_make_new_sample(): - for _ in range(100): - total_tokens = random.randint(100, 1000) - context_size = random.randint(5, total_tokens // 2) - dq = collections.deque([random.randint(3, 1000) for _ in range(total_tokens)]) - samples = make_new_samples(dq, context_size, tokenizer) - tokens_cnt = 0 - for i, sample in enumerate(samples): - assert sample[0] == tokenizer.bos_token_id - if i > 0: - # assert that there is an overlap of the last token in the previous sample - # and the first token in its following sample - assert sample[1] == samples[i - 1][-1] - tokens_cnt += len(sample) - - # We discard the last chunk so the following lines are only for testing - tokens_cnt += 1 + len(dq) # the last batch with BOS in the beginning - assert tokens_cnt == total_tokens + ( - 2 * len(samples) + 1 - ) # BOS for each batch + overlapping of the last tokens in the 
batches - assert len(dq) > 0 # always leaving at least one element in the deque - - -def test_get_tokenized_batches(): - CTX_SIZE = 10 - tokenizer = AutoTokenizer.from_pretrained("delphi-suite/stories-tokenizer") - - text_stories = [ - "Once upon a", - "Mother woke up alert. She put on her coat", - "Once upon a time, in a small town, there was a weird", - "Once upon a time, there was a", - "Sara and Tom are friends. They like to play in the park.", - ] - correct_batches = [ - [1, 432, 440, 261, 2, 367, 501, 1917, 372, 3398, 4037], - [1, 4037, 341, 577, 359, 342, 1854, 2, 432, 440, 261], - [1, 261, 403, 4045, 317, 261, 560, 1000, 4045, 406, 286], - [1, 286, 261, 2567, 2, 432, 440, 261, 403, 4045, 406], - [1, 406, 286, 261, 2, 787, 269, 396, 484, 415, 4037], - [1, 4037, 311, 519, 268, 326, 317, 264, 525, 4037, 2], - ] - assert get_tokenized_batches(text_stories, tokenizer, CTX_SIZE) == correct_batches From ba1b10999b7722e5d26a06b6940ab2d1925f75fc Mon Sep 17 00:00:00 2001 From: Siwei Li Date: Sat, 23 Mar 2024 10:41:30 -0700 Subject: [PATCH 8/8] Update function name --- scripts/tokenize_dataset.py | 4 ++-- src/delphi/dataset/tokenization.py | 2 +- tests/dataset/test_tokenizer.py | 11 +++-------- 3 files changed, 6 insertions(+), 11 deletions(-) diff --git a/scripts/tokenize_dataset.py b/scripts/tokenize_dataset.py index 7bafff84..5a3d01d5 100755 --- a/scripts/tokenize_dataset.py +++ b/scripts/tokenize_dataset.py @@ -5,7 +5,7 @@ from datasets import Dataset from transformers import AutoTokenizer -from delphi.dataset.tokenization import get_tokenized_batches +from delphi.dataset.tokenization import tokenize_dataset from delphi.eval.utils import load_validation_dataset if __name__ == "__main__": @@ -62,7 +62,7 @@ output_dataset = Dataset.from_dict( { - "tokens": get_tokenized_batches( + "tokens": tokenize_dataset( text_docs, tokenizer, context_size=args.context_size, diff --git a/src/delphi/dataset/tokenization.py b/src/delphi/dataset/tokenization.py index f340b4fa..b800b64b 100644 --- a/src/delphi/dataset/tokenization.py +++ b/src/delphi/dataset/tokenization.py @@ -74,7 +74,7 @@ def make_new_samples( return samples -def get_tokenized_batches( +def tokenize_dataset( text_documents: list[str], tokenizer: PreTrainedTokenizerBase, context_size: int, diff --git a/tests/dataset/test_tokenizer.py b/tests/dataset/test_tokenizer.py index 55f0fcbf..99b2dcb3 100644 --- a/tests/dataset/test_tokenizer.py +++ b/tests/dataset/test_tokenizer.py @@ -4,12 +4,7 @@ import pytest from transformers import AutoTokenizer -from delphi.dataset.tokenization import ( - extend_deque, - get_tokenized_batches, - make_new_samples, -) -from delphi.eval.utils import load_validation_dataset +from delphi.dataset.tokenization import extend_deque, make_new_samples, tokenize_dataset @pytest.fixture @@ -68,7 +63,7 @@ def test_make_new_sample(tokenizer): assert len(dq) > 0 # always leaving at least one element in the deque -def test_get_tokenized_batches(tokenizer): +def test_tokenize_dataset(tokenizer): CTX_SIZE = 10 BATCH_SIZE = 2 @@ -88,6 +83,6 @@ def test_get_tokenized_batches(tokenizer): [1, 4037, 311, 519, 268, 326, 317, 264, 525, 4037, 2], ] assert ( - get_tokenized_batches(text_stories, tokenizer, CTX_SIZE, BATCH_SIZE) + tokenize_dataset(text_stories, tokenizer, CTX_SIZE, BATCH_SIZE) == correct_batches )
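
For reference, below is a minimal usage sketch of the tokenization API in its final form (after patch 8). It is illustrative only and not part of the patch series: it assumes the delphi package with delphi.dataset.tokenization is installed and that the delphi-suite/stories-tokenizer tokenizer referenced in the patches is reachable; the sample documents and the context/batch sizes are made up for demonstration.

# Illustrative sketch of the post-patch-8 API; the documents and the
# context_size/batch_size values below are invented for demonstration.
from transformers import AutoTokenizer

from delphi.dataset.tokenization import tokenize_dataset

tokenizer = AutoTokenizer.from_pretrained("delphi-suite/stories-tokenizer")

documents = [
    "Once upon a time, there was a little robot.",
    "The robot liked to count tokens all day long.",
]

# Each returned sample is context_size + 1 tokens long: a BOS token followed
# by context_size tokens drained from the shared deque. The last token of one
# sample is repeated as the first non-BOS token of the next sample, and any
# leftover tokens shorter than context_size are discarded.
samples = tokenize_dataset(documents, tokenizer, context_size=10, batch_size=2)

for sample in samples:
    assert sample[0] == tokenizer.bos_token_id
    print(sample)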