From 411756a94fe521e17bc1803ed025069019c56bce Mon Sep 17 00:00:00 2001
From: Jett
Date: Tue, 14 May 2024 21:54:55 +0200
Subject: [PATCH] tokenize into seq_len instead of seq_len+1

---
 src/delphi/dataset/tokenization.py | 11 ++++-------
 tests/dataset/test_tokeniation.py  |  4 ++--
 2 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/src/delphi/dataset/tokenization.py b/src/delphi/dataset/tokenization.py
index 4896172a..25d8ec80 100644
--- a/src/delphi/dataset/tokenization.py
+++ b/src/delphi/dataset/tokenization.py
@@ -2,11 +2,8 @@
 import itertools
 from collections import deque
 from collections.abc import Iterator
-from pathlib import Path
 
 from datasets import Dataset
-from huggingface_hub import HfApi
-from tqdm.auto import trange
 from transformers import PreTrainedTokenizerBase
 
 
@@ -47,7 +44,7 @@ def extend_deque(
     return doc_idx
 
 
-def make_new_sample(deq: deque[int], context_size: int, bos_token_id: int) -> list[int]:
+def make_new_sample(deq: deque[int], seq_len: int, bos_token_id: int) -> list[int]:
     """
     Generates new sample for training by creating sequence of tokens
     from the deque until the deque.
@@ -64,10 +61,10 @@ def make_new_sample(deq: deque[int], context_size: int, bos_token_id: int) -> li
         list[int]: token sequence.
     """
     sample = [bos_token_id]
-    # For the first (n-1) elements, pop from the left of the deque
-    # and add to the new sample, the n-th element will be retained
+    # For the first n-2 elements, pop from the left of the deque
+    # and add to the new sample, the (n-1)-th element will be retained
     # in the deque for making the next sample.
-    for _ in range(context_size - 1):
+    for _ in range(seq_len - 2):
         sample.append(deq.popleft())
     sample.append(deq[0])
     return sample
diff --git a/tests/dataset/test_tokeniation.py b/tests/dataset/test_tokeniation.py
index 4d765fbc..a5cf8368 100644
--- a/tests/dataset/test_tokeniation.py
+++ b/tests/dataset/test_tokeniation.py
@@ -73,7 +73,7 @@ def test_make_new_sample(tokenizer):
 
 
 def test_tokenize_dataset(tokenizer):
-    CTX_SIZE = 10
+    SEQ_LEN = 11
     BATCH_SIZE = 2
 
     documents = [
@@ -92,5 +92,5 @@ def test_tokenize_dataset(tokenizer):
         [0, 284, 260, 2606, 1, 431, 440, 260, 399, 13, 402],
         [0, 402, 284, 260, 1, 1370, 268, 415, 484, 412, 15],
     ]
-    actual = [x for x in tokenize_dataset(dataset, tokenizer, CTX_SIZE, BATCH_SIZE)]
+    actual = [x for x in tokenize_dataset(dataset, tokenizer, SEQ_LEN, BATCH_SIZE)]
     assert actual == expected
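
Note (illustration only, not applied by this patch): a minimal sketch of the new
seq_len semantics in make_new_sample, using a toy deque of made-up token ids and
bos_token_id=0. The helper name and values below are hypothetical; only the
popping logic mirrors the patched function. Each sample is BOS plus (seq_len - 2)
popped tokens plus 1 peeked token, i.e. exactly seq_len tokens, with a one-token
overlap between consecutive samples.

    from collections import deque

    def make_new_sample_sketch(deq: deque[int], seq_len: int, bos_token_id: int) -> list[int]:
        # BOS + (seq_len - 2) popped tokens + 1 peeked token = seq_len tokens.
        sample = [bos_token_id]
        for _ in range(seq_len - 2):
            sample.append(deq.popleft())
        # Peek (don't pop) so this token also starts the next sample.
        sample.append(deq[0])
        return sample

    deq = deque(range(1, 30))  # toy token ids
    first = make_new_sample_sketch(deq, seq_len=11, bos_token_id=0)
    second = make_new_sample_sketch(deq, seq_len=11, bos_token_id=0)
    assert len(first) == 11 and len(second) == 11
    assert first[-1] == second[1]  # one-token overlap between consecutive samples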