From 411756a94fe521e17bc1803ed025069019c56bce Mon Sep 17 00:00:00 2001
From: Jett
Date: Tue, 14 May 2024 21:54:55 +0200
Subject: [PATCH] tokenize into seq_len instead of seq_len+1

---
 src/delphi/dataset/tokenization.py | 11 ++++-------
 tests/dataset/test_tokeniation.py  |  4 ++--
 2 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/src/delphi/dataset/tokenization.py b/src/delphi/dataset/tokenization.py
index 4896172a..25d8ec80 100644
--- a/src/delphi/dataset/tokenization.py
+++ b/src/delphi/dataset/tokenization.py
@@ -2,11 +2,8 @@
 import itertools
 from collections import deque
 from collections.abc import Iterator
-from pathlib import Path
 
 from datasets import Dataset
-from huggingface_hub import HfApi
-from tqdm.auto import trange
 from transformers import PreTrainedTokenizerBase
 
 
@@ -47,7 +44,7 @@ def extend_deque(
     return doc_idx
 
 
-def make_new_sample(deq: deque[int], context_size: int, bos_token_id: int) -> list[int]:
+def make_new_sample(deq: deque[int], seq_len: int, bos_token_id: int) -> list[int]:
     """
     Generates new sample for training by creating sequence of tokens
     from the deque until the deque.
@@ -64,10 +61,10 @@ def make_new_sample(deq: deque[int], context_size: int, bos_token_id: int) -> li
         list[int]: token sequence.
     """
     sample = [bos_token_id]
-    # For the first (n-1) elements, pop from the left of the deque
-    # and add to the new sample, the n-th element will be retained
+    # For the first n-2 elements, pop from the left of the deque
+    # and add to the new sample, the (n-1)-th element will be retained
     # in the deque for making the next sample.
-    for _ in range(context_size - 1):
+    for _ in range(seq_len - 2):
         sample.append(deq.popleft())
     sample.append(deq[0])
     return sample
diff --git a/tests/dataset/test_tokeniation.py b/tests/dataset/test_tokeniation.py
index 4d765fbc..a5cf8368 100644
--- a/tests/dataset/test_tokeniation.py
+++ b/tests/dataset/test_tokeniation.py
@@ -73,7 +73,7 @@ def test_make_new_sample(tokenizer):
 
 
 def test_tokenize_dataset(tokenizer):
-    CTX_SIZE = 10
+    SEQ_LEN = 11
     BATCH_SIZE = 2
 
     documents = [
@@ -92,5 +92,5 @@ def test_tokenize_dataset(tokenizer):
         [0, 284, 260, 2606, 1, 431, 440, 260, 399, 13, 402],
         [0, 402, 284, 260, 1, 1370, 268, 415, 484, 412, 15],
     ]
-    actual = [x for x in tokenize_dataset(dataset, tokenizer, CTX_SIZE, BATCH_SIZE)]
+    actual = [x for x in tokenize_dataset(dataset, tokenizer, SEQ_LEN, BATCH_SIZE)]
     assert actual == expected
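
Note (illustration only, not applied by this patch): a minimal sketch of the new
seq_len semantics in make_new_sample, using a toy deque of made-up token ids and
bos_token_id=0. The helper name and values below are hypothetical; only the
popping logic mirrors the patched function. Each sample is BOS plus (seq_len - 2)
popped tokens plus 1 peeked token, i.e. exactly seq_len tokens, with a one-token
overlap between consecutive samples.

    from collections import deque

    def make_new_sample_sketch(deq: deque[int], seq_len: int, bos_token_id: int) -> list[int]:
        # BOS + (seq_len - 2) popped tokens + 1 peeked token = seq_len tokens.
        sample = [bos_token_id]
        for _ in range(seq_len - 2):
            sample.append(deq.popleft())
        # Peek (don't pop) so this token also starts the next sample.
        sample.append(deq[0])
        return sample

    deq = deque(range(1, 30))  # toy token ids
    first = make_new_sample_sketch(deq, seq_len=11, bos_token_id=0)
    second = make_new_sample_sketch(deq, seq_len=11, bos_token_id=0)
    assert len(first) == 11 and len(second) == 11
    assert first[-1] == second[1]  # one-token overlap between consecutive samples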