dataset.py
# Copyright 2023 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Manages datasets for the transformer code."""
# From haiku transformer example.
from collections.abc import Iterable, Iterator
import itertools
import random
from typing import NamedTuple, TypeVar
import numpy as np
VOCAB_SIZE = 128 # Number of ASCII code points.
PAD_TOKEN = 0
_T = TypeVar('_T')
class Batch(NamedTuple):
inputs: np.ndarray # Integer tokens, shape [B, T].
targets: np.ndarray # Integer tokens, shape [B, T].
def repeat(dataset: Iterable[_T]) -> Iterator[_T]:
return itertools.cycle(dataset)
def shuffle(dataset: Iterator[_T], buffer_size: int) -> Iterator[_T]:
buffer = [next(dataset) for _ in range(buffer_size)]
random.shuffle(buffer)
for item in dataset:
idx = random.randint(0, buffer_size - 1) # Inclusive.
result = buffer[idx]
buffer[idx] = item
yield result
def load_ascii_dataset(
corpus: str,
*,
batch_size: int,
sequence_length: int,
num_shuffle_batches: int = 10,
) -> Iterator[Batch]:
"""Loads a single-file ASCII dataset in memory."""
if not corpus.isascii():
raise ValueError('Loaded corpus is not ASCII.')
if chr(PAD_TOKEN) in corpus: # Reserve 0 codepoint for pad token.
raise ValueError('Corpus must not contain the null byte.')
# Naively tokenise by taking ASCII codepoints.
corpus = np.array([ord(c) for c in corpus]).astype(np.int32)
assert np.max(corpus) < VOCAB_SIZE
crop_len = sequence_length + 1
num_batches, remainder = divmod(corpus.size, batch_size * crop_len)
if remainder:
corpus = corpus[:-remainder] # Drop remainder (incomplete) batch.
ds = corpus.reshape([-1, crop_len])
if num_batches < num_shuffle_batches:
raise ValueError(
f'Only {num_batches} batches in the dataset; consider using a shorter '
'sequence length or a smaller batch batch size.',
)
ds = repeat(ds)
ds = shuffle(ds, buffer_size=batch_size * num_shuffle_batches)
while True:
batch = np.stack([next(ds) for _ in range(batch_size)])
yield Batch(inputs=batch[:, :-1], targets=batch[:, 1:])
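

# Usage sketch: a minimal example of driving the loader. The toy corpus and
# parameters below are illustrative assumptions, not from the original
# example. Any ASCII string with at least num_shuffle_batches * batch_size *
# (sequence_length + 1) characters (here 10 * 4 * 65 = 2,600) would work.
if __name__ == '__main__':
  toy_corpus = 'the quick brown fox jumps over the lazy dog. ' * 100
  ds = load_ascii_dataset(
      toy_corpus,
      batch_size=4,
      sequence_length=64,
      num_shuffle_batches=10,
  )
  first = next(ds)
  # Targets are inputs shifted one token to the right.
  print(first.inputs.shape, first.targets.shape)  # (4, 64) (4, 64)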