diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 000000000..8a808272e Binary files /dev/null and b/.DS_Store differ diff --git a/src/encoder.py b/src/encoder.py index 5f52e723c..828ba9b40 100644 --- a/src/encoder.py +++ b/src/encoder.py @@ -56,39 +56,16 @@ def bpe(self, token): if token in self.cache: return self.cache[token] word = tuple(token) - pairs = get_pairs(word) - if not pairs: - return token + new_word = [] - while True: - bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf'))) - if bigram not in self.bpe_ranks: - break - first, second = bigram - new_word = [] - i = 0 - while i < len(word): - try: - j = word.index(first, i) - new_word.extend(word[i:j]) - i = j - except: - new_word.extend(word[i:]) - break + for x in word: + curr = x + while new_word and (new_word[-1], curr) in self.bpe_ranks: + curr = self.bpe_ranks[(new_word.pop(), curr)] + new_word.append(curr) - if word[i] == first and i < len(word)-1 and word[i+1] == second: - new_word.append(first+second) - i += 2 - else: - new_word.append(word[i]) - i += 1 - new_word = tuple(new_word) - word = new_word - if len(word) == 1: - break - else: - pairs = get_pairs(word) + word = new_word word = ' '.join(word) self.cache[token] = word return word