Skip to content

Commit

Permalink
set overwrite default value to True in add_symbol
Browse files Browse the repository at this point in the history
This ensures compatibility with all existing calls to add_symbol across the repo, which overwrite by default, as in the original implementation. The only place where the value is set explicitly is when loading the dictionary from a file, which was the source of the bug. In a file, you have to state explicitly whether the tokens should be overwritten or duplicated.
  • Loading branch information
lydianish authored Mar 8, 2024
1 parent b291c8d commit 1743314
Showing 1 changed file with 3 additions and 3 deletions.
6 changes: 3 additions & 3 deletions fairseq/data/dictionary.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ def unk_string(self, escape=False):
else:
return self.unk_word

def add_symbol(self, word, n=1, overwrite=False):
def add_symbol(self, word, n=1, overwrite=True):
"""Adds a word to the dictionary"""
if word in self.indices and overwrite:
idx = self.indices[word]
Expand Down Expand Up @@ -337,7 +337,7 @@ def encode_line(

for i, word in enumerate(words):
if add_if_not_exist:
idx = self.add_symbol(word, overwrite=True)
idx = self.add_symbol(word)
else:
idx = self.index(word)
if consumer is not None:
Expand Down Expand Up @@ -367,7 +367,7 @@ def _add_file_to_dictionary_single_worker(
def add_file_to_dictionary(filename, dict, tokenize, num_workers):
def merge_result(counter):
for w, c in sorted(counter.items()):
dict.add_symbol(w, c, overwrite=True)
dict.add_symbol(w, c)

local_file = PathManager.get_local_path(filename)
offsets = find_offsets(local_file, num_workers)
Expand Down

0 comments on commit 1743314

Please sign in to comment.