From 3ed453aca9a909fe0a6c62af7ac51f906efcc883 Mon Sep 17 00:00:00 2001 From: Lydia Nishimwe Date: Fri, 15 Sep 2023 19:45:08 +0200 Subject: [PATCH] fix overwrite bug when adding symbol to dictionary This bug ignored the tokens that were meant to be overwritten and appended them to the end of the dictionary symbols instead. For example, a dictionary with 50K tokens that already has `<s>`, `<pad>`, `</s>` and `<unk>` with the #fairseq:overwrite tag will end up having 50004 tokens when loaded. --- fairseq/data/dictionary.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fairseq/data/dictionary.py b/fairseq/data/dictionary.py index 7ad590a19b..3b8b741c4d 100644 --- a/fairseq/data/dictionary.py +++ b/fairseq/data/dictionary.py @@ -126,7 +126,7 @@ def unk_string(self, escape=False): def add_symbol(self, word, n=1, overwrite=False): """Adds a word to the dictionary""" - if word in self.indices and not overwrite: + if word in self.indices and overwrite: idx = self.indices[word] self.count[idx] = self.count[idx] + n return idx