Skip to content

Commit

Permalink
fix: fix issue where if last_chunk contains a single thread, only samp…
Browse files Browse the repository at this point in the history
…les in that thread are saved
  • Loading branch information
vTuanpham committed Nov 17, 2023
1 parent 0c32423 commit c38faf4
Show file tree
Hide file tree
Showing 4 changed files with 11 additions and 11 deletions.
4 changes: 1 addition & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -346,7 +346,5 @@
* Known issues:
* `TypeError: 'NoneType' object is not iterable`
This issue is relevant to gender-specific translation; you can read more here: https://github.com/ssut/py-googletrans/issues/260
* Weird bug where if a chunk only has a single thread,
only examples in that thread are saved. For now, adjust the ***max_example_per_thread***
and the ***large_chunks_threshold*** args so that the last chunk cannot contain a single thread


2 changes: 1 addition & 1 deletion tests/eli5_qaconfig_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ def step1(self):
self.file_path = "examples/ELI5/ELI5_val_10_doc.json"
self.output_dir = "examples/ELI5"
self.parser = ELI5ValQAConfig(self.file_path, self.output_dir, target_lang="ru",
max_example_per_thread=100, large_chunks_threshold=800)
max_example_per_thread=50, large_chunks_threshold=500)

def step2(self):
self.parser.read()
Expand Down
2 changes: 1 addition & 1 deletion tests/eli5_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ def step1(self):
self.file_path = "examples/ELI5/ELI5_val_10_doc.json"
self.output_dir = "examples/ELI5"
self.parser = ELI5Val(self.file_path, self.output_dir, target_lang="de",
max_example_per_thread=100, large_chunks_threshold=800)
max_example_per_thread=50, large_chunks_threshold=500)

def step2(self):
self.parser.read()
Expand Down
14 changes: 8 additions & 6 deletions translator/data_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,6 @@ def __init__(self, file_path: str,
target_lang: str = "vi",
) -> None:

# TODO: Fix weird bug where if a chunk only has a single thread,
# only examples in that thread is saved. For now, adjust the max_example_per_thread
# and the large_chunks_threshold args so that the last chunk cannot contain a single thread

self.data_read = None
self.converted_data = None
self.file_path = file_path
Expand Down Expand Up @@ -187,13 +183,14 @@ def callback_done(future):
with lock:
translated_data += future.result()
finished_task += 1
tqdm.write("\nTask finished, adding translated data to result...")
tqdm.write("\nTask finished, adding translated data to result...\n")
else:
tqdm.write(f"\nTask failed with the following error: {future.exception()}."
f"\nRestarting thread when others finished\n")
pass

for idx, chunk in enumerate(chunks):
# Assign each thread with a new Translator instance
future_chunk = executor.submit(self.translate_converted, chunk, f"chunk {idx}", Translator())
future_chunk.add_done_callback(callback_done)
future_dict = {
Expand Down Expand Up @@ -241,7 +238,12 @@ def callback_done(future):
translated_data_example = self.translate_en2vi_advance_qa(example, translator)
translated_data.append(translated_data_example)
if en_data: return translated_data
self.converted_data_translated = translated_data
if large_chunk:
# Assuming that the previous large-chunk pass already created self.converted_data_translated.
# This covers the case where the last large chunk contains only a single thread.
self.converted_data_translated += translated_data
else:
self.converted_data_translated = translated_data
except ConnectTimeout as e:
if not desc:
raise ConnectTimeout(f" Connection timeout, please provide better connection")
Expand Down

0 comments on commit c38faf4

Please sign in to comment.