Skip to content

Commit

Permalink
fix: fix issue where if last_chunk contains a single thread, only samp…
Browse files Browse the repository at this point in the history
…les in that thread are saved
  • Loading branch information
vTuanpham committed Nov 17, 2023
1 parent 0c32423 commit c38faf4
Show file tree
Hide file tree
Showing 4 changed files with 11 additions and 11 deletions.
4 changes: 1 addition & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -346,7 +346,5 @@
* Known issues:
* `TypeError: 'NoneType' object is not iterable`
This issue is relevant to gender-specific translation; you can read more here: https://github.com/ssut/py-googletrans/issues/260
* Weird bug where if a chunk only has a single thread,
only examples in that thread are saved. For now, adjust the ***max_example_per_thread***
and the ***large_chunks_threshold*** args so that the last chunk cannot contain a single thread


2 changes: 1 addition & 1 deletion tests/eli5_qaconfig_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ def step1(self):
self.file_path = "examples/ELI5/ELI5_val_10_doc.json"
self.output_dir = "examples/ELI5"
self.parser = ELI5ValQAConfig(self.file_path, self.output_dir, target_lang="ru",
max_example_per_thread=100, large_chunks_threshold=800)
max_example_per_thread=50, large_chunks_threshold=500)

def step2(self):
self.parser.read()
Expand Down
2 changes: 1 addition & 1 deletion tests/eli5_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ def step1(self):
self.file_path = "examples/ELI5/ELI5_val_10_doc.json"
self.output_dir = "examples/ELI5"
self.parser = ELI5Val(self.file_path, self.output_dir, target_lang="de",
max_example_per_thread=100, large_chunks_threshold=800)
max_example_per_thread=50, large_chunks_threshold=500)

def step2(self):
self.parser.read()
Expand Down
14 changes: 8 additions & 6 deletions translator/data_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,6 @@ def __init__(self, file_path: str,
target_lang: str = "vi",
) -> None:

# TODO: Fix weird bug where if a chunk only has a single thread,
# only examples in that thread is saved. For now, adjust the max_example_per_thread
# and the large_chunks_threshold args so that the last chunk cannot contain a single thread

self.data_read = None
self.converted_data = None
self.file_path = file_path
Expand Down Expand Up @@ -187,13 +183,14 @@ def callback_done(future):
with lock:
translated_data += future.result()
finished_task += 1
tqdm.write("\nTask finished, adding translated data to result...")
tqdm.write("\nTask finished, adding translated data to result...\n")
else:
tqdm.write(f"\nTask failed with the following error: {future.exception()}."
f"\nRestarting thread when others finished\n")
pass

for idx, chunk in enumerate(chunks):
# Assign each thread with a new Translator instance
future_chunk = executor.submit(self.translate_converted, chunk, f"chunk {idx}", Translator())
future_chunk.add_done_callback(callback_done)
future_dict = {
Expand Down Expand Up @@ -241,7 +238,12 @@ def callback_done(future):
translated_data_example = self.translate_en2vi_advance_qa(example, translator)
translated_data.append(translated_data_example)
if en_data: return translated_data
self.converted_data_translated = translated_data
if large_chunk:
# Assuming that the previous large-chunk pass already created self.converted_data_translated.
# This covers the case where the last large chunk contains only a single thread.
self.converted_data_translated += translated_data
else:
self.converted_data_translated = translated_data
except ConnectTimeout as e:
if not desc:
raise ConnectTimeout(f" Connection timeout, please provide better connection")
Expand Down

0 comments on commit c38faf4

Please sign in to comment.