working on domeval and multiref copyrate
Tommi Nieminen committed Oct 15, 2024
1 parent 82cd43a commit 21412a8
Showing 7 changed files with 192 additions and 58 deletions.
6 changes: 6 additions & 0 deletions data.smk
@@ -65,13 +65,17 @@ rule baseline_preprocessing:
input:
train_source="{project_name}/{src}-{trg}/{preprocessing}/train.{src}.gz",
train_target="{project_name}/{src}-{trg}/{preprocessing}/train.{trg}.gz",
domeval_source="{project_name}/{src}-{trg}/{preprocessing}/domeval.{src}.gz",
domeval_target="{project_name}/{src}-{trg}/{preprocessing}/domeval.{trg}.gz",
dev_source="{project_name}/{src}-{trg}/{preprocessing}/cleandev.{src}.gz",
dev_target="{project_name}/{src}-{trg}/{preprocessing}/cleandev.{trg}.gz",
eval_source="{project_name}/{src}-{trg}/{preprocessing}/eval.{src}.gz",
eval_target="{project_name}/{src}-{trg}/{preprocessing}/eval.{trg}.gz"
output:
train_source="{project_name}/{src}-{trg}/{preprocessing}/baseline_preprocessing_{max_dev_sents}/train-train.{src}.gz",
train_target="{project_name}/{src}-{trg}/{preprocessing}/baseline_preprocessing_{max_dev_sents}/train-train.{trg}.gz",
domeval_source="{project_name}/{src}-{trg}/{preprocessing}/baseline_preprocessing_{max_dev_sents}/domeval.{src}.gz",
domeval_target="{project_name}/{src}-{trg}/{preprocessing}/baseline_preprocessing_{max_dev_sents}/domeval.{trg}.gz",
dev_source="{project_name}/{src}-{trg}/{preprocessing}/baseline_preprocessing_{max_dev_sents}/train-cleandev.{src}.gz",
dev_target="{project_name}/{src}-{trg}/{preprocessing}/baseline_preprocessing_{max_dev_sents}/train-cleandev.{trg}.gz",
eval_source="{project_name}/{src}-{trg}/{preprocessing}/baseline_preprocessing_{max_dev_sents}/eval.{src}.gz",
@@ -85,6 +89,8 @@ rule baseline_preprocessing:
ln {input.train_target} {output.train_target} >> {log} 2>&1 && \
ln {input.eval_source} {output.eval_source} >> {log} 2>&1 && \
ln {input.eval_target} {output.eval_target} >> {log} 2>&1 && \
ln {input.domeval_source} {output.domeval_source} >> {log} 2>&1 && \
ln {input.domeval_target} {output.domeval_target} >> {log} 2>&1 && \
{{ pigz -dc {input.dev_source} | head -n {wildcards.max_dev_sents} | pigz -c > {output.dev_source} ; }} 2>> {log} && \
{{ pigz -dc {input.dev_target} | head -n {wildcards.max_dev_sents} | pigz -c > {output.dev_target} ; }} 2>> {log}
"""
17 changes: 10 additions & 7 deletions eval.smk
@@ -66,12 +66,13 @@ checkpoint translate_domeval:
domain_src=lambda wildcards: expand("{{project_name}}/{{src}}-{{trg}}/{{download_tc_dir}}/extract_tc_scored_{{min_score}}/{{preprocessing}}/{domain}-domeval.{{src}}.gz", domain=find_domain_sets(wildcards, checkpoints.extract_tc_scored)),
train_src="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/{preprocessing}/train-domeval.{src}.gz",
all_filtered_src="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/{preprocessing}/all_filtered-domeval.{src}.gz",
decoder_config=f'{{project_name}}/{{src}}-{{trg}}/{{download_tc_dir}}/extract_tc_scored_{{min_score}}/{{preprocessing}}/{{train_vocab}}/{{train_model}}/final.model.npz.best-{config["best-model-metric"]}.npz.decoder.yml'
decoder_config=f'{{project_name}}/{{src}}-{{trg}}/{{download_tc_dir}}/extract_tc_scored_{{min_score}}/{{preprocessing}}/{{train_vocab}}/{{train_model}}/final.model.npz.best-{config["best-model-metric"]}.npz.decoder.yml'
output:
output_dir=directory("{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/{preprocessing}/{train_vocab}/{train_model}/eval/domeval")
params:
domain_index_src_dir="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/{preprocessing}"
shell: '''pipeline/eval/translate-domeval.sh {params.domain_index_src_dir} {output.output_dir} {wildcards.src} {wildcards.trg} {input.decoder} {input.decoder_config} --mini-batch 128 --workspace 20000 >> {log} 2>&1'''
domain_index_src_dir="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/{preprocessing}",
uses_bands=lambda wildcards: "false" if "nobands" in wildcards.train_model else "true"
shell: '''pipeline/eval/translate-domeval.sh {params.domain_index_src_dir} {output.output_dir} {wildcards.src} {wildcards.trg} {input.decoder} {input.decoder_config} {params.uses_bands} --mini-batch 128 --workspace 20000 >> {log} 2>&1'''

# This evaluates the translations generated with translate_domeval
rule eval_domeval:
@@ -85,15 +86,16 @@ rule eval_domeval:
min_score="0\.\d+",
model="[\w-]+"
input:
domain_index_trg=lambda wildcards: expand("{{project_name}}/{{src}}-{{trg}}/{{download_tc_dir}}/extract_tc_scored_{{min_score}}/{{preprocessing}}/{{train_vocab}}/{{train_model}}/eval/domeval/{domain}-domeval.{{trg}}.gz", domain=find_translate_sets(wildcards, checkpoints.translate_domeval))
domain_index_trg=lambda wildcards: expand("{{project_name}}/{{src}}-{{trg}}/{{download_tc_dir}}/extract_tc_scored_{{min_score}}/{{preprocessing}}/{{train_vocab}}/{{train_model}}/eval/domeval/{domain}-domeval.{{trg}}.gz", domain=find_translate_sets(wildcards, checkpoints.translate_domeval)),
baseline_translations="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/subset_5M/baseline_preprocessing_2000/train_joint_spm_vocab_50000_prepend/train_model_train-baseteacher-train/eval/domeval.{trg}"
output:
report('{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/{preprocessing}/{train_vocab}/{train_model}/eval/domeval.done',
category='evaluation', subcategory='{model}', caption='reports/evaluation.rst')
params:
input_dir="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/{preprocessing}/{train_vocab}/{train_model}/eval/domeval",
domeval_ids="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/domeval.ids.gz",
system_id="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/{preprocessing}/{train_vocab}/{train_model}"
shell: '''python pipeline/eval/score-domeval.py --input_dir {params.input_dir} --report {output} --src_lang {wildcards.src} --trg_lang {wildcards.trg} --system_id {params.system_id} --domeval_ids {params.domeval_ids} >> {log} 2>&1'''
shell: '''python pipeline/eval/score-domeval.py --input_dir {params.input_dir} --report {output} --src_lang {wildcards.src} --trg_lang {wildcards.trg} --system_id {params.system_id} --domeval_ids {params.domeval_ids} --baseline_translations {input.baseline_translations} >> {log} 2>&1'''

rule evaluate:
message: "Evaluating a model"
@@ -113,8 +115,9 @@ rule evaluate:
trg_spm='{project_name}/{src}-{trg}/{preprocessing}/{train_vocab}/vocab.spm',
model=f'{{project_name}}/{{src}}-{{trg}}/{{preprocessing}}/{{train_vocab}}/train_model_{{model_type}}-{{training_type}}/final.model.npz.best-{config["best-model-metric"]}.npz'
output:
report('{project_name}/{src}-{trg}/{preprocessing}/{train_vocab}/train_model_{model_type}-{training_type}/eval/{dataset}.metrics',
category='evaluation', subcategory='{model}', caption='reports/evaluation.rst')
metrics=report('{project_name}/{src}-{trg}/{preprocessing}/{train_vocab}/train_model_{model_type}-{training_type}/eval/{dataset}.metrics',
category='evaluation', subcategory='{model}', caption='reports/evaluation.rst'),
translations='{project_name}/{src}-{trg}/{preprocessing}/{train_vocab}/train_model_{model_type}-{training_type}/eval/{dataset}.{trg}'
params:
dataset_prefix='{project_name}/{src}-{trg}/{preprocessing}/{dataset}',
res_prefix='{project_name}/{src}-{trg}/{preprocessing}/{train_vocab}/train_model_{model_type}-{training_type}/eval/{dataset}',
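The eval_domeval rule above now also takes the baseline model's domeval translations and forwards them to score-domeval.py via --baseline_translations. The diff does not show how the script uses them; given the commit message ("multiref copyrate"), one plausible reading is that it reports how often the evaluated system copies the baseline output verbatim. A hypothetical sketch of such a copy-rate metric (not the actual score-domeval.py implementation; file handling is assumed):

import gzip


def read_lines(path: str) -> list[str]:
    # The pipeline mixes gzipped and plain translation files
    # (e.g. domeval.{trg}), so handle both.
    opener = gzip.open if path.endswith(".gz") else open
    with opener(path, "rt", encoding="utf-8") as f:
        return [line.rstrip("\n") for line in f]


def copy_rate(system_path: str, baseline_path: str) -> float:
    # Fraction of segments where the evaluated system produces exactly
    # the same translation as the baseline system.
    system = read_lines(system_path)
    baseline = read_lines(baseline_path)
    assert len(system) == len(baseline), "translation files must be parallel"
    if not system:
        return 0.0
    copied = sum(s.strip() == b.strip() for s, b in zip(system, baseline))
    return copied / len(system)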
1 change: 0 additions & 1 deletion pipeline/eval/merge_domain_translations.py
@@ -15,7 +15,6 @@ def replace_fuzzy_lines(non_fuzzy_file, fuzzy_file, fuzzy_line_number_file, outp

# Replace lines in non-fuzzy lines with those from fuzzy lines based on fuzzy line numbers
for (line_number_index, line_number) in enumerate(fuzzy_line_numbers):
print(line_number)
# Check if the line number is within range
if 1 <= line_number <= len(non_fuzzy_lines):
non_fuzzy_lines[line_number - 1] = fuzzy_lines[line_number_index]
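This hunk only removes a stray debug print from the replacement loop. For context, a self-contained sketch of what replace_fuzzy_lines appears to do is below; the file I/O and the name of the truncated last parameter are assumptions (the hunk header cuts it off at "outp"), and the actual script may handle gzipped input or a different index format.

def replace_fuzzy_lines(non_fuzzy_file, fuzzy_file, fuzzy_line_number_file, output_file):
    # `output_file` is an assumed name; the diff header truncates the real one.
    with open(non_fuzzy_file, encoding="utf-8") as f:
        non_fuzzy_lines = f.readlines()
    with open(fuzzy_file, encoding="utf-8") as f:
        fuzzy_lines = f.readlines()
    with open(fuzzy_line_number_file, encoding="utf-8") as f:
        fuzzy_line_numbers = [int(line) for line in f if line.strip()]

    # Replace lines in the non-fuzzy translations with the fuzzy ones at the
    # recorded positions (this loop matches the hunk above; line numbers are
    # treated as 1-based).
    for line_number_index, line_number in enumerate(fuzzy_line_numbers):
        # Check if the line number is within range
        if 1 <= line_number <= len(non_fuzzy_lines):
            non_fuzzy_lines[line_number - 1] = fuzzy_lines[line_number_index]

    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(non_fuzzy_lines)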