Skip to content

Commit

Permalink
more modularization, finetuning improvements
Browse files Browse the repository at this point in the history
  • Loading branch information
TommiNieminen committed Sep 16, 2024
1 parent f4633d0 commit 616dea6
Show file tree
Hide file tree
Showing 13 changed files with 304 additions and 187 deletions.
107 changes: 21 additions & 86 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ vocab_config = {
"spm-train": f"{marian_dir}/spm_train",
"user-defined-symbols":"FUZZY_BREAK",
"spm-sample-size": 1000000,
"spm-character-coverage": 1.0 #since the data is cleaned, it does not contain many weird characters, causing character coverage to drop valid characters, such as Ö and the degree mark
"spm-character-coverage": 1.0
}

module vocab:
Expand All @@ -71,6 +71,18 @@ module train:

use rule * from train

opusmt_config = {
"marian": f"{marian_dir}/marian",
"gpus-num": gpus_num,
"best-model-metric": best_model_metric,
"finetune-args": get_args("finetune")}

module opusmt:
snakefile: "./opusmt.smk"
config: opusmt_config

use rule * from opusmt

eval_config = {
"marian-decoder": f"{marian_dir}/marian-decoder",
"gpus-num": gpus_num,
Expand Down Expand Up @@ -256,9 +268,15 @@ else:

shell.prefix(f"{envs} ")

results = expand(f"{data_root_dir}/{experiment}/{src}-{trg}/corpus_{{corpus}}/finetune_{{learning_rate}}_opusTCv20210807+bt-2021-09-01/eval/eval-{{dataset}}.metrics", corpus=config["datasets"]["train"][0], learning_rate=config["experiment"]["finetune"]["learning-rates"], dataset=eval_datasets)

# For base model, only generate the metrics once
results.extend(expand(f"{data_root_dir}/{experiment}/{src}-{trg}/corpus_{{corpus}}/finetune_{{learning_rate}}_opusTCv20210807+bt-2021-09-01/eval/basemodel-eval-{{dataset}}.metrics", corpus=config["datasets"]["train"][0], learning_rate=config["experiment"]["finetune"]["learning-rates"][0], dataset=eval_datasets))

#print(results)

rule all:
input: f"{models_dir}/en-fi/corpus-50000-rat-train-baseteacher-0/{best_model}"
#input: results
input: results

wildcard_constraints:
term_ratio="\d+",
Expand Down Expand Up @@ -383,89 +401,6 @@ if augment_corpus:
"{input.src1}" "{input.src2}" "{input.trg1}" "{input.trg2}" "{output.res_src}" "{output.res_trg}" "" \
>> {log} 2>&1'''

# Three options for teacher: 1. download opus-mt model, 2. train teacher with pipeline, 3. path to pretrained teacher model
# TODO: make it possible to combine any of the above options, i.e. use opus-mt, train and use
# pretrained all in the same run. Probably should have a model list where you can define all the
# models to use, and then prefixes (opusmt_, train_, pretrained_, nllb_ etc.) determine how the models are
# created/used/connected to (in case of e.g. external APIs).
if 'opusmt-teacher' in config['experiment']:
# Option 1 for obtaining a teacher: download a pretrained OPUS-MT model.
# One download per (model_index, ens) wildcard pair; the URL is looked up in
# the opusmt_teacher list by model_index.
rule download_teacher_model:
message: "Downloading OPUS-MT teacher model"
log: f"{log_dir}/download_teacher{{model_index}}-{{ens}}.log"
conda: "envs/base.yml"
# network-bound, a single thread is enough
threads: 1
wildcard_constraints:
model_index="\d+",
ens="\d+"
# the OPUS-MT archive ships its own vocab.yml and SentencePiece models;
# downstream rules need source.spm/target.spm for encoding/decoding
output: model=f'{teacher_base_dir}{{model_index}}-{{ens}}/{best_model}',vocab=f'{teacher_base_dir}{{model_index}}-{{ens}}/vocab.yml', model_dir=directory(f'{teacher_base_dir}{{model_index}}-{{ens}}'), src_spm=f'{teacher_base_dir}{{model_index}}-{{ens}}/source.spm', trg_spm=f'{teacher_base_dir}{{model_index}}-{{ens}}/target.spm'
params: teacher_dir=f'{teacher_base_dir}{{model_index}}-{{ens}}',
# map the model_index wildcard back to the configured download URL
teacher_url=lambda wildcards: opusmt_teacher[int(wildcards.model_index)]
shell: '''bash pipeline/opusmt/download-model.sh \
"{params.teacher_url}" "{params.teacher_dir}" "{best_model}" {src_three_letter} {trg_three_letter} >> {log} 2>&1'''
elif not forward_pretrained:
# Option 2 for obtaining a teacher: train one from scratch on the full
# teacher corpus. Only used when no opusmt-teacher is configured and no
# pretrained forward model is supplied (see the enclosing if/elif).
rule train_teacher:
message: "Training teacher on all data"
log: f"{log_dir}/train_teacher{{model_index}}-{{ens}}.log"
conda: "envs/base.yml"
# rule of thumb used throughout this pipeline: 3 CPU threads per GPU
threads: gpus_num*3
resources: gpu=gpus_num
wildcard_constraints:
model_index="\d+",
ens="\d+"
input:
rules.merge_devset.output, train_src=f'{teacher_corpus}.{src}.gz',train_trg=f'{teacher_corpus}.{trg}.gz',
# ancient(): do not retrain just because the marian binary's mtime changed
bin=ancient(trainer), vocab=vocab_path
output: model=f'{teacher_base_dir}{{model_index}}-{{ens}}/{best_model}'
params: prefix_train=teacher_corpus,
prefix_test=f"{original}/devset",
dir=directory(f'{teacher_base_dir}{{model_index}}-{{ens}}'),
# extra marian arguments from the experiment config ("training-teacher" section)
args=get_args("training-teacher")
shell: '''bash pipeline/train/train.sh \
teacher train {src} {trg} "{params.prefix_train}" "{params.prefix_test}" "{params.dir}" \
"{input.vocab}" "{best_model_metric}" {params.args} >> {log} 2>&1'''


# This is normal teacher with alignments, NOT needed for term models, but might be useful later.
# Note that it uses train-student script, but that just adds the guided alignment
#NOT TESTED YET!
# rule train_teacher_with_alignment:
# message: "Training student"
# log: f"{log_dir}/train_student.log"
# conda: "envs/base.yml"
# threads: gpus_num*3
# resources: gpu=gpus_num
# #group 'student'
# input:
# rules.merge_devset.output, ancient(trainer),
# train_src=f'{teacher_corpus}.{src}.gz',train_trg=f'{teacher_corpus}.{trg}.gz',
# alignments=rules.teacher_alignments.output.alignment,
# vocab=vocab_path
# output: model=f'{teacher_base_dir}-align/{best_model}'
# params: prefix_train=teacher_corpus,prefix_test=f"{original}/devset",
# args=get_args("training-teacher")
# shell: '''bash pipeline/train/train-student.sh \
# "{input.alignments}" teacher train {src} {trg} "{params.prefix_train}" "{params.prefix_test}" \
# "{student_dir}" "{input.vocab}" "{best_model_metric}" {params.args} >> {log} 2>&1'''

# Train a teacher on term-annotated data (scheme/ratio/sents-per-term-sent
# come from the wmt23_termtask config). Uses train-student.sh rather than
# train.sh because that script additionally wires in guided alignment,
# which the term-constrained corpus provides (corpus.aln.gz).
rule train_term_teacher:
message: "Training teacher with term constraints"
log: f"{log_dir}/train_teacher-term-{{scheme}}-{{term_ratio}}-{{sents_per_term_sent}}.log"
conda: "envs/base.yml"
# 3 CPU threads per GPU, matching the other training rules
threads: gpus_num*3
resources: gpu=gpus_num
input:
rules.merge_devset.output, ancient(trainer),
train_src=f"{term_data_dir}/teacher-term-{{scheme}}-{{term_ratio}}-{{sents_per_term_sent}}/corpus.{src}.gz",
train_trg=f"{term_data_dir}/teacher-term-{{scheme}}-{{term_ratio}}-{{sents_per_term_sent}}/corpus.{trg}.gz",
# word alignments for guided-alignment training
alignments=f"{term_data_dir}/teacher-term-{{scheme}}-{{term_ratio}}-{{sents_per_term_sent}}/corpus.aln.gz",
vocab=vocab_path
output: model=f'{teacher_base_dir}-term-{{scheme}}-{{term_ratio}}-{{sents_per_term_sent}}/{best_model}'
params: prefix_train=f"{term_data_dir}/teacher-term-{{scheme}}-{{term_ratio}}-{{sents_per_term_sent}}/corpus",prefix_test=f"{original}/devset",
args=get_args("training-term-teacher"),teacher_term_dir=f"{teacher_base_dir}-term-{{scheme}}-{{term_ratio}}-{{sents_per_term_sent}}"
shell: '''bash pipeline/train/train-student.sh \
"{input.alignments}" baseteacher train {src} {trg} "{params.prefix_train}" "{params.prefix_test}" \
"{params.teacher_term_dir}" "{input.vocab}" "{best_model_metric}" {params.args} >> {log} 2>&1'''



if wmt23_termtask and finetune_teacher_with_terms:
Expand Down
60 changes: 60 additions & 0 deletions configs/opus_finetune/config.eng-fin.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
experiment:
name: finetune-uh
src: eng
trg: fin

opusmt-teacher: 'https://object.pouta.csc.fi/Tatoeba-MT-models/eng-fin/opusTCv20210807+bt-2021-09-01.zip'

teacher-ensemble: 1
# path to a pretrained backward model (optional)
backward-model: ""
# path to a pretrained vocabulary (optional)
vocab: ""

finetune:
learning-rates: ["00001","00003","000001"]

wmt23_termtask:
annotation-schemes:
- 'lemma-nonfac-int-append'
term-ratios: [2]
sents-per-term-sents: [1]
train-term-teacher: False
finetune-teacher-with-terms: True

# Since we are just doing fine-tuning, cap the parallel data at 10 million sentences
parallel-max-sentences: 10000000

# limits per downloaded dataset
mono-max-sentences-src: 100000000
mono-max-sentences-trg: 20000000
# split corpus to parallelize translation
split-length: 1000000
# vocab training sample
spm-sample-size: 10000000

best-model: chrf

marian-args:
# these configs override pipeline/train/configs
finetune:
after: 1e


datasets:
# parallel training corpus
train:
- custom_mix-uh-tilde400K
tc_scored: ../data/finetuning_data/en-fi/tc_Tatoeba-Challenge-v2023-09-26.scored.gz
# datasets to merge for validation while training
devtest:
- flores_dev
- sacrebleu_wmt15
- sacrebleu_wmt16
# datasets for evaluation
test:
- flores_devtest
- sacrebleu_wmt17
- sacrebleu_wmt18
- sacrebleu_wmt19
- custom_finetune
53 changes: 52 additions & 1 deletion data.smk
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ wildcard_constraints:
ruleorder: download_tatoeba_corpus > download_corpus

# light-weight rules can run on login node
localrules: download_corpus, download_tatoeba_corpus, extract_tc_scored, subset_corpus, baseline_preprocessing
localrules: download_corpus, download_tatoeba_corpus, subset_corpus, baseline_preprocessing, use_custom_corpus

rule download_tatoeba_corpus:
message: "Downloading Tatoeba corpus"
Expand Down Expand Up @@ -111,6 +111,57 @@ rule subset_corpus:
{{ pigz -dc {input.train_target} | head -n {wildcards.max_train_sents}B | pigz -c > {output.train_target} ; }} 2>> {log}
"""


# Provide a local, pre-prepared corpus to the pipeline by hard-linking its
# train/dev files into the expected corpus_custom_{dataset} layout.
# NOTE(review): ln makes hard links, which fail if {datadir} and the custom
# corpus live on different filesystems — confirm, or fall back to ln -s/cp.
rule use_custom_corpus:
    message: "Using custom corpus"
    log: "{datadir}/{project_name}/{src}-{trg}/corpus_custom_{dataset}/custom_corpus_{dataset}.log"
    conda: None
    container: None
    threads: 1
    # group: 'data'
    cache: False # caching is broken in snakemake
    wildcard_constraints:
        # raw string: "\w"/"\d" in a plain literal are invalid escape
        # sequences (SyntaxWarning since Python 3.6, stricter in 3.12+)
        dataset=r"[\w\d_-]+",
    output:
        train_source="{datadir}/{project_name}/{src}-{trg}/corpus_custom_{dataset}/train.{src}.gz",
        train_target="{datadir}/{project_name}/{src}-{trg}/corpus_custom_{dataset}/train.{trg}.gz",
        dev_source="{datadir}/{project_name}/{src}-{trg}/corpus_custom_{dataset}/dev.{src}.gz",
        dev_target="{datadir}/{project_name}/{src}-{trg}/corpus_custom_{dataset}/dev.{trg}.gz"
    params:
        # the custom corpus is expected directly under {datadir}/{dataset}
        prefix="{datadir}/{dataset}",
        dataset="{dataset}"
    shell:
        """
        ln "{params.prefix}/train.{wildcards.src}.gz" "{output.train_source}" >> {log} 2>&1 && \
        ln "{params.prefix}/train.{wildcards.trg}.gz" "{output.train_target}" >> {log} 2>&1 && \
        ln "{params.prefix}/dev.{wildcards.src}.gz" "{output.dev_source}" >> {log} 2>&1 && \
        ln "{params.prefix}/dev.{wildcards.trg}.gz" "{output.dev_target}" >> {log} 2>&1
        """

ruleorder: use_custom_eval > download_corpus

# Provide a local, pre-prepared evaluation set by hard-linking its files
# into the expected eval-custom_{dataset} layout. Takes precedence over
# download_corpus via the ruleorder declared above.
# NOTE(review): ln makes hard links, which fail across filesystems —
# confirm, or fall back to ln -s/cp.
rule use_custom_eval:
    message: "Using custom evalset"
    log: "{datadir}/{project_name}/{src}-{trg}/{preprocessing}/custom_eval_{dataset}.log"
    conda: None
    container: None
    threads: 1
    # group: 'data'
    cache: False # caching is broken in snakemake
    wildcard_constraints:
        # raw string: "\w"/"\d" in a plain literal are invalid escape
        # sequences (SyntaxWarning since Python 3.6, stricter in 3.12+)
        dataset=r"[\w\d_]+",
    output:
        eval_source="{datadir}/{project_name}/{src}-{trg}/{preprocessing}/eval-custom_{dataset}.{src}.gz",
        eval_target="{datadir}/{project_name}/{src}-{trg}/{preprocessing}/eval-custom_{dataset}.{trg}.gz",
    params:
        # the custom evalset is expected directly under {datadir}/{dataset}
        prefix="{datadir}/{dataset}",
        dataset="{dataset}"
    shell:
        """
        ln "{params.prefix}/eval.{wildcards.src}.gz" "{output.eval_source}" >> {log} 2>&1 && \
        ln "{params.prefix}/eval.{wildcards.trg}.gz" "{output.eval_target}" >> {log} 2>&1
        """

rule download_corpus:
message: "Downloading parallel corpus"
log: "{project_name}/{src}-{trg}/{preprocessing}/download_{kind}-{dataset}.log"
Expand Down
30 changes: 29 additions & 1 deletion eval.smk
Original file line number Diff line number Diff line change
@@ -1,10 +1,38 @@
wildcard_constraints:
src="\w{2,3}",
trg="\w{2,3}",
train_vocab="train_joint_spm_vocab[^/]+"
train_vocab="train_joint_spm_vocab[^/]+",
learn_rate="\d+"

gpus_num=config["gpus-num"]

#TODO: combine model evaluation rules by storing vocabs in model dir with normally trained models as well
#TODO: combine model evaluation rules by storing vocabs in model dir with normally trained models as well
# Evaluate either a fine-tuned OPUS model or its unmodified base model on a
# test set. The modeltype wildcard selects which: "basemodel-" points the
# decoder at the pretrained model's own decoder.yml under {datadir}/models,
# the empty string points it at the fine-tuned checkpoint's decoder.yml.
rule evaluate_opus_model:
message: "Evaluating an OPUS model"
log: "{datadir}/{project_name}/{src}-{trg}/{preprocessing}/finetune_{learn_rate}_{model_name}/eval/evaluate_{modeltype}{dataset}.log"
conda: "envs/base.yml"
threads: 7
resources: gpu=1
priority: 50
wildcard_constraints:
# either the literal prefix "basemodel-" or nothing
modeltype="(basemodel-|)"
input:
ancient(config["marian-decoder"]),
eval_source='{datadir}/{project_name}/{src}-{trg}/{preprocessing}/{dataset}.{src}.gz',
eval_target='{datadir}/{project_name}/{src}-{trg}/{preprocessing}/{dataset}.{trg}.gz',
# the fine-tuned model is required even for base-model evaluation, so both
# metric files land in the same finetune_* directory
model=f'{{datadir}}/{{project_name}}/{{src}}-{{trg}}/{{preprocessing}}/finetune_{{learn_rate}}_{{model_name}}/final.model.npz.best-{config["best-model-metric"]}.npz'
output:
report('{datadir}/{project_name}/{src}-{trg}/{preprocessing}/finetune_{learn_rate}_{model_name}/eval/{modeltype}{dataset}.metrics',
category='evaluation', subcategory='{model}', caption='reports/evaluation.rst')
params:
dataset_prefix='{datadir}/{project_name}/{src}-{trg}/{preprocessing}/{dataset}',
res_prefix='{datadir}/{project_name}/{src}-{trg}/{preprocessing}/finetune_{learn_rate}_{model_name}/eval/{modeltype}{dataset}',
# choose the decoder config at runtime based on the modeltype wildcard
decoder_config=
lambda wildcards: f'{wildcards.datadir}/models/{wildcards.src}-{wildcards.trg}/{wildcards.model_name}/decoder.yml' if wildcards.modeltype=="basemodel-" else f'{wildcards.datadir}/{wildcards.project_name}/{wildcards.src}-{wildcards.trg}/{wildcards.preprocessing}/finetune_{wildcards.learn_rate}_{wildcards.model_name}/final.model.npz.best-{config["best-model-metric"]}.npz.decoder.yml',
decoder=config["marian-decoder"]
shell: '''bash pipeline/eval/eval-gpu.sh "{params.res_prefix}" "{params.dataset_prefix}" {wildcards.src} {wildcards.trg} {params.decoder} "{params.decoder_config}" >> {log} 2>&1'''



rule evaluate:
message: "Evaluating a model"
Expand Down
4 changes: 2 additions & 2 deletions pipeline/data/filter-tc-data.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,8 @@ def process_files(src_file, trg_file, id_file, score_file, min_score, domain_eva
for corpus_name, (src_lines, trg_lines) in eval_lines.items():
eval_src_file_path = f"{output_dir}/{corpus_name}.eval.src.gz"
eval_trg_file_path = f"{output_dir}/{corpus_name}.eval.trg.gz"
with gzip.open(eval_file_path, 'wt', encoding='utf-8') as eval_src_file, \
gzip.open(eval_file_path, 'wt', encoding='utf-8') as eval_trg_file:
with gzip.open(eval_src_file_path, 'wt', encoding='utf-8') as eval_src_file, \
gzip.open(eval_trg_file_path, 'wt', encoding='utf-8') as eval_trg_file:
for src_line, trg_line in zip(src_lines, trg_lines):
eval_src_file.write(src_line)
eval_trg_file.write(trg_line)
Expand Down
25 changes: 20 additions & 5 deletions pipeline/eval/eval.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,24 +14,39 @@ src=$3
trg=$4
marian_decoder=$5
decoder_config=$6
model_dir=$(dirname "${decoder_config}")
model_step=$(basename "${model_dir}")
args=( "${@:7}" )

mkdir -p "$(basename "${res_prefix}")"

source_file="${dataset_prefix}.${src}.gz"
output_file="${res_prefix}.${trg}"

if [[ "${model_step}" == *opus* ]]; then
source_spm_path="${model_dir}/source.spm"
target_spm_path="${model_dir}/target.spm"
source_file="${dataset_prefix}.sp.${src}.gz"
pigz -dc "${dataset_prefix}.${src}.gz" | "${MARIAN}/spm_encode" --model "${source_spm_path}" | pigz >"${source_file}"
output_file="${res_prefix}.sp.${trg}"
fi

echo "### Evaluating dataset: ${dataset_prefix}, pair: ${src}-${trg}, Results prefix: ${res_prefix}"

pigz -dc "${dataset_prefix}.${trg}.gz" > "${res_prefix}.${trg}.ref"

pigz -dc "${dataset_prefix}.${src}.gz" |
pigz -dc "${source_file}" |
tee "${res_prefix}.${src}" |
"${marian_decoder}" \
-c "${decoder_config}" \
--quiet \
--quiet-translation \
--log "${res_prefix}.log" \
"${args[@]}" |
tee "${res_prefix}.${trg}" |
sacrebleu "${res_prefix}.${trg}.ref" -d -f text --score-only -l "${src}-${trg}" -m bleu chrf |
tee "${res_prefix}.metrics"
"${args[@]}" > "${output_file}"

if [[ "${model_step}" == *opus* ]]; then
"${MARIAN}/spm_decode" --model "${target_spm_path}" < "${output_file}" > "${res_prefix}.${trg}"
fi
sacrebleu "${res_prefix}.${trg}.ref" -d -f text --score-only -l "${src}-${trg}" -m bleu chrf < "${res_prefix}.${trg}" > "${res_prefix}.metrics"

echo "###### Done: Evaluation of a model"
4 changes: 2 additions & 2 deletions pipeline/opusmt/download-model.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,12 @@ set -euo pipefail

echo "###### Downloading pretrained opus model"

download_url=$1

model_name=$1
model_dir=$2
best_model=$3
source_lang=$4
target_lang=$5
download_url="https://object.pouta.csc.fi/Tatoeba-MT-models/${source_lang}-${target_lang}/${model_name}.zip"

#if download url is best, find the best model from list
#TODO: this doesn't seem to work, the models are not ordered by score in the list
Expand Down
Loading

0 comments on commit 616dea6

Please sign in to comment.