diff --git a/users/vieting/experiments/librispeech/librispeech_100_ctc/fairseq_finetuning/ctc_standalone/sisyphus_configs/config_negatives_hard.py b/users/vieting/experiments/librispeech/librispeech_100_ctc/fairseq_finetuning/ctc_standalone/sisyphus_configs/config_negatives_hard.py new file mode 100644 index 000000000..b2873ceba --- /dev/null +++ b/users/vieting/experiments/librispeech/librispeech_100_ctc/fairseq_finetuning/ctc_standalone/sisyphus_configs/config_negatives_hard.py @@ -0,0 +1,63 @@ +from sisyphus import tk +import os + +from i6_experiments.users.vieting.experiments.librispeech.\ + librispeech_100_ctc.fairseq_finetuning.ctc_standalone.experiments.ctc_phon.baseline import eow_phon_ls100_ctc_base +from i6_experiments.users.vieting.experiments.librispeech.\ + librispeech_960_pretraining.wav2vec2.config_02_fairseq_phoneme import \ + get_fairseq_root, \ + run_fairseq_pretraining + + +# pretraining +other_target_pretrain_job = run_fairseq_pretraining( + exp_name="monophone_negatives_other_target_v1", + commit="1397363c5c0e3c4e3ab620be562730399c852493", + python_exe_hash_overwrite="itc_python_launcher_py310_torch", + negative_sampling_strategy="other_target", +) + + +neg_hard_pretrain_job = run_fairseq_pretraining( + exp_name="monophone_negatives_hard_v1", + commit="be51394d876428ad531e0786d80de43d6a8818af", + python_exe_hash_overwrite="itc_python_launcher_py310_torch", + negative_sampling_strategy="hard_negatives", + ) + +neg_hard_pretrain_jobs = dict() +neg_hard_pretrain_jobs[0] = neg_hard_pretrain_job +for start_cp in [50, 100, 150, 200]: + neg_hard_pretrain_jobs[start_cp] = run_fairseq_pretraining( + exp_name=f"monophone_negatives_hard_after_{start_cp}ep_other_v1", + commit="be51394d876428ad531e0786d80de43d6a8818af", + python_exe_hash_overwrite="itc_python_launcher_py310_torch", + checkpoint=other_target_pretrain_job.out_models[start_cp].model, + negative_sampling_strategy="hard_negatives", + ) + +# fairseq root +fairseq_root = 
get_fairseq_root(fairseq_exe=tk.Path("/usr/bin/python3")) + +# Finetuning +base_model_conf = { + "_name": "wav2vec_ctc", + "apply_mask": True, + "mask_prob": 0.65, + "mask_channel_prob": 0.5, + "mask_channel_length": 64, + "layerdrop": 0.1, + "activation_dropout": 0.1, + "feature_grad_mult": 0.0, + "freeze_finetune_updates": 10000, # was 0 in fairseq config +} + +for start_cp in [50, 100, 150, 200]: + for additional_cp in range(50, 600+1-start_cp, 50): + model_conf_w2v = base_model_conf.copy() + model_conf_w2v["w2v_path"] = neg_hard_pretrain_jobs[start_cp].out_models[start_cp + additional_cp].model + eow_phon_ls100_ctc_base( + model_conf_w2v=model_conf_w2v, + train_name_suffix=os.path.join("w2v_negatives_hard", f"other_{start_cp}_hard_{additional_cp}"), + fairseq_root=fairseq_root, + ) diff --git a/users/vieting/experiments/librispeech/librispeech_100_ctc/fairseq_finetuning/ctc_standalone/sisyphus_configs/config_negatives_other.py b/users/vieting/experiments/librispeech/librispeech_100_ctc/fairseq_finetuning/ctc_standalone/sisyphus_configs/config_negatives_other.py new file mode 100644 index 000000000..abbc5ca46 --- /dev/null +++ b/users/vieting/experiments/librispeech/librispeech_100_ctc/fairseq_finetuning/ctc_standalone/sisyphus_configs/config_negatives_other.py @@ -0,0 +1,172 @@ +from sisyphus import tk +import os + +from i6_experiments.users.vieting.experiments.librispeech.\ + librispeech_100_ctc.fairseq_finetuning.ctc_standalone.experiments.ctc_phon.baseline import eow_phon_ls100_ctc_base +from i6_experiments.users.vieting.experiments.librispeech.\ + librispeech_960_pretraining.wav2vec2.config_02_fairseq_phoneme import \ + get_fairseq_root, \ + run_fairseq_pretraining + + +# pretraining +neg_other_pretrain_job = run_fairseq_pretraining( + exp_name="monophone_negatives_other_target_v1", + commit="1397363c5c0e3c4e3ab620be562730399c852493", + python_exe_hash_overwrite="itc_python_launcher_py310_torch", + negative_sampling_strategy="other_target", + ) + +# fairseq 
root +fairseq_root = get_fairseq_root(fairseq_exe=tk.Path("/usr/bin/python3")) + +# Finetuning + +base_model_conf = { + "_name": "wav2vec_ctc", + "apply_mask": True, + "mask_prob": 0.65, + "mask_channel_prob": 0.5, + "mask_channel_length": 64, + "layerdrop": 0.1, + "activation_dropout": 0.1, + "feature_grad_mult": 0.0, + "freeze_finetune_updates": 10000, # was 0 in fairseq config +} + +checkpoints = [100, 200, 300, 400, 500, 600] +for checkpoint in checkpoints: + # negative sampling + model_conf_w2v = base_model_conf.copy() + model_conf_w2v["w2v_path"] = neg_other_pretrain_job.out_models[checkpoint].model + eow_phon_ls100_ctc_base( + model_conf_w2v=model_conf_w2v, + train_name_suffix=os.path.join("w2v_neg_sampling_other_target", f"checkpoint_{checkpoint}"), + fairseq_root=fairseq_root, + ) + + +# finetuning experiments only for the last checkpoint +CHECKPOINT = 600 +# random vs phoneme mask in finetuning +model_conf_w2v = base_model_conf.copy() # base model, no need to set `mask_strategy` and `mask_length` +model_conf_w2v["w2v_path"] = neg_other_pretrain_job.out_models[CHECKPOINT].model +eow_phon_ls100_ctc_base( + model_conf_w2v=model_conf_w2v, + train_name_suffix=os.path.join( + "w2v_neg_sampling_other_target", + "random_spec", + f"checkpoint_{CHECKPOINT}" + ), + fairseq_root=fairseq_root, +) +model_conf_w2v = base_model_conf.copy() +model_conf_w2v["w2v_path"] = neg_other_pretrain_job.out_models[CHECKPOINT].model +model_conf_w2v["mask_strategy"] = "phoneme" +model_conf_w2v["mask_length"] = 1 +eow_phon_ls100_ctc_base( + model_conf_w2v=model_conf_w2v, + train_name_suffix=os.path.join( + "w2v_neg_sampling_other_target", + "phoneme_spec", + f"checkpoint_{CHECKPOINT}" + ), + fairseq_root=fairseq_root, +) + +# phoneme mask lengths in finetuning +model_conf_w2v = base_model_conf.copy() +model_conf_w2v["w2v_path"] = neg_other_pretrain_job.out_models[CHECKPOINT].model +model_conf_w2v["mask_strategy"] = "phoneme" +model_conf_w2v["mask_length"] = 1 +eow_phon_ls100_ctc_base( 
+ model_conf_w2v=model_conf_w2v, + train_name_suffix=os.path.join( + "w2v_neg_sampling_other_target", + "1_phoneme_spec", + f"checkpoint_{CHECKPOINT}" + ), + fairseq_root=fairseq_root, +) +model_conf_w2v = base_model_conf.copy() +model_conf_w2v["w2v_path"] = neg_other_pretrain_job.out_models[CHECKPOINT].model +model_conf_w2v["mask_strategy"] = "phoneme" +model_conf_w2v["mask_length"] = 2 +eow_phon_ls100_ctc_base( + model_conf_w2v=model_conf_w2v, + train_name_suffix=os.path.join( + "w2v_neg_sampling_other_target", + "2_phoneme_spec", + f"checkpoint_{CHECKPOINT}" + ), + fairseq_root=fairseq_root, +) + +model_conf_w2v = base_model_conf.copy() +model_conf_w2v["w2v_path"] = neg_other_pretrain_job.out_models[CHECKPOINT].model +model_conf_w2v["mask_strategy"] = "phoneme" +model_conf_w2v["mask_length"] = 1 +model_conf_w2v["mask_other"] = 1 +model_conf_w2v["mask_selection"] = "uniform" +eow_phon_ls100_ctc_base( + model_conf_w2v=model_conf_w2v, + train_name_suffix=os.path.join( + "w2v_neg_sampling_other_target", + "1_2_phoneme_spec", + f"checkpoint_{CHECKPOINT}" + ), + fairseq_root=fairseq_root, +) + +# mask probability in finetuning +model_conf_w2v = base_model_conf.copy() +model_conf_w2v["w2v_path"] = neg_other_pretrain_job.out_models[CHECKPOINT].model +model_conf_w2v["mask_prob"] = 0.35 +eow_phon_ls100_ctc_base( + model_conf_w2v=model_conf_w2v, + train_name_suffix=os.path.join( + "w2v_neg_sampling_other_target", + "0_35_phoneme_mask_prob", + f"checkpoint_{CHECKPOINT}" + ), + fairseq_root=fairseq_root, +) + +model_conf_w2v = base_model_conf.copy() +model_conf_w2v["w2v_path"] = neg_other_pretrain_job.out_models[CHECKPOINT].model +model_conf_w2v["mask_prob"] = 0.5 +eow_phon_ls100_ctc_base( + model_conf_w2v=model_conf_w2v, + train_name_suffix=os.path.join( + "w2v_neg_sampling_other_target", + "0_5_phoneme_mask_prob", + f"checkpoint_{CHECKPOINT}" + ), + fairseq_root=fairseq_root, +) + +model_conf_w2v = base_model_conf.copy() +model_conf_w2v["w2v_path"] = 
neg_other_pretrain_job.out_models[CHECKPOINT].model +model_conf_w2v["mask_prob"] = 0.65 # base model +eow_phon_ls100_ctc_base( + model_conf_w2v=model_conf_w2v, + train_name_suffix=os.path.join( + "w2v_neg_sampling_other_target", + "0_65_phoneme_mask_prob", + f"checkpoint_{CHECKPOINT}" + ), + fairseq_root=fairseq_root, +) + +model_conf_w2v = base_model_conf.copy() +model_conf_w2v["w2v_path"] = neg_other_pretrain_job.out_models[CHECKPOINT].model +model_conf_w2v["mask_prob"] = 0.8 +eow_phon_ls100_ctc_base( + model_conf_w2v=model_conf_w2v, + train_name_suffix=os.path.join( + "w2v_neg_sampling_other_target", + "0_8_phoneme_mask_prob", + f"checkpoint_{CHECKPOINT}" + ), + fairseq_root=fairseq_root, +) diff --git a/users/vieting/experiments/librispeech/librispeech_100_ctc/fairseq_finetuning/ctc_standalone/sisyphus_configs/config_negatives_other_phoneme_boundary.py b/users/vieting/experiments/librispeech/librispeech_100_ctc/fairseq_finetuning/ctc_standalone/sisyphus_configs/config_negatives_other_phoneme_boundary.py new file mode 100644 index 000000000..1cf01b6db --- /dev/null +++ b/users/vieting/experiments/librispeech/librispeech_100_ctc/fairseq_finetuning/ctc_standalone/sisyphus_configs/config_negatives_other_phoneme_boundary.py @@ -0,0 +1,170 @@ +from sisyphus import tk +import os + +from i6_experiments.users.vieting.experiments.librispeech.\ + librispeech_100_ctc.fairseq_finetuning.ctc_standalone.experiments.ctc_phon.baseline import eow_phon_ls100_ctc_base +from i6_experiments.users.vieting.experiments.librispeech.\ + librispeech_960_pretraining.wav2vec2.config_02_fairseq_phoneme import \ + get_fairseq_root, \ + run_fairseq_pretraining + +# Pretraining +neg_other_trg_phon_boundary_pretrain_job = run_fairseq_pretraining( + exp_name="monophone_negatives_other_target_boundary_masking_v1", + commit="b768be5b81987364d39a07d1caad2bfe1e956896", + negative_sampling_strategy="other_target", + mask_strategy="phoneme", + mask_length=1, + ) + +# fairseq root +fairseq_root = 
get_fairseq_root(fairseq_exe=tk.Path("/usr/bin/python3")) + +# Finetuning +base_model_conf = { + "_name": "wav2vec_ctc", + "apply_mask": True, + "mask_prob": 0.65, + "mask_channel_prob": 0.5, + "mask_channel_length": 64, + "layerdrop": 0.1, + "activation_dropout": 0.1, + "feature_grad_mult": 0.0, + "freeze_finetune_updates": 10000, # was 0 in fairseq config +} + +checkpoints = [100, 200, 300, 400, 500, 600] +for checkpoint in checkpoints: + # negative sampling + phoneme boundary masking + model_conf_w2v = base_model_conf.copy() + model_conf_w2v["w2v_path"] = neg_other_trg_phon_boundary_pretrain_job.out_models[checkpoint].model + eow_phon_ls100_ctc_base( + model_conf_w2v=model_conf_w2v, + train_name_suffix=os.path.join( + "w2v_neg_sampling_other_target_phoneme_boundary_masking", + f"checkpoint_{checkpoint}" + ), + fairseq_root=fairseq_root, + ) + + +# finetuning experiments only for the last checkpoint +CHECKPOINT = 600 +# random vs phoneme mask in finetuning +model_conf_w2v = base_model_conf.copy() # base model, no need to set `mask_strategy` and `mask_length` +model_conf_w2v["w2v_path"] = neg_other_trg_phon_boundary_pretrain_job.out_models[CHECKPOINT].model +eow_phon_ls100_ctc_base( + model_conf_w2v=model_conf_w2v, + train_name_suffix=os.path.join( + "w2v_neg_sampling_other_target_phoneme_boundary_masking", + "phoneme_spec", + f"checkpoint_{CHECKPOINT}" + ), + fairseq_root=fairseq_root, +) +model_conf_w2v = base_model_conf.copy() +model_conf_w2v["w2v_path"] = neg_other_trg_phon_boundary_pretrain_job.out_models[CHECKPOINT].model +model_conf_w2v["mask_strategy"] = "random" +model_conf_w2v["mask_length"] = 10 +eow_phon_ls100_ctc_base( + model_conf_w2v=model_conf_w2v, + train_name_suffix=os.path.join( + "w2v_neg_sampling_other_target_phoneme_boundary_masking", + "random_spec", + f"checkpoint_{CHECKPOINT}" + ), + fairseq_root=fairseq_root, +) + +# phoneme mask lengths in finetuning +model_conf_w2v = base_model_conf.copy() # base model, no need to set `mask_length` 
+model_conf_w2v["w2v_path"] = neg_other_trg_phon_boundary_pretrain_job.out_models[CHECKPOINT].model +eow_phon_ls100_ctc_base( + model_conf_w2v=model_conf_w2v, + train_name_suffix=os.path.join( + "w2v_neg_sampling_other_target_phoneme_boundary_masking", + "1_phoneme_spec", + f"checkpoint_{CHECKPOINT}" + ), + fairseq_root=fairseq_root, +) +model_conf_w2v = base_model_conf.copy() +model_conf_w2v["w2v_path"] = neg_other_trg_phon_boundary_pretrain_job.out_models[CHECKPOINT].model +model_conf_w2v["mask_length"] = 2 +eow_phon_ls100_ctc_base( + model_conf_w2v=model_conf_w2v, + train_name_suffix=os.path.join( + "w2v_neg_sampling_other_target_phoneme_boundary_masking", + "2_phoneme_spec", + f"checkpoint_{CHECKPOINT}" + ), + fairseq_root=fairseq_root, +) + +model_conf_w2v = base_model_conf.copy() +model_conf_w2v["w2v_path"] = neg_other_trg_phon_boundary_pretrain_job.out_models[CHECKPOINT].model +model_conf_w2v["mask_length"] = 1 +model_conf_w2v["mask_other"] = 1 +model_conf_w2v["mask_selection"] = "uniform" +eow_phon_ls100_ctc_base( + model_conf_w2v=model_conf_w2v, + train_name_suffix=os.path.join( + "w2v_neg_sampling_other_target_phoneme_boundary_masking", + "1_2_phoneme_spec", + f"checkpoint_{CHECKPOINT}" + ), + fairseq_root=fairseq_root, +) + +# mask probability in finetuning +model_conf_w2v = base_model_conf.copy() +model_conf_w2v["w2v_path"] = neg_other_trg_phon_boundary_pretrain_job.out_models[CHECKPOINT].model +model_conf_w2v["mask_prob"] = 0.35 +eow_phon_ls100_ctc_base( + model_conf_w2v=model_conf_w2v, + train_name_suffix=os.path.join( + "w2v_neg_sampling_other_target_phoneme_boundary_masking", + "0_35_phoneme_mask_prob", + f"checkpoint_{CHECKPOINT}" + ), + fairseq_root=fairseq_root, +) + +model_conf_w2v = base_model_conf.copy() +model_conf_w2v["w2v_path"] = neg_other_trg_phon_boundary_pretrain_job.out_models[CHECKPOINT].model +model_conf_w2v["mask_prob"] = 0.5 +eow_phon_ls100_ctc_base( + model_conf_w2v=model_conf_w2v, + train_name_suffix=os.path.join( + 
"w2v_neg_sampling_other_target_phoneme_boundary_masking", + "0_5_phoneme_mask_prob", + f"checkpoint_{CHECKPOINT}" + ), + fairseq_root=fairseq_root, +) + +model_conf_w2v = base_model_conf.copy() +model_conf_w2v["w2v_path"] = neg_other_trg_phon_boundary_pretrain_job.out_models[CHECKPOINT].model +model_conf_w2v["mask_prob"] = 0.65 # base model +eow_phon_ls100_ctc_base( + model_conf_w2v=model_conf_w2v, + train_name_suffix=os.path.join( + "w2v_neg_sampling_other_target_phoneme_boundary_masking", + "0_65_phoneme_mask_prob", + f"checkpoint_{CHECKPOINT}" + ), + fairseq_root=fairseq_root, +) + +model_conf_w2v = base_model_conf.copy() +model_conf_w2v["w2v_path"] = neg_other_trg_phon_boundary_pretrain_job.out_models[CHECKPOINT].model +model_conf_w2v["mask_prob"] = 0.8 +eow_phon_ls100_ctc_base( + model_conf_w2v=model_conf_w2v, + train_name_suffix=os.path.join( + "w2v_neg_sampling_other_target_phoneme_boundary_masking", + "0_8_phoneme_mask_prob", + f"checkpoint_{CHECKPOINT}" + ), + fairseq_root=fairseq_root, +) diff --git a/users/vieting/experiments/librispeech/librispeech_100_ctc/fairseq_finetuning/ctc_standalone/sisyphus_configs/config_phoneme_boundary.py b/users/vieting/experiments/librispeech/librispeech_100_ctc/fairseq_finetuning/ctc_standalone/sisyphus_configs/config_phoneme_boundary.py new file mode 100644 index 000000000..e799cbdab --- /dev/null +++ b/users/vieting/experiments/librispeech/librispeech_100_ctc/fairseq_finetuning/ctc_standalone/sisyphus_configs/config_phoneme_boundary.py @@ -0,0 +1,170 @@ +from sisyphus import tk +import os + +from i6_experiments.users.vieting.experiments.librispeech.\ + librispeech_100_ctc.fairseq_finetuning.ctc_standalone.experiments.ctc_phon.baseline import eow_phon_ls100_ctc_base +from i6_experiments.users.vieting.experiments.librispeech.\ + librispeech_960_pretraining.wav2vec2.config_02_fairseq_phoneme import \ + get_fairseq_root, \ + run_fairseq_pretraining \ + +# Pretraining +phon_boundary_pretrain_job = run_fairseq_pretraining( + 
exp_name="monophone_boundary_masking_v1", + commit="b768be5b81987364d39a07d1caad2bfe1e956896", + python_exe_hash_overwrite="itc_python_launcher_py310_torch", + mask_strategy="phoneme", + mask_length=1, + ) + +# fairseq root +fairseq_root = get_fairseq_root(fairseq_exe=tk.Path("/usr/bin/python3")) + +# Finetuning +base_model_conf = { + "_name": "wav2vec_ctc", + "apply_mask": True, + "mask_prob": 0.65, + "mask_channel_prob": 0.5, + "mask_channel_length": 64, + "layerdrop": 0.1, + "activation_dropout": 0.1, + "feature_grad_mult": 0.0, + "freeze_finetune_updates": 10000, # was 0 in fairseq config +} + +checkpoints = [100, 200, 300, 400, 500, 600] +for checkpoint in checkpoints: + # phoneme boundary masking + model_conf_w2v = base_model_conf.copy() + model_conf_w2v["w2v_path"] = phon_boundary_pretrain_job.out_models[checkpoint].model + eow_phon_ls100_ctc_base( + model_conf_w2v=model_conf_w2v, + train_name_suffix=os.path.join("w2v_phoneme_boundary_masking", f"checkpoint_{checkpoint}"), + fairseq_root=fairseq_root, + ) + +# finetuning experiments only for the last checkpoint +CHECKPOINT = 600 +# random vs phoneme mask in finetuning +model_conf_w2v = base_model_conf.copy() # base model, no need to set `mask_strategy` and `mask_length` +model_conf_w2v["w2v_path"] = phon_boundary_pretrain_job.out_models[CHECKPOINT].model +eow_phon_ls100_ctc_base( + model_conf_w2v=model_conf_w2v, + train_name_suffix=os.path.join( + "w2v_phoneme_boundary_masking", + "random_spec", + f"checkpoint_{CHECKPOINT}" + ), + fairseq_root=fairseq_root, +) +model_conf_w2v = base_model_conf.copy() +model_conf_w2v["w2v_path"] = phon_boundary_pretrain_job.out_models[CHECKPOINT].model +model_conf_w2v["mask_strategy"] = "phoneme" +model_conf_w2v["mask_length"] = 1 +eow_phon_ls100_ctc_base( + model_conf_w2v=model_conf_w2v, + train_name_suffix=os.path.join( + "w2v_phoneme_boundary_masking", + "phoneme_spec", + f"checkpoint_{CHECKPOINT}" + ), + fairseq_root=fairseq_root, +) + +# phoneme mask lengths in 
finetuning +model_conf_w2v = base_model_conf.copy() +model_conf_w2v["w2v_path"] = phon_boundary_pretrain_job.out_models[CHECKPOINT].model +model_conf_w2v["mask_strategy"] = "phoneme" +model_conf_w2v["mask_length"] = 1 +eow_phon_ls100_ctc_base( + model_conf_w2v=model_conf_w2v, + train_name_suffix=os.path.join( + "w2v_phoneme_boundary_masking", + "1_phoneme_spec", + f"checkpoint_{CHECKPOINT}" + ), + fairseq_root=fairseq_root, +) +model_conf_w2v = base_model_conf.copy() +model_conf_w2v["w2v_path"] = phon_boundary_pretrain_job.out_models[CHECKPOINT].model +model_conf_w2v["mask_strategy"] = "phoneme" +model_conf_w2v["mask_length"] = 2 +eow_phon_ls100_ctc_base( + model_conf_w2v=model_conf_w2v, + train_name_suffix=os.path.join( + "w2v_phoneme_boundary_masking", + "2_phoneme_spec", + f"checkpoint_{CHECKPOINT}" + ), + fairseq_root=fairseq_root, +) + +model_conf_w2v = base_model_conf.copy() +model_conf_w2v["w2v_path"] = phon_boundary_pretrain_job.out_models[CHECKPOINT].model +model_conf_w2v["mask_strategy"] = "phoneme" +model_conf_w2v["mask_length"] = 1 +model_conf_w2v["mask_other"] = 1 +model_conf_w2v["mask_selection"] = "uniform" +eow_phon_ls100_ctc_base( + model_conf_w2v=model_conf_w2v, + train_name_suffix=os.path.join( + "w2v_phoneme_boundary_masking", + "1_2_phoneme_spec", + f"checkpoint_{CHECKPOINT}" + ), + fairseq_root=fairseq_root, +) + +# mask probability in finetuning +model_conf_w2v = base_model_conf.copy() +model_conf_w2v["w2v_path"] = phon_boundary_pretrain_job.out_models[CHECKPOINT].model +model_conf_w2v["mask_prob"] = 0.35 +eow_phon_ls100_ctc_base( + model_conf_w2v=model_conf_w2v, + train_name_suffix=os.path.join( + "w2v_phoneme_boundary_masking", + "0_35_phoneme_mask_prob", + f"checkpoint_{CHECKPOINT}" + ), + fairseq_root=fairseq_root, +) + +model_conf_w2v = base_model_conf.copy() +model_conf_w2v["w2v_path"] = phon_boundary_pretrain_job.out_models[CHECKPOINT].model +model_conf_w2v["mask_prob"] = 0.5 +eow_phon_ls100_ctc_base( + model_conf_w2v=model_conf_w2v, + 
train_name_suffix=os.path.join( + "w2v_phoneme_boundary_masking", + "0_5_phoneme_mask_prob", + f"checkpoint_{CHECKPOINT}" + ), + fairseq_root=fairseq_root, +) + +model_conf_w2v = base_model_conf.copy() +model_conf_w2v["w2v_path"] = phon_boundary_pretrain_job.out_models[CHECKPOINT].model +model_conf_w2v["mask_prob"] = 0.65 # base model +eow_phon_ls100_ctc_base( + model_conf_w2v=model_conf_w2v, + train_name_suffix=os.path.join( + "w2v_phoneme_boundary_masking", + "0_65_phoneme_mask_prob", + f"checkpoint_{CHECKPOINT}" + ), + fairseq_root=fairseq_root, +) + +model_conf_w2v = base_model_conf.copy() +model_conf_w2v["w2v_path"] = phon_boundary_pretrain_job.out_models[CHECKPOINT].model +model_conf_w2v["mask_prob"] = 0.8 +eow_phon_ls100_ctc_base( + model_conf_w2v=model_conf_w2v, + train_name_suffix=os.path.join( + "w2v_phoneme_boundary_masking", + "0_8_phoneme_mask_prob", + f"checkpoint_{CHECKPOINT}" + ), + fairseq_root=fairseq_root, +) diff --git a/users/vieting/experiments/librispeech/librispeech_100_ctc/fairseq_finetuning/ctc_standalone/sisyphus_configs/config_phoneme_pretrain_finetune.py b/users/vieting/experiments/librispeech/librispeech_100_ctc/fairseq_finetuning/ctc_standalone/sisyphus_configs/config_phoneme_pretrain_finetune copy.py similarity index 100% rename from users/vieting/experiments/librispeech/librispeech_100_ctc/fairseq_finetuning/ctc_standalone/sisyphus_configs/config_phoneme_pretrain_finetune.py rename to users/vieting/experiments/librispeech/librispeech_100_ctc/fairseq_finetuning/ctc_standalone/sisyphus_configs/config_phoneme_pretrain_finetune copy.py diff --git a/users/vieting/experiments/librispeech/librispeech_100_ctc/fairseq_finetuning/ctc_standalone/sisyphus_configs/config_positives.py b/users/vieting/experiments/librispeech/librispeech_100_ctc/fairseq_finetuning/ctc_standalone/sisyphus_configs/config_positives.py new file mode 100644 index 000000000..69800407b --- /dev/null +++ 
b/users/vieting/experiments/librispeech/librispeech_100_ctc/fairseq_finetuning/ctc_standalone/sisyphus_configs/config_positives.py @@ -0,0 +1,73 @@ +from sisyphus import tk +import os + +from i6_experiments.users.vieting.experiments.librispeech.\ + librispeech_100_ctc.fairseq_finetuning.ctc_standalone.experiments.ctc_phon.baseline import eow_phon_ls100_ctc_base +from i6_experiments.users.vieting.experiments.librispeech.\ + librispeech_960_pretraining.wav2vec2.config_02_fairseq_phoneme import \ + get_fairseq_root, \ + run_fairseq_pretraining + +# pretraining +# positive sampling +pos_sampling_5_pretrain_job = run_fairseq_pretraining( + exp_name="monophone_positive_sampling_5_v2", + commit="24d7d72c1e00f69689dc8a8ba2e0d75fe5f1cccd", + num_positives=5, +) + +pos_sampling_10_pretrain_job = run_fairseq_pretraining( + exp_name="monophone_positive_sampling_10_v2", + commit="24d7d72c1e00f69689dc8a8ba2e0d75fe5f1cccd", + num_positives=10, +) + +pos_sampling_15_pretrain_job = run_fairseq_pretraining( + exp_name="monophone_positive_sampling_15_v2", + commit="24d7d72c1e00f69689dc8a8ba2e0d75fe5f1cccd", + num_positives=15, +) + + +# fairseq root +fairseq_root = get_fairseq_root(fairseq_exe=tk.Path("/usr/bin/python3")) + +# Finetuning +base_model_conf = { + "_name": "wav2vec_ctc", + "apply_mask": True, + "mask_prob": 0.65, + "mask_channel_prob": 0.5, + "mask_channel_length": 64, + "layerdrop": 0.1, + "activation_dropout": 0.1, + "feature_grad_mult": 0.0, + "freeze_finetune_updates": 10000, # was 0 in fairseq config +} + +#checkpoint = 400 +for checkpoint in [100, 200, 300, 400, 500, 600]: + # positive sampling 5 + model_conf_w2v = base_model_conf.copy() + model_conf_w2v["w2v_path"] = pos_sampling_5_pretrain_job.out_models[checkpoint].model + eow_phon_ls100_ctc_base( + model_conf_w2v=model_conf_w2v, + train_name_suffix=os.path.join("w2v_positive_sampling", "pos_samples_5", f"checkpoint_{checkpoint}"), + fairseq_root=fairseq_root, + ) + # positive sampling 10 + model_conf_w2v = 
base_model_conf.copy() +    model_conf_w2v["w2v_path"] = pos_sampling_10_pretrain_job.out_models[checkpoint].model +    eow_phon_ls100_ctc_base( +        model_conf_w2v=model_conf_w2v, +        train_name_suffix=os.path.join("w2v_positive_sampling", "pos_samples_10", f"checkpoint_{checkpoint}"), +        fairseq_root=fairseq_root, +    ) +    # positive sampling 15 +    model_conf_w2v = base_model_conf.copy() +    model_conf_w2v["w2v_path"] = pos_sampling_15_pretrain_job.out_models[checkpoint].model +    eow_phon_ls100_ctc_base( +        model_conf_w2v=model_conf_w2v, +        train_name_suffix=os.path.join("w2v_positive_sampling", "pos_samples_15", f"checkpoint_{checkpoint}"), +        fairseq_root=fairseq_root, +    ) diff --git a/users/vieting/experiments/librispeech/librispeech_100_ctc/fairseq_finetuning/ctc_standalone/sisyphus_configs/config_vanilla_pretrain_finetune.py b/users/vieting/experiments/librispeech/librispeech_100_ctc/fairseq_finetuning/ctc_standalone/sisyphus_configs/config_vanilla.py similarity index 100% rename from users/vieting/experiments/librispeech/librispeech_100_ctc/fairseq_finetuning/ctc_standalone/sisyphus_configs/config_vanilla_pretrain_finetune.py rename to users/vieting/experiments/librispeech/librispeech_100_ctc/fairseq_finetuning/ctc_standalone/sisyphus_configs/config_vanilla.py