Skip to content

Commit

Permalink
Update for 0.8.0
Browse files Browse the repository at this point in the history
  • Loading branch information
mmcauliffe committed Nov 3, 2016
1 parent f61cf05 commit f21f793
Show file tree
Hide file tree
Showing 11 changed files with 122 additions and 29 deletions.
4 changes: 2 additions & 2 deletions aligner/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
__ver_major__ = 0
__ver_minor__ = 6
__ver_patch__ = 2
__ver_minor__ = 8
__ver_patch__ = 0
__ver_tuple__ = (__ver_major__, __ver_minor__, __ver_patch__)
__version__ = "%d.%d.%d" % __ver_tuple__

Expand Down
68 changes: 63 additions & 5 deletions aligner/aligner/pretrained.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
import os
import shutil
from tqdm import tqdm
import time

from .base import BaseAligner, TEMP_DIR, TriphoneFmllrConfig, TriphoneConfig

from ..dictionary import Dictionary

from ..multiprocessing import (align, calc_fmllr, make_path_safe,thirdparty_binary, subprocess,convert_ali_to_textgrids)

class PretrainedAligner(BaseAligner):
'''
Class for aligning a dataset using a pretrained acoustic model
Expand All @@ -25,15 +30,16 @@ class PretrainedAligner(BaseAligner):
Specifies a call back function for alignment
'''
def __init__(self, archive, corpus, output_directory,
temp_directory = None, num_jobs = 3, call_back = None):
temp_directory = None, num_jobs = 3, speaker_independent = False,
call_back = None):

if temp_directory is None:
temp_directory = TEMP_DIR
self.temp_directory = temp_directory
self.output_directory = output_directory
self.corpus = corpus

self.dictionary = Dictionary(archive.dictionary_path, os.path.join(temp_directory, 'dictionary'))
self.speaker_independent = speaker_independent
self.dictionary = Dictionary(archive.dictionary_path, os.path.join(temp_directory, 'dictionary'), word_set=corpus.word_set)

self.dictionary.write()
archive.export_triphone_model(self.tri_directory)
Expand All @@ -47,11 +53,63 @@ def __init__(self, archive, corpus, output_directory,
self.verbose = False
self.tri_fmllr_config = TriphoneFmllrConfig(**{'realign_iters': [1, 2],
'fmllr_iters': [1],
'num_iters': 3})
'num_iters': 3,
#'boost_silence': 0
})
self.tri_config = TriphoneConfig()

def do_align(self):
'''
Perform alignment while calculating speaker transforms (fMLLR estimation)
'''
self.train_tri_fmllr()
self._init_tri()
if not self.speaker_independent:
self.train_tri_fmllr()

def _init_tri(self):
    '''
    Set up the fMLLR working directory from the speaker-independent results

    Runs ``self._align_fmllr()`` when ``self.tri_ali_directory`` does not
    exist yet, creates the ``log`` subdirectory of
    ``self.tri_fmllr_directory``, sets up the corpus splits and then copies
    the files that ``train_tri_fmllr`` starts from:

    * ``final.mdl`` from ``self.tri_directory`` is copied as ``1.mdl``
    * ``fsts.N`` and ``trans.N`` from ``self.tri_ali_directory`` are copied
      for every job ``N`` in ``range(self.num_jobs)``
    '''
    # NOTE(review): only the alignment call is assumed to be guarded by this
    # existence check; everything below runs on every call -- confirm.
    if not os.path.exists(self.tri_ali_directory):
        self._align_fmllr()

    fmllr_directory = self.tri_fmllr_directory
    os.makedirs(os.path.join(fmllr_directory, 'log'), exist_ok = True)
    self.corpus.setup_splits(self.dictionary)

    # The pretrained model becomes the model for iteration 1
    shutil.copy(os.path.join(self.tri_directory, 'final.mdl'),
                os.path.join(fmllr_directory, '1.mdl'))

    # Per-job files, copied in the same order as before (fsts, then trans)
    for job in range(self.num_jobs):
        for prefix in ('fsts', 'trans'):
            file_name = '{}.{}'.format(prefix, job)
            shutil.copy(os.path.join(self.tri_ali_directory, file_name),
                        os.path.join(fmllr_directory, file_name))

def train_tri_fmllr(self):
    '''
    Run the align / fMLLR iterations in ``self.tri_fmllr_directory``

    For every iteration ``i`` in ``range(1, num_iters)`` the corpus is
    aligned with ``i.mdl`` and ``calc_fmllr`` is called. The model file
    itself is only renamed to ``(i + 1).mdl``; this method does not write
    new model parameters. Iterations whose ``(i + 1).mdl`` already exists
    are skipped. Afterwards the last model is renamed to ``final.mdl``.

    A progress bar is shown when ``self.call_back`` is ``print``.
    '''
    directory = self.tri_fmllr_directory
    config = self.tri_fmllr_config
    sil_phones = self.dictionary.silence_csl
    log_directory = os.path.join(directory, 'log')

    iters = range(1, config.num_iters)
    if self.call_back == print:
        iters = tqdm(iters)

    # Model that exists once all iterations are done. Computed up front so
    # the final rename also works when the loop body never runs
    # (num_iters <= 1), in which case the initial 1.mdl is the final model;
    # previously that case raised NameError on an unbound loop variable.
    last_model_path = os.path.join(directory,
                                   '{}.mdl'.format(max(1, config.num_iters)))

    for i in iters:
        model_path = os.path.join(directory, '{}.mdl'.format(i))
        next_model_path = os.path.join(directory, '{}.mdl'.format(i + 1))
        if os.path.exists(next_model_path):
            # Iteration already done, nothing to redo
            continue
        align(i, directory, self.corpus.split_directory,
              self.dictionary.optional_silence_csl,
              self.num_jobs, config)
        calc_fmllr(directory, self.corpus.split_directory, sil_phones,
                   self.num_jobs, config, initial = False, iteration = i)
        # Carry the model forward unchanged to the next iteration number
        os.rename(model_path, next_model_path)
        self.parse_log_directory(log_directory, i)

    os.rename(last_model_path, os.path.join(directory, 'final.mdl'))

def export_textgrids(self):
    '''
    Write one TextGrid per sound file in the corpus to the output directory

    The alignments are read from ``self.tri_ali_directory`` when
    ``self.speaker_independent`` is set, and from
    ``self.tri_fmllr_directory`` otherwise.
    '''
    source_directory = (self.tri_ali_directory if self.speaker_independent
                        else self.tri_fmllr_directory)
    convert_ali_to_textgrids(self.output_directory, source_directory,
                             self.dictionary, self.corpus, self.num_jobs)
42 changes: 31 additions & 11 deletions aligner/command_line/align.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import sys
import shutil, os
import time


def fix_path():
Expand Down Expand Up @@ -39,8 +40,8 @@ def unfix_path():

TEMP_DIR = os.path.expanduser('~/Documents/MFA')

def align_corpus(model_path, corpus_dir, output_directory, temp_dir,
speaker_characters, num_jobs, verbose, clean):
def align_corpus(model_path, corpus_dir, output_directory, temp_dir, args, debug = False):
all_begin = time.time()
if temp_dir == '':
temp_dir = TEMP_DIR
else:
Expand All @@ -50,32 +51,50 @@ def align_corpus(model_path, corpus_dir, output_directory, temp_dir,
corpus_dir = os.path.dirname(corpus_dir)
corpus_name = os.path.basename(corpus_dir)
data_directory = os.path.join(temp_dir, corpus_name)
if clean:
if args.clean:
shutil.rmtree(data_directory, ignore_errors = True)
shutil.rmtree(output_directory, ignore_errors = True)

os.makedirs(data_directory, exist_ok = True)
os.makedirs(output_directory, exist_ok = True)
corpus = Corpus(corpus_dir, data_directory, speaker_characters, num_jobs = num_jobs)
begin = time.time()
corpus = Corpus(corpus_dir, data_directory, args.speaker_characters, num_jobs = args.num_jobs)
print(corpus.speaker_utterance_info())
corpus.write()
if debug:
print('Wrote corpus information in {} seconds'.format(time.time() - begin))
begin = time.time()
corpus.create_mfccs()
if debug:
print('Calculated mfccs in {} seconds'.format(time.time() - begin))
archive = Archive(model_path)
begin = time.time()
a = PretrainedAligner(archive, corpus, output_directory,
temp_directory = data_directory, num_jobs = num_jobs)
a.verbose = verbose
temp_directory = data_directory, num_jobs = args.num_jobs, speaker_independent = args.no_speaker_adaptation)
if debug:
print('Setup pretrained aligner in {} seconds'.format(time.time() - begin))
a.verbose = args.verbose
begin = time.time()
corpus.setup_splits(a.dictionary)
if debug:
print('Setup splits in {} seconds'.format(time.time() - begin))
utt_oov_path = os.path.join(corpus.split_directory, 'utterance_oovs.txt')
if os.path.exists(utt_oov_path):
shutil.copy(utt_oov_path, output_directory)
oov_path = os.path.join(corpus.split_directory, 'oovs_found.txt')
if os.path.exists(oov_path):
shutil.copy(oov_path, output_directory)
begin = time.time()
a.do_align()
if debug:
print('Performed alignment in {} seconds'.format(time.time() - begin))
begin = time.time()
a.export_textgrids()
if debug:
print('Exported textgrids in {} seconds'.format(time.time() - begin))
print('Done! Everything took {} seconds'.format(time.time() - all_begin))

def align_included_model(language, corpus_dir, output_directory, temp_dir,
speaker_characters, num_jobs, verbose, clean):
def align_included_model(language, corpus_dir, output_directory, temp_dir,args):
if language not in PRETRAINED_LANGUAGES:
raise(Exception('The language \'{}\' is not currently included in the distribution, please align via training or specify one of the following language names: {}.'.format(language, ', '.join(PRETRAINED_LANGUAGES))))

Expand All @@ -86,7 +105,7 @@ def align_included_model(language, corpus_dir, output_directory, temp_dir,
root_dir = os.path.dirname(os.path.dirname(os.path.dirname(path)))
pretrained_dir = os.path.join(root_dir, 'pretrained_models')
model_path = os.path.join(pretrained_dir, '{}.zip'.format(language))
align_corpus(model_path, corpus_dir, output_directory, temp_dir, speaker_characters, num_jobs, verbose, clean)
align_corpus(model_path, corpus_dir, output_directory, temp_dir, args)

if __name__ == '__main__': # pragma: no cover
mp.freeze_support()
Expand All @@ -104,6 +123,7 @@ def align_included_model(language, corpus_dir, output_directory, temp_dir,
parser.add_argument('-v', '--verbose', help = "Output debug messages about alignment", action = 'store_true')
parser.add_argument('--language', type = str, default = '',
help = 'Specify whether to use an included pretrained model (english, french)')
parser.add_argument('-n', '--no_speaker_adaptation', help = "Only use speaker independent models, with no speaker adaptation", action = 'store_true')
parser.add_argument('-c', '--clean', help = "Remove files from previous runs", action = 'store_true')
args = parser.parse_args()
corpus_dir = os.path.expanduser(args.corpus_dir)
Expand All @@ -117,8 +137,8 @@ def align_included_model(language, corpus_dir, output_directory, temp_dir,
raise(Exception('Both language and model_path cannot be specified'))
if model_path != '':
align_corpus(model_path, corpus_dir, output_dir, temp_dir,
args.speaker_characters, args.num_jobs, args.verbose, args.clean)
args)
else:
align_included_model(language, corpus_dir, output_dir, temp_dir,
args.speaker_characters, args.num_jobs, args.verbose, args.clean)
args)
unfix_path()
2 changes: 1 addition & 1 deletion aligner/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,7 +224,7 @@ def __init__(self, output_directory, job = None, kwargs = None):
if kwargs is None:
kwargs = {}
self.job = job
self.config_dict = {'use-energy':False}
self.config_dict = {'use-energy':False, 'frame-shift':10}
self.config_dict.update(kwargs)
self.output_directory = output_directory
self.write()
Expand Down
3 changes: 3 additions & 0 deletions aligner/corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,7 @@ def __init__(self, directory, output_directory,
self.feat_mapping = {}
self.cmvn_mapping = {}
self.ignored_utterances = []
self.word_set = set()
feat_path = os.path.join(self.output_directory, 'feats.scp')
if os.path.exists(feat_path):
self.feat_mapping = load_scp(feat_path)
Expand Down Expand Up @@ -276,6 +277,7 @@ def __init__(self, directory, output_directory,
continue
lab_path = os.path.join(root, lab_name)
self.text_mapping[utt_name] = load_text(lab_path)
self.word_set.update(self.text_mapping[utt_name].split())
if self.speaker_directories:
speaker_id = os.path.basename(root)
else:
Expand Down Expand Up @@ -341,6 +343,7 @@ def __init__(self, directory, output_directory,
self.segments[utt_name] = '{} {} {}'.format(B_name, begin, end)
self.utt_wav_mapping[B_name] = B_path
self.text_mapping[utt_name] = label
self.word_set.update(label.split())
self.utt_speak_mapping[utt_name] = speaker_name
self.speak_utt_mapping[speaker_name].append(utt_name)
if len(self.ignored_utterances) > 0:
Expand Down
8 changes: 4 additions & 4 deletions aligner/dictionary.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import os
import shutil
import math
import subprocess
import re
Expand Down Expand Up @@ -53,7 +52,7 @@ def __init__(self, input_path, output_directory, oov_code = '<unk>',
position_dependent_phones = True, num_sil_states = 5,
num_nonsil_states = 3, shared_silence_phones = False,
pronunciation_probabilities = True,
sil_prob = 0.5):
sil_prob = 0.5, word_set = None):
self.output_directory = os.path.join(output_directory, 'dictionary')
self.num_sil_states = num_sil_states
self.num_nonsil_states = num_nonsil_states
Expand All @@ -80,8 +79,10 @@ def __init__(self, input_path, output_directory, oov_code = '<unk>',
continue
self.graphemes.update(word)
pron = line
self.words[word].append(pron)
self.nonsil_phones.update(pron)
if word_set is not None and word not in word_set:
continue
self.words[word].append(pron)
self.word_pattern = compile_graphemes(self.graphemes)
self.words['!SIL'].append(['sil'])
self.words[self.oov_code].append(['spn'])
Expand Down Expand Up @@ -303,7 +304,6 @@ def write(self):
print('Creating dictionary information...')
if not os.path.exists(self.phones_dir):
os.makedirs(self.phones_dir, exist_ok = True)

self._write_graphemes()
self._write_phone_map_file()
self._write_phone_sets()
Expand Down
8 changes: 4 additions & 4 deletions aligner/multiprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,11 +230,11 @@ def align_func(directory, iteration, job_name, mdl, config, feat_path): # pragma
log_path = os.path.join(directory, 'log', 'align.{}.{}.log'.format(iteration, job_name))
ali_path = os.path.join(directory, 'ali.{}'.format(job_name))
with open(log_path, 'w') as logf, \
open(ali_path,'w') as outf:
open(ali_path,'wb') as outf:
align_proc = subprocess.Popen([thirdparty_binary('gmm-align-compiled')]+ config.scale_opts +
['--beam={}'.format(config.beam),
'--retry-beam={}'.format(config.beam * 4), '--careful=false', mdl,
"ark:"+fst_path, "ark:"+feat_path, "ark,t:-"],
"ark:"+fst_path, "ark:"+feat_path, "ark:-"],
stderr = logf,
stdout = outf)
align_proc.communicate()
Expand Down Expand Up @@ -300,12 +300,12 @@ def ali_to_textgrid_func(output_directory, model_directory, dictionary, corpus,
stdin = lin_proc.stdout, stderr = logf)
align_proc.communicate()

subprocess.call([thirdparty_binary('nbest-to-ctm'), 'ark:'+aligned_path,
subprocess.call([thirdparty_binary('nbest-to-ctm'), '--frame-shift={}'.format(corpus.mfcc_configs[0].config_dict['frame-shift']/1000), 'ark:'+aligned_path,
word_ctm_path], stderr = logf)
phone_proc = subprocess.Popen([thirdparty_binary('lattice-to-phone-lattice'), model_path,
'ark:'+aligned_path, "ark:-"], stdout = subprocess.PIPE,
stderr = logf)
nbest_proc = subprocess.Popen([thirdparty_binary('nbest-to-ctm'), "ark:-", phone_ctm_path],
nbest_proc = subprocess.Popen([thirdparty_binary('nbest-to-ctm'), '--frame-shift={}'.format(corpus.mfcc_configs[0].config_dict['frame-shift']/1000), "ark:-", phone_ctm_path],
stdin = phone_proc.stdout, stderr = logf)
nbest_proc.communicate()

Expand Down
2 changes: 1 addition & 1 deletion aligner/textgrid.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def parse_ctm(ctm_path, dictionary, mode = 'word'):
filename = line[0]
begin = float(line[2])
duration = float(line[3])
end = round(begin + duration, 2)
end = round(begin + duration, 3)
label = line[4]
try:
label = mapping[int(label)]
Expand Down
5 changes: 5 additions & 0 deletions docs/source/aligning.rst
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,11 @@ Extra options (in addition to the common ones listed above):

Specify the language of the pretrained models distributed with the aligner

.. cmdoption:: -n
--no_speaker_adaptation

Flag to disable speaker adaptation; useful when aligning a small dataset or when speed is more important than alignment accuracy

.. note::

The arguments ``model_path`` and ``language`` are mutually exclusive
Expand Down
7 changes: 7 additions & 0 deletions docs/source/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,13 @@
Changelog
=========

0.8.0
-----

- Fixed an issue where aligning using pretrained models was improperly updating the original model with sparser data
- Added a flag to turn off speaker adaptation when aligning using a pretrained model
- Optimized training graph generation when aligning using a pretrained model

0.7.3
-----

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def run_tests(self):

if __name__ == '__main__':
setup(name='Montreal Forced Aligner',
version='0.7.2',
version='0.8.0',
description='',
long_description='',
classifiers=[
Expand Down

0 comments on commit f21f793

Please sign in to comment.