diff --git a/amrlib/alignments/faa_aligner/faa_aligner.py b/amrlib/alignments/faa_aligner/faa_aligner.py
index d15f3cd..3bbb4cc 100644
--- a/amrlib/alignments/faa_aligner/faa_aligner.py
+++ b/amrlib/alignments/faa_aligner/faa_aligner.py
@@ -17,13 +17,22 @@ class FAA_Aligner(object):
     def __init__(self, **kwargs):
         self.model_dir = kwargs.get('model_dir', os.path.join(data_dir, 'model_aligner_faa'))
-        self.working_dir = kwargs.get('working_dir', os.path.join(data_dir, 'working_faa_aligner'))
         self.model_tar_fn = kwargs.get('model_tar_fn', os.path.join(this_dir, 'model_aligner_faa.tar.gz'))
-        os.makedirs(self.working_dir, exist_ok=True)
         self.setup_model_dir()
         self.aligner = TrainedAligner(self.model_dir, **kwargs)
         self.aligner.check_for_binaries()   # Will raise FileNotFoundError if binaries can't be found

+    # Input space_tok_sents is a list of space-tokenized strings.
+    # graph_strings is a list of amr graph strings of the same size.
+    def align_sents(self, space_tok_sents, graph_strings):
+        assert len(space_tok_sents) == len(graph_strings)
+        graph_strings = [to_graph_line(g) for g in graph_strings]
+        data = preprocess_infer(space_tok_sents, graph_strings)
+        data.model_out_lines = self.aligner.align(data.eng_preproc_lines, data.amr_preproc_lines)
+        amr_surface_aligns, alignment_strings = postprocess(data)
+        return amr_surface_aligns, alignment_strings
+
     # Check the model directory; if it doesn't have the metadata file, try to create
     # the directory from the tar.gz file
     def setup_model_dir(self):
@@ -43,13 +52,6 @@ def setup_model_dir(self):
             logger.critical('No model in model_dir and no local version available to extract')
             return False

-    def align_sents(self, sents, gstrings):
-        gstrings = [to_graph_line(g) for g in gstrings]
-        eng_td_lines, amr_td_lines = preprocess_infer(self.working_dir, sents, gstrings)
-        fa_out_lines = self.aligner.align(eng_td_lines, amr_td_lines)
-        amr_surface_aligns, alignment_strings = postprocess(self.working_dir, fa_out_lines, sents, gstrings)
-        return amr_surface_aligns, alignment_strings
-

 # Code adapted from https://github.com/clab/fast_align/blob/master/src/force_align.py
 class TrainedAligner:
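Usage note: a minimal sketch of the new inference entry point (the sentence and
graph strings below are illustrative, not taken from the repo):

    from amrlib.alignments.faa_aligner import FAA_Aligner

    sents  = ['the boy wants to go']    # space-tokenized sentences
    graphs = ['(w / want-01 :ARG0 (b / boy) :ARG1 (g / go-02 :ARG0 b))']

    inference = FAA_Aligner()    # raises FileNotFoundError if the fast_align binaries are missing
    amr_surface_aligns, alignment_strings = inference.align_sents(sents, graphs)
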
diff --git a/amrlib/alignments/faa_aligner/postprocess.py b/amrlib/alignments/faa_aligner/postprocess.py
index e20e8c9..7e89aaa 100644
--- a/amrlib/alignments/faa_aligner/postprocess.py
+++ b/amrlib/alignments/faa_aligner/postprocess.py
@@ -10,48 +10,28 @@

-# if model_out_lines is None, read from the file
-def postprocess(wk_dir, model_out_lines=None, eng_lines=None, amr_lines=None, **kwargs):
-    # Input filenames
-    eng_fn = os.path.join(wk_dir, kwargs.get('eng_fn', 'sents.txt'))
-    amr_fn = os.path.join(wk_dir, kwargs.get('amr_fn', 'gstrings.txt'))
-    eng_tok_pos_fn = os.path.join(wk_dir, kwargs.get('eng_tok_pos_fn', 'eng_tok_origpos.txt'))
-    amr_tuple_fn = os.path.join(wk_dir, kwargs.get('amr_tuple', 'amr_tuple.txt'))
-    model_out_fn = os.path.join(wk_dir, kwargs.get('model_out_fn', 'model_out.txt'))
+def postprocess(data, **kwargs):
     # Error log
-    align_to_str_fn = os.path.join(wk_dir, kwargs.get('align_to_str_fn', 'align_to_str.err'))
-
-    # Read the input files and get the number of lines, which must be the same
-    if eng_lines is None or amr_lines is None:
-        with open(eng_fn) as f:
-            eng_lines = [l.strip() for l in f]
-        with open(amr_fn) as f:
-            amr_lines = [l.strip() for l in f]
-    assert len(eng_lines) == len(amr_lines)
-    lines_number = len(eng_lines)
+    log_dir = kwargs.get('log_dir', 'logs')
+    os.makedirs(log_dir, exist_ok=True)
+    align_to_str_fn = os.path.join(log_dir, kwargs.get('postprocess_log_fn', 'faa_postprocess.log'))

     # Read the output of the aligner or use the supplied input above
     # fast_align outputs with a dash but the code from the isi aligner is setup for spaces
-    if model_out_lines is None:
-        with open(model_out_fn) as f:
-            model_out_lines = f.readlines()
-    # fast_align outputs with dashes, giza does this without
-    giza_align_lines = [l.strip().replace('-', ' ') for l in model_out_lines]
+    giza_align_lines = [l.strip().replace('-', ' ') for l in data.model_out_lines]
     isi_align_lines = giza2isi(giza_align_lines)
-    align_real_lines = swap(isi_align_lines)[:lines_number]   # rm data added for training, past original sentences
+    num_lines = len(data.amr_lines)
+    align_real_lines = swap(isi_align_lines)[:num_lines]   # rm data added for training, past original sentences

-    # Load the original sentence tokenization positions (created in pre-process)
-    with open(eng_tok_pos_fn) as f:
-        eng_tok_origpos_lines = [l.strip() for l in f]
-    align_origpos_lines = map_ibmpos_to_origpos_amr_as_f(eng_tok_origpos_lines, align_real_lines)
+    # Align the original position lines
+    align_origpos_lines = map_ibmpos_to_origpos_amr_as_f(data.eng_tok_origpos_lines, align_real_lines)

-    # Load the amr tuples from the pre-process steps and add the alignments
-    with open(amr_tuple_fn) as f:
-        amr_tuple_lines = [l.strip() for l in f]
-    aligned_tuple_lines = get_aligned_tuple_amr_as_f_add_align(amr_tuple_lines, align_origpos_lines)
+    # Get the aligned tuples
+    aligned_tuple_lines = get_aligned_tuple_amr_as_f_add_align(data.amr_tuple_lines, align_origpos_lines)

     # Create amr graphs with surface alignments
-    amr_surface_aligns = feat2tree.align(amr_lines, aligned_tuple_lines, log_fn=align_to_str_fn)
-    assert len(amr_surface_aligns) == len(eng_lines)
+    amr_surface_aligns = feat2tree.align(data.amr_lines, aligned_tuple_lines, log_fn=align_to_str_fn)
+    assert len(amr_surface_aligns) == len(data.amr_lines)

     # Get the final alignment string from the surface alignments
     ga = GetAlignments.from_amr_strings(amr_surface_aligns)
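Note on the dash handling above: the replace('-', ' ') step converts
fast_align's dashed index pairs into the space-separated form that giza2isi()
expects. A toy illustration (made-up indices):

    model_out_lines  = ['0-0 1-2 3-1\n']          # fast_align style: eng-amr index pairs
    giza_align_lines = [l.strip().replace('-', ' ') for l in model_out_lines]
    assert giza_align_lines == ['0 0 1 2 3 1']    # giza style, ready for giza2isi()
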
diff --git a/amrlib/alignments/faa_aligner/preprocess.py b/amrlib/alignments/faa_aligner/preprocess.py
index 062b057..331e4c3 100644
--- a/amrlib/alignments/faa_aligner/preprocess.py
+++ b/amrlib/alignments/faa_aligner/preprocess.py
@@ -4,7 +4,7 @@
 from .process_utils import stem_4_letters_word, stem_4_letters_line, stem_4_letters_string
 from .process_utils import filter_eng_by_stopwords, get_lineartok_with_rel
 from .process_utils import get_id_mapping_uniq
-
+from .proc_data import ProcData

 # Set the default data dir for misc files
 default_res_dir = os.path.dirname(os.path.realpath(__file__))
@@ -12,38 +12,27 @@

 # Preprocess for inference
-def preprocess_infer(wk_dir, eng_lines, amr_lines, **kwargs):
+def preprocess_infer(eng_lines, amr_lines, **kwargs):
     assert len(eng_lines) == len(amr_lines)
     # Resource filenames
     res_dir = kwargs.get('res_dir', default_res_dir)
     eng_sw_fn = kwargs.get('eng_sw_fn', os.path.join(res_dir, 'eng_stopwords.txt'))
     amr_sw_fn = kwargs.get('amr_sw_fn', os.path.join(res_dir, 'amr_stopwords.txt'))
-    # Output filenames
-    eng_tok_pos_fn = os.path.join(wk_dir, kwargs.get('eng_tok_pos_fn', 'eng_tok_origpos.txt'))
-    amr_tuple_fn = os.path.join(wk_dir, kwargs.get('amr_tuple_fn', 'amr_tuple.txt'))

     # Filter out stopwords from sentences
     eng_tok_filtered_lines, eng_tok_origpos_lines = filter_eng_by_stopwords(eng_lines, eng_sw_fn)
-
-    # Save for post-processing
-    with open(eng_tok_pos_fn, 'w') as f:
-        for i, l in enumerate(eng_tok_origpos_lines):
-            assert l.strip(), '!!! ERROR Empty line# %d. This will cause issues and must be fixed !!!' % i
-            f.write(l + '\n')
+    for i, line in enumerate(eng_tok_origpos_lines):
+        if not line.strip():
+            raise ValueError('Empty line %d. This will cause issues and must be fixed.' % i)

     # Stem sentence tokens
-    eng_tok_stemmed_lines = [stem_4_letters_line(l) for l in eng_tok_filtered_lines]
+    eng_preproc_lines = [stem_4_letters_line(l) for l in eng_tok_filtered_lines]

     # Process the AMR data / remove stopwords
     amr_linear_lines, amr_tuple_lines = get_lineartok_with_rel(amr_lines, amr_sw_fn)

-    # Save for post-processing
-    with open(amr_tuple_fn, 'w') as f:
-        for l in amr_tuple_lines:
-            f.write(l + '\n')
-
     # Stem the AMR lines
-    amr_linear_stemmed_lines = []
+    amr_preproc_lines = []
     for line in amr_linear_lines:
         new_tokens = []
         for token in line.split():
@@ -51,14 +40,18 @@ def preprocess_infer(wk_dir, eng_lines, amr_lines, **kwargs):
             token = token.replace('"', '')
             token = stem_4_letters_word(token).strip()
             new_tokens.append(token)
-        amr_linear_stemmed_lines.append(' '.join(new_tokens))
+        amr_preproc_lines.append(' '.join(new_tokens))

-    return eng_tok_stemmed_lines, amr_linear_stemmed_lines
+    # Gather the data
+    assert len(eng_preproc_lines) == len(amr_preproc_lines)
+    data = ProcData(eng_lines, amr_lines, eng_tok_origpos_lines, amr_tuple_lines,
+                    eng_preproc_lines, amr_preproc_lines)
+    return data


 # Preprocess the training data. This is similar to inference but adds a lot of
 # extra translation lines from resource files, etc.
-def preprocess_train(wk_dir, eng_lines, amr_lines, **kwargs):
+def preprocess_train(eng_lines, amr_lines, **kwargs):
     repeat_td = kwargs.get('repeat_td', 10)    # 10X is original value from isi aligner
     # Resource filenames
     res_dir = kwargs.get('res_dir', default_res_dir)
@@ -67,10 +60,12 @@ def preprocess_train(wk_dir, eng_lines, amr_lines, **kwargs):
     amr_id_map_fn = kwargs.get('amr_id_map_fn', os.path.join(res_dir, 'amr_id_map.txt'))

     # Run the inference process which creates the basic translation data
-    eng_tok_stemmed_lines, amr_linear_stemmed_lines = preprocess_infer(wk_dir, eng_lines, amr_lines, **kwargs)
+    data = preprocess_infer(eng_lines, amr_lines, **kwargs)
+    eng_preproc_lines = data.eng_preproc_lines
+    amr_preproc_lines = data.amr_preproc_lines

     # Get tokens common between the two datasets (obvious translations)
-    common_tok_lines = get_id_mapping_uniq(eng_tok_stemmed_lines, amr_linear_stemmed_lines)
+    common_tok_lines = get_id_mapping_uniq(eng_preproc_lines, amr_preproc_lines)
     eng_td_lines = common_tok_lines[:]   # copy

     # Append the second field in prep-roles.id.txt
@@ -101,8 +96,8 @@ def preprocess_train(wk_dir, eng_lines, amr_lines, **kwargs):

     # Create the final training data using the original sentences
     # and 10X copies of the additional data (other translations)
-    eng_td_lines = eng_tok_stemmed_lines + [l for _ in range(repeat_td) for l in eng_td_lines]
-    amr_td_lines = amr_linear_stemmed_lines + [l for _ in range(repeat_td) for l in amr_td_lines]
-    assert len(eng_td_lines) == len(amr_td_lines)
+    data.eng_preproc_lines += [l for _ in range(repeat_td) for l in eng_td_lines]
+    data.amr_preproc_lines += [l for _ in range(repeat_td) for l in amr_td_lines]
+    assert len(data.eng_preproc_lines) == len(data.amr_preproc_lines)

-    return eng_td_lines, amr_td_lines
+    return data
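Sketch of how the refactored preprocess_infer() is driven (attribute names come
from ProcData, defined in the new file below; inputs are assumed to be
lowercased, space-tokenized lines as in the training scripts):

    data = preprocess_infer(eng_lines, amr_lines)
    # parallel model input for fast_align
    print(data.eng_preproc_lines[0], '|||', data.amr_preproc_lines[0])
    # bookkeeping now carried to postprocess() in memory instead of temp files
    assert len(data.eng_tok_origpos_lines) == len(data.amr_tuple_lines)
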
diff --git a/amrlib/alignments/faa_aligner/proc_data.py b/amrlib/alignments/faa_aligner/proc_data.py
new file mode 100644
index 0000000..d103104
--- /dev/null
+++ b/amrlib/alignments/faa_aligner/proc_data.py
@@ -0,0 +1,63 @@
+import os
+
+
+# Simple container for holding process data.
+# This is a little complicated because preprocess creates data that is used by the model for input
+# and other data that is used during postprocessing. Keep track of it all here.
+# Saving and loading is to facilitate training scripts. During inference, data is held internally.
+class ProcData(object):
+    def __init__(self, eng_lines=None, amr_lines=None,
+                 eng_tok_origpos_lines=None, amr_tuple_lines=None,
+                 eng_preproc_lines=None, amr_preproc_lines=None):
+        self.eng_lines = eng_lines
+        self.amr_lines = amr_lines
+        self.eng_tok_origpos_lines = eng_tok_origpos_lines
+        self.amr_tuple_lines = amr_tuple_lines
+        self.eng_preproc_lines = eng_preproc_lines
+        self.amr_preproc_lines = amr_preproc_lines
+
+    # Save the preprocess and model input data (optionally the original x_lines data)
+    def save(self, wk_dir, save_input_data=False, **kwargs):
+        self.build_filenames(wk_dir, **kwargs)
+        if save_input_data:
+            self.save_lines(self.eng_fn, self.eng_lines)
+            self.save_lines(self.amr_fn, self.amr_lines)
+        self.save_lines(self.eng_tok_pos_fn, self.eng_tok_origpos_lines)
+        self.save_lines(self.amr_tuple_fn, self.amr_tuple_lines)
+        with open(self.fa_in_fn, 'w') as f:
+            for en_line, amr_line in zip(self.eng_preproc_lines, self.amr_preproc_lines):
+                f.write('%s ||| %s\n' % (en_line, amr_line))
+
+    # Load data (not including the _preproc_lines)
+    @classmethod
+    def from_directory(cls, wk_dir, **kwargs):
+        self = cls()
+        self.build_filenames(wk_dir, **kwargs)
+        self.eng_lines = self.load_lines(self.eng_fn)
+        self.amr_lines = self.load_lines(self.amr_fn)
+        self.eng_tok_origpos_lines = self.load_lines(self.eng_tok_pos_fn)
+        self.amr_tuple_lines = self.load_lines(self.amr_tuple_fn)
+        self.model_out_lines = self.load_lines(self.model_out_fn)
+        return self
+
+    # Create default filenames as members
+    def build_filenames(self, wk_dir, **kwargs):
+        self.eng_fn = os.path.join(wk_dir, kwargs.get('eng_fn', 'sents.txt'))
+        self.amr_fn = os.path.join(wk_dir, kwargs.get('amr_fn', 'gstrings.txt'))
+        self.eng_tok_pos_fn = os.path.join(wk_dir, kwargs.get('eng_tok_pos_fn', 'eng_tok_origpos.txt'))
+        self.amr_tuple_fn = os.path.join(wk_dir, kwargs.get('amr_tuple_fn', 'amr_tuple.txt'))
+        self.fa_in_fn = os.path.join(wk_dir, kwargs.get('fa_in_fn', 'fa_in.txt'))
+        self.model_out_fn = os.path.join(wk_dir, kwargs.get('model_out_fn', 'model_out.txt'))
+
+    # Save a list of lines to a file
+    @staticmethod
+    def save_lines(fn, lines):
+        with open(fn, 'w') as f:
+            for line in lines:
+                f.write(line + '\n')
+
+    # Load a list of lines from a file
+    @staticmethod
+    def load_lines(fn):
+        with open(fn) as f:
+            lines = [l.strip() for l in f]
+        return lines
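Note: the original version of build_filenames() looked up the gstrings.txt
override with kwargs.get('eng_fn', ...), a copy-paste slip corrected to
'amr_fn' above. The intended save/load round trip for the training scripts,
sketched (the external fast_align step is assumed to write model_out.txt into
the same directory):

    data = preprocess_train(eng_lines, amr_lines)
    data.save('amrlib/data/train_faa_aligner')    # writes fa_in.txt plus the bookkeeping files

    # ... run fast_align on fa_in.txt, producing model_out.txt ...

    data = ProcData.from_directory('amrlib/data/train_faa_aligner')
    amr_surface_aligns, alignment_strings = postprocess(data)
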
diff --git a/scripts/61_FAA_Aligner/12_Preprocess_train.py b/scripts/61_FAA_Aligner/12_Preprocess_train.py
index aba4893..0c84126 100755
--- a/scripts/61_FAA_Aligner/12_Preprocess_train.py
+++ b/scripts/61_FAA_Aligner/12_Preprocess_train.py
@@ -8,7 +8,6 @@
     working_dir = 'amrlib/data/train_faa_aligner'
     eng_fn = os.path.join(working_dir, 'sents.txt')
     amr_fn = os.path.join(working_dir, 'gstrings.txt')
-    fa_in_fn = os.path.join(working_dir, 'fa_in.txt')
    print('Reading and writing data in', working_dir)

     # Read in the english sentences and linearized AMR lines
@@ -18,9 +17,8 @@
         amr_lines = [l.strip().lower() for l in f]

     # Preprocess the data
-    eng_td_lines, amr_td_lines = preprocess_train(working_dir, eng_lines, amr_lines)
+    data = preprocess_train(eng_lines, amr_lines)

-    # Save in fast align training format
-    with open(fa_in_fn, 'w') as f:
-        for en_line, amr_line in zip(eng_td_lines, amr_td_lines):
-            f.write('%s ||| %s\n' % (en_line, amr_line))
+    # Save the preprocessed data and the model input file; the original input
+    # data is already in the working directory
+    data.save(working_dir, save_input_data=False)
diff --git a/scripts/61_FAA_Aligner/16_PostProcess.py b/scripts/61_FAA_Aligner/16_PostProcess.py
index 5016e61..3939cf8 100755
--- a/scripts/61_FAA_Aligner/16_PostProcess.py
+++ b/scripts/61_FAA_Aligner/16_PostProcess.py
@@ -2,6 +2,7 @@
 import setup_run_dir    # this import tricks script to run from 2 levels up
 import os
 from amrlib.alignments.faa_aligner.postprocess import postprocess
+from amrlib.alignments.faa_aligner.proc_data import ProcData


 if __name__ == '__main__':
@@ -10,7 +11,12 @@
     surface_fn = 'amr_surface_aligned.txt'
     print('Reading and writing data in', working_dir)
-    amr_surface_aligns, alignment_strings = postprocess(working_dir)
+
+    # Load the original, preprocessed and model output data
+    data = ProcData.from_directory(working_dir)
+
+    # Post process
+    amr_surface_aligns, alignment_strings = postprocess(data)

     # Save the final data
     fpath = os.path.join(working_dir, astrings_fn)
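One behavioral change worth noting: the feat2tree error log moves from
<working_dir>/align_to_str.err to logs/faa_postprocess.log by default. Both
the directory and the filename remain overridable through kwargs, e.g.
(hypothetical values):

    amr_surface_aligns, alignment_strings = postprocess(
        data, log_dir='amrlib/data/train_faa_aligner/logs',
        postprocess_log_fn='align_to_str.err')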