Clean-up internal data api for faa_aligner
bjascob committed Jan 19, 2021
1 parent bde002e commit abeed54
Showing 6 changed files with 120 additions and 76 deletions.
20 changes: 11 additions & 9 deletions amrlib/alignments/faa_aligner/faa_aligner.py
@@ -17,13 +17,22 @@
class FAA_Aligner(object):
def __init__(self, **kwargs):
self.model_dir = kwargs.get('model_dir', os.path.join(data_dir, 'model_aligner_faa'))
self.working_dir = kwargs.get('working_dir', os.path.join(data_dir, 'working_faa_aligner'))
self.model_tar_fn = kwargs.get('model_tar_fn', os.path.join(this_dir, 'model_aligner_faa.tar.gz'))
os.makedirs(self.working_dir, exist_ok=True)
self.setup_model_dir()
self.aligner = TrainedAligner(self.model_dir, **kwargs)
self.aligner.check_for_binaries() # Will raise FileNotFoundError if binaries can't be found

# Input space_tok_sents is a list of space tokenized strings
# graph_strings is a list of amr graph strings of the same size.
def align_sents(self, space_tok_sents, graph_strings):
assert len(space_tok_sents) == len(graph_strings)
graph_strings = [to_graph_line(g) for g in graph_strings]
data = preprocess_infer(space_tok_sents, graph_strings)
data.model_out_lines = self.aligner.align(data.eng_preproc_lines, data.amr_preproc_lines)
amr_surface_aligns, alignment_strings = postprocess(data)
return amr_surface_aligns, alignment_strings


# Check the model directory; if it doesn't have the metadata file, try to create
# the directory from the tar.gz file
def setup_model_dir(self):
@@ -43,13 +52,6 @@ def setup_model_dir(self):
logger.critical('No model in model_dir and no local version available to extract')
return False

def align_sents(self, sents, gstrings):
gstrings = [to_graph_line(g) for g in gstrings]
eng_td_lines, amr_td_lines = preprocess_infer(self.working_dir, sents, gstrings)
fa_out_lines = self.aligner.align(eng_td_lines, amr_td_lines)
amr_surface_aligns, alignment_strings = postprocess(self.working_dir, fa_out_lines, sents, gstrings)
return amr_surface_aligns, alignment_strings


# Code adapted from https://github.com/clab/fast_align/blob/master/src/force_align.py
class TrainedAligner:
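In short, align_sents now runs preprocess, align, and postprocess entirely in memory, with no working-directory files. A minimal usage sketch of the new inference API, assuming the model_aligner_faa model and the fast_align binaries are installed (the sentence and graph below are illustrative):

from amrlib.alignments.faa_aligner import FAA_Aligner

inference = FAA_Aligner()
space_tok_sents = ['the boy wants to go']
graph_strings = ['(w / want-01 :ARG0 (b / boy) :ARG1 (g / go-02 :ARG0 b))']
amr_surface_aligns, alignment_strings = inference.align_sents(space_tok_sents, graph_strings)
print(alignment_strings[0])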
46 changes: 13 additions & 33 deletions amrlib/alignments/faa_aligner/postprocess.py
@@ -10,48 +10,28 @@


# if model_out_lines is None, read from the file
def postprocess(wk_dir, model_out_lines=None, eng_lines=None, amr_lines=None, **kwargs):
# Input filenames
eng_fn = os.path.join(wk_dir, kwargs.get('eng_fn', 'sents.txt'))
amr_fn = os.path.join(wk_dir, kwargs.get('amr_fn', 'gstrings.txt'))
eng_tok_pos_fn = os.path.join(wk_dir, kwargs.get('eng_tok_pos_fn', 'eng_tok_origpos.txt'))
amr_tuple_fn = os.path.join(wk_dir, kwargs.get('amr_tuple', 'amr_tuple.txt'))
model_out_fn = os.path.join(wk_dir, kwargs.get('model_out_fn', 'model_out.txt'))
def postprocess(data, **kwargs):
# Error log
align_to_str_fn = os.path.join(wk_dir, kwargs.get('align_to_str_fn', 'align_to_str.err'))

# Read the input files and get the number of lines, which must be the same
if eng_lines is None or amr_lines is None:
with open(eng_fn) as f:
eng_lines = [l.strip() for l in f]
with open(amr_fn) as f:
amr_lines = [l.strip() for l in f]
assert len(eng_lines) == len(amr_lines)
lines_number = len(eng_lines)
log_dir = kwargs.get('log_dir', 'logs')
os.makedirs(log_dir, exist_ok=True)
align_to_str_fn = os.path.join(log_dir, kwargs.get('postprocess_log_fn', 'faa_postprocess.log'))

# Read the output of the aligner or use the supplied input above
# fast_align outputs with a dash but the code from the isi aligner is set up for spaces
if model_out_lines is None:
with open(model_out_fn) as f:
model_out_lines = f.readlines()
# fast_align outputs with dashes; giza does this without them
giza_align_lines = [l.strip().replace('-', ' ') for l in model_out_lines]
giza_align_lines = [l.strip().replace('-', ' ') for l in data.model_out_lines]
isi_align_lines = giza2isi(giza_align_lines)
align_real_lines = swap(isi_align_lines)[:lines_number] # rm data added for training, past original sentences
num_lines = len(data.amr_lines)
align_real_lines = swap(isi_align_lines)[:num_lines] # rm data added for training, past original sentences

# Load the original sentence tokenization positions (created in pre-process)
with open(eng_tok_pos_fn) as f:
eng_tok_origpos_lines = [l.strip() for l in f]
align_origpos_lines = map_ibmpos_to_origpos_amr_as_f(eng_tok_origpos_lines, align_real_lines)
# Align the original position lines
align_origpos_lines = map_ibmpos_to_origpos_amr_as_f(data.eng_tok_origpos_lines, align_real_lines)

# Load the amr tuples from the pre-process steps and add the alignments
with open(amr_tuple_fn) as f:
amr_tuple_lines = [l.strip() for l in f]
aligned_tuple_lines = get_aligned_tuple_amr_as_f_add_align(amr_tuple_lines, align_origpos_lines)
# Get the aligned tuples
aligned_tuple_lines = get_aligned_tuple_amr_as_f_add_align(data.amr_tuple_lines, align_origpos_lines)

# Create amr graphs with surface alignments
amr_surface_aligns = feat2tree.align(amr_lines, aligned_tuple_lines, log_fn=align_to_str_fn)
assert len(amr_surface_aligns) == len(eng_lines)
amr_surface_aligns = feat2tree.align(data.amr_lines, aligned_tuple_lines, log_fn=align_to_str_fn)
assert len(amr_surface_aligns) == len(data.amr_lines)

# Get the final alignment string from the surface alignments
ga = GetAlignments.from_amr_strings(amr_surface_aligns)
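For reference, fast_align emits alignments as dash-joined index pairs, one pair per aligned token; the replace() above only normalizes them into the space-separated form the isi-derived helpers expect. A tiny illustration with made-up values:

model_out_line = '0-0 1-2 2-1'   # fast_align output: src-tgt index pairs
giza_line = model_out_line.strip().replace('-', ' ')
print(giza_line)                 # -> '0 0 1 2 2 1'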
49 changes: 22 additions & 27 deletions amrlib/alignments/faa_aligner/preprocess.py
@@ -4,61 +4,54 @@
from .process_utils import stem_4_letters_word, stem_4_letters_line, stem_4_letters_string
from .process_utils import filter_eng_by_stopwords, get_lineartok_with_rel
from .process_utils import get_id_mapping_uniq

from .proc_data import ProcData

# Set the default data dir for misc files
default_res_dir = os.path.dirname(os.path.realpath(__file__))
default_res_dir = os.path.realpath(os.path.join(default_res_dir, 'resources'))


# Preprocess for inference
def preprocess_infer(wk_dir, eng_lines, amr_lines, **kwargs):
def preprocess_infer(eng_lines, amr_lines, **kwargs):
assert len(eng_lines) == len(amr_lines)
# Resource filenames
res_dir = kwargs.get('res_dir', default_res_dir)
eng_sw_fn = kwargs.get('eng_sw_fn', os.path.join(res_dir, 'eng_stopwords.txt'))
amr_sw_fn = kwargs.get('amr_sw_fn', os.path.join(res_dir, 'amr_stopwords.txt'))
# Output filenames
eng_tok_pos_fn = os.path.join(wk_dir, kwargs.get('eng_tok_pos_fn', 'eng_tok_origpos.txt'))
amr_tuple_fn = os.path.join(wk_dir, kwargs.get('amr_tuple_fn', 'amr_tuple.txt'))

# Filter out stopwords from sentences
eng_tok_filtered_lines, eng_tok_origpos_lines = filter_eng_by_stopwords(eng_lines, eng_sw_fn)

# Save for post-processing
with open(eng_tok_pos_fn, 'w') as f:
for i, l in enumerate(eng_tok_origpos_lines):
assert l.strip(), '!!! ERROR Empty line# %d. This will cause issues and must be fixed !!!' % i
f.write(l + '\n')
for i, line in enumerate(eng_tok_origpos_lines):
if not line.strip():
raise ValueError('!!! ERROR Empty line# %d. This will cause issues and must be fixed !!!' % i)

# Stem sentence tokens
eng_tok_stemmed_lines = [stem_4_letters_line(l) for l in eng_tok_filtered_lines]
eng_preproc_lines = [stem_4_letters_line(l) for l in eng_tok_filtered_lines]

# Process the AMR data / remove stopwords
amr_linear_lines, amr_tuple_lines = get_lineartok_with_rel(amr_lines, amr_sw_fn)

# Save for post-processing
with open(amr_tuple_fn, 'w') as f:
for l in amr_tuple_lines:
f.write(l + '\n')

# Stem the AMR lines
amr_linear_stemmed_lines = []
amr_preproc_lines = []
for line in amr_linear_lines:
new_tokens = []
for token in line.split():
token = re.sub(r'\-[0-9]{2,3}$', '', token)
token = token.replace('"', '')
token = stem_4_letters_word(token).strip()
new_tokens.append(token)
amr_linear_stemmed_lines.append(' '.join(new_tokens))
amr_preproc_lines.append(' '.join(new_tokens))

return eng_tok_stemmed_lines, amr_linear_stemmed_lines
# Gather the data
assert len(eng_preproc_lines) == len(amr_preproc_lines)
data = ProcData(eng_lines, amr_lines, eng_tok_origpos_lines, amr_tuple_lines,
eng_preproc_lines, amr_preproc_lines,)
return data


# Preprocess the training data. This is similar to inference but adds a lot of
# extra translation lines from resource files, etc.
def preprocess_train(wk_dir, eng_lines, amr_lines, **kwargs):
def preprocess_train(eng_lines, amr_lines, **kwargs):
repeat_td = kwargs.get('repeat_td', 10) # 10X is original value from isi aligner
# Resource filenames
res_dir = kwargs.get('res_dir', default_res_dir)
@@ -67,10 +60,12 @@ def preprocess_train(wk_dir, eng_lines, amr_lines, **kwargs):
amr_id_map_fn = kwargs.get('amr_id_map_fn', os.path.join(res_dir, 'amr_id_map.txt'))

# Run the inference process which creates the basic translation data
eng_tok_stemmed_lines, amr_linear_stemmed_lines = preprocess_infer(wk_dir, eng_lines, amr_lines, **kwargs)
data = preprocess_infer(eng_lines, amr_lines, **kwargs)
eng_preproc_lines = data.eng_preproc_lines
amr_preproc_lines = data.amr_preproc_lines

# Get tokens common between the two datasets (obvious translations)
common_tok_lines = get_id_mapping_uniq(eng_tok_stemmed_lines, amr_linear_stemmed_lines)
common_tok_lines = get_id_mapping_uniq(eng_preproc_lines, amr_preproc_lines)
eng_td_lines = common_tok_lines[:] # copy

# Append the second field in prep-roles.id.txt
@@ -101,8 +96,8 @@ def preprocess_train(wk_dir, eng_lines, amr_lines, **kwargs):

# Create the final training data using the original sentences
# and 10X copies of the additional data (other translations)
eng_td_lines = eng_tok_stemmed_lines + [l for _ in range(repeat_td) for l in eng_td_lines]
amr_td_lines = amr_linear_stemmed_lines + [l for _ in range(repeat_td) for l in amr_td_lines]
assert len(eng_td_lines) == len(amr_td_lines)
data.eng_preproc_lines += [l for _ in range(repeat_td) for l in eng_td_lines]
data.amr_preproc_lines += [l for _ in range(repeat_td) for l in amr_td_lines]
assert len(data.eng_preproc_lines) == len(data.amr_preproc_lines)

return eng_td_lines, amr_td_lines
return data
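The stem_4_letters_* helpers come from process_utils and are not part of this diff; going by the name, they truncate tokens to their first four characters (the stemming trick used by the isi aligner). A hypothetical sketch of the idea:

def stem_4_letters_word(word):
    # assumption: keep only the first 4 characters, as the name suggests
    return word[:4]

def stem_4_letters_line(line):
    return ' '.join(stem_4_letters_word(t) for t in line.split())

print(stem_4_letters_line('the soldier wanted to leave'))   # 'the sold want to leav'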
63 changes: 63 additions & 0 deletions amrlib/alignments/faa_aligner/proc_data.py
@@ -0,0 +1,63 @@
import os

# Simple container for holding processed data
# This is a little complicated because preprocess creates data that is used by the model for input
# and other data that is used during postprocessing. Keep track of it all here.
# Saving and loading is to facilitate training scripts. During inference, data will be held internally.
class ProcData(object):
def __init__(self, eng_lines=None, amr_lines=None,
eng_tok_origpos_lines=None, amr_tuple_lines=None,
eng_preproc_lines=None, amr_preproc_lines=None):
self.eng_lines = eng_lines
self.amr_lines = amr_lines
self.eng_tok_origpos_lines = eng_tok_origpos_lines
self.amr_tuple_lines = amr_tuple_lines
self.eng_preproc_lines = eng_preproc_lines
self.amr_preproc_lines = amr_preproc_lines

# Save the preprocessed and model input data (optionally the original x_lines data)
def save(self, wk_dir, save_input_data=False, **kwargs):
self.build_filenames(wk_dir, **kwargs)
if save_input_data:
self.save_lines(self.eng_fn, self.eng_lines)
self.save_lines(self.amr_fn, self.amr_lines)
self.save_lines(self.eng_tok_pos_fn, self.eng_tok_origpos_lines)
self.save_lines(self.amr_tuple_fn, self.amr_tuple_lines)
with open(self.fa_in_fn, 'w') as f:
for en_line, amr_line in zip(self.eng_preproc_lines, self.amr_preproc_lines):
f.write('%s ||| %s\n' % (en_line, amr_line))

# load data (not including the _preproc_lines)
@classmethod
def from_directory(cls, wk_dir, **kwargs):
self = cls()
self.build_filenames(wk_dir, **kwargs)
self.eng_lines = self.load_lines(self.eng_fn)
self.amr_lines = self.load_lines(self.amr_fn)
self.eng_tok_origpos_lines = self.load_lines(self.eng_tok_pos_fn)
self.amr_tuple_lines = self.load_lines(self.amr_tuple_fn)
self.model_out_lines = self.load_lines(self.model_out_fn)
return self

# Create default filenames as members
def build_filenames(self, wk_dir, **kwargs):
self.eng_fn = os.path.join(wk_dir, kwargs.get('eng_fn', 'sents.txt'))
self.amr_fn = os.path.join(wk_dir, kwargs.get('amr_fn', 'gstrings.txt'))
self.eng_tok_pos_fn = os.path.join(wk_dir, kwargs.get('eng_tok_pos_fn', 'eng_tok_origpos.txt'))
self.amr_tuple_fn = os.path.join(wk_dir, kwargs.get('amr_tuple_fn', 'amr_tuple.txt'))
self.fa_in_fn = os.path.join(wk_dir, kwargs.get('fa_in_fn', 'fa_in.txt'))
self.model_out_fn = os.path.join(wk_dir, kwargs.get('model_out_fn', 'model_out.txt'))

# Save a list of lines to a file
@staticmethod
def save_lines(fn, lines):
with open(fn, 'w') as f:
for line in lines:
f.write(line + '\n')

# Load a list of lines from a file
@staticmethod
def load_lines(fn):
with open(fn) as f:
lines = [l.strip() for l in f]
return lines
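A quick sketch of the round trip this container enables between the training scripts; the path matches the scripts below, and eng_lines/amr_lines are illustrative:

from amrlib.alignments.faa_aligner.preprocess import preprocess_infer
from amrlib.alignments.faa_aligner.proc_data import ProcData

eng_lines = ['the boy wants to go']
amr_lines = ['(w / want-01 :ARG0 (b / boy) :ARG1 (g / go-02 :ARG0 b))']

# Preprocess, then write fa_in.txt plus the files needed for post-processing
data = preprocess_infer(eng_lines, amr_lines)
data.save('amrlib/data/train_faa_aligner', save_input_data=True)

# ... run the aligner here; it is expected to write model_out.txt ...

# Reload everything, including the model output, for postprocess()
data = ProcData.from_directory('amrlib/data/train_faa_aligner')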
10 changes: 4 additions & 6 deletions scripts/61_FAA_Aligner/12_Preprocess_train.py
@@ -8,7 +8,6 @@
working_dir = 'amrlib/data/train_faa_aligner'
eng_fn = os.path.join(working_dir, 'sents.txt')
amr_fn = os.path.join(working_dir, 'gstrings.txt')
fa_in_fn = os.path.join(working_dir, 'fa_in.txt')

print('Reading and writing data in', working_dir)
# Read in the english sentences and linearized AMR lines
@@ -18,9 +17,8 @@
amr_lines = [l.strip().lower() for l in f]

# Preprocess the data
eng_td_lines, amr_td_lines = preprocess_train(working_dir, eng_lines, amr_lines)
data = preprocess_train(eng_lines, amr_lines)

# Save in fast align training format
with open(fa_in_fn, 'w') as f:
for en_line, amr_line in zip(eng_td_lines, amr_td_lines):
f.write('%s ||| %s\n' % (en_line, amr_line))
# Save the preprocessed data and the model input file; the input data is
# already in the working directory
data.save(working_dir, save_input_data=False)
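The fa_in.txt written by data.save() is in fast_align's 'source ||| target' input format. Running the aligner is a separate step not shown in this commit; a sketch of a typical invocation, assuming the fast_align binary is on PATH and using the flags from the clab/fast_align README, with output redirected to ProcData's default model_out.txt:

import subprocess

wk_dir = 'amrlib/data/train_faa_aligner'
with open(wk_dir + '/model_out.txt', 'w') as f:
    subprocess.run(['fast_align', '-i', wk_dir + '/fa_in.txt', '-d', '-o', '-v'],
                   stdout=f, check=True)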
8 changes: 7 additions & 1 deletion scripts/61_FAA_Aligner/16_PostProcess.py
@@ -2,6 +2,7 @@
import setup_run_dir # this import tricks script to run from 2 levels up
import os
from amrlib.alignments.faa_aligner.postprocess import postprocess
from amrlib.alignments.faa_aligner.proc_data import ProcData


if __name__ == '__main__':
@@ -10,7 +11,12 @@
surface_fn = 'amr_surface_aligned.txt'

print('Reading and writing data in', working_dir)
amr_surface_aligns, alignment_strings = postprocess(working_dir)

# Load the original, preprocessed, and model output data
data = ProcData.from_directory(working_dir)

# Post process
amr_surface_aligns, alignment_strings = postprocess(data)

# Save the final data
fpath = os.path.join(working_dir, astrings_fn)
