diff --git a/train/README.md b/train/README.md new file mode 100644 index 0000000..e60ebe1 --- /dev/null +++ b/train/README.md @@ -0,0 +1,66 @@ +# Description + +This directory may be used to train / re-train new models using the HSM framework. + +# Usage + +Code used for training (or re-training) models is located in the `train/` directory in this repository. The package should primarily be accessed via the script `train.py`. + +```bash +python train.py [OPTIONS] +``` +Additional options for using `train.py` may be listed using the `-h/--help` flag. + +The basic steps for training a new model are: +0. Pre-process domain-peptide interaction data. + +By default, the training code assumes that pre-processed data are located at `data/train/`, which can be downloaded (see [Data section](#data)). New data must be passed explicitly to the code (see the next section). + +A script for converting domain-peptide interaction data into the appropriate format for use with the model is available at `convert_binding_data.py`. The input format for this script is a csv file (no header) with the format: +``` +Domain-Type,Aligned-Domain-Sequence,Peptide-Type,Aligned-Peptidic-Sequence +``` +An example of the input file type is included with the downloaded data (`domain_peptide_train.csv`). Domain and peptide protein identifiers are typically UniProtKB IDs; however, this is not required. Domain type refers to the class of binding domain (e.g. SH2 domains). + + +To process the data, run the command: +```bash +python convert_binding_data.py [INPUT DATA FILE] [OUTPUT DATA DIRECTORY] +``` +The processed data is output to the `OUTPUT DATA DIRECTORY` argument with the data split into directories by domain-type. Each directory contains data in two formats: `tf-records/` and `hdf5-records`. Additional options are detailed using the `-h/--help` flag. + +1. Train new models + +Typically, models should be trained using the command: + +```bash +python train.py [VALIDATION INDEX] (-d [DOMAIN ...] | -a) (--generic | --shared_basis | --hierarchical) +``` + +The `VALIDATION INDEX` denotes data chunk that is excluded from the training process. The next argument, `-d [DOMAIN ...] | -a`, identifies the domains used in training the model. `-d` (or `--domains`) specifies a single or a subset of domains to train on. `a` (or `--all_domains`) specifies use all domains available. The final argument `(--generic | --shared_basis | --hierarchical)` specifies the model type: `--generic` specifies HSM/ID , `--hierarchical` denotes HSM/D, `--shared_basis` denotes HSM/D models trained for a single domain. + +Additional command-line options facilitate model training / optimization (*e.g.* regularization parameters) and are detailed with the help command. + +2. Predict and assess performance + +Data for the training process are typically output to `train/outputs/`. Processing the output directory can be accomplished using the `assess_performance.py` script: + +```bash +python assess_performance.py [INPUT DIRECTORY] +``` +where `INPUT DIRECTORY` denotes the path to the previously output directory. To control model training output, it can be helpful to re-direct outputs using the `--output_directory` command when running `train.py`. + +3. Finalize model + +To predict a combined model (*i.e.* using all training data) add the `--include_all` flag to the code + +```bash +python train.py [TEST INDEX] --include_all +``` + +Models (for use with the `predict` code) may be output using the `output_models.py` script: +```bash +python output_models.py [RESULTS FILE] [OUTPUT DIRECTORY] +``` +where `RESULTS FILE` denotes a file output by the `train.py` script and `OUTPUT DIRECTORY` the directory to place the processed models into (each domain type occupies one model file). + diff --git a/train/convert_binding_data.py b/train/convert_binding_data.py new file mode 100644 index 0000000..8111e3b --- /dev/null +++ b/train/convert_binding_data.py @@ -0,0 +1,141 @@ +import os +import h5py, csv +from collections import * +from itertools import * +from tqdm import tqdm as tqdm + +import numpy as np + +def _vectorize_sequence(sequence, amino_acid_ordering): + """ + Computes a one-hot embedding of an input sequence. + + Returns: + - list. Non-zero indices of one-hot embedding matrix of a sequence. + Non-flattened, this matrix has dimensions: + (sequence length, # of amino acids represented) + """ + aa_len = len(amino_acid_ordering) + + vectorized = list() + + sequence_indexed = [(sidx, saa) for sidx, saa in enumerate(sequence) if saa in amino_acid_ordering] + for sidx, saa in sequence_indexed: + idxed = sidx * aa_len + amino_acid_ordering[saa] + + vectorized.append(idxed) + + # Pad to equal length + npad = len(sequence) - len(vectorized) + vectorized.extend(-1 for _ in range(npad)) + + return vectorized + +def _vectorize_interaction(domain_sequence, peptide_sequence, amino_acid_ordering): + """ + Computes a one-hot embedding of the interaction between the domain- and peptidic-sequence. + + Returns: + - list. Non-zero indices for the interaction between domain and peptide sequences. + Non-flattened, this matrix has dimensions: + (domain sequence length, peptide sequence length, # of amino acids represented, # of amino acids represented) + + """ + + aa_len = len(amino_acid_ordering) + domain_idx_offset = len(peptide_sequence) * aa_len * aa_len + peptide_idx_offset = aa_len * aa_len + + vectorized = list() + + domain_indexed = [(didx, daa) for didx, daa in enumerate(domain_sequence) if daa in amino_acid_ordering] + peptide_indexed = [(pidx, paa) for pidx, paa in enumerate(peptide_sequence) if paa in amino_acid_ordering] + + for (didx, daa), (pidx, paa) in product(domain_indexed, peptide_indexed): + + idxed = didx * domain_idx_offset + pidx * peptide_idx_offset + amino_acid_ordering[daa] * aa_len + amino_acid_ordering[paa] + vectorized.append(idxed) + + # Pad to equal length + npad = len(domain_sequence) * len(peptide_sequence) - len(vectorized) + vectorized.extend(-1 for _ in range(npad)) + + return vectorized + +def _load_binding_data(input_data_file, amino_acid_ordering, progressbar=False): + """ + Input data format should be a csv file of the form: + Domain-Type,Domain-Protein-Identifier,Aligned-Domain-Sequence,Peptide-Type,Peptide-Protein-Identifier,Aligned-Peptidic-Sequence + + An iterator that returns data grouped by model type. Model types are automatically inferred from the input data file. + """ + get_model_type = lambda row: (row[0], row[2], len(row[1]), len(row[3])) + + model_types, total = set(), 0 + for row in csv.reader(open(input_data_file, 'r')): + total += 1 + model_type = get_model_type(row) + + model_types.add(model_type) + + for model_type in tqdm(model_types, disable=(not progressbar), desc="Model types"): + binds = list() + domain_seqs, peptide_seqs, interact_seqs = list(), list(), list() + + for row in tqdm(csv.reader(open(input_data_file, 'r')), disable=(not progressbar), desc="Data processing", total=total): + if get_model_type(row) != model_type: continue + + b = 1 if float(row[4]) > 0 else 0 + binds.append(b) + + domain_seqs.append(_vectorize_sequence(row[1], amino_acid_ordering)) + peptide_seqs.append(_vectorize_sequence(row[3], amino_acid_ordering)) + interact_seqs.append(_vectorize_interaction(row[1], row[3], amino_acid_ordering)) + + binds = np.array(binds) + vectorized = [np.array(a) for a in [domain_seqs, peptide_seqs, interact_seqs]] + + yield model_type, vectorized, binds + +def convert_binding_data(input_data_file, output_data_directory, amino_acid_ordering, progressbar=False): + """ + Function that converts data. Mostly wraps functions above. + """ + + assert os.path.exists(output_data_directory) + + model_fmt = list() + + processed_binding_data = defaultdict(list) + for model_type, seqs_vectorized, binds in _load_binding_data(input_data_file, amino_acid_ordering, progressbar=progressbar): + model_odirname = "{0}_{1}".format(*model_type) + model_odirpath = os.path.join(output_data_directory,model_odirname) + os.mkdir(model_odirpath) + + model_fmt.append((*model_type, model_odirname)) + + np.save(os.path.join(model_odirpath, 'binding.npy'), np.array(list(binds))) + + output_files = ["dseq_mtx.npy", "pseq_mtx.npy", "iseqs_mtx.npy"] + for seqs, ofname in zip(seqs_vectorized, output_files): + np.save(os.path.join(model_odirpath, ofname), seqs) + + with open(os.path.join(output_data_directory, 'amino_acid_ordering.txt'), 'w+') as f: + amino_acids_list = [aa for aa, idx in sorted(amino_acid_ordering.items(), key=lambda x: x[1])] + f.write('\n'.join(amino_acids_list)) + + with open(os.path.join(output_data_directory, 'models_specification.csv'), 'w+') as f: + writer = csv.writer(f, delimiter=',') + writer.writerows(model_fmt) + +if __name__=='__main__': + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("input_data_file", type=str) + parser.add_argument("output_data_directory", type=str) + parser.add_argument("-a", "--amino_acid_ordering", type=str, default="../data/amino_acid_ordering.txt") + parser.add_argument("-p", "--progressbar", action='store_true', default=False) + opts = parser.parse_args() + + aa_order = {aa.strip():idx for idx, aa in enumerate(open(opts.amino_acid_ordering, 'r'))} + convert_binding_data(opts.input_data_file, opts.output_data_directory, aa_order, progressbar=opts.progressbar) diff --git a/train/models/__init__.py b/train/models/__init__.py new file mode 100644 index 0000000..c4a0300 --- /dev/null +++ b/train/models/__init__.py @@ -0,0 +1,3 @@ +from . import hsm_id +from . import hsm_d_singledomain +from . import hsm_d diff --git a/train/models/constants.py b/train/models/constants.py new file mode 100644 index 0000000..680432e --- /dev/null +++ b/train/models/constants.py @@ -0,0 +1,52 @@ + + +""" +Hyper-parameters for model training +""" +DEF_lambda = 1e-5 +DEF_learning_rate = 1e-4 +DEF_init_std = 1e-2 +DEF_epochs = 100 +DEF_validate_step = 10 +DEF_chunk_size = 512 + +DEF_basis_size = 100 + +DEF_fold_seed = 0 +DEF_n_folds = 8 + +KEY_validation_chunk = "Validation Chunk Index" + +KEY_learning_rate = "Learning Rate" +KEY_lambdas = "Lambda Params" +KEY_standard_dev = "Standard Deviation" +KEY_epochs = "Epochs" +KEY_validate_step = "Validate Step" + +KEY_chunk_seed = "Chunking Seed" +KEY_exclude_chunks = "Exclude Chunks" +KEY_include_all_chunks = "Include All Chunks" +KEY_chunk_size = "Chunk Size" +KEY_n_folds = "Number of folds" + +KEY_basis_size = "Basis Size" +KEY_input_basis = "Input Basis Filepath" +KEY_train_basis = "Train Basis" + +KEY_output_directory = "Output Directory" + +binding_file = "binding.npy" +domain_file = "dseq_mtx.npy" +peptide_file = "pseq_mtx.npy" +interaction_file = "iseqs_mtx.npy" + +model_specification_file = "models_specification.csv" + +from collections import namedtuple +ModelSpecificationTuple = namedtuple("ModelSpecificationTuple", [ + "domain_type", + "peptide_type", + "domain_length", + "peptide_length", + "directory" + ]) diff --git a/train/models/hsm_d.py b/train/models/hsm_d.py new file mode 100644 index 0000000..2d18044 --- /dev/null +++ b/train/models/hsm_d.py @@ -0,0 +1,330 @@ +import os +import json +from collections import * +from itertools import * +from tqdm import tqdm as tqdm + +import numpy as np +import tensorflow as tf +from sklearn.metrics import roc_auc_score + +from . import utils +from . import constants + +class HSMDomainsModel(object): + """ + Multi-domain model implementing the HSM / D model. + + Most arguments are passed into the model using the args initialization option. See constants.py for the + function for the keywords to use to set different options values. + + The required arguments are: + - validation_chunk: an index that is validated against. + - model_format: a ModelSpecificationTuple, as defined in constants.py + - lambda_params: a nested python dictionary; nested for domain type then peptide type. + - amino_acid_ordering: a dict that indexes the vectorized data. + - args: all additional arguments; see constants.py + """ + + def __init__(self, validation_chunk, models_format, lambda_params, amino_acid_ordering, args=dict()): + ## model_format inputs are namedtuples: + # ModelSpecificationTuple = namedtuple("ModelSpecificationTuple", ["domain_type", "peptide_type", "domain_length", "peptide_length", "directory"]) + self.model_specs = dict() + for model_format in models_format: + self.model_specs[(model_format.domain_type, model_format.peptide_type)] = model_format + self.n_amino_acids = len(amino_acid_ordering) + + # Set model hyper-parameters. Defaults to constants in the case where + self.epochs = args.get(constants.KEY_epochs, constants.DEF_epochs) + self.data_split_seed = args.get(constants.KEY_chunk_seed, constants.DEF_fold_seed) + self.n_folds = args.get(constants.KEY_n_folds, constants.DEF_n_folds) + self.chunk_size = args.get(constants.KEY_chunk_size, constants.DEF_chunk_size) + self.learning_rate = args.get(constants.KEY_learning_rate, constants.DEF_learning_rate) + self.init_std = args.get(constants.KEY_standard_dev, constants.DEF_init_std) + + self.lambda_params = dict() + for dtype, _lambdas in lambda_params.items(): + for ptype, lambdap in _lambdas.items(): + self.lambda_params[(dtype, ptype)] = lambdap + + self.validation_chunk = validation_chunk + self.validate_step = args.get(constants.KEY_validate_step, constants.DEF_validate_step) + self.exclude_indices = args.get(constants.KEY_exclude_chunks, None) + self.include_all_data = args.get(constants.KEY_include_all_chunks, False) + + # Basis set size specification. + self.basis_size = args.get(constants.KEY_basis_size, constants.DEF_basis_size) + self.input_basis = args.get(constants.KEY_input_basis, None) + self.train_basis = args.get(constants.KEY_train_basis, False) + + # Initialize tensorflow graph. + self.sess = tf.Session() + self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate) + + self._variables(self.init_std) + + self._model(args) + + init_op = tf.group(tf.local_variables_initializer(), tf.global_variables_initializer()) + self.sess.run(init_op) + + + def _variables(self, initialization_std): + """ + Intializes variable. Shouldn't be called outside of the class (called from __init__). + """ + # Define basis weights a bit earlier. + basis_weights_shape = (self.basis_size, self.n_amino_acids ** 2) + if self.input_basis is not None: + # Load basis weights from pre-loaded option + basis = np.load(self.input_basis) + self.basis_weights = tf.Variable(basis, name='basis_weights', dtype=tf.float64, trainable=self.train_basis) + else: + # Randomly initialize basis weights + self.basis_weights = tf.Variable(np.random.normal(scale=initialization_std, size=basis_weights_shape), + name="basis_weights", dtype=tf.float64, trainable=self.train_basis) + + PlaceholdersTuple = namedtuple("PlaceholdersTuple", ["domains", "peptides", "interacts", "binds"]) + WeightsTuple = namedtuple("WeightsTuple", ["domains", "peptides", "interacts", "bias"]) + + # Placeholders and variables for different domain models + self.placeholders, self.weights = dict(), dict() + for (dtype, ptype), model_fmt in self.model_specs.items(): + self.placeholders[(dtype, ptype)] = PlaceholdersTuple( + domains = tf.sparse.placeholder(dtype=tf.float64, name="domain_data_mtx_{0}_{1}".format(dtype, ptype)), + peptides = tf.sparse.placeholder(dtype=tf.float64, name="peptide_data_mtx_{0}_{1}".format(dtype, ptype)), + interacts = tf.sparse.placeholder(dtype=tf.float64, name="interaction_data_mtx_{0}_{1}".format(dtype, ptype)), + binds = tf.placeholder(dtype=tf.float64, name="binding_data_mtx_{0}_{1}".format(dtype, ptype)) + ) + + dlen, plen = model_fmt.domain_length, model_fmt.peptide_length + + domain_weights_shape = (dlen * self.n_amino_acids, 1) + dw = tf.Variable(np.random.normal(scale=initialization_std, size=domain_weights_shape), + name="domain_weights_{0}_{1}".format(dtype, ptype), dtype=tf.float64) + + peptide_weights_shape = (plen * self.n_amino_acids, 1) + pw = tf.Variable(np.random.normal(scale=initialization_std, size=peptide_weights_shape), + name="peptide_weights_{0}_{1}".format(dtype, ptype), dtype=tf.float64) + + interact_weights_shape = (dlen * plen, self.basis_size) + iw = tf.Variable(np.random.normal(scale=initialization_std, size=interact_weights_shape), + name="interact_weights_{0}_{1}".format(dtype, ptype), dtype=tf.float64) + + bias = tf.Variable(0, name="bias_{0}_{1}".format(dtype, ptype), dtype=tf.float64) + + self.weights[(dtype, ptype)] = WeightsTuple( + domains=dw, + peptides=pw, + interacts=iw, + bias=bias + ) + + def _model(self, args): + """ + Defines model graph. Shouldn't be called outside of class (called from __init__). + """ + # Different values can be accessed with these dictionaries throughout the model. + self.sparsity_penalties = dict() + self.logits, self.cross_entropies = dict(), dict() + self.predict_ops = dict() + + for (dtype, ptype), model_fmt in self.model_specs.items(): + # Load placeholders, weights, and lambda for a specific model + # PlaceholdersTuple = namedtuple("PlaceholdersTuple", ["domains", "peptides", "interacts", "binds"]) + # WeightsTuple = namedtuple("WeightsTuple", ["domains", "peptides", "interacts", "bias"]) + + + placeholders = self.placeholders[(dtype, ptype)] + weights = self.weights[(dtype, ptype)] + lambdap = self.lambda_params[(dtype, ptype)] + + # Scoping it for each domain-peptide model. Helpful to use w/ tensorboard. + with tf.name_scope("compute_{0}_{1}".format(dtype, ptype)): + # Sparsity Penalty. Combined across domains below. + with tf.name_scope("sparsity"): + pen = (lambdap * tf.reduce_sum(tf.abs(weights.domains)) + + lambdap * tf.reduce_sum(tf.abs(weights.peptides)) + + lambdap * tf.reduce_sum(tf.abs(weights.interacts))) + self.sparsity_penalties[(dtype, ptype)] = pen + + # Logit computation. + with tf.name_scope("logits"): + # Basic model, computes logits from model weights. + domain_contrib = tf.sparse.sparse_dense_matmul(placeholders.domains, weights.domains) + peptide_contrib = tf.sparse.sparse_dense_matmul(placeholders.peptides, weights.peptides) + + _interact_weights = tf.expand_dims(tf.reshape(tf.matmul(weights.interacts, self.basis_weights), [-1]), -1) + interact_contrib = tf.sparse.sparse_dense_matmul(placeholders.interacts, _interact_weights) + + _logits = domain_contrib + peptide_contrib + interact_contrib + weights.bias + self.logits[(dtype, ptype)] = _logits + + # Cross entropy per-domain/peptide. Combined across domains below. + self.cross_entropies[(dtype, ptype)] = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=placeholders.binds, logits=_logits, name="cross_entropy")) + + # Predict ops for each domain. + self.predict_ops[(dtype, ptype)] = tf.math.sigmoid(_logits, name="predict") + + # Loss Function computation and optimization. + self.weight_penalty = tf.reduce_sum([v for v in self.sparsity_penalties.values()]) + self.cross_entropy = tf.reduce_sum([v for v in self.cross_entropies.values()]) + self.loss_function = self.cross_entropy + self.weight_penalty + + self.train_op = self.optimizer.minimize(self.loss_function) + + def optimize_step(self, data_iter_dct, nsteps): + """ + A single training epoch. Passes in an iterator over data and + + args: + - data_iter_dct: a set of data iterators feeding in data of the form (sequence_tuple, binds_arr) + sequence_tuple is ordered[domain sequences, peptide sequences, interact sequences] + """ + costs = list() + + for step in range(nsteps): + feed_dict = dict() + + for (dtype, ptype), model_format in self.model_specs.items(): + # PlaceholdersTuple = namedtuple("PlaceholdersTuple", ["domains", "peptides", "interacts", "binds"]) + placeholders = self.placeholders[(dtype, ptype)] + + # Convert to sparse tensors + sequences, binds = next(data_iter_dct[(dtype, ptype)]) + sparse_seqs = utils.make_sparse(sequences, model_format.domain_length, model_format.peptide_length, self.n_amino_acids) + + # _p is the placeholder matched with the value, _v + for _p, _v in zip(placeholders, [*sparse_seqs, np.expand_dims(binds, -1)]): + feed_dict[_p] = _v + + train_cost, _ = self.sess.run([self.loss_function, self.train_op], feed_dict=feed_dict) + costs.append(train_cost) + + return costs + + def predict(self, data_dct): + """ + Make a single prediction. + + args: + - data_dct: a data tuple of the form InteractionDataTuple as defined in utils.py + """ + predictions_dct = dict() + + # This can be done in one call. Splitting it up for reader ease. + for (dtype, ptype), data in data_dct.items(): + binds, sequences = data[0], list(data[1:]) + + # Convert to sparse tensors. + model_fmt = self.model_specs[(dtype, ptype)] + dseqs_sp, pseqs_sp, iseqs_sp = utils.make_sparse(sequences, model_fmt.domain_length, model_fmt.peptide_length, self.n_amino_acids) + + placeholder = self.placeholders[(dtype, ptype)] + + feed_dict = { + placeholder.domains: dseqs_sp, + placeholder.peptides: pseqs_sp, + placeholder.interacts: iseqs_sp, + placeholder.binds: np.expand_dims(binds, -1) + } + + # Get domain-specific predict_op and predict. + predict_op = self.predict_ops[(dtype, ptype)] + predictions = self.sess.run(predict_op, feed_dict=feed_dict) + roc_auc = roc_auc_score(binds, predictions) + + predictions_dct[(dtype, ptype)] = (binds, predictions, roc_auc) + + return predictions_dct + + def train(self, data_directory): + """ + Fully train the model given the input parameters. Runs for the input / default number of epochs. + Mostly just wraps the optimize_step function. + + args: + - data_directory: the input data directory. The data is split using the input metadata. + """ + + # Initialize train and test data splits. + train_data_dct, test_data_dct = dict(), dict() + for (dtype, ptype), model_fmt in self.model_specs.items(): + # model_fmt is a ModelSpecificationTuple = namedtuple("ModelSpecificationTuple", ["domain_type", "peptide_type", "domain_length", "peptide_length", "directory"]) + domain_data_directory = os.path.join(data_directory, model_fmt.directory) + train_data, test_data = utils.split_data(domain_data_directory, self.validation_chunk, include_all=self.include_all_data, excluded_chunks=self.exclude_indices, n_folds=self.n_folds, seed=self.data_split_seed) + + train_data_dct[(dtype, ptype)] = train_data + test_data_dct[(dtype, ptype)] = test_data + + # Variable input sizes. Calculate the maximum number of chunks needed for any one domain + # given the input max chunk size. + n_chunks = max((v[0].shape[0] // self.chunk_size) + 1 for v in train_data_dct.values()) + + # Initial data iterators. + data_iterators_dct = dict() + for (dtype, ptype), train_data in train_data_dct.items(): + data_iterators_dct[(dtype, ptype)] = utils.training_iterator(train_data, nchunks=n_chunks) + + self.costs = list() + for epoch in tqdm(range(self.epochs), desc="Epochs"): + epoch_iters_dct = {k:next(v) for k,v in data_iterators_dct.items()} + epoch_costs = self.optimize_step(epoch_iters_dct, n_chunks) + self.costs.append(epoch_costs) + + if (epoch + 1) % self.validate_step == 0: + _perf = self.predict(test_data_dct) + for (dtype, ptype), (_, _, auc) in _perf.items(): + print("AUC: {0} (Epoch: {1}; Domain: {2}, Peptide: {3})".format(epoch+1, auc, dtype, ptype)) + + self.final_predictions = self.predict(test_data_dct) + + def save_model(self, output_directory): + """ + Save model outputs from a trained model. The output files are of the form: .. + The id is defined as the current time (as a string). The spec defines the output, either metadata + or the domain type of output. ext is the file extension, either a numpy file or a json file. + + args: + - output_directory: a string defining a directory to output the saved model to. + """ + import time + + model_output_id = str(time.time()).replace("-", "_") + + # Metadata. Saved as dict (to JSON file) with four keys defining model type, + # model format, the results of the last prediction, and parameters + results = list() + for (dtype, ptype), preds in self.final_predictions.items(): + _r = {"binds": list(map(int, preds[0])), + "predictions": list(map(float, preds[1])), + "auc": float(preds[2]) + } + + results.append([[dtype, ptype], _r]) + output = { + "Model Type": "HSM / ID", + "Model Formats": [list(fmt) for fmt in self.model_specs.values()], + "results": results, + "parameters": { + "costs": self.costs, + "validation chunk": self.validation_chunk, + constants.KEY_chunk_seed: self.data_split_seed, + constants.KEY_n_folds: self.n_folds, + constants.KEY_epochs: self.epochs, + constants.KEY_learning_rate: self.learning_rate + } + } + + ometadata_file = os.path.join(output_directory, "{0}.metadata.json".format(model_output_id)) + with open(ometadata_file, 'w+') as f: + json.dump(output, f) + + # The model, saved as an array archive (.npz) file. + for (dtype, ptype), weights in self.weights.items(): + model_file = os.path.join(output_directory, "{0}.{1}.{2}.npz".format(model_output_id, dtype, ptype)) + + # WeightsTuple = namedtuple("WeightsTuple", ["domains", "peptides", "interacts", "bias"]) + arr = [weights.domains, weights.peptides, weights.interacts, self.basis_weights, weights.bias] + dw, pw, iw, basis, bias = self.sess.run(arr) + np.savez(model_file, domain_weights=dw, peptide_weights=pw, interact_weights=iw, basis_weights=basis, bias=bias) diff --git a/train/models/hsm_d_singledomain.py b/train/models/hsm_d_singledomain.py new file mode 100644 index 0000000..00b1985 --- /dev/null +++ b/train/models/hsm_d_singledomain.py @@ -0,0 +1,249 @@ +import os +import json +from collections import * +from itertools import * +from tqdm import tqdm as tqdm + +import numpy as np +import tensorflow as tf +from sklearn.metrics import roc_auc_score + +from . import utils +from . import constants + +class HSMSingleDomainsModel(object): + """ + Single-domain model implementing the HSM / D model. Although not focused on in the paper, this is just + the HSM/D model applied to only one domain. + + Most arguments are passed into the model using the args initialization option. See constants.py for the + function for the keywords to use to set different options values. + + The required arguments are: + - validation_chunk: an index that is validated against. + - model_format: a ModelSpecificationTuple, as defined in constants.py + - lambda_params: a nested python dictionary; nested for domain type then peptide type. + - amino_acid_ordering: a dict that indexes the vectorized data. + - args: all additional arguments; see constants.py + """ + + def __init__(self, validation_chunk, model_format, lambda_params, amino_acid_ordering, args=dict()): + ## model_format inputs are namedtuples: + # ModelSpecificationTuple = namedtuple("ModelSpecificationTuple", ["domain_type", "peptide_type", "domain_length", "peptide_length", "directory"]) + self.model_spec = model_format + self.domain_length = model_format.domain_length + self.peptide_length = model_format.peptide_length + self.n_amino_acids = len(amino_acid_ordering) + + # Set model hyper-parameters. Defaults to constants in the case where + self.epochs = args.get(constants.KEY_epochs, constants.DEF_epochs) + self.data_split_seed = args.get(constants.KEY_chunk_seed, constants.DEF_fold_seed) + self.n_folds = args.get(constants.KEY_n_folds, constants.DEF_n_folds) + self.chunk_size = args.get(constants.KEY_chunk_size, constants.DEF_chunk_size) + self.learning_rate = args.get(constants.KEY_learning_rate, constants.DEF_learning_rate) + self.init_std = args.get(constants.KEY_standard_dev, constants.DEF_init_std) + self.lambda_param = lambda_params[model_format.domain_type][model_format.peptide_type] + + self.validation_chunk = validation_chunk + self.validate_step = args.get(constants.KEY_validate_step, constants.DEF_validate_step) + self.exclude_indices = args.get(constants.KEY_exclude_chunks, None) + self.include_all_data = args.get(constants.KEY_include_all_chunks, False) + + # Basis set size specification. + self.basis_size = args.get(constants.KEY_basis_size, constants.DEF_basis_size) + self.input_basis = args.get(constants.KEY_input_basis, None) + self.train_basis = args.get(constants.KEY_train_basis, False) + + # Initialize tensorflow graph. + self.sess = tf.Session() + self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate) + + self._variables(self.init_std) + + self._model(args) + + init_op = tf.group(tf.local_variables_initializer(), tf.global_variables_initializer()) + self.sess.run(init_op) + + + def _variables(self, initialization_std): + """ + Intializes variable. Shouldn't be called outside of the class (called from __init__). + """ + # Placeholders for different input data. + self.domain_data = tf.sparse.placeholder(dtype=tf.float64, name="domain_data_mtx") + self.peptide_data = tf.sparse.placeholder(dtype=tf.float64, name="peptide_data_mtx") + self.interact_data = tf.sparse.placeholder(dtype=tf.float64, name="interaction_data_mtx") + self.binding_data = tf.placeholder(dtype=tf.float64, name="binding_data_mtx") + + # Weights variables to be trained in the model. + domain_weights_shape = (self.domain_length * self.n_amino_acids, 1) + self.domain_weights = tf.Variable(np.random.normal(scale=initialization_std, size=domain_weights_shape), + name="domain_weights", dtype=tf.float64) + + peptide_weights_shape = (self.peptide_length * self.n_amino_acids, 1) + self.peptide_weights = tf.Variable(np.random.normal(scale=initialization_std, size=peptide_weights_shape), + name="peptide_weights", dtype=tf.float64) + + interact_weights_shape = (self.domain_length * self.peptide_length, self.basis_size) + basis_weights_shape = (self.basis_size, self.n_amino_acids ** 2) + + ## NOTE: This is one of the major changes. Here, we define separate weights and basis functions. Multiplied below. + self.interaction_weights = tf.Variable(np.random.normal(scale=initialization_std, size=interact_weights_shape), + name="interaction_weights", dtype=tf.float64) + if self.input_basis is not None: + # Load basis weights from pre-loaded option + basis = np.load(self.input_basis) + self.basis_weights = tf.Variable(basis, name='basis_weights', dtype=tf.float64, trainable=self.train_basis) + else: + # Randomly initialize basis weights + self.basis_weights = tf.Variable(np.random.normal(scale=initialization_std, size=basis_weights_shape), + name="basis_weights", dtype=tf.float64, trainable=self.train_basis) + + self.bias = tf.Variable(0, name="bias", dtype=tf.float64) + + def _model(self, args): + """ + Defines model graph. Shouldn't be called outside of class (called from __init__). + """ + with tf.name_scope("sparsity"): + # L1-sparsity penalty. + self.weight_penalty = (self.lambda_param * tf.reduce_sum(tf.abs(self.domain_weights)) + + self.lambda_param * tf.reduce_sum(tf.abs(self.peptide_weights)) + + self.lambda_param * tf.reduce_sum(tf.abs(self.interaction_weights))) + + with tf.name_scope("logits"): + # Basic model, computes logits from model weights. + domain_contrib = tf.sparse.sparse_dense_matmul(self.domain_data, self.domain_weights) + peptide_contrib = tf.sparse.sparse_dense_matmul(self.peptide_data, self.peptide_weights) + + ## NOTE: The other major change. Multiply together the weights and basis functions. + _interact_weights = tf.expand_dims(tf.reshape(tf.matmul(self.interaction_weights, self.basis_weights), [-1]), -1) + interact_contrib = tf.sparse.sparse_dense_matmul(self.interact_data, _interact_weights) + + self.logits = domain_contrib + peptide_contrib + interact_contrib + self.bias + + # Loss Function computation and optimization. + self.cross_entropy = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=self.binding_data, logits=self.logits, name="cross_entropy")) + self.loss_function = self.cross_entropy + self.weight_penalty + + self.train_op = self.optimizer.minimize(self.loss_function) + + # Ops for computing predictions. + self.predict_op = tf.math.sigmoid(self.logits, name="predict_sigmoid") + + def optimize_step(self, data_iter): + """ + A single training epoch. Passes in an iterator over data and + + args: + - data_iter: a data iterator feeding in data of the form (sequence_tuple, binds_arr) + sequence_tuple is ordered[domain sequences, peptide sequences, interact sequences] + """ + costs = list() + for sequences, binds in data_iter: + # Convert to sparse tensors for use in the model. + dseqs_sp, pseqs_sp, iseqs_sp = utils.make_sparse(sequences, self.domain_length, self.peptide_length, self.n_amino_acids) + + # Using feed_dicts is a bit unfortunate. It cleans up the data file handling on the + # front end. To optimize, switch over to TensorFlow's data processing pipeline. + feed_dict = { + self.domain_data: dseqs_sp, + self.peptide_data: pseqs_sp, + self.interact_data: iseqs_sp, + self.binding_data: np.expand_dims(binds, -1) + } + + # Compute step. + train_cost, _ = self.sess.run([self.loss_function, self.train_op], feed_dict=feed_dict) + costs.append(train_cost) + + return costs + + def predict(self, data): + """ + Make a single prediction. + + args: + - data: a data tuple of the form InteractionDataTuple as defined in utils.py + """ + binds, sequences = data[0], list(data[1:]) + + # Convert to sparse tensors. + dseqs_sp, pseqs_sp, iseqs_sp = utils.make_sparse(sequences, self.domain_length, self.peptide_length, self.n_amino_acids) + + feed_dict = { + self.domain_data: dseqs_sp, + self.peptide_data: pseqs_sp, + self.interact_data: iseqs_sp, + self.binding_data: np.expand_dims(binds, -1) + } + + predictions = self.sess.run(self.predict_op, feed_dict=feed_dict) + roc_auc = roc_auc_score(binds, predictions) + return binds, predictions, roc_auc + + def train(self, data_directory): + """ + Fully train the model given the input parameters. Runs for the input / default number of epochs. + Mostly just wraps the optimize_step function. + + args: + - data_directory: the input data directory. The data is split using the input metadata. + """ + train_data, test_data = utils.split_data(data_directory, self.validation_chunk, include_all=self.include_all_data, excluded_chunks=self.exclude_indices, n_folds=self.n_folds, seed=self.data_split_seed) + data_iterator = utils.training_iterator(train_data, chunk_size=self.chunk_size) + + self.costs = list() + for epoch in tqdm(range(self.epochs), desc="Epochs"): + epoch_costs = self.optimize_step(next(data_iterator)) + self.costs.append(epoch_costs) + + if (epoch + 1) % self.validate_step == 0: + _, _, auc = self.predict(test_data) + print("AUC: {0} (Epoch: {1})".format(epoch+1, auc)) + + self.final_predictions = self.predict(test_data) + + def save_model(self, output_directory): + """ + Save model outputs from a trained model. The output files are of the form: .. + The id is defined as the current time (as a string). The spec defines the output, either metadata + or the domain type of output. ext is the file extension, either a numpy file or a json file. + + args: + - output_directory: a string defining a directory to output the saved model to. + """ + import time + + model_output_id = str(time.time()).replace("-", "_") + + # Metadata. Saved as dict (to JSON file) with four keys defining model type, + # model format, the results of the last prediction, and parameters + results = { + "binds": list(map(int, self.final_predictions[0])), + "predictions": list(map(float, self.final_predictions[1])), + "auc": float(self.final_predictions[2]) + } + output = { + "Model Type": "HSM / ID", + "Model Format": list(self.model_spec), + "results": results, + "parameters": { + "costs": self.costs, + "validation chunk": self.validation_chunk, + constants.KEY_chunk_seed: self.data_split_seed, + constants.KEY_n_folds: self.n_folds, + constants.KEY_epochs: self.epochs, + constants.KEY_learning_rate: self.learning_rate + } + } + + ometadata_file = os.path.join(output_directory, "{0}.metadata.json".format(model_output_id)) + with open(ometadata_file, 'w+') as f: + json.dump(output, f) + + # The model, saved as an array archive (.npz) file. + model_file = os.path.join(output_directory, "{0}.{1}.{2}.npz".format(model_output_id, self.model_spec.domain_type, self.model_spec.peptide_type)) + dw, pw, iw, basis, bias = self.sess.run([self.domain_weights, self.peptide_weights, self.interaction_weights, self.basis_weights, self.bias]) + np.savez(model_file, domain_weights=dw, peptide_weights=pw, interact_weights=iw, basis=basis, bias=bias) diff --git a/train/models/hsm_id.py b/train/models/hsm_id.py new file mode 100644 index 0000000..a81364c --- /dev/null +++ b/train/models/hsm_id.py @@ -0,0 +1,229 @@ +import os +import json +from collections import * +from itertools import * +from tqdm import tqdm as tqdm + +import numpy as np +import tensorflow as tf +from sklearn.metrics import roc_auc_score + +from . import utils +from . import constants + +class HSMIndependentDomainsModel(object): + """ + Single-domain model implementing the HSM / ID model. + + Most arguments are passed into the model using the args initialization option. See constants.py for the + function for the keywords to use to set different options values. + + The required arguments are: + - validation_chunk: an index that is validated against. + - model_format: a ModelSpecificationTuple, as defined in constants.py + - lambda_params: a nested python dictionary; nested for domain type then peptide type. + - amino_acid_ordering: a dict that indexes the vectorized data. + - args: all additional arguments; see constants.py + """ + + def __init__(self, validation_chunk, model_format, lambda_params, amino_acid_ordering, args=dict()): + ## model_format inputs are namedtuples: + # ModelSpecificationTuple = namedtuple("ModelSpecificationTuple", ["domain_type", "peptide_type", "domain_length", "peptide_length", "directory"]) + self.model_spec = model_format + self.domain_length = model_format.domain_length + self.peptide_length = model_format.peptide_length + self.n_amino_acids = len(amino_acid_ordering) + + # Set model hyper-parameters. Defaults to constants in the case where + self.epochs = args.get(constants.KEY_epochs, constants.DEF_epochs) + self.data_split_seed = args.get(constants.KEY_chunk_seed, constants.DEF_fold_seed) + self.n_folds = args.get(constants.KEY_n_folds, constants.DEF_n_folds) + self.chunk_size = args.get(constants.KEY_chunk_size, constants.DEF_chunk_size) + self.learning_rate = args.get(constants.KEY_learning_rate, constants.DEF_learning_rate) + self.init_std = args.get(constants.KEY_standard_dev, constants.DEF_init_std) + self.lambda_param = lambda_params[model_format.domain_type][model_format.peptide_type] + + self.validation_chunk = validation_chunk + self.validate_step = args.get(constants.KEY_validate_step, constants.DEF_validate_step) + self.exclude_indices = args.get(constants.KEY_exclude_chunks, None) + self.include_all_data = args.get(constants.KEY_include_all_chunks, False) + + # Initialize tensorflow graph. + self.sess = tf.Session() + self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate) + + self._variables(self.init_std) + + self._model(args) + + init_op = tf.group(tf.local_variables_initializer(), tf.global_variables_initializer()) + self.sess.run(init_op) + + + def _variables(self, initialization_std): + """ + Intializes variable. Shouldn't be called outside of the class (called from __init__). + """ + # Placeholders for different input data. + self.domain_data = tf.sparse.placeholder(dtype=tf.float64, name="domain_data_mtx") + self.peptide_data = tf.sparse.placeholder(dtype=tf.float64, name="peptide_data_mtx") + self.interact_data = tf.sparse.placeholder(dtype=tf.float64, name="interaction_data_mtx") + self.binding_data = tf.placeholder(dtype=tf.float64, name="binding_data_mtx") + + # Weights variables to be trained in the model. + domain_weights_shape = (self.domain_length * self.n_amino_acids, 1) + self.domain_weights = tf.Variable(np.random.normal(scale=initialization_std, size=domain_weights_shape), + name="domain_weights", dtype=tf.float64) + + peptide_weights_shape = (self.peptide_length * self.n_amino_acids, 1) + self.peptide_weights = tf.Variable(np.random.normal(scale=initialization_std, size=peptide_weights_shape), + name="peptide_weights", dtype=tf.float64) + + interaction_weights_shape = (self.domain_length * self.peptide_length * (self.n_amino_acids ** 2), 1) + self.interaction_weights = tf.Variable(np.random.normal(scale=initialization_std, size=interaction_weights_shape), + name="interaction_weights", dtype=tf.float64) + + self.bias = tf.Variable(0, name="bias", dtype=tf.float64) + + def _model(self, args): + """ + Defines model graph. Shouldn't be called outside of class (called from __init__). + """ + with tf.name_scope("sparsity"): + # L1-sparsity penalty. + self.weight_penalty = (self.lambda_param * tf.reduce_sum(tf.abs(self.domain_weights)) + + self.lambda_param * tf.reduce_sum(tf.abs(self.peptide_weights)) + + self.lambda_param * tf.reduce_sum(tf.abs(self.interaction_weights))) + + with tf.name_scope("logits"): + # Basic model, computes logits from model weights. + domain_contrib = tf.sparse.sparse_dense_matmul(self.domain_data, self.domain_weights) + peptide_contrib = tf.sparse.sparse_dense_matmul(self.peptide_data, self.peptide_weights) + interact_contrib = tf.sparse.sparse_dense_matmul(self.interact_data, self.interaction_weights) + + self.logits = domain_contrib + peptide_contrib + interact_contrib + self.bias + + # Loss Function computation and optimization. + self.cross_entropy = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=self.binding_data, logits=self.logits, name="cross_entropy")) + self.loss_function = self.cross_entropy + self.weight_penalty + + self.train_op = self.optimizer.minimize(self.loss_function) + + # Ops for computing predictions. + self.predict_op = tf.math.sigmoid(self.logits, name="predict_sigmoid") + + def optimize_step(self, data_iter): + """ + A single training epoch. Passes in an iterator over data and + + args: + - data_iter: a data iterator feeding in data of the form (sequence_tuple, binds_arr) + sequence_tuple is ordered[domain sequences, peptide sequences, interact sequences] + """ + costs = list() + for sequences, binds in data_iter: + # Convert to sparse tensors for use in the model. + dseqs_sp, pseqs_sp, iseqs_sp = utils.make_sparse(sequences, self.domain_length, self.peptide_length, self.n_amino_acids) + + # Using feed_dicts is a bit unfortunate. It cleans up the data file handling on the + # front end. To optimize, switch over to TensorFlow's data processing pipeline. + feed_dict = { + self.domain_data: dseqs_sp, + self.peptide_data: pseqs_sp, + self.interact_data: iseqs_sp, + self.binding_data: np.expand_dims(binds, -1) + } + + # Compute step. + train_cost, _ = self.sess.run([self.loss_function, self.train_op], feed_dict=feed_dict) + costs.append(train_cost) + + return costs + + def predict(self, data): + """ + Make a single prediction. + + args: + - data: a data tuple of the form InteractionDataTuple as defined in utils.py + """ + binds, sequences = data[0], list(data[1:]) + + # Convert to sparse tensors. + dseqs_sp, pseqs_sp, iseqs_sp = utils.make_sparse(sequences, self.domain_length, self.peptide_length, self.n_amino_acids) + + feed_dict = { + self.domain_data: dseqs_sp, + self.peptide_data: pseqs_sp, + self.interact_data: iseqs_sp, + self.binding_data: np.expand_dims(binds, -1) + } + + predictions = self.sess.run(self.predict_op, feed_dict=feed_dict) + roc_auc = roc_auc_score(binds, predictions) + return binds, predictions, roc_auc + + def train(self, data_directory): + """ + Fully train the model given the input parameters. Runs for the input / default number of epochs. + Mostly just wraps the optimize_step function. + + args: + - data_directory: the input data directory. The data is split using the input metadata. + """ + train_data, test_data = utils.split_data(data_directory, self.validation_chunk, include_all=self.include_all_data, excluded_chunks=self.exclude_indices, n_folds=self.n_folds, seed=self.data_split_seed) + data_iterator = utils.training_iterator(train_data, chunk_size=self.chunk_size) + + self.costs = list() + for epoch in tqdm(range(self.epochs), desc="Epochs"): + epoch_costs = self.optimize_step(next(data_iterator)) + self.costs.append(epoch_costs) + + if (epoch + 1) % self.validate_step == 0: + _, _, auc = self.predict(test_data) + print("AUC: {0} (Epoch: {1})".format(epoch+1, auc)) + + self.final_predictions = self.predict(test_data) + + def save_model(self, output_directory): + """ + Save model outputs from a trained model. The output files are of the form: .. + The id is defined as the current time (as a string). The spec defines the output, either metadata + or the domain type of output. ext is the file extension, either a numpy file or a json file. + + args: + - output_directory: a string defining a directory to output the saved model to. + """ + import time + + model_output_id = str(time.time()).replace("-", "_") + + # Metadata. Saved as dict (to JSON file) with four keys defining model type, + # model format, the results of the last prediction, and parameters + results = { + "binds": list(map(int, self.final_predictions[0])), + "predictions": list(map(float, self.final_predictions[1])), + "auc": float(self.final_predictions[2]) + } + output = { + "Model Type": "HSM / ID", + "Model Format": list(self.model_spec), + "results": results, + "parameters": { + "costs": self.costs, + "validation chunk": self.validation_chunk, + constants.KEY_chunk_seed: self.data_split_seed, + constants.KEY_n_folds: self.n_folds, + constants.KEY_epochs: self.epochs, + constants.KEY_learning_rate: self.learning_rate + } + } + + ometadata_file = os.path.join(output_directory, "{0}.metadata.json".format(model_output_id)) + with open(ometadata_file, 'w+') as f: + json.dump(output, f) + + # The model, saved as an array archive (.npz) file. + model_file = os.path.join(output_directory, "{0}.{1}.{2}.npz".format(model_output_id, self.model_spec.domain_type, self.model_spec.peptide_type)) + dw, pw, iw, bias = self.sess.run([self.domain_weights, self.peptide_weights, self.interaction_weights, self.bias]) + np.savez(model_file, domain_weights=dw, peptide_weights=pw, interact_weights=iw, bias=bias) diff --git a/train/models/utils.py b/train/models/utils.py new file mode 100644 index 0000000..1c14ecc --- /dev/null +++ b/train/models/utils.py @@ -0,0 +1,83 @@ +import os +from collections import * +from itertools import * + +import numpy as np +import tensorflow as tf + +from . import constants + +binding_data = "binding.npy" +domain_sequences = "dseq_mtx.npy" +peptide_sequences = "pseq_mtx.npy" +interaction_sequences = "iseqs_mtx.npy" + +InteractionDataTuple = namedtuple("InteractionDataTuple", ["binds", "domain_seqs", "peptide_seqs", "interaction_seqs"]) +def split_data(data_directory, validation_chunk, include_all=False, excluded_chunks=None, n_folds=constants.DEF_n_folds, seed=constants.DEF_fold_seed): + files = [constants.binding_file, constants.domain_file, constants.peptide_file, constants.interaction_file] + data = [np.load(os.path.join(data_directory, f)) for f in files] + + data_sz = data[0].shape[0] + + np.random.seed(seed) + randomized = np.random.permutation(data_sz) + chunks = np.array_split(randomized, n_folds) + + vindices = chunks[validation_chunk] + + validation_data = InteractionDataTuple(*[mtx[vindices] for mtx in data]) + + if include_all: + _excluded = [] + else: + _excluded = [validation_chunk, *(excluded_chunks if excluded_chunks is not None else [])] + + tindices = [i for cidx, chunk in enumerate(chunks) for i in chunk if cidx not in _excluded] + train_data = InteractionDataTuple(*[mtx[tindices] for mtx in data]) + + np.random.seed() + + return train_data, validation_data + +def _data_chunker(data, nchunks): + randomized = np.random.permutation(data.binds.shape[0]) + splits = np.array_split(randomized, nchunks) + + for spl in splits: + binds_spl = data.binds[spl] + seqs_spl = [s[spl] for s in data[1:]] + + yield seqs_spl, binds_spl + +def training_iterator(interaction_data, chunk_size=None, nchunks=None): + if chunk_size is None and nchunks is None: + raise ValueError("Need to pass a chunk size or num. of chunks") + + if nchunks is None: + nchunks = interaction_data.binds.shape[0] // chunk_size + 1 + + while True: + yield _data_chunker(interaction_data, nchunks) + +def make_sparse(sequences, domain_length, peptide_length, n_amino_acids): + def _sparsify(arr, ncols): + nrows = arr.shape[0] + + ridxes, compressed_cidxes = np.where(arr >= 0) + cidxes = arr[ridxes, compressed_cidxes] + + vals = np.ones(ridxes.size) + + idxes = np.vstack([ridxes, cidxes]).T + + shape = [nrows, ncols] + + return tf.SparseTensorValue( + indices = idxes, + values = vals, + dense_shape = shape + ) + + col_sizes = [domain_length * n_amino_acids, peptide_length * n_amino_acids, domain_length * peptide_length * n_amino_acids * n_amino_acids] + return [_sparsify(m, ncols) for m, ncols in zip(sequences, col_sizes)] + diff --git a/train/train.py b/train/train.py new file mode 100644 index 0000000..dff4f50 --- /dev/null +++ b/train/train.py @@ -0,0 +1,201 @@ +""" +This file provides a CLI for running models with a variety of possible +options. + +Note: you can also import the models directly and run them. + +The easiest way to understand these config options is likely by running the +built-in help method, i.e.: + $ python /train.py --help +""" +import os +import csv +import argparse + +from collections import * +from itertools import * + +import models +from models import constants as constants + +def create_parser(): + parser = argparse.ArgumentParser(description="A wrapper to specify, train, and assess different models. " + + "Models are implemented primarily in the models/ directory and may be accessed as a package." + ) + + parser.add_argument("validation_chunk_idx", type=int, default=0, help="Validation chunk") + + model_spec_group = parser.add_argument_group(title="Model Specification") + + # Specificy model type + model_group = model_spec_group.add_mutually_exclusive_group(required=True) + model_group.add_argument("--hsm_id", action="store_true", default=False, + help="Train the HSM/ID model. Domain specification must specify a single domain." ) + model_group.add_argument("--hsm_d", action="store_true", + help="Train the HSM/D model. Domain specification may include one or multiple domains.") + model_spec_group.add_argument("-d", "--domains", nargs='+', type=str, + help="Specify a domain type for training a new model. To specify a specific domain - peptide type, pass with a ':' character separating types.") + model_spec_group.add_argument("-a", "--include_all_domains", action='store_true', default=False, + help="Include all possible domains for which data are available") + + hyperparam_group = parser.add_argument_group(title="Learning hyperparameters") + hyperparam_group.add_argument("-l", "--lambdap", type=float, default=constants.DEF_lambda, + help="Sparsity parametrization parameter. Default: {0}".format(str(constants.DEF_lambda))) + hyperparam_group.add_argument("--lambda_params", type=str, nargs='+', + help="Sparsity parametrization per-domain, passed as a string of format {domain}:{lambda} or {domain}:{peptide}:{lambda} where each choice" + + "is comma-separated. Default: {0}".format(str(constants.DEF_lambda))) + hyperparam_group.add_argument("-r", "--learning_rate", type=float, default=constants.DEF_learning_rate, + help="Add an initial learning rate. Default: {0}".format(str(constants.DEF_learning_rate))) + hyperparam_group.add_argument("-s", "--standard_deviation", type=float, default=constants.DEF_init_std, + help="Set the initialization standard dev. Default: {0}".format(str(constants.DEF_init_std))) + hyperparam_group.add_argument("-e", "--epochs", type=int, default=constants.DEF_epochs, + help="Max number of epochs to run the strategy for. Default: {0}".format(str(constants.DEF_epochs))) + hyperparam_group.add_argument("-v", "--validate_step", type=int, default=constants.DEF_validate_step, + help="The step number to run validation on. Default: {0}.".format(str(constants.DEF_validate_step))) + + data_handling_group = parser.add_argument_group(title="Data handling group") + data_handling_group.add_argument("--chunk_seed", type=int, default=0, + help="Seed used to synchronize chunks across a single split. Use the same seed to train on different chunks.") + data_handling_group.add_argument("--exclude_chunks", nargs="+", type=int, + help="Exclude specific chunks from training or testing. Helpful for excluding a test chunk") + data_handling_group.add_argument("--include_all_data", action='store_true', default=False, + help='Use all data in training. Used at end to get final predictions.') + data_handling_group.add_argument("-c", "--chunk_size", type=int, default=constants.DEF_chunk_size, + help="The maximum chunk size for any iteration. Actual chunk size is less than this (calculated as the " + + "amount of data divided by chunk size plus one. Default: {0}.".format(str(constants.DEF_chunk_size))) + data_handling_group.add_argument("--n_folds", type=int, default=constants.DEF_n_folds, + help="Number of chunks to split dataset into. Default: {0}".format(constants.DEF_n_folds)) + + # Input / output data processing. + input_output_group = parser.add_argument_group(title="Input/output group") + input_output_group.add_argument("-i", "--input_directory", type=str, default="../data/train/hsm_preprocessed/", + help="Input data directory. The form of this directory should be the same as output by convert_binding_data.py") + input_output_group.add_argument("-o", "--output_directory", type=str, default="outputs/", + help="Location to output data to.") + input_output_group.add_argument("--amino_acids_ordering", type=str, default="../data/amino_acid_ordering.txt", + help="Amino acid ordering file. Allows user to add new amino acids (e.g. phospho-ser/thr).") + + # Model specific arguments: + basis_spec_group = parser.add_argument_group(title="Basis Specification") + basis_spec_group.add_argument("-b", "--basis_size", type=int, default=constants.DEF_basis_size, + help="Set the hierarchical/shared-basis size. Default: 100".format(str(constants.DEF_basis_size))) + basis_spec_group.add_argument("--load_basis", type=str, + help="Run the model w/ a pre-defined basis. Basis is passed as a numpy file.") + basis_spec_group.add_argument("--no_train", action="store_true", default=False, + help="Makes the basis set non-trainable for a given initialization.") + + return parser + +def format_arguments_directory(options): + arguments = dict() + + arguments[constants.KEY_learning_rate] = options.learning_rate + arguments[constants.KEY_standard_dev] = options.standard_deviation + arguments[constants.KEY_epochs] = options.epochs + arguments[constants.KEY_validate_step] = options.validate_step + + arguments[constants.KEY_chunk_seed] = options.chunk_seed + arguments[constants.KEY_exclude_chunks] = options.exclude_chunks + arguments[constants.KEY_include_all_chunks] = options.include_all_data + arguments[constants.KEY_chunk_size] = options.chunk_size + + arguments[constants.KEY_basis_size] = options.basis_size + arguments[constants.KEY_input_basis] = options.load_basis + arguments[constants.KEY_train_basis] = options.no_train + + arguments[constants.KEY_output_directory] = options.output_directory + arguments[constants.KEY_n_folds] = options.n_folds + + + arguments = {k:v for k,v in arguments.items() if v is not None} + + return arguments + + +def get_model(args, opts): + def _match_model(dinput, model_spec_dir): + if ":" in dinput: + dtype, ptype = dinput.split(":") + else: + dtype = dinput + ptype = None + + assert dtype in possible_models + + if ptype is None: + assert len(possible_models[dtype]) == 1 + return next(iter(possible_models[dtype].values())) + else: + assert ptype in possible_models[dtype] + return possible_models[dtype][ptype] + + model_formats_ifile = os.path.join(opts.input_directory, constants.model_specification_file) + + all_models = list() + possible_models = defaultdict(dict) + for dtype, ptype, dlen, plen, idir in csv.reader(open(model_formats_ifile, 'r'), delimiter=','): + mformat = constants.ModelSpecificationTuple(dtype, ptype, int(dlen), int(plen), idir) + possible_models[dtype][ptype] = mformat + all_models.append(mformat) + + def _process_lambdas(lambda_param, lambda_params, possible_models=possible_models): + lambda_dirs = defaultdict(dict) + if lambda_params is None: + for dtype, model_types in possible_models.items(): + lambda_dirs[dtype] = {ptype:lambda_param for ptype in possible_models[dtype].keys()} + else: + for arg in lambda_params: + spl = arg.split(":") + dtype, lp = spl[0], float(spl[-1]) + + if len(spl) == 2: + lambda_dirs[dtype] = {ptype:lp for ptype in possible_models[dtype].keys()} + elif len(spl) == 3: + ptype = spl[1] + lambda_dirs[dtype][ptype] = lp + else: + raise ValueError("Lambdas mis-specified") + + for dtype, model_types in possible_models.items(): + for ptype in model_types.keys(): + if ptype not in lambda_dirs[dtype]: + lambda_dirs[dtype][ptype] = constants.DEF_lambda + + return lambda_dirs + + lambda_params_dct = _process_lambdas(options.lambdap, options.lambda_params) + + if opts.hsm_id: + assert len(opts.domains) == 1 + + model_formats = _match_model(opts.domains[0], possible_models) + model_type = models.hsm_id.HSMIndependentDomainsModel + input_directories = os.path.join(opts.input_directory, model_formats.directory) + elif len(opts.domains) == 1: + model_formats = _match_model(opts.domains[0], possible_models) + model_type = models.hsm_d_singledomain.HSMSingleDomainsModel + input_directories = os.path.join(opts.input_directory, model_formats.directory) + elif opts.include_all_domains: + model_formats = model_formats_ifile + model_type = models.hsm_d.HSMDomainsModel + input_directories = opts.input_directory + else: + model_formats = [_match_model(d, possible_models) for d in opts.domains] + model_type = models.hsm_d.HSMDomainsModel + input_directories = opts.input_directory + + amino_acid_ordering = {aa.strip():idx for idx, aa in enumerate(open(opts.amino_acids_ordering, 'r'))} + + model = model_type(opts.validation_chunk_idx, model_formats, lambda_params_dct, amino_acid_ordering, args) + return model, input_directories + +if __name__=='__main__': + parser = create_parser() + options = parser.parse_args() + assert os.path.exists(options.input_directory) and os.path.exists(options.output_directory) and os.path.exists(options.amino_acids_ordering) + + arguments = format_arguments_directory(options) + + model, idirs = get_model(arguments, options) + model.train(idirs) + model.save_model(options.output_directory)