From e2a505483f225016bb655091e53ec87c21cde20a Mon Sep 17 00:00:00 2001
From: Joe
Date: Mon, 6 Jan 2020 00:35:36 -0500
Subject: [PATCH] Adding in train/ directory. Codebase for training /
 re-training models.

---
 train/README.md                    |  66 ++++++
 train/convert_binding_data.py      | 141 ++++++++++++
 train/models/__init__.py           |   3 +
 train/models/constants.py          |  52 +++++
 train/models/hsm_d.py              | 330 +++++++++++++++++++++++++++++
 train/models/hsm_d_singledomain.py | 249 ++++++++++++++++++++++
 train/models/hsm_id.py             | 229 ++++++++++++++++++++
 train/models/utils.py              |  83 ++++++++
 train/train.py                     | 201 ++++++++++++++++++
 9 files changed, 1354 insertions(+)
 create mode 100644 train/README.md
 create mode 100644 train/convert_binding_data.py
 create mode 100644 train/models/__init__.py
 create mode 100644 train/models/constants.py
 create mode 100644 train/models/hsm_d.py
 create mode 100644 train/models/hsm_d_singledomain.py
 create mode 100644 train/models/hsm_id.py
 create mode 100644 train/models/utils.py
 create mode 100644 train/train.py

diff --git a/train/README.md b/train/README.md
new file mode 100644
index 0000000..e60ebe1
--- /dev/null
+++ b/train/README.md
@@ -0,0 +1,66 @@
+# Description
+
+This directory may be used to train / re-train new models using the HSM framework.
+
+# Usage
+
+Code used for training (or re-training) models is located in the `train/` directory of this repository. The package should primarily be accessed via the script `train.py`.
+
+```bash
+python train.py [OPTIONS]
+```
+Additional options for `train.py` may be listed using the `-h/--help` flag.
+
+The basic steps for training a new model are:
+
+0. Pre-process domain-peptide interaction data.
+
+By default, the training code assumes that pre-processed data are located at `data/train/`, which can be downloaded (see [Data section](#data)). New data must be passed explicitly to the code (see the next section).
+
+A script for converting domain-peptide interaction data into the appropriate format for use with the model is available at `convert_binding_data.py`. The input format for this script is a headerless CSV file with the columns:
+```
+Domain-Type,Aligned-Domain-Sequence,Peptide-Type,Aligned-Peptidic-Sequence,Bind
+```
+An example of the input file type is included with the downloaded data (`domain_peptide_train.csv`). Domain and peptide protein identifiers are typically UniProtKB IDs; however, this is not required. Domain type refers to the class of binding domain (e.g. SH2 domains); the `Bind` column is treated as binary (any value greater than 0 denotes a positive interaction).
+
+To process the data, run the command:
+```bash
+python convert_binding_data.py [INPUT DATA FILE] [OUTPUT DATA DIRECTORY]
+```
+The processed data are written to `OUTPUT DATA DIRECTORY`, split into sub-directories by domain type. Each sub-directory contains the vectorized data as NumPy (`.npy`) record files. Additional options are detailed using the `-h/--help` flag.
+
+1. Train new models
+
+Typically, models should be trained using the command:
+
+```bash
+python train.py [VALIDATION INDEX] (-d [DOMAIN ...] | -a) (--hsm_id | --hsm_d)
+```
+
+The `VALIDATION INDEX` denotes the data chunk that is excluded from the training process and used for validation. The next argument, `-d [DOMAIN ...] | -a`, identifies the domains used in training the model: `-d` (or `--domains`) specifies one or a subset of domains (a specific domain-peptide pair may be passed as `DOMAIN:PEPTIDE`), and `-a` (or `--include_all_domains`) uses all domains for which data are available. The final argument, `(--hsm_id | --hsm_d)`, specifies the model type: `--hsm_id` trains an HSM/ID model (requires a single domain), while `--hsm_d` trains an HSM/D model over one or more domains; passing a single domain with `--hsm_d` trains the shared-basis, single-domain HSM/D variant.
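+
+For example, to train an HSM/D model over all available domain types while holding out chunk 0 (this invocation assumes the default data paths):
+
+```bash
+python train.py 0 -a --hsm_d
+```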
+
+Additional command-line options facilitate model training / optimization (*e.g.* regularization parameters) and are detailed with the help command.
+
+2. Predict and assess performance
+
+Data from the training process are typically output to `train/outputs/`. Processing the output directory can be accomplished using the `assess_performance.py` script:
+
+```bash
+python assess_performance.py [INPUT DIRECTORY]
+```
+where `INPUT DIRECTORY` denotes the path to the previously output directory. To keep training runs separate, it can be helpful to re-direct outputs using the `--output_directory` option when running `train.py`.
+
+3. Finalize model
+
+To train a final, combined model (*i.e.* using all training data), add the `--include_all_data` flag:
+
+```bash
+python train.py [TEST INDEX] --include_all_data
+```
+
+Models (for use with the `predict` code) may be output using the `output_models.py` script:
+```bash
+python output_models.py [RESULTS FILE] [OUTPUT DIRECTORY]
+```
+where `RESULTS FILE` denotes a file output by the `train.py` script and `OUTPUT DIRECTORY` the directory to place the processed models into (each domain type occupies one model file).

diff --git a/train/convert_binding_data.py b/train/convert_binding_data.py
new file mode 100644
index 0000000..8111e3b
--- /dev/null
+++ b/train/convert_binding_data.py
@@ -0,0 +1,141 @@
+import os
+import h5py, csv
+from collections import *
+from itertools import *
+from tqdm import tqdm as tqdm
+
+import numpy as np
+
+def _vectorize_sequence(sequence, amino_acid_ordering):
+    """
+    Computes a one-hot embedding of an input sequence.
+
+    Returns:
+        - list. Non-zero indices of the one-hot embedding matrix of a sequence.
+          Non-flattened, this matrix has dimensions:
+          (sequence length, # of amino acids represented)
+    """
+    aa_len = len(amino_acid_ordering)
+
+    vectorized = list()
+
+    sequence_indexed = [(sidx, saa) for sidx, saa in enumerate(sequence) if saa in amino_acid_ordering]
+    for sidx, saa in sequence_indexed:
+        idxed = sidx * aa_len + amino_acid_ordering[saa]
+
+        vectorized.append(idxed)
+
+    # Pad with -1 so all vectorized sequences have equal length.
+    npad = len(sequence) - len(vectorized)
+    vectorized.extend(-1 for _ in range(npad))
+
+    return vectorized
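+
+# A quick sanity check of the embedding (values assume the illustrative ordering {'A': 0, 'C': 1}):
+#   _vectorize_sequence("CA", {'A': 0, 'C': 1}) == [1, 2]
+# since position 0 holds 'C' -> 0 * 2 + 1 = 1, and position 1 holds 'A' -> 1 * 2 + 0 = 2.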
+
+def _vectorize_interaction(domain_sequence, peptide_sequence, amino_acid_ordering):
+    """
+    Computes a one-hot embedding of the interaction between the domain and peptidic sequences.
+
+    Returns:
+        - list. Non-zero indices for the interaction between domain and peptide sequences.
+          Non-flattened, this matrix has dimensions:
+          (domain sequence length, peptide sequence length, # of amino acids represented, # of amino acids represented)
+    """
+    aa_len = len(amino_acid_ordering)
+    domain_idx_offset = len(peptide_sequence) * aa_len * aa_len
+    peptide_idx_offset = aa_len * aa_len
+
+    vectorized = list()
+
+    domain_indexed = [(didx, daa) for didx, daa in enumerate(domain_sequence) if daa in amino_acid_ordering]
+    peptide_indexed = [(pidx, paa) for pidx, paa in enumerate(peptide_sequence) if paa in amino_acid_ordering]
+
+    for (didx, daa), (pidx, paa) in product(domain_indexed, peptide_indexed):
+        idxed = didx * domain_idx_offset + pidx * peptide_idx_offset + amino_acid_ordering[daa] * aa_len + amino_acid_ordering[paa]
+        vectorized.append(idxed)
+
+    # Pad with -1 so all vectorized interactions have equal length.
+    npad = len(domain_sequence) * len(peptide_sequence) - len(vectorized)
+    vectorized.extend(-1 for _ in range(npad))
+
+    return vectorized
+
+def _load_binding_data(input_data_file, amino_acid_ordering, progressbar=False):
+    """
+    Loads binding data. The input data format should be a (headerless) csv file with rows of the form:
+        Domain-Type,Aligned-Domain-Sequence,Peptide-Type,Aligned-Peptidic-Sequence,Bind
+
+    Yields data grouped by model type. Model types are automatically inferred from the input data file.
+    """
+    get_model_type = lambda row: (row[0], row[2], len(row[1]), len(row[3]))
+
+    model_types, total = set(), 0
+    for row in csv.reader(open(input_data_file, 'r')):
+        total += 1
+        model_type = get_model_type(row)
+
+        model_types.add(model_type)
+
+    for model_type in tqdm(model_types, disable=(not progressbar), desc="Model types"):
+        binds = list()
+        domain_seqs, peptide_seqs, interact_seqs = list(), list(), list()
+
+        for row in tqdm(csv.reader(open(input_data_file, 'r')), disable=(not progressbar), desc="Data processing", total=total):
+            if get_model_type(row) != model_type: continue
+
+            b = 1 if float(row[4]) > 0 else 0
+            binds.append(b)
+
+            domain_seqs.append(_vectorize_sequence(row[1], amino_acid_ordering))
+            peptide_seqs.append(_vectorize_sequence(row[3], amino_acid_ordering))
+            interact_seqs.append(_vectorize_interaction(row[1], row[3], amino_acid_ordering))
+
+        binds = np.array(binds)
+        vectorized = [np.array(a) for a in [domain_seqs, peptide_seqs, interact_seqs]]
+
+        yield model_type, vectorized, binds
+
+def convert_binding_data(input_data_file, output_data_directory, amino_acid_ordering, progressbar=False):
+    """
+    Converts binding data into the directory layout expected by the training code. Mostly wraps the helper functions above.
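+
+    Example (paths illustrative; mirrors the __main__ block below):
+        aa_order = {aa.strip(): idx for idx, aa in enumerate(open("../data/amino_acid_ordering.txt", 'r'))}
+        convert_binding_data("domain_peptide_train.csv", "processed/", aa_order)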
+ """ + + assert os.path.exists(output_data_directory) + + model_fmt = list() + + processed_binding_data = defaultdict(list) + for model_type, seqs_vectorized, binds in _load_binding_data(input_data_file, amino_acid_ordering, progressbar=progressbar): + model_odirname = "{0}_{1}".format(*model_type) + model_odirpath = os.path.join(output_data_directory,model_odirname) + os.mkdir(model_odirpath) + + model_fmt.append((*model_type, model_odirname)) + + np.save(os.path.join(model_odirpath, 'binding.npy'), np.array(list(binds))) + + output_files = ["dseq_mtx.npy", "pseq_mtx.npy", "iseqs_mtx.npy"] + for seqs, ofname in zip(seqs_vectorized, output_files): + np.save(os.path.join(model_odirpath, ofname), seqs) + + with open(os.path.join(output_data_directory, 'amino_acid_ordering.txt'), 'w+') as f: + amino_acids_list = [aa for aa, idx in sorted(amino_acid_ordering.items(), key=lambda x: x[1])] + f.write('\n'.join(amino_acids_list)) + + with open(os.path.join(output_data_directory, 'models_specification.csv'), 'w+') as f: + writer = csv.writer(f, delimiter=',') + writer.writerows(model_fmt) + +if __name__=='__main__': + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("input_data_file", type=str) + parser.add_argument("output_data_directory", type=str) + parser.add_argument("-a", "--amino_acid_ordering", type=str, default="../data/amino_acid_ordering.txt") + parser.add_argument("-p", "--progressbar", action='store_true', default=False) + opts = parser.parse_args() + + aa_order = {aa.strip():idx for idx, aa in enumerate(open(opts.amino_acid_ordering, 'r'))} + convert_binding_data(opts.input_data_file, opts.output_data_directory, aa_order, progressbar=opts.progressbar) diff --git a/train/models/__init__.py b/train/models/__init__.py new file mode 100644 index 0000000..c4a0300 --- /dev/null +++ b/train/models/__init__.py @@ -0,0 +1,3 @@ +from . import hsm_id +from . import hsm_d_singledomain +from . 
import hsm_d diff --git a/train/models/constants.py b/train/models/constants.py new file mode 100644 index 0000000..680432e --- /dev/null +++ b/train/models/constants.py @@ -0,0 +1,52 @@ + + +""" +Hyper-parameters for model training +""" +DEF_lambda = 1e-5 +DEF_learning_rate = 1e-4 +DEF_init_std = 1e-2 +DEF_epochs = 100 +DEF_validate_step = 10 +DEF_chunk_size = 512 + +DEF_basis_size = 100 + +DEF_fold_seed = 0 +DEF_n_folds = 8 + +KEY_validation_chunk = "Validation Chunk Index" + +KEY_learning_rate = "Learning Rate" +KEY_lambdas = "Lambda Params" +KEY_standard_dev = "Standard Deviation" +KEY_epochs = "Epochs" +KEY_validate_step = "Validate Step" + +KEY_chunk_seed = "Chunking Seed" +KEY_exclude_chunks = "Exclude Chunks" +KEY_include_all_chunks = "Include All Chunks" +KEY_chunk_size = "Chunk Size" +KEY_n_folds = "Number of folds" + +KEY_basis_size = "Basis Size" +KEY_input_basis = "Input Basis Filepath" +KEY_train_basis = "Train Basis" + +KEY_output_directory = "Output Directory" + +binding_file = "binding.npy" +domain_file = "dseq_mtx.npy" +peptide_file = "pseq_mtx.npy" +interaction_file = "iseqs_mtx.npy" + +model_specification_file = "models_specification.csv" + +from collections import namedtuple +ModelSpecificationTuple = namedtuple("ModelSpecificationTuple", [ + "domain_type", + "peptide_type", + "domain_length", + "peptide_length", + "directory" + ]) diff --git a/train/models/hsm_d.py b/train/models/hsm_d.py new file mode 100644 index 0000000..2d18044 --- /dev/null +++ b/train/models/hsm_d.py @@ -0,0 +1,330 @@ +import os +import json +from collections import * +from itertools import * +from tqdm import tqdm as tqdm + +import numpy as np +import tensorflow as tf +from sklearn.metrics import roc_auc_score + +from . import utils +from . import constants + +class HSMDomainsModel(object): + """ + Multi-domain model implementing the HSM / D model. + + Most arguments are passed into the model using the args initialization option. See constants.py for the + function for the keywords to use to set different options values. + + The required arguments are: + - validation_chunk: an index that is validated against. + - model_format: a ModelSpecificationTuple, as defined in constants.py + - lambda_params: a nested python dictionary; nested for domain type then peptide type. + - amino_acid_ordering: a dict that indexes the vectorized data. + - args: all additional arguments; see constants.py + """ + + def __init__(self, validation_chunk, models_format, lambda_params, amino_acid_ordering, args=dict()): + ## model_format inputs are namedtuples: + # ModelSpecificationTuple = namedtuple("ModelSpecificationTuple", ["domain_type", "peptide_type", "domain_length", "peptide_length", "directory"]) + self.model_specs = dict() + for model_format in models_format: + self.model_specs[(model_format.domain_type, model_format.peptide_type)] = model_format + self.n_amino_acids = len(amino_acid_ordering) + + # Set model hyper-parameters. 
Defaults to the values in constants.py when not specified in args.
+        self.epochs = args.get(constants.KEY_epochs, constants.DEF_epochs)
+        self.data_split_seed = args.get(constants.KEY_chunk_seed, constants.DEF_fold_seed)
+        self.n_folds = args.get(constants.KEY_n_folds, constants.DEF_n_folds)
+        self.chunk_size = args.get(constants.KEY_chunk_size, constants.DEF_chunk_size)
+        self.learning_rate = args.get(constants.KEY_learning_rate, constants.DEF_learning_rate)
+        self.init_std = args.get(constants.KEY_standard_dev, constants.DEF_init_std)
+
+        self.lambda_params = dict()
+        for dtype, _lambdas in lambda_params.items():
+            for ptype, lambdap in _lambdas.items():
+                self.lambda_params[(dtype, ptype)] = lambdap
+
+        self.validation_chunk = validation_chunk
+        self.validate_step = args.get(constants.KEY_validate_step, constants.DEF_validate_step)
+        self.exclude_indices = args.get(constants.KEY_exclude_chunks, None)
+        self.include_all_data = args.get(constants.KEY_include_all_chunks, False)
+
+        # Basis set size specification.
+        self.basis_size = args.get(constants.KEY_basis_size, constants.DEF_basis_size)
+        self.input_basis = args.get(constants.KEY_input_basis, None)
+        self.train_basis = args.get(constants.KEY_train_basis, False)
+
+        # Initialize tensorflow graph.
+        self.sess = tf.Session()
+        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
+
+        self._variables(self.init_std)
+
+        self._model(args)
+
+        init_op = tf.group(tf.local_variables_initializer(), tf.global_variables_initializer())
+        self.sess.run(init_op)
+
+
+    def _variables(self, initialization_std):
+        """
+        Initializes model variables. Shouldn't be called outside of the class (called from __init__).
+        """
+        # Define the basis weights first, since they are shared across all domain models.
+        basis_weights_shape = (self.basis_size, self.n_amino_acids ** 2)
+        if self.input_basis is not None:
+            # Load basis weights from a pre-computed file.
+            basis = np.load(self.input_basis)
+            self.basis_weights = tf.Variable(basis, name='basis_weights', dtype=tf.float64, trainable=self.train_basis)
+        else:
+            # Randomly initialize basis weights.
+            self.basis_weights = tf.Variable(np.random.normal(scale=initialization_std, size=basis_weights_shape),
+                    name="basis_weights", dtype=tf.float64, trainable=self.train_basis)
+
+        PlaceholdersTuple = namedtuple("PlaceholdersTuple", ["domains", "peptides", "interacts", "binds"])
+        WeightsTuple = namedtuple("WeightsTuple", ["domains", "peptides", "interacts", "bias"])
+
+        # Placeholders and variables for the different domain models.
+        self.placeholders, self.weights = dict(), dict()
+        for (dtype, ptype), model_fmt in self.model_specs.items():
+            self.placeholders[(dtype, ptype)] = PlaceholdersTuple(
+                    domains = tf.sparse.placeholder(dtype=tf.float64, name="domain_data_mtx_{0}_{1}".format(dtype, ptype)),
+                    peptides = tf.sparse.placeholder(dtype=tf.float64, name="peptide_data_mtx_{0}_{1}".format(dtype, ptype)),
+                    interacts = tf.sparse.placeholder(dtype=tf.float64, name="interaction_data_mtx_{0}_{1}".format(dtype, ptype)),
+                    binds = tf.placeholder(dtype=tf.float64, name="binding_data_mtx_{0}_{1}".format(dtype, ptype))
+                )
+
+            dlen, plen = model_fmt.domain_length, model_fmt.peptide_length
+
+            domain_weights_shape = (dlen * self.n_amino_acids, 1)
+            dw = tf.Variable(np.random.normal(scale=initialization_std, size=domain_weights_shape),
+                    name="domain_weights_{0}_{1}".format(dtype, ptype), dtype=tf.float64)
+
+            peptide_weights_shape = (plen * self.n_amino_acids, 1)
+            pw = tf.Variable(np.random.normal(scale=initialization_std, size=peptide_weights_shape),
name="peptide_weights_{0}_{1}".format(dtype, ptype), dtype=tf.float64) + + interact_weights_shape = (dlen * plen, self.basis_size) + iw = tf.Variable(np.random.normal(scale=initialization_std, size=interact_weights_shape), + name="interact_weights_{0}_{1}".format(dtype, ptype), dtype=tf.float64) + + bias = tf.Variable(0, name="bias_{0}_{1}".format(dtype, ptype), dtype=tf.float64) + + self.weights[(dtype, ptype)] = WeightsTuple( + domains=dw, + peptides=pw, + interacts=iw, + bias=bias + ) + + def _model(self, args): + """ + Defines model graph. Shouldn't be called outside of class (called from __init__). + """ + # Different values can be accessed with these dictionaries throughout the model. + self.sparsity_penalties = dict() + self.logits, self.cross_entropies = dict(), dict() + self.predict_ops = dict() + + for (dtype, ptype), model_fmt in self.model_specs.items(): + # Load placeholders, weights, and lambda for a specific model + # PlaceholdersTuple = namedtuple("PlaceholdersTuple", ["domains", "peptides", "interacts", "binds"]) + # WeightsTuple = namedtuple("WeightsTuple", ["domains", "peptides", "interacts", "bias"]) + + + placeholders = self.placeholders[(dtype, ptype)] + weights = self.weights[(dtype, ptype)] + lambdap = self.lambda_params[(dtype, ptype)] + + # Scoping it for each domain-peptide model. Helpful to use w/ tensorboard. + with tf.name_scope("compute_{0}_{1}".format(dtype, ptype)): + # Sparsity Penalty. Combined across domains below. + with tf.name_scope("sparsity"): + pen = (lambdap * tf.reduce_sum(tf.abs(weights.domains)) + + lambdap * tf.reduce_sum(tf.abs(weights.peptides)) + + lambdap * tf.reduce_sum(tf.abs(weights.interacts))) + self.sparsity_penalties[(dtype, ptype)] = pen + + # Logit computation. + with tf.name_scope("logits"): + # Basic model, computes logits from model weights. + domain_contrib = tf.sparse.sparse_dense_matmul(placeholders.domains, weights.domains) + peptide_contrib = tf.sparse.sparse_dense_matmul(placeholders.peptides, weights.peptides) + + _interact_weights = tf.expand_dims(tf.reshape(tf.matmul(weights.interacts, self.basis_weights), [-1]), -1) + interact_contrib = tf.sparse.sparse_dense_matmul(placeholders.interacts, _interact_weights) + + _logits = domain_contrib + peptide_contrib + interact_contrib + weights.bias + self.logits[(dtype, ptype)] = _logits + + # Cross entropy per-domain/peptide. Combined across domains below. + self.cross_entropies[(dtype, ptype)] = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=placeholders.binds, logits=_logits, name="cross_entropy")) + + # Predict ops for each domain. + self.predict_ops[(dtype, ptype)] = tf.math.sigmoid(_logits, name="predict") + + # Loss Function computation and optimization. + self.weight_penalty = tf.reduce_sum([v for v in self.sparsity_penalties.values()]) + self.cross_entropy = tf.reduce_sum([v for v in self.cross_entropies.values()]) + self.loss_function = self.cross_entropy + self.weight_penalty + + self.train_op = self.optimizer.minimize(self.loss_function) + + def optimize_step(self, data_iter_dct, nsteps): + """ + A single training epoch. 
Passes in an iterator over data and + + args: + - data_iter_dct: a set of data iterators feeding in data of the form (sequence_tuple, binds_arr) + sequence_tuple is ordered[domain sequences, peptide sequences, interact sequences] + """ + costs = list() + + for step in range(nsteps): + feed_dict = dict() + + for (dtype, ptype), model_format in self.model_specs.items(): + # PlaceholdersTuple = namedtuple("PlaceholdersTuple", ["domains", "peptides", "interacts", "binds"]) + placeholders = self.placeholders[(dtype, ptype)] + + # Convert to sparse tensors + sequences, binds = next(data_iter_dct[(dtype, ptype)]) + sparse_seqs = utils.make_sparse(sequences, model_format.domain_length, model_format.peptide_length, self.n_amino_acids) + + # _p is the placeholder matched with the value, _v + for _p, _v in zip(placeholders, [*sparse_seqs, np.expand_dims(binds, -1)]): + feed_dict[_p] = _v + + train_cost, _ = self.sess.run([self.loss_function, self.train_op], feed_dict=feed_dict) + costs.append(train_cost) + + return costs + + def predict(self, data_dct): + """ + Make a single prediction. + + args: + - data_dct: a data tuple of the form InteractionDataTuple as defined in utils.py + """ + predictions_dct = dict() + + # This can be done in one call. Splitting it up for reader ease. + for (dtype, ptype), data in data_dct.items(): + binds, sequences = data[0], list(data[1:]) + + # Convert to sparse tensors. + model_fmt = self.model_specs[(dtype, ptype)] + dseqs_sp, pseqs_sp, iseqs_sp = utils.make_sparse(sequences, model_fmt.domain_length, model_fmt.peptide_length, self.n_amino_acids) + + placeholder = self.placeholders[(dtype, ptype)] + + feed_dict = { + placeholder.domains: dseqs_sp, + placeholder.peptides: pseqs_sp, + placeholder.interacts: iseqs_sp, + placeholder.binds: np.expand_dims(binds, -1) + } + + # Get domain-specific predict_op and predict. + predict_op = self.predict_ops[(dtype, ptype)] + predictions = self.sess.run(predict_op, feed_dict=feed_dict) + roc_auc = roc_auc_score(binds, predictions) + + predictions_dct[(dtype, ptype)] = (binds, predictions, roc_auc) + + return predictions_dct + + def train(self, data_directory): + """ + Fully train the model given the input parameters. Runs for the input / default number of epochs. + Mostly just wraps the optimize_step function. + + args: + - data_directory: the input data directory. The data is split using the input metadata. + """ + + # Initialize train and test data splits. + train_data_dct, test_data_dct = dict(), dict() + for (dtype, ptype), model_fmt in self.model_specs.items(): + # model_fmt is a ModelSpecificationTuple = namedtuple("ModelSpecificationTuple", ["domain_type", "peptide_type", "domain_length", "peptide_length", "directory"]) + domain_data_directory = os.path.join(data_directory, model_fmt.directory) + train_data, test_data = utils.split_data(domain_data_directory, self.validation_chunk, include_all=self.include_all_data, excluded_chunks=self.exclude_indices, n_folds=self.n_folds, seed=self.data_split_seed) + + train_data_dct[(dtype, ptype)] = train_data + test_data_dct[(dtype, ptype)] = test_data + + # Variable input sizes. Calculate the maximum number of chunks needed for any one domain + # given the input max chunk size. + n_chunks = max((v[0].shape[0] // self.chunk_size) + 1 for v in train_data_dct.values()) + + # Initial data iterators. 
+        data_iterators_dct = dict()
+        for (dtype, ptype), train_data in train_data_dct.items():
+            data_iterators_dct[(dtype, ptype)] = utils.training_iterator(train_data, nchunks=n_chunks)
+
+        self.costs = list()
+        for epoch in tqdm(range(self.epochs), desc="Epochs"):
+            epoch_iters_dct = {k:next(v) for k,v in data_iterators_dct.items()}
+            epoch_costs = self.optimize_step(epoch_iters_dct, n_chunks)
+            self.costs.append(epoch_costs)
+
+            if (epoch + 1) % self.validate_step == 0:
+                _perf = self.predict(test_data_dct)
+                for (dtype, ptype), (_, _, auc) in _perf.items():
+                    print("AUC: {0} (Epoch: {1}; Domain: {2}, Peptide: {3})".format(auc, epoch + 1, dtype, ptype))
+
+        self.final_predictions = self.predict(test_data_dct)
+
+    def save_model(self, output_directory):
+        """
+        Save model outputs from a trained model. The output files are of the form: {id}.{spec}.{ext}.
+        The id is defined as the current time (as a string). The spec defines the output, either metadata
+        or the domain type of the output. The ext is the file extension, either a numpy archive or a json file.
+
+        args:
+            - output_directory: a string defining a directory to output the saved model to.
+        """
+        import time
+
+        model_output_id = str(time.time()).replace("-", "_")
+
+        # Metadata. Saved as a dict (to a JSON file) with four keys defining the model type,
+        # model formats, the results of the last prediction, and parameters.
+        results = list()
+        for (dtype, ptype), preds in self.final_predictions.items():
+            _r = {"binds": list(map(int, preds[0])),
+                  "predictions": list(map(float, preds[1])),
+                  "auc": float(preds[2])
+                }
+
+            results.append([[dtype, ptype], _r])
+        output = {
+                "Model Type": "HSM / D",
+                "Model Formats": [list(fmt) for fmt in self.model_specs.values()],
+                "results": results,
+                "parameters": {
+                    "costs": self.costs,
+                    "validation chunk": self.validation_chunk,
+                    constants.KEY_chunk_seed: self.data_split_seed,
+                    constants.KEY_n_folds: self.n_folds,
+                    constants.KEY_epochs: self.epochs,
+                    constants.KEY_learning_rate: self.learning_rate
+                    }
+                }
+
+        ometadata_file = os.path.join(output_directory, "{0}.metadata.json".format(model_output_id))
+        with open(ometadata_file, 'w+') as f:
+            json.dump(output, f)
+
+        # The model, saved as an array archive (.npz) file, one per domain-peptide model.
+        for (dtype, ptype), weights in self.weights.items():
+            model_file = os.path.join(output_directory, "{0}.{1}.{2}.npz".format(model_output_id, dtype, ptype))
+
+            # WeightsTuple = namedtuple("WeightsTuple", ["domains", "peptides", "interacts", "bias"])
+            arr = [weights.domains, weights.peptides, weights.interacts, self.basis_weights, weights.bias]
+            dw, pw, iw, basis, bias = self.sess.run(arr)
+            np.savez(model_file, domain_weights=dw, peptide_weights=pw, interact_weights=iw, basis_weights=basis, bias=bias)
diff --git a/train/models/hsm_d_singledomain.py b/train/models/hsm_d_singledomain.py
new file mode 100644
index 0000000..00b1985
--- /dev/null
+++ b/train/models/hsm_d_singledomain.py
@@ -0,0 +1,249 @@
+import os
+import json
+from collections import *
+from itertools import *
+from tqdm import tqdm as tqdm
+
+import numpy as np
+import tensorflow as tf
+from sklearn.metrics import roc_auc_score
+
+from . import utils
+from . import constants
+
+class HSMSingleDomainsModel(object):
+    """
+    Single-domain model implementing the HSM / D model. Although not a focus of the paper, this is
+    simply the HSM/D model applied to a single domain type.
+
+    Most arguments are passed into the model using the args initialization option. See constants.py
+    for the keywords used to set the different option values.
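+
+    A minimal usage sketch (arguments illustrative; train.py shows how they are constructed):
+        model = HSMSingleDomainsModel(0, model_format, lambda_params, aa_order)
+        model.train(domain_data_directory)
+        model.save_model(output_directory)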
+ + The required arguments are: + - validation_chunk: an index that is validated against. + - model_format: a ModelSpecificationTuple, as defined in constants.py + - lambda_params: a nested python dictionary; nested for domain type then peptide type. + - amino_acid_ordering: a dict that indexes the vectorized data. + - args: all additional arguments; see constants.py + """ + + def __init__(self, validation_chunk, model_format, lambda_params, amino_acid_ordering, args=dict()): + ## model_format inputs are namedtuples: + # ModelSpecificationTuple = namedtuple("ModelSpecificationTuple", ["domain_type", "peptide_type", "domain_length", "peptide_length", "directory"]) + self.model_spec = model_format + self.domain_length = model_format.domain_length + self.peptide_length = model_format.peptide_length + self.n_amino_acids = len(amino_acid_ordering) + + # Set model hyper-parameters. Defaults to constants in the case where + self.epochs = args.get(constants.KEY_epochs, constants.DEF_epochs) + self.data_split_seed = args.get(constants.KEY_chunk_seed, constants.DEF_fold_seed) + self.n_folds = args.get(constants.KEY_n_folds, constants.DEF_n_folds) + self.chunk_size = args.get(constants.KEY_chunk_size, constants.DEF_chunk_size) + self.learning_rate = args.get(constants.KEY_learning_rate, constants.DEF_learning_rate) + self.init_std = args.get(constants.KEY_standard_dev, constants.DEF_init_std) + self.lambda_param = lambda_params[model_format.domain_type][model_format.peptide_type] + + self.validation_chunk = validation_chunk + self.validate_step = args.get(constants.KEY_validate_step, constants.DEF_validate_step) + self.exclude_indices = args.get(constants.KEY_exclude_chunks, None) + self.include_all_data = args.get(constants.KEY_include_all_chunks, False) + + # Basis set size specification. + self.basis_size = args.get(constants.KEY_basis_size, constants.DEF_basis_size) + self.input_basis = args.get(constants.KEY_input_basis, None) + self.train_basis = args.get(constants.KEY_train_basis, False) + + # Initialize tensorflow graph. + self.sess = tf.Session() + self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate) + + self._variables(self.init_std) + + self._model(args) + + init_op = tf.group(tf.local_variables_initializer(), tf.global_variables_initializer()) + self.sess.run(init_op) + + + def _variables(self, initialization_std): + """ + Intializes variable. Shouldn't be called outside of the class (called from __init__). + """ + # Placeholders for different input data. + self.domain_data = tf.sparse.placeholder(dtype=tf.float64, name="domain_data_mtx") + self.peptide_data = tf.sparse.placeholder(dtype=tf.float64, name="peptide_data_mtx") + self.interact_data = tf.sparse.placeholder(dtype=tf.float64, name="interaction_data_mtx") + self.binding_data = tf.placeholder(dtype=tf.float64, name="binding_data_mtx") + + # Weights variables to be trained in the model. 
+ domain_weights_shape = (self.domain_length * self.n_amino_acids, 1) + self.domain_weights = tf.Variable(np.random.normal(scale=initialization_std, size=domain_weights_shape), + name="domain_weights", dtype=tf.float64) + + peptide_weights_shape = (self.peptide_length * self.n_amino_acids, 1) + self.peptide_weights = tf.Variable(np.random.normal(scale=initialization_std, size=peptide_weights_shape), + name="peptide_weights", dtype=tf.float64) + + interact_weights_shape = (self.domain_length * self.peptide_length, self.basis_size) + basis_weights_shape = (self.basis_size, self.n_amino_acids ** 2) + + ## NOTE: This is one of the major changes. Here, we define separate weights and basis functions. Multiplied below. + self.interaction_weights = tf.Variable(np.random.normal(scale=initialization_std, size=interact_weights_shape), + name="interaction_weights", dtype=tf.float64) + if self.input_basis is not None: + # Load basis weights from pre-loaded option + basis = np.load(self.input_basis) + self.basis_weights = tf.Variable(basis, name='basis_weights', dtype=tf.float64, trainable=self.train_basis) + else: + # Randomly initialize basis weights + self.basis_weights = tf.Variable(np.random.normal(scale=initialization_std, size=basis_weights_shape), + name="basis_weights", dtype=tf.float64, trainable=self.train_basis) + + self.bias = tf.Variable(0, name="bias", dtype=tf.float64) + + def _model(self, args): + """ + Defines model graph. Shouldn't be called outside of class (called from __init__). + """ + with tf.name_scope("sparsity"): + # L1-sparsity penalty. + self.weight_penalty = (self.lambda_param * tf.reduce_sum(tf.abs(self.domain_weights)) + + self.lambda_param * tf.reduce_sum(tf.abs(self.peptide_weights)) + + self.lambda_param * tf.reduce_sum(tf.abs(self.interaction_weights))) + + with tf.name_scope("logits"): + # Basic model, computes logits from model weights. + domain_contrib = tf.sparse.sparse_dense_matmul(self.domain_data, self.domain_weights) + peptide_contrib = tf.sparse.sparse_dense_matmul(self.peptide_data, self.peptide_weights) + + ## NOTE: The other major change. Multiply together the weights and basis functions. + _interact_weights = tf.expand_dims(tf.reshape(tf.matmul(self.interaction_weights, self.basis_weights), [-1]), -1) + interact_contrib = tf.sparse.sparse_dense_matmul(self.interact_data, _interact_weights) + + self.logits = domain_contrib + peptide_contrib + interact_contrib + self.bias + + # Loss Function computation and optimization. + self.cross_entropy = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=self.binding_data, logits=self.logits, name="cross_entropy")) + self.loss_function = self.cross_entropy + self.weight_penalty + + self.train_op = self.optimizer.minimize(self.loss_function) + + # Ops for computing predictions. + self.predict_op = tf.math.sigmoid(self.logits, name="predict_sigmoid") + + def optimize_step(self, data_iter): + """ + A single training epoch. Passes in an iterator over data and + + args: + - data_iter: a data iterator feeding in data of the form (sequence_tuple, binds_arr) + sequence_tuple is ordered[domain sequences, peptide sequences, interact sequences] + """ + costs = list() + for sequences, binds in data_iter: + # Convert to sparse tensors for use in the model. + dseqs_sp, pseqs_sp, iseqs_sp = utils.make_sparse(sequences, self.domain_length, self.peptide_length, self.n_amino_acids) + + # Using feed_dicts is a bit unfortunate. It cleans up the data file handling on the + # front end. 
To optimize, switch over to TensorFlow's data processing pipeline.
+            feed_dict = {
+                self.domain_data: dseqs_sp,
+                self.peptide_data: pseqs_sp,
+                self.interact_data: iseqs_sp,
+                self.binding_data: np.expand_dims(binds, -1)
+            }
+
+            # Compute step.
+            train_cost, _ = self.sess.run([self.loss_function, self.train_op], feed_dict=feed_dict)
+            costs.append(train_cost)
+
+        return costs
+
+    def predict(self, data):
+        """
+        Make a single prediction.
+
+        args:
+            - data: a data tuple of the form InteractionDataTuple as defined in utils.py
+        """
+        binds, sequences = data[0], list(data[1:])
+
+        # Convert to sparse tensors.
+        dseqs_sp, pseqs_sp, iseqs_sp = utils.make_sparse(sequences, self.domain_length, self.peptide_length, self.n_amino_acids)
+
+        feed_dict = {
+            self.domain_data: dseqs_sp,
+            self.peptide_data: pseqs_sp,
+            self.interact_data: iseqs_sp,
+            self.binding_data: np.expand_dims(binds, -1)
+        }
+
+        predictions = self.sess.run(self.predict_op, feed_dict=feed_dict)
+        roc_auc = roc_auc_score(binds, predictions)
+        return binds, predictions, roc_auc
+
+    def train(self, data_directory):
+        """
+        Fully train the model given the input parameters. Runs for the input / default number of epochs.
+        Mostly wraps the optimize_step function.
+
+        args:
+            - data_directory: the input data directory. The data is split using the input metadata.
+        """
+        train_data, test_data = utils.split_data(data_directory, self.validation_chunk, include_all=self.include_all_data, excluded_chunks=self.exclude_indices, n_folds=self.n_folds, seed=self.data_split_seed)
+        data_iterator = utils.training_iterator(train_data, chunk_size=self.chunk_size)
+
+        self.costs = list()
+        for epoch in tqdm(range(self.epochs), desc="Epochs"):
+            epoch_costs = self.optimize_step(next(data_iterator))
+            self.costs.append(epoch_costs)
+
+            if (epoch + 1) % self.validate_step == 0:
+                _, _, auc = self.predict(test_data)
+                print("AUC: {0} (Epoch: {1})".format(auc, epoch + 1))
+
+        self.final_predictions = self.predict(test_data)
+
+    def save_model(self, output_directory):
+        """
+        Save model outputs from a trained model. The output files are of the form: {id}.{spec}.{ext}.
+        The id is defined as the current time (as a string). The spec defines the output, either metadata
+        or the domain type of the output. The ext is the file extension, either a numpy archive or a json file.
+
+        args:
+            - output_directory: a string defining a directory to output the saved model to.
+        """
+        import time
+
+        model_output_id = str(time.time()).replace("-", "_")
+
+        # Metadata. Saved as a dict (to a JSON file) with four keys defining the model type,
+        # model format, the results of the last prediction, and parameters.
+        results = {
+            "binds": list(map(int, self.final_predictions[0])),
+            "predictions": list(map(float, self.final_predictions[1])),
+            "auc": float(self.final_predictions[2])
+            }
+        output = {
+                "Model Type": "HSM / D",
+                "Model Format": list(self.model_spec),
+                "results": results,
+                "parameters": {
+                    "costs": self.costs,
+                    "validation chunk": self.validation_chunk,
+                    constants.KEY_chunk_seed: self.data_split_seed,
+                    constants.KEY_n_folds: self.n_folds,
+                    constants.KEY_epochs: self.epochs,
+                    constants.KEY_learning_rate: self.learning_rate
+                    }
+                }
+
+        ometadata_file = os.path.join(output_directory, "{0}.metadata.json".format(model_output_id))
+        with open(ometadata_file, 'w+') as f:
+            json.dump(output, f)
+
+        # The model, saved as an array archive (.npz) file.
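+        # Archive keys (shapes from _variables): domain_weights (domain_length * n_amino_acids, 1),
+        # peptide_weights (peptide_length * n_amino_acids, 1), interact_weights (domain_length * peptide_length, basis_size),
+        # basis (basis_size, n_amino_acids ** 2), and the scalar bias.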
+ model_file = os.path.join(output_directory, "{0}.{1}.{2}.npz".format(model_output_id, self.model_spec.domain_type, self.model_spec.peptide_type)) + dw, pw, iw, basis, bias = self.sess.run([self.domain_weights, self.peptide_weights, self.interaction_weights, self.basis_weights, self.bias]) + np.savez(model_file, domain_weights=dw, peptide_weights=pw, interact_weights=iw, basis=basis, bias=bias) diff --git a/train/models/hsm_id.py b/train/models/hsm_id.py new file mode 100644 index 0000000..a81364c --- /dev/null +++ b/train/models/hsm_id.py @@ -0,0 +1,229 @@ +import os +import json +from collections import * +from itertools import * +from tqdm import tqdm as tqdm + +import numpy as np +import tensorflow as tf +from sklearn.metrics import roc_auc_score + +from . import utils +from . import constants + +class HSMIndependentDomainsModel(object): + """ + Single-domain model implementing the HSM / ID model. + + Most arguments are passed into the model using the args initialization option. See constants.py for the + function for the keywords to use to set different options values. + + The required arguments are: + - validation_chunk: an index that is validated against. + - model_format: a ModelSpecificationTuple, as defined in constants.py + - lambda_params: a nested python dictionary; nested for domain type then peptide type. + - amino_acid_ordering: a dict that indexes the vectorized data. + - args: all additional arguments; see constants.py + """ + + def __init__(self, validation_chunk, model_format, lambda_params, amino_acid_ordering, args=dict()): + ## model_format inputs are namedtuples: + # ModelSpecificationTuple = namedtuple("ModelSpecificationTuple", ["domain_type", "peptide_type", "domain_length", "peptide_length", "directory"]) + self.model_spec = model_format + self.domain_length = model_format.domain_length + self.peptide_length = model_format.peptide_length + self.n_amino_acids = len(amino_acid_ordering) + + # Set model hyper-parameters. Defaults to constants in the case where + self.epochs = args.get(constants.KEY_epochs, constants.DEF_epochs) + self.data_split_seed = args.get(constants.KEY_chunk_seed, constants.DEF_fold_seed) + self.n_folds = args.get(constants.KEY_n_folds, constants.DEF_n_folds) + self.chunk_size = args.get(constants.KEY_chunk_size, constants.DEF_chunk_size) + self.learning_rate = args.get(constants.KEY_learning_rate, constants.DEF_learning_rate) + self.init_std = args.get(constants.KEY_standard_dev, constants.DEF_init_std) + self.lambda_param = lambda_params[model_format.domain_type][model_format.peptide_type] + + self.validation_chunk = validation_chunk + self.validate_step = args.get(constants.KEY_validate_step, constants.DEF_validate_step) + self.exclude_indices = args.get(constants.KEY_exclude_chunks, None) + self.include_all_data = args.get(constants.KEY_include_all_chunks, False) + + # Initialize tensorflow graph. + self.sess = tf.Session() + self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate) + + self._variables(self.init_std) + + self._model(args) + + init_op = tf.group(tf.local_variables_initializer(), tf.global_variables_initializer()) + self.sess.run(init_op) + + + def _variables(self, initialization_std): + """ + Intializes variable. Shouldn't be called outside of the class (called from __init__). + """ + # Placeholders for different input data. 
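+        # Each sparse placeholder is fed a tf.SparseTensorValue built by utils.make_sparse from
+        # the padded index arrays written by convert_binding_data.py.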
+ self.domain_data = tf.sparse.placeholder(dtype=tf.float64, name="domain_data_mtx") + self.peptide_data = tf.sparse.placeholder(dtype=tf.float64, name="peptide_data_mtx") + self.interact_data = tf.sparse.placeholder(dtype=tf.float64, name="interaction_data_mtx") + self.binding_data = tf.placeholder(dtype=tf.float64, name="binding_data_mtx") + + # Weights variables to be trained in the model. + domain_weights_shape = (self.domain_length * self.n_amino_acids, 1) + self.domain_weights = tf.Variable(np.random.normal(scale=initialization_std, size=domain_weights_shape), + name="domain_weights", dtype=tf.float64) + + peptide_weights_shape = (self.peptide_length * self.n_amino_acids, 1) + self.peptide_weights = tf.Variable(np.random.normal(scale=initialization_std, size=peptide_weights_shape), + name="peptide_weights", dtype=tf.float64) + + interaction_weights_shape = (self.domain_length * self.peptide_length * (self.n_amino_acids ** 2), 1) + self.interaction_weights = tf.Variable(np.random.normal(scale=initialization_std, size=interaction_weights_shape), + name="interaction_weights", dtype=tf.float64) + + self.bias = tf.Variable(0, name="bias", dtype=tf.float64) + + def _model(self, args): + """ + Defines model graph. Shouldn't be called outside of class (called from __init__). + """ + with tf.name_scope("sparsity"): + # L1-sparsity penalty. + self.weight_penalty = (self.lambda_param * tf.reduce_sum(tf.abs(self.domain_weights)) + + self.lambda_param * tf.reduce_sum(tf.abs(self.peptide_weights)) + + self.lambda_param * tf.reduce_sum(tf.abs(self.interaction_weights))) + + with tf.name_scope("logits"): + # Basic model, computes logits from model weights. + domain_contrib = tf.sparse.sparse_dense_matmul(self.domain_data, self.domain_weights) + peptide_contrib = tf.sparse.sparse_dense_matmul(self.peptide_data, self.peptide_weights) + interact_contrib = tf.sparse.sparse_dense_matmul(self.interact_data, self.interaction_weights) + + self.logits = domain_contrib + peptide_contrib + interact_contrib + self.bias + + # Loss Function computation and optimization. + self.cross_entropy = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=self.binding_data, logits=self.logits, name="cross_entropy")) + self.loss_function = self.cross_entropy + self.weight_penalty + + self.train_op = self.optimizer.minimize(self.loss_function) + + # Ops for computing predictions. + self.predict_op = tf.math.sigmoid(self.logits, name="predict_sigmoid") + + def optimize_step(self, data_iter): + """ + A single training epoch. Passes in an iterator over data and + + args: + - data_iter: a data iterator feeding in data of the form (sequence_tuple, binds_arr) + sequence_tuple is ordered[domain sequences, peptide sequences, interact sequences] + """ + costs = list() + for sequences, binds in data_iter: + # Convert to sparse tensors for use in the model. + dseqs_sp, pseqs_sp, iseqs_sp = utils.make_sparse(sequences, self.domain_length, self.peptide_length, self.n_amino_acids) + + # Using feed_dicts is a bit unfortunate. It cleans up the data file handling on the + # front end. To optimize, switch over to TensorFlow's data processing pipeline. + feed_dict = { + self.domain_data: dseqs_sp, + self.peptide_data: pseqs_sp, + self.interact_data: iseqs_sp, + self.binding_data: np.expand_dims(binds, -1) + } + + # Compute step. + train_cost, _ = self.sess.run([self.loss_function, self.train_op], feed_dict=feed_dict) + costs.append(train_cost) + + return costs + + def predict(self, data): + """ + Make a single prediction. 
+
+        args:
+            - data: a data tuple of the form InteractionDataTuple as defined in utils.py
+        """
+        binds, sequences = data[0], list(data[1:])
+
+        # Convert to sparse tensors.
+        dseqs_sp, pseqs_sp, iseqs_sp = utils.make_sparse(sequences, self.domain_length, self.peptide_length, self.n_amino_acids)
+
+        feed_dict = {
+            self.domain_data: dseqs_sp,
+            self.peptide_data: pseqs_sp,
+            self.interact_data: iseqs_sp,
+            self.binding_data: np.expand_dims(binds, -1)
+        }
+
+        predictions = self.sess.run(self.predict_op, feed_dict=feed_dict)
+        roc_auc = roc_auc_score(binds, predictions)
+        return binds, predictions, roc_auc
+
+    def train(self, data_directory):
+        """
+        Fully train the model given the input parameters. Runs for the input / default number of epochs.
+        Mostly wraps the optimize_step function.
+
+        args:
+            - data_directory: the input data directory. The data is split using the input metadata.
+        """
+        train_data, test_data = utils.split_data(data_directory, self.validation_chunk, include_all=self.include_all_data, excluded_chunks=self.exclude_indices, n_folds=self.n_folds, seed=self.data_split_seed)
+        data_iterator = utils.training_iterator(train_data, chunk_size=self.chunk_size)
+
+        self.costs = list()
+        for epoch in tqdm(range(self.epochs), desc="Epochs"):
+            epoch_costs = self.optimize_step(next(data_iterator))
+            self.costs.append(epoch_costs)
+
+            if (epoch + 1) % self.validate_step == 0:
+                _, _, auc = self.predict(test_data)
+                print("AUC: {0} (Epoch: {1})".format(auc, epoch + 1))
+
+        self.final_predictions = self.predict(test_data)
+
+    def save_model(self, output_directory):
+        """
+        Save model outputs from a trained model. The output files are of the form: {id}.{spec}.{ext}.
+        The id is defined as the current time (as a string). The spec defines the output, either metadata
+        or the domain type of the output. The ext is the file extension, either a numpy archive or a json file.
+
+        args:
+            - output_directory: a string defining a directory to output the saved model to.
+        """
+        import time
+
+        model_output_id = str(time.time()).replace("-", "_")
+
+        # Metadata. Saved as a dict (to a JSON file) with four keys defining the model type,
+        # model format, the results of the last prediction, and parameters.
+        results = {
+            "binds": list(map(int, self.final_predictions[0])),
+            "predictions": list(map(float, self.final_predictions[1])),
+            "auc": float(self.final_predictions[2])
+            }
+        output = {
+                "Model Type": "HSM / ID",
+                "Model Format": list(self.model_spec),
+                "results": results,
+                "parameters": {
+                    "costs": self.costs,
+                    "validation chunk": self.validation_chunk,
+                    constants.KEY_chunk_seed: self.data_split_seed,
+                    constants.KEY_n_folds: self.n_folds,
+                    constants.KEY_epochs: self.epochs,
+                    constants.KEY_learning_rate: self.learning_rate
+                    }
+                }
+
+        ometadata_file = os.path.join(output_directory, "{0}.metadata.json".format(model_output_id))
+        with open(ometadata_file, 'w+') as f:
+            json.dump(output, f)
+
+        # The model, saved as an array archive (.npz) file.
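+        # Archive keys (shapes from _variables): domain_weights (domain_length * n_amino_acids, 1),
+        # peptide_weights (peptide_length * n_amino_acids, 1),
+        # interact_weights (domain_length * peptide_length * n_amino_acids ** 2, 1), and the scalar bias.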
+ model_file = os.path.join(output_directory, "{0}.{1}.{2}.npz".format(model_output_id, self.model_spec.domain_type, self.model_spec.peptide_type)) + dw, pw, iw, bias = self.sess.run([self.domain_weights, self.peptide_weights, self.interaction_weights, self.bias]) + np.savez(model_file, domain_weights=dw, peptide_weights=pw, interact_weights=iw, bias=bias) diff --git a/train/models/utils.py b/train/models/utils.py new file mode 100644 index 0000000..1c14ecc --- /dev/null +++ b/train/models/utils.py @@ -0,0 +1,83 @@ +import os +from collections import * +from itertools import * + +import numpy as np +import tensorflow as tf + +from . import constants + +binding_data = "binding.npy" +domain_sequences = "dseq_mtx.npy" +peptide_sequences = "pseq_mtx.npy" +interaction_sequences = "iseqs_mtx.npy" + +InteractionDataTuple = namedtuple("InteractionDataTuple", ["binds", "domain_seqs", "peptide_seqs", "interaction_seqs"]) +def split_data(data_directory, validation_chunk, include_all=False, excluded_chunks=None, n_folds=constants.DEF_n_folds, seed=constants.DEF_fold_seed): + files = [constants.binding_file, constants.domain_file, constants.peptide_file, constants.interaction_file] + data = [np.load(os.path.join(data_directory, f)) for f in files] + + data_sz = data[0].shape[0] + + np.random.seed(seed) + randomized = np.random.permutation(data_sz) + chunks = np.array_split(randomized, n_folds) + + vindices = chunks[validation_chunk] + + validation_data = InteractionDataTuple(*[mtx[vindices] for mtx in data]) + + if include_all: + _excluded = [] + else: + _excluded = [validation_chunk, *(excluded_chunks if excluded_chunks is not None else [])] + + tindices = [i for cidx, chunk in enumerate(chunks) for i in chunk if cidx not in _excluded] + train_data = InteractionDataTuple(*[mtx[tindices] for mtx in data]) + + np.random.seed() + + return train_data, validation_data + +def _data_chunker(data, nchunks): + randomized = np.random.permutation(data.binds.shape[0]) + splits = np.array_split(randomized, nchunks) + + for spl in splits: + binds_spl = data.binds[spl] + seqs_spl = [s[spl] for s in data[1:]] + + yield seqs_spl, binds_spl + +def training_iterator(interaction_data, chunk_size=None, nchunks=None): + if chunk_size is None and nchunks is None: + raise ValueError("Need to pass a chunk size or num. of chunks") + + if nchunks is None: + nchunks = interaction_data.binds.shape[0] // chunk_size + 1 + + while True: + yield _data_chunker(interaction_data, nchunks) + +def make_sparse(sequences, domain_length, peptide_length, n_amino_acids): + def _sparsify(arr, ncols): + nrows = arr.shape[0] + + ridxes, compressed_cidxes = np.where(arr >= 0) + cidxes = arr[ridxes, compressed_cidxes] + + vals = np.ones(ridxes.size) + + idxes = np.vstack([ridxes, cidxes]).T + + shape = [nrows, ncols] + + return tf.SparseTensorValue( + indices = idxes, + values = vals, + dense_shape = shape + ) + + col_sizes = [domain_length * n_amino_acids, peptide_length * n_amino_acids, domain_length * peptide_length * n_amino_acids * n_amino_acids] + return [_sparsify(m, ncols) for m, ncols in zip(sequences, col_sizes)] + diff --git a/train/train.py b/train/train.py new file mode 100644 index 0000000..dff4f50 --- /dev/null +++ b/train/train.py @@ -0,0 +1,201 @@ +""" +This file provides a CLI for running models with a variety of possible +options. + +Note: you can also import the models directly and run them. 
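+For example (a sketch; model_format, lambda_params, and aa_order are constructed as in get_model below):
+
+    import models
+    model = models.hsm_id.HSMIndependentDomainsModel(0, model_format, lambda_params, aa_order)
+    model.train(data_directory)
+    model.save_model(output_directory)
+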
+ +The easiest way to understand these config options is likely by running the +built-in help method, i.e.: + $ python /train.py --help +""" +import os +import csv +import argparse + +from collections import * +from itertools import * + +import models +from models import constants as constants + +def create_parser(): + parser = argparse.ArgumentParser(description="A wrapper to specify, train, and assess different models. " + + "Models are implemented primarily in the models/ directory and may be accessed as a package." + ) + + parser.add_argument("validation_chunk_idx", type=int, default=0, help="Validation chunk") + + model_spec_group = parser.add_argument_group(title="Model Specification") + + # Specificy model type + model_group = model_spec_group.add_mutually_exclusive_group(required=True) + model_group.add_argument("--hsm_id", action="store_true", default=False, + help="Train the HSM/ID model. Domain specification must specify a single domain." ) + model_group.add_argument("--hsm_d", action="store_true", + help="Train the HSM/D model. Domain specification may include one or multiple domains.") + model_spec_group.add_argument("-d", "--domains", nargs='+', type=str, + help="Specify a domain type for training a new model. To specify a specific domain - peptide type, pass with a ':' character separating types.") + model_spec_group.add_argument("-a", "--include_all_domains", action='store_true', default=False, + help="Include all possible domains for which data are available") + + hyperparam_group = parser.add_argument_group(title="Learning hyperparameters") + hyperparam_group.add_argument("-l", "--lambdap", type=float, default=constants.DEF_lambda, + help="Sparsity parametrization parameter. Default: {0}".format(str(constants.DEF_lambda))) + hyperparam_group.add_argument("--lambda_params", type=str, nargs='+', + help="Sparsity parametrization per-domain, passed as a string of format {domain}:{lambda} or {domain}:{peptide}:{lambda} where each choice" + + "is comma-separated. Default: {0}".format(str(constants.DEF_lambda))) + hyperparam_group.add_argument("-r", "--learning_rate", type=float, default=constants.DEF_learning_rate, + help="Add an initial learning rate. Default: {0}".format(str(constants.DEF_learning_rate))) + hyperparam_group.add_argument("-s", "--standard_deviation", type=float, default=constants.DEF_init_std, + help="Set the initialization standard dev. Default: {0}".format(str(constants.DEF_init_std))) + hyperparam_group.add_argument("-e", "--epochs", type=int, default=constants.DEF_epochs, + help="Max number of epochs to run the strategy for. Default: {0}".format(str(constants.DEF_epochs))) + hyperparam_group.add_argument("-v", "--validate_step", type=int, default=constants.DEF_validate_step, + help="The step number to run validation on. Default: {0}.".format(str(constants.DEF_validate_step))) + + data_handling_group = parser.add_argument_group(title="Data handling group") + data_handling_group.add_argument("--chunk_seed", type=int, default=0, + help="Seed used to synchronize chunks across a single split. Use the same seed to train on different chunks.") + data_handling_group.add_argument("--exclude_chunks", nargs="+", type=int, + help="Exclude specific chunks from training or testing. Helpful for excluding a test chunk") + data_handling_group.add_argument("--include_all_data", action='store_true', default=False, + help='Use all data in training. 
Used at the end to get final predictions.')
+    data_handling_group.add_argument("-c", "--chunk_size", type=int, default=constants.DEF_chunk_size,
+            help="The maximum chunk size for any iteration. The actual chunk size is at most this; the number of " +
+                 "chunks is the amount of data divided by this size, plus one. Default: {0}.".format(str(constants.DEF_chunk_size)))
+    data_handling_group.add_argument("--n_folds", type=int, default=constants.DEF_n_folds,
+            help="Number of chunks to split the dataset into. Default: {0}".format(constants.DEF_n_folds))
+
+    # Input / output data processing.
+    input_output_group = parser.add_argument_group(title="Input/output group")
+    input_output_group.add_argument("-i", "--input_directory", type=str, default="../data/train/hsm_preprocessed/",
+            help="Input data directory. The form of this directory should be the same as output by convert_binding_data.py")
+    input_output_group.add_argument("-o", "--output_directory", type=str, default="outputs/",
+            help="Location to output data to.")
+    input_output_group.add_argument("--amino_acids_ordering", type=str, default="../data/amino_acid_ordering.txt",
+            help="Amino acid ordering file. Allows the user to add new amino acids (e.g. phospho-ser/thr).")
+
+    # Model specific arguments:
+    basis_spec_group = parser.add_argument_group(title="Basis Specification")
+    basis_spec_group.add_argument("-b", "--basis_size", type=int, default=constants.DEF_basis_size,
+            help="Set the hierarchical/shared-basis size. Default: {0}".format(str(constants.DEF_basis_size)))
+    basis_spec_group.add_argument("--load_basis", type=str,
+            help="Run the model w/ a pre-defined basis. The basis is passed as a numpy file.")
+    basis_spec_group.add_argument("--no_train", action="store_true", default=False,
+            help="Makes the basis set non-trainable for a given initialization.")
+
+    return parser
+
+def format_arguments_directory(options):
+    arguments = dict()
+
+    arguments[constants.KEY_learning_rate] = options.learning_rate
+    arguments[constants.KEY_standard_dev] = options.standard_deviation
+    arguments[constants.KEY_epochs] = options.epochs
+    arguments[constants.KEY_validate_step] = options.validate_step
+
+    arguments[constants.KEY_chunk_seed] = options.chunk_seed
+    arguments[constants.KEY_exclude_chunks] = options.exclude_chunks
+    arguments[constants.KEY_include_all_chunks] = options.include_all_data
+    arguments[constants.KEY_chunk_size] = options.chunk_size
+
+    arguments[constants.KEY_basis_size] = options.basis_size
+    arguments[constants.KEY_input_basis] = options.load_basis
+    # --no_train is a negative flag; the models expect a positive "train basis" setting.
+    arguments[constants.KEY_train_basis] = not options.no_train
+
+    arguments[constants.KEY_output_directory] = options.output_directory
+    arguments[constants.KEY_n_folds] = options.n_folds
+
+    arguments = {k:v for k,v in arguments.items() if v is not None}
+
+    return arguments
+
+
+def get_model(args, opts):
+    def _match_model(dinput, possible_models):
+        if ":" in dinput:
+            dtype, ptype = dinput.split(":")
+        else:
+            dtype = dinput
+            ptype = None
+
+        assert dtype in possible_models
+
+        if ptype is None:
+            assert len(possible_models[dtype]) == 1
+            return next(iter(possible_models[dtype].values()))
+        else:
+            assert ptype in possible_models[dtype]
+            return possible_models[dtype][ptype]
+
+    model_formats_ifile = os.path.join(opts.input_directory, constants.model_specification_file)
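+
+    # models_specification.csv is written by convert_binding_data.py; each row has the form
+    # Domain-Type,Peptide-Type,Domain-Length,Peptide-Length,Directory (for example, illustratively: SH2,peptide,57,10,SH2_peptide).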
+
+    all_models = list()
+    possible_models = defaultdict(dict)
+    for dtype, ptype, dlen, plen, idir in csv.reader(open(model_formats_ifile, 'r'), delimiter=','):
+        mformat = constants.ModelSpecificationTuple(dtype, ptype, int(dlen), int(plen), idir)
+        possible_models[dtype][ptype] = mformat
+        all_models.append(mformat)
+
+    def _process_lambdas(lambda_param, lambda_params, possible_models=possible_models):
+        lambda_dirs = defaultdict(dict)
+        if lambda_params is None:
+            for dtype, model_types in possible_models.items():
+                lambda_dirs[dtype] = {ptype:lambda_param for ptype in possible_models[dtype].keys()}
+        else:
+            for arg in lambda_params:
+                spl = arg.split(":")
+                dtype, lp = spl[0], float(spl[-1])
+
+                if len(spl) == 2:
+                    lambda_dirs[dtype] = {ptype:lp for ptype in possible_models[dtype].keys()}
+                elif len(spl) == 3:
+                    ptype = spl[1]
+                    lambda_dirs[dtype][ptype] = lp
+                else:
+                    raise ValueError("Lambdas mis-specified")
+
+        for dtype, model_types in possible_models.items():
+            for ptype in model_types.keys():
+                if ptype not in lambda_dirs[dtype]:
+                    lambda_dirs[dtype][ptype] = constants.DEF_lambda
+
+        return lambda_dirs
+
+    lambda_params_dct = _process_lambdas(opts.lambdap, opts.lambda_params)
+
+    if opts.hsm_id:
+        assert opts.domains is not None and len(opts.domains) == 1
+
+        model_formats = _match_model(opts.domains[0], possible_models)
+        model_type = models.hsm_id.HSMIndependentDomainsModel
+        input_directories = os.path.join(opts.input_directory, model_formats.directory)
+    elif opts.include_all_domains:
+        model_formats = all_models
+        model_type = models.hsm_d.HSMDomainsModel
+        input_directories = opts.input_directory
+    elif len(opts.domains) == 1:
+        model_formats = _match_model(opts.domains[0], possible_models)
+        model_type = models.hsm_d_singledomain.HSMSingleDomainsModel
+        input_directories = os.path.join(opts.input_directory, model_formats.directory)
+    else:
+        model_formats = [_match_model(d, possible_models) for d in opts.domains]
+        model_type = models.hsm_d.HSMDomainsModel
+        input_directories = opts.input_directory
+
+    amino_acid_ordering = {aa.strip():idx for idx, aa in enumerate(open(opts.amino_acids_ordering, 'r'))}
+
+    model = model_type(opts.validation_chunk_idx, model_formats, lambda_params_dct, amino_acid_ordering, args)
+    return model, input_directories
+
+if __name__=='__main__':
+    parser = create_parser()
+    options = parser.parse_args()
+    assert os.path.exists(options.input_directory) and os.path.exists(options.output_directory) and os.path.exists(options.amino_acids_ordering)
+
+    arguments = format_arguments_directory(options)
+
+    model, idirs = get_model(arguments, options)
+    model.train(idirs)
+    model.save_model(options.output_directory)
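+
+# Example invocation (domain name illustrative; it must match a type in the processed data):
+#   python train.py 0 -d SH2 --hsm_d -i ../data/train/hsm_preprocessed/ -o outputs/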