diff --git a/predict/README.md b/predict/README.md index ca105d6..c6ecd83 100644 --- a/predict/README.md +++ b/predict/README.md @@ -4,7 +4,7 @@ This directory may be used to make novel domain-peptide or protein-protein inter # Usage -Predictions are run through one of two scripts, `predict_domains.py` and `predict_proteins.py`, for either domain-peptide or protein-protein predictions. All domain-peptide interactions that comprise a protein-protein prediction are output from the `predict_proteins.py` script. +Predictions are run through one of two scripts, `predict_domains.py` and `predict_proteins.py`, for either domain-peptide or protein-protein predictions. All domain-peptide interactions that comprise a protein-protein prediction are output from the `predict_proteins.py` script. By default, data is output to an `outputs/` directory; please create this directory before running. ## Domain-Peptide Interaction Predictions diff --git a/predict/predict_domains.py b/predict/predict_domains.py index 68a69a7..a73587b 100644 --- a/predict/predict_domains.py +++ b/predict/predict_domains.py @@ -2,7 +2,7 @@ import pickle, csv, json from collections import * from itertools import * -import tqdm +from tqdm import tqdm as tqdm import numpy as np @@ -11,7 +11,7 @@ # Models are specified using the named tuple (in utils/load_models.py): # ModelsSpecificationFormat = namedtuple("ModelsSpecificationFormat", ["models_format", "models"]) -def predict_domain_peptide_interactions(domain_file, peptide_file, models, output_file): +def predict_domain_peptide_interactions(domain_file, peptide_file, models, output_file, progressbar=False): """ Predicts domain peptide interactions between the domain and peptide metadata files. """ @@ -22,19 +22,19 @@ def predict_domain_peptide_interactions(domain_file, peptide_file, models, outpu with open(output_file, 'w+') as ofile: writer = csv.writer(ofile, delimiter=',') - for (did, dseq, dtype), (pid, pseq, ptype, mtype) in product(domain_metadata, peptide_metadata): - print(dtype, ptype) + total = len(domain_metadata) * len(peptide_metadata) + for (did, dseq, dtype), (pid, pseq, ptype, mtype) in tqdm(product(domain_metadata, peptide_metadata), disable=(not progressbar), desc="Pairwise Domain-peptide iteration", total=total): if dtype in models.models and ptype in models.models[dtype]: v = models.models[dtype][ptype](dseq, pseq) writer.writerow([dtype, did, dseq, ptype, pid, pseq, v]) - print(dtype, ptype, v) if __name__=='__main__': import argparse parser = argparse.ArgumentParser() parser.add_argument("domains_metadata", type=str) parser.add_argument("peptides_metadata", type=str) - parser.add_argument("-o", "--output_likelihoods", type=str, default="domain_peptide_predictions.csv") + parser.add_argument("-o", "--output_likelihoods", type=str, default="outputs/domain_peptide_predictions.csv") + parser.add_argument("-p", "--progressbar", action='store_true', default=False) model_specification_group = parser.add_argument_group(title="Trained models specification") model_specification_group.add_argument("-m", "--models", type=str, default="models/hsm_pretrained/", @@ -48,5 +48,4 @@ def predict_domain_peptide_interactions(domain_file, peptide_file, models, outpu opts = parser.parse_args() models_specification = utils.load_models.load_models_from_dir(opts.models, opts.model_format, opts.amino_acids_order) - print(models_specification) - predict_domain_peptide_interactions(opts.domains_metadata, opts.peptides_metadata, models_specification, opts.output_likelihoods) + predict_domain_peptide_interactions(opts.domains_metadata, opts.peptides_metadata, models_specification, opts.output_likelihoods, progressbar=opts.progressbar) diff --git a/predict/predict_proteins.py b/predict/predict_proteins.py index 86c0139..27e8ffc 100644 --- a/predict/predict_proteins.py +++ b/predict/predict_proteins.py @@ -2,7 +2,7 @@ import pickle, csv, json from collections import * from itertools import * -import tqdm +from tqdm import tqdm as tqdm import numpy as np @@ -12,58 +12,67 @@ # ModelsSpecificationFormat = namedtuple("ModelsSpecificationFormat", ["models_format", "models"]) MetadataTuple = namedtuple("MetadataTuple", ["pairs", "domain_metadata", "peptide_metadata"]) -def load_metadata(domain_metadata_fpath, peptide_metadata_fpath, pretrained_models, ppi_pairs_fpath=None): +def load_metadata(domain_metadata_fpath, peptide_metadata_fpath, pretrained_models, ppi_pairs_fpath=None, progressbar=False): """ Loads input metadata from the pre-trained models. """ + domain_mdata = defaultdict(list) + for protein_id, dseq, dtype in csv.reader(open(domain_metadata_fpath, 'r')): + domain_mdata[protein_id].append((dtype, dseq)) - domain_mdata = pickle.load(open(domain_metadata_fpath, 'rb')) - peptide_mdata = pickle.load(open(peptide_metadata_fpath, 'rb')) + peptide_mdata = defaultdict(list) + for protein_id, pseq, ptype, _ in csv.reader(open(peptide_metadata_fpath, 'r')): + peptide_mdata[protein_id].append((ptype, pseq)) if ppi_pairs_fpath is None: pairs = set(tuple(sorted(t)) for t in product(set(domain_mdata.keys()), set(peptide_mdata.keys()))) else: pairs = set(tuple(sorted(r)) for r in csv.reader(open(ppi_pairs_fpath, 'r'))) - pairs = [t for t in pairs if utils.ppi_prediction.is_valid(*t, domain_mdata, peptide_mdata, pretrained_models)] + # Filters out invalid pairs. + pairs = [t for t in tqdm(pairs, disable=(not progressbar), desc="Validate pairs loop") if utils.ppi_prediction.is_valid(*t, domain_mdata, peptide_mdata, pretrained_models)] + + print("Computing for {0} valid pairs".format(len(pairs))) return MetadataTuple(pairs, domain_mdata, peptide_mdata) -def predict_interactions(metadata, models, output_ppis_fname, output_dpis_fname): +def predict_interactions(metadata, models, output_ppis_fname, output_dpis_fname, progressbar=False): """ Predictions protein-protein interactions. Mostly wraps utils/ppi_prediction.py. """ with open(output_ppis_fname, 'w+') as ppif, open(output_dpis_fname, 'w+') as dpif: writer = csv.writer(ppif, delimiter=',') - for ui, uj in metadata.pairs: + for ui, uj in tqdm(metadata.pairs, disable=(not progressbar), desc="PPI Prediction"): ppi_p, dpi_ps = utils.ppi_prediction.predict_ppi(ui, uj, metadata.domain_metadata, metadata.peptide_metadata, models) writer.writerow([ui,uj,ppi_p]) - dpif.write(json.dumps(dpi_ps) + "\n") + dpif.write(json.dumps([ui,uj,dpi_ps]) + "\n") if __name__=='__main__': import argparse parser = argparse.ArgumentParser() - parser.add_argument("-p", "--ppi_pairs", type=str, default=None) + parser.add_argument("--ppi_pairs", type=str, default=None) parser.add_argument("--domain_metadata", type=str, default="../data/predict/domain_metadata.csv") parser.add_argument("--peptide_metadata", type=str, default="../data/predict/peptide_metadata.csv") - parser.add_argument("--output_ppi_prediction", type=str, default="ppi_predictions.csv", + parser.add_argument("--output_ppi_prediction", type=str, default="outputs/ppi_predictions.csv", help="Output file for PPI predictions. Output is a csv file with format ,,.") - parser.add_argument("--output_dpi_prediction", type=str, default="dpi_predictions.txt", + parser.add_argument("--output_dpi_prediction", type=str, default="outputs/dpi_predictions.txt", help="Output file for domain-peptide predictions. Output is a text file with each row representing a " + "JSON string with domain-peptide interactions.") - + parser.add_argument("-p", "--progressbar", action='store_true', default=False) + model_specification_group = parser.add_argument_group(title="Trained models specification") model_specification_group.add_argument("-m", "--models", type=str, default="models/hsm_pretrained/", help="Defines a directory containing ." ) - model_specification_group.add_argument("--model_format", type=str, default="models/hsm_pretrained/models_format.csv", - help="Defines the parameters for each model. Default: models/hsm_pretrained_format.csv") - model_specification_group.add_argument("--amino_acids_order", type=str, default="models/amino_acids.txt" + model_specification_group.add_argument("--model_format", type=str, default="models/hsm_pretrained/model_formats.csv", + help="Defines the parameters for each model. Default: models/hsm_pretrained/model_formats.csv") + model_specification_group.add_argument("--amino_acids_order", type=str, default="models/amino_acid_ordering.txt", help="Define a different group (and order) of amino-acids. Needed to add a 'chemistry' type not currently used " + - "like phospho-serine/threonine. Default: models/amino_acids.txt.") + "like phospho-serine/threonine. Default: models/amino_acid_ordering.txt.") opts = parser.parse_args() models_specification = utils.load_models.load_models_from_dir(opts.models, opts.model_format, opts.amino_acids_order) - metadata_tuple = load_metadata(opts.domain_metadata, opts.peptide_metadata_fpath, models_specification, ppi_pairs_fpath=opts.ppi_pairs) - predict_interactions(metadata_tuple, models_specification, opts.output_ppi_prediction, opts.output_dpi_prediction) + metadata_tuple = load_metadata(opts.domain_metadata, opts.peptide_metadata, models_specification, ppi_pairs_fpath=opts.ppi_pairs, progressbar=opts.progressbar) + + predict_interactions(metadata_tuple, models_specification, opts.output_ppi_prediction, opts.output_dpi_prediction, progressbar=opts.progressbar) diff --git a/predict/utils/dpi_prediction.py b/predict/utils/dpi_prediction.py index 8c99c03..c7b0bc3 100644 --- a/predict/utils/dpi_prediction.py +++ b/predict/utils/dpi_prediction.py @@ -92,7 +92,7 @@ def _slide_interactions(dseq, pseq, weights, peptide_length, amino_acid_ordering Computes the likelihood of a sliding-window interaction. """ - likelihood_dist = np.array([_compute_interaction(dseq, pseq_subseq, weights, amino_acid_ordering) for pseq_subseq in emit_peptide_sequences(pseq, peptide_lengtht)]) + likelihood_dist = np.array([_compute_interaction(dseq, pseq_subseq, weights, amino_acid_ordering) for pseq_subseq in emit_peptide_sequences(pseq, peptide_length)]) no_bind = np.prod(1-likelihood_dist) @@ -101,6 +101,6 @@ def _slide_interactions(dseq, pseq, weights, peptide_length, amino_acid_ordering def domain_peptide_interaction_prediction(domain_sequence, peptide_sequence, weights, domain_length, peptide_length, is_fixed, amino_acid_ordering): if is_fixed: - return _compute_interaction(domain_sequence, pseq, weights, amino_acid_ordering) + return _compute_interaction(domain_sequence, peptide_sequence, weights, amino_acid_ordering) else: return _slide_interactions(domain_sequence, peptide_sequence, weights, peptide_length, amino_acid_ordering) diff --git a/predict/utils/load_models.py b/predict/utils/load_models.py index c6f83a8..62d9a9c 100644 --- a/predict/utils/load_models.py +++ b/predict/utils/load_models.py @@ -7,6 +7,14 @@ from . import dpi_prediction +""" +File contains functions for loading in all available models. +Primary function is load_models_from_dir which returns a nested dict mapping +domain type -> peptide type -> prediction function, taken from dpi_prediction. +This is output in the ModelsSpecificationFormat +""" + + def _load_model(model_fpath, domain_len, peptide_len, is_fixed, amino_acid_ordering): # TODO: load models arr_archive = np.load(model_fpath) diff --git a/predict/utils/ppi_prediction.py b/predict/utils/ppi_prediction.py index 9f3b87a..d60f9d1 100644 --- a/predict/utils/ppi_prediction.py +++ b/predict/utils/ppi_prediction.py @@ -11,10 +11,10 @@ def is_valid(i, j, domain_metadata, peptide_metadata, models): def _is_valid_oneway(domains, peptides): dtypes = set(d[0] for d in domains) valid_ptypes = set(p for d in dtypes for p in models.models[d].keys()) - - ptypes = set(p[0] for p in domains) - return len(ptypes.intersection(valid_ptypes)) > 0 + ptypes = set(p[0] for p in peptides) + return len(ptypes.intersection(valid_ptypes)) > 0 + return _is_valid_oneway(di,pj) or _is_valid_oneway(dj,pi) @timeout(2)