Skip to content

Commit

Permalink
Readability code update
Browse files Browse the repository at this point in the history
  • Loading branch information
jmcunnin committed Jan 6, 2020
1 parent 75af4ec commit 7096586
Show file tree
Hide file tree
Showing 6 changed files with 48 additions and 32 deletions.
2 changes: 1 addition & 1 deletion predict/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ This directory may be used to make novel domain-peptide or protein-protein inter

# Usage

Predictions are run through one of two scripts, `predict_domains.py` and `predict_proteins.py`, for either domain-peptide or protein-protein predictions. All domain-peptide interactions that comprise a protein-protein prediction are output from the `predict_proteins.py` script.
Predictions are run through one of two scripts, `predict_domains.py` and `predict_proteins.py`, for either domain-peptide or protein-protein predictions. All domain-peptide interactions that comprise a protein-protein prediction are output from the `predict_proteins.py` script. By default, data is output to an `outputs/` directory; please create this directory before running.

## Domain-Peptide Interaction Predictions

Expand Down
15 changes: 7 additions & 8 deletions predict/predict_domains.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import pickle, csv, json
from collections import *
from itertools import *
import tqdm
from tqdm import tqdm as tqdm

import numpy as np

Expand All @@ -11,7 +11,7 @@
# Models are specified using the named tuple (in utils/load_models.py):
# ModelsSpecificationFormat = namedtuple("ModelsSpecificationFormat", ["models_format", "models"])

def predict_domain_peptide_interactions(domain_file, peptide_file, models, output_file):
def predict_domain_peptide_interactions(domain_file, peptide_file, models, output_file, progressbar=False):
"""
Predicts domain peptide interactions between the domain and peptide metadata files.
"""
Expand All @@ -22,19 +22,19 @@ def predict_domain_peptide_interactions(domain_file, peptide_file, models, outpu
with open(output_file, 'w+') as ofile:
writer = csv.writer(ofile, delimiter=',')

for (did, dseq, dtype), (pid, pseq, ptype, mtype) in product(domain_metadata, peptide_metadata):
print(dtype, ptype)
total = len(domain_metadata) * len(peptide_metadata)
for (did, dseq, dtype), (pid, pseq, ptype, mtype) in tqdm(product(domain_metadata, peptide_metadata), disable=(not progressbar), desc="Pairwise Domain-peptide iteration", total=total):
if dtype in models.models and ptype in models.models[dtype]:
v = models.models[dtype][ptype](dseq, pseq)
writer.writerow([dtype, did, dseq, ptype, pid, pseq, v])
print(dtype, ptype, v)

if __name__=='__main__':
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("domains_metadata", type=str)
parser.add_argument("peptides_metadata", type=str)
parser.add_argument("-o", "--output_likelihoods", type=str, default="domain_peptide_predictions.csv")
parser.add_argument("-o", "--output_likelihoods", type=str, default="outputs/domain_peptide_predictions.csv")
parser.add_argument("-p", "--progressbar", action='store_true', default=False)

model_specification_group = parser.add_argument_group(title="Trained models specification")
model_specification_group.add_argument("-m", "--models", type=str, default="models/hsm_pretrained/",
Expand All @@ -48,5 +48,4 @@ def predict_domain_peptide_interactions(domain_file, peptide_file, models, outpu
opts = parser.parse_args()

models_specification = utils.load_models.load_models_from_dir(opts.models, opts.model_format, opts.amino_acids_order)
print(models_specification)
predict_domain_peptide_interactions(opts.domains_metadata, opts.peptides_metadata, models_specification, opts.output_likelihoods)
predict_domain_peptide_interactions(opts.domains_metadata, opts.peptides_metadata, models_specification, opts.output_likelihoods, progressbar=opts.progressbar)
45 changes: 27 additions & 18 deletions predict/predict_proteins.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import pickle, csv, json
from collections import *
from itertools import *
import tqdm
from tqdm import tqdm as tqdm

import numpy as np

Expand All @@ -12,58 +12,67 @@
# ModelsSpecificationFormat = namedtuple("ModelsSpecificationFormat", ["models_format", "models"])

MetadataTuple = namedtuple("MetadataTuple", ["pairs", "domain_metadata", "peptide_metadata"])
def load_metadata(domain_metadata_fpath, peptide_metadata_fpath, pretrained_models, ppi_pairs_fpath=None):
def load_metadata(domain_metadata_fpath, peptide_metadata_fpath, pretrained_models, ppi_pairs_fpath=None, progressbar=False):
"""
Loads input metadata from the pre-trained models.
"""
domain_mdata = defaultdict(list)
for protein_id, dseq, dtype in csv.reader(open(domain_metadata_fpath, 'r')):
domain_mdata[protein_id].append((dtype, dseq))

domain_mdata = pickle.load(open(domain_metadata_fpath, 'rb'))
peptide_mdata = pickle.load(open(peptide_metadata_fpath, 'rb'))
peptide_mdata = defaultdict(list)
for protein_id, pseq, ptype, _ in csv.reader(open(peptide_metadata_fpath, 'r')):
peptide_mdata[protein_id].append((ptype, pseq))

if ppi_pairs_fpath is None:
pairs = set(tuple(sorted(t)) for t in product(set(domain_mdata.keys()), set(peptide_mdata.keys())))
else:
pairs = set(tuple(sorted(r)) for r in csv.reader(open(ppi_pairs_fpath, 'r')))

pairs = [t for t in pairs if utils.ppi_prediction.is_valid(*t, domain_mdata, peptide_mdata, pretrained_models)]
# Filters out invalid pairs.
pairs = [t for t in tqdm(pairs, disable=(not progressbar), desc="Validate pairs loop") if utils.ppi_prediction.is_valid(*t, domain_mdata, peptide_mdata, pretrained_models)]

print("Computing for {0} valid pairs".format(len(pairs)))

return MetadataTuple(pairs, domain_mdata, peptide_mdata)

def predict_interactions(metadata, models, output_ppis_fname, output_dpis_fname):
def predict_interactions(metadata, models, output_ppis_fname, output_dpis_fname, progressbar=False):
"""
Predictions protein-protein interactions. Mostly wraps utils/ppi_prediction.py.
"""
with open(output_ppis_fname, 'w+') as ppif, open(output_dpis_fname, 'w+') as dpif:
writer = csv.writer(ppif, delimiter=',')

for ui, uj in metadata.pairs:
for ui, uj in tqdm(metadata.pairs, disable=(not progressbar), desc="PPI Prediction"):
ppi_p, dpi_ps = utils.ppi_prediction.predict_ppi(ui, uj, metadata.domain_metadata, metadata.peptide_metadata, models)
writer.writerow([ui,uj,ppi_p])
dpif.write(json.dumps(dpi_ps) + "\n")
dpif.write(json.dumps([ui,uj,dpi_ps]) + "\n")

if __name__=='__main__':
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("-p", "--ppi_pairs", type=str, default=None)
parser.add_argument("--ppi_pairs", type=str, default=None)
parser.add_argument("--domain_metadata", type=str, default="../data/predict/domain_metadata.csv")
parser.add_argument("--peptide_metadata", type=str, default="../data/predict/peptide_metadata.csv")
parser.add_argument("--output_ppi_prediction", type=str, default="ppi_predictions.csv",
parser.add_argument("--output_ppi_prediction", type=str, default="outputs/ppi_predictions.csv",
help="Output file for PPI predictions. Output is a csv file with format <ID 1>,<ID 2>,<PPI Likelihood>.")
parser.add_argument("--output_dpi_prediction", type=str, default="dpi_predictions.txt",
parser.add_argument("--output_dpi_prediction", type=str, default="outputs/dpi_predictions.txt",
help="Output file for domain-peptide predictions. Output is a text file with each row representing a " +
"JSON string with domain-peptide interactions.")

parser.add_argument("-p", "--progressbar", action='store_true', default=False)

model_specification_group = parser.add_argument_group(title="Trained models specification")
model_specification_group.add_argument("-m", "--models", type=str, default="models/hsm_pretrained/",
help="Defines a directory containing ." )
model_specification_group.add_argument("--model_format", type=str, default="models/hsm_pretrained/models_format.csv",
help="Defines the parameters for each model. Default: models/hsm_pretrained_format.csv")
model_specification_group.add_argument("--amino_acids_order", type=str, default="models/amino_acids.txt"
model_specification_group.add_argument("--model_format", type=str, default="models/hsm_pretrained/model_formats.csv",
help="Defines the parameters for each model. Default: models/hsm_pretrained/model_formats.csv")
model_specification_group.add_argument("--amino_acids_order", type=str, default="models/amino_acid_ordering.txt",
help="Define a different group (and order) of amino-acids. Needed to add a 'chemistry' type not currently used " +
"like phospho-serine/threonine. Default: models/amino_acids.txt.")
"like phospho-serine/threonine. Default: models/amino_acid_ordering.txt.")

opts = parser.parse_args()

models_specification = utils.load_models.load_models_from_dir(opts.models, opts.model_format, opts.amino_acids_order)
metadata_tuple = load_metadata(opts.domain_metadata, opts.peptide_metadata_fpath, models_specification, ppi_pairs_fpath=opts.ppi_pairs)
predict_interactions(metadata_tuple, models_specification, opts.output_ppi_prediction, opts.output_dpi_prediction)
metadata_tuple = load_metadata(opts.domain_metadata, opts.peptide_metadata, models_specification, ppi_pairs_fpath=opts.ppi_pairs, progressbar=opts.progressbar)

predict_interactions(metadata_tuple, models_specification, opts.output_ppi_prediction, opts.output_dpi_prediction, progressbar=opts.progressbar)
4 changes: 2 additions & 2 deletions predict/utils/dpi_prediction.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ def _slide_interactions(dseq, pseq, weights, peptide_length, amino_acid_ordering
Computes the likelihood of a sliding-window interaction.
"""

likelihood_dist = np.array([_compute_interaction(dseq, pseq_subseq, weights, amino_acid_ordering) for pseq_subseq in emit_peptide_sequences(pseq, peptide_lengtht)])
likelihood_dist = np.array([_compute_interaction(dseq, pseq_subseq, weights, amino_acid_ordering) for pseq_subseq in emit_peptide_sequences(pseq, peptide_length)])

no_bind = np.prod(1-likelihood_dist)

Expand All @@ -101,6 +101,6 @@ def _slide_interactions(dseq, pseq, weights, peptide_length, amino_acid_ordering

def domain_peptide_interaction_prediction(domain_sequence, peptide_sequence, weights, domain_length, peptide_length, is_fixed, amino_acid_ordering):
if is_fixed:
return _compute_interaction(domain_sequence, pseq, weights, amino_acid_ordering)
return _compute_interaction(domain_sequence, peptide_sequence, weights, amino_acid_ordering)
else:
return _slide_interactions(domain_sequence, peptide_sequence, weights, peptide_length, amino_acid_ordering)
8 changes: 8 additions & 0 deletions predict/utils/load_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,14 @@

from . import dpi_prediction

"""
File contains functions for loading in all available models.
Primary function is load_models_from_dir which returns a nested dict mapping
domain type -> peptide type -> prediction function, taken from dpi_prediction.
This is output in the ModelsSpecificationFormat
"""


def _load_model(model_fpath, domain_len, peptide_len, is_fixed, amino_acid_ordering):
# TODO: load models
arr_archive = np.load(model_fpath)
Expand Down
6 changes: 3 additions & 3 deletions predict/utils/ppi_prediction.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,10 @@ def is_valid(i, j, domain_metadata, peptide_metadata, models):
def _is_valid_oneway(domains, peptides):
dtypes = set(d[0] for d in domains)
valid_ptypes = set(p for d in dtypes for p in models.models[d].keys())

ptypes = set(p[0] for p in domains)
return len(ptypes.intersection(valid_ptypes)) > 0

ptypes = set(p[0] for p in peptides)
return len(ptypes.intersection(valid_ptypes)) > 0

return _is_valid_oneway(di,pj) or _is_valid_oneway(dj,pi)

@timeout(2)
Expand Down

0 comments on commit 7096586

Please sign in to comment.