Skip to content

Commit

Permalink
Merge pull request #29 from hammerlab/iipan_revised
Browse files Browse the repository at this point in the history
Add NetMHCIIpan to mhctools
  • Loading branch information
tavinathanson committed Aug 26, 2015
2 parents f4ea684 + 93e8ffc commit 25e1585
Show file tree
Hide file tree
Showing 10 changed files with 297 additions and 56 deletions.
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,10 @@ strongest_predicted_binder = epitope_collection[0]
```
## API

The following models are available in `mhctools`:
The following models are available in `mhctools`:
* `NetMHC`: requires locally installed version of [NetMHC](http://www.cbs.dtu.dk/services/NetMHC/)
* `NetMHCpan`: requires locally installed version of [NetMHCpan](http://www.cbs.dtu.dk/services/NetMHCpan/)
* `NetMHCIIpan`: requires locally installed version of [NetMHCIIpan](http://www.cbs.dtu.dk/services/NetMHCIIpan/)
* `NetMHCcons`: requires locally installed version of [NetMHCcons](http://www.cbs.dtu.dk/services/NetMHCcons/)
* `IedbMhcClass1`: Uses IEDB's REST API for class I binding predictions.
* `IedbMhcClass2`: Uses IEDB's REST API for class II binding predictions.
Expand Down
2 changes: 2 additions & 0 deletions mhctools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from .netmhc import NetMHC
from .netmhc_cons import NetMHCcons
from .netmhc_pan import NetMHCpan
from .netmhcii_pan import NetMHCIIpan
from .random_predictor import RandomBindingPredictor

__all__ = [
Expand All @@ -25,5 +26,6 @@
"NetMHC",
"NetMHCcons",
"NetMHCpan",
"NetMHCIIpan",
"RandomBindingPredictor",
]
140 changes: 100 additions & 40 deletions mhctools/alleles.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ class AlleleParseError(Exception):
dog="DLA",
sheep=["OVA", "Ovar", "Ovca"],
swine="SLA",
mouse=["H2"],
mouse=["H2", "H-2"],
rainbow_trout="Onmy",
rat=["Rano", "Rara", "RT1"],
salmon="Sasa",
Expand Down Expand Up @@ -113,13 +113,58 @@ def _parse_mouse_allele_name(name):
allele = name[0]
return gene_name.upper(), allele.lower()

def parse_allele_name(name):
def parse_classi_or_classii_allele_name(name):
"""
Handle different forms of both single and alpha-beta allele names.
Alpha-beta alleles may look like:
DPA10105-DPB110001
HLA-DPA1*01:05-DPB1*100:01
hla-dpa1*0105-dpb1*10001
dpa1*0105-dpb1*10001
HLA-DPA1*01:05/DPB1*100:01
"""
species, name = split_species_prefix(name)

# Handle the case where alpha/beta pairs are separated with a /.
name = name.replace("/", "-")

parts = name.split("-")
assert len(parts) <= 2, "Allele has too many parts: %s" % name
if len(parts) == 1:
return (parse_allele_name(name, species),)
else:
return (parse_allele_name(parts[0], species),
parse_allele_name(parts[1], species))

def split_species_prefix(name):
"""
Splits off the species component of the allele name from the rest of it.
Given "HLA-A*02:01", returns ("HLA", "A*02:01").
"""
species = None
for species_list in SPECIES_PREFIXES.values():
if isinstance(species_list, str):
species_list = [species_list]
for curr_species in species_list:
prefix = name[:len(curr_species) + 1].upper()
if prefix == (curr_species.upper() + "-"):
species = curr_species
name = name[len(curr_species) + 1:]
return (species, name)
return (species, name)

def parse_allele_name(name, species_prefix=None):
"""Takes an allele name and splits it into four parts:
1) species prefix
2) gene name
3) allele family
4) allele code
If species_prefix is provided, that is used instead of getting the species prefix from the name.
(And in that case, a species prefix in the name will result in an error being raised.)
For example, in all of the following inputs:
"HLA-A*02:01"
"A0201"
Expand All @@ -141,20 +186,19 @@ def parse_allele_name(name):
if len(name) == 0:
raise ValueError("Can't normalize empty MHC allele name")

if name.upper().startswith("H2") or name.upper().startswith("H-2"):
gene, allele_code = _parse_mouse_allele_name(name)
species_from_name, name = split_species_prefix(name)
if species_prefix:
if species_from_name:
raise ValueError("If a species is passed in, we better not have another "
"species in the name itself.")
species = species_prefix
else:
species = species_from_name

if species == "H-2" or species == "H2":
gene, allele_code = _parse_mouse_allele_name("H-2-" + name)
# mice don't have allele families
return AlleleName("H-2", gene, "", allele_code)
species = None
for species_list in SPECIES_PREFIXES.values():
if isinstance(species_list, str):
species_list = [species_list]
for curr_species in species_list:
prefix = name[:len(curr_species) + 1].upper()
if prefix == (curr_species.upper() + "-"):
species = curr_species
name = name[len(curr_species) + 1:]
break

if len(name) == 0:
raise AlleleParseError("Incomplete MHC allele name: %s" % (original,))
Expand All @@ -165,10 +209,10 @@ def parse_allele_name(name):
raise AlleleParseError("Can't parse allele name: %s" % original)
species = "HLA"

if name[0] == "D":
if name[0].upper() == "D":
# MHC class II genes like "DQA1" need to be parsed with both
# letters and numbers
gene, name = _parse_alphanum(name)
gene, name = _parse_alphanum(name, 4)
elif name[0].isalpha():
# if there are more separators to come, then assume the gene names
# can have the form "DQA1"
Expand Down Expand Up @@ -216,9 +260,12 @@ def parse_allele_name(name):

_normalized_allele_cache = {}
def normalize_allele_name(raw_allele):
"""MHC alleles are named with a frustatingly loose system. It's not uncommon
"""MHC alleles are named with a frustratingly loose system. It's not uncommon
to see dozens of different forms for the same allele.
Note: this function works with both class I and class II allele names (including
alpha/beta pairs).
For example, these all refer to the same MHC sequence:
- HLA-A*02:01
- HLA-A02:01
Expand Down Expand Up @@ -254,35 +301,48 @@ def normalize_allele_name(raw_allele):
"""
if raw_allele in _normalized_allele_cache:
return _normalized_allele_cache[raw_allele]
parsed_allele = parse_allele_name(raw_allele)
if len(parsed_allele.allele_family) > 0:
normalized = "%s-%s*%s:%s" % (
parsed_allele.species,
parsed_allele.gene,
parsed_allele.allele_family,
parsed_allele.allele_code)
else:
# mice don't have allele families
# e.g. H-2-Kd
# species = H-2
# gene = K
# allele = d
normalized = "%s-%s%s" % (
parsed_allele.species,
parsed_allele.gene,
parsed_allele.allele_code)
parsed_alleles = parse_classi_or_classii_allele_name(raw_allele)
species = parsed_alleles[0].species
normalized_list = [species]
for parsed_allele in parsed_alleles:
if len(parsed_allele.allele_family) > 0:
normalized_list.append("%s*%s:%s" % (
parsed_allele.gene,
parsed_allele.allele_family,
parsed_allele.allele_code))
else:
# mice don't have allele families
# e.g. H-2-Kd
# species = H-2
# gene = K
# allele = d
normalized_list.append("%s%s" % (
parsed_allele.gene,
parsed_allele.allele_code))
normalized = "-".join(normalized_list)
_normalized_allele_cache[raw_allele] = normalized
return normalized

def compact_allele_name(raw_allele):
"""
Turn HLA-A*02:01 into A0201 or H-2-D-b into H-2Db
Turn HLA-A*02:01 into A0201 or H-2-D-b into H-2Db or
HLA-DPA1*01:05-DPB1*100:01 into DPA10105-DPB110001
"""
parsed_allele = parse_allele_name(raw_allele)
return "%s%s%s" % (
parsed_allele.gene,
parsed_allele.allele_family,
parsed_allele.allele_code)
parsed_alleles = parse_classi_or_classii_allele_name(raw_allele)
species = parsed_alleles[0].species
normalized_list = []
for parsed_allele in parsed_alleles:
if len(parsed_allele.allele_family) > 0:
normalized_list.append("%s%s%s" % (
parsed_allele.gene,
parsed_allele.allele_family,
parsed_allele.allele_code))
else:
# mice don't have allele families
normalized_list.append("%s%s" % (
parsed_allele.gene,
parsed_allele.allele_code))
return "-".join(normalized_list)

def _parse_substring(allele, pred, max_len=None):
"""
Expand Down
6 changes: 4 additions & 2 deletions mhctools/base_commandline_predictor.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,8 +80,10 @@ def _determine_valid_alleles(command, supported_allele_flag):
line = line.strip()
if not line.startswith('#') and len(line) > 0:
try:
allele = normalize_allele_name(line)
valid_alleles.add(allele)
# We need to normalize these alleles (the output of the predictor
# when it lists its supported alleles) so that they are comparable with
# our own alleles.
valid_alleles.add(normalize_allele_name(line))
except AlleleParseError as error:
logging.info("Skipping allele %s: %s" % (
line, error))
Expand Down
4 changes: 2 additions & 2 deletions mhctools/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
except NameError:
string_classes = (str,)

def seq_to_str(obj):
def seq_to_str(obj, sep=","):
"""
Given a sequence convert it to a comma separated string.
If, however, the argument is a single object, return its string
Expand All @@ -30,7 +30,7 @@ def seq_to_str(obj):
if isinstance(obj, string_classes):
return obj
elif isinstance(obj, (list, tuple)):
return ",".join([str(x) for x in obj])
return sep.join([str(x) for x in obj])
else:
return str(obj)

Expand Down
17 changes: 12 additions & 5 deletions mhctools/file_formats.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,9 +167,12 @@ def parse_netmhcpan_stdout(
netmhc_output,
fasta_dictionary,
prediction_method_name="netmhcpan",
sequence_key_mapping=None):
sequence_key_mapping=None,
is_netmhcpanii=False):
"""
Parse the output format for NetMHCpan and NetMHCcons, which looks like:
Parse the output format for NetMHCpan, NetMHCIIpan* and NetMHCcons, which looks like:
* netMHCIIpan has two extra fields
# Affinity Threshold for Strong binding peptides 50.000',
# Affinity Threshold for Weak binding peptides 500.000',
Expand All @@ -196,11 +199,15 @@ def parse_netmhcpan_stdout(
fasta_dictionary=fasta_dictionary,
prediction_method_name=prediction_method_name)

# netMHCIIpan has some extra fields
n_required_fields = 9 if is_netmhcpanii else 7
for fields in split_stdout_lines(netmhc_output):
n_required_fields = 7
if len(fields) >= n_required_fields:
pos, allele, peptide, key, log_affinity, ic50, rank = \
fields[:n_required_fields]
if is_netmhcpanii:
pos, allele, peptide, key, Pos, Core, log_affinity, ic50, rank = (
fields[:n_required_fields])
else:
pos, allele, peptide, key, log_affinity, ic50, rank = fields[:n_required_fields]
try:
pos = int(pos)
allele = str(allele)
Expand Down
105 changes: 105 additions & 0 deletions mhctools/netmhcii_pan.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
# Copyright (c) 2014. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function, division, absolute_import
import tempfile
import logging

from .alleles import parse_classi_or_classii_allele_name
from .base_commandline_predictor import BaseCommandlinePredictor
from .cleanup_context import CleanupFiles
from .common import check_sequence_dictionary, seq_to_str
from .file_formats import create_input_fasta_files, parse_netmhcpan_stdout
from .process_helpers import AsyncProcess

class NetMHCIIpan(BaseCommandlinePredictor):

def __init__(
self,
alleles,
netmhc_command="netMHCIIpan",
epitope_lengths=[15, 16, 17, 18, 19, 20]):
BaseCommandlinePredictor.__init__(
self,
name="NetMHCIIpan",
command=netmhc_command,
alleles=alleles,
epitope_lengths=epitope_lengths,
supported_allele_flag="-list")

def normalize_allele(self, allele_name):
"""
netMHCIIpan has some unique requirements for allele formats,
expecting the following forms:
- DRB1_0101 (for non-alpha/beta pairs)
- HLA-DQA10501-DQB10636 (for alpha and beta pairs)
"""
parsed_alleles = parse_classi_or_classii_allele_name(allele_name)
if len(parsed_alleles) == 1:
return "%s_%s%s" % (parsed_alleles[0].gene,
parsed_alleles[0].allele_family,
parsed_alleles[0].allele_code)
else:
return "HLA-%s%s%s-%s%s%s" % (
parsed_alleles[0].gene,
parsed_alleles[0].allele_family,
parsed_alleles[0].allele_code,
parsed_alleles[1].gene,
parsed_alleles[1].allele_family,
parsed_alleles[1].allele_code)

def predict(self, fasta_dictionary):
fasta_dictionary = check_sequence_dictionary(fasta_dictionary)
input_filenames, sequence_key_mapping = create_input_fasta_files(
fasta_dictionary)
# TODO: We are not currently using the file chunking
# functionality here. See NetMHCcons.
input_filename = input_filenames[0]

alleles_str = \
",".join(self.normalize_allele(allele) for allele in self.alleles)
output_file = tempfile.NamedTemporaryFile(
"w",
prefix="netMHCIIpan_output",
delete=False)
args = [
self.command,
"-length", seq_to_str(self.epitope_lengths, sep=" "),
"-f", input_filename,
"-a", alleles_str
]
logging.info(" ".join(args))

with CleanupFiles(
filenames=[input_filename],
files=[output_file]):
process = AsyncProcess(
args=args,
redirect_stdout_file=output_file)
process.wait()
# need to flush written output and re-open for read
output_file.close()
with open(output_file.name, 'r') as f:
file_contents = f.read()
epitope_collection = parse_netmhcpan_stdout(
file_contents,
sequence_key_mapping=sequence_key_mapping,
fasta_dictionary=fasta_dictionary,
prediction_method_name="netmhciipan",
is_netmhcpanii=True)

if len(epitope_collection) == 0:
logging.warn(file_contents)
raise ValueError("No epitopes from netMHCIIpan")
return epitope_collection
Loading

0 comments on commit 25e1585

Please sign in to comment.