Merge pull request #29 from hammerlab/iipan_revised

Add NetMHCIIpan to mhctools
openvax · Aug 26, 2015 · 25e1585 · 25e1585
2 parents f4ea684 + 93e8ffc
commit 25e1585
Show file tree

Hide file tree

Showing 10 changed files with 297 additions and 56 deletions.
diff --git a/README.md b/README.md
@@ -27,8 +27,10 @@ strongest_predicted_binder = epitope_collection[0]
 ```
 ## API
 
-The following models are available in `mhctools`: 
+The following models are available in `mhctools`:
+* `NetMHC`: requires locally installed version of [NetMHC](http://www.cbs.dtu.dk/services/NetMHC/)
 * `NetMHCpan`: requires locally installed version of [NetMHCpan](http://www.cbs.dtu.dk/services/NetMHCpan/)
+* `NetMHCIIpan`: requires locally installed version of [NetMHCIIpan](http://www.cbs.dtu.dk/services/NetMHCIIpan/)
 * `NetMHCcons`: requires locally installed version of [NetMHCcons](http://www.cbs.dtu.dk/services/NetMHCcons/)
 * `IedbMhcClass1`: Uses IEDB's REST API for class I binding predictions.
 * `IedbMhcClass2`: Uses IEDB's REST API for class II binding predictions.

diff --git a/mhctools/__init__.py b/mhctools/__init__.py
@@ -11,6 +11,7 @@
 from .netmhc import NetMHC
 from .netmhc_cons import NetMHCcons
 from .netmhc_pan import NetMHCpan
+from .netmhcii_pan import NetMHCIIpan
 from .random_predictor import RandomBindingPredictor
 
 __all__ = [
@@ -25,5 +26,6 @@
     "NetMHC",
     "NetMHCcons",
     "NetMHCpan",
+    "NetMHCIIpan",
     "RandomBindingPredictor",
 ]
diff --git a/mhctools/alleles.py b/mhctools/alleles.py
@@ -26,7 +26,7 @@ class AlleleParseError(Exception):
     dog="DLA",
     sheep=["OVA", "Ovar", "Ovca"],
     swine="SLA",
-    mouse=["H2"],
+    mouse=["H2", "H-2"],
     rainbow_trout="Onmy",
     rat=["Rano", "Rara", "RT1"],
     salmon="Sasa",
@@ -113,13 +113,58 @@ def _parse_mouse_allele_name(name):
     allele = name[0]
     return gene_name.upper(), allele.lower()
 
-def parse_allele_name(name):
+def parse_classi_or_classii_allele_name(name):
+    """
+    Handle different forms of both single and alpha-beta allele names.
+    Alpha-beta alleles may look like:
+
+    DPA10105-DPB110001
+    HLA-DPA1*01:05-DPB1*100:01
+    hla-dpa1*0105-dpb1*10001
+    dpa1*0105-dpb1*10001
+    HLA-DPA1*01:05/DPB1*100:01
+    """
+    species, name = split_species_prefix(name)
+
+    # Handle the case where alpha/beta pairs are separated with a /.
+    name = name.replace("/", "-")
+
+    parts = name.split("-")
+    assert len(parts) <= 2, "Allele has too many parts: %s" % name
+    if len(parts) == 1:
+        return (parse_allele_name(name, species),)
+    else:
+        return (parse_allele_name(parts[0], species),
+                parse_allele_name(parts[1], species))
+
+def split_species_prefix(name):
+    """
+    Splits off the species component of the allele name from the rest of it.
+
+    Given "HLA-A*02:01", returns ("HLA", "A*02:01").
+    """
+    species = None
+    for species_list in SPECIES_PREFIXES.values():
+        if isinstance(species_list, str):
+            species_list = [species_list]
+        for curr_species in species_list:
+            prefix = name[:len(curr_species) + 1].upper()
+            if prefix == (curr_species.upper() + "-"):
+                species = curr_species
+                name = name[len(curr_species) + 1:]
+                return (species, name)
+    return (species, name)
+
+def parse_allele_name(name, species_prefix=None):
     """Takes an allele name and splits it into four parts:
         1) species prefix
         2) gene name
         3) allele family
         4) allele code
 
+    If species_prefix is provided, that is used instead of getting the species prefix from the name. 
+    (And in that case, a species prefix in the name will result in an error being raised.)
+
     For example, in all of the following inputs:
         "HLA-A*02:01"
         "A0201"
@@ -141,20 +186,19 @@ def parse_allele_name(name):
     if len(name) == 0:
         raise ValueError("Can't normalize empty MHC allele name")
 
-    if name.upper().startswith("H2") or name.upper().startswith("H-2"):
-        gene, allele_code = _parse_mouse_allele_name(name)
+    species_from_name, name = split_species_prefix(name)
+    if species_prefix:
+        if species_from_name:
+            raise ValueError("If a species is passed in, we better not have another "
+                             "species in the name itself.")
+        species = species_prefix
+    else:
+        species = species_from_name
+
+    if species == "H-2" or species == "H2":
+        gene, allele_code = _parse_mouse_allele_name("H-2-" + name)
         # mice don't have allele families
         return AlleleName("H-2", gene, "", allele_code)
-    species = None
-    for species_list in SPECIES_PREFIXES.values():
-        if isinstance(species_list, str):
-            species_list = [species_list]
-        for curr_species in species_list:
-            prefix = name[:len(curr_species) + 1].upper()
-            if prefix == (curr_species.upper() + "-"):
-                species = curr_species
-                name = name[len(curr_species) + 1:]
-                break
 
     if len(name) == 0:
         raise AlleleParseError("Incomplete MHC allele name: %s" % (original,))
@@ -165,10 +209,10 @@ def parse_allele_name(name):
             raise AlleleParseError("Can't parse allele name: %s" % original)
         species = "HLA"
 
-    if name[0] == "D":
+    if name[0].upper() == "D":
         # MHC class II genes like "DQA1" need to be parsed with both
         # letters and numbers
-        gene, name = _parse_alphanum(name)
+        gene, name = _parse_alphanum(name, 4)
     elif name[0].isalpha():
         # if there are more separators to come, then assume the gene names
         # can have the form "DQA1"
@@ -216,9 +260,12 @@ def parse_allele_name(name):
 
 _normalized_allele_cache = {}
 def normalize_allele_name(raw_allele):
-    """MHC alleles are named with a frustatingly loose system. It's not uncommon
+    """MHC alleles are named with a frustratingly loose system. It's not uncommon
     to see dozens of different forms for the same allele.
 
+    Note: this function works with both class I and class II allele names (including
+    alpha/beta pairs).
+
     For example, these all refer to the same MHC sequence:
         - HLA-A*02:01
         - HLA-A02:01
@@ -254,35 +301,48 @@ def normalize_allele_name(raw_allele):
     """
     if raw_allele in _normalized_allele_cache:
         return _normalized_allele_cache[raw_allele]
-    parsed_allele = parse_allele_name(raw_allele)
-    if len(parsed_allele.allele_family) > 0:
-        normalized = "%s-%s*%s:%s" % (
-            parsed_allele.species,
-            parsed_allele.gene,
-            parsed_allele.allele_family,
-            parsed_allele.allele_code)
-    else:
-        # mice don't have allele families
-        # e.g. H-2-Kd
-        # species = H-2
-        # gene = K
-        # allele = d
-        normalized = "%s-%s%s" % (
-            parsed_allele.species,
-            parsed_allele.gene,
-            parsed_allele.allele_code)
+    parsed_alleles = parse_classi_or_classii_allele_name(raw_allele)
+    species = parsed_alleles[0].species
+    normalized_list = [species]
+    for parsed_allele in parsed_alleles:
+        if len(parsed_allele.allele_family) > 0:
+            normalized_list.append("%s*%s:%s" % (
+                parsed_allele.gene,
+                parsed_allele.allele_family,
+                parsed_allele.allele_code))
+        else:
+            # mice don't have allele families
+            # e.g. H-2-Kd
+            # species = H-2
+            # gene = K
+            # allele = d
+            normalized_list.append("%s%s" % (
+                parsed_allele.gene,
+                parsed_allele.allele_code))
+    normalized = "-".join(normalized_list)
     _normalized_allele_cache[raw_allele] = normalized
     return normalized
 
 def compact_allele_name(raw_allele):
     """
-    Turn HLA-A*02:01 into A0201 or H-2-D-b into H-2Db
+    Turn HLA-A*02:01 into A0201 or H-2-D-b into H-2Db or
+    HLA-DPA1*01:05-DPB1*100:01 into DPA10105-DPB110001
     """
-    parsed_allele = parse_allele_name(raw_allele)
-    return "%s%s%s" % (
-        parsed_allele.gene,
-        parsed_allele.allele_family,
-        parsed_allele.allele_code)
+    parsed_alleles = parse_classi_or_classii_allele_name(raw_allele)
+    species = parsed_alleles[0].species
+    normalized_list = []
+    for parsed_allele in parsed_alleles:
+        if len(parsed_allele.allele_family) > 0:
+            normalized_list.append("%s%s%s" % (
+                parsed_allele.gene,
+                parsed_allele.allele_family,
+                parsed_allele.allele_code))
+        else:
+            # mice don't have allele families
+            normalized_list.append("%s%s" % (
+                parsed_allele.gene,
+                parsed_allele.allele_code))
+    return "-".join(normalized_list)
 
 def _parse_substring(allele, pred, max_len=None):
     """

diff --git a/mhctools/base_commandline_predictor.py b/mhctools/base_commandline_predictor.py
@@ -80,8 +80,10 @@ def _determine_valid_alleles(command, supported_allele_flag):
                 line = line.strip()
                 if not line.startswith('#') and len(line) > 0:
                     try:
-                        allele = normalize_allele_name(line)
-                        valid_alleles.add(allele)
+                        # We need to normalize these alleles (the output of the predictor
+                        # when it lists its supported alleles) so that they are comparable with
+                        # our own alleles.
+                        valid_alleles.add(normalize_allele_name(line))
                     except AlleleParseError as error:
                         logging.info("Skipping allele %s: %s" % (
                             line, error))

diff --git a/mhctools/common.py b/mhctools/common.py
@@ -21,7 +21,7 @@
 except NameError:
     string_classes = (str,)
 
-def seq_to_str(obj):
+def seq_to_str(obj, sep=","):
     """
     Given a sequence convert it to a comma separated string.
     If, however, the argument is a single object, return its string
@@ -30,7 +30,7 @@ def seq_to_str(obj):
     if isinstance(obj, string_classes):
         return obj
     elif isinstance(obj, (list, tuple)):
-        return ",".join([str(x) for x in obj])
+        return sep.join([str(x) for x in obj])
     else:
         return str(obj)
 

diff --git a/mhctools/file_formats.py b/mhctools/file_formats.py
@@ -167,9 +167,12 @@ def parse_netmhcpan_stdout(
         netmhc_output,
         fasta_dictionary,
         prediction_method_name="netmhcpan",
-        sequence_key_mapping=None):
+        sequence_key_mapping=None,
+        is_netmhcpanii=False):
     """
-    Parse the output format for NetMHCpan and NetMHCcons, which looks like:
+    Parse the output format for NetMHCpan, NetMHCIIpan* and NetMHCcons, which looks like:
+
+     * netMHCIIpan has two extra fields
 
     # Affinity Threshold for Strong binding peptides  50.000',
     # Affinity Threshold for Weak binding peptides 500.000',
@@ -196,11 +199,15 @@ def parse_netmhcpan_stdout(
         fasta_dictionary=fasta_dictionary,
         prediction_method_name=prediction_method_name)
 
+    # netMHCIIpan has some extra fields
+    n_required_fields = 9 if is_netmhcpanii else 7
     for fields in split_stdout_lines(netmhc_output):
-        n_required_fields = 7
         if len(fields) >= n_required_fields:
-            pos, allele, peptide, key, log_affinity, ic50, rank = \
-                fields[:n_required_fields]
+            if is_netmhcpanii:
+                pos, allele, peptide, key, Pos, Core, log_affinity, ic50, rank = (
+                    fields[:n_required_fields])
+            else:
+                pos, allele, peptide, key, log_affinity, ic50, rank = fields[:n_required_fields]
             try:
                 pos = int(pos)
                 allele = str(allele)

diff --git a/mhctools/netmhcii_pan.py b/mhctools/netmhcii_pan.py
@@ -0,0 +1,105 @@
+# Copyright (c) 2014. Mount Sinai School of Medicine
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function, division, absolute_import
+import tempfile
+import logging
+
+from .alleles import parse_classi_or_classii_allele_name
+from .base_commandline_predictor import BaseCommandlinePredictor
+from .cleanup_context import CleanupFiles
+from .common import check_sequence_dictionary, seq_to_str
+from .file_formats import create_input_fasta_files, parse_netmhcpan_stdout
+from .process_helpers import AsyncProcess
+
+class NetMHCIIpan(BaseCommandlinePredictor):
+
+    def __init__(
+            self,
+            alleles,
+            netmhc_command="netMHCIIpan",
+            epitope_lengths=[15, 16, 17, 18, 19, 20]):
+        BaseCommandlinePredictor.__init__(
+            self,
+            name="NetMHCIIpan",
+            command=netmhc_command,
+            alleles=alleles,
+            epitope_lengths=epitope_lengths,
+            supported_allele_flag="-list")
+
+    def normalize_allele(self, allele_name):
+        """
+        netMHCIIpan has some unique requirements for allele formats,
+        expecting the following forms:
+         - DRB1_0101 (for non-alpha/beta pairs)
+         - HLA-DQA10501-DQB10636 (for alpha and beta pairs)
+        """
+        parsed_alleles = parse_classi_or_classii_allele_name(allele_name)
+        if len(parsed_alleles) == 1:
+            return "%s_%s%s" % (parsed_alleles[0].gene,
+                                parsed_alleles[0].allele_family,
+                                parsed_alleles[0].allele_code)
+        else:
+            return "HLA-%s%s%s-%s%s%s" % (
+                parsed_alleles[0].gene,
+                parsed_alleles[0].allele_family,
+                parsed_alleles[0].allele_code,
+                parsed_alleles[1].gene,
+                parsed_alleles[1].allele_family,
+                parsed_alleles[1].allele_code)
+
+    def predict(self, fasta_dictionary):
+        fasta_dictionary = check_sequence_dictionary(fasta_dictionary)
+        input_filenames, sequence_key_mapping = create_input_fasta_files(
+            fasta_dictionary)
+        # TODO: We are not currently using the file chunking
+        # functionality here. See NetMHCcons.
+        input_filename = input_filenames[0]
+
+        alleles_str = \
+            ",".join(self.normalize_allele(allele) for allele in self.alleles)
+        output_file = tempfile.NamedTemporaryFile(
+                "w",
+                prefix="netMHCIIpan_output",
+                delete=False)
+        args = [
+            self.command,
+            "-length", seq_to_str(self.epitope_lengths, sep=" "),
+            "-f", input_filename,
+            "-a", alleles_str
+        ]
+        logging.info(" ".join(args))
+
+        with CleanupFiles(
+                filenames=[input_filename],
+                files=[output_file]):
+            process = AsyncProcess(
+                args=args,
+                redirect_stdout_file=output_file)
+            process.wait()
+            # need to flush written output and re-open for read
+            output_file.close()
+            with open(output_file.name, 'r') as f:
+                file_contents = f.read()
+                epitope_collection = parse_netmhcpan_stdout(
+                    file_contents,
+                    sequence_key_mapping=sequence_key_mapping,
+                    fasta_dictionary=fasta_dictionary,
+                    prediction_method_name="netmhciipan",
+                    is_netmhcpanii=True)
+
+        if len(epitope_collection) == 0:
+            logging.warn(file_contents)
+            raise ValueError("No epitopes from netMHCIIpan")
+        return epitope_collection