added BioMart functionality and support for getting multiple gene names simultaneously
tansey-lab · Aug 5, 2024 · 54e728f · 54e728f
1 parent 3a0e023
commit 54e728f
Show file tree

Hide file tree

Showing 7 changed files with 198 additions and 140 deletions.
diff --git a/pyproject.toml b/pyproject.toml
@@ -53,7 +53,7 @@ test = [
 ]
 
 [project.scripts]
-get_hgnc_gene_name = "nf_rnaseq.cli.get_hgnc_gene_name:main"
+get_gene_name = "nf_rnaseq.cli.get_gene_name:main"
 
 [tool.coverage.run]
 source = ["nf_rnaseq"]

diff --git a/src/nf_rnaseq/api_schema.py b/src/nf_rnaseq/api_schema.py
@@ -16,7 +16,7 @@ def query_api(self):
         if self.headers is None:
             response = session.get(self.url_query)
         else:
-            response = session.get(self.url_query, ast.literal_eval(self.header))
+            response = session.get(self.url_query, headers=ast.literal_eval(self.headers))
 
         try:
             response.raise_for_status()
@@ -25,8 +25,8 @@ def query_api(self):
 
         try:
             self.json = response.json()
-        except requests.exceptions.JSONDecodeError as e:
-            logging.error("Error at %s", "division", exc_info=e)
+        except requests.exceptions.JSONDecodeError:
+            # logging.error("Error at %s", "division", exc_info=e)
             self.text = response.text
 
     @abstractmethod
@@ -35,11 +35,6 @@ def create_query_url(self):
         ...
 
     @abstractmethod
-    def maybe_set_attr_from_json(self):
-        """Set attributes in the object from the json response."""
-        ...
-
-    @abstractmethod
-    def maybe_get_hgnc_gene_name(self):
-        """Get the HGNC gene name from the json response."""
+    def maybe_get_gene_names(self):
+        """Get the gene name from the request response."""
         ...
diff --git a/src/nf_rnaseq/biomart.py b/src/nf_rnaseq/biomart.py
@@ -0,0 +1,53 @@
+import logging
+from dataclasses import dataclass
+from io import StringIO
+
+import pandas as pd
+
+from nf_rnaseq.api_schema import APIClient
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class BioMart(APIClient):
+    """Class to interact with Ensembl BioMart API."""
+
+    identifier: str
+    """str: Ensembl transcript ID(s); either one or a list of comma-separated values (<=500 total)."""
+    search_term: str
+    """str: Term on which to search."""
+    url_base: str = 'http://www.ensembl.org/biomart/martservice?query=<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE Query><Query  virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" ><Dataset name = "hsapiens_gene_ensembl" interface = "default" ><Filter name = "<SEARCH_TERM>" value = "<IDS>"/><Attribute name = "ensembl_transcript_id" /><Attribute name = "external_gene_name" /></Dataset></Query>'
+    """str: URL base for Ensembl BioMart API."""
+    url_query: str = None
+    """str: URL query for BioMart API."""
+    headers = None
+    """str: headers for BioMart API (use ast.literal_eval for dict)."""
+    json: dict = None
+    """dict: JSON response from BioMart API."""
+    text: str = None
+    """str: Text response from BioMart API (if no json)."""
+    gene_names: list[str] = None
+    """list: Gene name(s), grouped into one list per transcript ID returned."""
+
+    def __post_init__(self):
+        self.create_query_url()
+        self.query_api()
+        self.maybe_get_gene_names()
+
+    def create_query_url(self):
+        """Create URL for BioMart API query."""
+        # drop any brackets, split on ",", trim each ID, and re-join with "," to ensure no spaces
+        self.identifier = ",".join([id.strip() for id in self.identifier.replace("[", "").replace("]", "").split(",")])
+        self.url_query = self.url_base.replace("<IDS>", self.identifier).replace("<SEARCH_TERM>", self.search_term)
+
+    def maybe_get_gene_names(self):
+        """Parse the TSV response text into transcript ID / gene name pairs and store the grouped gene names as the gene_names attr."""
+        try:
+            df = pd.read_csv(StringIO(self.text), sep="\t", header=None)
+            df.columns = ["in", "out"]
+            # in case multiple gene names for one transcript ID
+            df_agg = df.groupby("in", sort=False).agg(list)
+            self.gene_names = df_agg["out"].tolist()
+        except (KeyError, AttributeError) as e:
+            logging.error("Error at %s", "division", exc_info=e)
diff --git a/src/nf_rnaseq/cli/get_gene_name.py b/src/nf_rnaseq/cli/get_gene_name.py
@@ -0,0 +1,108 @@
+#!/usr/bin/env python
+
+import argparse
+
+from nf_rnaseq import biomart, config, hgnc, uniprot
+
+DICT_DATABASES = {
+    "BioMart": {
+        "api_object": biomart.BioMart,
+        "search_term": "ensembl_transcript_id_version",
+    },
+    "HGNC": {
+        "api_object": hgnc.HGNC,
+        "search_term": "mane_select",
+    },
+    "UniProt": {
+        "api_object": uniprot.UniProt,
+        "search_term": None,
+    },
+}
+
+
+def parsearg_utils():
+    """
+
+    Argparser to get gene name(s) from string input.
+
+    Returns
+    -------
+    args: argparse.Namespace
+        Namespace object containing the parsed arguments (cachePath, database, input, tsv)
+
+    """
+    parser = argparse.ArgumentParser(description="Parser for get_hgnc_gene_name.py.")
+
+    parser.add_argument(
+        "-c",
+        "--cachePath",
+        help="Path to requests cache (type: str, default: '')",
+        type=str,
+        default="",
+    )
+
+    parser.add_argument(
+        "-d",
+        "--database",
+        help="Database to use including BioMart, HGNC, and UniProt (type: str, no default)",
+        type=str,
+    )
+
+    parser.add_argument(
+        "-i",
+        "--input",
+        help="Input string (type: str)",
+        type=str,
+    )
+
+    parser.add_argument(
+        "-t",
+        "--tsv",
+        help="If flag included tsv format out otherwise csv",
+        action="store_true",
+    )
+
+    args = parser.parse_args()
+
+    return args
+
+
+def main():
+    """Get gene name(s) from string input using the selected database."""
+    args = parsearg_utils()
+    inputs_ids = args.input.replace("[", "").replace("]", "")
+
+    if args.cachePath != "":
+        config.set_request_cache(args.cachePath)
+
+    try:
+        api_obj = DICT_DATABASES[args.database]["api_object"](
+            identifier=inputs_ids,
+            search_term=DICT_DATABASES[args.database]["search_term"],
+        )
+        id_out = api_obj.gene_names
+    except KeyError as e:
+        raise UserWarning(f"Database {args.database} not in DICT_DATABASES.keys()") from e
+
+    # set delimiter depending on tsv flag
+    if args.tsv:
+        delim = "\t"
+    else:
+        delim = ","
+
+    # if inputs are a list, split and iterate
+    list_inputs = inputs_ids.split(", ")
+    str_out = ""
+    if len(list_inputs) > 1:
+        for idx, input_id in enumerate(list_inputs):
+            str1 = f"{input_id.ljust(20)}"
+            str2 = f"{str(id_out[idx]).ljust(20)}"
+            str3 = f"{args.database}"
+            str_out += f"{str1}{delim}{str2}{delim}{str3}\n"
+    else:
+        str1 = f"{args.input.ljust(20)}"
+        str2 = f"{str(id_out).ljust(20)}"
+        str3 = f"{args.database}"
+        str_out = f"{str1}{delim}{str2}{delim}{str3}\n"
+
+    print(str_out)
diff --git a/src/nf_rnaseq/cli/get_hgnc_gene_name.py b/src/nf_rnaseq/cli/get_hgnc_gene_name.py
diff --git a/src/nf_rnaseq/hgnc.py b/src/nf_rnaseq/hgnc.py
@@ -11,43 +11,37 @@
 class HGNC(APIClient):
     """Class to interact with HGNC API."""
 
-    search_id: str
+    identifier: str
     """str: ID on which to search."""
     search_term: str
     """str: Term from, https://www.genenames.org/help/rest/ on which to search."""
     url_base: str = "https://rest.genenames.org/fetch"
     """str: URL base for HGNC API."""
-    header: str = "{'Accept': 'application/json'}"
-    """str: Header for HGNC API (use ast.as_literal for dict)."""
-    # url_query: str = None
-    # """str: URL query for HGNC API."""
-    # json: dict = None
-    # """dict: JSON response from UniProt API."""
-    # text: str = None
-    # """str: Text response from UniProt API (if no json)."""
-    # hgnc_gene_name: list[str] = None
-    # """str: HGNC gene name."""
+    headers: str = "{'Accept': 'application/json'}"
+    """str: headers for HGNC API (use ast.literal_eval for dict)."""
+    url_query: str = None
+    """str: URL query for HGNC API."""
+    json: dict = None
+    """dict: JSON response from HGNC API."""
+    text: str = None
+    """str: Text response from HGNC API (if no json)."""
+    gene_names: list[str] = None
+    """list[str]: HGNC gene name(s)."""
 
     def __post_init__(self):
         self.create_query_url()
         self.query_api()
-        self.maybe_set_json_properties()
-        self.maybe_get_hgnc_gene_name()
+        self.maybe_get_gene_names()
 
     def create_query_url(self):
         """Create URL for HGNC API query."""
-        self.url_query = os.path.join(self.url_base, self.search_term, self.search_id)
+        self.url_query = os.path.join(self.url_base, self.search_term, self.identifier)
 
-    def maybe_set_json_properties(self):
-        """If self.json is not None, set properties of UniProt object using self.json."""
-        if self.json is not None:
-            HGNC(**self.json)
-
-    def maybe_get_hgnc_gene_name(self, str_symbol: str = "symbol") -> list[str]:
+    def maybe_get_gene_names(self, str_symbol: str = "symbol") -> list[str]:
         """Get list of gene names from UniProt ID and add as hgnc_gene_name attr."""
         try:
             list_genes = self.maybe_extract_list_from_hgnc_response_docs(str_symbol)
-            self.hgnc_gene_name = list_genes
+            self.gene_names = list_genes
         except (KeyError, AttributeError) as e:
             logging.error("Error at %s", "division", exc_info=e)
 
@@ -69,8 +63,8 @@ def maybe_extract_list_from_hgnc_response_docs(
 
         """
         try:
-            if self.response["numFound"] >= 1:
-                list_output = [doc[str_to_extract] for doc in self.response["docs"]]
+            if self.json["response"]["numFound"] >= 1:
+                list_output = [doc[str_to_extract] for doc in self.json["response"]["docs"]]
             else:
                 list_output = []
             return list_output