diff --git a/pyproject.toml b/pyproject.toml
index 19395bb..b5b30f5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -53,7 +53,7 @@ test = [
]
[project.scripts]
-get_hgnc_gene_name = "nf_rnaseq.cli.get_hgnc_gene_name:main"
+get_gene_name = "nf_rnaseq.cli.get_gene_name:main"
[tool.coverage.run]
source = ["nf_rnaseq"]
diff --git a/src/nf_rnaseq/api_schema.py b/src/nf_rnaseq/api_schema.py
index d4bbf07..8fa78d3 100644
--- a/src/nf_rnaseq/api_schema.py
+++ b/src/nf_rnaseq/api_schema.py
@@ -16,7 +16,7 @@ def query_api(self):
if self.headers is None:
response = session.get(self.url_query)
else:
- response = session.get(self.url_query, ast.literal_eval(self.header))
+ response = session.get(self.url_query, headers=ast.literal_eval(self.headers))
try:
response.raise_for_status()
@@ -25,8 +25,8 @@ def query_api(self):
try:
self.json = response.json()
- except requests.exceptions.JSONDecodeError as e:
- logging.error("Error at %s", "division", exc_info=e)
+ except requests.exceptions.JSONDecodeError:
+ # not every API answers with JSON (the BioMart response is parsed from text), so keep the raw text
self.text = response.text
@abstractmethod
@@ -35,11 +35,6 @@ def create_query_url(self):
...
@abstractmethod
- def maybe_set_attr_from_json(self):
- """Set attributes in the object from the json response."""
- ...
-
- @abstractmethod
- def maybe_get_hgnc_gene_name(self):
- """Get the HGNC gene name from the json response."""
+ def maybe_get_gene_names(self):
+ """Get the gene name from the request response."""
...
diff --git a/src/nf_rnaseq/biomart.py b/src/nf_rnaseq/biomart.py
new file mode 100644
index 0000000..0a091fd
--- /dev/null
+++ b/src/nf_rnaseq/biomart.py
@@ -0,0 +1,98 @@
+import logging
+from dataclasses import dataclass
+from io import StringIO
+from urllib.parse import quote
+from xml.sax.saxutils import quoteattr
+
+import pandas as pd
+
+from nf_rnaseq.api_schema import APIClient
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class BioMart(APIClient):
+    """Class to interact with Ensembl BioMart API.
+
+    Instantiating the class builds the query URL, queries the API, and parses
+    the tab-separated response into ``gene_names``.
+    """
+
+    identifier: str
+    """str: Ensembl transcript ID(s); either one or a list of comma separated values (<=500 total)."""
+    search_term: str
+    """str: BioMart filter/attribute name on which to search."""
+    url_base: str = "http://www.ensembl.org/biomart/martservice?query="
+    """str: URL base for Ensembl BioMart API; the URL-encoded XML query is appended to it."""
+    url_query: str = None
+    """str: URL query for BioMart API."""
+    headers = None
+    """None: no request headers are sent (unannotated, so not an ``__init__`` argument)."""
+    json: dict = None
+    """dict: JSON response from BioMart API."""
+    text: str = None
+    """str: Text response from BioMart API (if no json)."""
+    gene_names: list[list[str]] = None
+    """list[list[str]]: Gene name(s) for each ID in ``identifier``, in the same order."""
+    dataset: str = "hsapiens_gene_ensembl"
+    """str: BioMart dataset to query (NOTE(review): human default is an assumption; confirm)."""
+    gene_attribute: str = "external_gene_name"
+    """str: BioMart attribute returned as gene name (NOTE(review): assumed; confirm)."""
+
+    def __post_init__(self):
+        """Build the query URL, run the request, and extract the gene names."""
+        self.create_query_url()
+        self.query_api()
+        self.maybe_get_gene_names()
+
+    def create_query_url(self):
+        """Create URL for BioMart API query.
+
+        ``identifier`` is first normalised to a comma-separated string without
+        brackets or spaces; it is then embedded, together with ``search_term``,
+        in a BioMart XML query that is URL-encoded and appended to ``url_base``.
+        """
+        raw_ids = self.identifier.replace("[", "").replace("]", "").split(",")
+        self.identifier = ",".join(raw_id.strip() for raw_id in raw_ids)
+        # str.replace("", x) inserts x between every character, so the query is
+        # built explicitly instead of substituting into url_base.
+        # NOTE(review): the XML layout follows the BioMart martservice format;
+        # confirm it matches the query this client was originally meant to send.
+        xml_query = (
+            '<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE Query>'
+            '<Query virtualSchemaName="default" formatter="TSV" header="0"'
+            ' uniqueRows="0" count="" datasetConfigVersion="0.6">'
+            f'<Dataset name={quoteattr(self.dataset)} interface="default">'
+            f"<Filter name={quoteattr(self.search_term)} value={quoteattr(self.identifier)}/>"
+            f"<Attribute name={quoteattr(self.search_term)}/>"
+            f"<Attribute name={quoteattr(self.gene_attribute)}/>"
+            "</Dataset></Query>"
+        )
+        self.url_query = self.url_base + quote(xml_query)
+
+    def maybe_get_gene_names(self):
+        """Parse the text response and store the gene names as ``gene_names``.
+
+        The response is read as two tab-separated columns (queried ID, gene
+        name). Names are grouped per ID and stored in the order of the IDs in
+        ``identifier``; an ID absent from the response gets an empty list. On a
+        missing or malformed response the error is logged and ``gene_names`` is
+        left unchanged.
+        """
+        if not self.text:
+            logger.error("No text response from BioMart for %s", self.identifier)
+            return
+        try:
+            df = pd.read_csv(StringIO(self.text), sep="\t", header=None, dtype=str)
+        except (pd.errors.EmptyDataError, pd.errors.ParserError) as e:
+            logger.error("Could not parse BioMart response for %s", self.identifier, exc_info=e)
+            return
+        if df.shape[1] != 2:
+            logger.error("Expected 2 columns in BioMart response, found %d", df.shape[1])
+            return
+        df.columns = ["in", "out"]
+        # in case multiple gene names for one transcript ID
+        names_by_id = df.groupby("in", sort=False)["out"].agg(list)
+        # the response order is not assumed to match the request, so align on the input IDs
+        self.gene_names = [names_by_id.get(one_id, []) for one_id in self.identifier.split(",")]
diff --git a/src/nf_rnaseq/cli/get_gene_name.py b/src/nf_rnaseq/cli/get_gene_name.py
new file mode 100755
index 0000000..71b1246
--- /dev/null
+++ b/src/nf_rnaseq/cli/get_gene_name.py
@@ -0,0 +1,117 @@
+#!/usr/bin/env python
+"""Command line interface to look up gene names for one or more identifiers."""
+
+import argparse
+
+from nf_rnaseq import biomart, config, hgnc, uniprot
+
+# maps each --database value to its API client class and the search term passed to it
+DICT_DATABASES = {
+    "BioMart": {
+        "api_object": biomart.BioMart,
+        "search_term": "ensembl_transcript_id_version",
+    },
+    "HGNC": {
+        "api_object": hgnc.HGNC,
+        "search_term": "mane_select",
+    },
+    "UniProt": {
+        "api_object": uniprot.UniProt,
+        "search_term": None,
+    },
+}
+
+
+def parsearg_utils(argv=None):
+    """
+
+    Argparser to get gene name(s) from string input.
+
+    Parameters
+    ----------
+    argv: list[str] | None
+        Arguments to parse; if None (default), argparse reads sys.argv[1:].
+
+    Returns
+    -------
+    args: argparse.Namespace
+        Namespace object containing cachePath, database, input, and tsv
+
+    """
+    parser = argparse.ArgumentParser(description="Parser for get_gene_name.py.")
+
+    parser.add_argument(
+        "-c",
+        "--cachePath",
+        help="Path to requests cache (type: str, default: '')",
+        type=str,
+        default="",
+    )
+
+    parser.add_argument(
+        "-d",
+        "--database",
+        help="Database to use including BioMart, HGNC, and UniProt (type: str, no default)",
+        type=str,
+        required=True,
+    )
+
+    parser.add_argument(
+        "-i",
+        "--input",
+        help="Input string (type: str)",
+        type=str,
+        required=True,
+    )
+
+    parser.add_argument(
+        "-t",
+        "--tsv",
+        help="If flag included tsv format out otherwise csv",
+        action="store_true",
+    )
+
+    return parser.parse_args(argv)
+
+
+def main():
+    """Print the input ID, its gene name(s), and the database, one line per input ID."""
+    args = parsearg_utils()
+    inputs_ids = args.input.replace("[", "").replace("]", "")
+
+    if args.cachePath != "":
+        config.set_request_cache(args.cachePath)
+
+    # resolve the database before querying so that only an unknown database name
+    # (not a KeyError raised while querying) is reported as such
+    try:
+        dict_database = DICT_DATABASES[args.database]
+    except KeyError as e:
+        raise UserWarning(f"Database {args.database} not in DICT_DATABASES.keys()") from e
+
+    api_obj = dict_database["api_object"](
+        identifier=inputs_ids,
+        search_term=dict_database["search_term"],
+    )
+    id_out = api_obj.gene_names
+
+    # set delimiter depending on tsv flag
+    delim = "\t" if args.tsv else ","
+
+    # split on "," and trim so that "a,b" and "a, b" are both treated as two inputs
+    list_inputs = [input_id.strip() for input_id in inputs_ids.split(",")]
+    if len(list_inputs) > 1:
+        list_lines = []
+        for idx, input_id in enumerate(list_inputs):
+            # a failed lookup leaves gene_names as None or shorter than the inputs
+            found = id_out[idx] if id_out is not None and idx < len(id_out) else None
+            list_lines.append(f"{input_id.ljust(20)}{delim}{str(found).ljust(20)}{delim}{args.database}\n")
+        str_out = "".join(list_lines)
+    else:
+        str_out = f"{args.input.ljust(20)}{delim}{str(id_out).ljust(20)}{delim}{args.database}\n"
+
+    print(str_out)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/nf_rnaseq/cli/get_hgnc_gene_name.py b/src/nf_rnaseq/cli/get_hgnc_gene_name.py
deleted file mode 100644
index c73ee16..0000000
--- a/src/nf_rnaseq/cli/get_hgnc_gene_name.py
+++ /dev/null
@@ -1,90 +0,0 @@
-#!/bin/usr/env python
-
-import argparse
-
-from nf_rnaseq import config, hgnc, uniprot
-
-
-def parsearg_utils():
- """
-
- Argparser to get HGNC gene name from string input.
-
- Returns
- -------
- args: argparse.Namespace
- Namespace object containing featureCounts files
-
- """
- parser = argparse.ArgumentParser(description="Parser for get_hgnc_gene_name.py.")
-
- parser.add_argument(
- "-c",
- "--cachePath",
- help="Path to requests cache (type: str, default: '')",
- type=str,
- default="",
- )
-
- parser.add_argument(
- "-i",
- "--input",
- help="Input string (type: str)",
- type="str",
- )
-
- parser.add_argument(
- "-s",
- "--searchTerm",
- help="Search term for HGNC Fetch; if UniProt, not in use (type: str)",
- type="str",
- default="mane_select",
- )
-
- parser.add_argument(
- "-t",
- "--tsv",
- help="If flag included tsv format out otherwise csv",
- action="store_true",
- )
-
- parser.add_argument(
- "-u",
- "--uniProt",
- help="If flag included UniProt should be queried otherwise HGNC database used",
- action="store_true",
- )
-
- args = parser.parse_args()
-
- return args
-
-
-def main():
- """Get HGNC gene name from string input."""
- args = parsearg_utils()
-
- if args.cachePath != "":
- config.set_request_cache(args.cachePath)
-
- if args.uniProt:
- source = "UniProt"
- uniprot_obj = uniprot.UniProt(uniprot_id=args.input)
- uniprot_obj.query_api()
- uniprot_obj.maybe_set_attr_from_json()
- # id_out = uniprot_obj.hgnc_gene_name
- else:
- source = "HGNC"
- hgnc_obj = hgnc.HGNC(search_id=args.input, search_term=args.searchTerm)
- # hgnc_obj.query_api()
- # hgnc_obj.maybe_set_attr_from_json()
- id_out = hgnc_obj.hgnc_gene_name
-
- str1 = f"{args.input.ljust(20)}"
- str2 = f"{str(id_out).ljust(20)}"
- str3 = f"{source}"
-
- if args.tsv:
- print(f"{str1}\t{str2}\t{str3}")
- else:
- print(f"{str1},{str2},{str3}")
diff --git a/src/nf_rnaseq/hgnc.py b/src/nf_rnaseq/hgnc.py
index 4b218c2..29c421b 100644
--- a/src/nf_rnaseq/hgnc.py
+++ b/src/nf_rnaseq/hgnc.py
@@ -11,43 +11,37 @@
class HGNC(APIClient):
"""Class to interact with HGNC API."""
- search_id: str
+ identifier: str
"""str: ID on which to search."""
search_term: str
"""str: Term from, https://www.genenames.org/help/rest/ on which to search."""
url_base: str = "https://rest.genenames.org/fetch"
"""str: URL base for HGNC API."""
- header: str = "{'Accept': 'application/json'}"
- """str: Header for HGNC API (use ast.as_literal for dict)."""
- # url_query: str = None
- # """str: URL query for HGNC API."""
- # json: dict = None
- # """dict: JSON response from UniProt API."""
- # text: str = None
- # """str: Text response from UniProt API (if no json)."""
- # hgnc_gene_name: list[str] = None
- # """str: HGNC gene name."""
+ headers: str = "{'Accept': 'application/json'}"
+ """str: headers for HGNC API (use ast.literal_eval for dict)."""
+ url_query: str = None
+ """str: URL query for HGNC API."""
+ json: dict = None
+ """dict: JSON response from HGNC API."""
+ text: str = None
+ """str: Text response from HGNC API (if no json)."""
+ gene_names: list[str] = None
+ """list[str]: HGNC gene name(s)."""
def __post_init__(self):
self.create_query_url()
self.query_api()
- self.maybe_set_json_properties()
- self.maybe_get_hgnc_gene_name()
+ self.maybe_get_gene_names()
def create_query_url(self):
"""Create URL for HGNC API query."""
- self.url_query = os.path.join(self.url_base, self.search_term, self.search_id)
+ self.url_query = "/".join([self.url_base.rstrip("/"), self.search_term, self.identifier])
- def maybe_set_json_properties(self):
- """If self.json is not None, set properties of UniProt object using self.json."""
- if self.json is not None:
- HGNC(**self.json)
-
- def maybe_get_hgnc_gene_name(self, str_symbol: str = "symbol") -> list[str]:
+ def maybe_get_gene_names(self, str_symbol: str = "symbol") -> list[str]:
"""Get list of gene names from UniProt ID and add as hgnc_gene_name attr."""
try:
list_genes = self.maybe_extract_list_from_hgnc_response_docs(str_symbol)
- self.hgnc_gene_name = list_genes
+ self.gene_names = list_genes
except (KeyError, AttributeError) as e:
logging.error("Error at %s", "division", exc_info=e)
@@ -69,8 +63,8 @@ def maybe_extract_list_from_hgnc_response_docs(
"""
try:
- if self.response["numFound"] >= 1:
- list_output = [doc[str_to_extract] for doc in self.response["docs"]]
+ if self.json["response"]["numFound"] >= 1:
+ list_output = [doc[str_to_extract] for doc in self.json["response"]["docs"]]
else:
list_output = []
return list_output
diff --git a/src/nf_rnaseq/uniprot.py b/src/nf_rnaseq/uniprot.py
index a412ef5..5eb6db1 100644
--- a/src/nf_rnaseq/uniprot.py
+++ b/src/nf_rnaseq/uniprot.py
@@ -2,7 +2,7 @@
import os
from dataclasses import dataclass
-from nf_rnaseq import APIClient
+from nf_rnaseq.api_schema import APIClient
logger = logging.getLogger(__name__)
@@ -11,38 +11,36 @@
class UniProt(APIClient):
"""Class to interact with UniProt API."""
- uniprot_id: str
+ identifier: str
"""str: UniProt ID."""
+ search_term: str
+ """str: Term on which to search."""
url_base: str = "https://rest.uniprot.org/uniprotkb"
"""str: URL base for UniProtKB API."""
url_query: str = None
"""str: URL query for UniProt API."""
+ headers = None
+ """str: headers for UniProt API (use ast.literal_eval for dict)."""
json: dict = None
"""dict: JSON response from UniProt API."""
text: str = None
"""str: Text response from UniProt API (if no json)."""
- hgnc_gene_name: str = None
- """str: HGNC gene name."""
+ gene_names: list[str] = None
+ """list[str]: Gene name(s)."""
def __post_init__(self):
self.create_query_url()
self.query_api()
- self.maybe_set_json_properties()
- self.maybe_get_hgnc_gene_name()
+ self.maybe_get_gene_names()
def create_query_url(self):
"""Create URL for UniProt API query."""
- self.url_query = os.path.join(self.url_base, self.uniprot_id, ".json")
+ self.url_query = f"{self.url_base.rstrip('/')}/{self.identifier}.json"
- def maybe_set_json_properties(self):
- """If self.json is not None, set properties of UniProt object using self.json."""
- if self.json is not None:
- UniProt(**self.json)
-
- def maybe_get_hgnc_gene_name(self):
- """Get list of gene names from UniProt ID and add as hgnc_gene_name attr."""
+ def maybe_get_gene_names(self):
+ """Get list of gene names from UniProt ID and add as gene_names attr."""
try:
- list_genes = [str(gene["geneName"]["value"]) for gene in self.genes]
- self.hgnc_gene_name = list_genes
+ list_genes = [str(gene["geneName"]["value"]) for gene in self.json["genes"]]
+ self.gene_names = list_genes
except (KeyError, AttributeError) as e:
logging.error("Error at %s", "division", exc_info=e)