Skip to content

Commit

Permalink
added BioMart functionality and supporting getting multiple gene name…
Browse files Browse the repository at this point in the history
…s simultaneously
  • Loading branch information
jessicaw9910 committed Aug 5, 2024
1 parent 3a0e023 commit 54e728f
Show file tree
Hide file tree
Showing 7 changed files with 198 additions and 140 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ test = [
]

[project.scripts]
get_hgnc_gene_name = "nf_rnaseq.cli.get_hgnc_gene_name:main"
get_gene_name = "nf_rnaseq.cli.get_gene_name:main"

[tool.coverage.run]
source = ["nf_rnaseq"]
Expand Down
15 changes: 5 additions & 10 deletions src/nf_rnaseq/api_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def query_api(self):
if self.headers is None:
response = session.get(self.url_query)
else:
response = session.get(self.url_query, ast.literal_eval(self.header))
response = session.get(self.url_query, headers=ast.literal_eval(self.headers))

try:
response.raise_for_status()
Expand All @@ -25,8 +25,8 @@ def query_api(self):

try:
self.json = response.json()
except requests.exceptions.JSONDecodeError as e:
logging.error("Error at %s", "division", exc_info=e)
except requests.exceptions.JSONDecodeError:
# logging.error("Error at %s", "division", exc_info=e)
self.text = response.text

@abstractmethod
Expand All @@ -35,11 +35,6 @@ def create_query_url(self):
...

@abstractmethod
def maybe_set_attr_from_json(self):
"""Set attributes in the object from the json response."""
...

@abstractmethod
def maybe_get_hgnc_gene_name(self):
"""Get the HGNC gene name from the json response."""
def maybe_get_gene_names(self):
"""Get the gene name from the request response."""
...
53 changes: 53 additions & 0 deletions src/nf_rnaseq/biomart.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import logging
from dataclasses import dataclass
from io import StringIO

import pandas as pd

from nf_rnaseq.api_schema import APIClient

logger = logging.getLogger(__name__)


@dataclass
class BioMart(APIClient):
    """Class to interact with Ensembl BioMart API.

    On construction the query URL is built, the API is queried, and the
    response is parsed into ``gene_names``.
    """

    identifier: str
    """str: Ensembl transcript ID(s); either one or a list of comma separated values (<=500 total)."""
    search_term: str
    """str: Term on which to search."""
    url_base: str = 'http://www.ensembl.org/biomart/martservice?query=<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE Query><Query virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" ><Dataset name = "hsapiens_gene_ensembl" interface = "default" ><Filter name = "<SEARCH_TERM>" value = "<IDS>"/><Attribute name = "ensembl_transcript_id" /><Attribute name = "external_gene_name" /></Dataset></Query>'
    """str: URL base for Ensembl BioMart API."""
    url_query: str = None
    """str: URL query for BioMart API."""
    # Deliberately left un-annotated: this keeps it a plain class attribute
    # (not a dataclass field), so the constructor signature is unchanged.
    headers = None
    """str: headers for BioMart API (parsed with ast.literal_eval into a dict); None sends no headers."""
    json: dict = None
    """dict: JSON response from BioMart API."""
    text: str = None
    """str: Text response from BioMart API (if no json)."""
    gene_names: list[list[str]] = None
    """list[list[str]]: Gene name(s), one inner list per transcript ID found in the response."""

    def __post_init__(self):
        """Build the query URL, run the query, and parse the gene names."""
        self.create_query_url()
        self.query_api()
        self.maybe_get_gene_names()

    def create_query_url(self):
        """Create URL for BioMart API query."""
        # drop list brackets, split on ",", and trim so the joined IDs contain no spaces
        raw_ids = self.identifier.replace("[", "").replace("]", "").split(",")
        self.identifier = ",".join(raw_id.strip() for raw_id in raw_ids)
        self.url_query = self.url_base.replace("<IDS>", self.identifier).replace("<SEARCH_TERM>", self.search_term)

    def maybe_get_gene_names(self):
        """Parse the TSV text response and store the gene names as the gene_names attr.

        Rows are grouped by the first column (order of first appearance in the
        response), so each entry of ``gene_names`` is the list of names
        returned for one ID. If the response cannot be parsed, the error is
        logged and ``gene_names`` is left unchanged.
        """
        try:
            df = pd.read_csv(StringIO(self.text), sep="\t", header=None)
            df.columns = ["in", "out"]
            # in case multiple gene names for one transcript ID
            df_agg = df.groupby("in", sort=False).agg(list)
            self.gene_names = df_agg["out"].tolist()
        except (KeyError, AttributeError, ValueError) as e:
            # ValueError covers pandas' EmptyDataError/ParserError (empty or
            # malformed response) and a column-count mismatch on assignment.
            logger.error("Could not parse gene names from BioMart response", exc_info=e)
108 changes: 108 additions & 0 deletions src/nf_rnaseq/cli/get_gene_name.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
#!/usr/bin/env python

import argparse

from nf_rnaseq import biomart, config, hgnc, uniprot

# Registry of supported databases: each entry names the API client class to
# instantiate and the search term handed to it (None where no term is set).
DICT_DATABASES = {
    "BioMart": {"api_object": biomart.BioMart, "search_term": "ensembl_transcript_id_version"},
    "HGNC": {"api_object": hgnc.HGNC, "search_term": "mane_select"},
    "UniProt": {"api_object": uniprot.UniProt, "search_term": None},
}


def parsearg_utils(argv=None):
    """
    Argparser to get gene name(s) from string input.

    Parameters
    ----------
    argv: list of str, optional
        Arguments to parse; when None (default) argparse reads sys.argv[1:]

    Returns
    -------
    args: argparse.Namespace
        Namespace object with attributes cachePath, database, input, and tsv
    """
    parser = argparse.ArgumentParser(description="Parser for get_gene_name.py.")

    parser.add_argument(
        "-c",
        "--cachePath",
        help="Path to requests cache (type: str, default: '')",
        type=str,
        default="",
    )

    # required: without a database there is no API client to select
    parser.add_argument(
        "-d",
        "--database",
        help="Database to use including BioMart, HGNC, and UniProt (type: str, no default)",
        type=str,
        required=True,
    )

    # required: a missing input would otherwise surface later as an
    # AttributeError on None instead of a usage message
    parser.add_argument(
        "-i",
        "--input",
        help="Input string (type: str)",
        type=str,
        required=True,
    )

    parser.add_argument(
        "-t",
        "--tsv",
        help="If flag included tsv format out otherwise csv",
        action="store_true",
    )

    return parser.parse_args(argv)


def main():
    """Get gene name(s) from string input and print one delimited row per input ID.

    Raises
    ------
    UserWarning
        If the requested database is not a key of DICT_DATABASES.
    """
    args = parsearg_utils()
    inputs_ids = args.input.replace("[", "").replace("]", "")

    if args.cachePath != "":
        config.set_request_cache(args.cachePath)

    # Guard only the registry lookup so that a KeyError raised while querying
    # the API is not misreported as an unknown database.
    try:
        db_entry = DICT_DATABASES[args.database]
    except KeyError as e:
        raise UserWarning(f"Database {args.database} not in DICT_DATABASES.keys()") from e

    api_obj = db_entry["api_object"](
        identifier=inputs_ids,
        search_term=db_entry["search_term"],
    )
    id_out = api_obj.gene_names

    # set delimiter depending on tsv flag
    delim = "\t" if args.tsv else ","

    # if inputs are a list, split and iterate; split on "," and trim so that
    # "A,B" and "A, B" are handled alike
    list_inputs = [item.strip() for item in inputs_ids.split(",") if item.strip()]
    if len(list_inputs) > 1:
        rows = []
        for idx, input_id in enumerate(list_inputs):
            # NOTE(review): results are matched to inputs by position; confirm
            # the API returns rows in input order. A failed lookup (None) or a
            # short result list yields "None" rather than raising.
            found = id_out[idx] if id_out is not None and idx < len(id_out) else None
            str1 = f"{input_id.ljust(20)}"
            str2 = f"{str(found).ljust(20)}"
            str3 = f"{args.database}"
            rows.append(f"{str1}{delim}{str2}{delim}{str3}\n")
        str_out = "".join(rows)
    else:
        str1 = f"{args.input.ljust(20)}"
        str2 = f"{str(id_out).ljust(20)}"
        str3 = f"{args.database}"
        str_out = f"{str1}{delim}{str2}{delim}{str3}\n"

    print(str_out)
90 changes: 0 additions & 90 deletions src/nf_rnaseq/cli/get_hgnc_gene_name.py

This file was deleted.

40 changes: 17 additions & 23 deletions src/nf_rnaseq/hgnc.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,43 +11,37 @@
class HGNC(APIClient):
"""Class to interact with HGNC API."""

search_id: str
identifier: str
"""str: ID on which to search."""
search_term: str
"""str: Term from, https://www.genenames.org/help/rest/ on which to search."""
url_base: str = "https://rest.genenames.org/fetch"
"""str: URL base for HGNC API."""
header: str = "{'Accept': 'application/json'}"
"""str: Header for HGNC API (use ast.as_literal for dict)."""
# url_query: str = None
# """str: URL query for HGNC API."""
# json: dict = None
# """dict: JSON response from UniProt API."""
# text: str = None
# """str: Text response from UniProt API (if no json)."""
# hgnc_gene_name: list[str] = None
# """str: HGNC gene name."""
headers: str = "{'Accept': 'application/json'}"
"""str: headers for HGNC API (use ast.as_literal for dict)."""
url_query: str = None
"""str: URL query for HGNC API."""
json: dict = None
"""dict: JSON response from UniProt API."""
text: str = None
"""str: Text response from UniProt API (if no json)."""
gene_names: list[str] = None
"""str: HGNC gene name."""

def __post_init__(self):
self.create_query_url()
self.query_api()
self.maybe_set_json_properties()
self.maybe_get_hgnc_gene_name()
self.maybe_get_gene_names()

def create_query_url(self):
"""Create URL for HGNC API query."""
self.url_query = os.path.join(self.url_base, self.search_term, self.search_id)
self.url_query = os.path.join(self.url_base, self.search_term, self.identifier)

def maybe_set_json_properties(self):
"""If self.json is not None, set properties of UniProt object using self.json."""
if self.json is not None:
HGNC(**self.json)

def maybe_get_hgnc_gene_name(self, str_symbol: str = "symbol") -> list[str]:
def maybe_get_gene_names(self, str_symbol: str = "symbol") -> list[str]:
"""Get list of gene names from UniProt ID and add as hgnc_gene_name attr."""
try:
list_genes = self.maybe_extract_list_from_hgnc_response_docs(str_symbol)
self.hgnc_gene_name = list_genes
self.gene_names = list_genes
except (KeyError, AttributeError) as e:
logging.error("Error at %s", "division", exc_info=e)

Expand All @@ -69,8 +63,8 @@ def maybe_extract_list_from_hgnc_response_docs(
"""
try:
if self.response["numFound"] >= 1:
list_output = [doc[str_to_extract] for doc in self.response["docs"]]
if self.json["response"]["numFound"] >= 1:
list_output = [doc[str_to_extract] for doc in self.json["response"]["docs"]]
else:
list_output = []
return list_output
Expand Down
Loading

0 comments on commit 54e728f

Please sign in to comment.