Skip to content

Commit

Permalink
Merge pull request #2 from tansey-lab/jw_pip_package
Browse files Browse the repository at this point in the history
Initial commits to add CLI interface to query HGNC or UniProt for gene names
  • Loading branch information
jessicaw9910 authored Aug 2, 2024
2 parents d35ab67 + 3a0e023 commit 4f68555
Show file tree
Hide file tree
Showing 7 changed files with 366 additions and 1 deletion.
7 changes: 6 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@ urls.Source = "https://github.com/tansey-lab/nf-rnaseq"
urls.Home-page = "https://github.com/tansey-lab/nf-rnaseq"
dependencies = [
"nextflow>=24.04.2",
"anndata",
"requests-cache>=0.9.7,<1",
"requests>=2.28.1,<3",
# "anndata",
# for debug logging (referenced from the issue template)
"session-info",
]
Expand Down Expand Up @@ -50,6 +52,9 @@ test = [
"coverage",
]

[project.scripts]
get_hgnc_gene_name = "nf_rnaseq.cli.get_hgnc_gene_name:main"

[tool.coverage.run]
source = ["nf_rnaseq"]
omit = [
Expand Down
45 changes: 45 additions & 0 deletions src/nf_rnaseq/api_schema.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import ast
import logging
from abc import ABC, abstractmethod

import requests

from nf_rnaseq import requests_wrapper


class APIClient(ABC):
    """Abstract base class for REST API clients.

    Concrete clients must provide a ``url_query`` attribute before calling
    :meth:`query_api`. They may also provide a ``header`` attribute: either a
    dict, or a string containing a dict literal such as
    ``"{'Accept': 'application/json'}"``.
    """

    def query_api(self):
        """Query ``self.url_query`` and store the response on the instance.

        After the call ``self.json`` holds the decoded JSON body, or ``None``
        if the body could not be decoded, in which case ``self.text`` holds the
        raw response text. HTTP error statuses are logged but do not stop
        processing (best effort), so an error body is still stored.
        """
        session = requests_wrapper.get_cached_session()

        # The attribute is called ``header`` on the concrete clients; also
        # accept ``headers`` because that is the name this method used to test.
        header = getattr(self, "header", None)
        if header is None:
            header = getattr(self, "headers", None)
        if isinstance(header, str):
            # Headers are stored as a dict literal in a string.
            header = ast.literal_eval(header)

        if header:
            # Must be passed by keyword: the second positional parameter of
            # ``get`` is not the request headers.
            response = session.get(self.url_query, headers=header)
        else:
            response = session.get(self.url_query)

        # Always define both attributes so callers can test them safely.
        self.json = None
        self.text = None

        try:
            response.raise_for_status()
        except requests.exceptions.HTTPError as e:
            logging.error("HTTP error while querying %s", self.url_query, exc_info=e)

        try:
            self.json = response.json()
        except requests.exceptions.JSONDecodeError as e:
            logging.error("Response from %s is not valid JSON", self.url_query, exc_info=e)
            self.text = response.text

    @abstractmethod
    def create_query_url(self):
        """Create the URL to query the API (e.g., add search term or ID)."""
        ...

    @abstractmethod
    def maybe_set_attr_from_json(self):
        """Set attributes in the object from the json response."""
        ...

    @abstractmethod
    def maybe_get_hgnc_gene_name(self):
        """Get the HGNC gene name from the json response."""
        ...
90 changes: 90 additions & 0 deletions src/nf_rnaseq/cli/get_hgnc_gene_name.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
#!/usr/bin/env python

import argparse

from nf_rnaseq import config, hgnc, uniprot


def parsearg_utils(argv=None):
    """
    Argparser to get HGNC gene name from string input.

    Parameters
    ----------
    argv : list[str] | None
        Argument list to parse; when None, argparse reads ``sys.argv[1:]``

    Returns
    -------
    args: argparse.Namespace
        Namespace object with attributes ``cachePath``, ``input``,
        ``searchTerm``, ``tsv`` and ``uniProt``
    """
    parser = argparse.ArgumentParser(description="Parser for get_hgnc_gene_name.py.")

    parser.add_argument(
        "-c",
        "--cachePath",
        help="Path to requests cache (type: str, default: '')",
        type=str,
        default="",
    )

    # ``type`` must be a callable (``str``), not the string "str"; argparse
    # rejects a non-callable type when the argument is registered.
    parser.add_argument(
        "-i",
        "--input",
        help="Input string (type: str)",
        type=str,
        required=True,
    )

    parser.add_argument(
        "-s",
        "--searchTerm",
        help="Search term for HGNC Fetch; if UniProt, not in use (type: str)",
        type=str,
        default="mane_select",
    )

    parser.add_argument(
        "-t",
        "--tsv",
        help="If flag included tsv format out otherwise csv",
        action="store_true",
    )

    parser.add_argument(
        "-u",
        "--uniProt",
        help="If flag included UniProt should be queried otherwise HGNC database used",
        action="store_true",
    )

    return parser.parse_args(argv)


def main():
    """Get HGNC gene name from string input and print it as one delimited row.

    The row has three columns: the input ID (left-justified to 20 characters),
    the gene name(s) found (left-justified to 20 characters) and the queried
    source ("UniProt" or "HGNC"). Columns are tab-separated with ``--tsv``,
    comma-separated otherwise.
    """
    args = parsearg_utils()

    if args.cachePath != "":
        config.set_request_cache(args.cachePath)

    # Both clients build their URL, query the API and extract the gene name in
    # ``__post_init__``, so constructing the object is all that is needed.
    if args.uniProt:
        source = "UniProt"
        client = uniprot.UniProt(uniprot_id=args.input)
    else:
        source = "HGNC"
        client = hgnc.HGNC(search_id=args.input, search_term=args.searchTerm)

    # The attribute may be missing when the lookup failed; print "None" then.
    id_out = getattr(client, "hgnc_gene_name", None)

    str1 = f"{args.input.ljust(20)}"
    str2 = f"{str(id_out).ljust(20)}"
    str3 = f"{source}"

    delimiter = "\t" if args.tsv else ","
    print(delimiter.join((str1, str2, str3)))
35 changes: 35 additions & 0 deletions src/nf_rnaseq/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import os

REQUESTS_CACHE_VAR = "REQUESTS_CACHE"
"""str: Environment variable for request cache file prefix."""


def set_request_cache(val: str) -> None:
    """Store the request cache path in the process environment.

    Parameters
    ----------
    val : str
        Request cache path, written to the variable named by REQUESTS_CACHE_VAR

    Returns
    -------
    None
    """
    os.environ.update({REQUESTS_CACHE_VAR: val})


def maybe_get_request_cache() -> str | None:
    """Look up the request cache path in the process environment.

    Returns
    -------
    str | None
        Value of the variable named by REQUESTS_CACHE_VAR, or None if unset
    """
    return os.environ.get(REQUESTS_CACHE_VAR)
78 changes: 78 additions & 0 deletions src/nf_rnaseq/hgnc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import logging
import os
from dataclasses import dataclass

from nf_rnaseq.api_schema import APIClient

logger = logging.getLogger(__name__)


@dataclass
class HGNC(APIClient):
    """Class to interact with HGNC API.

    On construction the instance builds its query URL, queries the API, copies
    the top-level keys of the JSON response onto itself as attributes and
    stores the extracted gene symbols in ``hgnc_gene_name``.
    """

    search_id: str
    """str: ID on which to search."""
    search_term: str
    """str: Term from https://www.genenames.org/help/rest/ on which to search."""
    url_base: str = "https://rest.genenames.org/fetch"
    """str: URL base for HGNC API."""
    header: str = "{'Accept': 'application/json'}"
    """str: Header for HGNC API (use ast.literal_eval for dict)."""
    url_query: str | None = None
    """str | None: URL query for HGNC API; set by create_query_url."""
    json: dict | None = None
    """dict | None: JSON response from HGNC API."""
    text: str | None = None
    """str | None: Text response from HGNC API (if no json)."""
    hgnc_gene_name: list[str] | None = None
    """list[str] | None: HGNC gene names; None if they could not be extracted."""

    def __post_init__(self):
        self.create_query_url()
        self.query_api()
        self.maybe_set_attr_from_json()
        self.maybe_get_hgnc_gene_name()

    def create_query_url(self):
        """Create URL for HGNC API query."""
        # URLs always use "/"; os.path.join would use the OS path separator.
        self.url_query = "/".join((self.url_base.rstrip("/"), self.search_term, self.search_id))

    def maybe_set_attr_from_json(self):
        """If self.json is a dict, set each of its top-level keys as an attribute of this object."""
        payload = getattr(self, "json", None)
        if isinstance(payload, dict):
            for key, value in payload.items():
                setattr(self, key, value)

    def maybe_set_json_properties(self):
        """Alias of maybe_set_attr_from_json, kept for backward compatibility."""
        self.maybe_set_attr_from_json()

    def maybe_get_hgnc_gene_name(self, str_symbol: str = "symbol") -> list[str] | None:
        """Extract gene names from the HGNC response and store them as hgnc_gene_name attr.

        Parameters
        ----------
        str_symbol : str
            Key holding the gene name in each response document

        Returns
        -------
        list[str] | None
            Extracted gene names (also stored in ``self.hgnc_gene_name``); None on failure
        """
        list_genes = self.maybe_extract_list_from_hgnc_response_docs(str_symbol)
        self.hgnc_gene_name = list_genes
        return list_genes

    def maybe_extract_list_from_hgnc_response_docs(
        self,
        str_to_extract: str,
    ) -> list[str] | None:
        """Extract a list of values from the response documents of an HGNC REST API request.

        Parameters
        ----------
        str_to_extract : str
            Key to extract from the response documents

        Returns
        -------
        list[str] | None
            List of values extracted from the response documents (empty if no
            document was found); None if the response is missing or malformed
        """
        try:
            # ``response`` only exists once maybe_set_attr_from_json has copied
            # it over from the JSON payload.
            response = self.response
            if response["numFound"] >= 1:
                return [doc[str_to_extract] for doc in response["docs"]]
            return []
        except (KeyError, AttributeError, TypeError) as e:
            logging.error(
                "Could not extract %r from HGNC response for %s",
                str_to_extract,
                self.search_id,
                exc_info=e,
            )
            return None
64 changes: 64 additions & 0 deletions src/nf_rnaseq/requests_wrapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import os
from functools import cache

from requests.adapters import HTTPAdapter, Retry
from requests_cache import CachedSession

REQUEST_CACHE_VAR = "REQUEST_CACHE"


def add_retry_to_session(
    session,
    retries=5,
    backoff_factor=0.3,
    status_forcelist=(429, 500, 501, 502, 503, 504),
):
    """Mount a retrying HTTP adapter on a session.

    Parameters
    ----------
    session: requests.Session
        Session object to configure (modified in place)
    retries: int
        Total number of retries
    backoff_factor: float
        Backoff factor passed to the retry policy
    status_forcelist: tuple[int]
        Status codes that force a retry

    Returns
    -------
    requests.Session
        The same session object, with the adapter mounted for http and https
    """
    retrying_adapter = HTTPAdapter(
        max_retries=Retry(
            total=retries,
            backoff_factor=backoff_factor,
            status_forcelist=status_forcelist,
            allowed_methods=False,
        )
    )
    # One adapter instance serves both URL schemes.
    for scheme in ("http://", "https://"):
        session.mount(scheme, retrying_adapter)
    return session


@cache
def get_cached_session():
    """Get a cached session.

    The cache location is read from the environment variable named by
    REQUEST_CACHE_VAR, falling back to ``REQUESTS_CACHE``, which is the name
    written by ``config.set_request_cache``. An unset or empty value selects
    an in-memory cache. Because of ``@cache`` the environment is only read on
    the first call; later changes have no effect.

    Returns
    -------
    requests.Session
        Cached session object with retry logic added
    """
    # Accept both spellings so a path set through the config module is honoured.
    cache_location = next(
        (os.environ[var] for var in (REQUEST_CACHE_VAR, "REQUESTS_CACHE") if os.environ.get(var)),
        None,
    )

    if cache_location is not None:
        session = CachedSession(cache_location, allowable_codes=(200, 404, 400), backend="sqlite")
    else:
        session = CachedSession(backend="memory")

    return add_retry_to_session(session)
48 changes: 48 additions & 0 deletions src/nf_rnaseq/uniprot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import logging
import os
from dataclasses import dataclass

from nf_rnaseq import APIClient

logger = logging.getLogger(__name__)


@dataclass
class UniProt(APIClient):
    """Class to interact with UniProt API.

    On construction the instance builds its query URL, queries the API, copies
    the top-level keys of the JSON response onto itself as attributes and
    stores the extracted gene names in ``hgnc_gene_name``.
    """

    uniprot_id: str
    """str: UniProt ID."""
    url_base: str = "https://rest.uniprot.org/uniprotkb"
    """str: URL base for UniProtKB API."""
    url_query: str | None = None
    """str | None: URL query for UniProt API; set by create_query_url."""
    json: dict | None = None
    """dict | None: JSON response from UniProt API."""
    text: str | None = None
    """str | None: Text response from UniProt API (if no json)."""
    hgnc_gene_name: list[str] | None = None
    """list[str] | None: HGNC gene names; None if they could not be extracted."""

    def __post_init__(self):
        self.create_query_url()
        self.query_api()
        self.maybe_set_attr_from_json()
        self.maybe_get_hgnc_gene_name()

    def create_query_url(self):
        """Create URL for UniProt API query."""
        # ".json" is a suffix of the ID, not a separate path segment, and URLs
        # always use "/" regardless of the OS path separator.
        self.url_query = f"{self.url_base.rstrip('/')}/{self.uniprot_id}.json"

    def maybe_set_attr_from_json(self):
        """If self.json is a dict, set each of its top-level keys as an attribute of this object."""
        payload = getattr(self, "json", None)
        if isinstance(payload, dict):
            for key, value in payload.items():
                setattr(self, key, value)

    def maybe_set_json_properties(self):
        """Alias of maybe_set_attr_from_json, kept for backward compatibility."""
        self.maybe_set_attr_from_json()

    def maybe_get_hgnc_gene_name(self):
        """Get list of gene names from the UniProt response and add as hgnc_gene_name attr.

        Returns
        -------
        list[str] | None
            Extracted gene names (also stored in ``self.hgnc_gene_name``); None on failure
        """
        try:
            # ``genes`` only exists once maybe_set_attr_from_json has copied it
            # over from the JSON payload.
            list_genes = [str(gene["geneName"]["value"]) for gene in self.genes]
        except (KeyError, AttributeError, TypeError) as e:
            logging.error(
                "Could not extract gene names from UniProt response for %s",
                self.uniprot_id,
                exc_info=e,
            )
            return None
        self.hgnc_gene_name = list_genes
        return list_genes

0 comments on commit 4f68555

Please sign in to comment.