From 3a0e0233008da02576c960607a353205137744f2 Mon Sep 17 00:00:00 2001
From: jessicaw9910
Date: Fri, 2 Aug 2024 11:56:55 -0400
Subject: [PATCH] fixed ruff

---
Note: the added files were revised after this patch was generated, so the
blob ids on the "index" lines are those of the original commit and do not
match the revised contents.

 pyproject.toml                          |  7 +-
 src/nf_rnaseq/api_schema.py             | 54 ++++++++++++++
 src/nf_rnaseq/cli/get_hgnc_gene_name.py | 92 ++++++++++++++++++++++++
 src/nf_rnaseq/config.py                 | 32 +++++++++
 src/nf_rnaseq/hgnc.py                   | 81 +++++++++++++++++++++
 src/nf_rnaseq/requests_wrapper.py       | 68 ++++++++++++++++++
 src/nf_rnaseq/uniprot.py                | 53 ++++++++++++++
 7 files changed, 386 insertions(+), 1 deletion(-)
 create mode 100644 src/nf_rnaseq/api_schema.py
 create mode 100644 src/nf_rnaseq/cli/get_hgnc_gene_name.py
 create mode 100644 src/nf_rnaseq/config.py
 create mode 100644 src/nf_rnaseq/hgnc.py
 create mode 100644 src/nf_rnaseq/requests_wrapper.py
 create mode 100644 src/nf_rnaseq/uniprot.py

diff --git a/pyproject.toml b/pyproject.toml
index 66403d8..19395bb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -20,7 +20,9 @@ urls.Source = "https://github.com/tansey-lab/nf-rnaseq"
 urls.Home-page = "https://github.com/tansey-lab/nf-rnaseq"
 dependencies = [
   "nextflow>=24.04.2",
-  "anndata",
+  "requests-cache>=0.9.7,<1",
+  "requests>=2.28.1,<3",
+  # "anndata",
   # for debug logging (referenced from the issue template)
   "session-info",
 ]
@@ -50,6 +52,9 @@ test = [
   "coverage",
 ]
 
+[project.scripts]
+get_hgnc_gene_name = "nf_rnaseq.cli.get_hgnc_gene_name:main"
+
 [tool.coverage.run]
 source = ["nf_rnaseq"]
 omit = [
diff --git a/src/nf_rnaseq/api_schema.py b/src/nf_rnaseq/api_schema.py
new file mode 100644
index 0000000..d4bbf07
--- /dev/null
+++ b/src/nf_rnaseq/api_schema.py
@@ -0,0 +1,54 @@
+import ast
+import logging
+from abc import ABC, abstractmethod
+
+import requests
+
+from nf_rnaseq import requests_wrapper
+
+logger = logging.getLogger(__name__)
+
+
+class APIClient(ABC):
+    """Abstract class for API clients."""
+
+    def query_api(self):
+        """Query self.url_query and save the response as json in instance; otherwise saves as text."""
+        session = requests_wrapper.get_cached_session()
+        # not every client defines a header; when present it is the str form of a dict
+        header = getattr(self, "header", None)
+        if header is None:
+            response = session.get(self.url_query)
+        else:
+            # must be passed by keyword; a second positional argument is not treated as headers
+            response = session.get(self.url_query, headers=ast.literal_eval(header))
+
+        # always define both attributes so later checks never hit a missing attribute
+        self.json = None
+        self.text = None
+
+        try:
+            response.raise_for_status()
+        except requests.exceptions.HTTPError as e:
+            logger.error("HTTP error for %s", self.url_query, exc_info=e)
+
+        try:
+            self.json = response.json()
+        except requests.exceptions.JSONDecodeError as e:
+            logger.error("Response from %s is not valid JSON", self.url_query, exc_info=e)
+            self.text = response.text
+
+    @abstractmethod
+    def create_query_url(self):
+        """Create the URL to query the API (e.g., add search term or ID)."""
+        ...
+
+    @abstractmethod
+    def maybe_set_attr_from_json(self):
+        """Set attributes in the object from the json response."""
+        ...
+
+    @abstractmethod
+    def maybe_get_hgnc_gene_name(self):
+        """Get the HGNC gene name from the json response."""
+        ...
diff --git a/src/nf_rnaseq/cli/get_hgnc_gene_name.py b/src/nf_rnaseq/cli/get_hgnc_gene_name.py
new file mode 100644
index 0000000..c73ee16
--- /dev/null
+++ b/src/nf_rnaseq/cli/get_hgnc_gene_name.py
@@ -0,0 +1,92 @@
+#!/usr/bin/env python
+
+import argparse
+
+from nf_rnaseq import config, hgnc, uniprot
+
+
+def parsearg_utils():
+    """
+
+    Argparser to get HGNC gene name from string input.
+
+    Returns
+    -------
+    args: argparse.Namespace
+        Namespace object containing the parsed command line arguments
+
+    """
+    parser = argparse.ArgumentParser(description="Parser for get_hgnc_gene_name.py.")
+
+    parser.add_argument(
+        "-c",
+        "--cachePath",
+        help="Path to requests cache (type: str, default: '')",
+        type=str,
+        default="",
+    )
+
+    parser.add_argument(
+        "-i",
+        "--input",
+        help="Input string (type: str)",
+        type=str,
+        required=True,
+    )
+
+    parser.add_argument(
+        "-s",
+        "--searchTerm",
+        help="Search term for HGNC Fetch; if UniProt, not in use (type: str)",
+        type=str,
+        default="mane_select",
+    )
+
+    parser.add_argument(
+        "-t",
+        "--tsv",
+        help="If flag included tsv format out otherwise csv",
+        action="store_true",
+    )
+
+    parser.add_argument(
+        "-u",
+        "--uniProt",
+        help="If flag included UniProt should be queried otherwise HGNC database used",
+        action="store_true",
+    )
+
+    args = parser.parse_args()
+
+    return args
+
+
+def main():
+    """Get HGNC gene name from string input."""
+    args = parsearg_utils()
+
+    if args.cachePath != "":
+        config.set_request_cache(args.cachePath)
+
+    # both clients query their API and extract the gene name(s) on construction
+    if args.uniProt:
+        source = "UniProt"
+        client = uniprot.UniProt(uniprot_id=args.input)
+    else:
+        source = "HGNC"
+        client = hgnc.HGNC(search_id=args.input, search_term=args.searchTerm)
+    # remains None when no gene name could be extracted from the response
+    id_out = client.hgnc_gene_name
+
+    str1 = f"{args.input.ljust(20)}"
+    str2 = f"{str(id_out).ljust(20)}"
+    str3 = f"{source}"
+
+    if args.tsv:
+        print(f"{str1}\t{str2}\t{str3}")
+    else:
+        print(f"{str1},{str2},{str3}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/nf_rnaseq/config.py b/src/nf_rnaseq/config.py
new file mode 100644
index 0000000..5cb44e8
--- /dev/null
+++ b/src/nf_rnaseq/config.py
@@ -0,0 +1,32 @@
+import os
+
+REQUESTS_CACHE_VAR = "REQUESTS_CACHE"
+"""str: Environment variable for request cache file prefix."""
+
+
+def set_request_cache(val: str) -> None:
+    """Set the request cache path in environment variables.
+
+    Parameters
+    ----------
+    val : str
+        Request cache path
+
+    Returns
+    -------
+    None
+
+    """
+    os.environ[REQUESTS_CACHE_VAR] = val
+
+
+def maybe_get_request_cache() -> str | None:
+    """Get the request cache path from the environment.
+
+    Returns
+    -------
+    str | None
+        Request cache path as string if exists, otherwise None
+
+    """
+    return os.environ.get(REQUESTS_CACHE_VAR)
diff --git a/src/nf_rnaseq/hgnc.py b/src/nf_rnaseq/hgnc.py
new file mode 100644
index 0000000..4b218c2
--- /dev/null
+++ b/src/nf_rnaseq/hgnc.py
@@ -0,0 +1,81 @@
+import logging
+from dataclasses import dataclass
+
+from nf_rnaseq.api_schema import APIClient
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class HGNC(APIClient):
+    """Class to interact with HGNC API."""
+
+    search_id: str
+    """str: ID on which to search."""
+    search_term: str
+    """str: Term from https://www.genenames.org/help/rest/ on which to search."""
+    url_base: str = "https://rest.genenames.org/fetch"
+    """str: URL base for HGNC API."""
+    header: str = "{'Accept': 'application/json'}"
+    """str: Header for HGNC API (parse with ast.literal_eval for dict)."""
+    url_query: str = None
+    """str: URL query for HGNC API."""
+    json: dict = None
+    """dict: JSON response from HGNC API."""
+    text: str = None
+    """str: Text response from HGNC API (if no json)."""
+    hgnc_gene_name: list[str] = None
+    """list[str]: HGNC gene name(s)."""
+
+    def __post_init__(self):
+        self.create_query_url()
+        self.query_api()
+        self.maybe_set_attr_from_json()
+        self.maybe_get_hgnc_gene_name()
+
+    def create_query_url(self):
+        """Create URL for HGNC API query."""
+        # URLs always use "/" regardless of platform, so do not build them with os.path.join
+        self.url_query = f"{self.url_base}/{self.search_term}/{self.search_id}"
+
+    def maybe_set_attr_from_json(self):
+        """If self.json is a dict, set each of its top-level keys as an attribute of this object."""
+        if isinstance(self.json, dict):
+            for key, value in self.json.items():
+                setattr(self, key, value)
+
+    def maybe_set_json_properties(self):
+        """Alias of maybe_set_attr_from_json kept for backward compatibility."""
+        self.maybe_set_attr_from_json()
+
+    def maybe_get_hgnc_gene_name(self, str_symbol: str = "symbol") -> list[str] | None:
+        """Get list of gene names from the HGNC response and add as hgnc_gene_name attr."""
+        self.hgnc_gene_name = self.maybe_extract_list_from_hgnc_response_docs(str_symbol)
+        return self.hgnc_gene_name
+
+    def maybe_extract_list_from_hgnc_response_docs(
+        self,
+        str_to_extract: str,
+    ) -> list[str] | None:
+        """Extract a list of values from the response documents of an HGNC REST API request.
+
+        Parameters
+        ----------
+        str_to_extract : str
+            Key to extract from the response documents
+
+        Returns
+        -------
+        list[str] | None
+            List of values extracted from the response documents (empty if nothing was found);
+            None if the response is missing or does not have the expected structure
+
+        """
+        try:
+            # self.response is set from the json by maybe_set_attr_from_json
+            if self.response["numFound"] >= 1:
+                return [doc[str_to_extract] for doc in self.response["docs"]]
+            return []
+        except (KeyError, AttributeError) as e:
+            logger.error("Could not extract %s from HGNC response", str_to_extract, exc_info=e)
+            return None
diff --git a/src/nf_rnaseq/requests_wrapper.py b/src/nf_rnaseq/requests_wrapper.py
new file mode 100644
index 0000000..3bc69b2
--- /dev/null
+++ b/src/nf_rnaseq/requests_wrapper.py
@@ -0,0 +1,68 @@
+import os
+from functools import cache
+
+from requests.adapters import HTTPAdapter, Retry
+from requests_cache import CachedSession
+
+from nf_rnaseq import config
+
+REQUEST_CACHE_VAR = config.REQUESTS_CACHE_VAR
+"""str: Environment variable for request cache path; the same variable that config.set_request_cache sets."""
+
+
+def add_retry_to_session(
+    session,
+    retries=5,
+    backoff_factor=0.3,
+    status_forcelist=(429, 500, 501, 502, 503, 504),
+):
+    """Add retry logic to a session.
+
+    Parameters
+    ----------
+    session: requests.Session
+        Session object
+    retries: int
+        Number of retries
+    backoff_factor: float
+        Backoff factor
+    status_forcelist: tuple[int]
+        Tuple of status codes to force a retry
+
+    Returns
+    -------
+    requests.Session
+        Session object with retry logic
+
+    """
+    retry = Retry(
+        total=retries,
+        backoff_factor=backoff_factor,
+        status_forcelist=status_forcelist,
+        # a false value makes urllib3 retry on any HTTP method
+        allowed_methods=False,
+    )
+    adapter = HTTPAdapter(max_retries=retry)
+    session.mount("http://", adapter)
+    session.mount("https://", adapter)
+    return session
+
+
+@cache
+def get_cached_session():
+    """Get a cached session.
+
+    Returns
+    -------
+    requests.Session
+        Cached session object
+
+    """
+    if REQUEST_CACHE_VAR in os.environ:
+        cache_location = os.environ[REQUEST_CACHE_VAR]
+
+        session = CachedSession(cache_location, allowable_codes=(200, 404, 400), backend="sqlite")
+    else:
+        session = CachedSession(backend="memory")
+
+    return add_retry_to_session(session)
diff --git a/src/nf_rnaseq/uniprot.py b/src/nf_rnaseq/uniprot.py
new file mode 100644
index 0000000..a412ef5
--- /dev/null
+++ b/src/nf_rnaseq/uniprot.py
@@ -0,0 +1,53 @@
+import logging
+from dataclasses import dataclass
+
+from nf_rnaseq.api_schema import APIClient
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class UniProt(APIClient):
+    """Class to interact with UniProt API."""
+
+    uniprot_id: str
+    """str: UniProt ID."""
+    url_base: str = "https://rest.uniprot.org/uniprotkb"
+    """str: URL base for UniProtKB API."""
+    url_query: str = None
+    """str: URL query for UniProt API."""
+    json: dict = None
+    """dict: JSON response from UniProt API."""
+    text: str = None
+    """str: Text response from UniProt API (if no json)."""
+    hgnc_gene_name: list[str] = None
+    """list[str]: HGNC gene name(s)."""
+
+    def __post_init__(self):
+        self.create_query_url()
+        self.query_api()
+        self.maybe_set_attr_from_json()
+        self.maybe_get_hgnc_gene_name()
+
+    def create_query_url(self):
+        """Create URL for UniProt API query."""
+        # ".json" is a suffix of the ID, not a separate path segment
+        self.url_query = f"{self.url_base}/{self.uniprot_id}.json"
+
+    def maybe_set_attr_from_json(self):
+        """If self.json is a dict, set each of its top-level keys as an attribute of this object."""
+        if isinstance(self.json, dict):
+            for key, value in self.json.items():
+                setattr(self, key, value)
+
+    def maybe_set_json_properties(self):
+        """Alias of maybe_set_attr_from_json kept for backward compatibility."""
+        self.maybe_set_attr_from_json()
+
+    def maybe_get_hgnc_gene_name(self):
+        """Get list of gene names from the UniProt response and add as hgnc_gene_name attr."""
+        try:
+            # self.genes is set from the json by maybe_set_attr_from_json
+            self.hgnc_gene_name = [str(gene["geneName"]["value"]) for gene in self.genes]
+        except (KeyError, AttributeError) as e:
+            logger.error("Could not extract gene names for %s", self.uniprot_id, exc_info=e)