diff --git a/pyproject.toml b/pyproject.toml index 19395bb..b5b30f5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -53,7 +53,7 @@ test = [ ] [project.scripts] -get_hgnc_gene_name = "nf_rnaseq.cli.get_hgnc_gene_name:main" +get_gene_name = "nf_rnaseq.cli.get_gene_name:main" [tool.coverage.run] source = ["nf_rnaseq"] diff --git a/src/nf_rnaseq/api_schema.py b/src/nf_rnaseq/api_schema.py index d4bbf07..8fa78d3 100644 --- a/src/nf_rnaseq/api_schema.py +++ b/src/nf_rnaseq/api_schema.py @@ -16,7 +16,7 @@ def query_api(self): if self.headers is None: response = session.get(self.url_query) else: - response = session.get(self.url_query, ast.literal_eval(self.header)) + response = session.get(self.url_query, headers=ast.literal_eval(self.headers)) try: response.raise_for_status() @@ -25,8 +25,8 @@ def query_api(self): try: self.json = response.json() - except requests.exceptions.JSONDecodeError as e: - logging.error("Error at %s", "division", exc_info=e) + except requests.exceptions.JSONDecodeError: + # logging.error("Error at %s", "division", exc_info=e) self.text = response.text @abstractmethod @@ -35,11 +35,6 @@ def create_query_url(self): ... @abstractmethod - def maybe_set_attr_from_json(self): - """Set attributes in the object from the json response.""" - ... - - @abstractmethod - def maybe_get_hgnc_gene_name(self): - """Get the HGNC gene name from the json response.""" + def maybe_get_gene_names(self): + """Get the gene name from the request response.""" ... 
diff --git a/src/nf_rnaseq/biomart.py b/src/nf_rnaseq/biomart.py new file mode 100644 index 0000000..0a091fd --- /dev/null +++ b/src/nf_rnaseq/biomart.py @@ -0,0 +1,53 @@ +import logging +from dataclasses import dataclass +from io import StringIO + +import pandas as pd + +from nf_rnaseq.api_schema import APIClient + +logger = logging.getLogger(__name__) + + +@dataclass +class BioMart(APIClient): + """Class to interact with Ensembl BioMart API.""" + + identifier: str + """str: Ensembl transcript ID(s); either one or a list of comma separated values (<=500 total).""" + search_term: str + """str: Term on which to search.""" + url_base: str = 'http://www.ensembl.org/biomart/martservice?query=' + """str: URL base for Ensembl BioMart API.""" + url_query: str = None + """str: URL query for BioMart API.""" + headers = None + """str: headers for BioMart API (use ast.literal_eval for dict).""" + json: dict = None + """dict: JSON response from BioMart API.""" + text: str = None + """str: Text response from BioMart API (if no json).""" + gene_names: list[str] = None + """list[str]: Gene name(s).""" + + def __post_init__(self): + self.create_query_url() + self.query_api() + self.maybe_get_gene_names() + + def create_query_url(self): + """Create URL for BioMart API query.""" + # split on ",", trim, and join with "," to ensure no spaces + self.identifier = ",".join([id.strip() for id in self.identifier.replace("[", "").replace("]", "").split(",")]) + self.url_query = self.url_base.replace("", self.identifier).replace("", self.search_term) + + def maybe_get_gene_names(self): + """Parse the tab-separated response text into per-ID lists of gene names and add as gene_names attr.""" + try: + df = pd.read_csv(StringIO(self.text), sep="\t", header=None) + df.columns = ["in", "out"] + # in case multiple gene names for one transcript ID + df_agg = df.groupby("in", sort=False).agg(list) + self.gene_names = df_agg["out"].tolist() + except (KeyError, AttributeError) as e: + logging.error("Error at 
%s", "division", exc_info=e) diff --git a/src/nf_rnaseq/cli/get_gene_name.py b/src/nf_rnaseq/cli/get_gene_name.py new file mode 100755 index 0000000..71b1246 --- /dev/null +++ b/src/nf_rnaseq/cli/get_gene_name.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python + +import argparse + +from nf_rnaseq import biomart, config, hgnc, uniprot + +DICT_DATABASES = { + "BioMart": { + "api_object": biomart.BioMart, + "search_term": "ensembl_transcript_id_version", + }, + "HGNC": { + "api_object": hgnc.HGNC, + "search_term": "mane_select", + }, + "UniProt": { + "api_object": uniprot.UniProt, + "search_term": None, + }, +} + + +def parsearg_utils(): + """ + + Argparser to get gene name(s) from string input. + + Returns + ------- + args: argparse.Namespace + Namespace object containing the parsed arguments (cachePath, database, input, tsv) + + """ + parser = argparse.ArgumentParser(description="Parser for get_hgnc_gene_name.py.") + + parser.add_argument( + "-c", + "--cachePath", + help="Path to requests cache (type: str, default: '')", + type=str, + default="", + ) + + parser.add_argument( + "-d", + "--database", + help="Database to use including BioMart, HGNC, and UniProt (type: str, no default)", + type=str, + ) + + parser.add_argument( + "-i", + "--input", + help="Input string (type: str)", + type=str, + ) + + parser.add_argument( + "-t", + "--tsv", + help="If flag included tsv format out otherwise csv", + action="store_true", + ) + + args = parser.parse_args() + + return args + + +def main(): + """Get gene name(s) from string input using the selected database.""" + args = parsearg_utils() + inputs_ids = args.input.replace("[", "").replace("]", "") + + if args.cachePath != "": + config.set_request_cache(args.cachePath) + + try: + api_obj = DICT_DATABASES[args.database]["api_object"]( + identifier=inputs_ids, + search_term=DICT_DATABASES[args.database]["search_term"], + ) + id_out = api_obj.gene_names + except KeyError as e: + raise UserWarning(f"Database {args.database} not in DICT_DATABASES.keys()") from e + + # set delimiter depending on 
tsv flag + if args.tsv: + delim = "\t" + else: + delim = "," + + # if inputs are a list, split and iterate + list_inputs = inputs_ids.split(", ") + str_out = "" + if len(list_inputs) > 1: + for idx, input_id in enumerate(list_inputs): + str1 = f"{input_id.ljust(20)}" + str2 = f"{str(id_out[idx]).ljust(20)}" + str3 = f"{args.database}" + str_out += f"{str1}{delim}{str2}{delim}{str3}\n" + else: + str1 = f"{args.input.ljust(20)}" + str2 = f"{str(id_out).ljust(20)}" + str3 = f"{args.database}" + str_out = f"{str1}{delim}{str2}{delim}{str3}\n" + + print(str_out) diff --git a/src/nf_rnaseq/cli/get_hgnc_gene_name.py b/src/nf_rnaseq/cli/get_hgnc_gene_name.py deleted file mode 100644 index c73ee16..0000000 --- a/src/nf_rnaseq/cli/get_hgnc_gene_name.py +++ /dev/null @@ -1,90 +0,0 @@ -#!/bin/usr/env python - -import argparse - -from nf_rnaseq import config, hgnc, uniprot - - -def parsearg_utils(): - """ - - Argparser to get HGNC gene name from string input. - - Returns - ------- - args: argparse.Namespace - Namespace object containing featureCounts files - - """ - parser = argparse.ArgumentParser(description="Parser for get_hgnc_gene_name.py.") - - parser.add_argument( - "-c", - "--cachePath", - help="Path to requests cache (type: str, default: '')", - type=str, - default="", - ) - - parser.add_argument( - "-i", - "--input", - help="Input string (type: str)", - type="str", - ) - - parser.add_argument( - "-s", - "--searchTerm", - help="Search term for HGNC Fetch; if UniProt, not in use (type: str)", - type="str", - default="mane_select", - ) - - parser.add_argument( - "-t", - "--tsv", - help="If flag included tsv format out otherwise csv", - action="store_true", - ) - - parser.add_argument( - "-u", - "--uniProt", - help="If flag included UniProt should be queried otherwise HGNC database used", - action="store_true", - ) - - args = parser.parse_args() - - return args - - -def main(): - """Get HGNC gene name from string input.""" - args = parsearg_utils() - - if args.cachePath 
!= "": - config.set_request_cache(args.cachePath) - - if args.uniProt: - source = "UniProt" - uniprot_obj = uniprot.UniProt(uniprot_id=args.input) - uniprot_obj.query_api() - uniprot_obj.maybe_set_attr_from_json() - # id_out = uniprot_obj.hgnc_gene_name - else: - source = "HGNC" - hgnc_obj = hgnc.HGNC(search_id=args.input, search_term=args.searchTerm) - # hgnc_obj.query_api() - # hgnc_obj.maybe_set_attr_from_json() - id_out = hgnc_obj.hgnc_gene_name - - str1 = f"{args.input.ljust(20)}" - str2 = f"{str(id_out).ljust(20)}" - str3 = f"{source}" - - if args.tsv: - print(f"{str1}\t{str2}\t{str3}") - else: - print(f"{str1},{str2},{str3}") diff --git a/src/nf_rnaseq/hgnc.py b/src/nf_rnaseq/hgnc.py index 4b218c2..29c421b 100644 --- a/src/nf_rnaseq/hgnc.py +++ b/src/nf_rnaseq/hgnc.py @@ -11,43 +11,37 @@ class HGNC(APIClient): """Class to interact with HGNC API.""" - search_id: str + identifier: str """str: ID on which to search.""" search_term: str """str: Term from, https://www.genenames.org/help/rest/ on which to search.""" url_base: str = "https://rest.genenames.org/fetch" """str: URL base for HGNC API.""" - header: str = "{'Accept': 'application/json'}" - """str: Header for HGNC API (use ast.as_literal for dict).""" - # url_query: str = None - # """str: URL query for HGNC API.""" - # json: dict = None - # """dict: JSON response from UniProt API.""" - # text: str = None - # """str: Text response from UniProt API (if no json).""" - # hgnc_gene_name: list[str] = None - # """str: HGNC gene name.""" + headers: str = "{'Accept': 'application/json'}" + """str: headers for HGNC API (use ast.literal_eval for dict).""" + url_query: str = None + """str: URL query for HGNC API.""" + json: dict = None + """dict: JSON response from HGNC API.""" + text: str = None + """str: Text response from HGNC API (if no json).""" + gene_names: list[str] = None + """list[str]: HGNC gene name(s).""" def __post_init__(self): self.create_query_url() self.query_api() - self.maybe_set_json_properties() - 
self.maybe_get_hgnc_gene_name() + self.maybe_get_gene_names() def create_query_url(self): """Create URL for HGNC API query.""" - self.url_query = os.path.join(self.url_base, self.search_term, self.search_id) + self.url_query = os.path.join(self.url_base, self.search_term, self.identifier) - def maybe_set_json_properties(self): - """If self.json is not None, set properties of UniProt object using self.json.""" - if self.json is not None: - HGNC(**self.json) - - def maybe_get_hgnc_gene_name(self, str_symbol: str = "symbol") -> list[str]: + def maybe_get_gene_names(self, str_symbol: str = "symbol") -> list[str]: """Get list of gene names from UniProt ID and add as hgnc_gene_name attr.""" try: list_genes = self.maybe_extract_list_from_hgnc_response_docs(str_symbol) - self.hgnc_gene_name = list_genes + self.gene_names = list_genes except (KeyError, AttributeError) as e: logging.error("Error at %s", "division", exc_info=e) @@ -69,8 +63,8 @@ def maybe_extract_list_from_hgnc_response_docs( """ try: - if self.response["numFound"] >= 1: - list_output = [doc[str_to_extract] for doc in self.response["docs"]] + if self.json["response"]["numFound"] >= 1: + list_output = [doc[str_to_extract] for doc in self.json["response"]["docs"]] else: list_output = [] return list_output diff --git a/src/nf_rnaseq/uniprot.py b/src/nf_rnaseq/uniprot.py index a412ef5..5eb6db1 100644 --- a/src/nf_rnaseq/uniprot.py +++ b/src/nf_rnaseq/uniprot.py @@ -2,7 +2,7 @@ import os from dataclasses import dataclass -from nf_rnaseq import APIClient +from nf_rnaseq.api_schema import APIClient logger = logging.getLogger(__name__) @@ -11,38 +11,36 @@ class UniProt(APIClient): """Class to interact with UniProt API.""" - uniprot_id: str + identifier: str """str: UniProt ID.""" + search_term: str + """str: Term on which to search.""" url_base: str = "https://rest.uniprot.org/uniprotkb" """str: URL base for UniProtKB API.""" url_query: str = None """str: URL query for UniProt API.""" + headers = None + """str: 
headers for UniProt API (use ast.literal_eval for dict).""" + json: dict = None """dict: JSON response from UniProt API.""" text: str = None """str: Text response from UniProt API (if no json).""" - hgnc_gene_name: str = None - """str: HGNC gene name.""" + gene_names: list[str] = None + """list[str]: Gene name(s).""" def __post_init__(self): self.create_query_url() self.query_api() - self.maybe_set_json_properties() - self.maybe_get_hgnc_gene_name() + self.maybe_get_gene_names() def create_query_url(self): """Create URL for UniProt API query.""" - self.url_query = os.path.join(self.url_base, self.uniprot_id, ".json") + self.url_query = os.path.join(self.url_base, self.identifier + ".json") - def maybe_set_json_properties(self): - """If self.json is not None, set properties of UniProt object using self.json.""" - if self.json is not None: - UniProt(**self.json) - - def maybe_get_hgnc_gene_name(self): - """Get list of gene names from UniProt ID and add as hgnc_gene_name attr.""" + def maybe_get_gene_names(self): + """Get list of gene names from UniProt ID and add as gene_names attr.""" try: - list_genes = [str(gene["geneName"]["value"]) for gene in self.genes] - self.hgnc_gene_name = list_genes + list_genes = [str(gene["geneName"]["value"]) for gene in self.json["genes"]] + self.gene_names = list_genes except (KeyError, AttributeError) as e: logging.error("Error at %s", "division", exc_info=e)