Commit 54e728f
added BioMart functionality and supporting getting multiple gene names simultaneously
1 parent: 3a0e023
Showing 7 changed files with 198 additions and 140 deletions.
@@ -0,0 +1,53 @@
import logging
from dataclasses import dataclass
from io import StringIO

import pandas as pd

from nf_rnaseq.api_schema import APIClient

logger = logging.getLogger(__name__)


@dataclass
class BioMart(APIClient):
    """Class to interact with Ensembl BioMart API."""

    identifier: str
    """str: Ensembl transcript ID(s); either one or a list of comma separated values (<=500 total)."""
    search_term: str
    """str: Term on which to search."""
    url_base: str = 'http://www.ensembl.org/biomart/martservice?query=<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE Query><Query virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" ><Dataset name = "hsapiens_gene_ensembl" interface = "default" ><Filter name = "<SEARCH_TERM>" value = "<IDS>"/><Attribute name = "ensembl_transcript_id" /><Attribute name = "external_gene_name" /></Dataset></Query>'
    """str: URL base for Ensembl BioMart API."""
    url_query: str = None
    """str: URL query for BioMart API."""
    headers = None
    """str: headers for BioMart API (use ast.as_literal for dict)."""
    json: dict = None
    """dict: JSON response from BioMart API."""
    text: str = None
    """str: Text response from BioMart API (if no json)."""
    gene_names: list[str] = None
    """str: Gene name(s)."""

    def __post_init__(self):
        self.create_query_url()
        self.query_api()
        self.maybe_get_gene_names()

    def create_query_url(self):
        """Create URL for BioMart API query."""
        # split on ", ", trim, and join with "," to ensure no spaces
        self.identifier = ",".join([id.strip() for id in self.identifier.replace("[", "").replace("]", "").split(",")])
        self.url_query = self.url_base.replace("<IDS>", self.identifier).replace("<SEARCH_TERM>", self.search_term)

    def maybe_get_gene_names(self):
        """Get dataframe of transcript IDs and gene names from transcript IDs and add as hgnc_gene_name attr."""
        try:
            df = pd.read_csv(StringIO(self.text), sep="\t", header=None)
            df.columns = ["in", "out"]
            # in case multiple gene names for one transcript ID
            df_agg = df.groupby("in", sort=False).agg(list)
            self.gene_names = df_agg["out"].tolist()
        except (KeyError, AttributeError) as e:
            logging.error("Error at %s", "division", exc_info=e)
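For context, a minimal usage sketch of the class above. It assumes the module is importable as nf_rnaseq.biomart (as suggested by the script below) and that query_api() is provided by the inherited APIClient; the transcript IDs are hypothetical placeholders, not values from this commit.

    from nf_rnaseq.biomart import BioMart

    # Hypothetical versioned Ensembl transcript IDs; replace with real ones.
    ids = "ENST00000000001.1, ENST00000000002.1"

    # search_term matches the BioMart filter name used for BioMart in DICT_DATABASES below.
    bm = BioMart(identifier=ids, search_term="ensembl_transcript_id_version")

    # gene_names holds one list of gene name(s) per input transcript ID.
    print(bm.gene_names)

Because maybe_get_gene_names() aggregates with groupby(...).agg(list), each entry of gene_names is itself a list, so a transcript mapping to several gene names is preserved rather than flattened.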
@@ -0,0 +1,108 @@
#!/usr/bin/env python

import argparse

from nf_rnaseq import biomart, config, hgnc, uniprot

DICT_DATABASES = {
    "BioMart": {
        "api_object": biomart.BioMart,
        "search_term": "ensembl_transcript_id_version",
    },
    "HGNC": {
        "api_object": hgnc.HGNC,
        "search_term": "mane_select",
    },
    "UniProt": {
        "api_object": uniprot.UniProt,
        "search_term": None,
    },
}


def parsearg_utils():
    """
    Argparser to get HGNC gene name from string input.

    Returns
    -------
    args: argparse.Namespace
        Namespace object containing featureCounts files
    """
    parser = argparse.ArgumentParser(description="Parser for get_hgnc_gene_name.py.")

    parser.add_argument(
        "-c",
        "--cachePath",
        help="Path to requests cache (type: str, default: '')",
        type=str,
        default="",
    )

    parser.add_argument(
        "-d",
        "--database",
        help="Database to use including BioMart, HGNC, and UniProt (type: str, no default)",
        type=str,
    )

    parser.add_argument(
        "-i",
        "--input",
        help="Input string (type: str)",
        type=str,
    )

    parser.add_argument(
        "-t",
        "--tsv",
        help="If flag included tsv format out otherwise csv",
        action="store_true",
    )

    args = parser.parse_args()

    return args


def main():
    """Get HGNC gene name from string input."""
    args = parsearg_utils()
    inputs_ids = args.input.replace("[", "").replace("]", "")

    if args.cachePath != "":
        config.set_request_cache(args.cachePath)

    try:
        api_obj = DICT_DATABASES[args.database]["api_object"](
            identifier=inputs_ids,
            search_term=DICT_DATABASES[args.database]["search_term"],
        )
        id_out = api_obj.gene_names
    except KeyError as e:
        raise UserWarning(f"Database {args.database} not in DICT_DATABASES.keys()") from e

    # set delimiter depending on tsv flag
    if args.tsv:
        delim = "\t"
    else:
        delim = ","

    # if inputs are a list, split and iterate
    list_inputs = inputs_ids.split(", ")
    str_out = ""
    if len(list_inputs) > 1:
        for idx, input_id in enumerate(list_inputs):
            str1 = f"{input_id.ljust(20)}"
            str2 = f"{str(id_out[idx]).ljust(20)}"
            str3 = f"{args.database}"
            str_out += f"{str1}{delim}{str2}{delim}{str3}\n"
    else:
        str1 = f"{args.input.ljust(20)}"
        str2 = f"{str(id_out).ljust(20)}"
        str3 = f"{args.database}"
        str_out = f"{str1}{delim}{str2}{delim}{str3}\n"

    print(str_out)
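A hedged example of how this script might be invoked; the script name comes from the parser description above, and the transcript IDs are hypothetical placeholders:

    python get_hgnc_gene_name.py -d BioMart -t -i "ENST00000000001.1, ENST00000000002.1"

With the -t flag the output is tab-separated, one line per input ID, in the form input<TAB>gene name(s)<TAB>database; without it the same fields are comma-separated.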
One additional file in this commit was deleted; its diff is not shown here.