diff --git a/docs/api.md b/docs/api.md index 9c89dce..aca0b43 100644 --- a/docs/api.md +++ b/docs/api.md @@ -1,38 +1,17 @@ # API -## Preprocessing +## API Clients -```{eval-rst} -.. module:: nf_rnaseq.pp -.. currentmodule:: nf_rnaseq - -.. autosummary:: - :toctree: generated - - pp.basic_preproc -``` - -## Tools - -```{eval-rst} -.. module:: nf_rnaseq.tl -.. currentmodule:: nf_rnaseq - -.. autosummary:: - :toctree: generated - - tl.basic_tool -``` +### BioMart -## Plotting +Supports downloads of ```{eval-rst} -.. module:: nf_rnaseq.pl +.. module:: nf_rnaseq.biomart .. currentmodule:: nf_rnaseq .. autosummary:: :toctree: generated - pl.basic_plot - pl.BasicClass + biomart.BioMart ``` diff --git a/docs/conf.py b/docs/conf.py index e8f2e95..0eecc2c 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -92,7 +92,7 @@ intersphinx_mapping = { "python": ("https://docs.python.org/3", None), - "anndata": ("https://anndata.readthedocs.io/en/stable/", None), + # "anndata": ("https://anndata.readthedocs.io/en/stable/", None), "numpy": ("https://numpy.org/doc/stable/", None), } diff --git a/docs/notebooks/example.ipynb b/docs/notebooks/example.ipynb index 742567b..12a4e1e 100644 --- a/docs/notebooks/example.ipynb +++ b/docs/notebooks/example.ipynb @@ -4,7 +4,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Example notebook" + "# `nf_rnaseq` notebook\n", + "\n", + "The `nf_rnaseq` package can be used to query a number of databases and harmonize gene identifiers in the course of an RNA-seq analysis." 
] }, { @@ -13,39 +15,237 @@ "metadata": {}, "outputs": [], "source": [ - "import numpy as np\n", - "from anndata import AnnData\n", "import pandas as pd\n", - "import nf_rnaseq" + "from io import StringIO\n", + "\n", + "from nf_rnaseq import variables\n", + "from nf_rnaseq.biomart import BioMart\n", + "from nf_rnaseq.hgnc import HGNC\n", + "from nf_rnaseq.uniprot import UniProt, UniProtPOST, UniProtGET\n", + "from nf_rnaseq import load" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Variables\n", + "\n", + "This module contains a dictionary for the default properties needed at instantiation of {class}`nf_rnaseq.biomart.BioMart`, {class}`nf_rnaseq.hgnc.HGNC`, {class}`nf_rnaseq.uniprot.UniProt`, {class}`nf_rnaseq.uniprot.UniProtGET`, and {class}`nf_rnaseq.uniprot.UniProtPOST`.\n", + "\n", + "This package is optimized only to query the provided `url_base`, but the `term_in` and `term_out` can be modified." ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "{'BioMart': {'GET': {'api_object': nf_rnaseq.biomart.BioMart,\n", + " 'term_in': 'ensembl_transcript_id_version',\n", + " 'term_out': 'external_gene_name',\n", + " 'url_base': 'http://www.ensembl.org/biomart/martservice?query=\" value = \"\"/>\" />\" />',\n", + " 'headers': None}},\n", + " 'HGNC': {'GET': {'api_object': nf_rnaseq.hgnc.HGNC,\n", + " 'term_in': 'mane_select',\n", + " 'term_out': 'symbol',\n", + " 'url_base': 'https://rest.genenames.org/fetch',\n", + " 'headers': \"{'Accept': 'application/json'}\"}},\n", + " 'UniProt': {'GET': {'api_object': nf_rnaseq.uniprot.UniProt,\n", + " 'term_in': 'UniProtKB_AC-ID',\n", + " 'term_out': 'Gene_Name',\n", + " 'url_base': 'https://rest.uniprot.org/uniprotkb',\n", + " 'headers': None}},\n", + " 'UniProtBULK': {'POST': {'api_object': nf_rnaseq.uniprot.UniProtPOST,\n", + " 'term_in': 'UniProtKB_AC-ID',\n", + " 'term_out': 'Gene_Name',\n", + " 'url_base': 
'https://rest.uniprot.org/idmapping/run'},\n", + " 'GET': {'api_object': nf_rnaseq.uniprot.UniProtGET,\n", + " 'term_in': 'UniProtKB_AC-ID',\n", + " 'term_out': 'Gene_Name',\n", + " 'url_base': 'https://rest.uniprot.org/idmapping/status',\n", + " 'headers': None}}}" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "adata = AnnData(np.random.normal(size=(20, 10)))" + "dict_databases = variables.DICT_DATABASES\n", + "dict_databases" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "With myst it is possible to link in the text cell of a notebook such as this one the documentation of a function or a class.\n", + "\n", + "\n", + "## API schema\n", + "\n", + "The use of the API clients is governed by a series of `ABC` and `dataclass` objects whose inheritance, properties, and functions are described below:\n", + "\n", + "**{class}`nf_rnaseq.api_schema.APIClient`**\n", + "\n", + "Parent Class that governs all shared API client properties and functions\n", + " \n", + "**Properties**\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PropertiesType (Default)Description
identifierstrString value containing search term or comma-delimited set of search terms
term_instr (default: None)Term from which to convert in query database
term_outstr (default: None)Term to which to convert in query database
\n", + "\n", + "**Functions**\n", + "\n", + "+ {func}`nf_rnaseq.APIClient.__post_init__`\n", + " \n", + "+ {func}`nf_rnaseq.APIClient.check_response`\n", + "\n", + "+ {func}`nf_rnaseq.APIClient.process_identifier`\n", + "\n", + "+ {func}`nf_rnaseq.APIClient.query_api` (`@abstractmethod`)\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
FunctionDescription
__post_init__Upon initialization, the `process_identifier` function is called
check_responseRaise for status with `requests` otherwise log error
process_identifierFor `identifier` strip [ and ], split on comma, strip extra spaces; save results as `identifier` and list version as `list_identifier`
query_apiAbstract method to query API implemented at level of sub-class
\n", + "\n", + "
\n", + "\n", + "**{class}`nf_rnaseq.api_schema.API.APIClientGET`**\n", + "\n", + "Child class of `APIClient` that provides basic `GET` functionality for HTTP requests\n", + "\n", + "**Additional properties**\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PropertiesType (Default)Description
headersstrString value containing the HTTP request headers (e.g., `{'Accept': 'application/json'}`)
polling_intervalint (default: 5)How often a poll check occurs for change of state, if necessary (e.g., GET after POST)
\n", + "\n", + "**Additional functions**\n", + "\n", + "+ {func}`nf_rnaseq.APIClientGET.__post_init__`\n", + "\n", + "+ {func}`nf_rnaseq.APIClientGET.query_api`\n", "\n", - "Let's take as an example the function {func}`nf_rnaseq.pp.basic_preproc`. \n", - "You can see that by clicking on the text, the link redirects to the API documentation of the function. \n", - "Check the raw markdown of this cell to understand how this is specified.\n", + "+ {func}`nf_rnaseq.APIClient.create_query_url` (`@abstractmethod`)\n", "\n", - "This works also for any package listed by `intersphinx`. Go to `docs/conf.py` and look for the `intersphinx_mapping` variable. \n", - "There, you will see a list of packages (that this package is dependent on) for which this functionality is supported. \n", + "+ {func}`nf_rnaseq.APIClient.check_if_job_ready` (`@abstractmethod`)\n", "\n", - "For instance, we can link to the class {class}`anndata.AnnData`, to the attribute {attr}`anndata.AnnData.obs` or the method {meth}`anndata.AnnData.write`.\n", + "+ {func}`nf_rnaseq.APIClient.maybe_get_gene_names` (`@abstractmethod`)\n", "\n", - "Again, check the raw markdown of this cell to see how each of these links are specified.\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
FunctionDescription
__post_init__Upon initialization, the `super().__post_init__`, `create_query_url`, `query_api`, and `maybe_get_gene_names` functions are called
query_apiQuery API and add the output to `self.json` if json otherwise `self.text`
create_query_url (@abstractmethod)Abstract method to generate the URL to query; implemented at level of sub-class
check_if_job_ready (@abstractmethod)Abstract method to check whether the job is ready when a POST is necessary; implemented at level of sub-class; should return `True` if needed and `False` otherwise
maybe_get_gene_names (@abstractmethod)Abstract method to get the gene names from the query result, if available; implemented at level of sub-class
\n", "\n", - "You can read more about this in the [intersphinx page](https://www.sphinx-doc.org/en/master/usage/extensions/intersphinx.html) and the [myst page](https://myst-parser.readthedocs.io/en/v0.15.1/syntax/syntax.html#roles-an-in-line-extension-point)." + "
\n", + "\n", + "**{class}`nf_rnaseq.api_schema.API.APIClientPOST`**\n", + "\n", + "Child class of `APIClient` that provides basic `POST` functionality for HTTP requests\n", + "\n", + "TODO" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## BioMart\n", + "\n", + "{class}`nf_rnaseq.biomart.BioMart` is a child class of `APIClientGET` that can be used to retrieve multiple comma-delimited entries from [Ensembl's BioMart](https://useast.ensembl.org/info/data/biomart/index.html). Note that the following will produce a `requests.exceptions.JSONDecodeError` but that the results of the API query will be stored in the `text` property of the {class}`nf_rnaseq.biomart.BioMart` object instead of in the `json` property as a result." ] }, { @@ -54,16 +254,89 @@ "metadata": {}, "outputs": [ { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - "Implement a preprocessing function here." + "ERROR:root:Error at division\n", + "Traceback (most recent call last):\n", + " File \"/home/whitej6/miniforge3/envs/nf_rna/lib/python3.11/site-packages/requests/models.py\", line 974, in json\n", + " return complexjson.loads(self.text, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/whitej6/miniforge3/envs/nf_rna/lib/python3.11/json/__init__.py\", line 346, in loads\n", + " return _default_decoder.decode(s)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/whitej6/miniforge3/envs/nf_rna/lib/python3.11/json/decoder.py\", line 337, in decode\n", + " obj, end = self.raw_decode(s, idx=_w(s, 0).end())\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/whitej6/miniforge3/envs/nf_rna/lib/python3.11/json/decoder.py\", line 355, in raw_decode\n", + " raise JSONDecodeError(\"Expecting value\", s, err.value) from None\n", + "json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call 
last):\n", + " File \"/data1/tanseyw/projects/nf-rnaseq/src/nf_rnaseq/api_schema.py\", line 81, in query_api\n", + " self.json = response.json()\n", + " ^^^^^^^^^^^^^^^\n", + " File \"/home/whitej6/miniforge3/envs/nf_rna/lib/python3.11/site-packages/requests/models.py\", line 978, in json\n", + " raise RequestsJSONDecodeError(e.msg, e.doc, e.pos)\n", + "requests.exceptions.JSONDecodeError: Expecting value: line 1 column 1 (char 0)\n" ] }, { "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
original_idgene_namessource
0ENST00000493287.5[MRPL20]BioMart
1ENST00000582431.2[RN7SL657P]BioMart
2ENST00000614007.1[U6]BioMart
\n", + "
" + ], "text/plain": [ - "0" + " original_id gene_names source\n", + "0 ENST00000493287.5 [MRPL20] BioMart\n", + "1 ENST00000582431.2 [RN7SL657P] BioMart\n", + "2 ENST00000614007.1 [U6] BioMart" ] }, "execution_count": 3, @@ -72,7 +345,26 @@ } ], "source": [ - "nf_rnaseq.pp.basic_preproc(adata)" + "dict_biomart = dict_databases[\"BioMart\"][\"GET\"]\n", + "biomart_obj = BioMart(\n", + " identifier=\"ENST00000614007.1,ENST00000493287.5,ENST00000582431.2\",\n", + " term_in=dict_biomart[\"term_in\"],\n", + " term_out=dict_biomart[\"term_out\"],\n", + " url_base=dict_biomart[\"url_base\"],\n", + ")\n", + "\n", + "pd.DataFrame(\n", + " {\"original_id\": biomart_obj.list_identifier, \"gene_names\": biomart_obj.list_gene_names, \"source\": \"BioMart\"}\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## HGNC\n", + "\n", + "{class}`nf_rnaseq.hgnc.HGNC` can be used to retrieve single entries from the Human Genome Nomenclature Committee's (HGNC) [API](https://www.genenames.org/help/rest). Allowable searchable fields can be found on their website." 
] }, { @@ -101,50 +393,421 @@ " \n", " \n", " \n", - " A\n", - " B\n", + " original_id\n", + " gene_names\n", + " source\n", + " \n", + " \n", + " \n", + " \n", + " 0\n", + " NM_033360\n", + " KRAS\n", + " HGNC\n", + " \n", + " \n", + "\n", + "" + ], + "text/plain": [ + " original_id gene_names source\n", + "0 NM_033360 KRAS HGNC" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dict_hgnc = dict_databases[\"HGNC\"][\"GET\"]\n", + "hgnc_obj = HGNC(\n", + " identifier=\"NM_033360\",\n", + " term_in=\"refseq_accession\",\n", + " term_out=\"symbol\",\n", + " url_base=dict_hgnc[\"url_base\"],\n", + " headers=dict_hgnc[\"headers\"],\n", + ")\n", + "\n", + "pd.DataFrame({\"original_id\": hgnc_obj.list_identifier, \"gene_names\": hgnc_obj.list_gene_names, \"source\": \"HGNC\"})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## UniProt\n", + "\n", + "### Single entry retrieval\n", + "\n", + "{class}`nf_rnaseq.uniprot.UniProt` can be used to retrieve single entries from UniProtKB's [individual entry API](https://www.uniprot.org/help/api_retrieve_entries)." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
original_idgene_namessource
0P24468NR2F2UniProt
\n", + "
" + ], + "text/plain": [ + " original_id gene_names source\n", + "0 P24468 NR2F2 UniProt" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "uniprot_obj = UniProt(\n", + " identifier=\"P24468\",\n", + " term_in=dict_databases[\"UniProt\"][\"GET\"][\"term_in\"],\n", + " term_out=dict_databases[\"UniProt\"][\"GET\"][\"term_out\"],\n", + " url_base=dict_databases[\"UniProt\"][\"GET\"][\"url_base\"],\n", + ")\n", + "\n", + "pd.DataFrame(\n", + " {\"original_id\": uniprot_obj.list_identifier, \"gene_names\": uniprot_obj.list_gene_names, \"source\": \"UniProt\"}\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Bulk entry retrieval\n", + "\n", + "A combination of {class}`nf_rnaseq.uniprot.UniProtPOST` and {class}`nf_rnaseq.uniprot.UniProtGET` can be used to retrieve a large number of entries from UniProtKB's [ID mapping API](https://www.uniprot.org/help/id_mapping)." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", "
original_idgene_namessource
0a1P24468[NR2F2]UniProtBULK
1b2C9J5X1[IGF1R]UniProtBULK
2c3Q5W5X9[TTC23]UniProtBULK
\n", "
" ], "text/plain": [ - " A B\n", - "0 a 1\n", - "1 b 2\n", - "2 c 3" + " original_id gene_names source\n", + "0 P24468 [NR2F2] UniProtBULK\n", + "1 C9J5X1 [IGF1R] UniProtBULK\n", + "2 Q5W5X9 [TTC23] UniProtBULK" ] }, - "execution_count": 4, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "pd.DataFrame().assign(A=[\"a\", \"b\", \"c\"], B=[1, 2, 3])" + "str_id = \"P24468, C9J5X1, Q5W5X9\"\n", + "str_db = \"UniProtBULK\"\n", + "\n", + "dict_post = dict_databases[str_db][\"POST\"]\n", + "uniprot_post_obj = UniProtPOST(\n", + " identifier=str_id,\n", + " term_in=dict_post[\"term_in\"],\n", + " term_out=dict_post[\"term_out\"],\n", + " url_base=dict_post[\"url_base\"],\n", + ")\n", + "\n", + "dict_get = dict_databases[str_db][\"GET\"]\n", + "uniprot_get_obj = UniProtGET(\n", + " identifier=str_id,\n", + " term_in=dict_get[\"term_in\"],\n", + " term_out=dict_get[\"term_out\"],\n", + " url_base=dict_get[\"url_base\"],\n", + " jobId=uniprot_post_obj.jobId,\n", + ")\n", + "\n", + "pd.DataFrame(\n", + " {\n", + " \"original_id\": uniprot_get_obj.list_identifier,\n", + " \"gene_names\": uniprot_get_obj.list_gene_names,\n", + " \"source\": \"UniProtBULK\",\n", + " }\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Command line script\n", + "\n", + "The package also provides a command line \n", + "\n", + "- `input`: identifier or comma delimited list of identifiers\n", + "- `database`: keys in {dict}`nf_rnaseq.variables.DICT_DATABASES` (BioMart, HGNC, UniProt, UniProtBULK)\n", + "\n", + "### CSV output\n", + "```\n", + "get_gene_name \\\\\n", + " -i \\\\\n", + " -d \\\\\n", + " -c \\\\\n", + " > .csv\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2024-08-16 15:32:01,025 - nf_rnaseq.cli.get_gene_name - INFO - Querying API for UniProtBULK\n", + "2024-08-16 
15:32:02,799 - nf_rnaseq.uniprot - INFO - \n", + "30672867582f7e26860278d92bf5058a91631230\n", + "{'results': [{'from': 'P24468', 'to': 'NR2F2'}, {'from': 'C9J5X1', 'to': 'IGF1R'}, {'from': 'Q5W5X9', 'to': 'TTC23'}], 'failedIds': []}\n", + "2024-08-16 15:32:02,799 - nf_rnaseq.api_schema - INFO - \n", + "P24468,C9J5X1,Q5W5X9\n", + "{'results': [{'from': 'P24468', 'to': 'NR2F2'}, {'from': 'C9J5X1', 'to': 'IGF1R'}, {'from': 'Q5W5X9', 'to': 'TTC23'}], 'failedIds': []}\n", + "\n", + "P24468 ,['NR2F2'] ,UniProtBULK\n", + "C9J5X1 ,['IGF1R'] ,UniProtBULK\n", + "Q5W5X9 ,['TTC23'] ,UniProtBULK\n", + "\n" + ] + } + ], + "source": [ + "!get_gene_name \\\n", + " -i \"P24468, C9J5X1, Q5W5X9\" \\\n", + " -d \"UniProtBULK\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### TSV output\n", + "```\n", + "get_gene_name \\\\\n", + " -i \\\\\n", + " -d \\\\\n", + " -c \\\\\n", + " -t \\\\\n", + " > .tsv\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2024-08-16 15:32:06,455 - nf_rnaseq.cli.get_gene_name - INFO - Querying API for UniProtBULK\n", + "2024-08-16 15:32:08,084 - nf_rnaseq.uniprot - INFO - \n", + "30672867582f7e26860278d92bf5058a91631230\n", + "{'results': [{'from': 'P24468', 'to': 'NR2F2'}, {'from': 'C9J5X1', 'to': 'IGF1R'}, {'from': 'Q5W5X9', 'to': 'TTC23'}], 'failedIds': []}\n", + "2024-08-16 15:32:08,084 - nf_rnaseq.api_schema - INFO - \n", + "P24468,C9J5X1,Q5W5X9\n", + "{'results': [{'from': 'P24468', 'to': 'NR2F2'}, {'from': 'C9J5X1', 'to': 'IGF1R'}, {'from': 'Q5W5X9', 'to': 'TTC23'}], 'failedIds': []}\n", + "\n", + "P24468 \t['NR2F2'] \tUniProtBULK\n", + "C9J5X1 \t['IGF1R'] \tUniProtBULK\n", + "Q5W5X9 \t['TTC23'] \tUniProtBULK\n", + "\n" + ] + } + ], + "source": [ + "!get_gene_name \\\n", + " -i \"P24468, C9J5X1, Q5W5X9\" \\\n", + " -d \"UniProtBULK\" \\\n", + " -t" + ] + }, + { + "cell_type": "markdown", + "metadata": 
{}, + "source": [ + "### Analysis\n", + "\n", + "The package also provides an analysis module for processing the resulting CSV and TSV files. For the purposes of visualization, these files have additional spaces . Moreover, the output IDs take the format of ." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
original_idgene_namesource
0P24468[NR2F2]UniProtBULK
1C9J5X1[IGF1R]UniProtBULK
2Q5W5X9[TTC23]UniProtBULK
\n", + "
" + ], + "text/plain": [ + " original_id gene_name source\n", + "0 P24468 [NR2F2] UniProtBULK\n", + "1 C9J5X1 [IGF1R] UniProtBULK\n", + "2 Q5W5X9 [TTC23] UniProtBULK" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "str_tsv = \"\\\n", + "P24468 \\t['NR2F2'] \\tUniProtBULK\\n\\\n", + "C9J5X1 \\t['IGF1R'] \\tUniProtBULK\\n\\\n", + "Q5W5X9 \\t['TTC23'] \\tUniProtBULK\\\n", + "\"\n", + "\n", + "df_tsv = pd.read_table(StringIO(str_tsv), sep=\"\\t\", header=None)\n", + "\n", + "df_tsv.columns = [\"original_id\", \"gene_name\", \"source\"]\n", + "\n", + "df_tsv[\"original_id\"] = df_tsv[\"original_id\"].apply(lambda x: x.strip())\n", + "df_tsv[\"gene_name\"] = df_tsv[\"gene_name\"].apply(load.literal_eval_list)\n", + "df_tsv[\"source\"] = df_tsv[\"source\"].apply(lambda x: x.strip())\n", + "\n", + "df_tsv" ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3.9.12 ('squidpy39')", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -158,7 +821,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.3" + "version": "3.11.9" }, "vscode": { "interpreter": { diff --git a/pyproject.toml b/pyproject.toml index b5b30f5..20a8009 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,6 +44,8 @@ doc = [ # For notebooks "ipykernel", "ipython", + "jupyterlab", + "notebook", "sphinx-copybutton", "pandas", ]