diff --git a/config.yaml b/config.yaml index 14cdcc7..b095e80 100644 --- a/config.yaml +++ b/config.yaml @@ -167,10 +167,10 @@ datasets: id: syn51942280.4 format: csv - name: ensg_to_uniprot_mapping - id: syn54113663.3 + id: syn54113663.5 format: tsv - name: pharos_classes - id: syn64123611.1 + id: syn64123611.2 format: csv final_format: json custom_transformations: @@ -203,8 +203,8 @@ datasets: - syn27211878.2 - *genes_biodomains_provenance - syn51942280.4 - - syn54113663.3 - - syn64123611.1 + - syn54113663.5 + - syn64123611.2 agora_rename: symbol: hgnc_symbol destination: *dest diff --git a/data_analysis/agora/notebooks/preprocessing/AG-1388_ENSG_Uniprot_Mapping.ipynb b/data_analysis/agora/notebooks/preprocessing/AG-1388_ENSG_Uniprot_Mapping.ipynb index 2b36988..d9069d9 100644 --- a/data_analysis/agora/notebooks/preprocessing/AG-1388_ENSG_Uniprot_Mapping.ipynb +++ b/data_analysis/agora/notebooks/preprocessing/AG-1388_ENSG_Uniprot_Mapping.ipynb @@ -32,6 +32,22 @@ "config_filename = \"../../../../config.yaml\"" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get the list of nominated targets for Agora" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "targets_df = preprocessing_utils.load_file_with_name(\"target_list\", config_filename=config_filename)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -49,8 +65,7 @@ "source": [ "ensembl_ids = preprocessing_utils.get_all_adt_ensembl_ids(\n", " config_filename=config_filename,\n", - " exclude_files=[\"gene_metadata\", \"druggability\"],\n", - " token=None,\n", + " exclude_files=[\"gene_metadata\", \"ensg_to_uniprot_mapping\"],\n", ")\n", "print(\"\")\n", "print(str(len(ensembl_ids)) + \" Ensembl IDs found.\")" @@ -65,52 +80,9 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Querying genes 1 - 1000\n", - "Querying genes 1001 - 2000\n", - "Querying genes 2001 - 3000\n", - "Querying genes 3001 - 4000\n", - "Querying genes 4001 - 5000\n", - "Querying genes 5001 - 6000\n", - "Querying genes 6001 - 7000\n", - "Querying genes 7001 - 8000\n", - "Querying genes 8001 - 9000\n", - "Querying genes 9001 - 10000\n", - "Querying genes 10001 - 11000\n", - "Querying genes 11001 - 12000\n", - "Querying genes 12001 - 13000\n", - "Querying genes 13001 - 14000\n", - "Querying genes 14001 - 15000\n", - "Querying genes 15001 - 16000\n", - "Querying genes 16001 - 17000\n", - "Querying genes 17001 - 18000\n", - "Querying genes 18001 - 19000\n", - "Querying genes 19001 - 20000\n", - "Querying genes 20001 - 21000\n", - "Querying genes 21001 - 22000\n", - "Querying genes 22001 - 23000\n", - "Querying genes 23001 - 24000\n", - "Querying genes 24001 - 25000\n", - "Querying genes 25001 - 26000\n", - "Querying genes 26001 - 27000\n", - "Querying genes 27001 - 28000\n", - "Querying genes 28001 - 29000\n", - "Querying genes 29001 - 30000\n", - "Querying genes 30001 - 31000\n", - "Querying genes 31001 - 32000\n", - "Querying genes 32001 - 33000\n", - "Querying genes 33001 - 34000\n", - "Querying genes 34001 - 35000\n", - "Querying genes 35001 - 35858\n" - ] - } - ], + "outputs": [], "source": [ "# Break the query into smaller chunks to avoid long jobs that could fail\n", "batch_ind = range(0, len(ensembl_ids), 1000)\n", @@ -138,128 +110,29 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
UniProtKB_accessionRESOURCE_IDENTIFIER
0A0A075B6I4ENSG00000211642
1Q13641ENSG00000146242
2Q6PCB7ENSG00000130304
3Q7Z591ENSG00000106948
4Q5SZD1ENSG00000197261
.........
18456Q6ZUI0ENSG00000188001
18457O43747ENSG00000166747
18458Q9UBU2ENSG00000155011
18459Q86VY9ENSG00000164484
18460P02655ENSG00000234906
\n", - "

18461 rows × 2 columns

\n", - "
" - ], - "text/plain": [ - " UniProtKB_accession RESOURCE_IDENTIFIER\n", - "0 A0A075B6I4 ENSG00000211642\n", - "1 Q13641 ENSG00000146242\n", - "2 Q6PCB7 ENSG00000130304\n", - "3 Q7Z591 ENSG00000106948\n", - "4 Q5SZD1 ENSG00000197261\n", - "... ... ...\n", - "18456 Q6ZUI0 ENSG00000188001\n", - "18457 O43747 ENSG00000166747\n", - "18458 Q9UBU2 ENSG00000155011\n", - "18459 Q86VY9 ENSG00000164484\n", - "18460 P02655 ENSG00000234906\n", - "\n", - "[18461 rows x 2 columns]" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "mapping = pd.DataFrame(results).rename(\n", " columns={\"from\": \"RESOURCE_IDENTIFIER\", \"to\": \"UniProtKB_accession\"}\n", ")\n", "mapping = mapping[[\"UniProtKB_accession\", \"RESOURCE_IDENTIFIER\"]]\n", + "\n", + "nomination_string = \"Agora Nominated Target for Alzheimer’s Disease\"\n", + "\n", + "mapping[\"OPTIONAL_INFORMATION\"] = \"\"\n", + "mapping[\"OPTIONAL_INFORMATION\"].loc[\n", + " mapping[\"RESOURCE_IDENTIFIER\"].isin(targets_df[\"ensembl_gene_id\"])\n", + "] = nomination_string\n", + "\n", + "mapping = mapping.sort_values(by=\"RESOURCE_IDENTIFIER\")\n", "mapping" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -282,17 +155,9 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "18437 of 35858 (51.42%) Ensembl IDs match to an accession\n" - ] - } - ], + "outputs": [], "source": [ "matches = len(mapping[\"RESOURCE_IDENTIFIER\"].drop_duplicates())\n", "total = len(ensembl_ids)\n", @@ -310,341 +175,13 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "23 Ensembl IDs map to more than one UniProt accession\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
UniProtKB_accessionRESOURCE_IDENTIFIER
538P0CAP2ENSG00000255529
539Q6EEV4ENSG00000255529
2499O95467ENSG00000087460
2500P63092ENSG00000087460
2501Q5JWF2ENSG00000087460
2846P39880ENSG00000257923
2847Q13948ENSG00000257923
2943O96007ENSG00000164172
2944O96033ENSG00000164172
4298Q8NFQ8ENSG00000169905
4299Q9H496ENSG00000169905
5330O43687ENSG00000118507
5331Q9P0M2ENSG00000118507
5381P01258ENSG00000110680
5382P06881ENSG00000110680
7359P0DI83ENSG00000109113
7360Q9BZG1ENSG00000109113
7750P58400ENSG00000179915
7751Q9ULB1ENSG00000179915
10712O00241ENSG00000101307
10713Q5TFQ8ENSG00000101307
10844P42771ENSG00000147889
10845Q8N726ENSG00000147889
11763P60896ENSG00000127922
11764Q6ZVN7ENSG00000127922
12590P0DPB5ENSG00000186184
12591P0DPB6ENSG00000186184
12797P58401ENSG00000110076
12798Q9P2S2ENSG00000110076
13051Q9HDB5ENSG00000021645
13052Q9Y4C0ENSG00000021645
13521A8MTL9ENSG00000221887
13522P0C7T4ENSG00000221887
13855B7ZAP0ENSG00000152061
13856Q5R372ENSG00000152061
14724P42166ENSG00000120802
14725P42167ENSG00000120802
14894E9PAV3ENSG00000196531
14895Q13765ENSG00000196531
15965Q96PG8ENSG00000105327
15966Q9BXH1ENSG00000105327
16364Q5JU69ENSG00000160404
16365Q8N2E6ENSG00000160404
16539Q96RT6ENSG00000212710
16540Q9HC47ENSG00000212710
17256P0DP91ENSG00000225830
17257Q03468ENSG00000225830
\n", - "
" - ], - "text/plain": [ - " UniProtKB_accession RESOURCE_IDENTIFIER\n", - "538 P0CAP2 ENSG00000255529\n", - "539 Q6EEV4 ENSG00000255529\n", - "2499 O95467 ENSG00000087460\n", - "2500 P63092 ENSG00000087460\n", - "2501 Q5JWF2 ENSG00000087460\n", - "2846 P39880 ENSG00000257923\n", - "2847 Q13948 ENSG00000257923\n", - "2943 O96007 ENSG00000164172\n", - "2944 O96033 ENSG00000164172\n", - "4298 Q8NFQ8 ENSG00000169905\n", - "4299 Q9H496 ENSG00000169905\n", - "5330 O43687 ENSG00000118507\n", - "5331 Q9P0M2 ENSG00000118507\n", - "5381 P01258 ENSG00000110680\n", - "5382 P06881 ENSG00000110680\n", - "7359 P0DI83 ENSG00000109113\n", - "7360 Q9BZG1 ENSG00000109113\n", - "7750 P58400 ENSG00000179915\n", - "7751 Q9ULB1 ENSG00000179915\n", - "10712 O00241 ENSG00000101307\n", - "10713 Q5TFQ8 ENSG00000101307\n", - "10844 P42771 ENSG00000147889\n", - "10845 Q8N726 ENSG00000147889\n", - "11763 P60896 ENSG00000127922\n", - "11764 Q6ZVN7 ENSG00000127922\n", - "12590 P0DPB5 ENSG00000186184\n", - "12591 P0DPB6 ENSG00000186184\n", - "12797 P58401 ENSG00000110076\n", - "12798 Q9P2S2 ENSG00000110076\n", - "13051 Q9HDB5 ENSG00000021645\n", - "13052 Q9Y4C0 ENSG00000021645\n", - "13521 A8MTL9 ENSG00000221887\n", - "13522 P0C7T4 ENSG00000221887\n", - "13855 B7ZAP0 ENSG00000152061\n", - "13856 Q5R372 ENSG00000152061\n", - "14724 P42166 ENSG00000120802\n", - "14725 P42167 ENSG00000120802\n", - "14894 E9PAV3 ENSG00000196531\n", - "14895 Q13765 ENSG00000196531\n", - "15965 Q96PG8 ENSG00000105327\n", - "15966 Q9BXH1 ENSG00000105327\n", - "16364 Q5JU69 ENSG00000160404\n", - "16365 Q8N2E6 ENSG00000160404\n", - "16539 Q96RT6 ENSG00000212710\n", - "16540 Q9HC47 ENSG00000212710\n", - "17256 P0DP91 ENSG00000225830\n", - "17257 Q03468 ENSG00000225830" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "dupes = mapping[\"RESOURCE_IDENTIFIER\"].loc[mapping[\"RESOURCE_IDENTIFIER\"].duplicated()].drop_duplicates()\n", "print(f'{len(dupes):d} Ensembl IDs map to more than one UniProt accession')\n", - "mapping.loc[mapping[\"RESOURCE_IDENTIFIER\"].isin(dupes)]" + "mapping.loc[mapping[\"RESOURCE_IDENTIFIER\"].isin(dupes)].sort_values(by=\"RESOURCE_IDENTIFIER\")" ] }, { @@ -656,128 +193,42 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "28 UniProt accessions map to more than one Ensembl ID\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
UniProtKB_accessionRESOURCE_IDENTIFIER
498Q08493ENSG00000285188
664Q5JQF8ENSG00000184388
845P62805ENSG00000197061
1474Q71DI3ENSG00000203852
1553P0C0S8ENSG00000196747
.........
17564Q08493ENSG00000105650
18069P01562ENSG00000197919
18161P62805ENSG00000278705
18253P62807ENSG00000277224
18335Q9H3K6ENSG00000183336
\n", - "

78 rows × 2 columns

\n", - "
" - ], - "text/plain": [ - " UniProtKB_accession RESOURCE_IDENTIFIER\n", - "498 Q08493 ENSG00000285188\n", - "664 Q5JQF8 ENSG00000184388\n", - "845 P62805 ENSG00000197061\n", - "1474 Q71DI3 ENSG00000203852\n", - "1553 P0C0S8 ENSG00000196747\n", - "... ... ...\n", - "17564 Q08493 ENSG00000105650\n", - "18069 P01562 ENSG00000197919\n", - "18161 P62805 ENSG00000278705\n", - "18253 P62807 ENSG00000277224\n", - "18335 Q9H3K6 ENSG00000183336\n", - "\n", - "[78 rows x 2 columns]" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "dupes2 = mapping[\"UniProtKB_accession\"].loc[mapping[\"UniProtKB_accession\"].duplicated()].drop_duplicates()\n", "print(f'{len(dupes2):d} UniProt accessions map to more than one Ensembl ID')\n", - "mapping.loc[mapping[\"UniProtKB_accession\"].isin(dupes2)]" + "mapping.loc[mapping[\"UniProtKB_accession\"].isin(dupes2)].sort_values(by=\"UniProtKB_accession\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Are any nominated targets missing a Uniprot accession?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ens = targets_df[\"ensembl_gene_id\"].drop_duplicates()\n", + "missing = len(ens) - sum(ens.isin(mapping[\"RESOURCE_IDENTIFIER\"]))\n", + "\n", + "if missing == 0:\n", + " print(\"All nominated targets have a matching UniProt accession.\")\n", + "\n", + "else:\n", + " print(f\"{missing} of {len(ens)} nominated targets are missing a UniProt accession.\")\n", + " missing_ens = [x for x in ens if x not in list(mapping[\"RESOURCE_IDENTIFIER\"])]\n", + " print(\n", + " targets_df[targets_df[\"ensembl_gene_id\"].isin(missing_ens)][\n", + " [\"ensembl_gene_id\", \"hgnc_symbol\"]\n", + " ]\n", + " )" ] } ], diff --git a/data_analysis/agora/notebooks/preprocessing/preprocessing_utils.py b/data_analysis/agora/notebooks/preprocessing/preprocessing_utils.py index e85f441..106867f 100644 --- a/data_analysis/agora/notebooks/preprocessing/preprocessing_utils.py +++ b/data_analysis/agora/notebooks/preprocessing/preprocessing_utils.py @@ -336,6 +336,40 @@ def _extract_ensembl_ids( return list(set(file_ensembl_ids)) +def load_file_with_name( + file_name: str, config_filename: str, token: str = None +) -> Union[pd.DataFrame, None]: + """ + Loops through a config file, finds the input file config that matches file_name, and downloads + and reads the file in as a pandas data frame. + + Args: + file_name: the name of the data to load, which should match what is in the "name" field in + the config file + config_filename: path to the config YAML file + token: optional, a Synapse auth token + + Returns: + a pandas.DataFrame, if a file matching file_name exists in the config, or + None, if no file spec with that name exists + """ + syn = utils._login_to_synapse(token=token) + config = utils._get_config(config_path=config_filename) + datasets = config["datasets"] + + for dataset in datasets: + dataset_name = list(dataset.keys())[0] + + for file in dataset[dataset_name]["files"]: + if file["name"] == file_name: + df = extract.get_entity_as_df( + syn_id=file["id"], source=file["format"], syn=syn + ) + return df + + return None + + def standardize_list_item(item: Union[str, List[str]]) -> List[str]: """ For the gene_metadata data frame, some queries return columns that are a mixture of None/NaN, @@ -372,7 +406,7 @@ def standardize_list_item(item: Union[str, List[str]]) -> List[str]: def merge_duplicate_ensembl_ids(gene_table: pd.DataFrame) -> pd.DataFrame: """ - MyGene queries sometimes return multiple rows rows with the same Ensembl ID but different symbols + MyGene queries sometimes return multiple rows with the same Ensembl ID but different symbols or other information. This usually happens when a single Ensembl ID maps to multiple Entrez IDs in the NCBI database. There's not a good way to reconcile this, so for every set of rows with the same Ensembl ID, we designate the first entry in the as the main row. The gene symbols of the diff --git a/test_config.yaml b/test_config.yaml index 8372878..6ed5754 100644 --- a/test_config.yaml +++ b/test_config.yaml @@ -167,10 +167,10 @@ datasets: id: syn51942280.4 format: csv - name: ensg_to_uniprot_mapping - id: syn54113663.3 + id: syn54113663.5 format: tsv - name: pharos_classes - id: syn64123611.1 + id: syn64123611.2 format: csv final_format: json custom_transformations: @@ -203,8 +203,8 @@ datasets: - syn27211878.2 - *genes_biodomains_provenance - syn51942280.4 - - syn54113663.3 - - syn64123611.1 + - syn54113663.5 + - syn64123611.2 agora_rename: symbol: hgnc_symbol destination: *dest