diff --git a/config.yaml b/config.yaml
index 14cdcc7..b095e80 100644
--- a/config.yaml
+++ b/config.yaml
@@ -167,10 +167,10 @@ datasets:
id: syn51942280.4
format: csv
- name: ensg_to_uniprot_mapping
- id: syn54113663.3
+ id: syn54113663.5
format: tsv
- name: pharos_classes
- id: syn64123611.1
+ id: syn64123611.2
format: csv
final_format: json
custom_transformations:
@@ -203,8 +203,8 @@ datasets:
- syn27211878.2
- *genes_biodomains_provenance
- syn51942280.4
- - syn54113663.3
- - syn64123611.1
+ - syn54113663.5
+ - syn64123611.2
agora_rename:
symbol: hgnc_symbol
destination: *dest
diff --git a/data_analysis/agora/notebooks/preprocessing/AG-1388_ENSG_Uniprot_Mapping.ipynb b/data_analysis/agora/notebooks/preprocessing/AG-1388_ENSG_Uniprot_Mapping.ipynb
index 2b36988..d9069d9 100644
--- a/data_analysis/agora/notebooks/preprocessing/AG-1388_ENSG_Uniprot_Mapping.ipynb
+++ b/data_analysis/agora/notebooks/preprocessing/AG-1388_ENSG_Uniprot_Mapping.ipynb
@@ -32,6 +32,22 @@
"config_filename = \"../../../../config.yaml\""
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Get the list of nominated targets for Agora"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "targets_df = preprocessing_utils.load_file_with_name(\"target_list\", config_filename=config_filename)"
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {},
@@ -49,8 +65,7 @@
"source": [
"ensembl_ids = preprocessing_utils.get_all_adt_ensembl_ids(\n",
" config_filename=config_filename,\n",
- " exclude_files=[\"gene_metadata\", \"druggability\"],\n",
- " token=None,\n",
+ " exclude_files=[\"gene_metadata\", \"ensg_to_uniprot_mapping\"],\n",
")\n",
"print(\"\")\n",
"print(str(len(ensembl_ids)) + \" Ensembl IDs found.\")"
@@ -65,52 +80,9 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Querying genes 1 - 1000\n",
- "Querying genes 1001 - 2000\n",
- "Querying genes 2001 - 3000\n",
- "Querying genes 3001 - 4000\n",
- "Querying genes 4001 - 5000\n",
- "Querying genes 5001 - 6000\n",
- "Querying genes 6001 - 7000\n",
- "Querying genes 7001 - 8000\n",
- "Querying genes 8001 - 9000\n",
- "Querying genes 9001 - 10000\n",
- "Querying genes 10001 - 11000\n",
- "Querying genes 11001 - 12000\n",
- "Querying genes 12001 - 13000\n",
- "Querying genes 13001 - 14000\n",
- "Querying genes 14001 - 15000\n",
- "Querying genes 15001 - 16000\n",
- "Querying genes 16001 - 17000\n",
- "Querying genes 17001 - 18000\n",
- "Querying genes 18001 - 19000\n",
- "Querying genes 19001 - 20000\n",
- "Querying genes 20001 - 21000\n",
- "Querying genes 21001 - 22000\n",
- "Querying genes 22001 - 23000\n",
- "Querying genes 23001 - 24000\n",
- "Querying genes 24001 - 25000\n",
- "Querying genes 25001 - 26000\n",
- "Querying genes 26001 - 27000\n",
- "Querying genes 27001 - 28000\n",
- "Querying genes 28001 - 29000\n",
- "Querying genes 29001 - 30000\n",
- "Querying genes 30001 - 31000\n",
- "Querying genes 31001 - 32000\n",
- "Querying genes 32001 - 33000\n",
- "Querying genes 33001 - 34000\n",
- "Querying genes 34001 - 35000\n",
- "Querying genes 35001 - 35858\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"# Break the query into smaller chunks to avoid long jobs that could fail\n",
"batch_ind = range(0, len(ensembl_ids), 1000)\n",
@@ -138,128 +110,29 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " UniProtKB_accession | \n",
- " RESOURCE_IDENTIFIER | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " A0A075B6I4 | \n",
- " ENSG00000211642 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " Q13641 | \n",
- " ENSG00000146242 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " Q6PCB7 | \n",
- " ENSG00000130304 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " Q7Z591 | \n",
- " ENSG00000106948 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " Q5SZD1 | \n",
- " ENSG00000197261 | \n",
- "
\n",
- " \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " 18456 | \n",
- " Q6ZUI0 | \n",
- " ENSG00000188001 | \n",
- "
\n",
- " \n",
- " 18457 | \n",
- " O43747 | \n",
- " ENSG00000166747 | \n",
- "
\n",
- " \n",
- " 18458 | \n",
- " Q9UBU2 | \n",
- " ENSG00000155011 | \n",
- "
\n",
- " \n",
- " 18459 | \n",
- " Q86VY9 | \n",
- " ENSG00000164484 | \n",
- "
\n",
- " \n",
- " 18460 | \n",
- " P02655 | \n",
- " ENSG00000234906 | \n",
- "
\n",
- " \n",
- "
\n",
- "
18461 rows × 2 columns
\n",
- "
"
- ],
- "text/plain": [
- " UniProtKB_accession RESOURCE_IDENTIFIER\n",
- "0 A0A075B6I4 ENSG00000211642\n",
- "1 Q13641 ENSG00000146242\n",
- "2 Q6PCB7 ENSG00000130304\n",
- "3 Q7Z591 ENSG00000106948\n",
- "4 Q5SZD1 ENSG00000197261\n",
- "... ... ...\n",
- "18456 Q6ZUI0 ENSG00000188001\n",
- "18457 O43747 ENSG00000166747\n",
- "18458 Q9UBU2 ENSG00000155011\n",
- "18459 Q86VY9 ENSG00000164484\n",
- "18460 P02655 ENSG00000234906\n",
- "\n",
- "[18461 rows x 2 columns]"
- ]
- },
- "execution_count": 6,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"mapping = pd.DataFrame(results).rename(\n",
" columns={\"from\": \"RESOURCE_IDENTIFIER\", \"to\": \"UniProtKB_accession\"}\n",
")\n",
"mapping = mapping[[\"UniProtKB_accession\", \"RESOURCE_IDENTIFIER\"]]\n",
+ "\n",
+ "nomination_string = \"Agora Nominated Target for Alzheimer’s Disease\"\n",
+ "\n",
+ "# Use one .loc[rows, column] call; chained indexing can write to a temporary copy\n",
+ "mapping[\"OPTIONAL_INFORMATION\"] = \"\"\n",
+ "is_nominated = mapping[\"RESOURCE_IDENTIFIER\"].isin(targets_df[\"ensembl_gene_id\"])\n",
+ "mapping.loc[is_nominated, \"OPTIONAL_INFORMATION\"] = nomination_string\n",
+ "\n",
+ "mapping = mapping.sort_values(by=\"RESOURCE_IDENTIFIER\")\n",
"mapping"
]
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -282,17 +155,9 @@
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "18437 of 35858 (51.42%) Ensembl IDs match to an accession\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"matches = len(mapping[\"RESOURCE_IDENTIFIER\"].drop_duplicates())\n",
"total = len(ensembl_ids)\n",
@@ -310,341 +175,13 @@
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "23 Ensembl IDs map to more than one UniProt accession\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " UniProtKB_accession | \n",
- " RESOURCE_IDENTIFIER | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 538 | \n",
- " P0CAP2 | \n",
- " ENSG00000255529 | \n",
- "
\n",
- " \n",
- " 539 | \n",
- " Q6EEV4 | \n",
- " ENSG00000255529 | \n",
- "
\n",
- " \n",
- " 2499 | \n",
- " O95467 | \n",
- " ENSG00000087460 | \n",
- "
\n",
- " \n",
- " 2500 | \n",
- " P63092 | \n",
- " ENSG00000087460 | \n",
- "
\n",
- " \n",
- " 2501 | \n",
- " Q5JWF2 | \n",
- " ENSG00000087460 | \n",
- "
\n",
- " \n",
- " 2846 | \n",
- " P39880 | \n",
- " ENSG00000257923 | \n",
- "
\n",
- " \n",
- " 2847 | \n",
- " Q13948 | \n",
- " ENSG00000257923 | \n",
- "
\n",
- " \n",
- " 2943 | \n",
- " O96007 | \n",
- " ENSG00000164172 | \n",
- "
\n",
- " \n",
- " 2944 | \n",
- " O96033 | \n",
- " ENSG00000164172 | \n",
- "
\n",
- " \n",
- " 4298 | \n",
- " Q8NFQ8 | \n",
- " ENSG00000169905 | \n",
- "
\n",
- " \n",
- " 4299 | \n",
- " Q9H496 | \n",
- " ENSG00000169905 | \n",
- "
\n",
- " \n",
- " 5330 | \n",
- " O43687 | \n",
- " ENSG00000118507 | \n",
- "
\n",
- " \n",
- " 5331 | \n",
- " Q9P0M2 | \n",
- " ENSG00000118507 | \n",
- "
\n",
- " \n",
- " 5381 | \n",
- " P01258 | \n",
- " ENSG00000110680 | \n",
- "
\n",
- " \n",
- " 5382 | \n",
- " P06881 | \n",
- " ENSG00000110680 | \n",
- "
\n",
- " \n",
- " 7359 | \n",
- " P0DI83 | \n",
- " ENSG00000109113 | \n",
- "
\n",
- " \n",
- " 7360 | \n",
- " Q9BZG1 | \n",
- " ENSG00000109113 | \n",
- "
\n",
- " \n",
- " 7750 | \n",
- " P58400 | \n",
- " ENSG00000179915 | \n",
- "
\n",
- " \n",
- " 7751 | \n",
- " Q9ULB1 | \n",
- " ENSG00000179915 | \n",
- "
\n",
- " \n",
- " 10712 | \n",
- " O00241 | \n",
- " ENSG00000101307 | \n",
- "
\n",
- " \n",
- " 10713 | \n",
- " Q5TFQ8 | \n",
- " ENSG00000101307 | \n",
- "
\n",
- " \n",
- " 10844 | \n",
- " P42771 | \n",
- " ENSG00000147889 | \n",
- "
\n",
- " \n",
- " 10845 | \n",
- " Q8N726 | \n",
- " ENSG00000147889 | \n",
- "
\n",
- " \n",
- " 11763 | \n",
- " P60896 | \n",
- " ENSG00000127922 | \n",
- "
\n",
- " \n",
- " 11764 | \n",
- " Q6ZVN7 | \n",
- " ENSG00000127922 | \n",
- "
\n",
- " \n",
- " 12590 | \n",
- " P0DPB5 | \n",
- " ENSG00000186184 | \n",
- "
\n",
- " \n",
- " 12591 | \n",
- " P0DPB6 | \n",
- " ENSG00000186184 | \n",
- "
\n",
- " \n",
- " 12797 | \n",
- " P58401 | \n",
- " ENSG00000110076 | \n",
- "
\n",
- " \n",
- " 12798 | \n",
- " Q9P2S2 | \n",
- " ENSG00000110076 | \n",
- "
\n",
- " \n",
- " 13051 | \n",
- " Q9HDB5 | \n",
- " ENSG00000021645 | \n",
- "
\n",
- " \n",
- " 13052 | \n",
- " Q9Y4C0 | \n",
- " ENSG00000021645 | \n",
- "
\n",
- " \n",
- " 13521 | \n",
- " A8MTL9 | \n",
- " ENSG00000221887 | \n",
- "
\n",
- " \n",
- " 13522 | \n",
- " P0C7T4 | \n",
- " ENSG00000221887 | \n",
- "
\n",
- " \n",
- " 13855 | \n",
- " B7ZAP0 | \n",
- " ENSG00000152061 | \n",
- "
\n",
- " \n",
- " 13856 | \n",
- " Q5R372 | \n",
- " ENSG00000152061 | \n",
- "
\n",
- " \n",
- " 14724 | \n",
- " P42166 | \n",
- " ENSG00000120802 | \n",
- "
\n",
- " \n",
- " 14725 | \n",
- " P42167 | \n",
- " ENSG00000120802 | \n",
- "
\n",
- " \n",
- " 14894 | \n",
- " E9PAV3 | \n",
- " ENSG00000196531 | \n",
- "
\n",
- " \n",
- " 14895 | \n",
- " Q13765 | \n",
- " ENSG00000196531 | \n",
- "
\n",
- " \n",
- " 15965 | \n",
- " Q96PG8 | \n",
- " ENSG00000105327 | \n",
- "
\n",
- " \n",
- " 15966 | \n",
- " Q9BXH1 | \n",
- " ENSG00000105327 | \n",
- "
\n",
- " \n",
- " 16364 | \n",
- " Q5JU69 | \n",
- " ENSG00000160404 | \n",
- "
\n",
- " \n",
- " 16365 | \n",
- " Q8N2E6 | \n",
- " ENSG00000160404 | \n",
- "
\n",
- " \n",
- " 16539 | \n",
- " Q96RT6 | \n",
- " ENSG00000212710 | \n",
- "
\n",
- " \n",
- " 16540 | \n",
- " Q9HC47 | \n",
- " ENSG00000212710 | \n",
- "
\n",
- " \n",
- " 17256 | \n",
- " P0DP91 | \n",
- " ENSG00000225830 | \n",
- "
\n",
- " \n",
- " 17257 | \n",
- " Q03468 | \n",
- " ENSG00000225830 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " UniProtKB_accession RESOURCE_IDENTIFIER\n",
- "538 P0CAP2 ENSG00000255529\n",
- "539 Q6EEV4 ENSG00000255529\n",
- "2499 O95467 ENSG00000087460\n",
- "2500 P63092 ENSG00000087460\n",
- "2501 Q5JWF2 ENSG00000087460\n",
- "2846 P39880 ENSG00000257923\n",
- "2847 Q13948 ENSG00000257923\n",
- "2943 O96007 ENSG00000164172\n",
- "2944 O96033 ENSG00000164172\n",
- "4298 Q8NFQ8 ENSG00000169905\n",
- "4299 Q9H496 ENSG00000169905\n",
- "5330 O43687 ENSG00000118507\n",
- "5331 Q9P0M2 ENSG00000118507\n",
- "5381 P01258 ENSG00000110680\n",
- "5382 P06881 ENSG00000110680\n",
- "7359 P0DI83 ENSG00000109113\n",
- "7360 Q9BZG1 ENSG00000109113\n",
- "7750 P58400 ENSG00000179915\n",
- "7751 Q9ULB1 ENSG00000179915\n",
- "10712 O00241 ENSG00000101307\n",
- "10713 Q5TFQ8 ENSG00000101307\n",
- "10844 P42771 ENSG00000147889\n",
- "10845 Q8N726 ENSG00000147889\n",
- "11763 P60896 ENSG00000127922\n",
- "11764 Q6ZVN7 ENSG00000127922\n",
- "12590 P0DPB5 ENSG00000186184\n",
- "12591 P0DPB6 ENSG00000186184\n",
- "12797 P58401 ENSG00000110076\n",
- "12798 Q9P2S2 ENSG00000110076\n",
- "13051 Q9HDB5 ENSG00000021645\n",
- "13052 Q9Y4C0 ENSG00000021645\n",
- "13521 A8MTL9 ENSG00000221887\n",
- "13522 P0C7T4 ENSG00000221887\n",
- "13855 B7ZAP0 ENSG00000152061\n",
- "13856 Q5R372 ENSG00000152061\n",
- "14724 P42166 ENSG00000120802\n",
- "14725 P42167 ENSG00000120802\n",
- "14894 E9PAV3 ENSG00000196531\n",
- "14895 Q13765 ENSG00000196531\n",
- "15965 Q96PG8 ENSG00000105327\n",
- "15966 Q9BXH1 ENSG00000105327\n",
- "16364 Q5JU69 ENSG00000160404\n",
- "16365 Q8N2E6 ENSG00000160404\n",
- "16539 Q96RT6 ENSG00000212710\n",
- "16540 Q9HC47 ENSG00000212710\n",
- "17256 P0DP91 ENSG00000225830\n",
- "17257 Q03468 ENSG00000225830"
- ]
- },
- "execution_count": 10,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"dupes = mapping[\"RESOURCE_IDENTIFIER\"].loc[mapping[\"RESOURCE_IDENTIFIER\"].duplicated()].drop_duplicates()\n",
"print(f'{len(dupes):d} Ensembl IDs map to more than one UniProt accession')\n",
- "mapping.loc[mapping[\"RESOURCE_IDENTIFIER\"].isin(dupes)]"
+ "mapping.loc[mapping[\"RESOURCE_IDENTIFIER\"].isin(dupes)].sort_values(by=\"RESOURCE_IDENTIFIER\")"
]
},
{
@@ -656,128 +193,42 @@
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "28 UniProt accessions map to more than one Ensembl ID\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " UniProtKB_accession | \n",
- " RESOURCE_IDENTIFIER | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 498 | \n",
- " Q08493 | \n",
- " ENSG00000285188 | \n",
- "
\n",
- " \n",
- " 664 | \n",
- " Q5JQF8 | \n",
- " ENSG00000184388 | \n",
- "
\n",
- " \n",
- " 845 | \n",
- " P62805 | \n",
- " ENSG00000197061 | \n",
- "
\n",
- " \n",
- " 1474 | \n",
- " Q71DI3 | \n",
- " ENSG00000203852 | \n",
- "
\n",
- " \n",
- " 1553 | \n",
- " P0C0S8 | \n",
- " ENSG00000196747 | \n",
- "
\n",
- " \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " 17564 | \n",
- " Q08493 | \n",
- " ENSG00000105650 | \n",
- "
\n",
- " \n",
- " 18069 | \n",
- " P01562 | \n",
- " ENSG00000197919 | \n",
- "
\n",
- " \n",
- " 18161 | \n",
- " P62805 | \n",
- " ENSG00000278705 | \n",
- "
\n",
- " \n",
- " 18253 | \n",
- " P62807 | \n",
- " ENSG00000277224 | \n",
- "
\n",
- " \n",
- " 18335 | \n",
- " Q9H3K6 | \n",
- " ENSG00000183336 | \n",
- "
\n",
- " \n",
- "
\n",
- "
78 rows × 2 columns
\n",
- "
"
- ],
- "text/plain": [
- " UniProtKB_accession RESOURCE_IDENTIFIER\n",
- "498 Q08493 ENSG00000285188\n",
- "664 Q5JQF8 ENSG00000184388\n",
- "845 P62805 ENSG00000197061\n",
- "1474 Q71DI3 ENSG00000203852\n",
- "1553 P0C0S8 ENSG00000196747\n",
- "... ... ...\n",
- "17564 Q08493 ENSG00000105650\n",
- "18069 P01562 ENSG00000197919\n",
- "18161 P62805 ENSG00000278705\n",
- "18253 P62807 ENSG00000277224\n",
- "18335 Q9H3K6 ENSG00000183336\n",
- "\n",
- "[78 rows x 2 columns]"
- ]
- },
- "execution_count": 11,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"dupes2 = mapping[\"UniProtKB_accession\"].loc[mapping[\"UniProtKB_accession\"].duplicated()].drop_duplicates()\n",
"print(f'{len(dupes2):d} UniProt accessions map to more than one Ensembl ID')\n",
- "mapping.loc[mapping[\"UniProtKB_accession\"].isin(dupes2)]"
+ "mapping.loc[mapping[\"UniProtKB_accession\"].isin(dupes2)].sort_values(by=\"UniProtKB_accession\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Are any nominated targets missing a UniProt accession?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ens = targets_df[\"ensembl_gene_id\"].drop_duplicates()\n",
+ "# Find the unmatched IDs once (vectorized) and reuse them for both the count and the report\n",
+ "missing_ens = ens[~ens.isin(mapping[\"RESOURCE_IDENTIFIER\"])]\n",
+ "missing = len(missing_ens)\n",
+ "\n",
+ "if missing == 0:\n",
+ "    print(\"All nominated targets have a matching UniProt accession.\")\n",
+ "else:\n",
+ "    print(f\"{missing} of {len(ens)} nominated targets are missing a UniProt accession.\")\n",
+ "    print(\n",
+ "        targets_df[targets_df[\"ensembl_gene_id\"].isin(missing_ens)][\n",
+ "            [\"ensembl_gene_id\", \"hgnc_symbol\"]\n",
+ "        ]\n",
+ "    )"
]
}
],
diff --git a/data_analysis/agora/notebooks/preprocessing/preprocessing_utils.py b/data_analysis/agora/notebooks/preprocessing/preprocessing_utils.py
index e85f441..106867f 100644
--- a/data_analysis/agora/notebooks/preprocessing/preprocessing_utils.py
+++ b/data_analysis/agora/notebooks/preprocessing/preprocessing_utils.py
@@ -336,6 +336,40 @@ def _extract_ensembl_ids(
return list(set(file_ensembl_ids))
+def load_file_with_name(
+    file_name: str, config_filename: str, token: Union[str, None] = None
+) -> Union[pd.DataFrame, None]:
+    """
+    Finds the input file spec named file_name in a config file and loads that file from Synapse
+    as a pandas data frame. Only the first spec with a matching name is used.
+
+    Args:
+        file_name: the name of the data to load, which should match what is in the "name" field in
+                   the config file
+        config_filename: path to the config YAML file
+        token: optional, a Synapse auth token
+
+    Returns:
+        a pandas.DataFrame, if a file matching file_name exists in the config, or
+        None, if no file spec with that name exists
+    """
+    config = utils._get_config(config_path=config_filename)
+
+    for dataset in config["datasets"]:
+        # The first key of each dataset entry is used as the dataset name
+        dataset_name = next(iter(dataset))
+
+        for file_spec in dataset[dataset_name]["files"]:
+            if file_spec["name"] == file_name:
+                # Log in only once a match is found, so an unknown name never contacts Synapse
+                syn = utils._login_to_synapse(token=token)
+                return extract.get_entity_as_df(
+                    syn_id=file_spec["id"], source=file_spec["format"], syn=syn
+                )
+
+    return None
+
+
def standardize_list_item(item: Union[str, List[str]]) -> List[str]:
"""
For the gene_metadata data frame, some queries return columns that are a mixture of None/NaN,
@@ -372,7 +406,7 @@ def standardize_list_item(item: Union[str, List[str]]) -> List[str]:
def merge_duplicate_ensembl_ids(gene_table: pd.DataFrame) -> pd.DataFrame:
"""
- MyGene queries sometimes return multiple rows rows with the same Ensembl ID but different symbols
+ MyGene queries sometimes return multiple rows with the same Ensembl ID but different symbols
or other information. This usually happens when a single Ensembl ID maps to multiple Entrez IDs
in the NCBI database. There's not a good way to reconcile this, so for every set of rows with the
same Ensembl ID, we designate the first entry in the as the main row. The gene symbols of the
diff --git a/test_config.yaml b/test_config.yaml
index 8372878..6ed5754 100644
--- a/test_config.yaml
+++ b/test_config.yaml
@@ -167,10 +167,10 @@ datasets:
id: syn51942280.4
format: csv
- name: ensg_to_uniprot_mapping
- id: syn54113663.3
+ id: syn54113663.5
format: tsv
- name: pharos_classes
- id: syn64123611.1
+ id: syn64123611.2
format: csv
final_format: json
custom_transformations:
@@ -203,8 +203,8 @@ datasets:
- syn27211878.2
- *genes_biodomains_provenance
- syn51942280.4
- - syn54113663.3
- - syn64123611.1
+ - syn54113663.5
+ - syn64123611.2
agora_rename:
symbol: hgnc_symbol
destination: *dest