sourced genes_to_phenotype for HPO Subset

ivanwilliammd · Mar 31, 2024 · 56df3c3 · 56df3c3
1 parent 4097b73
commit 56df3c3
Show file tree

Hide file tree

Showing 3 changed files with 304,402 additions and 992,378 deletions.
diff --git a/phenotype/Generate HPO Subset.ipynb b/phenotype/Generate HPO Subset.ipynb
@@ -5,14 +5,6 @@
    "execution_count": 1,
    "metadata": {},
    "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Requirement already satisfied: obonet in c:\\users\\ivan-rtx2080\\.conda\\envs\\datascience\\lib\\site-packages (1.0.0)\n",
-      "Requirement already satisfied: networkx in c:\\users\\ivan-rtx2080\\.conda\\envs\\datascience\\lib\\site-packages (from obonet) (3.1)\n"
-     ]
-    },
     {
      "name": "stderr",
      "output_type": "stream",
@@ -27,9 +19,14 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Requirement already satisfied: pandas in c:\\users\\ivan-rtx2080\\.conda\\envs\\datascience\\lib\\site-packages (1.2.3)\n",
-      "Requirement already satisfied: python-dateutil>=2.7.3 in c:\\users\\ivan-rtx2080\\.conda\\envs\\datascience\\lib\\site-packages (from pandas) (2.8.1)\n",
-      "Requirement already satisfied: pytz>=2017.3 in c:\\users\\ivan-rtx2080\\.conda\\envs\\datascience\\lib\\site-packages (from pandas) (2021.1)\n"
+      "Requirement already satisfied: obonet in c:\\users\\ivan-rtx2080\\.conda\\envs\\datascience\\lib\\site-packages (1.0.0)\n",
+      "Requirement already satisfied: networkx in c:\\users\\ivan-rtx2080\\.conda\\envs\\datascience\\lib\\site-packages (from obonet) (3.1)\n",
+      "Requirement already satisfied: pandas in c:\\users\\ivan-rtx2080\\.conda\\envs\\datascience\\lib\\site-packages (2.0.3)\n",
+      "Requirement already satisfied: python-dateutil>=2.8.2 in c:\\users\\ivan-rtx2080\\.conda\\envs\\datascience\\lib\\site-packages (from pandas) (2.9.0.post0)\n",
+      "Requirement already satisfied: pytz>=2020.1 in c:\\users\\ivan-rtx2080\\.conda\\envs\\datascience\\lib\\site-packages (from pandas) (2021.1)\n",
+      "Requirement already satisfied: tzdata>=2022.1 in c:\\users\\ivan-rtx2080\\.conda\\envs\\datascience\\lib\\site-packages (from pandas) (2024.1)\n",
+      "Requirement already satisfied: numpy>=1.20.3 in c:\\users\\ivan-rtx2080\\.conda\\envs\\datascience\\lib\\site-packages (from pandas) (1.24.4)\n",
+      "Requirement already satisfied: six>=1.5 in c:\\users\\ivan-rtx2080\\.conda\\envs\\datascience\\lib\\site-packages (from python-dateutil>=2.8.2->pandas) (1.15.0)\n"
      ]
     },
     {
@@ -41,14 +38,6 @@
       "[notice] A new release of pip is available: 23.2.1 -> 24.0\n",
       "[notice] To update, run: python.exe -m pip install --upgrade pip\n"
      ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Requirement already satisfied: numpy>=1.16.5 in c:\\users\\ivan-rtx2080\\.conda\\envs\\datascience\\lib\\site-packages (from pandas) (1.24.4)\n",
-      "Requirement already satisfied: six>=1.5 in c:\\users\\ivan-rtx2080\\.conda\\envs\\datascience\\lib\\site-packages (from python-dateutil>=2.7.3->pandas) (1.15.0)\n"
-     ]
     }
    ],
    "source": [
@@ -69,7 +58,7 @@
     "\n",
     "folder_source = 'rawdl_20240310'\n",
     "ontology_file = 'hp.obo'\n",
-    "hpo_file = 'phenotype_to_genes.txt'"
+    "hpo_file = 'genes_to_phenotype.txt'"
    ]
   },
   {
@@ -14959,16 +14948,16 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Read the phenotype_to_genes file, split it to ORPHA and OMIM and save to files\n",
-    "phenotype_to_genes = pd.read_csv(os.path.join(folder_source, hpo_file), sep='\\t')\n",
+    "# Read the genes_to_phenotype file, split it into ORPHA and OMIM subsets, and save each subset to its own file\n",
+    "genes_to_phenotype = pd.read_csv(os.path.join(folder_source, hpo_file), sep='\\t')\n",
     "\n",
-    "# read phenotype_to_genes and filter the disease_id having prefix ORPHA:\n",
-    "orpha_to_hpo = phenotype_to_genes[phenotype_to_genes['disease_id'].str.contains('ORPHA:')]\n",
+    "# Keep only the genes_to_phenotype rows whose disease_id contains 'ORPHA:'\n",
+    "orpha_to_hpo = genes_to_phenotype[genes_to_phenotype['disease_id'].str.contains('ORPHA:')]\n",
     "orpha_to_hpo = orpha_to_hpo.drop_duplicates()\n",
     "orpha_to_hpo[['disease_id', 'hpo_id', 'hpo_name']].to_csv(os.path.join('subset', 'orpha2hpo_subset.tsv'), sep='\\t', index=False)\n",
     "\n",
-    "# read phenotype_to_genes and filter the disease_id having prefix OMIM:\n",
-    "omim_to_hpo = phenotype_to_genes[phenotype_to_genes['disease_id'].str.contains('OMIM:')]\n",
+    "# Keep only the genes_to_phenotype rows whose disease_id contains 'OMIM:'\n",
+    "omim_to_hpo = genes_to_phenotype[genes_to_phenotype['disease_id'].str.contains('OMIM:')]\n",
     "omim_to_hpo = omim_to_hpo.drop_duplicates()\n",
     "omim_to_hpo[['disease_id', 'hpo_id', 'hpo_name']].to_csv(os.path.join('subset', 'omim2hpo_subset.tsv'), sep='\\t', index=False)"
    ]