Update gene info processing for druggability revamp (#163)

* Removed druggability dataset from gene_info provenance
* Updated gene_metadata syn25953363 version from 13 to 14
* Added pharos_classes file to config and test_config
* Updated gene_info transform to use pharos_classes dataset and only include pharos_class in druggability object
* Added pharos classes good input for testing
* Updated test outputs for gene_info to include pharos_class
* pytest passing
* Updated druggability json for gx validation and commented out the validation itself so I can run adt without problems and then update gx validation
* Got gx validation to work. Had to use mostly=0.5, not ideal
* Removed the mostly from the gene_info druggability json validation
* Added docstrings to gene_info tests to prevent issues with CI interrogate
* Sorted biodomains column so we stop seeing unnecessary changes in PRs
* pin ubuntu version
* fixes pin

---------

Co-authored-by: Beatriz Saldana <[email protected]>
Co-authored-by: bwmac <[email protected]>
Sage-Bionetworks · Dec 11, 2024 · 5c5931b · 5c5931b
1 parent 17485f6
commit 5c5931b
Show file tree

Hide file tree

Showing 12 changed files with 243 additions and 252 deletions.
diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml
@@ -15,7 +15,7 @@ on:
 jobs:
   # test job includes unit tests and coverage
   pre-commit:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-20.04
     steps:
       - uses: actions/checkout@v4
         with: { fetch-depth: 0 } # deep clone for setuptools-scm

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -200,7 +200,7 @@ These expectations are defined in the `/great_expectations/gx/plugins/expectatio
 
 #### Nested Columns
 
-If the transform includes nested columns (example: `druggability` column in `gene_info` tranform), please follow these four steps:
+If the transform includes nested columns (example: `ensembl_info` column in `gene_info` transform), please follow these four steps:
 1. In the config file, add the nested column name to the `gx_nested_columns` flag for the specific transform. This will convert the column values to a JSON parsable string.
 ```
 gx_nested_columns:

diff --git a/config.yaml b/config.yaml
@@ -144,7 +144,7 @@ datasets:
   - gene_info:
       files:
         - name: gene_metadata
-          id: syn25953363.13
+          id: syn25953363.14
           format: feather
         - name: igap
           id: syn12514826.5
@@ -162,16 +162,16 @@ datasets:
         - name: median_expression
           id: syn27211878.2
           format: csv
-        - name: druggability
-          id: syn13363443.11
-          format: csv
         - <<: *genes_biodomains_files
         - name: tep_adi_info
           id: syn51942280.3
           format: csv
         - name: ensg_to_uniprot_mapping
           id: syn54113663.3
           format: tsv
+        - name: pharos_classes
+          id: syn64123611.1
+          format: csv
       final_format: json
       custom_transformations:
         adjusted_p_value_threshold: 0.05
@@ -192,7 +192,7 @@ datasets:
         uniprotkb_accession: uniprotkb_accessions
         resource_identifier: ensembl_gene_id
       provenance:
-        - syn25953363.13
+        - syn25953363.14
         - syn12514826.5
         - syn12514912.3
         - *agora_proteomics_provenance
@@ -201,10 +201,10 @@ datasets:
         - *rna_diff_expr_data_provenance
         - syn12540368.51
         - syn27211878.2
-        - syn13363443.11
         - *genes_biodomains_provenance
         - syn51942280.3
         - syn54113663.3
+        - syn64123611.1
       agora_rename:
         symbol: hgnc_symbol
       destination: *dest

diff --git a/gx_suite_definitions/gene_info.ipynb b/gx_suite_definitions/gene_info.ipynb
@@ -272,7 +272,7 @@
     "# biodomains\n",
     "validator.expect_column_values_to_be_of_type(\"biodomains\", \"list\")\n",
     "validator.expect_column_values_to_have_list_members_of_type(column=\"biodomains\", member_type=\"str\", mostly=0.95)\n",
-    "validator.expect_column_values_to_have_list_members(column=\"biodomains\", list_members={\n",
+    "validator.expect_column_values_to_have_list_members(column=\"biodomains\", list_members=sorted([\n",
     "        'Apoptosis',\n",
     "        'Vasculature',\n",
     "        'Lipid Metabolism',\n",
@@ -292,7 +292,7 @@
     "        'RNA Spliceosome',\n",
     "        'Tau Homeostasis',\n",
     "        'Myelination'\n",
-    "    }\n",
+    "    ])\n",
     ")"
    ]
   },

diff --git a/src/agoradatatools/etl/transform/gene_info.py b/src/agoradatatools/etl/transform/gene_info.py
@@ -21,7 +21,7 @@ def transform_gene_info(
     proteomics_srm = transform.transform_proteomics(df=datasets["proteomics_srm"])
     target_list = datasets["target_list"]
     median_expression = datasets["median_expression"]
-    druggability = datasets["druggability"]
+    pharos_classes = datasets["pharos_classes"]
     biodomains = datasets["genes_biodomains"]
     tep_info = datasets["tep_adi_info"]
     uniprot = datasets["ensg_to_uniprot_mapping"]
@@ -49,19 +49,6 @@ def transform_gene_info(
         .reset_index()
     )
 
-    # these are the interesting columns of the druggability dataset
-    useful_columns = [
-        "ensembl_gene_id",
-        "sm_druggability_bucket",
-        "safety_bucket",
-        "abability_bucket",
-        "pharos_class",
-        "classification",
-        "safety_bucket_definition",
-        "abability_bucket_definition",
-    ]
-    druggability = druggability[useful_columns]
-
     target_list = nest_fields(
         df=target_list,
         grouping="ensembl_gene_id",
@@ -77,10 +64,15 @@ def transform_gene_info(
     )
 
     druggability = nest_fields(
-        df=druggability,
+        df=(
+            pharos_classes.groupby("ensembl_gene_id")["pharos_class"]
+            .apply(list)
+            .reset_index()
+        ),
         grouping="ensembl_gene_id",
         new_column="druggability",
         drop_columns=["ensembl_gene_id"],
+        nested_field_is_list=False,
     )
 
     biodomains = biodomains.dropna(subset=["biodomain", "ensembl_gene_id"])

diff --git a/src/agoradatatools/great_expectations/gx/expectations/gene_info.json b/src/agoradatatools/great_expectations/gx/expectations/gene_info.json
@@ -431,46 +431,36 @@
         "json_schema": {
           "$id": "http://example.com/example.json",
           "$schema": "https://json-schema.org/draft/2019-09/schema",
-          "default": [],
-          "items": {
-            "default": {},
-            "properties": {
-              "abability_bucket": {
-                "type": "number"
-              },
-              "abability_bucket_definition": {
-                "maxLength": 1000,
-                "minLength": 44,
-                "type": "string"
-              },
-              "classification": {
-                "maxLength": 1000,
-                "minLength": 22,
-                "type": "string"
-              },
-              "pharos_class": {
-                "type": [
-                  "string",
-                  "null"
-                ]
-              },
-              "safety_bucket": {
-                "type": "number"
-              },
-              "safety_bucket_definition": {
-                "maxLength": 1000,
-                "minLength": 50,
+          "default": null,
+          "examples": [
+            {
+              "pharos_class": [
+                "Tchem"
+              ]
+            }
+          ],
+          "properties": {
+            "pharos_class": {
+              "default": [],
+              "items": {
+                "default": "",
+                "enum": [
+                  "Tdark",
+                  "Tchem",
+                  "Tbio",
+                  "Tclin",
+                  null
+                ],
+                "title": "Pharos object",
                 "type": "string"
               },
-              "sm_druggability_bucket": {
-                "type": "number"
-              }
-            },
-            "type": "object"
+              "title": "The pharos_class Schema",
+              "type": "array"
+            }
           },
-          "title": "Druggability Schema",
+          "title": "Root Schema",
           "type": [
-            "array",
+            "object",
             "null"
           ]
         }
@@ -516,25 +506,25 @@
       "kwargs": {
         "column": "biodomains",
         "list_members": [
-          "Myelination",
-          "Vasculature",
-          "Synapse",
-          "Immune Response",
-          "DNA Repair",
+          "APP Metabolism",
+          "Apoptosis",
           "Autophagy",
-          "Endolysosome",
-          "Proteostasis",
-          "Mitochondrial Metabolism",
           "Cell Cycle",
+          "DNA Repair",
+          "Endolysosome",
           "Epigenetic",
+          "Immune Response",
           "Lipid Metabolism",
           "Metal Binding and Homeostasis",
+          "Mitochondrial Metabolism",
+          "Myelination",
+          "Oxidative Stress",
+          "Proteostasis",
           "RNA Spliceosome",
+          "Structural Stabilization",
+          "Synapse",
           "Tau Homeostasis",
-          "Apoptosis",
-          "Oxidative Stress",
-          "APP Metabolism",
-          "Structural Stabilization"
+          "Vasculature"
         ]
       },
       "meta": {}

diff --git a/src/agoradatatools/great_expectations/gx/json_schemas/gene_info/druggability.json b/src/agoradatatools/great_expectations/gx/json_schemas/gene_info/druggability.json
@@ -1,40 +1,31 @@
 {
     "$schema": "https://json-schema.org/draft/2019-09/schema",
     "$id": "http://example.com/example.json",
-    "type": ["array", "null"],
-    "default": [],
-    "title": "Druggability Schema",
-    "items": {
-        "type": "object",
-        "default": {},
-        "properties": {
-            "sm_druggability_bucket": {
-                "type": "number"
-            },
-            "safety_bucket": {
-                "type": "number"
-            },
-            "abability_bucket": {
-                "type": "number"
-            },
-            "pharos_class": {
-                "type": ["string", "null"]
-            },
-            "classification": {
+    "type": ["object", "null"],
+    "default": null,
+    "title": "Root Schema",
+    "properties": {
+        "pharos_class": {
+            "type": "array",
+            "default": [],
+            "title": "The pharos_class Schema",
+            "items": {
                 "type": "string",
-                "minLength": 22,
-                "maxLength": 1000
-            },
-            "safety_bucket_definition": {
-                "type": "string",
-                "minLength": 50,
-                "maxLength": 1000
-            },
-            "abability_bucket_definition": {
-                "type": "string",
-                "minLength": 44,
-                "maxLength": 1000
+                "default": "",
+                "title": "Pharos object",
+                "enum": [
+                    "Tdark",
+                    "Tchem",
+                    "Tbio",
+                    "Tclin",
+                    null
+                ]
             }
         }
-    }
+    },
+    "examples": [{
+        "pharos_class": [
+            "Tchem"
+        ]
+    }]
 }
diff --git a/test_config.yaml b/test_config.yaml
@@ -144,7 +144,7 @@ datasets:
   - gene_info:
       files:
         - name: gene_metadata
-          id: syn25953363.13
+          id: syn25953363.14
           format: feather
         - name: igap
           id: syn12514826.5
@@ -162,16 +162,16 @@ datasets:
         - name: median_expression
           id: syn27211878.2
           format: csv
-        - name: druggability
-          id: syn13363443.11
-          format: csv
         - <<: *genes_biodomains_files
         - name: tep_adi_info
           id: syn51942280.3
           format: csv
         - name: ensg_to_uniprot_mapping
           id: syn54113663.3
           format: tsv
+        - name: pharos_classes
+          id: syn64123611.1
+          format: csv
       final_format: json
       custom_transformations:
         adjusted_p_value_threshold: 0.05
@@ -192,7 +192,7 @@ datasets:
         uniprotkb_accession: uniprotkb_accessions
         resource_identifier: ensembl_gene_id
       provenance:
-        - syn25953363.13
+        - syn25953363.14
         - syn12514826.5
         - syn12514912.3
         - *agora_proteomics_provenance
@@ -201,10 +201,10 @@ datasets:
         - *rna_diff_expr_data_provenance
         - syn12540368.51
         - syn27211878.2
-        - syn13363443.11
         - *genes_biodomains_provenance
         - syn51942280.3
         - syn54113663.3
+        - syn64123611.1
       agora_rename:
         symbol: hgnc_symbol
       destination: *dest

diff --git a/tests/test_assets/gene_info/input/pharos_classes_good_input.csv b/tests/test_assets/gene_info/input/pharos_classes_good_input.csv
@@ -0,0 +1,20 @@
+ensembl_gene_id,uniprot_id,hgnc_symbol,pharos_class
+ENSG00000000005,Q9H2S6,TNMD,Tbio
+ENSG00000000419,O60762,DPM1,Tbio
+ENSG00000000457,Q8IZE3,SCYL3,Tbio
+ENSG00000000460,Q9NSG2,C1orf112,Tbio
+ENSG00000000938,P09769,FGR,Tchem
+ENSG00000000971,P08603,CFH,Tbio
+ENSG00000001036,Q9BTY2,FUCA2,Tchem
+ENSG00000001084,P48506,GCLC,Tchem
+ENSG00000001167,P23511,NFYA,Tbio
+ENSG00000001460,Q5TH74,STPG1,Tbio
+ENSG00000001461,Q6P499,NIPAL3,Tdark
+ENSG00000001497,Q9Y4W2,LAS1L,Tbio
+ENSG00000001561,Q9Y6X5,ENPP4,Tbio
+ENSG00000001617,Q13275,SEMA3F,Tbio
+ENSG00000001626,P13569,CFTR,Tclin
+ENSG00000001629,Q9P2G1,ANKIB1,Tdark
+ENSG00000001630,Q16850,CYP51A1,Tchem
+ENSG00000001631,O00522,KRIT1,Tbio
+ENSG00000001631,O00522,KRIT1,Tchem