From 5c5931bda07131e0dc18c2743ad9fab9678f8da2 Mon Sep 17 00:00:00 2001 From: Beatriz Saldana <37123591+beatrizsaldana@users.noreply.github.com> Date: Wed, 11 Dec 2024 15:20:54 -0800 Subject: [PATCH] Update gene info processing for druggability revamp (#163) * Removed druggability dataset from gene_info provenance * Updated gene_metadata syn25953363 version from 13 to 14 * Added pharos_classes file to config and test_config * Updated gene_info transform to use pharos_classes dataset and only include pharos_class in druggability object * Added pharos classes good input for testing * Updated test outputs for gene_info to include pharos_class * pytest passing * Updated druggability json for gx validation and commented out the validation itself so I can run adt without problems and then update gx validation * Got gx validation to work. Had to use mostly=0.5, not ideal * Removed the mostly from the gene_info druggability json validation * Added docstrings to gene_info tests to prevent issues with CI interrogate * Sorted biodomains column so we stop seeing unnecessary changes in PRs * pin ubuntu version * fixes pin --------- Co-authored-by: Beatriz Saldana Co-authored-by: bwmac --- .github/workflows/dev.yml | 2 +- CONTRIBUTING.md | 2 +- config.yaml | 12 +- gx_suite_definitions/gene_info.ipynb | 4 +- src/agoradatatools/etl/transform/gene_info.py | 22 +--- .../gx/expectations/gene_info.json | 86 ++++++------ .../json_schemas/gene_info/druggability.json | 57 ++++---- test_config.yaml | 12 +- .../input/pharos_classes_good_input.csv | 20 +++ .../output/gene_info_good_output_1.json | 122 ++++++++---------- .../output/gene_info_good_output_2.json | 122 ++++++++---------- tests/transform/test_gene_info.py | 34 ++++- 12 files changed, 243 insertions(+), 252 deletions(-) create mode 100644 tests/test_assets/gene_info/input/pharos_classes_good_input.csv diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index 191af5e7..c35bfe4e 100644 --- a/.github/workflows/dev.yml 
+++ b/.github/workflows/dev.yml @@ -15,7 +15,7 @@ on: jobs: # test job includes unit tests and coverage pre-commit: - runs-on: ubuntu-latest + runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v4 with: { fetch-depth: 0 } # deep clone for setuptools-scm diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index e18f6047..32cb6fed 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -200,7 +200,7 @@ These expectations are defined in the `/great_expectations/gx/plugins/expectatio #### Nested Columns -If the transform includes nested columns (example: `druggability` column in `gene_info` tranform), please follow these four steps: +If the transform includes nested columns (example: `ensembl_info` column in `gene_info` transform), please follow these four steps: 1. In the config file, add the nested column name to the `gx_nested_columns` flag for the specific transform. This will convert the column values to a JSON parsable string. ``` gx_nested_columns: diff --git a/config.yaml b/config.yaml index b6ff6073..af4c3231 100644 --- a/config.yaml +++ b/config.yaml @@ -144,7 +144,7 @@ datasets: - gene_info: files: - name: gene_metadata - id: syn25953363.13 + id: syn25953363.14 format: feather - name: igap id: syn12514826.5 @@ -162,9 +162,6 @@ datasets: - name: median_expression id: syn27211878.2 format: csv - - name: druggability - id: syn13363443.11 - format: csv - <<: *genes_biodomains_files - name: tep_adi_info id: syn51942280.3 @@ -172,6 +169,9 @@ datasets: - name: ensg_to_uniprot_mapping id: syn54113663.3 format: tsv + - name: pharos_classes + id: syn64123611.1 + format: csv final_format: json custom_transformations: adjusted_p_value_threshold: 0.05 @@ -192,7 +192,7 @@ datasets: uniprotkb_accession: uniprotkb_accessions resource_identifier: ensembl_gene_id provenance: - - syn25953363.13 + - syn25953363.14 - syn12514826.5 - syn12514912.3 - *agora_proteomics_provenance @@ -201,10 +201,10 @@ datasets: - *rna_diff_expr_data_provenance - syn12540368.51 - syn27211878.2 - - 
syn13363443.11 - *genes_biodomains_provenance - syn51942280.3 - syn54113663.3 + - syn64123611.1 agora_rename: symbol: hgnc_symbol destination: *dest diff --git a/gx_suite_definitions/gene_info.ipynb b/gx_suite_definitions/gene_info.ipynb index f8095045..98c102cb 100644 --- a/gx_suite_definitions/gene_info.ipynb +++ b/gx_suite_definitions/gene_info.ipynb @@ -272,7 +272,7 @@ "# biodomains\n", "validator.expect_column_values_to_be_of_type(\"biodomains\", \"list\")\n", "validator.expect_column_values_to_have_list_members_of_type(column=\"biodomains\", member_type=\"str\", mostly=0.95)\n", - "validator.expect_column_values_to_have_list_members(column=\"biodomains\", list_members={\n", + "validator.expect_column_values_to_have_list_members(column=\"biodomains\", list_members=sorted([\n", " 'Apoptosis',\n", " 'Vasculature',\n", " 'Lipid Metabolism',\n", @@ -292,7 +292,7 @@ " 'RNA Spliceosome',\n", " 'Tau Homeostasis',\n", " 'Myelination'\n", - " }\n", + " ])\n", ")" ] }, diff --git a/src/agoradatatools/etl/transform/gene_info.py b/src/agoradatatools/etl/transform/gene_info.py index 8c5d588b..34a3e57e 100644 --- a/src/agoradatatools/etl/transform/gene_info.py +++ b/src/agoradatatools/etl/transform/gene_info.py @@ -21,7 +21,7 @@ def transform_gene_info( proteomics_srm = transform.transform_proteomics(df=datasets["proteomics_srm"]) target_list = datasets["target_list"] median_expression = datasets["median_expression"] - druggability = datasets["druggability"] + pharos_classes = datasets["pharos_classes"] biodomains = datasets["genes_biodomains"] tep_info = datasets["tep_adi_info"] uniprot = datasets["ensg_to_uniprot_mapping"] @@ -49,19 +49,6 @@ def transform_gene_info( .reset_index() ) - # these are the interesting columns of the druggability dataset - useful_columns = [ - "ensembl_gene_id", - "sm_druggability_bucket", - "safety_bucket", - "abability_bucket", - "pharos_class", - "classification", - "safety_bucket_definition", - "abability_bucket_definition", - ] - 
druggability = druggability[useful_columns] - target_list = nest_fields( df=target_list, grouping="ensembl_gene_id", @@ -77,10 +64,15 @@ def transform_gene_info( ) druggability = nest_fields( - df=druggability, + df=( + pharos_classes.groupby("ensembl_gene_id")["pharos_class"] + .apply(list) + .reset_index() + ), grouping="ensembl_gene_id", new_column="druggability", drop_columns=["ensembl_gene_id"], + nested_field_is_list=False, ) biodomains = biodomains.dropna(subset=["biodomain", "ensembl_gene_id"]) diff --git a/src/agoradatatools/great_expectations/gx/expectations/gene_info.json b/src/agoradatatools/great_expectations/gx/expectations/gene_info.json index 9ca0d112..6c0346f2 100644 --- a/src/agoradatatools/great_expectations/gx/expectations/gene_info.json +++ b/src/agoradatatools/great_expectations/gx/expectations/gene_info.json @@ -431,46 +431,36 @@ "json_schema": { "$id": "http://example.com/example.json", "$schema": "https://json-schema.org/draft/2019-09/schema", - "default": [], - "items": { - "default": {}, - "properties": { - "abability_bucket": { - "type": "number" - }, - "abability_bucket_definition": { - "maxLength": 1000, - "minLength": 44, - "type": "string" - }, - "classification": { - "maxLength": 1000, - "minLength": 22, - "type": "string" - }, - "pharos_class": { - "type": [ - "string", - "null" - ] - }, - "safety_bucket": { - "type": "number" - }, - "safety_bucket_definition": { - "maxLength": 1000, - "minLength": 50, + "default": null, + "examples": [ + { + "pharos_class": [ + "Tchem" + ] + } + ], + "properties": { + "pharos_class": { + "default": [], + "items": { + "default": "", + "enum": [ + "Tdark", + "Tchem", + "Tbio", + "Tclin", + null + ], + "title": "Pharos object", "type": "string" }, - "sm_druggability_bucket": { - "type": "number" - } - }, - "type": "object" + "title": "The pharos_class Schema", + "type": "array" + } }, - "title": "Druggability Schema", + "title": "Root Schema", "type": [ - "array", + "object", "null" ] } @@ -516,25 
+506,25 @@ "kwargs": { "column": "biodomains", "list_members": [ - "Myelination", - "Vasculature", - "Synapse", - "Immune Response", - "DNA Repair", + "APP Metabolism", + "Apoptosis", "Autophagy", - "Endolysosome", - "Proteostasis", - "Mitochondrial Metabolism", "Cell Cycle", + "DNA Repair", + "Endolysosome", "Epigenetic", + "Immune Response", "Lipid Metabolism", "Metal Binding and Homeostasis", + "Mitochondrial Metabolism", + "Myelination", + "Oxidative Stress", + "Proteostasis", "RNA Spliceosome", + "Structural Stabilization", + "Synapse", "Tau Homeostasis", - "Apoptosis", - "Oxidative Stress", - "APP Metabolism", - "Structural Stabilization" + "Vasculature" ] }, "meta": {} diff --git a/src/agoradatatools/great_expectations/gx/json_schemas/gene_info/druggability.json b/src/agoradatatools/great_expectations/gx/json_schemas/gene_info/druggability.json index b46996e2..41d22fe2 100644 --- a/src/agoradatatools/great_expectations/gx/json_schemas/gene_info/druggability.json +++ b/src/agoradatatools/great_expectations/gx/json_schemas/gene_info/druggability.json @@ -1,40 +1,31 @@ { "$schema": "https://json-schema.org/draft/2019-09/schema", "$id": "http://example.com/example.json", - "type": ["array", "null"], - "default": [], - "title": "Druggability Schema", - "items": { - "type": "object", - "default": {}, - "properties": { - "sm_druggability_bucket": { - "type": "number" - }, - "safety_bucket": { - "type": "number" - }, - "abability_bucket": { - "type": "number" - }, - "pharos_class": { - "type": ["string", "null"] - }, - "classification": { + "type": ["object", "null"], + "default": null, + "title": "Root Schema", + "properties": { + "pharos_class": { + "type": "array", + "default": [], + "title": "The pharos_class Schema", + "items": { "type": "string", - "minLength": 22, - "maxLength": 1000 - }, - "safety_bucket_definition": { - "type": "string", - "minLength": 50, - "maxLength": 1000 - }, - "abability_bucket_definition": { - "type": "string", - "minLength": 44, - 
"maxLength": 1000 + "default": "", + "title": "Pharos object", + "enum": [ + "Tdark", + "Tchem", + "Tbio", + "Tclin", + null + ] } } - } + }, + "examples": [{ + "pharos_class": [ + "Tchem" + ] + }] } diff --git a/test_config.yaml b/test_config.yaml index a76eb704..29feb44f 100644 --- a/test_config.yaml +++ b/test_config.yaml @@ -144,7 +144,7 @@ datasets: - gene_info: files: - name: gene_metadata - id: syn25953363.13 + id: syn25953363.14 format: feather - name: igap id: syn12514826.5 @@ -162,9 +162,6 @@ datasets: - name: median_expression id: syn27211878.2 format: csv - - name: druggability - id: syn13363443.11 - format: csv - <<: *genes_biodomains_files - name: tep_adi_info id: syn51942280.3 @@ -172,6 +169,9 @@ datasets: - name: ensg_to_uniprot_mapping id: syn54113663.3 format: tsv + - name: pharos_classes + id: syn64123611.1 + format: csv final_format: json custom_transformations: adjusted_p_value_threshold: 0.05 @@ -192,7 +192,7 @@ datasets: uniprotkb_accession: uniprotkb_accessions resource_identifier: ensembl_gene_id provenance: - - syn25953363.13 + - syn25953363.14 - syn12514826.5 - syn12514912.3 - *agora_proteomics_provenance @@ -201,10 +201,10 @@ datasets: - *rna_diff_expr_data_provenance - syn12540368.51 - syn27211878.2 - - syn13363443.11 - *genes_biodomains_provenance - syn51942280.3 - syn54113663.3 + - syn64123611.1 agora_rename: symbol: hgnc_symbol destination: *dest diff --git a/tests/test_assets/gene_info/input/pharos_classes_good_input.csv b/tests/test_assets/gene_info/input/pharos_classes_good_input.csv new file mode 100644 index 00000000..1306491f --- /dev/null +++ b/tests/test_assets/gene_info/input/pharos_classes_good_input.csv @@ -0,0 +1,20 @@ +ensembl_gene_id,uniprot_id,hgnc_symbol,pharos_class +ENSG00000000005,Q9H2S6,TNMD,Tbio +ENSG00000000419,O60762,DPM1,Tbio +ENSG00000000457,Q8IZE3,SCYL3,Tbio +ENSG00000000460,Q9NSG2,C1orf112,Tbio +ENSG00000000938,P09769,FGR,Tchem +ENSG00000000971,P08603,CFH,Tbio +ENSG00000001036,Q9BTY2,FUCA2,Tchem 
+ENSG00000001084,P48506,GCLC,Tchem +ENSG00000001167,P23511,NFYA,Tbio +ENSG00000001460,Q5TH74,STPG1,Tbio +ENSG00000001461,Q6P499,NIPAL3,Tdark +ENSG00000001497,Q9Y4W2,LAS1L,Tbio +ENSG00000001561,Q9Y6X5,ENPP4,Tbio +ENSG00000001617,Q13275,SEMA3F,Tbio +ENSG00000001626,P13569,CFTR,Tclin +ENSG00000001629,Q9P2G1,ANKIB1,Tdark +ENSG00000001630,Q16850,CYP51A1,Tchem +ENSG00000001631,O00522,KRIT1,Tbio +ENSG00000001631,O00522,KRIT1,Tchem diff --git a/tests/test_assets/gene_info/output/gene_info_good_output_1.json b/tests/test_assets/gene_info/output/gene_info_good_output_1.json index c39db39e..a86c1957 100644 --- a/tests/test_assets/gene_info/output/gene_info_good_output_1.json +++ b/tests/test_assets/gene_info/output/gene_info_good_output_1.json @@ -32,17 +32,9 @@ } ], "median_expression": null, - "druggability": [ - { - "sm_druggability_bucket": 7, - "safety_bucket": 4, - "abability_bucket": 3, - "pharos_class": "Tbio", - "classification": null, - "safety_bucket_definition": "Safety definition 1", - "abability_bucket_definition": "Abability definition 1" - } - ], + "druggability": { + "pharos_class":["Tbio"] + }, "total_nominations": 1.0, "biodomains": [ "Proteostasis", @@ -148,7 +140,9 @@ "tissue": "IFG" } ], - "druggability": null, + "druggability": { + "pharos_class":["Tbio"] + }, "total_nominations": 3.0, "biodomains": [ "Apoptosis" @@ -207,7 +201,9 @@ "tissue": "TCX" } ], - "druggability": null, + "druggability": { + "pharos_class":["Tbio"] + }, "total_nominations": 1.0, "biodomains": [ "Structural Stabilization" @@ -243,17 +239,9 @@ "protein_brain_change_studied": false, "target_nominations": null, "median_expression": null, - "druggability": [ - { - "sm_druggability_bucket": 13, - "safety_bucket": 3, - "abability_bucket": 3, - "pharos_class": "Tdark", - "classification": "Classification 3", - "safety_bucket_definition": "Safety definition 3", - "abability_bucket_definition": null - } - ], + "druggability": { + "pharos_class":["Tbio"] + }, "total_nominations": null, 
"biodomains": null, "is_adi": false, @@ -290,7 +278,9 @@ "protein_brain_change_studied": false, "target_nominations": null, "median_expression": null, - "druggability": null, + "druggability": { + "pharos_class":["Tchem"] + }, "total_nominations": null, "biodomains": null, "is_adi": false, @@ -338,17 +328,9 @@ "tissue": "DLPFC" } ], - "druggability": [ - { - "sm_druggability_bucket": 3, - "safety_bucket": 4, - "abability_bucket": 1, - "pharos_class": "Tbio", - "classification": "Classification 4", - "safety_bucket_definition": "Safety definition 4", - "abability_bucket_definition": "Abability definition 4" - } - ], + "druggability": { + "pharos_class":["Tbio"] + }, "total_nominations": null, "biodomains": null, "is_adi": false, @@ -398,17 +380,9 @@ "tissue": "IFG" } ], - "druggability": [ - { - "sm_druggability_bucket": 1, - "safety_bucket": 3, - "abability_bucket": 1, - "pharos_class": "Tchem", - "classification": "Classification 2", - "safety_bucket_definition": null, - "abability_bucket_definition": "Abability definition 2" - } - ], + "druggability": { + "pharos_class":["Tchem"] + }, "total_nominations": null, "biodomains": null, "is_adi": false, @@ -442,17 +416,9 @@ "protein_brain_change_studied": true, "target_nominations": null, "median_expression": null, - "druggability": [ - { - "sm_druggability_bucket": 1, - "safety_bucket": 5, - "abability_bucket": 3, - "pharos_class": null, - "classification": "Classification 5", - "safety_bucket_definition": "Safety definition 5", - "abability_bucket_definition": "Abability definition 5" - } - ], + "druggability": { + "pharos_class":["Tchem"] + }, "total_nominations": null, "biodomains": null, "is_adi": false, @@ -486,7 +452,9 @@ "protein_brain_change_studied": false, "target_nominations": null, "median_expression": null, - "druggability": null, + "druggability": { + "pharos_class":["Tbio"] + }, "total_nominations": null, "biodomains": null, "is_adi": false, @@ -518,7 +486,9 @@ "protein_brain_change_studied": false, 
"target_nominations": null, "median_expression": null, - "druggability": null, + "druggability": { + "pharos_class":["Tbio"] + }, "total_nominations": null, "biodomains": null, "is_adi": false, @@ -551,7 +521,9 @@ "protein_brain_change_studied": false, "target_nominations": null, "median_expression": null, - "druggability": null, + "druggability": { + "pharos_class":["Tdark"] + }, "total_nominations": null, "biodomains": null, "is_adi": false, @@ -586,7 +558,9 @@ "protein_brain_change_studied": false, "target_nominations": null, "median_expression": null, - "druggability": null, + "druggability": { + "pharos_class":["Tbio"] + }, "total_nominations": null, "biodomains": null, "is_adi": true, @@ -617,7 +591,9 @@ "protein_brain_change_studied": false, "target_nominations": null, "median_expression": null, - "druggability": null, + "druggability": { + "pharos_class":["Tbio"] + }, "total_nominations": null, "biodomains": null, "is_adi": false, @@ -650,7 +626,9 @@ "protein_brain_change_studied": false, "target_nominations": null, "median_expression": null, - "druggability": null, + "druggability": { + "pharos_class":["Tbio"] + }, "total_nominations": null, "biodomains": null, "is_adi": false, @@ -688,7 +666,9 @@ "protein_brain_change_studied": false, "target_nominations": null, "median_expression": null, - "druggability": null, + "druggability": { + "pharos_class":["Tclin"] + }, "total_nominations": null, "biodomains": null, "is_adi": false, @@ -717,7 +697,9 @@ "protein_brain_change_studied": true, "target_nominations": null, "median_expression": null, - "druggability": null, + "druggability": { + "pharos_class":["Tdark"] + }, "total_nominations": null, "biodomains": null, "is_adi": false, @@ -753,7 +735,9 @@ "protein_brain_change_studied": true, "target_nominations": null, "median_expression": null, - "druggability": null, + "druggability": { + "pharos_class":["Tchem"] + }, "total_nominations": null, "biodomains": null, "is_adi": false, @@ -804,7 +788,9 @@ "tissue": 
"IFG" } ], - "druggability": null, + "druggability": { + "pharos_class":["Tbio", "Tchem"] + }, "total_nominations": null, "biodomains": null, "is_adi": false, diff --git a/tests/test_assets/gene_info/output/gene_info_good_output_2.json b/tests/test_assets/gene_info/output/gene_info_good_output_2.json index dc8ad4c0..bf0fcaf0 100644 --- a/tests/test_assets/gene_info/output/gene_info_good_output_2.json +++ b/tests/test_assets/gene_info/output/gene_info_good_output_2.json @@ -32,17 +32,9 @@ } ], "median_expression": null, - "druggability": [ - { - "sm_druggability_bucket": 7, - "safety_bucket": 4, - "abability_bucket": 3, - "pharos_class": "Tbio", - "classification": null, - "safety_bucket_definition": "Safety definition 1", - "abability_bucket_definition": "Abability definition 1" - } - ], + "druggability": { + "pharos_class":["Tbio"] + }, "total_nominations": 1.0, "biodomains": [ "Proteostasis", @@ -146,7 +138,9 @@ "tissue": "IFG" } ], - "druggability": null, + "druggability": { + "pharos_class":["Tbio"] + }, "total_nominations": 3.0, "biodomains": [ "Apoptosis" @@ -203,7 +197,9 @@ "tissue": "TCX" } ], - "druggability": null, + "druggability": { + "pharos_class":["Tbio"] + }, "total_nominations": 1.0, "biodomains": [ "Structural Stabilization" @@ -237,17 +233,9 @@ "protein_brain_change_studied": false, "target_nominations": null, "median_expression": null, - "druggability": [ - { - "sm_druggability_bucket": 13, - "safety_bucket": 3, - "abability_bucket": 3, - "pharos_class": "Tdark", - "classification": "Classification 3", - "safety_bucket_definition": "Safety definition 3", - "abability_bucket_definition": null - } - ], + "druggability": { + "pharos_class":["Tbio"] + }, "total_nominations": null, "biodomains": null, "is_adi": false, @@ -282,7 +270,9 @@ "protein_brain_change_studied": false, "target_nominations": null, "median_expression": null, - "druggability": null, + "druggability": { + "pharos_class":["Tchem"] + }, "total_nominations": null, "biodomains": null, 
"is_adi": false, @@ -328,17 +318,9 @@ "tissue": "DLPFC" } ], - "druggability": [ - { - "sm_druggability_bucket": 3, - "safety_bucket": 4, - "abability_bucket": 1, - "pharos_class": "Tbio", - "classification": "Classification 4", - "safety_bucket_definition": "Safety definition 4", - "abability_bucket_definition": "Abability definition 4" - } - ], + "druggability": { + "pharos_class":["Tbio"] + }, "total_nominations": null, "biodomains": null, "is_adi": false, @@ -386,17 +368,9 @@ "tissue": "IFG" } ], - "druggability": [ - { - "sm_druggability_bucket": 1, - "safety_bucket": 3, - "abability_bucket": 1, - "pharos_class": "Tchem", - "classification": "Classification 2", - "safety_bucket_definition": null, - "abability_bucket_definition": "Abability definition 2" - } - ], + "druggability": { + "pharos_class":["Tchem"] + }, "total_nominations": null, "biodomains": null, "is_adi": false, @@ -428,17 +402,9 @@ "protein_brain_change_studied": true, "target_nominations": null, "median_expression": null, - "druggability": [ - { - "sm_druggability_bucket": 1, - "safety_bucket": 5, - "abability_bucket": 3, - "pharos_class": null, - "classification": "Classification 5", - "safety_bucket_definition": "Safety definition 5", - "abability_bucket_definition": "Abability definition 5" - } - ], + "druggability": { + "pharos_class":["Tchem"] + }, "total_nominations": null, "biodomains": null, "is_adi": false, @@ -470,7 +436,9 @@ "protein_brain_change_studied": false, "target_nominations": null, "median_expression": null, - "druggability": null, + "druggability": { + "pharos_class":["Tbio"] + }, "total_nominations": null, "biodomains": null, "is_adi": false, @@ -500,7 +468,9 @@ "protein_brain_change_studied": false, "target_nominations": null, "median_expression": null, - "druggability": null, + "druggability": { + "pharos_class":["Tbio"] + }, "total_nominations": null, "biodomains": null, "is_adi": false, @@ -531,7 +501,9 @@ "protein_brain_change_studied": false, "target_nominations": 
null, "median_expression": null, - "druggability": null, + "druggability": { + "pharos_class":["Tdark"] + }, "total_nominations": null, "biodomains": null, "is_adi": false, @@ -564,7 +536,9 @@ "protein_brain_change_studied": false, "target_nominations": null, "median_expression": null, - "druggability": null, + "druggability": { + "pharos_class":["Tbio"] + }, "total_nominations": null, "biodomains": null, "is_adi": true, @@ -593,7 +567,9 @@ "protein_brain_change_studied": false, "target_nominations": null, "median_expression": null, - "druggability": null, + "druggability": { + "pharos_class":["Tbio"] + }, "total_nominations": null, "biodomains": null, "is_adi": false, @@ -624,7 +600,9 @@ "protein_brain_change_studied": false, "target_nominations": null, "median_expression": null, - "druggability": null, + "druggability": { + "pharos_class":["Tbio"] + }, "total_nominations": null, "biodomains": null, "is_adi": false, @@ -659,7 +637,9 @@ "protein_brain_change_studied": false, "target_nominations": null, "median_expression": null, - "druggability": null, + "druggability": { + "pharos_class":["Tclin"] + }, "total_nominations": null, "biodomains": null, "is_adi": false, @@ -686,7 +666,9 @@ "protein_brain_change_studied": true, "target_nominations": null, "median_expression": null, - "druggability": null, + "druggability": { + "pharos_class":["Tdark"] + }, "total_nominations": null, "biodomains": null, "is_adi": false, @@ -720,7 +702,9 @@ "protein_brain_change_studied": true, "target_nominations": null, "median_expression": null, - "druggability": null, + "druggability": { + "pharos_class":["Tchem"] + }, "total_nominations": null, "biodomains": null, "is_adi": false, @@ -769,7 +753,9 @@ "tissue": "IFG" } ], - "druggability": null, + "druggability": { + "pharos_class":["Tbio", "Tchem"] + }, "total_nominations": null, "biodomains": null, "is_adi": false, diff --git a/tests/transform/test_gene_info.py b/tests/transform/test_gene_info.py index 43fe4e57..3308c53c 100644 --- 
a/tests/transform/test_gene_info.py +++ b/tests/transform/test_gene_info.py @@ -54,10 +54,9 @@ "tissue" value is missing. Duplicate Ensembl IDs are allowed due to multiple tissues, so the test file has several rows with the same Ensembl ID but different tissue value. failing input: none - druggability: information on the druggability and safety of each gene. + pharos_classes: information on the pharos class of each gene. passing input: any field can be missing, so there are a few rows with missing data in at least one column. - Duplicate Ensembl IDs are technically allowed, but this does not happen (or make sense) in the - full dataset, so we do not test it. + Duplicate Ensembl IDs are allowed and do happen in the real dataset, so we test for it. failing input: none genes_biodomains: a list of Ensembl IDs and their associated biodomains and GO terms. passing input: any field can be missing, so the test file has rows with missing data in at least one column. @@ -123,7 +122,7 @@ class TestTransformGeneInfo: "proteomics_srm": "proteomics_srm_good_input.csv", "target_list": "target_list_good_input.csv", "median_expression": "median_expression_good_input.csv", - "druggability": "druggability_good_input.csv", + "pharos_classes": "pharos_classes_good_input.csv", "genes_biodomains": "genes_biodomains_good_input.csv", "tep_adi_info": "tep_adi_info_good_input.csv", "ensg_to_uniprot_mapping": "ensg_to_uniprot_mapping_good.tsv", @@ -265,6 +264,18 @@ def read_input_files_dict(self, input_files_dict: dict) -> dict: def test_transform_gene_info_should_pass( self, input_files_dict: dict, expected_output_file: str, param_set: dict ): + """ + Test that the transform_gene_info function passes with the given input files and parameters. 
+ + Args: + input_files_dict: a dictionary where the keys are the names of the datasets, as expected by + transform_gene_info, and the values are the filenames to load + expected_output_file: the filename of the expected output JSON file + param_set: a dictionary of parameters to pass to transform_gene_info + + Returns: + None + """ datasets = self.read_input_files_dict(input_files_dict) output_df = gene_info.transform_gene_info( @@ -293,6 +304,21 @@ def test_transform_gene_info_should_fail( error_type: BaseException, error_match_string: str, ): + """ + Test that the transform_gene_info function fails with the given input files and parameters. + + Args: + input_files_dict: a dictionary where the keys are the names of the datasets, as expected by + transform_gene_info, and the values are the filenames to load + failure_case_files_dict: a dictionary where the keys are the names of the datasets with bad data, + and the values are the filenames to load + param_set: a dictionary of parameters to pass to transform_gene_info + error_type: the type of error that should be raised + error_match_string: a string to match against the error message + + Returns: + None + """ # Need to make a copy, otherwise this edits the original dictionary and persists through all the tests updated_files_dict = input_files_dict.copy()