diff --git a/eds_scikit/biology/cleaning/main.py b/eds_scikit/biology/cleaning/main.py index a90fb6d7..08e1a3d0 100644 --- a/eds_scikit/biology/cleaning/main.py +++ b/eds_scikit/biology/cleaning/main.py @@ -3,20 +3,11 @@ from eds_scikit.biology.cleaning.cohort import select_cohort from eds_scikit.biology.cleaning.transform import transform_measurement -from eds_scikit.biology.cleaning.utils import check_data_and_select_columns -from eds_scikit.biology.utils.process_concepts import ( - ConceptsSet, - fetch_all_concepts_set, - get_concept_src_to_std, -) -from eds_scikit.biology.utils.process_measurement import ( - filter_measurement_by_date, - get_measurement_std, - get_valid_measurement, -) +from eds_scikit.biology.utils.check_data import check_data_and_select_columns_measurement +from eds_scikit.biology.utils.process_concepts import ConceptsSet +from eds_scikit.biology.utils.prepare_measurement import prepare_measurement_table from eds_scikit.io import settings from eds_scikit.utils.typing import Data, DataFrame -from eds_scikit.biology.utils.prepare_df import prepare_biology_relationship default_standard_terminologies = settings.standard_terminologies default_standard_concept_regex = settings.standard_concept_regex @@ -25,7 +16,7 @@ def bioclean( data: Data, concepts_sets: List[ConceptsSet] = None, - config_name: str = None, + config_name: str = None, start_date: datetime = None, end_date: datetime = None, studied_cohort: Union[DataFrame, List[int]] = None, @@ -61,56 +52,14 @@ def bioclean( Data Same as the input with the transformed `bioclean` table """ - # Check the data and extract them - measurement, concept, concept_relationship = check_data_and_select_columns(data) - - # Filter valid measurement - measurement_valid = get_valid_measurement(measurement) - - # Filter measurement by date - measurement_timed = filter_measurement_by_date( - measurement_valid, start_date, end_date - ) - - # Query concepts-set information - #if concepts_sets is None: - # 
concepts_sets = fetch_all_concepts_set() - - # Map biology concept - source_terminologies = { - "ANALYSES_LABORATOIRE": r"Analyses Laboratoire", - "GLIMS_ANABIO": r"GLIMS.{0,20}Anabio", - "GLIMS_LOINC": r"GLIMS.{0,20}LOINC", - "ANABIO_ITM": r"ITM - ANABIO", - "LOINC_ITM": r"ITM - LOINC", - } - - mapping = [ - ("ANALYSES_LABORATOIRE", "GLIMS_ANABIO", "Maps to"), - ("ANALYSES_LABORATOIRE", "GLIMS_LOINC", "Maps to"), - ("GLIMS_ANABIO", "ANABIO_ITM", "Mapped from"), - ("ANABIO_ITM", "LOINC_ITM", "Maps to"), - ] - biology_relationship_table = prepare_biology_relationship(data, - source_terminologies, - mapping, - concepts_sets=concepts_sets) + measurements = prepare_measurement_table(data, start_date, end_date, concepts_sets, cohort=None, convert_units=False, outliers_detection=None) - measurement_std_filtered = measurement_timed.merge(biology_relationship_table, - left_on="measurement_source_concept_id", - right_on=f"{mapping[0][0]}_concept_id") - - # Extract concept-set - measurement_std_filtered = measurement_std_filtered.drop(columns=["measurement_source_concept_id"]) - measurement_std_filtered = measurement_std_filtered.rename(columns={"GLIMS_ANABIO_concept_code" : "AnaBio_concept_code", - "GLIMS_LOINC_concept_code" : "LOINC_concept_code"}) - measurement_std_filtered = measurement_std_filtered.drop(columns=["ANALYSES_LABORATOIRE_concept_code", "ANABIO_ITM_concept_code", "LOINC_ITM_concept_code"]) # Filter Measurement if studied_cohort: measurement_std_filtered = select_cohort( measurements, studied_cohort ) # Transform values - data.bioclean = transform_measurement(measurement_std_filtered, clip, config_name) + data.bioclean = transform_measurement(measurement_std_filtered if studied_cohort else measurements, clip, config_name) diff --git a/eds_scikit/biology/cleaning/utils.py b/eds_scikit/biology/utils/check_data.py similarity index 58% rename from eds_scikit/biology/cleaning/utils.py rename to eds_scikit/biology/utils/check_data.py index 4f7d0fd7..6b4f1720 100644 --- 
a/eds_scikit/biology/cleaning/utils.py +++ b/eds_scikit/biology/utils/check_data.py @@ -1,10 +1,9 @@ from eds_scikit.utils.checks import check_columns, check_tables from eds_scikit.utils.typing import Data - -def check_data_and_select_columns(data: Data): - """Check the required tables and columns in the Data and extract them - +def check_data_and_select_columns_measurement(data: Data): + """Check the required tables and columns in the Data and extract them. + Parameters ---------- data : Data @@ -56,3 +55,44 @@ def check_data_and_select_columns(data: Data): concept_relationship = data.concept_relationship[_relationship_required_columns] return measurement, concept, concept_relationship + + +def check_data_and_select_columns_relationship(data: Data): + """Check the required tables and columns in the Data and extract them. + + Parameters + ---------- + data : Data + Instantiated [``HiveData``][eds_scikit.io.hive.HiveData], [``PostgresData``][eds_scikit.io.postgres.PostgresData] or [``PandasData``][eds_scikit.io.files.PandasData] + """ + check_tables( + data, + required_tables=[ + "concept", + "concept_relationship", + ], + ) + + _concept_required_columns = [ + "concept_id", + "concept_name", + "concept_code", + "vocabulary_id", + ] + + _concept_relationship_required_columns = [ + "concept_id_1", + "concept_id_2", + "relationship_id", + ] + + check_columns(data.concept, required_columns=_concept_required_columns) + check_columns( + data.concept_relationship, + required_columns=_concept_relationship_required_columns, + ) + + concept = data.concept[_concept_required_columns] + concept_relationship = data.concept_relationship[_concept_relationship_required_columns] + + return concept, concept_relationship diff --git a/eds_scikit/biology/utils/config_files/default_concept_sets b/eds_scikit/biology/utils/config_files/default_concept_sets new file mode 100644 index 00000000..5da66b88 --- /dev/null +++ b/eds_scikit/biology/utils/config_files/default_concept_sets @@ -0,0 +1,576 @@ 
+,GLIMS_ANABIO_concept_code,concept_set,category +1602,A0174,Leukocytes_Blood,blood_count_cell +1604,H6740,Leukocytes_Blood,blood_count_cell +1608,C8824,Leukocytes_Blood,blood_count_cell +1617,A0163,Hemoglobin_Blood,blood_count_cell +1618,H6738,Hemoglobin_Blood,blood_count_cell +1655,A0230,Platelets_Blood,blood_count_cell +1656,H6751,Platelets_Blood,blood_count_cell +1657,A1598,Platelets_Blood,blood_count_cell +1658,A1598,Platelets_Blood,blood_count_cell +1696,A2538,Platelets_Blood,blood_count_cell +1697,A2539,Platelets_Blood,blood_count_cell +1698,A2539,Platelets_Blood,blood_count_cell +1707,J4463,Platelets_Blood,blood_count_cell +1726,A0155,Neutrophil_Polymorphonuclears_Blood,blood_count_cell +1727,H6732,Neutrophil_Polymorphonuclears_Blood,blood_count_cell +1732,A0150,Eosinophil_Polymorphonuclears_Blood,blood_count_cell +1733,H6730,Eosinophil_Polymorphonuclears_Blood,blood_count_cell +1740,A0198,Lymphocytes_Blood,blood_count_cell +1742,H6743,Lymphocytes_Blood,blood_count_cell +1745,A0210,Monocytes_Blood,blood_count_cell +1747,H6747,Monocytes_Blood,blood_count_cell +3194,A0275,Quick_Time, +3201,A1805,Prothrombin_Level,coagulation +3203,A0269,INR,coagulation +3209,E9993,Prothrombin_Level,coagulation +3236,A1792,Activated_Partial_Thromboplastin_Time,coagulation +3242,L7286,Activated_Partial_Thromboplastin_Time,coagulation +3289,A1795,Activated_Partial_Thromboplastin_Time_Kaolin, +3305,A0126,Fibrinogen,inflammatory_panel +3333,C7882,D-Dimers,coagulation +3334,C7882,D-Dimers,coagulation +3335,I8765,D-Dimers,coagulation +3336,A0124,D-Dimers,coagulation +3337,C0474,D-Dimers,coagulation +3338,C0474,D-Dimers,coagulation +3339,C0474,D-Dimers,coagulation +3345,B4199,D-Dimers,coagulation +3346,F5402,D-Dimers,coagulation +3613,A7748,Activated_Partial_Thromboplastin_Time,coagulation +6914,A0221,pH_Blood,blood_gas +6915,L5017,pH_Blood,blood_gas +6923,A0219,pH_Blood,blood_gas +6924,A7305,PaCO2_Blood,blood_gas +6925,A0630,PaCO2_Blood,blood_gas +6926,A7319,PaO2_Blood,blood_gas 
+6927,H7747,PaO2_Blood,blood_gas +6933,A0420,HCO3-_Blood,blood_gas +6934,L5018,HCO3-_Blood,blood_gas +6935,A7334,SaO2_Blood,blood_gas +6936,L5021,SaO2_Blood,blood_gas +6956,C8697,Lactate_Gaz_Blood,blood_gas +6957,H7748,Lactate_Gaz_Blood,blood_gas +7380,A0141,Glucose_Blood,diabete +7381,H7323,Glucose_Blood,diabete +7382,J7401,Glucose_Blood,diabete +7383,F2622,Glucose_Blood,diabete +7384,B9553,Glucose_Blood,diabete +7386,C7236,Glucose_Blood,diabete +7387,E7312,Glucose_Blood,diabete +7390,A7338,Glucose_Blood,diabete +7391,H7324,Glucose_Blood,diabete +7392,C0565,Glucose_Blood,diabete +7393,E9889,Glucose_Blood,diabete +7401,B6983,HbA1c_Blood,diabete +7402,A2228,HbA1c_Blood,diabete +7403,A1271,HbA1c_Blood,diabete +7405,E6632,HbA1c_Blood,diabete +7406,I5968,HbA1c_Blood,diabete +7413,A0262,Sodium_Blood,ionogram +7414,J1177,Sodium_Blood,ionogram +7415,F8162,Sodium_Blood,ionogram +7417,F2617,Sodium_Blood,ionogram +7424,A2380,Potassium_Blood,ionogram +7425,E2073,Potassium_Blood,ionogram +7427,F2618,Potassium_Blood,ionogram +7428,E2337,Potassium_Blood,ionogram +7429,J1178,Potassium_Blood,ionogram +7432,A0079,Chloride_Blood,ionogram +7433,J1179,Chloride_Blood,ionogram +7434,F2619,Chloride_Blood,ionogram +7436,A0422,Bicarbonate_Blood,ionogram +7437,H9622,Bicarbonate_Blood,ionogram +7438,C6408,Bicarbonate_Blood,ionogram +7439,F4161,Bicarbonate_Blood,ionogram +7440,A2136,Bicarbonate_Blood,ionogram +7441,J7371,Bicarbonate_Blood,ionogram +7442,G2031,Bicarbonate_Blood,ionogram +7448,A7347,Proteins_Blood,inflammatory_panel +7449,F5122,Proteins_Blood,inflammatory_panel +7450,F2624,Proteins_Blood,inflammatory_panel +7451,B9417,Proteins_Blood,inflammatory_panel +7452,A0249,Proteins_Blood,inflammatory_panel +7453,B3990,Proteins_Blood,inflammatory_panel +7455,A0286,Urea_Blood,renal_panel +7456,G3350,Urea_Blood,renal_panel +7457,J7372,Urea_Blood,renal_panel +7458,F2620,Urea_Blood,renal_panel +7463,F9622,GFR (MDRD),renal_panel +7464,G7835,GFR (MDRD),renal_panel +7465,G6921,GFR (EPI 
CKD),renal_panel +7471,B9964,GFR (MDRD),renal_panel +7472,A7456,GFR (MDRD),renal_panel +7473,A7455,GFR (MDRD),renal_panel +7474,H5609,GFR (MDRD),renal_panel +7476,F8160,GFR (EPI CKD),renal_panel +7498,A0173,Venous_Lactate,other +7499,B9146,Venous_Lactate,other +7500,A9995,Venous_Lactate,other +7508,A8424,Glucose_Blood,diabete +7520,C0543,Calcium_Blood,ionogram +7521,D2359,Calcium_Blood,ionogram +7522,A0038,Calcium_Blood,ionogram +7523,H5041,Calcium_Blood,ionogram +7524,F2625,Calcium_Blood,ionogram +7526,L5047,Calcium_Blood,ionogram +7527,D2358,Albumin,proteins +7528,C6841,Albumin,proteins +7529,C2102,Albumin,proteins +7530,G6616,Albumin,proteins +7531,L2260,Albumin,proteins +7533,A2512,Calcium_Blood,ionogram +7534,A2512,Calcium_Blood,ionogram +7535,A0607,Calcium_Blood,ionogram +7544,A0226,Phosphates,proteins +7545,F8186,Phosphates,proteins +7546,F2626,Phosphates,proteins +7564,A0248,CRP,inflammatory_panel +7565,E6332,CRP,inflammatory_panel +7566,F5581,CRP,inflammatory_panel +7567,J7381,CRP,inflammatory_panel +7568,F2631,CRP,inflammatory_panel +7573,A1661,Procalcitonin,inflammatory_biomarkers +7574,H5267,Procalcitonin,inflammatory_biomarkers +7576,F2632,Procalcitonin,inflammatory_biomarkers +7593,A0022,ASAT,hepatic_panel +7594,G1800,ASAT,hepatic_panel +7595,E2068,ASAT,hepatic_panel +7596,F2628,ASAT,hepatic_panel +7600,A0002,ALAT,hepatic_panel +7601,G1804,ALAT,hepatic_panel +7602,J7373,ALAT,hepatic_panel +7603,E2067,ALAT,hepatic_panel +7604,F2629,ALAT,hepatic_panel +7608,A0131,GGT,hepatic_panel +7609,F8184,GGT,hepatic_panel +7610,E9771,GGT,hepatic_panel +7611,J7370,GGT,hepatic_panel +7612,K7045,GGT,hepatic_panel +7613,A0227,PAL,hepatic_panel +7614,F8187,PAL,hepatic_panel +7615,E6331,PAL,hepatic_panel +7616,F1844,PAL,hepatic_panel +7619,A0029,Total Bilirubin,hepatic_panel +7620,H5264,Total Bilirubin,hepatic_panel +7623,D0189,Total Bilirubin,hepatic_panel +7635,A0170,LDH,other +7636,H5261,LDH,other +7637,J7400,LDH,other +7638,C8889,LDH,other +7639,J1161,LDH,other 
+7641,A0090,Creatine Kinase,other +7642,G0171,Creatine Kinase,other +7643,E6330,Creatine Kinase,other +7661,A0283,Troponin,cardiac_biomarkers +7662,C5560,Troponin,cardiac_biomarkers +7663,F9934,Troponin,cardiac_biomarkers +7664,E6954,Troponin,cardiac_biomarkers +7665,L3534,Troponin,cardiac_biomarkers +7666,G7716,Troponin,cardiac_biomarkers +7667,J5184,Troponin,cardiac_biomarkers +7670,A3832,Troponin,cardiac_biomarkers +7672,E7249,Troponin,cardiac_biomarkers +7681,C8189,BNP,cardiac_biomarkers +7682,B5596,BNP,cardiac_biomarkers +7683,A2128,BNP,cardiac_biomarkers +7686,A7333,NT Pro-BNP,cardiac_biomarkers +7687,J7267,NT Pro-BNP,cardiac_biomarkers +7688,J7959,NT Pro-BNP,cardiac_biomarkers +7811,A0278,Transferrin Saturation Coefficient,martial_panel +7812,A0123,Ferritin,martial_panel +7813,E9865,Ferritin,martial_panel +7836,A7426,B-HCG,diabete +7837,F2353,B-HCG,diabete +7838,A0164,B-HCG,diabete +7839,L2277,B-HCG,diabete +7970,A0006,Albumin,proteins +7971,E4799,Albumin,proteins +7972,I2013,Albumin,proteins +7981,F9613,GFR (EPI CKD),renal_panel +7982,F9621,GFR (EPI CKD),renal_panel +7983,F9621,GFR (EPI CKD),renal_panel +8167,A0250,EPP,proteins +8168,C9874,EPP,proteins +8169,A3758,EPP,proteins +8170,A0004,EPP,proteins +8171,F9978,EPP,proteins +8172,A0005,EPP,proteins +8173,H8137,EPP,proteins +8174,C7087,EPP,proteins +8175,A0003,EPP,proteins +8176,C7088,EPP,proteins +8177,B9456,EPP,proteins +8178,B9455,EPP,proteins +8179,A0008,EPP,proteins +8180,H8138,EPP,proteins +8181,C7089,EPP,proteins +8182,A0007,EPP,proteins +8183,C7090,EPP,proteins +8184,A0010,EPP,proteins +8185,H8139,EPP,proteins +8186,C7091,EPP,proteins +8187,A0009,EPP,proteins +8188,C7092,EPP,proteins +8189,C6525,EPP,proteins +8190,C6524,EPP,proteins +8191,A0415,EPP,proteins +8192,C7093,EPP,proteins +8193,A0414,EPP,proteins +8194,C7094,EPP,proteins +8195,B9458,EPP,proteins +8196,B9457,EPP,proteins +8197,A2113,EPP,proteins +8198,H8140,EPP,proteins +8199,E5327,EPP,proteins +8200,A2112,EPP,proteins 
+8201,E5328,EPP,proteins +8202,C6536,EPP,proteins +8203,C6535,EPP,proteins +8204,E2398,EPP,proteins +8205,E2399,EPP,proteins +8206,A2115,EPP,proteins +8207,H8141,EPP,proteins +8208,E5329,EPP,proteins +8209,A2114,EPP,proteins +8210,E5330,EPP,proteins +8211,C6538,EPP,proteins +8212,C6537,EPP,proteins +8213,E2400,EPP,proteins +8214,E2401,EPP,proteins +8215,A0130,EPP,proteins +8216,H8142,EPP,proteins +8217,C7100,EPP,proteins +8218,A0129,EPP,proteins +8219,C7101,EPP,proteins +8220,G6942,EPP,proteins +8221,G6941,EPP,proteins +8222,B9460,EPP,proteins +8223,B9459,EPP,proteins +8224,C6596,EPP,proteins +8225,C6595,EPP,proteins +8226,C6598,EPP,proteins +8227,C6597,EPP,proteins +8228,E2402,EPP,proteins +8229,E2403,EPP,proteins +8230,K4483,EPP,proteins +8231,A2118,EPP,proteins +8232,A2117,EPP,proteins +8233,E1847,EPP,proteins +8234,B8047,EPP,proteins +8235,A2127,EPP,proteins +8236,I8076,EPP,proteins +8237,A2126,EPP,proteins +8238,I8077,EPP,proteins +8239,X5093,EPP,proteins +8240,X5094,EPP,proteins +8241,X5091,EPP,proteins +8242,X5092,EPP,proteins +8243,A2279,EPP,proteins +8244,L7258,EPP,proteins +8245,A1361,EPP,proteins +8246,L7259,EPP,proteins +8247,C6909,EPP,proteins +8248,B1727,EPP,proteins +8249,B1725,EPP,proteins +8250,C6924,EPP,proteins +8251,I5139,EPP,proteins +8252,X5097,EPP,proteins +8253,X5098,EPP,proteins +8254,X5095,EPP,proteins +8255,X5096,EPP,proteins +8256,A2278,EPP,proteins +8257,L7260,EPP,proteins +8258,A2277,EPP,proteins +8259,L7261,EPP,proteins +8260,D0265,EPP,proteins +8261,B1728,EPP,proteins +8262,B1726,EPP,proteins +8263,D0267,EPP,proteins +8264,I5140,EPP,proteins +8265,X5101,EPP,proteins +8266,X5102,EPP,proteins +8267,X5099,EPP,proteins +8268,X5100,EPP,proteins +8269,H6397,EPP,proteins +8270,L7262,EPP,proteins +8271,H6396,EPP,proteins +8272,L7263,EPP,proteins +8273,D0266,EPP,proteins +8274,C0616,EPP,proteins +8275,D0268,EPP,proteins +8276,I5141,EPP,proteins +8277,A8775,EPP,proteins +8278,A7816,EPP,proteins +8279,A8776,EPP,proteins +8280,A8777,EPP,proteins 
+8281,A8778,EPP,proteins +8282,A8779,EPP,proteins +8283,A8780,EPP,proteins +8285,A7330,EPP,proteins +8286,F0748,EPP,proteins +8287,F0749,EPP,proteins +8288,H9656,EPP,proteins +8289,H9657,EPP,proteins +8290,H9658,EPP,proteins +8291,H9659,EPP,proteins +8292,H9660,EPP,proteins +8979,A1695,Proteins,proteins +8980,A1694,Proteins,proteins +8981,A1696,Proteins,proteins +8982,C9990,Proteins,proteins +8983,C9991,Proteins,proteins +8984,J7268,Proteins,proteins +8985,J7269,Proteins,proteins +8989,C3941,Proteins,proteins +8990,E4745,Proteins,proteins +8991,G4187,Proteins,proteins +8992,F6060,Proteins,proteins +17074,A1831,TSH,diabete +17075,F2150,TSH,diabete +17076,I8385,TSH,diabete +17077,C2666,TSH,diabete +47303,C9351,IL-1 beta,inflammatory_biomarkers +47305,B8921,IL-1 beta,inflammatory_biomarkers +47306,G4800,IL-1 beta,inflammatory_biomarkers +47307,K3662,IL-1 beta,inflammatory_biomarkers +47308,L2217,IL-1 beta,inflammatory_biomarkers +47327,B8929,IL-6,inflammatory_biomarkers +47328,G4799,IL-6,inflammatory_biomarkers +47329,B1910,IL-6,inflammatory_biomarkers +47330,K3467,IL-6,inflammatory_biomarkers +47331,L2205,IL-6,inflammatory_biomarkers +47332,E6992,IL-6,inflammatory_biomarkers +47342,B8922,IL-10,inflammatory_biomarkers +47343,C8763,IL-10,inflammatory_biomarkers +47345,K3478,IL-10,inflammatory_biomarkers +47346,L2210,IL-10,inflammatory_biomarkers +47400,B8931,TNF alpha,inflammatory_biomarkers +47401,G4801,TNF alpha,inflammatory_biomarkers +47402,C9393,TNF alpha,inflammatory_biomarkers +47403,K3505,TNF alpha,inflammatory_biomarkers +47404,L2203,TNF alpha,inflammatory_biomarkers +47405,E6993,TNF alpha,inflammatory_biomarkers +47415,J9194,TNF alpha,inflammatory_biomarkers +47417,J9193,IL-1 beta,inflammatory_biomarkers +47419,J9190,IL-6,inflammatory_biomarkers +47423,J9187,IL-10,inflammatory_biomarkers +47438,K3665,IL-1 beta,inflammatory_biomarkers +47439,K3687,IL-1 beta,inflammatory_biomarkers +47440,K3661,IL-1 beta,inflammatory_biomarkers +47441,L2197,IL-1 
beta,inflammatory_biomarkers +47449,K3456,IL-6,inflammatory_biomarkers +47450,L2193,IL-6,inflammatory_biomarkers +47451,K3435,IL-6,inflammatory_biomarkers +47452,K3460,IL-6,inflammatory_biomarkers +47457,K3481,IL-10,inflammatory_biomarkers +47458,K3472,IL-10,inflammatory_biomarkers +47459,K3475,IL-10,inflammatory_biomarkers +47460,L2198,IL-10,inflammatory_biomarkers +47499,K3658,TNF alpha,inflammatory_biomarkers +47500,K3502,TNF alpha,inflammatory_biomarkers +47501,K3504,TNF alpha,inflammatory_biomarkers +47502,L2191,TNF alpha,inflammatory_biomarkers +71315,D2865,HIV Serology,serology +71317,D2867,HIV Serology,serology +71319,D2845,HIV Serology,serology +71321,D2864,HIV Serology,serology +71323,F4252,HIV Serology,serology +71326,D2866,HIV Serology,serology +71328,F3257,HIV Serology,serology +71329,D2869,HIV Serology,serology +71334,F1705,HIV Serology,serology +71337,D2846,HIV Serology,serology +71339,D2847,HIV Serology,serology +71341,D2844,HIV Serology,serology +71344,E8605,HIV Serology,serology +71346,F5401,HIV Serology,serology +71350,G0175,HIV Serology,serology +71352,J5891,HIV Serology,serology +71354,J2672,HIV Serology,serology +71356,H7667,HIV Serology,serology +71693,D2729,Hepatitis B Serology,serology +71695,D2730,Hepatitis B Serology,serology +71697,E1524,Hepatitis B Serology,serology +71699,F2075,Hepatitis B Serology,serology +71702,D2725,Hepatitis B Serology,serology +71705,I1903,Hepatitis B Serology,serology +71707,D2728,Hepatitis B Serology,serology +71709,D2726,Hepatitis B Serology,serology +71710,D2726,Hepatitis B Serology,serology +71713,F5613,Hepatitis B Serology,serology +71718,D2731,Hepatitis B Serology,serology +71721,D2727,Hepatitis B Serology,serology +71724,G1197,Hepatitis B Serology,serology +71726,G1199,Hepatitis B Serology,serology +71727,G1199,Hepatitis B Serology,serology +71730,J5887,Hepatitis B Serology,serology +71732,J5890,Hepatitis B Serology,serology +71734,J2697,Hepatitis B Serology,serology +71736,L6883,Hepatitis B 
Serology,serology +71738,J2695,Hepatitis B Serology,serology +71741,L7877,Hepatitis B Serology,serology +71802,D2653,Hepatitis B Serology,serology +71803,D2660,Hepatitis B Serology,serology +71804,D2654,Hepatitis B Serology,serology +71805,D2661,Hepatitis B Serology,serology +71806,D2649,Hepatitis B Serology,serology +71807,D2656,Hepatitis B Serology,serology +71808,H9686,Hepatitis B Serology,serology +71809,H9687,Hepatitis B Serology,serology +71810,I6579,Hepatitis B Serology,serology +71811,I6580,Hepatitis B Serology,serology +71813,D2652,Hepatitis B Serology,serology +71814,D2659,Hepatitis B Serology,serology +71815,D2650,Hepatitis B Serology,serology +71816,D2650,Hepatitis B Serology,serology +71817,D2657,Hepatitis B Serology,serology +71818,D2655,Hepatitis B Serology,serology +71819,D2662,Hepatitis B Serology,serology +71820,D2651,Hepatitis B Serology,serology +71821,D2651,Hepatitis B Serology,serology +71822,D2658,Hepatitis B Serology,serology +71823,F5615,Hepatitis B Serology,serology +71824,F5616,Hepatitis B Serology,serology +71826,G1209,Hepatitis B Serology,serology +71827,G1209,Hepatitis B Serology,serology +71828,G1210,Hepatitis B Serology,serology +71829,G1210,Hepatitis B Serology,serology +71830,I2927,Hepatitis B Serology,serology +71831,I2928,Hepatitis B Serology,serology +71832,J5885,Hepatitis B Serology,serology +71833,J5886,Hepatitis B Serology,serology +71834,J2699,Hepatitis B Serology,serology +71835,J2700,Hepatitis B Serology,serology +72053,D2780,Hepatitis C Serology,serology +72055,H5078,Hepatitis C Serology,serology +72057,I1846,Hepatitis C Serology,serology +72059,D2777,Hepatitis C Serology,serology +72062,E6503,Hepatitis C Serology,serology +72063,D2778,Hepatitis C Serology,serology +72065,D3088,Hepatitis C Serology,serology +72066,D3088,Hepatitis C Serology,serology +72068,D2774,Hepatitis C Serology,serology +72071,J1518,Hepatitis C Serology,serology +72075,D2776,Hepatitis C Serology,serology +72076,D2771,Hepatitis C Serology,serology 
+72079,E8372,Hepatitis C Serology,serology +72082,F7465,Hepatitis C Serology,serology +72084,I3151,Hepatitis C Serology,serology +72087,G0173,Hepatitis C Serology,serology +72089,L1237,Hepatitis C Serology,serology +72091,J2678,Hepatitis C Serology,serology +72093,K3833,Hepatitis C Serology,serology +72096,E8373,Hepatitis C Serology,serology +72098,K4228,Hepatitis C Serology,serology +85104,K1108,SARS-CoV-2,virology +85220,J9791,SARS-CoV-2,virology +85833,I5915,Adenovirus,virology +85834,I5916,Coronavirus,virology +85835,I5917,Coronavirus,virology +85836,I5918,Coronavirus,virology +85837,I5919,Coronavirus,virology +85838,I5920,Metapneumovirus,virology +85839,I5921,Rhino/Enterovirus,virology +85840,I5922,Influenza A,virology +85845,I5923,Influenza B,virology +85847,I5924,Parainfluenza,virology +85848,I5925,Parainfluenza,virology +85849,I5926,Parainfluenza,virology +85850,I5927,Parainfluenza,virology +85851,I5928,RSV,virology +85855,I7748,B.pertussis,virology +85856,I5930,C.pneumoniae,virology +85859,I7952,Adenovirus,virology +85860,I7953,Coronavirus,virology +85861,I7954,Coronavirus,virology +85862,I7955,Coronavirus,virology +85863,I7956,Coronavirus,virology +85865,I7958,Metapneumovirus,virology +85866,I7959,Rhino/Enterovirus,virology +85867,I7960,Influenza A,virology +85873,I7961,Influenza B,virology +85874,I7962,Parainfluenza,virology +85875,I7963,Parainfluenza,virology +85876,I7964,Parainfluenza,virology +85877,I7965,Parainfluenza,virology +85878,I7966,RSV,virology +85883,I7968,B.pertussis,virology +85886,I7969,C.pneumoniae,virology +98555,J2176,L.pneumophila,virology +98557,J2207,C.pneumoniae,virology +98560,J2198,Influenza A,virology +98561,J2203,Influenza B,virology +98562,J2201,RSV,virology +98563,J2197,Rhino/Enterovirus,virology +98564,J2200,Metapneumovirus,virology +98565,J2204,Parainfluenza,virology +98566,J2229,Adenovirus,virology +98567,J2199,Coronavirus,virology +98616,J2988,Adenovirus,virology +98622,J2996,Coronavirus,virology 
+98625,J2994,Coronavirus,virology +98628,J2993,Coronavirus,virology +98631,J2992,Coronavirus,virology +98634,J8706,SARS-CoV-2,virology +98638,J2991,Metapneumovirus,virology +98641,J2990,Influenza A,virology +98653,J2985,Influenza B,virology +98656,J2979,Parainfluenza,virology +98659,J2977,Parainfluenza,virology +98662,J2983,Parainfluenza,virology +98665,J2981,Parainfluenza,virology +98668,J2974,RSV,virology +98671,J2973,Rhino/Enterovirus,virology +98678,J3006,L.pneumophila,virology +98686,J8817,Adenovirus,virology +98694,J8829,Coronavirus,virology +98697,J8828,Coronavirus,virology +98700,J8823,Coronavirus,virology +98703,J8822,Coronavirus,virology +98706,J8827,SARS-CoV-2,virology +98710,J8824,Metapneumovirus,virology +98713,J8825,Influenza A,virology +98725,J8819,Influenza B,virology +98730,J8861,Parainfluenza,virology +98733,J8862,Parainfluenza,virology +98736,J8821,Parainfluenza,virology +98739,J8820,Parainfluenza,virology +98742,J8859,RSV,virology +98745,J8858,Rhino/Enterovirus,virology +98752,J8826,L.pneumophila,virology +98764,K1527,Adenovirus,virology +98765,K1525,Coronavirus,virology +98766,K1524,Coronavirus,virology +98767,K1522,Coronavirus,virology +98768,K1523,Coronavirus,virology +98769,K1519,Metapneumovirus,virology +98770,K1515,Rhino/Enterovirus,virology +98771,K1517,Influenza A,virology +98776,K1513,Influenza B,virology +98778,K1520,SARS-CoV-2,virology +98780,K1509,Parainfluenza,virology +98781,K1508,Parainfluenza,virology +98782,K1510,Parainfluenza,virology +98783,K1535,Parainfluenza,virology +98784,K1534,RSV,virology +98787,K1531,B.pertussis,virology +98788,K1530,C.pneumoniae,virology +98859,J9899,L.pneumophila,virology +98861,J9919,C.pneumoniae,virology +98868,J9965,Metapneumovirus,virology +98869,J9964,Parainfluenza,virology +98870,J9968,Adenovirus,virology +98871,J9969,Coronavirus,virology +100787,D1465,Legionella Antigenuria,antigenury +100789,H6694,Legionella Antigenuria,antigenury +100790,J7960,Legionella Antigenuria,antigenury 
+100798,D2055,Pneumococcal Antigenuria,antigenury +100799,J7962,Pneumococcal Antigenuria,antigenury +100801,A2804,Pneumococcal Antigenuria,antigenury +120611,C8189,BNP_and_NT_ProBNP,cardiac_biomarkers +120612,B5596,BNP_and_NT_ProBNP,cardiac_biomarkers +120613,A2128,BNP_and_NT_ProBNP,cardiac_biomarkers +120614,A7333,BNP_and_NT_ProBNP,cardiac_biomarkers +120615,J7267,BNP_and_NT_ProBNP,cardiac_biomarkers +120616,J7959,BNP_and_NT_ProBNP,cardiac_biomarkers diff --git a/eds_scikit/biology/utils/config_files/units b/eds_scikit/biology/utils/config_files/units new file mode 100644 index 00000000..d9030746 --- /dev/null +++ b/eds_scikit/biology/utils/config_files/units @@ -0,0 +1,128 @@ +unit_source_value,conversion,category,unit_source_base +10*6,1.0,Count,10*6 +10*8 cellules,100.0,Count,10*6 +10*6 cd3+,1.0,Count,10*6 +10*9,1000.0,Count,10*6 +10*6 pbmc,1.0,Count,10*6 +10*9 leuco,1000.0,Count,10*6 +x10^6 cellules,1.0,Count,10*6 +x10*9,1000.0,Count,10*6 +10*6cell,1.0,Count,10*6 +10*5 cell mono,0.0,Count,10*6 +x10^6,1.0,Count,10*6 +x10*12,1000000.0,Count,10*6 +x10*10,10000.0,Count,10*6 +10*6 cellules,1.0,Count,10*6 +10*6 leuco,1.0,Count,10*6 +10*3 rlu,0.001,Count,10*6 +10*6 leuc,1.0,Count,10*6 +10*3,0.001,Count,10*6 +10*6cellules,1.0,Count,10*6 +x10*6 car-t,1.0,Count,10*6 +x10*6,1.0,Count,10*6 +10*6 cell,1.0,Count,10*6 +x10*3cellules,0.001,Count,10*6 +10*3 cell,0.001,Count,10*6 +8x10*8 érythro,800.0,Count,10*6 +10*6 cellules mononucléées,1.0,Count,10*6 +10*6 cel,1.0,Count,10*6 +millions,1.0,Count,10*6 +cell,1e-06,Count,10*6 +cellules,1e-06,Count,10*6 + ml,0.001,Litre,l +10ml,0.01,Litre,l +100ml,0.1,Litre,l +500ml,0.5,Litre,l +dl,10,Litre,l +l,1.0,Litre,l +ml,0.001,Litre,l +"ml +",0.001,Litre,l +ml d'érythrocytes,0.001,Litre,l +ml érythro,0.001,Litre,l +ml),0.001,Litre,l +µl,1e-06,Litre,l +l érythro,1.0,Litre,l +100g,100.0,Mass,g +µg adn,1e-06,Mass,g +mg,0.001,Mass,g +g hb,1.0,Mass,g +mg de protéines,0.001,Mass,g +g prot,1.0,Mass,g +µg,1e-06,Mass,g +kg,1000.0,Mass,g +mg de 
prot,100000.0,Mass,g +g,1.0,Mass,g +mg prot,0.001,Mass,g +mg hémoglobine,0.001,Mass,g +mg hb,0.001,Mass,g +mg pr,0.001,Mass,g +ng,1e-09,Mass,g +pg,1e-12,Mass,g +24h,24.0,Time,h +4h,4.0,Time,h +5h,5.0,Time,h +6h,6.0,Time,h +h,1.0,Time,h +heure,1.0,Time,h +jours,24.0,Time,h +min,0.016666666666666666,Time,h +s,0.0002777777777777778,Time,h +mois,720.0,Time,h +semaine,168.0,Time,h +semaines,168.0,Time,h +semaines.jours,168.0,Time,h +ans,8736.0,Time,h +amol,1e-18,Mol,mol +fmol,1e-15,Mol,mol +mmmol,0.001,Mol,mol +mmol,0.001,Mol,mol +mmol créat,0.001,Mol,mol +mmol créat.,0.001,Mol,mol +mmol créatinine,0.001,Mol,mol +mmol de créatinine,0.001,Mol,mol +mol,1.0,Mol,mol +nmol,1e-09,Mol,mol +nmol 1,1e-09,Mol,mol +nmol de 1,1e-09,Mol,mol +nmol fe2+,1e-09,Mol,mol +nmol meszn,1e-09,Mol,mol +nmol o2,1e-09,Mol,mol +nmol proto,1e-09,Mol,mol +pmol,1e-12,Mol,mol +pmol copro,1e-12,Mol,mol +pmol proto,1e-12,Mol,mol +pmol uro,1e-12,Mol,mol +µmol,1e-06,Mol,mol +cm,0.01,m,m +cm d'eau,0.01,m,m +m,1.0,m,m +mm,0.001,m,m +nm,1e-09,m,m +µm,1e-06,m,m +cm2,0.0001,m2,m +m2,1.0,m2,m +mm2,1e-06,m2,m +nm2,1e-18,m2,m +µm2,1e-12,m2,m +cm²,0.0001,m2,m +m²,1.0,m2,m +mm²,1e-06,m2,m +nm²,1e-18,m2,m +µm²,1e-12,m2,m +"1,73m²",1.73,m2,m +1.73 m2,1.73,m2,m +1.73m²,1.73,m2,m +25cm2,25.0,m2,m +cm3,1e-06,m3,m +m3,1.0,m3,m +mm³,1e-09,m3,m +mm3,1e-09,m3,m +µm³,1e-18,m3,m +kui,1000.0,ui,ui +ui,1.0,ui,ui +uie,1.0,ui,ui +µui,1e-06,ui,ui +mui,0.001,ui,ui +log ui,0.0,ui,ui +log (ui,0.0,ui,ui diff --git a/eds_scikit/biology/utils/config_files/units_elements b/eds_scikit/biology/utils/config_files/units_elements new file mode 100644 index 00000000..75358eaf --- /dev/null +++ b/eds_scikit/biology/utils/config_files/units_elements @@ -0,0 +1,7 @@ +unit_a,unit_b,conversion,element +mol,g,113.12,Créatinine +g,mol,0.00884016973125884,Créatinine +mol,g,180.156,Glucose +g,mol,0.005550744909966918,Glucose +mol,g,64500.0,Hémoglobine +g,mol,1.5503875968992248e-05,Hémoglobine diff --git 
a/eds_scikit/biology/utils/prepare_measurement.py b/eds_scikit/biology/utils/prepare_measurement.py new file mode 100644 index 00000000..173d56da --- /dev/null +++ b/eds_scikit/biology/utils/prepare_measurement.py @@ -0,0 +1,79 @@ +from loguru import logger + +from eds_scikit.utils.framework import is_koalas, to + +from eds_scikit.biology.utils.check_data import check_data_and_select_columns_measurement +from eds_scikit.biology.utils.filter_measurement import filter_measurement_valid, filter_measurement_by_date, tag_measurement_anomaly +from eds_scikit.biology.utils.prepare_relationship import prepare_biology_relationship_table +from eds_scikit.biology.utils.units import Units + +def prepare_measurement_table(data, + start_date, + end_date, + concept_sets, + cohort=None, + convert_units=False, + outliers_detection=None,): + + """Returns filtered measurement table based on validity, date and concept_sets. + + The output format is identical to data.measurement but adding following columns : + - range_high_anomaly, range_low_anomaly + - {terminology}_code based on concept_sets terminologies + - concept_sets + - normalized_units and normalized_values if convert_units==True + - outlier if outliers_detection not None + + Parameters + ---------- + data : _type_ + _description_ + start_date : _type_ + _description_ + end_date : _type_ + _description_ + concept_sets : _type_ + _description_ + cohort : _type_, optional + _description_, by default None + convert_units : bool, optional + _description_, by default False + outliers_detection : _type_, optional + _description_, by default None + + Returns + ------- + _type_ + _description_ + """ + + measurement, _, _ = check_data_and_select_columns_measurement(data) + + # measurement preprocessing + measurement = filter_measurement_valid(measurement) + measurement = filter_measurement_by_date(measurement, start_date, end_date) + + # measurement codes mapping + logger.info(f"Preparing concept codes relationship table and mapping 
them to measurement.") + biology_relationship_table = prepare_biology_relationship_table(data, concept_sets) + measurement = measurement.merge(biology_relationship_table, left_on="measurement_source_concept_id", right_on="ANALYSES_LABORATOIRE_concept_id") + + #measurement anomaly tagging + measurement = tag_measurement_anomaly(measurement) + + if convert_units: + logger.info(f"Lazy preparation not available if convert_units=True. Computed table will be cached.") + if is_koalas(measurement): measurement.cache() + units_mapping = Units(concept_sets=concept_sets).generate_units_mapping(measurement) + units_mapping = to("koalas", units_mapping) + measurement = measurement.merge(units_mapping, on=["concept_set", "unit_source_value"]) + + if outliers_detection: + measurement = measurement + + measurement = measurement.drop(columns="measurement_date") + + if is_koalas(measurement): measurement.cache() + logger.info(f"Done. Once computed, measurement will be cached.") + + return measurement diff --git a/eds_scikit/biology/utils/prepare_df.py b/eds_scikit/biology/utils/prepare_relationship.py similarity index 55% rename from eds_scikit/biology/utils/prepare_df.py rename to eds_scikit/biology/utils/prepare_relationship.py index d9cc04b3..9dade0b0 100644 --- a/eds_scikit/biology/utils/prepare_df.py +++ b/eds_scikit/biology/utils/prepare_relationship.py @@ -1,14 +1,36 @@ -import pandas as pd -from eds_scikit.utils.checks import check_columns, check_tables +import databricks.koalas as ks +import re; from eds_scikit.biology.utils.check_data import check_data_and_select_columns_relationship +from eds_scikit.io import settings from eds_scikit.utils.framework import get_framework, to +import pandas as pd + +def select_mapping( + mapping, + sources=None, + terminologies=None, +): + #if some terminologies not in mapping : fail + mapping_filtered = [] + + for m in mapping: + keep_m = True + if sources: + keep_m = any([source in m for source in sources]) and keep_m + if terminologies: + keep_m = any([(terminology in m[0]) or (terminology in m[1]) for terminology in terminologies]) and keep_m + if keep_m: + 
mapping_filtered.append(m) + + return mapping_filtered -def prepare_biology_relationship( +def prepare_relationship_table( data, source_terminologies, mapping, - concept_codes=None -) -> pd.DataFrame: - """Computes biology relationship table +) -> ks.DataFrame: #ks or pandas + """ + + Create easy-to-use relationship table based on given terminologies and mapping between them. Parameters ---------- @@ -20,7 +42,7 @@ def prepare_biology_relationship( mapping : List[Tuple[str, str, str]] Ordered mapping of terminologies based on concept_relationship table **EXAMPLE**: `[("source_concept", "standard_concept", "Maps to")]` - + Output ------- | source_concept_id | source_concept_name | source_concept_code | standard_concept_id | standard_concept_name | standard_concept_code | @@ -33,35 +55,9 @@ def prepare_biology_relationship( """ - - #check_tables(data=data, required_tables=["concept", "concept_relationship"]) - concept_columns = [ - "concept_id", - "concept_name", - "concept_code", - "vocabulary_id", - ] - - concept_relationship_columns = [ - "concept_id_1", - "concept_id_2", - "relationship_id", - ] - check_columns( - data.concept, - required_columns=concept_columns, - df_name="concept", - ) - - check_columns( - data.concept_relationship, - required_columns=concept_relationship_columns, - df_name="concept_relationship", - ) + + concept, concept_relationship = check_data_and_select_columns_relationship(data) - concept = data.concept[concept_columns] - concept_relationship = data.concept_relationship[concept_relationship_columns] - concept, concept_relationship = concept.to_pandas(), concept_relationship.to_pandas() concept_by_terminology = {} for terminology, regex in source_terminologies.items(): concept_by_terminology[terminology] = ( @@ -76,7 +72,8 @@ def prepare_biology_relationship( .drop(columns="vocabulary_id") ) root_terminology = mapping[0][0] - biology_relationship = concept_by_terminology[root_terminology] + relationship_table = 
concept_by_terminology[root_terminology] + # Look over all predefined structured mapping for source, target, relationship_id in mapping: relationship = concept_relationship.rename( columns={ @@ -89,18 +86,34 @@ def prepare_biology_relationship( relationship = relationship.merge( concept_by_terminology[target], on="{}_concept_id".format(target) ) - biology_relationship = biology_relationship.merge( + relationship_table = relationship_table.merge( relationship, on="{}_concept_id".format(source), how="left" ) + + relationship_table = relationship_table.fillna("Unknown") + + return relationship_table + +def filter_concept_sets_relationship_table(relationship_table, concept_sets): + + framework = get_framework(relationship_table) - if concept_codes: - code_colums = [column for column in biology_relationship.columns if "concept_code" in column] - isin_concept_set = biology_relationship[code_colums].isin(concept_codes).sum(axis=1) > 0 - biology_relationship = biology_relationship[isin_concept_set] - - #biology_relationship["concepts_set"] = "XXX" - biology_relationship = biology_relationship.fillna("Unknown") + concept_sets_tables = pd.DataFrame({}) + for concept_set in concept_sets: + concept_set_table = concept_set.get_concept_codes() + concept_sets_tables = pd.concat((concept_set_table, concept_sets_tables), axis=0) + terminologies = concept_sets_tables.terminology.unique() + concept_sets_tables = to(framework, concept_sets_tables) + filtered_terminology_table = framework.DataFrame({}) + for terminology in terminologies: + filtered_terminology_table = concept_sets_tables[concept_sets_tables.terminology == terminology].merge(relationship_table, on=f"{terminology}_concept_code", how="left", suffixes=("_x", "")) + filtered_terminology_table = filtered_terminology_table[[column for column in filtered_terminology_table.columns if not("_x" in column)]] + + return filtered_terminology_table + +def prepare_biology_relationship_table(data, concept_sets): + + 
biology_relationship_table = prepare_relationship_table(data, settings.source_terminologies, settings.mapping) + biology_relationship_table = filter_concept_sets_relationship_table(biology_relationship_table, concept_sets) - biology_relationship = to("koalas", biology_relationship) - - return biology_relationship + return biology_relationship_table \ No newline at end of file diff --git a/eds_scikit/biology/utils/process_concepts.py b/eds_scikit/biology/utils/process_concepts.py index c5313b7b..a75f0be7 100644 --- a/eds_scikit/biology/utils/process_concepts.py +++ b/eds_scikit/biology/utils/process_concepts.py @@ -1,7 +1,7 @@ import ast import re from functools import reduce -from typing import List, Union +from typing import List, Union, Tuple, Dict import pandas as pd from loguru import logger @@ -14,6 +14,7 @@ default_standard_terminologies = settings.standard_terminologies default_standard_concept_regex = settings.standard_concept_regex +default_concept_set = pd.read_csv("default_concept_sets") class ConceptsSet: @@ -23,12 +24,23 @@ class ConceptsSet: - ``concept_codes`` : the list of concepts codes included in the concepts-set """ - def __init__(self, name: str, concept_codes: List[str] = None): + def __init__(self, name: str, concept_codes: List[str] = None, terminologies : List[str] = None, terminologies_source_regex : Dict[str, str] = None, conversion : List[Tuple] = None): self.name = name + self.terminologies = terminologies + self.terminologies_regex = default_standard_concept_regex + self.conversion = conversion + + self.terminologies_regex.update(default_standard_concept_regex) #AP-HP spé + if concept_codes is None: self.concept_codes = fetch_concept_codes_from_name(name) else: self.concept_codes = concept_codes + + check, codes = _check_regex(concept_codes, self.terminologies_regex) + if not check: + logger.info(f"{codes} do not match any terminology") + def add_concept_codes(self, concept_codes: Union[str, List[str]]): if isinstance(concept_codes, 
str): @@ -55,8 +67,36 @@ def remove_concept_codes(self, concept_codes: Union[str, List[str]]): else: logger.error("concept_codes must be string or list") raise TypeError - - + + def check_concept_codes_compatibility(self): + compatibility = True + for concept_code in self.concept_codes: + compatibility = compatibility and any([re.match(self.terminologies_regex[terminology], concept_code) for terminology in self.terminologies_regex]) + return compatibility + + def get_concept_codes(self, terminology=None): + if terminology: + if terminology in self.terminologies_regex: + regex = self.terminologies_regex[terminology] + return [concept_code for concept_code in self.concept_codes if bool(re.match(regex, concept_code))] + else: + return [] + else: + return self.concept_codes + + def get_concept_codes_terminologies(self, terminologies): + concept_codes_terminologies = {} + for terminology in self.terminologies: + for terminology_with_src in terminologies: + if terminology in terminology_with_src: + concept_codes_terminologies[terminology_with_src] = [] + for concept_code in self.concept_codes: + regex = self.terminologies_regex[terminology] + if re.match(regex, concept_code): + concept_codes_terminologies[terminology_with_src].append(concept_code) + return concept_codes_terminologies + + def fetch_concept_codes_from_name( concepts_set_name: str, concepts_sets_table_name: str = "default_concepts_sets" ): @@ -315,7 +355,7 @@ def _check_regex( concepts_codes: List[str], standard_concept_regex: dict = default_standard_concept_regex, ): - """Process ``Concept`` and ``Concept Relationship`` tables to obtain a wide DataFrame that gives for all concepts-sets the source code along with the standard concepts codes. 
+ """ Parameters ---------- diff --git a/eds_scikit/biology/utils/process_measurement.py b/eds_scikit/biology/utils/process_measurement.py index 22fd32dd..4cd08ac2 100644 --- a/eds_scikit/biology/utils/process_measurement.py +++ b/eds_scikit/biology/utils/process_measurement.py @@ -8,7 +8,7 @@ from eds_scikit.utils.typing import DataFrame -def get_valid_measurement(measurement: DataFrame) -> DataFrame: +def filter_measurement_valid(measurement: DataFrame) -> DataFrame: """Filter valid observations based on the `row_status_source_value` column Parameters @@ -32,27 +32,6 @@ def get_valid_measurement(measurement: DataFrame) -> DataFrame: return measurement_valid -def _select_adequate_date_column(measurement: DataFrame): - missing_date = measurement.measurement_date.isna().sum() - if missing_date > 0: - missing_datetime = measurement.measurement_datetime.isna().sum() - if missing_date > missing_datetime: - measurement = measurement.drop(columns="measurement_date").rename( - columns={"measurement_datetime": "measurement_date"} - ) - logger.warning( - "As the measurement_date column is not reliable ({} missing dates), it has been replaced by the measurement_datetime column ({} missing datetimes)", - missing_date, - missing_datetime, - ) - missing_date = missing_datetime - else: - measurement = measurement.drop(columns="measurement_datetime") - else: - measurement = measurement.drop(columns="measurement_datetime") - return measurement - - def filter_measurement_by_date( measurement: DataFrame, start_date: datetime = None, end_date: datetime = None ) -> DataFrame: @@ -77,7 +56,8 @@ def filter_measurement_by_date( ) if "measurement_datetime" in measurement.columns: - measurement = _select_adequate_date_column(measurement=measurement) + measurement = measurement + #measurement = _select_adequate_date_column(measurement=measurement) measurement.measurement_date = measurement.measurement_date.astype("datetime64[ns]") @@ -130,6 +110,30 @@ def filter_concept_by_number( code_set, 
on="{}_concept_code".format(terminology), how="inner" ) +def tag_measurement_anomaly( + measurement: DataFrame +) -> DataFrame: + """ + + Parameters + ---------- + measurement : DataFrame + DataFrame to filter + start_date : datetime, optional + **EXAMPLE**: `"2019-05-01"` + end_date : datetime, optional + **EXAMPLE**: `"2022-05-01"` + + Returns + ------- + """ + logger.info(f"Tagging measurement value anomaly.") + + measurement["range_high_anomaly"] = (~measurement.range_high.isna()) & (measurement["value_as_number"] > measurement["range_high"]) + measurement["range_low_anomaly"] = (~measurement.range_low.isna()) & (measurement["value_as_number"] < measurement["range_low"]) + + return measurement + def get_measurement_std(measurement: DataFrame, src_to_std: DataFrame): check_columns( @@ -168,3 +172,24 @@ def normalize_unit(measurement: DataFrame): measurement["unit_source_value"].str.lower().fillna("Unknown") ) return measurement + + +def _select_adequate_date_column(measurement: DataFrame): + missing_date = measurement.measurement_date.isna().sum() + if missing_date > 0: + missing_datetime = measurement.measurement_datetime.isna().sum() + if missing_date > missing_datetime: + measurement = measurement.drop(columns="measurement_date").rename( + columns={"measurement_datetime": "measurement_date"} + ) + logger.warning( + "As the measurement_date column is not reliable ({} missing dates), it has been replaced by the measurement_datetime column ({} missing datetimes)", + missing_date, + missing_datetime, + ) + missing_date = missing_datetime + else: + measurement = measurement.drop(columns="measurement_datetime") + else: + measurement = measurement.drop(columns="measurement_datetime") + return measurement diff --git a/eds_scikit/biology/utils/process_units.py b/eds_scikit/biology/utils/process_units.py new file mode 100644 index 00000000..117d0d50 --- /dev/null +++ b/eds_scikit/biology/utils/process_units.py @@ -0,0 +1,101 @@ +import numpy as np +import pandas as pd 
+import databricks.koalas as ks + +class Units: + + def __init__(self, concept_sets=None, unit_file="config_files/units", element_file="config_files/units_elements", element=None): + self.units_file = pd.read_csv(unit_file).set_index("unit_source_value") + self.element_file = pd.read_csv(element_file).set_index(["unit_a", "unit_b"]) + self.element = element + #on part de l'idée que les unités sont des bases, qu'il est symétrique, qu'il est complété. NB : globalement c'est essentiellement pour la masse molaire, non ? Cuillère à soupe en g peut-être ? + self.outer_conversion = self.element_file[self.element_file.element == self.element] if self.element else pd.DataFrame() + self.target_unit = "" + self.concept_sets = concept_sets + + def add_target_unit(self, unit): + self.target_unit = unit + + def add_conversion(self, unit_a, unit_b, conversion): + df = pd.DataFrame({'unit_a' : [unit_a, unit_b], 'unit_b' : [unit_b, unit_a,], 'conversion' : [conversion, 1/conversion], 'element' : self.element}).set_index(["unit_a", "unit_b"]) + self.outer_conversion = pd.concat((self.outer_conversion, df), axis=0).drop_duplicates() + + def get_category(self, unit): #remplacer category par base ?? + unit_tokens = unit.split("/") + category = [] + for unit_token in unit_tokens: + unit_token = unit_token.lower() + if unit_token in self.units_file.index: + unit_token_category = self.units_file.loc[unit_token].category + category += [unit_token_category] + else: + category += ["Unkown"] + return category + + def get_unit_base(self, unit) -> str: #remplacer category par base ?? 
+ unit_tokens = unit.lower().split("/") + unit_base = "" + for unit_token in unit_tokens: + unit_base += f"/{self.base(unit_token)}" + return unit_base[1:] + + + def base(self, token): + if token in self.units_file.index: + return self.units_file.loc[token].unit_source_base + else: + return "Unkown" + + def to_base(self, token): + if token in self.units_file.index: + return self.units_file.loc[token].conversion + else: + return np.NaN + + def can_be_converted(self, unit_1, unit_2): + #Faire un version token / version units + unit_tokens_1 = unit_1.split("/") + unit_tokens_2 = unit_2.split("/") + + if len(unit_tokens_1) == len(unit_tokens_2): + can_be_converted = True + for token_1, token_2 in zip(unit_tokens_1, unit_tokens_2): + base_1, base_2 = self.base(token_1), self.base(token_2) + token_1, token_2 = token_1.lower(), token_2.lower() + can_be_converted = can_be_converted and (self.base(token_1) == self.base(token_2) != "Unkown") or ((base_1, base_2) in self.outer_conversion.index) + return can_be_converted + else: + return False + + def convert_token(self, token_1, token_2): + token_1, token_2 = token_1.lower(), token_2.lower() + if self.base(token_1) == self.base(token_2) != "Unkown": + f1 = self.to_base(token_1) + f2 = self.to_base(token_2) + return (f1 / f2) + + base_1, base_2 = self.base(token_1), self.base(token_2) + if (base_1, base_2) in self.outer_conversion.index: + f1 = self.to_base(token_1) + f2 = self.to_base(token_2) + f3 = self.outer_conversion.loc[(base_1, base_2)].conversion + return f1 * f3 / f2 + else: + return np.NaN + + def convert_unit(self, unit_1, unit_2) -> float: + unit_1, unit_2 = unit_1.lower(), unit_2.lower() + tokens_1, tokens_2, f = unit_1.split("/"), unit_2.split("/"), 1 + for token_1, token_2 in zip(tokens_1, tokens_2): + f *= self.convert_token(token_1, token_2) + return f + + def generate_units_mapping(self, measurement): + units_mapping = measurement.groupby("concept_set").unit_source_value.unique() + units_mapping = 
units_mapping.to_pandas() + units_mapping = units_mapping.to_frame().explode("unit_source_value").reset_index() + f = lambda x : self.target_unit if (self.target_unit and self.can_be_converted(x, units_mapping.target_unit)) else self.get_unit_base(x) + g = lambda df : self.convert_unit(df.unit_source_value, df.normalized_unit) + units_mapping["normalized_unit"] = units_mapping.unit_source_value.apply(f) + units_mapping["conversion"] = units_mapping.apply(g, axis=1).fillna(0) + return units_mapping diff --git a/eds_scikit/biology/viz/aggregate.py b/eds_scikit/biology/viz/aggregate.py index 8ba490e2..78da6852 100644 --- a/eds_scikit/biology/viz/aggregate.py +++ b/eds_scikit/biology/viz/aggregate.py @@ -5,17 +5,17 @@ import pandas as pd from loguru import logger -from eds_scikit.biology.utils.process_concepts import ( +from eds_scikit.biology.utils.process_concept_sets import ( ConceptsSet, get_concept_src_to_std, ) from eds_scikit.biology.utils.process_measurement import ( + filter_measurement_valid, filter_concept_by_count, filter_concept_by_number, filter_measurement_by_date, filter_missing_values, get_measurement_std, - get_valid_measurement, normalize_unit, ) from eds_scikit.io import settings diff --git a/eds_scikit/biology/viz_other/aggregate_measurement_table.py b/eds_scikit/biology/viz_other/aggregate_measurement_table.py new file mode 100644 index 00000000..5d44a334 --- /dev/null +++ b/eds_scikit/biology/viz_other/aggregate_measurement_table.py @@ -0,0 +1,566 @@ +from datetime import datetime +from typing import Dict, List, Tuple + +import numpy as np +import pandas as pd +from loguru import logger + +from eds_scikit.biology.utils.concept_set import ( + ConceptsSet, + get_concept_src_to_std, +) +from eds_scikit.biology.utils.filter_measurement import ( + filter_measurement_valid, + filter_concept_by_count, + filter_concept_by_number, + filter_measurement_by_date, + filter_missing_values, + get_measurement_std, + normalize_unit, +) +from eds_scikit.io import 
settings +from eds_scikit.utils.checks import check_columns, check_tables +from eds_scikit.utils.framework import is_koalas, to +from eds_scikit.utils.typing import Data, DataFrame + +""" + +NB : ça serait cool de l'avoir avec les prepare, filter, etc : aggreate_table(...) + +""" + + +""" + +POUR LES VIZS BOKEH + +""" + +def compute_df_value_statistics(df, pivot_columns, value_column): + #df_stats = df.groupby(pivot_columns, as_index=True)[value_column].quantile([0.05, 0.25, 0.5, 0.75, 0.95]).unstack() + df_stats = df.groupby(pivot_columns, as_index=True)[[value_column]].describe() #ATTENTION : le describe c'est un miracle. marche avec [[...]] mais pas [...] + df_stats = df_stats.droplevel(0, axis=1)[["25%", "50%", "75%"]] + df_stats.columns = "q" + df_stats.columns.str[:-1] + if is_koalas(df_stats): + df_stats = df_stats.to_pandas() + return df_stats + +def compute_df_category_statistics(df, pivot_columns, category_column): + df_stats = df.groupby(pivot_columns, as_index=True)[category_column].value_counts().to_frame().rename(columns={category_column : "count"}).reset_index() + if is_koalas(df_stats): + df_stats = df_stats.to_pandas() #Sinon résultat nawak + df_stats["total"] = df_stats.groupby(pivot_columns)["count"].transform(sum) + df_stats["proportion"] = df_stats["count"].div(df_stats["total"]) + df_stats = df_stats.rename(columns={category_column : "category"}) + df_stats = df_stats.drop(columns=["total", "count"]).set_index(pivot_columns) + return df_stats + +""" + +POUR LES VIZS ALTAIR. 
CA SERAIT COOL DE L AVOIR UN PEU MOINS CODE / CARE SITE - DEPENDENT +PLUS MODULABLE + +""" + +def aggregate_measurement( + measurement: DataFrame, + pd_limit_size: int, + stats_only: bool, + overall_only: bool, +): + + check_columns( + df=measurement, + required_columns=[ + "measurement_id", + "unit_source_value", + "measurement_date", + "value_as_number", + ], + df_name="measurement", + ) + + # Convert DF to Pandas if small enough + if is_koalas(measurement): + measurement.spark.cache() + logger.info( + "Checking if the Koalas DataFrame is small enough to be converted into Pandas DataFrame" + ) + size = measurement.shape[0] + if size < pd_limit_size: + logger.info( + "The number of measurements identified is {} < {}. DataFrame is converting to Pandas...", + size, + pd_limit_size, + ) + measurement = to("pandas", measurement) + if measurement.empty: + return {"measurement": measurement} + else: + logger.info( + "The number of measurements identified is {}.", + size, + ) + + # Truncate date + measurement["measurement_month"] = ( + measurement["measurement_date"].astype("datetime64").dt.strftime("%Y-%m") + ) + measurement = measurement.drop(columns=["measurement_date"]) + + # Filter measurement with missing values + filtered_measurement, missing_value = filter_missing_values(measurement) + + # Compute measurement statistics by code + measurement_stats = _describe_measurement_by_code( + filtered_measurement, overall_only + ) + + if stats_only: + return {"measurement_stats": measurement_stats} + + # Count measurement by care_site and by code per each month + measurement_volumetry = _count_measurement_by_care_site_and_code_per_month( + filtered_measurement, missing_value + ) + + # Bin measurement values by care_site and by code + measurement_distribution = _bin_measurement_value_by_care_site_and_code( + filtered_measurement + ) + + return { + "measurement_stats": measurement_stats, + "measurement_volumetry": measurement_volumetry, + "measurement_distribution": 
measurement_distribution, + } + + +def _describe_measurement_by_code( + filtered_measurement: DataFrame, overall_only: bool = False +): + check_columns( + df=filtered_measurement, + required_columns=[ + "measurement_id", + "unit_source_value", + "measurement_month", + "value_as_number", + "care_site_short_name", + ], + df_name="filtered_measurement", + ) + + concept_cols = [ + column_name + for column_name in filtered_measurement.columns + if ("concept_code" in column_name) or ("concept_name" in column_name) + ] + + measurement_stats_overall = ( + ( + filtered_measurement[ + [ + "unit_source_value", + "value_as_number", + ] + + concept_cols + ] + .groupby( + concept_cols + + [ + "unit_source_value", + ], + dropna=False, + ) + .describe() + ) + .droplevel(0, 1) + .reset_index() + ) + + # Add stats column to the measurement table + measurement_mad = measurement_stats_overall.merge( + filtered_measurement[concept_cols + ["value_as_number", "unit_source_value"]], + on=concept_cols + ["unit_source_value"], + ) + + # Compute median deviation for each measurement + measurement_mad["median_deviation"] = abs( + measurement_mad["50%"] - measurement_mad["value_as_number"] + ) + measurement_mad = measurement_mad.drop(columns="value_as_number") + + # Compute MAD + measurement_mad = ( + measurement_mad.groupby( + concept_cols + + [ + "unit_source_value", + ], + as_index=False, + dropna=False, + )["median_deviation"] + .median() + .rename(columns={"median_deviation": "MAD"}) + ) + + # Add MAD column to the measurement table + measurement_stats_overall = measurement_stats_overall.merge( + measurement_mad[concept_cols + ["MAD", "unit_source_value"]], + on=concept_cols + ["unit_source_value"], + ) + + logger.info("The overall statistics of measurements by code are computing...") + measurement_stats_overall = to("pandas", measurement_stats_overall) + logger.info("The overall statistics of measurements are computed...") + + measurement_stats_overall["MAD"] = 1.48 * 
measurement_stats_overall["MAD"] + + measurement_stats_overall["max_threshold"] = ( + measurement_stats_overall["50%"] + 4 * measurement_stats_overall["MAD"] + ) + measurement_stats_overall["min_threshold"] = ( + measurement_stats_overall["50%"] - 4 * measurement_stats_overall["MAD"] + ) + measurement_stats_overall["min_threshold"] = measurement_stats_overall[ + "min_threshold" + ].where(measurement_stats_overall["min_threshold"] >= 0, 0) + + if overall_only: + return measurement_stats_overall + + measurement_stats_overall["care_site_short_name"] = "ALL" + + measurement_stats = ( + ( + filtered_measurement[ + [ + "unit_source_value", + "care_site_short_name", + "value_as_number", + ] + + concept_cols + ] + .groupby( + concept_cols + + [ + "care_site_short_name", + "unit_source_value", + ], + dropna=False, + ) + .describe() + ) + .droplevel(0, 1) + .reset_index() + ) + + measurement_stats["MAD"] = None + measurement_stats["max_threshold"] = None + measurement_stats["min_threshold"] = None + + logger.info("The statistics of measurements by care site are computing...") + measurement_stats = to("pandas", measurement_stats) + logger.info("The statistics of measurements by care site are computed...") + + measurement_stats = pd.concat([measurement_stats_overall, measurement_stats]) + + return measurement_stats + + +def _count_measurement_by_care_site_and_code_per_month( + filtered_measurement: DataFrame, missing_value: DataFrame +): + check_columns( + df=filtered_measurement, + required_columns=[ + "measurement_id", + "unit_source_value", + "measurement_month", + "care_site_short_name", + ], + df_name="filtered_measurement", + ) + + check_columns( + df=missing_value, + required_columns=[ + "measurement_id", + "unit_source_value", + "measurement_month", + "care_site_short_name", + ], + df_name="missing_value", + ) + + concept_cols = [ + column_name + for column_name in filtered_measurement.columns + if "concept_code" in column_name + ] + + measurement_count = ( + 
filtered_measurement[ + [ + "measurement_id", + "unit_source_value", + "care_site_short_name", + "measurement_month", + ] + + concept_cols + ] + .groupby( + concept_cols + + [ + "unit_source_value", + "care_site_short_name", + "measurement_month", + ], + as_index=False, + dropna=False, + ) + .agg({"measurement_id": "count"}) + .rename(columns={"measurement_id": "# measurements"}) + ) + missing_value_count = ( + missing_value[ + [ + "measurement_id", + "unit_source_value", + "care_site_short_name", + "measurement_month", + ] + + concept_cols + ] + .groupby( + concept_cols + + [ + "unit_source_value", + "care_site_short_name", + "measurement_month", + ], + as_index=False, + dropna=False, + ) + .agg({"measurement_id": "count"}) + .rename(columns={"measurement_id": "# missing_values"}) + ) + + missing_value_count[["measurement_month"]] = missing_value_count[ + ["measurement_month"] + ].fillna("Unknown") + + logger.info( + "The counting of measurements by care site and code for each month is processing..." + ) + measurement_count = to("pandas", measurement_count) + logger.info("The counting of measurements is finished...") + + logger.info( + "The counting of missing values by care site and code for each month is processing..." 
+ ) + missing_value_count = to("pandas", missing_value_count) + logger.info("The counting of missing values is finished...") + + measurement_volumetry = measurement_count.merge( + missing_value_count, + on=concept_cols + + [ + "unit_source_value", + "care_site_short_name", + "measurement_month", + ], + how="outer", + ) + + # Replace None by 0 + measurement_volumetry[ + ["# missing_values", "# measurements"] + ] = measurement_volumetry[["# missing_values", "# measurements"]].fillna(0) + return measurement_volumetry + + +def _bin_measurement_value_by_care_site_and_code( + filtered_measurement: DataFrame, +): + + check_columns( + df=filtered_measurement, + required_columns=[ + "measurement_id", + "unit_source_value", + "care_site_short_name", + "value_as_number", + ], + df_name="filtered_measurement", + ) + + concept_cols = [ + column_name + for column_name in filtered_measurement.columns + if "concept_code" in column_name + ] + + # Compute median per code + measurement_median = ( + filtered_measurement[ + concept_cols + + [ + "value_as_number", + ] + ] + .groupby( + concept_cols, + as_index=False, + dropna=False, + ) + .median() + .rename(columns={"value_as_number": "median"}) + ) + + # Add median column to the measurement table + measurement_median = measurement_median.merge( + filtered_measurement[ + concept_cols + + [ + "value_as_number", + ] + ], + on=concept_cols, + ) + + # Compute median deviation for each measurement + measurement_median["median_deviation"] = abs( + measurement_median["median"] - measurement_median["value_as_number"] + ) + + # Compute MAD per care site and code + measurement_mad = ( + measurement_median[ + concept_cols + + [ + "median", + "median_deviation", + ] + ] + .groupby( + concept_cols + + [ + "median", + ], + as_index=False, + dropna=False, + ) + .median() + .rename(columns={"median_deviation": "MAD"}) + ) + + measurement_mad["MAD"] = 1.48 * measurement_mad["MAD"] + + # Add MAD column to the measurement table + measurement_binned = 
measurement_mad.merge( + filtered_measurement[ + concept_cols + + [ + "measurement_id", + "care_site_short_name", + "unit_source_value", + "value_as_number", + ] + ], + on=concept_cols, + ) + + # Compute binned value + measurement_binned["max_value"] = ( + measurement_binned["median"] + 4 * measurement_binned["MAD"] + ) + measurement_binned["min_value"] = ( + measurement_binned["median"] - 4 * measurement_binned["MAD"] + ) + measurement_binned["min_value"] = measurement_binned["min_value"].where( + measurement_binned["min_value"] >= 0, 0 + ) + measurement_binned["binned_value"] = measurement_binned["value_as_number"].mask( + measurement_binned["value_as_number"] > measurement_binned["max_value"], + measurement_binned["max_value"], + ) + measurement_binned["binned_value"] = measurement_binned["binned_value"].mask( + measurement_binned["value_as_number"] < measurement_binned["min_value"], + measurement_binned["min_value"], + ) + # Freedman–Diaconis rule (https://en.wikipedia.org/wiki/Freedman%E2%80%93Diaconis_rule) + bin_width = ( + measurement_binned[ + concept_cols + + [ + "binned_value", + ] + ] + .groupby( + concept_cols, + dropna=False, + ) + .describe() + .droplevel(0, 1) + .reset_index() + ) + bin_width["bin_width"] = ( + 2 * (bin_width["75%"] - bin_width["25%"]) / np.cbrt(bin_width["count"]) + ) + # Add bin width column to the measurement table + measurement_binned = bin_width[concept_cols + ["bin_width"]].merge( + measurement_binned, + on=concept_cols, + ) + + measurement_binned["over_outlier"] = ( + measurement_binned["value_as_number"] > measurement_binned["max_value"] + ) + measurement_binned["under_outlier"] = ( + measurement_binned["value_as_number"] < measurement_binned["min_value"] + ) + + measurement_binned["binned_value"] = measurement_binned["binned_value"].where( + measurement_binned["over_outlier"] | measurement_binned["under_outlier"], + ( + np.floor( + measurement_binned["value_as_number"] / measurement_binned["bin_width"] + ) + + 0.5 + ) + * 
measurement_binned["bin_width"], + ) + + # Count the frequencies + measurement_distribution = ( + measurement_binned[ + concept_cols + + [ + "care_site_short_name", + "binned_value", + "measurement_id", + "over_outlier", + "under_outlier", + ] + ] + .groupby( + concept_cols + + [ + "care_site_short_name", + "over_outlier", + "under_outlier", + "binned_value", + ], + dropna=False, + as_index=False, + ) + .agg({"measurement_id": "count"}) + .rename(columns={"measurement_id": "frequency"}) + ) + + logger.info("The binning of measurements' values is processing...") + measurement_distribution = to("pandas", measurement_distribution) + logger.info("The binning of measurements' values is finished...") + return measurement_distribution diff --git a/eds_scikit/biology/viz_other/plot_altair.py b/eds_scikit/biology/viz_other/plot_altair.py new file mode 100644 index 00000000..900c2fd5 --- /dev/null +++ b/eds_scikit/biology/viz_other/plot_altair.py @@ -0,0 +1,54 @@ +import os +from datetime import datetime +from shutil import rmtree +from typing import List, Tuple, Union + +import altair as alt +import pandas as pd +from loguru import logger + +from eds_scikit.biology.utils.concept_set import ( + ConceptsSet, + fetch_all_concepts_set, +) + +from eds_scikit.biology.viz_other.aggregate_measurement_table import aggregate_measurement +from eds_scikit.biology.viz.plot import plot_interactive_distribution +from eds_scikit.io import settings +from eds_scikit.utils.typing import Data, DataFrame + +default_standard_terminologies = settings.standard_terminologies +default_standard_concept_regex = settings.standard_concept_regex + + +def plot_biology_summary( + measurement: DataFrame, + pd_limit_size: int = 100000, + stats_only: bool = False, +) -> Union[alt.ConcatChart, pd.DataFrame]: + """It aggregates, plots and saves all the concepts-sets in folders. 
+ + + Parameters + ---------- + data : Data + Instantiated [``HiveData``][eds_scikit.io.hive.HiveData], [``PostgresData``][eds_scikit.io.postgres.PostgresData] or [``PandasData``][eds_scikit.io.files.PandasData] + pd_limit_size : int, optional + The limit number of rows to convert [Koalas](https://koalas.readthedocs.io/en/latest/) DatFrame into [Pandas](https://pandas.pydata.org/) DataFrame + stats_only : bool, optional + If ``True``, it will only aggregate the data for the [summary table][summary-table]. + + Returns + ------- + List[alt.ConcatChart, pd.DataFrame] + Altair plots describing the volumetric and the distribution properties of your biological data along with a pandas DataFrame with a statistical summary + """ + + tables = aggregate_measurement(measurement, pd_limit_size, stats_only, overall_only) + + + + + + + diff --git a/eds_scikit/biology/viz_other/plot_bokeh.py b/eds_scikit/biology/viz_other/plot_bokeh.py new file mode 100644 index 00000000..f915d3a4 --- /dev/null +++ b/eds_scikit/biology/viz_other/plot_bokeh.py @@ -0,0 +1,128 @@ +from eds_scikit.biology.viz_other.aggregate_measurement_table import compute_df_category_statistics, compute_df_value_statistics, compute_df_value_statistics + +from loguru import logger + +from bokeh.models import ColumnDataSource, Whisker, Panel, Tabs +from bokeh.plotting import figure, show, output_notebook +from bokeh.sampledata.autompg2 import autompg2 +from bokeh.transform import factor_cmap +from bokeh.palettes import Category10, Category20 +from bokeh.layouts import layout +from bokeh.models.widgets import DataTable, TableColumn +from bokeh.io import curdoc + +def plot_measurement_summary(measurement, value_column="normalized_value"): + + """ + + Proposing measurement table preparation + extras + NB : measurement must have ... 
+ NB : viz numéro 2 (numéro 1 : altair) + + """ + + logger.info(f"Aggregating measurement before visualization.") + + stats_categories = compute_df_category_statistics(measurement, pivot_columns=["concept_set", "care_site_short_name"], category_column="GLIMS_ANABIO_concept_code") + by_care_site_values = compute_df_value_statistics(measurement, pivot_columns=["concept_set", "care_site_short_name"], value_column=value_column) + by_concept_codes_values = compute_df_value_statistics(measurement, pivot_columns=["concept_set", "GLIMS_ANABIO_concept_code"], value_column=value_column) + + concept_sets = stats_categories.index.get_level_values(0) + + biology_tables = {} + for concept_set in concept_sets: + biology_table = { + "Code usage" : bokeh_plot_categories(stats_categories.loc[concept_set]), + "Value distribution" : {"By care site" : bokeh_plot_values(by_care_site_values.loc[concept_set]), + "By concept code" : bokeh_plot_values(by_concept_codes_values.loc[concept_set]),} + } + biology_tables.update({concept_set : biology_table}) + + biology_tables_bokeh = create_tpanel_structure(biology_tables) + + return biology_tables_bokeh + + +def create_tpanel_structure(data_structure): + tabs = [] + + for level_key, level_value in data_structure.items(): + if isinstance(level_value, dict): + # Recursively create TPanel structure + subtabs = create_tpanel_structure(level_value) + panel = Panel(child=subtabs, title=level_key) + tabs.append(panel) + else: + panel = Panel(child=level_value, title=level_key) + tabs.append(panel) + + return Tabs(tabs=tabs) + +def create_data_table(data): + # Convert the Pandas DataFrame to a Bokeh ColumnDataSource + source = ColumnDataSource(data) + + # Create a simple DataTable with one column + columns = [TableColumn(field="data", title="Data")] + data_table = DataTable(columns=columns, source=source, width=400, height=280) + return data_table + +def bokeh_plot_values(stats_table): + + pivot_column = stats_table.index.name + stats_table = 
stats_table.reset_index() + + iqr = stats_table.q75 - stats_table.q25 + stats_table["upper"] = stats_table.q75 + 1.5*iqr + stats_table["lower"] = stats_table.q25 - 1.5*iqr + + source = ColumnDataSource(stats_table) + + p = figure(x_range=stats_table[pivot_column], tools="", toolbar_location=None, plot_width=800, plot_height=300, + title="", + background_fill_color="#eaefef", y_axis_label="value") + + whisker = Whisker(base=pivot_column, upper="upper", lower="lower", source=source) + p.add_layout(whisker) + + p.vbar(pivot_column, 0.7, "q50", "q75", source=source, line_color="black") + p.vbar(pivot_column, 0.7, "q25", "q50", source=source, line_color="black") + + # outliers + #outliers = df[~df[value_column].between(df.lower, df.upper)] + #p.scatter(pivot_column, value_column, source=outliers, size=6, color="black", alpha=0.3) + + p.xgrid.grid_line_color = None + length = (stats_table['upper'].max() - stats_table['lower'].min()) / 100 + p.y_range.start = stats_table['lower'].min() - length + p.y_range.end = stats_table['upper'].max() + length + + return p + +def bokeh_plot_categories(stats_category): + + stats_category = stats_category.set_index("category", append=True).unstack().fillna(0) + stats_category.columns = stats_category.columns.get_level_values(1) # Set column names to the first level + + p = figure(x_range=stats_category.index.tolist(), plot_height=350, title="Stacked Bar Plot", + toolbar_location=None, tools="") + + # Create a ColumnDataSource from the DataFrame + source = ColumnDataSource(stats_category) + + # Use Category20 palette for different colors + colors = Category20[len(stats_category.columns)] if len(stats_category.columns) > 2 else Category10[3][:len(stats_category.columns)] + + # Plot stacked bars + p.vbar_stack(stats_category.columns.tolist(), x=stats_category.index.name, width=0.9, color=colors, source=source, legend_label=stats_category.columns.tolist()) + + # Customize plot + p.y_range.start = 0 + p.xgrid.grid_line_color = None + 
p.axis.minor_tick_line_color = None
+    p.outline_line_color = None
+    #p.legend.location = "top_left"
+    p.legend.orientation = "vertical"
+    p.add_layout(p.legend[0], 'right')
+
+    return p
\ No newline at end of file
diff --git a/eds_scikit/io/settings.py b/eds_scikit/io/settings.py
index 18016581..9c9a4eb4 100644
--- a/eds_scikit/io/settings.py
+++ b/eds_scikit/io/settings.py
@@ -197,11 +197,32 @@ or a [PostgresData][eds_scikit.io.postgres.PostgresData]
 """
 
-standard_terminologies = ["LOINC", "AnaBio"]
+biology_codes_settings = {
+    "ANABIO" : {
+        "concept_regex" : "[A-Z][0-9]{4}",
+        "source_terminologies" : {
+            "GLIMS_ANABIO": r"GLIMS.{0,20}Anabio",
+            "ITM_ANABIO": r"ITM - ANABIO",
+        }
+    },
+
+    "LOINC" : {
+        "concept_regex" : "[0-9]{2,5}[-][0-9]",
+        "source_terminologies" : {
+            "GLIMS_LOINC": r"GLIMS.{0,20}LOINC",
+            "ITM_LOINC": r"ITM - LOINC",
+        }
+    }
+}
+
+standard_terminologies = ["LOINC", "AnaBio", "ANABIO", "ANALYSES_LABORATOIRE"]
+
 standard_concept_regex = {
     "LOINC": "[0-9]{2,5}[-][0-9]",
     "AnaBio": "[A-Z][0-9]{4}",
+    "ANABIO": "[A-Z][0-9]{4}",
 }
+
 source_terminologies = {
     "ANALYSES_LABORATOIRE": r"Analyses Laboratoire",
     "GLIMS_ANABIO": r"GLIMS.{0,20}Anabio",