diff --git a/src/obds_fhir_to_opal/datadictionary_bzkf_q4_22.xlsx b/src/obds_fhir_to_opal/datadictionary_bzkf_q4_22.xlsx index 927aa781..189781f4 100644 Binary files a/src/obds_fhir_to_opal/datadictionary_bzkf_q4_22.xlsx and b/src/obds_fhir_to_opal/datadictionary_bzkf_q4_22.xlsx differ diff --git a/src/obds_fhir_to_opal/obds_fhir_to_opal.py b/src/obds_fhir_to_opal/obds_fhir_to_opal.py index 12ecb59b..ccc81c66 100644 --- a/src/obds_fhir_to_opal/obds_fhir_to_opal.py +++ b/src/obds_fhir_to_opal/obds_fhir_to_opal.py @@ -9,7 +9,8 @@ from pathling.etc import find_jar from pydantic import BaseSettings from pyspark.sql import SparkSession -from pyspark.sql.functions import col, explode, first, regexp_replace, to_date, udf +from pyspark.sql.functions import col, explode, first, regexp_replace, to_date, udf, \ + substring from pyspark.sql.types import StringType @@ -388,6 +389,7 @@ def encode_conditions(ptl: PathlingContext, df_bundles): "evidencereference": regexp_replace( "evidencereference", "Observation/", "" ), + "conditiondate_year": substring("conditiondate", 1, 4), "stagereference": regexp_replace("stagereference", "Observation/", ""), "conditiondate": regexp_replace("conditiondate", "T", " "), } @@ -403,6 +405,7 @@ def encode_conditions(ptl: PathlingContext, df_bundles): conditions = conditions.select( "cond_id", "conditiondate", + "conditiondate_year", "subjectreference", "condcodingcode", "condcodingcode_mapped", @@ -639,6 +642,7 @@ def group_df(joined_dataframe): first("patID").alias("patID"), first("gender_mapped").alias("gender_mapped"), first("conditiondate").alias("conditiondate"), + first("conditiondate_year").alias("conditiondate_year"), first("condcodingcode").alias("condcodingcode"), first("condcodingcode_mapped").alias("condcodingcode_mapped"), first("entity_group").alias("entity_group"), @@ -669,6 +673,7 @@ def group_df(joined_dataframe): "cond_id", "gender_mapped", "conditiondate", + "conditiondate_year", "condcodingcode", "condcodingcode_mapped", "entity_group",