From a3cc15550736f884ec7db7b66316fd5d04c46d3e Mon Sep 17 00:00:00 2001 From: "jasmin.ziegler" Date: Mon, 8 Apr 2024 17:28:45 +0200 Subject: [PATCH] feat: add column conditiondate year for filtering in data shield (#169) * add column conditiondate year for safety check in data shield * flake8 and data dictionary update --- .../datadictionary_bzkf_q4_22.xlsx | Bin 13439 -> 13545 bytes src/obds_fhir_to_opal/obds_fhir_to_opal.py | 7 ++++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/obds_fhir_to_opal/datadictionary_bzkf_q4_22.xlsx b/src/obds_fhir_to_opal/datadictionary_bzkf_q4_22.xlsx index 927aa781454bec3af0478d5f2838a0e70b523732..189781f45d674168b6ca228f2f8baca172fd490b 100644 GIT binary patch delta 4992 zcmY*dbyO7IwjRKtJBRM>kOm1s8l)tJ5h>}AQf6pGni)EWkP>MaK)O?M=#D{zp?)AJ z!t>ty)_V7yKh7U}e|vx5+Iy|D*FFy}-(BnI34o7YB^>a;SO}P))?T?{)R4lahp;ie0)}wQ6NH7QaJ^Z)WF<=VqdDZsW3)qbpq8zXcfVn*``1$ zC!Asn1Cr@QdeOI$=3HLkf)(vGIx?vm1mu1Netve59OYWYqsCeqXt}IT9uMc5ExNu! zdzZTdE{l-2roB|$%{Ez_Vb{{7cFTr!&ql(DP_?0F(xuQIljEdSkf&wkS^nofVRIx7 z=S{!Mqf#?BZftZXRZZvqLCO7j1o}9H(>69^pUi4WYky?4QhFE8m@!m=Q|mhzKxLm^ zHKsb~XL02droMO8>Z|<1V2rkDY?2$lEj)GtzrSF!#V$h3z_%Y>6hl%bR@Cp@p+Z@o zFuC9I-8K#sSv09htIR78X&FK0(96NNC(GYml-2s}vp&+J`awgs)_~}u@Ajv#C@R)0 zMQh556(mft89n*V97Qsas_cCAXh+RTGPUBv1Pe40rJYN>5+1MZwT0Vnet3;r)5Fb~ z5J`&J?$t&UBs>ShwXCv_Eh%W_HHE9UGj%t=w_CEyRt2?}oPQ`}UAz{}aXJ}gaCh5$ z>@3(hmlXOIC1X?SYi8}=E%S0=PDb%D{m7TqLHlzjjNtA#F_LlLAS78eQC2)EY!gi9E?&g2XVKAv z(^Yg3)4X_m^ZsPJhA3PvJu_Ywg*Txoyv+?xKw^eicqq7b!Y&0&Gi27 z%-!A)oePdh1EN*>1b0EX!DSJpbzn0#+u>;gO0_B%H(lS&al-cYPJCVX_>}YZlv$Da+F|~# zV&jjqoh`=LJ}Yr-4ECq*eruu?v&8b!_5$!rLBD`#Kd?Fh;|Wuz%p~yNSu)Sgml^G{ z;fd|<>w@C;whAe13{h}iVi$kCzm%axgn`v)=L0RTqg}hPlsM~3wyd(5pWA#gjGIO} z2A_+z5<`I1G0m1or|EHviX@P4H60aN8Oz&+{x6in@~6=v%g>TyFOrYtv}S+DQkGJA z)5W*mVtxqlzryzybKEkpc;XWPC_9^z|0LwVZUil%yMIqBZ)=Yc;UD_}uWYEfspo5S zSU{MvGvK7j#5gZ3ovwpq_Y=4I85K6{can!lK`W?gUdF!0H?nUSuBDjUw>tY9CDs{( zFaw2e-#$qf-nM-Egi&_GWd$r96!R0`aM@GhEPrNj36C+vibS>lZBHZpFSm3XE$*|= za6M=`;-^$}bioHE+MgW$J=~$zi#~YWBPfUTu&$6Qa{nCAKkMNNxw2FST6RiiQt~>o z?GSbH$q2*qjE^^eP?ye0HB8C6T%Ij-Dnu=Fj#|0tjQv%O6)9k^$FpJ zHyhN!ue6)lATOxH+i$tZ-&gLjha>{sgz%#PJbNqrBdsz>Z3~hZpkl+hAJ81Bh67y( zv1JKhZO0c35qX!NV|&KN>2e`0oaM+)T*u-5)7)$VwtV-QrUW1FX*BFx9;tzeh)@>f z_^OQ7e={Wx4wwTymvYzzq}Nkw!i-vaCR!T@;fL;%Ddm6VjG>4P>rIJ1#dC<+Vk4%b z|1CtN%q^y28@H99dM4|p3feKb{jD{v3mPTN4|lGZBj1@rMALEr9qL! z7jE>SFXA!nBem&_7f5ZsMfj&y~7i6oG^m^4{Db-{~ zJocd#O>Z*W%cSzgnrZKv+vBt-8mhsbr4Vj}?EjC5A!biXbA3I@S3q;kj~PdaSP=*C zny_gbhx9Hhvb%XmB>#7NLN@&*#FF*wb&yUez8+kK?)v4f-%wLSJm#V@?{hwHuaqfSKzf`mc zPq*nC&%laU6vje4tv88nl^ksLHyRXsD}U=^Zp|RIr7;fZb=Ur&Mwj|Ijw^-I&zK^J zkK3u`4N}5f@hV#^2m=GXH93$uuMvjYkSeoMJj=idAV&F^a>RS3gKRh@PbNb&NVTJI zA1{?_^?zJ#turIu);}50xLj#=iiq!fwf;+5%s)x+yEzbC)6+CHnt8Mx9Y53}4o)Gk zjyu0?YYy4#HP~+Y=lGkjQYIUkRqx$oO?w#%gwa~9{ZJ+?+o-7H6*<8H(cMoKc#tYSKUP*lW zdbR`)KT3m;J?^<`LL8!tD~=udvI>&7J*$;ue6@Rw7ftrN{VM13ip7~g&oLh;`T8L( za48ko^bhZSHJj!`rD)v4fbJypHHsEA3Dp-k{2}_~kAhPw%R*;K#!r!K4VNgv+SuFZy?UcrwP}W8aXYSoy3r%YkRmxssT`c@Bu) zPOkYJ^30wv1xNdRAZia>GkB3qbL)Pc(QNL!DCx;?37{Ps z*ej2gojlngW#l_A2vyZd^IQ5XyG%2C#U&g##dGXtP>zD)xZMp_Co~Y6vi$bQPY^TV z?6j){^Utr95#&3IgndxV(>l?5L?m}%+0(0hLvxWComDJ0zZPS%tNi%!yVZI#>A!dewt-r+XPVs~Gpp>B-s|!Fo)#NPs(}uG?%oKUedX`4;d+8V zwWhR>1qaMjT*6wfZ>bCWv`9Rz;;_{6Hg2!vKR1K&4Z1Z<${EF6I+w-_<{vw*Ow9jU zh*&g>@II|sowHIYqIu;yt%6R(7^hUa3Iu(Pa|>nEqFNypu%?M>#^#J`TDly`$HDoO zZ4Xr4VJJkB`9qU1Fc0`XbWRok540p$ z#6Zq2urR$mbxw)6_Or_4V@G5b6Bf0t+P6V1Q!{d-UUOc@L&YAB!Gv^Jw;GGlp;H$U z$+MK7vm33`Zzrd=(Ii*RYt;`}0Kmfo4!}qk=MgmkmdKA!>RL0yP67k~LP-IDzou|W zT4=(HbzdqO>MrM)z-c7iwCmoVOP&^9aU5*|k!cBX`MGN56U@1l;|nXUn%>~-jWgoV z@UIrhC~*S8Ht*CW`6u!Zg-M2m?>*l%VEtV%oJ(AolJGyMW!RWHm;8M3?YsZeCUEiV zW@>Ngl?WCkOU3uzRn6=bg8wYLc<3pRVcSAKBfud-*5o6W+6!Wl&&+`BW<5HcxMKRu z1K=Pgsf2h6{@1+3EBQQ}vjq3`k(bPl840xIIid!W5%HADg}6>W3>JL_E=Aanxw^q) zXA!M>S?q#LlvYj>FG(v`_!2_3giHC9`p{YjxF*js=+x7RrUcj@8)ced#gsP0Gz&vz zNo}t6d_TllenD6;8iN}`f1wsO%G2mU@{X=CGNgY*QizgK%GC212D>)_B#{qfc6ym; zy60M<62E_Fi$*AIzjTOfm)!Z5=P}&oX&LVE@y{n!5BnxI9B8E1%AwoZ&ou=jy5vke zw;q8JGUx2y~rtob8lYux*uJ>9j0@slJ4c zE!hWmu_1$IxFw+PP~WNyhsKAP1(b8TtYk41I?<(ll=#lXWiE_Z1~T!oMRyaLy|MK&xANA*T#$%0CVY5p-}RZBz1 zWu#Z5FYVoMSNDsu^cL7~%ytx*$fm_m>r;S!7js-Ss7MhglnsPXfAO_dE0% zi=^!Y2WqZ%+tnUF@FT_6+(V&EK*63-k)pe+BE86plexhcbK2n%v{{Q_OX~b1eOrO$ zW)bg0wR2RL3+aSImtdu=Y*-8UFNqi?D?s*iWM;&-vWetyjr#dm&Z zDE~aw5Z|}4fs7*eZv)CbbpUg$@-p}G(yv6Yq8b>5}{tSZKR8F+xm1efTzxKO2^Xi>h821 zyjmw}`l)_5|9R8>RWrxw`(uTZoXRoPNrjUaFI#MrFqsU@ZMU&eQiRD`)H9$bJ0Ezq zgA{pqDe`xM8D^mLkcmFKnEVCjgL@CwH$BQjt2{y0IwkZQw6h>X9~}opZg%}Mc&SwH z99JooqQ0?wuL;8}wV|ud9DPnBjE)+$=xE=-zkTa&ND5cmJVSp02G`$HN59FpKukc%QTW zeHHnsA9eN+Ase&#d9k3jy8U9H=u_Z>`e?JquR$TiiHalLj*55ju?W{`yFDy}Yvp7! zx5c_fFKUeoy*DZ+I+}@R`OdB=(Zw{pnC&SGAIck1h280C4vA58sN8Atq^3T^#}yv` zQJvQu^_ZdH?Oa3)b^=;!j2uPl;^m<$Chbm^meV5#rwUBWn1{C4eDT&@#;XAggONRy zLi_W^YdC)bIEty#8VQymM*D{^5s#d9w8}>E}8-@Y0!tAY8ay*A2(M%)jf}$SBkV{NGQld2wT`moQ!ldF%{w zSRFYx?1O{=Ru-&LLIazI8YW6j10$0pXZpWI9smFV{`0P>0VUbi|sU-wKKvEi)29c0%>F$(TKvWR8 z{?2#5@4I*Yn1AM(IWy19Iqy4j;Cv6Sr3GWNw>OD!pqRi62y4%GB9x(d_4na_^y-LM zLFv%n(;xhe=lpXe)Z-YuB1~(uM4yzG--pf>6+IVTkxQHJ>+9cJeR1*wb5JdiLtpJ$ z;1DrDra_$faVTn;W50iA1et71&m|7!8Km$VcJ&K6$Mj&J2jA0;=7<3Uaw;vNrbog4 z*%zrZn&q412lsTp&LN{mmv0E zD{xTJj0-2u-YvVuX(#dnYO8_g*UYn4;)?Q`%K8+(WaHXJU2)r>jC`f`azR86pu*1> zNoUx}*9#<03G_+6XbBMOaC%}Zq%y#$;Gh8^hL4AVyCKvuu7iZk{LT+!Ndi7U^%cC5 zd7AHq*o(OLh}umy%NMStvdHLjGtCo6$s>ix{iuXv_jZ649<&14JgNclf!&5ejjKi8kLHkLQlptaeg z#k!iYlhWj@#YXBCC0fs=gFH5I*#V(A5wrHZo3FiGxYbcc25;@>54&g4AZ(7>qmRXi zxB0MXt`{!iK~_r&g}aIyvl*7_^H+&(#@lhuw!?Ik!vU@SG7jo)Ns|`|W;gFWAd%D3 z+VL>?d;n9x1oXj*IWND_0Q*wI1S=!yT3A=$Zl#nv2|2Y51^BEs3HMcesgjyBoq;s| z%frM-aPHOh)S%xqda@x<_m484`#UW*@4#U54B4q&BlIO*bvr=^Qyty*d}}9GNB(YN z!`!jlZ!DCz043UuP>KW}=+_#B!UF(k`B1fFvUu1ypZvAD$x%lQS|za`4Lq6 z9@YLBJ=Gn6!@i`g=iR8IPeO1ruI!;S;#R>^I3*+KE`7`}KZBe2=O4WWuW%E?(DQyNY(1S57ydNy;oCldqZm|{-g5{yiRmktCm z!K<k3z6sRya}mO{lRCmt^KrGV%+`%>pIryK_-c=1p6x5^MdUL#$=RCGy(da_!Sm5RIm88-l>= z@Ou1tmMW7P`^CnMz)kmnN5K+ajGzpdUMg<*Qr4Uc_u+GW7Q(v?idWBV$$LVm4^L+j zJeue3Gc11^rU;;W)()y)Mmcw>1q>>M$fbRO39FP;i^a4_+&B{4fuv6NW#Y;wcm$qIeY$=5u8BIR_&4GSV=dYM)D zqNsZ-4TIQiBvc2Z@Nr$Ek%=)^HU z3L43=K1sdRsAg?`c#PY5L9A>8KarQLNg4?gWI4Pn86ejginz2k1kvr%ynV*gAWa zhW+6rmKxt!GsDY#Snr=3k#FDf$91}Zqdri3m~F*8l&Jq0-4-=tq3dG0u2DssYBxo+ zfRATNQ1W}rMU>pSiuTiGLwK3Xs8?LEfSFKSrw=%ajplzm%X~%+B$hXoY#ZoMM z3EXAgc?>`wmLZ-UwLn*HJNR26TAF^qqQPyraM4Pg0p+V`ay)gO<=Q@?S8#Yd|1u{f zk4Fr z;MGGX4f;^#`fpX?ov2{M*3xK6fui+8TFjPr-($HntjCHp zxLKiEH}deU=Z1K*dMuR3{~JVpnX|I>t(YA)6dKof_?JbWA?QPRWgS)x^VsWzB30gS z3GEGB)kWq~wYbhlf2*hbH*es-ynX0maG)|X=NWs0gQpkw;8Ao$?TvQ<0-_ab;1YDh z^DLuYc`W9?%0Bz05p_N*h!gvvOTZ8t8X(A^gM*jI2{il}^aTSKB>Civ_e9eBfs{=t zA7i@o_XSoed(Vit>E#F&|5M&?cD;K|wWfp_I`m)FXTMCIvcb%nR*48Bx4J9SSt`N0 z&c>@IuC@r8DtG(td|o%szPsB3YHj3lY)(1#+-C?JLE0#B%>``=}9Oh~k(s16vd)CIkhFY5utA9g|nT!)Gc#wi=WeV8(ir z$V2L1lPw3Qgg3}#-=~tylVRH>XDG6lnI~LTT+`ngEKeV4eroK5U%5U~Swnbtt<@b| z@f;%-ZYsE9w5SUlr7+6)tmbO;kq#-1ZNPWTL-ri{^<+_!uMC;HsyV|bBk8sB55+y~ z9?s!rUNc@_^6WA;WHVFTmqbenNkV{PEQ2JuJHcjXNY&%`V^NHcg?rK<`c3D^{>dAH zGvc-@o(9H-Stb-Nxy2=Un-N}}E9`*blQY80KOvwv`V8{Ko*gW4s+J^V`9^d;-4u2@ z&|mLX{M!s!%)2lSM{1vgqe|YiMg6nr!p3Z9%N>;|lBSx-l z_Rgv^gIydCSf$OhV;VIbu|V?0z=(58Gu4FC4Ykk;xysla@4wj*X7@ww2m>UkgI3c* zE7;ZsW|$lN(l0!cQj=`6=Z=2utBcjsSrdNS9l@ctyNv6F`+1{R_HhUog&TdCEKXd- z8tzK2L(^Gq9+h2KH(ae!9FyuGu&@T{$)^Imx>-D_Ur1L4Vqqx;!&WSgW!TQ8v*JH!DOi!Gp3Pj~fl|lHcQhPd>8UChka+rMg0-F?NbV8M^_2xZ?uWs zWo>sW!Gg0iox$iVwCDNxQf^P)+ruQenmgQo{x0B+=aSghAOL^?HORq$fG+qfNl}K5 z>)rtil{j+gHrXaE`k-8O6q?AX5b{#-(9Q5|s^}OH1Yp8;ulxL*=oWZyCd|Vh04^Jh zq+Z4jRBq-T@0g}q2>)gkH>Dy$hzn8^A`lvy>WcmT)K89OQN*xT8()UeO@-vWAd|A( z+n=a# zj`g^(zTw|}j7Bp$QWH~iXAA3%M8rSn$N@QdlbdzFhv$Q4xSxwU2942k>E=;gfr-R< zg|AKADf?DSca12SHpDr6BR{eaUvvz~!itmW%7*!Xsz%PE**FXcTp>9w*{@#(o+j52 z@ro`eg?@M0Gwve1%>5+M;~E>rJkX-)%8c*Q)F3R`7SnawLo>Cr5KL*5H*y#9?Tw9o z^W1#?3G$aXY{|QtKEwW<-4|yAdWgW|@wIhj_e;SO+!MyK`2OKYNgK{0t0GGq?VlGr zoJzv4yj1Qkc@=615j?}U=~YyC=MuzQWS_$;O*QVL>+^E?REc#}CuP?!cNkH>_(G8_ zbE{(XhSDELE@MmuiFgr zeO-%UEGln8CP?<_f}&Yp=tzgwVZsYz_*6KdblAwVR-LtMgsU%&30=w7Gp<|YA3Fb5 z6JRjEDO!hq-nPzH6;WCkRw3L`BgT!58*%1YTi0hMQ^xnFDJ7250L9&f3QNCL;Rvl` zmdqtQO(5?&|FOzbu#H{05Tm0$uHY4}H=kW3m89&nuyh>8ecLN8P{H6wF%tc)x_fS^ zj||7QdEr_Hfe#Hw<*H)N^L3GB$5Xver?g&0;Jlr)&DyIv)V1bWkP`CRx5D&wz==of zJ9IwE$b>iA#@(cD^J^chxJ(4xlk}QcXt7vhqW5zD06_V1u z7@fSU-=wN0_+`UfP-I9*=P_&G9aD{*XjsYBEHY$RF?kI(VoW)@VqOWJPMTYx%-9R^ zuFmh<^}evrqqVY&TPf+v9a_eMJ69Z^&()NhbBg4QyYk!Gpcfw=hf{$MebbTuq^I zBhY^NLdPFxDC0-$*;8LLth%=9KF6l7kD3-lGW~5L@$kb`Tk;c}&UKPa#R#}ib_~7! z!64dUCxM|plqniRrnAwV|7%*Ai}%9H#Xj{=*an*E~i?AJ*-)3esO!wYOr4j zaRS6aAT%>8wkfm{a0BtA-*N{doq{YM+bi^fkr)a513dl0sKhe~;sc2OR#YM;ElQ%x zICa7yX1A0zchfst#_0>PtL)TZn%hb=GRS_QVs1Ean-RWH&Oj61JvNNLcap5|0Nio# zly31+TowB37Sk2!%LmI7)b|enfP0jm02!#HiBS!W#m5aqJlE+Fq3m75{-QdFcE5&= zkS`LOT(J;?_=i>>`OIOztLK>XSX-@MadF(V_Zhq0$j{O&~OZc7H#(;!*P{6xG&~d#fU;C z)@QSNGSvbRI8Eijdy`MKE<>1U7EzpI5%+gI^wj(w`AD%Y>06Tk#zdxBRLmm_S-O>< z5m;m^ELFRFN#Q@o=^tn&(Bo zaM0cY1K9t*D=CQ?0j*FeVsgMV)QFhDgA6iMEj<-VTwM481tm(9k_r_pPRj8A+cW?G z0sN<=(*MQNp}zbD{omU~3;#1OPn$ eHk<6f9AZ%d5=uZKl&gfvU)?Fej&m#W58z*_a2RU< diff --git a/src/obds_fhir_to_opal/obds_fhir_to_opal.py b/src/obds_fhir_to_opal/obds_fhir_to_opal.py index 12ecb59b..ccc81c66 100644 --- a/src/obds_fhir_to_opal/obds_fhir_to_opal.py +++ b/src/obds_fhir_to_opal/obds_fhir_to_opal.py @@ -9,7 +9,8 @@ from pathling.etc import find_jar from pydantic import BaseSettings from pyspark.sql import SparkSession -from pyspark.sql.functions import col, explode, first, regexp_replace, to_date, udf +from pyspark.sql.functions import col, explode, first, regexp_replace, to_date, udf, \ + substring from pyspark.sql.types import StringType @@ -388,6 +389,7 @@ def encode_conditions(ptl: PathlingContext, df_bundles): "evidencereference": regexp_replace( "evidencereference", "Observation/", "" ), + "conditiondate_year": substring("conditiondate", 1, 4), "stagereference": regexp_replace("stagereference", "Observation/", ""), "conditiondate": regexp_replace("conditiondate", "T", " "), } @@ -403,6 +405,7 @@ def encode_conditions(ptl: PathlingContext, df_bundles): conditions = conditions.select( "cond_id", "conditiondate", + "conditiondate_year", "subjectreference", "condcodingcode", "condcodingcode_mapped", @@ -639,6 +642,7 @@ def group_df(joined_dataframe): first("patID").alias("patID"), first("gender_mapped").alias("gender_mapped"), first("conditiondate").alias("conditiondate"), + first("conditiondate_year").alias("conditiondate_year"), first("condcodingcode").alias("condcodingcode"), first("condcodingcode_mapped").alias("condcodingcode_mapped"), first("entity_group").alias("entity_group"), @@ -669,6 +673,7 @@ def group_df(joined_dataframe): "cond_id", "gender_mapped", "conditiondate", + "conditiondate_year", "condcodingcode", "condcodingcode_mapped", "entity_group",