diff --git a/README.md b/README.md index 13f79df7..2ddc938e 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ [![PyPI](https://img.shields.io/pypi/v/eds-scikit?color=blue&style=flat-square)](https://pypi.org/project/eds-scikit/) [![Supported Python](https://img.shields.io/badge/python-%3E%3D%203.7.1%20%7C%20%3C%203.8-brightgreen?style=flat-square)](https://www.python.org/) [![Black](https://img.shields.io/badge/code%20style-black-black?style=flat-square)]([https://www.python.org/](https://github.com/psf/black)) -[![Coverage](https://codecov.io/github/aphp/eds-scikit/coverage.svg)](https://raw.githubusercontent.com/aphp/eds-scikit/coverage/coverage.txt) +[![Coverage](https://raw.githubusercontent.com/aphp/eds-scikit/coverage/coverage.svg)](https://raw.githubusercontent.com/aphp/eds-scikit/coverage/coverage.txt) [![DOI](https://zenodo.org/badge/571584236.svg)](https://zenodo.org/badge/latestdoi/571584236&style=flat-square)
diff --git a/changelog.md b/changelog.md index a7b6edc2..51fb384d 100644 --- a/changelog.md +++ b/changelog.md @@ -5,6 +5,10 @@ ### Added - omop teva module +### Fixed +- Pyarrow fix now works on spark executors. +- Fix OMOP _date columns issue + ## v0.1.7 (2024-04-12) ### Changed - Support for pyarrow > 0.17.0 diff --git a/eds_scikit/io/hive.py b/eds_scikit/io/hive.py index f04dd378..54c42106 100644 --- a/eds_scikit/io/hive.py +++ b/eds_scikit/io/hive.py @@ -6,12 +6,16 @@ import pandas as pd import pyarrow.parquet as pq +import pyspark.sql.functions as F +import pyspark.sql.types as T from databricks import koalas from loguru import logger from pyspark.sql import DataFrame as SparkDataFrame from pyspark.sql import SparkSession from pyspark.sql.types import LongType, StructField, StructType +from eds_scikit.utils.framework import cache + from . import settings from .base import BaseData from .data_quality import clean_dates @@ -33,6 +37,8 @@ def __init__( Union[Dict[str, Optional[List[str]]], List[str]] ] = None, database_type: Optional[str] = "OMOP", + prune_omop_date_columns: bool = True, + cache: bool = True, ): """Spark interface for OMOP data stored in a Hive database. @@ -54,6 +60,12 @@ def __init__( *deprecated* database_type: Optional[str] = 'OMOP'. Must be 'OMOP' or 'I2B2' Whether to use the native OMOP schema or to convert I2B2 inputs to OMOP. + prune_omop_date_columns: bool, default=True + In OMOP, most date values are stored both in a `