diff --git a/changelog.md b/changelog.md index e1534b5b..5da74050 100644 --- a/changelog.md +++ b/changelog.md @@ -1,6 +1,11 @@ # Changelog ## Unreleased +### Added +- sort_values_first_koalas function to correctly perform sort_values(...).first() when using a koalas dataframe + +### Fixed +- merge_visit sort_values(...).first() issue is corrected ## v0.1.7 (2024-04-12) ### Changed diff --git a/eds_scikit/utils/sort_first_koalas.py b/eds_scikit/utils/sort_first_koalas.py index f4f93628..970957cd 100644 --- a/eds_scikit/utils/sort_first_koalas.py +++ b/eds_scikit/utils/sort_first_koalas.py @@ -1,4 +1,23 @@ -def sort_values_first_koalas(dataframe, by_cols, cols, ascending=True): +from eds_scikit.utils.typing import DataFrame +from typing import List + +def sort_values_first_koalas(dataframe : DataFrame , by_cols : List[str], cols : List[str], ascending : bool = True) -> DataFrame: + """Replacement for dataframe.sort_values(cols).groupby(by_cols).first() + + To get a deterministic first, provide an id column of your dataframe as the last element of cols. + + Parameters + ---------- + dataframe : DataFrame + by_cols : List[str] + cols : List[str] + ascending : bool, optional + + Returns + ------- + DataFrame + + """ for col in cols: dataframe_min_max = dataframe.groupby(by_cols, as_index=False)[col] dataframe_min_max = (