diff --git a/eds_scikit/utils/sort_first_koalas.py b/eds_scikit/utils/sort_first_koalas.py index 52508f45..c5687d6f 100644 --- a/eds_scikit/utils/sort_first_koalas.py +++ b/eds_scikit/utils/sort_first_koalas.py @@ -4,17 +4,18 @@ def sort_values_first_koalas( - dataframe: DataFrame, by_cols: List[str], cols: List[str], ascending: bool = True + dataframe: DataFrame, by_cols: List[str], cols: List[str], disambiguate_col: str, ascending: bool = True ) -> DataFrame: - """Use this function to obtain in koalas the same ouput as dataframe.sort_values(cols).groupby(by_cols).first() in pandas. + """Use this function to obtain in koalas the same output as dataframe.sort_values([*cols, disambiguate_col]).groupby(by_cols).first() in pandas. - If you want the output to be deterministic, provide an id column of your dataframe as the last element of variable cols. + disambiguate_col must be provided to make sure the output is deterministic. Parameters ---------- dataframe : DataFrame by_cols : List[str] cols : List[str] + disambiguate_col : str ascending : bool, optional Returns @@ -22,6 +23,7 @@ def sort_values_first_koalas( DataFrame """ + cols = [*cols, disambiguate_col] for col in cols: dataframe_min_max = dataframe.groupby(by_cols, as_index=False)[col] dataframe_min_max = (