Review notes (text before the first "diff --git" header is ignored by git apply):
  * Adds the select_k_best and select_percentile feature-selection operations.
  * Review fixes: the SelectKBest reference now lives in the select_k_best
    docstring, the "Returns" docs describe feature selection, trailing commas
    and PEP 8 blank lines were added, and the test file ends with a newline.
  * NOTE(review): indentation and blank context lines were re-inferred from the
    hunk line counts; confirm against the target tree before applying. The
    post-image blob ids on the "index" lines predate the review fixes.

diff --git a/operations/data/dffml_operations_data/definitions.py b/operations/data/dffml_operations_data/definitions.py
index 2d32697f93..978d7b15b6 100644
--- a/operations/data/dffml_operations_data/definitions.py
+++ b/operations/data/dffml_operations_data/definitions.py
@@ -3,6 +3,7 @@
 
 definitions = [
     Definition(name="input_data", primitive="List[List[int]]"),
+    Definition(name="target_data", primitive="List[int]"),
     Definition(name="output_data", primitive="List[List[int]]"),
     Definition(name="n_components", primitive="int"),
     Definition(name="n_iter", primitive="int"),
@@ -10,6 +11,9 @@
     Definition(name="missing_values", primitive="Any"),
     Definition(name="strategy", primitive="str"),
     Definition(name="categories", primitive="List[List[Any]]"),
+    Definition(name="percentile", primitive="int"),
+    Definition(name="k", primitive="int"),
+    Definition(name="score_func", primitive="function"),
 ]
 
 for definition in definitions:
diff --git a/operations/data/dffml_operations_data/operations.py b/operations/data/dffml_operations_data/operations.py
index 1d0cede0d5..53e0fef3d6 100644
--- a/operations/data/dffml_operations_data/operations.py
+++ b/operations/data/dffml_operations_data/operations.py
@@ -2,6 +2,7 @@
 from sklearn.decomposition import PCA, TruncatedSVD
 from sklearn.preprocessing import OneHotEncoder, StandardScaler
 from sklearn.impute import SimpleImputer
+from sklearn.feature_selection import f_classif, SelectKBest, SelectPercentile
 
 from dffml.df.base import op
 
@@ -14,6 +15,10 @@
     random_state,
     n_components,
     missing_values,
+    target_data,
+    k,
+    percentile,
+    score_func,
 )
 
 
@@ -211,3 +216,85 @@ async def ordinal_encoder(data):
     enc.fit(data)
     new_data = enc.transform(data).toarray()
     return {"result": new_data}
+
+
+@op(
+    inputs={
+        "data": input_data,
+        "target_data": target_data,
+        "k": k,
+        "score_func": score_func,
+    },
+    outputs={"result": output_data},
+)
+async def select_k_best(data, target_data, score_func=f_classif, k=10):
+    """
+    Select the top k features, based on the score function.
+
+    Parameters
+    ----------
+    data : List[List[int]]
+        Input data, excluding the target column
+    target_data : List[int]
+        1D list containing values for the target column.
+    score_func : function
+        Function that takes in data and target_data, and returns
+        a pair of arrays (scores, pvalues) or a single array with
+        scores.
+    k : int
+        Number of top features to select.
+
+    Returns
+    -------
+    result: Input data reduced to the k highest scoring features
+
+    References:
+
+    - https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html
+
+    """
+
+    selector = SelectKBest(score_func, k=k)
+    new_data = selector.fit_transform(data, target_data)
+    return {"result": new_data}
+
+
+@op(
+    inputs={
+        "data": input_data,
+        "target_data": target_data,
+        "percentile": percentile,
+        "score_func": score_func,
+    },
+    outputs={"result": output_data},
+)
+async def select_percentile(data, target_data, score_func=f_classif, percentile=10):
+    """
+    Select a certain top percentile of features, based on the score function.
+
+    Parameters
+    ----------
+    data : List[List[int]]
+        Input data, excluding the target column
+    target_data : List[int]
+        1D list containing values for the target column.
+    score_func : function
+        Function that takes in data and target_data, and returns
+        a pair of arrays (scores, pvalues) or a single array with
+        scores.
+    percentile : int
+        Percentile of top features to select.
+
+    Returns
+    -------
+    result: Input data reduced to the top scoring percentile of features
+
+    References:
+
+    - https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectPercentile.html
+
+    """
+
+    selector = SelectPercentile(score_func, percentile=percentile)
+    new_data = selector.fit_transform(data, target_data)
+    return {"result": new_data}
diff --git a/operations/data/tests/test_operations.py b/operations/data/tests/test_operations.py
index 5db5fbbd9a..92e06d20fa 100644
--- a/operations/data/tests/test_operations.py
+++ b/operations/data/tests/test_operations.py
@@ -5,7 +5,7 @@
 from dffml.operation.output import GetSingle
 from dffml.df.memory import MemoryOrchestrator
 from dffml.util.asynctestcase import AsyncTestCase
-
+from sklearn.feature_selection import f_classif
 from dffml_operations_data.operations import *
 
 
@@ -225,3 +225,77 @@ async def test_ordinal_encoder(self):
                     == output_data
                 ).all()
             )
+
+    async def test_select_k_best(self):
+        input_data = [[1, 1], [1, 2], [1, 1], [0, 2], [1, 1], [1, 1]]
+        target_data = [1, 2, 1, 2, 1, 2]
+        output_data = [[1], [2], [1], [2], [1], [1]]
+
+        async for ctx, results in MemoryOrchestrator.run(
+            DataFlow.auto(select_k_best, GetSingle),
+            [
+                Input(
+                    value=[select_k_best.op.outputs["result"].name],
+                    definition=GetSingle.op.inputs["spec"],
+                ),
+                Input(
+                    value=input_data,
+                    definition=select_k_best.op.inputs["data"],
+                ),
+                Input(
+                    value=target_data,
+                    definition=select_k_best.op.inputs["target_data"],
+                ),
+                Input(
+                    value=f_classif,
+                    definition=select_k_best.op.inputs["score_func"],
+                ),
+                Input(
+                    value=1,
+                    definition=select_k_best.op.inputs["k"],
+                ),
+            ],
+        ):
+            self.assertTrue(
+                (
+                    results[select_k_best.op.outputs["result"].name]
+                    == output_data
+                ).all()
+            )
+
+    async def test_select_percentile(self):
+        input_data = [[1, 1], [1, 2], [1, 1], [0, 2], [1, 1], [1, 1]]
+        target_data = [1, 2, 1, 2, 1, 2]
+        output_data = [[1], [2], [1], [2], [1], [1]]
+
+        async for ctx, results in MemoryOrchestrator.run(
+            DataFlow.auto(select_percentile, GetSingle),
+            [
+                Input(
+                    value=[select_percentile.op.outputs["result"].name],
+                    definition=GetSingle.op.inputs["spec"],
+                ),
+                Input(
+                    value=input_data,
+                    definition=select_percentile.op.inputs["data"],
+                ),
+                Input(
+                    value=target_data,
+                    definition=select_percentile.op.inputs["target_data"],
+                ),
+                Input(
+                    value=f_classif,
+                    definition=select_percentile.op.inputs["score_func"],
+                ),
+                Input(
+                    value=50,
+                    definition=select_percentile.op.inputs["percentile"],
+                ),
+            ],
+        ):
+            self.assertTrue(
+                (
+                    results[select_percentile.op.outputs["result"].name]
+                    == output_data
+                ).all()
+            )