[FSTORE-1285] Model Dependent Transformation Functions (#1308)
* hopsworks_udf first version

* working code for running hopsworks udf without saving in backend using python client

* removing debugging logs

* statistics working with python client

* basic functionality working with backend

* code with statistics working and saved to backend

* working code for feature vector

* reformatted and documented Hopsworks UDF class

* unit tests for transformation functions

* cleaning transformations engine and adding unit tests

* feature view API formatted

* reformatting and fixing feature_view_engine

* reformatted and added unit tests for feature view

* updating documentation for feature store

* updating documentation for feature store

* fixed tests for training dataset features

* reformatted and added unit tests for python engine

* most unit tests fixed

* all unit tests working

* removed print

* adding test for hopsworks_udf

* correcting merge for vector server

* reformatting with ruff

* fixing vector server

* fixing docs

* fixing vector server

* fixing built-in transformations

* correcting get feature vector

* adding missed changes for built-in transformations

* shallow copying scope dictionary to not overwrite the statistics variable for different UDFs having the same statistics parameter name

* adding deep copy to create multiple transformation functions with different features

* sorting transformation functions to maintain consistent order

* sorting transformation functions in transformation function engine to maintain the same order

* using feature view transformation functions

* addressing review comments

* using PYARROW_EXTENSION_ENABLE during import rather than as a function

* skipping transformation function test on Windows: Spark UDF failing due to dependencies with Great Expectations

* changing transformed_feature_vector_col_name to transformed_features to obtain feature names after transformations

* adding property transformed_features in feature view to obtain feature names after transformations

* updating docstring and adding property decorator missed during rebase

* refactoring transformation functions to update parsing of statistics parameters and renaming the decorator

* refactoring transformation functions to update parsing of statistics parameters and renaming the decorator

* reformatting with ruff

* adding statistics to udf only if required

* converting extended statistics to dictionary

* sorting built-in label encoder to maintain consistency

* adding type hints for class TransformationStatistics

* adapting to backend update of returning output_types, transformation_features and statistics_argument_names as lists

* fixing unit tests

* removing space in docstring

* replacing - in output column names with _

* reverting unwanted Spark test _ replace changes

* adding missed import

* correcting to_dict feature view

* reverting unintentional python.py changes made during rebase

* rebase and adding back missed import
manu-sj authored Jul 11, 2024
1 parent 8c0d994 commit a4efba1
Showing 37 changed files with 4,106 additions and 2,955 deletions.
66 changes: 66 additions & 0 deletions python/hsfs/builtin_transformations.py
@@ -0,0 +1,66 @@
#
# Copyright 2024 Hopsworks AB
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import numpy as np
import pandas as pd
from hsfs.hopsworks_udf import udf
from hsfs.transformation_statistics import TransformationStatistics


feature_statistics = TransformationStatistics("feature")


@udf(float)
def min_max_scaler(feature: pd.Series, statistics=feature_statistics) -> pd.Series:
return (feature - statistics.feature.min) / (
statistics.feature.max - statistics.feature.min
)


@udf(float)
def standard_scaler(feature: pd.Series, statistics=feature_statistics) -> pd.Series:
return (feature - statistics.feature.mean) / statistics.feature.stddev


@udf(float)
def robust_scaler(feature: pd.Series, statistics=feature_statistics) -> pd.Series:
return (feature - statistics.feature.percentiles[49]) / (
statistics.feature.percentiles[74] - statistics.feature.percentiles[24]
)


@udf(int)
def label_encoder(feature: pd.Series, statistics=feature_statistics) -> pd.Series:
unique_data = sorted(
[value for value in statistics.feature.extended_statistics["unique_values"]]
)
value_to_index = {value: index for index, value in enumerate(unique_data)}
return pd.Series(
[value_to_index[data] if not pd.isna(data) else np.nan for data in feature]
)


@udf(bool)
def one_hot_encoder(feature: pd.Series, statistics=feature_statistics) -> pd.Series:
unique_data = [
value for value in statistics.feature.extended_statistics["unique_values"]
]
one_hot = pd.get_dummies(feature, dtype="bool")
for data in unique_data:
if data not in one_hot:
one_hot[data] = False
# Sorting by columns so as to maintain consistency in column order.
return one_hot.reindex(sorted(one_hot.columns), axis=1)
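For context, the same decorator and statistics API used by the built-in functions above can declare custom model-dependent transformations. The snippet below is a minimal sketch, not part of this commit: the feature name "amount" and the function log_min_max_scaler are hypothetical, and it assumes the udf decorator and TransformationStatistics behave exactly as in builtin_transformations.py.

# Hypothetical user-defined, model-dependent transformation (illustration only).
import numpy as np
import pandas as pd

from hsfs.hopsworks_udf import udf
from hsfs.transformation_statistics import TransformationStatistics

# Request training-dataset statistics for the "amount" argument of the UDF.
amount_statistics = TransformationStatistics("amount")


@udf(float)
def log_min_max_scaler(amount: pd.Series, statistics=amount_statistics) -> pd.Series:
    # Log-transform, then min-max scale using statistics computed on the training
    # data, so the same scaling is applied at training and serving time.
    log_min = np.log1p(statistics.amount.min)
    log_max = np.log1p(statistics.amount.max)
    return (np.log1p(amount) - log_min) / (log_max - log_min)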
2 changes: 1 addition & 1 deletion python/hsfs/constructor/query.py
@@ -59,7 +59,7 @@ def __init__(
fg_mod.ExternalFeatureGroup,
fg_mod.SpineGroup,
],
left_features: List[Union[str, "Feature"]],
left_features: List[Union[str, "Feature", Dict]],
feature_store_name: Optional[str] = None,
feature_store_id: Optional[int] = None,
left_feature_group_start_time: Optional[Union[str, int, date, datetime]] = None,
107 changes: 0 additions & 107 deletions python/hsfs/core/builtin_transformation_function.py

This file was deleted.

65 changes: 51 additions & 14 deletions python/hsfs/core/feature_view_api.py
@@ -17,12 +17,7 @@

from typing import List, Optional, Union

from hsfs import (
client,
feature_view,
training_dataset,
transformation_function_attached,
)
from hsfs import client, feature_view, training_dataset, transformation_function
from hsfs.client.exceptions import RestAPIError
from hsfs.constructor import query, serving_prepared_statement
from hsfs.core import explicit_provenance, job, training_dataset_job_conf
@@ -86,13 +81,28 @@ def update(self, feature_view_obj: feature_view.FeatureView) -> None:
data=feature_view_obj.json(),
)

def get_by_name(self, name: str) -> feature_view.FeatureView:
def get_by_name(self, name: str) -> List[feature_view.FeatureView]:
"""
Get a feature view from the backend using its name.
# Arguments
name `str`: Name of the feature view.
# Returns
`List[FeatureView]`: A list that contains all versions of the feature view.
# Raises
`RestAPIError`: If the feature view cannot be found from the backend.
`ValueError`: If the feature group associated with the feature view cannot be found.
"""
path = self._base_path + [name]
try:
return [
feature_view.FeatureView.from_response_json(fv)
for fv in self._client._send_request(
self._GET, path, {"expand": ["query", "features"]}
self._GET,
path,
{"expand": ["query", "features", "transformationfunctions"]},
)["items"]
]
except RestAPIError as e:
@@ -106,11 +116,27 @@ def get_by_name(self, name: str) -> feature_view.FeatureView:
raise e

def get_by_name_version(self, name: str, version: int) -> feature_view.FeatureView:
"""
Get a feature view from the backend using both name and version.
# Arguments
name `str`: Name of the feature view.
version `int`: Version of the feature view.
# Returns
`FeatureView`
# Raises
`RestAPIError`: If the feature view cannot be found from the backend.
`ValueError`: If the feature group associated with the feature view cannot be found.
"""
path = self._base_path + [name, self._VERSION, version]
try:
return feature_view.FeatureView.from_response_json(
self._client._send_request(
self._GET, path, {"expand": ["query", "features"]}
self._GET,
path,
{"expand": ["query", "features", "transformationfunctions"]},
)
)
except RestAPIError as e:
@@ -190,12 +216,23 @@ def get_serving_prepared_statement(

def get_attached_transformation_fn(
self, name: str, version: int
) -> Union[
"transformation_function_attached.TransformationFunctionAttached",
List["transformation_function_attached.TransformationFunctionAttached"],
]:
) -> List["transformation_function.TransformationFunction"]:
"""
Get transformation functions attached to a feature view from the backend.
# Arguments
name `str`: Name of the feature view.
version `int`: Version of the feature view.
# Returns
`List[TransformationFunction]`: List of transformation functions attached to the feature view.
# Raises
`RestAPIError`: If the feature view cannot be found from the backend.
`ValueError`: If the feature group associated with the feature view cannot be found.
"""
path = self._base_path + [name, self._VERSION, version, self._TRANSFORMATION]
return transformation_function_attached.TransformationFunctionAttached.from_response_json(
return transformation_function.TransformationFunction.from_response_json(
self._client._send_request("GET", path)
)

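As a usage note, the sketch below shows how the model-dependent transformation functions introduced in this PR might be attached to a feature view and read back. It is an assumption-laden illustration, not code from this diff: the project, feature group "transactions", its feature names, and the exact create_feature_view arguments are placeholders inferred from the changes above.

# Hypothetical end-to-end usage sketch (illustration only, not part of this diff).
import hopsworks
from hsfs.builtin_transformations import label_encoder, min_max_scaler

project = hopsworks.login()
fs = project.get_feature_store()

# Placeholder feature group and feature names.
fg = fs.get_feature_group("transactions", version=1)
query = fg.select(["amount", "category", "is_fraud"])

# Bind each UDF to the feature it should transform; the required statistics are
# computed on the training dataset and injected into the UDFs by the framework.
fv = fs.create_feature_view(
    name="transactions_fv",
    version=1,
    query=query,
    labels=["is_fraud"],
    transformation_functions=[
        min_max_scaler("amount"),
        label_encoder("category"),
    ],
)

# When the feature view is fetched again, the attached functions are returned via
# get_attached_transformation_fn (see the feature_view_api.py changes above).
fv = fs.get_feature_view("transactions_fv", version=1)
print(fv.transformation_functions)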
