diff --git a/python/hsfs/core/feature_view_engine.py b/python/hsfs/core/feature_view_engine.py index 3c913fd989..f662b20337 100644 --- a/python/hsfs/core/feature_view_engine.py +++ b/python/hsfs/core/feature_view_engine.py @@ -502,11 +502,14 @@ def _read_dir_from_storage_connector( def _drop_helper_columns(self, df, with_columns, columns): if not with_columns: if columns: - df = engine.get_instance().drop_columns(df, columns) + try: + df = engine.get_instance().drop_columns(df, columns) + except KeyError: + pass else: if not columns: warnings.warn( - "Parent feature view doesn't have provided helper columns, thus it will be ignored " + "Parent feature view doesn't have helper columns, thus drop will be ignored " ) return df diff --git a/python/hsfs/feature_store.py b/python/hsfs/feature_store.py index 81d16622b4..9d8f3f24dc 100644 --- a/python/hsfs/feature_store.py +++ b/python/hsfs/feature_store.py @@ -1452,15 +1452,23 @@ def create_feature_view( the feature view. When replaying a `Query` during model inference, the label features can be omitted from the feature vector retrieval. Defaults to `[]`, no label. - inference_helper_columns: Inference Helper columns are a list of feature names in the feature view, defined - during its creation, that may not be used in training the model itself (e.g. primary keys and - datetime that can be used to sort dataframe and or merge to predictions back to original dataframes). - When replaying a `Query` during model inference, the helper columns optionally can be omitted during - batch (`get_batch_data`) and online inference (`get_feature_vector(s)`). Defaults to `False`, - no helper columns. - training_helper_columns: Training helper columns are a list of feature names in the feature view, - defined during its creation, that may not be used in training the model itself . Defaults to `False`, - no training helper columns. 
+ inference_helper_columns: A list of feature names that are not used in training the model itself but can be + used during batch or online inference for extra information. Inference helper column name(s) must be + part of the `Query` object. If inference helper column name(s) belong to a feature group that is part + of a `Join` with `prefix` defined, then this prefix needs to be prepended to the original column name + when defining `inference_helper_columns` list. When replaying a `Query` during model inference, + the inference helper columns optionally can be omitted during batch (`get_batch_data`) and will be + omitted during online inference (`get_feature_vector(s)`). To get inference helper column(s) during + online inference use `get_inference_helper(s)` method. Defaults to `[]`, no helper columns. + training_helper_columns: A list of feature names that are not part of the model schema itself but can be + used during training as a helper for extra information. Training helper column name(s) must be + part of the `Query` object. If training helper column name(s) belong to a feature group that is part + of a `Join` with `prefix` defined, then this prefix needs to be prepended to the original column name when + defining `training_helper_columns` list. When replaying a `Query` during model inference, + the training helper columns will be omitted during both batch and online inference. + Training helper columns can be optionally fetched with training data. For more details see + documentation for feature view's get training data methods. Defaults to `[]`, no training helper + columns. transformation_functions: A dictionary mapping tansformation functions to to the features they should be applied to before writing out the vector and at inference time. Defaults to `{}`, no @@ -1521,15 +1529,23 @@ def get_or_create_feature_view( the feature view. When replaying a `Query` during model inference, the label features can be omitted from the feature vector retrieval. 
Defaults to `[]`, no label. - inference_helper_columns: Inference Helper columns are a list of feature names in the feature view, defined - during its creation, that may not be used in training the model itself (e.g. primary keys and - datetime that can be used to sort dataframe and or merge to predictions back to original dataframes). - When replaying a `Query` during model inference, the helper columns optionally can be omitted during - batch (`get_batch_data`) and online inference (`get_feature_vector(s)`). Defaults to `False`, - no helper columns. - training_helper_columns: Training helper columns are a list of feature names in the feature view, - defined during its creation, that may not be used in training the model itself . Defaults to `False`, - no training helper columns. + inference_helper_columns: A list of feature names that are not used in training the model itself but can be + used during batch or online inference for extra information. Inference helper column name(s) must be + part of the `Query` object. If inference helper column name(s) belong to a feature group that is part + of a `Join` with `prefix` defined, then this prefix needs to be prepended to the original column name + when defining `inference_helper_columns` list. When replaying a `Query` during model inference, + the inference helper columns optionally can be omitted during batch (`get_batch_data`) and will be + omitted during online inference (`get_feature_vector(s)`). To get inference helper column(s) during + online inference use `get_inference_helper(s)` method. Defaults to `[]`, no helper columns. + training_helper_columns: A list of feature names that are not part of the model schema itself but can be + used during training as a helper for extra information. Training helper column name(s) must be + part of the `Query` object. 
If training helper column name(s) belong to a feature group that is part + of a `Join` with `prefix` defined, then this prefix needs to be prepended to the original column name when + defining `training_helper_columns` list. When replaying a `Query` during model inference, + the training helper columns will be omitted during both batch and online inference. + Training helper columns can be optionally fetched with training data. For more details see + documentation for feature view's get training data methods. Defaults to `[]`, no training helper + columns. transformation_functions: A dictionary mapping tansformation functions to to the features they should be applied to before writing out the vector and at inference time. Defaults to `{}`, no diff --git a/python/hsfs/feature_view.py b/python/hsfs/feature_view.py index a9eb704d9c..9f2105ac2d 100644 --- a/python/hsfs/feature_view.py +++ b/python/hsfs/feature_view.py @@ -698,11 +698,10 @@ def get_batch_data( event_time: whether to include event time feature or not. Defaults to `False`, no event time features. inference_helper_columns: whether to include inference helper columns or not. - Inference Helper columns are a list of feature names in the feature view, defined during its creation, - that may not be used in training the model itself (e.g. primary keys and datetime that can be used to - sort dataframe and or merge to predictions back to original dataframes). When replaying a `Query` - during model inference, the helper columns optionally can be omitted during batch (`get_batch_data`) - and online inference (`get_feature_vector(s)`). Defaults to `False`, no helper columns. + Inference helper columns are a list of feature names in the feature view, defined during its creation, + that may not be used in training the model itself but can be used during batch or online inference + for extra information. If inference helper columns were not defined in the feature view + `inference_helper_columns=True` will not have any effect. 
Defaults to `False`, no helper columns. # Returns `DataFrame`: A dataframe """ @@ -1029,9 +1028,11 @@ def create_training_data( features. event_time: whether to include event time feature or not. Defaults to `False`, no event time features. - training_helper_columns: whether to include training helper columns or not. - Training helper columns are a list of feature names in the feature view, defined during its creation, - that may not be used in training the model itself. Defaults to `False`, no training helper columns. + training_helper_columns: whether to include training helper columns or not. Training helper columns are a + list of feature names in the feature view, defined during its creation, that are not part of the + model schema itself but can be used during training as a helper for extra information. + If training helper columns were not defined in the feature view then `training_helper_columns=True` + will not have any effect. Defaults to `False`, no training helper columns. # Returns (td_version, `Job`): Tuple of training dataset version and job. When using the `python` engine, it returns the Hopsworks Job @@ -1313,7 +1314,10 @@ def create_train_test_split( features. training_helper_columns: whether to include training helper columns or not. Training helper columns are a list of feature names in the feature view, defined during its creation, - that may not be used in training the model itself. Defaults to `False`, no training helper columns. + that are not part of the model schema itself but can be used during training as a helper for + extra information. If training helper columns were not defined in the feature view + then `training_helper_columns=True` will not have any effect. Defaults to `False`, no training helper + columns. # Returns (td_version, `Job`): Tuple of training dataset version and job. When using the `python` engine, it returns the Hopsworks Job @@ -1592,7 +1596,10 @@ def create_train_validation_test_split( features. 
training_helper_columns: whether to include training helper columns or not. Training helper columns are a list of feature names in the feature view, defined during its creation, - that may not be used in training the model itself. Defaults to `False`, no training helper columns. + that are not part of the model schema itself but can be used during training as a helper for + extra information. If training helper columns were not defined in the feature view + then `training_helper_columns=True` will not have any effect. Defaults to `False`, no training helper + columns. # Returns (td_version, `Job`): Tuple of training dataset version and job. When using the `python` engine, it returns the Hopsworks Job @@ -1828,7 +1835,10 @@ def training_data( features. training_helper_columns: whether to include training helper columns or not. Training helper columns are a list of feature names in the feature view, defined during its creation, - that may not be used in training the model itself. Defaults to `False`, no training helper columns. + that are not part of the model schema itself but can be used during training as a helper for + extra information. If training helper columns were not defined in the feature view + then `training_helper_columns=True` will not have any effect. Defaults to `False`, no training helper + columns. # Returns (X, y): Tuple of dataframe of features and labels. If there are no labels, y returns `None`. """ @@ -1988,7 +1998,10 @@ def train_test_split( features. training_helper_columns: whether to include training helper columns or not. Training helper columns are a list of feature names in the feature view, defined during its creation, - that may not be used in training the model itself. Defaults to `False`, no training helper columns. + that are not part of the model schema itself but can be used during training as a helper for + extra information. 
If training helper columns were not defined in the feature view + then `training_helper_columns=True` will not have any effect. Defaults to `False`, no training helper + columns. # Returns (X_train, X_test, y_train, y_test): Tuple of dataframe of features and labels @@ -2182,7 +2195,10 @@ def train_validation_test_split( features. training_helper_columns: whether to include training helper columns or not. Training helper columns are a list of feature names in the feature view, defined during its creation, - that may not be used in training the model itself. Defaults to `False`, no training helper columns. + that are not part of the model schema itself but can be used during training as a helper for + extra information. If training helper columns were not defined in the feature view + then `training_helper_columns=True` will not have any effect. Defaults to `False`, no training helper + columns. # Returns (X_train, X_val, X_test, y_train, y_val, y_test): Tuple of dataframe of features and labels @@ -2306,7 +2322,10 @@ def get_training_data( features. training_helper_columns: whether to include training helper columns or not. Training helper columns are a list of feature names in the feature view, defined during its creation, - that may not be used in training the model itself. Defaults to `False`, no training helper columns. + that are not part of the model schema itself but can be used during training as a helper for + extra information. If training helper columns were not defined in the feature view or during + materializing training dataset in the file system then `training_helper_columns=True` will not have + any effect. Defaults to `False`, no training helper columns. # Returns (X, y): Tuple of dataframe of features and labels """ @@ -2362,7 +2381,10 @@ def get_train_test_split( features. training_helper_columns: whether to include training helper columns or not. 
Training helper columns are a list of feature names in the feature view, defined during its creation, - that may not be used in training the model itself. Defaults to `False`, no training helper columns. + that are not part of the model schema itself but can be used during training as a helper for + extra information. If training helper columns were not defined in the feature view or during + materializing training dataset in the file system then `training_helper_columns=True` will not have + any effect. Defaults to `False`, no training helper columns. # Returns (X_train, X_test, y_train, y_test): Tuple of dataframe of features and labels @@ -2420,7 +2442,10 @@ def get_train_validation_test_split( features. training_helper_columns: whether to include training helper columns or not. Training helper columns are a list of feature names in the feature view, defined during its creation, - that may not be used in training the model itself. Defaults to `False`, no training helper columns. + that are not part of the model schema itself but can be used during training as a helper for + extra information. If training helper columns were not defined in the feature view or during + materializing training dataset in the file system then `training_helper_columns=True` will not have + any effect. Defaults to `False`, no training helper columns. # Returns (X_train, X_val, X_test, y_train, y_val, y_test): Tuple of dataframe of features and labels