diff --git a/emission/analysis/modelling/trip_model/clustering.py b/emission/analysis/modelling/trip_model/clustering.py index d3924f32a..33f770e4e 100644 --- a/emission/analysis/modelling/trip_model/clustering.py +++ b/emission/analysis/modelling/trip_model/clustering.py @@ -6,12 +6,12 @@ import logging # import clustering algorithms -import sklearn.metrics.pairwise as smp -import sklearn.cluster as sc -from sklearn import metrics -from sklearn import svm -from sklearn.pipeline import make_pipeline -from sklearn.preprocessing import StandardScaler +import sklearn.metrics.pairwise as sklmp +import sklearn.cluster as sklc +import sklearn.metrics.cluster as sklmc +import sklearn.svm as skls +import sklearn.pipeline as sklpl +import sklearn.preprocessing as sklpp # our imports # NOTE: this requires changing the branch of e-mission-server to @@ -102,7 +102,7 @@ def add_loc_clusters( dist_matrix_meters = get_distance_matrix(loc_df, loc_type) for r in radii: - model = sc.DBSCAN(r, metric="precomputed", + model = sklc.DBSCAN(r, metric="precomputed", min_samples=min_samples).fit(dist_matrix_meters) labels = model.labels_ # print(model.n_features_in_) @@ -150,7 +150,7 @@ def add_loc_clusters( dist_matrix_meters = get_distance_matrix(loc_df, loc_type) for r in radii: - labels = sc.OPTICS( + labels = sklc.OPTICS( min_samples=optics_min_samples, max_eps=r, xi=optics_xi, @@ -178,7 +178,7 @@ def add_loc_clusters( dist_matrix_meters = get_distance_matrix(p_loc_df, loc_type) for r in radii: - labels = sc.DBSCAN( + labels = sklc.DBSCAN( r, metric="precomputed", min_samples=min_samples).fit(dist_matrix_meters).labels_ @@ -231,7 +231,7 @@ def add_loc_clusters( # what the bandwidth roughly corresponds to in the real world/make # the value a little more interpretable. LATLON_TO_M = 1 / 111139 - labels = sc.MeanShift( + labels = sklc.MeanShift( bandwidth=LATLON_TO_M * r, min_bin_freq=min_samples, cluster_all=False, @@ -325,9 +325,9 @@ def add_loc_SVM(loc_df, ]] y_train = labeled_points_in_cluster.purpose_confirm.to_list() - labels = make_pipeline( - StandardScaler(), - svm.SVC( + labels = sklpl.make_pipeline( + sklpp.StandardScaler(), + skls.SVC( kernel='rbf', gamma=svm_gamma, C=svm_C, @@ -381,7 +381,7 @@ def get_distance_matrix(loc_df, loc_type): radians_lat_lon = np.radians(loc_df[[loc_type + "_lat", loc_type + "_lon"]]) dist_matrix_meters = pd.DataFrame( - smp.haversine_distances(radians_lat_lon, radians_lat_lon) * + sklmp.haversine_distances(radians_lat_lon, radians_lat_lon) * EARTH_RADIUS) return dist_matrix_meters @@ -404,7 +404,7 @@ def single_cluster_purity(points_in_cluster, label_col='purpose_confirm'): def purity_score(y_true, y_pred): - contingency_matrix = metrics.cluster.contingency_matrix(y_true, y_pred) + contingency_matrix = sklmc.contingency_matrix(y_true, y_pred) purity = np.sum(np.amax(contingency_matrix, axis=0)) / np.sum(contingency_matrix) return purity diff --git a/emission/analysis/modelling/trip_model/models.py b/emission/analysis/modelling/trip_model/models.py index cc3b58a2e..8dc487391 100644 --- a/emission/analysis/modelling/trip_model/models.py +++ b/emission/analysis/modelling/trip_model/models.py @@ -5,18 +5,18 @@ import copy # sklearn imports -from sklearn.pipeline import make_pipeline -from sklearn.preprocessing import StandardScaler, OneHotEncoder -from sklearn.impute import SimpleImputer -from sklearn.metrics.pairwise import haversine_distances -from sklearn.cluster import DBSCAN -from sklearn import svm -from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier -from sklearn.tree import DecisionTreeClassifier -from sklearn.exceptions import NotFittedError +import sklearn.pipeline as sklpl +import sklearn.preprocessing as sklpp +import sklearn.impute as skli +import sklearn.metrics.pairwise as sklmp +import sklearn.cluster as sklc +import sklearn.svm as skls +import sklearn.ensemble as sklen +import sklearn.tree as sklt +import sklearn.exceptions as sklex # our imports -from emission.analysis.modelling.trip_model.clustering import get_distance_matrix, single_cluster_purity +import emission.analysis.modelling.trip_model.clustering as eamtcl import emission.analysis.modelling.trip_model.data_wrangling as eamtd import emission.storage.decorations.trip_queries as esdtq import emission.analysis.classification.inference.labels.inferrers as eacili @@ -28,7 +28,6 @@ import emission.analysis.modelling.trip_model.run_model as eamur -import emission.analysis.modelling.trip_model.clustering as eamtc # NOTE: tour_model_extended.similarity is on the # eval-private-data-compatibility branch in e-mission-server @@ -334,7 +333,7 @@ def fit(self, unused,train_entry_list=None): # fit the bins self.sim_model= eamtg.GreedySimilarityBinning(model_config) - cleaned_trip_entry= eamtc.cleanEntryTypeData(self.train_df,train_entry_list) + cleaned_trip_entry= eamtcl.cleanEntryTypeData(self.train_df,train_entry_list) self.sim_model.fit(cleaned_trip_entry) labels = [int(l) for l in self.sim_model.tripLabels] @@ -526,8 +525,8 @@ def fit(self, train_df,unused=None): ######################### ### get base clusters ### ######################### - dist_matrix_meters = get_distance_matrix(self.train_df, self.loc_type) - self.base_model = DBSCAN(self.radius, + dist_matrix_meters = eamtcl.get_distance_matrix(self.train_df, self.loc_type) + self.base_model = sklc.DBSCAN(self.radius, metric="precomputed", min_samples=1).fit(dist_matrix_meters) base_clusters = self.base_model.labels_ @@ -557,7 +556,7 @@ def fit(self, train_df,unused=None): continue # only do SVM if purity is below threshold - purity = single_cluster_purity(points_in_cluster, + purity = eamtcl.single_cluster_purity(points_in_cluster, label_col='purpose_true') if purity < self.purity_thresh: X = points_in_cluster[[ @@ -565,9 +564,9 @@ def fit(self, train_df,unused=None): ]] y = points_in_cluster.purpose_true.to_list() - svm_model = make_pipeline( - StandardScaler(), - svm.SVC( + svm_model = sklpl.make_pipeline( + sklpp.StandardScaler(), + skls.SVC( kernel='rbf', gamma=self.gamma, C=self.C, @@ -655,7 +654,7 @@ def _NN_predict(self, test_df): new_loc_radians = np.radians( row[[self.loc_type + "_lat", self.loc_type + "_lon"]].to_list()) new_loc_radians = np.reshape(new_loc_radians, (1, 2)) - dist_matrix_meters = haversine_distances( + dist_matrix_meters = sklmp.haversine_distances( new_loc_radians, train_radians) * EARTH_RADIUS shortest_dist_idx = np.argmin(dist_matrix_meters) @@ -1259,7 +1258,7 @@ def predict_proba(self, test_df): mode_proba, replaced_proba = self._try_predict_proba_mode_replaced() - except NotFittedError as e: + except sklex.NotFittedError as e: # if we can't predict purpose, we can still try to predict mode and # replaced-mode without one-hot encoding the purpose @@ -1277,7 +1276,7 @@ def predict_proba(self, test_df): and replaced_pred.dtype == np.float64): # this indicates that all the predictions are np.nan so none of the # random forest classifiers were fitted - raise NotFittedError + raise sklex.NotFittedError # TODO: move this to a Mixin for cluster-based predictors and use the # 'cluster' column of the proba_df outputs @@ -1379,7 +1378,7 @@ class probabilities for mode and replaced-mode respectively [self.X_test_for_mode, onehot_mode_df], axis=1) replaced_proba = self._try_predict_proba_replaced() - except NotFittedError as e: + except sklex.NotFittedError as e: mode_proba_raw = np.full((len(self.X_test_for_mode), 1), 0) mode_proba = pd.DataFrame(mode_proba_raw, columns=[np.nan]) @@ -1409,7 +1408,7 @@ def _try_predict_proba_replaced(self): replaced_proba = pd.DataFrame( replaced_proba_raw, columns=self.replaced_predictor.classes_) - except NotFittedError as e: + except sklex.NotFittedError as e: replaced_proba_raw = np.full((len(self.X_test_for_replaced), 1), 0) replaced_proba = pd.DataFrame(replaced_proba_raw, columns=[np.nan]) @@ -1436,7 +1435,7 @@ def _clusterable(self, test_df): # haversine distance, so we have to reimplement it ourselves new_loc_radians = np.radians(row[["end_lat", "end_lon"]].to_list()) new_loc_radians = np.reshape(new_loc_radians, (1, 2)) - dist_matrix_meters = haversine_distances( + dist_matrix_meters = sklmp.haversine_distances( new_loc_radians, train_radians) * EARTH_RADIUS shortest_dist = np.min(dist_matrix_meters) @@ -1569,7 +1568,7 @@ def __init__( handle_unknown='error') # ensemble classifiers for each label category - self.purpose_predictor = RandomForestClassifier( + self.purpose_predictor = sklen.RandomForestClassifier( n_estimators=self.n_estimators, criterion=self.criterion, max_depth=self.max_depth, @@ -1578,7 +1577,7 @@ def __init__( max_features=self.max_features, bootstrap=self.bootstrap, random_state=self.random_state) - self.mode_predictor = RandomForestClassifier( + self.mode_predictor = sklen.RandomForestClassifier( n_estimators=self.n_estimators, criterion=self.criterion, max_depth=self.max_depth, @@ -1587,7 +1586,7 @@ def __init__( max_features=self.max_features, bootstrap=self.bootstrap, random_state=self.random_state) - self.replaced_predictor = RandomForestClassifier( + self.replaced_predictor = sklen.RandomForestClassifier( n_estimators=self.n_estimators, criterion=self.criterion, max_depth=self.max_depth, @@ -1842,33 +1841,33 @@ def __init__( sparse=False, handle_unknown='error') - self.purpose_predictor = AdaBoostClassifier( + self.purpose_predictor = sklen.AdaBoostClassifier( n_estimators=self.n_estimators, learning_rate=self.learning_rate, random_state=self.random_state, - base_estimator=DecisionTreeClassifier( + base_estimator=sklt.DecisionTreeClassifier( criterion=self.criterion, max_depth=self.max_depth, min_samples_split=self.min_samples_split, min_samples_leaf=self.min_samples_leaf, max_features=self.max_features, random_state=self.random_state)) - self.mode_predictor = AdaBoostClassifier( + self.mode_predictor = sklen.AdaBoostClassifier( n_estimators=self.n_estimators, learning_rate=self.learning_rate, random_state=self.random_state, - base_estimator=DecisionTreeClassifier( + base_estimator=sklt.DecisionTreeClassifier( criterion=self.criterion, max_depth=self.max_depth, min_samples_split=self.min_samples_split, min_samples_leaf=self.min_samples_leaf, max_features=self.max_features, random_state=self.random_state)) - self.replaced_predictor = AdaBoostClassifier( + self.replaced_predictor = sklen.AdaBoostClassifier( n_estimators=self.n_estimators, learning_rate=self.learning_rate, random_state=self.random_state, - base_estimator=DecisionTreeClassifier( + base_estimator=sklt.DecisionTreeClassifier( criterion=self.criterion, max_depth=self.max_depth, min_samples_split=self.min_samples_split, @@ -2024,13 +2023,13 @@ def __init__( ): self.impute_missing = impute_missing if self.impute_missing: - self.encoder = make_pipeline( - SimpleImputer(missing_values=np.nan, + self.encoder = sklpl.make_pipeline( + skli.SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='missing'), - OneHotEncoder(sparse=False, handle_unknown=handle_unknown)) + sklpp.OneHotEncoder(sparse=False, handle_unknown=handle_unknown)) else: - self.encoder = OneHotEncoder(sparse=sparse, + self.encoder = sklpp.OneHotEncoder(sparse=sparse, handle_unknown=handle_unknown) def fit_transform(self, train_df, output_col_prefix=None): diff --git a/emission/analysis/modelling/trip_model/run_model.py b/emission/analysis/modelling/trip_model/run_model.py index cfee60464..a1a48ef12 100644 --- a/emission/analysis/modelling/trip_model/run_model.py +++ b/emission/analysis/modelling/trip_model/run_model.py @@ -56,7 +56,9 @@ def update_trip_model( time_query = time_query_from_pipeline if model.is_incremental else None logging.debug(f'model type {model_type.name} is incremental? {model.is_incremental}') logging.debug(f'time query for training data collection: {time_query}') + trips = _get_training_data(user_id, time_query) + # don't start training for a user that doesn't have at least $trips many trips # (assume if a stored model exists for the user, that they met this requirement previously) if len(trips) == 0: diff --git a/emission/analysis/modelling/trip_model/util.py b/emission/analysis/modelling/trip_model/util.py index b3da1d4a1..c9fc7e266 100644 --- a/emission/analysis/modelling/trip_model/util.py +++ b/emission/analysis/modelling/trip_model/util.py @@ -4,11 +4,14 @@ import pandas as pd from numpy.linalg import norm + def find_knee_point(values: List[float]) -> Tuple[float, int]: """for a list of values, find the value which represents the cut-off point or "elbow" in the function when values are sorted. + copied from original similarity algorithm. permalink: [https://github.com/e-mission/e-mission-server/blob/5b9e608154de15e32df4f70a07a5b95477e7dbf5/emission/analysis/modelling/tour_model/similarity.py#L256] + with `y` passed in as `values` based on this stack overflow answer: https://stackoverflow.com/a/2022348/4803266 And summarized by the statement: "A quick way of finding the elbow is to draw a diff --git a/emission/tests/modellingTests/TestForestModelIntegration.py b/emission/tests/modellingTests/TestForestModelIntegration.py index 88813b5c4..6677221a3 100644 --- a/emission/tests/modellingTests/TestForestModelIntegration.py +++ b/emission/tests/modellingTests/TestForestModelIntegration.py @@ -12,7 +12,7 @@ import emission.tests.common as etc import emission.pipeline.intake_stage as epi import logging -from bson.objectid import ObjectId +import bson.objectid as boi import emission.analysis.modelling.trip_model.config as eamtc @@ -60,8 +60,8 @@ def setUp(self): for result_entry in train: result_entry['data']['start_local_dt']=result_entry['metadata']['write_local_dt'] result_entry['data']['end_local_dt']=result_entry['metadata']['write_local_dt'] - result_entry['data']['start_place']=ObjectId() - result_entry['data']['end_place']=ObjectId() + result_entry['data']['start_place']=boi.ObjectId() + result_entry['data']['end_place']=boi.ObjectId() ts.bulk_insert(train) # confirm data write did not fail check_data = esda.get_entries(key="analysis/confirmed_trip", user_id=self.testUUID, time_query=None) diff --git a/emission/tests/modellingTests/TestForestModelLoadandSave.py b/emission/tests/modellingTests/TestForestModelLoadandSave.py index 079bc908b..04e7f691a 100644 --- a/emission/tests/modellingTests/TestForestModelLoadandSave.py +++ b/emission/tests/modellingTests/TestForestModelLoadandSave.py @@ -1,7 +1,7 @@ from typing import ByteString import unittest import logging -from unittest.mock import patch +import unittest.mock as um import emission.analysis.modelling.trip_model.run_model as eamur import emission.analysis.modelling.trip_model.model_type as eamumt import emission.analysis.modelling.trip_model.model_storage as eamums @@ -220,7 +220,7 @@ def mock_dump(*args,**kwargs): # side_effect, which is set to mock_dump, is called instead of # real joblib.dump function when 'to_dict' is invoked - with patch('joblib.dump',side_effect=mock_dump): + with um.patch('joblib.dump',side_effect=mock_dump): with self.assertRaises(RuntimeError): model.to_dict() @@ -260,7 +260,7 @@ def mock_load(*args,**kwargs): # side_effect, which is set to mock_load, is called instead of # real joblib.load function when 'to_dict' is invoked - with patch('joblib.load',side_effect=mock_load): + with um.patch('joblib.load',side_effect=mock_load): with self.assertRaises(RuntimeError): deserialized_model.from_dict(model_data) diff --git a/emission/tests/modellingTests/TestRunForestModel.py b/emission/tests/modellingTests/TestRunForestModel.py index 8c6cd1650..672775483 100644 --- a/emission/tests/modellingTests/TestRunForestModel.py +++ b/emission/tests/modellingTests/TestRunForestModel.py @@ -12,7 +12,7 @@ import emission.storage.pipeline_queries as epq import emission.core.wrapper.pipelinestate as ecwp import emission.analysis.modelling.trip_model.forest_classifier as eamtf -from sklearn.ensemble import RandomForestClassifier +import sklearn.ensemble as sklen class TestRunForestModel(unittest.TestCase): """ @@ -98,9 +98,9 @@ def testBuildForestModelFromConfig(self): """ built_model = eamumt.ModelType.RANDOM_FOREST_CLASSIFIER.build() - attributes={'purpose_predictor': RandomForestClassifier , - 'mode_predictor' :RandomForestClassifier, - 'replaced_predictor':RandomForestClassifier, + attributes={'purpose_predictor': sklen.RandomForestClassifier , + 'mode_predictor' :sklen.RandomForestClassifier, + 'replaced_predictor':sklen.RandomForestClassifier, 'purpose_enc' : eamtm.OneHotWrapper, 'mode_enc':eamtm.OneHotWrapper }