Import format changed + Whitespace / Newline fixes
1. Newline fixes - restored the previously removed blank lines
   a. util.py
      e-mission#938 (comment)

   b. run_model.py
      e-mission#938 (comment)

2. Import format changed to "import X as Y" instead of "from X import Y"
   Files: clustering.py, models.py, TestForestModelIntegration.py, TestForestModelLoadandSave.py, TestRunForestModel.py
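
To illustrate the convention change (a minimal sketch, not code from this commit; the parameter values are arbitrary and the alias name just follows the repo's pattern):

```python
# Before: the class is pulled into the local namespace, so the call site
# gives no hint of where DBSCAN comes from.
from sklearn.cluster import DBSCAN
model = DBSCAN(eps=100, metric="precomputed")

# After: the module is bound to a short alias, so every call site names
# its source module explicitly.
import sklearn.cluster as sklc
model = sklc.DBSCAN(eps=100, metric="precomputed")
```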
Mahadik, Mukul Chandrakant authored and Mahadik, Mukul Chandrakant committed Nov 18, 2024
1 parent ad968de commit de219ab
Showing 7 changed files with 64 additions and 62 deletions.
30 changes: 15 additions & 15 deletions emission/analysis/modelling/trip_model/clustering.py
@@ -6,12 +6,12 @@
import logging

# import clustering algorithms
-import sklearn.metrics.pairwise as smp
-import sklearn.cluster as sc
-from sklearn import metrics
-from sklearn import svm
-from sklearn.pipeline import make_pipeline
-from sklearn.preprocessing import StandardScaler
+import sklearn.metrics.pairwise as sklmp
+import sklearn.cluster as sklc
+import sklearn.metrics.cluster as sklmc
+import sklearn.svm as skls
+import sklearn.pipeline as sklpl
+import sklearn.preprocessing as sklpp

# our imports
# NOTE: this requires changing the branch of e-mission-server to
@@ -102,7 +102,7 @@ def add_loc_clusters(
dist_matrix_meters = get_distance_matrix(loc_df, loc_type)

for r in radii:
-model = sc.DBSCAN(r, metric="precomputed",
+model = sklc.DBSCAN(r, metric="precomputed",
min_samples=min_samples).fit(dist_matrix_meters)
labels = model.labels_
# print(model.n_features_in_)
@@ -150,7 +150,7 @@ def add_loc_clusters(
dist_matrix_meters = get_distance_matrix(loc_df, loc_type)

for r in radii:
-labels = sc.OPTICS(
+labels = sklc.OPTICS(
min_samples=optics_min_samples,
max_eps=r,
xi=optics_xi,
@@ -178,7 +178,7 @@ def add_loc_clusters(
dist_matrix_meters = get_distance_matrix(p_loc_df, loc_type)

for r in radii:
-labels = sc.DBSCAN(
+labels = sklc.DBSCAN(
r, metric="precomputed",
min_samples=min_samples).fit(dist_matrix_meters).labels_

@@ -231,7 +231,7 @@ def add_loc_clusters(
# what the bandwidth roughly corresponds to in the real world/make
# the value a little more interpretable.
LATLON_TO_M = 1 / 111139
-labels = sc.MeanShift(
+labels = sklc.MeanShift(
bandwidth=LATLON_TO_M * r,
min_bin_freq=min_samples,
cluster_all=False,
@@ -325,9 +325,9 @@ def add_loc_SVM(loc_df,
]]
y_train = labeled_points_in_cluster.purpose_confirm.to_list()

-labels = make_pipeline(
-StandardScaler(),
-svm.SVC(
+labels = sklpl.make_pipeline(
+sklpp.StandardScaler(),
+skls.SVC(
kernel='rbf',
gamma=svm_gamma,
C=svm_C,
@@ -381,7 +381,7 @@ def get_distance_matrix(loc_df, loc_type):
radians_lat_lon = np.radians(loc_df[[loc_type + "_lat", loc_type + "_lon"]])

dist_matrix_meters = pd.DataFrame(
-smp.haversine_distances(radians_lat_lon, radians_lat_lon) *
+sklmp.haversine_distances(radians_lat_lon, radians_lat_lon) *
EARTH_RADIUS)
return dist_matrix_meters

@@ -404,7 +404,7 @@ def single_cluster_purity(points_in_cluster, label_col='purpose_confirm'):


def purity_score(y_true, y_pred):
-contingency_matrix = metrics.cluster.contingency_matrix(y_true, y_pred)
+contingency_matrix = sklmc.contingency_matrix(y_true, y_pred)
purity = np.sum(np.amax(contingency_matrix,
axis=0)) / np.sum(contingency_matrix)
return purity
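
For reference, `sklmp.haversine_distances` (used in `get_distance_matrix` above) expects coordinates in radians and returns great-circle distances on a unit sphere, which the code scales by the Earth's radius to get meters. A minimal standalone sketch (the `EARTH_RADIUS` value here is an assumption for illustration, not taken from this diff):

```python
import numpy as np
import sklearn.metrics.pairwise as sklmp

EARTH_RADIUS = 6371000  # meters; assumed value for this sketch

# Two (lat, lon) points in degrees: San Francisco and Los Angeles.
points = np.radians([[37.7749, -122.4194],
                     [34.0522, -118.2437]])

# haversine_distances returns unit-sphere distances; scale to meters.
dist_matrix_meters = sklmp.haversine_distances(points, points) * EARTH_RADIUS
print(round(dist_matrix_meters[0][1] / 1000))  # ~559 (km)
```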
73 changes: 36 additions & 37 deletions emission/analysis/modelling/trip_model/models.py
@@ -5,18 +5,18 @@
import copy

# sklearn imports
-from sklearn.pipeline import make_pipeline
-from sklearn.preprocessing import StandardScaler, OneHotEncoder
-from sklearn.impute import SimpleImputer
-from sklearn.metrics.pairwise import haversine_distances
-from sklearn.cluster import DBSCAN
-from sklearn import svm
-from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
-from sklearn.tree import DecisionTreeClassifier
-from sklearn.exceptions import NotFittedError
+import sklearn.pipeline as sklpl
+import sklearn.preprocessing as sklpp
+import sklearn.impute as skli
+import sklearn.metrics.pairwise as sklmp
+import sklearn.cluster as sklc
+import sklearn.svm as skls
+import sklearn.ensemble as sklen
+import sklearn.tree as sklt
+import sklearn.exceptions as sklex

# our imports
-from emission.analysis.modelling.trip_model.clustering import get_distance_matrix, single_cluster_purity
+import emission.analysis.modelling.trip_model.clustering as eamtcl
import emission.analysis.modelling.trip_model.data_wrangling as eamtd
import emission.storage.decorations.trip_queries as esdtq
import emission.analysis.classification.inference.labels.inferrers as eacili
@@ -28,7 +28,6 @@
import emission.analysis.modelling.trip_model.run_model as eamur


-import emission.analysis.modelling.trip_model.clustering as eamtc
# NOTE: tour_model_extended.similarity is on the
# eval-private-data-compatibility branch in e-mission-server

@@ -334,7 +333,7 @@ def fit(self, unused,train_entry_list=None):

# fit the bins
self.sim_model= eamtg.GreedySimilarityBinning(model_config)
-cleaned_trip_entry= eamtc.cleanEntryTypeData(self.train_df,train_entry_list)
+cleaned_trip_entry= eamtcl.cleanEntryTypeData(self.train_df,train_entry_list)
self.sim_model.fit(cleaned_trip_entry)

labels = [int(l) for l in self.sim_model.tripLabels]
@@ -526,8 +525,8 @@ def fit(self, train_df,unused=None):
#########################
### get base clusters ###
#########################
-dist_matrix_meters = get_distance_matrix(self.train_df, self.loc_type)
-self.base_model = DBSCAN(self.radius,
+dist_matrix_meters = eamtcl.get_distance_matrix(self.train_df, self.loc_type)
+self.base_model = sklc.DBSCAN(self.radius,
metric="precomputed",
min_samples=1).fit(dist_matrix_meters)
base_clusters = self.base_model.labels_
@@ -557,17 +556,17 @@ def fit(self, train_df,unused=None):
continue

# only do SVM if purity is below threshold
-purity = single_cluster_purity(points_in_cluster,
+purity = eamtcl.single_cluster_purity(points_in_cluster,
label_col='purpose_true')
if purity < self.purity_thresh:
X = points_in_cluster[[
f"{self.loc_type}_lon", f"{self.loc_type}_lat"
]]
y = points_in_cluster.purpose_true.to_list()

-svm_model = make_pipeline(
-StandardScaler(),
-svm.SVC(
+svm_model = sklpl.make_pipeline(
+sklpp.StandardScaler(),
+skls.SVC(
kernel='rbf',
gamma=self.gamma,
C=self.C,
@@ -655,7 +654,7 @@ def _NN_predict(self, test_df):
new_loc_radians = np.radians(
row[[self.loc_type + "_lat", self.loc_type + "_lon"]].to_list())
new_loc_radians = np.reshape(new_loc_radians, (1, 2))
-dist_matrix_meters = haversine_distances(
+dist_matrix_meters = sklmp.haversine_distances(
new_loc_radians, train_radians) * EARTH_RADIUS

shortest_dist_idx = np.argmin(dist_matrix_meters)
@@ -1259,7 +1258,7 @@ def predict_proba(self, test_df):

mode_proba, replaced_proba = self._try_predict_proba_mode_replaced()

-except NotFittedError as e:
+except sklex.NotFittedError as e:
# if we can't predict purpose, we can still try to predict mode and
# replaced-mode without one-hot encoding the purpose

@@ -1277,7 +1276,7 @@ def predict_proba(self, test_df):
and replaced_pred.dtype == np.float64):
# this indicates that all the predictions are np.nan so none of the
# random forest classifiers were fitted
-raise NotFittedError
+raise sklex.NotFittedError

# TODO: move this to a Mixin for cluster-based predictors and use the
# 'cluster' column of the proba_df outputs
@@ -1379,7 +1378,7 @@ class probabilities for mode and replaced-mode respectively
[self.X_test_for_mode, onehot_mode_df], axis=1)
replaced_proba = self._try_predict_proba_replaced()

-except NotFittedError as e:
+except sklex.NotFittedError as e:
mode_proba_raw = np.full((len(self.X_test_for_mode), 1), 0)
mode_proba = pd.DataFrame(mode_proba_raw, columns=[np.nan])

@@ -1409,7 +1408,7 @@ def _try_predict_proba_replaced(self):
replaced_proba = pd.DataFrame(
replaced_proba_raw, columns=self.replaced_predictor.classes_)

-except NotFittedError as e:
+except sklex.NotFittedError as e:
replaced_proba_raw = np.full((len(self.X_test_for_replaced), 1), 0)
replaced_proba = pd.DataFrame(replaced_proba_raw, columns=[np.nan])

@@ -1436,7 +1435,7 @@ def _clusterable(self, test_df):
# haversine distance, so we have to reimplement it ourselves
new_loc_radians = np.radians(row[["end_lat", "end_lon"]].to_list())
new_loc_radians = np.reshape(new_loc_radians, (1, 2))
-dist_matrix_meters = haversine_distances(
+dist_matrix_meters = sklmp.haversine_distances(
new_loc_radians, train_radians) * EARTH_RADIUS

shortest_dist = np.min(dist_matrix_meters)
@@ -1569,7 +1568,7 @@ def __init__(
handle_unknown='error')

# ensemble classifiers for each label category
-self.purpose_predictor = RandomForestClassifier(
+self.purpose_predictor = sklen.RandomForestClassifier(
n_estimators=self.n_estimators,
criterion=self.criterion,
max_depth=self.max_depth,
@@ -1578,7 +1577,7 @@ def __init__(
max_features=self.max_features,
bootstrap=self.bootstrap,
random_state=self.random_state)
-self.mode_predictor = RandomForestClassifier(
+self.mode_predictor = sklen.RandomForestClassifier(
n_estimators=self.n_estimators,
criterion=self.criterion,
max_depth=self.max_depth,
@@ -1587,7 +1586,7 @@ def __init__(
max_features=self.max_features,
bootstrap=self.bootstrap,
random_state=self.random_state)
-self.replaced_predictor = RandomForestClassifier(
+self.replaced_predictor = sklen.RandomForestClassifier(
n_estimators=self.n_estimators,
criterion=self.criterion,
max_depth=self.max_depth,
@@ -1842,33 +1841,33 @@ def __init__(
sparse=False,
handle_unknown='error')

-self.purpose_predictor = AdaBoostClassifier(
+self.purpose_predictor = sklen.AdaBoostClassifier(
n_estimators=self.n_estimators,
learning_rate=self.learning_rate,
random_state=self.random_state,
-base_estimator=DecisionTreeClassifier(
+base_estimator=sklt.DecisionTreeClassifier(
criterion=self.criterion,
max_depth=self.max_depth,
min_samples_split=self.min_samples_split,
min_samples_leaf=self.min_samples_leaf,
max_features=self.max_features,
random_state=self.random_state))
-self.mode_predictor = AdaBoostClassifier(
+self.mode_predictor = sklen.AdaBoostClassifier(
n_estimators=self.n_estimators,
learning_rate=self.learning_rate,
random_state=self.random_state,
-base_estimator=DecisionTreeClassifier(
+base_estimator=sklt.DecisionTreeClassifier(
criterion=self.criterion,
max_depth=self.max_depth,
min_samples_split=self.min_samples_split,
min_samples_leaf=self.min_samples_leaf,
max_features=self.max_features,
random_state=self.random_state))
-self.replaced_predictor = AdaBoostClassifier(
+self.replaced_predictor = sklen.AdaBoostClassifier(
n_estimators=self.n_estimators,
learning_rate=self.learning_rate,
random_state=self.random_state,
-base_estimator=DecisionTreeClassifier(
+base_estimator=sklt.DecisionTreeClassifier(
criterion=self.criterion,
max_depth=self.max_depth,
min_samples_split=self.min_samples_split,
@@ -2024,13 +2023,13 @@ def __init__(
):
self.impute_missing = impute_missing
if self.impute_missing:
-self.encoder = make_pipeline(
-SimpleImputer(missing_values=np.nan,
+self.encoder = sklpl.make_pipeline(
+skli.SimpleImputer(missing_values=np.nan,
strategy='constant',
fill_value='missing'),
-OneHotEncoder(sparse=False, handle_unknown=handle_unknown))
+sklpp.OneHotEncoder(sparse=False, handle_unknown=handle_unknown))
else:
-self.encoder = OneHotEncoder(sparse=sparse,
+self.encoder = sklpp.OneHotEncoder(sparse=sparse,
handle_unknown=handle_unknown)

def fit_transform(self, train_df, output_col_prefix=None):
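
The `OneHotWrapper` hunk above keeps the same impute-then-encode pipeline, just spelled with module aliases. A minimal standalone sketch of that pattern (illustrative data; `sparse=False` matches the older scikit-learn API this repo uses, which newer releases rename to `sparse_output`):

```python
import numpy as np
import pandas as pd
import sklearn.pipeline as sklpl
import sklearn.impute as skli
import sklearn.preprocessing as sklpp

# Replace missing categories with the literal string 'missing',
# then one-hot encode the resulting column.
encoder = sklpl.make_pipeline(
    skli.SimpleImputer(missing_values=np.nan, strategy='constant',
                       fill_value='missing'),
    sklpp.OneHotEncoder(sparse=False, handle_unknown='ignore'))

train_df = pd.DataFrame({'purpose': ['work', np.nan, 'shopping']})
print(encoder.fit_transform(train_df))
# -> one column each for 'missing', 'shopping', 'work'
```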
2 changes: 2 additions & 0 deletions emission/analysis/modelling/trip_model/run_model.py
@@ -56,7 +56,9 @@ def update_trip_model(
time_query = time_query_from_pipeline if model.is_incremental else None
logging.debug(f'model type {model_type.name} is incremental? {model.is_incremental}')
logging.debug(f'time query for training data collection: {time_query}')
+
trips = _get_training_data(user_id, time_query)
+
# don't start training for a user that doesn't have at least $trips many trips
# (assume if a stored model exists for the user, that they met this requirement previously)
if len(trips) == 0:
1 change: 1 addition & 0 deletions emission/analysis/modelling/trip_model/util.py
@@ -4,6 +4,7 @@
import pandas as pd
from numpy.linalg import norm

+
def find_knee_point(values: List[float]) -> Tuple[float, int]:
"""for a list of values, find the value which represents the cut-off point
or "elbow" in the function when values are sorted.
6 changes: 3 additions & 3 deletions emission/tests/modellingTests/TestForestModelIntegration.py
@@ -12,7 +12,7 @@
import emission.tests.common as etc
import emission.pipeline.intake_stage as epi
import logging
-from bson.objectid import ObjectId
+import bson.objectid as boi

import emission.analysis.modelling.trip_model.config as eamtc

@@ -60,8 +60,8 @@ def setUp(self):
for result_entry in train:
result_entry['data']['start_local_dt']=result_entry['metadata']['write_local_dt']
result_entry['data']['end_local_dt']=result_entry['metadata']['write_local_dt']
-result_entry['data']['start_place']=ObjectId()
-result_entry['data']['end_place']=ObjectId()
+result_entry['data']['start_place']=boi.ObjectId()
+result_entry['data']['end_place']=boi.ObjectId()
ts.bulk_insert(train)
# confirm data write did not fail
check_data = esda.get_entries(key="analysis/confirmed_trip", user_id=self.testUUID, time_query=None)
6 changes: 3 additions & 3 deletions emission/tests/modellingTests/TestForestModelLoadandSave.py
@@ -1,7 +1,7 @@
from typing import ByteString
import unittest
import logging
-from unittest.mock import patch
+import unittest.mock as um
import emission.analysis.modelling.trip_model.run_model as eamur
import emission.analysis.modelling.trip_model.model_type as eamumt
import emission.analysis.modelling.trip_model.model_storage as eamums
@@ -220,7 +220,7 @@ def mock_dump(*args,**kwargs):
# side_effect, which is set to mock_dump, is called instead of
# real joblib.dump function when 'to_dict' is invoked

-with patch('joblib.dump',side_effect=mock_dump):
+with um.patch('joblib.dump',side_effect=mock_dump):
with self.assertRaises(RuntimeError):
model.to_dict()

@@ -260,7 +260,7 @@ def mock_load(*args,**kwargs):
# side_effect, which is set to mock_load, is called instead of
# real joblib.load function when 'to_dict' is invoked

-with patch('joblib.load',side_effect=mock_load):
+with um.patch('joblib.load',side_effect=mock_load):
with self.assertRaises(RuntimeError):
deserialized_model.from_dict(model_data)

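
The `um.patch` hunks above swap `joblib.dump` / `joblib.load` for mocks whose `side_effect` raises, letting the tests assert that serialization failures surface as `RuntimeError`. A minimal standalone sketch of the same pattern (hypothetical test, not from this diff):

```python
import unittest
import unittest.mock as um
import joblib

class TestMockedDump(unittest.TestCase):
    def test_dump_failure_raises(self):
        def mock_dump(*args, **kwargs):
            raise RuntimeError("simulated joblib.dump failure")

        # While patched, any call to joblib.dump invokes mock_dump instead.
        with um.patch('joblib.dump', side_effect=mock_dump):
            with self.assertRaises(RuntimeError):
                joblib.dump({'model': 'state'}, '/tmp/never_written.pkl')

if __name__ == '__main__':
    unittest.main()
```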
8 changes: 4 additions & 4 deletions emission/tests/modellingTests/TestRunForestModel.py
@@ -12,7 +12,7 @@
import emission.storage.pipeline_queries as epq
import emission.core.wrapper.pipelinestate as ecwp
import emission.analysis.modelling.trip_model.forest_classifier as eamtf
-from sklearn.ensemble import RandomForestClassifier
+import sklearn.ensemble as sklen

class TestRunForestModel(unittest.TestCase):
"""
@@ -98,9 +98,9 @@ def testBuildForestModelFromConfig(self):
"""

built_model = eamumt.ModelType.RANDOM_FOREST_CLASSIFIER.build()
-attributes={'purpose_predictor': RandomForestClassifier ,
-'mode_predictor' :RandomForestClassifier,
-'replaced_predictor':RandomForestClassifier,
+attributes={'purpose_predictor': sklen.RandomForestClassifier ,
+'mode_predictor' :sklen.RandomForestClassifier,
+'replaced_predictor':sklen.RandomForestClassifier,
'purpose_enc' : eamtm.OneHotWrapper,
'mode_enc':eamtm.OneHotWrapper
}
