diff --git a/TRB_label_assist/SVM_decision_boundaries.ipynb b/TRB_label_assist/SVM_decision_boundaries.ipynb index 5ed5376..407aee3 100644 --- a/TRB_label_assist/SVM_decision_boundaries.ipynb +++ b/TRB_label_assist/SVM_decision_boundaries.ipynb @@ -30,6 +30,7 @@ "import emission.storage.timeseries.abstract_timeseries as esta\n", "import emission.storage.decorations.trip_queries as esdtq\n", "import emission.core.get_database as edb\n", + "import emission.analysis.modelling.trip_model.run_model as eamtr\n", "\n", "import data_wrangling\n", "from clustering import add_loc_clusters" @@ -60,10 +61,12 @@ "uuids = [suburban_uuid, college_campus_uuid]\n", "confirmed_trip_df_map = {}\n", "labeled_trip_df_map = {}\n", + "ct_entry={}\n", "expanded_trip_df_map = {}\n", "for u in uuids:\n", " ts = esta.TimeSeries.get_time_series(u)\n", - " ct_df = ts.get_data_df(\"analysis/confirmed_trip\")\n", + " ct_entry[u]=eamtr._get_training_data(u,None)\n", + " ct_df = ts.to_data_df(\"analysis/confirmed_trip\",ct_entry[u])\n", " confirmed_trip_df_map[u] = ct_df\n", " labeled_trip_df_map[u] = esdtq.filter_labeled_trips(ct_df)\n", " expanded_trip_df_map[u] = esdtq.expand_userinputs(labeled_trip_df_map[u])" @@ -110,6 +113,8 @@ " df_for_cluster = all_trips_df if cluster_unlabeled else labeled_trips_df\n", "\n", " df_for_cluster = add_loc_clusters(df_for_cluster,\n", + " ct_entry,\n", + " clustering_way='destination',\n", " radii=radii,\n", " alg=alg,\n", " loc_type=loc_type,\n", diff --git a/TRB_label_assist/classification_performance.ipynb b/TRB_label_assist/classification_performance.ipynb index 60d1300..6e61d79 100644 --- a/TRB_label_assist/classification_performance.ipynb +++ b/TRB_label_assist/classification_performance.ipynb @@ -19,7 +19,6 @@ "import pandas as pd\n", "import numpy as np\n", "from uuid import UUID\n", - "\n", "import matplotlib.pyplot as plt\n", "\n", "# import logging\n", @@ -27,7 +26,7 @@ "\n", "import emission.storage.timeseries.abstract_timeseries as esta\n", 
"import emission.storage.decorations.trip_queries as esdtq\n", - "\n", + "import emission.analysis.modelling.trip_model.run_model as eamtr\n", "from performance_eval import get_clf_metrics, cv_for_all_algs, PREDICTORS" ] }, @@ -49,10 +48,11 @@ "labeled_trip_df_map = {}\n", "expanded_labeled_trip_df_map = {}\n", "expanded_all_trip_df_map = {}\n", + "ct_entry={}\n", "for u in all_users:\n", " ts = esta.TimeSeries.get_time_series(u)\n", - " ct_df = ts.get_data_df(\"analysis/confirmed_trip\")\n", - "\n", + " ct_entry[u]=eamtr._get_training_data(u,None)\n", + " ct_df = ts.to_data_df(\"analysis/confirmed_trip\",ct_entry[u])\n", " confirmed_trip_df_map[u] = ct_df\n", " labeled_trip_df_map[u] = esdtq.filter_labeled_trips(ct_df)\n", " expanded_labeled_trip_df_map[u] = esdtq.expand_userinputs(\n", @@ -132,6 +132,7 @@ "# load in all runs\n", "model_names = list(PREDICTORS.keys())\n", "cv_results = cv_for_all_algs(\n", + " ct_entry,\n", " uuid_list=all_users,\n", " expanded_trip_df_map=expanded_labeled_trip_df_map,\n", " model_names=model_names,\n", diff --git a/TRB_label_assist/cluster_performance.ipynb b/TRB_label_assist/cluster_performance.ipynb index b6eed6d..81c088d 100644 --- a/TRB_label_assist/cluster_performance.ipynb +++ b/TRB_label_assist/cluster_performance.ipynb @@ -15,11 +15,10 @@ "source": [ "%load_ext autoreload\n", "%autoreload 2\n", - "\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "from matplotlib.gridspec import GridSpec\n", - "\n", + "import emission.analysis.modelling.trip_model.run_model as eamtr\n", "import emission.storage.timeseries.abstract_timeseries as esta\n", "import emission.storage.decorations.trip_queries as esdtq\n", "import performance_eval\n", @@ -45,10 +44,11 @@ "labeled_trip_df_map = {}\n", "expanded_labeled_trip_df_map = {}\n", "expanded_all_trip_df_map = {}\n", + "ct_entry={}\n", "for u in all_users:\n", " ts = esta.TimeSeries.get_time_series(u)\n", - " ct_df = ts.get_data_df(\"analysis/confirmed_trip\")\n", - "\n", + 
" ct_entry[u]=eamtr._get_training_data(u,None) \n", + " ct_df = ts.to_data_df(\"analysis/confirmed_trip\",ct_entry[u]) \n", " confirmed_trip_df_map[u] = ct_df\n", " labeled_trip_df_map[u] = esdtq.filter_labeled_trips(ct_df)\n", " expanded_labeled_trip_df_map[u] = esdtq.expand_userinputs(\n", @@ -87,6 +87,8 @@ "\n", " all_results_df = performance_eval.run_eval_cluster_metrics(\n", " expanded_labeled_trip_df_map,\n", + " ct_entry,\n", + " clustering_way='destination',\n", " user_list=all_users,\n", " radii=radii,\n", " loc_type='end',\n", @@ -265,6 +267,8 @@ "\n", "SVM_results_df = performance_eval.run_eval_cluster_metrics(\n", " expanded_labeled_trip_df_map,\n", + " ct_entry,\n", + " clustering_way=\"destination\",\n", " user_list=all_users,\n", " radii=radii,\n", " loc_type='end',\n", diff --git a/TRB_label_assist/clustering.py b/TRB_label_assist/clustering.py index fbe8a3b..d3924f3 100644 --- a/TRB_label_assist/clustering.py +++ b/TRB_label_assist/clustering.py @@ -16,8 +16,8 @@ # our imports # NOTE: this requires changing the branch of e-mission-server to # eval-private-data-compatibility -import emission.analysis.modelling.tour_model_extended.similarity as eamts import emission.storage.decorations.trip_queries as esdtq +import emission.analysis.modelling.trip_model.greedy_similarity_binning as eamtg EARTH_RADIUS = 6371000 ALG_OPTIONS = [ @@ -28,9 +28,27 @@ 'mean_shift' ] +def cleanEntryTypeData(loc_df,trip_entry): + + """ + Helps weed out entries from the list of entries which were removed from the df using + esdtq.filter_labeled_trips() and esdtq.expand_userinputs() + + loc_df : dataframe amde from entry type data + trip_entry : the entry type equivalent of loc_df , + which was passed alongside the dataframe while loading the data + + """ + + ids_in_df=loc_df['_id'] + filtered_trip_entry = list(filter(lambda entry: entry['_id'] in ids_in_df.values, trip_entry)) + return filtered_trip_entry + def add_loc_clusters( loc_df, + trip_entry, + clustering_way, radii, 
loc_type, alg, @@ -53,6 +71,9 @@ def add_loc_clusters( Args: loc_df (dataframe): must have columns 'start_lat' and 'start_lon' or 'end_lat' and 'end_lon' + trip_entry ( list of Entry/confirmedTrip): list consisting of all entries from the + time the data was loaded. loc_df was obtained from this by converting to df and + then filtering out labeled trips and expanding user_inputs radii (int list): list of radii to run the clustering algs with loc_type (str): 'start' or 'end' alg (str): 'DBSCAN', 'naive', 'OPTICS', 'SVM', 'fuzzy', or @@ -98,19 +119,25 @@ def add_loc_clusters( loc_df.loc[:, f"{loc_type}_DBSCAN_clusters_{r}_m"] = labels elif alg == 'naive': + + cleaned_trip_entry= cleanEntryTypeData(loc_df,trip_entry) + for r in radii: # this is using a modified Similarity class that bins start/end # points separately before creating trip-level bins - sim_model = eamts.Similarity(loc_df, - radius_start=r, - radius_end=r, - shouldFilter=False, - cutoff=False) - # we only bin the loc_type points to speed up the alg. 
avoid - # unnecessary binning since this is really slow - sim_model.bin_helper(loc_type=loc_type) - labels = sim_model.data_df[loc_type + '_bin'].to_list() + model_config = { + "metric": "od_similarity", + "similarity_threshold_meters": r, # meters, + "apply_cutoff": False, + "clustering_way": clustering_way, + "shouldFilter":False, + "incremental_evaluation": False + } + + sim_model = eamtg.GreedySimilarityBinning(model_config) + sim_model.fit(cleaned_trip_entry) + labels = [int(l) for l in sim_model.tripLabels] # # pd.Categorical converts the type from int to category (so # # numerical operations aren't possible) # loc_df.loc[:, f"{loc_type}_{alg}_clusters_{r}_m"] = pd.Categorical( diff --git a/TRB_label_assist/clustering_examples.ipynb b/TRB_label_assist/clustering_examples.ipynb index 4eb8a67..998abab 100644 --- a/TRB_label_assist/clustering_examples.ipynb +++ b/TRB_label_assist/clustering_examples.ipynb @@ -26,12 +26,11 @@ "%autoreload 2\n", "\n", "from uuid import UUID\n", - "\n", "import emission.storage.timeseries.abstract_timeseries as esta\n", "import emission.storage.decorations.trip_queries as esdtq\n", "import emission.core.get_database as edb\n", - "\n", - "import mapping" + "import emission.analysis.modelling.trip_model.run_model as eamtr\n", + "import mapping\n" ] }, { @@ -60,9 +59,11 @@ "confirmed_trip_df_map = {}\n", "labeled_trip_df_map = {}\n", "expanded_trip_df_map = {}\n", + "ct_entry={}\n", "for u in uuids:\n", " ts = esta.TimeSeries.get_time_series(u)\n", - " ct_df = ts.get_data_df(\"analysis/confirmed_trip\")\n", + " ct_entry[u]=eamtr._get_training_data(u,None) \n", + " ct_df = ts.to_data_df(\"analysis/confirmed_trip\",ct_entry[u]) \n", " confirmed_trip_df_map[u] = ct_df\n", " labeled_trip_df_map[u] = esdtq.filter_labeled_trips(ct_df)\n", " expanded_trip_df_map[u] = esdtq.expand_userinputs(labeled_trip_df_map[u])" @@ -83,8 +84,10 @@ "outputs": [], "source": [ "fig = mapping.find_plot_clusters(expanded_trip_df_map[suburban_uuid],\n", + " 
ct_entry[suburban_uuid],\n", " alg='naive',\n", " loc_type='end',\n", + " clustering_way=\"destination\",\n", " plot_unlabeled=False,\n", " cluster_unlabeled=False,\n", " radii=[50, 100, 150])\n", @@ -98,8 +101,10 @@ "outputs": [], "source": [ "fig = mapping.find_plot_clusters(expanded_trip_df_map[college_campus_uuid],\n", + " ct_entry[college_campus_uuid],\n", " alg='naive',\n", " loc_type='end',\n", + " clustering_way=\"destination\",\n", " plot_unlabeled=False,\n", " cluster_unlabeled=False,\n", " radii=[50, 100, 150])\n", @@ -121,9 +126,11 @@ "outputs": [], "source": [ "fig = mapping.find_plot_clusters(expanded_trip_df_map[suburban_uuid],\n", + " ct_entry[suburban_uuid],\n", " alg='DBSCAN',\n", " SVM=False,\n", " loc_type='end',\n", + " clustering_way=\"destination\",\n", " plot_unlabeled=False,\n", " cluster_unlabeled=False,\n", " radii=[50, 100, 150, 200])\n", @@ -137,9 +144,11 @@ "outputs": [], "source": [ "fig = mapping.find_plot_clusters(expanded_trip_df_map[college_campus_uuid],\n", + " ct_entry[college_campus_uuid],\n", " alg='DBSCAN',\n", " SVM=False,\n", " loc_type='end',\n", + " clustering_way=\"destination\",\n", " plot_unlabeled=False,\n", " cluster_unlabeled=False,\n", " radii=[50, 100, 150, 200])\n", @@ -161,9 +170,11 @@ "outputs": [], "source": [ "fig = mapping.find_plot_clusters(expanded_trip_df_map[suburban_uuid],\n", + " ct_entry[suburban_uuid],\n", " alg='DBSCAN',\n", " SVM=True,\n", " loc_type='end',\n", + " clustering_way=\"destination\",\n", " plot_unlabeled=False,\n", " cluster_unlabeled=False,\n", " radii=[50, 100, 150, 200])\n", @@ -177,9 +188,11 @@ "outputs": [], "source": [ "fig = mapping.find_plot_clusters(expanded_trip_df_map[college_campus_uuid],\n", + " ct_entry[college_campus_uuid],\n", " alg='DBSCAN',\n", " SVM=True,\n", " loc_type='end',\n", + " clustering_way=\"destination\",\n", " plot_unlabeled=False,\n", " cluster_unlabeled=False,\n", " radii=[50, 100, 150, 200])\n", diff --git 
a/TRB_label_assist/generate_figs_for_poster.ipynb b/TRB_label_assist/generate_figs_for_poster.ipynb index f89ec7c..bc508fa 100644 --- a/TRB_label_assist/generate_figs_for_poster.ipynb +++ b/TRB_label_assist/generate_figs_for_poster.ipynb @@ -29,7 +29,6 @@ "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import matplotlib\n", - "\n", "from sklearn.pipeline import make_pipeline\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn import svm\n", @@ -37,7 +36,7 @@ "import emission.storage.timeseries.abstract_timeseries as esta\n", "import emission.storage.decorations.trip_queries as esdtq\n", "import emission.core.get_database as edb\n", - "\n", + "import emission.analysis.modelling.trip_model.run_model as eamtr\n", "import mapping\n", "import data_wrangling\n", "from clustering import add_loc_clusters" @@ -67,9 +66,11 @@ "confirmed_trip_df_map = {}\n", "labeled_trip_df_map = {}\n", "expanded_trip_df_map = {}\n", + "ct_entry={}\n", "for u in uuids:\n", " ts = esta.TimeSeries.get_time_series(u)\n", - " ct_df = ts.get_data_df(\"analysis/confirmed_trip\")\n", + " ct_entry[u]=eamtr._get_training_data(u,None) \n", + " ct_df = ts.to_data_df(\"analysis/confirmed_trip\",ct_entry[u]) \n", " confirmed_trip_df_map[u] = ct_df\n", " labeled_trip_df_map[u] = esdtq.filter_labeled_trips(ct_df)\n", " expanded_trip_df_map[u] = esdtq.expand_userinputs(labeled_trip_df_map[u])" @@ -98,8 +99,10 @@ "outputs": [], "source": [ "fig = mapping.find_plot_clusters(expanded_trip_df_map[user1_uuid],\n", + " ct_entry[user1_uuid],\n", " alg='naive',\n", " loc_type='end',\n", + " clustering_way='destination',\n", " plot_unlabeled=False,\n", " cluster_unlabeled=False,\n", " radii=[50, 100, 150])\n", @@ -137,9 +140,11 @@ "outputs": [], "source": [ "fig = mapping.find_plot_clusters(expanded_trip_df_map[user2_uuid],\n", + " ct_entry[user2_uuid],\n", " alg='DBSCAN',\n", " SVM=False,\n", " loc_type='end',\n", + " clustering_way='destination',\n", " plot_unlabeled=False,\n", " 
cluster_unlabeled=False,\n", " radii=[150])\n", @@ -161,9 +166,11 @@ "outputs": [], "source": [ "fig = mapping.find_plot_clusters(expanded_trip_df_map[user2_uuid],\n", + " ct_entry[user2_uuid],\n", " alg='DBSCAN',\n", " SVM=True,\n", " loc_type='end',\n", + " clustering_way='destination',\n", " plot_unlabeled=False,\n", " cluster_unlabeled=False,\n", " radii=[150])\n", @@ -289,8 +296,14 @@ "\n", " labeled_trips_df = all_trips_df.loc[all_trips_df.user_input != {}]\n", " df_for_cluster = all_trips_df if cluster_unlabeled else labeled_trips_df\n", - "\n", + " if loc_type=='start':\n", + " clustering_way='origin'\n", + " else:\n", + " clustering_way='destination'\n", + " \n", " df_for_cluster = add_loc_clusters(df_for_cluster,\n", + " ct_entry,\n", + " clustering_way=clustering_way,\n", " radii=radii,\n", " alg=alg,\n", " loc_type=loc_type,\n", diff --git a/TRB_label_assist/get_performance_for_poster.ipynb b/TRB_label_assist/get_performance_for_poster.ipynb index cfacc5e..063a6e6 100644 --- a/TRB_label_assist/get_performance_for_poster.ipynb +++ b/TRB_label_assist/get_performance_for_poster.ipynb @@ -25,6 +25,7 @@ "\n", "import emission.storage.timeseries.abstract_timeseries as esta\n", "import emission.storage.decorations.trip_queries as esdtq\n", + "import emission.analysis.modelling.trip_model.run_model as eamtr\n", "\n", "from performance_eval import get_clf_metrics, cv_for_all_algs, PREDICTORS" ] @@ -48,10 +49,11 @@ "labeled_trip_df_map = {}\n", "expanded_labeled_trip_df_map = {}\n", "expanded_all_trip_df_map = {}\n", + "ct_entry={}\n", "for u in all_users:\n", " ts = esta.TimeSeries.get_time_series(u)\n", - " ct_df = ts.get_data_df(\"analysis/confirmed_trip\")\n", - "\n", + " ct_entry[u]=eamtr._get_training_data(u,None) \n", + " ct_df = ts.to_data_df(\"analysis/confirmed_trip\",ct_entry[u]) \n", " confirmed_trip_df_map[u] = ct_df\n", " labeled_trip_df_map[u] = esdtq.filter_labeled_trips(ct_df)\n", " expanded_labeled_trip_df_map[u] = esdtq.expand_userinputs(\n", 
@@ -113,7 +115,7 @@ " 'random forests (O-D, destination clusters)',\n", " 'random forests (coordinates)'\n", "]\n", - "cv_results = cv_for_all_algs(\n", + "cv_results = cv_for_all_algs(ct_entry,\n", " uuid_list=all_users,\n", " expanded_trip_df_map=expanded_labeled_trip_df_map,\n", " model_names=model_names,\n", diff --git a/TRB_label_assist/mapping.py b/TRB_label_assist/mapping.py index 2ef54de..cd2d117 100644 --- a/TRB_label_assist/mapping.py +++ b/TRB_label_assist/mapping.py @@ -37,8 +37,10 @@ def find_plot_clusters(user_df, + user_entry, loc_type, alg, + clustering_way, SVM=False, radii=[50, 100, 150, 200], cluster_unlabeled=False, @@ -64,6 +66,8 @@ def find_plot_clusters(user_df, alg (str): the clustering algorithm to be used. must be one of the following: 'DBSCAN', 'naive', 'OPTICS', 'SVM', 'fuzzy' or 'mean_shift' + clustering_way(str): 'origin'or 'destination' or 'origin-destination'. + Decides the way we can cluster trips geospatially. SVM (bool): whether or not to sub-divide clusters with SVM radii (int list): list of radii to pass to the clustering alg cluster_unlabeled (bool): whether or not unlabeled points are used @@ -91,6 +95,7 @@ def find_plot_clusters(user_df, assert 'start_loc' in user_df.columns assert 'end_loc' in user_df.columns assert 'user_input' in user_df.columns + assert clustering_way in ['origin','destination','origin-destination'] assert alg in ALG_OPTIONS fig = bre.Figure(figsize=(20, 20)) @@ -116,6 +121,8 @@ def find_plot_clusters(user_df, df_for_cluster = add_loc_clusters( df_for_cluster, + user_entry, + clustering_way, radii=radii, alg=alg, SVM=SVM, diff --git a/TRB_label_assist/models.py b/TRB_label_assist/models.py index 6f02277..f3026b6 100644 --- a/TRB_label_assist/models.py +++ b/TRB_label_assist/models.py @@ -19,11 +19,16 @@ from clustering import get_distance_matrix, single_cluster_purity import data_wrangling import emission.storage.decorations.trip_queries as esdtq -import 
emission.analysis.modelling.tour_model_first_only.build_save_model as bsm -import emission.analysis.modelling.tour_model_first_only.evaluation_pipeline as ep from emission.analysis.classification.inference.labels.inferrers import predict_cluster_confidence_discounting import emission.core.wrapper.entry as ecwe -import emission.analysis.modelling.tour_model_extended.similarity as eamts +import emission.analysis.modelling.trip_model.greedy_similarity_binning as eamtg +import emission.core.common as ecc +import emission.analysis.modelling.trip_model.model_storage as eamums +import emission.analysis.modelling.trip_model.model_type as eamumt +import emission.analysis.modelling.trip_model.run_model as eamur + + +import clustering # NOTE: tour_model_extended.similarity is on the # eval-private-data-compatibility branch in e-mission-server @@ -116,12 +121,12 @@ class Cluster(SetupMixin, metaclass=ABCMeta): """ blueprint for clustering models. """ @abstractmethod - def fit(self, train_df): + def fit(self, train_df,train_entry_list): """ Fit the clustering algorithm. Args: train_df (DataFrame): dataframe of labeled trips - + train_entry_list (List) : A list of trips where each element is of Entry type Returns: self """ @@ -159,12 +164,13 @@ def fit_predict(self, train_df): class TripClassifier(SetupMixin, metaclass=ABCMeta): @abstractmethod - def fit(self, train_df): + def fit(self, train_df,unused=None): """ Fit a classification model. Args: train_df (DataFrame): dataframe of labeled trips - + unused (List) : A list of Entry type of labeled and unlabeled trips which is not used in current function. + Passed to keep fit function generic. 
Returns: self """ @@ -293,10 +299,10 @@ def set_params(self, params): return self - def fit(self, train_df): + def fit(self, unused,train_entry_list=None): # clean data - logging.info("PERF: Fitting RefactoredNaiveCluster with size %s" % len(train_df)) - self.train_df = self._clean_data(train_df) + logging.info("PERF: Fitting RefactoredNaiveCluster with size %s" % len(unused)) + self.train_df = self._clean_data(unused) # we can use all trips as long as they have purpose labels. it's ok if # they're missing mode/replaced-mode labels, because they aren't as @@ -315,17 +321,23 @@ def fit(self, train_df): if len(self.train_df) == 0: # i.e. no valid trips after removing all nans raise Exception('no valid trips; nothing to fit') - + + model_config = { + "metric": "od_similarity", + "similarity_threshold_meters": self.radius, # meters, + "apply_cutoff": False, + "clustering_way":'origin' if self.loc_type=='start' + else 'destination' if self.loc_type =='end' + else 'origin-destination', + "incremental_evaluation": False + } + # fit the bins - self.sim_model = eamts.Similarity(self.train_df, - radius_start=self.radius, - radius_end=self.radius, - shouldFilter=False, - cutoff=False) - # we only bin the loc_type points to speed up the alg. 
avoid - # unnecessary binning since this is really slow - self.sim_model.bin_helper(loc_type=self.loc_type) - labels = self.sim_model.data_df[self.loc_type + '_bin'].to_list() + self.sim_model= eamtg.GreedySimilarityBinning(model_config) + cleaned_trip_entry= clustering.cleanEntryTypeData(self.train_df,train_entry_list) + self.sim_model.fit(cleaned_trip_entry) + + labels = [int(l) for l in self.sim_model.tripLabels] self.train_df.loc[:, f'{self.loc_type}_cluster_idx'] = labels return self @@ -334,10 +346,32 @@ def predict(self, test_df): self.test_df = self._clean_data(test_df) if self.loc_type == 'start': - bins = self.sim_model.start_bins + bins = self.sim_model.bins elif self.loc_type == 'end': - bins = self.sim_model.end_bins - + bins = self.sim_model.bins + + # This looks weird but works + # >>> x = [(1, 'a'), (2, 'b'), (3, 'c')] + # >>> {int(key):value for key,value in x} + # {1: 'a', 2: 'b', 3: 'c'} + # + # bins = { '1': [ 'key1': [] , 'key2' :[],.. ....], + # '2': ['key1': [] , 'key2' :[],...], + # '3': ['key1': [] , 'key2' :[],.....] ...} + # + # the code below converts above to + # + # bins = { 1: [ 'key1': [] , 'key2' :[],.. ....], + # 2: ['key1': [] , 'key2' :[],...], + # 3: ['key1': [] , 'key2' :[],.....] ....} + # + # This is why it works : + # 1. Iterate over (key,value) pairs in 'bins.items()' + # 2. for each pair, 'key' is a string . so use int(key) to convert it into an integer. + # 3. 
Create a new dictionary(using {} within the dictionary comprehension) + # where the keys are now integers and the values are the same + + bins = {int(key):value for key,value in bins.items()} labels = [] # for each trip in the test list: @@ -346,10 +380,15 @@ def predict(self, test_df): logging.info("PERF: RefactoredNaiveCluster Working on trip %s/%s" % (idx, len(self.test_df))) # iterate over all bins trip_binned = False - for i, bin in enumerate(bins): + for i in bins: # check if the trip can fit in the bin - # if so, get the bin index - if self._match(row, bin, self.loc_type): + # if so, get the bin index. + # + # 'feature_rows' is the key that contains the list of lists where + # each of the inner lists takes the form : + # + # [ start_lon,start_lat,end_lon,end_lat] + if self._match(row, bins[i]['feature_rows'], self.loc_type): labels += [i] trip_binned = True break @@ -366,8 +405,7 @@ def _match(self, trip, bin, loc_type): copied from the Similarity class on the e-mission-server. """ - for t_idx in bin: - trip_in_bin = self.train_df.iloc[t_idx] + for trip_in_bin in bin: if not self._distance_helper(trip, trip_in_bin, loc_type): return False return True @@ -375,16 +413,20 @@ def _match(self, trip, bin, loc_type): def _distance_helper(self, tripa, tripb, loc_type): """ Check if two trips have start/end points within the distance threshold. - - copied from the Similarity class on the e-mission-server. """ + #tripa is taken from the test dataframe. + #tripb is taken from the stored bin list. 
pta_lat = tripa[[loc_type + '_lat']] pta_lon = tripa[[loc_type + '_lon']] - ptb_lat = tripb[[loc_type + '_lat']] - ptb_lon = tripb[[loc_type + '_lon']] + if loc_type == 'start': + ptb_lat = tripb[1] + ptb_lon = tripb[0] + elif loc_type == 'end': + ptb_lat = tripb[3] + ptb_lon = tripb[2] - return eamts.within_radius(pta_lat, pta_lon, ptb_lat, ptb_lon, - self.radius) + dist= ecc.calDistance([pta_lon,pta_lat],[ptb_lon,ptb_lat]) + return dist <= self.radius class DBSCANSVMCluster(Cluster): @@ -444,7 +486,7 @@ def set_params(self, params): return self - def fit(self, train_df): + def fit(self, train_df,unused=None): """ Creates clusters of trip points. self.train_df will be updated with columns containing base and final clusters. @@ -455,7 +497,8 @@ def fit(self, train_df): Args: train_df (dataframe): dataframe of labeled trips - """ + unused (List) : A list of Entry type of labeled and unlabeled trips which is not used in current function. + Passed to keep fit function generic. """ ################## ### clean data ### ################## @@ -648,7 +691,7 @@ def set_params(self, params): return self - def fit(self, train_df): + def fit(self, train_df,unused=None): logging.info("PERF: Fitting NaiveBinningClassifier") # (copied from bsm.build_user_model()) @@ -656,21 +699,29 @@ def fit(self, train_df): # only accepts lists of Entry objects train_trips = self._trip_df_to_list(train_df) - sim, bins, bin_trips, train_trips = ep.first_round( - train_trips, self.radius) - + + model_config = { + "metric": "od_similarity", + "similarity_threshold_meters": self.radius, # meters, + "apply_cutoff": False, + "clustering_way": "origin-destination", #cause thats what is set in performance_eval.py for this model + "incremental_evaluation": False + } + + sim_model = eamtg.GreedySimilarityBinning(model_config) + sim_model.fit(train_trips) # set instance variables so we can access results later as well - self.sim = sim - self.bins = bins + self.sim = sim_model + self.bins = sim_model.bins 
# save all user labels user_id = train_df.user_id.iloc[0] - bsm.save_models('user_labels', - bsm.create_user_input_map(train_trips, bins), user_id) + model_type=eamumt.ModelType.GREEDY_SIMILARITY_BINNING + model_storage=eamums.ModelStorage.DOCUMENT_DATABASE + model_data_next=sim_model.to_dict() + last_done_ts = eamur._latest_timestamp(train_trips) + eamums.save_model(user_id, model_type, model_data_next, last_done_ts, model_storage) - # save location features of all bins - bsm.save_models('locations', bsm.create_location_map(train_trips, bins), - user_id) return self def predict_proba(self, test_df): @@ -880,13 +931,13 @@ def set_params(self, params): return self - def fit(self, train_df): + def fit(self, train_df,train_entry_list=None): # fit clustering model - self.end_cluster_model.fit(train_df) + self.end_cluster_model.fit(train_df,train_entry_list) self.train_df = self.end_cluster_model.train_df if self.cluster_method in ['trip', 'combination']: - self.start_cluster_model.fit(train_df) + self.start_cluster_model.fit(train_df,train_entry_list) self.train_df.loc[:, ['start_cluster_idx' ]] = self.start_cluster_model.train_df[[ 'start_cluster_idx' @@ -1049,7 +1100,7 @@ class EnsembleClassifier(TripClassifier, metaclass=ABCMeta): replaced_predictor = NotImplemented # required methods - def fit(self, train_df): + def fit(self, train_df,unused=None): # get location features if self.loc_feature == 'cluster': # fit clustering model(s) and one-hot encode their indices diff --git a/TRB_label_assist/performance_eval.py b/TRB_label_assist/performance_eval.py index e63a576..aee5ad2 100644 --- a/TRB_label_assist/performance_eval.py +++ b/TRB_label_assist/performance_eval.py @@ -9,6 +9,7 @@ import os import time from datetime import datetime +import pathlib import sklearn.metrics as sm from sklearn.metrics.cluster import contingency_matrix @@ -18,8 +19,6 @@ import models from data_wrangling import expand_coords from clustering import add_loc_clusters, ALG_OPTIONS, 
purity_score -import emission.analysis.modelling.tour_model_first_only.get_users as gu -import emission.analysis.modelling.tour_model_first_only.data_preprocessing as pp # TODO: these may require further updating DEFAULT_MODES = [ @@ -120,6 +119,7 @@ def cross_val_predict(model, + ct_entry, model_params=None, user_df=None, k=5, @@ -171,8 +171,7 @@ def cross_val_predict(model, # train the model logging.info("About to fit the model %s" % model) - model_.fit(train_trips) - + model_.fit(train_trips,ct_entry) logging.info("About to generate predictions for the model %s" % model) # generate predictions pred_df = model_.predict(test_trips) @@ -216,6 +215,7 @@ def cross_val_predict(model, def cv_for_all_users(model, + ct_entry, uuid_list, expanded_trip_df_map=None, model_params=None, @@ -233,6 +233,7 @@ def cv_for_all_users(model, logging.info("------ START: predictions for user %s and model %s" % (user, model)) try: results = cross_val_predict(model, + ct_entry[user], model_params, user_df=expanded_trip_df_map[user], k=k, @@ -265,7 +266,8 @@ def cv_for_all_users(model, return cross_val_all -def cv_for_all_algs(uuid_list, +def cv_for_all_algs(ct_entry, + uuid_list, expanded_trip_df_map, model_names=list(PREDICTORS.keys()), override_prior_runs=True, @@ -274,6 +276,7 @@ def cv_for_all_algs(uuid_list, min_samples=False, raise_errors=False): cv_results = {} + pathlib.Path('first_trial_results').mkdir(parents=True,exist_ok=True) #needed first time for model_name in model_names: csv_path = f'first_trial_results/cv results {model_name}.csv' if not override_prior_runs and os.path.exists(csv_path): @@ -289,6 +292,7 @@ def cv_for_all_algs(uuid_list, start_time = datetime.now() model, model_params = PREDICTORS[model_name] cv_df = cv_for_all_users(model, + ct_entry, uuid_list=uuid_list, expanded_trip_df_map=expanded_trip_df_map, model_params=model_params, @@ -627,6 +631,8 @@ def get_cluster_metrics(trip_df): def run_eval_cluster_metrics(expanded_all_trip_df_map, + ct_entry, + 
clustering_way, user_list, radii, loc_type, @@ -730,6 +736,8 @@ def run_eval_cluster_metrics(expanded_all_trip_df_map, user_trips = add_loc_clusters( user_trips, + ct_entry, + clustering_way, radii=radii, alg=alg, SVM=SVM, diff --git a/TRB_label_assist/regenerate_classification_performance_results.py b/TRB_label_assist/regenerate_classification_performance_results.py index 2f524c3..b549cc1 100644 --- a/TRB_label_assist/regenerate_classification_performance_results.py +++ b/TRB_label_assist/regenerate_classification_performance_results.py @@ -1,9 +1,9 @@ import pandas as pd import numpy as np from uuid import UUID - import emission.storage.timeseries.abstract_timeseries as esta import emission.storage.decorations.trip_queries as esdtq +import emission.analysis.modelling.trip_model.run_model as eamtr from performance_eval import get_clf_metrics, cv_for_all_algs, PREDICTORS @@ -15,10 +15,11 @@ labeled_trip_df_map = {} expanded_labeled_trip_df_map = {} expanded_all_trip_df_map = {} +ct_entry={} for u in all_users: ts = esta.TimeSeries.get_time_series(u) - ct_df = ts.get_data_df("analysis/confirmed_trip") - + ct_entry[u]=eamtr._get_training_data(u,None) + ct_df = ts.to_data_df("analysis/confirmed_trip",ct_entry[u]) confirmed_trip_df_map[u] = ct_df labeled_trip_df_map[u] = esdtq.filter_labeled_trips(ct_df) expanded_labeled_trip_df_map[u] = esdtq.expand_userinputs( @@ -47,6 +48,7 @@ # load in all runs model_names = list(PREDICTORS.keys()) cv_results = cv_for_all_algs( + ct_entry, uuid_list=all_users, expanded_trip_df_map=expanded_labeled_trip_df_map, model_names=model_names,