From 8d2784702995e3728ba6157941e7415a3b20819b Mon Sep 17 00:00:00 2001 From: $aTyam Date: Sat, 25 Nov 2023 14:07:29 -0500 Subject: [PATCH] Update clustering.py (#37) * Update clustering.py Changes in clustering.py file to shift dependency from hlu09's tour_model_extended to main branch trip_model. Still need to change type of data being passed to fit function for this to work. * moving clustering_examples.ipynb to trip_model All dependencies of this notebook from custom branch are removed. There currently seems no errors while generating maps in clustering_examples notebook. * Removing changes in builtimeseries.py With these changes, no change in e-mission-server should be required. * Changes to support TRB_Label_Assist passing way of clustering to the e-mission-server. It was 'origin-destination' by default. Now can take one of three values, 'origin','destination' or 'origin-destination'. * suggestions previous suggestions to improve readability. * Revert "suggestions" This reverts commit 3e19b32cd090135b001709cb52da57e6c6a17c1f. * Improving readability Suggestions from previous comments to improve readability. * making `cluster_performance.ipynb`, `generate_figs_for_poster` and `SVM_decision_boundaries` compatible with changes in `clustering.py` and `mapping.py` files. Also porting these 3 notebooks to trip_model `cluster_performance.ipynb`, `generate_figs_for_poster` and `SVM_decision_boundaries` now have no dependence on the custom branch. Results of plots are attached to show no difference in theie previous and current outputs. * Unified Interface for fit function Unified Interface for fit function across all models. Passing 'Entry' Type data from the notebooks till the Binning functions. Default set to 'none'. * Fixing `models.py` to support `regenerate_classification_performance_results.py` Prior to this update, `NaiveBinningClassifier` in 'models.py' had dependencies on both of tour model and trip model. 
Now, this classifier is completely dependent on trip model. All the other notebooks (except `classification_performance.ipynb`) were tested as well and they are working as usual. Other minor fixes to support previous changes. * [PARTIALLY TESTED] Single database read and Code Cleanup 1. Removed mentions of `tour_model` or `tour_model_first_only`. 2. Removed two reads from database. 3. Removed notebook outputs (this could be the reason a few diffs are too big to view) * Delete TRB_label_assist/first_trial_results/cv results DBSCAN+SVM (destination).csv not required. * Reverting Notebook Reverting notebooks to initial state, since running on the browser messed up the cell index numbers. This was causing unnecessary git diffs even when no changes were made. Running on VS Code should resolve this. Will do the subsequent changes on VS Code and commit again. * [Partially Tested] Handled Whitespaces Whitespaces corrected. * [Partially Tested] Suggested changes implemented `Classification_performance` and `regenerate_classification_performance_results.py` are not tested yet as they would take too long to run. The itertools removal in these two files is tested in other notebooks and it works. Other files, like models.py will be tested once any of the above two are run. * Revert "[Partially Tested] Suggested changes implemented" This reverts commit bb404e989b2826f159e88fa828537b24785508e3. * [Partially Tested] Suggested changes implemented [Partially Tested] Suggested changes implemented bb404e9 `Classification_performance` and `regenerate_classification_performance_results.py` are not tested yet as they would take too long to run. The itertools removal in these two files is tested in other notebooks and it works. Other files, like models.py will be tested once any of the above two are run. * Minor variable fixes Fixed names of variables to be more self-explanatory * [TESTED] All the notebooks and files are tested 1. Change in models file a.t. 
changes in greedy_similarity_binning in e-mission-server 2.Minor fixes * Minor Fixes Minor Fixes to improve readability. * Minor Fixes in models.py Improved readability --- .../SVM_decision_boundaries.ipynb | 7 +- .../classification_performance.ipynb | 9 +- TRB_label_assist/cluster_performance.ipynb | 12 +- TRB_label_assist/clustering.py | 47 ++++-- TRB_label_assist/clustering_examples.ipynb | 21 ++- .../generate_figs_for_poster.ipynb | 21 ++- .../get_performance_for_poster.ipynb | 8 +- TRB_label_assist/mapping.py | 7 + TRB_label_assist/models.py | 153 ++++++++++++------ TRB_label_assist/performance_eval.py | 18 ++- ...rate_classification_performance_results.py | 8 +- 11 files changed, 222 insertions(+), 89 deletions(-) diff --git a/TRB_label_assist/SVM_decision_boundaries.ipynb b/TRB_label_assist/SVM_decision_boundaries.ipynb index 5ed5376..407aee3 100644 --- a/TRB_label_assist/SVM_decision_boundaries.ipynb +++ b/TRB_label_assist/SVM_decision_boundaries.ipynb @@ -30,6 +30,7 @@ "import emission.storage.timeseries.abstract_timeseries as esta\n", "import emission.storage.decorations.trip_queries as esdtq\n", "import emission.core.get_database as edb\n", + "import emission.analysis.modelling.trip_model.run_model as eamtr\n", "\n", "import data_wrangling\n", "from clustering import add_loc_clusters" @@ -60,10 +61,12 @@ "uuids = [suburban_uuid, college_campus_uuid]\n", "confirmed_trip_df_map = {}\n", "labeled_trip_df_map = {}\n", + "ct_entry={}\n", "expanded_trip_df_map = {}\n", "for u in uuids:\n", " ts = esta.TimeSeries.get_time_series(u)\n", - " ct_df = ts.get_data_df(\"analysis/confirmed_trip\")\n", + " ct_entry[u]=eamtr._get_training_data(u,None)\n", + " ct_df = ts.to_data_df(\"analysis/confirmed_trip\",ct_entry[u])\n", " confirmed_trip_df_map[u] = ct_df\n", " labeled_trip_df_map[u] = esdtq.filter_labeled_trips(ct_df)\n", " expanded_trip_df_map[u] = esdtq.expand_userinputs(labeled_trip_df_map[u])" @@ -110,6 +113,8 @@ " df_for_cluster = all_trips_df if 
cluster_unlabeled else labeled_trips_df\n", "\n", " df_for_cluster = add_loc_clusters(df_for_cluster,\n", + " ct_entry,\n", + " clustering_way='destination',\n", " radii=radii,\n", " alg=alg,\n", " loc_type=loc_type,\n", diff --git a/TRB_label_assist/classification_performance.ipynb b/TRB_label_assist/classification_performance.ipynb index 60d1300..6e61d79 100644 --- a/TRB_label_assist/classification_performance.ipynb +++ b/TRB_label_assist/classification_performance.ipynb @@ -19,7 +19,6 @@ "import pandas as pd\n", "import numpy as np\n", "from uuid import UUID\n", - "\n", "import matplotlib.pyplot as plt\n", "\n", "# import logging\n", @@ -27,7 +26,7 @@ "\n", "import emission.storage.timeseries.abstract_timeseries as esta\n", "import emission.storage.decorations.trip_queries as esdtq\n", - "\n", + "import emission.analysis.modelling.trip_model.run_model as eamtr\n", "from performance_eval import get_clf_metrics, cv_for_all_algs, PREDICTORS" ] }, @@ -49,10 +48,11 @@ "labeled_trip_df_map = {}\n", "expanded_labeled_trip_df_map = {}\n", "expanded_all_trip_df_map = {}\n", + "ct_entry={}\n", "for u in all_users:\n", " ts = esta.TimeSeries.get_time_series(u)\n", - " ct_df = ts.get_data_df(\"analysis/confirmed_trip\")\n", - "\n", + " ct_entry[u]=eamtr._get_training_data(u,None)\n", + " ct_df = ts.to_data_df(\"analysis/confirmed_trip\",ct_entry[u])\n", " confirmed_trip_df_map[u] = ct_df\n", " labeled_trip_df_map[u] = esdtq.filter_labeled_trips(ct_df)\n", " expanded_labeled_trip_df_map[u] = esdtq.expand_userinputs(\n", @@ -132,6 +132,7 @@ "# load in all runs\n", "model_names = list(PREDICTORS.keys())\n", "cv_results = cv_for_all_algs(\n", + " ct_entry,\n", " uuid_list=all_users,\n", " expanded_trip_df_map=expanded_labeled_trip_df_map,\n", " model_names=model_names,\n", diff --git a/TRB_label_assist/cluster_performance.ipynb b/TRB_label_assist/cluster_performance.ipynb index b6eed6d..81c088d 100644 --- a/TRB_label_assist/cluster_performance.ipynb +++ 
b/TRB_label_assist/cluster_performance.ipynb @@ -15,11 +15,10 @@ "source": [ "%load_ext autoreload\n", "%autoreload 2\n", - "\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "from matplotlib.gridspec import GridSpec\n", - "\n", + "import emission.analysis.modelling.trip_model.run_model as eamtr\n", "import emission.storage.timeseries.abstract_timeseries as esta\n", "import emission.storage.decorations.trip_queries as esdtq\n", "import performance_eval\n", @@ -45,10 +44,11 @@ "labeled_trip_df_map = {}\n", "expanded_labeled_trip_df_map = {}\n", "expanded_all_trip_df_map = {}\n", + "ct_entry={}\n", "for u in all_users:\n", " ts = esta.TimeSeries.get_time_series(u)\n", - " ct_df = ts.get_data_df(\"analysis/confirmed_trip\")\n", - "\n", + " ct_entry[u]=eamtr._get_training_data(u,None) \n", + " ct_df = ts.to_data_df(\"analysis/confirmed_trip\",ct_entry[u]) \n", " confirmed_trip_df_map[u] = ct_df\n", " labeled_trip_df_map[u] = esdtq.filter_labeled_trips(ct_df)\n", " expanded_labeled_trip_df_map[u] = esdtq.expand_userinputs(\n", @@ -87,6 +87,8 @@ "\n", " all_results_df = performance_eval.run_eval_cluster_metrics(\n", " expanded_labeled_trip_df_map,\n", + " ct_entry,\n", + " clustering_way='destination',\n", " user_list=all_users,\n", " radii=radii,\n", " loc_type='end',\n", @@ -265,6 +267,8 @@ "\n", "SVM_results_df = performance_eval.run_eval_cluster_metrics(\n", " expanded_labeled_trip_df_map,\n", + " ct_entry,\n", + " clustering_way=\"destination\",\n", " user_list=all_users,\n", " radii=radii,\n", " loc_type='end',\n", diff --git a/TRB_label_assist/clustering.py b/TRB_label_assist/clustering.py index fbe8a3b..d3924f3 100644 --- a/TRB_label_assist/clustering.py +++ b/TRB_label_assist/clustering.py @@ -16,8 +16,8 @@ # our imports # NOTE: this requires changing the branch of e-mission-server to # eval-private-data-compatibility -import emission.analysis.modelling.tour_model_extended.similarity as eamts import emission.storage.decorations.trip_queries as 
esdtq +import emission.analysis.modelling.trip_model.greedy_similarity_binning as eamtg EARTH_RADIUS = 6371000 ALG_OPTIONS = [ @@ -28,9 +28,27 @@ 'mean_shift' ] +def cleanEntryTypeData(loc_df,trip_entry): + + """ + Helps weed out entries from the list of entries which were removed from the df using + esdtq.filter_labeled_trips() and esdtq.expand_userinputs() + + loc_df : dataframe amde from entry type data + trip_entry : the entry type equivalent of loc_df , + which was passed alongside the dataframe while loading the data + + """ + + ids_in_df=loc_df['_id'] + filtered_trip_entry = list(filter(lambda entry: entry['_id'] in ids_in_df.values, trip_entry)) + return filtered_trip_entry + def add_loc_clusters( loc_df, + trip_entry, + clustering_way, radii, loc_type, alg, @@ -53,6 +71,9 @@ def add_loc_clusters( Args: loc_df (dataframe): must have columns 'start_lat' and 'start_lon' or 'end_lat' and 'end_lon' + trip_entry ( list of Entry/confirmedTrip): list consisting all entries from the + time data was loaded. loc_df was obtained from this by converting to df and + then filtering out labeled trips and expanding user_inputs radii (int list): list of radii to run the clustering algs with loc_type (str): 'start' or 'end' alg (str): 'DBSCAN', 'naive', 'OPTICS', 'SVM', 'fuzzy', or @@ -98,19 +119,25 @@ def add_loc_clusters( loc_df.loc[:, f"{loc_type}_DBSCAN_clusters_{r}_m"] = labels elif alg == 'naive': + + cleaned_trip_entry= cleanEntryTypeData(loc_df,trip_entry) + for r in radii: # this is using a modified Similarity class that bins start/end # points separately before creating trip-level bins - sim_model = eamts.Similarity(loc_df, - radius_start=r, - radius_end=r, - shouldFilter=False, - cutoff=False) - # we only bin the loc_type points to speed up the alg. 
avoid - # unnecessary binning since this is really slow - sim_model.bin_helper(loc_type=loc_type) - labels = sim_model.data_df[loc_type + '_bin'].to_list() + model_config = { + "metric": "od_similarity", + "similarity_threshold_meters": r, # meters, + "apply_cutoff": False, + "clustering_way": clustering_way, + "shouldFilter":False, + "incremental_evaluation": False + } + + sim_model = eamtg.GreedySimilarityBinning(model_config) + sim_model.fit(cleaned_trip_entry) + labels = [int(l) for l in sim_model.tripLabels] # # pd.Categorical converts the type from int to category (so # # numerical operations aren't possible) # loc_df.loc[:, f"{loc_type}_{alg}_clusters_{r}_m"] = pd.Categorical( diff --git a/TRB_label_assist/clustering_examples.ipynb b/TRB_label_assist/clustering_examples.ipynb index 4eb8a67..998abab 100644 --- a/TRB_label_assist/clustering_examples.ipynb +++ b/TRB_label_assist/clustering_examples.ipynb @@ -26,12 +26,11 @@ "%autoreload 2\n", "\n", "from uuid import UUID\n", - "\n", "import emission.storage.timeseries.abstract_timeseries as esta\n", "import emission.storage.decorations.trip_queries as esdtq\n", "import emission.core.get_database as edb\n", - "\n", - "import mapping" + "import emission.analysis.modelling.trip_model.run_model as eamtr\n", + "import mapping\n" ] }, { @@ -60,9 +59,11 @@ "confirmed_trip_df_map = {}\n", "labeled_trip_df_map = {}\n", "expanded_trip_df_map = {}\n", + "ct_entry={}\n", "for u in uuids:\n", " ts = esta.TimeSeries.get_time_series(u)\n", - " ct_df = ts.get_data_df(\"analysis/confirmed_trip\")\n", + " ct_entry[u]=eamtr._get_training_data(u,None) \n", + " ct_df = ts.to_data_df(\"analysis/confirmed_trip\",ct_entry[u]) \n", " confirmed_trip_df_map[u] = ct_df\n", " labeled_trip_df_map[u] = esdtq.filter_labeled_trips(ct_df)\n", " expanded_trip_df_map[u] = esdtq.expand_userinputs(labeled_trip_df_map[u])" @@ -83,8 +84,10 @@ "outputs": [], "source": [ "fig = mapping.find_plot_clusters(expanded_trip_df_map[suburban_uuid],\n", + " 
ct_entry[suburban_uuid],\n", " alg='naive',\n", " loc_type='end',\n", + " clustering_way=\"destination\",\n", " plot_unlabeled=False,\n", " cluster_unlabeled=False,\n", " radii=[50, 100, 150])\n", @@ -98,8 +101,10 @@ "outputs": [], "source": [ "fig = mapping.find_plot_clusters(expanded_trip_df_map[college_campus_uuid],\n", + " ct_entry[college_campus_uuid],\n", " alg='naive',\n", " loc_type='end',\n", + " clustering_way=\"destination\",\n", " plot_unlabeled=False,\n", " cluster_unlabeled=False,\n", " radii=[50, 100, 150])\n", @@ -121,9 +126,11 @@ "outputs": [], "source": [ "fig = mapping.find_plot_clusters(expanded_trip_df_map[suburban_uuid],\n", + " ct_entry[suburban_uuid],\n", " alg='DBSCAN',\n", " SVM=False,\n", " loc_type='end',\n", + " clustering_way=\"destination\",\n", " plot_unlabeled=False,\n", " cluster_unlabeled=False,\n", " radii=[50, 100, 150, 200])\n", @@ -137,9 +144,11 @@ "outputs": [], "source": [ "fig = mapping.find_plot_clusters(expanded_trip_df_map[college_campus_uuid],\n", + " ct_entry[college_campus_uuid],\n", " alg='DBSCAN',\n", " SVM=False,\n", " loc_type='end',\n", + " clustering_way=\"destination\",\n", " plot_unlabeled=False,\n", " cluster_unlabeled=False,\n", " radii=[50, 100, 150, 200])\n", @@ -161,9 +170,11 @@ "outputs": [], "source": [ "fig = mapping.find_plot_clusters(expanded_trip_df_map[suburban_uuid],\n", + " ct_entry[suburban_uuid],\n", " alg='DBSCAN',\n", " SVM=True,\n", " loc_type='end',\n", + " clustering_way=\"destination\",\n", " plot_unlabeled=False,\n", " cluster_unlabeled=False,\n", " radii=[50, 100, 150, 200])\n", @@ -177,9 +188,11 @@ "outputs": [], "source": [ "fig = mapping.find_plot_clusters(expanded_trip_df_map[college_campus_uuid],\n", + " ct_entry[college_campus_uuid],\n", " alg='DBSCAN',\n", " SVM=True,\n", " loc_type='end',\n", + " clustering_way=\"destination\",\n", " plot_unlabeled=False,\n", " cluster_unlabeled=False,\n", " radii=[50, 100, 150, 200])\n", diff --git 
a/TRB_label_assist/generate_figs_for_poster.ipynb b/TRB_label_assist/generate_figs_for_poster.ipynb index f89ec7c..bc508fa 100644 --- a/TRB_label_assist/generate_figs_for_poster.ipynb +++ b/TRB_label_assist/generate_figs_for_poster.ipynb @@ -29,7 +29,6 @@ "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import matplotlib\n", - "\n", "from sklearn.pipeline import make_pipeline\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn import svm\n", @@ -37,7 +36,7 @@ "import emission.storage.timeseries.abstract_timeseries as esta\n", "import emission.storage.decorations.trip_queries as esdtq\n", "import emission.core.get_database as edb\n", - "\n", + "import emission.analysis.modelling.trip_model.run_model as eamtr\n", "import mapping\n", "import data_wrangling\n", "from clustering import add_loc_clusters" @@ -67,9 +66,11 @@ "confirmed_trip_df_map = {}\n", "labeled_trip_df_map = {}\n", "expanded_trip_df_map = {}\n", + "ct_entry={}\n", "for u in uuids:\n", " ts = esta.TimeSeries.get_time_series(u)\n", - " ct_df = ts.get_data_df(\"analysis/confirmed_trip\")\n", + " ct_entry[u]=eamtr._get_training_data(u,None) \n", + " ct_df = ts.to_data_df(\"analysis/confirmed_trip\",ct_entry[u]) \n", " confirmed_trip_df_map[u] = ct_df\n", " labeled_trip_df_map[u] = esdtq.filter_labeled_trips(ct_df)\n", " expanded_trip_df_map[u] = esdtq.expand_userinputs(labeled_trip_df_map[u])" @@ -98,8 +99,10 @@ "outputs": [], "source": [ "fig = mapping.find_plot_clusters(expanded_trip_df_map[user1_uuid],\n", + " ct_entry[user1_uuid],\n", " alg='naive',\n", " loc_type='end',\n", + " clustering_way='destination',\n", " plot_unlabeled=False,\n", " cluster_unlabeled=False,\n", " radii=[50, 100, 150])\n", @@ -137,9 +140,11 @@ "outputs": [], "source": [ "fig = mapping.find_plot_clusters(expanded_trip_df_map[user2_uuid],\n", + " ct_entry[user2_uuid],\n", " alg='DBSCAN',\n", " SVM=False,\n", " loc_type='end',\n", + " clustering_way='destination',\n", " plot_unlabeled=False,\n", " 
cluster_unlabeled=False,\n", " radii=[150])\n", @@ -161,9 +166,11 @@ "outputs": [], "source": [ "fig = mapping.find_plot_clusters(expanded_trip_df_map[user2_uuid],\n", + " ct_entry[user2_uuid],\n", " alg='DBSCAN',\n", " SVM=True,\n", " loc_type='end',\n", + " clustering_way='destination',\n", " plot_unlabeled=False,\n", " cluster_unlabeled=False,\n", " radii=[150])\n", @@ -289,8 +296,14 @@ "\n", " labeled_trips_df = all_trips_df.loc[all_trips_df.user_input != {}]\n", " df_for_cluster = all_trips_df if cluster_unlabeled else labeled_trips_df\n", - "\n", + " if loc_type=='start':\n", + " clustering_way='origin'\n", + " else:\n", + " clustering_way='destination'\n", + " \n", " df_for_cluster = add_loc_clusters(df_for_cluster,\n", + " ct_entry,\n", + " clustering_way=clustering_way,\n", " radii=radii,\n", " alg=alg,\n", " loc_type=loc_type,\n", diff --git a/TRB_label_assist/get_performance_for_poster.ipynb b/TRB_label_assist/get_performance_for_poster.ipynb index cfacc5e..063a6e6 100644 --- a/TRB_label_assist/get_performance_for_poster.ipynb +++ b/TRB_label_assist/get_performance_for_poster.ipynb @@ -25,6 +25,7 @@ "\n", "import emission.storage.timeseries.abstract_timeseries as esta\n", "import emission.storage.decorations.trip_queries as esdtq\n", + "import emission.analysis.modelling.trip_model.run_model as eamtr\n", "\n", "from performance_eval import get_clf_metrics, cv_for_all_algs, PREDICTORS" ] @@ -48,10 +49,11 @@ "labeled_trip_df_map = {}\n", "expanded_labeled_trip_df_map = {}\n", "expanded_all_trip_df_map = {}\n", + "ct_entry={}\n", "for u in all_users:\n", " ts = esta.TimeSeries.get_time_series(u)\n", - " ct_df = ts.get_data_df(\"analysis/confirmed_trip\")\n", - "\n", + " ct_entry[u]=eamtr._get_training_data(u,None) \n", + " ct_df = ts.to_data_df(\"analysis/confirmed_trip\",ct_entry[u]) \n", " confirmed_trip_df_map[u] = ct_df\n", " labeled_trip_df_map[u] = esdtq.filter_labeled_trips(ct_df)\n", " expanded_labeled_trip_df_map[u] = esdtq.expand_userinputs(\n", 
@@ -113,7 +115,7 @@ " 'random forests (O-D, destination clusters)',\n", " 'random forests (coordinates)'\n", "]\n", - "cv_results = cv_for_all_algs(\n", + "cv_results = cv_for_all_algs(ct_entry,\n", " uuid_list=all_users,\n", " expanded_trip_df_map=expanded_labeled_trip_df_map,\n", " model_names=model_names,\n", diff --git a/TRB_label_assist/mapping.py b/TRB_label_assist/mapping.py index 2ef54de..cd2d117 100644 --- a/TRB_label_assist/mapping.py +++ b/TRB_label_assist/mapping.py @@ -37,8 +37,10 @@ def find_plot_clusters(user_df, + user_entry, loc_type, alg, + clustering_way, SVM=False, radii=[50, 100, 150, 200], cluster_unlabeled=False, @@ -64,6 +66,8 @@ def find_plot_clusters(user_df, alg (str): the clustering algorithm to be used. must be one of the following: 'DBSCAN', 'naive', 'OPTICS', 'SVM', 'fuzzy' or 'mean_shift' + clustering_way(str): 'origin'or 'destination' or 'origin-destination'. + Decides the way we can cluster trips geospatially. SVM (bool): whether or not to sub-divide clusters with SVM radii (int list): list of radii to pass to the clustering alg cluster_unlabeled (bool): whether or not unlabeled points are used @@ -91,6 +95,7 @@ def find_plot_clusters(user_df, assert 'start_loc' in user_df.columns assert 'end_loc' in user_df.columns assert 'user_input' in user_df.columns + assert clustering_way in ['origin','destination','origin-destination'] assert alg in ALG_OPTIONS fig = bre.Figure(figsize=(20, 20)) @@ -116,6 +121,8 @@ def find_plot_clusters(user_df, df_for_cluster = add_loc_clusters( df_for_cluster, + user_entry, + clustering_way, radii=radii, alg=alg, SVM=SVM, diff --git a/TRB_label_assist/models.py b/TRB_label_assist/models.py index 6f02277..f3026b6 100644 --- a/TRB_label_assist/models.py +++ b/TRB_label_assist/models.py @@ -19,11 +19,16 @@ from clustering import get_distance_matrix, single_cluster_purity import data_wrangling import emission.storage.decorations.trip_queries as esdtq -import 
emission.analysis.modelling.tour_model_first_only.build_save_model as bsm -import emission.analysis.modelling.tour_model_first_only.evaluation_pipeline as ep from emission.analysis.classification.inference.labels.inferrers import predict_cluster_confidence_discounting import emission.core.wrapper.entry as ecwe -import emission.analysis.modelling.tour_model_extended.similarity as eamts +import emission.analysis.modelling.trip_model.greedy_similarity_binning as eamtg +import emission.core.common as ecc +import emission.analysis.modelling.trip_model.model_storage as eamums +import emission.analysis.modelling.trip_model.model_type as eamumt +import emission.analysis.modelling.trip_model.run_model as eamur + + +import clustering # NOTE: tour_model_extended.similarity is on the # eval-private-data-compatibility branch in e-mission-server @@ -116,12 +121,12 @@ class Cluster(SetupMixin, metaclass=ABCMeta): """ blueprint for clustering models. """ @abstractmethod - def fit(self, train_df): + def fit(self, train_df,train_entry_list): """ Fit the clustering algorithm. Args: train_df (DataFrame): dataframe of labeled trips - + train_entry_list (List) : A list of trips where each element is of Entry type Returns: self """ @@ -159,12 +164,13 @@ def fit_predict(self, train_df): class TripClassifier(SetupMixin, metaclass=ABCMeta): @abstractmethod - def fit(self, train_df): + def fit(self, train_df,unused=None): """ Fit a classification model. Args: train_df (DataFrame): dataframe of labeled trips - + unused (List) : A list of Entry type of labeled and unlabeled trips which is not used in current function. + Passed to keep fit function generic. 
Returns: self """ @@ -293,10 +299,10 @@ def set_params(self, params): return self - def fit(self, train_df): + def fit(self, unused,train_entry_list=None): # clean data - logging.info("PERF: Fitting RefactoredNaiveCluster with size %s" % len(train_df)) - self.train_df = self._clean_data(train_df) + logging.info("PERF: Fitting RefactoredNaiveCluster with size %s" % len(unused)) + self.train_df = self._clean_data(unused) # we can use all trips as long as they have purpose labels. it's ok if # they're missing mode/replaced-mode labels, because they aren't as @@ -315,17 +321,23 @@ def fit(self, train_df): if len(self.train_df) == 0: # i.e. no valid trips after removing all nans raise Exception('no valid trips; nothing to fit') - + + model_config = { + "metric": "od_similarity", + "similarity_threshold_meters": self.radius, # meters, + "apply_cutoff": False, + "clustering_way":'origin' if self.loc_type=='start' + else 'destination' if self.loc_type =='end' + else 'origin-destination', + "incremental_evaluation": False + } + # fit the bins - self.sim_model = eamts.Similarity(self.train_df, - radius_start=self.radius, - radius_end=self.radius, - shouldFilter=False, - cutoff=False) - # we only bin the loc_type points to speed up the alg. 
avoid - # unnecessary binning since this is really slow - self.sim_model.bin_helper(loc_type=self.loc_type) - labels = self.sim_model.data_df[self.loc_type + '_bin'].to_list() + self.sim_model= eamtg.GreedySimilarityBinning(model_config) + cleaned_trip_entry= clustering.cleanEntryTypeData(self.train_df,train_entry_list) + self.sim_model.fit(cleaned_trip_entry) + + labels = [int(l) for l in self.sim_model.tripLabels] self.train_df.loc[:, f'{self.loc_type}_cluster_idx'] = labels return self @@ -334,10 +346,32 @@ def predict(self, test_df): self.test_df = self._clean_data(test_df) if self.loc_type == 'start': - bins = self.sim_model.start_bins + bins = self.sim_model.bins elif self.loc_type == 'end': - bins = self.sim_model.end_bins - + bins = self.sim_model.bins + + # This looks weird but works + # >>> x = [(1, 'a'), (2, 'b'), (3, 'c')] + # >>> {int(key):value for key,value in x} + # {1: 'a', 2: 'b', 3: 'c'} + # + # bins = { '1': [ 'key1': [] , 'key2' :[],.. ....], + # '2': ['key1': [] , 'key2' :[],...], + # '3': ['key1': [] , 'key2' :[],.....] ...} + # + # the code below converts above to + # + # bins = { 1: [ 'key1': [] , 'key2' :[],.. ....], + # 2: ['key1': [] , 'key2' :[],...], + # 3: ['key1': [] , 'key2' :[],.....] ....} + # + # This is why it works : + # 1. Iterate over (key,value) pairs in 'bins.items()' + # 2. for each pair, 'key' is a string . so use int(key) to convert it into an integer. + # 3. 
Create a new dictionary(using {} within the dictionary comprehension) + # where the keys are now integers and the values are same + + bins = {int(key):value for key,value in bins.items()} labels = [] # for each trip in the test list: @@ -346,10 +380,15 @@ def predict(self, test_df): logging.info("PERF: RefactoredNaiveCluster Working on trip %s/%s" % (idx, len(self.test_df))) # iterate over all bins trip_binned = False - for i, bin in enumerate(bins): + for i in bins: # check if the trip can fit in the bin - # if so, get the bin index - if self._match(row, bin, self.loc_type): + # if so, get the bin index. + # + # 'feature_rows' is the key that contains the list of list where + # each of the inner list takes the form : + # + # [ start_lon,start_lat,end_lon,end_lat] + if self._match(row, bins[i]['feature_rows'], self.loc_type): labels += [i] trip_binned = True break @@ -366,8 +405,7 @@ def _match(self, trip, bin, loc_type): copied from the Similarity class on the e-mission-server. """ - for t_idx in bin: - trip_in_bin = self.train_df.iloc[t_idx] + for trip_in_bin in bin: if not self._distance_helper(trip, trip_in_bin, loc_type): return False return True @@ -375,16 +413,20 @@ def _match(self, trip, bin, loc_type): def _distance_helper(self, tripa, tripb, loc_type): """ Check if two trips have start/end points within the distance threshold. - - copied from the Similarity class on the e-mission-server. """ + #tripa is taken from the test datframe. + #tripb is taken from the stored bin list. 
pta_lat = tripa[[loc_type + '_lat']] pta_lon = tripa[[loc_type + '_lon']] - ptb_lat = tripb[[loc_type + '_lat']] - ptb_lon = tripb[[loc_type + '_lon']] + if loc_type == 'start': + ptb_lat = tripb[1] + ptb_lon = tripb[0] + elif loc_type == 'end': + ptb_lat = tripb[3] + ptb_lon = tripb[2] - return eamts.within_radius(pta_lat, pta_lon, ptb_lat, ptb_lon, - self.radius) + dist= ecc.calDistance([pta_lon,pta_lat],[ptb_lon,ptb_lat]) + return dist <= self.radius class DBSCANSVMCluster(Cluster): @@ -444,7 +486,7 @@ def set_params(self, params): return self - def fit(self, train_df): + def fit(self, train_df,unused=None): """ Creates clusters of trip points. self.train_df will be updated with columns containing base and final clusters. @@ -455,7 +497,8 @@ def fit(self, train_df): Args: train_df (dataframe): dataframe of labeled trips - """ + unused (List) : A list of Entry type of labeled and unlabeled trips which is not used in current function. + Passed to keep fit function generic. """ ################## ### clean data ### ################## @@ -648,7 +691,7 @@ def set_params(self, params): return self - def fit(self, train_df): + def fit(self, train_df,unused=None): logging.info("PERF: Fitting NaiveBinningClassifier") # (copied from bsm.build_user_model()) @@ -656,21 +699,29 @@ def fit(self, train_df): # only accepts lists of Entry objects train_trips = self._trip_df_to_list(train_df) - sim, bins, bin_trips, train_trips = ep.first_round( - train_trips, self.radius) - + + model_config = { + "metric": "od_similarity", + "similarity_threshold_meters": self.radius, # meters, + "apply_cutoff": False, + "clustering_way": "origin-destination", #cause thats what is set in performance_eval.py for this model + "incremental_evaluation": False + } + + sim_model = eamtg.GreedySimilarityBinning(model_config) + sim_model.fit(train_trips) # set instance variables so we can access results later as well - self.sim = sim - self.bins = bins + self.sim = sim_model + self.bins = sim_model.bins 
# save all user labels user_id = train_df.user_id.iloc[0] - bsm.save_models('user_labels', - bsm.create_user_input_map(train_trips, bins), user_id) + model_type=eamumt.ModelType.GREEDY_SIMILARITY_BINNING + model_storage=eamums.ModelStorage.DOCUMENT_DATABASE + model_data_next=sim_model.to_dict() + last_done_ts = eamur._latest_timestamp(train_trips) + eamums.save_model(user_id, model_type, model_data_next, last_done_ts, model_storage) - # save location features of all bins - bsm.save_models('locations', bsm.create_location_map(train_trips, bins), - user_id) return self def predict_proba(self, test_df): @@ -880,13 +931,13 @@ def set_params(self, params): return self - def fit(self, train_df): + def fit(self, train_df,train_entry_list=None): # fit clustering model - self.end_cluster_model.fit(train_df) + self.end_cluster_model.fit(train_df,train_entry_list) self.train_df = self.end_cluster_model.train_df if self.cluster_method in ['trip', 'combination']: - self.start_cluster_model.fit(train_df) + self.start_cluster_model.fit(train_df,train_entry_list) self.train_df.loc[:, ['start_cluster_idx' ]] = self.start_cluster_model.train_df[[ 'start_cluster_idx' @@ -1049,7 +1100,7 @@ class EnsembleClassifier(TripClassifier, metaclass=ABCMeta): replaced_predictor = NotImplemented # required methods - def fit(self, train_df): + def fit(self, train_df,unused=None): # get location features if self.loc_feature == 'cluster': # fit clustering model(s) and one-hot encode their indices diff --git a/TRB_label_assist/performance_eval.py b/TRB_label_assist/performance_eval.py index e63a576..aee5ad2 100644 --- a/TRB_label_assist/performance_eval.py +++ b/TRB_label_assist/performance_eval.py @@ -9,6 +9,7 @@ import os import time from datetime import datetime +import pathlib import sklearn.metrics as sm from sklearn.metrics.cluster import contingency_matrix @@ -18,8 +19,6 @@ import models from data_wrangling import expand_coords from clustering import add_loc_clusters, ALG_OPTIONS, 
purity_score -import emission.analysis.modelling.tour_model_first_only.get_users as gu -import emission.analysis.modelling.tour_model_first_only.data_preprocessing as pp # TODO: these may require further updating DEFAULT_MODES = [ @@ -120,6 +119,7 @@ def cross_val_predict(model, + ct_entry, model_params=None, user_df=None, k=5, @@ -171,8 +171,7 @@ def cross_val_predict(model, # train the model logging.info("About to fit the model %s" % model) - model_.fit(train_trips) - + model_.fit(train_trips,ct_entry) logging.info("About to generate predictions for the model %s" % model) # generate predictions pred_df = model_.predict(test_trips) @@ -216,6 +215,7 @@ def cross_val_predict(model, def cv_for_all_users(model, + ct_entry, uuid_list, expanded_trip_df_map=None, model_params=None, @@ -233,6 +233,7 @@ def cv_for_all_users(model, logging.info("------ START: predictions for user %s and model %s" % (user, model)) try: results = cross_val_predict(model, + ct_entry[user], model_params, user_df=expanded_trip_df_map[user], k=k, @@ -265,7 +266,8 @@ def cv_for_all_users(model, return cross_val_all -def cv_for_all_algs(uuid_list, +def cv_for_all_algs(ct_entry, + uuid_list, expanded_trip_df_map, model_names=list(PREDICTORS.keys()), override_prior_runs=True, @@ -274,6 +276,7 @@ def cv_for_all_algs(uuid_list, min_samples=False, raise_errors=False): cv_results = {} + pathlib.Path('first_trial_results').mkdir(parents=True,exist_ok=True) #needed first time for model_name in model_names: csv_path = f'first_trial_results/cv results {model_name}.csv' if not override_prior_runs and os.path.exists(csv_path): @@ -289,6 +292,7 @@ def cv_for_all_algs(uuid_list, start_time = datetime.now() model, model_params = PREDICTORS[model_name] cv_df = cv_for_all_users(model, + ct_entry, uuid_list=uuid_list, expanded_trip_df_map=expanded_trip_df_map, model_params=model_params, @@ -627,6 +631,8 @@ def get_cluster_metrics(trip_df): def run_eval_cluster_metrics(expanded_all_trip_df_map, + ct_entry, + 
clustering_way, user_list, radii, loc_type, @@ -730,6 +736,8 @@ def run_eval_cluster_metrics(expanded_all_trip_df_map, user_trips = add_loc_clusters( user_trips, + ct_entry, + clustering_way, radii=radii, alg=alg, SVM=SVM, diff --git a/TRB_label_assist/regenerate_classification_performance_results.py b/TRB_label_assist/regenerate_classification_performance_results.py index 2f524c3..b549cc1 100644 --- a/TRB_label_assist/regenerate_classification_performance_results.py +++ b/TRB_label_assist/regenerate_classification_performance_results.py @@ -1,9 +1,9 @@ import pandas as pd import numpy as np from uuid import UUID - import emission.storage.timeseries.abstract_timeseries as esta import emission.storage.decorations.trip_queries as esdtq +import emission.analysis.modelling.trip_model.run_model as eamtr from performance_eval import get_clf_metrics, cv_for_all_algs, PREDICTORS @@ -15,10 +15,11 @@ labeled_trip_df_map = {} expanded_labeled_trip_df_map = {} expanded_all_trip_df_map = {} +ct_entry={} for u in all_users: ts = esta.TimeSeries.get_time_series(u) - ct_df = ts.get_data_df("analysis/confirmed_trip") - + ct_entry[u]=eamtr._get_training_data(u,None) + ct_df = ts.to_data_df("analysis/confirmed_trip",ct_entry[u]) confirmed_trip_df_map[u] = ct_df labeled_trip_df_map[u] = esdtq.filter_labeled_trips(ct_df) expanded_labeled_trip_df_map[u] = esdtq.expand_userinputs( @@ -47,6 +48,7 @@ # load in all runs model_names = list(PREDICTORS.keys()) cv_results = cv_for_all_algs( + ct_entry, uuid_list=all_users, expanded_trip_df_map=expanded_labeled_trip_df_map, model_names=model_names,