Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update clustering.py #37

Merged
merged 21 commits into from
Nov 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
431b33d
Update clustering.py
humbleOldSage Aug 11, 2023
36065b4
moving clustering_examples.ipynb to trip_model
humbleOldSage Aug 16, 2023
97406c4
Removing changes in builtimeseries.py
humbleOldSage Aug 16, 2023
88988d3
Changes to support TRB_Label_Assist
humbleOldSage Aug 20, 2023
3e19b32
suggestions
humbleOldSage Aug 20, 2023
0899ee4
Revert "suggestions"
humbleOldSage Aug 20, 2023
667ab24
Improving readability
humbleOldSage Aug 20, 2023
e2448c5
making `cluster_performance.ipynb`, `generate_figs_for_poster` and `…
humbleOldSage Aug 22, 2023
59c7c64
Unified Interface for fit function
humbleOldSage Aug 26, 2023
a34836f
Fixing `models.py` to support `regenerate_classification_performance_…
humbleOldSage Aug 30, 2023
7eefdb0
[PARTIALLY TESTED] Single database read and Code Cleanup
humbleOldSage Sep 14, 2023
6a641db
Delete TRB_label_assist/first_trial_results/cv results DBSCAN+SVM (de…
humbleOldSage Sep 18, 2023
3a8bdc0
Reverting Notebook
humbleOldSage Sep 18, 2023
7606d3d
[Partially Tested]Handled Whitespaces
humbleOldSage Sep 19, 2023
bb404e9
[Partially Tested] Suggested changes implemented
humbleOldSage Nov 7, 2023
97475ef
Revert "[Partially Tested] Suggested changes implemented"
humbleOldSage Nov 7, 2023
452e454
[Partially Tested] Suggested changes implemented
humbleOldSage Nov 7, 2023
2a39b12
Minor variable fixes
humbleOldSage Nov 10, 2023
e0beb0e
[TESTED] All the notebooks and files are tested
humbleOldSage Nov 16, 2023
c8c3883
Minor Fixes
humbleOldSage Nov 22, 2023
9225572
Minor Fixes in models.py
humbleOldSage Nov 24, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion TRB_label_assist/SVM_decision_boundaries.ipynb
shankari marked this conversation as resolved.
Show resolved Hide resolved
shankari marked this conversation as resolved.
Show resolved Hide resolved
shankari marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
"import emission.storage.timeseries.abstract_timeseries as esta\n",
"import emission.storage.decorations.trip_queries as esdtq\n",
"import emission.core.get_database as edb\n",
"import emission.analysis.modelling.trip_model.run_model as eamtr\n",
"\n",
"import data_wrangling\n",
"from clustering import add_loc_clusters"
Expand Down Expand Up @@ -60,10 +61,12 @@
"uuids = [suburban_uuid, college_campus_uuid]\n",
"confirmed_trip_df_map = {}\n",
"labeled_trip_df_map = {}\n",
"ct_entry={}\n",
"expanded_trip_df_map = {}\n",
"for u in uuids:\n",
" ts = esta.TimeSeries.get_time_series(u)\n",
" ct_df = ts.get_data_df(\"analysis/confirmed_trip\")\n",
" ct_entry[u]=eamtr._get_training_data(u,None)\n",
" ct_df = ts.to_data_df(\"analysis/confirmed_trip\",ct_entry[u])\n",
" confirmed_trip_df_map[u] = ct_df\n",
" labeled_trip_df_map[u] = esdtq.filter_labeled_trips(ct_df)\n",
" expanded_trip_df_map[u] = esdtq.expand_userinputs(labeled_trip_df_map[u])"
Expand Down Expand Up @@ -110,6 +113,8 @@
" df_for_cluster = all_trips_df if cluster_unlabeled else labeled_trips_df\n",
"\n",
" df_for_cluster = add_loc_clusters(df_for_cluster,\n",
" ct_entry,\n",
" clustering_way='destination',\n",
" radii=radii,\n",
" alg=alg,\n",
" loc_type=loc_type,\n",
Expand Down
9 changes: 5 additions & 4 deletions TRB_label_assist/classification_performance.ipynb
shankari marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,14 @@
"import pandas as pd\n",
"import numpy as np\n",
"from uuid import UUID\n",
"\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# import logging\n",
"# logging.basicConfig(level=logging.DEBUG)\n",
"\n",
"import emission.storage.timeseries.abstract_timeseries as esta\n",
"import emission.storage.decorations.trip_queries as esdtq\n",
"\n",
"import emission.analysis.modelling.trip_model.run_model as eamtr\n",
"from performance_eval import get_clf_metrics, cv_for_all_algs, PREDICTORS"
]
},
Expand All @@ -49,10 +48,11 @@
"labeled_trip_df_map = {}\n",
"expanded_labeled_trip_df_map = {}\n",
"expanded_all_trip_df_map = {}\n",
"ct_entry={}\n",
"for u in all_users:\n",
" ts = esta.TimeSeries.get_time_series(u)\n",
" ct_df = ts.get_data_df(\"analysis/confirmed_trip\")\n",
"\n",
" ct_entry[u]=eamtr._get_training_data(u,None)\n",
" ct_df = ts.to_data_df(\"analysis/confirmed_trip\",ct_entry[u])\n",
" confirmed_trip_df_map[u] = ct_df\n",
" labeled_trip_df_map[u] = esdtq.filter_labeled_trips(ct_df)\n",
" expanded_labeled_trip_df_map[u] = esdtq.expand_userinputs(\n",
Expand Down Expand Up @@ -132,6 +132,7 @@
"# load in all runs\n",
"model_names = list(PREDICTORS.keys())\n",
"cv_results = cv_for_all_algs(\n",
" ct_entry,\n",
" uuid_list=all_users,\n",
" expanded_trip_df_map=expanded_labeled_trip_df_map,\n",
" model_names=model_names,\n",
Expand Down
12 changes: 8 additions & 4 deletions TRB_label_assist/cluster_performance.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,10 @@
"source": [
"%load_ext autoreload\n",
"%autoreload 2\n",
"\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"from matplotlib.gridspec import GridSpec\n",
"\n",
"import emission.analysis.modelling.trip_model.run_model as eamtr\n",
"import emission.storage.timeseries.abstract_timeseries as esta\n",
"import emission.storage.decorations.trip_queries as esdtq\n",
"import performance_eval\n",
Expand All @@ -45,10 +44,11 @@
"labeled_trip_df_map = {}\n",
"expanded_labeled_trip_df_map = {}\n",
"expanded_all_trip_df_map = {}\n",
"ct_entry={}\n",
"for u in all_users:\n",
" ts = esta.TimeSeries.get_time_series(u)\n",
" ct_df = ts.get_data_df(\"analysis/confirmed_trip\")\n",
"\n",
" ct_entry[u]=eamtr._get_training_data(u,None) \n",
" ct_df = ts.to_data_df(\"analysis/confirmed_trip\",ct_entry[u]) \n",
" confirmed_trip_df_map[u] = ct_df\n",
" labeled_trip_df_map[u] = esdtq.filter_labeled_trips(ct_df)\n",
" expanded_labeled_trip_df_map[u] = esdtq.expand_userinputs(\n",
Expand Down Expand Up @@ -87,6 +87,8 @@
"\n",
" all_results_df = performance_eval.run_eval_cluster_metrics(\n",
" expanded_labeled_trip_df_map,\n",
" ct_entry,\n",
" clustering_way='destination',\n",
" user_list=all_users,\n",
" radii=radii,\n",
" loc_type='end',\n",
Expand Down Expand Up @@ -265,6 +267,8 @@
"\n",
"SVM_results_df = performance_eval.run_eval_cluster_metrics(\n",
" expanded_labeled_trip_df_map,\n",
" ct_entry,\n",
" clustering_way=\"destination\",\n",
" user_list=all_users,\n",
" radii=radii,\n",
" loc_type='end',\n",
Expand Down
47 changes: 37 additions & 10 deletions TRB_label_assist/clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@
# our imports
# NOTE: this requires changing the branch of e-mission-server to
# eval-private-data-compatibility
import emission.analysis.modelling.tour_model_extended.similarity as eamts
import emission.storage.decorations.trip_queries as esdtq
import emission.analysis.modelling.trip_model.greedy_similarity_binning as eamtg

EARTH_RADIUS = 6371000
ALG_OPTIONS = [
Expand All @@ -28,9 +28,27 @@
'mean_shift'
]

def cleanEntryTypeData(loc_df, trip_entry):
    """
    Weed out entries from `trip_entry` that were removed from the dataframe
    by esdtq.filter_labeled_trips() and esdtq.expand_userinputs(), i.e. keep
    only the entries whose '_id' still appears in `loc_df`.

    loc_df : dataframe made from entry type data
    trip_entry : the entry type equivalent of loc_df,
        which was passed alongside the dataframe while loading the data

    Returns a list containing the surviving entries, in their original order.
    """
    # Build the id set once so each membership test is O(1), instead of
    # scanning the whole '_id' column for every entry (O(n*m) before).
    ids_in_df = set(loc_df['_id'])
    filtered_trip_entry = [entry for entry in trip_entry
                           if entry['_id'] in ids_in_df]
    return filtered_trip_entry


def add_loc_clusters(
loc_df,
trip_entry,
clustering_way,
radii,
loc_type,
alg,
Expand All @@ -53,6 +71,9 @@ def add_loc_clusters(
Args:
loc_df (dataframe): must have columns 'start_lat' and 'start_lon'
or 'end_lat' and 'end_lon'
trip_entry ( list of Entry/confirmedTrip): list consisting all entries from the
time data was loaded. loc_df was obtained from this by converting to df and
then filtering out labeled trips and expanding user_inputs
radii (int list): list of radii to run the clustering algs with
loc_type (str): 'start' or 'end'
alg (str): 'DBSCAN', 'naive', 'OPTICS', 'SVM', 'fuzzy', or
Expand Down Expand Up @@ -98,19 +119,25 @@ def add_loc_clusters(
loc_df.loc[:, f"{loc_type}_DBSCAN_clusters_{r}_m"] = labels

elif alg == 'naive':

cleaned_trip_entry= cleanEntryTypeData(loc_df,trip_entry)

for r in radii:
# this is using a modified Similarity class that bins start/end
# points separately before creating trip-level bins
sim_model = eamts.Similarity(loc_df,
radius_start=r,
radius_end=r,
shouldFilter=False,
cutoff=False)
shankari marked this conversation as resolved.
Show resolved Hide resolved
# we only bin the loc_type points to speed up the alg. avoid
# unnecessary binning since this is really slow
sim_model.bin_helper(loc_type=loc_type)
labels = sim_model.data_df[loc_type + '_bin'].to_list()

model_config = {
"metric": "od_similarity",
"similarity_threshold_meters": r, # meters,
"apply_cutoff": False,
"clustering_way": clustering_way,
shankari marked this conversation as resolved.
Show resolved Hide resolved
"shouldFilter":False,
"incremental_evaluation": False
}

sim_model = eamtg.GreedySimilarityBinning(model_config)
sim_model.fit(cleaned_trip_entry)
labels = [int(l) for l in sim_model.tripLabels]
# # pd.Categorical converts the type from int to category (so
# # numerical operations aren't possible)
# loc_df.loc[:, f"{loc_type}_{alg}_clusters_{r}_m"] = pd.Categorical(
Expand Down
21 changes: 17 additions & 4 deletions TRB_label_assist/clustering_examples.ipynb
shankari marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,11 @@
"%autoreload 2\n",
"\n",
"from uuid import UUID\n",
"\n",
"import emission.storage.timeseries.abstract_timeseries as esta\n",
"import emission.storage.decorations.trip_queries as esdtq\n",
"import emission.core.get_database as edb\n",
"\n",
"import mapping"
"import emission.analysis.modelling.trip_model.run_model as eamtr\n",
"import mapping\n"
]
},
{
Expand Down Expand Up @@ -60,9 +59,11 @@
"confirmed_trip_df_map = {}\n",
"labeled_trip_df_map = {}\n",
"expanded_trip_df_map = {}\n",
"ct_entry={}\n",
"for u in uuids:\n",
" ts = esta.TimeSeries.get_time_series(u)\n",
" ct_df = ts.get_data_df(\"analysis/confirmed_trip\")\n",
" ct_entry[u]=eamtr._get_training_data(u,None) \n",
" ct_df = ts.to_data_df(\"analysis/confirmed_trip\",ct_entry[u]) \n",
" confirmed_trip_df_map[u] = ct_df\n",
" labeled_trip_df_map[u] = esdtq.filter_labeled_trips(ct_df)\n",
" expanded_trip_df_map[u] = esdtq.expand_userinputs(labeled_trip_df_map[u])"
Expand All @@ -83,8 +84,10 @@
"outputs": [],
"source": [
"fig = mapping.find_plot_clusters(expanded_trip_df_map[suburban_uuid],\n",
" ct_entry[suburban_uuid],\n",
" alg='naive',\n",
" loc_type='end',\n",
" clustering_way=\"destination\",\n",
" plot_unlabeled=False,\n",
" cluster_unlabeled=False,\n",
" radii=[50, 100, 150])\n",
Expand All @@ -98,8 +101,10 @@
"outputs": [],
"source": [
"fig = mapping.find_plot_clusters(expanded_trip_df_map[college_campus_uuid],\n",
" ct_entry[college_campus_uuid],\n",
" alg='naive',\n",
" loc_type='end',\n",
" clustering_way=\"destination\",\n",
" plot_unlabeled=False,\n",
" cluster_unlabeled=False,\n",
" radii=[50, 100, 150])\n",
Expand All @@ -121,9 +126,11 @@
"outputs": [],
"source": [
"fig = mapping.find_plot_clusters(expanded_trip_df_map[suburban_uuid],\n",
" ct_entry[suburban_uuid],\n",
" alg='DBSCAN',\n",
" SVM=False,\n",
" loc_type='end',\n",
" clustering_way=\"destination\",\n",
" plot_unlabeled=False,\n",
" cluster_unlabeled=False,\n",
" radii=[50, 100, 150, 200])\n",
Expand All @@ -137,9 +144,11 @@
"outputs": [],
"source": [
"fig = mapping.find_plot_clusters(expanded_trip_df_map[college_campus_uuid],\n",
" ct_entry[college_campus_uuid],\n",
" alg='DBSCAN',\n",
" SVM=False,\n",
" loc_type='end',\n",
" clustering_way=\"destination\",\n",
" plot_unlabeled=False,\n",
" cluster_unlabeled=False,\n",
" radii=[50, 100, 150, 200])\n",
Expand All @@ -161,9 +170,11 @@
"outputs": [],
"source": [
"fig = mapping.find_plot_clusters(expanded_trip_df_map[suburban_uuid],\n",
" ct_entry[suburban_uuid],\n",
" alg='DBSCAN',\n",
" SVM=True,\n",
" loc_type='end',\n",
" clustering_way=\"destination\",\n",
" plot_unlabeled=False,\n",
" cluster_unlabeled=False,\n",
" radii=[50, 100, 150, 200])\n",
Expand All @@ -177,9 +188,11 @@
"outputs": [],
"source": [
"fig = mapping.find_plot_clusters(expanded_trip_df_map[college_campus_uuid],\n",
" ct_entry[college_campus_uuid],\n",
" alg='DBSCAN',\n",
" SVM=True,\n",
" loc_type='end',\n",
" clustering_way=\"destination\",\n",
" plot_unlabeled=False,\n",
" cluster_unlabeled=False,\n",
" radii=[50, 100, 150, 200])\n",
Expand Down
21 changes: 17 additions & 4 deletions TRB_label_assist/generate_figs_for_poster.ipynb
shankari marked this conversation as resolved.
Show resolved Hide resolved
shankari marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -29,15 +29,14 @@
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import matplotlib\n",
"\n",
"from sklearn.pipeline import make_pipeline\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn import svm\n",
"\n",
"import emission.storage.timeseries.abstract_timeseries as esta\n",
"import emission.storage.decorations.trip_queries as esdtq\n",
"import emission.core.get_database as edb\n",
"\n",
"import emission.analysis.modelling.trip_model.run_model as eamtr\n",
"import mapping\n",
"import data_wrangling\n",
"from clustering import add_loc_clusters"
Expand Down Expand Up @@ -67,9 +66,11 @@
"confirmed_trip_df_map = {}\n",
"labeled_trip_df_map = {}\n",
"expanded_trip_df_map = {}\n",
"ct_entry={}\n",
"for u in uuids:\n",
" ts = esta.TimeSeries.get_time_series(u)\n",
" ct_df = ts.get_data_df(\"analysis/confirmed_trip\")\n",
" ct_entry[u]=eamtr._get_training_data(u,None) \n",
" ct_df = ts.to_data_df(\"analysis/confirmed_trip\",ct_entry[u]) \n",
" confirmed_trip_df_map[u] = ct_df\n",
" labeled_trip_df_map[u] = esdtq.filter_labeled_trips(ct_df)\n",
" expanded_trip_df_map[u] = esdtq.expand_userinputs(labeled_trip_df_map[u])"
Expand Down Expand Up @@ -98,8 +99,10 @@
"outputs": [],
"source": [
"fig = mapping.find_plot_clusters(expanded_trip_df_map[user1_uuid],\n",
" ct_entry[user1_uuid],\n",
" alg='naive',\n",
" loc_type='end',\n",
" clustering_way='destination',\n",
" plot_unlabeled=False,\n",
" cluster_unlabeled=False,\n",
" radii=[50, 100, 150])\n",
Expand Down Expand Up @@ -137,9 +140,11 @@
"outputs": [],
"source": [
"fig = mapping.find_plot_clusters(expanded_trip_df_map[user2_uuid],\n",
" ct_entry[user2_uuid],\n",
" alg='DBSCAN',\n",
" SVM=False,\n",
" loc_type='end',\n",
" clustering_way='destination',\n",
" plot_unlabeled=False,\n",
" cluster_unlabeled=False,\n",
" radii=[150])\n",
Expand All @@ -161,9 +166,11 @@
"outputs": [],
"source": [
"fig = mapping.find_plot_clusters(expanded_trip_df_map[user2_uuid],\n",
" ct_entry[user2_uuid],\n",
" alg='DBSCAN',\n",
" SVM=True,\n",
" loc_type='end',\n",
" clustering_way='destination',\n",
" plot_unlabeled=False,\n",
" cluster_unlabeled=False,\n",
" radii=[150])\n",
Expand Down Expand Up @@ -289,8 +296,14 @@
"\n",
" labeled_trips_df = all_trips_df.loc[all_trips_df.user_input != {}]\n",
" df_for_cluster = all_trips_df if cluster_unlabeled else labeled_trips_df\n",
"\n",
" if loc_type=='start':\n",
" clustering_way='origin'\n",
" else:\n",
" clustering_way='destination'\n",
" \n",
" df_for_cluster = add_loc_clusters(df_for_cluster,\n",
" ct_entry,\n",
" clustering_way=clustering_way,\n",
" radii=radii,\n",
" alg=alg,\n",
" loc_type=loc_type,\n",
Expand Down
Loading