Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update clustering.py #37

Merged
merged 21 commits into from
Nov 25, 2023
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
431b33d
Update clustering.py
humbleOldSage Aug 11, 2023
36065b4
moving clustering_examples.ipynb to trip_model
humbleOldSage Aug 16, 2023
97406c4
Removing changes in builtimeseries.py
humbleOldSage Aug 16, 2023
88988d3
Changes to support TRB_Label_Assist
humbleOldSage Aug 20, 2023
3e19b32
suggestions
humbleOldSage Aug 20, 2023
0899ee4
Revert "suggestions"
humbleOldSage Aug 20, 2023
667ab24
Improving readability
humbleOldSage Aug 20, 2023
e2448c5
making `cluster_performance.ipynb`, `generate_figs_for_poster` and `…
humbleOldSage Aug 22, 2023
59c7c64
Unified Interface for fit function
humbleOldSage Aug 26, 2023
a34836f
Fixing `models.py` to support `regenerate_classification_performance_…
humbleOldSage Aug 30, 2023
7eefdb0
[PARTIALLY TESTED] Single database read and Code Cleanuo
humbleOldSage Sep 14, 2023
6a641db
Delete TRB_label_assist/first_trial_results/cv results DBSCAN+SVM (de…
humbleOldSage Sep 18, 2023
3a8bdc0
Reverting Notebook
humbleOldSage Sep 18, 2023
7606d3d
[Partially Tested]Handled Whitespaces
humbleOldSage Sep 19, 2023
bb404e9
[Partially Tested] Suggested changes implemented
humbleOldSage Nov 7, 2023
97475ef
Revert "[Partially Tested] Suggested changes implemented"
humbleOldSage Nov 7, 2023
452e454
[Partially Tested] Suggested changes implemented
humbleOldSage Nov 7, 2023
2a39b12
Minor variable fixes
humbleOldSage Nov 10, 2023
e0beb0e
[TESTED] All the notebooks and files are tested
humbleOldSage Nov 16, 2023
c8c3883
Minor Fixes
humbleOldSage Nov 22, 2023
9225572
Minor Fixes in models.py
humbleOldSage Nov 24, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 4 additions & 5 deletions TRB_label_assist/SVM_decision_boundaries.ipynb
shankari marked this conversation as resolved.
Show resolved Hide resolved
shankari marked this conversation as resolved.
Show resolved Hide resolved
shankari marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import matplotlib\n",
"import itertools\n",
"\n",
"from sklearn.pipeline import make_pipeline\n",
"from sklearn.preprocessing import StandardScaler\n",
Expand Down Expand Up @@ -67,7 +66,7 @@
"for u in uuids:\n",
" ts = esta.TimeSeries.get_time_series(u)\n",
" ct_entry[u]=eamtr._get_training_data(u,None)\n",
" ct_df = ts.to_data_df(\"analysis/confirmed_trip\",itertools.chain(ct_entry[u]))\n",
" ct_df = ts.to_data_df(\"analysis/confirmed_trip\",ct_entry[u])\n",
" confirmed_trip_df_map[u] = ct_df\n",
" labeled_trip_df_map[u] = esdtq.filter_labeled_trips(ct_df)\n",
" expanded_trip_df_map[u] = esdtq.expand_userinputs(labeled_trip_df_map[u])"
Expand Down Expand Up @@ -270,9 +269,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "emission-private-eval",
shankari marked this conversation as resolved.
Show resolved Hide resolved
"display_name": "Python3",
"language": "python",
"name": "emission-private-eval"
"name": "Python3"
},
"language_info": {
"codemirror_mode": {
Expand All @@ -284,7 +283,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
shankari marked this conversation as resolved.
Show resolved Hide resolved
"version": "3.9.6"
},
"orig_nbformat": 4,
"vscode": {
Expand Down
3 changes: 1 addition & 2 deletions TRB_label_assist/classification_performance.ipynb
shankari marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
"import pandas as pd\n",
"import numpy as np\n",
"from uuid import UUID\n",
"import itertools\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# import logging\n",
Expand Down Expand Up @@ -53,7 +52,7 @@
"for u in all_users:\n",
" ts = esta.TimeSeries.get_time_series(u)\n",
" ct_entry[u]=eamtr._get_training_data(u,None)\n",
" ct_df = ts.to_data_df(\"analysis/confirmed_trip\",itertools.chain(ct_entry[u]))\n",
" ct_df = ts.to_data_df(\"analysis/confirmed_trip\",ct_entry[u])\n",
" confirmed_trip_df_map[u] = ct_df\n",
" labeled_trip_df_map[u] = esdtq.filter_labeled_trips(ct_df)\n",
" expanded_labeled_trip_df_map[u] = esdtq.expand_userinputs(\n",
Expand Down
3 changes: 1 addition & 2 deletions TRB_label_assist/cluster_performance.ipynb
shankari marked this conversation as resolved.
Show resolved Hide resolved
shankari marked this conversation as resolved.
Show resolved Hide resolved
shankari marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
"source": [
"%load_ext autoreload\n",
"%autoreload 2\n",
"import itertools\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"from matplotlib.gridspec import GridSpec\n",
Expand Down Expand Up @@ -49,7 +48,7 @@
"for u in all_users:\n",
" ts = esta.TimeSeries.get_time_series(u)\n",
" ct_entry[u]=eamtr._get_training_data(u,None) \n",
" ct_df = ts.to_data_df(\"analysis/confirmed_trip\",itertools.chain(ct_entry[u])) \n",
" ct_df = ts.to_data_df(\"analysis/confirmed_trip\",ct_entry[u]) \n",
" confirmed_trip_df_map[u] = ct_df\n",
" labeled_trip_df_map[u] = esdtq.filter_labeled_trips(ct_df)\n",
" expanded_labeled_trip_df_map[u] = esdtq.expand_userinputs(\n",
Expand Down
1 change: 1 addition & 0 deletions TRB_label_assist/clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ def add_loc_clusters(
"similarity_threshold_meters": r, # meters,
"apply_cutoff": False,
"clustering_way": clustering_way,
shankari marked this conversation as resolved.
Show resolved Hide resolved
"shouldFilter":False,
"incremental_evaluation": False
}

Expand Down
35 changes: 18 additions & 17 deletions TRB_label_assist/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,13 +121,13 @@ class Cluster(SetupMixin, metaclass=ABCMeta):
""" blueprint for clustering models. """

@abstractmethod
def fit(self, train_df,ct_entry=None):
def fit(self, train_df,unused=None):
""" Fit the clustering algorithm.

Args:
train_df (DataFrame): dataframe of labeled trips
ct_entry (List) : A list of Entry type of labeled and unlabeled trips

unused (List) : A list of Entry type of labeled and unlabeled trips which is not used in current function.
shankari marked this conversation as resolved.
Show resolved Hide resolved
Passed to keep fit function generic.
Returns:
self
"""
Expand Down Expand Up @@ -165,13 +165,13 @@ def fit_predict(self, train_df):
class TripClassifier(SetupMixin, metaclass=ABCMeta):

@abstractmethod
def fit(self, train_df,ct_entry=None):
def fit(self, train_df,unused=None):
""" Fit a classification model.

Args:
train_df (DataFrame): dataframe of labeled trips
ct_entry (List) : A list of Entry type of labeled and unlabeled trips

unused (List) : A list of Entry type of labeled and unlabeled trips which is not used in current function.
shankari marked this conversation as resolved.
Show resolved Hide resolved
Passed to keep fit function generic.
Returns:
self
"""
Expand Down Expand Up @@ -300,7 +300,7 @@ def set_params(self, params):

return self

def fit(self, train_df,ct_entry=None):
shankari marked this conversation as resolved.
Show resolved Hide resolved
def fit(self, train_df,ct_entry_list=None):
# clean data
logging.info("PERF: Fitting RefactoredNaiveCluster with size %s" % len(train_df))
self.train_df = self._clean_data(train_df)
Expand Down Expand Up @@ -335,7 +335,7 @@ def fit(self, train_df,ct_entry=None):

# fit the bins
self.sim_model= eamtg.GreedySimilarityBinning(model_config)
cleaned_trip_entry= clustering.cleanEntryTypeData(self.train_df,ct_entry)
cleaned_trip_entry= clustering.cleanEntryTypeData(self.train_df,ct_entry_list)
self.sim_model.fit(cleaned_trip_entry)

labels = [int(l) for l in self.sim_model.tripLabels]
Expand All @@ -351,6 +351,7 @@ def predict(self, test_df):
elif self.loc_type == 'end':
bins = self.sim_model.bins

bins = {int(key):value for key,value in bins.items()}
shankari marked this conversation as resolved.
Show resolved Hide resolved
labels = []

# for each trip in the test list:
Expand Down Expand Up @@ -380,7 +381,7 @@ def _match(self, trip, bin, loc_type):
copied from the Similarity class on the e-mission-server.
"""
for t_idx in bin:
trip_in_bin = self.train_df.iloc[int(t_idx)]
trip_in_bin = self.train_df.iloc[t_idx]
if not self._distance_helper(trip, trip_in_bin, loc_type):
return False
return True
Expand Down Expand Up @@ -457,7 +458,7 @@ def set_params(self, params):

return self

def fit(self, train_df,ct_entry=None):
def fit(self, train_df,unused=None):
""" Creates clusters of trip points.
self.train_df will be updated with columns containing base and
final clusters.
Expand All @@ -468,8 +469,8 @@ def fit(self, train_df,ct_entry=None):

Args:
train_df (dataframe): dataframe of labeled trips
ct_entry (List) : A list of Entry type of labeled and unlabeled trips
"""
unused (List) : A list of Entry type of labeled and unlabeled trips which is not used in current function.
Passed to keep fit function generic. """
##################
### clean data ###
##################
Expand Down Expand Up @@ -662,7 +663,7 @@ def set_params(self, params):

return self

def fit(self, train_df,ct_entry=None):
def fit(self, train_df,unused=None):
logging.info("PERF: Fitting NaiveBinningClassifier")
# (copied from bsm.build_user_model())

Expand Down Expand Up @@ -902,13 +903,13 @@ def set_params(self, params):

return self

def fit(self, train_df,ct_entry=None):
def fit(self, train_df,ct_entry_list=None):
# fit clustering model
self.end_cluster_model.fit(train_df,ct_entry)
self.end_cluster_model.fit(train_df,ct_entry_list)
self.train_df = self.end_cluster_model.train_df

if self.cluster_method in ['trip', 'combination']:
self.start_cluster_model.fit(train_df,ct_entry)
self.start_cluster_model.fit(train_df,ct_entry_list)
self.train_df.loc[:, ['start_cluster_idx'
]] = self.start_cluster_model.train_df[[
'start_cluster_idx'
Expand Down Expand Up @@ -1071,7 +1072,7 @@ class EnsembleClassifier(TripClassifier, metaclass=ABCMeta):
replaced_predictor = NotImplemented

# required methods
def fit(self, train_df,ct_entry=None):
def fit(self, train_df,unused=None):
# get location features
if self.loc_feature == 'cluster':
# fit clustering model(s) and one-hot encode their indices
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import pandas as pd
import numpy as np
from uuid import UUID
import itertools
import emission.storage.timeseries.abstract_timeseries as esta
import emission.storage.decorations.trip_queries as esdtq
import emission.analysis.modelling.trip_model.run_model as eamtr
Expand All @@ -20,7 +19,7 @@
for u in all_users:
ts = esta.TimeSeries.get_time_series(u)
ct_entry[u]=eamtr._get_training_data(u,None)
ct_df = ts.to_data_df("analysis/confirmed_trip",itertools.chain(ct_entry[u]))
ct_df = ts.to_data_df("analysis/confirmed_trip",ct_entry[u])
confirmed_trip_df_map[u] = ct_df
labeled_trip_df_map[u] = esdtq.filter_labeled_trips(ct_df)
expanded_labeled_trip_df_map[u] = esdtq.expand_userinputs(
Expand Down