From a79d4fa09221336650002828af913e60029ab320 Mon Sep 17 00:00:00 2001
From: Chunrui Huang <corinnehcr@gmail.com>
Date: Tue, 29 Jun 2021 18:17:48 -0700
Subject: [PATCH 1/6] check unit test code

---
 .../tour_model/data_preprocessing.py          | 58 ++++++++++++++
 .../modelling/tour_model/get_users.py         | 31 +++++++
 .../clusteringTests/TestDataPreprocessing.py  | 80 +++++++++++++++++++
 .../clusteringTests/TestGetUsers.py           | 34 ++++++++
 4 files changed, 203 insertions(+)
 create mode 100644 emission/analysis/modelling/tour_model/data_preprocessing.py
 create mode 100644 emission/analysis/modelling/tour_model/get_users.py
 create mode 100644 emission/tests/analysisTests/clusteringTests/TestDataPreprocessing.py
 create mode 100644 emission/tests/analysisTests/clusteringTests/TestGetUsers.py

diff --git a/emission/analysis/modelling/tour_model/data_preprocessing.py b/emission/analysis/modelling/tour_model/data_preprocessing.py
new file mode 100644
index 000000000..23100c544
--- /dev/null
+++ b/emission/analysis/modelling/tour_model/data_preprocessing.py
@@ -0,0 +1,58 @@
+import emission.storage.decorations.analysis_timeseries_queries as esda
+import emission.analysis.modelling.tour_model.cluster_pipeline as pipeline
+import emission.analysis.modelling.tour_model.similarity as similarity
+import pandas as pd
+from sklearn.model_selection import KFold
+
+
+# read data that have user labels
+def read_data(user):
+    trips = pipeline.read_data(uuid=user, key=esda.CONFIRMED_TRIP_KEY)
+    return trips
+
+
+# - trips: all trips read from database
+# - filter_trips: valid trips that have user labels and are not points
+def filter_data(trips,radius):
+    non_empty_trips = [t for t in trips if t["data"]["user_input"] != {}]
+    non_empty_trips_df = pd.DataFrame(t["data"]["user_input"] for t in non_empty_trips)
+    valid_trips_df = non_empty_trips_df.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)
+    valid_trips_idx_ls = valid_trips_df.index.tolist()
+    valid_trips = [non_empty_trips[i]for i in valid_trips_idx_ls]
+
+    # similarity codes can filter out trips that are points in valid_trips
+    filter_trips = similarity.filter_too_short(valid_trips, radius)
+    return filter_trips
+
+
+# use KFold (n_splits=5) to split the data into 5 models (5 training sets, 5 test sets)
+def extract_features(filter_trips):
+    X = []
+    for trip in filter_trips:
+        start = trip.data.start_loc["coordinates"]
+        end = trip.data.end_loc["coordinates"]
+        distance = trip.data.distance
+        duration = trip.data.duration
+        X.append([start[0], start[1], end[0], end[1], distance, duration])
+    return X
+
+def split_data(filter_trips):
+    X = extract_features(filter_trips)
+    kf = KFold(n_splits=5, shuffle=True, random_state=3)
+    train_idx = []
+    test_idx = []
+    for train_index, test_index in kf.split(X):
+        train_idx.append(train_index)
+        test_idx.append(test_index)
+    return train_idx, test_idx
+
+
+# collect a set of data(training/test set) after splitting
+def get_subdata(filter_trips,train_test_set):
+    collect_sub_data = []
+    for train_test_subset in train_test_set:
+        sub_data = []
+        for idx in train_test_subset:
+            sub_data.append(filter_trips[idx])
+        collect_sub_data.append(sub_data)
+    return collect_sub_data
diff --git a/emission/analysis/modelling/tour_model/get_users.py b/emission/analysis/modelling/tour_model/get_users.py
new file mode 100644
index 000000000..fc540b4aa
--- /dev/null
+++ b/emission/analysis/modelling/tour_model/get_users.py
@@ -0,0 +1,31 @@
+import emission.analysis.modelling.tour_model.data_preprocessing as preprocess
+
+
+# to determine if the user is valid:
+# valid user should have >= 10 trips for further analysis and the proportion of filter_trips is >=50%
+def valid_user(filter_trips,trips):
+    valid = False
+    if len(filter_trips) >= 10 and len(filter_trips) / len(trips) >= 0.5:
+        valid = True
+    return valid
+
+
+# - user_ls: a list of strings representing short user names, such as [user1, user2, user3...]
+# - valid_user_ls: a subset of `user_ls` for valid users, so also string representation of user names
+# - all_users: a collection of all user ids, in terms of user id objects
+def get_user_ls(all_users,radius):
+    user_ls = []
+    valid_user_ls = []
+    for i in range(len(all_users)):
+        curr_user = 'user' + str(i + 1)
+        user = all_users[i]
+        trips = preprocess.read_data(user)
+        filter_trips = preprocess.filter_data(trips,radius)
+        if valid_user(filter_trips,trips):
+            valid_user_ls.append(curr_user)
+            user_ls.append(curr_user)
+        else:
+            user_ls.append(curr_user)
+            continue
+    return user_ls,valid_user_ls
+
diff --git a/emission/tests/analysisTests/clusteringTests/TestDataPreprocessing.py b/emission/tests/analysisTests/clusteringTests/TestDataPreprocessing.py
new file mode 100644
index 000000000..7f9a0e8c3
--- /dev/null
+++ b/emission/tests/analysisTests/clusteringTests/TestDataPreprocessing.py
@@ -0,0 +1,80 @@
+import emission.core.wrapper.localdate as ecwl
+import emission.analysis.modelling.tour_model.data_preprocessing as preprocess
+
+from future import standard_library
+standard_library.install_aliases()
+from builtins import *
+import unittest
+import json
+import bson.json_util as bju
+import emission.storage.timeseries.abstract_timeseries as esta
+
+import emission.tests.common as etc
+
+
+class TestDataPreprocessing(unittest.TestCase):
+
+    # should setup user = [self.testUUID], radius = 100
+    # do we need teardown if we don't use databse?
+
+
+    def test_read_data(self):
+        dataFile = "emission/tests/data/real_examples/shankari_2016-06-20"
+        ld = ecwl.LocalDate({'year': 2016, 'month': 6, 'day': 20})
+        with open(dataFile+".ground_truth") as gfp:
+            ground_truth = json.load(gfp, object_hook=bju.object_hook)
+
+        etc.setupRealExample(self, dataFile)
+        # if (not preload):
+        self.entries = json.load(open(dataFile+".user_inputs"), object_hook = bju.object_hook)
+        etc.setupRealExampleWithEntries(self)
+        etc.runIntakePipeline(self.testUUID)
+        ts = esta.TimeSeries.get_time_series(self.testUUID)
+        confirmed_trips = list(ts.find_entries(["analysis/confirmed_trip"], None))
+        with open(dataFile+".expected_confirmed_trips") as dect:
+            expected_confirmed_trips = json.load(dect, object_hook = bju.object_hook)
+        print('confirmed_trips',confirmed_trips)
+        user = [self.testUUID]
+        trips = preprocess.read_data(user)
+        print('trips ', trips)
+        # I don't know how to assertEqual here
+
+
+    def test_filter_data(self):
+        radius = 100
+        # - trips: should be read from a file or from database
+        user = [self.testUUID]
+        trips = preprocess.read_data(user)
+        filter_trips = preprocess.filter_data(trips,radius)
+        # assertEqual
+
+    def test_extract_features(self):
+        user = [self.testUUID]
+        radius = 100
+        trips = preprocess.read_data(user)
+        filter_trips = preprocess.filter_data(trips,radius)
+        X = preprocess.extract_features(filter_trips)
+        # assertEqual
+
+    def test_split_data(self):
+        user = [self.testUUID]
+        radius = 100
+        trips = preprocess.read_data(user)
+        filter_trips = preprocess.filter_data(trips,radius)
+        train_idx, test_idx = preprocess.split_data(filter_trips)
+        # assertEqual
+
+    def test_get_subdata(self):
+        user = [self.testUUID]
+        radius = 100
+        trips = preprocess.read_data(user)
+        filter_trips = preprocess.filter_data(trips,radius)
+        train_set_idx = [0,1,2,3,4]
+        collect_sub_data = preprocess.get_subdata(filter_trips, train_set_idx)
+        # assertEqual
+
+
+if __name__ == '__main__':
+    etc.configLogging()
+    unittest.main()
+
diff --git a/emission/tests/analysisTests/clusteringTests/TestGetUsers.py b/emission/tests/analysisTests/clusteringTests/TestGetUsers.py
new file mode 100644
index 000000000..6283f8b07
--- /dev/null
+++ b/emission/tests/analysisTests/clusteringTests/TestGetUsers.py
@@ -0,0 +1,34 @@
+from future import standard_library
+standard_library.install_aliases()
+import unittest
+import emission.analysis.modelling.tour_model.get_users as gu
+import emission.analysis.modelling.tour_model.data_preprocessing as preprocess
+
+import emission.tests.common as etc
+
+class TestGetUsers(unittest.TestCase):
+    # def setUp(self):
+    #
+    # def tearDown(self):
+
+
+    def test_valid_user(self):
+        user = [self.testUUID]
+        radius = 100
+        trips = preprocess.read_data(user)
+        filter_trips = preprocess.filter_data(trips,radius)
+        valid = gu.valid_user(filter_trips, trips)
+        # assertEqual
+
+
+    def test_get_user_ls(self):
+        all_users = [self.testUUID]
+        radius = 100
+        user_ls,valid_user_ls = gu.get_user_ls(all_users, radius)
+        # assertEqual
+
+if __name__ == '__main__':
+    etc.configLogging()
+    unittest.main()
+
+

From 8de7c77d91df2858eb626f24c4abf05f91e9a2af Mon Sep 17 00:00:00 2001
From: Chunrui Huang <corinnehcr@gmail.com>
Date: Fri, 9 Jul 2021 11:34:17 -0700
Subject: [PATCH 2/6] not yet refactored, just for testing notebook codes

---
 .../tour_model/get_request_percentage.py      | 135 ++++++++++++++++++
 .../modelling/tour_model/get_scores.py        |  83 +++++++++++
 .../modelling/tour_model/label_processing.py  | 133 +++++++++++++++++
 3 files changed, 351 insertions(+)
 create mode 100644 emission/analysis/modelling/tour_model/get_request_percentage.py
 create mode 100644 emission/analysis/modelling/tour_model/get_scores.py
 create mode 100644 emission/analysis/modelling/tour_model/label_processing.py

diff --git a/emission/analysis/modelling/tour_model/get_request_percentage.py b/emission/analysis/modelling/tour_model/get_request_percentage.py
new file mode 100644
index 000000000..7e5f8639b
--- /dev/null
+++ b/emission/analysis/modelling/tour_model/get_request_percentage.py
@@ -0,0 +1,135 @@
+import label_processing as label_pro
+import copy
+import itertools
+
+
+# This function is to compare a trip with a group of trips to see if they happened in a same day
+def match_day(trip,bin,filter_trips):
+    if bin:
+        t = filter_trips[bin[0]]
+        if trip['data']['start_local_dt']['year']==t['data']['start_local_dt']['year']\
+                and trip['data']['start_local_dt']['month']==t['data']['start_local_dt']['month']\
+                and trip['data']['start_local_dt']['day']==t['data']['start_local_dt']['day']:
+            return True
+    return False
+
+
+# This function is to compare a trip with a group of trips to see if they happened in a same month
+def match_month(trip,bin,filter_trips):
+    if bin:
+        t = filter_trips[bin[0]]
+        if trip['data']['start_local_dt']['year']==t['data']['start_local_dt']['year']\
+                and trip['data']['start_local_dt']['month']==t['data']['start_local_dt']['month']:
+            return True
+    return False
+
+
+# This function bins trips according to ['start_local_dt']
+def bin_date(trip_ls,filter_trips,day=None,month=None):
+    bin_date = []
+    for trip_index in trip_ls:
+        added = False
+        trip = filter_trips[trip_index]
+
+        for bin in bin_date:
+            if day:
+                if match_day(trip,bin,filter_trips):
+                    bin.append(trip_index)
+                    added = True
+                    break
+            if month:
+                if match_month(trip,bin,filter_trips):
+                    bin.append(trip_index)
+                    added = True
+                    break
+
+        if not added:
+            bin_date.append([trip_index])
+
+    return bin_date
+
+
+def find_first_trip(filter_trips,bin):
+    trip_ts = [filter_trips[trip_idx]['data']["start_ts"] for trip_idx in bin]
+    # - early_idx_in_bin: the earliest trip position in the bin
+    # ts = [20,10,40,5,100]
+    # early_idx_in_bin = 3
+    # early trip_index = 5
+    early_idx_in_bin = trip_ts.index(min(trip_ts))
+    # - early_trip_index: the original index of the earliest trip
+    early_trip_index = bin[early_idx_in_bin]
+    return early_trip_index
+
+
+# collect requested trips and common trips(no need to request) indices above cutoff
+def requested_trips_ab_cutoff(new_bins, filter_trips):
+    # collect requested trip indices above cutoff
+    ab_trip_ls = []
+    # collect common trip indices above cutoff
+    no_req_trip_ls = []
+    for bin in new_bins:
+        early_trip_index = find_first_trip(filter_trips, bin)
+        ab_trip_ls.append(early_trip_index)
+
+        # The following loop collects the original indices of the rest of the trips in the bin. Since they are not the
+        # earliest one, we don't need to request for user labels
+        # >>> x = [100,200,300]
+        # >>> x.remove(100); x
+        # [200, 300]
+        no_req_trip_subls = copy.copy(bin)
+        no_req_trip_subls.remove(early_trip_index)
+        # >>> x = [1,2,3]
+        # >>> x.extend([4,5,6]); x
+        # [1, 2, 3, 4, 5, 6]
+        no_req_trip_ls.extend(no_req_trip_subls)
+    return ab_trip_ls, no_req_trip_ls
+
+
+# collect requested trips indices below cutoff
+def requested_trips_bl_cutoff(sim):
+    # bins below cutoff
+    bl_bins = sim.below_cutoff
+
+    # collect requested trips indices below cutoff
+    # effectively, bl_trip_ls = flatten(bl_bins)
+    # >>> bl_bins = [[1,2],[3,4],[5,6]]
+    # >>> bl_trip_ls = [item for sublist in bl_bins for item in sublist]
+    # >>> bl_trip_ls
+    # [1, 2, 3, 4, 5, 6]
+    # the reason for flattening: we need to have a whole flatten list of requested trips, then compute the percentage
+    bl_trip_ls = [item for sublist in bl_bins for item in sublist]
+    return bl_trip_ls
+
+
+# a list of all requested trips indices
+# - filter_trips: we need to use timestamp in filter_trips here,
+# in requested_trips_ab_cutoff, we need to get the first trip of the bin,
+# and we need to collect original trip indices from filter_trips
+# - sim: we need to use code in similarity to find trips below cutoff
+# Since the indices from similarity code are original (trips below cutoff),
+# we need to have original indices of all requested trips,
+# so we use filter_trips for finding the requested common trips
+# new_bins: bins that have original indices of similar trips. They only represent common trips
+def get_requested_trips(new_bins,filter_trips,sim):
+    ab_trip_ls,no_req_trip_ls = requested_trips_ab_cutoff(new_bins,filter_trips)
+    bl_trip_ls = requested_trips_bl_cutoff(sim)
+    req_trips_ls = ab_trip_ls+bl_trip_ls
+    return req_trips_ls
+
+
+# get request percentage based on the number of requested trips and the total number of trips
+def get_req_pct(new_labels,track,filter_trips,sim):
+    # - new_bins: bins with original indices of similar trips from common trips
+    # - new_label: For the first round, new_label is the copy of the first round labels, e.g. [1,1,1,2,2,2].
+    # For the second round, new_label is that the first round label concatenate the second round label.
+    # e.g.the label from the second round is [1,2,1,2,3,3], new_label will turn to [11,12,11,22,23,23]
+    # - track: at this point, each item in the track contains the original index of a trip,
+    # and the latest label of it. e.g. [ori_idx, latest_label]
+    # concretely, please look at "group_similar_trips" function in label_processing.py
+    # If new_label is [11,12,11,22,23,23] and the original indices of the trips is [1,2,3,4,5,6],
+    # new_bins will be [[1,3],[2],[4],[5,6]]
+    new_bins = label_pro.group_similar_trips(new_labels,track)
+    req_trips = get_requested_trips(new_bins,filter_trips,sim)
+    pct = len(req_trips)/len(filter_trips)
+    pct = float('%.3f' % pct)
+    return pct
diff --git a/emission/analysis/modelling/tour_model/get_scores.py b/emission/analysis/modelling/tour_model/get_scores.py
new file mode 100644
index 000000000..eba0ba88e
--- /dev/null
+++ b/emission/analysis/modelling/tour_model/get_scores.py
@@ -0,0 +1,83 @@
+import pandas as pd
+import pandas.testing as pdt
+import label_processing as label_pro
+import sklearn.metrics as skm
+import itertools
+
+
+# compare the trip orders in bin_trips with those in filter_trips above cutoff
+def compare_trip_orders(bins,bin_trips,filter_trips):
+    bin_trips_ts = pd.DataFrame(data=[trip["data"]["start_ts"] for trip in bin_trips])
+    bin_ls = list(itertools.chain(*bins))
+    bins_ts = pd.DataFrame(data=[filter_trips[i]["data"]["start_ts"] for i in bin_ls])
+    # compare two data frames, the program will continue to score calculation if two data frames are the same
+    pdt.assert_frame_equal(bins_ts, bin_trips_ts)
+
+
+# This function is to get homogeneity score after the first/second round of clustering
+# It is based on bin_trips, which are common trips. bin_trips are collected according to the indices of the trips
+# in bins above cutoff
+# More info about bin_trips is in similarity.py (delete_bins)
+# The homogeneity score reflects the degree to which a cluster consists only of trips with similar ground truthed labels.
+# In the following examples, "A","B","C" are user labels.
+# The labels can be drawn from different sets as long as the mapping is unique (e.g. ["A", "A", "C"] matches perfectly
+# with [0,0,1]). 
+# Ideally, there would be 1:1 mapping between labels and clusters - e.g. ["A", "A", "A"] maps to [1,1,1]
+# This can break in two ways:
+# user label A maps to different clusters - e.g. ["A", "A", "A"] maps to [1,2,3].
+# In this case, the homogeneity score will still be 1.0, since each cluster only has label "A".
+# For our problem, this would typically map to the use case where trips with same user labels are actually to different 
+# destinations. For `medical` or `personal` locations, for example, users could actually go to multiple medical 
+# facilities or friends' houses. In this case, the trips will be in different clusters, but since the destinations are in 
+# fact different, this would actually be the correct behavior.
+# The trips could also be to the same location, but be clustered differently due to minor variations in duration or 
+# distance (maybe due to traffic conditions). This could result in multiple clusters for what is essentially the same 
+# trip. We capture this difference through the request percentage metric, which will result in three queries for 
+# [1,2,3] and only one for [1,1,1]
+# two different labels map to the same cluster - e.g. ["A", "A", "B"] maps to [1,1,1]. This is the case captured by the
+# homogeneity score, which will be less than 1.0 (0 representes inhomogeneous, 1.0 represents homogeneous).
+# This maps well to our use case because in this case, assigning the same label to all trips in the cluster would
+# be incorrect. In particular, if we did not have the ground truth, the third trip would be labeled "A", 
+# which would lower the accuracy.
+# At this point, we didn't make user_input have same labels for labels_true and labels_pred.
+# For example, in the second round, user labels are [("home", "ebike", "bus"),("home", "walk", "bus"),
+# ("home", "ebike", "bus")], the labels_pred can be [0,1,0], or [1,0,1] or represented by other numeric labels.
+def score(bin_trips, labels_pred):
+    bin_trips_user_input_df = pd.DataFrame(data=[trip["data"]["user_input"] for trip in bin_trips])
+    bin_trips_user_input_df = label_pro.map_labels(bin_trips_user_input_df)
+
+    # turn all user_input into list without binning
+    bin_trips_user_input_ls = bin_trips_user_input_df.values.tolist()
+    # drop duplicate user_input
+    no_dup_df = bin_trips_user_input_df.drop_duplicates()
+    # turn non-duplicate user_input into list
+    no_dup_list = no_dup_df.values.tolist()
+
+    # collect labels_true based on user_input
+    # To compute labels_true, we need to find out non-duplicate user labels, and use the index of the unique user label
+    # to label the whole trips
+    # If user labels are [(purpose, confirmed_mode, replaced_mode)]
+    # e.g.,[("home","ebike","bus"),("work","walk","bike"),("home","ebike","bus"),("home","ebike","bus"),
+    # ("work","walk","bike"),("exercise","ebike","walk")],
+    # the unique label list is [0,1,2], labels_true will be [0,1,0,0,1,2]
+    # labels_pred is the flattened list of labels of all common trips, e.g.[1,1,11,12,13,22,23]
+    labels_true = []
+    for userinput_dict in bin_trips_user_input_ls:
+        if userinput_dict in no_dup_list:
+            labels_true.append(no_dup_list.index(userinput_dict))
+
+    labels_pred = labels_pred
+    homo_score = skm.homogeneity_score(labels_true, labels_pred)
+    homo_score = float('%.3f' % homo_score)
+    return homo_score
+
+
+# This function compute a score for every model.
+# It is used for tuning and finding the best model after two rounds of clustering
+# - homo_second: the homogeneity score after the second round of clustering
+# - percentage_second: the user labels request percentage
+def get_score(homo_second,percentage_second):
+    curr_score = 0.5 * homo_second + 0.5 * (1 - percentage_second)
+    curr_score = float('%.3f' % curr_score)
+    return curr_score
+
diff --git a/emission/analysis/modelling/tour_model/label_processing.py b/emission/analysis/modelling/tour_model/label_processing.py
new file mode 100644
index 000000000..2c6932ace
--- /dev/null
+++ b/emission/analysis/modelling/tour_model/label_processing.py
@@ -0,0 +1,133 @@
+import logging
+import scipy.cluster.hierarchy as sch
+
+# to map the user labels
+# - user_input_df: pass in original user input dataframe, return changed user input dataframe
+# - sp2en: change Spanish to English
+def map_labels_sp2en(user_input_df):
+    # Spanish words to English
+    span_eng_dict = {'revisado_bike': 'test ride with bike', 'placas_de carro': 'car plates', 'aseguranza': 'insurance',
+                     'iglesia': 'church', 'curso': 'course',
+                     'mi_hija recién aliviada': 'my daughter just had a new baby',
+                     'servicio_comunitario': 'community service', 'pago_de aseguranza': 'insurance payment',
+                     'grupo_comunitario': 'community group', 'caminata_comunitaria': 'community walk'}
+
+    # change language
+    user_input_df = user_input_df.replace(span_eng_dict)
+    return user_input_df
+
+
+# to map purposes and replaced mode in user inputs
+# - cvt_pur_mo: convert purposes and replaced mode
+def map_labels_purpose(user_input_df):
+    # Convert purpose
+    map_pur_dict = {'course': 'school', 'work_- lunch break': 'lunch_break', 'on_the way home': 'home',
+                    'insurance_payment': 'insurance'}
+
+    # convert purpose
+    user_input_df = user_input_df.replace(map_pur_dict)
+    return user_input_df
+
+
+def map_labels_mode(user_input_df):
+    # convert mode
+    for a in range(len(user_input_df)):
+        if user_input_df.iloc[a]["replaced_mode"] == "same_mode":
+            # to see which row will be converted
+            logging.debug("The following rows will be changed: %s", user_input_df.iloc[a])
+            user_input_df.iloc[a]["replaced_mode"] = user_input_df.iloc[a]['mode_confirm']
+    return user_input_df
+
+
+# this function will change Spanish to English, convert purposes, and convert modes
+def map_labels(user_input_df):
+    # Note that the spanish -> english conversion MUST currently happen before the other
+    # mode and purpose mappings
+    user_input_df = map_labels_sp2en(user_input_df)
+    user_input_df = map_labels_purpose(user_input_df)
+    user_input_df = map_labels_mode(user_input_df)
+    return user_input_df
+
+# use hierarchical clustering to get labels of the second round
+# - sch.linkage: perform hierarchical(agglomerative) clustering
+# In this function, we set a low bound and a higher bound(cutoff) of distance in the dendrogram
+# - last_d: the distance of the last cluster in the dendrogram
+# - low: the lower bound of distance
+# e.g., if low = 300, last_d = 250, we will assign 0s as labels for the points, irrespective of the first round labels.
+# and the list of second round labels will be like [0,0,0,0,0].
+# It means the points are already similar to each other after the first round of clustering, they don't need to
+# go through the second round.
+# - max_d: the cutoff of distance
+# - dist_pct: the percentage of the last distance in the dendrogram
+# - sch.fcluster: form clusters from the hierarchical clustering defined by the given linkage matrix
+# e.g., if last_d = 10000, dist_pct = 0.4, max_d = 400, clusters will be assigned at the distance of 400
+# - clusters: the labels from the second round clustering
+def get_second_labels(x,method,low,dist_pct):
+    z = sch.linkage(x, method=method, metric='euclidean')
+    last_d = z[-1][2]
+    clusters = []
+    if last_d < low:
+        for i in range(len(x)):
+            clusters.append(0)
+    else:
+        max_d = last_d * dist_pct
+        clusters = sch.fcluster(z, max_d, criterion='distance')
+    return clusters
+
+
+# this function includes hierarchical clustering and changing labels from the first round to get appropriate labels for
+# the second round of clustering
+# appropriate labels are label from the first round concatenate label from the second round
+# (e.g. label from first round is 1, label from second round is 2, the new label will be 12)
+# - second_round_idx_labels: a list to store the indices and labels from the first round.
+# - second_labels: labels from the second round of clustering
+def get_new_labels(second_labels,second_round_idx_labels,new_labels):
+    for i in range(len(second_labels)):
+        first_index = second_round_idx_labels[i][0]
+        new_label = second_round_idx_labels[i][1]
+        # concatenate labels from two rounds
+        new_label = int(str(new_label) + str(second_labels[i]))
+        for k in range(len(new_labels)):
+            if k == first_index:
+                new_labels[k] = new_label
+                break
+    return new_labels
+
+
+# group similar trips according to new_labels, store the original indices of the trips
+def group_similar_trips(new_labels,track):
+    bin_sim_trips_idx = []
+
+    # find the unique set of bins and store their indices into `bin_sim_trips`
+    label_set = set(new_labels)
+    # convert the set of unique labels into their indices
+    # concretely, if the input labels are ['a','a','a','b','b','b']
+    # the unique labels are ['a', 'b']
+    for sel_label in label_set:
+        # for the first iteration, bin = [0,1,2]
+        # for the second iteration, bin = [3,4,5]
+        bin = [index for (index, label) in enumerate(new_labels) if label == sel_label]
+        bin_sim_trips_idx.append(bin)
+    # At the end, bin_sim_trips_idx = [[0,1,2],[3,4,5]]
+
+    # using track to replace the current indices with original indices
+    for bin in bin_sim_trips_idx:
+        # in the first iteration, bin = [0,1,2]
+        # in the first iteration of that, we map the trip index of the
+        # common trip (e.g. 0) to the original index for that trip from the track (e.g. 42)
+        for i in range(len(bin)):
+            bin[i] = track[bin[i]][0]
+    # At this point, the bin_sim_trips_idx will have original indices for the trips
+    return bin_sim_trips_idx
+
+
+
+# replace the first round labels with new labels
+# - track: a list to store the indices and labels from the first round of clustering
+# for item in track, item[0] is the original index of the trip in filter_trips
+# item[1] is the label after the first round of clustering
+# we change the labels from the first round with new labels from the second round here
+def change_track_labels(track,new_labels):
+    for i in range(len(new_labels)):
+        track[i][1] = new_labels[i]
+    return track

From 47e0f617a9a5b7dde91866eb87162622a990d595 Mon Sep 17 00:00:00 2001
From: Chunrui Huang <corinnehcr@gmail.com>
Date: Fri, 16 Jul 2021 00:21:58 -0700
Subject: [PATCH 3/6] add kmeans at the 2nd round of clustering

---
 .../analysis/modelling/tour_model/label_processing.py  | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/emission/analysis/modelling/tour_model/label_processing.py b/emission/analysis/modelling/tour_model/label_processing.py
index 2c6932ace..644b8967c 100644
--- a/emission/analysis/modelling/tour_model/label_processing.py
+++ b/emission/analysis/modelling/tour_model/label_processing.py
@@ -1,5 +1,6 @@
 import logging
 import scipy.cluster.hierarchy as sch
+import sklearn.cluster as sc
 
 # to map the user labels
 # - user_input_df: pass in original user input dataframe, return changed user input dataframe
@@ -61,7 +62,7 @@ def map_labels(user_input_df):
 # - dist_pct: the percentage of the last distance in the dendrogram
 # - sch.fcluster: form clusters from the hierarchical clustering defined by the given linkage matrix
 # e.g., if last_d = 10000, dist_pct = 0.4, max_d = 400, clusters will be assigned at the distance of 400
-# - clusters: the labels from the second round clustering
+# by default, using scipy hierarchical clustering
 def get_second_labels(x,method,low,dist_pct):
     z = sch.linkage(x, method=method, metric='euclidean')
     last_d = z[-1][2]
@@ -74,6 +75,13 @@ def get_second_labels(x,method,low,dist_pct):
         clusters = sch.fcluster(z, max_d, criterion='distance')
     return clusters
 
+# using kmeans to build the model
+def kmeans_clusters(clusters,x):
+    n_clusters = len(set(clusters))
+    kmeans = sc.KMeans(n_clusters=n_clusters, random_state=0).fit(x)
+    k_clusters = kmeans.labels_
+    return k_clusters
+
 
 # this function includes hierarchical clustering and changing labels from the first round to get appropriate labels for
 # the second round of clustering

From 3ba040dd3d9b6de168d894aa068716b1a0b89ee5 Mon Sep 17 00:00:00 2001
From: Chunrui Huang <corinnehcr@gmail.com>
Date: Sun, 25 Jul 2021 18:34:44 -0700
Subject: [PATCH 4/6] tests and fake trips I have so far

---
 .../clusteringTests/TestDataPreprocessing.py  |  94 +-
 .../TestGetRequestPercentage.py               | 154 ++++
 .../clusteringTests/TestGetUsers.py           |  55 +-
 .../clusteringTests/TestLabelProcessing.py    |  98 ++
 .../clusteringTests/TestSimilarity.py         |  76 ++
 emission/tests/data/real_examples/fake_trips  | 836 ++++++++++++++++++
 6 files changed, 1249 insertions(+), 64 deletions(-)
 create mode 100644 emission/tests/analysisTests/clusteringTests/TestGetRequestPercentage.py
 create mode 100644 emission/tests/analysisTests/clusteringTests/TestLabelProcessing.py
 create mode 100644 emission/tests/analysisTests/clusteringTests/TestSimilarity.py
 create mode 100644 emission/tests/data/real_examples/fake_trips

diff --git a/emission/tests/analysisTests/clusteringTests/TestDataPreprocessing.py b/emission/tests/analysisTests/clusteringTests/TestDataPreprocessing.py
index 7f9a0e8c3..80a6b9af3 100644
--- a/emission/tests/analysisTests/clusteringTests/TestDataPreprocessing.py
+++ b/emission/tests/analysisTests/clusteringTests/TestDataPreprocessing.py
@@ -1,77 +1,71 @@
-import emission.core.wrapper.localdate as ecwl
 import emission.analysis.modelling.tour_model.data_preprocessing as preprocess
-
 from future import standard_library
 standard_library.install_aliases()
 from builtins import *
 import unittest
 import json
 import bson.json_util as bju
-import emission.storage.timeseries.abstract_timeseries as esta
-
 import emission.tests.common as etc
 
 
 class TestDataPreprocessing(unittest.TestCase):
-
-    # should setup user = [self.testUUID], radius = 100
-    # do we need teardown if we don't use databse?
-
+    def setUp(self):
+        self.readAndStoreTripsFromFile("emission/tests/data/real_examples/fake_trips")
+        self.user = self.testUUID
+        self.radius = 100
+
+    def tearDown(self):
+        self.clearDBEntries()
+
+    def readAndStoreTripsFromFile(self, dataFile):
+        import emission.core.get_database as edb
+        atsdb = edb.get_analysis_timeseries_db()
+        etc.createAndFillUUID(self)
+        with open(dataFile) as dect:
+            expected_confirmed_trips = json.load(dect, object_hook=bju.object_hook)
+            for t in expected_confirmed_trips:
+                t["user_id"] = self.testUUID
+                edb.save(atsdb, t)
+
+    def clearDBEntries(self):
+        import emission.core.get_database as edb
+        edb.get_timeseries_db().delete_many({"user_id": self.testUUID})
+        edb.get_analysis_timeseries_db().delete_many({"user_id": self.testUUID})
+        edb.get_pipeline_state_db().delete_many({"user_id": self.testUUID})
 
     def test_read_data(self):
-        dataFile = "emission/tests/data/real_examples/shankari_2016-06-20"
-        ld = ecwl.LocalDate({'year': 2016, 'month': 6, 'day': 20})
-        with open(dataFile+".ground_truth") as gfp:
-            ground_truth = json.load(gfp, object_hook=bju.object_hook)
-
-        etc.setupRealExample(self, dataFile)
-        # if (not preload):
-        self.entries = json.load(open(dataFile+".user_inputs"), object_hook = bju.object_hook)
-        etc.setupRealExampleWithEntries(self)
-        etc.runIntakePipeline(self.testUUID)
-        ts = esta.TimeSeries.get_time_series(self.testUUID)
-        confirmed_trips = list(ts.find_entries(["analysis/confirmed_trip"], None))
-        with open(dataFile+".expected_confirmed_trips") as dect:
-            expected_confirmed_trips = json.load(dect, object_hook = bju.object_hook)
-        print('confirmed_trips',confirmed_trips)
-        user = [self.testUUID]
-        trips = preprocess.read_data(user)
-        print('trips ', trips)
-        # I don't know how to assertEqual here
-
+        trips = preprocess.read_data(self.user)
+        self.assertEqual(len(trips), 10)
 
     def test_filter_data(self):
-        radius = 100
-        # - trips: should be read from a file or from database
-        user = [self.testUUID]
-        trips = preprocess.read_data(user)
-        filter_trips = preprocess.filter_data(trips,radius)
-        # assertEqual
+        trips = preprocess.read_data(self.user)
+        filter_trips = preprocess.filter_data(trips,self.radius)
+        self.assertEqual(len(filter_trips), 8)
 
     def test_extract_features(self):
-        user = [self.testUUID]
-        radius = 100
-        trips = preprocess.read_data(user)
-        filter_trips = preprocess.filter_data(trips,radius)
+        trips = preprocess.read_data(self.user)
+        filter_trips = preprocess.filter_data(trips,self.radius)
         X = preprocess.extract_features(filter_trips)
-        # assertEqual
+        self.assertEqual(len(X), 8)
+        self.assertEqual(X[0], [-122.0857861, 37.3898049, -122.0826931,
+                                37.3914184, 1047.1630675866315, 792.4609999656677])
 
     def test_split_data(self):
-        user = [self.testUUID]
-        radius = 100
-        trips = preprocess.read_data(user)
-        filter_trips = preprocess.filter_data(trips,radius)
+        trips = preprocess.read_data(self.user)
+        filter_trips = preprocess.filter_data(trips,self.radius)
         train_idx, test_idx = preprocess.split_data(filter_trips)
-        # assertEqual
+        self.assertEqual(len(train_idx),5)
+        self.assertEqual(len(test_idx), 5)
+        self.assertGreaterEqual(len(train_idx[0]),len(test_idx[0]),'the number of trips in train_idx should be greater '
+                                                                   'than the one in test_idx')
 
     def test_get_subdata(self):
-        user = [self.testUUID]
-        radius = 100
-        trips = preprocess.read_data(user)
-        filter_trips = preprocess.filter_data(trips,radius)
-        train_set_idx = [0,1,2,3,4]
+        trips = preprocess.read_data(self.user)
+        filter_trips = preprocess.filter_data(trips,self.radius)
+        train_set_idx = [[0,1,2,3,4],[0,1,2,4,5]]
         collect_sub_data = preprocess.get_subdata(filter_trips, train_set_idx)
-        # assertEqual
+        compare_idx = filter_trips.index(collect_sub_data[0][4])
+        self.assertEqual(compare_idx, 4)
 
 
 if __name__ == '__main__':
diff --git a/emission/tests/analysisTests/clusteringTests/TestGetRequestPercentage.py b/emission/tests/analysisTests/clusteringTests/TestGetRequestPercentage.py
new file mode 100644
index 000000000..1c0e6fe9b
--- /dev/null
+++ b/emission/tests/analysisTests/clusteringTests/TestGetRequestPercentage.py
@@ -0,0 +1,154 @@
+from future import standard_library
+standard_library.install_aliases()
+import unittest
+import emission.analysis.modelling.tour_model.similarity as similarity
+import emission.analysis.modelling.tour_model.data_preprocessing as preprocess
+
+import emission.analysis.modelling.tour_model.get_request_percentage as eamtg
+import pandas as pd
+import emission.tests.common as etc
+import sklearn.cluster as sc
+import numpy as np
+import json
+import bson.json_util as bju
+
+
+class TestGetRequestPercentage(unittest.TestCase):
+    def setUp(self):
+        self.readAndStoreTripsFromFile("emission/tests/data/real_examples/fake_trips")
+        self.user = self.testUUID
+        self.radius = 100
+
+    def tearDown(self):
+        self.clearDBEntries()
+
+    def readAndStoreTripsFromFile(self, dataFile):
+        import emission.core.get_database as edb
+        atsdb = edb.get_analysis_timeseries_db()
+        etc.createAndFillUUID(self)
+        with open(dataFile) as dect:
+            expected_confirmed_trips = json.load(dect, object_hook=bju.object_hook)
+            for t in expected_confirmed_trips:
+                t["user_id"] = self.testUUID
+                edb.save(atsdb, t)
+
+    def clearDBEntries(self):
+        import emission.core.get_database as edb
+        edb.get_timeseries_db().delete_many({"user_id": self.testUUID})
+        edb.get_analysis_timeseries_db().delete_many({"user_id": self.testUUID})
+        edb.get_pipeline_state_db().delete_many({"user_id": self.testUUID})
+
+    def test_match_day(self):  # smoke test only: builds the similarity object, no assertion yet
+        trips = preprocess.read_data(self.user)
+        filter_trips = preprocess.filter_data(trips,self.radius)
+        sim = similarity.similarity(filter_trips, self.radius)  # radius comes from the fixture set in setUp
+
+    # def test_match_day(self):
+    #     # case 1: bin contains indices & trip matches selected trip in filter_trips
+    #     bin = [0,1,2]
+    #     trip = {'data':{'start_local_dt':{'year':2020,'month':8,'day':14}}}
+    #     filter_trips = [{'data':{'start_local_dt':{'year':2020,'month':8,'day':14}}}]
+    #     self.assertEqual(eamtg.match_day(trip, bin, filter_trips), True)
+    #     # case 2: bin = True & trip doesn't match selected trip in filter_trips
+    #     filter_trips = [{'data': {'start_local_dt': {'year': 2020, 'month': 8, 'day': 18}}}]
+    #     self.assertEqual(eamtg.match_day(trip, bin, filter_trips), False)
+    #     #case 3: bin is none & trip matches selected trip in filter_trips
+    #     bin = None
+    #     filter_trips = [{'data': {'start_local_dt': {'year': 2020, 'month': 8, 'day': 14}}}]
+    #     self.assertEqual(eamtg.match_day(trip, bin, filter_trips), False)
+    #
+    #
+    # def test_match_month(self):
+    #     # case 1: bin contains indices & trip matches selected trip in filter_trips
+    #     bin = [0,1,2]
+    #     trip = {'data':{'start_local_dt':{'year':2020,'month':8,'day':14}}}
+    #     filter_trips = [{'data':{'start_local_dt':{'year':2020,'month':8,'day':14}}}]
+    #     self.assertEqual(eamtg.match_month(trip, bin, filter_trips), True)
+    #     # case 2: bin = True & trip doesn't match selected trip in filter_trips
+    #     filter_trips = [{'data': {'start_local_dt': {'year': 2020, 'month': 7, 'day': 18}}}]
+    #     self.assertEqual(eamtg.match_month(trip, bin, filter_trips), False)
+    #     #case 3: bin is none & trip matches selected trip in filter_trips
+    #     bin = None
+    #     filter_trips = [{'data': {'start_local_dt': {'year': 2020, 'month': 8, 'day': 14}}}]
+    #     self.assertEqual(eamtg.match_month(trip, bin, filter_trips), False)
+    #
+    #
+    # def test_bin_date(self):
+    #     # case 1: bin day
+    #     trip_ls = [0,1,2]
+    #     filter_trips1 = [{'data': {'start_local_dt': {'year': 2020, 'month': 8, 'day': 14}}},
+    #                     {'data': {'start_local_dt': {'year': 2020, 'month': 8, 'day': 14}}},
+    #                     {'data': {'start_local_dt': {'year': 2020, 'month': 7, 'day': 18}}}]
+    #     self.assertEqual(eamtg.bin_date(trip_ls, filter_trips1, day=True), [[0,1],[2]])
+    #     # case 2: bin month
+    #     filter_trips2 = [{'data': {'start_local_dt': {'year': 2020, 'month': 8, 'day': 15}}},
+    #                     {'data': {'start_local_dt': {'year': 2020, 'month': 8, 'day': 14}}},
+    #                     {'data': {'start_local_dt': {'year': 2020, 'month': 7, 'day': 18}}}]
+    #     self.assertEqual(eamtg.bin_date(trip_ls, filter_trips2, month=True), [[0,1],[2]])
+    #
+    #
+    # def test_find_first_trip(self):
+    #     import time
+    #     time1 = "Thu Jan 28 22:24:24 2020"
+    #     time2 = "Sat Jan 30 23:24:24 2020"
+    #     time3 = "Sun Jan 31 20:24:24 2020"
+    #     bin = [0,1,2]
+    #     ts1 = time.mktime(time.strptime(time1, "%a %b %d %H:%M:%S %Y"))
+    #     ts2 = time.mktime(time.strptime(time2, "%a %b %d %H:%M:%S %Y"))
+    #     ts3 = time.mktime(time.strptime(time3, "%a %b %d %H:%M:%S %Y"))
+    #     filter_trips = [{'data': {'start_ts': ts1}},
+    #                     {'data': {'start_ts': ts2}},
+    #                     {'data': {'start_ts': ts3}}]
+    #
+    #     self.assertEqual(eamtg.find_first_trip(filter_trips, bin),0)
+    #
+    #
+    # def test_requested_trips_ab_cutoff(self):
+    #     import time
+    #     time1 = "Thu Jan 28 22:24:24 2020"
+    #     time2 = "Sat Jan 30 23:24:24 2020"
+    #     time3 = "Sun Jan 31 20:24:24 2020"
+    #     new_bins = [[0,1],[2]]
+    #     ts1 = time.mktime(time.strptime(time1, "%a %b %d %H:%M:%S %Y"))
+    #     ts2 = time.mktime(time.strptime(time2, "%a %b %d %H:%M:%S %Y"))
+    #     ts3 = time.mktime(time.strptime(time3, "%a %b %d %H:%M:%S %Y"))
+    #     filter_trips = [{'data': {'start_ts': ts1}},
+    #                     {'data': {'start_ts': ts2}},
+    #                     {'data': {'start_ts': ts3}}]
+    #     self.assertEqual(eamtg.requested_trips_ab_cutoff(new_bins, filter_trips),([0, 2], [1]))
+    #
+    #
+    # def test_requested_trips_bl_cutoff(self):
+    #
+    #     # requested_trips_bl_cutoff(sim)
+    #     fake_trip_collect = []
+    #     trip1 = pd.DataFrame(data=([[-122.41925243091958,-122.42140476014033],[37.77938521735944,37.78194309045273]]),
+    #                       columns=[['start_loc','end_loc'],
+    #                                ['coordinates','coordinates']])
+    #     fake_trip_collect.append(trip1)
+    #     trip2 = pd.DataFrame(data=([[-122.41925243091958, -122.42093683661327], [37.77938521735944, 37.782278693221016]]),
+    #                        columns=[['start_loc', 'end_loc'],
+    #                                 ['coordinates', 'coordinates']])
+    #     fake_trip_collect.append(trip2)
+    #     trip3 = pd.DataFrame(data=([[-123.41925243091958,-122.41912876839925],[37.77938521735944,37.77766191670088]]),
+    #                       columns=[['start_loc','end_loc'],
+    #                                ['coordinates','coordinates']])
+    #     fake_trip_collect.append(trip3)
+    #     sim = similarity.similarity(fake_trip_collect,100)
+    #     print(sim.below_cutoff)
+    #     # print(bl_trip_ls)
+    #
+    #
+    #
+    #     # df = pd.DataFrame(columns=[['start_loc','end_loc'],['coordinates','coordinates']])
+    #
+    #     # print(df)
+    #     # df1 = pd.DataFrame(np.random.randint(0, 150, size=(4, 6)),
+    #     #                    columns=[['python', 'python', 'math', 'math', 'En', 'En'],
+    #     #                             ['期中', '期末', '期中', '期末', '期中', '期末']])
+    #     # print(df1.python['期中'])
+
+if __name__ == '__main__':
+    etc.configLogging()
+    unittest.main()
+
diff --git a/emission/tests/analysisTests/clusteringTests/TestGetUsers.py b/emission/tests/analysisTests/clusteringTests/TestGetUsers.py
index 6283f8b07..7128fe6bd 100644
--- a/emission/tests/analysisTests/clusteringTests/TestGetUsers.py
+++ b/emission/tests/analysisTests/clusteringTests/TestGetUsers.py
@@ -3,29 +3,56 @@
 import unittest
 import emission.analysis.modelling.tour_model.get_users as gu
 import emission.analysis.modelling.tour_model.data_preprocessing as preprocess
-
 import emission.tests.common as etc
+import json
+import bson.json_util as bju
+import copy
+
 
 class TestGetUsers(unittest.TestCase):
-    # def setUp(self):
-    #
-    # def tearDown(self):
+    def setUp(self):
+        self.readAndStoreTripsFromFile("emission/tests/data/real_examples/fake_trips")
+        self.user = self.testUUID
+        self.radius = 100
+
+    def tearDown(self):
+        self.clearDBEntries()
+
+    def readAndStoreTripsFromFile(self, dataFile):
+        import emission.core.get_database as edb
+        atsdb = edb.get_analysis_timeseries_db()
+        etc.createAndFillUUID(self)
+        with open(dataFile) as dect:
+            expected_confirmed_trips = json.load(dect, object_hook=bju.object_hook)
+            for t in expected_confirmed_trips:
+                t["user_id"] = self.testUUID
+                edb.save(atsdb, t)
+
+    def clearDBEntries(self):
+        import emission.core.get_database as edb
+        edb.get_timeseries_db().delete_many({"user_id": self.testUUID})
+        edb.get_analysis_timeseries_db().delete_many({"user_id": self.testUUID})
+        edb.get_pipeline_state_db().delete_many({"user_id": self.testUUID})
 
 
     def test_valid_user(self):
-        user = [self.testUUID]
-        radius = 100
-        trips = preprocess.read_data(user)
-        filter_trips = preprocess.filter_data(trips,radius)
+        trips = preprocess.read_data(self.user)
+        filter_trips = preprocess.filter_data(trips,self.radius)
+        # the user has 8 labeled trips, >50% of trips are labeled
         valid = gu.valid_user(filter_trips, trips)
-        # assertEqual
-
+        self.assertEqual(valid,False)
+        for i in range(2):
+            filter_trips.append(copy.copy(filter_trips[0]))
+        # now the user has 10 labeled trips, >50% of trips are labeled
+        valid = gu.valid_user(filter_trips, trips)
+        self.assertEqual(valid,True)
 
     def test_get_user_ls(self):
-        all_users = [self.testUUID]
-        radius = 100
-        user_ls,valid_user_ls = gu.get_user_ls(all_users, radius)
-        # assertEqual
+        # only 1 invalid user
+        user_ls,valid_user_ls = gu.get_user_ls([self.user], self.radius)
+        self.assertEqual(len(user_ls),1)
+        self.assertEqual(len(valid_user_ls),0)
+
 
 if __name__ == '__main__':
     etc.configLogging()
diff --git a/emission/tests/analysisTests/clusteringTests/TestLabelProcessing.py b/emission/tests/analysisTests/clusteringTests/TestLabelProcessing.py
new file mode 100644
index 000000000..cb1d9e25c
--- /dev/null
+++ b/emission/tests/analysisTests/clusteringTests/TestLabelProcessing.py
@@ -0,0 +1,98 @@
+from future import standard_library
+standard_library.install_aliases()
+import unittest
+import emission.analysis.modelling.tour_model.label_processing as eamtl
+import pandas as pd
+import emission.tests.common as etc
+import sklearn.cluster as sc
+import numpy as np
+
+
+class TestLabelProcessing(unittest.TestCase):
+
+    def test_map_labels_sp2en(self):
+        mode = ['placas_de carro','aseguranza','iglesia']
+        user_input_df = pd.DataFrame(data={'mode':mode})
+        user_input_df = eamtl.map_labels_sp2en(user_input_df)
+        compare_mode = ['car plates','insurance','church']
+        compare_df = pd.DataFrame(data={'mode':compare_mode})
+        pd.testing.assert_frame_equal(user_input_df,compare_df)
+
+    def test_map_labels_purpose(self):
+        purpose = ['course','work_- lunch break','on_the way home','insurance_payment']
+        user_input_df = pd.DataFrame(data={'purpose': purpose})
+        compare_purpose = ['school','lunch_break','home','insurance']
+        compare_df = pd.DataFrame(data={'purpose': compare_purpose})
+        user_input_df = eamtl.map_labels_purpose(user_input_df)
+        pd.testing.assert_frame_equal(user_input_df,compare_df)
+
+
+    def test_map_labels_mode(self):
+        # a 'same_mode' entry in replaced_mode is expected to become that row's mode_confirm value
+        mode_confirm = ['bike','ebike']
+        replaced_mode = ['same_mode','walk']
+        user_input_df = pd.DataFrame({'mode_confirm':mode_confirm,'replaced_mode':replaced_mode})
+        user_input_df = eamtl.map_labels_mode(user_input_df)
+        compare_replaced_mode = ['bike','walk']
+        compare_dict = {'mode_confirm':mode_confirm,'replaced_mode':compare_replaced_mode}
+        compare_df = pd.DataFrame(compare_dict)
+        pd.testing.assert_frame_equal(user_input_df,compare_df)
+
+    def test_map_labels(self):
+        mode_confirm = ['bike']
+        purpose_confirm = ['iglesia']
+        replaced_mode = ['same_mode']
+        user_input = {'mode_confirm':mode_confirm,'purpose_confirm':purpose_confirm,'replaced_mode':replaced_mode}
+        user_input_df = pd.DataFrame(user_input)
+        user_input_df = eamtl.map_labels(user_input_df)
+        compare_purpose_confirm = ['church']
+        compare_replaced_mode = ['bike']
+        compare_dict = {'mode_confirm':mode_confirm,'purpose_confirm':compare_purpose_confirm,'replaced_mode':compare_replaced_mode}
+        compare_df = pd.DataFrame(compare_dict)
+        pd.testing.assert_frame_equal(user_input_df,compare_df)
+
+    def test_get_second_labels(self):
+        x1 = [[1,2,3,4],[2,2,3,4],[3,3,3,3],[1,2,3,4]]
+        x2 = [[1,1,1,1],[18,33,57,20],[30,34,67,3],[40,20,3,4]]
+        method = 'single'
+        low = 50
+        dist_pct = 0.6
+        # if features are close
+        labels1 = eamtl.get_second_labels(x1, method, low, dist_pct)
+        labels2 = eamtl.get_second_labels(x2, method, low, dist_pct)
+        self.assertEqual(labels1, [0, 0, 0, 0])
+        self.assertEqual(labels2.tolist(), [2,1,1,3])
+
+
+    def test_kmeans_clusters(self):
+        # k-means label ids are arbitrary, so compare the grouping rather than the literal ids
+        clusters = [1, 1, 1, 0, 0, 0]
+        x = np.array([[1, 2], [1, 4], [1, 0],[10, 2], [10, 4], [10, 0]])
+        k_labels = eamtl.kmeans_clusters(clusters, x).tolist()
+        self.assertEqual([label == k_labels[0] for label in k_labels], [True] * 3 + [False] * 3)
+
+
+    def test_get_new_labels(self):
+        second_labels = [2,1,1,3]
+        second_round_idx_labels =[[0,1],[1,1],[2,1],[3,2]]
+        new_labels = [1,1,1,2,3,3,3,3]
+        new_labels = eamtl.get_new_labels(second_labels, second_round_idx_labels, new_labels)
+        self.assertEqual(new_labels, [12, 11, 11, 23, 3, 3, 3, 3])
+
+    def test_group_similar_trips(self):
+        new_labels = [12, 11, 11, 23, 31, 31, 32, 32]
+        track = [[11,12],[15,11],[20,11],[50,23],[57,31],[59,31],[67,32],[69,32]]
+        new_bins = eamtl.group_similar_trips(new_labels,track)
+        self.assertEqual(new_bins, [[67, 69], [15, 20], [11], [50], [57, 59]])
+
+    def test_change_track_labels(self):
+        track = [[11,1],[15,1],[20,1],[50,2],[57,3],[59,3],[67,3],[69,3]]
+        new_labels = [12, 11, 11, 23, 31, 31, 32, 32]
+        track = eamtl.change_track_labels(track,new_labels)
+        self.assertEqual(track, [[11, 12], [15, 11], [20, 11], [50, 23], [57, 31], [59, 31], [67, 32], [69, 32]])
+
+if __name__ == '__main__':
+    etc.configLogging()
+    unittest.main()
+
+
diff --git a/emission/tests/analysisTests/clusteringTests/TestSimilarity.py b/emission/tests/analysisTests/clusteringTests/TestSimilarity.py
new file mode 100644
index 000000000..12ea565b0
--- /dev/null
+++ b/emission/tests/analysisTests/clusteringTests/TestSimilarity.py
@@ -0,0 +1,76 @@
+import emission.analysis.modelling.tour_model.data_preprocessing as preprocess
+import emission.analysis.modelling.tour_model.similarity as similarity
+from future import standard_library
+standard_library.install_aliases()
+from builtins import *
+import unittest
+import json
+import bson.json_util as bju
+import emission.tests.common as etc
+
+# This test file tests the similarity module: within_radius, filter_too_short, and the bin_data/delete_bins methods
+class TestDataPreprocessing(unittest.TestCase):
+    def setUp(self):
+        self.readAndStoreTripsFromFile("emission/tests/data/real_examples/fake_trips")
+        self.user = self.testUUID
+        self.radius = 100
+
+    def tearDown(self):
+        self.clearDBEntries()
+
+    def readAndStoreTripsFromFile(self, dataFile):
+        import emission.core.get_database as edb
+        atsdb = edb.get_analysis_timeseries_db()
+        etc.createAndFillUUID(self)
+        with open(dataFile) as dect:
+            expected_confirmed_trips = json.load(dect, object_hook=bju.object_hook)
+            for t in expected_confirmed_trips:
+                t["user_id"] = self.testUUID
+                edb.save(atsdb, t)
+
+    def clearDBEntries(self):
+        import emission.core.get_database as edb
+        edb.get_timeseries_db().delete_many({"user_id": self.testUUID})
+        edb.get_analysis_timeseries_db().delete_many({"user_id": self.testUUID})
+        edb.get_pipeline_state_db().delete_many({"user_id": self.testUUID})
+
+    def test_within_radius(self):
+        # case 1: start and end location are within radius
+        in_range = similarity.within_radius(-122.40998884982407, 37.809339507025655,
+                                            -122.4101610462097, 37.80925081700211, self.radius)
+        self.assertEqual(in_range,True)
+        # case 2: start and end location are not within radius
+        in_range = similarity.within_radius(-122.40998884982407, 37.809339507025655,
+                                            -122.41296471977945, 37.8079948386731, self.radius)
+        self.assertEqual(in_range,False)
+
+
+    def test_filter_too_short(self):
+        all_trips = preprocess.read_data(self.user)
+        valid_trips = similarity.filter_too_short(all_trips, self.radius)
+        self.assertEqual(len(valid_trips),10)
+
+    def test_bin_data(self):
+        # expected for the fake_trips fixture: one bin holding five trips, then five single-trip bins
+        trips = preprocess.read_data(self.user)
+        sim = similarity.similarity(trips, self.radius)
+        sim.bin_data()
+        self.assertEqual(sim.bins,[[4, 5, 6, 7, 8], [0], [1], [2], [3], [9]])
+
+    def test_delete_bins(self):
+        # expected for the fake_trips fixture: only the five-trip bin is left after delete_bins
+        trips = preprocess.read_data(self.user)
+        sim = similarity.similarity(trips, self.radius)
+        sim.bin_data()
+        sim.delete_bins()
+        bins = sim.bins
+        bin_trips = sim.newdata
+        self.assertEqual(bins,[[4, 5, 6, 7, 8]])
+        self.assertEqual(len(bin_trips),5)
+
+
+if __name__ == '__main__':
+    etc.configLogging()
+    unittest.main()
+
+
diff --git a/emission/tests/data/real_examples/fake_trips b/emission/tests/data/real_examples/fake_trips
new file mode 100644
index 000000000..1a26d2074
--- /dev/null
+++ b/emission/tests/data/real_examples/fake_trips
@@ -0,0 +1,836 @@
+[
+    {
+        "_id": {
+            "$oid": "5fd8e69ac61669a9ebad0241"
+        },
+        "user_id": {
+            "$uuid": "aa9fdec92944446c8ee250d79b3044d3"
+        },
+        "metadata": {
+            "key": "analysis/confirmed_trip",
+            "platform": "server",
+            "write_ts": 1608050275.276295,
+            "time_zone": "America/Los_Angeles",
+            "write_local_dt": {
+                "year": 2020,
+                "month": 12,
+                "day": 15,
+                "hour": 8,
+                "minute": 37,
+                "second": 55,
+                "weekday": 1,
+                "timezone": "America/Los_Angeles"
+            },
+            "write_fmt_time": "2020-12-15T08:37:55.276295-08:00"
+        },
+        "data": {
+            "source": "DwellSegmentationTimeFilter",
+            "end_ts": 1466437275.856,
+            "end_local_dt": {
+                "year": 2016,
+                "month": 6,
+                "day": 20,
+                "hour": 8,
+                "minute": 41,
+                "second": 15,
+                "weekday": 0,
+                "timezone": "America/Los_Angeles"
+            },
+            "end_fmt_time": "2016-06-20T08:41:15.856000-07:00",
+            "end_loc": {
+                "type": "Point",
+                "coordinates": [
+                    -122.0826931,
+                    37.3914184
+                ]
+            },
+            "raw_trip": {
+                "$oid": "5fd8e662baff4ef23d349789"
+            },
+            "start_ts": 1466436483.395,
+            "start_local_dt": {
+                "year": 2016,
+                "month": 6,
+                "day": 20,
+                "hour": 8,
+                "minute": 28,
+                "second": 3,
+                "weekday": 0,
+                "timezone": "America/Los_Angeles"
+            },
+            "start_fmt_time": "2016-06-20T08:28:03.395000-07:00",
+            "start_loc": {
+                "type": "Point",
+                "coordinates": [
+                    -122.0857861,
+                    37.3898049
+                ]
+            },
+            "duration": 792.4609999656677,
+            "distance": 1047.1630675866315,
+            "start_place": {
+                "$oid": "5fd8e664baff4ef23d349860"
+            },
+            "end_place": {
+                "$oid": "5fd8e664baff4ef23d349861"
+            },
+            "cleaned_trip": {
+                "$oid": "5fd8e663baff4ef23d3497af"
+            },
+            "user_input": {
+                "mode_confirm": "walk",
+                "purpose_confirm": "library"
+            }
+        }
+    },
+    {
+        "_id": {
+            "$oid": "5fd8e69ac61669a9ebad0242"
+        },
+        "user_id": {
+            "$uuid": "aa9fdec92944446c8ee250d79b3044d3"
+        },
+        "metadata": {
+            "key": "analysis/confirmed_trip",
+            "platform": "server",
+            "write_ts": 1608050275.488737,
+            "time_zone": "America/Los_Angeles",
+            "write_local_dt": {
+                "year": 2020,
+                "month": 12,
+                "day": 15,
+                "hour": 8,
+                "minute": 37,
+                "second": 55,
+                "weekday": 1,
+                "timezone": "America/Los_Angeles"
+            },
+            "write_fmt_time": "2020-12-15T08:37:55.488737-08:00"
+        },
+        "data": {
+            "source": "DwellSegmentationTimeFilter",
+            "end_ts": 1466438022.959,
+            "end_local_dt": {
+                "year": 2016,
+                "month": 6,
+                "day": 20,
+                "hour": 8,
+                "minute": 53,
+                "second": 42,
+                "weekday": 0,
+                "timezone": "America/Los_Angeles"
+            },
+            "end_fmt_time": "2016-06-20T08:53:42.959000-07:00",
+            "end_loc": {
+                "type": "Point",
+                "coordinates": [
+                    -122.0866181,
+                    37.3910231
+                ]
+            },
+            "raw_trip": {
+                "$oid": "5fd8e662baff4ef23d34978b"
+            },
+            "start_ts": 1466437438.6453953,
+            "start_local_dt": {
+                "year": 2016,
+                "month": 6,
+                "day": 20,
+                "hour": 8,
+                "minute": 43,
+                "second": 58,
+                "weekday": 0,
+                "timezone": "America/Los_Angeles"
+            },
+            "start_fmt_time": "2016-06-20T08:43:58.645395-07:00",
+            "start_loc": {
+                "type": "Point",
+                "coordinates": [
+                    -122.0826931,
+                    37.3914184
+                ]
+            },
+            "duration": 584.3136048316956,
+            "distance": 886.4937093667857,
+            "start_place": {
+                "$oid": "5fd8e664baff4ef23d349861"
+            },
+            "end_place": {
+                "$oid": "5fd8e664baff4ef23d349862"
+            },
+            "cleaned_trip": {
+                "$oid": "5fd8e663baff4ef23d3497ce"
+            },
+            "user_input": {}
+        }
+    },
+    {
+        "_id": {
+            "$oid": "5fd8e69ac61669a9ebad0243"
+        },
+        "user_id": {
+            "$uuid": "aa9fdec92944446c8ee250d79b3044d3"
+        },
+        "metadata": {
+            "key": "analysis/confirmed_trip",
+            "platform": "server",
+            "write_ts": 1608050275.7204192,
+            "time_zone": "America/Los_Angeles",
+            "write_local_dt": {
+                "year": 2020,
+                "month": 12,
+                "day": 15,
+                "hour": 8,
+                "minute": 37,
+                "second": 55,
+                "weekday": 1,
+                "timezone": "America/Los_Angeles"
+            },
+            "write_fmt_time": "2020-12-15T08:37:55.720419-08:00"
+        },
+        "data": {
+            "source": "DwellSegmentationTimeFilter",
+            "end_ts": 1466461966.379,
+            "end_local_dt": {
+                "year": 2016,
+                "month": 6,
+                "day": 20,
+                "hour": 15,
+                "minute": 32,
+                "second": 46,
+                "weekday": 0,
+                "timezone": "America/Los_Angeles"
+            },
+            "end_fmt_time": "2016-06-20T15:32:46.379000-07:00",
+            "end_loc": {
+                "type": "Point",
+                "coordinates": [
+                    -122.0830016,
+                    37.3901637
+                ]
+            },
+            "raw_trip": {
+                "$oid": "5fd8e662baff4ef23d34978d"
+            },
+            "start_ts": 1466461623.1195338,
+            "start_local_dt": {
+                "year": 2016,
+                "month": 6,
+                "day": 20,
+                "hour": 15,
+                "minute": 27,
+                "second": 3,
+                "weekday": 0,
+                "timezone": "America/Los_Angeles"
+            },
+            "start_fmt_time": "2016-06-20T15:27:03.119534-07:00",
+            "start_loc": {
+                "type": "Point",
+                "coordinates": [
+                    -122.0866181,
+                    37.3910231
+                ]
+            },
+            "duration": 343.25946617126465,
+            "distance": 610.2234223038181,
+            "start_place": {
+                "$oid": "5fd8e664baff4ef23d349862"
+            },
+            "end_place": {
+                "$oid": "5fd8e664baff4ef23d349863"
+            },
+            "cleaned_trip": {
+                "$oid": "5fd8e663baff4ef23d3497e6"
+            },
+            "user_input": {}
+        }
+    },
+    {
+        "_id": {
+            "$oid": "5fd8e69ac61669a9ebad0244"
+        },
+        "user_id": {
+            "$uuid": "aa9fdec92944446c8ee250d79b3044d3"
+        },
+        "metadata": {
+            "key": "analysis/confirmed_trip",
+            "platform": "server",
+            "write_ts": 1608050275.942955,
+            "time_zone": "America/Los_Angeles",
+            "write_local_dt": {
+                "year": 2020,
+                "month": 12,
+                "day": 15,
+                "hour": 8,
+                "minute": 37,
+                "second": 55,
+                "weekday": 1,
+                "timezone": "America/Los_Angeles"
+            },
+            "write_fmt_time": "2020-12-15T08:37:55.942955-08:00"
+        },
+        "data": {
+            "source": "DwellSegmentationTimeFilter",
+            "end_ts": 1466462452.708,
+            "end_local_dt": {
+                "year": 2016,
+                "month": 6,
+                "day": 20,
+                "hour": 15,
+                "minute": 40,
+                "second": 52,
+                "weekday": 0,
+                "timezone": "America/Los_Angeles"
+            },
+            "end_fmt_time": "2016-06-20T15:40:52.708000-07:00",
+            "end_loc": {
+                "type": "Point",
+                "coordinates": [
+                    -122.086605,
+                    37.3910011
+                ]
+            },
+            "raw_trip": {
+                "$oid": "5fd8e662baff4ef23d34978f"
+            },
+            "start_ts": 1466462052.158904,
+            "start_local_dt": {
+                "year": 2016,
+                "month": 6,
+                "day": 20,
+                "hour": 15,
+                "minute": 34,
+                "second": 12,
+                "weekday": 0,
+                "timezone": "America/Los_Angeles"
+            },
+            "start_fmt_time": "2016-06-20T15:34:12.158904-07:00",
+            "start_loc": {
+                "type": "Point",
+                "coordinates": [
+                    -122.0830016,
+                    37.3901637
+                ]
+            },
+            "duration": 400.54909586906433,
+            "distance": 405.97685486691756,
+            "start_place": {
+                "$oid": "5fd8e664baff4ef23d349863"
+            },
+            "end_place": {
+                "$oid": "5fd8e664baff4ef23d349864"
+            },
+            "cleaned_trip": {
+                "$oid": "5fd8e663baff4ef23d3497f6"
+            },
+            "user_input": {
+                "mode_confirm": "walk",
+                "purpose_confirm": "home"
+            }
+        }
+    },
+    {
+        "_id": {
+            "$oid": "5fd8e69ac61669a9ebad0245"
+        },
+        "user_id": {
+            "$uuid": "aa9fdec92944446c8ee250d79b3044d3"
+        },
+        "metadata": {
+            "key": "analysis/confirmed_trip",
+            "platform": "server",
+            "write_ts": 1608050276.1554408,
+            "time_zone": "America/Los_Angeles",
+            "write_local_dt": {
+                "year": 2020,
+                "month": 12,
+                "day": 15,
+                "hour": 8,
+                "minute": 37,
+                "second": 56,
+                "weekday": 1,
+                "timezone": "America/Los_Angeles"
+            },
+            "write_fmt_time": "2020-12-15T08:37:56.155441-08:00"
+        },
+        "data": {
+            "source": "DwellSegmentationTimeFilter",
+            "end_ts": 1466463835.713,
+            "end_local_dt": {
+                "year": 2016,
+                "month": 6,
+                "day": 20,
+                "hour": 16,
+                "minute": 3,
+                "second": 55,
+                "weekday": 0,
+                "timezone": "America/Los_Angeles"
+            },
+            "end_fmt_time": "2016-06-20T16:03:55.713000-07:00",
+            "end_loc": {
+                "type": "Point",
+                "coordinates": [
+                    -122.1081974,
+                    37.4168828
+                ]
+            },
+            "raw_trip": {
+                "$oid": "5fd8e662baff4ef23d349791"
+            },
+            "start_ts": 1466462970.2807262,
+            "start_local_dt": {
+                "year": 2016,
+                "month": 6,
+                "day": 20,
+                "hour": 15,
+                "minute": 49,
+                "second": 30,
+                "weekday": 0,
+                "timezone": "America/Los_Angeles"
+            },
+            "start_fmt_time": "2016-06-20T15:49:30.280726-07:00",
+            "start_loc": {
+                "type": "Point",
+                "coordinates": [
+                    -122.086605,
+                    37.3910011
+                ]
+            },
+            "duration": 865.4322738647461,
+            "distance": 4521.417177464177,
+            "start_place": {
+                "$oid": "5fd8e664baff4ef23d349864"
+            },
+            "end_place": {
+                "$oid": "5fd8e664baff4ef23d349865"
+            },
+            "cleaned_trip": {
+                "$oid": "5fd8e664baff4ef23d349808"
+            },
+            "user_input": {
+                "mode_confirm": "shared_ride",
+                "purpose_confirm": "karate"
+            }
+        }
+    },
+    {
+        "_id": {
+            "$oid": "5fd8e69ac61669a9ebad0246"
+        },
+        "user_id": {
+            "$uuid": "aa9fdec92944446c8ee250d79b3044d3"
+        },
+        "metadata": {
+            "key": "analysis/confirmed_trip",
+            "platform": "server",
+            "write_ts": 1608050276.546149,
+            "time_zone": "America/Los_Angeles",
+            "write_local_dt": {
+                "year": 2020,
+                "month": 12,
+                "day": 15,
+                "hour": 8,
+                "minute": 37,
+                "second": 56,
+                "weekday": 1,
+                "timezone": "America/Los_Angeles"
+            },
+            "write_fmt_time": "2020-12-15T08:37:56.546149-08:00"
+        },
+        "data": {
+            "source": "DwellSegmentationTimeFilter",
+            "end_ts": 1466467959.767,
+            "end_local_dt": {
+                "year": 2016,
+                "month": 6,
+                "day": 20,
+                "hour": 17,
+                "minute": 12,
+                "second": 39,
+                "weekday": 0,
+                "timezone": "America/Los_Angeles"
+            },
+            "end_fmt_time": "2016-06-20T17:12:39.767000-07:00",
+            "end_loc": {
+                "type": "Point",
+                "coordinates": [
+                    -122.0864051,
+                    37.3907649
+                ]
+            },
+            "raw_trip": {
+                "$oid": "5fd8e662baff4ef23d349795"
+            },
+            "start_ts": 1466466584.0461695,
+            "start_local_dt": {
+                "year": 2016,
+                "month": 6,
+                "day": 20,
+                "hour": 16,
+                "minute": 49,
+                "second": 44,
+                "weekday": 0,
+                "timezone": "America/Los_Angeles"
+            },
+            "start_fmt_time": "2016-06-20T16:49:44.046170-07:00",
+            "start_loc": {
+                "type": "Point",
+                "coordinates": [
+                    -122.1081974,
+                    37.4168828
+                ]
+            },
+            "duration": 1375.7208304405212,
+            "distance": 5136.824369981995,
+            "start_place": {
+                "$oid": "5fd8e664baff4ef23d349865"
+            },
+            "end_place": {
+                "$oid": "5fd8e664baff4ef23d349867"
+            },
+            "cleaned_trip": {
+                "$oid": "5fd8e664baff4ef23d34982c"
+            },
+            "user_input": {
+                "mode_confirm": "shared_ride",
+                "purpose_confirm": "home"
+            }
+        }
+    },
+    {
+        "_id": {
+            "$oid": "5fd8e69ac61669a9ebad0247"
+        },
+        "user_id": {
+            "$uuid": "aa9fdec92944446c8ee250d79b3044d3"
+        },
+        "metadata": {
+            "key": "analysis/confirmed_trip",
+            "platform": "server",
+            "write_ts": 1608050276.1554408,
+            "time_zone": "America/Los_Angeles",
+            "write_local_dt": {
+                "year": 2020,
+                "month": 12,
+                "day": 15,
+                "hour": 8,
+                "minute": 37,
+                "second": 56,
+                "weekday": 1,
+                "timezone": "America/Los_Angeles"
+            },
+            "write_fmt_time": "2020-12-15T08:37:56.155441-08:00"
+        },
+        "data": {
+            "source": "DwellSegmentationTimeFilter",
+            "end_ts": 1466377435.0,
+            "end_local_dt": {
+                "year": 2016,
+                "month": 6,
+                "day": 19,
+                "hour": 16,
+                "minute": 3,
+                "second": 55,
+                "weekday": 0,
+                "timezone": "America/Los_Angeles"
+            },
+            "end_fmt_time": "2016-06-20T16:03:55.713000-07:00",
+            "end_loc": {
+                "type": "Point",
+                "coordinates": [
+                    -122.1081974,
+                    37.4168828
+                ]
+            },
+            "raw_trip": {
+                "$oid": "5fd8e662baff4ef23d349791"
+            },
+            "start_ts": 1466376570.0,
+            "start_local_dt": {
+                "year": 2016,
+                "month": 6,
+                "day": 19,
+                "hour": 15,
+                "minute": 49,
+                "second": 30,
+                "weekday": 0,
+                "timezone": "America/Los_Angeles"
+            },
+            "start_fmt_time": "2016-06-20T15:49:30.280726-07:00",
+            "start_loc": {
+                "type": "Point",
+                "coordinates": [
+                    -122.086605,
+                    37.3910011
+                ]
+            },
+            "duration": 865.4322738647461,
+            "distance": 4521.417177464177,
+            "start_place": {
+                "$oid": "5fd8e664baff4ef23d349864"
+            },
+            "end_place": {
+                "$oid": "5fd8e664baff4ef23d349865"
+            },
+            "cleaned_trip": {
+                "$oid": "5fd8e664baff4ef23d349808"
+            },
+            "user_input": {
+                "mode_confirm": "shared_ride",
+                "purpose_confirm": "karate"
+            }
+        }
+    },
+    {
+        "_id": {
+            "$oid": "5fd8e69ac61669a9ebad0248"
+        },
+        "user_id": {
+            "$uuid": "aa9fdec92944446c8ee250d79b3044d3"
+        },
+        "metadata": {
+            "key": "analysis/confirmed_trip",
+            "platform": "server",
+            "write_ts": 1608050276.1554408,
+            "time_zone": "America/Los_Angeles",
+            "write_local_dt": {
+                "year": 2020,
+                "month": 12,
+                "day": 15,
+                "hour": 8,
+                "minute": 37,
+                "second": 56,
+                "weekday": 1,
+                "timezone": "America/Los_Angeles"
+            },
+            "write_fmt_time": "2020-12-15T08:37:56.155441-08:00"
+        },
+        "data": {
+            "source": "DwellSegmentationTimeFilter",
+            "end_ts": 1466636635.0,
+            "end_local_dt": {
+                "year": 2016,
+                "month": 6,
+                "day": 22,
+                "hour": 16,
+                "minute": 3,
+                "second": 55,
+                "weekday": 0,
+                "timezone": "America/Los_Angeles"
+            },
+            "end_fmt_time": "2016-06-20T16:03:55.713000-07:00",
+            "end_loc": {
+                "type": "Point",
+                "coordinates": [
+                    -122.10849780967732,
+                    37.416772003842034
+                ]
+            },
+            "raw_trip": {
+                "$oid": "5fd8e662baff4ef23d349791"
+            },
+            "start_ts": 1466635770.0,
+            "start_local_dt": {
+                "year": 2016,
+                "month": 6,
+                "day": 22,
+                "hour": 15,
+                "minute": 49,
+                "second": 30,
+                "weekday": 0,
+                "timezone": "America/Los_Angeles"
+            },
+            "start_fmt_time": "2016-06-20T15:49:30.280726-07:00",
+            "start_loc": {
+                "type": "Point",
+                "coordinates": [
+                    -122.086605,
+                    37.3910011
+                ]
+            },
+            "duration": 865.4322738647461,
+            "distance": 4521.417177464177,
+            "start_place": {
+                "$oid": "5fd8e664baff4ef23d349864"
+            },
+            "end_place": {
+                "$oid": "5fd8e664baff4ef23d349865"
+            },
+            "cleaned_trip": {
+                "$oid": "5fd8e664baff4ef23d349808"
+            },
+            "user_input": {
+                "mode_confirm": "shared_ride",
+                "purpose_confirm": "soccer"
+            }
+        }
+    },
+    {
+        "_id": {
+            "$oid": "5fd8e69ac61669a9ebad0249"
+        },
+        "user_id": {
+            "$uuid": "aa9fdec92944446c8ee250d79b3044d3"
+        },
+        "metadata": {
+            "key": "analysis/confirmed_trip",
+            "platform": "server",
+            "write_ts": 1608050276.1554408,
+            "time_zone": "America/Los_Angeles",
+            "write_local_dt": {
+                "year": 2020,
+                "month": 12,
+                "day": 15,
+                "hour": 8,
+                "minute": 37,
+                "second": 56,
+                "weekday": 1,
+                "timezone": "America/Los_Angeles"
+            },
+            "write_fmt_time": "2020-12-15T08:37:56.155441-08:00"
+        },
+        "data": {
+            "source": "DwellSegmentationTimeFilter",
+            "end_ts": 1466809435.0,
+            "end_local_dt": {
+                "year": 2016,
+                "month": 6,
+                "day": 24,
+                "hour": 16,
+                "minute": 3,
+                "second": 55,
+                "weekday": 0,
+                "timezone": "America/Los_Angeles"
+            },
+            "end_fmt_time": "2016-06-20T16:03:55.713000-07:00",
+            "end_loc": {
+                "type": "Point",
+                "coordinates": [
+                    -122.10849780967732,
+                    37.416772003842034
+                ]
+            },
+            "raw_trip": {
+                "$oid": "5fd8e662baff4ef23d349791"
+            },
+            "start_ts": 1466808570.0,
+            "start_local_dt": {
+                "year": 2016,
+                "month": 6,
+                "day": 24,
+                "hour": 15,
+                "minute": 49,
+                "second": 30,
+                "weekday": 0,
+                "timezone": "America/Los_Angeles"
+            },
+            "start_fmt_time": "2016-06-20T15:49:30.280726-07:00",
+            "start_loc": {
+                "type": "Point",
+                "coordinates": [
+                    -122.086605,
+                    37.3910011
+                ]
+            },
+            "duration": 865.4322738647461,
+            "distance": 4521.417177464177,
+            "start_place": {
+                "$oid": "5fd8e664baff4ef23d349864"
+            },
+            "end_place": {
+                "$oid": "5fd8e664baff4ef23d349868"
+            },
+            "cleaned_trip": {
+                "$oid": "5fd8e664baff4ef23d349808"
+            },
+            "user_input": {
+                "mode_confirm": "shared_ride",
+                "purpose_confirm": "soccer"
+            }
+        }
+    },
+    {
+        "_id": {
+            "$oid": "5fd8e69ac61669a9ebad0250"
+        },
+        "user_id": {
+            "$uuid": "aa9fdec92944446c8ee250d79b3044d3"
+        },
+        "metadata": {
+            "key": "analysis/confirmed_trip",
+            "platform": "server",
+            "write_ts": 1608050276.1554408,
+            "time_zone": "America/Los_Angeles",
+            "write_local_dt": {
+                "year": 2020,
+                "month": 12,
+                "day": 15,
+                "hour": 8,
+                "minute": 37,
+                "second": 56,
+                "weekday": 1,
+                "timezone": "America/Los_Angeles"
+            },
+            "write_fmt_time": "2020-12-15T08:37:56.155441-08:00"
+        },
+        "data": {
+            "source": "DwellSegmentationTimeFilter",
+            "end_ts": 1466550235.0,
+            "end_local_dt": {
+                "year": 2016,
+                "month": 6,
+                "day": 21,
+                "hour": 16,
+                "minute": 3,
+                "second": 55,
+                "weekday": 0,
+                "timezone": "America/Los_Angeles"
+            },
+            "end_fmt_time": "2016-06-20T16:03:55.713000-07:00",
+            "end_loc": {
+                "type": "Point",
+                "coordinates": [
+                    -122.10849780967732,
+                    37.416772003842034
+                ]
+            },
+            "raw_trip": {
+                "$oid": "5fd8e662baff4ef23d349791"
+            },
+            "start_ts": 1466549370.0,
+            "start_local_dt": {
+                "year": 2016,
+                "month": 6,
+                "day": 21,
+                "hour": 15,
+                "minute": 49,
+                "second": 30,
+                "weekday": 0,
+                "timezone": "America/Los_Angeles"
+            },
+            "start_fmt_time": "2016-06-20T15:49:30.280726-07:00",
+            "start_loc": {
+                "type": "Point",
+                "coordinates": [
+                    -122.086605,
+                    37.3910011
+                ]
+            },
+            "duration": 865.4322738647461,
+            "distance": 4521.417177464177,
+            "start_place": {
+                "$oid": "5fd8e664baff4ef23d349888"
+            },
+            "end_place": {
+                "$oid": "5fd8e664baff4ef23d349889"
+            },
+            "cleaned_trip": {
+                "$oid": "5fd8e664baff4ef23d349808"
+            },
+            "user_input": {
+                "mode_confirm": "shared_ride",
+                "purpose_confirm": "soccer"
+            }
+        }
+    }
+]
\ No newline at end of file

From f815cff60022c6da9d6c3c2dbf59a69179aa3c83 Mon Sep 17 00:00:00 2001
From: Chunrui Huang <corinnehcr@gmail.com>
Date: Sun, 25 Jul 2021 20:33:04 -0700
Subject: [PATCH 5/6]  done with TestGetRequestPercentage, update fake_trips
 for testing

---
 .../TestGetRequestPercentage.py               | 180 +++++++-----------
 emission/tests/data/real_examples/fake_trips  |  16 +-
 2 files changed, 76 insertions(+), 120 deletions(-)

diff --git a/emission/tests/analysisTests/clusteringTests/TestGetRequestPercentage.py b/emission/tests/analysisTests/clusteringTests/TestGetRequestPercentage.py
index 1c0e6fe9b..e93a5ea17 100644
--- a/emission/tests/analysisTests/clusteringTests/TestGetRequestPercentage.py
+++ b/emission/tests/analysisTests/clusteringTests/TestGetRequestPercentage.py
@@ -3,21 +3,22 @@
 import unittest
 import emission.analysis.modelling.tour_model.similarity as similarity
 import emission.analysis.modelling.tour_model.data_preprocessing as preprocess
-
 import emission.analysis.modelling.tour_model.get_request_percentage as eamtg
-import pandas as pd
+import emission.analysis.modelling.tour_model.evaluation_pipeline as ep
 import emission.tests.common as etc
-import sklearn.cluster as sc
-import numpy as np
 import json
 import bson.json_util as bju
 
 
+
 class TestGetRequestPercentage(unittest.TestCase):
     def setUp(self):
         self.readAndStoreTripsFromFile("emission/tests/data/real_examples/fake_trips")
         self.user = self.testUUID
         self.radius = 100
+        self.trips = preprocess.read_data(self.user)
+        self.filter_trips = preprocess.filter_data(self.trips,self.radius)
+
 
     def tearDown(self):
         self.clearDBEntries()
@@ -39,114 +40,69 @@ def clearDBEntries(self):
         edb.get_pipeline_state_db().delete_many({"user_id": self.testUUID})
 
     def test_match_day(self):
-        trips = preprocess.read_data(self.user)
-        filter_trips = preprocess.filter_data(trips,self.radius)
-        sim = similarity.similarity(filter_trips, radius)
-
-    # def test_match_day(self):
-    #     # case 1: bin contains indices & trip matches selected trip in filter_trips
-    #     bin = [0,1,2]
-    #     trip = {'data':{'start_local_dt':{'year':2020,'month':8,'day':14}}}
-    #     filter_trips = [{'data':{'start_local_dt':{'year':2020,'month':8,'day':14}}}]
-    #     self.assertEqual(eamtg.match_day(trip, bin, filter_trips), True)
-    #     # case 2: bin = True & trip doesn't match selected trip in filter_trips
-    #     filter_trips = [{'data': {'start_local_dt': {'year': 2020, 'month': 8, 'day': 18}}}]
-    #     self.assertEqual(eamtg.match_day(trip, bin, filter_trips), False)
-    #     #case 3: bin is none & trip matches selected trip in filter_trips
-    #     bin = None
-    #     filter_trips = [{'data': {'start_local_dt': {'year': 2020, 'month': 8, 'day': 14}}}]
-    #     self.assertEqual(eamtg.match_day(trip, bin, filter_trips), False)
-    #
-    #
-    # def test_match_month(self):
-    #     # case 1: bin contains indices & trip matches selected trip in filter_trips
-    #     bin = [0,1,2]
-    #     trip = {'data':{'start_local_dt':{'year':2020,'month':8,'day':14}}}
-    #     filter_trips = [{'data':{'start_local_dt':{'year':2020,'month':8,'day':14}}}]
-    #     self.assertEqual(eamtg.match_month(trip, bin, filter_trips), True)
-    #     # case 2: bin = True & trip doesn't match selected trip in filter_trips
-    #     filter_trips = [{'data': {'start_local_dt': {'year': 2020, 'month': 7, 'day': 18}}}]
-    #     self.assertEqual(eamtg.match_month(trip, bin, filter_trips), False)
-    #     #case 3: bin is none & trip matches selected trip in filter_trips
-    #     bin = None
-    #     filter_trips = [{'data': {'start_local_dt': {'year': 2020, 'month': 8, 'day': 14}}}]
-    #     self.assertEqual(eamtg.match_month(trip, bin, filter_trips), False)
-    #
-    #
-    # def test_bin_date(self):
-    #     # case 1: bin day
-    #     trip_ls = [0,1,2]
-    #     filter_trips1 = [{'data': {'start_local_dt': {'year': 2020, 'month': 8, 'day': 14}}},
-    #                     {'data': {'start_local_dt': {'year': 2020, 'month': 8, 'day': 14}}},
-    #                     {'data': {'start_local_dt': {'year': 2020, 'month': 7, 'day': 18}}}]
-    #     self.assertEqual(eamtg.bin_date(trip_ls, filter_trips1, day=True), [[0,1],[2]])
-    #     # case 2: bin month
-    #     filter_trips2 = [{'data': {'start_local_dt': {'year': 2020, 'month': 8, 'day': 15}}},
-    #                     {'data': {'start_local_dt': {'year': 2020, 'month': 8, 'day': 14}}},
-    #                     {'data': {'start_local_dt': {'year': 2020, 'month': 7, 'day': 18}}}]
-    #     self.assertEqual(eamtg.bin_date(trip_ls, filter_trips2, month=True), [[0,1],[2]])
-    #
-    #
-    # def test_find_first_trip(self):
-    #     import time
-    #     time1 = "Thu Jan 28 22:24:24 2020"
-    #     time2 = "Sat Jan 30 23:24:24 2020"
-    #     time3 = "Sun Jan 31 20:24:24 2020"
-    #     bin = [0,1,2]
-    #     ts1 = time.mktime(time.strptime(time1, "%a %b %d %H:%M:%S %Y"))
-    #     ts2 = time.mktime(time.strptime(time2, "%a %b %d %H:%M:%S %Y"))
-    #     ts3 = time.mktime(time.strptime(time3, "%a %b %d %H:%M:%S %Y"))
-    #     filter_trips = [{'data': {'start_ts': ts1}},
-    #                     {'data': {'start_ts': ts2}},
-    #                     {'data': {'start_ts': ts3}}]
-    #
-    #     self.assertEqual(eamtg.find_first_trip(filter_trips, bin),0)
-    #
-    #
-    # def test_requested_trips_ab_cutoff(self):
-    #     import time
-    #     time1 = "Thu Jan 28 22:24:24 2020"
-    #     time2 = "Sat Jan 30 23:24:24 2020"
-    #     time3 = "Sun Jan 31 20:24:24 2020"
-    #     new_bins = [[0,1],[2]]
-    #     ts1 = time.mktime(time.strptime(time1, "%a %b %d %H:%M:%S %Y"))
-    #     ts2 = time.mktime(time.strptime(time2, "%a %b %d %H:%M:%S %Y"))
-    #     ts3 = time.mktime(time.strptime(time3, "%a %b %d %H:%M:%S %Y"))
-    #     filter_trips = [{'data': {'start_ts': ts1}},
-    #                     {'data': {'start_ts': ts2}},
-    #                     {'data': {'start_ts': ts3}}]
-    #     self.assertEqual(eamtg.requested_trips_ab_cutoff(new_bins, filter_trips),([0, 2], [1]))
-    #
-    #
-    # def test_requested_trips_bl_cutoff(self):
-    #
-    #     # requested_trips_bl_cutoff(sim)
-    #     fake_trip_collect = []
-    #     trip1 = pd.DataFrame(data=([[-122.41925243091958,-122.42140476014033],[37.77938521735944,37.78194309045273]]),
-    #                       columns=[['start_loc','end_loc'],
-    #                                ['coordinates','coordinates']])
-    #     fake_trip_collect.append(trip1)
-    #     trip2 = pd.DataFrame(data=([[-122.41925243091958, -122.42093683661327], [37.77938521735944, 37.782278693221016]]),
-    #                        columns=[['start_loc', 'end_loc'],
-    #                                 ['coordinates', 'coordinates']])
-    #     fake_trip_collect.append(trip2)
-    #     trip3 = pd.DataFrame(data=([[-123.41925243091958,-122.41912876839925],[37.77938521735944,37.77766191670088]]),
-    #                       columns=[['start_loc','end_loc'],
-    #                                ['coordinates','coordinates']])
-    #     fake_trip_collect.append(trip3)
-    #     sim = similarity.similarity(fake_trip_collect,100)
-    #     print(sim.below_cutoff)
-    #     # print(bl_trip_ls)
-    #
-    #
-    #
-    #     # df = pd.DataFrame(columns=[['start_loc','end_loc'],['coordinates','coordinates']])
-    #
-    #     # print(df)
-    #     # df1 = pd.DataFrame(np.ra]ndom.randint(0, 150, size=(4, 6)),
-    #     #                    columns=[['python', 'python', 'math', 'math', 'En', 'En'],
-    #     #                             ['期中', '期末', '期中', '期末', '期中', '期末']])
-    #     # print(df1.python['期中'])
+        sim = similarity.similarity(self.filter_trips, self.radius)
+        sim.bin_data()
+        sel_bin = sim.bins[0]
+        # case 1: not same day trip
+        trip = self.filter_trips[sel_bin[1]]
+        self.assertEqual(eamtg.match_day(trip, sel_bin, self.filter_trips), False)
+        # case 2: same day trip
+        sel_bin = sim.bins[0]
+        trip = self.filter_trips[sim.bins[1][0]]
+        self.assertEqual(eamtg.match_day(trip, sel_bin, self.filter_trips), True)
+
+    def test_match_month(self):
+        """match_month is False for a trip from another month and True for a same-month trip."""
+        sim = similarity.similarity(self.filter_trips, self.radius)
+        sim.bin_data()
+        sel_bin = sim.bins[0]
+        # case 1: not same month trip (exercise match_month, not match_day)
+        trip = self.filter_trips[sel_bin[4]]
+        self.assertEqual(eamtg.match_month(trip, sel_bin, self.filter_trips), False)
+        # case 2: same month trip, taken from the second bin
+        trip = self.filter_trips[sim.bins[1][0]]
+        self.assertEqual(eamtg.match_month(trip, sel_bin, self.filter_trips), True)
+
+    def test_bin_date(self):
+        all_idx = list(range(8))  # trip indices 0..7; expected groups for day=True, then month=True
+        self.assertEqual(eamtg.bin_date(all_idx, self.filter_trips, day=True), [[0, 1, 2, 7], [3], [4], [5], [6]])
+        self.assertEqual(eamtg.bin_date(all_idx, self.filter_trips, month=True), [[0, 1, 2, 3, 4, 7], [5], [6]])
+
+    def test_find_first_trip(self):
+        candidate_idx = list(range(8))  # trip indices 0..7; index 3 is the expected result
+        self.assertEqual(eamtg.find_first_trip(self.filter_trips, candidate_idx), 3)
+
+    def test_requested_trips_ab_cutoff(self):
+        # one bin of five trips: only index 3 should be requested, the other four should not
+        single_bin = [[2, 3, 4, 5, 6]]
+        requested, not_requested = eamtg.requested_trips_ab_cutoff(single_bin, self.filter_trips)
+        self.assertEqual((requested, not_requested), ([3], [2, 4, 5, 6]))
+
+    def test_requested_trips_bl_cutoff(self):
+        # bin_data() and delete_bins() are both run before the similarity object is queried
+        binned = similarity.similarity(self.filter_trips, self.radius)
+        binned.bin_data()
+        binned.delete_bins()
+        self.assertEqual(eamtg.requested_trips_bl_cutoff(binned), [7, 1, 0])
+
+    def test_get_requested_trips(self):
+        # passes sim.bins as it stands after bin_data() and delete_bins()
+        binned = similarity.similarity(self.filter_trips, self.radius)
+        binned.bin_data()
+        binned.delete_bins()
+        self.assertEqual(eamtg.get_requested_trips(binned.bins, self.filter_trips, binned), [3, 7, 1, 0])
+
+    def test_get_req_pct(self):
+        # derive first labels and track from the binned trips, then check the request percentage
+        binned = similarity.similarity(self.filter_trips, self.radius)
+        binned.bin_data()
+        binned.delete_bins()
+        labels, track = ep.get_first_label_and_track(
+            binned.bins, binned.newdata, self.filter_trips)
+        labels_copy = labels.copy()  # get_req_pct receives a copy of the first labels
+        requested_pct = eamtg.get_req_pct(labels_copy, track, self.filter_trips, binned)
+        self.assertEqual(requested_pct, 0.5)
+
 
 if __name__ == '__main__':
     etc.configLogging()
diff --git a/emission/tests/data/real_examples/fake_trips b/emission/tests/data/real_examples/fake_trips
index 1a26d2074..a537b1b2b 100644
--- a/emission/tests/data/real_examples/fake_trips
+++ b/emission/tests/data/real_examples/fake_trips
@@ -691,10 +691,10 @@
         },
         "data": {
             "source": "DwellSegmentationTimeFilter",
-            "end_ts": 1466809435.0,
+            "end_ts": 1469401435.0,
             "end_local_dt": {
                 "year": 2016,
-                "month": 6,
+                "month": 7,
                 "day": 24,
                 "hour": 16,
                 "minute": 3,
@@ -713,10 +713,10 @@
             "raw_trip": {
                 "$oid": "5fd8e662baff4ef23d349791"
             },
-            "start_ts": 1466808570.0,
+            "start_ts": 1469400570.0,
             "start_local_dt": {
                 "year": 2016,
-                "month": 6,
+                "month": 7,
                 "day": 24,
                 "hour": 15,
                 "minute": 49,
@@ -775,9 +775,9 @@
         },
         "data": {
             "source": "DwellSegmentationTimeFilter",
-            "end_ts": 1466550235.0,
+            "end_ts": 1498086235.0,
             "end_local_dt": {
-                "year": 2016,
+                "year": 2017,
                 "month": 6,
                 "day": 21,
                 "hour": 16,
@@ -797,9 +797,9 @@
             "raw_trip": {
                 "$oid": "5fd8e662baff4ef23d349791"
             },
-            "start_ts": 1466549370.0,
+            "start_ts": 1498085370.0,
             "start_local_dt": {
-                "year": 2016,
+                "year": 2017,
                 "month": 6,
                 "day": 21,
                 "hour": 15,

From 2a0bc45b6f4658b549a34006fe386591ee8bb117 Mon Sep 17 00:00:00 2001
From: Chunrui Huang <corinnehcr@gmail.com>
Date: Mon, 26 Jul 2021 10:26:30 -0700
Subject: [PATCH 6/6] update tests and fake trips

---
 .../clusteringTests/TestGetScores.py          | 68 +++++++++++++++++++
 .../clusteringTests/TestSimilarity.py         |  2 +-
 emission/tests/data/real_examples/fake_trips  |  6 +-
 3 files changed, 72 insertions(+), 4 deletions(-)
 create mode 100644 emission/tests/analysisTests/clusteringTests/TestGetScores.py

diff --git a/emission/tests/analysisTests/clusteringTests/TestGetScores.py b/emission/tests/analysisTests/clusteringTests/TestGetScores.py
new file mode 100644
index 000000000..c65857f28
--- /dev/null
+++ b/emission/tests/analysisTests/clusteringTests/TestGetScores.py
@@ -0,0 +1,68 @@
+import emission.analysis.modelling.tour_model.data_preprocessing as preprocess
+import emission.analysis.modelling.tour_model.similarity as similarity
+import emission.analysis.modelling.tour_model.get_scores as gs
+from future import standard_library
+standard_library.install_aliases()
+from builtins import *
+import unittest
+import json
+import bson.json_util as bju
+import emission.tests.common as etc
+
+class TestGetScores(unittest.TestCase):
+    """Test get_scores against the bins built from the fake_trips fixture."""
+    def setUp(self):
+        self.readAndStoreTripsFromFile("emission/tests/data/real_examples/fake_trips")
+        self.user = self.testUUID
+        self.radius = 100
+        self.trips = preprocess.read_data(self.user)
+        self.filter_trips = preprocess.filter_data(self.trips,self.radius)
+        self.sim = similarity.similarity(self.filter_trips, self.radius)
+        self.sim.bin_data()
+
+    def tearDown(self):
+        self.clearDBEntries()
+
+    def readAndStoreTripsFromFile(self, dataFile):
+        """Save every trip in dataFile to the analysis timeseries db under self.testUUID."""
+        import emission.core.get_database as edb
+        atsdb = edb.get_analysis_timeseries_db()
+        etc.createAndFillUUID(self)
+        with open(dataFile) as dect:
+            expected_confirmed_trips = json.load(dect, object_hook=bju.object_hook)
+            for t in expected_confirmed_trips:
+                t["user_id"] = self.testUUID
+                edb.save(atsdb, t)
+
+    def clearDBEntries(self):
+        """Delete all entries of self.testUUID from the three dbs touched by the tests."""
+        import emission.core.get_database as edb
+        edb.get_timeseries_db().delete_many({"user_id": self.testUUID})
+        edb.get_analysis_timeseries_db().delete_many({"user_id": self.testUUID})
+        edb.get_pipeline_state_db().delete_many({"user_id": self.testUUID})
+
+    def test_compare_trip_orders(self):
+        # compare_trip_orders asserts internally (pandas.testing.assert_frame_equal), so this
+        # test passes when bin_trips and self.filter_trips (ordered by bins) are in the same order
+        self.sim.delete_bins()
+        self.bins = self.sim.bins
+        self.bin_trips = self.sim.newdata
+        gs.compare_trip_orders(self.bins, self.bin_trips, self.filter_trips)
+
+    def test_score(self):
+        # we use all bins for testing: every trip is labelled with the index of its bin
+        labels_pred = [b for b, trips in enumerate(self.sim.bins) for _ in trips]
+        # labels_true = [0, 1, 2, 2, 3, 3, 3, 4]
+        # labels_pred = [0, 0, 0, 0, 0, 1, 2, 3]
+        homo_score = gs.score(self.filter_trips, labels_pred)
+        self.assertEqual(homo_score,0.443)
+
+    def test_get_score(self):
+        homo_second = 0.443
+        percentage_second = 0.5
+        curr_score = gs.get_score(homo_second, percentage_second)
+        self.assertEqual(curr_score,0.472)
+
+if __name__ == '__main__':
+    etc.configLogging()  # helper from emission.tests.common; presumably sets up logging -- confirm
+    unittest.main()  # run the tests defined in this module
diff --git a/emission/tests/analysisTests/clusteringTests/TestSimilarity.py b/emission/tests/analysisTests/clusteringTests/TestSimilarity.py
index 12ea565b0..d5d42112b 100644
--- a/emission/tests/analysisTests/clusteringTests/TestSimilarity.py
+++ b/emission/tests/analysisTests/clusteringTests/TestSimilarity.py
@@ -9,7 +9,7 @@
 import emission.tests.common as etc
 
 # This test file is to test the functions that are used in the
-class TestDataPreprocessing(unittest.TestCase):
+class TestSimilarity(unittest.TestCase):
     def setUp(self):
         self.readAndStoreTripsFromFile("emission/tests/data/real_examples/fake_trips")
         self.user = self.testUUID
diff --git a/emission/tests/data/real_examples/fake_trips b/emission/tests/data/real_examples/fake_trips
index a537b1b2b..b2e20a174 100644
--- a/emission/tests/data/real_examples/fake_trips
+++ b/emission/tests/data/real_examples/fake_trips
@@ -648,7 +648,7 @@
                     37.3910011
                 ]
             },
-            "duration": 865.4322738647461,
+            "duration": 700,
             "distance": 4521.417177464177,
             "start_place": {
                 "$oid": "5fd8e664baff4ef23d349864"
@@ -732,7 +732,7 @@
                     37.3910011
                 ]
             },
-            "duration": 865.4322738647461,
+            "duration": 700,
             "distance": 4521.417177464177,
             "start_place": {
                 "$oid": "5fd8e664baff4ef23d349864"
@@ -816,7 +816,7 @@
                     37.3910011
                 ]
             },
-            "duration": 865.4322738647461,
+            "duration": 700,
             "distance": 4521.417177464177,
             "start_place": {
                 "$oid": "5fd8e664baff4ef23d349888"