check unit test code #826

Open

wants to merge 6 commits into base: master
58 changes: 58 additions & 0 deletions emission/analysis/modelling/tour_model/data_preprocessing.py
@@ -0,0 +1,58 @@
import emission.storage.decorations.analysis_timeseries_queries as esda
import emission.analysis.modelling.tour_model.cluster_pipeline as pipeline
import emission.analysis.modelling.tour_model.similarity as similarity
import pandas as pd
from sklearn.model_selection import KFold


# read confirmed trips, which may carry user labels
def read_data(user):
    trips = pipeline.read_data(uuid=user, key=esda.CONFIRMED_TRIP_KEY)
    return trips


# - trips: all trips read from the database
# - filter_trips: valid trips that have user labels and are not points
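# e.g. (hypothetical keys) a user_input of {'mode_confirm': 'walk'} that is missing
# 'purpose_confirm' becomes a NaN in the dataframe and is dropped by dropna below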
def filter_data(trips, radius):
    non_empty_trips = [t for t in trips if t["data"]["user_input"] != {}]
    non_empty_trips_df = pd.DataFrame(t["data"]["user_input"] for t in non_empty_trips)
    valid_trips_df = non_empty_trips_df.dropna(axis=0, how='any')
    valid_trips_idx_ls = valid_trips_df.index.tolist()
    valid_trips = [non_empty_trips[i] for i in valid_trips_idx_ls]

    # the similarity code filters out trips that are essentially points from valid_trips
    filter_trips = similarity.filter_too_short(valid_trips, radius)
    return filter_trips


# extract the features of a trip: start/end coordinates, distance and duration
def extract_features(filter_trips):
    X = []
    for trip in filter_trips:
        start = trip.data.start_loc["coordinates"]
        end = trip.data.end_loc["coordinates"]
        distance = trip.data.distance
        duration = trip.data.duration
        X.append([start[0], start[1], end[0], end[1], distance, duration])
    return X

# use KFold (n_splits=5) to split the data into 5 folds (5 training sets, 5 test sets)
def split_data(filter_trips):
    X = extract_features(filter_trips)
    kf = KFold(n_splits=5, shuffle=True, random_state=3)
    train_idx = []
    test_idx = []
    for train_index, test_index in kf.split(X):
        train_idx.append(train_index)
        test_idx.append(test_index)
    return train_idx, test_idx


# collect the actual trips for each training/test subset after splitting
def get_subdata(filter_trips, train_test_set):
    collect_sub_data = []
    for train_test_subset in train_test_set:
        sub_data = []
        for idx in train_test_subset:
            sub_data.append(filter_trips[idx])
        collect_sub_data.append(sub_data)
    return collect_sub_data
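
For reviewers, a minimal usage sketch of this module; the UUID and the 100 m radius are placeholder assumptions, not values from this PR:

import emission.analysis.modelling.tour_model.data_preprocessing as preprocess

user = ...        # placeholder: a user UUID object from the database
radius = 100      # placeholder radius for the similarity filter

trips = preprocess.read_data(user)
filter_trips = preprocess.filter_data(trips, radius)

# KFold(n_splits=5) gives 5 train/test index splits over filter_trips
train_idx, test_idx = preprocess.split_data(filter_trips)
train_sets = preprocess.get_subdata(filter_trips, train_idx)   # 5 lists of trip objects
test_sets = preprocess.get_subdata(filter_trips, test_idx)
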
135 changes: 135 additions & 0 deletions emission/analysis/modelling/tour_model/get_request_percentage.py
@@ -0,0 +1,135 @@
import emission.analysis.modelling.tour_model.label_processing as label_pro
import copy


# This function compares a trip with a group of trips (via the bin's first trip) to see if they happened on the same day
def match_day(trip, bin, filter_trips):
    if bin:
        t = filter_trips[bin[0]]
        if trip['data']['start_local_dt']['year'] == t['data']['start_local_dt']['year'] \
                and trip['data']['start_local_dt']['month'] == t['data']['start_local_dt']['month'] \
                and trip['data']['start_local_dt']['day'] == t['data']['start_local_dt']['day']:
            return True
    return False


# This function compares a trip with a group of trips (via the bin's first trip) to see if they happened in the same month
def match_month(trip, bin, filter_trips):
    if bin:
        t = filter_trips[bin[0]]
        if trip['data']['start_local_dt']['year'] == t['data']['start_local_dt']['year'] \
                and trip['data']['start_local_dt']['month'] == t['data']['start_local_dt']['month']:
            return True
    return False


# This function bins trips according to ['start_local_dt']
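# e.g. (hypothetical indices) with day=True, if trips 0 and 2 happened on 2021-06-01
# and trip 1 happened on 2021-06-02, bin_date returns [[0, 2], [1]]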
def bin_date(trip_ls, filter_trips, day=None, month=None):
    bin_date = []
    for trip_index in trip_ls:
        added = False
        trip = filter_trips[trip_index]

        for bin in bin_date:
            if day:
                if match_day(trip, bin, filter_trips):
                    bin.append(trip_index)
                    added = True
                    break
            if month:
                if match_month(trip, bin, filter_trips):
                    bin.append(trip_index)
                    added = True
                    break

        if not added:
            bin_date.append([trip_index])

    return bin_date


def find_first_trip(filter_trips, bin):
    trip_ts = [filter_trips[trip_idx]['data']["start_ts"] for trip_idx in bin]
    # - early_idx_in_bin: the position of the earliest trip within the bin
    #   e.g. trip_ts = [20,10,40,5,100] -> the minimum is 5, so early_idx_in_bin = 3
    early_idx_in_bin = trip_ts.index(min(trip_ts))
    # - early_trip_index: the original index (in filter_trips) of the earliest trip
    early_trip_index = bin[early_idx_in_bin]
    return early_trip_index


# collect the indices of requested trips and common trips (no need to request) above cutoff
def requested_trips_ab_cutoff(new_bins, filter_trips):
    # collect requested trip indices above cutoff
    ab_trip_ls = []
    # collect common trip indices above cutoff
    no_req_trip_ls = []
    for bin in new_bins:
        early_trip_index = find_first_trip(filter_trips, bin)
        ab_trip_ls.append(early_trip_index)

        # The following lines collect the original indices of the rest of the trips in the bin. Since they are not the
        # earliest trip, we don't need to request user labels for them
        # >>> x = [100,200,300]
        # >>> x.remove(100); x
        # [200, 300]
        no_req_trip_subls = copy.copy(bin)
        no_req_trip_subls.remove(early_trip_index)
        # >>> x = [1,2,3]
        # >>> x.extend([4,5,6]); x
        # [1, 2, 3, 4, 5, 6]
        no_req_trip_ls.extend(no_req_trip_subls)
    return ab_trip_ls, no_req_trip_ls


# collect the indices of requested trips below cutoff
def requested_trips_bl_cutoff(sim):
    # bins below cutoff
    bl_bins = sim.below_cutoff

    # collect requested trip indices below cutoff
    # effectively, bl_trip_ls = flatten(bl_bins)
    # >>> bl_bins = [[1,2],[3,4],[5,6]]
    # >>> bl_trip_ls = [item for sublist in bl_bins for item in sublist]
    # >>> bl_trip_ls
    # [1, 2, 3, 4, 5, 6]
    # the reason for flattening: we need a single flat list of requested trips to compute the percentage
    bl_trip_ls = [item for sublist in bl_bins for item in sublist]
    return bl_trip_ls


# a list of the indices of all requested trips
# - filter_trips: we need the timestamps in filter_trips here;
#   in requested_trips_ab_cutoff we find the first trip of each bin,
#   and we collect original trip indices from filter_trips
# - sim: we use the similarity code to find trips below cutoff.
#   Since the indices from the similarity code are original indices (of trips below cutoff),
#   we need original indices for all requested trips,
#   so we also use filter_trips when finding the requested common trips
# - new_bins: bins that hold the original indices of similar trips; they represent only common trips
def get_requested_trips(new_bins, filter_trips, sim):
    ab_trip_ls, no_req_trip_ls = requested_trips_ab_cutoff(new_bins, filter_trips)
    bl_trip_ls = requested_trips_bl_cutoff(sim)
    req_trips_ls = ab_trip_ls + bl_trip_ls
    return req_trips_ls


# get the request percentage based on the number of requested trips and the total number of trips
def get_req_pct(new_labels, track, filter_trips, sim):
    # - new_bins: bins with original indices of similar trips from common trips
    # - new_labels: for the first round, new_labels is a copy of the first-round labels, e.g. [1,1,1,2,2,2].
    #   For the second round, new_labels is the first-round label concatenated with the second-round label:
    #   e.g. if the labels from the second round are [1,2,1,2,3,3], new_labels becomes [11,12,11,22,23,23]
    # - track: at this point, each item in track contains the original index of a trip
    #   and its latest label, e.g. [ori_idx, latest_label];
    #   concretely, see the "group_similar_trips" function in label_processing.py
    # If new_labels is [11,12,11,22,23,23] and the original indices of the trips are [1,2,3,4,5,6],
    # new_bins will be [[1,3],[2],[4],[5,6]]
    new_bins = label_pro.group_similar_trips(new_labels, track)
    req_trips = get_requested_trips(new_bins, filter_trips, sim)
    pct = len(req_trips) / len(filter_trips)
    pct = float('%.3f' % pct)
    return pct
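
To make the request-percentage flow concrete, a hedged sketch with toy data; FakeSim and all indices are made up for illustration, assuming these modules are importable:

import emission.analysis.modelling.tour_model.get_request_percentage as grp

class FakeSim:
    # stand-in for the similarity object; only `below_cutoff` is read here
    below_cutoff = [[6, 7]]

# trip stand-ins: only "start_ts" is read by find_first_trip
filter_trips = [{'data': {'start_ts': ts}} for ts in [20, 10, 40, 5, 100, 50, 60, 70]]
new_bins = [[0, 2], [1, 3, 4], [5]]   # bins of similar (common) trips, original indices

req = grp.get_requested_trips(new_bins, filter_trips, FakeSim())
print(req)                            # [0, 3, 5, 6, 7]: one request per bin + all below-cutoff trips
print(len(req) / len(filter_trips))   # 0.625
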
83 changes: 83 additions & 0 deletions emission/analysis/modelling/tour_model/get_scores.py
@@ -0,0 +1,83 @@
import pandas as pd
import pandas.testing as pdt
import emission.analysis.modelling.tour_model.label_processing as label_pro
import sklearn.metrics as skm
import itertools


# compare the trip order in bin_trips with the order in filter_trips above cutoff
def compare_trip_orders(bins, bin_trips, filter_trips):
    bin_trips_ts = pd.DataFrame(data=[trip["data"]["start_ts"] for trip in bin_trips])
    bin_ls = list(itertools.chain(*bins))
    bins_ts = pd.DataFrame(data=[filter_trips[i]["data"]["start_ts"] for i in bin_ls])
    # compare the two data frames; the program continues to the score calculation only if they are identical
    pdt.assert_frame_equal(bins_ts, bin_trips_ts)


# This function gets the homogeneity score after the first/second round of clustering
# It is based on bin_trips, which are common trips. bin_trips are collected according to the indices of the trips
# in bins above cutoff
# More info about bin_trips is in similarity.py (delete_bins)
# The homogeneity score reflects the degree to which a cluster consists only of trips with similar ground truthed labels.
# In the following examples, "A","B","C" are user labels.
# The labels can be drawn from different sets as long as the mapping is unique (e.g. ["A", "A", "C"] matches perfectly
# with [0,0,1]).
# Ideally, there would be a 1:1 mapping between labels and clusters - e.g. ["A", "A", "A"] maps to [1,1,1]
# This can break in two ways:
# 1. one user label maps to multiple clusters - e.g. ["A", "A", "A"] maps to [1,2,3].
# In this case, the homogeneity score will still be 1.0, since each cluster only has label "A".
# For our problem, this typically maps to the use case where trips with the same user label are actually to different
# destinations. For `medical` or `personal` locations, for example, users could actually go to multiple medical
# facilities or friends' houses. In this case, the trips will be in different clusters, but since the destinations are in
# fact different, this is actually the correct behavior.
# The trips could also be to the same location, but be clustered differently due to minor variations in duration or
# distance (maybe due to traffic conditions). This could result in multiple clusters for what is essentially the same
# trip. We capture this difference through the request percentage metric, which will result in three queries for
# [1,2,3] and only one for [1,1,1]
# 2. two different labels map to the same cluster - e.g. ["A", "A", "B"] maps to [1,1,1]. This is the case captured by the
# homogeneity score, which will be less than 1.0 (0 represents inhomogeneous, 1.0 represents homogeneous).
# This maps well to our use case because in this case, assigning the same label to all trips in the cluster would
# be incorrect. In particular, if we did not have the ground truth, the third trip would be labeled "A",
# which would lower the accuracy.
# At this point, we don't force user_input to have the same labels for labels_true and labels_pred.
# For example, in the second round, if the user labels are [("home", "ebike", "bus"),("home", "walk", "bus"),
# ("home", "ebike", "bus")], labels_pred can be [0,1,0], or [1,0,1], or any other numeric labeling.
def score(bin_trips, labels_pred):
    bin_trips_user_input_df = pd.DataFrame(data=[trip["data"]["user_input"] for trip in bin_trips])
    bin_trips_user_input_df = label_pro.map_labels(bin_trips_user_input_df)

    # turn all user_input into a list without binning
    bin_trips_user_input_ls = bin_trips_user_input_df.values.tolist()
    # drop duplicate user_input
    no_dup_df = bin_trips_user_input_df.drop_duplicates()
    # turn the non-duplicate user_input into a list
    no_dup_list = no_dup_df.values.tolist()

    # collect labels_true based on user_input
    # To compute labels_true, we find the unique user labels and use the index of each unique label
    # to label all the trips
    # If user labels are (purpose, confirmed_mode, replaced_mode) tuples,
    # e.g. [("home","ebike","bus"),("work","walk","bike"),("home","ebike","bus"),("home","ebike","bus"),
    # ("work","walk","bike"),("exercise","ebike","walk")],
    # the unique label list is [0,1,2], and labels_true will be [0,1,0,0,1,2]
    # labels_pred is the flattened list of cluster labels of all common trips, e.g. [1,1,11,12,13,22,23]
    labels_true = []
    for userinput_dict in bin_trips_user_input_ls:
        if userinput_dict in no_dup_list:
            labels_true.append(no_dup_list.index(userinput_dict))

    homo_score = skm.homogeneity_score(labels_true, labels_pred)
    homo_score = float('%.3f' % homo_score)
    return homo_score


# This function computes a score for every model.
# It is used for tuning and finding the best model after two rounds of clustering
# - homo_second: the homogeneity score after the second round of clustering
# - percentage_second: the user label request percentage
def get_score(homo_second, percentage_second):
    curr_score = 0.5 * homo_second + 0.5 * (1 - percentage_second)
    curr_score = float('%.3f' % curr_score)
    return curr_score
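
A small hedged sketch of the two failure modes described above and of the combined score (toy labels, not from this PR):

import sklearn.metrics as skm

# one user label split across clusters: still perfectly homogeneous
print(skm.homogeneity_score([0, 0, 0], [1, 2, 3]))   # 1.0
# two user labels merged into one cluster: captured by the score
print(skm.homogeneity_score([0, 0, 1], [1, 1, 1]))   # 0.0

# combined model score, mirroring get_score()
homo_second, percentage_second = 0.8, 0.3
print(float('%.3f' % (0.5 * homo_second + 0.5 * (1 - percentage_second))))   # 0.75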

31 changes: 31 additions & 0 deletions emission/analysis/modelling/tour_model/get_users.py
@@ -0,0 +1,31 @@
import emission.analysis.modelling.tour_model.data_preprocessing as preprocess


# determine if a user is valid:
# a valid user has >= 10 labeled trips for further analysis, and filter_trips make up >= 50% of all trips
def valid_user(filter_trips, trips):
    valid = False
    if len(filter_trips) >= 10 and len(filter_trips) / len(trips) >= 0.5:
        valid = True
    return valid


# - user_ls: a list of strings representing short user names, such as [user1, user2, user3...]
# - valid_user_ls: a subset of `user_ls` for valid users, so also a string representation of user names
# - all_users: a collection of all user ids, in terms of user id objects
def get_user_ls(all_users, radius):
    user_ls = []
    valid_user_ls = []
    for i in range(len(all_users)):
        curr_user = 'user' + str(i + 1)
        user = all_users[i]
        trips = preprocess.read_data(user)
        filter_trips = preprocess.filter_data(trips, radius)
        if valid_user(filter_trips, trips):
            valid_user_ls.append(curr_user)
        # every user gets a short name; only valid users also go into valid_user_ls
        user_ls.append(curr_user)
    return user_ls, valid_user_ls

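A hedged usage sketch; the uuid listing is an assumption about the surrounding codebase, not part of this PR:

import emission.core.get_database as edb
import emission.analysis.modelling.tour_model.get_users as gu

# assumption: user ids can be listed from the uuid db; adjust to the actual helper
all_users = [ue['uuid'] for ue in edb.get_uuid_db().find()]
user_ls, valid_user_ls = gu.get_user_ls(all_users, radius=100)
print(user_ls)         # ['user1', 'user2', ...]
print(valid_user_ls)   # the subset of users with >= 10 labeled trips and a >= 50% labeled proportion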