check unit test code #826

Open

wants to merge 6 commits into base: master
58 changes: 58 additions & 0 deletions emission/analysis/modelling/tour_model/data_preprocessing.py
@@ -0,0 +1,58 @@
import emission.storage.decorations.analysis_timeseries_queries as esda
import emission.analysis.modelling.tour_model.cluster_pipeline as pipeline
import emission.analysis.modelling.tour_model.similarity as similarity
import pandas as pd
from sklearn.model_selection import KFold


# read confirmed trips, which may carry user labels
def read_data(user):
    trips = pipeline.read_data(uuid=user, key=esda.CONFIRMED_TRIP_KEY)
    return trips


# - trips: all trips read from the database
# - filter_trips: valid trips that have user labels and are not points
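# e.g. (hypothetical keys) a user_input of {'mode_confirm': 'walk'} that is missing
# 'purpose_confirm' becomes a NaN in the dataframe and is dropped by dropna below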
def filter_data(trips, radius):
    non_empty_trips = [t for t in trips if t["data"]["user_input"] != {}]
    non_empty_trips_df = pd.DataFrame(t["data"]["user_input"] for t in non_empty_trips)
    valid_trips_df = non_empty_trips_df.dropna(axis=0, how='any')
    valid_trips_idx_ls = valid_trips_df.index.tolist()
    valid_trips = [non_empty_trips[i] for i in valid_trips_idx_ls]

    # the similarity code filters out trips that are essentially points from valid_trips
    filter_trips = similarity.filter_too_short(valid_trips, radius)
    return filter_trips


# extract the features of a trip: start/end coordinates, distance and duration
def extract_features(filter_trips):
    X = []
    for trip in filter_trips:
        start = trip.data.start_loc["coordinates"]
        end = trip.data.end_loc["coordinates"]
        distance = trip.data.distance
        duration = trip.data.duration
        X.append([start[0], start[1], end[0], end[1], distance, duration])
    return X

# use KFold (n_splits=5) to split the data into 5 folds (5 training sets, 5 test sets)
def split_data(filter_trips):
    X = extract_features(filter_trips)
    kf = KFold(n_splits=5, shuffle=True, random_state=3)
    train_idx = []
    test_idx = []
    for train_index, test_index in kf.split(X):
        train_idx.append(train_index)
        test_idx.append(test_index)
    return train_idx, test_idx


# collect the actual trips for each training/test subset after splitting
def get_subdata(filter_trips, train_test_set):
    collect_sub_data = []
    for train_test_subset in train_test_set:
        sub_data = []
        for idx in train_test_subset:
            sub_data.append(filter_trips[idx])
        collect_sub_data.append(sub_data)
    return collect_sub_data
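
For reviewers, a minimal usage sketch of this module; the UUID and the 100 m radius are placeholder assumptions, not values from this PR:

import emission.analysis.modelling.tour_model.data_preprocessing as preprocess

user = ...        # placeholder: a user UUID object from the database
radius = 100      # placeholder radius for the similarity filter

trips = preprocess.read_data(user)
filter_trips = preprocess.filter_data(trips, radius)

# KFold(n_splits=5) gives 5 train/test index splits over filter_trips
train_idx, test_idx = preprocess.split_data(filter_trips)
train_sets = preprocess.get_subdata(filter_trips, train_idx)   # 5 lists of trip objects
test_sets = preprocess.get_subdata(filter_trips, test_idx)
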
135 changes: 135 additions & 0 deletions emission/analysis/modelling/tour_model/get_request_percentage.py
@@ -0,0 +1,135 @@
import emission.analysis.modelling.tour_model.label_processing as label_pro
import copy


# This function compares a trip with a group of trips (via the bin's first trip) to see if they happened on the same day
def match_day(trip, bin, filter_trips):
    if bin:
        t = filter_trips[bin[0]]
        if trip['data']['start_local_dt']['year'] == t['data']['start_local_dt']['year'] \
                and trip['data']['start_local_dt']['month'] == t['data']['start_local_dt']['month'] \
                and trip['data']['start_local_dt']['day'] == t['data']['start_local_dt']['day']:
            return True
    return False


# This function compares a trip with a group of trips (via the bin's first trip) to see if they happened in the same month
def match_month(trip, bin, filter_trips):
    if bin:
        t = filter_trips[bin[0]]
        if trip['data']['start_local_dt']['year'] == t['data']['start_local_dt']['year'] \
                and trip['data']['start_local_dt']['month'] == t['data']['start_local_dt']['month']:
            return True
    return False


# This function bins trips according to ['start_local_dt']
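# e.g. (hypothetical indices) with day=True, if trips 0 and 2 happened on 2021-06-01
# and trip 1 happened on 2021-06-02, bin_date returns [[0, 2], [1]]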
def bin_date(trip_ls, filter_trips, day=None, month=None):
    bin_date = []
    for trip_index in trip_ls:
        added = False
        trip = filter_trips[trip_index]

        for bin in bin_date:
            if day:
                if match_day(trip, bin, filter_trips):
                    bin.append(trip_index)
                    added = True
                    break
            if month:
                if match_month(trip, bin, filter_trips):
                    bin.append(trip_index)
                    added = True
                    break

        if not added:
            bin_date.append([trip_index])

    return bin_date


def find_first_trip(filter_trips, bin):
    trip_ts = [filter_trips[trip_idx]['data']["start_ts"] for trip_idx in bin]
    # - early_idx_in_bin: the position of the earliest trip within the bin
    #   e.g. trip_ts = [20,10,40,5,100] -> the minimum is 5, so early_idx_in_bin = 3
    early_idx_in_bin = trip_ts.index(min(trip_ts))
    # - early_trip_index: the original index (in filter_trips) of the earliest trip
    early_trip_index = bin[early_idx_in_bin]
    return early_trip_index


# collect the indices of requested trips and common trips (no need to request) above cutoff
def requested_trips_ab_cutoff(new_bins, filter_trips):
    # collect requested trip indices above cutoff
    ab_trip_ls = []
    # collect common trip indices above cutoff
    no_req_trip_ls = []
    for bin in new_bins:
        early_trip_index = find_first_trip(filter_trips, bin)
        ab_trip_ls.append(early_trip_index)

        # The following lines collect the original indices of the rest of the trips in the bin. Since they are not the
        # earliest trip, we don't need to request user labels for them
        # >>> x = [100,200,300]
        # >>> x.remove(100); x
        # [200, 300]
        no_req_trip_subls = copy.copy(bin)
        no_req_trip_subls.remove(early_trip_index)
        # >>> x = [1,2,3]
        # >>> x.extend([4,5,6]); x
        # [1, 2, 3, 4, 5, 6]
        no_req_trip_ls.extend(no_req_trip_subls)
    return ab_trip_ls, no_req_trip_ls


# collect the indices of requested trips below cutoff
def requested_trips_bl_cutoff(sim):
    # bins below cutoff
    bl_bins = sim.below_cutoff

    # collect requested trip indices below cutoff
    # effectively, bl_trip_ls = flatten(bl_bins)
    # >>> bl_bins = [[1,2],[3,4],[5,6]]
    # >>> bl_trip_ls = [item for sublist in bl_bins for item in sublist]
    # >>> bl_trip_ls
    # [1, 2, 3, 4, 5, 6]
    # the reason for flattening: we need a single flat list of requested trips to compute the percentage
    bl_trip_ls = [item for sublist in bl_bins for item in sublist]
    return bl_trip_ls


# a list of the indices of all requested trips
# - filter_trips: we need the timestamps in filter_trips here;
#   in requested_trips_ab_cutoff we find the first trip of each bin,
#   and we collect original trip indices from filter_trips
# - sim: we use the similarity code to find trips below cutoff.
#   Since the indices from the similarity code are original indices (of trips below cutoff),
#   we need original indices for all requested trips,
#   so we also use filter_trips when finding the requested common trips
# - new_bins: bins that hold the original indices of similar trips; they represent only common trips
def get_requested_trips(new_bins, filter_trips, sim):
    ab_trip_ls, no_req_trip_ls = requested_trips_ab_cutoff(new_bins, filter_trips)
    bl_trip_ls = requested_trips_bl_cutoff(sim)
    req_trips_ls = ab_trip_ls + bl_trip_ls
    return req_trips_ls


# get the request percentage based on the number of requested trips and the total number of trips
def get_req_pct(new_labels, track, filter_trips, sim):
    # - new_bins: bins with original indices of similar trips from common trips
    # - new_labels: for the first round, new_labels is a copy of the first-round labels, e.g. [1,1,1,2,2,2].
    #   For the second round, new_labels is the first-round label concatenated with the second-round label:
    #   e.g. if the labels from the second round are [1,2,1,2,3,3], new_labels becomes [11,12,11,22,23,23]
    # - track: at this point, each item in track contains the original index of a trip
    #   and its latest label, e.g. [ori_idx, latest_label];
    #   concretely, see the "group_similar_trips" function in label_processing.py
    # If new_labels is [11,12,11,22,23,23] and the original indices of the trips are [1,2,3,4,5,6],
    # new_bins will be [[1,3],[2],[4],[5,6]]
    new_bins = label_pro.group_similar_trips(new_labels, track)
    req_trips = get_requested_trips(new_bins, filter_trips, sim)
    pct = len(req_trips) / len(filter_trips)
    pct = float('%.3f' % pct)
    return pct
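
To make the request-percentage flow concrete, a hedged sketch with toy data; FakeSim and all indices are made up for illustration, assuming these modules are importable:

import emission.analysis.modelling.tour_model.get_request_percentage as grp

class FakeSim:
    # stand-in for the similarity object; only `below_cutoff` is read here
    below_cutoff = [[6, 7]]

# trip stand-ins: only "start_ts" is read by find_first_trip
filter_trips = [{'data': {'start_ts': ts}} for ts in [20, 10, 40, 5, 100, 50, 60, 70]]
new_bins = [[0, 2], [1, 3, 4], [5]]   # bins of similar (common) trips, original indices

req = grp.get_requested_trips(new_bins, filter_trips, FakeSim())
print(req)                            # [0, 3, 5, 6, 7]: one request per bin + all below-cutoff trips
print(len(req) / len(filter_trips))   # 0.625
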
83 changes: 83 additions & 0 deletions emission/analysis/modelling/tour_model/get_scores.py
@@ -0,0 +1,83 @@
import pandas as pd
import pandas.testing as pdt
import emission.analysis.modelling.tour_model.label_processing as label_pro
import sklearn.metrics as skm
import itertools


# compare the trip order in bin_trips with the order in filter_trips above cutoff
def compare_trip_orders(bins, bin_trips, filter_trips):
    bin_trips_ts = pd.DataFrame(data=[trip["data"]["start_ts"] for trip in bin_trips])
    bin_ls = list(itertools.chain(*bins))
    bins_ts = pd.DataFrame(data=[filter_trips[i]["data"]["start_ts"] for i in bin_ls])
    # compare the two data frames; the program continues to the score calculation only if they are identical
    pdt.assert_frame_equal(bins_ts, bin_trips_ts)


# This function gets the homogeneity score after the first/second round of clustering
# It is based on bin_trips, which are common trips. bin_trips are collected according to the indices of the trips
# in bins above cutoff
# More info about bin_trips is in similarity.py (delete_bins)
# The homogeneity score reflects the degree to which a cluster consists only of trips with similar ground truthed labels.
# In the following examples, "A","B","C" are user labels.
# The labels can be drawn from different sets as long as the mapping is unique (e.g. ["A", "A", "C"] matches perfectly
# with [0,0,1]).
# Ideally, there would be a 1:1 mapping between labels and clusters - e.g. ["A", "A", "A"] maps to [1,1,1]
# This can break in two ways:
# 1. one user label maps to multiple clusters - e.g. ["A", "A", "A"] maps to [1,2,3].
# In this case, the homogeneity score will still be 1.0, since each cluster only has label "A".
# For our problem, this typically maps to the use case where trips with the same user label are actually to different
# destinations. For `medical` or `personal` locations, for example, users could actually go to multiple medical
# facilities or friends' houses. In this case, the trips will be in different clusters, but since the destinations are in
# fact different, this is actually the correct behavior.
# The trips could also be to the same location, but be clustered differently due to minor variations in duration or
# distance (maybe due to traffic conditions). This could result in multiple clusters for what is essentially the same
# trip. We capture this difference through the request percentage metric, which will result in three queries for
# [1,2,3] and only one for [1,1,1]
# 2. two different labels map to the same cluster - e.g. ["A", "A", "B"] maps to [1,1,1]. This is the case captured by the
# homogeneity score, which will be less than 1.0 (0 represents inhomogeneous, 1.0 represents homogeneous).
# This maps well to our use case because in this case, assigning the same label to all trips in the cluster would
# be incorrect. In particular, if we did not have the ground truth, the third trip would be labeled "A",
# which would lower the accuracy.
# At this point, we don't force user_input to have the same labels for labels_true and labels_pred.
# For example, in the second round, if the user labels are [("home", "ebike", "bus"),("home", "walk", "bus"),
# ("home", "ebike", "bus")], labels_pred can be [0,1,0], or [1,0,1], or any other numeric labeling.
def score(bin_trips, labels_pred):
    bin_trips_user_input_df = pd.DataFrame(data=[trip["data"]["user_input"] for trip in bin_trips])
    bin_trips_user_input_df = label_pro.map_labels(bin_trips_user_input_df)

    # turn all user_input into a list without binning
    bin_trips_user_input_ls = bin_trips_user_input_df.values.tolist()
    # drop duplicate user_input
    no_dup_df = bin_trips_user_input_df.drop_duplicates()
    # turn the non-duplicate user_input into a list
    no_dup_list = no_dup_df.values.tolist()

    # collect labels_true based on user_input
    # To compute labels_true, we find the unique user labels and use the index of each unique label
    # to label all the trips
    # If user labels are (purpose, confirmed_mode, replaced_mode) tuples,
    # e.g. [("home","ebike","bus"),("work","walk","bike"),("home","ebike","bus"),("home","ebike","bus"),
    # ("work","walk","bike"),("exercise","ebike","walk")],
    # the unique label list is [0,1,2], and labels_true will be [0,1,0,0,1,2]
    # labels_pred is the flattened list of cluster labels of all common trips, e.g. [1,1,11,12,13,22,23]
    labels_true = []
    for userinput_dict in bin_trips_user_input_ls:
        if userinput_dict in no_dup_list:
            labels_true.append(no_dup_list.index(userinput_dict))

    homo_score = skm.homogeneity_score(labels_true, labels_pred)
    homo_score = float('%.3f' % homo_score)
    return homo_score


# This function computes a score for every model.
# It is used for tuning and finding the best model after two rounds of clustering
# - homo_second: the homogeneity score after the second round of clustering
# - percentage_second: the user label request percentage
def get_score(homo_second, percentage_second):
    curr_score = 0.5 * homo_second + 0.5 * (1 - percentage_second)
    curr_score = float('%.3f' % curr_score)
    return curr_score
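
A small hedged sketch of the two failure modes described above and of the combined score (toy labels, not from this PR):

import sklearn.metrics as skm

# one user label split across clusters: still perfectly homogeneous
print(skm.homogeneity_score([0, 0, 0], [1, 2, 3]))   # 1.0
# two user labels merged into one cluster: captured by the score
print(skm.homogeneity_score([0, 0, 1], [1, 1, 1]))   # 0.0

# combined model score, mirroring get_score()
homo_second, percentage_second = 0.8, 0.3
print(float('%.3f' % (0.5 * homo_second + 0.5 * (1 - percentage_second))))   # 0.75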

31 changes: 31 additions & 0 deletions emission/analysis/modelling/tour_model/get_users.py
@@ -0,0 +1,31 @@
import emission.analysis.modelling.tour_model.data_preprocessing as preprocess


# determine if a user is valid:
# a valid user has >= 10 labeled trips for further analysis, and filter_trips make up >= 50% of all trips
def valid_user(filter_trips, trips):
    valid = False
    if len(filter_trips) >= 10 and len(filter_trips) / len(trips) >= 0.5:
        valid = True
    return valid


# - user_ls: a list of strings representing short user names, such as [user1, user2, user3...]
# - valid_user_ls: a subset of `user_ls` for valid users, so also a string representation of user names
# - all_users: a collection of all user ids, in terms of user id objects
def get_user_ls(all_users, radius):
    user_ls = []
    valid_user_ls = []
    for i in range(len(all_users)):
        curr_user = 'user' + str(i + 1)
        user = all_users[i]
        trips = preprocess.read_data(user)
        filter_trips = preprocess.filter_data(trips, radius)
        if valid_user(filter_trips, trips):
            valid_user_ls.append(curr_user)
        # every user gets a short name; only valid users also go into valid_user_ls
        user_ls.append(curr_user)
    return user_ls, valid_user_ls

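A hedged usage sketch; the uuid listing is an assumption about the surrounding codebase, not part of this PR:

import emission.core.get_database as edb
import emission.analysis.modelling.tour_model.get_users as gu

# assumption: user ids can be listed from the uuid db; adjust to the actual helper
all_users = [ue['uuid'] for ue in edb.get_uuid_db().find()]
user_ls, valid_user_ls = gu.get_user_ls(all_users, radius=100)
print(user_ls)         # ['user1', 'user2', ...]
print(valid_user_ls)   # the subset of users with >= 10 labeled trips and a >= 50% labeled proportion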