# --- emission/analysis/modelling/tour_model/data_preprocessing.py ---


def read_data(user):
    """Read the confirmed trips of one user.

    :param user: user id, forwarded as ``uuid`` to ``pipeline.read_data``
    :return: whatever ``pipeline.read_data`` returns for the
        ``esda.CONFIRMED_TRIP_KEY`` key
    """
    return pipeline.read_data(uuid=user, key=esda.CONFIRMED_TRIP_KEY)


def filter_data(trips, radius):
    """Keep the trips that are completely labeled and are not points.

    :param trips: all trips read from the database; each one is indexed as
        ``trip["data"]["user_input"]``
    :param radius: forwarded unchanged to ``similarity.filter_too_short``
    :return: ``filter_trips`` - the trips whose user input is non-empty and has
        no missing label, after ``similarity.filter_too_short`` removed the
        trips that are points
    """
    # trips for which the user supplied no label at all are dropped first
    non_empty_trips = [t for t in trips if t["data"]["user_input"] != {}]
    non_empty_trips_df = pd.DataFrame([t["data"]["user_input"] for t in non_empty_trips])

    # Drop every row with at least one missing label.  The defaults of dropna
    # are exactly axis=0 / how='any'; spelling out ``thresh=None`` next to
    # ``how`` is rejected by recent pandas releases, so it is not passed.
    valid_trips_df = non_empty_trips_df.dropna()

    # The frame was built from a list, so the surviving index labels are the
    # positions of the surviving trips in ``non_empty_trips``.
    valid_trips = [non_empty_trips[i] for i in valid_trips_df.index.tolist()]

    # similarity code filters out the trips that are points
    return similarity.filter_too_short(valid_trips, radius)


def extract_features(filter_trips):
    """Build one feature row per trip.

    :param filter_trips: trips exposing ``data.start_loc``, ``data.end_loc``,
        ``data.distance`` and ``data.duration`` through attribute access
    :return: list of ``[start[0], start[1], end[0], end[1], distance, duration]``
        rows, in the order of ``filter_trips``
    """
    X = []
    for trip in filter_trips:
        start = trip.data.start_loc["coordinates"]
        end = trip.data.end_loc["coordinates"]
        X.append([start[0], start[1], end[0], end[1],
                  trip.data.distance, trip.data.duration])
    return X


def split_data(filter_trips):
    """Split the trips into 5 train/test folds with a fixed shuffle.

    Uses ``KFold(n_splits=5, shuffle=True, random_state=3)`` on the feature
    rows of ``extract_features``.

    :param filter_trips: trips accepted by ``extract_features``
    :return: ``(train_idx, test_idx)``, two lists holding one index collection
        per fold
    """
    # NOTE(review): KFold presumably rejects fewer than 5 trips - confirm and
    # decide whether callers must guard against that.
    X = extract_features(filter_trips)
    kf = KFold(n_splits=5, shuffle=True, random_state=3)
    train_idx = []
    test_idx = []
    for train_index, test_index in kf.split(X):
        train_idx.append(train_index)
        test_idx.append(test_index)
    return train_idx, test_idx


def get_subdata(filter_trips, train_test_set):
    """Collect the trips of every training (or test) fold after splitting.

    :param filter_trips: sequence of trips, indexed by position
    :param train_test_set: iterable of index collections, one per fold
    :return: list with one list of trips per fold
    """
    return [[filter_trips[idx] for idx in train_test_subset]
            for train_test_subset in train_test_set]


# --- emission/analysis/modelling/tour_model/get_users.py ---


def valid_user(filter_trips, trips):
    """Tell whether a user has enough usable trips for further analysis.

    A valid user has at least 10 filtered trips and the filtered trips are at
    least 50% of all trips.

    :param filter_trips: trips kept by ``filter_data``
    :param trips: all trips of the user
    :return: ``True`` for a valid user, ``False`` otherwise (including when
        ``trips`` is empty, which used to risk a division by zero)
    """
    if len(trips) == 0:
        return False
    return len(filter_trips) >= 10 and len(filter_trips) / len(trips) >= 0.5
def get_user_ls(all_users, radius):
    """Name every user and report which of them are valid.

    :param all_users: a collection of all user ids, in terms of user id objects
    :param radius: forwarded to ``preprocess.filter_data``
    :return: ``(user_ls, valid_user_ls)``
        - ``user_ls``: short string names, ``['user1', 'user2', ...]``, one per
          entry of ``all_users`` and in the same order
        - ``valid_user_ls``: the subset of ``user_ls`` for which ``valid_user``
          is true, so also string names
    """
    user_ls = []
    valid_user_ls = []
    for position, user in enumerate(all_users, start=1):
        curr_user = 'user' + str(position)
        trips = preprocess.read_data(user)
        filter_trips = preprocess.filter_data(trips, radius)
        # every user is listed; only the valid ones are listed twice
        user_ls.append(curr_user)
        if valid_user(filter_trips, trips):
            valid_user_ls.append(curr_user)
    return user_ls, valid_user_ls


# --- emission/tests/analysisTests/clusteringTests/TestDataPreprocessing.py ---


class TestDataPreprocessing(unittest.TestCase):
    """Smoke tests for data_preprocessing on one real example day."""

    DATA_FILE = "emission/tests/data/real_examples/shankari_2016-06-20"
    RADIUS = 100

    def setUp(self):
        # Every test reads self.testUUID, so the fixture is loaded for each
        # test instead of only inside test_read_data.
        etc.setupRealExample(self, self.DATA_FILE)
        with open(self.DATA_FILE + ".user_inputs") as uifp:
            self.entries = json.load(uifp, object_hook=bju.object_hook)
        etc.setupRealExampleWithEntries(self)
        etc.runIntakePipeline(self.testUUID)
        # read_data expects the user id itself, not a list wrapping it
        self.user = self.testUUID

    def tearDown(self):
        # NOTE(review): cleanup mirrors the collections the fixture is assumed
        # to write to - confirm against emission.tests.common.
        import emission.core.get_database as edb
        edb.get_timeseries_db().delete_many({"user_id": self.testUUID})
        edb.get_analysis_timeseries_db().delete_many({"user_id": self.testUUID})
        edb.get_pipeline_state_db().delete_many({"user_id": self.testUUID})

    def _filtered_trips(self):
        """Return the filtered trips of the test user."""
        trips = preprocess.read_data(self.user)
        return preprocess.filter_data(trips, self.RADIUS)

    def test_read_data(self):
        ts = esta.TimeSeries.get_time_series(self.testUUID)
        confirmed_trips = list(ts.find_entries(["analysis/confirmed_trip"], None))
        print('confirmed_trips', confirmed_trips)
        trips = preprocess.read_data(self.user)
        print('trips ', trips)
        # TODO: compare `trips` with `confirmed_trips` once the expected
        # relation between the two is pinned down.

    def test_filter_data(self):
        trips = preprocess.read_data(self.user)
        filter_trips = preprocess.filter_data(trips, self.RADIUS)
        # filtering only selects trips, it never adds any
        self.assertLessEqual(len(filter_trips), len(trips))

    def test_extract_features(self):
        filter_trips = self._filtered_trips()
        X = preprocess.extract_features(filter_trips)
        # one row of six features per trip
        self.assertEqual(len(X), len(filter_trips))
        for row in X:
            self.assertEqual(len(row), 6)

    def test_split_data(self):
        # NOTE(review): assumes the example day yields at least 5 filtered
        # trips, as required by the 5-fold split - confirm.
        filter_trips = self._filtered_trips()
        train_idx, test_idx = preprocess.split_data(filter_trips)
        self.assertEqual(len(train_idx), 5)
        self.assertEqual(len(test_idx), 5)

    def test_get_subdata(self):
        # NOTE(review): assumes at least 5 filtered trips - confirm.
        filter_trips = self._filtered_trips()
        # get_subdata takes one index collection per fold, i.e. a list of lists
        train_set_idx = [[0, 1, 2, 3, 4]]
        collect_sub_data = preprocess.get_subdata(filter_trips, train_set_idx)
        self.assertEqual(len(collect_sub_data), 1)
        self.assertEqual(collect_sub_data[0], [filter_trips[i] for i in train_set_idx[0]])


if __name__ == '__main__':
    etc.configLogging()
    unittest.main()
# --- emission/tests/analysisTests/clusteringTests/TestGetUsers.py ---


class TestGetUsers(unittest.TestCase):
    """Smoke tests for get_users.valid_user and get_users.get_user_ls."""

    RADIUS = 100

    def setUp(self):
        # NOTE(review): presumably createAndFillUUID sets self.testUUID; no
        # trips are stored for that user here, so these tests only cover the
        # "user without enough trips" path - load a fixture to cover more.
        etc.createAndFillUUID(self)

    def test_valid_user(self):
        # read_data expects the user id itself, not a list wrapping it
        trips = preprocess.read_data(self.testUUID)
        filter_trips = preprocess.filter_data(trips, self.RADIUS)
        valid = gu.valid_user(filter_trips, trips)
        self.assertIsInstance(valid, bool)

    def test_get_user_ls(self):
        all_users = [self.testUUID]
        user_ls, valid_user_ls = gu.get_user_ls(all_users, self.RADIUS)
        # one short name per user, numbered from 1
        self.assertEqual(user_ls, ['user1'])
        # valid users are always a subset of all users
        self.assertTrue(set(valid_user_ls) <= set(user_ls))


if __name__ == '__main__':
    etc.configLogging()
    unittest.main()
# --- emission/analysis/modelling/tour_model/get_request_percentage.py ---


def _same_local_date(trip, other, fields):
    """Return True when both trips agree on every ``start_local_dt`` field in ``fields``."""
    first = trip['data']['start_local_dt']
    second = other['data']['start_local_dt']
    return all(first[field] == second[field] for field in fields)


def match_day(trip, bin, filter_trips):
    """Tell whether ``trip`` happened on the same day as a group of trips.

    Only the first trip of the group is compared.

    :param trip: trip with ``['data']['start_local_dt']`` year/month/day
    :param bin: list of indices into ``filter_trips``
    :param filter_trips: trips the indices in ``bin`` refer to
    :return: ``True`` on a match, ``False`` otherwise or when ``bin`` is empty
    """
    if not bin:
        return False
    return _same_local_date(trip, filter_trips[bin[0]], ('year', 'month', 'day'))


def match_month(trip, bin, filter_trips):
    """Tell whether ``trip`` happened in the same month as a group of trips.

    Only the first trip of the group is compared.

    :param trip: trip with ``['data']['start_local_dt']`` year/month
    :param bin: list of indices into ``filter_trips``
    :param filter_trips: trips the indices in ``bin`` refer to
    :return: ``True`` on a match, ``False`` otherwise or when ``bin`` is empty
    """
    if not bin:
        return False
    return _same_local_date(trip, filter_trips[bin[0]], ('year', 'month'))


def bin_date(trip_ls, filter_trips, day=None, month=None):
    """Bin trips according to ``['start_local_dt']``.

    :param trip_ls: indices into ``filter_trips`` of the trips to bin
    :param filter_trips: trips the indices refer to
    :param day: when truthy, trips of the same day share a bin
    :param month: when truthy, trips of the same month share a bin
    :return: list of bins, each a list of trip indices.  A trip joins the first
        existing bin it matches; with neither ``day`` nor ``month`` set nothing
        matches and every trip gets a bin of its own.
    """
    date_bins = []
    for trip_index in trip_ls:
        trip = filter_trips[trip_index]
        for candidate in date_bins:
            if (day and match_day(trip, candidate, filter_trips)) \
                    or (month and match_month(trip, candidate, filter_trips)):
                candidate.append(trip_index)
                break
        else:
            # no existing bin matched: the trip opens a new one
            date_bins.append([trip_index])
    return date_bins


def find_first_trip(filter_trips, bin):
    """Return the original index of the earliest trip of a bin.

    e.g. if the ``start_ts`` of the trips in ``bin`` are [20, 10, 40, 5, 100],
    the earliest trip sits at position 3 of the bin and ``bin[3]`` is returned.

    :param filter_trips: trips with ``['data']['start_ts']``
    :param bin: non-empty list of indices into ``filter_trips``
    :return: the index (taken from ``bin``) of the trip with the smallest
        ``start_ts``; the first one wins on ties
    :raises ValueError: when ``bin`` is empty
    """
    return min(bin, key=lambda trip_idx: filter_trips[trip_idx]['data']["start_ts"])
def requested_trips_ab_cutoff(new_bins, filter_trips):
    """Collect requested trips and common trips (no need to request) above cutoff.

    For every bin the earliest trip is the one whose label is requested from
    the user; the rest of the bin does not need a request.

    :param new_bins: bins holding original trip indices
    :param filter_trips: trips the indices refer to
    :return: ``(ab_trip_ls, no_req_trip_ls)`` - the requested trip indices and
        the indices that need no request.  Empty bins contribute nothing
        (they used to crash the search for the earliest trip).
    """
    ab_trip_ls = []
    no_req_trip_ls = []
    for trip_bin in new_bins:
        if not trip_bin:
            continue
        early_trip_index = find_first_trip(filter_trips, trip_bin)
        ab_trip_ls.append(early_trip_index)
        # work on a copy so the caller's bin is left untouched, then drop the
        # earliest trip: [100, 200, 300] -> [200, 300]
        rest_of_bin = list(trip_bin)
        rest_of_bin.remove(early_trip_index)
        no_req_trip_ls.extend(rest_of_bin)
    return ab_trip_ls, no_req_trip_ls


def requested_trips_bl_cutoff(sim):
    """Collect the requested trip indices below cutoff.

    :param sim: object whose ``below_cutoff`` attribute holds the bins below
        cutoff
    :return: the flattened bins, e.g. [[1, 2], [3, 4], [5, 6]] gives
        [1, 2, 3, 4, 5, 6]; a flat list is needed to compute the request
        percentage
    """
    return list(itertools.chain.from_iterable(sim.below_cutoff))
def get_requested_trips(new_bins, filter_trips, sim):
    """Return the original indices of all requested trips.

    :param new_bins: bins that have original indices of similar trips; they
        only represent common trips
    :param filter_trips: needed for the timestamps, to find the first trip of
        every bin (see ``requested_trips_ab_cutoff``)
    :param sim: similarity object used to find the trips below cutoff; its
        indices are already original indices
    :return: requested trips above cutoff followed by the trips below cutoff
    """
    ab_trip_ls, _no_req_trip_ls = requested_trips_ab_cutoff(new_bins, filter_trips)
    return ab_trip_ls + requested_trips_bl_cutoff(sim)


def get_req_pct(new_labels, track, filter_trips, sim):
    """Get the request percentage: requested trips / total number of trips.

    :param new_labels: for the first round, a copy of the first round labels,
        e.g. [1,1,1,2,2,2]; for the second round, the first round label
        concatenated with the second round label, e.g. second round labels
        [1,2,1,2,3,3] turn it into [11,12,11,22,23,23]
    :param track: each item holds the original index of a trip and its latest
        label, ``[ori_idx, latest_label]`` (see ``group_similar_trips`` in
        label_processing.py)
    :param filter_trips: all filtered trips
    :param sim: similarity object, see ``get_requested_trips``
    :return: the percentage rounded to 3 decimals; ``0.0`` when there are no
        trips at all (this used to raise ``ZeroDivisionError``)
    """
    if len(filter_trips) == 0:
        return 0.0
    # new_bins: bins with original indices of similar trips from common trips.
    # If new_labels is [11,12,11,22,23,23] and the original indices of the
    # trips are [1,2,3,4,5,6], new_bins groups them as [1,3], [2], [4], [5,6].
    new_bins = label_pro.group_similar_trips(new_labels, track)
    req_trips = get_requested_trips(new_bins, filter_trips, sim)
    pct = len(req_trips) / len(filter_trips)
    return float('%.3f' % pct)


# --- emission/analysis/modelling/tour_model/get_scores.py ---


def compare_trip_orders(bins, bin_trips, filter_trips):
    """Check that ``bin_trips`` follows the trip order of the bins above cutoff.

    :param bins: bins of indices into ``filter_trips``
    :param bin_trips: trips expected to be the flattened bins, in order
    :param filter_trips: trips the indices in ``bins`` refer to
    :raises AssertionError: when the ``start_ts`` sequences differ; score
        calculation should only continue when they are the same
    """
    bin_trips_ts = pd.DataFrame(data=[trip["data"]["start_ts"] for trip in bin_trips])
    bin_ls = list(itertools.chain.from_iterable(bins))
    bins_ts = pd.DataFrame(data=[filter_trips[i]["data"]["start_ts"] for i in bin_ls])
    pdt.assert_frame_equal(bins_ts, bin_trips_ts)
def score(bin_trips, labels_pred):
    """Get the homogeneity score after the first/second round of clustering.

    It is based on ``bin_trips``, the common trips collected according to the
    indices of the trips in the bins above cutoff (more info in similarity.py,
    ``delete_bins``).

    The homogeneity score reflects the degree to which a cluster consists only
    of trips with similar ground truthed labels.  With "A", "B", "C" as user
    labels, the two label sets may use different names as long as the mapping
    is unique (["A", "A", "C"] matches [0, 0, 1] perfectly).  A 1:1 mapping
    between labels and clusters can break in two ways:

    * one user label maps to several clusters, e.g. ["A", "A", "A"] maps to
      [1, 2, 3].  The score is still 1.0 because every cluster only holds
      label "A".  This is typically correct (e.g. `medical` or `personal`
      trips to different places), or it is caused by minor variations in
      duration or distance; that difference is captured by the request
      percentage metric, which needs three queries for [1, 2, 3] and only one
      for [1, 1, 1].
    * two different labels map to the same cluster, e.g. ["A", "A", "B"] maps
      to [1, 1, 1].  This is the case captured by the homogeneity score, which
      will be less than 1.0 (0 represents inhomogeneous, 1.0 represents
      homogeneous); labeling the whole cluster "A" would be incorrect.

    The labels themselves are not aligned between truth and prediction: for
    user labels [("home", "ebike", "bus"), ("home", "walk", "bus"),
    ("home", "ebike", "bus")], ``labels_pred`` can be [0, 1, 0] or [1, 0, 1].

    :param bin_trips: trips with ``["data"]["user_input"]``
    :param labels_pred: flattened list of cluster labels of all common trips,
        e.g. [1, 1, 11, 12, 13, 22, 23]
    :return: homogeneity score rounded to 3 decimals
    """
    bin_trips_user_input_df = pd.DataFrame(data=[trip["data"]["user_input"] for trip in bin_trips])
    bin_trips_user_input_df = label_pro.map_labels(bin_trips_user_input_df)

    # all user inputs, one row per trip, and the distinct rows among them
    bin_trips_user_input_ls = bin_trips_user_input_df.values.tolist()
    no_dup_list = bin_trips_user_input_df.drop_duplicates().values.tolist()

    # labels_true: every trip is labeled with the position of its user input
    # among the distinct user inputs.  For (purpose, confirmed_mode,
    # replaced_mode) rows such as
    #   ("home","ebike","bus"), ("work","walk","bike"), ("home","ebike","bus"),
    #   ("home","ebike","bus"), ("work","walk","bike"), ("exercise","ebike","walk")
    # the distinct rows get 0, 1, 2 and labels_true is [0, 1, 0, 0, 1, 2].
    labels_true = [no_dup_list.index(user_input_row)
                   for user_input_row in bin_trips_user_input_ls
                   if user_input_row in no_dup_list]

    homo_score = skm.homogeneity_score(labels_true, labels_pred)
    return float('%.3f' % homo_score)


def get_score(homo_second, percentage_second):
    """Compute the score of a model; used for tuning after two rounds of clustering.

    :param homo_second: the homogeneity score after the second round
    :param percentage_second: the user labels request percentage
    :return: ``0.5 * homo_second + 0.5 * (1 - percentage_second)`` rounded to
        3 decimals
    """
    curr_score = 0.5 * homo_second + 0.5 * (1 - percentage_second)
    return float('%.3f' % curr_score)


# --- emission/analysis/modelling/tour_model/label_processing.py ---


def map_labels_sp2en(user_input_df):
    """Change Spanish user labels to English.

    :param user_input_df: original user input dataframe
    :return: changed user input dataframe
    """
    span_eng_dict = {'revisado_bike': 'test ride with bike', 'placas_de carro': 'car plates', 'aseguranza': 'insurance',
                     'iglesia': 'church', 'curso': 'course',
                     'mi_hija recién aliviada': 'my daughter just had a new baby',
                     'servicio_comunitario': 'community service', 'pago_de aseguranza': 'insurance payment',
                     'grupo_comunitario': 'community group', 'caminata_comunitaria': 'community walk'}
    return user_input_df.replace(span_eng_dict)


def map_labels_purpose(user_input_df):
    """Convert purposes in the user inputs to their canonical names.

    :param user_input_df: user input dataframe
    :return: changed user input dataframe
    """
    map_pur_dict = {'course': 'school', 'work_- lunch break': 'lunch_break', 'on_the way home': 'home',
                    'insurance_payment': 'insurance'}
    return user_input_df.replace(map_pur_dict)


def map_labels_mode(user_input_df):
    """Replace the ``same_mode`` placeholder by the confirmed mode.

    Every row whose ``replaced_mode`` is ``"same_mode"`` gets the value of its
    ``mode_confirm`` column instead.

    :param user_input_df: user input dataframe; it is modified in place
    :return: the same dataframe
    """
    if len(user_input_df) == 0:
        return user_input_df
    same_mode_rows = user_input_df["replaced_mode"] == "same_mode"
    if same_mode_rows.any():
        # to see which rows will be converted
        logging.debug("The following rows will be changed: %s", user_input_df.loc[same_mode_rows])
        # A single .loc assignment writes into the frame itself; assigning
        # through ``df.iloc[a][col]`` only touches a temporary row and the
        # change can be lost.
        user_input_df.loc[same_mode_rows, "replaced_mode"] = \
            user_input_df.loc[same_mode_rows, "mode_confirm"].values
    return user_input_df


def map_labels(user_input_df):
    """Change Spanish to English, convert purposes, and convert modes.

    :param user_input_df: original user input dataframe
    :return: changed user input dataframe
    """
    # Note that the spanish -> english conversion MUST currently happen before
    # the other mode and purpose mappings
    user_input_df = map_labels_sp2en(user_input_df)
    user_input_df = map_labels_purpose(user_input_df)
    user_input_df = map_labels_mode(user_input_df)
    return user_input_df
def get_second_labels(x, method, low, dist_pct):
    """Use hierarchical clustering to get the labels of the second round.

    ``sch.linkage`` performs hierarchical (agglomerative) clustering; the
    dendrogram is then cut between a lower bound and a cutoff:

    - ``last_d``: the distance of the last cluster in the dendrogram
    - ``low``: the lower bound of distance.  e.g. if low = 300 and
      last_d = 250, every point gets label 0, irrespective of the first round
      labels, and the second round labels look like [0, 0, 0, 0, 0].  The
      points are already similar to each other after the first round, they do
      not need to go through the second round.
    - ``max_d``: the cutoff of distance, ``last_d * dist_pct``.  e.g. if
      last_d = 10000 and dist_pct = 0.4, max_d = 4000 and clusters are
      assigned at the distance of 4000 by ``sch.fcluster``.

    :param x: the observations to cluster, one row per point
    :param method: linkage method forwarded to ``sch.linkage``
    :param low: the lower bound of distance
    :param dist_pct: the percentage of the last distance in the dendrogram
    :return: the labels from the second round clustering: a list of zeros when
        nothing has to be split, otherwise the result of ``sch.fcluster``
    """
    n_points = len(x)
    # Fewer than two points cannot be linked; they trivially form one group
    # (linkage used to be called and fail on such input).
    if n_points < 2:
        return [0] * n_points

    z = sch.linkage(x, method=method, metric='euclidean')
    last_d = z[-1][2]
    if last_d < low:
        return [0] * n_points

    max_d = last_d * dist_pct
    return sch.fcluster(z, max_d, criterion='distance')
def get_new_labels(second_labels, second_round_idx_labels, new_labels):
    """Change labels from the first round into labels for the second round.

    The appropriate label is the label from the first round concatenated with
    the label from the second round (e.g. first round 1 and second round 2
    give 12).

    :param second_labels: labels from the second round of clustering
    :param second_round_idx_labels: for every second round label, an item
        whose ``[0]`` is a position in ``new_labels`` and whose ``[1]`` is the
        label from the first round
    :param new_labels: the labels to update; modified in place
    :return: ``new_labels``
    """
    for i, second_label in enumerate(second_labels):
        first_index = second_round_idx_labels[i][0]
        first_label = second_round_idx_labels[i][1]
        # concatenate labels from two rounds
        # NOTE(review): plain digit concatenation is ambiguous across widths
        # (1 + "12" and 11 + "2" both give 112) - confirm this cannot occur.
        combined_label = int(str(first_label) + str(second_label))
        # positions outside new_labels are ignored
        if 0 <= first_index < len(new_labels):
            new_labels[first_index] = combined_label
    return new_labels


def group_similar_trips(new_labels, track):
    """Group similar trips according to ``new_labels``, by original trip index.

    Concretely, if the labels are ['a', 'a', 'a', 'b', 'b', 'b'], positions
    [0, 1, 2] and [3, 4, 5] form the two bins; every position is then replaced
    by the original index stored in ``track`` (e.g. position 0 may map to 42).

    :param new_labels: one hashable label per common trip
    :param track: ``track[i][0]`` is the original index of the trip at
        position ``i``
    :return: one list of original indices per distinct label; bins are ordered
        by the first appearance of their label, indices inside a bin follow
        the order of ``new_labels``
    """
    bins_by_label = {}
    for position, label in enumerate(new_labels):
        bins_by_label.setdefault(label, []).append(track[position][0])
    return list(bins_by_label.values())


def change_track_labels(track, new_labels):
    """Replace the first round labels with new labels.

    :param track: a list storing the indices and labels from the first round
        of clustering; ``item[0]`` is the original index of the trip in
        filter_trips, ``item[1]`` is its label.  Modified in place.
    :param new_labels: the new label of every item, by position
    :return: ``track``
    """
    for position, label in enumerate(new_labels):
        track[position][1] = label
    return track


def kmeans_clusters(clusters, x):
    """Use kmeans to build the model of the second round.

    :param clusters: labels from hierarchical clustering; only the number of
        distinct labels is used, as the number of kmeans clusters
    :param x: the observations to cluster
    :return: the ``labels_`` of ``sc.KMeans(n_clusters=..., random_state=0)``
        fitted on ``x``
    """
    n_clusters = len(set(clusters))
    kmeans = sc.KMeans(n_clusters=n_clusters, random_state=0).fit(x)
    return kmeans.labels_
from builtins import * import unittest import json import bson.json_util as bju -import emission.storage.timeseries.abstract_timeseries as esta - import emission.tests.common as etc class TestDataPreprocessing(unittest.TestCase): - - # should setup user = [self.testUUID], radius = 100 - # do we need teardown if we don't use databse? - + def setUp(self): + self.readAndStoreTripsFromFile("emission/tests/data/real_examples/fake_trips") + self.user = self.testUUID + self.radius = 100 + + def tearDown(self): + self.clearDBEntries() + + def readAndStoreTripsFromFile(self, dataFile): + import emission.core.get_database as edb + atsdb = edb.get_analysis_timeseries_db() + etc.createAndFillUUID(self) + with open(dataFile) as dect: + expected_confirmed_trips = json.load(dect, object_hook=bju.object_hook) + for t in expected_confirmed_trips: + t["user_id"] = self.testUUID + edb.save(atsdb, t) + + def clearDBEntries(self): + import emission.core.get_database as edb + edb.get_timeseries_db().delete_many({"user_id": self.testUUID}) + edb.get_analysis_timeseries_db().delete_many({"user_id": self.testUUID}) + edb.get_pipeline_state_db().delete_many({"user_id": self.testUUID}) def test_read_data(self): - dataFile = "emission/tests/data/real_examples/shankari_2016-06-20" - ld = ecwl.LocalDate({'year': 2016, 'month': 6, 'day': 20}) - with open(dataFile+".ground_truth") as gfp: - ground_truth = json.load(gfp, object_hook=bju.object_hook) - - etc.setupRealExample(self, dataFile) - # if (not preload): - self.entries = json.load(open(dataFile+".user_inputs"), object_hook = bju.object_hook) - etc.setupRealExampleWithEntries(self) - etc.runIntakePipeline(self.testUUID) - ts = esta.TimeSeries.get_time_series(self.testUUID) - confirmed_trips = list(ts.find_entries(["analysis/confirmed_trip"], None)) - with open(dataFile+".expected_confirmed_trips") as dect: - expected_confirmed_trips = json.load(dect, object_hook = bju.object_hook) - print('confirmed_trips',confirmed_trips) - user = 
[self.testUUID] - trips = preprocess.read_data(user) - print('trips ', trips) - # I don't know how to assertEqual here - + trips = preprocess.read_data(self.user) + self.assertEqual(len(trips), 10) def test_filter_data(self): - radius = 100 - # - trips: should be read from a file or from database - user = [self.testUUID] - trips = preprocess.read_data(user) - filter_trips = preprocess.filter_data(trips,radius) - # assertEqual + trips = preprocess.read_data(self.user) + filter_trips = preprocess.filter_data(trips,self.radius) + self.assertEqual(len(filter_trips), 8) def test_extract_features(self): - user = [self.testUUID] - radius = 100 - trips = preprocess.read_data(user) - filter_trips = preprocess.filter_data(trips,radius) + trips = preprocess.read_data(self.user) + filter_trips = preprocess.filter_data(trips,self.radius) X = preprocess.extract_features(filter_trips) - # assertEqual + self.assertEqual(len(X), 8) + self.assertEqual(X[0], [-122.0857861, 37.3898049, -122.0826931, + 37.3914184, 1047.1630675866315, 792.4609999656677]) def test_split_data(self): - user = [self.testUUID] - radius = 100 - trips = preprocess.read_data(user) - filter_trips = preprocess.filter_data(trips,radius) + trips = preprocess.read_data(self.user) + filter_trips = preprocess.filter_data(trips,self.radius) train_idx, test_idx = preprocess.split_data(filter_trips) - # assertEqual + self.assertEqual(len(train_idx),5) + self.assertEqual(len(test_idx), 5) + self.assertGreaterEqual(len(train_idx[0]),len(test_idx[0]),'the number of trips in train_idx should be greater ' + 'than the one in test_idx') def test_get_subdata(self): - user = [self.testUUID] - radius = 100 - trips = preprocess.read_data(user) - filter_trips = preprocess.filter_data(trips,radius) - train_set_idx = [0,1,2,3,4] + trips = preprocess.read_data(self.user) + filter_trips = preprocess.filter_data(trips,self.radius) + train_set_idx = [[0,1,2,3,4],[0,1,2,4,5]] collect_sub_data = preprocess.get_subdata(filter_trips, 
train_set_idx) - # assertEqual + compare_idx = filter_trips.index(collect_sub_data[0][4]) + self.assertEqual(compare_idx, 4) if __name__ == '__main__': diff --git a/emission/tests/analysisTests/clusteringTests/TestGetRequestPercentage.py b/emission/tests/analysisTests/clusteringTests/TestGetRequestPercentage.py new file mode 100644 index 000000000..1c0e6fe9b --- /dev/null +++ b/emission/tests/analysisTests/clusteringTests/TestGetRequestPercentage.py @@ -0,0 +1,154 @@ +from future import standard_library +standard_library.install_aliases() +import unittest +import emission.analysis.modelling.tour_model.similarity as similarity +import emission.analysis.modelling.tour_model.data_preprocessing as preprocess + +import emission.analysis.modelling.tour_model.get_request_percentage as eamtg +import pandas as pd +import emission.tests.common as etc +import sklearn.cluster as sc +import numpy as np +import json +import bson.json_util as bju + + +class TestGetRequestPercentage(unittest.TestCase): + def setUp(self): + self.readAndStoreTripsFromFile("emission/tests/data/real_examples/fake_trips") + self.user = self.testUUID + self.radius = 100 + + def tearDown(self): + self.clearDBEntries() + + def readAndStoreTripsFromFile(self, dataFile): + import emission.core.get_database as edb + atsdb = edb.get_analysis_timeseries_db() + etc.createAndFillUUID(self) + with open(dataFile) as dect: + expected_confirmed_trips = json.load(dect, object_hook=bju.object_hook) + for t in expected_confirmed_trips: + t["user_id"] = self.testUUID + edb.save(atsdb, t) + + def clearDBEntries(self): + import emission.core.get_database as edb + edb.get_timeseries_db().delete_many({"user_id": self.testUUID}) + edb.get_analysis_timeseries_db().delete_many({"user_id": self.testUUID}) + edb.get_pipeline_state_db().delete_many({"user_id": self.testUUID}) + + def test_match_day(self): + trips = preprocess.read_data(self.user) + filter_trips = preprocess.filter_data(trips,self.radius) + sim = 
similarity.similarity(filter_trips, radius) + + # def test_match_day(self): + # # case 1: bin contains indices & trip matches selected trip in filter_trips + # bin = [0,1,2] + # trip = {'data':{'start_local_dt':{'year':2020,'month':8,'day':14}}} + # filter_trips = [{'data':{'start_local_dt':{'year':2020,'month':8,'day':14}}}] + # self.assertEqual(eamtg.match_day(trip, bin, filter_trips), True) + # # case 2: bin = True & trip doesn't match selected trip in filter_trips + # filter_trips = [{'data': {'start_local_dt': {'year': 2020, 'month': 8, 'day': 18}}}] + # self.assertEqual(eamtg.match_day(trip, bin, filter_trips), False) + # #case 3: bin is none & trip matches selected trip in filter_trips + # bin = None + # filter_trips = [{'data': {'start_local_dt': {'year': 2020, 'month': 8, 'day': 14}}}] + # self.assertEqual(eamtg.match_day(trip, bin, filter_trips), False) + # + # + # def test_match_month(self): + # # case 1: bin contains indices & trip matches selected trip in filter_trips + # bin = [0,1,2] + # trip = {'data':{'start_local_dt':{'year':2020,'month':8,'day':14}}} + # filter_trips = [{'data':{'start_local_dt':{'year':2020,'month':8,'day':14}}}] + # self.assertEqual(eamtg.match_month(trip, bin, filter_trips), True) + # # case 2: bin = True & trip doesn't match selected trip in filter_trips + # filter_trips = [{'data': {'start_local_dt': {'year': 2020, 'month': 7, 'day': 18}}}] + # self.assertEqual(eamtg.match_month(trip, bin, filter_trips), False) + # #case 3: bin is none & trip matches selected trip in filter_trips + # bin = None + # filter_trips = [{'data': {'start_local_dt': {'year': 2020, 'month': 8, 'day': 14}}}] + # self.assertEqual(eamtg.match_month(trip, bin, filter_trips), False) + # + # + # def test_bin_date(self): + # # case 1: bin day + # trip_ls = [0,1,2] + # filter_trips1 = [{'data': {'start_local_dt': {'year': 2020, 'month': 8, 'day': 14}}}, + # {'data': {'start_local_dt': {'year': 2020, 'month': 8, 'day': 14}}}, + # {'data': {'start_local_dt': 
{'year': 2020, 'month': 7, 'day': 18}}}] + # self.assertEqual(eamtg.bin_date(trip_ls, filter_trips1, day=True), [[0,1],[2]]) + # # case 2: bin month + # filter_trips2 = [{'data': {'start_local_dt': {'year': 2020, 'month': 8, 'day': 15}}}, + # {'data': {'start_local_dt': {'year': 2020, 'month': 8, 'day': 14}}}, + # {'data': {'start_local_dt': {'year': 2020, 'month': 7, 'day': 18}}}] + # self.assertEqual(eamtg.bin_date(trip_ls, filter_trips2, month=True), [[0,1],[2]]) + # + # + # def test_find_first_trip(self): + # import time + # time1 = "Thu Jan 28 22:24:24 2020" + # time2 = "Sat Jan 30 23:24:24 2020" + # time3 = "Sun Jan 31 20:24:24 2020" + # bin = [0,1,2] + # ts1 = time.mktime(time.strptime(time1, "%a %b %d %H:%M:%S %Y")) + # ts2 = time.mktime(time.strptime(time2, "%a %b %d %H:%M:%S %Y")) + # ts3 = time.mktime(time.strptime(time3, "%a %b %d %H:%M:%S %Y")) + # filter_trips = [{'data': {'start_ts': ts1}}, + # {'data': {'start_ts': ts2}}, + # {'data': {'start_ts': ts3}}] + # + # self.assertEqual(eamtg.find_first_trip(filter_trips, bin),0) + # + # + # def test_requested_trips_ab_cutoff(self): + # import time + # time1 = "Thu Jan 28 22:24:24 2020" + # time2 = "Sat Jan 30 23:24:24 2020" + # time3 = "Sun Jan 31 20:24:24 2020" + # new_bins = [[0,1],[2]] + # ts1 = time.mktime(time.strptime(time1, "%a %b %d %H:%M:%S %Y")) + # ts2 = time.mktime(time.strptime(time2, "%a %b %d %H:%M:%S %Y")) + # ts3 = time.mktime(time.strptime(time3, "%a %b %d %H:%M:%S %Y")) + # filter_trips = [{'data': {'start_ts': ts1}}, + # {'data': {'start_ts': ts2}}, + # {'data': {'start_ts': ts3}}] + # self.assertEqual(eamtg.requested_trips_ab_cutoff(new_bins, filter_trips),([0, 2], [1])) + # + # + # def test_requested_trips_bl_cutoff(self): + # + # # requested_trips_bl_cutoff(sim) + # fake_trip_collect = [] + # trip1 = pd.DataFrame(data=([[-122.41925243091958,-122.42140476014033],[37.77938521735944,37.78194309045273]]), + # columns=[['start_loc','end_loc'], + # ['coordinates','coordinates']]) + # 
fake_trip_collect.append(trip1) + # trip2 = pd.DataFrame(data=([[-122.41925243091958, -122.42093683661327], [37.77938521735944, 37.782278693221016]]), + # columns=[['start_loc', 'end_loc'], + # ['coordinates', 'coordinates']]) + # fake_trip_collect.append(trip2) + # trip3 = pd.DataFrame(data=([[-123.41925243091958,-122.41912876839925],[37.77938521735944,37.77766191670088]]), + # columns=[['start_loc','end_loc'], + # ['coordinates','coordinates']]) + # fake_trip_collect.append(trip3) + # sim = similarity.similarity(fake_trip_collect,100) + # print(sim.below_cutoff) + # # print(bl_trip_ls) + # + # + # + # # df = pd.DataFrame(columns=[['start_loc','end_loc'],['coordinates','coordinates']]) + # + # # print(df) + # # df1 = pd.DataFrame(np.ra]ndom.randint(0, 150, size=(4, 6)), + # # columns=[['python', 'python', 'math', 'math', 'En', 'En'], + # # ['期中', '期末', '期中', '期末', '期中', '期末']]) + # # print(df1.python['期中']) + +if __name__ == '__main__': + etc.configLogging() + unittest.main() + diff --git a/emission/tests/analysisTests/clusteringTests/TestGetUsers.py b/emission/tests/analysisTests/clusteringTests/TestGetUsers.py index 6283f8b07..7128fe6bd 100644 --- a/emission/tests/analysisTests/clusteringTests/TestGetUsers.py +++ b/emission/tests/analysisTests/clusteringTests/TestGetUsers.py @@ -3,29 +3,56 @@ import unittest import emission.analysis.modelling.tour_model.get_users as gu import emission.analysis.modelling.tour_model.data_preprocessing as preprocess - import emission.tests.common as etc +import json +import bson.json_util as bju +import copy + class TestGetUsers(unittest.TestCase): - # def setUp(self): - # - # def tearDown(self): + def setUp(self): + self.readAndStoreTripsFromFile("emission/tests/data/real_examples/fake_trips") + self.user = self.testUUID + self.radius = 100 + + def tearDown(self): + self.clearDBEntries() + + def readAndStoreTripsFromFile(self, dataFile): + import emission.core.get_database as edb + atsdb = edb.get_analysis_timeseries_db() + 
etc.createAndFillUUID(self) + with open(dataFile) as dect: + expected_confirmed_trips = json.load(dect, object_hook=bju.object_hook) + for t in expected_confirmed_trips: + t["user_id"] = self.testUUID + edb.save(atsdb, t) + + def clearDBEntries(self): + import emission.core.get_database as edb + edb.get_timeseries_db().delete_many({"user_id": self.testUUID}) + edb.get_analysis_timeseries_db().delete_many({"user_id": self.testUUID}) + edb.get_pipeline_state_db().delete_many({"user_id": self.testUUID}) def test_valid_user(self): - user = [self.testUUID] - radius = 100 - trips = preprocess.read_data(user) - filter_trips = preprocess.filter_data(trips,radius) + trips = preprocess.read_data(self.user) + filter_trips = preprocess.filter_data(trips,self.radius) + # the user has 8 labeled trips, >50% of trips are labeled valid = gu.valid_user(filter_trips, trips) - # assertEqual - + self.assertEqual(valid,False) + for i in range(2): + filter_trips.append(copy.copy(filter_trips[0])) + # now the user has 10 labeled trips, >50% of trips are labeled + valid = gu.valid_user(filter_trips, trips) + self.assertEqual(valid,True) def test_get_user_ls(self): - all_users = [self.testUUID] - radius = 100 - user_ls,valid_user_ls = gu.get_user_ls(all_users, radius) - # assertEqual + # only 1 invalid user + user_ls,valid_user_ls = gu.get_user_ls([self.user], self.radius) + self.assertEqual(len(user_ls),1) + self.assertEqual(len(valid_user_ls),0) + if __name__ == '__main__': etc.configLogging() diff --git a/emission/tests/analysisTests/clusteringTests/TestLabelProcessing.py b/emission/tests/analysisTests/clusteringTests/TestLabelProcessing.py new file mode 100644 index 000000000..cb1d9e25c --- /dev/null +++ b/emission/tests/analysisTests/clusteringTests/TestLabelProcessing.py @@ -0,0 +1,98 @@ +from future import standard_library +standard_library.install_aliases() +import unittest +import emission.analysis.modelling.tour_model.label_processing as eamtl +import pandas as pd +import 
emission.tests.common as etc +import sklearn.cluster as sc +import numpy as np + + +class TestLabelProcessing(unittest.TestCase): + + def test_map_labels_sp2en(self): + mode = ['placas_de carro','aseguranza','iglesia'] + user_input_df = pd.DataFrame(data={'mode':mode}) + user_input_df = eamtl.map_labels_sp2en(user_input_df) + compare_mode = ['car plates','insurance','church'] + compare_df = pd.DataFrame(data={'mode':compare_mode}) + pd.testing.assert_frame_equal(user_input_df,compare_df) + + def test_map_labels_purpose(self): + purpose = ['course','work_- lunch break','on_the way home','insurance_payment'] + user_input_df = pd.DataFrame(data={'purpose': purpose}) + compare_purpose = ['school','lunch_break','home','insurance'] + compare_df = pd.DataFrame(data={'purpose': compare_purpose}) + user_input_df = eamtl.map_labels_purpose(user_input_df) + pd.testing.assert_frame_equal(user_input_df,compare_df) + + + def test_map_labels_mode(self): + mode_confirm = ['bike','ebike'] + replaced_mode = ['same_mode','walk'] + dict = {'mode_confirm':mode_confirm,'replaced_mode':replaced_mode} + user_input_df = pd.DataFrame(dict) + user_input_df = eamtl.map_labels_mode(user_input_df) + compare_replaced_mode = ['bike','walk'] + compare_dict = {'mode_confirm':mode_confirm,'replaced_mode':compare_replaced_mode} + compare_df = pd.DataFrame(compare_dict) + pd.testing.assert_frame_equal(user_input_df,compare_df) + + def test_map_labels(self): + mode_confirm = ['bike'] + purpose_confirm = ['iglesia'] + replaced_mode = ['same_mode'] + user_input = {'mode_confirm':mode_confirm,'purpose_confirm':purpose_confirm,'replaced_mode':replaced_mode} + user_input_df = pd.DataFrame(user_input) + user_input_df = eamtl.map_labels(user_input_df) + compare_purpose_confirm = ['church'] + compare_replaced_mode = ['bike'] + compare_dict = {'mode_confirm':mode_confirm,'purpose_confirm':compare_purpose_confirm,'replaced_mode':compare_replaced_mode} + compare_df = pd.DataFrame(compare_dict) + 
pd.testing.assert_frame_equal(user_input_df,compare_df) + + def test_get_second_labels(self): + x1 = [[1,2,3,4],[2,2,3,4],[3,3,3,3],[1,2,3,4]] + x2 = [[1,1,1,1],[18,33,57,20],[30,34,67,3],[40,20,3,4]] + method = 'single' + low = 50 + dist_pct = 0.6 + # if features are close + labels1 = eamtl.get_second_labels(x1, method, low, dist_pct) + labels2 = eamtl.get_second_labels(x2, method, low, dist_pct) + self.assertEqual(labels1, [0, 0, 0, 0]) + self.assertEqual(labels2.tolist(), [2,1,1,3]) + + + def test_kmeans_clusters(self): + clusters = [1, 1, 1, 0, 0, 0] + x = np.array([[1, 2], [1, 4], [1, 0],[10, 2], [10, 4], [10, 0]]) + n_clusters = len(set(clusters)) + k_clusters = eamtl.kmeans_clusters(clusters, x) + self.assertEqual(k_clusters.tolist(), [1,1,1,0,0,0]) + + + def test_get_new_labels(self): + second_labels = [2,1,1,3] + second_round_idx_labels =[[0,1],[1,1],[2,1],[3,2]] + new_labels = [1,1,1,2,3,3,3,3] + new_labels = eamtl.get_new_labels(second_labels, second_round_idx_labels, new_labels) + self.assertEqual(new_labels, [12, 11, 11, 23, 3, 3, 3, 3]) + + def test_group_similar_trips(self): + new_labels = [12, 11, 11, 23, 31, 31, 32, 32] + track = [[11,12],[15,11],[20,11],[50,23],[57,31],[59,31],[67,32],[69,32]] + new_bins = eamtl.group_similar_trips(new_labels,track) + self.assertEqual(new_bins, [[67, 69], [15, 20], [11], [50], [57, 59]]) + + def test_change_track_labels(self): + track = [[11,1],[15,1],[20,1],[50,2],[57,3],[59,3],[67,3],[69,3]] + new_labels = [12, 11, 11, 23, 31, 31, 32, 32] + track = eamtl.change_track_labels(track,new_labels) + self.assertEqual(track, [[11, 12], [15, 11], [20, 11], [50, 23], [57, 31], [59, 31], [67, 32], [69, 32]]) + +if __name__ == '__main__': + etc.configLogging() + unittest.main() + + diff --git a/emission/tests/analysisTests/clusteringTests/TestSimilarity.py b/emission/tests/analysisTests/clusteringTests/TestSimilarity.py new file mode 100644 index 000000000..12ea565b0 --- /dev/null +++ 
b/emission/tests/analysisTests/clusteringTests/TestSimilarity.py @@ -0,0 +1,76 @@ +import emission.analysis.modelling.tour_model.data_preprocessing as preprocess +import emission.analysis.modelling.tour_model.similarity as similarity +from future import standard_library +standard_library.install_aliases() +from builtins import * +import unittest +import json +import bson.json_util as bju +import emission.tests.common as etc + +# This test file is to test the functions that are used in the +class TestDataPreprocessing(unittest.TestCase): + def setUp(self): + self.readAndStoreTripsFromFile("emission/tests/data/real_examples/fake_trips") + self.user = self.testUUID + self.radius = 100 + + def tearDown(self): + self.clearDBEntries() + + def readAndStoreTripsFromFile(self, dataFile): + import emission.core.get_database as edb + atsdb = edb.get_analysis_timeseries_db() + etc.createAndFillUUID(self) + with open(dataFile) as dect: + expected_confirmed_trips = json.load(dect, object_hook=bju.object_hook) + for t in expected_confirmed_trips: + t["user_id"] = self.testUUID + edb.save(atsdb, t) + + def clearDBEntries(self): + import emission.core.get_database as edb + edb.get_timeseries_db().delete_many({"user_id": self.testUUID}) + edb.get_analysis_timeseries_db().delete_many({"user_id": self.testUUID}) + edb.get_pipeline_state_db().delete_many({"user_id": self.testUUID}) + + def test_within_radius(self): + # case 1: start and end location are within radius + in_range = similarity.within_radius(-122.40998884982407, 37.809339507025655, + -122.4101610462097, 37.80925081700211, self.radius) + self.assertEqual(in_range,True) + # case 2: start and end location are not within radius + in_range = similarity.within_radius(-122.40998884982407, 37.809339507025655, + -122.41296471977945, 37.8079948386731, self.radius) + self.assertEqual(in_range,False) + + + def test_filter_too_short(self): + all_trips = preprocess.read_data(self.user) + valid_trips = 
similarity.filter_too_short(all_trips, self.radius) + self.assertEqual(len(valid_trips),10) + + def test_bin_data(self): + trips = preprocess.read_data(self.user) + sim = similarity.similarity(trips, self.radius) + filter_trips = sim.data + sim.bin_data() + self.assertEqual(sim.bins,[[4, 5, 6, 7, 8], [0], [1], [2], [3], [9]]) + + def test_delete_bins(self): + trips = preprocess.read_data(self.user) + sim = similarity.similarity(trips, self.radius) + filter_trips = sim.data + sim.bin_data() + sim.delete_bins() + bins = sim.bins + bin_trips = sim.newdata + self.assertEqual(bins,[[4, 5, 6, 7, 8]]) + self.assertEqual(len(bin_trips),5) + + +if __name__ == '__main__': + etc.configLogging() + unittest.main() + + diff --git a/emission/tests/data/real_examples/fake_trips b/emission/tests/data/real_examples/fake_trips new file mode 100644 index 000000000..1a26d2074 --- /dev/null +++ b/emission/tests/data/real_examples/fake_trips @@ -0,0 +1,836 @@ +[ + { + "_id": { + "$oid": "5fd8e69ac61669a9ebad0241" + }, + "user_id": { + "$uuid": "aa9fdec92944446c8ee250d79b3044d3" + }, + "metadata": { + "key": "analysis/confirmed_trip", + "platform": "server", + "write_ts": 1608050275.276295, + "time_zone": "America/Los_Angeles", + "write_local_dt": { + "year": 2020, + "month": 12, + "day": 15, + "hour": 8, + "minute": 37, + "second": 55, + "weekday": 1, + "timezone": "America/Los_Angeles" + }, + "write_fmt_time": "2020-12-15T08:37:55.276295-08:00" + }, + "data": { + "source": "DwellSegmentationTimeFilter", + "end_ts": 1466437275.856, + "end_local_dt": { + "year": 2016, + "month": 6, + "day": 20, + "hour": 8, + "minute": 41, + "second": 15, + "weekday": 0, + "timezone": "America/Los_Angeles" + }, + "end_fmt_time": "2016-06-20T08:41:15.856000-07:00", + "end_loc": { + "type": "Point", + "coordinates": [ + -122.0826931, + 37.3914184 + ] + }, + "raw_trip": { + "$oid": "5fd8e662baff4ef23d349789" + }, + "start_ts": 1466436483.395, + "start_local_dt": { + "year": 2016, + "month": 6, + "day": 20, + 
"hour": 8, + "minute": 28, + "second": 3, + "weekday": 0, + "timezone": "America/Los_Angeles" + }, + "start_fmt_time": "2016-06-20T08:28:03.395000-07:00", + "start_loc": { + "type": "Point", + "coordinates": [ + -122.0857861, + 37.3898049 + ] + }, + "duration": 792.4609999656677, + "distance": 1047.1630675866315, + "start_place": { + "$oid": "5fd8e664baff4ef23d349860" + }, + "end_place": { + "$oid": "5fd8e664baff4ef23d349861" + }, + "cleaned_trip": { + "$oid": "5fd8e663baff4ef23d3497af" + }, + "user_input": { + "mode_confirm": "walk", + "purpose_confirm": "library" + } + } + }, + { + "_id": { + "$oid": "5fd8e69ac61669a9ebad0242" + }, + "user_id": { + "$uuid": "aa9fdec92944446c8ee250d79b3044d3" + }, + "metadata": { + "key": "analysis/confirmed_trip", + "platform": "server", + "write_ts": 1608050275.488737, + "time_zone": "America/Los_Angeles", + "write_local_dt": { + "year": 2020, + "month": 12, + "day": 15, + "hour": 8, + "minute": 37, + "second": 55, + "weekday": 1, + "timezone": "America/Los_Angeles" + }, + "write_fmt_time": "2020-12-15T08:37:55.488737-08:00" + }, + "data": { + "source": "DwellSegmentationTimeFilter", + "end_ts": 1466438022.959, + "end_local_dt": { + "year": 2016, + "month": 6, + "day": 20, + "hour": 8, + "minute": 53, + "second": 42, + "weekday": 0, + "timezone": "America/Los_Angeles" + }, + "end_fmt_time": "2016-06-20T08:53:42.959000-07:00", + "end_loc": { + "type": "Point", + "coordinates": [ + -122.0866181, + 37.3910231 + ] + }, + "raw_trip": { + "$oid": "5fd8e662baff4ef23d34978b" + }, + "start_ts": 1466437438.6453953, + "start_local_dt": { + "year": 2016, + "month": 6, + "day": 20, + "hour": 8, + "minute": 43, + "second": 58, + "weekday": 0, + "timezone": "America/Los_Angeles" + }, + "start_fmt_time": "2016-06-20T08:43:58.645395-07:00", + "start_loc": { + "type": "Point", + "coordinates": [ + -122.0826931, + 37.3914184 + ] + }, + "duration": 584.3136048316956, + "distance": 886.4937093667857, + "start_place": { + "$oid": 
"5fd8e664baff4ef23d349861" + }, + "end_place": { + "$oid": "5fd8e664baff4ef23d349862" + }, + "cleaned_trip": { + "$oid": "5fd8e663baff4ef23d3497ce" + }, + "user_input": {} + } + }, + { + "_id": { + "$oid": "5fd8e69ac61669a9ebad0243" + }, + "user_id": { + "$uuid": "aa9fdec92944446c8ee250d79b3044d3" + }, + "metadata": { + "key": "analysis/confirmed_trip", + "platform": "server", + "write_ts": 1608050275.7204192, + "time_zone": "America/Los_Angeles", + "write_local_dt": { + "year": 2020, + "month": 12, + "day": 15, + "hour": 8, + "minute": 37, + "second": 55, + "weekday": 1, + "timezone": "America/Los_Angeles" + }, + "write_fmt_time": "2020-12-15T08:37:55.720419-08:00" + }, + "data": { + "source": "DwellSegmentationTimeFilter", + "end_ts": 1466461966.379, + "end_local_dt": { + "year": 2016, + "month": 6, + "day": 20, + "hour": 15, + "minute": 32, + "second": 46, + "weekday": 0, + "timezone": "America/Los_Angeles" + }, + "end_fmt_time": "2016-06-20T15:32:46.379000-07:00", + "end_loc": { + "type": "Point", + "coordinates": [ + -122.0830016, + 37.3901637 + ] + }, + "raw_trip": { + "$oid": "5fd8e662baff4ef23d34978d" + }, + "start_ts": 1466461623.1195338, + "start_local_dt": { + "year": 2016, + "month": 6, + "day": 20, + "hour": 15, + "minute": 27, + "second": 3, + "weekday": 0, + "timezone": "America/Los_Angeles" + }, + "start_fmt_time": "2016-06-20T15:27:03.119534-07:00", + "start_loc": { + "type": "Point", + "coordinates": [ + -122.0866181, + 37.3910231 + ] + }, + "duration": 343.25946617126465, + "distance": 610.2234223038181, + "start_place": { + "$oid": "5fd8e664baff4ef23d349862" + }, + "end_place": { + "$oid": "5fd8e664baff4ef23d349863" + }, + "cleaned_trip": { + "$oid": "5fd8e663baff4ef23d3497e6" + }, + "user_input": {} + } + }, + { + "_id": { + "$oid": "5fd8e69ac61669a9ebad0244" + }, + "user_id": { + "$uuid": "aa9fdec92944446c8ee250d79b3044d3" + }, + "metadata": { + "key": "analysis/confirmed_trip", + "platform": "server", + "write_ts": 1608050275.942955, + 
"time_zone": "America/Los_Angeles", + "write_local_dt": { + "year": 2020, + "month": 12, + "day": 15, + "hour": 8, + "minute": 37, + "second": 55, + "weekday": 1, + "timezone": "America/Los_Angeles" + }, + "write_fmt_time": "2020-12-15T08:37:55.942955-08:00" + }, + "data": { + "source": "DwellSegmentationTimeFilter", + "end_ts": 1466462452.708, + "end_local_dt": { + "year": 2016, + "month": 6, + "day": 20, + "hour": 15, + "minute": 40, + "second": 52, + "weekday": 0, + "timezone": "America/Los_Angeles" + }, + "end_fmt_time": "2016-06-20T15:40:52.708000-07:00", + "end_loc": { + "type": "Point", + "coordinates": [ + -122.086605, + 37.3910011 + ] + }, + "raw_trip": { + "$oid": "5fd8e662baff4ef23d34978f" + }, + "start_ts": 1466462052.158904, + "start_local_dt": { + "year": 2016, + "month": 6, + "day": 20, + "hour": 15, + "minute": 34, + "second": 12, + "weekday": 0, + "timezone": "America/Los_Angeles" + }, + "start_fmt_time": "2016-06-20T15:34:12.158904-07:00", + "start_loc": { + "type": "Point", + "coordinates": [ + -122.0830016, + 37.3901637 + ] + }, + "duration": 400.54909586906433, + "distance": 405.97685486691756, + "start_place": { + "$oid": "5fd8e664baff4ef23d349863" + }, + "end_place": { + "$oid": "5fd8e664baff4ef23d349864" + }, + "cleaned_trip": { + "$oid": "5fd8e663baff4ef23d3497f6" + }, + "user_input": { + "mode_confirm": "walk", + "purpose_confirm": "home" + } + } + }, + { + "_id": { + "$oid": "5fd8e69ac61669a9ebad0245" + }, + "user_id": { + "$uuid": "aa9fdec92944446c8ee250d79b3044d3" + }, + "metadata": { + "key": "analysis/confirmed_trip", + "platform": "server", + "write_ts": 1608050276.1554408, + "time_zone": "America/Los_Angeles", + "write_local_dt": { + "year": 2020, + "month": 12, + "day": 15, + "hour": 8, + "minute": 37, + "second": 56, + "weekday": 1, + "timezone": "America/Los_Angeles" + }, + "write_fmt_time": "2020-12-15T08:37:56.155441-08:00" + }, + "data": { + "source": "DwellSegmentationTimeFilter", + "end_ts": 1466463835.713, + "end_local_dt": 
{ + "year": 2016, + "month": 6, + "day": 20, + "hour": 16, + "minute": 3, + "second": 55, + "weekday": 0, + "timezone": "America/Los_Angeles" + }, + "end_fmt_time": "2016-06-20T16:03:55.713000-07:00", + "end_loc": { + "type": "Point", + "coordinates": [ + -122.1081974, + 37.4168828 + ] + }, + "raw_trip": { + "$oid": "5fd8e662baff4ef23d349791" + }, + "start_ts": 1466462970.2807262, + "start_local_dt": { + "year": 2016, + "month": 6, + "day": 20, + "hour": 15, + "minute": 49, + "second": 30, + "weekday": 0, + "timezone": "America/Los_Angeles" + }, + "start_fmt_time": "2016-06-20T15:49:30.280726-07:00", + "start_loc": { + "type": "Point", + "coordinates": [ + -122.086605, + 37.3910011 + ] + }, + "duration": 865.4322738647461, + "distance": 4521.417177464177, + "start_place": { + "$oid": "5fd8e664baff4ef23d349864" + }, + "end_place": { + "$oid": "5fd8e664baff4ef23d349865" + }, + "cleaned_trip": { + "$oid": "5fd8e664baff4ef23d349808" + }, + "user_input": { + "mode_confirm": "shared_ride", + "purpose_confirm": "karate" + } + } + }, + { + "_id": { + "$oid": "5fd8e69ac61669a9ebad0246" + }, + "user_id": { + "$uuid": "aa9fdec92944446c8ee250d79b3044d3" + }, + "metadata": { + "key": "analysis/confirmed_trip", + "platform": "server", + "write_ts": 1608050276.546149, + "time_zone": "America/Los_Angeles", + "write_local_dt": { + "year": 2020, + "month": 12, + "day": 15, + "hour": 8, + "minute": 37, + "second": 56, + "weekday": 1, + "timezone": "America/Los_Angeles" + }, + "write_fmt_time": "2020-12-15T08:37:56.546149-08:00" + }, + "data": { + "source": "DwellSegmentationTimeFilter", + "end_ts": 1466467959.767, + "end_local_dt": { + "year": 2016, + "month": 6, + "day": 20, + "hour": 17, + "minute": 12, + "second": 39, + "weekday": 0, + "timezone": "America/Los_Angeles" + }, + "end_fmt_time": "2016-06-20T17:12:39.767000-07:00", + "end_loc": { + "type": "Point", + "coordinates": [ + -122.0864051, + 37.3907649 + ] + }, + "raw_trip": { + "$oid": "5fd8e662baff4ef23d349795" + }, + 
"start_ts": 1466466584.0461695, + "start_local_dt": { + "year": 2016, + "month": 6, + "day": 20, + "hour": 16, + "minute": 49, + "second": 44, + "weekday": 0, + "timezone": "America/Los_Angeles" + }, + "start_fmt_time": "2016-06-20T16:49:44.046170-07:00", + "start_loc": { + "type": "Point", + "coordinates": [ + -122.1081974, + 37.4168828 + ] + }, + "duration": 1375.7208304405212, + "distance": 5136.824369981995, + "start_place": { + "$oid": "5fd8e664baff4ef23d349865" + }, + "end_place": { + "$oid": "5fd8e664baff4ef23d349867" + }, + "cleaned_trip": { + "$oid": "5fd8e664baff4ef23d34982c" + }, + "user_input": { + "mode_confirm": "shared_ride", + "purpose_confirm": "home" + } + } + }, + { + "_id": { + "$oid": "5fd8e69ac61669a9ebad0247" + }, + "user_id": { + "$uuid": "aa9fdec92944446c8ee250d79b3044d3" + }, + "metadata": { + "key": "analysis/confirmed_trip", + "platform": "server", + "write_ts": 1608050276.1554408, + "time_zone": "America/Los_Angeles", + "write_local_dt": { + "year": 2020, + "month": 12, + "day": 15, + "hour": 8, + "minute": 37, + "second": 56, + "weekday": 1, + "timezone": "America/Los_Angeles" + }, + "write_fmt_time": "2020-12-15T08:37:56.155441-08:00" + }, + "data": { + "source": "DwellSegmentationTimeFilter", + "end_ts": 1466377435.0, + "end_local_dt": { + "year": 2016, + "month": 6, + "day": 19, + "hour": 16, + "minute": 3, + "second": 55, + "weekday": 0, + "timezone": "America/Los_Angeles" + }, + "end_fmt_time": "2016-06-20T16:03:55.713000-07:00", + "end_loc": { + "type": "Point", + "coordinates": [ + -122.1081974, + 37.4168828 + ] + }, + "raw_trip": { + "$oid": "5fd8e662baff4ef23d349791" + }, + "start_ts": 1466376570.0, + "start_local_dt": { + "year": 2016, + "month": 6, + "day": 19, + "hour": 15, + "minute": 49, + "second": 30, + "weekday": 0, + "timezone": "America/Los_Angeles" + }, + "start_fmt_time": "2016-06-20T15:49:30.280726-07:00", + "start_loc": { + "type": "Point", + "coordinates": [ + -122.086605, + 37.3910011 + ] + }, + "duration": 
865.4322738647461, + "distance": 4521.417177464177, + "start_place": { + "$oid": "5fd8e664baff4ef23d349864" + }, + "end_place": { + "$oid": "5fd8e664baff4ef23d349865" + }, + "cleaned_trip": { + "$oid": "5fd8e664baff4ef23d349808" + }, + "user_input": { + "mode_confirm": "shared_ride", + "purpose_confirm": "karate" + } + } + }, + { + "_id": { + "$oid": "5fd8e69ac61669a9ebad0248" + }, + "user_id": { + "$uuid": "aa9fdec92944446c8ee250d79b3044d3" + }, + "metadata": { + "key": "analysis/confirmed_trip", + "platform": "server", + "write_ts": 1608050276.1554408, + "time_zone": "America/Los_Angeles", + "write_local_dt": { + "year": 2020, + "month": 12, + "day": 15, + "hour": 8, + "minute": 37, + "second": 56, + "weekday": 1, + "timezone": "America/Los_Angeles" + }, + "write_fmt_time": "2020-12-15T08:37:56.155441-08:00" + }, + "data": { + "source": "DwellSegmentationTimeFilter", + "end_ts": 1466636635.0, + "end_local_dt": { + "year": 2016, + "month": 6, + "day": 22, + "hour": 16, + "minute": 3, + "second": 55, + "weekday": 0, + "timezone": "America/Los_Angeles" + }, + "end_fmt_time": "2016-06-20T16:03:55.713000-07:00", + "end_loc": { + "type": "Point", + "coordinates": [ + -122.10849780967732, + 37.416772003842034 + ] + }, + "raw_trip": { + "$oid": "5fd8e662baff4ef23d349791" + }, + "start_ts": 1466635770.0, + "start_local_dt": { + "year": 2016, + "month": 6, + "day": 22, + "hour": 15, + "minute": 49, + "second": 30, + "weekday": 0, + "timezone": "America/Los_Angeles" + }, + "start_fmt_time": "2016-06-20T15:49:30.280726-07:00", + "start_loc": { + "type": "Point", + "coordinates": [ + -122.086605, + 37.3910011 + ] + }, + "duration": 865.4322738647461, + "distance": 4521.417177464177, + "start_place": { + "$oid": "5fd8e664baff4ef23d349864" + }, + "end_place": { + "$oid": "5fd8e664baff4ef23d349865" + }, + "cleaned_trip": { + "$oid": "5fd8e664baff4ef23d349808" + }, + "user_input": { + "mode_confirm": "shared_ride", + "purpose_confirm": "soccer" + } + } + }, + { + "_id": { + 
"$oid": "5fd8e69ac61669a9ebad0249" + }, + "user_id": { + "$uuid": "aa9fdec92944446c8ee250d79b3044d3" + }, + "metadata": { + "key": "analysis/confirmed_trip", + "platform": "server", + "write_ts": 1608050276.1554408, + "time_zone": "America/Los_Angeles", + "write_local_dt": { + "year": 2020, + "month": 12, + "day": 15, + "hour": 8, + "minute": 37, + "second": 56, + "weekday": 1, + "timezone": "America/Los_Angeles" + }, + "write_fmt_time": "2020-12-15T08:37:56.155441-08:00" + }, + "data": { + "source": "DwellSegmentationTimeFilter", + "end_ts": 1466809435.0, + "end_local_dt": { + "year": 2016, + "month": 6, + "day": 24, + "hour": 16, + "minute": 3, + "second": 55, + "weekday": 0, + "timezone": "America/Los_Angeles" + }, + "end_fmt_time": "2016-06-20T16:03:55.713000-07:00", + "end_loc": { + "type": "Point", + "coordinates": [ + -122.10849780967732, + 37.416772003842034 + ] + }, + "raw_trip": { + "$oid": "5fd8e662baff4ef23d349791" + }, + "start_ts": 1466808570.0, + "start_local_dt": { + "year": 2016, + "month": 6, + "day": 24, + "hour": 15, + "minute": 49, + "second": 30, + "weekday": 0, + "timezone": "America/Los_Angeles" + }, + "start_fmt_time": "2016-06-20T15:49:30.280726-07:00", + "start_loc": { + "type": "Point", + "coordinates": [ + -122.086605, + 37.3910011 + ] + }, + "duration": 865.4322738647461, + "distance": 4521.417177464177, + "start_place": { + "$oid": "5fd8e664baff4ef23d349864" + }, + "end_place": { + "$oid": "5fd8e664baff4ef23d349868" + }, + "cleaned_trip": { + "$oid": "5fd8e664baff4ef23d349808" + }, + "user_input": { + "mode_confirm": "shared_ride", + "purpose_confirm": "soccer" + } + } + }, + { + "_id": { + "$oid": "5fd8e69ac61669a9ebad0250" + }, + "user_id": { + "$uuid": "aa9fdec92944446c8ee250d79b3044d3" + }, + "metadata": { + "key": "analysis/confirmed_trip", + "platform": "server", + "write_ts": 1608050276.1554408, + "time_zone": "America/Los_Angeles", + "write_local_dt": { + "year": 2020, + "month": 12, + "day": 15, + "hour": 8, + "minute": 37, + 
"second": 56, + "weekday": 1, + "timezone": "America/Los_Angeles" + }, + "write_fmt_time": "2020-12-15T08:37:56.155441-08:00" + }, + "data": { + "source": "DwellSegmentationTimeFilter", + "end_ts": 1466550235.0, + "end_local_dt": { + "year": 2016, + "month": 6, + "day": 21, + "hour": 16, + "minute": 3, + "second": 55, + "weekday": 0, + "timezone": "America/Los_Angeles" + }, + "end_fmt_time": "2016-06-20T16:03:55.713000-07:00", + "end_loc": { + "type": "Point", + "coordinates": [ + -122.10849780967732, + 37.416772003842034 + ] + }, + "raw_trip": { + "$oid": "5fd8e662baff4ef23d349791" + }, + "start_ts": 1466549370.0, + "start_local_dt": { + "year": 2016, + "month": 6, + "day": 21, + "hour": 15, + "minute": 49, + "second": 30, + "weekday": 0, + "timezone": "America/Los_Angeles" + }, + "start_fmt_time": "2016-06-20T15:49:30.280726-07:00", + "start_loc": { + "type": "Point", + "coordinates": [ + -122.086605, + 37.3910011 + ] + }, + "duration": 865.4322738647461, + "distance": 4521.417177464177, + "start_place": { + "$oid": "5fd8e664baff4ef23d349888" + }, + "end_place": { + "$oid": "5fd8e664baff4ef23d349889" + }, + "cleaned_trip": { + "$oid": "5fd8e664baff4ef23d349808" + }, + "user_input": { + "mode_confirm": "shared_ride", + "purpose_confirm": "soccer" + } + } + } +] \ No newline at end of file From f815cff60022c6da9d6c3c2dbf59a69179aa3c83 Mon Sep 17 00:00:00 2001 From: Chunrui Huang Date: Sun, 25 Jul 2021 20:33:04 -0700 Subject: [PATCH 5/6] done with TestGetRequestPercentage, update fake_trips for testing --- .../TestGetRequestPercentage.py | 180 +++++++----------- emission/tests/data/real_examples/fake_trips | 16 +- 2 files changed, 76 insertions(+), 120 deletions(-) diff --git a/emission/tests/analysisTests/clusteringTests/TestGetRequestPercentage.py b/emission/tests/analysisTests/clusteringTests/TestGetRequestPercentage.py index 1c0e6fe9b..e93a5ea17 100644 --- a/emission/tests/analysisTests/clusteringTests/TestGetRequestPercentage.py +++ 
b/emission/tests/analysisTests/clusteringTests/TestGetRequestPercentage.py @@ -3,21 +3,22 @@ import unittest import emission.analysis.modelling.tour_model.similarity as similarity import emission.analysis.modelling.tour_model.data_preprocessing as preprocess - import emission.analysis.modelling.tour_model.get_request_percentage as eamtg -import pandas as pd +import emission.analysis.modelling.tour_model.evaluation_pipeline as ep import emission.tests.common as etc -import sklearn.cluster as sc -import numpy as np import json import bson.json_util as bju + class TestGetRequestPercentage(unittest.TestCase): def setUp(self): self.readAndStoreTripsFromFile("emission/tests/data/real_examples/fake_trips") self.user = self.testUUID self.radius = 100 + self.trips = preprocess.read_data(self.user) + self.filter_trips = preprocess.filter_data(self.trips,self.radius) + def tearDown(self): self.clearDBEntries() @@ -39,114 +40,69 @@ def clearDBEntries(self): edb.get_pipeline_state_db().delete_many({"user_id": self.testUUID}) def test_match_day(self): - trips = preprocess.read_data(self.user) - filter_trips = preprocess.filter_data(trips,self.radius) - sim = similarity.similarity(filter_trips, radius) - - # def test_match_day(self): - # # case 1: bin contains indices & trip matches selected trip in filter_trips - # bin = [0,1,2] - # trip = {'data':{'start_local_dt':{'year':2020,'month':8,'day':14}}} - # filter_trips = [{'data':{'start_local_dt':{'year':2020,'month':8,'day':14}}}] - # self.assertEqual(eamtg.match_day(trip, bin, filter_trips), True) - # # case 2: bin = True & trip doesn't match selected trip in filter_trips - # filter_trips = [{'data': {'start_local_dt': {'year': 2020, 'month': 8, 'day': 18}}}] - # self.assertEqual(eamtg.match_day(trip, bin, filter_trips), False) - # #case 3: bin is none & trip matches selected trip in filter_trips - # bin = None - # filter_trips = [{'data': {'start_local_dt': {'year': 2020, 'month': 8, 'day': 14}}}] - # 
self.assertEqual(eamtg.match_day(trip, bin, filter_trips), False) - # - # - # def test_match_month(self): - # # case 1: bin contains indices & trip matches selected trip in filter_trips - # bin = [0,1,2] - # trip = {'data':{'start_local_dt':{'year':2020,'month':8,'day':14}}} - # filter_trips = [{'data':{'start_local_dt':{'year':2020,'month':8,'day':14}}}] - # self.assertEqual(eamtg.match_month(trip, bin, filter_trips), True) - # # case 2: bin = True & trip doesn't match selected trip in filter_trips - # filter_trips = [{'data': {'start_local_dt': {'year': 2020, 'month': 7, 'day': 18}}}] - # self.assertEqual(eamtg.match_month(trip, bin, filter_trips), False) - # #case 3: bin is none & trip matches selected trip in filter_trips - # bin = None - # filter_trips = [{'data': {'start_local_dt': {'year': 2020, 'month': 8, 'day': 14}}}] - # self.assertEqual(eamtg.match_month(trip, bin, filter_trips), False) - # - # - # def test_bin_date(self): - # # case 1: bin day - # trip_ls = [0,1,2] - # filter_trips1 = [{'data': {'start_local_dt': {'year': 2020, 'month': 8, 'day': 14}}}, - # {'data': {'start_local_dt': {'year': 2020, 'month': 8, 'day': 14}}}, - # {'data': {'start_local_dt': {'year': 2020, 'month': 7, 'day': 18}}}] - # self.assertEqual(eamtg.bin_date(trip_ls, filter_trips1, day=True), [[0,1],[2]]) - # # case 2: bin month - # filter_trips2 = [{'data': {'start_local_dt': {'year': 2020, 'month': 8, 'day': 15}}}, - # {'data': {'start_local_dt': {'year': 2020, 'month': 8, 'day': 14}}}, - # {'data': {'start_local_dt': {'year': 2020, 'month': 7, 'day': 18}}}] - # self.assertEqual(eamtg.bin_date(trip_ls, filter_trips2, month=True), [[0,1],[2]]) - # - # - # def test_find_first_trip(self): - # import time - # time1 = "Thu Jan 28 22:24:24 2020" - # time2 = "Sat Jan 30 23:24:24 2020" - # time3 = "Sun Jan 31 20:24:24 2020" - # bin = [0,1,2] - # ts1 = time.mktime(time.strptime(time1, "%a %b %d %H:%M:%S %Y")) - # ts2 = time.mktime(time.strptime(time2, "%a %b %d %H:%M:%S %Y")) - # ts3 = 
time.mktime(time.strptime(time3, "%a %b %d %H:%M:%S %Y")) - # filter_trips = [{'data': {'start_ts': ts1}}, - # {'data': {'start_ts': ts2}}, - # {'data': {'start_ts': ts3}}] - # - # self.assertEqual(eamtg.find_first_trip(filter_trips, bin),0) - # - # - # def test_requested_trips_ab_cutoff(self): - # import time - # time1 = "Thu Jan 28 22:24:24 2020" - # time2 = "Sat Jan 30 23:24:24 2020" - # time3 = "Sun Jan 31 20:24:24 2020" - # new_bins = [[0,1],[2]] - # ts1 = time.mktime(time.strptime(time1, "%a %b %d %H:%M:%S %Y")) - # ts2 = time.mktime(time.strptime(time2, "%a %b %d %H:%M:%S %Y")) - # ts3 = time.mktime(time.strptime(time3, "%a %b %d %H:%M:%S %Y")) - # filter_trips = [{'data': {'start_ts': ts1}}, - # {'data': {'start_ts': ts2}}, - # {'data': {'start_ts': ts3}}] - # self.assertEqual(eamtg.requested_trips_ab_cutoff(new_bins, filter_trips),([0, 2], [1])) - # - # - # def test_requested_trips_bl_cutoff(self): - # - # # requested_trips_bl_cutoff(sim) - # fake_trip_collect = [] - # trip1 = pd.DataFrame(data=([[-122.41925243091958,-122.42140476014033],[37.77938521735944,37.78194309045273]]), - # columns=[['start_loc','end_loc'], - # ['coordinates','coordinates']]) - # fake_trip_collect.append(trip1) - # trip2 = pd.DataFrame(data=([[-122.41925243091958, -122.42093683661327], [37.77938521735944, 37.782278693221016]]), - # columns=[['start_loc', 'end_loc'], - # ['coordinates', 'coordinates']]) - # fake_trip_collect.append(trip2) - # trip3 = pd.DataFrame(data=([[-123.41925243091958,-122.41912876839925],[37.77938521735944,37.77766191670088]]), - # columns=[['start_loc','end_loc'], - # ['coordinates','coordinates']]) - # fake_trip_collect.append(trip3) - # sim = similarity.similarity(fake_trip_collect,100) - # print(sim.below_cutoff) - # # print(bl_trip_ls) - # - # - # - # # df = pd.DataFrame(columns=[['start_loc','end_loc'],['coordinates','coordinates']]) - # - # # print(df) - # # df1 = pd.DataFrame(np.ra]ndom.randint(0, 150, size=(4, 6)), - # # columns=[['python', 'python', 
'math', 'math', 'En', 'En'], - # # ['期中', '期末', '期中', '期末', '期中', '期末']]) - # # print(df1.python['期中']) + sim = similarity.similarity(self.filter_trips, self.radius) + sim.bin_data() + sel_bin = sim.bins[0] + # case 1: not same day trip + trip = self.filter_trips[sel_bin[1]] + self.assertEqual(eamtg.match_day(trip, sel_bin, self.filter_trips), False) + # case 2: same day trip + sel_bin = sim.bins[0] + trip = self.filter_trips[sim.bins[1][0]] + self.assertEqual(eamtg.match_day(trip, sel_bin, self.filter_trips), True) + + def test_match_month(self): + sim = similarity.similarity(self.filter_trips, self.radius) + sim.bin_data() + sel_bin = sim.bins[0] + # case 1: not same month trip + trip = self.filter_trips[sel_bin[4]] + self.assertEqual(eamtg.match_day(trip, sel_bin, self.filter_trips), False) + # case 2: same month trip + sel_bin = sim.bins[0] + trip = self.filter_trips[sim.bins[1][0]] + self.assertEqual(eamtg.match_day(trip, sel_bin, self.filter_trips), True) + + def test_bin_date(self): + trip_ls = [0,1,2,3,4,5,6,7] + self.assertEqual(eamtg.bin_date(trip_ls, self.filter_trips, day=True),[[0, 1, 2, 7], [3], [4], [5], [6]]) + self.assertEqual(eamtg.bin_date(trip_ls, self.filter_trips, month=True),[[0, 1, 2, 3, 4, 7], [5], [6]]) + + def test_find_first_trip(self): + test_bin = [0,1,2,3,4,5,6,7] + self.assertEqual(eamtg.find_first_trip(self.filter_trips, test_bin),3) + + def test_requested_trips_ab_cutoff(self): + bins = [[2,3,4,5,6]] + # should request [3] + request_trip_idx, no_request_idx = eamtg.requested_trips_ab_cutoff(bins, self.filter_trips) + self.assertEqual((request_trip_idx, no_request_idx),([3], [2, 4, 5, 6])) + + def test_requested_trips_bl_cutoff(self): + sim = similarity.similarity(self.filter_trips, self.radius) + sim.bin_data() + sim.delete_bins() + request_idx_bl_cutoff = eamtg.requested_trips_bl_cutoff(sim) + self.assertEqual(request_idx_bl_cutoff,[7, 1, 0]) + + def test_get_requested_trips(self): + sim = similarity.similarity(self.filter_trips, 
self.radius) + sim.bin_data() + sim.delete_bins() + bins = sim.bins + self.assertEqual(eamtg.get_requested_trips(bins, self.filter_trips, sim),[3, 7, 1, 0]) + + def test_get_req_pct(self): + sim = similarity.similarity(self.filter_trips, self.radius) + sim.bin_data() + sim.delete_bins() + bins = sim.bins + bin_trips = sim.newdata + first_labels, track = ep.get_first_label_and_track(bins,bin_trips,self.filter_trips) + new_labels = first_labels.copy() + pct = eamtg.get_req_pct(new_labels, track, self.filter_trips, sim) + self.assertEqual(pct,0.5) + if __name__ == '__main__': etc.configLogging() diff --git a/emission/tests/data/real_examples/fake_trips b/emission/tests/data/real_examples/fake_trips index 1a26d2074..a537b1b2b 100644 --- a/emission/tests/data/real_examples/fake_trips +++ b/emission/tests/data/real_examples/fake_trips @@ -691,10 +691,10 @@ }, "data": { "source": "DwellSegmentationTimeFilter", - "end_ts": 1466809435.0, + "end_ts": 1469401435.0, "end_local_dt": { "year": 2016, - "month": 6, + "month": 7, "day": 24, "hour": 16, "minute": 3, @@ -713,10 +713,10 @@ "raw_trip": { "$oid": "5fd8e662baff4ef23d349791" }, - "start_ts": 1466808570.0, + "start_ts": 1469400570.0, "start_local_dt": { "year": 2016, - "month": 6, + "month": 7, "day": 24, "hour": 15, "minute": 49, @@ -775,9 +775,9 @@ }, "data": { "source": "DwellSegmentationTimeFilter", - "end_ts": 1466550235.0, + "end_ts": 1498086235.0, "end_local_dt": { - "year": 2016, + "year": 2017, "month": 6, "day": 21, "hour": 16, @@ -797,9 +797,9 @@ "raw_trip": { "$oid": "5fd8e662baff4ef23d349791" }, - "start_ts": 1466549370.0, + "start_ts": 1498085370.0, "start_local_dt": { - "year": 2016, + "year": 2017, "month": 6, "day": 21, "hour": 15, From 2a0bc45b6f4658b549a34006fe386591ee8bb117 Mon Sep 17 00:00:00 2001 From: Chunrui Huang Date: Mon, 26 Jul 2021 10:26:30 -0700 Subject: [PATCH 6/6] update tests and fake trips --- .../clusteringTests/TestGetScores.py | 68 +++++++++++++++++++ 
.../clusteringTests/TestSimilarity.py | 2 +- emission/tests/data/real_examples/fake_trips | 6 +- 3 files changed, 72 insertions(+), 4 deletions(-) create mode 100644 emission/tests/analysisTests/clusteringTests/TestGetScores.py diff --git a/emission/tests/analysisTests/clusteringTests/TestGetScores.py b/emission/tests/analysisTests/clusteringTests/TestGetScores.py new file mode 100644 index 000000000..c65857f28 --- /dev/null +++ b/emission/tests/analysisTests/clusteringTests/TestGetScores.py @@ -0,0 +1,68 @@ +import emission.analysis.modelling.tour_model.data_preprocessing as preprocess +import emission.analysis.modelling.tour_model.similarity as similarity +import emission.analysis.modelling.tour_model.get_scores as gs +from future import standard_library +standard_library.install_aliases() +from builtins import * +import unittest +import json +import bson.json_util as bju +import emission.tests.common as etc + +class TestGetScores(unittest.TestCase): + def setUp(self): + self.readAndStoreTripsFromFile("emission/tests/data/real_examples/fake_trips") + self.user = self.testUUID + self.radius = 100 + self.trips = preprocess.read_data(self.user) + self.filter_trips = preprocess.filter_data(self.trips,self.radius) + self.sim = similarity.similarity(self.filter_trips, self.radius) + self.sim.bin_data() + + def tearDown(self): + self.clearDBEntries() + + def readAndStoreTripsFromFile(self, dataFile): + import emission.core.get_database as edb + atsdb = edb.get_analysis_timeseries_db() + etc.createAndFillUUID(self) + with open(dataFile) as dect: + expected_confirmed_trips = json.load(dect, object_hook=bju.object_hook) + for t in expected_confirmed_trips: + t["user_id"] = self.testUUID + edb.save(atsdb, t) + + def clearDBEntries(self): + import emission.core.get_database as edb + edb.get_timeseries_db().delete_many({"user_id": self.testUUID}) + edb.get_analysis_timeseries_db().delete_many({"user_id": self.testUUID}) + edb.get_pipeline_state_db().delete_many({"user_id": 
self.testUUID}) + + def test_compare_trip_orders(self): + # this function contains pandas.testing.assert_frame_equal + # if the orders of bin_trips and self.filter_trips(according to bins) are the same, the test will pass + self.sim.delete_bins() + self.bins = self.sim.bins + self.bin_trips = self.sim.newdata + gs.compare_trip_orders(self.bins, self.bin_trips, self.filter_trips) + + def test_score(self): + labels_pred = [] + # we use all bins for testing + for b in range(len(self.sim.bins)): + for trip in self.sim.bins[b]: + labels_pred.append(b) + # labels_true = [0, 1, 2, 2, 3, 3, 3, 4] + # labels_pred = [0, 0, 0, 0, 0, 1, 2, 3] + homo_score = gs.score(self.filter_trips, labels_pred) + self.assertEqual(homo_score,0.443) + + def test_get_score(self): + homo_second = 0.443 + percentage_second = 0.5 + curr_score = gs.get_score(homo_second, percentage_second) + self.assertEqual(curr_score,0.472) + +if __name__ == '__main__': + etc.configLogging() + unittest.main() diff --git a/emission/tests/analysisTests/clusteringTests/TestSimilarity.py b/emission/tests/analysisTests/clusteringTests/TestSimilarity.py index 12ea565b0..d5d42112b 100644 --- a/emission/tests/analysisTests/clusteringTests/TestSimilarity.py +++ b/emission/tests/analysisTests/clusteringTests/TestSimilarity.py @@ -9,7 +9,7 @@ import emission.tests.common as etc # This test file is to test the functions that are used in the -class TestDataPreprocessing(unittest.TestCase): +class TestSimilarity(unittest.TestCase): def setUp(self): self.readAndStoreTripsFromFile("emission/tests/data/real_examples/fake_trips") self.user = self.testUUID diff --git a/emission/tests/data/real_examples/fake_trips b/emission/tests/data/real_examples/fake_trips index a537b1b2b..b2e20a174 100644 --- a/emission/tests/data/real_examples/fake_trips +++ b/emission/tests/data/real_examples/fake_trips @@ -648,7 +648,7 @@ 37.3910011 ] }, - "duration": 865.4322738647461, + "duration": 700, "distance": 4521.417177464177, "start_place": { 
"$oid": "5fd8e664baff4ef23d349864" @@ -732,7 +732,7 @@ 37.3910011 ] }, - "duration": 865.4322738647461, + "duration": 700, "distance": 4521.417177464177, "start_place": { "$oid": "5fd8e664baff4ef23d349864" @@ -816,7 +816,7 @@ 37.3910011 ] }, - "duration": 865.4322738647461, + "duration": 700, "distance": 4521.417177464177, "start_place": { "$oid": "5fd8e664baff4ef23d349888"