-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path(6) Trail Recommender Final.py
276 lines (208 loc) · 13.3 KB
/
(6) Trail Recommender Final.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
import itertools
import mysql.connector
from mysql.connector import errorcode
from secret import mySQL_password # Note: secret.py file
import numpy as np
import pandas as pd
import time
import json
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, MinMaxScaler
import pickle
# Create or get a MySQL connection object
cnx = mysql.connector.connect(user='root', password=mySQL_password,
host='127.0.0.1',
database='trails')
print(cnx)
# Instantiates and returns a cursor using C Extension
cursor = cnx.cursor()
# ### Retrieve the `trail_info_final` table from the database and create Pandas DataFrame
# Read SQL query or database table into a DataFrame
trail_info_final_df = pd.read_sql('SELECT * FROM trails.trail_info_final', con=cnx)
trail_info_final_df.head()
# ### Examine the DataFrame
# Return the number of rows and columns (dimensionality) of the DataFrame
print('Rows: {}, Cols: {}'.format(trail_info_final_df.shape[0], trail_info_final_df.shape[1]))
# Print a concise summary of a DataFrame including the index dtype and column dtypes, non-null values, and memory usage
# Note: Useful to quickly see if null values exist
trail_info_final_df.info()
# Column names (which is "an index")
trail_info_final_df.columns
# Sort a DataFrame by multiple columns
# Note: ascending: sort ascending vs. descending; ascending=True is default
# inplace=True changes the original DataFrame
#trail_info_final_df.sort_values(by=['X', 'Y'], ascending=True, inplace=True)
# Print the first 5 rows and the last 5 rows of the DataFrame
trail_info_final_df.head().append(trail_info_final_df.tail())
# ### [Turi Create](https://github.com/apple/turicreate)
#
# pip install turicreate
#
# - [User Guide](https://apple.github.io/turicreate/docs/userguide/)
# - [Turi Create API Documentation](https://apple.github.io/turicreate/docs/api/index.html)
import turicreate as tc
# ### Reading a File
# SFrame means scalable data frame. A tabular, column-mutable dataframe object that can scale to big data
# Note: data: array | pandas.DataFrame | string | dict, optional;
# the actual interpretation of this field is dependent on the 'format' parameter
# format: format of the data; the default, 'auto' will automatically infer the input data format
trail_info_final_sf = tc.SFrame(data=trail_info_final_df, format='dataframe')
# ### Examine the SFrame
# The shape of the SFrame, in a tuple. The first entry is the number of rows, the second is the number of columns
print('Rows: {}, Cols: {}'.format(trail_info_final_sf.shape[0], trail_info_final_sf.shape[1]))
# Print a concise summary of a DataFrame including the index dtype and column dtypes, non-null values, and memory usage
# Note: Useful to quickly see if null values exist
#trail_info_final_sf.info()
# Column names and types of the SFrame
column_info = dict(zip(trail_info_final_sf.column_names(), trail_info_final_sf.column_types()))
column_info
# The first n rows of the SFrame
trail_info_final_sf.head(5)
# ### [Recommender Systems](https://apple.github.io/turicreate/docs/userguide/recommender/)
#
# ### [Item Content Recommender](https://apple.github.io/turicreate/docs/api/generated/turicreate.recommender.item_content_recommender.ItemContentRecommender.html)
#
# Create a content-based recommender model in which the similarity between the items recommended is determined by the content of those items rather than learned from user interaction data.
#
# The similarity score between two items is calculated by first computing the similarity between the item data for each column, then taking a weighted average of the per-column similarities to get the final similarity. The recommendations are generated according to the average similarity of a candidate item to all the items in a user’s set of rated items.
# **Creating an `ItemContentRecommender`**
# - A recommender based on the similarity between item content rather using user interaction patterns to compute similarity.
# ### Model Variables: Elevation Gain, Distance, Trail Difficulty, Route Type, and Trail Tags
# Describe numeric columns
# Generates descriptive summary statistics of the central tendency, dispersion, and shape of the distribution
# Note: By default only numeric (int64) fields are returned
# Excludes "NaN" (missing) values
# Remove columns by specifying directly column names
# Note: inplace=True changes the original DataFrame
pd.set_option('max_rows', None, 'max_columns', None)
round(trail_info_final_df.drop(columns=['trail_id', 'stars', 'num_reviews', 'duration_mins', 'elevation_gain_ft', 'distance_miles', 'trail_difficulty_easy', 'trail_difficulty_hard', 'trail_difficulty_moderate', 'route_type_Loop', 'route_type_Out_and_Back', 'route_type_Point_to_Point']).describe(), 3)
pd.reset_option('display.max_rows')
pd.reset_option('display.max_columns')
# Trail tags to include in model
trail_tags = ['Backpacking', 'Beach', 'BirdWatching', 'Blowdown', 'Bugs', 'Camping', 'Cave', 'CityWalk', 'Closed', 'DogFriendly', 'DogsOnLeash', 'Fee', 'Fishing', 'Forest', 'Hiking', 'HistoricSite', 'HorsebackRiding', 'KidFriendly', 'Lake',
'MountainBiking', 'Muddy', 'NatureTrips', 'NoDogs', 'NoShade', 'OffTrail', 'OhvOffRoadDriving', 'OverGrown', 'PartiallyPaved', 'Paved', 'PrivateProperty', 'River', 'RoadBiking', 'RockClimbing', 'Rocky', 'Running', 'ScenicDriving',
'Scramble', 'Snow', 'Snowshoeing', 'StrollerFriendly', 'Views', 'Walking', 'WashedOut', 'Waterfall', 'WheelchairFriendly', 'WildFlowers', 'Wildlife']
# Create a list of model columns
model_cols = ['elevation_gain_ft', 'distance_miles', 'trail_difficulty_easy', 'trail_difficulty_moderate', 'trail_difficulty_hard', 'route_type_Loop', 'route_type_Out_and_Back', 'route_type_Point_to_Point']
model_cols = model_cols + trail_tags
# Create X
X = trail_info_final_sf[model_cols]
# Convert this SFrame to pandas.DataFrame
X_df = X.to_dataframe()
# Use MinMaxScaler when you do not assume that the shape of all your features follows a normal distribution.
# Instantiate a MinMaxScaler object and compute the minimum and maximum to be used for later scaling
# --> Transform features by scaling each feature to a given range, e.g. between zero and one (default)
scaler = MinMaxScaler()
scaler.fit(X_df)
# Scale features of X according to feature_range
X_scaled = scaler.transform(X_df)
X_scaled
# Create pandas dataframe from numpy array
X_scaled_df = pd.DataFrame(X_scaled, columns=model_cols)
# SFrame means scalable data frame. A tabular, column-mutable dataframe object that can scale to big data
# Note: data: array | pandas.DataFrame | string | dict, optional;
# the actual interpretation of this field is dependent on the 'format' parameter
# format: format of the data; the default, 'auto' will automatically infer the input data format
trail_info_final_item_data_sf = tc.SFrame(data=X_scaled_df, format='dataframe')
# Returns an SFrame with a new column.
# The number of elements in the data given must match the length of every other column of the SFrame
# Note: column_name: the name of the column
# inplace: whether the SFrame is modified in place; defaults to False
trail_info_final_item_data_sf = trail_info_final_item_data_sf.add_column(trail_info_final_sf['trail_id'], column_name='trail_id')
# Note: `ItemContentRecommender` shows a parameter for `similarity_metrics` in the documentation. However, the default option is `auto` which produces cosine similarity and this is the only similarity metric type available. `ItemSimilarityRecommender` allows ‘jaccard’, ‘cosine’, ‘pearson’ as similarity types.
#
# [Cosine Similarity](https://www.sciencedirect.com/topics/computer-science/cosine-similarity)
# Setting parameter `max_item_neighborhood_size` equal to the total number of trail items in order to get a similarity score and ranking for each trail.
k = len(trail_info_final_item_data_sf)
# Create a content-based recommender model in which the similarity between the items recommended is determined by
# the content of those items rather than learned from user interaction data
# Note: item_data: an SFrame giving the content of the items to use to learn the structure of similar items;
# the SFrame must have one column that matches the name of the item_id; this gives a unique identifier
# that can then be used to make recommendations;
# the rest of the columns are then used in the distance calculations below
# item_id: the name of the column in item_data (and observation_data, if given) that represents the item ID
# weights: if given, then weights must be a dictionary of column names present in 'item_data' to weights between
# the column names; if 'auto' (default) is given, the all columns are weighted equally
# similarity_metrics: similarity metric to use; 'auto' defaults to cosine
# max_item_neighborhood_size: for each item, we hold this many similar items to use when aggregating models for
# predictions; dcreasing this value decreases the memory required by the model and
# decreases the time required to generate recommendations, but it may also
# decrease recommendation accuracy
model_cosine = tc.recommender.item_content_recommender.create(item_data=trail_info_final_item_data_sf, item_id='trail_id', weights='auto', similarity_metrics='cosine', max_item_neighborhood_size=k)
model_cosine
# # Save the model. The model is saved as a directory which can then be loaded using the 'turicreate.load_model' method
# location = '/Users/yangweichle/Documents/Employment/TRAINING/DATA SCIENCE/SharpestMinds/Project/CaliforniaTrailFinder_app/TuriCreate_model'
# model_cosine.save(location)
# # Load any Turi Create model that was previously saved
# loaded_model_cosine = tc.load_model(location)
# ### Recommend Similar Items
# Drop table if exists
sql = 'DROP TABLE IF EXISTS recommender'
cursor.execute(sql)
# Store CREATE statements in Python dictionary TABLES
TABLES = {}
TABLES['recommender'] = (
"CREATE TABLE `recommender` ("
" `trail_id` INT(11) NOT NULL,"
" `similiar_trails` MEDIUMBLOB,"
" PRIMARY KEY (`trail_id`)"
") ENGINE=InnoDB")
# Create tables by iterating over the items of the TABLES dictionary
for table_name in TABLES:
table_description = TABLES[table_name]
try:
print('Creating table {}: '.format(table_name), end='')
cursor.execute(table_description)
except mysql.connector.Error as err:
if err.errno == errorcode.ER_TABLE_EXISTS_ERROR:
print('already exists')
else:
print(err.msg)
else:
print('OK')
#cursor.close()
#cnx.close()
def recommender(trail_id_list):
recommender_list = []
trails_added = 0
for trail_i in range(len(trail_id_list)):
# Recommend the 'k' highest scored items based on the interactions given in 'observed_items'
# Note: observed_items: a list/SArray of items to use to make recommendations, or an SFrame of items and optionally ratings and/or other interaction data;
# **the model will then recommend the most similar items to those given
# k: the number of recommendations to generate
item_recommender = model_cosine.recommend_from_interactions(observed_items=trail_id_list[trail_i:trail_i+1], k=k)
item_recommender_trail_id_list = item_recommender['trail_id'].to_numpy().tolist()
#print(item_recommender_trail_id_list)
item_recommender_score_list = item_recommender['score'].to_numpy().tolist()
#print(item_recommender_score_list)
similiar_trails = [list(a) for a in zip(item_recommender_trail_id_list, item_recommender_score_list)]
#print('\nSimiliar Trails:', similiar_trails)
trail_id = int(trail_id_list[trail_i:trail_i+1].to_numpy()[0])
#print('\nTrail ID:', trail_id)
list_item = [trail_id, similiar_trails]
# Return the pickled representation of the object as a bytes object
pickled_list = pickle.dumps(similiar_trails)
recommender_list.append(list_item)
values = (trail_id, pickled_list)
#print('Values:', values)
# Execute given statement using given parameters
# Note: If the record is a duplicate, then the IGNORE keyword tells MySQL to discard it silently without generating an error
cursor.execute('''
INSERT IGNORE INTO recommender (trail_id, similiar_trails)
VALUES (%s, %s)''', values)
rowcount = cursor.rowcount
if rowcount == 1:
trails_added += 1
print('Trails added:', trails_added)
# Commit current transaction
cnx.commit()
return recommender_list
trail_id_list = trail_info_final_sf['trail_id']
# Get Recommended Similar Items
get_time_start = time.time()
# Recommend the 'k' highest scored items based on the interactions given in 'observed_items'
# Note: observed_items: a list/SArray of items to use to make recommendations, or an SFrame of items and optionally ratings and/or other interaction data;
# **the model will then recommend the most similar items to those given
# k: the number of recommendations to generate
recommender_list = recommender(trail_id_list)
minutes = (time.time() - get_time_start)/60
print('Get Recommended Similar Items time: {} hrs: {} mins'.format(int(minutes // 60), round(minutes % 60)))