forked from SBodapati11/DataMavericks
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprocess_data.py
369 lines (283 loc) · 14.2 KB
/
process_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
import pandas as pd
import boto3
import io
import re
from pathlib import Path
import matplotlib.pyplot as plt
import matplotlib
from matplotlib.patches import Circle, Rectangle, Arc
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.cluster import KMeans
# Get the play-by-play and player data from S3.
# In-memory buffers that receive the downloaded parquet files.
buffer_pbp = io.BytesIO()
buffer_players = io.BytesIO()
# Create connection to S3.
# SECURITY: the AWS access key pair that used to be hardcoded on this line has
# been removed. boto3 now resolves credentials through its default provider
# chain (AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY environment variables, the
# shared ~/.aws/credentials file, or an attached IAM role). The key pair that
# was committed here is exposed in version control and must be rotated.
s3 = boto3.resource('s3')
# Read PBP data from S3
pbp_object = s3.Object('utd-hackathon', 'event_pbp.parquet')
pbp_object.download_fileobj(buffer_pbp)
# Rewind: the download leaves the stream position at the end of the buffer.
buffer_pbp.seek(0)
# Read the PBP data into a pandas dataframe
df_pbp = pd.read_parquet(buffer_pbp)
# Read players data from S3
players_object = s3.Object('utd-hackathon', 'game_players.parquet')
players_object.download_fileobj(buffer_players)
buffer_players.seek(0)
# Read the players data into a pandas dataframe and save a local copy
df_players = pd.read_parquet(buffer_players)
df_players.to_parquet(str(Path.cwd()) + '/data/player_data.parquet')
# Keywords to look for in the play-by-play message descriptions.
# Descriptions are matched after punctuation is stripped and the text is
# lowercased; shots are later flagged as missed ('Missed') or made ('Made'),
# and assists and fouls are picked up as well.
# Order matters: the classification loop scans this list front to back.
offensive_plays = [
    "turnaround fadeaway shot",
    "free throw",
    "3pt shot",
    "driving finger roll layup shot",
    "turnover lost ball",
    "violation kicked ball",
    "jump ball",
    "turnover bad pass",
    "floating jump shot",
    "fadeaway jump shot",
    "driving floating jump shot",
    "driving layup shot",
    "running layup shot",
    "pullup jump shot",
    "driving dunk shot",
    "jump shot",
    "cutting layup shot",
    "tip layup shot",
    "driving floating bank jump shot",
    "running jump shot",
    "cutting dunk shot",
    "turnover traveling",
    "turnaround hook shot",
    "tip dunk shot",
    "turnover offensive foul",
    "step back jump shot",
    "assist",
    "dunk shot",
    "hook shot",
    "running finger roll layup",
    "alley oop layup shot",
    "turnaround jump shot",
    "running pullup jump shot",
    "turnover out of bounds",
]
#offensive_plays_basic = ["fadeaway","jump","layup", "turnover","hook", "pullup", "step","3pt","dunk", "free"]
defensive_plays = [
    "steal",
    "block",
    "rebound",
]
# Get the data for the Mavericks Regular Season
seasons = ["Regular"]
team_names = ['DAL']
mavs_pbp = df_pbp.loc[df_pbp['team'].isin(team_names)]
mavs_pbp_season = mavs_pbp.loc[mavs_pbp['seasonType'].isin(seasons)]
mavs_pbp_season = mavs_pbp_season.reset_index(drop=True)

# Compiled once; matches every character that is neither a word character
# nor whitespace.
_PUNCTUATION = re.compile(r'[^\w\s]')


def _normalize_description(description):
    """Return *description* without punctuation, lowercased, single-spaced.

    Non-string values (e.g. a missing description) normalize to the empty
    string so that they simply match no keyword.
    """
    if not isinstance(description, str):
        return ''
    text = _PUNCTUATION.sub('', description).lower()
    return ' '.join(part for part in text.split(" ") if part != '')


def _most_specific_match(text, keywords):
    """Return the keyword from *keywords* that best labels *text*, or None.

    The first keyword (in list order) that occurs in *text* decides the match,
    which keeps the list's priority order. That hit is then upgraded to the
    longest keyword that contains it and also occurs in *text*. Without this
    step a generic entry such as "jump shot" shadows more specific entries
    listed after it, such as "turnaround jump shot".
    """
    for keyword in keywords:
        if keyword in text:
            refinements = [k for k in keywords if keyword in k and k in text]
            return max(refinements, key=len)
    return None


# Classify every play in a single pass: offensive label, defensive label and
# whether a shot was missed (1), made (0) or neither (-1).
offensive_labels = []
defensive_labels = []
missed_flags = []
for description in mavs_pbp_season['description']:
    msg_str = _normalize_description(description)
    offensive_label = _most_specific_match(msg_str, offensive_plays)
    # Any mention of a foul overrides the offensive label.
    if 'foul' in msg_str:
        offensive_label = 'foul'
    offensive_labels.append(offensive_label)
    defensive_labels.append(_most_specific_match(msg_str, defensive_plays))
    if 'missed' in msg_str:
        missed_flags.append(1)
    elif 'made' in msg_str:
        missed_flags.append(0)
    else:
        missed_flags.append(-1)

# Add the columns that denote whether a play is offensive, defensive, and
# whether the shot was missed. Unmatched plays hold a null label.
mavs_pbp_season['offensive_play'] = offensive_labels
mavs_pbp_season['defensive_play'] = defensive_labels
mavs_pbp_season['off_missed'] = missed_flags
mavs_pbp_season.to_parquet(Path.cwd() / 'data' / 'mavs_pbp_season.parquet')
# Function to draw a half-court basketball court (mostly) to scale
def draw_basketball_court(court):
    """Draw half-court markings onto *court* and return it.

    Args:
        court: The matplotlib axes to draw on. It is modified in place: lines
            and patches are added, ticks are removed, and the x/y limits and
            background colour are set.

    Returns:
        The same axes object that was passed in.
    """
    line_style = dict(facecolor='none', edgecolor='black', lw=2)

    # 3-point arc and the straight 3-point lines running down to the baseline
    three_point_arc = Arc((0, 140), 440, 315, theta1=0, theta2=180, **line_style)
    court.plot([-220, -220], [0, 140], linewidth=2, color='black')
    court.plot([220, 220], [0, 140], linewidth=2, color='black')

    # Rim
    rim = Circle((0, 60), 15, **line_style)

    # Paint area (outer and inner rectangles)
    outer_rectangle = Rectangle((-80, 0), 160, 190, fill=False, lw=2, edgecolor='black')
    inner_rectangle = Rectangle((-60, 0), 120, 190, fill=False, lw=2, edgecolor='black')

    # Free throw circle: solid upper half, dotted lower half.
    free_throw_arc = Arc((0, 190), 120, 120, theta1=0, theta2=180, **line_style)
    # The lower half was previously drawn with ls='-' (solid), which
    # contradicted its name; ':' is matplotlib's dotted line style.
    dotted_free_throw_arc = Arc((0, 190), 120, 120, theta1=-180, theta2=0, ls=':', **line_style)

    # Backboard
    court.plot([-30, 30], [40, 40], linewidth=2, color='black')

    # Add all the patch components
    for component in (three_point_arc, free_throw_arc, dotted_free_throw_arc,
                      rim, outer_rectangle, inner_rectangle):
        court.add_artist(component)

    # Remove axes ticks and set the dimensions and colour of the court
    court.set_xticks([])
    court.set_yticks([])
    court.set_xlim(-250, 250)
    court.set_ylim(0, 470)
    court.set_facecolor('#dfbb85')
    return court
# Render an empty court on axes that fill the whole figure, then write the
# image to the analytics_images folder under the current working directory.
court_figure = plt.figure(figsize=(4, 3.76))
court = draw_basketball_court(court_figure.add_axes([0, 0, 1, 1]))
plt.savefig("".join([str(Path.cwd()), "/analytics_images/empty_court.png"]))
# Line up data
# Encode the starting position as an integer category code.
df_players['startPos_cat'] = df_players['startPos'].astype('category').cat.codes
# Normalize the data
# Specify the columns you want to normalize
columns_to_normalize = ['teamMargin', 'secPlayed', 'fgm', 'fga', 'ftm', 'fta', 'tpm',
                        'tpa', 'oreb', 'dreb', 'reb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts',
                        'plusMinus', 'flagrants', 'techs', 'ejections', 'blkA', 'fbPts', 'fbM',
                        'fbA', 'pitp', 'pitpM', 'pitpA', 'secChancePts', 'secChanceM',
                        'secChanceA', 'startPos_cat', 'isOnCourt', 'boxScoreOrder', 'teamPts', 'oppPts']
# Create a MinMaxScaler instance
scaler = MinMaxScaler()
# NOTE: df is an alias of df_players, not a copy, so the scaled values written
# below also replace the originals in df_players for all later code.
df = df_players
# Fit and transform the selected columns
normalized_columns = scaler.fit_transform(df[columns_to_normalize])
# Replace the original columns with the normalized ones
df[columns_to_normalize] = normalized_columns
mavs_players = df[df['team'] == 'DAL']
# numeric_only=True: the frame also holds string columns (e.g. 'team',
# 'startPos'); pandas >= 2.0 raises when asked to average those, while older
# versions silently dropped them. This keeps the older result on every version.
mavs_players_grouped = (
    mavs_players.groupby(['name', 'opponent']).mean(numeric_only=True).reset_index()
)
selected_columns = ['gs', 'fgm', 'fga', 'reb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts',
                    'plusMinus', 'flagrants', 'secPlayed', 'oreb', 'dreb']
mavs_stats = mavs_players_grouped[selected_columns]
pca = PCA(n_components=2)
principal_components = pca.fit_transform(mavs_stats)
# Use the first principal component as the offensive score and the second as
# the defensive score.
offensive_score = principal_components[:, 0]
defensive_score = principal_components[:, 1]
mavs_players_grouped = mavs_players_grouped.assign(offensive_score=offensive_score, defensive_score=defensive_score)
# Starting position grouping
# NOTE(review): startPos_cat is part of columns_to_normalize and is then
# averaged by the groupby above, so the values used as position categories
# here are scaled, averaged codes rather than the raw category codes - confirm
# this is intended.
grouped_positions = mavs_players_grouped.groupby('startPos_cat')
unique_positions = mavs_players_grouped['startPos_cat'].unique()
players_by_position = {
    pos_cat: mavs_players_grouped[mavs_players_grouped['startPos_cat'] == pos_cat]
    for pos_cat in unique_positions
}
unique_opponents = mavs_players_grouped['opponent'].unique()
# Function to generate a lineup of at most 5 unique players using KMeans clustering
def generate_unique_lineup_kmeans(players_by_position_opponent, unique_positions):
    """Build a lineup of up to five distinct players.

    One player is taken per position category, in the order given by
    *unique_positions*, until five slots are filled. For a position with two
    or more candidates, the candidates are split into two KMeans clusters on
    their offensive/defensive scores and one player is sampled at random from
    the cluster with the higher mean scores. Any slots still open afterwards
    are filled from the position whose remaining candidates have the highest
    top scores.

    Compared with the previous implementation this never returns more than
    five rows, never selects the same row twice, never samples from an empty
    position, and does not add a 'cluster' column to the caller's frames.

    Args:
        players_by_position_opponent: Mapping from position category to a
            DataFrame of candidates with 'offensive_score' and
            'defensive_score' columns. Rows are identified by their index
            labels, which are expected to be unique across the frames.
        unique_positions: Iterable of position categories to consider; each
            must be a key of *players_by_position_opponent*.

    Returns:
        A DataFrame with one row per selected player and a fresh 0..n-1
        index. It has fewer than five rows when fewer than five distinct
        players are available. Rows chosen through clustering carry an extra
        'cluster' column.

    Raises:
        KeyError: If a position is missing from the mapping.
        ValueError: If no player is available at all (nothing to concatenate).
    """
    score_columns = ['offensive_score', 'defensive_score']
    lineup_size = 5
    lineup = []
    chosen_index = []

    def _available(pos_cat):
        # Candidates for this position that are not in the lineup yet.
        players = players_by_position_opponent[pos_cat]
        return players.loc[~players.index.isin(chosen_index)]

    def _take(selected_player):
        lineup.append(selected_player)
        chosen_index.extend(selected_player.index)

    for pos_cat in unique_positions:
        if len(lineup) >= lineup_size:
            break
        candidates = _available(pos_cat)
        if candidates.empty:
            continue
        if len(candidates) >= 2:
            # Apply KMeans clustering to find the most suitable group of players
            kmeans = KMeans(n_clusters=2, random_state=42).fit(candidates[score_columns])
            # assign() returns a new frame, leaving the caller's data untouched
            clustered = candidates.assign(cluster=kmeans.labels_)
            # Determine the cluster with the highest average offensive and defensive scores
            cluster_summary = clustered.groupby('cluster')[score_columns].mean()
            best_cluster = cluster_summary.idxmax().mode().iloc[0]
            # Select a player from the best cluster
            _take(clustered[clustered['cluster'] == best_cluster].sample(n=1))
        else:
            # Select the only available player for the position
            _take(candidates)

    # Assign remaining slots from the position with the highest-scoring
    # candidates that are still available.
    while len(lineup) < lineup_size:
        best_pool = None
        best_score = None
        for pos_cat in unique_positions:
            pool = _available(pos_cat)
            if pool.empty:
                continue
            pool_score = pool[score_columns].max().mean()
            if best_score is None or pool_score > best_score:
                best_pool, best_score = pool, pool_score
        if best_pool is None:
            # Every available player is already in the lineup.
            break
        _take(best_pool.sample(n=1))

    lineup_df = pd.concat(lineup).reset_index(drop=True)
    return lineup_df
# Generate lineups for each opponent
all_opponent_lineups = {}
for opponent in unique_opponents:
    opponent_players = mavs_players_grouped[mavs_players_grouped['opponent'] == opponent]
    # Players by position category for this specific opponent
    players_by_position_opponent = {
        pos_cat: opponent_players[opponent_players['startPos_cat'] == pos_cat]
        for pos_cat in unique_positions
    }
    # Generate 10 candidate lineups per opponent
    lineups = []
    for _ in range(10):
        lineup_df = generate_unique_lineup_kmeans(players_by_position_opponent, unique_positions)
        # Determine if the lineup is better for offense or defense
        offense_strength = lineup_df['offensive_score'].mean()
        defense_strength = lineup_df['defensive_score'].mean()
        play_type = 'Offense' if offense_strength > defense_strength else 'Defense'
        lineups.append({
            'Lineup': lineup_df['name'].tolist(),
            'Play Type': play_type,
            'Offensive Strength': offense_strength,
            'Defensive Strength': defense_strength,
        })
    all_opponent_lineups[opponent] = lineups

# Flatten the per-opponent summaries into one table. Rows are collected first
# and the frame is built once, instead of growing it one .loc row at a time,
# and each output column is mapped from its summary key by name rather than
# by dict iteration order. '#' numbers the lineups per opponent from 1.
columns = ['team', '#', 'lineup', 'play_type', 'offense_strength', 'defense_strength']
lineup_rows = [
    {
        'team': opponent,
        '#': number,
        'lineup': summary['Lineup'],
        'play_type': summary['Play Type'],
        'offense_strength': summary['Offensive Strength'],
        'defense_strength': summary['Defensive Strength'],
    }
    for opponent, opponent_lineups in all_opponent_lineups.items()
    for number, summary in enumerate(opponent_lineups, start=1)
]
lineups_df = pd.DataFrame(lineup_rows, columns=columns)
lineups_df.to_parquet(str(Path.cwd()) + "/data/lineups.parquet")
# Compute the mean of the numeric stats per 'name' and 'opponent'.
# numeric_only=True: df_players also holds string columns (e.g. 'startPos',
# 'team'); pandas >= 2.0 raises when asked to average those, while older
# versions silently dropped them.
player_groups = df_players.groupby(['name', 'opponent'])
mean_stats = player_groups.mean(numeric_only=True).reset_index()
# First startPos and team value seen for each 'name' and 'opponent' combination
startPos_data = player_groups['startPos'].first().reset_index()
team_data = player_groups['team'].first().reset_index()
# Add the startPos and team columns to the mean_stats dataframe. Merging on
# the group keys ties each value to its row explicitly instead of relying on
# the separate groupby results happening to share the same row order.
mean_stats = mean_stats.merge(startPos_data, on=['name', 'opponent'], how='left')
mean_stats = mean_stats.merge(team_data, on=['name', 'opponent'], how='left')
# Define the criteria for offensive and defensive players
offensive_positions = ['PG', 'SG']
defensive_positions = ['C', 'PF', 'SF']
# Create two dataframes: one for offensive players and one for defensive
# players. They are explicit copies because a 'score' column is added to each
# later on; writing to a filtered slice triggers SettingWithCopyWarning.
offensive_players = mean_stats[mean_stats['startPos'].isin(offensive_positions)].copy()
defensive_players = mean_stats[mean_stats['startPos'].isin(defensive_positions)].copy()
def preprocess_data(players_df):
    """Build the model feature matrix from per-player statistics.

    Selects the box-score columns plus 'opponent', one-hot encodes the
    opponent, and min-max scales every box-score column to [0, 1].

    A column whose values are all identical has a zero range; it is scaled to
    0 instead of producing NaN through 0/0.

    Args:
        players_df: DataFrame containing the box-score columns listed below
            and an 'opponent' column. It is not modified.

    Returns:
        A new DataFrame with the scaled box-score columns followed by one
        'opponent_<value>' indicator column per distinct opponent.
    """
    stat_columns = ['fgm', 'fga', 'ftm', 'fta', 'tpm', 'tpa', 'oreb', 'dreb',
                    'reb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts', 'plusMinus']
    # One-hot encode the opponent team; get_dummies returns a new frame.
    X = pd.get_dummies(players_df[stat_columns + ['opponent']],
                       columns=['opponent'], prefix='opponent')
    # Normalize the data (excluding the one-hot encoded columns)
    stats = X[stat_columns]
    lowest = stats.min()
    # Dividing by 1 where the range is 0 maps a constant column to 0.
    value_range = (stats.max() - lowest).replace(0, 1)
    X[stat_columns] = (stats - lowest) / value_range
    return X
def train_model(X, y):
    """Fit a random forest regressor on an 80/20 split of (X, y).

    The split and the forest both use random_state=42. The mean squared error
    on the held-out 20% is computed but neither reported nor returned.

    Returns:
        The fitted RandomForestRegressor.
    """
    split = train_test_split(X, y, test_size=0.2, random_state=42)
    features_train, features_test, target_train, target_test = split
    forest = RandomForestRegressor(n_estimators=100, random_state=42)
    forest.fit(features_train, target_train)
    holdout_predictions = forest.predict(features_test)
    holdout_mse = mean_squared_error(target_test, holdout_predictions)
    return forest
# Preprocess the data
offensive_X = preprocess_data(offensive_players)
defensive_X = preprocess_data(defensive_players)
# Use the pts column as target for the model
# NOTE(review): preprocess_data also keeps 'pts' among the features, so the
# model is given the value it is asked to predict - confirm this is intended.
offensive_y = offensive_players['pts']
defensive_y = defensive_players['pts']
# Train the models for offensive and defensive players
offensive_model = train_model(offensive_X, offensive_y)
defensive_model = train_model(defensive_X, defensive_y)
# Generate scores for players. assign() builds a new frame instead of writing
# a column into what may be a filtered slice of mean_stats, which avoids
# pandas' SettingWithCopyWarning.
offensive_players = offensive_players.assign(score=offensive_model.predict(offensive_X))
defensive_players = defensive_players.assign(score=defensive_model.predict(defensive_X))
offensive_players.to_parquet(str(Path.cwd()) + "/data/offensive_players.parquet")
defensive_players.to_parquet(str(Path.cwd()) + "/data/defensive_players.parquet")