-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathWorld_Cup_Model.py
260 lines (241 loc) · 11.6 KB
/
World_Cup_Model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
import pandas as pd
import time
import copy
from Getting_Current_Ratings import time_sensitive_elo_dict, home_advantage_elo_boost, df, bat_first_elo_dict
import random
import statistics
print("Home Advantage:", home_advantage_elo_boost)
print()

# Parse the "Date" column as datetimes so the year can be read off it.
df['Date'] = pd.to_datetime(df['Date'])

# Keep only the matches that belong to the 2023 World Cup.
played_in_2023 = df['Date'].dt.year == 2023
is_world_cup = df['Series Type'] == 'world-cup'
wc_2023_matches = df[played_in_2023 & is_world_cup]

# Venue of every World Cup group-stage fixture.
wc_2023_venue_df = pd.read_csv("World_Cup_2023_GS_Grounds.csv")

# Ground data, restricted to grounds in India, mapped as {ground name: batting-first Elo boost}.
ground_data = pd.read_csv("ODI Grounds.csv")
ground_data = ground_data[ground_data["Country"] == "India"]
ground_bf_dict = dict(zip(ground_data["Ground Name"], ground_data["Batting First Elo Boost"]))

# India gets a boost from being the hosts of the tournament.
time_sensitive_elo_dict['India'] += home_advantage_elo_boost

wc_teams = [
    'India', 'Australia', 'New Zealand', 'England', 'South Africa',
    'Pakistan', 'Sri Lanka', 'Bangladesh', 'Afghanistan', 'Netherlands',
]
def match_simulation(team_1, team_2, team_1_elo, team_2_elo, ground_bf_elo_boost, *,
                     first_innings_runs=285, innings_overs=50, margin_sd=1.64):
    """Simulate one match from Elo ratings and return a synthetic scorecard.

    A coin toss decides who bats first. The side batting first has its Elo adjusted by the
    difference between the two teams' batting-first ratings (read from the module-level
    ``bat_first_elo_dict``) plus the venue's batting-first boost. The Elo formula then gives
    team 1's win probability, and team 1's net run rate for the match is drawn from a normal
    distribution whose mean is chosen so that P(net run rate > 0) equals that probability.

    Args:
        team_1, team_2: team names; both must be keys of ``bat_first_elo_dict``.
        team_1_elo, team_2_elo: current Elo ratings of the two teams.
        ground_bf_elo_boost: Elo boost the venue gives to the side batting first.
        first_innings_runs: score given to the side batting first. Any value works; the
            default of 285 is around the average for an ODI innings in the modern age.
        innings_overs: length of a full innings in overs.
        margin_sd: standard deviation of the match net-run-rate distribution.

    Returns:
        ``[team_1_runs, team_1_overs, team_2_runs, team_2_overs]``. The side batting first
        always has ``first_innings_runs`` off ``innings_overs``. A successful chase is
        ``first_innings_runs + 1`` in however many overs reproduce the sampled net run rate;
        a failed chase uses the full overs and a run total that is never negative.
    """
    # Coin toss: 0 means team 1 bats first.
    team_1_bats_first = random.randrange(0, 2) == 0
    if team_1_bats_first:
        team_1_elo += bat_first_elo_dict[team_1] - bat_first_elo_dict[team_2] + ground_bf_elo_boost
    else:
        team_2_elo += bat_first_elo_dict[team_2] - bat_first_elo_dict[team_1] + ground_bf_elo_boost

    # uses the elo formula to get the two-outcome win probability
    team_1_wl = 1 / (10 ** ((team_2_elo - team_1_elo) / 400) + 1)
    # Mean margin such that a N(mean, margin_sd) draw is positive with probability team_1_wl.
    team_1_margin_mean = statistics.NormalDist(0, margin_sd).inv_cdf(team_1_wl)
    # random.gauss samples the same distribution as inv_cdf(random.random()) but cannot fail
    # on a uniform draw of exactly 0.0 and avoids the pure-Python inverse CDF.
    team_1_nrr = random.gauss(team_1_margin_mean, margin_sd)

    first_innings_rate = first_innings_runs / innings_overs
    chase_total = first_innings_runs + 1
    if team_1_bats_first:
        team_1_runs = first_innings_runs
        team_1_overs = innings_overs
        if team_1_nrr > 0:
            # Team 2 fails to chase. Clamped at 0 because a margin larger than the
            # first-innings run rate would otherwise produce a negative total.
            team_2_overs = innings_overs
            team_2_runs = max(0, team_1_runs - (team_1_nrr * team_1_overs))
        else:
            # Team 2 chases the target; the overs taken encode the margin.
            team_2_runs = chase_total
            team_2_overs = team_2_runs / (first_innings_rate - team_1_nrr)
    else:
        team_2_runs = first_innings_runs
        team_2_overs = innings_overs
        if team_1_nrr > 0:
            # Team 1 chases the target; the overs taken encode the margin.
            team_1_runs = chase_total
            team_1_overs = team_1_runs / (first_innings_rate + team_1_nrr)
        else:
            # Team 1 fails to chase; same clamp as above.
            team_1_overs = innings_overs
            team_1_runs = max(0, team_2_runs + (team_1_nrr * team_1_overs))
    return [team_1_runs, team_1_overs, team_2_runs, team_2_overs]
def sort_table_dict(item):
    """Sort key for one league-table entry: points first, net run rate as the tie-breaker.

    ``item`` is a ``(team, stats)`` pair as produced by ``dict.items()`` on a league table;
    ``stats[5]`` holds the points and ``stats[4]`` the net run rate.
    """
    _, row = item
    return row[5], row[4]
# The round-robin stage is the first 45 recorded matches; anything after that is a knockout game.
# The knockouts are split off BEFORE the league table is built, so that semi-final and final
# results are not counted as group-stage points, runs and overs.
# NOTE(review): this relies on wc_2023_matches being in chronological order - confirm.
knockouts_started = len(wc_2023_matches) > 45
if knockouts_started:
    # the knockout rounds are recorded in a separate data frame
    knockout_matches = wc_2023_matches.iloc[45:]
    wc_2023_matches = wc_2023_matches.iloc[0:45]

# dictionary in the form of {Team: [Total Runs, Total Overs Batted, Total Runs Conceded, Total Overs Bowled,
# Net Run Rate, Points]}
wc_table = {team: [0, 0, 0, 0, 0, 0] for team in wc_teams}
# list of group-stage fixtures completed, each stored as [batting first, batting second]
fixtures_completed = []
for match_num, match_facts in wc_2023_matches.iterrows():
    winner = match_facts["Winner"]
    bf = match_facts["Batting First"]
    bs = match_facts["Batting Second"]
    # two points for a win, one point each for a no result; any other "Winner" value awards no points
    if winner == bf:
        wc_table[bf][5] += 2
    elif winner == bs:
        wc_table[bs][5] += 2
    elif winner == 'No Result':
        wc_table[bf][5] += 1
        wc_table[bs][5] += 1
    # gets the score information; the overs used for net run rate are derived as runs / adjusted run rate
    bf_adj_rr = match_facts["Team 1 Adjusted Run Rate"]
    bs_adj_rr = match_facts["Team 2 Adjusted Run Rate"]
    bf_runs = match_facts["Team 1 Runs"]
    bs_runs = match_facts["Team 2 Runs"]
    bf_adj_overs = bf_runs / bf_adj_rr
    bs_adj_overs = bs_runs / bs_adj_rr
    # this is to fix the run rate in the match between Pakistan and New Zealand
    if winner == 'Pakistan' and bf == 'New Zealand':
        bf_runs = 179
        bf_adj_overs = 25.5
    # because England lost 9 wickets and not 10, the original data extraction does not consider England all out.
    # The last wicket was lost due to a batsman being absent hurt, but the ICC considers England to be all out.
    elif winner == 'South Africa' and bs == 'England':
        bs_adj_overs = 50
    # adds the score info to the table
    wc_table[bf][0] += bf_runs
    wc_table[bs][0] += bs_runs
    wc_table[bf][1] += bf_adj_overs
    wc_table[bs][1] += bs_adj_overs
    wc_table[bf][2] += bs_runs
    wc_table[bs][2] += bf_runs
    wc_table[bf][3] += bs_adj_overs
    wc_table[bs][3] += bf_adj_overs
    # adds the match to list of fixtures completed
    fixtures_completed.append([bf, bs])

# dictionary in the form of {Team: [Avg_Pos, Avg_NRR, Avg_Points, 1st, 2nd, 3rd, 4th, Make SF, Make F, Win F]}
wc_sims_table = {team: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] for team in wc_teams}
start_time = time.time()

# Everything in this section is identical in every simulation, so it is worked out once here
# rather than inside the loop (the venue lookup in particular is a DataFrame filter).
completed_pairs = {frozenset(fixture) for fixture in fixtures_completed}
# (team_1, team_2, batting-first Elo boost of the venue) for every fixture still to be played
remaining_fixtures = []
for team_num1, team_1 in enumerate(wc_teams):
    # pairing each team only with the teams listed before it gives one match per pairing
    # and no team playing itself
    for team_2 in wc_teams[:team_num1]:
        if frozenset((team_1, team_2)) in completed_pairs:
            # skips over completed matches
            continue
        # finds the venue information
        involves_team_1 = (wc_2023_venue_df["First Team"] == team_1) | (wc_2023_venue_df["Second Team"] == team_1)
        involves_team_2 = (wc_2023_venue_df["First Team"] == team_2) | (wc_2023_venue_df["Second Team"] == team_2)
        ground = wc_2023_venue_df[involves_team_1 & involves_team_2].iloc[0]["Ground"]
        remaining_fixtures.append((team_1, team_2, ground_bf_dict[ground]))

# Winners of the knockout matches already played, in the order they were recorded.
# NOTE(review): assumes the rows are semi-final 1 (1st v 4th), semi-final 2 (2nd v 3rd), then the
# final - confirm against the data.
recorded_knockout_winners = list(knockout_matches["Winner"]) if knockouts_started else []
final_ground_boost = ground_bf_dict["Narendra Modi Stadium"]

# NOTE: the 10000 here must match the divisor used when the results are averaged afterwards.
for sim in range(10000):
    sim_wc_table = copy.deepcopy(wc_table)
    # simulates every group match that hasn't been completed
    for team_1, team_2, bf_ground_elo_boost in remaining_fixtures:
        sim_match = match_simulation(team_1, team_2, time_sensitive_elo_dict[team_1],
                                     time_sensitive_elo_dict[team_2], bf_ground_elo_boost)
        # sim_match is [team_1 runs, team_1 overs, team_2 runs, team_2 overs]; team 1's "for"
        # columns (0, 1) are team 2's "against" columns (2, 3) and vice versa
        for column, score_piece in enumerate(sim_match):
            sim_wc_table[team_1][column] += score_piece
            sim_wc_table[team_2][(column + 2) % 4] += score_piece
        if sim_match[0] > sim_match[2]:
            sim_wc_table[team_1][5] += 2
        else:
            sim_wc_table[team_2][5] += 2
    # calculates Net Run Rate
    for standings_info in sim_wc_table.values():
        standings_info[4] = (standings_info[0] / standings_info[1]) - (standings_info[2] / standings_info[3])
    # sorts the table into final group stage positions
    final_sim_wc_table = dict(sorted(sim_wc_table.items(), key=sort_table_dict, reverse=True))
    # adds group stage info to simulation summary data
    semifinalists = []
    for rank, (team, standings_info) in enumerate(final_sim_wc_table.items(), start=1):
        wc_sims_table[team][0] += rank
        wc_sims_table[team][1] += standings_info[4]
        wc_sims_table[team][2] += standings_info[5]
        if rank <= 4:
            # columns 3-6 count 1st-4th place finishes, column 7 counts semi-final appearances
            wc_sims_table[team][2 + rank] += 1
            wc_sims_table[team][7] += 1
            semifinalists.append(team)
    # 1st plays 4th and 2nd plays 3rd
    sfs = [[semifinalists[0], semifinalists[3]], [semifinalists[1], semifinalists[2]]]
    # determining venues for both semifinals
    if "Pakistan" in sfs[0]:
        venues = ["Eden Gardens", "Wankhede Stadium"]
    elif ("Pakistan" not in semifinalists) and ("India" in sfs[1]):
        venues = ["Eden Gardens", "Wankhede Stadium"]
    else:
        venues = ["Wankhede Stadium", "Eden Gardens"]
    # semifinal stage: a semifinal that has been played uses its recorded winner, otherwise it is
    # simulated. Either way the winner goes into `finalists` and is credited with making the final.
    finalists = []
    for sf_num, sf in enumerate(sfs):
        if sf_num < len(recorded_knockout_winners):
            sf_winner = recorded_knockout_winners[sf_num]
        else:
            team_1, team_2 = sf
            sim_match = match_simulation(team_1, team_2, time_sensitive_elo_dict[team_1],
                                         time_sensitive_elo_dict[team_2], ground_bf_dict[venues[sf_num]])
            sf_winner = team_1 if sim_match[0] > sim_match[2] else team_2
        finalists.append(sf_winner)
        wc_sims_table[sf_winner][8] += 1
    # world cup final stage: uses the recorded result if the final has been played
    if len(recorded_knockout_winners) > 2:
        champion = recorded_knockout_winners[2]
    else:
        team_1, team_2 = finalists
        sim_match = match_simulation(team_1, team_2, time_sensitive_elo_dict[team_1],
                                     time_sensitive_elo_dict[team_2], final_ground_boost)
        champion = team_1 if sim_match[0] > sim_match[2] else team_2
    wc_sims_table[champion][9] += 1
    # time updates
    if (sim + 1) % 100 == 0:
        print("Simulations", (sim + 1) / 100, "% complete")
        current_time = time.time()
        # extrapolates the total run time from the fraction of simulations finished so far
        expected_total_time = (current_time - start_time) / ((sim + 1) / 10000)
        time_left_minutes = round((expected_total_time - (current_time - start_time)) / 60, 2)
        print(time_left_minutes, "Minutes left")
end_time = time.time()
print()
print("World Cup Simulated in", round((end_time - start_time) / 60, 2), "Minutes")

pd.set_option("display.max_columns", None)  # Display all columns
pd.set_option("display.expand_frame_repr", False)  # Prevent line-wrapping
pd.set_option("display.width", None)  # Auto-adjust the column width

# puts the information into a Data Frame, one row per team
world_cup_sim_summary_df = pd.DataFrame(columns=["Avg Pos", "Avg NRR", "Avg Pts", "1st", "2nd", "3rd", "4th",
                                                 "Make SF", "Make Final", "Win World Cup"],
                                        data=list(wc_sims_table.values()))
# turns the totals accumulated over the 10000 simulations into per-simulation averages / frequencies
world_cup_sim_summary_df = world_cup_sim_summary_df / 10000
world_cup_sim_summary_df.insert(0, "Team", list(wc_sims_table.keys()))
# Sorts by Average Position
world_cup_sim_summary_df.sort_values(by='Avg Pos', inplace=True)
# creates a new index for how the Teams will be viewed in Data Frames; sized from the frame itself
# rather than assuming exactly ten teams
world_cup_sim_summary_df['Pos'] = list(range(1, len(world_cup_sim_summary_df) + 1))
world_cup_sim_summary_df.set_index("Pos", inplace=True)
# the CSV keeps the unformatted numbers; the formatting below only affects what is printed
world_cup_sim_summary_df.to_csv("2023_World_Cup_Expected_Results.csv", index=True, header=True)

# Round and format the percentage columns.
# Series.map is applied column by column because DataFrame.applymap is deprecated since pandas 2.1.
percentage_cols = ["1st", "2nd", "3rd", "4th", "Make SF", "Make Final", "Win World Cup"]
world_cup_sim_summary_df[percentage_cols] = world_cup_sim_summary_df[percentage_cols].apply(
    lambda col: col.map(lambda x: f'{x:.0%}'))
world_cup_sim_summary_df[["Avg Pos", "Avg Pts"]] = world_cup_sim_summary_df[["Avg Pos", "Avg Pts"]].round(1)
world_cup_sim_summary_df["Avg NRR"] = world_cup_sim_summary_df["Avg NRR"].round(3)
# only the headline columns are printed
world_cup_sim_summary_df = world_cup_sim_summary_df[["Team", "Make Final", "Win World Cup"]]
print(world_cup_sim_summary_df)