wrapper_rtb.py
# Copyright (c) 2023, Haruka Kiyohara, Ren Kishimoto, HAKUHODO Technologies Inc., and Hanjuku-kaso Co., Ltd. All rights reserved.
# Licensed under the Apache 2.0 License.
"""Customization of RTBEnv."""
from typing import Tuple, Optional, Union, Any
import gym
from gym.spaces import Box, Discrete
from sklearn.base import BaseEstimator
from sklearn.utils import check_scalar
import numpy as np
from .rtb import RTBEnv
from ..utils import check_array
from ..types import Action, Numeric


class CustomizedRTBEnv(gym.Env):
"""Wrapper class for RTBEnv to customize RL action space and bidder by decision makers.
Bases: :class:`gym.Env`
Imported as: :class:`rtbgym.CustomizedRTBEnv`
Note
-------
    Users can customize the following three decision-making components using CustomizedRTBEnv.

    - reward_predictor in Bidder class
        We use the predicted reward to calculate the bid price as follows
        (see also the illustrative sketch after this note).

        :math:`{bid price}_{t, i} = {adjust rate}_{t} \\times {predicted reward}_{t,i} ( \\times const.)`

    - scaler in Bidder class
        Scaler defines const. in the bid price calculation as follows.

        :math:`const. = scaler \\times {standard bid price}`

        where standard_bid_price indicates the average of the standard bid price
        (i.e., the bid price that has approximately a 50% impression probability) over all ads.

    - action space for agent
        We transform the continuous adjust rate space :math:`[0, \\infty)` into the agent action space.
        Both discrete and continuous actions are acceptable.
        Note that we recommend setting the action space within [0.1, 10].
        Instead, you can tune the multiplicative factor of the adjust rate using scaler.
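
    As a rough, illustrative sketch of the bid price calculation above (not part of the class API;
    all variable names and values below are placeholders chosen for illustration):

    .. code-block:: python

        # illustrative sketch of the bid price calculation described above
        adjust_rate = 1.5          # RL agent action at timestep t (placeholder value)
        predicted_reward = 0.02    # reward predicted for a given auction (placeholder value)
        scaler = 0.2               # user-specified or auto-fitted constant (placeholder value)
        standard_bid_price = 50    # average standard bid price over all ads (placeholder value)

        const = scaler * standard_bid_price
        bid_price = adjust_rate * predicted_reward * const  # = 0.3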

    Constrained Markov Decision Process (CMDP) definition is given as follows:
        timestep: int (> 0)
            For instance, set 24 (hours in a day) or 7 (days in a week).
            We have `search_volume` auctions during each timestep.
            Note that a single auction does NOT correspond to a timestep.
        state: array-like of shape (7, )
            Statistical feedback of the auctions during the timestep, including the following values.

            - timestep
            - remaining budget
            - impression level features at the previous timestep
              (budget consumption rate, cost per mille of impressions, auction winning rate, and reward)
            - adjust rate (i.e., RL agent action) at the previous timestep

        action: {int, float, array-like of shape (1, )} (>= 0)
            Adjust rate parameter used for the bid price calculation as follows.
            Note that the following bid price is individually determined for each auction.

            :math:`{bid price}_{t, i} = {adjust rate}_{t} \\times {predicted reward}_{t,i} ( \\times {const.})`

            Both discrete and continuous actions are acceptable.

        reward: int (> 0)
            Total clicks/conversions gained during the timestep.

        discount_rate: int
            Discount factor for cumulative reward calculation.
            Set discount_rate = 1 (i.e., no discount) in RTB.

        constraint: int (> 0)
            Total cost should not exceed the initial budget.
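
    A customized environment exposes the above specification at runtime. A short sketch
    (using only attributes and properties defined by this class; constructor arguments follow the defaults):

    .. code-block:: python

        from rtbgym import CustomizedRTBEnv
        from rtbgym.env import RTBEnv

        # inspect the CMDP specification of a customized environment
        env = CustomizedRTBEnv(original_env=RTBEnv(random_state=12345))
        print(env.step_per_episode)   # number of timesteps per episode
        print(env.observation_space)  # Box of shape (7, ) with the bounds described above
        print(env.action_space)       # Discrete(10) under the default settings
        print(env.obs_keys)           # names of the seven state components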

    Parameters
    -------
    original_env: RTBEnv
        Original RTB environment.

    reward_predictor: BaseEstimator, default=None
        A machine learning model used to predict the reward and thereby determine the bid price.
        If `None`, the ground-truth (expected) reward is used instead of the predicted one.

    scaler: {int, float}, default=None (> 0)
        Scaling factor (constant value) used for bid price determination.
        If `None`, the scaler is automatically fitted by `bidder.auto_fit_scaler()`.

    action_min: float, default=0.1 (> 0)
        Minimum value of the action.

    action_max: float, default=10.0 (> 0)
        Maximum value of the action.

    action_type: {"discrete", "continuous"}, default="discrete"
        Type of the action space.

    n_actions: int, default=10 (> 0)
        Number of actions.
        Used only when `action_type == "discrete"`.

    action_meaning: ndarray of shape (n_actions, ), default=None
        Mapping from each discrete action index to a specific adjust rate value.
        Used only when `action_type == "discrete"`.
        If `None`, the values are set to log-uniformly spaced points within `[action_min, action_max]`,
        i.e., `np.logspace(np.log10(action_min), np.log10(action_max), n_actions)` (see the sketch below).
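
    For example, the default mapping looks as follows (a minimal sketch mirroring the constructor logic;
    the variable name `action_meaning` is chosen for illustration):

    .. code-block:: python

        import numpy as np

        # default mapping with action_min=0.1, action_max=10.0, n_actions=10
        action_meaning = np.logspace(np.log10(0.1), np.log10(10.0), 10)
        # log-uniformly spaced adjust rates from 0.1 to 10.0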

    Examples
    -------

    Setup:

    .. code-block:: python

        # import necessary modules from rtbgym
        from rtbgym import CustomizedRTBEnv
        from rtbgym.env import RTBEnv
        from rtbgym.policy import OnlineHead
        from rtbgym.ope.online import calc_on_policy_policy_value

        # import necessary modules from other libraries
        from sklearn.linear_model import LogisticRegression
        from d3rlpy.algos import DiscreteRandomPolicy

        # initialize and customize environment
        env = RTBEnv(random_state=12345)
        env = CustomizedRTBEnv(
            original_env=env,
            reward_predictor=LogisticRegression(),
            action_type="discrete",
        )

        # define (RL) agent (i.e., policy)
        agent = OnlineHead(DiscreteRandomPolicy())
        agent.build_with_env(env)

    Interaction:

    .. code-block:: python

        # OpenAI Gym and Gymnasium-like interaction with the agent
        for episode in range(1000):
            obs, info = env.reset()
            done = False
            while not done:
                action = agent.predict_online(obs)
                obs, reward, done, truncated, info = env.step(action)

    Online Evaluation:

    .. code-block:: python

        # calculate on-policy policy value
        on_policy_performance = calc_on_policy_policy_value(
            env,
            agent,
            n_trajectories=100,
            random_state=12345,
        )

    Output:

    .. code-block:: python

        >>> on_policy_performance
        11.75
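
    A continuous action space can be configured analogously (a sketch; `RTBEnv` and `CustomizedRTBEnv`
    are imported as in the Setup block above, and the continuous-control agent is left unspecified):

    .. code-block:: python

        # customize the environment with a continuous action space
        env = CustomizedRTBEnv(
            original_env=RTBEnv(random_state=12345),
            action_type="continuous",
            action_min=0.1,
            action_max=10.0,
        )
        # the agent then chooses an adjust rate within [0.1, 10.0] at each timestep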

    References
    -------
    Di Wu, Xiujun Chen, Xun Yang, Hao Wang, Qing Tan, Xiaoxun Zhang, Jian Xu, and Kun Gai.
    "Budget Constrained Bidding by Model-free Reinforcement Learning in Display Advertising." 2018.

    Jun Zhao, Guang Qiu, Ziyu Guan, Wei Zhao, and Xiaofei He.
    "Deep Reinforcement Learning for Sponsored Search Real-time Bidding." 2018.

    Greg Brockman, Vicki Cheung, Ludwig Pettersson, Jonas Schneider, John Schulman, Jie Tang, and Wojciech Zaremba.
    "OpenAI Gym." 2016.

    """

    def __init__(
        self,
        original_env: RTBEnv,
        reward_predictor: Optional[BaseEstimator] = None,
        scaler: Optional[Union[int, float]] = None,
        action_min: float = 0.1,
        action_max: float = 10.0,
        action_type: str = "discrete",  # "continuous"
        n_actions: int = 10,
        action_meaning: Optional[
            np.ndarray
        ] = None,  # maps categorical actions to adjust rate
    ):
        super().__init__()

        if not isinstance(original_env, RTBEnv):
            raise ValueError("original_env must be RTBEnv or a child class of RTBEnv")
        self.env = original_env

        check_scalar(action_min, name="action_min", target_type=(int, float), min_val=0)
        check_scalar(action_max, name="action_max", target_type=(int, float), min_val=0)
        if action_min >= action_max:
            raise ValueError("action_min must be smaller than action_max")

        if action_type not in ["discrete", "continuous"]:
            raise ValueError(
                f'action_type must be either "discrete" or "continuous", but {action_type} is given'
            )
        if action_type == "discrete":
            check_scalar(n_actions, name="n_actions", target_type=int, min_val=2)
            if action_meaning is None:
                action_meaning = np.logspace(
                    np.log10(action_min), np.log10(action_max), n_actions
                )
            check_array(
                action_meaning,
                name="action_meaning",
                expected_dim=1,
                min_val=action_min,
                max_val=action_max,
            )
            if action_meaning.shape[0] != n_actions:
                raise ValueError(
                    "Expected `action_meaning.shape[0] == n_actions`, but found False"
                )
            self.action_meaning = action_meaning

        # set reward predictor
        if reward_predictor is not None:
            self.env.bidder.custom_set_reward_predictor(
                reward_predictor=reward_predictor
            )
            self.env.bidder.fit_reward_predictor(
                step_per_episode=self.env.step_per_episode
            )

        # set scaler
        if scaler is None:
            self.env.bidder.auto_fit_scaler(step_per_episode=self.env.step_per_episode)
        else:
            self.env.bidder.custom_set_scaler(scaler)

        # define observation space
        self.observation_space = Box(
            low=np.array([0, 0, 0, 0, 0, 0, action_min]),
            high=np.array(
                [
                    self.env.step_per_episode,
                    self.env.initial_budget,
                    np.inf,
                    np.inf,
                    1,
                    np.inf,
                    action_max,
                ]
            ),
            dtype=float,
        )

        # define action space
        self.action_type = action_type
        self.n_actions = n_actions
        self.action_dim = 1
        self.action_meaning = action_meaning

        if self.action_type == "discrete":
            self.action_space = Discrete(n_actions)
        else:  # "continuous"
            self.action_space = Box(
                low=action_min, high=action_max, shape=(1,), dtype=float
            )

    @property
    def obs_keys(self):
        return self.env.obs_keys

    @property
    def reward_range(self):
        return self.env.reward_range

    @property
    def step_per_episode(self):
        return self.env.step_per_episode

    @property
    def initial_budget(self):
        return self.env.initial_budget

    def step(self, action: Action) -> Tuple[Any]:
        """Roll out the auctions that arise during the timestep and return feedback to the agent.

        Parameters
        -------
        action: {int, float, array-like of shape (1, )} (>= 0)
            RL agent action, which indicates the adjust rate parameter used for bid price determination.
            Both discrete and continuous actions are acceptable.

        Returns
        -------
        obs: ndarray of shape (7, )
            Statistical feedback of the auctions during the timestep.
            Corresponds to the RL state, which includes the following components.

            - timestep
            - remaining budget
            - impression level features at the previous timestep
              (budget consumption rate, cost per mille of impressions, auction winning rate, and reward)
            - adjust rate (i.e., agent action) at the previous timestep

        reward: int (>= 0)
            Total clicks/conversions gained during the timestep.

        done: bool
            Whether the episode has ended or not.

        truncated: bool
            Whether the episode is truncated (following the Gym/Gymnasium step API shown in the Examples).

        info: dict
            Additional feedback (total impressions, clicks, and conversions) that may be useful for the package users.
            These are unavailable to the RL agent.

        """
        if self.action_type == "discrete":
            if not (
                isinstance(action, (int, np.integer))
                and 0 <= action < self.action_space.n
            ):
                raise ValueError(
                    f"action must be an integer within [0, {self.action_space.n}), but {action} is given"
                )
        else:  # "continuous"
            if isinstance(action, Numeric):
                action = np.array([action])
            if not self.action_space.contains(action):
                raise ValueError(
                    f"action must be a float value within ({self.action_space.low}, {self.action_space.high})"
                )

        # map agent action into meaningful value
        action = (
            action if self.action_type == "continuous" else self.action_meaning[action]
        )
        return self.env.step(action)

    def reset(self, seed: Optional[int] = None) -> Tuple[np.ndarray, dict]:
        """Initialize the environment.

        Note
        -------
        The remaining budget is initialized to the initial budget of an episode.

        Parameters
        -------
        seed: Optional[int], default=None
            Random state.

        Returns
        -------
        obs: ndarray of shape (7, )
            Statistical feedback of the auctions during the timestep.
            Corresponds to the RL state, which includes the following components.

            - timestep
            - remaining budget
            - impression level features at the previous timestep
              (budget consumption rate, cost per mille of impressions, auction winning rate, and reward)
            - adjust rate (i.e., agent action) at the previous timestep

        info: (empty) dict
            Additional information that may be useful for the package users.
            This is unavailable to the RL agent.

        """
        return self.env.reset(seed)

    def render(self) -> None:
        self.env.render()

    def close(self) -> None:
        self.env.close()