"""
@File :DDPG.py
@Author :JohsuaWu1997
@Date :2020/1/30
"""
import torch
from actor_critic import Actor, Critic
from ou_noise import OUNoise
cuda = torch.device('cuda')
GAMMA = 0.9999999993340943687843739933894
def min_max_scale(data):
    """Scale each column of a 2-D tensor to [0, 1] independently."""
    data_min = torch.min(data, 0).values.view(1, -1)
    data_max = torch.max(data, 0).values.view(1, -1)
    # For constant columns, zero the max so the denominator becomes -min
    # (avoids dividing by a zero range unless the column is all zeros).
    data_max[data_max - data_min == 0] = 0
    return (data - data_min) / (data_max - data_min)
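
# Example (a quick sketch; assumes a CUDA device is available):
#
#   x = torch.tensor([[1., 10.],
#                     [3., 20.],
#                     [5., 30.]], device=cuda)
#   min_max_scale(x)
#   # -> tensor([[0.0, 0.0],
#   #            [0.5, 0.5],
#   #            [1.0, 1.0]], device='cuda:0')   (up to print formatting)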


class DDPG:
    """DDPG agent: actor-critic networks, OU exploration noise, and a sliding replay buffer."""

    def __init__(self, env, time_steps, hidden_dim):
        self.name = 'DDPG'  # name for uploading results
        self.scale = env.asset
        self.unit = env.unit
        self.seed = env.rd_seed
        self.time_dim = time_steps
        self.state_dim = env.observation_space.shape[1]
        self.action_dim = env.action_space.shape[0]
        self.batch_size = 64
        self.memory_size = self.time_dim + self.batch_size * 10
        self.start_size = self.time_dim + self.batch_size * 2

        # Initialise actor & critic networks
        self.actor_network = Actor(self.time_dim, self.state_dim, self.action_dim, hidden_dim)
        self.critic_network = Critic(self.time_dim, self.state_dim, self.action_dim, hidden_dim)

        # Initialize replay buffer
        self.replay_state = torch.zeros((self.start_size - 1, 3, self.state_dim), device=cuda)
        self.replay_next_state = torch.zeros((self.start_size - 1, 3, self.state_dim), device=cuda)
        self.replay_action = torch.zeros((self.start_size - 1, 1, self.state_dim), device=cuda)
        self.replay_reward = torch.zeros((self.start_size - 1,), device=cuda)

        # Initialize a random process (Ornstein-Uhlenbeck) for action exploration
        self.exploration_noise = OUNoise(self.action_dim, sigma=0.01 / self.action_dim)

        self.initial()

    def initial(self):
        self.steps = 0
        self.action = torch.zeros(self.action_dim, device=cuda)
        # Reset the replay buffer (state, next_state, action, reward)
        self.replay_state = torch.zeros((self.start_size - 1, 3, self.state_dim), device=cuda)
        self.replay_next_state = torch.zeros((self.start_size - 1, 3, self.state_dim), device=cuda)
        self.replay_action = torch.zeros((self.start_size - 1, self.state_dim), device=cuda)
        self.replay_reward = torch.zeros((self.start_size - 1,), device=cuda)

    def train_on_batch(self):
        # Sample a random minibatch of N transitions from the replay buffer
        sample = torch.randint(self.time_dim, self.replay_reward.shape[0], [self.batch_size], device=cuda)
        # For each sampled step, gather the preceding `time_dim` rows as its history window
        index = torch.stack([sample - i for i in range(self.time_dim, 0, -1)]).t().reshape(-1)
        state_data = min_max_scale(self.replay_state[:, 0, :])
        amount_data = min_max_scale(self.replay_state[:, 2, :])
        next_state_data = min_max_scale(self.replay_next_state[:, 0, :])
        next_amount_data = min_max_scale(self.replay_next_state[:, 2, :])
        state_batch = torch.index_select(state_data, 0, index).view(self.batch_size, -1)
        amount_data = torch.index_select(amount_data, 0, sample).view(self.batch_size, -1)
        state_batch = torch.cat([state_batch, amount_data], dim=1)
        next_state_batch = torch.index_select(next_state_data, 0, index).view(self.batch_size, -1)
        next_amount_data = torch.index_select(next_amount_data, 0, sample).view(self.batch_size, -1)
        next_state_batch = torch.cat([next_state_batch, next_amount_data], dim=1)
        action_batch = torch.index_select(self.replay_action / self.unit, 0, sample)
        reward_batch = torch.index_select(self.replay_reward, 0, sample)

        # Calculate y_batch: y = r + GAMMA * Q'(s', mu'(s'))
        next_action_batch = self.actor_network.target_action(next_state_batch)
        q_batch = self.critic_network.target_q(next_action_batch, next_state_batch)
        y_batch = torch.add(reward_batch, q_batch, alpha=GAMMA).view(-1, 1)

        # Train the actor and critic on the target loss
        self.actor_network.train(
            self.critic_network.train(
                y_batch, action_batch, state_batch
            )
        )

        # Update target networks by soft update
        self.actor_network.update_target()
        self.critic_network.update_target()

    def perceive(self, state, action, reward, next_state, done):
        if self.steps < self.start_size - 1:
            # Fill the pre-allocated warm-up buffer in place
            self.replay_state[self.steps] = state
            self.replay_next_state[self.steps] = next_state
            self.replay_action[self.steps] = action
            self.replay_reward[self.steps] = reward
        else:
            # Once warmed up, keep a sliding window of at most `memory_size` transitions
            if self.steps >= self.memory_size:
                self.replay_state = self.replay_state[1:]
                self.replay_next_state = self.replay_next_state[1:]
                self.replay_action = self.replay_action[1:]
                self.replay_reward = self.replay_reward[1:]
            self.replay_state = torch.cat((self.replay_state, state.unsqueeze(0)), dim=0)
            self.replay_next_state = torch.cat((self.replay_next_state, next_state.unsqueeze(0)), dim=0)
            self.replay_action = torch.cat((self.replay_action, action.unsqueeze(0)), dim=0)
            self.replay_reward = torch.cat((self.replay_reward, reward.unsqueeze(0)), dim=0)
        self.steps += 1

    def act(self, next_state, portfolio):
        if self.steps > self.start_size:
            # Build the actor input: scaled price-history window plus the latest holdings row
            next_state_data = min_max_scale(self.replay_next_state[:, 0, :])[-self.time_dim:].view(1, -1)
            next_amount_data = min_max_scale(self.replay_next_state[:, 2, :])[-1].view(1, -1)
            next_state_data = torch.cat([next_state_data, next_amount_data], dim=1)
            self.train_on_batch()
            # Query the target actor and add exploration noise
            allocation = self.actor_network.target_action(next_state_data).data.view(-1)
            allocation += torch.tensor(self.exploration_noise.noise().tolist(), device=cuda)
            # Clip negatives, renormalise to portfolio weights, then round down to whole trading units
            allocation[allocation < 0] = 0
            allocation /= sum(allocation)
            allocation = torch.floor(
                portfolio * allocation / next_state[1, :] / self.unit
            ) * self.unit
            self.action = allocation
        return self.action.clone()
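

# A minimal driver sketch for illustration only. `DummyEnv` is a hypothetical
# stand-in that merely mimics the attributes this file reads from `env`
# (`asset`, `unit`, `rd_seed`, `observation_space`, `action_space`); the real
# trading environment in this project may differ, and the loop assumes the
# Actor/Critic networks accept the resulting input dimensions and that a CUDA
# device is available.
if __name__ == '__main__':
    class DummyEnv:
        def __init__(self, n_assets=4):
            self.asset = 10000.0
            self.unit = 1
            self.rd_seed = 0
            self.observation_space = torch.zeros((3, n_assets))
            self.action_space = torch.zeros(n_assets)

    env = DummyEnv()
    agent = DDPG(env, time_steps=10, hidden_dim=32)
    state = torch.rand((3, env.observation_space.shape[1]), device=cuda)
    for _ in range(200):
        # Choose an allocation, simulate a random next observation and reward,
        # and store the transition; training kicks in once the buffer is warm.
        action = agent.act(state, env.asset)
        next_state = torch.rand((3, env.observation_space.shape[1]), device=cuda)
        reward = torch.rand((), device=cuda)
        agent.perceive(state, action, reward, next_state, False)
        state = next_state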