net.py (forked from AI4Finance-Foundation/ElegantRL)
from typing import List, Tuple

import torch
import torch.nn as nn
from torch import Tensor
from torch.distributions.normal import Normal


class QNet(nn.Module):  # Q network for discrete actions (DQN-style)
    def __init__(self, dims: List[int], state_dim: int, action_dim: int):
        super().__init__()
        self.net = build_mlp(dims=[state_dim, *dims, action_dim])
        self.explore_rate = None  # epsilon for epsilon-greedy exploration; the agent must set it before get_action()
        self.action_dim = action_dim

    def forward(self, state: Tensor) -> Tensor:
        return self.net(state)  # Q values for each discrete action

    def get_action(self, state: Tensor) -> Tensor:  # return the indices of discrete actions for exploration
        if self.explore_rate < torch.rand(1):
            action = self.net(state).argmax(dim=1, keepdim=True)  # greedy action
        else:
            action = torch.randint(self.action_dim, size=(state.shape[0], 1))  # random action
        return action


class Actor(nn.Module):  # deterministic actor for DDPG/TD3-style agents
    def __init__(self, dims: List[int], state_dim: int, action_dim: int):
        super().__init__()
        self.net = build_mlp(dims=[state_dim, *dims, action_dim])
        self.explore_noise_std = None  # standard deviation of exploration action noise; the agent must set it before get_action()

    def forward(self, state: Tensor) -> Tensor:
        action = self.net(state)
        return action.tanh()  # squash the action into (-1.0, +1.0)

    def get_action(self, state: Tensor) -> Tensor:  # for exploration
        action_avg = self.net(state).tanh()
        dist = Normal(action_avg, self.explore_noise_std)
        action = dist.sample()
        return action.clip(-1.0, 1.0)


class Critic(nn.Module):  # Q(s, a) critic for DDPG/TD3-style agents
    def __init__(self, dims: List[int], state_dim: int, action_dim: int):
        super().__init__()
        self.net = build_mlp(dims=[state_dim + action_dim, *dims, 1])

    def forward(self, state: Tensor, action: Tensor) -> Tensor:
        return self.net(torch.cat((state, action), dim=1))  # Q value


class ActorPPO(nn.Module):  # stochastic (Gaussian) actor for PPO
    def __init__(self, dims: List[int], state_dim: int, action_dim: int):
        super().__init__()
        self.net = build_mlp(dims=[state_dim, *dims, action_dim])
        self.action_std_log = nn.Parameter(torch.zeros((1, action_dim)), requires_grad=True)  # trainable log std of the action distribution

    def forward(self, state: Tensor) -> Tensor:
        return self.net(state).tanh()  # deterministic action, squashed into (-1.0, +1.0)

    def get_action(self, state: Tensor) -> Tuple[Tensor, Tensor]:  # for exploration
        action_avg = self.net(state)
        action_std = self.action_std_log.exp()

        dist = Normal(action_avg, action_std)
        action = dist.sample()
        logprob = dist.log_prob(action).sum(1)
        return action, logprob

    def get_logprob_entropy(self, state: Tensor, action: Tensor) -> Tuple[Tensor, Tensor]:
        action_avg = self.net(state)
        action_std = self.action_std_log.exp()

        dist = Normal(action_avg, action_std)
        logprob = dist.log_prob(action).sum(1)
        entropy = dist.entropy().sum(1)
        return logprob, entropy

    @staticmethod
    def convert_action_for_env(action: Tensor) -> Tensor:
        return action.tanh()  # squash the raw sampled action before sending it to the environment


class CriticPPO(nn.Module):  # state-value critic V(s) for PPO
    def __init__(self, dims: List[int], state_dim: int, _action_dim: int):
        super().__init__()
        self.net = build_mlp(dims=[state_dim, *dims, 1])

    def forward(self, state: Tensor) -> Tensor:
        return self.net(state)  # state value V(s), used to estimate the advantage


def build_mlp(dims: List[int]) -> nn.Sequential:  # MLP (MultiLayer Perceptron)
    net_list = []
    for i in range(len(dims) - 1):
        net_list.extend([nn.Linear(dims[i], dims[i + 1]), nn.ReLU()])
    del net_list[-1]  # remove the activation function of the output layer
    return nn.Sequential(*net_list)
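

# --- Usage sketch (not part of the original ElegantRL file) ---
# A minimal shape check showing how these networks might be instantiated and called.
# The dims, state_dim, and action_dim values below are illustrative assumptions.
if __name__ == "__main__":
    state_dim, action_dim = 4, 2
    state = torch.rand(3, state_dim)  # a batch of 3 states

    q_net = QNet(dims=[64, 64], state_dim=state_dim, action_dim=action_dim)
    q_net.explore_rate = 0.25  # epsilon must be set before calling get_action()
    print(q_net(state).shape)             # torch.Size([3, 2]) -> Q values
    print(q_net.get_action(state).shape)  # torch.Size([3, 1]) -> action indices

    actor = ActorPPO(dims=[64, 64], state_dim=state_dim, action_dim=action_dim)
    critic = CriticPPO(dims=[64, 64], state_dim=state_dim, _action_dim=action_dim)
    action, logprob = actor.get_action(state)
    print(action.shape, logprob.shape)    # torch.Size([3, 2]) torch.Size([3])
    print(critic(state).shape)            # torch.Size([3, 1]) -> state values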