-
Notifications
You must be signed in to change notification settings - Fork 153
/
main.py
360 lines (292 loc) · 14.8 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
##############################################################################################################
# 这部分代码主要是引入所需要的库
#
#
##############################################################################################################
import os
import numpy as np
from pathlib import Path
from data_prep_bbh import *
from utils import *
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch import nn
##############################################################################################################
# 这部分代码主要是定义数据生成器
#
#
##############################################################################################################
class DatasetGenerator(Dataset):
    """Map-style dataset holding pre-generated simulated strains and labels.

    All samples are produced up front by ``sim_data`` (a name expected to be
    provided by one of this file's star imports) and kept in memory.
    Calling :meth:`generate` again replaces the stored samples in place.
    """

    def __init__(self, fs=8192, T=1, snr=20,
                 detectors=None,
                 nsample_perepoch=100,
                 Nnoise=25, mdist='metric', beta=None,
                 verbose=True):
        """Store the simulation settings and pre-generate the first samples.

        Args:
            fs: Sampling frequency (Hz).
            T: Observation duration (sec). It is multiplied by a safety
                factor of 2 before being stored in ``self.T``.
            snr: Signal-to-noise ratio forwarded to ``sim_data``.
            detectors: Detector names; ``None`` means ``['H1', 'L1']``.
            nsample_perepoch: Sample count forwarded to ``sim_data`` as ``size``.
            Nnoise: Number of noise realisations per signal.
            mdist: Mass distribution (astro, gh, metric).
            beta: Forwarded to ``sim_data``; ``None`` means ``[0.75, 0.95]``.
            verbose: When true, print whether CUDA is available.
        """
        if verbose:
            print('GPU available?', torch.cuda.is_available())
        self.fs = fs  # the sampling frequency (Hz)
        safe = 2  # safety multiplier applied to the requested duration
        self.T = T * safe  # the (scaled) observation duration (sec)
        # A fresh list per instance: a list literal in the signature would be
        # shared (and mutable) across every call.
        self.detectors = ['H1', 'L1'] if detectors is None else detectors
        self.snr = snr
        self.generate(nsample_perepoch, Nnoise, mdist, beta)  # pre-generate samples

    def generate(self, Nblock, Nnoise=25, mdist='metric', beta=None):
        """Simulate a new set of samples and replace the stored ones.

        Args:
            Nblock: Number of training samples to request (``size`` of ``sim_data``).
            Nnoise: Number of noise realisations per signal.
            mdist: Mass distribution (astro, gh, metric).
            beta: Forwarded to ``sim_data``; ``None`` means ``[0.75, 0.95]``.

        NOTE(review): the defaults here are independent of the values given
        to ``__init__``; a caller that only passes ``Nblock`` regenerates with
        these defaults, not with the constructor's settings.
        """
        if beta is None:
            beta = [0.75, 0.95]
        # The second return value (presumably the signal parameters) is unused.
        ts, _params = sim_data(self.fs, self.T, self.snr, self.detectors, Nnoise,
                               size=Nblock, mdist=mdist, beta=beta, verbose=False)
        # Insert a singleton axis at position 1. Per the original author's
        # note the result is (nsample, 1, len(det), fs*T) -- TODO confirm.
        self.strains = np.expand_dims(ts[0], 1)
        self.labels = ts[1]

    def __len__(self):
        """Return the number of stored strain samples."""
        return len(self.strains)

    def __getitem__(self, idx):
        """Return the ``(strain, label)`` pair stored at ``idx``."""
        return self.strains[idx], self.labels[idx]
##############################################################################################################
# 这部分代码主要是定义网络结构,以及加载和保存模型的函数
#
#
##############################################################################################################
# 在模型定义中,我们定义了一个卷积神经网络,包含了多个卷积层、激活函数、批量归一化层和最大池化层。
# 最后,我们添加了一个Flatten层和两个全连接层。
class MyNet(nn.Module):
    """CNN classifier: eight convolution stages followed by a two-layer head.

    Each stage is Conv2d -> ELU(0.01) -> BatchNorm2d; stages 0, 4 and 7 are
    additionally followed by a MaxPool2d. The head is
    Flatten -> Linear(64) -> ELU(0.01) -> Dropout(0.5) -> Linear(2).

    All kernels and pooling windows have height 1, so only the last (width)
    axis of the input is reduced. Input is expected as
    ``(batch, 1, n_detectors, n_samples)``.
    """

    def __init__(self, n_detectors=2, n_samples=16384):
        """Build the layer stack.

        Args:
            n_detectors: Size of the input's height axis.
            n_samples: Size of the input's width axis.

        The defaults reproduce the previously hard-coded flattened size of
        20224 features (128 channels x 2 x 79).

        Raises:
            ValueError: If the input is too small for the conv/pool stack.
        """
        super().__init__()
        n_filters = [8, 16, 16, 32, 64, 64, 128, 128]
        kernel_sizes = [(1, 32)] + [(1, 16)] * 3 + [(1, 8)] * 2 + [(1, 4)] * 2
        # Pooling window (also used as stride) after each stage; None = no pooling.
        pool_sizes = [(1, 8), None, None, None, (1, 6), None, None, (1, 4)]

        self.layers = nn.ModuleList()
        in_channels = 1
        height, width = n_detectors, n_samples
        for out_channels, kernel, pool in zip(n_filters, kernel_sizes, pool_sizes):
            self.layers.append(nn.Conv2d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=kernel,
                stride=(1, 1),
                padding=0,
                dilation=(1, 1),
                groups=1,
                bias=True,
                padding_mode='zeros',
            ))
            self.layers.append(nn.ELU(0.01))
            self.layers.append(nn.BatchNorm2d(num_features=out_channels))
            # Track the feature-map size so the head can be sized to match.
            height = self._out_len(height, kernel[0], 1)
            width = self._out_len(width, kernel[1], 1)
            if pool is not None:
                self.layers.append(nn.MaxPool2d(
                    kernel_size=pool,
                    stride=pool,
                    padding=0,
                ))
                height = self._out_len(height, pool[0], pool[0])
                width = self._out_len(width, pool[1], pool[1])
            if height < 1 or width < 1:
                raise ValueError(
                    f'input of size ({n_detectors}, {n_samples}) is too small '
                    'for the convolution/pooling stack')
            in_channels = out_channels

        flat_features = in_channels * height * width
        self.layers.append(nn.Flatten())
        self.layers.append(nn.Linear(flat_features, 64))
        self.layers.append(nn.ELU(0.01))
        self.layers.append(nn.Dropout(0.5))
        self.layers.append(nn.Linear(64, 2))

    @staticmethod
    def _out_len(length, window, step):
        """Output length of an unpadded, undilated sliding window."""
        return (length - window) // step + 1

    def forward(self, x):
        """Apply every layer in order and return the two-class scores."""
        for layer in self.layers:
            x = layer(x)
        return x
# 在模型保存和加载函数中,我们保存了模型的参数、优化器的状态、学习率调度器的状态和训练的epoch。
# 在加载模型时,我们加载了模型的参数,并返回了模型、训练的epoch和训练损失历史。
def load_model(checkpoint_dir=None):
    """Create a ``MyNet`` and restore it from ``checkpoint_dir`` when possible.

    A checkpoint is restored only when the directory exists and contains
    exactly one ``*.pt`` file. In every other case (no directory, no
    checkpoint, or several candidates) a freshly initialised network is
    returned instead of failing.

    Args:
        checkpoint_dir: Directory holding the ``*.pt`` checkpoint and
            ``train_loss_history_cnn.npy``; ``None`` skips loading.

    Returns:
        ``(net, epoch, train_loss_history)``. For a fresh network this is
        ``(net, 0, [])``. The network is left on the CPU.
    """
    net = MyNet()
    if checkpoint_dir is not None:
        ckpt_dir = Path(checkpoint_dir)
        if ckpt_dir.is_dir():
            # Match on the real suffix; a substring test would also pick up
            # names such as "notes.ptx".
            candidates = sorted(
                f for f in ckpt_dir.iterdir()
                if f.is_file() and f.suffix == '.pt'
            )
            if len(candidates) == 1:
                ckpt_file = candidates[0]
                # map_location lets a checkpoint written on a GPU be restored
                # on a CPU-only machine; the caller moves the net afterwards.
                checkpoint = torch.load(ckpt_file, map_location='cpu')
                net.load_state_dict(checkpoint['model_state_dict'])
                print('Load network from', ckpt_file)
                history_file = ckpt_dir / 'train_loss_history_cnn.npy'
                if history_file.is_file():
                    train_loss_history = np.load(history_file).tolist()
                else:
                    train_loss_history = []
                return net, checkpoint['epoch'], train_loss_history
            if len(candidates) > 1:
                print(f'Found {len(candidates)} *.pt files in {ckpt_dir}; '
                      'expected exactly one, so none is loaded.')
    print('Init network!')
    return net, 0, []
def save_model(epoch, model, optimizer, scheduler, checkpoint_dir, train_loss_history, filename):
    """Save the model, optimizer and loss history to ``checkpoint_dir``.

    The directory ends up with a single ``*.pt`` checkpoint. The new file is
    written under a temporary name and renamed into place *before* older
    checkpoints are removed, so a failed save never leaves the directory
    without a usable checkpoint.

    Args:
        epoch: Epoch number stored in the checkpoint under ``'epoch'``.
        model: Object whose ``state_dict()`` is stored as ``'model_state_dict'``.
        optimizer: Object whose ``state_dict()`` is stored as
            ``'optimizer_state_dict'``.
        scheduler: Optional; when not ``None`` its ``state_dict()`` is stored
            as ``'scheduler_state_dict'``.
        checkpoint_dir: Output directory (created if missing).
        train_loss_history: Saved with ``np.save`` as
            ``train_loss_history_cnn.npy``.
        filename: Checkpoint file name; must end in ``.pt``.

    Raises:
        ValueError: If ``filename`` does not end in ``.pt``.
    """
    # An explicit raise (not ``assert``) so the check survives ``python -O``.
    if Path(filename).suffix != '.pt':
        raise ValueError(f"checkpoint filename must end in '.pt': {filename!r}")

    ckpt_dir = Path(checkpoint_dir)
    ckpt_dir.mkdir(parents=True, exist_ok=True)
    target = ckpt_dir / filename

    # Save loss history (np.save appends the ".npy" extension).
    np.save(ckpt_dir / 'train_loss_history_cnn', train_loss_history)

    output = {
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'epoch': epoch,
    }
    if scheduler is not None:
        output['scheduler_state_dict'] = scheduler.state_dict()

    # Write to a temporary name first; the ".tmp" suffix keeps a partial
    # file from being mistaken for a checkpoint.
    tmp_file = target.with_suffix('.tmp')
    try:
        torch.save(output, tmp_file)
        os.replace(tmp_file, target)
    finally:
        if tmp_file.exists():
            tmp_file.unlink()

    # Only now drop the older checkpoints.
    for stale in ckpt_dir.iterdir():
        if stale.is_file() and stale.suffix == '.pt' and stale.name != target.name:
            stale.unlink()
##############################################################################################################
# 这部分代码主要是定义训练和评估函数
#
#
##############################################################################################################
# Thin function-style aliases for tensor methods. Each one forwards any extra
# positional and keyword arguments to the underlying method unchanged.
def numpy(x, *args, **kwargs):
    """Detach ``x`` and convert it with ``.numpy()``."""
    return x.detach().numpy(*args, **kwargs)


def size(x, *args, **kwargs):
    """Return ``x.numel()``."""
    return x.numel(*args, **kwargs)


def reshape(x, *args, **kwargs):
    """Return ``x.reshape(...)``."""
    return x.reshape(*args, **kwargs)


def to(x, *args, **kwargs):
    """Return ``x.to(...)``."""
    return x.to(*args, **kwargs)


def reduce_sum(x, *args, **kwargs):
    """Return ``x.sum(...)``."""
    return x.sum(*args, **kwargs)


def argmax(x, *args, **kwargs):
    """Return ``x.argmax(...)``."""
    return x.argmax(*args, **kwargs)


def astype(x, *args, **kwargs):
    """Return ``x.type(...)``."""
    return x.type(*args, **kwargs)


def transpose(x, *args, **kwargs):
    """Return ``x.t()``."""
    return x.t(*args, **kwargs)
def accuracy(y_hat, y):
    """Return the number of correct predictions as a float.

    When ``y_hat`` has more than one dimension and more than one column it is
    treated as per-class scores and reduced with ``argmax`` along dim 1;
    otherwise it is compared with ``y`` as is.
    """
    has_class_scores = len(y_hat.shape) > 1 and y_hat.shape[1] > 1
    predicted = y_hat.argmax(dim=1) if has_class_scores else y_hat
    # Cast predictions to the label dtype before comparing.
    hits = predicted.type(y.dtype) == y
    return float(hits.type(y.dtype).sum())
def evaluate_accuracy_gpu(net, data_iter, loss_func, device=None):
    """Evaluate ``net`` on ``data_iter`` and return ``(accuracy, loss)``.

    Args:
        net: The model. An ``nn.Module`` is switched to eval mode first.
        data_iter: Iterable yielding ``(X, y)`` batches.
        loss_func: Callable applied as ``loss_func(net(X), y)``.
        device: Target device; when falsy and ``net`` is an ``nn.Module``,
            the device of the net's first parameter is used.

    Returns:
        A tuple ``(correct / n, loss_total / n)`` where ``n`` is the total
        number of label elements seen.

    NOTE(review): ``loss_total`` accumulates ``loss.sum()`` once per batch.
    If ``loss_func`` already averages over the batch, the returned loss is a
    sum of batch means divided by the sample count -- confirm this scaling
    is intended.
    """
    if isinstance(net, nn.Module):
        net.eval()  # evaluation mode
        if not device:
            first_param = next(iter(net.parameters()))
            device = first_param.device
    # Slots: correct predictions, number of predictions, loss total.
    totals = Accumulator(3)
    with torch.no_grad():
        for features, targets in data_iter:
            features = features.to(device).to(torch.float)
            targets = targets.to(device).to(torch.long)
            outputs = net(features)
            batch_loss = loss_func(outputs, targets)
            totals.add(accuracy(outputs, targets), targets.numel(), batch_loss.sum())
    n_seen = totals[1]
    return totals[0] / n_seen, totals[2] / n_seen
# 在训练函数中,我们首先定义了损失函数、优化器和学习率调度器。
# 然后,我们开始训练循环,每个epoch我们都会生成新的训练样本,
# 然后通过网络进行前向传播,计算损失,然后进行反向传播和参数更新。
# 在每个epoch结束后,我们会在测试集上评估模型,并保存测试损失最小的模型。
def train(net, lr, nsample_perepoch, epoch, total_epochs,
          dataset_train, data_loader, test_iter,
          train_loss_history, checkpoint_dir, device, notebook=True):
    """Train ``net`` and checkpoint it whenever the test loss is the best so far.

    Every epoch regenerates the training samples, runs one pass over
    ``data_loader``, steps the cosine learning-rate schedule, evaluates on
    ``test_iter`` and appends a row to ``train_loss_history``.

    Args:
        net: Model to train (already on ``device``).
        lr: Initial learning rate for Adam.
        nsample_perepoch: Passed to ``dataset_train.generate`` each epoch.
        epoch: Index of the first epoch to run.
        total_epochs: Number of epochs to run; also ``T_max`` of the
            scheduler and the upper x-limit of the notebook plot.
        dataset_train: Training dataset exposing ``generate``.
        data_loader: Loader over ``dataset_train``.
        test_iter: Loader over the test data.
        train_loss_history: List extended in place with
            ``[epoch + 1, train_loss, test_loss, train_acc, test_acc]`` rows.
        checkpoint_dir: Directory handed to ``save_model``.
        device: Device batches are moved to.
        notebook: When true, draw progress with ``Animator``; otherwise print.

    NOTE(review): the running loss accumulates ``loss.sum()`` per batch and
    divides by the sample count. With the mean-reduced ``CrossEntropyLoss``
    used here that is a sum of batch means over the number of samples. The
    scaling is kept as is because it matches ``evaluate_accuracy_gpu`` and
    previously saved histories.
    """
    loss_func = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer,
        T_max=total_epochs,
    )
    torch.cuda.empty_cache()
    animator = None
    if notebook:
        animator = Animator(xlabel='epoch', xlim=[1, total_epochs],
                            legend=['train loss', 'test loss', 'train acc', 'test acc'])
    # NOTE(review): the timer brackets only the forward pass and the metric
    # update, so the throughput printed at the end excludes backward/step.
    timer = Timer()

    for epoch in range(epoch, epoch + total_epochs):
        # Fresh training samples for this epoch.
        dataset_train.generate(nsample_perepoch)
        # Count batches (not samples), and only after regenerating, since the
        # dataset size may have changed.
        num_batches = len(data_loader)
        # Plot roughly five times per epoch; max() avoids a modulo by zero
        # when there are fewer than five batches.
        plot_every = max(1, num_batches // 5)

        if not notebook:
            print('Learning rate: {}'.format(
                optimizer.state_dict()['param_groups'][0]['lr']))

        # Slots: loss total, correct predictions, number of samples.
        metric = Accumulator(3)
        net.train()
        for batch_idx, (x, y) in enumerate(data_loader):
            timer.start()
            optimizer.zero_grad()
            data = x.to(device, non_blocking=True).to(torch.float)
            label = y.to(device, non_blocking=True).to(torch.long)
            pred = net(data)
            loss = loss_func(pred, label)
            with torch.no_grad():
                metric.add(loss.sum(), accuracy(pred, label), x.shape[0])
            timer.stop()
            loss.backward()
            optimizer.step()

            train_l = metric[0] / metric[2]
            train_acc = metric[1] / metric[2]
            # Parenthesised so the animator is only touched in notebook mode
            # ("a and b or c" parses as "(a and b) or c").
            is_plot_step = ((batch_idx + 1) % plot_every == 0
                            or batch_idx == num_batches - 1)
            if notebook and is_plot_step:
                animator.add(epoch + (batch_idx + 1) / num_batches,
                             (train_l, None, train_acc, None))

        scheduler.step()
        test_acc, test_l = evaluate_accuracy_gpu(net, test_iter, loss_func, device)
        train_loss_history.append([epoch + 1, train_l, test_l, train_acc, test_acc])

        if notebook:
            animator.add(epoch + 1, (train_l, test_l, train_acc, test_acc))
        else:
            print(f'Epoch: {epoch+1} \t'
                  f'Train Loss: {train_l:.4f} Test Loss: {test_l:.4f} \t'
                  f'Train Acc: {train_acc} Test Acc: {test_acc}')

        # Keep the model with the lowest test loss. Column 2 of each history
        # row is the test loss (column 1 is the train loss).
        best_test_l = min(row[2] for row in train_loss_history)
        if test_l <= best_test_l:
            # Store the number of completed epochs so that resuming from this
            # checkpoint starts at the next epoch instead of repeating one.
            save_model(epoch + 1, net, optimizer, scheduler,
                       checkpoint_dir=checkpoint_dir,
                       train_loss_history=train_loss_history,
                       filename=f'model_e{epoch + 1}.pt')

    print(f'loss {train_l:.4f}, train acc {train_acc:.3f}, '
          f'test acc {test_acc:.3f}')
    print(f'{metric[2] * total_epochs / timer.sum():.1f} examples/sec '
          f'on {str(device)}')
if __name__ == "__main__":
    # Script entry point: build the datasets and loaders, restore or create
    # the network, then train it.
    nsample_perepoch = 100  # samples generated per epoch
    dataset_train = DatasetGenerator(snr=20, nsample_perepoch=nsample_perepoch)
    dataset_test = DatasetGenerator(snr=20, nsample_perepoch=nsample_perepoch)

    data_loader = DataLoader(dataset_train, batch_size=32, shuffle=True)
    test_iter = DataLoader(dataset_test, batch_size=32, shuffle=True)

    # Use the GPU when present, otherwise fall back to the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Output directory for the checkpoint and the loss history.
    checkpoint_dir = './checkpoints_cnn1/'

    # Restore a previous checkpoint if one exists, else start fresh.
    net, epoch, train_loss_history = load_model(checkpoint_dir)
    net.to(device)

    lr = 0.003  # learning rate
    # Number of epochs to run in this invocation. train() already iterates
    # over range(epoch, epoch + total_epochs), so the start epoch must not
    # be added here as well.
    total_epochs = 100
    output_freq = 1  # output frequency (currently unused)

    train(net, lr, nsample_perepoch, epoch, total_epochs,
          dataset_train, data_loader, test_iter,
          train_loss_history, checkpoint_dir, device, notebook=False)