# adapted from https://github.com/ballasn/LeViRe/blob/master/blocks/bricks/conv3d.py
from theano.sandbox.cuda.dnn import dnn_conv3d, dnn_pool
from theano.sandbox.cuda.blas import GpuCorr3dMM
from blocks.bricks import Initializable, Feedforward, Sequence
from blocks.bricks.base import application, Brick, lazy
from blocks.roles import add_role, FILTER, BIAS
from blocks.utils import shared_floatx_nans


class Convolutional(Initializable):
    """Performs a 3D convolution.

    Parameters
    ----------
    filter_size : tuple
        The duration, height and width of the filters (also called
        *kernels*).
    num_filters : int
        Number of filters per channel.
    num_channels : int
        Number of input channels in the video.
    batch_size : int, optional
        Number of examples per batch. If given, this will be passed to
        the Theano convolution operator, possibly resulting in faster
        execution.
    image_size : tuple, optional
        The duration, height and width of the input (video or feature
        map). If given, this will be passed to the Theano convolution
        operator, possibly resulting in faster execution.
    step : tuple, optional
        The step (or stride) with which to slide the filters over the
        image. Defaults to (1, 1, 1).
    border_mode : {'valid', 'full'} or tuple, optional
        The border mode to use; a tuple is interpreted as explicit
        padding. Defaults to 'valid'.
    cudnn_impl : bool, optional
        If ``True``, use cuDNN's :func:`dnn_conv3d`; otherwise use
        :class:`GpuCorr3dMM`. Defaults to ``False``.
    tied_biases : bool
        If ``True``, it indicates that the biases of every filter in this
        layer should be shared amongst all applications of that filter.
        Setting this to ``False`` will untie the biases, yielding a
        separate bias for every location at which the filter is applied.
        Defaults to ``False``.
    """
    @lazy(allocation=['filter_size', 'num_filters', 'num_channels'])
    def __init__(self, filter_size, num_filters, num_channels,
                 batch_size=None, image_size=(None, None, None),
                 step=(1, 1, 1), border_mode='valid', cudnn_impl=False,
                 tied_biases=False, **kwargs):
        super(Convolutional, self).__init__(**kwargs)
        self.filter_size = filter_size
        self.num_filters = num_filters
        self.batch_size = batch_size
        self.num_channels = num_channels
        self.image_size = image_size
        self.step = step
        self.border_mode = border_mode
        self.tied_biases = tied_biases
        self.cudnn_impl = cudnn_impl

    @property
    def padding(self):
        if self.border_mode == "valid":
            return (0, 0, 0)
        elif self.border_mode == "full":
            # a full convolution pads each axis by filter_size - 1
            return tuple(s - 1 for s in self.filter_size)
        else:
            return tuple(self.border_mode)
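    # For example, with filter_size=(3, 5, 5) the padding is (0, 0, 0)
    # for "valid", (2, 4, 4) (i.e. filter_size - 1 per axis) for "full",
    # and an explicit tuple such as (1, 2, 2) is passed through unchanged.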

    def _allocate(self):
        W = shared_floatx_nans((self.num_filters, self.num_channels) +
                               self.filter_size, name='W')
        add_role(W, FILTER)
        self.parameters.append(W)
        self.add_auxiliary_variable(W.norm(2), name='W_norm')
        if self.use_bias:
            if self.tied_biases:
                b = shared_floatx_nans((self.num_filters,), name='b')
            else:
                # this error is raised here instead of during
                # initialization because ConvolutionalSequence may
                # specify the image size
                if self.image_size == (None, None, None):
                    raise ValueError('Cannot infer bias size without '
                                     'image_size specified. If you use '
                                     'variable image_size, you should use '
                                     'tied_biases=True.')
                b = shared_floatx_nans(self.get_dim('output'), name='b')
            add_role(b, BIAS)
            self.parameters.append(b)
            self.add_auxiliary_variable(b.norm(2), name='b_norm')

    def _initialize(self):
        if self.use_bias:
            W, b = self.parameters
            self.biases_init.initialize(b, self.rng)
        else:
            W, = self.parameters
        self.weights_init.initialize(W, self.rng)

    @application(inputs=['input_'], outputs=['output'])
    def apply(self, input_):
        """Perform the convolution.

        Parameters
        ----------
        input_ : :class:`~tensor.TensorVariable`
            A 5D tensor with the axes representing batch size, number of
            channels, duration, height and width.

        Returns
        -------
        output : :class:`~tensor.TensorVariable`
            A 5D tensor of filtered videos (feature maps) with dimensions
            representing batch size, number of filters, feature map
            duration, height and width.

            The size of each feature map dimension depends on the border
            mode. For 'valid' it is ``image_size - filter_size + 1`` while
            for 'full' it is ``image_size + filter_size - 1``.
        """
        if self.use_bias:
            W, b = self.parameters
        else:
            W, = self.parameters
        if self.cudnn_impl:
            output = dnn_conv3d(input_, W,
                                subsample=tuple(self.step),
                                border_mode=self.padding)
        else:
            output = GpuCorr3dMM(subsample=tuple(self.step),
                                 pad=self.padding)(input_, W)
        if self.use_bias:
            if self.tied_biases:
                output += b.dimshuffle('x', 0, 'x', 'x', 'x')
            else:
                output += b.dimshuffle('x', 0, 1, 2, 3)
        return output

    def get_dim(self, name):
        if name == 'input_':
            return (self.num_channels,) + self.image_size
        if name == 'output':
            return ((self.num_filters,) +
                    tuple((i + 2 * pad - k) // d + 1
                          for i, k, d, pad in zip(self.image_size,
                                                  self.filter_size,
                                                  self.step,
                                                  self.padding)))
        return super(Convolutional, self).get_dim(name)
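

# A minimal usage sketch for `Convolutional` (illustrative only; the
# initialization schemes below are assumptions and any scheme from
# `blocks.initialization` works). Building the actual graph requires a
# CUDA-enabled Theano, since `GpuCorr3dMM` is a GPU-only op.
def _example_convolutional():
    import theano.tensor as tensor
    from blocks.initialization import Constant, IsotropicGaussian
    conv = Convolutional(filter_size=(3, 3, 3), num_filters=16,
                         num_channels=1, image_size=(16, 32, 32),
                         tied_biases=True,
                         weights_init=IsotropicGaussian(0.01),
                         biases_init=Constant(0))
    conv.initialize()
    # 'valid' border mode: every dimension shrinks by filter_size - 1
    assert conv.get_dim('output') == (16, 14, 30, 30)
    # 5D input: (batch, channels, duration, height, width)
    x = tensor.TensorType('float32', (False,) * 5)('x')
    return conv.apply(x)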


class MaxPooling(Initializable, Feedforward):
    """Max pooling layer.

    Parameters
    ----------
    pooling_size : tuple
        The duration, height and width of the pooling region, i.e. this
        is the factor by which your input's last three dimensions will be
        downscaled.
    step : tuple, optional
        The temporal, vertical and horizontal shift (stride) between
        pooling regions. By default this is equal to `pooling_size`.
        Setting this to a lower number results in overlapping pooling
        regions.
    input_dim : tuple, optional
        A tuple of integers representing the shape of the input. The last
        three dimensions will be used to calculate the output dimension.
    """
    @lazy(allocation=['pooling_size'])
    def __init__(self, pooling_size, step=None, input_dim=None, **kwargs):
        super(MaxPooling, self).__init__(**kwargs)
        self.input_dim = input_dim
        self.pooling_size = pooling_size
        self.step = step

    @application(inputs=['input_'], outputs=['output'])
    def apply(self, input_):
        """Apply the pooling (subsampling) transformation.

        Parameters
        ----------
        input_ : :class:`~tensor.TensorVariable`
            A 5D tensor with the axes representing batch size, number of
            channels, duration, height and width. The last three
            dimensions will be downsampled.

        Returns
        -------
        output : :class:`~tensor.TensorVariable`
            A tensor with the same number of dimensions as `input_`, but
            with the last three dimensions downsampled.
        """
        if self.pooling_size == (1, 1, 1):
            return input_
        # the docstring promises that the step defaults to pooling_size
        step = self.pooling_size if self.step is None else self.step
        # spatial pooling over the last two dimensions: fold channels and
        # duration together so that dnn_pool sees a 4D tensor
        input__shape = input_.shape
        input_ = input_.reshape((input__shape[0],
                                 input__shape[1] * input__shape[2],
                                 input__shape[3], input__shape[4]))
        p = dnn_pool(img=input_, ws=tuple(self.pooling_size[1:]),
                     stride=tuple(step[1:]))
        p_shape = p.shape
        p = p.reshape((p_shape[0], input__shape[1], input__shape[2],
                       p_shape[2], p_shape[3]))
        # temporal pooling over the duration dimension: fold height and
        # width together and pool with a (duration, 1) window
        p_shape = p.shape
        p = p.reshape((p_shape[0], p_shape[1], p_shape[2],
                       p_shape[3] * p_shape[4]))
        output = dnn_pool(img=p, ws=(self.pooling_size[0], 1),
                          stride=(step[0], 1))
        output_shape = output.shape
        output = output.reshape((output_shape[0], output_shape[1],
                                 output_shape[2], p_shape[3], p_shape[4]))
        return output

    def get_dim(self, name):
        if name == 'input_':
            return self.input_dim
        if name == 'output':
            step = self.pooling_size if self.step is None else self.step
            return ((self.input_dim[0],) +
                    tuple((a - b) // c + 1 for a, b, c in
                          zip(self.input_dim[1:], self.pooling_size,
                              step)))
        return super(MaxPooling, self).get_dim(name)
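

# A shape-inference sketch for `MaxPooling` (illustrative only): with the
# default step equal to `pooling_size`, each of the last three dimensions
# is divided by the corresponding pooling factor.
def _example_max_pooling():
    pool = MaxPooling(pooling_size=(2, 2, 2), input_dim=(16, 14, 30, 30))
    assert pool.get_dim('output') == (16, 7, 15, 15)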


class _AllocationMixin(object):
    def _push_allocation_config(self):
        for attr in ['filter_size', 'num_filters', 'border_mode',
                     'batch_size', 'num_channels', 'image_size',
                     'tied_biases', 'use_bias']:
            setattr(self.convolution, attr, getattr(self, attr))


class ConvolutionalActivation(_AllocationMixin, Sequence, Initializable):
    """A convolution followed by an activation function.

    Parameters
    ----------
    activation : :class:`.BoundApplication`
        The application method to apply after convolution (i.e.
        the nonlinear activation function).

    See Also
    --------
    :class:`Convolutional` : For the documentation of other parameters.
    """
    @lazy(allocation=['filter_size', 'num_filters', 'num_channels'])
    def __init__(self, activation, filter_size, num_filters, num_channels,
                 batch_size=None, image_size=None, step=(1, 1, 1),
                 border_mode='valid', tied_biases=False, **kwargs):
        self.convolution = Convolutional()
        self.filter_size = filter_size
        self.num_filters = num_filters
        self.num_channels = num_channels
        self.batch_size = batch_size
        self.image_size = image_size
        self.step = step
        self.border_mode = border_mode
        self.tied_biases = tied_biases
        super(ConvolutionalActivation, self).__init__(
            application_methods=[self.convolution.apply, activation],
            **kwargs)

    def get_dim(self, name):
        # TODO The name of the activation output doesn't need to be
        # `output`
        return self.convolution.get_dim(name)

    def _push_allocation_config(self):
        super(ConvolutionalActivation, self)._push_allocation_config()
        self.convolution.step = self.step
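

# An illustrative sketch of `ConvolutionalActivation`: a 3D convolution
# followed by a rectifier. `Rectifier` comes from `blocks.bricks`; the
# sizes and initialization schemes below are arbitrary assumptions.
def _example_convolutional_activation():
    from blocks.bricks import Rectifier
    from blocks.initialization import Constant, IsotropicGaussian
    brick = ConvolutionalActivation(
        Rectifier().apply, filter_size=(3, 3, 3), num_filters=8,
        num_channels=1, image_size=(16, 32, 32), tied_biases=True,
        weights_init=IsotropicGaussian(0.01), biases_init=Constant(0))
    brick.initialize()
    assert brick.get_dim('output') == (8, 14, 30, 30)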


class ConvolutionalLayer(_AllocationMixin, Sequence, Initializable):
    """A complete convolutional layer: Convolution, nonlinearity, pooling.

    Parameters
    ----------
    activation : :class:`.BoundApplication`
        The application method to apply in the detector stage (i.e. the
        nonlinearity before pooling). Needed for ``__init__``.

    See Also
    --------
    :class:`Convolutional` : Documentation of convolution arguments.
    :class:`MaxPooling` : Documentation of pooling arguments.

    Notes
    -----
    Uses max pooling.
    """
    @lazy(allocation=['filter_size', 'num_filters', 'pooling_size',
                      'num_channels'])
    def __init__(self, activation, filter_size, num_filters, pooling_size,
                 num_channels, conv_step=(1, 1, 1), pooling_step=None,
                 batch_size=None, image_size=None, border_mode='valid',
                 tied_biases=False, pool_mode='max', **kwargs):
        self.convolution = ConvolutionalActivation(activation)
        self.pooling = MaxPooling()
        super(ConvolutionalLayer, self).__init__(
            application_methods=[self.convolution.apply,
                                 self.pooling.apply], **kwargs)
        self.convolution.name = self.name + '_convolution'
        self.pooling.name = self.name + '_pooling'
        self.filter_size = filter_size
        self.num_filters = num_filters
        self.num_channels = num_channels
        self.pooling_size = pooling_size
        self.conv_step = conv_step
        self.pooling_step = pooling_step
        self.batch_size = batch_size
        self.border_mode = border_mode
        self.image_size = image_size
        self.tied_biases = tied_biases
        self.pool_mode = pool_mode

    def _push_allocation_config(self):
        super(ConvolutionalLayer, self)._push_allocation_config()
        self.convolution.step = self.conv_step
        self.convolution._push_allocation_config()
        if self.image_size is not None:
            pooling_input_dim = self.convolution.get_dim('output')
        else:
            pooling_input_dim = None
        self.pooling.input_dim = pooling_input_dim
        self.pooling.pooling_size = self.pooling_size
        self.pooling.step = self.pooling_step
        self.pooling.batch_size = self.batch_size

    def get_dim(self, name):
        if name == 'input_':
            return self.convolution.get_dim('input_')
        if name == 'output':
            return self.pooling.get_dim('output')
        return super(ConvolutionalLayer, self).get_dim(name)
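

# An illustrative sketch of a full `ConvolutionalLayer`: convolution,
# rectifier and (2, 2, 2) max pooling. After pushing the allocation
# config, the output shape can be read off without compiling anything.
def _example_convolutional_layer():
    from blocks.bricks import Rectifier
    layer = ConvolutionalLayer(
        Rectifier().apply, filter_size=(3, 3, 3), num_filters=16,
        pooling_size=(2, 2, 2), num_channels=1, image_size=(16, 32, 32))
    layer.push_allocation_config()
    # convolution: (16, 32, 32) -> (14, 30, 30); pooling halves each axis
    assert layer.get_dim('output') == (16, 7, 15, 15)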


class ConvolutionalSequence(Sequence, Initializable, Feedforward):
    """A sequence of convolutional operations.

    Parameters
    ----------
    layers : list
        List of convolutional bricks (i.e. :class:`ConvolutionalActivation`
        or :class:`ConvolutionalLayer`).
    num_channels : int
        Number of input channels in the video. For the first layer this
        is normally 1 for grayscale videos and 3 for color (RGB) videos.
        For subsequent layers this is equal to the number of filters
        output by the previous convolutional layer.
    batch_size : int, optional
        Number of examples per batch. If given, this will be passed to
        the Theano convolution operator, possibly resulting in faster
        execution.
    image_size : tuple, optional
        The duration, height and width of the input (video or feature
        map). If given, this will be passed to the Theano convolution
        operator, possibly resulting in faster execution.
    border_mode : 'valid', 'full' or None, optional
        The border mode to use, see :func:`scipy.signal.convolve2d` for
        details. Unlike with :class:`Convolutional`, this defaults to
        None, in which case no default value is pushed down to child
        bricks at allocation time. Child bricks will in this case
        need to rely on either a default border mode (usually 'valid')
        or one provided at construction and/or after construction
        (but before allocation).

    Notes
    -----
    The passed convolutional operators should be 'lazy' constructed, that
    is, without specifying the batch_size, num_channels and image_size.
    The main feature of :class:`ConvolutionalSequence` is that it will
    set the input dimensions of a layer to the output dimensions of the
    previous layer by the :meth:`~.Brick.push_allocation_config` method.

    The reason the `border_mode` parameter behaves the way it does is
    that pushing a single default `border_mode` makes it very difficult
    to have child bricks with different border modes. Normally, such
    things would be overridden after `push_allocation_config()`, but
    this is a particular hassle as the border mode affects the allocation
    parameters of every subsequent child brick in the sequence. Thus,
    only an explicitly specified border mode will be pushed down the
    hierarchy.
    """
    @lazy(allocation=['num_channels'])
    def __init__(self, layers, num_channels, batch_size=None,
                 image_size=None, border_mode=None, tied_biases=False,
                 **kwargs):
        self.layers = layers
        self.image_size = image_size
        self.num_channels = num_channels
        self.batch_size = batch_size
        self.border_mode = border_mode
        self.tied_biases = tied_biases
        application_methods = [brick.apply for brick in layers]
        super(ConvolutionalSequence, self).__init__(
            application_methods=application_methods, **kwargs)

    def get_dim(self, name):
        if name == 'input_':
            return (self.num_channels,) + self.image_size
        if name == 'output':
            return self.layers[-1].get_dim(name)
        return super(ConvolutionalSequence, self).get_dim(name)

    def _push_allocation_config(self):
        num_channels = self.num_channels
        image_size = self.image_size
        for layer in self.layers:
            if self.border_mode is not None:
                layer.border_mode = self.border_mode
            layer.tied_biases = self.tied_biases
            layer.image_size = image_size
            layer.num_channels = num_channels
            layer.batch_size = self.batch_size
            layer.use_bias = self.use_bias
            # Push input dimensions to children
            layer._push_allocation_config()
            # Retrieve output dimensions and set them for the next layer
            if layer.image_size is not None:
                output_shape = layer.get_dim('output')
                image_size = output_shape[1:]
            num_channels = layer.num_filters


class Flattener(Brick):
    """Flattens the input.

    It may be used to pass multidimensional objects, such as images or
    the feature maps of convolutional bricks, into bricks which allow
    only two-dimensional input (batch, features), such as an MLP.
    """
    @application(inputs=['input_'], outputs=['output'])
    def apply(self, input_):
        return input_.flatten(ndim=2)
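

# End-to-end sketch (illustrative; requires a CUDA-enabled Theano): two
# convolutional layers whose shapes are wired up by
# `ConvolutionalSequence`, followed by `Flattener` so the features can be
# fed to an MLP. The initialization schemes are assumptions.
if __name__ == '__main__':
    import theano.tensor as tensor
    from blocks.bricks import Rectifier
    from blocks.initialization import Constant, IsotropicGaussian

    sequence = ConvolutionalSequence(
        [ConvolutionalLayer(Rectifier().apply, (3, 3, 3), 16, (2, 2, 2)),
         ConvolutionalLayer(Rectifier().apply, (3, 3, 3), 32, (2, 2, 2))],
        num_channels=1, image_size=(16, 32, 32), tied_biases=True,
        weights_init=IsotropicGaussian(0.01), biases_init=Constant(0))
    sequence.initialize()
    # (batch, channels, duration, height, width)
    x = tensor.TensorType('float32', (False,) * 5)('x')
    # 2D output: (batch, features), ready for an MLP
    features = Flattener().apply(sequence.apply(x))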