From e7fc473ae9f27782310d814a9b811925a39b21b0 Mon Sep 17 00:00:00 2001 From: Balint Cristian Date: Thu, 25 Jul 2019 15:48:54 +0300 Subject: [PATCH] Add Winograd matrices computation. --- topi/python/topi/arm_cpu/conv2d.py | 61 +------ topi/python/topi/cuda/conv2d_winograd.py | 61 +------ topi/python/topi/mali/conv2d.py | 60 +------ topi/python/topi/nn/conv2d.py | 31 +--- topi/python/topi/nn/winograd_util.py | 151 ++++++++++++++++++ .../tests/python/test_topi_conv2d_winograd.py | 7 + 6 files changed, 184 insertions(+), 187 deletions(-) create mode 100644 topi/python/topi/nn/winograd_util.py diff --git a/topi/python/topi/arm_cpu/conv2d.py b/topi/python/topi/arm_cpu/conv2d.py index 9f08f686dde7..62df8f9fbfbe 100644 --- a/topi/python/topi/arm_cpu/conv2d.py +++ b/topi/python/topi/arm_cpu/conv2d.py @@ -20,20 +20,19 @@ import warnings -import numpy as np - import tvm from tvm import autotvm import tvm.contrib.nnpack from ..generic import schedule_conv2d_nchw, schedule_conv2d_winograd_without_weight_transform, \ schedule_conv2d_winograd_nnpack_without_weight_transform -from ..util import traverse_inline, get_const_tuple, const_matrix +from ..util import traverse_inline, get_const_tuple from ..nn import dilate, pad, conv2d, conv2d_alter_layout, \ conv2d_winograd_without_weight_transform, \ conv2d_winograd_nnpack_without_weight_transform, \ depthwise_conv2d_nchw from ..nn.util import get_const_int, get_pad_tuple +from ..nn.winograd_util import winograd_transform_matrices @autotvm.register_topi_compute(conv2d, 'arm_cpu', ['direct']) def conv2d_arm_cpu(cfg, data, kernel, strides, padding, dilation, layout, out_dtype): @@ -330,57 +329,14 @@ def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dt HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel) assert layout == 'NCHW' - assert KH == 3 and KW == 3 and HPAD == 1 and WPAD == 1 and HSTR == 1 and WSTR == 1 + assert KH == 3 and KW == 3 and HSTR == 1 and WSTR == 1 data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad") - if tile_size == 4: - G_data = np.array([ - [1 / 4.0, 0, 0], - [-1 / 6.0, -1 / 6.0, -1 / 6.0], - [-1 / 6.0, 1 / 6.0, -1 / 6.0], - [1 / 24.0, 1 / 12.0, 1 / 6.0], - [1 / 24.0, -1 / 12.0, 1 / 6.0], - [0, 0, 1]], dtype=np.float32) - - B_data = np.array([ - [4, 0, 0, 0, 0, 0], - [0, -4, 4, -2, 2, 4], - [-5, -4, -4, -1, -1, 0], - [0, 1, -1, 2, -2, -5], - [1, 1, 1, 1, 1, 0], - [0, 0, 0, 0, 0, 1]], out_dtype) - - A_data = np.array([ - [1, 0, 0, 0], - [1, 1, 1, 1], - [1, -1, 1, -1], - [1, 2, 4, 8], - [1, -2, 4, -8], - [0, 0, 0, 1]], out_dtype) - elif tile_size == 2: - G_data = np.array([ - [1, 0, 0], - [1.0/2, 1.0/2, 1.0/2], - [1.0/2, -1.0/2, 1.0/2], - [0, 0, 1]], np.float32) - - B_data = np.array([ - [1, 0, 0, 0], - [0, 1, -1, 1], - [-1, 1, 1, 0], - [0, 0, 0, -1]], out_dtype) - - A_data = np.array([ - [1, 0], - [1, 1], - [1, -1], - [0, -1]], out_dtype) - else: - raise ValueError("Unsupported tile size for winograd: " + str(tile_size)) - - m = A_data.shape[1] - r = 3 + r = KW + m = tile_size alpha = m + r - 1 + A, B, G = winograd_transform_matrices(m, r, out_dtype) + K = CO C = CI @@ -405,7 +361,6 @@ def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dt if pre_computed: U = kernel else: - G = const_matrix(G_data, 'G') r_kh = tvm.reduce_axis((0, KH), 'r_kh') r_kw = tvm.reduce_axis((0, KW), 'r_kw') U = tvm.compute((alpha, alpha, K // VK, C, VK), lambda eps, nu, k, c, kk: @@ -413,7 +368,6 @@ def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dt G[eps][r_kh] * G[nu][r_kw], axis=[r_kh, r_kw]), name='U') # transform image - B = const_matrix(B_data, 'B') r_eps = tvm.reduce_axis((0, alpha), 'r_eps') r_nu = tvm.reduce_axis((0, alpha), 'r_nu') V = tvm.compute((alpha, alpha, P // VP, C, VP), lambda eps, nu, b, c, bb: @@ -427,7 +381,6 @@ def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dt V[eps][nu][b // VP][c][b % VP], axis=c), name='M') # inverse transform - A = const_matrix(A_data, 'A') r_eps = tvm.reduce_axis((0, alpha), 'r_eps') r_nu = tvm.reduce_axis((0, alpha), 'r_nu') Y = tvm.compute((K, P, m, m), lambda k, b, vh, vw: diff --git a/topi/python/topi/cuda/conv2d_winograd.py b/topi/python/topi/cuda/conv2d_winograd.py index 3d46bcd81c70..29f14a0f708e 100644 --- a/topi/python/topi/cuda/conv2d_winograd.py +++ b/topi/python/topi/cuda/conv2d_winograd.py @@ -17,15 +17,14 @@ # pylint: disable=invalid-name,unused-variable,unused-argument """Winograd template for cuda backend""" -import numpy as np - import tvm from tvm import autotvm from .. import nn from ..nn import conv2d, group_conv2d_nchw, conv2d_winograd_without_weight_transform -from ..util import get_const_int, get_const_tuple, const_matrix, traverse_inline +from ..util import get_const_int, get_const_tuple, traverse_inline from ..generic import schedule_conv2d_winograd_without_weight_transform +from ..nn.winograd_util import winograd_transform_matrices def _infer_tile_size(data, kernel): @@ -54,7 +53,7 @@ def winograd_cuda(cfg, data, kernel, strides, padding, dilation, layout, out_dty CO, CI, KH, KW = get_const_tuple(kernel.shape) HPAD, WPAD, _, _ = nn.get_pad_tuple(padding, kernel) HSTR, WSTR = (strides, strides) if isinstance(strides, int) else strides - assert HSTR == 1 and WSTR == 1 and HPAD == 1 and WPAD == 1 and KH == 3 and KW == 3 + assert HSTR == 1 and WSTR == 1 and KH == KW else: # kernel tensor is pre-transfomred. this op is created by # alter op layout, do not check # dilation is not supported @@ -65,54 +64,11 @@ def winograd_cuda(cfg, data, kernel, strides, padding, dilation, layout, out_dty data_pad = nn.pad(data, (0, 0, HPAD, WPAD), (0, 0, HPAD, WPAD), name="data_pad") - if tile_size == 4: - G_data = np.array([ - [1 / 4.0, 0, 0], - [-1 / 6.0, -1 / 6.0, -1 / 6.0], - [-1 / 6.0, 1 / 6.0, -1 / 6.0], - [1 / 24.0, 1 / 12.0, 1 / 6.0], - [1 / 24.0, -1 / 12.0, 1 / 6.0], - [0, 0, 1]], dtype=np.float32) - - B_data = np.array([ - [4, 0, 0, 0, 0, 0], - [0, -4, 4, -2, 2, 4], - [-5, -4, -4, -1, -1, 0], - [0, 1, -1, 2, -2, -5], - [1, 1, 1, 1, 1, 0], - [0, 0, 0, 0, 0, 1]], out_dtype) - - A_data = np.array([ - [1, 0, 0, 0], - [1, 1, 1, 1], - [1, -1, 1, -1], - [1, 2, 4, 8], - [1, -2, 4, -8], - [0, 0, 0, 1]], out_dtype) - elif tile_size == 2: - G_data = np.array([ - [1, 0, 0], - [1.0/2, 1.0/2, 1.0/2], - [1.0/2, -1.0/2, 1.0/2], - [0, 0, 1]], np.float32) - - B_data = np.array([ - [1, 0, 0, 0], - [0, 1, -1, 1], - [-1, 1, 1, 0], - [0, 0, 0, -1]], out_dtype) - - A_data = np.array([ - [1, 0], - [1, 1], - [1, -1], - [0, -1]], out_dtype) - else: - raise ValueError("Unsupported tile size for winograd: " + str(tile_size)) - - m = A_data.shape[1] - r = 3 + r = KW + m = tile_size alpha = m + r - 1 + A, B, G = winograd_transform_matrices(m, r, out_dtype) + H = (H + 2 * HPAD - KH) // HSTR + 1 W = (W + 2 * WPAD - KW) // WSTR + 1 nH, nW = (H + m-1) // m, (W + m-1) // m @@ -120,7 +76,6 @@ def winograd_cuda(cfg, data, kernel, strides, padding, dilation, layout, out_dty # transform kernel if not pre_computed: - G = const_matrix(G_data, 'G') r_kh = tvm.reduce_axis((0, KH), name='r_kh') r_kw = tvm.reduce_axis((0, KW), name='r_kw') kernel_pack = tvm.compute((alpha, alpha, CI, CO), lambda eps, nu, ci, co: @@ -136,7 +91,6 @@ def winograd_cuda(cfg, data, kernel, strides, padding, dilation, layout, out_dty [p % nW * m + nu], name='d') # transform data - B = const_matrix(B_data) r_a = tvm.reduce_axis((0, alpha), 'r_a') r_b = tvm.reduce_axis((0, alpha), 'r_a') data_pack = tvm.compute((alpha, alpha, CI, P), lambda eps, nu, ci, p: @@ -151,7 +105,6 @@ def winograd_cuda(cfg, data, kernel, strides, padding, dilation, layout, out_dty axis=[ci]), name='bgemm') # inverse transform - A = const_matrix(A_data) r_a = tvm.reduce_axis((0, alpha), 'r_a') r_b = tvm.reduce_axis((0, alpha), 'r_a') inverse = tvm.compute((CO, P, m, m), lambda co, p, vh, vw: diff --git a/topi/python/topi/mali/conv2d.py b/topi/python/topi/mali/conv2d.py index 9a333353f41d..ddb34628c861 100644 --- a/topi/python/topi/mali/conv2d.py +++ b/topi/python/topi/mali/conv2d.py @@ -16,16 +16,15 @@ # under the License. # pylint: disable=invalid-name,unused-variable,unused-argument,no-else-return """conv2d schedule on ARM Mali GPU""" -import numpy as np - import tvm from tvm import autotvm from tvm.autotvm.task.space import get_factors from ..generic import schedule_conv2d_nchw, schedule_conv2d_winograd_without_weight_transform -from ..util import traverse_inline, get_const_int, get_const_tuple, const_matrix +from ..util import traverse_inline, get_const_int, get_const_tuple from ..nn import conv2d, conv2d_winograd_without_weight_transform, \ get_pad_tuple, pad, conv2d_alter_layout +from ..nn.winograd_util import winograd_transform_matrices # reuse some compute declarations from ARM CPU from ..arm_cpu.conv2d import _decl_spatial_pack, _alter_conv2d_layout_arm @@ -226,57 +225,13 @@ def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dt HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel) assert layout == 'NCHW' - assert KH == 3 and KW == 3 and HPAD == 1 and WPAD == 1 and HSTR == 1 and WSTR == 1 + assert KH == 3 and KW == 3 and HSTR == 1 and WSTR == 1 data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad") - if tile_size == 4: - G_data = np.array([ - [1 / 4.0, 0, 0], - [-1 / 6.0, -1 / 6.0, -1 / 6.0], - [-1 / 6.0, 1 / 6.0, -1 / 6.0], - [1 / 24.0, 1 / 12.0, 1 / 6.0], - [1 / 24.0, -1 / 12.0, 1 / 6.0], - [0, 0, 1]], out_dtype) - - B_data = np.array([ - [4, 0, 0, 0, 0, 0], - [0, -4, 4, -2, 2, 4], - [-5, -4, -4, -1, -1, 0], - [0, 1, -1, 2, -2, -5], - [1, 1, 1, 1, 1, 0], - [0, 0, 0, 0, 0, 1]], out_dtype) - - A_data = np.array([ - [1, 0, 0, 0], - [1, 1, 1, 1], - [1, -1, 1, -1], - [1, 2, 4, 8], - [1, -2, 4, -8], - [0, 0, 0, 1]], out_dtype) - elif tile_size == 2: - G_data = np.array([ - [1, 0, 0], - [1.0/2, 1.0/2, 1.0/2], - [1.0/2, -1.0/2, 1.0/2], - [0, 0, 1]], out_dtype) - - B_data = np.array([ - [1, 0, 0, 0], - [0, 1, -1, 1], - [-1, 1, 1, 0], - [0, 0, 0, -1]], out_dtype) - - A_data = np.array([ - [1, 0], - [1, 1], - [1, -1], - [0, -1]], out_dtype) - else: - raise ValueError("Unsupported tile size for winograd: " + str(tile_size)) - - m = A_data.shape[1] - r = 3 + r = KW + m = tile_size alpha = m + r - 1 + A, B, G = winograd_transform_matrices(m, r, out_dtype) H = (IH + 2 * HPAD - 3) // HSTR + 1 W = (IW + 2 * WPAD - 3) // WSTR + 1 @@ -321,7 +276,6 @@ def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dt if pre_computed: U = kernel else: - G = const_matrix(G_data, 'G') r_kh = tvm.reduce_axis((0, KH), 'r_kh') r_kw = tvm.reduce_axis((0, KW), 'r_kw') U = tvm.compute((alpha, alpha, CO // bna, CI, bna), lambda eps, nu, co, ci, vco: @@ -329,7 +283,6 @@ def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dt axis=[r_kh, r_kw]), name='U') # transform image - B = const_matrix(B_data, 'B') r_a = tvm.reduce_axis((0, alpha), 'r_a') r_b = tvm.reduce_axis((0, alpha), 'r_b') V = tvm.compute((alpha, alpha, P_round // bnb, CI, bnb), lambda eps, nu, p, ci, vp: @@ -342,7 +295,6 @@ def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dt tvm.sum(U[eps][nu][co // bna][ci][co % bna] * V[eps][nu][p // bnb][ci][p % bnb], axis=ci), name='M') - A = const_matrix(A_data, 'A') r_a = tvm.reduce_axis((0, alpha), 'r_a') r_b = tvm.reduce_axis((0, alpha), 'r_b') Y = tvm.compute((CO, P, m, m), lambda co, p, vh, vw: diff --git a/topi/python/topi/nn/conv2d.py b/topi/python/topi/nn/conv2d.py index bc49ba27d6a9..affbfcaa7dd8 100644 --- a/topi/python/topi/nn/conv2d.py +++ b/topi/python/topi/nn/conv2d.py @@ -19,12 +19,12 @@ """Conv2D operators""" from __future__ import absolute_import as _abs from collections import namedtuple -import numpy as np import tvm from .pad import pad from .util import get_pad_tuple -from ..util import simplify, const_matrix, get_const_tuple +from ..util import simplify, get_const_tuple +from .winograd_util import winograd_transform_matrices # workload description of conv2d Workload = namedtuple('Workload', @@ -425,7 +425,7 @@ def conv2d_winograd_weight_transform(kernel, tile_size): Parameters ---------- kernel: Tensor - The raw kernel tensor with layout "NCHW". Only 3x3 kernel is supported for now + The raw kernel tensor with layout "NCHW". tile_size: int Tile size of winograd transform. e.g. 2 for F(2x2, 3x3) and 4 for F(4x4, 3x3) @@ -434,34 +434,15 @@ def conv2d_winograd_weight_transform(kernel, tile_size): output : tvm.Tensor 4-D with shape [alpha, alpha, CO, CI] """ - K = 3 - shape = get_const_tuple(kernel.shape) - assert shape[2:] == (K, K), "Only support 3x3 kernel" + assert shape[2] == shape[3], "Only support NxN kernel" + K = shape[3] r = tile_size + K - 1 shape = (r, r) + shape[:2] - if tile_size == 2: - G_data = np.array([ - [1, 0, 0], - [1.0/2, 1.0/2, 1.0/2], - [1.0/2, -1.0/2, 1.0/2], - [0, 0, 1], - ], dtype=kernel.dtype) - elif tile_size == 4: - G_data = np.array([ - [1 / 4.0, 0, 0], - [-1 / 6.0, -1 / 6.0, -1 / 6.0], - [-1 / 6.0, 1 / 6.0, -1 / 6.0], - [1 / 24.0, 1 / 12.0, 1 / 6.0], - [1 / 24.0, -1 / 12.0, 1 / 6.0], - [0, 0, 1] - ], dtype=kernel.dtype) - else: - raise ValueError("Unsupoorted tile size:" + tile_size) + _, _, G = winograd_transform_matrices(tile_size, K, kernel.dtype) - G = const_matrix(G_data, 'G') r_kh = tvm.reduce_axis((0, K), name='r_kh') r_kw = tvm.reduce_axis((0, K), name='r_kw') return tvm.compute(shape, lambda eps, nu, co, ci: diff --git a/topi/python/topi/nn/winograd_util.py b/topi/python/topi/nn/winograd_util.py new file mode 100644 index 000000000000..db57f7671618 --- /dev/null +++ b/topi/python/topi/nn/winograd_util.py @@ -0,0 +1,151 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +""" Utility functions for implementing Winograd convolutions + [*] Fast Algorithms for Convolutional Neural Networks + Andrew Lavin, Scott Gray + https://arxiv.org/abs/1509.09308 + https://github.com/andravin/wincnn +""" + +from operator import mul +from functools import reduce +import numpy as np +from ..util import const_matrix + + +# pylint: disable=invalid-name +def _cook_toom_convolution(a, n, r): + """Compute Cook-Toom convolution A,B,G matrices""" + + def _F_m(a, n): + f = lambda j, i: reduce(mul, ((a[i]-a[k] if k != i else 1) for k in range(0, n-1)), 1) + F = np.fromfunction(np.vectorize(f), (1, n-1), dtype=int) + F = np.diagflat(F) + F = np.append(F, np.zeros((n-1, 1), dtype=int), axis=1) + f = lambda i, j: (1 if j == (n-1) else 0) + z = np.fromfunction(np.vectorize(f), (1, n), dtype=int) + + return np.append(F, z, axis=0) + + def _A_m(a, m, n): + f = lambda i, j: a[i]**j + A = np.fromfunction(np.vectorize(f), (m-1, n), dtype=int) + f = lambda i, j: (1 if j == (n-1) else 0) + z = np.fromfunction(np.vectorize(f), (1, n), dtype=int) + + return np.append(A, z, axis=0) + + def _B_m(a, n): + f = lambda j, i: reduce(mul, ((a[i]-a[k] if k != i else 1) for k in range(0, n-1)), 1) + Ff = np.fromfunction(np.vectorize(f), (1, n-1), dtype=int) + f = lambda i, nth: (reduce(mul, [(np.poly1d([1, -a[k]]) if k != i else 1) \ + for k in range(0, n-1)], 1)).coef[n-1-nth-1]/Ff[0, i] + F = np.fromfunction(np.vectorize(f), (n-1, n-1), dtype=int) + f = lambda i, j: -a[i]**(n-1) + t = np.fromfunction(np.vectorize(f), (n-1, 1), dtype=int) + T = np.append(np.eye(n-1), t, axis=1) + + return np.append(F.T.dot(T), np.array([np.eye(n)[n-1]]), axis=0) + + alpha = n + r - 1 + + f = _F_m(a, alpha) + + if f[0, 0] < 0: + f[0, :] *= -1 + + A = _A_m(a, alpha, n) + + G = _A_m(a, alpha, r).T + G = G.dot(np.linalg.inv(f)).T + + B = _B_m(a, alpha) + B = B.dot(f.T) + + return (A, B, G) + +def _interpolation_points(degree): + """Propose filter points""" + + assert 2 < degree < 18 + + # Default interpolation lookup table + # + # [1] Error Analysis and Improving the Accuracy of Winograd Convolution for Deep Neural Networks + # Barbara Barabasz, Andrew Anderson, Kirk M. Soodhalter, David Gregg + # https://arxiv.org/abs/1803.10986 + # + + # pylint: disable=bad-whitespace,line-too-long + in_pts = [ + # {invalid} + [], + #01 {E=4.63E-08 on conv2d [1]} + [], + #02 {E=7.65E-08 on F( 2,3) [1]} + [0, -1, 1], + #03 {E=2.35E-07 on F( 3,3) [1]} + [0, -1, 1, 1/2], + #04 {E=3.29E-07 on F( 4,3) [1]} + [0, -1, 1, 1/2, -2], + #05 {E=6.81E-07 on F( 5,3) [1]} + [0, -1, 1, 1/2, -2, -1/2], + #06 {E=8.79E-07 on F( 6,3) [1]} + [0, -1, 1, 1/2, -1/2, 2, -2], + #07 {E=3.71E-06 on F( 7,3) [1]} + [0, -1, 1, 1/2, -1/2, 2, -2, -1/4], + #08 {E=7.35E-06 on F( 8,3) [1]} + [0, -1, 1, 1/2, -1/2, 2, -2, -1/4, 4], + #09 {E=2.20E-05 on F( 9,3) [1]} + [0, -1, 1, 1/2, -1/2, 2, -2, -1/4, 3/4, -4/3], + #10 {E=3.22E-05 on F(10,3) [1]} + [0, -1, 1, 1/2, -1/2, 2, -2, -1/4, 4, 3/4, -4/3], + #11 {E=1.09E-04 on F(11,3) [1]} + [0, -1, 1, 1/2, -1/2, 2, -2, -1/4, 4, 3/4, -4/3, 1/4], + #12 {E=1.99E-04 on F(12,3) [1]} + [0, -1, 1, 1/2, -1/2, 2, -2, -1/4, 4, 1/4, -3/4, 4/3, -4], + #13 {E=5.54E-04 on F(13,3) [1]} + [0, -1, 1, 1/2, -1/2, 2, -2, -1/4, 4, 1/4, -3/4, 4/3, 3/4, -4/3], + #14 {E=8.80E-04 on F(14,3) [1]} + [0, -1, 1, 1/2, -1/2, 2, -2, -1/4, 4, 1/4, -3/4, 4/3, -4, 3/4, -4/3], + #15 {E=1.07E-02 on F(15,3) [1]} + [0, -1, 1, 1/2, -1/2, 2, -2, -1/4, 4, 1/4, -3/4, 4/3, -4, 2/3, -3/2, 3/2], + #16 {E=1.93E-02 on F(16,3) [1]} + [0, -1, 1, 1/2, -1/2, 2, -2, -1/4, 4, 1/4, -3/4, 4/3, -4, 2/3, -3/2, -2/3, 3/2] + ] # pylint: enable=bad-whitespace,line-too-long + + return np.array(in_pts[degree-1], dtype=np.float64) + +def winograd_transform_matrices(tile_size, kernel_size, out_dtype): + """Compute the A, B, and G transform matrices for `tile_size` as a `tvm.Expr`. + """ + if not 1 < tile_size < 9: + raise ValueError("Unsupported tile size for Winograd: {}".format(tile_size)) + if not 2 < kernel_size < 8: + raise ValueError("Unsupported kernel size for Winograd: {}".format(kernel_size)) + + degree = tile_size + kernel_size - 2 + + intp_pts = _interpolation_points(degree) + A_data, B_data, G_data = _cook_toom_convolution(intp_pts, tile_size, kernel_size) + + return ( + const_matrix(A_data.astype(out_dtype), "A"), + const_matrix(B_data.astype(out_dtype), "B"), + const_matrix(G_data.astype(out_dtype), "G"), + ) diff --git a/topi/tests/python/test_topi_conv2d_winograd.py b/topi/tests/python/test_topi_conv2d_winograd.py index b68ea4609fdb..cf176a8ab607 100644 --- a/topi/tests/python/test_topi_conv2d_winograd.py +++ b/topi/tests/python/test_topi_conv2d_winograd.py @@ -103,11 +103,18 @@ def test_conv2d_nchw(): autotvm.DispatchContext.current.silent = True with WinogradFallback(): + + # inception v3 workloads + verify_conv2d_nchw(1, 128, 17, 192, 7, 1, 3, devices=['cuda']) + verify_conv2d_nchw(1, 128, 17, 128, 7, 1, 3, devices=['cuda']) + verify_conv2d_nchw(1, 160, 17, 160, 7, 1, 3, devices=['cuda']) + # resnet 18 workloads verify_conv2d_nchw(1, 64, 56, 64, 3, 1, 1) verify_conv2d_nchw(1, 128, 28, 128, 3, 1, 1) verify_conv2d_nchw(1, 256, 14, 256, 3, 1, 1) verify_conv2d_nchw(1, 512, 7, 512, 3, 1, 1) + verify_conv2d_nchw(1, 48, 35, 64, 5, 1, 2, devices=['cuda']) # batch size = 2 verify_conv2d_nchw(2, 64, 56, 64, 3, 1, 1)