From 6a990ce1d00b559c4df76a2e15338cc00796a3ab Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Sun, 26 Jul 2020 19:21:01 +0000 Subject: [PATCH 1/4] Fast implementation of Select for most cases on CPU Fixes #684 Measured: enes.student.tiny11 xzcat sources.shuf.xz |head -n 10000 var (Cascade Lake) single core based on intgemm_reintegrated_computestats branch Before Total time: 66.69077s wall After Total time: 61.20206s wall --- src/tensors/cpu/tensor_operators.cpp | 128 +++++++++++++++++++-------- 1 file changed, 91 insertions(+), 37 deletions(-) diff --git a/src/tensors/cpu/tensor_operators.cpp b/src/tensors/cpu/tensor_operators.cpp index 5f56e6340..260495fbc 100755 --- a/src/tensors/cpu/tensor_operators.cpp +++ b/src/tensors/cpu/tensor_operators.cpp @@ -654,40 +654,63 @@ void PasteCols(Tensor out_, } } -#if 0 // this version seems to actually be buggy, but also not used in decoding? -// Optimized version of Select for axis=2 -// @TODO: make this generally fast without this special version -void SelectAxis2(Tensor out, - const Tensor in, - const Tensor indices) { - - std::cerr << indices->debug() << std::endl; - - matchOrAbort(indices->type()); - - functional::Shape outShape = out->shape(); - functional::Shape inShape = in->shape(); - - auto idxData = indices->data(); - auto odata = out->data(); - const auto idata = in->data(); - - int size = outShape[3]; - - for(int k = 0; k < outShape[0]; ++k) { - for(int j = 0; j < outShape[1]; ++j) { - int outOffset = k * j * outShape[2] * size + j * outShape[2] * size; - int inOffset = k * j * inShape[2] * size + j * inShape[2] * size; - for(int i = 0; i < outShape[2]; ++i) { - auto idx = idxData[i]; - int outIndex = outOffset + i * size; - int inIndex = inOffset + idx * size; - std::copy(idata + inIndex, idata + inIndex + size, odata + outIndex); +/* Recursive template to implement LoopBeforeAxis. */ +template struct LoopBeforeAxisImpl { + static inline void Loop( + const functional::Shape &outShape, int outBase, + const functional::Shape &inShape, int inBase, + const functional::Shape &idxShape, int idxBase, + int axisCPU, + Backend backend) { + // Loop over this dimension. + const int dim = axisCPU - Before; + if (dim < 0) { + // This template is instantiated for every possible dimension, typically + // more than before the axis. + LoopBeforeAxisImpl::Loop(outShape, outBase, inShape, inBase, idxShape, idxBase, axisCPU, backend); + } else { + const int outStride = outShape.stride(dim); + const int end = outShape.dim(dim); + const int inStride = inShape.stride(dim); + const int idxStride = idxShape.bstride(dim); + for (int i = 0; i < end; ++i) { + LoopBeforeAxisImpl::Loop(outShape, outBase, inShape, inBase, idxShape, idxBase, axisCPU, backend); + outBase += outStride; + inBase += inStride; + idxBase += idxStride; } } } +}; + +/* We're at the axis, call the functor. */ +template struct LoopBeforeAxisImpl { + static inline void Loop( + const functional::Shape &, int outBase, + const functional::Shape &, int inBase, + const functional::Shape &, int idxBase, + int axisCPU, + Backend backend) { + backend(outBase, inBase, idxBase); + } +}; + +/* Jointly loop over dimensions [0, axisCPU) of three tensors out, in, and + * indices. Call the Backend functor for each iteration of the loop. + * Backend is a functor taking the tensors and base indices into them: + * Backend::operator()( + * int out_base, + * int in_base, + * int indices_base); + */ +template inline void LoopBeforeAxis( + const functional::Shape &outShape, + const functional::Shape &inShape, + const functional::Shape &idxShape, + int axisCPU, + Backend backend) { + LoopBeforeAxisImpl::Loop(outShape, 0, inShape, 0, idxShape, 0, axisCPU, backend); } -#endif void Select(Tensor out, const Tensor in, @@ -696,19 +719,50 @@ void Select(Tensor out, matchOrAbort(indices->type()); - // @TODO: make this efficient functional::Shape outShape = out->shape(); functional::Shape inShape = in->shape(); functional::Shape idxShape = indices->shape(); - int length = outShape.elements(); - functional::Array dims; int axisCPU = (int)(axis + functional::Shape::size() - out->shape().size()); -#if 0 // buggy but not really used? - if(axisCPU == 2 && outShape == idxShape) // specialization for axis==2 when there is no broadcasting, @TODO to be removed once we have a faster implementation below - return SelectAxis2(out, in, indices); -#endif + // Are all index dimensions 1 after the axis? + bool flatIndices = true; + // Total dimensionality of input and output after the axis. + int afterAxis = 1; + for (int i = axisCPU + 1; i < functional::Shape::size(); ++i) { + afterAxis *= outShape[i]; + if (idxShape[i] != 1) { + flatIndices = false; + } + } + /* Faster version based on copying. Requirements: + * input is contiguous for every dimension after the axis. + * output is contiguous for every dimension after the axis. + * indices have shape 1 for every dimension after the axis. + */ + if (afterAxis == inShape.stride(axisCPU) && afterAxis == outShape.stride(axisCPU) && flatIndices) { + const int end = outShape.dim(axisCPU); + const int outStride = outShape.stride(axisCPU); + const int idxStride = idxShape.bstride(axisCPU); + // Loop over all dimensions before the axis. + LoopBeforeAxis(outShape, inShape, idxShape, axisCPU, + [out, in, indices, afterAxis, end, outStride, idxStride, axisCPU](int outBase, int inBase, int idxBase) { + // Loop over the axis dimension. + for (int i = 0; i < end; ++i) { + int index = indices->data()[idxBase]; + // Loop over all dimensions after the axis. + std::copy(in->data() + inBase + index * afterAxis, in->data() + inBase + index * afterAxis + afterAxis, out->data() + outBase); + outBase += outStride; + idxBase += idxStride; + } + }); + return; + } + + // @TODO: make this efficient + int length = outShape.elements(); + // Loop over outer dimensions (those before the axis). + functional::Array dims; for(int index = 0; index < length; ++index) { outShape.dims(index, dims); // compute dimension-based indices from global index; From 3e8d1fb0190df898728323697cdcbfb5aab0f8e1 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Sun, 26 Jul 2020 19:31:42 +0000 Subject: [PATCH 2/4] Changelog entry for Select --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 37b4becec..9ec83a582 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. and translation with options --tsv and --tsv-fields n. ### Fixed +- Fast implementation of Select for most cases on CPU - Fix compilation without BLAS installed - Providing a single value to vector-like options using the equals sign, e.g. --models=model.npz - Fix quiet-translation in marian-server From 0b4dba8c6974be668c9fe57821a4307e83558a4c Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Sun, 26 Jul 2020 20:33:36 +0100 Subject: [PATCH 3/4] Unreferenced parameter --- src/tensors/cpu/tensor_operators.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tensors/cpu/tensor_operators.cpp b/src/tensors/cpu/tensor_operators.cpp index 260495fbc..0f9a9fc68 100755 --- a/src/tensors/cpu/tensor_operators.cpp +++ b/src/tensors/cpu/tensor_operators.cpp @@ -689,7 +689,7 @@ template struct LoopBeforeAxisImpl { const functional::Shape &, int outBase, const functional::Shape &, int inBase, const functional::Shape &, int idxBase, - int axisCPU, + int /*axisCPU*/, Backend backend) { backend(outBase, inBase, idxBase); } From 450d1ec6a4e915519a9088076f3396b9002bdcf3 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Sun, 26 Jul 2020 20:49:01 +0100 Subject: [PATCH 4/4] Unused lambda capture --- src/tensors/cpu/tensor_operators.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tensors/cpu/tensor_operators.cpp b/src/tensors/cpu/tensor_operators.cpp index 0f9a9fc68..756fbb306 100755 --- a/src/tensors/cpu/tensor_operators.cpp +++ b/src/tensors/cpu/tensor_operators.cpp @@ -746,7 +746,7 @@ void Select(Tensor out, const int idxStride = idxShape.bstride(axisCPU); // Loop over all dimensions before the axis. LoopBeforeAxis(outShape, inShape, idxShape, axisCPU, - [out, in, indices, afterAxis, end, outStride, idxStride, axisCPU](int outBase, int inBase, int idxBase) { + [out, in, indices, afterAxis, end, outStride, idxStride](int outBase, int inBase, int idxBase) { // Loop over the axis dimension. for (int i = 0; i < end; ++i) { int index = indices->data()[idxBase];