diff --git a/CHANGELOG.md b/CHANGELOG.md index 37b4becec..9ec83a582 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. and translation with options --tsv and --tsv-fields n. ### Fixed +- Fast implementation of Select for most cases on CPU - Fix compilation without BLAS installed - Providing a single value to vector-like options using the equals sign, e.g. --models=model.npz - Fix quiet-translation in marian-server diff --git a/src/tensors/cpu/tensor_operators.cpp b/src/tensors/cpu/tensor_operators.cpp index 5f56e6340..756fbb306 100755 --- a/src/tensors/cpu/tensor_operators.cpp +++ b/src/tensors/cpu/tensor_operators.cpp @@ -654,40 +654,63 @@ void PasteCols(Tensor out_, } } -#if 0 // this version seems to actually be buggy, but also not used in decoding? -// Optimized version of Select for axis=2 -// @TODO: make this generally fast without this special version -void SelectAxis2(Tensor out, - const Tensor in, - const Tensor indices) { - - std::cerr << indices->debug() << std::endl; - - matchOrAbort(indices->type()); - - functional::Shape outShape = out->shape(); - functional::Shape inShape = in->shape(); - - auto idxData = indices->data(); - auto odata = out->data(); - const auto idata = in->data(); - - int size = outShape[3]; - - for(int k = 0; k < outShape[0]; ++k) { - for(int j = 0; j < outShape[1]; ++j) { - int outOffset = k * j * outShape[2] * size + j * outShape[2] * size; - int inOffset = k * j * inShape[2] * size + j * inShape[2] * size; - for(int i = 0; i < outShape[2]; ++i) { - auto idx = idxData[i]; - int outIndex = outOffset + i * size; - int inIndex = inOffset + idx * size; - std::copy(idata + inIndex, idata + inIndex + size, odata + outIndex); +/* Recursive template to implement LoopBeforeAxis. */ +template struct LoopBeforeAxisImpl { + static inline void Loop( + const functional::Shape &outShape, int outBase, + const functional::Shape &inShape, int inBase, + const functional::Shape &idxShape, int idxBase, + int axisCPU, + Backend backend) { + // Loop over this dimension. + const int dim = axisCPU - Before; + if (dim < 0) { + // This template is instantiated for every possible dimension, typically + // more than before the axis. + LoopBeforeAxisImpl::Loop(outShape, outBase, inShape, inBase, idxShape, idxBase, axisCPU, backend); + } else { + const int outStride = outShape.stride(dim); + const int end = outShape.dim(dim); + const int inStride = inShape.stride(dim); + const int idxStride = idxShape.bstride(dim); + for (int i = 0; i < end; ++i) { + LoopBeforeAxisImpl::Loop(outShape, outBase, inShape, inBase, idxShape, idxBase, axisCPU, backend); + outBase += outStride; + inBase += inStride; + idxBase += idxStride; } } } +}; + +/* We're at the axis, call the functor. */ +template struct LoopBeforeAxisImpl { + static inline void Loop( + const functional::Shape &, int outBase, + const functional::Shape &, int inBase, + const functional::Shape &, int idxBase, + int /*axisCPU*/, + Backend backend) { + backend(outBase, inBase, idxBase); + } +}; + +/* Jointly loop over dimensions [0, axisCPU) of three tensors out, in, and + * indices. Call the Backend functor for each iteration of the loop. + * Backend is a functor taking the tensors and base indices into them: + * Backend::operator()( + * int out_base, + * int in_base, + * int indices_base); + */ +template inline void LoopBeforeAxis( + const functional::Shape &outShape, + const functional::Shape &inShape, + const functional::Shape &idxShape, + int axisCPU, + Backend backend) { + LoopBeforeAxisImpl::Loop(outShape, 0, inShape, 0, idxShape, 0, axisCPU, backend); } -#endif void Select(Tensor out, const Tensor in, @@ -696,19 +719,50 @@ void Select(Tensor out, matchOrAbort(indices->type()); - // @TODO: make this efficient functional::Shape outShape = out->shape(); functional::Shape inShape = in->shape(); functional::Shape idxShape = indices->shape(); - int length = outShape.elements(); - functional::Array dims; int axisCPU = (int)(axis + functional::Shape::size() - out->shape().size()); -#if 0 // buggy but not really used? - if(axisCPU == 2 && outShape == idxShape) // specialization for axis==2 when there is no broadcasting, @TODO to be removed once we have a faster implementation below - return SelectAxis2(out, in, indices); -#endif + // Are all index dimensions 1 after the axis? + bool flatIndices = true; + // Total dimensionality of input and output after the axis. + int afterAxis = 1; + for (int i = axisCPU + 1; i < functional::Shape::size(); ++i) { + afterAxis *= outShape[i]; + if (idxShape[i] != 1) { + flatIndices = false; + } + } + /* Faster version based on copying. Requirements: + * input is contiguous for every dimension after the axis. + * output is contiguous for every dimension after the axis. + * indices have shape 1 for every dimension after the axis. + */ + if (afterAxis == inShape.stride(axisCPU) && afterAxis == outShape.stride(axisCPU) && flatIndices) { + const int end = outShape.dim(axisCPU); + const int outStride = outShape.stride(axisCPU); + const int idxStride = idxShape.bstride(axisCPU); + // Loop over all dimensions before the axis. + LoopBeforeAxis(outShape, inShape, idxShape, axisCPU, + [out, in, indices, afterAxis, end, outStride, idxStride](int outBase, int inBase, int idxBase) { + // Loop over the axis dimension. + for (int i = 0; i < end; ++i) { + int index = indices->data()[idxBase]; + // Loop over all dimensions after the axis. + std::copy(in->data() + inBase + index * afterAxis, in->data() + inBase + index * afterAxis + afterAxis, out->data() + outBase); + outBase += outStride; + idxBase += idxStride; + } + }); + return; + } + + // @TODO: make this efficient + int length = outShape.elements(); + // Loop over outer dimensions (those before the axis). + functional::Array dims; for(int index = 0; index < length; ++index) { outShape.dims(index, dims); // compute dimension-based indices from global index;