From 0307631a22e85fd0f1fa57809f3b82761efdf18d Mon Sep 17 00:00:00 2001 From: cecilia peng Date: Wed, 27 Mar 2024 15:55:04 +0800 Subject: [PATCH] [CPU] plugin implementation of ScatterElementsUpdate-12 (#23287) ### Details: - *[CPU] implement ScatterElementUpdate-12* - *...* ### Tickets: - *129813* --------- Co-authored-by: Li, Tingqian Co-authored-by: Maksim Kutakov --- .../intel_cpu/src/nodes/scatter_update.cpp | 501 +++++++++++++++--- .../intel_cpu/src/nodes/scatter_update.h | 64 ++- .../src/index_add_scatter_elements_update.cpp | 306 +++++++++++ .../scatter_elements_update.cpp | 33 ++ 4 files changed, 821 insertions(+), 83 deletions(-) create mode 100644 src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/index_add_scatter_elements_update.cpp diff --git a/src/plugins/intel_cpu/src/nodes/scatter_update.cpp b/src/plugins/intel_cpu/src/nodes/scatter_update.cpp index 09951387b68510..875a5b8bac53a3 100644 --- a/src/plugins/intel_cpu/src/nodes/scatter_update.cpp +++ b/src/plugins/intel_cpu/src/nodes/scatter_update.cpp @@ -10,6 +10,8 @@ #include "openvino/core/parallel.hpp" #include "openvino/opsets/opset3.hpp" #include "openvino/opsets/opset4.hpp" +#include "openvino/opsets/opset12.hpp" +#include "selective_build.h" #include #include @@ -23,12 +25,13 @@ namespace node { bool ScatterUpdate::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { - auto scatterElemUpd = ov::as_type_ptr(op); + auto scatterElemUpd3 = ov::as_type_ptr(op); + auto scatterElemUpd12 = ov::as_type_ptr(op); auto scatterUpd = ov::as_type_ptr(op); auto scatterNdUpd = ov::as_type_ptr(op); - if (!scatterElemUpd && !scatterUpd && !scatterNdUpd) { + if (!scatterElemUpd3 && !scatterElemUpd12 && !scatterUpd && !scatterNdUpd) { const std::string opType = op->get_type_name(); - errorMessage = "Only opset" + opType == "ScatterNDUpdate" ? "4 " : "3 " + opType + " operation is supported"; + errorMessage = std::string("Type ") + opType + " is not supported."; return false; } } catch (...) { @@ -53,6 +56,14 @@ ScatterUpdate::ScatterUpdate(const std::shared_ptr& op, const GraphCon } else { OPENVINO_THROW_NOT_IMPLEMENTED(errorMessage); } + + const auto node = ov::as_type_ptr(op); + if (node) { + reduction_type = node->get_reduction(); + use_init_val = node->get_use_init_val(); + } else { + reduction_type = ScatterUpdate::Reduction::NONE; + } } void ScatterUpdate::getSupportedDescriptors() { @@ -196,36 +207,23 @@ void ScatterUpdate::initSupportedPrimitiveDescriptors() { } dataPrec = getOriginalInputPrecisionAtPort(DATA_ID); + if (scatterUpdateMode == ScatterUpdateMode::ScatterElementsUpdate && + !one_of(dataPrec, ov::element::f32, ov::element::i32, + ov::element::bf16, ov::element::f16, + ov::element::u8, ov::element::i8)) { + dataPrec = ov::element::f32; + } dataSize = dataPrec.size(); - bool canBeInplace = !isDynamicNode() && getParentEdgeAt(DATA_ID)->getParent()->getChildEdges().size() == 1 && - !getParentEdgeAt(DATA_ID)->getParent()->isConstant(); + bool canBeInplace = !getParentEdgeAt(DATA_ID)->getParent()->isConstant(); - NodeConfig config; - if (axisRelaxed) { - config.inConfs.resize(4); - } else { - config.inConfs.resize(3); - } - config.outConfs.resize(1); - config.inConfs[DATA_ID].constant(false); - config.inConfs[INDICES_ID].constant(false); - config.inConfs[UPDATE_ID].constant(false); - config.outConfs[0].constant(false); - config.inConfs[DATA_ID].inPlace(canBeInplace ? 
0 : -1); - config.inConfs[INDICES_ID].inPlace(-1); - config.inConfs[UPDATE_ID].inPlace(-1); - config.outConfs[0].inPlace(canBeInplace ? 0 : -1); - if (axisRelaxed) { - config.inConfs[AXIS_ID].constant(false); - config.inConfs[AXIS_ID].inPlace(-1); - } - - std::vector inPortConfig{{LayoutType::ncsp, dataPrec}, {LayoutType::ncsp, indicesPrec}, {LayoutType::ncsp, dataPrec}}; + std::vector inPortConfig{{LayoutType::ncsp, dataPrec, false, canBeInplace ? 0 : -1}, + {LayoutType::ncsp, indicesPrec}, + {LayoutType::ncsp, dataPrec}}; if (axisRelaxed) inPortConfig.emplace_back(LayoutType::ncsp, axisPrec); addSupportedPrimDesc(inPortConfig, - {{LayoutType::ncsp, dataPrec}}, + {{LayoutType::ncsp, dataPrec, false, canBeInplace ? 0 : -1}}, impl_desc_type::unknown); } @@ -263,6 +261,398 @@ static std::vector getBlockND(const VectorDims& shape) { return blockND; } +namespace scatter_elements_update { +template +static T reduction_neutral_value(const ScatterUpdate::Reduction reduction_type) { + switch (reduction_type) { + case ScatterUpdate::Reduction::MAX: + return std::numeric_limits::lowest(); + case ScatterUpdate::Reduction::MIN: + return std::numeric_limits::max(); + case ScatterUpdate::Reduction::PROD: + return T{1}; + case ScatterUpdate::Reduction::SUM: + case ScatterUpdate::Reduction::MEAN: + case ScatterUpdate::Reduction::NONE: + return T{0}; + default: + OPENVINO_THROW("Neutral value not available for this type of reduction"); + return 0; + } +} + +static inline void getCoordinate(VectorDims& coordinate, size_t offset, const VectorDims& shape) { + size_t shapeRank = shape.size(); + for (int i = shapeRank - 1; i >= 0; i--) { + coordinate[i] = offset % shape[i]; + offset /= shape[i]; + } +} + +struct TensorIterator { + TensorIterator(const VectorDims& squashed_shape, const int64_t squashed_axis) : m_squashed_shape(squashed_shape), m_squashed_axis(squashed_axis) { + OPENVINO_ASSERT(m_squashed_shape[m_squashed_axis] == 1); + } + + std::array startover(const size_t start, const std::vector& dataBlockND, const std::vector& indicesBlockND) { + m_tensorIter.resize(m_squashed_shape.size(), 0); + getCoordinate(m_tensorIter, start, m_squashed_shape); + + size_t i, dst_idx = 0, indices_idx = 0; + for (i = 0; i < static_cast(m_squashed_axis); ++i) { + dst_idx += m_tensorIter[i] * dataBlockND[i + 1]; + indices_idx += m_tensorIter[i] * indicesBlockND[i + 1]; + } + for (i++; i < m_squashed_shape.size(); ++i) { + dst_idx += m_tensorIter[i] * dataBlockND[i + 1]; + indices_idx += m_tensorIter[i] * indicesBlockND[i + 1]; + } + + return {dst_idx, indices_idx}; + } + + void increment(std::array& offsets, const std::vector& dataBlockND, const std::vector& indicesBlockND) { + for (int64_t j = m_squashed_shape.size() - 1; j >= 0; j--) { + m_tensorIter[j]++; + if (m_tensorIter[j] < m_squashed_shape[j]) { // no need check if (j != axis) as it is squashed + offsets[0] += dataBlockND[j + 1]; + offsets[1] += indicesBlockND[j + 1]; + break; + } else { + m_tensorIter[j] = 0; + size_t i = 0; + for (offsets[0] = 0, offsets[1] = 0; i < m_squashed_axis; ++i) { + offsets[0] += m_tensorIter[i] * dataBlockND[i + 1]; + offsets[1] += m_tensorIter[i] * indicesBlockND[i + 1]; + } + for (i++; i < m_squashed_shape.size(); ++i) { + offsets[0] += m_tensorIter[i] * dataBlockND[i + 1]; + offsets[1] += m_tensorIter[i] * indicesBlockND[i + 1]; + } + } + } + } + + VectorDims m_tensorIter; + const VectorDims m_squashed_shape; + const size_t m_squashed_axis; +}; + +struct ScatterElementsUpdateContext { + ScatterUpdate* node; + MemoryPtr dstMemPtr; + 
MemoryPtr indicesMemPtr; + MemoryPtr updateMemPtr; + int axis; + ScatterUpdate::Reduction reduction_type; +}; + +// tier 2 dispatcher with Reduce which follows up DataType. +template +struct ScatterElementsUpdateReduceDispatcher { + void operator()(ScatterElementsUpdateContext& ctx) { + using kernel_t = typename PT::second_type; + using data_t = typename PT::first_type; + ctx.node->scatterElementsUpdate(ctx.dstMemPtr, ctx.indicesMemPtr, ctx.updateMemPtr, ctx.axis, + kernel_t{}); + } +}; + +// tier 1 dispatcher with DataType +template +struct ScatterElementsUpdateDispatcher { + void operator()(ScatterElementsUpdateContext& ctx) { + scatterElementsUpdate_dispatch(ctx); + } + +private: + void scatterElementsUpdate_dispatch(ScatterElementsUpdateContext& ctx) { + using namespace scatter_elements_update; + using DT_NONE = std::pair; + using DT_SUM = std::pair; + using DT_MAX = std::pair; + using DT_MIN = std::pair; + using DT_MUL = std::pair; + using DT_MEAN = std::pair; + OV_SWITCH(intel_cpu, + ScatterElementsUpdateReduceDispatcher, + ctx, + ctx.reduction_type, + OV_CASE(ScatterUpdate::Reduction::NONE, DT_NONE), + OV_CASE(ScatterUpdate::Reduction::SUM, DT_SUM), + OV_CASE(ScatterUpdate::Reduction::MAX, DT_MAX), + OV_CASE(ScatterUpdate::Reduction::MIN, DT_MIN), + OV_CASE(ScatterUpdate::Reduction::PROD, DT_MUL), + OV_CASE(ScatterUpdate::Reduction::MEAN, DT_MEAN)); + } +}; +}; // namespace scatter_elements_update + +// output[indices[i][j][k]][j][k] = updates[i][j][k] if axis = 0, +// output[i][indices[i][j][k]][k] = updates[i][j][k] if axis = 1, +// output[i][j][indices[i][j][k]] = updates[i][j][k] if axis = 2. +template +void ScatterUpdate::scatterElementsUpdate(const MemoryPtr& mem_data, const MemoryPtr& mem_indices, const MemoryPtr& mem_updates, + int axis, const KernelType& kernel) { + using namespace scatter_elements_update; + DataType *dataPtr = mem_data->getDataAs(); + DataType *updatePtr = mem_updates->getDataAs(); + uint8_t *indicesPtr = mem_indices->getDataAs(); + + const auto& data_shape = mem_data->getStaticDims(); + const auto& indices_shape = mem_indices->getStaticDims(); + const size_t updates_rank = indices_shape.size(); + + const int64_t data_dim_size = static_cast(data_shape[axis]); + const auto index_dim_size = indices_shape[axis]; + + if (axis < 0) + axis += updates_rank; + + VectorDims squashed_indices_shape(indices_shape); + squashed_indices_shape[axis] = 1; + + const std::vector dataBlockND = getBlockND(data_shape); + const std::vector indicesBlockND = getBlockND(indices_shape); + const size_t dataBlock_axisplus1 = dataBlockND[axis + 1]; + const size_t indicesBlock_axisplus1 = indicesBlockND[axis + 1]; + + // process serially along 'axis' dimension because of data dependency brought by duplicated value in indices + parallel_nt(0, [&](const int ithr, const int nthr) { + size_t start = 0, end = 0; + splitter(shape_size(squashed_indices_shape), nthr, ithr, start, end); + scatter_elements_update::TensorIterator tensorItr(squashed_indices_shape, axis); + + // When *use_init_val* attribute is false, we need to substitute the copied values at target locations with values that + // will not affect the particular reduction algorithms. 
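+ // For example, SUM and MEAN restart from 0, PROD from 1, MAX from
+ // std::numeric_limits<T>::lowest() and MIN from std::numeric_limits<T>::max()
+ // (see reduction_neutral_value above), so the first scattered update effectively
+ // overwrites whatever was copied from the data input.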
+ if (!use_init_val) { + const auto value = reduction_neutral_value(reduction_type); + auto offsets = tensorItr.startover(start, dataBlockND, indicesBlockND); + for (size_t worker = start; worker < end; worker++) { + auto indices_offset = offsets[1]; + for (size_t idx = 0; idx < index_dim_size; idx++) { + int64_t idxValue = getIndicesValue(indicesPtr, indices_offset); + if (idxValue < 0) idxValue += data_dim_size; + assert(idxValue < data_dim_size && idxValue >= 0); + dataPtr[offsets[0] + idxValue * dataBlock_axisplus1] = value; + indices_offset += indicesBlock_axisplus1; + } + + // increment + tensorItr.increment(offsets, dataBlockND, indicesBlockND); + } + } + + // Apply the Reduce function in an element-wise fashion. For better performance, + // when axis is the last dimension, we iterate along axis in the inner loop; otherwise we iterate axis + // in the outer loop. + auto offsets = tensorItr.startover(start, dataBlockND, indicesBlockND); + if (axis == static_cast(updates_rank - 1)) { + for (size_t worker = start; worker < end; worker++) { + auto indices_offset = offsets[1]; + for (size_t idx = 0; idx < index_dim_size; idx++) { + int64_t idxValue = getIndicesValue(indicesPtr, indices_offset); + if (idxValue < 0) idxValue += data_dim_size; + assert(idxValue < data_dim_size && idxValue >= 0); + auto dst = &dataPtr[offsets[0] + idxValue * dataBlock_axisplus1]; + auto src = &updatePtr[indices_offset]; + kernel(dst, src); + indices_offset += indicesBlock_axisplus1; + } + // increment + tensorItr.increment(offsets, dataBlockND, indicesBlockND); + } + } else { + // For better performance, the offsets of dst and indices are cached in the first iteration of outer loop, and reused + // in the remaining iterations. + std::vector dst_offsets(end-start+1, offsets[0]); // one extra to avoid overflow at the last iteration of inner loop + std::vector indices_offsets(end-start+1, offsets[1]); + size_t *ptr_dst_offset = &dst_offsets[0]; + size_t *ptr_indices_offset = &indices_offsets[0]; + for (size_t worker = start; worker < end; worker++) { // idx = 0 + int64_t idxValue = getIndicesValue(indicesPtr, *ptr_indices_offset); + if (idxValue < 0) idxValue += data_dim_size; + assert(idxValue < data_dim_size && idxValue >= 0); + auto dst = &dataPtr[ptr_dst_offset[0] + idxValue * dataBlock_axisplus1]; + auto src = &updatePtr[ptr_indices_offset[0]]; + kernel(dst, src); + + // increment once for all + tensorItr.increment(offsets, dataBlockND, indicesBlockND); + *++ptr_dst_offset = offsets[0]; + *++ptr_indices_offset = offsets[1]; + } + for (size_t idx = 1; idx < index_dim_size; idx++) { + ptr_indices_offset = &indices_offsets[0]; + ptr_dst_offset = &dst_offsets[0]; + for (size_t worker = start; worker < end; worker++) { + auto indices_offset = *ptr_indices_offset + idx * indicesBlock_axisplus1; + int64_t idxValue = getIndicesValue(indicesPtr, indices_offset); + if (idxValue < 0) idxValue += data_dim_size; + assert(idxValue < data_dim_size && idxValue >= 0); + auto dst = &dataPtr[ptr_dst_offset[0] + idxValue * dataBlock_axisplus1]; + auto src = &updatePtr[indices_offset]; + kernel(dst, src); + ptr_indices_offset++; + ptr_dst_offset++; + } + } + } + }); +} + +// We specialize ReduceMean to avoid spoil performance of other reduce methods, as otherwise condition branch +// were used in loops. 
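+// Unlike the generic overload, which only applies the kernel functor element-wise, the MEAN
+// overload also counts how many updates are accumulated into each destination element and
+// divides the resulting sum by (counter + use_init_val) once the slice along 'axis' is done.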
+template +void ScatterUpdate::scatterElementsUpdate(const MemoryPtr& mem_data, const MemoryPtr& mem_indices, const MemoryPtr& mem_updates, + int axis, const scatter_elements_update::ReduceMean& kernel) { + using namespace scatter_elements_update; + OPENVINO_ASSERT(reduction_type == ScatterUpdate::Reduction::MEAN, "The reduction type should be MEAN here."); + DataType *dataPtr = mem_data->getDataAs(); + DataType *updatePtr = mem_updates->getDataAs(); + uint8_t *indicesPtr = mem_indices->getDataAs(); + + const auto& data_shape = mem_data->getStaticDims(); + const auto& indices_shape = mem_indices->getStaticDims(); + size_t updates_rank = indices_shape.size(); + + const int64_t data_dim_size = static_cast(data_shape[axis]); + const auto index_dim_size = indices_shape[axis]; + + if (axis < 0) + axis += updates_rank; + + VectorDims squashed_indices_shape(indices_shape); + squashed_indices_shape[axis] = 1; + + const std::vector dataBlockND = getBlockND(data_shape); + const std::vector indicesBlockND = getBlockND(indices_shape); + const size_t dataBlock_axisplus1 = dataBlockND[axis + 1]; + const size_t indicesBlock_axisplus1 = indicesBlockND[axis + 1]; + + // process serially along 'axis' dimension because of data dependency brought by duplicated value in indices + parallel_nt(0, [&](const int ithr, const int nthr) { + size_t start = 0, end = 0; + splitter(shape_size(squashed_indices_shape), nthr, ithr, start, end); + scatter_elements_update::TensorIterator tensorItr(squashed_indices_shape, axis); + + // When *use_init_val* attribute is false, we need to substitute the copied values at target locations with values that + // will not affect the particular reduction algorithms. + if (!use_init_val) { + const auto value = reduction_neutral_value(reduction_type); + auto offsets = tensorItr.startover(start, dataBlockND, indicesBlockND); + for (size_t worker = start; worker < end; worker++) { + auto indices_offset = offsets[1]; + for (size_t idx = 0; idx < index_dim_size; idx++) { + int64_t idxValue = getIndicesValue(indicesPtr, indices_offset); + if (idxValue < 0) idxValue += data_dim_size; + assert(idxValue < data_dim_size && idxValue >= 0); + dataPtr[offsets[0] + idxValue * dataBlock_axisplus1] = value; + indices_offset += indicesBlock_axisplus1; + } + + // increment + tensorItr.increment(offsets, dataBlockND, indicesBlockND); + } + } + + // Apply the Reduce function in an element-wise fashion. For better performance, + // when axis is the last dimension, we iterate along axis in the inner loop; otherwise we iterate axis + // in the outer loop. 
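+ // For the averaging step the counters are kept per output slice and keyed by the index value
+ // when axis is the innermost dimension; otherwise a single map keyed by the destination
+ // address covers the whole [start, end) range of this thread and the division is performed
+ // after all index positions have been visited.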
+ auto offsets = tensorItr.startover(start, dataBlockND, indicesBlockND); + if (axis == static_cast(updates_rank - 1)) { + for (size_t worker = start; worker < end; worker++) { + std::unordered_map mean_reduction_counters; // (idxValue, num_sums) for current worker + + auto indices_offset = offsets[1]; + for (size_t idx = 0; idx < index_dim_size; idx++) { + int64_t idxValue = getIndicesValue(indicesPtr, indices_offset); + if (idxValue < 0) idxValue += data_dim_size; + assert(idxValue < data_dim_size && idxValue >= 0); + auto dst = &dataPtr[offsets[0] + idxValue * dataBlock_axisplus1]; + auto src = &updatePtr[indices_offset]; + kernel(dst, src); + indices_offset += indicesBlock_axisplus1; + + mean_reduction_counters[idxValue] += 1; + } + + // average + for (const auto& counter : mean_reduction_counters) { + auto dst = &dataPtr[offsets[0] + counter.first * dataBlock_axisplus1]; + const auto N = counter.second + static_cast(use_init_val); + *dst = static_cast(static_cast(*dst) / N); + } + + // increment + tensorItr.increment(offsets, dataBlockND, indicesBlockND); + } + } else { + // For better performance, the offsets of dst and indices are cached in the first iteration of outer loop, and reused + // in the remaining iterations. + std::unordered_map mean_reduction_counters; // (dst_addr, num_sums) for all workers + + std::vector dst_offsets(end-start+1, offsets[0]); // one extra to avoid overflow at the last iteration of inner loop + std::vector indices_offsets(end-start+1, offsets[1]); + size_t *ptr_dst_offset = &dst_offsets[0]; + size_t *ptr_indices_offset = &indices_offsets[0]; + for (size_t worker = start; worker < end; worker++) { // idx = 0 + int64_t idxValue = getIndicesValue(indicesPtr, *ptr_indices_offset); + if (idxValue < 0) idxValue += data_dim_size; + assert(idxValue < data_dim_size && idxValue >= 0); + auto dst = &dataPtr[ptr_dst_offset[0] + idxValue * dataBlock_axisplus1]; + auto src = &updatePtr[ptr_indices_offset[0]]; + kernel(dst, src); + + mean_reduction_counters[dst] += 1; + + // increment once for all + tensorItr.increment(offsets, dataBlockND, indicesBlockND); + *++ptr_dst_offset = offsets[0]; + *++ptr_indices_offset = offsets[1]; + } + for (size_t idx = 1; idx < index_dim_size; idx++) { + ptr_indices_offset = &indices_offsets[0]; + ptr_dst_offset = &dst_offsets[0]; + for (size_t worker = start; worker < end; worker++) { + auto indices_offset = *ptr_indices_offset + idx * indicesBlock_axisplus1; + int64_t idxValue = getIndicesValue(indicesPtr, indices_offset); + if (idxValue < 0) idxValue += data_dim_size; + assert(idxValue < data_dim_size && idxValue >= 0); + auto dst = &dataPtr[ptr_dst_offset[0] + idxValue * dataBlock_axisplus1]; + auto src = &updatePtr[indices_offset]; + kernel(dst, src); + mean_reduction_counters[dst] += 1; + ptr_indices_offset++; + ptr_dst_offset++; + } + } + + // average + for (const auto& counter : mean_reduction_counters) { + auto dst = counter.first; + const auto N = counter.second + static_cast(use_init_val); + *dst = static_cast(static_cast(*dst) / N); + } + } + }); +} + +void ScatterUpdate::scatterElementsUpdate(const MemoryPtr& dstMemPtr, const MemoryPtr& indicesMemPtr, const MemoryPtr& updateMemPtr, int axis) { + using namespace scatter_elements_update; + ScatterElementsUpdateContext ctx{this, dstMemPtr, indicesMemPtr, updateMemPtr, axis, reduction_type}; + OV_SWITCH(intel_cpu, + ScatterElementsUpdateDispatcher, + ctx, + dataPrec, + OV_CASE(ov::element::f32, float), + OV_CASE(ov::element::i32, int32_t), + OV_CASE(ov::element::bf16, 
ov::bfloat16), + OV_CASE(ov::element::f16, ov::float16), + OV_CASE(ov::element::i8, int8_t), + OV_CASE(ov::element::u8, uint8_t)); +} + void ScatterUpdate::execute(dnnl::stream strm) { auto srcMemPtr = getSrcMemoryAtPort(DATA_ID); auto dstMemPtr = getDstMemoryAtPort(0); @@ -391,12 +781,11 @@ void ScatterUpdate::execute(dnnl::stream strm) { break; } case ScatterUpdateMode::ScatterElementsUpdate: { - scatterElementsUpdate(indicesPtr, updatePtr, axis, dstPtr); + scatterElementsUpdate(dstMemPtr, indicesMemPtr, updateMemPtr, axis); break; } default: { - OPENVINO_THROW(errorPrefix - , " is not supported"); + OPENVINO_THROW(errorPrefix, " is not supported"); } } } @@ -468,58 +857,6 @@ void ScatterUpdate::scatterNDUpdate(uint8_t *indices, uint8_t *update, uint8_t * }); } -// output[indices[i][j][k]][j][k] = updates[i][j][k] if axis = 0, -// output[i][indices[i][j][k]][k] = updates[i][j][k] if axis = 1, -// output[i][j][indices[i][j][k]] = updates[i][j][k] if axis = 2. -void ScatterUpdate::scatterElementsUpdate(uint8_t *indices, uint8_t *update, int axis, uint8_t *dstData) { - const auto& srcDataDim = getParentEdgeAt(DATA_ID)->getMemory().getStaticDims(); - const auto& updateDim = getParentEdgeAt(UPDATE_ID)->getMemory().getStaticDims(); - size_t updateRank = updateDim.size(); - - std::vector srcBlockND = getBlockND(srcDataDim); - std::vector updateBlockND = getBlockND(updateDim); - - parallel_nt(0, [&](const int ithr, const int nthr) { - int j; - size_t i, dst_idx = 0, start = 0, end = 0; - VectorDims tensorItr(updateRank, 0); - splitter(updateBlockND[0], nthr, ithr, start, end); - for (j = updateRank - 1, i = start; j >= 0; j--) { - tensorItr[j] = i % updateDim[j]; - i /= updateDim[j]; - } - - for (i = 0; i < static_cast(axis); ++i) - dst_idx += tensorItr[i] * srcBlockND[i + 1]; - for (i++; i < updateRank; ++i) - dst_idx += tensorItr[i] * srcBlockND[i + 1]; - - for (size_t iwork = start; iwork < end; iwork++) { - int64_t idxValue = getIndicesValue(indices, iwork); - int64_t axisDim = static_cast(srcDataDim[axis]); - if (idxValue < 0) - idxValue += axisDim; - if (0 <= idxValue && idxValue < axisDim) - cpu_memcpy(dstData + dataSize * (dst_idx + idxValue * srcBlockND[axis + 1]), - update + iwork * dataSize, dataSize); - - for (j = updateRank - 1; j >= 0; j--) { - tensorItr[j]++; - if (tensorItr[j] < updateDim[j]) { - if (j != axis) - dst_idx += srcBlockND[j + 1]; - break; - } else { - tensorItr[j] = 0; - for (dst_idx = 0, i = 0; i < static_cast(axis); ++i) - dst_idx += tensorItr[i] * srcBlockND[i + 1]; - for (i++; i < updateRank; ++i) - dst_idx += tensorItr[i] * srcBlockND[i + 1]; - } - } - } - }); -} bool ScatterUpdate::created() const { return getType() == Type::ScatterUpdate diff --git a/src/plugins/intel_cpu/src/nodes/scatter_update.h b/src/plugins/intel_cpu/src/nodes/scatter_update.h index cd6a5bb4cce82a..6b0805dd459c97 100644 --- a/src/plugins/intel_cpu/src/nodes/scatter_update.h +++ b/src/plugins/intel_cpu/src/nodes/scatter_update.h @@ -5,6 +5,8 @@ #pragma once #include "node.h" +#include "openvino/op/scatter_elements_update.hpp" +#include namespace ov { namespace intel_cpu { @@ -16,6 +18,56 @@ enum class ScatterUpdateMode { ScatterElementsUpdate }; +namespace scatter_elements_update { +class ReduceMultiply { +public: + template + void operator() (DT* dst_data, const DT* src_data) const { + *dst_data *= *src_data; + } +}; + +class ReduceAdd { +public: + template + void operator() (DT* dst_data, const DT* src_data) const { + *dst_data += *src_data; + } +}; + +class ReduceMean { +public: + 
template + void operator() (DT* dst_data, const DT* src_data) const { + *dst_data += *src_data; + } +}; + +class ReduceMaximum { +public: + template + void operator() (DT* dst_data, const DT* src_data) const { + *dst_data = std::max(*dst_data, *src_data); + } +}; + +class ReduceMinimum { +public: + template + void operator() (DT* dst_data, const DT* src_data) const { + *dst_data = std::min(*dst_data, *src_data); + } +}; + +class ReduceNone { +public: + template + void operator() (DT* dst_data, const DT* src_data) const { + *dst_data = *src_data; + } +}; +}; // namespace scatter_elements_update + class ScatterUpdate : public Node { public: ScatterUpdate(const std::shared_ptr& op, const GraphContext::CPtr context); @@ -34,15 +86,25 @@ class ScatterUpdate : public Node { bool isExecutable() const override; static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + using Reduction = ov::op::v12::ScatterElementsUpdate::Reduction; + template + void scatterElementsUpdate(const MemoryPtr& mem_data, const MemoryPtr& mem_indices, const MemoryPtr& mem_updates, int axis, const KernelType& kernel); + template + void scatterElementsUpdate(const MemoryPtr& mem_data, const MemoryPtr& mem_indices, const MemoryPtr& mem_updates, + int axis, const scatter_elements_update::ReduceMean& kernel); + private: void scatterUpdate(uint8_t *indicesPtr, uint8_t *updatePtr, int axis, uint8_t *dstDataPtr); void scatterNDUpdate(uint8_t *indicesPtr, uint8_t *updatePtr, uint8_t *dstDataPtr); - void scatterElementsUpdate(uint8_t *indicesPtr, uint8_t *updatePtr, int axis, uint8_t *dstDataPtr); + void scatterElementsUpdate(const MemoryPtr& dstMemPtr, const MemoryPtr& indicesMemPtr, const MemoryPtr& updateMemPtr, int axis); inline int64_t getIndicesValue(uint8_t *indices, size_t offset); ScatterUpdateMode scatterUpdateMode = ScatterUpdateMode::ScatterUpdate; enum { DATA_ID, INDICES_ID, UPDATE_ID, AXIS_ID }; + Reduction reduction_type; + bool use_init_val = true; + // if axis can be set other than default 0. bool axisRelaxed = false; size_t dataSize, indicesSize, axisSize; diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/index_add_scatter_elements_update.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/index_add_scatter_elements_update.cpp new file mode 100644 index 00000000000000..4aa422fa263b18 --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/index_add_scatter_elements_update.cpp @@ -0,0 +1,306 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "openvino/opsets/opset13.hpp" +#include "openvino/pass/manager.hpp" + +#include "shared_test_classes/base/ov_subgraph.hpp" +#include "utils/cpu_test_utils.hpp" +#include "common_test_utils/include/common_test_utils/ov_tensor_utils.hpp" + +using namespace ov::test; +using namespace CPUTestUtils; +using namespace ov::op; + +namespace ov { +namespace test { +/* + This test runs a graph that is equivelent to torch.Tensor.index_add_. 
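+ (torch.Tensor.index_add_(dim, index, source, alpha=1) accumulates alpha * source[i]
+ into self at the positions given by index[i] along dimension dim.)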
+ TorchFE maps it to a compilicated subgraph which could be briefed similar to this - + * Indices(1D) + * | + * | + * X Broadcast Updates + * \ | / + * \ | / + * ScatterElementsUpdate + * | + * Result +*/ +using InputsAndAxis = std::tuple< + std::vector, // Input, shape of data and updates + int // Axis +>; +using IndexAddTestParams = std::tuple; + +class IndexAddTest : public testing::WithParamInterface, + virtual public ov::test::SubgraphBaseTest, + public CPUTestsBase { +public: + static std::string getTestCaseName(testing::TestParamInfo obj) { + auto shapes_ss = [](const InputShape& shape) { + std::stringstream ss; + ss << "_IS=(" << ov::test::utils::partialShape2str({shape.first}) << ")_TS="; + for (size_t j = 0lu; j < shape.second.size(); j++) + ss << "{" << ov::test::utils::vec2str(shape.second[j]) << "}"; + return ss; + }; + + InputsAndAxis shapes_desc; + std::vector input_shapes; + int axis; + v12::ScatterElementsUpdate::Reduction reduceMode; + ov::element::Type data_type, indices_type; + float alpha; + bool dynamic; + + std::tie(shapes_desc, reduceMode, data_type, indices_type, alpha, dynamic) = obj.param; + std::tie(input_shapes, axis) = shapes_desc; + std::ostringstream result; + result << "InputShape=" << shapes_ss(input_shapes.at(0)).str() << "_"; + result << "UpdateShape=" << ov::test::utils::vec2str(input_shapes.at(1).second) << "_"; + result << "Axis=" << axis << "_"; + result << "ReduceMode=" << as_string(reduceMode) << "_"; + result << "modelType=" << data_type.to_string() << "_"; + result << "idxType=" << indices_type.to_string() << "_"; + result << "alpha=" << alpha; + result << "dynamic=" << dynamic; + return result.str(); + } + +protected: + void SetUp() override { + targetDevice = ov::test::utils::DEVICE_CPU; + SKIP_IF_CURRENT_TEST_IS_DISABLED(); + + constexpr size_t DATA_INPUT_IDX = 0; + constexpr size_t UPDATES_INPUT_IDX = 1; + + InputsAndAxis shapes_desc; + std::vector input_shapes; + int axis; + + v12::ScatterElementsUpdate::Reduction reduceMode; + float alpha_value; + bool dynamic; + + ov::element::Type data_type, indices_type; + std::string target_device; + std::tie(shapes_desc, reduceMode, data_type, indices_type, alpha_value, dynamic) = this->GetParam(); + std::tie(input_shapes, axis) = shapes_desc; + + if (ov::element::bf16 == data_type || ov::element::f16 == data_type) { + configuration.insert({ov::hint::inference_precision.name(), data_type}); + inType = outType = data_type; + abs_threshold = 0.01f; + rel_threshold = 0.01f; + } + + init_input_shapes(input_shapes); + + // + normalized_axis = axis < 0 ? 
axis + inputDynamicShapes.at(DATA_INPUT_IDX).rank().get_length(): axis; + + if (dynamic) { + // infer dynamic shape from axis + inputDynamicShapes.at(DATA_INPUT_IDX)[normalized_axis] = -1; + inputDynamicShapes.at(UPDATES_INPUT_IDX)[normalized_axis] = -1; + } + + auto param = std::make_shared(data_type, inputDynamicShapes.at(DATA_INPUT_IDX)); + param->set_friendly_name("data"); + auto update_param = std::make_shared(data_type, inputDynamicShapes.at(UPDATES_INPUT_IDX)); + update_param->set_friendly_name("update"); + auto indices_param = std::make_shared(indices_type, ov::PartialShape{-1}); // 1D + indices_param->set_friendly_name("indices"); + + auto axis_const = + std::make_shared(ov::element::i32, ov::Shape{}, std::vector{axis}); + axis_const->set_friendly_name("axis"); + auto alpha_const = + std::make_shared(ov::element::f32, ov::Shape{}, std::vector{alpha_value}); + alpha_const->set_friendly_name("alpha"); + + auto input = param; + auto dim = axis_const; + auto index = std::make_shared(indices_param, element::i32); + auto src = update_param; + auto alpha = alpha_const; + auto converted_alpha = std::make_shared(alpha, src); + auto alpha_src = std::make_shared(converted_alpha, src); + auto input_shape_rank = get_shape_rank(input); + auto const_one = v0::Constant::create(element::i32, Shape{1}, {1}); + auto const_one_0d = v0::Constant::create(element::i32, Shape{}, {1}); + auto inp_rank = std::get<1>(input_shape_rank); + // ScatterElementsUpdate required that index, source and update have the same rank + // in aten::index_add index represents as 1d-array for specific dim and update may have different size + // from source in non-indexing axes + // slice src for having only relevant data + auto src_broadcast_shape = std::make_shared(const_one, inp_rank); + auto src_broadcasted = + std::make_shared(alpha_src, src_broadcast_shape, BroadcastType::BIDIRECTIONAL); + auto src_shape_rank = get_shape_rank(src_broadcasted); + auto const_zero = v0::Constant::create(element::i32, Shape{1}, {0}); + auto src_rank = std::get<1>(src_shape_rank); + auto slice_start = std::make_shared(const_zero, inp_rank); + auto axes = get_node_axes_range(src_broadcasted); + auto const_inf = + v0::Constant::create(element::i32, Shape{1}, {std::numeric_limits::max()}); + auto slice_end = std::make_shared(const_inf, src_rank); + auto slice_step = std::make_shared(const_one, src_rank); + auto dim_1d = std::make_shared(dim, const_one); + auto slice_end2 = + std::make_shared(slice_end, + dim_1d, + const_one, + const_zero, + v12::ScatterElementsUpdate::Reduction::NONE); + auto new_shape_ = std::make_shared(input, slice_start, slice_end2, slice_step, axes); + auto new_shape = std::make_shared(new_shape_, element::i32); + auto src_ = + std::make_shared(src_broadcasted, new_shape, BroadcastType::BIDIRECTIONAL); + auto src_input_dtype = std::make_shared(src_, input); + // brodcast index to input rank size + src_rank = std::make_shared(new_shape, element::i32); + auto new_index_shape_ = std::make_shared(const_one, src_rank); + auto const_minus_one = v0::Constant::create(element::i32, Shape{1}, {-1}); + auto new_index_shape = + std::make_shared(new_index_shape_, dim_1d, const_minus_one, const_zero); + // precerve indicies location for spicifc dim + auto reshaped_index = std::make_shared(index, new_index_shape, false); + auto broadcasted_index = + std::make_shared(reshaped_index, new_shape, BroadcastType::BIDIRECTIONAL); + auto scatter_result = + std::make_shared(input, + broadcasted_index, + src_, + dim, + reduceMode); + 
ov::ResultVector results{std::make_shared(scatter_result)}; + function = std::make_shared(results, ov::ParameterVector{param, indices_param, update_param}, "index_add"); + } + + void generate_inputs(const std::vector& targetInputStaticShapes) override { + inputs.clear(); + const auto& funcInputs = function->inputs(); + auto dataShape = targetInputStaticShapes[0]; + auto updateShape = targetInputStaticShapes[1]; + // The dim-th dimension of update must have the same size as the length of index (which must be a vector) + auto indicesShape = ov::Shape{updateShape[normalized_axis]}; // 1D + + for (size_t i = 0; i < funcInputs.size(); ++i) { + const auto& funcInput = funcInputs[i]; + ov::Tensor tensor; + ov::test::utils::InputGenerateData in_data; + + if (i == 0) { // "data" + in_data.start_from = 1; + in_data.range = 1; + in_data.resolution = 1; + tensor = ov::test::utils::create_and_fill_tensor(funcInput.get_element_type(), dataShape, in_data); + } else if (i == 1) { // "indices" + // All index values are expected to be within bounds [-d, d - 1] along dimension d pointed by axis. + auto d = dataShape[normalized_axis]; + in_data.start_from = -1.0 * static_cast(d); + in_data.range = d-1; + in_data.resolution = 1; + tensor = shape_size(indicesShape) == 0 ? ov::Tensor(funcInput.get_element_type(), indicesShape) : + ov::test::utils::create_and_fill_tensor(funcInput.get_element_type(), indicesShape, in_data); + } else if (i == 2) { // "updates" + in_data.start_from = -50; + in_data.range = 50; + in_data.resolution = 1; + tensor = shape_size(updateShape) == 0 ? ov::Tensor(funcInput.get_element_type(), updateShape) : + ov::test::utils::create_and_fill_tensor(funcInput.get_element_type(), updateShape, in_data); + } else { + OPENVINO_THROW("Unknown input"); + } + inputs.insert({funcInput.get_node_shared_ptr(), tensor}); + } + } + +private: + std::tuple, Output> get_shape_rank(const Output& x, + bool as_scalar = false, + element::Type output_type = element::i32) { + auto shape = std::make_shared(x, output_type); + Output rank = std::make_shared(shape, output_type); + if (as_scalar) { + auto axis_0 = opset10::Constant::create(output_type, Shape{}, {0}); + rank = std::make_shared(rank, axis_0); + } + return std::make_tuple(shape, rank); + } + + std::shared_ptr get_node_axes_range(const Output& x) { + auto start = std::make_shared(element::i32, Shape{}, 0); + auto step = std::make_shared(element::i32, Shape{}, 1); + Output reduced_rank; + std::tie(std::ignore, reduced_rank) = get_shape_rank(x, true); + return std::make_shared(start, reduced_rank, step, element::i32); + } + + size_t normalized_axis; // normalized_axis +}; + +TEST_P(IndexAddTest, CompareWithRefs) { + run(); +} + +namespace { +// map> +std::map, std::map, std::vector>> axesShapeInShape { + {{3}, {{{2}, {0, -1}}, {{3}, {0, -1}}/*, {{0}, {0, -1}}*/}}, // TODO: empty tensor failing in template plugin + {{4, 6}, {{{3, 6}, {0, -2}}, {{4, 6}, {0, 1, -1}}/*, {{0, 2}, {0, -2}}*/}}, // axis 0 + {{2, 4}, {{{2, 3}, {1, -1}}, {{2, 4}, {0, 1, -1}}/*, {{4, 0}, {1, -1}}*/}}, // axis 1 + {{1, 120}, {{{1, 120}, {0}}}}, + {{32, 120}, {{{16, 120}, {0}}, {{32, 120}, {0}}}}, + {{120, 32}, {{{120, 16}, {1}}, {{120, 32}, {1}}}}, +}; + +inline std::vector partial_shapes_to_test_representation( + const std::vector& shapes) { + std::vector result; + for (const auto& staticShape : shapes) { + result.push_back({{staticShape}, {staticShape.get_shape()}}); + } + return result; +} + +std::vector combine_shapes( + const std::map, std::map, std::vector>>& input_shapes) { + 
std::vector res_vec; + for (auto& input_shape : input_shapes) { + for (auto& item : input_shape.second) { + for (auto& elt : item.second) { + res_vec.push_back(ov::test::InputsAndAxis{ + partial_shapes_to_test_representation({ov::PartialShape(input_shape.first), ov::PartialShape(item.first)}), + elt}); + } + } + } + return res_vec; +} + +INSTANTIATE_TEST_SUITE_P(smoke_IndexAddTest, + IndexAddTest, + ::testing::Combine(::testing::ValuesIn(combine_shapes(axesShapeInShape)), + ::testing::Values(v12::ScatterElementsUpdate::Reduction::SUM, v12::ScatterElementsUpdate::Reduction::NONE), + ::testing::Values(ElementType::f32, ElementType::i32, + // ElementType::u8, ElementType::i8, // cannot validate until CVS-136858 addressed + ElementType::f16, ElementType::bf16), // data precision + ::testing::Values(ElementType::i32, ElementType::i64), // indices precision + ::testing::Values(1.0), // alpha + ::testing::Values(true, false)), // dynamic shape test + IndexAddTest::getTestCaseName); +} // namespace + +} // namespace test +} // namespace ov diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/scatter_elements_update.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/scatter_elements_update.cpp index 731f7a02180639..8b2386714a9545 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/scatter_elements_update.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/scatter_elements_update.cpp @@ -8,6 +8,7 @@ #include "common_test_utils/test_constants.hpp" using ov::test::ScatterElementsUpdateLayerTest; +using ov::test::ScatterElementsUpdate12LayerTest; namespace { // map> @@ -25,6 +26,8 @@ const std::vector model_types = { ov::element::f32, ov::element::f16, ov::element::i32, + // ov::element::i8, // cannot validate until CVS-136858 addressed + // ov::element::u8, // cannot validate until CVS-136858 addressed }; const std::vector idx_types = { @@ -58,4 +61,34 @@ const auto scatter_elt_update_cases = ::testing::Combine( INSTANTIATE_TEST_SUITE_P(smoke_ScatterEltsUpdate, ScatterElementsUpdateLayerTest, scatter_elt_update_cases, ScatterElementsUpdateLayerTest::getTestCaseName); +const std::vector reduceModes{ + ov::op::v12::ScatterElementsUpdate::Reduction::NONE, + ov::op::v12::ScatterElementsUpdate::Reduction::SUM, + ov::op::v12::ScatterElementsUpdate::Reduction::PROD, + ov::op::v12::ScatterElementsUpdate::Reduction::MIN, + ov::op::v12::ScatterElementsUpdate::Reduction::MAX, + ov::op::v12::ScatterElementsUpdate::Reduction::MEAN +}; + +const std::vector> idxWithNegativeValues = { + {1, 0, 0, 1}, + {-1, -2, -2, -1}, +}; + +// map> +std::map, std::map, std::vector>> axesShapeInShape2D { + {{2, 4}, {{{1, 4}, {0, 1}}, {{2, 2}, {-1, -2}}}}, +}; + +INSTANTIATE_TEST_SUITE_P( + smoke_ScatterEltsUpdate12, + ScatterElementsUpdate12LayerTest, + ::testing::Combine(::testing::ValuesIn(combine_shapes(axesShapeInShape2D)), + ::testing::ValuesIn(idxWithNegativeValues), + ::testing::ValuesIn(reduceModes), + ::testing::ValuesIn({true, false}), + ::testing::ValuesIn(model_types), + ::testing::ValuesIn(idx_types), + ::testing::Values(ov::test::utils::DEVICE_CPU)), + ScatterElementsUpdate12LayerTest::getTestCaseName); } // namespace
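For reference, a minimal single-axis sketch of the ScatterElementsUpdate-12 semantics that the kernels above implement, restricted to SUM reduction on a 1-D tensor and using plain `std::vector` containers instead of the plugin's Memory objects (the helper name is illustrative):

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// With use_init_val == false, every destination element touched by indices is first
// reset to the reduction's neutral value (0 for SUM); afterwards all updates are
// accumulated, mirroring the two passes of the kernel above.
void scatter_elements_sum_1d(std::vector<float>& data,
                             const std::vector<int64_t>& indices,
                             const std::vector<float>& updates,
                             bool use_init_val) {
    assert(indices.size() == updates.size());
    const int64_t dim = static_cast<int64_t>(data.size());
    if (!use_init_val) {
        for (int64_t idx : indices) {
            if (idx < 0) idx += dim;   // negative indices wrap around, as in the kernel
            data[idx] = 0.0f;          // neutral value of SUM
        }
    }
    for (size_t i = 0; i < indices.size(); ++i) {
        int64_t idx = indices[i];
        if (idx < 0) idx += dim;
        data[idx] += updates[i];       // ReduceAdd
    }
}
```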