From 0307631a22e85fd0f1fa57809f3b82761efdf18d Mon Sep 17 00:00:00 2001 From: cecilia peng Date: Wed, 27 Mar 2024 15:55:04 +0800 Subject: [PATCH] [CPU] plugin implementation of ScatterElementsUpdate-12 (#23287) ### Details: - *[CPU] implement ScatterElementUpdate-12* - *...* ### Tickets: - *129813* --------- Co-authored-by: Li, Tingqian Co-authored-by: Maksim Kutakov --- .../intel_cpu/src/nodes/scatter_update.cpp | 501 +++++++++++++++--- .../intel_cpu/src/nodes/scatter_update.h | 64 ++- .../src/index_add_scatter_elements_update.cpp | 306 +++++++++++ .../scatter_elements_update.cpp | 33 ++ 4 files changed, 821 insertions(+), 83 deletions(-) create mode 100644 src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/index_add_scatter_elements_update.cpp diff --git a/src/plugins/intel_cpu/src/nodes/scatter_update.cpp b/src/plugins/intel_cpu/src/nodes/scatter_update.cpp index 09951387b68510..875a5b8bac53a3 100644 --- a/src/plugins/intel_cpu/src/nodes/scatter_update.cpp +++ b/src/plugins/intel_cpu/src/nodes/scatter_update.cpp @@ -10,6 +10,8 @@ #include "openvino/core/parallel.hpp" #include "openvino/opsets/opset3.hpp" #include "openvino/opsets/opset4.hpp" +#include "openvino/opsets/opset12.hpp" +#include "selective_build.h" #include #include @@ -23,12 +25,13 @@ namespace node { bool ScatterUpdate::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { - auto scatterElemUpd = ov::as_type_ptr(op); + auto scatterElemUpd3 = ov::as_type_ptr(op); + auto scatterElemUpd12 = ov::as_type_ptr(op); auto scatterUpd = ov::as_type_ptr(op); auto scatterNdUpd = ov::as_type_ptr(op); - if (!scatterElemUpd && !scatterUpd && !scatterNdUpd) { + if (!scatterElemUpd3 && !scatterElemUpd12 && !scatterUpd && !scatterNdUpd) { const std::string opType = op->get_type_name(); - errorMessage = "Only opset" + opType == "ScatterNDUpdate" ? "4 " : "3 " + opType + " operation is supported"; + errorMessage = std::string("Type ") + opType + " is not supported."; return false; } } catch (...) { @@ -53,6 +56,14 @@ ScatterUpdate::ScatterUpdate(const std::shared_ptr& op, const GraphCon } else { OPENVINO_THROW_NOT_IMPLEMENTED(errorMessage); } + + const auto node = ov::as_type_ptr(op); + if (node) { + reduction_type = node->get_reduction(); + use_init_val = node->get_use_init_val(); + } else { + reduction_type = ScatterUpdate::Reduction::NONE; + } } void ScatterUpdate::getSupportedDescriptors() { @@ -196,36 +207,23 @@ void ScatterUpdate::initSupportedPrimitiveDescriptors() { } dataPrec = getOriginalInputPrecisionAtPort(DATA_ID); + if (scatterUpdateMode == ScatterUpdateMode::ScatterElementsUpdate && + !one_of(dataPrec, ov::element::f32, ov::element::i32, + ov::element::bf16, ov::element::f16, + ov::element::u8, ov::element::i8)) { + dataPrec = ov::element::f32; + } dataSize = dataPrec.size(); - bool canBeInplace = !isDynamicNode() && getParentEdgeAt(DATA_ID)->getParent()->getChildEdges().size() == 1 && - !getParentEdgeAt(DATA_ID)->getParent()->isConstant(); + bool canBeInplace = !getParentEdgeAt(DATA_ID)->getParent()->isConstant(); - NodeConfig config; - if (axisRelaxed) { - config.inConfs.resize(4); - } else { - config.inConfs.resize(3); - } - config.outConfs.resize(1); - config.inConfs[DATA_ID].constant(false); - config.inConfs[INDICES_ID].constant(false); - config.inConfs[UPDATE_ID].constant(false); - config.outConfs[0].constant(false); - config.inConfs[DATA_ID].inPlace(canBeInplace ? 
0 : -1); - config.inConfs[INDICES_ID].inPlace(-1); - config.inConfs[UPDATE_ID].inPlace(-1); - config.outConfs[0].inPlace(canBeInplace ? 0 : -1); - if (axisRelaxed) { - config.inConfs[AXIS_ID].constant(false); - config.inConfs[AXIS_ID].inPlace(-1); - } - - std::vector inPortConfig{{LayoutType::ncsp, dataPrec}, {LayoutType::ncsp, indicesPrec}, {LayoutType::ncsp, dataPrec}}; + std::vector inPortConfig{{LayoutType::ncsp, dataPrec, false, canBeInplace ? 0 : -1}, + {LayoutType::ncsp, indicesPrec}, + {LayoutType::ncsp, dataPrec}}; if (axisRelaxed) inPortConfig.emplace_back(LayoutType::ncsp, axisPrec); addSupportedPrimDesc(inPortConfig, - {{LayoutType::ncsp, dataPrec}}, + {{LayoutType::ncsp, dataPrec, false, canBeInplace ? 0 : -1}}, impl_desc_type::unknown); } @@ -263,6 +261,398 @@ static std::vector getBlockND(const VectorDims& shape) { return blockND; } +namespace scatter_elements_update { +template +static T reduction_neutral_value(const ScatterUpdate::Reduction reduction_type) { + switch (reduction_type) { + case ScatterUpdate::Reduction::MAX: + return std::numeric_limits::lowest(); + case ScatterUpdate::Reduction::MIN: + return std::numeric_limits::max(); + case ScatterUpdate::Reduction::PROD: + return T{1}; + case ScatterUpdate::Reduction::SUM: + case ScatterUpdate::Reduction::MEAN: + case ScatterUpdate::Reduction::NONE: + return T{0}; + default: + OPENVINO_THROW("Neutral value not available for this type of reduction"); + return 0; + } +} + +static inline void getCoordinate(VectorDims& coordinate, size_t offset, const VectorDims& shape) { + size_t shapeRank = shape.size(); + for (int i = shapeRank - 1; i >= 0; i--) { + coordinate[i] = offset % shape[i]; + offset /= shape[i]; + } +} + +struct TensorIterator { + TensorIterator(const VectorDims& squashed_shape, const int64_t squashed_axis) : m_squashed_shape(squashed_shape), m_squashed_axis(squashed_axis) { + OPENVINO_ASSERT(m_squashed_shape[m_squashed_axis] == 1); + } + + std::array startover(const size_t start, const std::vector& dataBlockND, const std::vector& indicesBlockND) { + m_tensorIter.resize(m_squashed_shape.size(), 0); + getCoordinate(m_tensorIter, start, m_squashed_shape); + + size_t i, dst_idx = 0, indices_idx = 0; + for (i = 0; i < static_cast(m_squashed_axis); ++i) { + dst_idx += m_tensorIter[i] * dataBlockND[i + 1]; + indices_idx += m_tensorIter[i] * indicesBlockND[i + 1]; + } + for (i++; i < m_squashed_shape.size(); ++i) { + dst_idx += m_tensorIter[i] * dataBlockND[i + 1]; + indices_idx += m_tensorIter[i] * indicesBlockND[i + 1]; + } + + return {dst_idx, indices_idx}; + } + + void increment(std::array& offsets, const std::vector& dataBlockND, const std::vector& indicesBlockND) { + for (int64_t j = m_squashed_shape.size() - 1; j >= 0; j--) { + m_tensorIter[j]++; + if (m_tensorIter[j] < m_squashed_shape[j]) { // no need check if (j != axis) as it is squashed + offsets[0] += dataBlockND[j + 1]; + offsets[1] += indicesBlockND[j + 1]; + break; + } else { + m_tensorIter[j] = 0; + size_t i = 0; + for (offsets[0] = 0, offsets[1] = 0; i < m_squashed_axis; ++i) { + offsets[0] += m_tensorIter[i] * dataBlockND[i + 1]; + offsets[1] += m_tensorIter[i] * indicesBlockND[i + 1]; + } + for (i++; i < m_squashed_shape.size(); ++i) { + offsets[0] += m_tensorIter[i] * dataBlockND[i + 1]; + offsets[1] += m_tensorIter[i] * indicesBlockND[i + 1]; + } + } + } + } + + VectorDims m_tensorIter; + const VectorDims m_squashed_shape; + const size_t m_squashed_axis; +}; + +struct ScatterElementsUpdateContext { + ScatterUpdate* node; + MemoryPtr dstMemPtr; + 
MemoryPtr indicesMemPtr; + MemoryPtr updateMemPtr; + int axis; + ScatterUpdate::Reduction reduction_type; +}; + +// tier 2 dispatcher with Reduce which follows up DataType. +template +struct ScatterElementsUpdateReduceDispatcher { + void operator()(ScatterElementsUpdateContext& ctx) { + using kernel_t = typename PT::second_type; + using data_t = typename PT::first_type; + ctx.node->scatterElementsUpdate(ctx.dstMemPtr, ctx.indicesMemPtr, ctx.updateMemPtr, ctx.axis, + kernel_t{}); + } +}; + +// tier 1 dispatcher with DataType +template +struct ScatterElementsUpdateDispatcher { + void operator()(ScatterElementsUpdateContext& ctx) { + scatterElementsUpdate_dispatch(ctx); + } + +private: + void scatterElementsUpdate_dispatch(ScatterElementsUpdateContext& ctx) { + using namespace scatter_elements_update; + using DT_NONE = std::pair; + using DT_SUM = std::pair; + using DT_MAX = std::pair; + using DT_MIN = std::pair; + using DT_MUL = std::pair; + using DT_MEAN = std::pair; + OV_SWITCH(intel_cpu, + ScatterElementsUpdateReduceDispatcher, + ctx, + ctx.reduction_type, + OV_CASE(ScatterUpdate::Reduction::NONE, DT_NONE), + OV_CASE(ScatterUpdate::Reduction::SUM, DT_SUM), + OV_CASE(ScatterUpdate::Reduction::MAX, DT_MAX), + OV_CASE(ScatterUpdate::Reduction::MIN, DT_MIN), + OV_CASE(ScatterUpdate::Reduction::PROD, DT_MUL), + OV_CASE(ScatterUpdate::Reduction::MEAN, DT_MEAN)); + } +}; +}; // namespace scatter_elements_update + +// output[indices[i][j][k]][j][k] = updates[i][j][k] if axis = 0, +// output[i][indices[i][j][k]][k] = updates[i][j][k] if axis = 1, +// output[i][j][indices[i][j][k]] = updates[i][j][k] if axis = 2. +template +void ScatterUpdate::scatterElementsUpdate(const MemoryPtr& mem_data, const MemoryPtr& mem_indices, const MemoryPtr& mem_updates, + int axis, const KernelType& kernel) { + using namespace scatter_elements_update; + DataType *dataPtr = mem_data->getDataAs(); + DataType *updatePtr = mem_updates->getDataAs(); + uint8_t *indicesPtr = mem_indices->getDataAs(); + + const auto& data_shape = mem_data->getStaticDims(); + const auto& indices_shape = mem_indices->getStaticDims(); + const size_t updates_rank = indices_shape.size(); + + const int64_t data_dim_size = static_cast(data_shape[axis]); + const auto index_dim_size = indices_shape[axis]; + + if (axis < 0) + axis += updates_rank; + + VectorDims squashed_indices_shape(indices_shape); + squashed_indices_shape[axis] = 1; + + const std::vector dataBlockND = getBlockND(data_shape); + const std::vector indicesBlockND = getBlockND(indices_shape); + const size_t dataBlock_axisplus1 = dataBlockND[axis + 1]; + const size_t indicesBlock_axisplus1 = indicesBlockND[axis + 1]; + + // process serially along 'axis' dimension because of data dependency brought by duplicated value in indices + parallel_nt(0, [&](const int ithr, const int nthr) { + size_t start = 0, end = 0; + splitter(shape_size(squashed_indices_shape), nthr, ithr, start, end); + scatter_elements_update::TensorIterator tensorItr(squashed_indices_shape, axis); + + // When *use_init_val* attribute is false, we need to substitute the copied values at target locations with values that + // will not affect the particular reduction algorithms. 
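+ // For example, SUM and MEAN restart from 0, PROD from 1, MAX from
+ // std::numeric_limits<T>::lowest() and MIN from std::numeric_limits<T>::max()
+ // (see reduction_neutral_value above), so the first scattered update effectively
+ // overwrites whatever was copied from the data input.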
+ if (!use_init_val) { + const auto value = reduction_neutral_value(reduction_type); + auto offsets = tensorItr.startover(start, dataBlockND, indicesBlockND); + for (size_t worker = start; worker < end; worker++) { + auto indices_offset = offsets[1]; + for (size_t idx = 0; idx < index_dim_size; idx++) { + int64_t idxValue = getIndicesValue(indicesPtr, indices_offset); + if (idxValue < 0) idxValue += data_dim_size; + assert(idxValue < data_dim_size && idxValue >= 0); + dataPtr[offsets[0] + idxValue * dataBlock_axisplus1] = value; + indices_offset += indicesBlock_axisplus1; + } + + // increment + tensorItr.increment(offsets, dataBlockND, indicesBlockND); + } + } + + // Apply the Reduce function in an element-wise fashion. For better performance, + // when axis is the last dimension, we iterate along axis in the inner loop; otherwise we iterate axis + // in the outer loop. + auto offsets = tensorItr.startover(start, dataBlockND, indicesBlockND); + if (axis == static_cast(updates_rank - 1)) { + for (size_t worker = start; worker < end; worker++) { + auto indices_offset = offsets[1]; + for (size_t idx = 0; idx < index_dim_size; idx++) { + int64_t idxValue = getIndicesValue(indicesPtr, indices_offset); + if (idxValue < 0) idxValue += data_dim_size; + assert(idxValue < data_dim_size && idxValue >= 0); + auto dst = &dataPtr[offsets[0] + idxValue * dataBlock_axisplus1]; + auto src = &updatePtr[indices_offset]; + kernel(dst, src); + indices_offset += indicesBlock_axisplus1; + } + // increment + tensorItr.increment(offsets, dataBlockND, indicesBlockND); + } + } else { + // For better performance, the offsets of dst and indices are cached in the first iteration of outer loop, and reused + // in the remaining iterations. + std::vector dst_offsets(end-start+1, offsets[0]); // one extra to avoid overflow at the last iteration of inner loop + std::vector indices_offsets(end-start+1, offsets[1]); + size_t *ptr_dst_offset = &dst_offsets[0]; + size_t *ptr_indices_offset = &indices_offsets[0]; + for (size_t worker = start; worker < end; worker++) { // idx = 0 + int64_t idxValue = getIndicesValue(indicesPtr, *ptr_indices_offset); + if (idxValue < 0) idxValue += data_dim_size; + assert(idxValue < data_dim_size && idxValue >= 0); + auto dst = &dataPtr[ptr_dst_offset[0] + idxValue * dataBlock_axisplus1]; + auto src = &updatePtr[ptr_indices_offset[0]]; + kernel(dst, src); + + // increment once for all + tensorItr.increment(offsets, dataBlockND, indicesBlockND); + *++ptr_dst_offset = offsets[0]; + *++ptr_indices_offset = offsets[1]; + } + for (size_t idx = 1; idx < index_dim_size; idx++) { + ptr_indices_offset = &indices_offsets[0]; + ptr_dst_offset = &dst_offsets[0]; + for (size_t worker = start; worker < end; worker++) { + auto indices_offset = *ptr_indices_offset + idx * indicesBlock_axisplus1; + int64_t idxValue = getIndicesValue(indicesPtr, indices_offset); + if (idxValue < 0) idxValue += data_dim_size; + assert(idxValue < data_dim_size && idxValue >= 0); + auto dst = &dataPtr[ptr_dst_offset[0] + idxValue * dataBlock_axisplus1]; + auto src = &updatePtr[indices_offset]; + kernel(dst, src); + ptr_indices_offset++; + ptr_dst_offset++; + } + } + } + }); +} + +// We specialize ReduceMean to avoid spoil performance of other reduce methods, as otherwise condition branch +// were used in loops. 
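+// Unlike the generic overload, which only applies the kernel functor element-wise, the MEAN
+// overload also counts how many updates are accumulated into each destination element and
+// divides the resulting sum by (counter + use_init_val) once the slice along 'axis' is done.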
+template +void ScatterUpdate::scatterElementsUpdate(const MemoryPtr& mem_data, const MemoryPtr& mem_indices, const MemoryPtr& mem_updates, + int axis, const scatter_elements_update::ReduceMean& kernel) { + using namespace scatter_elements_update; + OPENVINO_ASSERT(reduction_type == ScatterUpdate::Reduction::MEAN, "The reduction type should be MEAN here."); + DataType *dataPtr = mem_data->getDataAs(); + DataType *updatePtr = mem_updates->getDataAs(); + uint8_t *indicesPtr = mem_indices->getDataAs(); + + const auto& data_shape = mem_data->getStaticDims(); + const auto& indices_shape = mem_indices->getStaticDims(); + size_t updates_rank = indices_shape.size(); + + const int64_t data_dim_size = static_cast(data_shape[axis]); + const auto index_dim_size = indices_shape[axis]; + + if (axis < 0) + axis += updates_rank; + + VectorDims squashed_indices_shape(indices_shape); + squashed_indices_shape[axis] = 1; + + const std::vector dataBlockND = getBlockND(data_shape); + const std::vector indicesBlockND = getBlockND(indices_shape); + const size_t dataBlock_axisplus1 = dataBlockND[axis + 1]; + const size_t indicesBlock_axisplus1 = indicesBlockND[axis + 1]; + + // process serially along 'axis' dimension because of data dependency brought by duplicated value in indices + parallel_nt(0, [&](const int ithr, const int nthr) { + size_t start = 0, end = 0; + splitter(shape_size(squashed_indices_shape), nthr, ithr, start, end); + scatter_elements_update::TensorIterator tensorItr(squashed_indices_shape, axis); + + // When *use_init_val* attribute is false, we need to substitute the copied values at target locations with values that + // will not affect the particular reduction algorithms. + if (!use_init_val) { + const auto value = reduction_neutral_value(reduction_type); + auto offsets = tensorItr.startover(start, dataBlockND, indicesBlockND); + for (size_t worker = start; worker < end; worker++) { + auto indices_offset = offsets[1]; + for (size_t idx = 0; idx < index_dim_size; idx++) { + int64_t idxValue = getIndicesValue(indicesPtr, indices_offset); + if (idxValue < 0) idxValue += data_dim_size; + assert(idxValue < data_dim_size && idxValue >= 0); + dataPtr[offsets[0] + idxValue * dataBlock_axisplus1] = value; + indices_offset += indicesBlock_axisplus1; + } + + // increment + tensorItr.increment(offsets, dataBlockND, indicesBlockND); + } + } + + // Apply the Reduce function in an element-wise fashion. For better performance, + // when axis is the last dimension, we iterate along axis in the inner loop; otherwise we iterate axis + // in the outer loop. 
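+ // For the averaging step the counters are kept per output slice and keyed by the index value
+ // when axis is the innermost dimension; otherwise a single map keyed by the destination
+ // address covers the whole [start, end) range of this thread and the division is performed
+ // after all index positions have been visited.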
+ auto offsets = tensorItr.startover(start, dataBlockND, indicesBlockND); + if (axis == static_cast(updates_rank - 1)) { + for (size_t worker = start; worker < end; worker++) { + std::unordered_map mean_reduction_counters; // (idxValue, num_sums) for current worker + + auto indices_offset = offsets[1]; + for (size_t idx = 0; idx < index_dim_size; idx++) { + int64_t idxValue = getIndicesValue(indicesPtr, indices_offset); + if (idxValue < 0) idxValue += data_dim_size; + assert(idxValue < data_dim_size && idxValue >= 0); + auto dst = &dataPtr[offsets[0] + idxValue * dataBlock_axisplus1]; + auto src = &updatePtr[indices_offset]; + kernel(dst, src); + indices_offset += indicesBlock_axisplus1; + + mean_reduction_counters[idxValue] += 1; + } + + // average + for (const auto& counter : mean_reduction_counters) { + auto dst = &dataPtr[offsets[0] + counter.first * dataBlock_axisplus1]; + const auto N = counter.second + static_cast(use_init_val); + *dst = static_cast(static_cast(*dst) / N); + } + + // increment + tensorItr.increment(offsets, dataBlockND, indicesBlockND); + } + } else { + // For better performance, the offsets of dst and indices are cached in the first iteration of outer loop, and reused + // in the remaining iterations. + std::unordered_map mean_reduction_counters; // (dst_addr, num_sums) for all workers + + std::vector dst_offsets(end-start+1, offsets[0]); // one extra to avoid overflow at the last iteration of inner loop + std::vector indices_offsets(end-start+1, offsets[1]); + size_t *ptr_dst_offset = &dst_offsets[0]; + size_t *ptr_indices_offset = &indices_offsets[0]; + for (size_t worker = start; worker < end; worker++) { // idx = 0 + int64_t idxValue = getIndicesValue(indicesPtr, *ptr_indices_offset); + if (idxValue < 0) idxValue += data_dim_size; + assert(idxValue < data_dim_size && idxValue >= 0); + auto dst = &dataPtr[ptr_dst_offset[0] + idxValue * dataBlock_axisplus1]; + auto src = &updatePtr[ptr_indices_offset[0]]; + kernel(dst, src); + + mean_reduction_counters[dst] += 1; + + // increment once for all + tensorItr.increment(offsets, dataBlockND, indicesBlockND); + *++ptr_dst_offset = offsets[0]; + *++ptr_indices_offset = offsets[1]; + } + for (size_t idx = 1; idx < index_dim_size; idx++) { + ptr_indices_offset = &indices_offsets[0]; + ptr_dst_offset = &dst_offsets[0]; + for (size_t worker = start; worker < end; worker++) { + auto indices_offset = *ptr_indices_offset + idx * indicesBlock_axisplus1; + int64_t idxValue = getIndicesValue(indicesPtr, indices_offset); + if (idxValue < 0) idxValue += data_dim_size; + assert(idxValue < data_dim_size && idxValue >= 0); + auto dst = &dataPtr[ptr_dst_offset[0] + idxValue * dataBlock_axisplus1]; + auto src = &updatePtr[indices_offset]; + kernel(dst, src); + mean_reduction_counters[dst] += 1; + ptr_indices_offset++; + ptr_dst_offset++; + } + } + + // average + for (const auto& counter : mean_reduction_counters) { + auto dst = counter.first; + const auto N = counter.second + static_cast(use_init_val); + *dst = static_cast(static_cast(*dst) / N); + } + } + }); +} + +void ScatterUpdate::scatterElementsUpdate(const MemoryPtr& dstMemPtr, const MemoryPtr& indicesMemPtr, const MemoryPtr& updateMemPtr, int axis) { + using namespace scatter_elements_update; + ScatterElementsUpdateContext ctx{this, dstMemPtr, indicesMemPtr, updateMemPtr, axis, reduction_type}; + OV_SWITCH(intel_cpu, + ScatterElementsUpdateDispatcher, + ctx, + dataPrec, + OV_CASE(ov::element::f32, float), + OV_CASE(ov::element::i32, int32_t), + OV_CASE(ov::element::bf16, 
ov::bfloat16), + OV_CASE(ov::element::f16, ov::float16), + OV_CASE(ov::element::i8, int8_t), + OV_CASE(ov::element::u8, uint8_t)); +} + void ScatterUpdate::execute(dnnl::stream strm) { auto srcMemPtr = getSrcMemoryAtPort(DATA_ID); auto dstMemPtr = getDstMemoryAtPort(0); @@ -391,12 +781,11 @@ void ScatterUpdate::execute(dnnl::stream strm) { break; } case ScatterUpdateMode::ScatterElementsUpdate: { - scatterElementsUpdate(indicesPtr, updatePtr, axis, dstPtr); + scatterElementsUpdate(dstMemPtr, indicesMemPtr, updateMemPtr, axis); break; } default: { - OPENVINO_THROW(errorPrefix - , " is not supported"); + OPENVINO_THROW(errorPrefix, " is not supported"); } } } @@ -468,58 +857,6 @@ void ScatterUpdate::scatterNDUpdate(uint8_t *indices, uint8_t *update, uint8_t * }); } -// output[indices[i][j][k]][j][k] = updates[i][j][k] if axis = 0, -// output[i][indices[i][j][k]][k] = updates[i][j][k] if axis = 1, -// output[i][j][indices[i][j][k]] = updates[i][j][k] if axis = 2. -void ScatterUpdate::scatterElementsUpdate(uint8_t *indices, uint8_t *update, int axis, uint8_t *dstData) { - const auto& srcDataDim = getParentEdgeAt(DATA_ID)->getMemory().getStaticDims(); - const auto& updateDim = getParentEdgeAt(UPDATE_ID)->getMemory().getStaticDims(); - size_t updateRank = updateDim.size(); - - std::vector srcBlockND = getBlockND(srcDataDim); - std::vector updateBlockND = getBlockND(updateDim); - - parallel_nt(0, [&](const int ithr, const int nthr) { - int j; - size_t i, dst_idx = 0, start = 0, end = 0; - VectorDims tensorItr(updateRank, 0); - splitter(updateBlockND[0], nthr, ithr, start, end); - for (j = updateRank - 1, i = start; j >= 0; j--) { - tensorItr[j] = i % updateDim[j]; - i /= updateDim[j]; - } - - for (i = 0; i < static_cast(axis); ++i) - dst_idx += tensorItr[i] * srcBlockND[i + 1]; - for (i++; i < updateRank; ++i) - dst_idx += tensorItr[i] * srcBlockND[i + 1]; - - for (size_t iwork = start; iwork < end; iwork++) { - int64_t idxValue = getIndicesValue(indices, iwork); - int64_t axisDim = static_cast(srcDataDim[axis]); - if (idxValue < 0) - idxValue += axisDim; - if (0 <= idxValue && idxValue < axisDim) - cpu_memcpy(dstData + dataSize * (dst_idx + idxValue * srcBlockND[axis + 1]), - update + iwork * dataSize, dataSize); - - for (j = updateRank - 1; j >= 0; j--) { - tensorItr[j]++; - if (tensorItr[j] < updateDim[j]) { - if (j != axis) - dst_idx += srcBlockND[j + 1]; - break; - } else { - tensorItr[j] = 0; - for (dst_idx = 0, i = 0; i < static_cast(axis); ++i) - dst_idx += tensorItr[i] * srcBlockND[i + 1]; - for (i++; i < updateRank; ++i) - dst_idx += tensorItr[i] * srcBlockND[i + 1]; - } - } - } - }); -} bool ScatterUpdate::created() const { return getType() == Type::ScatterUpdate diff --git a/src/plugins/intel_cpu/src/nodes/scatter_update.h b/src/plugins/intel_cpu/src/nodes/scatter_update.h index cd6a5bb4cce82a..6b0805dd459c97 100644 --- a/src/plugins/intel_cpu/src/nodes/scatter_update.h +++ b/src/plugins/intel_cpu/src/nodes/scatter_update.h @@ -5,6 +5,8 @@ #pragma once #include "node.h" +#include "openvino/op/scatter_elements_update.hpp" +#include namespace ov { namespace intel_cpu { @@ -16,6 +18,56 @@ enum class ScatterUpdateMode { ScatterElementsUpdate }; +namespace scatter_elements_update { +class ReduceMultiply { +public: + template + void operator() (DT* dst_data, const DT* src_data) const { + *dst_data *= *src_data; + } +}; + +class ReduceAdd { +public: + template + void operator() (DT* dst_data, const DT* src_data) const { + *dst_data += *src_data; + } +}; + +class ReduceMean { +public: + 
template + void operator() (DT* dst_data, const DT* src_data) const { + *dst_data += *src_data; + } +}; + +class ReduceMaximum { +public: + template + void operator() (DT* dst_data, const DT* src_data) const { + *dst_data = std::max(*dst_data, *src_data); + } +}; + +class ReduceMinimum { +public: + template + void operator() (DT* dst_data, const DT* src_data) const { + *dst_data = std::min(*dst_data, *src_data); + } +}; + +class ReduceNone { +public: + template + void operator() (DT* dst_data, const DT* src_data) const { + *dst_data = *src_data; + } +}; +}; // namespace scatter_elements_update + class ScatterUpdate : public Node { public: ScatterUpdate(const std::shared_ptr& op, const GraphContext::CPtr context); @@ -34,15 +86,25 @@ class ScatterUpdate : public Node { bool isExecutable() const override; static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + using Reduction = ov::op::v12::ScatterElementsUpdate::Reduction; + template + void scatterElementsUpdate(const MemoryPtr& mem_data, const MemoryPtr& mem_indices, const MemoryPtr& mem_updates, int axis, const KernelType& kernel); + template + void scatterElementsUpdate(const MemoryPtr& mem_data, const MemoryPtr& mem_indices, const MemoryPtr& mem_updates, + int axis, const scatter_elements_update::ReduceMean& kernel); + private: void scatterUpdate(uint8_t *indicesPtr, uint8_t *updatePtr, int axis, uint8_t *dstDataPtr); void scatterNDUpdate(uint8_t *indicesPtr, uint8_t *updatePtr, uint8_t *dstDataPtr); - void scatterElementsUpdate(uint8_t *indicesPtr, uint8_t *updatePtr, int axis, uint8_t *dstDataPtr); + void scatterElementsUpdate(const MemoryPtr& dstMemPtr, const MemoryPtr& indicesMemPtr, const MemoryPtr& updateMemPtr, int axis); inline int64_t getIndicesValue(uint8_t *indices, size_t offset); ScatterUpdateMode scatterUpdateMode = ScatterUpdateMode::ScatterUpdate; enum { DATA_ID, INDICES_ID, UPDATE_ID, AXIS_ID }; + Reduction reduction_type; + bool use_init_val = true; + // if axis can be set other than default 0. bool axisRelaxed = false; size_t dataSize, indicesSize, axisSize; diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/index_add_scatter_elements_update.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/index_add_scatter_elements_update.cpp new file mode 100644 index 00000000000000..4aa422fa263b18 --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/index_add_scatter_elements_update.cpp @@ -0,0 +1,306 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "openvino/opsets/opset13.hpp" +#include "openvino/pass/manager.hpp" + +#include "shared_test_classes/base/ov_subgraph.hpp" +#include "utils/cpu_test_utils.hpp" +#include "common_test_utils/include/common_test_utils/ov_tensor_utils.hpp" + +using namespace ov::test; +using namespace CPUTestUtils; +using namespace ov::op; + +namespace ov { +namespace test { +/* + This test runs a graph that is equivelent to torch.Tensor.index_add_. 
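+ (torch.Tensor.index_add_(dim, index, source, alpha=1) accumulates alpha * source[i]
+ into self at the positions given by index[i] along dimension dim.)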
+ TorchFE maps it to a compilicated subgraph which could be briefed similar to this - + * Indices(1D) + * | + * | + * X Broadcast Updates + * \ | / + * \ | / + * ScatterElementsUpdate + * | + * Result +*/ +using InputsAndAxis = std::tuple< + std::vector, // Input, shape of data and updates + int // Axis +>; +using IndexAddTestParams = std::tuple; + +class IndexAddTest : public testing::WithParamInterface, + virtual public ov::test::SubgraphBaseTest, + public CPUTestsBase { +public: + static std::string getTestCaseName(testing::TestParamInfo obj) { + auto shapes_ss = [](const InputShape& shape) { + std::stringstream ss; + ss << "_IS=(" << ov::test::utils::partialShape2str({shape.first}) << ")_TS="; + for (size_t j = 0lu; j < shape.second.size(); j++) + ss << "{" << ov::test::utils::vec2str(shape.second[j]) << "}"; + return ss; + }; + + InputsAndAxis shapes_desc; + std::vector input_shapes; + int axis; + v12::ScatterElementsUpdate::Reduction reduceMode; + ov::element::Type data_type, indices_type; + float alpha; + bool dynamic; + + std::tie(shapes_desc, reduceMode, data_type, indices_type, alpha, dynamic) = obj.param; + std::tie(input_shapes, axis) = shapes_desc; + std::ostringstream result; + result << "InputShape=" << shapes_ss(input_shapes.at(0)).str() << "_"; + result << "UpdateShape=" << ov::test::utils::vec2str(input_shapes.at(1).second) << "_"; + result << "Axis=" << axis << "_"; + result << "ReduceMode=" << as_string(reduceMode) << "_"; + result << "modelType=" << data_type.to_string() << "_"; + result << "idxType=" << indices_type.to_string() << "_"; + result << "alpha=" << alpha; + result << "dynamic=" << dynamic; + return result.str(); + } + +protected: + void SetUp() override { + targetDevice = ov::test::utils::DEVICE_CPU; + SKIP_IF_CURRENT_TEST_IS_DISABLED(); + + constexpr size_t DATA_INPUT_IDX = 0; + constexpr size_t UPDATES_INPUT_IDX = 1; + + InputsAndAxis shapes_desc; + std::vector input_shapes; + int axis; + + v12::ScatterElementsUpdate::Reduction reduceMode; + float alpha_value; + bool dynamic; + + ov::element::Type data_type, indices_type; + std::string target_device; + std::tie(shapes_desc, reduceMode, data_type, indices_type, alpha_value, dynamic) = this->GetParam(); + std::tie(input_shapes, axis) = shapes_desc; + + if (ov::element::bf16 == data_type || ov::element::f16 == data_type) { + configuration.insert({ov::hint::inference_precision.name(), data_type}); + inType = outType = data_type; + abs_threshold = 0.01f; + rel_threshold = 0.01f; + } + + init_input_shapes(input_shapes); + + // + normalized_axis = axis < 0 ? 
axis + inputDynamicShapes.at(DATA_INPUT_IDX).rank().get_length(): axis; + + if (dynamic) { + // infer dynamic shape from axis + inputDynamicShapes.at(DATA_INPUT_IDX)[normalized_axis] = -1; + inputDynamicShapes.at(UPDATES_INPUT_IDX)[normalized_axis] = -1; + } + + auto param = std::make_shared(data_type, inputDynamicShapes.at(DATA_INPUT_IDX)); + param->set_friendly_name("data"); + auto update_param = std::make_shared(data_type, inputDynamicShapes.at(UPDATES_INPUT_IDX)); + update_param->set_friendly_name("update"); + auto indices_param = std::make_shared(indices_type, ov::PartialShape{-1}); // 1D + indices_param->set_friendly_name("indices"); + + auto axis_const = + std::make_shared(ov::element::i32, ov::Shape{}, std::vector{axis}); + axis_const->set_friendly_name("axis"); + auto alpha_const = + std::make_shared(ov::element::f32, ov::Shape{}, std::vector{alpha_value}); + alpha_const->set_friendly_name("alpha"); + + auto input = param; + auto dim = axis_const; + auto index = std::make_shared(indices_param, element::i32); + auto src = update_param; + auto alpha = alpha_const; + auto converted_alpha = std::make_shared(alpha, src); + auto alpha_src = std::make_shared(converted_alpha, src); + auto input_shape_rank = get_shape_rank(input); + auto const_one = v0::Constant::create(element::i32, Shape{1}, {1}); + auto const_one_0d = v0::Constant::create(element::i32, Shape{}, {1}); + auto inp_rank = std::get<1>(input_shape_rank); + // ScatterElementsUpdate required that index, source and update have the same rank + // in aten::index_add index represents as 1d-array for specific dim and update may have different size + // from source in non-indexing axes + // slice src for having only relevant data + auto src_broadcast_shape = std::make_shared(const_one, inp_rank); + auto src_broadcasted = + std::make_shared(alpha_src, src_broadcast_shape, BroadcastType::BIDIRECTIONAL); + auto src_shape_rank = get_shape_rank(src_broadcasted); + auto const_zero = v0::Constant::create(element::i32, Shape{1}, {0}); + auto src_rank = std::get<1>(src_shape_rank); + auto slice_start = std::make_shared(const_zero, inp_rank); + auto axes = get_node_axes_range(src_broadcasted); + auto const_inf = + v0::Constant::create(element::i32, Shape{1}, {std::numeric_limits::max()}); + auto slice_end = std::make_shared(const_inf, src_rank); + auto slice_step = std::make_shared(const_one, src_rank); + auto dim_1d = std::make_shared(dim, const_one); + auto slice_end2 = + std::make_shared(slice_end, + dim_1d, + const_one, + const_zero, + v12::ScatterElementsUpdate::Reduction::NONE); + auto new_shape_ = std::make_shared(input, slice_start, slice_end2, slice_step, axes); + auto new_shape = std::make_shared(new_shape_, element::i32); + auto src_ = + std::make_shared(src_broadcasted, new_shape, BroadcastType::BIDIRECTIONAL); + auto src_input_dtype = std::make_shared(src_, input); + // brodcast index to input rank size + src_rank = std::make_shared(new_shape, element::i32); + auto new_index_shape_ = std::make_shared(const_one, src_rank); + auto const_minus_one = v0::Constant::create(element::i32, Shape{1}, {-1}); + auto new_index_shape = + std::make_shared(new_index_shape_, dim_1d, const_minus_one, const_zero); + // precerve indicies location for spicifc dim + auto reshaped_index = std::make_shared(index, new_index_shape, false); + auto broadcasted_index = + std::make_shared(reshaped_index, new_shape, BroadcastType::BIDIRECTIONAL); + auto scatter_result = + std::make_shared(input, + broadcasted_index, + src_, + dim, + reduceMode); + 
ov::ResultVector results{std::make_shared(scatter_result)}; + function = std::make_shared(results, ov::ParameterVector{param, indices_param, update_param}, "index_add"); + } + + void generate_inputs(const std::vector& targetInputStaticShapes) override { + inputs.clear(); + const auto& funcInputs = function->inputs(); + auto dataShape = targetInputStaticShapes[0]; + auto updateShape = targetInputStaticShapes[1]; + // The dim-th dimension of update must have the same size as the length of index (which must be a vector) + auto indicesShape = ov::Shape{updateShape[normalized_axis]}; // 1D + + for (size_t i = 0; i < funcInputs.size(); ++i) { + const auto& funcInput = funcInputs[i]; + ov::Tensor tensor; + ov::test::utils::InputGenerateData in_data; + + if (i == 0) { // "data" + in_data.start_from = 1; + in_data.range = 1; + in_data.resolution = 1; + tensor = ov::test::utils::create_and_fill_tensor(funcInput.get_element_type(), dataShape, in_data); + } else if (i == 1) { // "indices" + // All index values are expected to be within bounds [-d, d - 1] along dimension d pointed by axis. + auto d = dataShape[normalized_axis]; + in_data.start_from = -1.0 * static_cast(d); + in_data.range = d-1; + in_data.resolution = 1; + tensor = shape_size(indicesShape) == 0 ? ov::Tensor(funcInput.get_element_type(), indicesShape) : + ov::test::utils::create_and_fill_tensor(funcInput.get_element_type(), indicesShape, in_data); + } else if (i == 2) { // "updates" + in_data.start_from = -50; + in_data.range = 50; + in_data.resolution = 1; + tensor = shape_size(updateShape) == 0 ? ov::Tensor(funcInput.get_element_type(), updateShape) : + ov::test::utils::create_and_fill_tensor(funcInput.get_element_type(), updateShape, in_data); + } else { + OPENVINO_THROW("Unknown input"); + } + inputs.insert({funcInput.get_node_shared_ptr(), tensor}); + } + } + +private: + std::tuple, Output> get_shape_rank(const Output& x, + bool as_scalar = false, + element::Type output_type = element::i32) { + auto shape = std::make_shared(x, output_type); + Output rank = std::make_shared(shape, output_type); + if (as_scalar) { + auto axis_0 = opset10::Constant::create(output_type, Shape{}, {0}); + rank = std::make_shared(rank, axis_0); + } + return std::make_tuple(shape, rank); + } + + std::shared_ptr get_node_axes_range(const Output& x) { + auto start = std::make_shared(element::i32, Shape{}, 0); + auto step = std::make_shared(element::i32, Shape{}, 1); + Output reduced_rank; + std::tie(std::ignore, reduced_rank) = get_shape_rank(x, true); + return std::make_shared(start, reduced_rank, step, element::i32); + } + + size_t normalized_axis; // normalized_axis +}; + +TEST_P(IndexAddTest, CompareWithRefs) { + run(); +} + +namespace { +// map> +std::map, std::map, std::vector>> axesShapeInShape { + {{3}, {{{2}, {0, -1}}, {{3}, {0, -1}}/*, {{0}, {0, -1}}*/}}, // TODO: empty tensor failing in template plugin + {{4, 6}, {{{3, 6}, {0, -2}}, {{4, 6}, {0, 1, -1}}/*, {{0, 2}, {0, -2}}*/}}, // axis 0 + {{2, 4}, {{{2, 3}, {1, -1}}, {{2, 4}, {0, 1, -1}}/*, {{4, 0}, {1, -1}}*/}}, // axis 1 + {{1, 120}, {{{1, 120}, {0}}}}, + {{32, 120}, {{{16, 120}, {0}}, {{32, 120}, {0}}}}, + {{120, 32}, {{{120, 16}, {1}}, {{120, 32}, {1}}}}, +}; + +inline std::vector partial_shapes_to_test_representation( + const std::vector& shapes) { + std::vector result; + for (const auto& staticShape : shapes) { + result.push_back({{staticShape}, {staticShape.get_shape()}}); + } + return result; +} + +std::vector combine_shapes( + const std::map, std::map, std::vector>>& input_shapes) { + 
std::vector res_vec; + for (auto& input_shape : input_shapes) { + for (auto& item : input_shape.second) { + for (auto& elt : item.second) { + res_vec.push_back(ov::test::InputsAndAxis{ + partial_shapes_to_test_representation({ov::PartialShape(input_shape.first), ov::PartialShape(item.first)}), + elt}); + } + } + } + return res_vec; +} + +INSTANTIATE_TEST_SUITE_P(smoke_IndexAddTest, + IndexAddTest, + ::testing::Combine(::testing::ValuesIn(combine_shapes(axesShapeInShape)), + ::testing::Values(v12::ScatterElementsUpdate::Reduction::SUM, v12::ScatterElementsUpdate::Reduction::NONE), + ::testing::Values(ElementType::f32, ElementType::i32, + // ElementType::u8, ElementType::i8, // cannot validate until CVS-136858 addressed + ElementType::f16, ElementType::bf16), // data precision + ::testing::Values(ElementType::i32, ElementType::i64), // indices precision + ::testing::Values(1.0), // alpha + ::testing::Values(true, false)), // dynamic shape test + IndexAddTest::getTestCaseName); +} // namespace + +} // namespace test +} // namespace ov diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/scatter_elements_update.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/scatter_elements_update.cpp index 731f7a02180639..8b2386714a9545 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/scatter_elements_update.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/scatter_elements_update.cpp @@ -8,6 +8,7 @@ #include "common_test_utils/test_constants.hpp" using ov::test::ScatterElementsUpdateLayerTest; +using ov::test::ScatterElementsUpdate12LayerTest; namespace { // map> @@ -25,6 +26,8 @@ const std::vector model_types = { ov::element::f32, ov::element::f16, ov::element::i32, + // ov::element::i8, // cannot validate until CVS-136858 addressed + // ov::element::u8, // cannot validate until CVS-136858 addressed }; const std::vector idx_types = { @@ -58,4 +61,34 @@ const auto scatter_elt_update_cases = ::testing::Combine( INSTANTIATE_TEST_SUITE_P(smoke_ScatterEltsUpdate, ScatterElementsUpdateLayerTest, scatter_elt_update_cases, ScatterElementsUpdateLayerTest::getTestCaseName); +const std::vector reduceModes{ + ov::op::v12::ScatterElementsUpdate::Reduction::NONE, + ov::op::v12::ScatterElementsUpdate::Reduction::SUM, + ov::op::v12::ScatterElementsUpdate::Reduction::PROD, + ov::op::v12::ScatterElementsUpdate::Reduction::MIN, + ov::op::v12::ScatterElementsUpdate::Reduction::MAX, + ov::op::v12::ScatterElementsUpdate::Reduction::MEAN +}; + +const std::vector> idxWithNegativeValues = { + {1, 0, 0, 1}, + {-1, -2, -2, -1}, +}; + +// map> +std::map, std::map, std::vector>> axesShapeInShape2D { + {{2, 4}, {{{1, 4}, {0, 1}}, {{2, 2}, {-1, -2}}}}, +}; + +INSTANTIATE_TEST_SUITE_P( + smoke_ScatterEltsUpdate12, + ScatterElementsUpdate12LayerTest, + ::testing::Combine(::testing::ValuesIn(combine_shapes(axesShapeInShape2D)), + ::testing::ValuesIn(idxWithNegativeValues), + ::testing::ValuesIn(reduceModes), + ::testing::ValuesIn({true, false}), + ::testing::ValuesIn(model_types), + ::testing::ValuesIn(idx_types), + ::testing::Values(ov::test::utils::DEVICE_CPU)), + ScatterElementsUpdate12LayerTest::getTestCaseName); } // namespace
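For reference, a minimal single-axis sketch of the ScatterElementsUpdate-12 semantics that the kernels above implement, restricted to SUM reduction on a 1-D tensor and using plain `std::vector` containers instead of the plugin's Memory objects (the helper name is illustrative):

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// With use_init_val == false, every destination element touched by indices is first
// reset to the reduction's neutral value (0 for SUM); afterwards all updates are
// accumulated, mirroring the two passes of the kernel above.
void scatter_elements_sum_1d(std::vector<float>& data,
                             const std::vector<int64_t>& indices,
                             const std::vector<float>& updates,
                             bool use_init_val) {
    assert(indices.size() == updates.size());
    const int64_t dim = static_cast<int64_t>(data.size());
    if (!use_init_val) {
        for (int64_t idx : indices) {
            if (idx < 0) idx += dim;   // negative indices wrap around, as in the kernel
            data[idx] = 0.0f;          // neutral value of SUM
        }
    }
    for (size_t i = 0; i < indices.size(); ++i) {
        int64_t idx = indices[i];
        if (idx < 0) idx += dim;
        data[idx] += updates[i];       // ReduceAdd
    }
}
```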