[CPU] CPU plugin API 2.0 migration for TensorDesc (openvinotoolkit#21002)

* [CPU] CPU API 2.0 migration for TensorDesc

* Fixed CustomOpCPUTest issue

* Clean up unused code

1. normalize_preprocess
2. MemoryDescUtils::convertToBlockedMemoryDesc
3. Fix a typo

* Fix zero-dim with non-zero strides issue

* Fix "Expected and actual shape are different: [] VS [1]" error

* Remove InferenceEngine::Layout

* Resolve rebase issues

* Address code reviewer's comments

* Keep ov::intel_cpu::node::Generic for legacy compatibility

* Fix scalar data issue

* Address reviewer's comments

* Restore Generic registration
riverlijunjie authored Dec 8, 2023
1 parent fbec7be commit 04db11e
Showing 20 changed files with 120 additions and 524 deletions.
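
The thread running through all of these diffs is the same: wherever the plugin previously assembled an InferenceEngine::TensorDesc (plus BlockingDesc, layout, and stride bookkeeping) from a user tensor, it now asks MemoryDescUtils::generateCpuBlockedMemoryDesc for a CPU blocked descriptor directly. A minimal sketch of the new pattern is below; the helper name push_external_tensor and the include paths are illustrative assumptions, while generateCpuBlockedMemoryDesc, the Memory constructor, and Memory::load are used exactly as the graph.cpp hunk further down shows.

// Hypothetical helper (not part of this commit) showing the migrated call pattern.
#include <oneapi/dnnl/dnnl.hpp>                  // dnnl::engine
#include "cpu_memory.h"                          // ov::intel_cpu::Memory (assumed plugin-internal path)
#include "memory_desc/cpu_memory_desc_utils.h"   // ov::intel_cpu::MemoryDescUtils
#include "openvino/runtime/itensor.hpp"          // ov::ITensor
#include "openvino/runtime/so_ptr.hpp"           // ov::SoPtr

namespace {

void push_external_tensor(const dnnl::engine& engine,
                          const ov::SoPtr<ov::ITensor>& input,
                          ov::intel_cpu::Memory& internal_mem) {
    // One call replaces the old element-type/shape/strides/BlockingDesc bookkeeping.
    auto ext_desc = ov::intel_cpu::MemoryDescUtils::generateCpuBlockedMemoryDesc(input);
    // Wrap the user buffer, mirroring the Memory(engine, desc, data, false) call in PushInputData.
    ov::intel_cpu::Memory ext_mem(engine, ext_desc, input->data(), false);
    // load() reorders/copies the external data into the graph's own memory.
    internal_mem.load(ext_mem, false);
}

}  // anonymous namespace
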
2 changes: 1 addition & 1 deletion src/plugins/intel_cpu/src/dnnl_extension_utils.cpp
@@ -68,7 +68,7 @@ dnnl::memory::data_type DnnlExtensionUtils::ElementTypeToDataType(const ov::elem
case ov::element::undefined:
return memory::data_type::undef;
default: {
OPENVINO_THROW("The plugin does not support ", elementType.to_string(), " for use with oneDNN");
OPENVINO_THROW("CPU plugin does not support ", elementType.to_string(), " for use with oneDNN.");
}
}
}
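
Only the wording of the error message changes in the hunk above. For context, DnnlExtensionUtils::ElementTypeToDataType is the type bridge between OpenVINO element types and oneDNN data types; the sketch below is a reduced, illustrative version of that mapping (the real switch covers more types), with only to_string(), the element constants, and OPENVINO_THROW taken from the sources.

// Illustrative reduction of DnnlExtensionUtils::ElementTypeToDataType, not the actual switch.
#include <oneapi/dnnl/dnnl.hpp>
#include "openvino/core/except.hpp"              // OPENVINO_THROW
#include "openvino/core/type/element_type.hpp"   // ov::element::Type

static dnnl::memory::data_type to_dnnl_data_type(const ov::element::Type& et) {
    if (et == ov::element::f32)       return dnnl::memory::data_type::f32;
    if (et == ov::element::bf16)      return dnnl::memory::data_type::bf16;
    if (et == ov::element::i8)        return dnnl::memory::data_type::s8;
    if (et == ov::element::u8)        return dnnl::memory::data_type::u8;
    if (et == ov::element::undefined) return dnnl::memory::data_type::undef;
    OPENVINO_THROW("CPU plugin does not support ", et.to_string(), " for use with oneDNN.");
}
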
100 changes: 15 additions & 85 deletions src/plugins/intel_cpu/src/graph.cpp
@@ -47,7 +47,6 @@
#include "utils/ngraph_utils.hpp"
#include "utils/node_dumper.h"
#include "utils/verbose.h"
#include "memory_desc/cpu_memory_desc_utils.h"

#include "openvino/runtime/memory_solver.hpp"

@@ -894,63 +893,17 @@ void Graph::PushInputData(const std::string& name, const ov::SoPtr<ITensor>& inp
if (!IsReady()) OPENVINO_THROW("Wrong state. Topology not ready.");
auto input_itr = inputNodesMap.find(name);
if (input_itr != inputNodesMap.end()) {
auto create_mem_desc = [&](const ov::SoPtr<ITensor>& tensor) -> CpuBlockedMemoryDesc {
auto element_type = tensor->get_element_type();
auto shape = tensor->get_shape();
if (shape.empty())
shape = {tensor->get_size()};
std::vector<size_t> blk_order(shape.size());
std::iota(blk_order.begin(), blk_order.end(), 0);
std::vector<size_t> dim_offset(shape.size(), 0);
std::vector<size_t> blk_strides;
auto byte_strides = element_type.bitwidth() >= 8 ? tensor->get_strides() : Strides{};
if (byte_strides.empty()) {
blk_strides = ov::row_major_strides(shape);
} else {
// ROI tensor need figure out correct blk_strides
blk_strides.resize(byte_strides.size());
std::transform(byte_strides.begin(),
byte_strides.end(),
blk_strides.begin(),
[&element_type](size_t byte_stride) {
OPENVINO_ASSERT(byte_stride % element_type.size() == 0,
"Limitation: Stride in bytes ",
byte_stride,
" should be divisible by size of element ",
element_type.size());
return byte_stride / element_type.size();
});
}
InferenceEngine::TensorDesc tensorDesc(
InferenceEngine::details::convertPrecision(tensor->get_element_type()),
shape,
InferenceEngine::BlockingDesc{shape, blk_order, 0, dim_offset, blk_strides});
return MemoryDescUtils::convertToCpuBlockedMemoryDesc(tensorDesc);
};

auto node = input_itr->second;
auto childEdge = node->getChildEdgeAt(0);
const auto& outDims = node->getOutputShapeAtPort(0);

const void* ext_data_ptr = input->data();
void* inter_data_ptr = childEdge->getMemory().getData();

if (ext_data_ptr != inter_data_ptr) {
auto ext_tensor_desc = create_mem_desc(input);
auto ext_tensor_desc = MemoryDescUtils::generateCpuBlockedMemoryDesc(input);
Memory ext_mem(getEngine(), ext_tensor_desc, ext_data_ptr, false);
childEdge->getMemory().load(ext_mem, false);
}

// todo: make sure 'name' exists in this map...
if (_normalizePreprocMap.find(name) != _normalizePreprocMap.end()) {
if (input->get_element_type() == ov::element::f32) {
_normalizePreprocMap[name].NormalizeImage(outDims,
reinterpret_cast<float*>(inter_data_ptr),
TensorDesc::getLayoutByDims(input->get_shape()));
} else {
OPENVINO_THROW("Mean image of type ", input->get_element_type().get_type_name(), " is unsupported");
}
}
} else {
OPENVINO_THROW("Input blob for infer '", name, "' doesn't correspond to input in network");
}
@@ -973,53 +926,41 @@ void Graph::PullOutputData(std::unordered_map<std::string, ov::SoPtr<ITensor>>&
OPENVINO_THROW("The CPU plugin graph doesn't contain output node with name: ", name.c_str());
}

InferenceEngine::TensorDesc expectedDesc(
InferenceEngine::details::convertPrecision(ext_blob->get_element_type()),
ext_blob->get_shape(),
InferenceEngine::TensorDesc::getLayoutByRank(ext_blob->get_shape().size()));
DEBUG_LOG(name, ", tensor data addr ", static_cast<void*>(output[name]->data()));
auto expected_desc_ptr = MemoryDescUtils::generateCpuBlockedMemoryDesc(ext_blob);
const auto actualDesc = intr_blob.getDescWithType<BlockedMemoryDesc>();

const auto actualDesc = MemoryDescUtils::convertToTensorDesc(intr_blob.getDesc());
DEBUG_LOG(name, ", tensor data addr ", static_cast<void*>(output[name]->data()));

// TODO [NM]: need to create universal reorder which will be detect cases when we really need to use it
// WA: for cases when output shape after transformation will be 1x1x1x1 but model output is scalar
bool isScalarOutput = false;
if (actualDesc.getLayout() == SCALAR) {
isScalarOutput = expectedDesc.getLayout() == SCALAR ||
(!expectedDesc.getDims().empty() &&
std::accumulate(expectedDesc.getDims().begin(), expectedDesc.getDims().end(), (size_t)1, std::multiplies<size_t>()) == 1);
} else if (expectedDesc.getLayout() == SCALAR) {
isScalarOutput = actualDesc.getLayout() == SCALAR ||
(!actualDesc.getDims().empty() &&
std::accumulate(actualDesc.getDims().begin(), actualDesc.getDims().end(), (size_t)1, std::multiplies<size_t>()) == 1);
if (ext_blob->get_shape().empty() && ext_blob->get_size() == 1) {
const auto& actualDims = expected_desc_ptr->getShape().getStaticDims();
isScalarOutput =
!actualDims.empty() &&
std::accumulate(actualDims.begin(), actualDims.end(), (size_t)1, std::multiplies<size_t>()) == 1;
}

auto outDims = intr_blob.getStaticDims();
if (ext_blob->get_shape() != outDims && !isScalarOutput) {
// WA: because input/output info initially contains non empty dims, order etc.
// and setDims (called inside setShape) can't correct modify blocked desc for desc with blocked layout
if (expectedDesc.getLayout() == InferenceEngine::Layout::BLOCKED) {
expectedDesc = TensorDesc(expectedDesc.getPrecision(), expectedDesc.getLayout());
}
DEBUG_LOG(name, ", tensor data addr ", static_cast<void*>(output[name]->data()),
" dims ", PartialShape(output[name]->get_shape()), " -> ", PartialShape(outDims),
", intr ptr ", intr_blob.getData(), " , parentedge's memory object ", parentEdge->getMemoryPtr().get());
ext_blob->set_shape(outDims);
DEBUG_LOG(name, ", tensor data addr ", static_cast<void*>(output[name]->data()),
" dims ", PartialShape(output[name]->get_shape()), ", intr ptr ", intr_blob.getData());
expectedDesc =
InferenceEngine::TensorDesc(InferenceEngine::details::convertPrecision(ext_blob->get_element_type()),
ext_blob->get_shape(),
InferenceEngine::TensorDesc::getLayoutByRank(ext_blob->get_shape().size()));
expected_desc_ptr = MemoryDescUtils::generateCpuBlockedMemoryDesc(ext_blob);
}

// check for empty output blob
if (std::any_of(outDims.begin(), outDims.end(), [](const Dim dim) {return dim == 0;})) {
continue;
}

auto srcPrec = actualDesc.getPrecision();
auto dstPrec = expectedDesc.getPrecision();
auto srcPrec = actualDesc->getPrecision();
auto dstPrec = expected_desc_ptr->getPrecision();
if (!getConfig().isLegacyApi && srcPrec == dstPrec && ext_blob->get_byte_size() != intr_blob.getSize())
OPENVINO_THROW("Output blob byte size is not equal network output byte size (",
ext_blob->get_byte_size(),
Expand All @@ -1034,24 +975,13 @@ void Graph::PullOutputData(std::unordered_map<std::string, ov::SoPtr<ITensor>>&
// That is the same memory. No need to copy
if (ext_blob_ptr == intr_blob_ptr) continue;

if (actualDesc.getBlockingDesc() != expectedDesc.getBlockingDesc() && !isScalarOutput) {
// User can initialize output via SetOutput API using tensorDesc with ANY layout.
// For these cases we create planar memory descriptor.
auto outBlobDesc =
expectedDesc.getLayout() == InferenceEngine::Layout::ANY
? DnnlBlockedMemoryDesc(InferenceEngine::details::convertPrecision(expectedDesc.getPrecision()),
Shape(expectedDesc.getDims()))
: MemoryDescUtils::convertToDnnlBlockedMemoryDesc(expectedDesc);
Memory outBloMem(getEngine(), outBlobDesc, ext_blob_ptr, false);
if (actualDesc->isCompatible(*expected_desc_ptr) && !isScalarOutput) {
Memory outBloMem(getEngine(), expected_desc_ptr, ext_blob_ptr, false);
outBloMem.load(intr_blob, false);
} else {
size_t size_to_copy = intr_blob.getDescWithType<BlockedMemoryDesc>()->getPaddedElementsCount();
DEBUG_LOG("pull_output: convert ", srcPrec, " to ", dstPrec);
cpu_convert(intr_blob_ptr,
ext_blob_ptr,
InferenceEngine::details::convertPrecision(srcPrec),
InferenceEngine::details::convertPrecision(dstPrec),
size_to_copy);
cpu_convert(intr_blob_ptr, ext_blob_ptr, srcPrec, dstPrec, size_to_copy);
}
}
}
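
The lambda deleted from PushInputData above (and the identical static helper removed from infer_request.cpp below) turned the byte strides reported by ov::ITensor::get_strides() into element strides before building a blocked descriptor; that work is now presumed to be encapsulated by MemoryDescUtils::generateCpuBlockedMemoryDesc. A self-contained sketch of the dropped conversion, kept only as a reference:

// Convert strides expressed in bytes (as ov::ITensor::get_strides() reports them)
// into strides expressed in elements, as a blocked memory descriptor expects.
#include <algorithm>
#include <cstddef>
#include <stdexcept>
#include <vector>

std::vector<size_t> byte_to_element_strides(const std::vector<size_t>& byte_strides, size_t element_size) {
    std::vector<size_t> blk_strides(byte_strides.size());
    std::transform(byte_strides.begin(), byte_strides.end(), blk_strides.begin(),
                   [element_size](size_t byte_stride) {
                       if (byte_stride % element_size != 0)
                           throw std::runtime_error("Stride in bytes must be divisible by the element size");
                       return byte_stride / element_size;
                   });
    return blk_strides;
}
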
7 changes: 0 additions & 7 deletions src/plugins/intel_cpu/src/graph.h
@@ -11,7 +11,6 @@
#include "edge.h"
#include "graph_context.h"
#include "node.h"
#include "normalize_preprocess.h"
#include "openvino/runtime/make_tensor.hpp"
#include "openvino/runtime/profiling_info.hpp"

@@ -60,10 +59,6 @@ class Graph {
const GraphContext::CPtr ctx,
std::string name);

bool hasMeanImageFor(const std::string& name) {
return _normalizePreprocMap.find(name) != _normalizePreprocMap.end();
}

void PushInputData(const std::string& name, const ov::SoPtr<ITensor>& input);
void PullOutputData(std::unordered_map<std::string, ov::SoPtr<ITensor>>& output);

@@ -212,7 +207,6 @@ class Graph {
outputNodesMap.clear();
graphNodes.clear();
graphEdges.clear();
_normalizePreprocMap.clear();
syncNodesInds.clear();
}
Status status { Status::NotReady };
@@ -228,7 +222,6 @@ class Graph {
std::vector<NodePtr> graphNodes;
std::vector<EdgePtr> graphEdges;

std::map<std::string, NormalizePreprocess> _normalizePreprocMap;
std::string _name;

bool graphHasDynamicInput = false;
68 changes: 17 additions & 51 deletions src/plugins/intel_cpu/src/infer_request.cpp
@@ -332,37 +332,6 @@ void SyncInferRequest::throw_if_canceled() const {
}
}

static InferenceEngine::TensorDesc create_tensor_desc(const ov::SoPtr<ITensor>& tensor) {
auto element_type = tensor->get_element_type();
auto shape = tensor->get_shape();
std::vector<size_t> blk_order(shape.size());
std::iota(blk_order.begin(), blk_order.end(), 0);
std::vector<size_t> dim_offset(shape.size(), 0);
std::vector<size_t> blk_strides;
auto byte_strides = element_type.bitwidth() >= 8 ? tensor->get_strides() : Strides{};
if (byte_strides.empty()) {
blk_strides = ov::row_major_strides(shape);
} else {
blk_strides.resize(byte_strides.size());
std::transform(byte_strides.begin(),
byte_strides.end(),
blk_strides.begin(),
[&element_type](size_t byte_stride) {
OPENVINO_ASSERT(byte_stride % element_type.size() == 0,
"Limitation: Stride in bytes ",
byte_stride,
" should be divisible by size of element ",
element_type.size());
return byte_stride / element_type.size();
});
}
OPENVINO_SUPPRESS_DEPRECATED_START
return InferenceEngine::TensorDesc{InferenceEngine::details::convertPrecision(element_type),
shape,
InferenceEngine::BlockingDesc{shape, blk_order, 0, dim_offset, blk_strides}};
OPENVINO_SUPPRESS_DEPRECATED_END
}

ov::SoPtr<ov::ITensor> SyncInferRequest::get_tensor(const ov::Output<const ov::Node>& in_port) const {
auto port = get_internal_port(in_port);
return ov::ISyncInferRequest::get_tensor(port);
@@ -398,7 +367,7 @@ void SyncInferRequest::set_tensor(const ov::Output<const ov::Node>& in_port, con
tensor = ov::make_tensor(in_tensor->get_element_type(), in_port.get_shape(), in_tensor->data());
}
auto name = get_port_name(in_port, m_is_legacy_api);
auto tensor_desc = create_tensor_desc(tensor);
auto mem_desc_ptr = MemoryDescUtils::generateCpuBlockedMemoryDesc(tensor);
bool is_input = ov::op::util::is_parameter(port.get_node());
if (is_input) {
const auto netInPrc = port.get_element_type();
@@ -436,14 +405,11 @@ void SyncInferRequest::set_tensor(const ov::Output<const ov::Node>& in_port, con
// we must define desc for dynamic case
// otherwise we got incorrect check on shape compatibility inside isCompatible
// because lower and upper bound will be compared
OPENVINO_SUPPRESS_DEPRECATED_START
actualDesc = actualDesc->cloneWithNewDims(tensor_desc.getLayout() == InferenceEngine::Layout::SCALAR
? InferenceEngine::SizeVector{1}
: tensor_desc.getDims());
OPENVINO_SUPPRESS_DEPRECATED_END
actualDesc = actualDesc->cloneWithNewDims(
ov::is_scalar(tensor->get_shape()) ? VectorDims{1} : VectorDims{tensor->get_shape()});
}
if (actualDesc->isCompatible(MemoryDescUtils::convertToCpuBlockedMemoryDesc(tensor_desc)) &&
m_graph->_normalizePreprocMap.find(name) == m_graph->_normalizePreprocMap.end()) {

if (actualDesc->isCompatible(*mem_desc_ptr)) {
m_external_ptr[name] = tensor;
} else if (m_external_ptr.find(name) != m_external_ptr.end()) {
m_external_ptr.erase(name);
@@ -481,7 +447,7 @@ void SyncInferRequest::set_tensor(const ov::Output<const ov::Node>& in_port, con
}

const auto& desc = m_graph->getOutputNodeByName(name)->getParentEdgesAtPort(0)[0]->getMemory().getDesc();
if (!isDynamic && tensor_desc == MemoryDescUtils::convertToTensorDesc(desc)) {
if (!isDynamic && mem_desc_ptr->isCompatible(desc)) {
m_external_ptr[name] = tensor;
} else if (m_external_ptr.find(name) != m_external_ptr.end()) {
m_external_ptr.erase(name);
@@ -538,12 +504,12 @@ void SyncInferRequest::init_tensor(const std::string& name) {
tensor = ov::make_tensor(port.get_element_type(), tensor_shape);
ov::ISyncInferRequest::set_tensor(port, tensor);

auto desc = create_tensor_desc(tensor);
if (!isDynamic &&
desc == MemoryDescUtils::convertToTensorDesc(
m_graph->getInputNodeByName(name)->getChildEdgesAtPort(0)[0]->getMemory().getDesc()) &&
m_graph->_normalizePreprocMap.find(name) == m_graph->_normalizePreprocMap.end()) {
m_external_ptr[name] = tensor;
if (!isDynamic) {
auto mem_desc_ptr = MemoryDescUtils::generateCpuBlockedMemoryDesc(tensor);
if (mem_desc_ptr->isCompatible(
m_graph->getInputNodeByName(name)->getChildEdgesAtPort(0)[0]->getMemory().getDesc())) {
m_external_ptr[name] = tensor;
}
}
}
}
@@ -626,11 +592,11 @@ void SyncInferRequest::init_tensor(const std::string& name) {
}
}
m_outputs[name] = tensor;
auto desc = create_tensor_desc(tensor);
if (!port_shape.is_dynamic() && !m_external_ptr.count(name) &&
desc == MemoryDescUtils::convertToTensorDesc(
output->second->getParentEdgesAtPort(0)[0]->getMemory().getDesc())) {
m_external_ptr[name] = tensor;
if (!port_shape.is_dynamic() && !m_external_ptr.count(name)) {
auto desc = MemoryDescUtils::generateCpuBlockedMemoryDesc(tensor);
if (desc->isCompatible(output->second->getParentEdgesAtPort(0)[0]->getMemory().getDesc())) {
m_external_ptr[name] = tensor;
}
}
// update tensors in case of multiple output ports with the same name
for (const auto& out : get_outputs()) {
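
Both set_tensor and init_tensor now make their zero-copy decision through MemoryDesc::isCompatible instead of comparing InferenceEngine::TensorDesc objects for equality. A hedged sketch of that decision path follows; the function name, the map parameter, and the include paths are illustrative, while generateCpuBlockedMemoryDesc, cloneWithNewDims, ov::is_scalar, and isCompatible are used as in the hunks above.

// Illustrative only: decide whether a user-provided tensor can back the graph memory directly.
#include <string>
#include <unordered_map>
#include "memory_desc/cpu_memory_desc_utils.h"   // MemoryDescUtils, MemoryDescPtr (assumed path)
#include "openvino/runtime/itensor.hpp"          // ov::ITensor, ov::SoPtr

static void update_external_ptr(const std::string& name,
                                const ov::SoPtr<ov::ITensor>& tensor,
                                ov::intel_cpu::MemoryDescPtr graph_side_desc,
                                std::unordered_map<std::string, ov::SoPtr<ov::ITensor>>& external_ptr) {
    using namespace ov::intel_cpu;
    auto user_desc = MemoryDescUtils::generateCpuBlockedMemoryDesc(tensor);

    if (graph_side_desc->getShape().isDynamic()) {
        // A dynamic graph stores bounds rather than concrete dims, so materialize the
        // descriptor with the tensor's dims first (a scalar tensor becomes {1}).
        graph_side_desc = graph_side_desc->cloneWithNewDims(
            ov::is_scalar(tensor->get_shape()) ? VectorDims{1} : VectorDims{tensor->get_shape()});
    }

    if (graph_side_desc->isCompatible(*user_desc)) {
        external_ptr[name] = tensor;   // the graph can read/write the user buffer in place
    } else {
        external_ptr.erase(name);      // layouts differ; an internal copy will be used
    }
}
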