Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[compute/cker] Introduce compile-time definitions file #14244

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 53 additions & 0 deletions compute/cker/include/cker/ctdef.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#pragma once

#include <type_traits>

// Compile time definitions and related trait-like utilities

namespace nnfw
{
// This enum specifies all compile-time features that should be testable using the `is_defined` and
// `is_defined_v` templates below.
enum class Define
{
  PLATFORM_X86,
  PLATFORM_AARCH64,
  USE_NEON
};

// A compile-time feature-detection structure which defaults to false
// for all defines. This struct is specialized (below, under the matching
// preprocessor condition) for each Define enum value that is active.
//
// This struct should be used the following way:
//
// if constexpr (nnfw::is_defined<Define::PLATFORM_X86>::value) {}
template <Define> struct is_defined : std::false_type
{
};

// A helper variable template to be used the following way:
//
// if constexpr (nnfw::is_defined_v<Define::PLATFORM_X86>) {}
//
// `constexpr` on a variable already implies top-level const, so no extra `const` is needed.
template <Define def> inline constexpr bool is_defined_v = is_defined<def>::value;

/* ********************************************************************************************** */

// CKER_X86_PLATFORM is supplied by the project build system when targeting x86.
#if defined(CKER_X86_PLATFORM)
template <> struct is_defined<Define::PLATFORM_X86> : std::true_type
{
};
#endif

// __aarch64__ is predefined by GCC/Clang, _M_ARM64 by MSVC, on AArch64 targets.
#if defined(__aarch64__) || defined(_M_ARM64)
template <> struct is_defined<Define::PLATFORM_AARCH64> : std::true_type
{
};
#endif

// Compiler-predefined macros signalling Arm NEON (Advanced SIMD) availability.
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
template <> struct is_defined<Define::USE_NEON> : std::true_type
{
};
#endif

} // namespace nnfw
114 changes: 60 additions & 54 deletions compute/cker/include/cker/operation/Common.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,81 +20,87 @@

#include "cker/neon/neon_check.h"
#include "cker/Utils.h"
#include "cker/ctdef.h"

namespace nnfw
{
namespace cker
{

inline void BiasAndClamp(float clamp_min, float clamp_max, int bias_size, const float *bias_data,
int array_size, float *array_data)
template <typename T = float>
inline void BiasAndClamp(T clamp_min, T clamp_max, int bias_size, const T *bias_data,
int array_size, T *array_data)
{
// Note: see b/132215220: in May 2019 we thought it would be OK to replace
// this with the Eigen one-liner:
// return (array.colwise() + bias).cwiseMin(clamp_max).cwiseMin(clamp_max).
// This turned out to severely regress performance: +4ms (i.e. 8%) on
// MobileNet v2 / 1.0 / 224. So we keep custom NEON code for now.
assert((array_size % bias_size) == 0);
#ifdef USE_NEON
float *array_ptr = array_data;
float *array_end_ptr = array_ptr + array_size;
const auto clamp_min_vec = vdupq_n_f32(clamp_min);
const auto clamp_max_vec = vdupq_n_f32(clamp_max);
for (; array_ptr != array_end_ptr; array_ptr += bias_size)

if constexpr (is_defined_v<Define::USE_NEON>)
{
int i = 0;
for (; i <= bias_size - 16; i += 16)
{
auto b0 = vld1q_f32(bias_data + i);
auto b1 = vld1q_f32(bias_data + i + 4);
auto b2 = vld1q_f32(bias_data + i + 8);
auto b3 = vld1q_f32(bias_data + i + 12);
auto a0 = vld1q_f32(array_ptr + i);
auto a1 = vld1q_f32(array_ptr + i + 4);
auto a2 = vld1q_f32(array_ptr + i + 8);
auto a3 = vld1q_f32(array_ptr + i + 12);
auto x0 = vaddq_f32(a0, b0);
auto x1 = vaddq_f32(a1, b1);
auto x2 = vaddq_f32(a2, b2);
auto x3 = vaddq_f32(a3, b3);
x0 = vmaxq_f32(clamp_min_vec, x0);
x1 = vmaxq_f32(clamp_min_vec, x1);
x2 = vmaxq_f32(clamp_min_vec, x2);
x3 = vmaxq_f32(clamp_min_vec, x3);
x0 = vminq_f32(clamp_max_vec, x0);
x1 = vminq_f32(clamp_max_vec, x1);
x2 = vminq_f32(clamp_max_vec, x2);
x3 = vminq_f32(clamp_max_vec, x3);
vst1q_f32(array_ptr + i, x0);
vst1q_f32(array_ptr + i + 4, x1);
vst1q_f32(array_ptr + i + 8, x2);
vst1q_f32(array_ptr + i + 12, x3);
}
for (; i <= bias_size - 4; i += 4)
{
auto b = vld1q_f32(bias_data + i);
auto a = vld1q_f32(array_ptr + i);
auto x = vaddq_f32(a, b);
x = vmaxq_f32(clamp_min_vec, x);
x = vminq_f32(clamp_max_vec, x);
vst1q_f32(array_ptr + i, x);
}
for (; i < bias_size; i++)
T *array_ptr = array_data;
T *array_end_ptr = array_ptr + array_size;
const auto clamp_min_vec = vdupq_n_f32(clamp_min);
const auto clamp_max_vec = vdupq_n_f32(clamp_max);
for (; array_ptr != array_end_ptr; array_ptr += bias_size)
{
array_ptr[i] =
ActivationFunctionWithMinMax(array_ptr[i] + bias_data[i], clamp_min, clamp_max);
int i = 0;
for (; i <= bias_size - 16; i += 16)
{
auto b0 = vld1q_f32(bias_data + i);
auto b1 = vld1q_f32(bias_data + i + 4);
auto b2 = vld1q_f32(bias_data + i + 8);
auto b3 = vld1q_f32(bias_data + i + 12);
auto a0 = vld1q_f32(array_ptr + i);
auto a1 = vld1q_f32(array_ptr + i + 4);
auto a2 = vld1q_f32(array_ptr + i + 8);
auto a3 = vld1q_f32(array_ptr + i + 12);
auto x0 = vaddq_f32(a0, b0);
auto x1 = vaddq_f32(a1, b1);
auto x2 = vaddq_f32(a2, b2);
auto x3 = vaddq_f32(a3, b3);
x0 = vmaxq_f32(clamp_min_vec, x0);
x1 = vmaxq_f32(clamp_min_vec, x1);
x2 = vmaxq_f32(clamp_min_vec, x2);
x3 = vmaxq_f32(clamp_min_vec, x3);
x0 = vminq_f32(clamp_max_vec, x0);
x1 = vminq_f32(clamp_max_vec, x1);
x2 = vminq_f32(clamp_max_vec, x2);
x3 = vminq_f32(clamp_max_vec, x3);
vst1q_f32(array_ptr + i, x0);
vst1q_f32(array_ptr + i + 4, x1);
vst1q_f32(array_ptr + i + 8, x2);
vst1q_f32(array_ptr + i + 12, x3);
}
for (; i <= bias_size - 4; i += 4)
{
auto b = vld1q_f32(bias_data + i);
auto a = vld1q_f32(array_ptr + i);
auto x = vaddq_f32(a, b);
x = vmaxq_f32(clamp_min_vec, x);
x = vminq_f32(clamp_max_vec, x);
vst1q_f32(array_ptr + i, x);
}
for (; i < bias_size; i++)
{
array_ptr[i] =
ActivationFunctionWithMinMax(array_ptr[i] + bias_data[i], clamp_min, clamp_max);
}
}
}
#else // not NEON
for (int array_offset = 0; array_offset < array_size; array_offset += bias_size)
else
{
for (int i = 0; i < bias_size; i++)
for (int array_offset = 0; array_offset < array_size; array_offset += bias_size)
{
array_data[array_offset + i] = ActivationFunctionWithMinMax(
array_data[array_offset + i] + bias_data[i], clamp_min, clamp_max);
for (int i = 0; i < bias_size; i++)
{
array_data[array_offset + i] = ActivationFunctionWithMinMax(
array_data[array_offset + i] + bias_data[i], clamp_min, clamp_max);
}
}
}
#endif
}

} // namespace cker
Expand Down
Loading