Skip to content

Commit

Permalink
Arm: Speed up FLOAT2INT16 conversion with Neon
Browse files Browse the repository at this point in the history
Using Neon for float to int conversion, and introducing platform-
specific function for converting an array of float values to int16.
Also adding appropriate unit test.
  • Loading branch information
VSZS authored and agosdahu committed Dec 5, 2024
1 parent 7db2693 commit f8b4bc2
Show file tree
Hide file tree
Showing 9 changed files with 262 additions and 6 deletions.
18 changes: 16 additions & 2 deletions celt/arm/arm_celt_map.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/* Copyright (c) 2010 Xiph.Org Foundation
* Copyright (c) 2013 Parrot */
* Copyright (c) 2013 Parrot
* Copyright (c) 2024 Arm Limited */
/*
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
Expand Down Expand Up @@ -29,12 +30,25 @@
#include "config.h"
#endif

#include "pitch.h"
#include "kiss_fft.h"
#include "mathops.h"
#include "mdct.h"
#include "pitch.h"

#if defined(OPUS_HAVE_RTCD)

# if !defined(DISABLE_FLOAT_API)
# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)
void (*const CELT_FLOAT2INT16_IMPL[OPUS_ARCHMASK+1])(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt) = {
celt_float2int16_c, /* ARMv4 */
celt_float2int16_c, /* EDSP */
celt_float2int16_c, /* Media */
celt_float2int16_neon,/* NEON */
celt_float2int16_neon /* DOTPROD */
};
# endif
# endif

# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)
opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, const opus_val16 *y, int N) = {
celt_inner_prod_c, /* ARMv4 */
Expand Down
39 changes: 39 additions & 0 deletions celt/arm/celt_neon_intr.c
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
/* Copyright (c) 2014-2015 Xiph.Org Foundation
Copyright (c) 2024 Arm Limited
Written by Viswanath Puttagunta */
/**
@file celt_neon_intr.c
Expand Down Expand Up @@ -35,8 +36,46 @@
#endif

#include <arm_neon.h>
#include "../float_cast.h"
#include "../mathops.h"
#include "../pitch.h"

#if !defined(DISABLE_FLOAT_API) && defined(OPUS_ARM_MAY_HAVE_NEON_INTR)

void celt_float2int16_neon(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt)
{
int i = 0;

#if defined(__ARM_NEON)
const int BLOCK_SIZE = 16;
const int blockedSize = cnt / BLOCK_SIZE * BLOCK_SIZE;

for (; i < blockedSize; i += BLOCK_SIZE)
{
float32x4_t orig_a = vld1q_f32(&in[i + 0]);
float32x4_t orig_b = vld1q_f32(&in[i + 4]);
float32x4_t orig_c = vld1q_f32(&in[i + 8]);
float32x4_t orig_d = vld1q_f32(&in[i + 12]);

int16x4_t asShort_a = vqmovn_s32(vroundf(vmulq_n_f32(orig_a, CELT_SIG_SCALE)));
int16x4_t asShort_b = vqmovn_s32(vroundf(vmulq_n_f32(orig_b, CELT_SIG_SCALE)));
int16x4_t asShort_c = vqmovn_s32(vroundf(vmulq_n_f32(orig_c, CELT_SIG_SCALE)));
int16x4_t asShort_d = vqmovn_s32(vroundf(vmulq_n_f32(orig_d, CELT_SIG_SCALE)));

vst1_s16(&out[i + 0], asShort_a);
vst1_s16(&out[i + 4], asShort_b);
vst1_s16(&out[i + 8], asShort_c);
vst1_s16(&out[i + 12], asShort_d);
}
#endif

for (; i < cnt; i++)
{
out[i] = FLOAT2INT16(in[i]);
}
}
#endif

#if defined(FIXED_POINT)
#include <string.h>

Expand Down
65 changes: 65 additions & 0 deletions celt/arm/mathops_arm.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
/* Copyright (c) 2024 Arm Limited */
/*
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#if !defined(MATHOPS_ARM_H)
# define MATHOPS_ARM_H

#include "armcpu.h"
#include "cpu_support.h"
#include "opus_defines.h"

# if !defined(DISABLE_FLOAT_API) && defined(OPUS_ARM_MAY_HAVE_NEON_INTR)

#include <arm_neon.h>

static inline int32x4_t vroundf(float32x4_t x)
{
# if defined(__aarch64__) || (defined(__ARM_ARCH) && __ARM_ARCH >= 8)
return vcvtaq_s32_f32(x);
# else
uint32x4_t sign = vandq_u32(vreinterpretq_u32_f32(x), vdupq_n_u32(0x80000000));
uint32x4_t bias = vdupq_n_u32(0x3F000000);
return vcvtq_s32_f32(vaddq_f32(x, vreinterpretq_f32_u32(vorrq_u32(bias, sign))));
# endif
}

void celt_float2int16_neon(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt);
# if defined(OPUS_HAVE_RTCD) && \
(defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR))
extern void
(*const CELT_FLOAT2INT16_IMPL[OPUS_ARCHMASK+1])(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt);

# define OVERRIDE_FLOAT2INT16 (1)
# define celt_float2int16(in, out, cnt, arch) \
((*CELT_FLOAT2INT16_IMPL[(arch)&OPUS_ARCHMASK])(in, out, cnt))

# elif defined(OPUS_ARM_PRESUME_NEON_INTR)
# define OVERRIDE_FLOAT2INT16 (1)
# define celt_float2int16(in, out, cnt, arch) ((void)(arch), celt_float2int16_neon(in, out, cnt))
# endif
# endif

#endif /* MATHOPS_ARM_H */
7 changes: 7 additions & 0 deletions celt/float_cast.h
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,13 @@ static OPUS_INLINE opus_int32 float2int(float x) {return _mm_cvt_ss2si(_mm_set_s

return intgr ;
}
#elif defined(__aarch64__)

#include <arm_neon.h>
static OPUS_INLINE opus_int32 float2int(float flt)
{
return vcvtns_s32_f32(flt);
}

#elif defined(HAVE_LRINTF) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L

Expand Down
15 changes: 15 additions & 0 deletions celt/mathops.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
/* Copyright (c) 2002-2008 Jean-Marc Valin
Copyright (c) 2007-2008 CSIRO
Copyright (c) 2007-2009 Xiph.Org Foundation
Copyright (c) 2024 Arm Limited
Written by Jean-Marc Valin */
/**
@file mathops.h
Expand Down Expand Up @@ -35,6 +36,7 @@
#include "config.h"
#endif

#include "float_cast.h"
#include "mathops.h"

/*Compute floor(sqrt(_val)) with exact arithmetic.
Expand Down Expand Up @@ -207,3 +209,16 @@ opus_val32 celt_rcp(opus_val32 x)
}

#endif

#ifndef DISABLE_FLOAT_API

void celt_float2int16_c(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt)
{
int i;
for (i = 0; i < cnt; i++)
{
out[i] = FLOAT2INT16(in[i]);
}
}

#endif /* DISABLE_FLOAT_API */
16 changes: 16 additions & 0 deletions celt/mathops.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
/* Copyright (c) 2002-2008 Jean-Marc Valin
Copyright (c) 2007-2008 CSIRO
Copyright (c) 2007-2009 Xiph.Org Foundation
Copyright (c) 2024 Arm Limited
Written by Jean-Marc Valin */
/**
@file mathops.h
Expand Down Expand Up @@ -38,6 +39,10 @@
#include "entcode.h"
#include "os_support.h"

#if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
#include "arm/mathops_arm.h"
#endif

#define PI 3.141592653f

/* Multiplies two 16-bit fractional values. Bit-exactness of this macro is important */
Expand Down Expand Up @@ -293,4 +298,15 @@ static OPUS_INLINE opus_val16 celt_atan2p(opus_val16 y, opus_val16 x)
}

#endif /* FIXED_POINT */

#ifndef DISABLE_FLOAT_API

void celt_float2int16_c(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt);

#ifndef OVERRIDE_FLOAT2INT16
#define celt_float2int16(in, out, cnt, arch) ((void)(arch), celt_float2int16_c(in, out, cnt))
#endif

#endif /* DISABLE_FLOAT_API */

#endif /* MATHOPS_H */
100 changes: 99 additions & 1 deletion celt/tests/test_unit_mathops.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/* Copyright (c) 2008-2011 Xiph.Org Foundation, Mozilla Corporation,
Gregory Maxwell
Copyright (c) 2024 Arm Limited
Written by Jean-Marc Valin, Gregory Maxwell, and Timothy B. Terriberry */
/*
Redistribution and use in source and binary forms, with or without
Expand Down Expand Up @@ -36,8 +37,10 @@

#include <stdio.h>
#include <math.h>
#include "mathops.h"
#include "bands.h"
#include "cpu_support.h"
#include "float_cast.h"
#include "mathops.h"

#ifdef FIXED_POINT
#define WORD "%d"
Expand Down Expand Up @@ -250,8 +253,94 @@ void testilog2(void)
}
#endif


#ifndef DISABLE_FLOAT_API

void testcelt_float2int16(int use_ref_impl, int buffer_size)
{

#define MAX_BUFFER_SIZE 2080
int i, cnt;
float floatsToConvert[MAX_BUFFER_SIZE];
short results[MAX_BUFFER_SIZE] = { 0 };
float scaleInt16RangeTo01;

celt_assert(buffer_size <= MAX_BUFFER_SIZE);

scaleInt16RangeTo01 = 1.f / 32768.f;
cnt = 0;

while (cnt + 15 < buffer_size && cnt < buffer_size / 2)
{
floatsToConvert[cnt++] = 77777.0f * scaleInt16RangeTo01;
floatsToConvert[cnt++] = 33000.0f * scaleInt16RangeTo01;
floatsToConvert[cnt++] = 32768.0f * scaleInt16RangeTo01;
floatsToConvert[cnt++] = 32767.4f * scaleInt16RangeTo01;
floatsToConvert[cnt++] = 32766.6f * scaleInt16RangeTo01;
floatsToConvert[cnt++] = .501 * scaleInt16RangeTo01;
floatsToConvert[cnt++] = .499f * scaleInt16RangeTo01;
floatsToConvert[cnt++] = .0f;
floatsToConvert[cnt++] = -.499f * scaleInt16RangeTo01;
floatsToConvert[cnt++] = -.501f * scaleInt16RangeTo01;
floatsToConvert[cnt++] = -32767.6f * scaleInt16RangeTo01;
floatsToConvert[cnt++] = -32768.4f * scaleInt16RangeTo01;
floatsToConvert[cnt++] = -32769.0f * scaleInt16RangeTo01;
floatsToConvert[cnt++] = -33000.0f * scaleInt16RangeTo01;
floatsToConvert[cnt++] = -77777.0f * scaleInt16RangeTo01;

celt_assert(cnt < buffer_size);
}

while (cnt < buffer_size)
{
float inInt16Range = cnt * 7 + .5;
inInt16Range += (cnt & 0x01) ? .1 : -.1;
inInt16Range *= (cnt & 0x02) ? 1 : -1;
floatsToConvert[cnt++] = inInt16Range * scaleInt16RangeTo01;
}

for (i = 0; i < MAX_BUFFER_SIZE; ++i)
{
results[i] = 42;
}

if (use_ref_impl)
{
celt_float2int16_c(floatsToConvert, results, cnt);
} else {
celt_float2int16(floatsToConvert, results, cnt, opus_select_arch());
}

for (i = 0; i < cnt; ++i)
{
const float expected = FLOAT2INT16(floatsToConvert[i]);
if (results[i] != expected)
{
fprintf (stderr, "testcelt_float2int16 failed: celt_float2int16 converted %f (index: %d) to %d (x*32768=%f, expected: %d, cnt: %d, ref: %d)\n",
floatsToConvert[i], i, (int)results[i], floatsToConvert[i] * 32768.0f, (int)expected, buffer_size, use_ref_impl);
ret = 1;
}
}

for (i = cnt; i < MAX_BUFFER_SIZE; ++i)
{
if (results[i] != 42)
{
fprintf (stderr, "testcelt_float2int16 failed: buffer overflow (cnt: %d, ref: %d)\n", buffer_size, use_ref_impl);
ret = 1;
break;
}
}
#undef MAX_BUFFER_SIZE
}

#endif

int main(void)
{
int i;
int use_ref_impl[2] = { 0, 1 };

testbitexactcos();
testbitexactlog2tan();
testdiv();
Expand All @@ -261,6 +350,15 @@ int main(void)
testexp2log2();
#ifdef FIXED_POINT
testilog2();
#endif
#ifndef DISABLE_FLOAT_API
for (i = 0; i <= 1; ++i)
{
testcelt_float2int16(use_ref_impl[i], 1);
testcelt_float2int16(use_ref_impl[i], 32);
testcelt_float2int16(use_ref_impl[i], 127);
testcelt_float2int16(use_ref_impl[i], 1031);
}
#endif
return ret;
}
1 change: 1 addition & 0 deletions celt_headers.mk
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ celt/arm/fixed_armv5e.h \
celt/arm/fixed_arm64.h \
celt/arm/kiss_fft_armv4.h \
celt/arm/kiss_fft_armv5e.h \
celt/arm/mathops_arm.h \
celt/arm/pitch_arm.h \
celt/arm/fft_arm.h \
celt/arm/mdct_arm.h \
Expand Down
Loading

0 comments on commit f8b4bc2

Please sign in to comment.