mirror of
https://gitlab.freedesktop.org/pulseaudio/pulseaudio.git
synced 2025-11-04 13:29:59 -05:00
v7: * cleanups and reduce code; add 4->4 channels mappings, add rearrange code v6: * rename mono_to_stereo_float_neon_a9() to mono_to_stereo_float_arm_generic(); note that Cortex-A8 and -A9/A15 are different, later chips do not benefit from NEON memory transfers v5: * 4-channel remapping * use vrhadd instruction, fix int16 overflow for to-mono case v4: * fix for sample length < 4 v3: * fix test code: init float and int map_table * different code path for Cortex-A8 and later (-A9, A15, unknown) * convert from intrinsics to inline assembly v2: * add ARM NEON stereo-to-mono remapping code * static __attribute__ ((noinline)) is necessary to prevent inlining and work around gcc 4.6 ICE, see https://bugs.launchpad.net/bugs/936863 * call test code, the reference implementation is obtained using pa_get_init_remap_func() * remove check for NEON flags v1: * ARM NEON mono-to-stereo remapping code note that orig is the time of the special-case C implementation where available, not the generic matric remapping implementation on ARM Cortex-A8 (TI OMAP3 DM3730 @ 1GHz) (Linaro GCC 4.6): Checking NEON remap (float, mono->stereo) func: 757474 usec (avg: 7574.74, min = 6165, max = 11963, stddev = 1479.71). orig: 784882 usec (avg: 7848.82, min = 6835, max = 17639, stddev = 1656.01). Checking NEON remap (float, mono->4-channel) func: 1545507 usec (avg: 15455.1, min = 6531, max = 30609, stddev = 2689.6). orig: 2601413 usec (avg: 26014.1, min = 22796, max = 52979, stddev = 3281.84). Checking NEON remap (s16, mono->stereo) func: 343844 usec (avg: 3438.44, min = 1709, max = 8880, stddev = 1180.1). orig: 474460 usec (avg: 4744.6, min = 4212, max = 7751, stddev = 1069.29). Checking NEON remap (s16, mono->4-channel) func: 736574 usec (avg: 7365.74, min = 3784, max = 11902, stddev = 1637.79). orig: 1062772 usec (avg: 10627.7, min = 7630, max = 17517, stddev = 3011.44). Checking NEON remap (float, stereo->mono) func: 571412 usec (avg: 5714.12, min = 4608, max = 15808, stddev = 2131.7). 
orig: 4356630 usec (avg: 43566.3, min = 41596, max = 52430, stddev = 2056.79). Checking NEON remap (float, 4-channel->mono) func: 1443202 usec (avg: 14432, min = 12298, max = 32349, stddev = 3300). orig: 9273410 usec (avg: 92734.1, min = 81940, max = 184265, stddev = 23310). Checking NEON remap (s16, stereo->mono) func: 185761 usec (avg: 1857.61, min = 1556, max = 4975, stddev = 743.681). orig: 1204776 usec (avg: 12047.8, min = 10711, max = 16022, stddev = 1596.88). Checking NEON remap (s16, 4-channel->mono) func: 482912 usec (avg: 4829.12, min = 4241, max = 9980, stddev = 1270.8). orig: 1692050 usec (avg: 16920.5, min = 14679, max = 30060, stddev = 2760.7). Checking NEON remap (float, 4-channel->4-channel) func: 5324471 usec (avg: 53244.7, min = 49774, max = 87036, stddev = 4255.47). orig: 73674628 usec (avg: 736746, min = 720338, max = 824128, stddev = 18361.8). Checking NEON remap (s16, 4-channel->4-channel) func: 5321320 usec (avg: 53213.2, min = 49591, max = 84443, stddev = 3931.49). orig: 24122021 usec (avg: 241220, min = 233337, max = 291687, stddev = 9064.31). Checking NEON remap (float, stereo rearrange) func: 1116547 usec (avg: 11165.5, min = 9124, max = 27496, stddev = 3345.63). orig: 1385011 usec (avg: 13850.1, min = 12237, max = 18005, stddev = 1793.05). Checking NEON remap (s16, stereo rearrange) func: 517027 usec (avg: 5170.27, min = 4577, max = 9735, stddev = 1215.23). orig: 1208435 usec (avg: 12084.4, min = 10406, max = 25299, stddev = 2512.02). Checking NEON remap (float, 4-channel rearrange) func: 1564667 usec (avg: 15646.7, min = 13855, max = 20172, stddev = 1766.48). orig: 2970000 usec (avg: 29700, min = 26215, max = 45654, stddev = 2351.07). Checking NEON remap (s16, 4-channel rearrange) func: 1088808 usec (avg: 10888.1, min = 9064, max = 23407, stddev = 2465.82). orig: 1908416 usec (avg: 19084.2, min = 16968, max = 22705, stddev = 1637.46). Signed-off-by: Peter Meerwald <pmeerw@pmeerw.net>
498 lines
19 KiB
C
498 lines
19 KiB
C
/***
|
|
This file is part of PulseAudio.
|
|
|
|
Copyright 2013 Peter Meerwald <p.meerwald@bct-electronic.com>
|
|
|
|
PulseAudio is free software; you can redistribute it and/or modify
|
|
it under the terms of the GNU Lesser General Public License as published
|
|
by the Free Software Foundation; either version 2.1 of the License,
|
|
or (at your option) any later version.
|
|
|
|
PulseAudio is distributed in the hope that it will be useful, but
|
|
WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
General Public License for more details.
|
|
***/
|
|
|
|
#ifdef HAVE_CONFIG_H
|
|
#include <config.h>
|
|
#endif
|
|
|
|
#include <pulse/sample.h>
|
|
#include <pulse/xmalloc.h>
|
|
#include <pulsecore/log.h>
|
|
#include <pulsecore/macro.h>
|
|
|
|
#include "cpu-arm.h"
|
|
#include "remap.h"
|
|
|
|
#include <arm_neon.h>
|
|
|
|
/* Mono -> stereo upmix for native-endian float samples, NEON flavour that
 * init_remap_neon() picks when the Cortex-A8 CPU flag is set.
 *
 * dst receives 2 * n floats, src supplies n floats; m is not used. */
static void remap_mono_to_stereo_float32ne_neon_a8(pa_remap_t *m, float *dst, const float *src, unsigned n) {
    /* Vector part: four input samples per pass. q0 is copied to q1 and the
     * pair is written with an interleaving store, so every sample lands in
     * two adjacent output slots. Both pointers are post-incremented by the
     * assembly itself. */
    while (n >= 4) {
        __asm__ __volatile__ (
            "vld1.32 {q0}, [%[src]]! \n\t"
            "vmov q1, q0 \n\t"
            "vst2.32 {q0,q1}, [%[dst]]! \n\t"
            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
            : /* input operands */
            : "memory", "q0", "q1" /* clobber list */
        );
        n -= 4;
    }

    /* Scalar tail for the remaining 0..3 samples. */
    while (n > 0) {
        const float sample = *src++;

        *dst++ = sample;
        *dst++ = sample;
        n--;
    }
}
|
|
|
|
/* Mono -> stereo upmix for native-endian float samples using only core ARM
 * load/store-multiple instructions (no NEON registers are involved).
 *
 * dst receives 2 * n floats, src supplies n floats; m is not used. */
static void remap_mono_to_stereo_float32ne_generic_arm(pa_remap_t *m, float *dst, const float *src, unsigned n) {
    /* Two input samples per pass: they are loaded into r4 and r6, mirrored
     * into r5 and r7, and r4-r7 are stored as four consecutive words. */
    while (n >= 2) {
        __asm__ __volatile__ (
            "ldm %[src]!, {r4,r6} \n\t"
            "mov r5, r4 \n\t"
            "mov r7, r6 \n\t"
            "stm %[dst]!, {r4-r7} \n\t"
            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
            : /* input operands */
            : "memory", "r4", "r5", "r6", "r7" /* clobber list */
        );
        n -= 2;
    }

    /* At most one sample can be left over. */
    if (n != 0) {
        const float sample = *src;

        dst[1] = sample;
        dst[0] = sample;
    }
}
|
|
|
|
/* Mono -> stereo upmix for native-endian signed 16-bit samples.
 *
 * dst receives 2 * n samples, src supplies n samples; m is not used. */
static void remap_mono_to_stereo_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
    /* Vector part: eight input samples per pass, duplicated by storing q0
     * and its copy q1 with an interleaving store. */
    while (n >= 8) {
        __asm__ __volatile__ (
            "vld1.16 {q0}, [%[src]]! \n\t"
            "vmov q1, q0 \n\t"
            "vst2.16 {q0,q1}, [%[dst]]! \n\t"
            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
            : /* input operands */
            : "memory", "q0", "q1" /* clobber list */
        );
        n -= 8;
    }

    /* Scalar tail for the remaining 0..7 samples. */
    while (n > 0) {
        const int16_t sample = *src++;

        *dst++ = sample;
        *dst++ = sample;
        n--;
    }
}
|
|
|
|
/* Mono -> 4-channel upmix for native-endian float samples.
 *
 * dst receives 4 * n floats, src supplies n floats; m is not used. */
static void remap_mono_to_ch4_float32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) {
    /* Two input samples per pass: each one is broadcast across a whole
     * quad register and both quads are stored back to back. */
    while (n >= 2) {
        __asm__ __volatile__ (
            "vld1.32 {d0}, [%[src]]! \n\t"
            "vdup.f32 q1, d0[0] \n\t"
            "vdup.f32 q2, d0[1] \n\t"
            "vst1.32 {q1,q2}, [%[dst]]! \n\t"
            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
            : /* input operands */
            : "memory", "q0", "q1", "q2" /* clobber list */
        );
        n -= 2;
    }

    /* At most one sample can be left over. */
    if (n != 0) {
        const float sample = *src;
        unsigned ch;

        for (ch = 0; ch < 4; ch++)
            dst[ch] = sample;
    }
}
|
|
|
|
/* Mono -> 4-channel upmix for native-endian signed 16-bit samples.
 *
 * dst receives 4 * n samples, src supplies n samples; m is not used. */
static void remap_mono_to_ch4_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
    /* Four input samples per pass. Lane 0 is broadcast last because d0 is
     * both the source of all lanes and the destination of that broadcast. */
    while (n >= 4) {
        __asm__ __volatile__ (
            "vld1.16 {d0}, [%[src]]! \n\t"
            "vdup.s16 d1, d0[1] \n\t"
            "vdup.s16 d2, d0[2] \n\t"
            "vdup.s16 d3, d0[3] \n\t"
            "vdup.s16 d0, d0[0] \n\t"
            "vst1.16 {d0,d1,d2,d3}, [%[dst]]!\n\t"
            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
            : /* input operands */
            : "memory", "d0", "d1", "d2", "d3" /* clobber list */
        );
        n -= 4;
    }

    /* Scalar tail for the remaining 0..3 samples. */
    while (n > 0) {
        const int16_t sample = *src++;

        *dst++ = sample;
        *dst++ = sample;
        *dst++ = sample;
        *dst++ = sample;
        n--;
    }
}
|
|
|
|
/* Stereo -> mono downmix for native-endian float samples: every output
 * sample is (left + right) * 0.5.
 *
 * dst receives n floats, src supplies 2 * n floats; m is not used. */
static void remap_stereo_to_mono_float32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) {
    const float32x4_t half_vec = vdupq_n_f32(0.5f);

    /* Vector part: four frames per pass. The de-interleaving load puts the
     * two channels into q0 and q1, which are summed and scaled. */
    while (n >= 4) {
        __asm__ __volatile__ (
            "vld2.32 {q0,q1}, [%[src]]! \n\t"
            "vadd.f32 q0, q0, q1 \n\t"
            "vmul.f32 q0, q0, %q[halve] \n\t"
            "vst1.32 {q0}, [%[dst]]! \n\t"
            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
            : [halve] "w" (half_vec) /* input operands */
            : "memory", "q0", "q1" /* clobber list */
        );
        n -= 4;
    }

    /* Scalar tail for the remaining 0..3 frames. */
    while (n > 0) {
        *dst++ = (src[0] + src[1])*0.5f;
        src += 2;
        n--;
    }
}
|
|
|
|
/* Stereo -> mono downmix for native-endian signed 16-bit samples.
 *
 * dst receives n samples, src supplies 2 * n samples; m is not used.
 *
 * Note: the vector path averages with vrhadd (a rounding halving add),
 * while the scalar tail uses plain integer division by two, so the two
 * paths can differ in the least significant bit. */
static void remap_stereo_to_mono_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
    /* Vector part: eight frames per pass, channels split by vld2. */
    while (n >= 8) {
        __asm__ __volatile__ (
            "vld2.16 {q0,q1}, [%[src]]! \n\t"
            "vrhadd.s16 q0, q0, q1 \n\t"
            "vst1.16 {q0}, [%[dst]]! \n\t"
            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
            : /* input operands */
            : "memory", "q0", "q1" /* clobber list */
        );
        n -= 8;
    }

    /* Scalar tail for the remaining 0..7 frames; the sum is formed in int,
     * so it cannot overflow before the division. */
    while (n > 0) {
        *dst++ = (src[0] + src[1])/2;
        src += 2;
        n--;
    }
}
|
|
|
|
/* 4-channel -> mono downmix for native-endian float samples: every output
 * sample is the sum of the four channels of a frame multiplied by 0.25.
 *
 * dst receives n floats, src supplies 4 * n floats; m is not used. */
static void remap_ch4_to_mono_float32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) {
    const float32x2_t quart = vdup_n_f32(0.25f);

    /* Vector part: two frames per pass. vld4 de-interleaves so that d0..d3
     * each hold one channel of both frames.
     *
     * quart is a 64-bit vector, so it is printed with the P operand
     * modifier: that forces a D-register name into the template. Without
     * the modifier the register may be emitted under its single-precision
     * name, which is not a valid operand for this vmul. */
    for (; n >= 2; n -= 2) {
        __asm__ __volatile__ (
            "vld4.32 {d0,d1,d2,d3}, [%[src]]!\n\t"
            "vadd.f32 d0, d0, d1 \n\t"
            "vadd.f32 d2, d2, d3 \n\t"
            "vadd.f32 d0, d0, d2 \n\t"
            "vmul.f32 d0, d0, %P[quart] \n\t"
            "vst1.32 {d0}, [%[dst]]! \n\t"
            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
            : [quart] "w" (quart) /* input operands */
            : "memory", "d0", "d1", "d2", "d3" /* clobber list */
        );
    }

    /* At most one frame can be left over. */
    if (n > 0)
        dst[0] = (src[0] + src[1] + src[2] + src[3])*0.25f;
}
|
|
|
|
/* 4-channel -> mono downmix for native-endian signed 16-bit samples.
 *
 * dst receives n samples, src supplies 4 * n samples; m is not used.
 *
 * Note: the vector path averages pairwise with vrhadd (rounding halving
 * add, applied in two stages), while the scalar tail divides the integer
 * sum by four, so the two paths can differ slightly in rounding. */
static void remap_ch4_to_mono_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
    /* Vector part: four frames per pass, one channel per D register. */
    while (n >= 4) {
        __asm__ __volatile__ (
            "vld4.16 {d0,d1,d2,d3}, [%[src]]!\n\t"
            "vrhadd.s16 d0, d0, d1 \n\t"
            "vrhadd.s16 d2, d2, d3 \n\t"
            "vrhadd.s16 d0, d0, d2 \n\t"
            "vst1.16 {d0}, [%[dst]]! \n\t"
            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
            : /* input operands */
            : "memory", "d0", "d1", "d2", "d3" /* clobber list */
        );
        n -= 4;
    }

    /* Scalar tail for the remaining 0..3 frames; the sum is formed in int. */
    while (n > 0) {
        *dst++ = (src[0] + src[1] + src[2] + src[3])/4;
        src += 4;
        n--;
    }
}
|
|
|
|
/* General 4-channel -> 4-channel matrix remap for native-endian signed
 * 16-bit samples.
 *
 * m->state must point to four int32x4_t coefficient vectors as prepared by
 * init_remap_neon(): vector i holds the weights applied to input channel i,
 * one lane per output channel.
 *
 * dst receives 4 * n samples, src supplies 4 * n samples. */
static void remap_ch4_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
    const int32x4_t *coef = m->state;
    const int32x4_t f0 = coef[0];
    const int32x4_t f1 = coef[1];
    const int32x4_t f2 = coef[2];
    const int32x4_t f3 = coef[3];

    /* One frame per pass: widen the four samples to 32 bit, broadcast each
     * input channel and multiply-accumulate it with its coefficient vector,
     * then narrow with a saturating right shift by 16 and store the frame. */
    while (n-- > 0) {
        __asm__ __volatile__ (
            "vld1.16 {d0}, [%[src]]! \n\t"
            "vmovl.s16 q0, d0 \n\t"
            "vdup.s32 q1, d0[0] \n\t"
            "vmul.s32 q1, q1, %q[f0] \n\t"
            "vdup.s32 q2, d0[1] \n\t"
            "vmla.s32 q1, q2, %q[f1] \n\t"
            "vdup.s32 q2, d1[0] \n\t"
            "vmla.s32 q1, q2, %q[f2] \n\t"
            "vdup.s32 q2, d1[1] \n\t"
            "vmla.s32 q1, q2, %q[f3] \n\t"
            "vqshrn.s32 d2, q1, #16 \n\t"
            "vst1.32 {d2}, [%[dst]]! \n\t"
            : [dst] "+r" (dst), [src] "+r" (src)
            : [f0] "w" (f0), [f1] "w" (f1), [f2] "w" (f2), [f3] "w" (f3)
            : "memory", "q0", "q1", "q2"
        );
    }
}
|
|
|
|
/* General 4-channel -> 4-channel matrix remap for native-endian float
 * samples.
 *
 * m->state must point to four float32x4_t coefficient vectors as prepared
 * by init_remap_neon(): vector i holds the weights applied to input
 * channel i, one lane per output channel.
 *
 * dst receives 4 * n floats, src supplies 4 * n floats. */
static void remap_ch4_float32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) {
    const float32x4_t *coef = m->state;
    const float32x4_t f0 = coef[0];
    const float32x4_t f1 = coef[1];
    const float32x4_t f2 = coef[2];
    const float32x4_t f3 = coef[3];

    /* One frame per pass: broadcast each input channel and
     * multiply-accumulate it with its coefficient vector into q1. */
    while (n-- > 0) {
        __asm__ __volatile__ (
            "vld1.32 {d0,d1}, [%[src]]! \n\t"
            "vdup.f32 q1, d0[0] \n\t"
            "vmul.f32 q1, q1, %q[f0] \n\t"
            "vdup.f32 q2, d0[1] \n\t"
            "vmla.f32 q1, q2, %q[f1] \n\t"
            "vdup.f32 q2, d1[0] \n\t"
            "vmla.f32 q1, q2, %q[f2] \n\t"
            "vdup.f32 q2, d1[1] \n\t"
            "vmla.f32 q1, q2, %q[f3] \n\t"
            "vst1.32 {d2,d3}, [%[dst]]! \n\t"
            : [dst] "+r" (dst), [src] "+r" (src)
            : [f0] "w" (f0), [f1] "w" (f1), [f2] "w" (f2), [f3] "w" (f3)
            : "memory", "q0", "q1", "q2"
        );
    }
}
|
|
|
|
/* Stereo -> stereo channel rearrangement (swap, duplicate or mute) for
 * native-endian signed 16-bit samples, done as a byte table lookup.
 *
 * m->state must point to one uint8x8_t vtbl index table as prepared by
 * init_remap_neon(); it covers two consecutive stereo frames. Index bytes
 * of 0xff are out of range for the lookup and therefore yield zero.
 *
 * dst receives 2 * n samples, src supplies 2 * n samples. */
static void remap_arrange_stereo_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
    const uint8x8_t t = ((uint8x8_t *) m->state)[0];

    /* The table is a 64-bit vector, so it is printed with the P operand
     * modifier to force a D-register name into the template; vtbl does not
     * accept the single-precision register name that may be emitted
     * otherwise. */

    /* Two frames (8 bytes) per pass. */
    for (; n >= 2; n -= 2) {
        __asm__ __volatile__ (
            "vld1.s16 d0, [%[src]]! \n\t"
            "vtbl.8 d0, {d0}, %P[t] \n\t"
            "vst1.s16 d0, [%[dst]]! \n\t"
            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
            : [t] "w" (t) /* input operands */
            : "memory", "d0" /* clobber list */
        );
    }

    /* One trailing frame: only the low 32-bit lane is loaded and stored,
     * and the first half of the table refers to those four bytes only. */
    if (n > 0) {
        __asm__ __volatile__ (
            "vld1.32 d0[0], [%[src]]! \n\t"
            "vtbl.8 d0, {d0}, %P[t] \n\t"
            "vst1.32 d0[0], [%[dst]]! \n\t"
            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
            : [t] "w" (t) /* input operands */
            : "memory", "d0" /* clobber list */
        );
    }
}
|
|
|
|
/* 2-channel -> 4-channel rearrangement for native-endian signed 16-bit
 * samples, done as a byte table lookup.
 *
 * m->state must point to one uint8x8_t vtbl index table as prepared by
 * init_remap_neon(). Index bytes of 0xff are out of range for the lookup
 * and therefore yield zero.
 *
 * dst receives 4 * n samples, src supplies 2 * n samples. */
static void remap_arrange_ch2_ch4_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
    const uint8x8_t t = ((uint8x8_t *) m->state)[0];

    /* One frame per pass: 4 input bytes go into the low lane of d0 and the
     * lookup expands them to 8 output bytes.
     *
     * The table is a 64-bit vector, so it is printed with the P operand
     * modifier to force a D-register name into the template. */
    for (; n > 0; n--) {
        __asm__ __volatile__ (
            "vld1.32 d0[0], [%[src]]! \n\t"
            "vtbl.8 d0, {d0}, %P[t] \n\t"
            "vst1.s16 d0, [%[dst]]! \n\t"
            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
            : [t] "w" (t) /* input operands */
            : "memory", "d0" /* clobber list */
        );
    }
}
|
|
|
|
/* 4-channel -> 4-channel rearrangement for native-endian signed 16-bit
 * samples, done as a byte table lookup.
 *
 * m->state must point to one uint8x8_t vtbl index table as prepared by
 * init_remap_neon(). Index bytes of 0xff are out of range for the lookup
 * and therefore yield zero.
 *
 * dst receives 4 * n samples, src supplies 4 * n samples. */
static void remap_arrange_ch4_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
    const uint8x8_t t = ((uint8x8_t *) m->state)[0];

    /* One frame (8 bytes) per pass.
     *
     * The table is a 64-bit vector, so it is printed with the P operand
     * modifier to force a D-register name into the template. */
    for (; n > 0; n--) {
        __asm__ __volatile__ (
            "vld1.s16 d0, [%[src]]! \n\t"
            "vtbl.8 d0, {d0}, %P[t] \n\t"
            "vst1.s16 d0, [%[dst]]! \n\t"
            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
            : [t] "w" (t) /* input operands */
            : "memory", "d0" /* clobber list */
        );
    }
}
|
|
|
|
/* Stereo -> stereo channel rearrangement (swap, duplicate or mute) for
 * native-endian float samples, done as a byte table lookup.
 *
 * m->state must point to a uint8x8_t vtbl index table as prepared by
 * init_remap_neon(); only the first table is used here. Index bytes of
 * 0xff are out of range for the lookup and therefore yield zero.
 *
 * dst receives 2 * n floats, src supplies 2 * n floats. */
static void remap_arrange_stereo_float32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) {
    const uint8x8_t t = ((uint8x8_t *)m->state)[0];

    /* One frame (8 bytes) per pass.
     *
     * - The table is a 64-bit vector, so it is printed with the P operand
     *   modifier to force a D-register name into the template.
     * - The store uses the same 32-bit element size as the load, so the
     *   data round-trips unchanged regardless of memory endianness. */
    for (; n > 0; n--) {
        __asm__ __volatile__ (
            "vld1.f32 d0, [%[src]]! \n\t"
            "vtbl.8 d0, {d0}, %P[t] \n\t"
            "vst1.32 {d0}, [%[dst]]! \n\t"
            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
            : [t] "w" (t) /* input operands */
            : "memory", "d0" /* clobber list */
        );
    }
}
|
|
|
|
/* 2-channel -> 4-channel rearrangement for native-endian float samples,
 * done as two byte table lookups.
 *
 * m->state must point to two uint8x8_t vtbl index tables as prepared by
 * init_remap_neon(): the first produces output channels 0 and 1, the
 * second output channels 2 and 3. Index bytes of 0xff are out of range for
 * the lookup and therefore yield zero.
 *
 * dst receives 4 * n floats, src supplies 2 * n floats. */
static void remap_arrange_ch2_ch4_float32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) {
    const uint8x8_t t0 = ((uint8x8_t *)m->state)[0];
    const uint8x8_t t1 = ((uint8x8_t *)m->state)[1];

    /* One frame per pass: 8 input bytes are expanded to 16 output bytes.
     *
     * - The tables are 64-bit vectors, so they are printed with the P
     *   operand modifier to force D-register names into the template.
     * - The store uses the same 32-bit element size as the load, so the
     *   data round-trips unchanged regardless of memory endianness. */
    for (; n > 0; n--) {
        __asm__ __volatile__ (
            "vld1.f32 d0, [%[src]]! \n\t"
            "vtbl.8 d1, {d0}, %P[t0] \n\t"
            "vtbl.8 d2, {d0}, %P[t1] \n\t"
            "vst1.32 {d1,d2}, [%[dst]]! \n\t"
            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
            : [t0] "w" (t0), [t1] "w" (t1) /* input operands */
            : "memory", "d0", "d1", "d2" /* clobber list */
        );
    }
}
|
|
|
|
/* 4-channel -> 4-channel rearrangement for native-endian float samples,
 * done as two byte table lookups over a 16-byte source frame.
 *
 * m->state must point to two uint8x8_t vtbl index tables as prepared by
 * init_remap_neon(): the first produces output channels 0 and 1, the
 * second output channels 2 and 3. Index bytes of 0xff are out of range for
 * the lookup and therefore yield zero.
 *
 * dst receives 4 * n floats, src supplies 4 * n floats. */
static void remap_arrange_ch4_float32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) {
    const uint8x8_t t0 = ((uint8x8_t *)m->state)[0];
    const uint8x8_t t1 = ((uint8x8_t *)m->state)[1];

    /* One frame (16 bytes) per pass.
     *
     * - The tables are 64-bit vectors, so they are printed with the P
     *   operand modifier to force D-register names into the template.
     * - The store uses the same 32-bit element size as the load, so the
     *   data round-trips unchanged regardless of memory endianness. */
    for (; n > 0; n--) {
        __asm__ __volatile__ (
            "vld1.f32 {d0,d1}, [%[src]]! \n\t"
            "vtbl.8 d2, {d0,d1}, %P[t0] \n\t"
            "vtbl.8 d3, {d0,d1}, %P[t1] \n\t"
            "vst1.32 {d2,d3}, [%[dst]]! \n\t"
            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
            : [t0] "w" (t0), [t1] "w" (t1) /* input operands */
            : "memory", "d0", "d1", "d2", "d3" /* clobber list */
        );
    }
}
|
|
|
|
/* CPU feature flags handed to pa_remap_func_init_neon(); init_remap_neon()
 * tests PA_CPU_ARM_CORTEX_A8 in them to choose the mono-to-stereo float
 * kernel. */
static pa_cpu_arm_flag_t arm_flags;
|
|
|
|
/* Picks a NEON remap kernel pair (s16 / float32) for the remapper m based on
 * its input/output channel counts and mapping table, and prepares m->state
 * for the kernels that need it.
 *
 * The cases are tried in this order; the first match wins:
 *   1. mono -> stereo       (both table entries 0x10000)
 *   2. mono -> 4 channels   (all four table entries 0x10000)
 *   3. stereo -> mono       (both table entries 0x8000)
 *   4. 4 channels -> mono   (all four table entries 0x4000)
 *   5. pure rearrangement for 2->2, 2->4 and 4->4, when
 *      pa_setup_remap_arrange() accepts the mapping
 *   6. general 4 -> 4 matrix
 * If none applies, m is left untouched.
 *
 * NOTE(review): 0x10000 presumably represents a gain of 1.0 in the integer
 * table (it is the upper clamp bound below, paired with 1.0f for the float
 * table) -- confirm against the remap core. */
static void init_remap_neon(pa_remap_t *m) {
    const unsigned n_ic = m->i_ss.channels;
    const unsigned n_oc = m->o_ss.channels;
    int8_t arrange[PA_CHANNELS_MAX];
    unsigned i, o;

    /* 1. mono -> stereo; the float kernel depends on the CPU type */
    if (n_ic == 1 && n_oc == 2 &&
        m->map_table_i[0][0] == 0x10000 && m->map_table_i[1][0] == 0x10000) {

        if (arm_flags & PA_CPU_ARM_CORTEX_A8) {
            pa_log_info("Using ARM NEON/A8 mono to stereo remapping");
            pa_set_remap_func(m, (pa_do_remap_func_t) remap_mono_to_stereo_s16ne_neon,
                (pa_do_remap_func_t) remap_mono_to_stereo_float32ne_neon_a8);
        } else {
            pa_log_info("Using ARM NEON mono to stereo remapping");
            pa_set_remap_func(m, (pa_do_remap_func_t) remap_mono_to_stereo_s16ne_neon,
                (pa_do_remap_func_t) remap_mono_to_stereo_float32ne_generic_arm);
        }
        return;
    }

    /* 2. mono -> 4 channels */
    if (n_ic == 1 && n_oc == 4 &&
        m->map_table_i[0][0] == 0x10000 && m->map_table_i[1][0] == 0x10000 &&
        m->map_table_i[2][0] == 0x10000 && m->map_table_i[3][0] == 0x10000) {

        pa_log_info("Using ARM NEON mono to 4-channel remapping");
        pa_set_remap_func(m, (pa_do_remap_func_t) remap_mono_to_ch4_s16ne_neon,
            (pa_do_remap_func_t) remap_mono_to_ch4_float32ne_neon);
        return;
    }

    /* 3. stereo -> mono */
    if (n_ic == 2 && n_oc == 1 &&
        m->map_table_i[0][0] == 0x8000 && m->map_table_i[0][1] == 0x8000) {

        pa_log_info("Using ARM NEON stereo to mono remapping");
        pa_set_remap_func(m, (pa_do_remap_func_t) remap_stereo_to_mono_s16ne_neon,
            (pa_do_remap_func_t) remap_stereo_to_mono_float32ne_neon);
        return;
    }

    /* 4. 4 channels -> mono */
    if (n_ic == 4 && n_oc == 1 &&
        m->map_table_i[0][0] == 0x4000 && m->map_table_i[0][1] == 0x4000 &&
        m->map_table_i[0][2] == 0x4000 && m->map_table_i[0][3] == 0x4000) {

        pa_log_info("Using ARM NEON 4-channel to mono remapping");
        pa_set_remap_func(m, (pa_do_remap_func_t) remap_ch4_to_mono_s16ne_neon,
            (pa_do_remap_func_t) remap_ch4_to_mono_float32ne_neon);
        return;
    }

    /* 5. pure rearrangement; pa_setup_remap_arrange() is consulted before
     * the channel counts are looked at */
    if (pa_setup_remap_arrange(m, arrange) &&
        ((n_ic == 2 && (n_oc == 2 || n_oc == 4)) || (n_ic == 4 && n_oc == 4))) {
        uint8_t *idx;

        /* only 2->2, 2->4 and 4->4 can get here */
        if (n_oc == 2) {
            pa_log_info("Using NEON stereo arrange remapping");
            pa_set_remap_func(m, (pa_do_remap_func_t) remap_arrange_stereo_s16ne_neon,
                (pa_do_remap_func_t) remap_arrange_stereo_float32ne_neon);
        } else if (n_ic == 2) {
            pa_log_info("Using NEON 2-channel to 4-channel arrange remapping");
            pa_set_remap_func(m, (pa_do_remap_func_t) remap_arrange_ch2_ch4_s16ne_neon,
                (pa_do_remap_func_t) remap_arrange_ch2_ch4_float32ne_neon);
        } else {
            pa_log_info("Using NEON 4-channel arrange remapping");
            pa_set_remap_func(m, (pa_do_remap_func_t) remap_arrange_ch4_s16ne_neon,
                (pa_do_remap_func_t) remap_arrange_ch4_float32ne_neon);
        }

        /* build the vtbl byte-index table(s) the arrange kernels read */
        if (m->format == PA_SAMPLE_S16NE) {
            /* one 8-byte table describing four 16-bit output slots; with
             * two output channels the slots span two consecutive frames */
            m->state = pa_xnew0(uint8x8_t, 1);
            idx = (uint8_t *) m->state;

            for (o = 0; o < 4; o++) {
                const int8_t ic = arrange[o % n_oc];

                if (ic < 0) {
                    /* out-of-range indices make the lookup produce 0 */
                    idx[o * 2 + 0] = 0xff;
                    idx[o * 2 + 1] = 0xff;
                } else {
                    /* sample position inside the loaded data, as bytes */
                    const unsigned frame = o / n_oc;

                    idx[o * 2 + 0] = (frame * n_oc + ic) * 2 + 0;
                    idx[o * 2 + 1] = (frame * n_oc + ic) * 2 + 1;
                }
            }
        } else if (m->format == PA_SAMPLE_FLOAT32NE) {
            /* two 8-byte tables, four index bytes per output channel */
            m->state = pa_xnew0(uint8x8_t, 2);
            idx = (uint8_t *) m->state;

            for (o = 0; o < n_oc; o++) {
                const int8_t ic = arrange[o];

                if (ic < 0) {
                    /* out-of-range indices make the lookup produce 0 */
                    idx[o * 4 + 0] = 0xff;
                    idx[o * 4 + 1] = 0xff;
                    idx[o * 4 + 2] = 0xff;
                    idx[o * 4 + 3] = 0xff;
                } else {
                    idx[o * 4 + 0] = ic * 4 + 0;
                    idx[o * 4 + 1] = ic * 4 + 1;
                    idx[o * 4 + 2] = ic * 4 + 2;
                    idx[o * 4 + 3] = ic * 4 + 3;
                }
            }
        } else {
            pa_assert_not_reached();
        }
        return;
    }

    /* 6. general 4 -> 4 matrix; anything else is left alone */
    if (n_ic != 4 || n_oc != 4)
        return;

    pa_log_info("Using ARM NEON 4-channel remapping");
    pa_set_remap_func(m, (pa_do_remap_func_t) remap_ch4_s16ne_neon,
        (pa_do_remap_func_t) remap_ch4_float32ne_neon);

    /* The kernels expect four coefficient vectors: vector i carries the
     * weights of input channel i, lane o belongs to output channel o. As
     * the vectors are contiguous, lane o of vector i is element i * 4 + o. */
    if (m->format == PA_SAMPLE_S16NE) {
        int *coef;

        m->state = pa_xnew0(int32x4_t, 4);
        coef = (int *) m->state;

        for (i = 0; i < 4; i++)
            for (o = 0; o < 4; o++)
                coef[i * 4 + o] = PA_CLAMP_UNLIKELY(m->map_table_i[o][i], 0, 0x10000);
    } else if (m->format == PA_SAMPLE_FLOAT32NE) {
        float *coef;

        m->state = pa_xnew0(float32x4_t, 4);
        coef = (float *) m->state;

        for (i = 0; i < 4; i++)
            for (o = 0; o < 4; o++)
                coef[i * 4 + o] = PA_CLAMP_UNLIKELY(m->map_table_f[o][i], 0.0f, 1.0f);
    } else {
        pa_assert_not_reached();
    }
}
|
|
|
|
/* Entry point of the NEON remapper: remembers the CPU flags for later kernel
 * selection and registers init_remap_neon() as the remap init function. */
void pa_remap_func_init_neon(pa_cpu_arm_flag_t flags) {
    /* kept for init_remap_neon(), which checks for Cortex-A8 */
    arm_flags = flags;

    pa_log_info("Initialising ARM NEON optimized remappers.");
    pa_set_init_remap_func((pa_init_remap_func_t) init_remap_neon);
}
|