remap: support S32NE work format

So far PulseAudio only supported two different work formats: S16NE if
it's sufficient to represent the input and output formats without loss
of precision and FLOAT32NE in all other cases. For systems that use
S32NE exclusively, this results in unnecessary conversions from S32NE to
FLOAT32NE and back again.

Add S32NE remap operations and make use of them (for the COPY and
TRIVIAL resamplers) if both input and output format are S32NE. This
avoids the back and forth conversions between S32NE and FLOAT32NE,
significantly improving performance for those cases.
This commit is contained in:
Sascha Silbe 2019-03-26 10:35:55 +01:00 committed by Arun Raghavan
parent 1e4fb61436
commit 034b77823a
7 changed files with 327 additions and 14 deletions

View file

@ -143,6 +143,25 @@ static void remap_stereo_to_mono_float32ne_neon(pa_remap_t *m, float *dst, const
}
}
static void remap_stereo_to_mono_s32ne_neon(pa_remap_t *m, int32_t *dst, const int32_t *src, unsigned n) {
for (; n >= 4; n -= 4) {
__asm__ __volatile__ (
"vld2.32 {q0,q1}, [%[src]]! \n\t"
"vrhadd.s32 q0, q0, q1 \n\t"
"vst1.32 {q0}, [%[dst]]! \n\t"
: [dst] "+r" (dst), [src] "+r" (src) /* output operands */
: /* input operands */
: "memory", "q0", "q1" /* clobber list */
);
}
for (; n > 0; n--) {
dst[0] = src[0]/2 + src[1]/2;
src += 2;
dst++;
}
}
static void remap_stereo_to_mono_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
for (; n >= 8; n -= 8) {
__asm__ __volatile__ (
@ -322,7 +341,8 @@ static void remap_arrange_stereo_float32ne_neon(pa_remap_t *m, float *dst, const
}
}
static void remap_arrange_ch2_ch4_float32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) {
/* Works for both S32NE and FLOAT32NE */
static void remap_arrange_ch2_ch4_any32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) {
const uint8x8_t t0 = ((uint8x8_t *)m->state)[0];
const uint8x8_t t1 = ((uint8x8_t *)m->state)[1];
@ -365,39 +385,52 @@ static void init_remap_neon(pa_remap_t *m) {
n_oc = m->o_ss.channels;
n_ic = m->i_ss.channels;
/* We short-circuit remap function selection for S32NE in most
* cases as the corresponding generic C code is performing
* similarly or even better. However there are a few cases where
* there actually is a significant improvement from using
* hand-crafted NEON assembly so we cannot just bail out for S32NE
* here. */
if (n_ic == 1 && n_oc == 2 &&
m->map_table_i[0][0] == 0x10000 && m->map_table_i[1][0] == 0x10000) {
if (m->format == PA_SAMPLE_S32NE)
return;
if (arm_flags & PA_CPU_ARM_CORTEX_A8) {
pa_log_info("Using ARM NEON/A8 mono to stereo remapping");
pa_set_remap_func(m, (pa_do_remap_func_t) remap_mono_to_stereo_s16ne_neon,
(pa_do_remap_func_t) remap_mono_to_stereo_float32ne_neon_a8);
NULL, (pa_do_remap_func_t) remap_mono_to_stereo_float32ne_neon_a8);
}
else {
pa_log_info("Using ARM NEON mono to stereo remapping");
pa_set_remap_func(m, (pa_do_remap_func_t) remap_mono_to_stereo_s16ne_neon,
(pa_do_remap_func_t) remap_mono_to_stereo_float32ne_generic_arm);
NULL, (pa_do_remap_func_t) remap_mono_to_stereo_float32ne_generic_arm);
}
} else if (n_ic == 1 && n_oc == 4 &&
m->map_table_i[0][0] == 0x10000 && m->map_table_i[1][0] == 0x10000 &&
m->map_table_i[2][0] == 0x10000 && m->map_table_i[3][0] == 0x10000) {
if (m->format == PA_SAMPLE_S32NE)
return;
pa_log_info("Using ARM NEON mono to 4-channel remapping");
pa_set_remap_func(m, (pa_do_remap_func_t) remap_mono_to_ch4_s16ne_neon,
(pa_do_remap_func_t) remap_mono_to_ch4_float32ne_neon);
NULL, (pa_do_remap_func_t) remap_mono_to_ch4_float32ne_neon);
} else if (n_ic == 2 && n_oc == 1 &&
m->map_table_i[0][0] == 0x8000 && m->map_table_i[0][1] == 0x8000) {
pa_log_info("Using ARM NEON stereo to mono remapping");
pa_set_remap_func(m, (pa_do_remap_func_t) remap_stereo_to_mono_s16ne_neon,
(pa_do_remap_func_t) remap_stereo_to_mono_s32ne_neon,
(pa_do_remap_func_t) remap_stereo_to_mono_float32ne_neon);
} else if (n_ic == 4 && n_oc == 1 &&
m->map_table_i[0][0] == 0x4000 && m->map_table_i[0][1] == 0x4000 &&
m->map_table_i[0][2] == 0x4000 && m->map_table_i[0][3] == 0x4000) {
if (m->format == PA_SAMPLE_S32NE)
return;
pa_log_info("Using ARM NEON 4-channel to mono remapping");
pa_set_remap_func(m, (pa_do_remap_func_t) remap_ch4_to_mono_s16ne_neon,
(pa_do_remap_func_t) remap_ch4_to_mono_float32ne_neon);
NULL, (pa_do_remap_func_t) remap_ch4_to_mono_float32ne_neon);
} else if (pa_setup_remap_arrange(m, arrange) &&
((n_ic == 2 && n_oc == 2) ||
(n_ic == 2 && n_oc == 4) ||
@ -405,17 +438,22 @@ static void init_remap_neon(pa_remap_t *m) {
unsigned o;
if (n_ic == 2 && n_oc == 2) {
if (m->format == PA_SAMPLE_S32NE)
return;
pa_log_info("Using NEON stereo arrange remapping");
pa_set_remap_func(m, (pa_do_remap_func_t) remap_arrange_stereo_s16ne_neon,
(pa_do_remap_func_t) remap_arrange_stereo_float32ne_neon);
NULL, (pa_do_remap_func_t) remap_arrange_stereo_float32ne_neon);
} else if (n_ic == 2 && n_oc == 4) {
pa_log_info("Using NEON 2-channel to 4-channel arrange remapping");
pa_set_remap_func(m, (pa_do_remap_func_t) remap_arrange_ch2_ch4_s16ne_neon,
(pa_do_remap_func_t) remap_arrange_ch2_ch4_float32ne_neon);
(pa_do_remap_func_t) remap_arrange_ch2_ch4_any32ne_neon,
(pa_do_remap_func_t) remap_arrange_ch2_ch4_any32ne_neon);
} else if (n_ic == 4 && n_oc == 4) {
if (m->format == PA_SAMPLE_S32NE)
return;
pa_log_info("Using NEON 4-channel arrange remapping");
pa_set_remap_func(m, (pa_do_remap_func_t) remap_arrange_ch4_s16ne_neon,
(pa_do_remap_func_t) remap_arrange_ch4_float32ne_neon);
NULL, (pa_do_remap_func_t) remap_arrange_ch4_float32ne_neon);
}
/* setup state */
@ -436,6 +474,8 @@ static void init_remap_neon(pa_remap_t *m) {
}
break;
}
case PA_SAMPLE_S32NE:
/* fall-through */
case PA_SAMPLE_FLOAT32NE: {
uint8x8_t *t = m->state = pa_xnew0(uint8x8_t, 2);
for (o = 0; o < n_oc; o++) {
@ -461,8 +501,11 @@ static void init_remap_neon(pa_remap_t *m) {
} else if (n_ic == 4 && n_oc == 4) {
unsigned i, o;
if (m->format == PA_SAMPLE_S32NE)
return;
pa_log_info("Using ARM NEON 4-channel remapping");
pa_set_remap_func(m, (pa_do_remap_func_t) remap_ch4_s16ne_neon,
(pa_do_remap_func_t) NULL,
(pa_do_remap_func_t) remap_ch4_float32ne_neon);
/* setup state */