mirror of
https://gitlab.freedesktop.org/pulseaudio/pulseaudio.git
synced 2025-11-14 06:59:53 -05:00
remap: support S32NE work format
So far PulseAudio only supported two different work formats: S16NE if it's sufficient to represent the input and output formats without loss of precision and FLOAT32NE in all other cases. For systems that use S32NE exclusively, this results in unnecessary conversions from S32NE to FLOAT32NE and back again. Add S32NE remap operations and make use of them (for the COPY and TRIVIAL resamplers) if both input and output format are S32NE. This avoids the back and forth conversions between S32NE and FLOAT32NE, significantly improving performance for those cases.
This commit is contained in:
parent
1e4fb61436
commit
034b77823a
7 changed files with 327 additions and 14 deletions
|
|
@ -143,6 +143,25 @@ static void remap_stereo_to_mono_float32ne_neon(pa_remap_t *m, float *dst, const
|
|||
}
|
||||
}
|
||||
|
||||
static void remap_stereo_to_mono_s32ne_neon(pa_remap_t *m, int32_t *dst, const int32_t *src, unsigned n) {
|
||||
for (; n >= 4; n -= 4) {
|
||||
__asm__ __volatile__ (
|
||||
"vld2.32 {q0,q1}, [%[src]]! \n\t"
|
||||
"vrhadd.s32 q0, q0, q1 \n\t"
|
||||
"vst1.32 {q0}, [%[dst]]! \n\t"
|
||||
: [dst] "+r" (dst), [src] "+r" (src) /* output operands */
|
||||
: /* input operands */
|
||||
: "memory", "q0", "q1" /* clobber list */
|
||||
);
|
||||
}
|
||||
|
||||
for (; n > 0; n--) {
|
||||
dst[0] = src[0]/2 + src[1]/2;
|
||||
src += 2;
|
||||
dst++;
|
||||
}
|
||||
}
|
||||
|
||||
static void remap_stereo_to_mono_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
|
||||
for (; n >= 8; n -= 8) {
|
||||
__asm__ __volatile__ (
|
||||
|
|
@ -322,7 +341,8 @@ static void remap_arrange_stereo_float32ne_neon(pa_remap_t *m, float *dst, const
|
|||
}
|
||||
}
|
||||
|
||||
static void remap_arrange_ch2_ch4_float32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) {
|
||||
/* Works for both S32NE and FLOAT32NE */
|
||||
static void remap_arrange_ch2_ch4_any32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) {
|
||||
const uint8x8_t t0 = ((uint8x8_t *)m->state)[0];
|
||||
const uint8x8_t t1 = ((uint8x8_t *)m->state)[1];
|
||||
|
||||
|
|
@ -365,39 +385,52 @@ static void init_remap_neon(pa_remap_t *m) {
|
|||
n_oc = m->o_ss.channels;
|
||||
n_ic = m->i_ss.channels;
|
||||
|
||||
/* We short-circuit remap function selection for S32NE in most
|
||||
* cases as the corresponding generic C code is performing
|
||||
* similarly or even better. However there are a few cases where
|
||||
* there actually is a significant improvement from using
|
||||
* hand-crafted NEON assembly so we cannot just bail out for S32NE
|
||||
* here. */
|
||||
if (n_ic == 1 && n_oc == 2 &&
|
||||
m->map_table_i[0][0] == 0x10000 && m->map_table_i[1][0] == 0x10000) {
|
||||
if (m->format == PA_SAMPLE_S32NE)
|
||||
return;
|
||||
if (arm_flags & PA_CPU_ARM_CORTEX_A8) {
|
||||
|
||||
pa_log_info("Using ARM NEON/A8 mono to stereo remapping");
|
||||
pa_set_remap_func(m, (pa_do_remap_func_t) remap_mono_to_stereo_s16ne_neon,
|
||||
(pa_do_remap_func_t) remap_mono_to_stereo_float32ne_neon_a8);
|
||||
NULL, (pa_do_remap_func_t) remap_mono_to_stereo_float32ne_neon_a8);
|
||||
}
|
||||
else {
|
||||
pa_log_info("Using ARM NEON mono to stereo remapping");
|
||||
pa_set_remap_func(m, (pa_do_remap_func_t) remap_mono_to_stereo_s16ne_neon,
|
||||
(pa_do_remap_func_t) remap_mono_to_stereo_float32ne_generic_arm);
|
||||
NULL, (pa_do_remap_func_t) remap_mono_to_stereo_float32ne_generic_arm);
|
||||
}
|
||||
} else if (n_ic == 1 && n_oc == 4 &&
|
||||
m->map_table_i[0][0] == 0x10000 && m->map_table_i[1][0] == 0x10000 &&
|
||||
m->map_table_i[2][0] == 0x10000 && m->map_table_i[3][0] == 0x10000) {
|
||||
|
||||
if (m->format == PA_SAMPLE_S32NE)
|
||||
return;
|
||||
pa_log_info("Using ARM NEON mono to 4-channel remapping");
|
||||
pa_set_remap_func(m, (pa_do_remap_func_t) remap_mono_to_ch4_s16ne_neon,
|
||||
(pa_do_remap_func_t) remap_mono_to_ch4_float32ne_neon);
|
||||
NULL, (pa_do_remap_func_t) remap_mono_to_ch4_float32ne_neon);
|
||||
} else if (n_ic == 2 && n_oc == 1 &&
|
||||
m->map_table_i[0][0] == 0x8000 && m->map_table_i[0][1] == 0x8000) {
|
||||
|
||||
pa_log_info("Using ARM NEON stereo to mono remapping");
|
||||
pa_set_remap_func(m, (pa_do_remap_func_t) remap_stereo_to_mono_s16ne_neon,
|
||||
(pa_do_remap_func_t) remap_stereo_to_mono_s32ne_neon,
|
||||
(pa_do_remap_func_t) remap_stereo_to_mono_float32ne_neon);
|
||||
} else if (n_ic == 4 && n_oc == 1 &&
|
||||
m->map_table_i[0][0] == 0x4000 && m->map_table_i[0][1] == 0x4000 &&
|
||||
m->map_table_i[0][2] == 0x4000 && m->map_table_i[0][3] == 0x4000) {
|
||||
|
||||
if (m->format == PA_SAMPLE_S32NE)
|
||||
return;
|
||||
pa_log_info("Using ARM NEON 4-channel to mono remapping");
|
||||
pa_set_remap_func(m, (pa_do_remap_func_t) remap_ch4_to_mono_s16ne_neon,
|
||||
(pa_do_remap_func_t) remap_ch4_to_mono_float32ne_neon);
|
||||
NULL, (pa_do_remap_func_t) remap_ch4_to_mono_float32ne_neon);
|
||||
} else if (pa_setup_remap_arrange(m, arrange) &&
|
||||
((n_ic == 2 && n_oc == 2) ||
|
||||
(n_ic == 2 && n_oc == 4) ||
|
||||
|
|
@ -405,17 +438,22 @@ static void init_remap_neon(pa_remap_t *m) {
|
|||
unsigned o;
|
||||
|
||||
if (n_ic == 2 && n_oc == 2) {
|
||||
if (m->format == PA_SAMPLE_S32NE)
|
||||
return;
|
||||
pa_log_info("Using NEON stereo arrange remapping");
|
||||
pa_set_remap_func(m, (pa_do_remap_func_t) remap_arrange_stereo_s16ne_neon,
|
||||
(pa_do_remap_func_t) remap_arrange_stereo_float32ne_neon);
|
||||
NULL, (pa_do_remap_func_t) remap_arrange_stereo_float32ne_neon);
|
||||
} else if (n_ic == 2 && n_oc == 4) {
|
||||
pa_log_info("Using NEON 2-channel to 4-channel arrange remapping");
|
||||
pa_set_remap_func(m, (pa_do_remap_func_t) remap_arrange_ch2_ch4_s16ne_neon,
|
||||
(pa_do_remap_func_t) remap_arrange_ch2_ch4_float32ne_neon);
|
||||
(pa_do_remap_func_t) remap_arrange_ch2_ch4_any32ne_neon,
|
||||
(pa_do_remap_func_t) remap_arrange_ch2_ch4_any32ne_neon);
|
||||
} else if (n_ic == 4 && n_oc == 4) {
|
||||
if (m->format == PA_SAMPLE_S32NE)
|
||||
return;
|
||||
pa_log_info("Using NEON 4-channel arrange remapping");
|
||||
pa_set_remap_func(m, (pa_do_remap_func_t) remap_arrange_ch4_s16ne_neon,
|
||||
(pa_do_remap_func_t) remap_arrange_ch4_float32ne_neon);
|
||||
NULL, (pa_do_remap_func_t) remap_arrange_ch4_float32ne_neon);
|
||||
}
|
||||
|
||||
/* setup state */
|
||||
|
|
@ -436,6 +474,8 @@ static void init_remap_neon(pa_remap_t *m) {
|
|||
}
|
||||
break;
|
||||
}
|
||||
case PA_SAMPLE_S32NE:
|
||||
/* fall-through */
|
||||
case PA_SAMPLE_FLOAT32NE: {
|
||||
uint8x8_t *t = m->state = pa_xnew0(uint8x8_t, 2);
|
||||
for (o = 0; o < n_oc; o++) {
|
||||
|
|
@ -461,8 +501,11 @@ static void init_remap_neon(pa_remap_t *m) {
|
|||
} else if (n_ic == 4 && n_oc == 4) {
|
||||
unsigned i, o;
|
||||
|
||||
if (m->format == PA_SAMPLE_S32NE)
|
||||
return;
|
||||
pa_log_info("Using ARM NEON 4-channel remapping");
|
||||
pa_set_remap_func(m, (pa_do_remap_func_t) remap_ch4_s16ne_neon,
|
||||
(pa_do_remap_func_t) NULL,
|
||||
(pa_do_remap_func_t) remap_ch4_float32ne_neon);
|
||||
|
||||
/* setup state */
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue