mirror of
https://gitlab.freedesktop.org/pulseaudio/pulseaudio.git
synced 2025-11-06 13:29:56 -05:00
sbc: ARM NEON optimization for scale factors calculation
Improves SBC encoding performance when joint stereo is not used.

Benchmarked on ARM Cortex-A8:

== Before: ==

$ time ./sbcenc -b53 -s8 test.au > /dev/null

real    0m4.756s
user    0m4.313s
sys     0m0.438s

samples  %        image name        symbol name
2569     27.6296  sbcenc            sbc_pack_frame
1934     20.8002  sbcenc            sbc_analyze_4b_8s_neon
1386     14.9064  sbcenc            sbc_calculate_bits
1221     13.1319  sbcenc            sbc_calc_scalefactors
996      10.7120  sbcenc            sbc_enc_process_input_8s_be
878       9.4429  no-vmlinux        /no-vmlinux
204       2.1940  sbcenc            sbc_encode
56        0.6023  libc-2.10.1.so    memcpy

== After: ==

$ time ./sbcenc -b53 -s8 test.au > /dev/null

real    0m4.220s
user    0m3.797s
sys     0m0.422s

samples  %        image name        symbol name
2563     31.3249  sbcenc            sbc_pack_frame
1892     23.1239  sbcenc            sbc_analyze_4b_8s_neon
1368     16.7196  sbcenc            sbc_calculate_bits
961      11.7453  sbcenc            sbc_enc_process_input_8s_be
836      10.2176  no-vmlinux        /no-vmlinux
262       3.2022  sbcenc            sbc_calc_scalefactors_neon
199       2.4322  sbcenc            sbc_encode
49        0.5989  libc-2.10.1.so    memcpy
This commit is contained in:
parent
1f617ea9ec
commit
fd7dc68ded
2 changed files with 59 additions and 1 deletions
|
|
@ -77,7 +77,7 @@ struct sbc_frame {
|
|||
uint8_t joint;
|
||||
|
||||
/* only the lower 4 bits of every element are to be used */
|
||||
uint32_t scale_factor[2][8];
|
||||
uint32_t SBC_ALIGNED scale_factor[2][8];
|
||||
|
||||
/* raw integer subband samples in the frame */
|
||||
int32_t SBC_ALIGNED sb_sample_f[16][2][8];
|
||||
|
|
|
|||
|
|
@ -236,10 +236,68 @@ static inline void sbc_analyze_4b_8s_neon(int16_t *x,
|
|||
_sbc_analyze_eight_neon(x + 0, out, analysis_consts_fixed8_simd_even);
|
||||
}
|
||||
|
||||
static void sbc_calc_scalefactors_neon(
|
||||
int32_t sb_sample_f[16][2][8],
|
||||
uint32_t scale_factor[2][8],
|
||||
int blocks, int channels, int subbands)
|
||||
{
|
||||
int ch, sb;
|
||||
for (ch = 0; ch < channels; ch++) {
|
||||
for (sb = 0; sb < subbands; sb += 4) {
|
||||
int blk = blocks;
|
||||
int32_t *in = &sb_sample_f[0][ch][sb];
|
||||
asm volatile (
|
||||
"vmov.s32 q0, %[c1]\n"
|
||||
"vmov.s32 q1, %[c1]\n"
|
||||
"1:\n"
|
||||
"vld1.32 {d16, d17}, [%[in], :128], %[inc]\n"
|
||||
"vabs.s32 q8, q8\n"
|
||||
"vld1.32 {d18, d19}, [%[in], :128], %[inc]\n"
|
||||
"vabs.s32 q9, q9\n"
|
||||
"vld1.32 {d20, d21}, [%[in], :128], %[inc]\n"
|
||||
"vabs.s32 q10, q10\n"
|
||||
"vld1.32 {d22, d23}, [%[in], :128], %[inc]\n"
|
||||
"vabs.s32 q11, q11\n"
|
||||
"vcgt.s32 q12, q8, #0\n"
|
||||
"vcgt.s32 q13, q9, #0\n"
|
||||
"vcgt.s32 q14, q10, #0\n"
|
||||
"vcgt.s32 q15, q11, #0\n"
|
||||
"vadd.s32 q8, q8, q12\n"
|
||||
"vadd.s32 q9, q9, q13\n"
|
||||
"vadd.s32 q10, q10, q14\n"
|
||||
"vadd.s32 q11, q11, q15\n"
|
||||
"vorr.s32 q0, q0, q8\n"
|
||||
"vorr.s32 q1, q1, q9\n"
|
||||
"vorr.s32 q0, q0, q10\n"
|
||||
"vorr.s32 q1, q1, q11\n"
|
||||
"subs %[blk], %[blk], #4\n"
|
||||
"bgt 1b\n"
|
||||
"vorr.s32 q0, q0, q1\n"
|
||||
"vmov.s32 q15, %[c2]\n"
|
||||
"vclz.s32 q0, q0\n"
|
||||
"vsub.s32 q0, q15, q0\n"
|
||||
"vst1.32 {d0, d1}, [%[out], :128]\n"
|
||||
:
|
||||
[blk] "+r" (blk),
|
||||
[in] "+r" (in)
|
||||
:
|
||||
[inc] "r" ((char *) &sb_sample_f[1][0][0] -
|
||||
(char *) &sb_sample_f[0][0][0]),
|
||||
[out] "r" (&scale_factor[ch][sb]),
|
||||
[c1] "i" (1 << SCALE_OUT_BITS),
|
||||
[c2] "i" (31 - SCALE_OUT_BITS)
|
||||
: "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19",
|
||||
"d20", "d21", "d22", "d23", "d24", "d25", "d26",
|
||||
"d27", "d28", "d29", "d30", "d31", "cc", "memory");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void sbc_init_primitives_neon(struct sbc_encoder_state *state)
|
||||
{
|
||||
state->sbc_analyze_4b_4s = sbc_analyze_4b_4s_neon;
|
||||
state->sbc_analyze_4b_8s = sbc_analyze_4b_8s_neon;
|
||||
state->sbc_calc_scalefactors = sbc_calc_scalefactors_neon;
|
||||
state->implementation_info = "NEON";
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue