From db230fc13625059b135aeadb3fa7df5e2b5e6255 Mon Sep 17 00:00:00 2001 From: Wim Taymans Date: Thu, 7 Feb 2019 12:30:36 +0100 Subject: [PATCH] floatmix: unroll loop a little --- src/modules/module-audio-dsp/floatmix.c | 29 ++++++++++++++++++++----- 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/src/modules/module-audio-dsp/floatmix.c b/src/modules/module-audio-dsp/floatmix.c index bf9c6508f..41e7bf83d 100644 --- a/src/modules/module-audio-dsp/floatmix.c +++ b/src/modules/module-audio-dsp/floatmix.c @@ -725,15 +725,32 @@ static void mix_2(float *dst, float *src1, float *src2, int n_samples) if (SPA_IS_ALIGNED(src1, 16) && SPA_IS_ALIGNED(src2, 16) && SPA_IS_ALIGNED(dst, 16)) - unrolled = n_samples / 4; + unrolled = n_samples / 16; else unrolled = 0; - for (n = 0; unrolled--; n += 4) { - in[0] = _mm_load_ps(&src1[n]), - in[1] = _mm_load_ps(&src2[n]), - in[0] = _mm_add_ps(in[0], in[1]); - _mm_store_ps(&dst[n], in[0]); + for (n = 0; unrolled--; n += 16) { + __m128 in1[4], in2[4]; + + in1[0] = _mm_load_ps(&src1[n+ 0]); + in1[1] = _mm_load_ps(&src1[n+ 4]); + in1[2] = _mm_load_ps(&src1[n+ 8]); + in1[3] = _mm_load_ps(&src1[n+12]); + + in2[0] = _mm_load_ps(&src2[n+ 0]); + in2[1] = _mm_load_ps(&src2[n+ 4]); + in2[2] = _mm_load_ps(&src2[n+ 8]); + in2[3] = _mm_load_ps(&src2[n+12]); + + in1[0] = _mm_add_ps(in1[0], in2[0]); + in1[1] = _mm_add_ps(in1[1], in2[1]); + in1[2] = _mm_add_ps(in1[2], in2[2]); + in1[3] = _mm_add_ps(in1[3], in2[3]); + + _mm_store_ps(&dst[n+ 0], in1[0]); + _mm_store_ps(&dst[n+ 4], in1[1]); + _mm_store_ps(&dst[n+ 8], in1[2]); + _mm_store_ps(&dst[n+12], in1[3]); } for (; n < n_samples; n++) { in[0] = _mm_load_ss(&src1[n]),