diff --git a/spa/plugins/audiomixer/benchmark-mix-ops.c b/spa/plugins/audiomixer/benchmark-mix-ops.c index 762c179db..e698417b6 100644 --- a/spa/plugins/audiomixer/benchmark-mix-ops.c +++ b/spa/plugins/audiomixer/benchmark-mix-ops.c @@ -74,8 +74,8 @@ static void run_test1(const char *name, const char *impl, mix_func_t func, int n mix.n_channels = 1; for (j = 0; j < n_src; j++) - ip[j] = SPA_PTR_ALIGN(&samp_in[j * n_samples * 4], 16, void); - op = SPA_PTR_ALIGN(samp_out, 16, void); + ip[j] = SPA_PTR_ALIGN(&samp_in[j * n_samples * 4], 32, void); + op = SPA_PTR_ALIGN(samp_out, 32, void); clock_gettime(CLOCK_MONOTONIC, &ts); t1 = SPA_TIMESPEC_TO_NSEC(&ts); @@ -163,6 +163,11 @@ static void test_f32(void) run_test("test_f32", "sse", mix_f32_sse); } #endif +#if defined (HAVE_AVX) + if (cpu_flags & SPA_CPU_FLAG_AVX) { + run_test("test_f32", "avx", mix_f32_avx); + } +#endif } static void test_f64(void) diff --git a/spa/plugins/audiomixer/mix-ops-avx.c b/spa/plugins/audiomixer/mix-ops-avx.c index b38842463..a5e3b5b11 100644 --- a/spa/plugins/audiomixer/mix-ops-avx.c +++ b/spa/plugins/audiomixer/mix-ops-avx.c @@ -86,50 +86,59 @@ static inline void mix_4(float * dst, static inline void mix_2(float * dst, const float * SPA_RESTRICT src, uint32_t n_samples) { - uint32_t n, unrolled; - - if (SPA_IS_ALIGNED(src, 32) && - SPA_IS_ALIGNED(dst, 32)) - unrolled = n_samples & ~15; - else - unrolled = 0; - - for (n = 0; n < unrolled; n += 16) { - __m256 in1[2], in2[2]; - - in1[0] = _mm256_load_ps(&dst[n + 0]); - in1[1] = _mm256_load_ps(&dst[n + 8]); - in2[0] = _mm256_load_ps(&src[n + 0]); - in2[1] = _mm256_load_ps(&src[n + 8]); - - in1[0] = _mm256_add_ps(in1[0], in2[0]); - in1[1] = _mm256_add_ps(in1[1], in2[1]); - - _mm256_store_ps(&dst[n + 0], in1[0]); - _mm256_store_ps(&dst[n + 8], in1[1]); - } - for (; n < n_samples; n++) { - __m128 in1[1], in2[1]; - in1[0] = _mm_load_ss(&dst[n]), - in2[0] = _mm_load_ss(&src[n]), - in1[0] = _mm_add_ss(in1[0], in2[0]); - _mm_store_ss(&dst[n], in1[0]); - } } void mix_f32_avx(struct mix_ops *ops, void * SPA_RESTRICT dst, const void * SPA_RESTRICT src[], uint32_t n_src, uint32_t n_samples) { - uint32_t i; + n_samples *= ops->n_channels; if (n_src == 0) memset(dst, 0, n_samples * ops->n_channels * sizeof(float)); - else if (dst != src[0]) - spa_memcpy(dst, src[0], n_samples * ops->n_channels * sizeof(float)); + else if (n_src == 1) { + if (dst != src[0]) + spa_memcpy(dst, src[0], n_samples * sizeof(float)); + } else { + uint32_t i, n, unrolled; + const float **s = (const float **)src; + float *d = dst; - for (i = 1; i + 2 < n_src; i += 3) - mix_4(dst, src[i], src[i + 1], src[i + 2], n_samples); - for (; i < n_src; i++) - mix_2(dst, src[i], n_samples * ops->n_channels); + if (SPA_LIKELY(SPA_IS_ALIGNED(dst, 32))) { + unrolled = n_samples & ~31; + for (i = 0; i < n_src; i++) { + if (SPA_UNLIKELY(!SPA_IS_ALIGNED(src[i], 32))) { + unrolled = 0; + break; + } + } + } else + unrolled = 0; + + for (n = 0; n < unrolled; n += 32) { + __m256 in[4]; + + in[0] = _mm256_load_ps(&s[0][n + 0]); + in[1] = _mm256_load_ps(&s[0][n + 8]); + in[2] = _mm256_load_ps(&s[0][n + 16]); + in[3] = _mm256_load_ps(&s[0][n + 24]); + for (i = 1; i < n_src; i++) { + in[0] = _mm256_add_ps(in[0], _mm256_load_ps(&s[i][n + 0])); + in[1] = _mm256_add_ps(in[1], _mm256_load_ps(&s[i][n + 8])); + in[2] = _mm256_add_ps(in[2], _mm256_load_ps(&s[i][n + 16])); + in[3] = _mm256_add_ps(in[3], _mm256_load_ps(&s[i][n + 24])); + } + _mm256_store_ps(&d[n + 0], in[0]); + _mm256_store_ps(&d[n + 8], in[1]); + _mm256_store_ps(&d[n + 16], in[2]); + _mm256_store_ps(&d[n + 24], in[3]); + } + for (; n < n_samples; n++) { + __m128 in[1]; + in[0] = _mm_load_ss(&s[0][n]); + for (i = 1; i < n_src; i++) + in[0] = _mm_add_ss(in[0], _mm_load_ss(&s[i][n])); + _mm_store_ss(&d[n], in[0]); + } + } } diff --git a/spa/plugins/audiomixer/mix-ops-sse.c b/spa/plugins/audiomixer/mix-ops-sse.c index 01beeabd5..bae619bad 100644 --- a/spa/plugins/audiomixer/mix-ops-sse.c +++ b/spa/plugins/audiomixer/mix-ops-sse.c @@ -48,18 +48,17 @@ mix_f32_sse(struct mix_ops *ops, void * SPA_RESTRICT dst, const void * SPA_RESTR __m128 in[4]; const float **s = (const float **)src; float *d = dst; - bool aligned = true; - if (SPA_UNLIKELY(!SPA_IS_ALIGNED(dst, 16))) - aligned = false; - else { - for (i = 0; i < n_src && aligned; i++) { - if (SPA_UNLIKELY(!SPA_IS_ALIGNED(src[i], 16))) - aligned = false; + if (SPA_LIKELY(SPA_IS_ALIGNED(dst, 16))) { + unrolled = n_samples & ~15; + for (i = 0; i < n_src; i++) { + if (SPA_UNLIKELY(!SPA_IS_ALIGNED(src[i], 16))) { + unrolled = 0; + break; + } } - } - - unrolled = aligned ? n_samples & ~15 : 0; + } else + unrolled = 0; for (n = 0; n < unrolled; n += 16) { in[0] = _mm_load_ps(&s[0][n+ 0]); diff --git a/spa/plugins/audiomixer/mix-ops-sse2.c b/spa/plugins/audiomixer/mix-ops-sse2.c index fa2eb97db..e2f632d44 100644 --- a/spa/plugins/audiomixer/mix-ops-sse2.c +++ b/spa/plugins/audiomixer/mix-ops-sse2.c @@ -48,18 +48,17 @@ mix_f64_sse2(struct mix_ops *ops, void * SPA_RESTRICT dst, const void * SPA_REST __m128d in[4]; const double **s = (const double **)src; double *d = dst; - bool aligned = true; - if (SPA_UNLIKELY(!SPA_IS_ALIGNED(dst, 16))) - aligned = false; - else { - for (i = 0; i < n_src && aligned; i++) { - if (SPA_UNLIKELY(!SPA_IS_ALIGNED(src[i], 16))) - aligned = false; + if (SPA_LIKELY(SPA_IS_ALIGNED(dst, 16))) { + unrolled = n_samples & ~15; + for (i = 0; i < n_src; i++) { + if (SPA_UNLIKELY(!SPA_IS_ALIGNED(src[i], 16))) { + unrolled = 0; + break; + } } - } - - unrolled = aligned ? n_samples & ~7 : 0; + } else + unrolled = 0; for (n = 0; n < unrolled; n += 8) { in[0] = _mm_load_pd(&s[0][n+0]); diff --git a/spa/plugins/audiomixer/test-mix-ops.c b/spa/plugins/audiomixer/test-mix-ops.c index 622592573..a20f7a431 100644 --- a/spa/plugins/audiomixer/test-mix-ops.c +++ b/spa/plugins/audiomixer/test-mix-ops.c @@ -240,6 +240,13 @@ static void test_f32(void) run_test("test_f32_4_sse", src, 4, out_4, sizeof(out_4), SPA_N_ELEMENTS(out_4), mix_f32_sse); } #endif +#if defined(HAVE_AVX) + if (cpu_flags & SPA_CPU_FLAG_AVX) { + run_test("test_f32_0_avx", NULL, 0, out, sizeof(out), SPA_N_ELEMENTS(out), mix_f32_avx); + run_test("test_f32_1_avx", src, 1, in_1, sizeof(in_1), SPA_N_ELEMENTS(in_1), mix_f32_avx); + run_test("test_f32_4_avx", src, 4, out_4, sizeof(out_4), SPA_N_ELEMENTS(out_4), mix_f32_avx); + } +#endif } static void test_f64(void)