From d6030adada7a654c13b583e9ff50f6ce49919ad3 Mon Sep 17 00:00:00 2001 From: Wim Taymans Date: Tue, 7 Jan 2025 09:19:05 +0100 Subject: [PATCH] filter-graph: optimize mix function a little Make special cases for no and 1 gain value when mixing. --- spa/plugins/filter-graph/audio-dsp-avx.c | 203 +++++++++++++++++----- spa/plugins/filter-graph/audio-dsp-c.c | 46 +++-- spa/plugins/filter-graph/audio-dsp-impl.h | 14 +- spa/plugins/filter-graph/audio-dsp-sse.c | 199 ++++++++++++++++----- spa/plugins/filter-graph/audio-dsp.h | 34 ++-- spa/plugins/filter-graph/builtin_plugin.c | 12 +- 6 files changed, 367 insertions(+), 141 deletions(-) diff --git a/spa/plugins/filter-graph/audio-dsp-avx.c b/spa/plugins/filter-graph/audio-dsp-avx.c index 870e94940..1509284c7 100644 --- a/spa/plugins/filter-graph/audio-dsp-avx.c +++ b/spa/plugins/filter-graph/audio-dsp-avx.c @@ -16,10 +16,156 @@ #include +static void dsp_add_avx(void *obj, float *dst, const float * SPA_RESTRICT src[], + uint32_t n_src, uint32_t n_samples) +{ + uint32_t n, i, unrolled; + __m256 in[4]; + const float **s = (const float **)src; + float *d = dst; + + if (SPA_LIKELY(SPA_IS_ALIGNED(dst, 32))) { + unrolled = n_samples & ~31; + for (i = 0; i < n_src; i++) { + if (SPA_UNLIKELY(!SPA_IS_ALIGNED(src[i], 32))) { + unrolled = 0; + break; + } + } + } else + unrolled = 0; + + for (n = 0; n < unrolled; n += 32) { + in[0] = _mm256_load_ps(&s[0][n+ 0]); + in[1] = _mm256_load_ps(&s[0][n+ 8]); + in[2] = _mm256_load_ps(&s[0][n+16]); + in[3] = _mm256_load_ps(&s[0][n+24]); + + for (i = 1; i < n_src; i++) { + in[0] = _mm256_add_ps(in[0], _mm256_load_ps(&s[i][n+ 0])); + in[1] = _mm256_add_ps(in[1], _mm256_load_ps(&s[i][n+ 8])); + in[2] = _mm256_add_ps(in[2], _mm256_load_ps(&s[i][n+16])); + in[3] = _mm256_add_ps(in[3], _mm256_load_ps(&s[i][n+24])); + } + _mm256_store_ps(&d[n+ 0], in[0]); + _mm256_store_ps(&d[n+ 8], in[1]); + _mm256_store_ps(&d[n+16], in[2]); + _mm256_store_ps(&d[n+24], in[3]); + } + for (; n < n_samples; n++) { + __m128 in[1]; + in[0] = _mm_load_ss(&s[0][n]); + for (i = 1; i < n_src; i++) + in[0] = _mm_add_ss(in[0], _mm_load_ss(&s[i][n])); + _mm_store_ss(&d[n], in[0]); + } +} + +static void dsp_add_1_gain_avx(void *obj, float *dst, const float * SPA_RESTRICT src[], + uint32_t n_src, float gain, uint32_t n_samples) +{ + uint32_t n, i, unrolled; + __m256 in[4], g; + const float **s = (const float **)src; + float *d = dst; + __m128 g1; + + if (SPA_LIKELY(SPA_IS_ALIGNED(dst, 32))) { + unrolled = n_samples & ~31; + for (i = 0; i < n_src; i++) { + if (SPA_UNLIKELY(!SPA_IS_ALIGNED(src[i], 32))) { + unrolled = 0; + break; + } + } + } else + unrolled = 0; + + g = _mm256_set1_ps(gain); + g1 = _mm_set_ss(gain); + + for (n = 0; n < unrolled; n += 32) { + in[0] = _mm256_load_ps(&s[0][n+ 0]); + in[1] = _mm256_load_ps(&s[0][n+ 8]); + in[2] = _mm256_load_ps(&s[0][n+16]); + in[3] = _mm256_load_ps(&s[0][n+24]); + + for (i = 1; i < n_src; i++) { + in[0] = _mm256_add_ps(in[0], _mm256_load_ps(&s[i][n+ 0])); + in[1] = _mm256_add_ps(in[1], _mm256_load_ps(&s[i][n+ 8])); + in[2] = _mm256_add_ps(in[2], _mm256_load_ps(&s[i][n+16])); + in[3] = _mm256_add_ps(in[3], _mm256_load_ps(&s[i][n+24])); + } + _mm256_store_ps(&d[n+ 0], _mm256_mul_ps(g, in[0])); + _mm256_store_ps(&d[n+ 8], _mm256_mul_ps(g, in[1])); + _mm256_store_ps(&d[n+16], _mm256_mul_ps(g, in[2])); + _mm256_store_ps(&d[n+24], _mm256_mul_ps(g, in[3])); + } + for (; n < n_samples; n++) { + __m128 in[1]; + in[0] = _mm_load_ss(&s[0][n]); + for (i = 1; i < n_src; i++) + in[0] = _mm_add_ss(in[0], _mm_load_ss(&s[i][n])); + _mm_store_ss(&d[n], _mm_mul_ss(g1, in[0])); + } +} + +static void dsp_add_n_gain_avx(void *obj, float *dst, + const float * SPA_RESTRICT src[], uint32_t n_src, + float gain[], uint32_t n_gain, uint32_t n_samples) +{ + uint32_t n, i, unrolled; + __m256 in[4], g; + const float **s = (const float **)src; + float *d = dst; + + if (SPA_LIKELY(SPA_IS_ALIGNED(dst, 32))) { + unrolled = n_samples & ~31; + for (i = 0; i < n_src; i++) { + if (SPA_UNLIKELY(!SPA_IS_ALIGNED(src[i], 32))) { + unrolled = 0; + break; + } + } + } else + unrolled = 0; + + for (n = 0; n < unrolled; n += 32) { + g = _mm256_set1_ps(gain[0]); + in[0] = _mm256_mul_ps(g, _mm256_load_ps(&s[0][n+ 0])); + in[1] = _mm256_mul_ps(g, _mm256_load_ps(&s[0][n+ 8])); + in[2] = _mm256_mul_ps(g, _mm256_load_ps(&s[0][n+16])); + in[3] = _mm256_mul_ps(g, _mm256_load_ps(&s[0][n+24])); + + for (i = 1; i < n_src; i++) { + g = _mm256_set1_ps(gain[i]); + in[0] = _mm256_add_ps(in[0], _mm256_mul_ps(g, _mm256_load_ps(&s[i][n+ 0]))); + in[1] = _mm256_add_ps(in[1], _mm256_mul_ps(g, _mm256_load_ps(&s[i][n+ 8]))); + in[2] = _mm256_add_ps(in[2], _mm256_mul_ps(g, _mm256_load_ps(&s[i][n+16]))); + in[3] = _mm256_add_ps(in[3], _mm256_mul_ps(g, _mm256_load_ps(&s[i][n+24]))); + } + _mm256_store_ps(&d[n+ 0], in[0]); + _mm256_store_ps(&d[n+ 8], in[1]); + _mm256_store_ps(&d[n+16], in[2]); + _mm256_store_ps(&d[n+24], in[3]); + } + for (; n < n_samples; n++) { + __m128 in[1], g; + g = _mm_set_ss(gain[0]); + in[0] = _mm_mul_ss(g, _mm_load_ss(&s[0][n])); + for (i = 1; i < n_src; i++) { + g = _mm_set_ss(gain[i]); + in[0] = _mm_add_ss(in[0], _mm_mul_ss(g, _mm_load_ss(&s[i][n]))); + } + _mm_store_ss(&d[n], in[0]); + } +} + + void dsp_mix_gain_avx(void *obj, - void * SPA_RESTRICT dst, - const void * SPA_RESTRICT src[], - float gain[], uint32_t n_src, uint32_t n_samples) + float * SPA_RESTRICT dst, + const float * SPA_RESTRICT src[], uint32_t n_src, + float gain[], uint32_t n_gain, uint32_t n_samples) { if (n_src == 0) { memset(dst, 0, n_samples * sizeof(float)); @@ -27,51 +173,12 @@ void dsp_mix_gain_avx(void *obj, if (dst != src[0]) spa_memcpy(dst, src[0], n_samples * sizeof(float)); } else { - uint32_t n, i, unrolled; - __m256 in[4], g; - const float **s = (const float **)src; - float *d = dst; - - if (SPA_LIKELY(SPA_IS_ALIGNED(dst, 32))) { - unrolled = n_samples & ~31; - for (i = 0; i < n_src; i++) { - if (SPA_UNLIKELY(!SPA_IS_ALIGNED(src[i], 32))) { - unrolled = 0; - break; - } - } - } else - unrolled = 0; - - for (n = 0; n < unrolled; n += 32) { - g = _mm256_set1_ps(gain[0]); - in[0] = _mm256_mul_ps(g, _mm256_load_ps(&s[0][n+ 0])); - in[1] = _mm256_mul_ps(g, _mm256_load_ps(&s[0][n+ 8])); - in[2] = _mm256_mul_ps(g, _mm256_load_ps(&s[0][n+16])); - in[3] = _mm256_mul_ps(g, _mm256_load_ps(&s[0][n+24])); - - for (i = 1; i < n_src; i++) { - g = _mm256_set1_ps(gain[i]); - in[0] = _mm256_add_ps(in[0], _mm256_mul_ps(g, _mm256_load_ps(&s[i][n+ 0]))); - in[1] = _mm256_add_ps(in[1], _mm256_mul_ps(g, _mm256_load_ps(&s[i][n+ 8]))); - in[2] = _mm256_add_ps(in[2], _mm256_mul_ps(g, _mm256_load_ps(&s[i][n+16]))); - in[3] = _mm256_add_ps(in[3], _mm256_mul_ps(g, _mm256_load_ps(&s[i][n+24]))); - } - _mm256_store_ps(&d[n+ 0], in[0]); - _mm256_store_ps(&d[n+ 8], in[1]); - _mm256_store_ps(&d[n+16], in[2]); - _mm256_store_ps(&d[n+24], in[3]); - } - for (; n < n_samples; n++) { - __m128 in[1], g; - g = _mm_set_ss(gain[0]); - in[0] = _mm_mul_ss(g, _mm_load_ss(&s[0][n])); - for (i = 1; i < n_src; i++) { - g = _mm_set_ss(gain[i]); - in[0] = _mm_add_ss(in[0], _mm_mul_ss(g, _mm_load_ss(&s[i][n]))); - } - _mm_store_ss(&d[n], in[0]); - } + if (n_gain == 0) + dsp_add_avx(obj, dst, src, n_src, n_samples); + else if (n_gain < n_src) + dsp_add_1_gain_avx(obj, dst, src, n_src, gain[0], n_samples); + else + dsp_add_n_gain_avx(obj, dst, src, n_src, gain, n_gain, n_samples); } } diff --git a/spa/plugins/filter-graph/audio-dsp-c.c b/spa/plugins/filter-graph/audio-dsp-c.c index ab018c0e0..cd6a8b5a6 100644 --- a/spa/plugins/filter-graph/audio-dsp-c.c +++ b/spa/plugins/filter-graph/audio-dsp-c.c @@ -18,20 +18,20 @@ #endif #include "audio-dsp-impl.h" -void dsp_clear_c(void *obj, void * SPA_RESTRICT dst, uint32_t n_samples) +void dsp_clear_c(void *obj, float * SPA_RESTRICT dst, uint32_t n_samples) { memset(dst, 0, sizeof(float) * n_samples); } -void dsp_copy_c(void *obj, void * SPA_RESTRICT dst, - const void * SPA_RESTRICT src, uint32_t n_samples) +void dsp_copy_c(void *obj, float * SPA_RESTRICT dst, + const float * SPA_RESTRICT src, uint32_t n_samples) { if (dst != src) spa_memcpy(dst, src, sizeof(float) * n_samples); } -static inline void dsp_add_c(void *obj, void * SPA_RESTRICT dst, - const void * SPA_RESTRICT src, uint32_t n_samples) +static inline void dsp_add_c(void *obj, float * SPA_RESTRICT dst, + const float * SPA_RESTRICT src, uint32_t n_samples) { uint32_t i; const float *s = src; @@ -40,8 +40,8 @@ static inline void dsp_add_c(void *obj, void * SPA_RESTRICT dst, d[i] += s[i]; } -static inline void dsp_gain_c(void *obj, void * SPA_RESTRICT dst, - const void * SPA_RESTRICT src, float gain, uint32_t n_samples) +static inline void dsp_gain_c(void *obj, float * dst, + const float * src, float gain, uint32_t n_samples) { uint32_t i; const float *s = src; @@ -56,8 +56,8 @@ static inline void dsp_gain_c(void *obj, void * SPA_RESTRICT dst, } } -static inline void dsp_gain_add_c(void *obj, void * SPA_RESTRICT dst, - const void * SPA_RESTRICT src, float gain, uint32_t n_samples) +static inline void dsp_gain_add_c(void *obj, float * SPA_RESTRICT dst, + const float * SPA_RESTRICT src, float gain, uint32_t n_samples) { uint32_t i; const float *s = src; @@ -75,22 +75,30 @@ static inline void dsp_gain_add_c(void *obj, void * SPA_RESTRICT dst, void dsp_mix_gain_c(void *obj, - void * SPA_RESTRICT dst, - const void * SPA_RESTRICT src[], - float gain[], uint32_t n_src, uint32_t n_samples) + float * SPA_RESTRICT dst, + const float * SPA_RESTRICT src[], uint32_t n_src, + float gain[], uint32_t n_gain, uint32_t n_samples) { uint32_t i; if (n_src == 0) { dsp_clear_c(obj, dst, n_samples); } else { - dsp_gain_c(obj, dst, src[0], gain[0], n_samples); - for (i = 1; i < n_src; i++) - dsp_gain_add_c(obj, dst, src[i], gain[i], n_samples); + if (n_gain < n_src) { + dsp_copy_c(obj, dst, src[0], n_samples); + for (i = 1; i < n_src; i++) + dsp_add_c(obj, dst, src[i], n_samples); + if (n_gain > 0) + dsp_gain_c(obj, dst, dst, gain[0], n_samples); + } else { + dsp_gain_c(obj, dst, src[0], gain[0], n_samples); + for (i = 1; i < n_src; i++) + dsp_gain_add_c(obj, dst, src[i], gain[i], n_samples); + } } } -static inline void dsp_mult1_c(void *obj, void * SPA_RESTRICT dst, - const void * SPA_RESTRICT src, uint32_t n_samples) +static inline void dsp_mult1_c(void *obj, float * SPA_RESTRICT dst, + const float * SPA_RESTRICT src, uint32_t n_samples) { uint32_t i; const float *s = src; @@ -100,8 +108,8 @@ static inline void dsp_mult1_c(void *obj, void * SPA_RESTRICT dst, } void dsp_mult_c(void *obj, - void * SPA_RESTRICT dst, - const void * SPA_RESTRICT src[], + float * SPA_RESTRICT dst, + const float * SPA_RESTRICT src[], uint32_t n_src, uint32_t n_samples) { uint32_t i; diff --git a/spa/plugins/filter-graph/audio-dsp-impl.h b/spa/plugins/filter-graph/audio-dsp-impl.h index 8d1a86475..388a7453c 100644 --- a/spa/plugins/filter-graph/audio-dsp-impl.h +++ b/spa/plugins/filter-graph/audio-dsp-impl.h @@ -11,13 +11,13 @@ struct spa_fga_dsp * spa_fga_dsp_new(uint32_t cpu_flags); void spa_fga_dsp_free(struct spa_fga_dsp *dsp); #define MAKE_CLEAR_FUNC(arch) \ -void dsp_clear_##arch(void *obj, void * SPA_RESTRICT dst, uint32_t n_samples) +void dsp_clear_##arch(void *obj, float * SPA_RESTRICT dst, uint32_t n_samples) #define MAKE_COPY_FUNC(arch) \ -void dsp_copy_##arch(void *obj, void * SPA_RESTRICT dst, \ - const void * SPA_RESTRICT src, uint32_t n_samples) +void dsp_copy_##arch(void *obj, float * SPA_RESTRICT dst, \ + const float * SPA_RESTRICT src, uint32_t n_samples) #define MAKE_MIX_GAIN_FUNC(arch) \ -void dsp_mix_gain_##arch(void *obj, void * SPA_RESTRICT dst, \ - const void * SPA_RESTRICT src[], float gain[], uint32_t n_src, uint32_t n_samples) +void dsp_mix_gain_##arch(void *obj, float * SPA_RESTRICT dst, \ + const float * SPA_RESTRICT src[], uint32_t n_src, float gain[], uint32_t n_gain, uint32_t n_samples) #define MAKE_SUM_FUNC(arch) \ void dsp_sum_##arch (void *obj, float * SPA_RESTRICT dst, \ const float * SPA_RESTRICT a, const float * SPA_RESTRICT b, uint32_t n_samples) @@ -25,8 +25,8 @@ void dsp_sum_##arch (void *obj, float * SPA_RESTRICT dst, \ void dsp_linear_##arch (void *obj, float * SPA_RESTRICT dst, \ const float * SPA_RESTRICT src, const float mult, const float add, uint32_t n_samples) #define MAKE_MULT_FUNC(arch) \ -void dsp_mult_##arch(void *obj, void * SPA_RESTRICT dst, \ - const void * SPA_RESTRICT src[], uint32_t n_src, uint32_t n_samples) +void dsp_mult_##arch(void *obj, float * SPA_RESTRICT dst, \ + const float * SPA_RESTRICT src[], uint32_t n_src, uint32_t n_samples) #define MAKE_BIQUAD_RUN_FUNC(arch) \ void dsp_biquad_run_##arch (void *obj, struct biquad *bq, uint32_t n_bq, uint32_t bq_stride, \ float * SPA_RESTRICT out[], const float * SPA_RESTRICT in[], uint32_t n_src, uint32_t n_samples) diff --git a/spa/plugins/filter-graph/audio-dsp-sse.c b/spa/plugins/filter-graph/audio-dsp-sse.c index deb3fc668..35748c1a2 100644 --- a/spa/plugins/filter-graph/audio-dsp-sse.c +++ b/spa/plugins/filter-graph/audio-dsp-sse.c @@ -19,10 +19,153 @@ #include +static void dsp_add_sse(void *obj, float *dst, const float * SPA_RESTRICT src[], + uint32_t n_src, uint32_t n_samples) +{ + uint32_t n, i, unrolled; + __m128 in[4]; + const float **s = (const float **)src; + float *d = dst; + + if (SPA_LIKELY(SPA_IS_ALIGNED(dst, 16))) { + unrolled = n_samples & ~15; + for (i = 0; i < n_src; i++) { + if (SPA_UNLIKELY(!SPA_IS_ALIGNED(src[i], 16))) { + unrolled = 0; + break; + } + } + } else + unrolled = 0; + + for (n = 0; n < unrolled; n += 16) { + in[0] = _mm_load_ps(&s[0][n+ 0]); + in[1] = _mm_load_ps(&s[0][n+ 4]); + in[2] = _mm_load_ps(&s[0][n+ 8]); + in[3] = _mm_load_ps(&s[0][n+12]); + + for (i = 1; i < n_src; i++) { + in[0] = _mm_add_ps(in[0], _mm_load_ps(&s[i][n+ 0])); + in[1] = _mm_add_ps(in[1], _mm_load_ps(&s[i][n+ 4])); + in[2] = _mm_add_ps(in[2], _mm_load_ps(&s[i][n+ 8])); + in[3] = _mm_add_ps(in[3], _mm_load_ps(&s[i][n+12])); + } + _mm_store_ps(&d[n+ 0], in[0]); + _mm_store_ps(&d[n+ 4], in[1]); + _mm_store_ps(&d[n+ 8], in[2]); + _mm_store_ps(&d[n+12], in[3]); + } + for (; n < n_samples; n++) { + in[0] = _mm_load_ss(&s[0][n]); + for (i = 1; i < n_src; i++) + in[0] = _mm_add_ss(in[0], _mm_load_ss(&s[i][n])); + _mm_store_ss(&d[n], in[0]); + } +} + +static void dsp_add_1_gain_sse(void *obj, + float * SPA_RESTRICT dst, + const float * SPA_RESTRICT src[], uint32_t n_src, + float gain, uint32_t n_samples) +{ + uint32_t n, i, unrolled; + __m128 in[4], g; + const float **s = (const float **)src; + float *d = dst; + + if (SPA_LIKELY(SPA_IS_ALIGNED(dst, 16))) { + unrolled = n_samples & ~15; + for (i = 0; i < n_src; i++) { + if (SPA_UNLIKELY(!SPA_IS_ALIGNED(src[i], 16))) { + unrolled = 0; + break; + } + } + } else + unrolled = 0; + + g = _mm_set1_ps(gain); + + for (n = 0; n < unrolled; n += 16) { + in[0] = _mm_load_ps(&s[0][n+ 0]); + in[1] = _mm_load_ps(&s[0][n+ 4]); + in[2] = _mm_load_ps(&s[0][n+ 8]); + in[3] = _mm_load_ps(&s[0][n+12]); + + for (i = 1; i < n_src; i++) { + in[0] = _mm_add_ps(in[0], _mm_load_ps(&s[i][n+ 0])); + in[1] = _mm_add_ps(in[1], _mm_load_ps(&s[i][n+ 4])); + in[2] = _mm_add_ps(in[2], _mm_load_ps(&s[i][n+ 8])); + in[3] = _mm_add_ps(in[3], _mm_load_ps(&s[i][n+12])); + } + _mm_store_ps(&d[n+ 0], _mm_mul_ps(in[0], g)); + _mm_store_ps(&d[n+ 4], _mm_mul_ps(in[1], g)); + _mm_store_ps(&d[n+ 8], _mm_mul_ps(in[2], g)); + _mm_store_ps(&d[n+12], _mm_mul_ps(in[3], g)); + } + for (; n < n_samples; n++) { + in[0] = _mm_load_ss(&s[0][n]); + for (i = 1; i < n_src; i++) + in[0] = _mm_add_ss(in[0], _mm_load_ss(&s[i][n])); + _mm_store_ss(&d[n], _mm_mul_ss(in[0], g)); + } +} + +static void dsp_add_n_gain_sse(void *obj, + float * SPA_RESTRICT dst, + const float * SPA_RESTRICT src[], uint32_t n_src, + float gain[], uint32_t n_gain, uint32_t n_samples) +{ + uint32_t n, i, unrolled; + __m128 in[4], g; + const float **s = (const float **)src; + float *d = dst; + + if (SPA_LIKELY(SPA_IS_ALIGNED(dst, 16))) { + unrolled = n_samples & ~15; + for (i = 0; i < n_src; i++) { + if (SPA_UNLIKELY(!SPA_IS_ALIGNED(src[i], 16))) { + unrolled = 0; + break; + } + } + } else + unrolled = 0; + + for (n = 0; n < unrolled; n += 16) { + g = _mm_set1_ps(gain[0]); + in[0] = _mm_mul_ps(g, _mm_load_ps(&s[0][n+ 0])); + in[1] = _mm_mul_ps(g, _mm_load_ps(&s[0][n+ 4])); + in[2] = _mm_mul_ps(g, _mm_load_ps(&s[0][n+ 8])); + in[3] = _mm_mul_ps(g, _mm_load_ps(&s[0][n+12])); + + for (i = 1; i < n_src; i++) { + g = _mm_set1_ps(gain[i]); + in[0] = _mm_add_ps(in[0], _mm_mul_ps(g, _mm_load_ps(&s[i][n+ 0]))); + in[1] = _mm_add_ps(in[1], _mm_mul_ps(g, _mm_load_ps(&s[i][n+ 4]))); + in[2] = _mm_add_ps(in[2], _mm_mul_ps(g, _mm_load_ps(&s[i][n+ 8]))); + in[3] = _mm_add_ps(in[3], _mm_mul_ps(g, _mm_load_ps(&s[i][n+12]))); + } + _mm_store_ps(&d[n+ 0], in[0]); + _mm_store_ps(&d[n+ 4], in[1]); + _mm_store_ps(&d[n+ 8], in[2]); + _mm_store_ps(&d[n+12], in[3]); + } + for (; n < n_samples; n++) { + g = _mm_set_ss(gain[0]); + in[0] = _mm_mul_ss(g, _mm_load_ss(&s[0][n])); + for (i = 1; i < n_src; i++) { + g = _mm_set_ss(gain[i]); + in[0] = _mm_add_ss(in[0], _mm_mul_ss(g, _mm_load_ss(&s[i][n]))); + } + _mm_store_ss(&d[n], in[0]); + } +} + void dsp_mix_gain_sse(void *obj, - void * SPA_RESTRICT dst, - const void * SPA_RESTRICT src[], - float gain[], uint32_t n_src, uint32_t n_samples) + float * SPA_RESTRICT dst, + const float * SPA_RESTRICT src[], uint32_t n_src, + float gain[], uint32_t n_gain, uint32_t n_samples) { if (n_src == 0) { memset(dst, 0, n_samples * sizeof(float)); @@ -30,50 +173,12 @@ void dsp_mix_gain_sse(void *obj, if (dst != src[0]) spa_memcpy(dst, src[0], n_samples * sizeof(float)); } else { - uint32_t n, i, unrolled; - __m128 in[4], g; - const float **s = (const float **)src; - float *d = dst; - - if (SPA_LIKELY(SPA_IS_ALIGNED(dst, 16))) { - unrolled = n_samples & ~15; - for (i = 0; i < n_src; i++) { - if (SPA_UNLIKELY(!SPA_IS_ALIGNED(src[i], 16))) { - unrolled = 0; - break; - } - } - } else - unrolled = 0; - - for (n = 0; n < unrolled; n += 16) { - g = _mm_set1_ps(gain[0]); - in[0] = _mm_mul_ps(g, _mm_load_ps(&s[0][n+ 0])); - in[1] = _mm_mul_ps(g, _mm_load_ps(&s[0][n+ 4])); - in[2] = _mm_mul_ps(g, _mm_load_ps(&s[0][n+ 8])); - in[3] = _mm_mul_ps(g, _mm_load_ps(&s[0][n+12])); - - for (i = 1; i < n_src; i++) { - g = _mm_set1_ps(gain[i]); - in[0] = _mm_add_ps(in[0], _mm_mul_ps(g, _mm_load_ps(&s[i][n+ 0]))); - in[1] = _mm_add_ps(in[1], _mm_mul_ps(g, _mm_load_ps(&s[i][n+ 4]))); - in[2] = _mm_add_ps(in[2], _mm_mul_ps(g, _mm_load_ps(&s[i][n+ 8]))); - in[3] = _mm_add_ps(in[3], _mm_mul_ps(g, _mm_load_ps(&s[i][n+12]))); - } - _mm_store_ps(&d[n+ 0], in[0]); - _mm_store_ps(&d[n+ 4], in[1]); - _mm_store_ps(&d[n+ 8], in[2]); - _mm_store_ps(&d[n+12], in[3]); - } - for (; n < n_samples; n++) { - g = _mm_set_ss(gain[0]); - in[0] = _mm_mul_ss(g, _mm_load_ss(&s[0][n])); - for (i = 1; i < n_src; i++) { - g = _mm_set_ss(gain[i]); - in[0] = _mm_add_ss(in[0], _mm_mul_ss(g, _mm_load_ss(&s[i][n]))); - } - _mm_store_ss(&d[n], in[0]); - } + if (n_gain == 0) + dsp_add_sse(obj, dst, src, n_src, n_samples); + else if (n_gain < n_src) + dsp_add_1_gain_sse(obj, dst, src, n_src, gain[0], n_samples); + else + dsp_add_n_gain_sse(obj, dst, src, n_src, gain, n_gain, n_samples); } } diff --git a/spa/plugins/filter-graph/audio-dsp.h b/spa/plugins/filter-graph/audio-dsp.h index c8b338fc3..4fc06eb78 100644 --- a/spa/plugins/filter-graph/audio-dsp.h +++ b/spa/plugins/filter-graph/audio-dsp.h @@ -22,14 +22,14 @@ struct spa_fga_dsp_methods { #define SPA_VERSION_FGA_DSP_METHODS 0 uint32_t version; - void (*clear) (void *obj, void * SPA_RESTRICT dst, uint32_t n_samples); + void (*clear) (void *obj, float * SPA_RESTRICT dst, uint32_t n_samples); void (*copy) (void *obj, - void * SPA_RESTRICT dst, - const void * SPA_RESTRICT src, uint32_t n_samples); + float * SPA_RESTRICT dst, + const float * SPA_RESTRICT src, uint32_t n_samples); void (*mix_gain) (void *obj, - void * SPA_RESTRICT dst, - const void * SPA_RESTRICT src[], - float gain[], uint32_t n_src, uint32_t n_samples); + float * SPA_RESTRICT dst, + const float * SPA_RESTRICT src[], uint32_t n_src, + float gain[], uint32_t n_gain, uint32_t n_samples); void (*sum) (void *obj, float * dst, const float * SPA_RESTRICT a, const float * SPA_RESTRICT b, uint32_t n_samples); @@ -52,8 +52,8 @@ struct spa_fga_dsp_methods { float * dst, const float * SPA_RESTRICT src, const float mult, const float add, uint32_t n_samples); void (*mult) (void *obj, - void * SPA_RESTRICT dst, - const void * SPA_RESTRICT src[], uint32_t n_src, uint32_t n_samples); + float * SPA_RESTRICT dst, + const float * SPA_RESTRICT src[], uint32_t n_src, uint32_t n_samples); void (*biquad_run) (void *obj, struct biquad *bq, uint32_t n_bq, uint32_t bq_stride, float * SPA_RESTRICT out[], const float * SPA_RESTRICT in[], uint32_t n_src, uint32_t n_samples); @@ -61,25 +61,25 @@ struct spa_fga_dsp_methods { float *dst, const float *src, uint32_t n_samples); }; -static inline void spa_fga_dsp_clear(struct spa_fga_dsp *obj, void * SPA_RESTRICT dst, uint32_t n_samples) +static inline void spa_fga_dsp_clear(struct spa_fga_dsp *obj, float * SPA_RESTRICT dst, uint32_t n_samples) { spa_api_method_v(spa_fga_dsp, &obj->iface, clear, 0, dst, n_samples); } static inline void spa_fga_dsp_copy(struct spa_fga_dsp *obj, - void * SPA_RESTRICT dst, - const void * SPA_RESTRICT src, uint32_t n_samples) + float * SPA_RESTRICT dst, + const float * SPA_RESTRICT src, uint32_t n_samples) { spa_api_method_v(spa_fga_dsp, &obj->iface, copy, 0, dst, src, n_samples); } static inline void spa_fga_dsp_mix_gain(struct spa_fga_dsp *obj, - void * SPA_RESTRICT dst, - const void * SPA_RESTRICT src[], - float gain[], uint32_t n_src, uint32_t n_samples) + float * SPA_RESTRICT dst, + const float * SPA_RESTRICT src[], uint32_t n_src, + float gain[], uint32_t n_gain, uint32_t n_samples) { spa_api_method_v(spa_fga_dsp, &obj->iface, mix_gain, 0, - dst, src, gain, n_src, n_samples); + dst, src, n_src, gain, n_gain, n_samples); } static inline void spa_fga_dsp_sum(struct spa_fga_dsp *obj, float * dst, const float * SPA_RESTRICT a, @@ -143,8 +143,8 @@ static inline void spa_fga_dsp_linear(struct spa_fga_dsp *obj, dst, src, mult, add, n_samples); } static inline void spa_fga_dsp_mult(struct spa_fga_dsp *obj, - void * SPA_RESTRICT dst, - const void * SPA_RESTRICT src[], uint32_t n_src, uint32_t n_samples) + float * SPA_RESTRICT dst, + const float * SPA_RESTRICT src[], uint32_t n_src, uint32_t n_samples) { spa_api_method_v(spa_fga_dsp, &obj->iface, mult, 0, dst, src, n_src, n_samples); diff --git a/spa/plugins/filter-graph/builtin_plugin.c b/spa/plugins/filter-graph/builtin_plugin.c index b36958175..e0ec3c2ad 100644 --- a/spa/plugins/filter-graph/builtin_plugin.c +++ b/spa/plugins/filter-graph/builtin_plugin.c @@ -121,8 +121,9 @@ static void mixer_run(void * Instance, unsigned long SampleCount) struct builtin *impl = Instance; int i, n_src = 0; float *out = impl->port[0]; - const void *src[8]; + const float *src[8]; float gains[8]; + bool eq_gain = true; if (out == NULL) return; @@ -136,8 +137,13 @@ static void mixer_run(void * Instance, unsigned long SampleCount) src[n_src] = in; gains[n_src++] = gain; + if (gain != gains[0]) + eq_gain = false; } - spa_fga_dsp_mix_gain(impl->dsp, out, src, gains, n_src, SampleCount); + if (eq_gain) + spa_fga_dsp_mix_gain(impl->dsp, out, src, n_src, gains, 1, SampleCount); + else + spa_fga_dsp_mix_gain(impl->dsp, out, src, n_src, gains, n_src, SampleCount); } static struct spa_fga_port mixer_ports[] = { @@ -1589,7 +1595,7 @@ static void mult_run(void * Instance, unsigned long SampleCount) struct builtin *impl = Instance; int i, n_src = 0; float *out = impl->port[0]; - const void *src[8]; + const float *src[8]; if (out == NULL) return;