From 5075f27ea09eb07999eef6a00e9323e1304ba76b Mon Sep 17 00:00:00 2001 From: Wim Taymans Date: Fri, 10 Apr 2026 11:57:09 +0200 Subject: [PATCH] filter-graph: small convolver optimizations Use FMA when we can, make sure FMA compilation is supported and the CPU also supports it at runtime. Avoid divisions by doing the modulo increment more explicitly. --- spa/plugins/filter-graph/audio-dsp-avx2.c | 19 ++++++++----------- spa/plugins/filter-graph/audio-dsp.c | 2 +- spa/plugins/filter-graph/convolver.c | 13 +++++++++---- spa/plugins/filter-graph/meson.build | 10 +++++----- 4 files changed, 23 insertions(+), 21 deletions(-) diff --git a/spa/plugins/filter-graph/audio-dsp-avx2.c b/spa/plugins/filter-graph/audio-dsp-avx2.c index 76c7b17d5..346b26ab3 100644 --- a/spa/plugins/filter-graph/audio-dsp-avx2.c +++ b/spa/plugins/filter-graph/audio-dsp-avx2.c @@ -140,10 +140,10 @@ static void dsp_add_n_gain_avx2(void *obj, float *dst, for (i = 1; i < n_src; i++) { g = _mm256_set1_ps(gain[i]); - in[0] = _mm256_add_ps(in[0], _mm256_mul_ps(g, _mm256_load_ps(&s[i][n+ 0]))); - in[1] = _mm256_add_ps(in[1], _mm256_mul_ps(g, _mm256_load_ps(&s[i][n+ 8]))); - in[2] = _mm256_add_ps(in[2], _mm256_mul_ps(g, _mm256_load_ps(&s[i][n+16]))); - in[3] = _mm256_add_ps(in[3], _mm256_mul_ps(g, _mm256_load_ps(&s[i][n+24]))); + in[0] = _mm256_fmadd_ps(g, _mm256_load_ps(&s[i][n+ 0]), in[0]); + in[1] = _mm256_fmadd_ps(g, _mm256_load_ps(&s[i][n+ 8]), in[1]); + in[2] = _mm256_fmadd_ps(g, _mm256_load_ps(&s[i][n+16]), in[2]); + in[3] = _mm256_fmadd_ps(g, _mm256_load_ps(&s[i][n+24]), in[3]); } _mm256_store_ps(&d[n+ 0], in[0]); _mm256_store_ps(&d[n+ 8], in[1]); @@ -237,13 +237,12 @@ void dsp_sum_avx2(void *obj, float *r, const float *a, const float *b, uint32_t inline static __m256 _mm256_mul_pz(__m256 ab, __m256 cd) { - __m256 aa, bb, dc, x0, x1; + __m256 aa, bb, dc, x1; aa = _mm256_moveldup_ps(ab); bb = _mm256_movehdup_ps(ab); - x0 = _mm256_mul_ps(aa, cd); dc = _mm256_shuffle_ps(cd, cd, _MM_SHUFFLE(2,3,0,1)); x1 = _mm256_mul_ps(bb, dc); - return _mm256_addsub_ps(x0, x1); + return _mm256_fmaddsub_ps(aa, cd, x1); } void dsp_fft_cmul_avx2(void *obj, void *fft, @@ -308,12 +307,10 @@ void dsp_fft_cmuladd_avx2(void *obj, void *fft, bb[1] = _mm256_load_ps(&b[2*i+8]); /* br2 bi2 br3 bi3 */ dd[0] = _mm256_mul_pz(aa[0], bb[0]); dd[1] = _mm256_mul_pz(aa[1], bb[1]); - dd[0] = _mm256_mul_ps(dd[0], s); - dd[1] = _mm256_mul_ps(dd[1], s); t[0] = _mm256_load_ps(&src[2*i]); t[1] = _mm256_load_ps(&src[2*i+8]); - t[0] = _mm256_add_ps(t[0], dd[0]); - t[1] = _mm256_add_ps(t[1], dd[1]); + t[0] = _mm256_fmadd_ps(dd[0], s, t[0]); + t[1] = _mm256_fmadd_ps(dd[1], s, t[1]); _mm256_store_ps(&dst[2*i], t[0]); _mm256_store_ps(&dst[2*i+8], t[1]); } diff --git a/spa/plugins/filter-graph/audio-dsp.c b/spa/plugins/filter-graph/audio-dsp.c index 133b53db5..d0c4ef008 100644 --- a/spa/plugins/filter-graph/audio-dsp.c +++ b/spa/plugins/filter-graph/audio-dsp.c @@ -24,7 +24,7 @@ struct dsp_info { static const struct dsp_info dsp_table[] = { #if defined (HAVE_AVX2) - { SPA_CPU_FLAG_AVX2, + { SPA_CPU_FLAG_AVX2 | SPA_CPU_FLAG_FMA3, .funcs.clear = dsp_clear_c, .funcs.copy = dsp_copy_c, .funcs.mix_gain = dsp_mix_gain_avx2, diff --git a/spa/plugins/filter-graph/convolver.c b/spa/plugins/filter-graph/convolver.c index a077c6ec1..788b118e3 100644 --- a/spa/plugins/filter-graph/convolver.c +++ b/spa/plugins/filter-graph/convolver.c @@ -171,7 +171,10 @@ static int convolver1_run(struct spa_fga_dsp *dsp, struct convolver1 *conv, cons if (conv->segCount > 1) { if (inputBufferFill == 0) { - int indexAudio = (conv->current + 1) % conv->segCount; + int indexAudio = conv->current; + + if (++indexAudio == conv->segCount) + indexAudio = 0; spa_fga_dsp_fft_cmul(dsp, conv->fft, conv->pre_mult, conv->segmentsIr[1], @@ -179,7 +182,8 @@ static int convolver1_run(struct spa_fga_dsp *dsp, struct convolver1 *conv, cons conv->fftComplexSize, conv->scale); for (i = 2; i < conv->segCount; i++) { - indexAudio = (conv->current + i) % conv->segCount; + if (++indexAudio == conv->segCount) + indexAudio = 0; spa_fga_dsp_fft_cmuladd(dsp, conv->fft, conv->pre_mult, @@ -214,9 +218,10 @@ static int convolver1_run(struct spa_fga_dsp *dsp, struct convolver1 *conv, cons SPA_SWAP(conv->fft_buffer[0], conv->fft_buffer[1]); - conv->current = (conv->current > 0) ? (conv->current - 1) : (conv->segCount - 1); + if (conv->current == 0) + conv->current = conv->segCount; + conv->current--; } - processed += processing; } conv->inputBufferFill = inputBufferFill; diff --git a/spa/plugins/filter-graph/meson.build b/spa/plugins/filter-graph/meson.build index 94ee0bd25..20b90f4c4 100644 --- a/spa/plugins/filter-graph/meson.build +++ b/spa/plugins/filter-graph/meson.build @@ -18,16 +18,16 @@ if have_sse simd_cargs += ['-DHAVE_SSE'] simd_dependencies += filter_graph_sse endif -if have_avx2 - filter_graph_avx2 = static_library('filter_graph_avx2', +if have_avx2 and have_fma + filter_graph_avx2_fma = static_library('filter_graph_avx2_fma', ['audio-dsp-avx2.c' ], include_directories : [configinc], - c_args : [avx2_args, fma_args,'-O3', '-DHAVE_AVX2'], + c_args : [avx2_args, fma_args, '-O3', '-DHAVE_AVX2', '-DHAVE_FMA'], dependencies : [ spa_dep ], install : false ) - simd_cargs += ['-DHAVE_AVX2'] - simd_dependencies += filter_graph_avx2 + simd_cargs += ['-DHAVE_AVX2', '-DHAVE_FMA'] + simd_dependencies += filter_graph_avx2_fma endif if have_neon filter_graph_neon = static_library('filter_graph_neon',