From 5075f27ea09eb07999eef6a00e9323e1304ba76b Mon Sep 17 00:00:00 2001
From: Wim Taymans <wtaymans@redhat.com>
Date: Fri, 10 Apr 2026 11:57:09 +0200
Subject: [PATCH] filter-graph: small convolver optimizations

Use FMA when we can, make sure FMA compilation is supported and the CPU
also supports it at runtime.

Avoid divisions by doing the modulo increment more explicitly.
---
 spa/plugins/filter-graph/audio-dsp-avx2.c | 19 ++++++++-----------
 spa/plugins/filter-graph/audio-dsp.c      |  2 +-
 spa/plugins/filter-graph/convolver.c      | 13 +++++++++----
 spa/plugins/filter-graph/meson.build      | 10 +++++-----
 4 files changed, 23 insertions(+), 21 deletions(-)

diff --git a/spa/plugins/filter-graph/audio-dsp-avx2.c b/spa/plugins/filter-graph/audio-dsp-avx2.c
index 76c7b17d5..346b26ab3 100644
--- a/spa/plugins/filter-graph/audio-dsp-avx2.c
+++ b/spa/plugins/filter-graph/audio-dsp-avx2.c
@@ -140,10 +140,10 @@ static void dsp_add_n_gain_avx2(void *obj, float *dst,
 
 		for (i = 1; i < n_src; i++) {
 			g = _mm256_set1_ps(gain[i]);
-			in[0] = _mm256_add_ps(in[0], _mm256_mul_ps(g, _mm256_load_ps(&s[i][n+ 0])));
-			in[1] = _mm256_add_ps(in[1], _mm256_mul_ps(g, _mm256_load_ps(&s[i][n+ 8])));
-			in[2] = _mm256_add_ps(in[2], _mm256_mul_ps(g, _mm256_load_ps(&s[i][n+16])));
-			in[3] = _mm256_add_ps(in[3], _mm256_mul_ps(g, _mm256_load_ps(&s[i][n+24])));
+			in[0] = _mm256_fmadd_ps(g, _mm256_load_ps(&s[i][n+ 0]), in[0]);
+			in[1] = _mm256_fmadd_ps(g, _mm256_load_ps(&s[i][n+ 8]), in[1]);
+			in[2] = _mm256_fmadd_ps(g, _mm256_load_ps(&s[i][n+16]), in[2]);
+			in[3] = _mm256_fmadd_ps(g, _mm256_load_ps(&s[i][n+24]), in[3]);
 		}
 		_mm256_store_ps(&d[n+ 0], in[0]);
 		_mm256_store_ps(&d[n+ 8], in[1]);
@@ -237,13 +237,12 @@ void dsp_sum_avx2(void *obj, float *r, const float *a, const float *b, uint32_t
 
 inline static __m256 _mm256_mul_pz(__m256 ab, __m256 cd)
 {
-	__m256 aa, bb, dc, x0, x1;
+	__m256 aa, bb, dc, x1;
 	aa = _mm256_moveldup_ps(ab);
 	bb = _mm256_movehdup_ps(ab);
-	x0 = _mm256_mul_ps(aa, cd);
 	dc = _mm256_shuffle_ps(cd, cd, _MM_SHUFFLE(2,3,0,1));
 	x1 = _mm256_mul_ps(bb, dc);
-	return _mm256_addsub_ps(x0, x1);
+	return _mm256_fmaddsub_ps(aa, cd, x1);
 }
 
 void dsp_fft_cmul_avx2(void *obj, void *fft,
@@ -308,12 +307,10 @@ void dsp_fft_cmuladd_avx2(void *obj, void *fft,
 		bb[1] = _mm256_load_ps(&b[2*i+8]);	/* br2 bi2 br3 bi3 */
 		dd[0] = _mm256_mul_pz(aa[0], bb[0]);
 		dd[1] = _mm256_mul_pz(aa[1], bb[1]);
-		dd[0] = _mm256_mul_ps(dd[0], s);
-		dd[1] = _mm256_mul_ps(dd[1], s);
 		t[0] = _mm256_load_ps(&src[2*i]);
 		t[1] = _mm256_load_ps(&src[2*i+8]);
-		t[0] = _mm256_add_ps(t[0], dd[0]);
-		t[1] = _mm256_add_ps(t[1], dd[1]);
+		t[0] = _mm256_fmadd_ps(dd[0], s, t[0]);
+		t[1] = _mm256_fmadd_ps(dd[1], s, t[1]);
 		_mm256_store_ps(&dst[2*i], t[0]);
 		_mm256_store_ps(&dst[2*i+8], t[1]);
 	}
diff --git a/spa/plugins/filter-graph/audio-dsp.c b/spa/plugins/filter-graph/audio-dsp.c
index 133b53db5..d0c4ef008 100644
--- a/spa/plugins/filter-graph/audio-dsp.c
+++ b/spa/plugins/filter-graph/audio-dsp.c
@@ -24,7 +24,7 @@ struct dsp_info {
 static const struct dsp_info dsp_table[] =
 {
 #if defined (HAVE_AVX2)
-	{ SPA_CPU_FLAG_AVX2,
+	{ SPA_CPU_FLAG_AVX2 | SPA_CPU_FLAG_FMA3,
 		.funcs.clear = dsp_clear_c,
 		.funcs.copy = dsp_copy_c,
 		.funcs.mix_gain = dsp_mix_gain_avx2,
diff --git a/spa/plugins/filter-graph/convolver.c b/spa/plugins/filter-graph/convolver.c
index a077c6ec1..788b118e3 100644
--- a/spa/plugins/filter-graph/convolver.c
+++ b/spa/plugins/filter-graph/convolver.c
@@ -171,7 +171,10 @@ static int convolver1_run(struct spa_fga_dsp *dsp, struct convolver1 *conv, cons
 
 		if (conv->segCount > 1) {
 			if (inputBufferFill == 0) {
-				int indexAudio = (conv->current + 1) % conv->segCount;
+				int indexAudio = conv->current;
+
+				if (++indexAudio == conv->segCount)
+					indexAudio = 0;
 
 				spa_fga_dsp_fft_cmul(dsp, conv->fft, conv->pre_mult,
 						conv->segmentsIr[1],
@@ -179,7 +182,8 @@ static int convolver1_run(struct spa_fga_dsp *dsp, struct convolver1 *conv, cons
 						conv->fftComplexSize, conv->scale);
 
 				for (i = 2; i < conv->segCount; i++) {
-					indexAudio = (conv->current + i) % conv->segCount;
+					if (++indexAudio == conv->segCount)
+						indexAudio = 0;
 
 					spa_fga_dsp_fft_cmuladd(dsp, conv->fft,
 							conv->pre_mult,
@@ -214,9 +218,10 @@ static int convolver1_run(struct spa_fga_dsp *dsp, struct convolver1 *conv, cons
 
 			SPA_SWAP(conv->fft_buffer[0], conv->fft_buffer[1]);
 
-			conv->current = (conv->current > 0) ? (conv->current - 1) : (conv->segCount - 1);
+			if (conv->current == 0)
+				conv->current = conv->segCount;
+			conv->current--;
 		}
-
 		processed += processing;
 	}
 	conv->inputBufferFill = inputBufferFill;
diff --git a/spa/plugins/filter-graph/meson.build b/spa/plugins/filter-graph/meson.build
index 94ee0bd25..20b90f4c4 100644
--- a/spa/plugins/filter-graph/meson.build
+++ b/spa/plugins/filter-graph/meson.build
@@ -18,16 +18,16 @@ if have_sse
   simd_cargs += ['-DHAVE_SSE']
   simd_dependencies += filter_graph_sse
 endif
-if have_avx2
-  filter_graph_avx2 = static_library('filter_graph_avx2',
+if have_avx2 and have_fma
+  filter_graph_avx2_fma = static_library('filter_graph_avx2_fma',
     ['audio-dsp-avx2.c' ],
     include_directories : [configinc],
-    c_args : [avx2_args, fma_args,'-O3', '-DHAVE_AVX2'],
+    c_args : [avx2_args, fma_args, '-O3', '-DHAVE_AVX2', '-DHAVE_FMA'],
     dependencies : [ spa_dep ],
     install : false
     )
-  simd_cargs += ['-DHAVE_AVX2']
-  simd_dependencies += filter_graph_avx2
+  simd_cargs += ['-DHAVE_AVX2', '-DHAVE_FMA']
+  simd_dependencies += filter_graph_avx2_fma
 endif
 if have_neon
   filter_graph_neon = static_library('filter_graph_neon',