filter-graph: small convolver optimizations

Use FMA when we can, make sure FMA compilation is supported and the CPU also supports it at runtime. Avoid divisions by doing the modulo increment more explicitly.
2026-06-06 03:02:54 -04:00 · 2026-04-10 11:57:09 +02:00 · 2026-04-10 11:57:09 +02:00 · 5075f27ea0
commit 5075f27ea0
parent c2f85ffc51
4 changed files with 23 additions and 21 deletions
--- a/spa/plugins/filter-graph/audio-dsp-avx2.c
+++ b/spa/plugins/filter-graph/audio-dsp-avx2.c
@ -140,10 +140,10 @@ static void dsp_add_n_gain_avx2(void *obj, float *dst,
 		for (i = 1; i < n_src; i++) {
 			g = _mm256_set1_ps(gain[i]);
-			in[0] = _mm256_add_ps(in[0], _mm256_mul_ps(g, _mm256_load_ps(&s[i][n+ 0])));
+			in[0] = _mm256_fmadd_ps(g, _mm256_load_ps(&s[i][n+ 0]), in[0]);
-			in[1] = _mm256_add_ps(in[1], _mm256_mul_ps(g, _mm256_load_ps(&s[i][n+ 8])));
+			in[1] = _mm256_fmadd_ps(g, _mm256_load_ps(&s[i][n+ 8]), in[1]);
-			in[2] = _mm256_add_ps(in[2], _mm256_mul_ps(g, _mm256_load_ps(&s[i][n+16])));
+			in[2] = _mm256_fmadd_ps(g, _mm256_load_ps(&s[i][n+16]), in[2]);
-			in[3] = _mm256_add_ps(in[3], _mm256_mul_ps(g, _mm256_load_ps(&s[i][n+24])));
+			in[3] = _mm256_fmadd_ps(g, _mm256_load_ps(&s[i][n+24]), in[3]);
 		}
 		_mm256_store_ps(&d[n+ 0], in[0]);
 		_mm256_store_ps(&d[n+ 8], in[1]);
@ -237,13 +237,12 @@ void dsp_sum_avx2(void *obj, float *r, const float *a, const float *b, uint32_t
 inline static __m256 _mm256_mul_pz(__m256 ab, __m256 cd)
 {
-	__m256 aa, bb, dc, x0, x1;
+	__m256 aa, bb, dc, x1;
 	aa = _mm256_moveldup_ps(ab);
 	bb = _mm256_movehdup_ps(ab);
 	x0 = _mm256_mul_ps(aa, cd);
 	dc = _mm256_shuffle_ps(cd, cd, _MM_SHUFFLE(2,3,0,1));
 	x1 = _mm256_mul_ps(bb, dc);
-	return _mm256_addsub_ps(x0, x1);
+	return _mm256_fmaddsub_ps(aa, cd, x1);
 }
 void dsp_fft_cmul_avx2(void *obj, void *fft,
@ -308,12 +307,10 @@ void dsp_fft_cmuladd_avx2(void *obj, void *fft,
 		bb[1] = _mm256_load_ps(&b[2*i+8]);	/* br2 bi2 br3 bi3 */
 		dd[0] = _mm256_mul_pz(aa[0], bb[0]);
 		dd[1] = _mm256_mul_pz(aa[1], bb[1]);
 		dd[0] = _mm256_mul_ps(dd[0], s);
 		dd[1] = _mm256_mul_ps(dd[1], s);
 		t[0] = _mm256_load_ps(&src[2*i]);
 		t[1] = _mm256_load_ps(&src[2*i+8]);
-		t[0] = _mm256_add_ps(t[0], dd[0]);
+		t[0] = _mm256_fmadd_ps(dd[0], s, t[0]);
-		t[1] = _mm256_add_ps(t[1], dd[1]);
+		t[1] = _mm256_fmadd_ps(dd[1], s, t[1]);
 		_mm256_store_ps(&dst[2*i], t[0]);
 		_mm256_store_ps(&dst[2*i+8], t[1]);
 	}
--- a/spa/plugins/filter-graph/audio-dsp.c
+++ b/spa/plugins/filter-graph/audio-dsp.c
@ -24,7 +24,7 @@ struct dsp_info {
 static const struct dsp_info dsp_table[] =
 {
 #if defined (HAVE_AVX2)
-	{ SPA_CPU_FLAG_AVX2,
+	{ SPA_CPU_FLAG_AVX2 | SPA_CPU_FLAG_FMA3,
 		.funcs.clear = dsp_clear_c,
 		.funcs.copy = dsp_copy_c,
 		.funcs.mix_gain = dsp_mix_gain_avx2,
--- a/spa/plugins/filter-graph/convolver.c
+++ b/spa/plugins/filter-graph/convolver.c
@ -171,7 +171,10 @@ static int convolver1_run(struct spa_fga_dsp *dsp, struct convolver1 *conv, cons
 		if (conv->segCount > 1) {
 			if (inputBufferFill == 0) {
-				int indexAudio = (conv->current + 1) % conv->segCount;
+				int indexAudio = conv->current;
 				if (++indexAudio == conv->segCount)
 					indexAudio = 0;
 				spa_fga_dsp_fft_cmul(dsp, conv->fft, conv->pre_mult,
 						conv->segmentsIr[1],
@ -179,7 +182,8 @@ static int convolver1_run(struct spa_fga_dsp *dsp, struct convolver1 *conv, cons
 						conv->fftComplexSize, conv->scale);
 				for (i = 2; i < conv->segCount; i++) {
-					indexAudio = (conv->current + i) % conv->segCount;
+					if (++indexAudio == conv->segCount)
 						indexAudio = 0;
 					spa_fga_dsp_fft_cmuladd(dsp, conv->fft,
 							conv->pre_mult,
@ -214,9 +218,10 @@ static int convolver1_run(struct spa_fga_dsp *dsp, struct convolver1 *conv, cons
 			SPA_SWAP(conv->fft_buffer[0], conv->fft_buffer[1]);
-			conv->current = (conv->current > 0) ? (conv->current - 1) : (conv->segCount - 1);
+			if (conv->current == 0)
 				conv->current = conv->segCount;
 			conv->current--;
 		}
 		processed += processing;
 	}
 	conv->inputBufferFill = inputBufferFill;
--- a/spa/plugins/filter-graph/meson.build
+++ b/spa/plugins/filter-graph/meson.build
@ -18,16 +18,16 @@ if have_sse
  simd_cargs += ['-DHAVE_SSE']
  simd_dependencies += filter_graph_sse
 endif
-if have_avx2
+if have_avx2 and have_fma
-  filter_graph_avx2 = static_library('filter_graph_avx2',
+  filter_graph_avx2_fma = static_library('filter_graph_avx2_fma',
    ['audio-dsp-avx2.c' ],
    include_directories : [configinc],
-    c_args : [avx2_args, fma_args,'-O3', '-DHAVE_AVX2'],
+    c_args : [avx2_args, fma_args, '-O3', '-DHAVE_AVX2', '-DHAVE_FMA'],
    dependencies : [ spa_dep ],
    install : false
    )
-  simd_cargs += ['-DHAVE_AVX2']
+  simd_cargs += ['-DHAVE_AVX2', '-DHAVE_FMA']
-  simd_dependencies += filter_graph_avx2
+  simd_dependencies += filter_graph_avx2_fma
 endif
 if have_neon
  filter_graph_neon = static_library('filter_graph_neon',