filter-graph: small convolver optimizations

Use FMA when we can, make sure FMA compilation is supported and the CPU
also supports it at runtime.

Avoid divisions by doing the modulo increment more explicitly.
This commit is contained in:
Wim Taymans 2026-04-10 11:57:09 +02:00
parent c2f85ffc51
commit 5075f27ea0
4 changed files with 23 additions and 21 deletions

View file

@ -140,10 +140,10 @@ static void dsp_add_n_gain_avx2(void *obj, float *dst,
for (i = 1; i < n_src; i++) {
g = _mm256_set1_ps(gain[i]);
in[0] = _mm256_add_ps(in[0], _mm256_mul_ps(g, _mm256_load_ps(&s[i][n+ 0])));
in[1] = _mm256_add_ps(in[1], _mm256_mul_ps(g, _mm256_load_ps(&s[i][n+ 8])));
in[2] = _mm256_add_ps(in[2], _mm256_mul_ps(g, _mm256_load_ps(&s[i][n+16])));
in[3] = _mm256_add_ps(in[3], _mm256_mul_ps(g, _mm256_load_ps(&s[i][n+24])));
in[0] = _mm256_fmadd_ps(g, _mm256_load_ps(&s[i][n+ 0]), in[0]);
in[1] = _mm256_fmadd_ps(g, _mm256_load_ps(&s[i][n+ 8]), in[1]);
in[2] = _mm256_fmadd_ps(g, _mm256_load_ps(&s[i][n+16]), in[2]);
in[3] = _mm256_fmadd_ps(g, _mm256_load_ps(&s[i][n+24]), in[3]);
}
_mm256_store_ps(&d[n+ 0], in[0]);
_mm256_store_ps(&d[n+ 8], in[1]);
@ -237,13 +237,12 @@ void dsp_sum_avx2(void *obj, float *r, const float *a, const float *b, uint32_t
inline static __m256 _mm256_mul_pz(__m256 ab, __m256 cd)
{
__m256 aa, bb, dc, x0, x1;
__m256 aa, bb, dc, x1;
aa = _mm256_moveldup_ps(ab);
bb = _mm256_movehdup_ps(ab);
x0 = _mm256_mul_ps(aa, cd);
dc = _mm256_shuffle_ps(cd, cd, _MM_SHUFFLE(2,3,0,1));
x1 = _mm256_mul_ps(bb, dc);
return _mm256_addsub_ps(x0, x1);
return _mm256_fmaddsub_ps(aa, cd, x1);
}
void dsp_fft_cmul_avx2(void *obj, void *fft,
@ -308,12 +307,10 @@ void dsp_fft_cmuladd_avx2(void *obj, void *fft,
bb[1] = _mm256_load_ps(&b[2*i+8]); /* br2 bi2 br3 bi3 */
dd[0] = _mm256_mul_pz(aa[0], bb[0]);
dd[1] = _mm256_mul_pz(aa[1], bb[1]);
dd[0] = _mm256_mul_ps(dd[0], s);
dd[1] = _mm256_mul_ps(dd[1], s);
t[0] = _mm256_load_ps(&src[2*i]);
t[1] = _mm256_load_ps(&src[2*i+8]);
t[0] = _mm256_add_ps(t[0], dd[0]);
t[1] = _mm256_add_ps(t[1], dd[1]);
t[0] = _mm256_fmadd_ps(dd[0], s, t[0]);
t[1] = _mm256_fmadd_ps(dd[1], s, t[1]);
_mm256_store_ps(&dst[2*i], t[0]);
_mm256_store_ps(&dst[2*i+8], t[1]);
}

View file

@ -24,7 +24,7 @@ struct dsp_info {
static const struct dsp_info dsp_table[] =
{
#if defined (HAVE_AVX2)
{ SPA_CPU_FLAG_AVX2,
{ SPA_CPU_FLAG_AVX2 | SPA_CPU_FLAG_FMA3,
.funcs.clear = dsp_clear_c,
.funcs.copy = dsp_copy_c,
.funcs.mix_gain = dsp_mix_gain_avx2,

View file

@ -171,7 +171,10 @@ static int convolver1_run(struct spa_fga_dsp *dsp, struct convolver1 *conv, cons
if (conv->segCount > 1) {
if (inputBufferFill == 0) {
int indexAudio = (conv->current + 1) % conv->segCount;
int indexAudio = conv->current;
if (++indexAudio == conv->segCount)
indexAudio = 0;
spa_fga_dsp_fft_cmul(dsp, conv->fft, conv->pre_mult,
conv->segmentsIr[1],
@ -179,7 +182,8 @@ static int convolver1_run(struct spa_fga_dsp *dsp, struct convolver1 *conv, cons
conv->fftComplexSize, conv->scale);
for (i = 2; i < conv->segCount; i++) {
indexAudio = (conv->current + i) % conv->segCount;
if (++indexAudio == conv->segCount)
indexAudio = 0;
spa_fga_dsp_fft_cmuladd(dsp, conv->fft,
conv->pre_mult,
@ -214,9 +218,10 @@ static int convolver1_run(struct spa_fga_dsp *dsp, struct convolver1 *conv, cons
SPA_SWAP(conv->fft_buffer[0], conv->fft_buffer[1]);
conv->current = (conv->current > 0) ? (conv->current - 1) : (conv->segCount - 1);
if (conv->current == 0)
conv->current = conv->segCount;
conv->current--;
}
processed += processing;
}
conv->inputBufferFill = inputBufferFill;

View file

@ -18,16 +18,16 @@ if have_sse
simd_cargs += ['-DHAVE_SSE']
simd_dependencies += filter_graph_sse
endif
if have_avx2
filter_graph_avx2 = static_library('filter_graph_avx2',
if have_avx2 and have_fma
filter_graph_avx2_fma = static_library('filter_graph_avx2_fma',
['audio-dsp-avx2.c' ],
include_directories : [configinc],
c_args : [avx2_args, fma_args,'-O3', '-DHAVE_AVX2'],
c_args : [avx2_args, fma_args, '-O3', '-DHAVE_AVX2', '-DHAVE_FMA'],
dependencies : [ spa_dep ],
install : false
)
simd_cargs += ['-DHAVE_AVX2']
simd_dependencies += filter_graph_avx2
simd_cargs += ['-DHAVE_AVX2', '-DHAVE_FMA']
simd_dependencies += filter_graph_avx2_fma
endif
if have_neon
filter_graph_neon = static_library('filter_graph_neon',