diff --git a/spa/plugins/filter-graph/audio-dsp-avx2.c b/spa/plugins/filter-graph/audio-dsp-avx2.c index 76c7b17d5..346b26ab3 100644 --- a/spa/plugins/filter-graph/audio-dsp-avx2.c +++ b/spa/plugins/filter-graph/audio-dsp-avx2.c @@ -140,10 +140,10 @@ static void dsp_add_n_gain_avx2(void *obj, float *dst, for (i = 1; i < n_src; i++) { g = _mm256_set1_ps(gain[i]); - in[0] = _mm256_add_ps(in[0], _mm256_mul_ps(g, _mm256_load_ps(&s[i][n+ 0]))); - in[1] = _mm256_add_ps(in[1], _mm256_mul_ps(g, _mm256_load_ps(&s[i][n+ 8]))); - in[2] = _mm256_add_ps(in[2], _mm256_mul_ps(g, _mm256_load_ps(&s[i][n+16]))); - in[3] = _mm256_add_ps(in[3], _mm256_mul_ps(g, _mm256_load_ps(&s[i][n+24]))); + in[0] = _mm256_fmadd_ps(g, _mm256_load_ps(&s[i][n+ 0]), in[0]); + in[1] = _mm256_fmadd_ps(g, _mm256_load_ps(&s[i][n+ 8]), in[1]); + in[2] = _mm256_fmadd_ps(g, _mm256_load_ps(&s[i][n+16]), in[2]); + in[3] = _mm256_fmadd_ps(g, _mm256_load_ps(&s[i][n+24]), in[3]); } _mm256_store_ps(&d[n+ 0], in[0]); _mm256_store_ps(&d[n+ 8], in[1]); @@ -237,13 +237,12 @@ void dsp_sum_avx2(void *obj, float *r, const float *a, const float *b, uint32_t inline static __m256 _mm256_mul_pz(__m256 ab, __m256 cd) { - __m256 aa, bb, dc, x0, x1; + __m256 aa, bb, dc, x1; aa = _mm256_moveldup_ps(ab); bb = _mm256_movehdup_ps(ab); - x0 = _mm256_mul_ps(aa, cd); dc = _mm256_shuffle_ps(cd, cd, _MM_SHUFFLE(2,3,0,1)); x1 = _mm256_mul_ps(bb, dc); - return _mm256_addsub_ps(x0, x1); + return _mm256_fmaddsub_ps(aa, cd, x1); } void dsp_fft_cmul_avx2(void *obj, void *fft, @@ -308,12 +307,10 @@ void dsp_fft_cmuladd_avx2(void *obj, void *fft, bb[1] = _mm256_load_ps(&b[2*i+8]); /* br2 bi2 br3 bi3 */ dd[0] = _mm256_mul_pz(aa[0], bb[0]); dd[1] = _mm256_mul_pz(aa[1], bb[1]); - dd[0] = _mm256_mul_ps(dd[0], s); - dd[1] = _mm256_mul_ps(dd[1], s); t[0] = _mm256_load_ps(&src[2*i]); t[1] = _mm256_load_ps(&src[2*i+8]); - t[0] = _mm256_add_ps(t[0], dd[0]); - t[1] = _mm256_add_ps(t[1], dd[1]); + t[0] = _mm256_fmadd_ps(dd[0], s, t[0]); + t[1] = _mm256_fmadd_ps(dd[1], s, t[1]); _mm256_store_ps(&dst[2*i], t[0]); _mm256_store_ps(&dst[2*i+8], t[1]); } diff --git a/spa/plugins/filter-graph/audio-dsp.c b/spa/plugins/filter-graph/audio-dsp.c index 133b53db5..d0c4ef008 100644 --- a/spa/plugins/filter-graph/audio-dsp.c +++ b/spa/plugins/filter-graph/audio-dsp.c @@ -24,7 +24,7 @@ struct dsp_info { static const struct dsp_info dsp_table[] = { #if defined (HAVE_AVX2) - { SPA_CPU_FLAG_AVX2, + { SPA_CPU_FLAG_AVX2 | SPA_CPU_FLAG_FMA3, .funcs.clear = dsp_clear_c, .funcs.copy = dsp_copy_c, .funcs.mix_gain = dsp_mix_gain_avx2, diff --git a/spa/plugins/filter-graph/convolver.c b/spa/plugins/filter-graph/convolver.c index a077c6ec1..788b118e3 100644 --- a/spa/plugins/filter-graph/convolver.c +++ b/spa/plugins/filter-graph/convolver.c @@ -171,7 +171,10 @@ static int convolver1_run(struct spa_fga_dsp *dsp, struct convolver1 *conv, cons if (conv->segCount > 1) { if (inputBufferFill == 0) { - int indexAudio = (conv->current + 1) % conv->segCount; + int indexAudio = conv->current; + + if (++indexAudio == conv->segCount) + indexAudio = 0; spa_fga_dsp_fft_cmul(dsp, conv->fft, conv->pre_mult, conv->segmentsIr[1], @@ -179,7 +182,8 @@ static int convolver1_run(struct spa_fga_dsp *dsp, struct convolver1 *conv, cons conv->fftComplexSize, conv->scale); for (i = 2; i < conv->segCount; i++) { - indexAudio = (conv->current + i) % conv->segCount; + if (++indexAudio == conv->segCount) + indexAudio = 0; spa_fga_dsp_fft_cmuladd(dsp, conv->fft, conv->pre_mult, @@ -214,9 +218,10 @@ static int convolver1_run(struct spa_fga_dsp *dsp, struct convolver1 *conv, cons SPA_SWAP(conv->fft_buffer[0], conv->fft_buffer[1]); - conv->current = (conv->current > 0) ? (conv->current - 1) : (conv->segCount - 1); + if (conv->current == 0) + conv->current = conv->segCount; + conv->current--; } - processed += processing; } conv->inputBufferFill = inputBufferFill; diff --git a/spa/plugins/filter-graph/meson.build b/spa/plugins/filter-graph/meson.build index 94ee0bd25..20b90f4c4 100644 --- a/spa/plugins/filter-graph/meson.build +++ b/spa/plugins/filter-graph/meson.build @@ -18,16 +18,16 @@ if have_sse simd_cargs += ['-DHAVE_SSE'] simd_dependencies += filter_graph_sse endif -if have_avx2 - filter_graph_avx2 = static_library('filter_graph_avx2', +if have_avx2 and have_fma + filter_graph_avx2_fma = static_library('filter_graph_avx2_fma', ['audio-dsp-avx2.c' ], include_directories : [configinc], - c_args : [avx2_args, fma_args,'-O3', '-DHAVE_AVX2'], + c_args : [avx2_args, fma_args, '-O3', '-DHAVE_AVX2', '-DHAVE_FMA'], dependencies : [ spa_dep ], install : false ) - simd_cargs += ['-DHAVE_AVX2'] - simd_dependencies += filter_graph_avx2 + simd_cargs += ['-DHAVE_AVX2', '-DHAVE_FMA'] + simd_dependencies += filter_graph_avx2_fma endif if have_neon filter_graph_neon = static_library('filter_graph_neon',