From 7f8ce3570966fec43c6fc3e711189a34c73410f7 Mon Sep 17 00:00:00 2001 From: Wim Taymans Date: Fri, 18 Oct 2024 16:26:39 +0200 Subject: [PATCH] filter-chain: add support for fftw in the convolver It's faster than pffft. --- meson.build | 3 + src/modules/meson.build | 5 +- src/modules/module-filter-chain/dsp-ops-avx.c | 90 ++++++++++++++++ src/modules/module-filter-chain/dsp-ops-c.c | 62 +++++++++++ src/modules/module-filter-chain/dsp-ops-sse.c | 100 ++++++++++++++++++ src/modules/module-filter-chain/dsp-ops.c | 8 +- src/modules/module-filter-chain/dsp-ops.h | 4 + 7 files changed, 267 insertions(+), 5 deletions(-) diff --git a/meson.build b/meson.build index 62cf11c80..a33d2ed33 100644 --- a/meson.build +++ b/meson.build @@ -311,6 +311,9 @@ cdata.set('HAVE_DBUS', dbus_dep.found()) sdl_dep = dependency('sdl2', required : get_option('sdl2')) summary({'SDL2 (video examples)': sdl_dep.found()}, bool_yn: true, section: 'Misc dependencies') drm_dep = dependency('libdrm', required : false) +fftw_dep = dependency('fftw3f', required : false) +summary({'fftw3f (filter-chain convolver)': fftw_dep.found()}, bool_yn: true, section: 'Misc dependencies') +cdata.set('HAVE_FFTW', fftw_dep.found()) if get_option('readline').disabled() readline_dep = dependency('', required: false) diff --git a/src/modules/meson.build b/src/modules/meson.build index 3f400f087..ee3121b6f 100644 --- a/src/modules/meson.build +++ b/src/modules/meson.build @@ -83,6 +83,7 @@ if have_sse filter_chain_sse = static_library('filter_chain_sse', ['module-filter-chain/pffft.c', 'module-filter-chain/dsp-ops-sse.c' ], + include_directories : [configinc], c_args : [sse_args, '-O3', '-DHAVE_SSE'], dependencies : [ spa_dep ], install : false @@ -93,6 +94,7 @@ endif if have_avx filter_chain_avx = static_library('filter_chain_avx', ['module-filter-chain/dsp-ops-avx.c' ], + include_directories : [configinc], c_args : [avx_args, fma_args,'-O3', '-DHAVE_AVX'], dependencies : [ spa_dep ], install : false @@ -115,8 +117,9 @@ filter_chain_c = static_library('filter_chain_c', ['module-filter-chain/pffft.c', 'module-filter-chain/dsp-ops.c', 'module-filter-chain/dsp-ops-c.c' ], + include_directories : [configinc], c_args : [simd_cargs, '-O3', '-DPFFFT_SIMD_DISABLE'], - dependencies : [ spa_dep ], + dependencies : [ spa_dep, fftw_dep], install : false ) simd_dependencies += filter_chain_c diff --git a/src/modules/module-filter-chain/dsp-ops-avx.c b/src/modules/module-filter-chain/dsp-ops-avx.c index 1609da821..6f189a4d7 100644 --- a/src/modules/module-filter-chain/dsp-ops-avx.c +++ b/src/modules/module-filter-chain/dsp-ops-avx.c @@ -123,3 +123,93 @@ void dsp_sum_avx(struct dsp_ops *ops, float *r, const float *a, const float *b, _mm_store_ss(&r[n], in[0]); } } + +inline static __m256 _mm256_mul_pz(__m256 ab, __m256 cd) +{ + __m256 aa, bb, dc, x0, x1; + aa = _mm256_moveldup_ps(ab); + bb = _mm256_movehdup_ps(ab); + x0 = _mm256_mul_ps(aa, cd); + dc = _mm256_shuffle_ps(cd, cd, _MM_SHUFFLE(2,3,0,1)); + x1 = _mm256_mul_ps(bb, dc); + return _mm256_addsub_ps(x0, x1); +} + +void dsp_fft_cmul_avx(struct dsp_ops *ops, void *fft, + float * SPA_RESTRICT dst, const float * SPA_RESTRICT a, + const float * SPA_RESTRICT b, uint32_t len, const float scale) +{ +#ifdef HAVE_FFTW + __m256 s = _mm256_set1_ps(scale); + __m256 aa[2], bb[2], dd[2]; + uint32_t i, unrolled; + + if (SPA_IS_ALIGNED(a, 32) && + SPA_IS_ALIGNED(b, 32) && + SPA_IS_ALIGNED(dst, 32)) + unrolled = len & ~7; + else + unrolled = 0; + + for (i = 0; i < unrolled; i+=8) { + aa[0] = _mm256_load_ps(&a[2*i]); /* ar0 ai0 ar1 ai1 */ + aa[1] = _mm256_load_ps(&a[2*i+8]); /* ar1 ai1 ar2 ai2 */ + bb[0] = _mm256_load_ps(&b[2*i]); /* br0 bi0 br1 bi1 */ + bb[1] = _mm256_load_ps(&b[2*i+8]); /* br2 bi2 br3 bi3 */ + dd[0] = _mm256_mul_pz(aa[0], bb[0]); + dd[1] = _mm256_mul_pz(aa[1], bb[1]); + dd[0] = _mm256_mul_ps(dd[0], s); + dd[1] = _mm256_mul_ps(dd[1], s); + _mm256_store_ps(&dst[2*i], dd[0]); + _mm256_store_ps(&dst[2*i+8], dd[1]); + } + for (; i < len; i++) { + dst[2*i ] = (a[2*i] * b[2*i ] - a[2*i+1] * b[2*i+1]) * scale; + dst[2*i+1] = (a[2*i] * b[2*i+1] + a[2*i+1] * b[2*i ]) * scale; + } +#else + pffft_zconvolve(fft, a, b, dst, scale); +#endif +} + +void dsp_fft_cmuladd_avx(struct dsp_ops *ops, void *fft, + float * SPA_RESTRICT dst, const float * SPA_RESTRICT src, + const float * SPA_RESTRICT a, const float * SPA_RESTRICT b, + uint32_t len, const float scale) +{ +#ifdef HAVE_FFTW + __m256 s = _mm256_set1_ps(scale); + __m256 aa[2], bb[2], dd[2], t[1]; + uint32_t i, unrolled; + + if (SPA_IS_ALIGNED(a, 32) && + SPA_IS_ALIGNED(b, 32) && + SPA_IS_ALIGNED(dst, 32)) + unrolled = len & ~7; + else + unrolled = 0; + + for (i = 0; i < unrolled; i+=8) { + aa[0] = _mm256_load_ps(&a[2*i]); /* ar0 ai0 ar1 ai1 */ + aa[1] = _mm256_load_ps(&a[2*i+8]); /* ar1 ai1 ar2 ai2 */ + bb[0] = _mm256_load_ps(&b[2*i]); /* br0 bi0 br1 bi1 */ + bb[1] = _mm256_load_ps(&b[2*i+8]); /* br2 bi2 br3 bi3 */ + dd[0] = _mm256_mul_pz(aa[0], bb[0]); + dd[1] = _mm256_mul_pz(aa[1], bb[1]); + dd[0] = _mm256_mul_ps(dd[0], s); + dd[1] = _mm256_mul_ps(dd[1], s); + t[0] = _mm256_load_ps(&dst[2*i]); + t[1] = _mm256_load_ps(&dst[2*i+8]); + t[0] = _mm256_add_ps(t[0], dd[0]); + t[1] = _mm256_add_ps(t[1], dd[1]); + _mm256_store_ps(&dst[2*i], t[0]); + _mm256_store_ps(&dst[2*i+8], t[1]); + } + for (; i < len; i++) { + dst[2*i ] += (a[2*i] * b[2*i ] - a[2*i+1] * b[2*i+1]) * scale; + dst[2*i+1] += (a[2*i] * b[2*i+1] + a[2*i+1] * b[2*i ]) * scale; + } +#else + pffft_zconvolve_accumulate(fft, a, b, src, dst, scale); +#endif +} diff --git a/src/modules/module-filter-chain/dsp-ops-c.c b/src/modules/module-filter-chain/dsp-ops-c.c index 04d356eef..59814d872 100644 --- a/src/modules/module-filter-chain/dsp-ops-c.c +++ b/src/modules/module-filter-chain/dsp-ops-c.c @@ -6,10 +6,16 @@ #include #include #include +#include #include +#include "config.h" +#ifdef HAVE_FFTW +#include +#else #include "pffft.h" +#endif #include "dsp-ops.h" void dsp_clear_c(struct dsp_ops *ops, void * SPA_RESTRICT dst, uint32_t n_samples) @@ -209,26 +215,75 @@ void dsp_delay_c(struct dsp_ops *ops, float *buffer, uint32_t *pos, uint32_t n_b } } +#ifdef HAVE_FFTW +struct fft_info { + fftwf_plan plan_r2c; + fftwf_plan plan_c2r; +}; +#endif + void *dsp_fft_new_c(struct dsp_ops *ops, int32_t size, bool real) { +#ifdef HAVE_FFTW + struct fft_info *info = calloc(1, sizeof(struct fft_info)); + float *rdata; + fftwf_complex *cdata; + + if (info == NULL) + return NULL; + + rdata = fftwf_alloc_real (size * 2); + cdata = fftwf_alloc_complex (size + 1); + + info->plan_r2c = fftwf_plan_dft_r2c_1d(size, rdata, cdata, FFTW_ESTIMATE); + info->plan_c2r = fftwf_plan_dft_c2r_1d(size, cdata, rdata, FFTW_ESTIMATE); + + fftwf_free(rdata); + fftwf_free(cdata); + + return info; +#else return pffft_new_setup(size, real ? PFFFT_REAL : PFFFT_COMPLEX); +#endif } void dsp_fft_free_c(struct dsp_ops *ops, void *fft) { +#ifdef HAVE_FFTW + struct fft_info *info = fft; + fftwf_destroy_plan(info->plan_r2c); + fftwf_destroy_plan(info->plan_c2r); + free(info); +#else pffft_destroy_setup(fft); +#endif } void dsp_fft_run_c(struct dsp_ops *ops, void *fft, int direction, const float * SPA_RESTRICT src, float * SPA_RESTRICT dst) { +#ifdef HAVE_FFTW + struct fft_info *info = fft; + if (direction > 0) + fftwf_execute_dft_r2c (info->plan_r2c, (float*)src, (fftwf_complex*)dst); + else + fftwf_execute_dft_c2r (info->plan_c2r, (fftwf_complex*)src, dst); +#else pffft_transform(fft, src, dst, NULL, direction < 0 ? PFFFT_BACKWARD : PFFFT_FORWARD); +#endif } void dsp_fft_cmul_c(struct dsp_ops *ops, void *fft, float * SPA_RESTRICT dst, const float * SPA_RESTRICT a, const float * SPA_RESTRICT b, uint32_t len, const float scale) { +#ifdef HAVE_FFTW + for (uint32_t i = 0; i <= len; i++) { + dst[2*i ] = (a[2*i] * b[2*i ] - a[2*i+1] * b[2*i+1]) * scale; + dst[2*i+1] = (a[2*i] * b[2*i+1] + a[2*i+1] * b[2*i ]) * scale; + } +#else pffft_zconvolve(fft, a, b, dst, scale); +#endif } void dsp_fft_cmuladd_c(struct dsp_ops *ops, void *fft, @@ -236,6 +291,13 @@ void dsp_fft_cmuladd_c(struct dsp_ops *ops, void *fft, const float * SPA_RESTRICT a, const float * SPA_RESTRICT b, uint32_t len, const float scale) { +#ifdef HAVE_FFTW + for (uint32_t i = 0; i <= len; i++) { + dst[2*i ] += (a[2*i] * b[2*i ] - a[2*i+1] * b[2*i+1]) * scale; + dst[2*i+1] += (a[2*i] * b[2*i+1] + a[2*i+1] * b[2*i ]) * scale; + } +#else pffft_zconvolve_accumulate(fft, a, b, src, dst, scale); +#endif } diff --git a/src/modules/module-filter-chain/dsp-ops-sse.c b/src/modules/module-filter-chain/dsp-ops-sse.c index a5c346ae0..61d76cb32 100644 --- a/src/modules/module-filter-chain/dsp-ops-sse.c +++ b/src/modules/module-filter-chain/dsp-ops-sse.c @@ -6,9 +6,12 @@ #include #include #include +#include #include +#include "config.h" + #include "dsp-ops.h" #include @@ -533,3 +536,100 @@ void dsp_delay_sse(struct dsp_ops *ops, float *buffer, uint32_t *pos, uint32_t n } *pos = w; } + +inline static void _mm_mul_pz(__m128 *a, __m128 *b, __m128 *d) +{ + __m128 ar, ai, br, bi, arbr, arbi, aibi, aibr, dr, di; + ar = _mm_shuffle_ps(a[0], a[1], _MM_SHUFFLE(2,0,2,0)); /* ar0 ar1 ar2 ar3 */ + ai = _mm_shuffle_ps(a[0], a[1], _MM_SHUFFLE(3,1,3,1)); /* ai0 ai1 ai2 ai3 */ + br = _mm_shuffle_ps(b[0], b[1], _MM_SHUFFLE(2,0,2,0)); /* br0 br1 br2 br3 */ + bi = _mm_shuffle_ps(b[0], b[1], _MM_SHUFFLE(3,1,3,1)) /* bi0 bi1 bi2 bi3 */; + + arbr = _mm_mul_ps(ar, br); /* ar * br */ + arbi = _mm_mul_ps(ar, bi); /* ar * bi */ + + aibi = _mm_mul_ps(ai, bi); /* ai * bi */ + aibr = _mm_mul_ps(ai, br); /* ai * br */ + + dr = _mm_sub_ps(arbr, aibi); /* ar * br - ai * bi */ + di = _mm_add_ps(arbi, aibr); /* ar * bi + ai * br */ + d[0] = _mm_unpacklo_ps(dr, di); + d[1] = _mm_unpackhi_ps(dr, di); +} + +void dsp_fft_cmul_sse(struct dsp_ops *ops, void *fft, + float * SPA_RESTRICT dst, const float * SPA_RESTRICT a, + const float * SPA_RESTRICT b, uint32_t len, const float scale) +{ +#ifdef HAVE_FFTW + __m128 s = _mm_set1_ps(scale); + __m128 aa[2], bb[2], dd[2]; + uint32_t i, unrolled; + + if (SPA_IS_ALIGNED(a, 16) && + SPA_IS_ALIGNED(b, 16) && + SPA_IS_ALIGNED(dst, 16)) + unrolled = len & ~3; + else + unrolled = 0; + + for (i = 0; i < unrolled; i+=4) { + aa[0] = _mm_load_ps(&a[2*i]); /* ar0 ai0 ar1 ai1 */ + aa[1] = _mm_load_ps(&a[2*i+4]); /* ar1 ai1 ar2 ai2 */ + bb[0] = _mm_load_ps(&b[2*i]); /* br0 bi0 br1 bi1 */ + bb[1] = _mm_load_ps(&b[2*i+4]); /* br2 bi2 br3 bi3 */ + _mm_mul_pz(aa, bb, dd); + dd[0] = _mm_mul_ps(dd[0], s); + dd[1] = _mm_mul_ps(dd[1], s); + _mm_store_ps(&dst[2*i], dd[0]); + _mm_store_ps(&dst[2*i+4], dd[1]); + } + for (; i < len; i++) { + dst[2*i ] = (a[2*i] * b[2*i ] - a[2*i+1] * b[2*i+1]) * scale; + dst[2*i+1] = (a[2*i] * b[2*i+1] + a[2*i+1] * b[2*i ]) * scale; + } +#else + pffft_zconvolve(fft, a, b, dst, scale); +#endif +} + +void dsp_fft_cmuladd_sse(struct dsp_ops *ops, void *fft, + float * SPA_RESTRICT dst, const float * SPA_RESTRICT src, + const float * SPA_RESTRICT a, const float * SPA_RESTRICT b, + uint32_t len, const float scale) +{ +#ifdef HAVE_FFTW + __m128 s = _mm_set1_ps(scale); + __m128 aa[2], bb[2], dd[2], t[1]; + uint32_t i, unrolled; + + if (SPA_IS_ALIGNED(a, 16) && + SPA_IS_ALIGNED(b, 16) && + SPA_IS_ALIGNED(dst, 16)) + unrolled = len & ~3; + else + unrolled = 0; + + for (i = 0; i < unrolled; i+=4) { + aa[0] = _mm_load_ps(&a[2*i]); /* ar0 ai0 ar1 ai1 */ + aa[1] = _mm_load_ps(&a[2*i+4]); /* ar1 ai1 ar2 ai2 */ + bb[0] = _mm_load_ps(&b[2*i]); /* br0 bi0 br1 bi1 */ + bb[1] = _mm_load_ps(&b[2*i+4]); /* br2 bi2 br3 bi3 */ + _mm_mul_pz(aa, bb, dd); + dd[0] = _mm_mul_ps(dd[0], s); + dd[1] = _mm_mul_ps(dd[1], s); + t[0] = _mm_load_ps(&dst[2*i]); + t[1] = _mm_load_ps(&dst[2*i+4]); + t[0] = _mm_add_ps(t[0], dd[0]); + t[1] = _mm_add_ps(t[1], dd[1]); + _mm_store_ps(&dst[2*i], t[0]); + _mm_store_ps(&dst[2*i+4], t[1]); + } + for (; i < len; i++) { + dst[2*i ] += (a[2*i] * b[2*i ] - a[2*i+1] * b[2*i+1]) * scale; + dst[2*i+1] += (a[2*i] * b[2*i+1] + a[2*i+1] * b[2*i ]) * scale; + } +#else + pffft_zconvolve_accumulate(fft, a, b, src, dst, scale); +#endif +} diff --git a/src/modules/module-filter-chain/dsp-ops.c b/src/modules/module-filter-chain/dsp-ops.c index 1eafd273b..599642263 100644 --- a/src/modules/module-filter-chain/dsp-ops.c +++ b/src/modules/module-filter-chain/dsp-ops.c @@ -33,8 +33,8 @@ static struct dsp_info dsp_table[] = .funcs.fft_new = dsp_fft_new_c, .funcs.fft_free = dsp_fft_free_c, .funcs.fft_run = dsp_fft_run_c, - .funcs.fft_cmul = dsp_fft_cmul_c, - .funcs.fft_cmuladd = dsp_fft_cmuladd_c, + .funcs.fft_cmul = dsp_fft_cmul_avx, + .funcs.fft_cmuladd = dsp_fft_cmuladd_avx, .funcs.biquadn_run = dsp_biquadn_run_sse, .funcs.delay = dsp_delay_sse, }, @@ -51,8 +51,8 @@ static struct dsp_info dsp_table[] = .funcs.fft_new = dsp_fft_new_c, .funcs.fft_free = dsp_fft_free_c, .funcs.fft_run = dsp_fft_run_c, - .funcs.fft_cmul = dsp_fft_cmul_c, - .funcs.fft_cmuladd = dsp_fft_cmuladd_c, + .funcs.fft_cmul = dsp_fft_cmul_sse, + .funcs.fft_cmuladd = dsp_fft_cmuladd_sse, .funcs.biquadn_run = dsp_biquadn_run_sse, .funcs.delay = dsp_delay_sse, }, diff --git a/src/modules/module-filter-chain/dsp-ops.h b/src/modules/module-filter-chain/dsp-ops.h index 79a3e391a..433050333 100644 --- a/src/modules/module-filter-chain/dsp-ops.h +++ b/src/modules/module-filter-chain/dsp-ops.h @@ -148,10 +148,14 @@ MAKE_SUM_FUNC(sse); MAKE_BIQUAD_RUN_FUNC(sse); MAKE_BIQUADN_RUN_FUNC(sse); MAKE_DELAY_FUNC(sse); +MAKE_FFT_CMUL_FUNC(sse); +MAKE_FFT_CMULADD_FUNC(sse); #endif #if defined (HAVE_AVX) MAKE_MIX_GAIN_FUNC(avx); MAKE_SUM_FUNC(avx); +MAKE_FFT_CMUL_FUNC(avx); +MAKE_FFT_CMULADD_FUNC(avx); #endif #endif /* DSP_OPS_H */