filter-chain: add support for fftw in the convolver

It's faster than pffft.
This commit is contained in:
Wim Taymans 2024-10-18 16:26:39 +02:00
parent f810c7c15f
commit 7f8ce35709
7 changed files with 267 additions and 5 deletions

View file

@ -311,6 +311,9 @@ cdata.set('HAVE_DBUS', dbus_dep.found())
sdl_dep = dependency('sdl2', required : get_option('sdl2'))
summary({'SDL2 (video examples)': sdl_dep.found()}, bool_yn: true, section: 'Misc dependencies')
drm_dep = dependency('libdrm', required : false)
fftw_dep = dependency('fftw3f', required : false)
summary({'fftw3f (filter-chain convolver)': fftw_dep.found()}, bool_yn: true, section: 'Misc dependencies')
cdata.set('HAVE_FFTW', fftw_dep.found())
if get_option('readline').disabled()
readline_dep = dependency('', required: false)

View file

@ -83,6 +83,7 @@ if have_sse
filter_chain_sse = static_library('filter_chain_sse',
['module-filter-chain/pffft.c',
'module-filter-chain/dsp-ops-sse.c' ],
include_directories : [configinc],
c_args : [sse_args, '-O3', '-DHAVE_SSE'],
dependencies : [ spa_dep ],
install : false
@ -93,6 +94,7 @@ endif
if have_avx
filter_chain_avx = static_library('filter_chain_avx',
['module-filter-chain/dsp-ops-avx.c' ],
include_directories : [configinc],
c_args : [avx_args, fma_args,'-O3', '-DHAVE_AVX'],
dependencies : [ spa_dep ],
install : false
@ -115,8 +117,9 @@ filter_chain_c = static_library('filter_chain_c',
['module-filter-chain/pffft.c',
'module-filter-chain/dsp-ops.c',
'module-filter-chain/dsp-ops-c.c' ],
include_directories : [configinc],
c_args : [simd_cargs, '-O3', '-DPFFFT_SIMD_DISABLE'],
dependencies : [ spa_dep ],
dependencies : [ spa_dep, fftw_dep],
install : false
)
simd_dependencies += filter_chain_c

View file

@ -123,3 +123,93 @@ void dsp_sum_avx(struct dsp_ops *ops, float *r, const float *a, const float *b,
_mm_store_ss(&r[n], in[0]);
}
}
inline static __m256 _mm256_mul_pz(__m256 ab, __m256 cd)
{
__m256 aa, bb, dc, x0, x1;
aa = _mm256_moveldup_ps(ab);
bb = _mm256_movehdup_ps(ab);
x0 = _mm256_mul_ps(aa, cd);
dc = _mm256_shuffle_ps(cd, cd, _MM_SHUFFLE(2,3,0,1));
x1 = _mm256_mul_ps(bb, dc);
return _mm256_addsub_ps(x0, x1);
}
void dsp_fft_cmul_avx(struct dsp_ops *ops, void *fft,
float * SPA_RESTRICT dst, const float * SPA_RESTRICT a,
const float * SPA_RESTRICT b, uint32_t len, const float scale)
{
#ifdef HAVE_FFTW
__m256 s = _mm256_set1_ps(scale);
__m256 aa[2], bb[2], dd[2];
uint32_t i, unrolled;
if (SPA_IS_ALIGNED(a, 32) &&
SPA_IS_ALIGNED(b, 32) &&
SPA_IS_ALIGNED(dst, 32))
unrolled = len & ~7;
else
unrolled = 0;
for (i = 0; i < unrolled; i+=8) {
aa[0] = _mm256_load_ps(&a[2*i]); /* ar0 ai0 ar1 ai1 */
aa[1] = _mm256_load_ps(&a[2*i+8]); /* ar1 ai1 ar2 ai2 */
bb[0] = _mm256_load_ps(&b[2*i]); /* br0 bi0 br1 bi1 */
bb[1] = _mm256_load_ps(&b[2*i+8]); /* br2 bi2 br3 bi3 */
dd[0] = _mm256_mul_pz(aa[0], bb[0]);
dd[1] = _mm256_mul_pz(aa[1], bb[1]);
dd[0] = _mm256_mul_ps(dd[0], s);
dd[1] = _mm256_mul_ps(dd[1], s);
_mm256_store_ps(&dst[2*i], dd[0]);
_mm256_store_ps(&dst[2*i+8], dd[1]);
}
for (; i < len; i++) {
dst[2*i ] = (a[2*i] * b[2*i ] - a[2*i+1] * b[2*i+1]) * scale;
dst[2*i+1] = (a[2*i] * b[2*i+1] + a[2*i+1] * b[2*i ]) * scale;
}
#else
pffft_zconvolve(fft, a, b, dst, scale);
#endif
}
void dsp_fft_cmuladd_avx(struct dsp_ops *ops, void *fft,
float * SPA_RESTRICT dst, const float * SPA_RESTRICT src,
const float * SPA_RESTRICT a, const float * SPA_RESTRICT b,
uint32_t len, const float scale)
{
#ifdef HAVE_FFTW
__m256 s = _mm256_set1_ps(scale);
__m256 aa[2], bb[2], dd[2], t[1];
uint32_t i, unrolled;
if (SPA_IS_ALIGNED(a, 32) &&
SPA_IS_ALIGNED(b, 32) &&
SPA_IS_ALIGNED(dst, 32))
unrolled = len & ~7;
else
unrolled = 0;
for (i = 0; i < unrolled; i+=8) {
aa[0] = _mm256_load_ps(&a[2*i]); /* ar0 ai0 ar1 ai1 */
aa[1] = _mm256_load_ps(&a[2*i+8]); /* ar1 ai1 ar2 ai2 */
bb[0] = _mm256_load_ps(&b[2*i]); /* br0 bi0 br1 bi1 */
bb[1] = _mm256_load_ps(&b[2*i+8]); /* br2 bi2 br3 bi3 */
dd[0] = _mm256_mul_pz(aa[0], bb[0]);
dd[1] = _mm256_mul_pz(aa[1], bb[1]);
dd[0] = _mm256_mul_ps(dd[0], s);
dd[1] = _mm256_mul_ps(dd[1], s);
t[0] = _mm256_load_ps(&dst[2*i]);
t[1] = _mm256_load_ps(&dst[2*i+8]);
t[0] = _mm256_add_ps(t[0], dd[0]);
t[1] = _mm256_add_ps(t[1], dd[1]);
_mm256_store_ps(&dst[2*i], t[0]);
_mm256_store_ps(&dst[2*i+8], t[1]);
}
for (; i < len; i++) {
dst[2*i ] += (a[2*i] * b[2*i ] - a[2*i+1] * b[2*i+1]) * scale;
dst[2*i+1] += (a[2*i] * b[2*i+1] + a[2*i+1] * b[2*i ]) * scale;
}
#else
pffft_zconvolve_accumulate(fft, a, b, src, dst, scale);
#endif
}

View file

@ -6,10 +6,16 @@
#include <stdio.h>
#include <math.h>
#include <float.h>
#include <errno.h>
#include <spa/utils/defs.h>
#include "config.h"
#ifdef HAVE_FFTW
#include <fftw3.h>
#else
#include "pffft.h"
#endif
#include "dsp-ops.h"
void dsp_clear_c(struct dsp_ops *ops, void * SPA_RESTRICT dst, uint32_t n_samples)
@ -209,26 +215,75 @@ void dsp_delay_c(struct dsp_ops *ops, float *buffer, uint32_t *pos, uint32_t n_b
}
}
#ifdef HAVE_FFTW
struct fft_info {
fftwf_plan plan_r2c;
fftwf_plan plan_c2r;
};
#endif
void *dsp_fft_new_c(struct dsp_ops *ops, int32_t size, bool real)
{
#ifdef HAVE_FFTW
struct fft_info *info = calloc(1, sizeof(struct fft_info));
float *rdata;
fftwf_complex *cdata;
if (info == NULL)
return NULL;
rdata = fftwf_alloc_real (size * 2);
cdata = fftwf_alloc_complex (size + 1);
info->plan_r2c = fftwf_plan_dft_r2c_1d(size, rdata, cdata, FFTW_ESTIMATE);
info->plan_c2r = fftwf_plan_dft_c2r_1d(size, cdata, rdata, FFTW_ESTIMATE);
fftwf_free(rdata);
fftwf_free(cdata);
return info;
#else
return pffft_new_setup(size, real ? PFFFT_REAL : PFFFT_COMPLEX);
#endif
}
void dsp_fft_free_c(struct dsp_ops *ops, void *fft)
{
#ifdef HAVE_FFTW
struct fft_info *info = fft;
fftwf_destroy_plan(info->plan_r2c);
fftwf_destroy_plan(info->plan_c2r);
free(info);
#else
pffft_destroy_setup(fft);
#endif
}
void dsp_fft_run_c(struct dsp_ops *ops, void *fft, int direction,
const float * SPA_RESTRICT src, float * SPA_RESTRICT dst)
{
#ifdef HAVE_FFTW
struct fft_info *info = fft;
if (direction > 0)
fftwf_execute_dft_r2c (info->plan_r2c, (float*)src, (fftwf_complex*)dst);
else
fftwf_execute_dft_c2r (info->plan_c2r, (fftwf_complex*)src, dst);
#else
pffft_transform(fft, src, dst, NULL, direction < 0 ? PFFFT_BACKWARD : PFFFT_FORWARD);
#endif
}
void dsp_fft_cmul_c(struct dsp_ops *ops, void *fft,
float * SPA_RESTRICT dst, const float * SPA_RESTRICT a,
const float * SPA_RESTRICT b, uint32_t len, const float scale)
{
#ifdef HAVE_FFTW
for (uint32_t i = 0; i <= len; i++) {
dst[2*i ] = (a[2*i] * b[2*i ] - a[2*i+1] * b[2*i+1]) * scale;
dst[2*i+1] = (a[2*i] * b[2*i+1] + a[2*i+1] * b[2*i ]) * scale;
}
#else
pffft_zconvolve(fft, a, b, dst, scale);
#endif
}
void dsp_fft_cmuladd_c(struct dsp_ops *ops, void *fft,
@ -236,6 +291,13 @@ void dsp_fft_cmuladd_c(struct dsp_ops *ops, void *fft,
const float * SPA_RESTRICT a, const float * SPA_RESTRICT b,
uint32_t len, const float scale)
{
#ifdef HAVE_FFTW
for (uint32_t i = 0; i <= len; i++) {
dst[2*i ] += (a[2*i] * b[2*i ] - a[2*i+1] * b[2*i+1]) * scale;
dst[2*i+1] += (a[2*i] * b[2*i+1] + a[2*i+1] * b[2*i ]) * scale;
}
#else
pffft_zconvolve_accumulate(fft, a, b, src, dst, scale);
#endif
}

View file

@ -6,9 +6,12 @@
#include <stdio.h>
#include <math.h>
#include <float.h>
#include <complex.h>
#include <spa/utils/defs.h>
#include "config.h"
#include "dsp-ops.h"
#include <xmmintrin.h>
@ -533,3 +536,100 @@ void dsp_delay_sse(struct dsp_ops *ops, float *buffer, uint32_t *pos, uint32_t n
}
*pos = w;
}
inline static void _mm_mul_pz(__m128 *a, __m128 *b, __m128 *d)
{
__m128 ar, ai, br, bi, arbr, arbi, aibi, aibr, dr, di;
ar = _mm_shuffle_ps(a[0], a[1], _MM_SHUFFLE(2,0,2,0)); /* ar0 ar1 ar2 ar3 */
ai = _mm_shuffle_ps(a[0], a[1], _MM_SHUFFLE(3,1,3,1)); /* ai0 ai1 ai2 ai3 */
br = _mm_shuffle_ps(b[0], b[1], _MM_SHUFFLE(2,0,2,0)); /* br0 br1 br2 br3 */
bi = _mm_shuffle_ps(b[0], b[1], _MM_SHUFFLE(3,1,3,1)) /* bi0 bi1 bi2 bi3 */;
arbr = _mm_mul_ps(ar, br); /* ar * br */
arbi = _mm_mul_ps(ar, bi); /* ar * bi */
aibi = _mm_mul_ps(ai, bi); /* ai * bi */
aibr = _mm_mul_ps(ai, br); /* ai * br */
dr = _mm_sub_ps(arbr, aibi); /* ar * br - ai * bi */
di = _mm_add_ps(arbi, aibr); /* ar * bi + ai * br */
d[0] = _mm_unpacklo_ps(dr, di);
d[1] = _mm_unpackhi_ps(dr, di);
}
void dsp_fft_cmul_sse(struct dsp_ops *ops, void *fft,
float * SPA_RESTRICT dst, const float * SPA_RESTRICT a,
const float * SPA_RESTRICT b, uint32_t len, const float scale)
{
#ifdef HAVE_FFTW
__m128 s = _mm_set1_ps(scale);
__m128 aa[2], bb[2], dd[2];
uint32_t i, unrolled;
if (SPA_IS_ALIGNED(a, 16) &&
SPA_IS_ALIGNED(b, 16) &&
SPA_IS_ALIGNED(dst, 16))
unrolled = len & ~3;
else
unrolled = 0;
for (i = 0; i < unrolled; i+=4) {
aa[0] = _mm_load_ps(&a[2*i]); /* ar0 ai0 ar1 ai1 */
aa[1] = _mm_load_ps(&a[2*i+4]); /* ar1 ai1 ar2 ai2 */
bb[0] = _mm_load_ps(&b[2*i]); /* br0 bi0 br1 bi1 */
bb[1] = _mm_load_ps(&b[2*i+4]); /* br2 bi2 br3 bi3 */
_mm_mul_pz(aa, bb, dd);
dd[0] = _mm_mul_ps(dd[0], s);
dd[1] = _mm_mul_ps(dd[1], s);
_mm_store_ps(&dst[2*i], dd[0]);
_mm_store_ps(&dst[2*i+4], dd[1]);
}
for (; i < len; i++) {
dst[2*i ] = (a[2*i] * b[2*i ] - a[2*i+1] * b[2*i+1]) * scale;
dst[2*i+1] = (a[2*i] * b[2*i+1] + a[2*i+1] * b[2*i ]) * scale;
}
#else
pffft_zconvolve(fft, a, b, dst, scale);
#endif
}
void dsp_fft_cmuladd_sse(struct dsp_ops *ops, void *fft,
float * SPA_RESTRICT dst, const float * SPA_RESTRICT src,
const float * SPA_RESTRICT a, const float * SPA_RESTRICT b,
uint32_t len, const float scale)
{
#ifdef HAVE_FFTW
__m128 s = _mm_set1_ps(scale);
__m128 aa[2], bb[2], dd[2], t[1];
uint32_t i, unrolled;
if (SPA_IS_ALIGNED(a, 16) &&
SPA_IS_ALIGNED(b, 16) &&
SPA_IS_ALIGNED(dst, 16))
unrolled = len & ~3;
else
unrolled = 0;
for (i = 0; i < unrolled; i+=4) {
aa[0] = _mm_load_ps(&a[2*i]); /* ar0 ai0 ar1 ai1 */
aa[1] = _mm_load_ps(&a[2*i+4]); /* ar1 ai1 ar2 ai2 */
bb[0] = _mm_load_ps(&b[2*i]); /* br0 bi0 br1 bi1 */
bb[1] = _mm_load_ps(&b[2*i+4]); /* br2 bi2 br3 bi3 */
_mm_mul_pz(aa, bb, dd);
dd[0] = _mm_mul_ps(dd[0], s);
dd[1] = _mm_mul_ps(dd[1], s);
t[0] = _mm_load_ps(&dst[2*i]);
t[1] = _mm_load_ps(&dst[2*i+4]);
t[0] = _mm_add_ps(t[0], dd[0]);
t[1] = _mm_add_ps(t[1], dd[1]);
_mm_store_ps(&dst[2*i], t[0]);
_mm_store_ps(&dst[2*i+4], t[1]);
}
for (; i < len; i++) {
dst[2*i ] += (a[2*i] * b[2*i ] - a[2*i+1] * b[2*i+1]) * scale;
dst[2*i+1] += (a[2*i] * b[2*i+1] + a[2*i+1] * b[2*i ]) * scale;
}
#else
pffft_zconvolve_accumulate(fft, a, b, src, dst, scale);
#endif
}

View file

@ -33,8 +33,8 @@ static struct dsp_info dsp_table[] =
.funcs.fft_new = dsp_fft_new_c,
.funcs.fft_free = dsp_fft_free_c,
.funcs.fft_run = dsp_fft_run_c,
.funcs.fft_cmul = dsp_fft_cmul_c,
.funcs.fft_cmuladd = dsp_fft_cmuladd_c,
.funcs.fft_cmul = dsp_fft_cmul_avx,
.funcs.fft_cmuladd = dsp_fft_cmuladd_avx,
.funcs.biquadn_run = dsp_biquadn_run_sse,
.funcs.delay = dsp_delay_sse,
},
@ -51,8 +51,8 @@ static struct dsp_info dsp_table[] =
.funcs.fft_new = dsp_fft_new_c,
.funcs.fft_free = dsp_fft_free_c,
.funcs.fft_run = dsp_fft_run_c,
.funcs.fft_cmul = dsp_fft_cmul_c,
.funcs.fft_cmuladd = dsp_fft_cmuladd_c,
.funcs.fft_cmul = dsp_fft_cmul_sse,
.funcs.fft_cmuladd = dsp_fft_cmuladd_sse,
.funcs.biquadn_run = dsp_biquadn_run_sse,
.funcs.delay = dsp_delay_sse,
},

View file

@ -148,10 +148,14 @@ MAKE_SUM_FUNC(sse);
MAKE_BIQUAD_RUN_FUNC(sse);
MAKE_BIQUADN_RUN_FUNC(sse);
MAKE_DELAY_FUNC(sse);
MAKE_FFT_CMUL_FUNC(sse);
MAKE_FFT_CMULADD_FUNC(sse);
#endif
#if defined (HAVE_AVX)
MAKE_MIX_GAIN_FUNC(avx);
MAKE_SUM_FUNC(avx);
MAKE_FFT_CMUL_FUNC(avx);
MAKE_FFT_CMULADD_FUNC(avx);
#endif
#endif /* DSP_OPS_H */