mirror of
https://gitlab.freedesktop.org/pipewire/pipewire.git
synced 2025-10-29 05:40:27 -04:00
filter-chain: add support for fftw in the convolver
It's faster than pffft.
This commit is contained in:
parent
f810c7c15f
commit
7f8ce35709
7 changed files with 267 additions and 5 deletions
|
|
@ -311,6 +311,9 @@ cdata.set('HAVE_DBUS', dbus_dep.found())
|
|||
sdl_dep = dependency('sdl2', required : get_option('sdl2'))
|
||||
summary({'SDL2 (video examples)': sdl_dep.found()}, bool_yn: true, section: 'Misc dependencies')
|
||||
drm_dep = dependency('libdrm', required : false)
|
||||
fftw_dep = dependency('fftw3f', required : false)
|
||||
summary({'fftw3f (filter-chain convolver)': fftw_dep.found()}, bool_yn: true, section: 'Misc dependencies')
|
||||
cdata.set('HAVE_FFTW', fftw_dep.found())
|
||||
|
||||
if get_option('readline').disabled()
|
||||
readline_dep = dependency('', required: false)
|
||||
|
|
|
|||
|
|
@ -83,6 +83,7 @@ if have_sse
|
|||
filter_chain_sse = static_library('filter_chain_sse',
|
||||
['module-filter-chain/pffft.c',
|
||||
'module-filter-chain/dsp-ops-sse.c' ],
|
||||
include_directories : [configinc],
|
||||
c_args : [sse_args, '-O3', '-DHAVE_SSE'],
|
||||
dependencies : [ spa_dep ],
|
||||
install : false
|
||||
|
|
@ -93,6 +94,7 @@ endif
|
|||
if have_avx
|
||||
filter_chain_avx = static_library('filter_chain_avx',
|
||||
['module-filter-chain/dsp-ops-avx.c' ],
|
||||
include_directories : [configinc],
|
||||
c_args : [avx_args, fma_args,'-O3', '-DHAVE_AVX'],
|
||||
dependencies : [ spa_dep ],
|
||||
install : false
|
||||
|
|
@ -115,8 +117,9 @@ filter_chain_c = static_library('filter_chain_c',
|
|||
['module-filter-chain/pffft.c',
|
||||
'module-filter-chain/dsp-ops.c',
|
||||
'module-filter-chain/dsp-ops-c.c' ],
|
||||
include_directories : [configinc],
|
||||
c_args : [simd_cargs, '-O3', '-DPFFFT_SIMD_DISABLE'],
|
||||
dependencies : [ spa_dep ],
|
||||
dependencies : [ spa_dep, fftw_dep],
|
||||
install : false
|
||||
)
|
||||
simd_dependencies += filter_chain_c
|
||||
|
|
|
|||
|
|
@ -123,3 +123,93 @@ void dsp_sum_avx(struct dsp_ops *ops, float *r, const float *a, const float *b,
|
|||
_mm_store_ss(&r[n], in[0]);
|
||||
}
|
||||
}
|
||||
|
||||
inline static __m256 _mm256_mul_pz(__m256 ab, __m256 cd)
|
||||
{
|
||||
__m256 aa, bb, dc, x0, x1;
|
||||
aa = _mm256_moveldup_ps(ab);
|
||||
bb = _mm256_movehdup_ps(ab);
|
||||
x0 = _mm256_mul_ps(aa, cd);
|
||||
dc = _mm256_shuffle_ps(cd, cd, _MM_SHUFFLE(2,3,0,1));
|
||||
x1 = _mm256_mul_ps(bb, dc);
|
||||
return _mm256_addsub_ps(x0, x1);
|
||||
}
|
||||
|
||||
void dsp_fft_cmul_avx(struct dsp_ops *ops, void *fft,
|
||||
float * SPA_RESTRICT dst, const float * SPA_RESTRICT a,
|
||||
const float * SPA_RESTRICT b, uint32_t len, const float scale)
|
||||
{
|
||||
#ifdef HAVE_FFTW
|
||||
__m256 s = _mm256_set1_ps(scale);
|
||||
__m256 aa[2], bb[2], dd[2];
|
||||
uint32_t i, unrolled;
|
||||
|
||||
if (SPA_IS_ALIGNED(a, 32) &&
|
||||
SPA_IS_ALIGNED(b, 32) &&
|
||||
SPA_IS_ALIGNED(dst, 32))
|
||||
unrolled = len & ~7;
|
||||
else
|
||||
unrolled = 0;
|
||||
|
||||
for (i = 0; i < unrolled; i+=8) {
|
||||
aa[0] = _mm256_load_ps(&a[2*i]); /* ar0 ai0 ar1 ai1 */
|
||||
aa[1] = _mm256_load_ps(&a[2*i+8]); /* ar1 ai1 ar2 ai2 */
|
||||
bb[0] = _mm256_load_ps(&b[2*i]); /* br0 bi0 br1 bi1 */
|
||||
bb[1] = _mm256_load_ps(&b[2*i+8]); /* br2 bi2 br3 bi3 */
|
||||
dd[0] = _mm256_mul_pz(aa[0], bb[0]);
|
||||
dd[1] = _mm256_mul_pz(aa[1], bb[1]);
|
||||
dd[0] = _mm256_mul_ps(dd[0], s);
|
||||
dd[1] = _mm256_mul_ps(dd[1], s);
|
||||
_mm256_store_ps(&dst[2*i], dd[0]);
|
||||
_mm256_store_ps(&dst[2*i+8], dd[1]);
|
||||
}
|
||||
for (; i < len; i++) {
|
||||
dst[2*i ] = (a[2*i] * b[2*i ] - a[2*i+1] * b[2*i+1]) * scale;
|
||||
dst[2*i+1] = (a[2*i] * b[2*i+1] + a[2*i+1] * b[2*i ]) * scale;
|
||||
}
|
||||
#else
|
||||
pffft_zconvolve(fft, a, b, dst, scale);
|
||||
#endif
|
||||
}
|
||||
|
||||
void dsp_fft_cmuladd_avx(struct dsp_ops *ops, void *fft,
|
||||
float * SPA_RESTRICT dst, const float * SPA_RESTRICT src,
|
||||
const float * SPA_RESTRICT a, const float * SPA_RESTRICT b,
|
||||
uint32_t len, const float scale)
|
||||
{
|
||||
#ifdef HAVE_FFTW
|
||||
__m256 s = _mm256_set1_ps(scale);
|
||||
__m256 aa[2], bb[2], dd[2], t[1];
|
||||
uint32_t i, unrolled;
|
||||
|
||||
if (SPA_IS_ALIGNED(a, 32) &&
|
||||
SPA_IS_ALIGNED(b, 32) &&
|
||||
SPA_IS_ALIGNED(dst, 32))
|
||||
unrolled = len & ~7;
|
||||
else
|
||||
unrolled = 0;
|
||||
|
||||
for (i = 0; i < unrolled; i+=8) {
|
||||
aa[0] = _mm256_load_ps(&a[2*i]); /* ar0 ai0 ar1 ai1 */
|
||||
aa[1] = _mm256_load_ps(&a[2*i+8]); /* ar1 ai1 ar2 ai2 */
|
||||
bb[0] = _mm256_load_ps(&b[2*i]); /* br0 bi0 br1 bi1 */
|
||||
bb[1] = _mm256_load_ps(&b[2*i+8]); /* br2 bi2 br3 bi3 */
|
||||
dd[0] = _mm256_mul_pz(aa[0], bb[0]);
|
||||
dd[1] = _mm256_mul_pz(aa[1], bb[1]);
|
||||
dd[0] = _mm256_mul_ps(dd[0], s);
|
||||
dd[1] = _mm256_mul_ps(dd[1], s);
|
||||
t[0] = _mm256_load_ps(&dst[2*i]);
|
||||
t[1] = _mm256_load_ps(&dst[2*i+8]);
|
||||
t[0] = _mm256_add_ps(t[0], dd[0]);
|
||||
t[1] = _mm256_add_ps(t[1], dd[1]);
|
||||
_mm256_store_ps(&dst[2*i], t[0]);
|
||||
_mm256_store_ps(&dst[2*i+8], t[1]);
|
||||
}
|
||||
for (; i < len; i++) {
|
||||
dst[2*i ] += (a[2*i] * b[2*i ] - a[2*i+1] * b[2*i+1]) * scale;
|
||||
dst[2*i+1] += (a[2*i] * b[2*i+1] + a[2*i+1] * b[2*i ]) * scale;
|
||||
}
|
||||
#else
|
||||
pffft_zconvolve_accumulate(fft, a, b, src, dst, scale);
|
||||
#endif
|
||||
}
|
||||
|
|
|
|||
|
|
@ -6,10 +6,16 @@
|
|||
#include <stdio.h>
|
||||
#include <math.h>
|
||||
#include <float.h>
|
||||
#include <errno.h>
|
||||
|
||||
#include <spa/utils/defs.h>
|
||||
|
||||
#include "config.h"
|
||||
#ifdef HAVE_FFTW
|
||||
#include <fftw3.h>
|
||||
#else
|
||||
#include "pffft.h"
|
||||
#endif
|
||||
#include "dsp-ops.h"
|
||||
|
||||
void dsp_clear_c(struct dsp_ops *ops, void * SPA_RESTRICT dst, uint32_t n_samples)
|
||||
|
|
@ -209,26 +215,75 @@ void dsp_delay_c(struct dsp_ops *ops, float *buffer, uint32_t *pos, uint32_t n_b
|
|||
}
|
||||
}
|
||||
|
||||
#ifdef HAVE_FFTW
|
||||
struct fft_info {
|
||||
fftwf_plan plan_r2c;
|
||||
fftwf_plan plan_c2r;
|
||||
};
|
||||
#endif
|
||||
|
||||
void *dsp_fft_new_c(struct dsp_ops *ops, int32_t size, bool real)
|
||||
{
|
||||
#ifdef HAVE_FFTW
|
||||
struct fft_info *info = calloc(1, sizeof(struct fft_info));
|
||||
float *rdata;
|
||||
fftwf_complex *cdata;
|
||||
|
||||
if (info == NULL)
|
||||
return NULL;
|
||||
|
||||
rdata = fftwf_alloc_real (size * 2);
|
||||
cdata = fftwf_alloc_complex (size + 1);
|
||||
|
||||
info->plan_r2c = fftwf_plan_dft_r2c_1d(size, rdata, cdata, FFTW_ESTIMATE);
|
||||
info->plan_c2r = fftwf_plan_dft_c2r_1d(size, cdata, rdata, FFTW_ESTIMATE);
|
||||
|
||||
fftwf_free(rdata);
|
||||
fftwf_free(cdata);
|
||||
|
||||
return info;
|
||||
#else
|
||||
return pffft_new_setup(size, real ? PFFFT_REAL : PFFFT_COMPLEX);
|
||||
#endif
|
||||
}
|
||||
|
||||
void dsp_fft_free_c(struct dsp_ops *ops, void *fft)
|
||||
{
|
||||
#ifdef HAVE_FFTW
|
||||
struct fft_info *info = fft;
|
||||
fftwf_destroy_plan(info->plan_r2c);
|
||||
fftwf_destroy_plan(info->plan_c2r);
|
||||
free(info);
|
||||
#else
|
||||
pffft_destroy_setup(fft);
|
||||
#endif
|
||||
}
|
||||
void dsp_fft_run_c(struct dsp_ops *ops, void *fft, int direction,
|
||||
const float * SPA_RESTRICT src, float * SPA_RESTRICT dst)
|
||||
{
|
||||
#ifdef HAVE_FFTW
|
||||
struct fft_info *info = fft;
|
||||
if (direction > 0)
|
||||
fftwf_execute_dft_r2c (info->plan_r2c, (float*)src, (fftwf_complex*)dst);
|
||||
else
|
||||
fftwf_execute_dft_c2r (info->plan_c2r, (fftwf_complex*)src, dst);
|
||||
#else
|
||||
pffft_transform(fft, src, dst, NULL, direction < 0 ? PFFFT_BACKWARD : PFFFT_FORWARD);
|
||||
#endif
|
||||
}
|
||||
|
||||
void dsp_fft_cmul_c(struct dsp_ops *ops, void *fft,
|
||||
float * SPA_RESTRICT dst, const float * SPA_RESTRICT a,
|
||||
const float * SPA_RESTRICT b, uint32_t len, const float scale)
|
||||
{
|
||||
#ifdef HAVE_FFTW
|
||||
for (uint32_t i = 0; i <= len; i++) {
|
||||
dst[2*i ] = (a[2*i] * b[2*i ] - a[2*i+1] * b[2*i+1]) * scale;
|
||||
dst[2*i+1] = (a[2*i] * b[2*i+1] + a[2*i+1] * b[2*i ]) * scale;
|
||||
}
|
||||
#else
|
||||
pffft_zconvolve(fft, a, b, dst, scale);
|
||||
#endif
|
||||
}
|
||||
|
||||
void dsp_fft_cmuladd_c(struct dsp_ops *ops, void *fft,
|
||||
|
|
@ -236,6 +291,13 @@ void dsp_fft_cmuladd_c(struct dsp_ops *ops, void *fft,
|
|||
const float * SPA_RESTRICT a, const float * SPA_RESTRICT b,
|
||||
uint32_t len, const float scale)
|
||||
{
|
||||
#ifdef HAVE_FFTW
|
||||
for (uint32_t i = 0; i <= len; i++) {
|
||||
dst[2*i ] += (a[2*i] * b[2*i ] - a[2*i+1] * b[2*i+1]) * scale;
|
||||
dst[2*i+1] += (a[2*i] * b[2*i+1] + a[2*i+1] * b[2*i ]) * scale;
|
||||
}
|
||||
#else
|
||||
pffft_zconvolve_accumulate(fft, a, b, src, dst, scale);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -6,9 +6,12 @@
|
|||
#include <stdio.h>
|
||||
#include <math.h>
|
||||
#include <float.h>
|
||||
#include <complex.h>
|
||||
|
||||
#include <spa/utils/defs.h>
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#include "dsp-ops.h"
|
||||
|
||||
#include <xmmintrin.h>
|
||||
|
|
@ -533,3 +536,100 @@ void dsp_delay_sse(struct dsp_ops *ops, float *buffer, uint32_t *pos, uint32_t n
|
|||
}
|
||||
*pos = w;
|
||||
}
|
||||
|
||||
inline static void _mm_mul_pz(__m128 *a, __m128 *b, __m128 *d)
|
||||
{
|
||||
__m128 ar, ai, br, bi, arbr, arbi, aibi, aibr, dr, di;
|
||||
ar = _mm_shuffle_ps(a[0], a[1], _MM_SHUFFLE(2,0,2,0)); /* ar0 ar1 ar2 ar3 */
|
||||
ai = _mm_shuffle_ps(a[0], a[1], _MM_SHUFFLE(3,1,3,1)); /* ai0 ai1 ai2 ai3 */
|
||||
br = _mm_shuffle_ps(b[0], b[1], _MM_SHUFFLE(2,0,2,0)); /* br0 br1 br2 br3 */
|
||||
bi = _mm_shuffle_ps(b[0], b[1], _MM_SHUFFLE(3,1,3,1)) /* bi0 bi1 bi2 bi3 */;
|
||||
|
||||
arbr = _mm_mul_ps(ar, br); /* ar * br */
|
||||
arbi = _mm_mul_ps(ar, bi); /* ar * bi */
|
||||
|
||||
aibi = _mm_mul_ps(ai, bi); /* ai * bi */
|
||||
aibr = _mm_mul_ps(ai, br); /* ai * br */
|
||||
|
||||
dr = _mm_sub_ps(arbr, aibi); /* ar * br - ai * bi */
|
||||
di = _mm_add_ps(arbi, aibr); /* ar * bi + ai * br */
|
||||
d[0] = _mm_unpacklo_ps(dr, di);
|
||||
d[1] = _mm_unpackhi_ps(dr, di);
|
||||
}
|
||||
|
||||
void dsp_fft_cmul_sse(struct dsp_ops *ops, void *fft,
|
||||
float * SPA_RESTRICT dst, const float * SPA_RESTRICT a,
|
||||
const float * SPA_RESTRICT b, uint32_t len, const float scale)
|
||||
{
|
||||
#ifdef HAVE_FFTW
|
||||
__m128 s = _mm_set1_ps(scale);
|
||||
__m128 aa[2], bb[2], dd[2];
|
||||
uint32_t i, unrolled;
|
||||
|
||||
if (SPA_IS_ALIGNED(a, 16) &&
|
||||
SPA_IS_ALIGNED(b, 16) &&
|
||||
SPA_IS_ALIGNED(dst, 16))
|
||||
unrolled = len & ~3;
|
||||
else
|
||||
unrolled = 0;
|
||||
|
||||
for (i = 0; i < unrolled; i+=4) {
|
||||
aa[0] = _mm_load_ps(&a[2*i]); /* ar0 ai0 ar1 ai1 */
|
||||
aa[1] = _mm_load_ps(&a[2*i+4]); /* ar1 ai1 ar2 ai2 */
|
||||
bb[0] = _mm_load_ps(&b[2*i]); /* br0 bi0 br1 bi1 */
|
||||
bb[1] = _mm_load_ps(&b[2*i+4]); /* br2 bi2 br3 bi3 */
|
||||
_mm_mul_pz(aa, bb, dd);
|
||||
dd[0] = _mm_mul_ps(dd[0], s);
|
||||
dd[1] = _mm_mul_ps(dd[1], s);
|
||||
_mm_store_ps(&dst[2*i], dd[0]);
|
||||
_mm_store_ps(&dst[2*i+4], dd[1]);
|
||||
}
|
||||
for (; i < len; i++) {
|
||||
dst[2*i ] = (a[2*i] * b[2*i ] - a[2*i+1] * b[2*i+1]) * scale;
|
||||
dst[2*i+1] = (a[2*i] * b[2*i+1] + a[2*i+1] * b[2*i ]) * scale;
|
||||
}
|
||||
#else
|
||||
pffft_zconvolve(fft, a, b, dst, scale);
|
||||
#endif
|
||||
}
|
||||
|
||||
void dsp_fft_cmuladd_sse(struct dsp_ops *ops, void *fft,
|
||||
float * SPA_RESTRICT dst, const float * SPA_RESTRICT src,
|
||||
const float * SPA_RESTRICT a, const float * SPA_RESTRICT b,
|
||||
uint32_t len, const float scale)
|
||||
{
|
||||
#ifdef HAVE_FFTW
|
||||
__m128 s = _mm_set1_ps(scale);
|
||||
__m128 aa[2], bb[2], dd[2], t[1];
|
||||
uint32_t i, unrolled;
|
||||
|
||||
if (SPA_IS_ALIGNED(a, 16) &&
|
||||
SPA_IS_ALIGNED(b, 16) &&
|
||||
SPA_IS_ALIGNED(dst, 16))
|
||||
unrolled = len & ~3;
|
||||
else
|
||||
unrolled = 0;
|
||||
|
||||
for (i = 0; i < unrolled; i+=4) {
|
||||
aa[0] = _mm_load_ps(&a[2*i]); /* ar0 ai0 ar1 ai1 */
|
||||
aa[1] = _mm_load_ps(&a[2*i+4]); /* ar1 ai1 ar2 ai2 */
|
||||
bb[0] = _mm_load_ps(&b[2*i]); /* br0 bi0 br1 bi1 */
|
||||
bb[1] = _mm_load_ps(&b[2*i+4]); /* br2 bi2 br3 bi3 */
|
||||
_mm_mul_pz(aa, bb, dd);
|
||||
dd[0] = _mm_mul_ps(dd[0], s);
|
||||
dd[1] = _mm_mul_ps(dd[1], s);
|
||||
t[0] = _mm_load_ps(&dst[2*i]);
|
||||
t[1] = _mm_load_ps(&dst[2*i+4]);
|
||||
t[0] = _mm_add_ps(t[0], dd[0]);
|
||||
t[1] = _mm_add_ps(t[1], dd[1]);
|
||||
_mm_store_ps(&dst[2*i], t[0]);
|
||||
_mm_store_ps(&dst[2*i+4], t[1]);
|
||||
}
|
||||
for (; i < len; i++) {
|
||||
dst[2*i ] += (a[2*i] * b[2*i ] - a[2*i+1] * b[2*i+1]) * scale;
|
||||
dst[2*i+1] += (a[2*i] * b[2*i+1] + a[2*i+1] * b[2*i ]) * scale;
|
||||
}
|
||||
#else
|
||||
pffft_zconvolve_accumulate(fft, a, b, src, dst, scale);
|
||||
#endif
|
||||
}
|
||||
|
|
|
|||
|
|
@ -33,8 +33,8 @@ static struct dsp_info dsp_table[] =
|
|||
.funcs.fft_new = dsp_fft_new_c,
|
||||
.funcs.fft_free = dsp_fft_free_c,
|
||||
.funcs.fft_run = dsp_fft_run_c,
|
||||
.funcs.fft_cmul = dsp_fft_cmul_c,
|
||||
.funcs.fft_cmuladd = dsp_fft_cmuladd_c,
|
||||
.funcs.fft_cmul = dsp_fft_cmul_avx,
|
||||
.funcs.fft_cmuladd = dsp_fft_cmuladd_avx,
|
||||
.funcs.biquadn_run = dsp_biquadn_run_sse,
|
||||
.funcs.delay = dsp_delay_sse,
|
||||
},
|
||||
|
|
@ -51,8 +51,8 @@ static struct dsp_info dsp_table[] =
|
|||
.funcs.fft_new = dsp_fft_new_c,
|
||||
.funcs.fft_free = dsp_fft_free_c,
|
||||
.funcs.fft_run = dsp_fft_run_c,
|
||||
.funcs.fft_cmul = dsp_fft_cmul_c,
|
||||
.funcs.fft_cmuladd = dsp_fft_cmuladd_c,
|
||||
.funcs.fft_cmul = dsp_fft_cmul_sse,
|
||||
.funcs.fft_cmuladd = dsp_fft_cmuladd_sse,
|
||||
.funcs.biquadn_run = dsp_biquadn_run_sse,
|
||||
.funcs.delay = dsp_delay_sse,
|
||||
},
|
||||
|
|
|
|||
|
|
@ -148,10 +148,14 @@ MAKE_SUM_FUNC(sse);
|
|||
MAKE_BIQUAD_RUN_FUNC(sse);
|
||||
MAKE_BIQUADN_RUN_FUNC(sse);
|
||||
MAKE_DELAY_FUNC(sse);
|
||||
MAKE_FFT_CMUL_FUNC(sse);
|
||||
MAKE_FFT_CMULADD_FUNC(sse);
|
||||
#endif
|
||||
#if defined (HAVE_AVX)
|
||||
MAKE_MIX_GAIN_FUNC(avx);
|
||||
MAKE_SUM_FUNC(avx);
|
||||
MAKE_FFT_CMUL_FUNC(avx);
|
||||
MAKE_FFT_CMULADD_FUNC(avx);
|
||||
#endif
|
||||
|
||||
#endif /* DSP_OPS_H */
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue