filter-chain: improve pffft compilation and CPU support

Compile separate pffft versions, one per supported SIMD instruction set, plus a plain C fallback.
Select the matching version at runtime from the detected CPU flags.

See #1543
Wim Taymans 2021-08-24 12:04:09 +02:00
parent 0f5face73f
commit 123fe3d1c5
7 changed files with 243 additions and 90 deletions
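
The approach in miniature: pffft.c is built once per instruction set, each build exports its own table of function pointers, and the module installs whichever table matches the CPU flags detected at runtime. Below is a minimal, self-contained sketch of that pattern; every name in it is hypothetical, not an actual pffft symbol.

#include <stdio.h>

/* Each CPU-specific build of the same source exports its own table
 * of function pointers; the caller installs one at runtime. */
struct fns {
	const char *name;
	float (*sum2)(float a, float b);
};

/* stand-ins for the per-CPU builds; real ones would use intrinsics */
static float sum2_c(float a, float b)   { return a + b; }
static float sum2_sse(float a, float b) { return a + b; }

static struct fns fns_c   = { "c",   sum2_c };
static struct fns fns_sse = { "sse", sum2_sse };
static struct fns *fns = &fns_c;	/* scalar fallback by default */

#define FAKE_CPU_FLAG_SSE (1 << 0)	/* hypothetical flag */

static void select_cpu(int flags)
{
	if (flags & FAKE_CPU_FLAG_SSE)
		fns = &fns_sse;
}

int main(void)
{
	select_cpu(FAKE_CPU_FLAG_SSE);
	printf("%s: %.1f\n", fns->name, fns->sum2(1.0f, 2.0f));
	return 0;
}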

@@ -45,17 +45,50 @@ pipewire_module_loopback = shared_library('pipewire-module-loopback',
dependencies : [mathlib, dl_lib, pipewire_dep],
)
simd_cargs = []
simd_dependencies = []
if have_sse
pffft_sse = static_library('pffft_sse',
['module-filter-chain/pffft.c' ],
c_args : [sse_args, '-O3', '-DHAVE_SSE'],
include_directories : [spa_inc],
install : false
)
simd_cargs += ['-DHAVE_SSE']
simd_dependencies += pffft_sse
endif
if have_neon
pffft_neon = static_library('pffft_neon',
['module-filter-chain/pffft.c' ],
c_args : [neon_args, '-O3', '-DHAVE_NEON'],
include_directories : [spa_inc],
install : false
)
simd_cargs += ['-DHAVE_NEON']
simd_dependencies += pffft_neon
endif
pffft_c = static_library('pffft_c',
['module-filter-chain/pffft.c' ],
c_args : [simd_cargs, '-O3', '-DPFFFT_SIMD_DISABLE'],
include_directories : [spa_inc],
install : false
)
simd_dependencies += pffft_c
pipewire_module_filter_chain = shared_library('pipewire-module-filter-chain',
[ 'module-filter-chain.c',
'module-filter-chain/biquad.c',
'module-filter-chain/ladspa_plugin.c',
'module-filter-chain/builtin_plugin.c',
'module-filter-chain/pffft.c',
'module-filter-chain/convolver.c' ],
include_directories : [configinc, spa_inc],
install : true,
install_dir : modules_install_dir,
install_rpath: modules_install_dir,
link_with : simd_dependencies,
dependencies : [mathlib, dl_lib, pipewire_dep, sndfile_dep],
)
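
Building module-filter-chain/pffft.c three times would normally emit three copies of every public symbol and fail at link time. As the pffft.c hunks further down show, the commit avoids this by making the internals static and renaming the one exported table per build with macros. A stripped-down sketch of the renaming trick, with hypothetical names:

/* The same translation unit, compiled three times with different
 * -D flags, exports three distinct symbols. */
#if defined(HAVE_SSE)
#define my_funcs my_funcs_sse	/* the SSE build exports my_funcs_sse */
#elif defined(HAVE_NEON)
#define my_funcs my_funcs_neon	/* the NEON build exports my_funcs_neon */
#else
#define my_funcs my_funcs_c	/* the scalar build exports my_funcs_c */
#endif

struct my_table { int simd_size; };
struct my_table my_funcs = { 1 };	/* renamed per build by the macro above */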

@@ -39,6 +39,7 @@
#include <spa/utils/string.h>
#include <spa/utils/json.h>
#include <spa/param/profiler.h>
#include <spa/support/cpu.h>
#include <spa/debug/pod.h>
#include <pipewire/utils.h>
@@ -1629,6 +1630,9 @@ int pipewire__module_init(struct pw_impl_module *module, const char *args)
struct impl *impl;
uint32_t id = pw_global_get_id(pw_impl_module_get_global(module));
const char *str;
const struct spa_support *support;
uint32_t n_support;
struct spa_cpu *cpu_iface;
int res;
impl = calloc(1, sizeof(struct impl));
@@ -1637,6 +1641,10 @@ int pipewire__module_init(struct pw_impl_module *module, const char *args)
pw_log_debug("module %p: new %s", impl, args);
support = pw_context_get_support(context, &n_support);
cpu_iface = spa_support_find(support, n_support, SPA_TYPE_INTERFACE_CPU);
init_builtin_plugin(cpu_iface ? spa_cpu_get_flags(cpu_iface) : 0);
if (args)
props = pw_properties_new_string(args);
else
@@ -1727,7 +1735,6 @@ int pipewire__module_init(struct pw_impl_module *module, const char *args)
pw_log_error("can't connect: %m");
goto error;
}
pw_properties_free(props);
pw_proxy_add_listener((struct pw_proxy*)impl->core,
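
For reference, the CPU query added above, condensed into one self-contained helper; it uses only the public calls visible in the hunk:

#include <spa/support/cpu.h>
#include <pipewire/pipewire.h>

/* Fetch the SPA CPU interface from the context's support array and
 * return its flags, or 0 if no CPU interface is available. */
static uint32_t get_cpu_flags(struct pw_context *context)
{
	const struct spa_support *support;
	uint32_t n_support;
	struct spa_cpu *cpu;

	support = pw_context_get_support(context, &n_support);
	cpu = spa_support_find(support, n_support, SPA_TYPE_INTERFACE_CPU);
	return cpu ? spa_cpu_get_flags(cpu) : 0;
}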

@@ -34,6 +34,7 @@
#include "plugin.h"
#include "biquad.h"
#include "pffft.h"
#include "convolver.h"
struct builtin {
@@ -622,3 +623,8 @@ struct fc_plugin *load_builtin_plugin(const char *plugin, const char *config)
{
return &builtin_plugin;
}
void init_builtin_plugin(uint32_t cpu_flags)
{
pffft_select_cpu(cpu_flags);
}

@@ -29,7 +29,6 @@
#include <spa/utils/defs.h>
#include <math.h>
#include <xmmintrin.h>
#include "pffft.h"
@@ -132,6 +131,11 @@ static inline void fft_convolve_accum(void *fft, struct fft_cpx *r,
pffft_zconvolve_accumulate(fft, a->v, b->v, r->v, scale);
}
static inline void fft_sum(float *r, const float *a, const float *b,int len)
{
pffft_sum(a, b, r, len);
}
static struct convolver1 *convolver1_new(int block, const float *ir, int irlen)
{
struct convolver1 *conv;
@@ -211,25 +215,6 @@ static void convolver1_free(struct convolver1 *conv)
free(conv);
}
void Sum(float* result, const float* a, const float* b, int len)
{
int i;
#if defined (__SSE__)
const int end4 = 4 * (len / 4);
for (i = 0; i < end4; i += 4) {
const __m128 va = _mm_load_ps(&a[i]);
const __m128 vb = _mm_load_ps(&b[i]);
_mm_store_ps(&result[i], _mm_add_ps(va,vb));
}
for (i = end4; i < len; ++i) {
result[i] = a[i] + b[i];
}
#else
for (i = 0; i < len; i++)
result[i] = a[i] + b[i];
#endif
}
static int convolver1_run(struct convolver1 *conv, const float *input, float *output, int len)
{
int i, processed = 0;
@@ -270,7 +255,7 @@ static int convolver1_run(struct convolver1 *conv, const float *input, float *output, int len)
ifft_run(conv->ifft, &conv->conv, conv->fft_buffer);
Sum(output + processed, conv->fft_buffer + inputBufferPos, conv->overlap + inputBufferPos, processing);
fft_sum(output + processed, conv->fft_buffer + inputBufferPos, conv->overlap + inputBufferPos, processing);
conv->inputBufferFill += processing;
if (conv->inputBufferFill == conv->blockSize) {
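
The hand-rolled SSE Sum() is dropped in favour of fft_sum(), a thin wrapper over the new pffft_sum(); the summation now uses whichever SIMD variant was selected at runtime rather than whatever __SSE__ happened to be set at compile time. A usage sketch with hypothetical buffer names:

#include "pffft.h"

/* Overlap-add with the dispatched sum: out[i] = fft_buf[i] + overlap[i].
 * Buffers are assumed to come from pffft_aligned_malloc() so the SIMD
 * variants can rely on alignment. */
static void overlap_add(const float *fft_buf, const float *overlap,
			float *out, int n)
{
	pffft_sum(fft_buf, overlap, out, n);
}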

@@ -60,8 +60,11 @@
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <stdint.h>
#include <assert.h>
#include <spa/support/cpu.h>
/* detect compiler flavour */
#if defined(_MSC_VER)
#define COMPILER_MSVC
@@ -93,7 +96,7 @@
/*
Altivec support macros
*/
#if !defined(PFFFT_SIMD_DISABLE) && (defined(__ppc__) || defined(__ppc64__))
#if !defined(PFFFT_SIMD_DISABLE) && (defined(HAVE_ALTIVEC))
typedef vector float v4sf;
#define SIMD_SZ 4
#define VZERO() ((vector float) vec_splat_u8(0))
@@ -125,12 +128,18 @@ inline v4sf ld_ps1(const float *p)
x3 = vec_mergel(y1, y3); \
}
#define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char)(16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15))
#define VALIGNED(ptr) ((((long long)(ptr)) & 0xF) == 0)
#define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0)
#define pffft_funcs pffft_funcs_altivec
#define new_setup_simd new_setup_altivec
#define zreorder_simd zreorder_altivec
#define zconvolve_accumulate_simd zconvolve_accumulate_altivec
#define transform_simd transform_altivec
#define sum_simd sum_altivec
/*
SSE1 support macros
*/
#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(i386) || defined(_M_IX86))
#elif !defined(PFFFT_SIMD_DISABLE) && (defined(HAVE_SSE))
#include <xmmintrin.h>
typedef __m128 v4sf;
@@ -145,12 +154,18 @@ typedef __m128 v4sf;
#define UNINTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; }
#define VTRANSPOSE4(x0,x1,x2,x3) _MM_TRANSPOSE4_PS(x0,x1,x2,x3)
#define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0))
#define VALIGNED(ptr) ((((long long)(ptr)) & 0xF) == 0)
#define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0)
#define pffft_funcs pffft_funcs_sse
#define new_setup_simd new_setup_sse
#define zreorder_simd zreorder_sse
#define zconvolve_accumulate_simd zconvolve_accumulate_sse
#define transform_simd transform_sse
#define sum_simd sum_sse
/*
ARM NEON support macros
*/
#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__arm__) || defined(__aarch64__) || defined(__arm64__))
#elif !defined(PFFFT_SIMD_DISABLE) && (defined(HAVE_NEON))
#include <arm_neon.h>
typedef float32x4_t v4sf;
#define SIMD_SZ 4
@@ -172,7 +187,13 @@ typedef float32x4_t v4sf;
// marginally faster version
//# define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); }
#define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a))
#define VALIGNED(ptr) ((((long long)(ptr)) & 0x3) == 0)
#define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x3) == 0)
#define pffft_funcs pffft_funcs_neon
#define new_setup_simd new_setup_neon
#define zreorder_simd zreorder_neon
#define zconvolve_accumulate_simd zconvolve_accumulate_neon
#define transform_simd transform_neon
#define sum_simd sum_neon
#else
#if !defined(PFFFT_SIMD_DISABLE)
#warning "building with simd disabled !\n";
@@ -190,7 +211,13 @@ typedef float v4sf;
#define VMADD(a,b,c) ((a)*(b)+(c))
#define VSUB(a,b) ((a)-(b))
#define LD_PS1(p) (p)
#define VALIGNED(ptr) ((((long long)(ptr)) & 0x3) == 0)
#define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x3) == 0)
#define pffft_funcs pffft_funcs_c
#define new_setup_simd new_setup_c
#define zreorder_simd zreorder_c
#define zconvolve_accumulate_simd zconvolve_accumulate_c
#define transform_simd transform_c
#define sum_simd sum_c
#endif
// shortcuts for complex multiplcations
@@ -212,7 +239,7 @@ typedef union v4sf_union {
#define assertv4(v,f0,f1,f2,f3) assert(v.f[0] == (f0) && v.f[1] == (f1) && v.f[2] == (f2) && v.f[3] == (f3))
/* detect bugs with the vector support macros */
void validate_pffft_simd()
static void validate_pffft_simd()
{
float f[16] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
v4sf_union a0, a1, a2, a3, t, u;
@@ -270,35 +297,11 @@ void validate_pffft_simd()
assertv4(a3, 3, 7, 11, 15);
}
#else
void validate_pffft_simd()
static void validate_pffft_simd()
{
} // allow test_pffft.c to call this function even when simd is not available..
#endif //!PFFFT_SIMD_DISABLE
/* SSE and co like 16-bytes aligned pointers */
#define MALLOC_V4SF_ALIGNMENT 64 // with a 64-byte alignment, we are even aligned on L2 cache lines...
void *pffft_aligned_malloc(size_t nb_bytes)
{
void *p, *p0 = malloc(nb_bytes + MALLOC_V4SF_ALIGNMENT);
if (!p0)
return (void *)0;
p = (void *)(((size_t)p0 + MALLOC_V4SF_ALIGNMENT) &
(~((size_t)(MALLOC_V4SF_ALIGNMENT - 1))));
*((void **)p - 1) = p0;
return p;
}
void pffft_aligned_free(void *p)
{
if (p)
free(*((void **)p - 1));
}
int pffft_simd_size()
{
return SIMD_SZ;
}
/*
passf2 and passb2 has been merged here, fsign = -1 for passf2, +1 for passb2
*/
@@ -1297,7 +1300,7 @@ static void rffti1_ps(int n, float *wa, int *ifac)
}
} /* rffti1 */
void cffti1_ps(int n, float *wa, int *ifac)
static void cffti1_ps(int n, float *wa, int *ifac)
{
static const int ntryh[] = { 5, 3, 4, 2, 0 };
int k1, j, ii;
@@ -1335,7 +1338,7 @@ void cffti1_ps(int n, float *wa, int *ifac)
}
} /* cffti1 */
v4sf *cfftf1_ps(int n, const v4sf * input_readonly, v4sf * work1, v4sf * work2,
static v4sf *cfftf1_ps(int n, const v4sf * input_readonly, v4sf * work1, v4sf * work2,
const float *wa, const int *ifac, int isign)
{
v4sf *in = (v4sf *) input_readonly;
@@ -1399,7 +1402,17 @@ struct PFFFT_Setup {
float *twiddle; // points into 'data', N/4 elements
};
PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform)
struct funcs {
PFFFT_Setup * (*new_setup) (int N, pffft_transform_t transform);
void (*transform) (PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction, int ordered);
void (*zreorder)(PFFFT_Setup *setup, const float *input, float *output, pffft_direction_t direction);
void (*zconvolve_accumulate)(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling);
void (*sum)(const float *a, const float *b, float *ab, int len);
int (*simd_size)(void);
void (*validate)(void);
};
static PFFFT_Setup *new_setup_simd(int N, pffft_transform_t transform)
{
PFFFT_Setup *s = (PFFFT_Setup *) malloc(sizeof(PFFFT_Setup));
int k, m;
@@ -1462,12 +1475,6 @@ PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform)
return s;
}
void pffft_destroy_setup(PFFFT_Setup * s)
{
pffft_aligned_free(s->data);
free(s);
}
#if !defined(PFFFT_SIMD_DISABLE)
/* [0 0 1 2 3 4 5 6 7 8] -> [0 8 7 6 5 4 3 2 1] */
@@ -1512,7 +1519,7 @@ static void unreversed_copy(int N, const v4sf * in, v4sf * out, int out_stride)
UNINTERLEAVE2(h0, g1, out[0], out[1]);
}
void pffft_zreorder(PFFFT_Setup * setup, const float *in, float *out,
static void zreorder_simd(PFFFT_Setup * setup, const float *in, float *out,
pffft_direction_t direction)
{
int k, N = setup->N, Ncvec = setup->Ncvec;
@@ -1563,7 +1570,7 @@ void pffft_zreorder(PFFFT_Setup * setup, const float *in, float *out,
}
}
void pffft_cplx_finalize(int Ncvec, const v4sf * in, v4sf * out, const v4sf * e)
static void pffft_cplx_finalize(int Ncvec, const v4sf * in, v4sf * out, const v4sf * e)
{
int k, dk = Ncvec / SIMD_SZ; // number of 4x4 matrix blocks
v4sf r0, i0, r1, i1, r2, i2, r3, i3;
@@ -1626,7 +1633,7 @@ void pffft_cplx_finalize(int Ncvec, const v4sf * in, v4sf * out, const v4sf * e)
}
}
void pffft_cplx_preprocess(int Ncvec, const v4sf * in, v4sf * out,
static void pffft_cplx_preprocess(int Ncvec, const v4sf * in, v4sf * out,
const v4sf * e)
{
int k, dk = Ncvec / SIMD_SZ; // number of 4x4 matrix blocks
@@ -1908,8 +1915,8 @@ static NEVER_INLINE(void) pffft_real_preprocess(int Ncvec, const v4sf * in,
uout[2 * Ncvec - 1].f[3] = ci3;
}
void pffft_transform_internal(PFFFT_Setup * setup, const float *finput,
float *foutput, v4sf * scratch,
static void transform_simd(PFFFT_Setup * setup, const float *finput,
float *foutput, float * scratch,
pffft_direction_t direction, int ordered)
{
int k, Ncvec = setup->Ncvec;
@@ -1921,7 +1928,7 @@ void pffft_transform_internal(PFFFT_Setup * setup, const float *finput,
const v4sf *vinput = (const v4sf *)finput;
v4sf *voutput = (v4sf *) foutput;
v4sf *buff[2] = { voutput, scratch ? scratch : scratch_on_stack };
v4sf *buff[2] = { voutput, scratch ? (v4sf*)scratch : scratch_on_stack };
int ib = (nf_odd ^ ordered ? 1 : 0);
assert(VALIGNED(finput) && VALIGNED(foutput));
@@ -1998,7 +2005,7 @@ void pffft_transform_internal(PFFFT_Setup * setup, const float *finput,
assert(buff[ib] == voutput);
}
void pffft_zconvolve_accumulate(PFFFT_Setup * s, const float *a, const float *b,
static void zconvolve_accumulate_simd(PFFFT_Setup * s, const float *a, const float *b,
float *ab, float scaling)
{
int Ncvec = s->Ncvec;
@@ -2101,12 +2108,25 @@ void pffft_zconvolve_accumulate(PFFFT_Setup * s, const float *a, const float *b,
}
}
static void sum_simd(const float *a, const float *b, float *ab, int len)
{
const v4sf *RESTRICT va = (const v4sf *)a;
const v4sf *RESTRICT vb = (const v4sf *)b;
v4sf *RESTRICT vab = (v4sf *) ab;
int i;
const int end4 = len / SIMD_SZ;
for (i = 0; i < end4; i += 1)
vab[i] = VADD(va[i],vb[i]);
for (i = i * 4; i < len; ++i)
ab[i] = a[i] + b[i];
}
#else // defined(PFFFT_SIMD_DISABLE)
// standard routine using scalar floats, without SIMD stuff.
#define pffft_zreorder_nosimd pffft_zreorder
void pffft_zreorder_nosimd(PFFFT_Setup * setup, const float *in, float *out,
static void zreorder_simd(PFFFT_Setup * setup, const float *in, float *out,
pffft_direction_t direction)
{
int k, N = setup->N;
@@ -2129,8 +2149,7 @@ void pffft_zreorder_nosimd(PFFFT_Setup * setup, const float *in, float *out,
}
}
#define pffft_transform_internal_nosimd pffft_transform_internal
void pffft_transform_internal_nosimd(PFFFT_Setup * setup, const float *input,
static void transform_simd(PFFFT_Setup * setup, const float *input,
float *output, float *scratch,
pffft_direction_t direction, int ordered)
{
@@ -2198,8 +2217,7 @@ void pffft_transform_internal_nosimd(PFFFT_Setup * setup, const float *input,
assert(buff[ib] == output);
}
#define pffft_zconvolve_accumulate_nosimd pffft_zconvolve_accumulate
void pffft_zconvolve_accumulate_nosimd(PFFFT_Setup * s, const float *a,
static void zconvolve_accumulate_simd(PFFFT_Setup * s, const float *a,
const float *b, float *ab, float scaling)
{
int i, Ncvec = s->Ncvec;
@@ -2225,20 +2243,120 @@ void pffft_zconvolve_accumulate_nosimd(PFFFT_Setup * s, const float *a,
ab[2 * i + 1] += ai * scaling;
}
}
static void sum_simd(const float *a, const float *b, float *ab, int len)
{
int i;
for (i = 0; i < len; ++i)
ab[i] = VADD(a[i], b[i]);
}
#endif // defined(PFFFT_SIMD_DISABLE)
void pffft_transform(PFFFT_Setup * setup, const float *input, float *output,
float *work, pffft_direction_t direction)
static int simd_size_simd(void)
{
pffft_transform_internal(setup, input, output, (v4sf *) work, direction,
0);
return SIMD_SZ;
}
void pffft_transform_ordered(PFFFT_Setup * setup, const float *input,
float *output, float *work,
pffft_direction_t direction)
struct funcs pffft_funcs = {
.new_setup = new_setup_simd,
.transform = transform_simd,
.zreorder = zreorder_simd,
.zconvolve_accumulate = zconvolve_accumulate_simd,
.sum = sum_simd,
.simd_size = simd_size_simd,
.validate = validate_pffft_simd,
};
#if defined(PFFFT_SIMD_DISABLE)
extern struct funcs pffft_funcs_c;
#if (defined(HAVE_SSE))
extern struct funcs pffft_funcs_sse;
#endif
#if (defined(HAVE_ALTIVEC))
extern struct funcs pffft_funcs_altivec;
#endif
#if (defined(HAVE_NEON))
extern struct funcs pffft_funcs_neon;
#endif
static struct funcs *funcs = &pffft_funcs_c;
/* SSE and co like 16-bytes aligned pointers */
#define MALLOC_V4SF_ALIGNMENT 64 // with a 64-byte alignment, we are even aligned on L2 cache lines...
void *pffft_aligned_malloc(size_t nb_bytes)
{
pffft_transform_internal(setup, input, output, (v4sf *) work, direction,
1);
void *p, *p0 = malloc(nb_bytes + MALLOC_V4SF_ALIGNMENT);
if (!p0)
return (void *)0;
p = (void *)(((size_t)p0 + MALLOC_V4SF_ALIGNMENT) &
(~((size_t)(MALLOC_V4SF_ALIGNMENT - 1))));
*((void **)p - 1) = p0;
return p;
}
void pffft_aligned_free(void *p)
{
if (p)
free(*((void **)p - 1));
}
int pffft_simd_size(void)
{
return funcs->simd_size();
}
PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform)
{
return funcs->new_setup(N, transform);
}
void pffft_destroy_setup(PFFFT_Setup * s)
{
pffft_aligned_free(s->data);
free(s);
}
void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction)
{
return funcs->transform(setup, input, output, work, direction, 0);
}
void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction)
{
return funcs->transform(setup, input, output, work, direction, 1);
}
void pffft_zreorder(PFFFT_Setup *setup, const float *input, float *output, pffft_direction_t direction)
{
return funcs->zreorder(setup, input, output, direction);
}
void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling)
{
return funcs->zconvolve_accumulate(setup, dft_a, dft_b, dft_ab, scaling);
}
void pffft_sum(const float *a, const float *b, float *ab, int len)
{
return funcs->sum(a, b, ab, len);
}
void pffft_select_cpu(int flags)
{
funcs = &pffft_funcs_c;
#if defined(HAVE_SSE)
if (flags & SPA_CPU_FLAG_SSE)
funcs = &pffft_funcs_sse;
#endif
#if defined(HAVE_NEON)
if (flags & SPA_CPU_FLAG_NEON)
funcs = &pffft_funcs_neon;
#endif
#if defined(HAVE_ALTIVEC)
if (flags & SPA_CPU_FLAG_ALTIVEC)
funcs = &pffft_funcs_altivec;
#endif
}
#endif
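
Putting the pieces together: a caller selects the CPU once and then uses the unchanged public API, with every call routed through the installed table. A sketch of end-to-end use; the size and zeroed input are illustrative, and cpu_flags would come from the SPA CPU interface as in the module-filter-chain.c hunk above:

#include <string.h>
#include "pffft.h"

static void run_fft(int cpu_flags)
{
	int n = 1024;			/* must be a size pffft supports */
	PFFFT_Setup *s;
	float *in, *out, *work;

	pffft_select_cpu(cpu_flags);	/* install the SSE/NEON/AltiVec/C table */

	s = pffft_new_setup(n, PFFFT_REAL);
	in   = pffft_aligned_malloc(n * sizeof(float));
	out  = pffft_aligned_malloc(n * sizeof(float));
	work = pffft_aligned_malloc(n * sizeof(float));
	memset(in, 0, n * sizeof(float));

	pffft_transform_ordered(s, in, out, work, PFFFT_FORWARD);

	pffft_aligned_free(work);
	pffft_aligned_free(out);
	pffft_aligned_free(in);
	pffft_destroy_setup(s);
}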

@@ -159,6 +159,7 @@ extern "C" {
*/
void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling);
void pffft_sum(const float *a, const float *b, float *ab, int len);
/*
the float buffers must have the correct alignment (16-byte boundary
on intel and powerpc). This function may be used to obtain such
@@ -170,6 +171,8 @@ extern "C" {
/* return 4 or 1 wether support SSE/Altivec instructions was enable when building pffft.c */
int pffft_simd_size();
void pffft_select_cpu(int flags);
#ifdef __cplusplus
}
#endif
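
The alignment requirement the header describes is met by pffft_aligned_malloc(), which over-allocates, rounds the pointer up, and stashes the original pointer one slot before the returned block so the free side can recover it. The same trick in isolation, as a hypothetical helper; it assumes align is a power of two no smaller than malloc's own alignment:

#include <stdlib.h>
#include <stdint.h>

static void *my_aligned_malloc(size_t n, size_t align)
{
	void *p0 = malloc(n + align);	/* room to round up and to stash p0 */
	if (p0 == NULL)
		return NULL;
	void *p = (void *)(((uintptr_t)p0 + align) & ~(uintptr_t)(align - 1));
	((void **)p)[-1] = p0;		/* original pointer, recovered on free */
	return p;
}

static void my_aligned_free(void *p)
{
	if (p)
		free(((void **)p)[-1]);
}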

@@ -94,3 +94,4 @@ static inline void fc_descriptor_free(struct fc_descriptor *desc)
struct fc_plugin *load_ladspa_plugin(const char *path, const char *config);
struct fc_plugin *load_builtin_plugin(const char *path, const char *config);
void init_builtin_plugin(uint32_t cpu_flags);