mirror of
https://gitlab.freedesktop.org/pipewire/pipewire.git
synced 2025-11-01 22:58:50 -04:00
filter-chain: improve pffft compilation and CPU support
Compile different pffft versions per CPU. Plug the right version depending on the runtime CPU. See #1543
This commit is contained in:
parent
0f5face73f
commit
123fe3d1c5
7 changed files with 243 additions and 90 deletions
|
|
@ -45,17 +45,50 @@ pipewire_module_loopback = shared_library('pipewire-module-loopback',
|
|||
dependencies : [mathlib, dl_lib, pipewire_dep],
|
||||
)
|
||||
|
||||
# Per-CPU pffft builds: each enabled instruction set gets its own static
# library compiled with matching compiler flags; pffft.c picks the right
# implementation at runtime via pffft_select_cpu().
simd_cargs = []
simd_dependencies = []

if have_sse
  # SSE variant of pffft, compiled with sse_args and HAVE_SSE.
  pffft_sse = static_library('pffft_sse',
    ['module-filter-chain/pffft.c' ],
    c_args : [sse_args, '-O3', '-DHAVE_SSE'],
    include_directories : [spa_inc],
    install : false
  )
  simd_cargs += ['-DHAVE_SSE']
  simd_dependencies += pffft_sse
endif
if have_neon
  # NEON variant of pffft for ARM targets.
  pffft_neon = static_library('pffft_neon',
    ['module-filter-chain/pffft.c' ],
    c_args : [neon_args, '-O3', '-DHAVE_NEON'],
    include_directories : [spa_inc],
    install : false
  )
  simd_cargs += ['-DHAVE_NEON']
  simd_dependencies += pffft_neon
endif

# Scalar/dispatcher build: PFFFT_SIMD_DISABLE selects the plain C code
# path; simd_cargs tells it which SIMD variants were also compiled so it
# can reference their function tables at runtime.
pffft_c = static_library('pffft_c',
  ['module-filter-chain/pffft.c' ],
  c_args : [simd_cargs, '-O3', '-DPFFFT_SIMD_DISABLE'],
  include_directories : [spa_inc],
  install : false
)
simd_dependencies += pffft_c


pipewire_module_filter_chain = shared_library('pipewire-module-filter-chain',
  [ 'module-filter-chain.c',
    'module-filter-chain/biquad.c',
    'module-filter-chain/ladspa_plugin.c',
    'module-filter-chain/builtin_plugin.c',
    'module-filter-chain/pffft.c',
    'module-filter-chain/convolver.c' ],
  include_directories : [configinc, spa_inc],
  install : true,
  install_dir : modules_install_dir,
  install_rpath: modules_install_dir,
  link_with : simd_dependencies,
  dependencies : [mathlib, dl_lib, pipewire_dep, sndfile_dep],
)
|
||||
|
||||
|
|
|
|||
|
|
@ -39,6 +39,7 @@
|
|||
#include <spa/utils/string.h>
|
||||
#include <spa/utils/json.h>
|
||||
#include <spa/param/profiler.h>
|
||||
#include <spa/support/cpu.h>
|
||||
#include <spa/debug/pod.h>
|
||||
|
||||
#include <pipewire/utils.h>
|
||||
|
|
@ -1629,6 +1630,9 @@ int pipewire__module_init(struct pw_impl_module *module, const char *args)
|
|||
struct impl *impl;
|
||||
uint32_t id = pw_global_get_id(pw_impl_module_get_global(module));
|
||||
const char *str;
|
||||
const struct spa_support *support;
|
||||
uint32_t n_support;
|
||||
struct spa_cpu *cpu_iface;
|
||||
int res;
|
||||
|
||||
impl = calloc(1, sizeof(struct impl));
|
||||
|
|
@ -1637,6 +1641,10 @@ int pipewire__module_init(struct pw_impl_module *module, const char *args)
|
|||
|
||||
pw_log_debug("module %p: new %s", impl, args);
|
||||
|
||||
support = pw_context_get_support(context, &n_support);
|
||||
cpu_iface = spa_support_find(support, n_support, SPA_TYPE_INTERFACE_CPU);
|
||||
init_builtin_plugin(cpu_iface ? spa_cpu_get_flags(cpu_iface) : 0);
|
||||
|
||||
if (args)
|
||||
props = pw_properties_new_string(args);
|
||||
else
|
||||
|
|
@ -1727,7 +1735,6 @@ int pipewire__module_init(struct pw_impl_module *module, const char *args)
|
|||
pw_log_error("can't connect: %m");
|
||||
goto error;
|
||||
}
|
||||
|
||||
pw_properties_free(props);
|
||||
|
||||
pw_proxy_add_listener((struct pw_proxy*)impl->core,
|
||||
|
|
|
|||
|
|
@ -34,6 +34,7 @@
|
|||
#include "plugin.h"
|
||||
|
||||
#include "biquad.h"
|
||||
#include "pffft.h"
|
||||
#include "convolver.h"
|
||||
|
||||
struct builtin {
|
||||
|
|
@ -622,3 +623,8 @@ struct fc_plugin *load_builtin_plugin(const char *plugin, const char *config)
|
|||
{
|
||||
return &builtin_plugin;
|
||||
}
|
||||
|
||||
/* One-time initialization of the builtin plugin: select the pffft
 * implementation (SSE/NEON/Altivec/scalar) matching the detected CPU
 * feature flags (SPA_CPU_FLAG_*). */
void init_builtin_plugin(uint32_t cpu_flags)
{
	pffft_select_cpu(cpu_flags);
}
|
||||
|
|
|
|||
|
|
@ -29,7 +29,6 @@
|
|||
#include <spa/utils/defs.h>
|
||||
|
||||
#include <math.h>
|
||||
#include <xmmintrin.h>
|
||||
|
||||
#include "pffft.h"
|
||||
|
||||
|
|
@ -132,6 +131,11 @@ static inline void fft_convolve_accum(void *fft, struct fft_cpx *r,
|
|||
pffft_zconvolve_accumulate(fft, a->v, b->v, r->v, scale);
|
||||
}
|
||||
|
||||
/* Element-wise sum r[i] = a[i] + b[i] over len floats, delegated to the
 * CPU-selected pffft implementation (note pffft_sum's output-last
 * argument order). */
static inline void fft_sum(float *r, const float *a, const float *b,int len)
{
	pffft_sum(a, b, r, len);
}
|
||||
|
||||
static struct convolver1 *convolver1_new(int block, const float *ir, int irlen)
|
||||
{
|
||||
struct convolver1 *conv;
|
||||
|
|
@ -211,25 +215,6 @@ static void convolver1_free(struct convolver1 *conv)
|
|||
free(conv);
|
||||
}
|
||||
|
||||
/*
 * Element-wise sum: result[i] = a[i] + b[i] for len floats.
 *
 * Fix: the SSE path used _mm_load_ps/_mm_store_ps, which require 16-byte
 * aligned pointers, but callers pass arbitrary offsets such as
 * `output + processed` and `fft_buffer + inputBufferPos`, which are not
 * guaranteed aligned — a misaligned access faults.  Use the unaligned
 * variants: identical results, no alignment trap, and equally fast for
 * already-aligned data on any recent x86 core.
 */
void Sum(float* result, const float* a, const float* b, int len)
{
	int i;
#if defined (__SSE__)
	/* vectorized main loop over whole groups of 4 floats */
	const int end4 = 4 * (len / 4);
	for (i = 0; i < end4; i += 4) {
		const __m128 va = _mm_loadu_ps(&a[i]);
		const __m128 vb = _mm_loadu_ps(&b[i]);
		_mm_storeu_ps(&result[i], _mm_add_ps(va, vb));
	}
	/* scalar tail for the remaining 0-3 elements */
	for (i = end4; i < len; ++i) {
		result[i] = a[i] + b[i];
	}
#else
	for (i = 0; i < len; i++)
		result[i] = a[i] + b[i];
#endif
}
|
||||
|
||||
static int convolver1_run(struct convolver1 *conv, const float *input, float *output, int len)
|
||||
{
|
||||
int i, processed = 0;
|
||||
|
|
@ -270,7 +255,7 @@ static int convolver1_run(struct convolver1 *conv, const float *input, float *ou
|
|||
|
||||
ifft_run(conv->ifft, &conv->conv, conv->fft_buffer);
|
||||
|
||||
Sum(output + processed, conv->fft_buffer + inputBufferPos, conv->overlap + inputBufferPos, processing);
|
||||
fft_sum(output + processed, conv->fft_buffer + inputBufferPos, conv->overlap + inputBufferPos, processing);
|
||||
|
||||
conv->inputBufferFill += processing;
|
||||
if (conv->inputBufferFill == conv->blockSize) {
|
||||
|
|
|
|||
|
|
@ -60,8 +60,11 @@
|
|||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <math.h>
|
||||
#include <stdint.h>
|
||||
#include <assert.h>
|
||||
|
||||
#include <spa/support/cpu.h>
|
||||
|
||||
/* detect compiler flavour */
|
||||
#if defined(_MSC_VER)
|
||||
#define COMPILER_MSVC
|
||||
|
|
@ -81,19 +84,19 @@
|
|||
#define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ *varname__ = (type__*)_alloca(size__ * sizeof(type__))
|
||||
#endif
|
||||
|
||||
/*
|
||||
/*
|
||||
vector support macros: the rest of the code is independant of
|
||||
SSE/Altivec/NEON -- adding support for other platforms with 4-element
|
||||
vectors should be limited to these macros
|
||||
vectors should be limited to these macros
|
||||
*/
|
||||
|
||||
// define PFFFT_SIMD_DISABLE if you want to use scalar code instead of simd code
|
||||
//#define PFFFT_SIMD_DISABLE
|
||||
|
||||
/*
|
||||
Altivec support macros
|
||||
Altivec support macros
|
||||
*/
|
||||
#if !defined(PFFFT_SIMD_DISABLE) && (defined(__ppc__) || defined(__ppc64__))
|
||||
#if !defined(PFFFT_SIMD_DISABLE) && (defined(HAVE_ALTIVEC))
|
||||
typedef vector float v4sf;
|
||||
#define SIMD_SZ 4
|
||||
#define VZERO() ((vector float) vec_splat_u8(0))
|
||||
|
|
@ -125,12 +128,18 @@ inline v4sf ld_ps1(const float *p)
|
|||
x3 = vec_mergel(y1, y3); \
|
||||
}
|
||||
#define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char)(16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15))
|
||||
#define VALIGNED(ptr) ((((long long)(ptr)) & 0xF) == 0)
|
||||
#define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0)
|
||||
#define pffft_funcs pffft_funcs_altivec
|
||||
#define new_setup_simd new_setup_altivec
|
||||
#define zreorder_simd zreorder_altivec
|
||||
#define zconvolve_accumulate_simd zconvolve_accumulate_altivec
|
||||
#define transform_simd transform_altivec
|
||||
#define sum_simd sum_altivec
|
||||
|
||||
/*
|
||||
SSE1 support macros
|
||||
*/
|
||||
#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(i386) || defined(_M_IX86))
|
||||
#elif !defined(PFFFT_SIMD_DISABLE) && (defined(HAVE_SSE))
|
||||
|
||||
#include <xmmintrin.h>
|
||||
typedef __m128 v4sf;
|
||||
|
|
@ -145,12 +154,18 @@ typedef __m128 v4sf;
|
|||
#define UNINTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; }
|
||||
#define VTRANSPOSE4(x0,x1,x2,x3) _MM_TRANSPOSE4_PS(x0,x1,x2,x3)
|
||||
#define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0))
|
||||
#define VALIGNED(ptr) ((((long long)(ptr)) & 0xF) == 0)
|
||||
#define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0)
|
||||
#define pffft_funcs pffft_funcs_sse
|
||||
#define new_setup_simd new_setup_sse
|
||||
#define zreorder_simd zreorder_sse
|
||||
#define zconvolve_accumulate_simd zconvolve_accumulate_sse
|
||||
#define transform_simd transform_sse
|
||||
#define sum_simd sum_sse
|
||||
|
||||
/*
|
||||
ARM NEON support macros
|
||||
*/
|
||||
#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__arm__) || defined(__aarch64__) || defined(__arm64__))
|
||||
#elif !defined(PFFFT_SIMD_DISABLE) && (defined(HAVE_NEON))
|
||||
#include <arm_neon.h>
|
||||
typedef float32x4_t v4sf;
|
||||
#define SIMD_SZ 4
|
||||
|
|
@ -172,7 +187,13 @@ typedef float32x4_t v4sf;
|
|||
// marginally faster version
|
||||
//# define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); }
|
||||
#define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a))
|
||||
#define VALIGNED(ptr) ((((long long)(ptr)) & 0x3) == 0)
|
||||
#define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x3) == 0)
|
||||
#define pffft_funcs pffft_funcs_neon
|
||||
#define new_setup_simd new_setup_neon
|
||||
#define zreorder_simd zreorder_neon
|
||||
#define zconvolve_accumulate_simd zconvolve_accumulate_neon
|
||||
#define transform_simd transform_neon
|
||||
#define sum_simd sum_neon
|
||||
#else
|
||||
#if !defined(PFFFT_SIMD_DISABLE)
|
||||
#warning "building with simd disabled !\n";
|
||||
|
|
@ -190,7 +211,13 @@ typedef float v4sf;
|
|||
#define VMADD(a,b,c) ((a)*(b)+(c))
|
||||
#define VSUB(a,b) ((a)-(b))
|
||||
#define LD_PS1(p) (p)
|
||||
#define VALIGNED(ptr) ((((long long)(ptr)) & 0x3) == 0)
|
||||
#define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x3) == 0)
|
||||
#define pffft_funcs pffft_funcs_c
|
||||
#define new_setup_simd new_setup_c
|
||||
#define zreorder_simd zreorder_c
|
||||
#define zconvolve_accumulate_simd zconvolve_accumulate_c
|
||||
#define transform_simd transform_c
|
||||
#define sum_simd sum_c
|
||||
#endif
|
||||
|
||||
// shortcuts for complex multiplcations
|
||||
|
|
@ -212,7 +239,7 @@ typedef union v4sf_union {
|
|||
#define assertv4(v,f0,f1,f2,f3) assert(v.f[0] == (f0) && v.f[1] == (f1) && v.f[2] == (f2) && v.f[3] == (f3))
|
||||
|
||||
/* detect bugs with the vector support macros */
|
||||
void validate_pffft_simd()
|
||||
static void validate_pffft_simd()
|
||||
{
|
||||
float f[16] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
|
||||
v4sf_union a0, a1, a2, a3, t, u;
|
||||
|
|
@ -270,35 +297,11 @@ void validate_pffft_simd()
|
|||
assertv4(a3, 3, 7, 11, 15);
|
||||
}
|
||||
#else
|
||||
void validate_pffft_simd()
|
||||
static void validate_pffft_simd()
|
||||
{
|
||||
} // allow test_pffft.c to call this function even when simd is not available..
|
||||
#endif //!PFFFT_SIMD_DISABLE
|
||||
|
||||
/* SSE and co like 16-bytes aligned pointers */
#define MALLOC_V4SF_ALIGNMENT 64 // with a 64-byte alignment, we are even aligned on L2 cache lines...

/*
 * Allocate nb_bytes with MALLOC_V4SF_ALIGNMENT alignment.  Over-allocates
 * by one alignment unit, rounds the address up, and stashes the original
 * malloc() pointer in the void* slot just below the returned address so
 * that pffft_aligned_free() can recover it.  Returns NULL on failure.
 */
void *pffft_aligned_malloc(size_t nb_bytes)
{
	void *raw = malloc(nb_bytes + MALLOC_V4SF_ALIGNMENT);
	size_t addr;

	if (raw == NULL)
		return NULL;

	addr = ((size_t)raw + MALLOC_V4SF_ALIGNMENT) &
		~((size_t)(MALLOC_V4SF_ALIGNMENT - 1));
	((void **)addr)[-1] = raw;
	return (void *)addr;
}
||||
|
||||
/*
 * Release memory obtained from pffft_aligned_malloc(); NULL is a no-op.
 * The original malloc() pointer is stored one void* below the aligned
 * address.
 */
void pffft_aligned_free(void *p)
{
	if (p == NULL)
		return;
	free(((void **)p)[-1]);
}
|
||||
|
||||
/* Number of floats per SIMD vector in this build of pffft: SIMD_SZ is 4
 * for the SSE/Altivec/NEON variants and 1 for the scalar fallback. */
int pffft_simd_size()
{
	return SIMD_SZ;
}
|
||||
|
||||
/*
|
||||
passf2 and passb2 has been merged here, fsign = -1 for passf2, +1 for passb2
|
||||
*/
|
||||
|
|
@ -1297,7 +1300,7 @@ static void rffti1_ps(int n, float *wa, int *ifac)
|
|||
}
|
||||
} /* rffti1 */
|
||||
|
||||
void cffti1_ps(int n, float *wa, int *ifac)
|
||||
static void cffti1_ps(int n, float *wa, int *ifac)
|
||||
{
|
||||
static const int ntryh[] = { 5, 3, 4, 2, 0 };
|
||||
int k1, j, ii;
|
||||
|
|
@ -1335,7 +1338,7 @@ void cffti1_ps(int n, float *wa, int *ifac)
|
|||
}
|
||||
} /* cffti1 */
|
||||
|
||||
v4sf *cfftf1_ps(int n, const v4sf * input_readonly, v4sf * work1, v4sf * work2,
|
||||
static v4sf *cfftf1_ps(int n, const v4sf * input_readonly, v4sf * work1, v4sf * work2,
|
||||
const float *wa, const int *ifac, int isign)
|
||||
{
|
||||
v4sf *in = (v4sf *) input_readonly;
|
||||
|
|
@ -1399,7 +1402,17 @@ struct PFFFT_Setup {
|
|||
float *twiddle; // points into 'data', N/4 elements
|
||||
};
|
||||
|
||||
PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform)
|
||||
struct funcs {
|
||||
PFFFT_Setup * (*new_setup) (int N, pffft_transform_t transform);
|
||||
void (*transform) (PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction, int ordered);
|
||||
void (*zreorder)(PFFFT_Setup *setup, const float *input, float *output, pffft_direction_t direction);
|
||||
void (*zconvolve_accumulate)(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling);
|
||||
void (*sum)(const float *a, const float *b, float *ab, int len);
|
||||
int (*simd_size)(void);
|
||||
void (*validate)(void);
|
||||
};
|
||||
|
||||
static PFFFT_Setup *new_setup_simd(int N, pffft_transform_t transform)
|
||||
{
|
||||
PFFFT_Setup *s = (PFFFT_Setup *) malloc(sizeof(PFFFT_Setup));
|
||||
int k, m;
|
||||
|
|
@ -1462,12 +1475,6 @@ PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform)
|
|||
return s;
|
||||
}
|
||||
|
||||
void pffft_destroy_setup(PFFFT_Setup * s)
|
||||
{
|
||||
pffft_aligned_free(s->data);
|
||||
free(s);
|
||||
}
|
||||
|
||||
#if !defined(PFFFT_SIMD_DISABLE)
|
||||
|
||||
/* [0 0 1 2 3 4 5 6 7 8] -> [0 8 7 6 5 4 3 2 1] */
|
||||
|
|
@ -1512,7 +1519,7 @@ static void unreversed_copy(int N, const v4sf * in, v4sf * out, int out_stride)
|
|||
UNINTERLEAVE2(h0, g1, out[0], out[1]);
|
||||
}
|
||||
|
||||
void pffft_zreorder(PFFFT_Setup * setup, const float *in, float *out,
|
||||
static void zreorder_simd(PFFFT_Setup * setup, const float *in, float *out,
|
||||
pffft_direction_t direction)
|
||||
{
|
||||
int k, N = setup->N, Ncvec = setup->Ncvec;
|
||||
|
|
@ -1563,7 +1570,7 @@ void pffft_zreorder(PFFFT_Setup * setup, const float *in, float *out,
|
|||
}
|
||||
}
|
||||
|
||||
void pffft_cplx_finalize(int Ncvec, const v4sf * in, v4sf * out, const v4sf * e)
|
||||
static void pffft_cplx_finalize(int Ncvec, const v4sf * in, v4sf * out, const v4sf * e)
|
||||
{
|
||||
int k, dk = Ncvec / SIMD_SZ; // number of 4x4 matrix blocks
|
||||
v4sf r0, i0, r1, i1, r2, i2, r3, i3;
|
||||
|
|
@ -1626,7 +1633,7 @@ void pffft_cplx_finalize(int Ncvec, const v4sf * in, v4sf * out, const v4sf * e)
|
|||
}
|
||||
}
|
||||
|
||||
void pffft_cplx_preprocess(int Ncvec, const v4sf * in, v4sf * out,
|
||||
static void pffft_cplx_preprocess(int Ncvec, const v4sf * in, v4sf * out,
|
||||
const v4sf * e)
|
||||
{
|
||||
int k, dk = Ncvec / SIMD_SZ; // number of 4x4 matrix blocks
|
||||
|
|
@ -1908,8 +1915,8 @@ static NEVER_INLINE(void) pffft_real_preprocess(int Ncvec, const v4sf * in,
|
|||
uout[2 * Ncvec - 1].f[3] = ci3;
|
||||
}
|
||||
|
||||
void pffft_transform_internal(PFFFT_Setup * setup, const float *finput,
|
||||
float *foutput, v4sf * scratch,
|
||||
static void transform_simd(PFFFT_Setup * setup, const float *finput,
|
||||
float *foutput, float * scratch,
|
||||
pffft_direction_t direction, int ordered)
|
||||
{
|
||||
int k, Ncvec = setup->Ncvec;
|
||||
|
|
@ -1921,7 +1928,7 @@ void pffft_transform_internal(PFFFT_Setup * setup, const float *finput,
|
|||
|
||||
const v4sf *vinput = (const v4sf *)finput;
|
||||
v4sf *voutput = (v4sf *) foutput;
|
||||
v4sf *buff[2] = { voutput, scratch ? scratch : scratch_on_stack };
|
||||
v4sf *buff[2] = { voutput, scratch ? (v4sf*)scratch : scratch_on_stack };
|
||||
int ib = (nf_odd ^ ordered ? 1 : 0);
|
||||
|
||||
assert(VALIGNED(finput) && VALIGNED(foutput));
|
||||
|
|
@ -1998,7 +2005,7 @@ void pffft_transform_internal(PFFFT_Setup * setup, const float *finput,
|
|||
assert(buff[ib] == voutput);
|
||||
}
|
||||
|
||||
void pffft_zconvolve_accumulate(PFFFT_Setup * s, const float *a, const float *b,
|
||||
static void zconvolve_accumulate_simd(PFFFT_Setup * s, const float *a, const float *b,
|
||||
float *ab, float scaling)
|
||||
{
|
||||
int Ncvec = s->Ncvec;
|
||||
|
|
@ -2101,12 +2108,25 @@ void pffft_zconvolve_accumulate(PFFFT_Setup * s, const float *a, const float *b,
|
|||
}
|
||||
}
|
||||
|
||||
static void sum_simd(const float *a, const float *b, float *ab, int len)
|
||||
{
|
||||
const v4sf *RESTRICT va = (const v4sf *)a;
|
||||
const v4sf *RESTRICT vb = (const v4sf *)b;
|
||||
v4sf *RESTRICT vab = (v4sf *) ab;
|
||||
int i;
|
||||
const int end4 = len / SIMD_SZ;
|
||||
|
||||
for (i = 0; i < end4; i += 1)
|
||||
vab[i] = VADD(va[i],vb[i]);
|
||||
for (i = i * 4; i < len; ++i)
|
||||
ab[i] = a[i] + b[i];
|
||||
}
|
||||
|
||||
#else // defined(PFFFT_SIMD_DISABLE)
|
||||
|
||||
// standard routine using scalar floats, without SIMD stuff.
|
||||
|
||||
#define pffft_zreorder_nosimd pffft_zreorder
|
||||
void pffft_zreorder_nosimd(PFFFT_Setup * setup, const float *in, float *out,
|
||||
static void zreorder_simd(PFFFT_Setup * setup, const float *in, float *out,
|
||||
pffft_direction_t direction)
|
||||
{
|
||||
int k, N = setup->N;
|
||||
|
|
@ -2129,8 +2149,7 @@ void pffft_zreorder_nosimd(PFFFT_Setup * setup, const float *in, float *out,
|
|||
}
|
||||
}
|
||||
|
||||
#define pffft_transform_internal_nosimd pffft_transform_internal
|
||||
void pffft_transform_internal_nosimd(PFFFT_Setup * setup, const float *input,
|
||||
static void transform_simd(PFFFT_Setup * setup, const float *input,
|
||||
float *output, float *scratch,
|
||||
pffft_direction_t direction, int ordered)
|
||||
{
|
||||
|
|
@ -2198,8 +2217,7 @@ void pffft_transform_internal_nosimd(PFFFT_Setup * setup, const float *input,
|
|||
assert(buff[ib] == output);
|
||||
}
|
||||
|
||||
#define pffft_zconvolve_accumulate_nosimd pffft_zconvolve_accumulate
|
||||
void pffft_zconvolve_accumulate_nosimd(PFFFT_Setup * s, const float *a,
|
||||
static void zconvolve_accumulate_simd(PFFFT_Setup * s, const float *a,
|
||||
const float *b, float *ab, float scaling)
|
||||
{
|
||||
int i, Ncvec = s->Ncvec;
|
||||
|
|
@ -2225,20 +2243,120 @@ void pffft_zconvolve_accumulate_nosimd(PFFFT_Setup * s, const float *a,
|
|||
ab[2 * i + 1] += ai * scaling;
|
||||
}
|
||||
}
|
||||
/* Scalar fallback of sum_simd: ab[i] = a[i] + b[i].  In the
 * PFFFT_SIMD_DISABLE build VADD expands to a plain float addition. */
static void sum_simd(const float *a, const float *b, float *ab, int len)
{
	int i;
	for (i = 0; i < len; ++i)
		ab[i] = VADD(a[i], b[i]);
}
|
||||
|
||||
#endif // defined(PFFFT_SIMD_DISABLE)
|
||||
|
||||
void pffft_transform(PFFFT_Setup * setup, const float *input, float *output,
|
||||
float *work, pffft_direction_t direction)
|
||||
static int simd_size_simd(void)
|
||||
{
|
||||
pffft_transform_internal(setup, input, output, (v4sf *) work, direction,
|
||||
0);
|
||||
return SIMD_SZ;
|
||||
}
|
||||
|
||||
void pffft_transform_ordered(PFFFT_Setup * setup, const float *input,
|
||||
float *output, float *work,
|
||||
pffft_direction_t direction)
|
||||
struct funcs pffft_funcs = {
|
||||
.new_setup = new_setup_simd,
|
||||
.transform = transform_simd,
|
||||
.zreorder = zreorder_simd,
|
||||
.zconvolve_accumulate = zconvolve_accumulate_simd,
|
||||
.sum = sum_simd,
|
||||
.simd_size = simd_size_simd,
|
||||
.validate = validate_pffft_simd,
|
||||
};
|
||||
|
||||
#if defined(PFFFT_SIMD_DISABLE)
|
||||
|
||||
extern struct funcs pffft_funcs_c;
|
||||
#if (defined(HAVE_SSE))
|
||||
extern struct funcs pffft_funcs_sse;
|
||||
#endif
|
||||
#if (defined(HAVE_ALTIVEC))
|
||||
extern struct funcs pffft_funcs_altivec;
|
||||
#endif
|
||||
#if (defined(HAVE_NEON))
|
||||
extern struct funcs pffft_funcs_neon;
|
||||
#endif
|
||||
|
||||
static struct funcs *funcs = &pffft_funcs_c;
|
||||
|
||||
/* SSE and co like 16-bytes aligned pointers */
|
||||
#define MALLOC_V4SF_ALIGNMENT 64 // with a 64-byte alignment, we are even aligned on L2 cache lines...
|
||||
void *pffft_aligned_malloc(size_t nb_bytes)
|
||||
{
|
||||
pffft_transform_internal(setup, input, output, (v4sf *) work, direction,
|
||||
1);
|
||||
void *p, *p0 = malloc(nb_bytes + MALLOC_V4SF_ALIGNMENT);
|
||||
if (!p0)
|
||||
return (void *)0;
|
||||
p = (void *)(((size_t)p0 + MALLOC_V4SF_ALIGNMENT) &
|
||||
(~((size_t)(MALLOC_V4SF_ALIGNMENT - 1))));
|
||||
*((void **)p - 1) = p0;
|
||||
return p;
|
||||
}
|
||||
|
||||
void pffft_aligned_free(void *p)
|
||||
{
|
||||
if (p)
|
||||
free(*((void **)p - 1));
|
||||
}
|
||||
|
||||
/* Public API: SIMD width of the implementation currently selected by
 * pffft_select_cpu() (4 for SIMD variants, 1 for scalar). */
int pffft_simd_size(void)
{
	return funcs->simd_size();
}
|
||||
|
||||
/* Public API: create a transform setup for size N, dispatching to the
 * implementation selected by pffft_select_cpu().  Free with
 * pffft_destroy_setup(). */
PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform)
{
	return funcs->new_setup(N, transform);
}
|
||||
|
||||
/*
 * Public API: free a setup created by pffft_new_setup(), releasing the
 * aligned data block and the setup itself.
 *
 * Fix: accept NULL (free()-like contract); previously a NULL setup
 * crashed on the s->data dereference.
 */
void pffft_destroy_setup(PFFFT_Setup * s)
{
	if (s == NULL)
		return;
	pffft_aligned_free(s->data);
	free(s);
}
|
||||
|
||||
/*
 * Public API: unordered transform, dispatched to the implementation
 * selected by pffft_select_cpu().
 *
 * Fix: `return <void expression>;` in a void function is a C constraint
 * violation (C11 6.8.6.4, unlike C++); drop the `return`.
 */
void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction)
{
	funcs->transform(setup, input, output, work, direction, 0);
}
|
||||
|
||||
/*
 * Public API: ordered transform (canonical output order), dispatched to
 * the runtime-selected implementation.
 *
 * Fix: `return <void expression>;` in a void function is a C constraint
 * violation (C11 6.8.6.4); drop the `return`.
 */
void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction)
{
	funcs->transform(setup, input, output, work, direction, 1);
}
|
||||
|
||||
/*
 * Public API: reorder frequency-domain data between pffft's internal
 * layout and canonical order, dispatched to the selected implementation.
 *
 * Fix: `return <void expression>;` in a void function is a C constraint
 * violation (C11 6.8.6.4); drop the `return`.
 */
void pffft_zreorder(PFFFT_Setup *setup, const float *input, float *output, pffft_direction_t direction)
{
	funcs->zreorder(setup, input, output, direction);
}
|
||||
|
||||
/*
 * Public API: dft_ab += dft_a * dft_b * scaling (frequency-domain
 * convolution), dispatched to the selected implementation.
 *
 * Fix: `return <void expression>;` in a void function is a C constraint
 * violation (C11 6.8.6.4); drop the `return`.
 */
void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling)
{
	funcs->zconvolve_accumulate(setup, dft_a, dft_b, dft_ab, scaling);
}
|
||||
|
||||
void pffft_sum(const float *a, const float *b, float *ab, int len)
|
||||
{
|
||||
return funcs->sum(a, b, ab, len);
|
||||
}
|
||||
|
||||
/* Select the pffft implementation matching the runtime CPU feature flags
 * (SPA_CPU_FLAG_*).  Starts from the scalar fallback; each compiled-in
 * SIMD variant overrides it when its flag is present.  The HAVE_* macros
 * are mutually exclusive per target architecture, so at most one of the
 * conditional blocks is compiled in. */
void pffft_select_cpu(int flags)
{
	funcs = &pffft_funcs_c;
#if defined(HAVE_SSE)
	if (flags & SPA_CPU_FLAG_SSE)
		funcs = &pffft_funcs_sse;
#endif
#if defined(HAVE_NEON)
	if (flags & SPA_CPU_FLAG_NEON)
		funcs = &pffft_funcs_neon;
#endif
#if defined(HAVE_ALTIVEC)
	if (flags & SPA_CPU_FLAG_ALTIVEC)
		funcs = &pffft_funcs_altivec;
#endif
}
|
||||
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -159,6 +159,7 @@ extern "C" {
|
|||
*/
|
||||
void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling);
|
||||
|
||||
void pffft_sum(const float *a, const float *b, float *ab, int len);
|
||||
/*
|
||||
the float buffers must have the correct alignment (16-byte boundary
|
||||
on intel and powerpc). This function may be used to obtain such
|
||||
|
|
@ -170,6 +171,8 @@ extern "C" {
|
|||
/* return 4 or 1 depending on whether SSE/Altivec support was enabled when building pffft.c */
|
||||
int pffft_simd_size();
|
||||
|
||||
void pffft_select_cpu(int flags);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -94,3 +94,4 @@ static inline void fc_descriptor_free(struct fc_descriptor *desc)
|
|||
|
||||
struct fc_plugin *load_ladspa_plugin(const char *path, const char *config);
|
||||
struct fc_plugin *load_builtin_plugin(const char *path, const char *config);
|
||||
void init_builtin_plugin(uint32_t cpu_flags);
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue