mirror of
https://gitlab.freedesktop.org/pipewire/pipewire.git
synced 2025-11-06 13:30:01 -05:00
filter-chain: optimize convolver some more
Add a function that convolves without accumulating, and use it where possible to avoid some memset and memcpy calls.
This commit is contained in:
parent
1fe54e5f17
commit
994630cb3a
3 changed files with 161 additions and 45 deletions
|
|
@ -58,6 +58,7 @@ struct convolver1 {
|
|||
int inputBufferFill;
|
||||
|
||||
int current;
|
||||
float scale;
|
||||
};
|
||||
|
||||
static void *fft_alloc(int size)
|
||||
|
|
@ -82,11 +83,6 @@ static void fft_cpx_free(struct fft_cpx *cpx)
|
|||
fft_free(cpx->v);
|
||||
}
|
||||
|
||||
static void fft_cpx_clear(struct fft_cpx *cpx, int size)
|
||||
{
|
||||
memset(cpx->v, 0, sizeof(float) * 2 * size);
|
||||
}
|
||||
|
||||
static void fft_cpx_copy(struct fft_cpx *dst, struct fft_cpx *src, int size)
|
||||
{
|
||||
memcpy(dst->v, src->v, sizeof(float) * 2 * size);
|
||||
|
|
@ -125,6 +121,11 @@ static inline void ifft_run(void *ifft, struct fft_cpx *in, float *out)
|
|||
pffft_transform(ifft, in->v, out, NULL, PFFFT_BACKWARD);
|
||||
}
|
||||
|
||||
static inline void fft_convolve(void *fft, struct fft_cpx *r,
|
||||
const struct fft_cpx *a, const struct fft_cpx *b, int len, float scale)
|
||||
{
|
||||
pffft_zconvolve(fft, a->v, b->v, r->v, scale);
|
||||
}
|
||||
static inline void fft_convolve_accum(void *fft, struct fft_cpx *r,
|
||||
const struct fft_cpx *a, const struct fft_cpx *b, int len, float scale)
|
||||
{
|
||||
|
|
@ -192,6 +193,7 @@ static struct convolver1 *convolver1_new(int block, const float *ir, int irlen)
|
|||
conv->inputBuffer = fft_alloc(sizeof(float) * conv->segSize);
|
||||
conv->inputBufferFill = 0;
|
||||
conv->current = 0;
|
||||
conv->scale = 1.0f / conv->segSize;
|
||||
|
||||
return conv;
|
||||
}
|
||||
|
|
@ -235,23 +237,36 @@ static int convolver1_run(struct convolver1 *conv, const float *input, float *ou
|
|||
|
||||
fft_run(conv->fft, conv->inputBuffer, &conv->segments[conv->current]);
|
||||
|
||||
if (conv->inputBufferFill == 0) {
|
||||
fft_cpx_clear(&conv->pre_mult, conv->fftComplexSize);
|
||||
if (conv->segCount > 1) {
|
||||
if (conv->inputBufferFill == 0) {
|
||||
int indexAudio = (conv->current + 1) % conv->segCount;
|
||||
|
||||
for (i = 1; i < conv->segCount; i++) {
|
||||
const int indexIr = i;
|
||||
const int indexAudio = (conv->current + i) % conv->segCount;
|
||||
|
||||
fft_convolve_accum(conv->fft, &conv->pre_mult,
|
||||
&conv->segmentsIr[indexIr],
|
||||
fft_convolve(conv->fft, &conv->pre_mult,
|
||||
&conv->segmentsIr[1],
|
||||
&conv->segments[indexAudio],
|
||||
conv->fftComplexSize, 1.0f / conv->segSize);
|
||||
}
|
||||
}
|
||||
fft_cpx_copy(&conv->conv, &conv->pre_mult, conv->fftComplexSize);
|
||||
conv->fftComplexSize, conv->scale);
|
||||
|
||||
fft_convolve_accum(conv->fft, &conv->conv, &conv->segments[conv->current], &conv->segmentsIr[0],
|
||||
conv->fftComplexSize, 1.0f / conv->segSize);
|
||||
for (i = 2; i < conv->segCount; i++) {
|
||||
indexAudio = (conv->current + i) % conv->segCount;
|
||||
|
||||
fft_convolve_accum(conv->fft, &conv->pre_mult,
|
||||
&conv->segmentsIr[i],
|
||||
&conv->segments[indexAudio],
|
||||
conv->fftComplexSize, conv->scale);
|
||||
}
|
||||
}
|
||||
fft_cpx_copy(&conv->conv, &conv->pre_mult, conv->fftComplexSize);
|
||||
|
||||
fft_convolve_accum(conv->fft, &conv->conv,
|
||||
&conv->segments[conv->current],
|
||||
&conv->segmentsIr[0],
|
||||
conv->fftComplexSize, conv->scale);
|
||||
} else {
|
||||
fft_convolve(conv->fft, &conv->conv,
|
||||
&conv->segments[conv->current],
|
||||
&conv->segmentsIr[0],
|
||||
conv->fftComplexSize, conv->scale);
|
||||
}
|
||||
|
||||
ifft_run(conv->ifft, &conv->conv, conv->fft_buffer);
|
||||
|
||||
|
|
|
|||
|
|
@ -133,6 +133,7 @@ inline v4sf ld_ps1(const float *p)
|
|||
#define new_setup_simd new_setup_altivec
|
||||
#define zreorder_simd zreorder_altivec
|
||||
#define zconvolve_accumulate_simd zconvolve_accumulate_altivec
|
||||
#define zconvolve_simd zconvolve_altivec
|
||||
#define transform_simd transform_altivec
|
||||
#define sum_simd sum_altivec
|
||||
|
||||
|
|
@ -159,6 +160,7 @@ typedef __m128 v4sf;
|
|||
#define new_setup_simd new_setup_sse
|
||||
#define zreorder_simd zreorder_sse
|
||||
#define zconvolve_accumulate_simd zconvolve_accumulate_sse
|
||||
#define zconvolve_simd zconvolve_sse
|
||||
#define transform_simd transform_sse
|
||||
#define sum_simd sum_sse
|
||||
|
||||
|
|
@ -192,6 +194,7 @@ typedef float32x4_t v4sf;
|
|||
#define new_setup_simd new_setup_neon
|
||||
#define zreorder_simd zreorder_neon
|
||||
#define zconvolve_accumulate_simd zconvolve_accumulate_neon
|
||||
#define zconvolve_simd zconvolve_neon
|
||||
#define transform_simd transform_neon
|
||||
#define sum_simd sum_neon
|
||||
#else
|
||||
|
|
@ -216,6 +219,7 @@ typedef float v4sf;
|
|||
#define new_setup_simd new_setup_c
|
||||
#define zreorder_simd zreorder_c
|
||||
#define zconvolve_accumulate_simd zconvolve_accumulate_c
|
||||
#define zconvolve_simd zconvolve_c
|
||||
#define transform_simd transform_c
|
||||
#define sum_simd sum_c
|
||||
#endif
|
||||
|
|
@ -1407,6 +1411,7 @@ struct funcs {
|
|||
void (*transform) (PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction, int ordered);
|
||||
void (*zreorder)(PFFFT_Setup *setup, const float *input, float *output, pffft_direction_t direction);
|
||||
void (*zconvolve_accumulate)(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling);
|
||||
void (*zconvolve)(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling);
|
||||
void (*sum)(const float *a, const float *b, float *ab, int len);
|
||||
int (*simd_size)(void);
|
||||
void (*validate)(void);
|
||||
|
|
@ -2008,7 +2013,7 @@ static void transform_simd(PFFFT_Setup * setup, const float *finput,
|
|||
static void zconvolve_accumulate_simd(PFFFT_Setup * s, const float *a, const float *b,
|
||||
float *ab, float scaling)
|
||||
{
|
||||
int Ncvec = s->Ncvec;
|
||||
const int Ncvec2 = s->Ncvec * 2;
|
||||
const v4sf *RESTRICT va = (const v4sf *)a;
|
||||
const v4sf *RESTRICT vb = (const v4sf *)b;
|
||||
v4sf *RESTRICT vab = (v4sf *) ab;
|
||||
|
|
@ -2048,7 +2053,7 @@ static void zconvolve_accumulate_simd(PFFFT_Setup * s, const float *a, const flo
|
|||
#ifdef ZCONVOLVE_USING_INLINE_ASM // inline asm version, unfortunately miscompiled by clang 3.2, at least on ubuntu.. so this will be restricted to gcc
|
||||
const float *a_ = a, *b_ = b;
|
||||
float *ab_ = ab;
|
||||
int N = Ncvec;
|
||||
int N = s->Ncvec;
|
||||
asm volatile ("mov r8, %2 \n"
|
||||
"vdup.f32 q15, %4 \n"
|
||||
"1: \n"
|
||||
|
|
@ -2084,22 +2089,22 @@ static void zconvolve_accumulate_simd(PFFFT_Setup * s, const float *a, const flo
|
|||
"q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
|
||||
"q10", "q11", "q12", "q13", "q15", "memory");
|
||||
#else // default routine, works fine for non-arm cpus with current compilers
|
||||
for (i = 0; i < Ncvec; i += 2) {
|
||||
for (i = 0; i < Ncvec2; i += 4) {
|
||||
v4sf ar, ai, br, bi;
|
||||
ar = va[2 * i + 0];
|
||||
ai = va[2 * i + 1];
|
||||
br = vb[2 * i + 0];
|
||||
bi = vb[2 * i + 1];
|
||||
ar = va[i + 0];
|
||||
ai = va[i + 1];
|
||||
br = vb[i + 0];
|
||||
bi = vb[i + 1];
|
||||
VCPLXMUL(ar, ai, br, bi);
|
||||
vab[2 * i + 0] = VMADD(ar, vscal, vab[2 * i + 0]);
|
||||
vab[2 * i + 1] = VMADD(ai, vscal, vab[2 * i + 1]);
|
||||
ar = va[2 * i + 2];
|
||||
ai = va[2 * i + 3];
|
||||
br = vb[2 * i + 2];
|
||||
bi = vb[2 * i + 3];
|
||||
vab[i + 0] = VMADD(ar, vscal, vab[i + 0]);
|
||||
vab[i + 1] = VMADD(ai, vscal, vab[i + 1]);
|
||||
ar = va[i + 2];
|
||||
ai = va[i + 3];
|
||||
br = vb[i + 2];
|
||||
bi = vb[i + 3];
|
||||
VCPLXMUL(ar, ai, br, bi);
|
||||
vab[2 * i + 2] = VMADD(ar, vscal, vab[2 * i + 2]);
|
||||
vab[2 * i + 3] = VMADD(ai, vscal, vab[2 * i + 3]);
|
||||
vab[i + 2] = VMADD(ar, vscal, vab[i + 2]);
|
||||
vab[i + 3] = VMADD(ai, vscal, vab[i + 3]);
|
||||
}
|
||||
#endif
|
||||
if (s->transform == PFFFT_REAL) {
|
||||
|
|
@ -2108,6 +2113,67 @@ static void zconvolve_accumulate_simd(PFFFT_Setup * s, const float *a, const flo
|
|||
}
|
||||
}
|
||||
|
||||
/*
 * Vector variant of zconvolve: for every (real, imag) vector pair, run
 * VCPLXMUL on the a and b values, multiply the result by `scaling` and store
 * it in ab. Unlike zconvolve_accumulate_simd, the previous contents of ab are
 * overwritten rather than added to.
 *
 * a, b and ab must satisfy VALIGNED() (asserted below).
 */
static void zconvolve_simd(PFFFT_Setup * s, const float *a, const float *b,
			   float *ab, float scaling)
{
	const v4sf * RESTRICT va = (const v4sf*)a;
	const v4sf * RESTRICT vb = (const v4sf*)b;
	v4sf * RESTRICT vab = (v4sf*)ab;
	v4sf vscal = LD_PS1(scaling);
	const int Ncvec2 = s->Ncvec * 2;
	float a_re0, a_im0, b_re0, b_im0;
	int i, k;

#ifdef __arm__
	/* Prefetch hints for va, vb, vab at offsets 0, 2, 4 and 6. */
	for (k = 0; k <= 6; k += 2) {
		__builtin_prefetch(va + k);
		__builtin_prefetch(vb + k);
		__builtin_prefetch(vab + k);
	}
# ifndef __clang__
#  define ZCONVOLVE_USING_INLINE_NEON_ASM
# endif
#endif

	assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab));

	/*
	 * Lane 0 of the first two input vectors is captured up front: the loop
	 * below writes vab[0] and vab[1], and these values are still needed
	 * afterwards for the PFFFT_REAL fix-up.
	 */
	a_re0 = ((v4sf_union*)va)[0].f[0];
	a_im0 = ((v4sf_union*)va)[1].f[0];
	b_re0 = ((v4sf_union*)vb)[0].f[0];
	b_im0 = ((v4sf_union*)vb)[1].f[0];

	/* Four vectors (two real/imag pairs) are handled per outer iteration. */
	for (i = 0; i < Ncvec2; i += 4) {
		for (k = i; k < i + 4; k += 2) {
			v4sf xr = va[k + 0];
			v4sf xi = va[k + 1];
			v4sf yr = vb[k + 0];
			v4sf yi = vb[k + 1];

			/* VCPLXMUL leaves its result in the first two arguments. */
			VCPLXMUL(xr, xi, yr, yi);
			vab[k + 0] = VMUL(xr, vscal);
			vab[k + 1] = VMUL(xi, vscal);
		}
	}

	if (s->transform != PFFFT_REAL)
		return;

	/* Real transform: lane 0 of the first two outputs is a plain product. */
	((v4sf_union*)vab)[0].f[0] = a_re0 * b_re0 * scaling;
	((v4sf_union*)vab)[1].f[0] = a_im0 * b_im0 * scaling;
}
|
||||
|
||||
|
||||
static void sum_simd(const float *a, const float *b, float *ab, int len)
|
||||
{
|
||||
const v4sf *RESTRICT va = (const v4sf *)a;
|
||||
|
|
@ -2217,30 +2283,58 @@ static void transform_simd(PFFFT_Setup * setup, const float *input,
|
|||
assert(buff[ib] == output);
|
||||
}
|
||||
|
||||
|
||||
/*
 * Scalar variant of zconvolve_accumulate: adds (a * b) * scaling to ab, where
 * the product of each (real, imag) pair is computed by VCPLXMUL.
 *
 * s->Ncvec * 2 is the number of floats walked; it is kept in Ncvec2 so the
 * loop can index a, b and ab directly.
 */
static void zconvolve_accumulate_simd(PFFFT_Setup * s, const float *a,
				      const float *b, float *ab, float scaling)
{
	int i, Ncvec2 = s->Ncvec * 2;

	if (s->transform == PFFFT_REAL) {
		// take care of the fftpack ordering
		ab[0] += a[0] * b[0] * scaling;
		ab[Ncvec2 - 1] +=
		    a[Ncvec2 - 1] * b[Ncvec2 - 1] * scaling;
		/* The first and last floats are done; the pairs start at index 1. */
		++ab;
		++a;
		++b;
		Ncvec2 -= 2;
	}
	for (i = 0; i < Ncvec2; i += 2) {
		float ar, ai, br, bi;
		ar = a[i + 0];
		ai = a[i + 1];
		br = b[i + 0];
		bi = b[i + 1];
		/* VCPLXMUL leaves its result in ar and ai. */
		VCPLXMUL(ar, ai, br, bi);
		ab[i + 0] += ar * scaling;
		ab[i + 1] += ai * scaling;
	}
}
|
||||
|
||||
/*
 * Scalar variant of zconvolve: stores (a * b) * scaling into ab, where the
 * product of each (real, imag) pair is computed by VCPLXMUL. Whatever ab held
 * before is overwritten.
 */
static void zconvolve_simd(PFFFT_Setup * s, const float *a,
			   const float *b, float *ab, float scaling)
{
	int count = s->Ncvec * 2;
	int k;

	if (s->transform == PFFFT_REAL) {
		const int last = count - 1;

		/* fftpack ordering: the first and last floats are plain products */
		ab[0] = a[0] * b[0] * scaling;
		ab[last] = a[last] * b[last] * scaling;

		/* the remaining pairs start one float further in */
		ab += 1;
		a += 1;
		b += 1;
		count -= 2;
	}

	for (k = 0; k < count; k += 2) {
		float xr = a[k];
		float xi = a[k + 1];
		float yr = b[k];
		float yi = b[k + 1];

		/* VCPLXMUL leaves its result in the first two arguments. */
		VCPLXMUL(xr, xi, yr, yi);
		ab[k] = xr * scaling;
		ab[k + 1] = xi * scaling;
	}
}
|
||||
static void sum_simd(const float *a, const float *b, float *ab, int len)
|
||||
|
|
@ -2262,6 +2356,7 @@ struct funcs pffft_funcs = {
|
|||
.transform = transform_simd,
|
||||
.zreorder = zreorder_simd,
|
||||
.zconvolve_accumulate = zconvolve_accumulate_simd,
|
||||
.zconvolve = zconvolve_simd,
|
||||
.sum = sum_simd,
|
||||
.simd_size = simd_size_simd,
|
||||
.validate = validate_pffft_simd,
|
||||
|
|
@ -2337,6 +2432,11 @@ void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const float *dft_a, const fl
|
|||
return funcs->zconvolve_accumulate(setup, dft_a, dft_b, dft_ab, scaling);
|
||||
}
|
||||
|
||||
/*
 * Public entry point for the non-accumulating convolve: forwards all
 * arguments unchanged to the zconvolve member of `funcs`.
 */
void pffft_zconvolve(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling)
{
	/* Plain call: ISO C does not allow `return <expression>;` in a void function. */
	funcs->zconvolve(setup, dft_a, dft_b, dft_ab, scaling);
}
|
||||
|
||||
void pffft_sum(const float *a, const float *b, float *ab, int len)
|
||||
{
|
||||
return funcs->sum(a, b, ab, len);
|
||||
|
|
|
|||
|
|
@ -158,6 +158,7 @@ extern "C" {
|
|||
The dft_a, dft_b and dft_ab pointers may alias.
|
||||
*/
|
||||
void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling);
|
||||
/*
   Takes the same arguments as pffft_zconvolve_accumulate, but the scaled
   product of dft_a and dft_b is stored into dft_ab, overwriting its previous
   contents, instead of being added to it.
*/
void pffft_zconvolve(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling);
|
||||
|
||||
void pffft_sum(const float *a, const float *b, float *ab, int len);
|
||||
/*
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue