filter-chain: optimize convolver some more

Add a function that convolves without accumulating, and use it when
possible to avoid some memset and memcpy calls.
Author: Wim Taymans
Date:   2021-08-27 16:09:00 +02:00
Parent: 1fe54e5f17
Commit: 994630cb3a

3 changed files with 161 additions and 45 deletions
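
For orientation, a scalar sketch (illustrative only, not the pffft code itself) of the difference between the new non-accumulating convolve and the existing accumulating one. Spectra are interleaved re/im pairs; the fftpack packing of the first/last real bins that the real implementations handle is ignored here:

    /* Sketch only: simplified scalar spectral multiply. */
    static void sketch_zconvolve(const float *a, const float *b, float *ab,
                    int len, float scaling)
    {
            int i;
            for (i = 0; i < 2 * len; i += 2) {
                    float re = a[i] * b[i]     - a[i + 1] * b[i + 1];
                    float im = a[i] * b[i + 1] + a[i + 1] * b[i];
                    ab[i + 0] = re * scaling;       /* overwrite: destination need not be cleared */
                    ab[i + 1] = im * scaling;
            }
    }

    static void sketch_zconvolve_accumulate(const float *a, const float *b, float *ab,
                    int len, float scaling)
    {
            int i;
            for (i = 0; i < 2 * len; i += 2) {
                    float re = a[i] * b[i]     - a[i + 1] * b[i + 1];
                    float im = a[i] * b[i + 1] + a[i + 1] * b[i];
                    ab[i + 0] += re * scaling;      /* accumulate into existing data */
                    ab[i + 1] += im * scaling;
            }
    }

The only difference is '=' versus '+=', which is what lets convolver.c drop the fft_cpx_clear() before summing partitions.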

convolver.c:

@@ -58,6 +58,7 @@ struct convolver1 {
 	int inputBufferFill;
 	int current;
+	float scale;
 };
 
 static void *fft_alloc(int size)
@@ -82,11 +83,6 @@ static void fft_cpx_free(struct fft_cpx *cpx)
 	fft_free(cpx->v);
 }
 
-static void fft_cpx_clear(struct fft_cpx *cpx, int size)
-{
-	memset(cpx->v, 0, sizeof(float) * 2 * size);
-}
-
 static void fft_cpx_copy(struct fft_cpx *dst, struct fft_cpx *src, int size)
 {
 	memcpy(dst->v, src->v, sizeof(float) * 2 * size);
@@ -125,6 +121,11 @@ static inline void ifft_run(void *ifft, struct fft_cpx *in, float *out)
 	pffft_transform(ifft, in->v, out, NULL, PFFFT_BACKWARD);
 }
 
+static inline void fft_convolve(void *fft, struct fft_cpx *r,
+		const struct fft_cpx *a, const struct fft_cpx *b, int len, float scale)
+{
+	pffft_zconvolve(fft, a->v, b->v, r->v, scale);
+}
+
 static inline void fft_convolve_accum(void *fft, struct fft_cpx *r,
 		const struct fft_cpx *a, const struct fft_cpx *b, int len, float scale)
 {
@@ -192,6 +193,7 @@ static struct convolver1 *convolver1_new(int block, const float *ir, int irlen)
 	conv->inputBuffer = fft_alloc(sizeof(float) * conv->segSize);
 	conv->inputBufferFill = 0;
 	conv->current = 0;
+	conv->scale = 1.0f / conv->segSize;
 
 	return conv;
 }
@@ -235,23 +237,36 @@ static int convolver1_run(struct convolver1 *conv, const float *input, float *ou
 		fft_run(conv->fft, conv->inputBuffer, &conv->segments[conv->current]);
 
-		if (conv->inputBufferFill == 0) {
-			fft_cpx_clear(&conv->pre_mult, conv->fftComplexSize);
-
-			for (i = 1; i < conv->segCount; i++) {
-				const int indexIr = i;
-				const int indexAudio = (conv->current + i) % conv->segCount;
-
-				fft_convolve_accum(conv->fft, &conv->pre_mult,
-						&conv->segmentsIr[indexIr],
-						&conv->segments[indexAudio],
-						conv->fftComplexSize, 1.0f / conv->segSize);
-			}
-		}
-		fft_cpx_copy(&conv->conv, &conv->pre_mult, conv->fftComplexSize);
-		fft_convolve_accum(conv->fft, &conv->conv, &conv->segments[conv->current], &conv->segmentsIr[0],
-				conv->fftComplexSize, 1.0f / conv->segSize);
+		if (conv->segCount > 1) {
+			if (conv->inputBufferFill == 0) {
+				int indexAudio = (conv->current + 1) % conv->segCount;
+
+				fft_convolve(conv->fft, &conv->pre_mult,
+						&conv->segmentsIr[1],
+						&conv->segments[indexAudio],
+						conv->fftComplexSize, conv->scale);
+
+				for (i = 2; i < conv->segCount; i++) {
+					indexAudio = (conv->current + i) % conv->segCount;
+					fft_convolve_accum(conv->fft, &conv->pre_mult,
+							&conv->segmentsIr[i],
+							&conv->segments[indexAudio],
+							conv->fftComplexSize, conv->scale);
+				}
+			}
+			fft_cpx_copy(&conv->conv, &conv->pre_mult, conv->fftComplexSize);
+			fft_convolve_accum(conv->fft, &conv->conv,
+					&conv->segments[conv->current],
+					&conv->segmentsIr[0],
+					conv->fftComplexSize, conv->scale);
+		} else {
+			fft_convolve(conv->fft, &conv->conv,
+					&conv->segments[conv->current],
+					&conv->segmentsIr[0],
+					conv->fftComplexSize, conv->scale);
+		}
 
 		ifft_run(conv->ifft, &conv->conv, conv->fft_buffer);
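
Condensed, the spectral summation that convolver1_run now performs has the shape below. This is a hypothetical condensation for illustration only: the real code keeps the k >= 1 terms cached in pre_mult and recomputes them only when inputBufferFill == 0, and with segCount == 1 it collapses to a single plain convolve straight into conv->conv, skipping the fft_cpx_copy() as well.

    /* Hypothetical helper: Y = sum_k IR[k] * X[(current + k) % segCount].
     * The first term overwrites, so no fft_cpx_clear() of 'out' is needed. */
    static void sum_segments(struct convolver1 *conv, struct fft_cpx *out)
    {
            int k, idx;

            /* k = 0: the freshly transformed input block */
            fft_convolve(conv->fft, out,
                            &conv->segments[conv->current], &conv->segmentsIr[0],
                            conv->fftComplexSize, conv->scale);

            /* k >= 1: older blocks, accumulated on top */
            for (k = 1; k < conv->segCount; k++) {
                    idx = (conv->current + k) % conv->segCount;
                    fft_convolve_accum(conv->fft, out,
                                    &conv->segmentsIr[k], &conv->segments[idx],
                                    conv->fftComplexSize, conv->scale);
            }
    }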

pffft.c:

@@ -133,6 +133,7 @@ inline v4sf ld_ps1(const float *p)
 #define new_setup_simd new_setup_altivec
 #define zreorder_simd zreorder_altivec
 #define zconvolve_accumulate_simd zconvolve_accumulate_altivec
+#define zconvolve_simd zconvolve_altivec
 #define transform_simd transform_altivec
 #define sum_simd sum_altivec
@@ -159,6 +160,7 @@ typedef __m128 v4sf;
 #define new_setup_simd new_setup_sse
 #define zreorder_simd zreorder_sse
 #define zconvolve_accumulate_simd zconvolve_accumulate_sse
+#define zconvolve_simd zconvolve_sse
 #define transform_simd transform_sse
 #define sum_simd sum_sse
@@ -192,6 +194,7 @@ typedef float32x4_t v4sf;
 #define new_setup_simd new_setup_neon
 #define zreorder_simd zreorder_neon
 #define zconvolve_accumulate_simd zconvolve_accumulate_neon
+#define zconvolve_simd zconvolve_neon
 #define transform_simd transform_neon
 #define sum_simd sum_neon
 #else
@@ -216,6 +219,7 @@ typedef float v4sf;
 #define new_setup_simd new_setup_c
 #define zreorder_simd zreorder_c
 #define zconvolve_accumulate_simd zconvolve_accumulate_c
+#define zconvolve_simd zconvolve_c
 #define transform_simd transform_c
 #define sum_simd sum_c
 #endif
@@ -1407,6 +1411,7 @@ struct funcs {
 	void (*transform) (PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction, int ordered);
 	void (*zreorder)(PFFFT_Setup *setup, const float *input, float *output, pffft_direction_t direction);
 	void (*zconvolve_accumulate)(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling);
+	void (*zconvolve)(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling);
 	void (*sum)(const float *a, const float *b, float *ab, int len);
 	int (*simd_size)(void);
 	void (*validate)(void);
@@ -2008,7 +2013,7 @@ static void transform_simd(PFFFT_Setup * setup, const float *finput,
 static void zconvolve_accumulate_simd(PFFFT_Setup * s, const float *a, const float *b,
 		float *ab, float scaling)
 {
-	int Ncvec = s->Ncvec;
+	const int Ncvec2 = s->Ncvec * 2;
 	const v4sf *RESTRICT va = (const v4sf *)a;
 	const v4sf *RESTRICT vb = (const v4sf *)b;
 	v4sf *RESTRICT vab = (v4sf *) ab;
@@ -2048,7 +2053,7 @@ static void zconvolve_accumulate_simd(PFFFT_Setup * s, const float *a, const flo
 #ifdef ZCONVOLVE_USING_INLINE_ASM	// inline asm version, unfortunately miscompiled by clang 3.2, at least on ubuntu.. so this will be restricted to gcc
 	const float *a_ = a, *b_ = b;
 	float *ab_ = ab;
-	int N = Ncvec;
+	int N = s->Ncvec;
 	asm volatile ("mov r8, %2 \n"
 		"vdup.f32 q15, %4 \n"
 		"1: \n"
@@ -2084,22 +2089,22 @@ static void zconvolve_accumulate_simd(PFFFT_Setup * s, const float *a, const flo
 		"q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
 		"q10", "q11", "q12", "q13", "q15", "memory");
 #else			// default routine, works fine for non-arm cpus with current compilers
-	for (i = 0; i < Ncvec; i += 2) {
+	for (i = 0; i < Ncvec2; i += 4) {
 		v4sf ar, ai, br, bi;
-		ar = va[2 * i + 0];
-		ai = va[2 * i + 1];
-		br = vb[2 * i + 0];
-		bi = vb[2 * i + 1];
+		ar = va[i + 0];
+		ai = va[i + 1];
+		br = vb[i + 0];
+		bi = vb[i + 1];
 		VCPLXMUL(ar, ai, br, bi);
-		vab[2 * i + 0] = VMADD(ar, vscal, vab[2 * i + 0]);
-		vab[2 * i + 1] = VMADD(ai, vscal, vab[2 * i + 1]);
-		ar = va[2 * i + 2];
-		ai = va[2 * i + 3];
-		br = vb[2 * i + 2];
-		bi = vb[2 * i + 3];
+		vab[i + 0] = VMADD(ar, vscal, vab[i + 0]);
+		vab[i + 1] = VMADD(ai, vscal, vab[i + 1]);
+		ar = va[i + 2];
+		ai = va[i + 3];
+		br = vb[i + 2];
+		bi = vb[i + 3];
 		VCPLXMUL(ar, ai, br, bi);
-		vab[2 * i + 2] = VMADD(ar, vscal, vab[2 * i + 2]);
-		vab[2 * i + 3] = VMADD(ai, vscal, vab[2 * i + 3]);
+		vab[i + 2] = VMADD(ar, vscal, vab[i + 2]);
+		vab[i + 3] = VMADD(ai, vscal, vab[i + 3]);
 	}
 #endif
 	if (s->transform == PFFFT_REAL) {
@@ -2108,6 +2113,67 @@ static void zconvolve_accumulate_simd(PFFFT_Setup * s, const float *a, const flo
 	}
 }
 
+static void zconvolve_simd(PFFFT_Setup * s, const float *a, const float *b,
+		float *ab, float scaling)
+{
+	v4sf vscal = LD_PS1(scaling);
+	const v4sf * RESTRICT va = (const v4sf*)a;
+	const v4sf * RESTRICT vb = (const v4sf*)b;
+	v4sf * RESTRICT vab = (v4sf*)ab;
+	float sar, sai, sbr, sbi;
+	const int Ncvec2 = s->Ncvec * 2;
+	int i;
+
+#ifdef __arm__
+	__builtin_prefetch(va);
+	__builtin_prefetch(vb);
+	__builtin_prefetch(vab);
+	__builtin_prefetch(va+2);
+	__builtin_prefetch(vb+2);
+	__builtin_prefetch(vab+2);
+	__builtin_prefetch(va+4);
+	__builtin_prefetch(vb+4);
+	__builtin_prefetch(vab+4);
+	__builtin_prefetch(va+6);
+	__builtin_prefetch(vb+6);
+	__builtin_prefetch(vab+6);
+# ifndef __clang__
+#  define ZCONVOLVE_USING_INLINE_NEON_ASM
+# endif
+#endif
+
+	assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab));
+	sar = ((v4sf_union*)va)[0].f[0];
+	sai = ((v4sf_union*)va)[1].f[0];
+	sbr = ((v4sf_union*)vb)[0].f[0];
+	sbi = ((v4sf_union*)vb)[1].f[0];
+
+	/* default routine, works fine for non-arm cpus with current compilers */
+	for (i = 0; i < Ncvec2; i += 4) {
+		v4sf var, vai, vbr, vbi;
+		var = va[i + 0];
+		vai = va[i + 1];
+		vbr = vb[i + 0];
+		vbi = vb[i + 1];
+		VCPLXMUL(var, vai, vbr, vbi);
+		vab[i + 0] = VMUL(var, vscal);
+		vab[i + 1] = VMUL(vai, vscal);
+		var = va[i + 2];
+		vai = va[i + 3];
+		vbr = vb[i + 2];
+		vbi = vb[i + 3];
+		VCPLXMUL(var, vai, vbr, vbi);
+		vab[i + 2] = VMUL(var, vscal);
+		vab[i + 3] = VMUL(vai, vscal);
+	}
+	if (s->transform == PFFFT_REAL) {
+		((v4sf_union*)vab)[0].f[0] = sar * sbr * scaling;
+		((v4sf_union*)vab)[1].f[0] = sai * sbi * scaling;
+	}
+}
+
 static void sum_simd(const float *a, const float *b, float *ab, int len)
 {
 	const v4sf *RESTRICT va = (const v4sf *)a;
@@ -2217,30 +2283,58 @@ static void transform_simd(PFFFT_Setup * setup, const float *input,
 	assert(buff[ib] == output);
 }
 
 static void zconvolve_accumulate_simd(PFFFT_Setup * s, const float *a,
 		const float *b, float *ab, float scaling)
 {
-	int i, Ncvec = s->Ncvec;
+	int i, Ncvec2 = s->Ncvec * 2;
 
 	if (s->transform == PFFFT_REAL) {
 		// take care of the fftpack ordering
 		ab[0] += a[0] * b[0] * scaling;
-		ab[2 * Ncvec - 1] +=
-		    a[2 * Ncvec - 1] * b[2 * Ncvec - 1] * scaling;
+		ab[Ncvec2 - 1] +=
+		    a[Ncvec2 - 1] * b[Ncvec2 - 1] * scaling;
 		++ab;
 		++a;
 		++b;
-		--Ncvec;
+		Ncvec2 -= 2;
 	}
-	for (i = 0; i < Ncvec; ++i) {
+	for (i = 0; i < Ncvec2; i += 2) {
 		float ar, ai, br, bi;
-		ar = a[2 * i + 0];
-		ai = a[2 * i + 1];
-		br = b[2 * i + 0];
-		bi = b[2 * i + 1];
+		ar = a[i + 0];
+		ai = a[i + 1];
+		br = b[i + 0];
+		bi = b[i + 1];
 		VCPLXMUL(ar, ai, br, bi);
-		ab[2 * i + 0] += ar * scaling;
-		ab[2 * i + 1] += ai * scaling;
+		ab[i + 0] += ar * scaling;
+		ab[i + 1] += ai * scaling;
+	}
+}
+
+static void zconvolve_simd(PFFFT_Setup * s, const float *a,
+		const float *b, float *ab, float scaling)
+{
+	int i, Ncvec2 = s->Ncvec * 2;
+
+	if (s->transform == PFFFT_REAL) {
+		// take care of the fftpack ordering
+		ab[0] = a[0] * b[0] * scaling;
+		ab[Ncvec2 - 1] =
+		    a[Ncvec2 - 1] * b[Ncvec2 - 1] * scaling;
+		++ab;
+		++a;
+		++b;
+		Ncvec2 -= 2;
+	}
+	for (i = 0; i < Ncvec2; i += 2) {
+		float ar, ai, br, bi;
+		ar = a[i + 0];
+		ai = a[i + 1];
+		br = b[i + 0];
+		bi = b[i + 1];
+		VCPLXMUL(ar, ai, br, bi);
+		ab[i + 0] = ar * scaling;
+		ab[i + 1] = ai * scaling;
 	}
 }
 
 static void sum_simd(const float *a, const float *b, float *ab, int len)
@@ -2262,6 +2356,7 @@ struct funcs pffft_funcs = {
 	.transform = transform_simd,
 	.zreorder = zreorder_simd,
 	.zconvolve_accumulate = zconvolve_accumulate_simd,
+	.zconvolve = zconvolve_simd,
 	.sum = sum_simd,
 	.simd_size = simd_size_simd,
 	.validate = validate_pffft_simd,
@@ -2337,6 +2432,11 @@ void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const float *dft_a, const fl
 	return funcs->zconvolve_accumulate(setup, dft_a, dft_b, dft_ab, scaling);
 }
 
+void pffft_zconvolve(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling)
+{
+	return funcs->zconvolve(setup, dft_a, dft_b, dft_ab, scaling);
+}
+
 void pffft_sum(const float *a, const float *b, float *ab, int len)
 {
 	return funcs->sum(a, b, ab, len);

pffft.h:

@@ -158,6 +158,7 @@ extern "C" {
    The dft_a, dft_b and dft_ab pointers may alias.
 */
 void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling);
+void pffft_zconvolve(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling);
 
 void pffft_sum(const float *a, const float *b, float *ab, int len);
 
 /*
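
For completeness, a minimal usage sketch of the new entry point outside the convolver: circular convolution of two real blocks of length n. It assumes the usual pffft helpers (pffft_new_setup, pffft_aligned_malloc, pffft_aligned_free, pffft_destroy_setup) are available in this copy, that n is a transform size pffft accepts, and that a NULL scratch buffer is allowed for this size (as the convolver itself relies on); circular_convolve is an illustrative name, not part of the patch.

    #include "pffft.h"

    /* Sketch: Y = X * H in the frequency domain, then back to the time domain.
     * pffft_zconvolve overwrites dft_ab, so Y needs no memset beforehand. */
    static void circular_convolve(const float *x, const float *h, float *y, int n)
    {
            PFFFT_Setup *setup = pffft_new_setup(n, PFFFT_REAL);
            float *X = pffft_aligned_malloc(sizeof(float) * n);
            float *H = pffft_aligned_malloc(sizeof(float) * n);
            float *Y = pffft_aligned_malloc(sizeof(float) * n);

            /* unordered forward transforms with NULL scratch, as the convolver does */
            pffft_transform(setup, x, X, NULL, PFFFT_FORWARD);
            pffft_transform(setup, h, H, NULL, PFFFT_FORWARD);

            /* spectral multiply; 1/n scaling makes the round trip unity gain */
            pffft_zconvolve(setup, X, H, Y, 1.0f / n);

            pffft_transform(setup, Y, y, NULL, PFFFT_BACKWARD);

            pffft_aligned_free(X);
            pffft_aligned_free(H);
            pffft_aligned_free(Y);
            pffft_destroy_setup(setup);
    }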