diff --git a/src/modules/module-filter-chain/convolver.c b/src/modules/module-filter-chain/convolver.c index 24dd2e50e..52f1ad694 100644 --- a/src/modules/module-filter-chain/convolver.c +++ b/src/modules/module-filter-chain/convolver.c @@ -58,6 +58,7 @@ struct convolver1 { int inputBufferFill; int current; + float scale; }; static void *fft_alloc(int size) @@ -82,11 +83,6 @@ static void fft_cpx_free(struct fft_cpx *cpx) fft_free(cpx->v); } -static void fft_cpx_clear(struct fft_cpx *cpx, int size) -{ - memset(cpx->v, 0, sizeof(float) * 2 * size); -} - static void fft_cpx_copy(struct fft_cpx *dst, struct fft_cpx *src, int size) { memcpy(dst->v, src->v, sizeof(float) * 2 * size); @@ -125,6 +121,11 @@ static inline void ifft_run(void *ifft, struct fft_cpx *in, float *out) pffft_transform(ifft, in->v, out, NULL, PFFFT_BACKWARD); } +static inline void fft_convolve(void *fft, struct fft_cpx *r, + const struct fft_cpx *a, const struct fft_cpx *b, int len, float scale) +{ + pffft_zconvolve(fft, a->v, b->v, r->v, scale); +} static inline void fft_convolve_accum(void *fft, struct fft_cpx *r, const struct fft_cpx *a, const struct fft_cpx *b, int len, float scale) { @@ -192,6 +193,7 @@ static struct convolver1 *convolver1_new(int block, const float *ir, int irlen) conv->inputBuffer = fft_alloc(sizeof(float) * conv->segSize); conv->inputBufferFill = 0; conv->current = 0; + conv->scale = 1.0f / conv->segSize; return conv; } @@ -235,23 +237,36 @@ static int convolver1_run(struct convolver1 *conv, const float *input, float *ou fft_run(conv->fft, conv->inputBuffer, &conv->segments[conv->current]); - if (conv->inputBufferFill == 0) { - fft_cpx_clear(&conv->pre_mult, conv->fftComplexSize); + if (conv->segCount > 1) { + if (conv->inputBufferFill == 0) { + int indexAudio = (conv->current + 1) % conv->segCount; - for (i = 1; i < conv->segCount; i++) { - const int indexIr = i; - const int indexAudio = (conv->current + i) % conv->segCount; - - fft_convolve_accum(conv->fft, &conv->pre_mult, - &conv->segmentsIr[indexIr], + fft_convolve(conv->fft, &conv->pre_mult, + &conv->segmentsIr[1], &conv->segments[indexAudio], - conv->fftComplexSize, 1.0f / conv->segSize); - } - } - fft_cpx_copy(&conv->conv, &conv->pre_mult, conv->fftComplexSize); + conv->fftComplexSize, conv->scale); - fft_convolve_accum(conv->fft, &conv->conv, &conv->segments[conv->current], &conv->segmentsIr[0], - conv->fftComplexSize, 1.0f / conv->segSize); + for (i = 2; i < conv->segCount; i++) { + indexAudio = (conv->current + i) % conv->segCount; + + fft_convolve_accum(conv->fft, &conv->pre_mult, + &conv->segmentsIr[i], + &conv->segments[indexAudio], + conv->fftComplexSize, conv->scale); + } + } + fft_cpx_copy(&conv->conv, &conv->pre_mult, conv->fftComplexSize); + + fft_convolve_accum(conv->fft, &conv->conv, + &conv->segments[conv->current], + &conv->segmentsIr[0], + conv->fftComplexSize, conv->scale); + } else { + fft_convolve(conv->fft, &conv->conv, + &conv->segments[conv->current], + &conv->segmentsIr[0], + conv->fftComplexSize, conv->scale); + } ifft_run(conv->ifft, &conv->conv, conv->fft_buffer); diff --git a/src/modules/module-filter-chain/pffft.c b/src/modules/module-filter-chain/pffft.c index aee30de44..82da123ed 100644 --- a/src/modules/module-filter-chain/pffft.c +++ b/src/modules/module-filter-chain/pffft.c @@ -133,6 +133,7 @@ inline v4sf ld_ps1(const float *p) #define new_setup_simd new_setup_altivec #define zreorder_simd zreorder_altivec #define zconvolve_accumulate_simd zconvolve_accumulate_altivec +#define zconvolve_simd zconvolve_altivec #define transform_simd transform_altivec #define sum_simd sum_altivec @@ -159,6 +160,7 @@ typedef __m128 v4sf; #define new_setup_simd new_setup_sse #define zreorder_simd zreorder_sse #define zconvolve_accumulate_simd zconvolve_accumulate_sse +#define zconvolve_simd zconvolve_sse #define transform_simd transform_sse #define sum_simd sum_sse @@ -192,6 +194,7 @@ typedef float32x4_t v4sf; #define new_setup_simd new_setup_neon #define zreorder_simd zreorder_neon #define zconvolve_accumulate_simd zconvolve_accumulate_neon +#define zconvolve_simd zconvolve_neon #define transform_simd transform_neon #define sum_simd sum_neon #else @@ -216,6 +219,7 @@ typedef float v4sf; #define new_setup_simd new_setup_c #define zreorder_simd zreorder_c #define zconvolve_accumulate_simd zconvolve_accumulate_c +#define zconvolve_simd zconvolve_c #define transform_simd transform_c #define sum_simd sum_c #endif @@ -1407,6 +1411,7 @@ struct funcs { void (*transform) (PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction, int ordered); void (*zreorder)(PFFFT_Setup *setup, const float *input, float *output, pffft_direction_t direction); void (*zconvolve_accumulate)(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling); + void (*zconvolve)(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling); void (*sum)(const float *a, const float *b, float *ab, int len); int (*simd_size)(void); void (*validate)(void); @@ -2008,7 +2013,7 @@ static void transform_simd(PFFFT_Setup * setup, const float *finput, static void zconvolve_accumulate_simd(PFFFT_Setup * s, const float *a, const float *b, float *ab, float scaling) { - int Ncvec = s->Ncvec; + const int Ncvec2 = s->Ncvec * 2; const v4sf *RESTRICT va = (const v4sf *)a; const v4sf *RESTRICT vb = (const v4sf *)b; v4sf *RESTRICT vab = (v4sf *) ab; @@ -2048,7 +2053,7 @@ static void zconvolve_accumulate_simd(PFFFT_Setup * s, const float *a, const flo #ifdef ZCONVOLVE_USING_INLINE_ASM // inline asm version, unfortunately miscompiled by clang 3.2, at least on ubuntu.. so this will be restricted to gcc const float *a_ = a, *b_ = b; float *ab_ = ab; - int N = Ncvec; + int N = s->Ncvec; asm volatile ("mov r8, %2 \n" "vdup.f32 q15, %4 \n" "1: \n" @@ -2084,22 +2089,22 @@ static void zconvolve_accumulate_simd(PFFFT_Setup * s, const float *a, const flo "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q15", "memory"); #else // default routine, works fine for non-arm cpus with current compilers - for (i = 0; i < Ncvec; i += 2) { + for (i = 0; i < Ncvec2; i += 4) { v4sf ar, ai, br, bi; - ar = va[2 * i + 0]; - ai = va[2 * i + 1]; - br = vb[2 * i + 0]; - bi = vb[2 * i + 1]; + ar = va[i + 0]; + ai = va[i + 1]; + br = vb[i + 0]; + bi = vb[i + 1]; VCPLXMUL(ar, ai, br, bi); - vab[2 * i + 0] = VMADD(ar, vscal, vab[2 * i + 0]); - vab[2 * i + 1] = VMADD(ai, vscal, vab[2 * i + 1]); - ar = va[2 * i + 2]; - ai = va[2 * i + 3]; - br = vb[2 * i + 2]; - bi = vb[2 * i + 3]; + vab[i + 0] = VMADD(ar, vscal, vab[i + 0]); + vab[i + 1] = VMADD(ai, vscal, vab[i + 1]); + ar = va[i + 2]; + ai = va[i + 3]; + br = vb[i + 2]; + bi = vb[i + 3]; VCPLXMUL(ar, ai, br, bi); - vab[2 * i + 2] = VMADD(ar, vscal, vab[2 * i + 2]); - vab[2 * i + 3] = VMADD(ai, vscal, vab[2 * i + 3]); + vab[i + 2] = VMADD(ar, vscal, vab[i + 2]); + vab[i + 3] = VMADD(ai, vscal, vab[i + 3]); } #endif if (s->transform == PFFFT_REAL) { @@ -2108,6 +2113,67 @@ static void zconvolve_accumulate_simd(PFFFT_Setup * s, const float *a, const flo } } +static void zconvolve_simd(PFFFT_Setup * s, const float *a, const float *b, + float *ab, float scaling) +{ + v4sf vscal = LD_PS1(scaling); + const v4sf * RESTRICT va = (const v4sf*)a; + const v4sf * RESTRICT vb = (const v4sf*)b; + v4sf * RESTRICT vab = (v4sf*)ab; + float sar, sai, sbr, sbi; + const int Ncvec2 = s->Ncvec * 2; + int i; + +#ifdef __arm__ + __builtin_prefetch(va); + __builtin_prefetch(vb); + __builtin_prefetch(vab); + __builtin_prefetch(va+2); + __builtin_prefetch(vb+2); + __builtin_prefetch(vab+2); + __builtin_prefetch(va+4); + __builtin_prefetch(vb+4); + __builtin_prefetch(vab+4); + __builtin_prefetch(va+6); + __builtin_prefetch(vb+6); + __builtin_prefetch(vab+6); +# ifndef __clang__ +# define ZCONVOLVE_USING_INLINE_NEON_ASM +# endif +#endif + + assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab)); + sar = ((v4sf_union*)va)[0].f[0]; + sai = ((v4sf_union*)va)[1].f[0]; + sbr = ((v4sf_union*)vb)[0].f[0]; + sbi = ((v4sf_union*)vb)[1].f[0]; + + /* default routine, works fine for non-arm cpus with current compilers */ + for (i = 0; i < Ncvec2; i += 4) { + v4sf var, vai, vbr, vbi; + var = va[i + 0]; + vai = va[i + 1]; + vbr = vb[i + 0]; + vbi = vb[i + 1]; + VCPLXMUL(var, vai, vbr, vbi); + vab[i + 0] = VMUL(var, vscal); + vab[i + 1] = VMUL(vai, vscal); + var = va[i + 2]; + vai = va[i + 3]; + vbr = vb[i + 2]; + vbi = vb[i + 3]; + VCPLXMUL(var, vai, vbr, vbi); + vab[i + 2] = VMUL(var, vscal); + vab[i + 3] = VMUL(vai, vscal); + } + + if (s->transform == PFFFT_REAL) { + ((v4sf_union*)vab)[0].f[0] = sar * sbr * scaling; + ((v4sf_union*)vab)[1].f[0] = sai * sbi * scaling; + } +} + + static void sum_simd(const float *a, const float *b, float *ab, int len) { const v4sf *RESTRICT va = (const v4sf *)a; @@ -2217,30 +2283,58 @@ static void transform_simd(PFFFT_Setup * setup, const float *input, assert(buff[ib] == output); } + static void zconvolve_accumulate_simd(PFFFT_Setup * s, const float *a, const float *b, float *ab, float scaling) { - int i, Ncvec = s->Ncvec; + int i, Ncvec2 = s->Ncvec * 2; if (s->transform == PFFFT_REAL) { // take care of the fftpack ordering ab[0] += a[0] * b[0] * scaling; - ab[2 * Ncvec - 1] += - a[2 * Ncvec - 1] * b[2 * Ncvec - 1] * scaling; + ab[Ncvec2 - 1] += + a[Ncvec2 - 1] * b[Ncvec2 - 1] * scaling; ++ab; ++a; ++b; - --Ncvec; + Ncvec2 -= 2; } - for (i = 0; i < Ncvec; ++i) { + for (i = 0; i < Ncvec2; i += 2) { float ar, ai, br, bi; - ar = a[2 * i + 0]; - ai = a[2 * i + 1]; - br = b[2 * i + 0]; - bi = b[2 * i + 1]; + ar = a[i + 0]; + ai = a[i + 1]; + br = b[i + 0]; + bi = b[i + 1]; VCPLXMUL(ar, ai, br, bi); - ab[2 * i + 0] += ar * scaling; - ab[2 * i + 1] += ai * scaling; + ab[i + 0] += ar * scaling; + ab[i + 1] += ai * scaling; + } +} + +static void zconvolve_simd(PFFFT_Setup * s, const float *a, + const float *b, float *ab, float scaling) +{ + int i, Ncvec2 = s->Ncvec * 2; + + if (s->transform == PFFFT_REAL) { + // take care of the fftpack ordering + ab[0] = a[0] * b[0] * scaling; + ab[Ncvec2 - 1] = + a[Ncvec2 - 1] * b[Ncvec2 - 1] * scaling; + ++ab; + ++a; + ++b; + Ncvec2 -= 2; + } + for (i = 0; i < Ncvec2; i += 2) { + float ar, ai, br, bi; + ar = a[i + 0]; + ai = a[i + 1]; + br = b[i + 0]; + bi = b[i + 1]; + VCPLXMUL(ar, ai, br, bi); + ab[i + 0] = ar * scaling; + ab[i + 1] = ai * scaling; } } static void sum_simd(const float *a, const float *b, float *ab, int len) @@ -2262,6 +2356,7 @@ struct funcs pffft_funcs = { .transform = transform_simd, .zreorder = zreorder_simd, .zconvolve_accumulate = zconvolve_accumulate_simd, + .zconvolve = zconvolve_simd, .sum = sum_simd, .simd_size = simd_size_simd, .validate = validate_pffft_simd, @@ -2337,6 +2432,11 @@ void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const float *dft_a, const fl return funcs->zconvolve_accumulate(setup, dft_a, dft_b, dft_ab, scaling); } +void pffft_zconvolve(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling) +{ + return funcs->zconvolve(setup, dft_a, dft_b, dft_ab, scaling); +} + void pffft_sum(const float *a, const float *b, float *ab, int len) { return funcs->sum(a, b, ab, len); diff --git a/src/modules/module-filter-chain/pffft.h b/src/modules/module-filter-chain/pffft.h index cf83833f2..eb103d072 100644 --- a/src/modules/module-filter-chain/pffft.h +++ b/src/modules/module-filter-chain/pffft.h @@ -158,6 +158,7 @@ extern "C" { The dft_a, dft_b and dft_ab pointers may alias. */ void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling); + void pffft_zconvolve(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling); void pffft_sum(const float *a, const float *b, float *ab, int len); /*