filter-chain: optimize convolver some more

Add function to convolve without accumulate and use that when possible to avoid some memset and memcpy.
2025-11-06 13:30:01 -05:00 · 2021-08-27 16:09:00 +02:00 · 2021-08-27 16:09:00 +02:00 · 994630cb3a
commit 994630cb3a
parent 1fe54e5f17
3 changed files with 161 additions and 45 deletions
--- a/src/modules/module-filter-chain/convolver.c
+++ b/src/modules/module-filter-chain/convolver.c
@ -58,6 +58,7 @@ struct convolver1 {
 	int inputBufferFill;

 	int current;
+	float scale;
 };

 static void *fft_alloc(int size)
@ -82,11 +83,6 @@ static void fft_cpx_free(struct fft_cpx *cpx)
 	fft_free(cpx->v);
 }

-static void fft_cpx_clear(struct fft_cpx *cpx, int size)
-{
-	memset(cpx->v, 0, sizeof(float) * 2 * size);
-}
-
 static void fft_cpx_copy(struct fft_cpx *dst, struct fft_cpx *src, int size)
 {
 	memcpy(dst->v, src->v, sizeof(float) * 2 * size);
@ -125,6 +121,11 @@ static inline void ifft_run(void *ifft, struct fft_cpx *in, float *out)
 	pffft_transform(ifft, in->v, out, NULL, PFFFT_BACKWARD);
 }

+static inline void fft_convolve(void *fft, struct fft_cpx *r,
+		const struct fft_cpx *a, const struct fft_cpx *b, int len, float scale)
+{
+	pffft_zconvolve(fft, a->v, b->v, r->v, scale);
+}
 static inline void fft_convolve_accum(void *fft, struct fft_cpx *r,
 		const struct fft_cpx *a, const struct fft_cpx *b, int len, float scale)
 {
@ -192,6 +193,7 @@ static struct convolver1 *convolver1_new(int block, const float *ir, int irlen)
 	conv->inputBuffer = fft_alloc(sizeof(float) * conv->segSize);
 	conv->inputBufferFill = 0;
 	conv->current = 0;
+	conv->scale = 1.0f / conv->segSize;

 	return conv;
 }
@ -235,23 +237,36 @@ static int convolver1_run(struct convolver1 *conv, const float *input, float *ou

 		fft_run(conv->fft, conv->inputBuffer, &conv->segments[conv->current]);

-		if (conv->inputBufferFill == 0) {
-			fft_cpx_clear(&conv->pre_mult, conv->fftComplexSize);
+		if (conv->segCount > 1) {
+			if (conv->inputBufferFill == 0) {
+				int indexAudio = (conv->current + 1) % conv->segCount;

-			for (i = 1; i < conv->segCount; i++) {
-				const int indexIr = i;
-				const int indexAudio = (conv->current + i) % conv->segCount;
-
-				fft_convolve_accum(conv->fft, &conv->pre_mult,
-						&conv->segmentsIr[indexIr],
+				fft_convolve(conv->fft, &conv->pre_mult,
+						&conv->segmentsIr[1],
 						&conv->segments[indexAudio],
-						conv->fftComplexSize, 1.0f / conv->segSize);
-			}
-		}
-		fft_cpx_copy(&conv->conv, &conv->pre_mult, conv->fftComplexSize);
+						conv->fftComplexSize, conv->scale);

-		fft_convolve_accum(conv->fft, &conv->conv, &conv->segments[conv->current], &conv->segmentsIr[0],
-				conv->fftComplexSize, 1.0f / conv->segSize);
+				for (i = 2; i < conv->segCount; i++) {
+					indexAudio = (conv->current + i) % conv->segCount;
+
+					fft_convolve_accum(conv->fft, &conv->pre_mult,
+							&conv->segmentsIr[i],
+							&conv->segments[indexAudio],
+							conv->fftComplexSize, conv->scale);
+				}
+			}
+			fft_cpx_copy(&conv->conv, &conv->pre_mult, conv->fftComplexSize);
+
+			fft_convolve_accum(conv->fft, &conv->conv,
+					&conv->segments[conv->current],
+					&conv->segmentsIr[0],
+					conv->fftComplexSize, conv->scale);
+		} else {
+			fft_convolve(conv->fft, &conv->conv,
+					&conv->segments[conv->current],
+					&conv->segmentsIr[0],
+					conv->fftComplexSize, conv->scale);
+		}

 		ifft_run(conv->ifft, &conv->conv, conv->fft_buffer);

--- a/src/modules/module-filter-chain/pffft.c
+++ b/src/modules/module-filter-chain/pffft.c
@ -133,6 +133,7 @@ inline v4sf ld_ps1(const float *p)
 #define new_setup_simd new_setup_altivec
 #define zreorder_simd zreorder_altivec
 #define zconvolve_accumulate_simd zconvolve_accumulate_altivec
+#define zconvolve_simd zconvolve_altivec
 #define transform_simd transform_altivec
 #define sum_simd sum_altivec

@ -159,6 +160,7 @@ typedef __m128 v4sf;
 #define new_setup_simd new_setup_sse
 #define zreorder_simd zreorder_sse
 #define zconvolve_accumulate_simd zconvolve_accumulate_sse
+#define zconvolve_simd zconvolve_sse
 #define transform_simd transform_sse
 #define sum_simd sum_sse

@ -192,6 +194,7 @@ typedef float32x4_t v4sf;
 #define new_setup_simd new_setup_neon
 #define zreorder_simd zreorder_neon
 #define zconvolve_accumulate_simd zconvolve_accumulate_neon
+#define zconvolve_simd zconvolve_neon
 #define transform_simd transform_neon
 #define sum_simd sum_neon
 #else
@ -216,6 +219,7 @@ typedef float v4sf;
 #define new_setup_simd new_setup_c
 #define zreorder_simd zreorder_c
 #define zconvolve_accumulate_simd zconvolve_accumulate_c
+#define zconvolve_simd zconvolve_c
 #define transform_simd transform_c
 #define sum_simd sum_c
 #endif
@ -1407,6 +1411,7 @@ struct funcs {
  void (*transform) (PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction, int ordered);
  void (*zreorder)(PFFFT_Setup *setup, const float *input, float *output, pffft_direction_t direction);
  void (*zconvolve_accumulate)(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling);
+  void (*zconvolve)(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling);
  void (*sum)(const float *a, const float *b, float *ab, int len);
  int (*simd_size)(void);
  void (*validate)(void);
@ -2008,7 +2013,7 @@ static void transform_simd(PFFFT_Setup * setup, const float *finput,
 static void zconvolve_accumulate_simd(PFFFT_Setup * s, const float *a, const float *b,
 				float *ab, float scaling)
 {
-	int Ncvec = s->Ncvec;
+	const int Ncvec2 = s->Ncvec * 2;
 	const v4sf *RESTRICT va = (const v4sf *)a;
 	const v4sf *RESTRICT vb = (const v4sf *)b;
 	v4sf *RESTRICT vab = (v4sf *) ab;
@ -2048,7 +2053,7 @@ static void zconvolve_accumulate_simd(PFFFT_Setup * s, const float *a, const flo
 #ifdef ZCONVOLVE_USING_INLINE_ASM	// inline asm version, unfortunately miscompiled by clang 3.2, at least on ubuntu.. so this will be restricted to gcc
 	const float *a_ = a, *b_ = b;
 	float *ab_ = ab;
-	int N = Ncvec;
+	int N = s->Ncvec;
 	asm volatile ("mov         r8, %2                  \n"
 		      "vdup.f32    q15, %4                 \n"
 		      "1:                                  \n"
@ -2084,22 +2089,22 @@ static void zconvolve_accumulate_simd(PFFFT_Setup * s, const float *a, const flo
 		      "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
 		      "q10", "q11", "q12", "q13", "q15", "memory");
 #else				// default routine, works fine for non-arm cpus with current compilers
-	for (i = 0; i < Ncvec; i += 2) {
+	for (i = 0; i < Ncvec2; i += 4) {
 		v4sf ar, ai, br, bi;
-		ar = va[2 * i + 0];
-		ai = va[2 * i + 1];
-		br = vb[2 * i + 0];
-		bi = vb[2 * i + 1];
+		ar = va[i + 0];
+		ai = va[i + 1];
+		br = vb[i + 0];
+		bi = vb[i + 1];
 		VCPLXMUL(ar, ai, br, bi);
-		vab[2 * i + 0] = VMADD(ar, vscal, vab[2 * i + 0]);
-		vab[2 * i + 1] = VMADD(ai, vscal, vab[2 * i + 1]);
-		ar = va[2 * i + 2];
-		ai = va[2 * i + 3];
-		br = vb[2 * i + 2];
-		bi = vb[2 * i + 3];
+		vab[i + 0] = VMADD(ar, vscal, vab[i + 0]);
+		vab[i + 1] = VMADD(ai, vscal, vab[i + 1]);
+		ar = va[i + 2];
+		ai = va[i + 3];
+		br = vb[i + 2];
+		bi = vb[i + 3];
 		VCPLXMUL(ar, ai, br, bi);
-		vab[2 * i + 2] = VMADD(ar, vscal, vab[2 * i + 2]);
-		vab[2 * i + 3] = VMADD(ai, vscal, vab[2 * i + 3]);
+		vab[i + 2] = VMADD(ar, vscal, vab[i + 2]);
+		vab[i + 3] = VMADD(ai, vscal, vab[i + 3]);
 	}
 #endif
 	if (s->transform == PFFFT_REAL) {
@ -2108,6 +2113,67 @@ static void zconvolve_accumulate_simd(PFFFT_Setup * s, const float *a, const flo
 	}
 }

+static void zconvolve_simd(PFFFT_Setup * s, const float *a, const float *b,
+				float *ab, float scaling)
+{
+	v4sf vscal = LD_PS1(scaling);
+	const v4sf * RESTRICT va = (const v4sf*)a;
+	const v4sf * RESTRICT vb = (const v4sf*)b;
+	v4sf * RESTRICT vab = (v4sf*)ab;
+	float sar, sai, sbr, sbi;
+	const int Ncvec2 = s->Ncvec * 2;
+	int i;
+
+#ifdef __arm__
+	__builtin_prefetch(va);
+	__builtin_prefetch(vb);
+	__builtin_prefetch(vab);
+	__builtin_prefetch(va+2);
+	__builtin_prefetch(vb+2);
+	__builtin_prefetch(vab+2);
+	__builtin_prefetch(va+4);
+	__builtin_prefetch(vb+4);
+	__builtin_prefetch(vab+4);
+	__builtin_prefetch(va+6);
+	__builtin_prefetch(vb+6);
+	__builtin_prefetch(vab+6);
+# ifndef __clang__
+#   define ZCONVOLVE_USING_INLINE_NEON_ASM
+# endif
+#endif
+
+	assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab));
+	sar = ((v4sf_union*)va)[0].f[0];
+	sai = ((v4sf_union*)va)[1].f[0];
+	sbr = ((v4sf_union*)vb)[0].f[0];
+	sbi = ((v4sf_union*)vb)[1].f[0];
+
+	/* default routine, works fine for non-arm cpus with current compilers */
+	for (i = 0; i < Ncvec2; i += 4) {
+		v4sf var, vai, vbr, vbi;
+		var = va[i + 0];
+		vai = va[i + 1];
+		vbr = vb[i + 0];
+		vbi = vb[i + 1];
+		VCPLXMUL(var, vai, vbr, vbi);
+		vab[i + 0] = VMUL(var, vscal);
+		vab[i + 1] = VMUL(vai, vscal);
+		var = va[i + 2];
+		vai = va[i + 3];
+		vbr = vb[i + 2];
+		vbi = vb[i + 3];
+		VCPLXMUL(var, vai, vbr, vbi);
+		vab[i + 2] = VMUL(var, vscal);
+		vab[i + 3] = VMUL(vai, vscal);
+	}
+
+	if (s->transform == PFFFT_REAL) {
+		((v4sf_union*)vab)[0].f[0] = sar * sbr * scaling;
+		((v4sf_union*)vab)[1].f[0] = sai * sbi * scaling;
+	}
+}
+
+
 static void sum_simd(const float *a, const float *b, float *ab, int len)
 {
 	const v4sf *RESTRICT va = (const v4sf *)a;
@ -2217,30 +2283,58 @@ static void transform_simd(PFFFT_Setup * setup, const float *input,
 	assert(buff[ib] == output);
 }

+
 static void zconvolve_accumulate_simd(PFFFT_Setup * s, const float *a,
 				       const float *b, float *ab, float scaling)
 {
-	int i, Ncvec = s->Ncvec;
+	int i, Ncvec2 = s->Ncvec * 2;

 	if (s->transform == PFFFT_REAL) {
 		// take care of the fftpack ordering
 		ab[0] += a[0] * b[0] * scaling;
-		ab[2 * Ncvec - 1] +=
-		    a[2 * Ncvec - 1] * b[2 * Ncvec - 1] * scaling;
+		ab[Ncvec2 - 1] +=
+		    a[Ncvec2 - 1] * b[Ncvec2 - 1] * scaling;
 		++ab;
 		++a;
 		++b;
-		--Ncvec;
+		Ncvec2 -= 2;
 	}
-	for (i = 0; i < Ncvec; ++i) {
+	for (i = 0; i < Ncvec2; i += 2) {
 		float ar, ai, br, bi;
-		ar = a[2 * i + 0];
-		ai = a[2 * i + 1];
-		br = b[2 * i + 0];
-		bi = b[2 * i + 1];
+		ar = a[i + 0];
+		ai = a[i + 1];
+		br = b[i + 0];
+		bi = b[i + 1];
 		VCPLXMUL(ar, ai, br, bi);
-		ab[2 * i + 0] += ar * scaling;
-		ab[2 * i + 1] += ai * scaling;
+		ab[i + 0] += ar * scaling;
+		ab[i + 1] += ai * scaling;
+	}
+}
+
+static void zconvolve_simd(PFFFT_Setup * s, const float *a,
+				       const float *b, float *ab, float scaling)
+{
+	int i, Ncvec2 = s->Ncvec * 2;
+
+	if (s->transform == PFFFT_REAL) {
+		// take care of the fftpack ordering
+		ab[0] = a[0] * b[0] * scaling;
+		ab[Ncvec2 - 1] =
+		    a[Ncvec2 - 1] * b[Ncvec2 - 1] * scaling;
+		++ab;
+		++a;
+		++b;
+		Ncvec2 -= 2;
+	}
+	for (i = 0; i < Ncvec2; i += 2) {
+		float ar, ai, br, bi;
+		ar = a[i + 0];
+		ai = a[i + 1];
+		br = b[i + 0];
+		bi = b[i + 1];
+		VCPLXMUL(ar, ai, br, bi);
+		ab[i + 0] = ar * scaling;
+		ab[i + 1] = ai * scaling;
 	}
 }
 static void sum_simd(const float *a, const float *b, float *ab, int len)
@ -2262,6 +2356,7 @@ struct funcs pffft_funcs = {
 	.transform = transform_simd,
 	.zreorder = zreorder_simd,
 	.zconvolve_accumulate = zconvolve_accumulate_simd,
+	.zconvolve = zconvolve_simd,
 	.sum = sum_simd,
 	.simd_size = simd_size_simd,
 	.validate = validate_pffft_simd,
@ -2337,6 +2432,11 @@ void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const float *dft_a, const fl
 	return funcs->zconvolve_accumulate(setup, dft_a, dft_b, dft_ab, scaling);
 }

+void pffft_zconvolve(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling)
+{
+	return funcs->zconvolve(setup, dft_a, dft_b, dft_ab, scaling);
+}
+
 void pffft_sum(const float *a, const float *b, float *ab, int len)
 {
 	return funcs->sum(a, b, ab, len);
--- a/src/modules/module-filter-chain/pffft.h
+++ b/src/modules/module-filter-chain/pffft.h
@ -158,6 +158,7 @@ extern "C" {
     The dft_a, dft_b and dft_ab pointers may alias.
  */
  void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling);
+  void pffft_zconvolve(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling);

  void pffft_sum(const float *a, const float *b, float *ab, int len);
  /*