Mirror of https://gitlab.freedesktop.org/pipewire/pipewire.git
filter-chain: optimize biquads a little
Add SSE-optimized biquads. Add a new dsp function that runs multiple biquads over multiple channels. This makes it possible to unroll some operations and, later, to run the channels in parallel.
parent 673352893a
commit ac21541741
5 changed files with 177 additions and 8 deletions
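Not part of the commit, but useful context for reading the diffs below: a minimal scalar sketch of the per-sample recurrence that the dsp_biquad_run kernels appear to implement (a transposed direct form II biquad). The struct and helper here are hypothetical and mirror only the fields visible in the patch (b0, b1, b2, a1, a2 and the state x1, x2); the real struct biquad may carry more.

struct biquad_sketch { float b0, b1, b2, a1, a2, x1, x2; };

static inline float biquad_step(struct biquad_sketch *bq, float x)
{
	float y = bq->b0 * x + bq->x1;              /* out[i] = b0*x + x1          */
	bq->x1 = bq->b1 * x + bq->x2 - bq->a1 * y;  /* next x1 = b1*x + x2 - a1*y  */
	bq->x2 = bq->b2 * x - bq->a2 * y;           /* next x2 = b2*x - a2*y       */
	return y;
}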
@@ -1882,13 +1882,10 @@ static void param_eq_connect_port(void * Instance, unsigned long Port,
 static void param_eq_run(void * Instance, unsigned long SampleCount)
 {
 	struct param_eq_impl *impl = Instance;
-	float *in = impl->port[1];
-	float *out = impl->port[0];
+	const float *in[] = { impl->port[1] };
+	float *out[] = { impl->port[0] };

-	for (uint32_t i = 0; i < impl->n_bq; i++) {
-		dsp_ops_biquad_run(dsp_ops, &impl->bq[i], out, in, SampleCount);
-		in = out;
-	}
+	dsp_ops_biquadn_run(dsp_ops, impl->bq, impl->n_bq, out, in, 1, SampleCount);
 }

 static const struct fc_descriptor param_eq_desc = {

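For context (not in the patch): a hedged sketch of how the new entry point is meant to be called. param_eq above passes a single channel (n_src = 1); the wrapper, buffer names and the two-biquad cascade below are made up for illustration, and only the dsp_ops_biquadn_run signature comes from the diff.

/* Run a cascade of two biquads over one channel: bqs[0] filters input into
 * output, then bqs[1] filters output in place, like the old per-biquad loop. */
static void run_two_band_eq_sketch(struct dsp_ops *dsp, struct biquad *bqs,
		const float *input, float *output, uint32_t n_samples)
{
	const float *in[1] = { input };   /* one channel, so n_src = 1 */
	float *out[1] = { output };

	dsp_ops_biquadn_run(dsp, bqs, 2, out, in, 1, n_samples);
}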
@@ -135,6 +135,24 @@ void dsp_biquad_run_c(struct dsp_ops *ops, struct biquad *bq,
 #undef F
 }

+void dsp_biquadn_run_c(struct dsp_ops *ops, struct biquad *bq, uint32_t n_bq,
+		float * SPA_RESTRICT out[], const float * SPA_RESTRICT in[],
+		uint32_t n_src, uint32_t n_samples)
+{
+	uint32_t i, j;
+	const float *s;
+	float *d;
+	struct biquad *b = bq;
+	for (i = 0; i < n_src; i++, bq+=n_src) {
+		s = in[i];
+		d = out[i];
+		for (j = 0; j < n_bq; j++) {
+			dsp_biquad_run_c(ops, &b[j], d, s, n_samples);
+			s = d;
+		}
+	}
+}
+
 void dsp_sum_c(struct dsp_ops *ops, float * dst,
 		const float * SPA_RESTRICT a, const float * SPA_RESTRICT b, uint32_t n_samples)
 {

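A note on the F() macro visible at the top of this hunk and in the SSE code below (not part of the patch): it flushes filter state with magnitude below FLT_MIN to exactly 0.0f, which keeps the recursive state from decaying into denormal values that are very slow to process on x86. A standalone sketch of the same check:

#include <float.h>

/* Equivalent of the F() macro wrapped around the biquad state stores. */
static inline float flush_denormal(float x)
{
	return (-FLT_MIN < x && x < FLT_MIN) ? 0.0f : x;
}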
@@ -5,6 +5,7 @@
 #include <string.h>
 #include <stdio.h>
 #include <math.h>
+#include <float.h>

 #include <spa/utils/defs.h>

@@ -120,3 +121,100 @@ void dsp_sum_sse(struct dsp_ops *ops, float *r, const float *a, const float *b,
 		_mm_store_ss(&r[n], in[0]);
 	}
 }
+
+void dsp_biquad_run_sse(struct dsp_ops *ops, struct biquad *bq,
+		float *out, const float *in, uint32_t n_samples)
+{
+	__m128 x, y, z;
+	__m128 b012;
+	__m128 a12;
+	__m128 x12;
+	uint32_t i;
+
+	b012 = _mm_setr_ps(bq->b0, bq->b1, bq->b2, 0.0f);	/* b0 b1 b2 0 */
+	a12 = _mm_setr_ps(0.0f, bq->a1, bq->a2, 0.0f);		/* 0 a1 a2 0 */
+	x12 = _mm_setr_ps(bq->x1, bq->x2, 0.0f, 0.0f);		/* x1 x2 0 0 */
+
+	for (i = 0; i < n_samples; i++) {
+		x = _mm_load1_ps(&in[i]);			/* x x x x */
+		z = _mm_mul_ps(x, b012);			/* b0*x b1*x b2*x 0 */
+		z = _mm_add_ps(z, x12);				/* b0*x+x1 b1*x+x2 b2*x 0 */
+		_mm_store_ss(&out[i], z);			/* out[i] = b0*x+x1 */
+		y = _mm_shuffle_ps(z, z, _MM_SHUFFLE(0,0,0,0));	/* b0*x+x1 b0*x+x1 b0*x+x1 b0*x+x1 = y */
+		y = _mm_mul_ps(y, a12);				/* 0 a1*y a2*y 0 */
+		y = _mm_sub_ps(z, y);				/* y x1 x2 0 */
+		x12 = _mm_shuffle_ps(y, y, _MM_SHUFFLE(3,3,2,1)); /* x1 x2 0 0 */
+	}
+#define F(x) (-FLT_MIN < (x) && (x) < FLT_MIN ? 0.0f : (x))
+	bq->x1 = F(x12[0]);
+	bq->x2 = F(x12[1]);
+#undef F
+}
+
+static void dsp_biquad_run2_sse(struct dsp_ops *ops, struct biquad *bq0, struct biquad *bq1,
+		float *out, const float *in, uint32_t n_samples)
+{
+	__m128 x, y, z;
+	__m128 b0, b1;
+	__m128 a0, a1;
+	__m128 x0, x1;
+	uint32_t i;
+
+	b0 = _mm_setr_ps(bq0->b0, bq0->b1, bq0->b2, 0.0f);	/* b0 b1 b2 0 */
+	a0 = _mm_setr_ps(0.0f, bq0->a1, bq0->a2, 0.0f);		/* 0 a1 a2 0 */
+	x0 = _mm_setr_ps(bq0->x1, bq0->x2, 0.0f, 0.0f);		/* x1 x2 0 0 */
+
+	b1 = _mm_setr_ps(bq1->b0, bq1->b1, bq1->b2, 0.0f);	/* b0 b1 b2 0 */
+	a1 = _mm_setr_ps(0.0f, bq1->a1, bq1->a2, 0.0f);		/* 0 a1 a2 0 */
+	x1 = _mm_setr_ps(bq1->x1, bq1->x2, 0.0f, 0.0f);		/* x1 x2 0 0 */
+
+	for (i = 0; i < n_samples; i++) {
+		x = _mm_load1_ps(&in[i]);			/* x x x x */
+
+		z = _mm_mul_ps(x, b0);				/* b0*x b1*x b2*x 0 */
+		z = _mm_add_ps(z, x0);				/* b0*x+x1 b1*x+x2 b2*x 0 */
+		y = _mm_shuffle_ps(z, z, _MM_SHUFFLE(0,0,0,0));	/* b0*x+x1 b0*x+x1 b0*x+x1 b0*x+x1 = y */
+		x = _mm_mul_ps(y, a0);				/* 0 a1*y a2*y 0 */
+		x = _mm_sub_ps(z, x);				/* y x1 x2 0 */
+		x0 = _mm_shuffle_ps(x, x, _MM_SHUFFLE(3,3,2,1)); /* x1 x2 0 0 */
+
+		z = _mm_mul_ps(y, b1);				/* b0*x b1*x b2*x 0 */
+		z = _mm_add_ps(z, x1);				/* b0*x+x1 b1*x+x2 b2*x 0 */
+		x = _mm_shuffle_ps(z, z, _MM_SHUFFLE(0,0,0,0));	/* b0*x+x1 b0*x+x1 b0*x+x1 b0*x+x1 = y */
+		y = _mm_mul_ps(x, a1);				/* 0 a1*y a2*y 0 */
+		y = _mm_sub_ps(z, y);				/* y x1 x2 0 */
+		x1 = _mm_shuffle_ps(y, y, _MM_SHUFFLE(3,3,2,1)); /* x1 x2 0 0 */
+
+		_mm_store_ss(&out[i], x);			/* out[i] = b0*x+x1 */
+	}
+#define F(x) (-FLT_MIN < (x) && (x) < FLT_MIN ? 0.0f : (x))
+	bq0->x1 = F(x0[0]);
+	bq0->x2 = F(x0[1]);
+	bq1->x1 = F(x1[0]);
+	bq1->x2 = F(x1[1]);
+#undef F
+}
+
+void dsp_biquadn_run_sse(struct dsp_ops *ops, struct biquad *bq, uint32_t n_bq,
+		float * SPA_RESTRICT out[], const float * SPA_RESTRICT in[],
+		uint32_t n_src, uint32_t n_samples)
+{
+	uint32_t i, j;
+	const float *s;
+	float *d;
+	uint32_t unrolled = n_bq & ~1;
+	struct biquad *b = bq;
+
+	for (i = 0; i < n_src; i++, b+=n_src) {
+		s = in[i];
+		d = out[i];
+		for (j = 0; j < unrolled; j+=2) {
+			dsp_biquad_run2_sse(ops, &b[j], &b[j+1], d, s, n_samples);
+			s = d;
+		}
+		for (; j < n_bq; j++) {
+			dsp_biquad_run_sse(ops, &b[j], d, s, n_samples);
+			s = d;
+		}
+	}
+}

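For readers following the intrinsics (not part of the patch): dsp_biquad_run2_sse keeps two biquads' coefficients and state in separate registers and evaluates them back to back inside one loop, so the second stage consumes the first stage's output within the same iteration and only the final result is stored. A scalar sketch of the same pairing, using the hypothetical biquad_step() helper from earlier:

static void biquad_run2_sketch(struct biquad_sketch *bq0, struct biquad_sketch *bq1,
		float *out, const float *in, uint32_t n_samples)
{
	uint32_t i;
	/* Two cascaded biquads per sample; dsp_biquadn_run_sse pairs stages
	 * like this and falls back to a single-stage tail when n_bq is odd. */
	for (i = 0; i < n_samples; i++)
		out[i] = biquad_step(bq1, biquad_step(bq0, in[i]));
}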
@@ -5,6 +5,7 @@
 #include <string.h>
 #include <stdio.h>
 #include <math.h>
+#include <time.h>

 #include <spa/support/cpu.h>
 #include <spa/utils/defs.h>

@@ -25,7 +26,7 @@ static struct dsp_info dsp_table[] =
 	.funcs.clear = dsp_clear_c,
 	.funcs.copy = dsp_copy_c,
 	.funcs.mix_gain = dsp_mix_gain_sse,
-	.funcs.biquad_run = dsp_biquad_run_c,
+	.funcs.biquad_run = dsp_biquad_run_sse,
 	.funcs.sum = dsp_sum_avx,
 	.funcs.linear = dsp_linear_c,
 	.funcs.mult = dsp_mult_c,

@@ -34,6 +35,7 @@ static struct dsp_info dsp_table[] =
 	.funcs.fft_run = dsp_fft_run_c,
 	.funcs.fft_cmul = dsp_fft_cmul_c,
 	.funcs.fft_cmuladd = dsp_fft_cmuladd_c,
+	.funcs.biquadn_run = dsp_biquadn_run_sse,
 	},
 #endif
 #if defined (HAVE_SSE)

@@ -41,7 +43,7 @@ static struct dsp_info dsp_table[] =
 	.funcs.clear = dsp_clear_c,
 	.funcs.copy = dsp_copy_c,
 	.funcs.mix_gain = dsp_mix_gain_sse,
-	.funcs.biquad_run = dsp_biquad_run_c,
+	.funcs.biquad_run = dsp_biquad_run_sse,
 	.funcs.sum = dsp_sum_sse,
 	.funcs.linear = dsp_linear_c,
 	.funcs.mult = dsp_mult_c,

@@ -50,6 +52,7 @@ static struct dsp_info dsp_table[] =
 	.funcs.fft_run = dsp_fft_run_c,
 	.funcs.fft_cmul = dsp_fft_cmul_c,
 	.funcs.fft_cmuladd = dsp_fft_cmuladd_c,
+	.funcs.biquadn_run = dsp_biquadn_run_sse,
 	},
 #endif
 	{ 0,

@@ -65,6 +68,7 @@ static struct dsp_info dsp_table[] =
 	.funcs.fft_run = dsp_fft_run_c,
 	.funcs.fft_cmul = dsp_fft_cmul_c,
 	.funcs.fft_cmuladd = dsp_fft_cmuladd_c,
+	.funcs.biquadn_run = dsp_biquadn_run_c,
 	},
 };

@@ -99,3 +103,43 @@ int dsp_ops_init(struct dsp_ops *ops, uint32_t cpu_flags)

 	return 0;
 }
+
+int dsp_ops_benchmark(void)
+{
+	struct dsp_ops ops[3];
+	uint32_t i;
+	struct biquad bq;
+	float in[2048], out[2048];
+	struct timespec ts;
+	uint64_t t1, t2, t3, t4;
+
+	dsp_ops_init(&ops[0], 0);
+	dsp_ops_init(&ops[1], SPA_CPU_FLAG_SSE);
+	dsp_ops_init(&ops[2], SPA_CPU_FLAG_AVX);
+
+	clock_gettime(CLOCK_MONOTONIC, &ts);
+	t1 = SPA_TIMESPEC_TO_NSEC(&ts);
+
+	for (i = 0; i < 8192; i++)
+		dsp_ops_biquad_run(&ops[0], &bq, out, in, 2048);
+
+	clock_gettime(CLOCK_MONOTONIC, &ts);
+	t2 = SPA_TIMESPEC_TO_NSEC(&ts);
+
+	for (i = 0; i < 8192; i++)
+		dsp_ops_biquad_run(&ops[1], &bq, out, in, 2048);
+
+	clock_gettime(CLOCK_MONOTONIC, &ts);
+	t3 = SPA_TIMESPEC_TO_NSEC(&ts);
+
+	for (i = 0; i < 8192; i++)
+		dsp_ops_biquad_run(&ops[2], &bq, out, in, 2048);
+
+	clock_gettime(CLOCK_MONOTONIC, &ts);
+	t4 = SPA_TIMESPEC_TO_NSEC(&ts);
+
+	fprintf(stderr, "%"PRIu64" %"PRIu64" %"PRIu64" speedup:%f\n",
+			t2-t1, t3-t2, t4-t3,
+			((double)(t2-t1))/(t3-t2));
+	return 0;
+}

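The printed speedup is the plain-C time divided by the SSE time, i.e. (t2-t1)/(t3-t2). With made-up numbers purely for illustration: if the C pass took 400 ms and the SSE pass 100 ms, the line would end in speedup:4.000000.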
@@ -43,6 +43,9 @@ struct dsp_ops_funcs {
 	void (*mult) (struct dsp_ops *ops,
 			void * SPA_RESTRICT dst,
 			const void * SPA_RESTRICT src[], uint32_t n_src, uint32_t n_samples);
+	void (*biquadn_run) (struct dsp_ops *ops, struct biquad *bq, uint32_t n_bq,
+			float * SPA_RESTRICT out[], const float * SPA_RESTRICT in[],
+			uint32_t n_src, uint32_t n_samples);
 };

 struct dsp_ops {

@@ -56,6 +59,7 @@ struct dsp_ops {
 };

 int dsp_ops_init(struct dsp_ops *ops, uint32_t cpu_flags);
+int dsp_ops_benchmark(void);

 #define dsp_ops_free(ops)	(ops)->free(ops)

@@ -66,6 +70,7 @@ int dsp_ops_init(struct dsp_ops *ops, uint32_t cpu_flags);
 #define dsp_ops_sum(ops,...)	(ops)->funcs.sum(ops, __VA_ARGS__)
 #define dsp_ops_linear(ops,...)	(ops)->funcs.linear(ops, __VA_ARGS__)
 #define dsp_ops_mult(ops,...)	(ops)->funcs.mult(ops, __VA_ARGS__)
+#define dsp_ops_biquadn_run(ops,...)	(ops)->funcs.biquadn_run(ops, __VA_ARGS__)

 #define dsp_ops_fft_new(ops,...)	(ops)->funcs.fft_new(ops, __VA_ARGS__)
 #define dsp_ops_fft_free(ops,...)	(ops)->funcs.fft_free(ops, __VA_ARGS__)

@@ -93,6 +98,9 @@ void dsp_linear_##arch (struct dsp_ops *ops, float * SPA_RESTRICT dst, \
 #define MAKE_MULT_FUNC(arch) \
 void dsp_mult_##arch(struct dsp_ops *ops, void * SPA_RESTRICT dst, \
 		const void * SPA_RESTRICT src[], uint32_t n_src, uint32_t n_samples)
+#define MAKE_BIQUADN_RUN_FUNC(arch) \
+void dsp_biquadn_run_##arch (struct dsp_ops *ops, struct biquad *bq, uint32_t n_bq, \
+		float * SPA_RESTRICT out[], const float * SPA_RESTRICT in[], uint32_t n_src, uint32_t n_samples)

 #define MAKE_FFT_NEW_FUNC(arch) \
 void *dsp_fft_new_##arch(struct dsp_ops *ops, int32_t size, bool real)

@@ -111,6 +119,7 @@ void dsp_fft_cmuladd_##arch(struct dsp_ops *ops, void *fft, \
 		const float * SPA_RESTRICT a, const float * SPA_RESTRICT b, \
 		uint32_t len, const float scale)


 MAKE_CLEAR_FUNC(c);
 MAKE_COPY_FUNC(c);
 MAKE_MIX_GAIN_FUNC(c);

@@ -118,6 +127,7 @@ MAKE_BIQUAD_RUN_FUNC(c);
 MAKE_SUM_FUNC(c);
 MAKE_LINEAR_FUNC(c);
 MAKE_MULT_FUNC(c);
+MAKE_BIQUADN_RUN_FUNC(c);

 MAKE_FFT_NEW_FUNC(c);
 MAKE_FFT_FREE_FUNC(c);

@@ -128,6 +138,8 @@ MAKE_FFT_CMULADD_FUNC(c);
 #if defined (HAVE_SSE)
 MAKE_MIX_GAIN_FUNC(sse);
 MAKE_SUM_FUNC(sse);
+MAKE_BIQUAD_RUN_FUNC(sse);
+MAKE_BIQUADN_RUN_FUNC(sse);
 #endif
 #if defined (HAVE_AVX)
 MAKE_SUM_FUNC(avx);

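Not in the patch: a hedged sketch of how another architecture would plug into the same pattern, following the c and sse wiring above. An AVX biquadn kernel is hypothetical here; the commit only provides the C and SSE versions.

/* In the header, next to the c and sse prototypes: */
MAKE_BIQUADN_RUN_FUNC(avx);	/* expands to the dsp_biquadn_run_avx prototype */

/* In the AVX source file, a stub with the expanded signature; a real version
 * would mirror dsp_biquadn_run_sse using 256-bit registers. */
void dsp_biquadn_run_avx(struct dsp_ops *ops, struct biquad *bq, uint32_t n_bq,
		float * SPA_RESTRICT out[], const float * SPA_RESTRICT in[],
		uint32_t n_src, uint32_t n_samples)
{
	dsp_biquadn_run_c(ops, bq, n_bq, out, in, n_src, n_samples);	/* fallback */
}

It would then be selected by setting .funcs.biquadn_run = dsp_biquadn_run_avx, in the dsp_table entry guarded by HAVE_AVX.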