filter-chain: optimize biquads a little

Add sse optimized biquads.

Make a new dsp function to run multiple biquads on multiple channels.
This makes it possible to unroll some operations and run the channels
in parallel later.
This commit is contained in:
Wim Taymans 2024-10-11 17:49:34 +02:00
parent 673352893a
commit ac21541741
5 changed files with 177 additions and 8 deletions

View file

@ -1882,13 +1882,10 @@ static void param_eq_connect_port(void * Instance, unsigned long Port,
static void param_eq_run(void * Instance, unsigned long SampleCount) static void param_eq_run(void * Instance, unsigned long SampleCount)
{ {
struct param_eq_impl *impl = Instance; struct param_eq_impl *impl = Instance;
float *in = impl->port[1]; const float *in[] = { impl->port[1] };
float *out = impl->port[0]; float *out[] = { impl->port[0] };
for (uint32_t i = 0; i < impl->n_bq; i++) { dsp_ops_biquadn_run(dsp_ops, impl->bq, impl->n_bq, out, in, 1, SampleCount);
dsp_ops_biquad_run(dsp_ops, &impl->bq[i], out, in, SampleCount);
in = out;
}
} }
static const struct fc_descriptor param_eq_desc = { static const struct fc_descriptor param_eq_desc = {

View file

@ -135,6 +135,24 @@ void dsp_biquad_run_c(struct dsp_ops *ops, struct biquad *bq,
#undef F #undef F
} }
void dsp_biquadn_run_c(struct dsp_ops *ops, struct biquad *bq, uint32_t n_bq,
float * SPA_RESTRICT out[], const float * SPA_RESTRICT in[],
uint32_t n_src, uint32_t n_samples)
{
uint32_t i, j;
const float *s;
float *d;
struct biquad *b = bq;
for (i = 0; i < n_src; i++, bq+=n_src) {
s = in[i];
d = out[i];
for (j = 0; j < n_bq; j++) {
dsp_biquad_run_c(ops, &b[j], d, s, n_samples);
s = d;
}
}
}
void dsp_sum_c(struct dsp_ops *ops, float * dst, void dsp_sum_c(struct dsp_ops *ops, float * dst,
const float * SPA_RESTRICT a, const float * SPA_RESTRICT b, uint32_t n_samples) const float * SPA_RESTRICT a, const float * SPA_RESTRICT b, uint32_t n_samples)
{ {

View file

@ -5,6 +5,7 @@
#include <string.h> #include <string.h>
#include <stdio.h> #include <stdio.h>
#include <math.h> #include <math.h>
#include <float.h>
#include <spa/utils/defs.h> #include <spa/utils/defs.h>
@ -120,3 +121,100 @@ void dsp_sum_sse(struct dsp_ops *ops, float *r, const float *a, const float *b,
_mm_store_ss(&r[n], in[0]); _mm_store_ss(&r[n], in[0]);
} }
} }
void dsp_biquad_run_sse(struct dsp_ops *ops, struct biquad *bq,
float *out, const float *in, uint32_t n_samples)
{
__m128 x, y, z;
__m128 b012;
__m128 a12;
__m128 x12;
uint32_t i;
b012 = _mm_setr_ps(bq->b0, bq->b1, bq->b2, 0.0f); /* b0 b1 b2 0 */
a12 = _mm_setr_ps(0.0f, bq->a1, bq->a2, 0.0f); /* 0 a1 a2 0 */
x12 = _mm_setr_ps(bq->x1, bq->x2, 0.0f, 0.0f); /* x1 x2 0 0 */
for (i = 0; i < n_samples; i++) {
x = _mm_load1_ps(&in[i]); /* x x x x */
z = _mm_mul_ps(x, b012); /* b0*x b1*x b2*x 0 */
z = _mm_add_ps(z, x12); /* b0*x+x1 b1*x+x2 b2*x 0 */
_mm_store_ss(&out[i], z); /* out[i] = b0*x+x1 */
y = _mm_shuffle_ps(z, z, _MM_SHUFFLE(0,0,0,0)); /* b0*x+x1 b0*x+x1 b0*x+x1 b0*x+x1 = y*/
y = _mm_mul_ps(y, a12); /* 0 a1*y a2*y 0 */
y = _mm_sub_ps(z, y); /* y x1 x2 0 */
x12 = _mm_shuffle_ps(y, y, _MM_SHUFFLE(3,3,2,1)); /* x1 x2 0 0*/
}
#define F(x) (-FLT_MIN < (x) && (x) < FLT_MIN ? 0.0f : (x))
bq->x1 = F(x12[0]);
bq->x2 = F(x12[1]);
#undef F
}
static void dsp_biquad_run2_sse(struct dsp_ops *ops, struct biquad *bq0, struct biquad *bq1,
float *out, const float *in, uint32_t n_samples)
{
__m128 x, y, z;
__m128 b0, b1;
__m128 a0, a1;
__m128 x0, x1;
uint32_t i;
b0 = _mm_setr_ps(bq0->b0, bq0->b1, bq0->b2, 0.0f); /* b0 b1 b2 0 */
a0 = _mm_setr_ps(0.0f, bq0->a1, bq0->a2, 0.0f); /* 0 a1 a2 0 */
x0 = _mm_setr_ps(bq0->x1, bq0->x2, 0.0f, 0.0f); /* x1 x2 0 0 */
b1 = _mm_setr_ps(bq1->b0, bq1->b1, bq1->b2, 0.0f); /* b0 b1 b2 0 */
a1 = _mm_setr_ps(0.0f, bq1->a1, bq1->a2, 0.0f); /* 0 a1 a2 0 */
x1 = _mm_setr_ps(bq1->x1, bq1->x2, 0.0f, 0.0f); /* x1 x2 0 0 */
for (i = 0; i < n_samples; i++) {
x = _mm_load1_ps(&in[i]); /* x x x x */
z = _mm_mul_ps(x, b0); /* b0*x b1*x b2*x 0 */
z = _mm_add_ps(z, x0); /* b0*x+x1 b1*x+x2 b2*x 0 */
y = _mm_shuffle_ps(z, z, _MM_SHUFFLE(0,0,0,0)); /* b0*x+x1 b0*x+x1 b0*x+x1 b0*x+x1 = y*/
x = _mm_mul_ps(y, a0); /* 0 a1*y a2*y 0 */
x = _mm_sub_ps(z, x); /* y x1 x2 0 */
x0 = _mm_shuffle_ps(x, x, _MM_SHUFFLE(3,3,2,1)); /* x1 x2 0 0*/
z = _mm_mul_ps(y, b1); /* b0*x b1*x b2*x 0 */
z = _mm_add_ps(z, x1); /* b0*x+x1 b1*x+x2 b2*x 0 */
x = _mm_shuffle_ps(z, z, _MM_SHUFFLE(0,0,0,0)); /* b0*x+x1 b0*x+x1 b0*x+x1 b0*x+x1 = y*/
y = _mm_mul_ps(x, a1); /* 0 a1*y a2*y 0 */
y = _mm_sub_ps(z, y); /* y x1 x2 0 */
x1 = _mm_shuffle_ps(y, y, _MM_SHUFFLE(3,3,2,1)); /* x1 x2 0 0*/
_mm_store_ss(&out[i], x); /* out[i] = b0*x+x1 */
}
#define F(x) (-FLT_MIN < (x) && (x) < FLT_MIN ? 0.0f : (x))
bq0->x1 = F(x0[0]);
bq0->x2 = F(x0[1]);
bq1->x1 = F(x1[0]);
bq1->x2 = F(x1[1]);
#undef F
}
void dsp_biquadn_run_sse(struct dsp_ops *ops, struct biquad *bq, uint32_t n_bq,
float * SPA_RESTRICT out[], const float * SPA_RESTRICT in[],
uint32_t n_src, uint32_t n_samples)
{
uint32_t i, j;
const float *s;
float *d;
uint32_t unrolled = n_bq & ~1;
struct biquad *b = bq;
for (i = 0; i < n_src; i++, b+=n_src) {
s = in[i];
d = out[i];
for (j = 0; j < unrolled; j+=2) {
dsp_biquad_run2_sse(ops, &b[j], &b[j+1], d, s, n_samples);
s = d;
}
for (; j < n_bq; j++) {
dsp_biquad_run_sse(ops, &b[j], d, s, n_samples);
s = d;
}
}
}

View file

@ -5,6 +5,7 @@
#include <string.h> #include <string.h>
#include <stdio.h> #include <stdio.h>
#include <math.h> #include <math.h>
#include <time.h>
#include <spa/support/cpu.h> #include <spa/support/cpu.h>
#include <spa/utils/defs.h> #include <spa/utils/defs.h>
@ -25,7 +26,7 @@ static struct dsp_info dsp_table[] =
.funcs.clear = dsp_clear_c, .funcs.clear = dsp_clear_c,
.funcs.copy = dsp_copy_c, .funcs.copy = dsp_copy_c,
.funcs.mix_gain = dsp_mix_gain_sse, .funcs.mix_gain = dsp_mix_gain_sse,
.funcs.biquad_run = dsp_biquad_run_c, .funcs.biquad_run = dsp_biquad_run_sse,
.funcs.sum = dsp_sum_avx, .funcs.sum = dsp_sum_avx,
.funcs.linear = dsp_linear_c, .funcs.linear = dsp_linear_c,
.funcs.mult = dsp_mult_c, .funcs.mult = dsp_mult_c,
@ -34,6 +35,7 @@ static struct dsp_info dsp_table[] =
.funcs.fft_run = dsp_fft_run_c, .funcs.fft_run = dsp_fft_run_c,
.funcs.fft_cmul = dsp_fft_cmul_c, .funcs.fft_cmul = dsp_fft_cmul_c,
.funcs.fft_cmuladd = dsp_fft_cmuladd_c, .funcs.fft_cmuladd = dsp_fft_cmuladd_c,
.funcs.biquadn_run = dsp_biquadn_run_sse,
}, },
#endif #endif
#if defined (HAVE_SSE) #if defined (HAVE_SSE)
@ -41,7 +43,7 @@ static struct dsp_info dsp_table[] =
.funcs.clear = dsp_clear_c, .funcs.clear = dsp_clear_c,
.funcs.copy = dsp_copy_c, .funcs.copy = dsp_copy_c,
.funcs.mix_gain = dsp_mix_gain_sse, .funcs.mix_gain = dsp_mix_gain_sse,
.funcs.biquad_run = dsp_biquad_run_c, .funcs.biquad_run = dsp_biquad_run_sse,
.funcs.sum = dsp_sum_sse, .funcs.sum = dsp_sum_sse,
.funcs.linear = dsp_linear_c, .funcs.linear = dsp_linear_c,
.funcs.mult = dsp_mult_c, .funcs.mult = dsp_mult_c,
@ -50,6 +52,7 @@ static struct dsp_info dsp_table[] =
.funcs.fft_run = dsp_fft_run_c, .funcs.fft_run = dsp_fft_run_c,
.funcs.fft_cmul = dsp_fft_cmul_c, .funcs.fft_cmul = dsp_fft_cmul_c,
.funcs.fft_cmuladd = dsp_fft_cmuladd_c, .funcs.fft_cmuladd = dsp_fft_cmuladd_c,
.funcs.biquadn_run = dsp_biquadn_run_sse,
}, },
#endif #endif
{ 0, { 0,
@ -65,6 +68,7 @@ static struct dsp_info dsp_table[] =
.funcs.fft_run = dsp_fft_run_c, .funcs.fft_run = dsp_fft_run_c,
.funcs.fft_cmul = dsp_fft_cmul_c, .funcs.fft_cmul = dsp_fft_cmul_c,
.funcs.fft_cmuladd = dsp_fft_cmuladd_c, .funcs.fft_cmuladd = dsp_fft_cmuladd_c,
.funcs.biquadn_run = dsp_biquadn_run_c,
}, },
}; };
@ -99,3 +103,43 @@ int dsp_ops_init(struct dsp_ops *ops, uint32_t cpu_flags)
return 0; return 0;
} }
int dsp_ops_benchmark(void)
{
struct dsp_ops ops[3];
uint32_t i;
struct biquad bq;
float in[2048], out[2048];
struct timespec ts;
uint64_t t1, t2, t3, t4;
dsp_ops_init(&ops[0], 0);
dsp_ops_init(&ops[1], SPA_CPU_FLAG_SSE);
dsp_ops_init(&ops[2], SPA_CPU_FLAG_AVX);
clock_gettime(CLOCK_MONOTONIC, &ts);
t1 = SPA_TIMESPEC_TO_NSEC(&ts);
for (i = 0; i < 8192; i++)
dsp_ops_biquad_run(&ops[0], &bq, out, in, 2048);
clock_gettime(CLOCK_MONOTONIC, &ts);
t2 = SPA_TIMESPEC_TO_NSEC(&ts);
for (i = 0; i < 8192; i++)
dsp_ops_biquad_run(&ops[1], &bq, out, in, 2048);
clock_gettime(CLOCK_MONOTONIC, &ts);
t3 = SPA_TIMESPEC_TO_NSEC(&ts);
for (i = 0; i < 8192; i++)
dsp_ops_biquad_run(&ops[2], &bq, out, in, 2048);
clock_gettime(CLOCK_MONOTONIC, &ts);
t4 = SPA_TIMESPEC_TO_NSEC(&ts);
fprintf(stderr, "%"PRIu64" %"PRIu64" %"PRIu64" speedup:%f\n",
t2-t1, t3-t2, t4-t3,
((double)(t2-t1))/(t3-t2));
return 0;
}

View file

@ -43,6 +43,9 @@ struct dsp_ops_funcs {
void (*mult) (struct dsp_ops *ops, void (*mult) (struct dsp_ops *ops,
void * SPA_RESTRICT dst, void * SPA_RESTRICT dst,
const void * SPA_RESTRICT src[], uint32_t n_src, uint32_t n_samples); const void * SPA_RESTRICT src[], uint32_t n_src, uint32_t n_samples);
void (*biquadn_run) (struct dsp_ops *ops, struct biquad *bq, uint32_t n_bq,
float * SPA_RESTRICT out[], const float * SPA_RESTRICT in[],
uint32_t n_src, uint32_t n_samples);
}; };
struct dsp_ops { struct dsp_ops {
@ -56,6 +59,7 @@ struct dsp_ops {
}; };
int dsp_ops_init(struct dsp_ops *ops, uint32_t cpu_flags); int dsp_ops_init(struct dsp_ops *ops, uint32_t cpu_flags);
int dsp_ops_benchmark(void);
#define dsp_ops_free(ops) (ops)->free(ops) #define dsp_ops_free(ops) (ops)->free(ops)
@ -66,6 +70,7 @@ int dsp_ops_init(struct dsp_ops *ops, uint32_t cpu_flags);
#define dsp_ops_sum(ops,...) (ops)->funcs.sum(ops, __VA_ARGS__) #define dsp_ops_sum(ops,...) (ops)->funcs.sum(ops, __VA_ARGS__)
#define dsp_ops_linear(ops,...) (ops)->funcs.linear(ops, __VA_ARGS__) #define dsp_ops_linear(ops,...) (ops)->funcs.linear(ops, __VA_ARGS__)
#define dsp_ops_mult(ops,...) (ops)->funcs.mult(ops, __VA_ARGS__) #define dsp_ops_mult(ops,...) (ops)->funcs.mult(ops, __VA_ARGS__)
#define dsp_ops_biquadn_run(ops,...) (ops)->funcs.biquadn_run(ops, __VA_ARGS__)
#define dsp_ops_fft_new(ops,...) (ops)->funcs.fft_new(ops, __VA_ARGS__) #define dsp_ops_fft_new(ops,...) (ops)->funcs.fft_new(ops, __VA_ARGS__)
#define dsp_ops_fft_free(ops,...) (ops)->funcs.fft_free(ops, __VA_ARGS__) #define dsp_ops_fft_free(ops,...) (ops)->funcs.fft_free(ops, __VA_ARGS__)
@ -93,6 +98,9 @@ void dsp_linear_##arch (struct dsp_ops *ops, float * SPA_RESTRICT dst, \
#define MAKE_MULT_FUNC(arch) \ #define MAKE_MULT_FUNC(arch) \
void dsp_mult_##arch(struct dsp_ops *ops, void * SPA_RESTRICT dst, \ void dsp_mult_##arch(struct dsp_ops *ops, void * SPA_RESTRICT dst, \
const void * SPA_RESTRICT src[], uint32_t n_src, uint32_t n_samples) const void * SPA_RESTRICT src[], uint32_t n_src, uint32_t n_samples)
#define MAKE_BIQUADN_RUN_FUNC(arch) \
void dsp_biquadn_run_##arch (struct dsp_ops *ops, struct biquad *bq, uint32_t n_bq, \
float * SPA_RESTRICT out[], const float * SPA_RESTRICT in[], uint32_t n_src, uint32_t n_samples)
#define MAKE_FFT_NEW_FUNC(arch) \ #define MAKE_FFT_NEW_FUNC(arch) \
void *dsp_fft_new_##arch(struct dsp_ops *ops, int32_t size, bool real) void *dsp_fft_new_##arch(struct dsp_ops *ops, int32_t size, bool real)
@ -111,6 +119,7 @@ void dsp_fft_cmuladd_##arch(struct dsp_ops *ops, void *fft, \
const float * SPA_RESTRICT a, const float * SPA_RESTRICT b, \ const float * SPA_RESTRICT a, const float * SPA_RESTRICT b, \
uint32_t len, const float scale) uint32_t len, const float scale)
MAKE_CLEAR_FUNC(c); MAKE_CLEAR_FUNC(c);
MAKE_COPY_FUNC(c); MAKE_COPY_FUNC(c);
MAKE_MIX_GAIN_FUNC(c); MAKE_MIX_GAIN_FUNC(c);
@ -118,6 +127,7 @@ MAKE_BIQUAD_RUN_FUNC(c);
MAKE_SUM_FUNC(c); MAKE_SUM_FUNC(c);
MAKE_LINEAR_FUNC(c); MAKE_LINEAR_FUNC(c);
MAKE_MULT_FUNC(c); MAKE_MULT_FUNC(c);
MAKE_BIQUADN_RUN_FUNC(c);
MAKE_FFT_NEW_FUNC(c); MAKE_FFT_NEW_FUNC(c);
MAKE_FFT_FREE_FUNC(c); MAKE_FFT_FREE_FUNC(c);
@ -128,6 +138,8 @@ MAKE_FFT_CMULADD_FUNC(c);
#if defined (HAVE_SSE) #if defined (HAVE_SSE)
MAKE_MIX_GAIN_FUNC(sse); MAKE_MIX_GAIN_FUNC(sse);
MAKE_SUM_FUNC(sse); MAKE_SUM_FUNC(sse);
MAKE_BIQUAD_RUN_FUNC(sse);
MAKE_BIQUADN_RUN_FUNC(sse);
#endif #endif
#if defined (HAVE_AVX) #if defined (HAVE_AVX)
MAKE_SUM_FUNC(avx); MAKE_SUM_FUNC(avx);