Mirror of https://gitlab.freedesktop.org/pipewire/pipewire.git
filter-chain: optimize biquads a little
Add SSE-optimized biquads. Add a new dsp function that runs multiple biquads over multiple channels. This makes it possible to unroll some operations and, later, to run the channels in parallel.
parent 673352893a
commit ac21541741
5 changed files with 177 additions and 8 deletions
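Not part of the commit, but useful context for reading the diffs below: a minimal scalar sketch of the per-sample recurrence that the dsp_biquad_run kernels appear to implement (a transposed direct form II biquad). The struct and helper here are hypothetical and mirror only the fields visible in the patch (b0, b1, b2, a1, a2 and the state x1, x2); the real struct biquad may carry more.

struct biquad_sketch { float b0, b1, b2, a1, a2, x1, x2; };

static inline float biquad_step(struct biquad_sketch *bq, float x)
{
	float y = bq->b0 * x + bq->x1;              /* out[i] = b0*x + x1          */
	bq->x1 = bq->b1 * x + bq->x2 - bq->a1 * y;  /* next x1 = b1*x + x2 - a1*y  */
	bq->x2 = bq->b2 * x - bq->a2 * y;           /* next x2 = b2*x - a2*y       */
	return y;
}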
@@ -1882,13 +1882,10 @@ static void param_eq_connect_port(void * Instance, unsigned long Port,
 static void param_eq_run(void * Instance, unsigned long SampleCount)
 {
 	struct param_eq_impl *impl = Instance;
-	float *in = impl->port[1];
-	float *out = impl->port[0];
+	const float *in[] = { impl->port[1] };
+	float *out[] = { impl->port[0] };

-	for (uint32_t i = 0; i < impl->n_bq; i++) {
-		dsp_ops_biquad_run(dsp_ops, &impl->bq[i], out, in, SampleCount);
-		in = out;
-	}
+	dsp_ops_biquadn_run(dsp_ops, impl->bq, impl->n_bq, out, in, 1, SampleCount);
 }

 static const struct fc_descriptor param_eq_desc = {

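For context (not in the patch): a hedged sketch of how the new entry point is meant to be called. param_eq above passes a single channel (n_src = 1); the wrapper, buffer names and the two-biquad cascade below are made up for illustration, and only the dsp_ops_biquadn_run signature comes from the diff.

/* Run a cascade of two biquads over one channel: bqs[0] filters input into
 * output, then bqs[1] filters output in place, like the old per-biquad loop. */
static void run_two_band_eq_sketch(struct dsp_ops *dsp, struct biquad *bqs,
		const float *input, float *output, uint32_t n_samples)
{
	const float *in[1] = { input };   /* one channel, so n_src = 1 */
	float *out[1] = { output };

	dsp_ops_biquadn_run(dsp, bqs, 2, out, in, 1, n_samples);
}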
@@ -135,6 +135,24 @@ void dsp_biquad_run_c(struct dsp_ops *ops, struct biquad *bq,
 #undef F
 }

+void dsp_biquadn_run_c(struct dsp_ops *ops, struct biquad *bq, uint32_t n_bq,
+		float * SPA_RESTRICT out[], const float * SPA_RESTRICT in[],
+		uint32_t n_src, uint32_t n_samples)
+{
+	uint32_t i, j;
+	const float *s;
+	float *d;
+	struct biquad *b = bq;
+	for (i = 0; i < n_src; i++, bq+=n_src) {
+		s = in[i];
+		d = out[i];
+		for (j = 0; j < n_bq; j++) {
+			dsp_biquad_run_c(ops, &b[j], d, s, n_samples);
+			s = d;
+		}
+	}
+}
+
 void dsp_sum_c(struct dsp_ops *ops, float * dst,
 		const float * SPA_RESTRICT a, const float * SPA_RESTRICT b, uint32_t n_samples)
 {

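A note on the F() macro visible at the top of this hunk and in the SSE code below (not part of the patch): it flushes filter state with magnitude below FLT_MIN to exactly 0.0f, which keeps the recursive state from decaying into denormal values that are very slow to process on x86. A standalone sketch of the same check:

#include <float.h>

/* Equivalent of the F() macro wrapped around the biquad state stores. */
static inline float flush_denormal(float x)
{
	return (-FLT_MIN < x && x < FLT_MIN) ? 0.0f : x;
}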
@@ -5,6 +5,7 @@
 #include <string.h>
 #include <stdio.h>
 #include <math.h>
+#include <float.h>

 #include <spa/utils/defs.h>

@@ -120,3 +121,100 @@ void dsp_sum_sse(struct dsp_ops *ops, float *r, const float *a, const float *b,
 		_mm_store_ss(&r[n], in[0]);
 	}
 }
+
+void dsp_biquad_run_sse(struct dsp_ops *ops, struct biquad *bq,
+		float *out, const float *in, uint32_t n_samples)
+{
+	__m128 x, y, z;
+	__m128 b012;
+	__m128 a12;
+	__m128 x12;
+	uint32_t i;
+
+	b012 = _mm_setr_ps(bq->b0, bq->b1, bq->b2, 0.0f);	/* b0 b1 b2 0 */
+	a12 = _mm_setr_ps(0.0f, bq->a1, bq->a2, 0.0f);		/* 0 a1 a2 0 */
+	x12 = _mm_setr_ps(bq->x1, bq->x2, 0.0f, 0.0f);		/* x1 x2 0 0 */
+
+	for (i = 0; i < n_samples; i++) {
+		x = _mm_load1_ps(&in[i]);			/* x x x x */
+		z = _mm_mul_ps(x, b012);			/* b0*x b1*x b2*x 0 */
+		z = _mm_add_ps(z, x12);				/* b0*x+x1 b1*x+x2 b2*x 0 */
+		_mm_store_ss(&out[i], z);			/* out[i] = b0*x+x1 */
+		y = _mm_shuffle_ps(z, z, _MM_SHUFFLE(0,0,0,0));	/* b0*x+x1 b0*x+x1 b0*x+x1 b0*x+x1 = y */
+		y = _mm_mul_ps(y, a12);				/* 0 a1*y a2*y 0 */
+		y = _mm_sub_ps(z, y);				/* y x1 x2 0 */
+		x12 = _mm_shuffle_ps(y, y, _MM_SHUFFLE(3,3,2,1)); /* x1 x2 0 0 */
+	}
+#define F(x) (-FLT_MIN < (x) && (x) < FLT_MIN ? 0.0f : (x))
+	bq->x1 = F(x12[0]);
+	bq->x2 = F(x12[1]);
+#undef F
+}
+
+static void dsp_biquad_run2_sse(struct dsp_ops *ops, struct biquad *bq0, struct biquad *bq1,
+		float *out, const float *in, uint32_t n_samples)
+{
+	__m128 x, y, z;
+	__m128 b0, b1;
+	__m128 a0, a1;
+	__m128 x0, x1;
+	uint32_t i;
+
+	b0 = _mm_setr_ps(bq0->b0, bq0->b1, bq0->b2, 0.0f);	/* b0 b1 b2 0 */
+	a0 = _mm_setr_ps(0.0f, bq0->a1, bq0->a2, 0.0f);		/* 0 a1 a2 0 */
+	x0 = _mm_setr_ps(bq0->x1, bq0->x2, 0.0f, 0.0f);		/* x1 x2 0 0 */
+
+	b1 = _mm_setr_ps(bq1->b0, bq1->b1, bq1->b2, 0.0f);	/* b0 b1 b2 0 */
+	a1 = _mm_setr_ps(0.0f, bq1->a1, bq1->a2, 0.0f);		/* 0 a1 a2 0 */
+	x1 = _mm_setr_ps(bq1->x1, bq1->x2, 0.0f, 0.0f);		/* x1 x2 0 0 */
+
+	for (i = 0; i < n_samples; i++) {
+		x = _mm_load1_ps(&in[i]);			/* x x x x */
+
+		z = _mm_mul_ps(x, b0);				/* b0*x b1*x b2*x 0 */
+		z = _mm_add_ps(z, x0);				/* b0*x+x1 b1*x+x2 b2*x 0 */
+		y = _mm_shuffle_ps(z, z, _MM_SHUFFLE(0,0,0,0));	/* b0*x+x1 b0*x+x1 b0*x+x1 b0*x+x1 = y */
+		x = _mm_mul_ps(y, a0);				/* 0 a1*y a2*y 0 */
+		x = _mm_sub_ps(z, x);				/* y x1 x2 0 */
+		x0 = _mm_shuffle_ps(x, x, _MM_SHUFFLE(3,3,2,1)); /* x1 x2 0 0 */
+
+		z = _mm_mul_ps(y, b1);				/* b0*x b1*x b2*x 0 */
+		z = _mm_add_ps(z, x1);				/* b0*x+x1 b1*x+x2 b2*x 0 */
+		x = _mm_shuffle_ps(z, z, _MM_SHUFFLE(0,0,0,0));	/* b0*x+x1 b0*x+x1 b0*x+x1 b0*x+x1 = y */
+		y = _mm_mul_ps(x, a1);				/* 0 a1*y a2*y 0 */
+		y = _mm_sub_ps(z, y);				/* y x1 x2 0 */
+		x1 = _mm_shuffle_ps(y, y, _MM_SHUFFLE(3,3,2,1)); /* x1 x2 0 0 */
+
+		_mm_store_ss(&out[i], x);			/* out[i] = b0*x+x1 */
+	}
+#define F(x) (-FLT_MIN < (x) && (x) < FLT_MIN ? 0.0f : (x))
+	bq0->x1 = F(x0[0]);
+	bq0->x2 = F(x0[1]);
+	bq1->x1 = F(x1[0]);
+	bq1->x2 = F(x1[1]);
+#undef F
+}
+
+void dsp_biquadn_run_sse(struct dsp_ops *ops, struct biquad *bq, uint32_t n_bq,
+		float * SPA_RESTRICT out[], const float * SPA_RESTRICT in[],
+		uint32_t n_src, uint32_t n_samples)
+{
+	uint32_t i, j;
+	const float *s;
+	float *d;
+	uint32_t unrolled = n_bq & ~1;
+	struct biquad *b = bq;
+
+	for (i = 0; i < n_src; i++, b+=n_src) {
+		s = in[i];
+		d = out[i];
+		for (j = 0; j < unrolled; j+=2) {
+			dsp_biquad_run2_sse(ops, &b[j], &b[j+1], d, s, n_samples);
+			s = d;
+		}
+		for (; j < n_bq; j++) {
+			dsp_biquad_run_sse(ops, &b[j], d, s, n_samples);
+			s = d;
+		}
+	}
+}

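For readers following the intrinsics (not part of the patch): dsp_biquad_run2_sse keeps two biquads' coefficients and state in separate registers and evaluates them back to back inside one loop, so the second stage consumes the first stage's output within the same iteration and only the final result is stored. A scalar sketch of the same pairing, using the hypothetical biquad_step() helper from earlier:

static void biquad_run2_sketch(struct biquad_sketch *bq0, struct biquad_sketch *bq1,
		float *out, const float *in, uint32_t n_samples)
{
	uint32_t i;
	/* Two cascaded biquads per sample; dsp_biquadn_run_sse pairs stages
	 * like this and falls back to a single-stage tail when n_bq is odd. */
	for (i = 0; i < n_samples; i++)
		out[i] = biquad_step(bq1, biquad_step(bq0, in[i]));
}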
@@ -5,6 +5,7 @@
 #include <string.h>
 #include <stdio.h>
 #include <math.h>
+#include <time.h>

 #include <spa/support/cpu.h>
 #include <spa/utils/defs.h>

@@ -25,7 +26,7 @@ static struct dsp_info dsp_table[] =
 	.funcs.clear = dsp_clear_c,
 	.funcs.copy = dsp_copy_c,
 	.funcs.mix_gain = dsp_mix_gain_sse,
-	.funcs.biquad_run = dsp_biquad_run_c,
+	.funcs.biquad_run = dsp_biquad_run_sse,
 	.funcs.sum = dsp_sum_avx,
 	.funcs.linear = dsp_linear_c,
 	.funcs.mult = dsp_mult_c,

@@ -34,6 +35,7 @@ static struct dsp_info dsp_table[] =
 	.funcs.fft_run = dsp_fft_run_c,
 	.funcs.fft_cmul = dsp_fft_cmul_c,
 	.funcs.fft_cmuladd = dsp_fft_cmuladd_c,
+	.funcs.biquadn_run = dsp_biquadn_run_sse,
 	},
 #endif
 #if defined (HAVE_SSE)

@@ -41,7 +43,7 @@ static struct dsp_info dsp_table[] =
 	.funcs.clear = dsp_clear_c,
 	.funcs.copy = dsp_copy_c,
 	.funcs.mix_gain = dsp_mix_gain_sse,
-	.funcs.biquad_run = dsp_biquad_run_c,
+	.funcs.biquad_run = dsp_biquad_run_sse,
 	.funcs.sum = dsp_sum_sse,
 	.funcs.linear = dsp_linear_c,
 	.funcs.mult = dsp_mult_c,

@@ -50,6 +52,7 @@ static struct dsp_info dsp_table[] =
 	.funcs.fft_run = dsp_fft_run_c,
 	.funcs.fft_cmul = dsp_fft_cmul_c,
 	.funcs.fft_cmuladd = dsp_fft_cmuladd_c,
+	.funcs.biquadn_run = dsp_biquadn_run_sse,
 	},
 #endif
 	{ 0,

@@ -65,6 +68,7 @@ static struct dsp_info dsp_table[] =
 	.funcs.fft_run = dsp_fft_run_c,
 	.funcs.fft_cmul = dsp_fft_cmul_c,
 	.funcs.fft_cmuladd = dsp_fft_cmuladd_c,
+	.funcs.biquadn_run = dsp_biquadn_run_c,
 	},
 };

@@ -99,3 +103,43 @@ int dsp_ops_init(struct dsp_ops *ops, uint32_t cpu_flags)

 	return 0;
 }
+
+int dsp_ops_benchmark(void)
+{
+	struct dsp_ops ops[3];
+	uint32_t i;
+	struct biquad bq;
+	float in[2048], out[2048];
+	struct timespec ts;
+	uint64_t t1, t2, t3, t4;
+
+	dsp_ops_init(&ops[0], 0);
+	dsp_ops_init(&ops[1], SPA_CPU_FLAG_SSE);
+	dsp_ops_init(&ops[2], SPA_CPU_FLAG_AVX);
+
+	clock_gettime(CLOCK_MONOTONIC, &ts);
+	t1 = SPA_TIMESPEC_TO_NSEC(&ts);
+
+	for (i = 0; i < 8192; i++)
+		dsp_ops_biquad_run(&ops[0], &bq, out, in, 2048);
+
+	clock_gettime(CLOCK_MONOTONIC, &ts);
+	t2 = SPA_TIMESPEC_TO_NSEC(&ts);
+
+	for (i = 0; i < 8192; i++)
+		dsp_ops_biquad_run(&ops[1], &bq, out, in, 2048);
+
+	clock_gettime(CLOCK_MONOTONIC, &ts);
+	t3 = SPA_TIMESPEC_TO_NSEC(&ts);
+
+	for (i = 0; i < 8192; i++)
+		dsp_ops_biquad_run(&ops[2], &bq, out, in, 2048);
+
+	clock_gettime(CLOCK_MONOTONIC, &ts);
+	t4 = SPA_TIMESPEC_TO_NSEC(&ts);
+
+	fprintf(stderr, "%"PRIu64" %"PRIu64" %"PRIu64" speedup:%f\n",
+			t2-t1, t3-t2, t4-t3,
+			((double)(t2-t1))/(t3-t2));
+	return 0;
+}

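The printed speedup is the plain-C time divided by the SSE time, i.e. (t2-t1)/(t3-t2). With made-up numbers purely for illustration: if the C pass took 400 ms and the SSE pass 100 ms, the line would end in speedup:4.000000.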
@@ -43,6 +43,9 @@ struct dsp_ops_funcs {
 	void (*mult) (struct dsp_ops *ops,
 			void * SPA_RESTRICT dst,
 			const void * SPA_RESTRICT src[], uint32_t n_src, uint32_t n_samples);
+	void (*biquadn_run) (struct dsp_ops *ops, struct biquad *bq, uint32_t n_bq,
+			float * SPA_RESTRICT out[], const float * SPA_RESTRICT in[],
+			uint32_t n_src, uint32_t n_samples);
 };

 struct dsp_ops {

@@ -56,6 +59,7 @@ struct dsp_ops {
 };

 int dsp_ops_init(struct dsp_ops *ops, uint32_t cpu_flags);
+int dsp_ops_benchmark(void);

 #define dsp_ops_free(ops)	(ops)->free(ops)

@@ -66,6 +70,7 @@ int dsp_ops_init(struct dsp_ops *ops, uint32_t cpu_flags);
 #define dsp_ops_sum(ops,...)	(ops)->funcs.sum(ops, __VA_ARGS__)
 #define dsp_ops_linear(ops,...)	(ops)->funcs.linear(ops, __VA_ARGS__)
 #define dsp_ops_mult(ops,...)	(ops)->funcs.mult(ops, __VA_ARGS__)
+#define dsp_ops_biquadn_run(ops,...)	(ops)->funcs.biquadn_run(ops, __VA_ARGS__)

 #define dsp_ops_fft_new(ops,...)	(ops)->funcs.fft_new(ops, __VA_ARGS__)
 #define dsp_ops_fft_free(ops,...)	(ops)->funcs.fft_free(ops, __VA_ARGS__)

@@ -93,6 +98,9 @@ void dsp_linear_##arch (struct dsp_ops *ops, float * SPA_RESTRICT dst, \
 #define MAKE_MULT_FUNC(arch) \
 void dsp_mult_##arch(struct dsp_ops *ops, void * SPA_RESTRICT dst, \
 		const void * SPA_RESTRICT src[], uint32_t n_src, uint32_t n_samples)
+#define MAKE_BIQUADN_RUN_FUNC(arch) \
+void dsp_biquadn_run_##arch (struct dsp_ops *ops, struct biquad *bq, uint32_t n_bq, \
+		float * SPA_RESTRICT out[], const float * SPA_RESTRICT in[], uint32_t n_src, uint32_t n_samples)

 #define MAKE_FFT_NEW_FUNC(arch) \
 void *dsp_fft_new_##arch(struct dsp_ops *ops, int32_t size, bool real)

@@ -111,6 +119,7 @@ void dsp_fft_cmuladd_##arch(struct dsp_ops *ops, void *fft, \
 		const float * SPA_RESTRICT a, const float * SPA_RESTRICT b, \
 		uint32_t len, const float scale)


 MAKE_CLEAR_FUNC(c);
 MAKE_COPY_FUNC(c);
 MAKE_MIX_GAIN_FUNC(c);

@@ -118,6 +127,7 @@ MAKE_BIQUAD_RUN_FUNC(c);
 MAKE_SUM_FUNC(c);
 MAKE_LINEAR_FUNC(c);
 MAKE_MULT_FUNC(c);
+MAKE_BIQUADN_RUN_FUNC(c);

 MAKE_FFT_NEW_FUNC(c);
 MAKE_FFT_FREE_FUNC(c);

@@ -128,6 +138,8 @@ MAKE_FFT_CMULADD_FUNC(c);
 #if defined (HAVE_SSE)
 MAKE_MIX_GAIN_FUNC(sse);
 MAKE_SUM_FUNC(sse);
+MAKE_BIQUAD_RUN_FUNC(sse);
+MAKE_BIQUADN_RUN_FUNC(sse);
 #endif
 #if defined (HAVE_AVX)
 MAKE_SUM_FUNC(avx);

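Not in the patch: a hedged sketch of how another architecture would plug into the same pattern, following the c and sse wiring above. An AVX biquadn kernel is hypothetical here; the commit only provides the C and SSE versions.

/* In the header, next to the c and sse prototypes: */
MAKE_BIQUADN_RUN_FUNC(avx);	/* expands to the dsp_biquadn_run_avx prototype */

/* In the AVX source file, a stub with the expanded signature; a real version
 * would mirror dsp_biquadn_run_sse using 256-bit registers. */
void dsp_biquadn_run_avx(struct dsp_ops *ops, struct biquad *bq, uint32_t n_bq,
		float * SPA_RESTRICT out[], const float * SPA_RESTRICT in[],
		uint32_t n_src, uint32_t n_samples)
{
	dsp_biquadn_run_c(ops, bq, n_bq, out, in, n_src, n_samples);	/* fallback */
}

It would then be selected by setting .funcs.biquadn_run = dsp_biquadn_run_avx, in the dsp_table entry guarded by HAVE_AVX.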