filter-chain: optimize delay function

This commit is contained in:
Wim Taymans 2024-10-16 16:16:03 +02:00
parent d8bd84183d
commit 5e87f1d4f4
5 changed files with 69 additions and 18 deletions

View file

@ -1150,10 +1150,10 @@ static void *delay_instantiate(const struct fc_descriptor * Descriptor,
return NULL;
impl->rate = SampleRate;
impl->buffer_samples = (uint32_t)(max_delay * impl->rate);
impl->buffer_samples = SPA_ROUND_UP_N((uint32_t)(max_delay * impl->rate), 64);
pw_log_info("max-delay:%f seconds rate:%lu samples:%d", max_delay, impl->rate, impl->buffer_samples);
impl->buffer = calloc(impl->buffer_samples, sizeof(float));
impl->buffer = calloc(impl->buffer_samples * 2 + 64, sizeof(float));
if (impl->buffer == NULL) {
delay_cleanup(impl);
return NULL;
@ -1175,27 +1175,13 @@ static void delay_run(void * Instance, unsigned long SampleCount)
struct delay_impl *impl = Instance;
float *in = impl->port[1], *out = impl->port[0];
float delay = impl->port[2][0];
unsigned long n;
uint32_t r, w;
if (delay != impl->delay) {
impl->delay_samples = SPA_CLAMP((uint32_t)(delay * impl->rate), 0u, impl->buffer_samples-1);
impl->delay = delay;
}
r = impl->ptr;
w = impl->ptr + impl->delay_samples;
if (w >= impl->buffer_samples)
w -= impl->buffer_samples;
for (n = 0; n < SampleCount; n++) {
impl->buffer[w] = in[n];
out[n] = impl->buffer[r];
if (++r >= impl->buffer_samples)
r = 0;
if (++w >= impl->buffer_samples)
w = 0;
}
impl->ptr = r;
dsp_ops_delay(dsp_ops, impl->buffer, &impl->ptr, impl->buffer_samples,
impl->delay_samples, out, in, SampleCount);
}
static struct fc_port delay_ports[] = {

View file

@ -188,6 +188,27 @@ void dsp_linear_c(struct dsp_ops *ops, float * dst,
}
}
/**
 * Run a block of samples through a fixed delay line (portable C version).
 *
 * Every sample written at index w is also written at w + n_buffer, so the
 * delayed sample can be read at w + (n_buffer - delay) without wrapping
 * the read index.
 *
 * ops:       dsp_ops instance, only forwarded to dsp_copy_c()
 * buffer:    delay storage; indices up to 2 * n_buffer - 1 are accessed
 * pos:       in/out write index into the ring, kept in [0, n_buffer)
 * n_buffer:  ring length in samples
 * delay:     delay in samples; must not exceed n_buffer or the read offset
 *            wraps around. 0 copies src to dst and leaves buffer and *pos
 *            untouched.
 * dst:       receives n_samples delayed samples
 * src:       n_samples input samples
 * n_samples: number of samples to process
 */
void dsp_delay_c(struct dsp_ops *ops, float *buffer, uint32_t *pos, uint32_t n_buffer,
		uint32_t delay, float *dst, const float *src, uint32_t n_samples)
{
	if (delay == 0) {
		dsp_copy_c(ops, dst, src, n_samples);
	} else {
		uint32_t w, o, i;

		w = *pos;
		o = n_buffer - delay;

		for (i = 0; i < n_samples; i++) {
			/* keep both halves in sync so buffer[w + o] is always valid */
			buffer[w] = buffer[w + n_buffer] = src[i];
			dst[i] = buffer[w + o];
			/* wrap as soon as the next index would reach n_buffer;
			 * using '>' here would let w hit n_buffer itself and
			 * stretch the ring by one slot */
			w = w + 1 >= n_buffer ? 0 : w + 1;
		}
		*pos = w;
	}
}
void *dsp_fft_new_c(struct dsp_ops *ops, int32_t size, bool real)
{
return pffft_new_setup(size, real ? PFFFT_REAL : PFFFT_COMPLEX);

View file

@ -500,3 +500,36 @@ void dsp_biquadn_run_sse(struct dsp_ops *ops, struct biquad *bq, uint32_t n_bq,
}
}
}
/**
 * Run a block of samples through a fixed delay line (SSE version).
 *
 * Every sample written at index w is also written at w + n_buffer, so the
 * delayed sample can be read at w + (n_buffer - delay) without wrapping
 * the read index.
 *
 * ops:       dsp_ops instance (not used here)
 * buffer:    delay storage; indices up to 2 * n_buffer - 1 are accessed
 * pos:       in/out write index into the ring, kept in [0, n_buffer)
 * n_buffer:  ring length in samples
 * delay:     delay in samples; must not exceed n_buffer or the read offset
 *            wraps around
 * dst:       receives n_samples delayed samples
 * src:       n_samples input samples
 * n_samples: number of samples to process
 *
 * Four samples at a time are processed when src and dst are both 16-byte
 * aligned; everything else goes through the scalar loop.
 */
void dsp_delay_sse(struct dsp_ops *ops, float *buffer, uint32_t *pos, uint32_t n_buffer, uint32_t delay,
		float *dst, const float *src, uint32_t n_samples)
{
	__m128 t[1];
	uint32_t w = *pos;
	uint32_t o = n_buffer - delay;
	uint32_t n, unrolled;

	if (SPA_IS_ALIGNED(src, 16) &&
	    SPA_IS_ALIGNED(dst, 16))
		unrolled = n_samples & ~3;
	else
		unrolled = 0;

	/* Only take the 4-wide path while the whole vector fits before the end
	 * of the ring. The scalar loop advances w by one, so w is not always a
	 * multiple of 4; a vector store straddling n_buffer would put the
	 * overflowing samples only in the mirror half and then reset w to 0,
	 * dropping them. When that would happen, the scalar loop below
	 * finishes the block instead. */
	for (n = 0; n < unrolled && w + 4 <= n_buffer; n += 4) {
		t[0] = _mm_load_ps(&src[n]);
		/* buffer is indexed by w, which has no alignment guarantee */
		_mm_storeu_ps(&buffer[w], t[0]);
		_mm_storeu_ps(&buffer[w+n_buffer], t[0]);
		t[0] = _mm_loadu_ps(&buffer[w+o]);
		_mm_store_ps(&dst[n], t[0]);
		w = w + 4 >= n_buffer ? 0 : w + 4;
	}
	for (; n < n_samples; n++) {
		t[0] = _mm_load_ss(&src[n]);
		_mm_store_ss(&buffer[w], t[0]);
		_mm_store_ss(&buffer[w+n_buffer], t[0]);
		t[0] = _mm_load_ss(&buffer[w+o]);
		_mm_store_ss(&dst[n], t[0]);
		w = w + 1 >= n_buffer ? 0 : w + 1;
	}
	*pos = w;
}

View file

@ -36,6 +36,7 @@ static struct dsp_info dsp_table[] =
.funcs.fft_cmul = dsp_fft_cmul_c,
.funcs.fft_cmuladd = dsp_fft_cmuladd_c,
.funcs.biquadn_run = dsp_biquadn_run_sse,
.funcs.delay = dsp_delay_sse,
},
#endif
#if defined (HAVE_SSE)
@ -53,6 +54,7 @@ static struct dsp_info dsp_table[] =
.funcs.fft_cmul = dsp_fft_cmul_c,
.funcs.fft_cmuladd = dsp_fft_cmuladd_c,
.funcs.biquadn_run = dsp_biquadn_run_sse,
.funcs.delay = dsp_delay_sse,
},
#endif
{ 0,
@ -69,6 +71,7 @@ static struct dsp_info dsp_table[] =
.funcs.fft_cmul = dsp_fft_cmul_c,
.funcs.fft_cmuladd = dsp_fft_cmuladd_c,
.funcs.biquadn_run = dsp_biquadn_run_c,
.funcs.delay = dsp_delay_c,
},
};

View file

@ -46,6 +46,8 @@ struct dsp_ops_funcs {
void (*biquadn_run) (struct dsp_ops *ops, struct biquad *bq, uint32_t n_bq, uint32_t bq_stride,
float * SPA_RESTRICT out[], const float * SPA_RESTRICT in[],
uint32_t n_src, uint32_t n_samples);
void (*delay) (struct dsp_ops *ops, float *buffer, uint32_t *pos, uint32_t n_buffer, uint32_t delay,
float *dst, const float *src, uint32_t n_samples);
};
struct dsp_ops {
@ -71,6 +73,7 @@ int dsp_ops_benchmark(void);
/* Dispatch helpers: each macro calls the function pointer of the same name
 * in (ops)->funcs, passing ops itself as the first argument followed by the
 * caller's arguments. */
#define dsp_ops_linear(ops,...) (ops)->funcs.linear(ops, __VA_ARGS__)
#define dsp_ops_mult(ops,...) (ops)->funcs.mult(ops, __VA_ARGS__)
#define dsp_ops_biquadn_run(ops,...) (ops)->funcs.biquadn_run(ops, __VA_ARGS__)
#define dsp_ops_delay(ops,...) (ops)->funcs.delay(ops, __VA_ARGS__)
#define dsp_ops_fft_new(ops,...) (ops)->funcs.fft_new(ops, __VA_ARGS__)
#define dsp_ops_fft_free(ops,...) (ops)->funcs.fft_free(ops, __VA_ARGS__)
@ -101,6 +104,9 @@ void dsp_mult_##arch(struct dsp_ops *ops, void * SPA_RESTRICT dst, \
#define MAKE_BIQUADN_RUN_FUNC(arch) \
void dsp_biquadn_run_##arch (struct dsp_ops *ops, struct biquad *bq, uint32_t n_bq, uint32_t bq_stride, \
float * SPA_RESTRICT out[], const float * SPA_RESTRICT in[], uint32_t n_src, uint32_t n_samples)
/* Expands to the prototype of the delay implementation for one backend,
 * dsp_delay_<arch>(). The parameter list matches the `delay` member of
 * struct dsp_ops_funcs so the function can be stored there directly. */
#define MAKE_DELAY_FUNC(arch) \
void dsp_delay_##arch (struct dsp_ops *ops, float *buffer, uint32_t *pos, uint32_t n_buffer, \
uint32_t delay, float *dst, const float *src, uint32_t n_samples)
#define MAKE_FFT_NEW_FUNC(arch) \
void *dsp_fft_new_##arch(struct dsp_ops *ops, int32_t size, bool real)
@ -128,6 +134,7 @@ MAKE_SUM_FUNC(c);
MAKE_LINEAR_FUNC(c);
MAKE_MULT_FUNC(c);
MAKE_BIQUADN_RUN_FUNC(c);
MAKE_DELAY_FUNC(c);
MAKE_FFT_NEW_FUNC(c);
MAKE_FFT_FREE_FUNC(c);
@ -140,6 +147,7 @@ MAKE_MIX_GAIN_FUNC(sse);
MAKE_SUM_FUNC(sse);
MAKE_BIQUAD_RUN_FUNC(sse);
MAKE_BIQUADN_RUN_FUNC(sse);
MAKE_DELAY_FUNC(sse);
#endif
#if defined (HAVE_AVX)
MAKE_SUM_FUNC(avx);