diff --git a/src/modules/module-filter-chain/builtin_plugin.c b/src/modules/module-filter-chain/builtin_plugin.c index ba76f3267..eb1159a2b 100644 --- a/src/modules/module-filter-chain/builtin_plugin.c +++ b/src/modules/module-filter-chain/builtin_plugin.c @@ -1150,10 +1150,10 @@ static void *delay_instantiate(const struct fc_descriptor * Descriptor, return NULL; impl->rate = SampleRate; - impl->buffer_samples = (uint32_t)(max_delay * impl->rate); + impl->buffer_samples = SPA_ROUND_UP_N((uint32_t)(max_delay * impl->rate), 64); pw_log_info("max-delay:%f seconds rate:%lu samples:%d", max_delay, impl->rate, impl->buffer_samples); - impl->buffer = calloc(impl->buffer_samples, sizeof(float)); + impl->buffer = calloc(impl->buffer_samples * 2 + 64, sizeof(float)); /* twice the ring size because the delay kernels mirror every sample at buffer[w + n_buffer], plus 64 floats of slack past the mirror */ if (impl->buffer == NULL) { delay_cleanup(impl); return NULL; @@ -1175,27 +1175,13 @@ static void delay_run(void * Instance, unsigned long SampleCount) struct delay_impl *impl = Instance; float *in = impl->port[1], *out = impl->port[0]; float delay = impl->port[2][0]; - unsigned long n; - uint32_t r, w; if (delay != impl->delay) { impl->delay_samples = SPA_CLAMP((uint32_t)(delay * impl->rate), 0u, impl->buffer_samples-1); impl->delay = delay; } - r = impl->ptr; - w = impl->ptr + impl->delay_samples; - if (w >= impl->buffer_samples) - w -= impl->buffer_samples; - - for (n = 0; n < SampleCount; n++) { - impl->buffer[w] = in[n]; - out[n] = impl->buffer[r]; - if (++r >= impl->buffer_samples) - r = 0; - if (++w >= impl->buffer_samples) - w = 0; - } - impl->ptr = r; + dsp_ops_delay(dsp_ops, impl->buffer, &impl->ptr, impl->buffer_samples, + impl->delay_samples, out, in, SampleCount); } static struct fc_port delay_ports[] = { diff --git a/src/modules/module-filter-chain/dsp-ops-c.c b/src/modules/module-filter-chain/dsp-ops-c.c index 86dd9c779..04d356eef 100644 --- a/src/modules/module-filter-chain/dsp-ops-c.c +++ b/src/modules/module-filter-chain/dsp-ops-c.c @@ -188,6 +188,27 @@ void dsp_linear_c(struct dsp_ops *ops, 
float * dst, } } + +void dsp_delay_c(struct dsp_ops *ops, float *buffer, uint32_t *pos, uint32_t n_buffer, + uint32_t delay, float *dst, const float *src, uint32_t n_samples) +{ /* Delay line: each input sample is stored at buffer[w] and mirrored at buffer[w + n_buffer], so the delayed sample is read at buffer[w + n_buffer - delay] without wrapping the read index. The write index is loaded from and stored back to the pos argument. A delay of 0 is a plain copy. */ + if (delay == 0) { + dsp_copy_c(ops, dst, src, n_samples); + } else { + uint32_t w, o, i; + + w = *pos; + o = n_buffer - delay; + + for (i = 0; i < n_samples; i++) { + buffer[w] = buffer[w + n_buffer] = src[i]; + dst[i] = buffer[w + o]; + w = w + 1 >= n_buffer ? 0 : w + 1; /* wrap at n_buffer so w stays in [0, n_buffer), matching dsp_delay_sse */ + } + *pos = w; + } +} + void *dsp_fft_new_c(struct dsp_ops *ops, int32_t size, bool real) { return pffft_new_setup(size, real ? PFFFT_REAL : PFFFT_COMPLEX); diff --git a/src/modules/module-filter-chain/dsp-ops-sse.c b/src/modules/module-filter-chain/dsp-ops-sse.c index 71abc6f26..a5c346ae0 100644 --- a/src/modules/module-filter-chain/dsp-ops-sse.c +++ b/src/modules/module-filter-chain/dsp-ops-sse.c @@ -500,3 +500,36 @@ void dsp_biquadn_run_sse(struct dsp_ops *ops, struct biquad *bq, uint32_t n_bq, } } } + +void dsp_delay_sse(struct dsp_ops *ops, float *buffer, uint32_t *pos, uint32_t n_buffer, uint32_t delay, + float *dst, const float *src, uint32_t n_samples) +{ /* SSE variant of the mirrored delay line: samples are written at buffer[w] and buffer[w + n_buffer] and read back at buffer[w + n_buffer - delay]. Four samples are handled per iteration when the vector path is usable, the remainder one at a time. */ + __m128 t[1]; + uint32_t w = *pos; + uint32_t o = n_buffer - delay; + uint32_t n, unrolled; + + if (SPA_IS_ALIGNED(src, 16) && + SPA_IS_ALIGNED(dst, 16) && (w & 3) == 0 && (n_buffer & 3) == 0) /* the w + 4 wrap below is only exact when w and n_buffer are multiples of 4; otherwise it would skip ring slots and drop samples, so fall back to the scalar loop */ + unrolled = n_samples & ~3; + else + unrolled = 0; + + for(n = 0; n < unrolled; n += 4) { + t[0] = _mm_load_ps(&src[n]); + _mm_storeu_ps(&buffer[w], t[0]); + _mm_storeu_ps(&buffer[w+n_buffer], t[0]); + t[0] = _mm_loadu_ps(&buffer[w+o]); + _mm_store_ps(&dst[n], t[0]); + w = w + 4 >= n_buffer ? 0 : w + 4; + } + for(; n < n_samples; n++) { + t[0] = _mm_load_ss(&src[n]); + _mm_store_ss(&buffer[w], t[0]); + _mm_store_ss(&buffer[w+n_buffer], t[0]); + t[0] = _mm_load_ss(&buffer[w+o]); + _mm_store_ss(&dst[n], t[0]); + w = w + 1 >= n_buffer ? 
0 : w + 1; + } + *pos = w; +} diff --git a/src/modules/module-filter-chain/dsp-ops.c b/src/modules/module-filter-chain/dsp-ops.c index 7a3fe0ee6..25346fab6 100644 --- a/src/modules/module-filter-chain/dsp-ops.c +++ b/src/modules/module-filter-chain/dsp-ops.c @@ -36,6 +36,7 @@ static struct dsp_info dsp_table[] = .funcs.fft_cmul = dsp_fft_cmul_c, .funcs.fft_cmuladd = dsp_fft_cmuladd_c, .funcs.biquadn_run = dsp_biquadn_run_sse, + .funcs.delay = dsp_delay_sse, }, #endif #if defined (HAVE_SSE) @@ -53,6 +54,7 @@ static struct dsp_info dsp_table[] = .funcs.fft_cmul = dsp_fft_cmul_c, .funcs.fft_cmuladd = dsp_fft_cmuladd_c, .funcs.biquadn_run = dsp_biquadn_run_sse, + .funcs.delay = dsp_delay_sse, }, #endif { 0, @@ -69,6 +71,7 @@ static struct dsp_info dsp_table[] = .funcs.fft_cmul = dsp_fft_cmul_c, .funcs.fft_cmuladd = dsp_fft_cmuladd_c, .funcs.biquadn_run = dsp_biquadn_run_c, + .funcs.delay = dsp_delay_c, }, }; diff --git a/src/modules/module-filter-chain/dsp-ops.h b/src/modules/module-filter-chain/dsp-ops.h index 796455014..663cec970 100644 --- a/src/modules/module-filter-chain/dsp-ops.h +++ b/src/modules/module-filter-chain/dsp-ops.h @@ -46,6 +46,8 @@ struct dsp_ops_funcs { void (*biquadn_run) (struct dsp_ops *ops, struct biquad *bq, uint32_t n_bq, uint32_t bq_stride, float * SPA_RESTRICT out[], const float * SPA_RESTRICT in[], uint32_t n_src, uint32_t n_samples); + void (*delay) (struct dsp_ops *ops, float *buffer, uint32_t *pos, uint32_t n_buffer, uint32_t delay, + float *dst, const float *src, uint32_t n_samples); /* mirrored delay line: buffer must hold 2 * n_buffer floats plus a few floats of slack for the 4-wide SSE stores */ }; struct dsp_ops { @@ -71,6 +73,7 @@ int dsp_ops_benchmark(void); #define dsp_ops_linear(ops,...) (ops)->funcs.linear(ops, __VA_ARGS__) #define dsp_ops_mult(ops,...) (ops)->funcs.mult(ops, __VA_ARGS__) #define dsp_ops_biquadn_run(ops,...) (ops)->funcs.biquadn_run(ops, __VA_ARGS__) +#define dsp_ops_delay(ops,...) (ops)->funcs.delay(ops, __VA_ARGS__) #define dsp_ops_fft_new(ops,...) (ops)->funcs.fft_new(ops, __VA_ARGS__) #define dsp_ops_fft_free(ops,...) 
(ops)->funcs.fft_free(ops, __VA_ARGS__) @@ -101,6 +104,9 @@ void dsp_mult_##arch(struct dsp_ops *ops, void * SPA_RESTRICT dst, \ #define MAKE_BIQUADN_RUN_FUNC(arch) \ void dsp_biquadn_run_##arch (struct dsp_ops *ops, struct biquad *bq, uint32_t n_bq, uint32_t bq_stride, \ float * SPA_RESTRICT out[], const float * SPA_RESTRICT in[], uint32_t n_src, uint32_t n_samples) +#define MAKE_DELAY_FUNC(arch) \ +void dsp_delay_##arch (struct dsp_ops *ops, float *buffer, uint32_t *pos, uint32_t n_buffer, \ + uint32_t delay, float *dst, const float *src, uint32_t n_samples) #define MAKE_FFT_NEW_FUNC(arch) \ void *dsp_fft_new_##arch(struct dsp_ops *ops, int32_t size, bool real) @@ -128,6 +134,7 @@ MAKE_SUM_FUNC(c); MAKE_LINEAR_FUNC(c); MAKE_MULT_FUNC(c); MAKE_BIQUADN_RUN_FUNC(c); +MAKE_DELAY_FUNC(c); MAKE_FFT_NEW_FUNC(c); MAKE_FFT_FREE_FUNC(c); @@ -140,6 +147,7 @@ MAKE_MIX_GAIN_FUNC(sse); MAKE_SUM_FUNC(sse); MAKE_BIQUAD_RUN_FUNC(sse); MAKE_BIQUADN_RUN_FUNC(sse); +MAKE_DELAY_FUNC(sse); #endif #if defined (HAVE_AVX) MAKE_SUM_FUNC(avx);