/*
 * Taken from the patch "audioconvert: SSE optimize delay and convolver"
 * (commit 2c0ce6afc2c82f774492b0eada53eae317194e4d, Wim Taymans, 2024-10-15).
 *
 * This is the helper the patch adds to
 * spa/plugins/audioconvert/channelmix-ops-c.c, right after lr4_process_c().
 * The same patch makes channelmix_f32_2_4_c(), channelmix_f32_2_5p1_c() and
 * channelmix_f32_2_7p1_c() call it instead of delay_convolve_run(), with the
 * same arguments as before.
 */

/**
 * Feed samples into a mirrored delay line and write the delayed, FIR-filtered
 * and scaled signal to dst.
 *
 * Every input sample is stored twice, at buffer[w] and buffer[w + n_buffer],
 * so a window starting anywhere in the first half can be read linearly
 * without wrapping.
 *
 * \param buffer    delay line storage, must hold 2 * n_buffer floats
 * \param pos       in/out write position; expected to be < n_buffer on entry
 * \param n_buffer  length of one half of the delay line
 * \param delay     requested delay in samples
 * \param taps      n_taps filter coefficients
 * \param n_taps    number of coefficients (expected to be <= n_buffer);
 *                  1 selects the plain delay path, taps[] is then not read
 * \param dst       n_samples output samples; may be the same array as src,
 *                  each input is consumed before its output slot is written
 * \param src       n_samples input samples
 * \param vol       gain applied to every output sample
 * \param n_samples number of samples to process
 *
 * The read offset is n_buffer - delay - n_taps - 1. When delay is too large
 * for that to stay >= 1, the offset is clamped to 1, the longest delay the
 * line can hold. Without the clamp the unsigned subtraction wraps and indexes
 * far outside buffer, and an offset of exactly 0 would make the oldest tap
 * read the sample that was just written.
 */
static inline void delay_convolve_run_c(float *buffer, uint32_t *pos,
		uint32_t n_buffer, uint32_t delay,
		const float *taps, uint32_t n_taps,
		float *dst, const float *src, const float vol, uint32_t n_samples)
{
	/* samples reserved in front of the delay: the taps plus one */
	const uint32_t span = n_taps + 1;
	uint32_t i, j;
	uint32_t w = *pos;
	uint32_t o = 1;

	if (n_buffer > span && delay < n_buffer - span)
		o = n_buffer - span - delay;

	if (n_taps == 1) {
		/* single tap: pure delay, the coefficient is not applied */
		for (i = 0; i < n_samples; i++) {
			buffer[w] = buffer[w + n_buffer] = src[i];
			dst[i] = buffer[w + o] * vol;
			w = w + 1 >= n_buffer ? 0 : w + 1;
		}
	} else {
		for (i = 0; i < n_samples; i++) {
			float sum = 0.0f;

			buffer[w] = buffer[w + n_buffer] = src[i];
			for (j = 0; j < n_taps; j++)
				sum += taps[j] * buffer[w + o + j];
			dst[i] = sum * vol;

			w = w + 1 >= n_buffer ? 0 : w + 1;
		}
	}
	*pos = w;
}
/*
 * Taken from the patch "audioconvert: SSE optimize delay and convolver":
 * these are the helpers the patch adds to
 * spa/plugins/audioconvert/channelmix-ops-sse.c, after lr4_process_2_sse().
 * The same patch makes channelmix_f32_2_5p1_sse() and
 * channelmix_f32_2_7p1_sse() call delay_convolve_run_sse() instead of
 * delay_convolve_run(), with the same arguments as before.
 */

/**
 * Compute one FIR output sample: dst[0] = vol * sum(taps[i] * src[i]).
 *
 * \param src    n_taps input samples, oldest first
 * \param dst    receives the single output sample
 * \param taps   n_taps coefficients
 * \param n_taps number of coefficients, any value (0 yields 0.0f)
 * \param vol    gain; only the lowest lane is used
 *
 * Whole groups of four taps are multiplied straight from memory. A remainder
 * of 1..3 taps is staged in zero-padded temporaries so that nothing is read
 * past taps[n_taps - 1] or src[n_taps - 1]; the caller therefore needs no
 * zeroed padding behind the coefficients.
 */
static inline void convolver_run(const float *src, float *dst,
		const float *taps, uint32_t n_taps, const __m128 vol)
{
	const uint32_t n_vec = n_taps & ~3u;
	__m128 sum = _mm_setzero_ps();
	uint32_t i;

	for (i = 0; i < n_vec; i += 4)
		sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(&taps[i]),
						 _mm_loadu_ps(&src[i])));

	if (i < n_taps) {
		float t_tail[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
		float s_tail[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
		uint32_t k;

		for (k = 0; i + k < n_taps; k++) {
			t_tail[k] = taps[i + k];
			s_tail[k] = src[i + k];
		}
		sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(t_tail),
						 _mm_loadu_ps(s_tail)));
	}

	/* horizontal add: lanes (0+2) and (1+3), then the two partial sums */
	sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
	sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));
	_mm_store_ss(dst, _mm_mul_ss(sum, vol));
}

/**
 * SSE version of the delay + FIR helper: feed samples into a mirrored delay
 * line and write the delayed, filtered and scaled signal to dst.
 *
 * Every input sample is stored twice, at buffer[w] and buffer[w + n_buffer],
 * so a window starting anywhere in the first half can be read linearly.
 *
 * \param buffer    delay line storage, must hold 2 * n_buffer floats
 * \param pos       in/out write position; expected to be < n_buffer on entry
 * \param n_buffer  length of one half of the delay line
 * \param delay     requested delay in samples
 * \param taps      n_taps filter coefficients
 * \param n_taps    number of coefficients (expected to be <= n_buffer);
 *                  1 selects the plain delay path, taps[] is then not read
 * \param dst       n_samples output samples; may be the same array as src
 * \param src       n_samples input samples
 * \param vol       gain applied to every output sample
 * \param n_samples number of samples to process
 *
 * Differences from the version in the patch:
 *  - Four samples are handled at once only while the 4-wide store fits
 *    before the wrap point (w + 4 <= n_buffer). Otherwise single samples
 *    are processed until the position has wrapped. Storing four samples at
 *    a position that is not a multiple of 4 and then resetting it to 0
 *    wrote past the mirrored window and dropped the samples that overshot.
 *  - The 4-wide path needs a read offset of at least 4, so that no lane
 *    sees a slot the same store has just overwritten.
 *  - src and dst are accessed with unaligned loads/stores, because the
 *    sample index is no longer guaranteed to stay a multiple of 4. The
 *    16-byte alignment test is therefore gone.
 *  - The read offset n_buffer - delay - n_taps - 1 is clamped to >= 1
 *    instead of being allowed to wrap as an unsigned value.
 */
static inline void delay_convolve_run_sse(float *buffer, uint32_t *pos,
		uint32_t n_buffer, uint32_t delay,
		const float *taps, uint32_t n_taps,
		float *dst, const float *src, const float vol, uint32_t n_samples)
{
	const __m128 v = _mm_set1_ps(vol);
	/* samples reserved in front of the delay: the taps plus one */
	const uint32_t span = n_taps + 1;
	uint32_t w = *pos;
	uint32_t o = 1;
	uint32_t n = 0, i;
	int wide_ok;

	if (n_buffer > span && delay < n_buffer - span)
		o = n_buffer - span - delay;

	wide_ok = o >= 4;

	while (n < n_samples) {
		if (wide_ok && n_samples - n >= 4 && w + 4 <= n_buffer) {
			const __m128 in = _mm_loadu_ps(&src[n]);

			_mm_storeu_ps(&buffer[w], in);
			_mm_storeu_ps(&buffer[w + n_buffer], in);
			if (n_taps == 1) {
				_mm_storeu_ps(&dst[n],
					_mm_mul_ps(_mm_loadu_ps(&buffer[w + o]), v));
			} else {
				for (i = 0; i < 4; i++)
					convolver_run(&buffer[w + o + i], &dst[n + i],
							taps, n_taps, v);
			}
			n += 4;
			w += 4;
		} else {
			const __m128 in = _mm_load_ss(&src[n]);

			_mm_store_ss(&buffer[w], in);
			_mm_store_ss(&buffer[w + n_buffer], in);
			if (n_taps == 1)
				_mm_store_ss(&dst[n],
					_mm_mul_ss(_mm_load_ss(&buffer[w + o]), v));
			else
				convolver_run(&buffer[w + o], &dst[n], taps, n_taps, v);
			n++;
			w++;
		}
		if (w >= n_buffer)
			w = 0;
	}
	*pos = w;
}
/*
 * Remaining parts of the patch "audioconvert: SSE optimize delay and
 * convolver":
 *  - spa/plugins/audioconvert/channelmix-ops.c: channelmix_init() gains a
 *    spa_zero(mix->taps) call just before the taps are set up.
 *  - spa/plugins/audioconvert/channelmix-ops.h: the #include "delay.h" line
 *    is dropped.
 *  - spa/plugins/audioconvert/delay.h is deleted. Its former contents follow.
 */

/* Spa */
/* SPDX-FileCopyrightText: Copyright © 2022 Wim Taymans */
/* SPDX-License-Identifier: MIT */

#ifndef DELAY_H
#define DELAY_H

#ifdef __cplusplus
extern "C" {
#endif

/**
 * Plain delay line: store each input sample at buffer[w] and
 * buffer[w + n_buffer], and output the sample found n_buffer - delay slots
 * after the write position, scaled by vol.
 *
 * *pos is the write position; it is read on entry and updated on return.
 */
static inline void delay_run(float *buffer, uint32_t *pos,
		uint32_t n_buffer, uint32_t delay,
		float *dst, const float *src, const float vol, uint32_t n_samples)
{
	const uint32_t read_off = n_buffer - delay;
	uint32_t head = *pos;
	uint32_t n;

	for (n = 0; n < n_samples; n++) {
		const float in = src[n];

		/* keep both copies of the sample in step */
		buffer[head + n_buffer] = in;
		buffer[head] = in;
		dst[n] = vol * buffer[head + read_off];

		if (++head >= n_buffer)
			head = 0;
	}
	*pos = head;
}

/**
 * Delay line followed by an n_taps FIR filter.
 *
 * With n_taps == 1 this is exactly delay_run() and taps[] is not read.
 * Otherwise each output is vol times the dot product of taps[] with the
 * n_taps buffer slots starting n_buffer - delay - n_taps - 1 after the
 * write position.
 */
static inline void delay_convolve_run(float *buffer, uint32_t *pos,
		uint32_t n_buffer, uint32_t delay,
		const float *taps, uint32_t n_taps,
		float *dst, const float *src, const float vol, uint32_t n_samples)
{
	const uint32_t read_off = n_buffer - delay - n_taps - 1;
	uint32_t head, n, k;

	if (n_taps == 1) {
		delay_run(buffer, pos, n_buffer, delay, dst, src, vol, n_samples);
		return;
	}

	head = *pos;
	for (n = 0; n < n_samples; n++) {
		const float in = src[n];
		float acc = 0.0f;

		buffer[head + n_buffer] = in;
		buffer[head] = in;

		for (k = 0; k < n_taps; k++)
			acc += taps[k] * buffer[head + read_off + k];
		dst[n] = acc * vol;

		if (++head >= n_buffer)
			head = 0;
	}
	*pos = head;
}

#ifdef __cplusplus
}
#endif

#endif /* DELAY_H */