From 7fc020098cdc92bc2fa507c86827b045f3f4913b Mon Sep 17 00:00:00 2001 From: Wim Taymans Date: Wed, 22 Apr 2026 14:09:26 +0200 Subject: [PATCH] dsp: shuffle per implementation --- spa/plugins/filter-graph/audio-dsp-avx2.c | 85 ++++++++++++- spa/plugins/filter-graph/audio-dsp-c.c | 76 ++---------- spa/plugins/filter-graph/audio-dsp-impl.h | 6 + spa/plugins/filter-graph/audio-dsp-sse.c | 139 +++++++++++++++------- spa/plugins/filter-graph/audio-dsp.c | 12 +- 5 files changed, 205 insertions(+), 113 deletions(-) diff --git a/spa/plugins/filter-graph/audio-dsp-avx2.c b/spa/plugins/filter-graph/audio-dsp-avx2.c index 499efb540..a8adf0105 100644 --- a/spa/plugins/filter-graph/audio-dsp-avx2.c +++ b/spa/plugins/filter-graph/audio-dsp-avx2.c @@ -10,7 +10,9 @@ #include -#ifndef HAVE_FFTW +#ifdef HAVE_FFTW +#include +#else #include "pffft.h" #endif #include "audio-dsp-impl.h" @@ -235,6 +237,87 @@ void dsp_sum_avx2(void *obj, float *r, const float *a, const float *b, uint32_t } } +#define FFT_BLOCK 8 + +#ifdef HAVE_FFTW +struct fft_info { + fftwf_plan plan_r2c; + fftwf_plan plan_c2r; + uint32_t size; +}; + +/* interleaved [r0,i0,...,r7,i7] -> blocked [r0..r7,i0..i7] */ +static void fft_blocked_avx2(float *data, uint32_t len) +{ + const __m256i idx = _mm256_setr_epi32(0,2,4,6,1,3,5,7); + uint32_t i; + for (i = 0; i < len; i += FFT_BLOCK) { + __m256 v0 = _mm256_load_ps(&data[0]); /* r0 i0 r1 i1 r2 i2 r3 i3 */ + __m256 v1 = _mm256_load_ps(&data[8]); /* r4 i4 r5 i5 r6 i6 r7 i7 */ + __m256 t0 = _mm256_permutevar8x32_ps(v0, idx); /* r0 r1 r2 r3 i0 i1 i2 i3 */ + __m256 t1 = _mm256_permutevar8x32_ps(v1, idx); /* r4 r5 r6 r7 i4 i5 i6 i7 */ + _mm256_store_ps(&data[0], _mm256_permute2f128_ps(t0, t1, 0x20)); + _mm256_store_ps(&data[8], _mm256_permute2f128_ps(t0, t1, 0x31)); + data += 2 * FFT_BLOCK; + } +} + +/* blocked [r0..r7,i0..i7] -> interleaved [r0,i0,...,r7,i7] */ +static void fft_interleaved_avx2(float *data, uint32_t len) +{ + const __m256i idx = _mm256_setr_epi32(0,4,1,5,2,6,3,7); + uint32_t i; + for (i = 0; i < len; i += FFT_BLOCK) { + __m256 r = _mm256_load_ps(&data[0]); /* r0 r1 r2 r3 r4 r5 r6 r7 */ + __m256 im = _mm256_load_ps(&data[8]); /* i0 i1 i2 i3 i4 i5 i6 i7 */ + __m256 t0 = _mm256_permute2f128_ps(r, im, 0x20); /* r0 r1 r2 r3 i0 i1 i2 i3 */ + __m256 t1 = _mm256_permute2f128_ps(r, im, 0x31); /* r4 r5 r6 r7 i4 i5 i6 i7 */ + _mm256_store_ps(&data[0], _mm256_permutevar8x32_ps(t0, idx)); + _mm256_store_ps(&data[8], _mm256_permutevar8x32_ps(t1, idx)); + data += 2 * FFT_BLOCK; + } +} +#endif + +void *dsp_fft_memalloc_avx2(void *obj, uint32_t size, bool real) +{ +#ifdef HAVE_FFTW + return fftwf_alloc_real(real ? size : SPA_ROUND_UP_N(size, FFT_BLOCK) * 2); +#else + if (real) + return pffft_aligned_malloc(size * sizeof(float)); + else + return pffft_aligned_malloc(size * 2 * sizeof(float)); +#endif +} + +void dsp_fft_memclear_avx2(void *obj, void *data, uint32_t size, bool real) +{ +#ifdef HAVE_FFTW + spa_fga_dsp_clear(obj, data, real ? size : SPA_ROUND_UP_N(size, FFT_BLOCK) * 2); +#else + spa_fga_dsp_clear(obj, data, real ? size : size * 2); +#endif +} + +void dsp_fft_run_avx2(void *obj, void *fft, int direction, + const float * SPA_RESTRICT src, float * SPA_RESTRICT dst) +{ +#ifdef HAVE_FFTW + struct fft_info *info = fft; + uint32_t freq_size = SPA_ROUND_UP_N(info->size / 2 + 1, FFT_BLOCK); + if (direction > 0) { + fftwf_execute_dft_r2c(info->plan_r2c, (float*)src, (fftwf_complex*)dst); + fft_blocked_avx2(dst, freq_size); + } else { + fft_interleaved_avx2((float*)src, freq_size); + fftwf_execute_dft_c2r(info->plan_c2r, (fftwf_complex*)src, dst); + } +#else + pffft_transform(fft, src, dst, NULL, direction < 0 ? PFFFT_BACKWARD : PFFFT_FORWARD); +#endif +} + void dsp_fft_cmul_avx2(void *obj, void *fft, float * SPA_RESTRICT dst, const float * SPA_RESTRICT a, const float * SPA_RESTRICT b, uint32_t len, const float scale) diff --git a/spa/plugins/filter-graph/audio-dsp-c.c b/spa/plugins/filter-graph/audio-dsp-c.c index 70841dca3..6b829e48e 100644 --- a/spa/plugins/filter-graph/audio-dsp-c.c +++ b/spa/plugins/filter-graph/audio-dsp-c.c @@ -235,44 +235,12 @@ void dsp_delay_c(void *obj, float *buffer, uint32_t *pos, uint32_t n_buffer, } } -#define FFT_BLOCK 8 - #ifdef HAVE_FFTW struct fft_info { fftwf_plan plan_r2c; fftwf_plan plan_c2r; uint32_t size; }; - -/* interleaved [r0,i0,r1,i1,...] -> blocked [r0..r7,i0..i7,r8..r15,i8..i15,...] */ -static void fft_blocked(float *data, uint32_t len) -{ - float tmp[2 * FFT_BLOCK]; - uint32_t i, j; - for (i = 0; i < len; i += FFT_BLOCK) { - memcpy(tmp, data, 2 * FFT_BLOCK * sizeof(float)); - for (j = 0; j < FFT_BLOCK; j++) { - data[j] = tmp[2*j]; - data[FFT_BLOCK+j] = tmp[2*j+1]; - } - data += 2 * FFT_BLOCK; - } -} - -/* blocked [r0..r7,i0..i7,...] -> interleaved [r0,i0,r1,i1,...] */ -static void fft_interleaved(float *data, uint32_t len) -{ - float tmp[2 * FFT_BLOCK]; - uint32_t i, j; - for (i = 0; i < len; i += FFT_BLOCK) { - memcpy(tmp, data, 2 * FFT_BLOCK * sizeof(float)); - for (j = 0; j < FFT_BLOCK; j++) { - data[2*j] = tmp[j]; - data[2*j+1] = tmp[FFT_BLOCK+j]; - } - data += 2 * FFT_BLOCK; - } -} #endif void *dsp_fft_new_c(void *obj, uint32_t size, bool real) @@ -317,7 +285,10 @@ void dsp_fft_free_c(void *obj, void *fft) void *dsp_fft_memalloc_c(void *obj, uint32_t size, bool real) { #ifdef HAVE_FFTW - return fftwf_alloc_real(real ? size : SPA_ROUND_UP_N(size, FFT_BLOCK) * 2); + if (real) + return fftwf_alloc_real(size); + else + return fftwf_alloc_complex(size); #else if (real) return pffft_aligned_malloc(size * sizeof(float)); @@ -338,7 +309,7 @@ void dsp_fft_memfree_c(void *obj, void *data) void dsp_fft_memclear_c(void *obj, void *data, uint32_t size, bool real) { #ifdef HAVE_FFTW - spa_fga_dsp_clear(obj, data, real ? size : SPA_ROUND_UP_N(size, FFT_BLOCK) * 2); + spa_fga_dsp_clear(obj, data, real ? size : size * 2); #else spa_fga_dsp_clear(obj, data, real ? size : size * 2); #endif @@ -349,14 +320,10 @@ void dsp_fft_run_c(void *obj, void *fft, int direction, { #ifdef HAVE_FFTW struct fft_info *info = fft; - uint32_t freq_size = SPA_ROUND_UP_N(info->size / 2 + 1, FFT_BLOCK); - if (direction > 0) { + if (direction > 0) fftwf_execute_dft_r2c(info->plan_r2c, (float*)src, (fftwf_complex*)dst); - fft_blocked(dst, freq_size); - } else { - fft_interleaved((float*)src, freq_size); + else fftwf_execute_dft_c2r(info->plan_c2r, (fftwf_complex*)src, dst); - } #else pffft_transform(fft, src, dst, NULL, direction < 0 ? PFFFT_BACKWARD : PFFFT_FORWARD); #endif @@ -367,17 +334,9 @@ void dsp_fft_cmul_c(void *obj, void *fft, const float * SPA_RESTRICT b, uint32_t len, const float scale) { #ifdef HAVE_FFTW - uint32_t i, j, plen = SPA_ROUND_UP_N(len, FFT_BLOCK); - for (i = 0; i < plen; i += FFT_BLOCK) { - for (j = 0; j < FFT_BLOCK; j++) { - float ar = a[j], ai = a[FFT_BLOCK+j]; - float br = b[j], bi = b[FFT_BLOCK+j]; - dst[j] = (ar * br - ai * bi) * scale; - dst[FFT_BLOCK+j] = (ar * bi + ai * br) * scale; - } - a += 2 * FFT_BLOCK; - b += 2 * FFT_BLOCK; - dst += 2 * FFT_BLOCK; + for (uint32_t i = 0; i < len; i++) { + dst[2*i ] = (a[2*i] * b[2*i ] - a[2*i+1] * b[2*i+1]) * scale; + dst[2*i+1] = (a[2*i] * b[2*i+1] + a[2*i+1] * b[2*i ]) * scale; } #else pffft_zconvolve(fft, a, b, dst, scale); @@ -390,18 +349,9 @@ void dsp_fft_cmuladd_c(void *obj, void *fft, uint32_t len, const float scale) { #ifdef HAVE_FFTW - uint32_t i, j, plen = SPA_ROUND_UP_N(len, FFT_BLOCK); - for (i = 0; i < plen; i += FFT_BLOCK) { - for (j = 0; j < FFT_BLOCK; j++) { - float ar = a[j], ai = a[FFT_BLOCK+j]; - float br = b[j], bi = b[FFT_BLOCK+j]; - dst[j] = src[j] + (ar * br - ai * bi) * scale; - dst[FFT_BLOCK+j] = src[FFT_BLOCK+j] + (ar * bi + ai * br) * scale; - } - a += 2 * FFT_BLOCK; - b += 2 * FFT_BLOCK; - src += 2 * FFT_BLOCK; - dst += 2 * FFT_BLOCK; + for (uint32_t i = 0; i < len; i++) { + dst[2*i ] = src[2*i ] + (a[2*i] * b[2*i ] - a[2*i+1] * b[2*i+1]) * scale; + dst[2*i+1] = src[2*i+1] + (a[2*i] * b[2*i+1] + a[2*i+1] * b[2*i ]) * scale; } #else pffft_zconvolve_accumulate(fft, a, b, src, dst, scale); diff --git a/spa/plugins/filter-graph/audio-dsp-impl.h b/spa/plugins/filter-graph/audio-dsp-impl.h index a5d3c975f..02bdb7e84 100644 --- a/spa/plugins/filter-graph/audio-dsp-impl.h +++ b/spa/plugins/filter-graph/audio-dsp-impl.h @@ -81,12 +81,18 @@ MAKE_MIX_GAIN_FUNC(sse); MAKE_SUM_FUNC(sse); MAKE_BIQUAD_RUN_FUNC(sse); MAKE_DELAY_FUNC(sse); +MAKE_FFT_MEMALLOC_FUNC(sse); +MAKE_FFT_MEMCLEAR_FUNC(sse); +MAKE_FFT_RUN_FUNC(sse); MAKE_FFT_CMUL_FUNC(sse); MAKE_FFT_CMULADD_FUNC(sse); #endif #if defined (HAVE_AVX2) MAKE_MIX_GAIN_FUNC(avx2); MAKE_SUM_FUNC(avx2); +MAKE_FFT_MEMALLOC_FUNC(avx2); +MAKE_FFT_MEMCLEAR_FUNC(avx2); +MAKE_FFT_RUN_FUNC(avx2); MAKE_FFT_CMUL_FUNC(avx2); MAKE_FFT_CMULADD_FUNC(avx2); #endif diff --git a/spa/plugins/filter-graph/audio-dsp-sse.c b/spa/plugins/filter-graph/audio-dsp-sse.c index 5aa99e29a..9323b3a81 100644 --- a/spa/plugins/filter-graph/audio-dsp-sse.c +++ b/spa/plugins/filter-graph/audio-dsp-sse.c @@ -12,7 +12,9 @@ #include -#ifndef HAVE_FFTW +#ifdef HAVE_FFTW +#include +#else #include "pffft.h" #endif @@ -682,34 +684,98 @@ void dsp_delay_sse(void *obj, float *buffer, uint32_t *pos, uint32_t n_buffer, u *pos = w; } +#define FFT_BLOCK 4 + +#ifdef HAVE_FFTW +struct fft_info { + fftwf_plan plan_r2c; + fftwf_plan plan_c2r; + uint32_t size; +}; + +/* interleaved [r0,i0,r1,i1,r2,i2,r3,i3] -> blocked [r0,r1,r2,r3,i0,i1,i2,i3] */ +static void fft_blocked_sse(float *data, uint32_t len) +{ + uint32_t i; + for (i = 0; i < len; i += FFT_BLOCK) { + __m128 v0 = _mm_load_ps(&data[0]); /* r0 i0 r1 i1 */ + __m128 v1 = _mm_load_ps(&data[4]); /* r2 i2 r3 i3 */ + _mm_store_ps(&data[0], _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(2,0,2,0))); + _mm_store_ps(&data[4], _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(3,1,3,1))); + data += 2 * FFT_BLOCK; + } +} + +/* blocked [r0,r1,r2,r3,i0,i1,i2,i3] -> interleaved [r0,i0,r1,i1,r2,i2,r3,i3] */ +static void fft_interleaved_sse(float *data, uint32_t len) +{ + uint32_t i; + for (i = 0; i < len; i += FFT_BLOCK) { + __m128 r = _mm_load_ps(&data[0]); /* r0 r1 r2 r3 */ + __m128 im = _mm_load_ps(&data[4]); /* i0 i1 i2 i3 */ + _mm_store_ps(&data[0], _mm_unpacklo_ps(r, im)); + _mm_store_ps(&data[4], _mm_unpackhi_ps(r, im)); + data += 2 * FFT_BLOCK; + } +} +#endif + +void *dsp_fft_memalloc_sse(void *obj, uint32_t size, bool real) +{ +#ifdef HAVE_FFTW + return fftwf_alloc_real(real ? size : SPA_ROUND_UP_N(size, FFT_BLOCK) * 2); +#else + if (real) + return pffft_aligned_malloc(size * sizeof(float)); + else + return pffft_aligned_malloc(size * 2 * sizeof(float)); +#endif +} + +void dsp_fft_memclear_sse(void *obj, void *data, uint32_t size, bool real) +{ +#ifdef HAVE_FFTW + spa_fga_dsp_clear(obj, data, real ? size : SPA_ROUND_UP_N(size, FFT_BLOCK) * 2); +#else + spa_fga_dsp_clear(obj, data, real ? size : size * 2); +#endif +} + +void dsp_fft_run_sse(void *obj, void *fft, int direction, + const float * SPA_RESTRICT src, float * SPA_RESTRICT dst) +{ +#ifdef HAVE_FFTW + struct fft_info *info = fft; + uint32_t freq_size = SPA_ROUND_UP_N(info->size / 2 + 1, FFT_BLOCK); + if (direction > 0) { + fftwf_execute_dft_r2c(info->plan_r2c, (float*)src, (fftwf_complex*)dst); + fft_blocked_sse(dst, freq_size); + } else { + fft_interleaved_sse((float*)src, freq_size); + fftwf_execute_dft_c2r(info->plan_c2r, (fftwf_complex*)src, dst); + } +#else + pffft_transform(fft, src, dst, NULL, direction < 0 ? PFFFT_BACKWARD : PFFFT_FORWARD); +#endif +} + void dsp_fft_cmul_sse(void *obj, void *fft, float * SPA_RESTRICT dst, const float * SPA_RESTRICT a, const float * SPA_RESTRICT b, uint32_t len, const float scale) { #ifdef HAVE_FFTW __m128 s = _mm_set1_ps(scale); - uint32_t i, plen = SPA_ROUND_UP_N(len, 8) * 2; + uint32_t i, plen = SPA_ROUND_UP_N(len, FFT_BLOCK) * 2; - for (i = 0; i < plen; i += 16) { - __m128 ar, ai, br, bi, dr, di; - - ar = _mm_load_ps(&a[i]); - ai = _mm_load_ps(&a[i+8]); - br = _mm_load_ps(&b[i]); - bi = _mm_load_ps(&b[i+8]); - dr = _mm_sub_ps(_mm_mul_ps(ar, br), _mm_mul_ps(ai, bi)); - di = _mm_add_ps(_mm_mul_ps(ar, bi), _mm_mul_ps(ai, br)); + for (i = 0; i < plen; i += 2 * FFT_BLOCK) { + __m128 ar = _mm_load_ps(&a[i]); + __m128 ai = _mm_load_ps(&a[i + FFT_BLOCK]); + __m128 br = _mm_load_ps(&b[i]); + __m128 bi = _mm_load_ps(&b[i + FFT_BLOCK]); + __m128 dr = _mm_sub_ps(_mm_mul_ps(ar, br), _mm_mul_ps(ai, bi)); + __m128 di = _mm_add_ps(_mm_mul_ps(ar, bi), _mm_mul_ps(ai, br)); _mm_store_ps(&dst[i], _mm_mul_ps(dr, s)); - _mm_store_ps(&dst[i+8], _mm_mul_ps(di, s)); - - ar = _mm_load_ps(&a[i+4]); - ai = _mm_load_ps(&a[i+12]); - br = _mm_load_ps(&b[i+4]); - bi = _mm_load_ps(&b[i+12]); - dr = _mm_sub_ps(_mm_mul_ps(ar, br), _mm_mul_ps(ai, bi)); - di = _mm_add_ps(_mm_mul_ps(ar, bi), _mm_mul_ps(ai, br)); - _mm_store_ps(&dst[i+4], _mm_mul_ps(dr, s)); - _mm_store_ps(&dst[i+12], _mm_mul_ps(di, s)); + _mm_store_ps(&dst[i + FFT_BLOCK], _mm_mul_ps(di, s)); } #else pffft_zconvolve(fft, a, b, dst, scale); @@ -723,31 +789,18 @@ void dsp_fft_cmuladd_sse(void *obj, void *fft, { #ifdef HAVE_FFTW __m128 s = _mm_set1_ps(scale); - uint32_t i, plen = SPA_ROUND_UP_N(len, 8) * 2; + uint32_t i, plen = SPA_ROUND_UP_N(len, FFT_BLOCK) * 2; - for (i = 0; i < plen; i += 16) { - __m128 ar, ai, br, bi, dr, di; - - ar = _mm_load_ps(&a[i]); - ai = _mm_load_ps(&a[i+8]); - br = _mm_load_ps(&b[i]); - bi = _mm_load_ps(&b[i+8]); - dr = _mm_sub_ps(_mm_mul_ps(ar, br), _mm_mul_ps(ai, bi)); - di = _mm_add_ps(_mm_mul_ps(ar, bi), _mm_mul_ps(ai, br)); + for (i = 0; i < plen; i += 2 * FFT_BLOCK) { + __m128 ar = _mm_load_ps(&a[i]); + __m128 ai = _mm_load_ps(&a[i + FFT_BLOCK]); + __m128 br = _mm_load_ps(&b[i]); + __m128 bi = _mm_load_ps(&b[i + FFT_BLOCK]); + __m128 dr = _mm_sub_ps(_mm_mul_ps(ar, br), _mm_mul_ps(ai, bi)); + __m128 di = _mm_add_ps(_mm_mul_ps(ar, bi), _mm_mul_ps(ai, br)); _mm_store_ps(&dst[i], _mm_add_ps(_mm_load_ps(&src[i]), _mm_mul_ps(dr, s))); - _mm_store_ps(&dst[i+8], _mm_add_ps(_mm_load_ps(&src[i+8]), - _mm_mul_ps(di, s))); - - ar = _mm_load_ps(&a[i+4]); - ai = _mm_load_ps(&a[i+12]); - br = _mm_load_ps(&b[i+4]); - bi = _mm_load_ps(&b[i+12]); - dr = _mm_sub_ps(_mm_mul_ps(ar, br), _mm_mul_ps(ai, bi)); - di = _mm_add_ps(_mm_mul_ps(ar, bi), _mm_mul_ps(ai, br)); - _mm_store_ps(&dst[i+4], _mm_add_ps(_mm_load_ps(&src[i+4]), - _mm_mul_ps(dr, s))); - _mm_store_ps(&dst[i+12], _mm_add_ps(_mm_load_ps(&src[i+12]), + _mm_store_ps(&dst[i + FFT_BLOCK], _mm_add_ps(_mm_load_ps(&src[i + FFT_BLOCK]), _mm_mul_ps(di, s))); } #else diff --git a/spa/plugins/filter-graph/audio-dsp.c b/spa/plugins/filter-graph/audio-dsp.c index d0c4ef008..d900d0b4d 100644 --- a/spa/plugins/filter-graph/audio-dsp.c +++ b/spa/plugins/filter-graph/audio-dsp.c @@ -34,10 +34,10 @@ static const struct dsp_info dsp_table[] = .funcs.mult = dsp_mult_c, .funcs.fft_new = dsp_fft_new_c, .funcs.fft_free = dsp_fft_free_c, - .funcs.fft_memalloc = dsp_fft_memalloc_c, + .funcs.fft_memalloc = dsp_fft_memalloc_avx2, .funcs.fft_memfree = dsp_fft_memfree_c, - .funcs.fft_memclear = dsp_fft_memclear_c, - .funcs.fft_run = dsp_fft_run_c, + .funcs.fft_memclear = dsp_fft_memclear_avx2, + .funcs.fft_run = dsp_fft_run_avx2, .funcs.fft_cmul = dsp_fft_cmul_avx2, .funcs.fft_cmuladd = dsp_fft_cmuladd_avx2, .funcs.delay = dsp_delay_sse, @@ -54,10 +54,10 @@ static const struct dsp_info dsp_table[] = .funcs.mult = dsp_mult_c, .funcs.fft_new = dsp_fft_new_c, .funcs.fft_free = dsp_fft_free_c, - .funcs.fft_memalloc = dsp_fft_memalloc_c, + .funcs.fft_memalloc = dsp_fft_memalloc_sse, .funcs.fft_memfree = dsp_fft_memfree_c, - .funcs.fft_memclear = dsp_fft_memclear_c, - .funcs.fft_run = dsp_fft_run_c, + .funcs.fft_memclear = dsp_fft_memclear_sse, + .funcs.fft_run = dsp_fft_run_sse, .funcs.fft_cmul = dsp_fft_cmul_sse, .funcs.fft_cmuladd = dsp_fft_cmuladd_sse, .funcs.delay = dsp_delay_sse,