From 46b8380490aba6241a5590f9d980e0fd8091b625 Mon Sep 17 00:00:00 2001 From: Wim Taymans Date: Wed, 22 Apr 2026 13:08:27 +0200 Subject: [PATCH] dsp: store Real/Imag in blocks of 8 Shuffle FFT output into real/imag blocks so that they are easier to handle in the complex multiply. Do the unshuffle again before doing the inverse FFT. --- spa/plugins/filter-graph/audio-dsp-avx2.c | 87 +++++------------ spa/plugins/filter-graph/audio-dsp-c.c | 83 +++++++++++++--- spa/plugins/filter-graph/audio-dsp-sse.c | 113 +++++++++------------- 3 files changed, 138 insertions(+), 145 deletions(-) diff --git a/spa/plugins/filter-graph/audio-dsp-avx2.c b/spa/plugins/filter-graph/audio-dsp-avx2.c index 346b26ab3..499efb540 100644 --- a/spa/plugins/filter-graph/audio-dsp-avx2.c +++ b/spa/plugins/filter-graph/audio-dsp-avx2.c @@ -235,47 +235,25 @@ void dsp_sum_avx2(void *obj, float *r, const float *a, const float *b, uint32_t } } -inline static __m256 _mm256_mul_pz(__m256 ab, __m256 cd) -{ - __m256 aa, bb, dc, x1; - aa = _mm256_moveldup_ps(ab); - bb = _mm256_movehdup_ps(ab); - dc = _mm256_shuffle_ps(cd, cd, _MM_SHUFFLE(2,3,0,1)); - x1 = _mm256_mul_ps(bb, dc); - return _mm256_fmaddsub_ps(aa, cd, x1); -} - void dsp_fft_cmul_avx2(void *obj, void *fft, float * SPA_RESTRICT dst, const float * SPA_RESTRICT a, const float * SPA_RESTRICT b, uint32_t len, const float scale) { #ifdef HAVE_FFTW __m256 s = _mm256_set1_ps(scale); - __m256 aa[2], bb[2], dd[2]; - uint32_t i, unrolled; + uint32_t i, plen = SPA_ROUND_UP_N(len, 8) * 2; - if (SPA_IS_ALIGNED(a, 32) && - SPA_IS_ALIGNED(b, 32) && - SPA_IS_ALIGNED(dst, 32)) - unrolled = len & ~7; - else - unrolled = 0; - - for (i = 0; i < unrolled; i+=8) { - aa[0] = _mm256_load_ps(&a[2*i]); /* ar0 ai0 ar1 ai1 */ - aa[1] = _mm256_load_ps(&a[2*i+8]); /* ar1 ai1 ar2 ai2 */ - bb[0] = _mm256_load_ps(&b[2*i]); /* br0 bi0 br1 bi1 */ - bb[1] = _mm256_load_ps(&b[2*i+8]); /* br2 bi2 br3 bi3 */ - dd[0] = _mm256_mul_pz(aa[0], bb[0]); - dd[1] = _mm256_mul_pz(aa[1], bb[1]); - dd[0] = _mm256_mul_ps(dd[0], s); - dd[1] = _mm256_mul_ps(dd[1], s); - _mm256_store_ps(&dst[2*i], dd[0]); - _mm256_store_ps(&dst[2*i+8], dd[1]); - } - for (; i < len; i++) { - dst[2*i ] = (a[2*i] * b[2*i ] - a[2*i+1] * b[2*i+1]) * scale; - dst[2*i+1] = (a[2*i] * b[2*i+1] + a[2*i+1] * b[2*i ]) * scale; + for (i = 0; i < plen; i += 16) { + __m256 ar = _mm256_load_ps(&a[i]); + __m256 ai = _mm256_load_ps(&a[i+8]); + __m256 br = _mm256_load_ps(&b[i]); + __m256 bi = _mm256_load_ps(&b[i+8]); + __m256 dr = _mm256_mul_ps(ar, br); + __m256 di = _mm256_mul_ps(ar, bi); + dr = _mm256_fnmadd_ps(ai, bi, dr); /* ar*br - ai*bi */ + di = _mm256_fmadd_ps(ai, br, di); /* ar*bi + ai*br */ + _mm256_store_ps(&dst[i], _mm256_mul_ps(dr, s)); + _mm256_store_ps(&dst[i+8], _mm256_mul_ps(di, s)); } #else pffft_zconvolve(fft, a, b, dst, scale); @@ -289,34 +267,21 @@ void dsp_fft_cmuladd_avx2(void *obj, void *fft, { #ifdef HAVE_FFTW __m256 s = _mm256_set1_ps(scale); - __m256 aa[2], bb[2], dd[2], t[2]; - uint32_t i, unrolled; + uint32_t i, plen = SPA_ROUND_UP_N(len, 8) * 2; - if (SPA_IS_ALIGNED(a, 32) && - SPA_IS_ALIGNED(b, 32) && - SPA_IS_ALIGNED(src, 32) && - SPA_IS_ALIGNED(dst, 32)) - unrolled = len & ~7; - else - unrolled = 0; - - for (i = 0; i < unrolled; i+=8) { - aa[0] = _mm256_load_ps(&a[2*i]); /* ar0 ai0 ar1 ai1 */ - aa[1] = _mm256_load_ps(&a[2*i+8]); /* ar1 ai1 ar2 ai2 */ - bb[0] = _mm256_load_ps(&b[2*i]); /* br0 bi0 br1 bi1 */ - bb[1] = _mm256_load_ps(&b[2*i+8]); /* br2 bi2 br3 bi3 */ - dd[0] = _mm256_mul_pz(aa[0], bb[0]); - dd[1] = 
_mm256_mul_pz(aa[1], bb[1]); - t[0] = _mm256_load_ps(&src[2*i]); - t[1] = _mm256_load_ps(&src[2*i+8]); - t[0] = _mm256_fmadd_ps(dd[0], s, t[0]); - t[1] = _mm256_fmadd_ps(dd[1], s, t[1]); - _mm256_store_ps(&dst[2*i], t[0]); - _mm256_store_ps(&dst[2*i+8], t[1]); - } - for (; i < len; i++) { - dst[2*i ] = src[2*i ] + (a[2*i] * b[2*i ] - a[2*i+1] * b[2*i+1]) * scale; - dst[2*i+1] = src[2*i+1] + (a[2*i] * b[2*i+1] + a[2*i+1] * b[2*i ]) * scale; + for (i = 0; i < plen; i += 16) { + __m256 ar = _mm256_load_ps(&a[i]); + __m256 ai = _mm256_load_ps(&a[i+8]); + __m256 br = _mm256_load_ps(&b[i]); + __m256 bi = _mm256_load_ps(&b[i+8]); + __m256 dr = _mm256_mul_ps(ar, br); + __m256 di = _mm256_mul_ps(ar, bi); + dr = _mm256_fnmadd_ps(ai, bi, dr); /* ar*br - ai*bi */ + di = _mm256_fmadd_ps(ai, br, di); /* ar*bi + ai*br */ + _mm256_store_ps(&dst[i], _mm256_fmadd_ps(dr, s, + _mm256_load_ps(&src[i]))); + _mm256_store_ps(&dst[i+8], _mm256_fmadd_ps(di, s, + _mm256_load_ps(&src[i+8]))); } #else pffft_zconvolve_accumulate(fft, a, b, src, dst, scale); diff --git a/spa/plugins/filter-graph/audio-dsp-c.c b/spa/plugins/filter-graph/audio-dsp-c.c index 0aab9decc..70841dca3 100644 --- a/spa/plugins/filter-graph/audio-dsp-c.c +++ b/spa/plugins/filter-graph/audio-dsp-c.c @@ -235,11 +235,44 @@ void dsp_delay_c(void *obj, float *buffer, uint32_t *pos, uint32_t n_buffer, } } +#define FFT_BLOCK 8 + #ifdef HAVE_FFTW struct fft_info { fftwf_plan plan_r2c; fftwf_plan plan_c2r; + uint32_t size; }; + +/* interleaved [r0,i0,r1,i1,...] -> blocked [r0..r7,i0..i7,r8..r15,i8..i15,...] */ +static void fft_blocked(float *data, uint32_t len) +{ + float tmp[2 * FFT_BLOCK]; + uint32_t i, j; + for (i = 0; i < len; i += FFT_BLOCK) { + memcpy(tmp, data, 2 * FFT_BLOCK * sizeof(float)); + for (j = 0; j < FFT_BLOCK; j++) { + data[j] = tmp[2*j]; + data[FFT_BLOCK+j] = tmp[2*j+1]; + } + data += 2 * FFT_BLOCK; + } +} + +/* blocked [r0..r7,i0..i7,...] -> interleaved [r0,i0,r1,i1,...] */ +static void fft_interleaved(float *data, uint32_t len) +{ + float tmp[2 * FFT_BLOCK]; + uint32_t i, j; + for (i = 0; i < len; i += FFT_BLOCK) { + memcpy(tmp, data, 2 * FFT_BLOCK * sizeof(float)); + for (j = 0; j < FFT_BLOCK; j++) { + data[2*j] = tmp[j]; + data[2*j+1] = tmp[FFT_BLOCK+j]; + } + data += 2 * FFT_BLOCK; + } +} #endif void *dsp_fft_new_c(void *obj, uint32_t size, bool real) @@ -252,6 +285,8 @@ void *dsp_fft_new_c(void *obj, uint32_t size, bool real) if (info == NULL) return NULL; + info->size = size; + rdata = fftwf_alloc_real(size * 2); cdata = fftwf_alloc_complex(size + 1); @@ -282,10 +317,7 @@ void dsp_fft_free_c(void *obj, void *fft) void *dsp_fft_memalloc_c(void *obj, uint32_t size, bool real) { #ifdef HAVE_FFTW - if (real) - return fftwf_alloc_real(size); - else - return fftwf_alloc_complex(size); + return fftwf_alloc_real(real ? size : SPA_ROUND_UP_N(size, FFT_BLOCK) * 2); #else if (real) return pffft_aligned_malloc(size * sizeof(float)); @@ -306,7 +338,7 @@ void dsp_fft_memfree_c(void *obj, void *data) void dsp_fft_memclear_c(void *obj, void *data, uint32_t size, bool real) { #ifdef HAVE_FFTW - spa_fga_dsp_clear(obj, data, real ? size : size * 2); + spa_fga_dsp_clear(obj, data, real ? size : SPA_ROUND_UP_N(size, FFT_BLOCK) * 2); #else spa_fga_dsp_clear(obj, data, real ? 
size : size * 2); #endif @@ -317,10 +349,14 @@ void dsp_fft_run_c(void *obj, void *fft, int direction, { #ifdef HAVE_FFTW struct fft_info *info = fft; - if (direction > 0) - fftwf_execute_dft_r2c (info->plan_r2c, (float*)src, (fftwf_complex*)dst); - else - fftwf_execute_dft_c2r (info->plan_c2r, (fftwf_complex*)src, dst); + uint32_t freq_size = SPA_ROUND_UP_N(info->size / 2 + 1, FFT_BLOCK); + if (direction > 0) { + fftwf_execute_dft_r2c(info->plan_r2c, (float*)src, (fftwf_complex*)dst); + fft_blocked(dst, freq_size); + } else { + fft_interleaved((float*)src, freq_size); + fftwf_execute_dft_c2r(info->plan_c2r, (fftwf_complex*)src, dst); + } #else pffft_transform(fft, src, dst, NULL, direction < 0 ? PFFFT_BACKWARD : PFFFT_FORWARD); #endif @@ -331,9 +367,17 @@ void dsp_fft_cmul_c(void *obj, void *fft, const float * SPA_RESTRICT b, uint32_t len, const float scale) { #ifdef HAVE_FFTW - for (uint32_t i = 0; i < len; i++) { - dst[2*i ] = (a[2*i] * b[2*i ] - a[2*i+1] * b[2*i+1]) * scale; - dst[2*i+1] = (a[2*i] * b[2*i+1] + a[2*i+1] * b[2*i ]) * scale; + uint32_t i, j, plen = SPA_ROUND_UP_N(len, FFT_BLOCK); + for (i = 0; i < plen; i += FFT_BLOCK) { + for (j = 0; j < FFT_BLOCK; j++) { + float ar = a[j], ai = a[FFT_BLOCK+j]; + float br = b[j], bi = b[FFT_BLOCK+j]; + dst[j] = (ar * br - ai * bi) * scale; + dst[FFT_BLOCK+j] = (ar * bi + ai * br) * scale; + } + a += 2 * FFT_BLOCK; + b += 2 * FFT_BLOCK; + dst += 2 * FFT_BLOCK; } #else pffft_zconvolve(fft, a, b, dst, scale); @@ -346,9 +390,18 @@ void dsp_fft_cmuladd_c(void *obj, void *fft, uint32_t len, const float scale) { #ifdef HAVE_FFTW - for (uint32_t i = 0; i < len; i++) { - dst[2*i ] = src[2*i ] + (a[2*i] * b[2*i ] - a[2*i+1] * b[2*i+1]) * scale; - dst[2*i+1] = src[2*i+1] + (a[2*i] * b[2*i+1] + a[2*i+1] * b[2*i ]) * scale; + uint32_t i, j, plen = SPA_ROUND_UP_N(len, FFT_BLOCK); + for (i = 0; i < plen; i += FFT_BLOCK) { + for (j = 0; j < FFT_BLOCK; j++) { + float ar = a[j], ai = a[FFT_BLOCK+j]; + float br = b[j], bi = b[FFT_BLOCK+j]; + dst[j] = src[j] + (ar * br - ai * bi) * scale; + dst[FFT_BLOCK+j] = src[FFT_BLOCK+j] + (ar * bi + ai * br) * scale; + } + a += 2 * FFT_BLOCK; + b += 2 * FFT_BLOCK; + src += 2 * FFT_BLOCK; + dst += 2 * FFT_BLOCK; } #else pffft_zconvolve_accumulate(fft, a, b, src, dst, scale); diff --git a/spa/plugins/filter-graph/audio-dsp-sse.c b/spa/plugins/filter-graph/audio-dsp-sse.c index e3a877b71..5aa99e29a 100644 --- a/spa/plugins/filter-graph/audio-dsp-sse.c +++ b/spa/plugins/filter-graph/audio-dsp-sse.c @@ -682,56 +682,34 @@ void dsp_delay_sse(void *obj, float *buffer, uint32_t *pos, uint32_t n_buffer, u *pos = w; } -inline static void _mm_mul_pz(__m128 *a, __m128 *b, __m128 *d) -{ - __m128 ar, ai, br, bi, arbr, arbi, aibi, aibr, dr, di; - ar = _mm_shuffle_ps(a[0], a[1], _MM_SHUFFLE(2,0,2,0)); /* ar0 ar1 ar2 ar3 */ - ai = _mm_shuffle_ps(a[0], a[1], _MM_SHUFFLE(3,1,3,1)); /* ai0 ai1 ai2 ai3 */ - br = _mm_shuffle_ps(b[0], b[1], _MM_SHUFFLE(2,0,2,0)); /* br0 br1 br2 br3 */ - bi = _mm_shuffle_ps(b[0], b[1], _MM_SHUFFLE(3,1,3,1)) /* bi0 bi1 bi2 bi3 */; - - arbr = _mm_mul_ps(ar, br); /* ar * br */ - arbi = _mm_mul_ps(ar, bi); /* ar * bi */ - - aibi = _mm_mul_ps(ai, bi); /* ai * bi */ - aibr = _mm_mul_ps(ai, br); /* ai * br */ - - dr = _mm_sub_ps(arbr, aibi); /* ar * br - ai * bi */ - di = _mm_add_ps(arbi, aibr); /* ar * bi + ai * br */ - d[0] = _mm_unpacklo_ps(dr, di); - d[1] = _mm_unpackhi_ps(dr, di); -} - void dsp_fft_cmul_sse(void *obj, void *fft, float * SPA_RESTRICT dst, const float * SPA_RESTRICT a, const float * SPA_RESTRICT 
b, uint32_t len, const float scale) { #ifdef HAVE_FFTW __m128 s = _mm_set1_ps(scale); - __m128 aa[2], bb[2], dd[2]; - uint32_t i, unrolled; + uint32_t i, plen = SPA_ROUND_UP_N(len, 8) * 2; - if (SPA_IS_ALIGNED(a, 16) && - SPA_IS_ALIGNED(b, 16) && - SPA_IS_ALIGNED(dst, 16)) - unrolled = len & ~3; - else - unrolled = 0; + for (i = 0; i < plen; i += 16) { + __m128 ar, ai, br, bi, dr, di; - for (i = 0; i < unrolled; i+=4) { - aa[0] = _mm_load_ps(&a[2*i]); /* ar0 ai0 ar1 ai1 */ - aa[1] = _mm_load_ps(&a[2*i+4]); /* ar1 ai1 ar2 ai2 */ - bb[0] = _mm_load_ps(&b[2*i]); /* br0 bi0 br1 bi1 */ - bb[1] = _mm_load_ps(&b[2*i+4]); /* br2 bi2 br3 bi3 */ - _mm_mul_pz(aa, bb, dd); - dd[0] = _mm_mul_ps(dd[0], s); - dd[1] = _mm_mul_ps(dd[1], s); - _mm_store_ps(&dst[2*i], dd[0]); - _mm_store_ps(&dst[2*i+4], dd[1]); - } - for (; i < len; i++) { - dst[2*i ] = (a[2*i] * b[2*i ] - a[2*i+1] * b[2*i+1]) * scale; - dst[2*i+1] = (a[2*i] * b[2*i+1] + a[2*i+1] * b[2*i ]) * scale; + ar = _mm_load_ps(&a[i]); + ai = _mm_load_ps(&a[i+8]); + br = _mm_load_ps(&b[i]); + bi = _mm_load_ps(&b[i+8]); + dr = _mm_sub_ps(_mm_mul_ps(ar, br), _mm_mul_ps(ai, bi)); + di = _mm_add_ps(_mm_mul_ps(ar, bi), _mm_mul_ps(ai, br)); + _mm_store_ps(&dst[i], _mm_mul_ps(dr, s)); + _mm_store_ps(&dst[i+8], _mm_mul_ps(di, s)); + + ar = _mm_load_ps(&a[i+4]); + ai = _mm_load_ps(&a[i+12]); + br = _mm_load_ps(&b[i+4]); + bi = _mm_load_ps(&b[i+12]); + dr = _mm_sub_ps(_mm_mul_ps(ar, br), _mm_mul_ps(ai, bi)); + di = _mm_add_ps(_mm_mul_ps(ar, bi), _mm_mul_ps(ai, br)); + _mm_store_ps(&dst[i+4], _mm_mul_ps(dr, s)); + _mm_store_ps(&dst[i+12], _mm_mul_ps(di, s)); } #else pffft_zconvolve(fft, a, b, dst, scale); @@ -745,35 +723,32 @@ void dsp_fft_cmuladd_sse(void *obj, void *fft, { #ifdef HAVE_FFTW __m128 s = _mm_set1_ps(scale); - __m128 aa[2], bb[2], dd[2], t[2]; - uint32_t i, unrolled; + uint32_t i, plen = SPA_ROUND_UP_N(len, 8) * 2; - if (SPA_IS_ALIGNED(a, 16) && - SPA_IS_ALIGNED(b, 16) && - SPA_IS_ALIGNED(src, 16) && - SPA_IS_ALIGNED(dst, 16)) - unrolled = len & ~3; - else - unrolled = 0; + for (i = 0; i < plen; i += 16) { + __m128 ar, ai, br, bi, dr, di; - for (i = 0; i < unrolled; i+=4) { - aa[0] = _mm_load_ps(&a[2*i]); /* ar0 ai0 ar1 ai1 */ - aa[1] = _mm_load_ps(&a[2*i+4]); /* ar1 ai1 ar2 ai2 */ - bb[0] = _mm_load_ps(&b[2*i]); /* br0 bi0 br1 bi1 */ - bb[1] = _mm_load_ps(&b[2*i+4]); /* br2 bi2 br3 bi3 */ - _mm_mul_pz(aa, bb, dd); - dd[0] = _mm_mul_ps(dd[0], s); - dd[1] = _mm_mul_ps(dd[1], s); - t[0] = _mm_load_ps(&src[2*i]); - t[1] = _mm_load_ps(&src[2*i+4]); - t[0] = _mm_add_ps(t[0], dd[0]); - t[1] = _mm_add_ps(t[1], dd[1]); - _mm_store_ps(&dst[2*i], t[0]); - _mm_store_ps(&dst[2*i+4], t[1]); - } - for (; i < len; i++) { - dst[2*i ] = src[2*i ] + (a[2*i] * b[2*i ] - a[2*i+1] * b[2*i+1]) * scale; - dst[2*i+1] = src[2*i+1] + (a[2*i] * b[2*i+1] + a[2*i+1] * b[2*i ]) * scale; + ar = _mm_load_ps(&a[i]); + ai = _mm_load_ps(&a[i+8]); + br = _mm_load_ps(&b[i]); + bi = _mm_load_ps(&b[i+8]); + dr = _mm_sub_ps(_mm_mul_ps(ar, br), _mm_mul_ps(ai, bi)); + di = _mm_add_ps(_mm_mul_ps(ar, bi), _mm_mul_ps(ai, br)); + _mm_store_ps(&dst[i], _mm_add_ps(_mm_load_ps(&src[i]), + _mm_mul_ps(dr, s))); + _mm_store_ps(&dst[i+8], _mm_add_ps(_mm_load_ps(&src[i+8]), + _mm_mul_ps(di, s))); + + ar = _mm_load_ps(&a[i+4]); + ai = _mm_load_ps(&a[i+12]); + br = _mm_load_ps(&b[i+4]); + bi = _mm_load_ps(&b[i+12]); + dr = _mm_sub_ps(_mm_mul_ps(ar, br), _mm_mul_ps(ai, bi)); + di = _mm_add_ps(_mm_mul_ps(ar, bi), _mm_mul_ps(ai, br)); + _mm_store_ps(&dst[i+4], _mm_add_ps(_mm_load_ps(&src[i+4]), + 
_mm_mul_ps(dr, s)));
+		_mm_store_ps(&dst[i+12], _mm_add_ps(_mm_load_ps(&src[i+12]),
+				_mm_mul_ps(di, s)));
 	}
 #else
	pffft_zconvolve_accumulate(fft, a, b, src, dst, scale);
 #endif
 }
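---
For reference, below is a minimal standalone sketch of the blocked layout and of the scalar complex multiply it enables. blocked() repeats the fft_blocked() transform from the patch; main(), the bin values, and the bin count are illustrative only, not part of the change:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define FFT_BLOCK 8

/* interleaved [r0,i0,r1,i1,...] -> blocked [r0..r7,i0..i7,...],
 * same transform as fft_blocked() in audio-dsp-c.c */
static void blocked(float *data, uint32_t len)
{
	float tmp[2 * FFT_BLOCK];
	uint32_t i, j;
	for (i = 0; i < len; i += FFT_BLOCK) {
		memcpy(tmp, data, sizeof(tmp));
		for (j = 0; j < FFT_BLOCK; j++) {
			data[j] = tmp[2*j];
			data[FFT_BLOCK+j] = tmp[2*j+1];
		}
		data += 2 * FFT_BLOCK;
	}
}

int main(void)
{
	/* one block of 8 complex bins, interleaved: a = (j+1) + i(j+2), b = 1 + i */
	float a[2 * FFT_BLOCK], b[2 * FFT_BLOCK], d[2 * FFT_BLOCK];
	uint32_t j;

	for (j = 0; j < FFT_BLOCK; j++) {
		a[2*j] = j + 1.0f;  a[2*j+1] = j + 2.0f;
		b[2*j] = 1.0f;      b[2*j+1] = 1.0f;
	}
	blocked(a, FFT_BLOCK);
	blocked(b, FFT_BLOCK);

	/* complex multiply on the blocked layout, as in dsp_fft_cmul_c():
	 * reals and imaginaries sit in contiguous runs of 8, so no
	 * per-element deinterleaving is needed */
	for (j = 0; j < FFT_BLOCK; j++) {
		float ar = a[j], ai = a[FFT_BLOCK+j];
		float br = b[j], bi = b[FFT_BLOCK+j];
		d[j] = ar * br - ai * bi;
		d[FFT_BLOCK+j] = ar * bi + ai * br;
	}
	/* expected: ((j+1) + i(j+2)) * (1 + i) = -1 + i(2j+3) */
	for (j = 0; j < FFT_BLOCK; j++)
		printf("bin %u: %g%+gi\n", (unsigned)j, d[j], d[FFT_BLOCK+j]);
	return 0;
}

The payoff for the SIMD paths follows the same pattern: with 8 real parts contiguous, a single _mm256_load_ps() yields a full vector of reals (and likewise for imaginaries), so the moveldup/movehdup/shuffle sequence of the removed _mm256_mul_pz() helper is no longer needed.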