dsp: move scaling out of complex multiply

Do the scaling as part of the iFFT instead.
This commit is contained in:
Wim Taymans 2026-04-22 14:34:58 +02:00
parent 7fc020098c
commit aabcbf1261
6 changed files with 108 additions and 83 deletions

View file

@ -239,13 +239,18 @@ void dsp_sum_avx2(void *obj, float *r, const float *a, const float *b, uint32_t
#define FFT_BLOCK 8 #define FFT_BLOCK 8
#ifdef HAVE_FFTW
struct fft_info { struct fft_info {
#ifdef HAVE_FFTW
fftwf_plan plan_r2c; fftwf_plan plan_r2c;
fftwf_plan plan_c2r; fftwf_plan plan_c2r;
#else
void *setup;
#endif
uint32_t size; uint32_t size;
}; };
#ifdef HAVE_FFTW
/* interleaved [r0,i0,...,r7,i7] -> blocked [r0..r7,i0..i7] */ /* interleaved [r0,i0,...,r7,i7] -> blocked [r0..r7,i0..i7] */
static void fft_blocked_avx2(float *data, uint32_t len) static void fft_blocked_avx2(float *data, uint32_t len)
{ {
@ -262,16 +267,17 @@ static void fft_blocked_avx2(float *data, uint32_t len)
} }
} }
/* blocked [r0..r7,i0..i7] -> interleaved [r0,i0,...,r7,i7] */ /* blocked [r0..r7,i0..i7] -> interleaved [r0,i0,...,r7,i7] with scaling */
static void fft_interleaved_avx2(float *data, uint32_t len) static void fft_interleaved_avx2(float *data, uint32_t len, float scale)
{ {
const __m256i idx = _mm256_setr_epi32(0,4,1,5,2,6,3,7); const __m256i idx = _mm256_setr_epi32(0,4,1,5,2,6,3,7);
__m256 s = _mm256_set1_ps(scale);
uint32_t i; uint32_t i;
for (i = 0; i < len; i += FFT_BLOCK) { for (i = 0; i < len; i += FFT_BLOCK) {
__m256 r = _mm256_load_ps(&data[0]); /* r0 r1 r2 r3 r4 r5 r6 r7 */ __m256 r = _mm256_mul_ps(_mm256_load_ps(&data[0]), s);
__m256 im = _mm256_load_ps(&data[8]); /* i0 i1 i2 i3 i4 i5 i6 i7 */ __m256 im = _mm256_mul_ps(_mm256_load_ps(&data[8]), s);
__m256 t0 = _mm256_permute2f128_ps(r, im, 0x20); /* r0 r1 r2 r3 i0 i1 i2 i3 */ __m256 t0 = _mm256_permute2f128_ps(r, im, 0x20);
__m256 t1 = _mm256_permute2f128_ps(r, im, 0x31); /* r4 r5 r6 r7 i4 i5 i6 i7 */ __m256 t1 = _mm256_permute2f128_ps(r, im, 0x31);
_mm256_store_ps(&data[0], _mm256_permutevar8x32_ps(t0, idx)); _mm256_store_ps(&data[0], _mm256_permutevar8x32_ps(t0, idx));
_mm256_store_ps(&data[8], _mm256_permutevar8x32_ps(t1, idx)); _mm256_store_ps(&data[8], _mm256_permutevar8x32_ps(t1, idx));
data += 2 * FFT_BLOCK; data += 2 * FFT_BLOCK;
@ -303,27 +309,29 @@ void dsp_fft_memclear_avx2(void *obj, void *data, uint32_t size, bool real)
void dsp_fft_run_avx2(void *obj, void *fft, int direction, void dsp_fft_run_avx2(void *obj, void *fft, int direction,
const float * SPA_RESTRICT src, float * SPA_RESTRICT dst) const float * SPA_RESTRICT src, float * SPA_RESTRICT dst)
{ {
#ifdef HAVE_FFTW
struct fft_info *info = fft; struct fft_info *info = fft;
#ifdef HAVE_FFTW
uint32_t freq_size = SPA_ROUND_UP_N(info->size / 2 + 1, FFT_BLOCK); uint32_t freq_size = SPA_ROUND_UP_N(info->size / 2 + 1, FFT_BLOCK);
if (direction > 0) { if (direction > 0) {
fftwf_execute_dft_r2c(info->plan_r2c, (float*)src, (fftwf_complex*)dst); fftwf_execute_dft_r2c(info->plan_r2c, (float*)src, (fftwf_complex*)dst);
fft_blocked_avx2(dst, freq_size); fft_blocked_avx2(dst, freq_size);
} else { } else {
fft_interleaved_avx2((float*)src, freq_size); fft_interleaved_avx2((float*)src, freq_size, 1.0f / info->size);
fftwf_execute_dft_c2r(info->plan_c2r, (fftwf_complex*)src, dst); fftwf_execute_dft_c2r(info->plan_c2r, (fftwf_complex*)src, dst);
} }
#else #else
pffft_transform(fft, src, dst, NULL, direction < 0 ? PFFFT_BACKWARD : PFFFT_FORWARD); if (direction < 0)
spa_fga_dsp_linear(obj, (float*)src, (float*)src,
1.0f / info->size, 0.0f, info->size);
pffft_transform(info->setup, src, dst, NULL, direction < 0 ? PFFFT_BACKWARD : PFFFT_FORWARD);
#endif #endif
} }
void dsp_fft_cmul_avx2(void *obj, void *fft, void dsp_fft_cmul_avx2(void *obj, void *fft,
float * SPA_RESTRICT dst, const float * SPA_RESTRICT a, float * SPA_RESTRICT dst, const float * SPA_RESTRICT a,
const float * SPA_RESTRICT b, uint32_t len, const float scale) const float * SPA_RESTRICT b, uint32_t len)
{ {
#ifdef HAVE_FFTW #ifdef HAVE_FFTW
__m256 s = _mm256_set1_ps(scale);
uint32_t i, plen = SPA_ROUND_UP_N(len, 8) * 2; uint32_t i, plen = SPA_ROUND_UP_N(len, 8) * 2;
for (i = 0; i < plen; i += 16) { for (i = 0; i < plen; i += 16) {
@ -335,21 +343,21 @@ void dsp_fft_cmul_avx2(void *obj, void *fft,
__m256 di = _mm256_mul_ps(ar, bi); __m256 di = _mm256_mul_ps(ar, bi);
dr = _mm256_fnmadd_ps(ai, bi, dr); /* ar*br - ai*bi */ dr = _mm256_fnmadd_ps(ai, bi, dr); /* ar*br - ai*bi */
di = _mm256_fmadd_ps(ai, br, di); /* ar*bi + ai*br */ di = _mm256_fmadd_ps(ai, br, di); /* ar*bi + ai*br */
_mm256_store_ps(&dst[i], _mm256_mul_ps(dr, s)); _mm256_store_ps(&dst[i], dr);
_mm256_store_ps(&dst[i+8], _mm256_mul_ps(di, s)); _mm256_store_ps(&dst[i+8], di);
} }
#else #else
pffft_zconvolve(fft, a, b, dst, scale); struct fft_info *info = fft;
pffft_zconvolve(info->setup, a, b, dst, 1.0f);
#endif #endif
} }
void dsp_fft_cmuladd_avx2(void *obj, void *fft, void dsp_fft_cmuladd_avx2(void *obj, void *fft,
float * SPA_RESTRICT dst, const float * SPA_RESTRICT src, float * SPA_RESTRICT dst, const float * SPA_RESTRICT src,
const float * SPA_RESTRICT a, const float * SPA_RESTRICT b, const float * SPA_RESTRICT a, const float * SPA_RESTRICT b,
uint32_t len, const float scale) uint32_t len)
{ {
#ifdef HAVE_FFTW #ifdef HAVE_FFTW
__m256 s = _mm256_set1_ps(scale);
uint32_t i, plen = SPA_ROUND_UP_N(len, 8) * 2; uint32_t i, plen = SPA_ROUND_UP_N(len, 8) * 2;
for (i = 0; i < plen; i += 16) { for (i = 0; i < plen; i += 16) {
@ -361,12 +369,13 @@ void dsp_fft_cmuladd_avx2(void *obj, void *fft,
__m256 di = _mm256_mul_ps(ar, bi); __m256 di = _mm256_mul_ps(ar, bi);
dr = _mm256_fnmadd_ps(ai, bi, dr); /* ar*br - ai*bi */ dr = _mm256_fnmadd_ps(ai, bi, dr); /* ar*br - ai*bi */
di = _mm256_fmadd_ps(ai, br, di); /* ar*bi + ai*br */ di = _mm256_fmadd_ps(ai, br, di); /* ar*bi + ai*br */
_mm256_store_ps(&dst[i], _mm256_fmadd_ps(dr, s, _mm256_store_ps(&dst[i], _mm256_add_ps(dr,
_mm256_load_ps(&src[i]))); _mm256_load_ps(&src[i])));
_mm256_store_ps(&dst[i+8], _mm256_fmadd_ps(di, s, _mm256_store_ps(&dst[i+8], _mm256_add_ps(di,
_mm256_load_ps(&src[i+8]))); _mm256_load_ps(&src[i+8])));
} }
#else #else
pffft_zconvolve_accumulate(fft, a, b, src, dst, scale); struct fft_info *info = fft;
pffft_zconvolve_accumulate(info->setup, a, b, src, dst, 1.0f);
#endif #endif
} }

View file

@ -235,51 +235,55 @@ void dsp_delay_c(void *obj, float *buffer, uint32_t *pos, uint32_t n_buffer,
} }
} }
#ifdef HAVE_FFTW
struct fft_info { struct fft_info {
#ifdef HAVE_FFTW
fftwf_plan plan_r2c; fftwf_plan plan_r2c;
fftwf_plan plan_c2r; fftwf_plan plan_c2r;
#else
void *setup;
#endif
uint32_t size; uint32_t size;
}; };
#endif
void *dsp_fft_new_c(void *obj, uint32_t size, bool real) void *dsp_fft_new_c(void *obj, uint32_t size, bool real)
{ {
#ifdef HAVE_FFTW
struct fft_info *info = calloc(1, sizeof(struct fft_info)); struct fft_info *info = calloc(1, sizeof(struct fft_info));
float *rdata;
fftwf_complex *cdata;
if (info == NULL) if (info == NULL)
return NULL; return NULL;
info->size = size; info->size = size;
rdata = fftwf_alloc_real(size * 2); #ifdef HAVE_FFTW
cdata = fftwf_alloc_complex(size + 1); {
float *rdata;
fftwf_complex *cdata;
info->plan_r2c = fftwf_plan_dft_r2c_1d(size, rdata, cdata, FFTW_ESTIMATE); rdata = fftwf_alloc_real(size * 2);
info->plan_c2r = fftwf_plan_dft_c2r_1d(size, cdata, rdata, FFTW_ESTIMATE); cdata = fftwf_alloc_complex(size + 1);
fftwf_free(rdata); info->plan_r2c = fftwf_plan_dft_r2c_1d(size, rdata, cdata, FFTW_ESTIMATE);
fftwf_free(cdata); info->plan_c2r = fftwf_plan_dft_c2r_1d(size, cdata, rdata, FFTW_ESTIMATE);
return info; fftwf_free(rdata);
fftwf_free(cdata);
}
#else #else
return pffft_new_setup(size, real ? PFFFT_REAL : PFFFT_COMPLEX); info->setup = pffft_new_setup(size, real ? PFFFT_REAL : PFFFT_COMPLEX);
#endif #endif
return info;
} }
void dsp_fft_free_c(void *obj, void *fft) void dsp_fft_free_c(void *obj, void *fft)
{ {
#ifdef HAVE_FFTW
struct fft_info *info = fft; struct fft_info *info = fft;
#ifdef HAVE_FFTW
fftwf_destroy_plan(info->plan_r2c); fftwf_destroy_plan(info->plan_r2c);
fftwf_destroy_plan(info->plan_c2r); fftwf_destroy_plan(info->plan_c2r);
free(info);
#else #else
pffft_destroy_setup(fft); pffft_destroy_setup(info->setup);
#endif #endif
free(info);
} }
void *dsp_fft_memalloc_c(void *obj, uint32_t size, bool real) void *dsp_fft_memalloc_c(void *obj, uint32_t size, bool real)
@ -318,43 +322,51 @@ void dsp_fft_memclear_c(void *obj, void *data, uint32_t size, bool real)
void dsp_fft_run_c(void *obj, void *fft, int direction, void dsp_fft_run_c(void *obj, void *fft, int direction,
const float * SPA_RESTRICT src, float * SPA_RESTRICT dst) const float * SPA_RESTRICT src, float * SPA_RESTRICT dst)
{ {
#ifdef HAVE_FFTW
struct fft_info *info = fft; struct fft_info *info = fft;
if (direction > 0) #ifdef HAVE_FFTW
if (direction > 0) {
fftwf_execute_dft_r2c(info->plan_r2c, (float*)src, (fftwf_complex*)dst); fftwf_execute_dft_r2c(info->plan_r2c, (float*)src, (fftwf_complex*)dst);
else } else {
spa_fga_dsp_linear(obj, (float*)src, (float*)src,
1.0f / info->size, 0.0f, (info->size / 2 + 1) * 2);
fftwf_execute_dft_c2r(info->plan_c2r, (fftwf_complex*)src, dst); fftwf_execute_dft_c2r(info->plan_c2r, (fftwf_complex*)src, dst);
}
#else #else
pffft_transform(fft, src, dst, NULL, direction < 0 ? PFFFT_BACKWARD : PFFFT_FORWARD); if (direction < 0)
spa_fga_dsp_linear(obj, (float*)src, (float*)src,
1.0f / info->size, 0.0f, info->size);
pffft_transform(info->setup, src, dst, NULL, direction < 0 ? PFFFT_BACKWARD : PFFFT_FORWARD);
#endif #endif
} }
void dsp_fft_cmul_c(void *obj, void *fft, void dsp_fft_cmul_c(void *obj, void *fft,
float * SPA_RESTRICT dst, const float * SPA_RESTRICT a, float * SPA_RESTRICT dst, const float * SPA_RESTRICT a,
const float * SPA_RESTRICT b, uint32_t len, const float scale) const float * SPA_RESTRICT b, uint32_t len)
{ {
#ifdef HAVE_FFTW #ifdef HAVE_FFTW
for (uint32_t i = 0; i < len; i++) { for (uint32_t i = 0; i < len; i++) {
dst[2*i ] = (a[2*i] * b[2*i ] - a[2*i+1] * b[2*i+1]) * scale; dst[2*i ] = a[2*i] * b[2*i ] - a[2*i+1] * b[2*i+1];
dst[2*i+1] = (a[2*i] * b[2*i+1] + a[2*i+1] * b[2*i ]) * scale; dst[2*i+1] = a[2*i] * b[2*i+1] + a[2*i+1] * b[2*i ];
} }
#else #else
pffft_zconvolve(fft, a, b, dst, scale); struct fft_info *info = fft;
pffft_zconvolve(info->setup, a, b, dst, 1.0f);
#endif #endif
} }
void dsp_fft_cmuladd_c(void *obj, void *fft, void dsp_fft_cmuladd_c(void *obj, void *fft,
float * SPA_RESTRICT dst, const float * SPA_RESTRICT src, float * SPA_RESTRICT dst, const float * SPA_RESTRICT src,
const float * SPA_RESTRICT a, const float * SPA_RESTRICT b, const float * SPA_RESTRICT a, const float * SPA_RESTRICT b,
uint32_t len, const float scale) uint32_t len)
{ {
#ifdef HAVE_FFTW #ifdef HAVE_FFTW
for (uint32_t i = 0; i < len; i++) { for (uint32_t i = 0; i < len; i++) {
dst[2*i ] = src[2*i ] + (a[2*i] * b[2*i ] - a[2*i+1] * b[2*i+1]) * scale; dst[2*i ] = src[2*i ] + a[2*i] * b[2*i ] - a[2*i+1] * b[2*i+1];
dst[2*i+1] = src[2*i+1] + (a[2*i] * b[2*i+1] + a[2*i+1] * b[2*i ]) * scale; dst[2*i+1] = src[2*i+1] + a[2*i] * b[2*i+1] + a[2*i+1] * b[2*i ];
} }
#else #else
pffft_zconvolve_accumulate(fft, a, b, src, dst, scale); struct fft_info *info = fft;
pffft_zconvolve_accumulate(info->setup, a, b, src, dst, 1.0f);
#endif #endif
} }

View file

@ -50,12 +50,12 @@ void dsp_fft_run_##arch(void *obj, void *fft, int direction, \
#define MAKE_FFT_CMUL_FUNC(arch) \ #define MAKE_FFT_CMUL_FUNC(arch) \
void dsp_fft_cmul_##arch(void *obj, void *fft, \ void dsp_fft_cmul_##arch(void *obj, void *fft, \
float * SPA_RESTRICT dst, const float * SPA_RESTRICT a, \ float * SPA_RESTRICT dst, const float * SPA_RESTRICT a, \
const float * SPA_RESTRICT b, uint32_t len, const float scale) const float * SPA_RESTRICT b, uint32_t len)
#define MAKE_FFT_CMULADD_FUNC(arch) \ #define MAKE_FFT_CMULADD_FUNC(arch) \
void dsp_fft_cmuladd_##arch(void *obj, void *fft, \ void dsp_fft_cmuladd_##arch(void *obj, void *fft, \
float * dst, const float * src, \ float * dst, const float * src, \
const float * SPA_RESTRICT a, const float * SPA_RESTRICT b, \ const float * SPA_RESTRICT a, const float * SPA_RESTRICT b, \
uint32_t len, const float scale) uint32_t len)
MAKE_CLEAR_FUNC(c); MAKE_CLEAR_FUNC(c);

View file

@ -686,13 +686,18 @@ void dsp_delay_sse(void *obj, float *buffer, uint32_t *pos, uint32_t n_buffer, u
#define FFT_BLOCK 4 #define FFT_BLOCK 4
#ifdef HAVE_FFTW
struct fft_info { struct fft_info {
#ifdef HAVE_FFTW
fftwf_plan plan_r2c; fftwf_plan plan_r2c;
fftwf_plan plan_c2r; fftwf_plan plan_c2r;
#else
void *setup;
#endif
uint32_t size; uint32_t size;
}; };
#ifdef HAVE_FFTW
/* interleaved [r0,i0,r1,i1,r2,i2,r3,i3] -> blocked [r0,r1,r2,r3,i0,i1,i2,i3] */ /* interleaved [r0,i0,r1,i1,r2,i2,r3,i3] -> blocked [r0,r1,r2,r3,i0,i1,i2,i3] */
static void fft_blocked_sse(float *data, uint32_t len) static void fft_blocked_sse(float *data, uint32_t len)
{ {
@ -706,13 +711,14 @@ static void fft_blocked_sse(float *data, uint32_t len)
} }
} }
/* blocked [r0,r1,r2,r3,i0,i1,i2,i3] -> interleaved [r0,i0,r1,i1,r2,i2,r3,i3] */ /* blocked [r0,r1,r2,r3,i0,i1,i2,i3] -> interleaved [r0,i0,r1,i1,r2,i2,r3,i3] with scaling */
static void fft_interleaved_sse(float *data, uint32_t len) static void fft_interleaved_sse(float *data, uint32_t len, float scale)
{ {
uint32_t i; uint32_t i;
__m128 s = _mm_set1_ps(scale);
for (i = 0; i < len; i += FFT_BLOCK) { for (i = 0; i < len; i += FFT_BLOCK) {
__m128 r = _mm_load_ps(&data[0]); /* r0 r1 r2 r3 */ __m128 r = _mm_mul_ps(_mm_load_ps(&data[0]), s);
__m128 im = _mm_load_ps(&data[4]); /* i0 i1 i2 i3 */ __m128 im = _mm_mul_ps(_mm_load_ps(&data[4]), s);
_mm_store_ps(&data[0], _mm_unpacklo_ps(r, im)); _mm_store_ps(&data[0], _mm_unpacklo_ps(r, im));
_mm_store_ps(&data[4], _mm_unpackhi_ps(r, im)); _mm_store_ps(&data[4], _mm_unpackhi_ps(r, im));
data += 2 * FFT_BLOCK; data += 2 * FFT_BLOCK;
@ -744,27 +750,29 @@ void dsp_fft_memclear_sse(void *obj, void *data, uint32_t size, bool real)
void dsp_fft_run_sse(void *obj, void *fft, int direction, void dsp_fft_run_sse(void *obj, void *fft, int direction,
const float * SPA_RESTRICT src, float * SPA_RESTRICT dst) const float * SPA_RESTRICT src, float * SPA_RESTRICT dst)
{ {
#ifdef HAVE_FFTW
struct fft_info *info = fft; struct fft_info *info = fft;
#ifdef HAVE_FFTW
uint32_t freq_size = SPA_ROUND_UP_N(info->size / 2 + 1, FFT_BLOCK); uint32_t freq_size = SPA_ROUND_UP_N(info->size / 2 + 1, FFT_BLOCK);
if (direction > 0) { if (direction > 0) {
fftwf_execute_dft_r2c(info->plan_r2c, (float*)src, (fftwf_complex*)dst); fftwf_execute_dft_r2c(info->plan_r2c, (float*)src, (fftwf_complex*)dst);
fft_blocked_sse(dst, freq_size); fft_blocked_sse(dst, freq_size);
} else { } else {
fft_interleaved_sse((float*)src, freq_size); fft_interleaved_sse((float*)src, freq_size, 1.0f / info->size);
fftwf_execute_dft_c2r(info->plan_c2r, (fftwf_complex*)src, dst); fftwf_execute_dft_c2r(info->plan_c2r, (fftwf_complex*)src, dst);
} }
#else #else
pffft_transform(fft, src, dst, NULL, direction < 0 ? PFFFT_BACKWARD : PFFFT_FORWARD); if (direction < 0)
spa_fga_dsp_linear(obj, (float*)src, (float*)src,
1.0f / info->size, 0.0f, info->size);
pffft_transform(info->setup, src, dst, NULL, direction < 0 ? PFFFT_BACKWARD : PFFFT_FORWARD);
#endif #endif
} }
void dsp_fft_cmul_sse(void *obj, void *fft, void dsp_fft_cmul_sse(void *obj, void *fft,
float * SPA_RESTRICT dst, const float * SPA_RESTRICT a, float * SPA_RESTRICT dst, const float * SPA_RESTRICT a,
const float * SPA_RESTRICT b, uint32_t len, const float scale) const float * SPA_RESTRICT b, uint32_t len)
{ {
#ifdef HAVE_FFTW #ifdef HAVE_FFTW
__m128 s = _mm_set1_ps(scale);
uint32_t i, plen = SPA_ROUND_UP_N(len, FFT_BLOCK) * 2; uint32_t i, plen = SPA_ROUND_UP_N(len, FFT_BLOCK) * 2;
for (i = 0; i < plen; i += 2 * FFT_BLOCK) { for (i = 0; i < plen; i += 2 * FFT_BLOCK) {
@ -772,23 +780,23 @@ void dsp_fft_cmul_sse(void *obj, void *fft,
__m128 ai = _mm_load_ps(&a[i + FFT_BLOCK]); __m128 ai = _mm_load_ps(&a[i + FFT_BLOCK]);
__m128 br = _mm_load_ps(&b[i]); __m128 br = _mm_load_ps(&b[i]);
__m128 bi = _mm_load_ps(&b[i + FFT_BLOCK]); __m128 bi = _mm_load_ps(&b[i + FFT_BLOCK]);
__m128 dr = _mm_sub_ps(_mm_mul_ps(ar, br), _mm_mul_ps(ai, bi)); _mm_store_ps(&dst[i], _mm_sub_ps(
__m128 di = _mm_add_ps(_mm_mul_ps(ar, bi), _mm_mul_ps(ai, br)); _mm_mul_ps(ar, br), _mm_mul_ps(ai, bi)));
_mm_store_ps(&dst[i], _mm_mul_ps(dr, s)); _mm_store_ps(&dst[i + FFT_BLOCK], _mm_add_ps(
_mm_store_ps(&dst[i + FFT_BLOCK], _mm_mul_ps(di, s)); _mm_mul_ps(ar, bi), _mm_mul_ps(ai, br)));
} }
#else #else
pffft_zconvolve(fft, a, b, dst, scale); struct fft_info *info = fft;
pffft_zconvolve(info->setup, a, b, dst, 1.0f);
#endif #endif
} }
void dsp_fft_cmuladd_sse(void *obj, void *fft, void dsp_fft_cmuladd_sse(void *obj, void *fft,
float * SPA_RESTRICT dst, const float * SPA_RESTRICT src, float * SPA_RESTRICT dst, const float * SPA_RESTRICT src,
const float * SPA_RESTRICT a, const float * SPA_RESTRICT b, const float * SPA_RESTRICT a, const float * SPA_RESTRICT b,
uint32_t len, const float scale) uint32_t len)
{ {
#ifdef HAVE_FFTW #ifdef HAVE_FFTW
__m128 s = _mm_set1_ps(scale);
uint32_t i, plen = SPA_ROUND_UP_N(len, FFT_BLOCK) * 2; uint32_t i, plen = SPA_ROUND_UP_N(len, FFT_BLOCK) * 2;
for (i = 0; i < plen; i += 2 * FFT_BLOCK) { for (i = 0; i < plen; i += 2 * FFT_BLOCK) {
@ -796,14 +804,13 @@ void dsp_fft_cmuladd_sse(void *obj, void *fft,
__m128 ai = _mm_load_ps(&a[i + FFT_BLOCK]); __m128 ai = _mm_load_ps(&a[i + FFT_BLOCK]);
__m128 br = _mm_load_ps(&b[i]); __m128 br = _mm_load_ps(&b[i]);
__m128 bi = _mm_load_ps(&b[i + FFT_BLOCK]); __m128 bi = _mm_load_ps(&b[i + FFT_BLOCK]);
__m128 dr = _mm_sub_ps(_mm_mul_ps(ar, br), _mm_mul_ps(ai, bi));
__m128 di = _mm_add_ps(_mm_mul_ps(ar, bi), _mm_mul_ps(ai, br));
_mm_store_ps(&dst[i], _mm_add_ps(_mm_load_ps(&src[i]), _mm_store_ps(&dst[i], _mm_add_ps(_mm_load_ps(&src[i]),
_mm_mul_ps(dr, s))); _mm_sub_ps(_mm_mul_ps(ar, br), _mm_mul_ps(ai, bi))));
_mm_store_ps(&dst[i + FFT_BLOCK], _mm_add_ps(_mm_load_ps(&src[i + FFT_BLOCK]), _mm_store_ps(&dst[i + FFT_BLOCK], _mm_add_ps(_mm_load_ps(&src[i + FFT_BLOCK]),
_mm_mul_ps(di, s))); _mm_add_ps(_mm_mul_ps(ar, bi), _mm_mul_ps(ai, br))));
} }
#else #else
pffft_zconvolve_accumulate(fft, a, b, src, dst, scale); struct fft_info *info = fft;
pffft_zconvolve_accumulate(info->setup, a, b, src, dst, 1.0f);
#endif #endif
} }

View file

@ -43,11 +43,11 @@ struct spa_fga_dsp_methods {
const float * SPA_RESTRICT src, float * SPA_RESTRICT dst); const float * SPA_RESTRICT src, float * SPA_RESTRICT dst);
void (*fft_cmul) (void *obj, void *fft, void (*fft_cmul) (void *obj, void *fft,
float * SPA_RESTRICT dst, const float * SPA_RESTRICT a, float * SPA_RESTRICT dst, const float * SPA_RESTRICT a,
const float * SPA_RESTRICT b, uint32_t len, const float scale); const float * SPA_RESTRICT b, uint32_t len);
void (*fft_cmuladd) (void *obj, void *fft, void (*fft_cmuladd) (void *obj, void *fft,
float * dst, const float * src, float * dst, const float * src,
const float * SPA_RESTRICT a, const float * SPA_RESTRICT b, const float * SPA_RESTRICT a, const float * SPA_RESTRICT b,
uint32_t len, const float scale); uint32_t len);
void (*linear) (void *obj, void (*linear) (void *obj,
float * dst, const float * SPA_RESTRICT src, float * dst, const float * SPA_RESTRICT src,
const float mult, const float add, uint32_t n_samples); const float mult, const float add, uint32_t n_samples);
@ -123,18 +123,18 @@ static inline void spa_fga_dsp_fft_run(struct spa_fga_dsp *obj, void *fft, int d
} }
static inline void spa_fga_dsp_fft_cmul(struct spa_fga_dsp *obj, void *fft, static inline void spa_fga_dsp_fft_cmul(struct spa_fga_dsp *obj, void *fft,
float * SPA_RESTRICT dst, const float * SPA_RESTRICT a, float * SPA_RESTRICT dst, const float * SPA_RESTRICT a,
const float * SPA_RESTRICT b, uint32_t len, const float scale) const float * SPA_RESTRICT b, uint32_t len)
{ {
spa_api_method_v(spa_fga_dsp, &obj->iface, fft_cmul, 0, spa_api_method_v(spa_fga_dsp, &obj->iface, fft_cmul, 0,
fft, dst, a, b, len, scale); fft, dst, a, b, len);
} }
static inline void spa_fga_dsp_fft_cmuladd(struct spa_fga_dsp *obj, void *fft, static inline void spa_fga_dsp_fft_cmuladd(struct spa_fga_dsp *obj, void *fft,
float * dst, const float * src, float * dst, const float * src,
const float * SPA_RESTRICT a, const float * SPA_RESTRICT b, const float * SPA_RESTRICT a, const float * SPA_RESTRICT b,
uint32_t len, const float scale) uint32_t len)
{ {
spa_api_method_v(spa_fga_dsp, &obj->iface, fft_cmuladd, 0, spa_api_method_v(spa_fga_dsp, &obj->iface, fft_cmuladd, 0,
fft, dst, src, a, b, len, scale); fft, dst, src, a, b, len);
} }
static inline void spa_fga_dsp_linear(struct spa_fga_dsp *obj, static inline void spa_fga_dsp_linear(struct spa_fga_dsp *obj,
float * dst, const float * SPA_RESTRICT src, float * dst, const float * SPA_RESTRICT src,

View file

@ -39,8 +39,6 @@ struct partition {
struct ir *ir; struct ir *ir;
int time_idx; int time_idx;
int precalc_idx; int precalc_idx;
float scale;
}; };
struct convolver struct convolver
@ -191,7 +189,6 @@ static struct partition *partition_new(struct convolver *conv, int block,
spa_fga_dsp_fft_run(dsp, part->fft, 1, r->time_buffer[0], r->segments[j]); spa_fga_dsp_fft_run(dsp, part->fft, 1, r->time_buffer[0], r->segments[j]);
} }
} }
part->scale = 1.0f / part->time_size;
partition_reset(dsp, part); partition_reset(dsp, part);
return part; return part;
@ -218,7 +215,7 @@ static int partition_run(struct spa_fga_dsp *dsp, struct partition *part, const
part->freq, part->freq,
part->segments[current], part->segments[current],
r->segments[0], r->segments[0],
part->freq_size, part->scale); part->freq_size);
for (j = 1; j < part->n_segments; j++) { for (j = 1; j < part->n_segments; j++) {
if (++current == part->n_segments) if (++current == part->n_segments)
@ -229,7 +226,7 @@ static int partition_run(struct spa_fga_dsp *dsp, struct partition *part, const
part->freq, part->freq,
part->segments[current], part->segments[current],
r->segments[j], r->segments[j],
part->freq_size, part->scale); part->freq_size);
} }
spa_fga_dsp_fft_run(dsp, part->ifft, -1, part->freq, r->time_buffer[idx]); spa_fga_dsp_fft_run(dsp, part->ifft, -1, part->freq, r->time_buffer[idx]);