diff --git a/spa/plugins/filter-graph/audio-dsp-avx2.c b/spa/plugins/filter-graph/audio-dsp-avx2.c index a8adf0105..f73dc8b0a 100644 --- a/spa/plugins/filter-graph/audio-dsp-avx2.c +++ b/spa/plugins/filter-graph/audio-dsp-avx2.c @@ -239,13 +239,18 @@ void dsp_sum_avx2(void *obj, float *r, const float *a, const float *b, uint32_t #define FFT_BLOCK 8 -#ifdef HAVE_FFTW struct fft_info { +#ifdef HAVE_FFTW fftwf_plan plan_r2c; fftwf_plan plan_c2r; +#else + void *setup; +#endif uint32_t size; }; +#ifdef HAVE_FFTW + /* interleaved [r0,i0,...,r7,i7] -> blocked [r0..r7,i0..i7] */ static void fft_blocked_avx2(float *data, uint32_t len) { @@ -262,16 +267,17 @@ static void fft_blocked_avx2(float *data, uint32_t len) } } -/* blocked [r0..r7,i0..i7] -> interleaved [r0,i0,...,r7,i7] */ -static void fft_interleaved_avx2(float *data, uint32_t len) +/* blocked [r0..r7,i0..i7] -> interleaved [r0,i0,...,r7,i7] with scaling */ +static void fft_interleaved_avx2(float *data, uint32_t len, float scale) { const __m256i idx = _mm256_setr_epi32(0,4,1,5,2,6,3,7); + __m256 s = _mm256_set1_ps(scale); uint32_t i; for (i = 0; i < len; i += FFT_BLOCK) { - __m256 r = _mm256_load_ps(&data[0]); /* r0 r1 r2 r3 r4 r5 r6 r7 */ - __m256 im = _mm256_load_ps(&data[8]); /* i0 i1 i2 i3 i4 i5 i6 i7 */ - __m256 t0 = _mm256_permute2f128_ps(r, im, 0x20); /* r0 r1 r2 r3 i0 i1 i2 i3 */ - __m256 t1 = _mm256_permute2f128_ps(r, im, 0x31); /* r4 r5 r6 r7 i4 i5 i6 i7 */ + __m256 r = _mm256_mul_ps(_mm256_load_ps(&data[0]), s); + __m256 im = _mm256_mul_ps(_mm256_load_ps(&data[8]), s); + __m256 t0 = _mm256_permute2f128_ps(r, im, 0x20); + __m256 t1 = _mm256_permute2f128_ps(r, im, 0x31); _mm256_store_ps(&data[0], _mm256_permutevar8x32_ps(t0, idx)); _mm256_store_ps(&data[8], _mm256_permutevar8x32_ps(t1, idx)); data += 2 * FFT_BLOCK; @@ -303,27 +309,29 @@ void dsp_fft_memclear_avx2(void *obj, void *data, uint32_t size, bool real) void dsp_fft_run_avx2(void *obj, void *fft, int direction, const float * SPA_RESTRICT src, float * SPA_RESTRICT dst) { -#ifdef HAVE_FFTW struct fft_info *info = fft; +#ifdef HAVE_FFTW uint32_t freq_size = SPA_ROUND_UP_N(info->size / 2 + 1, FFT_BLOCK); if (direction > 0) { fftwf_execute_dft_r2c(info->plan_r2c, (float*)src, (fftwf_complex*)dst); fft_blocked_avx2(dst, freq_size); } else { - fft_interleaved_avx2((float*)src, freq_size); + fft_interleaved_avx2((float*)src, freq_size, 1.0f / info->size); fftwf_execute_dft_c2r(info->plan_c2r, (fftwf_complex*)src, dst); } #else - pffft_transform(fft, src, dst, NULL, direction < 0 ? PFFFT_BACKWARD : PFFFT_FORWARD); + if (direction < 0) + spa_fga_dsp_linear(obj, (float*)src, (float*)src, + 1.0f / info->size, 0.0f, info->size); + pffft_transform(info->setup, src, dst, NULL, direction < 0 ? PFFFT_BACKWARD : PFFFT_FORWARD); #endif } void dsp_fft_cmul_avx2(void *obj, void *fft, float * SPA_RESTRICT dst, const float * SPA_RESTRICT a, - const float * SPA_RESTRICT b, uint32_t len, const float scale) + const float * SPA_RESTRICT b, uint32_t len) { #ifdef HAVE_FFTW - __m256 s = _mm256_set1_ps(scale); uint32_t i, plen = SPA_ROUND_UP_N(len, 8) * 2; for (i = 0; i < plen; i += 16) { @@ -335,21 +343,21 @@ void dsp_fft_cmul_avx2(void *obj, void *fft, __m256 di = _mm256_mul_ps(ar, bi); dr = _mm256_fnmadd_ps(ai, bi, dr); /* ar*br - ai*bi */ di = _mm256_fmadd_ps(ai, br, di); /* ar*bi + ai*br */ - _mm256_store_ps(&dst[i], _mm256_mul_ps(dr, s)); - _mm256_store_ps(&dst[i+8], _mm256_mul_ps(di, s)); + _mm256_store_ps(&dst[i], dr); + _mm256_store_ps(&dst[i+8], di); } #else - pffft_zconvolve(fft, a, b, dst, scale); + struct fft_info *info = fft; + pffft_zconvolve(info->setup, a, b, dst, 1.0f); #endif } void dsp_fft_cmuladd_avx2(void *obj, void *fft, float * SPA_RESTRICT dst, const float * SPA_RESTRICT src, const float * SPA_RESTRICT a, const float * SPA_RESTRICT b, - uint32_t len, const float scale) + uint32_t len) { #ifdef HAVE_FFTW - __m256 s = _mm256_set1_ps(scale); uint32_t i, plen = SPA_ROUND_UP_N(len, 8) * 2; for (i = 0; i < plen; i += 16) { @@ -361,12 +369,13 @@ void dsp_fft_cmuladd_avx2(void *obj, void *fft, __m256 di = _mm256_mul_ps(ar, bi); dr = _mm256_fnmadd_ps(ai, bi, dr); /* ar*br - ai*bi */ di = _mm256_fmadd_ps(ai, br, di); /* ar*bi + ai*br */ - _mm256_store_ps(&dst[i], _mm256_fmadd_ps(dr, s, + _mm256_store_ps(&dst[i], _mm256_add_ps(dr, _mm256_load_ps(&src[i]))); - _mm256_store_ps(&dst[i+8], _mm256_fmadd_ps(di, s, + _mm256_store_ps(&dst[i+8], _mm256_add_ps(di, _mm256_load_ps(&src[i+8]))); } #else - pffft_zconvolve_accumulate(fft, a, b, src, dst, scale); + struct fft_info *info = fft; + pffft_zconvolve_accumulate(info->setup, a, b, src, dst, 1.0f); #endif } diff --git a/spa/plugins/filter-graph/audio-dsp-c.c b/spa/plugins/filter-graph/audio-dsp-c.c index 6b829e48e..312f8907d 100644 --- a/spa/plugins/filter-graph/audio-dsp-c.c +++ b/spa/plugins/filter-graph/audio-dsp-c.c @@ -235,51 +235,55 @@ void dsp_delay_c(void *obj, float *buffer, uint32_t *pos, uint32_t n_buffer, } } -#ifdef HAVE_FFTW struct fft_info { +#ifdef HAVE_FFTW fftwf_plan plan_r2c; fftwf_plan plan_c2r; +#else + void *setup; +#endif uint32_t size; }; -#endif void *dsp_fft_new_c(void *obj, uint32_t size, bool real) { -#ifdef HAVE_FFTW struct fft_info *info = calloc(1, sizeof(struct fft_info)); - float *rdata; - fftwf_complex *cdata; if (info == NULL) return NULL; info->size = size; - rdata = fftwf_alloc_real(size * 2); - cdata = fftwf_alloc_complex(size + 1); +#ifdef HAVE_FFTW + { + float *rdata; + fftwf_complex *cdata; - info->plan_r2c = fftwf_plan_dft_r2c_1d(size, rdata, cdata, FFTW_ESTIMATE); - info->plan_c2r = fftwf_plan_dft_c2r_1d(size, cdata, rdata, FFTW_ESTIMATE); + rdata = fftwf_alloc_real(size * 2); + cdata = fftwf_alloc_complex(size + 1); - fftwf_free(rdata); - fftwf_free(cdata); + info->plan_r2c = fftwf_plan_dft_r2c_1d(size, rdata, cdata, FFTW_ESTIMATE); + info->plan_c2r = fftwf_plan_dft_c2r_1d(size, cdata, rdata, FFTW_ESTIMATE); - return info; + fftwf_free(rdata); + fftwf_free(cdata); + } #else - return pffft_new_setup(size, real ? PFFFT_REAL : PFFFT_COMPLEX); + info->setup = pffft_new_setup(size, real ? PFFFT_REAL : PFFFT_COMPLEX); #endif + return info; } void dsp_fft_free_c(void *obj, void *fft) { -#ifdef HAVE_FFTW struct fft_info *info = fft; +#ifdef HAVE_FFTW fftwf_destroy_plan(info->plan_r2c); fftwf_destroy_plan(info->plan_c2r); - free(info); #else - pffft_destroy_setup(fft); + pffft_destroy_setup(info->setup); #endif + free(info); } void *dsp_fft_memalloc_c(void *obj, uint32_t size, bool real) @@ -318,43 +322,51 @@ void dsp_fft_memclear_c(void *obj, void *data, uint32_t size, bool real) void dsp_fft_run_c(void *obj, void *fft, int direction, const float * SPA_RESTRICT src, float * SPA_RESTRICT dst) { -#ifdef HAVE_FFTW struct fft_info *info = fft; - if (direction > 0) +#ifdef HAVE_FFTW + if (direction > 0) { fftwf_execute_dft_r2c(info->plan_r2c, (float*)src, (fftwf_complex*)dst); - else + } else { + spa_fga_dsp_linear(obj, (float*)src, (float*)src, + 1.0f / info->size, 0.0f, (info->size / 2 + 1) * 2); fftwf_execute_dft_c2r(info->plan_c2r, (fftwf_complex*)src, dst); + } #else - pffft_transform(fft, src, dst, NULL, direction < 0 ? PFFFT_BACKWARD : PFFFT_FORWARD); + if (direction < 0) + spa_fga_dsp_linear(obj, (float*)src, (float*)src, + 1.0f / info->size, 0.0f, info->size); + pffft_transform(info->setup, src, dst, NULL, direction < 0 ? PFFFT_BACKWARD : PFFFT_FORWARD); #endif } void dsp_fft_cmul_c(void *obj, void *fft, float * SPA_RESTRICT dst, const float * SPA_RESTRICT a, - const float * SPA_RESTRICT b, uint32_t len, const float scale) + const float * SPA_RESTRICT b, uint32_t len) { #ifdef HAVE_FFTW for (uint32_t i = 0; i < len; i++) { - dst[2*i ] = (a[2*i] * b[2*i ] - a[2*i+1] * b[2*i+1]) * scale; - dst[2*i+1] = (a[2*i] * b[2*i+1] + a[2*i+1] * b[2*i ]) * scale; + dst[2*i ] = a[2*i] * b[2*i ] - a[2*i+1] * b[2*i+1]; + dst[2*i+1] = a[2*i] * b[2*i+1] + a[2*i+1] * b[2*i ]; } #else - pffft_zconvolve(fft, a, b, dst, scale); + struct fft_info *info = fft; + pffft_zconvolve(info->setup, a, b, dst, 1.0f); #endif } void dsp_fft_cmuladd_c(void *obj, void *fft, float * SPA_RESTRICT dst, const float * SPA_RESTRICT src, const float * SPA_RESTRICT a, const float * SPA_RESTRICT b, - uint32_t len, const float scale) + uint32_t len) { #ifdef HAVE_FFTW for (uint32_t i = 0; i < len; i++) { - dst[2*i ] = src[2*i ] + (a[2*i] * b[2*i ] - a[2*i+1] * b[2*i+1]) * scale; - dst[2*i+1] = src[2*i+1] + (a[2*i] * b[2*i+1] + a[2*i+1] * b[2*i ]) * scale; + dst[2*i ] = src[2*i ] + a[2*i] * b[2*i ] - a[2*i+1] * b[2*i+1]; + dst[2*i+1] = src[2*i+1] + a[2*i] * b[2*i+1] + a[2*i+1] * b[2*i ]; } #else - pffft_zconvolve_accumulate(fft, a, b, src, dst, scale); + struct fft_info *info = fft; + pffft_zconvolve_accumulate(info->setup, a, b, src, dst, 1.0f); #endif } diff --git a/spa/plugins/filter-graph/audio-dsp-impl.h b/spa/plugins/filter-graph/audio-dsp-impl.h index 02bdb7e84..ed98e5a93 100644 --- a/spa/plugins/filter-graph/audio-dsp-impl.h +++ b/spa/plugins/filter-graph/audio-dsp-impl.h @@ -50,12 +50,12 @@ void dsp_fft_run_##arch(void *obj, void *fft, int direction, \ #define MAKE_FFT_CMUL_FUNC(arch) \ void dsp_fft_cmul_##arch(void *obj, void *fft, \ float * SPA_RESTRICT dst, const float * SPA_RESTRICT a, \ - const float * SPA_RESTRICT b, uint32_t len, const float scale) + const float * SPA_RESTRICT b, uint32_t len) #define MAKE_FFT_CMULADD_FUNC(arch) \ void dsp_fft_cmuladd_##arch(void *obj, void *fft, \ float * dst, const float * src, \ const float * SPA_RESTRICT a, const float * SPA_RESTRICT b, \ - uint32_t len, const float scale) + uint32_t len) MAKE_CLEAR_FUNC(c); diff --git a/spa/plugins/filter-graph/audio-dsp-sse.c b/spa/plugins/filter-graph/audio-dsp-sse.c index 9323b3a81..0b2051a63 100644 --- a/spa/plugins/filter-graph/audio-dsp-sse.c +++ b/spa/plugins/filter-graph/audio-dsp-sse.c @@ -686,13 +686,18 @@ void dsp_delay_sse(void *obj, float *buffer, uint32_t *pos, uint32_t n_buffer, u #define FFT_BLOCK 4 -#ifdef HAVE_FFTW struct fft_info { +#ifdef HAVE_FFTW fftwf_plan plan_r2c; fftwf_plan plan_c2r; +#else + void *setup; +#endif uint32_t size; }; +#ifdef HAVE_FFTW + /* interleaved [r0,i0,r1,i1,r2,i2,r3,i3] -> blocked [r0,r1,r2,r3,i0,i1,i2,i3] */ static void fft_blocked_sse(float *data, uint32_t len) { @@ -706,13 +711,14 @@ static void fft_blocked_sse(float *data, uint32_t len) } } -/* blocked [r0,r1,r2,r3,i0,i1,i2,i3] -> interleaved [r0,i0,r1,i1,r2,i2,r3,i3] */ -static void fft_interleaved_sse(float *data, uint32_t len) +/* blocked [r0,r1,r2,r3,i0,i1,i2,i3] -> interleaved [r0,i0,r1,i1,r2,i2,r3,i3] with scaling */ +static void fft_interleaved_sse(float *data, uint32_t len, float scale) { uint32_t i; + __m128 s = _mm_set1_ps(scale); for (i = 0; i < len; i += FFT_BLOCK) { - __m128 r = _mm_load_ps(&data[0]); /* r0 r1 r2 r3 */ - __m128 im = _mm_load_ps(&data[4]); /* i0 i1 i2 i3 */ + __m128 r = _mm_mul_ps(_mm_load_ps(&data[0]), s); + __m128 im = _mm_mul_ps(_mm_load_ps(&data[4]), s); _mm_store_ps(&data[0], _mm_unpacklo_ps(r, im)); _mm_store_ps(&data[4], _mm_unpackhi_ps(r, im)); data += 2 * FFT_BLOCK; @@ -744,27 +750,29 @@ void dsp_fft_memclear_sse(void *obj, void *data, uint32_t size, bool real) void dsp_fft_run_sse(void *obj, void *fft, int direction, const float * SPA_RESTRICT src, float * SPA_RESTRICT dst) { -#ifdef HAVE_FFTW struct fft_info *info = fft; +#ifdef HAVE_FFTW uint32_t freq_size = SPA_ROUND_UP_N(info->size / 2 + 1, FFT_BLOCK); if (direction > 0) { fftwf_execute_dft_r2c(info->plan_r2c, (float*)src, (fftwf_complex*)dst); fft_blocked_sse(dst, freq_size); } else { - fft_interleaved_sse((float*)src, freq_size); + fft_interleaved_sse((float*)src, freq_size, 1.0f / info->size); fftwf_execute_dft_c2r(info->plan_c2r, (fftwf_complex*)src, dst); } #else - pffft_transform(fft, src, dst, NULL, direction < 0 ? PFFFT_BACKWARD : PFFFT_FORWARD); + if (direction < 0) + spa_fga_dsp_linear(obj, (float*)src, (float*)src, + 1.0f / info->size, 0.0f, info->size); + pffft_transform(info->setup, src, dst, NULL, direction < 0 ? PFFFT_BACKWARD : PFFFT_FORWARD); #endif } void dsp_fft_cmul_sse(void *obj, void *fft, float * SPA_RESTRICT dst, const float * SPA_RESTRICT a, - const float * SPA_RESTRICT b, uint32_t len, const float scale) + const float * SPA_RESTRICT b, uint32_t len) { #ifdef HAVE_FFTW - __m128 s = _mm_set1_ps(scale); uint32_t i, plen = SPA_ROUND_UP_N(len, FFT_BLOCK) * 2; for (i = 0; i < plen; i += 2 * FFT_BLOCK) { @@ -772,23 +780,23 @@ void dsp_fft_cmul_sse(void *obj, void *fft, __m128 ai = _mm_load_ps(&a[i + FFT_BLOCK]); __m128 br = _mm_load_ps(&b[i]); __m128 bi = _mm_load_ps(&b[i + FFT_BLOCK]); - __m128 dr = _mm_sub_ps(_mm_mul_ps(ar, br), _mm_mul_ps(ai, bi)); - __m128 di = _mm_add_ps(_mm_mul_ps(ar, bi), _mm_mul_ps(ai, br)); - _mm_store_ps(&dst[i], _mm_mul_ps(dr, s)); - _mm_store_ps(&dst[i + FFT_BLOCK], _mm_mul_ps(di, s)); + _mm_store_ps(&dst[i], _mm_sub_ps( + _mm_mul_ps(ar, br), _mm_mul_ps(ai, bi))); + _mm_store_ps(&dst[i + FFT_BLOCK], _mm_add_ps( + _mm_mul_ps(ar, bi), _mm_mul_ps(ai, br))); } #else - pffft_zconvolve(fft, a, b, dst, scale); + struct fft_info *info = fft; + pffft_zconvolve(info->setup, a, b, dst, 1.0f); #endif } void dsp_fft_cmuladd_sse(void *obj, void *fft, float * SPA_RESTRICT dst, const float * SPA_RESTRICT src, const float * SPA_RESTRICT a, const float * SPA_RESTRICT b, - uint32_t len, const float scale) + uint32_t len) { #ifdef HAVE_FFTW - __m128 s = _mm_set1_ps(scale); uint32_t i, plen = SPA_ROUND_UP_N(len, FFT_BLOCK) * 2; for (i = 0; i < plen; i += 2 * FFT_BLOCK) { @@ -796,14 +804,13 @@ void dsp_fft_cmuladd_sse(void *obj, void *fft, __m128 ai = _mm_load_ps(&a[i + FFT_BLOCK]); __m128 br = _mm_load_ps(&b[i]); __m128 bi = _mm_load_ps(&b[i + FFT_BLOCK]); - __m128 dr = _mm_sub_ps(_mm_mul_ps(ar, br), _mm_mul_ps(ai, bi)); - __m128 di = _mm_add_ps(_mm_mul_ps(ar, bi), _mm_mul_ps(ai, br)); _mm_store_ps(&dst[i], _mm_add_ps(_mm_load_ps(&src[i]), - _mm_mul_ps(dr, s))); + _mm_sub_ps(_mm_mul_ps(ar, br), _mm_mul_ps(ai, bi)))); _mm_store_ps(&dst[i + FFT_BLOCK], _mm_add_ps(_mm_load_ps(&src[i + FFT_BLOCK]), - _mm_mul_ps(di, s))); + _mm_add_ps(_mm_mul_ps(ar, bi), _mm_mul_ps(ai, br)))); } #else - pffft_zconvolve_accumulate(fft, a, b, src, dst, scale); + struct fft_info *info = fft; + pffft_zconvolve_accumulate(info->setup, a, b, src, dst, 1.0f); #endif } diff --git a/spa/plugins/filter-graph/audio-dsp.h b/spa/plugins/filter-graph/audio-dsp.h index 81cbdaf36..446d72f1d 100644 --- a/spa/plugins/filter-graph/audio-dsp.h +++ b/spa/plugins/filter-graph/audio-dsp.h @@ -43,11 +43,11 @@ struct spa_fga_dsp_methods { const float * SPA_RESTRICT src, float * SPA_RESTRICT dst); void (*fft_cmul) (void *obj, void *fft, float * SPA_RESTRICT dst, const float * SPA_RESTRICT a, - const float * SPA_RESTRICT b, uint32_t len, const float scale); + const float * SPA_RESTRICT b, uint32_t len); void (*fft_cmuladd) (void *obj, void *fft, float * dst, const float * src, const float * SPA_RESTRICT a, const float * SPA_RESTRICT b, - uint32_t len, const float scale); + uint32_t len); void (*linear) (void *obj, float * dst, const float * SPA_RESTRICT src, const float mult, const float add, uint32_t n_samples); @@ -123,18 +123,18 @@ static inline void spa_fga_dsp_fft_run(struct spa_fga_dsp *obj, void *fft, int d } static inline void spa_fga_dsp_fft_cmul(struct spa_fga_dsp *obj, void *fft, float * SPA_RESTRICT dst, const float * SPA_RESTRICT a, - const float * SPA_RESTRICT b, uint32_t len, const float scale) + const float * SPA_RESTRICT b, uint32_t len) { spa_api_method_v(spa_fga_dsp, &obj->iface, fft_cmul, 0, - fft, dst, a, b, len, scale); + fft, dst, a, b, len); } static inline void spa_fga_dsp_fft_cmuladd(struct spa_fga_dsp *obj, void *fft, float * dst, const float * src, const float * SPA_RESTRICT a, const float * SPA_RESTRICT b, - uint32_t len, const float scale) + uint32_t len) { spa_api_method_v(spa_fga_dsp, &obj->iface, fft_cmuladd, 0, - fft, dst, src, a, b, len, scale); + fft, dst, src, a, b, len); } static inline void spa_fga_dsp_linear(struct spa_fga_dsp *obj, float * dst, const float * SPA_RESTRICT src, diff --git a/spa/plugins/filter-graph/convolver.c b/spa/plugins/filter-graph/convolver.c index 0233193f8..b3de7b314 100644 --- a/spa/plugins/filter-graph/convolver.c +++ b/spa/plugins/filter-graph/convolver.c @@ -39,8 +39,6 @@ struct partition { struct ir *ir; int time_idx; int precalc_idx; - - float scale; }; struct convolver @@ -191,7 +189,6 @@ static struct partition *partition_new(struct convolver *conv, int block, spa_fga_dsp_fft_run(dsp, part->fft, 1, r->time_buffer[0], r->segments[j]); } } - part->scale = 1.0f / part->time_size; partition_reset(dsp, part); return part; @@ -218,7 +215,7 @@ static int partition_run(struct spa_fga_dsp *dsp, struct partition *part, const part->freq, part->segments[current], r->segments[0], - part->freq_size, part->scale); + part->freq_size); for (j = 1; j < part->n_segments; j++) { if (++current == part->n_segments) @@ -229,7 +226,7 @@ static int partition_run(struct spa_fga_dsp *dsp, struct partition *part, const part->freq, part->segments[current], r->segments[j], - part->freq_size, part->scale); + part->freq_size); } spa_fga_dsp_fft_run(dsp, part->ifft, -1, part->freq, r->time_buffer[idx]);