diff --git a/src/modules/module-filter-chain/dsp-ops-avx.c b/src/modules/module-filter-chain/dsp-ops-avx.c
index 3a771d567..15f2a1472 100644
--- a/src/modules/module-filter-chain/dsp-ops-avx.c
+++ b/src/modules/module-filter-chain/dsp-ops-avx.c
@@ -187,6 +187,7 @@ void dsp_fft_cmuladd_avx(struct dsp_ops *ops, void *fft,
 
 	if (SPA_IS_ALIGNED(a, 32) &&
 	    SPA_IS_ALIGNED(b, 32) &&
+	    SPA_IS_ALIGNED(src, 32) &&
 	    SPA_IS_ALIGNED(dst, 32))
 		unrolled = len & ~7;
 	else
@@ -201,16 +202,16 @@ void dsp_fft_cmuladd_avx(struct dsp_ops *ops, void *fft,
 		dd[1] = _mm256_mul_pz(aa[1], bb[1]);
 		dd[0] = _mm256_mul_ps(dd[0], s);
 		dd[1] = _mm256_mul_ps(dd[1], s);
-		t[0] = _mm256_load_ps(&dst[2*i]);
-		t[1] = _mm256_load_ps(&dst[2*i+8]);
+		t[0] = _mm256_load_ps(&src[2*i]);
+		t[1] = _mm256_load_ps(&src[2*i+8]);
 		t[0] = _mm256_add_ps(t[0], dd[0]);
 		t[1] = _mm256_add_ps(t[1], dd[1]);
 		_mm256_store_ps(&dst[2*i], t[0]);
 		_mm256_store_ps(&dst[2*i+8], t[1]);
 	}
 	for (; i < len; i++) {
-		dst[2*i  ] += (a[2*i] * b[2*i  ] - a[2*i+1] * b[2*i+1]) * scale;
-		dst[2*i+1] += (a[2*i] * b[2*i+1] + a[2*i+1] * b[2*i  ]) * scale;
+		dst[2*i  ] = src[2*i  ] + (a[2*i] * b[2*i  ] - a[2*i+1] * b[2*i+1]) * scale;
+		dst[2*i+1] = src[2*i+1] + (a[2*i] * b[2*i+1] + a[2*i+1] * b[2*i  ]) * scale;
 	}
 #else
 	pffft_zconvolve_accumulate(fft, a, b, src, dst, scale);
diff --git a/src/modules/module-filter-chain/dsp-ops-c.c b/src/modules/module-filter-chain/dsp-ops-c.c
index 59814d872..16170cb30 100644
--- a/src/modules/module-filter-chain/dsp-ops-c.c
+++ b/src/modules/module-filter-chain/dsp-ops-c.c
@@ -277,7 +277,7 @@ void dsp_fft_cmul_c(struct dsp_ops *ops, void *fft,
 		const float * SPA_RESTRICT b, uint32_t len, const float scale)
 {
 #ifdef HAVE_FFTW
-	for (uint32_t i = 0; i <= len; i++) {
+	for (uint32_t i = 0; i < len; i++) {
 		dst[2*i  ] = (a[2*i] * b[2*i  ] - a[2*i+1] * b[2*i+1]) * scale;
 		dst[2*i+1] = (a[2*i] * b[2*i+1] + a[2*i+1] * b[2*i  ]) * scale;
 	}
@@ -292,9 +292,9 @@ void dsp_fft_cmuladd_c(struct dsp_ops *ops, void *fft,
 		uint32_t len, const float scale)
 {
 #ifdef HAVE_FFTW
-	for (uint32_t i = 0; i <= len; i++) {
-		dst[2*i  ] += (a[2*i] * b[2*i  ] - a[2*i+1] * b[2*i+1]) * scale;
-		dst[2*i+1] += (a[2*i] * b[2*i+1] + a[2*i+1] * b[2*i  ]) * scale;
+	for (uint32_t i = 0; i < len; i++) {
+		dst[2*i  ] = src[2*i  ] + (a[2*i] * b[2*i  ] - a[2*i+1] * b[2*i+1]) * scale;
+		dst[2*i+1] = src[2*i+1] + (a[2*i] * b[2*i+1] + a[2*i+1] * b[2*i  ]) * scale;
 	}
 #else
 	pffft_zconvolve_accumulate(fft, a, b, src, dst, scale);
diff --git a/src/modules/module-filter-chain/dsp-ops-sse.c b/src/modules/module-filter-chain/dsp-ops-sse.c
index 0f4dea9e2..9e7d7b753 100644
--- a/src/modules/module-filter-chain/dsp-ops-sse.c
+++ b/src/modules/module-filter-chain/dsp-ops-sse.c
@@ -608,6 +608,7 @@ void dsp_fft_cmuladd_sse(struct dsp_ops *ops, void *fft,
 
 	if (SPA_IS_ALIGNED(a, 16) &&
 	    SPA_IS_ALIGNED(b, 16) &&
+	    SPA_IS_ALIGNED(src, 16) &&
 	    SPA_IS_ALIGNED(dst, 16))
 		unrolled = len & ~3;
 	else
@@ -621,16 +622,16 @@ void dsp_fft_cmuladd_sse(struct dsp_ops *ops, void *fft,
 		_mm_mul_pz(aa, bb, dd);
 		dd[0] = _mm_mul_ps(dd[0], s);
 		dd[1] = _mm_mul_ps(dd[1], s);
-		t[0] = _mm_load_ps(&dst[2*i]);
-		t[1] = _mm_load_ps(&dst[2*i+4]);
+		t[0] = _mm_load_ps(&src[2*i]);
+		t[1] = _mm_load_ps(&src[2*i+4]);
 		t[0] = _mm_add_ps(t[0], dd[0]);
 		t[1] = _mm_add_ps(t[1], dd[1]);
 		_mm_store_ps(&dst[2*i], t[0]);
 		_mm_store_ps(&dst[2*i+4], t[1]);
 	}
 	for (; i < len; i++) {
-		dst[2*i  ] += (a[2*i] * b[2*i  ] - a[2*i+1] * b[2*i+1]) * scale;
-		dst[2*i+1] += (a[2*i] * b[2*i+1] + a[2*i+1] * b[2*i  ]) * scale;
+		dst[2*i  ] = src[2*i  ] + (a[2*i] * b[2*i  ] - a[2*i+1] * b[2*i+1]) * scale;
+		dst[2*i+1] = src[2*i+1] + (a[2*i] * b[2*i+1] + a[2*i+1] * b[2*i  ]) * scale;
 	}
 #else
 	pffft_zconvolve_accumulate(fft, a, b, src, dst, scale);
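
For reference, a minimal standalone C sketch of the semantics the fixed cmuladd paths implement: dst = src + scale * (a * b) over len interleaved complex floats. The loop bound i < len replaces the old i <= len, which read and wrote one complex pair past the end, and taking the addend from src (instead of the old += into dst) means dst no longer has to already hold the accumulator. The helper name cmuladd_ref is illustrative only and not part of the patch or the PipeWire API.

#include <stdint.h>

/* Reference complex multiply-add on interleaved (re,im) float pairs:
 * dst[i] = src[i] + scale * (a[i] * b[i]) for len complex values.
 * Matches the scalar tail loops in the patch above. */
static void cmuladd_ref(float *dst, const float *src,
		const float *a, const float *b,
		uint32_t len, float scale)
{
	for (uint32_t i = 0; i < len; i++) {
		/* (ar + i*ai)(br + i*bi) = (ar*br - ai*bi) + i(ar*bi + ai*br) */
		float re = a[2*i] * b[2*i  ] - a[2*i+1] * b[2*i+1];
		float im = a[2*i] * b[2*i+1] + a[2*i+1] * b[2*i  ];
		dst[2*i  ] = src[2*i  ] + re * scale;
		dst[2*i+1] = src[2*i+1] + im * scale;
	}
}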