filter-chain: fix cmuladd

cmuladd must add the scaled complex product to src and store the sum in
dst, instead of accumulating into dst. Also use the right loop bound in
the C fallback (i < len, not i <= len).
This commit is contained in:
Wim Taymans 2024-10-29 11:55:03 +01:00
parent b2a70f5933
commit 3f41b93aa5
3 changed files with 14 additions and 12 deletions

View file

@ -187,6 +187,7 @@ void dsp_fft_cmuladd_avx(struct dsp_ops *ops, void *fft,
if (SPA_IS_ALIGNED(a, 32) && if (SPA_IS_ALIGNED(a, 32) &&
SPA_IS_ALIGNED(b, 32) && SPA_IS_ALIGNED(b, 32) &&
SPA_IS_ALIGNED(src, 32) &&
SPA_IS_ALIGNED(dst, 32)) SPA_IS_ALIGNED(dst, 32))
unrolled = len & ~7; unrolled = len & ~7;
else else
@ -201,16 +202,16 @@ void dsp_fft_cmuladd_avx(struct dsp_ops *ops, void *fft,
dd[1] = _mm256_mul_pz(aa[1], bb[1]); dd[1] = _mm256_mul_pz(aa[1], bb[1]);
dd[0] = _mm256_mul_ps(dd[0], s); dd[0] = _mm256_mul_ps(dd[0], s);
dd[1] = _mm256_mul_ps(dd[1], s); dd[1] = _mm256_mul_ps(dd[1], s);
t[0] = _mm256_load_ps(&dst[2*i]); t[0] = _mm256_load_ps(&src[2*i]);
t[1] = _mm256_load_ps(&dst[2*i+8]); t[1] = _mm256_load_ps(&src[2*i+8]);
t[0] = _mm256_add_ps(t[0], dd[0]); t[0] = _mm256_add_ps(t[0], dd[0]);
t[1] = _mm256_add_ps(t[1], dd[1]); t[1] = _mm256_add_ps(t[1], dd[1]);
_mm256_store_ps(&dst[2*i], t[0]); _mm256_store_ps(&dst[2*i], t[0]);
_mm256_store_ps(&dst[2*i+8], t[1]); _mm256_store_ps(&dst[2*i+8], t[1]);
} }
for (; i < len; i++) { for (; i < len; i++) {
dst[2*i ] += (a[2*i] * b[2*i ] - a[2*i+1] * b[2*i+1]) * scale; dst[2*i ] = src[2*i ] + (a[2*i] * b[2*i ] - a[2*i+1] * b[2*i+1]) * scale;
dst[2*i+1] += (a[2*i] * b[2*i+1] + a[2*i+1] * b[2*i ]) * scale; dst[2*i+1] = src[2*i+1] + (a[2*i] * b[2*i+1] + a[2*i+1] * b[2*i ]) * scale;
} }
#else #else
pffft_zconvolve_accumulate(fft, a, b, src, dst, scale); pffft_zconvolve_accumulate(fft, a, b, src, dst, scale);

View file

@ -277,7 +277,7 @@ void dsp_fft_cmul_c(struct dsp_ops *ops, void *fft,
const float * SPA_RESTRICT b, uint32_t len, const float scale) const float * SPA_RESTRICT b, uint32_t len, const float scale)
{ {
#ifdef HAVE_FFTW #ifdef HAVE_FFTW
for (uint32_t i = 0; i <= len; i++) { for (uint32_t i = 0; i < len; i++) {
dst[2*i ] = (a[2*i] * b[2*i ] - a[2*i+1] * b[2*i+1]) * scale; dst[2*i ] = (a[2*i] * b[2*i ] - a[2*i+1] * b[2*i+1]) * scale;
dst[2*i+1] = (a[2*i] * b[2*i+1] + a[2*i+1] * b[2*i ]) * scale; dst[2*i+1] = (a[2*i] * b[2*i+1] + a[2*i+1] * b[2*i ]) * scale;
} }
@ -292,9 +292,9 @@ void dsp_fft_cmuladd_c(struct dsp_ops *ops, void *fft,
uint32_t len, const float scale) uint32_t len, const float scale)
{ {
#ifdef HAVE_FFTW #ifdef HAVE_FFTW
for (uint32_t i = 0; i <= len; i++) { for (uint32_t i = 0; i < len; i++) {
dst[2*i ] += (a[2*i] * b[2*i ] - a[2*i+1] * b[2*i+1]) * scale; dst[2*i ] = src[2*i ] + (a[2*i] * b[2*i ] - a[2*i+1] * b[2*i+1]) * scale;
dst[2*i+1] += (a[2*i] * b[2*i+1] + a[2*i+1] * b[2*i ]) * scale; dst[2*i+1] = src[2*i+1] + (a[2*i] * b[2*i+1] + a[2*i+1] * b[2*i ]) * scale;
} }
#else #else
pffft_zconvolve_accumulate(fft, a, b, src, dst, scale); pffft_zconvolve_accumulate(fft, a, b, src, dst, scale);

View file

@ -608,6 +608,7 @@ void dsp_fft_cmuladd_sse(struct dsp_ops *ops, void *fft,
if (SPA_IS_ALIGNED(a, 16) && if (SPA_IS_ALIGNED(a, 16) &&
SPA_IS_ALIGNED(b, 16) && SPA_IS_ALIGNED(b, 16) &&
SPA_IS_ALIGNED(src, 16) &&
SPA_IS_ALIGNED(dst, 16)) SPA_IS_ALIGNED(dst, 16))
unrolled = len & ~3; unrolled = len & ~3;
else else
@ -621,16 +622,16 @@ void dsp_fft_cmuladd_sse(struct dsp_ops *ops, void *fft,
_mm_mul_pz(aa, bb, dd); _mm_mul_pz(aa, bb, dd);
dd[0] = _mm_mul_ps(dd[0], s); dd[0] = _mm_mul_ps(dd[0], s);
dd[1] = _mm_mul_ps(dd[1], s); dd[1] = _mm_mul_ps(dd[1], s);
t[0] = _mm_load_ps(&dst[2*i]); t[0] = _mm_load_ps(&src[2*i]);
t[1] = _mm_load_ps(&dst[2*i+4]); t[1] = _mm_load_ps(&src[2*i+4]);
t[0] = _mm_add_ps(t[0], dd[0]); t[0] = _mm_add_ps(t[0], dd[0]);
t[1] = _mm_add_ps(t[1], dd[1]); t[1] = _mm_add_ps(t[1], dd[1]);
_mm_store_ps(&dst[2*i], t[0]); _mm_store_ps(&dst[2*i], t[0]);
_mm_store_ps(&dst[2*i+4], t[1]); _mm_store_ps(&dst[2*i+4], t[1]);
} }
for (; i < len; i++) { for (; i < len; i++) {
dst[2*i ] += (a[2*i] * b[2*i ] - a[2*i+1] * b[2*i+1]) * scale; dst[2*i ] = src[2*i ] + (a[2*i] * b[2*i ] - a[2*i+1] * b[2*i+1]) * scale;
dst[2*i+1] += (a[2*i] * b[2*i+1] + a[2*i+1] * b[2*i ]) * scale; dst[2*i+1] = src[2*i+1] + (a[2*i] * b[2*i+1] + a[2*i+1] * b[2*i ]) * scale;
} }
#else #else
pffft_zconvolve_accumulate(fft, a, b, src, dst, scale); pffft_zconvolve_accumulate(fft, a, b, src, dst, scale);