dsp: add SSE and AVX2 mult and linear functions

This commit is contained in:
Wim Taymans 2026-04-22 16:04:07 +02:00
parent 3e7e61dcb7
commit 3c2552e671
4 changed files with 204 additions and 4 deletions

View file

@ -237,6 +237,104 @@ void dsp_sum_avx2(void *obj, float *r, const float *a, const float *b, uint32_t
} }
} }
void dsp_mult_avx2(void *obj,
		float * SPA_RESTRICT dst,
		const float * SPA_RESTRICT src[],
		uint32_t n_src, uint32_t n_samples)
{
	uint32_t s, k, vec_end;
	__m256 t0, t1, t2, t3;

	/* The product of zero sources is defined as silence. */
	if (n_src == 0) {
		memset(dst, 0, n_samples * sizeof(float));
		return;
	}
	/* Seed the accumulator with the first source. */
	if (dst != src[0])
		spa_memcpy(dst, src[0], n_samples * sizeof(float));

	/* Use the vector path only when dst and every remaining source are
	 * 32-byte aligned; otherwise do everything in the scalar tail. */
	vec_end = 0;
	if (SPA_LIKELY(SPA_IS_ALIGNED(dst, 32))) {
		vec_end = n_samples & ~31;
		for (s = 1; s < n_src; s++) {
			if (SPA_UNLIKELY(!SPA_IS_ALIGNED(src[s], 32))) {
				vec_end = 0;
				break;
			}
		}
	}
	/* Fold each remaining source into dst, 32 floats (4 vectors) at a time. */
	for (s = 1; s < n_src; s++) {
		for (k = 0; k < vec_end; k += 32) {
			t0 = _mm256_mul_ps(_mm256_load_ps(&dst[k+ 0]), _mm256_load_ps(&src[s][k+ 0]));
			t1 = _mm256_mul_ps(_mm256_load_ps(&dst[k+ 8]), _mm256_load_ps(&src[s][k+ 8]));
			t2 = _mm256_mul_ps(_mm256_load_ps(&dst[k+16]), _mm256_load_ps(&src[s][k+16]));
			t3 = _mm256_mul_ps(_mm256_load_ps(&dst[k+24]), _mm256_load_ps(&src[s][k+24]));
			_mm256_store_ps(&dst[k+ 0], t0);
			_mm256_store_ps(&dst[k+ 8], t1);
			_mm256_store_ps(&dst[k+16], t2);
			_mm256_store_ps(&dst[k+24], t3);
		}
		for (; k < n_samples; k++)
			dst[k] *= src[s][k];
	}
}
void dsp_linear_avx2(void *obj, float * dst,
		const float * SPA_RESTRICT src, const float mult,
		const float add, uint32_t n_samples)
{
	uint32_t i, vec_end;
	__m256 vm, va;

	/* mult == 0: the source is irrelevant, fill with the constant 'add'.
	 * Unaligned stores are used, so no alignment gate is needed here. */
	if (mult == 0.0f) {
		va = _mm256_set1_ps(add);
		vec_end = n_samples & ~31;
		for (i = 0; i < vec_end; i += 32) {
			_mm256_storeu_ps(&dst[i+ 0], va);
			_mm256_storeu_ps(&dst[i+ 8], va);
			_mm256_storeu_ps(&dst[i+16], va);
			_mm256_storeu_ps(&dst[i+24], va);
		}
		for (; i < n_samples; i++)
			dst[i] = add;
		return;
	}
	/* Identity transform (1*x + 0): at most a copy. */
	if (add == 0.0f && mult == 1.0f) {
		if (dst != src)
			spa_memcpy(dst, src, n_samples * sizeof(float));
		return;
	}
	/* Aligned loads/stores below, so vectorize only when both buffers
	 * are 32-byte aligned. */
	vec_end = SPA_LIKELY(SPA_IS_ALIGNED(src, 32) && SPA_IS_ALIGNED(dst, 32)) ?
		(n_samples & ~31) : 0;

	vm = _mm256_set1_ps(mult);
	if (add == 0.0f) {
		/* Pure gain: dst = mult * src. */
		for (i = 0; i < vec_end; i += 32) {
			_mm256_store_ps(&dst[i+ 0], _mm256_mul_ps(vm, _mm256_load_ps(&src[i+ 0])));
			_mm256_store_ps(&dst[i+ 8], _mm256_mul_ps(vm, _mm256_load_ps(&src[i+ 8])));
			_mm256_store_ps(&dst[i+16], _mm256_mul_ps(vm, _mm256_load_ps(&src[i+16])));
			_mm256_store_ps(&dst[i+24], _mm256_mul_ps(vm, _mm256_load_ps(&src[i+24])));
		}
		for (; i < n_samples; i++)
			dst[i] = mult * src[i];
	} else {
		/* Full affine transform via fused multiply-add. */
		va = _mm256_set1_ps(add);
		for (i = 0; i < vec_end; i += 32) {
			_mm256_store_ps(&dst[i+ 0], _mm256_fmadd_ps(vm, _mm256_load_ps(&src[i+ 0]), va));
			_mm256_store_ps(&dst[i+ 8], _mm256_fmadd_ps(vm, _mm256_load_ps(&src[i+ 8]), va));
			_mm256_store_ps(&dst[i+16], _mm256_fmadd_ps(vm, _mm256_load_ps(&src[i+16]), va));
			_mm256_store_ps(&dst[i+24], _mm256_fmadd_ps(vm, _mm256_load_ps(&src[i+24]), va));
		}
		for (; i < n_samples; i++)
			dst[i] = mult * src[i] + add;
	}
}
#define FFT_BLOCK 8 #define FFT_BLOCK 8
struct fft_info { struct fft_info {

View file

@ -79,6 +79,8 @@ MAKE_FFT_CMULADD_FUNC(c);
#if defined (HAVE_SSE) #if defined (HAVE_SSE)
MAKE_MIX_GAIN_FUNC(sse); MAKE_MIX_GAIN_FUNC(sse);
MAKE_SUM_FUNC(sse); MAKE_SUM_FUNC(sse);
MAKE_LINEAR_FUNC(sse);
MAKE_MULT_FUNC(sse);
MAKE_BIQUAD_RUN_FUNC(sse); MAKE_BIQUAD_RUN_FUNC(sse);
MAKE_DELAY_FUNC(sse); MAKE_DELAY_FUNC(sse);
MAKE_FFT_MEMALLOC_FUNC(sse); MAKE_FFT_MEMALLOC_FUNC(sse);
@ -90,6 +92,8 @@ MAKE_FFT_CMULADD_FUNC(sse);
#if defined (HAVE_AVX2) #if defined (HAVE_AVX2)
MAKE_MIX_GAIN_FUNC(avx2); MAKE_MIX_GAIN_FUNC(avx2);
MAKE_SUM_FUNC(avx2); MAKE_SUM_FUNC(avx2);
MAKE_LINEAR_FUNC(avx2);
MAKE_MULT_FUNC(avx2);
MAKE_FFT_MEMALLOC_FUNC(avx2); MAKE_FFT_MEMALLOC_FUNC(avx2);
MAKE_FFT_MEMCLEAR_FUNC(avx2); MAKE_FFT_MEMCLEAR_FUNC(avx2);
MAKE_FFT_RUN_FUNC(avx2); MAKE_FFT_RUN_FUNC(avx2);

View file

@ -615,6 +615,104 @@ void dsp_biquad_run_sse(void *obj, struct biquad *bq, uint32_t n_bq, uint32_t bq
} }
} }
void dsp_mult_sse(void *obj,
		float * SPA_RESTRICT dst,
		const float * SPA_RESTRICT src[],
		uint32_t n_src, uint32_t n_samples)
{
	uint32_t s, k, vec_end;
	__m128 t0, t1, t2, t3;

	/* The product of zero sources is defined as silence. */
	if (n_src == 0) {
		memset(dst, 0, n_samples * sizeof(float));
		return;
	}
	/* Seed the accumulator with the first source. */
	if (dst != src[0])
		spa_memcpy(dst, src[0], n_samples * sizeof(float));

	/* Use the vector path only when dst and every remaining source are
	 * 16-byte aligned; otherwise do everything in the scalar tail. */
	vec_end = 0;
	if (SPA_LIKELY(SPA_IS_ALIGNED(dst, 16))) {
		vec_end = n_samples & ~15;
		for (s = 1; s < n_src; s++) {
			if (SPA_UNLIKELY(!SPA_IS_ALIGNED(src[s], 16))) {
				vec_end = 0;
				break;
			}
		}
	}
	/* Fold each remaining source into dst, 16 floats (4 vectors) at a time. */
	for (s = 1; s < n_src; s++) {
		for (k = 0; k < vec_end; k += 16) {
			t0 = _mm_mul_ps(_mm_load_ps(&dst[k+ 0]), _mm_load_ps(&src[s][k+ 0]));
			t1 = _mm_mul_ps(_mm_load_ps(&dst[k+ 4]), _mm_load_ps(&src[s][k+ 4]));
			t2 = _mm_mul_ps(_mm_load_ps(&dst[k+ 8]), _mm_load_ps(&src[s][k+ 8]));
			t3 = _mm_mul_ps(_mm_load_ps(&dst[k+12]), _mm_load_ps(&src[s][k+12]));
			_mm_store_ps(&dst[k+ 0], t0);
			_mm_store_ps(&dst[k+ 4], t1);
			_mm_store_ps(&dst[k+ 8], t2);
			_mm_store_ps(&dst[k+12], t3);
		}
		for (; k < n_samples; k++)
			dst[k] *= src[s][k];
	}
}
void dsp_linear_sse(void *obj, float * dst,
		const float * SPA_RESTRICT src, const float mult,
		const float add, uint32_t n_samples)
{
	uint32_t i, vec_end;
	__m128 vm, va;

	/* mult == 0: the source is irrelevant, fill with the constant 'add'.
	 * Unaligned stores are used, so no alignment gate is needed here. */
	if (mult == 0.0f) {
		va = _mm_set1_ps(add);
		vec_end = n_samples & ~15;
		for (i = 0; i < vec_end; i += 16) {
			_mm_storeu_ps(&dst[i+ 0], va);
			_mm_storeu_ps(&dst[i+ 4], va);
			_mm_storeu_ps(&dst[i+ 8], va);
			_mm_storeu_ps(&dst[i+12], va);
		}
		for (; i < n_samples; i++)
			dst[i] = add;
		return;
	}
	/* Identity transform (1*x + 0): at most a copy. */
	if (add == 0.0f && mult == 1.0f) {
		if (dst != src)
			spa_memcpy(dst, src, n_samples * sizeof(float));
		return;
	}
	/* Aligned loads/stores below, so vectorize only when both buffers
	 * are 16-byte aligned. */
	vec_end = SPA_LIKELY(SPA_IS_ALIGNED(src, 16) && SPA_IS_ALIGNED(dst, 16)) ?
		(n_samples & ~15) : 0;

	vm = _mm_set1_ps(mult);
	if (add == 0.0f) {
		/* Pure gain: dst = mult * src. */
		for (i = 0; i < vec_end; i += 16) {
			_mm_store_ps(&dst[i+ 0], _mm_mul_ps(vm, _mm_load_ps(&src[i+ 0])));
			_mm_store_ps(&dst[i+ 4], _mm_mul_ps(vm, _mm_load_ps(&src[i+ 4])));
			_mm_store_ps(&dst[i+ 8], _mm_mul_ps(vm, _mm_load_ps(&src[i+ 8])));
			_mm_store_ps(&dst[i+12], _mm_mul_ps(vm, _mm_load_ps(&src[i+12])));
		}
		for (; i < n_samples; i++)
			dst[i] = mult * src[i];
	} else {
		/* Full affine transform; SSE has no FMA, so multiply then add. */
		va = _mm_set1_ps(add);
		for (i = 0; i < vec_end; i += 16) {
			_mm_store_ps(&dst[i+ 0], _mm_add_ps(_mm_mul_ps(vm, _mm_load_ps(&src[i+ 0])), va));
			_mm_store_ps(&dst[i+ 4], _mm_add_ps(_mm_mul_ps(vm, _mm_load_ps(&src[i+ 4])), va));
			_mm_store_ps(&dst[i+ 8], _mm_add_ps(_mm_mul_ps(vm, _mm_load_ps(&src[i+ 8])), va));
			_mm_store_ps(&dst[i+12], _mm_add_ps(_mm_mul_ps(vm, _mm_load_ps(&src[i+12])), va));
		}
		for (; i < n_samples; i++)
			dst[i] = mult * src[i] + add;
	}
}
void dsp_delay_sse(void *obj, float *buffer, uint32_t *pos, uint32_t n_buffer, uint32_t delay, void dsp_delay_sse(void *obj, float *buffer, uint32_t *pos, uint32_t n_buffer, uint32_t delay,
float *dst, const float *src, uint32_t n_samples, float fb, float ff) float *dst, const float *src, uint32_t n_samples, float fb, float ff)
{ {

View file

@ -30,8 +30,8 @@ static const struct dsp_info dsp_table[] =
.funcs.mix_gain = dsp_mix_gain_avx2, .funcs.mix_gain = dsp_mix_gain_avx2,
.funcs.biquad_run = dsp_biquad_run_sse, .funcs.biquad_run = dsp_biquad_run_sse,
.funcs.sum = dsp_sum_avx2, .funcs.sum = dsp_sum_avx2,
.funcs.linear = dsp_linear_c, .funcs.linear = dsp_linear_avx2,
.funcs.mult = dsp_mult_c, .funcs.mult = dsp_mult_avx2,
.funcs.fft_new = dsp_fft_new_c, .funcs.fft_new = dsp_fft_new_c,
.funcs.fft_free = dsp_fft_free_c, .funcs.fft_free = dsp_fft_free_c,
.funcs.fft_memalloc = dsp_fft_memalloc_avx2, .funcs.fft_memalloc = dsp_fft_memalloc_avx2,
@ -50,8 +50,8 @@ static const struct dsp_info dsp_table[] =
.funcs.mix_gain = dsp_mix_gain_sse, .funcs.mix_gain = dsp_mix_gain_sse,
.funcs.biquad_run = dsp_biquad_run_sse, .funcs.biquad_run = dsp_biquad_run_sse,
.funcs.sum = dsp_sum_sse, .funcs.sum = dsp_sum_sse,
.funcs.linear = dsp_linear_c, .funcs.linear = dsp_linear_sse,
.funcs.mult = dsp_mult_c, .funcs.mult = dsp_mult_sse,
.funcs.fft_new = dsp_fft_new_c, .funcs.fft_new = dsp_fft_new_c,
.funcs.fft_free = dsp_fft_free_c, .funcs.fft_free = dsp_fft_free_c,
.funcs.fft_memalloc = dsp_fft_memalloc_sse, .funcs.fft_memalloc = dsp_fft_memalloc_sse,