filter-chain: add avx mix function

This commit is contained in:
Wim Taymans 2024-10-18 16:25:14 +02:00
parent 8e8b661340
commit f810c7c15f
3 changed files with 62 additions and 1 deletions

View file

@ -8,10 +8,70 @@
#include <spa/utils/defs.h> #include <spa/utils/defs.h>
#include "config.h"
#include "dsp-ops.h" #include "dsp-ops.h"
#include <immintrin.h> #include <immintrin.h>
/**
 * Mix several float sample buffers into one, scaling each by its own gain:
 *
 *     dst[n] = gain[0] * src[0][n] + gain[1] * src[1][n] + ...
 *
 * \param ops        unused by this implementation
 * \param dst        destination buffer, accessed as n_samples floats
 * \param src        array of n_src source buffers, each accessed as floats
 * \param gain       one gain factor per source
 * \param n_src      number of entries in src and gain
 * \param n_samples  number of float samples to produce
 *
 * Special cases: with no sources dst is zero-filled; with a single source
 * and a gain of exactly 1.0 the data is copied (skipped when dst already
 * is that source).
 */
void dsp_mix_gain_avx(struct dsp_ops *ops,
		void * SPA_RESTRICT dst,
		const void * SPA_RESTRICT src[],
		float gain[], uint32_t n_src, uint32_t n_samples)
{
	const float **srcs = (const float **)src;
	float *out = dst;
	uint32_t pos, k, n_vec;

	if (n_src == 0) {
		memset(dst, 0, n_samples * sizeof(float));
		return;
	}
	if (n_src == 1 && gain[0] == 1.0f) {
		if (dst != src[0])
			spa_memcpy(dst, src[0], n_samples * sizeof(float));
		return;
	}

	/* The vector loop uses aligned 256-bit loads and stores, so it is only
	 * taken when dst and every source pass the 32-byte alignment check.
	 * n_vec is the sample count rounded down to a multiple of 32. */
	n_vec = SPA_LIKELY(SPA_IS_ALIGNED(dst, 32)) ? (n_samples & ~31) : 0;
	for (k = 0; n_vec != 0 && k < n_src; k++) {
		if (SPA_UNLIKELY(!SPA_IS_ALIGNED(src[k], 32)))
			n_vec = 0;
	}

	/* 32 samples per iteration, held in four 8-float accumulators */
	for (pos = 0; pos < n_vec; pos += 32) {
		__m256 scale, acc0, acc1, acc2, acc3;
		const float *p = &srcs[0][pos];

		scale = _mm256_set1_ps(gain[0]);
		acc0 = _mm256_mul_ps(scale, _mm256_load_ps(p +  0));
		acc1 = _mm256_mul_ps(scale, _mm256_load_ps(p +  8));
		acc2 = _mm256_mul_ps(scale, _mm256_load_ps(p + 16));
		acc3 = _mm256_mul_ps(scale, _mm256_load_ps(p + 24));

		for (k = 1; k < n_src; k++) {
			p = &srcs[k][pos];
			scale = _mm256_set1_ps(gain[k]);
			acc0 = _mm256_add_ps(acc0, _mm256_mul_ps(scale, _mm256_load_ps(p +  0)));
			acc1 = _mm256_add_ps(acc1, _mm256_mul_ps(scale, _mm256_load_ps(p +  8)));
			acc2 = _mm256_add_ps(acc2, _mm256_mul_ps(scale, _mm256_load_ps(p + 16)));
			acc3 = _mm256_add_ps(acc3, _mm256_mul_ps(scale, _mm256_load_ps(p + 24)));
		}

		_mm256_store_ps(&out[pos +  0], acc0);
		_mm256_store_ps(&out[pos +  8], acc1);
		_mm256_store_ps(&out[pos + 16], acc2);
		_mm256_store_ps(&out[pos + 24], acc3);
	}

	/* Remaining samples (or all of them when the vector path was not
	 * taken) are mixed one at a time with scalar SSE operations. */
	for (; pos < n_samples; pos++) {
		__m128 scale1, acc;

		scale1 = _mm_set_ss(gain[0]);
		acc = _mm_mul_ss(scale1, _mm_load_ss(&srcs[0][pos]));
		for (k = 1; k < n_src; k++) {
			scale1 = _mm_set_ss(gain[k]);
			acc = _mm_add_ss(acc, _mm_mul_ss(scale1, _mm_load_ss(&srcs[k][pos])));
		}
		_mm_store_ss(&out[pos], acc);
	}
}
void dsp_sum_avx(struct dsp_ops *ops, float *r, const float *a, const float *b, uint32_t n_samples) void dsp_sum_avx(struct dsp_ops *ops, float *r, const float *a, const float *b, uint32_t n_samples)
{ {
uint32_t n, unrolled; uint32_t n, unrolled;

View file

@ -25,7 +25,7 @@ static struct dsp_info dsp_table[] =
{ SPA_CPU_FLAG_AVX, { SPA_CPU_FLAG_AVX,
.funcs.clear = dsp_clear_c, .funcs.clear = dsp_clear_c,
.funcs.copy = dsp_copy_c, .funcs.copy = dsp_copy_c,
.funcs.mix_gain = dsp_mix_gain_sse, .funcs.mix_gain = dsp_mix_gain_avx,
.funcs.biquad_run = dsp_biquad_run_sse, .funcs.biquad_run = dsp_biquad_run_sse,
.funcs.sum = dsp_sum_avx, .funcs.sum = dsp_sum_avx,
.funcs.linear = dsp_linear_c, .funcs.linear = dsp_linear_c,

View file

@ -150,6 +150,7 @@ MAKE_BIQUADN_RUN_FUNC(sse);
MAKE_DELAY_FUNC(sse); MAKE_DELAY_FUNC(sse);
#endif #endif
#if defined (HAVE_AVX) #if defined (HAVE_AVX)
MAKE_MIX_GAIN_FUNC(avx);
MAKE_SUM_FUNC(avx); MAKE_SUM_FUNC(avx);
#endif #endif