diff --git a/spa/plugins/filter-graph/audio-dsp-avx2.c b/spa/plugins/filter-graph/audio-dsp-avx2.c index f73dc8b0a..e14821d2d 100644 --- a/spa/plugins/filter-graph/audio-dsp-avx2.c +++ b/spa/plugins/filter-graph/audio-dsp-avx2.c @@ -237,6 +237,104 @@ void dsp_sum_avx2(void *obj, float *r, const float *a, const float *b, uint32_t } } +void dsp_mult_avx2(void *obj, + float * SPA_RESTRICT dst, + const float * SPA_RESTRICT src[], + uint32_t n_src, uint32_t n_samples) +{ + uint32_t n, i, unrolled; + __m256 in[4]; + + if (n_src == 0) { + memset(dst, 0, n_samples * sizeof(float)); + return; + } + + if (dst != src[0]) + spa_memcpy(dst, src[0], n_samples * sizeof(float)); + + if (SPA_LIKELY(SPA_IS_ALIGNED(dst, 32))) { + unrolled = n_samples & ~31; + for (i = 1; i < n_src; i++) { + if (SPA_UNLIKELY(!SPA_IS_ALIGNED(src[i], 32))) { + unrolled = 0; + break; + } + } + } else + unrolled = 0; + + for (i = 1; i < n_src; i++) { + for (n = 0; n < unrolled; n += 32) { + in[0] = _mm256_mul_ps(_mm256_load_ps(&dst[n+ 0]), _mm256_load_ps(&src[i][n+ 0])); + in[1] = _mm256_mul_ps(_mm256_load_ps(&dst[n+ 8]), _mm256_load_ps(&src[i][n+ 8])); + in[2] = _mm256_mul_ps(_mm256_load_ps(&dst[n+16]), _mm256_load_ps(&src[i][n+16])); + in[3] = _mm256_mul_ps(_mm256_load_ps(&dst[n+24]), _mm256_load_ps(&src[i][n+24])); + _mm256_store_ps(&dst[n+ 0], in[0]); + _mm256_store_ps(&dst[n+ 8], in[1]); + _mm256_store_ps(&dst[n+16], in[2]); + _mm256_store_ps(&dst[n+24], in[3]); + } + for (; n < n_samples; n++) + dst[n] *= src[i][n]; + } +} + +void dsp_linear_avx2(void *obj, float * dst, + const float * SPA_RESTRICT src, const float mult, + const float add, uint32_t n_samples) +{ + uint32_t n, unrolled; + __m256 m, a; + + if (mult == 0.0f) { + a = _mm256_set1_ps(add); + unrolled = n_samples & ~31; + for (n = 0; n < unrolled; n += 32) { + _mm256_storeu_ps(&dst[n+ 0], a); + _mm256_storeu_ps(&dst[n+ 8], a); + _mm256_storeu_ps(&dst[n+16], a); + _mm256_storeu_ps(&dst[n+24], a); + } + for (; n < n_samples; n++) + dst[n] = add; + return; + } + + if (SPA_LIKELY(SPA_IS_ALIGNED(src, 32) && SPA_IS_ALIGNED(dst, 32))) + unrolled = n_samples & ~31; + else + unrolled = 0; + + m = _mm256_set1_ps(mult); + + if (add == 0.0f) { + if (mult == 1.0f) { + if (dst != src) + spa_memcpy(dst, src, n_samples * sizeof(float)); + return; + } + for (n = 0; n < unrolled; n += 32) { + _mm256_store_ps(&dst[n+ 0], _mm256_mul_ps(m, _mm256_load_ps(&src[n+ 0]))); + _mm256_store_ps(&dst[n+ 8], _mm256_mul_ps(m, _mm256_load_ps(&src[n+ 8]))); + _mm256_store_ps(&dst[n+16], _mm256_mul_ps(m, _mm256_load_ps(&src[n+16]))); + _mm256_store_ps(&dst[n+24], _mm256_mul_ps(m, _mm256_load_ps(&src[n+24]))); + } + for (; n < n_samples; n++) + dst[n] = mult * src[n]; + } else { + a = _mm256_set1_ps(add); + for (n = 0; n < unrolled; n += 32) { + _mm256_store_ps(&dst[n+ 0], _mm256_fmadd_ps(m, _mm256_load_ps(&src[n+ 0]), a)); + _mm256_store_ps(&dst[n+ 8], _mm256_fmadd_ps(m, _mm256_load_ps(&src[n+ 8]), a)); + _mm256_store_ps(&dst[n+16], _mm256_fmadd_ps(m, _mm256_load_ps(&src[n+16]), a)); + _mm256_store_ps(&dst[n+24], _mm256_fmadd_ps(m, _mm256_load_ps(&src[n+24]), a)); + } + for (; n < n_samples; n++) + dst[n] = mult * src[n] + add; + } +} + #define FFT_BLOCK 8 struct fft_info { diff --git a/spa/plugins/filter-graph/audio-dsp-impl.h b/spa/plugins/filter-graph/audio-dsp-impl.h index ed98e5a93..4b63bf22f 100644 --- a/spa/plugins/filter-graph/audio-dsp-impl.h +++ b/spa/plugins/filter-graph/audio-dsp-impl.h @@ -79,6 +79,8 @@ MAKE_FFT_CMULADD_FUNC(c); #if defined (HAVE_SSE) MAKE_MIX_GAIN_FUNC(sse); MAKE_SUM_FUNC(sse); +MAKE_LINEAR_FUNC(sse); +MAKE_MULT_FUNC(sse); MAKE_BIQUAD_RUN_FUNC(sse); MAKE_DELAY_FUNC(sse); MAKE_FFT_MEMALLOC_FUNC(sse); @@ -90,6 +92,8 @@ MAKE_FFT_CMULADD_FUNC(sse); #if defined (HAVE_AVX2) MAKE_MIX_GAIN_FUNC(avx2); MAKE_SUM_FUNC(avx2); +MAKE_LINEAR_FUNC(avx2); +MAKE_MULT_FUNC(avx2); MAKE_FFT_MEMALLOC_FUNC(avx2); MAKE_FFT_MEMCLEAR_FUNC(avx2); MAKE_FFT_RUN_FUNC(avx2); diff --git a/spa/plugins/filter-graph/audio-dsp-sse.c b/spa/plugins/filter-graph/audio-dsp-sse.c index 0b2051a63..ae5f1d196 100644 --- a/spa/plugins/filter-graph/audio-dsp-sse.c +++ b/spa/plugins/filter-graph/audio-dsp-sse.c @@ -615,6 +615,104 @@ void dsp_biquad_run_sse(void *obj, struct biquad *bq, uint32_t n_bq, uint32_t bq } } +void dsp_mult_sse(void *obj, + float * SPA_RESTRICT dst, + const float * SPA_RESTRICT src[], + uint32_t n_src, uint32_t n_samples) +{ + uint32_t n, i, unrolled; + __m128 in[4]; + + if (n_src == 0) { + memset(dst, 0, n_samples * sizeof(float)); + return; + } + + if (dst != src[0]) + spa_memcpy(dst, src[0], n_samples * sizeof(float)); + + if (SPA_LIKELY(SPA_IS_ALIGNED(dst, 16))) { + unrolled = n_samples & ~15; + for (i = 1; i < n_src; i++) { + if (SPA_UNLIKELY(!SPA_IS_ALIGNED(src[i], 16))) { + unrolled = 0; + break; + } + } + } else + unrolled = 0; + + for (i = 1; i < n_src; i++) { + for (n = 0; n < unrolled; n += 16) { + in[0] = _mm_mul_ps(_mm_load_ps(&dst[n+ 0]), _mm_load_ps(&src[i][n+ 0])); + in[1] = _mm_mul_ps(_mm_load_ps(&dst[n+ 4]), _mm_load_ps(&src[i][n+ 4])); + in[2] = _mm_mul_ps(_mm_load_ps(&dst[n+ 8]), _mm_load_ps(&src[i][n+ 8])); + in[3] = _mm_mul_ps(_mm_load_ps(&dst[n+12]), _mm_load_ps(&src[i][n+12])); + _mm_store_ps(&dst[n+ 0], in[0]); + _mm_store_ps(&dst[n+ 4], in[1]); + _mm_store_ps(&dst[n+ 8], in[2]); + _mm_store_ps(&dst[n+12], in[3]); + } + for (; n < n_samples; n++) + dst[n] *= src[i][n]; + } +} + +void dsp_linear_sse(void *obj, float * dst, + const float * SPA_RESTRICT src, const float mult, + const float add, uint32_t n_samples) +{ + uint32_t n, unrolled; + __m128 m, a; + + if (mult == 0.0f) { + a = _mm_set1_ps(add); + unrolled = n_samples & ~15; + for (n = 0; n < unrolled; n += 16) { + _mm_storeu_ps(&dst[n+ 0], a); + _mm_storeu_ps(&dst[n+ 4], a); + _mm_storeu_ps(&dst[n+ 8], a); + _mm_storeu_ps(&dst[n+12], a); + } + for (; n < n_samples; n++) + dst[n] = add; + return; + } + + if (SPA_LIKELY(SPA_IS_ALIGNED(src, 16) && SPA_IS_ALIGNED(dst, 16))) + unrolled = n_samples & ~15; + else + unrolled = 0; + + m = _mm_set1_ps(mult); + + if (add == 0.0f) { + if (mult == 1.0f) { + if (dst != src) + spa_memcpy(dst, src, n_samples * sizeof(float)); + return; + } + for (n = 0; n < unrolled; n += 16) { + _mm_store_ps(&dst[n+ 0], _mm_mul_ps(m, _mm_load_ps(&src[n+ 0]))); + _mm_store_ps(&dst[n+ 4], _mm_mul_ps(m, _mm_load_ps(&src[n+ 4]))); + _mm_store_ps(&dst[n+ 8], _mm_mul_ps(m, _mm_load_ps(&src[n+ 8]))); + _mm_store_ps(&dst[n+12], _mm_mul_ps(m, _mm_load_ps(&src[n+12]))); + } + for (; n < n_samples; n++) + dst[n] = mult * src[n]; + } else { + a = _mm_set1_ps(add); + for (n = 0; n < unrolled; n += 16) { + _mm_store_ps(&dst[n+ 0], _mm_add_ps(_mm_mul_ps(m, _mm_load_ps(&src[n+ 0])), a)); + _mm_store_ps(&dst[n+ 4], _mm_add_ps(_mm_mul_ps(m, _mm_load_ps(&src[n+ 4])), a)); + _mm_store_ps(&dst[n+ 8], _mm_add_ps(_mm_mul_ps(m, _mm_load_ps(&src[n+ 8])), a)); + _mm_store_ps(&dst[n+12], _mm_add_ps(_mm_mul_ps(m, _mm_load_ps(&src[n+12])), a)); + } + for (; n < n_samples; n++) + dst[n] = mult * src[n] + add; + } +} + void dsp_delay_sse(void *obj, float *buffer, uint32_t *pos, uint32_t n_buffer, uint32_t delay, float *dst, const float *src, uint32_t n_samples, float fb, float ff) { diff --git a/spa/plugins/filter-graph/audio-dsp.c b/spa/plugins/filter-graph/audio-dsp.c index d900d0b4d..d72c46d87 100644 --- a/spa/plugins/filter-graph/audio-dsp.c +++ b/spa/plugins/filter-graph/audio-dsp.c @@ -30,8 +30,8 @@ static const struct dsp_info dsp_table[] = .funcs.mix_gain = dsp_mix_gain_avx2, .funcs.biquad_run = dsp_biquad_run_sse, .funcs.sum = dsp_sum_avx2, - .funcs.linear = dsp_linear_c, - .funcs.mult = dsp_mult_c, + .funcs.linear = dsp_linear_avx2, + .funcs.mult = dsp_mult_avx2, .funcs.fft_new = dsp_fft_new_c, .funcs.fft_free = dsp_fft_free_c, .funcs.fft_memalloc = dsp_fft_memalloc_avx2, @@ -50,8 +50,8 @@ static const struct dsp_info dsp_table[] = .funcs.mix_gain = dsp_mix_gain_sse, .funcs.biquad_run = dsp_biquad_run_sse, .funcs.sum = dsp_sum_sse, - .funcs.linear = dsp_linear_c, - .funcs.mult = dsp_mult_c, + .funcs.linear = dsp_linear_sse, + .funcs.mult = dsp_mult_sse, .funcs.fft_new = dsp_fft_new_c, .funcs.fft_free = dsp_fft_free_c, .funcs.fft_memalloc = dsp_fft_memalloc_sse,