diff --git a/src/modules/meson.build b/src/modules/meson.build index 367031e98..087c72371 100644 --- a/src/modules/meson.build +++ b/src/modules/meson.build @@ -71,6 +71,16 @@ if have_sse simd_cargs += ['-DHAVE_SSE'] simd_dependencies += filter_chain_sse endif +if have_avx + filter_chain_avx = static_library('filter_chain_avx', + ['module-filter-chain/dsp-ops-avx.c' ], + c_args : [avx_args, fma_args,'-O3', '-DHAVE_AVX'], + dependencies : [ spa_dep ], + install : false + ) + simd_cargs += ['-DHAVE_AVX'] + simd_dependencies += filter_chain_avx +endif if have_neon filter_chain_neon = static_library('filter_chain_neon', ['module-filter-chain/pffft.c' ], diff --git a/src/modules/module-filter-chain/dsp-ops-avx.c b/src/modules/module-filter-chain/dsp-ops-avx.c new file mode 100644 index 000000000..64abcaa44 --- /dev/null +++ b/src/modules/module-filter-chain/dsp-ops-avx.c @@ -0,0 +1,85 @@ +/* Spa + * + * Copyright © 2022 Wim Taymans + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#include +#include +#include + +#include + +#include "dsp-ops.h" + +#include + +void dsp_sum_avx(struct dsp_ops *ops, float *r, const float *a, const float *b, uint32_t n_samples) +{ + uint32_t n, unrolled; + __m256 in[4]; + + unrolled = n_samples & ~31; + + if (SPA_LIKELY(SPA_IS_ALIGNED(r, 32)) && + SPA_LIKELY(SPA_IS_ALIGNED(a, 32)) && + SPA_LIKELY(SPA_IS_ALIGNED(b, 32))) { + for (n = 0; n < unrolled; n += 32) { + in[0] = _mm256_load_ps(&a[n+ 0]); + in[1] = _mm256_load_ps(&a[n+ 8]); + in[2] = _mm256_load_ps(&a[n+16]); + in[3] = _mm256_load_ps(&a[n+24]); + + in[0] = _mm256_add_ps(in[0], _mm256_load_ps(&b[n+ 0])); + in[1] = _mm256_add_ps(in[1], _mm256_load_ps(&b[n+ 8])); + in[2] = _mm256_add_ps(in[2], _mm256_load_ps(&b[n+16])); + in[3] = _mm256_add_ps(in[3], _mm256_load_ps(&b[n+24])); + + _mm256_store_ps(&r[n+ 0], in[0]); + _mm256_store_ps(&r[n+ 8], in[1]); + _mm256_store_ps(&r[n+16], in[2]); + _mm256_store_ps(&r[n+24], in[3]); + } + } else { + for (n = 0; n < unrolled; n += 16) { + in[0] = _mm256_loadu_ps(&a[n+ 0]); + in[1] = _mm256_loadu_ps(&a[n+ 8]); + in[2] = _mm256_loadu_ps(&a[n+16]); + in[3] = _mm256_loadu_ps(&a[n+24]); + + in[0] = _mm256_add_ps(in[0], _mm256_loadu_ps(&b[n+ 0])); + in[1] = _mm256_add_ps(in[1], _mm256_loadu_ps(&b[n+ 8])); + in[2] = _mm256_add_ps(in[2], _mm256_loadu_ps(&b[n+16])); + in[3] = _mm256_add_ps(in[3], _mm256_loadu_ps(&b[n+24])); + + _mm256_storeu_ps(&r[n+ 0], in[0]); + _mm256_storeu_ps(&r[n+ 8], in[1]); + _mm256_storeu_ps(&r[n+16], in[2]); + _mm256_storeu_ps(&r[n+24], in[3]); + } + } + for (; n < n_samples; n++) { + __m128 in[1]; + in[0] = _mm_load_ss(&a[n]); + in[0] = _mm_add_ss(in[0], _mm_load_ss(&b[n])); + _mm_store_ss(&r[n], in[0]); + } +} diff --git a/src/modules/module-filter-chain/dsp-ops-c.c b/src/modules/module-filter-chain/dsp-ops-c.c index a371229ff..576ab7582 100644 --- a/src/modules/module-filter-chain/dsp-ops-c.c +++ b/src/modules/module-filter-chain/dsp-ops-c.c @@ -133,3 +133,11 @@ void dsp_biquad_run_c(struct dsp_ops *ops, struct biquad *bq, #undef F } +void dsp_sum_c(struct dsp_ops *ops, float * dst, + const float * SPA_RESTRICT a, const float * SPA_RESTRICT b, uint32_t n_samples) +{ + uint32_t i; + for (i = 0; i < n_samples; i++) + dst[i] = a[i] + b[i]; +} + diff --git a/src/modules/module-filter-chain/dsp-ops-sse.c b/src/modules/module-filter-chain/dsp-ops-sse.c index cabcae340..bcc3499af 100644 --- a/src/modules/module-filter-chain/dsp-ops-sse.c +++ b/src/modules/module-filter-chain/dsp-ops-sse.c @@ -89,3 +89,54 @@ void dsp_mix_gain_sse(struct dsp_ops *ops, } } } + +void dsp_sum_sse(struct dsp_ops *ops, float *r, const float *a, const float *b, uint32_t n_samples) +{ + uint32_t n, unrolled; + __m128 in[4]; + + unrolled = n_samples & ~15; + + if (SPA_LIKELY(SPA_IS_ALIGNED(r, 16)) && + SPA_LIKELY(SPA_IS_ALIGNED(a, 16)) && + SPA_LIKELY(SPA_IS_ALIGNED(b, 16))) { + for (n = 0; n < unrolled; n += 16) { + in[0] = _mm_load_ps(&a[n+ 0]); + in[1] = _mm_load_ps(&a[n+ 4]); + in[2] = _mm_load_ps(&a[n+ 8]); + in[3] = _mm_load_ps(&a[n+12]); + + in[0] = _mm_add_ps(in[0], _mm_load_ps(&b[n+ 0])); + in[1] = _mm_add_ps(in[1], _mm_load_ps(&b[n+ 4])); + in[2] = _mm_add_ps(in[2], _mm_load_ps(&b[n+ 8])); + in[3] = _mm_add_ps(in[3], _mm_load_ps(&b[n+12])); + + _mm_store_ps(&r[n+ 0], in[0]); + _mm_store_ps(&r[n+ 4], in[1]); + _mm_store_ps(&r[n+ 8], in[2]); + _mm_store_ps(&r[n+12], in[3]); + } + } else { + for (n = 0; n < unrolled; n += 16) { + in[0] = _mm_loadu_ps(&a[n+ 0]); + in[1] = _mm_loadu_ps(&a[n+ 4]); + in[2] = _mm_loadu_ps(&a[n+ 8]); + in[3] = _mm_loadu_ps(&a[n+12]); + + in[0] = _mm_add_ps(in[0], _mm_loadu_ps(&b[n+ 0])); + in[1] = _mm_add_ps(in[1], _mm_loadu_ps(&b[n+ 4])); + in[2] = _mm_add_ps(in[2], _mm_loadu_ps(&b[n+ 8])); + in[3] = _mm_add_ps(in[3], _mm_loadu_ps(&b[n+12])); + + _mm_storeu_ps(&r[n+ 0], in[0]); + _mm_storeu_ps(&r[n+ 4], in[1]); + _mm_storeu_ps(&r[n+ 8], in[2]); + _mm_storeu_ps(&r[n+12], in[3]); + } + } + for (; n < n_samples; n++) { + in[0] = _mm_load_ss(&a[n]); + in[0] = _mm_add_ss(in[0], _mm_load_ss(&b[n])); + _mm_store_ss(&r[n], in[0]); + } +} diff --git a/src/modules/module-filter-chain/dsp-ops.c b/src/modules/module-filter-chain/dsp-ops.c index f0b4e2a72..cf064c824 100644 --- a/src/modules/module-filter-chain/dsp-ops.c +++ b/src/modules/module-filter-chain/dsp-ops.c @@ -40,12 +40,22 @@ struct dsp_info { static struct dsp_info dsp_table[] = { +#if defined (HAVE_AVX) + { SPA_CPU_FLAG_AVX, + .funcs.clear = dsp_clear_c, + .funcs.copy = dsp_copy_c, + .funcs.mix_gain = dsp_mix_gain_sse, + .funcs.biquad_run = dsp_biquad_run_c, + .funcs.sum = dsp_sum_avx, + }, +#endif #if defined (HAVE_SSE) { SPA_CPU_FLAG_SSE, .funcs.clear = dsp_clear_c, .funcs.copy = dsp_copy_c, .funcs.mix_gain = dsp_mix_gain_sse, .funcs.biquad_run = dsp_biquad_run_c, + .funcs.sum = dsp_sum_sse, }, #endif { 0, @@ -53,6 +63,7 @@ static struct dsp_info dsp_table[] = .funcs.copy = dsp_copy_c, .funcs.mix_gain = dsp_mix_gain_c, .funcs.biquad_run = dsp_biquad_run_c, + .funcs.sum = dsp_sum_c, }, }; diff --git a/src/modules/module-filter-chain/dsp-ops.h b/src/modules/module-filter-chain/dsp-ops.h index 568f56505..19fdc05c6 100644 --- a/src/modules/module-filter-chain/dsp-ops.h +++ b/src/modules/module-filter-chain/dsp-ops.h @@ -42,6 +42,9 @@ struct dsp_ops_funcs { float gain[], uint32_t n_src, uint32_t n_samples); void (*biquad_run) (struct dsp_ops *ops, struct biquad *bq, float *out, const float *in, uint32_t n_samples); + void (*sum) (struct dsp_ops *ops, + float * dst, const float * SPA_RESTRICT a, + const float * SPA_RESTRICT b, uint32_t n_samples); }; struct dsp_ops { @@ -62,6 +65,7 @@ int dsp_ops_init(struct dsp_ops *ops); #define dsp_ops_copy(ops,...) (ops)->funcs.copy(ops, __VA_ARGS__) #define dsp_ops_mix_gain(ops,...) (ops)->funcs.mix_gain(ops, __VA_ARGS__) #define dsp_ops_biquad_run(ops,...) (ops)->funcs.biquad_run(ops, __VA_ARGS__) +#define dsp_ops_sum(ops,...) (ops)->funcs.sum(ops, __VA_ARGS__) #define MAKE_CLEAR_FUNC(arch) \ void dsp_clear_##arch(struct dsp_ops *ops, void * SPA_RESTRICT dst, uint32_t n_samples) @@ -74,14 +78,22 @@ void dsp_mix_gain_##arch(struct dsp_ops *ops, void * SPA_RESTRICT dst, \ #define MAKE_BIQUAD_RUN_FUNC(arch) \ void dsp_biquad_run_##arch (struct dsp_ops *ops, struct biquad *bq, \ float *out, const float *in, uint32_t n_samples) +#define MAKE_SUM_FUNC(arch) \ +void dsp_sum_##arch (struct dsp_ops *ops, float * SPA_RESTRICT dst, \ + const float * SPA_RESTRICT a, const float * SPA_RESTRICT b, uint32_t n_samples); MAKE_CLEAR_FUNC(c); MAKE_COPY_FUNC(c); MAKE_MIX_GAIN_FUNC(c); MAKE_BIQUAD_RUN_FUNC(c); +MAKE_SUM_FUNC(c); #if defined (HAVE_SSE) MAKE_MIX_GAIN_FUNC(sse); +MAKE_SUM_FUNC(sse); +#endif +#if defined (HAVE_AVX) +MAKE_SUM_FUNC(avx); #endif #endif /* DSP_OPS_H */