filter-graph: optimize mix function a little

Add special-case mix paths for zero gain values and for a single shared gain value when mixing.
This commit is contained in:
Wim Taymans 2025-01-07 09:19:05 +01:00
parent efa615945e
commit d6030adada
6 changed files with 367 additions and 141 deletions

View file

@ -16,10 +16,156 @@
#include <immintrin.h>
/* Sum n_src float buffers sample-wise into dst, with no gain applied.
 * Uses a 32-samples-per-iteration AVX loop (4 x __m256) when dst and
 * every src[i] are 32-byte aligned; the remainder (or everything, when
 * unaligned) is handled by the scalar-SSE tail loop.
 * NOTE(review): assumes n_src >= 1 — the caller (dsp_mix_gain_avx)
 * handles n_src == 0 separately. obj is unused. */
static void dsp_add_avx(void *obj, float *dst, const float * SPA_RESTRICT src[],
uint32_t n_src, uint32_t n_samples)
{
uint32_t n, i, unrolled;
__m256 in[4];
const float **s = (const float **)src;
float *d = dst;
/* Only take the unrolled path when dst AND all sources are 32-byte
 * aligned; otherwise process all samples in the scalar tail. */
if (SPA_LIKELY(SPA_IS_ALIGNED(dst, 32))) {
/* round n_samples down to a multiple of 32 */
unrolled = n_samples & ~31;
for (i = 0; i < n_src; i++) {
if (SPA_UNLIKELY(!SPA_IS_ALIGNED(src[i], 32))) {
unrolled = 0;
break;
}
}
} else
unrolled = 0;
/* Unrolled loop: load 32 samples of the first source, accumulate the
 * remaining sources, store the sums. */
for (n = 0; n < unrolled; n += 32) {
in[0] = _mm256_load_ps(&s[0][n+ 0]);
in[1] = _mm256_load_ps(&s[0][n+ 8]);
in[2] = _mm256_load_ps(&s[0][n+16]);
in[3] = _mm256_load_ps(&s[0][n+24]);
for (i = 1; i < n_src; i++) {
in[0] = _mm256_add_ps(in[0], _mm256_load_ps(&s[i][n+ 0]));
in[1] = _mm256_add_ps(in[1], _mm256_load_ps(&s[i][n+ 8]));
in[2] = _mm256_add_ps(in[2], _mm256_load_ps(&s[i][n+16]));
in[3] = _mm256_add_ps(in[3], _mm256_load_ps(&s[i][n+24]));
}
_mm256_store_ps(&d[n+ 0], in[0]);
_mm256_store_ps(&d[n+ 8], in[1]);
_mm256_store_ps(&d[n+16], in[2]);
_mm256_store_ps(&d[n+24], in[3]);
}
/* Scalar tail: one sample at a time via SSE scalar ops. The local
 * __m128 in[1] deliberately shadows the outer __m256 in[4]. */
for (; n < n_samples; n++) {
__m128 in[1];
in[0] = _mm_load_ss(&s[0][n]);
for (i = 1; i < n_src; i++)
in[0] = _mm_add_ss(in[0], _mm_load_ss(&s[i][n]));
_mm_store_ss(&d[n], in[0]);
}
}
/* Sum n_src float buffers into dst and scale the result by a single
 * shared gain: dst[n] = gain * (src[0][n] + ... + src[n_src-1][n]).
 * 32-samples-per-iteration AVX loop when dst and all sources are
 * 32-byte aligned; scalar-SSE tail otherwise.
 * NOTE(review): assumes n_src >= 1 — n_src == 0 is handled by the
 * caller (dsp_mix_gain_avx). obj is unused. */
static void dsp_add_1_gain_avx(void *obj, float *dst, const float * SPA_RESTRICT src[],
uint32_t n_src, float gain, uint32_t n_samples)
{
uint32_t n, i, unrolled;
__m256 in[4], g;
const float **s = (const float **)src;
float *d = dst;
__m128 g1;
/* Only unroll when dst and every source are 32-byte aligned. */
if (SPA_LIKELY(SPA_IS_ALIGNED(dst, 32))) {
/* round n_samples down to a multiple of 32 */
unrolled = n_samples & ~31;
for (i = 0; i < n_src; i++) {
if (SPA_UNLIKELY(!SPA_IS_ALIGNED(src[i], 32))) {
unrolled = 0;
break;
}
}
} else
unrolled = 0;
/* Broadcast the gain once: g for the vector loop, g1 for the scalar tail. */
g = _mm256_set1_ps(gain);
g1 = _mm_set_ss(gain);
for (n = 0; n < unrolled; n += 32) {
in[0] = _mm256_load_ps(&s[0][n+ 0]);
in[1] = _mm256_load_ps(&s[0][n+ 8]);
in[2] = _mm256_load_ps(&s[0][n+16]);
in[3] = _mm256_load_ps(&s[0][n+24]);
for (i = 1; i < n_src; i++) {
in[0] = _mm256_add_ps(in[0], _mm256_load_ps(&s[i][n+ 0]));
in[1] = _mm256_add_ps(in[1], _mm256_load_ps(&s[i][n+ 8]));
in[2] = _mm256_add_ps(in[2], _mm256_load_ps(&s[i][n+16]));
in[3] = _mm256_add_ps(in[3], _mm256_load_ps(&s[i][n+24]));
}
/* apply the shared gain once, on the final sums */
_mm256_store_ps(&d[n+ 0], _mm256_mul_ps(g, in[0]));
_mm256_store_ps(&d[n+ 8], _mm256_mul_ps(g, in[1]));
_mm256_store_ps(&d[n+16], _mm256_mul_ps(g, in[2]));
_mm256_store_ps(&d[n+24], _mm256_mul_ps(g, in[3]));
}
/* Scalar tail; local __m128 in[1] shadows the outer __m256 in[4]. */
for (; n < n_samples; n++) {
__m128 in[1];
in[0] = _mm_load_ss(&s[0][n]);
for (i = 1; i < n_src; i++)
in[0] = _mm_add_ss(in[0], _mm_load_ss(&s[i][n]));
_mm_store_ss(&d[n], _mm_mul_ss(g1, in[0]));
}
}
/* Mix n_src float buffers into dst with a per-source gain:
 * dst[n] = sum_i(gain[i] * src[i][n]).
 * 32-samples-per-iteration AVX loop when dst and all sources are
 * 32-byte aligned; scalar-SSE tail otherwise.
 * NOTE(review): gain[i] is read for every i < n_src, so this requires
 * n_gain >= n_src (n_gain itself is not read); the dispatcher
 * (dsp_mix_gain_avx) only calls this when n_gain >= n_src. Assumes
 * n_src >= 1. obj is unused. */
static void dsp_add_n_gain_avx(void *obj, float *dst,
const float * SPA_RESTRICT src[], uint32_t n_src,
float gain[], uint32_t n_gain, uint32_t n_samples)
{
uint32_t n, i, unrolled;
__m256 in[4], g;
const float **s = (const float **)src;
float *d = dst;
/* Only unroll when dst and every source are 32-byte aligned. */
if (SPA_LIKELY(SPA_IS_ALIGNED(dst, 32))) {
/* round n_samples down to a multiple of 32 */
unrolled = n_samples & ~31;
for (i = 0; i < n_src; i++) {
if (SPA_UNLIKELY(!SPA_IS_ALIGNED(src[i], 32))) {
unrolled = 0;
break;
}
}
} else
unrolled = 0;
for (n = 0; n < unrolled; n += 32) {
/* first source initializes the accumulators with its own gain */
g = _mm256_set1_ps(gain[0]);
in[0] = _mm256_mul_ps(g, _mm256_load_ps(&s[0][n+ 0]));
in[1] = _mm256_mul_ps(g, _mm256_load_ps(&s[0][n+ 8]));
in[2] = _mm256_mul_ps(g, _mm256_load_ps(&s[0][n+16]));
in[3] = _mm256_mul_ps(g, _mm256_load_ps(&s[0][n+24]));
for (i = 1; i < n_src; i++) {
g = _mm256_set1_ps(gain[i]);
in[0] = _mm256_add_ps(in[0], _mm256_mul_ps(g, _mm256_load_ps(&s[i][n+ 0])));
in[1] = _mm256_add_ps(in[1], _mm256_mul_ps(g, _mm256_load_ps(&s[i][n+ 8])));
in[2] = _mm256_add_ps(in[2], _mm256_mul_ps(g, _mm256_load_ps(&s[i][n+16])));
in[3] = _mm256_add_ps(in[3], _mm256_mul_ps(g, _mm256_load_ps(&s[i][n+24])));
}
_mm256_store_ps(&d[n+ 0], in[0]);
_mm256_store_ps(&d[n+ 8], in[1]);
_mm256_store_ps(&d[n+16], in[2]);
_mm256_store_ps(&d[n+24], in[3]);
}
/* Scalar tail; locals shadow the outer __m256 in[4] and g. */
for (; n < n_samples; n++) {
__m128 in[1], g;
g = _mm_set_ss(gain[0]);
in[0] = _mm_mul_ss(g, _mm_load_ss(&s[0][n]));
for (i = 1; i < n_src; i++) {
g = _mm_set_ss(gain[i]);
in[0] = _mm_add_ss(in[0], _mm_mul_ss(g, _mm_load_ss(&s[i][n])));
}
_mm_store_ss(&d[n], in[0]);
}
}
void dsp_mix_gain_avx(void *obj,
void * SPA_RESTRICT dst,
const void * SPA_RESTRICT src[],
float gain[], uint32_t n_src, uint32_t n_samples)
float * SPA_RESTRICT dst,
const float * SPA_RESTRICT src[], uint32_t n_src,
float gain[], uint32_t n_gain, uint32_t n_samples)
{
if (n_src == 0) {
memset(dst, 0, n_samples * sizeof(float));
@ -27,51 +173,12 @@ void dsp_mix_gain_avx(void *obj,
if (dst != src[0])
spa_memcpy(dst, src[0], n_samples * sizeof(float));
} else {
uint32_t n, i, unrolled;
__m256 in[4], g;
const float **s = (const float **)src;
float *d = dst;
if (SPA_LIKELY(SPA_IS_ALIGNED(dst, 32))) {
unrolled = n_samples & ~31;
for (i = 0; i < n_src; i++) {
if (SPA_UNLIKELY(!SPA_IS_ALIGNED(src[i], 32))) {
unrolled = 0;
break;
}
}
} else
unrolled = 0;
for (n = 0; n < unrolled; n += 32) {
g = _mm256_set1_ps(gain[0]);
in[0] = _mm256_mul_ps(g, _mm256_load_ps(&s[0][n+ 0]));
in[1] = _mm256_mul_ps(g, _mm256_load_ps(&s[0][n+ 8]));
in[2] = _mm256_mul_ps(g, _mm256_load_ps(&s[0][n+16]));
in[3] = _mm256_mul_ps(g, _mm256_load_ps(&s[0][n+24]));
for (i = 1; i < n_src; i++) {
g = _mm256_set1_ps(gain[i]);
in[0] = _mm256_add_ps(in[0], _mm256_mul_ps(g, _mm256_load_ps(&s[i][n+ 0])));
in[1] = _mm256_add_ps(in[1], _mm256_mul_ps(g, _mm256_load_ps(&s[i][n+ 8])));
in[2] = _mm256_add_ps(in[2], _mm256_mul_ps(g, _mm256_load_ps(&s[i][n+16])));
in[3] = _mm256_add_ps(in[3], _mm256_mul_ps(g, _mm256_load_ps(&s[i][n+24])));
}
_mm256_store_ps(&d[n+ 0], in[0]);
_mm256_store_ps(&d[n+ 8], in[1]);
_mm256_store_ps(&d[n+16], in[2]);
_mm256_store_ps(&d[n+24], in[3]);
}
for (; n < n_samples; n++) {
__m128 in[1], g;
g = _mm_set_ss(gain[0]);
in[0] = _mm_mul_ss(g, _mm_load_ss(&s[0][n]));
for (i = 1; i < n_src; i++) {
g = _mm_set_ss(gain[i]);
in[0] = _mm_add_ss(in[0], _mm_mul_ss(g, _mm_load_ss(&s[i][n])));
}
_mm_store_ss(&d[n], in[0]);
}
if (n_gain == 0)
dsp_add_avx(obj, dst, src, n_src, n_samples);
else if (n_gain < n_src)
dsp_add_1_gain_avx(obj, dst, src, n_src, gain[0], n_samples);
else
dsp_add_n_gain_avx(obj, dst, src, n_src, gain, n_gain, n_samples);
}
}

View file

@ -18,20 +18,20 @@
#endif
#include "audio-dsp-impl.h"
void dsp_clear_c(void *obj, void * SPA_RESTRICT dst, uint32_t n_samples)
void dsp_clear_c(void *obj, float * SPA_RESTRICT dst, uint32_t n_samples)
{
memset(dst, 0, sizeof(float) * n_samples);
}
void dsp_copy_c(void *obj, void * SPA_RESTRICT dst,
const void * SPA_RESTRICT src, uint32_t n_samples)
void dsp_copy_c(void *obj, float * SPA_RESTRICT dst,
const float * SPA_RESTRICT src, uint32_t n_samples)
{
if (dst != src)
spa_memcpy(dst, src, sizeof(float) * n_samples);
}
static inline void dsp_add_c(void *obj, void * SPA_RESTRICT dst,
const void * SPA_RESTRICT src, uint32_t n_samples)
static inline void dsp_add_c(void *obj, float * SPA_RESTRICT dst,
const float * SPA_RESTRICT src, uint32_t n_samples)
{
uint32_t i;
const float *s = src;
@ -40,8 +40,8 @@ static inline void dsp_add_c(void *obj, void * SPA_RESTRICT dst,
d[i] += s[i];
}
static inline void dsp_gain_c(void *obj, void * SPA_RESTRICT dst,
const void * SPA_RESTRICT src, float gain, uint32_t n_samples)
static inline void dsp_gain_c(void *obj, float * dst,
const float * src, float gain, uint32_t n_samples)
{
uint32_t i;
const float *s = src;
@ -56,8 +56,8 @@ static inline void dsp_gain_c(void *obj, void * SPA_RESTRICT dst,
}
}
static inline void dsp_gain_add_c(void *obj, void * SPA_RESTRICT dst,
const void * SPA_RESTRICT src, float gain, uint32_t n_samples)
static inline void dsp_gain_add_c(void *obj, float * SPA_RESTRICT dst,
const float * SPA_RESTRICT src, float gain, uint32_t n_samples)
{
uint32_t i;
const float *s = src;
@ -75,22 +75,30 @@ static inline void dsp_gain_add_c(void *obj, void * SPA_RESTRICT dst,
void dsp_mix_gain_c(void *obj,
void * SPA_RESTRICT dst,
const void * SPA_RESTRICT src[],
float gain[], uint32_t n_src, uint32_t n_samples)
float * SPA_RESTRICT dst,
const float * SPA_RESTRICT src[], uint32_t n_src,
float gain[], uint32_t n_gain, uint32_t n_samples)
{
uint32_t i;
if (n_src == 0) {
dsp_clear_c(obj, dst, n_samples);
} else {
dsp_gain_c(obj, dst, src[0], gain[0], n_samples);
for (i = 1; i < n_src; i++)
dsp_gain_add_c(obj, dst, src[i], gain[i], n_samples);
if (n_gain < n_src) {
dsp_copy_c(obj, dst, src[0], n_samples);
for (i = 1; i < n_src; i++)
dsp_add_c(obj, dst, src[i], n_samples);
if (n_gain > 0)
dsp_gain_c(obj, dst, dst, gain[0], n_samples);
} else {
dsp_gain_c(obj, dst, src[0], gain[0], n_samples);
for (i = 1; i < n_src; i++)
dsp_gain_add_c(obj, dst, src[i], gain[i], n_samples);
}
}
}
static inline void dsp_mult1_c(void *obj, void * SPA_RESTRICT dst,
const void * SPA_RESTRICT src, uint32_t n_samples)
static inline void dsp_mult1_c(void *obj, float * SPA_RESTRICT dst,
const float * SPA_RESTRICT src, uint32_t n_samples)
{
uint32_t i;
const float *s = src;
@ -100,8 +108,8 @@ static inline void dsp_mult1_c(void *obj, void * SPA_RESTRICT dst,
}
void dsp_mult_c(void *obj,
void * SPA_RESTRICT dst,
const void * SPA_RESTRICT src[],
float * SPA_RESTRICT dst,
const float * SPA_RESTRICT src[],
uint32_t n_src, uint32_t n_samples)
{
uint32_t i;

View file

@ -11,13 +11,13 @@ struct spa_fga_dsp * spa_fga_dsp_new(uint32_t cpu_flags);
void spa_fga_dsp_free(struct spa_fga_dsp *dsp);
#define MAKE_CLEAR_FUNC(arch) \
void dsp_clear_##arch(void *obj, void * SPA_RESTRICT dst, uint32_t n_samples)
void dsp_clear_##arch(void *obj, float * SPA_RESTRICT dst, uint32_t n_samples)
#define MAKE_COPY_FUNC(arch) \
void dsp_copy_##arch(void *obj, void * SPA_RESTRICT dst, \
const void * SPA_RESTRICT src, uint32_t n_samples)
void dsp_copy_##arch(void *obj, float * SPA_RESTRICT dst, \
const float * SPA_RESTRICT src, uint32_t n_samples)
#define MAKE_MIX_GAIN_FUNC(arch) \
void dsp_mix_gain_##arch(void *obj, void * SPA_RESTRICT dst, \
const void * SPA_RESTRICT src[], float gain[], uint32_t n_src, uint32_t n_samples)
void dsp_mix_gain_##arch(void *obj, float * SPA_RESTRICT dst, \
const float * SPA_RESTRICT src[], uint32_t n_src, float gain[], uint32_t n_gain, uint32_t n_samples)
#define MAKE_SUM_FUNC(arch) \
void dsp_sum_##arch (void *obj, float * SPA_RESTRICT dst, \
const float * SPA_RESTRICT a, const float * SPA_RESTRICT b, uint32_t n_samples)
@ -25,8 +25,8 @@ void dsp_sum_##arch (void *obj, float * SPA_RESTRICT dst, \
void dsp_linear_##arch (void *obj, float * SPA_RESTRICT dst, \
const float * SPA_RESTRICT src, const float mult, const float add, uint32_t n_samples)
#define MAKE_MULT_FUNC(arch) \
void dsp_mult_##arch(void *obj, void * SPA_RESTRICT dst, \
const void * SPA_RESTRICT src[], uint32_t n_src, uint32_t n_samples)
void dsp_mult_##arch(void *obj, float * SPA_RESTRICT dst, \
const float * SPA_RESTRICT src[], uint32_t n_src, uint32_t n_samples)
#define MAKE_BIQUAD_RUN_FUNC(arch) \
void dsp_biquad_run_##arch (void *obj, struct biquad *bq, uint32_t n_bq, uint32_t bq_stride, \
float * SPA_RESTRICT out[], const float * SPA_RESTRICT in[], uint32_t n_src, uint32_t n_samples)

View file

@ -19,10 +19,153 @@
#include <xmmintrin.h>
/* Sum n_src float buffers sample-wise into dst, with no gain applied.
 * SSE counterpart of the AVX version: 16-samples-per-iteration loop
 * (4 x __m128) when dst and every src[i] are 16-byte aligned; the
 * remainder (or everything, when unaligned) is handled by the scalar
 * tail loop.
 * NOTE(review): assumes n_src >= 1 — the caller (dsp_mix_gain_sse)
 * handles n_src == 0 separately. obj is unused. */
static void dsp_add_sse(void *obj, float *dst, const float * SPA_RESTRICT src[],
uint32_t n_src, uint32_t n_samples)
{
uint32_t n, i, unrolled;
__m128 in[4];
const float **s = (const float **)src;
float *d = dst;
/* Only unroll when dst and every source are 16-byte aligned. */
if (SPA_LIKELY(SPA_IS_ALIGNED(dst, 16))) {
/* round n_samples down to a multiple of 16 */
unrolled = n_samples & ~15;
for (i = 0; i < n_src; i++) {
if (SPA_UNLIKELY(!SPA_IS_ALIGNED(src[i], 16))) {
unrolled = 0;
break;
}
}
} else
unrolled = 0;
/* Unrolled loop: load 16 samples of the first source, accumulate the
 * remaining sources, store the sums. */
for (n = 0; n < unrolled; n += 16) {
in[0] = _mm_load_ps(&s[0][n+ 0]);
in[1] = _mm_load_ps(&s[0][n+ 4]);
in[2] = _mm_load_ps(&s[0][n+ 8]);
in[3] = _mm_load_ps(&s[0][n+12]);
for (i = 1; i < n_src; i++) {
in[0] = _mm_add_ps(in[0], _mm_load_ps(&s[i][n+ 0]));
in[1] = _mm_add_ps(in[1], _mm_load_ps(&s[i][n+ 4]));
in[2] = _mm_add_ps(in[2], _mm_load_ps(&s[i][n+ 8]));
in[3] = _mm_add_ps(in[3], _mm_load_ps(&s[i][n+12]));
}
_mm_store_ps(&d[n+ 0], in[0]);
_mm_store_ps(&d[n+ 4], in[1]);
_mm_store_ps(&d[n+ 8], in[2]);
_mm_store_ps(&d[n+12], in[3]);
}
/* Scalar tail, reusing in[0] for single-sample accumulation. */
for (; n < n_samples; n++) {
in[0] = _mm_load_ss(&s[0][n]);
for (i = 1; i < n_src; i++)
in[0] = _mm_add_ss(in[0], _mm_load_ss(&s[i][n]));
_mm_store_ss(&d[n], in[0]);
}
}
/* Sum n_src float buffers into dst and scale the result by a single
 * shared gain: dst[n] = gain * (src[0][n] + ... + src[n_src-1][n]).
 * 16-samples-per-iteration SSE loop when dst and all sources are
 * 16-byte aligned; scalar tail otherwise.
 * NOTE(review): assumes n_src >= 1 — n_src == 0 is handled by the
 * caller (dsp_mix_gain_sse). obj is unused. */
static void dsp_add_1_gain_sse(void *obj,
float * SPA_RESTRICT dst,
const float * SPA_RESTRICT src[], uint32_t n_src,
float gain, uint32_t n_samples)
{
uint32_t n, i, unrolled;
__m128 in[4], g;
const float **s = (const float **)src;
float *d = dst;
/* Only unroll when dst and every source are 16-byte aligned. */
if (SPA_LIKELY(SPA_IS_ALIGNED(dst, 16))) {
/* round n_samples down to a multiple of 16 */
unrolled = n_samples & ~15;
for (i = 0; i < n_src; i++) {
if (SPA_UNLIKELY(!SPA_IS_ALIGNED(src[i], 16))) {
unrolled = 0;
break;
}
}
} else
unrolled = 0;
/* Broadcast the gain once; the scalar tail uses lane 0 of the same g. */
g = _mm_set1_ps(gain);
for (n = 0; n < unrolled; n += 16) {
in[0] = _mm_load_ps(&s[0][n+ 0]);
in[1] = _mm_load_ps(&s[0][n+ 4]);
in[2] = _mm_load_ps(&s[0][n+ 8]);
in[3] = _mm_load_ps(&s[0][n+12]);
for (i = 1; i < n_src; i++) {
in[0] = _mm_add_ps(in[0], _mm_load_ps(&s[i][n+ 0]));
in[1] = _mm_add_ps(in[1], _mm_load_ps(&s[i][n+ 4]));
in[2] = _mm_add_ps(in[2], _mm_load_ps(&s[i][n+ 8]));
in[3] = _mm_add_ps(in[3], _mm_load_ps(&s[i][n+12]));
}
/* apply the shared gain once, on the final sums */
_mm_store_ps(&d[n+ 0], _mm_mul_ps(in[0], g));
_mm_store_ps(&d[n+ 4], _mm_mul_ps(in[1], g));
_mm_store_ps(&d[n+ 8], _mm_mul_ps(in[2], g));
_mm_store_ps(&d[n+12], _mm_mul_ps(in[3], g));
}
/* Scalar tail, reusing in[0] for single-sample accumulation. */
for (; n < n_samples; n++) {
in[0] = _mm_load_ss(&s[0][n]);
for (i = 1; i < n_src; i++)
in[0] = _mm_add_ss(in[0], _mm_load_ss(&s[i][n]));
_mm_store_ss(&d[n], _mm_mul_ss(in[0], g));
}
}
/* Mix n_src float buffers into dst with a per-source gain:
 * dst[n] = sum_i(gain[i] * src[i][n]).
 * 16-samples-per-iteration SSE loop when dst and all sources are
 * 16-byte aligned; scalar tail otherwise.
 * NOTE(review): gain[i] is read for every i < n_src, so this requires
 * n_gain >= n_src (n_gain itself is not read); the dispatcher
 * (dsp_mix_gain_sse) only calls this when n_gain >= n_src. Assumes
 * n_src >= 1. obj is unused. */
static void dsp_add_n_gain_sse(void *obj,
float * SPA_RESTRICT dst,
const float * SPA_RESTRICT src[], uint32_t n_src,
float gain[], uint32_t n_gain, uint32_t n_samples)
{
uint32_t n, i, unrolled;
__m128 in[4], g;
const float **s = (const float **)src;
float *d = dst;
/* Only unroll when dst and every source are 16-byte aligned. */
if (SPA_LIKELY(SPA_IS_ALIGNED(dst, 16))) {
/* round n_samples down to a multiple of 16 */
unrolled = n_samples & ~15;
for (i = 0; i < n_src; i++) {
if (SPA_UNLIKELY(!SPA_IS_ALIGNED(src[i], 16))) {
unrolled = 0;
break;
}
}
} else
unrolled = 0;
for (n = 0; n < unrolled; n += 16) {
/* first source initializes the accumulators with its own gain */
g = _mm_set1_ps(gain[0]);
in[0] = _mm_mul_ps(g, _mm_load_ps(&s[0][n+ 0]));
in[1] = _mm_mul_ps(g, _mm_load_ps(&s[0][n+ 4]));
in[2] = _mm_mul_ps(g, _mm_load_ps(&s[0][n+ 8]));
in[3] = _mm_mul_ps(g, _mm_load_ps(&s[0][n+12]));
for (i = 1; i < n_src; i++) {
g = _mm_set1_ps(gain[i]);
in[0] = _mm_add_ps(in[0], _mm_mul_ps(g, _mm_load_ps(&s[i][n+ 0])));
in[1] = _mm_add_ps(in[1], _mm_mul_ps(g, _mm_load_ps(&s[i][n+ 4])));
in[2] = _mm_add_ps(in[2], _mm_mul_ps(g, _mm_load_ps(&s[i][n+ 8])));
in[3] = _mm_add_ps(in[3], _mm_mul_ps(g, _mm_load_ps(&s[i][n+12])));
}
_mm_store_ps(&d[n+ 0], in[0]);
_mm_store_ps(&d[n+ 4], in[1]);
_mm_store_ps(&d[n+ 8], in[2]);
_mm_store_ps(&d[n+12], in[3]);
}
/* Scalar tail, reusing in[0] and g one sample at a time. */
for (; n < n_samples; n++) {
g = _mm_set_ss(gain[0]);
in[0] = _mm_mul_ss(g, _mm_load_ss(&s[0][n]));
for (i = 1; i < n_src; i++) {
g = _mm_set_ss(gain[i]);
in[0] = _mm_add_ss(in[0], _mm_mul_ss(g, _mm_load_ss(&s[i][n])));
}
_mm_store_ss(&d[n], in[0]);
}
}
void dsp_mix_gain_sse(void *obj,
void * SPA_RESTRICT dst,
const void * SPA_RESTRICT src[],
float gain[], uint32_t n_src, uint32_t n_samples)
float * SPA_RESTRICT dst,
const float * SPA_RESTRICT src[], uint32_t n_src,
float gain[], uint32_t n_gain, uint32_t n_samples)
{
if (n_src == 0) {
memset(dst, 0, n_samples * sizeof(float));
@ -30,50 +173,12 @@ void dsp_mix_gain_sse(void *obj,
if (dst != src[0])
spa_memcpy(dst, src[0], n_samples * sizeof(float));
} else {
uint32_t n, i, unrolled;
__m128 in[4], g;
const float **s = (const float **)src;
float *d = dst;
if (SPA_LIKELY(SPA_IS_ALIGNED(dst, 16))) {
unrolled = n_samples & ~15;
for (i = 0; i < n_src; i++) {
if (SPA_UNLIKELY(!SPA_IS_ALIGNED(src[i], 16))) {
unrolled = 0;
break;
}
}
} else
unrolled = 0;
for (n = 0; n < unrolled; n += 16) {
g = _mm_set1_ps(gain[0]);
in[0] = _mm_mul_ps(g, _mm_load_ps(&s[0][n+ 0]));
in[1] = _mm_mul_ps(g, _mm_load_ps(&s[0][n+ 4]));
in[2] = _mm_mul_ps(g, _mm_load_ps(&s[0][n+ 8]));
in[3] = _mm_mul_ps(g, _mm_load_ps(&s[0][n+12]));
for (i = 1; i < n_src; i++) {
g = _mm_set1_ps(gain[i]);
in[0] = _mm_add_ps(in[0], _mm_mul_ps(g, _mm_load_ps(&s[i][n+ 0])));
in[1] = _mm_add_ps(in[1], _mm_mul_ps(g, _mm_load_ps(&s[i][n+ 4])));
in[2] = _mm_add_ps(in[2], _mm_mul_ps(g, _mm_load_ps(&s[i][n+ 8])));
in[3] = _mm_add_ps(in[3], _mm_mul_ps(g, _mm_load_ps(&s[i][n+12])));
}
_mm_store_ps(&d[n+ 0], in[0]);
_mm_store_ps(&d[n+ 4], in[1]);
_mm_store_ps(&d[n+ 8], in[2]);
_mm_store_ps(&d[n+12], in[3]);
}
for (; n < n_samples; n++) {
g = _mm_set_ss(gain[0]);
in[0] = _mm_mul_ss(g, _mm_load_ss(&s[0][n]));
for (i = 1; i < n_src; i++) {
g = _mm_set_ss(gain[i]);
in[0] = _mm_add_ss(in[0], _mm_mul_ss(g, _mm_load_ss(&s[i][n])));
}
_mm_store_ss(&d[n], in[0]);
}
if (n_gain == 0)
dsp_add_sse(obj, dst, src, n_src, n_samples);
else if (n_gain < n_src)
dsp_add_1_gain_sse(obj, dst, src, n_src, gain[0], n_samples);
else
dsp_add_n_gain_sse(obj, dst, src, n_src, gain, n_gain, n_samples);
}
}

View file

@ -22,14 +22,14 @@ struct spa_fga_dsp_methods {
#define SPA_VERSION_FGA_DSP_METHODS 0
uint32_t version;
void (*clear) (void *obj, void * SPA_RESTRICT dst, uint32_t n_samples);
void (*clear) (void *obj, float * SPA_RESTRICT dst, uint32_t n_samples);
void (*copy) (void *obj,
void * SPA_RESTRICT dst,
const void * SPA_RESTRICT src, uint32_t n_samples);
float * SPA_RESTRICT dst,
const float * SPA_RESTRICT src, uint32_t n_samples);
void (*mix_gain) (void *obj,
void * SPA_RESTRICT dst,
const void * SPA_RESTRICT src[],
float gain[], uint32_t n_src, uint32_t n_samples);
float * SPA_RESTRICT dst,
const float * SPA_RESTRICT src[], uint32_t n_src,
float gain[], uint32_t n_gain, uint32_t n_samples);
void (*sum) (void *obj,
float * dst, const float * SPA_RESTRICT a,
const float * SPA_RESTRICT b, uint32_t n_samples);
@ -52,8 +52,8 @@ struct spa_fga_dsp_methods {
float * dst, const float * SPA_RESTRICT src,
const float mult, const float add, uint32_t n_samples);
void (*mult) (void *obj,
void * SPA_RESTRICT dst,
const void * SPA_RESTRICT src[], uint32_t n_src, uint32_t n_samples);
float * SPA_RESTRICT dst,
const float * SPA_RESTRICT src[], uint32_t n_src, uint32_t n_samples);
void (*biquad_run) (void *obj, struct biquad *bq, uint32_t n_bq, uint32_t bq_stride,
float * SPA_RESTRICT out[], const float * SPA_RESTRICT in[],
uint32_t n_src, uint32_t n_samples);
@ -61,25 +61,25 @@ struct spa_fga_dsp_methods {
float *dst, const float *src, uint32_t n_samples);
};
static inline void spa_fga_dsp_clear(struct spa_fga_dsp *obj, void * SPA_RESTRICT dst, uint32_t n_samples)
static inline void spa_fga_dsp_clear(struct spa_fga_dsp *obj, float * SPA_RESTRICT dst, uint32_t n_samples)
{
spa_api_method_v(spa_fga_dsp, &obj->iface, clear, 0,
dst, n_samples);
}
static inline void spa_fga_dsp_copy(struct spa_fga_dsp *obj,
void * SPA_RESTRICT dst,
const void * SPA_RESTRICT src, uint32_t n_samples)
float * SPA_RESTRICT dst,
const float * SPA_RESTRICT src, uint32_t n_samples)
{
spa_api_method_v(spa_fga_dsp, &obj->iface, copy, 0,
dst, src, n_samples);
}
static inline void spa_fga_dsp_mix_gain(struct spa_fga_dsp *obj,
void * SPA_RESTRICT dst,
const void * SPA_RESTRICT src[],
float gain[], uint32_t n_src, uint32_t n_samples)
float * SPA_RESTRICT dst,
const float * SPA_RESTRICT src[], uint32_t n_src,
float gain[], uint32_t n_gain, uint32_t n_samples)
{
spa_api_method_v(spa_fga_dsp, &obj->iface, mix_gain, 0,
dst, src, gain, n_src, n_samples);
dst, src, n_src, gain, n_gain, n_samples);
}
static inline void spa_fga_dsp_sum(struct spa_fga_dsp *obj,
float * dst, const float * SPA_RESTRICT a,
@ -143,8 +143,8 @@ static inline void spa_fga_dsp_linear(struct spa_fga_dsp *obj,
dst, src, mult, add, n_samples);
}
static inline void spa_fga_dsp_mult(struct spa_fga_dsp *obj,
void * SPA_RESTRICT dst,
const void * SPA_RESTRICT src[], uint32_t n_src, uint32_t n_samples)
float * SPA_RESTRICT dst,
const float * SPA_RESTRICT src[], uint32_t n_src, uint32_t n_samples)
{
spa_api_method_v(spa_fga_dsp, &obj->iface, mult, 0,
dst, src, n_src, n_samples);

View file

@ -121,8 +121,9 @@ static void mixer_run(void * Instance, unsigned long SampleCount)
struct builtin *impl = Instance;
int i, n_src = 0;
float *out = impl->port[0];
const void *src[8];
const float *src[8];
float gains[8];
bool eq_gain = true;
if (out == NULL)
return;
@ -136,8 +137,13 @@ static void mixer_run(void * Instance, unsigned long SampleCount)
src[n_src] = in;
gains[n_src++] = gain;
if (gain != gains[0])
eq_gain = false;
}
spa_fga_dsp_mix_gain(impl->dsp, out, src, gains, n_src, SampleCount);
if (eq_gain)
spa_fga_dsp_mix_gain(impl->dsp, out, src, n_src, gains, 1, SampleCount);
else
spa_fga_dsp_mix_gain(impl->dsp, out, src, n_src, gains, n_src, SampleCount);
}
static struct spa_fga_port mixer_ports[] = {
@ -1589,7 +1595,7 @@ static void mult_run(void * Instance, unsigned long SampleCount)
struct builtin *impl = Instance;
int i, n_src = 0;
float *out = impl->port[0];
const void *src[8];
const float *src[8];
if (out == NULL)
return;