mirror of
https://gitlab.freedesktop.org/pipewire/pipewire.git
synced 2026-02-16 22:05:31 -05:00
filter-graph: optimize mix function a little
Make special cases for no and 1 gain value when mixing.
This commit is contained in:
parent
efa615945e
commit
d6030adada
6 changed files with 367 additions and 141 deletions
|
|
@ -16,10 +16,156 @@
|
|||
|
||||
#include <immintrin.h>
|
||||
|
||||
/*
 * Sum n_src float streams into dst (no gain applied).
 *
 * Fast path: when dst and every src[i] are 32-byte aligned, 32 samples
 * are processed per iteration using four 8-wide AVX registers; the
 * remainder (and the fully unaligned case) is handled one sample at a
 * time with scalar SSE ops.
 *
 * NOTE(review): s[0] is dereferenced unconditionally, so this assumes
 * n_src >= 1 — confirm callers (the mix_gain dispatcher) never pass 0.
 */
static void dsp_add_avx(void *obj, float *dst, const float * SPA_RESTRICT src[],
		uint32_t n_src, uint32_t n_samples)
{
	uint32_t n, i, unrolled;
	__m256 in[4];
	const float **s = (const float **)src;
	float *d = dst;

	/* unrolled = number of samples eligible for the aligned SIMD path:
	 * n_samples rounded down to a multiple of 32, or 0 as soon as any
	 * pointer fails the 32-byte alignment required by _mm256_load_ps */
	if (SPA_LIKELY(SPA_IS_ALIGNED(dst, 32))) {
		unrolled = n_samples & ~31;
		for (i = 0; i < n_src; i++) {
			if (SPA_UNLIKELY(!SPA_IS_ALIGNED(src[i], 32))) {
				unrolled = 0;
				break;
			}
		}
	} else
		unrolled = 0;

	for (n = 0; n < unrolled; n += 32) {
		/* seed the accumulators from the first source ... */
		in[0] = _mm256_load_ps(&s[0][n+ 0]);
		in[1] = _mm256_load_ps(&s[0][n+ 8]);
		in[2] = _mm256_load_ps(&s[0][n+16]);
		in[3] = _mm256_load_ps(&s[0][n+24]);

		/* ... then accumulate the remaining sources */
		for (i = 1; i < n_src; i++) {
			in[0] = _mm256_add_ps(in[0], _mm256_load_ps(&s[i][n+ 0]));
			in[1] = _mm256_add_ps(in[1], _mm256_load_ps(&s[i][n+ 8]));
			in[2] = _mm256_add_ps(in[2], _mm256_load_ps(&s[i][n+16]));
			in[3] = _mm256_add_ps(in[3], _mm256_load_ps(&s[i][n+24]));
		}
		_mm256_store_ps(&d[n+ 0], in[0]);
		_mm256_store_ps(&d[n+ 8], in[1]);
		_mm256_store_ps(&d[n+16], in[2]);
		_mm256_store_ps(&d[n+24], in[3]);
	}
	/* scalar tail; the local __m128 in[] intentionally shadows the
	 * __m256 accumulators above */
	for (; n < n_samples; n++) {
		__m128 in[1];
		in[0] = _mm_load_ss(&s[0][n]);
		for (i = 1; i < n_src; i++)
			in[0] = _mm_add_ss(in[0], _mm_load_ss(&s[i][n]));
		_mm_store_ss(&d[n], in[0]);
	}
}
|
||||
|
||||
/*
 * Sum n_src float streams into dst and scale the result by one shared
 * gain factor (the "1 gain value" special case of mix_gain).
 *
 * Same structure as dsp_add_avx: 32-samples-per-iteration AVX fast path
 * when everything is 32-byte aligned, scalar SSE tail otherwise. The
 * gain multiply is applied once to the accumulated sum, not per source.
 *
 * NOTE(review): assumes n_src >= 1 (s[0] read unconditionally).
 */
static void dsp_add_1_gain_avx(void *obj, float *dst, const float * SPA_RESTRICT src[],
		uint32_t n_src, float gain, uint32_t n_samples)
{
	uint32_t n, i, unrolled;
	__m256 in[4], g;
	const float **s = (const float **)src;
	float *d = dst;
	__m128 g1;

	/* unrolled: multiple of 32 when all pointers are 32-byte aligned, else 0 */
	if (SPA_LIKELY(SPA_IS_ALIGNED(dst, 32))) {
		unrolled = n_samples & ~31;
		for (i = 0; i < n_src; i++) {
			if (SPA_UNLIKELY(!SPA_IS_ALIGNED(src[i], 32))) {
				unrolled = 0;
				break;
			}
		}
	} else
		unrolled = 0;

	/* gain broadcast: g for the 8-wide path, g1 for the scalar tail */
	g = _mm256_set1_ps(gain);
	g1 = _mm_set_ss(gain);

	for (n = 0; n < unrolled; n += 32) {
		in[0] = _mm256_load_ps(&s[0][n+ 0]);
		in[1] = _mm256_load_ps(&s[0][n+ 8]);
		in[2] = _mm256_load_ps(&s[0][n+16]);
		in[3] = _mm256_load_ps(&s[0][n+24]);

		for (i = 1; i < n_src; i++) {
			in[0] = _mm256_add_ps(in[0], _mm256_load_ps(&s[i][n+ 0]));
			in[1] = _mm256_add_ps(in[1], _mm256_load_ps(&s[i][n+ 8]));
			in[2] = _mm256_add_ps(in[2], _mm256_load_ps(&s[i][n+16]));
			in[3] = _mm256_add_ps(in[3], _mm256_load_ps(&s[i][n+24]));
		}
		/* single gain multiply on the way out */
		_mm256_store_ps(&d[n+ 0], _mm256_mul_ps(g, in[0]));
		_mm256_store_ps(&d[n+ 8], _mm256_mul_ps(g, in[1]));
		_mm256_store_ps(&d[n+16], _mm256_mul_ps(g, in[2]));
		_mm256_store_ps(&d[n+24], _mm256_mul_ps(g, in[3]));
	}
	/* scalar tail */
	for (; n < n_samples; n++) {
		__m128 in[1];
		in[0] = _mm_load_ss(&s[0][n]);
		for (i = 1; i < n_src; i++)
			in[0] = _mm_add_ss(in[0], _mm_load_ss(&s[i][n]));
		_mm_store_ss(&d[n], _mm_mul_ss(g1, in[0]));
	}
}
|
||||
|
||||
/*
 * Mix n_src float streams into dst, applying a separate gain per source:
 * dst[n] = sum_i gain[i] * src[i][n].
 *
 * 32-samples-per-iteration AVX fast path when dst and all sources are
 * 32-byte aligned; scalar SSE tail otherwise.
 *
 * NOTE(review): assumes n_src >= 1 and n_gain >= n_src (gain[i] is read
 * for every source; n_gain itself is never checked here) — confirm the
 * dispatcher guarantees this.
 * NOTE(review): the _mm256_set1_ps(gain[0]) / _mm_set_ss(gain[0]) calls
 * are loop-invariant and could be hoisted out of the n loops.
 */
static void dsp_add_n_gain_avx(void *obj, float *dst,
		const float * SPA_RESTRICT src[], uint32_t n_src,
		float gain[], uint32_t n_gain, uint32_t n_samples)
{
	uint32_t n, i, unrolled;
	__m256 in[4], g;
	const float **s = (const float **)src;
	float *d = dst;

	/* unrolled: multiple of 32 when all pointers are 32-byte aligned, else 0 */
	if (SPA_LIKELY(SPA_IS_ALIGNED(dst, 32))) {
		unrolled = n_samples & ~31;
		for (i = 0; i < n_src; i++) {
			if (SPA_UNLIKELY(!SPA_IS_ALIGNED(src[i], 32))) {
				unrolled = 0;
				break;
			}
		}
	} else
		unrolled = 0;

	for (n = 0; n < unrolled; n += 32) {
		/* first source initializes the accumulators, pre-scaled */
		g = _mm256_set1_ps(gain[0]);
		in[0] = _mm256_mul_ps(g, _mm256_load_ps(&s[0][n+ 0]));
		in[1] = _mm256_mul_ps(g, _mm256_load_ps(&s[0][n+ 8]));
		in[2] = _mm256_mul_ps(g, _mm256_load_ps(&s[0][n+16]));
		in[3] = _mm256_mul_ps(g, _mm256_load_ps(&s[0][n+24]));

		for (i = 1; i < n_src; i++) {
			g = _mm256_set1_ps(gain[i]);
			in[0] = _mm256_add_ps(in[0], _mm256_mul_ps(g, _mm256_load_ps(&s[i][n+ 0])));
			in[1] = _mm256_add_ps(in[1], _mm256_mul_ps(g, _mm256_load_ps(&s[i][n+ 8])));
			in[2] = _mm256_add_ps(in[2], _mm256_mul_ps(g, _mm256_load_ps(&s[i][n+16])));
			in[3] = _mm256_add_ps(in[3], _mm256_mul_ps(g, _mm256_load_ps(&s[i][n+24])));
		}
		_mm256_store_ps(&d[n+ 0], in[0]);
		_mm256_store_ps(&d[n+ 8], in[1]);
		_mm256_store_ps(&d[n+16], in[2]);
		_mm256_store_ps(&d[n+24], in[3]);
	}
	/* scalar tail; local in[]/g shadow the __m256 variables above */
	for (; n < n_samples; n++) {
		__m128 in[1], g;
		g = _mm_set_ss(gain[0]);
		in[0] = _mm_mul_ss(g, _mm_load_ss(&s[0][n]));
		for (i = 1; i < n_src; i++) {
			g = _mm_set_ss(gain[i]);
			in[0] = _mm_add_ss(in[0], _mm_mul_ss(g, _mm_load_ss(&s[i][n])));
		}
		_mm_store_ss(&d[n], in[0]);
	}
}
|
||||
|
||||
|
||||
void dsp_mix_gain_avx(void *obj,
|
||||
void * SPA_RESTRICT dst,
|
||||
const void * SPA_RESTRICT src[],
|
||||
float gain[], uint32_t n_src, uint32_t n_samples)
|
||||
float * SPA_RESTRICT dst,
|
||||
const float * SPA_RESTRICT src[], uint32_t n_src,
|
||||
float gain[], uint32_t n_gain, uint32_t n_samples)
|
||||
{
|
||||
if (n_src == 0) {
|
||||
memset(dst, 0, n_samples * sizeof(float));
|
||||
|
|
@ -27,51 +173,12 @@ void dsp_mix_gain_avx(void *obj,
|
|||
if (dst != src[0])
|
||||
spa_memcpy(dst, src[0], n_samples * sizeof(float));
|
||||
} else {
|
||||
uint32_t n, i, unrolled;
|
||||
__m256 in[4], g;
|
||||
const float **s = (const float **)src;
|
||||
float *d = dst;
|
||||
|
||||
if (SPA_LIKELY(SPA_IS_ALIGNED(dst, 32))) {
|
||||
unrolled = n_samples & ~31;
|
||||
for (i = 0; i < n_src; i++) {
|
||||
if (SPA_UNLIKELY(!SPA_IS_ALIGNED(src[i], 32))) {
|
||||
unrolled = 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else
|
||||
unrolled = 0;
|
||||
|
||||
for (n = 0; n < unrolled; n += 32) {
|
||||
g = _mm256_set1_ps(gain[0]);
|
||||
in[0] = _mm256_mul_ps(g, _mm256_load_ps(&s[0][n+ 0]));
|
||||
in[1] = _mm256_mul_ps(g, _mm256_load_ps(&s[0][n+ 8]));
|
||||
in[2] = _mm256_mul_ps(g, _mm256_load_ps(&s[0][n+16]));
|
||||
in[3] = _mm256_mul_ps(g, _mm256_load_ps(&s[0][n+24]));
|
||||
|
||||
for (i = 1; i < n_src; i++) {
|
||||
g = _mm256_set1_ps(gain[i]);
|
||||
in[0] = _mm256_add_ps(in[0], _mm256_mul_ps(g, _mm256_load_ps(&s[i][n+ 0])));
|
||||
in[1] = _mm256_add_ps(in[1], _mm256_mul_ps(g, _mm256_load_ps(&s[i][n+ 8])));
|
||||
in[2] = _mm256_add_ps(in[2], _mm256_mul_ps(g, _mm256_load_ps(&s[i][n+16])));
|
||||
in[3] = _mm256_add_ps(in[3], _mm256_mul_ps(g, _mm256_load_ps(&s[i][n+24])));
|
||||
}
|
||||
_mm256_store_ps(&d[n+ 0], in[0]);
|
||||
_mm256_store_ps(&d[n+ 8], in[1]);
|
||||
_mm256_store_ps(&d[n+16], in[2]);
|
||||
_mm256_store_ps(&d[n+24], in[3]);
|
||||
}
|
||||
for (; n < n_samples; n++) {
|
||||
__m128 in[1], g;
|
||||
g = _mm_set_ss(gain[0]);
|
||||
in[0] = _mm_mul_ss(g, _mm_load_ss(&s[0][n]));
|
||||
for (i = 1; i < n_src; i++) {
|
||||
g = _mm_set_ss(gain[i]);
|
||||
in[0] = _mm_add_ss(in[0], _mm_mul_ss(g, _mm_load_ss(&s[i][n])));
|
||||
}
|
||||
_mm_store_ss(&d[n], in[0]);
|
||||
}
|
||||
if (n_gain == 0)
|
||||
dsp_add_avx(obj, dst, src, n_src, n_samples);
|
||||
else if (n_gain < n_src)
|
||||
dsp_add_1_gain_avx(obj, dst, src, n_src, gain[0], n_samples);
|
||||
else
|
||||
dsp_add_n_gain_avx(obj, dst, src, n_src, gain, n_gain, n_samples);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -18,20 +18,20 @@
|
|||
#endif
|
||||
#include "audio-dsp-impl.h"
|
||||
|
||||
/* Zero n_samples floats in dst (all-bits-zero is 0.0f in IEEE 754). */
void dsp_clear_c(void *obj, float * SPA_RESTRICT dst, uint32_t n_samples)
{
	memset(dst, 0, n_samples * sizeof(float));
}
|
||||
|
||||
/* Copy n_samples floats from src to dst; a self-copy is a no-op. */
void dsp_copy_c(void *obj, float * SPA_RESTRICT dst,
		const float * SPA_RESTRICT src, uint32_t n_samples)
{
	if (dst == src)
		return;
	spa_memcpy(dst, src, sizeof(float) * n_samples);
}
|
||||
|
||||
static inline void dsp_add_c(void *obj, void * SPA_RESTRICT dst,
|
||||
const void * SPA_RESTRICT src, uint32_t n_samples)
|
||||
static inline void dsp_add_c(void *obj, float * SPA_RESTRICT dst,
|
||||
const float * SPA_RESTRICT src, uint32_t n_samples)
|
||||
{
|
||||
uint32_t i;
|
||||
const float *s = src;
|
||||
|
|
@ -40,8 +40,8 @@ static inline void dsp_add_c(void *obj, void * SPA_RESTRICT dst,
|
|||
d[i] += s[i];
|
||||
}
|
||||
|
||||
static inline void dsp_gain_c(void *obj, void * SPA_RESTRICT dst,
|
||||
const void * SPA_RESTRICT src, float gain, uint32_t n_samples)
|
||||
static inline void dsp_gain_c(void *obj, float * dst,
|
||||
const float * src, float gain, uint32_t n_samples)
|
||||
{
|
||||
uint32_t i;
|
||||
const float *s = src;
|
||||
|
|
@ -56,8 +56,8 @@ static inline void dsp_gain_c(void *obj, void * SPA_RESTRICT dst,
|
|||
}
|
||||
}
|
||||
|
||||
static inline void dsp_gain_add_c(void *obj, void * SPA_RESTRICT dst,
|
||||
const void * SPA_RESTRICT src, float gain, uint32_t n_samples)
|
||||
static inline void dsp_gain_add_c(void *obj, float * SPA_RESTRICT dst,
|
||||
const float * SPA_RESTRICT src, float gain, uint32_t n_samples)
|
||||
{
|
||||
uint32_t i;
|
||||
const float *s = src;
|
||||
|
|
@ -75,22 +75,30 @@ static inline void dsp_gain_add_c(void *obj, void * SPA_RESTRICT dst,
|
|||
|
||||
|
||||
void dsp_mix_gain_c(void *obj,
|
||||
void * SPA_RESTRICT dst,
|
||||
const void * SPA_RESTRICT src[],
|
||||
float gain[], uint32_t n_src, uint32_t n_samples)
|
||||
float * SPA_RESTRICT dst,
|
||||
const float * SPA_RESTRICT src[], uint32_t n_src,
|
||||
float gain[], uint32_t n_gain, uint32_t n_samples)
|
||||
{
|
||||
uint32_t i;
|
||||
if (n_src == 0) {
|
||||
dsp_clear_c(obj, dst, n_samples);
|
||||
} else {
|
||||
dsp_gain_c(obj, dst, src[0], gain[0], n_samples);
|
||||
for (i = 1; i < n_src; i++)
|
||||
dsp_gain_add_c(obj, dst, src[i], gain[i], n_samples);
|
||||
if (n_gain < n_src) {
|
||||
dsp_copy_c(obj, dst, src[0], n_samples);
|
||||
for (i = 1; i < n_src; i++)
|
||||
dsp_add_c(obj, dst, src[i], n_samples);
|
||||
if (n_gain > 0)
|
||||
dsp_gain_c(obj, dst, dst, gain[0], n_samples);
|
||||
} else {
|
||||
dsp_gain_c(obj, dst, src[0], gain[0], n_samples);
|
||||
for (i = 1; i < n_src; i++)
|
||||
dsp_gain_add_c(obj, dst, src[i], gain[i], n_samples);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static inline void dsp_mult1_c(void *obj, void * SPA_RESTRICT dst,
|
||||
const void * SPA_RESTRICT src, uint32_t n_samples)
|
||||
static inline void dsp_mult1_c(void *obj, float * SPA_RESTRICT dst,
|
||||
const float * SPA_RESTRICT src, uint32_t n_samples)
|
||||
{
|
||||
uint32_t i;
|
||||
const float *s = src;
|
||||
|
|
@ -100,8 +108,8 @@ static inline void dsp_mult1_c(void *obj, void * SPA_RESTRICT dst,
|
|||
}
|
||||
|
||||
void dsp_mult_c(void *obj,
|
||||
void * SPA_RESTRICT dst,
|
||||
const void * SPA_RESTRICT src[],
|
||||
float * SPA_RESTRICT dst,
|
||||
const float * SPA_RESTRICT src[],
|
||||
uint32_t n_src, uint32_t n_samples)
|
||||
{
|
||||
uint32_t i;
|
||||
|
|
|
|||
|
|
@ -11,13 +11,13 @@ struct spa_fga_dsp * spa_fga_dsp_new(uint32_t cpu_flags);
|
|||
void spa_fga_dsp_free(struct spa_fga_dsp *dsp);
|
||||
|
||||
#define MAKE_CLEAR_FUNC(arch) \
|
||||
void dsp_clear_##arch(void *obj, void * SPA_RESTRICT dst, uint32_t n_samples)
|
||||
void dsp_clear_##arch(void *obj, float * SPA_RESTRICT dst, uint32_t n_samples)
|
||||
#define MAKE_COPY_FUNC(arch) \
|
||||
void dsp_copy_##arch(void *obj, void * SPA_RESTRICT dst, \
|
||||
const void * SPA_RESTRICT src, uint32_t n_samples)
|
||||
void dsp_copy_##arch(void *obj, float * SPA_RESTRICT dst, \
|
||||
const float * SPA_RESTRICT src, uint32_t n_samples)
|
||||
#define MAKE_MIX_GAIN_FUNC(arch) \
|
||||
void dsp_mix_gain_##arch(void *obj, void * SPA_RESTRICT dst, \
|
||||
const void * SPA_RESTRICT src[], float gain[], uint32_t n_src, uint32_t n_samples)
|
||||
void dsp_mix_gain_##arch(void *obj, float * SPA_RESTRICT dst, \
|
||||
const float * SPA_RESTRICT src[], uint32_t n_src, float gain[], uint32_t n_gain, uint32_t n_samples)
|
||||
#define MAKE_SUM_FUNC(arch) \
|
||||
void dsp_sum_##arch (void *obj, float * SPA_RESTRICT dst, \
|
||||
const float * SPA_RESTRICT a, const float * SPA_RESTRICT b, uint32_t n_samples)
|
||||
|
|
@ -25,8 +25,8 @@ void dsp_sum_##arch (void *obj, float * SPA_RESTRICT dst, \
|
|||
void dsp_linear_##arch (void *obj, float * SPA_RESTRICT dst, \
|
||||
const float * SPA_RESTRICT src, const float mult, const float add, uint32_t n_samples)
|
||||
#define MAKE_MULT_FUNC(arch) \
|
||||
void dsp_mult_##arch(void *obj, void * SPA_RESTRICT dst, \
|
||||
const void * SPA_RESTRICT src[], uint32_t n_src, uint32_t n_samples)
|
||||
void dsp_mult_##arch(void *obj, float * SPA_RESTRICT dst, \
|
||||
const float * SPA_RESTRICT src[], uint32_t n_src, uint32_t n_samples)
|
||||
#define MAKE_BIQUAD_RUN_FUNC(arch) \
|
||||
void dsp_biquad_run_##arch (void *obj, struct biquad *bq, uint32_t n_bq, uint32_t bq_stride, \
|
||||
float * SPA_RESTRICT out[], const float * SPA_RESTRICT in[], uint32_t n_src, uint32_t n_samples)
|
||||
|
|
|
|||
|
|
@ -19,10 +19,153 @@
|
|||
|
||||
#include <xmmintrin.h>
|
||||
|
||||
/*
 * Sum n_src float streams into dst (no gain applied), SSE variant.
 *
 * Fast path: when dst and every src[i] are 16-byte aligned, 16 samples
 * are processed per iteration with four 4-wide SSE registers; the
 * remainder (and the unaligned case) is handled one sample at a time.
 *
 * NOTE(review): assumes n_src >= 1 (s[0] read unconditionally).
 */
static void dsp_add_sse(void *obj, float *dst, const float * SPA_RESTRICT src[],
		uint32_t n_src, uint32_t n_samples)
{
	uint32_t n, i, unrolled;
	__m128 in[4];
	const float **s = (const float **)src;
	float *d = dst;

	/* unrolled: multiple of 16 when all pointers are 16-byte aligned, else 0 */
	if (SPA_LIKELY(SPA_IS_ALIGNED(dst, 16))) {
		unrolled = n_samples & ~15;
		for (i = 0; i < n_src; i++) {
			if (SPA_UNLIKELY(!SPA_IS_ALIGNED(src[i], 16))) {
				unrolled = 0;
				break;
			}
		}
	} else
		unrolled = 0;

	for (n = 0; n < unrolled; n += 16) {
		/* seed accumulators from the first source, then accumulate */
		in[0] = _mm_load_ps(&s[0][n+ 0]);
		in[1] = _mm_load_ps(&s[0][n+ 4]);
		in[2] = _mm_load_ps(&s[0][n+ 8]);
		in[3] = _mm_load_ps(&s[0][n+12]);

		for (i = 1; i < n_src; i++) {
			in[0] = _mm_add_ps(in[0], _mm_load_ps(&s[i][n+ 0]));
			in[1] = _mm_add_ps(in[1], _mm_load_ps(&s[i][n+ 4]));
			in[2] = _mm_add_ps(in[2], _mm_load_ps(&s[i][n+ 8]));
			in[3] = _mm_add_ps(in[3], _mm_load_ps(&s[i][n+12]));
		}
		_mm_store_ps(&d[n+ 0], in[0]);
		_mm_store_ps(&d[n+ 4], in[1]);
		_mm_store_ps(&d[n+ 8], in[2]);
		_mm_store_ps(&d[n+12], in[3]);
	}
	/* scalar tail, reusing in[0] as a single-lane accumulator */
	for (; n < n_samples; n++) {
		in[0] = _mm_load_ss(&s[0][n]);
		for (i = 1; i < n_src; i++)
			in[0] = _mm_add_ss(in[0], _mm_load_ss(&s[i][n]));
		_mm_store_ss(&d[n], in[0]);
	}
}
|
||||
|
||||
/*
 * Sum n_src float streams into dst and scale the result by one shared
 * gain factor, SSE variant (the "1 gain value" special case).
 *
 * 16-samples-per-iteration fast path when dst and all sources are
 * 16-byte aligned; scalar tail otherwise. The gain multiply is applied
 * once to the accumulated sum, not per source.
 *
 * NOTE(review): assumes n_src >= 1 (s[0] read unconditionally).
 */
static void dsp_add_1_gain_sse(void *obj,
		float * SPA_RESTRICT dst,
		const float * SPA_RESTRICT src[], uint32_t n_src,
		float gain, uint32_t n_samples)
{
	uint32_t n, i, unrolled;
	__m128 in[4], g;
	const float **s = (const float **)src;
	float *d = dst;

	/* unrolled: multiple of 16 when all pointers are 16-byte aligned, else 0 */
	if (SPA_LIKELY(SPA_IS_ALIGNED(dst, 16))) {
		unrolled = n_samples & ~15;
		for (i = 0; i < n_src; i++) {
			if (SPA_UNLIKELY(!SPA_IS_ALIGNED(src[i], 16))) {
				unrolled = 0;
				break;
			}
		}
	} else
		unrolled = 0;

	/* broadcast the shared gain once; lane 0 is used by the scalar tail */
	g = _mm_set1_ps(gain);

	for (n = 0; n < unrolled; n += 16) {
		in[0] = _mm_load_ps(&s[0][n+ 0]);
		in[1] = _mm_load_ps(&s[0][n+ 4]);
		in[2] = _mm_load_ps(&s[0][n+ 8]);
		in[3] = _mm_load_ps(&s[0][n+12]);

		for (i = 1; i < n_src; i++) {
			in[0] = _mm_add_ps(in[0], _mm_load_ps(&s[i][n+ 0]));
			in[1] = _mm_add_ps(in[1], _mm_load_ps(&s[i][n+ 4]));
			in[2] = _mm_add_ps(in[2], _mm_load_ps(&s[i][n+ 8]));
			in[3] = _mm_add_ps(in[3], _mm_load_ps(&s[i][n+12]));
		}
		/* single gain multiply on the way out */
		_mm_store_ps(&d[n+ 0], _mm_mul_ps(in[0], g));
		_mm_store_ps(&d[n+ 4], _mm_mul_ps(in[1], g));
		_mm_store_ps(&d[n+ 8], _mm_mul_ps(in[2], g));
		_mm_store_ps(&d[n+12], _mm_mul_ps(in[3], g));
	}
	/* scalar tail */
	for (; n < n_samples; n++) {
		in[0] = _mm_load_ss(&s[0][n]);
		for (i = 1; i < n_src; i++)
			in[0] = _mm_add_ss(in[0], _mm_load_ss(&s[i][n]));
		_mm_store_ss(&d[n], _mm_mul_ss(in[0], g));
	}
}
|
||||
|
||||
/*
 * Mix n_src float streams into dst with a separate gain per source:
 * dst[n] = sum_i gain[i] * src[i][n]. SSE variant.
 *
 * 16-samples-per-iteration fast path when dst and all sources are
 * 16-byte aligned; scalar tail otherwise.
 *
 * NOTE(review): assumes n_src >= 1 and n_gain >= n_src (gain[i] is read
 * for every source; n_gain itself is never checked here).
 * NOTE(review): the _mm_set1_ps(gain[0]) / _mm_set_ss(gain[0]) calls are
 * loop-invariant and could be hoisted out of the n loops.
 */
static void dsp_add_n_gain_sse(void *obj,
		float * SPA_RESTRICT dst,
		const float * SPA_RESTRICT src[], uint32_t n_src,
		float gain[], uint32_t n_gain, uint32_t n_samples)
{
	uint32_t n, i, unrolled;
	__m128 in[4], g;
	const float **s = (const float **)src;
	float *d = dst;

	/* unrolled: multiple of 16 when all pointers are 16-byte aligned, else 0 */
	if (SPA_LIKELY(SPA_IS_ALIGNED(dst, 16))) {
		unrolled = n_samples & ~15;
		for (i = 0; i < n_src; i++) {
			if (SPA_UNLIKELY(!SPA_IS_ALIGNED(src[i], 16))) {
				unrolled = 0;
				break;
			}
		}
	} else
		unrolled = 0;

	for (n = 0; n < unrolled; n += 16) {
		/* first source initializes the accumulators, pre-scaled */
		g = _mm_set1_ps(gain[0]);
		in[0] = _mm_mul_ps(g, _mm_load_ps(&s[0][n+ 0]));
		in[1] = _mm_mul_ps(g, _mm_load_ps(&s[0][n+ 4]));
		in[2] = _mm_mul_ps(g, _mm_load_ps(&s[0][n+ 8]));
		in[3] = _mm_mul_ps(g, _mm_load_ps(&s[0][n+12]));

		for (i = 1; i < n_src; i++) {
			g = _mm_set1_ps(gain[i]);
			in[0] = _mm_add_ps(in[0], _mm_mul_ps(g, _mm_load_ps(&s[i][n+ 0])));
			in[1] = _mm_add_ps(in[1], _mm_mul_ps(g, _mm_load_ps(&s[i][n+ 4])));
			in[2] = _mm_add_ps(in[2], _mm_mul_ps(g, _mm_load_ps(&s[i][n+ 8])));
			in[3] = _mm_add_ps(in[3], _mm_mul_ps(g, _mm_load_ps(&s[i][n+12])));
		}
		_mm_store_ps(&d[n+ 0], in[0]);
		_mm_store_ps(&d[n+ 4], in[1]);
		_mm_store_ps(&d[n+ 8], in[2]);
		_mm_store_ps(&d[n+12], in[3]);
	}
	/* scalar tail */
	for (; n < n_samples; n++) {
		g = _mm_set_ss(gain[0]);
		in[0] = _mm_mul_ss(g, _mm_load_ss(&s[0][n]));
		for (i = 1; i < n_src; i++) {
			g = _mm_set_ss(gain[i]);
			in[0] = _mm_add_ss(in[0], _mm_mul_ss(g, _mm_load_ss(&s[i][n])));
		}
		_mm_store_ss(&d[n], in[0]);
	}
}
|
||||
|
||||
void dsp_mix_gain_sse(void *obj,
|
||||
void * SPA_RESTRICT dst,
|
||||
const void * SPA_RESTRICT src[],
|
||||
float gain[], uint32_t n_src, uint32_t n_samples)
|
||||
float * SPA_RESTRICT dst,
|
||||
const float * SPA_RESTRICT src[], uint32_t n_src,
|
||||
float gain[], uint32_t n_gain, uint32_t n_samples)
|
||||
{
|
||||
if (n_src == 0) {
|
||||
memset(dst, 0, n_samples * sizeof(float));
|
||||
|
|
@ -30,50 +173,12 @@ void dsp_mix_gain_sse(void *obj,
|
|||
if (dst != src[0])
|
||||
spa_memcpy(dst, src[0], n_samples * sizeof(float));
|
||||
} else {
|
||||
uint32_t n, i, unrolled;
|
||||
__m128 in[4], g;
|
||||
const float **s = (const float **)src;
|
||||
float *d = dst;
|
||||
|
||||
if (SPA_LIKELY(SPA_IS_ALIGNED(dst, 16))) {
|
||||
unrolled = n_samples & ~15;
|
||||
for (i = 0; i < n_src; i++) {
|
||||
if (SPA_UNLIKELY(!SPA_IS_ALIGNED(src[i], 16))) {
|
||||
unrolled = 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else
|
||||
unrolled = 0;
|
||||
|
||||
for (n = 0; n < unrolled; n += 16) {
|
||||
g = _mm_set1_ps(gain[0]);
|
||||
in[0] = _mm_mul_ps(g, _mm_load_ps(&s[0][n+ 0]));
|
||||
in[1] = _mm_mul_ps(g, _mm_load_ps(&s[0][n+ 4]));
|
||||
in[2] = _mm_mul_ps(g, _mm_load_ps(&s[0][n+ 8]));
|
||||
in[3] = _mm_mul_ps(g, _mm_load_ps(&s[0][n+12]));
|
||||
|
||||
for (i = 1; i < n_src; i++) {
|
||||
g = _mm_set1_ps(gain[i]);
|
||||
in[0] = _mm_add_ps(in[0], _mm_mul_ps(g, _mm_load_ps(&s[i][n+ 0])));
|
||||
in[1] = _mm_add_ps(in[1], _mm_mul_ps(g, _mm_load_ps(&s[i][n+ 4])));
|
||||
in[2] = _mm_add_ps(in[2], _mm_mul_ps(g, _mm_load_ps(&s[i][n+ 8])));
|
||||
in[3] = _mm_add_ps(in[3], _mm_mul_ps(g, _mm_load_ps(&s[i][n+12])));
|
||||
}
|
||||
_mm_store_ps(&d[n+ 0], in[0]);
|
||||
_mm_store_ps(&d[n+ 4], in[1]);
|
||||
_mm_store_ps(&d[n+ 8], in[2]);
|
||||
_mm_store_ps(&d[n+12], in[3]);
|
||||
}
|
||||
for (; n < n_samples; n++) {
|
||||
g = _mm_set_ss(gain[0]);
|
||||
in[0] = _mm_mul_ss(g, _mm_load_ss(&s[0][n]));
|
||||
for (i = 1; i < n_src; i++) {
|
||||
g = _mm_set_ss(gain[i]);
|
||||
in[0] = _mm_add_ss(in[0], _mm_mul_ss(g, _mm_load_ss(&s[i][n])));
|
||||
}
|
||||
_mm_store_ss(&d[n], in[0]);
|
||||
}
|
||||
if (n_gain == 0)
|
||||
dsp_add_sse(obj, dst, src, n_src, n_samples);
|
||||
else if (n_gain < n_src)
|
||||
dsp_add_1_gain_sse(obj, dst, src, n_src, gain[0], n_samples);
|
||||
else
|
||||
dsp_add_n_gain_sse(obj, dst, src, n_src, gain, n_gain, n_samples);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -22,14 +22,14 @@ struct spa_fga_dsp_methods {
|
|||
#define SPA_VERSION_FGA_DSP_METHODS 0
|
||||
uint32_t version;
|
||||
|
||||
void (*clear) (void *obj, void * SPA_RESTRICT dst, uint32_t n_samples);
|
||||
void (*clear) (void *obj, float * SPA_RESTRICT dst, uint32_t n_samples);
|
||||
void (*copy) (void *obj,
|
||||
void * SPA_RESTRICT dst,
|
||||
const void * SPA_RESTRICT src, uint32_t n_samples);
|
||||
float * SPA_RESTRICT dst,
|
||||
const float * SPA_RESTRICT src, uint32_t n_samples);
|
||||
void (*mix_gain) (void *obj,
|
||||
void * SPA_RESTRICT dst,
|
||||
const void * SPA_RESTRICT src[],
|
||||
float gain[], uint32_t n_src, uint32_t n_samples);
|
||||
float * SPA_RESTRICT dst,
|
||||
const float * SPA_RESTRICT src[], uint32_t n_src,
|
||||
float gain[], uint32_t n_gain, uint32_t n_samples);
|
||||
void (*sum) (void *obj,
|
||||
float * dst, const float * SPA_RESTRICT a,
|
||||
const float * SPA_RESTRICT b, uint32_t n_samples);
|
||||
|
|
@ -52,8 +52,8 @@ struct spa_fga_dsp_methods {
|
|||
float * dst, const float * SPA_RESTRICT src,
|
||||
const float mult, const float add, uint32_t n_samples);
|
||||
void (*mult) (void *obj,
|
||||
void * SPA_RESTRICT dst,
|
||||
const void * SPA_RESTRICT src[], uint32_t n_src, uint32_t n_samples);
|
||||
float * SPA_RESTRICT dst,
|
||||
const float * SPA_RESTRICT src[], uint32_t n_src, uint32_t n_samples);
|
||||
void (*biquad_run) (void *obj, struct biquad *bq, uint32_t n_bq, uint32_t bq_stride,
|
||||
float * SPA_RESTRICT out[], const float * SPA_RESTRICT in[],
|
||||
uint32_t n_src, uint32_t n_samples);
|
||||
|
|
@ -61,25 +61,25 @@ struct spa_fga_dsp_methods {
|
|||
float *dst, const float *src, uint32_t n_samples);
|
||||
};
|
||||
|
||||
/* Zero n_samples floats in dst via the active dsp implementation's
 * clear method (dispatched through the spa interface vtable). */
static inline void spa_fga_dsp_clear(struct spa_fga_dsp *obj, float * SPA_RESTRICT dst, uint32_t n_samples)
{
	spa_api_method_v(spa_fga_dsp, &obj->iface, clear, 0,
			dst, n_samples);
}
|
||||
/* Copy n_samples floats from src to dst via the active dsp
 * implementation's copy method. */
static inline void spa_fga_dsp_copy(struct spa_fga_dsp *obj,
		float * SPA_RESTRICT dst,
		const float * SPA_RESTRICT src, uint32_t n_samples)
{
	spa_api_method_v(spa_fga_dsp, &obj->iface, copy, 0,
			dst, src, n_samples);
}
|
||||
/*
 * Mix n_src input streams into dst via the active dsp implementation.
 *
 * gain[] holds n_gain gain factors. NOTE(review): from the C reference
 * implementation the semantics appear to be: n_gain == 0 -> no gain,
 * n_gain < n_src -> one shared gain (gain[0]) applied to the mixed
 * result, n_gain >= n_src -> one gain per source — confirm against the
 * spa_fga_dsp_methods contract.
 */
static inline void spa_fga_dsp_mix_gain(struct spa_fga_dsp *obj,
		float * SPA_RESTRICT dst,
		const float * SPA_RESTRICT src[], uint32_t n_src,
		float gain[], uint32_t n_gain, uint32_t n_samples)
{
	spa_api_method_v(spa_fga_dsp, &obj->iface, mix_gain, 0,
			dst, src, n_src, gain, n_gain, n_samples);
}
|
||||
static inline void spa_fga_dsp_sum(struct spa_fga_dsp *obj,
|
||||
float * dst, const float * SPA_RESTRICT a,
|
||||
|
|
@ -143,8 +143,8 @@ static inline void spa_fga_dsp_linear(struct spa_fga_dsp *obj,
|
|||
dst, src, mult, add, n_samples);
|
||||
}
|
||||
static inline void spa_fga_dsp_mult(struct spa_fga_dsp *obj,
|
||||
void * SPA_RESTRICT dst,
|
||||
const void * SPA_RESTRICT src[], uint32_t n_src, uint32_t n_samples)
|
||||
float * SPA_RESTRICT dst,
|
||||
const float * SPA_RESTRICT src[], uint32_t n_src, uint32_t n_samples)
|
||||
{
|
||||
spa_api_method_v(spa_fga_dsp, &obj->iface, mult, 0,
|
||||
dst, src, n_src, n_samples);
|
||||
|
|
|
|||
|
|
@ -121,8 +121,9 @@ static void mixer_run(void * Instance, unsigned long SampleCount)
|
|||
struct builtin *impl = Instance;
|
||||
int i, n_src = 0;
|
||||
float *out = impl->port[0];
|
||||
const void *src[8];
|
||||
const float *src[8];
|
||||
float gains[8];
|
||||
bool eq_gain = true;
|
||||
|
||||
if (out == NULL)
|
||||
return;
|
||||
|
|
@ -136,8 +137,13 @@ static void mixer_run(void * Instance, unsigned long SampleCount)
|
|||
|
||||
src[n_src] = in;
|
||||
gains[n_src++] = gain;
|
||||
if (gain != gains[0])
|
||||
eq_gain = false;
|
||||
}
|
||||
spa_fga_dsp_mix_gain(impl->dsp, out, src, gains, n_src, SampleCount);
|
||||
if (eq_gain)
|
||||
spa_fga_dsp_mix_gain(impl->dsp, out, src, n_src, gains, 1, SampleCount);
|
||||
else
|
||||
spa_fga_dsp_mix_gain(impl->dsp, out, src, n_src, gains, n_src, SampleCount);
|
||||
}
|
||||
|
||||
static struct spa_fga_port mixer_ports[] = {
|
||||
|
|
@ -1589,7 +1595,7 @@ static void mult_run(void * Instance, unsigned long SampleCount)
|
|||
struct builtin *impl = Instance;
|
||||
int i, n_src = 0;
|
||||
float *out = impl->port[0];
|
||||
const void *src[8];
|
||||
const float *src[8];
|
||||
|
||||
if (out == NULL)
|
||||
return;
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue