mirror of
https://gitlab.freedesktop.org/pipewire/pipewire.git
synced 2025-11-03 09:01:54 -05:00
audioconvert: optimize lr4 handling
Make an SSE optimized LR4 function and one that handles 2 LR4s in parallel.
This commit is contained in:
parent
fec3730489
commit
ad84c45c0c
7 changed files with 160 additions and 59 deletions
|
|
@ -5,6 +5,7 @@
|
|||
#include "channelmix-ops.h"
|
||||
|
||||
#include <xmmintrin.h>
|
||||
#include <float.h>
|
||||
|
||||
static inline void clear_sse(float *d, uint32_t n_samples)
|
||||
{
|
||||
|
|
@ -149,6 +150,113 @@ void channelmix_copy_sse(struct channelmix *mix, void * SPA_RESTRICT dst[],
|
|||
vol_sse(d[i], s[i], mix->matrix[i][i], n_samples);
|
||||
}
|
||||
|
||||
static void lr4_process_sse(struct lr4 *lr4, float *dst, const float *src, const float vol, int samples)
|
||||
{
|
||||
__m128 x, y, z;
|
||||
__m128 b012;
|
||||
__m128 a12;
|
||||
__m128 x12, y12, v;
|
||||
int i;
|
||||
|
||||
if (vol == 0.0f || !lr4->active) {
|
||||
vol_sse(dst, src, vol, samples);
|
||||
return;
|
||||
}
|
||||
|
||||
b012 = _mm_setr_ps(lr4->bq.b0, lr4->bq.b1, lr4->bq.b2, 0.0f); /* b0 b1 b2 0 */
|
||||
a12 = _mm_setr_ps(0.0f, lr4->bq.a1, lr4->bq.a2, 0.0f); /* 0 a1 a2 0 */
|
||||
x12 = _mm_setr_ps(lr4->x1, lr4->x2, 0.0f, 0.0f); /* x1 x2 0 0 */
|
||||
y12 = _mm_setr_ps(lr4->y1, lr4->y2, 0.0f, 0.0f); /* y1 y2 0 0 */
|
||||
v = _mm_setr_ps(vol, vol, 0.0f, 0.0f);
|
||||
|
||||
for (i = 0; i < samples; i++) {
|
||||
x = _mm_load1_ps(&src[i]); /* x x x x */
|
||||
|
||||
z = _mm_mul_ps(x, b012); /* b0*x b1*x b2*x 0 */
|
||||
z = _mm_add_ps(z, x12); /* b0*x+x1 b1*x+x2 b2*x 0 */
|
||||
y = _mm_shuffle_ps(z, z, _MM_SHUFFLE(0,0,0,0)); /* b0*x+x1 b0*x+x1 b0*x+x1 b0*x+x1 = y*/
|
||||
x = _mm_mul_ps(y, a12); /* 0 a1*y a2*y 0 */
|
||||
x = _mm_sub_ps(z, x); /* y x1 x2 0 */
|
||||
x12 = _mm_shuffle_ps(x, x, _MM_SHUFFLE(3,3,2,1)); /* x1 x2 0 0*/
|
||||
|
||||
z = _mm_mul_ps(y, b012);
|
||||
z = _mm_add_ps(z, y12);
|
||||
x = _mm_shuffle_ps(z, z, _MM_SHUFFLE(0,0,0,0));
|
||||
y = _mm_mul_ps(x, a12);
|
||||
y = _mm_sub_ps(z, y);
|
||||
y12 = _mm_shuffle_ps(y, y, _MM_SHUFFLE(3,3,2,1));
|
||||
|
||||
x = _mm_mul_ps(x, v);
|
||||
_mm_store_ss(&dst[i], x);
|
||||
}
|
||||
#define F(x) (-FLT_MIN < (x) && (x) < FLT_MIN ? 0.0f : (x))
|
||||
lr4->x1 = F(x12[0]);
|
||||
lr4->x2 = F(x12[1]);
|
||||
lr4->y1 = F(y12[0]);
|
||||
lr4->y2 = F(y12[1]);
|
||||
#undef F
|
||||
}
|
||||
|
||||
static void lr4_process_2_sse(struct lr4 *lr40, struct lr4 *lr41, float *dst0, float *dst1,
|
||||
const float *src0, const float *src1, const float vol0, const float vol1, uint32_t samples)
|
||||
{
|
||||
__m128 x, y, z;
|
||||
__m128 b0, b1, b2;
|
||||
__m128 a1, a2;
|
||||
__m128 x1, x2;
|
||||
__m128 y1, y2, v;
|
||||
uint32_t i;
|
||||
|
||||
b0 = _mm_setr_ps(lr40->bq.b0, lr41->bq.b0, 0.0f, 0.0f);
|
||||
b1 = _mm_setr_ps(lr40->bq.b1, lr41->bq.b1, 0.0f, 0.0f);
|
||||
b2 = _mm_setr_ps(lr40->bq.b2, lr41->bq.b2, 0.0f, 0.0f);
|
||||
a1 = _mm_setr_ps(lr40->bq.a1, lr41->bq.a1, 0.0f, 0.0f);
|
||||
a2 = _mm_setr_ps(lr40->bq.a2, lr41->bq.a2, 0.0f, 0.0f);
|
||||
x1 = _mm_setr_ps(lr40->x1, lr41->x1, 0.0f, 0.0f);
|
||||
x2 = _mm_setr_ps(lr40->x2, lr41->x2, 0.0f, 0.0f);
|
||||
y1 = _mm_setr_ps(lr40->y1, lr41->y1, 0.0f, 0.0f);
|
||||
y2 = _mm_setr_ps(lr40->y2, lr41->y2, 0.0f, 0.0f);
|
||||
v = _mm_setr_ps(vol0, vol1, 0.0f, 0.0f);
|
||||
|
||||
for (i = 0; i < samples; i++) {
|
||||
x = _mm_setr_ps(src0[i], src1[i], 0.0f, 0.0f);
|
||||
|
||||
y = _mm_mul_ps(x, b0); /* y = x * b0 */
|
||||
y = _mm_add_ps(y, x1); /* y = x * b0 + x1*/
|
||||
z = _mm_mul_ps(y, a1); /* z = a1 * y */
|
||||
x1 = _mm_mul_ps(x, b1); /* x1 = x * b1 */
|
||||
x1 = _mm_add_ps(x1, x2); /* x1 = x * b1 + x2*/
|
||||
x1 = _mm_sub_ps(x1, z); /* x1 = x * b1 + x2 - a1 * y*/
|
||||
z = _mm_mul_ps(y, a2); /* z = a2 * y */
|
||||
x2 = _mm_mul_ps(x, b2); /* x2 = x * b2 */
|
||||
x2 = _mm_sub_ps(x2, z); /* x2 = x * b2 - a2 * y*/
|
||||
|
||||
x = _mm_mul_ps(y, b0); /* y = x * b0 */
|
||||
x = _mm_add_ps(x, y1); /* y = x * b0 + x1*/
|
||||
z = _mm_mul_ps(x, a1); /* z = a1 * y */
|
||||
y1 = _mm_mul_ps(y, b1); /* x1 = x * b1 */
|
||||
y1 = _mm_add_ps(y1, y2); /* x1 = x * b1 + x2*/
|
||||
y1 = _mm_sub_ps(y1, z); /* x1 = x * b1 + x2 - a1 * y*/
|
||||
z = _mm_mul_ps(x, a2); /* z = a2 * y */
|
||||
y2 = _mm_mul_ps(y, b2); /* x2 = x * b2 */
|
||||
y2 = _mm_sub_ps(y2, z); /* x2 = x * b2 - a2 * y*/
|
||||
|
||||
x = _mm_mul_ps(x, v);
|
||||
dst0[i] = x[0];
|
||||
dst1[i] = x[1];
|
||||
}
|
||||
#define F(x) (-FLT_MIN < (x) && (x) < FLT_MIN ? 0.0f : (x))
|
||||
lr40->x1 = F(x1[0]);
|
||||
lr40->x2 = F(x2[0]);
|
||||
lr40->y1 = F(y1[0]);
|
||||
lr40->y2 = F(y2[0]);
|
||||
lr41->x1 = F(x1[1]);
|
||||
lr41->x2 = F(x2[1]);
|
||||
lr41->y1 = F(y1[1]);
|
||||
lr41->y2 = F(y2[1]);
|
||||
#undef F
|
||||
}
|
||||
|
||||
void
|
||||
channelmix_f32_n_m_sse(struct channelmix *mix, void * SPA_RESTRICT dst[],
|
||||
const void * SPA_RESTRICT src[], uint32_t n_samples)
|
||||
|
|
@ -172,13 +280,10 @@ channelmix_f32_n_m_sse(struct channelmix *mix, void * SPA_RESTRICT dst[],
|
|||
if (n_j == 0) {
|
||||
clear_sse(di, n_samples);
|
||||
} else if (n_j == 1) {
|
||||
if (mix->lr4[i].active)
|
||||
lr4_process(&mix->lr4[i], di, sj[0], mj[0], n_samples);
|
||||
else
|
||||
vol_sse(di, sj[0], mj[0], n_samples);
|
||||
lr4_process_sse(&mix->lr4[i], di, sj[0], mj[0], n_samples);
|
||||
} else {
|
||||
conv_sse(di, sj, mj, n_j, n_samples);
|
||||
lr4_process(&mix->lr4[i], di, di, 1.0f, n_samples);
|
||||
lr4_process_sse(&mix->lr4[i], di, di, 1.0f, n_samples);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -239,8 +344,7 @@ channelmix_f32_2_3p1_sse(struct channelmix *mix, void * SPA_RESTRICT dst[],
|
|||
_mm_store_ss(&d[2][n], _mm_mul_ss(c[0], mh));
|
||||
}
|
||||
}
|
||||
lr4_process(&mix->lr4[3], d[3], d[2], v3, n_samples);
|
||||
lr4_process(&mix->lr4[2], d[2], d[2], v2, n_samples);
|
||||
lr4_process_2_sse(&mix->lr4[3], &mix->lr4[2], d[3], d[2], d[2], d[2], v3, v2, n_samples);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue