diff --git a/src/modules/module-filter-chain/dsp-ops-sse.c b/src/modules/module-filter-chain/dsp-ops-sse.c index a707e959a..71abc6f26 100644 --- a/src/modules/module-filter-chain/dsp-ops-sse.c +++ b/src/modules/module-filter-chain/dsp-ops-sse.c @@ -151,7 +151,7 @@ void dsp_biquad_run_sse(struct dsp_ops *ops, struct biquad *bq, #undef F } -static void dsp_biquad2_run_sse(struct dsp_ops *ops, struct biquad *bq0, struct biquad *bq1, +static void dsp_biquad2_run_sse(struct dsp_ops *ops, struct biquad *bq, float *out, const float *in, uint32_t n_samples) { __m128 x, y, z; @@ -160,13 +160,13 @@ static void dsp_biquad2_run_sse(struct dsp_ops *ops, struct biquad *bq0, struct __m128 x0, x1; uint32_t i; - b0 = _mm_setr_ps(bq0->b0, bq0->b1, bq0->b2, 0.0f); /* b0 b1 b2 0 */ - a0 = _mm_setr_ps(0.0f, bq0->a1, bq0->a2, 0.0f); /* 0 a1 a2 0 */ - x0 = _mm_setr_ps(bq0->x1, bq0->x2, 0.0f, 0.0f); /* x1 x2 0 0 */ + b0 = _mm_setr_ps(bq[0].b0, bq[0].b1, bq[0].b2, 0.0f); /* b0 b1 b2 0 */ + a0 = _mm_setr_ps(0.0f, bq[0].a1, bq[0].a2, 0.0f); /* 0 a1 a2 0 */ + x0 = _mm_setr_ps(bq[0].x1, bq[0].x2, 0.0f, 0.0f); /* x1 x2 0 0 */ - b1 = _mm_setr_ps(bq1->b0, bq1->b1, bq1->b2, 0.0f); /* b0 b1 b2 0 */ - a1 = _mm_setr_ps(0.0f, bq1->a1, bq1->a2, 0.0f); /* 0 a1 a2 0 */ - x1 = _mm_setr_ps(bq1->x1, bq1->x2, 0.0f, 0.0f); /* x1 x2 0 0 */ + b1 = _mm_setr_ps(bq[1].b0, bq[1].b1, bq[1].b2, 0.0f); /* b0 b1 b2 0 */ + a1 = _mm_setr_ps(0.0f, bq[1].a1, bq[1].a2, 0.0f); /* 0 a1 a2 0 */ + x1 = _mm_setr_ps(bq[1].x1, bq[1].x2, 0.0f, 0.0f); /* x1 x2 0 0 */ for (i = 0; i < n_samples; i++) { x = _mm_load1_ps(&in[i]); /* x x x x */ @@ -188,15 +188,15 @@ static void dsp_biquad2_run_sse(struct dsp_ops *ops, struct biquad *bq0, struct _mm_store_ss(&out[i], x); /* out[i] = b0*x+x1 */ } #define F(x) (-FLT_MIN < (x) && (x) < FLT_MIN ? 0.0f : (x)) - bq0->x1 = F(x0[0]); - bq0->x2 = F(x0[1]); - bq1->x1 = F(x1[0]); - bq1->x2 = F(x1[1]); + bq[0].x1 = F(x0[0]); + bq[0].x2 = F(x0[1]); + bq[1].x1 = F(x1[0]); + bq[1].x2 = F(x1[1]); #undef F } -static void dsp_biquad_run2_sse(struct dsp_ops *ops, struct biquad *bqL, struct biquad *bqR, - float *outL, float *outR, const float *inL, const float *inR, uint32_t n_samples) +static void dsp_biquad_run2_sse(struct dsp_ops *ops, struct biquad *bq, uint32_t bq_stride, + float **out, const float **in, uint32_t n_samples) { __m128 x, y, z; __m128 b0, b1, b2; @@ -204,16 +204,16 @@ static void dsp_biquad_run2_sse(struct dsp_ops *ops, struct biquad *bqL, struct __m128 x1, x2; uint32_t i; - b0 = _mm_setr_ps(bqL->b0, bqR->b0, 0.0f, 0.0f); /* b00 b10 0 0 */ - b1 = _mm_setr_ps(bqL->b1, bqR->b1, 0.0f, 0.0f); /* b01 b11 0 0 */ - b2 = _mm_setr_ps(bqL->b2, bqR->b2, 0.0f, 0.0f); /* b02 b12 0 0 */ - a1 = _mm_setr_ps(bqL->a1, bqR->a1, 0.0f, 0.0f); /* b00 b10 0 0 */ - a2 = _mm_setr_ps(bqL->a2, bqR->a2, 0.0f, 0.0f); /* b01 b11 0 0 */ - x1 = _mm_setr_ps(bqL->x1, bqR->x1, 0.0f, 0.0f); /* b00 b10 0 0 */ - x2 = _mm_setr_ps(bqL->x2, bqR->x2, 0.0f, 0.0f); /* b01 b11 0 0 */ + b0 = _mm_setr_ps(bq[0].b0, bq[bq_stride].b0, 0.0f, 0.0f); /* b00 b10 0 0 */ + b1 = _mm_setr_ps(bq[0].b1, bq[bq_stride].b1, 0.0f, 0.0f); /* b01 b11 0 0 */ + b2 = _mm_setr_ps(bq[0].b2, bq[bq_stride].b2, 0.0f, 0.0f); /* b02 b12 0 0 */ + a1 = _mm_setr_ps(bq[0].a1, bq[bq_stride].a1, 0.0f, 0.0f); /* b00 b10 0 0 */ + a2 = _mm_setr_ps(bq[0].a2, bq[bq_stride].a2, 0.0f, 0.0f); /* b01 b11 0 0 */ + x1 = _mm_setr_ps(bq[0].x1, bq[bq_stride].x1, 0.0f, 0.0f); /* b00 b10 0 0 */ + x2 = _mm_setr_ps(bq[0].x2, bq[bq_stride].x2, 0.0f, 0.0f); /* b01 b11 0 0 */ for (i = 0; i < n_samples; i++) { - x = _mm_setr_ps(inL[i], inR[i], 0.0f, 0.0f); + x = _mm_setr_ps(in[0][i], in[1][i], 0.0f, 0.0f); y = _mm_mul_ps(x, b0); /* y = x * b0 */ y = _mm_add_ps(y, x1); /* y = x * b0 + x1*/ @@ -225,21 +225,20 @@ static void dsp_biquad_run2_sse(struct dsp_ops *ops, struct biquad *bqL, struct x2 = _mm_mul_ps(x, b2); /* x2 = x * b2 */ x2 = _mm_sub_ps(x2, z); /* x2 = x * b2 - a2 * y*/ - outL[i] = y[0]; - outR[i] = y[1]; + out[0][i] = y[0]; + out[1][i] = y[1]; } #define F(x) (-FLT_MIN < (x) && (x) < FLT_MIN ? 0.0f : (x)) - bqL->x1 = F(x1[0]); - bqL->x2 = F(x2[0]); - bqR->x1 = F(x1[1]); - bqR->x2 = F(x2[1]); + bq[0*bq_stride].x1 = F(x1[0]); + bq[0*bq_stride].x2 = F(x2[0]); + bq[1*bq_stride].x1 = F(x1[1]); + bq[1*bq_stride].x2 = F(x2[1]); #undef F } -static void dsp_biquad2_run2_sse(struct dsp_ops *ops, struct biquad *bqL0, struct biquad *bqL1, - struct biquad *bqR0, struct biquad *bqR1, - float *outL, float *outR, const float *inL, const float *inR, uint32_t n_samples) +static void dsp_biquad2_run2_sse(struct dsp_ops *ops, struct biquad *bq, uint32_t bq_stride, + float **out, const float **in, uint32_t n_samples) { __m128 x, y, z; __m128 b00, b01, b02, b10, b11, b12; @@ -247,24 +246,24 @@ static void dsp_biquad2_run2_sse(struct dsp_ops *ops, struct biquad *bqL0, struc __m128 x01, x02, x11, x12; uint32_t i; - b00 = _mm_setr_ps(bqL0->b0, bqR0->b0, 0.0f, 0.0f); /* b00 b10 0 0 */ - b01 = _mm_setr_ps(bqL0->b1, bqR0->b1, 0.0f, 0.0f); /* b01 b11 0 0 */ - b02 = _mm_setr_ps(bqL0->b2, bqR0->b2, 0.0f, 0.0f); /* b02 b12 0 0 */ - a01 = _mm_setr_ps(bqL0->a1, bqR0->a1, 0.0f, 0.0f); /* b00 b10 0 0 */ - a02 = _mm_setr_ps(bqL0->a2, bqR0->a2, 0.0f, 0.0f); /* b01 b11 0 0 */ - x01 = _mm_setr_ps(bqL0->x1, bqR0->x1, 0.0f, 0.0f); /* b00 b10 0 0 */ - x02 = _mm_setr_ps(bqL0->x2, bqR0->x2, 0.0f, 0.0f); /* b01 b11 0 0 */ + b00 = _mm_setr_ps(bq[0].b0, bq[bq_stride].b0, 0.0f, 0.0f); /* b00 b10 0 0 */ + b01 = _mm_setr_ps(bq[0].b1, bq[bq_stride].b1, 0.0f, 0.0f); /* b01 b11 0 0 */ + b02 = _mm_setr_ps(bq[0].b2, bq[bq_stride].b2, 0.0f, 0.0f); /* b02 b12 0 0 */ + a01 = _mm_setr_ps(bq[0].a1, bq[bq_stride].a1, 0.0f, 0.0f); /* b00 b10 0 0 */ + a02 = _mm_setr_ps(bq[0].a2, bq[bq_stride].a2, 0.0f, 0.0f); /* b01 b11 0 0 */ + x01 = _mm_setr_ps(bq[0].x1, bq[bq_stride].x1, 0.0f, 0.0f); /* b00 b10 0 0 */ + x02 = _mm_setr_ps(bq[0].x2, bq[bq_stride].x2, 0.0f, 0.0f); /* b01 b11 0 0 */ - b10 = _mm_setr_ps(bqL1->b0, bqR1->b0, 0.0f, 0.0f); /* b00 b10 0 0 */ - b11 = _mm_setr_ps(bqL1->b1, bqR1->b1, 0.0f, 0.0f); /* b01 b11 0 0 */ - b12 = _mm_setr_ps(bqL1->b2, bqR1->b2, 0.0f, 0.0f); /* b02 b12 0 0 */ - a11 = _mm_setr_ps(bqL1->a1, bqR1->a1, 0.0f, 0.0f); /* b00 b10 0 0 */ - a12 = _mm_setr_ps(bqL1->a2, bqR1->a2, 0.0f, 0.0f); /* b01 b11 0 0 */ - x11 = _mm_setr_ps(bqL1->x1, bqR1->x1, 0.0f, 0.0f); /* b00 b10 0 0 */ - x12 = _mm_setr_ps(bqL1->x2, bqR1->x2, 0.0f, 0.0f); /* b01 b11 0 0 */ + b10 = _mm_setr_ps(bq[1].b0, bq[bq_stride+1].b0, 0.0f, 0.0f); /* b00 b10 0 0 */ + b11 = _mm_setr_ps(bq[1].b1, bq[bq_stride+1].b1, 0.0f, 0.0f); /* b01 b11 0 0 */ + b12 = _mm_setr_ps(bq[1].b2, bq[bq_stride+1].b2, 0.0f, 0.0f); /* b02 b12 0 0 */ + a11 = _mm_setr_ps(bq[1].a1, bq[bq_stride+1].a1, 0.0f, 0.0f); /* b00 b10 0 0 */ + a12 = _mm_setr_ps(bq[1].a2, bq[bq_stride+1].a2, 0.0f, 0.0f); /* b01 b11 0 0 */ + x11 = _mm_setr_ps(bq[1].x1, bq[bq_stride+1].x1, 0.0f, 0.0f); /* b00 b10 0 0 */ + x12 = _mm_setr_ps(bq[1].x2, bq[bq_stride+1].x2, 0.0f, 0.0f); /* b01 b11 0 0 */ for (i = 0; i < n_samples; i++) { - x = _mm_setr_ps(inL[i], inR[i], 0.0f, 0.0f); + x = _mm_setr_ps(in[0][i], in[1][i], 0.0f, 0.0f); y = _mm_mul_ps(x, b00); /* y = x * b0 */ y = _mm_add_ps(y, x01); /* y = x * b0 + x1*/ @@ -288,18 +287,142 @@ static void dsp_biquad2_run2_sse(struct dsp_ops *ops, struct biquad *bqL0, struc x12 = _mm_mul_ps(x, b12); /* x2 = x * b2 */ x12 = _mm_sub_ps(x12, z); /* x2 = x * b2 - a2 * y*/ - outL[i] = y[0]; - outR[i] = y[1]; + out[0][i] = y[0]; + out[1][i] = y[1]; } #define F(x) (-FLT_MIN < (x) && (x) < FLT_MIN ? 0.0f : (x)) - bqL0->x1 = F(x01[0]); - bqL0->x2 = F(x02[0]); - bqR0->x1 = F(x01[1]); - bqR0->x2 = F(x02[1]); - bqL1->x1 = F(x11[0]); - bqL1->x2 = F(x12[0]); - bqR1->x1 = F(x11[1]); - bqR1->x2 = F(x12[1]); + bq[0*bq_stride+0].x1 = F(x01[0]); + bq[0*bq_stride+0].x2 = F(x02[0]); + bq[1*bq_stride+0].x1 = F(x01[1]); + bq[1*bq_stride+0].x2 = F(x02[1]); + + bq[0*bq_stride+1].x1 = F(x11[0]); + bq[0*bq_stride+1].x2 = F(x12[0]); + bq[1*bq_stride+1].x1 = F(x11[1]); + bq[1*bq_stride+1].x2 = F(x12[1]); +#undef F +} + +static void dsp_biquad_run4_sse(struct dsp_ops *ops, struct biquad *bq, uint32_t bq_stride, + float **out, const float **in, uint32_t n_samples) +{ + __m128 x, y, z; + __m128 b0, b1, b2; + __m128 a1, a2; + __m128 x1, x2; + uint32_t i; + + b0 = _mm_setr_ps(bq[0].b0, bq[bq_stride].b0, bq[2*bq_stride].b0, bq[3*bq_stride].b0); + b1 = _mm_setr_ps(bq[0].b1, bq[bq_stride].b1, bq[2*bq_stride].b1, bq[3*bq_stride].b1); + b2 = _mm_setr_ps(bq[0].b2, bq[bq_stride].b2, bq[2*bq_stride].b2, bq[3*bq_stride].b2); + a1 = _mm_setr_ps(bq[0].a1, bq[bq_stride].a1, bq[2*bq_stride].a1, bq[3*bq_stride].a1); + a2 = _mm_setr_ps(bq[0].a2, bq[bq_stride].a2, bq[2*bq_stride].a2, bq[3*bq_stride].a2); + x1 = _mm_setr_ps(bq[0].x1, bq[bq_stride].x1, bq[2*bq_stride].x1, bq[3*bq_stride].x1); + x2 = _mm_setr_ps(bq[0].x2, bq[bq_stride].x2, bq[2*bq_stride].x2, bq[3*bq_stride].x2); + + for (i = 0; i < n_samples; i++) { + x = _mm_setr_ps(in[0][i], in[1][i], in[2][i], in[3][i]); + + y = _mm_mul_ps(x, b0); /* y = x * b0 */ + y = _mm_add_ps(y, x1); /* y = x * b0 + x1*/ + z = _mm_mul_ps(y, a1); /* z = a1 * y */ + x1 = _mm_mul_ps(x, b1); /* x1 = x * b1 */ + x1 = _mm_add_ps(x1, x2); /* x1 = x * b1 + x2*/ + x1 = _mm_sub_ps(x1, z); /* x1 = x * b1 + x2 - a1 * y*/ + z = _mm_mul_ps(y, a2); /* z = a2 * y */ + x2 = _mm_mul_ps(x, b2); /* x2 = x * b2 */ + x2 = _mm_sub_ps(x2, z); /* x2 = x * b2 - a2 * y*/ + + out[0][i] = y[0]; + out[1][i] = y[1]; + out[2][i] = y[2]; + out[3][i] = y[3]; + } +#define F(x) (-FLT_MIN < (x) && (x) < FLT_MIN ? 0.0f : (x)) + bq[0*bq_stride].x1 = F(x1[0]); + bq[0*bq_stride].x2 = F(x2[0]); + bq[1*bq_stride].x1 = F(x1[1]); + bq[1*bq_stride].x2 = F(x2[1]); + bq[2*bq_stride].x1 = F(x1[2]); + bq[2*bq_stride].x2 = F(x2[2]); + bq[3*bq_stride].x1 = F(x1[3]); + bq[3*bq_stride].x2 = F(x2[3]); +#undef F +} + +static void dsp_biquad2_run4_sse(struct dsp_ops *ops, struct biquad *bq, uint32_t bq_stride, + float **out, const float **in, uint32_t n_samples) +{ + __m128 x, y, z; + __m128 b00, b01, b02, b10, b11, b12; + __m128 a01, a02, a11, a12; + __m128 x01, x02, x11, x12; + uint32_t i; + + b00 = _mm_setr_ps(bq[0].b0, bq[bq_stride].b0, bq[2*bq_stride].b0, bq[3*bq_stride].b0); + b01 = _mm_setr_ps(bq[0].b1, bq[bq_stride].b1, bq[2*bq_stride].b1, bq[3*bq_stride].b1); + b02 = _mm_setr_ps(bq[0].b2, bq[bq_stride].b2, bq[2*bq_stride].b2, bq[3*bq_stride].b2); + a01 = _mm_setr_ps(bq[0].a1, bq[bq_stride].a1, bq[2*bq_stride].a1, bq[3*bq_stride].a1); + a02 = _mm_setr_ps(bq[0].a2, bq[bq_stride].a2, bq[2*bq_stride].a2, bq[3*bq_stride].a2); + x01 = _mm_setr_ps(bq[0].x1, bq[bq_stride].x1, bq[2*bq_stride].x1, bq[3*bq_stride].x1); + x02 = _mm_setr_ps(bq[0].x2, bq[bq_stride].x2, bq[2*bq_stride].x2, bq[3*bq_stride].x2); + + b10 = _mm_setr_ps(bq[1].b0, bq[bq_stride+1].b0, bq[2*bq_stride+1].b0, bq[3*bq_stride+1].b0); + b11 = _mm_setr_ps(bq[1].b1, bq[bq_stride+1].b1, bq[2*bq_stride+1].b1, bq[3*bq_stride+1].b1); + b12 = _mm_setr_ps(bq[1].b2, bq[bq_stride+1].b2, bq[2*bq_stride+1].b2, bq[3*bq_stride+1].b2); + a11 = _mm_setr_ps(bq[1].a1, bq[bq_stride+1].a1, bq[2*bq_stride+1].a1, bq[3*bq_stride+1].a1); + a12 = _mm_setr_ps(bq[1].a2, bq[bq_stride+1].a2, bq[2*bq_stride+1].a2, bq[3*bq_stride+1].a2); + x11 = _mm_setr_ps(bq[1].x1, bq[bq_stride+1].x1, bq[2*bq_stride+1].x1, bq[3*bq_stride+1].x1); + x12 = _mm_setr_ps(bq[1].x2, bq[bq_stride+1].x2, bq[2*bq_stride+1].x2, bq[3*bq_stride+1].x2); + + for (i = 0; i < n_samples; i++) { + x = _mm_setr_ps(in[0][i], in[1][i], in[2][i], in[3][i]); + + y = _mm_mul_ps(x, b00); /* y = x * b0 */ + y = _mm_add_ps(y, x01); /* y = x * b0 + x1*/ + z = _mm_mul_ps(y, a01); /* z = a1 * y */ + x01 = _mm_mul_ps(x, b01); /* x1 = x * b1 */ + x01 = _mm_add_ps(x01, x02); /* x1 = x * b1 + x2*/ + x01 = _mm_sub_ps(x01, z); /* x1 = x * b1 + x2 - a1 * y*/ + z = _mm_mul_ps(y, a02); /* z = a2 * y */ + x02 = _mm_mul_ps(x, b02); /* x2 = x * b2 */ + x02 = _mm_sub_ps(x02, z); /* x2 = x * b2 - a2 * y*/ + + x = y; + + y = _mm_mul_ps(x, b10); /* y = x * b0 */ + y = _mm_add_ps(y, x11); /* y = x * b0 + x1*/ + z = _mm_mul_ps(y, a11); /* z = a1 * y */ + x11 = _mm_mul_ps(x, b11); /* x1 = x * b1 */ + x11 = _mm_add_ps(x11, x12); /* x1 = x * b1 + x2*/ + x11 = _mm_sub_ps(x11, z); /* x1 = x * b1 + x2 - a1 * y*/ + z = _mm_mul_ps(y, a12); /* z = a2 * y*/ + x12 = _mm_mul_ps(x, b12); /* x2 = x * b2 */ + x12 = _mm_sub_ps(x12, z); /* x2 = x * b2 - a2 * y*/ + + out[0][i] = y[0]; + out[1][i] = y[1]; + out[2][i] = y[2]; + out[3][i] = y[3]; + } +#define F(x) (-FLT_MIN < (x) && (x) < FLT_MIN ? 0.0f : (x)) + bq[0*bq_stride+0].x1 = F(x01[0]); + bq[0*bq_stride+0].x2 = F(x02[0]); + bq[1*bq_stride+0].x1 = F(x01[1]); + bq[1*bq_stride+0].x2 = F(x02[1]); + bq[2*bq_stride+0].x1 = F(x01[2]); + bq[2*bq_stride+0].x2 = F(x02[2]); + bq[3*bq_stride+0].x1 = F(x01[3]); + bq[3*bq_stride+0].x2 = F(x02[3]); + + bq[0*bq_stride+1].x1 = F(x11[0]); + bq[0*bq_stride+1].x2 = F(x12[0]); + bq[1*bq_stride+1].x1 = F(x11[1]); + bq[1*bq_stride+1].x2 = F(x12[1]); + bq[2*bq_stride+1].x1 = F(x11[2]); + bq[2*bq_stride+1].x2 = F(x12[2]); + bq[3*bq_stride+1].x1 = F(x11[3]); + bq[3*bq_stride+1].x2 = F(x12[3]); #undef F } @@ -307,26 +430,54 @@ void dsp_biquadn_run_sse(struct dsp_ops *ops, struct biquad *bq, uint32_t n_bq, float * SPA_RESTRICT out[], const float * SPA_RESTRICT in[], uint32_t n_src, uint32_t n_samples) { - uint32_t i, j; - uint32_t iunrolled = n_src & ~1; - uint32_t junrolled = n_bq & ~1; + uint32_t i, j, bqs2 = bq_stride*2, bqs4 = bqs2*2; + uint32_t iunrolled4 = n_src & ~3; + uint32_t iunrolled2 = n_src & ~1; + uint32_t junrolled2 = n_bq & ~1; - for (i = 0; i < iunrolled; i+=2, bq+=bq_stride) { - const float *s0 = in[i], *s1 = in[i+1]; - float *d0 = out[i], *d1 = out[i+1]; - if (s0 == NULL || s1 == NULL || d0 == NULL || d1 == NULL) + for (i = 0; i < iunrolled4; i+=4, bq+=bqs4) { + const float *s[4] = { in[i], in[i+1], in[i+2], in[i+3] }; + float *d[4] = { out[i], out[i+1], out[i+2], out[i+3] }; + + if (s[0] == NULL || s[1] == NULL || s[2] == NULL || s[3] == NULL || + d[0] == NULL || d[1] == NULL || d[2] == NULL || d[3] == NULL) break; - for (j = 0; j < junrolled; j+=2) { - dsp_biquad2_run2_sse(ops, &bq[j], &bq[j+1], &bq[j+bq_stride], &bq[j+bq_stride+1], - d0, d1, s0, s1, n_samples); - s0 = d0; - s1 = d1; + j = 0; + if (j < junrolled2) { + dsp_biquad2_run4_sse(ops, &bq[j], bq_stride, d, s, n_samples); + s[0] = d[0]; + s[1] = d[1]; + s[2] = d[2]; + s[3] = d[3]; + j+=2; } - for (; j < n_bq; j++) { - dsp_biquad_run2_sse(ops, &bq[j], &bq[j+bq_stride], d0, d1, s0, s1, n_samples); - s0 = d0; - s1 = d1; + for (; j < junrolled2; j+=2) { + dsp_biquad2_run4_sse(ops, &bq[j], bq_stride, d, s, n_samples); + } + if (j < n_bq) { + dsp_biquad_run4_sse(ops, &bq[j], bq_stride, d, s, n_samples); + } + } + for (; i < iunrolled2; i+=2, bq+=bqs2) { + const float *s[2] = { in[i], in[i+1] }; + float *d[2] = { out[i], out[i+1] }; + + if (s[0] == NULL || s[1] == NULL || d[0] == NULL || d[1] == NULL) + break; + + j = 0; + if (j < junrolled2) { + dsp_biquad2_run2_sse(ops, &bq[j], bq_stride, d, s, n_samples); + s[0] = d[0]; + s[1] = d[1]; + j+=2; + } + for (; j < junrolled2; j+=2) { + dsp_biquad2_run2_sse(ops, &bq[j], bq_stride, d, s, n_samples); + } + if (j < n_bq) { + dsp_biquad_run2_sse(ops, &bq[j], bq_stride, d, s, n_samples); } } for (; i < n_src; i++, bq+=bq_stride) { @@ -335,13 +486,17 @@ void dsp_biquadn_run_sse(struct dsp_ops *ops, struct biquad *bq, uint32_t n_bq, if (s == NULL || d == NULL) continue; - for (j = 0; j < junrolled; j+=2) { - dsp_biquad2_run_sse(ops, &bq[j], &bq[j+1], d, s, n_samples); + j = 0; + if (j < junrolled2) { + dsp_biquad2_run_sse(ops, &bq[j], d, s, n_samples); s = d; + j+=2; } - for (; j < n_bq; j++) { + for (; j < junrolled2; j+=2) { + dsp_biquad2_run_sse(ops, &bq[j], d, s, n_samples); + } + if (j < n_bq) { dsp_biquad_run_sse(ops, &bq[j], d, s, n_samples); - s = d; } } }