channelmix: add optimised NxM channelmix functions

2026-06-04 03:03:00 -04:00 · 2022-09-28 17:50:59 +02:00 · 2022-09-28 17:50:59 +02:00 · 637bc6f7c4
commit 637bc6f7c4
parent 10f1d545a7
4 changed files with 54 additions and 6 deletions
--- a/spa/plugins/audioconvert/channelmix-ops-c.c
+++ b/spa/plugins/audioconvert/channelmix-ops-c.c
@@ -94,16 +94,16 @@ channelmix_f32_n_m_c(struct channelmix *mix, void * SPA_RESTRICT dst[],
 			clear_c(d[i], n_samples);
 	}
 	else {
-		for (n = 0; n < n_samples; n++) {
-			for (i = 0; i < n_dst; i++) {
+		for (i = 0; i < n_dst; i++) {
+			float *mi = mix->matrix[i], *di = d[i];
+			for (n = 0; n < n_samples; n++) {
 				float sum = 0.0f;
 				for (j = 0; j < n_src; j++)
-					sum += s[j][n] * mix->matrix[i][j];
-				d[i][n] = sum;
+					sum += s[j][n] * mi[j];
+				di[n] = sum;
 			}
-		}
-		for (i = 0; i < n_dst; i++)
 			lr4_process(&mix->lr4[i], d[i], d[i], 1.0f, n_samples);
+		}
 	}
 }

--- a/spa/plugins/audioconvert/channelmix-ops-sse.c
+++ b/spa/plugins/audioconvert/channelmix-ops-sse.c
@@ -78,6 +78,50 @@ void channelmix_copy_sse(struct channelmix *mix, void * SPA_RESTRICT dst[],
 		vol_sse(d[i], s[i], mix->matrix[i][i], n_samples);
 }

+/* Generic N-source to M-destination channel mix using SSE.
+ *
+ * For every destination channel i and every sample n this computes
+ *   d[i][n] = sum over j of s[j][n] * mix->matrix[i][j]
+ * and then calls lr4_process() in place on that destination channel.
+ *
+ * mix:       supplies dst_chan, src_chan, the coefficient matrix and the
+ *            per-destination lr4 state
+ * dst:       array of mix->dst_chan float buffers, n_samples each
+ * src:       array of mix->src_chan float buffers, n_samples each
+ * n_samples: number of samples to process per channel
+ */
+void
+channelmix_f32_n_m_sse(struct channelmix *mix, void * SPA_RESTRICT dst[],
+		   const void * SPA_RESTRICT src[], uint32_t n_samples)
+{
+	float **d = (float **) dst;
+	const float **s = (const float **) src;
+	uint32_t n, unrolled;
+	uint32_t i, j, n_dst = mix->dst_chan, n_src = mix->src_chan;
+	/* mi[] holds one row of the mix matrix, each coefficient broadcast to
+	 * all four lanes; sum[] holds the two accumulators of the unrolled loop.
+	 * NOTE(review): mi is a VLA sized by n_src, so stack use grows with the
+	 * source channel count and n_src == 0 would make it zero-length;
+	 * confirm callers guarantee n_src >= 1. */
+	__m128 mi[n_src], sum[2];
+	bool aligned = true;
+
+	/* the vector path uses aligned loads, so it is only taken when every
+	 * source buffer passes the 16-byte alignment check */
+	for (j = 0; j < n_src; j++)
+		aligned &= SPA_IS_ALIGNED(s[j], 16);
+
+	for (i = 0; i < n_dst; i++) {
+		float *di = d[i];
+
+		/* load row i of the matrix: one broadcast coefficient per source */
+		for (j = 0; j < n_src; j++)
+			mi[j] = _mm_set1_ps(mix->matrix[i][j]);
+
+		/* with all sources and this destination aligned, handle the
+		 * largest multiple of 8 samples in the vector loop; otherwise
+		 * every sample falls through to the scalar loop below */
+		if (aligned && SPA_IS_ALIGNED(d[i], 16))
+			unrolled = n_samples & ~7;
+		else
+			unrolled = 0;
+
+		/* vector loop: 8 samples per iteration as two 4-float accumulators;
+		 * n is a multiple of 8, so n + 0 and n + 4 stay 16-byte aligned */
+		for (n = 0; n < unrolled; n += 8) {
+			sum[0] = sum[1] = _mm_setzero_ps();
+			for (j = 0; j < n_src; j++) {
+				sum[0] = _mm_add_ps(sum[0], _mm_mul_ps(_mm_load_ps(&s[j][n + 0]), mi[j]));
+				sum[1] = _mm_add_ps(sum[1], _mm_mul_ps(_mm_load_ps(&s[j][n + 4]), mi[j]));
+			}
+			_mm_store_ps(&di[n + 0], sum[0]);
+			_mm_store_ps(&di[n + 4], sum[1]);
+		}
+		/* scalar remainder: n continues from where the vector loop stopped;
+		 * the _ss intrinsics only use the lowest lane, so no alignment is
+		 * required here */
+		for (; n < n_samples; n++) {
+			sum[0] = _mm_setzero_ps();
+			for (j = 0; j < n_src; j++)
+				sum[0] = _mm_add_ss(sum[0], _mm_mul_ss(_mm_load_ss(&s[j][n]), mi[j]));
+			_mm_store_ss(&di[n], sum[0]);
+		}
+		/* in-place pass over the mixed channel with this destination's
+		 * lr4 state; presumably a filter applied with unity gain (the
+		 * 1.0f argument) - confirm against lr4_process() */
+		lr4_process(&mix->lr4[i], d[i], d[i], 1.0f, n_samples);
+	}
+}
+
 /* FL+FR+FC+LFE -> FL+FR */
 void
 channelmix_f32_3p1_2_sse(struct channelmix *mix, void * SPA_RESTRICT dst[],
--- a/spa/plugins/audioconvert/channelmix-ops.c
+++ b/spa/plugins/audioconvert/channelmix-ops.c
@@ -94,6 +94,9 @@ static const struct channelmix_info {
 	MAKE(8, MASK_7_1, 4, MASK_QUAD, channelmix_f32_7p1_4_c),
 	MAKE(8, MASK_7_1, 4, MASK_3_1, channelmix_f32_7p1_3p1_c),

+#if defined (HAVE_SSE)
+	MAKE(ANY, 0, ANY, 0, channelmix_f32_n_m_sse),
+#endif
 	MAKE(ANY, 0, ANY, 0, channelmix_f32_n_m_c),
 };
 #undef MAKE
--- a/spa/plugins/audioconvert/channelmix-ops.h
+++ b/spa/plugins/audioconvert/channelmix-ops.h
@@ -147,6 +147,7 @@ DEFINE_FUNCTION(f32_7p1_4, c);

 #if defined (HAVE_SSE)
 DEFINE_FUNCTION(copy, sse);
+DEFINE_FUNCTION(f32_n_m, sse);
 DEFINE_FUNCTION(f32_3p1_2, sse);
 DEFINE_FUNCTION(f32_5p1_2, sse);
 DEFINE_FUNCTION(f32_5p1_3p1, sse);