From c02cdcb5ce9811ee1ee1471db71ebbb923fa7e10 Mon Sep 17 00:00:00 2001
From: Wim Taymans <wtaymans@redhat.com>
Date: Fri, 20 Mar 2026 17:53:00 +0100
Subject: [PATCH] audioconvert: add avx2 optimized s32_to_f32d

Add an alternative avx2 s32_to_f32d implementation that doesn't use the
gather function for when gather is slow.

Don't overwrite the original cpu_flags but store the selected flags in a
new variable. Use this to debug the selected function cpu flags.

Build libraries with defines from previous libraries so that we can
reuse functions from them.

We can then remove the SSE2 | SLOW_GATHER function selection from the
list. We will now select avx2 and it will then switch implementations
based on the CPU flags.
---
 spa/plugins/audioconvert/audioconvert.c    |   8 +-
 spa/plugins/audioconvert/channelmix-ops.c  |   2 +-
 spa/plugins/audioconvert/channelmix-ops.h  |   1 +
 spa/plugins/audioconvert/fmt-ops-avx2.c    | 217 +++++++++++++++++++--
 spa/plugins/audioconvert/fmt-ops.c         |   8 +-
 spa/plugins/audioconvert/fmt-ops.h         |   1 +
 spa/plugins/audioconvert/meson.build       |  10 +-
 spa/plugins/audioconvert/peaks-ops.c       |   2 +-
 spa/plugins/audioconvert/peaks-ops.h       |   1 +
 spa/plugins/audioconvert/resample-native.c |   2 +-
 spa/plugins/audioconvert/resample.h        |   1 +
 spa/plugins/audioconvert/volume-ops.c      |   2 +-
 spa/plugins/audioconvert/volume-ops.h      |   1 +
 13 files changed, 218 insertions(+), 38 deletions(-)

diff --git a/spa/plugins/audioconvert/audioconvert.c b/spa/plugins/audioconvert/audioconvert.c
index 6b4243b2a..dce1367f1 100644
--- a/spa/plugins/audioconvert/audioconvert.c
+++ b/spa/plugins/audioconvert/audioconvert.c
@@ -2125,7 +2125,7 @@ static int setup_in_convert(struct impl *this)
 		return res;
 
 	spa_log_debug(this->log, "%p: got converter features %08x:%08x passthrough:%d remap:%d %s", this,
-			this->cpu_flags, in->conv.cpu_flags, in->conv.is_passthrough,
+			this->cpu_flags, in->conv.func_cpu_flags, in->conv.is_passthrough,
 			remap, in->conv.func_name);
 
 	return 0;
@@ -2282,7 +2282,7 @@ static int setup_channelmix(struct impl *this, uint32_t channels, uint32_t *posi
 	set_volume(this);
 
 	spa_log_debug(this->log, "%p: got channelmix features %08x:%08x flags:%08x %s",
-			this, this->cpu_flags, this->mix.cpu_flags,
+			this, this->cpu_flags, this->mix.func_cpu_flags,
 			this->mix.flags, this->mix.func_name);
 	return 0;
 }
@@ -2330,7 +2330,7 @@ static int setup_resample(struct impl *this)
 		res = resample_native_init(&this->resample);
 
 	spa_log_debug(this->log, "%p: got resample features %08x:%08x %s",
-			this, this->cpu_flags, this->resample.cpu_flags,
+			this, this->cpu_flags, this->resample.func_cpu_flags,
 			this->resample.func_name);
 	return res;
 }
@@ -2422,7 +2422,7 @@ static int setup_out_convert(struct impl *this)
 
 	spa_log_debug(this->log, "%p: got converter features %08x:%08x quant:%d:%d"
 			" passthrough:%d remap:%d %s", this,
-			this->cpu_flags, out->conv.cpu_flags, out->conv.method,
+			this->cpu_flags, out->conv.func_cpu_flags, out->conv.method,
 			out->conv.noise_bits, out->conv.is_passthrough, remap, out->conv.func_name);
 
 	return 0;
diff --git a/spa/plugins/audioconvert/channelmix-ops.c b/spa/plugins/audioconvert/channelmix-ops.c
index 7bf046cf0..c8c01eb81 100644
--- a/spa/plugins/audioconvert/channelmix-ops.c
+++ b/spa/plugins/audioconvert/channelmix-ops.c
@@ -885,8 +885,8 @@ int channelmix_init(struct channelmix *mix)
 	mix->free = impl_channelmix_free;
 	mix->process = info->process;
 	mix->set_volume = impl_channelmix_set_volume;
-	mix->cpu_flags = info->cpu_flags;
 	mix->delay = (uint32_t)(mix->rear_delay * mix->freq / 1000.0f);
+	mix->func_cpu_flags = info->cpu_flags;
 	mix->func_name = info->name;
 
 	spa_zero(mix->taps_mem);
diff --git a/spa/plugins/audioconvert/channelmix-ops.h b/spa/plugins/audioconvert/channelmix-ops.h
index 6ea2b9451..155079cd2 100644
--- a/spa/plugins/audioconvert/channelmix-ops.h
+++ b/spa/plugins/audioconvert/channelmix-ops.h
@@ -44,6 +44,7 @@ struct channelmix {
 	uint32_t upmix;
 
 	struct spa_log *log;
+	uint32_t func_cpu_flags;
 	const char *func_name;
 
 #define CHANNELMIX_FLAG_ZERO		(1<<0)		/**< all zero components */
diff --git a/spa/plugins/audioconvert/fmt-ops-avx2.c b/spa/plugins/audioconvert/fmt-ops-avx2.c
index af0af91f2..9c3dce52d 100644
--- a/spa/plugins/audioconvert/fmt-ops-avx2.c
+++ b/spa/plugins/audioconvert/fmt-ops-avx2.c
@@ -4,6 +4,8 @@
 
 #include "fmt-ops.h"
 
+#include <spa/support/cpu.h>
+
 #include <immintrin.h>
 // GCC: workaround for missing AVX intrinsic: "_mm256_setr_m128()"
 //      (see https://stackoverflow.com/questions/32630458/setting-m256i-to-the-value-of-two-m128i-values)
@@ -285,7 +287,7 @@ conv_s16s_to_f32d_2_avx2(struct convert *conv, void * SPA_RESTRICT dst[], const
 }
 
 static void
-conv_s24_to_f32d_1s_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
+conv_s24_to_f32d_1s_gather_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
 		uint32_t n_channels, uint32_t n_samples)
 {
 	const int8_t *s = src;
@@ -321,7 +323,7 @@ conv_s24_to_f32d_1s_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA
 }
 
 static void
-conv_s24_to_f32d_2s_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
+conv_s24_to_f32d_2s_gather_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
 		uint32_t n_channels, uint32_t n_samples)
 {
 	const int8_t *s = src;
@@ -373,7 +375,7 @@ conv_s24_to_f32d_2s_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA
 	}
 }
 static void
-conv_s24_to_f32d_4s_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
+conv_s24_to_f32d_4s_gather_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
 		uint32_t n_channels, uint32_t n_samples)
 {
 	const int8_t *s = src;
@@ -447,16 +449,22 @@ conv_s24_to_f32d_avx2(struct convert *conv, void * SPA_RESTRICT dst[], const voi
 	const int8_t *s = src[0];
 	uint32_t i = 0, n_channels = conv->n_channels;
 
-	for(; i + 3 < n_channels; i += 4)
-		conv_s24_to_f32d_4s_avx2(conv, &dst[i], &s[3*i], n_channels, n_samples);
-	for(; i + 1 < n_channels; i += 2)
-		conv_s24_to_f32d_2s_avx2(conv, &dst[i], &s[3*i], n_channels, n_samples);
-	for(; i < n_channels; i++)
-		conv_s24_to_f32d_1s_avx2(conv, &dst[i], &s[3*i], n_channels, n_samples);
+	if (conv->cpu_flags & SPA_CPU_FLAG_SLOW_GATHER) {
+#if defined (HAVE_SSE2)
+		conv_s24_to_f32d_sse2(conv, dst, src, n_samples);
+#endif
+	} else {
+		for(; i + 3 < n_channels; i += 4)
+			conv_s24_to_f32d_4s_gather_avx2(conv, &dst[i], &s[3*i], n_channels, n_samples);
+		for(; i + 1 < n_channels; i += 2)
+			conv_s24_to_f32d_2s_gather_avx2(conv, &dst[i], &s[3*i], n_channels, n_samples);
+		for(; i < n_channels; i++)
+			conv_s24_to_f32d_1s_gather_avx2(conv, &dst[i], &s[3*i], n_channels, n_samples);
+	}
 }
 
 static void
-conv_s32_to_f32d_4s_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
+conv_s32_to_f32d_4s_gather_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
 		uint32_t n_channels, uint32_t n_samples)
 {
 	const int32_t *s = src;
@@ -510,7 +518,7 @@ conv_s32_to_f32d_4s_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA
 }
 
 static void
-conv_s32_to_f32d_2s_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
+conv_s32_to_f32d_2s_gather_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
 		uint32_t n_channels, uint32_t n_samples)
 {
 	const int32_t *s = src;
@@ -555,7 +563,7 @@ conv_s32_to_f32d_2s_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA
 }
 
 static void
-conv_s32_to_f32d_1s_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
+conv_s32_to_f32d_1s_gather_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
 		uint32_t n_channels, uint32_t n_samples)
 {
 	const int32_t *s = src;
@@ -595,6 +603,169 @@ conv_s32_to_f32d_1s_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA
 	}
 }
 
+
+/* Deinterleave two adjacent S32 channels into planar F32 without gathers.
+ * src points at the first of the two channels in frame 0; consecutive
+ * frames are n_channels samples apart. The 8-sample vector loop only runs
+ * when both outputs are 32-byte aligned, the scalar loop does the rest. */
+static void
+conv_s32_to_f32d_2s_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
+		uint32_t n_channels, uint32_t n_samples)
+{
+	const int32_t *s = src;
+	float *d0 = dst[0], *d1 = dst[1];
+	uint32_t n, unrolled;
+	__m256i in[2];
+	__m256 out[2], t[2], factor = _mm256_set1_ps(1.0f / S32_SCALE_I2F);
+
+	unrolled = (SPA_IS_ALIGNED(d0, 32) && SPA_IS_ALIGNED(d1, 32)) ? n_samples & ~7 : 0;
+
+	for(n = 0; n < unrolled; n += 8) {
+		/* frames may be only 4-byte aligned: use unaligned-safe 64-bit loads */
+		in[0] = _mm256_setr_m128i(
+				_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)&s[0*n_channels]),
+					_mm_loadl_epi64((const __m128i*)&s[1*n_channels])),
+				_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)&s[4*n_channels]),
+					_mm_loadl_epi64((const __m128i*)&s[5*n_channels])));
+		in[1] = _mm256_setr_m128i(
+				_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)&s[2*n_channels]),
+					_mm_loadl_epi64((const __m128i*)&s[3*n_channels])),
+				_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)&s[6*n_channels]),
+					_mm_loadl_epi64((const __m128i*)&s[7*n_channels])));
+
+		out[0] = _mm256_cvtepi32_ps(in[0]);
+		out[1] = _mm256_cvtepi32_ps(in[1]);
+		out[0] = _mm256_mul_ps(out[0], factor); /* a0 b0 a1 b1 a4 b4 a5 b5 */
+		out[1] = _mm256_mul_ps(out[1], factor); /* a2 b2 a3 b3 a6 b6 a7 b7 */
+
+		t[0] = _mm256_unpacklo_ps(out[0], out[1]); /* a0 a2 b0 b2 a4 a6 b4 b6 */
+		t[1] = _mm256_unpackhi_ps(out[0], out[1]); /* a1 a3 b1 b3 a5 a7 b5 b7 */
+
+		out[0] = _mm256_unpacklo_ps(t[0], t[1]); /* a0 a1 a2 a3 a4 a5 a6 a7 */
+		out[1] = _mm256_unpackhi_ps(t[0], t[1]); /* b0 b1 b2 b3 b4 b5 b6 b7 */
+
+		_mm256_store_ps(&d0[n], out[0]);
+		_mm256_store_ps(&d1[n], out[1]);
+
+		s += 8*n_channels;
+	}
+	for(; n < n_samples; n++) {
+		__m128 v[2], scale = _mm_set1_ps(1.0f / S32_SCALE_I2F);
+		v[0] = _mm_cvtsi32_ss(scale, s[0]);
+		v[1] = _mm_cvtsi32_ss(scale, s[1]);
+		v[0] = _mm_mul_ss(v[0], scale);
+		v[1] = _mm_mul_ss(v[1], scale);
+		_mm_store_ss(&d0[n], v[0]);
+		_mm_store_ss(&d1[n], v[1]);
+		s += n_channels;
+	}
+}
+
+/* Convert one channel of interleaved S32 to planar F32 without gathers.
+ * The channel's samples sit n_channels apart in src. Eight of them are
+ * converted per iteration when the output is 32-byte aligned; whatever is
+ * left (or everything, for an unaligned output) is converted one by one. */
+static void
+conv_s32_to_f32d_1s_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
+		uint32_t n_channels, uint32_t n_samples)
+{
+	const int32_t *frame = src;
+	float *d0 = dst[0];
+	const uint32_t n_vec = SPA_IS_ALIGNED(d0, 32) ? n_samples & ~7 : 0;
+	const __m256 scale = _mm256_set1_ps(1.0f / S32_SCALE_I2F);
+	uint32_t n = 0;
+
+	for(; n < n_vec; n += 8, frame += 8*n_channels) {
+		__m256i raw = _mm256_setr_epi32(
+				frame[0*n_channels], frame[1*n_channels],
+				frame[2*n_channels], frame[3*n_channels],
+				frame[4*n_channels], frame[5*n_channels],
+				frame[6*n_channels], frame[7*n_channels]);
+		__m256 vals = _mm256_cvtepi32_ps(raw);
+
+		vals = _mm256_mul_ps(vals, scale);
+		_mm256_store_ps(&d0[n], vals);	/* aligned store, see n_vec */
+	}
+
+	for(; n < n_samples; n++, frame += n_channels) {
+		const __m128 k = _mm_set1_ps(1.0f / S32_SCALE_I2F);
+		__m128 one = _mm_cvtsi32_ss(k, frame[0]);
+
+		one = _mm_mul_ss(one, k);
+		_mm_store_ss(&d0[n], one);
+	}
+}
+/* Deinterleave four adjacent S32 channels (a..d) into planar F32 without gathers. */
+static void
+conv_s32_to_f32d_4s_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
+		uint32_t n_channels, uint32_t n_samples)
+{
+	const int32_t *s = src;
+	float *d0 = dst[0], *d1 = dst[1], *d2 = dst[2], *d3 = dst[3];
+	uint32_t n, unrolled;
+	__m256i in[4];
+	__m256 out[4], t[4], factor = _mm256_set1_ps(1.0f / S32_SCALE_I2F);
+
+	if (SPA_IS_ALIGNED(d0, 32) &&
+	    SPA_IS_ALIGNED(d1, 32) &&
+	    SPA_IS_ALIGNED(d2, 32) &&
+	    SPA_IS_ALIGNED(d3, 32))
+		unrolled = n_samples & ~7;	/* whole groups of 8 samples */
+	else
+		unrolled = 0;	/* an unaligned output: scalar loop only */
+
+	for(n = 0; n < unrolled; n += 8) {
+		in[0] = _mm256_setr_m128i(
+				_mm_loadu_si128((__m128i*)&s[0*n_channels]),
+				_mm_loadu_si128((__m128i*)&s[4*n_channels]));	/* frames 0 | 4 */
+		in[1] = _mm256_setr_m128i(
+				_mm_loadu_si128((__m128i*)&s[1*n_channels]),
+				_mm_loadu_si128((__m128i*)&s[5*n_channels]));	/* frames 1 | 5 */
+		in[2] = _mm256_setr_m128i(
+				_mm_loadu_si128((__m128i*)&s[2*n_channels]),
+				_mm_loadu_si128((__m128i*)&s[6*n_channels]));	/* frames 2 | 6 */
+		in[3] = _mm256_setr_m128i(
+				_mm_loadu_si128((__m128i*)&s[3*n_channels]),
+				_mm_loadu_si128((__m128i*)&s[7*n_channels]));	/* frames 3 | 7 */
+
+		out[0] = _mm256_cvtepi32_ps(in[0]);	/* a0 b0 c0 d0 a4 b4 c4 d4 */
+		out[1] = _mm256_cvtepi32_ps(in[1]);	/* a1 b1 c1 d1 a5 b5 c5 d5 */
+		out[2] = _mm256_cvtepi32_ps(in[2]);	/* a2 b2 c2 d2 a6 b6 c6 d6 */
+		out[3] = _mm256_cvtepi32_ps(in[3]);	/* a3 b3 c3 d3 a7 b7 c7 d7 */
+
+		out[0] = _mm256_mul_ps(out[0], factor);	/* scale by 1 / S32_SCALE_I2F */
+		out[1] = _mm256_mul_ps(out[1], factor);
+		out[2] = _mm256_mul_ps(out[2], factor);
+		out[3] = _mm256_mul_ps(out[3], factor);
+
+		t[0] = _mm256_unpacklo_ps(out[0], out[2]); /* a0 a2 b0 b2 a4 a6 b4 b6 */
+		t[1] = _mm256_unpackhi_ps(out[0], out[2]); /* c0 c2 d0 d2 c4 c6 d4 d6 */
+		t[2] = _mm256_unpacklo_ps(out[1], out[3]); /* a1 a3 b1 b3 a5 a7 b5 b7 */
+		t[3] = _mm256_unpackhi_ps(out[1], out[3]); /* c1 c3 d1 d3 c5 c7 d5 d7 */
+
+		out[0] = _mm256_unpacklo_ps(t[0], t[2]); /* a0 a1 a2 a3 a4 a5 a6 a7 */
+		out[1] = _mm256_unpackhi_ps(t[0], t[2]); /* b0 b1 b2 b3 b4 b5 b6 b7 */
+		out[2] = _mm256_unpacklo_ps(t[1], t[3]); /* c0 c1 c2 c3 c4 c5 c6 c7 */
+		out[3] = _mm256_unpackhi_ps(t[1], t[3]); /* d0 d1 d2 d3 d4 d5 d6 d7 */
+
+		_mm256_store_ps(&d0[n], out[0]);	/* aligned stores: guarded by the check above */
+		_mm256_store_ps(&d1[n], out[1]);
+		_mm256_store_ps(&d2[n], out[2]);
+		_mm256_store_ps(&d3[n], out[3]);
+
+		s += 8*n_channels;	/* advance 8 frames */
+	}
+	for(; n < n_samples; n++) {	/* remaining samples, one frame per iteration */
+		__m128 out[4], factor = _mm_set1_ps(1.0f / S32_SCALE_I2F);
+		__m128i in[1];
+		in[0] = _mm_setr_epi32(s[0], s[1], s[2], s[3]);
+		out[0] = _mm_cvtepi32_ps(in[0]);
+		out[0] = _mm_mul_ps(out[0], factor);
+		_MM_STOREM_PS(&d0[n], &d1[n], &d2[n], &d3[n], out[0]);	/* NOTE(review): presumably stores lane i to d<i>[n]; macro defined elsewhere */
+		s += n_channels;
+	}
+}
+
 void
 conv_s32_to_f32d_avx2(struct convert *conv, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[],
 		uint32_t n_samples)
@@ -602,12 +773,21 @@ conv_s32_to_f32d_avx2(struct convert *conv, void * SPA_RESTRICT dst[], const voi
 	const int32_t *s = src[0];
 	uint32_t i = 0, n_channels = conv->n_channels;
 
-	for(; i + 3 < n_channels; i += 4)
-		conv_s32_to_f32d_4s_avx2(conv, &dst[i], &s[i], n_channels, n_samples);
-	for(; i + 1 < n_channels; i += 2)
-		conv_s32_to_f32d_2s_avx2(conv, &dst[i], &s[i], n_channels, n_samples);
-	for(; i < n_channels; i++)
-		conv_s32_to_f32d_1s_avx2(conv, &dst[i], &s[i], n_channels, n_samples);
+	if (conv->cpu_flags & SPA_CPU_FLAG_SLOW_GATHER) {
+		for(; i + 3 < n_channels; i += 4)
+			conv_s32_to_f32d_4s_avx2(conv, &dst[i], &s[i], n_channels, n_samples);
+		for(; i + 1 < n_channels; i += 2)
+			conv_s32_to_f32d_2s_avx2(conv, &dst[i], &s[i], n_channels, n_samples);
+		for(; i < n_channels; i++)
+			conv_s32_to_f32d_1s_avx2(conv, &dst[i], &s[i], n_channels, n_samples);
+	} else {
+		for(; i + 3 < n_channels; i += 4)
+			conv_s32_to_f32d_4s_gather_avx2(conv, &dst[i], &s[i], n_channels, n_samples);
+		for(; i + 1 < n_channels; i += 2)
+			conv_s32_to_f32d_2s_gather_avx2(conv, &dst[i], &s[i], n_channels, n_samples);
+		for(; i < n_channels; i++)
+			conv_s32_to_f32d_1s_gather_avx2(conv, &dst[i], &s[i], n_channels, n_samples);
+	}
 }
 
 static void
@@ -1187,3 +1367,4 @@ conv_f32d_to_s16s_2_avx2(struct convert *conv, void * SPA_RESTRICT dst[], const
 		d += 2;
 	}
 }
+
diff --git a/spa/plugins/audioconvert/fmt-ops.c b/spa/plugins/audioconvert/fmt-ops.c
index 057f3294a..34de40445 100644
--- a/spa/plugins/audioconvert/fmt-ops.c
+++ b/spa/plugins/audioconvert/fmt-ops.c
@@ -108,9 +108,6 @@ static struct conv_info conv_table[] =
 	MAKE(U32, F32, 0, conv_u32_to_f32_c),
 	MAKE(U32, F32P, 0, conv_u32_to_f32d_c),
 
-#if defined (HAVE_SSE2)
-	MAKE(S32, F32P, 0, conv_s32_to_f32d_sse2, SPA_CPU_FLAG_SSE2 | SPA_CPU_FLAG_SLOW_GATHER),
-#endif
 #if defined (HAVE_AVX2)
 	MAKE(S32, F32P, 0, conv_s32_to_f32d_avx2, SPA_CPU_FLAG_AVX2),
 #endif
@@ -132,9 +129,6 @@ static struct conv_info conv_table[] =
 
 	MAKE(S24, F32, 0, conv_s24_to_f32_c),
 	MAKE(S24P, F32P, 0, conv_s24d_to_f32d_c),
-#if defined (HAVE_SSE2)
-	MAKE(S24, F32P, 0, conv_s24_to_f32d_sse2, SPA_CPU_FLAG_SSE2 | SPA_CPU_FLAG_SLOW_GATHER),
-#endif
 #if defined (HAVE_AVX2)
 	MAKE(S24, F32P, 0, conv_s24_to_f32d_avx2, SPA_CPU_FLAG_AVX2),
 #endif
@@ -637,7 +631,7 @@ int convert_init(struct convert *conv)
 		conv->random[i] = random();
 
 	conv->is_passthrough = conv->src_fmt == conv->dst_fmt;
-	conv->cpu_flags = info->cpu_flags;
+	conv->func_cpu_flags = info->cpu_flags;
 	conv->update_noise = ninfo->noise;
 	conv->process = info->process;
 	conv->clear = cinfo ? cinfo->clear : NULL;
diff --git a/spa/plugins/audioconvert/fmt-ops.h b/spa/plugins/audioconvert/fmt-ops.h
index f738e3858..24b4b1aaf 100644
--- a/spa/plugins/audioconvert/fmt-ops.h
+++ b/spa/plugins/audioconvert/fmt-ops.h
@@ -219,6 +219,7 @@ struct convert {
 	uint32_t n_channels;
 	uint32_t rate;
 	uint32_t cpu_flags;
+	uint32_t func_cpu_flags;
 	const char *func_name;
 
 	unsigned int is_passthrough:1;
diff --git a/spa/plugins/audioconvert/meson.build b/spa/plugins/audioconvert/meson.build
index bd60872b6..559db4308 100644
--- a/spa/plugins/audioconvert/meson.build
+++ b/spa/plugins/audioconvert/meson.build
@@ -44,7 +44,7 @@ endif
 if have_sse2
   audioconvert_sse2 = static_library('audioconvert_sse2',
     ['fmt-ops-sse2.c' ],
-    c_args : [sse2_args, '-O3', '-DHAVE_SSE2'],
+    c_args : [sse2_args, '-O3', '-DHAVE_SSE2', simd_cargs],
     dependencies : [ spa_dep ],
     install : false
     )
@@ -55,7 +55,7 @@ if have_ssse3
   audioconvert_ssse3 = static_library('audioconvert_ssse3',
     ['fmt-ops-ssse3.c',
       'resample-native-ssse3.c' ],
-    c_args : [ssse3_args, '-O3', '-DHAVE_SSSE3'],
+    c_args : [ssse3_args, '-O3', '-DHAVE_SSSE3', simd_cargs],
     dependencies : [ spa_dep ],
     install : false
     )
@@ -65,7 +65,7 @@ endif
 if have_sse41
   audioconvert_sse41 = static_library('audioconvert_sse41',
     ['fmt-ops-sse41.c'],
-    c_args : [sse41_args, '-O3', '-DHAVE_SSE41'],
+    c_args : [sse41_args, '-O3', '-DHAVE_SSE41', simd_cargs],
     dependencies : [ spa_dep ],
     install : false
     )
@@ -75,7 +75,7 @@ endif
 if have_avx2 and have_fma
   audioconvert_avx2_fma = static_library('audioconvert_avx2_fma',
     ['resample-native-avx2.c'],
-    c_args : [avx2_args, fma_args, '-O3', '-DHAVE_AVX2', '-DHAVE_FMA'],
+    c_args : [avx2_args, fma_args, '-O3', '-DHAVE_AVX2', '-DHAVE_FMA', simd_cargs],
     dependencies : [ spa_dep ],
     install : false
     )
@@ -85,7 +85,7 @@ endif
 if have_avx2
   audioconvert_avx2 = static_library('audioconvert_avx2',
     ['fmt-ops-avx2.c'],
-    c_args : [avx2_args, '-O3', '-DHAVE_AVX2'],
+    c_args : [avx2_args, '-O3', '-DHAVE_AVX2', simd_cargs],
     dependencies : [ spa_dep ],
     install : false
     )
diff --git a/spa/plugins/audioconvert/peaks-ops.c b/spa/plugins/audioconvert/peaks-ops.c
index 29b93a081..f7a897f90 100644
--- a/spa/plugins/audioconvert/peaks-ops.c
+++ b/spa/plugins/audioconvert/peaks-ops.c
@@ -60,7 +60,7 @@ int peaks_init(struct peaks *peaks)
 	if (info == NULL)
 		return -ENOTSUP;
 
-	peaks->cpu_flags = info->cpu_flags;
+	peaks->func_cpu_flags = info->cpu_flags;
 	peaks->func_name = info->name;
 	peaks->free = impl_peaks_free;
 	peaks->min_max = info->min_max;
diff --git a/spa/plugins/audioconvert/peaks-ops.h b/spa/plugins/audioconvert/peaks-ops.h
index 24092a4f7..40b20cfbc 100644
--- a/spa/plugins/audioconvert/peaks-ops.h
+++ b/spa/plugins/audioconvert/peaks-ops.h
@@ -14,6 +14,7 @@ extern struct spa_log_topic resample_log_topic;
 
 struct peaks {
 	uint32_t cpu_flags;
+	uint32_t func_cpu_flags;
 	const char *func_name;
 
 	struct spa_log *log;
diff --git a/spa/plugins/audioconvert/resample-native.c b/spa/plugins/audioconvert/resample-native.c
index 3604c5b45..5bb33ffc1 100644
--- a/spa/plugins/audioconvert/resample-native.c
+++ b/spa/plugins/audioconvert/resample-native.c
@@ -576,7 +576,7 @@ int resample_native_init(struct resample *r)
 			r, c->cutoff, r->quality, c->window, r->i_rate, r->o_rate, gcd, n_taps, n_phases,
 			r->cpu_flags, d->info->cpu_flags);
 
-	r->cpu_flags = d->info->cpu_flags;
+	r->func_cpu_flags = d->info->cpu_flags;
 
 	impl_native_reset(r);
 	impl_native_update_rate(r, 1.0);
diff --git a/spa/plugins/audioconvert/resample.h b/spa/plugins/audioconvert/resample.h
index fec3bf963..7b6e58415 100644
--- a/spa/plugins/audioconvert/resample.h
+++ b/spa/plugins/audioconvert/resample.h
@@ -38,6 +38,7 @@ struct resample {
 #define RESAMPLE_OPTION_PREFILL		(1<<0)
 	uint32_t options;
 	uint32_t cpu_flags;
+	uint32_t func_cpu_flags;
 	const char *func_name;
 
 	uint32_t channels;
diff --git a/spa/plugins/audioconvert/volume-ops.c b/spa/plugins/audioconvert/volume-ops.c
index bf6aa6909..b76ab4bec 100644
--- a/spa/plugins/audioconvert/volume-ops.c
+++ b/spa/plugins/audioconvert/volume-ops.c
@@ -56,7 +56,7 @@ int volume_init(struct volume *vol)
 	if (info == NULL)
 		return -ENOTSUP;
 
-	vol->cpu_flags = info->cpu_flags;
+	vol->func_cpu_flags = info->cpu_flags;
 	vol->func_name = info->name;
 	vol->free = impl_volume_free;
 	vol->process = info->process;
diff --git a/spa/plugins/audioconvert/volume-ops.h b/spa/plugins/audioconvert/volume-ops.h
index a50ee9a6f..51642110f 100644
--- a/spa/plugins/audioconvert/volume-ops.h
+++ b/spa/plugins/audioconvert/volume-ops.h
@@ -13,6 +13,7 @@
 
 struct volume {
 	uint32_t cpu_flags;
+	uint32_t func_cpu_flags;
 	const char *func_name;
 
 	struct spa_log *log;