From 3b422e31a218d6651b3796e8e40b756a3e393106 Mon Sep 17 00:00:00 2001 From: Wim Taymans Date: Thu, 19 Mar 2026 16:21:35 +0100 Subject: [PATCH] cpu: add SLOW_GATHER flag Intel Skylake (max CPUID level 0x16) is the first model with fast gather opcodes. Mark CPUs that report a lower max level with the SLOW_GATHER flag. When SLOW_GATHER is set, prefer the SSE2 versions of the S32 and S24 to F32P format conversions, which do not use gather. Makes the conversion much faster on my Ivy Bridge. --- spa/include/spa/support/cpu.h | 1 + spa/plugins/audioconvert/fmt-ops.c | 6 ++++++ spa/plugins/support/cpu-x86.c | 2 ++ 3 files changed, 9 insertions(+) diff --git a/spa/include/spa/support/cpu.h b/spa/include/spa/support/cpu.h index c69338855..2faf42154 100644 --- a/spa/include/spa/support/cpu.h +++ b/spa/include/spa/support/cpu.h @@ -62,6 +62,7 @@ struct spa_cpu { struct spa_interface iface; }; #define SPA_CPU_FLAG_BMI2 (1<<18) /**< Bit Manipulation Instruction Set 2 */ #define SPA_CPU_FLAG_AVX512 (1<<19) /**< AVX-512 */ #define SPA_CPU_FLAG_SLOW_UNALIGNED (1<<20) /**< unaligned loads/stores are slow */ +#define SPA_CPU_FLAG_SLOW_GATHER (1<<21) /**< gather functions are slow */ /* PPC specific */ #define SPA_CPU_FLAG_ALTIVEC (1<<0) /**< standard */ diff --git a/spa/plugins/audioconvert/fmt-ops.c b/spa/plugins/audioconvert/fmt-ops.c index 3fc2c5f0a..057f3294a 100644 --- a/spa/plugins/audioconvert/fmt-ops.c +++ b/spa/plugins/audioconvert/fmt-ops.c @@ -108,6 +108,9 @@ static struct conv_info conv_table[] = MAKE(U32, F32, 0, conv_u32_to_f32_c), MAKE(U32, F32P, 0, conv_u32_to_f32d_c), +#if defined (HAVE_SSE2) + MAKE(S32, F32P, 0, conv_s32_to_f32d_sse2, SPA_CPU_FLAG_SSE2 | SPA_CPU_FLAG_SLOW_GATHER), +#endif #if defined (HAVE_AVX2) MAKE(S32, F32P, 0, conv_s32_to_f32d_avx2, SPA_CPU_FLAG_AVX2), #endif @@ -129,6 +132,9 @@ static struct conv_info conv_table[] = MAKE(S24, F32, 0, conv_s24_to_f32_c), MAKE(S24P, F32P, 0, conv_s24d_to_f32d_c), +#if defined (HAVE_SSE2) + MAKE(S24, F32P, 0, conv_s24_to_f32d_sse2, SPA_CPU_FLAG_SSE2 | 
SPA_CPU_FLAG_SLOW_GATHER), +#endif #if defined (HAVE_AVX2) MAKE(S24, F32P, 0, conv_s24_to_f32d_avx2, SPA_CPU_FLAG_AVX2), #endif diff --git a/spa/plugins/support/cpu-x86.c b/spa/plugins/support/cpu-x86.c index c1c53855d..0fb866671 100644 --- a/spa/plugins/support/cpu-x86.c +++ b/spa/plugins/support/cpu-x86.c @@ -78,6 +78,8 @@ x86_init(struct impl *impl) if ((ebx & AVX512_BITS) == AVX512_BITS) flags |= SPA_CPU_FLAG_AVX512; } + if (max_level < 0x16) + flags |= SPA_CPU_FLAG_SLOW_GATHER; /* Check cpuid level of extended features. */ __cpuid (0x80000000, ext_level, ebx, ecx, edx);