diff --git a/spa/plugins/audioconvert/fmt-ops-sse2.c b/spa/plugins/audioconvert/fmt-ops-sse2.c index 6e1ff62eb..896a3fa10 100644 --- a/spa/plugins/audioconvert/fmt-ops-sse2.c +++ b/spa/plugins/audioconvert/fmt-ops-sse2.c @@ -324,8 +324,8 @@ conv_f32d_to_s32_1_sse2(void *data, void * SPA_RESTRICT dst, const void * SPA_RE uint32_t n, unrolled; __m128 in[1]; __m128i out[4]; - __m128 int_max = _mm_set1_ps(S24_MAX_F); - __m128 int_min = _mm_sub_ps(_mm_setzero_ps(), int_max); + __m128 scale = _mm_set1_ps(S32_SCALE); + __m128 int_min = _mm_set1_ps(S32_MIN); if (SPA_IS_ALIGNED(s0, 16)) unrolled = n_samples / 4; @@ -333,10 +333,9 @@ conv_f32d_to_s32_1_sse2(void *data, void * SPA_RESTRICT dst, const void * SPA_RE unrolled = 0; for(n = 0; unrolled--; n += 4) { - in[0] = _mm_mul_ps(_mm_load_ps(&s0[n]), int_max); - in[0] = _mm_min_ps(int_max, _mm_max_ps(in[0], int_min)); - - out[0] = _mm_slli_epi32(_mm_cvtps_epi32(in[0]), 8); + in[0] = _mm_mul_ps(_mm_load_ps(&s0[n]), scale); + in[0] = _mm_min_ps(in[0], int_min); + out[0] = _mm_cvtps_epi32(in[0]); out[1] = _mm_shuffle_epi32(out[0], _MM_SHUFFLE(0, 3, 2, 1)); out[2] = _mm_shuffle_epi32(out[0], _MM_SHUFFLE(1, 0, 3, 2)); out[3] = _mm_shuffle_epi32(out[0], _MM_SHUFFLE(2, 1, 0, 3)); @@ -349,9 +348,9 @@ conv_f32d_to_s32_1_sse2(void *data, void * SPA_RESTRICT dst, const void * SPA_RE } for(; n < n_samples; n++) { in[0] = _mm_load_ss(&s0[n]); - in[0] = _mm_mul_ss(in[0], int_max); - in[0] = _mm_min_ss(int_max, _mm_max_ss(in[0], int_min)); - *d = _mm_cvtss_si32(in[0]) << 8; + in[0] = _mm_mul_ss(in[0], scale); + in[0] = _mm_min_ss(in[0], int_min); + *d = _mm_cvtss_si32(in[0]); d += n_channels; } } @@ -365,8 +364,8 @@ conv_f32d_to_s32_2_sse2(void *data, void * SPA_RESTRICT dst, const void * SPA_RE uint32_t n, unrolled; __m128 in[2]; __m128i out[2], t[2]; - __m128 int_max = _mm_set1_ps(S24_MAX_F); - __m128 int_min = _mm_sub_ps(_mm_setzero_ps(), int_max); + __m128 scale = _mm_set1_ps(S32_SCALE); + __m128 int_min = _mm_set1_ps(S32_MIN); if (SPA_IS_ALIGNED(s0, 16) && SPA_IS_ALIGNED(s1, 16)) @@ -375,14 +374,14 @@ conv_f32d_to_s32_2_sse2(void *data, void * SPA_RESTRICT dst, const void * SPA_RE unrolled = 0; for(n = 0; unrolled--; n += 4) { - in[0] = _mm_mul_ps(_mm_load_ps(&s0[n]), int_max); - in[1] = _mm_mul_ps(_mm_load_ps(&s1[n]), int_max); + in[0] = _mm_mul_ps(_mm_load_ps(&s0[n]), scale); + in[1] = _mm_mul_ps(_mm_load_ps(&s1[n]), scale); - in[0] = _mm_min_ps(int_max, _mm_max_ps(in[0], int_min)); - in[1] = _mm_min_ps(int_max, _mm_max_ps(in[1], int_min)); + in[0] = _mm_min_ps(in[0], int_min); + in[1] = _mm_min_ps(in[1], int_min); - out[0] = _mm_slli_epi32(_mm_cvtps_epi32(in[0]), 8); - out[1] = _mm_slli_epi32(_mm_cvtps_epi32(in[1]), 8); + out[0] = _mm_cvtps_epi32(in[0]); + out[1] = _mm_cvtps_epi32(in[1]); t[0] = _mm_unpacklo_epi32(out[0], out[1]); t[1] = _mm_unpackhi_epi32(out[0], out[1]); @@ -399,9 +398,9 @@ conv_f32d_to_s32_2_sse2(void *data, void * SPA_RESTRICT dst, const void * SPA_RE in[0] = _mm_unpacklo_ps(in[0], in[1]); - in[0] = _mm_mul_ps(in[0], int_max); - in[0] = _mm_min_ps(int_max, _mm_max_ps(in[0], int_min)); - out[0] = _mm_slli_epi32(_mm_cvtps_epi32(in[0]), 8); + in[0] = _mm_mul_ps(in[0], scale); + in[0] = _mm_min_ps(in[0], int_min); + out[0] = _mm_cvtps_epi32(in[0]); _mm_storel_epi64((__m128i*)d, out[0]); d += n_channels; } @@ -416,8 +415,8 @@ conv_f32d_to_s32_4_sse2(void *data, void * SPA_RESTRICT dst, const void * SPA_RE uint32_t n, unrolled; __m128 in[4]; __m128i out[4]; - __m128 int_max = _mm_set1_ps(S24_MAX_F); - __m128 int_min = _mm_sub_ps(_mm_setzero_ps(), int_max); + __m128 scale = _mm_set1_ps(S32_SCALE); + __m128 int_min = _mm_set1_ps(S32_MIN); if (SPA_IS_ALIGNED(s0, 16) && SPA_IS_ALIGNED(s1, 16) && @@ -429,22 +428,22 @@ conv_f32d_to_s32_4_sse2(void *data, void * SPA_RESTRICT dst, const void * SPA_RE unrolled = 0; for(n = 0; unrolled--; n += 4) { - in[0] = _mm_mul_ps(_mm_load_ps(&s0[n]), int_max); - in[1] = _mm_mul_ps(_mm_load_ps(&s1[n]), int_max); - in[2] = _mm_mul_ps(_mm_load_ps(&s2[n]), int_max); - in[3] = _mm_mul_ps(_mm_load_ps(&s3[n]), int_max); + in[0] = _mm_mul_ps(_mm_load_ps(&s0[n]), scale); + in[1] = _mm_mul_ps(_mm_load_ps(&s1[n]), scale); + in[2] = _mm_mul_ps(_mm_load_ps(&s2[n]), scale); + in[3] = _mm_mul_ps(_mm_load_ps(&s3[n]), scale); - in[0] = _mm_min_ps(int_max, _mm_max_ps(in[0], int_min)); - in[1] = _mm_min_ps(int_max, _mm_max_ps(in[1], int_min)); - in[2] = _mm_min_ps(int_max, _mm_max_ps(in[2], int_min)); - in[3] = _mm_min_ps(int_max, _mm_max_ps(in[3], int_min)); + in[0] = _mm_min_ps(in[0], int_min); + in[1] = _mm_min_ps(in[1], int_min); + in[2] = _mm_min_ps(in[2], int_min); + in[3] = _mm_min_ps(in[3], int_min); _MM_TRANSPOSE4_PS(in[0], in[1], in[2], in[3]); - out[0] = _mm_slli_epi32(_mm_cvtps_epi32(in[0]), 8); - out[1] = _mm_slli_epi32(_mm_cvtps_epi32(in[1]), 8); - out[2] = _mm_slli_epi32(_mm_cvtps_epi32(in[2]), 8); - out[3] = _mm_slli_epi32(_mm_cvtps_epi32(in[3]), 8); + out[0] = _mm_cvtps_epi32(in[0]); + out[1] = _mm_cvtps_epi32(in[1]); + out[2] = _mm_cvtps_epi32(in[2]); + out[3] = _mm_cvtps_epi32(in[3]); _mm_store_si128((__m128i*)(d + 0*n_channels), out[0]); _mm_store_si128((__m128i*)(d + 1*n_channels), out[1]); @@ -462,9 +461,9 @@ conv_f32d_to_s32_4_sse2(void *data, void * SPA_RESTRICT dst, const void * SPA_RE in[1] = _mm_unpacklo_ps(in[1], in[3]); in[0] = _mm_unpacklo_ps(in[0], in[1]); - in[0] = _mm_mul_ps(in[0], int_max); - in[0] = _mm_min_ps(int_max, _mm_max_ps(in[0], int_min)); - out[0] = _mm_slli_epi32(_mm_cvtps_epi32(in[0]), 8); + in[0] = _mm_mul_ps(in[0], scale); + in[0] = _mm_min_ps(in[0], int_min); + out[0] = _mm_cvtps_epi32(in[0]); _mm_storeu_si128((__m128i*)d, out[0]); d += n_channels; } diff --git a/spa/plugins/audioconvert/fmt-ops-ssse3.c b/spa/plugins/audioconvert/fmt-ops-ssse3.c index 93b30178e..45374b1ea 100644 --- a/spa/plugins/audioconvert/fmt-ops-ssse3.c +++ b/spa/plugins/audioconvert/fmt-ops-ssse3.c @@ -27,7 +27,7 @@ #include -#include +#include static void conv_s24_to_f32d_4_ssse3(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src, uint32_t n_channels, uint32_t n_samples) diff --git a/spa/plugins/audioconvert/fmt-ops.c b/spa/plugins/audioconvert/fmt-ops.c index ceb54bc0e..988f426e5 100644 --- a/spa/plugins/audioconvert/fmt-ops.c +++ b/spa/plugins/audioconvert/fmt-ops.c @@ -51,6 +51,8 @@ #define S24_TO_F32(v) (((int32_t)(v)) * (1.0f / S24_SCALE)) #define F32_TO_S24(v) (int32_t)(SPA_CLAMP(v, -1.0f, 1.0f) * S24_SCALE) +#define S32_SCALE 2147483648.0f +#define S32_MIN 2147483520.0f #define S32_TO_F32(v) S24_TO_F32((v) >> 8) #define F32_TO_S32(v) (F32_TO_S24(v) << 8) @@ -734,6 +736,7 @@ static const struct conv_info { uint32_t src_fmt; uint32_t dst_fmt; #define FEATURE_SSE2 SPA_CPU_FLAG_SSE2 +#define FEATURE_SSSE3 SPA_CPU_FLAG_SSSE3 uint32_t features; convert_func_t func; @@ -770,7 +773,7 @@ static const struct conv_info { { SPA_AUDIO_FORMAT_S24, SPA_AUDIO_FORMAT_F32P, FEATURE_SSE2, conv_s24_to_f32d_sse41 }, #endif #if defined (__SSSE3__) - { SPA_AUDIO_FORMAT_S24, SPA_AUDIO_FORMAT_F32P, FEATURE_SSE2, conv_s24_to_f32d_ssse3 }, + { SPA_AUDIO_FORMAT_S24, SPA_AUDIO_FORMAT_F32P, FEATURE_SSSE3, conv_s24_to_f32d_ssse3 }, #endif #if defined (__SSE2__) { SPA_AUDIO_FORMAT_S24, SPA_AUDIO_FORMAT_F32P, FEATURE_SSE2, conv_s24_to_f32d_sse2 }, diff --git a/spa/plugins/audioconvert/resample-native-sse.h b/spa/plugins/audioconvert/resample-native-sse.h index d52c14753..8762c2996 100644 --- a/spa/plugins/audioconvert/resample-native-sse.h +++ b/spa/plugins/audioconvert/resample-native-sse.h @@ -24,9 +24,6 @@ #include -#pragma GCC target("ssse3") -#include - static void inner_product_sse(float *d, const float * SPA_RESTRICT s, const float * SPA_RESTRICT taps, uint32_t n_taps) {