diff --git a/spa/include/spa/utils/defs.h b/spa/include/spa/utils/defs.h index 4654d4a10..9226f48de 100644 --- a/spa/include/spa/utils/defs.h +++ b/spa/include/spa/utils/defs.h @@ -160,6 +160,14 @@ struct spa_param_info { #define SPA_EXPORT #endif +#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L +#define SPA_RESTRICT restrict +#elif defined(__GNUC__) && __GNUC__ >= 4 +#define SPA_RESTRICT __restrict__ +#else +#define SPA_RESTRICT +#endif + #define SPA_ROUND_DOWN_N(num,align) ((num) & ~((align) - 1)) #define SPA_ROUND_UP_N(num,align) SPA_ROUND_DOWN_N((num) + ((align) - 1),align) diff --git a/spa/plugins/audioconvert/channelmix-ops-sse.c b/spa/plugins/audioconvert/channelmix-ops-sse.c index 6cb6881c6..fdd2e49ff 100644 --- a/spa/plugins/audioconvert/channelmix-ops-sse.c +++ b/spa/plugins/audioconvert/channelmix-ops-sse.c @@ -224,7 +224,7 @@ channelmix_f32_5p1_3p1_sse(void *data, int n_dst, void *dst[n_dst], float **s = (float **) src; __m128 mix = _mm_set1_ps(v * 0.5f); __m128 vol = _mm_set1_ps(v); - __m128 avg; + __m128 avg[2]; float *sFL = s[0], *sFR = s[1], *sFC = s[2], *sLFE = s[3], *sSL = s[4], *sSR = s[5]; float *dFL = d[0], *dFR = d[1], *dFC = d[2], *dLFE = d[3]; @@ -238,7 +238,7 @@ channelmix_f32_5p1_3p1_sse(void *data, int n_dst, void *dst[n_dst], SPA_IS_ALIGNED(dFR, 16) && SPA_IS_ALIGNED(dFC, 16) && SPA_IS_ALIGNED(dLFE, 16)) - unrolled = n_samples / 4; + unrolled = n_samples / 8; else unrolled = 0; @@ -247,37 +247,49 @@ channelmix_f32_5p1_3p1_sse(void *data, int n_dst, void *dst[n_dst], memset(d[i], 0, n_samples * sizeof(float)); } else if (v == VOLUME_NORM) { - for(n = 0; unrolled--; n += 4) { - avg = _mm_add_ps(_mm_load_ps(&sFL[n]), _mm_load_ps(&sSL[n])); - _mm_store_ps(&dFL[n], _mm_mul_ps(avg, mix)); - avg = _mm_add_ps(_mm_load_ps(&sFR[n]), _mm_load_ps(&sSR[n])); - _mm_store_ps(&dFR[n], _mm_mul_ps(avg, mix)); + for(n = 0; unrolled--; n += 8) { + avg[0] = _mm_add_ps(_mm_load_ps(&sFL[n]), _mm_load_ps(&sSL[n])); + avg[1] = _mm_add_ps(_mm_load_ps(&sFL[n+4]), _mm_load_ps(&sSL[n+4])); + _mm_store_ps(&dFL[n], _mm_mul_ps(avg[0], mix)); + _mm_store_ps(&dFL[n+4], _mm_mul_ps(avg[1], mix)); + avg[0] = _mm_add_ps(_mm_load_ps(&sFR[n]), _mm_load_ps(&sSR[n])); + avg[1] = _mm_add_ps(_mm_load_ps(&sFR[n+4]), _mm_load_ps(&sSR[n+4])); + _mm_store_ps(&dFR[n], _mm_mul_ps(avg[0], mix)); + _mm_store_ps(&dFR[n+4], _mm_mul_ps(avg[1], mix)); _mm_store_ps(&dFC[n], _mm_load_ps(&sFC[n])); + _mm_store_ps(&dFC[n+4], _mm_load_ps(&sFC[n+4])); _mm_store_ps(&dLFE[n], _mm_load_ps(&sLFE[n])); + _mm_store_ps(&dLFE[n+4], _mm_load_ps(&sLFE[n+4])); } for(; n < n_samples; n++) { - avg = _mm_add_ss(_mm_load_ss(&sFL[n]), _mm_load_ss(&sSL[n])); - _mm_store_ss(&dFL[n], _mm_mul_ss(avg, mix)); - avg = _mm_add_ss(_mm_load_ss(&sFR[n]), _mm_load_ss(&sSR[n])); - _mm_store_ss(&dFR[n], _mm_mul_ss(avg, mix)); + avg[0] = _mm_add_ss(_mm_load_ss(&sFL[n]), _mm_load_ss(&sSL[n])); + _mm_store_ss(&dFL[n], _mm_mul_ss(avg[0], mix)); + avg[0] = _mm_add_ss(_mm_load_ss(&sFR[n]), _mm_load_ss(&sSR[n])); + _mm_store_ss(&dFR[n], _mm_mul_ss(avg[0], mix)); _mm_store_ss(&dFC[n], _mm_load_ss(&sFC[n])); _mm_store_ss(&dLFE[n], _mm_load_ss(&sLFE[n])); } } else { - for(n = 0; unrolled--; n += 4) { - avg = _mm_add_ps(_mm_load_ps(&sFL[n]), _mm_load_ps(&sSL[n])); - _mm_store_ps(&dFL[n], _mm_mul_ps(avg, mix)); - avg = _mm_add_ps(_mm_load_ps(&sFR[n]), _mm_load_ps(&sSR[n])); - _mm_store_ps(&dFR[n], _mm_mul_ps(avg, mix)); + for(n = 0; unrolled--; n += 8) { + avg[0] = _mm_add_ps(_mm_load_ps(&sFL[n]), _mm_load_ps(&sSL[n])); + avg[1] = 
_mm_add_ps(_mm_load_ps(&sFL[n+4]), _mm_load_ps(&sSL[n+4])); + _mm_store_ps(&dFL[n], _mm_mul_ps(avg[0], mix)); + _mm_store_ps(&dFL[n+4], _mm_mul_ps(avg[1], mix)); + avg[0] = _mm_add_ps(_mm_load_ps(&sFR[n]), _mm_load_ps(&sSR[n])); + avg[1] = _mm_add_ps(_mm_load_ps(&sFR[n+4]), _mm_load_ps(&sSR[n+4])); + _mm_store_ps(&dFR[n], _mm_mul_ps(avg[0], mix)); + _mm_store_ps(&dFR[n+4], _mm_mul_ps(avg[1], mix)); _mm_store_ps(&dFC[n], _mm_mul_ps(_mm_load_ps(&sFC[n]), vol)); + _mm_store_ps(&dFC[n+4], _mm_mul_ps(_mm_load_ps(&sFC[n+4]), vol)); _mm_store_ps(&dLFE[n], _mm_mul_ps(_mm_load_ps(&sLFE[n]), vol)); + _mm_store_ps(&dLFE[n+4], _mm_mul_ps(_mm_load_ps(&sLFE[n+4]), vol)); } for(; n < n_samples; n++) { - avg = _mm_add_ss(_mm_load_ss(&sFL[n]), _mm_load_ss(&sSL[n])); - _mm_store_ss(&dFL[n], _mm_mul_ss(avg, mix)); - avg = _mm_add_ss(_mm_load_ss(&sFR[n]), _mm_load_ss(&sSR[n])); - _mm_store_ss(&dFR[n], _mm_mul_ss(avg, mix)); + avg[0] = _mm_add_ss(_mm_load_ss(&sFL[n]), _mm_load_ss(&sSL[n])); + _mm_store_ss(&dFL[n], _mm_mul_ss(avg[0], mix)); + avg[0] = _mm_add_ss(_mm_load_ss(&sFR[n]), _mm_load_ss(&sSR[n])); + _mm_store_ss(&dFR[n], _mm_mul_ss(avg[0], mix)); _mm_store_ss(&dFC[n], _mm_mul_ss(_mm_load_ss(&sFC[n]), vol)); _mm_store_ss(&dLFE[n], _mm_mul_ss(_mm_load_ss(&sLFE[n]), vol)); } diff --git a/spa/plugins/audioconvert/fmt-ops-sse2.c b/spa/plugins/audioconvert/fmt-ops-sse2.c index e0cef1fc8..9528c410d 100644 --- a/spa/plugins/audioconvert/fmt-ops-sse2.c +++ b/spa/plugins/audioconvert/fmt-ops-sse2.c @@ -30,12 +30,12 @@ #include static void -conv_s16_to_f32d_1_sse2(void *data, void *dst[], const void *src, int n_channels, int n_samples) +conv_s16_to_f32d_1_sse2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src, uint32_t n_channels, uint32_t n_samples) { const int16_t *s = src; float **d = (float **) dst; float *d0 = d[0]; - int n, unrolled; + uint32_t n, unrolled; __m128i in; __m128 out, factor = _mm_set1_ps(1.0f / S16_SCALE); @@ -64,12 +64,12 @@ conv_s16_to_f32d_1_sse2(void *data, void *dst[], const void *src, int n_channels } static void -conv_s16_to_f32d_2_sse2(void *data, void *dst[], const void *src, int n_channels, int n_samples) +conv_s16_to_f32d_2_sse2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src, uint32_t n_channels, uint32_t n_samples) { const int16_t *s = src; float **d = (float **) dst; float *d0 = d[0], *d1 = d[1]; - int n, unrolled; + uint32_t n, unrolled; __m128i in, t[2]; __m128 out[2], factor = _mm_set1_ps(1.0f / S16_SCALE); @@ -110,10 +110,10 @@ conv_s16_to_f32d_2_sse2(void *data, void *dst[], const void *src, int n_channels } static void -conv_s16_to_f32d_sse2(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +conv_s16_to_f32d_sse2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { const int16_t *s = src[0]; - int i = 0; + uint32_t i = 0; for(; i + 1 < n_channels; i += 2) conv_s16_to_f32d_2_sse2(data, &dst[i], &s[i], n_channels, n_samples); @@ -122,16 +122,16 @@ conv_s16_to_f32d_sse2(void *data, void *dst[], const void *src[], int n_channels } static void -conv_s24_to_f32d_1_sse2(void *data, void *dst[], const void *src, int n_channels, int n_samples) +conv_s24_to_f32d_1_sse2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src, uint32_t n_channels, uint32_t n_samples) { const uint8_t *s = src; float **d = (float **) dst; float *d0 = d[0]; - int n, unrolled; + uint32_t n, unrolled; __m128i in; __m128 out, factor = _mm_set1_ps(1.0f / S24_SCALE); - if 
(SPA_IS_ALIGNED(d0, 16) && n_samples > 4) { + if (SPA_IS_ALIGNED(d0, 16)) { unrolled = n_samples / 4; if ((n_samples & 3) == 0) unrolled--; @@ -161,22 +161,167 @@ conv_s24_to_f32d_1_sse2(void *data, void *dst[], const void *src, int n_channels } static void -conv_s24_to_f32d_sse2(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +conv_s24_to_f32d_2_sse2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src, uint32_t n_channels, uint32_t n_samples) +{ + const uint8_t *s = src; + float **d = (float **) dst; + float *d0 = d[0], *d1 = d[1]; + uint32_t n, unrolled; + __m128i in[2]; + __m128 out[2], factor = _mm_set1_ps(1.0f / S24_SCALE); + + if (SPA_IS_ALIGNED(d0, 16)) { + unrolled = n_samples / 4; + if ((n_samples & 3) == 0) + unrolled--; + } + else + unrolled = 0; + + for(n = 0; unrolled--; n += 4) { + in[0] = _mm_setr_epi32( + *((uint32_t*)&s[0 + 0*n_channels]), + *((uint32_t*)&s[0 + 3*n_channels]), + *((uint32_t*)&s[0 + 6*n_channels]), + *((uint32_t*)&s[0 + 9*n_channels])); + in[1] = _mm_setr_epi32( + *((uint32_t*)&s[3 + 0*n_channels]), + *((uint32_t*)&s[3 + 3*n_channels]), + *((uint32_t*)&s[3 + 6*n_channels]), + *((uint32_t*)&s[3 + 9*n_channels])); + + in[0] = _mm_slli_epi32(in[0], 8); + in[1] = _mm_slli_epi32(in[1], 8); + + in[0] = _mm_srai_epi32(in[0], 8); + in[1] = _mm_srai_epi32(in[1], 8); + + out[0] = _mm_cvtepi32_ps(in[0]); + out[1] = _mm_cvtepi32_ps(in[1]); + + out[0] = _mm_mul_ps(out[0], factor); + out[1] = _mm_mul_ps(out[1], factor); + + _mm_store_ps(&d0[n], out[0]); + _mm_store_ps(&d1[n], out[1]); + + s += 12 * n_channels; + } + for(; n < n_samples; n++) { + out[0] = _mm_cvtsi32_ss(out[0], read_s24(s)); + out[1] = _mm_cvtsi32_ss(out[1], read_s24(s+3)); + out[0] = _mm_mul_ss(out[0], factor); + out[1] = _mm_mul_ss(out[1], factor); + _mm_store_ss(&d0[n], out[0]); + _mm_store_ss(&d1[n], out[1]); + s += 3 * n_channels; + } +} +static void +conv_s24_to_f32d_4_sse2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src, uint32_t n_channels, uint32_t n_samples) +{ + const uint8_t *s = src; + float **d = (float **) dst; + float *d0 = d[0], *d1 = d[1], *d2 = d[2], *d3 = d[3]; + uint32_t n, unrolled; + __m128i in[4]; + __m128 out[4], factor = _mm_set1_ps(1.0f / S24_SCALE); + + if (SPA_IS_ALIGNED(d0, 16)) { + unrolled = n_samples / 4; + if ((n_samples & 3) == 0) + unrolled--; + } + else + unrolled = 0; + + for(n = 0; unrolled--; n += 4) { + in[0] = _mm_setr_epi32( + *((uint32_t*)&s[0 + 0*n_channels]), + *((uint32_t*)&s[0 + 3*n_channels]), + *((uint32_t*)&s[0 + 6*n_channels]), + *((uint32_t*)&s[0 + 9*n_channels])); + in[1] = _mm_setr_epi32( + *((uint32_t*)&s[3 + 0*n_channels]), + *((uint32_t*)&s[3 + 3*n_channels]), + *((uint32_t*)&s[3 + 6*n_channels]), + *((uint32_t*)&s[3 + 9*n_channels])); + in[2] = _mm_setr_epi32( + *((uint32_t*)&s[6 + 0*n_channels]), + *((uint32_t*)&s[6 + 3*n_channels]), + *((uint32_t*)&s[6 + 6*n_channels]), + *((uint32_t*)&s[6 + 9*n_channels])); + in[3] = _mm_setr_epi32( + *((uint32_t*)&s[9 + 0*n_channels]), + *((uint32_t*)&s[9 + 3*n_channels]), + *((uint32_t*)&s[9 + 6*n_channels]), + *((uint32_t*)&s[9 + 9*n_channels])); + + in[0] = _mm_slli_epi32(in[0], 8); + in[1] = _mm_slli_epi32(in[1], 8); + in[2] = _mm_slli_epi32(in[2], 8); + in[3] = _mm_slli_epi32(in[3], 8); + + in[0] = _mm_srai_epi32(in[0], 8); + in[1] = _mm_srai_epi32(in[1], 8); + in[2] = _mm_srai_epi32(in[2], 8); + in[3] = _mm_srai_epi32(in[3], 8); + + out[0] = _mm_cvtepi32_ps(in[0]); + out[1] = _mm_cvtepi32_ps(in[1]); + out[2] = _mm_cvtepi32_ps(in[2]); + 
out[3] = _mm_cvtepi32_ps(in[3]); + + out[0] = _mm_mul_ps(out[0], factor); + out[1] = _mm_mul_ps(out[1], factor); + out[2] = _mm_mul_ps(out[2], factor); + out[3] = _mm_mul_ps(out[3], factor); + + _mm_store_ps(&d0[n], out[0]); + _mm_store_ps(&d1[n], out[1]); + _mm_store_ps(&d2[n], out[2]); + _mm_store_ps(&d3[n], out[3]); + + s += 12 * n_channels; + } + for(; n < n_samples; n++) { + out[0] = _mm_cvtsi32_ss(out[0], read_s24(s)); + out[1] = _mm_cvtsi32_ss(out[1], read_s24(s+3)); + out[2] = _mm_cvtsi32_ss(out[2], read_s24(s+6)); + out[3] = _mm_cvtsi32_ss(out[3], read_s24(s+9)); + out[0] = _mm_mul_ss(out[0], factor); + out[1] = _mm_mul_ss(out[1], factor); + out[2] = _mm_mul_ss(out[2], factor); + out[3] = _mm_mul_ss(out[3], factor); + _mm_store_ss(&d0[n], out[0]); + _mm_store_ss(&d1[n], out[1]); + _mm_store_ss(&d2[n], out[2]); + _mm_store_ss(&d3[n], out[3]); + s += 3 * n_channels; + } +} + +static void +conv_s24_to_f32d_sse2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { const int8_t *s = src[0]; - int i = 0; + uint32_t i = 0; + for(; i + 3 < n_channels; i += 4) + conv_s24_to_f32d_4_sse2(data, &dst[i], &s[3*i], n_channels, n_samples); + for(; i + 1 < n_channels; i += 2) + conv_s24_to_f32d_2_sse2(data, &dst[i], &s[3*i], n_channels, n_samples); for(; i < n_channels; i++) conv_s24_to_f32d_1_sse2(data, &dst[i], &s[3*i], n_channels, n_samples); } static void -conv_f32d_to_s32_1_sse2(void *data, void *dst, const void *src[], int n_channels, int n_samples) +conv_f32d_to_s32_1_sse2(void *data, void * SPA_RESTRICT dst, const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { const float **s = (const float **) src; const float *s0 = s[0]; int32_t *d = dst; - int n, unrolled; + uint32_t n, unrolled; __m128 in[1]; __m128i out[4]; __m128 int_max = _mm_set1_ps(S24_MAX_F); @@ -212,12 +357,12 @@ conv_f32d_to_s32_1_sse2(void *data, void *dst, const void *src[], int n_channels } static void -conv_f32d_to_s32_2_sse2(void *data, void *dst, const void *src[], int n_channels, int n_samples) +conv_f32d_to_s32_2_sse2(void *data, void * SPA_RESTRICT dst, const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { const float **s = (const float **) src; const float *s0 = s[0], *s1 = s[1]; int32_t *d = dst; - int n, unrolled; + uint32_t n, unrolled; __m128 in[2]; __m128i out[2], t[2]; __m128 int_max = _mm_set1_ps(S24_MAX_F); @@ -265,12 +410,12 @@ conv_f32d_to_s32_2_sse2(void *data, void *dst, const void *src[], int n_channels } static void -conv_f32d_to_s32_4_sse2(void *data, void *dst, const void *src[], int n_channels, int n_samples) +conv_f32d_to_s32_4_sse2(void *data, void * SPA_RESTRICT dst, const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { const float **s = (const float **) src; const float *s0 = s[0], *s1 = s[1], *s2 = s[2], *s3 = s[3]; int32_t *d = dst; - int n, unrolled; + uint32_t n, unrolled; __m128 in[4]; __m128i out[4], t[4]; __m128 int_max = _mm_set1_ps(S24_MAX_F); @@ -279,7 +424,8 @@ conv_f32d_to_s32_4_sse2(void *data, void *dst, const void *src[], int n_channels if (SPA_IS_ALIGNED(s0, 16) && SPA_IS_ALIGNED(s1, 16) && SPA_IS_ALIGNED(s2, 16) && - SPA_IS_ALIGNED(s3, 16)) + SPA_IS_ALIGNED(s3, 16) && + SPA_IS_ALIGNED(d, 16)) unrolled = n_samples / 4; else unrolled = 0; @@ -310,10 +456,10 @@ conv_f32d_to_s32_4_sse2(void *data, void *dst, const void *src[], int n_channels out[2] = _mm_unpacklo_epi64(t[2], t[3]); out[3] = _mm_unpackhi_epi64(t[2], t[3]); - _mm_storeu_si128((__m128i*)(d + 
0*n_channels), out[0]); - _mm_storeu_si128((__m128i*)(d + 1*n_channels), out[1]); - _mm_storeu_si128((__m128i*)(d + 2*n_channels), out[2]); - _mm_storeu_si128((__m128i*)(d + 3*n_channels), out[3]); + _mm_store_si128((__m128i*)(d + 0*n_channels), out[0]); + _mm_store_si128((__m128i*)(d + 1*n_channels), out[1]); + _mm_store_si128((__m128i*)(d + 2*n_channels), out[2]); + _mm_store_si128((__m128i*)(d + 3*n_channels), out[3]); d += 4*n_channels; } for(; n < n_samples; n++) { @@ -335,10 +481,10 @@ conv_f32d_to_s32_4_sse2(void *data, void *dst, const void *src[], int n_channels } static void -conv_f32d_to_s32_sse2(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +conv_f32d_to_s32_sse2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { int32_t *d = dst[0]; - int i = 0; + uint32_t i = 0; for(; i + 3 < n_channels; i += 4) conv_f32d_to_s32_4_sse2(data, &d[i], &src[i], n_channels, n_samples); @@ -349,12 +495,12 @@ conv_f32d_to_s32_sse2(void *data, void *dst[], const void *src[], int n_channels } static void -conv_f32d_to_s16_1_sse2(void *data, void *dst, const void *src[], int n_channels, int n_samples) +conv_f32d_to_s16_1_sse2(void *data, void * SPA_RESTRICT dst, const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { const float **s = (const float **) src; const float *s0 = s[0]; int16_t *d = dst; - int n, unrolled; + uint32_t n, unrolled; __m128 in[2]; __m128i out[2]; __m128 int_max = _mm_set1_ps(S16_MAX_F); @@ -391,12 +537,12 @@ conv_f32d_to_s16_1_sse2(void *data, void *dst, const void *src[], int n_channels } static void -conv_f32d_to_s16_2_sse2(void *data, void *dst, const void *src[], int n_channels, int n_samples) +conv_f32d_to_s16_2_sse2(void *data, void * SPA_RESTRICT dst, const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { const float **s = (const float **) src; const float *s0 = s[0], *s1 = s[1]; int16_t *d = dst; - int n, unrolled; + uint32_t n, unrolled; __m128 in[2]; __m128i out[4], t[2]; __m128 int_max = _mm_set1_ps(S16_MAX_F); @@ -441,11 +587,76 @@ conv_f32d_to_s16_2_sse2(void *data, void *dst, const void *src[], int n_channels } static void -conv_f32d_to_s16_sse2(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +conv_f32d_to_s16_4_sse2(void *data, void * SPA_RESTRICT dst, const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) +{ + const float **s = (const float **) src; + const float *s0 = s[0], *s1 = s[1], *s2 = s[2], *s3 = s[3]; + int16_t *d = dst; + uint32_t n, unrolled; + __m128 in[4]; + __m128i out[4], t[4]; + __m128 int_max = _mm_set1_ps(S16_MAX_F); + __m128 int_min = _mm_sub_ps(_mm_setzero_ps(), int_max); + + if (SPA_IS_ALIGNED(s0, 16) && + SPA_IS_ALIGNED(s1, 16) && + SPA_IS_ALIGNED(s2, 16) && + SPA_IS_ALIGNED(s3, 16)) + unrolled = n_samples / 4; + else + unrolled = 0; + + for(n = 0; unrolled--; n += 4) { + in[0] = _mm_mul_ps(_mm_load_ps(&s0[n]), int_max); + in[1] = _mm_mul_ps(_mm_load_ps(&s1[n]), int_max); + in[2] = _mm_mul_ps(_mm_load_ps(&s2[n]), int_max); + in[3] = _mm_mul_ps(_mm_load_ps(&s3[n]), int_max); + + t[0] = _mm_cvtps_epi32(in[0]); + t[1] = _mm_cvtps_epi32(in[1]); + t[2] = _mm_cvtps_epi32(in[2]); + t[3] = _mm_cvtps_epi32(in[3]); + + t[0] = _mm_packs_epi32(t[0], t[2]); + t[1] = _mm_packs_epi32(t[1], t[3]); + + out[0] = _mm_unpacklo_epi16(t[0], t[1]); + out[1] = _mm_unpackhi_epi16(t[0], t[1]); + out[2] = _mm_unpacklo_epi32(out[0], out[1]); + out[3] = _mm_unpackhi_epi32(out[0], out[1]); + + 
_mm_storel_pi((__m64*)(d + 0*n_channels), (__m128)out[2]); + _mm_storeh_pi((__m64*)(d + 1*n_channels), (__m128)out[2]); + _mm_storel_pi((__m64*)(d + 2*n_channels), (__m128)out[3]); + _mm_storeh_pi((__m64*)(d + 3*n_channels), (__m128)out[3]); + + d += 4*n_channels; + } + for(; n < n_samples; n++) { + in[0] = _mm_mul_ss(_mm_load_ss(&s0[n]), int_max); + in[1] = _mm_mul_ss(_mm_load_ss(&s1[n]), int_max); + in[2] = _mm_mul_ss(_mm_load_ss(&s2[n]), int_max); + in[3] = _mm_mul_ss(_mm_load_ss(&s3[n]), int_max); + in[0] = _mm_min_ss(int_max, _mm_max_ss(in[0], int_min)); + in[1] = _mm_min_ss(int_max, _mm_max_ss(in[1], int_min)); + in[2] = _mm_min_ss(int_max, _mm_max_ss(in[2], int_min)); + in[3] = _mm_min_ss(int_max, _mm_max_ss(in[3], int_min)); + d[0] = _mm_cvtss_si32(in[0]); + d[1] = _mm_cvtss_si32(in[1]); + d[2] = _mm_cvtss_si32(in[2]); + d[3] = _mm_cvtss_si32(in[3]); + d += n_channels; + } +} + +static void +conv_f32d_to_s16_sse2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { int16_t *d = dst[0]; - int i = 0; + uint32_t i = 0; + for(; i + 3 < n_channels; i += 4) + conv_f32d_to_s16_4_sse2(data, &d[i], &src[i], n_channels, n_samples); for(; i + 1 < n_channels; i += 2) conv_f32d_to_s16_2_sse2(data, &d[i], &src[i], n_channels, n_samples); for(; i < n_channels; i++) diff --git a/spa/plugins/audioconvert/fmt-ops-sse41.c b/spa/plugins/audioconvert/fmt-ops-sse41.c new file mode 100644 index 000000000..567f368bb --- /dev/null +++ b/spa/plugins/audioconvert/fmt-ops-sse41.c @@ -0,0 +1,79 @@ +/* Spa + * + * Copyright © 2018 Wim Taymans + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */
+
+#include <string.h>
+#include <stdio.h>
+
+#include <spa/utils/defs.h>
+
+#include <smmintrin.h>
+
+static void
+conv_s24_to_f32d_1_sse41(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src, uint32_t n_channels, uint32_t n_samples)
+{
+	const uint8_t *s = src;
+	float **d = (float **) dst;
+	float *d0 = d[0];
+	uint32_t n, unrolled;
+	__m128i in;
+	__m128 out, factor = _mm_set1_ps(1.0f / S24_SCALE);
+
+	if (SPA_IS_ALIGNED(d0, 16))
+		unrolled = n_samples / 4;
+	else
+		unrolled = 0;
+
+	for(n = 0; unrolled--; n += 4) {
+		in = _mm_insert_epi32(in, *((uint32_t*)&s[0 * n_channels]), 0);
+		in = _mm_insert_epi32(in, *((uint32_t*)&s[3 * n_channels]), 1);
+		in = _mm_insert_epi32(in, *((uint32_t*)&s[6 * n_channels]), 2);
+		in = _mm_insert_epi32(in, *((uint32_t*)&s[9 * n_channels]), 3);
+		in = _mm_slli_epi32(in, 8);
+		in = _mm_srai_epi32(in, 8);
+		out = _mm_cvtepi32_ps(in);
+		out = _mm_mul_ps(out, factor);
+		_mm_storeu_ps(&d0[n], out);
+		s += 12 * n_channels;
+	}
+	for(; n < n_samples; n++) {
+		out = _mm_cvtsi32_ss(out, read_s24(s));
+		out = _mm_mul_ss(out, factor);
+		_mm_store_ss(&d0[n], out);
+		s += 3 * n_channels;
+	}
+}
+
+static void
+conv_s24_to_f32d_sse41(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples)
+{
+	const int8_t *s = src[0];
+	uint32_t i = 0;
+
+	for(; i + 3 < n_channels; i += 4)
+		conv_s24_to_f32d_4_ssse3(data, &dst[i], &s[3*i], n_channels, n_samples);
+	for(; i + 1 < n_channels; i += 2)
+		conv_s24_to_f32d_2_sse2(data, &dst[i], &s[3*i], n_channels, n_samples);
+	for(; i < n_channels; i++)
+		conv_s24_to_f32d_1_sse41(data, &dst[i], &s[3*i], n_channels, n_samples);
+}
diff --git a/spa/plugins/audioconvert/fmt-ops-ssse3.c b/spa/plugins/audioconvert/fmt-ops-ssse3.c
new file mode 100644
index 000000000..93b30178e
--- /dev/null
+++ b/spa/plugins/audioconvert/fmt-ops-ssse3.c
@@ -0,0 +1,108 @@
+/* Spa
+ *
+ * Copyright © 2018 Wim Taymans
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <string.h>
+#include <stdio.h>
+
+#include <spa/utils/defs.h>
+
+#include <tmmintrin.h>
+
+static void
+conv_s24_to_f32d_4_ssse3(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src, uint32_t n_channels, uint32_t n_samples)
+{
+	const uint8_t *s = src;
+	float **d = (float **) dst;
+	float *d0 = d[0], *d1 = d[1], *d2 = d[2], *d3 = d[3];
+	uint32_t n, unrolled;
+	__m128i in[4];
+	__m128 out[4], factor = _mm_set1_ps(1.0f / S24_SCALE);
+	//const __m128i mask = _mm_setr_epi8(-1, 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11);
+	const __m128i mask = _mm_set_epi8(15, 14, 13, -1, 12, 11, 10, -1, 9, 8, 7, -1, 6, 5, 4, -1);
+
+	if (SPA_IS_ALIGNED(d0, 16))
+		unrolled = n_samples / 4;
+	else
+		unrolled = 0;
+
+	for(n = 0; unrolled--; n += 4) {
+		in[0] = _mm_loadu_si128((__m128i*)(s + 0*n_channels));
+		in[1] = _mm_loadu_si128((__m128i*)(s + 3*n_channels));
+		in[2] = _mm_loadu_si128((__m128i*)(s + 6*n_channels));
+		in[3] = _mm_loadu_si128((__m128i*)(s + 9*n_channels));
+		in[0] = _mm_shuffle_epi8(in[0], mask);
+		in[1] = _mm_shuffle_epi8(in[1], mask);
+		in[2] = _mm_shuffle_epi8(in[2], mask);
+		in[3] = _mm_shuffle_epi8(in[3], mask);
+		in[0] = _mm_srai_epi32(in[0], 8);
+		in[1] = _mm_srai_epi32(in[1], 8);
+		in[2] = _mm_srai_epi32(in[2], 8);
+		in[3] = _mm_srai_epi32(in[3], 8);
+		out[0] = _mm_cvtepi32_ps(in[0]);
+		out[1] = _mm_cvtepi32_ps(in[1]);
+		out[2] = _mm_cvtepi32_ps(in[2]);
+		out[3] = _mm_cvtepi32_ps(in[3]);
+		out[0] = _mm_mul_ps(out[0], factor);
+		out[1] = _mm_mul_ps(out[1], factor);
+		out[2] = _mm_mul_ps(out[2], factor);
+		out[3] = _mm_mul_ps(out[3], factor);
+
+		_MM_TRANSPOSE4_PS(out[0], out[1], out[2], out[3]);
+
+		_mm_store_ps(&d0[n], out[0]);
+		_mm_store_ps(&d1[n], out[1]);
+		_mm_store_ps(&d2[n], out[2]);
+		_mm_store_ps(&d3[n], out[3]);
+		s += 12 * n_channels;
+	}
+	for(; n < n_samples; n++) {
+		out[0] = _mm_cvtsi32_ss(out[0], read_s24(s));
+		out[1] = _mm_cvtsi32_ss(out[1], read_s24(s+3));
+		out[2] = _mm_cvtsi32_ss(out[2], read_s24(s+6));
+		out[3] = _mm_cvtsi32_ss(out[3], read_s24(s+9));
+		out[0] = _mm_mul_ss(out[0], factor);
+		out[1] = _mm_mul_ss(out[1], factor);
+		out[2] = _mm_mul_ss(out[2], factor);
+		out[3] = _mm_mul_ss(out[3], factor);
+		_mm_store_ss(&d0[n], out[0]);
+		_mm_store_ss(&d1[n], out[1]);
+		_mm_store_ss(&d2[n], out[2]);
+		_mm_store_ss(&d3[n], out[3]);
+		s += 3 * n_channels;
+	}
+}
+
+static void
+conv_s24_to_f32d_ssse3(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples)
+{
+	const int8_t *s = src[0];
+	uint32_t i = 0;
+
+	for(; i + 3 < n_channels; i += 4)
+		conv_s24_to_f32d_4_ssse3(data, &dst[i], &s[3*i], n_channels, n_samples);
+	for(; i + 1 < n_channels; i += 2)
+		conv_s24_to_f32d_2_sse2(data, &dst[i], &s[3*i], n_channels, n_samples);
+	for(; i < n_channels; i++)
+		conv_s24_to_f32d_1_sse2(data, &dst[i], &s[3*i], n_channels, n_samples);
+}
diff --git a/spa/plugins/audioconvert/fmt-ops.c b/spa/plugins/audioconvert/fmt-ops.c
index 9017ecf96..ff0be8862 100644
--- a/spa/plugins/audioconvert/fmt-ops.c
+++ b/spa/plugins/audioconvert/fmt-ops.c
@@ -83,68 +83,74 @@ static inline void write_s24(void *dst, int32_t val)
 #if defined (__SSE2__)
 #include "fmt-ops-sse2.c"
 #endif
+#if defined (__SSSE3__)
+#include "fmt-ops-ssse3.c"
+#endif
+#if defined (__SSE4_1__)
+#include "fmt-ops-sse41.c"
+#endif
 static void
-conv_copy8d(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
+conv_copy8d(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples)
 {
-	int i;
+	uint32_t i;
 	for (i =
0; i < n_channels; i++) memcpy(dst[i], src[i], n_samples); } static void -conv_copy8(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +conv_copy8(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { memcpy(dst[0], src[0], n_samples * n_channels); } static void -conv_copy16d(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +conv_copy16d(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { - int i; + uint32_t i; for (i = 0; i < n_channels; i++) memcpy(dst[i], src[i], n_samples * sizeof(int16_t)); } static void -conv_copy16(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +conv_copy16(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { memcpy(dst[0], src[0], n_samples * sizeof(int16_t) * n_channels); } static void -conv_copy24d(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +conv_copy24d(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { - int i; + uint32_t i; for (i = 0; i < n_channels; i++) memcpy(dst[i], src[i], n_samples * 3); } static void -conv_copy24(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +conv_copy24(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { memcpy(dst[0], src[0], n_samples * 3 * n_channels); } static void -conv_copy32d(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +conv_copy32d(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { - int i; + uint32_t i; for (i = 0; i < n_channels; i++) memcpy(dst[i], src[i], n_samples * sizeof(int32_t)); } static void -conv_copy32(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +conv_copy32(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { memcpy(dst[0], src[0], n_samples * sizeof(int32_t) * n_channels); } static void -conv_u8d_to_f32d(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +conv_u8d_to_f32d(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { - int i, j; + uint32_t i, j; for (i = 0; i < n_channels; i++) { const uint8_t *s = src[i]; @@ -156,17 +162,17 @@ conv_u8d_to_f32d(void *data, void *dst[], const void *src[], int n_channels, int } static void -conv_u8_to_f32(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +conv_u8_to_f32(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { conv_u8d_to_f32d(data, dst, src, 1, n_samples * n_channels); } static void -conv_u8_to_f32d(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +conv_u8_to_f32d(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { const uint8_t *s = src[0]; float **d = (float **) dst; - int i, j; + uint32_t i, j; for (j = 0; j < n_samples; j++) { for (i = 0; i < n_channels; i++) @@ -175,11 +181,11 @@ conv_u8_to_f32d(void *data, void *dst[], const void *src[], int n_channels, int } static void -conv_u8d_to_f32(void *data, void *dst[], const void *src[], int n_channels, int n_samples) 
+conv_u8d_to_f32(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { const uint8_t **s = (const uint8_t **) src; float *d = dst[0]; - int i, j; + uint32_t i, j; for (j = 0; j < n_samples; j++) { for (i = 0; i < n_channels; i++) @@ -188,9 +194,9 @@ conv_u8d_to_f32(void *data, void *dst[], const void *src[], int n_channels, int } static void -conv_s16d_to_f32d(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +conv_s16d_to_f32d(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { - int i, j; + uint32_t i, j; for (i = 0; i < n_channels; i++) { const int16_t *s = src[i]; @@ -201,17 +207,17 @@ conv_s16d_to_f32d(void *data, void *dst[], const void *src[], int n_channels, in } static void -conv_s16_to_f32(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +conv_s16_to_f32(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { conv_s16d_to_f32d(data, dst, src, 1, n_samples * n_channels); } static void -conv_s16_to_f32d(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +conv_s16_to_f32d(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { const int16_t *s = src[0]; float **d = (float **) dst; - int i, j; + uint32_t i, j; for (j = 0; j < n_samples; j++) { for (i = 0; i < n_channels; i++) @@ -220,11 +226,11 @@ conv_s16_to_f32d(void *data, void *dst[], const void *src[], int n_channels, int } static void -conv_s16d_to_f32(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +conv_s16d_to_f32(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { const int16_t **s = (const int16_t **) src; float *d = dst[0]; - int i, j; + uint32_t i, j; for (j = 0; j < n_samples; j++) { for (i = 0; i < n_channels; i++) @@ -233,9 +239,9 @@ conv_s16d_to_f32(void *data, void *dst[], const void *src[], int n_channels, int } static void -conv_s32d_to_f32d(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +conv_s32d_to_f32d(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { - int i, j; + uint32_t i, j; for (i = 0; i < n_channels; i++) { const int32_t *s = src[i]; @@ -247,17 +253,17 @@ conv_s32d_to_f32d(void *data, void *dst[], const void *src[], int n_channels, in } static void -conv_s32_to_f32(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +conv_s32_to_f32(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { conv_s32d_to_f32d(data, dst, src, 1, n_samples * n_channels); } static void -conv_s32_to_f32d(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +conv_s32_to_f32d(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { const int32_t *s = src[0]; float **d = (float **) dst; - int i, j; + uint32_t i, j; for (j = 0; j < n_samples; j++) { for (i = 0; i < n_channels; i++) @@ -266,11 +272,11 @@ conv_s32_to_f32d(void *data, void *dst[], const void *src[], int n_channels, int } static void -conv_s32d_to_f32(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +conv_s32d_to_f32(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], 
uint32_t n_channels, uint32_t n_samples) { const int32_t **s = (const int32_t **) src; float *d = dst[0]; - int i, j; + uint32_t i, j; for (j = 0; j < n_samples; j++) { for (i = 0; i < n_channels; i++) @@ -279,9 +285,9 @@ conv_s32d_to_f32(void *data, void *dst[], const void *src[], int n_channels, int } static void -conv_s24d_to_f32d(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +conv_s24d_to_f32d(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { - int i, j; + uint32_t i, j; for (i = 0; i < n_channels; i++) { const int8_t *s = src[i]; @@ -295,17 +301,17 @@ conv_s24d_to_f32d(void *data, void *dst[], const void *src[], int n_channels, in } static void -conv_s24_to_f32(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +conv_s24_to_f32(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { conv_s24d_to_f32d(data, dst, src, 1, n_samples * n_channels); } static void -conv_s24_to_f32d(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +conv_s24_to_f32d(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { const uint8_t *s = src[0]; float **d = (float **) dst; - int i, j; + uint32_t i, j; for (j = 0; j < n_samples; j++) { for (i = 0; i < n_channels; i++) { @@ -316,11 +322,11 @@ conv_s24_to_f32d(void *data, void *dst[], const void *src[], int n_channels, int } static void -conv_s24d_to_f32(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +conv_s24d_to_f32(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { const uint8_t **s = (const uint8_t **) src; float *d = dst[0]; - int i, j; + uint32_t i, j; for (j = 0; j < n_samples; j++) { for (i = 0; i < n_channels; i++) { @@ -330,9 +336,9 @@ conv_s24d_to_f32(void *data, void *dst[], const void *src[], int n_channels, int } static void -conv_s24_32d_to_f32d(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +conv_s24_32d_to_f32d(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { - int i, j; + uint32_t i, j; for (i = 0; i < n_channels; i++) { const int32_t *s = src[i]; @@ -344,17 +350,17 @@ conv_s24_32d_to_f32d(void *data, void *dst[], const void *src[], int n_channels, } static void -conv_s24_32_to_f32(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +conv_s24_32_to_f32(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { conv_s24_32d_to_f32d(data, dst, src, 1, n_samples * n_channels); } static void -conv_s24_32_to_f32d(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +conv_s24_32_to_f32d(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { const int32_t *s = src[0]; float **d = (float **) dst; - int i, j; + uint32_t i, j; for (j = 0; j < n_samples; j++) { for (i = 0; i < n_channels; i++) @@ -363,11 +369,11 @@ conv_s24_32_to_f32d(void *data, void *dst[], const void *src[], int n_channels, } static void -conv_s24_32d_to_f32(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +conv_s24_32d_to_f32(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { const int32_t **s = 
(const int32_t **) src; float *d = dst[0]; - int i, j; + uint32_t i, j; for (j = 0; j < n_samples; j++) { for (i = 0; i < n_channels; i++) @@ -376,9 +382,9 @@ conv_s24_32d_to_f32(void *data, void *dst[], const void *src[], int n_channels, } static void -conv_f32d_to_u8d(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +conv_f32d_to_u8d(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { - int i, j; + uint32_t i, j; for (i = 0; i < n_channels; i++) { const float *s = src[i]; @@ -390,17 +396,17 @@ conv_f32d_to_u8d(void *data, void *dst[], const void *src[], int n_channels, int } static void -conv_f32_to_u8(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +conv_f32_to_u8(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { conv_f32d_to_u8d(data, dst, src, 1, n_samples * n_channels); } static void -conv_f32_to_u8d(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +conv_f32_to_u8d(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { const float *s = src[0]; uint8_t **d = (uint8_t **) dst; - int i, j; + uint32_t i, j; for (j = 0; j < n_samples; j++) { for (i = 0; i < n_channels; i++) @@ -409,11 +415,11 @@ conv_f32_to_u8d(void *data, void *dst[], const void *src[], int n_channels, int } static void -conv_f32d_to_u8(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +conv_f32d_to_u8(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { const float **s = (const float **) src; uint8_t *d = dst[0]; - int i, j; + uint32_t i, j; for (j = 0; j < n_samples; j++) { for (i = 0; i < n_channels; i++) @@ -422,9 +428,9 @@ conv_f32d_to_u8(void *data, void *dst[], const void *src[], int n_channels, int } static void -conv_f32d_to_s16d(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +conv_f32d_to_s16d(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { - int i, j; + uint32_t i, j; for (i = 0; i < n_channels; i++) { const float *s = src[i]; @@ -436,17 +442,17 @@ conv_f32d_to_s16d(void *data, void *dst[], const void *src[], int n_channels, in } static void -conv_f32_to_s16(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +conv_f32_to_s16(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { conv_f32d_to_s16d(data, dst, src, 1, n_samples * n_channels); } static void -conv_f32_to_s16d(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +conv_f32_to_s16d(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { const float *s = src[0]; int16_t **d = (int16_t **) dst; - int i, j; + uint32_t i, j; for (j = 0; j < n_samples; j++) { for (i = 0; i < n_channels; i++) @@ -455,11 +461,11 @@ conv_f32_to_s16d(void *data, void *dst[], const void *src[], int n_channels, int } static void -conv_f32d_to_s16(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +conv_f32d_to_s16(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { const float **s = (const float **) src; int16_t *d = dst[0]; - int i, j; + uint32_t i, j; for (j = 0; j < n_samples; j++) { for 
(i = 0; i < n_channels; i++) @@ -468,9 +474,9 @@ conv_f32d_to_s16(void *data, void *dst[], const void *src[], int n_channels, int } static void -conv_f32d_to_s32d(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +conv_f32d_to_s32d(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { - int i, j; + uint32_t i, j; for (i = 0; i < n_channels; i++) { const float *s = src[i]; @@ -482,17 +488,17 @@ conv_f32d_to_s32d(void *data, void *dst[], const void *src[], int n_channels, in } static void -conv_f32_to_s32(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +conv_f32_to_s32(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { conv_f32d_to_s32d(data, dst, src, 1, n_samples * n_channels); } static void -conv_f32_to_s32d(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +conv_f32_to_s32d(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { const float *s = src[0]; int32_t **d = (int32_t **) dst; - int i, j; + uint32_t i, j; for (j = 0; j < n_samples; j++) { for (i = 0; i < n_channels; i++) @@ -501,11 +507,11 @@ conv_f32_to_s32d(void *data, void *dst[], const void *src[], int n_channels, int } static void -conv_f32d_to_s32(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +conv_f32d_to_s32(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { const float **s = (const float **) src; int32_t *d = dst[0]; - int i, j; + uint32_t i, j; for (j = 0; j < n_samples; j++) { for (i = 0; i < n_channels; i++) @@ -516,9 +522,9 @@ conv_f32d_to_s32(void *data, void *dst[], const void *src[], int n_channels, int static void -conv_f32d_to_s24d(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +conv_f32d_to_s24d(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { - int i, j; + uint32_t i, j; for (i = 0; i < n_channels; i++) { const float *s = src[i]; @@ -532,17 +538,17 @@ conv_f32d_to_s24d(void *data, void *dst[], const void *src[], int n_channels, in } static void -conv_f32_to_s24(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +conv_f32_to_s24(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { conv_f32d_to_s24d(data, dst, src, 1, n_samples * n_channels); } static void -conv_f32_to_s24d(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +conv_f32_to_s24d(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { const float *s = src[0]; uint8_t **d = (uint8_t **) dst; - int i, j; + uint32_t i, j; for (j = 0; j < n_samples; j++) { for (i = 0; i < n_channels; i++) { @@ -552,11 +558,11 @@ conv_f32_to_s24d(void *data, void *dst[], const void *src[], int n_channels, int } static void -conv_f32d_to_s24(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +conv_f32d_to_s24(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { const float **s = (const float **) src; uint8_t *d = dst[0]; - int i, j; + uint32_t i, j; for (j = 0; j < n_samples; j++) { for (i = 0; i < n_channels; i++) { @@ -568,9 +574,9 @@ conv_f32d_to_s24(void *data, void *dst[], const 
void *src[], int n_channels, int static void -conv_f32d_to_s24_32d(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +conv_f32d_to_s24_32d(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { - int i, j; + uint32_t i, j; for (i = 0; i < n_channels; i++) { const float *s = src[i]; @@ -582,17 +588,17 @@ conv_f32d_to_s24_32d(void *data, void *dst[], const void *src[], int n_channels, } static void -conv_f32_to_s24_32(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +conv_f32_to_s24_32(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { conv_f32d_to_s24_32d(data, dst, src, 1, n_samples * n_channels); } static void -conv_f32_to_s24_32d(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +conv_f32_to_s24_32d(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { const float *s = src[0]; int32_t **d = (int32_t **) dst; - int i, j; + uint32_t i, j; for (j = 0; j < n_samples; j++) { for (i = 0; i < n_channels; i++) @@ -601,11 +607,11 @@ conv_f32_to_s24_32d(void *data, void *dst[], const void *src[], int n_channels, } static void -conv_f32d_to_s24_32(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +conv_f32d_to_s24_32(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { const float **s = (const float **) src; int32_t *d = dst[0]; - int i, j; + uint32_t i, j; for (j = 0; j < n_samples; j++) { for (i = 0; i < n_channels; i++) @@ -614,11 +620,11 @@ conv_f32d_to_s24_32(void *data, void *dst[], const void *src[], int n_channels, } static void -deinterleave_8(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +deinterleave_8(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { const uint8_t *s = src[0]; uint8_t **d = (uint8_t **) dst; - int i, j; + uint32_t i, j; for (j = 0; j < n_samples; j++) { for (i = 0; i < n_channels; i++) @@ -627,11 +633,11 @@ deinterleave_8(void *data, void *dst[], const void *src[], int n_channels, int n } static void -deinterleave_16(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +deinterleave_16(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { const uint16_t *s = src[0]; uint16_t **d = (uint16_t **) dst; - int i, j; + uint32_t i, j; for (j = 0; j < n_samples; j++) { for (i = 0; i < n_channels; i++) @@ -640,11 +646,11 @@ deinterleave_16(void *data, void *dst[], const void *src[], int n_channels, int } static void -deinterleave_24(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +deinterleave_24(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { const uint8_t *s = src[0]; uint8_t **d = (uint8_t **) dst; - int i, j; + uint32_t i, j; for (j = 0; j < n_samples; j++) { for (i = 0; i < n_channels; i++) { @@ -655,11 +661,11 @@ deinterleave_24(void *data, void *dst[], const void *src[], int n_channels, int } static void -deinterleave_32(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +deinterleave_32(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { const uint32_t *s = src[0]; uint32_t **d 
= (uint32_t **) dst; - int i, j; + uint32_t i, j; for (j = 0; j < n_samples; j++) { for (i = 0; i < n_channels; i++) @@ -668,11 +674,11 @@ deinterleave_32(void *data, void *dst[], const void *src[], int n_channels, int } static void -interleave_8(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +interleave_8(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { const int8_t **s = (const int8_t **) src; uint8_t *d = dst[0]; - int i, j; + uint32_t i, j; for (j = 0; j < n_samples; j++) { for (i = 0; i < n_channels; i++) @@ -681,11 +687,11 @@ interleave_8(void *data, void *dst[], const void *src[], int n_channels, int n_s } static void -interleave_16(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +interleave_16(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { const int16_t **s = (const int16_t **) src; uint16_t *d = dst[0]; - int i, j; + uint32_t i, j; for (j = 0; j < n_samples; j++) { for (i = 0; i < n_channels; i++) @@ -694,11 +700,11 @@ interleave_16(void *data, void *dst[], const void *src[], int n_channels, int n_ } static void -interleave_24(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +interleave_24(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { const int8_t **s = (const int8_t **) src; uint8_t *d = dst[0]; - int i, j; + uint32_t i, j; for (j = 0; j < n_samples; j++) { for (i = 0; i < n_channels; i++) { @@ -709,11 +715,11 @@ interleave_24(void *data, void *dst[], const void *src[], int n_channels, int n_ } static void -interleave_32(void *data, void *dst[], const void *src[], int n_channels, int n_samples) +interleave_32(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], uint32_t n_channels, uint32_t n_samples) { const int32_t **s = (const int32_t **) src; uint32_t *d = dst[0]; - int i, j; + uint32_t i, j; for (j = 0; j < n_samples; j++) { for (i = 0; i < n_channels; i++) @@ -722,8 +728,8 @@ interleave_32(void *data, void *dst[], const void *src[], int n_channels, int n_ } -typedef void (*convert_func_t) (void *data, void *dst[], const void *src[], - int n_channels, int n_samples); +typedef void (*convert_func_t) (void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], + uint32_t n_channels, uint32_t n_samples); static const struct conv_info { uint32_t src_fmt; @@ -761,6 +767,12 @@ static const struct conv_info { { SPA_AUDIO_FORMAT_S24, SPA_AUDIO_FORMAT_F32, 0, conv_s24_to_f32 }, { SPA_AUDIO_FORMAT_S24P, SPA_AUDIO_FORMAT_F32P, 0, conv_s24d_to_f32d }, +#if defined (__SSE4_1__) + { SPA_AUDIO_FORMAT_S24, SPA_AUDIO_FORMAT_F32P, FEATURE_SSE2, conv_s24_to_f32d_sse41 }, +#endif +#if defined (__SSSE3__) + { SPA_AUDIO_FORMAT_S24, SPA_AUDIO_FORMAT_F32P, FEATURE_SSE2, conv_s24_to_f32d_ssse3 }, +#endif #if defined (__SSE2__) { SPA_AUDIO_FORMAT_S24, SPA_AUDIO_FORMAT_F32P, FEATURE_SSE2, conv_s24_to_f32d_sse2 }, #endif