diff --git a/spa/plugins/audioconvert/resample-native-avx2.c b/spa/plugins/audioconvert/resample-native-avx2.c index 83a28e5d1..d8d658c3a 100644 --- a/spa/plugins/audioconvert/resample-native-avx2.c +++ b/spa/plugins/audioconvert/resample-native-avx2.c @@ -40,8 +40,9 @@ static inline void inner_product_ip_avx2(float *d, const float * SPA_RESTRICT s, const float * SPA_RESTRICT t0, const float * SPA_RESTRICT t1, float x, uint32_t n_taps) { - __m256 sy[2] = { _mm256_setzero_ps(), _mm256_setzero_ps() }, ty; - __m128 sx[2], tx; + __m256 sy[4] = { _mm256_setzero_ps(), _mm256_setzero_ps(), + _mm256_setzero_ps(), _mm256_setzero_ps() }, ty; + __m128 sx[4], tx; uint32_t i, n_taps4 = n_taps & ~0xf; for (i = 0; i < n_taps4; i += 16) { @@ -49,9 +50,11 @@ static inline void inner_product_ip_avx2(float *d, const float * SPA_RESTRICT s, sy[0] = _mm256_fmadd_ps(ty, _mm256_load_ps(t0 + i + 0), sy[0]); sy[1] = _mm256_fmadd_ps(ty, _mm256_load_ps(t1 + i + 0), sy[1]); ty = _mm256_loadu_ps(s + i + 8); - sy[0] = _mm256_fmadd_ps(ty, _mm256_load_ps(t0 + i + 8), sy[0]); - sy[1] = _mm256_fmadd_ps(ty, _mm256_load_ps(t1 + i + 8), sy[1]); + sy[2] = _mm256_fmadd_ps(ty, _mm256_load_ps(t0 + i + 8), sy[2]); + sy[3] = _mm256_fmadd_ps(ty, _mm256_load_ps(t1 + i + 8), sy[3]); } + sy[0] = _mm256_add_ps(sy[0], sy[2]); + sy[1] = _mm256_add_ps(sy[1], sy[3]); sx[0] = _mm_add_ps(_mm256_extractf128_ps(sy[0], 0), _mm256_extractf128_ps(sy[0], 1)); sx[1] = _mm_add_ps(_mm256_extractf128_ps(sy[1], 0), _mm256_extractf128_ps(sy[1], 1)); diff --git a/spa/plugins/audioconvert/resample-native-sse.c b/spa/plugins/audioconvert/resample-native-sse.c index d1426713a..5fc6ed6ea 100644 --- a/spa/plugins/audioconvert/resample-native-sse.c +++ b/spa/plugins/audioconvert/resample-native-sse.c @@ -32,7 +32,8 @@ static inline void inner_product_ip_sse(float *d, const float * SPA_RESTRICT s, const float * SPA_RESTRICT t0, const float * SPA_RESTRICT t1, float x, uint32_t n_taps) { - __m128 sum[2] = { _mm_setzero_ps (), _mm_setzero_ps () }, t; + __m128 sum[4] = { _mm_setzero_ps(), _mm_setzero_ps(), + _mm_setzero_ps(), _mm_setzero_ps() }, t; uint32_t i; for (i = 0; i < n_taps; i += 8) { @@ -40,9 +41,11 @@ static inline void inner_product_ip_sse(float *d, const float * SPA_RESTRICT s, sum[0] = _mm_add_ps(sum[0], _mm_mul_ps(t, _mm_load_ps(t0 + i + 0))); sum[1] = _mm_add_ps(sum[1], _mm_mul_ps(t, _mm_load_ps(t1 + i + 0))); t = _mm_loadu_ps(s + i + 4); - sum[0] = _mm_add_ps(sum[0], _mm_mul_ps(t, _mm_load_ps(t0 + i + 4))); - sum[1] = _mm_add_ps(sum[1], _mm_mul_ps(t, _mm_load_ps(t1 + i + 4))); + sum[2] = _mm_add_ps(sum[2], _mm_mul_ps(t, _mm_load_ps(t0 + i + 4))); + sum[3] = _mm_add_ps(sum[3], _mm_mul_ps(t, _mm_load_ps(t1 + i + 4))); } + sum[0] = _mm_add_ps(sum[0], sum[2]); + sum[1] = _mm_add_ps(sum[1], sum[3]); sum[1] = _mm_mul_ps(_mm_sub_ps(sum[1], sum[0]), _mm_load1_ps(&x)); sum[0] = _mm_add_ps(sum[0], sum[1]); sum[0] = _mm_add_ps(sum[0], _mm_movehl_ps(sum[0], sum[0])); diff --git a/spa/plugins/audioconvert/resample-native-ssse3.c b/spa/plugins/audioconvert/resample-native-ssse3.c index 1ef26e700..e445e316d 100644 --- a/spa/plugins/audioconvert/resample-native-ssse3.c +++ b/spa/plugins/audioconvert/resample-native-ssse3.c @@ -82,7 +82,8 @@ static inline void inner_product_ip_ssse3(float *d, const float * SPA_RESTRICT s const float * SPA_RESTRICT t0, const float * SPA_RESTRICT t1, float x, uint32_t n_taps) { - __m128 sum[2] = { _mm_setzero_ps(), _mm_setzero_ps() }; + __m128 sum[4] = { _mm_setzero_ps(), _mm_setzero_ps(), + _mm_setzero_ps(), _mm_setzero_ps() }; __m128 r0, r1, r; uint32_t i; @@ -93,8 +94,8 @@ static inline void inner_product_ip_ssse3(float *d, const float * SPA_RESTRICT s sum[0] = _mm_add_ps(sum[0], _mm_mul_ps(r, _mm_load_ps(t0 + i + 0))); sum[1] = _mm_add_ps(sum[1], _mm_mul_ps(r, _mm_load_ps(t1 + i + 0))); r = _mm_load_ps(s + i + 4); - sum[0] = _mm_add_ps(sum[0], _mm_mul_ps(r, _mm_load_ps(t0 + i + 4))); - sum[1] = _mm_add_ps(sum[1], _mm_mul_ps(r, _mm_load_ps(t1 + i + 4))); + sum[2] = _mm_add_ps(sum[2], _mm_mul_ps(r, _mm_load_ps(t0 + i + 4))); + sum[3] = _mm_add_ps(sum[3], _mm_mul_ps(r, _mm_load_ps(t1 + i + 4))); } break; case 4: @@ -107,8 +108,8 @@ static inline void inner_product_ip_ssse3(float *d, const float * SPA_RESTRICT s r0 = r1; r1 = _mm_load_ps(s + i + 7); r = (__m128)_mm_alignr_epi8((__m128i)r1, (__m128i)r0, 4); - sum[0] = _mm_add_ps(sum[0], _mm_mul_ps(r, _mm_load_ps(t0 + i + 4))); - sum[1] = _mm_add_ps(sum[1], _mm_mul_ps(r, _mm_load_ps(t1 + i + 4))); + sum[2] = _mm_add_ps(sum[2], _mm_mul_ps(r, _mm_load_ps(t0 + i + 4))); + sum[3] = _mm_add_ps(sum[3], _mm_mul_ps(r, _mm_load_ps(t1 + i + 4))); r0 = r1; } break; @@ -122,8 +123,8 @@ static inline void inner_product_ip_ssse3(float *d, const float * SPA_RESTRICT s r0 = r1; r1 = _mm_load_ps(s + i + 6); r = (__m128)_mm_alignr_epi8((__m128i)r1, (__m128i)r0, 8); - sum[0] = _mm_add_ps(sum[0], _mm_mul_ps(r, _mm_load_ps(t0 + i + 4))); - sum[1] = _mm_add_ps(sum[1], _mm_mul_ps(r, _mm_load_ps(t1 + i + 4))); + sum[2] = _mm_add_ps(sum[2], _mm_mul_ps(r, _mm_load_ps(t0 + i + 4))); + sum[3] = _mm_add_ps(sum[3], _mm_mul_ps(r, _mm_load_ps(t1 + i + 4))); r0 = r1; } break; @@ -137,12 +138,14 @@ static inline void inner_product_ip_ssse3(float *d, const float * SPA_RESTRICT s r0 = r1; r1 = _mm_load_ps(s + i + 5); r = (__m128)_mm_alignr_epi8((__m128i)r1, (__m128i)r0, 12); - sum[0] = _mm_add_ps(sum[0], _mm_mul_ps(r, _mm_load_ps(t0 + i + 4))); - sum[1] = _mm_add_ps(sum[1], _mm_mul_ps(r, _mm_load_ps(t1 + i + 4))); + sum[2] = _mm_add_ps(sum[2], _mm_mul_ps(r, _mm_load_ps(t0 + i + 4))); + sum[3] = _mm_add_ps(sum[3], _mm_mul_ps(r, _mm_load_ps(t1 + i + 4))); r0 = r1; } break; } + sum[0] = _mm_add_ps(sum[0], sum[2]); + sum[1] = _mm_add_ps(sum[1], sum[3]); sum[1] = _mm_mul_ps(_mm_sub_ps(sum[1], sum[0]), _mm_load1_ps(&x)); sum[0] = _mm_add_ps(sum[0], sum[1]); sum[0] = _mm_add_ps(sum[0], _mm_movehdup_ps(sum[0]));