diff --git a/spa/plugins/audioconvert/resample-native-avx.c b/spa/plugins/audioconvert/resample-native-avx.c index 82e0e6970..b23c0b729 100644 --- a/spa/plugins/audioconvert/resample-native-avx.c +++ b/spa/plugins/audioconvert/resample-native-avx.c @@ -31,7 +31,7 @@ static void inner_product_avx(float *d, const float * SPA_RESTRICT s, const float * SPA_RESTRICT taps, uint32_t n_taps) { __m256 sy[2] = { _mm256_setzero_ps(), _mm256_setzero_ps() }, ty; - __m128 sx[2]; + __m128 sx[2], tx; uint32_t i = 0; uint32_t n_taps4 = n_taps & ~0xf; @@ -45,8 +45,10 @@ static void inner_product_avx(float *d, const float * SPA_RESTRICT s, sx[1] = _mm256_extractf128_ps(sy[0], 1); sx[0] = _mm256_extractf128_ps(sy[0], 0); for (; i < n_taps; i += 8) { - sx[0] = _mm_fmadd_ps(_mm_loadu_ps(s + i + 0), _mm_load_ps(taps + i + 0), sx[0]); - sx[1] = _mm_fmadd_ps(_mm_loadu_ps(s + i + 4), _mm_load_ps(taps + i + 4), sx[1]); + tx = (__m128)_mm_lddqu_si128((__m128i*)(s + i + 0)); + sx[0] = _mm_fmadd_ps(tx, _mm_load_ps(taps + i + 0), sx[0]); + tx = (__m128)_mm_lddqu_si128((__m128i*)(s + i + 4)); + sx[1] = _mm_fmadd_ps(tx, _mm_load_ps(taps + i + 4), sx[1]); } sx[0] = _mm_add_ps(sx[0], sx[1]); sx[0] = _mm_hadd_ps(sx[0], sx[0]); @@ -63,20 +65,21 @@ static void inner_product_ip_avx(float *d, const float * SPA_RESTRICT s, uint32_t i, n_taps4 = n_taps & ~0xf; for (i = 0; i < n_taps4; i += 16) { - ty = _mm256_loadu_ps(s + i + 0); + ty = (__m256)_mm256_lddqu_si256((__m256i*)(s + i + 0)); sy[0] = _mm256_fmadd_ps(ty, _mm256_load_ps(t0 + i + 0), sy[0]); sy[1] = _mm256_fmadd_ps(ty, _mm256_load_ps(t1 + i + 0), sy[1]); - ty = _mm256_loadu_ps(s + i + 8); + ty = (__m256)_mm256_lddqu_si256((__m256i*)(s + i + 8)); sy[0] = _mm256_fmadd_ps(ty, _mm256_load_ps(t0 + i + 8), sy[0]); sy[1] = _mm256_fmadd_ps(ty, _mm256_load_ps(t1 + i + 8), sy[1]); } - sx[0] = _mm256_extractf128_ps(_mm256_hadd_ps(sy[0], sy[0]), 0); - sx[1] = _mm256_extractf128_ps(_mm256_hadd_ps(sy[1], sy[1]), 0); + sx[0] = _mm_add_ps(_mm256_extractf128_ps(sy[0], 0), _mm256_extractf128_ps(sy[0], 1)); + sx[1] = _mm_add_ps(_mm256_extractf128_ps(sy[1], 0), _mm256_extractf128_ps(sy[1], 1)); + for (; i < n_taps; i += 8) { - tx = _mm_loadu_ps(s + i + 0); + tx = (__m128)_mm_lddqu_si128((__m128i*)(s + i + 0)); sx[0] = _mm_fmadd_ps(tx, _mm_load_ps(t0 + i + 0), sx[0]); sx[1] = _mm_fmadd_ps(tx, _mm_load_ps(t1 + i + 0), sx[1]); - tx = _mm_loadu_ps(s + i + 4); + tx = (__m128)_mm_lddqu_si128((__m128i*)(s + i + 4)); sx[0] = _mm_fmadd_ps(tx, _mm_load_ps(t0 + i + 4), sx[0]); sx[1] = _mm_fmadd_ps(tx, _mm_load_ps(t1 + i + 4), sx[1]); } diff --git a/spa/plugins/audioconvert/resample-native-impl.h b/spa/plugins/audioconvert/resample-native-impl.h index ca1e2729e..3cb278e2e 100644 --- a/spa/plugins/audioconvert/resample-native-impl.h +++ b/spa/plugins/audioconvert/resample-native-impl.h @@ -78,7 +78,7 @@ DEFINE_RESAMPLER(copy,arch) \ index += to_copy; \ offs += to_copy; \ } \ - *in_len = index - data->index; \ + *in_len = index; \ *out_len = offs; \ data->index = index; \ } @@ -143,8 +143,8 @@ DEFINE_RESAMPLER(inter,arch) \ phase = data->phase; \ \ for (o = offs; o < olen && index + n_taps <= ilen; o++) { \ - const float *ip; \ - float ph, x, *t0, *t1; \ + const float *ip, *t0, *t1; \ + float ph, x; \ uint32_t offset; \ \ ip = &s[index]; \ diff --git a/spa/plugins/audioconvert/resample.c b/spa/plugins/audioconvert/resample.c index a5d89a910..c72986162 100644 --- a/spa/plugins/audioconvert/resample.c +++ b/spa/plugins/audioconvert/resample.c @@ -157,7 +157,7 @@ static int setup_convert(struct impl *this, if (this->monitor) err = impl_peaks_init(&this->resample); - else if (0) + else if (1) err = impl_native_init(&this->resample); else err = impl_speex_init(&this->resample);