diff --git a/spa/plugins/audioconvert/fmt-ops-sse2.c b/spa/plugins/audioconvert/fmt-ops-sse2.c index d28f556d6..accff00b3 100644 --- a/spa/plugins/audioconvert/fmt-ops-sse2.c +++ b/spa/plugins/audioconvert/fmt-ops-sse2.c @@ -565,40 +565,44 @@ conv_f32d_to_s32_sse2(struct convert *conv, void * SPA_RESTRICT dst[], const voi conv_f32d_to_s32_1s_sse2(conv, &d[i], &src[i], n_channels, n_samples); } +/* 32 bit xorshift PRNG, see https://en.wikipedia.org/wiki/Xorshift */ +#define _MM_XORSHIFT_EPI32(r) \ +({ \ + __m128i i, t; \ + i = _mm_load_si128((__m128i*)r); \ + t = _mm_slli_epi32(i, 13); \ + i = _mm_xor_si128(i, t); \ + t = _mm_srli_epi32(i, 17); \ + i = _mm_xor_si128(i, t); \ + t = _mm_slli_epi32(i, 5); \ + i = _mm_xor_si128(i, t); \ + _mm_store_si128((__m128i*)r, i); \ + i; \ +}) + + static inline void update_dither_sse2(struct convert *conv, uint32_t n_samples) { uint32_t n; const uint32_t *r = SPA_PTR_ALIGN(conv->random, 16, uint32_t); float *dither = SPA_PTR_ALIGN(conv->dither, 16, float); __m128 scale = _mm_set1_ps(conv->scale), out[1]; - __m128i in[2], t[1]; + __m128i in[2]; - for (n = 0; n < n_samples; n += 4) { - /* 32 bit xorshift PRNG, see https://en.wikipedia.org/wiki/Xorshift */ - in[0] = _mm_load_si128((__m128i*)r); - t[0] = _mm_slli_epi32(in[0], 13); - in[0] = _mm_xor_si128(in[0], t[0]); - t[0] = _mm_srli_epi32(in[0], 17); - in[0] = _mm_xor_si128(in[0], t[0]); - t[0] = _mm_slli_epi32(in[0], 5); - in[0] = _mm_xor_si128(in[0], t[0]); - _mm_store_si128((__m128i*)r, in[0]); - - if (conv->method >= DITHER_METHOD_TRIANGULAR) { - in[1] = _mm_load_si128((__m128i*)r); - t[0] = _mm_slli_epi32(in[1], 13); - in[1] = _mm_xor_si128(in[1], t[0]); - t[0] = _mm_srli_epi32(in[1], 17); - in[1] = _mm_xor_si128(in[1], t[0]); - t[0] = _mm_slli_epi32(in[1], 5); - in[1] = _mm_xor_si128(in[1], t[0]); - _mm_store_si128((__m128i*)r, in[1]); - - in[0] = _mm_add_epi32(in[0], in[1]); + if (conv->method < DITHER_METHOD_TRIANGULAR) { + for (n = 0; n < n_samples; n += 4) { + in[0] = _MM_XORSHIFT_EPI32(r); + out[0] = _mm_cvtepi32_ps(_MM_XORSHIFT_EPI32(r)); + out[0] = _mm_mul_ps(out[0], scale); + _mm_store_ps(&dither[n], out[0]); + } + } else { + for (n = 0; n < n_samples; n += 4) { + in[0] = _mm_add_epi32( _MM_XORSHIFT_EPI32(r), _MM_XORSHIFT_EPI32(r)); + out[0] = _mm_cvtepi32_ps(in[0]); + out[0] = _mm_mul_ps(out[0], scale); + _mm_store_ps(&dither[n], out[0]); } - out[0] = _mm_cvtepi32_ps(in[0]); - out[0] = _mm_mul_ps(out[0], scale); - _mm_store_ps(&dither[n], out[0]); } }