audioconvert: improve SSE2 dither generation

This commit is contained in:
Wim Taymans 2022-07-11 16:41:12 +02:00
parent 277addcca6
commit e313149f7f

View file

@ -565,40 +565,44 @@ conv_f32d_to_s32_sse2(struct convert *conv, void * SPA_RESTRICT dst[], const voi
conv_f32d_to_s32_1s_sse2(conv, &d[i], &src[i], n_channels, n_samples);
}
/* 32 bit xorshift PRNG, see https://en.wikipedia.org/wiki/Xorshift */
#define _MM_XORSHIFT_EPI32(r) \
({ \
__m128i i, t; \
i = _mm_load_si128((__m128i*)r); \
t = _mm_slli_epi32(i, 13); \
i = _mm_xor_si128(i, t); \
t = _mm_srli_epi32(i, 17); \
i = _mm_xor_si128(i, t); \
t = _mm_slli_epi32(i, 5); \
i = _mm_xor_si128(i, t); \
_mm_store_si128((__m128i*)r, i); \
i; \
})
static inline void update_dither_sse2(struct convert *conv, uint32_t n_samples)
{
uint32_t n;
const uint32_t *r = SPA_PTR_ALIGN(conv->random, 16, uint32_t);
float *dither = SPA_PTR_ALIGN(conv->dither, 16, float);
__m128 scale = _mm_set1_ps(conv->scale), out[1];
__m128i in[2], t[1];
__m128i in[2];
for (n = 0; n < n_samples; n += 4) {
/* 32 bit xorshift PRNG, see https://en.wikipedia.org/wiki/Xorshift */
in[0] = _mm_load_si128((__m128i*)r);
t[0] = _mm_slli_epi32(in[0], 13);
in[0] = _mm_xor_si128(in[0], t[0]);
t[0] = _mm_srli_epi32(in[0], 17);
in[0] = _mm_xor_si128(in[0], t[0]);
t[0] = _mm_slli_epi32(in[0], 5);
in[0] = _mm_xor_si128(in[0], t[0]);
_mm_store_si128((__m128i*)r, in[0]);
if (conv->method >= DITHER_METHOD_TRIANGULAR) {
in[1] = _mm_load_si128((__m128i*)r);
t[0] = _mm_slli_epi32(in[1], 13);
in[1] = _mm_xor_si128(in[1], t[0]);
t[0] = _mm_srli_epi32(in[1], 17);
in[1] = _mm_xor_si128(in[1], t[0]);
t[0] = _mm_slli_epi32(in[1], 5);
in[1] = _mm_xor_si128(in[1], t[0]);
_mm_store_si128((__m128i*)r, in[1]);
in[0] = _mm_add_epi32(in[0], in[1]);
if (conv->method < DITHER_METHOD_TRIANGULAR) {
for (n = 0; n < n_samples; n += 4) {
in[0] = _MM_XORSHIFT_EPI32(r);
out[0] = _mm_cvtepi32_ps(_MM_XORSHIFT_EPI32(r));
out[0] = _mm_mul_ps(out[0], scale);
_mm_store_ps(&dither[n], out[0]);
}
} else {
for (n = 0; n < n_samples; n += 4) {
in[0] = _mm_add_epi32( _MM_XORSHIFT_EPI32(r), _MM_XORSHIFT_EPI32(r));
out[0] = _mm_cvtepi32_ps(in[0]);
out[0] = _mm_mul_ps(out[0], scale);
_mm_store_ps(&dither[n], out[0]);
}
out[0] = _mm_cvtepi32_ps(in[0]);
out[0] = _mm_mul_ps(out[0], scale);
_mm_store_ps(&dither[n], out[0]);
}
}