fmt-ops: add avx2 optimized version

There is only one optimized version, but the sse2 versions are compiled
with the avx2 flags so that they get optimized better.
This commit is contained in:
Wim Taymans 2020-03-16 16:11:29 +01:00
parent 6eca935e61
commit 3a911dfe3b
7 changed files with 821 additions and 28 deletions

View file

@ -31,8 +31,7 @@ conv_s16_to_f32d_1s_sse2(void *data, void * SPA_RESTRICT dst[], const void * SPA
uint32_t n_channels, uint32_t n_samples)
{
const int16_t *s = src;
float **d = (float **) dst;
float *d0 = d[0];
float *d0 = dst[0];
uint32_t n, unrolled;
__m128i in;
__m128 out, factor = _mm_set1_ps(1.0f / S16_SCALE);
@ -77,8 +76,7 @@ conv_s16_to_f32d_2_sse2(struct convert *conv, void * SPA_RESTRICT dst[], const v
uint32_t n_samples)
{
const int16_t *s = src[0];
float **d = (float **) dst;
float *d0 = d[0], *d1 = d[1];
float *d0 = dst[0], *d1 = dst[1];
uint32_t n, unrolled;
__m128i in[2], t[4];
__m128 out[4], factor = _mm_set1_ps(1.0f / S16_SCALE);
@ -135,8 +133,7 @@ conv_s24_to_f32d_1s_sse2(void *data, void * SPA_RESTRICT dst[], const void * SPA
uint32_t n_channels, uint32_t n_samples)
{
const uint8_t *s = src;
float **d = (float **) dst;
float *d0 = d[0];
float *d0 = dst[0];
uint32_t n, unrolled;
__m128i in;
__m128 out, factor = _mm_set1_ps(1.0f / S24_SCALE);
@ -175,8 +172,7 @@ conv_s24_to_f32d_2s_sse2(void *data, void * SPA_RESTRICT dst[], const void * SPA
uint32_t n_channels, uint32_t n_samples)
{
const uint8_t *s = src;
float **d = (float **) dst;
float *d0 = d[0], *d1 = d[1];
float *d0 = dst[0], *d1 = dst[1];
uint32_t n, unrolled;
__m128i in[2];
__m128 out[2], factor = _mm_set1_ps(1.0f / S24_SCALE);
@ -235,8 +231,7 @@ conv_s24_to_f32d_4s_sse2(void *data, void * SPA_RESTRICT dst[], const void * SPA
uint32_t n_channels, uint32_t n_samples)
{
const uint8_t *s = src;
float **d = (float **) dst;
float *d0 = d[0], *d1 = d[1], *d2 = d[2], *d3 = d[3];
float *d0 = dst[0], *d1 = dst[1], *d2 = dst[2], *d3 = dst[3];
uint32_t n, unrolled;
__m128i in[4];
__m128 out[4], factor = _mm_set1_ps(1.0f / S24_SCALE);
@ -340,8 +335,7 @@ conv_s32_to_f32d_1s_sse2(void *data, void * SPA_RESTRICT dst[], const void * SPA
uint32_t n_channels, uint32_t n_samples)
{
const int32_t *s = src;
float **d = (float **) dst;
float *d0 = d[0];
float *d0 = dst[0];
uint32_t n, unrolled;
__m128i in;
__m128 out, factor = _mm_set1_ps(1.0f / S24_SCALE);
@ -385,8 +379,7 @@ static void
conv_f32d_to_s32_1s_sse2(void *data, void * SPA_RESTRICT dst, const void * SPA_RESTRICT src[],
uint32_t n_channels, uint32_t n_samples)
{
const float **s = (const float **) src;
const float *s0 = s[0];
const float *s0 = src[0];
int32_t *d = dst;
uint32_t n, unrolled;
__m128 in[1];
@ -426,8 +419,7 @@ static void
conv_f32d_to_s32_2s_sse2(void *data, void * SPA_RESTRICT dst, const void * SPA_RESTRICT src[],
uint32_t n_channels, uint32_t n_samples)
{
const float **s = (const float **) src;
const float *s0 = s[0], *s1 = s[1];
const float *s0 = src[0], *s1 = src[1];
int32_t *d = dst;
uint32_t n, unrolled;
__m128 in[2];
@ -478,8 +470,7 @@ static void
conv_f32d_to_s32_4s_sse2(void *data, void * SPA_RESTRICT dst, const void * SPA_RESTRICT src[],
uint32_t n_channels, uint32_t n_samples)
{
const float **s = (const float **) src;
const float *s0 = s[0], *s1 = s[1], *s2 = s[2], *s3 = s[3];
const float *s0 = src[0], *s1 = src[1], *s2 = src[2], *s3 = src[3];
int32_t *d = dst;
uint32_t n, unrolled;
__m128 in[4];
@ -556,8 +547,7 @@ static void
conv_f32d_to_s16_1s_sse2(void *data, void * SPA_RESTRICT dst, const void * SPA_RESTRICT src[],
uint32_t n_channels, uint32_t n_samples)
{
const float **s = (const float **) src;
const float *s0 = s[0];
const float *s0 = src[0];
int16_t *d = dst;
uint32_t n, unrolled;
__m128 in[2];
@ -599,8 +589,7 @@ static void
conv_f32d_to_s16_2s_sse2(void *data, void * SPA_RESTRICT dst, const void * SPA_RESTRICT src[],
uint32_t n_channels, uint32_t n_samples)
{
const float **s = (const float **) src;
const float *s0 = s[0], *s1 = s[1];
const float *s0 = src[0], *s1 = src[1];
int16_t *d = dst;
uint32_t n, unrolled;
__m128 in[2];
@ -724,7 +713,6 @@ conv_f32d_to_s16_sse2(struct convert *conv, void * SPA_RESTRICT dst[], const voi
conv_f32d_to_s16_1s_sse2(conv, &d[i], &src[i], n_channels, n_samples);
}
void
conv_f32d_to_s16_2_sse2(struct convert *conv, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[],
uint32_t n_samples)