mirror of
https://gitlab.freedesktop.org/pipewire/pipewire.git
synced 2026-03-22 05:33:53 -04:00
Add an alternative avx2 s32_to_f32d implementation that doesn't use the gather function for when gather is slow. Don't overwrite the orinal cpu_flags but store the selected flags in a new variable. Use this to debug the selected function cpu flags. Build libraries with defines from previous libraries so that we can reuse functions from them. We can then remove the SSE2 | SLOW_GATHER function selection from the list. We will now select avx2 and it will then switch implementations based on the CPU flags.
1370 lines
45 KiB
C
1370 lines
45 KiB
C
/* Spa */
|
|
/* SPDX-FileCopyrightText: Copyright © 2018 Wim Taymans */
|
|
/* SPDX-License-Identifier: MIT */
|
|
|
|
#include "fmt-ops.h"
|
|
|
|
#include <spa/support/cpu.h>
|
|
|
|
#include <immintrin.h>
|
|
// GCC: workaround for missing AVX intrinsic: "_mm256_setr_m128()"
|
|
// (see https://stackoverflow.com/questions/32630458/setting-m256i-to-the-value-of-two-m128i-values)
|
|
#ifndef _mm256_setr_m128i
|
|
# ifndef _mm256_set_m128i
|
|
# define _mm256_set_m128i(v0, v1) _mm256_insertf128_si256(_mm256_castsi128_si256(v1), (v0), 1)
|
|
# endif
|
|
# define _mm256_setr_m128i(v0, v1) _mm256_set_m128i((v1), (v0))
|
|
#endif
|
|
|
|
#define _MM_CLAMP_PS(r,min,max) \
|
|
_mm_min_ps(_mm_max_ps(r, min), max)
|
|
|
|
#define _MM256_CLAMP_PS(r,min,max) \
|
|
_mm256_min_ps(_mm256_max_ps(r, min), max)
|
|
|
|
#define _MM_CLAMP_SS(r,min,max) \
|
|
_mm_min_ss(_mm_max_ss(r, min), max)
|
|
|
|
#define _MM256_BSWAP_EPI16(x) \
|
|
({ \
|
|
_mm256_or_si256( \
|
|
_mm256_slli_epi16(x, 8), \
|
|
_mm256_srli_epi16(x, 8)); \
|
|
})
|
|
|
|
#define _MM_TRANS_1x4_PS(v0,v1,v2,v3) \
|
|
({ \
|
|
v1 = _mm_shuffle_ps(v0, v0, _MM_SHUFFLE(0, 3, 2, 1)); \
|
|
v2 = _mm_shuffle_ps(v0, v0, _MM_SHUFFLE(1, 0, 3, 2)); \
|
|
v3 = _mm_shuffle_ps(v0, v0, _MM_SHUFFLE(2, 1, 0, 3)); \
|
|
})
|
|
#define _MM_TRANS_1x4_EPI32(v0,v1,v2,v3) \
|
|
({ \
|
|
v1 = _mm_shuffle_epi32(v0, _MM_SHUFFLE(0, 3, 2, 1)); \
|
|
v2 = _mm_shuffle_epi32(v0, _MM_SHUFFLE(1, 0, 3, 2)); \
|
|
v3 = _mm_shuffle_epi32(v0, _MM_SHUFFLE(2, 1, 0, 3)); \
|
|
})
|
|
|
|
#define _MM_STOREM_PS(d0,d1,d2,d3,v) \
|
|
({ \
|
|
__m128 o[3]; \
|
|
_MM_TRANS_1x4_PS(v, o[0], o[1], o[2]); \
|
|
_mm_store_ss(d0, v); \
|
|
_mm_store_ss(d1, o[0]); \
|
|
_mm_store_ss(d2, o[1]); \
|
|
_mm_store_ss(d3, o[2]); \
|
|
})
|
|
#define _MM_STOREM_EPI32(d0,d1,d2,d3,v) \
|
|
({ \
|
|
__m128i o[3]; \
|
|
_MM_TRANS_1x4_EPI32(v, o[0], o[1], o[2]); \
|
|
*d0 = _mm_cvtsi128_si32(v); \
|
|
*d1 = _mm_cvtsi128_si32(o[0]); \
|
|
*d2 = _mm_cvtsi128_si32(o[1]); \
|
|
*d3 = _mm_cvtsi128_si32(o[2]); \
|
|
})
|
|
|
|
static void
|
|
conv_s16_to_f32d_1s_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
|
|
uint32_t n_channels, uint32_t n_samples)
|
|
{
|
|
const int16_t *s = src;
|
|
float *d0 = dst[0];
|
|
uint32_t n, unrolled;
|
|
__m256i in = _mm256_setzero_si256();
|
|
__m256 out, factor = _mm256_set1_ps(1.0f / S16_SCALE);
|
|
|
|
if (SPA_LIKELY(SPA_IS_ALIGNED(d0, 32)))
|
|
unrolled = n_samples & ~7;
|
|
else
|
|
unrolled = 0;
|
|
|
|
for(n = 0; n < unrolled; n += 8) {
|
|
in = _mm256_insert_epi16(in, s[0*n_channels], 1);
|
|
in = _mm256_insert_epi16(in, s[1*n_channels], 3);
|
|
in = _mm256_insert_epi16(in, s[2*n_channels], 5);
|
|
in = _mm256_insert_epi16(in, s[3*n_channels], 7);
|
|
in = _mm256_insert_epi16(in, s[4*n_channels], 9);
|
|
in = _mm256_insert_epi16(in, s[5*n_channels], 11);
|
|
in = _mm256_insert_epi16(in, s[6*n_channels], 13);
|
|
in = _mm256_insert_epi16(in, s[7*n_channels], 15);
|
|
|
|
in = _mm256_srai_epi32(in, 16);
|
|
out = _mm256_cvtepi32_ps(in);
|
|
out = _mm256_mul_ps(out, factor);
|
|
_mm256_store_ps(&d0[n], out);
|
|
s += 8*n_channels;
|
|
}
|
|
for(; n < n_samples; n++) {
|
|
__m128 out, factor = _mm_set1_ps(1.0f / S16_SCALE);
|
|
out = _mm_cvtsi32_ss(factor, s[0]);
|
|
out = _mm_mul_ss(out, factor);
|
|
_mm_store_ss(&d0[n], out);
|
|
s += n_channels;
|
|
}
|
|
}
|
|
|
|
void
|
|
conv_s16_to_f32d_avx2(struct convert *conv, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[],
|
|
uint32_t n_samples)
|
|
{
|
|
const int16_t *s = src[0];
|
|
uint32_t i = 0, n_channels = conv->n_channels;
|
|
|
|
for(; i < n_channels; i++)
|
|
conv_s16_to_f32d_1s_avx2(conv, &dst[i], &s[i], n_channels, n_samples);
|
|
}
|
|
|
|
|
|
static void
|
|
conv_s16s_to_f32d_1s_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
|
|
uint32_t n_channels, uint32_t n_samples)
|
|
{
|
|
const uint16_t *s = src;
|
|
float *d0 = dst[0];
|
|
uint32_t n, unrolled;
|
|
__m256i in = _mm256_setzero_si256();
|
|
__m256 out, factor = _mm256_set1_ps(1.0f / S16_SCALE);
|
|
|
|
if (SPA_LIKELY(SPA_IS_ALIGNED(d0, 32)))
|
|
unrolled = n_samples & ~7;
|
|
else
|
|
unrolled = 0;
|
|
|
|
for(n = 0; n < unrolled; n += 8) {
|
|
in = _mm256_insert_epi16(in, s[0*n_channels], 1);
|
|
in = _mm256_insert_epi16(in, s[1*n_channels], 3);
|
|
in = _mm256_insert_epi16(in, s[2*n_channels], 5);
|
|
in = _mm256_insert_epi16(in, s[3*n_channels], 7);
|
|
in = _mm256_insert_epi16(in, s[4*n_channels], 9);
|
|
in = _mm256_insert_epi16(in, s[5*n_channels], 11);
|
|
in = _mm256_insert_epi16(in, s[6*n_channels], 13);
|
|
in = _mm256_insert_epi16(in, s[7*n_channels], 15);
|
|
in = _MM256_BSWAP_EPI16(in);
|
|
|
|
in = _mm256_srai_epi32(in, 16);
|
|
out = _mm256_cvtepi32_ps(in);
|
|
out = _mm256_mul_ps(out, factor);
|
|
_mm256_store_ps(&d0[n], out);
|
|
s += 8*n_channels;
|
|
}
|
|
for(; n < n_samples; n++) {
|
|
__m128 out, factor = _mm_set1_ps(1.0f / S16_SCALE);
|
|
out = _mm_cvtsi32_ss(factor, (int16_t)bswap_16(s[0]));
|
|
out = _mm_mul_ss(out, factor);
|
|
_mm_store_ss(&d0[n], out);
|
|
s += n_channels;
|
|
}
|
|
}
|
|
|
|
void
|
|
conv_s16s_to_f32d_avx2(struct convert *conv, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[],
|
|
uint32_t n_samples)
|
|
{
|
|
const uint16_t *s = src[0];
|
|
uint32_t i = 0, n_channels = conv->n_channels;
|
|
|
|
for(; i < n_channels; i++)
|
|
conv_s16s_to_f32d_1s_avx2(conv, &dst[i], &s[i], n_channels, n_samples);
|
|
}
|
|
|
|
void
|
|
conv_s16_to_f32d_2_avx2(struct convert *conv, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[],
|
|
uint32_t n_samples)
|
|
{
|
|
const int16_t *s = src[0];
|
|
float *d0 = dst[0], *d1 = dst[1];
|
|
uint32_t n, unrolled;
|
|
__m256i in[2], t[4];
|
|
__m256 out[4], factor = _mm256_set1_ps(1.0f / S16_SCALE);
|
|
|
|
if (SPA_IS_ALIGNED(s, 32) &&
|
|
SPA_IS_ALIGNED(d0, 32) &&
|
|
SPA_IS_ALIGNED(d1, 32))
|
|
unrolled = n_samples & ~15;
|
|
else
|
|
unrolled = 0;
|
|
|
|
for(n = 0; n < unrolled; n += 16) {
|
|
in[0] = _mm256_load_si256((__m256i*)(s + 0));
|
|
in[1] = _mm256_load_si256((__m256i*)(s + 16));
|
|
|
|
t[0] = _mm256_slli_epi32(in[0], 16);
|
|
t[0] = _mm256_srai_epi32(t[0], 16);
|
|
out[0] = _mm256_cvtepi32_ps(t[0]);
|
|
out[0] = _mm256_mul_ps(out[0], factor);
|
|
|
|
t[1] = _mm256_srai_epi32(in[0], 16);
|
|
out[1] = _mm256_cvtepi32_ps(t[1]);
|
|
out[1] = _mm256_mul_ps(out[1], factor);
|
|
|
|
t[2] = _mm256_slli_epi32(in[1], 16);
|
|
t[2] = _mm256_srai_epi32(t[2], 16);
|
|
out[2] = _mm256_cvtepi32_ps(t[2]);
|
|
out[2] = _mm256_mul_ps(out[2], factor);
|
|
|
|
t[3] = _mm256_srai_epi32(in[1], 16);
|
|
out[3] = _mm256_cvtepi32_ps(t[3]);
|
|
out[3] = _mm256_mul_ps(out[3], factor);
|
|
|
|
_mm256_store_ps(&d0[n + 0], out[0]);
|
|
_mm256_store_ps(&d1[n + 0], out[1]);
|
|
_mm256_store_ps(&d0[n + 8], out[2]);
|
|
_mm256_store_ps(&d1[n + 8], out[3]);
|
|
|
|
s += 32;
|
|
}
|
|
for(; n < n_samples; n++) {
|
|
__m128 out[4], factor = _mm_set1_ps(1.0f / S16_SCALE);
|
|
out[0] = _mm_cvtsi32_ss(factor, s[0]);
|
|
out[0] = _mm_mul_ss(out[0], factor);
|
|
out[1] = _mm_cvtsi32_ss(factor, s[1]);
|
|
out[1] = _mm_mul_ss(out[1], factor);
|
|
_mm_store_ss(&d0[n], out[0]);
|
|
_mm_store_ss(&d1[n], out[1]);
|
|
s += 2;
|
|
}
|
|
}
|
|
|
|
void
|
|
conv_s16s_to_f32d_2_avx2(struct convert *conv, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[],
|
|
uint32_t n_samples)
|
|
{
|
|
const uint16_t *s = src[0];
|
|
float *d0 = dst[0], *d1 = dst[1];
|
|
uint32_t n, unrolled;
|
|
__m256i in[2], t[4];
|
|
__m256 out[4], factor = _mm256_set1_ps(1.0f / S16_SCALE);
|
|
|
|
if (SPA_IS_ALIGNED(s, 32) &&
|
|
SPA_IS_ALIGNED(d0, 32) &&
|
|
SPA_IS_ALIGNED(d1, 32))
|
|
unrolled = n_samples & ~15;
|
|
else
|
|
unrolled = 0;
|
|
|
|
for(n = 0; n < unrolled; n += 16) {
|
|
in[0] = _mm256_load_si256((__m256i*)(s + 0));
|
|
in[1] = _mm256_load_si256((__m256i*)(s + 16));
|
|
in[0] = _MM256_BSWAP_EPI16(in[0]);
|
|
in[1] = _MM256_BSWAP_EPI16(in[1]);
|
|
|
|
t[0] = _mm256_slli_epi32(in[0], 16);
|
|
t[0] = _mm256_srai_epi32(t[0], 16);
|
|
out[0] = _mm256_cvtepi32_ps(t[0]);
|
|
out[0] = _mm256_mul_ps(out[0], factor);
|
|
|
|
t[1] = _mm256_srai_epi32(in[0], 16);
|
|
out[1] = _mm256_cvtepi32_ps(t[1]);
|
|
out[1] = _mm256_mul_ps(out[1], factor);
|
|
|
|
t[2] = _mm256_slli_epi32(in[1], 16);
|
|
t[2] = _mm256_srai_epi32(t[2], 16);
|
|
out[2] = _mm256_cvtepi32_ps(t[2]);
|
|
out[2] = _mm256_mul_ps(out[2], factor);
|
|
|
|
t[3] = _mm256_srai_epi32(in[1], 16);
|
|
out[3] = _mm256_cvtepi32_ps(t[3]);
|
|
out[3] = _mm256_mul_ps(out[3], factor);
|
|
|
|
_mm256_store_ps(&d0[n + 0], out[0]);
|
|
_mm256_store_ps(&d1[n + 0], out[1]);
|
|
_mm256_store_ps(&d0[n + 8], out[2]);
|
|
_mm256_store_ps(&d1[n + 8], out[3]);
|
|
|
|
s += 32;
|
|
}
|
|
for(; n < n_samples; n++) {
|
|
__m128 out[4], factor = _mm_set1_ps(1.0f / S16_SCALE);
|
|
out[0] = _mm_cvtsi32_ss(factor, (int16_t)bswap_16(s[0]));
|
|
out[0] = _mm_mul_ss(out[0], factor);
|
|
out[1] = _mm_cvtsi32_ss(factor, (int16_t)bswap_16(s[1]));
|
|
out[1] = _mm_mul_ss(out[1], factor);
|
|
_mm_store_ss(&d0[n], out[0]);
|
|
_mm_store_ss(&d1[n], out[1]);
|
|
s += 2;
|
|
}
|
|
}
|
|
|
|
static void
|
|
conv_s24_to_f32d_1s_gather_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
|
|
uint32_t n_channels, uint32_t n_samples)
|
|
{
|
|
const int8_t *s = src;
|
|
float *d0 = dst[0];
|
|
uint32_t n, unrolled;
|
|
__m128i in;
|
|
__m128 out, factor = _mm_set1_ps(1.0f / S24_SCALE);
|
|
__m128i mask1 = _mm_setr_epi32(0*n_channels, 3*n_channels, 6*n_channels, 9*n_channels);
|
|
|
|
if (SPA_IS_ALIGNED(d0, 16) && n_samples > 0) {
|
|
unrolled = n_samples & ~3;
|
|
if ((n_samples & 3) == 0)
|
|
unrolled -= 4;
|
|
}
|
|
else
|
|
unrolled = 0;
|
|
|
|
for(n = 0; n < unrolled; n += 4) {
|
|
in = _mm_i32gather_epi32((int*)s, mask1, 1);
|
|
in = _mm_slli_epi32(in, 8);
|
|
in = _mm_srai_epi32(in, 8);
|
|
out = _mm_cvtepi32_ps(in);
|
|
out = _mm_mul_ps(out, factor);
|
|
_mm_store_ps(&d0[n], out);
|
|
s += 12 * n_channels;
|
|
}
|
|
for(; n < n_samples; n++) {
|
|
out = _mm_cvtsi32_ss(factor, s24_to_s32(*(int24_t*)s));
|
|
out = _mm_mul_ss(out, factor);
|
|
_mm_store_ss(&d0[n], out);
|
|
s += 3 * n_channels;
|
|
}
|
|
}
|
|
|
|
static void
|
|
conv_s24_to_f32d_2s_gather_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
|
|
uint32_t n_channels, uint32_t n_samples)
|
|
{
|
|
const int8_t *s = src;
|
|
float *d0 = dst[0], *d1 = dst[1];
|
|
uint32_t n, unrolled;
|
|
__m128i in[2];
|
|
__m128 out[2], factor = _mm_set1_ps(1.0f / S24_SCALE);
|
|
__m128i mask1 = _mm_setr_epi32(0*n_channels, 3*n_channels, 6*n_channels, 9*n_channels);
|
|
|
|
if (SPA_IS_ALIGNED(d0, 16) &&
|
|
SPA_IS_ALIGNED(d1, 16) &&
|
|
n_samples > 0) {
|
|
unrolled = n_samples & ~3;
|
|
if ((n_samples & 3) == 0)
|
|
unrolled -= 4;
|
|
}
|
|
else
|
|
unrolled = 0;
|
|
|
|
for(n = 0; n < unrolled; n += 4) {
|
|
in[0] = _mm_i32gather_epi32((int*)&s[0], mask1, 1);
|
|
in[1] = _mm_i32gather_epi32((int*)&s[3], mask1, 1);
|
|
|
|
in[0] = _mm_slli_epi32(in[0], 8);
|
|
in[1] = _mm_slli_epi32(in[1], 8);
|
|
|
|
in[0] = _mm_srai_epi32(in[0], 8);
|
|
in[1] = _mm_srai_epi32(in[1], 8);
|
|
|
|
out[0] = _mm_cvtepi32_ps(in[0]);
|
|
out[1] = _mm_cvtepi32_ps(in[1]);
|
|
|
|
out[0] = _mm_mul_ps(out[0], factor);
|
|
out[1] = _mm_mul_ps(out[1], factor);
|
|
|
|
_mm_store_ps(&d0[n], out[0]);
|
|
_mm_store_ps(&d1[n], out[1]);
|
|
|
|
s += 12 * n_channels;
|
|
}
|
|
for(; n < n_samples; n++) {
|
|
out[0] = _mm_cvtsi32_ss(factor, s24_to_s32(*((int24_t*)s+0)));
|
|
out[1] = _mm_cvtsi32_ss(factor, s24_to_s32(*((int24_t*)s+1)));
|
|
out[0] = _mm_mul_ss(out[0], factor);
|
|
out[1] = _mm_mul_ss(out[1], factor);
|
|
_mm_store_ss(&d0[n], out[0]);
|
|
_mm_store_ss(&d1[n], out[1]);
|
|
s += 3 * n_channels;
|
|
}
|
|
}
|
|
static void
|
|
conv_s24_to_f32d_4s_gather_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
|
|
uint32_t n_channels, uint32_t n_samples)
|
|
{
|
|
const int8_t *s = src;
|
|
float *d0 = dst[0], *d1 = dst[1], *d2 = dst[2], *d3 = dst[3];
|
|
uint32_t n, unrolled;
|
|
__m128i in[4];
|
|
__m128 out[4], factor = _mm_set1_ps(1.0f / S24_SCALE);
|
|
__m128i mask1 = _mm_setr_epi32(0*n_channels, 3*n_channels, 6*n_channels, 9*n_channels);
|
|
|
|
if (SPA_IS_ALIGNED(d0, 16) &&
|
|
SPA_IS_ALIGNED(d1, 16) &&
|
|
SPA_IS_ALIGNED(d2, 16) &&
|
|
SPA_IS_ALIGNED(d3, 16) &&
|
|
n_samples > 0) {
|
|
unrolled = n_samples & ~3;
|
|
if ((n_samples & 3) == 0)
|
|
unrolled -= 4;
|
|
}
|
|
else
|
|
unrolled = 0;
|
|
|
|
for(n = 0; n < unrolled; n += 4) {
|
|
in[0] = _mm_i32gather_epi32((int*)&s[0], mask1, 1);
|
|
in[1] = _mm_i32gather_epi32((int*)&s[3], mask1, 1);
|
|
in[2] = _mm_i32gather_epi32((int*)&s[6], mask1, 1);
|
|
in[3] = _mm_i32gather_epi32((int*)&s[9], mask1, 1);
|
|
|
|
in[0] = _mm_slli_epi32(in[0], 8);
|
|
in[1] = _mm_slli_epi32(in[1], 8);
|
|
in[2] = _mm_slli_epi32(in[2], 8);
|
|
in[3] = _mm_slli_epi32(in[3], 8);
|
|
|
|
in[0] = _mm_srai_epi32(in[0], 8);
|
|
in[1] = _mm_srai_epi32(in[1], 8);
|
|
in[2] = _mm_srai_epi32(in[2], 8);
|
|
in[3] = _mm_srai_epi32(in[3], 8);
|
|
|
|
out[0] = _mm_cvtepi32_ps(in[0]);
|
|
out[1] = _mm_cvtepi32_ps(in[1]);
|
|
out[2] = _mm_cvtepi32_ps(in[2]);
|
|
out[3] = _mm_cvtepi32_ps(in[3]);
|
|
|
|
out[0] = _mm_mul_ps(out[0], factor);
|
|
out[1] = _mm_mul_ps(out[1], factor);
|
|
out[2] = _mm_mul_ps(out[2], factor);
|
|
out[3] = _mm_mul_ps(out[3], factor);
|
|
|
|
_mm_store_ps(&d0[n], out[0]);
|
|
_mm_store_ps(&d1[n], out[1]);
|
|
_mm_store_ps(&d2[n], out[2]);
|
|
_mm_store_ps(&d3[n], out[3]);
|
|
|
|
s += 12 * n_channels;
|
|
}
|
|
for(; n < n_samples; n++) {
|
|
in[0] = _mm_setr_epi32(s24_to_s32(*((int24_t*)s+0)),
|
|
s24_to_s32(*((int24_t*)s+1)),
|
|
s24_to_s32(*((int24_t*)s+2)),
|
|
s24_to_s32(*((int24_t*)s+3)));
|
|
out[0] = _mm_cvtepi32_ps(in[0]);
|
|
out[0] = _mm_mul_ps(out[0], factor);
|
|
_MM_STOREM_PS(&d0[n], &d1[n], &d2[n], &d3[n], out[0]);
|
|
s += 3 * n_channels;
|
|
}
|
|
}
|
|
|
|
void
|
|
conv_s24_to_f32d_avx2(struct convert *conv, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[],
|
|
uint32_t n_samples)
|
|
{
|
|
const int8_t *s = src[0];
|
|
uint32_t i = 0, n_channels = conv->n_channels;
|
|
|
|
if (conv->cpu_flags & SPA_CPU_FLAG_SLOW_GATHER) {
|
|
#if defined (HAVE_SSE2)
|
|
conv_s24_to_f32d_sse2(conv, dst, src, n_samples);
|
|
#endif
|
|
} else {
|
|
for(; i + 3 < n_channels; i += 4)
|
|
conv_s24_to_f32d_4s_gather_avx2(conv, &dst[i], &s[3*i], n_channels, n_samples);
|
|
for(; i + 1 < n_channels; i += 2)
|
|
conv_s24_to_f32d_2s_gather_avx2(conv, &dst[i], &s[3*i], n_channels, n_samples);
|
|
for(; i < n_channels; i++)
|
|
conv_s24_to_f32d_1s_gather_avx2(conv, &dst[i], &s[3*i], n_channels, n_samples);
|
|
}
|
|
}
|
|
|
|
static void
|
|
conv_s32_to_f32d_4s_gather_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
|
|
uint32_t n_channels, uint32_t n_samples)
|
|
{
|
|
const int32_t *s = src;
|
|
float *d0 = dst[0], *d1 = dst[1], *d2 = dst[2], *d3 = dst[3];
|
|
uint32_t n, unrolled;
|
|
__m256i in[4];
|
|
__m256 out[4], factor = _mm256_set1_ps(1.0f / S32_SCALE_I2F);
|
|
__m256i mask1 = _mm256_setr_epi32(0*n_channels, 1*n_channels, 2*n_channels, 3*n_channels,
|
|
4*n_channels, 5*n_channels, 6*n_channels, 7*n_channels);
|
|
|
|
if (SPA_IS_ALIGNED(d0, 32) &&
|
|
SPA_IS_ALIGNED(d1, 32) &&
|
|
SPA_IS_ALIGNED(d2, 32) &&
|
|
SPA_IS_ALIGNED(d3, 32))
|
|
unrolled = n_samples & ~7;
|
|
else
|
|
unrolled = 0;
|
|
|
|
for(n = 0; n < unrolled; n += 8) {
|
|
in[0] = _mm256_i32gather_epi32((int*)&s[0], mask1, 4);
|
|
in[1] = _mm256_i32gather_epi32((int*)&s[1], mask1, 4);
|
|
in[2] = _mm256_i32gather_epi32((int*)&s[2], mask1, 4);
|
|
in[3] = _mm256_i32gather_epi32((int*)&s[3], mask1, 4);
|
|
|
|
out[0] = _mm256_cvtepi32_ps(in[0]);
|
|
out[1] = _mm256_cvtepi32_ps(in[1]);
|
|
out[2] = _mm256_cvtepi32_ps(in[2]);
|
|
out[3] = _mm256_cvtepi32_ps(in[3]);
|
|
|
|
out[0] = _mm256_mul_ps(out[0], factor);
|
|
out[1] = _mm256_mul_ps(out[1], factor);
|
|
out[2] = _mm256_mul_ps(out[2], factor);
|
|
out[3] = _mm256_mul_ps(out[3], factor);
|
|
|
|
_mm256_store_ps(&d0[n], out[0]);
|
|
_mm256_store_ps(&d1[n], out[1]);
|
|
_mm256_store_ps(&d2[n], out[2]);
|
|
_mm256_store_ps(&d3[n], out[3]);
|
|
|
|
s += 8*n_channels;
|
|
}
|
|
for(; n < n_samples; n++) {
|
|
__m128 out[4], factor = _mm_set1_ps(1.0f / S32_SCALE_I2F);
|
|
__m128i in[1];
|
|
in[0] = _mm_setr_epi32(s[0], s[1], s[2], s[3]);
|
|
out[0] = _mm_cvtepi32_ps(in[0]);
|
|
out[0] = _mm_mul_ps(out[0], factor);
|
|
_MM_STOREM_PS(&d0[n], &d1[n], &d2[n], &d3[n], out[0]);
|
|
s += n_channels;
|
|
}
|
|
}
|
|
|
|
static void
|
|
conv_s32_to_f32d_2s_gather_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
|
|
uint32_t n_channels, uint32_t n_samples)
|
|
{
|
|
const int32_t *s = src;
|
|
float *d0 = dst[0], *d1 = dst[1];
|
|
uint32_t n, unrolled;
|
|
__m256i in[4];
|
|
__m256 out[4], factor = _mm256_set1_ps(1.0f / S32_SCALE_I2F);
|
|
__m256i mask1 = _mm256_setr_epi32(0*n_channels, 1*n_channels, 2*n_channels, 3*n_channels,
|
|
4*n_channels, 5*n_channels, 6*n_channels, 7*n_channels);
|
|
|
|
if (SPA_IS_ALIGNED(d0, 32) &&
|
|
SPA_IS_ALIGNED(d1, 32))
|
|
unrolled = n_samples & ~7;
|
|
else
|
|
unrolled = 0;
|
|
|
|
for(n = 0; n < unrolled; n += 8) {
|
|
in[0] = _mm256_i32gather_epi32((int*)&s[0], mask1, 4);
|
|
in[1] = _mm256_i32gather_epi32((int*)&s[1], mask1, 4);
|
|
|
|
out[0] = _mm256_cvtepi32_ps(in[0]);
|
|
out[1] = _mm256_cvtepi32_ps(in[1]);
|
|
|
|
out[0] = _mm256_mul_ps(out[0], factor);
|
|
out[1] = _mm256_mul_ps(out[1], factor);
|
|
|
|
_mm256_store_ps(&d0[n], out[0]);
|
|
_mm256_store_ps(&d1[n], out[1]);
|
|
|
|
s += 8*n_channels;
|
|
}
|
|
for(; n < n_samples; n++) {
|
|
__m128 out[2], factor = _mm_set1_ps(1.0f / S32_SCALE_I2F);
|
|
out[0] = _mm_cvtsi32_ss(factor, s[0]);
|
|
out[1] = _mm_cvtsi32_ss(factor, s[1]);
|
|
out[0] = _mm_mul_ss(out[0], factor);
|
|
out[1] = _mm_mul_ss(out[1], factor);
|
|
_mm_store_ss(&d0[n], out[0]);
|
|
_mm_store_ss(&d1[n], out[1]);
|
|
s += n_channels;
|
|
}
|
|
}
|
|
|
|
static void
|
|
conv_s32_to_f32d_1s_gather_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
|
|
uint32_t n_channels, uint32_t n_samples)
|
|
{
|
|
const int32_t *s = src;
|
|
float *d0 = dst[0];
|
|
uint32_t n, unrolled;
|
|
__m256i in[2];
|
|
__m256 out[2], factor = _mm256_set1_ps(1.0f / S32_SCALE_I2F);
|
|
__m256i mask1 = _mm256_setr_epi32(0*n_channels, 1*n_channels, 2*n_channels, 3*n_channels,
|
|
4*n_channels, 5*n_channels, 6*n_channels, 7*n_channels);
|
|
|
|
if (SPA_IS_ALIGNED(d0, 32))
|
|
unrolled = n_samples & ~15;
|
|
else
|
|
unrolled = 0;
|
|
|
|
for(n = 0; n < unrolled; n += 16) {
|
|
in[0] = _mm256_i32gather_epi32(&s[0*n_channels], mask1, 4);
|
|
in[1] = _mm256_i32gather_epi32(&s[8*n_channels], mask1, 4);
|
|
|
|
out[0] = _mm256_cvtepi32_ps(in[0]);
|
|
out[1] = _mm256_cvtepi32_ps(in[1]);
|
|
|
|
out[0] = _mm256_mul_ps(out[0], factor);
|
|
out[1] = _mm256_mul_ps(out[1], factor);
|
|
|
|
_mm256_store_ps(&d0[n+0], out[0]);
|
|
_mm256_store_ps(&d0[n+8], out[1]);
|
|
|
|
s += 16*n_channels;
|
|
}
|
|
for(; n < n_samples; n++) {
|
|
__m128 out, factor = _mm_set1_ps(1.0f / S32_SCALE_I2F);
|
|
out = _mm_cvtsi32_ss(factor, s[0]);
|
|
out = _mm_mul_ss(out, factor);
|
|
_mm_store_ss(&d0[n], out);
|
|
s += n_channels;
|
|
}
|
|
}
|
|
|
|
|
|
static void
|
|
conv_s32_to_f32d_2s_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
|
|
uint32_t n_channels, uint32_t n_samples)
|
|
{
|
|
const int32_t *s = src;
|
|
float *d0 = dst[0], *d1 = dst[1];
|
|
uint32_t n, unrolled;
|
|
__m256i in[4];
|
|
__m256 out[4], t[4], factor = _mm256_set1_ps(1.0f / S32_SCALE_I2F);
|
|
|
|
if (SPA_IS_ALIGNED(d0, 32) &&
|
|
SPA_IS_ALIGNED(d1, 32))
|
|
unrolled = n_samples & ~7;
|
|
else
|
|
unrolled = 0;
|
|
|
|
for(n = 0; n < unrolled; n += 8) {
|
|
in[0] = _mm256_setr_epi64x(
|
|
*((uint64_t*)&s[0*n_channels]),
|
|
*((uint64_t*)&s[1*n_channels]),
|
|
*((uint64_t*)&s[4*n_channels]),
|
|
*((uint64_t*)&s[5*n_channels]));
|
|
in[1] = _mm256_setr_epi64x(
|
|
*((uint64_t*)&s[2*n_channels]),
|
|
*((uint64_t*)&s[3*n_channels]),
|
|
*((uint64_t*)&s[6*n_channels]),
|
|
*((uint64_t*)&s[7*n_channels]));
|
|
|
|
out[0] = _mm256_cvtepi32_ps(in[0]);
|
|
out[1] = _mm256_cvtepi32_ps(in[1]);
|
|
|
|
out[0] = _mm256_mul_ps(out[0], factor); /* a0 b0 a1 b1 a4 b4 a5 b5 */
|
|
out[1] = _mm256_mul_ps(out[1], factor); /* a2 b2 a3 b3 a6 b6 a7 b7 */
|
|
|
|
t[0] = _mm256_unpacklo_ps(out[0], out[1]); /* a0 a2 b0 b2 a4 a6 b4 b6 */
|
|
t[1] = _mm256_unpackhi_ps(out[0], out[1]); /* a1 a3 b1 b3 a5 a7 b5 b7 */
|
|
|
|
out[0] = _mm256_unpacklo_ps(t[0], t[1]); /* a0 a1 a2 a3 a4 a5 a6 a7 */
|
|
out[1] = _mm256_unpackhi_ps(t[0], t[1]); /* b0 b1 b2 b3 b4 b5 b6 b7 */
|
|
|
|
_mm256_store_ps(&d0[n], out[0]);
|
|
_mm256_store_ps(&d1[n], out[1]);
|
|
|
|
s += 8*n_channels;
|
|
}
|
|
for(; n < n_samples; n++) {
|
|
__m128 out[2], factor = _mm_set1_ps(1.0f / S32_SCALE_I2F);
|
|
out[0] = _mm_cvtsi32_ss(factor, s[0]);
|
|
out[1] = _mm_cvtsi32_ss(factor, s[1]);
|
|
out[0] = _mm_mul_ss(out[0], factor);
|
|
out[1] = _mm_mul_ss(out[1], factor);
|
|
_mm_store_ss(&d0[n], out[0]);
|
|
_mm_store_ss(&d1[n], out[1]);
|
|
s += n_channels;
|
|
}
|
|
}
|
|
|
|
static void
|
|
conv_s32_to_f32d_1s_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
|
|
uint32_t n_channels, uint32_t n_samples)
|
|
{
|
|
const int32_t *s = src;
|
|
float *d0 = dst[0];
|
|
uint32_t n, unrolled;
|
|
__m256i in[2];
|
|
__m256 out[2], factor = _mm256_set1_ps(1.0f / S32_SCALE_I2F);
|
|
|
|
if (SPA_IS_ALIGNED(d0, 32))
|
|
unrolled = n_samples & ~7;
|
|
else
|
|
unrolled = 0;
|
|
|
|
for(n = 0; n < unrolled; n += 8) {
|
|
in[0] = _mm256_setr_epi32(
|
|
s[0*n_channels], s[1*n_channels],
|
|
s[2*n_channels], s[3*n_channels],
|
|
s[4*n_channels], s[5*n_channels],
|
|
s[6*n_channels], s[7*n_channels]);
|
|
out[0] = _mm256_cvtepi32_ps(in[0]);
|
|
out[0] = _mm256_mul_ps(out[0], factor);
|
|
_mm256_store_ps(&d0[n+0], out[0]);
|
|
s += 8*n_channels;
|
|
}
|
|
for(; n < n_samples; n++) {
|
|
__m128 out, factor = _mm_set1_ps(1.0f / S32_SCALE_I2F);
|
|
out = _mm_cvtsi32_ss(factor, s[0]);
|
|
out = _mm_mul_ss(out, factor);
|
|
_mm_store_ss(&d0[n], out);
|
|
s += n_channels;
|
|
}
|
|
}
|
|
|
|
static void
|
|
conv_s32_to_f32d_4s_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
|
|
uint32_t n_channels, uint32_t n_samples)
|
|
{
|
|
const int32_t *s = src;
|
|
float *d0 = dst[0], *d1 = dst[1], *d2 = dst[2], *d3 = dst[3];
|
|
uint32_t n, unrolled;
|
|
__m256i in[4];
|
|
__m256 out[4], t[4], factor = _mm256_set1_ps(1.0f / S32_SCALE_I2F);
|
|
|
|
if (SPA_IS_ALIGNED(d0, 32) &&
|
|
SPA_IS_ALIGNED(d1, 32) &&
|
|
SPA_IS_ALIGNED(d2, 32) &&
|
|
SPA_IS_ALIGNED(d3, 32))
|
|
unrolled = n_samples & ~7;
|
|
else
|
|
unrolled = 0;
|
|
|
|
for(n = 0; n < unrolled; n += 8) {
|
|
in[0] = _mm256_setr_m128i(
|
|
_mm_loadu_si128((__m128i*)&s[0*n_channels]),
|
|
_mm_loadu_si128((__m128i*)&s[4*n_channels]));
|
|
in[1] = _mm256_setr_m128i(
|
|
_mm_loadu_si128((__m128i*)&s[1*n_channels]),
|
|
_mm_loadu_si128((__m128i*)&s[5*n_channels]));
|
|
in[2] = _mm256_setr_m128i(
|
|
_mm_loadu_si128((__m128i*)&s[2*n_channels]),
|
|
_mm_loadu_si128((__m128i*)&s[6*n_channels]));
|
|
in[3] = _mm256_setr_m128i(
|
|
_mm_loadu_si128((__m128i*)&s[3*n_channels]),
|
|
_mm_loadu_si128((__m128i*)&s[7*n_channels]));
|
|
|
|
out[0] = _mm256_cvtepi32_ps(in[0]); /* a0 b0 c0 d0 a4 b4 c4 d4 */
|
|
out[1] = _mm256_cvtepi32_ps(in[1]); /* a1 b1 c1 d1 a5 b5 c5 d5 */
|
|
out[2] = _mm256_cvtepi32_ps(in[2]); /* a2 b2 c2 d2 a6 b6 c6 d6 */
|
|
out[3] = _mm256_cvtepi32_ps(in[3]); /* a3 b3 c3 d3 a7 b7 c7 d7 */
|
|
|
|
out[0] = _mm256_mul_ps(out[0], factor);
|
|
out[1] = _mm256_mul_ps(out[1], factor);
|
|
out[2] = _mm256_mul_ps(out[2], factor);
|
|
out[3] = _mm256_mul_ps(out[3], factor);
|
|
|
|
t[0] = _mm256_unpacklo_ps(out[0], out[2]); /* a0 a2 b0 b2 a4 a6 b4 b6 */
|
|
t[1] = _mm256_unpackhi_ps(out[0], out[2]); /* c0 c2 d0 d2 c4 c6 d4 d6 */
|
|
t[2] = _mm256_unpacklo_ps(out[1], out[3]); /* a1 a3 b1 b3 a5 a7 b5 b7 */
|
|
t[3] = _mm256_unpackhi_ps(out[1], out[3]); /* c1 c3 d1 d3 c5 c7 d5 d7 */
|
|
|
|
out[0] = _mm256_unpacklo_ps(t[0], t[2]); /* a0 a1 a2 a3 a4 a5 a6 a7 */
|
|
out[1] = _mm256_unpackhi_ps(t[0], t[2]); /* b0 b1 b2 b3 b4 b5 b6 b7 */
|
|
out[2] = _mm256_unpacklo_ps(t[1], t[3]); /* c0 c1 c2 c3 c4 c5 c6 c7 */
|
|
out[3] = _mm256_unpackhi_ps(t[1], t[3]); /* d0 d1 d2 d3 d4 d5 d6 d7 */
|
|
|
|
_mm256_store_ps(&d0[n], out[0]);
|
|
_mm256_store_ps(&d1[n], out[1]);
|
|
_mm256_store_ps(&d2[n], out[2]);
|
|
_mm256_store_ps(&d3[n], out[3]);
|
|
|
|
s += 8*n_channels;
|
|
}
|
|
for(; n < n_samples; n++) {
|
|
__m128 out[4], factor = _mm_set1_ps(1.0f / S32_SCALE_I2F);
|
|
__m128i in[1];
|
|
in[0] = _mm_setr_epi32(s[0], s[1], s[2], s[3]);
|
|
out[0] = _mm_cvtepi32_ps(in[0]);
|
|
out[0] = _mm_mul_ps(out[0], factor);
|
|
_MM_STOREM_PS(&d0[n], &d1[n], &d2[n], &d3[n], out[0]);
|
|
s += n_channels;
|
|
}
|
|
}
|
|
|
|
void
|
|
conv_s32_to_f32d_avx2(struct convert *conv, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[],
|
|
uint32_t n_samples)
|
|
{
|
|
const int32_t *s = src[0];
|
|
uint32_t i = 0, n_channels = conv->n_channels;
|
|
|
|
if (conv->cpu_flags & SPA_CPU_FLAG_SLOW_GATHER) {
|
|
for(; i + 3 < n_channels; i += 4)
|
|
conv_s32_to_f32d_4s_avx2(conv, &dst[i], &s[i], n_channels, n_samples);
|
|
for(; i + 1 < n_channels; i += 2)
|
|
conv_s32_to_f32d_2s_avx2(conv, &dst[i], &s[i], n_channels, n_samples);
|
|
for(; i < n_channels; i++)
|
|
conv_s32_to_f32d_1s_avx2(conv, &dst[i], &s[i], n_channels, n_samples);
|
|
} else {
|
|
for(; i + 3 < n_channels; i += 4)
|
|
conv_s32_to_f32d_4s_gather_avx2(conv, &dst[i], &s[i], n_channels, n_samples);
|
|
for(; i + 1 < n_channels; i += 2)
|
|
conv_s32_to_f32d_2s_gather_avx2(conv, &dst[i], &s[i], n_channels, n_samples);
|
|
for(; i < n_channels; i++)
|
|
conv_s32_to_f32d_1s_gather_avx2(conv, &dst[i], &s[i], n_channels, n_samples);
|
|
}
|
|
}
|
|
|
|
static void
|
|
conv_f32d_to_s32_1s_avx2(void *data, void * SPA_RESTRICT dst, const void * SPA_RESTRICT src[],
|
|
uint32_t n_channels, uint32_t n_samples)
|
|
{
|
|
const float *s0 = src[0];
|
|
int32_t *d = dst;
|
|
uint32_t n, unrolled;
|
|
__m128 in[1];
|
|
__m128i out[4];
|
|
__m128 scale = _mm_set1_ps(S32_SCALE_F2I);
|
|
__m128 int_min = _mm_set1_ps(S32_MIN_F2I);
|
|
__m128 int_max = _mm_set1_ps(S32_MAX_F2I);
|
|
|
|
if (SPA_IS_ALIGNED(s0, 16))
|
|
unrolled = n_samples & ~3;
|
|
else
|
|
unrolled = 0;
|
|
|
|
for(n = 0; n < unrolled; n += 4) {
|
|
in[0] = _mm_mul_ps(_mm_load_ps(&s0[n]), scale);
|
|
in[0] = _MM_CLAMP_PS(in[0], int_min, int_max);
|
|
out[0] = _mm_cvtps_epi32(in[0]);
|
|
_MM_STOREM_EPI32(&d[0*n_channels],
|
|
&d[1*n_channels],
|
|
&d[2*n_channels],
|
|
&d[3*n_channels], out[0]);
|
|
d += 4*n_channels;
|
|
}
|
|
for(; n < n_samples; n++) {
|
|
in[0] = _mm_load_ss(&s0[n]);
|
|
in[0] = _mm_mul_ss(in[0], scale);
|
|
in[0] = _MM_CLAMP_SS(in[0], int_min, int_max);
|
|
*d = _mm_cvtss_si32(in[0]);
|
|
d += n_channels;
|
|
}
|
|
}
|
|
|
|
#define spa_write_unaligned(ptr, type, val) \
|
|
__extension__ ({ \
|
|
__typeof__(type) _val = (val); \
|
|
memcpy((ptr), &_val, sizeof(_val)); \
|
|
})
|
|
|
|
static void
|
|
conv_f32d_to_s32_2s_avx2(void *data, void * SPA_RESTRICT dst, const void * SPA_RESTRICT src[],
|
|
uint32_t n_channels, uint32_t n_samples)
|
|
{
|
|
const float *s0 = src[0], *s1 = src[1];
|
|
int32_t *d = dst;
|
|
uint32_t n, unrolled;
|
|
__m256 in[2];
|
|
__m256i out[2], t[2];
|
|
__m256 scale = _mm256_set1_ps(S32_SCALE_F2I);
|
|
__m256 int_min = _mm256_set1_ps(S32_MIN_F2I);
|
|
__m256 int_max = _mm256_set1_ps(S32_MAX_F2I);
|
|
|
|
if (SPA_IS_ALIGNED(s0, 32) &&
|
|
SPA_IS_ALIGNED(s1, 32))
|
|
unrolled = n_samples & ~7;
|
|
else
|
|
unrolled = 0;
|
|
|
|
for(n = 0; n < unrolled; n += 8) {
|
|
in[0] = _mm256_mul_ps(_mm256_load_ps(&s0[n]), scale);
|
|
in[1] = _mm256_mul_ps(_mm256_load_ps(&s1[n]), scale);
|
|
|
|
in[0] = _MM256_CLAMP_PS(in[0], int_min, int_max);
|
|
in[1] = _MM256_CLAMP_PS(in[1], int_min, int_max);
|
|
|
|
out[0] = _mm256_cvtps_epi32(in[0]); /* a0 a1 a2 a3 a4 a5 a6 a7 */
|
|
out[1] = _mm256_cvtps_epi32(in[1]); /* b0 b1 b2 b3 b4 b5 b6 b7 */
|
|
|
|
t[0] = _mm256_unpacklo_epi32(out[0], out[1]); /* a0 b0 a1 b1 a4 b4 a5 b5 */
|
|
t[1] = _mm256_unpackhi_epi32(out[0], out[1]); /* a2 b2 a3 b3 a6 b6 a7 b7 */
|
|
|
|
#ifdef __x86_64__
|
|
spa_write_unaligned(d + 0*n_channels, uint64_t, _mm256_extract_epi64(t[0], 0));
|
|
spa_write_unaligned(d + 1*n_channels, uint64_t, _mm256_extract_epi64(t[0], 1));
|
|
spa_write_unaligned(d + 2*n_channels, uint64_t, _mm256_extract_epi64(t[1], 0));
|
|
spa_write_unaligned(d + 3*n_channels, uint64_t, _mm256_extract_epi64(t[1], 1));
|
|
spa_write_unaligned(d + 4*n_channels, uint64_t, _mm256_extract_epi64(t[0], 2));
|
|
spa_write_unaligned(d + 5*n_channels, uint64_t, _mm256_extract_epi64(t[0], 3));
|
|
spa_write_unaligned(d + 6*n_channels, uint64_t, _mm256_extract_epi64(t[1], 2));
|
|
spa_write_unaligned(d + 7*n_channels, uint64_t, _mm256_extract_epi64(t[1], 3));
|
|
#else
|
|
_mm_storel_pi((__m64*)(d + 0*n_channels), (__m128)_mm256_extracti128_si256(t[0], 0));
|
|
_mm_storeh_pi((__m64*)(d + 1*n_channels), (__m128)_mm256_extracti128_si256(t[0], 0));
|
|
_mm_storel_pi((__m64*)(d + 2*n_channels), (__m128)_mm256_extracti128_si256(t[1], 0));
|
|
_mm_storeh_pi((__m64*)(d + 3*n_channels), (__m128)_mm256_extracti128_si256(t[1], 0));
|
|
_mm_storel_pi((__m64*)(d + 4*n_channels), (__m128)_mm256_extracti128_si256(t[0], 1));
|
|
_mm_storeh_pi((__m64*)(d + 5*n_channels), (__m128)_mm256_extracti128_si256(t[0], 1));
|
|
_mm_storel_pi((__m64*)(d + 6*n_channels), (__m128)_mm256_extracti128_si256(t[1], 1));
|
|
_mm_storeh_pi((__m64*)(d + 7*n_channels), (__m128)_mm256_extracti128_si256(t[1], 1));
|
|
#endif
|
|
d += 8*n_channels;
|
|
}
|
|
for(; n < n_samples; n++) {
|
|
__m128 in[2];
|
|
__m128i out[2];
|
|
__m128 scale = _mm_set1_ps(S32_SCALE_F2I);
|
|
__m128 int_min = _mm_set1_ps(S32_MIN_F2I);
|
|
__m128 int_max = _mm_set1_ps(S32_MAX_F2I);
|
|
|
|
in[0] = _mm_load_ss(&s0[n]);
|
|
in[1] = _mm_load_ss(&s1[n]);
|
|
|
|
in[0] = _mm_unpacklo_ps(in[0], in[1]);
|
|
|
|
in[0] = _mm_mul_ps(in[0], scale);
|
|
in[0] = _MM_CLAMP_PS(in[0], int_min, int_max);
|
|
out[0] = _mm_cvtps_epi32(in[0]);
|
|
_mm_storel_epi64((__m128i*)d, out[0]);
|
|
d += n_channels;
|
|
}
|
|
}
|
|
|
|
static void
|
|
conv_f32d_to_s32_4s_avx2(void *data, void * SPA_RESTRICT dst, const void * SPA_RESTRICT src[],
|
|
uint32_t n_channels, uint32_t n_samples)
|
|
{
|
|
const float *s0 = src[0], *s1 = src[1], *s2 = src[2], *s3 = src[3];
|
|
int32_t *d = dst;
|
|
uint32_t n, unrolled;
|
|
__m256 in[4];
|
|
__m256i out[4], t[4];
|
|
__m256 scale = _mm256_set1_ps(S32_SCALE_F2I);
|
|
__m256 int_min = _mm256_set1_ps(S32_MIN_F2I);
|
|
__m256 int_max = _mm256_set1_ps(S32_MAX_F2I);
|
|
|
|
if (SPA_IS_ALIGNED(s0, 32) &&
|
|
SPA_IS_ALIGNED(s1, 32) &&
|
|
SPA_IS_ALIGNED(s2, 32) &&
|
|
SPA_IS_ALIGNED(s3, 32))
|
|
unrolled = n_samples & ~7;
|
|
else
|
|
unrolled = 0;
|
|
|
|
for(n = 0; n < unrolled; n += 8) {
|
|
in[0] = _mm256_mul_ps(_mm256_load_ps(&s0[n]), scale);
|
|
in[1] = _mm256_mul_ps(_mm256_load_ps(&s1[n]), scale);
|
|
in[2] = _mm256_mul_ps(_mm256_load_ps(&s2[n]), scale);
|
|
in[3] = _mm256_mul_ps(_mm256_load_ps(&s3[n]), scale);
|
|
|
|
in[0] = _MM256_CLAMP_PS(in[0], int_min, int_max);
|
|
in[1] = _MM256_CLAMP_PS(in[1], int_min, int_max);
|
|
in[2] = _MM256_CLAMP_PS(in[2], int_min, int_max);
|
|
in[3] = _MM256_CLAMP_PS(in[3], int_min, int_max);
|
|
|
|
out[0] = _mm256_cvtps_epi32(in[0]); /* a0 a1 a2 a3 a4 a5 a6 a7 */
|
|
out[1] = _mm256_cvtps_epi32(in[1]); /* b0 b1 b2 b3 b4 b5 b6 b7 */
|
|
out[2] = _mm256_cvtps_epi32(in[2]); /* c0 c1 c2 c3 c4 c5 c6 c7 */
|
|
out[3] = _mm256_cvtps_epi32(in[3]); /* d0 d1 d2 d3 d4 d5 d6 d7 */
|
|
|
|
t[0] = _mm256_unpacklo_epi32(out[0], out[1]); /* a0 b0 a1 b1 a4 b4 a5 b5 */
|
|
t[1] = _mm256_unpackhi_epi32(out[0], out[1]); /* a2 b2 a3 b3 a6 b6 a7 b7 */
|
|
t[2] = _mm256_unpacklo_epi32(out[2], out[3]); /* c0 d0 c1 d1 c4 d4 c5 d5 */
|
|
t[3] = _mm256_unpackhi_epi32(out[2], out[3]); /* c2 d2 c3 d3 c6 d6 c7 d7 */
|
|
|
|
out[0] = _mm256_unpacklo_epi64(t[0], t[2]); /* a0 b0 c0 d0 a4 b4 c4 d4 */
|
|
out[1] = _mm256_unpackhi_epi64(t[0], t[2]); /* a1 b1 c1 d1 a5 b5 c5 d5 */
|
|
out[2] = _mm256_unpacklo_epi64(t[1], t[3]); /* a2 b2 c2 d2 a6 b6 c6 d6 */
|
|
out[3] = _mm256_unpackhi_epi64(t[1], t[3]); /* a3 b3 c3 d3 a7 b7 c7 d7 */
|
|
|
|
_mm_storeu_si128((__m128i*)(d + 0*n_channels), _mm256_extracti128_si256(out[0], 0));
|
|
_mm_storeu_si128((__m128i*)(d + 1*n_channels), _mm256_extracti128_si256(out[1], 0));
|
|
_mm_storeu_si128((__m128i*)(d + 2*n_channels), _mm256_extracti128_si256(out[2], 0));
|
|
_mm_storeu_si128((__m128i*)(d + 3*n_channels), _mm256_extracti128_si256(out[3], 0));
|
|
_mm_storeu_si128((__m128i*)(d + 4*n_channels), _mm256_extracti128_si256(out[0], 1));
|
|
_mm_storeu_si128((__m128i*)(d + 5*n_channels), _mm256_extracti128_si256(out[1], 1));
|
|
_mm_storeu_si128((__m128i*)(d + 6*n_channels), _mm256_extracti128_si256(out[2], 1));
|
|
_mm_storeu_si128((__m128i*)(d + 7*n_channels), _mm256_extracti128_si256(out[3], 1));
|
|
d += 8*n_channels;
|
|
}
|
|
for(; n < n_samples; n++) {
|
|
__m128 in[4];
|
|
__m128i out[4];
|
|
__m128 scale = _mm_set1_ps(S32_SCALE_F2I);
|
|
__m128 int_min = _mm_set1_ps(S32_MIN_F2I);
|
|
__m128 int_max = _mm_set1_ps(S32_MAX_F2I);
|
|
|
|
in[0] = _mm_setr_ps(s0[n], s1[n], s2[n], s3[n]);
|
|
in[0] = _mm_mul_ps(in[0], scale);
|
|
in[0] = _MM_CLAMP_PS(in[0], int_min, int_max);
|
|
out[0] = _mm_cvtps_epi32(in[0]);
|
|
_mm_storeu_si128((__m128i*)d, out[0]);
|
|
d += n_channels;
|
|
}
|
|
}
|
|
|
|
void
|
|
conv_f32d_to_s32_avx2(struct convert *conv, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[],
|
|
uint32_t n_samples)
|
|
{
|
|
int32_t *d = dst[0];
|
|
uint32_t i = 0, n_channels = conv->n_channels;
|
|
|
|
for(; i + 3 < n_channels; i += 4)
|
|
conv_f32d_to_s32_4s_avx2(conv, &d[i], &src[i], n_channels, n_samples);
|
|
for(; i + 1 < n_channels; i += 2)
|
|
conv_f32d_to_s32_2s_avx2(conv, &d[i], &src[i], n_channels, n_samples);
|
|
for(; i < n_channels; i++)
|
|
conv_f32d_to_s32_1s_avx2(conv, &d[i], &src[i], n_channels, n_samples);
|
|
}
|
|
|
|
static void
|
|
conv_f32d_to_s16_1s_avx2(void *data, void * SPA_RESTRICT dst, const void * SPA_RESTRICT src[],
|
|
uint32_t n_channels, uint32_t n_samples)
|
|
{
|
|
const float *s0 = src[0];
|
|
int16_t *d = dst;
|
|
uint32_t n, unrolled;
|
|
__m128 in[2];
|
|
__m128i out[2];
|
|
__m128 int_scale = _mm_set1_ps(S16_SCALE);
|
|
__m128 int_max = _mm_set1_ps(S16_MAX);
|
|
__m128 int_min = _mm_set1_ps(S16_MIN);
|
|
|
|
if (SPA_IS_ALIGNED(s0, 16))
|
|
unrolled = n_samples & ~7;
|
|
else
|
|
unrolled = 0;
|
|
|
|
for(n = 0; n < unrolled; n += 8) {
|
|
in[0] = _mm_mul_ps(_mm_load_ps(&s0[n]), int_scale);
|
|
in[1] = _mm_mul_ps(_mm_load_ps(&s0[n+4]), int_scale);
|
|
out[0] = _mm_cvtps_epi32(in[0]);
|
|
out[1] = _mm_cvtps_epi32(in[1]);
|
|
out[0] = _mm_packs_epi32(out[0], out[1]);
|
|
|
|
d[0*n_channels] = _mm_extract_epi16(out[0], 0);
|
|
d[1*n_channels] = _mm_extract_epi16(out[0], 1);
|
|
d[2*n_channels] = _mm_extract_epi16(out[0], 2);
|
|
d[3*n_channels] = _mm_extract_epi16(out[0], 3);
|
|
d[4*n_channels] = _mm_extract_epi16(out[0], 4);
|
|
d[5*n_channels] = _mm_extract_epi16(out[0], 5);
|
|
d[6*n_channels] = _mm_extract_epi16(out[0], 6);
|
|
d[7*n_channels] = _mm_extract_epi16(out[0], 7);
|
|
d += 8*n_channels;
|
|
}
|
|
for(; n < n_samples; n++) {
|
|
in[0] = _mm_mul_ss(_mm_load_ss(&s0[n]), int_scale);
|
|
in[0] = _MM_CLAMP_SS(in[0], int_min, int_max);
|
|
*d = _mm_cvtss_si32(in[0]);
|
|
d += n_channels;
|
|
}
|
|
}
|
|
|
|
static void
|
|
conv_f32d_to_s16_2s_avx2(void *data, void * SPA_RESTRICT dst, const void * SPA_RESTRICT src[],
|
|
uint32_t n_channels, uint32_t n_samples)
|
|
{
|
|
const float *s0 = src[0], *s1 = src[1];
|
|
int16_t *d = dst;
|
|
uint32_t n, unrolled;
|
|
__m256 in[2];
|
|
__m256i out[4], t[2];
|
|
__m256 int_scale = _mm256_set1_ps(S16_SCALE);
|
|
|
|
if (SPA_IS_ALIGNED(s0, 32) &&
|
|
SPA_IS_ALIGNED(s1, 32))
|
|
unrolled = n_samples & ~15;
|
|
else
|
|
unrolled = 0;
|
|
|
|
for(n = 0; n < unrolled; n += 8) {
|
|
in[0] = _mm256_mul_ps(_mm256_load_ps(&s0[n+0]), int_scale);
|
|
in[1] = _mm256_mul_ps(_mm256_load_ps(&s1[n+0]), int_scale);
|
|
|
|
out[0] = _mm256_cvtps_epi32(in[0]); /* a0 a1 a2 a3 a4 a5 a6 a7 */
|
|
out[1] = _mm256_cvtps_epi32(in[1]); /* b0 b1 b2 b3 b4 b5 b6 b7 */
|
|
|
|
t[0] = _mm256_unpacklo_epi32(out[0], out[1]); /* a0 b0 a1 b1 a4 b4 a5 b5 */
|
|
t[1] = _mm256_unpackhi_epi32(out[0], out[1]); /* a2 b2 a3 b3 a6 b6 a7 b7 */
|
|
|
|
out[0] = _mm256_packs_epi32(t[0], t[1]); /* a0 b0 a1 b1 a2 b2 a3 b3 a4 b4 a5 b5 a6 b6 a7 b7 */
|
|
|
|
spa_write_unaligned(d + 0*n_channels, uint32_t, _mm256_extract_epi32(out[0],0));
|
|
spa_write_unaligned(d + 1*n_channels, uint32_t, _mm256_extract_epi32(out[0],1));
|
|
spa_write_unaligned(d + 2*n_channels, uint32_t, _mm256_extract_epi32(out[0],2));
|
|
spa_write_unaligned(d + 3*n_channels, uint32_t, _mm256_extract_epi32(out[0],3));
|
|
spa_write_unaligned(d + 4*n_channels, uint32_t, _mm256_extract_epi32(out[0],4));
|
|
spa_write_unaligned(d + 5*n_channels, uint32_t, _mm256_extract_epi32(out[0],5));
|
|
spa_write_unaligned(d + 6*n_channels, uint32_t, _mm256_extract_epi32(out[0],6));
|
|
spa_write_unaligned(d + 7*n_channels, uint32_t, _mm256_extract_epi32(out[0],7));
|
|
|
|
d += 8*n_channels;
|
|
}
|
|
for(; n < n_samples; n++) {
|
|
__m128 in[2];
|
|
__m128 int_scale = _mm_set1_ps(S16_SCALE);
|
|
__m128 int_max = _mm_set1_ps(S16_MAX);
|
|
__m128 int_min = _mm_set1_ps(S16_MIN);
|
|
|
|
in[0] = _mm_mul_ss(_mm_load_ss(&s0[n]), int_scale);
|
|
in[1] = _mm_mul_ss(_mm_load_ss(&s1[n]), int_scale);
|
|
in[0] = _MM_CLAMP_SS(in[0], int_min, int_max);
|
|
in[1] = _MM_CLAMP_SS(in[1], int_min, int_max);
|
|
d[0] = _mm_cvtss_si32(in[0]);
|
|
d[1] = _mm_cvtss_si32(in[1]);
|
|
d += n_channels;
|
|
}
|
|
}
|
|
|
|
static void
|
|
conv_f32d_to_s16_4s_avx2(void *data, void * SPA_RESTRICT dst, const void * SPA_RESTRICT src[],
|
|
uint32_t n_channels, uint32_t n_samples)
|
|
{
|
|
const float *s0 = src[0], *s1 = src[1], *s2 = src[2], *s3 = src[3];
|
|
int16_t *d = dst;
|
|
uint32_t n, unrolled;
|
|
__m256 in[4];
|
|
__m256i out[4], t[4];
|
|
__m256 int_scale = _mm256_set1_ps(S16_SCALE);
|
|
|
|
if (SPA_IS_ALIGNED(s0, 32) &&
|
|
SPA_IS_ALIGNED(s1, 32) &&
|
|
SPA_IS_ALIGNED(s2, 32) &&
|
|
SPA_IS_ALIGNED(s3, 32))
|
|
unrolled = n_samples & ~7;
|
|
else
|
|
unrolled = 0;
|
|
|
|
for(n = 0; n < unrolled; n += 8) {
|
|
in[0] = _mm256_mul_ps(_mm256_load_ps(&s0[n]), int_scale);
|
|
in[1] = _mm256_mul_ps(_mm256_load_ps(&s1[n]), int_scale);
|
|
in[2] = _mm256_mul_ps(_mm256_load_ps(&s2[n]), int_scale);
|
|
in[3] = _mm256_mul_ps(_mm256_load_ps(&s3[n]), int_scale);
|
|
|
|
t[0] = _mm256_cvtps_epi32(in[0]); /* a0 a1 a2 a3 a4 a5 a6 a7 */
|
|
t[1] = _mm256_cvtps_epi32(in[1]); /* b0 b1 b2 b3 b4 b5 b6 b7 */
|
|
t[2] = _mm256_cvtps_epi32(in[2]); /* c0 c1 c2 c3 c4 c5 c6 c7 */
|
|
t[3] = _mm256_cvtps_epi32(in[3]); /* d0 d1 d2 d3 d4 d5 d6 d7 */
|
|
|
|
t[0] = _mm256_packs_epi32(t[0], t[2]); /* a0 a1 a2 a3 c0 c1 c2 c3 a4 a5 a6 a7 c4 c5 c6 c7 */
|
|
t[1] = _mm256_packs_epi32(t[1], t[3]); /* b0 b1 b2 b3 d0 d1 d2 d3 b4 b5 b6 b7 d4 d5 d6 d7 */
|
|
|
|
out[0] = _mm256_unpacklo_epi16(t[0], t[1]); /* a0 b0 a1 b1 a2 b2 a3 b3 a4 b4 a5 b5 a6 b6 a7 b7 */
|
|
out[1] = _mm256_unpackhi_epi16(t[0], t[1]); /* c0 d0 c1 d1 c2 d2 c3 d3 c4 d4 c5 d5 c6 d6 c7 d7 */
|
|
|
|
out[2] = _mm256_unpacklo_epi32(out[0], out[1]); /* a0 b0 c0 d0 a1 b1 c1 d1 a4 b4 c4 d4 a5 b5 c5 d5 */
|
|
out[3] = _mm256_unpackhi_epi32(out[0], out[1]); /* a2 b2 c2 d2 a3 b3 c3 d3 a6 b6 c6 d6 a7 b7 c7 d7 */
|
|
|
|
#ifdef __x86_64__
|
|
spa_write_unaligned(d + 0*n_channels, uint64_t, _mm256_extract_epi64(out[2], 0)); /* a0 b0 c0 d0 */
|
|
spa_write_unaligned(d + 1*n_channels, uint64_t, _mm256_extract_epi64(out[2], 1)); /* a1 b1 c1 d1 */
|
|
spa_write_unaligned(d + 2*n_channels, uint64_t, _mm256_extract_epi64(out[3], 0)); /* a2 b2 c2 d2 */
|
|
spa_write_unaligned(d + 3*n_channels, uint64_t, _mm256_extract_epi64(out[3], 1)); /* a3 b3 c3 d3 */
|
|
spa_write_unaligned(d + 4*n_channels, uint64_t, _mm256_extract_epi64(out[2], 2)); /* a4 b4 c4 d4 */
|
|
spa_write_unaligned(d + 5*n_channels, uint64_t, _mm256_extract_epi64(out[2], 3)); /* a5 b5 c5 d5 */
|
|
spa_write_unaligned(d + 6*n_channels, uint64_t, _mm256_extract_epi64(out[3], 2)); /* a6 b6 c6 d6 */
|
|
spa_write_unaligned(d + 7*n_channels, uint64_t, _mm256_extract_epi64(out[3], 3)); /* a7 b7 c7 d7 */
|
|
#else
|
|
_mm_storel_pi((__m64*)(d + 0*n_channels), (__m128)_mm256_extracti128_si256(out[2], 0));
|
|
_mm_storeh_pi((__m64*)(d + 1*n_channels), (__m128)_mm256_extracti128_si256(out[2], 0));
|
|
_mm_storel_pi((__m64*)(d + 2*n_channels), (__m128)_mm256_extracti128_si256(out[3], 0));
|
|
_mm_storeh_pi((__m64*)(d + 3*n_channels), (__m128)_mm256_extracti128_si256(out[3], 0));
|
|
_mm_storel_pi((__m64*)(d + 4*n_channels), (__m128)_mm256_extracti128_si256(out[2], 1));
|
|
_mm_storeh_pi((__m64*)(d + 5*n_channels), (__m128)_mm256_extracti128_si256(out[2], 1));
|
|
_mm_storel_pi((__m64*)(d + 6*n_channels), (__m128)_mm256_extracti128_si256(out[3], 1));
|
|
_mm_storeh_pi((__m64*)(d + 7*n_channels), (__m128)_mm256_extracti128_si256(out[3], 1));
|
|
#endif
|
|
|
|
d += 8*n_channels;
|
|
}
|
|
for(; n < n_samples; n++) {
|
|
__m128 in[4];
|
|
__m128 int_scale = _mm_set1_ps(S16_SCALE);
|
|
__m128 int_max = _mm_set1_ps(S16_MAX);
|
|
__m128 int_min = _mm_set1_ps(S16_MIN);
|
|
|
|
in[0] = _mm_setr_ps(s0[n], s1[n], s2[n], s3[n]);
|
|
in[0] = _mm_mul_ps(in[0], int_scale);
|
|
in[0] = _MM_CLAMP_PS(in[0], int_min, int_max);
|
|
|
|
_MM_TRANS_1x4_PS(in[0], in[1], in[2], in[3]);
|
|
d[0] = _mm_cvtss_si32(in[0]);
|
|
d[1] = _mm_cvtss_si32(in[1]);
|
|
d[2] = _mm_cvtss_si32(in[2]);
|
|
d[3] = _mm_cvtss_si32(in[3]);
|
|
|
|
d += n_channels;
|
|
}
|
|
}
|
|
|
|
void
|
|
conv_f32d_to_s16_avx2(struct convert *conv, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[],
|
|
uint32_t n_samples)
|
|
{
|
|
int16_t *d = dst[0];
|
|
uint32_t i = 0, n_channels = conv->n_channels;
|
|
|
|
for(; i + 3 < n_channels; i += 4)
|
|
conv_f32d_to_s16_4s_avx2(conv, &d[i], &src[i], n_channels, n_samples);
|
|
for(; i + 1 < n_channels; i += 2)
|
|
conv_f32d_to_s16_2s_avx2(conv, &d[i], &src[i], n_channels, n_samples);
|
|
for(; i < n_channels; i++)
|
|
conv_f32d_to_s16_1s_avx2(conv, &d[i], &src[i], n_channels, n_samples);
|
|
}
|
|
|
|
void
|
|
conv_f32d_to_s16_4_avx2(struct convert *conv, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[],
|
|
uint32_t n_samples)
|
|
{
|
|
const float *s0 = src[0], *s1 = src[1], *s2 = src[2], *s3 = src[3];
|
|
int16_t *d = dst[0];
|
|
uint32_t n, unrolled;
|
|
__m256 in[4];
|
|
__m256i out[4], t[4];
|
|
__m256 int_scale = _mm256_set1_ps(S16_SCALE);
|
|
|
|
if (SPA_IS_ALIGNED(s0, 32) &&
|
|
SPA_IS_ALIGNED(s1, 32) &&
|
|
SPA_IS_ALIGNED(s2, 32) &&
|
|
SPA_IS_ALIGNED(s3, 32))
|
|
unrolled = n_samples & ~7;
|
|
else
|
|
unrolled = 0;
|
|
|
|
for(n = 0; n < unrolled; n += 8) {
|
|
in[0] = _mm256_mul_ps(_mm256_load_ps(&s0[n]), int_scale);
|
|
in[1] = _mm256_mul_ps(_mm256_load_ps(&s1[n]), int_scale);
|
|
in[2] = _mm256_mul_ps(_mm256_load_ps(&s2[n]), int_scale);
|
|
in[3] = _mm256_mul_ps(_mm256_load_ps(&s3[n]), int_scale);
|
|
|
|
t[0] = _mm256_cvtps_epi32(in[0]); /* a0 a1 a2 a3 a4 a5 a6 a7 */
|
|
t[1] = _mm256_cvtps_epi32(in[1]); /* b0 b1 b2 b3 b4 b5 b6 b7 */
|
|
t[2] = _mm256_cvtps_epi32(in[2]); /* c0 c1 c2 c3 c4 c5 c6 c7 */
|
|
t[3] = _mm256_cvtps_epi32(in[3]); /* d0 d1 d2 d3 d4 d5 d6 d7 */
|
|
|
|
t[0] = _mm256_packs_epi32(t[0], t[2]); /* a0 a1 a2 a3 c0 c1 c2 c3 a4 a5 a6 a7 c4 c5 c6 c7 */
|
|
t[1] = _mm256_packs_epi32(t[1], t[3]); /* b0 b1 b2 b3 d0 d1 d2 d3 b4 b5 b6 b7 d4 d5 d6 d7 */
|
|
|
|
out[0] = _mm256_unpacklo_epi16(t[0], t[1]); /* a0 b0 a1 b1 a2 b2 a3 b3 a4 b4 a5 b5 a6 b6 a7 b7 */
|
|
out[1] = _mm256_unpackhi_epi16(t[0], t[1]); /* c0 d0 c1 d1 c2 d2 c3 d3 c4 d4 c5 d5 c6 d6 c7 d7 */
|
|
|
|
t[0] = _mm256_unpacklo_epi32(out[0], out[1]); /* a0 b0 c0 d0 a1 b1 c1 d1 a4 b4 c4 d4 a5 b5 c5 d5 */
|
|
t[2] = _mm256_unpackhi_epi32(out[0], out[1]); /* a2 b2 c2 d2 a3 b3 c3 d3 a6 b6 c6 d6 a7 b7 c7 d7 */
|
|
|
|
out[0] = _mm256_inserti128_si256(t[0], _mm256_extracti128_si256(t[2], 0), 1);
|
|
out[2] = _mm256_inserti128_si256(t[2], _mm256_extracti128_si256(t[0], 1), 0);
|
|
|
|
_mm256_store_si256((__m256i*)(d+0), out[0]);
|
|
_mm256_store_si256((__m256i*)(d+16), out[2]);
|
|
d += 32;
|
|
}
|
|
for(; n < n_samples; n++) {
|
|
__m128 in[4];
|
|
__m128 int_scale = _mm_set1_ps(S16_SCALE);
|
|
__m128 int_max = _mm_set1_ps(S16_MAX);
|
|
__m128 int_min = _mm_set1_ps(S16_MIN);
|
|
|
|
in[0] = _mm_setr_ps(s0[n], s1[n], s2[n], s3[n]);
|
|
in[0] = _mm_mul_ps(in[0], int_scale);
|
|
in[0] = _MM_CLAMP_PS(in[0], int_min, int_max);
|
|
_MM_TRANS_1x4_PS(in[0], in[1], in[2], in[3]);
|
|
d[0] = _mm_cvtss_si32(in[0]);
|
|
d[1] = _mm_cvtss_si32(in[1]);
|
|
d[2] = _mm_cvtss_si32(in[2]);
|
|
d[3] = _mm_cvtss_si32(in[3]);
|
|
d += 4;
|
|
}
|
|
}
|
|
void
|
|
conv_f32d_to_s16_2_avx2(struct convert *conv, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[],
|
|
uint32_t n_samples)
|
|
{
|
|
const float *s0 = src[0], *s1 = src[1];
|
|
int16_t *d = dst[0];
|
|
uint32_t n, unrolled;
|
|
__m256 in[4];
|
|
__m256i out[4], t[4];
|
|
__m256 int_scale = _mm256_set1_ps(S16_SCALE);
|
|
|
|
if (SPA_IS_ALIGNED(s0, 32) &&
|
|
SPA_IS_ALIGNED(s1, 32))
|
|
unrolled = n_samples & ~15;
|
|
else
|
|
unrolled = 0;
|
|
|
|
for(n = 0; n < unrolled; n += 16) {
|
|
in[0] = _mm256_mul_ps(_mm256_load_ps(&s0[n+0]), int_scale);
|
|
in[1] = _mm256_mul_ps(_mm256_load_ps(&s1[n+0]), int_scale);
|
|
in[2] = _mm256_mul_ps(_mm256_load_ps(&s0[n+8]), int_scale);
|
|
in[3] = _mm256_mul_ps(_mm256_load_ps(&s1[n+8]), int_scale);
|
|
|
|
out[0] = _mm256_cvtps_epi32(in[0]); /* a0 a1 a2 a3 a4 a5 a6 a7 */
|
|
out[1] = _mm256_cvtps_epi32(in[1]); /* b0 b1 b2 b3 b4 b5 b6 b7 */
|
|
out[2] = _mm256_cvtps_epi32(in[2]); /* a0 a1 a2 a3 a4 a5 a6 a7 */
|
|
out[3] = _mm256_cvtps_epi32(in[3]); /* b0 b1 b2 b3 b4 b5 b6 b7 */
|
|
|
|
t[0] = _mm256_unpacklo_epi32(out[0], out[1]); /* a0 b0 a1 b1 a4 b4 a5 b5 */
|
|
t[1] = _mm256_unpackhi_epi32(out[0], out[1]); /* a2 b2 a3 b3 a6 b6 a7 b7 */
|
|
t[2] = _mm256_unpacklo_epi32(out[2], out[3]); /* a0 b0 a1 b1 a4 b4 a5 b5 */
|
|
t[3] = _mm256_unpackhi_epi32(out[2], out[3]); /* a2 b2 a3 b3 a6 b6 a7 b7 */
|
|
|
|
out[0] = _mm256_packs_epi32(t[0], t[1]); /* a0 b0 a1 b1 a2 b2 a3 b3 a4 b4 a5 b5 a6 b6 a7 b7 */
|
|
out[1] = _mm256_packs_epi32(t[2], t[3]); /* a0 b0 a1 b1 a2 b2 a3 b3 a4 b4 a5 b5 a6 b6 a7 b7 */
|
|
|
|
_mm256_store_si256((__m256i*)(d+0), out[0]);
|
|
_mm256_store_si256((__m256i*)(d+16), out[1]);
|
|
|
|
d += 32;
|
|
}
|
|
for(; n < n_samples; n++) {
|
|
__m128 in[4];
|
|
__m128 int_scale = _mm_set1_ps(S16_SCALE);
|
|
__m128 int_max = _mm_set1_ps(S16_MAX);
|
|
__m128 int_min = _mm_set1_ps(S16_MIN);
|
|
|
|
in[0] = _mm_mul_ss(_mm_load_ss(&s0[n]), int_scale);
|
|
in[1] = _mm_mul_ss(_mm_load_ss(&s1[n]), int_scale);
|
|
in[0] = _MM_CLAMP_SS(in[0], int_min, int_max);
|
|
in[1] = _MM_CLAMP_SS(in[1], int_min, int_max);
|
|
d[0] = _mm_cvtss_si32(in[0]);
|
|
d[1] = _mm_cvtss_si32(in[1]);
|
|
d += 2;
|
|
}
|
|
}
|
|
|
|
void
|
|
conv_f32d_to_s16s_2_avx2(struct convert *conv, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[],
|
|
uint32_t n_samples)
|
|
{
|
|
const float *s0 = src[0], *s1 = src[1];
|
|
uint16_t *d = dst[0];
|
|
uint32_t n, unrolled;
|
|
__m256 in[4];
|
|
__m256i out[4], t[4];
|
|
__m256 int_scale = _mm256_set1_ps(S16_SCALE);
|
|
|
|
if (SPA_IS_ALIGNED(s0, 32) &&
|
|
SPA_IS_ALIGNED(s1, 32))
|
|
unrolled = n_samples & ~15;
|
|
else
|
|
unrolled = 0;
|
|
|
|
for(n = 0; n < unrolled; n += 16) {
|
|
in[0] = _mm256_mul_ps(_mm256_load_ps(&s0[n+0]), int_scale);
|
|
in[1] = _mm256_mul_ps(_mm256_load_ps(&s1[n+0]), int_scale);
|
|
in[2] = _mm256_mul_ps(_mm256_load_ps(&s0[n+8]), int_scale);
|
|
in[3] = _mm256_mul_ps(_mm256_load_ps(&s1[n+8]), int_scale);
|
|
|
|
out[0] = _mm256_cvtps_epi32(in[0]); /* a0 a1 a2 a3 a4 a5 a6 a7 */
|
|
out[1] = _mm256_cvtps_epi32(in[1]); /* b0 b1 b2 b3 b4 b5 b6 b7 */
|
|
out[2] = _mm256_cvtps_epi32(in[2]); /* a0 a1 a2 a3 a4 a5 a6 a7 */
|
|
out[3] = _mm256_cvtps_epi32(in[3]); /* b0 b1 b2 b3 b4 b5 b6 b7 */
|
|
|
|
t[0] = _mm256_unpacklo_epi32(out[0], out[1]); /* a0 b0 a1 b1 a4 b4 a5 b5 */
|
|
t[1] = _mm256_unpackhi_epi32(out[0], out[1]); /* a2 b2 a3 b3 a6 b6 a7 b7 */
|
|
t[2] = _mm256_unpacklo_epi32(out[2], out[3]); /* a0 b0 a1 b1 a4 b4 a5 b5 */
|
|
t[3] = _mm256_unpackhi_epi32(out[2], out[3]); /* a2 b2 a3 b3 a6 b6 a7 b7 */
|
|
|
|
out[0] = _mm256_packs_epi32(t[0], t[1]); /* a0 b0 a1 b1 a2 b2 a3 b3 a4 b4 a5 b5 a6 b6 a7 b7 */
|
|
out[1] = _mm256_packs_epi32(t[2], t[3]); /* a0 b0 a1 b1 a2 b2 a3 b3 a4 b4 a5 b5 a6 b6 a7 b7 */
|
|
out[0] = _MM256_BSWAP_EPI16(out[0]);
|
|
out[1] = _MM256_BSWAP_EPI16(out[1]);
|
|
|
|
_mm256_store_si256((__m256i*)(d+0), out[0]);
|
|
_mm256_store_si256((__m256i*)(d+16), out[1]);
|
|
|
|
d += 32;
|
|
}
|
|
for(; n < n_samples; n++) {
|
|
__m128 in[4];
|
|
__m128 int_scale = _mm_set1_ps(S16_SCALE);
|
|
__m128 int_max = _mm_set1_ps(S16_MAX);
|
|
__m128 int_min = _mm_set1_ps(S16_MIN);
|
|
|
|
in[0] = _mm_mul_ss(_mm_load_ss(&s0[n]), int_scale);
|
|
in[1] = _mm_mul_ss(_mm_load_ss(&s1[n]), int_scale);
|
|
in[0] = _MM_CLAMP_SS(in[0], int_min, int_max);
|
|
in[1] = _MM_CLAMP_SS(in[1], int_min, int_max);
|
|
d[0] = bswap_16((uint16_t)_mm_cvtss_si32(in[0]));
|
|
d[1] = bswap_16((uint16_t)_mm_cvtss_si32(in[1]));
|
|
d += 2;
|
|
}
|
|
}
|
|
|