mem: align memory to requested alignment

Improve the allocators to always align the buffer memory to the
requested alignment.
Use aligned reads and writes in the SSE functions and check the
alignment, falling back to the unaligned path when needed; both
patterns are sketched below.
Add more tests and benchmark cases.
Check and warn about misaligned memory in plugins.
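A minimal sketch of the allocator side of this change, assuming the
usual over-allocate-and-round-up strategy; alloc_aligned() and its
layout are illustrative assumptions, not the actual PipeWire code:

	#include <stdint.h>
	#include <stdlib.h>

	/* Hypothetical helper: over-allocate, then round the pointer up
	 * to the requested power-of-two alignment. A real allocator also
	 * keeps the original pointer (*base here) so it can be freed. */
	static void *alloc_aligned(size_t size, size_t align, void **base)
	{
		uint8_t *p = malloc(size + align - 1);
		if (p == NULL)
			return NULL;
		*base = p;
		return (void *) (((uintptr_t) p + align - 1) & ~(uintptr_t) (align - 1));
	}

The rounding only works for power-of-two alignments, which is what the
SSE paths below require (16 bytes).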
Wim Taymans 2019-01-24 18:28:52 +01:00
parent dd66469570
commit 13bf70a8dd
19 changed files with 736 additions and 516 deletions
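The SSE converters in the diff below all follow one gating pattern:
take the unrolled path with aligned loads/stores only when every
pointer involved is 16-byte aligned, otherwise leave all samples to
the scalar tail loop. A condensed sketch of that pattern, with a local
IS_ALIGNED macro standing in for SPA_IS_ALIGNED and a made-up
scale_f32() as the example operation:

	#include <stdint.h>
	#include <emmintrin.h>

	#define IS_ALIGNED(p, align) (((uintptr_t) (p) & ((align) - 1)) == 0)

	static void scale_f32(float *dst, const float *src, int n_samples, float factor)
	{
		int n, unrolled;
		__m128 f = _mm_set1_ps(factor);

		/* unrolled path only when aligned loads/stores are safe */
		if (IS_ALIGNED(src, 16) && IS_ALIGNED(dst, 16))
			unrolled = n_samples / 4;
		else
			unrolled = 0;

		/* 4 samples per iteration with aligned SSE loads/stores */
		for (n = 0; unrolled--; n += 4)
			_mm_store_ps(&dst[n], _mm_mul_ps(_mm_load_ps(&src[n]), f));
		/* scalar loop takes the remainder, or everything when misaligned */
		for (; n < n_samples; n++)
			dst[n] = src[n] * factor;
	}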

@@ -30,142 +30,148 @@
#include <emmintrin.h>
static void
-conv_s16_to_f32d_1_sse2(void *data, int n_dst, void *dst[n_dst], const void *src, int n_samples)
+conv_s16_to_f32d_1_sse2(void *data, void *dst[], const void *src, int n_channels, int n_samples)
{
const int16_t *s = src;
float **d = (float **) dst;
float *d0 = d[0];
-int n = 0, unrolled;
+int n, unrolled;
__m128i in;
__m128 out, factor = _mm_set1_ps(1.0f / S16_SCALE);
-unrolled = n_samples / 4;
-n_samples = n_samples & 3;
+if (SPA_IS_ALIGNED(d0, 16))
+unrolled = n_samples / 4;
+else
+unrolled = 0;
-for(; unrolled--; n += 4) {
-in = _mm_insert_epi16(in, s[0*n_dst], 1);
-in = _mm_insert_epi16(in, s[1*n_dst], 3);
-in = _mm_insert_epi16(in, s[2*n_dst], 5);
-in = _mm_insert_epi16(in, s[3*n_dst], 7);
+for(n = 0; unrolled--; n += 4) {
+in = _mm_insert_epi16(in, s[0*n_channels], 1);
+in = _mm_insert_epi16(in, s[1*n_channels], 3);
+in = _mm_insert_epi16(in, s[2*n_channels], 5);
+in = _mm_insert_epi16(in, s[3*n_channels], 7);
in = _mm_srai_epi32(in, 16);
out = _mm_cvtepi32_ps(in);
out = _mm_mul_ps(out, factor);
-_mm_storeu_ps(&d0[n], out);
-s += 4*n_dst;
+_mm_store_ps(&d0[n], out);
+s += 4*n_channels;
}
-for(; n_samples--; n++) {
+for(; n < n_samples; n++) {
out = _mm_cvtsi32_ss(out, s[0]);
out = _mm_mul_ss(out, factor);
_mm_store_ss(&d0[n], out);
-s += n_dst;
+s += n_channels;
}
}
static void
-conv_s16_to_f32d_2_sse2(void *data, int n_dst, void *dst[n_dst], const void *src, int n_samples)
+conv_s16_to_f32d_2_sse2(void *data, void *dst[], const void *src, int n_channels, int n_samples)
{
const int16_t *s = src;
float **d = (float **) dst;
float *d0 = d[0], *d1 = d[1];
-int n = 0, unrolled;
+int n, unrolled;
__m128i in, t[2];
__m128 out[2], factor = _mm_set1_ps(1.0f / S16_SCALE);
-if (n_dst == 2) {
+if (n_channels == 2 &&
+SPA_IS_ALIGNED(s, 16) &&
+SPA_IS_ALIGNED(d0, 16) &&
+SPA_IS_ALIGNED(d1, 16))
unrolled = n_samples / 4;
-n_samples = n_samples & 3;
else
unrolled = 0;
-for(; unrolled--; n += 4) {
-in = _mm_loadu_si128((__m128i*)s);
+for(n = 0; unrolled--; n += 4) {
+in = _mm_load_si128((__m128i*)s);
-t[0] = _mm_slli_epi32(in, 16);
-t[0] = _mm_srai_epi32(t[0], 16);
-t[1] = _mm_srai_epi32(in, 16);
+t[0] = _mm_slli_epi32(in, 16);
+t[0] = _mm_srai_epi32(t[0], 16);
+t[1] = _mm_srai_epi32(in, 16);
-out[0] = _mm_cvtepi32_ps(t[0]);
-out[0] = _mm_mul_ps(out[0], factor);
-out[1] = _mm_cvtepi32_ps(t[1]);
-out[1] = _mm_mul_ps(out[1], factor);
+out[0] = _mm_cvtepi32_ps(t[0]);
+out[0] = _mm_mul_ps(out[0], factor);
+out[1] = _mm_cvtepi32_ps(t[1]);
+out[1] = _mm_mul_ps(out[1], factor);
-_mm_storeu_ps(&d0[n], out[0]);
-_mm_storeu_ps(&d1[n], out[1]);
+_mm_store_ps(&d0[n], out[0]);
+_mm_store_ps(&d1[n], out[1]);
-s += 4*n_dst;
-}
+s += 4*n_channels;
+}
-for(; n_samples--; n++) {
+for(; n < n_samples; n++) {
out[0] = _mm_cvtsi32_ss(out[0], s[0]);
out[0] = _mm_mul_ss(out[0], factor);
out[1] = _mm_cvtsi32_ss(out[1], s[1]);
out[1] = _mm_mul_ss(out[1], factor);
_mm_store_ss(&d0[n], out[0]);
_mm_store_ss(&d1[n], out[1]);
-s += n_dst;
+s += n_channels;
}
}
static void
-conv_s16_to_f32d_sse2(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples)
+conv_s16_to_f32d_sse2(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
{
const int16_t *s = src[0];
int i = 0;
-for(; i + 1 < n_dst; i += 2)
-conv_s16_to_f32d_2_sse2(data, n_dst, &dst[i], &s[i], n_samples);
-for(; i < n_dst; i++)
-conv_s16_to_f32d_1_sse2(data, n_dst, &dst[i], &s[i], n_samples);
+for(; i + 1 < n_channels; i += 2)
+conv_s16_to_f32d_2_sse2(data, &dst[i], &s[i], n_channels, n_samples);
+for(; i < n_channels; i++)
+conv_s16_to_f32d_1_sse2(data, &dst[i], &s[i], n_channels, n_samples);
}
static void
-conv_s24_to_f32d_1_sse2(void *data, int n_dst, void *dst[n_dst], const void *src, int n_samples)
+conv_s24_to_f32d_1_sse2(void *data, void *dst[], const void *src, int n_channels, int n_samples)
{
const uint8_t *s = src;
float **d = (float **) dst;
float *d0 = d[0];
-int n = 0, unrolled;
+int n, unrolled;
__m128i in;
__m128 out, factor = _mm_set1_ps(1.0f / S24_SCALE);
-unrolled = n_samples / 4;
-n_samples = n_samples & 3;
-if (n_samples == 0) {
-n_samples += 4;
-unrolled--;
+if (SPA_IS_ALIGNED(d0, 16) && n_samples > 4) {
+unrolled = n_samples / 4;
+if ((n_samples & 3) == 0)
+unrolled--;
}
+else
+unrolled = 0;
-for(; unrolled--; n += 4) {
+for(n = 0; unrolled--; n += 4) {
in = _mm_setr_epi32(
-*((uint32_t*)&s[0 * n_dst]),
-*((uint32_t*)&s[3 * n_dst]),
-*((uint32_t*)&s[6 * n_dst]),
-*((uint32_t*)&s[9 * n_dst]));
+*((uint32_t*)&s[0 * n_channels]),
+*((uint32_t*)&s[3 * n_channels]),
+*((uint32_t*)&s[6 * n_channels]),
+*((uint32_t*)&s[9 * n_channels]));
in = _mm_slli_epi32(in, 8);
in = _mm_srai_epi32(in, 8);
out = _mm_cvtepi32_ps(in);
out = _mm_mul_ps(out, factor);
-_mm_storeu_ps(&d0[n], out);
-s += 12 * n_dst;
+_mm_store_ps(&d0[n], out);
+s += 12 * n_channels;
}
-for(; n_samples--; n++) {
+for(; n < n_samples; n++) {
out = _mm_cvtsi32_ss(out, read_s24(s));
out = _mm_mul_ss(out, factor);
_mm_store_ss(&d0[n], out);
-s += 3 * n_dst;
+s += 3 * n_channels;
}
}
static void
-conv_s24_to_f32d_sse2(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples)
+conv_s24_to_f32d_sse2(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
{
const int8_t *s = src[0];
int i = 0;
-for(; i < n_dst; i++)
-conv_s24_to_f32d_1_sse2(data, n_dst, &dst[i], &s[3*i], n_samples);
+for(; i < n_channels; i++)
+conv_s24_to_f32d_1_sse2(data, &dst[i], &s[3*i], n_channels, n_samples);
}
static void
-conv_f32d_to_s32_1_sse2(void *data, void *dst, int n_src, const void *src[n_src], int n_samples)
+conv_f32d_to_s32_1_sse2(void *data, void *dst, const void *src[], int n_channels, int n_samples)
{
const float **s = (const float **) src;
const float *s0 = s[0];
@@ -176,11 +182,13 @@ conv_f32d_to_s32_1_sse2(void *data, void *dst, int n_src, const void *src[n_src]
__m128 int_max = _mm_set1_ps(S24_MAX_F);
__m128 int_min = _mm_sub_ps(_mm_setzero_ps(), int_max);
-unrolled = n_samples / 4;
-n_samples = n_samples & 3;
+if (SPA_IS_ALIGNED(s0, 16))
+unrolled = n_samples / 4;
+else
+unrolled = 0;
for(n = 0; unrolled--; n += 4) {
-in[0] = _mm_mul_ps(_mm_loadu_ps(&s0[n]), int_max);
+in[0] = _mm_mul_ps(_mm_load_ps(&s0[n]), int_max);
in[0] = _mm_min_ps(int_max, _mm_max_ps(in[0], int_min));
out[0] = _mm_slli_epi32(_mm_cvtps_epi32(in[0]), 8);
@@ -188,23 +196,23 @@ conv_f32d_to_s32_1_sse2(void *data, void *dst, int n_src, const void *src[n_src]
out[2] = _mm_shuffle_epi32(out[0], _MM_SHUFFLE(1, 0, 3, 2));
out[3] = _mm_shuffle_epi32(out[0], _MM_SHUFFLE(2, 1, 0, 3));
-d[0*n_src] = _mm_cvtsi128_si32(out[0]);
-d[1*n_src] = _mm_cvtsi128_si32(out[1]);
-d[2*n_src] = _mm_cvtsi128_si32(out[2]);
-d[3*n_src] = _mm_cvtsi128_si32(out[3]);
-d += 4*n_src;
+d[0*n_channels] = _mm_cvtsi128_si32(out[0]);
+d[1*n_channels] = _mm_cvtsi128_si32(out[1]);
+d[2*n_channels] = _mm_cvtsi128_si32(out[2]);
+d[3*n_channels] = _mm_cvtsi128_si32(out[3]);
+d += 4*n_channels;
}
-for(; n_samples--; n++) {
+for(; n < n_samples; n++) {
in[0] = _mm_load_ss(&s0[n]);
in[0] = _mm_mul_ss(in[0], int_max);
in[0] = _mm_min_ss(int_max, _mm_max_ss(in[0], int_min));
*d = _mm_cvtss_si32(in[0]) << 8;
-d += n_src;
+d += n_channels;
}
}
static void
-conv_f32d_to_s32_2_sse2(void *data, void *dst, int n_src, const void *src[n_src], int n_samples)
+conv_f32d_to_s32_2_sse2(void *data, void *dst, const void *src[], int n_channels, int n_samples)
{
const float **s = (const float **) src;
const float *s0 = s[0], *s1 = s[1];
@@ -215,12 +223,15 @@ conv_f32d_to_s32_2_sse2(void *data, void *dst, int n_src, const void *src[n_src]
__m128 int_max = _mm_set1_ps(S24_MAX_F);
__m128 int_min = _mm_sub_ps(_mm_setzero_ps(), int_max);
-unrolled = n_samples / 4;
-n_samples = n_samples & 3;
+if (SPA_IS_ALIGNED(s0, 16) &&
+SPA_IS_ALIGNED(s1, 16))
+unrolled = n_samples / 4;
+else
+unrolled = 0;
for(n = 0; unrolled--; n += 4) {
-in[0] = _mm_mul_ps(_mm_loadu_ps(&s0[n]), int_max);
-in[1] = _mm_mul_ps(_mm_loadu_ps(&s1[n]), int_max);
+in[0] = _mm_mul_ps(_mm_load_ps(&s0[n]), int_max);
+in[1] = _mm_mul_ps(_mm_load_ps(&s1[n]), int_max);
in[0] = _mm_min_ps(int_max, _mm_max_ps(in[0], int_min));
in[1] = _mm_min_ps(int_max, _mm_max_ps(in[1], int_min));
@@ -233,13 +244,13 @@ conv_f32d_to_s32_2_sse2(void *data, void *dst, int n_src, const void *src[n_src]
t[2] = _mm_unpackhi_epi32(out[0], out[1]);
t[3] = _mm_shuffle_epi32(t[2], _MM_SHUFFLE(0, 0, 2, 2));
-_mm_storel_epi64((__m128i*)(d + 0*n_src), t[0]);
-_mm_storel_epi64((__m128i*)(d + 1*n_src), t[1]);
-_mm_storel_epi64((__m128i*)(d + 2*n_src), t[2]);
-_mm_storel_epi64((__m128i*)(d + 3*n_src), t[3]);
-d += 4*n_src;
+_mm_storel_epi64((__m128i*)(d + 0*n_channels), t[0]);
+_mm_storel_epi64((__m128i*)(d + 1*n_channels), t[1]);
+_mm_storel_epi64((__m128i*)(d + 2*n_channels), t[2]);
+_mm_storel_epi64((__m128i*)(d + 3*n_channels), t[3]);
+d += 4*n_channels;
}
-for(; n_samples--; n++) {
+for(; n < n_samples; n++) {
in[0] = _mm_load_ss(&s0[n]);
in[1] = _mm_load_ss(&s1[n]);
@@ -249,12 +260,12 @@ conv_f32d_to_s32_2_sse2(void *data, void *dst, int n_src, const void *src[n_src]
in[0] = _mm_min_ps(int_max, _mm_max_ps(in[0], int_min));
out[0] = _mm_slli_epi32(_mm_cvtps_epi32(in[0]), 8);
_mm_storel_epi64((__m128i*)d, out[0]);
-d += n_src;
+d += n_channels;
}
}
static void
-conv_f32d_to_s32_4_sse2(void *data, void *dst, int n_src, const void *src[n_src], int n_samples)
+conv_f32d_to_s32_4_sse2(void *data, void *dst, const void *src[], int n_channels, int n_samples)
{
const float **s = (const float **) src;
const float *s0 = s[0], *s1 = s[1], *s2 = s[2], *s3 = s[3];
@@ -265,14 +276,19 @@ conv_f32d_to_s32_4_sse2(void *data, void *dst, int n_src, const void *src[n_src]
__m128 int_max = _mm_set1_ps(S24_MAX_F);
__m128 int_min = _mm_sub_ps(_mm_setzero_ps(), int_max);
-unrolled = n_samples / 4;
-n_samples = n_samples & 3;
+if (SPA_IS_ALIGNED(s0, 16) &&
+SPA_IS_ALIGNED(s1, 16) &&
+SPA_IS_ALIGNED(s2, 16) &&
+SPA_IS_ALIGNED(s3, 16))
+unrolled = n_samples / 4;
+else
+unrolled = 0;
for(n = 0; unrolled--; n += 4) {
-in[0] = _mm_mul_ps(_mm_loadu_ps(&s0[n]), int_max);
-in[1] = _mm_mul_ps(_mm_loadu_ps(&s1[n]), int_max);
-in[2] = _mm_mul_ps(_mm_loadu_ps(&s2[n]), int_max);
-in[3] = _mm_mul_ps(_mm_loadu_ps(&s3[n]), int_max);
+in[0] = _mm_mul_ps(_mm_load_ps(&s0[n]), int_max);
+in[1] = _mm_mul_ps(_mm_load_ps(&s1[n]), int_max);
+in[2] = _mm_mul_ps(_mm_load_ps(&s2[n]), int_max);
+in[3] = _mm_mul_ps(_mm_load_ps(&s3[n]), int_max);
in[0] = _mm_min_ps(int_max, _mm_max_ps(in[0], int_min));
in[1] = _mm_min_ps(int_max, _mm_max_ps(in[1], int_min));
@@ -294,13 +310,13 @@ conv_f32d_to_s32_4_sse2(void *data, void *dst, int n_src, const void *src[n_src]
out[2] = _mm_unpacklo_epi64(t[2], t[3]);
out[3] = _mm_unpackhi_epi64(t[2], t[3]);
-_mm_storeu_si128((__m128i*)(d + 0*n_src), out[0]);
-_mm_storeu_si128((__m128i*)(d + 1*n_src), out[1]);
-_mm_storeu_si128((__m128i*)(d + 2*n_src), out[2]);
-_mm_storeu_si128((__m128i*)(d + 3*n_src), out[3]);
-d += 4*n_src;
+_mm_storeu_si128((__m128i*)(d + 0*n_channels), out[0]);
+_mm_storeu_si128((__m128i*)(d + 1*n_channels), out[1]);
+_mm_storeu_si128((__m128i*)(d + 2*n_channels), out[2]);
+_mm_storeu_si128((__m128i*)(d + 3*n_channels), out[3]);
+d += 4*n_channels;
}
-for(; n_samples--; n++) {
+for(; n < n_samples; n++) {
in[0] = _mm_load_ss(&s0[n]);
in[1] = _mm_load_ss(&s1[n]);
in[2] = _mm_load_ss(&s2[n]);
@@ -314,26 +330,26 @@ conv_f32d_to_s32_4_sse2(void *data, void *dst, int n_src, const void *src[n_src]
in[0] = _mm_min_ps(int_max, _mm_max_ps(in[0], int_min));
out[0] = _mm_slli_epi32(_mm_cvtps_epi32(in[0]), 8);
_mm_storeu_si128((__m128i*)d, out[0]);
-d += n_src;
+d += n_channels;
}
}
static void
-conv_f32d_to_s32_sse2(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples)
+conv_f32d_to_s32_sse2(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
{
int32_t *d = dst[0];
int i = 0;
-for(; i + 3 < n_src; i += 4)
-conv_f32d_to_s32_4_sse2(data, &d[i], n_src, &src[i], n_samples);
-for(; i + 1 < n_src; i += 2)
-conv_f32d_to_s32_2_sse2(data, &d[i], n_src, &src[i], n_samples);
-for(; i < n_src; i++)
-conv_f32d_to_s32_1_sse2(data, &d[i], n_src, &src[i], n_samples);
+for(; i + 3 < n_channels; i += 4)
+conv_f32d_to_s32_4_sse2(data, &d[i], &src[i], n_channels, n_samples);
+for(; i + 1 < n_channels; i += 2)
+conv_f32d_to_s32_2_sse2(data, &d[i], &src[i], n_channels, n_samples);
+for(; i < n_channels; i++)
+conv_f32d_to_s32_1_sse2(data, &d[i], &src[i], n_channels, n_samples);
}
static void
-conv_f32d_to_s16_1_sse2(void *data, void *dst, int n_src, const void *src[n_src], int n_samples)
+conv_f32d_to_s16_1_sse2(void *data, void *dst, const void *src[], int n_channels, int n_samples)
{
const float **s = (const float **) src;
const float *s0 = s[0];
@@ -344,52 +360,59 @@ conv_f32d_to_s16_1_sse2(void *data, void *dst, int n_src, const void *src[n_src]
__m128 int_max = _mm_set1_ps(S16_MAX_F);
__m128 int_min = _mm_sub_ps(_mm_setzero_ps(), int_max);
-unrolled = n_samples / 8;
-n_samples = n_samples & 7;
+if (SPA_IS_ALIGNED(s0, 16))
+unrolled = n_samples / 8;
+else
+unrolled = 0;
for(n = 0; unrolled--; n += 8) {
-in[0] = _mm_mul_ps(_mm_loadu_ps(&s0[n]), int_max);
-in[1] = _mm_mul_ps(_mm_loadu_ps(&s0[n+4]), int_max);
+in[0] = _mm_mul_ps(_mm_load_ps(&s0[n]), int_max);
+in[1] = _mm_mul_ps(_mm_load_ps(&s0[n+4]), int_max);
out[0] = _mm_cvtps_epi32(in[0]);
out[1] = _mm_cvtps_epi32(in[1]);
out[0] = _mm_packs_epi32(out[0], out[1]);
-d[0*n_src] = _mm_extract_epi16(out[0], 0);
-d[1*n_src] = _mm_extract_epi16(out[0], 1);
-d[2*n_src] = _mm_extract_epi16(out[0], 2);
-d[3*n_src] = _mm_extract_epi16(out[0], 3);
-d[4*n_src] = _mm_extract_epi16(out[0], 4);
-d[5*n_src] = _mm_extract_epi16(out[0], 5);
-d[6*n_src] = _mm_extract_epi16(out[0], 6);
-d[7*n_src] = _mm_extract_epi16(out[0], 7);
-d += 8*n_src;
+d[0*n_channels] = _mm_extract_epi16(out[0], 0);
+d[1*n_channels] = _mm_extract_epi16(out[0], 1);
+d[2*n_channels] = _mm_extract_epi16(out[0], 2);
+d[3*n_channels] = _mm_extract_epi16(out[0], 3);
+d[4*n_channels] = _mm_extract_epi16(out[0], 4);
+d[5*n_channels] = _mm_extract_epi16(out[0], 5);
+d[6*n_channels] = _mm_extract_epi16(out[0], 6);
+d[7*n_channels] = _mm_extract_epi16(out[0], 7);
+d += 8*n_channels;
}
-for(; n_samples--; n++) {
+for(; n < n_samples; n++) {
+fprintf(stderr, "%p %d %d %d\n", s0, n_samples, n, n_channels);
+spa_assert_not_reached();
in[0] = _mm_mul_ss(_mm_load_ss(&s0[n]), int_max);
in[0] = _mm_min_ss(int_max, _mm_max_ss(in[0], int_min));
*d = _mm_cvtss_si32(in[0]);
-d += n_src;
+d += n_channels;
}
}
static void
-conv_f32d_to_s16_2_sse2(void *data, void *dst, int n_src, const void *src[n_src], int n_samples)
+conv_f32d_to_s16_2_sse2(void *data, void *dst, const void *src[], int n_channels, int n_samples)
{
const float **s = (const float **) src;
const float *s0 = s[0], *s1 = s[1];
int16_t *d = dst;
-int n = 0, unrolled;
+int n, unrolled;
__m128 in[2];
__m128i out[4], t[2];
__m128 int_max = _mm_set1_ps(S16_MAX_F);
__m128 int_min = _mm_sub_ps(_mm_setzero_ps(), int_max);
-unrolled = n_samples / 4;
-n_samples = n_samples & 3;
+if (SPA_IS_ALIGNED(s0, 16) &&
+SPA_IS_ALIGNED(s1, 16))
+unrolled = n_samples / 4;
+else
+unrolled = 0;
-for(; unrolled--; n += 4) {
-in[0] = _mm_mul_ps(_mm_loadu_ps(&s0[n]), int_max);
-in[1] = _mm_mul_ps(_mm_loadu_ps(&s1[n]), int_max);
+for(n = 0; unrolled--; n += 4) {
+in[0] = _mm_mul_ps(_mm_load_ps(&s0[n]), int_max);
+in[1] = _mm_mul_ps(_mm_load_ps(&s1[n]), int_max);
t[0] = _mm_cvtps_epi32(in[0]);
t[1] = _mm_cvtps_epi32(in[1]);
@@ -402,31 +425,33 @@ conv_f32d_to_s16_2_sse2(void *data, void *dst, int n_src, const void *src[n_src]
out[2] = _mm_shuffle_epi32(out[0], _MM_SHUFFLE(1, 0, 3, 2));
out[3] = _mm_shuffle_epi32(out[0], _MM_SHUFFLE(2, 1, 0, 3));
-*((uint32_t*)(d + 0*n_src)) = _mm_cvtsi128_si32(out[0]);
-*((uint32_t*)(d + 1*n_src)) = _mm_cvtsi128_si32(out[1]);
-*((uint32_t*)(d + 2*n_src)) = _mm_cvtsi128_si32(out[2]);
-*((uint32_t*)(d + 3*n_src)) = _mm_cvtsi128_si32(out[3]);
-d += 4*n_src;
+*((int32_t*)(d + 0*n_channels)) = _mm_cvtsi128_si32(out[0]);
+*((int32_t*)(d + 1*n_channels)) = _mm_cvtsi128_si32(out[1]);
+*((int32_t*)(d + 2*n_channels)) = _mm_cvtsi128_si32(out[2]);
+*((int32_t*)(d + 3*n_channels)) = _mm_cvtsi128_si32(out[3]);
+d += 4*n_channels;
}
-for(; n_samples--; n++) {
+for(; n < n_samples; n++) {
+fprintf(stderr, "%p %p %d %d %d\n", s0, s1, n_samples, n, n_channels);
+spa_assert_not_reached();
in[0] = _mm_mul_ss(_mm_load_ss(&s0[n]), int_max);
in[1] = _mm_mul_ss(_mm_load_ss(&s1[n]), int_max);
in[0] = _mm_min_ss(int_max, _mm_max_ss(in[0], int_min));
in[1] = _mm_min_ss(int_max, _mm_max_ss(in[1], int_min));
d[0] = _mm_cvtss_si32(in[0]);
d[1] = _mm_cvtss_si32(in[1]);
-d += n_src;
+d += n_channels;
}
}
static void
-conv_f32d_to_s16_sse2(void *data, int n_dst, void *dst[n_dst], int n_src, const void *src[n_src], int n_samples)
+conv_f32d_to_s16_sse2(void *data, void *dst[], const void *src[], int n_channels, int n_samples)
{
int16_t *d = dst[0];
int i = 0;
-for(; i + 1 < n_src; i += 2)
-conv_f32d_to_s16_2_sse2(data, &d[i], n_src, &src[i], n_samples);
-for(; i < n_src; i++)
-conv_f32d_to_s16_1_sse2(data, &d[i], n_src, &src[i], n_samples);
+for(; i + 1 < n_channels; i += 2)
+conv_f32d_to_s16_2_sse2(data, &d[i], &src[i], n_channels, n_samples);
+for(; i < n_channels; i++)
+conv_f32d_to_s16_1_sse2(data, &d[i], &src[i], n_channels, n_samples);
}