audioconvert: simplify 24 bits handling

Make new uint24_t and int24_t types and use them to handle 24-bit
samples. This makes things easier because we can iterate over and copy
the structs like other types.
Wim Taymans 2022-07-01 12:24:35 +02:00
parent e395f62425
commit 817d5bd7a4
6 changed files with 204 additions and 247 deletions
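
The gist of the change, as a standalone sketch: the int24_t layout and the s24_to_s32/S24_TO_F32 helpers mirror the ones added to fmt-ops.h further down (little-endian branch only), while the demo loop at the end is illustrative and not part of the diff.

#include <stdint.h>
#include <stddef.h>

/* Packed 3-byte sample; sizeof(int24_t) == 3, so a typed pointer steps
 * one sample at a time and samples can be copied by plain assignment. */
typedef struct {
	uint8_t v3;	/* least significant byte (little-endian layout) */
	uint8_t v2;
	int8_t  v1;	/* most significant byte, carries the sign */
} __attribute__ ((packed)) int24_t;

static inline int32_t s24_to_s32(int24_t src)
{
	return ((int32_t)src.v1 << 16) | ((uint32_t)src.v2 << 8) | (uint32_t)src.v3;
}

#define S24_SCALE	8388607.0f
#define S24_TO_F32(v)	(s24_to_s32(v) * (1.0f / S24_SCALE))

/* The old code walked a uint8_t pointer and advanced it by 3 bytes per
 * sample; with a typed pointer the per-sample byte bookkeeping disappears. */
static void s24_to_f32(float *d, const int24_t *s, size_t n_samples)
{
	size_t j;
	for (j = 0; j < n_samples; j++)
		d[j] = S24_TO_F32(s[j]);
}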

View file

@ -147,7 +147,7 @@ void
conv_s24_to_f32d_1s_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
uint32_t n_channels, uint32_t n_samples)
{
const uint8_t *s = src;
const int24_t *s = src;
float *d0 = dst[0];
uint32_t n, unrolled;
__m128i in;
@ -164,21 +164,21 @@ conv_s24_to_f32d_1s_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA
for(n = 0; n < unrolled; n += 4) {
in = _mm_setr_epi32(
*((uint32_t*)&s[0 * n_channels]),
*((uint32_t*)&s[3 * n_channels]),
*((uint32_t*)&s[6 * n_channels]),
*((uint32_t*)&s[9 * n_channels]));
*((uint32_t*)&s[1 * n_channels]),
*((uint32_t*)&s[2 * n_channels]),
*((uint32_t*)&s[3 * n_channels]));
in = _mm_slli_epi32(in, 8);
in = _mm_srai_epi32(in, 8);
out = _mm_cvtepi32_ps(in);
out = _mm_mul_ps(out, factor);
_mm_store_ps(&d0[n], out);
s += 12 * n_channels;
s += 4 * n_channels;
}
for(; n < n_samples; n++) {
out = _mm_cvtsi32_ss(factor, read_s24(s));
out = _mm_cvtsi32_ss(factor, s24_to_s32(*s));
out = _mm_mul_ss(out, factor);
_mm_store_ss(&d0[n], out);
s += 3 * n_channels;
s += n_channels;
}
}
@ -186,7 +186,7 @@ static void
conv_s24_to_f32d_2s_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
uint32_t n_channels, uint32_t n_samples)
{
const uint8_t *s = src;
const int24_t *s = src;
float *d0 = dst[0], *d1 = dst[1];
uint32_t n, unrolled;
__m128i in[2];
@ -205,14 +205,14 @@ conv_s24_to_f32d_2s_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA
for(n = 0; n < unrolled; n += 4) {
in[0] = _mm_setr_epi32(
*((uint32_t*)&s[0 + 0*n_channels]),
*((uint32_t*)&s[0 + 3*n_channels]),
*((uint32_t*)&s[0 + 6*n_channels]),
*((uint32_t*)&s[0 + 9*n_channels]));
*((uint32_t*)&s[0 + 1*n_channels]),
*((uint32_t*)&s[0 + 2*n_channels]),
*((uint32_t*)&s[0 + 3*n_channels]));
in[1] = _mm_setr_epi32(
*((uint32_t*)&s[3 + 0*n_channels]),
*((uint32_t*)&s[3 + 3*n_channels]),
*((uint32_t*)&s[3 + 6*n_channels]),
*((uint32_t*)&s[3 + 9*n_channels]));
*((uint32_t*)&s[1 + 0*n_channels]),
*((uint32_t*)&s[1 + 1*n_channels]),
*((uint32_t*)&s[1 + 2*n_channels]),
*((uint32_t*)&s[1 + 3*n_channels]));
in[0] = _mm_slli_epi32(in[0], 8);
in[1] = _mm_slli_epi32(in[1], 8);
@ -229,23 +229,23 @@ conv_s24_to_f32d_2s_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA
_mm_store_ps(&d0[n], out[0]);
_mm_store_ps(&d1[n], out[1]);
s += 12 * n_channels;
s += 4 * n_channels;
}
for(; n < n_samples; n++) {
out[0] = _mm_cvtsi32_ss(factor, read_s24(s));
out[1] = _mm_cvtsi32_ss(factor, read_s24(s+3));
out[0] = _mm_cvtsi32_ss(factor, s24_to_s32(*s));
out[1] = _mm_cvtsi32_ss(factor, s24_to_s32(*(s+1)));
out[0] = _mm_mul_ss(out[0], factor);
out[1] = _mm_mul_ss(out[1], factor);
_mm_store_ss(&d0[n], out[0]);
_mm_store_ss(&d1[n], out[1]);
s += 3 * n_channels;
s += n_channels;
}
}
static void
conv_s24_to_f32d_4s_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
uint32_t n_channels, uint32_t n_samples)
{
const uint8_t *s = src;
const int24_t *s = src;
float *d0 = dst[0], *d1 = dst[1], *d2 = dst[2], *d3 = dst[3];
uint32_t n, unrolled;
__m128i in[4];
@ -266,24 +266,24 @@ conv_s24_to_f32d_4s_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA
for(n = 0; n < unrolled; n += 4) {
in[0] = _mm_setr_epi32(
*((uint32_t*)&s[0 + 0*n_channels]),
*((uint32_t*)&s[0 + 3*n_channels]),
*((uint32_t*)&s[0 + 6*n_channels]),
*((uint32_t*)&s[0 + 9*n_channels]));
*((uint32_t*)&s[0 + 1*n_channels]),
*((uint32_t*)&s[0 + 2*n_channels]),
*((uint32_t*)&s[0 + 3*n_channels]));
in[1] = _mm_setr_epi32(
*((uint32_t*)&s[3 + 0*n_channels]),
*((uint32_t*)&s[3 + 3*n_channels]),
*((uint32_t*)&s[3 + 6*n_channels]),
*((uint32_t*)&s[3 + 9*n_channels]));
*((uint32_t*)&s[1 + 0*n_channels]),
*((uint32_t*)&s[1 + 1*n_channels]),
*((uint32_t*)&s[1 + 2*n_channels]),
*((uint32_t*)&s[1 + 3*n_channels]));
in[2] = _mm_setr_epi32(
*((uint32_t*)&s[6 + 0*n_channels]),
*((uint32_t*)&s[6 + 3*n_channels]),
*((uint32_t*)&s[6 + 6*n_channels]),
*((uint32_t*)&s[6 + 9*n_channels]));
*((uint32_t*)&s[2 + 0*n_channels]),
*((uint32_t*)&s[2 + 1*n_channels]),
*((uint32_t*)&s[2 + 2*n_channels]),
*((uint32_t*)&s[2 + 3*n_channels]));
in[3] = _mm_setr_epi32(
*((uint32_t*)&s[9 + 0*n_channels]),
*((uint32_t*)&s[9 + 3*n_channels]),
*((uint32_t*)&s[9 + 6*n_channels]),
*((uint32_t*)&s[9 + 9*n_channels]));
*((uint32_t*)&s[3 + 0*n_channels]),
*((uint32_t*)&s[3 + 1*n_channels]),
*((uint32_t*)&s[3 + 2*n_channels]),
*((uint32_t*)&s[3 + 3*n_channels]));
in[0] = _mm_slli_epi32(in[0], 8);
in[1] = _mm_slli_epi32(in[1], 8);
@ -310,13 +310,13 @@ conv_s24_to_f32d_4s_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA
_mm_store_ps(&d2[n], out[2]);
_mm_store_ps(&d3[n], out[3]);
s += 12 * n_channels;
s += 4 * n_channels;
}
for(; n < n_samples; n++) {
out[0] = _mm_cvtsi32_ss(factor, read_s24(s));
out[1] = _mm_cvtsi32_ss(factor, read_s24(s+3));
out[2] = _mm_cvtsi32_ss(factor, read_s24(s+6));
out[3] = _mm_cvtsi32_ss(factor, read_s24(s+9));
out[0] = _mm_cvtsi32_ss(factor, s24_to_s32(*s));
out[1] = _mm_cvtsi32_ss(factor, s24_to_s32(*(s+1)));
out[2] = _mm_cvtsi32_ss(factor, s24_to_s32(*(s+2)));
out[3] = _mm_cvtsi32_ss(factor, s24_to_s32(*(s+3)));
out[0] = _mm_mul_ss(out[0], factor);
out[1] = _mm_mul_ss(out[1], factor);
out[2] = _mm_mul_ss(out[2], factor);
@ -325,7 +325,7 @@ conv_s24_to_f32d_4s_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA
_mm_store_ss(&d1[n], out[1]);
_mm_store_ss(&d2[n], out[2]);
_mm_store_ss(&d3[n], out[3]);
s += 3 * n_channels;
s += n_channels;
}
}
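
A note on the pointer arithmetic in the converters above (and in the SSE2/SSE4.1/SSSE3 variants below): since sizeof(int24_t) is 3, indexing the typed source pointer by sample count lands on the same byte addresses as the old hand-scaled uint8_t offsets, which is why s[3 * n_channels] becomes s[1 * n_channels] and s += 12 * n_channels becomes s += 4 * n_channels. The unaligned 32-bit loads still read one byte past each sample and rely on the existing slli/srai-by-8 pair for sign extension, exactly as before. An illustrative check of the address equivalence (not part of the diff):

#include <assert.h>
#include <stdint.h>

typedef struct { uint8_t v3, v2; int8_t v1; } __attribute__ ((packed)) int24_t;

int main(void)
{
	uint8_t buf[64] = { 0 };
	const uint8_t *old_s = buf;                   /* old code: raw byte pointer */
	const int24_t *new_s = (const int24_t *)buf;  /* new code: typed sample pointer */
	uint32_t n_channels = 2;

	/* old &s[3 * n_channels] == new &s[1 * n_channels] */
	assert((const void *)&old_s[3 * n_channels] == (const void *)&new_s[1 * n_channels]);
	/* old s += 12 * n_channels and new s += 4 * n_channels advance by the same bytes */
	assert((const void *)(old_s + 12 * n_channels) == (const void *)(new_s + 4 * n_channels));
	return 0;
}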

View file

@ -55,7 +55,7 @@ MAKE_COPY(24);
MAKE_COPY(32);
MAKE_COPY(64);
#define MAKE_D_TO_D_F(sname,stype,dname,dtype,func) \
#define MAKE_D_TO_D(sname,stype,dname,dtype,func) \
void conv_ ##sname## d_to_ ##dname## d_c(struct convert *conv, \
void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], \
uint32_t n_samples) \
@ -64,15 +64,12 @@ void conv_ ##sname## d_to_ ##dname## d_c(struct convert *conv, \
for (i = 0; i < n_channels; i++) { \
const stype *s = src[i]; \
dtype *d = dst[i]; \
for (j = 0; j < n_samples; j++) { \
func; \
} \
for (j = 0; j < n_samples; j++) \
d[j] = func (s[j]); \
} \
}
#define MAKE_D_TO_D(sname,stype,dname,dtype,func) \
MAKE_D_TO_D_F(sname,stype,dname,dtype, d[j] = func (s[j])) \
#define MAKE_I_TO_I_F(sname,stype,dname,dtype,func) \
#define MAKE_I_TO_I(sname,stype,dname,dtype,func) \
void conv_ ##sname## _to_ ##dname## _c(struct convert *conv, \
void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], \
uint32_t n_samples) \
@ -81,14 +78,11 @@ void conv_ ##sname## _to_ ##dname## _c(struct convert *conv, \
const stype *s = src[0]; \
dtype *d = dst[0]; \
n_samples *= conv->n_channels; \
for (j = 0; j < n_samples; j++) { \
func; \
} \
for (j = 0; j < n_samples; j++) \
d[j] = func (s[j]); \
}
#define MAKE_I_TO_I(sname,stype,dname,dtype,func) \
MAKE_I_TO_I_F(sname,stype,dname,dtype, d[j] = func (s[j])) \
#define MAKE_I_TO_D_F(sname,stype,dname,dtype,func) \
#define MAKE_I_TO_D(sname,stype,dname,dtype,func) \
void conv_ ##sname## _to_ ##dname## d_c(struct convert *conv, \
void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], \
uint32_t n_samples) \
@ -97,15 +91,12 @@ void conv_ ##sname## _to_ ##dname## d_c(struct convert *conv, \
dtype **d = (dtype**)dst; \
uint32_t i, j, n_channels = conv->n_channels; \
for (j = 0; j < n_samples; j++) { \
for (i = 0; i < n_channels; i++) { \
func; \
} \
for (i = 0; i < n_channels; i++) \
d[i][j] = func (*s++); \
} \
}
#define MAKE_I_TO_D(sname,stype,dname,dtype,func) \
MAKE_I_TO_D_F(sname,stype,dname,dtype, d[i][j] = func (*s++)) \
#define MAKE_D_TO_I_F(sname,stype,dname,dtype,func) \
#define MAKE_D_TO_I(sname,stype,dname,dtype,func) \
void conv_ ##sname## d_to_ ##dname## _c(struct convert *conv, \
void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], \
uint32_t n_samples) \
@ -114,13 +105,10 @@ void conv_ ##sname## d_to_ ##dname## _c(struct convert *conv, \
dtype *d = dst[0]; \
uint32_t i, j, n_channels = conv->n_channels; \
for (j = 0; j < n_samples; j++) { \
for (i = 0; i < n_channels; i++) { \
func; \
} \
for (i = 0; i < n_channels; i++) \
*d++ = func (s[i][j]); \
} \
}
#define MAKE_D_TO_I(sname,stype,dname,dtype,func) \
MAKE_D_TO_I_F(sname,stype,dname,dtype, *d++ = func (s[i][j])) \
/* to f32 */
MAKE_D_TO_D(u8, uint8_t, f32, float, U8_TO_F32);
@ -154,14 +142,14 @@ MAKE_I_TO_D(s32, int32_t, f32, float, S32_TO_F32);
MAKE_D_TO_I(s32, int32_t, f32, float, S32_TO_F32);
MAKE_I_TO_D(s32s, uint32_t, f32, float, S32S_TO_F32);
MAKE_I_TO_I_F(u24, uint8_t, f32, float, d[j] = U24_TO_F32(read_u24(s)); s += 3);
MAKE_I_TO_D_F(u24, uint8_t, f32, float, d[i][j] = U24_TO_F32(read_u24(s)); s += 3);
MAKE_I_TO_I(u24, uint24_t, f32, float, U24_TO_F32);
MAKE_I_TO_D(u24, uint24_t, f32, float, U24_TO_F32);
MAKE_D_TO_D_F(s24, int8_t, f32, float, d[j] = S24_TO_F32(read_s24(s)); s += 3);
MAKE_I_TO_I_F(s24, int8_t, f32, float, d[j] = S24_TO_F32(read_s24(s)); s += 3);
MAKE_I_TO_D_F(s24, int8_t, f32, float, d[i][j] = S24_TO_F32(read_s24(s)); s += 3);
MAKE_D_TO_I_F(s24, int8_t, f32, float, *d++ = S24_TO_F32(read_s24(&s[i][j*3])));
MAKE_I_TO_D_F(s24s, int8_t, f32, float, d[i][j] = S24_TO_F32(read_s24s(s)); s += 3);
MAKE_D_TO_D(s24, int24_t, f32, float, S24_TO_F32);
MAKE_I_TO_I(s24, int24_t, f32, float, S24_TO_F32);
MAKE_I_TO_D(s24, int24_t, f32, float, S24_TO_F32);
MAKE_D_TO_I(s24, int24_t, f32, float, S24_TO_F32);
MAKE_I_TO_D(s24s, int24_t, f32, float, S24S_TO_F32);
MAKE_I_TO_I(u24_32, uint32_t, f32, float, U24_32_TO_F32);
MAKE_I_TO_D(u24_32, uint32_t, f32, float, U24_32_TO_F32);
@ -211,14 +199,14 @@ MAKE_I_TO_D(f32, float, s32, int32_t, F32_TO_S32);
MAKE_D_TO_I(f32, float, s32, int32_t, F32_TO_S32);
MAKE_D_TO_I(f32, float, s32s, uint32_t, F32_TO_S32S);
MAKE_I_TO_I_F(f32, float, u24, uint8_t, write_u24(d, F32_TO_U24(s[j])); d += 3);
MAKE_D_TO_I_F(f32, float, u24, uint8_t, write_u24(d, F32_TO_U24(s[i][j])); d += 3);
MAKE_I_TO_I(f32, float, u24, uint24_t, F32_TO_U24);
MAKE_D_TO_I(f32, float, u24, uint24_t, F32_TO_U24);
MAKE_D_TO_D_F(f32, float, s24, uint8_t, write_s24(d, F32_TO_S24(s[j])); d += 3);
MAKE_I_TO_I_F(f32, float, s24, uint8_t, write_s24(d, F32_TO_S24(s[j])); d += 3);
MAKE_I_TO_D_F(f32, float, s24, uint8_t, write_s24(&d[i][j*3], F32_TO_S24(*s++)));
MAKE_D_TO_I_F(f32, float, s24, uint8_t, write_s24(d, F32_TO_S24(s[i][j])); d += 3);
MAKE_D_TO_I_F(f32, float, s24s, uint8_t, write_s24s(d, F32_TO_S24(s[i][j])); d += 3);
MAKE_D_TO_D(f32, float, s24, int24_t, F32_TO_S24);
MAKE_I_TO_I(f32, float, s24, int24_t, F32_TO_S24);
MAKE_I_TO_D(f32, float, s24, int24_t, F32_TO_S24);
MAKE_D_TO_I(f32, float, s24, int24_t, F32_TO_S24);
MAKE_D_TO_I(f32, float, s24s, int24_t, F32_TO_S24S);
MAKE_I_TO_I(f32, float, u24_32, uint32_t, F32_TO_U24_32);
MAKE_D_TO_I(f32, float, u24_32, uint32_t, F32_TO_U24_32);
@ -253,7 +241,7 @@ static inline void update_dither_c(struct convert *conv, uint32_t n_samples)
dither[n] = lcnoise(state) * scale;
}
#define MAKE_D_dither_F(dname,dtype,func) \
#define MAKE_D_dither(dname,dtype,func) \
void conv_f32d_to_ ##dname## d_dither_c(struct convert *conv, \
void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], \
uint32_t n_samples) \
@ -266,16 +254,13 @@ void conv_f32d_to_ ##dname## d_dither_c(struct convert *conv, \
dtype *d = dst[i]; \
for (j = 0; j < n_samples;) { \
chunk = SPA_MIN(n_samples - j, dither_size); \
for (k = 0; k < chunk; k++, j++) { \
func; \
} \
for (k = 0; k < chunk; k++, j++) \
d[j] = func (s[j], dither[k]); \
} \
} \
}
#define MAKE_D_dither(dname,dtype,func) \
MAKE_D_dither_F(dname,dtype, d[j] = func (s[j], dither[k])) \
#define MAKE_I_dither_F(dname,dtype,func) \
#define MAKE_I_dither(dname,dtype,func) \
void conv_f32d_to_ ##dname## _dither_c(struct convert *conv, \
void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], \
uint32_t n_samples) \
@ -288,14 +273,11 @@ void conv_f32d_to_ ##dname## _dither_c(struct convert *conv, \
for (j = 0; j < n_samples;) { \
chunk = SPA_MIN(n_samples - j, dither_size); \
for (k = 0; k < chunk; k++, j++) { \
for (i = 0; i < n_channels; i++) { \
func; \
} \
for (i = 0; i < n_channels; i++) \
*d++ = func (s[i][j], dither[k]); \
} \
} \
}
#define MAKE_I_dither(dname,dtype,func) \
MAKE_I_dither_F(dname,dtype, *d++ = func (s[i][j], dither[k])) \
MAKE_D_dither(u8, uint8_t, F32_TO_U8_D);
MAKE_I_dither(u8, uint8_t, F32_TO_U8_D);
@ -307,9 +289,9 @@ MAKE_I_dither(s16s, uint16_t, F32_TO_S16S_D);
MAKE_D_dither(s32, int32_t, F32_TO_S32_D);
MAKE_I_dither(s32, int32_t, F32_TO_S32_D);
MAKE_I_dither(s32s, uint32_t, F32_TO_S32S_D);
MAKE_D_dither_F(s24, uint8_t, write_s24(d, F32_TO_S24_D(s[j], dither[k])); d += 3);
MAKE_I_dither_F(s24, uint8_t, write_s24(d, F32_TO_S24_D(s[i][j], dither[k])); d += 3);
MAKE_I_dither_F(s24s, uint8_t, write_s24s(d, F32_TO_S24_D(s[i][j], dither[k])); d += 3);
MAKE_D_dither(s24, int24_t, F32_TO_S24_D);
MAKE_I_dither(s24, int24_t, F32_TO_S24_D);
MAKE_I_dither(s24s, int24_t, F32_TO_S24_D);
MAKE_D_dither(s24_32, int32_t, F32_TO_S24_32_D);
MAKE_I_dither(s24_32, int32_t, F32_TO_S24_32_D);
MAKE_I_dither(s24_32s, int32_t, F32_TO_S24_32S_D);
@ -335,7 +317,7 @@ MAKE_I_dither(s24_32s, int32_t, F32_TO_S24_32S_D);
#define F32_TO_S16_SH(s,sh,d) SHAPER5(int16_t, s, S16_SCALE, 0, sh, S16_MIN, S16_MAX, d)
#define F32_TO_S16S_SH(s,sh,d) bswap_16(F32_TO_S16_SH(s,sh,d))
#define MAKE_D_shaped_F(dname,dtype,func) \
#define MAKE_D_shaped(dname,dtype,func) \
void conv_f32d_to_ ##dname## d_shaped_c(struct convert *conv, \
void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], \
uint32_t n_samples) \
@ -350,17 +332,14 @@ void conv_f32d_to_ ##dname## d_shaped_c(struct convert *conv, \
uint32_t idx = sh->idx; \
for (j = 0; j < n_samples;) { \
chunk = SPA_MIN(n_samples - j, dither_size); \
for (k = 0; k < chunk; k++, j++) { \
func; \
} \
for (k = 0; k < chunk; k++, j++) \
d[j] = func (s[j], sh, dither[k]); \
} \
sh->idx = idx; \
} \
}
#define MAKE_D_shaped(dname,dtype,func) \
MAKE_D_shaped_F(dname,dtype, d[j] = func (s[j], sh, dither[k])) \
#define MAKE_I_shaped_F(dname,dtype,func) \
#define MAKE_I_shaped(dname,dtype,func) \
void conv_f32d_to_ ##dname## _shaped_c(struct convert *conv, \
void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], \
uint32_t n_samples) \
@ -376,15 +355,12 @@ void conv_f32d_to_ ##dname## _shaped_c(struct convert *conv, \
uint32_t idx = sh->idx; \
for (j = 0; j < n_samples;) { \
chunk = SPA_MIN(n_samples - j, dither_size); \
for (k = 0; k < chunk; k++, j++) { \
func; \
} \
for (k = 0; k < chunk; k++, j++) \
d[j*n_channels] = func (s[j], sh, dither[k]); \
} \
sh->idx = idx; \
} \
}
#define MAKE_I_shaped(dname,dtype,func) \
MAKE_I_shaped_F(dname,dtype, d[j*n_channels] = func (s[j], sh, dither[k])) \
MAKE_D_shaped(u8, uint8_t, F32_TO_U8_SH);
MAKE_I_shaped(u8, uint8_t, F32_TO_U8_SH);
@ -395,23 +371,21 @@ MAKE_I_shaped(s16, int16_t, F32_TO_S16_SH);
MAKE_I_shaped(s16s, uint16_t, F32_TO_S16S_SH);
#define MAKE_DEINTERLEAVE(size,type,func) \
MAKE_I_TO_D_F(size,type,size,type,func)
#define DEINTERLEAVE_COPY (d[i][j] = *s++)
MAKE_I_TO_D(size,type,size,type,func)
MAKE_DEINTERLEAVE(8, uint8_t, DEINTERLEAVE_COPY);
MAKE_DEINTERLEAVE(16, uint16_t, DEINTERLEAVE_COPY);
MAKE_DEINTERLEAVE(24, uint8_t, write_s24(&d[i][j*3], read_s24(s)); s+=3);
MAKE_DEINTERLEAVE(32, uint32_t, DEINTERLEAVE_COPY);
MAKE_DEINTERLEAVE(32s, uint32_t, d[i][j] = bswap_32(*s++));
MAKE_DEINTERLEAVE(64, uint64_t, DEINTERLEAVE_COPY);
MAKE_DEINTERLEAVE(8, uint8_t, (uint8_t));
MAKE_DEINTERLEAVE(16, uint16_t, (uint16_t));
MAKE_DEINTERLEAVE(24, uint24_t, (uint24_t));
MAKE_DEINTERLEAVE(32, uint32_t, (uint32_t));
MAKE_DEINTERLEAVE(32s, uint32_t, bswap_32);
MAKE_DEINTERLEAVE(64, uint64_t, (uint64_t));
#define MAKE_INTERLEAVE(size,type,func) \
MAKE_D_TO_I_F(size,type,size,type,func)
#define INTERLEAVE_COPY (*d++ = s[i][j])
MAKE_D_TO_I(size,type,size,type,func)
MAKE_INTERLEAVE(8, uint8_t, INTERLEAVE_COPY);
MAKE_INTERLEAVE(16, uint16_t, INTERLEAVE_COPY);
MAKE_INTERLEAVE(24, uint8_t, write_s24(d, read_s24(&s[i][j*3])); d+=3);
MAKE_INTERLEAVE(32, uint32_t, INTERLEAVE_COPY);
MAKE_INTERLEAVE(32s, uint32_t, *d++ = bswap_32(s[i][j]));
MAKE_INTERLEAVE(64, uint64_t, INTERLEAVE_COPY);
MAKE_INTERLEAVE(8, uint8_t, (uint8_t));
MAKE_INTERLEAVE(16, uint16_t, (uint16_t));
MAKE_INTERLEAVE(24, uint24_t, (uint24_t));
MAKE_INTERLEAVE(32, uint32_t, (uint32_t));
MAKE_INTERLEAVE(32s, uint32_t, bswap_32);
MAKE_INTERLEAVE(64, uint64_t, (uint64_t));
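
For orientation, roughly what one of the simplified macros now produces, hand-expanded from the MAKE_I_TO_D body above (struct convert and SPA_RESTRICT come from the surrounding code; this expansion is a sketch, not generated output):

/* MAKE_I_TO_D(s24, int24_t, f32, float, S24_TO_F32) expands to approximately: */
void conv_s24_to_f32d_c(struct convert *conv,
		void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[],
		uint32_t n_samples)
{
	const int24_t *s = src[0];
	float **d = (float **)dst;
	uint32_t i, j, n_channels = conv->n_channels;
	for (j = 0; j < n_samples; j++) {
		for (i = 0; i < n_channels; i++)
			d[i][j] = S24_TO_F32(*s++);	/* one packed sample per channel, no byte counting */
	}
}

Because 24-bit samples are now a proper struct type, the old *_F macro variants that spliced in a full statement (read_s24()/write_s24() plus manual pointer bumps) are no longer needed.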

View file

@ -132,7 +132,7 @@ void
conv_s24_to_f32d_1s_sse2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
uint32_t n_channels, uint32_t n_samples)
{
const uint8_t *s = src;
const int24_t *s = src;
float *d0 = dst[0];
uint32_t n, unrolled;
__m128i in;
@ -149,21 +149,21 @@ conv_s24_to_f32d_1s_sse2(void *data, void * SPA_RESTRICT dst[], const void * SPA
for(n = 0; n < unrolled; n += 4) {
in = _mm_setr_epi32(
*((uint32_t*)&s[0 * n_channels]),
*((uint32_t*)&s[3 * n_channels]),
*((uint32_t*)&s[6 * n_channels]),
*((uint32_t*)&s[9 * n_channels]));
*((uint32_t*)&s[1 * n_channels]),
*((uint32_t*)&s[2 * n_channels]),
*((uint32_t*)&s[3 * n_channels]));
in = _mm_slli_epi32(in, 8);
in = _mm_srai_epi32(in, 8);
out = _mm_cvtepi32_ps(in);
out = _mm_mul_ps(out, factor);
_mm_store_ps(&d0[n], out);
s += 12 * n_channels;
s += 4 * n_channels;
}
for(; n < n_samples; n++) {
out = _mm_cvtsi32_ss(factor, read_s24(s));
out = _mm_cvtsi32_ss(factor, s24_to_s32(*s));
out = _mm_mul_ss(out, factor);
_mm_store_ss(&d0[n], out);
s += 3 * n_channels;
s += n_channels;
}
}
@ -171,7 +171,7 @@ static void
conv_s24_to_f32d_2s_sse2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
uint32_t n_channels, uint32_t n_samples)
{
const uint8_t *s = src;
const int24_t *s = src;
float *d0 = dst[0], *d1 = dst[1];
uint32_t n, unrolled;
__m128i in[2];
@ -190,14 +190,14 @@ conv_s24_to_f32d_2s_sse2(void *data, void * SPA_RESTRICT dst[], const void * SPA
for(n = 0; n < unrolled; n += 4) {
in[0] = _mm_setr_epi32(
*((uint32_t*)&s[0 + 0*n_channels]),
*((uint32_t*)&s[0 + 3*n_channels]),
*((uint32_t*)&s[0 + 6*n_channels]),
*((uint32_t*)&s[0 + 9*n_channels]));
*((uint32_t*)&s[0 + 1*n_channels]),
*((uint32_t*)&s[0 + 2*n_channels]),
*((uint32_t*)&s[0 + 3*n_channels]));
in[1] = _mm_setr_epi32(
*((uint32_t*)&s[3 + 0*n_channels]),
*((uint32_t*)&s[3 + 3*n_channels]),
*((uint32_t*)&s[3 + 6*n_channels]),
*((uint32_t*)&s[3 + 9*n_channels]));
*((uint32_t*)&s[1 + 0*n_channels]),
*((uint32_t*)&s[1 + 1*n_channels]),
*((uint32_t*)&s[1 + 2*n_channels]),
*((uint32_t*)&s[1 + 3*n_channels]));
in[0] = _mm_slli_epi32(in[0], 8);
in[1] = _mm_slli_epi32(in[1], 8);
@ -214,23 +214,23 @@ conv_s24_to_f32d_2s_sse2(void *data, void * SPA_RESTRICT dst[], const void * SPA
_mm_store_ps(&d0[n], out[0]);
_mm_store_ps(&d1[n], out[1]);
s += 12 * n_channels;
s += 4 * n_channels;
}
for(; n < n_samples; n++) {
out[0] = _mm_cvtsi32_ss(factor, read_s24(s));
out[1] = _mm_cvtsi32_ss(factor, read_s24(s+3));
out[0] = _mm_cvtsi32_ss(factor, s24_to_s32(*s));
out[1] = _mm_cvtsi32_ss(factor, s24_to_s32(*(s+1)));
out[0] = _mm_mul_ss(out[0], factor);
out[1] = _mm_mul_ss(out[1], factor);
_mm_store_ss(&d0[n], out[0]);
_mm_store_ss(&d1[n], out[1]);
s += 3 * n_channels;
s += n_channels;
}
}
static void
conv_s24_to_f32d_4s_sse2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
uint32_t n_channels, uint32_t n_samples)
{
const uint8_t *s = src;
const int24_t *s = src;
float *d0 = dst[0], *d1 = dst[1], *d2 = dst[2], *d3 = dst[3];
uint32_t n, unrolled;
__m128i in[4];
@ -251,24 +251,24 @@ conv_s24_to_f32d_4s_sse2(void *data, void * SPA_RESTRICT dst[], const void * SPA
for(n = 0; n < unrolled; n += 4) {
in[0] = _mm_setr_epi32(
*((uint32_t*)&s[0 + 0*n_channels]),
*((uint32_t*)&s[0 + 3*n_channels]),
*((uint32_t*)&s[0 + 6*n_channels]),
*((uint32_t*)&s[0 + 9*n_channels]));
*((uint32_t*)&s[0 + 1*n_channels]),
*((uint32_t*)&s[0 + 2*n_channels]),
*((uint32_t*)&s[0 + 3*n_channels]));
in[1] = _mm_setr_epi32(
*((uint32_t*)&s[3 + 0*n_channels]),
*((uint32_t*)&s[3 + 3*n_channels]),
*((uint32_t*)&s[3 + 6*n_channels]),
*((uint32_t*)&s[3 + 9*n_channels]));
*((uint32_t*)&s[1 + 0*n_channels]),
*((uint32_t*)&s[1 + 1*n_channels]),
*((uint32_t*)&s[1 + 2*n_channels]),
*((uint32_t*)&s[1 + 3*n_channels]));
in[2] = _mm_setr_epi32(
*((uint32_t*)&s[6 + 0*n_channels]),
*((uint32_t*)&s[6 + 3*n_channels]),
*((uint32_t*)&s[6 + 6*n_channels]),
*((uint32_t*)&s[6 + 9*n_channels]));
*((uint32_t*)&s[2 + 0*n_channels]),
*((uint32_t*)&s[2 + 1*n_channels]),
*((uint32_t*)&s[2 + 2*n_channels]),
*((uint32_t*)&s[2 + 3*n_channels]));
in[3] = _mm_setr_epi32(
*((uint32_t*)&s[9 + 0*n_channels]),
*((uint32_t*)&s[9 + 3*n_channels]),
*((uint32_t*)&s[9 + 6*n_channels]),
*((uint32_t*)&s[9 + 9*n_channels]));
*((uint32_t*)&s[3 + 0*n_channels]),
*((uint32_t*)&s[3 + 1*n_channels]),
*((uint32_t*)&s[3 + 2*n_channels]),
*((uint32_t*)&s[3 + 3*n_channels]));
in[0] = _mm_slli_epi32(in[0], 8);
in[1] = _mm_slli_epi32(in[1], 8);
@ -295,13 +295,13 @@ conv_s24_to_f32d_4s_sse2(void *data, void * SPA_RESTRICT dst[], const void * SPA
_mm_store_ps(&d2[n], out[2]);
_mm_store_ps(&d3[n], out[3]);
s += 12 * n_channels;
s += 4 * n_channels;
}
for(; n < n_samples; n++) {
out[0] = _mm_cvtsi32_ss(factor, read_s24(s));
out[1] = _mm_cvtsi32_ss(factor, read_s24(s+3));
out[2] = _mm_cvtsi32_ss(factor, read_s24(s+6));
out[3] = _mm_cvtsi32_ss(factor, read_s24(s+9));
out[0] = _mm_cvtsi32_ss(factor, s24_to_s32(*s));
out[1] = _mm_cvtsi32_ss(factor, s24_to_s32(*(s+1)));
out[2] = _mm_cvtsi32_ss(factor, s24_to_s32(*(s+2)));
out[3] = _mm_cvtsi32_ss(factor, s24_to_s32(*(s+3)));
out[0] = _mm_mul_ss(out[0], factor);
out[1] = _mm_mul_ss(out[1], factor);
out[2] = _mm_mul_ss(out[2], factor);
@ -310,7 +310,7 @@ conv_s24_to_f32d_4s_sse2(void *data, void * SPA_RESTRICT dst[], const void * SPA
_mm_store_ss(&d1[n], out[1]);
_mm_store_ss(&d2[n], out[2]);
_mm_store_ss(&d3[n], out[3]);
s += 3 * n_channels;
s += n_channels;
}
}

View file

@ -30,7 +30,7 @@ static void
conv_s24_to_f32d_1s_sse41(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
uint32_t n_channels, uint32_t n_samples)
{
const uint8_t *s = src;
const int24_t *s = src;
float *d0 = dst[0];
uint32_t n, unrolled;
__m128i in = _mm_setzero_si128();
@ -43,21 +43,21 @@ conv_s24_to_f32d_1s_sse41(void *data, void * SPA_RESTRICT dst[], const void * SP
for(n = 0; n < unrolled; n += 4) {
in = _mm_insert_epi32(in, *((uint32_t*)&s[0 * n_channels]), 0);
in = _mm_insert_epi32(in, *((uint32_t*)&s[3 * n_channels]), 1);
in = _mm_insert_epi32(in, *((uint32_t*)&s[6 * n_channels]), 2);
in = _mm_insert_epi32(in, *((uint32_t*)&s[9 * n_channels]), 3);
in = _mm_insert_epi32(in, *((uint32_t*)&s[1 * n_channels]), 1);
in = _mm_insert_epi32(in, *((uint32_t*)&s[2 * n_channels]), 2);
in = _mm_insert_epi32(in, *((uint32_t*)&s[3 * n_channels]), 3);
in = _mm_slli_epi32(in, 8);
in = _mm_srai_epi32(in, 8);
out = _mm_cvtepi32_ps(in);
out = _mm_mul_ps(out, factor);
_mm_store_ps(&d0[n], out);
s += 12 * n_channels;
s += 4 * n_channels;
}
for(; n < n_samples; n++) {
out = _mm_cvtsi32_ss(factor, read_s24(s));
out = _mm_cvtsi32_ss(factor, s24_to_s32(*s));
out = _mm_mul_ss(out, factor);
_mm_store_ss(&d0[n], out);
s += 3 * n_channels;
s += n_channels;
}
}

View file

@ -30,7 +30,7 @@ static void
conv_s24_to_f32d_4s_ssse3(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
uint32_t n_channels, uint32_t n_samples)
{
const uint8_t *s = src;
const int24_t *s = src;
float *d0 = dst[0], *d1 = dst[1], *d2 = dst[2], *d3 = dst[3];
uint32_t n, unrolled;
__m128i in[4];
@ -48,9 +48,9 @@ conv_s24_to_f32d_4s_ssse3(void *data, void * SPA_RESTRICT dst[], const void * SP
for(n = 0; n < unrolled; n += 4) {
in[0] = _mm_loadu_si128((__m128i*)(s + 0*n_channels));
in[1] = _mm_loadu_si128((__m128i*)(s + 3*n_channels));
in[2] = _mm_loadu_si128((__m128i*)(s + 6*n_channels));
in[3] = _mm_loadu_si128((__m128i*)(s + 9*n_channels));
in[1] = _mm_loadu_si128((__m128i*)(s + 1*n_channels));
in[2] = _mm_loadu_si128((__m128i*)(s + 2*n_channels));
in[3] = _mm_loadu_si128((__m128i*)(s + 3*n_channels));
in[0] = _mm_shuffle_epi8(in[0], mask);
in[1] = _mm_shuffle_epi8(in[1], mask);
in[2] = _mm_shuffle_epi8(in[2], mask);
@ -74,13 +74,13 @@ conv_s24_to_f32d_4s_ssse3(void *data, void * SPA_RESTRICT dst[], const void * SP
_mm_store_ps(&d1[n], out[1]);
_mm_store_ps(&d2[n], out[2]);
_mm_store_ps(&d3[n], out[3]);
s += 12 * n_channels;
s += 4 * n_channels;
}
for(; n < n_samples; n++) {
out[0] = _mm_cvtsi32_ss(factor, read_s24(s));
out[1] = _mm_cvtsi32_ss(factor, read_s24(s+3));
out[2] = _mm_cvtsi32_ss(factor, read_s24(s+6));
out[3] = _mm_cvtsi32_ss(factor, read_s24(s+9));
out[0] = _mm_cvtsi32_ss(factor, s24_to_s32(*s));
out[1] = _mm_cvtsi32_ss(factor, s24_to_s32(*(s+1)));
out[2] = _mm_cvtsi32_ss(factor, s24_to_s32(*(s+2)));
out[3] = _mm_cvtsi32_ss(factor, s24_to_s32(*(s+3)));
out[0] = _mm_mul_ss(out[0], factor);
out[1] = _mm_mul_ss(out[1], factor);
out[2] = _mm_mul_ss(out[2], factor);
@ -89,7 +89,7 @@ conv_s24_to_f32d_4s_ssse3(void *data, void * SPA_RESTRICT dst[], const void * SP
_mm_store_ss(&d1[n], out[1]);
_mm_store_ss(&d2[n], out[2]);
_mm_store_ss(&d3[n], out[3]);
s += 3 * n_channels;
s += n_channels;
}
}

View file

@ -79,17 +79,19 @@
#define U24_MAX 16777215u
#define U24_SCALE 8388607.5f
#define U24_OFFS 8388608.f
#define U24_TO_F32(v) ((((uint32_t)(v)) * (1.0f / U24_OFFS)) - 1.0)
#define F32_TO_U24(v) (uint32_t)SPA_CLAMP((v) * U24_SCALE + U24_OFFS, U24_MIN, U24_MAX)
#define F32_TO_U24_D(v,d) (uint32_t)SPA_CLAMP((v) * U24_SCALE + U24_OFFS + (d), U24_MIN, U24_MAX)
#define U24_TO_F32(v) ((u24_to_u32(v) * (1.0f / U24_OFFS)) - 1.0)
#define F32_TO_U24(v) u32_to_u24(SPA_CLAMP((v) * U24_SCALE + U24_OFFS, U24_MIN, U24_MAX))
#define F32_TO_U24_D(v,d) u32_to_u24(SPA_CLAMP((v) * U24_SCALE + U24_OFFS + (d), U24_MIN, U24_MAX))
#define S24_MIN -8388607
#define S24_MAX 8388607
#define S24_MAX_F 8388607.0f
#define S24_SCALE 8388607.0f
#define S24_TO_F32(v) (((int32_t)(v)) * (1.0f / S24_SCALE))
#define F32_TO_S24(v) (int32_t)SPA_CLAMP((v) * S24_SCALE, S24_MIN, S24_MAX)
#define F32_TO_S24_D(v,d) (int32_t)SPA_CLAMP((v) * S24_SCALE + (d), S24_MIN, S24_MAX)
#define S24_TO_F32(v) (s24_to_s32(v) * (1.0f / S24_SCALE))
#define S24S_TO_F32(v) (s24_to_s32(bswap_s24(v)) * (1.0f / S24_SCALE))
#define F32_TO_S24(v) s32_to_s24(SPA_CLAMP((v) * S24_SCALE, S24_MIN, S24_MAX))
#define F32_TO_S24S(v) bswap_s24(F32_TO_S24(v))
#define F32_TO_S24_D(v,d) s32_to_s24(SPA_CLAMP((v) * S24_SCALE + (d), S24_MIN, S24_MAX))
#define U32_MIN 0u
#define U32_MAX 4294967040u
@ -112,88 +114,69 @@
#define U24_32_TO_F32(v) U32_TO_F32((v)<<8)
#define U24_32S_TO_F32(v) U32_TO_F32(((int32_t)bswap_32(v))<<8)
#define F32_TO_U24_32(v) F32_TO_U24(v)
#define F32_TO_U24_32S(v) bswap_32(F32_TO_U24(v))
#define F32_TO_U24_32_D(v,d) F32_TO_U24_D(v,d)
#define F32_TO_U24_32S_D(v,d) bswap_32(F32_TO_U24_D(v,d))
#define F32_TO_U24_32(v) (uint32_t)SPA_CLAMP((v) * U24_SCALE + U24_OFFS, U24_MIN, U24_MAX)
#define F32_TO_U24_32S(v) bswap_32(F32_TO_U24_32(v))
#define F32_TO_U24_32_D(v,d) (uint32_t)SPA_CLAMP((v) * U24_SCALE + U24_OFFS + (d), U24_MIN, U24_MAX)
#define F32_TO_U24_32S_D(v,d) bswap_32(F32_TO_U24_32_D(v,d))
#define S24_32_TO_F32(v) S32_TO_F32((v)<<8)
#define S24_32S_TO_F32(v) S32_TO_F32(((int32_t)bswap_32(v))<<8)
#define F32_TO_S24_32(v) F32_TO_S24(v)
#define F32_TO_S24_32S(v) bswap_32(F32_TO_S24(v))
#define F32_TO_S24_32_D(v,d) F32_TO_S24_D(v,d)
#define F32_TO_S24_32S_D(v,d) bswap_32(F32_TO_S24_D(v,d))
#define F32_TO_S24_32(v) (int32_t)SPA_CLAMP((v) * S24_SCALE, S24_MIN, S24_MAX)
#define F32_TO_S24_32S(v) bswap_32(F32_TO_S24_32(v))
#define F32_TO_S24_32_D(v,d) (int32_t)SPA_CLAMP((v) * S24_SCALE + (d), S24_MIN, S24_MAX)
#define F32_TO_S24_32S_D(v,d) bswap_32(F32_TO_S24_32_D(v,d))
static inline uint32_t read_u24(const void *src)
{
const uint8_t *s = src;
typedef struct {
#if __BYTE_ORDER == __LITTLE_ENDIAN
return (((uint32_t)s[2] << 16) | ((uint32_t)(uint8_t)s[1] << 8) | (uint32_t)(uint8_t)s[0]);
uint8_t v3;
uint8_t v2;
uint8_t v1;
#else
return (((uint32_t)s[0] << 16) | ((uint32_t)(uint8_t)s[1] << 8) | (uint32_t)(uint8_t)s[2]);
uint8_t v1;
uint8_t v2;
uint8_t v3;
#endif
} __attribute__ ((packed)) uint24_t;
typedef struct {
#if __BYTE_ORDER == __LITTLE_ENDIAN
uint8_t v3;
uint8_t v2;
int8_t v1;
#else
int8_t v1;
uint8_t v2;
uint8_t v3;
#endif
} __attribute__ ((packed)) int24_t;
static inline uint32_t u24_to_u32(uint24_t src)
{
return ((uint32_t)src.v1 << 16) | ((uint32_t)src.v2 << 8) | (uint32_t)src.v3;
}
static inline int32_t read_s24(const void *src)
static inline uint24_t u32_to_u24(uint32_t src)
{
const int8_t *s = src;
#if __BYTE_ORDER == __LITTLE_ENDIAN
return (((int32_t)s[2] << 16) | ((uint32_t)(uint8_t)s[1] << 8) | (uint32_t)(uint8_t)s[0]);
#else
return (((int32_t)s[0] << 16) | ((uint32_t)(uint8_t)s[1] << 8) | (uint32_t)(uint8_t)s[2]);
#endif
return (uint24_t) { .v1 = (uint8_t)(src >> 16), .v2 = (uint8_t)(src >> 8), .v3 = (uint8_t)src };
}
static inline int32_t read_s24s(const void *src)
static inline int32_t s24_to_s32(int24_t src)
{
const int8_t *s = src;
#if __BYTE_ORDER == __LITTLE_ENDIAN
return (((int32_t)s[0] << 16) | ((uint32_t)(uint8_t)s[1] << 8) | (uint32_t)(uint8_t)s[2]);
#else
return (((int32_t)s[2] << 16) | ((uint32_t)(uint8_t)s[1] << 8) | (uint32_t)(uint8_t)s[0]);
#endif
return ((int32_t)src.v1 << 16) | ((uint32_t)src.v2 << 8) | (uint32_t)src.v3;
}
static inline void write_u24(void *dst, uint32_t val)
static inline int24_t s32_to_s24(int32_t src)
{
uint8_t *d = dst;
#if __BYTE_ORDER == __LITTLE_ENDIAN
d[0] = (uint8_t) (val);
d[1] = (uint8_t) (val >> 8);
d[2] = (uint8_t) (val >> 16);
#else
d[0] = (uint8_t) (val >> 16);
d[1] = (uint8_t) (val >> 8);
d[2] = (uint8_t) (val);
#endif
return (int24_t) { .v1 = (int8_t)(src >> 16), .v2 = (uint8_t)(src >> 8), .v3 = (uint8_t)src };
}
static inline void write_s24(void *dst, int32_t val)
static inline uint24_t bswap_u24(uint24_t src)
{
uint8_t *d = dst;
#if __BYTE_ORDER == __LITTLE_ENDIAN
d[0] = (uint8_t) (val);
d[1] = (uint8_t) (val >> 8);
d[2] = (uint8_t) (val >> 16);
#else
d[0] = (uint8_t) (val >> 16);
d[1] = (uint8_t) (val >> 8);
d[2] = (uint8_t) (val);
#endif
return (uint24_t) { .v1 = src.v3, .v2 = src.v2, .v3 = src.v1 };
}
static inline void write_s24s(void *dst, int32_t val)
static inline int24_t bswap_s24(int24_t src)
{
uint8_t *d = dst;
#if __BYTE_ORDER == __LITTLE_ENDIAN
d[0] = (uint8_t) (val >> 16);
d[1] = (uint8_t) (val >> 8);
d[2] = (uint8_t) (val);
#else
d[0] = (uint8_t) (val);
d[1] = (uint8_t) (val >> 8);
d[2] = (uint8_t) (val >> 16);
#endif
return (int24_t) { .v1 = src.v3, .v2 = src.v2, .v3 = src.v1 };
}
#define NS_MAX 8
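
A small round-trip check of the new helpers (illustrative test program, assuming the types and inline functions above are in scope, e.g. by including fmt-ops.h):

#include <assert.h>
#include <stdint.h>

int main(void)
{
	/* the packed structs are 3 bytes, so arrays and pointers step per sample */
	assert(sizeof(int24_t) == 3 && sizeof(uint24_t) == 3);

	/* signed round trip, including a negative value at the clamp limit */
	assert(s24_to_s32(s32_to_s24(-8388607)) == -8388607);
	assert(s24_to_s32(s32_to_s24(0x123456)) == 0x123456);

	/* unsigned round trip */
	assert(u24_to_u32(u32_to_u24(0xABCDEFu)) == 0xABCDEFu);

	/* swapping twice is the identity */
	assert(s24_to_s32(bswap_s24(bswap_s24(s32_to_s24(0x123456)))) == 0x123456);
	return 0;
}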