audioconvert: simplify 24 bits handling

Make new uint24_t and int24_t types and use them to handle 24-bit samples. This makes it easier because we can iterate and copy the structs like other types.
2026-06-05 03:01:53 -04:00 · 2022-07-01 12:24:35 +02:00 · 2022-07-01 12:24:35 +02:00 · 817d5bd7a4
commit 817d5bd7a4
parent e395f62425
6 changed files with 204 additions and 247 deletions
--- a/spa/plugins/audioconvert/fmt-ops-avx2.c
+++ b/spa/plugins/audioconvert/fmt-ops-avx2.c
@ -147,7 +147,7 @@ void
 conv_s24_to_f32d_1s_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
 		uint32_t n_channels, uint32_t n_samples)
 {
-	const uint8_t *s = src;
+	const int24_t *s = src;
 	float *d0 = dst[0];
 	uint32_t n, unrolled;
 	__m128i in;
@ -164,21 +164,21 @@ conv_s24_to_f32d_1s_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA
 	for(n = 0; n < unrolled; n += 4) {
 		in = _mm_setr_epi32(
 			*((uint32_t*)&s[0 * n_channels]),
-			*((uint32_t*)&s[3 * n_channels]),
-			*((uint32_t*)&s[6 * n_channels]),
-			*((uint32_t*)&s[9 * n_channels]));
+			*((uint32_t*)&s[1 * n_channels]),
+			*((uint32_t*)&s[2 * n_channels]),
+			*((uint32_t*)&s[3 * n_channels]));
 		in = _mm_slli_epi32(in, 8);
 		in = _mm_srai_epi32(in, 8);
 		out = _mm_cvtepi32_ps(in);
 		out = _mm_mul_ps(out, factor);
 		_mm_store_ps(&d0[n], out);
-		s += 12 * n_channels;
+		s += 4 * n_channels;
 	}
 	for(; n < n_samples; n++) {
-		out = _mm_cvtsi32_ss(factor, read_s24(s));
+		out = _mm_cvtsi32_ss(factor, s24_to_s32(*s));
 		out = _mm_mul_ss(out, factor);
 		_mm_store_ss(&d0[n], out);
-		s += 3 * n_channels;
+		s += n_channels;
 	}
 }

@ -186,7 +186,7 @@ static void
 conv_s24_to_f32d_2s_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
 		uint32_t n_channels, uint32_t n_samples)
 {
-	const uint8_t *s = src;
+	const int24_t *s = src;
 	float *d0 = dst[0], *d1 = dst[1];
 	uint32_t n, unrolled;
 	__m128i in[2];
@ -205,14 +205,14 @@ conv_s24_to_f32d_2s_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA
 	for(n = 0; n < unrolled; n += 4) {
 		in[0] = _mm_setr_epi32(
 			*((uint32_t*)&s[0 + 0*n_channels]),
-			*((uint32_t*)&s[0 + 3*n_channels]),
-			*((uint32_t*)&s[0 + 6*n_channels]),
-			*((uint32_t*)&s[0 + 9*n_channels]));
+			*((uint32_t*)&s[0 + 1*n_channels]),
+			*((uint32_t*)&s[0 + 2*n_channels]),
+			*((uint32_t*)&s[0 + 3*n_channels]));
 		in[1] = _mm_setr_epi32(
-			*((uint32_t*)&s[3 + 0*n_channels]),
-			*((uint32_t*)&s[3 + 3*n_channels]),
-			*((uint32_t*)&s[3 + 6*n_channels]),
-			*((uint32_t*)&s[3 + 9*n_channels]));
+			*((uint32_t*)&s[1 + 0*n_channels]),
+			*((uint32_t*)&s[1 + 1*n_channels]),
+			*((uint32_t*)&s[1 + 2*n_channels]),
+			*((uint32_t*)&s[1 + 3*n_channels]));

 		in[0] = _mm_slli_epi32(in[0], 8);
 		in[1] = _mm_slli_epi32(in[1], 8);
@ -229,23 +229,23 @@ conv_s24_to_f32d_2s_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA
 		_mm_store_ps(&d0[n], out[0]);
 		_mm_store_ps(&d1[n], out[1]);

-		s += 12 * n_channels;
+		s += 4 * n_channels;
 	}
 	for(; n < n_samples; n++) {
-		out[0] = _mm_cvtsi32_ss(factor, read_s24(s));
-		out[1] = _mm_cvtsi32_ss(factor, read_s24(s+3));
+		out[0] = _mm_cvtsi32_ss(factor, s24_to_s32(*s));
+		out[1] = _mm_cvtsi32_ss(factor, s24_to_s32(*(s+1)));
 		out[0] = _mm_mul_ss(out[0], factor);
 		out[1] = _mm_mul_ss(out[1], factor);
 		_mm_store_ss(&d0[n], out[0]);
 		_mm_store_ss(&d1[n], out[1]);
-		s += 3 * n_channels;
+		s += n_channels;
 	}
 }
 static void
 conv_s24_to_f32d_4s_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
 		uint32_t n_channels, uint32_t n_samples)
 {
-	const uint8_t *s = src;
+	const int24_t *s = src;
 	float *d0 = dst[0], *d1 = dst[1], *d2 = dst[2], *d3 = dst[3];
 	uint32_t n, unrolled;
 	__m128i in[4];
@ -266,24 +266,24 @@ conv_s24_to_f32d_4s_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA
 	for(n = 0; n < unrolled; n += 4) {
 		in[0] = _mm_setr_epi32(
 			*((uint32_t*)&s[0 + 0*n_channels]),
-			*((uint32_t*)&s[0 + 3*n_channels]),
-			*((uint32_t*)&s[0 + 6*n_channels]),
-			*((uint32_t*)&s[0 + 9*n_channels]));
+			*((uint32_t*)&s[0 + 1*n_channels]),
+			*((uint32_t*)&s[0 + 2*n_channels]),
+			*((uint32_t*)&s[0 + 3*n_channels]));
 		in[1] = _mm_setr_epi32(
-			*((uint32_t*)&s[3 + 0*n_channels]),
-			*((uint32_t*)&s[3 + 3*n_channels]),
-			*((uint32_t*)&s[3 + 6*n_channels]),
-			*((uint32_t*)&s[3 + 9*n_channels]));
+			*((uint32_t*)&s[1 + 0*n_channels]),
+			*((uint32_t*)&s[1 + 1*n_channels]),
+			*((uint32_t*)&s[1 + 2*n_channels]),
+			*((uint32_t*)&s[1 + 3*n_channels]));
 		in[2] = _mm_setr_epi32(
-			*((uint32_t*)&s[6 + 0*n_channels]),
-			*((uint32_t*)&s[6 + 3*n_channels]),
-			*((uint32_t*)&s[6 + 6*n_channels]),
-			*((uint32_t*)&s[6 + 9*n_channels]));
+			*((uint32_t*)&s[2 + 0*n_channels]),
+			*((uint32_t*)&s[2 + 1*n_channels]),
+			*((uint32_t*)&s[2 + 2*n_channels]),
+			*((uint32_t*)&s[2 + 3*n_channels]));
 		in[3] = _mm_setr_epi32(
-			*((uint32_t*)&s[9 + 0*n_channels]),
-			*((uint32_t*)&s[9 + 3*n_channels]),
-			*((uint32_t*)&s[9 + 6*n_channels]),
-			*((uint32_t*)&s[9 + 9*n_channels]));
+			*((uint32_t*)&s[3 + 0*n_channels]),
+			*((uint32_t*)&s[3 + 1*n_channels]),
+			*((uint32_t*)&s[3 + 2*n_channels]),
+			*((uint32_t*)&s[3 + 3*n_channels]));

 		in[0] = _mm_slli_epi32(in[0], 8);
 		in[1] = _mm_slli_epi32(in[1], 8);
@ -310,13 +310,13 @@ conv_s24_to_f32d_4s_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA
 		_mm_store_ps(&d2[n], out[2]);
 		_mm_store_ps(&d3[n], out[3]);

-		s += 12 * n_channels;
+		s += 4 * n_channels;
 	}
 	for(; n < n_samples; n++) {
-		out[0] = _mm_cvtsi32_ss(factor, read_s24(s));
-		out[1] = _mm_cvtsi32_ss(factor, read_s24(s+3));
-		out[2] = _mm_cvtsi32_ss(factor, read_s24(s+6));
-		out[3] = _mm_cvtsi32_ss(factor, read_s24(s+9));
+		out[0] = _mm_cvtsi32_ss(factor, s24_to_s32(*s));
+		out[1] = _mm_cvtsi32_ss(factor, s24_to_s32(*(s+1)));
+		out[2] = _mm_cvtsi32_ss(factor, s24_to_s32(*(s+2)));
+		out[3] = _mm_cvtsi32_ss(factor, s24_to_s32(*(s+3)));
 		out[0] = _mm_mul_ss(out[0], factor);
 		out[1] = _mm_mul_ss(out[1], factor);
 		out[2] = _mm_mul_ss(out[2], factor);
@ -325,7 +325,7 @@ conv_s24_to_f32d_4s_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA
 		_mm_store_ss(&d1[n], out[1]);
 		_mm_store_ss(&d2[n], out[2]);
 		_mm_store_ss(&d3[n], out[3]);
-		s += 3 * n_channels;
+		s += n_channels;
 	}
 }

--- a/spa/plugins/audioconvert/fmt-ops-c.c
+++ b/spa/plugins/audioconvert/fmt-ops-c.c
@ -55,7 +55,7 @@ MAKE_COPY(24);
 MAKE_COPY(32);
 MAKE_COPY(64);

-#define MAKE_D_TO_D_F(sname,stype,dname,dtype,func) 				\
+#define MAKE_D_TO_D(sname,stype,dname,dtype,func) 				\
 void conv_ ##sname## d_to_ ##dname## d_c(struct convert *conv,			\
 		void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[],	\
                uint32_t n_samples)						\
@ -64,15 +64,12 @@ void conv_ ##sname## d_to_ ##dname## d_c(struct convert *conv,			\
 	for (i = 0; i < n_channels; i++) {					\
 		const stype *s = src[i];					\
 		dtype *d = dst[i];						\
-		for (j = 0; j < n_samples; j++) {				\
-			func;							\
-		}								\
+		for (j = 0; j < n_samples; j++)					\
+			d[j] = func (s[j]);			 		\
 	}									\
 }
-#define MAKE_D_TO_D(sname,stype,dname,dtype,func) 				\
-	MAKE_D_TO_D_F(sname,stype,dname,dtype, d[j] = func (s[j])) 		\

-#define MAKE_I_TO_I_F(sname,stype,dname,dtype,func) 				\
+#define MAKE_I_TO_I(sname,stype,dname,dtype,func) 				\
 void conv_ ##sname## _to_ ##dname## _c(struct convert *conv,			\
 		void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[],	\
                uint32_t n_samples)						\
@ -81,14 +78,11 @@ void conv_ ##sname## _to_ ##dname## _c(struct convert *conv,			\
 	const stype *s = src[0];						\
 	dtype *d = dst[0];							\
 	n_samples *= conv->n_channels;						\
-	for (j = 0; j < n_samples; j++) {					\
-		func;								\
-	}									\
+	for (j = 0; j < n_samples; j++)						\
+		d[j] = func (s[j]);				 		\
 }
-#define MAKE_I_TO_I(sname,stype,dname,dtype,func) 				\
-	MAKE_I_TO_I_F(sname,stype,dname,dtype, d[j] = func (s[j])) 		\

-#define MAKE_I_TO_D_F(sname,stype,dname,dtype,func) 				\
+#define MAKE_I_TO_D(sname,stype,dname,dtype,func) 				\
 void conv_ ##sname## _to_ ##dname## d_c(struct convert *conv,			\
 		void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[],	\
                uint32_t n_samples)						\
@ -97,15 +91,12 @@ void conv_ ##sname## _to_ ##dname## d_c(struct convert *conv,			\
 	dtype **d = (dtype**)dst;						\
 	uint32_t i, j, n_channels = conv->n_channels;				\
 	for (j = 0; j < n_samples; j++) {					\
-		for (i = 0; i < n_channels; i++) {				\
-			func;							\
-		}								\
+		for (i = 0; i < n_channels; i++)				\
+			d[i][j] = func (*s++);			 		\
 	}									\
 }
-#define MAKE_I_TO_D(sname,stype,dname,dtype,func) 				\
-	MAKE_I_TO_D_F(sname,stype,dname,dtype, d[i][j] = func (*s++)) 		\

-#define MAKE_D_TO_I_F(sname,stype,dname,dtype,func) 				\
+#define MAKE_D_TO_I(sname,stype,dname,dtype,func) 				\
 void conv_ ##sname## d_to_ ##dname## _c(struct convert *conv,			\
 		void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[],	\
                uint32_t n_samples)						\
@ -114,13 +105,10 @@ void conv_ ##sname## d_to_ ##dname## _c(struct convert *conv,			\
 	dtype *d = dst[0];							\
 	uint32_t i, j, n_channels = conv->n_channels;				\
 	for (j = 0; j < n_samples; j++) {					\
-		for (i = 0; i < n_channels; i++) {				\
-			func;							\
-		}								\
+		for (i = 0; i < n_channels; i++)				\
+			*d++ = func (s[i][j]);			 		\
 	}									\
 }
-#define MAKE_D_TO_I(sname,stype,dname,dtype,func) 				\
-	MAKE_D_TO_I_F(sname,stype,dname,dtype, *d++ = func (s[i][j])) 		\

 /* to f32 */
 MAKE_D_TO_D(u8, uint8_t, f32, float, U8_TO_F32);
@ -154,14 +142,14 @@ MAKE_I_TO_D(s32, int32_t, f32, float, S32_TO_F32);
 MAKE_D_TO_I(s32, int32_t, f32, float, S32_TO_F32);
 MAKE_I_TO_D(s32s, uint32_t, f32, float, S32S_TO_F32);

-MAKE_I_TO_I_F(u24, uint8_t, f32, float, d[j] = U24_TO_F32(read_u24(s)); s += 3);
-MAKE_I_TO_D_F(u24, uint8_t, f32, float, d[i][j] = U24_TO_F32(read_u24(s)); s += 3);
+MAKE_I_TO_I(u24, uint24_t, f32, float, U24_TO_F32);
+MAKE_I_TO_D(u24, uint24_t, f32, float, U24_TO_F32);

-MAKE_D_TO_D_F(s24, int8_t, f32, float, d[j] = S24_TO_F32(read_s24(s)); s += 3);
-MAKE_I_TO_I_F(s24, int8_t, f32, float, d[j] = S24_TO_F32(read_s24(s)); s += 3);
-MAKE_I_TO_D_F(s24, int8_t, f32, float, d[i][j] = S24_TO_F32(read_s24(s)); s += 3);
-MAKE_D_TO_I_F(s24, int8_t, f32, float, *d++ = S24_TO_F32(read_s24(&s[i][j*3])));
-MAKE_I_TO_D_F(s24s, int8_t, f32, float, d[i][j] = S24_TO_F32(read_s24s(s)); s += 3);
+MAKE_D_TO_D(s24, int24_t, f32, float, S24_TO_F32);
+MAKE_I_TO_I(s24, int24_t, f32, float, S24_TO_F32);
+MAKE_I_TO_D(s24, int24_t, f32, float, S24_TO_F32);
+MAKE_D_TO_I(s24, int24_t, f32, float, S24_TO_F32);
+MAKE_I_TO_D(s24s, int24_t, f32, float, S24S_TO_F32);

 MAKE_I_TO_I(u24_32, uint32_t, f32, float, U24_32_TO_F32);
 MAKE_I_TO_D(u24_32, uint32_t, f32, float, U24_32_TO_F32);
@ -211,14 +199,14 @@ MAKE_I_TO_D(f32, float, s32, int32_t, F32_TO_S32);
 MAKE_D_TO_I(f32, float, s32, int32_t, F32_TO_S32);
 MAKE_D_TO_I(f32, float, s32s, uint32_t, F32_TO_S32S);

-MAKE_I_TO_I_F(f32, float, u24, uint8_t, write_u24(d, F32_TO_U24(s[j])); d += 3);
-MAKE_D_TO_I_F(f32, float, u24, uint8_t, write_u24(d, F32_TO_U24(s[i][j])); d += 3);
+MAKE_I_TO_I(f32, float, u24, uint24_t, F32_TO_U24);
+MAKE_D_TO_I(f32, float, u24, uint24_t, F32_TO_U24);

-MAKE_D_TO_D_F(f32, float, s24, uint8_t, write_s24(d, F32_TO_S24(s[j])); d += 3);
-MAKE_I_TO_I_F(f32, float, s24, uint8_t, write_s24(d, F32_TO_S24(s[j])); d += 3);
-MAKE_I_TO_D_F(f32, float, s24, uint8_t, write_s24(&d[i][j*3], F32_TO_S24(*s++)));
-MAKE_D_TO_I_F(f32, float, s24, uint8_t, write_s24(d, F32_TO_S24(s[i][j])); d += 3);
-MAKE_D_TO_I_F(f32, float, s24s, uint8_t, write_s24s(d, F32_TO_S24(s[i][j])); d += 3);
+MAKE_D_TO_D(f32, float, s24, int24_t, F32_TO_S24);
+MAKE_I_TO_I(f32, float, s24, int24_t, F32_TO_S24);
+MAKE_I_TO_D(f32, float, s24, int24_t, F32_TO_S24);
+MAKE_D_TO_I(f32, float, s24, int24_t, F32_TO_S24);
+MAKE_D_TO_I(f32, float, s24s, int24_t, F32_TO_S24S);

 MAKE_I_TO_I(f32, float, u24_32, uint32_t, F32_TO_U24_32);
 MAKE_D_TO_I(f32, float, u24_32, uint32_t, F32_TO_U24_32);
@ -253,7 +241,7 @@ static inline void update_dither_c(struct convert *conv, uint32_t n_samples)
 		dither[n] = lcnoise(state) * scale;
 }

-#define MAKE_D_dither_F(dname,dtype,func) 					\
+#define MAKE_D_dither(dname,dtype,func) 					\
 void conv_f32d_to_ ##dname## d_dither_c(struct convert *conv,			\
 		void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[],	\
                uint32_t n_samples)						\
@ -266,16 +254,13 @@ void conv_f32d_to_ ##dname## d_dither_c(struct convert *conv,			\
 		dtype *d = dst[i];						\
 		for (j = 0; j < n_samples;) {					\
 			chunk = SPA_MIN(n_samples - j, dither_size);		\
-			for (k = 0; k < chunk; k++, j++) {			\
-				func;						\
-			}							\
+			for (k = 0; k < chunk; k++, j++)			\
+				d[j] = func (s[j], dither[k]);			\
 		}								\
 	}									\
 }
-#define MAKE_D_dither(dname,dtype,func) 					\
-	MAKE_D_dither_F(dname,dtype, d[j] = func (s[j], dither[k]))		\

-#define MAKE_I_dither_F(dname,dtype,func) 					\
+#define MAKE_I_dither(dname,dtype,func) 					\
 void conv_f32d_to_ ##dname## _dither_c(struct convert *conv,			\
 		void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[],	\
                uint32_t n_samples)						\
@ -288,14 +273,11 @@ void conv_f32d_to_ ##dname## _dither_c(struct convert *conv,			\
 	for (j = 0; j < n_samples;) {						\
 		chunk = SPA_MIN(n_samples - j, dither_size);			\
 		for (k = 0; k < chunk; k++, j++) {				\
-			for (i = 0; i < n_channels; i++) {			\
-				func;						\
-			}							\
+			for (i = 0; i < n_channels; i++)			\
+				*d++ = func (s[i][j], dither[k]);		\
 		}								\
 	}									\
 }
-#define MAKE_I_dither(dname,dtype,func) 					\
-	MAKE_I_dither_F(dname,dtype, *d++ = func (s[i][j], dither[k]))		\

 MAKE_D_dither(u8, uint8_t, F32_TO_U8_D);
 MAKE_I_dither(u8, uint8_t, F32_TO_U8_D);
@ -307,9 +289,9 @@ MAKE_I_dither(s16s, uint16_t, F32_TO_S16S_D);
 MAKE_D_dither(s32, int32_t, F32_TO_S32_D);
 MAKE_I_dither(s32, int32_t, F32_TO_S32_D);
 MAKE_I_dither(s32s, uint32_t, F32_TO_S32S_D);
-MAKE_D_dither_F(s24, uint8_t, write_s24(d, F32_TO_S24_D(s[j], dither[k])); d += 3);
-MAKE_I_dither_F(s24, uint8_t, write_s24(d, F32_TO_S24_D(s[i][j], dither[k])); d += 3);
-MAKE_I_dither_F(s24s, uint8_t, write_s24s(d, F32_TO_S24_D(s[i][j], dither[k])); d += 3);
+MAKE_D_dither(s24, int24_t, F32_TO_S24_D);
+MAKE_I_dither(s24, int24_t, F32_TO_S24_D);
+#define F32_TO_S24S_D(v,d)	bswap_s24(F32_TO_S24_D(v,d))	/* dithered + byte-swapped, replaces write_s24s() */
+MAKE_I_dither(s24s, int24_t, F32_TO_S24S_D);
 MAKE_D_dither(s24_32, int32_t, F32_TO_S24_32_D);
 MAKE_I_dither(s24_32, int32_t, F32_TO_S24_32_D);
 MAKE_I_dither(s24_32s, int32_t, F32_TO_S24_32S_D);
@ -335,7 +317,7 @@ MAKE_I_dither(s24_32s, int32_t, F32_TO_S24_32S_D);
 #define F32_TO_S16_SH(s,sh,d)	SHAPER5(int16_t, s, S16_SCALE, 0, sh, S16_MIN, S16_MAX, d)
 #define F32_TO_S16S_SH(s,sh,d)	bswap_16(F32_TO_S16_SH(s,sh,d))

-#define MAKE_D_shaped_F(dname,dtype,func) 					\
+#define MAKE_D_shaped(dname,dtype,func) 					\
 void conv_f32d_to_ ##dname## d_shaped_c(struct convert *conv,			\
 		void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[],	\
                uint32_t n_samples)						\
@ -350,17 +332,14 @@ void conv_f32d_to_ ##dname## d_shaped_c(struct convert *conv,			\
 		uint32_t idx = sh->idx;						\
 		for (j = 0; j < n_samples;) {					\
 			chunk = SPA_MIN(n_samples - j, dither_size);		\
-			for (k = 0; k < chunk; k++, j++) {			\
-				func;						\
-			}							\
+			for (k = 0; k < chunk; k++, j++)			\
+				d[j] = func (s[j], sh, dither[k]);		\
 		}								\
 		sh->idx = idx;							\
 	}									\
 }
-#define MAKE_D_shaped(dname,dtype,func) 					\
-	MAKE_D_shaped_F(dname,dtype, d[j] = func (s[j], sh, dither[k]))		\

-#define MAKE_I_shaped_F(dname,dtype,func) 					\
+#define MAKE_I_shaped(dname,dtype,func) 					\
 void conv_f32d_to_ ##dname## _shaped_c(struct convert *conv,			\
 		void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[],	\
                uint32_t n_samples)						\
@ -376,15 +355,12 @@ void conv_f32d_to_ ##dname## _shaped_c(struct convert *conv,			\
 		uint32_t idx = sh->idx;						\
 		for (j = 0; j < n_samples;) {					\
 			chunk = SPA_MIN(n_samples - j, dither_size);		\
-			for (k = 0; k < chunk; k++, j++) {			\
-				func;						\
-			}							\
+			for (k = 0; k < chunk; k++, j++)			\
+				d[j*n_channels] = func (s[j], sh, dither[k]);	\
 		}								\
 		sh->idx = idx;							\
 	}									\
 }
-#define MAKE_I_shaped(dname,dtype,func) 						\
-	MAKE_I_shaped_F(dname,dtype, d[j*n_channels] = func (s[j], sh, dither[k]))	\

 MAKE_D_shaped(u8, uint8_t, F32_TO_U8_SH);
 MAKE_I_shaped(u8, uint8_t, F32_TO_U8_SH);
@ -395,23 +371,21 @@ MAKE_I_shaped(s16, int16_t, F32_TO_S16_SH);
 MAKE_I_shaped(s16s, uint16_t, F32_TO_S16S_SH);

 #define MAKE_DEINTERLEAVE(size,type,func)					\
-	MAKE_I_TO_D_F(size,type,size,type,func)
-#define DEINTERLEAVE_COPY	(d[i][j] = *s++)
+	MAKE_I_TO_D(size,type,size,type,func)

-MAKE_DEINTERLEAVE(8, uint8_t, DEINTERLEAVE_COPY);
-MAKE_DEINTERLEAVE(16, uint16_t, DEINTERLEAVE_COPY);
-MAKE_DEINTERLEAVE(24, uint8_t, write_s24(&d[i][j*3], read_s24(s)); s+=3);
-MAKE_DEINTERLEAVE(32, uint32_t, DEINTERLEAVE_COPY);
-MAKE_DEINTERLEAVE(32s, uint32_t, d[i][j] = bswap_32(*s++));
-MAKE_DEINTERLEAVE(64, uint64_t, DEINTERLEAVE_COPY);
+MAKE_DEINTERLEAVE(8, uint8_t, (uint8_t));
+MAKE_DEINTERLEAVE(16, uint16_t, (uint16_t));
+MAKE_DEINTERLEAVE(24, uint24_t, (uint24_t));
+MAKE_DEINTERLEAVE(32, uint32_t, (uint32_t));
+MAKE_DEINTERLEAVE(32s, uint32_t, bswap_32);
+MAKE_DEINTERLEAVE(64, uint64_t, (uint64_t));

 #define MAKE_INTERLEAVE(size,type,func)						\
-	MAKE_D_TO_I_F(size,type,size,type,func)
-#define INTERLEAVE_COPY		(*d++ = s[i][j])
+	MAKE_D_TO_I(size,type,size,type,func)

-MAKE_INTERLEAVE(8, uint8_t, INTERLEAVE_COPY);
-MAKE_INTERLEAVE(16, uint16_t, INTERLEAVE_COPY);
-MAKE_INTERLEAVE(24, uint8_t, write_s24(d, read_s24(&s[i][j*3])); d+=3);
-MAKE_INTERLEAVE(32, uint32_t, INTERLEAVE_COPY);
-MAKE_INTERLEAVE(32s, uint32_t, *d++ = bswap_32(s[i][j]));
-MAKE_INTERLEAVE(64, uint64_t, INTERLEAVE_COPY);
+MAKE_INTERLEAVE(8, uint8_t, (uint8_t));
+MAKE_INTERLEAVE(16, uint16_t, (uint16_t));
+MAKE_INTERLEAVE(24, uint24_t, (uint24_t));
+MAKE_INTERLEAVE(32, uint32_t, (uint32_t));
+MAKE_INTERLEAVE(32s, uint32_t, bswap_32);
+MAKE_INTERLEAVE(64, uint64_t, (uint64_t));
--- a/spa/plugins/audioconvert/fmt-ops-sse2.c
+++ b/spa/plugins/audioconvert/fmt-ops-sse2.c
@ -132,7 +132,7 @@ void
 conv_s24_to_f32d_1s_sse2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
 		uint32_t n_channels, uint32_t n_samples)
 {
-	const uint8_t *s = src;
+	const int24_t *s = src;
 	float *d0 = dst[0];
 	uint32_t n, unrolled;
 	__m128i in;
@ -149,21 +149,21 @@ conv_s24_to_f32d_1s_sse2(void *data, void * SPA_RESTRICT dst[], const void * SPA
 	for(n = 0; n < unrolled; n += 4) {
 		in = _mm_setr_epi32(
 			*((uint32_t*)&s[0 * n_channels]),
-			*((uint32_t*)&s[3 * n_channels]),
-			*((uint32_t*)&s[6 * n_channels]),
-			*((uint32_t*)&s[9 * n_channels]));
+			*((uint32_t*)&s[1 * n_channels]),
+			*((uint32_t*)&s[2 * n_channels]),
+			*((uint32_t*)&s[3 * n_channels]));
 		in = _mm_slli_epi32(in, 8);
 		in = _mm_srai_epi32(in, 8);
 		out = _mm_cvtepi32_ps(in);
 		out = _mm_mul_ps(out, factor);
 		_mm_store_ps(&d0[n], out);
-		s += 12 * n_channels;
+		s += 4 * n_channels;
 	}
 	for(; n < n_samples; n++) {
-		out = _mm_cvtsi32_ss(factor, read_s24(s));
+		out = _mm_cvtsi32_ss(factor, s24_to_s32(*s));
 		out = _mm_mul_ss(out, factor);
 		_mm_store_ss(&d0[n], out);
-		s += 3 * n_channels;
+		s += n_channels;
 	}
 }

@ -171,7 +171,7 @@ static void
 conv_s24_to_f32d_2s_sse2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
 		uint32_t n_channels, uint32_t n_samples)
 {
-	const uint8_t *s = src;
+	const int24_t *s = src;
 	float *d0 = dst[0], *d1 = dst[1];
 	uint32_t n, unrolled;
 	__m128i in[2];
@ -190,14 +190,14 @@ conv_s24_to_f32d_2s_sse2(void *data, void * SPA_RESTRICT dst[], const void * SPA
 	for(n = 0; n < unrolled; n += 4) {
 		in[0] = _mm_setr_epi32(
 			*((uint32_t*)&s[0 + 0*n_channels]),
-			*((uint32_t*)&s[0 + 3*n_channels]),
-			*((uint32_t*)&s[0 + 6*n_channels]),
-			*((uint32_t*)&s[0 + 9*n_channels]));
+			*((uint32_t*)&s[0 + 1*n_channels]),
+			*((uint32_t*)&s[0 + 2*n_channels]),
+			*((uint32_t*)&s[0 + 3*n_channels]));
 		in[1] = _mm_setr_epi32(
-			*((uint32_t*)&s[3 + 0*n_channels]),
-			*((uint32_t*)&s[3 + 3*n_channels]),
-			*((uint32_t*)&s[3 + 6*n_channels]),
-			*((uint32_t*)&s[3 + 9*n_channels]));
+			*((uint32_t*)&s[1 + 0*n_channels]),
+			*((uint32_t*)&s[1 + 1*n_channels]),
+			*((uint32_t*)&s[1 + 2*n_channels]),
+			*((uint32_t*)&s[1 + 3*n_channels]));

 		in[0] = _mm_slli_epi32(in[0], 8);
 		in[1] = _mm_slli_epi32(in[1], 8);
@ -214,23 +214,23 @@ conv_s24_to_f32d_2s_sse2(void *data, void * SPA_RESTRICT dst[], const void * SPA
 		_mm_store_ps(&d0[n], out[0]);
 		_mm_store_ps(&d1[n], out[1]);

-		s += 12 * n_channels;
+		s += 4 * n_channels;
 	}
 	for(; n < n_samples; n++) {
-		out[0] = _mm_cvtsi32_ss(factor, read_s24(s));
-		out[1] = _mm_cvtsi32_ss(factor, read_s24(s+3));
+		out[0] = _mm_cvtsi32_ss(factor, s24_to_s32(*s));
+		out[1] = _mm_cvtsi32_ss(factor, s24_to_s32(*(s+1)));
 		out[0] = _mm_mul_ss(out[0], factor);
 		out[1] = _mm_mul_ss(out[1], factor);
 		_mm_store_ss(&d0[n], out[0]);
 		_mm_store_ss(&d1[n], out[1]);
-		s += 3 * n_channels;
+		s += n_channels;
 	}
 }
 static void
 conv_s24_to_f32d_4s_sse2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
 		uint32_t n_channels, uint32_t n_samples)
 {
-	const uint8_t *s = src;
+	const int24_t *s = src;
 	float *d0 = dst[0], *d1 = dst[1], *d2 = dst[2], *d3 = dst[3];
 	uint32_t n, unrolled;
 	__m128i in[4];
@ -251,24 +251,24 @@ conv_s24_to_f32d_4s_sse2(void *data, void * SPA_RESTRICT dst[], const void * SPA
 	for(n = 0; n < unrolled; n += 4) {
 		in[0] = _mm_setr_epi32(
 			*((uint32_t*)&s[0 + 0*n_channels]),
-			*((uint32_t*)&s[0 + 3*n_channels]),
-			*((uint32_t*)&s[0 + 6*n_channels]),
-			*((uint32_t*)&s[0 + 9*n_channels]));
+			*((uint32_t*)&s[0 + 1*n_channels]),
+			*((uint32_t*)&s[0 + 2*n_channels]),
+			*((uint32_t*)&s[0 + 3*n_channels]));
 		in[1] = _mm_setr_epi32(
-			*((uint32_t*)&s[3 + 0*n_channels]),
-			*((uint32_t*)&s[3 + 3*n_channels]),
-			*((uint32_t*)&s[3 + 6*n_channels]),
-			*((uint32_t*)&s[3 + 9*n_channels]));
+			*((uint32_t*)&s[1 + 0*n_channels]),
+			*((uint32_t*)&s[1 + 1*n_channels]),
+			*((uint32_t*)&s[1 + 2*n_channels]),
+			*((uint32_t*)&s[1 + 3*n_channels]));
 		in[2] = _mm_setr_epi32(
-			*((uint32_t*)&s[6 + 0*n_channels]),
-			*((uint32_t*)&s[6 + 3*n_channels]),
-			*((uint32_t*)&s[6 + 6*n_channels]),
-			*((uint32_t*)&s[6 + 9*n_channels]));
+			*((uint32_t*)&s[2 + 0*n_channels]),
+			*((uint32_t*)&s[2 + 1*n_channels]),
+			*((uint32_t*)&s[2 + 2*n_channels]),
+			*((uint32_t*)&s[2 + 3*n_channels]));
 		in[3] = _mm_setr_epi32(
-			*((uint32_t*)&s[9 + 0*n_channels]),
-			*((uint32_t*)&s[9 + 3*n_channels]),
-			*((uint32_t*)&s[9 + 6*n_channels]),
-			*((uint32_t*)&s[9 + 9*n_channels]));
+			*((uint32_t*)&s[3 + 0*n_channels]),
+			*((uint32_t*)&s[3 + 1*n_channels]),
+			*((uint32_t*)&s[3 + 2*n_channels]),
+			*((uint32_t*)&s[3 + 3*n_channels]));

 		in[0] = _mm_slli_epi32(in[0], 8);
 		in[1] = _mm_slli_epi32(in[1], 8);
@ -295,13 +295,13 @@ conv_s24_to_f32d_4s_sse2(void *data, void * SPA_RESTRICT dst[], const void * SPA
 		_mm_store_ps(&d2[n], out[2]);
 		_mm_store_ps(&d3[n], out[3]);

-		s += 12 * n_channels;
+		s += 4 * n_channels;
 	}
 	for(; n < n_samples; n++) {
-		out[0] = _mm_cvtsi32_ss(factor, read_s24(s));
-		out[1] = _mm_cvtsi32_ss(factor, read_s24(s+3));
-		out[2] = _mm_cvtsi32_ss(factor, read_s24(s+6));
-		out[3] = _mm_cvtsi32_ss(factor, read_s24(s+9));
+		out[0] = _mm_cvtsi32_ss(factor, s24_to_s32(*s));
+		out[1] = _mm_cvtsi32_ss(factor, s24_to_s32(*(s+1)));
+		out[2] = _mm_cvtsi32_ss(factor, s24_to_s32(*(s+2)));
+		out[3] = _mm_cvtsi32_ss(factor, s24_to_s32(*(s+3)));
 		out[0] = _mm_mul_ss(out[0], factor);
 		out[1] = _mm_mul_ss(out[1], factor);
 		out[2] = _mm_mul_ss(out[2], factor);
@ -310,7 +310,7 @@ conv_s24_to_f32d_4s_sse2(void *data, void * SPA_RESTRICT dst[], const void * SPA
 		_mm_store_ss(&d1[n], out[1]);
 		_mm_store_ss(&d2[n], out[2]);
 		_mm_store_ss(&d3[n], out[3]);
-		s += 3 * n_channels;
+		s += n_channels;
 	}
 }

--- a/spa/plugins/audioconvert/fmt-ops-sse41.c
+++ b/spa/plugins/audioconvert/fmt-ops-sse41.c
@ -30,7 +30,7 @@ static void
 conv_s24_to_f32d_1s_sse41(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
 		uint32_t n_channels, uint32_t n_samples)
 {
-	const uint8_t *s = src;
+	const int24_t *s = src;
 	float *d0 = dst[0];
 	uint32_t n, unrolled;
 	__m128i in = _mm_setzero_si128();
@ -43,21 +43,21 @@ conv_s24_to_f32d_1s_sse41(void *data, void * SPA_RESTRICT dst[], const void * SP

 	for(n = 0; n < unrolled; n += 4) {
 		in = _mm_insert_epi32(in, *((uint32_t*)&s[0 * n_channels]), 0);
-		in = _mm_insert_epi32(in, *((uint32_t*)&s[3 * n_channels]), 1);
-		in = _mm_insert_epi32(in, *((uint32_t*)&s[6 * n_channels]), 2);
-		in = _mm_insert_epi32(in, *((uint32_t*)&s[9 * n_channels]), 3);
+		in = _mm_insert_epi32(in, *((uint32_t*)&s[1 * n_channels]), 1);
+		in = _mm_insert_epi32(in, *((uint32_t*)&s[2 * n_channels]), 2);
+		in = _mm_insert_epi32(in, *((uint32_t*)&s[3 * n_channels]), 3);
 		in = _mm_slli_epi32(in, 8);
 		in = _mm_srai_epi32(in, 8);
 		out = _mm_cvtepi32_ps(in);
 		out = _mm_mul_ps(out, factor);
 		_mm_store_ps(&d0[n], out);
-		s += 12 * n_channels;
+		s += 4 * n_channels;
 	}
 	for(; n < n_samples; n++) {
-		out = _mm_cvtsi32_ss(factor, read_s24(s));
+		out = _mm_cvtsi32_ss(factor, s24_to_s32(*s));
 		out = _mm_mul_ss(out, factor);
 		_mm_store_ss(&d0[n], out);
-		s += 3 * n_channels;
+		s += n_channels;
 	}
 }

--- a/spa/plugins/audioconvert/fmt-ops-ssse3.c
+++ b/spa/plugins/audioconvert/fmt-ops-ssse3.c
@ -30,7 +30,7 @@ static void
 conv_s24_to_f32d_4s_ssse3(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
 		uint32_t n_channels, uint32_t n_samples)
 {
-	const uint8_t *s = src;
+	const int24_t *s = src;
 	float *d0 = dst[0], *d1 = dst[1], *d2 = dst[2], *d3 = dst[3];
 	uint32_t n, unrolled;
 	__m128i in[4];
@ -48,9 +48,9 @@ conv_s24_to_f32d_4s_ssse3(void *data, void * SPA_RESTRICT dst[], const void * SP

 	for(n = 0; n < unrolled; n += 4) {
                in[0] = _mm_loadu_si128((__m128i*)(s + 0*n_channels));
-                in[1] = _mm_loadu_si128((__m128i*)(s + 3*n_channels));
-                in[2] = _mm_loadu_si128((__m128i*)(s + 6*n_channels));
-                in[3] = _mm_loadu_si128((__m128i*)(s + 9*n_channels));
+                in[1] = _mm_loadu_si128((__m128i*)(s + 1*n_channels));
+                in[2] = _mm_loadu_si128((__m128i*)(s + 2*n_channels));
+                in[3] = _mm_loadu_si128((__m128i*)(s + 3*n_channels));
 		in[0] = _mm_shuffle_epi8(in[0], mask);
 		in[1] = _mm_shuffle_epi8(in[1], mask);
 		in[2] = _mm_shuffle_epi8(in[2], mask);
@ -74,13 +74,13 @@ conv_s24_to_f32d_4s_ssse3(void *data, void * SPA_RESTRICT dst[], const void * SP
 		_mm_store_ps(&d1[n], out[1]);
 		_mm_store_ps(&d2[n], out[2]);
 		_mm_store_ps(&d3[n], out[3]);
-		s += 12 * n_channels;
+		s += 4 * n_channels;
 	}
 	for(; n < n_samples; n++) {
-		out[0] = _mm_cvtsi32_ss(factor, read_s24(s));
-		out[1] = _mm_cvtsi32_ss(factor, read_s24(s+3));
-		out[2] = _mm_cvtsi32_ss(factor, read_s24(s+6));
-		out[3] = _mm_cvtsi32_ss(factor, read_s24(s+9));
+		out[0] = _mm_cvtsi32_ss(factor, s24_to_s32(*s));
+		out[1] = _mm_cvtsi32_ss(factor, s24_to_s32(*(s+1)));
+		out[2] = _mm_cvtsi32_ss(factor, s24_to_s32(*(s+2)));
+		out[3] = _mm_cvtsi32_ss(factor, s24_to_s32(*(s+3)));
 		out[0] = _mm_mul_ss(out[0], factor);
 		out[1] = _mm_mul_ss(out[1], factor);
 		out[2] = _mm_mul_ss(out[2], factor);
@ -89,7 +89,7 @@ conv_s24_to_f32d_4s_ssse3(void *data, void * SPA_RESTRICT dst[], const void * SP
 		_mm_store_ss(&d1[n], out[1]);
 		_mm_store_ss(&d2[n], out[2]);
 		_mm_store_ss(&d3[n], out[3]);
-		s += 3 * n_channels;
+		s += n_channels;
 	}
 }

--- a/spa/plugins/audioconvert/fmt-ops.h
+++ b/spa/plugins/audioconvert/fmt-ops.h
@ -79,17 +79,19 @@
 #define U24_MAX			16777215u
 #define U24_SCALE		8388607.5f
 #define U24_OFFS		8388608.f
-#define U24_TO_F32(v)		((((uint32_t)(v)) * (1.0f / U24_OFFS)) - 1.0)
-#define F32_TO_U24(v)		(uint32_t)SPA_CLAMP((v) * U24_SCALE + U24_OFFS, U24_MIN, U24_MAX)
-#define F32_TO_U24_D(v,d)	(uint32_t)SPA_CLAMP((v) * U24_SCALE + U24_OFFS + (d), U24_MIN, U24_MAX)
+#define U24_TO_F32(v)		((u24_to_u32(v) * (1.0f / U24_OFFS)) - 1.0)
+#define F32_TO_U24(v)		u32_to_u24(SPA_CLAMP((v) * U24_SCALE + U24_OFFS, U24_MIN, U24_MAX))
+#define F32_TO_U24_D(v,d)	u32_to_u24(SPA_CLAMP((v) * U24_SCALE + U24_OFFS + (d), U24_MIN, U24_MAX))

 #define S24_MIN			-8388607
 #define S24_MAX			8388607
 #define S24_MAX_F		8388607.0f
 #define S24_SCALE		8388607.0f
-#define S24_TO_F32(v)		(((int32_t)(v)) * (1.0f / S24_SCALE))
-#define F32_TO_S24(v)		(int32_t)SPA_CLAMP((v) * S24_SCALE, S24_MIN, S24_MAX)
-#define F32_TO_S24_D(v,d)	(int32_t)SPA_CLAMP((v) * S24_SCALE + (d), S24_MIN, S24_MAX)
+#define S24_TO_F32(v)		(s24_to_s32(v) * (1.0f / S24_SCALE))
+#define S24S_TO_F32(v)		(s24_to_s32(bswap_s24(v)) * (1.0f / S24_SCALE))
+#define F32_TO_S24(v)		s32_to_s24(SPA_CLAMP((v) * S24_SCALE, S24_MIN, S24_MAX))
+#define F32_TO_S24S(v)		bswap_s24(F32_TO_S24(v))
+#define F32_TO_S24_D(v,d)	s32_to_s24(SPA_CLAMP((v) * S24_SCALE + (d), S24_MIN, S24_MAX))

 #define U32_MIN			0u
 #define U32_MAX			4294967040u
@ -112,88 +114,69 @@

 #define U24_32_TO_F32(v)	U32_TO_F32((v)<<8)
 #define U24_32S_TO_F32(v)	U32_TO_F32(((int32_t)bswap_32(v))<<8)
-#define F32_TO_U24_32(v)	F32_TO_U24(v)
-#define F32_TO_U24_32S(v)	bswap_32(F32_TO_U24(v))
-#define F32_TO_U24_32_D(v,d)	F32_TO_U24_D(v,d)
-#define F32_TO_U24_32S_D(v,d)	bswap_32(F32_TO_U24_D(v,d))
+#define F32_TO_U24_32(v)	(uint32_t)SPA_CLAMP((v) * U24_SCALE + U24_OFFS, U24_MIN, U24_MAX)
+#define F32_TO_U24_32S(v)	bswap_32(F32_TO_U24_32(v))
+#define F32_TO_U24_32_D(v,d)	(uint32_t)SPA_CLAMP((v) * U24_SCALE + U24_OFFS + (d), U24_MIN, U24_MAX)
+#define F32_TO_U24_32S_D(v,d)	bswap_32(F32_TO_U24_32_D(v,d))

 #define S24_32_TO_F32(v)	S32_TO_F32((v)<<8)
 #define S24_32S_TO_F32(v)	S32_TO_F32(((int32_t)bswap_32(v))<<8)
-#define F32_TO_S24_32(v)	F32_TO_S24(v)
-#define F32_TO_S24_32S(v)	bswap_32(F32_TO_S24(v))
-#define F32_TO_S24_32_D(v,d)	F32_TO_S24_D(v,d)
-#define F32_TO_S24_32S_D(v,d)	bswap_32(F32_TO_S24_D(v,d))
+#define F32_TO_S24_32(v)	(int32_t)SPA_CLAMP((v) * S24_SCALE, S24_MIN, S24_MAX)
+#define F32_TO_S24_32S(v)	bswap_32(F32_TO_S24_32(v))
+#define F32_TO_S24_32_D(v,d)	(int32_t)SPA_CLAMP((v) * S24_SCALE + (d), S24_MIN, S24_MAX)
+#define F32_TO_S24_32S_D(v,d)	bswap_32(F32_TO_S24_32_D(v,d))

-static inline uint32_t read_u24(const void *src)
-{
-	const uint8_t *s = src;
+typedef struct {
 #if __BYTE_ORDER == __LITTLE_ENDIAN
-	return (((uint32_t)s[2] << 16) | ((uint32_t)(uint8_t)s[1] << 8) | (uint32_t)(uint8_t)s[0]);
+	uint8_t v3;
+	uint8_t v2;
+	uint8_t v1;
 #else
-	return (((uint32_t)s[0] << 16) | ((uint32_t)(uint8_t)s[1] << 8) | (uint32_t)(uint8_t)s[2]);
+	uint8_t v1;
+	uint8_t v2;
+	uint8_t v3;
 #endif
+} __attribute__ ((packed)) uint24_t;
+
+typedef struct {
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+	uint8_t v3;
+	uint8_t v2;
+	int8_t v1;
+#else
+	int8_t v1;
+	uint8_t v2;
+	uint8_t v3;
+#endif
+} __attribute__ ((packed)) int24_t;
+
+static inline uint32_t u24_to_u32(uint24_t src)
+{
+	return ((uint32_t)src.v1 << 16) | ((uint32_t)src.v2 << 8) | (uint32_t)src.v3;
 }

-static inline int32_t read_s24(const void *src)
+static inline uint24_t u32_to_u24(uint32_t src)
 {
-	const int8_t *s = src;
-#if __BYTE_ORDER == __LITTLE_ENDIAN
-	return (((int32_t)s[2] << 16) | ((uint32_t)(uint8_t)s[1] << 8) | (uint32_t)(uint8_t)s[0]);
-#else
-	return (((int32_t)s[0] << 16) | ((uint32_t)(uint8_t)s[1] << 8) | (uint32_t)(uint8_t)s[2]);
-#endif
+	return (uint24_t) { .v1 = (uint8_t)(src >> 16), .v2 = (uint8_t)(src >> 8), .v3 = (uint8_t)src }; /* by name: member order differs per endianness */
 }

-static inline int32_t read_s24s(const void *src)
+static inline int32_t s24_to_s32(int24_t src)
 {
-	const int8_t *s = src;
-#if __BYTE_ORDER == __LITTLE_ENDIAN
-	return (((int32_t)s[0] << 16) | ((uint32_t)(uint8_t)s[1] << 8) | (uint32_t)(uint8_t)s[2]);
-#else
-	return (((int32_t)s[2] << 16) | ((uint32_t)(uint8_t)s[1] << 8) | (uint32_t)(uint8_t)s[0]);
-#endif
+	return ((int32_t)src.v1 << 16) | ((uint32_t)src.v2 << 8) | (uint32_t)src.v3;
 }

-static inline void write_u24(void *dst, uint32_t val)
+static inline int24_t s32_to_s24(int32_t src)
 {
-	uint8_t *d = dst;
-#if __BYTE_ORDER == __LITTLE_ENDIAN
-	d[0] = (uint8_t) (val);
-	d[1] = (uint8_t) (val >> 8);
-	d[2] = (uint8_t) (val >> 16);
-#else
-	d[0] = (uint8_t) (val >> 16);
-	d[1] = (uint8_t) (val >> 8);
-	d[2] = (uint8_t) (val);
-#endif
+	return (int24_t) { .v1 = (int8_t)(src >> 16), .v2 = (uint8_t)(src >> 8), .v3 = (uint8_t)src }; /* by name: member order differs per endianness */
 }

-static inline void write_s24(void *dst, int32_t val)
+static inline uint24_t bswap_u24(uint24_t src)
 {
-	uint8_t *d = dst;
-#if __BYTE_ORDER == __LITTLE_ENDIAN
-	d[0] = (uint8_t) (val);
-	d[1] = (uint8_t) (val >> 8);
-	d[2] = (uint8_t) (val >> 16);
-#else
-	d[0] = (uint8_t) (val >> 16);
-	d[1] = (uint8_t) (val >> 8);
-	d[2] = (uint8_t) (val);
-#endif
+	return (uint24_t) { .v1 = src.v3, .v2 = src.v2, .v3 = src.v1 }; /* swap v1<->v3 by name; positional init is a no-op on LE */
 }
-
-static inline void write_s24s(void *dst, int32_t val)
+static inline int24_t bswap_s24(int24_t src)
 {
-	uint8_t *d = dst;
-#if __BYTE_ORDER == __LITTLE_ENDIAN
-	d[0] = (uint8_t) (val >> 16);
-	d[1] = (uint8_t) (val >> 8);
-	d[2] = (uint8_t) (val);
-#else
-	d[0] = (uint8_t) (val);
-	d[1] = (uint8_t) (val >> 8);
-	d[2] = (uint8_t) (val >> 16);
-#endif
+	return (int24_t) { .v1 = (int8_t)src.v3, .v2 = src.v2, .v3 = (uint8_t)src.v1 }; /* swap v1<->v3 by name; positional init is a no-op on LE */
 }

 #define NS_MAX	8