pipewire/spa/plugins/audioconvert/fmt-ops-ssse3.c

/* Spa */
/* SPDX-FileCopyrightText: Copyright © 2018 Wim Taymans */
/* SPDX-License-Identifier: MIT */

#include "fmt-ops.h"

#include <tmmintrin.h>

/*
 * Convert four neighbouring channels of interleaved 24-bit samples into four
 * separate float buffers, scaling every value by 1.0f / S24_SCALE.
 *
 * data:       not used by this function.
 * dst:        dst[0]..dst[3] receive the converted samples, one buffer per
 *             channel.
 * src:        points at the first of the four channels in the first frame.
 * n_channels: distance, in samples, between one frame and the next.
 * n_samples:  number of frames to convert.
 *
 * Four frames are converted per iteration when all four destination buffers
 * are 16-byte aligned; otherwise every frame goes through the scalar loop.
 */
static void
conv_s24_to_f32d_4s_ssse3(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
		uint32_t n_channels, uint32_t n_samples)
{
	const int24_t *s = src;
	float *d0 = dst[0], *d1 = dst[1], *d2 = dst[2], *d3 = dst[3];
	uint32_t n, unrolled;
	__m128i in[4];
	__m128 out[4], factor = _mm_set1_ps(1.0f / S24_SCALE);
	/* Places source bytes 3k..3k+2 in the upper three bytes of 32-bit lane k
	 * and zeroes the lowest byte (index -1 yields 0). */
	const __m128i mask = _mm_setr_epi8(-1, 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11);

	/*
	 * Every vector load below fetches 16 bytes although the mask only uses
	 * the first 12. For the very last frame those 4 extra bytes may lie
	 * beyond the end of the source, so at least one frame is always left to
	 * the scalar loop, which reads exactly the bytes it converts.
	 */
	if (n_samples > 0 &&
	    SPA_IS_ALIGNED(d0, 16) &&
	    SPA_IS_ALIGNED(d1, 16) &&
	    SPA_IS_ALIGNED(d2, 16) &&
	    SPA_IS_ALIGNED(d3, 16))
		unrolled = (n_samples - 1) & ~3u;
	else
		unrolled = 0;

	for(n = 0; n < unrolled; n += 4) {
		/* one load per frame: in[k] holds the four channels of frame n+k */
		in[0] = _mm_loadu_si128((const __m128i*)(s + 0*n_channels));
		in[1] = _mm_loadu_si128((const __m128i*)(s + 1*n_channels));
		in[2] = _mm_loadu_si128((const __m128i*)(s + 2*n_channels));
		in[3] = _mm_loadu_si128((const __m128i*)(s + 3*n_channels));
		in[0] = _mm_shuffle_epi8(in[0], mask);
		in[1] = _mm_shuffle_epi8(in[1], mask);
		in[2] = _mm_shuffle_epi8(in[2], mask);
		in[3] = _mm_shuffle_epi8(in[3], mask);
		/* arithmetic shift drops the zeroed low byte and sign-extends */
		in[0] = _mm_srai_epi32(in[0], 8);
		in[1] = _mm_srai_epi32(in[1], 8);
		in[2] = _mm_srai_epi32(in[2], 8);
		in[3] = _mm_srai_epi32(in[3], 8);
		out[0] = _mm_cvtepi32_ps(in[0]);
		out[1] = _mm_cvtepi32_ps(in[1]);
		out[2] = _mm_cvtepi32_ps(in[2]);
		out[3] = _mm_cvtepi32_ps(in[3]);
		out[0] = _mm_mul_ps(out[0], factor);
		out[1] = _mm_mul_ps(out[1], factor);
		out[2] = _mm_mul_ps(out[2], factor);
		out[3] = _mm_mul_ps(out[3], factor);

		/* from one register per frame to one register per channel */
		_MM_TRANSPOSE4_PS(out[0], out[1], out[2], out[3]);

		_mm_store_ps(&d0[n], out[0]);
		_mm_store_ps(&d1[n], out[1]);
		_mm_store_ps(&d2[n], out[2]);
		_mm_store_ps(&d3[n], out[3]);
		s += 4 * n_channels;
	}
	/* scalar path: remaining frames, or all of them when dst is unaligned */
	for(; n < n_samples; n++) {
		out[0] = _mm_cvtsi32_ss(factor, s24_to_s32(*s));
		out[1] = _mm_cvtsi32_ss(factor, s24_to_s32(*(s+1)));
		out[2] = _mm_cvtsi32_ss(factor, s24_to_s32(*(s+2)));
		out[3] = _mm_cvtsi32_ss(factor, s24_to_s32(*(s+3)));
		out[0] = _mm_mul_ss(out[0], factor);
		out[1] = _mm_mul_ss(out[1], factor);
		out[2] = _mm_mul_ss(out[2], factor);
		out[3] = _mm_mul_ss(out[3], factor);
		_mm_store_ss(&d0[n], out[0]);
		_mm_store_ss(&d1[n], out[1]);
		_mm_store_ss(&d2[n], out[2]);
		_mm_store_ss(&d3[n], out[3]);
		s += n_channels;
	}
}

/*
 * Single-channel s24 -> f32 converter. Only declared here; the definition is
 * not in this file (presumably it lives in the SSE2 translation unit -- TODO
 * confirm). conv_s24_to_f32d_ssse3() calls it for the channels that do not
 * fill a group of four.
 */
void
conv_s24_to_f32d_1s_sse2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
		uint32_t n_channels, uint32_t n_samples);

/*
 * Convert the interleaved 24-bit samples in src[0] to one float buffer per
 * channel in dst[].
 *
 * Channels are handed to the 4-channel SSSE3 helper in groups of four; the
 * channels left over are converted one at a time by the single-channel SSE2
 * helper. Each channel starts 3 bytes after the previous one in the source.
 */
void
conv_s24_to_f32d_ssse3(struct convert *conv, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[],
		uint32_t n_samples)
{
	const uint32_t n_channels = conv->n_channels;
	const int8_t *in = src[0];
	uint32_t ch = 0;

	/* full groups of four channels */
	while (ch + 3 < n_channels) {
		conv_s24_to_f32d_4s_ssse3(conv, &dst[ch], in + 3 * ch, n_channels, n_samples);
		ch += 4;
	}

	/* up to three leftover channels */
	while (ch < n_channels) {
		conv_s24_to_f32d_1s_sse2(conv, &dst[ch], in + 3 * ch, n_channels, n_samples);
		ch++;
	}
}
treewide: use SPDX tags to specify copyright information SPDX tags make the licensing information easy to understand and clear, and they are machine parseable. See https://spdx.dev for more information. 2023-02-08 18:12:00 +01:00			`/* Spa */`
			`/* SPDX-FileCopyrightText: Copyright © 2018 Wim Taymans */`
			`/* SPDX-License-Identifier: MIT */`
audioconvert: some more optimizations 2019-03-20 13:04:44 +01:00
audioconvert: handle more optimizations Compile an optimized library for the given CPU with the right flags, then link it with the main library. 2019-03-27 17:58:48 +01:00			`#include "fmt-ops.h"`
audioconvert: some more optimizations 2019-03-20 13:04:44 +01:00
fmt-ops: use faster f32 -> s32 conversion 2019-03-26 17:24:14 +01:00			`#include <tmmintrin.h>`
audioconvert: some more optimizations 2019-03-20 13:04:44 +01:00
			`static void`
audioconvert: pass state to functions Pass some state to convert and channelmix functions. This makes it possible to select per channel optimized convert functions but also makes it possible to implement noise shaping later. Pass the channelmix matrix and volume in the state. Handle specialized 2 channel s16 -> f32 conversion 2019-03-29 17:39:59 +01:00			`conv_s24_to_f32d_4s_ssse3(void data, void SPA_RESTRICT dst[], const void * SPA_RESTRICT src,`
			`uint32_t n_channels, uint32_t n_samples)`
audioconvert: some more optimizations 2019-03-20 13:04:44 +01:00			`{`
audioconvert: simplify 24 bits handling Make a new uint42_t and int24_t type and use that to handle 24 bits samples. This makes it easier because we can iterate and copy the structs like other types. 2022-07-01 12:24:35 +02:00			`const int24_t *s = src;`
fmt-ops: add avx2 optimized version Only one optimized version but the sse2 version are compiled with the avx2 flags so that they get optimized better. 2020-03-16 16:11:29 +01:00			`float d0 = dst[0], d1 = dst[1], d2 = dst[2], d3 = dst[3];`
audioconvert: some more optimizations 2019-03-20 13:04:44 +01:00			`uint32_t n, unrolled;`
			`__m128i in[4];`
			`__m128 out[4], factor = _mm_set1_ps(1.0f / S24_SCALE);`
audioconvert: handle more optimizations Compile an optimized library for the given CPU with the right flags, then link it with the main library. 2019-03-27 17:58:48 +01:00			`const __m128i mask = _mm_setr_epi8(-1, 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11);`
			`//const __m128i mask = _mm_set_epi8(15, 14, 13, -1, 12, 11, 10, -1, 9, 8, 7, -1, 6, 5, 4, -1);`
audioconvert: some more optimizations 2019-03-20 13:04:44 +01:00
audioconvert: improve benchmark Also include the simd versions in the benchmark Fix some issues found by new test 2019-03-28 13:26:06 +01:00			`if (SPA_IS_ALIGNED(d0, 16) &&`
			`SPA_IS_ALIGNED(d1, 16) &&`
			`SPA_IS_ALIGNED(d2, 16) &&`
			`SPA_IS_ALIGNED(d3, 16))`
audioconvert: add avx optimizations 2019-03-28 16:45:57 +01:00			`unrolled = n_samples & ~3;`
audioconvert: some more optimizations 2019-03-20 13:04:44 +01:00			`else`
			`unrolled = 0;`

audioconvert: add avx optimizations 2019-03-28 16:45:57 +01:00			`for(n = 0; n < unrolled; n += 4) {`
audioconvert: some more optimizations 2019-03-20 13:04:44 +01:00			`in[0] = _mm_loadu_si128((__m128i)(s + 0n_channels));`
audioconvert: simplify 24 bits handling Make a new uint42_t and int24_t type and use that to handle 24 bits samples. This makes it easier because we can iterate and copy the structs like other types. 2022-07-01 12:24:35 +02:00			`in[1] = _mm_loadu_si128((__m128i)(s + 1n_channels));`
			`in[2] = _mm_loadu_si128((__m128i)(s + 2n_channels));`
			`in[3] = _mm_loadu_si128((__m128i)(s + 3n_channels));`
audioconvert: some more optimizations 2019-03-20 13:04:44 +01:00			`in[0] = _mm_shuffle_epi8(in[0], mask);`
			`in[1] = _mm_shuffle_epi8(in[1], mask);`
			`in[2] = _mm_shuffle_epi8(in[2], mask);`
			`in[3] = _mm_shuffle_epi8(in[3], mask);`
			`in[0] = _mm_srai_epi32(in[0], 8);`
			`in[1] = _mm_srai_epi32(in[1], 8);`
			`in[2] = _mm_srai_epi32(in[2], 8);`
			`in[3] = _mm_srai_epi32(in[3], 8);`
			`out[0] = _mm_cvtepi32_ps(in[0]);`
			`out[1] = _mm_cvtepi32_ps(in[1]);`
			`out[2] = _mm_cvtepi32_ps(in[2]);`
			`out[3] = _mm_cvtepi32_ps(in[3]);`
			`out[0] = _mm_mul_ps(out[0], factor);`
			`out[1] = _mm_mul_ps(out[1], factor);`
			`out[2] = _mm_mul_ps(out[2], factor);`
			`out[3] = _mm_mul_ps(out[3], factor);`

			`_MM_TRANSPOSE4_PS(out[0], out[1], out[2], out[3]);`

			`_mm_store_ps(&d0[n], out[0]);`
			`_mm_store_ps(&d1[n], out[1]);`
			`_mm_store_ps(&d2[n], out[2]);`
			`_mm_store_ps(&d3[n], out[3]);`
audioconvert: simplify 24 bits handling Make a new uint42_t and int24_t type and use that to handle 24 bits samples. This makes it easier because we can iterate and copy the structs like other types. 2022-07-01 12:24:35 +02:00			`s += 4 * n_channels;`
audioconvert: some more optimizations 2019-03-20 13:04:44 +01:00			`}`
			`for(; n < n_samples; n++) {`
audioconvert: simplify 24 bits handling Make a new uint42_t and int24_t type and use that to handle 24 bits samples. This makes it easier because we can iterate and copy the structs like other types. 2022-07-01 12:24:35 +02:00			`out[0] = _mm_cvtsi32_ss(factor, s24_to_s32(*s));`
			`out[1] = _mm_cvtsi32_ss(factor, s24_to_s32(*(s+1)));`
			`out[2] = _mm_cvtsi32_ss(factor, s24_to_s32(*(s+2)));`
			`out[3] = _mm_cvtsi32_ss(factor, s24_to_s32(*(s+3)));`
audioconvert: some more optimizations 2019-03-20 13:04:44 +01:00			`out[0] = _mm_mul_ss(out[0], factor);`
			`out[1] = _mm_mul_ss(out[1], factor);`
			`out[2] = _mm_mul_ss(out[2], factor);`
			`out[3] = _mm_mul_ss(out[3], factor);`
			`_mm_store_ss(&d0[n], out[0]);`
			`_mm_store_ss(&d1[n], out[1]);`
			`_mm_store_ss(&d2[n], out[2]);`
			`_mm_store_ss(&d3[n], out[3]);`
audioconvert: simplify 24 bits handling Make a new uint42_t and int24_t type and use that to handle 24 bits samples. This makes it easier because we can iterate and copy the structs like other types. 2022-07-01 12:24:35 +02:00			`s += n_channels;`
audioconvert: some more optimizations 2019-03-20 13:04:44 +01:00			`}`
			`}`

audioconvert: improve benchmark Also include the simd versions in the benchmark Fix some issues found by new test 2019-03-28 13:26:06 +01:00			`void`
audioconvert: pass state to functions Pass some state to convert and channelmix functions. This makes it possible to select per channel optimized convert functions but also makes it possible to implement noise shaping later. Pass the channelmix matrix and volume in the state. Handle specialized 2 channel s16 -> f32 conversion 2019-03-29 17:39:59 +01:00			`conv_s24_to_f32d_1s_sse2(void data, void SPA_RESTRICT dst[], const void * SPA_RESTRICT src,`
			`uint32_t n_channels, uint32_t n_samples);`
audioconvert: handle more optimizations Compile an optimized library for the given CPU with the right flags, then link it with the main library. 2019-03-27 17:58:48 +01:00
			`void`
audioconvert: pass state to functions Pass some state to convert and channelmix functions. This makes it possible to select per channel optimized convert functions but also makes it possible to implement noise shaping later. Pass the channelmix matrix and volume in the state. Handle specialized 2 channel s16 -> f32 conversion 2019-03-29 17:39:59 +01:00			`conv_s24_to_f32d_ssse3(struct convert conv, void SPA_RESTRICT dst[], const void * SPA_RESTRICT src[],`
			`uint32_t n_samples)`
audioconvert: some more optimizations 2019-03-20 13:04:44 +01:00			`{`
			`const int8_t *s = src[0];`
audioconvert: pass state to functions Pass some state to convert and channelmix functions. This makes it possible to select per channel optimized convert functions but also makes it possible to implement noise shaping later. Pass the channelmix matrix and volume in the state. Handle specialized 2 channel s16 -> f32 conversion 2019-03-29 17:39:59 +01:00			`uint32_t i = 0, n_channels = conv->n_channels;`
audioconvert: some more optimizations 2019-03-20 13:04:44 +01:00
			`for(; i + 3 < n_channels; i += 4)`
audioconvert: pass state to functions Pass some state to convert and channelmix functions. This makes it possible to select per channel optimized convert functions but also makes it possible to implement noise shaping later. Pass the channelmix matrix and volume in the state. Handle specialized 2 channel s16 -> f32 conversion 2019-03-29 17:39:59 +01:00			`conv_s24_to_f32d_4s_ssse3(conv, &dst[i], &s[3*i], n_channels, n_samples);`
audioconvert: some more optimizations 2019-03-20 13:04:44 +01:00			`for(; i < n_channels; i++)`
audioconvert: pass state to functions Pass some state to convert and channelmix functions. This makes it possible to select per channel optimized convert functions but also makes it possible to implement noise shaping later. Pass the channelmix matrix and volume in the state. Handle specialized 2 channel s16 -> f32 conversion 2019-03-29 17:39:59 +01:00			`conv_s24_to_f32d_1s_sse2(conv, &dst[i], &s[3*i], n_channels, n_samples);`
audioconvert: some more optimizations 2019-03-20 13:04:44 +01:00			`}`