pipewire/spa/plugins/audiomixer/mix-ops-avx.c

/* Spa */
/* SPDX-FileCopyrightText: Copyright © 2019 Wim Taymans */
/* SPDX-License-Identifier: MIT */

#include <string.h>
#include <stdio.h>
#include <math.h>

#include <spa/utils/defs.h>

#include "mix-ops.h"

#include <immintrin.h>

/**
 * Mix (sum) several float sample buffers into one destination buffer.
 *
 * The number of floats processed is n_samples * ops->n_channels.
 *
 * \param ops       mixer state; only ops->n_channels is read here
 * \param dst       destination buffer of floats
 * \param src       array of n_src source buffers of floats
 * \param n_src     number of entries in src
 * \param n_samples number of samples; scaled by ops->n_channels internally
 *
 * Behavior by source count:
 *  - n_src == 0: dst is filled with zeros.
 *  - n_src == 1: src[0] is copied to dst, unless both are the same pointer.
 *  - otherwise:  dst[n] becomes the sum of src[i][n] over all sources.
 */
void
mix_f32_avx(struct mix_ops *ops, void * SPA_RESTRICT dst, const void * SPA_RESTRICT src[],
		uint32_t n_src, uint32_t n_samples)
{
	/* From here on n_samples is the total float count across all channels. */
	n_samples *= ops->n_channels;

	if (n_src == 0)
		/* n_samples already includes the channel count; multiplying by
		 * n_channels a second time would write past the end of dst. */
		memset(dst, 0, n_samples * sizeof(float));
	else if (n_src == 1) {
		if (dst != src[0])
			spa_memcpy(dst, src[0], n_samples * sizeof(float));
	} else {
		uint32_t i, n, unrolled;
		const float **s = (const float **)src;
		float *d = dst;

		/* The vector loop uses aligned 256-bit loads/stores, so it is only
		 * taken when dst and every source are 32-byte aligned. Otherwise
		 * unrolled is 0 and everything goes through the scalar loop. */
		if (SPA_LIKELY(SPA_IS_ALIGNED(dst, 32))) {
			/* Largest multiple of 32 floats that fits in n_samples. */
			unrolled = n_samples & ~31;
			for (i = 0; i < n_src; i++) {
				if (SPA_UNLIKELY(!SPA_IS_ALIGNED(src[i], 32))) {
					unrolled = 0;
					break;
				}
			}
		} else
			unrolled = 0;

		/* Vector loop: 4 registers x 8 floats = 32 floats per iteration. */
		for (n = 0; n < unrolled; n += 32) {
			__m256 in[4];

			in[0] = _mm256_load_ps(&s[0][n +  0]);
			in[1] = _mm256_load_ps(&s[0][n +  8]);
			in[2] = _mm256_load_ps(&s[0][n + 16]);
			in[3] = _mm256_load_ps(&s[0][n + 24]);
			for (i = 1; i < n_src; i++) {
				in[0] = _mm256_add_ps(in[0], _mm256_load_ps(&s[i][n +  0]));
				in[1] = _mm256_add_ps(in[1], _mm256_load_ps(&s[i][n +  8]));
				in[2] = _mm256_add_ps(in[2], _mm256_load_ps(&s[i][n + 16]));
				in[3] = _mm256_add_ps(in[3], _mm256_load_ps(&s[i][n + 24]));
			}
			_mm256_store_ps(&d[n +  0], in[0]);
			_mm256_store_ps(&d[n +  8], in[1]);
			_mm256_store_ps(&d[n + 16], in[2]);
			_mm256_store_ps(&d[n + 24], in[3]);
		}
		/* Scalar tail: remaining floats, or all of them when unaligned. */
		for (; n < n_samples; n++) {
			__m128 in[1];
			in[0] = _mm_load_ss(&s[0][n]);
			for (i = 1; i < n_src; i++)
				in[0] = _mm_add_ss(in[0], _mm_load_ss(&s[i][n]));
			_mm_store_ss(&d[n], in[0]);
		}
	}
}
treewide: use SPDX tags to specify copyright information SPDX tags make the licensing information easy to understand and clear, and they are machine parseable. See https://spdx.dev for more information. 2023-02-08 18:12:00 +01:00			`/* Spa */`
			`/* SPDX-FileCopyrightText: Copyright © 2019 Wim Taymans */`
			`/* SPDX-License-Identifier: MIT */`
audiomixer: add AVX optimized mixer functions 2019-10-17 09:36:18 +02:00
			`#include <string.h>`
			`#include <stdio.h>`
			`#include <math.h>`

			`#include <spa/utils/defs.h>`

			`#include "mix-ops.h"`

			`#include <immintrin.h>`

			`void`
			`mix_f32_avx(struct mix_ops *ops, void * SPA_RESTRICT dst, const void * SPA_RESTRICT src[],`
			`uint32_t n_src, uint32_t n_samples)`
			`{`
audiomixer: optimize avx mixer some more Add avx mixer to test and benchmark Rework and unroll the avx mixer some more. The SSE one is 10 times faster than the C one, The AVX is 20 times faster. The SSE2 function is 5 times faster than the C one. 2022-07-10 23:13:24 +02:00			`n_samples *= ops->n_channels;`
audiomixer: add AVX optimized mixer functions 2019-10-17 09:36:18 +02:00
			`if (n_src == 0)`
audiomixer: support multiple channels when mixing with AVX, SSE and SSE2 2021-09-09 09:28:24 -04:00			`memset(dst, 0, n_samples * ops->n_channels * sizeof(float));`
audiomixer: optimize avx mixer some more Add avx mixer to test and benchmark Rework and unroll the avx mixer some more. The SSE one is 10 times faster than the C one, The AVX is 20 times faster. The SSE2 function is 5 times faster than the C one. 2022-07-10 23:13:24 +02:00			`else if (n_src == 1) {`
			`if (dst != src[0])`
			`spa_memcpy(dst, src[0], n_samples * sizeof(float));`
			`} else {`
			`uint32_t i, n, unrolled;`
			`const float **s = (const float **)src;`
			`float *d = dst;`

			`if (SPA_LIKELY(SPA_IS_ALIGNED(dst, 32))) {`
			`unrolled = n_samples & ~31;`
			`for (i = 0; i < n_src; i++) {`
			`if (SPA_UNLIKELY(!SPA_IS_ALIGNED(src[i], 32))) {`
			`unrolled = 0;`
			`break;`
			`}`
			`}`
			`} else`
			`unrolled = 0;`

			`for (n = 0; n < unrolled; n += 32) {`
			`__m256 in[4];`

			`in[0] = _mm256_load_ps(&s[0][n + 0]);`
			`in[1] = _mm256_load_ps(&s[0][n + 8]);`
			`in[2] = _mm256_load_ps(&s[0][n + 16]);`
			`in[3] = _mm256_load_ps(&s[0][n + 24]);`
			`for (i = 1; i < n_src; i++) {`
			`in[0] = _mm256_add_ps(in[0], _mm256_load_ps(&s[i][n + 0]));`
			`in[1] = _mm256_add_ps(in[1], _mm256_load_ps(&s[i][n + 8]));`
			`in[2] = _mm256_add_ps(in[2], _mm256_load_ps(&s[i][n + 16]));`
			`in[3] = _mm256_add_ps(in[3], _mm256_load_ps(&s[i][n + 24]));`
			`}`
			`_mm256_store_ps(&d[n + 0], in[0]);`
			`_mm256_store_ps(&d[n + 8], in[1]);`
			`_mm256_store_ps(&d[n + 16], in[2]);`
			`_mm256_store_ps(&d[n + 24], in[3]);`
			`}`
			`for (; n < n_samples; n++) {`
			`__m128 in[1];`
			`in[0] = _mm_load_ss(&s[0][n]);`
			`for (i = 1; i < n_src; i++)`
			`in[0] = _mm_add_ss(in[0], _mm_load_ss(&s[i][n]));`
			`_mm_store_ss(&d[n], in[0]);`
			`}`
			`}`
audiomixer: add AVX optimized mixer functions 2019-10-17 09:36:18 +02:00			`}`