mirror of
				https://gitlab.freedesktop.org/pipewire/pipewire.git
				synced 2025-11-03 09:01:54 -05:00 
			
		
		
		
	filter-chain: add optimized sum function
This commit is contained in:
		
							parent
							
								
									cd38d7b53b
								
							
						
					
					
						commit
						0f2f113bdc
					
				
					 6 changed files with 177 additions and 0 deletions
				
			
		| 
						 | 
				
			
			@ -71,6 +71,16 @@ if have_sse
 | 
			
		|||
  simd_cargs += ['-DHAVE_SSE']
 | 
			
		||||
  simd_dependencies += filter_chain_sse
 | 
			
		||||
endif
 | 
			
		||||
if have_avx
 | 
			
		||||
  filter_chain_avx = static_library('filter_chain_avx',
 | 
			
		||||
    ['module-filter-chain/dsp-ops-avx.c' ],
 | 
			
		||||
    c_args : [avx_args, fma_args,'-O3', '-DHAVE_AVX'],
 | 
			
		||||
    dependencies : [ spa_dep ],
 | 
			
		||||
    install : false
 | 
			
		||||
    )
 | 
			
		||||
  simd_cargs += ['-DHAVE_AVX']
 | 
			
		||||
  simd_dependencies += filter_chain_avx
 | 
			
		||||
endif
 | 
			
		||||
if have_neon
 | 
			
		||||
  filter_chain_neon = static_library('filter_chain_neon',
 | 
			
		||||
    ['module-filter-chain/pffft.c' ],
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
							
								
								
									
										85
									
								
								src/modules/module-filter-chain/dsp-ops-avx.c
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										85
									
								
								src/modules/module-filter-chain/dsp-ops-avx.c
									
										
									
									
									
										Normal file
									
								
							| 
						 | 
				
			
			@ -0,0 +1,85 @@
 | 
			
		|||
/* Spa
 | 
			
		||||
 *
 | 
			
		||||
 * Copyright © 2022 Wim Taymans
 | 
			
		||||
 *
 | 
			
		||||
 * Permission is hereby granted, free of charge, to any person obtaining a
 | 
			
		||||
 * copy of this software and associated documentation files (the "Software"),
 | 
			
		||||
 * to deal in the Software without restriction, including without limitation
 | 
			
		||||
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 | 
			
		||||
 * and/or sell copies of the Software, and to permit persons to whom the
 | 
			
		||||
 * Software is furnished to do so, subject to the following conditions:
 | 
			
		||||
 *
 | 
			
		||||
 * The above copyright notice and this permission notice (including the next
 | 
			
		||||
 * paragraph) shall be included in all copies or substantial portions of the
 | 
			
		||||
 * Software.
 | 
			
		||||
 *
 | 
			
		||||
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 | 
			
		||||
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 | 
			
		||||
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 | 
			
		||||
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 | 
			
		||||
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 | 
			
		||||
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 | 
			
		||||
 * DEALINGS IN THE SOFTWARE.
 | 
			
		||||
 */
 | 
			
		||||
 | 
			
		||||
#include <string.h>
 | 
			
		||||
#include <stdio.h>
 | 
			
		||||
#include <math.h>
 | 
			
		||||
 | 
			
		||||
#include <spa/utils/defs.h>
 | 
			
		||||
 | 
			
		||||
#include "dsp-ops.h"
 | 
			
		||||
 | 
			
		||||
#include <immintrin.h>
 | 
			
		||||
 | 
			
		||||
void dsp_sum_avx(struct dsp_ops *ops, float *r, const float *a, const float *b, uint32_t n_samples)
 | 
			
		||||
{
 | 
			
		||||
	uint32_t n, unrolled;
 | 
			
		||||
	__m256 in[4];
 | 
			
		||||
 | 
			
		||||
	unrolled = n_samples & ~31;
 | 
			
		||||
 | 
			
		||||
	if (SPA_LIKELY(SPA_IS_ALIGNED(r, 32)) &&
 | 
			
		||||
	    SPA_LIKELY(SPA_IS_ALIGNED(a, 32)) &&
 | 
			
		||||
	    SPA_LIKELY(SPA_IS_ALIGNED(b, 32))) {
 | 
			
		||||
		for (n = 0; n < unrolled; n += 32) {
 | 
			
		||||
			in[0] = _mm256_load_ps(&a[n+ 0]);
 | 
			
		||||
			in[1] = _mm256_load_ps(&a[n+ 8]);
 | 
			
		||||
			in[2] = _mm256_load_ps(&a[n+16]);
 | 
			
		||||
			in[3] = _mm256_load_ps(&a[n+24]);
 | 
			
		||||
 | 
			
		||||
			in[0] = _mm256_add_ps(in[0], _mm256_load_ps(&b[n+ 0]));
 | 
			
		||||
			in[1] = _mm256_add_ps(in[1], _mm256_load_ps(&b[n+ 8]));
 | 
			
		||||
			in[2] = _mm256_add_ps(in[2], _mm256_load_ps(&b[n+16]));
 | 
			
		||||
			in[3] = _mm256_add_ps(in[3], _mm256_load_ps(&b[n+24]));
 | 
			
		||||
 | 
			
		||||
			_mm256_store_ps(&r[n+ 0], in[0]);
 | 
			
		||||
			_mm256_store_ps(&r[n+ 8], in[1]);
 | 
			
		||||
			_mm256_store_ps(&r[n+16], in[2]);
 | 
			
		||||
			_mm256_store_ps(&r[n+24], in[3]);
 | 
			
		||||
		}
 | 
			
		||||
	} else {
 | 
			
		||||
		for (n = 0; n < unrolled; n += 16) {
 | 
			
		||||
			in[0] = _mm256_loadu_ps(&a[n+ 0]);
 | 
			
		||||
			in[1] = _mm256_loadu_ps(&a[n+ 8]);
 | 
			
		||||
			in[2] = _mm256_loadu_ps(&a[n+16]);
 | 
			
		||||
			in[3] = _mm256_loadu_ps(&a[n+24]);
 | 
			
		||||
 | 
			
		||||
			in[0] = _mm256_add_ps(in[0], _mm256_loadu_ps(&b[n+ 0]));
 | 
			
		||||
			in[1] = _mm256_add_ps(in[1], _mm256_loadu_ps(&b[n+ 8]));
 | 
			
		||||
			in[2] = _mm256_add_ps(in[2], _mm256_loadu_ps(&b[n+16]));
 | 
			
		||||
			in[3] = _mm256_add_ps(in[3], _mm256_loadu_ps(&b[n+24]));
 | 
			
		||||
 | 
			
		||||
			_mm256_storeu_ps(&r[n+ 0], in[0]);
 | 
			
		||||
			_mm256_storeu_ps(&r[n+ 8], in[1]);
 | 
			
		||||
			_mm256_storeu_ps(&r[n+16], in[2]);
 | 
			
		||||
			_mm256_storeu_ps(&r[n+24], in[3]);
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
	for (; n < n_samples; n++) {
 | 
			
		||||
		__m128 in[1];
 | 
			
		||||
		in[0] = _mm_load_ss(&a[n]);
 | 
			
		||||
		in[0] = _mm_add_ss(in[0], _mm_load_ss(&b[n]));
 | 
			
		||||
		_mm_store_ss(&r[n], in[0]);
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
| 
						 | 
				
			
			@ -133,3 +133,11 @@ void dsp_biquad_run_c(struct dsp_ops *ops, struct biquad *bq,
 | 
			
		|||
#undef F
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void dsp_sum_c(struct dsp_ops *ops, float * dst,
 | 
			
		||||
		const float * SPA_RESTRICT a, const float * SPA_RESTRICT b, uint32_t n_samples)
 | 
			
		||||
{
 | 
			
		||||
	uint32_t i;
 | 
			
		||||
	for (i = 0; i < n_samples; i++)
 | 
			
		||||
		dst[i] = a[i] + b[i];
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -89,3 +89,54 @@ void dsp_mix_gain_sse(struct dsp_ops *ops,
 | 
			
		|||
		}
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void dsp_sum_sse(struct dsp_ops *ops, float *r, const float *a, const float *b, uint32_t n_samples)
 | 
			
		||||
{
 | 
			
		||||
	uint32_t n, unrolled;
 | 
			
		||||
	__m128 in[4];
 | 
			
		||||
 | 
			
		||||
	unrolled = n_samples & ~15;
 | 
			
		||||
 | 
			
		||||
	if (SPA_LIKELY(SPA_IS_ALIGNED(r, 16)) &&
 | 
			
		||||
	    SPA_LIKELY(SPA_IS_ALIGNED(a, 16)) &&
 | 
			
		||||
	    SPA_LIKELY(SPA_IS_ALIGNED(b, 16))) {
 | 
			
		||||
		for (n = 0; n < unrolled; n += 16) {
 | 
			
		||||
			in[0] = _mm_load_ps(&a[n+ 0]);
 | 
			
		||||
			in[1] = _mm_load_ps(&a[n+ 4]);
 | 
			
		||||
			in[2] = _mm_load_ps(&a[n+ 8]);
 | 
			
		||||
			in[3] = _mm_load_ps(&a[n+12]);
 | 
			
		||||
 | 
			
		||||
			in[0] = _mm_add_ps(in[0], _mm_load_ps(&b[n+ 0]));
 | 
			
		||||
			in[1] = _mm_add_ps(in[1], _mm_load_ps(&b[n+ 4]));
 | 
			
		||||
			in[2] = _mm_add_ps(in[2], _mm_load_ps(&b[n+ 8]));
 | 
			
		||||
			in[3] = _mm_add_ps(in[3], _mm_load_ps(&b[n+12]));
 | 
			
		||||
 | 
			
		||||
			_mm_store_ps(&r[n+ 0], in[0]);
 | 
			
		||||
			_mm_store_ps(&r[n+ 4], in[1]);
 | 
			
		||||
			_mm_store_ps(&r[n+ 8], in[2]);
 | 
			
		||||
			_mm_store_ps(&r[n+12], in[3]);
 | 
			
		||||
		}
 | 
			
		||||
	} else {
 | 
			
		||||
		for (n = 0; n < unrolled; n += 16) {
 | 
			
		||||
			in[0] = _mm_loadu_ps(&a[n+ 0]);
 | 
			
		||||
			in[1] = _mm_loadu_ps(&a[n+ 4]);
 | 
			
		||||
			in[2] = _mm_loadu_ps(&a[n+ 8]);
 | 
			
		||||
			in[3] = _mm_loadu_ps(&a[n+12]);
 | 
			
		||||
 | 
			
		||||
			in[0] = _mm_add_ps(in[0], _mm_loadu_ps(&b[n+ 0]));
 | 
			
		||||
			in[1] = _mm_add_ps(in[1], _mm_loadu_ps(&b[n+ 4]));
 | 
			
		||||
			in[2] = _mm_add_ps(in[2], _mm_loadu_ps(&b[n+ 8]));
 | 
			
		||||
			in[3] = _mm_add_ps(in[3], _mm_loadu_ps(&b[n+12]));
 | 
			
		||||
 | 
			
		||||
			_mm_storeu_ps(&r[n+ 0], in[0]);
 | 
			
		||||
			_mm_storeu_ps(&r[n+ 4], in[1]);
 | 
			
		||||
			_mm_storeu_ps(&r[n+ 8], in[2]);
 | 
			
		||||
			_mm_storeu_ps(&r[n+12], in[3]);
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
	for (; n < n_samples; n++) {
 | 
			
		||||
		in[0] = _mm_load_ss(&a[n]);
 | 
			
		||||
		in[0] = _mm_add_ss(in[0], _mm_load_ss(&b[n]));
 | 
			
		||||
		_mm_store_ss(&r[n], in[0]);
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -40,12 +40,22 @@ struct dsp_info {
 | 
			
		|||
 | 
			
		||||
static struct dsp_info dsp_table[] =
 | 
			
		||||
{
 | 
			
		||||
#if defined (HAVE_AVX)
 | 
			
		||||
	{ SPA_CPU_FLAG_AVX,
 | 
			
		||||
		.funcs.clear = dsp_clear_c,
 | 
			
		||||
		.funcs.copy = dsp_copy_c,
 | 
			
		||||
		.funcs.mix_gain = dsp_mix_gain_sse,
 | 
			
		||||
		.funcs.biquad_run = dsp_biquad_run_c,
 | 
			
		||||
		.funcs.sum = dsp_sum_avx,
 | 
			
		||||
	},
 | 
			
		||||
#endif
 | 
			
		||||
#if defined (HAVE_SSE)
 | 
			
		||||
	{ SPA_CPU_FLAG_SSE,
 | 
			
		||||
		.funcs.clear = dsp_clear_c,
 | 
			
		||||
		.funcs.copy = dsp_copy_c,
 | 
			
		||||
		.funcs.mix_gain = dsp_mix_gain_sse,
 | 
			
		||||
		.funcs.biquad_run = dsp_biquad_run_c,
 | 
			
		||||
		.funcs.sum = dsp_sum_sse,
 | 
			
		||||
	},
 | 
			
		||||
#endif
 | 
			
		||||
	{ 0,
 | 
			
		||||
| 
						 | 
				
			
			@ -53,6 +63,7 @@ static struct dsp_info dsp_table[] =
 | 
			
		|||
		.funcs.copy = dsp_copy_c,
 | 
			
		||||
		.funcs.mix_gain = dsp_mix_gain_c,
 | 
			
		||||
		.funcs.biquad_run = dsp_biquad_run_c,
 | 
			
		||||
		.funcs.sum = dsp_sum_c,
 | 
			
		||||
	},
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -42,6 +42,9 @@ struct dsp_ops_funcs {
 | 
			
		|||
			float gain[], uint32_t n_src, uint32_t n_samples);
 | 
			
		||||
	void (*biquad_run) (struct dsp_ops *ops, struct biquad *bq,
 | 
			
		||||
			float *out, const float *in, uint32_t n_samples);
 | 
			
		||||
	void (*sum) (struct dsp_ops *ops,
 | 
			
		||||
			float * dst, const float * SPA_RESTRICT a,
 | 
			
		||||
			const float * SPA_RESTRICT b, uint32_t n_samples);
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
struct dsp_ops {
 | 
			
		||||
| 
						 | 
				
			
			@ -62,6 +65,7 @@ int dsp_ops_init(struct dsp_ops *ops);
 | 
			
		|||
#define dsp_ops_copy(ops,...)		(ops)->funcs.copy(ops, __VA_ARGS__)
 | 
			
		||||
#define dsp_ops_mix_gain(ops,...)	(ops)->funcs.mix_gain(ops, __VA_ARGS__)
 | 
			
		||||
#define dsp_ops_biquad_run(ops,...)	(ops)->funcs.biquad_run(ops, __VA_ARGS__)
 | 
			
		||||
#define dsp_ops_sum(ops,...)		(ops)->funcs.sum(ops, __VA_ARGS__)
 | 
			
		||||
 | 
			
		||||
#define MAKE_CLEAR_FUNC(arch) \
 | 
			
		||||
void dsp_clear_##arch(struct dsp_ops *ops, void * SPA_RESTRICT dst, uint32_t n_samples)
 | 
			
		||||
| 
						 | 
				
			
			@ -74,14 +78,22 @@ void dsp_mix_gain_##arch(struct dsp_ops *ops, void * SPA_RESTRICT dst,	\
 | 
			
		|||
#define MAKE_BIQUAD_RUN_FUNC(arch) \
 | 
			
		||||
void dsp_biquad_run_##arch (struct dsp_ops *ops, struct biquad *bq,	\
 | 
			
		||||
	float *out, const float *in, uint32_t n_samples)
 | 
			
		||||
#define MAKE_SUM_FUNC(arch) \
 | 
			
		||||
void dsp_sum_##arch (struct dsp_ops *ops, float * SPA_RESTRICT dst, \
 | 
			
		||||
	const float * SPA_RESTRICT a, const float * SPA_RESTRICT b, uint32_t n_samples);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
MAKE_CLEAR_FUNC(c);
 | 
			
		||||
MAKE_COPY_FUNC(c);
 | 
			
		||||
MAKE_MIX_GAIN_FUNC(c);
 | 
			
		||||
MAKE_BIQUAD_RUN_FUNC(c);
 | 
			
		||||
MAKE_SUM_FUNC(c);
 | 
			
		||||
#if defined (HAVE_SSE)
 | 
			
		||||
MAKE_MIX_GAIN_FUNC(sse);
 | 
			
		||||
MAKE_SUM_FUNC(sse);
 | 
			
		||||
#endif
 | 
			
		||||
#if defined (HAVE_AVX)
 | 
			
		||||
MAKE_SUM_FUNC(avx);
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#endif /* DSP_OPS_H */
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue