mirror of
https://gitlab.freedesktop.org/pipewire/pipewire.git
synced 2026-04-11 08:21:07 -04:00
filter-graph: small convolver optimizations
Use FMA when we can, make sure FMA compilation is supported and the CPU also supports it at runtime. Avoid divisions by doing the modulo increment more explicitly.
This commit is contained in:
parent
c2f85ffc51
commit
5075f27ea0
4 changed files with 23 additions and 21 deletions
|
|
@ -140,10 +140,10 @@ static void dsp_add_n_gain_avx2(void *obj, float *dst,
|
||||||
|
|
||||||
for (i = 1; i < n_src; i++) {
|
for (i = 1; i < n_src; i++) {
|
||||||
g = _mm256_set1_ps(gain[i]);
|
g = _mm256_set1_ps(gain[i]);
|
||||||
in[0] = _mm256_add_ps(in[0], _mm256_mul_ps(g, _mm256_load_ps(&s[i][n+ 0])));
|
in[0] = _mm256_fmadd_ps(g, _mm256_load_ps(&s[i][n+ 0]), in[0]);
|
||||||
in[1] = _mm256_add_ps(in[1], _mm256_mul_ps(g, _mm256_load_ps(&s[i][n+ 8])));
|
in[1] = _mm256_fmadd_ps(g, _mm256_load_ps(&s[i][n+ 8]), in[1]);
|
||||||
in[2] = _mm256_add_ps(in[2], _mm256_mul_ps(g, _mm256_load_ps(&s[i][n+16])));
|
in[2] = _mm256_fmadd_ps(g, _mm256_load_ps(&s[i][n+16]), in[2]);
|
||||||
in[3] = _mm256_add_ps(in[3], _mm256_mul_ps(g, _mm256_load_ps(&s[i][n+24])));
|
in[3] = _mm256_fmadd_ps(g, _mm256_load_ps(&s[i][n+24]), in[3]);
|
||||||
}
|
}
|
||||||
_mm256_store_ps(&d[n+ 0], in[0]);
|
_mm256_store_ps(&d[n+ 0], in[0]);
|
||||||
_mm256_store_ps(&d[n+ 8], in[1]);
|
_mm256_store_ps(&d[n+ 8], in[1]);
|
||||||
|
|
@ -237,13 +237,12 @@ void dsp_sum_avx2(void *obj, float *r, const float *a, const float *b, uint32_t
|
||||||
|
|
||||||
inline static __m256 _mm256_mul_pz(__m256 ab, __m256 cd)
|
inline static __m256 _mm256_mul_pz(__m256 ab, __m256 cd)
|
||||||
{
|
{
|
||||||
__m256 aa, bb, dc, x0, x1;
|
__m256 aa, bb, dc, x1;
|
||||||
aa = _mm256_moveldup_ps(ab);
|
aa = _mm256_moveldup_ps(ab);
|
||||||
bb = _mm256_movehdup_ps(ab);
|
bb = _mm256_movehdup_ps(ab);
|
||||||
x0 = _mm256_mul_ps(aa, cd);
|
|
||||||
dc = _mm256_shuffle_ps(cd, cd, _MM_SHUFFLE(2,3,0,1));
|
dc = _mm256_shuffle_ps(cd, cd, _MM_SHUFFLE(2,3,0,1));
|
||||||
x1 = _mm256_mul_ps(bb, dc);
|
x1 = _mm256_mul_ps(bb, dc);
|
||||||
return _mm256_addsub_ps(x0, x1);
|
return _mm256_fmaddsub_ps(aa, cd, x1);
|
||||||
}
|
}
|
||||||
|
|
||||||
void dsp_fft_cmul_avx2(void *obj, void *fft,
|
void dsp_fft_cmul_avx2(void *obj, void *fft,
|
||||||
|
|
@ -308,12 +307,10 @@ void dsp_fft_cmuladd_avx2(void *obj, void *fft,
|
||||||
bb[1] = _mm256_load_ps(&b[2*i+8]); /* br2 bi2 br3 bi3 */
|
bb[1] = _mm256_load_ps(&b[2*i+8]); /* br2 bi2 br3 bi3 */
|
||||||
dd[0] = _mm256_mul_pz(aa[0], bb[0]);
|
dd[0] = _mm256_mul_pz(aa[0], bb[0]);
|
||||||
dd[1] = _mm256_mul_pz(aa[1], bb[1]);
|
dd[1] = _mm256_mul_pz(aa[1], bb[1]);
|
||||||
dd[0] = _mm256_mul_ps(dd[0], s);
|
|
||||||
dd[1] = _mm256_mul_ps(dd[1], s);
|
|
||||||
t[0] = _mm256_load_ps(&src[2*i]);
|
t[0] = _mm256_load_ps(&src[2*i]);
|
||||||
t[1] = _mm256_load_ps(&src[2*i+8]);
|
t[1] = _mm256_load_ps(&src[2*i+8]);
|
||||||
t[0] = _mm256_add_ps(t[0], dd[0]);
|
t[0] = _mm256_fmadd_ps(dd[0], s, t[0]);
|
||||||
t[1] = _mm256_add_ps(t[1], dd[1]);
|
t[1] = _mm256_fmadd_ps(dd[1], s, t[1]);
|
||||||
_mm256_store_ps(&dst[2*i], t[0]);
|
_mm256_store_ps(&dst[2*i], t[0]);
|
||||||
_mm256_store_ps(&dst[2*i+8], t[1]);
|
_mm256_store_ps(&dst[2*i+8], t[1]);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -24,7 +24,7 @@ struct dsp_info {
|
||||||
static const struct dsp_info dsp_table[] =
|
static const struct dsp_info dsp_table[] =
|
||||||
{
|
{
|
||||||
#if defined (HAVE_AVX2)
|
#if defined (HAVE_AVX2)
|
||||||
{ SPA_CPU_FLAG_AVX2,
|
{ SPA_CPU_FLAG_AVX2 | SPA_CPU_FLAG_FMA3,
|
||||||
.funcs.clear = dsp_clear_c,
|
.funcs.clear = dsp_clear_c,
|
||||||
.funcs.copy = dsp_copy_c,
|
.funcs.copy = dsp_copy_c,
|
||||||
.funcs.mix_gain = dsp_mix_gain_avx2,
|
.funcs.mix_gain = dsp_mix_gain_avx2,
|
||||||
|
|
|
||||||
|
|
@ -171,7 +171,10 @@ static int convolver1_run(struct spa_fga_dsp *dsp, struct convolver1 *conv, cons
|
||||||
|
|
||||||
if (conv->segCount > 1) {
|
if (conv->segCount > 1) {
|
||||||
if (inputBufferFill == 0) {
|
if (inputBufferFill == 0) {
|
||||||
int indexAudio = (conv->current + 1) % conv->segCount;
|
int indexAudio = conv->current;
|
||||||
|
|
||||||
|
if (++indexAudio == conv->segCount)
|
||||||
|
indexAudio = 0;
|
||||||
|
|
||||||
spa_fga_dsp_fft_cmul(dsp, conv->fft, conv->pre_mult,
|
spa_fga_dsp_fft_cmul(dsp, conv->fft, conv->pre_mult,
|
||||||
conv->segmentsIr[1],
|
conv->segmentsIr[1],
|
||||||
|
|
@ -179,7 +182,8 @@ static int convolver1_run(struct spa_fga_dsp *dsp, struct convolver1 *conv, cons
|
||||||
conv->fftComplexSize, conv->scale);
|
conv->fftComplexSize, conv->scale);
|
||||||
|
|
||||||
for (i = 2; i < conv->segCount; i++) {
|
for (i = 2; i < conv->segCount; i++) {
|
||||||
indexAudio = (conv->current + i) % conv->segCount;
|
if (++indexAudio == conv->segCount)
|
||||||
|
indexAudio = 0;
|
||||||
|
|
||||||
spa_fga_dsp_fft_cmuladd(dsp, conv->fft,
|
spa_fga_dsp_fft_cmuladd(dsp, conv->fft,
|
||||||
conv->pre_mult,
|
conv->pre_mult,
|
||||||
|
|
@ -214,9 +218,10 @@ static int convolver1_run(struct spa_fga_dsp *dsp, struct convolver1 *conv, cons
|
||||||
|
|
||||||
SPA_SWAP(conv->fft_buffer[0], conv->fft_buffer[1]);
|
SPA_SWAP(conv->fft_buffer[0], conv->fft_buffer[1]);
|
||||||
|
|
||||||
conv->current = (conv->current > 0) ? (conv->current - 1) : (conv->segCount - 1);
|
if (conv->current == 0)
|
||||||
|
conv->current = conv->segCount;
|
||||||
|
conv->current--;
|
||||||
}
|
}
|
||||||
|
|
||||||
processed += processing;
|
processed += processing;
|
||||||
}
|
}
|
||||||
conv->inputBufferFill = inputBufferFill;
|
conv->inputBufferFill = inputBufferFill;
|
||||||
|
|
|
||||||
|
|
@ -18,16 +18,16 @@ if have_sse
|
||||||
simd_cargs += ['-DHAVE_SSE']
|
simd_cargs += ['-DHAVE_SSE']
|
||||||
simd_dependencies += filter_graph_sse
|
simd_dependencies += filter_graph_sse
|
||||||
endif
|
endif
|
||||||
if have_avx2
|
if have_avx2 and have_fma
|
||||||
filter_graph_avx2 = static_library('filter_graph_avx2',
|
filter_graph_avx2_fma = static_library('filter_graph_avx2_fma',
|
||||||
['audio-dsp-avx2.c' ],
|
['audio-dsp-avx2.c' ],
|
||||||
include_directories : [configinc],
|
include_directories : [configinc],
|
||||||
c_args : [avx2_args, fma_args,'-O3', '-DHAVE_AVX2'],
|
c_args : [avx2_args, fma_args, '-O3', '-DHAVE_AVX2', '-DHAVE_FMA'],
|
||||||
dependencies : [ spa_dep ],
|
dependencies : [ spa_dep ],
|
||||||
install : false
|
install : false
|
||||||
)
|
)
|
||||||
simd_cargs += ['-DHAVE_AVX2']
|
simd_cargs += ['-DHAVE_AVX2', '-DHAVE_FMA']
|
||||||
simd_dependencies += filter_graph_avx2
|
simd_dependencies += filter_graph_avx2_fma
|
||||||
endif
|
endif
|
||||||
if have_neon
|
if have_neon
|
||||||
filter_graph_neon = static_library('filter_graph_neon',
|
filter_graph_neon = static_library('filter_graph_neon',
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue