mirror of
https://gitlab.freedesktop.org/pipewire/pipewire.git
synced 2026-03-16 05:34:04 -04:00
audioconvert: SSE optimize delay and convolver
This commit is contained in:
parent
33fb2f04c7
commit
2c0ce6afc2
5 changed files with 115 additions and 69 deletions
|
|
@ -102,6 +102,36 @@ static void lr4_process_c(struct lr4 *lr4, float *dst, const float *src, const f
|
||||||
#undef F
|
#undef F
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * Push n_samples from src through a mirrored ring buffer and produce a
 * delayed, optionally FIR-filtered and volume-scaled output in dst.
 *
 * buffer    ring storage; every sample is written at index w and again at
 *           w + n_buffer, so it must hold at least 2 * n_buffer floats
 * pos       in/out write position, kept in the range [0, n_buffer)
 * delay     extra delay in samples; the read window starts at
 *           w + n_buffer - delay - n_taps - 1
 * taps      n_taps filter coefficients; not read when n_taps == 1
 * dst, src  output and input, n_samples floats each; each input sample is
 *           read before the matching output sample is written
 * vol       gain applied to every output sample
 */
static inline void delay_convolve_run_c(float *buffer, uint32_t *pos,
		uint32_t n_buffer, uint32_t delay,
		const float *taps, uint32_t n_taps,
		float *dst, const float *src, const float vol, uint32_t n_samples)
{
	const uint32_t read_off = n_buffer - delay - n_taps - 1;
	uint32_t head = *pos;
	uint32_t n;

	for (n = 0; n < n_samples; n++) {
		const float in = src[n];
		const float *window;
		float acc;

		/* keep both halves in sync so reads never have to wrap */
		buffer[head + n_buffer] = in;
		buffer[head] = in;

		window = &buffer[head + read_off];
		if (n_taps == 1) {
			/* plain delay: the single tap is not applied */
			acc = window[0];
		} else {
			uint32_t k;

			acc = 0.0f;
			for (k = 0; k < n_taps; k++)
				acc += taps[k] * window[k];
		}
		dst[n] = acc * vol;

		head++;
		if (head >= n_buffer)
			head = 0;
	}
	*pos = head;
}
|
||||||
|
|
||||||
#define _M(ch) (1UL << SPA_AUDIO_CHANNEL_ ## ch)
|
#define _M(ch) (1UL << SPA_AUDIO_CHANNEL_ ## ch)
|
||||||
|
|
||||||
void
|
void
|
||||||
|
|
@ -239,9 +269,9 @@ channelmix_f32_2_4_c(struct channelmix *mix, void * SPA_RESTRICT dst[],
|
||||||
} else {
|
} else {
|
||||||
sub_c(d[2], s[0], s[1], n_samples);
|
sub_c(d[2], s[0], s[1], n_samples);
|
||||||
|
|
||||||
delay_convolve_run(mix->buffer[1], &mix->pos[1], BUFFER_SIZE, mix->delay,
|
delay_convolve_run_c(mix->buffer[1], &mix->pos[1], BUFFER_SIZE, mix->delay,
|
||||||
mix->taps, mix->n_taps, d[3], d[2], -v3, n_samples);
|
mix->taps, mix->n_taps, d[3], d[2], -v3, n_samples);
|
||||||
delay_convolve_run(mix->buffer[0], &mix->pos[0], BUFFER_SIZE, mix->delay,
|
delay_convolve_run_c(mix->buffer[0], &mix->pos[0], BUFFER_SIZE, mix->delay,
|
||||||
mix->taps, mix->n_taps, d[2], d[2], v2, n_samples);
|
mix->taps, mix->n_taps, d[2], d[2], v2, n_samples);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -307,9 +337,9 @@ channelmix_f32_2_5p1_c(struct channelmix *mix, void * SPA_RESTRICT dst[],
|
||||||
} else {
|
} else {
|
||||||
sub_c(d[4], s[0], s[1], n_samples);
|
sub_c(d[4], s[0], s[1], n_samples);
|
||||||
|
|
||||||
delay_convolve_run(mix->buffer[1], &mix->pos[1], BUFFER_SIZE, mix->delay,
|
delay_convolve_run_c(mix->buffer[1], &mix->pos[1], BUFFER_SIZE, mix->delay,
|
||||||
mix->taps, mix->n_taps, d[5], d[4], -v5, n_samples);
|
mix->taps, mix->n_taps, d[5], d[4], -v5, n_samples);
|
||||||
delay_convolve_run(mix->buffer[0], &mix->pos[0], BUFFER_SIZE, mix->delay,
|
delay_convolve_run_c(mix->buffer[0], &mix->pos[0], BUFFER_SIZE, mix->delay,
|
||||||
mix->taps, mix->n_taps, d[4], d[4], v4, n_samples);
|
mix->taps, mix->n_taps, d[4], d[4], v4, n_samples);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -343,9 +373,9 @@ channelmix_f32_2_7p1_c(struct channelmix *mix, void * SPA_RESTRICT dst[],
|
||||||
} else {
|
} else {
|
||||||
sub_c(d[6], s[0], s[1], n_samples);
|
sub_c(d[6], s[0], s[1], n_samples);
|
||||||
|
|
||||||
delay_convolve_run(mix->buffer[1], &mix->pos[1], BUFFER_SIZE, mix->delay,
|
delay_convolve_run_c(mix->buffer[1], &mix->pos[1], BUFFER_SIZE, mix->delay,
|
||||||
mix->taps, mix->n_taps, d[7], d[6], -v7, n_samples);
|
mix->taps, mix->n_taps, d[7], d[6], -v7, n_samples);
|
||||||
delay_convolve_run(mix->buffer[0], &mix->pos[0], BUFFER_SIZE, mix->delay,
|
delay_convolve_run_c(mix->buffer[0], &mix->pos[0], BUFFER_SIZE, mix->delay,
|
||||||
mix->taps, mix->n_taps, d[6], d[6], v6, n_samples);
|
mix->taps, mix->n_taps, d[6], d[6], v6, n_samples);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -257,6 +257,80 @@ static void lr4_process_2_sse(struct lr4 *lr40, struct lr4 *lr41, float *dst0, f
|
||||||
#undef F
|
#undef F
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * Compute one output sample: the dot product of src and taps, scaled by
 * the lowest lane of vol, stored to *dst.
 *
 * Elements are consumed four at a time with unaligned loads, so when
 * n_taps is not a multiple of four both src and taps are read up to the
 * next multiple of four.
 */
static inline void convolver_run(const float *src, float *dst,
		const float *taps, uint32_t n_taps, const __m128 vol)
{
	__m128 acc = _mm_setzero_ps();
	uint32_t k = 0;

	while (k < n_taps) {
		const __m128 in = _mm_loadu_ps(&src[k]);
		const __m128 coef = _mm_loadu_ps(&taps[k]);

		acc = _mm_add_ps(acc, _mm_mul_ps(coef, in));
		k += 4;
	}

	/* horizontal add: fold the upper pair onto the lower pair, then lane 1 onto lane 0 */
	acc = _mm_add_ps(acc, _mm_movehl_ps(acc, acc));
	acc = _mm_add_ss(acc, _mm_shuffle_ps(acc, acc, 0x55));

	_mm_store_ss(dst, _mm_mul_ss(acc, vol));
}
|
||||||
|
|
||||||
|
/*
 * SSE variant of the delay + convolution stage.
 *
 * Each input sample is written to the mirrored ring buffer at index w and
 * again at w + n_buffer, so buffer must hold at least 2 * n_buffer floats.
 * The output is read from a window starting at
 * w + n_buffer - delay - n_taps - 1. With n_taps == 1 the window sample is
 * only scaled by vol; otherwise it is filtered with convolver_run().
 *
 * buffer    mirrored ring storage
 * pos       in/out write position, kept in the range [0, n_buffer)
 * delay     extra delay in samples
 * taps      filter coefficients, passed to convolver_run()
 * dst, src  output and input, n_samples floats each
 * vol       gain applied to every output sample
 *
 * Four samples are processed per step only when src and dst are 16-byte
 * aligned AND the four ring slots starting at w lie inside [0, n_buffer).
 * The previous code took the wide step at any w and then reset w to 0 once
 * w + 4 reached n_buffer. When w was not a multiple of four (left that way
 * by an earlier scalar step), that stored past buffer[2 * n_buffer - 1] and
 * dropped the samples that should have wrapped to the start of the ring.
 * The boundary is now crossed one sample at a time.
 */
static inline void delay_convolve_run_sse(float *buffer, uint32_t *pos,
		uint32_t n_buffer, uint32_t delay,
		const float *taps, uint32_t n_taps,
		float *dst, const float *src, const float vol, uint32_t n_samples)
{
	const __m128 v = _mm_set1_ps(vol);
	const uint32_t o = n_buffer - delay - n_taps-1;
	uint32_t w = *pos;
	uint32_t n = 0, i, unrolled;
	__m128 t;

	if (SPA_IS_ALIGNED(src, 16) &&
	    SPA_IS_ALIGNED(dst, 16))
		unrolled = n_samples & ~3u;
	else
		unrolled = 0;

	while (n < n_samples) {
		/*
		 * (n & 3) == 0 keeps &src[n] and &dst[n] 16-byte aligned for the
		 * aligned load/store after scalar steps were interleaved.
		 */
		if (n < unrolled && (n & 3) == 0 && w + 4 <= n_buffer) {
			t = _mm_load_ps(&src[n]);
			_mm_storeu_ps(&buffer[w], t);
			_mm_storeu_ps(&buffer[w+n_buffer], t);
			if (n_taps == 1) {
				t = _mm_loadu_ps(&buffer[w+o]);
				t = _mm_mul_ps(t, v);
				_mm_store_ps(&dst[n], t);
			} else {
				for (i = 0; i < 4; i++)
					convolver_run(&buffer[w+o+i], &dst[n+i], taps, n_taps, v);
			}
			n += 4;
			w += 4;
			/* w + 4 <= n_buffer held above, so this only triggers on equality */
			if (w >= n_buffer)
				w = 0;
		} else {
			t = _mm_load_ss(&src[n]);
			_mm_store_ss(&buffer[w], t);
			_mm_store_ss(&buffer[w+n_buffer], t);
			if (n_taps == 1) {
				t = _mm_load_ss(&buffer[w+o]);
				t = _mm_mul_ss(t, v);
				_mm_store_ss(&dst[n], t);
			} else {
				convolver_run(&buffer[w+o], &dst[n], taps, n_taps, v);
			}
			n++;
			w = w + 1 >= n_buffer ? 0 : w + 1;
		}
	}
	*pos = w;
}
|
||||||
|
|
||||||
void
|
void
|
||||||
channelmix_f32_n_m_sse(struct channelmix *mix, void * SPA_RESTRICT dst[],
|
channelmix_f32_n_m_sse(struct channelmix *mix, void * SPA_RESTRICT dst[],
|
||||||
const void * SPA_RESTRICT src[], uint32_t n_samples)
|
const void * SPA_RESTRICT src[], uint32_t n_samples)
|
||||||
|
|
@ -371,9 +445,9 @@ channelmix_f32_2_5p1_sse(struct channelmix *mix, void * SPA_RESTRICT dst[],
|
||||||
} else {
|
} else {
|
||||||
sub_sse(d[4], s[0], s[1], n_samples);
|
sub_sse(d[4], s[0], s[1], n_samples);
|
||||||
|
|
||||||
delay_convolve_run(mix->buffer[1], &mix->pos[1], BUFFER_SIZE, mix->delay,
|
delay_convolve_run_sse(mix->buffer[1], &mix->pos[1], BUFFER_SIZE, mix->delay,
|
||||||
mix->taps, mix->n_taps, d[5], d[4], -v5, n_samples);
|
mix->taps, mix->n_taps, d[5], d[4], -v5, n_samples);
|
||||||
delay_convolve_run(mix->buffer[0], &mix->pos[0], BUFFER_SIZE, mix->delay,
|
delay_convolve_run_sse(mix->buffer[0], &mix->pos[0], BUFFER_SIZE, mix->delay,
|
||||||
mix->taps, mix->n_taps, d[4], d[4], v4, n_samples);
|
mix->taps, mix->n_taps, d[4], d[4], v4, n_samples);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -407,9 +481,9 @@ channelmix_f32_2_7p1_sse(struct channelmix *mix, void * SPA_RESTRICT dst[],
|
||||||
} else {
|
} else {
|
||||||
sub_sse(d[6], s[0], s[1], n_samples);
|
sub_sse(d[6], s[0], s[1], n_samples);
|
||||||
|
|
||||||
delay_convolve_run(mix->buffer[1], &mix->pos[1], BUFFER_SIZE, mix->delay,
|
delay_convolve_run_sse(mix->buffer[1], &mix->pos[1], BUFFER_SIZE, mix->delay,
|
||||||
mix->taps, mix->n_taps, d[7], d[6], -v7, n_samples);
|
mix->taps, mix->n_taps, d[7], d[6], -v7, n_samples);
|
||||||
delay_convolve_run(mix->buffer[0], &mix->pos[0], BUFFER_SIZE, mix->delay,
|
delay_convolve_run_sse(mix->buffer[0], &mix->pos[0], BUFFER_SIZE, mix->delay,
|
||||||
mix->taps, mix->n_taps, d[6], d[6], v6, n_samples);
|
mix->taps, mix->n_taps, d[6], d[6], v6, n_samples);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -775,6 +775,7 @@ int channelmix_init(struct channelmix *mix)
|
||||||
mix->delay = (uint32_t)(mix->rear_delay * mix->freq / 1000.0f);
|
mix->delay = (uint32_t)(mix->rear_delay * mix->freq / 1000.0f);
|
||||||
mix->func_name = info->name;
|
mix->func_name = info->name;
|
||||||
|
|
||||||
|
spa_zero(mix->taps);
|
||||||
if (mix->hilbert_taps > 0) {
|
if (mix->hilbert_taps > 0) {
|
||||||
mix->n_taps = SPA_CLAMP(mix->hilbert_taps, 15u, MAX_TAPS) | 1;
|
mix->n_taps = SPA_CLAMP(mix->hilbert_taps, 15u, MAX_TAPS) | 1;
|
||||||
blackman_window(mix->taps, mix->n_taps);
|
blackman_window(mix->taps, mix->n_taps);
|
||||||
|
|
|
||||||
|
|
@ -10,7 +10,6 @@
|
||||||
#include <spa/param/audio/raw.h>
|
#include <spa/param/audio/raw.h>
|
||||||
|
|
||||||
#include "crossover.h"
|
#include "crossover.h"
|
||||||
#include "delay.h"
|
|
||||||
|
|
||||||
#define VOLUME_MIN 0.0f
|
#define VOLUME_MIN 0.0f
|
||||||
#define VOLUME_NORM 1.0f
|
#define VOLUME_NORM 1.0f
|
||||||
|
|
|
||||||
|
|
@ -1,58 +0,0 @@
|
||||||
/* Spa */
|
|
||||||
/* SPDX-FileCopyrightText: Copyright © 2022 Wim Taymans */
|
|
||||||
/* SPDX-License-Identifier: MIT */
|
|
||||||
|
|
||||||
#ifndef DELAY_H
|
|
||||||
#define DELAY_H
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
|
||||||
extern "C" {
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/*
 * Plain delay line on a mirrored ring buffer.
 *
 * Every input sample is stored at index w and again at w + n_buffer, so
 * buffer must hold at least 2 * n_buffer floats. Each output sample is
 * buffer[w + n_buffer - delay] scaled by vol. *pos carries the write
 * position between calls and is kept in the range [0, n_buffer).
 */
static inline void delay_run(float *buffer, uint32_t *pos,
		uint32_t n_buffer, uint32_t delay,
		float *dst, const float *src, const float vol, uint32_t n_samples)
{
	const uint32_t read_off = n_buffer - delay;
	uint32_t head = *pos;
	uint32_t n = 0;

	while (n < n_samples) {
		const float in = src[n];

		/* write both copies so the read below never needs to wrap */
		buffer[head + n_buffer] = in;
		buffer[head] = in;
		dst[n] = buffer[head + read_off] * vol;

		head++;
		if (head >= n_buffer)
			head = 0;
		n++;
	}
	*pos = head;
}
|
|
||||||
|
|
||||||
/*
 * Delay line followed by an n_taps FIR filter on a mirrored ring buffer.
 *
 * With n_taps == 1 the work is handed to delay_run(), which reads at
 * offset n_buffer - delay. Otherwise every input sample is stored at
 * index w and at w + n_buffer, and the output is the dot product of taps
 * with the window starting at w + n_buffer - delay - n_taps - 1, scaled
 * by vol. *pos carries the write position between calls.
 */
static inline void delay_convolve_run(float *buffer, uint32_t *pos,
		uint32_t n_buffer, uint32_t delay,
		const float *taps, uint32_t n_taps,
		float *dst, const float *src, const float vol, uint32_t n_samples)
{
	uint32_t head, read_off, n;

	if (n_taps == 1) {
		delay_run(buffer, pos, n_buffer, delay, dst, src, vol, n_samples);
		return;
	}

	head = *pos;
	read_off = n_buffer - delay - n_taps - 1;

	for (n = 0; n < n_samples; n++) {
		const float in = src[n];
		const float *window = &buffer[head + read_off];
		float acc = 0.0f;
		uint32_t k;

		/* mirror the write so the window below is always contiguous */
		buffer[head + n_buffer] = in;
		buffer[head] = in;

		for (k = 0; k < n_taps; k++)
			acc += taps[k] * window[k];
		dst[n] = acc * vol;

		head++;
		if (head >= n_buffer)
			head = 0;
	}
	*pos = head;
}
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#endif /* DELAY_H */
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue