echo-cancel: Add SSE optimisation to the adrian module

Optimises the core inner-product function, which takes the most CPU. The
SSE-optimised bits of the adrian echo canceller are used only if the CPU
that PA is running on actually supports SSE.
This commit is contained in:
Arun Raghavan 2010-09-21 20:42:32 +05:30
parent ab4223e9cf
commit 963250abb9
7 changed files with 61 additions and 12 deletions

View file

@ -17,6 +17,10 @@
#include "adrian-aec.h" #include "adrian-aec.h"
#ifdef __SSE__
#include <xmmintrin.h>
#endif
/* Vector Dot Product */ /* Vector Dot Product */
static REAL dotp(REAL a[], REAL b[]) static REAL dotp(REAL a[], REAL b[])
{ {
@ -31,8 +35,32 @@ static REAL dotp(REAL a[], REAL b[])
return sum0 + sum1; return sum0 + sum1;
} }
/*
 * SSE variant of the vector dot product.
 *
 * Multiplies a[] and b[] element-wise over NLMS_LEN elements and returns the
 * sum of the products. When the file is compiled without __SSE__ this simply
 * forwards to the scalar dotp().
 *
 * a: read with _mm_load_ps, which requires a 16-byte aligned address.
 * b: read with _mm_loadu_ps, so it may be unaligned.
 *
 * NOTE(review): the _ps intrinsics operate on float, so this assumes REAL is
 * float -- confirm against the REAL typedef.
 * NOTE(review): the loop consumes 8 elements per iteration, so NLMS_LEN is
 * presumably a multiple of 8 -- confirm.
 * NOTE(review): the reason for noinline is not visible here; presumably it
 * keeps the SSE code reachable only through the a->dotp pointer -- confirm.
 */
static REAL dotp_sse(REAL a[], REAL b[]) __attribute__((noinline));
static REAL dotp_sse(REAL a[], REAL b[])
{
#ifdef __SSE__
/* This is taken from speex's inner product implementation */
int j;
REAL sum;
/* Four partial sums, one per SSE lane, all starting at zero. */
__m128 acc = _mm_setzero_ps();
/* NOTE(review): the leading "AEC* AEC_init(int RATE)" on the next line looks
 * like old-column residue from the side-by-side diff view, not part of this
 * function -- confirm against the real file. */
AEC* AEC_init(int RATE) for (j=0;j<NLMS_LEN;j+=8)
{
/* Two 4-wide multiply-accumulate steps per iteration (elements j..j+7). */
acc = _mm_add_ps(acc, _mm_mul_ps(_mm_load_ps(a+j), _mm_loadu_ps(b+j)));
acc = _mm_add_ps(acc, _mm_mul_ps(_mm_load_ps(a+j+4), _mm_loadu_ps(b+j+4)));
}
/* Horizontal sum: fold the upper two lanes onto the lower two ... */
acc = _mm_add_ps(acc, _mm_movehl_ps(acc, acc));
/* ... then add lane 1 (selected by shuffle mask 0x55) into lane 0. */
acc = _mm_add_ss(acc, _mm_shuffle_ps(acc, acc, 0x55));
/* Lane 0 now holds the full sum; store it to the scalar result. */
_mm_store_ss(&sum, acc);
return sum;
#else
/* No SSE support at compile time: use the scalar implementation. */
return dotp(a, b);
#endif
}
AEC* AEC_init(int RATE, int have_vector)
{ {
AEC *a = pa_xnew(AEC, 1); AEC *a = pa_xnew(AEC, 1);
a->hangover = 0; a->hangover = 0;
@ -57,6 +85,11 @@ AEC* AEC_init(int RATE)
a->dumpcnt = 0; a->dumpcnt = 0;
memset(a->ws, 0, sizeof(a->ws)); memset(a->ws, 0, sizeof(a->ws));
if (have_vector)
a->dotp = dotp_sse;
else
a->dotp = dotp;
return a; return a;
} }
@ -146,7 +179,7 @@ static REAL AEC_nlms_pw(AEC *a, REAL d, REAL x_, float stepsize)
// (mic signal - estimated mic signal from spk signal) // (mic signal - estimated mic signal from spk signal)
e = d; e = d;
if (a->hangover > 0) { if (a->hangover > 0) {
e -= dotp(a->w, a->x + a->j); e -= a->dotp(a->w, a->x + a->j);
} }
ef = IIR1_highpass(a->Fe, e); // pre-whitening of e ef = IIR1_highpass(a->Fe, e); // pre-whitening of e

View file

@ -13,6 +13,13 @@
#ifndef _AEC_H /* include only once */ #ifndef _AEC_H /* include only once */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
#include <pulsecore/macro.h>
#include <pulse/xmalloc.h>
#define WIDEB 2 #define WIDEB 2
// use double if your CPU does software-emulation of float // use double if your CPU does software-emulation of float
@ -315,6 +322,9 @@ struct AEC {
// variables are public for visualization // variables are public for visualization
int hangover; int hangover;
float stepsize; float stepsize;
// vfuncs that are picked based on processor features available
REAL (*dotp) (REAL[], REAL[]);
}; };
/* Double-Talk Detector /* Double-Talk Detector
@ -338,7 +348,7 @@ static void AEC_leaky(AEC *a);
*/ */
static REAL AEC_nlms_pw(AEC *a, REAL d, REAL x_, float stepsize); static REAL AEC_nlms_pw(AEC *a, REAL d, REAL x_, float stepsize);
AEC* AEC_init(int RATE); AEC* AEC_init(int RATE, int have_vector);
/* Acoustic Echo Cancellation and Suppression of one sample /* Acoustic Echo Cancellation and Suppression of one sample
* in d: microphone signal with echo * in d: microphone signal with echo

View file

@ -51,12 +51,12 @@ static void pa_adrian_ec_fixate_spec(pa_sample_spec *source_ss, pa_channel_map *
*sink_map = *source_map; *sink_map = *source_map;
} }
pa_bool_t pa_adrian_ec_init(pa_echo_canceller *ec, pa_bool_t pa_adrian_ec_init(pa_core *c, pa_echo_canceller *ec,
pa_sample_spec *source_ss, pa_channel_map *source_map, pa_sample_spec *source_ss, pa_channel_map *source_map,
pa_sample_spec *sink_ss, pa_channel_map *sink_map, pa_sample_spec *sink_ss, pa_channel_map *sink_map,
uint32_t *blocksize, const char *args) uint32_t *blocksize, const char *args)
{ {
int framelen, rate; int framelen, rate, have_vector = 0;
uint32_t frame_size_ms; uint32_t frame_size_ms;
pa_modargs *ma; pa_modargs *ma;
@ -80,7 +80,11 @@ pa_bool_t pa_adrian_ec_init(pa_echo_canceller *ec,
pa_log_debug ("Using framelen %d, blocksize %u, channels %d, rate %d", framelen, ec->params.priv.adrian.blocksize, source_ss->channels, source_ss->rate); pa_log_debug ("Using framelen %d, blocksize %u, channels %d, rate %d", framelen, ec->params.priv.adrian.blocksize, source_ss->channels, source_ss->rate);
ec->params.priv.adrian.aec = AEC_init(rate); /* For now we only support SSE */
if (c->cpu_info.cpu_type == PA_CPU_X86 && (c->cpu_info.flags.x86 & PA_CPU_X86_SSE))
have_vector = 1;
ec->params.priv.adrian.aec = AEC_init(rate, have_vector);
if (!ec->params.priv.adrian.aec) if (!ec->params.priv.adrian.aec)
goto fail; goto fail;

View file

@ -27,5 +27,5 @@
typedef struct AEC AEC; typedef struct AEC AEC;
AEC* AEC_init(int RATE); AEC* AEC_init(int RATE, int have_vector);
int AEC_doAEC(AEC *a, int d_, int x_); int AEC_doAEC(AEC *a, int d_, int x_);

View file

@ -25,6 +25,7 @@
#include <pulse/sample.h> #include <pulse/sample.h>
#include <pulse/channelmap.h> #include <pulse/channelmap.h>
#include <pulsecore/core.h>
#include <pulsecore/macro.h> #include <pulsecore/macro.h>
#include <speex/speex_echo.h> #include <speex/speex_echo.h>
@ -50,7 +51,8 @@ struct pa_echo_canceller_params {
typedef struct pa_echo_canceller pa_echo_canceller; typedef struct pa_echo_canceller pa_echo_canceller;
struct pa_echo_canceller { struct pa_echo_canceller {
pa_bool_t (*init) (pa_echo_canceller *ec, pa_bool_t (*init) (pa_core *c,
pa_echo_canceller *ec,
pa_sample_spec *source_ss, pa_sample_spec *source_ss,
pa_channel_map *source_map, pa_channel_map *source_map,
pa_sample_spec *sink_ss, pa_sample_spec *sink_ss,
@ -64,7 +66,7 @@ struct pa_echo_canceller {
}; };
/* Speex canceller functions */ /* Speex canceller functions */
pa_bool_t pa_speex_ec_init(pa_echo_canceller *ec, pa_bool_t pa_speex_ec_init(pa_core *c, pa_echo_canceller *ec,
pa_sample_spec *source_ss, pa_channel_map *source_map, pa_sample_spec *source_ss, pa_channel_map *source_map,
pa_sample_spec *sink_ss, pa_channel_map *sink_map, pa_sample_spec *sink_ss, pa_channel_map *sink_map,
uint32_t *blocksize, const char *args); uint32_t *blocksize, const char *args);
@ -72,7 +74,7 @@ void pa_speex_ec_run(pa_echo_canceller *ec, const uint8_t *rec, const uint8_t *p
void pa_speex_ec_done(pa_echo_canceller *ec); void pa_speex_ec_done(pa_echo_canceller *ec);
/* Adrian Andre's echo canceller */ /* Adrian Andre's echo canceller */
pa_bool_t pa_adrian_ec_init(pa_echo_canceller *ec, pa_bool_t pa_adrian_ec_init(pa_core *c, pa_echo_canceller *ec,
pa_sample_spec *source_ss, pa_channel_map *source_map, pa_sample_spec *source_ss, pa_channel_map *source_map,
pa_sample_spec *sink_ss, pa_channel_map *sink_map, pa_sample_spec *sink_ss, pa_channel_map *sink_map,
uint32_t *blocksize, const char *args); uint32_t *blocksize, const char *args);

View file

@ -1398,7 +1398,7 @@ int pa__init(pa_module*m) {
u->asyncmsgq = pa_asyncmsgq_new(0); u->asyncmsgq = pa_asyncmsgq_new(0);
u->need_realign = TRUE; u->need_realign = TRUE;
if (u->ec->init) { if (u->ec->init) {
if (!u->ec->init(u->ec, &source_ss, &source_map, &sink_ss, &sink_map, &u->blocksize, pa_modargs_get_value(ma, "aec_args", NULL))) { if (!u->ec->init(u->core, u->ec, &source_ss, &source_map, &sink_ss, &sink_map, &u->blocksize, pa_modargs_get_value(ma, "aec_args", NULL))) {
pa_log("Failed to init AEC engine"); pa_log("Failed to init AEC engine");
goto fail; goto fail;
} }

View file

@ -48,7 +48,7 @@ static void pa_speex_ec_fixate_spec(pa_sample_spec *source_ss, pa_channel_map *s
*sink_map = *source_map; *sink_map = *source_map;
} }
pa_bool_t pa_speex_ec_init(pa_echo_canceller *ec, pa_bool_t pa_speex_ec_init(pa_core *c, pa_echo_canceller *ec,
pa_sample_spec *source_ss, pa_channel_map *source_map, pa_sample_spec *source_ss, pa_channel_map *source_map,
pa_sample_spec *sink_ss, pa_channel_map *sink_map, pa_sample_spec *sink_ss, pa_channel_map *sink_map,
uint32_t *blocksize, const char *args) uint32_t *blocksize, const char *args)