pulseaudio/src/modules/echo-cancel/adrian-aec.c
Arun Raghavan 83d3c8f22b echo-cancel: orc-ify some bits for optimisation
This uses Orc to optimise an inner loop in the core NLMS function of the
Adrian echo canceller.
2010-09-23 17:17:03 +05:30

274 lines
6.5 KiB
C

/* aec.cpp
*
* Copyright (C) DFS Deutsche Flugsicherung (2004, 2005).
* All Rights Reserved.
*
* Acoustic Echo Cancellation NLMS-pw algorithm
*
* Version 0.3 filter created with www.dsptutor.freeuk.com
* Version 0.3.1 Allow change of stability parameter delta
* Version 0.4 Leaky Normalized LMS - pre whitening algorithm
*/
#include <math.h>
#include <string.h>
#include <pulse/xmalloc.h>
#include "adrian-aec.h"
#ifndef DISABLE_ORC
#include "adrian-aec-orc.h"
#endif
#ifdef __SSE__
#include <xmmintrin.h>
#endif
/* Vector Dot Product */
static REAL dotp(REAL a[], REAL b[])
{
REAL sum0 = 0.0, sum1 = 0.0;
int j;
for (j = 0; j < NLMS_LEN; j += 2) {
// optimize: partial loop unrolling
sum0 += a[j] * b[j];
sum1 += a[j + 1] * b[j + 1];
}
return sum0 + sum1;
}
static REAL dotp_sse(REAL a[], REAL b[]) __attribute__((noinline));
static REAL dotp_sse(REAL a[], REAL b[])
{
#ifdef __SSE__
/* This is taken from speex's inner product implementation */
int j;
REAL sum;
__m128 acc = _mm_setzero_ps();
for (j=0;j<NLMS_LEN;j+=8)
{
acc = _mm_add_ps(acc, _mm_mul_ps(_mm_load_ps(a+j), _mm_loadu_ps(b+j)));
acc = _mm_add_ps(acc, _mm_mul_ps(_mm_load_ps(a+j+4), _mm_loadu_ps(b+j+4)));
}
acc = _mm_add_ps(acc, _mm_movehl_ps(acc, acc));
acc = _mm_add_ss(acc, _mm_shuffle_ps(acc, acc, 0x55));
_mm_store_ss(&sum, acc);
return sum;
#else
return dotp(a, b);
#endif
}
AEC* AEC_init(int RATE, int have_vector)
{
AEC *a = pa_xnew(AEC, 1);
a->hangover = 0;
memset(a->x, 0, sizeof(a->x));
memset(a->xf, 0, sizeof(a->xf));
memset(a->w, 0, sizeof(a->w));
a->j = NLMS_EXT;
a->delta = 0.0f;
AEC_setambient(a, NoiseFloor);
a->dfast = a->dslow = M75dB_PCM;
a->xfast = a->xslow = M80dB_PCM;
a->gain = 1.0f;
a->Fx = IIR1_init(2000.0f/RATE);
a->Fe = IIR1_init(2000.0f/RATE);
a->cutoff = FIR_HP_300Hz_init();
a->acMic = IIR_HP_init();
a->acSpk = IIR_HP_init();
a->aes_y2 = M0dB;
a->fdwdisplay = -1;
a->dumpcnt = 0;
memset(a->ws, 0, sizeof(a->ws));
if (have_vector)
a->dotp = dotp_sse;
else
a->dotp = dotp;
return a;
}
// Adrian soft decision DTD
// (Dual Average Near-End to Far-End signal Ratio DTD)
// This algorithm uses exponential smoothing with differnt
// ageing parameters to get fast and slow near-end and far-end
// signal averages. The ratio of NFRs term
// (dfast / xfast) / (dslow / xslow) is used to compute the stepsize
// A ratio value of 2.5 is mapped to stepsize 0, a ratio of 0 is
// mapped to 1.0 with a limited linear function.
static float AEC_dtd(AEC *a, REAL d, REAL x)
{
float stepsize;
float ratio, M;
// fast near-end and far-end average
a->dfast += ALPHAFAST * (fabsf(d) - a->dfast);
a->xfast += ALPHAFAST * (fabsf(x) - a->xfast);
// slow near-end and far-end average
a->dslow += ALPHASLOW * (fabsf(d) - a->dslow);
a->xslow += ALPHASLOW * (fabsf(x) - a->xslow);
if (a->xfast < M70dB_PCM) {
return 0.0; // no Spk signal
}
if (a->dfast < M70dB_PCM) {
return 0.0; // no Mic signal
}
// ratio of NFRs
ratio = (a->dfast * a->xslow) / (a->dslow * a->xfast);
// begrenzte lineare Kennlinie
M = (STEPY2 - STEPY1) / (STEPX2 - STEPX1);
if (ratio < STEPX1) {
stepsize = STEPY1;
} else if (ratio > STEPX2) {
stepsize = STEPY2;
} else {
// Punktrichtungsform einer Geraden
stepsize = M * (ratio - STEPX1) + STEPY1;
}
return stepsize;
}
static void AEC_leaky(AEC *a)
// The xfast signal is used to charge the hangover timer to Thold.
// When hangover expires (no Spk signal for some time) the vector w
// is erased. This is my implementation of Leaky NLMS.
{
if (a->xfast >= M70dB_PCM) {
// vector w is valid for hangover Thold time
a->hangover = Thold;
} else {
if (a->hangover > 1) {
--(a->hangover);
} else if (1 == a->hangover) {
--(a->hangover);
// My Leaky NLMS is to erase vector w when hangover expires
memset(a->w, 0, sizeof(a->w));
}
}
}
#if 0
void AEC::openwdisplay() {
// open TCP connection to program wdisplay.tcl
fdwdisplay = socket_async("127.0.0.1", 50999);
};
#endif
static REAL AEC_nlms_pw(AEC *a, REAL d, REAL x_, float stepsize)
{
REAL e;
REAL ef;
a->x[a->j] = x_;
a->xf[a->j] = IIR1_highpass(a->Fx, x_); // pre-whitening of x
// calculate error value
// (mic signal - estimated mic signal from spk signal)
e = d;
if (a->hangover > 0) {
e -= a->dotp(a->w, a->x + a->j);
}
ef = IIR1_highpass(a->Fe, e); // pre-whitening of e
// optimize: iterative dotp(xf, xf)
a->dotp_xf_xf += (a->xf[a->j] * a->xf[a->j] - a->xf[a->j + NLMS_LEN - 1] * a->xf[a->j + NLMS_LEN - 1]);
if (stepsize > 0.0) {
// calculate variable step size
REAL mikro_ef = stepsize * ef / a->dotp_xf_xf;
#ifdef DISABLE_ORC
// update tap weights (filter learning)
int i;
for (i = 0; i < NLMS_LEN; i += 2) {
// optimize: partial loop unrolling
a->w[i] += mikro_ef * a->xf[i + a->j];
a->w[i + 1] += mikro_ef * a->xf[i + a->j + 1];
}
#else
update_tap_weights(a->w, &a->xf[a->j], mikro_ef, NLMS_LEN);
#endif
}
if (--(a->j) < 0) {
// optimize: decrease number of memory copies
a->j = NLMS_EXT;
memmove(a->x + a->j + 1, a->x, (NLMS_LEN - 1) * sizeof(REAL));
memmove(a->xf + a->j + 1, a->xf, (NLMS_LEN - 1) * sizeof(REAL));
}
// Saturation
if (e > MAXPCM) {
return MAXPCM;
} else if (e < -MAXPCM) {
return -MAXPCM;
} else {
return e;
}
}
int AEC_doAEC(AEC *a, int d_, int x_)
{
REAL d = (REAL) d_;
REAL x = (REAL) x_;
// Mic Highpass Filter - to remove DC
d = IIR_HP_highpass(a->acMic, d);
// Mic Highpass Filter - cut-off below 300Hz
d = FIR_HP_300Hz_highpass(a->cutoff, d);
// Amplify, for e.g. Soundcards with -6dB max. volume
d *= a->gain;
// Spk Highpass Filter - to remove DC
x = IIR_HP_highpass(a->acSpk, x);
// Double Talk Detector
a->stepsize = AEC_dtd(a, d, x);
// Leaky (ageing of vector w)
AEC_leaky(a);
// Acoustic Echo Cancellation
d = AEC_nlms_pw(a, d, x, a->stepsize);
#if 0
if (fdwdisplay >= 0) {
if (++dumpcnt >= (WIDEB*RATE/10)) {
// wdisplay creates 10 dumps per seconds = large CPU load!
dumpcnt = 0;
write(fdwdisplay, ws, DUMP_LEN*sizeof(float));
// we don't check return value. This is not production quality!!!
memset(ws, 0, sizeof(ws));
} else {
int i;
for (i = 0; i < DUMP_LEN; i += 2) {
// optimize: partial loop unrolling
ws[i] += w[i];
ws[i + 1] += w[i + 1];
}
}
}
#endif
return (int) d;
}