echo-cancel: add webrtc AEC3 support

Drop a number of now unsupported features, and add new parameters for
pre-/post-amplification.

Part-of: <https://gitlab.freedesktop.org/pulseaudio/pulseaudio/-/merge_requests/395>
This commit is contained in:
Eero Nurkkala 2020-10-20 16:20:23 -04:00 committed by Arun Raghavan
parent b16b107171
commit 22bbb5b3ba

View file

@ -3,8 +3,8 @@
Copyright 2011 Collabora Ltd.
2015 Aldebaran SoftBank Group
Contributor: Arun Raghavan <mail@arunraghavan.net>
2020 Arun Raghavan <arun@asymptotic.io>
2020 Eero Nurkkala <eero.nurkkala@offcode.fi>
PulseAudio is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published
@ -34,80 +34,47 @@ PA_C_DECL_BEGIN
#include "echo-cancel.h"
PA_C_DECL_END
#include <webrtc/modules/audio_processing/include/audio_processing.h>
#include <webrtc/modules/interface/module_common_types.h>
#include <webrtc/system_wrappers/include/trace.h>
#define WEBRTC_APM_DEBUG_DUMP 0
#include <modules/audio_processing/include/audio_processing.h>
#define BLOCK_SIZE_US 10000
#define DEFAULT_HIGH_PASS_FILTER true
#define DEFAULT_NOISE_SUPPRESSION true
#define DEFAULT_TRANSIENT_NOISE_SUPPRESSION true
#define DEFAULT_ANALOG_GAIN_CONTROL true
#define DEFAULT_DIGITAL_GAIN_CONTROL false
#define DEFAULT_MOBILE false
#define DEFAULT_ROUTING_MODE "speakerphone"
#define DEFAULT_COMFORT_NOISE true
#define DEFAULT_DRIFT_COMPENSATION false
#define DEFAULT_VAD true
#define DEFAULT_EXTENDED_FILTER false
#define DEFAULT_INTELLIGIBILITY_ENHANCER false
#define DEFAULT_EXPERIMENTAL_AGC false
#define DEFAULT_VAD false
#define DEFAULT_AGC_START_VOLUME 85
#define DEFAULT_BEAMFORMING false
#define DEFAULT_TRACE false
#define DEFAULT_POSTAMP_ENABLE false
#define DEFAULT_POSTAMP_GAIN_DB 0
#define DEFAULT_PREAMP_ENABLE false
#define DEFAULT_PREAMP_GAIN_DB 0
#define WEBRTC_AGC_MAX_VOLUME 255
#define WEBRTC_POSTAMP_GAIN_MAX_DB 90
#define WEBRTC_PREAMP_GAIN_MAX_DB 90
static const char* const valid_modargs[] = {
"high_pass_filter",
"noise_suppression",
"agc_start_volume",
"analog_gain_control",
"digital_gain_control",
"high_pass_filter",
"mobile",
"routing_mode",
"comfort_noise",
"drift_compensation",
"noise_suppression",
"post_amplifier",
"post_amplifier_gain",
"pre_amplifier",
"pre_amplifier_gain",
"transient_noise_suppression",
"voice_detection",
"extended_filter",
"intelligibility_enhancer",
"experimental_agc",
"agc_start_volume",
"beamforming",
"mic_geometry", /* documented in parse_mic_geometry() */
"target_direction", /* documented in parse_mic_geometry() */
"trace",
NULL
};
static int routing_mode_from_string(const char *rmode) {
if (pa_streq(rmode, "quiet-earpiece-or-headset"))
return webrtc::EchoControlMobile::kQuietEarpieceOrHeadset;
else if (pa_streq(rmode, "earpiece"))
return webrtc::EchoControlMobile::kEarpiece;
else if (pa_streq(rmode, "loud-earpiece"))
return webrtc::EchoControlMobile::kLoudEarpiece;
else if (pa_streq(rmode, "speakerphone"))
return webrtc::EchoControlMobile::kSpeakerphone;
else if (pa_streq(rmode, "loud-speakerphone"))
return webrtc::EchoControlMobile::kLoudSpeakerphone;
else
return -1;
}
class PaWebrtcTraceCallback : public webrtc::TraceCallback {
void Print(webrtc::TraceLevel level, const char *message, int length)
{
if (level & webrtc::kTraceError || level & webrtc::kTraceCritical)
pa_log("%s", message);
else if (level & webrtc::kTraceWarning)
pa_log_warn("%s", message);
else if (level & webrtc::kTraceInfo)
pa_log_info("%s", message);
else
pa_log_debug("%s", message);
}
};
static int webrtc_volume_from_pa(pa_volume_t v)
{
return (v * WEBRTC_AGC_MAX_VOLUME) / PA_VOLUME_NORM;
@ -120,8 +87,7 @@ static pa_volume_t webrtc_volume_to_pa(int v)
static void webrtc_ec_fixate_spec(pa_sample_spec *rec_ss, pa_channel_map *rec_map,
pa_sample_spec *play_ss, pa_channel_map *play_map,
pa_sample_spec *out_ss, pa_channel_map *out_map,
bool beamforming)
pa_sample_spec *out_ss, pa_channel_map *out_map)
{
rec_ss->format = PA_SAMPLE_FLOAT32NE;
play_ss->format = PA_SAMPLE_FLOAT32NE;
@ -139,110 +105,22 @@ static void webrtc_ec_fixate_spec(pa_sample_spec *rec_ss, pa_channel_map *rec_ma
*out_ss = *rec_ss;
*out_map = *rec_map;
if (beamforming) {
/* The beamformer gives us a single channel */
out_ss->channels = 1;
pa_channel_map_init_mono(out_map);
}
/* Playback stream rate needs to be the same as capture */
play_ss->rate = rec_ss->rate;
}
static bool parse_point(const char **point, float (&f)[3]) {
int ret, length;
ret = sscanf(*point, "%g,%g,%g%n", &f[0], &f[1], &f[2], &length);
if (ret != 3)
return false;
/* Consume the bytes we've read so far */
*point += length;
return true;
}
static bool parse_mic_geometry(const char **mic_geometry, std::vector<webrtc::Point>& geometry) {
/* The microphone geometry is expressed as cartesian point form:
* x1,y1,z1,x2,y2,z2,...
*
* Where x1,y1,z1 is the position of the first microphone with regards to
* the array's "center", x2,y2,z2 the position of the second, and so on.
*
* 'x' is the horizontal coordinate, with positive values being to the
* right from the mic array's perspective.
*
* 'y' is the depth coordinate, with positive values being in front of the
* array.
*
* 'z' is the vertical coordinate, with positive values being above the
* array.
*
* All distances are in meters.
*/
/* The target direction is expected to be in spherical point form:
* a,e,r
*
* Where 'a' is the azimuth of the target point relative to the center of
* the array, 'e' its elevation, and 'r' the radius.
*
* 0 radians azimuth is to the right of the array, and positive angles
* move in a counter-clockwise direction.
*
* 0 radians elevation is horizontal w.r.t. the array, and positive
* angles go upwards.
*
* radius is distance from the array center in meters.
*/
long unsigned int i;
float f[3];
for (i = 0; i < geometry.size(); i++) {
if (!parse_point(mic_geometry, f)) {
pa_log("Failed to parse channel %lu in mic_geometry", i);
return false;
}
/* Except for the last point, we should have a trailing comma */
if (i != geometry.size() - 1) {
if (**mic_geometry != ',') {
pa_log("Failed to parse channel %lu in mic_geometry", i);
return false;
}
(*mic_geometry)++;
}
pa_log_debug("Got mic #%lu position: (%g, %g, %g)", i, f[0], f[1], f[2]);
geometry[i].c[0] = f[0];
geometry[i].c[1] = f[1];
geometry[i].c[2] = f[2];
}
if (**mic_geometry != '\0') {
pa_log("Failed to parse mic_geometry value: more parameters than expected");
return false;
}
return true;
}
bool pa_webrtc_ec_init(pa_core *c, pa_echo_canceller *ec,
pa_sample_spec *rec_ss, pa_channel_map *rec_map,
pa_sample_spec *play_ss, pa_channel_map *play_map,
pa_sample_spec *out_ss, pa_channel_map *out_map,
uint32_t *nframes, const char *args) {
webrtc::AudioProcessing *apm = NULL;
webrtc::AudioProcessing *apm = webrtc::AudioProcessingBuilder().Create();
webrtc::ProcessingConfig pconfig;
webrtc::Config config;
bool hpf, ns, agc, dgc, mobile, cn, vad, ext_filter, intelligibility, experimental_agc, beamforming;
int rm = -1, i;
uint32_t agc_start_volume;
webrtc::AudioProcessing::Config config;
bool hpf, ns, tns, agc, dgc, mobile, pre_amp, vad, post_amp;
int i;
uint32_t agc_start_volume, pre_amp_gain, post_amp_gain;
pa_modargs *ma;
bool trace = false;
if (!(ma = pa_modargs_new(args, valid_modargs))) {
pa_log("Failed to parse submodule arguments.");
@ -261,6 +139,12 @@ bool pa_webrtc_ec_init(pa_core *c, pa_echo_canceller *ec,
goto fail;
}
tns = DEFAULT_TRANSIENT_NOISE_SUPPRESSION;
if (pa_modargs_get_value_boolean(ma, "transient_noise_suppression", &tns) < 0) {
pa_log("Failed to parse transient_noise_suppression value");
goto fail;
}
agc = DEFAULT_ANALOG_GAIN_CONTROL;
if (pa_modargs_get_value_boolean(ma, "analog_gain_control", &agc) < 0) {
pa_log("Failed to parse analog_gain_control value");
@ -278,6 +162,36 @@ bool pa_webrtc_ec_init(pa_core *c, pa_echo_canceller *ec,
goto fail;
}
pre_amp = DEFAULT_PREAMP_ENABLE;
if (pa_modargs_get_value_boolean(ma, "pre_amplifier", &pre_amp) < 0) {
pa_log("Failed to parse pre_amplifier value");
goto fail;
}
pre_amp_gain = DEFAULT_PREAMP_GAIN_DB;
if (pa_modargs_get_value_u32(ma, "pre_amplifier_gain", &pre_amp_gain) < 0) {
pa_log("Failed to parse pre_amplifier_gain value");
goto fail;
}
if (pre_amp_gain > WEBRTC_PREAMP_GAIN_MAX_DB) {
pa_log("Preamp gain must not exceed %u", WEBRTC_PREAMP_GAIN_MAX_DB);
goto fail;
}
post_amp = DEFAULT_POSTAMP_ENABLE;
if (pa_modargs_get_value_boolean(ma, "post_amplifier", &post_amp) < 0) {
pa_log("Failed to parse post_amplifier value");
goto fail;
}
post_amp_gain = DEFAULT_POSTAMP_GAIN_DB;
if (pa_modargs_get_value_u32(ma, "post_amplifier_gain", &post_amp_gain) < 0) {
pa_log("Failed to parse post_amplifier_gain value");
goto fail;
}
if (post_amp_gain > WEBRTC_POSTAMP_GAIN_MAX_DB) {
pa_log("Postamp gain must not exceed %u", WEBRTC_POSTAMP_GAIN_MAX_DB);
goto fail;
}
mobile = DEFAULT_MOBILE;
if (pa_modargs_get_value_boolean(ma, "mobile", &mobile) < 0) {
pa_log("Failed to parse mobile value");
@ -285,33 +199,6 @@ bool pa_webrtc_ec_init(pa_core *c, pa_echo_canceller *ec,
}
ec->params.drift_compensation = DEFAULT_DRIFT_COMPENSATION;
if (pa_modargs_get_value_boolean(ma, "drift_compensation", &ec->params.drift_compensation) < 0) {
pa_log("Failed to parse drift_compensation value");
goto fail;
}
if (mobile) {
if (ec->params.drift_compensation) {
pa_log("Can't use drift_compensation in mobile mode");
goto fail;
}
if ((rm = routing_mode_from_string(pa_modargs_get_value(ma, "routing_mode", DEFAULT_ROUTING_MODE))) < 0) {
pa_log("Failed to parse routing_mode value");
goto fail;
}
cn = DEFAULT_COMFORT_NOISE;
if (pa_modargs_get_value_boolean(ma, "comfort_noise", &cn) < 0) {
pa_log("Failed to parse cn value");
goto fail;
}
} else {
if (pa_modargs_get_value(ma, "comfort_noise", NULL) || pa_modargs_get_value(ma, "routing_mode", NULL)) {
pa_log("The routing_mode and comfort_noise options are only valid with mobile=true");
goto fail;
}
}
vad = DEFAULT_VAD;
if (pa_modargs_get_value_boolean(ma, "voice_detection", &vad) < 0) {
@ -319,24 +206,6 @@ bool pa_webrtc_ec_init(pa_core *c, pa_echo_canceller *ec,
goto fail;
}
ext_filter = DEFAULT_EXTENDED_FILTER;
if (pa_modargs_get_value_boolean(ma, "extended_filter", &ext_filter) < 0) {
pa_log("Failed to parse extended_filter value");
goto fail;
}
intelligibility = DEFAULT_INTELLIGIBILITY_ENHANCER;
if (pa_modargs_get_value_boolean(ma, "intelligibility_enhancer", &intelligibility) < 0) {
pa_log("Failed to parse intelligibility_enhancer value");
goto fail;
}
experimental_agc = DEFAULT_EXPERIMENTAL_AGC;
if (pa_modargs_get_value_boolean(ma, "experimental_agc", &experimental_agc) < 0) {
pa_log("Failed to parse experimental_agc value");
goto fail;
}
agc_start_volume = DEFAULT_AGC_START_VOLUME;
if (pa_modargs_get_value_u32(ma, "agc_start_volume", &agc_start_volume) < 0) {
pa_log("Failed to parse agc_start_volume value");
@ -348,82 +217,7 @@ bool pa_webrtc_ec_init(pa_core *c, pa_echo_canceller *ec,
}
ec->params.webrtc.agc_start_volume = agc_start_volume;
beamforming = DEFAULT_BEAMFORMING;
if (pa_modargs_get_value_boolean(ma, "beamforming", &beamforming) < 0) {
pa_log("Failed to parse beamforming value");
goto fail;
}
if (ext_filter)
config.Set<webrtc::ExtendedFilter>(new webrtc::ExtendedFilter(true));
if (intelligibility)
pa_log_warn("The intelligibility enhancer is not currently supported");
if (experimental_agc)
config.Set<webrtc::ExperimentalAgc>(new webrtc::ExperimentalAgc(true, ec->params.webrtc.agc_start_volume));
trace = DEFAULT_TRACE;
if (pa_modargs_get_value_boolean(ma, "trace", &trace) < 0) {
pa_log("Failed to parse trace value");
goto fail;
}
if (trace) {
webrtc::Trace::CreateTrace();
webrtc::Trace::set_level_filter(webrtc::kTraceAll);
ec->params.webrtc.trace_callback = new PaWebrtcTraceCallback();
webrtc::Trace::SetTraceCallback((PaWebrtcTraceCallback *) ec->params.webrtc.trace_callback);
}
webrtc_ec_fixate_spec(rec_ss, rec_map, play_ss, play_map, out_ss, out_map, beamforming);
/* We do this after fixate because we need the capture channel count */
if (beamforming) {
std::vector<webrtc::Point> geometry(rec_ss->channels);
webrtc::SphericalPointf direction(0.0f, 0.0f, 0.0f);
const char *mic_geometry, *target_direction;
if (!(mic_geometry = pa_modargs_get_value(ma, "mic_geometry", NULL))) {
pa_log("mic_geometry must be set if beamforming is enabled");
goto fail;
}
if (!parse_mic_geometry(&mic_geometry, geometry)) {
pa_log("Failed to parse mic_geometry value");
goto fail;
}
if ((target_direction = pa_modargs_get_value(ma, "target_direction", NULL))) {
float f[3];
if (!parse_point(&target_direction, f)) {
pa_log("Failed to parse target_direction value");
goto fail;
}
if (*target_direction != '\0') {
pa_log("Failed to parse target_direction value: more parameters than expected");
goto fail;
}
#define IS_ZERO(f) ((f) < 0.000001 && (f) > -0.000001)
if (!IS_ZERO(f[1]) || !IS_ZERO(f[2])) {
pa_log("The beamformer currently only supports targeting along the azimuth");
goto fail;
}
direction.s[0] = f[0];
direction.s[1] = f[1];
direction.s[2] = f[2];
}
if (!target_direction)
config.Set<webrtc::Beamforming>(new webrtc::Beamforming(true, geometry));
else
config.Set<webrtc::Beamforming>(new webrtc::Beamforming(true, geometry, direction));
}
apm = webrtc::AudioProcessing::Create(config);
webrtc_ec_fixate_spec(rec_ss, rec_map, play_ss, play_map, out_ss, out_map);
pconfig = {
webrtc::StreamConfig(rec_ss->rate, rec_ss->channels, false), /* input stream */
@ -436,46 +230,60 @@ bool pa_webrtc_ec_init(pa_core *c, pa_echo_canceller *ec,
goto fail;
}
if (pre_amp) {
config.pre_amplifier.enabled = true;
config.pre_amplifier.fixed_gain_factor = (float)pre_amp_gain;
} else
config.pre_amplifier.enabled = false;
if (hpf)
apm->high_pass_filter()->Enable(true);
config.high_pass_filter.enabled = true;
else
config.high_pass_filter.enabled = false;
if (!mobile) {
apm->echo_cancellation()->enable_drift_compensation(ec->params.drift_compensation);
apm->echo_cancellation()->Enable(true);
} else {
apm->echo_control_mobile()->set_routing_mode(static_cast<webrtc::EchoControlMobile::RoutingMode>(rm));
apm->echo_control_mobile()->enable_comfort_noise(cn);
apm->echo_control_mobile()->Enable(true);
}
config.echo_canceller.enabled = true;
if (ns) {
apm->noise_suppression()->set_level(webrtc::NoiseSuppression::kHigh);
apm->noise_suppression()->Enable(true);
}
if (!mobile)
config.echo_canceller.mobile_mode = false;
else
config.echo_canceller.mobile_mode = true;
if (agc || dgc) {
if (mobile && rm <= webrtc::EchoControlMobile::kEarpiece) {
/* Maybe this should be a knob, but we've got a lot of knobs already */
apm->gain_control()->set_mode(webrtc::GainControl::kFixedDigital);
ec->params.webrtc.agc = false;
} else if (dgc) {
apm->gain_control()->set_mode(webrtc::GainControl::kAdaptiveDigital);
ec->params.webrtc.agc = false;
} else {
apm->gain_control()->set_mode(webrtc::GainControl::kAdaptiveAnalog);
if (apm->gain_control()->set_analog_level_limits(0, WEBRTC_AGC_MAX_VOLUME) !=
webrtc::AudioProcessing::kNoError) {
pa_log("Failed to initialise AGC");
goto fail;
}
ec->params.webrtc.agc = true;
}
if (ns)
config.noise_suppression.enabled = true;
else
config.noise_suppression.enabled = false;
apm->gain_control()->Enable(true);
if (tns)
config.transient_suppression.enabled = true;
else
config.transient_suppression.enabled = false;
if (dgc) {
ec->params.webrtc.agc = false;
config.gain_controller1.enabled = true;
if (mobile)
config.gain_controller1.mode = webrtc::AudioProcessing::Config::GainController1::kFixedDigital;
else
config.gain_controller1.mode = webrtc::AudioProcessing::Config::GainController1::kAdaptiveDigital;
} else if (agc) {
ec->params.webrtc.agc = true;
config.gain_controller1.enabled = true;
config.gain_controller1.mode = webrtc::AudioProcessing::Config::GainController1::kAdaptiveAnalog;
config.gain_controller1.analog_level_minimum = 0;
config.gain_controller1.analog_level_maximum = WEBRTC_AGC_MAX_VOLUME;
}
if (vad)
apm->voice_detection()->Enable(true);
config.voice_detection.enabled = true;
else
config.voice_detection.enabled = false;
if (post_amp) {
config.gain_controller2.enabled = true;
config.gain_controller2.fixed_digital.gain_db = (float)post_amp_gain;
config.gain_controller2.adaptive_digital.enabled = false;
} else
config.gain_controller2.enabled = false;
ec->params.webrtc.apm = apm;
ec->params.webrtc.rec_ss = *rec_ss;
@ -485,6 +293,8 @@ bool pa_webrtc_ec_init(pa_core *c, pa_echo_canceller *ec,
*nframes = ec->params.webrtc.blocksize;
ec->params.webrtc.first = true;
apm->ApplyConfig(config);
for (i = 0; i < rec_ss->channels; i++)
ec->params.webrtc.rec_buffer[i] = pa_xnew(float, *nframes);
for (i = 0; i < play_ss->channels; i++)
@ -496,10 +306,7 @@ bool pa_webrtc_ec_init(pa_core *c, pa_echo_canceller *ec,
fail:
if (ma)
pa_modargs_free(ma);
if (ec->params.webrtc.trace_callback) {
webrtc::Trace::ReturnTrace();
delete ((PaWebrtcTraceCallback *) ec->params.webrtc.trace_callback);
} if (apm)
if (apm)
delete apm;
return false;
@ -515,12 +322,6 @@ void pa_webrtc_ec_play(pa_echo_canceller *ec, const uint8_t *play) {
pa_deinterleave(play, (void **) buf, ss->channels, pa_sample_size(ss), n);
pa_assert_se(apm->ProcessReverseStream(buf, config, config, buf) == webrtc::AudioProcessing::kNoError);
/* FIXME: If ProcessReverseStream() makes any changes to the audio, such as
* applying intelligibility enhancement, those changes don't have any
* effect. This function is called at the source side, but the processing
* would have to be done in the sink to be able to feed the processed audio
* to speakers. */
}
void pa_webrtc_ec_record(pa_echo_canceller *ec, const uint8_t *rec, uint8_t *out) {
@ -538,7 +339,7 @@ void pa_webrtc_ec_record(pa_echo_canceller *ec, const uint8_t *rec, uint8_t *out
if (ec->params.webrtc.agc) {
pa_volume_t v = pa_echo_canceller_get_capture_volume(ec);
old_volume = webrtc_volume_from_pa(v);
apm->gain_control()->set_stream_analog_level(old_volume);
apm->set_stream_analog_level(old_volume);
}
apm->set_stream_delay_ms(0);
@ -553,7 +354,7 @@ void pa_webrtc_ec_record(pa_echo_canceller *ec, const uint8_t *rec, uint8_t *out
ec->params.webrtc.first = false;
new_volume = ec->params.webrtc.agc_start_volume;
} else {
new_volume = apm->gain_control()->stream_analog_level();
new_volume = apm->recommended_stream_analog_level();
}
if (old_volume != new_volume)
@ -564,9 +365,6 @@ void pa_webrtc_ec_record(pa_echo_canceller *ec, const uint8_t *rec, uint8_t *out
}
void pa_webrtc_ec_set_drift(pa_echo_canceller *ec, float drift) {
webrtc::AudioProcessing *apm = (webrtc::AudioProcessing*)ec->params.webrtc.apm;
apm->echo_cancellation()->set_stream_drift_samples(drift * ec->params.webrtc.blocksize);
}
void pa_webrtc_ec_run(pa_echo_canceller *ec, const uint8_t *rec, const uint8_t *play, uint8_t *out) {
@ -577,11 +375,6 @@ void pa_webrtc_ec_run(pa_echo_canceller *ec, const uint8_t *rec, const uint8_t *
void pa_webrtc_ec_done(pa_echo_canceller *ec) {
int i;
if (ec->params.webrtc.trace_callback) {
webrtc::Trace::ReturnTrace();
delete ((PaWebrtcTraceCallback *) ec->params.webrtc.trace_callback);
}
if (ec->params.webrtc.apm) {
delete (webrtc::AudioProcessing*)ec->params.webrtc.apm;
ec->params.webrtc.apm = NULL;