[pulseaudio-discuss] [PATCH v3 20/24] echo-cancel: Add beamforming support in the webrtc canceller

Wed Feb 17 13:17:17 UTC 2016

On Mon, 2016-02-15 at 13:12 +0200, Tanu Kaskinen wrote:
> On Mon, 2016-01-18 at 13:06 +0530, arun at accosted.net wrote:
> > From: Arun Raghavan <git at arunraghavan.net>
> > 
> > ---
> >  src/modules/echo-cancel/webrtc.cc | 138 +++++++++++++++++++++++++++++++++++++-
> >  1 file changed, 137 insertions(+), 1 deletion(-)
> > 
> > diff --git a/src/modules/echo-cancel/webrtc.cc b/src/modules/echo-cancel/webrtc.cc
> > index 5f00286..9ae5d77 100644
> > --- a/src/modules/echo-cancel/webrtc.cc
> > +++ b/src/modules/echo-cancel/webrtc.cc
> > @@ -52,6 +52,7 @@ PA_C_DECL_END
> >  #define DEFAULT_INTELLIGIBILITY_ENHANCER false
> >  #define DEFAULT_EXPERIMENTAL_AGC false
> >  #define DEFAULT_AGC_START_VOLUME 85
> > +#define DEFAULT_BEAMFORMING false
> >  #define DEFAULT_TRACE false
> >  
> >  #define WEBRTC_AGC_MAX_VOLUME 255
> > @@ -70,6 +71,9 @@ static const char* const valid_modargs[] = {
> >      "intelligibility_enhancer",
> >      "experimental_agc",
> >      "agc_start_volume",
> > +    "beamforming",
> > +    "mic_geometry", /* documented in parse_mic_geometry() */
> > +    "target_direction", /* documented in parse_mic_geometry() */
> >      "trace",
> >      NULL
> >  };
> > @@ -138,6 +142,76 @@ static void pa_webrtc_ec_fixate_spec(pa_sample_spec *rec_ss, pa_channel_map *rec
> >      play_ss->rate = rec_ss->rate;
> >  }
> >  
> > +static bool parse_point(const char **point, float (&f)[3]) {
> > +    int ret, length;
> > +
> > +    ret = sscanf(*point, "%g,%g,%g%n", &f[0], &f[1], &f[2], &length);
> > +    if (ret != 3)
> > +        return false;
> > +
> > +    /* Consume the bytes we've read so far */
> > +    *point += length;
> > +
> > +    /* Consume trailing comma if there is one */
> > +    if (**point == ',')
> > +        (*point)++;
> 
> I don't think this should be done in this function. When expecting only
> one point, we should fail on "1,2,3," but this code accepts that
> string.

Okay.

> > +
> > +    return true;
> > +}
> > +
> > +static bool parse_mic_geometry(const char **mic_geometry, std::vector& geometry) {
> > +    /* The microphone geometry is expressed as cartesian point form:
> > +     *   x1,y1,z1,x2,y2,z2,...
> > +     *
> > +     * Where x1,y1,z1 is the position of the first microphone with regards to
> > +     * the array's "center", x2,y2,z2 the position of the second, and so on.
> > +     *
> > +     * 'x' is the horizontal coordinate, with positive values being to the
> > +     * right from the mic array's perspective.
> > +     *
> > +     * 'y' is the depth coordinate, with positive values being in front of the
> > +     * array.
> > +     *
> > +     * 'z' is the vertical coordinate, with positive values being above the
> > +     * array.
> > +     *
> > +     * All distances are in meters.
> > +     */
> 
> It's a bit unclear what "the mic array's perspective" means. Should I
> imagine the array as a person standing upright, looking straight ahead?
> Then positive x is to the right from the "person's" point of view,
> positive y is in front of the array guy and positive z is above the
> array.

Yes.

> > +
> > +    /* The target direction is expected to be in spherical point form:
> > +     *   a,e,r
> > +     *
> > +     * Where 'a is the azimuth of the first mic channel, 'e' its elevation,
> > +     * and 'r' the radius.
> > +     *
> > +     * 0 radians azimuth is to the right of the array, and positive angles
> > +     * move in a counter-clockwise direction.
> > +     *
> > +     * 0 radians elevation is horizontal w.r.t. the array, and positive
> > +     * angles go upwards.
> > +     *
> > +     * radius is distance from the array center in meters.
> > +     */
> > +
> > +    int i;
> > +    float f[3];
> > +
> > +    for (i = 0; i < geometry.size(); i++) {
> > +        if (!parse_point(mic_geometry, f)) {
> > +            pa_log("Failed to parse channel %d in mic_geometry", i);
> > +            return false;
> > +        }
> > +
> > +        pa_log_debug("Got mic #%d position: (%g, %g, %g)", i, f[0], f[1], f[2]);
> > +
> > +        geometry[i].c[0] = f[0];
> > +        geometry[i].c[1] = f[1];
> > +        geometry[i].c[2] = f[2];
> > +    }
> > +
> > +    return true;
> > +}
> > +
> >  bool pa_webrtc_ec_init(pa_core *c, pa_echo_canceller *ec,
> >                         pa_sample_spec *rec_ss, pa_channel_map *rec_map,
> >                         pa_sample_spec *play_ss, pa_channel_map *play_map,
> > @@ -146,7 +220,7 @@ bool pa_webrtc_ec_init(pa_core *c, pa_echo_canceller *ec,
> >      webrtc::AudioProcessing *apm = NULL;
> >      webrtc::ProcessingConfig pconfig;
> >      webrtc::Config config;
> > -    bool hpf, ns, agc, dgc, mobile, cn, vad, ext_filter, intelligibility, experimental_agc;
> > +    bool hpf, ns, agc, dgc, mobile, cn, vad, ext_filter, intelligibility, experimental_agc, beamforming;
> >      int rm = -1;
> >      uint32_t agc_start_volume;
> >      pa_modargs *ma;
> > @@ -255,6 +329,12 @@ bool pa_webrtc_ec_init(pa_core *c, pa_echo_canceller *ec,
> >      }
> >      ec->params.webrtc.agc_start_volume = agc_start_volume;
> >  
> > +    beamforming = DEFAULT_BEAMFORMING;
> > +    if (pa_modargs_get_value_boolean(ma, "beamforming", &beamforming) < 0) {
> > +        pa_log("Failed to parse beamforming value");
> > +        goto fail;
> > +    }
> > +
> >      if (ext_filter)
> >          config.Set(new webrtc::ExtendedFilter(true));
> >      if (intelligibility)
> > @@ -276,6 +356,62 @@ bool pa_webrtc_ec_init(pa_core *c, pa_echo_canceller *ec,
> >  
> >      pa_webrtc_ec_fixate_spec(rec_ss, rec_map, play_ss, play_map, out_ss, out_map);
> >  
> > +    /* We do this after fixate because we need the capture channel count */
> > +    if (beamforming) {
> > +        std::vector geometry(rec_ss->channels);
> > +        webrtc::SphericalPointf direction(0.0f, 0.0f, 0.0f);
> > +        const char *mic_geometry, *target_direction;
> > +
> > +        if (!(mic_geometry = pa_modargs_get_value(ma, "mic_geometry", NULL))) {
> > +            pa_log("mic_geometry must be set if beamforming is enabled");
> > +            goto fail;
> > +        }
> > +
> > +        if (!parse_mic_geometry(&mic_geometry, geometry)) {
> > +            pa_log("Failed to parse mic_geometry value");
> > +            goto fail;
> > +        }
> > +
> > +        if (*mic_geometry != '\0') {
> > +            pa_log("Failed to parse mic_geometry value: more parameters than expected");
> > +            goto fail;
> > +        }
> 
> This check could be moved to parse_mic_geometry().

Fair enough.

> > +
> > +        if ((target_direction = pa_modargs_get_value(ma, "target_direction", NULL))) {
> > +            float f[3];
> > +
> > +            if (!parse_point(&target_direction, f)) {
> > +                pa_log("Failed to parse target_direction value");
> > +                goto fail;
> > +            }
> > +
> > +            if (*target_direction != '\0') {
> > +                pa_log("Failed to parse target_direction value: more parameters than expected");
> > +                goto fail;
> > +            }
> > +
> > +#define IS_ZERO(f) ((f) < 0.000001 && (f) > -0.000001)
> 
> This kind of macro would be nice to have in some shared location.
> speex.c and a couple of tests have similar code, and I remember needing
> it in the volume control patches.

I can do that in a separate commit.

> > +
> > +	    if (!IS_ZERO(f[1]) || !IS_ZERO(f[2])) {
> 
> This line contains a tab character.

Fixed.

> > +                pa_log("The beamformer currently only supports targeting along the azimuth");
> > +                goto fail;
> > +            }
> > +
> > +            direction.s[0] = f[0];
> > +            direction.s[1] = f[1];
> > +            direction.s[2] = f[2];
> > +        }
> > +
> > +        /* The beamformer gives us a single channel */
> > +        out_ss->channels = 1;
> > +        pa_channel_map_init_mono(out_map);
> 
> Could this be moved to fixate_spec()? I don't like the audio format
> parameters being calculated all over the place.

I wanted to do that, but we can't because we need the capture channel
count for parsing the mic geometry (there is a comment explaining that
where if (beamforming) { ... } is).

-- Arun