[Libva] [PATCH 2/4] Set the pipeline to use the new VP8 encoding shaders on BSW

Wed Jan 11 00:21:04 UTC 2017

On 10/01/17 22:02, Sean V Kelley wrote:
> From: "Xiang, Haihao" <haihao.xiang at intel.com>
> 
> Currently only one temporal layer is supported
> 
> Signed-off-by: Xiang, Haihao <haihao.xiang at intel.com>
> Reviewed-by: Sean V Kelley <seanvk at posteo.de>
> ---
>  src/Makefile.am        |    3 +
>  src/gen8_encoder_vp8.c |  140 +
>  src/gen8_mfc.c         |    8 +-
>  src/gen8_vme.c         |    5 +
>  src/i965_defines.h     |   10 +
>  src/i965_encoder.c     |    2 +
>  src/i965_encoder_vp8.c | 6697 ++++++++++++++++++++++++++++++++++++++++++++++++
>  src/i965_encoder_vp8.h | 2643 +++++++++++++++++++
>  8 files changed, 9507 insertions(+), 1 deletion(-)

I had a go with this on Kaby Lake.  In general, big win - looks like it can be under half the bitrate at comparable quality (though it was pretty terrible before...).

However, the rate control seems to do odd things when the bitrate is low relative to the frame size.  I can get GPU hangs and wildly varying output bitrate with it, though it seems ok at high bitrate.

I had a look around the rate control and found two minor issues in the RC configuration, though I don't think either of them is relevant to my problem (see below).  I can try to make a reproducer if this is not already a known issue.

Thanks,

- Mark

> ...
> +
> +static void
> +i965_encoder_vp8_get_misc_parameters(VADriverContextP ctx,
> +                                     struct encode_state *encode_state,
> +                                     struct intel_encoder_context *encoder_context)
> +{
> +    struct i965_encoder_vp8_context *vp8_context = encoder_context->vme_context;
> +
> +    if (vp8_context->internal_rate_mode == I965_BRC_CQP) {
> +        vp8_context->init_vbv_buffer_fullness_in_bit = 0;
> +        vp8_context->vbv_buffer_size_in_bit = 0;
> +        vp8_context->target_bit_rate = 0;
> +        vp8_context->max_bit_rate = 0;
> +        vp8_context->min_bit_rate = 0;
> +        vp8_context->brc_need_reset = 0;
> +    } else {
> +        vp8_context->gop_size = encoder_context->brc.gop_size;
> +
> +        if (encoder_context->brc.need_reset) {
> +            vp8_context->framerate = encoder_context->brc.framerate[0];
> +            vp8_context->vbv_buffer_size_in_bit = encoder_context->brc.hrd_buffer_size;
> +            vp8_context->init_vbv_buffer_fullness_in_bit = encoder_context->brc.hrd_initial_buffer_fullness;
> +            vp8_context->max_bit_rate = encoder_context->brc.bits_per_second[0]; // currently only one layer is supported
> +            vp8_context->brc_need_reset = (vp8_context->brc_initted && encoder_context->brc.need_reset);
> +
> +            if (vp8_context->internal_rate_mode == I965_BRC_CBR) {
> +                vp8_context->min_bit_rate = vp8_context->max_bit_rate;
> +                vp8_context->target_bit_rate = vp8_context->max_bit_rate;
> +            } else {
> +                assert(vp8_context->internal_rate_mode == I965_BRC_VBR);
> +                vp8_context->min_bit_rate = vp8_context->max_bit_rate * (2 * encoder_context->brc.target_percentage[0] - 100) / 100;

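If target percentage is < 50 then (2 * encoder_context->brc.target_percentage[0] - 100) should be negative.  Since the arithmetic is unsigned, it wraps around to a huge positive value instead (e.g. with a 32-bit unsigned int, a target percentage of 40 gives 2 * 40 - 100 = 4294967276 rather than -20), so you end up with a garbage number in min_bit_rate.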

> +                vp8_context->target_bit_rate = vp8_context->max_bit_rate * encoder_context->brc.target_percentage[0] / 100;
> +            }
> +        }
> +    }
> +
> +    if (encoder_context->quality_level == ENCODER_LOW_QUALITY)
> +        vp8_context->hme_16x_supported = 0;
> +}
> +
> ...
> +
> +static void
> +i965_encoder_vp8_vme_brc_init_reset_set_curbe(VADriverContextP ctx,
> +                                              struct encode_state *encode_state,
> +                                              struct intel_encoder_context *encoder_context,
> +                                              struct i965_gpe_context *gpe_context)
> +{
> +    struct i965_encoder_vp8_context *vp8_context = encoder_context->vme_context;
> +    VAEncPictureParameterBufferVP8 *pic_param = (VAEncPictureParameterBufferVP8 *)encode_state->pic_param_ext->buffer;
> +    struct vp8_brc_init_reset_curbe_data *pcmd = i965_gpe_context_map_curbe(gpe_context);
> +    double input_bits_per_frame, bps_ratio;
> +
> +    memset(pcmd, 0, sizeof(*pcmd));
> +
> +    pcmd->dw0.profile_level_max_frame = vp8_context->frame_width * vp8_context->frame_height;
> +    pcmd->dw1.init_buf_full_in_bits = vp8_context->init_vbv_buffer_fullness_in_bit;
> +    pcmd->dw2.buf_size_in_bits = vp8_context->vbv_buffer_size_in_bit;
> +    pcmd->dw3.average_bitrate = ALIGN(vp8_context->target_bit_rate, VP8_BRC_KBPS) / VP8_BRC_KBPS * VP8_BRC_KBPS;
> +    pcmd->dw4.max_bitrate = ALIGN(vp8_context->max_bit_rate, VP8_BRC_KBPS) / VP8_BRC_KBPS * VP8_BRC_KBPS;

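VP8_BRC_KBPS is 1000, which is not a power of two, so the ALIGN macro isn't doing anything sensible here (assuming it rounds up by masking off the low bits, which only works when the alignment is a power of two).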

> +    pcmd->dw6.frame_rate_m = vp8_context->framerate.num;
> +    pcmd->dw7.frame_rate_d = vp8_context->framerate.den;
> +    pcmd->dw8.brc_flag = 0;
> +    pcmd->dw8.gop_minus1 = vp8_context->gop_size - 1;
> +
> +    if (vp8_context->internal_rate_mode == I965_BRC_CBR) {
> +        pcmd->dw4.max_bitrate = pcmd->dw3.average_bitrate;
> +
> +        pcmd->dw8.brc_flag = pcmd->dw8.brc_flag | BRC_KERNEL_CBR;
> +    } else if (vp8_context->internal_rate_mode == I965_BRC_VBR) {
> +        if (pcmd->dw4.max_bitrate < pcmd->dw3.average_bitrate) {
> +            pcmd->dw4.max_bitrate = 2 * pcmd->dw3.average_bitrate;
> +        }
> +
> +        pcmd->dw8.brc_flag = pcmd->dw8.brc_flag | BRC_KERNEL_VBR;
> +    }
> +
> +    input_bits_per_frame =
> +        ((double)(pcmd->dw4.max_bitrate) * (double)(pcmd->dw7.frame_rate_d) /
> +         (double)(pcmd->dw6.frame_rate_m));
> +
> +    if (pcmd->dw2.buf_size_in_bits < (unsigned int)input_bits_per_frame * 4) {
> +        pcmd->dw2.buf_size_in_bits = (unsigned int)input_bits_per_frame * 4;
> +    }
> +
> +    if (pcmd->dw1.init_buf_full_in_bits == 0) {
> +        pcmd->dw1.init_buf_full_in_bits = 7 * pcmd->dw2.buf_size_in_bits / 8;
> +    }
> +
> +    if (pcmd->dw1.init_buf_full_in_bits < (unsigned int)(input_bits_per_frame * 2)) {
> +        pcmd->dw1.init_buf_full_in_bits = (unsigned int)(input_bits_per_frame * 2);
> +    }
> +
> +    if (pcmd->dw1.init_buf_full_in_bits > pcmd->dw2.buf_size_in_bits) {
> +        pcmd->dw1.init_buf_full_in_bits = pcmd->dw2.buf_size_in_bits;
> +    }
> +
> +    bps_ratio = input_bits_per_frame / ((double)(pcmd->dw2.buf_size_in_bits) / 30);
> +    bps_ratio = (bps_ratio < 0.1) ? 0.1 : (bps_ratio > 3.5) ? 3.5 : bps_ratio;
> +
> +    pcmd->dw9.frame_width_in_bytes = vp8_context->frame_width;
> +    pcmd->dw10.frame_height_in_bytes = vp8_context->frame_height;
> +    pcmd->dw10.avbr_accuracy = 30;
> +    pcmd->dw11.avbr_convergence = 150;
> +    pcmd->dw11.min_qp = pic_param->clamp_qindex_low;
> +    pcmd->dw12.max_qp = pic_param->clamp_qindex_high;
> +    pcmd->dw12.level_qp = 60;
> +
> +    // DW13 default 100
> +    pcmd->dw13.max_section_pct = 100;
> +    pcmd->dw13.under_shoot_cbr_pct = 115;
> +
> +    // DW14 default 100
> +    pcmd->dw14.min_section_pct = 100;
> +    pcmd->dw14.vbr_bias_pct = 100;
> +    pcmd->dw15.instant_rate_threshold_0_for_p = 30;
> +    pcmd->dw15.instant_rate_threshold_1_for_p = 50;
> +    pcmd->dw15.instant_rate_threshold_2_for_p = 70;
> +    pcmd->dw15.instant_rate_threshold_3_for_p = 120;
> +
> +    pcmd->dw17.instant_rate_threshold_0_for_i = 30;
> +    pcmd->dw17.instant_rate_threshold_1_for_i = 50;
> +    pcmd->dw17.instant_rate_threshold_2_for_i = 90;
> +    pcmd->dw17.instant_rate_threshold_3_for_i = 115;
> +    pcmd->dw18.deviation_threshold_0_for_p = (unsigned int)(-50 * pow(0.9, bps_ratio));
> +    pcmd->dw18.deviation_threshold_1_for_p = (unsigned int)(-50 * pow(0.66, bps_ratio));
> +    pcmd->dw18.deviation_threshold_2_for_p = (unsigned int)(-50 * pow(0.46, bps_ratio));
> +    pcmd->dw18.deviation_threshold_3_for_p = (unsigned int)(-50 * pow(0.3, bps_ratio));
> +    pcmd->dw19.deviation_threshold_4_for_p = (unsigned int)(50 * pow(0.3, bps_ratio));
> +    pcmd->dw19.deviation_threshold_5_for_p = (unsigned int)(50 * pow(0.46, bps_ratio));
> +    pcmd->dw19.deviation_threshold_6_for_p = (unsigned int)(50 * pow(0.7, bps_ratio));
> +    pcmd->dw19.deviation_threshold_7_for_p = (unsigned int)(50 * pow(0.9, bps_ratio));
> +    pcmd->dw20.deviation_threshold_0_for_vbr = (unsigned int)(-50 * pow(0.9, bps_ratio));
> +    pcmd->dw20.deviation_threshold_1_for_vbr = (unsigned int)(-50 * pow(0.7, bps_ratio));
> +    pcmd->dw20.deviation_threshold_2_for_vbr = (unsigned int)(-50 * pow(0.5, bps_ratio));
> +    pcmd->dw20.deviation_threshold_3_for_vbr = (unsigned int)(-50 * pow(0.3, bps_ratio));
> +    pcmd->dw21.deviation_threshold_4_for_vbr = (unsigned int)(100 * pow(0.4, bps_ratio));
> +    pcmd->dw21.deviation_threshold_5_for_vbr = (unsigned int)(100 * pow(0.5, bps_ratio));
> +    pcmd->dw21.deviation_threshold_6_for_vbr = (unsigned int)(100 * pow(0.75, bps_ratio));
> +    pcmd->dw21.deviation_threshold_7_for_vbr = (unsigned int)(100 * pow(0.9, bps_ratio));
> +    pcmd->dw22.deviation_threshold_0_for_i = (unsigned int)(-50 * pow(0.8, bps_ratio));
> +    pcmd->dw22.deviation_threshold_1_for_i = (unsigned int)(-50 * pow(0.6, bps_ratio));
> +    pcmd->dw22.deviation_threshold_2_for_i = (unsigned int)(-50 * pow(0.34, bps_ratio));
> +    pcmd->dw22.deviation_threshold_3_for_i = (unsigned int)(-50 * pow(0.2, bps_ratio));
> +    pcmd->dw23.deviation_threshold_4_for_i = (unsigned int)(50 * pow(0.2, bps_ratio));
> +    pcmd->dw23.deviation_threshold_5_for_i = (unsigned int)(50 * pow(0.4, bps_ratio));
> +    pcmd->dw23.deviation_threshold_6_for_i = (unsigned int)(50 * pow(0.66, bps_ratio));
> +    pcmd->dw23.deviation_threshold_7_for_i = (unsigned int)(50 * pow(0.9, bps_ratio));
> +
> +    // Default: 1
> +    pcmd->dw24.num_t_levels = 1;
> +
> +    if (!vp8_context->brc_initted) {
> +        vp8_context->brc_init_current_target_buf_full_in_bits = pcmd->dw1.init_buf_full_in_bits;
> +    }
> +
> +    vp8_context->brc_init_reset_buf_size_in_bits = pcmd->dw2.buf_size_in_bits;
> +    vp8_context->brc_init_reset_input_bits_per_frame = input_bits_per_frame;
> +
> +    pcmd->dw26.history_buffer_bti = VP8_BTI_BRC_INIT_RESET_HISTORY;
> +    pcmd->dw27.distortion_buffer_bti = VP8_BTI_BRC_INIT_RESET_DISTORTION;
> +
> +    i965_gpe_context_unmap_curbe(gpe_context);
> +}
> +
> ...