[Libva] [PATCH] memman: implement PIPE_CONTROL workaround.

Wed Jul 11 19:21:50 PDT 2012

On Wed, 2012-07-11 at 09:16 +0200, Gwenole Beauchesne wrote: 
> From: Daniel Vetter <daniel.vetter at ffwll.ch>
> 
> Sandybridge requires an elaborate dance to flush caches without
> hanging the gpu. See public docs Vol2Part1 1.7.4.1 PIPE_CONTROL
> or the corresponding code in mesa/kernel.
> 
> v2: Incorporate review from Chris Wilson. For paranoia keep all three
> PIPE_CONTROL cmds in the same batchbuffer to avoid upsetting the gpu.
> 
> Signed-off-by: Daniel Vetter <daniel.vetter at ffwll.ch>
> [Gwenole: merged from xf86-video-intel]
> Signed-off-by: Gwenole Beauchesne <gwenole.beauchesne at intel.com>
> ---
>  src/intel_batchbuffer.c |   54 +++++++++++++++++++++++++++++++++++-----------
>  src/intel_driver.h      |    3 ++
>  src/intel_memman.c      |    6 +++++
>  3 files changed, 50 insertions(+), 13 deletions(-)
> 
> diff --git a/src/intel_batchbuffer.c b/src/intel_batchbuffer.c
> index 0b52281..77a2c90 100644
> --- a/src/intel_batchbuffer.c
> +++ b/src/intel_batchbuffer.c
> @@ -154,6 +154,36 @@ intel_batchbuffer_data(struct intel_batchbuffer *batch,
>      batch->ptr += size;
>  }
>  
> +static void
> +intel_batchbuffer_emit_post_sync_nonzero_flush(struct intel_batchbuffer *batch)
> +{
> +    struct intel_driver_data * const intel = batch->intel; 
> +
> +    /* Keep this entire sequence of 3 PIPE_CONTROL cmds in one batch to
> +       avoid upsetting the gpu. */
> +    BEGIN_BATCH(batch, 3*4);
> +    OUT_BATCH(batch, CMD_PIPE_CONTROL | (4 - 2));
> +    OUT_BATCH(batch, (CMD_PIPE_CONTROL_CS_STALL |
> +                      CMD_PIPE_CONTROL_STALL_AT_SCOREBOARD));
> +    OUT_BATCH(batch, 0); /* address */
> +    OUT_BATCH(batch, 0); /* write data */
> +
> +    OUT_BATCH(batch, CMD_PIPE_CONTROL | (4 - 2));
> +    OUT_BATCH(batch, CMD_PIPE_CONTROL_WRITE_QWORD);
> +    OUT_RELOC(batch, intel->wa_scratch_bo,
> +              I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, 0);
> +    OUT_BATCH(batch, 0); /* write data */

The length (DW0[7:0]) is 3 for a QWORD write. The batch buffer used on
Linux is a non-secure batch buffer, so the address given here must be
a PPGTT address, which is valid only with the Linux 3.4+(?) kernel. In
addition, is this operation needed? The doc only says some store data
commands are needed for TLB invalidate.

> +
> +    /* now finally the _real flush */
> +    OUT_BATCH(batch, CMD_PIPE_CONTROL | (4 - 2));
> +    OUT_BATCH(batch, (CMD_PIPE_CONTROL_WC_FLUSH |
> +                      CMD_PIPE_CONTROL_TC_FLUSH |
> +                      CMD_PIPE_CONTROL_NOWRITE));
> +    OUT_BATCH(batch, 0); /* write address */
> +    OUT_BATCH(batch, 0); /* write data */
> +    ADVANCE_BATCH(batch);
> +}
> +
>  void
>  intel_batchbuffer_emit_mi_flush(struct intel_batchbuffer *batch)
>  {
> @@ -162,24 +192,22 @@ intel_batchbuffer_emit_mi_flush(struct intel_batchbuffer *batch)
>      if (IS_GEN6(intel->device_id) ||
>          IS_GEN7(intel->device_id)) {
>          if (batch->flag == I915_EXEC_RENDER) {
> -            BEGIN_BATCH(batch, 4);
> -            OUT_BATCH(batch, CMD_PIPE_CONTROL | 0x2);
> -
> -            if (IS_GEN6(intel->device_id))
> -                OUT_BATCH(batch, 
> -                          CMD_PIPE_CONTROL_WC_FLUSH |
> -                          CMD_PIPE_CONTROL_TC_FLUSH |
> -                          CMD_PIPE_CONTROL_NOWRITE);
> -            else
> +            if (IS_GEN6(intel->device_id)) {
> +                /* HW workaround for Sandy Bridge */
> +                intel_batchbuffer_emit_post_sync_nonzero_flush(batch);
> +            }
> +            else {
> +                BEGIN_BATCH(batch, 4);
> +                OUT_BATCH(batch, CMD_PIPE_CONTROL | 0x2);
>                  OUT_BATCH(batch, 
>                            CMD_PIPE_CONTROL_WC_FLUSH |
>                            CMD_PIPE_CONTROL_TC_FLUSH |
>                            CMD_PIPE_CONTROL_DC_FLUSH |
>                            CMD_PIPE_CONTROL_NOWRITE);
> -
> -            OUT_BATCH(batch, 0);
> -            OUT_BATCH(batch, 0);
> -            ADVANCE_BATCH(batch);
> +                OUT_BATCH(batch, 0);
> +                OUT_BATCH(batch, 0);
> +                ADVANCE_BATCH(batch);
> +            }
>          } else {
>              if (batch->flag == I915_EXEC_BLT) {
>                  BEGIN_BLT_BATCH(batch, 4);
> diff --git a/src/intel_driver.h b/src/intel_driver.h
> index b383218..ad95e41 100644
> --- a/src/intel_driver.h
> +++ b/src/intel_driver.h
> @@ -42,6 +42,7 @@
>  #define BR13_8888                               (0x3 << 24)
>  
>  #define CMD_PIPE_CONTROL                        (CMD_3D | (3 << 27) | (2 << 24) | (0 << 16))
> +#define CMD_PIPE_CONTROL_CS_STALL               (1 << 20)
>  #define CMD_PIPE_CONTROL_NOWRITE                (0 << 14)
>  #define CMD_PIPE_CONTROL_WRITE_QWORD            (1 << 14)
>  #define CMD_PIPE_CONTROL_WRITE_DEPTH            (2 << 14)
> @@ -54,6 +55,7 @@
>  #define CMD_PIPE_CONTROL_DC_FLUSH               (1 << 5)
>  #define CMD_PIPE_CONTROL_GLOBAL_GTT             (1 << 2)
>  #define CMD_PIPE_CONTROL_LOCAL_PGTT             (0 << 2)
> +#define CMD_PIPE_CONTROL_STALL_AT_SCOREBOARD    (1 << 1)
>  #define CMD_PIPE_CONTROL_DEPTH_CACHE_FLUSH      (1 << 0)
>  
> 
> @@ -116,6 +118,7 @@ struct intel_driver_data
>      int locked;
>  
>      dri_bufmgr *bufmgr;
> +    dri_bo *wa_scratch_bo;
>  
>      unsigned int has_exec2  : 1; /* Flag: has execbuffer2? */
>      unsigned int has_bsd    : 1; /* Flag: has bitstream decoder for H.264? */
> diff --git a/src/intel_memman.c b/src/intel_memman.c
> index 7d56e96..cde267e 100644
> --- a/src/intel_memman.c
> +++ b/src/intel_memman.c
> @@ -38,12 +38,18 @@ intel_memman_init(struct intel_driver_data *intel)
>      assert(intel->bufmgr);
>      intel_bufmgr_gem_enable_reuse(intel->bufmgr);
>  
> +    if (IS_GEN6(intel->device_id)) {
> +        intel->wa_scratch_bo =
> +            drm_intel_bo_alloc(intel->bufmgr, "wa scratch", 4096, 4096);
> +        assert(intel->wa_scratch_bo);
> +    }
>      return True;
>  }
>  
>  Bool 
>  intel_memman_terminate(struct intel_driver_data *intel)
>  {
> +    drm_intel_bo_unreference(intel->wa_scratch_bo);
>      drm_intel_bufmgr_destroy(intel->bufmgr);
>      return True;
>  }