[Mesa-dev] [PATCH 07/70] i965: Combine the multiple pipelined register detection into one round-trip
Martin Peres
martin.peres at linux.intel.com
Mon Aug 10 05:17:26 PDT 2015
On 07/08/15 23:13, Chris Wilson wrote:
> Combining the multiple access checks into a few batches and a single
> serialising read can reduce detection times from around 100us to 70us on
> a fast Haswell system.
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Kenneth Graunke <kenneth at whitecape.org>
> ---
> src/mesa/drivers/dri/i965/intel_screen.c | 165 +++++++++++++++++++------------
> 1 file changed, 101 insertions(+), 64 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/intel_screen.c b/src/mesa/drivers/dri/i965/intel_screen.c
> index 36c7bb2..0b60f13 100644
> --- a/src/mesa/drivers/dri/i965/intel_screen.c
> +++ b/src/mesa/drivers/dri/i965/intel_screen.c
> @@ -1185,6 +1185,13 @@ intel_detect_timestamp(struct intel_screen *screen)
> return 0;
> }
>
> +struct detect_pipelined_register {
> + uint32_t reg;
> + uint32_t expected_value;
> + unsigned result;
> + bool reset;
> +};
> +
> /**
> * Test if we can use MI_LOAD_REGISTER_MEM from an untrusted batchbuffer.
> *
> @@ -1192,102 +1199,134 @@ intel_detect_timestamp(struct intel_screen *screen)
> * while others don't. Instead of trying to enumerate every case, just
> * try and write a register and see if works.
> */
> -static bool
> -intel_detect_pipelined_register(struct intel_screen *screen,
> - int reg, uint32_t expected_value, bool reset)
> +static void
> +__intel_detect_pipelined_registers(struct intel_screen *screen,
> + struct detect_pipelined_register *r,
> + int count)
> {
> - drm_intel_bo *results, *bo;
> - uint32_t *batch;
> - uint32_t offset = 0;
> - bool success = false;
> + drm_intel_bo *results;
> + int i;
> +
> + if (count == 0)
> + return;
>
> /* Create a zero'ed temporary buffer for reading our results */
> results = drm_intel_bo_alloc(screen->bufmgr, "registers", 4096, 0);
> if (results == NULL)
> - goto err;
> -
> - bo = drm_intel_bo_alloc(screen->bufmgr, "batchbuffer", 4096, 0);
> - if (bo == NULL)
> - goto err_results;
> + return;
>
> - if (drm_intel_bo_map(bo, 1))
> - goto err_batch;
> + /* Emit each access in a separate batch buffer so that if the kernel
> + * rejects an individual access attempt, we don't incorrectly assume
> + * all the register accesses are invalid.
> + */
> + for (i = 0; i < count; i++) {
> + drm_intel_bo *bo;
> + uint32_t *batch;
>
> - batch = bo->virtual;
> + bo = drm_intel_bo_alloc(screen->bufmgr, "batchbuffer", 4096, 0);
> + if (bo == NULL)
> + continue;
>
> - /* Write the register. */
> - *batch++ = MI_LOAD_REGISTER_IMM | (3 - 2);
> - *batch++ = reg;
> - *batch++ = expected_value;
> + if (drm_intel_bo_map(bo, 1))
> + goto err_batch;
>
> - /* Save the register's value back to the buffer. */
> - *batch++ = MI_STORE_REGISTER_MEM | (3 - 2);
> - *batch++ = reg;
> - drm_intel_bo_emit_reloc(bo, (char *)batch -(char *)bo->virtual,
> - results, offset*sizeof(uint32_t),
> - I915_GEM_DOMAIN_INSTRUCTION,
> - I915_GEM_DOMAIN_INSTRUCTION);
> - *batch++ = results->offset + offset*sizeof(uint32_t);
> + batch = bo->virtual;
>
> - /* And afterwards clear the register */
> - if (reset) {
> + /* Write the register. */
> *batch++ = MI_LOAD_REGISTER_IMM | (3 - 2);
> - *batch++ = reg;
> - *batch++ = 0;
> - }
> + *batch++ = r[i].reg;
> + *batch++ = r[i].expected_value;
> +
> + /* Save the register's value back to the buffer. */
> + *batch++ = MI_STORE_REGISTER_MEM | (3 - 2);
> + *batch++ = r[i].reg;
> + drm_intel_bo_emit_reloc(bo, (char *)batch -(char *)bo->virtual,
> + results, i*sizeof(uint32_t),
> + I915_GEM_DOMAIN_INSTRUCTION,
> + I915_GEM_DOMAIN_INSTRUCTION);
> + *batch++ = results->offset + i*sizeof(uint32_t);
> +
> + /* And afterwards clear the register */
> + if (r[i].reset) {
> + *batch++ = MI_LOAD_REGISTER_IMM | (3 - 2);
> + *batch++ = r[i].reg;
> + *batch++ = 0;
> + }
>
> - *batch++ = MI_BATCH_BUFFER_END;
> + *batch++ = MI_BATCH_BUFFER_END;
>
> - drm_intel_bo_mrb_exec(bo, ALIGN((char *)batch - (char *)bo->virtual, 8),
> - NULL, 0, 0,
> - I915_EXEC_RENDER);
> + drm_intel_bo_mrb_exec(bo, ALIGN((char *)batch - (char *)bo->virtual, 8),
> + NULL, 0, 0,
> + I915_EXEC_RENDER);
>
> - /* Check whether the value got written. */
> +err_batch:
> + drm_intel_bo_unreference(bo);
> + }
> +
> + /* Check whether the values got written. */
> if (drm_intel_bo_map(results, false) == 0) {
> - success = *((uint32_t *)results->virtual + offset) == expected_value;
> + uint32_t *data = results->virtual;
> + for (i = 0; i < count; i++)
> + if (data[i] == r[i].expected_value)
> + screen->hw_has_pipelined_register |= r[i].result;
> drm_intel_bo_unmap(results);
> }
>
> -err_batch:
> - drm_intel_bo_unreference(bo);
> -err_results:
> drm_intel_bo_unreference(results);
> -err:
> - return success;
> }
>
> static bool
> -intel_detect_pipelined_so(struct intel_screen *screen)
> +intel_detect_pipelined_so(struct intel_screen *screen,
> + struct detect_pipelined_register *detect)
> {
> - /* Supposedly, Broadwell just works. */
> - if (screen->devinfo->gen >= 8)
> - return true;
> -
> if (screen->devinfo->gen <= 6)
> - return false;
> + return 0;
> +
> + /* Supposedly, Broadwell just works. */
> + if (screen->devinfo->gen >= 8) {
> + screen->hw_has_pipelined_register |= HW_HAS_PIPELINED_SOL_OFFSET;
> + return 0;
> + }
>
> /* We use SO_WRITE_OFFSET0 since you're supposed to write it (unlike the
> * statistics registers), and we already reset it to zero before using it.
> */
> - return intel_detect_pipelined_register(screen,
> - GEN7_SO_WRITE_OFFSET(0),
> - 0x1337d0d0,
> - false);
> + detect->reg = GEN7_SO_WRITE_OFFSET(0);
> + detect->expected_value = 0x1337d0d0;
> + detect->result = HW_HAS_PIPELINED_SOL_OFFSET;
> + detect->reset = false;
> + return 1;
> }
>
> -static bool
> -intel_detect_pipelined_oacontrol(struct intel_screen *screen)
> +static int
> +intel_detect_pipelined_oacontrol(struct intel_screen *screen,
> + struct detect_pipelined_register *detect)
> {
> if (screen->devinfo->gen < 6 || screen->devinfo->gen >= 8)
> - return false;
> + return 0;
>
> /* Set "Select Context ID" to a particular address (which is likely not a
> * context), but leave all counting disabled. This should be harmless.
> */
> - return intel_detect_pipelined_register(screen,
> - OACONTROL,
> - 0x31337000,
> - true);
> + detect->reg = OACONTROL;
> + detect->expected_value = 0x31337000;
> + detect->result = HW_HAS_PIPELINED_OACONTROL;
> + detect->reset = true;
> + return 1;
> +}
> +
> +static void
> +intel_detect_pipelined_register_access(struct intel_screen *screen)
> +{
> + struct detect_pipelined_register regs[2], *r = regs;
> +
> + /* Combine the multiple register access validation into a single
> + * round trip through the kernel + GPU.
> + */
> + r += intel_detect_pipelined_so(screen, r);
> + r += intel_detect_pipelined_oacontrol(screen, r);
Not a fan of this construct. How about changing the return types of the
detect functions to int?
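Something like the following is what I had in mind (untested sketch only,
to illustrate the idea; the helper names are taken from your patch, but the
exact signatures and body are just my guess):

static int
intel_detect_pipelined_so(struct intel_screen *screen,
                          struct detect_pipelined_register *detect);

static int
intel_detect_pipelined_oacontrol(struct intel_screen *screen,
                                 struct detect_pipelined_register *detect);

static void
intel_detect_pipelined_register_access(struct intel_screen *screen)
{
   struct detect_pipelined_register regs[2];
   int count = 0;

   /* Each detect helper fills in at most one entry and returns how many
    * entries it wrote (0 or 1), so the caller accumulates a plain count
    * instead of advancing a pointer.
    */
   count += intel_detect_pipelined_so(screen, &regs[count]);
   count += intel_detect_pipelined_oacontrol(screen, &regs[count]);

   __intel_detect_pipelined_registers(screen, regs, count);
}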
Other than that, it looks good:
Reviewed-by: Martin Peres <martin.peres at linux.intel.com>
> +
> + __intel_detect_pipelined_registers(screen, regs, r-regs);
> }
>
> /**
> @@ -1549,10 +1588,8 @@ __DRIconfig **intelInitScreen2(__DRIscreen *psp)
>
> intelScreen->hw_has_swizzling = intel_detect_swizzling(intelScreen);
> intelScreen->hw_has_timestamp = intel_detect_timestamp(intelScreen);
> - if (intel_detect_pipelined_so(intelScreen))
> - intelScreen->hw_has_pipelined_register |= HW_HAS_PIPELINED_SOL_OFFSET;
> - if (intel_detect_pipelined_oacontrol(intelScreen))
> - intelScreen->hw_has_pipelined_register |= HW_HAS_PIPELINED_OACONTROL;
> +
> + intel_detect_pipelined_register_access(intelScreen);
>
> const char *force_msaa = getenv("INTEL_FORCE_MSAA");
> if (force_msaa) {