[Mesa-dev] [PATCH 07/70] i965: Combine the multiple pipelined register detection into one round-trip

Fri Aug 7 13:13:11 PDT 2015

Submitting each register access check as its own batch and then checking
all of the results with a single serialising read, rather than doing a
full submit-and-read cycle per register, can reduce detection times from
around 100us to 70us on a fast Haswell system.

Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
Cc: Kenneth Graunke <kenneth at whitecape.org>
---
 src/mesa/drivers/dri/i965/intel_screen.c | 165 +++++++++++++++++++------------
 1 file changed, 101 insertions(+), 64 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/intel_screen.c b/src/mesa/drivers/dri/i965/intel_screen.c
index 36c7bb2..0b60f13 100644
--- a/src/mesa/drivers/dri/i965/intel_screen.c
+++ b/src/mesa/drivers/dri/i965/intel_screen.c
@@ -1185,6 +1185,13 @@ intel_detect_timestamp(struct intel_screen *screen)
    return 0;
 }
 
+struct detect_pipelined_register {
+   uint32_t reg;            /* register the probe batch writes and reads back */
+   uint32_t expected_value; /* value written; a matching readback is success */
+   unsigned result;         /* bits OR'ed into hw_has_pipelined_register on success */
+   bool reset;              /* if true, the batch writes 0 to reg afterwards */
+};
+
 /**
  * Test if we can use MI_LOAD_REGISTER_MEM from an untrusted batchbuffer.
  *
@@ -1192,102 +1199,134 @@ intel_detect_timestamp(struct intel_screen *screen)
  * while others don't.  Instead of trying to enumerate every case, just
  * try and write a register and see if works.
  */
-static bool
-intel_detect_pipelined_register(struct intel_screen *screen,
-                                int reg, uint32_t expected_value, bool reset)
+static void
+__intel_detect_pipelined_registers(struct intel_screen *screen,
+                                   struct detect_pipelined_register *r, /* probes to run */
+                                   int count) /* number of entries in r */
 {
-   drm_intel_bo *results, *bo;
-   uint32_t *batch;
-   uint32_t offset = 0;
-   bool success = false;
+   drm_intel_bo *results;
+   int i;
+
+   if (count == 0)
+      return; /* nothing to probe */
 
    /* Create a zero'ed temporary buffer for reading our results */
    results = drm_intel_bo_alloc(screen->bufmgr, "registers", 4096, 0);
    if (results == NULL)
-      goto err;
-
-   bo = drm_intel_bo_alloc(screen->bufmgr, "batchbuffer", 4096, 0);
-   if (bo == NULL)
-      goto err_results;
+      return; /* no results buffer: no flag is set by this function */
 
-   if (drm_intel_bo_map(bo, 1))
-      goto err_batch;
+   /* Emit each access in a separate batch buffer so that if the kernel
+    * rejects an individual access attempt, we don't incorrectly assume
+    * all the register accesses are invalid.
+    */
+   for (i = 0; i < count; i++) {
+      drm_intel_bo *bo;
+      uint32_t *batch;
 
-   batch = bo->virtual;
+      bo = drm_intel_bo_alloc(screen->bufmgr, "batchbuffer", 4096, 0);
+      if (bo == NULL)
+         continue; /* probe i is not submitted; its results slot is never written */
 
-   /* Write the register. */
-   *batch++ = MI_LOAD_REGISTER_IMM | (3 - 2);
-   *batch++ = reg;
-   *batch++ = expected_value;
+      if (drm_intel_bo_map(bo, 1))
+         goto err_batch; /* skip this probe but still drop the bo reference */
 
-   /* Save the register's value back to the buffer. */
-   *batch++ = MI_STORE_REGISTER_MEM | (3 - 2);
-   *batch++ = reg;
-   drm_intel_bo_emit_reloc(bo, (char *)batch -(char *)bo->virtual,
-                           results, offset*sizeof(uint32_t),
-                           I915_GEM_DOMAIN_INSTRUCTION,
-                           I915_GEM_DOMAIN_INSTRUCTION);
-   *batch++ = results->offset + offset*sizeof(uint32_t);
+      batch = bo->virtual;
 
-   /* And afterwards clear the register */
-   if (reset) {
+      /* Write the register. */
       *batch++ = MI_LOAD_REGISTER_IMM | (3 - 2);
-      *batch++ = reg;
-      *batch++ = 0;
-   }
+      *batch++ = r[i].reg;
+      *batch++ = r[i].expected_value;
+
+      /* Save the register's value back to the buffer. */
+      *batch++ = MI_STORE_REGISTER_MEM | (3 - 2);
+      *batch++ = r[i].reg;
+      drm_intel_bo_emit_reloc(bo, (char *)batch -(char *)bo->virtual,
+                              results, i*sizeof(uint32_t), /* probe i uses dword i */
+                              I915_GEM_DOMAIN_INSTRUCTION,
+                              I915_GEM_DOMAIN_INSTRUCTION);
+      *batch++ = results->offset + i*sizeof(uint32_t);
+
+      /* And afterwards clear the register */
+      if (r[i].reset) {
+         *batch++ = MI_LOAD_REGISTER_IMM | (3 - 2);
+         *batch++ = r[i].reg;
+         *batch++ = 0;
+      }
 
-   *batch++ = MI_BATCH_BUFFER_END;
+      *batch++ = MI_BATCH_BUFFER_END;
 
-   drm_intel_bo_mrb_exec(bo, ALIGN((char *)batch - (char *)bo->virtual, 8),
-                         NULL, 0, 0,
-                         I915_EXEC_RENDER);
+      drm_intel_bo_mrb_exec(bo, ALIGN((char *)batch - (char *)bo->virtual, 8),
+                            NULL, 0, 0,
+                            I915_EXEC_RENDER); /* return value is not checked */
 
-   /* Check whether the value got written. */
+err_batch:
+      drm_intel_bo_unreference(bo);
+   }
+
+   /* Check whether the values got written. */
    if (drm_intel_bo_map(results, false) == 0) {
-      success = *((uint32_t *)results->virtual + offset) == expected_value;
+      uint32_t *data = results->virtual;
+      for (i = 0; i < count; i++)
+         if (data[i] == r[i].expected_value)
+            screen->hw_has_pipelined_register |= r[i].result;
       drm_intel_bo_unmap(results);
    }
 
-err_batch:
-   drm_intel_bo_unreference(bo);
-err_results:
    drm_intel_bo_unreference(results);
-err:
-   return success;
 }
 
 static bool
-intel_detect_pipelined_so(struct intel_screen *screen)
+intel_detect_pipelined_so(struct intel_screen *screen,
+                          struct detect_pipelined_register *detect)
 {
-   /* Supposedly, Broadwell just works. */
-   if (screen->devinfo->gen >= 8)
-      return true;
-
    if (screen->devinfo->gen <= 6)
-      return false;
+      return 0;
+
+   /* Supposedly, Broadwell just works. */
+   if (screen->devinfo->gen >= 8) {
+      screen->hw_has_pipelined_register |= HW_HAS_PIPELINED_SOL_OFFSET;
+      return 0;
+   }
 
    /* We use SO_WRITE_OFFSET0 since you're supposed to write it (unlike the
     * statistics registers), and we already reset it to zero before using it.
     */
-   return intel_detect_pipelined_register(screen,
-                                          GEN7_SO_WRITE_OFFSET(0),
-                                          0x1337d0d0,
-                                          false);
+   detect->reg = GEN7_SO_WRITE_OFFSET(0);
+   detect->expected_value = 0x1337d0d0;
+   detect->result = HW_HAS_PIPELINED_SOL_OFFSET;
+   detect->reset = false;
+   return 1;
 }
 
-static bool
-intel_detect_pipelined_oacontrol(struct intel_screen *screen)
+static int
+intel_detect_pipelined_oacontrol(struct intel_screen *screen,
+                                 struct detect_pipelined_register *detect)
 {
    if (screen->devinfo->gen < 6 || screen->devinfo->gen >= 8)
-      return false;
+      return 0; /* no probe queued for this generation */
 
    /* Set "Select Context ID" to a particular address (which is likely not a
     * context), but leave all counting disabled.  This should be harmless.
     */
-   return intel_detect_pipelined_register(screen,
-                                          OACONTROL,
-                                          0x31337000,
-                                          true);
+   detect->reg = OACONTROL;
+   detect->expected_value = 0x31337000;
+   detect->result = HW_HAS_PIPELINED_OACONTROL;
+   detect->reset = true; /* the probe batch writes 0 back after the readback */
+   return 1; /* one probe queued in *detect */
+}
+
+static void
+intel_detect_pipelined_register_access(struct intel_screen *screen)
+{
+   struct detect_pipelined_register regs[2], *r =regs; /* one slot per helper */
+
+   /* Each helper fills *r and returns 1 if a probe is needed (else 0); the
+    * r - regs collected probes are then checked with one results readback.
+    */
+   r += intel_detect_pipelined_so(screen, r);
+   r += intel_detect_pipelined_oacontrol(screen, r);
+
+   __intel_detect_pipelined_registers(screen, regs, r-regs);
 }
 
 /**
@@ -1549,10 +1588,8 @@ __DRIconfig **intelInitScreen2(__DRIscreen *psp)
 
    intelScreen->hw_has_swizzling = intel_detect_swizzling(intelScreen);
    intelScreen->hw_has_timestamp = intel_detect_timestamp(intelScreen);
-   if (intel_detect_pipelined_so(intelScreen))
-      intelScreen->hw_has_pipelined_register |= HW_HAS_PIPELINED_SOL_OFFSET;
-   if (intel_detect_pipelined_oacontrol(intelScreen))
-      intelScreen->hw_has_pipelined_register |= HW_HAS_PIPELINED_OACONTROL;
+
+   intel_detect_pipelined_register_access(intelScreen);
 
    const char *force_msaa = getenv("INTEL_FORCE_MSAA");
    if (force_msaa) {
-- 
2.5.0