[Mesa-dev] [PATCH 5/5] i965: Combine the multiple pipelined register detections into one round-trip

Wed Jul 8 06:48:42 PDT 2015

Combining the multiple access checks into a few batches and a single
serialising read can reduce detection times from around 100us to 70us on
a fast Haswell system.

Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
Cc: Kenneth Graunke <kenneth at whitecape.org>
---
 src/mesa/drivers/dri/i965/intel_screen.c | 177 +++++++++++++++++++------------
 1 file changed, 109 insertions(+), 68 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/intel_screen.c b/src/mesa/drivers/dri/i965/intel_screen.c
index cb49e9a..595d2dc 100644
--- a/src/mesa/drivers/dri/i965/intel_screen.c
+++ b/src/mesa/drivers/dri/i965/intel_screen.c
@@ -1156,6 +1156,12 @@ intel_detect_timestamp(struct intel_screen *screen)
    return loop > 0;
 }
 
+struct detect_pipelined_register {
+   uint32_t reg;            /* register offset to write and read back */
+   uint32_t expected_value; /* value written; must be read back to pass */
+   bool *result;            /* receives whether the read-back matched */
+};
+
 /**
  * Test if we can use MI_LOAD_REGISTER_MEM from an untrusted batchbuffer.
  *
@@ -1163,107 +1169,143 @@ intel_detect_timestamp(struct intel_screen *screen)
  * while others don't.  Instead of trying to enumerate every case, just
  * try and write a register and see if works.
  */
-static bool
-intel_detect_pipelined_register(struct intel_screen *screen,
-				int reg, uint32_t expected_value)
+static void
+__intel_detect_pipelined_registers(struct intel_screen *screen,
+                                   struct detect_pipelined_register *r,
+                                   int count)
 {
    const int offset = 100;
-
-   drm_intel_bo *bo;
-   uint32_t buf[100];
-   uint32_t *batch = buf;
+   int i;
 
    uint32_t *data;
+
+   if (count == 0)
+      return;
+
+   if (drm_intel_bo_map(screen->workaround_bo, true))
+      return;
+
    /* Set a value in a BO to a known quantity.  The workaround BO already
     * exists and doesn't contain anything important, so we may as well use it.
     */
-   if (drm_intel_bo_map(screen->workaround_bo, true))
-      return false;
-
    data = screen->workaround_bo->virtual;
-   data[offset] = 0xffffffff;
+   for (i = 0; i < count; i++)
+      data[offset+i] = 0xffffffff;
    drm_intel_bo_unmap(screen->workaround_bo);
 
-   bo = drm_intel_bo_alloc(screen->bufmgr, "batchbuffer", 4096, 0);
-   if (bo == NULL)
-      return false;
+   /* Emit each access in a separate batch buffer so that if the kernel
+    * rejects an individual access attempt, we don't incorrectly assume
+    * all the register accesses are invalid.
+    */
+   for (i = 0; i < count; i++) {
+      drm_intel_bo *bo;
+      uint32_t buf[100];
+      uint32_t *batch = buf;
+
+      bo = drm_intel_bo_alloc(screen->bufmgr, "batchbuffer", 4096, 0);
+      if (bo == NULL)
+         continue;
+
+      /* Write the register. */
+      *batch++ = MI_LOAD_REGISTER_IMM | (3 - 2);
+      *batch++ = r[i].reg;
+      *batch++ = r[i].expected_value;
+
+      /* Force a command barrier between the write and the read */
+      *batch++ = _3DSTATE_PIPE_CONTROL | (5 - 2);
+      *batch++ = PIPE_CONTROL_NO_WRITE | PIPE_CONTROL_CS_STALL;
+      *batch++ = 0;
+      *batch++ = 0;
+      *batch++ = 0;
 
-   /* Write the register. */
-   *batch++ = MI_LOAD_REGISTER_IMM | (3 - 2);
-   *batch++ = reg;
-   *batch++ = expected_value;
-
-   /* Force a command barrier between the write then read */
-   *batch++ = _3DSTATE_PIPE_CONTROL | (5 - 2);
-   *batch++ = PIPE_CONTROL_NO_WRITE | PIPE_CONTROL_CS_STALL;
-   *batch++ = 0;
-   *batch++ = 0;
-   *batch++ = 0;
-
-   /* Save the register's value back to the buffer. */
-   *batch++ = MI_STORE_REGISTER_MEM | (3 - 2);
-   *batch++ = reg;
-   drm_intel_bo_emit_reloc(bo, (char *)batch -(char *)buf,
-                           screen->workaround_bo, offset*sizeof(uint32_t),
-                           I915_GEM_DOMAIN_INSTRUCTION,
-                           I915_GEM_DOMAIN_INSTRUCTION);
-   *batch++ = screen->workaround_bo->offset + offset*sizeof(uint32_t);
-
-   /* And afterwards clear the register */
-   *batch++ = MI_LOAD_REGISTER_IMM | (3 - 2);
-   *batch++ = reg;
-   *batch++ = 0;
-
-   *batch++ = MI_BATCH_BUFFER_END;
-   if ((batch - buf) & 1)
+      /* Save the register's value back to the buffer. */
+      *batch++ = MI_STORE_REGISTER_MEM | (3 - 2);
+      *batch++ = r[i].reg;
+      drm_intel_bo_emit_reloc(bo, (char *)batch -(char *)buf,
+                              screen->workaround_bo,
+                              (offset+i)*sizeof(uint32_t),
+                              I915_GEM_DOMAIN_INSTRUCTION,
+                              I915_GEM_DOMAIN_INSTRUCTION);
+      *batch++ = screen->workaround_bo->offset + (offset+i)*sizeof(uint32_t);
+
+      /* And afterwards clear the register */
+      *batch++ = MI_LOAD_REGISTER_IMM | (3 - 2);
+      *batch++ = r[i].reg;
       *batch++ = 0;
 
-   if (drm_intel_bo_subdata(bo, 0, (char *)batch - (char *)buf, buf) == 0)
-      drm_intel_bo_mrb_exec(bo, (char *)batch - (char *)buf,
-                            NULL, 0, 0,
-                            I915_EXEC_RENDER);
+      *batch++ = MI_BATCH_BUFFER_END;
+      if ((batch - buf) & 1)
+         *batch++ = 0;
 
-   drm_intel_bo_unreference(bo);
+      if (drm_intel_bo_subdata(bo, 0, (char *)batch - (char *)buf, buf) == 0)
+         drm_intel_bo_mrb_exec(bo, (char *)batch - (char *)buf,
+                               NULL, 0, 0,
+                               I915_EXEC_RENDER);
 
-   /* Check whether the value got written. */
-   bool success = false;
+      drm_intel_bo_unreference(bo);
+   }
+
+   /* Check whether the values got written. */
    if (drm_intel_bo_map(screen->workaround_bo, false) == 0) {
       data = screen->workaround_bo->virtual;
-      success = data[offset] == expected_value;
+      for (i = 0; i < count; i++)
+         *r[i].result = data[offset+i] == r[i].expected_value;
       drm_intel_bo_unmap(screen->workaround_bo);
    }
-
-   return success;
 }
 
 static bool
-intel_detect_pipelined_so(struct intel_screen *screen)
+intel_detect_pipelined_so(struct intel_screen *screen,
+                          struct detect_pipelined_register *detect)
 {
-   /* Supposedly, Broadwell just works. */
-   if (screen->devinfo->gen >= 8)
-      return true;
-
+   screen->hw_has_pipelined_so = false;
    if (screen->devinfo->gen <= 6)
-      return false;
+      return 0;
+
+   /* Supposedly, Broadwell just works. */
+   if (screen->devinfo->gen >= 8) {
+      screen->hw_has_pipelined_so = true;
+      return 0;
+   }
 
    /* We use SO_WRITE_OFFSET0 since you're supposed to write it (unlike the
     * statistics registers), and we already reset it to zero before using it.
     */
-   return intel_detect_pipelined_register(screen,
-                                          GEN7_SO_WRITE_OFFSET(0),
-                                          0x1337d0d0);
+   detect->reg = GEN7_SO_WRITE_OFFSET(0);
+   detect->expected_value = 0x1337d0d0;
+   detect->result = &screen->hw_has_pipelined_so;
+   return 1;
 }
 
-static bool
-intel_detect_pipelined_oacontrol(struct intel_screen *screen)
+static int
+intel_detect_pipelined_oacontrol(struct intel_screen *screen,
+                                 struct detect_pipelined_register *detect)
 {
+   screen->hw_has_pipelined_oacontrol = false;
    if (screen->devinfo->gen < 6 || screen->devinfo->gen >= 8)
-      return false;
+      return 0; /* no probe queued; the result stays false */
 
    /* Set "Select Context ID" to a particular address (which is likely not a
     * context), but leave all counting disabled.  This should be harmless.
     */
-   return intel_detect_pipelined_register(screen, OACONTROL, 0x31337000);
+   detect->reg = OACONTROL;
+   detect->expected_value = 0x31337000;
+   detect->result = &screen->hw_has_pipelined_oacontrol;
+   return 1; /* one probe queued in *detect */
+}
+
+static void
+intel_detect_pipelined_register_access(struct intel_screen *screen)
+{
+   struct detect_pipelined_register probes[2];
+   int pending = 0;
+
+   /* Collect whichever probes apply to this hardware so that a single
+    * call below can submit and verify all of them together. */
+   pending += intel_detect_pipelined_so(screen, &probes[pending]);
+   pending += intel_detect_pipelined_oacontrol(screen, &probes[pending]);
+
+   __intel_detect_pipelined_registers(screen, probes, pending);
 }
 
 /**
@@ -1525,9 +1567,8 @@ __DRIconfig **intelInitScreen2(__DRIscreen *psp)
 
    intelScreen->hw_has_swizzling = intel_detect_swizzling(intelScreen);
    intelScreen->hw_has_timestamp = intel_detect_timestamp(intelScreen);
-   intelScreen->hw_has_pipelined_so = intel_detect_pipelined_so(intelScreen);
-   intelScreen->hw_has_pipelined_oacontrol =
-      intel_detect_pipelined_oacontrol(intelScreen);
+
+   intel_detect_pipelined_register_access(intelScreen);
 
    const char *force_msaa = getenv("INTEL_FORCE_MSAA");
    if (force_msaa) {
-- 
2.1.4