[Mesa-dev] [PATCH 07/70] i965: Combine the multiple pipelined register detection into one round-trip
Martin Peres
martin.peres at linux.intel.com
Mon Aug 10 05:17:26 PDT 2015
On 07/08/15 23:13, Chris Wilson wrote:
> Combining the multiple access checks into a few batches and a single
> serialising read can reduce detection times from around 100us to 70us on
> a fast Haswell system.
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Kenneth Graunke <kenneth at whitecape.org>
> ---
> src/mesa/drivers/dri/i965/intel_screen.c | 165 +++++++++++++++++++------------
> 1 file changed, 101 insertions(+), 64 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/intel_screen.c b/src/mesa/drivers/dri/i965/intel_screen.c
> index 36c7bb2..0b60f13 100644
> --- a/src/mesa/drivers/dri/i965/intel_screen.c
> +++ b/src/mesa/drivers/dri/i965/intel_screen.c
> @@ -1185,6 +1185,13 @@ intel_detect_timestamp(struct intel_screen *screen)
> return 0;
> }
>
> +struct detect_pipelined_register {
> + uint32_t reg;
> + uint32_t expected_value;
> + unsigned result;
> + bool reset;
> +};
> +
> /**
> * Test if we can use MI_LOAD_REGISTER_MEM from an untrusted batchbuffer.
> *
> @@ -1192,102 +1199,134 @@ intel_detect_timestamp(struct intel_screen *screen)
> * while others don't. Instead of trying to enumerate every case, just
> * try and write a register and see if works.
> */
> -static bool
> -intel_detect_pipelined_register(struct intel_screen *screen,
> - int reg, uint32_t expected_value, bool reset)
> +static void
> +__intel_detect_pipelined_registers(struct intel_screen *screen,
> + struct detect_pipelined_register *r,
> + int count)
> {
> - drm_intel_bo *results, *bo;
> - uint32_t *batch;
> - uint32_t offset = 0;
> - bool success = false;
> + drm_intel_bo *results;
> + int i;
> +
> + if (count == 0)
> + return;
>
> /* Create a zero'ed temporary buffer for reading our results */
> results = drm_intel_bo_alloc(screen->bufmgr, "registers", 4096, 0);
> if (results == NULL)
> - goto err;
> -
> - bo = drm_intel_bo_alloc(screen->bufmgr, "batchbuffer", 4096, 0);
> - if (bo == NULL)
> - goto err_results;
> + return;
>
> - if (drm_intel_bo_map(bo, 1))
> - goto err_batch;
> + /* Emit each access in a separate batch buffer so that if the kernel
> + * rejects an individual access attempt, we don't incorrectly assume
> + * all the register accesses are invalid.
> + */
> + for (i = 0; i < count; i++) {
> + drm_intel_bo *bo;
> + uint32_t *batch;
>
> - batch = bo->virtual;
> + bo = drm_intel_bo_alloc(screen->bufmgr, "batchbuffer", 4096, 0);
> + if (bo == NULL)
> + continue;
>
> - /* Write the register. */
> - *batch++ = MI_LOAD_REGISTER_IMM | (3 - 2);
> - *batch++ = reg;
> - *batch++ = expected_value;
> + if (drm_intel_bo_map(bo, 1))
> + goto err_batch;
>
> - /* Save the register's value back to the buffer. */
> - *batch++ = MI_STORE_REGISTER_MEM | (3 - 2);
> - *batch++ = reg;
> - drm_intel_bo_emit_reloc(bo, (char *)batch -(char *)bo->virtual,
> - results, offset*sizeof(uint32_t),
> - I915_GEM_DOMAIN_INSTRUCTION,
> - I915_GEM_DOMAIN_INSTRUCTION);
> - *batch++ = results->offset + offset*sizeof(uint32_t);
> + batch = bo->virtual;
>
> - /* And afterwards clear the register */
> - if (reset) {
> + /* Write the register. */
> *batch++ = MI_LOAD_REGISTER_IMM | (3 - 2);
> - *batch++ = reg;
> - *batch++ = 0;
> - }
> + *batch++ = r[i].reg;
> + *batch++ = r[i].expected_value;
> +
> + /* Save the register's value back to the buffer. */
> + *batch++ = MI_STORE_REGISTER_MEM | (3 - 2);
> + *batch++ = r[i].reg;
> + drm_intel_bo_emit_reloc(bo, (char *)batch -(char *)bo->virtual,
> + results, i*sizeof(uint32_t),
> + I915_GEM_DOMAIN_INSTRUCTION,
> + I915_GEM_DOMAIN_INSTRUCTION);
> + *batch++ = results->offset + i*sizeof(uint32_t);
> +
> + /* And afterwards clear the register */
> + if (r[i].reset) {
> + *batch++ = MI_LOAD_REGISTER_IMM | (3 - 2);
> + *batch++ = r[i].reg;
> + *batch++ = 0;
> + }
>
> - *batch++ = MI_BATCH_BUFFER_END;
> + *batch++ = MI_BATCH_BUFFER_END;
>
> - drm_intel_bo_mrb_exec(bo, ALIGN((char *)batch - (char *)bo->virtual, 8),
> - NULL, 0, 0,
> - I915_EXEC_RENDER);
> + drm_intel_bo_mrb_exec(bo, ALIGN((char *)batch - (char *)bo->virtual, 8),
> + NULL, 0, 0,
> + I915_EXEC_RENDER);
>
> - /* Check whether the value got written. */
> +err_batch:
> + drm_intel_bo_unreference(bo);
> + }
> +
> + /* Check whether the values got written. */
> if (drm_intel_bo_map(results, false) == 0) {
> - success = *((uint32_t *)results->virtual + offset) == expected_value;
> + uint32_t *data = results->virtual;
> + for (i = 0; i < count; i++)
> + if (data[i] == r[i].expected_value)
> + screen->hw_has_pipelined_register |= r[i].result;
> drm_intel_bo_unmap(results);
> }
>
> -err_batch:
> - drm_intel_bo_unreference(bo);
> -err_results:
> drm_intel_bo_unreference(results);
> -err:
> - return success;
> }
>
> static bool
> -intel_detect_pipelined_so(struct intel_screen *screen)
> +intel_detect_pipelined_so(struct intel_screen *screen,
> + struct detect_pipelined_register *detect)
> {
> - /* Supposedly, Broadwell just works. */
> - if (screen->devinfo->gen >= 8)
> - return true;
> -
> if (screen->devinfo->gen <= 6)
> - return false;
> + return 0;
> +
> + /* Supposedly, Broadwell just works. */
> + if (screen->devinfo->gen >= 8) {
> + screen->hw_has_pipelined_register |= HW_HAS_PIPELINED_SOL_OFFSET;
> + return 0;
> + }
>
> /* We use SO_WRITE_OFFSET0 since you're supposed to write it (unlike the
> * statistics registers), and we already reset it to zero before using it.
> */
> - return intel_detect_pipelined_register(screen,
> - GEN7_SO_WRITE_OFFSET(0),
> - 0x1337d0d0,
> - false);
> + detect->reg = GEN7_SO_WRITE_OFFSET(0);
> + detect->expected_value = 0x1337d0d0;
> + detect->result = HW_HAS_PIPELINED_SOL_OFFSET;
> + detect->reset = false;
> + return 1;
> }
>
> -static bool
> -intel_detect_pipelined_oacontrol(struct intel_screen *screen)
> +static int
> +intel_detect_pipelined_oacontrol(struct intel_screen *screen,
> + struct detect_pipelined_register *detect)
> {
> if (screen->devinfo->gen < 6 || screen->devinfo->gen >= 8)
> - return false;
> + return 0;
>
> /* Set "Select Context ID" to a particular address (which is likely not a
> * context), but leave all counting disabled. This should be harmless.
> */
> - return intel_detect_pipelined_register(screen,
> - OACONTROL,
> - 0x31337000,
> - true);
> + detect->reg = OACONTROL;
> + detect->expected_value = 0x31337000;
> + detect->result = HW_HAS_PIPELINED_OACONTROL;
> + detect->reset = true;
> + return 1;
> +}
> +
> +static void
> +intel_detect_pipelined_register_access(struct intel_screen *screen)
> +{
> + struct detect_pipelined_register regs[2], *r = regs;
> +
> + /* Combine the multiple register access validation into a single
> + * round trip through the kernel + GPU.
> + */
> + r += intel_detect_pipelined_so(screen, r);
> + r += intel_detect_pipelined_oacontrol(screen, r);
Not a fan of this construct. How about changing the return types of the
detect functions to int?
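Something like the following is what I had in mind (untested sketch only,
to illustrate the idea; the helper names are taken from your patch, but the
exact signatures and body are just my guess):

static int
intel_detect_pipelined_so(struct intel_screen *screen,
                          struct detect_pipelined_register *detect);

static int
intel_detect_pipelined_oacontrol(struct intel_screen *screen,
                                 struct detect_pipelined_register *detect);

static void
intel_detect_pipelined_register_access(struct intel_screen *screen)
{
   struct detect_pipelined_register regs[2];
   int count = 0;

   /* Each detect helper fills in at most one entry and returns how many
    * entries it wrote (0 or 1), so the caller accumulates a plain count
    * instead of advancing a pointer.
    */
   count += intel_detect_pipelined_so(screen, &regs[count]);
   count += intel_detect_pipelined_oacontrol(screen, &regs[count]);

   __intel_detect_pipelined_registers(screen, regs, count);
}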
Other than that, it looks good:
Reviewed-by: Martin Peres <martin.peres at linux.intel.com>
> +
> + __intel_detect_pipelined_registers(screen, regs, r-regs);
> }
>
> /**
> @@ -1549,10 +1588,8 @@ __DRIconfig **intelInitScreen2(__DRIscreen *psp)
>
> intelScreen->hw_has_swizzling = intel_detect_swizzling(intelScreen);
> intelScreen->hw_has_timestamp = intel_detect_timestamp(intelScreen);
> - if (intel_detect_pipelined_so(intelScreen))
> - intelScreen->hw_has_pipelined_register |= HW_HAS_PIPELINED_SOL_OFFSET;
> - if (intel_detect_pipelined_oacontrol(intelScreen))
> - intelScreen->hw_has_pipelined_register |= HW_HAS_PIPELINED_OACONTROL;
> +
> + intel_detect_pipelined_register_access(intelScreen);
>
> const char *force_msaa = getenv("INTEL_FORCE_MSAA");
> if (force_msaa) {