[Mesa-dev] [PATCH 2/2] i965/hsw: Set MOCS to L3_CACHEABLE for some packets

Kenneth Graunke kenneth at whitecape.org
Thu Jul 18 00:31:55 PDT 2013


On 07/17/2013 04:46 PM, Chad Versace wrote:
> Set MOCS to L3_CACHEABLE for the following packets, during the normal
> draw upload path and during blorp:
>      3DSTATE_CONSTANT_PS
>      3DSTATE_CONSTANT_VS
>      3DSTATE_DEPTH_BUFFER
>      3DSTATE_HIER_DEPTH_BUFFER
>      3DSTATE_STENCIL_BUFFER
>      3DSTATE_VERTEX_BUFFERS
>      CMD_STATE_BASE_ADDRESS
>      SURFACE_STATE
>
> It's not possible to set the MOCS of the MCS buffer, because the MCS
> inherits MOCS from the parent surface.
>
> Gives +5.68% on Xonotic 1920x1080 on Haswell GT2.

At this point, I'm quite happy to accept your choices of what to cache.

Two comments:

I'd really prefer to see each change split into a separate patch.  You 
don't need to benchmark them separately or anything, but it'd be nice. 
I know Eric had hoped to see this happen as well.

I'd also prefer to see:

   uint32_t mocs = brw->is_haswell ? GEN7_MOCS_L3 : 0;

rather than if-blocks...it's just more concise.  Not a big deal either 
way, though.

Neither of those should affect the actual changes, so there would be no 
need to re-run the benchmarks.

Thanks so much for reviving this work, Chad...really sorry for the trouble.

> Performance Measurements
> ========================
>
> system-info {
>      gpu: haswell_m_gt2 0x0416 rev05
>      arch: x86_64
>      kernel: 3.9.9-1-ARCH (Archlinux)
>      xf86-video-intel: 2.12.11-1 (Archlinux)
>      libdrm: 2.4.46-2 (Archlinux)
> }
>
> test {
>      name: pts/xonotic-1.3.1 1920x1080 quality=high
>      delta: +5.68152% +/- 0.653452% (student's t) at 95% confidence
>      n: 3
>      pooled-s: 0.288297
>      mesa-base: master-00d32cd
> }
>
> CC: Kenneth Graunke <kenneth at whitecape.org>
> Signed-off-by: Chad Versace <chad.versace at linux.intel.com>
> ---
>   src/mesa/drivers/dri/i965/brw_draw_upload.c       |  3 +++
>   src/mesa/drivers/dri/i965/brw_misc_state.c        | 11 +++++++-
>   src/mesa/drivers/dri/i965/gen6_blorp.cpp          | 14 +++++++++-
>   src/mesa/drivers/dri/i965/gen7_blorp.cpp          | 32 ++++++++++++++++++++---
>   src/mesa/drivers/dri/i965/gen7_misc_state.c       | 13 +++++++--
>   src/mesa/drivers/dri/i965/gen7_vs_state.c         | 10 ++++++-
>   src/mesa/drivers/dri/i965/gen7_wm_state.c         | 10 ++++++-
>   src/mesa/drivers/dri/i965/gen7_wm_surface_state.c | 18 ++++++++++++-
>   8 files changed, 100 insertions(+), 11 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_draw_upload.c b/src/mesa/drivers/dri/i965/brw_draw_upload.c
> index 2952027..039c299 100644
> --- a/src/mesa/drivers/dri/i965/brw_draw_upload.c
> +++ b/src/mesa/drivers/dri/i965/brw_draw_upload.c
> @@ -658,6 +658,9 @@ static void brw_emit_vertices(struct brw_context *brw)
>   	 if (brw->gen >= 7)
>   	    dw0 |= GEN7_VB0_ADDRESS_MODIFYENABLE;
>
> +	 if (brw->is_haswell)
> +	    dw0 |= HSW_MOCS_L3_CACHEABLE << 16;
> +
>   	 OUT_BATCH(dw0 | (buffer->stride << BRW_VB0_PITCH_SHIFT));
>   	 OUT_RELOC(buffer->bo, I915_GEM_DOMAIN_VERTEX, 0, buffer->offset);
>   	 if (brw->gen >= 5) {
> diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/dri/i965/brw_misc_state.c
> index 0ab1e76..3811870 100644
> --- a/src/mesa/drivers/dri/i965/brw_misc_state.c
> +++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
> @@ -1023,13 +1023,22 @@ static void upload_state_base_address( struct brw_context *brw )
>       */
>
>      if (brw->gen >= 6) {
> +      uint32_t mocs;
> +
> +      if (brw->is_haswell) {
> +         mocs = HSW_MOCS_L3_CACHEABLE;
> +      } else {
> +         mocs = 0;
> +      }
> +
>         if (brw->gen == 6)
>   	 intel_emit_post_sync_nonzero_flush(brw);
>
>          BEGIN_BATCH(10);
>          OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (10 - 2));
>          /* General state base address: stateless DP read/write requests */
> -       OUT_BATCH(1);
> +       OUT_BATCH(mocs << 8 |
> +                 1 /*general state base address modify enable*/);
>          /* Surface state base address:
>   	* BINDING_TABLE_STATE
>   	* SURFACE_STATE
> diff --git a/src/mesa/drivers/dri/i965/gen6_blorp.cpp b/src/mesa/drivers/dri/i965/gen6_blorp.cpp
> index 8056bf5..2a1e765 100644
> --- a/src/mesa/drivers/dri/i965/gen6_blorp.cpp
> +++ b/src/mesa/drivers/dri/i965/gen6_blorp.cpp
> @@ -74,9 +74,18 @@ void
>   gen6_blorp_emit_state_base_address(struct brw_context *brw,
>                                      const brw_blorp_params *params)
>   {
> +   uint32_t mocs;
> +
> +   if (brw->is_haswell) {
> +      mocs = HSW_MOCS_L3_CACHEABLE;
> +   } else {
> +      mocs = 0;
> +   }
> +
>      BEGIN_BATCH(10);
>      OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (10 - 2));
> -   OUT_BATCH(1); /* GeneralStateBaseAddressModifyEnable */
> +   OUT_BATCH(mocs << 8 |
> +             1 /* GeneralStateBaseAddressModifyEnable */);
>      /* SurfaceStateBaseAddress */
>      OUT_RELOC(brw->batch.bo, I915_GEM_DOMAIN_SAMPLER, 0, 1);
>      /* DynamicStateBaseAddress */
> @@ -163,6 +172,9 @@ gen6_blorp_emit_vertices(struct brw_context *brw,
>         if (brw->gen >= 7)
>            dw0 |= GEN7_VB0_ADDRESS_MODIFYENABLE;
>
> +      if (brw->is_haswell)
> +         dw0 |= HSW_MOCS_L3_CACHEABLE << 16;
> +
>         BEGIN_BATCH(batch_length);
>         OUT_BATCH((_3DSTATE_VERTEX_BUFFERS << 16) | (batch_length - 2));
>         OUT_BATCH(dw0);
> diff --git a/src/mesa/drivers/dri/i965/gen7_blorp.cpp b/src/mesa/drivers/dri/i965/gen7_blorp.cpp
> index fbdd2be..6b5d0be 100644
> --- a/src/mesa/drivers/dri/i965/gen7_blorp.cpp
> +++ b/src/mesa/drivers/dri/i965/gen7_blorp.cpp
> @@ -143,6 +143,7 @@ gen7_blorp_emit_surface_state(struct brw_context *brw,
>       */
>      struct intel_region *region = surface->mt->region;
>      uint32_t tile_x, tile_y;
> +   uint32_t mocs;
>
>      uint32_t tiling = surface->map_stencil_as_y_tiled
>         ? I915_TILING_Y : region->tiling;
> @@ -151,6 +152,12 @@ gen7_blorp_emit_surface_state(struct brw_context *brw,
>         brw_state_batch(brw, AUB_TRACE_SURFACE_STATE, 8 * 4, 32, &wm_surf_offset);
>      memset(surf, 0, 8 * 4);
>
> +   if (brw->is_haswell) {
> +      mocs = HSW_MOCS_L3_CACHEABLE;
> +   } else {
> +      mocs = 0;
> +   }
> +
>      surf[0] = BRW_SURFACE_2D << BRW_SURFACE_TYPE_SHIFT |
>                surface->brw_surfaceformat << BRW_SURFACE_FORMAT_SHIFT |
>                gen7_surface_tiling_mode(tiling);
> @@ -175,7 +182,8 @@ gen7_blorp_emit_surface_state(struct brw_context *brw,
>      assert(tile_x % 4 == 0);
>      assert(tile_y % 2 == 0);
>      surf[5] = SET_FIELD(tile_x / 4, BRW_SURFACE_X_OFFSET) |
> -             SET_FIELD(tile_y / 2, BRW_SURFACE_Y_OFFSET);
> +             SET_FIELD(tile_y / 2, BRW_SURFACE_Y_OFFSET) |
> +             SET_FIELD(mocs, GEN7_SURFACE_MOCS);
>
>      surf[2] = SET_FIELD(width - 1, GEN7_SURFACE_WIDTH) |
>                SET_FIELD(height - 1, GEN7_SURFACE_HEIGHT);
> @@ -614,6 +622,8 @@ gen7_blorp_emit_constant_ps(struct brw_context *brw,
>                               const brw_blorp_params *params,
>                               uint32_t wm_push_const_offset)
>   {
> +   uint32_t mocs;
> +
>      /* Make sure the push constants fill an exact integer number of
>       * registers.
>       */
> @@ -622,13 +632,19 @@ gen7_blorp_emit_constant_ps(struct brw_context *brw,
>      /* There must be at least one register worth of push constant data. */
>      assert(BRW_BLORP_NUM_PUSH_CONST_REGS > 0);
>
> +   if (brw->is_haswell) {
> +      mocs = HSW_MOCS_L3_CACHEABLE;
> +   } else {
> +      mocs = 0;
> +   }
> +
>      /* Enable push constant buffer 0. */
>      BEGIN_BATCH(7);
>      OUT_BATCH(_3DSTATE_CONSTANT_PS << 16 |
>                (7 - 2));
>      OUT_BATCH(BRW_BLORP_NUM_PUSH_CONST_REGS);
>      OUT_BATCH(0);
> -   OUT_BATCH(wm_push_const_offset);
> +   OUT_BATCH(wm_push_const_offset | mocs);
>      OUT_BATCH(0);
>      OUT_BATCH(0);
>      OUT_BATCH(0);
> @@ -658,6 +674,7 @@ gen7_blorp_emit_depth_stencil_config(struct brw_context *brw,
>      uint32_t draw_x = params->depth.x_offset;
>      uint32_t draw_y = params->depth.y_offset;
>      uint32_t tile_mask_x, tile_mask_y;
> +   uint32_t mocs;
>
>      brw_get_depthstencil_tile_masks(params->depth.mt,
>                                      params->depth.level,
> @@ -665,6 +682,12 @@ gen7_blorp_emit_depth_stencil_config(struct brw_context *brw,
>                                      NULL,
>                                      &tile_mask_x, &tile_mask_y);
>
> +   if (brw->is_haswell) {
> +      mocs = HSW_MOCS_L3_CACHEABLE;
> +   } else {
> +      mocs = 0;
> +   }
> +
>      /* 3DSTATE_DEPTH_BUFFER */
>      {
>         uint32_t tile_x = draw_x & tile_mask_x;
> @@ -709,7 +732,7 @@ gen7_blorp_emit_depth_stencil_config(struct brw_context *brw,
>                   offset);
>         OUT_BATCH((params->depth.width + tile_x - 1) << 4 |
>                   (params->depth.height + tile_y - 1) << 18);
> -      OUT_BATCH(0);
> +      OUT_BATCH(mocs);
>         OUT_BATCH(tile_x |
>                   tile_y << 16);
>         OUT_BATCH(0);
> @@ -726,7 +749,8 @@ gen7_blorp_emit_depth_stencil_config(struct brw_context *brw,
>
>         BEGIN_BATCH(3);
>         OUT_BATCH((GEN7_3DSTATE_HIER_DEPTH_BUFFER << 16) | (3 - 2));
> -      OUT_BATCH(hiz_region->pitch - 1);
> +      OUT_BATCH((mocs << 25) |
> +                (hiz_region->pitch - 1));
>         OUT_RELOC(hiz_region->bo,
>                   I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
>                   hiz_offset);
> diff --git a/src/mesa/drivers/dri/i965/gen7_misc_state.c b/src/mesa/drivers/dri/i965/gen7_misc_state.c
> index 7f61881..31e80f5 100644
> --- a/src/mesa/drivers/dri/i965/gen7_misc_state.c
> +++ b/src/mesa/drivers/dri/i965/gen7_misc_state.c
> @@ -40,6 +40,13 @@ gen7_emit_depth_stencil_hiz(struct brw_context *brw,
>                               uint32_t tile_x, uint32_t tile_y)
>   {
>      struct gl_context *ctx = &brw->ctx;
> +   uint32_t mocs;
> +
> +   if (brw->is_haswell) {
> +      mocs = HSW_MOCS_L3_CACHEABLE;
> +   } else {
> +      mocs = 0;
> +   }
>
>      intel_emit_depth_stall_flushes(brw);
>
> @@ -63,7 +70,7 @@ gen7_emit_depth_stencil_hiz(struct brw_context *brw,
>
>      OUT_BATCH(((width + tile_x - 1) << 4) |
>                ((height + tile_y - 1) << 18));
> -   OUT_BATCH(0);
> +   OUT_BATCH(mocs);
>      OUT_BATCH(tile_x | (tile_y << 16));
>      OUT_BATCH(0);
>      ADVANCE_BATCH();
> @@ -78,7 +85,8 @@ gen7_emit_depth_stencil_hiz(struct brw_context *brw,
>         struct intel_mipmap_tree *hiz_mt = depth_mt->hiz_mt;
>         BEGIN_BATCH(3);
>         OUT_BATCH(GEN7_3DSTATE_HIER_DEPTH_BUFFER << 16 | (3 - 2));
> -      OUT_BATCH(hiz_mt->region->pitch - 1);
> +      OUT_BATCH((mocs << 25) |
> +                (hiz_mt->region->pitch - 1));
>         OUT_RELOC(hiz_mt->region->bo,
>                   I915_GEM_DOMAIN_RENDER,
>                   I915_GEM_DOMAIN_RENDER,
> @@ -108,6 +116,7 @@ gen7_emit_depth_stencil_hiz(struct brw_context *brw,
>          * same text, and experiments indicate that this is necessary.
>          */
>         OUT_BATCH(enabled |
> +                mocs << 25 |
>   	        (2 * stencil_mt->region->pitch - 1));
>         OUT_RELOC(stencil_mt->region->bo,
>   	        I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
> diff --git a/src/mesa/drivers/dri/i965/gen7_vs_state.c b/src/mesa/drivers/dri/i965/gen7_vs_state.c
> index 7369a9c..c1ccc62 100644
> --- a/src/mesa/drivers/dri/i965/gen7_vs_state.c
> +++ b/src/mesa/drivers/dri/i965/gen7_vs_state.c
> @@ -63,6 +63,14 @@ upload_vs_state(struct brw_context *brw)
>         OUT_BATCH(0);
>         ADVANCE_BATCH();
>      } else {
> +      uint32_t mocs;
> +
> +      if (brw->is_haswell) {
> +         mocs = HSW_MOCS_L3_CACHEABLE;
> +      } else {
> +         mocs = 0;
> +      }
> +
>         BEGIN_BATCH(7);
>         OUT_BATCH(_3DSTATE_CONSTANT_VS << 16 | (7 - 2));
>         OUT_BATCH(brw->vs.push_const_size);
> @@ -70,7 +78,7 @@ upload_vs_state(struct brw_context *brw)
>         /* Pointer to the VS constant buffer.  Covered by the set of
>          * state flags from gen6_prepare_wm_contants
>          */
> -      OUT_BATCH(brw->vs.push_const_offset);
> +      OUT_BATCH(brw->vs.push_const_offset | mocs);
>         OUT_BATCH(0);
>         OUT_BATCH(0);
>         OUT_BATCH(0);
> diff --git a/src/mesa/drivers/dri/i965/gen7_wm_state.c b/src/mesa/drivers/dri/i965/gen7_wm_state.c
> index 8f90371..46cf795 100644
> --- a/src/mesa/drivers/dri/i965/gen7_wm_state.c
> +++ b/src/mesa/drivers/dri/i965/gen7_wm_state.c
> @@ -141,6 +141,14 @@ upload_ps_state(struct brw_context *brw)
>         OUT_BATCH(0);
>         ADVANCE_BATCH();
>      } else {
> +      uint32_t mocs;
> +
> +      if (brw->is_haswell) {
> +         mocs = HSW_MOCS_L3_CACHEABLE;
> +      } else {
> +         mocs = 0;
> +      }
> +
>         BEGIN_BATCH(7);
>         OUT_BATCH(_3DSTATE_CONSTANT_PS << 16 | (7 - 2));
>
> @@ -150,7 +158,7 @@ upload_ps_state(struct brw_context *brw)
>         /* Pointer to the WM constant buffer.  Covered by the set of
>          * state flags from gen6_upload_wm_push_constants.
>          */
> -      OUT_BATCH(brw->wm.push_const_offset);
> +      OUT_BATCH(brw->wm.push_const_offset | mocs);
>         OUT_BATCH(0);
>         OUT_BATCH(0);
>         OUT_BATCH(0);
> diff --git a/src/mesa/drivers/dri/i965/gen7_wm_surface_state.c b/src/mesa/drivers/dri/i965/gen7_wm_surface_state.c
> index fe97803..322b650 100644
> --- a/src/mesa/drivers/dri/i965/gen7_wm_surface_state.c
> +++ b/src/mesa/drivers/dri/i965/gen7_wm_surface_state.c
> @@ -286,6 +286,7 @@ gen7_update_texture_surface(struct gl_context *ctx,
>      struct gl_texture_image *firstImage = tObj->Image[0][tObj->BaseLevel];
>      struct gl_sampler_object *sampler = _mesa_get_samplerobj(ctx, unit);
>      uint32_t tile_x, tile_y;
> +   uint32_t mocs;
>
>      if (tObj->Target == GL_TEXTURE_BUFFER) {
>         gen7_update_buffer_texture_surface(ctx, unit, binding_table, surf_index);
> @@ -301,6 +302,12 @@ gen7_update_texture_surface(struct gl_context *ctx,
>                                                 tObj->DepthMode,
>                                                 sampler->sRGBDecode);
>
> +   if (brw->is_haswell) {
> +      mocs = HSW_MOCS_L3_CACHEABLE;
> +   } else {
> +      mocs = 0;
> +   }
> +
>      surf[0] = translate_tex_target(tObj->Target) << BRW_SURFACE_TYPE_SHIFT |
>                tex_format << BRW_SURFACE_FORMAT_SHIFT |
>                gen7_surface_tiling_mode(mt->region->tiling) |
> @@ -334,6 +341,7 @@ gen7_update_texture_surface(struct gl_context *ctx,
>       */
>      surf[5] = ((tile_x / 4) << BRW_SURFACE_X_OFFSET_SHIFT |
>                 (tile_y / 2) << BRW_SURFACE_Y_OFFSET_SHIFT |
> +              SET_FIELD(mocs, GEN7_SURFACE_MOCS) |
>                 /* mip count */
>                 (intelObj->_MaxLevel - tObj->BaseLevel));
>
> @@ -512,6 +520,7 @@ gen7_update_renderbuffer_surface(struct brw_context *brw,
>      bool is_array = false;
>      int depth = MAX2(rb->Depth, 1);
>      int min_array_element;
> +   uint32_t mocs;
>      GLenum gl_target = rb->TexImage ?
>                            rb->TexImage->TexObject->Target : GL_TEXTURE_2D;
>
> @@ -552,6 +561,12 @@ gen7_update_renderbuffer_surface(struct brw_context *brw,
>         min_array_element = irb->mt_layer;
>      }
>
> +   if (brw->is_haswell) {
> +      mocs = HSW_MOCS_L3_CACHEABLE;
> +   } else {
> +      mocs = 0;
> +   }
> +
>      surf[0] = surftype << BRW_SURFACE_TYPE_SHIFT |
>                format << BRW_SURFACE_FORMAT_SHIFT |
>                (irb->mt->array_spacing_lod0 ? GEN7_SURFACE_ARYSPC_LOD0
> @@ -571,7 +586,8 @@ gen7_update_renderbuffer_surface(struct brw_context *brw,
>
>      assert(brw->has_surface_tile_offset);
>
> -   surf[5] = irb->mt_level - irb->mt->first_level;
> +   surf[5] = SET_FIELD(mocs, GEN7_SURFACE_MOCS) |
> +             (irb->mt_level - irb->mt->first_level);
>
>      surf[2] = SET_FIELD(irb->mt->logical_width0 - 1, GEN7_SURFACE_WIDTH) |
>                SET_FIELD(irb->mt->logical_height0 - 1, GEN7_SURFACE_HEIGHT);
>



More information about the mesa-dev mailing list