Mesa (master): anv: prepare pipeline for delayed emission of color writes

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Thu Apr 22 15:56:31 UTC 2021


Module: Mesa
Branch: master
Commit: 82eb7c04e7ee7b5a393550116c2e68fbdf78c3ca
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=82eb7c04e7ee7b5a393550116c2e68fbdf78c3ca

Author: Lionel Landwerlin <lionel.g.landwerlin at intel.com>
Date:   Wed Mar 31 18:50:00 2021 +0300

anv: prepare pipeline for delayed emission of color writes

Namely, we want to be able to emit the following dynamically:

  * On Gfx 7/7.5: 3DSTATE_WM, 3DSTATE_BLEND_STATE_POINTERS

  * On Gfx 8+: 3DSTATE_WM, 3DSTATE_BLEND_STATE_POINTERS,
    3DSTATE_PS_BLEND

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin at intel.com>
Reviewed-by: Tapani Pälli <tapani.palli at intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10206>

---

 src/intel/vulkan/anv_private.h   |   7 ++
 src/intel/vulkan/genX_pipeline.c | 217 +++++++++++++++++++++++----------------
 2 files changed, 137 insertions(+), 87 deletions(-)

diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
index a0948832d65..f059e232351 100644
--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@@ -2239,6 +2239,7 @@ enum anv_cmd_dirty_bits {
    ANV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE         = 1 << 22, /* VK_DYNAMIC_STATE_STENCIL_TEST_ENABLE_EXT */
    ANV_CMD_DIRTY_DYNAMIC_STENCIL_OP                  = 1 << 23, /* VK_DYNAMIC_STATE_STENCIL_OP_EXT */
    ANV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS            = 1 << 24, /* VK_DYNAMIC_STATE_SAMPLE_LOCATIONS_EXT */
+   ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE           = 1 << 25, /* VK_DYNAMIC_STATE_COLOR_WRITE_ENABLE_EXT */
 };
 typedef uint32_t anv_cmd_dirty_mask_t;
 
@@ -3369,6 +3370,7 @@ struct anv_graphics_pipeline {
    bool                                         sample_shading_enable;
    bool                                         kill_pixel;
    bool                                         depth_bounds_test_enable;
+   bool                                         force_fragment_thread_dispatch;
 
    /* When primitive replication is used, subpass->view_mask will describe what
     * views to replicate.
@@ -3389,12 +3391,17 @@ struct anv_graphics_pipeline {
       uint32_t                                  depth_stencil_state[3];
       uint32_t                                  clip[4];
       uint32_t                                  xfb_bo_pitch[4];
+      uint32_t                                  wm[3];
+      uint32_t                                  blend_state[MAX_RTS * 2];
    } gfx7;
 
    struct {
       uint32_t                                  sf[4];
       uint32_t                                  raster[5];
       uint32_t                                  wm_depth_stencil[3];
+      uint32_t                                  wm[2];
+      uint32_t                                  ps_blend[2];
+      uint32_t                                  blend_state[1 + MAX_RTS * 2];
    } gfx8;
 
    struct {
diff --git a/src/intel/vulkan/genX_pipeline.c b/src/intel/vulkan/genX_pipeline.c
index 20edb9ddcae..f9fe6285be0 100644
--- a/src/intel/vulkan/genX_pipeline.c
+++ b/src/intel/vulkan/genX_pipeline.c
@@ -1129,7 +1129,8 @@ is_dual_src_blend_factor(VkBlendFactor factor)
 static void
 emit_cb_state(struct anv_graphics_pipeline *pipeline,
               const VkPipelineColorBlendStateCreateInfo *info,
-              const VkPipelineMultisampleStateCreateInfo *ms_info)
+              const VkPipelineMultisampleStateCreateInfo *ms_info,
+              uint32_t dynamic_states)
 {
    struct anv_device *device = pipeline->base.device;
    const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
@@ -1150,11 +1151,21 @@ emit_cb_state(struct anv_graphics_pipeline *pipeline,
 
    const uint32_t num_dwords = GENX(BLEND_STATE_length) +
       GENX(BLEND_STATE_ENTRY_length) * surface_count;
-   pipeline->blend_state =
-      anv_state_pool_alloc(&device->dynamic_state_pool, num_dwords * 4, 64);
+   uint32_t *blend_state_start, *state_pos;
+
+   if (dynamic_states & ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE) {
+      const struct intel_device_info *devinfo = &pipeline->base.device->info;
+      blend_state_start = devinfo->ver >= 8 ?
+         pipeline->gfx8.blend_state : pipeline->gfx7.blend_state;
+      pipeline->blend_state = ANV_STATE_NULL;
+   } else {
+      pipeline->blend_state =
+         anv_state_pool_alloc(&device->dynamic_state_pool, num_dwords * 4, 64);
+      blend_state_start = pipeline->blend_state.map;
+   }
+   state_pos = blend_state_start;
 
    bool has_writeable_rt = false;
-   uint32_t *state_pos = pipeline->blend_state.map;
    state_pos += GENX(BLEND_STATE_length);
 #if GFX_VER >= 8
    struct GENX(BLEND_STATE_ENTRY) bs0 = { 0 };
@@ -1285,29 +1296,38 @@ emit_cb_state(struct anv_graphics_pipeline *pipeline,
    }
 
 #if GFX_VER >= 8
-   anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PS_BLEND), blend) {
-      blend.AlphaToCoverageEnable         = blend_state.AlphaToCoverageEnable;
-      blend.HasWriteableRT                = has_writeable_rt;
-      blend.ColorBufferBlendEnable        = bs0.ColorBufferBlendEnable;
-      blend.SourceAlphaBlendFactor        = bs0.SourceAlphaBlendFactor;
-      blend.DestinationAlphaBlendFactor   = bs0.DestinationAlphaBlendFactor;
-      blend.SourceBlendFactor             = bs0.SourceBlendFactor;
-      blend.DestinationBlendFactor        = bs0.DestinationBlendFactor;
-      blend.AlphaTestEnable               = false;
-      blend.IndependentAlphaBlendEnable   =
-         blend_state.IndependentAlphaBlendEnable;
+   struct GENX(3DSTATE_PS_BLEND) blend = {
+      GENX(3DSTATE_PS_BLEND_header),
+   };
+   blend.AlphaToCoverageEnable         = blend_state.AlphaToCoverageEnable;
+   blend.HasWriteableRT                = has_writeable_rt;
+   blend.ColorBufferBlendEnable        = bs0.ColorBufferBlendEnable;
+   blend.SourceAlphaBlendFactor        = bs0.SourceAlphaBlendFactor;
+   blend.DestinationAlphaBlendFactor   = bs0.DestinationAlphaBlendFactor;
+   blend.SourceBlendFactor             = bs0.SourceBlendFactor;
+   blend.DestinationBlendFactor        = bs0.DestinationBlendFactor;
+   blend.AlphaTestEnable               = false;
+   blend.IndependentAlphaBlendEnable   = blend_state.IndependentAlphaBlendEnable;
+
+   if (dynamic_states & ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE) {
+      GENX(3DSTATE_PS_BLEND_pack)(NULL, pipeline->gfx8.ps_blend, &blend);
+   } else {
+      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PS_BLEND), _blend)
+         _blend = blend;
    }
 #else
    (void)has_writeable_rt;
 #endif
 
-   GENX(BLEND_STATE_pack)(NULL, pipeline->blend_state.map, &blend_state);
+   GENX(BLEND_STATE_pack)(NULL, blend_state_start, &blend_state);
 
-   anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_BLEND_STATE_POINTERS), bsp) {
-      bsp.BlendStatePointer      = pipeline->blend_state.offset;
+   if (!(dynamic_states & ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE)) {
+      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_BLEND_STATE_POINTERS), bsp) {
+         bsp.BlendStatePointer      = pipeline->blend_state.offset;
 #if GFX_VER >= 8
-      bsp.BlendStatePointerValid = true;
+         bsp.BlendStatePointerValid = true;
 #endif
+      }
    }
 }
 
@@ -1906,87 +1926,110 @@ emit_3dstate_wm(struct anv_graphics_pipeline *pipeline, struct anv_subpass *subp
                 const VkPipelineRasterizationStateCreateInfo *raster,
                 const VkPipelineColorBlendStateCreateInfo *blend,
                 const VkPipelineMultisampleStateCreateInfo *multisample,
-                const VkPipelineRasterizationLineStateCreateInfoEXT *line)
+                const VkPipelineRasterizationLineStateCreateInfoEXT *line,
+                const uint32_t dynamic_states)
 {
    const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
 
-   anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_WM), wm) {
-      wm.StatisticsEnable                    = true;
-      wm.LineEndCapAntialiasingRegionWidth   = _05pixels;
-      wm.LineAntialiasingRegionWidth         = _10pixels;
-      wm.PointRasterizationRule              = RASTRULE_UPPER_RIGHT;
+   struct GENX(3DSTATE_WM) wm = {
+      GENX(3DSTATE_WM_header),
+   };
+   wm.StatisticsEnable                    = true;
+   wm.LineEndCapAntialiasingRegionWidth   = _05pixels;
+   wm.LineAntialiasingRegionWidth         = _10pixels;
+   wm.PointRasterizationRule              = RASTRULE_UPPER_RIGHT;
 
-      if (anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
-         if (wm_prog_data->early_fragment_tests) {
+   if (anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
+      if (wm_prog_data->early_fragment_tests) {
             wm.EarlyDepthStencilControl         = EDSC_PREPS;
-         } else if (wm_prog_data->has_side_effects) {
-            wm.EarlyDepthStencilControl         = EDSC_PSEXEC;
-         } else {
-            wm.EarlyDepthStencilControl         = EDSC_NORMAL;
-         }
+      } else if (wm_prog_data->has_side_effects) {
+         wm.EarlyDepthStencilControl         = EDSC_PSEXEC;
+      } else {
+         wm.EarlyDepthStencilControl         = EDSC_NORMAL;
+      }
 
 #if GFX_VER >= 8
-         /* Gfx8 hardware tries to compute ThreadDispatchEnable for us but
-          * doesn't take into account KillPixels when no depth or stencil
-          * writes are enabled.  In order for occlusion queries to work
-          * correctly with no attachments, we need to force-enable PS thread
-          * dispatch.
-          *
-          * The BDW docs are pretty clear that that this bit isn't validated
-          * and probably shouldn't be used in production:
-          *
-          *    "This must always be set to Normal. This field should not be
-          *    tested for functional validation."
-          *
-          * Unfortunately, however, the other mechanism we have for doing this
-          * is 3DSTATE_PS_EXTRA::PixelShaderHasUAV which causes hangs on BDW.
-          * Given two bad options, we choose the one which works.
-          */
-         if ((wm_prog_data->has_side_effects || wm_prog_data->uses_kill) &&
-             !has_color_buffer_write_enabled(pipeline, blend))
-            wm.ForceThreadDispatchEnable = ForceON;
+      /* Gen8 hardware tries to compute ThreadDispatchEnable for us but
+       * doesn't take into account KillPixels when no depth or stencil
+       * writes are enabled.  In order for occlusion queries to work
+       * correctly with no attachments, we need to force-enable PS thread
+       * dispatch.
+       *
+       * The BDW docs are pretty clear that that this bit isn't validated
+       * and probably shouldn't be used in production:
+       *
+       *    "This must always be set to Normal. This field should not be
+       *    tested for functional validation."
+       *
+       * Unfortunately, however, the other mechanism we have for doing this
+       * is 3DSTATE_PS_EXTRA::PixelShaderHasUAV which causes hangs on BDW.
+       * Given two bad options, we choose the one which works.
+       */
+      pipeline->force_fragment_thread_dispatch =
+         wm_prog_data->has_side_effects ||
+         wm_prog_data->uses_kill;
+
+      if (pipeline->force_fragment_thread_dispatch ||
+          !has_color_buffer_write_enabled(pipeline, blend)) {
+         /* Only set this value in non dynamic mode. */
+         wm.ForceThreadDispatchEnable =
+            !(dynamic_states & ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE) ? ForceON : 0;
+      }
 #endif
 
-         wm.BarycentricInterpolationMode =
-            wm_prog_data->barycentric_interp_modes;
+      wm.BarycentricInterpolationMode =
+         wm_prog_data->barycentric_interp_modes;
 
 #if GFX_VER < 8
-         wm.PixelShaderComputedDepthMode  = wm_prog_data->computed_depth_mode;
-         wm.PixelShaderUsesSourceDepth    = wm_prog_data->uses_src_depth;
-         wm.PixelShaderUsesSourceW        = wm_prog_data->uses_src_w;
-         wm.PixelShaderUsesInputCoverageMask = wm_prog_data->uses_sample_mask;
-
-         /* If the subpass has a depth or stencil self-dependency, then we
-          * need to force the hardware to do the depth/stencil write *after*
-          * fragment shader execution.  Otherwise, the writes may hit memory
-          * before we get around to fetching from the input attachment and we
-          * may get the depth or stencil value from the current draw rather
-          * than the previous one.
-          */
-         wm.PixelShaderKillsPixel         = subpass->has_ds_self_dep ||
-                                            wm_prog_data->uses_kill;
-
-         if (wm.PixelShaderComputedDepthMode != PSCDEPTH_OFF ||
-             wm_prog_data->has_side_effects ||
-             wm.PixelShaderKillsPixel ||
-             has_color_buffer_write_enabled(pipeline, blend))
-            wm.ThreadDispatchEnable = true;
-
-         if (multisample && multisample->rasterizationSamples > 1) {
-            if (wm_prog_data->persample_dispatch) {
-               wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
-            } else {
-               wm.MultisampleDispatchMode = MSDISPMODE_PERPIXEL;
-            }
-         } else {
+      wm.PixelShaderComputedDepthMode  = wm_prog_data->computed_depth_mode;
+      wm.PixelShaderUsesSourceDepth    = wm_prog_data->uses_src_depth;
+      wm.PixelShaderUsesSourceW        = wm_prog_data->uses_src_w;
+      wm.PixelShaderUsesInputCoverageMask = wm_prog_data->uses_sample_mask;
+
+      /* If the subpass has a depth or stencil self-dependency, then we
+       * need to force the hardware to do the depth/stencil write *after*
+       * fragment shader execution.  Otherwise, the writes may hit memory
+       * before we get around to fetching from the input attachment and we
+       * may get the depth or stencil value from the current draw rather
+       * than the previous one.
+       */
+      wm.PixelShaderKillsPixel         = subpass->has_ds_self_dep ||
+                                         wm_prog_data->uses_kill;
+
+      pipeline->force_fragment_thread_dispatch =
+         wm.PixelShaderComputedDepthMode != PSCDEPTH_OFF ||
+         wm_prog_data->has_side_effects ||
+         wm.PixelShaderKillsPixel;
+
+      if (pipeline->force_fragment_thread_dispatch ||
+          has_color_buffer_write_enabled(pipeline, blend)) {
+         /* Only set this value in non dynamic mode. */
+         wm.ThreadDispatchEnable = !(dynamic_states & ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE);
+      }
+
+      if (multisample && multisample->rasterizationSamples > 1) {
+         if (wm_prog_data->persample_dispatch) {
             wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
+         } else {
+            wm.MultisampleDispatchMode = MSDISPMODE_PERPIXEL;
          }
-         wm.MultisampleRasterizationMode =
-            gfx7_ms_rast_mode(pipeline, ia, raster, multisample);
+      } else {
+         wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
+      }
+      wm.MultisampleRasterizationMode =
+         gfx7_ms_rast_mode(pipeline, ia, raster, multisample);
 #endif
 
-         wm.LineStippleEnable = line && line->stippledLineEnable;
-      }
+      wm.LineStippleEnable = line && line->stippledLineEnable;
+   }
+
+   if (dynamic_states & ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE) {
+      const struct intel_device_info *devinfo = &pipeline->base.device->info;
+      uint32_t *dws = devinfo->ver >= 8 ? pipeline->gfx8.wm : pipeline->gfx7.wm;
+      GENX(3DSTATE_WM_pack)(NULL, dws, &wm);
+   } else {
+      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_WM), _wm)
+         _wm = wm;
    }
 }
 
@@ -2307,7 +2350,7 @@ genX(graphics_pipeline_create)(
                            urb_deref_block_size);
    emit_ms_state(pipeline, ms_info, dynamic_states);
    emit_ds_state(pipeline, ds_info, dynamic_states, pass, subpass);
-   emit_cb_state(pipeline, cb_info, ms_info);
+   emit_cb_state(pipeline, cb_info, ms_info, dynamic_states);
    compute_kill_pixel(pipeline, ms_info, subpass);
 
    emit_3dstate_clip(pipeline,
@@ -2347,7 +2390,7 @@ genX(graphics_pipeline_create)(
    emit_3dstate_wm(pipeline, subpass,
                    pCreateInfo->pInputAssemblyState,
                    pCreateInfo->pRasterizationState,
-                   cb_info, ms_info, line_info);
+                   cb_info, ms_info, line_info, dynamic_states);
    emit_3dstate_ps(pipeline, cb_info, ms_info);
 #if GFX_VER >= 8
    emit_3dstate_ps_extra(pipeline, subpass,



More information about the mesa-commit mailing list