[Mesa-dev] [PATCH v2 3/5] anv: Add support for the PMA fix on Broadwell
Jason Ekstrand
jason at jlekstrand.net
Thu Feb 2 21:26:05 UTC 2017
In order to get good performance numbers for this, I had to hack up the
driver to whack wm_prog_data::uses_kill to true to emulate a discard and
used the Sascha "shadowmapping" demo. Setting uses_kill to true dropped
the framerate on the demo by 25-30%. Enabling the PMA fix brought it
back up to around 90% of the original framerate. This doesn't seem to
really impact Dota 2; probably because it doesn't use 16-bit depth.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin at intel.com>
---
src/intel/vulkan/TODO | 1 -
src/intel/vulkan/anv_cmd_buffer.c | 2 +
src/intel/vulkan/anv_genX.h | 3 +
src/intel/vulkan/anv_private.h | 17 +++++
src/intel/vulkan/gen7_cmd_buffer.c | 7 ++
src/intel/vulkan/gen8_cmd_buffer.c | 133 +++++++++++++++++++++++++++++++++++++
src/intel/vulkan/genX_blorp_exec.c | 5 ++
src/intel/vulkan/genX_cmd_buffer.c | 15 ++++-
src/intel/vulkan/genX_pipeline.c | 38 +++++++++++
9 files changed, 219 insertions(+), 2 deletions(-)
diff --git a/src/intel/vulkan/TODO b/src/intel/vulkan/TODO
index 38acc0d..f8b73a1 100644
--- a/src/intel/vulkan/TODO
+++ b/src/intel/vulkan/TODO
@@ -12,5 +12,4 @@ Performance:
- Compressed multisample support
- Pushing pieces of UBOs?
- Enable guardband clipping
- - pma stall workaround
- Use soft-pin to avoid relocations
diff --git a/src/intel/vulkan/anv_cmd_buffer.c b/src/intel/vulkan/anv_cmd_buffer.c
index 5886fa6..8c08f8d 100644
--- a/src/intel/vulkan/anv_cmd_buffer.c
+++ b/src/intel/vulkan/anv_cmd_buffer.c
@@ -135,6 +135,8 @@ anv_cmd_state_reset(struct anv_cmd_buffer *cmd_buffer)
state->restart_index = UINT32_MAX;
state->dynamic = default_dynamic_state;
state->need_query_wa = true;
+ state->pma_fix_enabled = false;
+ state->hiz_enabled = false;
if (state->attachments != NULL) {
vk_free(&cmd_buffer->pool->alloc, state->attachments);
diff --git a/src/intel/vulkan/anv_genX.h b/src/intel/vulkan/anv_genX.h
index d04fe38..67147b0 100644
--- a/src/intel/vulkan/anv_genX.h
+++ b/src/intel/vulkan/anv_genX.h
@@ -55,6 +55,9 @@ void genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer);
void genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer);
+void genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer,
+ bool enable);
+
void
genX(emit_urb_setup)(struct anv_device *device, struct anv_batch *batch,
const struct gen_l3_config *l3_config,
diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
index 4fe3ebc..6efe4ea 100644
--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@@ -1163,6 +1163,20 @@ struct anv_cmd_state {
bool need_query_wa;
/**
+ * Whether or not the gen8 PMA fix is enabled. We ensure that, at the top
+ * of any command buffer it disabled by disabling it in EndCommandBuffer
+ * and before invoking the secondary in ExecuteCommands.
+ */
+ bool pma_fix_enabled;
+
+ /**
+ * Whether or not we now for certain that HiZ is enabled for the current
+ * subpass. If, for whatever reason, we are unsure as to whether HiZ is
+ * enabled or not, this will be false.
+ */
+ bool hiz_enabled;
+
+ /**
* Array length is anv_cmd_state::pass::attachment_count. Array content is
* valid only when recording a render pass instance.
*/
@@ -1465,8 +1479,11 @@ struct anv_pipeline {
uint32_t cs_right_mask;
+ bool writes_depth;
+ bool depth_test_enable;
bool writes_stencil;
bool depth_clamp_enable;
+ bool kill_pixel;
struct {
uint32_t sf[7];
diff --git a/src/intel/vulkan/gen7_cmd_buffer.c b/src/intel/vulkan/gen7_cmd_buffer.c
index 013ed87..c1a25e8 100644
--- a/src/intel/vulkan/gen7_cmd_buffer.c
+++ b/src/intel/vulkan/gen7_cmd_buffer.c
@@ -260,6 +260,13 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
cmd_buffer->state.dirty = 0;
}
+void
+genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer,
+ bool enable)
+{
+ /* The NP PMA fix doesn't exist on gen7 */
+}
+
void genX(CmdSetEvent)(
VkCommandBuffer commandBuffer,
VkEvent event,
diff --git a/src/intel/vulkan/gen8_cmd_buffer.c b/src/intel/vulkan/gen8_cmd_buffer.c
index 8c8de62..271ab3f 100644
--- a/src/intel/vulkan/gen8_cmd_buffer.c
+++ b/src/intel/vulkan/gen8_cmd_buffer.c
@@ -155,6 +155,135 @@ __emit_sf_state(struct anv_cmd_buffer *cmd_buffer)
#endif
void
+genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer, bool enable)
+{
+#if GEN_GEN == 8
+ if (cmd_buffer->state.pma_fix_enabled == enable)
+ return;
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
+ pc.DepthCacheFlushEnable = true;
+ pc.CommandStreamerStallEnable = true;
+ pc.RenderTargetCacheFlushEnable = true;
+ }
+
+ uint32_t cache_mode;
+ anv_pack_struct(&cache_mode, GENX(CACHE_MODE_1),
+ .NPPMAFixEnable = enable,
+ .NPEarlyZFailsDisableMask = enable,
+ .NPPMAFixEnableMask = true,
+ .NPEarlyZFailsDisableMask = true);
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
+ lri.RegisterOffset = GENX(CACHE_MODE_1_num);
+ lri.DataDWord = cache_mode;
+ }
+
+ /* After the LRI, a PIPE_CONTROL with both the Depth Stall and Depth Cache
+ * Flush bits is often necessary. We do it regardless because it's easier.
+ * The render cache flush is also necessary if stencil writes are enabled.
+ */
+ anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
+ pc.DepthStallEnable = true;
+ pc.DepthCacheFlushEnable = true;
+ pc.RenderTargetCacheFlushEnable = true;
+ }
+
+ cmd_buffer->state.pma_fix_enabled = enable;
+#endif /* GEN_GEN == 8 */
+}
+
+static inline bool
+want_depth_pma_fix(struct anv_cmd_buffer *cmd_buffer)
+{
+ assert(GEN_GEN == 8);
+
+ /* From the Broadwell PRM Vol. 2c CACHE_MODE_1::NP_PMA_FIX_ENABLE:
+ *
+ * SW must set this bit in order to enable this fix when following
+ * expression is TRUE.
+ *
+ * 3DSTATE_WM::ForceThreadDispatch != 1 &&
+ * !(3DSTATE_RASTER::ForceSampleCount != NUMRASTSAMPLES_0) &&
+ * (3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL) &&
+ * (3DSTATE_DEPTH_BUFFER::HIZ Enable) &&
+ * !(3DSTATE_WM::EDSC_Mode == EDSC_PREPS) &&
+ * (3DSTATE_PS_EXTRA::PixelShaderValid) &&
+ * !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
+ * 3DSTATE_WM_HZ_OP::DepthBufferResolve ||
+ * 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
+ * 3DSTATE_WM_HZ_OP::StencilBufferClear) &&
+ * (3DSTATE_WM_DEPTH_STENCIL::DepthTestEnable) &&
+ * (((3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
+ * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
+ * 3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
+ * 3DSTATE_PS_BLEND::AlphaTestEnable ||
+ * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) &&
+ * 3DSTATE_WM::ForceKillPix != ForceOff &&
+ * ((3DSTATE_WM_DEPTH_STENCIL::DepthWriteEnable &&
+ * 3DSTATE_DEPTH_BUFFER::DEPTH_WRITE_ENABLE) ||
+ * (3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
+ * 3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE &&
+ * 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE))) ||
+ * (3DSTATE_PS_EXTRA:: Pixel Shader Computed Depth mode != PSCDEPTH_OFF))
+ *
+ * This function only takes care of the pipeline parts of the equation.
+ */
+
+ /* These are always true:
+ * 3DSTATE_WM::ForceThreadDispatch != 1 &&
+ * !(3DSTATE_RASTER::ForceSampleCount != NUMRASTSAMPLES_0)
+ */
+
+ /* We only enable the PMA fix if we know for certain that HiZ is enabled.
+ * If we don't know whether HiZ is enabled or not, we disable the PMA fix
+ * and there is no harm.
+ *
+ * (3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL) &&
+ * 3DSTATE_DEPTH_BUFFER::HIZ Enable
+ */
+ if (!cmd_buffer->state.hiz_enabled)
+ return false;
+
+ /* 3DSTATE_PS_EXTRA::PixelShaderValid */
+ struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
+ if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT))
+ return false;
+
+ /* !(3DSTATE_WM::EDSC_Mode == EDSC_PREPS) */
+ const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
+ if (wm_prog_data->early_fragment_tests)
+ return false;
+
+ /* We never use anv_pipeline for HiZ ops so this is trivially true:
+ * !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
+ * 3DSTATE_WM_HZ_OP::DepthBufferResolve ||
+ * 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
+ * 3DSTATE_WM_HZ_OP::StencilBufferClear)
+ */
+
+ /* 3DSTATE_WM_DEPTH_STENCIL::DepthTestEnable */
+ if (!pipeline->depth_test_enable)
+ return false;
+
+ /* (((3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
+ * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
+ * 3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
+ * 3DSTATE_PS_BLEND::AlphaTestEnable ||
+ * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) &&
+ * 3DSTATE_WM::ForceKillPix != ForceOff &&
+ * ((3DSTATE_WM_DEPTH_STENCIL::DepthWriteEnable &&
+ * 3DSTATE_DEPTH_BUFFER::DEPTH_WRITE_ENABLE) ||
+ * (3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
+ * 3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE &&
+ * 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE))) ||
+ * (3DSTATE_PS_EXTRA:: Pixel Shader Computed Depth mode != PSCDEPTH_OFF))
+ */
+ return (pipeline->kill_pixel && (pipeline->writes_depth ||
+ pipeline->writes_stencil)) ||
+ wm_prog_data->computed_depth_mode != PSCDEPTH_OFF;
+}
+
+void
genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
{
struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
@@ -211,6 +340,7 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
}
if (cmd_buffer->state.dirty & (ANV_CMD_DIRTY_PIPELINE |
+ ANV_CMD_DIRTY_RENDER_TARGETS |
ANV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK |
ANV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK)) {
uint32_t wm_depth_stencil_dw[GENX(3DSTATE_WM_DEPTH_STENCIL_length)];
@@ -234,6 +364,9 @@ genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
anv_batch_emit_merge(&cmd_buffer->batch, wm_depth_stencil_dw,
pipeline->gen8.wm_depth_stencil);
+
+ genX(cmd_buffer_enable_pma_fix)(cmd_buffer,
+ want_depth_pma_fix(cmd_buffer));
}
#else
if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS) {
diff --git a/src/intel/vulkan/genX_blorp_exec.c b/src/intel/vulkan/genX_blorp_exec.c
index 663e6c9..6f0b063 100644
--- a/src/intel/vulkan/genX_blorp_exec.c
+++ b/src/intel/vulkan/genX_blorp_exec.c
@@ -154,6 +154,11 @@ genX(blorp_exec)(struct blorp_batch *batch,
genX(cmd_buffer_emit_gen7_depth_flush)(cmd_buffer);
+ /* BLORP doesn't do anything fancy with depth such as discards, so we want
+ * the PMA fix off. Also, off is always the safe option.
+ */
+ genX(cmd_buffer_enable_pma_fix)(cmd_buffer, false);
+
blorp_exec(batch, params);
cmd_buffer->state.vb_dirty = ~0;
diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c
index b6b7f74..66de93a 100644
--- a/src/intel/vulkan/genX_cmd_buffer.c
+++ b/src/intel/vulkan/genX_cmd_buffer.c
@@ -627,6 +627,11 @@ genX(EndCommandBuffer)(
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ /* We want every command buffer to start with the PMA fix in a known state,
+ * so we disable it at the end of the command buffer.
+ */
+ genX(cmd_buffer_enable_pma_fix)(cmd_buffer, false);
+
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
anv_cmd_buffer_end_batch_buffer(cmd_buffer);
@@ -644,6 +649,11 @@ genX(CmdExecuteCommands)(
assert(primary->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
+ /* The secondary command buffers will assume that the PMA fix is disabled
+ * when they begin executing. Make sure this is true.
+ */
+ genX(cmd_buffer_enable_pma_fix)(primary, false);
+
for (uint32_t i = 0; i < commandBufferCount; i++) {
ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]);
@@ -2181,7 +2191,8 @@ cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
const bool has_stencil =
image && (image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT);
- /* FIXME: Implement the PMA stall W/A */
+ cmd_buffer->state.hiz_enabled = has_hiz;
+
/* FIXME: Width and Height are wrong */
genX(cmd_buffer_emit_gen7_depth_flush)(cmd_buffer);
@@ -2419,6 +2430,8 @@ void genX(CmdEndRenderPass)(
anv_cmd_buffer_resolve_subpass(cmd_buffer);
+ cmd_buffer->state.hiz_enabled = false;
+
#ifndef NDEBUG
anv_dump_add_framebuffer(cmd_buffer, cmd_buffer->state.framebuffer);
#endif
diff --git a/src/intel/vulkan/genX_pipeline.c b/src/intel/vulkan/genX_pipeline.c
index 18fe48c..0588d01 100644
--- a/src/intel/vulkan/genX_pipeline.c
+++ b/src/intel/vulkan/genX_pipeline.c
@@ -653,6 +653,8 @@ emit_ds_state(struct anv_pipeline *pipeline,
* to make sure it's initialized to something useful.
*/
pipeline->writes_stencil = false;
+ pipeline->writes_depth = false;
+ pipeline->depth_test_enable = false;
memset(depth_stencil_dw, 0, sizeof(depth_stencil_dw));
return;
}
@@ -711,6 +713,9 @@ emit_ds_state(struct anv_pipeline *pipeline,
if (info->depthTestEnable && info->depthCompareOp == VK_COMPARE_OP_EQUAL)
depth_stencil.DepthBufferWriteEnable = false;
+ pipeline->writes_depth = depth_stencil.DepthBufferWriteEnable;
+ pipeline->depth_test_enable = depth_stencil.DepthTestEnable;
+
#if GEN_GEN <= 7
GENX(DEPTH_STENCIL_STATE_pack)(NULL, depth_stencil_dw, &depth_stencil);
#else
@@ -1429,6 +1434,38 @@ emit_3dstate_vf_topology(struct anv_pipeline *pipeline)
}
#endif
+static void
+compute_kill_pixel(struct anv_pipeline *pipeline,
+ const VkPipelineMultisampleStateCreateInfo *ms_info,
+ const struct anv_subpass *subpass)
+{
+ if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
+ pipeline->kill_pixel = false;
+ return;
+ }
+
+ const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
+
+ /* This computes the KillPixel portion of the computation for whether or
+ * not we want to enable the PMA fix on gen8. It's given by this chunk of
+ * the giant formula:
+ *
+ * (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
+ * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
+ * 3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
+ * 3DSTATE_PS_BLEND::AlphaTestEnable ||
+ * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable)
+ *
+ * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable is always false and so is
+ * 3DSTATE_PS_BLEND::AlphaTestEnable since Vulkan doesn't have a concept
+ * of an alpha test.
+ */
+ pipeline->kill_pixel =
+ subpass->has_ds_self_dep || wm_prog_data->uses_kill ||
+ wm_prog_data->uses_omask ||
+ (ms_info && ms_info->alphaToCoverageEnable);
+}
+
static VkResult
genX(graphics_pipeline_create)(
VkDevice _device,
@@ -1466,6 +1503,7 @@ genX(graphics_pipeline_create)(
emit_ds_state(pipeline, pCreateInfo->pDepthStencilState, pass, subpass);
emit_cb_state(pipeline, pCreateInfo->pColorBlendState,
pCreateInfo->pMultisampleState);
+ compute_kill_pixel(pipeline, pCreateInfo->pMultisampleState, subpass);
emit_urb_setup(pipeline);
--
2.5.0.400.gff86faf
More information about the mesa-dev
mailing list