[Mesa-dev] [PATCH] i965/hsw: compute DDX in a subspan based only on top row

Thu Sep 12 22:06:02 PDT 2013

From: Chia-I Wu <olv at lunarg.com>

Consider only the top-left and top-right pixels to approximate DDX in a 2x2
subspan, unless the application or the user requests a more accurate
approximation.  The result is less accurate because the bottom row of the
subspan is ignored.  However, it improves the performance of Xonotic with
Ultra settings by 24.3879% +/- 0.832202% (at 95.0% confidence) on Haswell.
No noticeable difference in image quality was observed.

No piglit gpu.tests regressions were observed (tested with v1 of this patch).

I could not come up with an explanation for the performance difference,
especially since the change does not affect Ivy Bridge.  If anyone has any
insight, please let me know.  Performance differences may also be observed in
other games that call textureGrad and dFdx.

v2: Honor GL_FRAGMENT_SHADER_DERIVATIVE_HINT and add a drirc option
    (accurate_derivative).  Update comments.

Signed-off-by: Chia-I Wu <olv at lunarg.com>
---
 src/mesa/drivers/dri/i965/brw_context.c   |  1 +
 src/mesa/drivers/dri/i965/brw_context.h   |  1 +
 src/mesa/drivers/dri/i965/brw_fs_emit.cpp | 40 ++++++++++++++++++++++++-------
 src/mesa/drivers/dri/i965/intel_screen.c  |  4 ++++
 4 files changed, 38 insertions(+), 8 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
index 4fcc9fb..1cdfb9d 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -470,6 +470,7 @@ brwCreateContext(int api,
    brw_draw_init( brw );
 
    brw->precompile = driQueryOptionb(&brw->optionCache, "shader_precompile");
+   brw->accurate_derivative = driQueryOptionb(&brw->optionCache, "accurate_derivative");
 
    ctx->Const.ContextFlags = 0;
    if ((flags & __DRI_CTX_FLAG_FORWARD_COMPATIBLE) != 0)
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index c566bba..8bfc54a 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -964,6 +964,7 @@ struct brw_context
    bool always_flush_cache;
    bool disable_throttling;
    bool precompile;
+   bool accurate_derivative;
 
    driOptionCache optionCache;
    /** @} */
diff --git a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
index bfb3d33..69aeab1 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
@@ -540,7 +540,7 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src
  *
  * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
  *
- * and we're trying to produce:
+ * Ideally, we want to produce:
  *
  *           DDX                     DDY
  * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
@@ -556,24 +556,48 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src
  *
  * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
  * for each pair, and vertstride = 2 jumps us 2 elements after processing a
- * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
- * between each other.  We could probably do it like ddx and swizzle the right
- * order later, but bail for now and just produce
+ * pair.  But the ideal approximation of DDX may impose a huge performance
+ * cost on sample_d.  As such, we favor ((ss0.tr - ss0.tl)x4 (ss1.tr -
+ * ss1.tl)x4) unless the app or the user requests otherwise.
+ *
+ * For DDY, it's harder, as we want to produce the pairs swizzled between each
+ * other.  We could probably do it like ddx and swizzle the right order later,
+ * but bail for now and just produce
  * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
  */
 void
 fs_generator::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
 {
+   unsigned vstride, width;
+
+   /* Produce accurate result only when requested.  We emit only one
+    * instruction for either case, but the problem is the result may affect
+    * how fast sample_d executes.
+    *
+    * Since the performance difference is only observed on Haswell, ignore the
+    * hints on other GENs for now.
+    */
+   if (!brw->is_haswell ||
+       brw->ctx.Hint.FragmentShaderDerivative == GL_NICEST ||
+       brw->accurate_derivative) {
+      vstride = BRW_VERTICAL_STRIDE_2;
+      width = BRW_WIDTH_2;
+   }
+   else {
+      vstride = BRW_VERTICAL_STRIDE_4;
+      width = BRW_WIDTH_4;
+   }
+
    struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
 				 BRW_REGISTER_TYPE_F,
-				 BRW_VERTICAL_STRIDE_2,
-				 BRW_WIDTH_2,
+				 vstride,
+				 width,
 				 BRW_HORIZONTAL_STRIDE_0,
 				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
    struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
 				 BRW_REGISTER_TYPE_F,
-				 BRW_VERTICAL_STRIDE_2,
-				 BRW_WIDTH_2,
+				 vstride,
+				 width,
 				 BRW_HORIZONTAL_STRIDE_0,
 				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
    brw_ADD(p, dst, src0, negate(src1));
diff --git a/src/mesa/drivers/dri/i965/intel_screen.c b/src/mesa/drivers/dri/i965/intel_screen.c
index eb6515e..ee08ffd 100644
--- a/src/mesa/drivers/dri/i965/intel_screen.c
+++ b/src/mesa/drivers/dri/i965/intel_screen.c
@@ -61,6 +61,10 @@ PUBLIC const char __driConfigOptions[] =
    DRI_CONF_SECTION_END
    DRI_CONF_SECTION_QUALITY
       DRI_CONF_FORCE_S3TC_ENABLE("false")
+
+      DRI_CONF_OPT_BEGIN_B(accurate_derivative, "false")
+	 DRI_CONF_DESC(en, "Perform more accurate derivative calculation")
+      DRI_CONF_OPT_END
    DRI_CONF_SECTION_END
    DRI_CONF_SECTION_DEBUG
       DRI_CONF_NO_RAST("false")
-- 
1.8.3.1