[Mesa-dev] [PATCH] i965: compute DDX in a subspan based only on top row

Mon Sep 30 23:55:03 PDT 2013

On 09/30/2013 10:54 PM, Chia-I Wu wrote:
> From: Chia-I Wu <olv at lunarg.com>
> 
> Consider only the top-left and top-right pixels to approximate DDX in a 2x2
> subspan, unless the application requests a more accurate approximation via
> GL_FRAGMENT_SHADER_DERIVATIVE_HINT or this optimization is disabled from the
> new driconf option disable_derivative_optimization.
> 
> This results in a less accurate approximation.  However, it improves the
> performance of Xonotic with Ultra settings by 24.3879% +/- 0.832202% (at 95.0%
> confidence) on Haswell.  No noticeable image quality difference observed.
> 
> The improvement comes from faster sample_d.  It seems, on Haswell, some
> optimizations are introduced to allow faster sample_d when all pixels in a
> subspan have the same derivative.  I considered SAMPLE_STATE too, which allows
> one to control the quality of sample_d on Haswell.  But it gave much worse
> image quality without giving better performance comparing to this change.
> 
> No piglit quick.tests regression on Haswell, except with in-parameter-struct
> and normal-parameter-struct tests which appear to be noises.

Yeah, that's noise...I wouldn't even mention it.

> 
> Signed-off-by: Chia-I Wu <olv at lunarg.com>
> ---
>  src/mesa/drivers/dri/i965/brw_context.c        |  2 ++
>  src/mesa/drivers/dri/i965/brw_context.h        |  1 +
>  src/mesa/drivers/dri/i965/brw_fs_generator.cpp | 33 +++++++++++++++++++-------
>  src/mesa/drivers/dri/i965/brw_wm.c             | 11 +++++++++
>  src/mesa/drivers/dri/i965/brw_wm.h             |  1 +
>  src/mesa/drivers/dri/i965/intel_screen.c       |  4 ++++
>  6 files changed, 44 insertions(+), 8 deletions(-)
> 
> diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
> index 5f58a29..18b8e57 100644
> --- a/src/mesa/drivers/dri/i965/brw_context.c
> +++ b/src/mesa/drivers/dri/i965/brw_context.c
> @@ -478,6 +478,8 @@ brwCreateContext(int api,
>     brw_draw_init( brw );
>  
>     brw->precompile = driQueryOptionb(&brw->optionCache, "shader_precompile");
> +   brw->disable_derivative_optimization =
> +      driQueryOptionb(&brw->optionCache, "disable_derivative_optimization");
>  
>     ctx->Const.ContextFlags = 0;
>     if ((flags & __DRI_CTX_FLAG_FORWARD_COMPATIBLE) != 0)
> diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
> index 0f88bad..0ec1218 100644
> --- a/src/mesa/drivers/dri/i965/brw_context.h
> +++ b/src/mesa/drivers/dri/i965/brw_context.h
> @@ -1005,6 +1005,7 @@ struct brw_context
>     bool always_flush_cache;
>     bool disable_throttling;
>     bool precompile;
> +   bool disable_derivative_optimization;
>  
>     driOptionCache optionCache;
>     /** @} */
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
> index 7ce42c4..9eb5e17 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
> @@ -540,7 +540,7 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src
>   *
>   * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
>   *
> - * and we're trying to produce:
> + * Ideally, we want to produce:
>   *
>   *           DDX                     DDY
>   * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
> @@ -556,24 +556,41 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src
>   *
>   * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
>   * for each pair, and vertstride = 2 jumps us 2 elements after processing a
> - * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
> - * between each other.  We could probably do it like ddx and swizzle the right
> - * order later, but bail for now and just produce
> + * pair.  But the ideal approximation may impose a huge performance cost on
> + * sample_d.  On at least Haswell, sample_d instruction does some
> + * optimizations if the same LOD is used for all pixels in the subspan.
> + *
> + * For DDY, it's harder, as we want to produce the pairs swizzled between each
> + * other.  We could probably do it like ddx and swizzle the right order later,
> + * but bail for now and just produce
>   * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
>   */
>  void
>  fs_generator::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
>  {
> +   unsigned vstride, width;
> +
> +   if (c->key.high_quality_derivatives) {
> +      /* produce accurate derivatives */
> +      vstride = BRW_VERTICAL_STRIDE_2;
> +      width = BRW_WIDTH_2;
> +   }
> +   else {
> +      /* replicate the derivative at the top-left pixel to other pixels */
> +      vstride = BRW_VERTICAL_STRIDE_4;
> +      width = BRW_WIDTH_4;
> +   }
> +
>     struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
>  				 BRW_REGISTER_TYPE_F,
> -				 BRW_VERTICAL_STRIDE_2,
> -				 BRW_WIDTH_2,
> +				 vstride,
> +				 width,
>  				 BRW_HORIZONTAL_STRIDE_0,
>  				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
>     struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
>  				 BRW_REGISTER_TYPE_F,
> -				 BRW_VERTICAL_STRIDE_2,
> -				 BRW_WIDTH_2,
> +				 vstride,
> +				 width,
>  				 BRW_HORIZONTAL_STRIDE_0,
>  				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
>     brw_ADD(p, dst, src0, negate(src1));
> diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c
> index 3d7ca2a..5158bcc 100644
> --- a/src/mesa/drivers/dri/i965/brw_wm.c
> +++ b/src/mesa/drivers/dri/i965/brw_wm.c
> @@ -416,6 +416,16 @@ static void brw_wm_populate_key( struct brw_context *brw,
>  
>     key->line_aa = line_aa;
>  
> +   /* _NEW_HINT */
> +   if (brw->disable_derivative_optimization) {
> +      key->high_quality_derivatives =
> +         ctx->Hint.FragmentShaderDerivative != GL_FASTEST;
> +   }
> +   else {

Please use } else { here.

> +      key->high_quality_derivatives =
> +         ctx->Hint.FragmentShaderDerivative == GL_NICEST;
> +   }
> +
>     if (brw->gen < 6)
>        key->stats_wm = brw->stats_wm;
>  
> @@ -503,6 +513,7 @@ const struct brw_tracked_state brw_wm_prog = {
>  		_NEW_STENCIL |
>  		_NEW_POLYGON |
>  		_NEW_LINE |
> +		_NEW_HINT |
>  		_NEW_LIGHT |
>  		_NEW_FRAG_CLAMP |
>  		_NEW_BUFFERS |
> diff --git a/src/mesa/drivers/dri/i965/brw_wm.h b/src/mesa/drivers/dri/i965/brw_wm.h
> index f7a2c5f..aa786de 100644
> --- a/src/mesa/drivers/dri/i965/brw_wm.h
> +++ b/src/mesa/drivers/dri/i965/brw_wm.h
> @@ -66,6 +66,7 @@ struct brw_wm_prog_key {
>     GLuint render_to_fbo:1;
>     GLuint clamp_fragment_color:1;
>     GLuint line_aa:2;
> +   GLuint high_quality_derivatives:1;
>  
>     GLushort drawable_height;
>     GLbitfield64 input_slots_valid;
> diff --git a/src/mesa/drivers/dri/i965/intel_screen.c b/src/mesa/drivers/dri/i965/intel_screen.c
> index de80a00..cddc8e8 100644
> --- a/src/mesa/drivers/dri/i965/intel_screen.c
> +++ b/src/mesa/drivers/dri/i965/intel_screen.c
> @@ -58,6 +58,10 @@ PUBLIC const char __driConfigOptions[] =
>  	 DRI_CONF_DESC(en, "Enable Hierarchical Z on gen6+")
>        DRI_CONF_OPT_END
>  
> +      DRI_CONF_OPT_BEGIN_B(disable_derivative_optimization, "false")
> +	 DRI_CONF_DESC(en, "Derivatives with finer granularity by default")
> +      DRI_CONF_OPT_END
> +
>     DRI_CONF_SECTION_END
>     DRI_CONF_SECTION_QUALITY
>        DRI_CONF_FORCE_S3TC_ENABLE("false")
> 

This all looks good to me, but I'd also like to see brw_fs_precompile
changed to do:

key.high_quality_derivatives = brw->disable_derivative_optimization;

The precompile program key is our best guess what the state will be when
the program is actually used.  Here, the default should be true if the
driconf override is set (it'll be high quality unless overridden to
GL_FASTEST), and false normally (low quality unless overridden to
GL_NICEST).

With that change, this is:
Reviewed-by: Kenneth Graunke <kenneth at whitecape.org>

I'm fine with this landing, though you should get an ack from Paul or
Ian before pushing.

Thanks again for doing this!  Sorry it took so long to settle on what to do.