[Mesa-dev] [PATCH] i965: compute DDX in a subspan based only on top row

Wed Oct 2 00:34:52 PDT 2013

On Wed, Oct 2, 2013 at 2:14 AM, Chris Forbes <chrisf at ijw.co.nz> wrote:
> With those fixes:
>
> Reviewed-by: Chris Forbes <chrisf at ijw.co.nz>
Thanks, I will push it shortly.

With this change landed, the slowness of sample_d is no longer the
bottleneck.  Instead, the lack of native SIMD16 sample_d becomes the
problem.  I have posted my other series that emulates SIMD16 sample_d
with dual SIMD8 sample_d for review.

>
> On Wed, Oct 2, 2013 at 6:38 AM, Ian Romanick <idr at freedesktop.org> wrote:
>> On 09/30/2013 10:54 PM, Chia-I Wu wrote:
>>> From: Chia-I Wu <olv at lunarg.com>
>>
>> I agree with both of Ken's comments.  With those fixed, this patch is
>>
>> Reviewed-by: Ian Romanick <ian.d.romanick at intel.com>
>>
>>> Consider only the top-left and top-right pixels to approximate DDX in a 2x2
>>> subspan, unless the application requests a more accurate approximation via
>>> GL_FRAGMENT_SHADER_DERIVATIVE_HINT or this optimization is disabled from the
>>> new driconf option disable_derivative_optimization.
>>>
>>> This results in a less accurate approximation.  However, it improves the
>>> performance of Xonotic with Ultra settings by 24.3879% +/- 0.832202% (at 95.0%
>>> confidence) on Haswell.  No noticeable image quality difference observed.
>>>
>>> The improvement comes from faster sample_d.  It seems, on Haswell, some
>>> optimizations are introduced to allow faster sample_d when all pixels in a
>>> subspan have the same derivative.  I considered SAMPLE_STATE too, which allows
>>> one to control the quality of sample_d on Haswell.  But it gave much worse
>>> image quality without giving better performance comparing to this change.
>>>
>>> No piglit quick.tests regression on Haswell, except with in-parameter-struct
>>> and normal-parameter-struct tests which appear to be noises.
>>>
>>> Signed-off-by: Chia-I Wu <olv at lunarg.com>
>>> ---
>>>  src/mesa/drivers/dri/i965/brw_context.c        |  2 ++
>>>  src/mesa/drivers/dri/i965/brw_context.h        |  1 +
>>>  src/mesa/drivers/dri/i965/brw_fs_generator.cpp | 33 +++++++++++++++++++-------
>>>  src/mesa/drivers/dri/i965/brw_wm.c             | 11 +++++++++
>>>  src/mesa/drivers/dri/i965/brw_wm.h             |  1 +
>>>  src/mesa/drivers/dri/i965/intel_screen.c       |  4 ++++
>>>  6 files changed, 44 insertions(+), 8 deletions(-)
>>>
>>> diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
>>> index 5f58a29..18b8e57 100644
>>> --- a/src/mesa/drivers/dri/i965/brw_context.c
>>> +++ b/src/mesa/drivers/dri/i965/brw_context.c
>>> @@ -478,6 +478,8 @@ brwCreateContext(int api,
>>>     brw_draw_init( brw );
>>>
>>>     brw->precompile = driQueryOptionb(&brw->optionCache, "shader_precompile");
>>> +   brw->disable_derivative_optimization =
>>> +      driQueryOptionb(&brw->optionCache, "disable_derivative_optimization");
>>>
>>>     ctx->Const.ContextFlags = 0;
>>>     if ((flags & __DRI_CTX_FLAG_FORWARD_COMPATIBLE) != 0)
>>> diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
>>> index 0f88bad..0ec1218 100644
>>> --- a/src/mesa/drivers/dri/i965/brw_context.h
>>> +++ b/src/mesa/drivers/dri/i965/brw_context.h
>>> @@ -1005,6 +1005,7 @@ struct brw_context
>>>     bool always_flush_cache;
>>>     bool disable_throttling;
>>>     bool precompile;
>>> +   bool disable_derivative_optimization;
>>>
>>>     driOptionCache optionCache;
>>>     /** @} */
>>> diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
>>> index 7ce42c4..9eb5e17 100644
>>> --- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
>>> +++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
>>> @@ -540,7 +540,7 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src
>>>   *
>>>   * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
>>>   *
>>> - * and we're trying to produce:
>>> + * Ideally, we want to produce:
>>>   *
>>>   *           DDX                     DDY
>>>   * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
>>> @@ -556,24 +556,41 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src
>>>   *
>>>   * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
>>>   * for each pair, and vertstride = 2 jumps us 2 elements after processing a
>>> - * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
>>> - * between each other.  We could probably do it like ddx and swizzle the right
>>> - * order later, but bail for now and just produce
>>> + * pair.  But the ideal approximation may impose a huge performance cost on
>>> + * sample_d.  On at least Haswell, sample_d instruction does some
>>> + * optimizations if the same LOD is used for all pixels in the subspan.
>>> + *
>>> + * For DDY, it's harder, as we want to produce the pairs swizzled between each
>>> + * other.  We could probably do it like ddx and swizzle the right order later,
>>> + * but bail for now and just produce
>>>   * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
>>>   */
>>>  void
>>>  fs_generator::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
>>>  {
>>> +   unsigned vstride, width;
>>> +
>>> +   if (c->key.high_quality_derivatives) {
>>> +      /* produce accurate derivatives */
>>> +      vstride = BRW_VERTICAL_STRIDE_2;
>>> +      width = BRW_WIDTH_2;
>>> +   }
>>> +   else {
>>> +      /* replicate the derivative at the top-left pixel to other pixels */
>>> +      vstride = BRW_VERTICAL_STRIDE_4;
>>> +      width = BRW_WIDTH_4;
>>> +   }
>>> +
>>>     struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
>>>                                BRW_REGISTER_TYPE_F,
>>> -                              BRW_VERTICAL_STRIDE_2,
>>> -                              BRW_WIDTH_2,
>>> +                              vstride,
>>> +                              width,
>>>                                BRW_HORIZONTAL_STRIDE_0,
>>>                                BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
>>>     struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
>>>                                BRW_REGISTER_TYPE_F,
>>> -                              BRW_VERTICAL_STRIDE_2,
>>> -                              BRW_WIDTH_2,
>>> +                              vstride,
>>> +                              width,
>>>                                BRW_HORIZONTAL_STRIDE_0,
>>>                                BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
>>>     brw_ADD(p, dst, src0, negate(src1));
>>> diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c
>>> index 3d7ca2a..5158bcc 100644
>>> --- a/src/mesa/drivers/dri/i965/brw_wm.c
>>> +++ b/src/mesa/drivers/dri/i965/brw_wm.c
>>> @@ -416,6 +416,16 @@ static void brw_wm_populate_key( struct brw_context *brw,
>>>
>>>     key->line_aa = line_aa;
>>>
>>> +   /* _NEW_HINT */
>>> +   if (brw->disable_derivative_optimization) {
>>> +      key->high_quality_derivatives =
>>> +         ctx->Hint.FragmentShaderDerivative != GL_FASTEST;
>>> +   }
>>> +   else {
>>> +      key->high_quality_derivatives =
>>> +         ctx->Hint.FragmentShaderDerivative == GL_NICEST;
>>> +   }
>>> +
>>>     if (brw->gen < 6)
>>>        key->stats_wm = brw->stats_wm;
>>>
>>> @@ -503,6 +513,7 @@ const struct brw_tracked_state brw_wm_prog = {
>>>               _NEW_STENCIL |
>>>               _NEW_POLYGON |
>>>               _NEW_LINE |
>>> +             _NEW_HINT |
>>>               _NEW_LIGHT |
>>>               _NEW_FRAG_CLAMP |
>>>               _NEW_BUFFERS |
>>> diff --git a/src/mesa/drivers/dri/i965/brw_wm.h b/src/mesa/drivers/dri/i965/brw_wm.h
>>> index f7a2c5f..aa786de 100644
>>> --- a/src/mesa/drivers/dri/i965/brw_wm.h
>>> +++ b/src/mesa/drivers/dri/i965/brw_wm.h
>>> @@ -66,6 +66,7 @@ struct brw_wm_prog_key {
>>>     GLuint render_to_fbo:1;
>>>     GLuint clamp_fragment_color:1;
>>>     GLuint line_aa:2;
>>> +   GLuint high_quality_derivatives:1;
>>>
>>>     GLushort drawable_height;
>>>     GLbitfield64 input_slots_valid;
>>> diff --git a/src/mesa/drivers/dri/i965/intel_screen.c b/src/mesa/drivers/dri/i965/intel_screen.c
>>> index de80a00..cddc8e8 100644
>>> --- a/src/mesa/drivers/dri/i965/intel_screen.c
>>> +++ b/src/mesa/drivers/dri/i965/intel_screen.c
>>> @@ -58,6 +58,10 @@ PUBLIC const char __driConfigOptions[] =
>>>        DRI_CONF_DESC(en, "Enable Hierarchical Z on gen6+")
>>>        DRI_CONF_OPT_END
>>>
>>> +      DRI_CONF_OPT_BEGIN_B(disable_derivative_optimization, "false")
>>> +      DRI_CONF_DESC(en, "Derivatives with finer granularity by default")
>>> +      DRI_CONF_OPT_END
>>> +
>>>     DRI_CONF_SECTION_END
>>>     DRI_CONF_SECTION_QUALITY
>>>        DRI_CONF_FORCE_S3TC_ENABLE("false")
>>>
>>

-- 
olv at LunarG.com