[Mesa-dev] [PATCH] i965: compute DDX in a subspan based only on top row
Chris Forbes
chrisf at ijw.co.nz
Tue Oct 1 11:14:02 PDT 2013
With those fixes:
Reviewed-by: Chris Forbes <chrisf at ijw.co.nz>
On Wed, Oct 2, 2013 at 6:38 AM, Ian Romanick <idr at freedesktop.org> wrote:
> On 09/30/2013 10:54 PM, Chia-I Wu wrote:
>> From: Chia-I Wu <olv at lunarg.com>
>
> I agree with both of Ken's comments. With those fixed, this patch is
>
> Reviewed-by: Ian Romanick <ian.d.romanick at intel.com>
>
>> Consider only the top-left and top-right pixels to approximate DDX in a 2x2
>> subspan, unless the application requests a more accurate approximation via
>> GL_FRAGMENT_SHADER_DERIVATIVE_HINT or this optimization is disabled from the
>> new driconf option disable_derivative_optimization.
>>
>> This results in a less accurate approximation. However, it improves the
>> performance of Xonotic with Ultra settings by 24.3879% +/- 0.832202% (at 95.0%
>> confidence) on Haswell. No noticeable image quality difference observed.
>>
>> The improvement comes from faster sample_d. It seems, on Haswell, some
>> optimizations are introduced to allow faster sample_d when all pixels in a
>> subspan have the same derivative. I considered SAMPLE_STATE too, which allows
>> one to control the quality of sample_d on Haswell. But it gave much worse
>> image quality without giving better performance comparing to this change.
>>
>> No piglit quick.tests regression on Haswell, except with in-parameter-struct
>> and normal-parameter-struct tests which appear to be noises.
>>
>> Signed-off-by: Chia-I Wu <olv at lunarg.com>
>> ---
>> src/mesa/drivers/dri/i965/brw_context.c | 2 ++
>> src/mesa/drivers/dri/i965/brw_context.h | 1 +
>> src/mesa/drivers/dri/i965/brw_fs_generator.cpp | 33 +++++++++++++++++++-------
>> src/mesa/drivers/dri/i965/brw_wm.c | 11 +++++++++
>> src/mesa/drivers/dri/i965/brw_wm.h | 1 +
>> src/mesa/drivers/dri/i965/intel_screen.c | 4 ++++
>> 6 files changed, 44 insertions(+), 8 deletions(-)
>>
>> diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
>> index 5f58a29..18b8e57 100644
>> --- a/src/mesa/drivers/dri/i965/brw_context.c
>> +++ b/src/mesa/drivers/dri/i965/brw_context.c
>> @@ -478,6 +478,8 @@ brwCreateContext(int api,
>> brw_draw_init( brw );
>>
>> brw->precompile = driQueryOptionb(&brw->optionCache, "shader_precompile");
>> + brw->disable_derivative_optimization =
>> + driQueryOptionb(&brw->optionCache, "disable_derivative_optimization");
>>
>> ctx->Const.ContextFlags = 0;
>> if ((flags & __DRI_CTX_FLAG_FORWARD_COMPATIBLE) != 0)
>> diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
>> index 0f88bad..0ec1218 100644
>> --- a/src/mesa/drivers/dri/i965/brw_context.h
>> +++ b/src/mesa/drivers/dri/i965/brw_context.h
>> @@ -1005,6 +1005,7 @@ struct brw_context
>> bool always_flush_cache;
>> bool disable_throttling;
>> bool precompile;
>> + bool disable_derivative_optimization;
>>
>> driOptionCache optionCache;
>> /** @} */
>> diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
>> index 7ce42c4..9eb5e17 100644
>> --- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
>> +++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
>> @@ -540,7 +540,7 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src
>> *
>> * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
>> *
>> - * and we're trying to produce:
>> + * Ideally, we want to produce:
>> *
>> * DDX DDY
>> * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl)
>> @@ -556,24 +556,41 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src
>> *
>> * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
>> * for each pair, and vertstride = 2 jumps us 2 elements after processing a
>> - * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
>> - * between each other. We could probably do it like ddx and swizzle the right
>> - * order later, but bail for now and just produce
>> + * pair. But the ideal approximation may impose a huge performance cost on
>> + * sample_d. On at least Haswell, sample_d instruction does some
>> + * optimizations if the same LOD is used for all pixels in the subspan.
>> + *
>> + * For DDY, it's harder, as we want to produce the pairs swizzled between each
>> + * other. We could probably do it like ddx and swizzle the right order later,
>> + * but bail for now and just produce
>> * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
>> */
>> void
>> fs_generator::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
>> {
>> + unsigned vstride, width;
>> +
>> + if (c->key.high_quality_derivatives) {
>> + /* produce accurate derivatives */
>> + vstride = BRW_VERTICAL_STRIDE_2;
>> + width = BRW_WIDTH_2;
>> + }
>> + else {
>> + /* replicate the derivative at the top-left pixel to other pixels */
>> + vstride = BRW_VERTICAL_STRIDE_4;
>> + width = BRW_WIDTH_4;
>> + }
>> +
>> struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
>> BRW_REGISTER_TYPE_F,
>> - BRW_VERTICAL_STRIDE_2,
>> - BRW_WIDTH_2,
>> + vstride,
>> + width,
>> BRW_HORIZONTAL_STRIDE_0,
>> BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
>> struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
>> BRW_REGISTER_TYPE_F,
>> - BRW_VERTICAL_STRIDE_2,
>> - BRW_WIDTH_2,
>> + vstride,
>> + width,
>> BRW_HORIZONTAL_STRIDE_0,
>> BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
>> brw_ADD(p, dst, src0, negate(src1));
>> diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c
>> index 3d7ca2a..5158bcc 100644
>> --- a/src/mesa/drivers/dri/i965/brw_wm.c
>> +++ b/src/mesa/drivers/dri/i965/brw_wm.c
>> @@ -416,6 +416,16 @@ static void brw_wm_populate_key( struct brw_context *brw,
>>
>> key->line_aa = line_aa;
>>
>> + /* _NEW_HINT */
>> + if (brw->disable_derivative_optimization) {
>> + key->high_quality_derivatives =
>> + ctx->Hint.FragmentShaderDerivative != GL_FASTEST;
>> + }
>> + else {
>> + key->high_quality_derivatives =
>> + ctx->Hint.FragmentShaderDerivative == GL_NICEST;
>> + }
>> +
>> if (brw->gen < 6)
>> key->stats_wm = brw->stats_wm;
>>
>> @@ -503,6 +513,7 @@ const struct brw_tracked_state brw_wm_prog = {
>> _NEW_STENCIL |
>> _NEW_POLYGON |
>> _NEW_LINE |
>> + _NEW_HINT |
>> _NEW_LIGHT |
>> _NEW_FRAG_CLAMP |
>> _NEW_BUFFERS |
>> diff --git a/src/mesa/drivers/dri/i965/brw_wm.h b/src/mesa/drivers/dri/i965/brw_wm.h
>> index f7a2c5f..aa786de 100644
>> --- a/src/mesa/drivers/dri/i965/brw_wm.h
>> +++ b/src/mesa/drivers/dri/i965/brw_wm.h
>> @@ -66,6 +66,7 @@ struct brw_wm_prog_key {
>> GLuint render_to_fbo:1;
>> GLuint clamp_fragment_color:1;
>> GLuint line_aa:2;
>> + GLuint high_quality_derivatives:1;
>>
>> GLushort drawable_height;
>> GLbitfield64 input_slots_valid;
>> diff --git a/src/mesa/drivers/dri/i965/intel_screen.c b/src/mesa/drivers/dri/i965/intel_screen.c
>> index de80a00..cddc8e8 100644
>> --- a/src/mesa/drivers/dri/i965/intel_screen.c
>> +++ b/src/mesa/drivers/dri/i965/intel_screen.c
>> @@ -58,6 +58,10 @@ PUBLIC const char __driConfigOptions[] =
>> DRI_CONF_DESC(en, "Enable Hierarchical Z on gen6+")
>> DRI_CONF_OPT_END
>>
>> + DRI_CONF_OPT_BEGIN_B(disable_derivative_optimization, "false")
>> + DRI_CONF_DESC(en, "Derivatives with finer granularity by default")
>> + DRI_CONF_OPT_END
>> +
>> DRI_CONF_SECTION_END
>> DRI_CONF_SECTION_QUALITY
>> DRI_CONF_FORCE_S3TC_ENABLE("false")
>>
>
More information about the mesa-dev
mailing list