[Mesa-dev] [PATCH] i965: Only emit 1 viewport when possible.

Anuj Phogat anuj.phogat at gmail.com
Mon Sep 26 20:48:41 UTC 2016


On Mon, Sep 26, 2016 at 11:23 AM, Kenneth Graunke <kenneth at whitecape.org> wrote:
> In core profile, we support up to 16 viewports.  However, in the
> majority of cases, only 1 of them is actually used - we only need
> the others if the last shader stage prior to the rasterizer writes
> gl_ViewportIndex.
>
> Processing all 16 viewports adds additional CPU overhead, which hurts
> CPU-intensive workloads such as Glamor.  This meant that switching to
> core profile actually penalized Glamor to an extent, which is
> unfortunate.
>
> This patch tracks the number of relevant viewports, switching between
> 1 and ctx->Const.MaxViewports if gl_ViewportIndex is written.  A new
> BRW_NEW_VIEWPORT_COUNT flag tracks this.  This could mean re-emitting
> viewport state when switching, but hopefully this is offset by doing
> 1/16th of the work in the common case.  The new flag is also lighter
> weight than BRW_NEW_VUE_MAP_GEOM_OUT, which we were using in one case.
>
> According to Eric Anholt, this reduces the CPU overhead of scissor and
> viewport state changes in Glamor from 2.5% or so to 0.8% or so.
>
> Cc: Eric Anholt <eric at anholt.net>
> Signed-off-by: Kenneth Graunke <kenneth at whitecape.org>
> ---
>  src/mesa/drivers/dri/i965/brw_cc.c              | 10 +++++++---
>  src/mesa/drivers/dri/i965/brw_context.c         |  1 +
>  src/mesa/drivers/dri/i965/brw_context.h         |  9 +++++++++
>  src/mesa/drivers/dri/i965/brw_gs_state.c        |  6 ++++--
>  src/mesa/drivers/dri/i965/brw_state_upload.c    | 11 +++++++++++
>  src/mesa/drivers/dri/i965/gen6_clip_state.c     | 16 +++++++---------
>  src/mesa/drivers/dri/i965/gen6_scissor_state.c  | 10 +++++++---
>  src/mesa/drivers/dri/i965/gen6_viewport_state.c | 22 +++++++++++++++-------
>  src/mesa/drivers/dri/i965/gen7_viewport_state.c | 10 +++++++---
>  src/mesa/drivers/dri/i965/gen8_viewport_state.c | 10 +++++++---
>  10 files changed, 75 insertions(+), 30 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_cc.c b/src/mesa/drivers/dri/i965/brw_cc.c
> index 5c58b44..b11d7c8 100644
> --- a/src/mesa/drivers/dri/i965/brw_cc.c
> +++ b/src/mesa/drivers/dri/i965/brw_cc.c
> @@ -44,12 +44,15 @@ brw_upload_cc_vp(struct brw_context *brw)
>     struct gl_context *ctx = &brw->ctx;
>     struct brw_cc_viewport *ccv;
>
> +   /* BRW_NEW_VIEWPORT_COUNT */
> +   const unsigned viewport_count = brw->clip.viewport_count;
> +
>     ccv = brw_state_batch(brw, AUB_TRACE_CC_VP_STATE,
> -                        sizeof(*ccv) * ctx->Const.MaxViewports, 32,
> +                        sizeof(*ccv) * viewport_count, 32,
>                           &brw->cc.vp_offset);
>
>     /* _NEW_TRANSFORM */
> -   for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) {
> +   for (unsigned i = 0; i < viewport_count; i++) {
>        if (ctx->Transform.DepthClamp) {
>           /* _NEW_VIEWPORT */
>           ccv[i].min_depth = MIN2(ctx->ViewportArray[i].Near,
> @@ -77,7 +80,8 @@ const struct brw_tracked_state brw_cc_vp = {
>        .mesa = _NEW_TRANSFORM |
>                _NEW_VIEWPORT,
>        .brw = BRW_NEW_BATCH |
> -             BRW_NEW_BLORP,
> +             BRW_NEW_BLORP |
> +             BRW_NEW_VIEWPORT_COUNT,
>     },
>     .emit = brw_upload_cc_vp
>  };
> diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
> index 6efad78..b0eec16 100644
> --- a/src/mesa/drivers/dri/i965/brw_context.c
> +++ b/src/mesa/drivers/dri/i965/brw_context.c
> @@ -1085,6 +1085,7 @@ brwCreateContext(gl_api api,
>     brw->prim_restart.enable_cut_index = false;
>     brw->gs.enabled = false;
>     brw->sf.viewport_transform_enable = true;
> +   brw->clip.viewport_count = 1;
>
>     brw->predicate.state = BRW_PREDICATE_STATE_RENDER;
>
> diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
> index 00f0adc..b27fe51 100644
> --- a/src/mesa/drivers/dri/i965/brw_context.h
> +++ b/src/mesa/drivers/dri/i965/brw_context.h
> @@ -226,6 +226,7 @@ enum brw_state_id {
>     BRW_STATE_URB_SIZE,
>     BRW_STATE_CC_STATE,
>     BRW_STATE_BLORP,
> +   BRW_STATE_VIEWPORT_COUNT,
>     BRW_NUM_STATE_BITS
>  };
>
> @@ -294,6 +295,7 @@ enum brw_state_id {
>  #define BRW_NEW_PROGRAM_CACHE           (1ull << BRW_STATE_PROGRAM_CACHE)
>  #define BRW_NEW_STATE_BASE_ADDRESS      (1ull << BRW_STATE_STATE_BASE_ADDRESS)
>  #define BRW_NEW_VUE_MAP_GEOM_OUT        (1ull << BRW_STATE_VUE_MAP_GEOM_OUT)
> +#define BRW_NEW_VIEWPORT_COUNT          (1ull << BRW_STATE_VIEWPORT_COUNT)
>  #define BRW_NEW_TRANSFORM_FEEDBACK      (1ull << BRW_STATE_TRANSFORM_FEEDBACK)
>  #define BRW_NEW_RASTERIZER_DISCARD      (1ull << BRW_STATE_RASTERIZER_DISCARD)
>  #define BRW_NEW_STATS_WM                (1ull << BRW_STATE_STATS_WM)
> @@ -1160,6 +1162,13 @@ struct brw_context
>         * instead of vp_bo.
>         */
>        uint32_t vp_offset;
> +
> +      /**
> +       * The number of viewports to use.  If gl_ViewportIndex is written,
> +       * we can have up to ctx->Const.MaxViewports viewports.  If not,
> +       * the viewport index is always 0, so we only need to emit one.
> +       */
> +      uint8_t viewport_count;
>     } clip;
>
>
> diff --git a/src/mesa/drivers/dri/i965/brw_gs_state.c b/src/mesa/drivers/dri/i965/brw_gs_state.c
> index 1757201..8e3bf1e 100644
> --- a/src/mesa/drivers/dri/i965/brw_gs_state.c
> +++ b/src/mesa/drivers/dri/i965/brw_gs_state.c
> @@ -83,7 +83,8 @@ brw_upload_gs_unit(struct brw_context *brw)
>     if (unlikely(INTEL_DEBUG & DEBUG_STATS))
>        gs->thread4.stats_enable = 1;
>
> -   gs->gs6.max_vp_index = brw->ctx.Const.MaxViewports - 1;
> +   /* BRW_NEW_VIEWPORT_COUNT */
> +   gs->gs6.max_vp_index = brw->clip.viewport_count - 1;
>
>     brw->ctx.NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
>  }
> @@ -96,7 +97,8 @@ const struct brw_tracked_state brw_gs_unit = {
>                 BRW_NEW_CURBE_OFFSETS |
>                 BRW_NEW_FF_GS_PROG_DATA |
>                 BRW_NEW_PROGRAM_CACHE |
> -               BRW_NEW_URB_FENCE,
> +               BRW_NEW_URB_FENCE |
> +               BRW_NEW_VIEWPORT_COUNT,
>     },
>     .emit = brw_upload_gs_unit,
>  };
> diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c
> index 60f3be6..4f74e23 100644
> --- a/src/mesa/drivers/dri/i965/brw_state_upload.c
> +++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
> @@ -655,6 +655,7 @@ static struct dirty_bit_map brw_bits[] = {
>     DEFINE_BIT(BRW_NEW_URB_SIZE),
>     DEFINE_BIT(BRW_NEW_CC_STATE),
>     DEFINE_BIT(BRW_NEW_BLORP),
> +   DEFINE_BIT(BRW_NEW_VIEWPORT_COUNT),
>     {0, 0, 0}
>  };
>
> @@ -710,6 +711,8 @@ static inline void
>  brw_upload_programs(struct brw_context *brw,
>                      enum brw_pipeline pipeline)
>  {
> +   struct gl_context *ctx = &brw->ctx;
> +
>     if (pipeline == BRW_RENDER_PIPELINE) {
>        brw_upload_vs_prog(brw);
>        brw_upload_tess_programs(brw);
> @@ -736,6 +739,14 @@ brw_upload_programs(struct brw_context *brw,
>            old_separate != brw->vue_map_geom_out.separate)
>           brw->ctx.NewDriverState |= BRW_NEW_VUE_MAP_GEOM_OUT;
>
> +      if ((old_slots ^ brw->vue_map_geom_out.slots_valid) &
> +          VARYING_BIT_VIEWPORT) {
> +         ctx->NewDriverState |= BRW_NEW_VIEWPORT_COUNT;
> +         brw->clip.viewport_count =
> +            (brw->vue_map_geom_out.slots_valid & VARYING_BIT_VIEWPORT) ?
> +            ctx->Const.MaxViewports : 1;
> +      }
> +
>        if (brw->gen < 6) {
>           brw_setup_vue_interpolation(brw);
>           brw_upload_clip_prog(brw);
> diff --git a/src/mesa/drivers/dri/i965/gen6_clip_state.c b/src/mesa/drivers/dri/i965/gen6_clip_state.c
> index 7dc9740..9c33e67 100644
> --- a/src/mesa/drivers/dri/i965/gen6_clip_state.c
> +++ b/src/mesa/drivers/dri/i965/gen6_clip_state.c
> @@ -157,6 +157,9 @@ upload_clip_state(struct brw_context *brw)
>
>     dw2 |= GEN6_CLIP_GB_TEST;
>
> +   /* BRW_NEW_VIEWPORT_COUNT */
> +   const unsigned viewport_count = brw->clip.viewport_count;
> +
>     /* We need to disable guardband clipping if the guardband (which we always
>      * program to the maximum screen-space bounding box of 8K x 8K) will be
>      * smaller than the viewport.
> @@ -180,7 +183,7 @@ upload_clip_state(struct brw_context *brw)
>      * "objects must have a screenspace bounding box not exceeding 8K in the X
>      * or Y direction" restriction.  Instead, they're clipped.
>      */
> -   for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) {
> +   for (unsigned i = 0; i < viewport_count; i++) {
>        if (ctx->ViewportArray[i].Width > 8192 ||
>            ctx->ViewportArray[i].Height > 8192) {
>           dw2 &= ~GEN6_CLIP_GB_TEST;
> @@ -203,7 +206,7 @@ upload_clip_state(struct brw_context *brw)
>        const float fb_width = (float)_mesa_geometric_width(fb);
>        const float fb_height = (float)_mesa_geometric_height(fb);
>
> -      for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) {
> +      for (unsigned i = 0; i < viewport_count; i++) {
>           if (ctx->ViewportArray[i].X != 0 ||
>               ctx->ViewportArray[i].Y != 0 ||
>               ctx->ViewportArray[i].Width != fb_width ||
> @@ -236,11 +239,6 @@ upload_clip_state(struct brw_context *brw)
>     if (!brw_is_drawing_points(brw) && !brw_is_drawing_lines(brw))
>        dw2 |= GEN6_CLIP_XY_TEST;
>
> -   /* BRW_NEW_VUE_MAP_GEOM_OUT */
> -   const int max_vp_index =
> -      (brw->vue_map_geom_out.slots_valid & VARYING_BIT_VIEWPORT) != 0 ?
> -      ctx->Const.MaxViewports : 1;
> -
>     BEGIN_BATCH(4);
>     OUT_BATCH(_3DSTATE_CLIP << 16 | (4 - 2));
>     OUT_BATCH(dw1);
> @@ -250,7 +248,7 @@ upload_clip_state(struct brw_context *brw)
>     OUT_BATCH(U_FIXED(0.125, 3) << GEN6_CLIP_MIN_POINT_WIDTH_SHIFT |
>               U_FIXED(255.875, 3) << GEN6_CLIP_MAX_POINT_WIDTH_SHIFT |
>               (_mesa_geometric_layers(fb) > 0 ? 0 : GEN6_CLIP_FORCE_ZERO_RTAINDEX) |
> -             ((max_vp_index - 1) & GEN6_CLIP_MAX_VP_INDEX_MASK));
> +             ((viewport_count - 1) & GEN6_CLIP_MAX_VP_INDEX_MASK));
>     ADVANCE_BATCH();
>  }
>
> @@ -268,7 +266,7 @@ const struct brw_tracked_state gen6_clip_state = {
>                 BRW_NEW_PRIMITIVE |
>                 BRW_NEW_RASTERIZER_DISCARD |
>                 BRW_NEW_TES_PROG_DATA |
> -               BRW_NEW_VUE_MAP_GEOM_OUT,
> +               BRW_NEW_VIEWPORT_COUNT,
>     },
>     .emit = upload_clip_state,
>  };
> diff --git a/src/mesa/drivers/dri/i965/gen6_scissor_state.c b/src/mesa/drivers/dri/i965/gen6_scissor_state.c
> index b03ac73..860445a 100644
> --- a/src/mesa/drivers/dri/i965/gen6_scissor_state.c
> +++ b/src/mesa/drivers/dri/i965/gen6_scissor_state.c
> @@ -42,8 +42,11 @@ gen6_upload_scissor_state(struct brw_context *brw)
>     const unsigned int fb_width= _mesa_geometric_width(ctx->DrawBuffer);
>     const unsigned int fb_height = _mesa_geometric_height(ctx->DrawBuffer);
>
> +   /* BRW_NEW_VIEWPORT_COUNT */
> +   const unsigned viewport_count = brw->clip.viewport_count;
> +
>     scissor = brw_state_batch(brw, AUB_TRACE_SCISSOR_STATE,
> -                            sizeof(*scissor) * ctx->Const.MaxViewports, 32,
> +                            sizeof(*scissor) * viewport_count, 32,
>                               &scissor_state_offset);
>
>     /* _NEW_SCISSOR | _NEW_BUFFERS | _NEW_VIEWPORT */
> @@ -55,7 +58,7 @@ gen6_upload_scissor_state(struct brw_context *brw)
>      * Note that the hardware's coordinates are inclusive, while Mesa's min is
>      * inclusive but max is exclusive.
>      */
> -   for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) {
> +   for (unsigned i = 0; i < viewport_count; i++) {
>        int bbox[4];
>
>        bbox[0] = MAX2(ctx->ViewportArray[i].X, 0);
> @@ -102,7 +105,8 @@ const struct brw_tracked_state gen6_scissor_state = {
>                _NEW_SCISSOR |
>                _NEW_VIEWPORT,
>        .brw = BRW_NEW_BATCH |
> -             BRW_NEW_BLORP,
> +             BRW_NEW_BLORP |
> +             BRW_NEW_VIEWPORT_COUNT,
>     },
>     .emit = gen6_upload_scissor_state,
>  };
> diff --git a/src/mesa/drivers/dri/i965/gen6_viewport_state.c b/src/mesa/drivers/dri/i965/gen6_viewport_state.c
> index eacffb9..ad1e72d 100644
> --- a/src/mesa/drivers/dri/i965/gen6_viewport_state.c
> +++ b/src/mesa/drivers/dri/i965/gen6_viewport_state.c
> @@ -42,10 +42,13 @@ gen6_upload_clip_vp(struct brw_context *brw)
>     struct gl_context *ctx = &brw->ctx;
>     struct brw_clipper_viewport *vp;
>
> +   /* BRW_NEW_VIEWPORT_COUNT */
> +   const unsigned viewport_count = brw->clip.viewport_count;
> +
>     vp = brw_state_batch(brw, AUB_TRACE_CLIP_VP_STATE,
> -                        sizeof(*vp) * ctx->Const.MaxViewports, 32, &brw->clip.vp_offset);
> +                        sizeof(*vp) * viewport_count, 32, &brw->clip.vp_offset);
>
> -   for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) {
> +   for (unsigned i = 0; i < viewport_count; i++) {
>        /* According to the "Vertex X,Y Clamping and Quantization" section of the
>         * Strips and Fans documentation, objects must not have a screen-space
>         * extents of over 8192 pixels, or they may be mis-rasterized.  The maximum
> @@ -74,7 +77,8 @@ const struct brw_tracked_state gen6_clip_vp = {
>     .dirty = {
>        .mesa = _NEW_VIEWPORT,
>        .brw = BRW_NEW_BATCH |
> -             BRW_NEW_BLORP,
> +             BRW_NEW_BLORP |
> +             BRW_NEW_VIEWPORT_COUNT,
>     },
>     .emit = gen6_upload_clip_vp,
>  };
> @@ -87,10 +91,13 @@ gen6_upload_sf_vp(struct brw_context *brw)
>     GLfloat y_scale, y_bias;
>     const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
>
> +   /* BRW_NEW_VIEWPORT_COUNT */
> +   const unsigned viewport_count = brw->clip.viewport_count;
> +
>     sfv = brw_state_batch(brw, AUB_TRACE_SF_VP_STATE,
> -                         sizeof(*sfv) * ctx->Const.MaxViewports,
> +                         sizeof(*sfv) * viewport_count,
>                           32, &brw->sf.vp_offset);
> -   memset(sfv, 0, sizeof(*sfv) * ctx->Const.MaxViewports);
> +   memset(sfv, 0, sizeof(*sfv) * viewport_count);
>
>     /* _NEW_BUFFERS */
>     if (render_to_fbo) {
> @@ -101,7 +108,7 @@ gen6_upload_sf_vp(struct brw_context *brw)
>        y_bias = (float)_mesa_geometric_height(ctx->DrawBuffer);
>     }
>
> -   for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) {
> +   for (unsigned i = 0; i < viewport_count; i++) {
>        float scale[3], translate[3];
>
>        /* _NEW_VIEWPORT */
> @@ -123,7 +130,8 @@ const struct brw_tracked_state gen6_sf_vp = {
>        .mesa = _NEW_BUFFERS |
>                _NEW_VIEWPORT,
>        .brw = BRW_NEW_BATCH |
> -             BRW_NEW_BLORP,
> +             BRW_NEW_BLORP |
> +             BRW_NEW_VIEWPORT_COUNT,
>     },
>     .emit = gen6_upload_sf_vp,
>  };
> diff --git a/src/mesa/drivers/dri/i965/gen7_viewport_state.c b/src/mesa/drivers/dri/i965/gen7_viewport_state.c
> index 34f93af..c447331 100644
> --- a/src/mesa/drivers/dri/i965/gen7_viewport_state.c
> +++ b/src/mesa/drivers/dri/i965/gen7_viewport_state.c
> @@ -37,8 +37,11 @@ gen7_upload_sf_clip_viewport(struct brw_context *brw)
>     const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
>     struct gen7_sf_clip_viewport *vp;
>
> +   /* BRW_NEW_VIEWPORT_COUNT */
> +   const unsigned viewport_count = brw->clip.viewport_count;
> +
>     vp = brw_state_batch(brw, AUB_TRACE_SF_VP_STATE,
> -                        sizeof(*vp) * ctx->Const.MaxViewports, 64,
> +                        sizeof(*vp) * viewport_count, 64,
>                          &brw->sf.vp_offset);
>     /* Also assign to clip.vp_offset in case something uses it. */
>     brw->clip.vp_offset = brw->sf.vp_offset;
> @@ -52,7 +55,7 @@ gen7_upload_sf_clip_viewport(struct brw_context *brw)
>        y_bias = (float)_mesa_geometric_height(ctx->DrawBuffer);
>     }
>
> -   for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) {
> +   for (unsigned i = 0; i < viewport_count; i++) {
>        float scale[3], translate[3];
>        _mesa_get_viewport_xform(ctx, i, scale, translate);
>
> @@ -97,7 +100,8 @@ const struct brw_tracked_state gen7_sf_clip_viewport = {
>        .mesa = _NEW_BUFFERS |
>                _NEW_VIEWPORT,
>        .brw = BRW_NEW_BATCH |
> -             BRW_NEW_BLORP,
> +             BRW_NEW_BLORP |
> +             BRW_NEW_VIEWPORT_COUNT,
>     },
>     .emit = gen7_upload_sf_clip_viewport,
>  };
> diff --git a/src/mesa/drivers/dri/i965/gen8_viewport_state.c b/src/mesa/drivers/dri/i965/gen8_viewport_state.c
> index acaee1a..84000e3 100644
> --- a/src/mesa/drivers/dri/i965/gen8_viewport_state.c
> +++ b/src/mesa/drivers/dri/i965/gen8_viewport_state.c
> @@ -37,8 +37,11 @@ gen8_upload_sf_clip_viewport(struct brw_context *brw)
>     const float fb_height = (float)_mesa_geometric_height(ctx->DrawBuffer);
>     const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
>
> +   /* BRW_NEW_VIEWPORT_COUNT */
> +   const unsigned viewport_count = brw->clip.viewport_count;
> +
>     float *vp = brw_state_batch(brw, AUB_TRACE_SF_VP_STATE,
> -                               16 * 4 * ctx->Const.MaxViewports,
> +                               16 * 4 * viewport_count,
>                                 64, &brw->sf.vp_offset);
>     /* Also assign to clip.vp_offset in case something uses it. */
>     brw->clip.vp_offset = brw->sf.vp_offset;
> @@ -52,7 +55,7 @@ gen8_upload_sf_clip_viewport(struct brw_context *brw)
>        y_bias = fb_height;
>     }
>
> -   for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) {
> +   for (unsigned i = 0; i < viewport_count; i++) {
>        float scale[3], translate[3];
>        _mesa_get_viewport_xform(ctx, i, scale, translate);
>
> @@ -136,7 +139,8 @@ const struct brw_tracked_state gen8_sf_clip_viewport = {
>        .mesa = _NEW_BUFFERS |
>                _NEW_VIEWPORT,
>        .brw = BRW_NEW_BATCH |
> -             BRW_NEW_BLORP,
> +             BRW_NEW_BLORP |
> +             BRW_NEW_VIEWPORT_COUNT,
>     },
>     .emit = gen8_upload_sf_clip_viewport,
>  };
> --
> 2.10.0
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev

The idea looks good to me.

Acked-by: Anuj Phogat <anuj.phogat at gmail.com>


More information about the mesa-dev mailing list