[Mesa-dev] [PATCH] i965: Only emit 1 viewport when possible.
Ian Romanick
idr at freedesktop.org
Tue Oct 4 01:05:36 UTC 2016
Reviewed-by: Ian Romanick <ian.d.romanick at intel.com>
On 09/26/2016 11:23 AM, Kenneth Graunke wrote:
> In core profile, we support up to 16 viewports. However, in the
> majority of cases, only 1 of them is actually used - we only need
> the others if the last shader stage prior to the rasterizer writes
> gl_ViewportIndex.
>
> Processing all 16 viewports adds additional CPU overhead, which hurts
> CPU-intensive workloads such as Glamor. This meant that switching to
> core profile actually penalized Glamor to an extent, which is
> unfortunate.
>
> This patch tracks the number of relevant viewports, switching between
> 1 and ctx->Const.MaxViewports if gl_ViewportIndex is written. A new
> BRW_NEW_VIEWPORT_COUNT flag tracks this. This could mean re-emitting
> viewport state when switching, but hopefully this is offset by doing
> 1/16th of the work in the common case. The new flag is also lighter
> weight than BRW_NEW_VUE_MAP_GEOM_OUT, which we were using in one case.
>
> According to Eric Anholt, this reduces the CPU overhead of scissor and
> viewport state changes in Glamor from 2.5% or so to 0.8% or so.
>
> Cc: Eric Anholt <eric at anholt.net>
> Signed-off-by: Kenneth Graunke <kenneth at whitecape.org>
> ---
> src/mesa/drivers/dri/i965/brw_cc.c | 10 +++++++---
> src/mesa/drivers/dri/i965/brw_context.c | 1 +
> src/mesa/drivers/dri/i965/brw_context.h | 9 +++++++++
> src/mesa/drivers/dri/i965/brw_gs_state.c | 6 ++++--
> src/mesa/drivers/dri/i965/brw_state_upload.c | 11 +++++++++++
> src/mesa/drivers/dri/i965/gen6_clip_state.c | 16 +++++++---------
> src/mesa/drivers/dri/i965/gen6_scissor_state.c | 10 +++++++---
> src/mesa/drivers/dri/i965/gen6_viewport_state.c | 22 +++++++++++++++-------
> src/mesa/drivers/dri/i965/gen7_viewport_state.c | 10 +++++++---
> src/mesa/drivers/dri/i965/gen8_viewport_state.c | 10 +++++++---
> 10 files changed, 75 insertions(+), 30 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_cc.c b/src/mesa/drivers/dri/i965/brw_cc.c
> index 5c58b44..b11d7c8 100644
> --- a/src/mesa/drivers/dri/i965/brw_cc.c
> +++ b/src/mesa/drivers/dri/i965/brw_cc.c
> @@ -44,12 +44,15 @@ brw_upload_cc_vp(struct brw_context *brw)
> struct gl_context *ctx = &brw->ctx;
> struct brw_cc_viewport *ccv;
>
> + /* BRW_NEW_VIEWPORT_COUNT */
> + const unsigned viewport_count = brw->clip.viewport_count;
> +
> ccv = brw_state_batch(brw, AUB_TRACE_CC_VP_STATE,
> - sizeof(*ccv) * ctx->Const.MaxViewports, 32,
> + sizeof(*ccv) * viewport_count, 32,
> &brw->cc.vp_offset);
>
> /* _NEW_TRANSFORM */
> - for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) {
> + for (unsigned i = 0; i < viewport_count; i++) {
> if (ctx->Transform.DepthClamp) {
> /* _NEW_VIEWPORT */
> ccv[i].min_depth = MIN2(ctx->ViewportArray[i].Near,
> @@ -77,7 +80,8 @@ const struct brw_tracked_state brw_cc_vp = {
> .mesa = _NEW_TRANSFORM |
> _NEW_VIEWPORT,
> .brw = BRW_NEW_BATCH |
> - BRW_NEW_BLORP,
> + BRW_NEW_BLORP |
> + BRW_NEW_VIEWPORT_COUNT,
> },
> .emit = brw_upload_cc_vp
> };
> diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
> index 6efad78..b0eec16 100644
> --- a/src/mesa/drivers/dri/i965/brw_context.c
> +++ b/src/mesa/drivers/dri/i965/brw_context.c
> @@ -1085,6 +1085,7 @@ brwCreateContext(gl_api api,
> brw->prim_restart.enable_cut_index = false;
> brw->gs.enabled = false;
> brw->sf.viewport_transform_enable = true;
> + brw->clip.viewport_count = 1;
>
> brw->predicate.state = BRW_PREDICATE_STATE_RENDER;
>
> diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
> index 00f0adc..b27fe51 100644
> --- a/src/mesa/drivers/dri/i965/brw_context.h
> +++ b/src/mesa/drivers/dri/i965/brw_context.h
> @@ -226,6 +226,7 @@ enum brw_state_id {
> BRW_STATE_URB_SIZE,
> BRW_STATE_CC_STATE,
> BRW_STATE_BLORP,
> + BRW_STATE_VIEWPORT_COUNT,
> BRW_NUM_STATE_BITS
> };
>
> @@ -294,6 +295,7 @@ enum brw_state_id {
> #define BRW_NEW_PROGRAM_CACHE (1ull << BRW_STATE_PROGRAM_CACHE)
> #define BRW_NEW_STATE_BASE_ADDRESS (1ull << BRW_STATE_STATE_BASE_ADDRESS)
> #define BRW_NEW_VUE_MAP_GEOM_OUT (1ull << BRW_STATE_VUE_MAP_GEOM_OUT)
> +#define BRW_NEW_VIEWPORT_COUNT (1ull << BRW_STATE_VIEWPORT_COUNT)
> #define BRW_NEW_TRANSFORM_FEEDBACK (1ull << BRW_STATE_TRANSFORM_FEEDBACK)
> #define BRW_NEW_RASTERIZER_DISCARD (1ull << BRW_STATE_RASTERIZER_DISCARD)
> #define BRW_NEW_STATS_WM (1ull << BRW_STATE_STATS_WM)
> @@ -1160,6 +1162,13 @@ struct brw_context
> * instead of vp_bo.
> */
> uint32_t vp_offset;
> +
> + /**
> + * The number of viewports to use. If gl_ViewportIndex is written,
> + * we can have up to ctx->Const.MaxViewports viewports. If not,
> + * the viewport index is always 0, so we only need to emit one.
> + */
> + uint8_t viewport_count;
> } clip;
>
>
> diff --git a/src/mesa/drivers/dri/i965/brw_gs_state.c b/src/mesa/drivers/dri/i965/brw_gs_state.c
> index 1757201..8e3bf1e 100644
> --- a/src/mesa/drivers/dri/i965/brw_gs_state.c
> +++ b/src/mesa/drivers/dri/i965/brw_gs_state.c
> @@ -83,7 +83,8 @@ brw_upload_gs_unit(struct brw_context *brw)
> if (unlikely(INTEL_DEBUG & DEBUG_STATS))
> gs->thread4.stats_enable = 1;
>
> - gs->gs6.max_vp_index = brw->ctx.Const.MaxViewports - 1;
> + /* BRW_NEW_VIEWPORT_COUNT */
> + gs->gs6.max_vp_index = brw->clip.viewport_count - 1;
>
> brw->ctx.NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
> }
> @@ -96,7 +97,8 @@ const struct brw_tracked_state brw_gs_unit = {
> BRW_NEW_CURBE_OFFSETS |
> BRW_NEW_FF_GS_PROG_DATA |
> BRW_NEW_PROGRAM_CACHE |
> - BRW_NEW_URB_FENCE,
> + BRW_NEW_URB_FENCE |
> + BRW_NEW_VIEWPORT_COUNT,
> },
> .emit = brw_upload_gs_unit,
> };
> diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c
> index 60f3be6..4f74e23 100644
> --- a/src/mesa/drivers/dri/i965/brw_state_upload.c
> +++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
> @@ -655,6 +655,7 @@ static struct dirty_bit_map brw_bits[] = {
> DEFINE_BIT(BRW_NEW_URB_SIZE),
> DEFINE_BIT(BRW_NEW_CC_STATE),
> DEFINE_BIT(BRW_NEW_BLORP),
> + DEFINE_BIT(BRW_NEW_VIEWPORT_COUNT),
> {0, 0, 0}
> };
>
> @@ -710,6 +711,8 @@ static inline void
> brw_upload_programs(struct brw_context *brw,
> enum brw_pipeline pipeline)
> {
> + struct gl_context *ctx = &brw->ctx;
> +
> if (pipeline == BRW_RENDER_PIPELINE) {
> brw_upload_vs_prog(brw);
> brw_upload_tess_programs(brw);
> @@ -736,6 +739,14 @@ brw_upload_programs(struct brw_context *brw,
> old_separate != brw->vue_map_geom_out.separate)
> brw->ctx.NewDriverState |= BRW_NEW_VUE_MAP_GEOM_OUT;
>
> + if ((old_slots ^ brw->vue_map_geom_out.slots_valid) &
> + VARYING_BIT_VIEWPORT) {
> + ctx->NewDriverState |= BRW_NEW_VIEWPORT_COUNT;
> + brw->clip.viewport_count =
> + (brw->vue_map_geom_out.slots_valid & VARYING_BIT_VIEWPORT) ?
> + ctx->Const.MaxViewports : 1;
> + }
> +
> if (brw->gen < 6) {
> brw_setup_vue_interpolation(brw);
> brw_upload_clip_prog(brw);
> diff --git a/src/mesa/drivers/dri/i965/gen6_clip_state.c b/src/mesa/drivers/dri/i965/gen6_clip_state.c
> index 7dc9740..9c33e67 100644
> --- a/src/mesa/drivers/dri/i965/gen6_clip_state.c
> +++ b/src/mesa/drivers/dri/i965/gen6_clip_state.c
> @@ -157,6 +157,9 @@ upload_clip_state(struct brw_context *brw)
>
> dw2 |= GEN6_CLIP_GB_TEST;
>
> + /* BRW_NEW_VIEWPORT_COUNT */
> + const unsigned viewport_count = brw->clip.viewport_count;
> +
> /* We need to disable guardband clipping if the guardband (which we always
> * program to the maximum screen-space bounding box of 8K x 8K) will be
> * smaller than the viewport.
> @@ -180,7 +183,7 @@ upload_clip_state(struct brw_context *brw)
> * "objects must have a screenspace bounding box not exceeding 8K in the X
> * or Y direction" restriction. Instead, they're clipped.
> */
> - for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) {
> + for (unsigned i = 0; i < viewport_count; i++) {
> if (ctx->ViewportArray[i].Width > 8192 ||
> ctx->ViewportArray[i].Height > 8192) {
> dw2 &= ~GEN6_CLIP_GB_TEST;
> @@ -203,7 +206,7 @@ upload_clip_state(struct brw_context *brw)
> const float fb_width = (float)_mesa_geometric_width(fb);
> const float fb_height = (float)_mesa_geometric_height(fb);
>
> - for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) {
> + for (unsigned i = 0; i < viewport_count; i++) {
> if (ctx->ViewportArray[i].X != 0 ||
> ctx->ViewportArray[i].Y != 0 ||
> ctx->ViewportArray[i].Width != fb_width ||
> @@ -236,11 +239,6 @@ upload_clip_state(struct brw_context *brw)
> if (!brw_is_drawing_points(brw) && !brw_is_drawing_lines(brw))
> dw2 |= GEN6_CLIP_XY_TEST;
>
> - /* BRW_NEW_VUE_MAP_GEOM_OUT */
> - const int max_vp_index =
> - (brw->vue_map_geom_out.slots_valid & VARYING_BIT_VIEWPORT) != 0 ?
> - ctx->Const.MaxViewports : 1;
> -
> BEGIN_BATCH(4);
> OUT_BATCH(_3DSTATE_CLIP << 16 | (4 - 2));
> OUT_BATCH(dw1);
> @@ -250,7 +248,7 @@ upload_clip_state(struct brw_context *brw)
> OUT_BATCH(U_FIXED(0.125, 3) << GEN6_CLIP_MIN_POINT_WIDTH_SHIFT |
> U_FIXED(255.875, 3) << GEN6_CLIP_MAX_POINT_WIDTH_SHIFT |
> (_mesa_geometric_layers(fb) > 0 ? 0 : GEN6_CLIP_FORCE_ZERO_RTAINDEX) |
> - ((max_vp_index - 1) & GEN6_CLIP_MAX_VP_INDEX_MASK));
> + ((viewport_count - 1) & GEN6_CLIP_MAX_VP_INDEX_MASK));
> ADVANCE_BATCH();
> }
>
> @@ -268,7 +266,7 @@ const struct brw_tracked_state gen6_clip_state = {
> BRW_NEW_PRIMITIVE |
> BRW_NEW_RASTERIZER_DISCARD |
> BRW_NEW_TES_PROG_DATA |
> - BRW_NEW_VUE_MAP_GEOM_OUT,
> + BRW_NEW_VIEWPORT_COUNT,
> },
> .emit = upload_clip_state,
> };
> diff --git a/src/mesa/drivers/dri/i965/gen6_scissor_state.c b/src/mesa/drivers/dri/i965/gen6_scissor_state.c
> index b03ac73..860445a 100644
> --- a/src/mesa/drivers/dri/i965/gen6_scissor_state.c
> +++ b/src/mesa/drivers/dri/i965/gen6_scissor_state.c
> @@ -42,8 +42,11 @@ gen6_upload_scissor_state(struct brw_context *brw)
> const unsigned int fb_width= _mesa_geometric_width(ctx->DrawBuffer);
> const unsigned int fb_height = _mesa_geometric_height(ctx->DrawBuffer);
>
> + /* BRW_NEW_VIEWPORT_COUNT */
> + const unsigned viewport_count = brw->clip.viewport_count;
> +
> scissor = brw_state_batch(brw, AUB_TRACE_SCISSOR_STATE,
> - sizeof(*scissor) * ctx->Const.MaxViewports, 32,
> + sizeof(*scissor) * viewport_count, 32,
> &scissor_state_offset);
>
> /* _NEW_SCISSOR | _NEW_BUFFERS | _NEW_VIEWPORT */
> @@ -55,7 +58,7 @@ gen6_upload_scissor_state(struct brw_context *brw)
> * Note that the hardware's coordinates are inclusive, while Mesa's min is
> * inclusive but max is exclusive.
> */
> - for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) {
> + for (unsigned i = 0; i < viewport_count; i++) {
> int bbox[4];
>
> bbox[0] = MAX2(ctx->ViewportArray[i].X, 0);
> @@ -102,7 +105,8 @@ const struct brw_tracked_state gen6_scissor_state = {
> _NEW_SCISSOR |
> _NEW_VIEWPORT,
> .brw = BRW_NEW_BATCH |
> - BRW_NEW_BLORP,
> + BRW_NEW_BLORP |
> + BRW_NEW_VIEWPORT_COUNT,
> },
> .emit = gen6_upload_scissor_state,
> };
> diff --git a/src/mesa/drivers/dri/i965/gen6_viewport_state.c b/src/mesa/drivers/dri/i965/gen6_viewport_state.c
> index eacffb9..ad1e72d 100644
> --- a/src/mesa/drivers/dri/i965/gen6_viewport_state.c
> +++ b/src/mesa/drivers/dri/i965/gen6_viewport_state.c
> @@ -42,10 +42,13 @@ gen6_upload_clip_vp(struct brw_context *brw)
> struct gl_context *ctx = &brw->ctx;
> struct brw_clipper_viewport *vp;
>
> + /* BRW_NEW_VIEWPORT_COUNT */
> + const unsigned viewport_count = brw->clip.viewport_count;
> +
> vp = brw_state_batch(brw, AUB_TRACE_CLIP_VP_STATE,
> - sizeof(*vp) * ctx->Const.MaxViewports, 32, &brw->clip.vp_offset);
> + sizeof(*vp) * viewport_count, 32, &brw->clip.vp_offset);
>
> - for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) {
> + for (unsigned i = 0; i < viewport_count; i++) {
> /* According to the "Vertex X,Y Clamping and Quantization" section of the
> * Strips and Fans documentation, objects must not have a screen-space
> * extents of over 8192 pixels, or they may be mis-rasterized. The maximum
> @@ -74,7 +77,8 @@ const struct brw_tracked_state gen6_clip_vp = {
> .dirty = {
> .mesa = _NEW_VIEWPORT,
> .brw = BRW_NEW_BATCH |
> - BRW_NEW_BLORP,
> + BRW_NEW_BLORP |
> + BRW_NEW_VIEWPORT_COUNT,
> },
> .emit = gen6_upload_clip_vp,
> };
> @@ -87,10 +91,13 @@ gen6_upload_sf_vp(struct brw_context *brw)
> GLfloat y_scale, y_bias;
> const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
>
> + /* BRW_NEW_VIEWPORT_COUNT */
> + const unsigned viewport_count = brw->clip.viewport_count;
> +
> sfv = brw_state_batch(brw, AUB_TRACE_SF_VP_STATE,
> - sizeof(*sfv) * ctx->Const.MaxViewports,
> + sizeof(*sfv) * viewport_count,
> 32, &brw->sf.vp_offset);
> - memset(sfv, 0, sizeof(*sfv) * ctx->Const.MaxViewports);
> + memset(sfv, 0, sizeof(*sfv) * viewport_count);
>
> /* _NEW_BUFFERS */
> if (render_to_fbo) {
> @@ -101,7 +108,7 @@ gen6_upload_sf_vp(struct brw_context *brw)
> y_bias = (float)_mesa_geometric_height(ctx->DrawBuffer);
> }
>
> - for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) {
> + for (unsigned i = 0; i < viewport_count; i++) {
> float scale[3], translate[3];
>
> /* _NEW_VIEWPORT */
> @@ -123,7 +130,8 @@ const struct brw_tracked_state gen6_sf_vp = {
> .mesa = _NEW_BUFFERS |
> _NEW_VIEWPORT,
> .brw = BRW_NEW_BATCH |
> - BRW_NEW_BLORP,
> + BRW_NEW_BLORP |
> + BRW_NEW_VIEWPORT_COUNT,
> },
> .emit = gen6_upload_sf_vp,
> };
> diff --git a/src/mesa/drivers/dri/i965/gen7_viewport_state.c b/src/mesa/drivers/dri/i965/gen7_viewport_state.c
> index 34f93af..c447331 100644
> --- a/src/mesa/drivers/dri/i965/gen7_viewport_state.c
> +++ b/src/mesa/drivers/dri/i965/gen7_viewport_state.c
> @@ -37,8 +37,11 @@ gen7_upload_sf_clip_viewport(struct brw_context *brw)
> const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
> struct gen7_sf_clip_viewport *vp;
>
> + /* BRW_NEW_VIEWPORT_COUNT */
> + const unsigned viewport_count = brw->clip.viewport_count;
> +
> vp = brw_state_batch(brw, AUB_TRACE_SF_VP_STATE,
> - sizeof(*vp) * ctx->Const.MaxViewports, 64,
> + sizeof(*vp) * viewport_count, 64,
> &brw->sf.vp_offset);
> /* Also assign to clip.vp_offset in case something uses it. */
> brw->clip.vp_offset = brw->sf.vp_offset;
> @@ -52,7 +55,7 @@ gen7_upload_sf_clip_viewport(struct brw_context *brw)
> y_bias = (float)_mesa_geometric_height(ctx->DrawBuffer);
> }
>
> - for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) {
> + for (unsigned i = 0; i < viewport_count; i++) {
> float scale[3], translate[3];
> _mesa_get_viewport_xform(ctx, i, scale, translate);
>
> @@ -97,7 +100,8 @@ const struct brw_tracked_state gen7_sf_clip_viewport = {
> .mesa = _NEW_BUFFERS |
> _NEW_VIEWPORT,
> .brw = BRW_NEW_BATCH |
> - BRW_NEW_BLORP,
> + BRW_NEW_BLORP |
> + BRW_NEW_VIEWPORT_COUNT,
> },
> .emit = gen7_upload_sf_clip_viewport,
> };
> diff --git a/src/mesa/drivers/dri/i965/gen8_viewport_state.c b/src/mesa/drivers/dri/i965/gen8_viewport_state.c
> index acaee1a..84000e3 100644
> --- a/src/mesa/drivers/dri/i965/gen8_viewport_state.c
> +++ b/src/mesa/drivers/dri/i965/gen8_viewport_state.c
> @@ -37,8 +37,11 @@ gen8_upload_sf_clip_viewport(struct brw_context *brw)
> const float fb_height = (float)_mesa_geometric_height(ctx->DrawBuffer);
> const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
>
> + /* BRW_NEW_VIEWPORT_COUNT */
> + const unsigned viewport_count = brw->clip.viewport_count;
> +
> float *vp = brw_state_batch(brw, AUB_TRACE_SF_VP_STATE,
> - 16 * 4 * ctx->Const.MaxViewports,
> + 16 * 4 * viewport_count,
> 64, &brw->sf.vp_offset);
> /* Also assign to clip.vp_offset in case something uses it. */
> brw->clip.vp_offset = brw->sf.vp_offset;
> @@ -52,7 +55,7 @@ gen8_upload_sf_clip_viewport(struct brw_context *brw)
> y_bias = fb_height;
> }
>
> - for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) {
> + for (unsigned i = 0; i < viewport_count; i++) {
> float scale[3], translate[3];
> _mesa_get_viewport_xform(ctx, i, scale, translate);
>
> @@ -136,7 +139,8 @@ const struct brw_tracked_state gen8_sf_clip_viewport = {
> .mesa = _NEW_BUFFERS |
> _NEW_VIEWPORT,
> .brw = BRW_NEW_BATCH |
> - BRW_NEW_BLORP,
> + BRW_NEW_BLORP |
> + BRW_NEW_VIEWPORT_COUNT,
> },
> .emit = gen8_upload_sf_clip_viewport,
> };
>
More information about the mesa-dev
mailing list