[Mesa-dev] [PATCH] i965: Only emit 1 viewport when possible.
Kenneth Graunke
kenneth at whitecape.org
Mon Sep 26 18:23:11 UTC 2016
In core profile, we support up to 16 viewports. However, in the
majority of cases, only 1 of them is actually used - we only need
the others if the last shader stage prior to the rasterizer writes
gl_ViewportIndex.
Processing all 16 viewports adds additional CPU overhead, which hurts
CPU-intensive workloads such as Glamor. This meant that switching to
core profile actually penalized Glamor to an extent, which is
unfortunate.
This patch tracks the number of relevant viewports: it is
ctx->Const.MaxViewports if gl_ViewportIndex is written, and 1 otherwise. A new
BRW_NEW_VIEWPORT_COUNT flag tracks this. This could mean re-emitting
viewport state when switching, but hopefully this is offset by doing
1/16th of the work in the common case. The new flag is also lighter
weight than BRW_NEW_VUE_MAP_GEOM_OUT, which we were using in one case.
According to Eric Anholt, this reduces the CPU overhead of scissor and
viewport state changes in Glamor from 2.5% or so to 0.8% or so.
Cc: Eric Anholt <eric at anholt.net>
Signed-off-by: Kenneth Graunke <kenneth at whitecape.org>
---
src/mesa/drivers/dri/i965/brw_cc.c | 10 +++++++---
src/mesa/drivers/dri/i965/brw_context.c | 1 +
src/mesa/drivers/dri/i965/brw_context.h | 9 +++++++++
src/mesa/drivers/dri/i965/brw_gs_state.c | 6 ++++--
src/mesa/drivers/dri/i965/brw_state_upload.c | 11 +++++++++++
src/mesa/drivers/dri/i965/gen6_clip_state.c | 16 +++++++---------
src/mesa/drivers/dri/i965/gen6_scissor_state.c | 10 +++++++---
src/mesa/drivers/dri/i965/gen6_viewport_state.c | 22 +++++++++++++++-------
src/mesa/drivers/dri/i965/gen7_viewport_state.c | 10 +++++++---
src/mesa/drivers/dri/i965/gen8_viewport_state.c | 10 +++++++---
10 files changed, 75 insertions(+), 30 deletions(-)
diff --git a/src/mesa/drivers/dri/i965/brw_cc.c b/src/mesa/drivers/dri/i965/brw_cc.c
index 5c58b44..b11d7c8 100644
--- a/src/mesa/drivers/dri/i965/brw_cc.c
+++ b/src/mesa/drivers/dri/i965/brw_cc.c
@@ -44,12 +44,15 @@ brw_upload_cc_vp(struct brw_context *brw)
struct gl_context *ctx = &brw->ctx;
struct brw_cc_viewport *ccv;
+ /* BRW_NEW_VIEWPORT_COUNT */
+ const unsigned viewport_count = brw->clip.viewport_count;
+
ccv = brw_state_batch(brw, AUB_TRACE_CC_VP_STATE,
- sizeof(*ccv) * ctx->Const.MaxViewports, 32,
+ sizeof(*ccv) * viewport_count, 32,
&brw->cc.vp_offset);
/* _NEW_TRANSFORM */
- for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) {
+ for (unsigned i = 0; i < viewport_count; i++) {
if (ctx->Transform.DepthClamp) {
/* _NEW_VIEWPORT */
ccv[i].min_depth = MIN2(ctx->ViewportArray[i].Near,
@@ -77,7 +80,8 @@ const struct brw_tracked_state brw_cc_vp = {
.mesa = _NEW_TRANSFORM |
_NEW_VIEWPORT,
.brw = BRW_NEW_BATCH |
- BRW_NEW_BLORP,
+ BRW_NEW_BLORP |
+ BRW_NEW_VIEWPORT_COUNT,
},
.emit = brw_upload_cc_vp
};
diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
index 6efad78..b0eec16 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -1085,6 +1085,7 @@ brwCreateContext(gl_api api,
brw->prim_restart.enable_cut_index = false;
brw->gs.enabled = false;
brw->sf.viewport_transform_enable = true;
+ brw->clip.viewport_count = 1;
brw->predicate.state = BRW_PREDICATE_STATE_RENDER;
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 00f0adc..b27fe51 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -226,6 +226,7 @@ enum brw_state_id {
BRW_STATE_URB_SIZE,
BRW_STATE_CC_STATE,
BRW_STATE_BLORP,
+ BRW_STATE_VIEWPORT_COUNT,
BRW_NUM_STATE_BITS
};
@@ -294,6 +295,7 @@ enum brw_state_id {
#define BRW_NEW_PROGRAM_CACHE (1ull << BRW_STATE_PROGRAM_CACHE)
#define BRW_NEW_STATE_BASE_ADDRESS (1ull << BRW_STATE_STATE_BASE_ADDRESS)
#define BRW_NEW_VUE_MAP_GEOM_OUT (1ull << BRW_STATE_VUE_MAP_GEOM_OUT)
+#define BRW_NEW_VIEWPORT_COUNT (1ull << BRW_STATE_VIEWPORT_COUNT)
#define BRW_NEW_TRANSFORM_FEEDBACK (1ull << BRW_STATE_TRANSFORM_FEEDBACK)
#define BRW_NEW_RASTERIZER_DISCARD (1ull << BRW_STATE_RASTERIZER_DISCARD)
#define BRW_NEW_STATS_WM (1ull << BRW_STATE_STATS_WM)
@@ -1160,6 +1162,13 @@ struct brw_context
* instead of vp_bo.
*/
uint32_t vp_offset;
+
+ /**
+ * The number of viewports to use. If gl_ViewportIndex is written,
+ * we can have up to ctx->Const.MaxViewports viewports. If not,
+ * the viewport index is always 0, so we can only emit one.
+ */
+ uint8_t viewport_count;
} clip;
diff --git a/src/mesa/drivers/dri/i965/brw_gs_state.c b/src/mesa/drivers/dri/i965/brw_gs_state.c
index 1757201..8e3bf1e 100644
--- a/src/mesa/drivers/dri/i965/brw_gs_state.c
+++ b/src/mesa/drivers/dri/i965/brw_gs_state.c
@@ -83,7 +83,8 @@ brw_upload_gs_unit(struct brw_context *brw)
if (unlikely(INTEL_DEBUG & DEBUG_STATS))
gs->thread4.stats_enable = 1;
- gs->gs6.max_vp_index = brw->ctx.Const.MaxViewports - 1;
+ /* BRW_NEW_VIEWPORT_COUNT */
+ gs->gs6.max_vp_index = brw->clip.viewport_count - 1;
brw->ctx.NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
}
@@ -96,7 +97,8 @@ const struct brw_tracked_state brw_gs_unit = {
BRW_NEW_CURBE_OFFSETS |
BRW_NEW_FF_GS_PROG_DATA |
BRW_NEW_PROGRAM_CACHE |
- BRW_NEW_URB_FENCE,
+ BRW_NEW_URB_FENCE |
+ BRW_NEW_VIEWPORT_COUNT,
},
.emit = brw_upload_gs_unit,
};
diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c
index 60f3be6..4f74e23 100644
--- a/src/mesa/drivers/dri/i965/brw_state_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
@@ -655,6 +655,7 @@ static struct dirty_bit_map brw_bits[] = {
DEFINE_BIT(BRW_NEW_URB_SIZE),
DEFINE_BIT(BRW_NEW_CC_STATE),
DEFINE_BIT(BRW_NEW_BLORP),
+ DEFINE_BIT(BRW_NEW_VIEWPORT_COUNT),
{0, 0, 0}
};
@@ -710,6 +711,8 @@ static inline void
brw_upload_programs(struct brw_context *brw,
enum brw_pipeline pipeline)
{
+ struct gl_context *ctx = &brw->ctx;
+
if (pipeline == BRW_RENDER_PIPELINE) {
brw_upload_vs_prog(brw);
brw_upload_tess_programs(brw);
@@ -736,6 +739,14 @@ brw_upload_programs(struct brw_context *brw,
old_separate != brw->vue_map_geom_out.separate)
brw->ctx.NewDriverState |= BRW_NEW_VUE_MAP_GEOM_OUT;
+ if ((old_slots ^ brw->vue_map_geom_out.slots_valid) &
+ VARYING_BIT_VIEWPORT) {
+ ctx->NewDriverState |= BRW_NEW_VIEWPORT_COUNT;
+ brw->clip.viewport_count =
+ (brw->vue_map_geom_out.slots_valid & VARYING_BIT_VIEWPORT) ?
+ ctx->Const.MaxViewports : 1;
+ }
+
if (brw->gen < 6) {
brw_setup_vue_interpolation(brw);
brw_upload_clip_prog(brw);
diff --git a/src/mesa/drivers/dri/i965/gen6_clip_state.c b/src/mesa/drivers/dri/i965/gen6_clip_state.c
index 7dc9740..9c33e67 100644
--- a/src/mesa/drivers/dri/i965/gen6_clip_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_clip_state.c
@@ -157,6 +157,9 @@ upload_clip_state(struct brw_context *brw)
dw2 |= GEN6_CLIP_GB_TEST;
+ /* BRW_NEW_VIEWPORT_COUNT */
+ const unsigned viewport_count = brw->clip.viewport_count;
+
/* We need to disable guardband clipping if the guardband (which we always
* program to the maximum screen-space bounding box of 8K x 8K) will be
* smaller than the viewport.
@@ -180,7 +183,7 @@ upload_clip_state(struct brw_context *brw)
* "objects must have a screenspace bounding box not exceeding 8K in the X
* or Y direction" restriction. Instead, they're clipped.
*/
- for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) {
+ for (unsigned i = 0; i < viewport_count; i++) {
if (ctx->ViewportArray[i].Width > 8192 ||
ctx->ViewportArray[i].Height > 8192) {
dw2 &= ~GEN6_CLIP_GB_TEST;
@@ -203,7 +206,7 @@ upload_clip_state(struct brw_context *brw)
const float fb_width = (float)_mesa_geometric_width(fb);
const float fb_height = (float)_mesa_geometric_height(fb);
- for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) {
+ for (unsigned i = 0; i < viewport_count; i++) {
if (ctx->ViewportArray[i].X != 0 ||
ctx->ViewportArray[i].Y != 0 ||
ctx->ViewportArray[i].Width != fb_width ||
@@ -236,11 +239,6 @@ upload_clip_state(struct brw_context *brw)
if (!brw_is_drawing_points(brw) && !brw_is_drawing_lines(brw))
dw2 |= GEN6_CLIP_XY_TEST;
- /* BRW_NEW_VUE_MAP_GEOM_OUT */
- const int max_vp_index =
- (brw->vue_map_geom_out.slots_valid & VARYING_BIT_VIEWPORT) != 0 ?
- ctx->Const.MaxViewports : 1;
-
BEGIN_BATCH(4);
OUT_BATCH(_3DSTATE_CLIP << 16 | (4 - 2));
OUT_BATCH(dw1);
@@ -250,7 +248,7 @@ upload_clip_state(struct brw_context *brw)
OUT_BATCH(U_FIXED(0.125, 3) << GEN6_CLIP_MIN_POINT_WIDTH_SHIFT |
U_FIXED(255.875, 3) << GEN6_CLIP_MAX_POINT_WIDTH_SHIFT |
(_mesa_geometric_layers(fb) > 0 ? 0 : GEN6_CLIP_FORCE_ZERO_RTAINDEX) |
- ((max_vp_index - 1) & GEN6_CLIP_MAX_VP_INDEX_MASK));
+ ((viewport_count - 1) & GEN6_CLIP_MAX_VP_INDEX_MASK));
ADVANCE_BATCH();
}
@@ -268,7 +266,7 @@ const struct brw_tracked_state gen6_clip_state = {
BRW_NEW_PRIMITIVE |
BRW_NEW_RASTERIZER_DISCARD |
BRW_NEW_TES_PROG_DATA |
- BRW_NEW_VUE_MAP_GEOM_OUT,
+ BRW_NEW_VIEWPORT_COUNT,
},
.emit = upload_clip_state,
};
diff --git a/src/mesa/drivers/dri/i965/gen6_scissor_state.c b/src/mesa/drivers/dri/i965/gen6_scissor_state.c
index b03ac73..860445a 100644
--- a/src/mesa/drivers/dri/i965/gen6_scissor_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_scissor_state.c
@@ -42,8 +42,11 @@ gen6_upload_scissor_state(struct brw_context *brw)
const unsigned int fb_width= _mesa_geometric_width(ctx->DrawBuffer);
const unsigned int fb_height = _mesa_geometric_height(ctx->DrawBuffer);
+ /* BRW_NEW_VIEWPORT_COUNT */
+ const unsigned viewport_count = brw->clip.viewport_count;
+
scissor = brw_state_batch(brw, AUB_TRACE_SCISSOR_STATE,
- sizeof(*scissor) * ctx->Const.MaxViewports, 32,
+ sizeof(*scissor) * viewport_count, 32,
&scissor_state_offset);
/* _NEW_SCISSOR | _NEW_BUFFERS | _NEW_VIEWPORT */
@@ -55,7 +58,7 @@ gen6_upload_scissor_state(struct brw_context *brw)
* Note that the hardware's coordinates are inclusive, while Mesa's min is
* inclusive but max is exclusive.
*/
- for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) {
+ for (unsigned i = 0; i < viewport_count; i++) {
int bbox[4];
bbox[0] = MAX2(ctx->ViewportArray[i].X, 0);
@@ -102,7 +105,8 @@ const struct brw_tracked_state gen6_scissor_state = {
_NEW_SCISSOR |
_NEW_VIEWPORT,
.brw = BRW_NEW_BATCH |
- BRW_NEW_BLORP,
+ BRW_NEW_BLORP |
+ BRW_NEW_VIEWPORT_COUNT,
},
.emit = gen6_upload_scissor_state,
};
diff --git a/src/mesa/drivers/dri/i965/gen6_viewport_state.c b/src/mesa/drivers/dri/i965/gen6_viewport_state.c
index eacffb9..ad1e72d 100644
--- a/src/mesa/drivers/dri/i965/gen6_viewport_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_viewport_state.c
@@ -42,10 +42,13 @@ gen6_upload_clip_vp(struct brw_context *brw)
struct gl_context *ctx = &brw->ctx;
struct brw_clipper_viewport *vp;
+ /* BRW_NEW_VIEWPORT_COUNT */
+ const unsigned viewport_count = brw->clip.viewport_count;
+
vp = brw_state_batch(brw, AUB_TRACE_CLIP_VP_STATE,
- sizeof(*vp) * ctx->Const.MaxViewports, 32, &brw->clip.vp_offset);
+ sizeof(*vp) * viewport_count, 32, &brw->clip.vp_offset);
- for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) {
+ for (unsigned i = 0; i < viewport_count; i++) {
/* According to the "Vertex X,Y Clamping and Quantization" section of the
* Strips and Fans documentation, objects must not have a screen-space
* extents of over 8192 pixels, or they may be mis-rasterized. The maximum
@@ -74,7 +77,8 @@ const struct brw_tracked_state gen6_clip_vp = {
.dirty = {
.mesa = _NEW_VIEWPORT,
.brw = BRW_NEW_BATCH |
- BRW_NEW_BLORP,
+ BRW_NEW_BLORP |
+ BRW_NEW_VIEWPORT_COUNT,
},
.emit = gen6_upload_clip_vp,
};
@@ -87,10 +91,13 @@ gen6_upload_sf_vp(struct brw_context *brw)
GLfloat y_scale, y_bias;
const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
+ /* BRW_NEW_VIEWPORT_COUNT */
+ const unsigned viewport_count = brw->clip.viewport_count;
+
sfv = brw_state_batch(brw, AUB_TRACE_SF_VP_STATE,
- sizeof(*sfv) * ctx->Const.MaxViewports,
+ sizeof(*sfv) * viewport_count,
32, &brw->sf.vp_offset);
- memset(sfv, 0, sizeof(*sfv) * ctx->Const.MaxViewports);
+ memset(sfv, 0, sizeof(*sfv) * viewport_count);
/* _NEW_BUFFERS */
if (render_to_fbo) {
@@ -101,7 +108,7 @@ gen6_upload_sf_vp(struct brw_context *brw)
y_bias = (float)_mesa_geometric_height(ctx->DrawBuffer);
}
- for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) {
+ for (unsigned i = 0; i < viewport_count; i++) {
float scale[3], translate[3];
/* _NEW_VIEWPORT */
@@ -123,7 +130,8 @@ const struct brw_tracked_state gen6_sf_vp = {
.mesa = _NEW_BUFFERS |
_NEW_VIEWPORT,
.brw = BRW_NEW_BATCH |
- BRW_NEW_BLORP,
+ BRW_NEW_BLORP |
+ BRW_NEW_VIEWPORT_COUNT,
},
.emit = gen6_upload_sf_vp,
};
diff --git a/src/mesa/drivers/dri/i965/gen7_viewport_state.c b/src/mesa/drivers/dri/i965/gen7_viewport_state.c
index 34f93af..c447331 100644
--- a/src/mesa/drivers/dri/i965/gen7_viewport_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_viewport_state.c
@@ -37,8 +37,11 @@ gen7_upload_sf_clip_viewport(struct brw_context *brw)
const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
struct gen7_sf_clip_viewport *vp;
+ /* BRW_NEW_VIEWPORT_COUNT */
+ const unsigned viewport_count = brw->clip.viewport_count;
+
vp = brw_state_batch(brw, AUB_TRACE_SF_VP_STATE,
- sizeof(*vp) * ctx->Const.MaxViewports, 64,
+ sizeof(*vp) * viewport_count, 64,
&brw->sf.vp_offset);
/* Also assign to clip.vp_offset in case something uses it. */
brw->clip.vp_offset = brw->sf.vp_offset;
@@ -52,7 +55,7 @@ gen7_upload_sf_clip_viewport(struct brw_context *brw)
y_bias = (float)_mesa_geometric_height(ctx->DrawBuffer);
}
- for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) {
+ for (unsigned i = 0; i < viewport_count; i++) {
float scale[3], translate[3];
_mesa_get_viewport_xform(ctx, i, scale, translate);
@@ -97,7 +100,8 @@ const struct brw_tracked_state gen7_sf_clip_viewport = {
.mesa = _NEW_BUFFERS |
_NEW_VIEWPORT,
.brw = BRW_NEW_BATCH |
- BRW_NEW_BLORP,
+ BRW_NEW_BLORP |
+ BRW_NEW_VIEWPORT_COUNT,
},
.emit = gen7_upload_sf_clip_viewport,
};
diff --git a/src/mesa/drivers/dri/i965/gen8_viewport_state.c b/src/mesa/drivers/dri/i965/gen8_viewport_state.c
index acaee1a..84000e3 100644
--- a/src/mesa/drivers/dri/i965/gen8_viewport_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_viewport_state.c
@@ -37,8 +37,11 @@ gen8_upload_sf_clip_viewport(struct brw_context *brw)
const float fb_height = (float)_mesa_geometric_height(ctx->DrawBuffer);
const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
+ /* BRW_NEW_VIEWPORT_COUNT */
+ const unsigned viewport_count = brw->clip.viewport_count;
+
float *vp = brw_state_batch(brw, AUB_TRACE_SF_VP_STATE,
- 16 * 4 * ctx->Const.MaxViewports,
+ 16 * 4 * viewport_count,
64, &brw->sf.vp_offset);
/* Also assign to clip.vp_offset in case something uses it. */
brw->clip.vp_offset = brw->sf.vp_offset;
@@ -52,7 +55,7 @@ gen8_upload_sf_clip_viewport(struct brw_context *brw)
y_bias = fb_height;
}
- for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) {
+ for (unsigned i = 0; i < viewport_count; i++) {
float scale[3], translate[3];
_mesa_get_viewport_xform(ctx, i, scale, translate);
@@ -136,7 +139,8 @@ const struct brw_tracked_state gen8_sf_clip_viewport = {
.mesa = _NEW_BUFFERS |
_NEW_VIEWPORT,
.brw = BRW_NEW_BATCH |
- BRW_NEW_BLORP,
+ BRW_NEW_BLORP |
+ BRW_NEW_VIEWPORT_COUNT,
},
.emit = gen8_upload_sf_clip_viewport,
};
--
2.10.0
More information about the mesa-dev
mailing list