[Mesa-dev] [PATCH 16/16] freedreno: a2xx: a20x hw binning

Wed Dec 19 16:40:05 UTC 2018

---
 src/gallium/drivers/freedreno/a2xx/fd2_draw.c |  32 +++-
 src/gallium/drivers/freedreno/a2xx/fd2_emit.c |  52 ++++++
 src/gallium/drivers/freedreno/a2xx/fd2_emit.h |   3 +-
 src/gallium/drivers/freedreno/a2xx/fd2_gmem.c | 150 ++++++++++++++++++
 .../drivers/freedreno/a2xx/fd2_program.c      |  11 +-
 .../drivers/freedreno/freedreno_batch.c       |   3 +
 .../drivers/freedreno/freedreno_batch.h       |   7 +
 .../drivers/freedreno/freedreno_draw.h        |   3 +
 .../drivers/freedreno/freedreno_gmem.c        |  29 +++-
 .../drivers/freedreno/freedreno_gmem.h        |   1 +
 .../drivers/freedreno/freedreno_screen.h      |   6 +
 11 files changed, 281 insertions(+), 16 deletions(-)

diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_draw.c b/src/gallium/drivers/freedreno/a2xx/fd2_draw.c
index 4e91267080..d3e440d144 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_draw.c
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_draw.c
@@ -75,11 +75,12 @@ emit_vertexbufs(struct fd_context *ctx)
 	// CONST(20,0) (or CONST(26,0) in soliv_vp)
 
 	fd2_emit_vertex_bufs(ctx->batch->draw, 0x78, bufs, vtx->num_elements);
+	fd2_emit_vertex_bufs(ctx->batch->binning, 0x78, bufs, vtx->num_elements);
 }
 
 static void
 draw_impl(struct fd_context *ctx, const struct pipe_draw_info *info,
-		   struct fd_ringbuffer *ring, unsigned index_offset)
+		   struct fd_ringbuffer *ring, unsigned index_offset, bool binning)
 {
 	OUT_PKT3(ring, CP_SET_CONSTANT, 2);
 	OUT_RING(ring, CP_REG(REG_A2XX_VGT_INDX_OFFSET));
@@ -119,8 +120,22 @@ draw_impl(struct fd_context *ctx, const struct pipe_draw_info *info,
 		OUT_RING(ring, info->min_index);        /* VGT_MIN_VTX_INDX */
 	}
 
+	/* binning shader will take offset from C64 */
+	if (binning && is_a20x(ctx->screen)) {
+		OUT_PKT3(ring, CP_SET_CONSTANT, 5);
+		OUT_RING(ring, 0x00000180);
+		OUT_RING(ring, fui(ctx->batch->num_vertices));
+		OUT_RING(ring, fui(0.0f));
+		OUT_RING(ring, fui(0.0f));
+		OUT_RING(ring, fui(0.0f));
+	}
+
+	enum pc_di_vis_cull_mode vismode = USE_VISIBILITY;
+	if (binning || info->mode == PIPE_PRIM_POINTS)
+		vismode = IGNORE_VISIBILITY;
+
 	fd_draw_emit(ctx->batch, ring, ctx->primtypes[info->mode],
-				 IGNORE_VISIBILITY, info, index_offset);
+				 vismode, info, index_offset);
 
 	if (is_a20x(ctx->screen)) {
 		/* not sure why this is required, but it fixes some hangs */
@@ -145,6 +160,9 @@ fd2_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *pinfo,
 	if (ctx->dirty & FD_DIRTY_VTXBUF)
 		emit_vertexbufs(ctx);
 
+	if (!(fd_mesa_debug & FD_DBG_NOBIN))
+		fd2_emit_state_binning(ctx, ctx->dirty);
+
 	fd2_emit_state(ctx, ctx->dirty);
 
 	/* a2xx can draw only 65535 vertices at once
@@ -166,17 +184,23 @@ fd2_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *pinfo,
 		struct pipe_draw_info info = *pinfo;
 		unsigned count = info.count;
 		unsigned step = step_tbl[info.mode];
+		unsigned num_vertices = ctx->batch->num_vertices;
 
 		if (!step)
 			return false;
 
 		for (; count + step > 32766; count -= step) {
 			info.count = MIN2(count, 32766);
-			draw_impl(ctx, &info, ctx->batch->draw, index_offset);
+			draw_impl(ctx, &info, ctx->batch->draw, index_offset, false);
+			draw_impl(ctx, &info, ctx->batch->binning, index_offset, true);
 			info.start += step;
+			ctx->batch->num_vertices += step;
 		}
+		/* changing this value is a hack, restore it */
+		ctx->batch->num_vertices = num_vertices;
 	} else {
-		draw_impl(ctx, pinfo, ctx->batch->draw, index_offset);
+		draw_impl(ctx, pinfo, ctx->batch->draw, index_offset, false);
+		draw_impl(ctx, pinfo, ctx->batch->binning, index_offset, true);
 	}
 
 	fd_context_all_clean(ctx);
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_emit.c b/src/gallium/drivers/freedreno/a2xx/fd2_emit.c
index 9628f26736..7371fa6e8c 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_emit.c
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_emit.c
@@ -185,6 +185,58 @@ fd2_emit_vertex_bufs(struct fd_ringbuffer *ring, uint32_t val,
 	}
 }
 
+void
+fd2_emit_state_binning(struct fd_context *ctx, const enum fd_dirty_3d_state dirty)
+{
+	struct fd2_blend_stateobj *blend = fd2_blend_stateobj(ctx->blend);
+	struct fd_ringbuffer *ring = ctx->batch->binning;
+
+	/* subset of fd2_emit_state needed for hw binning on a20x */
+
+	if (dirty & (FD_DIRTY_PROG | FD_DIRTY_VTXSTATE))
+		fd2_program_emit(ctx, ring, &ctx->prog);
+
+	if (dirty & (FD_DIRTY_PROG | FD_DIRTY_CONST)) {
+		emit_constants(ring,  VS_CONST_BASE * 4,
+				&ctx->constbuf[PIPE_SHADER_VERTEX],
+				(dirty & FD_DIRTY_PROG) ? ctx->prog.vp : NULL);
+	}
+
+	if (dirty & FD_DIRTY_VIEWPORT) {
+		OUT_PKT3(ring, CP_SET_CONSTANT, 9);
+		OUT_RING(ring, 0x00000184);
+		OUT_RING(ring, fui(ctx->viewport.translate[0]));
+		OUT_RING(ring, fui(ctx->viewport.translate[1]));
+		OUT_RING(ring, fui(ctx->viewport.translate[2]));
+		OUT_RING(ring, fui(0.0f));
+		OUT_RING(ring, fui(ctx->viewport.scale[0]));
+		OUT_RING(ring, fui(ctx->viewport.scale[1]));
+		OUT_RING(ring, fui(ctx->viewport.scale[2]));
+		OUT_RING(ring, fui(0.0f));
+	}
+
+	/* not sure why this is needed */
+	if (dirty & (FD_DIRTY_BLEND | FD_DIRTY_FRAMEBUFFER)) {
+		enum pipe_format format =
+			pipe_surface_format(ctx->batch->framebuffer.cbufs[0]);
+		bool has_alpha = util_format_has_alpha(format);
+
+		OUT_PKT3(ring, CP_SET_CONSTANT, 2);
+		OUT_RING(ring, CP_REG(REG_A2XX_RB_BLEND_CONTROL));
+		OUT_RING(ring, blend->rb_blendcontrol_alpha |
+			COND(has_alpha, blend->rb_blendcontrol_rgb) |
+			COND(!has_alpha, blend->rb_blendcontrol_no_alpha_rgb));
+
+		OUT_PKT3(ring, CP_SET_CONSTANT, 2);
+		OUT_RING(ring, CP_REG(REG_A2XX_RB_COLOR_MASK));
+		OUT_RING(ring, blend->rb_colormask);
+	}
+
+	OUT_PKT3(ring, CP_SET_CONSTANT, 2);
+	OUT_RING(ring, CP_REG(REG_A2XX_PA_SU_SC_MODE_CNTL));
+	OUT_RING(ring, A2XX_PA_SU_SC_MODE_CNTL_FACE_KILL_ENABLE);
+}
+
 void
 fd2_emit_state(struct fd_context *ctx, const enum fd_dirty_3d_state dirty)
 {
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_emit.h b/src/gallium/drivers/freedreno/a2xx/fd2_emit.h
index 5e4bddd1fa..891ed91e5a 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_emit.h
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_emit.h
@@ -40,7 +40,8 @@ struct fd2_vertex_buf {
 
 void fd2_emit_vertex_bufs(struct fd_ringbuffer *ring, uint32_t val,
 		struct fd2_vertex_buf *vbufs, uint32_t n);
-void fd2_emit_state(struct fd_context *ctx, enum fd_dirty_3d_state dirty);
+void fd2_emit_state_binning(struct fd_context *ctx, const enum fd_dirty_3d_state dirty);
+void fd2_emit_state(struct fd_context *ctx, const enum fd_dirty_3d_state dirty);
 void fd2_emit_restore(struct fd_context *ctx, struct fd_ringbuffer *ring);
 
 void fd2_emit_init(struct pipe_context *pctx);
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c b/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c
index e626f1ba1a..7b6bbef2fb 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c
@@ -39,6 +39,7 @@
 #include "fd2_program.h"
 #include "fd2_util.h"
 #include "fd2_zsa.h"
+#include "instr-a2xx.h"
 
 static uint32_t fmt2swap(enum pipe_format format)
 {
@@ -367,6 +368,31 @@ fd2_emit_tile_mem2gmem(struct fd_batch *batch, struct fd_tile *tile)
 	/* TODO blob driver seems to toss in a CACHE_FLUSH after each DRAW_INDX.. */
 }
 
+static void
+patch_draws(struct fd_batch *batch, enum pc_di_vis_cull_mode vismode)
+{
+	unsigned i;
+
+	if (vismode == USE_VISIBILITY)
+		return;
+
+	for (i = 0; i < batch->draw_patches.size / sizeof(uint32_t*); i++) {
+		uint32_t *ptr = *util_dynarray_element(&batch->draw_patches, uint32_t*, i);
+		unsigned cnt = ptr[0] >> 16 & 0xfff; /* 5 with idx buffer, 3 without */
+
+		/* convert CP_DRAW_INDX_BIN to a CP_DRAW_INDX
+		 * replace first two DWORDS with NOP and move the rest down
+		 * (we don't want to have to move the idx buffer reloc)
+		 */
+		ptr[0] = CP_TYPE3_PKT | (CP_NOP << 8);
+		ptr[1] = 0x00000000;
+
+		ptr[4] = ptr[2] & ~(1 << 14 | 1 << 15); /* remove cull_enable bits */
+		ptr[2] = CP_TYPE3_PKT | ((cnt-2) << 16) | (CP_DRAW_INDX << 8);
+		ptr[3] = 0x00000000;
+	}
+}
+
 static void
 fd2_emit_sysmem_prep(struct fd_batch *batch)
 {
@@ -426,6 +452,10 @@ fd2_emit_sysmem_prep(struct fd_batch *batch)
 	OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_WINDOW_OFFSET));
 	OUT_RING(ring, A2XX_PA_SC_WINDOW_OFFSET_X(0) |
 			A2XX_PA_SC_WINDOW_OFFSET_Y(0));
+
+	patch_draws(batch, IGNORE_VISIBILITY);
+	util_dynarray_resize(&batch->draw_patches, 0);
+	util_dynarray_resize(&batch->shader_patches, 0);
 }
 
 /* before first tile */
@@ -450,6 +480,109 @@ fd2_emit_tile_init(struct fd_batch *batch)
 	if (pfb->zsbuf)
 		reg |= A2XX_RB_DEPTH_INFO_DEPTH_FORMAT(fd_pipe2depth(pfb->zsbuf->format));
 	OUT_RING(ring, reg);                         /* RB_DEPTH_INFO */
+
+	/* set to zero, for some reason hardware doesn't like certain values */
+	OUT_PKT3(ring, CP_SET_CONSTANT, 2);
+	OUT_RING(ring, CP_REG(REG_A2XX_VGT_CURRENT_BIN_ID_MIN));
+	OUT_RING(ring, 0);
+
+	OUT_PKT3(ring, CP_SET_CONSTANT, 2);
+	OUT_RING(ring, CP_REG(REG_A2XX_VGT_CURRENT_BIN_ID_MAX));
+	OUT_RING(ring, 0);
+
+	if (is_a20x(ctx->screen) && !(fd_mesa_debug & FD_DBG_NOBIN) &&
+		gmem->num_vsc_pipes) {
+		/* patch out unneeded memory exports by changing EXEC CF to EXEC_END
+		 *
+		 * in the shader compiler, we guarantee that the shader ends with
+		 * a specific pattern of ALLOC/EXEC CF pairs for the hw binning exports
+		 *
+		 * since the patches point only to dwords and CFs are 1.5 dwords
+		 * the patch is aligned and might point to an ALLOC CF
+		 */
+		for (int i = 0; i < batch->shader_patches.size / sizeof(void*); i++) {
+			instr_cf_t *cf =
+				*util_dynarray_element(&batch->shader_patches, instr_cf_t*, i);
+			if (cf->opc == ALLOC)
+				cf++;
+			assert(cf->opc == EXEC);
+			assert(cf[ctx->screen->num_vsc_pipes*2-2].opc == EXEC_END);
+			cf[2*(gmem->num_vsc_pipes-1)].opc = EXEC_END;
+		}
+
+		patch_draws(batch, USE_VISIBILITY);
+
+		/* initialize shader constants for the binning memexport */
+		OUT_PKT3(ring, CP_SET_CONSTANT, 1 + gmem->num_vsc_pipes * 4);
+		OUT_RING(ring, 0x0000000C);
+
+		for (int i = 0; i < gmem->num_vsc_pipes; i++) {
+			struct fd_vsc_pipe *pipe = &ctx->vsc_pipe[i];
+
+			/* XXX we know how large this needs to be..
+			 * should do some sort of realloc
+			 * it should be ctx->batch->num_vertices bytes large
+			 * with this size it will break with more than 256k vertices..
+			 */
+			if (!pipe->bo) {
+				pipe->bo = fd_bo_new(ctx->dev, 0x40000,
+						DRM_FREEDRENO_GEM_TYPE_KMEM, "vsc_pipe[%u]", i);
+			}
+
+			/* memory export address (export32):
+			 * .x: (base_address >> 2) | 0x40000000 (?)
+			 * .y: index (float) - set by shader
+			 * .z: 0x4B00D000 (?)
+			 * .w: 0x4B000000 (?) | max_index (?)
+			*/
+			OUT_RELOCW(ring, pipe->bo, 0, 0x40000000, -2);
+			OUT_RING(ring, 0x00000000);
+			OUT_RING(ring, 0x4B00D000);
+			OUT_RING(ring, 0x4B000000 | 0x40000);
+		}
+
+		OUT_PKT3(ring, CP_SET_CONSTANT, 1 + gmem->num_vsc_pipes * 8);
+		OUT_RING(ring, 0x0000018C);
+
+		for (int i = 0; i < gmem->num_vsc_pipes; i++) {
+			struct fd_vsc_pipe *pipe = &ctx->vsc_pipe[i];
+			float off_x, off_y, mul_x, mul_y;
+
+			/* const to transform from [-1,1] to bin coordinates for this pipe
+			 * for x/y, [0,256/2040] = 0, [256/2040,512/2040] = 1, etc
+			 * 8 possible values on x/y axis,
+			 * to clip at binning stage: only use center 6x6
+			 * TODO: set the z parameters too so that hw binning
+			 * can clip primitives in Z too
+			 */
+
+			mul_x = 1.0f / (float) (gmem->bin_w * 8);
+			mul_y = 1.0f / (float) (gmem->bin_h * 8);
+			off_x = -pipe->x * (1.0/8.0f) + 0.125f - mul_x * gmem->minx;
+			off_y = -pipe->y * (1.0/8.0f) + 0.125f - mul_y * gmem->miny;
+
+			OUT_RING(ring, fui(off_x * (256.0f/255.0f)));
+			OUT_RING(ring, fui(off_y * (256.0f/255.0f)));
+			OUT_RING(ring, 0x3f000000);
+			OUT_RING(ring, fui(0.0f));
+
+			OUT_RING(ring, fui(mul_x * (256.0f/255.0f)));
+			OUT_RING(ring, fui(mul_y * (256.0f/255.0f)));
+			OUT_RING(ring, fui(0.0f));
+			OUT_RING(ring, fui(0.0f));
+		}
+
+		OUT_PKT3(ring, CP_SET_CONSTANT, 2);
+		OUT_RING(ring, CP_REG(REG_A2XX_VGT_VERTEX_REUSE_BLOCK_CNTL));
+		OUT_RING(ring, 0);
+
+		ctx->emit_ib(ring, batch->binning);
+	} else {
+		patch_draws(batch, IGNORE_VISIBILITY);
+	}
+
+	util_dynarray_resize(&batch->draw_patches, 0);
+	util_dynarray_resize(&batch->shader_patches, 0);
 }
 
 /* before mem2gmem */
@@ -478,6 +611,7 @@ fd2_emit_tile_prep(struct fd_batch *batch, struct fd_tile *tile)
 static void
 fd2_emit_tile_renderprep(struct fd_batch *batch, struct fd_tile *tile)
 {
+	struct fd_context *ctx = batch->ctx;
 	struct fd_ringbuffer *ring = batch->gmem;
 	struct pipe_framebuffer_state *pfb = &batch->framebuffer;
 	enum pipe_format format = pipe_surface_format(pfb->cbufs[0]);
@@ -504,6 +638,22 @@ fd2_emit_tile_renderprep(struct fd_batch *batch, struct fd_tile *tile)
 		OUT_RING(ring, fui(0.0f));
 		OUT_RING(ring, fui(0.0f));
 	}
+
+	if (is_a20x(ctx->screen) && !(fd_mesa_debug & FD_DBG_NOBIN)) {
+		struct fd_vsc_pipe *pipe = &ctx->vsc_pipe[tile->p];
+
+		OUT_PKT3(ring, CP_SET_CONSTANT, 2);
+		OUT_RING(ring, CP_REG(REG_A2XX_VGT_CURRENT_BIN_ID_MIN));
+		OUT_RING(ring, tile->n);
+
+		OUT_PKT3(ring, CP_SET_CONSTANT, 2);
+		OUT_RING(ring, CP_REG(REG_A2XX_VGT_CURRENT_BIN_ID_MAX));
+		OUT_RING(ring, tile->n);
+
+		/* TODO only emit this when tile->p changes */
+		OUT_PKT3(ring, CP_SET_DRAW_INIT_FLAGS, 1);
+		OUT_RELOC(ring, pipe->bo, 0, 0, 0);
+	}
 }
 
 void
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_program.c b/src/gallium/drivers/freedreno/a2xx/fd2_program.c
index da020443bd..608badda4e 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_program.c
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_program.c
@@ -65,7 +65,7 @@ delete_shader(struct fd2_shader_stateobj *so)
 
 static void
 emit(struct fd_ringbuffer *ring, gl_shader_stage type,
-	struct ir2_shader_info *info)
+	struct ir2_shader_info *info, struct util_dynarray *patches)
 {
 	unsigned i;
 
@@ -74,6 +74,10 @@ emit(struct fd_ringbuffer *ring, gl_shader_stage type,
 	OUT_PKT3(ring, CP_IM_LOAD_IMMEDIATE, 2 + info->sizedwords);
 	OUT_RING(ring, type == MESA_SHADER_FRAGMENT);
 	OUT_RING(ring, info->sizedwords);
+
+	if (patches)
+		util_dynarray_append(patches, uint32_t*, &ring->cur[info->mem_export_ptr]);
+
 	for (i = 0; i < info->sizedwords; i++)
 		OUT_RING(ring, info->dwords[i]);
 }
@@ -261,10 +265,11 @@ fd2_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring,
 			patch_fetches(ctx, fpi, NULL, &ctx->tex[PIPE_SHADER_FRAGMENT]);
 	}
 
-	emit(ring, MESA_SHADER_VERTEX, vpi);
+	emit(ring, MESA_SHADER_VERTEX, vpi,
+		binning ? &ctx->batch->shader_patches : NULL);
 
 	if (fp) {
-		emit(ring, MESA_SHADER_FRAGMENT, fpi);
+		emit(ring, MESA_SHADER_FRAGMENT, fpi, NULL);
 		fs_gprs = (fpi->max_reg < 0) ? 0x80 : fpi->max_reg;
 		vs_export = MAX2(1, f->inputs_count) - 1;
 	}
diff --git a/src/gallium/drivers/freedreno/freedreno_batch.c b/src/gallium/drivers/freedreno/freedreno_batch.c
index eae2f68ce1..03f5aba5cc 100644
--- a/src/gallium/drivers/freedreno/freedreno_batch.c
+++ b/src/gallium/drivers/freedreno/freedreno_batch.c
@@ -90,6 +90,9 @@ batch_init(struct fd_batch *batch)
 
 	util_dynarray_init(&batch->draw_patches, NULL);
 
+	if (is_a2xx(ctx->screen))
+		util_dynarray_init(&batch->shader_patches, NULL);
+
 	if (is_a3xx(ctx->screen))
 		util_dynarray_init(&batch->rbrc_patches, NULL);
 
diff --git a/src/gallium/drivers/freedreno/freedreno_batch.h b/src/gallium/drivers/freedreno/freedreno_batch.h
index a40d36094c..428a027907 100644
--- a/src/gallium/drivers/freedreno/freedreno_batch.h
+++ b/src/gallium/drivers/freedreno/freedreno_batch.h
@@ -145,6 +145,13 @@ struct fd_batch {
 	 */
 	struct util_dynarray rbrc_patches;
 
+	/* Keep track of pointer to start of MEM exports for a20x binning shaders
+	 *
+	 * this is so the end of the shader can be cut off at the right point
+	 * depending on the GMEM configuration
+	 */
+	struct util_dynarray shader_patches;
+
 	struct pipe_framebuffer_state framebuffer;
 
 	struct fd_submit *submit;
diff --git a/src/gallium/drivers/freedreno/freedreno_draw.h b/src/gallium/drivers/freedreno/freedreno_draw.h
index c2197f21a9..295c64d771 100644
--- a/src/gallium/drivers/freedreno/freedreno_draw.h
+++ b/src/gallium/drivers/freedreno/freedreno_draw.h
@@ -88,6 +88,9 @@ fd_draw(struct fd_batch *batch, struct fd_ringbuffer *ring,
 		if (idx_buffer)
 			size += 2;
 
+		if (vismode)
+			util_dynarray_append(&batch->draw_patches, uint32_t*, ring->cur);
+
 		OUT_PKT3(ring, vismode ? CP_DRAW_INDX_BIN : CP_DRAW_INDX, size);
 		OUT_RING(ring, 0x00000000);
 		OUT_RING(ring, DRAW_A20X(primtype, DI_FACE_CULL_NONE, src_sel,
diff --git a/src/gallium/drivers/freedreno/freedreno_gmem.c b/src/gallium/drivers/freedreno/freedreno_gmem.c
index bb15f0a3e1..d2483de1b0 100644
--- a/src/gallium/drivers/freedreno/freedreno_gmem.c
+++ b/src/gallium/drivers/freedreno/freedreno_gmem.c
@@ -214,12 +214,21 @@ calculate_tiles(struct fd_batch *batch)
 
 #define div_round_up(v, a)  (((v) + (a) - 1) / (a))
 	/* figure out number of tiles per pipe: */
-	tpp_x = tpp_y = 1;
-	while (div_round_up(nbins_y, tpp_y) > screen->num_vsc_pipes)
-		tpp_y += 2;
-	while ((div_round_up(nbins_y, tpp_y) *
-			div_round_up(nbins_x, tpp_x)) > screen->num_vsc_pipes)
-		tpp_x += 1;
+	if (is_a20x(ctx->screen)) {
+		/* for a20x we want to minimize the number of "pipes"
+		 * binning data has 3 bits for x/y (8x8) but the edges are used to
+		 * cull off-screen vertices with hw binning, so we have 6x6 pipes
+		 */
+		tpp_x = 6;
+		tpp_y = 6;
+	} else {
+		tpp_x = tpp_y = 1;
+		while (div_round_up(nbins_y, tpp_y) > screen->num_vsc_pipes)
+			tpp_y += 2;
+		while ((div_round_up(nbins_y, tpp_y) *
+				div_round_up(nbins_x, tpp_x)) > screen->num_vsc_pipes)
+			tpp_x += 1;
+	}
 
 	gmem->maxpw = tpp_x;
 	gmem->maxph = tpp_y;
@@ -246,6 +255,9 @@ calculate_tiles(struct fd_batch *batch)
 		xoff += tpp_x;
 	}
 
+	/* number of pipes to use for a20x */
+	gmem->num_vsc_pipes = MAX2(1, i);
+
 	for (; i < npipes; i++) {
 		struct fd_vsc_pipe *pipe = &ctx->vsc_pipe[i];
 		pipe->x = pipe->y = pipe->w = pipe->h = 0;
@@ -280,11 +292,12 @@ calculate_tiles(struct fd_batch *batch)
 
 			/* pipe number: */
 			p = ((i / tpp_y) * div_round_up(nbins_x, tpp_x)) + (j / tpp_x);
+			assert(p < gmem->num_vsc_pipes);
 
 			/* clip bin width: */
 			bw = MIN2(bin_w, minx + width - xoff);
-
-			tile->n = tile_n[p]++;
+			tile->n = !is_a20x(ctx->screen) ? tile_n[p]++ :
+				((i % tpp_y + 1) << 3 | (j % tpp_x + 1));
 			tile->p = p;
 			tile->bin_w = bw;
 			tile->bin_h = bh;
diff --git a/src/gallium/drivers/freedreno/freedreno_gmem.h b/src/gallium/drivers/freedreno/freedreno_gmem.h
index b953999ff9..70641d62f3 100644
--- a/src/gallium/drivers/freedreno/freedreno_gmem.h
+++ b/src/gallium/drivers/freedreno/freedreno_gmem.h
@@ -57,6 +57,7 @@ struct fd_gmem_stateobj {
 	uint16_t minx, miny;
 	uint16_t width, height;
 	uint16_t maxpw, maxph;   /* maximum pipe width/height */
+	uint8_t num_vsc_pipes;   /* number of pipes for a20x */
 };
 
 struct fd_batch;
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.h b/src/gallium/drivers/freedreno/freedreno_screen.h
index b289448da6..30d0c0e0b3 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.h
+++ b/src/gallium/drivers/freedreno/freedreno_screen.h
@@ -118,6 +118,12 @@ is_a20x(struct fd_screen *screen)
 	return (screen->gpu_id >= 200) && (screen->gpu_id < 210);
 }
 
+static inline boolean
+is_a2xx(struct fd_screen *screen)
+{
+	return (screen->gpu_id >= 200) && (screen->gpu_id < 300);
+}
+
 /* is a3xx patch revision 0? */
 /* TODO a306.0 probably doesn't need this.. be more clever?? */
 static inline boolean
-- 
2.17.1