xf86-video-intel: 9 commits - src/intel_dri.c src/sna/gen3_render.c src/sna/gen5_render.c src/sna/gen6_render.c src/sna/gen7_render.c src/sna/kgem.c src/sna/kgem.h src/sna/sna_dri.c src/sna/sna_render.c src/sna/sna_render.h

Chris Wilson ickle at kemper.freedesktop.org
Thu Jun 21 13:36:32 PDT 2012


 src/intel_dri.c       |   44 ++++++++++-----------
 src/sna/gen3_render.c |   17 +++++++-
 src/sna/gen5_render.c |   16 +++++++
 src/sna/gen6_render.c |   16 +++++++
 src/sna/gen7_render.c |  101 ++++++++++++++++++++++++++++++++++++++++----------
 src/sna/kgem.c        |   25 +++++++-----
 src/sna/kgem.h        |    1 
 src/sna/sna_dri.c     |   72 ++++++++++++++++++-----------------
 src/sna/sna_render.c  |    7 +++
 src/sna/sna_render.h  |    1 
 10 files changed, 211 insertions(+), 89 deletions(-)

New commits:
commit 565297e6bd3457a150036af9c62fe0dc67b794ac
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Jun 21 13:53:02 2012 +0100

    sna/gen3+: Keep vbo cached
    
    Once we switch to using a vbo, keep it cached (resetting every time it is
    idle) until we expire our caches.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/gen3_render.c b/src/sna/gen3_render.c
index 54ec9b2..b66e0e0 100644
--- a/src/sna/gen3_render.c
+++ b/src/sna/gen3_render.c
@@ -1797,6 +1797,7 @@ static int gen3_get_rectangles__flush(struct sna *sna,
 		return 0;
 	if (!kgem_check_reloc_and_exec(&sna->kgem, 1))
 		return 0;
+
 	if (op->need_magic_ca_pass && sna->render.vbo)
 		return 0;
 
@@ -1988,7 +1989,20 @@ gen3_render_retire(struct kgem *kgem)
 	struct sna *sna;
 
 	sna = container_of(kgem, struct sna, kgem);
-	if (!kgem->need_retire && kgem->nbatch == 0 && sna->render.vbo) {
+	if (kgem->nbatch == 0 && sna->render.vbo && !kgem_bo_is_busy(sna->render.vbo)) {
+		DBG(("%s: resetting idle vbo\n", __FUNCTION__));
+		sna->render.vertex_used = 0;
+		sna->render.vertex_index = 0;
+	}
+}
+
+static void
+gen3_render_expire(struct kgem *kgem)
+{
+	struct sna *sna;
+
+	sna = container_of(kgem, struct sna, kgem);
+	if (sna->render.vbo && !sna->render.vertex_used) {
 		DBG(("%s: discarding vbo\n", __FUNCTION__));
 		discard_vbo(sna);
 	}
@@ -4680,5 +4694,6 @@ Bool gen3_render_init(struct sna *sna)
 	render->max_3d_pitch = MAX_3D_PITCH;
 
 	sna->kgem.retire = gen3_render_retire;
+	sna->kgem.expire = gen3_render_expire;
 	return TRUE;
 }
diff --git a/src/sna/gen5_render.c b/src/sna/gen5_render.c
index 65c21c3..7d424aa 100644
--- a/src/sna/gen5_render.c
+++ b/src/sna/gen5_render.c
@@ -3671,7 +3671,20 @@ gen5_render_retire(struct kgem *kgem)
 	struct sna *sna;
 
 	sna = container_of(kgem, struct sna, kgem);
-	if (!kgem->need_retire && kgem->nbatch == 0 && sna->render.vbo) {
+	if (kgem->nbatch == 0 && sna->render.vbo && !kgem_bo_is_busy(sna->render.vbo)) {
+		DBG(("%s: resetting idle vbo\n", __FUNCTION__));
+		sna->render.vertex_used = 0;
+		sna->render.vertex_index = 0;
+	}
+}
+
+static void
+gen5_render_expire(struct kgem *kgem)
+{
+	struct sna *sna;
+
+	sna = container_of(kgem, struct sna, kgem);
+	if (sna->render.vbo && !sna->render.vertex_used) {
 		DBG(("%s: discarding vbo\n", __FUNCTION__));
 		discard_vbo(sna);
 	}
@@ -3944,6 +3957,7 @@ Bool gen5_render_init(struct sna *sna)
 
 	sna->kgem.context_switch = gen5_render_context_switch;
 	sna->kgem.retire = gen5_render_retire;
+	sna->kgem.expire = gen5_render_expire;
 
 	sna->render.composite = gen5_render_composite;
 #if !NO_COMPOSITE_SPANS
diff --git a/src/sna/gen6_render.c b/src/sna/gen6_render.c
index 563e04c..896693c 100644
--- a/src/sna/gen6_render.c
+++ b/src/sna/gen6_render.c
@@ -4187,7 +4187,20 @@ gen6_render_retire(struct kgem *kgem)
 		kgem->ring = kgem->mode;
 
 	sna = container_of(kgem, struct sna, kgem);
-	if (!kgem->need_retire && kgem->nbatch == 0 && sna->render.vbo) {
+	if (kgem->nbatch == 0 && sna->render.vbo && !kgem_bo_is_busy(sna->render.vbo)) {
+		DBG(("%s: resetting idle vbo\n", __FUNCTION__));
+		sna->render.vertex_used = 0;
+		sna->render.vertex_index = 0;
+	}
+}
+
+static void
+gen6_render_expire(struct kgem *kgem)
+{
+	struct sna *sna;
+
+	sna = container_of(kgem, struct sna, kgem);
+	if (sna->render.vbo && !sna->render.vertex_used) {
 		DBG(("%s: discarding vbo\n", __FUNCTION__));
 		kgem_bo_destroy(kgem, sna->render.vbo);
 		sna->render.vbo = NULL;
@@ -4273,6 +4286,7 @@ Bool gen6_render_init(struct sna *sna)
 
 	sna->kgem.context_switch = gen6_render_context_switch;
 	sna->kgem.retire = gen6_render_retire;
+	sna->kgem.expire = gen6_render_expire;
 
 	sna->render.composite = gen6_render_composite;
 #if !NO_COMPOSITE_SPANS
diff --git a/src/sna/gen7_render.c b/src/sna/gen7_render.c
index 726a67e..ea9b01a 100644
--- a/src/sna/gen7_render.c
+++ b/src/sna/gen7_render.c
@@ -4298,7 +4298,20 @@ gen7_render_retire(struct kgem *kgem)
 		kgem->ring = kgem->mode;
 
 	sna = container_of(kgem, struct sna, kgem);
-	if (!kgem->need_retire && kgem->nbatch == 0 && sna->render.vbo) {
+	if (kgem->nbatch == 0 && sna->render.vbo && !kgem_bo_is_busy(sna->render.vbo)) {
+		DBG(("%s: resetting idle vbo\n", __FUNCTION__));
+		sna->render.vertex_used = 0;
+		sna->render.vertex_index = 0;
+	}
+}
+
+static void
+gen7_render_expire(struct kgem *kgem)
+{
+	struct sna *sna;
+
+	sna = container_of(kgem, struct sna, kgem);
+	if (sna->render.vbo && !sna->render.vertex_used) {
 		DBG(("%s: discarding vbo\n", __FUNCTION__));
 		kgem_bo_destroy(kgem, sna->render.vbo);
 		sna->render.vbo = NULL;
@@ -4386,6 +4399,7 @@ Bool gen7_render_init(struct sna *sna)
 
 	sna->kgem.context_switch = gen7_render_context_switch;
 	sna->kgem.retire = gen7_render_retire;
+	sna->kgem.expire = gen7_render_expire;
 
 	sna->render.composite = gen7_render_composite;
 #if !NO_COMPOSITE_SPANS
diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 0b5ca61..9fe3661 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -2049,6 +2049,8 @@ bool kgem_expire_cache(struct kgem *kgem)
 	if (kgem->wedged)
 		kgem_cleanup(kgem);
 
+	kgem->expire(kgem);
+
 	if (kgem->need_purge)
 		kgem_purge_cache(kgem);
 
diff --git a/src/sna/kgem.h b/src/sna/kgem.h
index 408ad03..c154be5 100644
--- a/src/sna/kgem.h
+++ b/src/sna/kgem.h
@@ -169,6 +169,7 @@ struct kgem {
 
 	void (*context_switch)(struct kgem *kgem, int new_mode);
 	void (*retire)(struct kgem *kgem);
+	void (*expire)(struct kgem *kgem);
 
 	uint32_t batch[64*1024-8];
 	struct drm_i915_gem_exec_object2 exec[256];
diff --git a/src/sna/sna_render.c b/src/sna/sna_render.c
index 49f7c5e..6ddf6f3 100644
--- a/src/sna/sna_render.c
+++ b/src/sna/sna_render.c
@@ -252,6 +252,12 @@ no_render_retire(struct kgem *kgem)
 }
 
 static void
+no_render_expire(struct kgem *kgem)
+{
+	(void)kgem;
+}
+
+static void
 no_render_fini(struct sna *sna)
 {
 	(void)sna;
@@ -282,6 +288,7 @@ void no_render_init(struct sna *sna)
 
 	sna->kgem.context_switch = no_render_context_switch;
 	sna->kgem.retire = no_render_retire;
+	sna->kgem.expire = no_render_expire;
 	if (sna->kgem.gen >= 60)
 		sna->kgem.ring = KGEM_BLT;
 }
commit d806973e21cd46e605b3cd405323ae7a64c12798
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Jun 21 12:57:13 2012 +0100

    sna: Micro-optimise search_inactive_cache
    
    Discard the unneeded next parameter to drop a memory reference in a hot
    path, and don't wait for a retirement if we are looking in a larger
    bucket than suits.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 90b4c96..0b5ca61 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -2558,7 +2558,7 @@ struct kgem_bo *kgem_create_2d(struct kgem *kgem,
 			       uint32_t flags)
 {
 	struct list *cache;
-	struct kgem_bo *bo, *next;
+	struct kgem_bo *bo;
 	uint32_t pitch, untiled_pitch, tiled_height, size;
 	uint32_t handle;
 	int i, bucket, retry;
@@ -2847,7 +2847,7 @@ search_inactive:
 	/* Now just look for a close match and prefer any currently active */
 	assert(bucket < NUM_CACHE_BUCKETS);
 	cache = &kgem->inactive[bucket];
-	list_for_each_entry_safe(bo, next, cache, list) {
+	list_for_each_entry(bo, cache, list) {
 		assert(bucket(bo) == bucket);
 		assert(bo->reusable);
 
@@ -2861,10 +2861,8 @@ search_inactive:
 		    (tiling != I915_TILING_NONE && bo->pitch != pitch)) {
 			if (tiling != gem_set_tiling(kgem->fd,
 						     bo->handle,
-						     tiling, pitch)) {
-				kgem_bo_free(kgem, bo);
+						     tiling, pitch))
 				continue;
-			}
 
 			if (bo->map)
 				kgem_bo_release_map(kgem, bo);
@@ -2872,7 +2870,7 @@ search_inactive:
 
 		if (bo->purged && !kgem_bo_clear_purgeable(kgem, bo)) {
 			kgem_bo_free(kgem, bo);
-			continue;
+			break;
 		}
 
 		kgem_bo_remove_from_inactive(kgem, bo);
@@ -2903,6 +2901,7 @@ search_inactive:
 
 	if (--retry) {
 		bucket++;
+		flags &= ~CREATE_INACTIVE;
 		goto search_inactive;
 	}
 
commit d39fef0a7f3daf5c07686b44e4dea01c0f06c77a
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Jun 21 12:25:35 2012 +0100

    sna: Tiles are only 128 bytes wide on gen2
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index bb1b77d..90b4c96 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -803,11 +803,12 @@ void kgem_get_tile_size(struct kgem *kgem, int tiling,
 {
 	if (kgem->gen <= 30) {
 		if (tiling) {
-			*tile_width = 512;
 			if (kgem->gen < 30) {
+				*tile_width = 128;
 				*tile_height = 16;
 				*tile_size = 2048;
 			} else {
+				*tile_width = 512;
 				*tile_height = 8;
 				*tile_size = 4096;
 			}
@@ -853,8 +854,13 @@ static uint32_t kgem_surface_size(struct kgem *kgem,
 
 	if (kgem->gen <= 30) {
 		if (tiling) {
-			tile_width = 512;
-			tile_height = kgem->gen < 30 ? 16 : 8;
+			if (kgem->gen < 30) {
+				tile_width = 128;
+				tile_height = 16;
+			} else {
+				tile_width = 512;
+				tile_height =  8;
+			}
 		} else {
 			tile_width = 2 * bpp >> 3;
 			tile_width = ALIGN(tile_width,
commit 4f2dde1fa3b04b27bae8fc0bca9c824bd362d23b
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Jun 21 10:31:24 2012 +0100

    sna/gen7: Eliminate the pipeline stall after a non-pipelined operation
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/gen7_render.c b/src/sna/gen7_render.c
index b0f7cfc..726a67e 100644
--- a/src/sna/gen7_render.c
+++ b/src/sna/gen7_render.c
@@ -955,12 +955,12 @@ gen7_emit_vertex_elements(struct sna *sna,
 }
 
 inline static void
-gen7_emit_pipe_invalidate(struct sna *sna)
+gen7_emit_pipe_invalidate(struct sna *sna, bool stall)
 {
 	OUT_BATCH(GEN7_PIPE_CONTROL | (4 - 2));
 	OUT_BATCH(GEN7_PIPE_CONTROL_WC_FLUSH |
 		  GEN7_PIPE_CONTROL_TC_FLUSH |
-		  GEN7_PIPE_CONTROL_CS_STALL);
+		  (stall ? GEN7_PIPE_CONTROL_CS_STALL : 0));
 	OUT_BATCH(0);
 	OUT_BATCH(0);
 }
@@ -1020,7 +1020,7 @@ gen7_emit_state(struct sna *sna,
 	need_stall &= gen7_emit_drawing_rectangle(sna, op);
 
 	if (kgem_bo_is_dirty(op->src.bo) || kgem_bo_is_dirty(op->mask.bo)) {
-		gen7_emit_pipe_invalidate(sna);
+		gen7_emit_pipe_invalidate(sna, need_stall);
 		kgem_clear_dirty(&sna->kgem);
 		kgem_bo_mark_dirty(op->dst.bo);
 		need_stall = false;
@@ -1042,7 +1042,7 @@ static void gen7_magic_ca_pass(struct sna *sna,
 	DBG(("%s: CA fixup (%d -> %d)\n", __FUNCTION__,
 	     sna->render.vertex_start, sna->render.vertex_index));
 
-	gen7_emit_pipe_invalidate(sna);
+	gen7_emit_pipe_invalidate(sna, true);
 
 	gen7_emit_cc(sna, gen7_get_blend(PictOpAdd, TRUE, op->dst.format));
 	gen7_emit_wm(sna,
commit 3ef05a8d0833203e265aff392f225a11a11c2d01
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Jun 21 09:36:42 2012 +0100

    sna/gen7: Do not emit a pipeline stall after a non-pipelined command
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/gen7_render.c b/src/sna/gen7_render.c
index e3c8269..b0f7cfc 100644
--- a/src/sna/gen7_render.c
+++ b/src/sna/gen7_render.c
@@ -850,7 +850,7 @@ gen7_emit_binding_table(struct sna *sna, uint16_t offset)
 	return true;
 }
 
-static void
+static bool
 gen7_emit_drawing_rectangle(struct sna *sna,
 			    const struct sna_composite_op *op)
 {
@@ -862,7 +862,7 @@ gen7_emit_drawing_rectangle(struct sna *sna,
 
 	if (sna->render_state.gen7.drawrect_limit == limit &&
 	    sna->render_state.gen7.drawrect_offset == offset)
-		return;
+		return true;
 
 	sna->render_state.gen7.drawrect_offset = offset;
 	sna->render_state.gen7.drawrect_limit = limit;
@@ -871,6 +871,7 @@ gen7_emit_drawing_rectangle(struct sna *sna,
 	OUT_BATCH(0);
 	OUT_BATCH(limit);
 	OUT_BATCH(offset);
+	return false;
 }
 
 static void
@@ -1016,7 +1017,7 @@ gen7_emit_state(struct sna *sna,
 	gen7_emit_vertex_elements(sna, op);
 
 	need_stall |= gen7_emit_binding_table(sna, wm_binding_table);
-	gen7_emit_drawing_rectangle(sna, op);
+	need_stall &= gen7_emit_drawing_rectangle(sna, op);
 
 	if (kgem_bo_is_dirty(op->src.bo) || kgem_bo_is_dirty(op->mask.bo)) {
 		gen7_emit_pipe_invalidate(sna);
commit 4501e131e6b737cb8f2581c8b1f7ea9d29a8e912
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Jun 21 16:38:32 2012 +0100

    sna/gen7: prefer using RENDER copy
    
    Further testing and the balance of doubt swings in favour of using the
    3D pipeline for copies.
    
    For small copies the BLT unit is faster,
    2.14M/sec vs 1.71M/sec for comppixwin10
    
    And for large copies the RENDER pipeline is faster,
    13000/sec vs 8000/sec for comppixwin500
    
    I think the implication is that we are not efficiently utilising the EU
    for small primitives - i.e. something that we might be able to improve.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/gen7_render.c b/src/sna/gen7_render.c
index 9c92e5a..e3c8269 100644
--- a/src/sna/gen7_render.c
+++ b/src/sna/gen7_render.c
@@ -2448,7 +2448,7 @@ try_blt(struct sna *sna,
 	PicturePtr dst, PicturePtr src,
 	int width, int height)
 {
-	if (prefer_blt_ring(sna)) {
+	if (sna->kgem.ring == KGEM_BLT) {
 		DBG(("%s: already performing BLT\n", __FUNCTION__));
 		return TRUE;
 	}
@@ -2462,8 +2462,6 @@ try_blt(struct sna *sna,
 	if (can_switch_rings(sna)) {
 		if (sna_picture_is_solid(src, NULL))
 			return TRUE;
-		if (src->pDrawable)
-			return TRUE;
 	}
 
 	return FALSE;
@@ -3329,7 +3327,7 @@ static inline bool prefer_blt_copy(struct sna *sna,
 				   PixmapPtr src, struct kgem_bo *src_bo,
 				   PixmapPtr dst, struct kgem_bo *dst_bo)
 {
-	return (prefer_blt_ring(sna) ||
+	return (sna->kgem.ring == KGEM_BLT ||
 		prefer_blt_bo(sna, src, src_bo) ||
 		prefer_blt_bo(sna, dst, dst_bo));
 }
commit 3da56c48b7820ec77d704c5a16670eb86a6f673f
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Jun 20 15:14:23 2012 +0100

    sna/gen7: Prefer using BLT rather than redirect for copies
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/gen7_render.c b/src/sna/gen7_render.c
index 3a9a856..9c92e5a 100644
--- a/src/sna/gen7_render.c
+++ b/src/sna/gen7_render.c
@@ -3400,6 +3400,32 @@ gen7_render_copy_boxes(struct sna *sna, uint8_t alu,
 			       box, n))
 		return TRUE;
 
+	if ((too_large(dst->drawable.width, dst->drawable.height) ||
+	     too_large(src->drawable.width, src->drawable.height)) &&
+	    sna_blt_compare_depth(&src->drawable, &dst->drawable)) {
+		BoxRec extents = box[0];
+		int i;
+
+		for (i = 1; i < n; i++) {
+			if (box[i].x1 < extents.x1)
+				extents.x1 = box[i].x1;
+			if (box[i].y1 < extents.y1)
+				extents.y1 = box[i].y1;
+
+			if (box[i].x2 > extents.x2)
+				extents.x2 = box[i].x2;
+			if (box[i].y2 > extents.y2)
+				extents.y2 = box[i].y2;
+		}
+		if (too_large(extents.x2 - extents.x1, extents.y2 - extents.y1) &&
+		    sna_blt_copy_boxes(sna, alu,
+				       src_bo, src_dx, src_dy,
+				       dst_bo, dst_dx, dst_dy,
+				       dst->drawable.bitsPerPixel,
+				       box, n))
+			return TRUE;
+	}
+
 	if (!(alu == GXcopy || alu == GXclear) ||
 	    overlaps(src_bo, src_dx, src_dy,
 		     dst_bo, dst_dx, dst_dy,
@@ -3449,6 +3475,7 @@ fallback_blt:
 			if (box[i].y2 > extents.y2)
 				extents.y2 = box[i].y2;
 		}
+
 		if (!sna_render_composite_redirect(sna, &tmp,
 						   extents.x1 + dst_dx,
 						   extents.y1 + dst_dy,
commit b1f8386db6e9b3eea1bdbf8cde90f33792640ce8
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Jun 20 23:28:14 2012 +0100

    sna/gen7: Emit a pipeline flush after every render operation
    
    For whatever reason, this produces a 30% improvement with the fish-demo
    (500 -> 660 fps on i7-3730qm at 1024x768). However, it does cause about
    a 5% regression in aa10text. We can appear to alleviate that by only
    doing the flush when the composite op != PictOpSrc.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/gen7_render.c b/src/sna/gen7_render.c
index c474767..3a9a856 100644
--- a/src/sna/gen7_render.c
+++ b/src/sna/gen7_render.c
@@ -953,8 +953,8 @@ gen7_emit_vertex_elements(struct sna *sna,
 	}
 }
 
-static void
-gen7_emit_flush(struct sna *sna)
+inline static void
+gen7_emit_pipe_invalidate(struct sna *sna)
 {
 	OUT_BATCH(GEN7_PIPE_CONTROL | (4 - 2));
 	OUT_BATCH(GEN7_PIPE_CONTROL_WC_FLUSH |
@@ -964,6 +964,25 @@ gen7_emit_flush(struct sna *sna)
 	OUT_BATCH(0);
 }
 
+inline static void
+gen7_emit_pipe_flush(struct sna *sna)
+{
+	OUT_BATCH(GEN7_PIPE_CONTROL | (4 - 2));
+	OUT_BATCH(GEN7_PIPE_CONTROL_WC_FLUSH);
+	OUT_BATCH(0);
+	OUT_BATCH(0);
+}
+
+inline static void
+gen7_emit_pipe_stall(struct sna *sna)
+{
+	OUT_BATCH(GEN7_PIPE_CONTROL | (4 - 2));
+	OUT_BATCH(GEN7_PIPE_CONTROL_CS_STALL |
+		  GEN7_PIPE_CONTROL_STALL_AT_SCOREBOARD);
+	OUT_BATCH(0);
+	OUT_BATCH(0);
+}
+
 static void
 gen7_emit_state(struct sna *sna,
 		const struct sna_composite_op *op,
@@ -1000,18 +1019,15 @@ gen7_emit_state(struct sna *sna,
 	gen7_emit_drawing_rectangle(sna, op);
 
 	if (kgem_bo_is_dirty(op->src.bo) || kgem_bo_is_dirty(op->mask.bo)) {
-		gen7_emit_flush(sna);
+		gen7_emit_pipe_invalidate(sna);
 		kgem_clear_dirty(&sna->kgem);
 		kgem_bo_mark_dirty(op->dst.bo);
 		need_stall = false;
 	}
-	if (need_stall) {
-		OUT_BATCH(GEN7_PIPE_CONTROL | (4 - 2));
-		OUT_BATCH(GEN7_PIPE_CONTROL_CS_STALL |
-			  GEN7_PIPE_CONTROL_STALL_AT_SCOREBOARD);
-		OUT_BATCH(0);
-		OUT_BATCH(0);
-	}
+	if (need_stall)
+		gen7_emit_pipe_stall(sna);
+
+	sna->render_state.gen7.emit_flush = op->op != PictOpSrc;
 }
 
 static void gen7_magic_ca_pass(struct sna *sna,
@@ -1025,7 +1041,7 @@ static void gen7_magic_ca_pass(struct sna *sna,
 	DBG(("%s: CA fixup (%d -> %d)\n", __FUNCTION__,
 	     sna->render.vertex_start, sna->render.vertex_index));
 
-	gen7_emit_flush(sna);
+	gen7_emit_pipe_invalidate(sna);
 
 	gen7_emit_cc(sna, gen7_get_blend(PictOpAdd, TRUE, op->dst.format));
 	gen7_emit_wm(sna,
@@ -1055,6 +1071,11 @@ static void gen7_vertex_flush(struct sna *sna)
 	sna->kgem.batch[sna->render_state.gen7.vertex_offset] =
 		sna->render.vertex_index - sna->render.vertex_start;
 	sna->render_state.gen7.vertex_offset = 0;
+
+	if (sna->render_state.gen7.emit_flush) {
+		gen7_emit_pipe_flush(sna);
+		sna->render_state.gen7.emit_flush = false;
+	}
 }
 
 static int gen7_vertex_finish(struct sna *sna)
diff --git a/src/sna/sna_render.h b/src/sna/sna_render.h
index fd42b21..65ca359 100644
--- a/src/sna/sna_render.h
+++ b/src/sna/sna_render.h
@@ -461,6 +461,7 @@ struct gen7_render_state {
 	uint16_t surface_table;
 
 	Bool needs_invariant;
+	Bool emit_flush;
 };
 
 struct sna_static_stream {
commit d02e6d81420a114c9622bbdaf90fc3ae5d4b15a7
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Jun 21 16:10:02 2012 +0100

    Encode the third pipe using the HIGH_CRTC shift for vblanks
    
    The original vblank interface only understood 2 pipes (primary and
    secondary) and so selecting the third pipe (introduced with IvyBridge)
    requires use of the HIGH_CRTC. Using the second pipe where we meant the
    third pipe could result in some spurious timings when waiting on the
    vblank.
    
    Reported-by: Adam Jackson <ajax at redhat.com>
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/intel_dri.c b/src/intel_dri.c
index 88ab249..ed5078e 100644
--- a/src/intel_dri.c
+++ b/src/intel_dri.c
@@ -1238,6 +1238,16 @@ void I830DRI2FlipEventHandler(unsigned int frame, unsigned int tv_sec,
 	i830_dri2_del_frame_event(drawable, flip_info);
 }
 
+static uint32_t pipe_select(int pipe)
+{
+	if (pipe > 1)
+		return pipe << DRM_VBLANK_HIGH_CRTC_SHIFT;
+	else if (pipe > 0)
+		return DRM_VBLANK_SECONDARY;
+	else
+		return 0;
+}
+
 /*
  * ScheduleSwap is responsible for requesting a DRM vblank event for the
  * appropriate frame.
@@ -1307,9 +1317,7 @@ I830DRI2ScheduleSwap(ClientPtr client, DrawablePtr draw, DRI2BufferPtr front,
 	I830DRI2ReferenceBuffer(back);
 
 	/* Get current count */
-	vbl.request.type = DRM_VBLANK_RELATIVE;
-	if (pipe > 0)
-		vbl.request.type |= DRM_VBLANK_SECONDARY;
+	vbl.request.type = DRM_VBLANK_RELATIVE | pipe_select(pipe);
 	vbl.request.sequence = 0;
 	ret = drmWaitVBlank(intel->drmSubFD, &vbl);
 	if (ret) {
@@ -1345,9 +1353,8 @@ I830DRI2ScheduleSwap(ClientPtr client, DrawablePtr draw, DRI2BufferPtr front,
 		if (flip && I830DRI2ScheduleFlip(intel, draw, swap_info))
 			return TRUE;
 
-		vbl.request.type =  DRM_VBLANK_ABSOLUTE | DRM_VBLANK_EVENT;
-		if (pipe > 0)
-			vbl.request.type |= DRM_VBLANK_SECONDARY;
+		vbl.request.type =
+			DRM_VBLANK_ABSOLUTE | DRM_VBLANK_EVENT | pipe_select(pipe);
 
 		/* If non-pageflipping, but blitting/exchanging, we need to use
 		 * DRM_VBLANK_NEXTONMISS to avoid unreliable timestamping later
@@ -1355,8 +1362,6 @@ I830DRI2ScheduleSwap(ClientPtr client, DrawablePtr draw, DRI2BufferPtr front,
 		 */
 		if (flip == 0)
 			vbl.request.type |= DRM_VBLANK_NEXTONMISS;
-		if (pipe > 0)
-			vbl.request.type |= DRM_VBLANK_SECONDARY;
 
 		/* If target_msc already reached or passed, set it to
 		 * current_msc to ensure we return a reasonable value back
@@ -1386,11 +1391,10 @@ I830DRI2ScheduleSwap(ClientPtr client, DrawablePtr draw, DRI2BufferPtr front,
 	 * and we need to queue an event that will satisfy the divisor/remainder
 	 * equation.
 	 */
-	vbl.request.type = DRM_VBLANK_ABSOLUTE | DRM_VBLANK_EVENT;
+	vbl.request.type =
+		DRM_VBLANK_ABSOLUTE | DRM_VBLANK_EVENT | pipe_select(pipe);
 	if (flip == 0)
 		vbl.request.type |= DRM_VBLANK_NEXTONMISS;
-	if (pipe > 0)
-		vbl.request.type |= DRM_VBLANK_SECONDARY;
 
 	vbl.request.sequence = current_msc - (current_msc % divisor) +
 		remainder;
@@ -1463,9 +1467,7 @@ I830DRI2GetMSC(DrawablePtr draw, CARD64 *ust, CARD64 *msc)
 		return TRUE;
 	}
 
-	vbl.request.type = DRM_VBLANK_RELATIVE;
-	if (pipe > 0)
-		vbl.request.type |= DRM_VBLANK_SECONDARY;
+	vbl.request.type = DRM_VBLANK_RELATIVE | pipe_select(pipe);
 	vbl.request.sequence = 0;
 
 	ret = drmWaitVBlank(intel->drmSubFD, &vbl);
@@ -1531,9 +1533,7 @@ I830DRI2ScheduleWaitMSC(ClientPtr client, DrawablePtr draw, CARD64 target_msc,
 	}
 
 	/* Get current count */
-	vbl.request.type = DRM_VBLANK_RELATIVE;
-	if (pipe > 0)
-		vbl.request.type |= DRM_VBLANK_SECONDARY;
+	vbl.request.type = DRM_VBLANK_RELATIVE | pipe_select(pipe);
 	vbl.request.sequence = 0;
 	ret = drmWaitVBlank(intel->drmSubFD, &vbl);
 	if (ret) {
@@ -1564,9 +1564,8 @@ I830DRI2ScheduleWaitMSC(ClientPtr client, DrawablePtr draw, CARD64 target_msc,
 		 */
 		if (current_msc >= target_msc)
 			target_msc = current_msc;
-		vbl.request.type = DRM_VBLANK_ABSOLUTE | DRM_VBLANK_EVENT;
-		if (pipe > 0)
-			vbl.request.type |= DRM_VBLANK_SECONDARY;
+		vbl.request.type =
+			DRM_VBLANK_ABSOLUTE | DRM_VBLANK_EVENT | pipe_select(pipe);
 		vbl.request.sequence = target_msc;
 		vbl.request.signal = (unsigned long)wait_info;
 		ret = drmWaitVBlank(intel->drmSubFD, &vbl);
@@ -1591,9 +1590,8 @@ I830DRI2ScheduleWaitMSC(ClientPtr client, DrawablePtr draw, CARD64 target_msc,
 	 * If we get here, target_msc has already passed or we don't have one,
 	 * so we queue an event that will satisfy the divisor/remainder equation.
 	 */
-	vbl.request.type = DRM_VBLANK_ABSOLUTE | DRM_VBLANK_EVENT;
-	if (pipe > 0)
-		vbl.request.type |= DRM_VBLANK_SECONDARY;
+	vbl.request.type =
+		DRM_VBLANK_ABSOLUTE | DRM_VBLANK_EVENT | pipe_select(pipe);
 
 	vbl.request.sequence = current_msc - (current_msc % divisor) +
 	    remainder;
diff --git a/src/sna/sna_dri.c b/src/sna/sna_dri.c
index 5390b5a..46a43e9 100644
--- a/src/sna/sna_dri.c
+++ b/src/sna/sna_dri.c
@@ -1032,6 +1032,21 @@ can_flip(struct sna * sna,
 	return TRUE;
 }
 
+inline static uint32_t pipe_select(int pipe)
+{
+	/* The third pipe was introduced with IvyBridge long after
+	 * multiple pipe support was added to the kernel, hence
+	 * we can safely ignore the capability check - if we have more
+	 * than two pipes, we can assume that they are fully supported.
+	 */
+	if (pipe > 1)
+		return pipe << DRM_VBLANK_HIGH_CRTC_SHIFT;
+	else if (pipe > 0)
+		return DRM_VBLANK_SECONDARY;
+	else
+		return 0;
+}
+
 static void sna_dri_vblank_handle(int fd,
 				  unsigned int frame, unsigned int tv_sec,
 				  unsigned int tv_usec,
@@ -1086,9 +1101,8 @@ static void sna_dri_vblank_handle(int fd,
 				VG_CLEAR(vbl);
 				vbl.request.type =
 					DRM_VBLANK_RELATIVE |
-					DRM_VBLANK_EVENT;
-				if (info->pipe > 0)
-					vbl.request.type |= DRM_VBLANK_SECONDARY;
+					DRM_VBLANK_EVENT |
+					pipe_select(info->pipe);
 				vbl.request.sequence = 1;
 				vbl.request.signal = (unsigned long)info;
 				if (!sna_wait_vblank(sna, &vbl))
@@ -1437,9 +1451,7 @@ sna_dri_schedule_flip(ClientPtr client, DrawablePtr draw, DRI2BufferPtr front,
 		sna_dri_reference_buffer(back);
 
 		/* Get current count */
-		vbl.request.type = DRM_VBLANK_RELATIVE;
-		if (pipe > 0)
-			vbl.request.type |= DRM_VBLANK_SECONDARY;
+		vbl.request.type = DRM_VBLANK_RELATIVE | pipe_select(pipe);
 		vbl.request.sequence = 0;
 		if (sna_wait_vblank(sna, &vbl)) {
 			sna_dri_frame_event_info_free(info);
@@ -1452,9 +1464,8 @@ sna_dri_schedule_flip(ClientPtr client, DrawablePtr draw, DRI2BufferPtr front,
 
 		vbl.request.type =
 			DRM_VBLANK_ABSOLUTE |
-			DRM_VBLANK_EVENT;
-		if (pipe > 0)
-			vbl.request.type |= DRM_VBLANK_SECONDARY;
+			DRM_VBLANK_EVENT |
+			pipe_select(pipe);
 
 		/*
 		 * If divisor is zero, or current_msc is smaller than target_msc
@@ -1610,9 +1621,8 @@ sna_dri_schedule_swap(ClientPtr client, DrawablePtr draw, DRI2BufferPtr front,
 			vbl.request.type =
 				DRM_VBLANK_RELATIVE |
 				DRM_VBLANK_NEXTONMISS |
-				DRM_VBLANK_EVENT;
-			if (pipe > 0)
-				vbl.request.type |= DRM_VBLANK_SECONDARY;
+				DRM_VBLANK_EVENT |
+				pipe_select(pipe);
 			vbl.request.sequence = 0;
 			vbl.request.signal = (unsigned long)info;
 			if (sna_wait_vblank(sna, &vbl) == 0)
@@ -1625,9 +1635,7 @@ sna_dri_schedule_swap(ClientPtr client, DrawablePtr draw, DRI2BufferPtr front,
 	}
 
 	/* Get current count */
-	vbl.request.type = DRM_VBLANK_RELATIVE;
-	if (pipe > 0)
-		vbl.request.type |= DRM_VBLANK_SECONDARY;
+	vbl.request.type = DRM_VBLANK_RELATIVE | pipe_select(pipe);
 	vbl.request.sequence = 0;
 	if (sna_wait_vblank(sna, &vbl))
 		goto blit_fallback;
@@ -1651,9 +1659,8 @@ sna_dri_schedule_swap(ClientPtr client, DrawablePtr draw, DRI2BufferPtr front,
 
 		 vbl.request.type =
 			 DRM_VBLANK_ABSOLUTE |
-			 DRM_VBLANK_EVENT;
-		 if (pipe > 0)
-			 vbl.request.type |= DRM_VBLANK_SECONDARY;
+			 DRM_VBLANK_EVENT |
+			 pipe_select(pipe);
 		 vbl.request.sequence = *target_msc;
 		 vbl.request.signal = (unsigned long)info;
 		 if (sna_wait_vblank(sna, &vbl))
@@ -1674,9 +1681,10 @@ sna_dri_schedule_swap(ClientPtr client, DrawablePtr draw, DRI2BufferPtr front,
 		     (int)divisor));
 
 	vbl.request.type =
-		DRM_VBLANK_ABSOLUTE | DRM_VBLANK_EVENT | DRM_VBLANK_NEXTONMISS;
-	if (pipe > 0)
-		vbl.request.type |= DRM_VBLANK_SECONDARY;
+		DRM_VBLANK_ABSOLUTE |
+		DRM_VBLANK_EVENT |
+		DRM_VBLANK_NEXTONMISS |
+		pipe_select(pipe);
 
 	vbl.request.sequence = current_msc - current_msc % divisor + remainder;
 	/*
@@ -1872,9 +1880,7 @@ sna_dri_get_msc(DrawablePtr draw, CARD64 *ust, CARD64 *msc)
 
 	VG_CLEAR(vbl);
 
-	vbl.request.type = DRM_VBLANK_RELATIVE;
-	if (pipe > 0)
-		vbl.request.type |= DRM_VBLANK_SECONDARY;
+	vbl.request.type = DRM_VBLANK_RELATIVE | pipe_select(pipe);
 	vbl.request.sequence = 0;
 
 	if (sna_wait_vblank(sna, &vbl)) {
@@ -1924,9 +1930,7 @@ sna_dri_schedule_wait_msc(ClientPtr client, DrawablePtr draw, CARD64 target_msc,
 	VG_CLEAR(vbl);
 
 	/* Get current count */
-	vbl.request.type = DRM_VBLANK_RELATIVE;
-	if (pipe > 0)
-		vbl.request.type |= DRM_VBLANK_SECONDARY;
+	vbl.request.type = DRM_VBLANK_RELATIVE | pipe_select(pipe);
 	vbl.request.sequence = 0;
 	if (sna_wait_vblank(sna, &vbl))
 		goto out_complete;
@@ -1964,9 +1968,10 @@ sna_dri_schedule_wait_msc(ClientPtr client, DrawablePtr draw, CARD64 target_msc,
 	 * client.
 	 */
 	if (divisor == 0 || current_msc < target_msc) {
-		vbl.request.type = DRM_VBLANK_ABSOLUTE | DRM_VBLANK_EVENT;
-		if (pipe > 0)
-			vbl.request.type |= DRM_VBLANK_SECONDARY;
+		vbl.request.type =
+			DRM_VBLANK_ABSOLUTE |
+			DRM_VBLANK_EVENT |
+			pipe_select(pipe);
 		vbl.request.sequence = target_msc;
 		vbl.request.signal = (unsigned long)info;
 		if (sna_wait_vblank(sna, &vbl))
@@ -1981,9 +1986,8 @@ sna_dri_schedule_wait_msc(ClientPtr client, DrawablePtr draw, CARD64 target_msc,
 	 * If we get here, target_msc has already passed or we don't have one,
 	 * so we queue an event that will satisfy the divisor/remainder equation.
 	 */
-	vbl.request.type = DRM_VBLANK_ABSOLUTE | DRM_VBLANK_EVENT;
-	if (pipe > 0)
-		vbl.request.type |= DRM_VBLANK_SECONDARY;
+	vbl.request.type =
+		DRM_VBLANK_ABSOLUTE | DRM_VBLANK_EVENT | pipe_select(pipe);
 
 	vbl.request.sequence = current_msc - current_msc % divisor + remainder;
 
@@ -2024,7 +2028,7 @@ Bool sna_dri_open(struct sna *sna, ScreenPtr screen)
 
 	DBG(("%s()\n", __FUNCTION__));
 
-	if (sna->kgem.wedged) {
+	if (wedged(sna)) {
 		xf86DrvMsg(sna->scrn->scrnIndex, X_WARNING,
 			   "cannot enable DRI2 whilst the GPU is wedged\n");
 		return FALSE;


More information about the xorg-commit mailing list