xf86-video-intel: 4 commits - src/sna/compiler.h src/sna/gen3_render.c src/sna/gen4_render.c src/sna/gen4_vertex.c src/sna/gen4_vertex.h src/sna/gen5_render.c src/sna/gen6_render.c src/sna/gen7_render.c src/sna/Makefile.am src/sna/sna_accel.c src/sna/sna_cpu.c src/sna/sna_driver.c src/sna/sna.h test/.gitignore

Chris Wilson ickle at kemper.freedesktop.org
Mon Feb 25 16:30:40 PST 2013


 src/sna/Makefile.am   |    1 
 src/sna/compiler.h    |   10 
 src/sna/gen3_render.c |    2 
 src/sna/gen4_render.c |    6 
 src/sna/gen4_vertex.c | 1343 +++++++++++++++++++++++++++++++++++++++++++++-----
 src/sna/gen4_vertex.h |    4 
 src/sna/gen5_render.c |    6 
 src/sna/gen6_render.c |    6 
 src/sna/gen7_render.c |    6 
 src/sna/sna.h         |   14 
 src/sna/sna_accel.c   |    2 
 src/sna/sna_cpu.c     |   76 ++
 src/sna/sna_driver.c  |    2 
 test/.gitignore       |    2 
 14 files changed, 1330 insertions(+), 150 deletions(-)

New commits:
commit 49d9c5e9d48d043a8366b6471128c999b5e887b0
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Feb 26 00:27:12 2013 +0000

    sna: Reverse inverted assertions
    
    Oops, the assertions that we had sufficient free space were inverted.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
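
The intended invariant is that at least floats_per_rect floats remain free
before a rectangle is emitted, i.e. floats_per_rect <= rem <= vertex_space.
A tiny illustration with made-up numbers:

    #include <assert.h>

    int main(void)
    {
        int floats_per_rect = 9;   /* made up: 3 vertices x 3 floats each */
        int rem = 1024;            /* made up: floats still free in the VBO */

        /* "There is room for at least one rectangle" reads this way round: */
        assert(floats_per_rect <= rem);

        /* The inverted form, floats_per_rect >= rem, would only hold once
         * the buffer is nearly full -- exactly backwards. */
        return 0;
    }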

diff --git a/src/sna/gen3_render.c b/src/sna/gen3_render.c
index 9cc1349..04ec889 100644
--- a/src/sna/gen3_render.c
+++ b/src/sna/gen3_render.c
@@ -1997,8 +1997,8 @@ start:
 			goto start;
 	}
 
-	assert(op->floats_per_rect >= vertex_space(sna));
 	assert(rem <= vertex_space(sna));
+	assert(op->floats_per_rect <= rem);
 	if (want > 1 && want * op->floats_per_rect > rem)
 		want = rem / op->floats_per_rect;
 	sna->render.vertex_index += 3*want;
diff --git a/src/sna/gen4_render.c b/src/sna/gen4_render.c
index 9280246..4abcb97 100644
--- a/src/sna/gen4_render.c
+++ b/src/sna/gen4_render.c
@@ -661,8 +661,8 @@ start:
 			goto start;
 	}
 
-	assert(op->floats_per_rect >= vertex_space(sna));
 	assert(rem <= vertex_space(sna));
+	assert(op->floats_per_rect <= rem);
 	if (want > 1 && want * op->floats_per_rect > rem)
 		want = rem / op->floats_per_rect;
 
diff --git a/src/sna/gen5_render.c b/src/sna/gen5_render.c
index 8b9eaac..9a9c223 100644
--- a/src/sna/gen5_render.c
+++ b/src/sna/gen5_render.c
@@ -647,8 +647,8 @@ start:
 			goto start;
 	}
 
-	assert(op->floats_per_rect >= vertex_space(sna));
 	assert(rem <= vertex_space(sna));
+	assert(op->floats_per_rect <= rem);
 	if (want > 1 && want * op->floats_per_rect > rem)
 		want = rem / op->floats_per_rect;
 
diff --git a/src/sna/gen6_render.c b/src/sna/gen6_render.c
index d410514..3195388 100644
--- a/src/sna/gen6_render.c
+++ b/src/sna/gen6_render.c
@@ -1196,8 +1196,8 @@ start:
 			goto start;
 	}
 
-	assert(op->floats_per_rect >= vertex_space(sna));
 	assert(rem <= vertex_space(sna));
+	assert(op->floats_per_rect <= rem);
 	if (want > 1 && want * op->floats_per_rect > rem)
 		want = rem / op->floats_per_rect;
 
diff --git a/src/sna/gen7_render.c b/src/sna/gen7_render.c
index 7984cf1..0dae8c4 100644
--- a/src/sna/gen7_render.c
+++ b/src/sna/gen7_render.c
@@ -1341,8 +1341,8 @@ start:
 			goto start;
 	}
 
-	assert(op->floats_per_rect >= vertex_space(sna));
 	assert(rem <= vertex_space(sna));
+	assert(op->floats_per_rect <= rem);
 	if (want > 1 && want * op->floats_per_rect > rem)
 		want = rem / op->floats_per_rect;
 
commit 94b95cc2fc63457903a7b7b6eaa09bc27f25750c
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Feb 26 00:02:16 2013 +0000

    sna/gen4+: Begin specialising vertex programs for ISA
    
    Allow use of advanced ISA when available by detecting support at
    runtime. This initial work just uses GCC to emit varying ISA; future
    work could use hand-written code for these hot spots.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
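
The mechanism is GCC's per-function "target" attribute combined with a
function pointer chosen at runtime from the detected CPU features. A minimal,
self-contained sketch of that pattern (names are illustrative, not taken from
the driver; assumes a reasonably recent GCC on x86):

    #include <stdio.h>

    /* Per-function ISA overrides, as in compiler.h below. */
    #define sse4_2 __attribute__((target("sse4.2")))
    #define avx2   __attribute__((target("avx2")))

    typedef void (*scale_fn)(float *, int, float);

    /* The same C source compiled three times; the sse4.2/avx2 copies let
     * GCC use wider vector instructions for this one function only. */
    static void scale_floats(float *v, int n, float s)
    {
        int i;
        for (i = 0; i < n; i++)
            v[i] *= s;
    }

    sse4_2 static void scale_floats__sse4_2(float *v, int n, float s)
    {
        int i;
        for (i = 0; i < n; i++)
            v[i] *= s;
    }

    avx2 static void scale_floats__avx2(float *v, int n, float s)
    {
        int i;
        for (i = 0; i < n; i++)
            v[i] *= s;
    }

    /* Pick the best variant once at start-up; the driver does the same
     * thing with its sna->cpu_features bitmask. */
    static scale_fn choose_scaler(void)
    {
        if (__builtin_cpu_supports("avx2"))
            return scale_floats__avx2;
        if (__builtin_cpu_supports("sse4.2"))
            return scale_floats__sse4_2;
        return scale_floats;
    }

    int main(void)
    {
        float v[4] = { 1, 2, 3, 4 };
        scale_fn scale = choose_scaler();

        scale(v, 4, 2.0f);
        printf("%g %g %g %g\n", v[0], v[1], v[2], v[3]);
        return 0;
    }

Older CPUs pay nothing for this: the plain build is still present, and the
dispatch happens once when the emitter is selected.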

diff --git a/src/sna/compiler.h b/src/sna/compiler.h
index 23ec31c..fe2e321 100644
--- a/src/sna/compiler.h
+++ b/src/sna/compiler.h
@@ -52,6 +52,16 @@
 #define flatten
 #endif
 
+#if defined(__GNUC__) && (__GNUC__ >= 4) /* 4.4 */
+#define sse2 __attribute__((target("sse2")))
+#define sse4_2 __attribute__((target("sse4.2,sse2")))
+#define avx2 __attribute__((target("avx2,sse4.2,sse2")))
+#else
+#define sse2
+#define sse4_2
+#define avx2
+#endif
+
 #ifdef HAVE_VALGRIND
 #define VG(x) x
 #else
diff --git a/src/sna/gen4_render.c b/src/sna/gen4_render.c
index d08d762..9280246 100644
--- a/src/sna/gen4_render.c
+++ b/src/sna/gen4_render.c
@@ -1945,7 +1945,7 @@ gen4_render_composite(struct sna *sna,
 					     tmp->mask.bo != NULL,
 					     tmp->has_component_alpha,
 					     tmp->is_affine);
-	tmp->u.gen4.ve_id = gen4_choose_composite_emitter(tmp);
+	tmp->u.gen4.ve_id = gen4_choose_composite_emitter(sna, tmp);
 
 	tmp->blt   = gen4_render_composite_blt;
 	tmp->box   = gen4_render_composite_box;
@@ -2186,7 +2186,7 @@ gen4_render_composite_spans(struct sna *sna,
 	tmp->base.has_component_alpha = false;
 	tmp->base.need_magic_ca_pass = false;
 
-	tmp->base.u.gen4.ve_id = gen4_choose_spans_emitter(tmp);
+	tmp->base.u.gen4.ve_id = gen4_choose_spans_emitter(sna, tmp);
 	tmp->base.u.gen4.wm_kernel = WM_KERNEL_OPACITY | !tmp->base.is_affine;
 
 	tmp->box   = gen4_render_composite_spans_box;
diff --git a/src/sna/gen4_vertex.c b/src/sna/gen4_vertex.c
index 5062ebd..20f85b3 100644
--- a/src/sna/gen4_vertex.c
+++ b/src/sna/gen4_vertex.c
@@ -272,10 +272,10 @@ emit_texcoord(struct sna *sna,
 
 inline static void
 emit_vertex(struct sna *sna,
-		      const struct sna_composite_op *op,
-		      int16_t srcX, int16_t srcY,
-		      int16_t mskX, int16_t mskY,
-		      int16_t dstX, int16_t dstY)
+	    const struct sna_composite_op *op,
+	    int16_t srcX, int16_t srcY,
+	    int16_t mskX, int16_t mskY,
+	    int16_t dstX, int16_t dstY)
 {
 	OUT_VERTEX(dstX, dstY);
 	emit_texcoord(sna, &op->src, srcX, srcY);
@@ -414,6 +414,66 @@ emit_primitive_linear(struct sna *sna,
 	v[5] = compute_linear(&op->src, r->src.x, r->src.y);
 }
 
+sse4_2 fastcall static void
+emit_primitive_linear__sse4_2(struct sna *sna,
+			      const struct sna_composite_op *op,
+			      const struct sna_composite_rectangles *r)
+{
+	float *v;
+	union {
+		struct sna_coordinate p;
+		float f;
+	} dst;
+
+	assert(op->floats_per_rect == 6);
+	assert((sna->render.vertex_used % 2) == 0);
+	v = sna->render.vertices + sna->render.vertex_used;
+	sna->render.vertex_used += 6;
+	assert(sna->render.vertex_used <= sna->render.vertex_size);
+
+	dst.p.x = r->dst.x + r->width;
+	dst.p.y = r->dst.y + r->height;
+	v[0] = dst.f;
+	dst.p.x = r->dst.x;
+	v[2] = dst.f;
+	dst.p.y = r->dst.y;
+	v[4] = dst.f;
+
+	v[1] = compute_linear(&op->src, r->src.x+r->width, r->src.y+r->height);
+	v[3] = compute_linear(&op->src, r->src.x, r->src.y+r->height);
+	v[5] = compute_linear(&op->src, r->src.x, r->src.y);
+}
+
+avx2 fastcall static void
+emit_primitive_linear__avx2(struct sna *sna,
+			    const struct sna_composite_op *op,
+			    const struct sna_composite_rectangles *r)
+{
+	float *v;
+	union {
+		struct sna_coordinate p;
+		float f;
+	} dst;
+
+	assert(op->floats_per_rect == 6);
+	assert((sna->render.vertex_used % 2) == 0);
+	v = sna->render.vertices + sna->render.vertex_used;
+	sna->render.vertex_used += 6;
+	assert(sna->render.vertex_used <= sna->render.vertex_size);
+
+	dst.p.x = r->dst.x + r->width;
+	dst.p.y = r->dst.y + r->height;
+	v[0] = dst.f;
+	dst.p.x = r->dst.x;
+	v[2] = dst.f;
+	dst.p.y = r->dst.y;
+	v[4] = dst.f;
+
+	v[1] = compute_linear(&op->src, r->src.x+r->width, r->src.y+r->height);
+	v[3] = compute_linear(&op->src, r->src.x, r->src.y+r->height);
+	v[5] = compute_linear(&op->src, r->src.x, r->src.y);
+}
+
 fastcall static void
 emit_boxes_linear(const struct sna_composite_op *op,
 		  const BoxRec *box, int nbox,
@@ -442,6 +502,62 @@ emit_boxes_linear(const struct sna_composite_op *op,
 	} while (--nbox);
 }
 
+sse4_2 fastcall static void
+emit_boxes_linear__sse4_2(const struct sna_composite_op *op,
+			  const BoxRec *box, int nbox,
+			  float *v)
+{
+	union {
+		struct sna_coordinate p;
+		float f;
+	} dst;
+
+	do {
+		dst.p.x = box->x2;
+		dst.p.y = box->y2;
+		v[0] = dst.f;
+		dst.p.x = box->x1;
+		v[2] = dst.f;
+		dst.p.y = box->y1;
+		v[4] = dst.f;
+
+		v[1] = compute_linear(&op->src, box->x2, box->y2);
+		v[3] = compute_linear(&op->src, box->x1, box->y2);
+		v[5] = compute_linear(&op->src, box->x1, box->y1);
+
+		v += 6;
+		box++;
+	} while (--nbox);
+}
+
+avx2 fastcall static void
+emit_boxes_linear__avx2(const struct sna_composite_op *op,
+			const BoxRec *box, int nbox,
+			float *v)
+{
+	union {
+		struct sna_coordinate p;
+		float f;
+	} dst;
+
+	do {
+		dst.p.x = box->x2;
+		dst.p.y = box->y2;
+		v[0] = dst.f;
+		dst.p.x = box->x1;
+		v[2] = dst.f;
+		dst.p.y = box->y1;
+		v[4] = dst.f;
+
+		v[1] = compute_linear(&op->src, box->x2, box->y2);
+		v[3] = compute_linear(&op->src, box->x1, box->y2);
+		v[5] = compute_linear(&op->src, box->x1, box->y1);
+
+		v += 6;
+		box++;
+	} while (--nbox);
+}
+
 fastcall static void
 emit_primitive_identity_source(struct sna *sna,
 			       const struct sna_composite_op *op,
@@ -473,6 +589,68 @@ emit_primitive_identity_source(struct sna *sna,
 	v[5] = v[2] = v[8] + r->height * op->src.scale[1];
 }
 
+sse4_2 fastcall static void
+emit_primitive_identity_source__sse4_2(struct sna *sna,
+				       const struct sna_composite_op *op,
+				       const struct sna_composite_rectangles *r)
+{
+	union {
+		struct sna_coordinate p;
+		float f;
+	} dst;
+	float *v;
+
+	assert(op->floats_per_rect == 9);
+	assert((sna->render.vertex_used % 3) == 0);
+	v = sna->render.vertices + sna->render.vertex_used;
+	sna->render.vertex_used += 9;
+
+	dst.p.x = r->dst.x + r->width;
+	dst.p.y = r->dst.y + r->height;
+	v[0] = dst.f;
+	dst.p.x = r->dst.x;
+	v[3] = dst.f;
+	dst.p.y = r->dst.y;
+	v[6] = dst.f;
+
+	v[7] = v[4] = (r->src.x + op->src.offset[0]) * op->src.scale[0];
+	v[1] = v[4] + r->width * op->src.scale[0];
+
+	v[8] = (r->src.y + op->src.offset[1]) * op->src.scale[1];
+	v[5] = v[2] = v[8] + r->height * op->src.scale[1];
+}
+
+avx2 fastcall static void
+emit_primitive_identity_source__avx2(struct sna *sna,
+				     const struct sna_composite_op *op,
+				     const struct sna_composite_rectangles *r)
+{
+	union {
+		struct sna_coordinate p;
+		float f;
+	} dst;
+	float *v;
+
+	assert(op->floats_per_rect == 9);
+	assert((sna->render.vertex_used % 3) == 0);
+	v = sna->render.vertices + sna->render.vertex_used;
+	sna->render.vertex_used += 9;
+
+	dst.p.x = r->dst.x + r->width;
+	dst.p.y = r->dst.y + r->height;
+	v[0] = dst.f;
+	dst.p.x = r->dst.x;
+	v[3] = dst.f;
+	dst.p.y = r->dst.y;
+	v[6] = dst.f;
+
+	v[7] = v[4] = (r->src.x + op->src.offset[0]) * op->src.scale[0];
+	v[1] = v[4] + r->width * op->src.scale[0];
+
+	v[8] = (r->src.y + op->src.offset[1]) * op->src.scale[1];
+	v[5] = v[2] = v[8] + r->height * op->src.scale[1];
+}
+
 fastcall static void
 emit_boxes_identity_source(const struct sna_composite_op *op,
 			   const BoxRec *box, int nbox,
@@ -503,6 +681,66 @@ emit_boxes_identity_source(const struct sna_composite_op *op,
 	} while (--nbox);
 }
 
+sse4_2 fastcall static void
+emit_boxes_identity_source__sse4_2(const struct sna_composite_op *op,
+				   const BoxRec *box, int nbox,
+				   float *v)
+{
+	do {
+		union {
+			struct sna_coordinate p;
+			float f;
+		} dst;
+
+		dst.p.x = box->x2;
+		dst.p.y = box->y2;
+		v[0] = dst.f;
+		dst.p.x = box->x1;
+		v[3] = dst.f;
+		dst.p.y = box->y1;
+		v[6] = dst.f;
+
+		v[7] = v[4] = (box->x1 + op->src.offset[0]) * op->src.scale[0];
+		v[1] = (box->x2 + op->src.offset[0]) * op->src.scale[0];
+
+		v[8] = (box->y1 + op->src.offset[1]) * op->src.scale[1];
+		v[2] = v[5] = (box->y2 + op->src.offset[1]) * op->src.scale[1];
+
+		v += 9;
+		box++;
+	} while (--nbox);
+}
+
+avx2 fastcall static void
+emit_boxes_identity_source__avx2(const struct sna_composite_op *op,
+				 const BoxRec *box, int nbox,
+				 float *v)
+{
+	do {
+		union {
+			struct sna_coordinate p;
+			float f;
+		} dst;
+
+		dst.p.x = box->x2;
+		dst.p.y = box->y2;
+		v[0] = dst.f;
+		dst.p.x = box->x1;
+		v[3] = dst.f;
+		dst.p.y = box->y1;
+		v[6] = dst.f;
+
+		v[7] = v[4] = (box->x1 + op->src.offset[0]) * op->src.scale[0];
+		v[1] = (box->x2 + op->src.offset[0]) * op->src.scale[0];
+
+		v[8] = (box->y1 + op->src.offset[1]) * op->src.scale[1];
+		v[2] = v[5] = (box->y2 + op->src.offset[1]) * op->src.scale[1];
+
+		v += 9;
+		box++;
+	} while (--nbox);
+}
+
 fastcall static void
 emit_primitive_simple_source(struct sna *sna,
 			     const struct sna_composite_op *op,
@@ -543,11 +781,17 @@ emit_primitive_simple_source(struct sna *sna,
 	v[8] = ((r->src.y + ty) * yy + y0) * sy;
 }
 
-fastcall static void
-emit_boxes_simple_source(const struct sna_composite_op *op,
-			 const BoxRec *box, int nbox,
-			 float *v)
+sse4_2 fastcall static void
+emit_primitive_simple_source__sse4_2(struct sna *sna,
+				     const struct sna_composite_op *op,
+				     const struct sna_composite_rectangles *r)
 {
+	float *v;
+	union {
+		struct sna_coordinate p;
+		float f;
+	} dst;
+
 	float xx = op->src.transform->matrix[0][0];
 	float x0 = op->src.transform->matrix[0][2];
 	float yy = op->src.transform->matrix[1][1];
@@ -557,51 +801,203 @@ emit_boxes_simple_source(const struct sna_composite_op *op,
 	int16_t tx = op->src.offset[0];
 	int16_t ty = op->src.offset[1];
 
-	do {
-		union {
-			struct sna_coordinate p;
-			float f;
-		} dst;
-
-		dst.p.x = box->x2;
-		dst.p.y = box->y2;
-		v[0] = dst.f;
-		v[1] = ((box->x2 + tx) * xx + x0) * sx;
-		v[5] = v[2] = ((box->y2 + ty) * yy + y0) * sy;
-
-		dst.p.x = box->x1;
-		v[3] = dst.f;
-		v[7] = v[4] = ((box->x1 + tx) * xx + x0) * sx;
-
-		dst.p.y = box->y1;
-		v[6] = dst.f;
-		v[8] = ((box->y1 + ty) * yy + y0) * sy;
-
-		v += 9;
-		box++;
-	} while (--nbox);
-}
-
-fastcall static void
-emit_primitive_affine_source(struct sna *sna,
-			     const struct sna_composite_op *op,
-			     const struct sna_composite_rectangles *r)
-{
-	union {
-		struct sna_coordinate p;
-		float f;
-	} dst;
-	float *v;
-
 	assert(op->floats_per_rect == 9);
 	assert((sna->render.vertex_used % 3) == 0);
 	v = sna->render.vertices + sna->render.vertex_used;
-	sna->render.vertex_used += 9;
+	sna->render.vertex_used += 3*3;
 
 	dst.p.x = r->dst.x + r->width;
 	dst.p.y = r->dst.y + r->height;
 	v[0] = dst.f;
-	_sna_get_transformed_scaled(op->src.offset[0] + r->src.x + r->width,
+	v[1] = ((r->src.x + r->width + tx) * xx + x0) * sx;
+	v[5] = v[2] = ((r->src.y + r->height + ty) * yy + y0) * sy;
+
+	dst.p.x = r->dst.x;
+	v[3] = dst.f;
+	v[7] = v[4] = ((r->src.x + tx) * xx + x0) * sx;
+
+	dst.p.y = r->dst.y;
+	v[6] = dst.f;
+	v[8] = ((r->src.y + ty) * yy + y0) * sy;
+}
+
+avx2 fastcall static void
+emit_primitive_simple_source__avx2(struct sna *sna,
+				   const struct sna_composite_op *op,
+				   const struct sna_composite_rectangles *r)
+{
+	float *v;
+	union {
+		struct sna_coordinate p;
+		float f;
+	} dst;
+
+	float xx = op->src.transform->matrix[0][0];
+	float x0 = op->src.transform->matrix[0][2];
+	float yy = op->src.transform->matrix[1][1];
+	float y0 = op->src.transform->matrix[1][2];
+	float sx = op->src.scale[0];
+	float sy = op->src.scale[1];
+	int16_t tx = op->src.offset[0];
+	int16_t ty = op->src.offset[1];
+
+	assert(op->floats_per_rect == 9);
+	assert((sna->render.vertex_used % 3) == 0);
+	v = sna->render.vertices + sna->render.vertex_used;
+	sna->render.vertex_used += 3*3;
+
+	dst.p.x = r->dst.x + r->width;
+	dst.p.y = r->dst.y + r->height;
+	v[0] = dst.f;
+	v[1] = ((r->src.x + r->width + tx) * xx + x0) * sx;
+	v[5] = v[2] = ((r->src.y + r->height + ty) * yy + y0) * sy;
+
+	dst.p.x = r->dst.x;
+	v[3] = dst.f;
+	v[7] = v[4] = ((r->src.x + tx) * xx + x0) * sx;
+
+	dst.p.y = r->dst.y;
+	v[6] = dst.f;
+	v[8] = ((r->src.y + ty) * yy + y0) * sy;
+}
+
+fastcall static void
+emit_boxes_simple_source(const struct sna_composite_op *op,
+			 const BoxRec *box, int nbox,
+			 float *v)
+{
+	float xx = op->src.transform->matrix[0][0];
+	float x0 = op->src.transform->matrix[0][2];
+	float yy = op->src.transform->matrix[1][1];
+	float y0 = op->src.transform->matrix[1][2];
+	float sx = op->src.scale[0];
+	float sy = op->src.scale[1];
+	int16_t tx = op->src.offset[0];
+	int16_t ty = op->src.offset[1];
+
+	do {
+		union {
+			struct sna_coordinate p;
+			float f;
+		} dst;
+
+		dst.p.x = box->x2;
+		dst.p.y = box->y2;
+		v[0] = dst.f;
+		v[1] = ((box->x2 + tx) * xx + x0) * sx;
+		v[5] = v[2] = ((box->y2 + ty) * yy + y0) * sy;
+
+		dst.p.x = box->x1;
+		v[3] = dst.f;
+		v[7] = v[4] = ((box->x1 + tx) * xx + x0) * sx;
+
+		dst.p.y = box->y1;
+		v[6] = dst.f;
+		v[8] = ((box->y1 + ty) * yy + y0) * sy;
+
+		v += 9;
+		box++;
+	} while (--nbox);
+}
+
+sse4_2 fastcall static void
+emit_boxes_simple_source__sse4_2(const struct sna_composite_op *op,
+				 const BoxRec *box, int nbox,
+				 float *v)
+{
+	float xx = op->src.transform->matrix[0][0];
+	float x0 = op->src.transform->matrix[0][2];
+	float yy = op->src.transform->matrix[1][1];
+	float y0 = op->src.transform->matrix[1][2];
+	float sx = op->src.scale[0];
+	float sy = op->src.scale[1];
+	int16_t tx = op->src.offset[0];
+	int16_t ty = op->src.offset[1];
+
+	do {
+		union {
+			struct sna_coordinate p;
+			float f;
+		} dst;
+
+		dst.p.x = box->x2;
+		dst.p.y = box->y2;
+		v[0] = dst.f;
+		v[1] = ((box->x2 + tx) * xx + x0) * sx;
+		v[5] = v[2] = ((box->y2 + ty) * yy + y0) * sy;
+
+		dst.p.x = box->x1;
+		v[3] = dst.f;
+		v[7] = v[4] = ((box->x1 + tx) * xx + x0) * sx;
+
+		dst.p.y = box->y1;
+		v[6] = dst.f;
+		v[8] = ((box->y1 + ty) * yy + y0) * sy;
+
+		v += 9;
+		box++;
+	} while (--nbox);
+}
+
+avx2 fastcall static void
+emit_boxes_simple_source__avx2(const struct sna_composite_op *op,
+			       const BoxRec *box, int nbox,
+			       float *v)
+{
+	float xx = op->src.transform->matrix[0][0];
+	float x0 = op->src.transform->matrix[0][2];
+	float yy = op->src.transform->matrix[1][1];
+	float y0 = op->src.transform->matrix[1][2];
+	float sx = op->src.scale[0];
+	float sy = op->src.scale[1];
+	int16_t tx = op->src.offset[0];
+	int16_t ty = op->src.offset[1];
+
+	do {
+		union {
+			struct sna_coordinate p;
+			float f;
+		} dst;
+
+		dst.p.x = box->x2;
+		dst.p.y = box->y2;
+		v[0] = dst.f;
+		v[1] = ((box->x2 + tx) * xx + x0) * sx;
+		v[5] = v[2] = ((box->y2 + ty) * yy + y0) * sy;
+
+		dst.p.x = box->x1;
+		v[3] = dst.f;
+		v[7] = v[4] = ((box->x1 + tx) * xx + x0) * sx;
+
+		dst.p.y = box->y1;
+		v[6] = dst.f;
+		v[8] = ((box->y1 + ty) * yy + y0) * sy;
+
+		v += 9;
+		box++;
+	} while (--nbox);
+}
+
+fastcall static void
+emit_primitive_affine_source(struct sna *sna,
+			     const struct sna_composite_op *op,
+			     const struct sna_composite_rectangles *r)
+{
+	union {
+		struct sna_coordinate p;
+		float f;
+	} dst;
+	float *v;
+
+	assert(op->floats_per_rect == 9);
+	assert((sna->render.vertex_used % 3) == 0);
+	v = sna->render.vertices + sna->render.vertex_used;
+	sna->render.vertex_used += 9;
+
+	dst.p.x = r->dst.x + r->width;
+	dst.p.y = r->dst.y + r->height;
+	v[0] = dst.f;
+	_sna_get_transformed_scaled(op->src.offset[0] + r->src.x + r->width,
 				    op->src.offset[1] + r->src.y + r->height,
 				    op->src.transform, op->src.scale,
 				    &v[1], &v[2]);
@@ -981,7 +1377,7 @@ emit_composite_texcoord_affine(struct sna *sna,
 }
 
 
-unsigned gen4_choose_composite_emitter(struct sna_composite_op *tmp)
+unsigned gen4_choose_composite_emitter(struct sna *sna, struct sna_composite_op *tmp)
 {
 	unsigned vb;
 
@@ -1060,14 +1456,30 @@ unsigned gen4_choose_composite_emitter(struct sna_composite_op *tmp)
 			vb = 1;
 		} else if (tmp->src.is_linear) {
 			DBG(("%s: linear, no mask\n", __FUNCTION__));
-			tmp->prim_emit = emit_primitive_linear;
-			tmp->emit_boxes = emit_boxes_linear;
+			if (sna->cpu_features & AVX2) {
+				tmp->prim_emit = emit_primitive_linear__avx2;
+				tmp->emit_boxes = emit_boxes_linear__avx2;
+			} else  if (sna->cpu_features & SSE4_2) {
+				tmp->prim_emit = emit_primitive_linear__sse4_2;
+				tmp->emit_boxes = emit_boxes_linear__sse4_2;
+			} else {
+				tmp->prim_emit = emit_primitive_linear;
+				tmp->emit_boxes = emit_boxes_linear;
+			}
 			tmp->floats_per_vertex = 2;
 			vb = 1;
 		} else if (tmp->src.transform == NULL) {
 			DBG(("%s: identity src, no mask\n", __FUNCTION__));
-			tmp->prim_emit = emit_primitive_identity_source;
-			tmp->emit_boxes = emit_boxes_identity_source;
+			if (sna->cpu_features & AVX2) {
+				tmp->prim_emit = emit_primitive_identity_source__avx2;
+				tmp->emit_boxes = emit_boxes_identity_source__avx2;
+			} else if (sna->cpu_features & SSE4_2) {
+				tmp->prim_emit = emit_primitive_identity_source__sse4_2;
+				tmp->emit_boxes = emit_boxes_identity_source__sse4_2;
+			} else {
+				tmp->prim_emit = emit_primitive_identity_source;
+				tmp->emit_boxes = emit_boxes_identity_source;
+			}
 			tmp->floats_per_vertex = 3;
 			vb = 2;
 		} else if (tmp->src.is_affine) {
@@ -1075,8 +1487,16 @@ unsigned gen4_choose_composite_emitter(struct sna_composite_op *tmp)
 			tmp->src.scale[1] /= tmp->src.transform->matrix[2][2];
 			if (!sna_affine_transform_is_rotation(tmp->src.transform)) {
 				DBG(("%s: simple src, no mask\n", __FUNCTION__));
-				tmp->prim_emit = emit_primitive_simple_source;
-				tmp->emit_boxes = emit_boxes_simple_source;
+				if (sna->cpu_features & AVX2) {
+					tmp->prim_emit = emit_primitive_simple_source__avx2;
+					tmp->emit_boxes = emit_boxes_simple_source__avx2;
+				} else if (sna->cpu_features & SSE4_2) {
+					tmp->prim_emit = emit_primitive_simple_source__sse4_2;
+					tmp->emit_boxes = emit_boxes_simple_source__sse4_2;
+				} else {
+					tmp->prim_emit = emit_primitive_simple_source;
+					tmp->emit_boxes = emit_boxes_simple_source;
+				}
 			} else {
 				DBG(("%s: affine src, no mask\n", __FUNCTION__));
 				tmp->prim_emit = emit_primitive_affine_source;
@@ -1222,48 +1642,51 @@ emit_span_identity(struct sna *sna,
 	v[11] = v[7] = v[3] = opacity;
 }
 
-fastcall static void
-emit_span_boxes_identity(const struct sna_composite_spans_op *op,
-			 const struct sna_opacity_box *b, int nbox,
-			 float *v)
+sse4_2 fastcall static void
+emit_span_identity__sse4_2(struct sna *sna,
+			   const struct sna_composite_spans_op *op,
+			   const BoxRec *box,
+			   float opacity)
 {
-	do {
-		union {
-			struct sna_coordinate p;
-			float f;
-		} dst;
+	float *v;
+	union {
+		struct sna_coordinate p;
+		float f;
+	} dst;
 
-		float sx = op->base.src.scale[0];
-		float sy = op->base.src.scale[1];
-		int16_t tx = op->base.src.offset[0];
-		int16_t ty = op->base.src.offset[1];
+	float sx = op->base.src.scale[0];
+	float sy = op->base.src.scale[1];
+	int16_t tx = op->base.src.offset[0];
+	int16_t ty = op->base.src.offset[1];
 
-		dst.p.x = b->box.x2;
-		dst.p.y = b->box.y2;
-		v[0] = dst.f;
-		v[1] = (b->box.x2 + tx) * sx;
-		v[6] = v[2] = (b->box.y2 + ty) * sy;
+	assert(op->base.floats_per_rect == 12);
+	assert((sna->render.vertex_used % 4) == 0);
+	v = sna->render.vertices + sna->render.vertex_used;
+	sna->render.vertex_used += 3*4;
+	assert(sna->render.vertex_used <= sna->render.vertex_size);
 
-		dst.p.x = b->box.x1;
-		v[4] = dst.f;
-		v[9] = v[5] = (b->box.x1 + tx) * sx;
+	dst.p.x = box->x2;
+	dst.p.y = box->y2;
+	v[0] = dst.f;
+	v[1] = (box->x2 + tx) * sx;
+	v[6] = v[2] = (box->y2 + ty) * sy;
 
-		dst.p.y = b->box.y1;
-		v[8] = dst.f;
-		v[10] = (b->box.y1 + ty) * sy;
+	dst.p.x = box->x1;
+	v[4] = dst.f;
+	v[9] = v[5] = (box->x1 + tx) * sx;
 
-		v[11] = v[7] = v[3] = b->alpha;
+	dst.p.y = box->y1;
+	v[8] = dst.f;
+	v[10] = (box->y1 + ty) * sy;
 
-		v += 12;
-		b++;
-	} while (--nbox);
+	v[11] = v[7] = v[3] = opacity;
 }
 
-fastcall static void
-emit_span_simple(struct sna *sna,
-		  const struct sna_composite_spans_op *op,
-		  const BoxRec *box,
-		  float opacity)
+avx2 fastcall static void
+emit_span_identity__avx2(struct sna *sna,
+			 const struct sna_composite_spans_op *op,
+			 const BoxRec *box,
+			 float opacity)
 {
 	float *v;
 	union {
@@ -1271,10 +1694,6 @@ emit_span_simple(struct sna *sna,
 		float f;
 	} dst;
 
-	float xx = op->base.src.transform->matrix[0][0];
-	float x0 = op->base.src.transform->matrix[0][2];
-	float yy = op->base.src.transform->matrix[1][1];
-	float y0 = op->base.src.transform->matrix[1][2];
 	float sx = op->base.src.scale[0];
 	float sy = op->base.src.scale[1];
 	int16_t tx = op->base.src.offset[0];
@@ -1289,23 +1708,266 @@ emit_span_simple(struct sna *sna,
 	dst.p.x = box->x2;
 	dst.p.y = box->y2;
 	v[0] = dst.f;
-	v[1] = ((box->x2 + tx) * xx + x0) * sx;
-	v[6] = v[2] = ((box->y2 + ty) * yy + y0) * sy;
+	v[1] = (box->x2 + tx) * sx;
+	v[6] = v[2] = (box->y2 + ty) * sy;
 
 	dst.p.x = box->x1;
 	v[4] = dst.f;
-	v[9] = v[5] = ((box->x1 + tx) * xx + x0) * sx;
+	v[9] = v[5] = (box->x1 + tx) * sx;
 
 	dst.p.y = box->y1;
 	v[8] = dst.f;
-	v[10] = ((box->y1 + ty) * yy + y0) * sy;
+	v[10] = (box->y1 + ty) * sy;
 
 	v[11] = v[7] = v[3] = opacity;
 }
 
 fastcall static void
-emit_span_boxes_simple(const struct sna_composite_spans_op *op,
-		       const struct sna_opacity_box *b, int nbox,
+emit_span_boxes_identity(const struct sna_composite_spans_op *op,
+			 const struct sna_opacity_box *b, int nbox,
+			 float *v)
+{
+	do {
+		union {
+			struct sna_coordinate p;
+			float f;
+		} dst;
+
+		float sx = op->base.src.scale[0];
+		float sy = op->base.src.scale[1];
+		int16_t tx = op->base.src.offset[0];
+		int16_t ty = op->base.src.offset[1];
+
+		dst.p.x = b->box.x2;
+		dst.p.y = b->box.y2;
+		v[0] = dst.f;
+		v[1] = (b->box.x2 + tx) * sx;
+		v[6] = v[2] = (b->box.y2 + ty) * sy;
+
+		dst.p.x = b->box.x1;
+		v[4] = dst.f;
+		v[9] = v[5] = (b->box.x1 + tx) * sx;
+
+		dst.p.y = b->box.y1;
+		v[8] = dst.f;
+		v[10] = (b->box.y1 + ty) * sy;
+
+		v[11] = v[7] = v[3] = b->alpha;
+
+		v += 12;
+		b++;
+	} while (--nbox);
+}
+
+sse4_2 fastcall static void
+emit_span_boxes_identity__sse4_2(const struct sna_composite_spans_op *op,
+				 const struct sna_opacity_box *b, int nbox,
+				 float *v)
+{
+	do {
+		union {
+			struct sna_coordinate p;
+			float f;
+		} dst;
+
+		float sx = op->base.src.scale[0];
+		float sy = op->base.src.scale[1];
+		int16_t tx = op->base.src.offset[0];
+		int16_t ty = op->base.src.offset[1];
+
+		dst.p.x = b->box.x2;
+		dst.p.y = b->box.y2;
+		v[0] = dst.f;
+		v[1] = (b->box.x2 + tx) * sx;
+		v[6] = v[2] = (b->box.y2 + ty) * sy;
+
+		dst.p.x = b->box.x1;
+		v[4] = dst.f;
+		v[9] = v[5] = (b->box.x1 + tx) * sx;
+
+		dst.p.y = b->box.y1;
+		v[8] = dst.f;
+		v[10] = (b->box.y1 + ty) * sy;
+
+		v[11] = v[7] = v[3] = b->alpha;
+
+		v += 12;
+		b++;
+	} while (--nbox);
+}
+
+avx2 fastcall static void
+emit_span_boxes_identity__avx2(const struct sna_composite_spans_op *op,
+			       const struct sna_opacity_box *b, int nbox,
+			       float *v)
+{
+	do {
+		union {
+			struct sna_coordinate p;
+			float f;
+		} dst;
+
+		float sx = op->base.src.scale[0];
+		float sy = op->base.src.scale[1];
+		int16_t tx = op->base.src.offset[0];
+		int16_t ty = op->base.src.offset[1];
+
+		dst.p.x = b->box.x2;
+		dst.p.y = b->box.y2;
+		v[0] = dst.f;
+		v[1] = (b->box.x2 + tx) * sx;
+		v[6] = v[2] = (b->box.y2 + ty) * sy;
+
+		dst.p.x = b->box.x1;
+		v[4] = dst.f;
+		v[9] = v[5] = (b->box.x1 + tx) * sx;
+
+		dst.p.y = b->box.y1;
+		v[8] = dst.f;
+		v[10] = (b->box.y1 + ty) * sy;
+
+		v[11] = v[7] = v[3] = b->alpha;
+
+		v += 12;
+		b++;
+	} while (--nbox);
+}
+
+fastcall static void
+emit_span_simple(struct sna *sna,
+		 const struct sna_composite_spans_op *op,
+		 const BoxRec *box,
+		 float opacity)
+{
+	float *v;
+	union {
+		struct sna_coordinate p;
+		float f;
+	} dst;
+
+	float xx = op->base.src.transform->matrix[0][0];
+	float x0 = op->base.src.transform->matrix[0][2];
+	float yy = op->base.src.transform->matrix[1][1];
+	float y0 = op->base.src.transform->matrix[1][2];
+	float sx = op->base.src.scale[0];
+	float sy = op->base.src.scale[1];
+	int16_t tx = op->base.src.offset[0];
+	int16_t ty = op->base.src.offset[1];
+
+	assert(op->base.floats_per_rect == 12);
+	assert((sna->render.vertex_used % 4) == 0);
+	v = sna->render.vertices + sna->render.vertex_used;
+	sna->render.vertex_used += 3*4;
+	assert(sna->render.vertex_used <= sna->render.vertex_size);
+
+	dst.p.x = box->x2;
+	dst.p.y = box->y2;
+	v[0] = dst.f;
+	v[1] = ((box->x2 + tx) * xx + x0) * sx;
+	v[6] = v[2] = ((box->y2 + ty) * yy + y0) * sy;
+
+	dst.p.x = box->x1;
+	v[4] = dst.f;
+	v[9] = v[5] = ((box->x1 + tx) * xx + x0) * sx;
+
+	dst.p.y = box->y1;
+	v[8] = dst.f;
+	v[10] = ((box->y1 + ty) * yy + y0) * sy;
+
+	v[11] = v[7] = v[3] = opacity;
+}
+
+sse4_2 fastcall static void
+emit_span_simple__sse4_2(struct sna *sna,
+			 const struct sna_composite_spans_op *op,
+			 const BoxRec *box,
+			 float opacity)
+{
+	float *v;
+	union {
+		struct sna_coordinate p;
+		float f;
+	} dst;
+
+	float xx = op->base.src.transform->matrix[0][0];
+	float x0 = op->base.src.transform->matrix[0][2];
+	float yy = op->base.src.transform->matrix[1][1];
+	float y0 = op->base.src.transform->matrix[1][2];
+	float sx = op->base.src.scale[0];
+	float sy = op->base.src.scale[1];
+	int16_t tx = op->base.src.offset[0];
+	int16_t ty = op->base.src.offset[1];
+
+	assert(op->base.floats_per_rect == 12);
+	assert((sna->render.vertex_used % 4) == 0);
+	v = sna->render.vertices + sna->render.vertex_used;
+	sna->render.vertex_used += 3*4;
+	assert(sna->render.vertex_used <= sna->render.vertex_size);
+
+	dst.p.x = box->x2;
+	dst.p.y = box->y2;
+	v[0] = dst.f;
+	v[1] = ((box->x2 + tx) * xx + x0) * sx;
+	v[6] = v[2] = ((box->y2 + ty) * yy + y0) * sy;
+
+	dst.p.x = box->x1;
+	v[4] = dst.f;
+	v[9] = v[5] = ((box->x1 + tx) * xx + x0) * sx;
+
+	dst.p.y = box->y1;
+	v[8] = dst.f;
+	v[10] = ((box->y1 + ty) * yy + y0) * sy;
+
+	v[11] = v[7] = v[3] = opacity;
+}
+
+avx2 fastcall static void
+emit_span_simple__avx2(struct sna *sna,
+		       const struct sna_composite_spans_op *op,
+		       const BoxRec *box,
+		       float opacity)
+{
+	float *v;
+	union {
+		struct sna_coordinate p;
+		float f;
+	} dst;
+
+	float xx = op->base.src.transform->matrix[0][0];
+	float x0 = op->base.src.transform->matrix[0][2];
+	float yy = op->base.src.transform->matrix[1][1];
+	float y0 = op->base.src.transform->matrix[1][2];
+	float sx = op->base.src.scale[0];
+	float sy = op->base.src.scale[1];
+	int16_t tx = op->base.src.offset[0];
+	int16_t ty = op->base.src.offset[1];
+
+	assert(op->base.floats_per_rect == 12);
+	assert((sna->render.vertex_used % 4) == 0);
+	v = sna->render.vertices + sna->render.vertex_used;
+	sna->render.vertex_used += 3*4;
+	assert(sna->render.vertex_used <= sna->render.vertex_size);
+
+	dst.p.x = box->x2;
+	dst.p.y = box->y2;
+	v[0] = dst.f;
+	v[1] = ((box->x2 + tx) * xx + x0) * sx;
+	v[6] = v[2] = ((box->y2 + ty) * yy + y0) * sy;
+
+	dst.p.x = box->x1;
+	v[4] = dst.f;
+	v[9] = v[5] = ((box->x1 + tx) * xx + x0) * sx;
+
+	dst.p.y = box->y1;
+	v[8] = dst.f;
+	v[10] = ((box->y1 + ty) * yy + y0) * sy;
+
+	v[11] = v[7] = v[3] = opacity;
+}
+
+fastcall static void
+emit_span_boxes_simple(const struct sna_composite_spans_op *op,
+		       const struct sna_opacity_box *b, int nbox,
 		       float *v)
 {
 	float xx = op->base.src.transform->matrix[0][0];
@@ -1317,38 +1979,210 @@ emit_span_boxes_simple(const struct sna_composite_spans_op *op,
 	int16_t tx = op->base.src.offset[0];
 	int16_t ty = op->base.src.offset[1];
 
-	do {
-		union {
-			struct sna_coordinate p;
-			float f;
-		} dst;
+	do {
+		union {
+			struct sna_coordinate p;
+			float f;
+		} dst;
+
+		dst.p.x = b->box.x2;
+		dst.p.y = b->box.y2;
+		v[0] = dst.f;
+		v[1] = ((b->box.x2 + tx) * xx + x0) * sx;
+		v[6] = v[2] = ((b->box.y2 + ty) * yy + y0) * sy;
+
+		dst.p.x = b->box.x1;
+		v[4] = dst.f;
+		v[9] = v[5] = ((b->box.x1 + tx) * xx + x0) * sx;
+
+		dst.p.y = b->box.y1;
+		v[8] = dst.f;
+		v[10] = ((b->box.y1 + ty) * yy + y0) * sy;
+
+		v[11] = v[7] = v[3] = b->alpha;
+
+		v += 12;
+		b++;
+	} while (--nbox);
+}
+
+sse4_2 fastcall static void
+emit_span_boxes_simple__sse4_2(const struct sna_composite_spans_op *op,
+			       const struct sna_opacity_box *b, int nbox,
+			       float *v)
+{
+	float xx = op->base.src.transform->matrix[0][0];
+	float x0 = op->base.src.transform->matrix[0][2];
+	float yy = op->base.src.transform->matrix[1][1];
+	float y0 = op->base.src.transform->matrix[1][2];
+	float sx = op->base.src.scale[0];
+	float sy = op->base.src.scale[1];
+	int16_t tx = op->base.src.offset[0];
+	int16_t ty = op->base.src.offset[1];
+
+	do {
+		union {
+			struct sna_coordinate p;
+			float f;
+		} dst;
+
+		dst.p.x = b->box.x2;
+		dst.p.y = b->box.y2;
+		v[0] = dst.f;
+		v[1] = ((b->box.x2 + tx) * xx + x0) * sx;
+		v[6] = v[2] = ((b->box.y2 + ty) * yy + y0) * sy;
+
+		dst.p.x = b->box.x1;
+		v[4] = dst.f;
+		v[9] = v[5] = ((b->box.x1 + tx) * xx + x0) * sx;
+
+		dst.p.y = b->box.y1;
+		v[8] = dst.f;
+		v[10] = ((b->box.y1 + ty) * yy + y0) * sy;
+
+		v[11] = v[7] = v[3] = b->alpha;
+
+		v += 12;
+		b++;
+	} while (--nbox);
+}
+
+avx2 fastcall static void
+emit_span_boxes_simple__avx2(const struct sna_composite_spans_op *op,
+			     const struct sna_opacity_box *b, int nbox,
+			     float *v)
+{
+	float xx = op->base.src.transform->matrix[0][0];
+	float x0 = op->base.src.transform->matrix[0][2];
+	float yy = op->base.src.transform->matrix[1][1];
+	float y0 = op->base.src.transform->matrix[1][2];
+	float sx = op->base.src.scale[0];
+	float sy = op->base.src.scale[1];
+	int16_t tx = op->base.src.offset[0];
+	int16_t ty = op->base.src.offset[1];
+
+	do {
+		union {
+			struct sna_coordinate p;
+			float f;
+		} dst;
+
+		dst.p.x = b->box.x2;
+		dst.p.y = b->box.y2;
+		v[0] = dst.f;
+		v[1] = ((b->box.x2 + tx) * xx + x0) * sx;
+		v[6] = v[2] = ((b->box.y2 + ty) * yy + y0) * sy;
+
+		dst.p.x = b->box.x1;
+		v[4] = dst.f;
+		v[9] = v[5] = ((b->box.x1 + tx) * xx + x0) * sx;
+
+		dst.p.y = b->box.y1;
+		v[8] = dst.f;
+		v[10] = ((b->box.y1 + ty) * yy + y0) * sy;
+
+		v[11] = v[7] = v[3] = b->alpha;
+
+		v += 12;
+		b++;
+	} while (--nbox);
+}
+
+fastcall static void
+emit_span_affine(struct sna *sna,
+		  const struct sna_composite_spans_op *op,
+		  const BoxRec *box,
+		  float opacity)
+{
+	union {
+		struct sna_coordinate p;
+		float f;
+	} dst;
+	float *v;
+
+	assert(op->base.floats_per_rect == 12);
+	assert((sna->render.vertex_used % 4) == 0);
+	v = sna->render.vertices + sna->render.vertex_used;
+	sna->render.vertex_used += 12;
+
+	dst.p.x = box->x2;
+	dst.p.y = box->y2;
+	v[0] = dst.f;
+	_sna_get_transformed_scaled(op->base.src.offset[0] + box->x2,
+				    op->base.src.offset[1] + box->y2,
+				    op->base.src.transform,
+				    op->base.src.scale,
+				    &v[1], &v[2]);
+
+	dst.p.x = box->x1;
+	v[4] = dst.f;
+	_sna_get_transformed_scaled(op->base.src.offset[0] + box->x1,
+				    op->base.src.offset[1] + box->y2,
+				    op->base.src.transform,
+				    op->base.src.scale,
+				    &v[5], &v[6]);
+
+	dst.p.y = box->y1;
+	v[8] = dst.f;
+	_sna_get_transformed_scaled(op->base.src.offset[0] + box->x1,
+				    op->base.src.offset[1] + box->y1,
+				    op->base.src.transform,
+				    op->base.src.scale,
+				    &v[9], &v[10]);
+
+	v[11] = v[7] = v[3] = opacity;
+}
+
+sse4_2 fastcall static void
+emit_span_affine__sse4_2(struct sna *sna,
+			 const struct sna_composite_spans_op *op,
+			 const BoxRec *box,
+			 float opacity)
+{
+	union {
+		struct sna_coordinate p;
+		float f;
+	} dst;
+	float *v;
 
-		dst.p.x = b->box.x2;
-		dst.p.y = b->box.y2;
-		v[0] = dst.f;
-		v[1] = ((b->box.x2 + tx) * xx + x0) * sx;
-		v[6] = v[2] = ((b->box.y2 + ty) * yy + y0) * sy;
+	assert(op->base.floats_per_rect == 12);
+	assert((sna->render.vertex_used % 4) == 0);
+	v = sna->render.vertices + sna->render.vertex_used;
+	sna->render.vertex_used += 12;
 
-		dst.p.x = b->box.x1;
-		v[4] = dst.f;
-		v[9] = v[5] = ((b->box.x1 + tx) * xx + x0) * sx;
+	dst.p.x = box->x2;
+	dst.p.y = box->y2;
+	v[0] = dst.f;
+	_sna_get_transformed_scaled(op->base.src.offset[0] + box->x2,
+				    op->base.src.offset[1] + box->y2,
+				    op->base.src.transform,
+				    op->base.src.scale,
+				    &v[1], &v[2]);
 
-		dst.p.y = b->box.y1;
-		v[8] = dst.f;
-		v[10] = ((b->box.y1 + ty) * yy + y0) * sy;
+	dst.p.x = box->x1;
+	v[4] = dst.f;
+	_sna_get_transformed_scaled(op->base.src.offset[0] + box->x1,
+				    op->base.src.offset[1] + box->y2,
+				    op->base.src.transform,
+				    op->base.src.scale,
+				    &v[5], &v[6]);
 
-		v[11] = v[7] = v[3] = b->alpha;
+	dst.p.y = box->y1;
+	v[8] = dst.f;
+	_sna_get_transformed_scaled(op->base.src.offset[0] + box->x1,
+				    op->base.src.offset[1] + box->y1,
+				    op->base.src.transform,
+				    op->base.src.scale,
+				    &v[9], &v[10]);
 
-		v += 12;
-		b++;
-	} while (--nbox);
+	v[11] = v[7] = v[3] = opacity;
 }
 
-fastcall static void
-emit_span_affine(struct sna *sna,
-		  const struct sna_composite_spans_op *op,
-		  const BoxRec *box,
-		  float opacity)
+avx2 fastcall static void
+emit_span_affine__avx2(struct sna *sna,
+		       const struct sna_composite_spans_op *op,
+		       const BoxRec *box,
+		       float opacity)
 {
 	union {
 		struct sna_coordinate p;
@@ -1432,11 +2266,161 @@ emit_span_boxes_affine(const struct sna_composite_spans_op *op,
 	} while (--nbox);
 }
 
+sse4_2 fastcall static void
+emit_span_boxes_affine__sse4_2(const struct sna_composite_spans_op *op,
+			       const struct sna_opacity_box *b, int nbox,
+			       float *v)
+{
+	do {
+		union {
+			struct sna_coordinate p;
+			float f;
+		} dst;
+
+		dst.p.x = b->box.x2;
+		dst.p.y = b->box.y2;
+		v[0] = dst.f;
+		_sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x2,
+					    op->base.src.offset[1] + b->box.y2,
+					    op->base.src.transform,
+					    op->base.src.scale,
+					    &v[1], &v[2]);
+
+		dst.p.x = b->box.x1;
+		v[4] = dst.f;
+		_sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x1,
+					    op->base.src.offset[1] + b->box.y2,
+					    op->base.src.transform,
+					    op->base.src.scale,
+					    &v[5], &v[6]);
+
+		dst.p.y = b->box.y1;
+		v[8] = dst.f;
+		_sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x1,
+					    op->base.src.offset[1] + b->box.y1,
+					    op->base.src.transform,
+					    op->base.src.scale,
+					    &v[9], &v[10]);
+
+		v[11] = v[7] = v[3] = b->alpha;
+
+		v += 12;
+		b++;
+	} while (--nbox);
+}
+
+avx2 fastcall static void
+emit_span_boxes_affine__avx2(const struct sna_composite_spans_op *op,
+			     const struct sna_opacity_box *b, int nbox,
+			     float *v)
+{
+	do {
+		union {
+			struct sna_coordinate p;
+			float f;
+		} dst;
+
+		dst.p.x = b->box.x2;
+		dst.p.y = b->box.y2;
+		v[0] = dst.f;
+		_sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x2,
+					    op->base.src.offset[1] + b->box.y2,
+					    op->base.src.transform,
+					    op->base.src.scale,
+					    &v[1], &v[2]);
+
+		dst.p.x = b->box.x1;
+		v[4] = dst.f;
+		_sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x1,
+					    op->base.src.offset[1] + b->box.y2,
+					    op->base.src.transform,
+					    op->base.src.scale,
+					    &v[5], &v[6]);
+
+		dst.p.y = b->box.y1;
+		v[8] = dst.f;
+		_sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x1,
+					    op->base.src.offset[1] + b->box.y1,
+					    op->base.src.transform,
+					    op->base.src.scale,
+					    &v[9], &v[10]);
+
+		v[11] = v[7] = v[3] = b->alpha;
+
+		v += 12;
+		b++;
+	} while (--nbox);
+}
+
 fastcall static void
 emit_span_linear(struct sna *sna,
-		  const struct sna_composite_spans_op *op,
-		  const BoxRec *box,
-		  float opacity)
+		 const struct sna_composite_spans_op *op,
+		 const BoxRec *box,
+		 float opacity)
+{
+	union {
+		struct sna_coordinate p;
+		float f;
+	} dst;
+	float *v;
+
+	assert(op->base.floats_per_rect == 9);
+	assert((sna->render.vertex_used % 3) == 0);
+	v = sna->render.vertices + sna->render.vertex_used;
+	sna->render.vertex_used += 9;
+
+	dst.p.x = box->x2;
+	dst.p.y = box->y2;
+	v[0] = dst.f;
+	dst.p.x = box->x1;
+	v[3] = dst.f;
+	dst.p.y = box->y1;
+	v[6] = dst.f;
+
+	v[1] = compute_linear(&op->base.src, box->x2, box->y2);
+	v[4] = compute_linear(&op->base.src, box->x1, box->y2);
+	v[7] = compute_linear(&op->base.src, box->x1, box->y1);
+
+	v[8] = v[5] = v[2] = opacity;
+}
+
+sse4_2 fastcall static void
+emit_span_linear__sse4_2(struct sna *sna,
+			 const struct sna_composite_spans_op *op,
+			 const BoxRec *box,
+			 float opacity)
+{
+	union {
+		struct sna_coordinate p;
+		float f;
+	} dst;
+	float *v;
+
+	assert(op->base.floats_per_rect == 9);
+	assert((sna->render.vertex_used % 3) == 0);
+	v = sna->render.vertices + sna->render.vertex_used;
+	sna->render.vertex_used += 9;
+
+	dst.p.x = box->x2;
+	dst.p.y = box->y2;
+	v[0] = dst.f;
+	dst.p.x = box->x1;
+	v[3] = dst.f;
+	dst.p.y = box->y1;
+	v[6] = dst.f;
+
+	v[1] = compute_linear(&op->base.src, box->x2, box->y2);
+	v[4] = compute_linear(&op->base.src, box->x1, box->y2);
+	v[7] = compute_linear(&op->base.src, box->x1, box->y1);
+
+	v[8] = v[5] = v[2] = opacity;
+}
+
+avx2 fastcall static void
+emit_span_linear__avx2(struct sna *sna,
+		       const struct sna_composite_spans_op *op,
+		       const BoxRec *box,
+		       float opacity)
 {
 	union {
 		struct sna_coordinate p;
@@ -1494,6 +2478,66 @@ emit_span_boxes_linear(const struct sna_composite_spans_op *op,
 	} while (--nbox);
 }
 
+sse4_2 fastcall static void
+emit_span_boxes_linear__sse4_2(const struct sna_composite_spans_op *op,
+			       const struct sna_opacity_box *b, int nbox,
+			       float *v)
+{
+	do {
+		union {
+			struct sna_coordinate p;
+			float f;
+		} dst;
+
+		dst.p.x = b->box.x2;
+		dst.p.y = b->box.y2;
+		v[0] = dst.f;
+		dst.p.x = b->box.x1;
+		v[3] = dst.f;
+		dst.p.y = b->box.y1;
+		v[6] = dst.f;
+
+		v[1] = compute_linear(&op->base.src, b->box.x2, b->box.y2);
+		v[4] = compute_linear(&op->base.src, b->box.x1, b->box.y2);
+		v[7] = compute_linear(&op->base.src, b->box.x1, b->box.y1);
+
+		v[8] = v[5] = v[2] = b->alpha;
+
+		v += 9;
+		b++;
+	} while (--nbox);
+}
+
+avx2 fastcall static void
+emit_span_boxes_linear__avx2(const struct sna_composite_spans_op *op,
+			     const struct sna_opacity_box *b, int nbox,
+			     float *v)
+{
+	do {
+		union {
+			struct sna_coordinate p;
+			float f;
+		} dst;
+
+		dst.p.x = b->box.x2;
+		dst.p.y = b->box.y2;
+		v[0] = dst.f;
+		dst.p.x = b->box.x1;
+		v[3] = dst.f;
+		dst.p.y = b->box.y1;
+		v[6] = dst.f;
+
+		v[1] = compute_linear(&op->base.src, b->box.x2, b->box.y2);
+		v[4] = compute_linear(&op->base.src, b->box.x1, b->box.y2);
+		v[7] = compute_linear(&op->base.src, b->box.x1, b->box.y1);
+
+		v[8] = v[5] = v[2] = b->alpha;
+
+		v += 9;
+		b++;
+	} while (--nbox);
+}
+
 inline inline static uint32_t
 gen4_choose_spans_vertex_buffer(const struct sna_composite_op *op)
 {
@@ -1502,7 +2546,8 @@ gen4_choose_spans_vertex_buffer(const struct sna_composite_op *op)
 	return 1 << 2 | id;
 }
 
-unsigned gen4_choose_spans_emitter(struct sna_composite_spans_op *tmp)
+unsigned gen4_choose_spans_emitter(struct sna *sna,
+				   struct sna_composite_spans_op *tmp)
 {
 	unsigned vb;
 
@@ -1512,24 +2557,56 @@ unsigned gen4_choose_spans_emitter(struct sna_composite_spans_op *tmp)
 		tmp->base.floats_per_vertex = 3;
 		vb = 1 << 2 | 1;
 	} else if (tmp->base.src.is_linear) {
-		tmp->prim_emit = emit_span_linear;
-		tmp->emit_boxes = emit_span_boxes_linear;
+		if (sna->cpu_features & AVX2) {
+			tmp->prim_emit = emit_span_linear__avx2;
+			tmp->emit_boxes = emit_span_boxes_linear__avx2;
+		} else if (sna->cpu_features & SSE4_2) {
+			tmp->prim_emit = emit_span_linear__sse4_2;
+			tmp->emit_boxes = emit_span_boxes_linear__sse4_2;
+		} else {
+			tmp->prim_emit = emit_span_linear;
+			tmp->emit_boxes = emit_span_boxes_linear;
+		}
 		tmp->base.floats_per_vertex = 3;
 		vb = 1 << 2 | 1;
 	} else if (tmp->base.src.transform == NULL) {
-		tmp->prim_emit = emit_span_identity;
-		tmp->emit_boxes = emit_span_boxes_identity;
+		if (sna->cpu_features & AVX2) {
+			tmp->prim_emit = emit_span_identity__avx2;
+			tmp->emit_boxes = emit_span_boxes_identity__avx2;
+		} else if (sna->cpu_features & SSE4_2) {
+			tmp->prim_emit = emit_span_identity__sse4_2;
+			tmp->emit_boxes = emit_span_boxes_identity__sse4_2;
+		} else {
+			tmp->prim_emit = emit_span_identity;
+			tmp->emit_boxes = emit_span_boxes_identity;
+		}
 		tmp->base.floats_per_vertex = 4;
 		vb = 1 << 2 | 2;
 	} else if (tmp->base.is_affine) {
 		tmp->base.src.scale[0] /= tmp->base.src.transform->matrix[2][2];
 		tmp->base.src.scale[1] /= tmp->base.src.transform->matrix[2][2];
 		if (!sna_affine_transform_is_rotation(tmp->base.src.transform)) {
-			tmp->prim_emit = emit_span_simple;
-			tmp->emit_boxes = emit_span_boxes_simple;
+			if (sna->cpu_features & AVX2) {
+				tmp->prim_emit = emit_span_simple__avx2;
+				tmp->emit_boxes = emit_span_boxes_simple__avx2;
+			} else if (sna->cpu_features & SSE4_2) {
+				tmp->prim_emit = emit_span_simple__sse4_2;
+				tmp->emit_boxes = emit_span_boxes_simple__sse4_2;
+			} else {
+				tmp->prim_emit = emit_span_simple;
+				tmp->emit_boxes = emit_span_boxes_simple;
+			}
 		} else {
-			tmp->prim_emit = emit_span_affine;
-			tmp->emit_boxes = emit_span_boxes_affine;
+			if (sna->cpu_features & AVX2) {
+				tmp->prim_emit = emit_span_affine__avx2;
+				tmp->emit_boxes = emit_span_boxes_affine__avx2;
+			} else if (sna->cpu_features & SSE4_2) {
+				tmp->prim_emit = emit_span_affine__sse4_2;
+				tmp->emit_boxes = emit_span_boxes_affine__sse4_2;
+			} else {
+				tmp->prim_emit = emit_span_affine;
+				tmp->emit_boxes = emit_span_boxes_affine;
+			}
 		}
 		tmp->base.floats_per_vertex = 4;
 		vb = 1 << 2 | 2;
diff --git a/src/sna/gen4_vertex.h b/src/sna/gen4_vertex.h
index 431b545..1494ba1 100644
--- a/src/sna/gen4_vertex.h
+++ b/src/sna/gen4_vertex.h
@@ -10,7 +10,7 @@ void gen4_vertex_flush(struct sna *sna);
 int gen4_vertex_finish(struct sna *sna);
 void gen4_vertex_close(struct sna *sna);
 
-unsigned gen4_choose_composite_emitter(struct sna_composite_op *tmp);
-unsigned gen4_choose_spans_emitter(struct sna_composite_spans_op *tmp);
+unsigned gen4_choose_composite_emitter(struct sna *sna, struct sna_composite_op *tmp);
+unsigned gen4_choose_spans_emitter(struct sna *sna, struct sna_composite_spans_op *tmp);
 
 #endif /* GEN4_VERTEX_H */
diff --git a/src/sna/gen5_render.c b/src/sna/gen5_render.c
index f236877..8b9eaac 100644
--- a/src/sna/gen5_render.c
+++ b/src/sna/gen5_render.c
@@ -1924,7 +1924,7 @@ gen5_render_composite(struct sna *sna,
 					     tmp->mask.bo != NULL,
 					     tmp->has_component_alpha,
 					     tmp->is_affine);
-	tmp->u.gen5.ve_id = gen4_choose_composite_emitter(tmp);
+	tmp->u.gen5.ve_id = gen4_choose_composite_emitter(sna, tmp);
 
 	tmp->blt   = gen5_render_composite_blt;
 	tmp->box   = gen5_render_composite_box;
@@ -2152,7 +2152,7 @@ gen5_render_composite_spans(struct sna *sna,
 	tmp->base.has_component_alpha = false;
 	tmp->base.need_magic_ca_pass = false;
 
-	tmp->base.u.gen5.ve_id = gen4_choose_spans_emitter(tmp);
+	tmp->base.u.gen5.ve_id = gen4_choose_spans_emitter(sna, tmp);
 	tmp->base.u.gen5.wm_kernel = WM_KERNEL_OPACITY | !tmp->base.is_affine;
 
 	tmp->box   = gen5_render_composite_spans_box;
diff --git a/src/sna/gen6_render.c b/src/sna/gen6_render.c
index fa4c47b..d410514 100644
--- a/src/sna/gen6_render.c
+++ b/src/sna/gen6_render.c
@@ -2272,7 +2272,7 @@ gen6_render_composite(struct sna *sna,
 							    tmp->mask.bo != NULL,
 							    tmp->has_component_alpha,
 							    tmp->is_affine),
-			       gen4_choose_composite_emitter(tmp));
+			       gen4_choose_composite_emitter(sna, tmp));
 
 	tmp->blt   = gen6_render_composite_blt;
 	tmp->box   = gen6_render_composite_box;
@@ -2508,7 +2508,7 @@ gen6_render_composite_spans(struct sna *sna,
 					      SAMPLER_EXTEND_PAD),
 			       gen6_get_blend(tmp->base.op, false, tmp->base.dst.format),
 			       GEN6_WM_KERNEL_OPACITY | !tmp->base.is_affine,
-			       gen4_choose_spans_emitter(tmp));
+			       gen4_choose_spans_emitter(sna, tmp));
 
 	tmp->box   = gen6_render_composite_spans_box;
 	tmp->boxes = gen6_render_composite_spans_boxes;
diff --git a/src/sna/gen7_render.c b/src/sna/gen7_render.c
index bd14d90..7984cf1 100644
--- a/src/sna/gen7_render.c
+++ b/src/sna/gen7_render.c
@@ -2412,7 +2412,7 @@ gen7_render_composite(struct sna *sna,
 							    tmp->mask.bo != NULL,
 							    tmp->has_component_alpha,
 							    tmp->is_affine),
-			       gen4_choose_composite_emitter(tmp));
+			       gen4_choose_composite_emitter(sna, tmp));
 
 	tmp->blt   = gen7_render_composite_blt;
 	tmp->box   = gen7_render_composite_box;
@@ -2628,7 +2628,7 @@ gen7_render_composite_spans(struct sna *sna,
 					      SAMPLER_EXTEND_PAD),
 			       gen7_get_blend(tmp->base.op, false, tmp->base.dst.format),
 			       GEN7_WM_KERNEL_OPACITY | !tmp->base.is_affine,
-			       gen4_choose_spans_emitter(tmp));
+			       gen4_choose_spans_emitter(sna, tmp));
 
 	tmp->box   = gen7_render_composite_spans_box;
 	tmp->boxes = gen7_render_composite_spans_boxes;
commit f095678125b25aeae80d838729a7f89d09007e10
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon Feb 25 23:14:12 2013 +0000

    test: Correct ignore path for vsync.avi
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/test/.gitignore b/test/.gitignore
index b1c350e..6576f55 100644
--- a/test/.gitignore
+++ b/test/.gitignore
@@ -14,4 +14,4 @@ render-copyarea-size
 render-copy-alphaless
 mixed-stress
 lowlevel-blt-bench
-test/vsync.avi
+vsync.avi
commit aa045d80ec0d3366745d1671b9f667cec587d804
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon Feb 25 23:13:23 2013 +0000

    sna: Detect available instruction sets at runtime
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
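
For reference, a minimal standalone probe of the same flags using GCC's
<cpuid.h> (a sketch, not the driver's sna_cpu_detect(); the basic feature
bits are reported by leaf 1 in ECX/EDX, and AVX2 by leaf 7, subleaf 0, in
EBX):

    #include <stdio.h>
    #include <cpuid.h>  /* __get_cpuid(), __cpuid_count(), bit_* masks */

    int main(void)
    {
        unsigned int eax, ebx, ecx, edx;

        /* Leaf 1: basic feature flags live in ECX and EDX. */
        if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
            return 1;

        printf("mmx:    %d\n", !!(edx & bit_MMX));
        printf("sse2:   %d\n", !!(edx & bit_SSE2));
        printf("sse4.2: %d\n", !!(ecx & bit_SSE4_2));
        printf("avx:    %d\n", !!(ecx & bit_AVX));

        /* Leaf 7, subleaf 0: extended features; AVX2 is EBX bit 5. */
        if (__get_cpuid_max(0, NULL) >= 7) {
            __cpuid_count(7, 0, eax, ebx, ecx, edx);
            printf("avx2:   %d\n", !!(ebx & (1u << 5)));
        }
        return 0;
    }

A complete AVX/AVX2 check would also verify OS support via the OSXSAVE bit
and XGETBV, which this sketch glosses over.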

diff --git a/src/sna/Makefile.am b/src/sna/Makefile.am
index c74c904..b5da0cf 100644
--- a/src/sna/Makefile.am
+++ b/src/sna/Makefile.am
@@ -48,6 +48,7 @@ libsna_la_SOURCES = \
 	sna_accel.c \
 	sna_blt.c \
 	sna_composite.c \
+	sna_cpu.c \
 	sna_damage.c \
 	sna_damage.h \
 	sna_display.c \
diff --git a/src/sna/sna.h b/src/sna/sna.h
index f6e89ec..77a52bd 100644
--- a/src/sna/sna.h
+++ b/src/sna/sna.h
@@ -207,6 +207,18 @@ struct sna {
 #define SNA_TEAR_FREE		0x10
 #define SNA_FORCE_SHADOW	0x20
 
+	unsigned cpu_features;
+#define MMX 0x1
+#define SSE 0x2
+#define SSE2 0x4
+#define SSE3 0x8
+#define SSSE3 0x10
+#define SSE4a 0x20
+#define SSE4_1 0x40
+#define SSE4_2 0x80
+#define AVX 0x100
+#define AVX2 0x200
+
 	unsigned watch_flush;
 
 	struct timeval timer_tv;
@@ -855,6 +867,8 @@ inline static bool is_clipped(const RegionRec *r,
 		r->extents.y2 - r->extents.y1 != d->height);
 }
 
+unsigned sna_cpu_detect(void);
+
 void sna_threads_init(void);
 int sna_use_threads (int width, int height, int threshold);
 void sna_threads_run(void (*func)(void *arg), void *arg);
diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 81fe165..6015ab6 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -1371,7 +1371,7 @@ static bool
 sna_pixmap_create_mappable_gpu(PixmapPtr pixmap)
 {
 	struct sna *sna = to_sna_from_pixmap(pixmap);
-	struct sna_pixmap *priv = sna_pixmap(pixmap);;
+	struct sna_pixmap *priv = sna_pixmap(pixmap);
 	unsigned create;
 
 	if (wedged(sna))
diff --git a/src/sna/sna_cpu.c b/src/sna/sna_cpu.c
new file mode 100644
index 0000000..65f6e04
--- /dev/null
+++ b/src/sna/sna_cpu.c
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2013 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Chris Wilson <chris at chris-wilson.co.uk>
+ *
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "sna.h"
+
+#include <cpuid.h>
+
+unsigned sna_cpu_detect(void)
+{
+	unsigned int eax, ebx, ecx, edx;
+	unsigned features = 0;
+
+	__cpuid(0, eax, ebx, ecx, edx);
+
+	if (eax & bit_SSE3)
+		features |= SSE3;
+
+	if (eax & bit_SSSE3)
+		features |= SSSE3;
+
+	if (eax & bit_SSE4_1)
+		features |= SSE4_1;
+
+	if (eax & bit_SSE4_2)
+		features |= SSE4_2;
+
+	if (eax & bit_AVX)
+		features |= AVX;
+
+	if (edx & bit_MMX)
+		features |= MMX;
+
+	if (edx & bit_SSE)
+		features |= SSE;
+
+	if (edx & bit_SSE2)
+		features |= SSE2;
+
+	if (edx & bit_SSE4a)
+		features |= SSE4a;
+
+	__cpuid(7, eax, ebx, ecx, edx);
+
+	if (eax & bit_AVX2)
+		features |= AVX2;
+
+	return features;
+}
diff --git a/src/sna/sna_driver.c b/src/sna/sna_driver.c
index b73f3c5..1930660 100644
--- a/src/sna/sna_driver.c
+++ b/src/sna/sna_driver.c
@@ -487,6 +487,8 @@ static Bool sna_pre_init(ScrnInfoPtr scrn, int flags)
 
 		sna->info = (void *)((uintptr_t)scrn->driverPrivate & ~1);
 		scrn->driverPrivate = sna;
+
+		sna->cpu_features = sna_cpu_detect();
 	}
 	sna = to_sna(scrn);
 	sna->scrn = scrn;

