xf86-video-intel: 3 commits - configure.ac src/sna/atomic.h src/sna/compiler.h src/sna/gen3_render.c src/sna/gen4_render.c src/sna/gen4_vertex.c src/sna/gen5_render.c src/sna/gen6_render.c src/sna/gen7_render.c src/sna/kgem_debug_gen6.c src/sna/Makefile.am src/sna/sna_blt.c src/sna/sna.h src/sna/sna_render.c src/sna/sna_render.h src/sna/sna_threads.c src/sna/sna_trapezoids.c src/sna/sna_vertex.c

Chris Wilson ickle at kemper.freedesktop.org
Sun Jan 27 09:00:28 PST 2013


 configure.ac              |   34 ++
 src/sna/Makefile.am       |    2 
 src/sna/atomic.h          |   89 +++++++
 src/sna/compiler.h        |    2 
 src/sna/gen3_render.c     |  562 ++++++++++++++++++++++++++++++++++++++++++++--
 src/sna/gen4_render.c     |  118 +++++++++
 src/sna/gen4_vertex.c     |  466 ++++++++++++++++++++++++++++++++++++--
 src/sna/gen5_render.c     |  118 +++++++++
 src/sna/gen6_render.c     |  124 +++++++++-
 src/sna/gen7_render.c     |  118 +++++++++
 src/sna/kgem_debug_gen6.c |    4 
 src/sna/sna.h             |    1 
 src/sna/sna_blt.c         |  306 ++++++++++++++++++++++++-
 src/sna/sna_render.c      |    2 
 src/sna/sna_render.h      |   55 ++++
 src/sna/sna_threads.c     |    6 
 src/sna/sna_trapezoids.c  |  475 +++++++++++++++++++++++++++++++++++---
 src/sna/sna_vertex.c      |   37 +++
 18 files changed, 2414 insertions(+), 105 deletions(-)

New commits:
commit 73f574945f2cac14f9bafa6395e2c4dbb16fcf5d
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sun Jan 27 16:02:52 2013 +0000

    sna: Disable all signals in the render threads
    
    X uses them (SIGIO especially) for input handling, and gets rightfully
    confused if it finds itself in a different thread.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_threads.c b/src/sna/sna_threads.c
index 4a98753..f77ddbf 100644
--- a/src/sna/sna_threads.c
+++ b/src/sna/sna_threads.c
@@ -29,6 +29,7 @@
 
 #include <unistd.h>
 #include <pthread.h>
+#include <signal.h>
 
 static int max_threads = -1;
 
@@ -44,6 +45,11 @@ static struct thread {
 static void *__run__(void *arg)
 {
 	struct thread *t = arg;
+	sigset_t signals;
+
+	/* Disable all signals in the slave threads as X uses them for IO */
+	sigfillset(&signals);
+	pthread_sigmask(SIG_BLOCK, &signals, NULL);
 
 	pthread_mutex_lock(&t->mutex);
 	while (1) {
commit 9a7bf70365980809d0f02190f2f620a957ff1ba8
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sat Jan 26 23:03:33 2013 +0000

    sna: Enable threaded rasterisation for non-antialiased geometry
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/gen3_render.c b/src/sna/gen3_render.c
index 3224d71..6c0ea6a 100644
--- a/src/sna/gen3_render.c
+++ b/src/sna/gen3_render.c
@@ -431,6 +431,26 @@ gen3_emit_composite_primitive_constant(struct sna *sna,
 }
 
 fastcall static void
+gen3_emit_composite_boxes_constant(const struct sna_composite_op *op,
+				   const BoxRec *box, int nbox,
+				   float *v)
+{
+	do {
+		v[0] = box->x2;
+		v[1] = box->y2;
+
+		v[2] = box->x1;
+		v[3] = box->y2;
+
+		v[4] = box->x1;
+		v[5] = box->y1;
+
+		box++;
+		v += 6;
+	} while (--nbox);
+}
+
+fastcall static void
 gen3_emit_composite_primitive_identity_gradient(struct sna *sna,
 						const struct sna_composite_op *op,
 						const struct sna_composite_rectangles *r)
@@ -457,6 +477,32 @@ gen3_emit_composite_primitive_identity_gradient(struct sna *sna,
 }
 
 fastcall static void
+gen3_emit_composite_boxes_identity_gradient(const struct sna_composite_op *op,
+					    const BoxRec *box, int nbox,
+					    float *v)
+{
+	do {
+		v[0] = box->x2;
+		v[1] = box->y2;
+		v[2] = box->x2 + op->src.offset[0];
+		v[3] = box->y2 + op->src.offset[1];
+
+		v[4] = box->x1;
+		v[5] = box->y2;
+		v[6] = box->x1 + op->src.offset[0];
+		v[7] = box->y2 + op->src.offset[1];
+
+		v[8] = box->x1;
+		v[9] = box->y1;
+		v[10] = box->x1 + op->src.offset[0];
+		v[11] = box->y1 + op->src.offset[1];
+
+		v += 12;
+		box++;
+	} while (--nbox);
+}
+
+fastcall static void
 gen3_emit_composite_primitive_affine_gradient(struct sna *sna,
 					      const struct sna_composite_op *op,
 					      const struct sna_composite_rectangles *r)
@@ -494,6 +540,40 @@ gen3_emit_composite_primitive_affine_gradient(struct sna *sna,
 }
 
 fastcall static void
+gen3_emit_composite_boxes_affine_gradient(const struct sna_composite_op *op,
+					  const BoxRec *box, int nbox,
+					  float *v)
+{
+	const PictTransform *transform = op->src.transform;
+
+	do {
+		v[0] = box->x2;
+		v[1] = box->y2;
+		sna_get_transformed_coordinates(box->x2 + op->src.offset[0],
+						box->y2 + op->src.offset[1],
+						transform,
+						&v[2], &v[3]);
+
+		v[4] = box->x1;
+		v[5] = box->y2;
+		sna_get_transformed_coordinates(box->x1 + op->src.offset[0],
+						box->y2 + op->src.offset[1],
+						transform,
+						&v[6], &v[7]);
+
+		v[8] = box->x1;
+		v[9] = box->y1;
+		sna_get_transformed_coordinates(box->x1 + op->src.offset[0],
+						box->y1 + op->src.offset[1],
+						transform,
+						&v[10], &v[11]);
+
+		box++;
+		v += 12;
+	} while (--nbox);
+}
+
+fastcall static void
 gen3_emit_composite_primitive_identity_source(struct sna *sna,
 					      const struct sna_composite_op *op,
 					      const struct sna_composite_rectangles *r)
@@ -519,6 +599,28 @@ gen3_emit_composite_primitive_identity_source(struct sna *sna,
 }
 
 fastcall static void
+gen3_emit_composite_boxes_identity_source(const struct sna_composite_op *op,
+					  const BoxRec *box, int nbox,
+					  float *v)
+{
+	do {
+		v[0] = box->x2 + op->dst.x;
+		v[8] = v[4] = box->x1 + op->dst.x;
+		v[5] = v[1] = box->y2 + op->dst.y;
+		v[9] = box->y1 + op->dst.y;
+
+		v[10] = v[6] = (box->x1 + op->src.offset[0]) * op->src.scale[0];
+		v[2] = (box->x2 + op->src.offset[0]) * op->src.scale[0];
+
+		v[11] = (box->y1 + op->src.offset[1]) * op->src.scale[1];
+		v[7] = v[3] = (box->y2 + op->src.offset[1]) * op->src.scale[1];
+
+		v += 12;
+		box++;
+	} while (--nbox);
+}
+
+fastcall static void
 gen3_emit_composite_primitive_identity_source_no_offset(struct sna *sna,
 							const struct sna_composite_op *op,
 							const struct sna_composite_rectangles *r)
@@ -544,6 +646,28 @@ gen3_emit_composite_primitive_identity_source_no_offset(struct sna *sna,
 }
 
 fastcall static void
+gen3_emit_composite_boxes_identity_source_no_offset(const struct sna_composite_op *op,
+						    const BoxRec *box, int nbox,
+						    float *v)
+{
+	do {
+		v[0] = box->x2;
+		v[8] = v[4] = box->x1;
+		v[5] = v[1] = box->y2;
+		v[9] = box->y1;
+
+		v[10] = v[6] = box->x1 * op->src.scale[0];
+		v[2] = box->x2 * op->src.scale[0];
+
+		v[11] = box->y1 * op->src.scale[1];
+		v[7] = v[3] = box->y2 * op->src.scale[1];
+
+		v += 12;
+		box++;
+	} while (--nbox);
+}
+
+fastcall static void
 gen3_emit_composite_primitive_affine_source(struct sna *sna,
 					    const struct sna_composite_op *op,
 					    const struct sna_composite_rectangles *r)
@@ -577,6 +701,39 @@ gen3_emit_composite_primitive_affine_source(struct sna *sna,
 }
 
 fastcall static void
+gen3_emit_composite_boxes_affine_source(const struct sna_composite_op *op,
+					const BoxRec *box, int nbox,
+					float *v)
+{
+	const PictTransform *transform = op->src.transform;
+
+	do {
+		v[0] = box->x2;
+		v[5] = v[1] = box->y2;
+		v[8] = v[4] = box->x1;
+		v[9] = box->y1;
+
+		_sna_get_transformed_scaled(box->x2 + op->src.offset[0],
+					    box->y2 + op->src.offset[1],
+					    transform, op->src.scale,
+					    &v[2], &v[3]);
+
+		_sna_get_transformed_scaled(box->x1 + op->src.offset[0],
+					    box->y2 + op->src.offset[1],
+					    transform, op->src.scale,
+					    &v[6], &v[7]);
+
+		_sna_get_transformed_scaled(box->x1 + op->src.offset[0],
+					    box->y1 + op->src.offset[1],
+					    transform, op->src.scale,
+					    &v[10], &v[11]);
+
+		v += 12;
+		box++;
+	} while (--nbox);
+}
+
+fastcall static void
 gen3_emit_composite_primitive_constant_identity_mask(struct sna *sna,
 						     const struct sna_composite_op *op,
 						     const struct sna_composite_rectangles *r)
@@ -1900,9 +2057,9 @@ gen3_render_composite_box(struct sna *sna,
 }
 
 static void
-gen3_render_composite_boxes(struct sna *sna,
-			    const struct sna_composite_op *op,
-			    const BoxRec *box, int nbox)
+gen3_render_composite_boxes__blt(struct sna *sna,
+				 const struct sna_composite_op *op,
+				 const BoxRec *box, int nbox)
 {
 	DBG(("%s: nbox=%d, src=+(%d, %d), mask=+(%d, %d), dst=+(%d, %d)\n",
 	     __FUNCTION__, nbox,
@@ -1936,6 +2093,60 @@ gen3_render_composite_boxes(struct sna *sna,
 }
 
 static void
+gen3_render_composite_boxes(struct sna *sna,
+			    const struct sna_composite_op *op,
+			    const BoxRec *box, int nbox)
+{
+	DBG(("%s: nbox=%d\n", __FUNCTION__, nbox));
+
+	do {
+		int nbox_this_time;
+		float *v;
+
+		nbox_this_time = gen3_get_rectangles(sna, op, nbox);
+		assert(nbox_this_time);
+		nbox -= nbox_this_time;
+
+		v = sna->render.vertices + sna->render.vertex_used;
+		sna->render.vertex_used += nbox_this_time * op->floats_per_rect;
+
+		op->emit_boxes(op, box, nbox_this_time, v);
+		box += nbox_this_time;
+	} while (nbox);
+}
+
+static void
+gen3_render_composite_boxes__thread(struct sna *sna,
+				    const struct sna_composite_op *op,
+				    const BoxRec *box, int nbox)
+{
+	DBG(("%s: nbox=%d\n", __FUNCTION__, nbox));
+
+	sna_vertex_lock(&sna->render);
+	do {
+		int nbox_this_time;
+		float *v;
+
+		nbox_this_time = gen3_get_rectangles(sna, op, nbox);
+		assert(nbox_this_time);
+		nbox -= nbox_this_time;
+
+		v = sna->render.vertices + sna->render.vertex_used;
+		sna->render.vertex_used += nbox_this_time * op->floats_per_rect;
+
+		sna_vertex_acquire__locked(&sna->render);
+		sna_vertex_unlock(&sna->render);
+
+		op->emit_boxes(op, box, nbox_this_time, v);
+		box += nbox_this_time;
+
+		sna_vertex_lock(&sna->render);
+		sna_vertex_release__locked(&sna->render);
+	} while (nbox);
+	sna_vertex_unlock(&sna->render);
+}
+
+static void
 gen3_render_composite_done(struct sna *sna,
 			   const struct sna_composite_op *op)
 {
@@ -2986,24 +3197,32 @@ gen3_render_composite(struct sna *sna,
 		case SHADER_WHITE:
 		case SHADER_CONSTANT:
 			tmp->prim_emit = gen3_emit_composite_primitive_constant;
+			tmp->emit_boxes = gen3_emit_composite_boxes_constant;
 			break;
 		case SHADER_LINEAR:
 		case SHADER_RADIAL:
-			if (tmp->src.transform == NULL)
+			if (tmp->src.transform == NULL) {
 				tmp->prim_emit = gen3_emit_composite_primitive_identity_gradient;
-			else if (tmp->src.is_affine)
+				tmp->emit_boxes = gen3_emit_composite_boxes_identity_gradient;
+			} else if (tmp->src.is_affine) {
 				tmp->prim_emit = gen3_emit_composite_primitive_affine_gradient;
+				tmp->emit_boxes = gen3_emit_composite_boxes_affine_gradient;
+			}
 			break;
 		case SHADER_TEXTURE:
 			if (tmp->src.transform == NULL) {
-				if ((tmp->src.offset[0]|tmp->src.offset[1]|tmp->dst.x|tmp->dst.y) == 0)
+				if ((tmp->src.offset[0]|tmp->src.offset[1]|tmp->dst.x|tmp->dst.y) == 0) {
 					tmp->prim_emit = gen3_emit_composite_primitive_identity_source_no_offset;
-				else
+					tmp->emit_boxes = gen3_emit_composite_boxes_identity_source_no_offset;
+				} else {
 					tmp->prim_emit = gen3_emit_composite_primitive_identity_source;
+					tmp->emit_boxes = gen3_emit_composite_boxes_identity_source;
+				}
 			} else if (tmp->src.is_affine) {
 				tmp->src.scale[0] /= tmp->src.transform->matrix[2][2];
 				tmp->src.scale[1] /= tmp->src.transform->matrix[2][2];
 				tmp->prim_emit = gen3_emit_composite_primitive_affine_source;
+				tmp->emit_boxes = gen3_emit_composite_boxes_affine_source;
 			}
 			break;
 		}
@@ -3035,7 +3254,11 @@ gen3_render_composite(struct sna *sna,
 
 	tmp->blt   = gen3_render_composite_blt;
 	tmp->box   = gen3_render_composite_box;
-	tmp->boxes = gen3_render_composite_boxes;
+	tmp->boxes = gen3_render_composite_boxes__blt;
+	if (tmp->emit_boxes) {
+		tmp->boxes = gen3_render_composite_boxes;
+		tmp->thread_boxes = gen3_render_composite_boxes__thread;
+	}
 	tmp->done  = gen3_render_composite_done;
 
 	if (!kgem_check_bo(&sna->kgem,
diff --git a/src/sna/gen4_render.c b/src/sna/gen4_render.c
index 65016cd..e4f5f59 100644
--- a/src/sna/gen4_render.c
+++ b/src/sna/gen4_render.c
@@ -1109,9 +1109,9 @@ gen4_render_composite_box(struct sna *sna,
 }
 
 static void
-gen4_render_composite_boxes(struct sna *sna,
-			    const struct sna_composite_op *op,
-			    const BoxRec *box, int nbox)
+gen4_render_composite_boxes__blt(struct sna *sna,
+				 const struct sna_composite_op *op,
+				 const BoxRec *box, int nbox)
 {
 	DBG(("%s(%d) delta=(%d, %d), src=(%d, %d)/(%d, %d), mask=(%d, %d)/(%d, %d)\n",
 	     __FUNCTION__, nbox, op->dst.x, op->dst.y,
@@ -1145,6 +1145,62 @@ gen4_render_composite_boxes(struct sna *sna,
 	} while (nbox);
 }
 
+static void
+gen4_render_composite_boxes(struct sna *sna,
+			    const struct sna_composite_op *op,
+			    const BoxRec *box, int nbox)
+{
+	DBG(("%s: nbox=%d\n", __FUNCTION__, nbox));
+
+	do {
+		int nbox_this_time;
+		float *v;
+
+		nbox_this_time = gen4_get_rectangles(sna, op, nbox,
+						     gen4_bind_surfaces);
+		assert(nbox_this_time);
+		nbox -= nbox_this_time;
+
+		v = sna->render.vertices + sna->render.vertex_used;
+		sna->render.vertex_used += nbox_this_time * op->floats_per_rect;
+
+		op->emit_boxes(op, box, nbox_this_time, v);
+		box += nbox_this_time;
+	} while (nbox);
+}
+
+static void
+gen4_render_composite_boxes__thread(struct sna *sna,
+				    const struct sna_composite_op *op,
+				    const BoxRec *box, int nbox)
+{
+	DBG(("%s: nbox=%d\n", __FUNCTION__, nbox));
+
+	sna_vertex_lock(&sna->render);
+	do {
+		int nbox_this_time;
+		float *v;
+
+		nbox_this_time = gen4_get_rectangles(sna, op, nbox,
+						     gen4_bind_surfaces);
+		assert(nbox_this_time);
+		nbox -= nbox_this_time;
+
+		v = sna->render.vertices + sna->render.vertex_used;
+		sna->render.vertex_used += nbox_this_time * op->floats_per_rect;
+
+		sna_vertex_acquire__locked(&sna->render);
+		sna_vertex_unlock(&sna->render);
+
+		op->emit_boxes(op, box, nbox_this_time, v);
+		box += nbox_this_time;
+
+		sna_vertex_lock(&sna->render);
+		sna_vertex_release__locked(&sna->render);
+	} while (nbox);
+	sna_vertex_unlock(&sna->render);
+}
+
 #ifndef MAX
 #define MAX(a,b) ((a) > (b) ? (a) : (b))
 #endif
@@ -1899,7 +1955,11 @@ gen4_render_composite(struct sna *sna,
 
 	tmp->blt   = gen4_render_composite_blt;
 	tmp->box   = gen4_render_composite_box;
-	tmp->boxes = gen4_render_composite_boxes;
+	tmp->boxes = gen4_render_composite_boxes__blt;
+	if (tmp->emit_boxes) {
+		tmp->boxes = gen4_render_composite_boxes;
+		tmp->thread_boxes = gen4_render_composite_boxes__thread;
+	}
 	tmp->done  = gen4_render_composite_done;
 
 	if (!kgem_check_bo(&sna->kgem,
diff --git a/src/sna/gen4_vertex.c b/src/sna/gen4_vertex.c
index cc679d3..e513166 100644
--- a/src/sna/gen4_vertex.c
+++ b/src/sna/gen4_vertex.c
@@ -360,6 +360,31 @@ emit_primitive_solid(struct sna *sna,
 }
 
 fastcall static void
+emit_boxes_solid(const struct sna_composite_op *op,
+		 const BoxRec *box, int nbox,
+		 float *v)
+{
+	do {
+		union {
+			struct sna_coordinate p;
+			float f;
+		} dst;
+
+		dst.p.x = box->x2;
+		dst.p.y = box->y2;
+		v[0] = dst.f;
+		dst.p.x = box->x1;
+		v[2] = dst.f;
+		dst.p.y = box->y1;
+		v[4] = dst.f;
+
+		v[5] = v[3] = v[1] = .5;
+		box++;
+		v += 6;
+	} while (--nbox);
+}
+
+fastcall static void
 emit_primitive_linear(struct sna *sna,
 		      const struct sna_composite_op *op,
 		      const struct sna_composite_rectangles *r)
@@ -390,6 +415,34 @@ emit_primitive_linear(struct sna *sna,
 }
 
 fastcall static void
+emit_boxes_linear(const struct sna_composite_op *op,
+		  const BoxRec *box, int nbox,
+		  float *v)
+{
+	union {
+		struct sna_coordinate p;
+		float f;
+	} dst;
+
+	do {
+		dst.p.x = box->x2;
+		dst.p.y = box->y2;
+		v[0] = dst.f;
+		dst.p.x = box->x1;
+		v[2] = dst.f;
+		dst.p.y = box->y1;
+		v[4] = dst.f;
+
+		v[1] = compute_linear(&op->src, box->x2, box->y2);
+		v[3] = compute_linear(&op->src, box->x1, box->y2);
+		v[5] = compute_linear(&op->src, box->x1, box->y1);
+
+		v += 6;
+		box++;
+	} while (--nbox);
+}
+
+fastcall static void
 emit_primitive_identity_source(struct sna *sna,
 			       const struct sna_composite_op *op,
 			       const struct sna_composite_rectangles *r)
@@ -421,6 +474,36 @@ emit_primitive_identity_source(struct sna *sna,
 }
 
 fastcall static void
+emit_boxes_identity_source(const struct sna_composite_op *op,
+			   const BoxRec *box, int nbox,
+			   float *v)
+{
+	do {
+		union {
+			struct sna_coordinate p;
+			float f;
+		} dst;
+
+		dst.p.x = box->x2;
+		dst.p.y = box->y2;
+		v[0] = dst.f;
+		dst.p.x = box->x1;
+		v[3] = dst.f;
+		dst.p.y = box->y1;
+		v[6] = dst.f;
+
+		v[7] = v[4] = (box->x1 + op->src.offset[0]) * op->src.scale[0];
+		v[1] = (box->x2 + op->src.offset[0]) * op->src.scale[0];
+
+		v[8] = (box->y1 + op->src.offset[1]) * op->src.scale[1];
+		v[2] = v[5] = (box->y2 + op->src.offset[1]) * op->src.scale[1];
+
+		v += 9;
+		box++;
+	} while (--nbox);
+}
+
+fastcall static void
 emit_primitive_simple_source(struct sna *sna,
 			     const struct sna_composite_op *op,
 			     const struct sna_composite_rectangles *r)
@@ -461,6 +544,45 @@ emit_primitive_simple_source(struct sna *sna,
 }
 
 fastcall static void
+emit_boxes_simple_source(const struct sna_composite_op *op,
+			 const BoxRec *box, int nbox,
+			 float *v)
+{
+	float xx = op->src.transform->matrix[0][0];
+	float x0 = op->src.transform->matrix[0][2];
+	float yy = op->src.transform->matrix[1][1];
+	float y0 = op->src.transform->matrix[1][2];
+	float sx = op->src.scale[0];
+	float sy = op->src.scale[1];
+	int16_t tx = op->src.offset[0];
+	int16_t ty = op->src.offset[1];
+
+	do {
+		union {
+			struct sna_coordinate p;
+			float f;
+		} dst;
+
+		dst.p.x = box->x2;
+		dst.p.y = box->y2;
+		v[0] = dst.f;
+		v[1] = ((box->x2 + tx) * xx + x0) * sx;
+		v[5] = v[2] = ((box->y2 + ty) * yy + y0) * sy;
+
+		dst.p.x = box->x1;
+		v[3] = dst.f;
+		v[7] = v[4] = ((box->x1 + tx) * xx + x0) * sx;
+
+		dst.p.y = box->y1;
+		v[6] = dst.f;
+		v[8] = ((box->y1 + ty) * yy + y0) * sy;
+
+		v += 9;
+		box++;
+	} while (--nbox);
+}
+
+fastcall static void
 emit_primitive_affine_source(struct sna *sna,
 			     const struct sna_composite_op *op,
 			     const struct sna_composite_rectangles *r)
@@ -500,6 +622,43 @@ emit_primitive_affine_source(struct sna *sna,
 }
 
 fastcall static void
+emit_boxes_affine_source(const struct sna_composite_op *op,
+			 const BoxRec *box, int nbox,
+			 float *v)
+{
+	do {
+		union {
+			struct sna_coordinate p;
+			float f;
+		} dst;
+
+		dst.p.x = box->x2;
+		dst.p.y = box->y2;
+		v[0] = dst.f;
+		_sna_get_transformed_scaled(op->src.offset[0] + box->x2,
+					    op->src.offset[1] + box->y2,
+					    op->src.transform, op->src.scale,
+					    &v[1], &v[2]);
+
+		dst.p.x = box->x1;
+		v[3] = dst.f;
+		_sna_get_transformed_scaled(op->src.offset[0] + box->x1,
+					    op->src.offset[1] + box->y2,
+					    op->src.transform, op->src.scale,
+					    &v[4], &v[5]);
+
+		dst.p.y = box->y1;
+		v[6] = dst.f;
+		_sna_get_transformed_scaled(op->src.offset[0] + box->x1,
+					    op->src.offset[1] + box->y1,
+					    op->src.transform, op->src.scale,
+					    &v[7], &v[8]);
+		box++;
+		v += 9;
+	} while (--nbox);
+}
+
+fastcall static void
 emit_primitive_identity_mask(struct sna *sna,
 			     const struct sna_composite_op *op,
 			     const struct sna_composite_rectangles *r)
@@ -543,6 +702,40 @@ emit_primitive_identity_mask(struct sna *sna,
 }
 
 fastcall static void
+emit_boxes_identity_mask(const struct sna_composite_op *op,
+			 const BoxRec *box, int nbox,
+			 float *v)
+{
+	float msk_x = op->mask.offset[0];
+	float msk_y = op->mask.offset[1];
+
+	do {
+		union {
+			struct sna_coordinate p;
+			float f;
+		} dst;
+
+		dst.p.x = box->x2;
+		dst.p.y = box->y2;
+		v[0] = dst.f;
+		v[2] = (msk_x + box->x2) * op->mask.scale[0];
+		v[7] = v[3] = (msk_y + box->y2) * op->mask.scale[1];
+
+		dst.p.x = box->x1;
+		v[4] = dst.f;
+		v[10] = v[6] = (msk_x + box->x1) * op->mask.scale[0];
+
+		dst.p.y = box->y1;
+		v[8] = dst.f;
+		v[11] = (msk_y + box->y1) * op->mask.scale[1];
+
+		v[9] = v[5] = v[1] = .5;
+		v += 12;
+		box++;
+	} while (--nbox);
+}
+
+fastcall static void
 emit_primitive_linear_identity_mask(struct sna *sna,
 				    const struct sna_composite_op *op,
 				    const struct sna_composite_rectangles *r)
@@ -588,6 +781,43 @@ emit_primitive_linear_identity_mask(struct sna *sna,
 }
 
 fastcall static void
+emit_boxes_linear_identity_mask(const struct sna_composite_op *op,
+				const BoxRec *box, int nbox,
+				float *v)
+{
+	float msk_x = op->mask.offset[0];
+	float msk_y = op->mask.offset[1];
+
+	do {
+		union {
+			struct sna_coordinate p;
+			float f;
+		} dst;
+
+		dst.p.x = box->x2;
+		dst.p.y = box->y2;
+		v[0] = dst.f;
+		v[2] = (msk_x + box->x2) * op->mask.scale[0];
+		v[7] = v[3] = (msk_y + box->y2) * op->mask.scale[1];
+
+		dst.p.x = box->x1;
+		v[4] = dst.f;
+		v[10] = v[6] = (msk_x + box->x1) * op->mask.scale[0];
+
+		dst.p.y = box->y1;
+		v[8] = dst.f;
+		v[11] = (msk_y + box->y1) * op->mask.scale[1];
+
+		v[1] = compute_linear(&op->src, box->x2, box->y2);
+		v[5] = compute_linear(&op->src, box->x1, box->y2);
+		v[9] = compute_linear(&op->src, box->x1, box->y1);
+
+		v += 12;
+		box++;
+	} while (--nbox);
+}
+
+fastcall static void
 emit_primitive_identity_source_mask(struct sna *sna,
 				    const struct sna_composite_op *op,
 				    const struct sna_composite_rectangles *r)
@@ -760,11 +990,13 @@ unsigned gen4_choose_composite_emitter(struct sna_composite_op *tmp)
 			if (tmp->src.is_solid) {
 				DBG(("%s: solid, identity mask\n", __FUNCTION__));
 				tmp->prim_emit = emit_primitive_identity_mask;
+				tmp->emit_boxes = emit_boxes_identity_mask;
 				tmp->floats_per_vertex = 4;
 				vb = 1 | 2 << 2;
 			} else if (tmp->src.is_linear) {
 				DBG(("%s: linear, identity mask\n", __FUNCTION__));
 				tmp->prim_emit = emit_primitive_linear_identity_mask;
+				tmp->emit_boxes = emit_boxes_linear_identity_mask;
 				tmp->floats_per_vertex = 4;
 				vb = 1 | 2 << 2;
 			} else if (tmp->src.transform == NULL) {
@@ -821,6 +1053,7 @@ unsigned gen4_choose_composite_emitter(struct sna_composite_op *tmp)
 		if (tmp->src.is_solid) {
 			DBG(("%s: solid, no mask\n", __FUNCTION__));
 			tmp->prim_emit = emit_primitive_solid;
+			tmp->emit_boxes = emit_boxes_solid;
 			if (tmp->src.is_opaque && tmp->op == PictOpOver)
 				tmp->op = PictOpSrc;
 			tmp->floats_per_vertex = 2;
@@ -828,11 +1061,13 @@ unsigned gen4_choose_composite_emitter(struct sna_composite_op *tmp)
 		} else if (tmp->src.is_linear) {
 			DBG(("%s: linear, no mask\n", __FUNCTION__));
 			tmp->prim_emit = emit_primitive_linear;
+			tmp->emit_boxes = emit_boxes_linear;
 			tmp->floats_per_vertex = 2;
 			vb = 1;
 		} else if (tmp->src.transform == NULL) {
 			DBG(("%s: identity src, no mask\n", __FUNCTION__));
 			tmp->prim_emit = emit_primitive_identity_source;
+			tmp->emit_boxes = emit_boxes_identity_source;
 			tmp->floats_per_vertex = 3;
 			vb = 2;
 		} else if (tmp->src.is_affine) {
@@ -841,9 +1076,11 @@ unsigned gen4_choose_composite_emitter(struct sna_composite_op *tmp)
 			if (!sna_affine_transform_is_rotation(tmp->src.transform)) {
 				DBG(("%s: simple src, no mask\n", __FUNCTION__));
 				tmp->prim_emit = emit_primitive_simple_source;
+				tmp->emit_boxes = emit_boxes_simple_source;
 			} else {
 				DBG(("%s: affine src, no mask\n", __FUNCTION__));
 				tmp->prim_emit = emit_primitive_affine_source;
+				tmp->emit_boxes = emit_boxes_affine_source;
 			}
 			tmp->floats_per_vertex = 3;
 			vb = 2;
diff --git a/src/sna/gen5_render.c b/src/sna/gen5_render.c
index 81e6635..998d55e 100644
--- a/src/sna/gen5_render.c
+++ b/src/sna/gen5_render.c
@@ -1095,9 +1095,9 @@ gen5_render_composite_box(struct sna *sna,
 }
 
 static void
-gen5_render_composite_boxes(struct sna *sna,
-			    const struct sna_composite_op *op,
-			    const BoxRec *box, int nbox)
+gen5_render_composite_boxes__blt(struct sna *sna,
+				 const struct sna_composite_op *op,
+				 const BoxRec *box, int nbox)
 {
 	DBG(("%s(%d) delta=(%d, %d), src=(%d, %d)/(%d, %d), mask=(%d, %d)/(%d, %d)\n",
 	     __FUNCTION__, nbox, op->dst.x, op->dst.y,
@@ -1131,6 +1131,62 @@ gen5_render_composite_boxes(struct sna *sna,
 	} while (nbox);
 }
 
+static void
+gen5_render_composite_boxes(struct sna *sna,
+			    const struct sna_composite_op *op,
+			    const BoxRec *box, int nbox)
+{
+	DBG(("%s: nbox=%d\n", __FUNCTION__, nbox));
+
+	do {
+		int nbox_this_time;
+		float *v;
+
+		nbox_this_time = gen5_get_rectangles(sna, op, nbox,
+						     gen5_bind_surfaces);
+		assert(nbox_this_time);
+		nbox -= nbox_this_time;
+
+		v = sna->render.vertices + sna->render.vertex_used;
+		sna->render.vertex_used += nbox_this_time * op->floats_per_rect;
+
+		op->emit_boxes(op, box, nbox_this_time, v);
+		box += nbox_this_time;
+	} while (nbox);
+}
+
+static void
+gen5_render_composite_boxes__thread(struct sna *sna,
+				    const struct sna_composite_op *op,
+				    const BoxRec *box, int nbox)
+{
+	DBG(("%s: nbox=%d\n", __FUNCTION__, nbox));
+
+	sna_vertex_lock(&sna->render);
+	do {
+		int nbox_this_time;
+		float *v;
+
+		nbox_this_time = gen5_get_rectangles(sna, op, nbox,
+						     gen5_bind_surfaces);
+		assert(nbox_this_time);
+		nbox -= nbox_this_time;
+
+		v = sna->render.vertices + sna->render.vertex_used;
+		sna->render.vertex_used += nbox_this_time * op->floats_per_rect;
+
+		sna_vertex_acquire__locked(&sna->render);
+		sna_vertex_unlock(&sna->render);
+
+		op->emit_boxes(op, box, nbox_this_time, v);
+		box += nbox_this_time;
+
+		sna_vertex_lock(&sna->render);
+		sna_vertex_release__locked(&sna->render);
+	} while (nbox);
+	sna_vertex_unlock(&sna->render);
+}
+
 #ifndef MAX
 #define MAX(a,b) ((a) > (b) ? (a) : (b))
 #endif
@@ -1874,7 +1930,11 @@ gen5_render_composite(struct sna *sna,
 
 	tmp->blt   = gen5_render_composite_blt;
 	tmp->box   = gen5_render_composite_box;
-	tmp->boxes = gen5_render_composite_boxes;
+	tmp->boxes = gen5_render_composite_boxes__blt;
+	if (tmp->emit_boxes) {
+		tmp->boxes = gen5_render_composite_boxes;
+		tmp->thread_boxes = gen5_render_composite_boxes__thread;
+	}
 	tmp->done  = gen5_render_composite_done;
 
 	if (!kgem_check_bo(&sna->kgem,
diff --git a/src/sna/gen6_render.c b/src/sna/gen6_render.c
index 4ff1606..4a9387a 100644
--- a/src/sna/gen6_render.c
+++ b/src/sna/gen6_render.c
@@ -1340,9 +1340,9 @@ gen6_render_composite_box(struct sna *sna,
 }
 
 static void
-gen6_render_composite_boxes(struct sna *sna,
-			    const struct sna_composite_op *op,
-			    const BoxRec *box, int nbox)
+gen6_render_composite_boxes__blt(struct sna *sna,
+				 const struct sna_composite_op *op,
+				 const BoxRec *box, int nbox)
 {
 	DBG(("composite_boxes(%d)\n", nbox));
 
@@ -1372,6 +1372,62 @@ gen6_render_composite_boxes(struct sna *sna,
 	} while (nbox);
 }
 
+static void
+gen6_render_composite_boxes(struct sna *sna,
+			    const struct sna_composite_op *op,
+			    const BoxRec *box, int nbox)
+{
+	DBG(("%s: nbox=%d\n", __FUNCTION__, nbox));
+
+	do {
+		int nbox_this_time;
+		float *v;
+
+		nbox_this_time = gen6_get_rectangles(sna, op, nbox,
+						     gen6_emit_composite_state);
+		assert(nbox_this_time);
+		nbox -= nbox_this_time;
+
+		v = sna->render.vertices + sna->render.vertex_used;
+		sna->render.vertex_used += nbox_this_time * op->floats_per_rect;
+
+		op->emit_boxes(op, box, nbox_this_time, v);
+		box += nbox_this_time;
+	} while (nbox);
+}
+
+static void
+gen6_render_composite_boxes__thread(struct sna *sna,
+				    const struct sna_composite_op *op,
+				    const BoxRec *box, int nbox)
+{
+	DBG(("%s: nbox=%d\n", __FUNCTION__, nbox));
+
+	sna_vertex_lock(&sna->render);
+	do {
+		int nbox_this_time;
+		float *v;
+
+		nbox_this_time = gen6_get_rectangles(sna, op, nbox,
+						     gen6_emit_composite_state);
+		assert(nbox_this_time);
+		nbox -= nbox_this_time;
+
+		v = sna->render.vertices + sna->render.vertex_used;
+		sna->render.vertex_used += nbox_this_time * op->floats_per_rect;
+
+		sna_vertex_acquire__locked(&sna->render);
+		sna_vertex_unlock(&sna->render);
+
+		op->emit_boxes(op, box, nbox_this_time, v);
+		box += nbox_this_time;
+
+		sna_vertex_lock(&sna->render);
+		sna_vertex_release__locked(&sna->render);
+	} while (nbox);
+	sna_vertex_unlock(&sna->render);
+}
+
 #ifndef MAX
 #define MAX(a,b) ((a) > (b) ? (a) : (b))
 #endif
@@ -2214,7 +2270,11 @@ gen6_render_composite(struct sna *sna,
 
 	tmp->blt   = gen6_render_composite_blt;
 	tmp->box   = gen6_render_composite_box;
-	tmp->boxes = gen6_render_composite_boxes;
+	tmp->boxes = gen6_render_composite_boxes__blt;
+	if (tmp->emit_boxes) {
+		tmp->boxes = gen6_render_composite_boxes;
+		tmp->thread_boxes = gen6_render_composite_boxes__thread;
+	}
 	tmp->done  = gen6_render_composite_done;
 
 	kgem_set_mode(&sna->kgem, KGEM_RENDER, tmp->dst.bo);
diff --git a/src/sna/gen7_render.c b/src/sna/gen7_render.c
index 34ba252..6eec4b4 100644
--- a/src/sna/gen7_render.c
+++ b/src/sna/gen7_render.c
@@ -1463,9 +1463,9 @@ gen7_render_composite_box(struct sna *sna,
 }
 
 static void
-gen7_render_composite_boxes(struct sna *sna,
-			    const struct sna_composite_op *op,
-			    const BoxRec *box, int nbox)
+gen7_render_composite_boxes__blt(struct sna *sna,
+				 const struct sna_composite_op *op,
+				 const BoxRec *box, int nbox)
 {
 	DBG(("composite_boxes(%d)\n", nbox));
 
@@ -1495,6 +1495,62 @@ gen7_render_composite_boxes(struct sna *sna,
 	} while (nbox);
 }
 
+static void
+gen7_render_composite_boxes(struct sna *sna,
+			    const struct sna_composite_op *op,
+			    const BoxRec *box, int nbox)
+{
+	DBG(("%s: nbox=%d\n", __FUNCTION__, nbox));
+
+	do {
+		int nbox_this_time;
+		float *v;
+
+		nbox_this_time = gen7_get_rectangles(sna, op, nbox,
+						     gen7_emit_composite_state);
+		assert(nbox_this_time);
+		nbox -= nbox_this_time;
+
+		v = sna->render.vertices + sna->render.vertex_used;
+		sna->render.vertex_used += nbox_this_time * op->floats_per_rect;
+
+		op->emit_boxes(op, box, nbox_this_time, v);
+		box += nbox_this_time;
+	} while (nbox);
+}
+
+static void
+gen7_render_composite_boxes__thread(struct sna *sna,
+				    const struct sna_composite_op *op,
+				    const BoxRec *box, int nbox)
+{
+	DBG(("%s: nbox=%d\n", __FUNCTION__, nbox));
+
+	sna_vertex_lock(&sna->render);
+	do {
+		int nbox_this_time;
+		float *v;
+
+		nbox_this_time = gen7_get_rectangles(sna, op, nbox,
+						     gen7_emit_composite_state);
+		assert(nbox_this_time);
+		nbox -= nbox_this_time;
+
+		v = sna->render.vertices + sna->render.vertex_used;
+		sna->render.vertex_used += nbox_this_time * op->floats_per_rect;
+
+		sna_vertex_acquire__locked(&sna->render);
+		sna_vertex_unlock(&sna->render);
+
+		op->emit_boxes(op, box, nbox_this_time, v);
+		box += nbox_this_time;
+
+		sna_vertex_lock(&sna->render);
+		sna_vertex_release__locked(&sna->render);
+	} while (nbox);
+	sna_vertex_unlock(&sna->render);
+}
+
 #ifndef MAX
 #define MAX(a,b) ((a) > (b) ? (a) : (b))
 #endif
@@ -2334,7 +2390,11 @@ gen7_render_composite(struct sna *sna,
 
 	tmp->blt   = gen7_render_composite_blt;
 	tmp->box   = gen7_render_composite_box;
-	tmp->boxes = gen7_render_composite_boxes;
+	tmp->boxes = gen7_render_composite_boxes__blt;
+	if (tmp->emit_boxes) {
+		tmp->boxes = gen7_render_composite_boxes;
+		tmp->thread_boxes = gen7_render_composite_boxes__thread;
+	}
 	tmp->done  = gen7_render_composite_done;
 
 	kgem_set_mode(&sna->kgem, KGEM_RENDER, tmp->dst.bo);
diff --git a/src/sna/sna_blt.c b/src/sna/sna_blt.c
index 5602579..edfcb9e 100644
--- a/src/sna/sna_blt.c
+++ b/src/sna/sna_blt.c
@@ -934,6 +934,76 @@ static void blt_composite_fill_boxes_no_offset(struct sna *sna,
 	_sna_blt_fill_boxes(sna, &op->u.blt, box, n);
 }
 
+static void blt_composite_fill_boxes_no_offset__thread(struct sna *sna,
+						       const struct sna_composite_op *op,
+						       const BoxRec *box, int nbox)
+{
+	struct kgem *kgem = &sna->kgem;
+	const struct sna_blt_state *blt = &op->u.blt;
+	uint32_t cmd = blt->cmd;
+
+	DBG(("%s: %08x x %d\n", __FUNCTION__, blt->pixel, nbox));
+
+	sna_vertex_lock(&sna->render);
+	if (!kgem_check_batch(kgem, 3)) {
+		sna_vertex_wait__locked(&sna->render);
+		sna_blt_fill_begin(sna, blt);
+	}
+
+	do {
+		uint32_t *b = kgem->batch + kgem->nbatch;
+		int nbox_this_time;
+
+		nbox_this_time = nbox;
+		if (3*nbox_this_time > kgem->surface - kgem->nbatch - KGEM_BATCH_RESERVED)
+			nbox_this_time = (kgem->surface - kgem->nbatch - KGEM_BATCH_RESERVED) / 3;
+		assert(nbox_this_time);
+		nbox -= nbox_this_time;
+
+		kgem->nbatch += 3 * nbox_this_time;
+		assert(kgem->nbatch < kgem->surface);
+		sna_vertex_acquire__locked(&sna->render);
+		sna_vertex_unlock(&sna->render);
+
+		while (nbox_this_time >= 8) {
+			b[0] = cmd; *(uint64_t *)(b+1) = *(const uint64_t *)box++;
+			b[3] = cmd; *(uint64_t *)(b+4) = *(const uint64_t *)box++;
+			b[6] = cmd; *(uint64_t *)(b+7) = *(const uint64_t *)box++;
+			b[9] = cmd; *(uint64_t *)(b+10) = *(const uint64_t *)box++;
+			b[12] = cmd; *(uint64_t *)(b+13) = *(const uint64_t *)box++;
+			b[15] = cmd; *(uint64_t *)(b+16) = *(const uint64_t *)box++;
+			b[18] = cmd; *(uint64_t *)(b+19) = *(const uint64_t *)box++;
+			b[21] = cmd; *(uint64_t *)(b+22) = *(const uint64_t *)box++;
+			b += 24;
+			nbox_this_time -= 8;
+		}
+		if (nbox_this_time & 4) {
+			b[0] = cmd; *(uint64_t *)(b+1) = *(const uint64_t *)box++;
+			b[3] = cmd; *(uint64_t *)(b+4) = *(const uint64_t *)box++;
+			b[6] = cmd; *(uint64_t *)(b+7) = *(const uint64_t *)box++;
+			b[9] = cmd; *(uint64_t *)(b+10) = *(const uint64_t *)box++;
+			b += 12;
+		}
+		if (nbox_this_time & 2) {
+			b[0] = cmd; *(uint64_t *)(b+1) = *(const uint64_t *)box++;
+			b[3] = cmd; *(uint64_t *)(b+4) = *(const uint64_t *)box++;
+			b += 6;
+		}
+		if (nbox_this_time & 1) {
+			b[0] = cmd; *(uint64_t *)(b+1) = *(const uint64_t *)box++;
+		}
+
+		sna_vertex_lock(&sna->render);
+		sna_vertex_release__locked(&sna->render);
+		if (!nbox)
+			break;
+
+		sna_vertex_wait__locked(&sna->render);
+		sna_blt_fill_begin(sna, blt);
+	} while (1);
+	sna_vertex_unlock(&sna->render);
+}
+
 fastcall static void blt_composite_fill_box(struct sna *sna,
 					    const struct sna_composite_op *op,
 					    const BoxRec *box)
@@ -957,6 +1027,92 @@ static void blt_composite_fill_boxes(struct sna *sna,
 	} while (--n);
 }
 
+static inline uint64_t add4(const BoxRec *b, int16_t x, int16_t y)
+{
+	union {
+		uint64_t v;
+		int16_t i[4];
+	} vi;
+	vi.v = *(uint64_t *)b;
+	vi.i[0] += x;
+	vi.i[1] += y;
+	vi.i[2] += x;
+	vi.i[3] += y;
+	return vi.v;
+}
+
+static void blt_composite_fill_boxes__thread(struct sna *sna,
+					     const struct sna_composite_op *op,
+					     const BoxRec *box, int nbox)
+{
+	struct kgem *kgem = &sna->kgem;
+	const struct sna_blt_state *blt = &op->u.blt;
+	uint32_t cmd = blt->cmd;
+	int16_t dx = op->dst.x;
+	int16_t dy = op->dst.y;
+
+	DBG(("%s: %08x x %d\n", __FUNCTION__, blt->pixel, nbox));
+
+	sna_vertex_lock(&sna->render);
+	if (!kgem_check_batch(kgem, 3)) {
+		sna_vertex_wait__locked(&sna->render);
+		sna_blt_fill_begin(sna, blt);
+	}
+
+	do {
+		uint32_t *b = kgem->batch + kgem->nbatch;
+		int nbox_this_time;
+
+		nbox_this_time = nbox;
+		if (3*nbox_this_time > kgem->surface - kgem->nbatch - KGEM_BATCH_RESERVED)
+			nbox_this_time = (kgem->surface - kgem->nbatch - KGEM_BATCH_RESERVED) / 3;
+		assert(nbox_this_time);
+		nbox -= nbox_this_time;
+
+		kgem->nbatch += 3 * nbox_this_time;
+		assert(kgem->nbatch < kgem->surface);
+		sna_vertex_acquire__locked(&sna->render);
+		sna_vertex_unlock(&sna->render);
+
+		while (nbox_this_time >= 8) {
+			b[0] = cmd; *(uint64_t *)(b+1) = add4(box++, dx, dy);
+			b[3] = cmd; *(uint64_t *)(b+4) = add4(box++, dx, dy);
+			b[6] = cmd; *(uint64_t *)(b+7) = add4(box++, dx, dy);
+			b[9] = cmd; *(uint64_t *)(b+10) = add4(box++, dx, dy);
+			b[12] = cmd; *(uint64_t *)(b+13) = add4(box++, dx, dy);
+			b[15] = cmd; *(uint64_t *)(b+16) = add4(box++, dx, dy);
+			b[18] = cmd; *(uint64_t *)(b+19) = add4(box++, dx, dy);
+			b[21] = cmd; *(uint64_t *)(b+22) = add4(box++, dx, dy);
+			b += 24;
+			nbox_this_time -= 8;
+		}
+		if (nbox_this_time & 4) {
+			b[0] = cmd; *(uint64_t *)(b+1) = add4(box++, dx, dy);
+			b[3] = cmd; *(uint64_t *)(b+4) = add4(box++, dx, dy);
+			b[6] = cmd; *(uint64_t *)(b+7) = add4(box++, dx, dy);
+			b[9] = cmd; *(uint64_t *)(b+10) = add4(box++, dx, dy);
+			b += 12;
+		}
+		if (nbox_this_time & 2) {
+			b[0] = cmd; *(uint64_t *)(b+1) = add4(box++, dx, dy);
+			b[3] = cmd; *(uint64_t *)(b+4) = add4(box++, dx, dy);
+			b += 6;
+		}
+		if (nbox_this_time & 1) {
+			b[0] = cmd; *(uint64_t *)(b+1) = add4(box++, dx, dy);
+		}
+
+		sna_vertex_lock(&sna->render);
+		sna_vertex_release__locked(&sna->render);
+		if (!nbox)
+			break;
+
+		sna_vertex_wait__locked(&sna->render);
+		sna_blt_fill_begin(sna, blt);
+	} while (1);
+	sna_vertex_unlock(&sna->render);
+}
+
 fastcall
 static void blt_composite_nop(struct sna *sna,
 			       const struct sna_composite_op *op,
@@ -1014,6 +1170,7 @@ prepare_blt_clear(struct sna *sna,
 		op->blt   = blt_composite_fill__cpu;
 		op->box   = blt_composite_fill_box__cpu;
 		op->boxes = blt_composite_fill_boxes__cpu;
+		op->thread_boxes = blt_composite_fill_boxes__cpu;
 		op->done  = nop_done;
 		op->u.blt.pixel = 0;
 		return true;
@@ -1023,9 +1180,11 @@ prepare_blt_clear(struct sna *sna,
 	if (op->dst.x|op->dst.y) {
 		op->box   = blt_composite_fill_box;
 		op->boxes = blt_composite_fill_boxes;
+		op->thread_boxes = blt_composite_fill_boxes__thread;
 	} else {
 		op->box   = blt_composite_fill_box_no_offset;
 		op->boxes = blt_composite_fill_boxes_no_offset;
+		op->thread_boxes = blt_composite_fill_boxes_no_offset__thread;
 	}
 	op->done = nop_done;
 
@@ -1050,6 +1209,7 @@ prepare_blt_fill(struct sna *sna,
 		op->blt = blt_composite_fill__cpu;
 		op->box   = blt_composite_fill_box__cpu;
 		op->boxes = blt_composite_fill_boxes__cpu;
+		op->thread_boxes = blt_composite_fill_boxes__cpu;
 		op->done = nop_done;
 		return true;
 	}
@@ -1058,9 +1218,11 @@ prepare_blt_fill(struct sna *sna,
 	if (op->dst.x|op->dst.y) {
 		op->box   = blt_composite_fill_box;
 		op->boxes = blt_composite_fill_boxes;
+		op->thread_boxes = blt_composite_fill_boxes__thread;
 	} else {
 		op->box   = blt_composite_fill_box_no_offset;
 		op->boxes = blt_composite_fill_boxes_no_offset;
+		op->thread_boxes = blt_composite_fill_boxes_no_offset__thread;
 	}
 	op->done = nop_done;
 
@@ -1151,6 +1313,141 @@ static void blt_composite_copy_boxes(struct sna *sna,
 	} while(--nbox);
 }
 
+static inline uint32_t add2(uint32_t v, int16_t x, int16_t y)
+{
+	x += v & 0xffff;
+	y += v >> 16;
+	return (uint16_t)y << 16 | x;
+}
+
+static void blt_composite_copy_boxes__thread(struct sna *sna,
+					     const struct sna_composite_op *op,
+					     const BoxRec *box, int nbox)
+{
+	struct kgem *kgem = &sna->kgem;
+	int dst_dx = op->dst.x;
+	int dst_dy = op->dst.y;
+	int src_dx = op->src.offset[0];
+	int src_dy = op->src.offset[1];
+	uint32_t cmd = op->u.blt.cmd;
+	uint32_t br13 = op->u.blt.br13;
+	struct kgem_bo *src_bo = op->u.blt.bo[0];
+	struct kgem_bo *dst_bo = op->u.blt.bo[1];
+	int src_pitch = op->u.blt.pitch[0];
+
+	DBG(("%s: nbox=%d\n", __FUNCTION__, nbox));
+
+	sna_vertex_lock(&sna->render);
+
+	if ((dst_dx | dst_dy) == 0) {
+		uint64_t hdr = (uint64_t)br13 << 32 | cmd;
+		do {
+			int nbox_this_time;
+
+			nbox_this_time = nbox;
+			if (8*nbox_this_time > kgem->surface - kgem->nbatch - KGEM_BATCH_RESERVED)
+				nbox_this_time = (kgem->surface - kgem->nbatch - KGEM_BATCH_RESERVED) / 8;
+			if (2*nbox_this_time > KGEM_RELOC_SIZE(kgem) - kgem->nreloc)
+				nbox_this_time = (KGEM_RELOC_SIZE(kgem) - kgem->nreloc)/2;
+			assert(nbox_this_time);
+			nbox -= nbox_this_time;
+
+			do {
+				uint32_t *b = kgem->batch + kgem->nbatch;
+
+				DBG(("  %s: box=(%d, %d)x(%d, %d)\n",
+				     __FUNCTION__,
+				     box->x1, box->y1,
+				     box->x2 - box->x1, box->y2 - box->y1));
+
+				assert(box->x1 + src_dx >= 0);
+				assert(box->y1 + src_dy >= 0);
+				assert(box->x1 + src_dx <= INT16_MAX);
+				assert(box->y1 + src_dy <= INT16_MAX);
+
+				assert(box->x1 >= 0);
+				assert(box->y1 >= 0);
+
+				*(uint64_t *)&b[0] = hdr;
+				*(uint64_t *)&b[2] = *(const uint64_t *)box;
+				b[4] = kgem_add_reloc(kgem, kgem->nbatch + 4, dst_bo,
+						      I915_GEM_DOMAIN_RENDER << 16 |
+						      I915_GEM_DOMAIN_RENDER |
+						      KGEM_RELOC_FENCED,
+						      0);
+				b[5] = add2(b[2], src_dx, src_dy);
+				b[6] = src_pitch;
+				b[7] = kgem_add_reloc(kgem, kgem->nbatch + 7, src_bo,
+						      I915_GEM_DOMAIN_RENDER << 16 |
+						      KGEM_RELOC_FENCED,
+						      0);
+				kgem->nbatch += 8;
+				assert(kgem->nbatch < kgem->surface);
+				box++;
+			} while (--nbox_this_time);
+
+			if (!nbox)
+				break;
+
+			_kgem_submit(kgem);
+			_kgem_set_mode(kgem, KGEM_BLT);
+		} while (1);
+	} else {
+		do {
+			int nbox_this_time;
+
+			nbox_this_time = nbox;
+			if (8*nbox_this_time > kgem->surface - kgem->nbatch - KGEM_BATCH_RESERVED)
+				nbox_this_time = (kgem->surface - kgem->nbatch - KGEM_BATCH_RESERVED) / 8;
+			if (2*nbox_this_time > KGEM_RELOC_SIZE(kgem) - kgem->nreloc)
+				nbox_this_time = (KGEM_RELOC_SIZE(kgem) - kgem->nreloc)/2;
+			assert(nbox_this_time);
+			nbox -= nbox_this_time;
+
+			do {
+				uint32_t *b = kgem->batch + kgem->nbatch;
+
+				DBG(("  %s: box=(%d, %d)x(%d, %d)\n",
+				     __FUNCTION__,
+				     box->x1, box->y1,
+				     box->x2 - box->x1, box->y2 - box->y1));
+
+				assert(box->x1 + src_dx >= 0);
+				assert(box->y1 + src_dy >= 0);
+
+				assert(box->x1 + dst_dx >= 0);
+				assert(box->y1 + dst_dy >= 0);
+
+				b[0] = cmd;
+				b[1] = br13;
+				b[2] = ((box->y1 + dst_dy) << 16) | (box->x1 + dst_dx);
+				b[3] = ((box->y2 + dst_dy) << 16) | (box->x2 + dst_dx);
+				b[4] = kgem_add_reloc(kgem, kgem->nbatch + 4, dst_bo,
+						      I915_GEM_DOMAIN_RENDER << 16 |
+						      I915_GEM_DOMAIN_RENDER |
+						      KGEM_RELOC_FENCED,
+						      0);
+				b[5] = ((box->y1 + src_dy) << 16) | (box->x1 + src_dx);
+				b[6] = src_pitch;
+				b[7] = kgem_add_reloc(kgem, kgem->nbatch + 7, src_bo,
+						      I915_GEM_DOMAIN_RENDER << 16 |
+						      KGEM_RELOC_FENCED,
+						      0);
+				kgem->nbatch += 8;
+				assert(kgem->nbatch < kgem->surface);
+				box++;
+			} while (--nbox_this_time);
+
+			if (!nbox)
+				break;
+
+			_kgem_submit(kgem);
+			_kgem_set_mode(kgem, KGEM_BLT);
+		} while (1);
+	}
+	sna_vertex_unlock(&sna->render);
+}
+
 fastcall static void
 blt_composite_copy_with_alpha(struct sna *sna,
 			      const struct sna_composite_op *op,
@@ -1277,6 +1574,7 @@ prepare_blt_copy(struct sna *sna,
 		op->blt   = blt_composite_copy;
 		op->box   = blt_composite_copy_box;
 		op->boxes = blt_composite_copy_boxes;
+		op->thread_boxes = blt_composite_copy_boxes__thread;
 
 		if (!sna_blt_copy_init(sna, &op->u.blt, bo, op->dst.bo,
 				       src->drawable.bitsPerPixel,
@@ -2075,6 +2373,7 @@ sna_blt_composite__convert(struct sna *sna,
 		tmp->blt   = blt_composite_copy;
 		tmp->box   = blt_composite_copy_box;
 		tmp->boxes = blt_composite_copy_boxes;
+		tmp->thread_boxes = blt_composite_copy_boxes__thread;
 
 		if (!sna_blt_copy_init(sna, &tmp->u.blt,
 				       tmp->src.bo, tmp->dst.bo,
@@ -2446,13 +2745,6 @@ bool sna_blt_fill_boxes(struct sna *sna, uint8_t alu,
 	return true;
 }
 
-static inline uint32_t add2(uint32_t v, int16_t x, int16_t y)
-{
-	x += v & 0xffff;
-	y += v >> 16;
-	return (uint16_t)y << 16 | x;
-}
-
 bool sna_blt_copy_boxes(struct sna *sna, uint8_t alu,
 			struct kgem_bo *src_bo, int16_t src_dx, int16_t src_dy,
 			struct kgem_bo *dst_bo, int16_t dst_dx, int16_t dst_dy,
diff --git a/src/sna/sna_render.h b/src/sna/sna_render.h
index 6a0b1d8..c953e50 100644
--- a/src/sna/sna_render.h
+++ b/src/sna/sna_render.h
@@ -35,6 +35,8 @@ struct sna_composite_op {
 			     const BoxRec *box);
 	void (*boxes)(struct sna *sna, const struct sna_composite_op *op,
 		      const BoxRec *box, int nbox);
+	void (*thread_boxes)(struct sna *sna, const struct sna_composite_op *op,
+			     const BoxRec *box, int nbox);
 	void (*done)(struct sna *sna, const struct sna_composite_op *op);
 
 	struct sna_damage **damage;
@@ -93,6 +95,9 @@ struct sna_composite_op {
 	fastcall void (*prim_emit)(struct sna *sna,
 				   const struct sna_composite_op *op,
 				   const struct sna_composite_rectangles *r);
+	fastcall void (*emit_boxes)(const struct sna_composite_op *op,
+				    const BoxRec *box, int nbox,
+				    float *v);
 
 	struct sna_composite_redirect {
 		struct kgem_bo *real_bo;
diff --git a/src/sna/sna_trapezoids.c b/src/sna/sna_trapezoids.c
index bf4816b..7f7492a 100644
--- a/src/sna/sna_trapezoids.c
+++ b/src/sna/sna_trapezoids.c
@@ -1876,7 +1876,7 @@ static void
 mono_add_line(struct mono *mono,
 	      int dst_x, int dst_y,
 	      xFixed top, xFixed bottom,
-	      xPointFixed *p1, xPointFixed *p2,
+	      const xPointFixed *p1, const xPointFixed *p2,
 	      int dir)
 {
 	struct mono_polygon *polygon = &mono->polygon;
@@ -1893,7 +1893,7 @@ mono_add_line(struct mono *mono,
 	       dir));
 
 	if (top > bottom) {
-		xPointFixed *t;
+		const xPointFixed *t;
 
 		y = top;
 		top = bottom;
@@ -2150,6 +2150,60 @@ mono_span__fast(struct mono *c, int x1, int x2, BoxPtr box)
 	c->op.box(c->sna, &c->op, box);
 }
 
+struct mono_span_thread_boxes {
+	const struct sna_composite_op *op;
+#define MONO_SPAN_MAX_BOXES (8192/sizeof(BoxRec))
+	BoxRec boxes[MONO_SPAN_MAX_BOXES];
+	int num_boxes;
+};
+
+inline static void
+thread_mono_span_add_boxes(struct mono *c, const BoxRec *box, int count)
+{
+	struct mono_span_thread_boxes *b = c->op.priv;
+
+	assert(count > 0 && count <= MONO_SPAN_MAX_BOXES);
+	if (b->num_boxes + count > MONO_SPAN_MAX_BOXES) {
+		b->op->thread_boxes(c->sna, b->op, b->boxes, b->num_boxes);
+		b->num_boxes = 0;
+	}
+
+	memcpy(b->boxes + b->num_boxes, box, count*sizeof(BoxRec));
+	b->num_boxes += count;
+	assert(b->num_boxes <= MONO_SPAN_MAX_BOXES);
+}
+
+fastcall static void
+thread_mono_span_clipped(struct mono *c, int x1, int x2, BoxPtr box)
+{
+	pixman_region16_t region;
+
+	__DBG(("%s [%d, %d]\n", __FUNCTION__, x1, x2));
+
+	box->x1 = x1;
+	box->x2 = x2;
+
+	assert(c->clip.data);
+
+	pixman_region_init_rects(&region, box, 1);
+	RegionIntersect(&region, &region, &c->clip);
+	if (REGION_NUM_RECTS(&region))
+		thread_mono_span_add_boxes(c,
+					   REGION_RECTS(&region),
+					   REGION_NUM_RECTS(&region));
+	pixman_region_fini(&region);
+}
+
+fastcall static void
+thread_mono_span(struct mono *c, int x1, int x2, BoxPtr box)
+{
+	__DBG(("%s [%d, %d]\n", __FUNCTION__, x1, x2));
+
+	box->x1 = x1;
+	box->x2 = x2;
+	thread_mono_span_add_boxes(c, box, 1);
+}
+
 inline static void
 mono_row(struct mono *c, int16_t y, int16_t h)
 {
@@ -2267,10 +2321,7 @@ mono_render(struct mono *mono)
 	struct mono_polygon *polygon = &mono->polygon;
 	int i, j, h = mono->clip.extents.y2 - mono->clip.extents.y1;
 
-	if (mono->clip.data == NULL && mono->op.damage == NULL)
-		mono->span = mono_span__fast;
-	else
-		mono->span = mono_span;
+	assert(mono->span);
 
 	for (i = 0; i < h; i = j) {
 		j = i + 1;
@@ -4053,6 +4104,74 @@ choose_span(struct sna_composite_spans_op *tmp,
 	return span;
 }
 
+struct mono_span_thread {
+	struct sna *sna;
+	const xTrapezoid *traps;
+	const struct sna_composite_op *op;
+	RegionPtr clip;
+	int ntrap;
+	BoxRec extents;
+	int dx, dy;
+};
+
+static void
+mono_span_thread(void *arg)
+{
+	struct mono_span_thread *thread = arg;
+	struct mono mono;
+	struct mono_span_thread_boxes boxes;
+	const xTrapezoid *t;
+	int n;
+
+	mono.sna = thread->sna;
+
+	mono.clip.extents = thread->extents;
+	mono.clip.data = NULL;
+	if (thread->clip->data) {
+		RegionIntersect(&mono.clip, &mono.clip, thread->clip);
+		if (RegionNil(&mono.clip))
+			return;
+	}
+
+	boxes.op = thread->op;
+	boxes.num_boxes = 0;
+	mono.op.priv = &boxes;
+
+	if (!mono_init(&mono, 2*thread->ntrap)) {
+		RegionUninit(&mono.clip);
+		return;
+	}
+
+	for (n = thread->ntrap, t = thread->traps; n--; t++) {
+		if (!xTrapezoidValid(t))
+			continue;
+
+		if (pixman_fixed_to_int(t->top) + thread->dy >= thread->extents.y2 ||
+		    pixman_fixed_to_int(t->bottom) + thread->dy <= thread->extents.y1)
+			continue;
+
+		mono_add_line(&mono, thread->dx, thread->dy,
+			      t->top, t->bottom,
+			      &t->left.p1, &t->left.p2, 1);
+		mono_add_line(&mono, thread->dx, thread->dy,
+			      t->top, t->bottom,
+			      &t->right.p1, &t->right.p2, -1);
+	}
+
+	if (mono.clip.data == NULL)
+		mono.span = thread_mono_span;
+	else
+		mono.span = thread_mono_span_clipped;
+
+	mono_render(&mono);
+	mono_fini(&mono);
+
+	if (boxes.num_boxes)
+		thread->op->thread_boxes(thread->sna, thread->op,
+					 boxes.boxes, boxes.num_boxes);
+	RegionUninit(&mono.clip);
+}
+
 static bool
 mono_trapezoids_span_converter(CARD8 op, PicturePtr src, PicturePtr dst,
 			       INT16 src_x, INT16 src_y,
@@ -4062,8 +4181,8 @@ mono_trapezoids_span_converter(CARD8 op, PicturePtr src, PicturePtr dst,
 	BoxRec extents;
 	int16_t dst_x, dst_y;
 	int16_t dx, dy;
-	bool was_clear;
-	int n;
+	bool unbounded;
+	int num_threads, n;
 
 	if (NO_SCAN_CONVERTER)
 		return false;
@@ -4102,11 +4221,69 @@ mono_trapezoids_span_converter(CARD8 op, PicturePtr src, PicturePtr dst,
 	     src_x + mono.clip.extents.x1 - dst_x - dx,
 	     src_y + mono.clip.extents.y1 - dst_y - dy));
 
+	unbounded = (!sna_drawable_is_clear(dst->pDrawable) &&
+		     !operator_is_bounded(op));
+
 	mono.sna = to_sna_from_drawable(dst->pDrawable);
-	if (!mono_init(&mono, 2*ntrap))
+	if (!mono.sna->render.composite(mono.sna, op, src, NULL, dst,
+				       src_x + mono.clip.extents.x1 - dst_x - dx,
+				       src_y + mono.clip.extents.y1 - dst_y - dy,
+				       0, 0,
+				       mono.clip.extents.x1,  mono.clip.extents.y1,
+				       mono.clip.extents.x2 - mono.clip.extents.x1,
+				       mono.clip.extents.y2 - mono.clip.extents.y1,
+				       memset(&mono.op, 0, sizeof(mono.op))))
 		return false;
 
-	was_clear = sna_drawable_is_clear(dst->pDrawable);
+	num_threads = 1;
+	if (!NO_GPU_THREADS &&
+	    mono.op.thread_boxes &&
+	    mono.op.damage == NULL &&
+	    !unbounded)
+		num_threads = sna_use_threads(mono.clip.extents.x2 - mono.clip.extents.x1,
+					      mono.clip.extents.y2 - mono.clip.extents.y1,
+					      16);
+	if (num_threads > 1) {
+		struct mono_span_thread threads[num_threads];
+		int y, h;
+
+		DBG(("%s: using %d threads for mono span compositing %dx%d\n",
+		     __FUNCTION__, num_threads,
+		     mono.clip.extents.x2 - mono.clip.extents.x1,
+		     mono.clip.extents.y2 - mono.clip.extents.y1));
+
+		threads[0].sna = mono.sna;
+		threads[0].op = &mono.op;
+		threads[0].traps = traps;
+		threads[0].ntrap = ntrap;
+		threads[0].extents = mono.clip.extents;
+		threads[0].clip = &mono.clip;
+		threads[0].dx = dx;
+		threads[0].dy = dy;
+
+		y = extents.y1;
+		h = extents.y2 - extents.y1;
+		h = (h + num_threads - 1) / num_threads;
+
+		for (n = 1; n < num_threads; n++) {
+			threads[n] = threads[0];
+			threads[n].extents.y1 = y;
+			threads[n].extents.y2 = y += h;
+
+			sna_threads_run(mono_span_thread, &threads[n]);
+		}
+
+		threads[0].extents.y1 = y;
+		threads[0].extents.y2 = extents.y2;
+		mono_span_thread(&threads[0]);
+
+		sna_threads_wait();
+		mono.op.done(mono.sna, &mono.op);
+		return true;
+	}
+
+	if (!mono_init(&mono, 2*ntrap))
+		return false;
 
 	for (n = 0; n < ntrap; n++) {
 		if (!xTrapezoidValid(&traps[n]))
@@ -4124,23 +4301,16 @@ mono_trapezoids_span_converter(CARD8 op, PicturePtr src, PicturePtr dst,
 			      &traps[n].right.p1, &traps[n].right.p2, -1);
 	}
 
-	memset(&mono.op, 0, sizeof(mono.op));
-	if (!mono.sna->render.composite(mono.sna, op, src, NULL, dst,
-				       src_x + mono.clip.extents.x1 - dst_x - dx,
-				       src_y + mono.clip.extents.y1 - dst_y - dy,
-				       0, 0,
-				       mono.clip.extents.x1,  mono.clip.extents.y1,
-				       mono.clip.extents.x2 - mono.clip.extents.x1,
-				       mono.clip.extents.y2 - mono.clip.extents.y1,
-				       &mono.op)) {
-		mono_fini(&mono);
-		return false;
-	}
+	if (mono.clip.data == NULL && mono.op.damage == NULL)
+		mono.span = mono_span__fast;
+	else
+		mono.span = mono_span;
+
 	mono_render(&mono);
 	mono.op.done(mono.sna, &mono.op);
 	mono_fini(&mono);
 
-	if (!was_clear && !operator_is_bounded(op)) {
+	if (unbounded) {
 		xPointFixed p1, p2;
 
 		if (!mono_init(&mono, 2+2*ntrap))
@@ -5245,6 +5415,11 @@ unbounded_pass:
 		mono.op.box = mono_inplace_composite_box;
 		mono.op.boxes = mono_inplace_composite_boxes;
 	}
+
+	if (mono.clip.data == NULL && mono.op.damage == NULL)
+		mono.span = mono_span__fast;
+	else
+		mono.span = mono_span;
 	mono_render(&mono);
 	mono_fini(&mono);
 
@@ -6850,6 +7025,10 @@ mono_triangles_span_converter(CARD8 op, PicturePtr src, PicturePtr dst,
 				       mono.clip.extents.x2 - mono.clip.extents.x1,
 				       mono.clip.extents.y2 - mono.clip.extents.y1,
 				       &mono.op)) {
+		if (mono.clip.data == NULL && mono.op.damage == NULL)
+			mono.span = mono_span__fast;
+		else
+			mono.span = mono_span;
 		mono_render(&mono);
 		mono.op.done(mono.sna, &mono.op);
 	}
@@ -6893,6 +7072,10 @@ mono_triangles_span_converter(CARD8 op, PicturePtr src, PicturePtr dst,
 					       mono.clip.extents.x2 - mono.clip.extents.x1,
 					       mono.clip.extents.y2 - mono.clip.extents.y1,
 					       &mono.op)) {
+			if (mono.clip.data == NULL && mono.op.damage == NULL)
+				mono.span = mono_span__fast;
+			else
+				mono.span = mono_span;
 			mono_render(&mono);
 			mono.op.done(mono.sna, &mono.op);
 		}
commit 8178cff5718e69e14d3953a7f754d7585a06838f
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sat Jan 26 14:41:04 2013 +0000

    sna: Begin sketching out a threaded rasteriser for spans
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/configure.ac b/configure.ac
index cb1496b..46affdc 100644
--- a/configure.ac
+++ b/configure.ac
@@ -104,6 +104,40 @@ if test x$ASM != "xno"; then
 fi
 AM_CONDITIONAL(HAVE_GEN4ASM, test x$gen4asm = xyes)
 
+# Check for atomic intrinsics
+AC_CACHE_CHECK([for native atomic primitives], intel_cv_atomic_primitives,
+[
+    intel_cv_atomic_primitives="none"
+
+    AC_LINK_IFELSE([AC_LANG_PROGRAM([[
+int atomic_add(int i) { return __sync_fetch_and_add (&i, 1); }
+int atomic_cmpxchg(int i, int j, int k) { return __sync_val_compare_and_swap (&i, j, k); }
+				    ]],[[]])],
+		   [intel_cv_atomic_primitives="Intel"],[])
+
+    if test "x$intel_cv_atomic_primitives" = "xnone"; then
+	    AC_CHECK_HEADER([atomic_ops.h], intel_cv_atomic_primitives="libatomic-ops")
+    fi
+
+    # atomic functions defined in <atomic.h> & libc on Solaris
+    if test "x$intel_cv_atomic_primitives" = "xnone"; then
+	    AC_CHECK_FUNC([atomic_cas_uint],
+			  intel_cv_atomic_primitives="Solaris")
+    fi
+
+])
+if test "x$intel_cv_atomic_primitives" = xIntel; then
+    AC_DEFINE(HAVE_ATOMIC_PRIMITIVES, 1,
+	      [Enable if your compiler supports the Intel __sync_* atomic primitives])
+fi
+if test "x$intel_cv_atomic_primitives" = "xlibatomic-ops"; then
+    AC_DEFINE(HAVE_LIB_ATOMIC_OPS, 1, [Enable if you have libatomic-ops-dev installed])
+fi
+
+if test "x$intel_cv_atomic_primitives" = "xnone"; then
+		AC_MSG_ERROR([xf86-video-intel depends upon atomic operations, which were not found for your compiler/cpu. Try compiling with -march=native, or install the libatomic-ops-dev package.])
+fi
+
 AC_ARG_ENABLE(udev,
               AS_HELP_STRING([--disable-udev],
                              [Disable udev-based monitor hotplug detection [default=auto]]),
diff --git a/src/sna/Makefile.am b/src/sna/Makefile.am
index bfa836f..c74c904 100644
--- a/src/sna/Makefile.am
+++ b/src/sna/Makefile.am
@@ -38,6 +38,7 @@ libsna_la_LDFLAGS = -pthread
 libsna_la_LIBADD = @UDEV_LIBS@ -lm @DRM_LIBS@ brw/libbrw.la fb/libfb.la
 
 libsna_la_SOURCES = \
+	atomic.h \
 	blt.c \
 	compiler.h \
 	kgem.c \
@@ -64,6 +65,7 @@ libsna_la_SOURCES = \
 	sna_tiling.c \
 	sna_transform.c \
 	sna_threads.c \
+	sna_vertex.c \
 	sna_video.c \
 	sna_video.h \
 	sna_video_overlay.c \
diff --git a/src/sna/atomic.h b/src/sna/atomic.h
new file mode 100644
index 0000000..306dc6d
--- /dev/null
+++ b/src/sna/atomic.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright © 2009 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Chris Wilson <chris at chris-wilson.co.uk>
+ *
+ */
+
+#ifndef ATOMIC_H
+#define ATOMIC_H
+
+#if HAVE_ATOMIC_PRIMITIVES
+
+#define HAS_ATOMIC_OPS 1
+
+typedef struct {
+	int atomic;
+} atomic_t;
+
+# define atomic_read(x) ((x)->atomic)
+# define atomic_set(x, val) ((x)->atomic = (val))
+# define atomic_inc(x) ((void) __sync_fetch_and_add (&(x)->atomic, 1))
+# define atomic_dec_and_test(x) (__sync_fetch_and_add (&(x)->atomic, -1) == 1)
+# define atomic_add(x, v) ((void) __sync_add_and_fetch(&(x)->atomic, (v)))
+# define atomic_dec(x, v) ((void) __sync_sub_and_fetch(&(x)->atomic, (v)))
+# define atomic_cmpxchg(x, oldv, newv) __sync_val_compare_and_swap (&(x)->atomic, oldv, newv)
+
+#endif
+
+#if HAVE_LIB_ATOMIC_OPS
+#include <atomic_ops.h>
+
+#define HAS_ATOMIC_OPS 1
+
+typedef struct {
+	AO_t atomic;
+} atomic_t;
+
+# define atomic_read(x) AO_load_full(&(x)->atomic)
+# define atomic_set(x, val) AO_store_full(&(x)->atomic, (val))
+# define atomic_inc(x) ((void) AO_fetch_and_add1_full(&(x)->atomic))
+# define atomic_add(x, v) ((void) AO_fetch_and_add_full(&(x)->atomic, (v)))
+# define atomic_dec(x, v) ((void) AO_fetch_and_add_full(&(x)->atomic, -(v)))
+# define atomic_dec_and_test(x) (AO_fetch_and_sub1_full(&(x)->atomic) == 1)
+# define atomic_cmpxchg(x, oldv, newv) AO_compare_and_swap_full(&(x)->atomic, oldv, newv)
+
+#endif
+
+#if defined(__sun) && !defined(HAS_ATOMIC_OPS)  /* Solaris & OpenSolaris */
+
+#include <sys/atomic.h>
+#define HAS_ATOMIC_OPS 1
+
+typedef struct { uint_t atomic; } atomic_t;
+
+# define atomic_read(x) (int) ((x)->atomic)
+# define atomic_set(x, val) ((x)->atomic = (uint_t)(val))
+# define atomic_inc(x) (atomic_inc_uint (&(x)->atomic))
+# define atomic_dec_and_test(x) (atomic_dec_uint_nv(&(x)->atomic) == 1)
+# define atomic_add(x, v) (atomic_add_int(&(x)->atomic, (v)))
+# define atomic_dec(x, v) (atomic_add_int(&(x)->atomic, -(v)))
+# define atomic_cmpxchg(x, oldv, newv) atomic_cas_uint (&(x)->atomic, oldv, newv)
+
+#endif
+
+#if ! HAS_ATOMIC_OPS
+#error xf86-video-intel requires atomic operations, please define them for your CPU/compiler.
+#endif
+
+#endif
diff --git a/src/sna/compiler.h b/src/sna/compiler.h
index ff80365..b985f2b 100644
--- a/src/sna/compiler.h
+++ b/src/sna/compiler.h
@@ -36,6 +36,7 @@
 #define fastcall __attribute__((regparm(3)))
 #define must_check __attribute__((warn_unused_result))
 #define constant __attribute__((const))
+#define __packed__ __attribute__((__packed__))
 #else
 #define likely(expr) (expr)
 #define unlikely(expr) (expr)
@@ -44,6 +45,7 @@
 #define fastcall
 #define must_check
 #define constant
+#define __packed__
 #endif
 
 #ifdef HAVE_VALGRIND
diff --git a/src/sna/gen3_render.c b/src/sna/gen3_render.c
index 01c0aee..3224d71 100644
--- a/src/sna/gen3_render.c
+++ b/src/sna/gen3_render.c
@@ -1618,6 +1618,8 @@ static int gen3_vertex_finish(struct sna *sna)
 	assert(sna->render.vertex_used);
 	assert(sna->render.vertex_used <= sna->render.vertex_size);
 
+	sna_vertex_wait__locked(&sna->render);
+
 	bo = sna->render.vbo;
 	if (bo) {
 		DBG(("%s: reloc = %d\n", __FUNCTION__,
@@ -1796,6 +1798,17 @@ static int gen3_get_rectangles__flush(struct sna *sna,
 		}
 	}
 
+	/* Preventing discarding new vbo after lock contention */
+	if (sna->render.active) {
+		int rem;
+
+		sna_vertex_wait__locked(&sna->render);
+
+		rem = vertex_space(sna);
+		if (rem > op->floats_per_rect)
+			return rem;
+	}
+
 	return gen3_vertex_finish(sna);
 }
 
@@ -1838,6 +1851,7 @@ flush:
 		gen3_vertex_flush(sna);
 		gen3_magic_ca_pass(sna, op);
 	}
+	gen3_vertex_finish(sna);
 	_kgem_submit(&sna->kgem);
 	gen3_emit_composite_state(sna, op);
 	assert(sna->render.vertex_offset == 0);
@@ -3081,6 +3095,26 @@ gen3_emit_composite_spans_primitive_zero(struct sna *sna,
 }
 
 fastcall static void
+gen3_emit_composite_spans_primitive_zero__boxes(const struct sna_composite_spans_op *op,
+						const struct sna_opacity_box *b,
+						int nbox, float *v)
+{
+	do {
+		v[0] = op->base.dst.x + b->box.x2;
+		v[1] = op->base.dst.y + b->box.y2;
+
+		v[2] = op->base.dst.x + b->box.x1;
+		v[3] = v[1];
+
+		v[4] = v[2];
+		v[5] = op->base.dst.y + b->box.y1;
+
+		v += 6;
+		b++;
+	} while (--nbox);
+}
+
+fastcall static void
 gen3_emit_composite_spans_primitive_zero_no_offset(struct sna *sna,
 						   const struct sna_composite_spans_op *op,
 						   const BoxRec *box,
@@ -3096,6 +3130,22 @@ gen3_emit_composite_spans_primitive_zero_no_offset(struct sna *sna,
 }
 
 fastcall static void
+gen3_emit_composite_spans_primitive_zero_no_offset__boxes(const struct sna_composite_spans_op *op,
+							  const struct sna_opacity_box *b,
+							  int nbox, float *v)
+{
+	do {
+		v[0] = b->box.x2;
+		v[3] = v[1] = b->box.y2;
+		v[4] = v[2] = b->box.x1;
+		v[5] = b->box.y1;
+
+		b++;
+		v += 6;
+	} while (--nbox);
+}
+
+fastcall static void
 gen3_emit_composite_spans_primitive_constant(struct sna *sna,
 					     const struct sna_composite_spans_op *op,
 					     const BoxRec *box,
@@ -3112,6 +3162,24 @@ gen3_emit_composite_spans_primitive_constant(struct sna *sna,
 }
 
 fastcall static void
+gen3_emit_composite_spans_primitive_constant__boxes(const struct sna_composite_spans_op *op,
+						    const struct sna_opacity_box *b,
+						    int nbox,
+						    float *v)
+{
+	do {
+		v[0] = op->base.dst.x + b->box.x2;
+		v[6] = v[3] = op->base.dst.x + b->box.x1;
+		v[4] = v[1] = op->base.dst.y + b->box.y2;
+		v[7] = op->base.dst.y + b->box.y1;
+		v[8] = v[5] = v[2] = b->alpha;
+
+		v += 9;
+		b++;
+	} while (--nbox);
+}
+
+fastcall static void
 gen3_emit_composite_spans_primitive_constant_no_offset(struct sna *sna,
 						       const struct sna_composite_spans_op *op,
 						       const BoxRec *box,
@@ -3128,6 +3196,23 @@ gen3_emit_composite_spans_primitive_constant_no_offset(struct sna *sna,
 }
 
 fastcall static void
+gen3_emit_composite_spans_primitive_constant_no_offset__boxes(const struct sna_composite_spans_op *op,
+							      const struct sna_opacity_box *b,
+							      int nbox, float *v)
+{
+	do {
+		v[0] = b->box.x2;
+		v[6] = v[3] = b->box.x1;
+		v[4] = v[1] = b->box.y2;
+		v[7] = b->box.y1;
+		v[8] = v[5] = v[2] = b->alpha;
+
+		v += 9;
+		b++;
+	} while (--nbox);
+}
+
+fastcall static void
 gen3_emit_composite_spans_primitive_identity_source(struct sna *sna,
 						    const struct sna_composite_spans_op *op,
 						    const BoxRec *box,
@@ -3156,6 +3241,36 @@ gen3_emit_composite_spans_primitive_identity_source(struct sna *sna,
 }
 
 fastcall static void
+gen3_emit_composite_spans_primitive_identity_source__boxes(const struct sna_composite_spans_op *op,
+							   const struct sna_opacity_box *b,
+							   int nbox,
+							   float *v)
+{
+	do {
+		v[0] = op->base.dst.x + b->box.x2;
+		v[1] = op->base.dst.y + b->box.y2;
+		v[2] = (op->base.src.offset[0] + b->box.x2) * op->base.src.scale[0];
+		v[3] = (op->base.src.offset[1] + b->box.y2) * op->base.src.scale[1];
+		v[4] = b->alpha;
+
+		v[5] = op->base.dst.x + b->box.x1;
+		v[6] = v[1];
+		v[7] = (op->base.src.offset[0] + b->box.x1) * op->base.src.scale[0];
+		v[8] = v[3];
+		v[9] = b->alpha;
+
+		v[10] = v[5];
+		v[11] = op->base.dst.y + b->box.y1;
+		v[12] = v[7];
+		v[13] = (op->base.src.offset[1] + b->box.y1) * op->base.src.scale[1];
+		v[14] = b->alpha;
+
+		v += 15;
+		b++;
+	} while (--nbox);
+}
+
+fastcall static void
 gen3_emit_composite_spans_primitive_affine_source(struct sna *sna,
 						  const struct sna_composite_spans_op *op,
 						  const BoxRec *box,
@@ -3190,6 +3305,40 @@ gen3_emit_composite_spans_primitive_affine_source(struct sna *sna,
 }
 
 fastcall static void
+gen3_emit_composite_spans_primitive_affine_source__boxes(const struct sna_composite_spans_op *op,
+							 const struct sna_opacity_box *b,
+							 int nbox,
+							 float *v)
+{
+	PictTransform *transform = op->base.src.transform;
+
+	do {
+		v[0]  = op->base.dst.x + b->box.x2;
+		v[6]  = v[1] = op->base.dst.y + b->box.y2;
+		v[10] = v[5] = op->base.dst.x + b->box.x1;
+		v[11] = op->base.dst.y + b->box.y1;
+		v[14] = v[9] = v[4]  = b->alpha;
+
+		_sna_get_transformed_scaled((int)op->base.src.offset[0] + b->box.x2,
+					    (int)op->base.src.offset[1] + b->box.y2,
+					    transform, op->base.src.scale,
+					    &v[2], &v[3]);
+
+		_sna_get_transformed_scaled((int)op->base.src.offset[0] + b->box.x1,
+					    (int)op->base.src.offset[1] + b->box.y2,
+					    transform, op->base.src.scale,
+					    &v[7], &v[8]);
+
+		_sna_get_transformed_scaled((int)op->base.src.offset[0] + b->box.x1,
+					    (int)op->base.src.offset[1] + b->box.y1,
+					    transform, op->base.src.scale,
+					    &v[12], &v[13]);
+		v += 15;
+		b++;
+	} while (--nbox);
+}
+
+fastcall static void
 gen3_emit_composite_spans_primitive_identity_gradient(struct sna *sna,
 						      const struct sna_composite_spans_op *op,
 						      const BoxRec *box,
@@ -3218,6 +3367,36 @@ gen3_emit_composite_spans_primitive_identity_gradient(struct sna *sna,
 }
 
 fastcall static void
+gen3_emit_composite_spans_primitive_identity_gradient__boxes(const struct sna_composite_spans_op *op,
+							     const struct sna_opacity_box *b,
+							     int nbox,
+							     float *v)
+{
+	do {
+		v[0] = op->base.dst.x + b->box.x2;
+		v[1] = op->base.dst.y + b->box.y2;
+		v[2] = op->base.src.offset[0] + b->box.x2;
+		v[3] = op->base.src.offset[1] + b->box.y2;
+		v[4] = b->alpha;
+
+		v[5] = op->base.dst.x + b->box.x1;
+		v[6] = v[1];
+		v[7] = op->base.src.offset[0] + b->box.x1;
+		v[8] = v[3];
+		v[9] = b->alpha;
+
+		v[10] = v[5];
+		v[11] = op->base.dst.y + b->box.y1;
+		v[12] = v[7];
+		v[13] = op->base.src.offset[1] + b->box.y1;
+		v[14] = b->alpha;
+
+		v += 15;
+		b++;
+	} while (--nbox);
+}
+
+fastcall static void
 gen3_emit_composite_spans_primitive_affine_gradient(struct sna *sna,
 						    const struct sna_composite_spans_op *op,
 						    const BoxRec *box,
@@ -3253,6 +3432,43 @@ gen3_emit_composite_spans_primitive_affine_gradient(struct sna *sna,
 }
 
 fastcall static void
+gen3_emit_composite_spans_primitive_affine_gradient__boxes(const struct sna_composite_spans_op *op,
+							   const struct sna_opacity_box *b,
+							   int nbox,
+							   float *v)
+{
+	PictTransform *transform = op->base.src.transform;
+
+	do {
+		v[0] = op->base.dst.x + b->box.x2;
+		v[1] = op->base.dst.y + b->box.y2;
+		_sna_get_transformed_coordinates((int)op->base.src.offset[0] + b->box.x2,
+						 (int)op->base.src.offset[1] + b->box.y2,
+						 transform,
+						 &v[2], &v[3]);
+		v[4] = b->alpha;
+
+		v[5] = op->base.dst.x + b->box.x1;
+		v[6] = v[1];
+		_sna_get_transformed_coordinates((int)op->base.src.offset[0] + b->box.x1,
+						 (int)op->base.src.offset[1] + b->box.y2,
+						 transform,
+						 &v[7], &v[8]);
+		v[9] = b->alpha;
+
+		v[10] = v[5];
+		v[11] = op->base.dst.y + b->box.y1;
+		_sna_get_transformed_coordinates((int)op->base.src.offset[0] + b->box.x1,
+						 (int)op->base.src.offset[1] + b->box.y1,
+						 transform,
+						 &v[12], &v[13]);
+		v[14] = b->alpha;
+		v += 15;
+		b++;
+	} while (--nbox);
+}
+
+fastcall static void
 gen3_emit_composite_spans_primitive(struct sna *sna,
 				    const struct sna_composite_spans_op *op,
 				    const BoxRec *box,
@@ -3297,6 +3513,48 @@ gen3_render_composite_spans_constant_box(struct sna *sna,
 }
 
 fastcall static void
+gen3_render_composite_spans_constant_thread_boxes(struct sna *sna,
+						  const struct sna_composite_spans_op *op,
+						  const struct sna_opacity_box *box,
+						  int nbox)
+{
+	DBG(("%s: nbox=%d, src=+(%d, %d), dst=+(%d, %d)\n",
+	     __FUNCTION__, nbox,
+	     op->base.src.offset[0], op->base.src.offset[1],
+	     op->base.dst.x, op->base.dst.y));
+
+	sna_vertex_lock(&sna->render);
+	do {
+		int nbox_this_time;
+		float *v;
+
+		nbox_this_time = gen3_get_rectangles(sna, &op->base, nbox);
+		assert(nbox_this_time);
+		nbox -= nbox_this_time;
+
+		v = sna->render.vertices + sna->render.vertex_used;
+		sna->render.vertex_used += nbox_this_time * 9;
+
+		sna_vertex_acquire__locked(&sna->render);
+		sna_vertex_unlock(&sna->render);
+
+		do {
+			v[0] = box->box.x2;
+			v[6] = v[3] = box->box.x1;
+			v[4] = v[1] = box->box.y2;
+			v[7] = box->box.y1;
+			v[8] = v[5] = v[2] = box->alpha;
+			v += 9;
+			box++;
+		} while (--nbox_this_time);
+
+		sna_vertex_lock(&sna->render);
+		sna_vertex_release__locked(&sna->render);
+	} while (nbox);
+	sna_vertex_unlock(&sna->render);
+}
+
+fastcall static void
 gen3_render_composite_spans_box(struct sna *sna,
 				const struct sna_composite_spans_op *op,
 				const BoxRec *box, float opacity)
@@ -3344,6 +3602,41 @@ gen3_render_composite_spans_boxes(struct sna *sna,
 }
 
 fastcall static void
+gen3_render_composite_spans_boxes__thread(struct sna *sna,
+					  const struct sna_composite_spans_op *op,
+					  const struct sna_opacity_box *box,
+					  int nbox)
+{
+	DBG(("%s: nbox=%d, src=+(%d, %d), dst=+(%d, %d)\n",
+	     __FUNCTION__, nbox,
+	     op->base.src.offset[0], op->base.src.offset[1],
+	     op->base.dst.x, op->base.dst.y));
+
+	sna_vertex_lock(&sna->render);
+	do {
+		int nbox_this_time;
+		float *v;
+
+		nbox_this_time = gen3_get_rectangles(sna, &op->base, nbox);
+		assert(nbox_this_time);
+		nbox -= nbox_this_time;
+
+		v = sna->render.vertices + sna->render.vertex_used;
+		sna->render.vertex_used += nbox_this_time * op->base.floats_per_rect;
+
+		sna_vertex_acquire__locked(&sna->render);
+		sna_vertex_unlock(&sna->render);
+
+		op->emit_boxes(op, box, nbox_this_time, v);
+		box += nbox_this_time;
+
+		sna_vertex_lock(&sna->render);
+		sna_vertex_release__locked(&sna->render);
+	} while (nbox);
+	sna_vertex_unlock(&sna->render);
+}
+
+fastcall static void
 gen3_render_composite_spans_done(struct sna *sna,
 				 const struct sna_composite_spans_op *op)
 {
@@ -3447,40 +3740,58 @@ gen3_render_composite_spans(struct sna *sna,
 	no_offset = tmp->base.dst.x == 0 && tmp->base.dst.y == 0;
 	tmp->box   = gen3_render_composite_spans_box;
 	tmp->boxes = gen3_render_composite_spans_boxes;
+	tmp->thread_boxes = gen3_render_composite_spans_boxes__thread;
 	tmp->done  = gen3_render_composite_spans_done;
 	tmp->prim_emit = gen3_emit_composite_spans_primitive;
 	switch (tmp->base.src.u.gen3.type) {
 	case SHADER_NONE:
 		assert(0);
 	case SHADER_ZERO:
-		tmp->prim_emit = no_offset ? gen3_emit_composite_spans_primitive_zero_no_offset : gen3_emit_composite_spans_primitive_zero;
+		if (no_offset) {
+			tmp->prim_emit = gen3_emit_composite_spans_primitive_zero_no_offset;
+			tmp->emit_boxes = gen3_emit_composite_spans_primitive_zero_no_offset__boxes;
+		} else {
+			tmp->prim_emit = gen3_emit_composite_spans_primitive_zero;
+			tmp->emit_boxes = gen3_emit_composite_spans_primitive_zero__boxes;
+		}
 		break;
 	case SHADER_BLACK:
 	case SHADER_WHITE:
 	case SHADER_CONSTANT:
 		if (no_offset) {
 			tmp->box = gen3_render_composite_spans_constant_box;
+			tmp->thread_boxes = gen3_render_composite_spans_constant_thread_boxes;
 			tmp->prim_emit = gen3_emit_composite_spans_primitive_constant_no_offset;
-		} else
+			tmp->emit_boxes = gen3_emit_composite_spans_primitive_constant_no_offset__boxes;
+		} else {
 			tmp->prim_emit = gen3_emit_composite_spans_primitive_constant;
+			tmp->emit_boxes = gen3_emit_composite_spans_primitive_constant__boxes;
+		}
 		break;
 	case SHADER_LINEAR:
 	case SHADER_RADIAL:
-		if (tmp->base.src.transform == NULL)
+		if (tmp->base.src.transform == NULL) {
 			tmp->prim_emit = gen3_emit_composite_spans_primitive_identity_gradient;
-		else if (tmp->base.src.is_affine)
+			tmp->emit_boxes = gen3_emit_composite_spans_primitive_identity_gradient__boxes;
+		} else if (tmp->base.src.is_affine) {
 			tmp->prim_emit = gen3_emit_composite_spans_primitive_affine_gradient;
+			tmp->emit_boxes = gen3_emit_composite_spans_primitive_affine_gradient__boxes;
+		}
 		break;
 	case SHADER_TEXTURE:
-		if (tmp->base.src.transform == NULL)
+		if (tmp->base.src.transform == NULL) {
 			tmp->prim_emit = gen3_emit_composite_spans_primitive_identity_source;
-		else if (tmp->base.src.is_affine) {
+			tmp->emit_boxes = gen3_emit_composite_spans_primitive_identity_source__boxes;
+		} else if (tmp->base.src.is_affine) {
 			tmp->base.src.scale[0] /= tmp->base.src.transform->matrix[2][2];
 			tmp->base.src.scale[1] /= tmp->base.src.transform->matrix[2][2];
 			tmp->prim_emit = gen3_emit_composite_spans_primitive_affine_source;
+			tmp->emit_boxes = gen3_emit_composite_spans_primitive_affine_source__boxes;
 		}
 		break;
 	}
+	if (tmp->emit_boxes == NULL)
+		tmp->thread_boxes = NULL;
 
 	tmp->base.mask.bo = NULL;
 
diff --git a/src/sna/gen4_render.c b/src/sna/gen4_render.c
index d2f3fff..65016cd 100644
--- a/src/sna/gen4_render.c
+++ b/src/sna/gen4_render.c
@@ -621,6 +621,17 @@ static int gen4_get_rectangles__flush(struct sna *sna,
 						     op->u.gen4.wm_kernel);
 	}
 
+	/* Prevent discarding the new vbo after lock contention */
+	if (sna->render.active) {
+		int rem;
+
+		sna_vertex_wait__locked(&sna->render);
+
+		rem = vertex_space(sna);
+		if (rem > op->floats_per_rect)
+			return rem;
+	}
+
 	return gen4_vertex_finish(sna);
 }
 
@@ -656,6 +667,7 @@ flush:
 		gen4_vertex_flush(sna);
 		gen4_magic_ca_pass(sna, op);
 	}
+	gen4_vertex_finish(sna);
 	_kgem_submit(&sna->kgem);
 	emit_state(sna, op);
 	goto start;
@@ -1966,6 +1978,42 @@ gen4_render_composite_spans_boxes(struct sna *sna,
 }
 
 fastcall static void
+gen4_render_composite_spans_boxes__thread(struct sna *sna,
+					  const struct sna_composite_spans_op *op,
+					  const struct sna_opacity_box *box,
+					  int nbox)
+{
+	DBG(("%s: nbox=%d, src=+(%d, %d), dst=+(%d, %d)\n",
+	     __FUNCTION__, nbox,
+	     op->base.src.offset[0], op->base.src.offset[1],
+	     op->base.dst.x, op->base.dst.y));
+
+	sna_vertex_lock(&sna->render);
+	do {
+		int nbox_this_time;
+		float *v;
+
+		nbox_this_time = gen4_get_rectangles(sna, &op->base, nbox,
+						     gen4_bind_surfaces);
+		assert(nbox_this_time);
+		nbox -= nbox_this_time;
+
+		v = sna->render.vertices + sna->render.vertex_used;
+		sna->render.vertex_used += nbox_this_time * op->base.floats_per_rect;
+
+		sna_vertex_acquire__locked(&sna->render);
+		sna_vertex_unlock(&sna->render);
+
+		op->emit_boxes(op, box, nbox_this_time, v);
+		box += nbox_this_time;
+
+		sna_vertex_lock(&sna->render);
+		sna_vertex_release__locked(&sna->render);
+	} while (nbox);
+	sna_vertex_unlock(&sna->render);
+}
+
+fastcall static void
 gen4_render_composite_spans_done(struct sna *sna,
 				 const struct sna_composite_spans_op *op)
 {
@@ -2080,6 +2128,8 @@ gen4_render_composite_spans(struct sna *sna,
 
 	tmp->box   = gen4_render_composite_spans_box;
 	tmp->boxes = gen4_render_composite_spans_boxes;
+	if (tmp->emit_boxes)
+		tmp->thread_boxes = gen4_render_composite_spans_boxes__thread;
 	tmp->done  = gen4_render_composite_spans_done;
 
 	if (!kgem_check_bo(&sna->kgem,
diff --git a/src/sna/gen4_vertex.c b/src/sna/gen4_vertex.c
index 4e40467..cc679d3 100644
--- a/src/sna/gen4_vertex.c
+++ b/src/sna/gen4_vertex.c
@@ -36,12 +36,13 @@
 
 void gen4_vertex_flush(struct sna *sna)
 {
-	assert(sna->render.vertex_offset);
-	assert(sna->render.vertex_index > sna->render.vertex_start);
-
 	DBG(("%s[%x] = %d\n", __FUNCTION__,
 	     4*sna->render.vertex_offset,
 	     sna->render.vertex_index - sna->render.vertex_start));
+
+	assert(sna->render.vertex_offset);
+	assert(sna->render.vertex_index > sna->render.vertex_start);
+
 	sna->kgem.batch[sna->render.vertex_offset] =
 		sna->render.vertex_index - sna->render.vertex_start;
 	sna->render.vertex_offset = 0;
@@ -58,6 +59,8 @@ int gen4_vertex_finish(struct sna *sna)
 	assert(sna->render.vertex_offset == 0);
 	assert(sna->render.vertex_used);
 
+	sna_vertex_wait__locked(&sna->render);
+
 	/* Note: we only need dword alignment (currently) */
 
 	bo = sna->render.vbo;
@@ -73,6 +76,7 @@ int gen4_vertex_finish(struct sna *sna)
 					       0);
 		}
 
+		assert(!sna->render.active);
 		sna->render.nvertex_reloc = 0;
 		sna->render.vertex_used = 0;
 		sna->render.vertex_index = 0;
@@ -87,6 +91,7 @@ int gen4_vertex_finish(struct sna *sna)
 		hint |= CREATE_CACHED | CREATE_NO_THROTTLE;
 
 	size = 256*1024;
+	assert(!sna->render.active);
 	sna->render.vertices = NULL;
 	sna->render.vbo = kgem_create_linear(&sna->kgem, size, hint);
 	while (sna->render.vbo == NULL && size > 16*1024) {
@@ -144,6 +149,8 @@ void gen4_vertex_close(struct sna *sna)
 	     __FUNCTION__, sna->render.vertex_used, sna->render.vbo ? sna->render.vbo->handle : 0,
 	     sna->render.vb_id, sna->render.nvertex_reloc));
 
+	assert(!sna->render.active);
+
 	bo = sna->render.vbo;
 	if (bo) {
 		if (sna->render.vertex_size - sna->render.vertex_used < 64) {
@@ -205,6 +212,7 @@ void gen4_vertex_close(struct sna *sna)
 	sna->render.vb_id = 0;
 
 	if (sna->render.vbo == NULL) {
+		assert(!sna->render.active);
 		sna->render.vertex_used = 0;
 		sna->render.vertex_index = 0;
 		assert(sna->render.vertices == sna->render.vertex_data);
@@ -853,7 +861,7 @@ unsigned gen4_choose_composite_emitter(struct sna_composite_op *tmp)
 }
 
 inline static void
-emit_spans_vertex(struct sna *sna,
+emit_span_vertex(struct sna *sna,
 		  const struct sna_composite_spans_op *op,
 		  int16_t x, int16_t y)
 {
@@ -867,18 +875,18 @@ emit_composite_spans_primitive(struct sna *sna,
 			       const BoxRec *box,
 			       float opacity)
 {
-	emit_spans_vertex(sna, op, box->x2, box->y2);
+	emit_span_vertex(sna, op, box->x2, box->y2);
 	OUT_VERTEX_F(opacity);
 
-	emit_spans_vertex(sna, op, box->x1, box->y2);
+	emit_span_vertex(sna, op, box->x1, box->y2);
 	OUT_VERTEX_F(opacity);
 
-	emit_spans_vertex(sna, op, box->x1, box->y1);
+	emit_span_vertex(sna, op, box->x1, box->y1);
 	OUT_VERTEX_F(opacity);
 }
 
 fastcall static void
-emit_spans_solid(struct sna *sna,
+emit_span_solid(struct sna *sna,
 		 const struct sna_composite_spans_op *op,
 		 const BoxRec *box,
 		 float opacity)
@@ -909,7 +917,36 @@ emit_spans_solid(struct sna *sna,
 }
 
 fastcall static void
-emit_spans_identity(struct sna *sna,
+emit_span_boxes_solid(const struct sna_composite_spans_op *op,
+		      const struct sna_opacity_box *b,
+		      int nbox, float *v)
+{
+	do {
+		union {
+			struct sna_coordinate p;
+			float f;
+		} dst;
+
+		dst.p.x = b->box.x2;
+		dst.p.y = b->box.y2;
+		v[0] = dst.f;
+
+		dst.p.x = b->box.x1;
+		v[3] = dst.f;
+
+		dst.p.y = b->box.y1;
+		v[6] = dst.f;
+
+		v[7] = v[4] = v[1] = .5;
+		v[8] = v[5] = v[2] = b->alpha;
+
+		v += 9;
+		b++;
+	} while (--nbox);
+}
+
+fastcall static void
+emit_span_identity(struct sna *sna,
 		    const struct sna_composite_spans_op *op,
 		    const BoxRec *box,
 		    float opacity)
@@ -949,7 +986,43 @@ emit_spans_identity(struct sna *sna,
 }
 
 fastcall static void
-emit_spans_simple(struct sna *sna,
+emit_span_boxes_identity(const struct sna_composite_spans_op *op,
+			 const struct sna_opacity_box *b, int nbox,
+			 float *v)
+{
+	do {
+		union {
+			struct sna_coordinate p;
+			float f;
+		} dst;
+
+		float sx = op->base.src.scale[0];
+		float sy = op->base.src.scale[1];
+		int16_t tx = op->base.src.offset[0];
+		int16_t ty = op->base.src.offset[1];
+
+		dst.p.x = b->box.x2;
+		dst.p.y = b->box.y2;
+		v[0] = dst.f;
+		v[1] = (b->box.x2 + tx) * sx;
+		v[6] = v[2] = (b->box.y2 + ty) * sy;
+
+		dst.p.x = b->box.x1;
+		v[4] = dst.f;
+		v[9] = v[5] = (b->box.x1 + tx) * sx;
+
+		dst.p.y = b->box.y1;
+		v[8] = dst.f;
+		v[10] = (b->box.y1 + ty) * sy;
+
+		v[11] = v[7] = v[3] = b->alpha;
+		v += 12;
+		b++;
+	} while (--nbox);
+}
+
+fastcall static void
+emit_span_simple(struct sna *sna,
 		  const struct sna_composite_spans_op *op,
 		  const BoxRec *box,
 		  float opacity)
@@ -993,7 +1066,47 @@ emit_spans_simple(struct sna *sna,
 }
 
 fastcall static void
-emit_spans_affine(struct sna *sna,
+emit_span_boxes_simple(const struct sna_composite_spans_op *op,
+		       const struct sna_opacity_box *b, int nbox,
+		       float *v)
+{
+	do {
+		union {
+			struct sna_coordinate p;
+			float f;
+		} dst;
+
+		float xx = op->base.src.transform->matrix[0][0];
+		float x0 = op->base.src.transform->matrix[0][2];
+		float yy = op->base.src.transform->matrix[1][1];
+		float y0 = op->base.src.transform->matrix[1][2];
+		float sx = op->base.src.scale[0];
+		float sy = op->base.src.scale[1];
+		int16_t tx = op->base.src.offset[0];
+		int16_t ty = op->base.src.offset[1];
+
+		dst.p.x = b->box.x2;
+		dst.p.y = b->box.y2;
+		v[0] = dst.f;
+		v[1] = ((b->box.x2 + tx) * xx + x0) * sx;
+		v[6] = v[2] = ((b->box.y2 + ty) * yy + y0) * sy;
+
+		dst.p.x = b->box.x1;
+		v[4] = dst.f;
+		v[9] = v[5] = ((b->box.x1 + tx) * xx + x0) * sx;
+
+		dst.p.y = b->box.y1;
+		v[8] = dst.f;
+		v[10] = ((b->box.y1 + ty) * yy + y0) * sy;
+
+		v += 12; /* 4 floats/vertex, 3 vertices/box (was 9: overwrote prior box) */
+		b++;
+	} while (--nbox);
+}
+
+fastcall static void
+emit_span_affine(struct sna *sna,
 		  const struct sna_composite_spans_op *op,
 		  const BoxRec *box,
 		  float opacity)
@@ -1038,7 +1151,50 @@ emit_spans_affine(struct sna *sna,
 }
 
 fastcall static void
-emit_spans_linear(struct sna *sna,
+emit_span_boxes_affine(const struct sna_composite_spans_op *op,
+		       const struct sna_opacity_box *b, int nbox,
+		       float *v)
+{
+	do {
+		union {
+			struct sna_coordinate p;
+			float f;
+		} dst;
+
+		dst.p.x = b->box.x2;
+		dst.p.y = b->box.y2;
+		v[0] = dst.f;
+		_sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x2,
+					    op->base.src.offset[1] + b->box.y2,
+					    op->base.src.transform,
+					    op->base.src.scale,
+					    &v[1], &v[2]);
+
+		dst.p.x = b->box.x1;
+		v[4] = dst.f;
+		_sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x1,
+					    op->base.src.offset[1] + b->box.y2,
+					    op->base.src.transform,
+					    op->base.src.scale,
+					    &v[5], &v[6]);
+
+		dst.p.y = b->box.y1;
+		v[8] = dst.f;
+		_sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x1,
+					    op->base.src.offset[1] + b->box.y1,
+					    op->base.src.transform,
+					    op->base.src.scale,
+					    &v[9], &v[10]);
+
+		v[11] = v[7] = v[3] = b->alpha;
+
+		v += 12;
+		b++;
+	} while (--nbox);
+}
+
+fastcall static void
+emit_span_linear(struct sna *sna,
 		  const struct sna_composite_spans_op *op,
 		  const BoxRec *box,
 		  float opacity)
@@ -1069,6 +1225,35 @@ emit_spans_linear(struct sna *sna,
 	v[8] = v[5] = v[2] = opacity;
 }
 
+fastcall static void
+emit_span_boxes_linear(const struct sna_composite_spans_op *op,
+		       const struct sna_opacity_box *b, int nbox,
+		       float *v)
+{
+	do {
+		union {
+			struct sna_coordinate p;
+			float f;
+		} dst;
+
+		dst.p.x = b->box.x2;
+		dst.p.y = b->box.y2;
+		v[0] = dst.f;
+		dst.p.x = b->box.x1;
+		v[3] = dst.f;
+		dst.p.y = b->box.y1;
+		v[6] = dst.f;
+
+		v[1] = compute_linear(&op->base.src, b->box.x2, b->box.y2);
+		v[4] = compute_linear(&op->base.src, b->box.x1, b->box.y2);
+		v[7] = compute_linear(&op->base.src, b->box.x1, b->box.y1);
+
+		v[8] = v[5] = v[2] = b->alpha;
+		v += 9;
+		b++;
+	} while (--nbox);
+}
+
 inline inline static uint32_t
 gen4_choose_spans_vertex_buffer(const struct sna_composite_op *op)
 {
@@ -1083,24 +1268,30 @@ unsigned gen4_choose_spans_emitter(struct sna_composite_spans_op *tmp)
 	unsigned vb;
 
 	if (tmp->base.src.is_solid) {
-		tmp->prim_emit = emit_spans_solid;
+		tmp->prim_emit = emit_span_solid;
+		tmp->emit_boxes = emit_span_boxes_solid;
 		tmp->base.floats_per_vertex = 3;
 		vb = 1 << 2 | 1;
 	} else if (tmp->base.src.is_linear) {
-		tmp->prim_emit = emit_spans_linear;
+		tmp->prim_emit = emit_span_linear;
+		tmp->emit_boxes = emit_span_boxes_linear;
 		tmp->base.floats_per_vertex = 3;
 		vb = 1 << 2 | 1;
 	} else if (tmp->base.src.transform == NULL) {
-		tmp->prim_emit = emit_spans_identity;
+		tmp->prim_emit = emit_span_identity;
+		tmp->emit_boxes = emit_span_boxes_identity;
 		tmp->base.floats_per_vertex = 4;
 		vb = 1 << 2 | 2;
 	} else if (tmp->base.is_affine) {
 		tmp->base.src.scale[0] /= tmp->base.src.transform->matrix[2][2];
 		tmp->base.src.scale[1] /= tmp->base.src.transform->matrix[2][2];
-		if (!sna_affine_transform_is_rotation(tmp->base.src.transform))
-			tmp->prim_emit = emit_spans_simple;
-		else
-			tmp->prim_emit = emit_spans_affine;
+		if (!sna_affine_transform_is_rotation(tmp->base.src.transform)) {
+			tmp->prim_emit = emit_span_simple;
+			tmp->emit_boxes = emit_span_boxes_simple;
+		} else {
+			tmp->prim_emit = emit_span_affine;
+			tmp->emit_boxes = emit_span_boxes_affine;
+		}
 		tmp->base.floats_per_vertex = 4;
 		vb = 1 << 2 | 2;
 	} else {
diff --git a/src/sna/gen5_render.c b/src/sna/gen5_render.c
index 5995d1d..81e6635 100644
--- a/src/sna/gen5_render.c
+++ b/src/sna/gen5_render.c
@@ -607,6 +607,17 @@ static int gen5_get_rectangles__flush(struct sna *sna,
 						     op->u.gen5.wm_kernel);
 	}
 
+	/* Prevent discarding the new vbo after lock contention */
+	if (sna->render.active) {
+		int rem;
+
+		sna_vertex_wait__locked(&sna->render);
+
+		rem = vertex_space(sna);
+		if (rem > op->floats_per_rect)
+			return rem;
+	}
+
 	return gen4_vertex_finish(sna);
 }
 
@@ -643,6 +654,7 @@ flush:
 		gen4_vertex_flush(sna);
 		gen5_magic_ca_pass(sna, op);
 	}
+	gen4_vertex_finish(sna);
 	_kgem_submit(&sna->kgem);
 	emit_state(sna, op);
 	goto start;
@@ -1939,6 +1951,42 @@ gen5_render_composite_spans_boxes(struct sna *sna,
 }
 
 fastcall static void
+gen5_render_composite_spans_boxes__thread(struct sna *sna,
+					  const struct sna_composite_spans_op *op,
+					  const struct sna_opacity_box *box,
+					  int nbox)
+{
+	DBG(("%s: nbox=%d, src=+(%d, %d), dst=+(%d, %d)\n",
+	     __FUNCTION__, nbox,
+	     op->base.src.offset[0], op->base.src.offset[1],
+	     op->base.dst.x, op->base.dst.y));
+
+	sna_vertex_lock(&sna->render);
+	do {
+		int nbox_this_time;
+		float *v;
+
+		nbox_this_time = gen5_get_rectangles(sna, &op->base, nbox,
+						     gen5_bind_surfaces);
+		assert(nbox_this_time);
+		nbox -= nbox_this_time;
+
+		v = sna->render.vertices + sna->render.vertex_used;
+		sna->render.vertex_used += nbox_this_time * op->base.floats_per_rect;
+
+		sna_vertex_acquire__locked(&sna->render);
+		sna_vertex_unlock(&sna->render);
+
+		op->emit_boxes(op, box, nbox_this_time, v);
+		box += nbox_this_time;
+
+		sna_vertex_lock(&sna->render);
+		sna_vertex_release__locked(&sna->render);
+	} while (nbox);
+	sna_vertex_unlock(&sna->render);
+}
+
+fastcall static void
 gen5_render_composite_spans_done(struct sna *sna,
 				 const struct sna_composite_spans_op *op)
 {
@@ -2049,6 +2097,8 @@ gen5_render_composite_spans(struct sna *sna,
 
 	tmp->box   = gen5_render_composite_spans_box;
 	tmp->boxes = gen5_render_composite_spans_boxes;
+	if (tmp->emit_boxes)
+		tmp->thread_boxes = gen5_render_composite_spans_boxes__thread;
 	tmp->done  = gen5_render_composite_spans_done;
 
 	if (!kgem_check_bo(&sna->kgem,
diff --git a/src/sna/gen6_render.c b/src/sna/gen6_render.c
index 35ff862..4ff1606 100644
--- a/src/sna/gen6_render.c
+++ b/src/sna/gen6_render.c
@@ -1157,6 +1157,17 @@ static int gen6_get_rectangles__flush(struct sna *sna,
 		}
 	}
 
+	/* Prevent discarding the new vbo after lock contention */
+	if (sna->render.active) {
+		int rem;
+
+		sna_vertex_wait__locked(&sna->render);
+
+		rem = vertex_space(sna);
+		if (rem > op->floats_per_rect)
+			return rem;
+	}
+
 	return gen4_vertex_finish(sna);
 }
 
@@ -1193,6 +1204,7 @@ flush:
 		gen4_vertex_flush(sna);
 		gen6_magic_ca_pass(sna, op);
 	}
+	gen4_vertex_finish(sna);
 	_kgem_submit(&sna->kgem);
 	emit_state(sna, op);
 	goto start;
@@ -1293,6 +1305,7 @@ gen6_align_vertex(struct sna *sna, const struct sna_composite_op *op)
 		sna->render.vertex_used = sna->render.vertex_index * op->floats_per_vertex;
 		sna->render_state.gen6.floats_per_vertex = op->floats_per_vertex;
 	}
+	assert((sna->render.vertex_used % op->floats_per_vertex) == 0);
 }
 
 fastcall static void
@@ -1720,6 +1733,7 @@ static void gen6_render_composite_done(struct sna *sna,
 {
 	DBG(("%s\n", __FUNCTION__));
 
+	assert(!sna->render.active);
 	if (sna->render.vertex_offset) {
 		gen4_vertex_flush(sna);
 		gen6_magic_ca_pass(sna, op);
@@ -2281,10 +2295,47 @@ gen6_render_composite_spans_boxes(struct sna *sna,
 }
 
 fastcall static void
+gen6_render_composite_spans_boxes__thread(struct sna *sna,
+					  const struct sna_composite_spans_op *op,
+					  const struct sna_opacity_box *box,
+					  int nbox)
+{
+	DBG(("%s: nbox=%d, src=+(%d, %d), dst=+(%d, %d)\n",
+	     __FUNCTION__, nbox,
+	     op->base.src.offset[0], op->base.src.offset[1],
+	     op->base.dst.x, op->base.dst.y));
+
+	sna_vertex_lock(&sna->render);
+	do {
+		int nbox_this_time;
+		float *v;
+
+		nbox_this_time = gen6_get_rectangles(sna, &op->base, nbox,
+						     gen6_emit_composite_state);
+		assert(nbox_this_time);
+		nbox -= nbox_this_time;
+
+		v = sna->render.vertices + sna->render.vertex_used;
+		sna->render.vertex_used += nbox_this_time * op->base.floats_per_rect;
+
+		sna_vertex_acquire__locked(&sna->render);
+		sna_vertex_unlock(&sna->render);
+
+		op->emit_boxes(op, box, nbox_this_time, v);
+		box += nbox_this_time;
+
+		sna_vertex_lock(&sna->render);
+		sna_vertex_release__locked(&sna->render);
+	} while (nbox);
+	sna_vertex_unlock(&sna->render);
+}
+
+fastcall static void
 gen6_render_composite_spans_done(struct sna *sna,
 				 const struct sna_composite_spans_op *op)
 {
 	DBG(("%s()\n", __FUNCTION__));
+	assert(!sna->render.active);
 
 	if (sna->render.vertex_offset)
 		gen4_vertex_flush(sna);
@@ -2397,6 +2448,8 @@ gen6_render_composite_spans(struct sna *sna,
 
 	tmp->box   = gen6_render_composite_spans_box;
 	tmp->boxes = gen6_render_composite_spans_boxes;
+	if (tmp->emit_boxes)
+		tmp->thread_boxes = gen6_render_composite_spans_boxes__thread;
 	tmp->done  = gen6_render_composite_spans_done;
 
 	kgem_set_mode(&sna->kgem, KGEM_RENDER, tmp->base.dst.bo);
@@ -2768,6 +2821,7 @@ gen6_render_copy_done(struct sna *sna, const struct sna_copy_op *op)
 {
 	DBG(("%s()\n", __FUNCTION__));
 
+	assert(!sna->render.active);
 	if (sna->render.vertex_offset)
 		gen4_vertex_flush(sna);
 }
@@ -3115,6 +3169,7 @@ gen6_render_op_fill_done(struct sna *sna, const struct sna_fill_op *op)
 {
 	DBG(("%s()\n", __FUNCTION__));
 
+	assert(!sna->render.active);
 	if (sna->render.vertex_offset)
 		gen4_vertex_flush(sna);
 	kgem_bo_destroy(&sna->kgem, op->base.src.bo);
@@ -3409,6 +3464,7 @@ gen6_render_expire(struct kgem *kgem)
 	if (sna->render.vbo && !sna->render.vertex_used) {
 		DBG(("%s: discarding vbo handle=%d\n", __FUNCTION__, sna->render.vbo->handle));
 		kgem_bo_destroy(kgem, sna->render.vbo);
+		assert(!sna->render.active);
 		sna->render.vbo = NULL;
 		sna->render.vertices = sna->render.vertex_data;
 		sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data);
diff --git a/src/sna/gen7_render.c b/src/sna/gen7_render.c
index fa36ce6..34ba252 100644
--- a/src/sna/gen7_render.c
+++ b/src/sna/gen7_render.c
@@ -1282,6 +1282,17 @@ static int gen7_get_rectangles__flush(struct sna *sna,
 		}
 	}
 
+	/* Prevent discarding the new vbo after lock contention */
+	if (sna->render.active) {
+		int rem;
+
+		sna_vertex_wait__locked(&sna->render);
+
+		rem = vertex_space(sna);
+		if (rem > op->floats_per_rect)
+			return rem;
+	}
+
 	return gen4_vertex_finish(sna);
 }
 
@@ -1318,6 +1329,7 @@ flush:
 		gen4_vertex_flush(sna);
 		gen7_magic_ca_pass(sna, op);
 	}
+	gen4_vertex_finish(sna);
 	_kgem_submit(&sna->kgem);
 	emit_state(sna, op);
 	goto start;
@@ -2403,6 +2415,42 @@ gen7_render_composite_spans_boxes(struct sna *sna,
 }
 
 fastcall static void
+gen7_render_composite_spans_boxes__thread(struct sna *sna,
+					  const struct sna_composite_spans_op *op,
+					  const struct sna_opacity_box *box,
+					  int nbox)
+{
+	DBG(("%s: nbox=%d, src=+(%d, %d), dst=+(%d, %d)\n",
+	     __FUNCTION__, nbox,
+	     op->base.src.offset[0], op->base.src.offset[1],
+	     op->base.dst.x, op->base.dst.y));
+
+	sna_vertex_lock(&sna->render);
+	do {
+		int nbox_this_time;
+		float *v;
+
+		nbox_this_time = gen7_get_rectangles(sna, &op->base, nbox,
+						     gen7_emit_composite_state);
+		assert(nbox_this_time);
+		nbox -= nbox_this_time;
+
+		v = sna->render.vertices + sna->render.vertex_used;
+		sna->render.vertex_used += nbox_this_time * op->base.floats_per_rect;
+
+		sna_vertex_acquire__locked(&sna->render);
+		sna_vertex_unlock(&sna->render);
+
+		op->emit_boxes(op, box, nbox_this_time, v);
+		box += nbox_this_time;
+
+		sna_vertex_lock(&sna->render);
+		sna_vertex_release__locked(&sna->render);
+	} while (nbox);
+	sna_vertex_unlock(&sna->render);
+}
+
+fastcall static void
 gen7_render_composite_spans_done(struct sna *sna,
 				 const struct sna_composite_spans_op *op)
 {
@@ -2499,6 +2547,8 @@ gen7_render_composite_spans(struct sna *sna,
 
 	tmp->box   = gen7_render_composite_spans_box;
 	tmp->boxes = gen7_render_composite_spans_boxes;
+	if (tmp->emit_boxes)
+		tmp->thread_boxes = gen7_render_composite_spans_boxes__thread;
 	tmp->done  = gen7_render_composite_spans_done;
 
 	kgem_set_mode(&sna->kgem, KGEM_RENDER, tmp->base.dst.bo);
diff --git a/src/sna/kgem_debug_gen6.c b/src/sna/kgem_debug_gen6.c
index fd3f789..7ef55d3 100644
--- a/src/sna/kgem_debug_gen6.c
+++ b/src/sna/kgem_debug_gen6.c
@@ -75,11 +75,11 @@ static void gen6_update_vertex_buffer(struct kgem *kgem, const uint32_t *data)
 	assert(i < kgem->nreloc);
 	reloc = kgem->reloc[i].target_handle;
 
-	if (reloc == 0) {
+	if (reloc == -1) {
 		base = kgem->batch;
 	} else {
 		list_for_each_entry(bo, &kgem->next_request->buffers, request)
-			if (bo->handle == reloc)
+			if (bo->target_handle == reloc)
 				break;
 		assert(&bo->request != &kgem->next_request->buffers);
 		base = kgem_bo_map__debug(kgem, bo);
diff --git a/src/sna/sna.h b/src/sna/sna.h
index 5832c99..84d9807 100644
--- a/src/sna/sna.h
+++ b/src/sna/sna.h
@@ -42,6 +42,7 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #endif
 
 #include <stdint.h>
+
 #include "compiler.h"
 
 #include <xorg-server.h>
diff --git a/src/sna/sna_render.c b/src/sna/sna_render.c
index 84c6b35..4b32b82 100644
--- a/src/sna/sna_render.c
+++ b/src/sna/sna_render.c
@@ -304,6 +304,8 @@ void no_render_init(struct sna *sna)
 	sna->kgem.expire = no_render_expire;
 	if (sna->kgem.has_blt)
 		sna->kgem.ring = KGEM_BLT;
+
+	sna_vertex_init(sna);
 }
 
 static struct kgem_bo *
diff --git a/src/sna/sna_render.h b/src/sna/sna_render.h
index 13a3e7d..6a0b1d8 100644
--- a/src/sna/sna_render.h
+++ b/src/sna/sna_render.h
@@ -7,6 +7,8 @@
 
 #include <stdbool.h>
 #include <stdint.h>
+#include <pthread.h>
+#include "atomic.h"
 
 #define GRADIENT_CACHE_SIZE 16
 
@@ -142,6 +144,11 @@ struct sna_composite_op {
 	void *priv;
 };
 
+struct sna_opacity_box {
+	BoxRec box;
+	float alpha;
+} __packed__;
+
 struct sna_composite_spans_op {
 	struct sna_composite_op base;
 
@@ -153,6 +160,12 @@ struct sna_composite_spans_op {
 		      const struct sna_composite_spans_op *op,
 		      const BoxRec *box, int nbox,
 		      float opacity);
+
+	fastcall void (*thread_boxes)(struct sna *sna,
+				      const struct sna_composite_spans_op *op,
+				      const struct sna_opacity_box *box,
+				      int nbox);
+
 	fastcall void (*done)(struct sna *sna,
 			      const struct sna_composite_spans_op *op);
 
@@ -160,6 +173,9 @@ struct sna_composite_spans_op {
 				   const struct sna_composite_spans_op *op,
 				   const BoxRec *box,
 				   float opacity);
+	fastcall void (*emit_boxes)(const struct sna_composite_spans_op *op,
+				    const struct sna_opacity_box *box, int nbox,
+				    float *v);
 };
 
 struct sna_fill_op {
@@ -188,6 +204,10 @@ struct sna_copy_op {
 };
 
 struct sna_render {
+	pthread_mutex_t lock;
+	pthread_cond_t wait;
+	int active;
+
 	int max_3d_size;
 	int max_3d_pitch;
 
@@ -714,4 +734,34 @@ sna_render_copy_boxes__overlap(struct sna *sna, uint8_t alu,
 bool
 sna_composite_mask_is_opaque(PicturePtr mask);
 
+void sna_vertex_init(struct sna *sna);
+
+static inline void sna_vertex_lock(struct sna_render *r)
+{
+	pthread_mutex_lock(&r->lock);
+}
+
+static inline void sna_vertex_acquire__locked(struct sna_render *r)
+{
+	r->active++;
+}
+
+static inline void sna_vertex_unlock(struct sna_render *r)
+{
+	pthread_mutex_unlock(&r->lock);
+}
+
+static inline void sna_vertex_release__locked(struct sna_render *r)
+{
+	assert(r->active > 0);
+	if (--r->active == 0)
+		pthread_cond_signal(&r->wait);
+}
+
+static inline void sna_vertex_wait__locked(struct sna_render *r)
+{
+	while (r->active)
+		pthread_cond_wait(&r->wait, &r->lock);
+}
+
 #endif /* SNA_RENDER_H */
diff --git a/src/sna/sna_trapezoids.c b/src/sna/sna_trapezoids.c
index 79e845a..bf4816b 100644
--- a/src/sna/sna_trapezoids.c
+++ b/src/sna/sna_trapezoids.c
@@ -49,6 +49,7 @@
 #define NO_ALIGNED_BOXES 0
 #define NO_UNALIGNED_BOXES 0
 #define NO_SCAN_CONVERTER 0
+#define NO_GPU_THREADS 0
 
 /* TODO: Emit unantialiased and MSAA triangles. */
 
@@ -328,10 +329,10 @@ floored_divrem(int a, int b)
 /* Compute the floored division (x*a)/b. Assumes / and % perform symmetric
  * division. */
 static struct quorem
-floored_muldivrem(int x, int a, int b)
+floored_muldivrem(int32_t x, int32_t a, int32_t b)
 {
 	struct quorem qr;
-	long long xa = (long long)x*a;
+	int64_t xa = (int64_t)x*a;
 	qr.quo = xa/b;
 	qr.rem = xa%b;
 	if (qr.rem && (xa>=0) != (b>=0)) {
@@ -674,6 +675,8 @@ polygon_add_edge(struct polygon *polygon,
 	ybot = bottom <= ymax ? bottom : ymax;
 	e->ytop = ytop;
 	e->height_left = ybot - ytop;
+	if (e->height_left <= 0)
+		return;
 
 	if (dx == 0) {
 		e->x.quo = x1;
@@ -736,6 +739,8 @@ polygon_add_line(struct polygon *polygon,
 
 	e->ytop = top;
 	e->height_left = bot - top;
+	if (e->height_left <= 0)
+		return;
 
 	if (dx == 0) {
 		e->x.quo = p1->x;
@@ -4021,14 +4026,13 @@ static span_func_t
 choose_span(struct sna_composite_spans_op *tmp,
 	    PicturePtr dst,
 	    PictFormatPtr maskFormat,
-	    uint8_t op,
 	    RegionPtr clip)
 {
 	span_func_t span;
 
 	if (is_mono(dst, maskFormat)) {
 		/* XXX An imprecise approximation */
-		if (maskFormat && !operator_is_bounded(op)) {
+		if (maskFormat && !operator_is_bounded(tmp->base.op)) {
 			span = tor_blt_span_mono_unbounded;
 			if (REGION_NUM_RECTS(clip) > 1)
 				span = tor_blt_span_mono_unbounded_clipped;
@@ -4188,6 +4192,151 @@ mono_trapezoids_span_converter(CARD8 op, PicturePtr src, PicturePtr dst,
 	return true;
 }
 
+struct span_thread {
+	struct sna *sna;
+	const struct sna_composite_spans_op *op;
+	const xTrapezoid *traps;
+	RegionPtr clip;
+	span_func_t span;
+	BoxRec extents;
+	int dx, dy, draw_y;
+	int ntrap;
+	bool unbounded;
+};
+
+#define SPAN_THREAD_MAX_BOXES (8192/sizeof(struct sna_opacity_box))
+struct span_thread_boxes {
+	const struct sna_composite_spans_op *op;
+	struct sna_opacity_box boxes[SPAN_THREAD_MAX_BOXES];
+	int num_boxes;
+};
+
+static void span_thread_add_boxes(struct sna *sna, void *data,
+				  const BoxRec *box, int count, float alpha)
+{
+	struct span_thread_boxes *b = data;
+
+	__DBG(("%s: adding %d boxes with alpha=%f\n",
+	       __FUNCTION__, count, alpha));
+
+	assert(count > 0 && count <= SPAN_THREAD_MAX_BOXES);
+	if (b->num_boxes + count > SPAN_THREAD_MAX_BOXES) {
+		DBG(("%s: flushing %d boxes, adding %d\n", __FUNCTION__, b->num_boxes, count));
+		assert(b->num_boxes <= SPAN_THREAD_MAX_BOXES);
+		b->op->thread_boxes(sna, b->op, b->boxes, b->num_boxes);
+		b->num_boxes = 0;
+	}
+
+	do {
+		b->boxes[b->num_boxes].box = *box++;
+		b->boxes[b->num_boxes].alpha = alpha;
+		b->num_boxes++;
+	} while (--count);
+	assert(b->num_boxes <= SPAN_THREAD_MAX_BOXES);
+}
+
+static void
+span_thread_box(struct sna *sna,
+		struct sna_composite_spans_op *op,
+		pixman_region16_t *clip,
+		const BoxRec *box,
+		int coverage)
+{
+	__DBG(("%s: %d -> %d @ %d\n", __FUNCTION__, box->x1, box->x2, coverage));
+	span_thread_add_boxes(sna, op, box, 1, AREA_TO_ALPHA(coverage));
+}
+
+static void
+span_thread_clipped_box(struct sna *sna,
+			struct sna_composite_spans_op *op,
+			pixman_region16_t *clip,
+			const BoxRec *box,
+			int coverage)
+{
+	pixman_region16_t region;
+
+	__DBG(("%s: %d -> %d @ %f\n", __FUNCTION__, box->x1, box->x2,
+	       AREA_TO_ALPHA(coverage)));
+
+	pixman_region_init_rects(&region, box, 1);
+	RegionIntersect(&region, &region, clip);
+	if (REGION_NUM_RECTS(&region)) {
+		span_thread_add_boxes(sna, op,
+				      REGION_RECTS(&region),
+				      REGION_NUM_RECTS(&region),
+				      AREA_TO_ALPHA(coverage));
+	}
+	pixman_region_fini(&region);
+}
+
+static span_func_t
+thread_choose_span(struct sna_composite_spans_op *tmp,
+		   PicturePtr dst,
+		   PictFormatPtr maskFormat,
+		   RegionPtr clip)
+{
+	span_func_t span;
+
+	if (tmp->base.damage)
+		return NULL;
+
+	if (is_mono(dst, maskFormat)) {
+		return NULL;
+	} else {
+		if (REGION_NUM_RECTS(clip) > 1)
+			span = span_thread_clipped_box;
+		else
+			span = span_thread_box;
+	}
+
+	return span;
+}
+
+static void
+span_thread(void *arg)
+{
+	struct span_thread *thread = arg;
+	struct span_thread_boxes boxes;
+	struct tor tor;
+	const xTrapezoid *t;
+	int n, y1, y2;
+
+	if (tor_init(&tor, &thread->extents, 2*thread->ntrap))
+		return;
+
+	boxes.op = thread->op;
+	boxes.num_boxes = 0;
+
+	y1 = thread->extents.y1 - thread->draw_y;
+	y2 = thread->extents.y2 - thread->draw_y;
+	for (n = thread->ntrap, t = thread->traps; n--; t++) {
+		xTrapezoid tt;
+
+		if (pixman_fixed_to_int(t->top) >= y2 ||
+		    pixman_fixed_to_int(t->bottom) < y1)
+			continue;
+
+		if (!project_trapezoid_onto_grid(t, thread->dx, thread->dy, &tt))
+			continue;
+
+		tor_add_edge(&tor, &tt, &tt.left, 1);
+		tor_add_edge(&tor, &tt, &tt.right, -1);
+	}
+
+	tor_render(thread->sna, &tor,
+		   (struct sna_composite_spans_op *)&boxes, thread->clip,
+		   thread->span, thread->unbounded);
+
+	tor_fini(&tor);
+
+	if (boxes.num_boxes) {
+		DBG(("%s: flushing %d boxes\n", __FUNCTION__, boxes.num_boxes));
+		assert(boxes.num_boxes <= SPAN_THREAD_MAX_BOXES);
+		thread->op->thread_boxes(thread->sna, thread->op,
+					 boxes.boxes, boxes.num_boxes);
+	}
+}
+
 static bool
 trapezoid_span_converter(CARD8 op, PicturePtr src, PicturePtr dst,
 			 PictFormatPtr maskFormat, unsigned int flags,
@@ -4196,12 +4345,12 @@ trapezoid_span_converter(CARD8 op, PicturePtr src, PicturePtr dst,
 {
 	struct sna *sna;
 	struct sna_composite_spans_op tmp;
-	struct tor tor;
 	BoxRec extents;
 	pixman_region16_t clip;
 	int16_t dst_x, dst_y;
 	bool was_clear;
 	int dx, dy, n;
+	int num_threads;
 
 	if (NO_SCAN_CONVERTER)
 		return false;
@@ -4305,29 +4454,78 @@ trapezoid_span_converter(CARD8 op, PicturePtr src, PicturePtr dst,
 
 	dx *= FAST_SAMPLES_X;
 	dy *= FAST_SAMPLES_Y;
-	if (tor_init(&tor, &extents, 2*ntrap))
-		goto skip;
 
-	for (n = 0; n < ntrap; n++) {
-		xTrapezoid t;
+	num_threads = 1;
+	if (!NO_GPU_THREADS && tmp.thread_boxes &&
+	    thread_choose_span(&tmp, dst, maskFormat, &clip))
+		num_threads = sna_use_threads(extents.x2-extents.x1,
+					      extents.y2-extents.y1,
+					      16);
+	if (num_threads == 1) {
+		struct tor tor;
 
-		if (!project_trapezoid_onto_grid(&traps[n], dx, dy, &t))
-			continue;
+		if (tor_init(&tor, &extents, 2*ntrap))
+			goto skip;
 
-		if (pixman_fixed_to_int(traps[n].top) + dst->pDrawable->y >= extents.y2 ||
-		    pixman_fixed_to_int(traps[n].bottom) + dst->pDrawable->y < extents.y1)
-			continue;
+		for (n = 0; n < ntrap; n++) {
+			xTrapezoid t;
 
-		tor_add_edge(&tor, &t, &t.left, 1);
-		tor_add_edge(&tor, &t, &t.right, -1);
-	}
+			if (!project_trapezoid_onto_grid(&traps[n], dx, dy, &t))
+				continue;
 
-	tor_render(sna, &tor, &tmp, &clip,
-		   choose_span(&tmp, dst, maskFormat, op, &clip),
-		   !was_clear && maskFormat && !operator_is_bounded(op));
+			if (pixman_fixed_to_int(traps[n].top) + dst->pDrawable->y >= extents.y2 ||
+			    pixman_fixed_to_int(traps[n].bottom) + dst->pDrawable->y < extents.y1)
+				continue;
+
+			tor_add_edge(&tor, &t, &t.left, 1);
+			tor_add_edge(&tor, &t, &t.right, -1);
+		}
+
+		tor_render(sna, &tor, &tmp, &clip,
+			   choose_span(&tmp, dst, maskFormat, &clip),
+			   !was_clear && maskFormat && !operator_is_bounded(op));
 
 skip:
-	tor_fini(&tor);
+		tor_fini(&tor);
+	} else {
+		struct span_thread threads[num_threads];
+		int y, h;
+
+		DBG(("%s: using %d threads for span compositing %dx%d\n",
+		     __FUNCTION__, num_threads,
+		     extents.x2 - extents.x1,
+		     extents.y2 - extents.y1));
+
+		threads[0].sna = sna;
+		threads[0].op = &tmp;
+		threads[0].traps = traps;
+		threads[0].ntrap = ntrap;
+		threads[0].extents = extents;
+		threads[0].clip = &clip;
+		threads[0].dx = dx;
+		threads[0].dy = dy;
+		threads[0].draw_y = dst->pDrawable->y;
+		threads[0].unbounded = !was_clear && maskFormat && !operator_is_bounded(op);
+		threads[0].span = thread_choose_span(&tmp, dst, maskFormat, &clip);
+
+		y = extents.y1;
+		h = extents.y2 - extents.y1;
+		h = (h + num_threads - 1) / num_threads;
+
+		for (n = 1; n < num_threads; n++) {
+			threads[n] = threads[0];
+			threads[n].extents.y1 = y;
+			threads[n].extents.y2 = y += h;
+
+			sna_threads_run(span_thread, &threads[n]);
+		}
+
+		threads[0].extents.y1 = y;
+		threads[0].extents.y2 = extents.y2;
+		span_thread(&threads[0]);
+
+		sna_threads_wait();
+	}
 	tmp.done(sna, &tmp);
 
 	REGION_UNINIT(NULL, &clip);
@@ -6282,7 +6480,7 @@ trap_span_converter(PicturePtr dst,
 	}
 
 	tor_render(sna, &tor, &tmp, clip,
-		   choose_span(&tmp, dst, NULL, PictOpAdd, clip), false);
+		   choose_span(&tmp, dst, NULL, clip), false);
 
 skip:
 	tor_fini(&tor);
@@ -6827,7 +7025,7 @@ triangles_span_converter(CARD8 op, PicturePtr src, PicturePtr dst,
 	}
 
 	tor_render(sna, &tor, &tmp, &clip,
-		   choose_span(&tmp, dst, maskFormat, op, &clip),
+		   choose_span(&tmp, dst, maskFormat, &clip),
 		   !was_clear && maskFormat && !operator_is_bounded(op));
 
 skip:
@@ -7201,7 +7399,7 @@ tristrip_span_converter(CARD8 op, PicturePtr src, PicturePtr dst,
 	assert(tor.polygon->num_edges <= 2*count);
 
 	tor_render(sna, &tor, &tmp, &clip,
-		   choose_span(&tmp, dst, maskFormat, op, &clip),
+		   choose_span(&tmp, dst, maskFormat, &clip),
 		   !was_clear && maskFormat && !operator_is_bounded(op));
 
 skip:
diff --git a/src/sna/sna_vertex.c b/src/sna/sna_vertex.c
new file mode 100644
index 0000000..6755d9a
--- /dev/null
+++ b/src/sna/sna_vertex.c
@@ -0,0 +1,37 @@
+/*
+ * Copyright © 2013 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Chris Wilson <chris at chris-wilson.co.uk>
+ *
+ */
+
+#include "sna.h"
+
+#include <unistd.h>
+
+void sna_vertex_init(struct sna *sna)
+{
+	pthread_mutex_init(&sna->render.lock, NULL);
+	pthread_cond_init(&sna->render.wait, NULL);
+	sna->render.active = 0;
+}


More information about the xorg-commit mailing list