xf86-video-intel: 12 commits - src/sna/gen2_render.c src/sna/gen3_render.c src/sna/gen5_render.c src/sna/gen6_render.c src/sna/gen7_render.c src/sna/sna_accel.c src/sna/sna_composite.c src/sna/sna_glyphs.c src/sna/sna_render.h src/sna/sna_render_inline.h src/sna/sna_tiling.c src/sna/sna_trapezoids.c

Chris Wilson ickle at kemper.freedesktop.org
Sun Jun 17 04:45:04 PDT 2012


 src/sna/gen2_render.c       |   21 -
 src/sna/gen3_render.c       |   14 
 src/sna/gen5_render.c       |   18 
 src/sna/gen6_render.c       |   18 
 src/sna/gen7_render.c       |   21 -
 src/sna/sna_accel.c         |  124 ++++--
 src/sna/sna_composite.c     |   74 ++--
 src/sna/sna_glyphs.c        |    3 
 src/sna/sna_render.h        |    8 
 src/sna/sna_render_inline.h |   14 
 src/sna/sna_tiling.c        |  262 ++++++++++++++
 src/sna/sna_trapezoids.c    |  813 ++++++++++++++++++++++++++++++++++++++------
 12 files changed, 1196 insertions(+), 194 deletions(-)

New commits:
commit 9756c60b4ad15281d025b9c27f19d19e8a630958
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sun Jun 17 11:39:33 2012 +0100

    sna/gen7: Enable non-rectilinear spans
    
    Seems we have enough GPU power to overcome the clumsy shaders. Just
    imagine the possibilities when we have a true shader for spans...
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/gen7_render.c b/src/sna/gen7_render.c
index 4642549..3c07d8d 100644
--- a/src/sna/gen7_render.c
+++ b/src/sna/gen7_render.c
@@ -3166,9 +3166,6 @@ gen7_render_composite_spans(struct sna *sna,
 	DBG(("%s: %dx%d with flags=%x, current mode=%d\n", __FUNCTION__,
 	     width, height, flags, sna->kgem.ring));
 
-	if ((flags & COMPOSITE_SPANS_RECTILINEAR) == 0)
-		return FALSE;
-
 	if (op >= ARRAY_SIZE(gen7_blend_op))
 		return FALSE;
 
commit 41aff56a1f452e409c7a49512a1d2824b74f3838
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sun Jun 17 10:50:31 2012 +0100

    sna: Add tiling for spans
    
    Seemingly only advisable when already committed to using the GPU. This
    first pass is still a little naive as it makes no attempt to avoid empty
    tiles, nor aims to be efficient.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/gen2_render.c b/src/sna/gen2_render.c
index e67dd95..5fd995b 100644
--- a/src/sna/gen2_render.c
+++ b/src/sna/gen2_render.c
@@ -2220,14 +2220,23 @@ gen2_render_composite_spans(struct sna *sna,
 		return FALSE;
 	}
 
-	if (!gen2_check_dst_format(dst->format)) {
-		DBG(("%s: fallback due to unhandled dst format: %x\n",
-		     __FUNCTION__, dst->format));
+	if (gen2_composite_fallback(sna, src, NULL, dst))
 		return FALSE;
-	}
 
-	if (need_tiling(sna, width, height))
-		return FALSE;
+	if (need_tiling(sna, width, height)) {
+		DBG(("%s: tiling, operation (%dx%d) too wide for pipeline\n",
+		     __FUNCTION__, width, height));
+
+		if (!is_gpu(dst->pDrawable)) {
+			DBG(("%s: fallback, tiled operation not on GPU\n",
+			     __FUNCTION__));
+			return FALSE;
+		}
+
+		return sna_tiling_composite_spans(op, src, dst,
+						  src_x, src_y, dst_x, dst_y,
+						  width, height, flags, tmp);
+	}
 
 	if (!gen2_composite_set_target(sna, &tmp->base, dst)) {
 		DBG(("%s: unable to set render target\n",
diff --git a/src/sna/gen3_render.c b/src/sna/gen3_render.c
index 62b3a8e..54ec9b2 100644
--- a/src/sna/gen3_render.c
+++ b/src/sna/gen3_render.c
@@ -3392,9 +3392,18 @@ gen3_render_composite_spans(struct sna *sna,
 		return FALSE;
 
 	if (need_tiling(sna, width, height)) {
-		DBG(("%s: fallback, operation (%dx%d) too wide for pipeline\n",
+		DBG(("%s: tiling, operation (%dx%d) too wide for pipeline\n",
 		     __FUNCTION__, width, height));
-		return FALSE;
+
+		if (!is_gpu(dst->pDrawable)) {
+			DBG(("%s: fallback, tiled operation not on GPU\n",
+			     __FUNCTION__));
+			return FALSE;
+		}
+
+		return sna_tiling_composite_spans(op, src, dst,
+						  src_x, src_y, dst_x, dst_y,
+						  width, height, flags, tmp);
 	}
 
 	if (!gen3_composite_set_target(sna, &tmp->base, dst)) {
diff --git a/src/sna/gen5_render.c b/src/sna/gen5_render.c
index e31ac5c..65c21c3 100644
--- a/src/sna/gen5_render.c
+++ b/src/sna/gen5_render.c
@@ -2785,12 +2785,24 @@ gen5_render_composite_spans(struct sna *sna,
 	if (op >= ARRAY_SIZE(gen5_blend_op))
 		return FALSE;
 
-	if (need_tiling(sna, width, height))
-		return FALSE;
-
 	if (gen5_composite_fallback(sna, src, NULL, dst))
 		return FALSE;
 
+	if (need_tiling(sna, width, height)) {
+		DBG(("%s: tiling, operation (%dx%d) too wide for pipeline\n",
+		     __FUNCTION__, width, height));
+
+		if (!is_gpu(dst->pDrawable)) {
+			DBG(("%s: fallback, tiled operation not on GPU\n",
+			     __FUNCTION__));
+			return FALSE;
+		}
+
+		return sna_tiling_composite_spans(op, src, dst,
+						  src_x, src_y, dst_x, dst_y,
+						  width, height, flags, tmp);
+	}
+
 	tmp->base.op = op;
 	if (!gen5_composite_set_target(dst, &tmp->base))
 		return FALSE;
diff --git a/src/sna/gen6_render.c b/src/sna/gen6_render.c
index d613fe1..b927d08 100644
--- a/src/sna/gen6_render.c
+++ b/src/sna/gen6_render.c
@@ -3088,12 +3088,24 @@ gen6_render_composite_spans(struct sna *sna,
 	if (op >= ARRAY_SIZE(gen6_blend_op))
 		return FALSE;
 
-	if (need_tiling(sna, width, height))
-		return FALSE;
-
 	if (gen6_composite_fallback(sna, src, NULL, dst))
 		return FALSE;
 
+	if (need_tiling(sna, width, height)) {
+		DBG(("%s: tiling, operation (%dx%d) too wide for pipeline\n",
+		     __FUNCTION__, width, height));
+
+		if (!is_gpu(dst->pDrawable)) {
+			DBG(("%s: fallback, tiled operation not on GPU\n",
+			     __FUNCTION__));
+			return FALSE;
+		}
+
+		return sna_tiling_composite_spans(op, src, dst,
+						  src_x, src_y, dst_x, dst_y,
+						  width, height, flags, tmp);
+	}
+
 	tmp->base.op = op;
 	if (!gen6_composite_set_target(sna, &tmp->base, dst))
 		return FALSE;
diff --git a/src/sna/gen7_render.c b/src/sna/gen7_render.c
index a7c498e..4642549 100644
--- a/src/sna/gen7_render.c
+++ b/src/sna/gen7_render.c
@@ -3172,12 +3172,24 @@ gen7_render_composite_spans(struct sna *sna,
 	if (op >= ARRAY_SIZE(gen7_blend_op))
 		return FALSE;
 
-	if (need_tiling(sna, width, height))
-		return FALSE;
-
 	if (gen7_composite_fallback(sna, src, NULL, dst))
 		return FALSE;
 
+	if (need_tiling(sna, width, height)) {
+		DBG(("%s: tiling, operation (%dx%d) too wide for pipeline\n",
+		     __FUNCTION__, width, height));
+
+		if (!is_gpu(dst->pDrawable)) {
+			DBG(("%s: fallback, tiled operation not on GPU\n",
+			     __FUNCTION__));
+			return FALSE;
+		}
+
+		return sna_tiling_composite_spans(op, src, dst,
+						  src_x, src_y, dst_x, dst_y,
+						  width, height, flags, tmp);
+	}
+
 	tmp->base.op = op;
 	if (!gen7_composite_set_target(sna, &tmp->base, dst))
 		return FALSE;
diff --git a/src/sna/sna_render.h b/src/sna/sna_render.h
index 4898223..fd42b21 100644
--- a/src/sna/sna_render.h
+++ b/src/sna/sna_render.h
@@ -517,6 +517,14 @@ Bool sna_tiling_composite(uint32_t op,
 			  int16_t dst_x, int16_t dst_y,
 			  int16_t width, int16_t height,
 			  struct sna_composite_op *tmp);
+Bool sna_tiling_composite_spans(uint32_t op,
+				PicturePtr src,
+				PicturePtr dst,
+				int16_t src_x,  int16_t src_y,
+				int16_t dst_x,  int16_t dst_y,
+				int16_t width,  int16_t height,
+				unsigned flags,
+				struct sna_composite_spans_op *tmp);
 Bool sna_tiling_fill_boxes(struct sna *sna,
 			   CARD8 op,
 			   PictFormat format,
diff --git a/src/sna/sna_tiling.c b/src/sna/sna_tiling.c
index 33fee42..16b90ca 100644
--- a/src/sna/sna_tiling.c
+++ b/src/sna/sna_tiling.c
@@ -39,6 +39,11 @@
 #define DBG(x) ErrorF x
 #endif
 
+struct sna_tile_span {
+	BoxRec box;
+	float opacity;
+};
+
 struct sna_tile_state {
 	int op;
 	PicturePtr src, mask, dst;
@@ -48,6 +53,7 @@ struct sna_tile_state {
 	int16_t mask_x, mask_y;
 	int16_t dst_x, dst_y;
 	int16_t width, height;
+	unsigned flags;
 
 	int rect_count;
 	int rect_size;
@@ -318,6 +324,262 @@ sna_tiling_composite(uint32_t op,
 	return TRUE;
 }
 
+fastcall static void
+sna_tiling_composite_spans_box(struct sna *sna,
+			       const struct sna_composite_spans_op *op,
+			       const BoxRec *box, float opacity)
+{
+	struct sna_tile_state *tile = op->base.priv;
+	struct sna_tile_span *a;
+
+	if (tile->rect_count == tile->rect_size) {
+		int newsize = tile->rect_size * 2;
+
+		if (tile->rects == tile->rects_embedded) {
+			a = malloc (sizeof(struct sna_tile_span) * newsize);
+			if (a == NULL)
+				return;
+
+			memcpy(a,
+			       tile->rects_embedded,
+			       sizeof(struct sna_tile_span) * tile->rect_count);
+		} else {
+			a = realloc(tile->rects,
+				    sizeof(struct sna_tile_span) * newsize);
+			if (a == NULL)
+				return;
+		}
+
+		tile->rects = (void *)a;
+		tile->rect_size = newsize;
+	} else
+		a = (void *)tile->rects;
+
+	a[tile->rect_count].box = *box;
+	a[tile->rect_count].opacity = opacity;
+	tile->rect_count++;
+	(void)sna;
+}
+
+static void
+sna_tiling_composite_spans_boxes(struct sna *sna,
+				 const struct sna_composite_spans_op *op,
+				 const BoxRec *box, int nbox, float opacity)
+{
+	while (nbox--)
+		sna_tiling_composite_spans_box(sna, op, box++, opacity);
+	(void)sna;
+}
+
+fastcall static void
+sna_tiling_composite_spans_done(struct sna *sna,
+				const struct sna_composite_spans_op *op)
+{
+	struct sna_tile_state *tile = op->base.priv;
+	struct sna_composite_spans_op tmp;
+	int x, y, n, step;
+	bool force_fallback = false;
+
+	/* Use a small step to accommodate enlargement through tile alignment */
+	step = sna->render.max_3d_size;
+	if (tile->dst_x & (8*512 / tile->dst->pDrawable->bitsPerPixel - 1) ||
+	    tile->dst_y & 63)
+		step /= 2;
+	while (step * step * 4 > sna->kgem.max_copy_tile_size)
+		step /= 2;
+
+	DBG(("%s -- %dx%d, count=%d, step size=%d\n", __FUNCTION__,
+	     tile->width, tile->height, tile->rect_count, step));
+
+	if (tile->rect_count == 0)
+		goto done;
+
+	for (y = 0; y < tile->height; y += step) {
+		int height = step;
+		if (y + height > tile->height)
+			height = tile->height - y;
+		for (x = 0; x < tile->width; x += step) {
+			const struct sna_tile_span *r = (void *)tile->rects;
+			int width = step;
+			if (x + width > tile->width)
+				width = tile->width - x;
+			if (!force_fallback &&
+			    sna->render.composite_spans(sna, tile->op,
+							tile->src, tile->dst,
+							tile->src_x + x,  tile->src_y + y,
+							tile->dst_x + x,  tile->dst_y + y,
+							width, height, tile->flags,
+							memset(&tmp, 0, sizeof(tmp)))) {
+				for (n = 0; n < tile->rect_count; n++) {
+					BoxRec b;
+
+					b.x1 = r->box.x1 - tile->dst_x;
+					if (b.x1 < x)
+						b.x1 = x;
+
+					b.y1 = r->box.y1 - tile->dst_y;
+					if (b.y1 < y)
+						b.y1 = y;
+
+					b.x2 = r->box.x2 - tile->dst_x;
+					if (b.x2 > x + width)
+						b.x2 = x + width;
+
+					b.y2 = r->box.y2 - tile->dst_y;
+					if (b.y2 > y + height)
+						b.y2 = y + height;
+
+					DBG(("%s: rect[%d] = (%d, %d), (%d, %d), tile=(%d,%d)x(%d, %d), blt=(%d,%d),(%d,%d)\n",
+					     __FUNCTION__, n,
+					     r->box.x1, r->box.y1,
+					     r->box.x2, r->box.y2,
+					     x, y, width, height,
+					     b.x1, b.y1, b.x2, b.y2));
+
+					if (b.y2 > b.y1 && b.x2 > b.x1)
+						tmp.box(sna, &tmp, &b, r->opacity);
+					r++;
+				}
+				tmp.done(sna, &tmp);
+			} else {
+				unsigned int flags;
+
+				DBG(("%s -- fallback\n", __FUNCTION__));
+
+				if (tile->op <= PictOpSrc)
+					flags = MOVE_WRITE;
+				else
+					flags = MOVE_WRITE | MOVE_READ;
+				if (!sna_drawable_move_to_cpu(tile->dst->pDrawable,
+							      flags))
+					goto done;
+				if (tile->dst->alphaMap &&
+				    !sna_drawable_move_to_cpu(tile->dst->alphaMap->pDrawable,
+							      flags))
+					goto done;
+
+				if (tile->src->pDrawable &&
+				    !sna_drawable_move_to_cpu(tile->src->pDrawable,
+							      MOVE_READ))
+					goto done;
+				if (tile->src->alphaMap &&
+				    !sna_drawable_move_to_cpu(tile->src->alphaMap->pDrawable,
+							      MOVE_READ))
+					goto done;
+
+				for (n = 0; n < tile->rect_count; n++) {
+					BoxRec b;
+
+					b.x1 = r->box.x1 - tile->dst_x;
+					if (b.x1 < x)
+						b.x1 = x;
+
+					b.y1 = r->box.y1 - tile->dst_y;
+					if (b.y1 < y)
+						b.y1 = y;
+
+					b.x2 = r->box.x2 - tile->dst_x;
+					if (b.x2 > x + width)
+						b.x2 = x + width;
+
+					b.y2 = r->box.y2 - tile->dst_y;
+					if (b.y2 > y + height)
+						b.y2 = y + height;
+
+					DBG(("%s: rect[%d] = (%d, %d), (%d, %d), tile=(%d,%d)x(%d, %d), blt=(%d,%d),(%d,%d)\n",
+					     __FUNCTION__, n,
+					     r->box.x1, r->box.y1,
+					     r->box.x2, r->box.y2,
+					     x, y, width, height,
+					     b.x1, b.y1, b.x2, b.y2));
+
+					if (b.y2 > b.y1 && b.x2 > b.x1) {
+						xRenderColor alpha;
+						PicturePtr mask;
+						int error;
+
+						alpha.red = alpha.green = alpha.blue = 0;
+						alpha.alpha = r->opacity * 0xffff;
+
+						mask = CreateSolidPicture(0, &alpha, &error);
+						if (!mask)
+							goto done;
+
+						fbComposite(tile->op,
+							    tile->src, mask, tile->dst,
+							    tile->src_x + x,  tile->src_y + y,
+							    0, 0,
+							    tile->dst_x + x,  tile->dst_y + y,
+							    width, height);
+
+						FreePicture(mask, 0);
+					}
+					r++;
+				}
+
+				force_fallback = true;
+			}
+		}
+	}
+
+done:
+	if (tile->rects != tile->rects_embedded)
+		free(tile->rects);
+	free(tile);
+}
+
+Bool
+sna_tiling_composite_spans(uint32_t op,
+			   PicturePtr src,
+			   PicturePtr dst,
+			   int16_t src_x,  int16_t src_y,
+			   int16_t dst_x,  int16_t dst_y,
+			   int16_t width,  int16_t height,
+			   unsigned flags,
+			   struct sna_composite_spans_op *tmp)
+{
+	struct sna_tile_state *tile;
+	struct sna_pixmap *priv;
+
+	DBG(("%s size=(%d, %d), tile=%d\n",
+	     __FUNCTION__, width, height,
+	     to_sna_from_drawable(dst->pDrawable)->render.max_3d_size));
+
+	priv = sna_pixmap(get_drawable_pixmap(dst->pDrawable));
+	if (priv == NULL || priv->gpu_bo == NULL)
+		return FALSE;
+
+	tile = malloc(sizeof(*tile));
+	if (!tile)
+		return FALSE;
+
+	tile->op = op;
+	tile->flags = flags;
+
+	tile->src  = src;
+	tile->mask = NULL;
+	tile->dst  = dst;
+
+	tile->src_x = src_x;
+	tile->src_y = src_y;
+	tile->mask_x = 0;
+	tile->mask_y = 0;
+	tile->dst_x = dst_x;
+	tile->dst_y = dst_y;
+	tile->width = width;
+	tile->height = height;
+	tile->rects = tile->rects_embedded;
+	tile->rect_count = 0;
+	tile->rect_size = ARRAY_SIZE(tile->rects_embedded);
+
+	tmp->box   = sna_tiling_composite_spans_box;
+	tmp->boxes = sna_tiling_composite_spans_boxes;
+	tmp->done  = sna_tiling_composite_spans_done;
+
+	tmp->base.priv = tile;
+	return TRUE;
+}
+
 Bool
 sna_tiling_fill_boxes(struct sna *sna,
 		      CARD8 op,
commit 222e6ff43ef683e82101fb360911fc01fbe00597
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sat Jun 16 23:40:30 2012 +0100

    sna: Read inplace for fallback copies
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 85db919..1e86643 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -4052,10 +4052,21 @@ fallback:
 		DBG(("%s: fallback -- src=(%d, %d), dst=(%d, %d)\n",
 		     __FUNCTION__, src_dx, src_dy, dst_dx, dst_dy));
 		if (src_priv) {
+			unsigned mode;
+
 			RegionTranslate(&region, src_dx, src_dy);
+
+			assert_pixmap_contains_box(src_pixmap,
+						   RegionExtents(&region));
+
+			mode = MOVE_READ;
+			if (src_priv->cpu_bo == NULL)
+				mode |= MOVE_INPLACE_HINT;
+
 			if (!sna_drawable_move_region_to_cpu(&src_pixmap->drawable,
-							     &region, MOVE_READ))
+							     &region, mode))
 				goto out;
+
 			RegionTranslate(&region, -src_dx, -src_dy);
 		}
 
commit 79d468925bb012806e097337e4e5930818c6ab46
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sat Jun 16 17:54:21 2012 +0100

    sna: Decrease latency for 1x1 GetImage by using an inplace mapping
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 3c17d75..85db919 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -11982,8 +11982,9 @@ sna_get_image(DrawablePtr drawable,
 	      char *dst)
 {
 	RegionRec region;
+	unsigned int flags;
 
-	DBG(("%s (%d, %d, %d, %d)\n", __FUNCTION__, x, y, w, h));
+	DBG(("%s (%d, %d)x(%d, %d)\n", __FUNCTION__, x, y, w, h));
 
 	region.extents.x1 = x + drawable->x;
 	region.extents.y1 = y + drawable->y;
@@ -11991,7 +11992,10 @@ sna_get_image(DrawablePtr drawable,
 	region.extents.y2 = region.extents.y1 + h;
 	region.data = NULL;
 
-	if (!sna_drawable_move_region_to_cpu(drawable, &region, MOVE_READ))
+	flags = MOVE_READ;
+	if ((w | h) == 1)
+		flags |= MOVE_INPLACE_HINT;
+	if (!sna_drawable_move_region_to_cpu(drawable, &region, flags))
 		return;
 
 	if (format == ZPixmap &&
commit 2c2a8d3780f1d8de3f13bee8e068fdaf608ff9e9
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sat Jun 16 23:19:49 2012 +0100

    sna: Allow reads to be performed inplace
    
    If we can guess that we will only readback the data once, then we can
    skip the copy into the shadow.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 12b4e18..3c17d75 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -975,7 +975,7 @@ static inline bool operate_inplace(struct sna_pixmap *priv, unsigned flags)
 	if ((flags & MOVE_INPLACE_HINT) == 0 || priv->gpu_bo == NULL)
 		return false;
 
-	if (kgem_bo_is_busy(priv->gpu_bo))
+	if (flags & MOVE_WRITE && kgem_bo_is_busy(priv->gpu_bo))
 		return false;
 
 	return priv->stride != 0;
@@ -1087,7 +1087,6 @@ skip_inplace_map:
 	if (operate_inplace(priv, flags) &&
 	    pixmap_inplace(sna, pixmap, priv) &&
 	    sna_pixmap_move_to_gpu(pixmap, flags)) {
-		assert(flags & MOVE_WRITE);
 		kgem_bo_submit(&sna->kgem, priv->gpu_bo);
 
 		DBG(("%s: operate inplace\n", __FUNCTION__));
@@ -1097,13 +1096,15 @@ skip_inplace_map:
 		if (pixmap->devPrivate.ptr != NULL) {
 			priv->mapped = true;
 			pixmap->devKind = priv->gpu_bo->pitch;
-			sna_damage_all(&priv->gpu_damage,
-				       pixmap->drawable.width,
-				       pixmap->drawable.height);
-			sna_damage_destroy(&priv->cpu_damage);
-			list_del(&priv->list);
-			priv->undamaged = false;
-			priv->clear = false;
+			if (flags & MOVE_WRITE) {
+				sna_damage_all(&priv->gpu_damage,
+					       pixmap->drawable.width,
+					       pixmap->drawable.height);
+				sna_damage_destroy(&priv->cpu_damage);
+				list_del(&priv->list);
+				priv->undamaged = false;
+				priv->clear = false;
+			}
 			return true;
 		}
 
@@ -1470,7 +1471,6 @@ sna_drawable_move_region_to_cpu(DrawablePtr drawable,
 
 	if (operate_inplace(priv, flags) &&
 	    region_inplace(sna, pixmap, region, priv)) {
-		assert(flags & MOVE_WRITE);
 		kgem_bo_submit(&sna->kgem, priv->gpu_bo);
 
 		DBG(("%s: operate inplace\n", __FUNCTION__));
@@ -1480,7 +1480,8 @@ sna_drawable_move_region_to_cpu(DrawablePtr drawable,
 		if (pixmap->devPrivate.ptr != NULL) {
 			priv->mapped = true;
 			pixmap->devKind = priv->gpu_bo->pitch;
-			if (!DAMAGE_IS_ALL(priv->gpu_damage)) {
+			if (flags & MOVE_WRITE &&
+			    !DAMAGE_IS_ALL(priv->gpu_damage)) {
 				sna_damage_add(&priv->gpu_damage, region);
 				if (sna_damage_is_all(&priv->gpu_damage,
 						      pixmap->drawable.width,
commit bc6997f6f751d3ba352dfc20c6717ec12b8fac47
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sat Jun 16 17:54:15 2012 +0100

    sna: Cleanup damage processing after operating inplace
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index dc97812..12b4e18 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -970,6 +970,17 @@ static inline bool use_cpu_bo_for_read(struct sna_pixmap *priv)
 	return kgem_bo_is_busy(priv->gpu_bo) || kgem_bo_is_busy(priv->cpu_bo);
 }
 
+static inline bool operate_inplace(struct sna_pixmap *priv, unsigned flags)
+{
+	if ((flags & MOVE_INPLACE_HINT) == 0 || priv->gpu_bo == NULL)
+		return false;
+
+	if (kgem_bo_is_busy(priv->gpu_bo))
+		return false;
+
+	return priv->stride != 0;
+}
+
 bool
 _sna_pixmap_move_to_cpu(PixmapPtr pixmap, unsigned int flags)
 {
@@ -1073,9 +1084,7 @@ skip_inplace_map:
 
 	assert(priv->gpu_bo == NULL || priv->gpu_bo->proxy == NULL);
 
-	if (flags & MOVE_INPLACE_HINT &&
-	    priv->stride && priv->gpu_bo &&
-	    !kgem_bo_is_busy(priv->gpu_bo) &&
+	if (operate_inplace(priv, flags) &&
 	    pixmap_inplace(sna, pixmap, priv) &&
 	    sna_pixmap_move_to_gpu(pixmap, flags)) {
 		assert(flags & MOVE_WRITE);
@@ -1091,6 +1100,9 @@ skip_inplace_map:
 			sna_damage_all(&priv->gpu_damage,
 				       pixmap->drawable.width,
 				       pixmap->drawable.height);
+			sna_damage_destroy(&priv->cpu_damage);
+			list_del(&priv->list);
+			priv->undamaged = false;
 			priv->clear = false;
 			return true;
 		}
@@ -1347,6 +1359,9 @@ sna_drawable_move_region_to_cpu(DrawablePtr drawable,
 	if (sna_damage_is_all(&priv->cpu_damage,
 			      pixmap->drawable.width,
 			      pixmap->drawable.height)) {
+		DBG(("%s: pixmap=%ld all damaged on CPU\n",
+		     __FUNCTION__, pixmap->drawable.serialNumber));
+
 		sna_damage_destroy(&priv->gpu_damage);
 		priv->undamaged = false;
 
@@ -1453,11 +1468,8 @@ sna_drawable_move_region_to_cpu(DrawablePtr drawable,
 		}
 	}
 
-	if (flags & MOVE_INPLACE_HINT &&
-	    priv->stride && priv->gpu_bo &&
-	    !kgem_bo_is_busy(priv->gpu_bo) &&
-	    region_inplace(sna, pixmap, region, priv) &&
-	    sna_pixmap_move_area_to_gpu(pixmap, &region->extents, flags)) {
+	if (operate_inplace(priv, flags) &&
+	    region_inplace(sna, pixmap, region, priv)) {
 		assert(flags & MOVE_WRITE);
 		kgem_bo_submit(&sna->kgem, priv->gpu_bo);
 
@@ -1468,8 +1480,18 @@ sna_drawable_move_region_to_cpu(DrawablePtr drawable,
 		if (pixmap->devPrivate.ptr != NULL) {
 			priv->mapped = true;
 			pixmap->devKind = priv->gpu_bo->pitch;
-			if (!DAMAGE_IS_ALL(priv->gpu_damage))
+			if (!DAMAGE_IS_ALL(priv->gpu_damage)) {
 				sna_damage_add(&priv->gpu_damage, region);
+				if (sna_damage_is_all(&priv->gpu_damage,
+						      pixmap->drawable.width,
+						      pixmap->drawable.height)) {
+					DBG(("%s: replaced entire pixmap, destroying CPU shadow\n",
+					     __FUNCTION__));
+					sna_damage_destroy(&priv->cpu_damage);
+					priv->undamaged = false;
+					list_del(&priv->list);
+				}
+			}
 			priv->clear = false;
 			return true;
 		}
@@ -3943,12 +3965,21 @@ sna_copy_boxes(DrawablePtr src, DrawablePtr dst, GCPtr gc,
 					goto fallback;
 
 				if (!DAMAGE_IS_ALL(dst_priv->gpu_damage)) {
-					RegionTranslate(&region, dst_dx, dst_dy);
-					assert_pixmap_contains_box(dst_pixmap,
-								   RegionExtents(&region));
-					sna_damage_add(&dst_priv->gpu_damage,
-						       &region);
-					RegionTranslate(&region, -dst_dx, -dst_dy);
+					if (replaces) {
+						sna_damage_destroy(&dst_priv->cpu_damage);
+						sna_damage_all(&dst_priv->gpu_damage,
+							       dst_pixmap->drawable.width,
+							       dst_pixmap->drawable.height);
+						list_del(&dst_priv->list);
+						dst_priv->undamaged = false;
+					} else {
+						RegionTranslate(&region, dst_dx, dst_dy);
+						assert_pixmap_contains_box(dst_pixmap,
+									   RegionExtents(&region));
+						sna_damage_add(&dst_priv->gpu_damage,
+							       &region);
+						RegionTranslate(&region, -dst_dx, -dst_dy);
+					}
 				}
 			}
 		}
commit 937ca8a5d8a0f70a0724db1519bb7b5fc0857425
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sat Jun 16 17:53:58 2012 +0100

    sna: Use memset for simple clears
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index ed996df..dc97812 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -2979,13 +2979,18 @@ sna_put_zpixmap_blt(DrawablePtr drawable, GCPtr gc, RegionPtr region,
 			kgem_bo_sync__cpu(&sna->kgem, priv->cpu_bo);
 		}
 
-		pixman_fill(pixmap->devPrivate.ptr,
-			    pixmap->devKind/sizeof(uint32_t),
-			    pixmap->drawable.bitsPerPixel,
-			    0, 0,
-			    pixmap->drawable.width,
-			    pixmap->drawable.height,
-			    priv->clear_color);
+		if (priv->clear_color == 0) {
+			memset(pixmap->devPrivate.ptr,
+			       0, pixmap->devKind * pixmap->drawable.height);
+		} else {
+			pixman_fill(pixmap->devPrivate.ptr,
+				    pixmap->devKind/sizeof(uint32_t),
+				    pixmap->drawable.bitsPerPixel,
+				    0, 0,
+				    pixmap->drawable.width,
+				    pixmap->drawable.height,
+				    priv->clear_color);
+		}
 
 		sna_damage_all(&priv->cpu_damage,
 			       pixmap->drawable.width,
commit de4572b0b52e2fcfcad04660ee2f81ee88d500a5
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sat Jun 16 13:40:26 2012 +0100

    sna: Inspect CPU damaged state when deciding upon Composite placement
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_composite.c b/src/sna/sna_composite.c
index fc5bf8a..85453d3 100644
--- a/src/sna/sna_composite.c
+++ b/src/sna/sna_composite.c
@@ -410,6 +410,21 @@ static void apply_damage(struct sna_composite_op *op, RegionPtr region)
 	sna_damage_add(op->damage, region);
 }
 
+static inline bool use_cpu(PixmapPtr pixmap, struct sna_pixmap *priv,
+			   CARD8 op, INT16 width, INT16 height)
+{
+	if (too_small(priv))
+		return true;
+
+	if (DAMAGE_IS_ALL(priv->cpu_damage) &&
+	    (op > PictOpSrc ||
+	     width  < pixmap->drawable.width ||
+	     height < pixmap->drawable.height))
+		return true;
+
+	return false;
+}
+
 void
 sna_composite(CARD8 op,
 	      PicturePtr src,
@@ -481,8 +496,9 @@ sna_composite(CARD8 op,
 		goto fallback;
 	}
 
-	if (too_small(priv) && !picture_is_gpu(src) && !picture_is_gpu(mask)) {
-		DBG(("%s: fallback due to too small\n", __FUNCTION__));
+	if (use_cpu(pixmap, priv, op, width, height) &&
+	    !picture_is_gpu(src) && !picture_is_gpu(mask)) {
+		DBG(("%s: fallback, dst is too small (or completely damaged)\n", __FUNCTION__));
 		goto fallback;
 	}
 
@@ -792,15 +808,10 @@ sna_composite_rectangles(CARD8		 op,
 		goto fallback;
 	}
 
-	if (too_small(priv)) {
-		DBG(("%s: fallback, dst is too small\n", __FUNCTION__));
-		goto fallback;
-	}
-
-	if (DAMAGE_IS_ALL(priv->cpu_damage) &&
-	    (region.extents.x2 - region.extents.x1 < pixmap->drawable.width ||
-	     region.extents.y2 - region.extents.y1 < pixmap->drawable.height)) {
-		DBG(("%s: fallback due to completely damaged CPU\n", __FUNCTION__));
+	if (use_cpu(pixmap, priv, op,
+		    region.extents.x2 - region.extents.x1,
+		    region.extents.y2 - region.extents.y1)) {
+		DBG(("%s: fallback, dst is too small (or completely damaged)\n", __FUNCTION__));
 		goto fallback;
 	}
 
commit b689cd924c500373e1e293dd9eb54a238e400381
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Jun 15 16:25:51 2012 +0100

    sna: Composite traps inplace if the CPU is already all-damaged
    
    One outcome is that inspecting the usage patterns afterwards indicated
    that we were missing an opportunity to reduce unaligned boxes to an
    inplace operation.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_composite.c b/src/sna/sna_composite.c
index ab1cd39..fc5bf8a 100644
--- a/src/sna/sna_composite.c
+++ b/src/sna/sna_composite.c
@@ -207,26 +207,28 @@ sna_compute_composite_region(RegionPtr region,
 	}
 
 	/* clip against src */
-	if (src->pDrawable) {
-		src_x += src->pDrawable->x;
-		src_y += src->pDrawable->y;
-	}
-	if (!clip_to_src(region, src, dst_x - src_x, dst_y - src_y)) {
-		pixman_region_fini (region);
-		return FALSE;
-	}
-	DBG(("%s: clip against src: (%d, %d), (%d, %d)\n",
-	     __FUNCTION__,
-	     region->extents.x1, region->extents.y1,
-	     region->extents.x2, region->extents.y2));
-
-	if (src->alphaMap) {
-		if (!clip_to_src(region, src->alphaMap,
-				 dst_x - (src_x - src->alphaOrigin.x),
-				 dst_y - (src_y - src->alphaOrigin.y))) {
-			pixman_region_fini(region);
+	if (src) {
+		if (src->pDrawable) {
+			src_x += src->pDrawable->x;
+			src_y += src->pDrawable->y;
+		}
+		if (!clip_to_src(region, src, dst_x - src_x, dst_y - src_y)) {
+			pixman_region_fini (region);
 			return FALSE;
 		}
+		DBG(("%s: clip against src: (%d, %d), (%d, %d)\n",
+		     __FUNCTION__,
+		     region->extents.x1, region->extents.y1,
+		     region->extents.x2, region->extents.y2));
+
+		if (src->alphaMap) {
+			if (!clip_to_src(region, src->alphaMap,
+					 dst_x - (src_x - src->alphaOrigin.x),
+					 dst_y - (src_y - src->alphaOrigin.y))) {
+				pixman_region_fini(region);
+				return FALSE;
+			}
+		}
 	}
 
 	/* clip against mask */
@@ -361,7 +363,8 @@ sna_compute_composite_extents(BoxPtr extents,
 	     extents->x1, extents->y1,
 	     extents->x2, extents->y2));
 
-	trim_source_extents(extents, src, dst_x - src_x, dst_y - src_y);
+	if (src)
+		trim_source_extents(extents, src, dst_x - src_x, dst_y - src_y);
 	if (mask)
 		trim_source_extents(extents, mask,
 				    dst_x - mask_x, dst_y - mask_y);
diff --git a/src/sna/sna_trapezoids.c b/src/sna/sna_trapezoids.c
index a3bdb16..61b4fb1 100644
--- a/src/sna/sna_trapezoids.c
+++ b/src/sna/sna_trapezoids.c
@@ -2541,9 +2541,10 @@ composite_aligned_boxes(struct sna *sna,
 			PicturePtr dst,
 			PictFormatPtr maskFormat,
 			INT16 src_x, INT16 src_y,
-			int ntrap, xTrapezoid *traps)
+			int ntrap, xTrapezoid *traps,
+			bool force_fallback)
 {
-	BoxRec stack_boxes[64], *boxes, extents;
+	BoxRec stack_boxes[64], *boxes;
 	pixman_region16_t region, clip;
 	struct sna_composite_op tmp;
 	Bool ret = true;
@@ -2564,8 +2565,8 @@ composite_aligned_boxes(struct sna *sna,
 	dx = dst->pDrawable->x;
 	dy = dst->pDrawable->y;
 
-	extents.x1 = extents.y1 = 32767;
-	extents.x2 = extents.y2 = -32767;
+	region.extents.x1 = region.extents.y1 = 32767;
+	region.extents.x2 = region.extents.y2 = -32767;
 	num_boxes = 0;
 	for (n = 0; n < ntrap; n++) {
 		boxes[num_boxes].x1 = dx + pixman_fixed_to_int(traps[n].left.p1.x + pixman_fixed_1_minus_e/2);
@@ -2578,15 +2579,15 @@ composite_aligned_boxes(struct sna *sna,
 		if (boxes[num_boxes].y1 >= boxes[num_boxes].y2)
 			continue;
 
-		if (boxes[num_boxes].x1 < extents.x1)
-			extents.x1 = boxes[num_boxes].x1;
-		if (boxes[num_boxes].x2 > extents.x2)
-			extents.x2 = boxes[num_boxes].x2;
+		if (boxes[num_boxes].x1 < region.extents.x1)
+			region.extents.x1 = boxes[num_boxes].x1;
+		if (boxes[num_boxes].x2 > region.extents.x2)
+			region.extents.x2 = boxes[num_boxes].x2;
 
-		if (boxes[num_boxes].y1 < extents.y1)
-			extents.y1 = boxes[num_boxes].y1;
-		if (boxes[num_boxes].y2 > extents.y2)
-			extents.y2 = boxes[num_boxes].y2;
+		if (boxes[num_boxes].y1 < region.extents.y1)
+			region.extents.y1 = boxes[num_boxes].y1;
+		if (boxes[num_boxes].y2 > region.extents.y2)
+			region.extents.y2 = boxes[num_boxes].y2;
 
 		num_boxes++;
 	}
@@ -2596,37 +2597,96 @@ composite_aligned_boxes(struct sna *sna,
 
 	DBG(("%s: extents (%d, %d), (%d, %d) offset of (%d, %d)\n",
 	     __FUNCTION__,
-	     extents.x1, extents.y1,
-	     extents.x2, extents.y2,
-	     extents.x1 - boxes[0].x1,
-	     extents.y1 - boxes[0].y1));
+	     region.extents.x1, region.extents.y1,
+	     region.extents.x2, region.extents.y2,
+	     region.extents.x1 - boxes[0].x1,
+	     region.extents.y1 - boxes[0].y1));
 
-	src_x += extents.x1 - boxes[0].x1;
-	src_y += extents.y1 - boxes[0].y1;
+	src_x += region.extents.x1 - boxes[0].x1;
+	src_y += region.extents.y1 - boxes[0].y1;
 
 	if (!sna_compute_composite_region(&clip,
 					  src, NULL, dst,
 					  src_x,  src_y,
 					  0, 0,
-					  extents.x1 - dx, extents.y1 - dy,
-					  extents.x2 - extents.x1,
-					  extents.y2 - extents.y1)) {
+					  region.extents.x1 - dx, region.extents.y1 - dy,
+					  region.extents.x2 - region.extents.x1,
+					  region.extents.y2 - region.extents.y1)) {
 		DBG(("%s: trapezoids do not intersect drawable clips\n",
 		     __FUNCTION__)) ;
 		goto done;
 	}
 
-	memset(&tmp, 0, sizeof(tmp));
-	if (!sna->render.composite(sna, op, src, NULL, dst,
+	if (force_fallback ||
+	    !sna->render.composite(sna, op, src, NULL, dst,
 				   src_x,  src_y,
 				   0, 0,
-				   extents.x1,  extents.y1,
-				   extents.x2 - extents.x1,
-				   extents.y2 - extents.y1,
-				   &tmp)) {
+				   clip.extents.x1,  clip.extents.y1,
+				   clip.extents.x2 - clip.extents.x1,
+				   clip.extents.y2 - clip.extents.y1,
+				   memset(&tmp, 0, sizeof(tmp)))) {
+		unsigned int flags;
+		pixman_box16_t *b;
+		int i, count;
+
 		DBG(("%s: composite render op not supported\n",
 		     __FUNCTION__));
-		ret = false;
+
+		flags = MOVE_READ | MOVE_WRITE;
+		if (n == 1 && op <= PictOpSrc)
+			flags = MOVE_WRITE | MOVE_INPLACE_HINT;
+
+		if (!sna_drawable_move_region_to_cpu(dst->pDrawable, &clip, flags))
+			goto done;
+		if (dst->alphaMap  &&
+		    !sna_drawable_move_to_cpu(dst->alphaMap->pDrawable,
+					      MOVE_READ | MOVE_WRITE))
+			goto done;
+		if (src->pDrawable) {
+			if (!sna_drawable_move_to_cpu(src->pDrawable,
+						      MOVE_READ))
+				goto done;
+			if (src->alphaMap &&
+			    !sna_drawable_move_to_cpu(src->alphaMap->pDrawable,
+						      MOVE_READ))
+				goto done;
+		}
+
+		DBG(("%s: fbComposite()\n", __FUNCTION__));
+		if (maskFormat) {
+			pixman_region_init_rects(&region, boxes, num_boxes);
+			RegionIntersect(&region, &region, &clip);
+
+			b = REGION_RECTS(&region);
+			count = REGION_NUM_RECTS(&region);
+			for (i = 0; i < count; i++) {
+				fbComposite(op, src, NULL, dst,
+					    src_x + b[i].x1 - boxes[0].x1,
+					    src_y + b[i].y1 - boxes[0].y1,
+					    0, 0,
+					    b[i].x1, b[i].y1,
+					    b[i].x2 - b[i].x1, b[i].y2 - b[i].y1);
+			}
+			pixman_region_fini(&region);
+		} else {
+			for (n = 0; n < num_boxes; n++) {
+				pixman_region_init_rects(&region, &boxes[n], 1);
+				RegionIntersect(&region, &region, &clip);
+				b = REGION_RECTS(&region);
+				count = REGION_NUM_RECTS(&region);
+				for (i = 0; i < count; i++) {
+					fbComposite(op, src, NULL, dst,
+						    src_x + b[i].x1 - boxes[0].x1,
+						    src_y + b[i].y1 - boxes[0].y1,
+						    0, 0,
+						    b[i].x1, b[i].y1,
+						    b[i].x2 - b[i].x1, b[i].y2 - b[i].y1);
+				}
+				pixman_region_fini(&region);
+				pixman_region_fini(&region);
+			}
+		}
+		ret = true;
 		goto done;
 	}
 
@@ -2877,15 +2937,17 @@ blt_unaligned_box_row(PixmapPtr scratch,
 			    y1, y2,
 			    covered * (grid_coverage(SAMPLES_X, trap->right.p1.x) - grid_coverage(SAMPLES_X, trap->left.p1.x)));
 	} else {
-		if (pixman_fixed_frac(trap->left.p1.x))
+		if (pixman_fixed_frac(trap->left.p1.x)) {
 			blt_opacity(scratch,
-				    x1, x1+1,
+				    x1, x1,
 				    y1, y2,
 				    covered * (SAMPLES_X - grid_coverage(SAMPLES_X, trap->left.p1.x)));
+			x1++;
+		}
 
-		if (x2 > x1 + 1) {
+		if (x2 > x1) {
 			blt_opacity(scratch,
-				    x1 + 1, x2,
+				    x1, x2,
 				    y1, y2,
 				    covered*SAMPLES_X);
 		}
@@ -2898,20 +2960,581 @@ blt_unaligned_box_row(PixmapPtr scratch,
 	}
 }
 
-static Bool
+#define ONE_HALF 0x7f
+#define RB_MASK 0x00ff00ff
+#define RB_ONE_HALF 0x007f007f
+#define RB_MASK_PLUS_ONE 0x01000100
+#define G_SHIFT 8
+
+static force_inline uint32_t
+mul8x2_8 (uint32_t a, uint8_t b)
+{
+	uint32_t t = (a & RB_MASK) * b + RB_ONE_HALF;
+	return ((t + ((t >> G_SHIFT) & RB_MASK)) >> G_SHIFT) & RB_MASK;
+}
+
+static force_inline uint32_t
+add8x2_8x2(uint32_t a, uint32_t b)
+{
+	uint32_t t = a + b;
+	t |= RB_MASK_PLUS_ONE - ((t >> G_SHIFT) & RB_MASK);
+	return t & RB_MASK;
+}
+
+static force_inline uint32_t
+lerp8x4(uint32_t src, uint8_t a, uint32_t dst)
+{
+	return (add8x2_8x2(mul8x2_8(src, a),
+			   mul8x2_8(dst, ~a)) |
+		add8x2_8x2(mul8x2_8(src >> G_SHIFT, a),
+			   mul8x2_8(dst >> G_SHIFT, ~a)) << G_SHIFT);
+}
+
+inline static void
+lerp32_opacity(PixmapPtr scratch,
+	       uint32_t color,
+	       int16_t x, int16_t w,
+	       int16_t y, int16_t h,
+	       uint8_t opacity)
+{
+	uint32_t *ptr;
+	int stride, i;
+
+	ptr = (uint32_t*)((uint8_t *)scratch->devPrivate.ptr + scratch->devKind * y);
+	ptr += x;
+	stride = scratch->devKind / 4;
+
+	if (opacity == 0xff) {
+		if ((w | h) == 1) {
+			*ptr = color;
+		} else {
+			if (w < 16) {
+				do {
+					for (i = 0; i < w; i++)
+						ptr[i] = color;
+					ptr += stride;
+				} while (--h);
+			} else {
+				pixman_fill(ptr, stride, 32,
+					    0, 0, w, h, color);
+			}
+		}
+	} else {
+		if ((w | h) == 1) {
+			*ptr = lerp8x4(color, opacity, *ptr);
+		} else if (w == 1) {
+			do {
+				*ptr = lerp8x4(color, opacity, *ptr);
+				ptr += stride;
+			} while (--h);
+		} else{
+			do {
+				for (i = 0; i < w; i++)
+					ptr[i] = lerp8x4(color, opacity, ptr[i]);
+				ptr += stride;
+			} while (--h);
+		}
+	}
+}
+
+static void
+lerp32_unaligned_box_row(PixmapPtr scratch, uint32_t color,
+			 const BoxRec *extents,
+			 xTrapezoid *trap, int16_t dx,
+			 int16_t y, int16_t h,
+			 uint8_t covered)
+{
+	int16_t x1 = pixman_fixed_to_int(trap->left.p1.x) + dx;
+	int16_t fx1 = grid_coverage(SAMPLES_X, trap->left.p1.x);
+	int16_t x2 = pixman_fixed_to_int(trap->right.p1.x) + dx;
+	int16_t fx2 = grid_coverage(SAMPLES_X, trap->right.p1.x);
+
+	if (x1 < extents->x1)
+		x1 = extents->x1, fx1 = 0;
+	if (x2 > extents->x2)
+		x2 = extents->x2, fx2 = 0;
+
+	if (x2 < x1) {
+		if (fx1) {
+			lerp32_opacity(scratch, color,
+				       x1, 1,
+				       y, h,
+				       covered * (SAMPLES_X - fx1));
+			x1++;
+		}
+
+		if (x2 > x1) {
+			lerp32_opacity(scratch, color,
+				       x1, x2-x1,
+				       y, h,
+				       covered*SAMPLES_X);
+		}
+
+		if (fx2) {
+			lerp32_opacity(scratch, color,
+				       x2, 1,
+				       y, h,
+				       covered * fx2);
+		}
+	} else if (x1 == x2 && fx2 > fx1) {
+		lerp32_opacity(scratch, color,
+			       x1, 1,
+			       y, h,
+			       covered * (fx2 - fx1));
+	}
+}
+
+struct pixman_inplace {
+	pixman_image_t *image, *source, *mask;
+	uint32_t color;
+	uint32_t *bits;
+	int dx, dy;
+	int sx, sy;
+	uint8_t op;
+};
+
+static force_inline uint8_t
+mul_8_8(uint8_t a, uint8_t b)
+{
+    uint16_t t = a * (uint16_t)b + 0x7f;
+    return ((t >> 8) + t) >> 8;
+}
+
+static inline uint32_t multa(uint32_t s, uint8_t a, int shift)
+{
+	return mul_8_8((s >> shift) & 0xff, a) << shift;
+}
+
+static inline uint32_t mul_4x8_8(uint32_t color, uint8_t alpha)
+{
+	uint32_t v;
+
+	v = multa(color, alpha, 24);
+	v |= multa(color, alpha, 16);
+	v |= multa(color, alpha, 8);
+	v |= multa(color, alpha, 0);
+
+	return v;
+}
+
+inline static void
+pixsolid_opacity(struct pixman_inplace *pi,
+		 int16_t x, int16_t w,
+		 int16_t y, int16_t h,
+		 uint8_t opacity)
+{
+	if (opacity == 0xff)
+		*pi->bits = pi->color;
+	else
+		*pi->bits = mul_4x8_8(pi->color, opacity);
+	pixman_image_composite(pi->op, pi->source, NULL, pi->image,
+			       0, 0, 0, 0, pi->dx + x, pi->dy + y, w, h);
+}
+
+static void
+pixsolid_unaligned_box_row(struct pixman_inplace *pi,
+			   const BoxRec *extents,
+			   xTrapezoid *trap,
+			   int16_t y, int16_t h,
+			   uint8_t covered)
+{
+	int16_t x1 = pixman_fixed_to_int(trap->left.p1.x);
+	int16_t fx1 = grid_coverage(SAMPLES_X, trap->left.p1.x);
+	int16_t x2 = pixman_fixed_to_int(trap->right.p1.x);
+	int16_t fx2 = grid_coverage(SAMPLES_X, trap->right.p1.x);
+
+	if (x1 < extents->x1)
+		x1 = extents->x1, fx1 = 0;
+	if (x2 > extents->x2)
+		x2 = extents->x2, fx2 = 0;
+
+	if (x2 < x1) {
+		if (fx1) {
+			pixsolid_opacity(pi, x1, 1, y, h,
+					 covered * (SAMPLES_X - fx1));
+			x1++;
+		}
+
+		if (x2 > x1)
+			pixsolid_opacity(pi, x1, x2-x1, y, h, covered*SAMPLES_X);
+
+		if (fx2)
+			pixsolid_opacity(pi, x2, 1, y, h, covered * fx2);
+	} else if (x1 == x2 && fx2 > fx1) {
+		pixsolid_opacity(pi, x1, 1, y, h, covered * (fx2 - fx1));
+	}
+}
+
+static bool
+composite_unaligned_boxes_inplace__solid(CARD8 op, uint32_t color,
+					 PicturePtr dst, int n, xTrapezoid *t,
+					 bool force_fallback)
+{
+	PixmapPtr pixmap;
+	int16_t dx, dy;
+
+	if (!force_fallback && is_gpu(dst->pDrawable)) {
+		DBG(("%s: fallback -- can not perform operation in place, destination busy\n",
+		     __FUNCTION__));
+
+		return false;
+	}
+
+	/* XXX a8 boxes */
+	if (dst->format == PICT_a8r8g8b8 || dst->format == PICT_x8r8g8b8) {
+		DBG(("%s: fallback -- can not perform operation in place, unhanbled format %08lx\n",
+		     __FUNCTION__, dst->format));
+		goto pixman;
+	}
+
+	pixmap = get_drawable_pixmap(dst->pDrawable);
+	get_drawable_deltas(dst->pDrawable, pixmap, &dx, &dy);
+
+	if (op == PictOpOver && (color >> 24) == 0xff)
+		op = PictOpSrc;
+	if (op == PictOpOver) {
+		struct sna_pixmap *priv = sna_pixmap(pixmap);
+		if (priv && priv->clear && priv->clear_color == 0)
+			op = PictOpSrc;
+	}
+
+	switch (op) {
+	case PictOpSrc:
+		break;
+	default:
+		DBG(("%s: fallback -- can not perform op [%d] in place\n",
+		     __FUNCTION__, op));
+		goto pixman;
+	}
+
+	do {
+		RegionRec clip;
+		BoxPtr extents;
+		int count;
+
+		clip.extents.x1 = pixman_fixed_to_int(t->left.p1.x);
+		clip.extents.x2 = pixman_fixed_to_int(t->right.p1.x + pixman_fixed_1_minus_e);
+		clip.extents.y1 = pixman_fixed_to_int(t->top);
+		clip.extents.y2 = pixman_fixed_to_int(t->bottom + pixman_fixed_1_minus_e);
+		clip.data = NULL;
+
+		if (!sna_compute_composite_region(&clip,
+						   NULL, NULL, dst,
+						   0, 0,
+						   0, 0,
+						   clip.extents.x1, clip.extents.y1,
+						   clip.extents.x2 - clip.extents.x1,
+						   clip.extents.y2 - clip.extents.y1))
+			continue;
+
+		if (!sna_drawable_move_region_to_cpu(dst->pDrawable, &clip,
+						     MOVE_WRITE | MOVE_READ)) {
+			RegionUninit(&clip);
+			continue;
+		}
+
+		RegionTranslate(&clip, dx, dy);
+		count = REGION_NUM_RECTS(&clip);
+		extents = REGION_RECTS(&clip);
+		while (count--) {
+			int16_t y1 = dy + pixman_fixed_to_int(t->top);
+			int16_t fy1 = pixman_fixed_frac(t->top);
+			int16_t y2 = dy + pixman_fixed_to_int(t->bottom);
+			int16_t fy2 = pixman_fixed_frac(t->bottom);
+
+			if (y1 < extents->y1)
+				y1 = extents->y1, fy1 = 0;
+			if (y2 > extents->y2)
+				y2 = extents->y2, fy2 = 0;
+			if (y1 < y2) {
+				if (fy1) {
+					lerp32_unaligned_box_row(pixmap, color, extents,
+								 t, dx, y1, 1,
+								 SAMPLES_Y - grid_coverage(SAMPLES_Y, fy1));
+					y1++;
+				}
+
+				if (y2 > y1)
+					lerp32_unaligned_box_row(pixmap, color, extents,
+								 t, dx, y1, y2 - y1,
+								 SAMPLES_Y);
+
+				if (fy2)
+					lerp32_unaligned_box_row(pixmap, color,  extents,
+								 t, dx, y2, 1,
+								 grid_coverage(SAMPLES_Y, fy2));
+			} else if (y1 == y2 && fy2 > fy1) {
+				lerp32_unaligned_box_row(pixmap, color, extents,
+							 t, dx, y1, 1,
+							 grid_coverage(SAMPLES_Y, fy2) - grid_coverage(SAMPLES_Y, fy1));
+			}
+			extents++;
+		}
+
+		RegionUninit(&clip);
+	} while (--n && t++);
+
+	return true;
+
+pixman:
+	do {
+		struct pixman_inplace pi;
+		RegionRec clip;
+		BoxPtr extents;
+		int count;
+
+		clip.extents.x1 = pixman_fixed_to_int(t->left.p1.x);
+		clip.extents.x2 = pixman_fixed_to_int(t->right.p1.x + pixman_fixed_1_minus_e);
+		clip.extents.y1 = pixman_fixed_to_int(t->top);
+		clip.extents.y2 = pixman_fixed_to_int(t->bottom + pixman_fixed_1_minus_e);
+		clip.data = NULL;
+
+		if (!sna_compute_composite_region(&clip,
+						   NULL, NULL, dst,
+						   0, 0,
+						   0, 0,
+						   clip.extents.x1, clip.extents.y1,
+						   clip.extents.x2 - clip.extents.x1,
+						   clip.extents.y2 - clip.extents.y1))
+			continue;
+
+		if (!sna_drawable_move_region_to_cpu(dst->pDrawable, &clip,
+						     MOVE_WRITE | MOVE_READ)) {
+			RegionUninit(&clip);
+			continue;
+		}
+
+		pi.image = image_from_pict(dst, FALSE, &pi.dx, &pi.dy);
+		pi.source = pixman_image_create_bits(PIXMAN_a8r8g8b8, 1, 1, NULL, 0);
+		pixman_image_set_repeat(pi.source, PIXMAN_REPEAT_NORMAL);
+		pi.bits = pixman_image_get_data(pi.source);
+		pi.op = op;
+
+		count = REGION_NUM_RECTS(&clip);
+		extents = REGION_RECTS(&clip);
+		while (count--) {
+			int16_t y1 = pixman_fixed_to_int(t->top);
+			int16_t fy1 = pixman_fixed_frac(t->top);
+			int16_t y2 = pixman_fixed_to_int(t->bottom);
+			int16_t fy2 = pixman_fixed_frac(t->bottom);
+
+			if (y1 < extents->y1)
+				y1 = extents->y1, fy1 = 0;
+			if (y2 > extents->y2)
+				y2 = extents->y2, fy2 = 0;
+			if (y1 < y2) {
+				if (fy1) {
+					pixsolid_unaligned_box_row(&pi, extents, t, y1, 1,
+								   SAMPLES_Y - grid_coverage(SAMPLES_Y, fy1));
+					y1++;
+				}
+
+				if (y2 > y1)
+					pixsolid_unaligned_box_row(&pi, extents, t, y1, y2 - y1,
+								   SAMPLES_Y);
+
+				if (fy2)
+					pixsolid_unaligned_box_row(&pi, extents, t, y2, 1,
+								   grid_coverage(SAMPLES_Y, fy2));
+			} else if (y1 == y2 && fy2 > fy1) {
+				pixsolid_unaligned_box_row(&pi, extents, t, y1, 1,
+							   grid_coverage(SAMPLES_Y, fy2) - grid_coverage(SAMPLES_Y, fy1));
+			}
+			extents++;
+		}
+
+		RegionUninit(&clip);
+		pixman_image_unref(pi.image);
+		pixman_image_unref(pi.source);
+	} while (--n && t++);
+	return true;
+}
+
+inline static void
+pixmask_opacity(struct pixman_inplace *pi,
+		int16_t x, int16_t w,
+		int16_t y, int16_t h,
+		uint8_t opacity)
+{
+	if (opacity == 0xff) {
+		pixman_image_composite(pi->op, pi->source, NULL, pi->image,
+				       pi->sx + x, pi->sy + y,
+				       0, 0,
+				       pi->dx + x, pi->dy + y,
+				       w, h);
+	} else {
+		*pi->bits = opacity;
+		pixman_image_composite(pi->op, pi->source, pi->mask, pi->image,
+				       pi->sx + x, pi->sy + y,
+				       0, 0,
+				       pi->dx + x, pi->dy + y,
+				       w, h);
+	}
+}
+
+static void
+pixmask_unaligned_box_row(struct pixman_inplace *pi,
+			  const BoxRec *extents,
+			  xTrapezoid *trap,
+			  int16_t y, int16_t h,
+			  uint8_t covered)
+{
+	int16_t x1 = pixman_fixed_to_int(trap->left.p1.x);
+	int16_t fx1 = grid_coverage(SAMPLES_X, trap->left.p1.x);
+	int16_t x2 = pixman_fixed_to_int(trap->right.p1.x);
+	int16_t fx2 = grid_coverage(SAMPLES_X, trap->right.p1.x);
+
+	if (x1 < extents->x1)
+		x1 = extents->x1, fx1 = 0;
+	if (x2 > extents->x2)
+		x2 = extents->x2, fx2 = 0;
+
+	if (x2 < x1) {
+		if (fx1) {
+			pixmask_opacity(pi, x1, 1, y, h,
+					 covered * (SAMPLES_X - fx1));
+			x1++;
+		}
+
+		if (x2 > x1)
+			pixmask_opacity(pi, x1, x2-x1, y, h, covered*SAMPLES_X);
+
+		if (fx2)
+			pixmask_opacity(pi, x2, 1, y, h, covered * fx2);
+	} else if (x1 == x2 && fx2 > fx1) {
+		pixmask_opacity(pi, x1, 1, y, h, covered * (fx2 - fx1));
+	}
+}
+
+static bool
+composite_unaligned_boxes_inplace(CARD8 op,
+				  PicturePtr src, int16_t src_x, int16_t src_y,
+				  PicturePtr dst, int n, xTrapezoid *t,
+				  bool force_fallback)
+{
+	PixmapPtr pixmap;
+	int16_t dx, dy;
+
+	if (!force_fallback && is_gpu(dst->pDrawable)) {
+		DBG(("%s: fallback -- can not perform operation in place, destination busy\n",
+		     __FUNCTION__));
+
+		return false;
+	}
+
+	src_x -= pixman_fixed_to_int(t[0].left.p1.x);
+	src_y -= pixman_fixed_to_int(t[0].left.p1.y);
+
+	pixmap = get_drawable_pixmap(dst->pDrawable);
+	get_drawable_deltas(dst->pDrawable, pixmap, &dx, &dy);
+
+	do {
+		struct pixman_inplace pi;
+		RegionRec clip;
+		BoxPtr extents;
+		int count;
+
+		clip.extents.x1 = pixman_fixed_to_int(t->left.p1.x);
+		clip.extents.x2 = pixman_fixed_to_int(t->right.p1.x + pixman_fixed_1_minus_e);
+		clip.extents.y1 = pixman_fixed_to_int(t->top);
+		clip.extents.y2 = pixman_fixed_to_int(t->bottom + pixman_fixed_1_minus_e);
+		clip.data = NULL;
+
+		if (!sna_compute_composite_region(&clip,
+						   NULL, NULL, dst,
+						   0, 0,
+						   0, 0,
+						   clip.extents.x1, clip.extents.y1,
+						   clip.extents.x2 - clip.extents.x1,
+						   clip.extents.y2 - clip.extents.y1))
+			continue;
+
+		if (!sna_drawable_move_region_to_cpu(dst->pDrawable, &clip,
+						     MOVE_WRITE | MOVE_READ)) {
+			RegionUninit(&clip);
+			continue;
+		}
+
+		pi.image = image_from_pict(dst, FALSE, &pi.dx, &pi.dy);
+		pi.source = image_from_pict(src, TRUE, &pi.sx, &pi.sy);
+		pi.sx += src_x;
+		pi.sy += src_y;
+		pi.mask = pixman_image_create_bits(PIXMAN_a8, 1, 1, NULL, 0);
+		pixman_image_set_repeat(pi.mask, PIXMAN_REPEAT_NORMAL);
+		pi.bits = pixman_image_get_data(pi.mask);
+		pi.op = op;
+
+		count = REGION_NUM_RECTS(&clip);
+		extents = REGION_RECTS(&clip);
+		while (count--) {
+			int16_t y1 = pixman_fixed_to_int(t->top);
+			int16_t fy1 = pixman_fixed_frac(t->top);
+			int16_t y2 = pixman_fixed_to_int(t->bottom);
+			int16_t fy2 = pixman_fixed_frac(t->bottom);
+
+			if (y1 < extents->y1)
+				y1 = extents->y1, fy1 = 0;
+			if (y2 > extents->y2)
+				y2 = extents->y2, fy2 = 0;
+			if (y1 < y2) {
+				if (fy1) {
+					pixmask_unaligned_box_row(&pi, extents, t, y1, 1,
+								   SAMPLES_Y - grid_coverage(SAMPLES_Y, fy1));
+					y1++;
+				}
+
+				if (y2 > y1)
+					pixmask_unaligned_box_row(&pi, extents, t, y1, y2 - y1,
+								   SAMPLES_Y);
+
+				if (fy2)
+					pixmask_unaligned_box_row(&pi, extents, t, y2, 1,
+								   grid_coverage(SAMPLES_Y, fy2));
+			} else if (y1 == y2 && fy2 > fy1) {
+				pixmask_unaligned_box_row(&pi, extents, t, y1, 1,
+							  grid_coverage(SAMPLES_Y, fy2) - grid_coverage(SAMPLES_Y, fy1));
+			}
+			extents++;
+		}
+
+		RegionUninit(&clip);
+		pixman_image_unref(pi.image);
+		pixman_image_unref(pi.source);
+		pixman_image_unref(pi.mask);
+	} while (--n && t++);
+
+	return true;
+}
+
+static bool
 composite_unaligned_boxes_fallback(CARD8 op,
 				   PicturePtr src,
 				   PicturePtr dst,
 				   INT16 src_x, INT16 src_y,
-				   int ntrap, xTrapezoid *traps)
+				   int ntrap, xTrapezoid *traps,
+				   bool force_fallback)
 {
 	ScreenPtr screen = dst->pDrawable->pScreen;
-	INT16 dst_x = pixman_fixed_to_int(traps[0].left.p1.x);
-	INT16 dst_y = pixman_fixed_to_int(traps[0].left.p1.y);
-	int dx = dst->pDrawable->x;
-	int dy = dst->pDrawable->y;
+	uint32_t color;
+	int16_t dst_x, dst_y;
+	int16_t dx, dy;
 	int n;
 
+	if (sna_picture_is_solid(src, &color) &&
+	    composite_unaligned_boxes_inplace__solid(op, color, dst,
+						     ntrap, traps,
+						     force_fallback))
+		return true;
+
+	if (composite_unaligned_boxes_inplace(op, src, src_x, src_y,
+					      dst, ntrap, traps,
+					      force_fallback))
+		return true;
+
+	dst_x = pixman_fixed_to_int(traps[0].left.p1.x);
+	dst_y = pixman_fixed_to_int(traps[0].left.p1.y);
+	dx = dst->pDrawable->x;
+	dy = dst->pDrawable->y;
 	for (n = 0; n < ntrap; n++) {
 		xTrapezoid *t = &traps[n];
 		PixmapPtr scratch;
@@ -2934,10 +3557,16 @@ composite_unaligned_boxes_fallback(CARD8 op,
 						   extents.y2 - extents.y1))
 			continue;
 
-		scratch = sna_pixmap_create_upload(screen,
-						   extents.x2 - extents.x1,
-						   extents.y2 - extents.y1,
-						   8, KGEM_BUFFER_WRITE);
+		if (force_fallback)
+			scratch = sna_pixmap_create_unattached(screen,
+							       extents.x2 - extents.x1,
+							       extents.y2 - extents.y1,
+							       8);
+		else
+			scratch = sna_pixmap_create_upload(screen,
+							   extents.x2 - extents.x1,
+							   extents.y2 - extents.y1,
+							   8, KGEM_BUFFER_WRITE);
 		if (!scratch)
 			continue;
 
@@ -2956,12 +3585,14 @@ composite_unaligned_boxes_fallback(CARD8 op,
 			blt_unaligned_box_row(scratch, &extents, t, y1, y1 + 1,
 					      grid_coverage(SAMPLES_Y, t->bottom) - grid_coverage(SAMPLES_Y, t->top));
 		} else {
-			if (pixman_fixed_frac(t->top))
+			if (pixman_fixed_frac(t->top)) {
 				blt_unaligned_box_row(scratch, &extents, t, y1, y1 + 1,
 						      SAMPLES_Y - grid_coverage(SAMPLES_Y, t->top));
+				y1++;
+			}
 
-			if (y2 > y1 + 1)
-				blt_unaligned_box_row(scratch, &extents, t, y1+1, y2,
+			if (y2 > y1)
+				blt_unaligned_box_row(scratch, &extents, t, y1, y2,
 						      SAMPLES_Y);
 
 			if (pixman_fixed_frac(t->bottom))
@@ -2985,17 +3616,18 @@ composite_unaligned_boxes_fallback(CARD8 op,
 		}
 	}
 
-	return TRUE;
+	return true;
 }
 
-static Bool
+static bool
 composite_unaligned_boxes(struct sna *sna,
 			  CARD8 op,
 			  PicturePtr src,
 			  PicturePtr dst,
 			  PictFormatPtr maskFormat,
 			  INT16 src_x, INT16 src_y,
-			  int ntrap, xTrapezoid *traps)
+			  int ntrap, xTrapezoid *traps,
+			  bool force_fallback)
 {
 	BoxRec extents;
 	struct sna_composite_spans_op tmp;
@@ -3013,9 +3645,13 @@ composite_unaligned_boxes(struct sna *sna,
 	if (ntrap > 1 && maskFormat)
 		return false;
 
-	priv = sna_pixmap(get_drawable_pixmap(dst->pDrawable));
-	if (priv == NULL || !sna->render.composite_spans)
-		return composite_unaligned_boxes_fallback(op, src, dst, src_x, src_y, ntrap, traps);
+	if (force_fallback || !sna->render.composite_spans) {
+fallback:
+		return composite_unaligned_boxes_fallback(op, src, dst,
+							  src_x, src_y,
+							  ntrap, traps,
+							  force_fallback);
+	}
 
 	dst_x = extents.x1 = pixman_fixed_to_int(traps[0].left.p1.x);
 	extents.x2 = pixman_fixed_to_int(traps[0].right.p1.x + pixman_fixed_1_minus_e);
@@ -3077,10 +3713,14 @@ composite_unaligned_boxes(struct sna *sna,
 	switch (op) {
 	case PictOpAdd:
 	case PictOpOver:
+		priv = sna_pixmap(get_drawable_pixmap(dst->pDrawable));
+		assert(priv != NULL);
 		if (priv->clear && priv->clear_color == 0)
 			op = PictOpSrc;
 		break;
 	case PictOpIn:
+		priv = sna_pixmap(get_drawable_pixmap(dst->pDrawable));
+		assert(priv != NULL);
 		if (priv->clear && priv->clear_color == 0)
 			return true;
 		break;
@@ -3097,13 +3737,13 @@ composite_unaligned_boxes(struct sna *sna,
 					 &tmp)) {
 		DBG(("%s: composite spans render op not supported\n",
 		     __FUNCTION__));
-		return false;
+		REGION_UNINIT(NULL, &clip);
+		goto fallback;
 	}
 
 	for (n = 0; n < ntrap; n++)
 		composite_unaligned_trap(sna, &tmp, &traps[n], dx, dy, c);
 	tmp.done(sna, &tmp);
-
 	REGION_UNINIT(NULL, &clip);
 	return true;
 }
@@ -3629,13 +4269,6 @@ struct inplace {
 	};
 };
 
-static force_inline uint8_t
-mul_8_8(uint8_t a, uint8_t b)
-{
-    uint16_t t = a * (uint16_t)b + 0x7f;
-    return ((t >> 8) + t) >> 8;
-}
-
 static force_inline uint8_t coverage_opacity(int coverage, uint8_t opacity)
 {
 	coverage = coverage * 256 / FAST_SAMPLES_XY;
@@ -3798,36 +4431,6 @@ tor_blt_add_clipped(struct sna *sna,
 	pixman_region_fini(&region);
 }
 
-#define ONE_HALF 0x7f
-#define RB_MASK 0x00ff00ff
-#define RB_ONE_HALF 0x007f007f
-#define RB_MASK_PLUS_ONE 0x01000100
-#define G_SHIFT 8
-
-static force_inline uint32_t
-mul8x2_8 (uint32_t a, uint8_t b)
-{
-	uint32_t t = (a & RB_MASK) * b + RB_ONE_HALF;
-	return ((t + ((t >> G_SHIFT) & RB_MASK)) >> G_SHIFT) & RB_MASK;
-}
-
-static force_inline uint32_t
-add8x2_8x2(uint32_t a, uint32_t b)
-{
-	uint32_t t = a + b;
-	t |= RB_MASK_PLUS_ONE - ((t >> G_SHIFT) & RB_MASK);
-	return t & RB_MASK;
-}
-
-static force_inline uint32_t
-lerp8x4(uint32_t src, uint8_t a, uint32_t dst)
-{
-	return (add8x2_8x2(mul8x2_8(src, a),
-			   mul8x2_8(dst, ~a)) |
-		add8x2_8x2(mul8x2_8(src >> G_SHIFT, a),
-			   mul8x2_8(dst >> G_SHIFT, ~a)) << G_SHIFT);
-}
-
 static void
 tor_blt_lerp32(struct sna *sna,
 	       struct sna_composite_spans_op *op,
@@ -4196,7 +4799,6 @@ unbounded_pass:
 static bool
 trapezoid_span_inplace__x8r8g8b8(CARD8 op,
 				 uint32_t color,
-				 PicturePtr src,
 				 PicturePtr dst,
 				 PictFormatPtr maskFormat,
 				 int ntrap, xTrapezoid *traps)
@@ -4234,7 +4836,7 @@ trapezoid_span_inplace__x8r8g8b8(CARD8 op,
 		do {
 			/* XXX unwind errors? */
 			if (!trapezoid_span_inplace__x8r8g8b8(op, color,
-							      src, dst, NULL,
+							      dst, NULL,
 							      1, traps++))
 				return false;
 		} while (--ntrap);
@@ -4252,7 +4854,7 @@ trapezoid_span_inplace__x8r8g8b8(CARD8 op,
 	     region.extents.x2, region.extents.y2));
 
 	if (!sna_compute_composite_extents(&region.extents,
-					   src, NULL, dst,
+					   NULL, NULL, dst,
 					   0, 0,
 					   0, 0,
 					   region.extents.x1, region.extents.y1,
@@ -4372,7 +4974,7 @@ trapezoid_span_inplace(CARD8 op, PicturePtr src, PicturePtr dst,
 
 	if (dst->format == PICT_a8r8g8b8 || dst->format == PICT_x8r8g8b8)
 		return trapezoid_span_inplace__x8r8g8b8(op, color,
-							src, dst, maskFormat,
+							dst, maskFormat,
 							ntrap, traps);
 
 	if (dst->format != PICT_a8) {
@@ -4454,8 +5056,8 @@ trapezoid_span_inplace(CARD8 op, PicturePtr src, PicturePtr dst,
 	     region.extents.x2, region.extents.y2));
 
 	if (!sna_compute_composite_extents(&region.extents,
-					   src, NULL, dst,
-					   src_x, src_y,
+					   NULL, NULL, dst,
+					   0, 0,
 					   0, 0,
 					   region.extents.x1, region.extents.y1,
 					   region.extents.x2 - region.extents.x1,
@@ -4687,7 +5289,7 @@ sna_composite_trapezoids(CARD8 op,
 	PixmapPtr pixmap = get_drawable_pixmap(dst->pDrawable);
 	struct sna *sna = to_sna_from_pixmap(pixmap);
 	struct sna_pixmap *priv;
-	bool rectilinear, pixel_aligned;
+	bool rectilinear, pixel_aligned, force_fallback;
 	unsigned flags;
 	int n;
 
@@ -4718,12 +5320,14 @@ sna_composite_trapezoids(CARD8 op,
 		goto fallback;
 	}
 
-	if (too_small(priv) && !picture_is_gpu(src)) {
-		DBG(("%s: fallback -- dst is too small, %dx%d\n",
+	force_fallback = false;
+	if ((too_small(priv) || DAMAGE_IS_ALL(priv->cpu_damage)) &&
+	    !picture_is_gpu(src)) {
+		DBG(("%s: force fallbacks -- dst is too small, %dx%d\n",
 		     __FUNCTION__,
 		     dst->pDrawable->width,
 		     dst->pDrawable->height));
-		goto fallback;
+		force_fallback = true;
 	}
 
 	/* scan through for fast rectangles */
@@ -4769,18 +5373,23 @@ sna_composite_trapezoids(CARD8 op,
 			if (composite_aligned_boxes(sna, op, src, dst,
 						    maskFormat,
 						    xSrc, ySrc,
-						    ntrap, traps))
+						    ntrap, traps,
+						    force_fallback))
 			    return;
 		} else {
 			if (composite_unaligned_boxes(sna, op, src, dst,
 						      maskFormat,
 						      xSrc, ySrc,
-						      ntrap, traps))
+						      ntrap, traps,
+						      force_fallback))
 				return;
 		}
 		flags |= COMPOSITE_SPANS_RECTILINEAR;
 	}
 
+	if (force_fallback)
+		goto fallback;
+
 	if (is_mono(dst, maskFormat) &&
 	    mono_trapezoids_span_converter(op, src, dst,
 					   xSrc, ySrc,
commit ae3c0963790cfb6f984ed4ad3ecbaae492775e1b
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Jun 15 16:25:51 2012 +0100

    sna: Composite glyphs inplace if the CPU is already all-damaged
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_glyphs.c b/src/sna/sna_glyphs.c
index 17c42d5..3d24ea4 100644
--- a/src/sna/sna_glyphs.c
+++ b/src/sna/sna_glyphs.c
@@ -1364,7 +1364,8 @@ sna_glyphs(CARD8 op,
 		goto fallback;
 	}
 
-	if (too_small(priv) && !picture_is_gpu(src)) {
+	if ((too_small(priv) || DAMAGE_IS_ALL(priv->cpu_damage)) &&
+	    !picture_is_gpu(src)) {
 		DBG(("%s: fallback -- too small (%dx%d)\n",
 		     __FUNCTION__, dst->pDrawable->width, dst->pDrawable->height));
 		goto fallback;
commit eaed58b2baf30eaea37be06cfc1d9d81059aba27
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sat Jun 16 12:55:54 2012 +0100

    sna: Tweak placement of operations
    
    Take in account busyness of the damaged GPU bo for considering placement
    of the subsequent operations. In particular, note that is_cpu is only
    used for when we feel like the following operation would be better on
    the CPU and just want to confirm that doing so will not stall.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 2eee01d..ed996df 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -1282,17 +1282,17 @@ static inline bool region_inplace(struct sna *sna,
 	if (wedged(sna))
 		return false;
 
-	if (priv->flush) {
-		DBG(("%s: yes, exported via dri, will flush\n", __FUNCTION__));
-		return true;
-	}
-
 	if (priv->cpu_damage &&
 	    region_overlaps_damage(region, priv->cpu_damage)) {
 		DBG(("%s: no, uncovered CPU damage pending\n", __FUNCTION__));
 		return false;
 	}
 
+	if (priv->flush) {
+		DBG(("%s: yes, exported via dri, will flush\n", __FUNCTION__));
+		return true;
+	}
+
 	if (priv->mapped) {
 		DBG(("%s: yes, already mapped, continuiung\n", __FUNCTION__));
 		return true;
diff --git a/src/sna/sna_render_inline.h b/src/sna/sna_render_inline.h
index b9acea1..15512fd 100644
--- a/src/sna/sna_render_inline.h
+++ b/src/sna/sna_render_inline.h
@@ -75,7 +75,7 @@ is_gpu(DrawablePtr drawable)
 	if (priv == NULL || priv->clear)
 		return false;
 
-	if (priv->gpu_damage || (priv->gpu_bo && kgem_bo_is_busy(priv->gpu_bo) && !priv->gpu_bo->proxy))
+	if (DAMAGE_IS_ALL(priv->gpu_damage) || (priv->gpu_bo && kgem_bo_is_busy(priv->gpu_bo) && !priv->gpu_bo->proxy))
 		return true;
 
 	return priv->cpu_bo && kgem_bo_is_busy(priv->cpu_bo);
@@ -85,10 +85,14 @@ static inline Bool
 is_cpu(DrawablePtr drawable)
 {
 	struct sna_pixmap *priv = sna_pixmap_from_drawable(drawable);
-	if (priv == NULL || priv->clear || DAMAGE_IS_ALL(priv->cpu_damage))
+	if (priv == NULL || priv->gpu_bo == NULL || priv->clear || DAMAGE_IS_ALL(priv->cpu_damage))
 		return true;
 
-	if (priv->gpu_damage || (priv->cpu_bo && kgem_bo_is_busy(priv->cpu_bo)))
+	assert(!priv->gpu_bo->proxy); /* should be marked as cpu damaged */
+	if (priv->gpu_damage && kgem_bo_is_busy(priv->gpu_bo));
+		return false;
+
+	if (priv->cpu_bo && kgem_bo_is_busy(priv->cpu_bo))
 		return false;
 
 	return true;
@@ -106,7 +110,7 @@ too_small(struct sna_pixmap *priv)
 {
 	assert(priv);
 
-	if (priv->gpu_damage)
+	if (priv->gpu_bo)
 		return false;
 
 	if (priv->cpu_bo && kgem_bo_is_busy(priv->cpu_bo))
@@ -120,7 +124,7 @@ unattached(DrawablePtr drawable)
 {
 	struct sna_pixmap *priv = sna_pixmap_from_drawable(drawable);
 
-	if (priv == NULL)
+	if (priv == NULL || DAMAGE_IS_ALL(priv->cpu_damage))
 		return true;
 
 	return priv->gpu_bo == NULL && priv->cpu_bo == NULL;
commit 8eac098962891a5deb7c53d36c6dec57c7f2b972
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sun Jun 17 09:51:12 2012 +0100

    sna/gen3: Add another DBG for dropping vbo
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/gen3_render.c b/src/sna/gen3_render.c
index e58cdd6..62b3a8e 100644
--- a/src/sna/gen3_render.c
+++ b/src/sna/gen3_render.c
@@ -1697,6 +1697,7 @@ static void gen3_vertex_close(struct sna *sna)
 			DBG(("%s: converting CPU map to GTT\n", __FUNCTION__));
 			sna->render.vertices = kgem_bo_map__gtt(&sna->kgem, bo);
 			if (sna->render.vertices == NULL) {
+				DBG(("%s: discarding non-mappable vertices\n",__FUNCTION__));
 				sna->render.vbo = NULL;
 				sna->render.vertices = sna->render.vertex_data;
 				sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data);


More information about the xorg-commit mailing list