xf86-video-intel: 11 commits - src/sna/blt.c src/sna/kgem.c src/sna/kgem.h src/sna/sna_accel.c src/sna/sna_io.c src/sna/sna_trapezoids_imprecise.c

Chris Wilson ickle at kemper.freedesktop.org
Sat Jun 28 23:17:55 PDT 2014


 src/sna/blt.c                      |  152 ++++++++++++++++---------------------
 src/sna/kgem.c                     |    9 +-
 src/sna/kgem.h                     |   10 ++
 src/sna/sna_accel.c                |  151 +++++++++++++++++++++++++++++-------
 src/sna/sna_io.c                   |   10 +-
 src/sna/sna_trapezoids_imprecise.c |   17 +++-
 6 files changed, 228 insertions(+), 121 deletions(-)

New commits:
commit 6b906ae742ec96eeef403191d3cdded6a23a70b7
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sun Jun 29 07:02:44 2014 +0100

    sna: Update allocation of CPU bo to avoid creating active buffers
    
    Since we now prefer CPU detiling, exactly when we want active/inactive
    buffers is a little more complex - and we also need to take into account
    when we want to use the CPU bo as a render target.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index efcde3e..54fece4 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -508,7 +508,7 @@ static bool must_check
 sna_pixmap_alloc_cpu(struct sna *sna,
 		     PixmapPtr pixmap,
 		     struct sna_pixmap *priv,
-		     bool from_gpu)
+		     unsigned flags)
 {
 	/* Restore after a GTT mapping? */
 	assert(priv->gpu_damage == NULL || priv->gpu_bo);
@@ -520,14 +520,21 @@ sna_pixmap_alloc_cpu(struct sna *sna,
 	assert(priv->stride);
 
 	if (priv->create & KGEM_CAN_CREATE_CPU) {
+		unsigned hint;
+
 		DBG(("%s: allocating CPU buffer (%dx%d)\n", __FUNCTION__,
 		     pixmap->drawable.width, pixmap->drawable.height));
 
+		hint = 0;
+		if ((flags & MOVE_ASYNC_HINT) == 0 &&
+		    ((flags & MOVE_READ) == 0 || (priv->gpu_damage && !priv->clear && !sna->kgem.has_llc)))
+			hint = CREATE_CPU_MAP | CREATE_INACTIVE | CREATE_NO_THROTTLE;
+
 		priv->cpu_bo = kgem_create_cpu_2d(&sna->kgem,
 						  pixmap->drawable.width,
 						  pixmap->drawable.height,
 						  pixmap->drawable.bitsPerPixel,
-						  from_gpu ? 0 : CREATE_CPU_MAP | CREATE_INACTIVE | CREATE_NO_THROTTLE);
+						  hint);
 		if (priv->cpu_bo) {
 			priv->ptr = kgem_bo_map__cpu(&sna->kgem, priv->cpu_bo);
 			if (priv->ptr) {
@@ -2165,7 +2172,8 @@ _sna_pixmap_move_to_cpu(PixmapPtr pixmap, unsigned int flags)
 skip_inplace_map:
 		sna_damage_destroy(&priv->gpu_damage);
 		priv->clear = false;
-		if (priv->cpu_bo && !priv->cpu_bo->flush &&
+		if ((flags & MOVE_ASYNC_HINT) == 0 &&
+		    priv->cpu_bo && !priv->cpu_bo->flush &&
 		    __kgem_bo_is_busy(&sna->kgem, priv->cpu_bo)) {
 			DBG(("%s: discarding busy CPU bo\n", __FUNCTION__));
 			assert(!priv->shm);
@@ -2175,7 +2183,7 @@ skip_inplace_map:
 			sna_pixmap_free_cpu(sna, priv, false);
 
 			assert(priv->mapped == MAPPED_NONE);
-			if (!sna_pixmap_alloc_cpu(sna, pixmap, priv, false))
+			if (!sna_pixmap_alloc_cpu(sna, pixmap, priv, 0))
 				return false;
 			assert(priv->mapped == MAPPED_NONE);
 			assert(pixmap->devPrivate.ptr == PTR(priv->ptr));
@@ -2277,8 +2285,7 @@ skip_inplace_map:
 
 	assert(priv->mapped == MAPPED_NONE);
 	if (pixmap->devPrivate.ptr == NULL &&
-	    !sna_pixmap_alloc_cpu(sna, pixmap, priv,
-				  flags & MOVE_READ ? priv->gpu_damage && !priv->clear : 0))
+	    !sna_pixmap_alloc_cpu(sna, pixmap, priv, flags))
 		return false;
 	assert(priv->mapped == MAPPED_NONE);
 	assert(pixmap->devPrivate.ptr == PTR(priv->ptr));
@@ -2290,6 +2297,15 @@ skip_inplace_map:
 			     pixmap->devKind, pixmap->devKind * pixmap->drawable.height));
 
 			if (priv->cpu_bo) {
+				if ((flags & MOVE_ASYNC_HINT || priv->cpu_bo->exec) &&
+				    sna->render.fill_one(sna,
+							  pixmap, priv->cpu_bo, priv->clear_color,
+							  0, 0,
+							  pixmap->drawable.width,
+							  pixmap->drawable.height,
+							  GXcopy))
+					goto clear_done;
+
 				DBG(("%s: syncing CPU bo\n", __FUNCTION__));
 				kgem_bo_sync__cpu(&sna->kgem, priv->cpu_bo);
 				assert(pixmap->devPrivate.ptr == MAP(priv->cpu_bo->map__cpu));
@@ -2311,6 +2327,7 @@ skip_inplace_map:
 					    priv->clear_color);
 			}
 
+clear_done:
 			sna_damage_all(&priv->cpu_damage, pixmap);
 			sna_pixmap_free_gpu(sna, priv);
 			assert(priv->gpu_damage == NULL);
@@ -2474,6 +2491,27 @@ static inline bool region_inplace(struct sna *sna,
 		>= sna->kgem.half_cpu_cache_pages;
 }
 
+static bool cpu_clear_boxes(struct sna *sna,
+			    PixmapPtr pixmap,
+			    struct sna_pixmap *priv,
+			    const BoxRec *box, int n)
+{
+	struct sna_fill_op fill;
+
+	if (!sna_fill_init_blt(&fill, sna,
+			       pixmap, priv->cpu_bo,
+			       GXcopy, priv->clear_color,
+			       FILL_BOXES)) {
+		DBG(("%s: unsupported fill\n",
+		     __FUNCTION__));
+		return false;
+	}
+
+	fill.boxes(sna, &fill, box, n);
+	fill.done(sna, &fill);
+	return true;
+}
+
 bool
 sna_drawable_move_region_to_cpu(DrawablePtr drawable,
 				RegionPtr region,
@@ -2602,7 +2640,7 @@ sna_drawable_move_region_to_cpu(DrawablePtr drawable,
 		sna_pixmap_unmap(pixmap, priv);
 		assert(priv->mapped == MAPPED_NONE);
 		if (pixmap->devPrivate.ptr == NULL &&
-		    !sna_pixmap_alloc_cpu(sna, pixmap, priv, false))
+		    !sna_pixmap_alloc_cpu(sna, pixmap, priv, flags))
 			return false;
 		assert(priv->mapped == MAPPED_NONE);
 		assert(pixmap->devPrivate.ptr == PTR(priv->ptr));
@@ -2788,8 +2826,7 @@ move_to_cpu:
 
 	assert(priv->mapped == MAPPED_NONE);
 	if (pixmap->devPrivate.ptr == NULL &&
-	    !sna_pixmap_alloc_cpu(sna, pixmap, priv,
-				  flags & MOVE_READ ? priv->gpu_damage && !priv->clear : 0)) {
+	    !sna_pixmap_alloc_cpu(sna, pixmap, priv, flags)) {
 		DBG(("%s: CPU bo allocation failed, trying full move-to-cpu\n", __FUNCTION__));
 		goto move_to_cpu;
 	}
@@ -2819,6 +2856,10 @@ move_to_cpu:
 
 		DBG(("%s: pending clear, doing partial fill\n", __FUNCTION__));
 		if (priv->cpu_bo) {
+			if ((flags & MOVE_ASYNC_HINT || priv->cpu_bo->exec) &&
+			    cpu_clear_boxes(sna, pixmap, priv, box, n))
+				goto clear_done;
+
 			DBG(("%s: syncing CPU bo\n", __FUNCTION__));
 			kgem_bo_sync__cpu(&sna->kgem, priv->cpu_bo);
 			assert(pixmap->devPrivate.ptr == MAP(priv->cpu_bo->map__cpu));
@@ -2836,6 +2877,7 @@ move_to_cpu:
 			box++;
 		} while (--n);
 
+clear_done:
 		if (flags & MOVE_WRITE ||
 		    region->extents.x2 - region->extents.x1 > 1 ||
 		    region->extents.y2 - region->extents.y1 > 1) {
commit b961d7323369284ea2c3db47d30c27ffe01a9040
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sun Jun 29 07:00:58 2014 +0100

    sna: Sync CPU bo before writes
    
    Fixes regression from
    
    commit 961139f5878572ebea268a0bbf47caf05af9093f [2.99.912]
    Author: Chris Wilson <chris at chris-wilson.co.uk>
    Date:   Fri May 30 09:45:15 2014 +0100
    
        sna: Use manual detiling for downloads
    
    Reported-by: Harald Judt <h.judt at gmx.at>
    Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=80560
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 7a9610c..efcde3e 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -1679,14 +1679,22 @@ static inline bool gpu_bo_download(struct sna *sna,
 	if (!kgem_bo_can_map__cpu(&sna->kgem, priv->gpu_bo, FORCE_FULL_SYNC))
 		return false;
 
-	if (idle && __kgem_bo_is_busy(&sna->kgem, priv->gpu_bo))
-		return false;
+	if (idle) {
+		if (__kgem_bo_is_busy(&sna->kgem, priv->gpu_bo))
+			return false;
+
+		if (priv->cpu_bo && __kgem_bo_is_busy(&sna->kgem, priv->cpu_bo))
+			return false;
+	}
 
 	src = kgem_bo_map__cpu(&sna->kgem, priv->gpu_bo);
 	if (src == NULL)
 		return false;
 
 	kgem_bo_sync__cpu_full(&sna->kgem, priv->gpu_bo, FORCE_FULL_SYNC);
+
+	if (priv->cpu_bo)
+		kgem_bo_sync__cpu(&sna->kgem, priv->cpu_bo);
 	assert(has_coherent_ptr(sna, priv, MOVE_WRITE));
 
 	if (sigtrap_get())
commit 53ef9e762a6e7802b3d5f8fba9ac17ff95545c10
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sat Jun 28 21:07:08 2014 +0100

    sna: Only preferentially upload through the GTT for large transfers
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 3e5b036..7a9610c 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -4708,6 +4708,11 @@ try_upload__inplace(PixmapPtr pixmap, RegionRec *region,
 		break;
 	}
 
+	if (priv->gpu_damage == NULL && !box_inplace(pixmap, &region->extents)) {
+		DBG(("%s: no, too small to bother with using the GTT\n", __FUNCTION__));
+		return false;
+	}
+
 	if (!kgem_bo_can_map(&sna->kgem, priv->gpu_bo)) {
 		DBG(("%s: no, cannot map through the CPU\n", __FUNCTION__));
 		return false;
commit 0955f12ae04011593b71817e3151b8fb7c228899
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sat Jun 28 17:38:54 2014 +0100

    sna: Prefer linear if below tile_width
    
    Be stricter in order to allow greater use of CPU bo.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 22aef25..3f56c32 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -4194,13 +4194,13 @@ int kgem_choose_tiling(struct kgem *kgem, int tiling, int width, int height, int
 		goto done;
 	}
 
-	if (tiling == I915_TILING_X && width * bpp <= 8*8*512/10) {
+	if (tiling == I915_TILING_X && width * bpp <= 8*512) {
 		DBG(("%s: too thin [width %d, %d bpp] for TILING_X\n",
 		     __FUNCTION__, width, bpp));
 		tiling = I915_TILING_NONE;
 		goto done;
 	}
-	if (tiling == I915_TILING_Y && width * bpp <= 8*8*128/10) {
+	if (tiling == I915_TILING_Y && width * bpp < 8*128) {
 		DBG(("%s: too thin [%d] for TILING_Y\n",
 		     __FUNCTION__, width));
 		tiling = I915_TILING_NONE;
commit 3ef966f4c5bae07108ce2720f4da3c3c4e41e1aa
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sat Jun 28 14:23:29 2014 +0100

    sna/io: Prefer CPU copies on LLC
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_io.c b/src/sna/sna_io.c
index 9e175a7..eaa2052 100644
--- a/src/sna/sna_io.c
+++ b/src/sna/sna_io.c
@@ -117,6 +117,8 @@ read_boxes_inplace__cpu(struct kgem *kgem,
 	if (sigtrap_get())
 		return false;
 
+	DBG(("%s x %d\n", __FUNCTION__, n));
+
 	if (bo->tiling == I915_TILING_X) {
 		do {
 			memcpy_from_tiled_x(kgem, src, dst, bpp, src_pitch, dst_pitch,
@@ -210,10 +212,13 @@ static bool download_inplace(struct kgem *kgem,
 	if (FORCE_INPLACE)
 		return FORCE_INPLACE > 0;
 
+	if (cpu)
+		return true;
+
 	if (kgem->can_blt_cpu && kgem->max_cpu_size)
 		return false;
 
-	return !__kgem_bo_is_busy(kgem, bo) || cpu;
+	return !__kgem_bo_is_busy(kgem, bo);
 }
 
 void sna_read_boxes(struct sna *sna, PixmapPtr dst, struct kgem_bo *src_bo,
@@ -253,7 +258,7 @@ void sna_read_boxes(struct sna *sna, PixmapPtr dst, struct kgem_bo *src_bo,
 	 * this path.
 	 */
 
-	if (download_inplace(kgem, dst, src_bo, box ,nbox)) {
+	if (download_inplace(kgem, dst, src_bo, box, nbox)) {
 fallback:
 		read_boxes_inplace(kgem, dst, src_bo, box, nbox);
 		return;
commit 9fc052da5c4246402d2707b3a91efffa7dd81e08
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sat Jun 28 14:22:17 2014 +0100

    sna: Don't discard damage for SHM pixmaps
    
    We don't really want to render into SHM pixmaps, except for copying
    back, due to the strict serialisation requirements.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 2a4c567..3e5b036 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -6126,7 +6126,11 @@ upload_inplace:
 
 static void discard_cpu_damage(struct sna *sna, struct sna_pixmap *priv)
 {
+	if (priv->cpu_damage == NULL && !priv->shm)
+		return;
+
 	DBG(("%s: discarding existing CPU damage\n", __FUNCTION__));
+
 	if (kgem_bo_discard_cache(priv->gpu_bo, true)) {
 		DBG(("%s: discarding cached upload buffer\n", __FUNCTION__));
 		assert(DAMAGE_IS_ALL(priv->cpu_damage));
@@ -6137,6 +6141,7 @@ static void discard_cpu_damage(struct sna *sna, struct sna_pixmap *priv)
 		kgem_bo_destroy(&sna->kgem, priv->gpu_bo);
 		priv->gpu_bo = NULL;
 	}
+
 	sna_damage_destroy(&priv->cpu_damage);
 	list_del(&priv->flush_list);
 
commit 0f8b39d24ff15cf3373ac7293f12772ebe16b68b
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sat Jun 28 14:21:36 2014 +0100

    sna: Check for a mappable GPU bo before migrating damage
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index b7e3d90..2a4c567 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -4520,6 +4520,14 @@ try_upload__tiled_x(PixmapPtr pixmap, RegionRec *region,
 		return false;
 	}
 
+	if (!sna_pixmap_move_area_to_gpu(pixmap, &region->extents,
+					 MOVE_WRITE | (region->data ? MOVE_READ : 0)))
+		return false;
+
+	if ((priv->create & KGEM_CAN_CREATE_LARGE) == 0 &&
+	    __kgem_bo_is_busy(&sna->kgem, priv->gpu_bo))
+		return false;
+
 	dst = kgem_bo_map__cpu(&sna->kgem, priv->gpu_bo);
 	if (dst == NULL)
 		return false;
@@ -4622,6 +4630,11 @@ try_upload__inplace(PixmapPtr pixmap, RegionRec *region,
 	if (!USE_INPLACE)
 		return false;
 
+	assert(priv);
+
+	if (priv->shm && priv->gpu_damage == NULL)
+		return false;
+
 	replaces = region_subsumes_pixmap(region, pixmap);
 
 	DBG(("%s: bo? %d, can map? %d, replaces? %d\n", __FUNCTION__,
@@ -4678,18 +4691,10 @@ try_upload__inplace(PixmapPtr pixmap, RegionRec *region,
 		}
 	}
 
-	if (!sna_pixmap_move_area_to_gpu(pixmap, &region->extents,
-					 MOVE_WRITE | (region->data ? MOVE_READ : 0)))
-		return false;
-
 	if (priv->gpu_bo == NULL &&
 	    !create_upload_tiled_x(&sna->kgem, pixmap, priv, ignore_cpu))
 		return false;
 
-	if ((priv->create & KGEM_CAN_CREATE_LARGE) == 0 &&
-	    __kgem_bo_is_busy(&sna->kgem, priv->gpu_bo))
-		return false;
-
 	DBG(("%s: tiling=%d\n", __FUNCTION__, priv->gpu_bo->tiling));
 	switch (priv->gpu_bo->tiling) {
 	case I915_TILING_Y:
@@ -4708,6 +4713,14 @@ try_upload__inplace(PixmapPtr pixmap, RegionRec *region,
 		return false;
 	}
 
+	if (!sna_pixmap_move_area_to_gpu(pixmap, &region->extents,
+					 MOVE_WRITE | (region->data ? MOVE_READ : 0)))
+		return false;
+
+	if ((priv->create & KGEM_CAN_CREATE_LARGE) == 0 &&
+	    __kgem_bo_is_busy(&sna->kgem, priv->gpu_bo))
+		return false;
+
 	dst = kgem_bo_map(&sna->kgem, priv->gpu_bo);
 	if (dst == NULL)
 		return false;
@@ -4768,6 +4781,14 @@ done:
 			sna_damage_destroy(&priv->cpu_damage);
 		else
 			sna_damage_subtract(&priv->cpu_damage, region);
+
+		if (priv->cpu_damage == NULL) {
+			list_del(&priv->flush_list);
+			sna_damage_all(&priv->gpu_damage, pixmap);
+		}
+
+		if (priv->shm)
+			sna_add_flush_pixmap(sna, priv, priv->cpu_bo);
 	}
 
 	assert(!priv->clear);
@@ -4895,6 +4916,7 @@ try_upload__fast(PixmapPtr pixmap, RegionRec *region,
 		return false;
 
 	if (ignore_cpu_damage(sna, priv, region)) {
+		DBG(("%s: ignore existing cpu damage (if any)\n", __FUNCTION__));
 		if (try_upload__inplace(pixmap, region, x, y, w, h, bits, stride))
 			return true;
 	}
@@ -6057,6 +6079,8 @@ upload_inplace:
 	}
 	dst_priv->clear = false;
 
+	assert(has_coherent_ptr(sna, src_priv, MOVE_READ));
+
 	box = region_rects(region);
 	n = region_num_rects(region);
 	if (dst_priv->gpu_bo->tiling) {
commit cfdaee4a7e45689b0fbbc8c3166d28d69797e759
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sat Jun 28 14:20:00 2014 +0100

    sna: Skip adding damage if it is already contained
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 9f5c0b4..b7e3d90 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -2865,7 +2865,7 @@ move_to_cpu:
 
 				DBG(("%s: region already in CPU damage\n",
 				     __FUNCTION__));
-				goto done;
+				goto already_damaged;
 			}
 		}
 
@@ -2986,6 +2986,7 @@ done:
 		}
 	}
 
+already_damaged:
 	if (dx | dy)
 		RegionTranslate(region, -dx, -dy);
 
commit 80752fb2794faa581d891b24148eaf51c42afd25
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sat Jun 28 14:19:22 2014 +0100

    sna: Tidy calling memcpy_from_tiled
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 898f943..22aef25 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -5968,6 +5968,7 @@ void *kgem_bo_map__cpu(struct kgem *kgem, struct kgem_bo *bo)
 	assert(!bo->purged);
 	assert(list_is_empty(&bo->list));
 	assert(bo->proxy == NULL);
+	assert_tiling(kgem, bo);
 
 	if (bo->map__cpu)
 		return MAP(bo->map__cpu);
@@ -6086,6 +6087,8 @@ void kgem_bo_sync__cpu(struct kgem *kgem, struct kgem_bo *bo)
 {
 	DBG(("%s: handle=%d\n", __FUNCTION__, bo->handle));
 	assert(!bo->scanout);
+	assert_tiling(kgem, bo);
+
 	kgem_bo_submit(kgem, bo);
 
 	/* SHM pixmaps use proxies for subpage offsets */
@@ -6120,6 +6123,7 @@ void kgem_bo_sync__cpu_full(struct kgem *kgem, struct kgem_bo *bo, bool write)
 {
 	DBG(("%s: handle=%d\n", __FUNCTION__, bo->handle));
 	assert(!bo->scanout || !write);
+	assert_tiling(kgem, bo);
 
 	if (write || bo->needs_flush)
 		kgem_bo_submit(kgem, bo);
@@ -6165,6 +6169,7 @@ void kgem_bo_sync__gtt(struct kgem *kgem, struct kgem_bo *bo)
 	DBG(("%s: handle=%d\n", __FUNCTION__, bo->handle));
 	assert(bo->refcnt);
 	assert(bo->proxy == NULL);
+	assert_tiling(kgem, bo);
 
 	kgem_bo_submit(kgem, bo);
 
diff --git a/src/sna/kgem.h b/src/sna/kgem.h
index e66bffb..be9b7e8 100644
--- a/src/sna/kgem.h
+++ b/src/sna/kgem.h
@@ -787,6 +787,11 @@ memcpy_to_tiled_x(struct kgem *kgem,
 		  int16_t dst_x, int16_t dst_y,
 		  uint16_t width, uint16_t height)
 {
+	assert(kgem->memcpy_to_tiled_x);
+	assert(src_x >= 0 && src_y >= 0);
+	assert(dst_x >= 0 && dst_y >= 0);
+	assert(8*src_stride >= (src_x+width) * bpp);
+	assert(8*dst_stride >= (dst_x+width) * bpp);
 	return kgem->memcpy_to_tiled_x(src, dst, bpp,
 				       src_stride, dst_stride,
 				       src_x, src_y,
@@ -802,6 +807,11 @@ memcpy_from_tiled_x(struct kgem *kgem,
 		    int16_t dst_x, int16_t dst_y,
 		    uint16_t width, uint16_t height)
 {
+	assert(kgem->memcpy_from_tiled_x);
+	assert(src_x >= 0 && src_y >= 0);
+	assert(dst_x >= 0 && dst_y >= 0);
+	assert(8*src_stride >= (src_x+width) * bpp);
+	assert(8*dst_stride >= (dst_x+width) * bpp);
 	return kgem->memcpy_from_tiled_x(src, dst, bpp,
 					 src_stride, dst_stride,
 					 src_x, src_y,
diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index a559907..9f5c0b4 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -1693,30 +1693,32 @@ static inline bool gpu_bo_download(struct sna *sna,
 		return false;
 
 	if (priv->gpu_bo->tiling) {
+		int bpp = priv->pixmap->drawable.bitsPerPixel;
+		void *dst = priv->pixmap->devPrivate.ptr;
+		int dst_pitch = priv->pixmap->devKind;
+
 		DBG(("%s: download through a tiled CPU map\n", __FUNCTION__));
 		do {
 			DBG(("%s: box (%d, %d), (%d, %d)\n",
 			     __FUNCTION__, box->x1, box->y1, box->x2, box->y2));
-			memcpy_from_tiled_x(&sna->kgem, src,
-					    priv->pixmap->devPrivate.ptr,
-					    priv->pixmap->drawable.bitsPerPixel,
-					    priv->gpu_bo->pitch,
-					    priv->pixmap->devKind,
+			memcpy_from_tiled_x(&sna->kgem, src, dst, bpp,
+					    priv->gpu_bo->pitch, dst_pitch,
 					    box->x1, box->y1,
 					    box->x1, box->y1,
 					    box->x2 - box->x1, box->y2 - box->y1);
 			box++;
 		} while (--n);
 	} else {
+		int bpp = priv->pixmap->drawable.bitsPerPixel;
+		void *dst = priv->pixmap->devPrivate.ptr;
+		int dst_pitch = priv->pixmap->devKind;
+
 		DBG(("%s: download through a linear CPU map\n", __FUNCTION__));
 		do {
 			DBG(("%s: box (%d, %d), (%d, %d)\n",
 			     __FUNCTION__, box->x1, box->y1, box->x2, box->y2));
-			memcpy_blt(src,
-				   priv->pixmap->devPrivate.ptr,
-				   priv->pixmap->drawable.bitsPerPixel,
-				   priv->gpu_bo->pitch,
-				   priv->pixmap->devKind,
+			memcpy_blt(src, dst, bpp,
+				   priv->gpu_bo->pitch, dst_pitch,
 				   box->x1, box->y1,
 				   box->x1, box->y1,
 				   box->x2 - box->x1, box->y2 - box->y1);
@@ -4934,6 +4936,10 @@ sna_put_zpixmap_blt(DrawablePtr drawable, GCPtr gc, RegionPtr region,
 	get_drawable_deltas(drawable, pixmap, &dx, &dy);
 	x += dx + drawable->x;
 	y += dy + drawable->y;
+	assert(region->extents.x1 >= x);
+	assert(region->extents.y1 >= y);
+	assert(region->extents.x2 <= x + w);
+	assert(region->extents.y2 <= y + h);
 
 	if (try_upload__fast(pixmap, region, x, y, w, h, bits, stride))
 		return true;
diff --git a/src/sna/sna_io.c b/src/sna/sna_io.c
index f464dce..9e175a7 100644
--- a/src/sna/sna_io.c
+++ b/src/sna/sna_io.c
@@ -118,7 +118,6 @@ read_boxes_inplace__cpu(struct kgem *kgem,
 		return false;
 
 	if (bo->tiling == I915_TILING_X) {
-		assert(kgem->memcpy_from_tiled_x);
 		do {
 			memcpy_from_tiled_x(kgem, src, dst, bpp, src_pitch, dst_pitch,
 					    box->x1, box->y1,
commit 2a0176379f0ff290d276adc72d44dfddafd96da5
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sat Jun 28 14:18:23 2014 +0100

    sna: Micro-optimise unswizzling tiling/detiling
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/blt.c b/src/sna/blt.c
index b61f88b..b5bfee6 100644
--- a/src/sna/blt.c
+++ b/src/sna/blt.c
@@ -233,55 +233,47 @@ memcpy_to_tiled_x__swizzle_0(const void *src, void *dst, int bpp,
 	const unsigned tile_size = 4096;
 
 	const unsigned cpp = bpp / 8;
-	const unsigned stride_tiles = dst_stride / tile_width;
-	const unsigned swizzle_pixels = tile_width / cpp;
-	const unsigned tile_pixels = ffs(tile_width / cpp) - 1;
-	const unsigned tile_mask = (1 << tile_pixels) - 1;
-
-	unsigned x, y;
+	const unsigned tile_pixels = tile_width / cpp;
+	const unsigned tile_shift = ffs(tile_pixels) - 1;
+	const unsigned tile_mask = tile_pixels - 1;
 
 	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
 	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
+	assert(src != dst);
 
-	src = (const uint8_t *)src + src_y * src_stride + src_x * cpp;
-
-	for (y = 0; y < height; ++y) {
-		const uint32_t dy = y + dst_y;
-		const uint32_t tile_row =
-			(dy / tile_height * stride_tiles * tile_size +
-			 (dy & (tile_height-1)) * tile_width);
-		const uint8_t *src_row = (const uint8_t *)src + src_stride * y;
-		uint32_t dx = dst_x, offset;
-
-		x = width * cpp;
-		if (dx & (swizzle_pixels - 1)) {
-			const uint32_t swizzle_bound_pixels = ALIGN(dx + 1, swizzle_pixels);
-			const uint32_t length = min(dst_x + width, swizzle_bound_pixels) - dx;
-			offset = tile_row +
-				(dx >> tile_pixels) * tile_size +
-				(dx & tile_mask) * cpp;
-			memcpy((char *)dst + offset, src_row, length * cpp);
-
-			src_row += length * cpp;
-			x -= length * cpp;
-			dx += length;
+	if (src_x | src_y)
+		src = (const uint8_t *)src + src_y * src_stride + src_x * cpp;
+	assert(src_stride >= width * cpp);
+	src_stride -= width * cpp;
+
+	while (height--) {
+		unsigned w = width * cpp;
+		uint8_t *tile_row = dst;
+
+		tile_row += dst_y / tile_height * dst_stride * tile_height;
+		tile_row += (dst_y & (tile_height-1)) * tile_width;
+		if (dst_x) {
+			tile_row += (dst_x >> tile_shift) * tile_size;
+			if (dst_x & tile_mask) {
+				const unsigned x = (dst_x & tile_mask) * cpp;
+				const unsigned len = min(tile_width - x, w);
+				memcpy(tile_row + x, src, len);
+
+				tile_row += tile_size;
+				src = (const uint8_t *)src + len;
+				w -= len;
+			}
 		}
-		while (x >= 512) {
-			assert((dx & tile_mask) == 0);
-			offset = tile_row + (dx >> tile_pixels) * tile_size;
-
-			memcpy((char *)dst + offset, src_row, 512);
+		while (w >= tile_width) {
+			memcpy(tile_row, src, tile_width);
 
-			src_row += 512;
-			x -= 512;
-			dx += swizzle_pixels;
-		}
-		if (x) {
-			offset = tile_row +
-				(dx >> tile_pixels) * tile_size +
-				(dx & tile_mask) * cpp;
-			memcpy((char *)dst + offset, src_row, x);
+			tile_row += tile_size;
+			src = (const uint8_t *)src + tile_width;
+			w -= tile_width;
 		}
+		memcpy(tile_row, src, w);
+		src = (const uint8_t *)src + src_stride + w;
+		dst_y++;
 	}
 }
 
@@ -297,55 +289,47 @@ memcpy_from_tiled_x__swizzle_0(const void *src, void *dst, int bpp,
 	const unsigned tile_size = 4096;
 
 	const unsigned cpp = bpp / 8;
-	const unsigned stride_tiles = src_stride / tile_width;
-	const unsigned swizzle_pixels = tile_width / cpp;
-	const unsigned tile_pixels = ffs(tile_width / cpp) - 1;
-	const unsigned tile_mask = (1 << tile_pixels) - 1;
-
-	unsigned x, y;
+	const unsigned tile_pixels = tile_width / cpp;
+	const unsigned tile_shift = ffs(tile_pixels) - 1;
+	const unsigned tile_mask = tile_pixels - 1;
 
 	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
 	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
+	assert(src != dst);
 
-	dst = (uint8_t *)dst + dst_y * dst_stride + dst_x * cpp;
-
-	for (y = 0; y < height; ++y) {
-		const uint32_t sy = y + src_y;
-		const uint32_t tile_row =
-			(sy / tile_height * stride_tiles * tile_size +
-			 (sy & (tile_height-1)) * tile_width);
-		uint8_t *dst_row = (uint8_t *)dst + dst_stride * y;
-		uint32_t sx = src_x, offset;
-
-		x = width * cpp;
-		if (sx & (swizzle_pixels - 1)) {
-			const uint32_t swizzle_bound_pixels = ALIGN(sx + 1, swizzle_pixels);
-			const uint32_t length = min(src_x + width, swizzle_bound_pixels) - sx;
-			offset = tile_row +
-				(sx >> tile_pixels) * tile_size +
-				(sx & tile_mask) * cpp;
-			memcpy(dst_row, (const char *)src + offset, length * cpp);
-
-			dst_row += length * cpp;
-			x -= length * cpp;
-			sx += length;
+	if (dst_x | dst_y)
+		dst = (uint8_t *)dst + dst_y * dst_stride + dst_x * cpp;
+	assert(dst_stride >= width * cpp);
+	dst_stride -= width * cpp;
+
+	while (height--) {
+		unsigned w = width * cpp;
+		const uint8_t *tile_row = src;
+
+		tile_row += src_y / tile_height * src_stride * tile_height;
+		tile_row += (src_y & (tile_height-1)) * tile_width;
+		if (src_x) {
+			tile_row += (src_x >> tile_shift) * tile_size;
+			if (src_x & tile_mask) {
+				const unsigned x = (src_x & tile_mask) * cpp;
+				const unsigned len = min(tile_width - x, w);
+				memcpy(dst, tile_row + x, len);
+
+				tile_row += tile_size;
+				dst = (uint8_t *)dst + len;
+				w -= len;
+			}
 		}
-		while (x >= 512) {
-			assert((sx & tile_mask) == 0);
-			offset = tile_row + (sx >> tile_pixels) * tile_size;
-
-			memcpy(dst_row, (const char *)src + offset, 512);
+		while (w >= tile_width) {
+			memcpy(dst, tile_row, tile_width);
 
-			dst_row += 512;
-			x -= 512;
-			sx += swizzle_pixels;
-		}
-		if (x) {
-			offset = tile_row +
-				(sx >> tile_pixels) * tile_size +
-				(sx & tile_mask) * cpp;
-			memcpy(dst_row, (const char *)src + offset, x);
+			tile_row += tile_size;
+			dst = (uint8_t *)dst + tile_width;
+			w -= tile_width;
 		}
+		memcpy(dst, tile_row, w);
+		dst = (uint8_t *)dst + dst_stride + w;
+		src_y++;
 	}
 }
 
commit 24cb50e53c789cb7a05d59ad103dda1c3a009485
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sat Jun 28 07:05:55 2014 +0100

    sna/trapezoids: Handle mono traps just in case
    
    I disabled a few paths and ended up hitting an assert claiming that mono
    trapezoids should never get that far...
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_trapezoids_imprecise.c b/src/sna/sna_trapezoids_imprecise.c
index 69f8ae8..ebde762 100644
--- a/src/sna/sna_trapezoids_imprecise.c
+++ b/src/sna/sna_trapezoids_imprecise.c
@@ -1505,6 +1505,15 @@ inplace_end_subrows(struct active_list *active, uint8_t *row,
 }
 
 static void
+convert_mono(uint8_t *ptr, int w)
+{
+	while (w--) {
+		*ptr = 0xff * (*ptr >= 0xf0);
+		ptr++;
+	}
+}
+
+static void
 tor_inplace(struct tor *converter, PixmapPtr scratch, int mono, uint8_t *buf)
 {
 	int i, j, h = converter->extents.y2;
@@ -1516,7 +1525,6 @@ tor_inplace(struct tor *converter, PixmapPtr scratch, int mono, uint8_t *buf)
 	int width = scratch->drawable.width;
 
 	__DBG(("%s: mono=%d, buf?=%d\n", __FUNCTION__, mono, buf != NULL));
-	assert(!mono);
 	assert(converter->extents.y1 == 0);
 	assert(converter->extents.x1 == 0);
 	assert(scratch->drawable.depth == 8);
@@ -1552,6 +1560,8 @@ tor_inplace(struct tor *converter, PixmapPtr scratch, int mono, uint8_t *buf)
 		if (do_full_step) {
 			memset(ptr, 0, width);
 			inplace_row(active, ptr, width);
+			if (mono)
+				convert_mono(ptr, width);
 			if (row != ptr)
 				memcpy(row, ptr, width);
 
@@ -1584,8 +1594,11 @@ tor_inplace(struct tor *converter, PixmapPtr scratch, int mono, uint8_t *buf)
 			}
 			assert(min >= 0 && max <= width);
 			memset(row, 0, min);
-			if (max > min)
+			if (max > min) {
 				inplace_end_subrows(active, row+min, (int8_t*)ptr+min, max-min);
+				if (mono)
+					convert_mono(row+min, max-min);
+			}
 			if (max < width)
 				memset(row+max, 0, width-max);
 		}


More information about the xorg-commit mailing list