xf86-video-intel: 7 commits - src/sna/blt.c src/sna/kgem.c src/sna/kgem.h src/sna/sna_accel.c src/sna/sna_blt.c src/sna/sna_composite.c src/sna/sna.h src/sna/sna_io.c

Chris Wilson ickle at kemper.freedesktop.org
Sat Jun 22 00:34:14 PDT 2013


 src/sna/blt.c           |  324 ++++++++++++++++++++++++++++++++++++------------
 src/sna/kgem.c          |   48 +++++--
 src/sna/kgem.h          |   24 +++
 src/sna/sna.h           |    7 -
 src/sna/sna_accel.c     |  163 ++++++++++++++++++++----
 src/sna/sna_blt.c       |   80 +++++++----
 src/sna/sna_composite.c |    2 
 src/sna/sna_io.c        |   15 --
 8 files changed, 495 insertions(+), 168 deletions(-)

New commits:
commit 62e42de300275a668a326357d454062221714fe8
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Jun 21 21:00:23 2013 +0100

    sna: Determine swizzling once during initialisation and choose memcpy_to_tiled_x
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/blt.c b/src/sna/blt.c
index af87667..4dbd9e8 100644
--- a/src/sna/blt.c
+++ b/src/sna/blt.c
@@ -213,12 +213,12 @@ memcpy_blt(const void *src, void *dst, int bpp,
 	}
 }
 
-fast_memcpy void
-memcpy_to_tiled_x(const void *src, void *dst, int bpp, int swizzling,
-		  int32_t src_stride, int32_t dst_stride,
-		  int16_t src_x, int16_t src_y,
-		  int16_t dst_x, int16_t dst_y,
-		  uint16_t width, uint16_t height)
+static fast_memcpy void
+memcpy_to_tiled_x__swizzle_0(const void *src, void *dst, int bpp,
+			     int32_t src_stride, int32_t dst_stride,
+			     int16_t src_x, int16_t src_y,
+			     int16_t dst_x, int16_t dst_y,
+			     uint16_t width, uint16_t height)
 {
 	const unsigned tile_width = 512;
 	const unsigned tile_height = 8;
@@ -226,14 +226,14 @@ memcpy_to_tiled_x(const void *src, void *dst, int bpp, int swizzling,
 
 	const unsigned cpp = bpp / 8;
 	const unsigned stride_tiles = dst_stride / tile_width;
-	const unsigned swizzle_pixels = (swizzling ? 64 : tile_width) / cpp;
+	const unsigned swizzle_pixels = tile_width / cpp;
 	const unsigned tile_pixels = ffs(tile_width / cpp) - 1;
 	const unsigned tile_mask = (1 << tile_pixels) - 1;
 
 	unsigned x, y;
 
-	DBG(("%s(bpp=%d, swizzling=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
-	     __FUNCTION__, bpp, swizzling, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
+	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
+	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
 
 	src = (const uint8_t *)src + src_y * src_stride + src_x * cpp;
 
@@ -252,19 +252,71 @@ memcpy_to_tiled_x(const void *src, void *dst, int bpp, int swizzling,
 			offset = tile_row +
 				(dx >> tile_pixels) * tile_size +
 				(dx & tile_mask) * cpp;
-			switch (swizzling) {
-			case I915_BIT_6_SWIZZLE_NONE:
-				break;
-			case I915_BIT_6_SWIZZLE_9:
-				offset ^= (offset >> 3) & 64;
-				break;
-			case I915_BIT_6_SWIZZLE_9_10:
-				offset ^= ((offset ^ (offset >> 1)) >> 3) & 64;
-				break;
-			case I915_BIT_6_SWIZZLE_9_11:
-				offset ^= ((offset ^ (offset >> 2)) >> 3) & 64;
-				break;
-			}
+			memcpy((char *)dst + offset, src_row, length * cpp);
+
+			src_row += length * cpp;
+			x -= length * cpp;
+			dx += length;
+		}
+		while (x >= 512) {
+			assert((dx & tile_mask) == 0);
+			offset = tile_row + (dx >> tile_pixels) * tile_size;
+
+			memcpy((char *)dst + offset, src_row, 512);
+
+			src_row += 512;
+			x -= 512;
+			dx += swizzle_pixels;
+		}
+		if (x) {
+			offset = tile_row +
+				(dx >> tile_pixels) * tile_size +
+				(dx & tile_mask) * cpp;
+			memcpy((char *)dst + offset, src_row, x);
+		}
+	}
+}
+
+fast_memcpy static void
+memcpy_to_tiled_x__swizzle_9(const void *src, void *dst, int bpp,
+			     int32_t src_stride, int32_t dst_stride,
+			     int16_t src_x, int16_t src_y,
+			     int16_t dst_x, int16_t dst_y,
+			     uint16_t width, uint16_t height)
+{
+	const unsigned tile_width = 512;
+	const unsigned tile_height = 8;
+	const unsigned tile_size = 4096;
+
+	const unsigned cpp = bpp / 8;
+	const unsigned stride_tiles = dst_stride / tile_width;
+	const unsigned swizzle_pixels = 64 / cpp;
+	const unsigned tile_pixels = ffs(tile_width / cpp) - 1;
+	const unsigned tile_mask = (1 << tile_pixels) - 1;
+
+	unsigned x, y;
+
+	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
+	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
+
+	src = (const uint8_t *)src + src_y * src_stride + src_x * cpp;
+
+	for (y = 0; y < height; ++y) {
+		const uint32_t dy = y + dst_y;
+		const uint32_t tile_row =
+			(dy / tile_height * stride_tiles * tile_size +
+			 (dy & (tile_height-1)) * tile_width);
+		const uint8_t *src_row = (const uint8_t *)src + src_stride * y;
+		uint32_t dx = dst_x, offset;
+
+		x = width * cpp;
+		if (dx & (swizzle_pixels - 1)) {
+			const uint32_t swizzle_bound_pixels = ALIGN(dx + 1, swizzle_pixels);
+			const uint32_t length = min(dst_x + width, swizzle_bound_pixels) - dx;
+			offset = tile_row +
+				(dx >> tile_pixels) * tile_size +
+				(dx & tile_mask) * cpp;
+			offset ^= (offset >> 3) & 64;
 
 			memcpy((char *)dst + offset, src_row, length * cpp);
 
@@ -272,64 +324,184 @@ memcpy_to_tiled_x(const void *src, void *dst, int bpp, int swizzling,
 			x -= length * cpp;
 			dx += length;
 		}
-		if (swizzling) {
-			while (x >= 64) {
-				offset = tile_row +
-					(dx >> tile_pixels) * tile_size +
-					(dx & tile_mask) * cpp;
-				switch (swizzling) {
-				case I915_BIT_6_SWIZZLE_9:
-					offset ^= (offset >> 3) & 64;
-					break;
-				case I915_BIT_6_SWIZZLE_9_10:
-					offset ^= ((offset ^ (offset >> 1)) >> 3) & 64;
-					break;
-				case I915_BIT_6_SWIZZLE_9_11:
-					offset ^= ((offset ^ (offset >> 2)) >> 3) & 64;
-					break;
-				}
-
-				memcpy((char *)dst + offset, src_row, 64);
-
-				src_row += 64;
-				x -= 64;
-				dx += swizzle_pixels;
-			}
-		} else {
-			while (x >= 512) {
-				assert((dx & tile_mask) == 0);
-				offset = tile_row + (dx >> tile_pixels) * tile_size;
+		while (x >= 64) {
+			offset = tile_row +
+				(dx >> tile_pixels) * tile_size +
+				(dx & tile_mask) * cpp;
+			offset ^= (offset >> 3) & 64;
 
-				memcpy((char *)dst + offset, src_row, 512);
+			memcpy((char *)dst + offset, src_row, 64);
 
-				src_row += 512;
-				x -= 512;
-				dx += swizzle_pixels;
-			}
+			src_row += 64;
+			x -= 64;
+			dx += swizzle_pixels;
 		}
 		if (x) {
 			offset = tile_row +
 				(dx >> tile_pixels) * tile_size +
 				(dx & tile_mask) * cpp;
-			switch (swizzling) {
-			case I915_BIT_6_SWIZZLE_NONE:
-				break;
-			case I915_BIT_6_SWIZZLE_9:
-				offset ^= (offset >> 3) & 64;
-				break;
-			case I915_BIT_6_SWIZZLE_9_10:
-				offset ^= ((offset ^ (offset >> 1)) >> 3) & 64;
-				break;
-			case I915_BIT_6_SWIZZLE_9_11:
-				offset ^= ((offset ^ (offset >> 2)) >> 3) & 64;
-				break;
-			}
+			offset ^= (offset >> 3) & 64;
+			memcpy((char *)dst + offset, src_row, x);
+		}
+	}
+}
+
+fast_memcpy static void
+memcpy_to_tiled_x__swizzle_9_10(const void *src, void *dst, int bpp,
+				int32_t src_stride, int32_t dst_stride,
+				int16_t src_x, int16_t src_y,
+				int16_t dst_x, int16_t dst_y,
+				uint16_t width, uint16_t height)
+{
+	const unsigned tile_width = 512;
+	const unsigned tile_height = 8;
+	const unsigned tile_size = 4096;
+
+	const unsigned cpp = bpp / 8;
+	const unsigned stride_tiles = dst_stride / tile_width;
+	const unsigned swizzle_pixels = 64 / cpp;
+	const unsigned tile_pixels = ffs(tile_width / cpp) - 1;
+	const unsigned tile_mask = (1 << tile_pixels) - 1;
+
+	unsigned x, y;
+
+	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
+	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
+
+	src = (const uint8_t *)src + src_y * src_stride + src_x * cpp;
 
+	for (y = 0; y < height; ++y) {
+		const uint32_t dy = y + dst_y;
+		const uint32_t tile_row =
+			(dy / tile_height * stride_tiles * tile_size +
+			 (dy & (tile_height-1)) * tile_width);
+		const uint8_t *src_row = (const uint8_t *)src + src_stride * y;
+		uint32_t dx = dst_x, offset;
+
+		x = width * cpp;
+		if (dx & (swizzle_pixels - 1)) {
+			const uint32_t swizzle_bound_pixels = ALIGN(dx + 1, swizzle_pixels);
+			const uint32_t length = min(dst_x + width, swizzle_bound_pixels) - dx;
+			offset = tile_row +
+				(dx >> tile_pixels) * tile_size +
+				(dx & tile_mask) * cpp;
+			offset ^= ((offset ^ (offset >> 1)) >> 3) & 64;
+
+			memcpy((char *)dst + offset, src_row, length * cpp);
+
+			src_row += length * cpp;
+			x -= length * cpp;
+			dx += length;
+		}
+		while (x >= 64) {
+			offset = tile_row +
+				(dx >> tile_pixels) * tile_size +
+				(dx & tile_mask) * cpp;
+			offset ^= ((offset ^ (offset >> 1)) >> 3) & 64;
+
+			memcpy((char *)dst + offset, src_row, 64);
+
+			src_row += 64;
+			x -= 64;
+			dx += swizzle_pixels;
+		}
+		if (x) {
+			offset = tile_row +
+				(dx >> tile_pixels) * tile_size +
+				(dx & tile_mask) * cpp;
+			offset ^= ((offset ^ (offset >> 1)) >> 3) & 64;
 			memcpy((char *)dst + offset, src_row, x);
 		}
 	}
 }
 
+fast_memcpy static void
+memcpy_to_tiled_x__swizzle_9_11(const void *src, void *dst, int bpp,
+				int32_t src_stride, int32_t dst_stride,
+				int16_t src_x, int16_t src_y,
+				int16_t dst_x, int16_t dst_y,
+				uint16_t width, uint16_t height)
+{
+	const unsigned tile_width = 512;
+	const unsigned tile_height = 8;
+	const unsigned tile_size = 4096;
+
+	const unsigned cpp = bpp / 8;
+	const unsigned stride_tiles = dst_stride / tile_width;
+	const unsigned swizzle_pixels = 64 / cpp;
+	const unsigned tile_pixels = ffs(tile_width / cpp) - 1;
+	const unsigned tile_mask = (1 << tile_pixels) - 1;
+
+	unsigned x, y;
+
+	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
+	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
+
+	src = (const uint8_t *)src + src_y * src_stride + src_x * cpp;
+
+	for (y = 0; y < height; ++y) {
+		const uint32_t dy = y + dst_y;
+		const uint32_t tile_row =
+			(dy / tile_height * stride_tiles * tile_size +
+			 (dy & (tile_height-1)) * tile_width);
+		const uint8_t *src_row = (const uint8_t *)src + src_stride * y;
+		uint32_t dx = dst_x, offset;
+
+		x = width * cpp;
+		if (dx & (swizzle_pixels - 1)) {
+			const uint32_t swizzle_bound_pixels = ALIGN(dx + 1, swizzle_pixels);
+			const uint32_t length = min(dst_x + width, swizzle_bound_pixels) - dx;
+			offset = tile_row +
+				(dx >> tile_pixels) * tile_size +
+				(dx & tile_mask) * cpp;
+			offset ^= ((offset ^ (offset >> 2)) >> 3) & 64;
+			memcpy((char *)dst + offset, src_row, length * cpp);
+
+			src_row += length * cpp;
+			x -= length * cpp;
+			dx += length;
+		}
+		while (x >= 64) {
+			offset = tile_row +
+				(dx >> tile_pixels) * tile_size +
+				(dx & tile_mask) * cpp;
+			offset ^= ((offset ^ (offset >> 2)) >> 3) & 64;
+
+			memcpy((char *)dst + offset, src_row, 64);
+
+			src_row += 64;
+			x -= 64;
+			dx += swizzle_pixels;
+		}
+		if (x) {
+			offset = tile_row +
+				(dx >> tile_pixels) * tile_size +
+				(dx & tile_mask) * cpp;
+			offset ^= ((offset ^ (offset >> 2)) >> 3) & 64;
+			memcpy((char *)dst + offset, src_row, x);
+		}
+	}
+}
+
+void choose_memcpy_to_tiled_x(struct kgem *kgem, int swizzling)
+{
+	switch (swizzling) {
+	default:
+	case I915_BIT_6_SWIZZLE_NONE:
+		kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__swizzle_0;
+		break;
+	case I915_BIT_6_SWIZZLE_9:
+		kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__swizzle_9;
+		break;
+	case I915_BIT_6_SWIZZLE_9_10:
+		kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__swizzle_9_10;
+		break;
+	case I915_BIT_6_SWIZZLE_9_11:
+		kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__swizzle_9_11;
+		break;
+	}
+}
+
 void
 memmove_box(const void *src, void *dst,
 	    int bpp, int32_t stride,
@@ -561,10 +733,10 @@ memcpy_xor(const void *src, void *dst, int bpp,
 					while (i >= 16) {
 						__m128i xmm1, xmm2, xmm3, xmm4;
 
-						xmm1 = xmm_load_128u((__m128i*)s + 0);
-						xmm2 = xmm_load_128u((__m128i*)s + 1);
-						xmm3 = xmm_load_128u((__m128i*)s + 2);
-						xmm4 = xmm_load_128u((__m128i*)s + 3);
+						xmm1 = xmm_load_128u((const __m128i*)s + 0);
+						xmm2 = xmm_load_128u((const __m128i*)s + 1);
+						xmm3 = xmm_load_128u((const __m128i*)s + 2);
+						xmm4 = xmm_load_128u((const __m128i*)s + 3);
 
 						xmm_save_128((__m128i*)d + 0,
 							     _mm_or_si128(xmm1, mask));
@@ -583,8 +755,8 @@ memcpy_xor(const void *src, void *dst, int bpp,
 					if (i & 8) {
 						__m128i xmm1, xmm2;
 
-						xmm1 = xmm_load_128u((__m128i*)s + 0);
-						xmm2 = xmm_load_128u((__m128i*)s + 1);
+						xmm1 = xmm_load_128u((const __m128i*)s + 0);
+						xmm2 = xmm_load_128u((const __m128i*)s + 1);
 
 						xmm_save_128((__m128i*)d + 0,
 							     _mm_or_si128(xmm1, mask));
@@ -597,7 +769,7 @@ memcpy_xor(const void *src, void *dst, int bpp,
 
 					if (i & 4) {
 						xmm_save_128((__m128i*)d,
-							     _mm_or_si128(xmm_load_128u((__m128i*)s),
+							     _mm_or_si128(xmm_load_128u((const __m128i*)s),
 									  mask));
 
 						d += 4;
@@ -643,7 +815,7 @@ memcpy_xor(const void *src, void *dst, int bpp,
 		case 2:
 			do {
 				uint16_t *d = (uint16_t *)dst_bytes;
-				uint16_t *s = (uint16_t *)src_bytes;
+				const uint16_t *s = (const uint16_t *)src_bytes;
 
 				for (i = 0; i < width; i++)
 					d[i] = (s[i] & and) | or;
@@ -656,7 +828,7 @@ memcpy_xor(const void *src, void *dst, int bpp,
 		case 4:
 			do {
 				uint32_t *d = (uint32_t *)dst_bytes;
-				uint32_t *s = (uint32_t *)src_bytes;
+				const uint32_t *s = (const uint32_t *)src_bytes;
 
 				for (i = 0; i < width; i++)
 					d[i] = (s[i] & and) | or;
diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 66dce47..b32ceee 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -964,6 +964,39 @@ err:
 	return false;
 }
 
+static void kgem_init_swizzling(struct kgem *kgem)
+{
+	struct drm_i915_gem_get_tiling tiling;
+
+#ifndef __x86_64__
+	/* Between a register starved compiler emitting attrocious code
+	 * and the extra overhead in the kernel for managing the tight
+	 * 32-bit address space, unless we have a 64-bit system,
+	 * using memcpy_to_tiled_x() is extremely slow.
+	 */
+	return;
+#endif
+
+	if (kgem->gen < 050) /* bit17 swizzling :( */
+		return;
+
+	VG_CLEAR(tiling);
+	tiling.handle = gem_create(kgem->fd, 1);
+	if (!tiling.handle)
+		return;
+
+	if (!gem_set_tiling(kgem->fd, tiling.handle, I915_TILING_X, 512))
+		goto out;
+
+	if (drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_GET_TILING, &tiling))
+		goto out;
+
+	choose_memcpy_to_tiled_x(kgem, tiling.swizzle_mode);
+out:
+	gem_close(kgem->fd, tiling.handle);
+}
+
+
 void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, unsigned gen)
 {
 	struct drm_i915_gem_get_aperture aperture;
@@ -1212,6 +1245,8 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, unsigned gen)
 		kgem->batch_flags_base |= LOCAL_I915_EXEC_HANDLE_LUT;
 	if (kgem->has_pinned_batches)
 		kgem->batch_flags_base |= LOCAL_I915_EXEC_IS_PINNED;
+
+	kgem_init_swizzling(kgem);
 }
 
 /* XXX hopefully a good approximation */
@@ -5797,19 +5832,6 @@ void kgem_bo_set_binding(struct kgem_bo *bo, uint32_t format, uint16_t offset)
 	}
 }
 
-int kgem_bo_get_swizzling(struct kgem *kgem, struct kgem_bo *bo)
-{
-	struct drm_i915_gem_get_tiling tiling;
-
-	VG_CLEAR(tiling);
-	tiling.handle = bo->handle;
-	if (drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_GET_TILING, &tiling))
-		return 0;
-
-	assert(bo->tiling == tiling.tiling_mode);
-	return tiling.swizzle_mode;
-}
-
 struct kgem_bo *
 kgem_replace_bo(struct kgem *kgem,
 		struct kgem_bo *src,
diff --git a/src/sna/kgem.h b/src/sna/kgem.h
index 33a4db0..91a38f7 100644
--- a/src/sna/kgem.h
+++ b/src/sna/kgem.h
@@ -196,6 +196,12 @@ struct kgem {
 	void (*retire)(struct kgem *kgem);
 	void (*expire)(struct kgem *kgem);
 
+	void (*memcpy_to_tiled_x)(const void *src, void *dst, int bpp,
+				  int32_t src_stride, int32_t dst_stride,
+				  int16_t src_x, int16_t src_y,
+				  int16_t dst_x, int16_t dst_y,
+				  uint16_t width, uint16_t height);
+
 	uint16_t reloc__self[256];
 	uint32_t batch[64*1024-8] page_aligned;
 	struct drm_i915_gem_exec_object2 exec[256] page_aligned;
@@ -286,7 +292,6 @@ struct kgem_bo *kgem_create_cpu_2d(struct kgem *kgem,
 
 uint32_t kgem_bo_get_binding(struct kgem_bo *bo, uint32_t format);
 void kgem_bo_set_binding(struct kgem_bo *bo, uint32_t format, uint16_t offset);
-int kgem_bo_get_swizzling(struct kgem *kgem, struct kgem_bo *bo);
 
 bool kgem_retire(struct kgem *kgem);
 
@@ -693,4 +698,21 @@ static inline void __kgem_batch_debug(struct kgem *kgem, uint32_t nbatch)
 }
 #endif
 
+static inline void
+memcpy_to_tiled_x(struct kgem *kgem,
+		  const void *src, void *dst, int bpp,
+		  int32_t src_stride, int32_t dst_stride,
+		  int16_t src_x, int16_t src_y,
+		  int16_t dst_x, int16_t dst_y,
+		  uint16_t width, uint16_t height)
+{
+	return kgem->memcpy_to_tiled_x(src, dst, bpp,
+				       src_stride, dst_stride,
+				       src_x, src_y,
+				       dst_x, dst_y,
+				       width, height);
+}
+
+void choose_memcpy_to_tiled_x(struct kgem *kgem, int swizzling);
+
 #endif /* KGEM_H */
diff --git a/src/sna/sna.h b/src/sna/sna.h
index da5d8af..f720c64 100644
--- a/src/sna/sna.h
+++ b/src/sna/sna.h
@@ -848,12 +848,7 @@ memcpy_blt(const void *src, void *dst, int bpp,
 	   int16_t src_x, int16_t src_y,
 	   int16_t dst_x, int16_t dst_y,
 	   uint16_t width, uint16_t height);
-void
-memcpy_to_tiled_x(const void *src, void *dst, int bpp, int swizzling,
-		  int32_t src_stride, int32_t dst_stride,
-		  int16_t src_x, int16_t src_y,
-		  int16_t dst_x, int16_t dst_y,
-		  uint16_t width, uint16_t height);
+
 void
 memmove_box(const void *src, void *dst,
 	    int bpp, int32_t stride,
diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 599cfc1..44b87cd 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -3868,15 +3868,7 @@ static inline void box32_add_rect(Box32Rec *box, const xRectangle *r)
 
 static bool can_upload_tiled_x(struct kgem *kgem, struct kgem_bo *bo)
 {
-#ifndef __x86_64__
-	/* Between a register starved compiler emitting attrocious code
-	 * and the extra overhead in the kernel for managing the tight
-	 * 32-bit address space, unless we have a 64-bit system,
-	 * using memcpy_to_tiled_x() is extremely slow.
-	 */
-	return false;
-#endif
-	if (kgem->gen < 050) /* bit17 swizzling :( */
+	if (!kgem->memcpy_to_tiled_x)
 		return false;
 
 	if (bo->tiling != I915_TILING_X)
@@ -3896,7 +3888,6 @@ try_upload_tiled_x(PixmapPtr pixmap, RegionRec *region,
 	struct sna_pixmap *priv = sna_pixmap(pixmap);
 	BoxRec *box;
 	uint8_t *dst;
-	int swizzle;
 	int n;
 
 	DBG(("%s: bo? %d, can tile? %d\n", __FUNCTION__,
@@ -3919,10 +3910,9 @@ try_upload_tiled_x(PixmapPtr pixmap, RegionRec *region,
 	DBG(("%s: upload(%d, %d, %d, %d) x %d\n", __FUNCTION__, x, y, w, h, n));
 
 	kgem_bo_sync__cpu(&sna->kgem, priv->gpu_bo);
-	swizzle = kgem_bo_get_swizzling(&sna->kgem, priv->gpu_bo);
 	do {
-		memcpy_to_tiled_x(bits, dst,
-				  pixmap->drawable.bitsPerPixel, swizzle,
+		memcpy_to_tiled_x(&sna->kgem, bits, dst,
+				  pixmap->drawable.bitsPerPixel,
 				  stride, priv->gpu_bo->pitch,
 				  box->x1 - x, box->y1 - y,
 				  box->x1, box->y1,
diff --git a/src/sna/sna_io.c b/src/sna/sna_io.c
index 1ec1a60..e51c033 100644
--- a/src/sna/sna_io.c
+++ b/src/sna/sna_io.c
@@ -477,16 +477,7 @@ fallback:
 
 static bool upload_inplace__tiled(struct kgem *kgem, struct kgem_bo *bo)
 {
-#ifndef __x86_64__
-	/* Between a register starved compiler emitting attrocious code
-	 * and the extra overhead in the kernel for managing the tight
-	 * 32-bit address space, unless we have a 64-bit system,
-	 * using memcpy_to_tiled_x() is extremely slow.
-	 */
-	return false;
-#endif
-
-	if (kgem->gen < 050) /* bit17 swizzling :( */
+	if (!kgem->memcpy_to_tiled_x)
 		return false;
 
 	if (bo->tiling != I915_TILING_X)
@@ -505,7 +496,6 @@ write_boxes_inplace__tiled(struct kgem *kgem,
                            const BoxRec *box, int n)
 {
 	uint8_t *dst;
-	int swizzle;
 
 	assert(bo->tiling == I915_TILING_X);
 
@@ -514,9 +504,8 @@ write_boxes_inplace__tiled(struct kgem *kgem,
 		return false;
 
 	kgem_bo_sync__cpu(kgem, bo);
-	swizzle = kgem_bo_get_swizzling(kgem, bo);
 	do {
-		memcpy_to_tiled_x(src, dst, bpp, swizzle, stride, bo->pitch,
+		memcpy_to_tiled_x(kgem, src, dst, bpp, stride, bo->pitch,
 				  box->x1 + src_dx, box->y1 + src_dy,
 				  box->x1 + dst_dx, box->y1 + dst_dy,
 				  box->x2 - box->x1, box->y2 - box->y1);
commit 53c113c3cc2f8527debc185f0819139ca8637637
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Jun 21 19:27:24 2013 +0100

    sna: Allow PutImage to write inplace using manual tiling
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 277bab6..599cfc1 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -3866,6 +3866,74 @@ static inline void box32_add_rect(Box32Rec *box, const xRectangle *r)
 		box->y2 = v;
 }
 
+static bool can_upload_tiled_x(struct kgem *kgem, struct kgem_bo *bo)
+{
+#ifndef __x86_64__
+	/* Between a register starved compiler emitting attrocious code
+	 * and the extra overhead in the kernel for managing the tight
+	 * 32-bit address space, unless we have a 64-bit system,
+	 * using memcpy_to_tiled_x() is extremely slow.
+	 */
+	return false;
+#endif
+	if (kgem->gen < 050) /* bit17 swizzling :( */
+		return false;
+
+	if (bo->tiling != I915_TILING_X)
+		return false;
+
+	if (bo->scanout)
+		return false;
+
+	return bo->domain == DOMAIN_CPU || kgem->has_llc;
+}
+
+static bool
+try_upload_tiled_x(PixmapPtr pixmap, RegionRec *region,
+		   int x, int y, int w, int  h, char *bits, int stride)
+{
+	struct sna *sna = to_sna_from_pixmap(pixmap);
+	struct sna_pixmap *priv = sna_pixmap(pixmap);
+	BoxRec *box;
+	uint8_t *dst;
+	int swizzle;
+	int n;
+
+	DBG(("%s: bo? %d, can tile? %d\n", __FUNCTION__,
+	     priv->gpu_bo != NULL,
+	     priv->gpu_bo ? can_upload_tiled(&sna->kgem, priv->gpu_bo) : 0));
+
+	if (!DAMAGE_IS_ALL(priv->gpu_damage) ||
+	    !can_upload_tiled_x(&sna->kgem, priv->gpu_bo))
+		return false;
+
+	assert(priv->gpu_bo->tiling == I915_TILING_X);
+
+	dst = __kgem_bo_map__cpu(&sna->kgem, priv->gpu_bo);
+	if (dst == NULL)
+		return false;
+
+	box = RegionRects(region);
+	n = RegionNumRects(region);
+
+	DBG(("%s: upload(%d, %d, %d, %d) x %d\n", __FUNCTION__, x, y, w, h, n));
+
+	kgem_bo_sync__cpu(&sna->kgem, priv->gpu_bo);
+	swizzle = kgem_bo_get_swizzling(&sna->kgem, priv->gpu_bo);
+	do {
+		memcpy_to_tiled_x(bits, dst,
+				  pixmap->drawable.bitsPerPixel, swizzle,
+				  stride, priv->gpu_bo->pitch,
+				  box->x1 - x, box->y1 - y,
+				  box->x1, box->y1,
+				  box->x2 - box->x1, box->y2 - box->y1);
+		box++;
+	} while (--n);
+	__kgem_bo_unmap__cpu(&sna->kgem, priv->gpu_bo, dst);
+
+	return true;
+}
+
 static bool
 sna_put_zpixmap_blt(DrawablePtr drawable, GCPtr gc, RegionPtr region,
 		    int x, int y, int w, int  h, char *bits, int stride)
@@ -3883,14 +3951,17 @@ sna_put_zpixmap_blt(DrawablePtr drawable, GCPtr gc, RegionPtr region,
 	if (drawable->depth < 8)
 		return false;
 
-	if (!sna_drawable_move_region_to_cpu(&pixmap->drawable,
-					     region, MOVE_WRITE))
-		return false;
-
 	get_drawable_deltas(drawable, pixmap, &dx, &dy);
 	x += dx + drawable->x;
 	y += dy + drawable->y;
 
+	if (try_upload_tiled_x(pixmap, region, x, y, w, h, bits, stride))
+		return true;
+
+	if (!sna_drawable_move_region_to_cpu(&pixmap->drawable,
+					     region, MOVE_WRITE))
+		return false;
+
 	DBG(("%s: upload(%d, %d, %d, %d)\n", __FUNCTION__, x, y, w, h));
 
 	/* Region is pre-clipped and translated into pixmap space */
@@ -4330,7 +4401,7 @@ source_contains_region(struct sna_damage *damage,
 
 static bool
 move_to_gpu(PixmapPtr pixmap, struct sna_pixmap *priv,
-	    const RegionRec *region, int16_t dx, int16_t dy,
+	    RegionRec *region, int16_t dx, int16_t dy,
 	    uint8_t alu, bool dst_is_gpu)
 {
 	int w = region->extents.x2 - region->extents.x1;
@@ -14488,7 +14559,7 @@ static void sna_accel_post_damage(struct sna *sna)
 		DBG(("%s: slave:  ((%d, %d), (%d, %d))x%d\n", __FUNCTION__,
 		     region.extents.x1, region.extents.y1,
 		     region.extents.x2, region.extents.y2,
-		     RegionNumRects(&region.extents)));
+		     RegionNumRects(&region)));
 
 		box = RegionRects(&region);
 		n = RegionNumRects(&region);
commit 48028a7c923fa0d66b01e8e94d4f0742866f78ec
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Jun 21 14:29:43 2013 +0100

    sna: Inspect availability of render before preferring to use the GPU
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_blt.c b/src/sna/sna_blt.c
index 7002638..08960fc 100644
--- a/src/sna/sna_blt.c
+++ b/src/sna/sna_blt.c
@@ -1981,21 +1981,6 @@ is_clear(PixmapPtr pixmap)
 	return priv && priv->clear;
 }
 
-static struct kgem_bo *
-peek_bo(DrawablePtr draw)
-{
-	struct sna_pixmap *priv;
-
-	if (draw == NULL)
-		return NULL;
-
-	priv = sna_pixmap(get_drawable_pixmap(draw));
-	if (priv == NULL)
-		return NULL;
-
-	return priv->gpu_bo;
-}
-
 bool
 sna_blt_composite(struct sna *sna,
 		  uint32_t op,
@@ -2013,6 +1998,7 @@ sna_blt_composite(struct sna *sna,
 	int16_t tx, ty;
 	BoxRec dst_box, src_box;
 	uint32_t alpha_fixup;
+	uint32_t color, hint;
 	bool was_clear;
 	bool ret;
 
@@ -2045,28 +2031,30 @@ sna_blt_composite(struct sna *sna,
 	} else
 		sna_render_picture_extents(dst, &dst_box);
 
-	bo = sna_pixmap(tmp->dst.pixmap)->gpu_bo;
-	if (bo == NULL || bo != peek_bo(src->pDrawable))
-		bo = sna_drawable_use_bo(dst->pDrawable, PREFER_GPU,
-					 &dst_box, &tmp->damage);
-	if (bo && !kgem_bo_can_blt(&sna->kgem, bo)) {
-		DBG(("%s: can not blit to dst, tiling? %d, pitch? %d\n",
-		     __FUNCTION__, bo->tiling, bo->pitch));
-		return false;
-	}
-
 	tmp->dst.format = dst->format;
 	tmp->dst.width = tmp->dst.pixmap->drawable.width;
 	tmp->dst.height = tmp->dst.pixmap->drawable.height;
 	get_drawable_deltas(dst->pDrawable, tmp->dst.pixmap,
 			    &tmp->dst.x, &tmp->dst.y);
-	tmp->dst.bo = bo;
 
 	if (op == PictOpClear) {
 clear:
 		if (was_clear)
 			return prepare_blt_nop(sna, tmp);
 
+		hint = 0;
+		if (can_render(sna))
+			hint |= PREFER_GPU;
+		if (dst->pCompositeClip->data == NULL)
+			hint |= IGNORE_CPU;
+		tmp->dst.bo = sna_drawable_use_bo(dst->pDrawable, hint,
+						  &dst_box, &tmp->damage);
+		if (tmp->dst.bo && !kgem_bo_can_blt(&sna->kgem, tmp->dst.bo)) {
+			DBG(("%s: can not blit to dst, tiling? %d, pitch? %d\n",
+			     __FUNCTION__, tmp->dst.bo->tiling, tmp->dst.bo->pitch));
+			return false;
+		}
+
 		if (!tmp->dst.bo) {
 			RegionRec region;
 
@@ -2096,6 +2084,21 @@ clear:
 			return false;
 		}
 
+		color = get_solid_color(src, tmp->dst.format);
+fill:
+		hint = 0;
+		if (can_render(sna))
+			hint |= PREFER_GPU;
+		if (dst->pCompositeClip->data == NULL)
+			hint |= IGNORE_CPU;
+		tmp->dst.bo = sna_drawable_use_bo(dst->pDrawable, hint,
+						  &dst_box, &tmp->damage);
+		if (tmp->dst.bo && !kgem_bo_can_blt(&sna->kgem, tmp->dst.bo)) {
+			DBG(("%s: can not blit to dst, tiling? %d, pitch? %d\n",
+			     __FUNCTION__, tmp->dst.bo->tiling, tmp->dst.bo->pitch));
+			return false;
+		}
+
 		if (!tmp->dst.bo) {
 			RegionRec region;
 
@@ -2107,7 +2110,7 @@ clear:
 				return false;
 		}
 
-		return prepare_blt_fill(sna, tmp, get_solid_color(src, tmp->dst.format));
+		return prepare_blt_fill(sna, tmp, color);
 	}
 
 	if (!src->pDrawable) {
@@ -2151,9 +2154,9 @@ clear:
 
 	src_pixmap = get_drawable_pixmap(src->pDrawable);
 	if (is_clear(src_pixmap)) {
-		return prepare_blt_fill(sna, tmp,
-					color_convert(sna_pixmap(src_pixmap)->clear_color,
-						      src->format, tmp->dst.format));
+		color = color_convert(sna_pixmap(src_pixmap)->clear_color,
+				      src->format, tmp->dst.format);
+		goto fill;
 	}
 
 	alpha_fixup = 0;
@@ -2214,6 +2217,23 @@ clear:
 	src_box.x2 = x + width;
 	src_box.y2 = y + height;
 	bo = NULL;
+
+	hint = 0;
+	if (can_render(sna))
+		hint |= PREFER_GPU;
+	if (dst->pCompositeClip->data == NULL)
+		hint |= IGNORE_CPU;
+	if (source_is_gpu(src_pixmap, &src_box))
+		hint |= FORCE_GPU;
+
+	tmp->dst.bo = sna_drawable_use_bo(dst->pDrawable, hint,
+					  &dst_box, &tmp->damage);
+	if (tmp->dst.bo && !kgem_bo_can_blt(&sna->kgem, tmp->dst.bo)) {
+		DBG(("%s: can not blit to dst, tiling? %d, pitch? %d\n",
+		     __FUNCTION__, tmp->dst.bo->tiling, tmp->dst.bo->pitch));
+		return false;
+	}
+
 	if (tmp->dst.bo || source_is_gpu(src_pixmap, &src_box))
 		bo = __sna_render_pixmap_bo(sna, src_pixmap, &src_box, true);
 	if (bo) {
diff --git a/src/sna/sna_composite.c b/src/sna/sna_composite.c
index 17cc68c..da3fd62 100644
--- a/src/sna/sna_composite.c
+++ b/src/sna/sna_composite.c
@@ -920,7 +920,7 @@ sna_composite_rectangles(CARD8		 op,
 	 * operation, then we may as well delete it without moving it
 	 * first to the GPU.
 	 */
-	hint = PREFER_GPU;
+	hint = can_render(sna) ? PREFER_GPU : 0;
 	if (op <= PictOpSrc) {
 		if (priv->cpu_damage &&
 		    region_subsumes_damage(&region, priv->cpu_damage)) {
commit 71fc83401ec8c560a89a284805e849954ea18ee4
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Jun 21 14:28:38 2013 +0100

    sna: Check if we may want to simply upload for a CopyArea
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 7d03023..277bab6 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -4312,9 +4312,26 @@ out:
 }
 
 static bool
+source_contains_region(struct sna_damage *damage,
+		       const RegionRec *region, int16_t dx, int16_t dy)
+{
+	BoxRec box;
+
+	if (DAMAGE_IS_ALL(damage))
+		return true;
+
+	box = region->extents;
+	box.x1 += dx;
+	box.x2 += dx;
+	box.y1 += dy;
+	box.y2 += dy;
+	return sna_damage_contains_box__no_reduce(damage, &box);
+}
+
+static bool
 move_to_gpu(PixmapPtr pixmap, struct sna_pixmap *priv,
 	    const RegionRec *region, int16_t dx, int16_t dy,
-	    uint8_t alu)
+	    uint8_t alu, bool dst_is_gpu)
 {
 	int w = region->extents.x2 - region->extents.x1;
 	int h = region->extents.y2 - region->extents.y1;
@@ -4326,7 +4343,26 @@ move_to_gpu(PixmapPtr pixmap, struct sna_pixmap *priv,
 		return true;
 	}
 
+	if (dst_is_gpu && priv->cpu_bo && priv->cpu_damage) {
+		DBG(("%s: can use CPU bo? cpu_damage=%d, gpu_damage=%d, cpu hint=%d\n",
+		     __FUNCTION__,
+		     priv->cpu_damage ? DAMAGE_IS_ALL(priv->cpu_damage) ? -1 : 1 : 0,
+		     priv->gpu_damage ? DAMAGE_IS_ALL(priv->gpu_damage) ? -1 : 1 : 0,
+		     priv->cpu));
+		if (DAMAGE_IS_ALL(priv->cpu_damage) || priv->gpu_damage == NULL)
+			return false;
+
+		if (priv->cpu &&
+		    source_contains_region(priv->cpu_damage, region, dx, dy))
+			return false;
+	}
+
 	if (priv->gpu_bo) {
+		DBG(("%s: has gpu bo (cpu damage?=%d, cpu=%d, gpu tiling=%d)\n",
+		     __FUNCTION__,
+		     priv->cpu_damage ? DAMAGE_IS_ALL(priv->cpu_damage) ? -1 : 1 : 0,
+		     priv->cpu, priv->gpu_bo->tiling));
+
 		if (priv->cpu_damage == NULL)
 			return true;
 
@@ -4548,7 +4584,8 @@ sna_pixmap_is_gpu(PixmapPtr pixmap)
 }
 
 static int
-source_prefer_gpu(struct sna *sna, struct sna_pixmap *priv)
+source_prefer_gpu(struct sna *sna, struct sna_pixmap *priv,
+		  RegionRec *region, int16_t dx, int16_t dy)
 {
 	if (priv == NULL) {
 		DBG(("%s: source unattached, use cpu\n", __FUNCTION__));
@@ -4560,10 +4597,13 @@ source_prefer_gpu(struct sna *sna, struct sna_pixmap *priv)
 		return 0;
 	}
 
-	if (priv->gpu_damage) {
-		DBG(("%s: source has gpu damage, force gpu\n", __FUNCTION__));
+	if (priv->gpu_damage &&
+	    (priv->cpu_damage == NULL ||
+	     !source_contains_region(priv->cpu_damage, region, dx, dy))) {
+		DBG(("%s: source has gpu damage, force gpu? %d\n",
+		     __FUNCTION__, priv->cpu_damage == NULL));
 		assert(priv->gpu_bo);
-		return PREFER_GPU | FORCE_GPU;
+		return priv->cpu_damage ? PREFER_GPU : PREFER_GPU | FORCE_GPU;
 	}
 
 	if (priv->cpu_bo && kgem_bo_is_busy(priv->cpu_bo)) {
@@ -4589,7 +4629,7 @@ static bool use_shm_bo(struct sna *sna,
 		return false;
 	}
 
-	if (!priv->shm) {
+	if (!priv->shm && !priv->cpu) {
 		DBG(("%s: yes, ordinary CPU bo\n", __FUNCTION__));
 		return true;
 	}
@@ -4685,7 +4725,7 @@ sna_copy_boxes(DrawablePtr src, DrawablePtr dst, GCPtr gc,
 	if (dst_priv == NULL)
 		goto fallback;
 
-	hint = source_prefer_gpu(sna, src_priv) ?:
+	hint = source_prefer_gpu(sna, src_priv, region, src_dx, src_dy) ?:
 		region_inplace(sna, dst_pixmap, region,
 			       dst_priv, alu_overwrites(alu));
 	if (dst_priv->cpu_damage && alu_overwrites(alu)) {
@@ -4765,7 +4805,7 @@ sna_copy_boxes(DrawablePtr src, DrawablePtr dst, GCPtr gc,
 		}
 
 		if (src_priv &&
-		    move_to_gpu(src_pixmap, src_priv, region, src_dx, src_dy, alu) &&
+		    move_to_gpu(src_pixmap, src_priv, region, src_dx, src_dy, alu, bo == dst_priv->gpu_bo) &&
 		    sna_pixmap_move_to_gpu(src_pixmap, MOVE_READ | MOVE_ASYNC_HINT)) {
 			DBG(("%s: move whole src_pixmap to GPU and copy\n",
 			     __FUNCTION__));
commit 7e90e522199c4d6b479554073acb33e9d82fb8cc
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Jun 21 14:27:42 2013 +0100

    sna: Fix inspection of transfer extents for deciding transport
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index d26d613..7d03023 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -1505,7 +1505,7 @@ sna_pixmap_create_mappable_gpu(PixmapPtr pixmap,
 
 static inline bool use_cpu_bo_for_download(struct sna *sna,
 					   struct sna_pixmap *priv,
-					   const BoxRec *box)
+					   int nbox, const BoxRec *box)
 {
 	if (DBG_NO_CPU_DOWNLOAD)
 		return false;
@@ -1523,10 +1523,11 @@ static inline bool use_cpu_bo_for_download(struct sna *sna,
 	}
 
 	/* Is it worth detiling? */
+	assert(box[0].y1 < box[nbox-1].y2);
 	if (kgem_bo_is_mappable(&sna->kgem, priv->gpu_bo) &&
-	    (box->y2 - box->y1 - 1) * priv->gpu_bo->pitch < 4096) {
-		DBG(("%s: no, tiny transfer, expect to read inplace\n",
-		     __FUNCTION__));
+	    (box[nbox-1].y2 - box[0].y1 - 1) * priv->gpu_bo->pitch < 4096) {
+		DBG(("%s: no, tiny transfer (height=%d, pitch=%d) expect to read inplace\n",
+		     __FUNCTION__, box[nbox-1].y2-box[0].y1, priv->gpu_bo->pitch));
 		return false;
 	}
 
@@ -2020,7 +2021,7 @@ skip_inplace_map:
 		if (n) {
 			bool ok = false;
 
-			if (use_cpu_bo_for_download(sna, priv, &priv->gpu_damage->extents)) {
+			if (use_cpu_bo_for_download(sna, priv, n, box)) {
 				DBG(("%s: using CPU bo for download from GPU\n", __FUNCTION__));
 				ok = sna->render.copy_boxes(sna, GXcopy,
 							    pixmap, priv->gpu_bo, 0, 0,
@@ -2406,7 +2407,7 @@ sna_drawable_move_region_to_cpu(DrawablePtr drawable,
 			assert(priv->gpu_bo);
 
 			ok = false;
-			if (use_cpu_bo_for_download(sna, priv, &priv->gpu_damage->extents)) {
+			if (use_cpu_bo_for_download(sna, priv, n, box)) {
 				DBG(("%s: using CPU bo for download from GPU\n", __FUNCTION__));
 				ok = sna->render.copy_boxes(sna, GXcopy,
 							    pixmap, priv->gpu_bo, 0, 0,
@@ -2522,7 +2523,7 @@ sna_drawable_move_region_to_cpu(DrawablePtr drawable,
 				if (n) {
 					bool ok = false;
 
-					if (use_cpu_bo_for_download(sna, priv, &priv->gpu_damage->extents)) {
+					if (use_cpu_bo_for_download(sna, priv, n, box)) {
 						DBG(("%s: using CPU bo for download from GPU\n", __FUNCTION__));
 						ok = sna->render.copy_boxes(sna, GXcopy,
 									    pixmap, priv->gpu_bo, 0, 0,
@@ -2550,7 +2551,7 @@ sna_drawable_move_region_to_cpu(DrawablePtr drawable,
 				DBG(("%s: region wholly inside damage\n",
 				     __FUNCTION__));
 
-				if (use_cpu_bo_for_download(sna, priv, &r->extents)) {
+				if (use_cpu_bo_for_download(sna, priv, n, box)) {
 					DBG(("%s: using CPU bo for download from GPU\n", __FUNCTION__));
 					ok = sna->render.copy_boxes(sna, GXcopy,
 								    pixmap, priv->gpu_bo, 0, 0,
@@ -2578,7 +2579,7 @@ sna_drawable_move_region_to_cpu(DrawablePtr drawable,
 					DBG(("%s: region intersects damage\n",
 					     __FUNCTION__));
 
-					if (use_cpu_bo_for_download(sna, priv, &need.extents)) {
+					if (use_cpu_bo_for_download(sna, priv, n, box)) {
 						DBG(("%s: using CPU bo for download from GPU\n", __FUNCTION__));
 						ok = sna->render.copy_boxes(sna, GXcopy,
 									    pixmap, priv->gpu_bo, 0, 0,
commit 94cbe7bf7b7acc9a7f2cb56c5e275af028f3cdc8
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Jun 20 19:40:44 2013 +0100

    sna: Mark overwriting CopyArea as not needing the dst damage
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index a481388..d26d613 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -2693,7 +2693,7 @@ sna_drawable_move_to_cpu(DrawablePtr drawable, unsigned flags)
 	return sna_drawable_move_region_to_cpu(&pixmap->drawable, &region, flags);
 }
 
-static bool alu_overwrites(uint8_t alu)
+pure static bool alu_overwrites(uint8_t alu)
 {
 	switch (alu) {
 	case GXclear:
@@ -4700,10 +4700,8 @@ sna_copy_boxes(DrawablePtr src, DrawablePtr dst, GCPtr gc,
 			list_del(&dst_priv->flush_list);
 			dst_priv->cpu = false;
 		}
-		if (region->data == NULL)
-			hint |= IGNORE_CPU;
 	}
-	if (replaces)
+	if (alu_overwrites(alu))
 		hint |= IGNORE_CPU;
 
 	/* XXX hack for firefox -- subsequent uses of src will be corrupt! */
commit b3d1118bbee1172f72c946163a37ca4ad5feecce
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Jun 20 19:34:18 2013 +0100

    sna: Promote the CopyArea to the GPU if it subsumes the CPU damage
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 9cbecfe..a481388 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -4312,10 +4312,11 @@ out:
 
 static bool
 move_to_gpu(PixmapPtr pixmap, struct sna_pixmap *priv,
-	    const BoxRec *box, uint8_t alu)
+	    const RegionRec *region, int16_t dx, int16_t dy,
+	    uint8_t alu)
 {
-	int w = box->x2 - box->x1;
-	int h = box->y2 - box->y1;
+	int w = region->extents.x2 - region->extents.x1;
+	int h = region->extents.y2 - region->extents.y1;
 	int count;
 
 	if (DAMAGE_IS_ALL(priv->gpu_damage)) {
@@ -4336,6 +4337,12 @@ move_to_gpu(PixmapPtr pixmap, struct sna_pixmap *priv,
 
 		if (priv->gpu_bo->tiling)
 			return true;
+
+		RegionTranslate(region, dx, dy);
+		count = region_subsumes_damage(region, priv->cpu_damage);
+		RegionTranslate(region, -dx, -dy);
+		if (count)
+			return true;
 	} else {
 		if ((priv->create & KGEM_CAN_CREATE_GPU) == 0)
 			return false;
@@ -4759,7 +4766,7 @@ sna_copy_boxes(DrawablePtr src, DrawablePtr dst, GCPtr gc,
 		}
 
 		if (src_priv &&
-		    move_to_gpu(src_pixmap, src_priv, &region->extents, alu) &&
+		    move_to_gpu(src_pixmap, src_priv, region, src_dx, src_dy, alu) &&
 		    sna_pixmap_move_to_gpu(src_pixmap, MOVE_READ | MOVE_ASYNC_HINT)) {
 			DBG(("%s: move whole src_pixmap to GPU and copy\n",
 			     __FUNCTION__));


More information about the xorg-commit mailing list