xf86-video-intel: 6 commits - configure.ac src/intel_display.c src/sna/blt.c src/sna/kgem.c src/sna/kgem.h src/sna/sna_accel.c

Chris Wilson ickle at kemper.freedesktop.org
Thu Jun 27 08:41:37 PDT 2013


 configure.ac        |    3 
 src/intel_display.c |    2 
 src/sna/blt.c       |  276 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 src/sna/kgem.c      |    6 -
 src/sna/kgem.h      |   22 +++-
 src/sna/sna_accel.c |   89 +++++++++++++++-
 6 files changed, 382 insertions(+), 16 deletions(-)

New commits:
commit b5e85e495e55e2537d305b7bebacdf6f97b66199
Author: Roy.Li <rongqing.li at windriver.com>
Date:   Thu Jun 27 14:10:14 2013 +0800

    uxa: fix the compilation error with xorg-xserver <= 1.10
    
    struct _Screen has no canDoBGNoneRoot member when ABI_VIDEODRV_VERSION is less than 10.0
    
    Signed-off-by: Roy.Li <rongqing.li at windriver.com>

diff --git a/src/intel_display.c b/src/intel_display.c
index 17168e5..0acb86d 100644
--- a/src/intel_display.c
+++ b/src/intel_display.c
@@ -2113,7 +2113,9 @@ void intel_copy_fb(ScrnInfoPtr scrn)
 				0, 0,
 				scrn->virtualX, scrn->virtualY);
 	intel->uxa_driver->done_copy(dst);
+#if ABI_VIDEODRV_VERSION >= SET_ABI_VERSION(10, 0)
 	pScreen->canDoBGNoneRoot = TRUE;
+#endif
 
 cleanup_dst:
 	(*pScreen->DestroyPixmap)(dst);
commit 41715af4d009bfcb351946ddaa3a3ea3767a1429
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Jun 27 16:36:52 2013 +0100

    configure: SNA supports the old Xorgs
    
    So allow it to be compiled by default for older Xorgs as well.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/configure.ac b/configure.ac
index 6721279..7425fda 100644
--- a/configure.ac
+++ b/configure.ac
@@ -203,9 +203,6 @@ AC_ARG_ENABLE(sna,
 	      [SNA="$enableval"],
 	      [SNA=auto])
 
-if test "x$SNA" = "xauto" && pkg-config --exists "xorg-server >= 1.10"; then
-	SNA=yes
-fi
 if test "x$SNA" != "xno"; then
 	AC_DEFINE(USE_SNA, 1, [Enable SNA support])
 	AC_CHECK_HEADERS([sys/sysinfo.h])
commit 7ce487617445c81f0178823de8896a2b73bbaaf1
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Jun 27 16:08:43 2013 +0100

    sna: Trim the large object threshold
    
    Be kinder to smaller machines by lowering the threshold at which we
    treat an object as huge and worth the effort of avoiding duplication.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

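A rough worked example of the new cap, assuming (as the name suggests) that
half_gpu_max is max_gpu_size / 2 computed earlier in kgem_init; the sizes
below are hypothetical:

    /* hypothetical machine: max_gpu_size = 256 MiB, MAX_CACHE_SIZE = 1 GiB */
    kgem->large_object_size = MAX_CACHE_SIZE;    /* 1 GiB */
    if (kgem->large_object_size > half_gpu_max)  /* 128 MiB */
    	kgem->large_object_size = half_gpu_max;  /* capped at 128 MiB;
    	                                          * was 256 MiB before */
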
diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 5b78c83..3859e2d 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -1205,8 +1205,8 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, unsigned gen)
 		kgem->max_upload_tile_size = kgem->aperture_low;
 
 	kgem->large_object_size = MAX_CACHE_SIZE;
-	if (kgem->large_object_size > kgem->max_gpu_size)
-		kgem->large_object_size = kgem->max_gpu_size;
+	if (kgem->large_object_size > half_gpu_max)
+		kgem->large_object_size = half_gpu_max;
 	if (kgem->max_copy_tile_size > kgem->aperture_high/2)
 		kgem->max_copy_tile_size = kgem->aperture_high/2;
 	if (kgem->max_copy_tile_size > kgem->aperture_low)
commit 31467e18d2ccdc42b0601b43b581524859de1373
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Jun 27 16:07:36 2013 +0100

    sna: Prefer operating inplace with a very large GPU bo
    
    As we strive to keep only one copy when working with very large
    objects, try operating inplace on a mapping of the large GPU bo for
    CPU operations.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 46e383d..af68a14 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -1785,6 +1785,12 @@ static inline bool operate_inplace(struct sna_pixmap *priv, unsigned flags)
 		return true;
 	}
 
+	if (priv->create & KGEM_CAN_CREATE_LARGE) {
+		DBG(("%s: large object, has GPU? %d\n",
+		     __FUNCTION__, priv->gpu_bo != NULL));
+		return priv->gpu_bo != NULL;
+	}
+
 	if (flags & MOVE_WRITE && priv->gpu_bo && kgem_bo_is_busy(priv->gpu_bo)) {
 		DBG(("%s: no, GPU is busy, so stage write\n", __FUNCTION__));
 		return false;
@@ -2261,8 +2267,9 @@ sna_drawable_move_region_to_cpu(DrawablePtr drawable,
 	}
 
 	if (USE_INPLACE &&
-	    (flags & (MOVE_READ | MOVE_ASYNC_HINT)) == 0 &&
-	    (priv->flush || box_inplace(pixmap, &region->extents))) {
+	    (priv->create & KGEM_CAN_CREATE_LARGE ||
+	     ((flags & (MOVE_READ | MOVE_ASYNC_HINT)) == 0 &&
+	      (priv->flush || box_inplace(pixmap, &region->extents))))) {
 		DBG(("%s: marking for inplace hint (%d, %d)\n",
 		     __FUNCTION__, priv->flush, box_inplace(pixmap, &region->extents)));
 		flags |= MOVE_INPLACE_HINT;
@@ -3938,15 +3945,22 @@ static bool can_upload_tiled_x(struct kgem *kgem, struct sna_pixmap *priv)
 	struct kgem_bo *bo = priv->gpu_bo;
 	assert(bo);
 
-	if (priv->cow)
+	if (priv->cow) {
+		DBG(("%s: no, has COW\n", __FUNCTION__));
 		return false;
+	}
 
-	if (bo->tiling != I915_TILING_X)
+	if (bo->tiling != I915_TILING_X) {
+		DBG(("%s: no, uses %d tiling\n", __FUNCTION__, bo->tiling));
 		return false;
+	}
 
-	if (bo->scanout)
+	if (bo->scanout) {
+		DBG(("%s: no, is scanout\n", __FUNCTION__, bo->scanout));
 		return false;
+	}
 
+	DBG(("%s? domain=%d, has_llc=%d\n", __FUNCTION__, bo->domain, kgem->has_llc));
 	return bo->domain == DOMAIN_CPU || kgem->has_llc;
 }
 
@@ -4025,7 +4039,8 @@ try_upload_tiled_x(PixmapPtr pixmap, RegionRec *region,
 		return false;
 
 	assert(priv->gpu_bo->tiling == I915_TILING_X);
-	if (__kgem_bo_is_busy(&sna->kgem, priv->gpu_bo))
+	if ((priv->create & KGEM_CAN_CREATE_LARGE) == 0 &&
+	    __kgem_bo_is_busy(&sna->kgem, priv->gpu_bo))
 		return false;
 
 	dst = kgem_bo_map__cpu(&sna->kgem, priv->gpu_bo);
commit b615ce97ec43ea8fe02e995244c757138abcb2de
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Jun 27 10:45:22 2013 +0100

    sna: Add a fast path for reading back from tiled X bo
    
    This is lower latency than the double copy incurred by first moving the
    bo to the CPU and then copying it back - but due to the less efficient
    tiled memcpy, it has lower throughput. So x11perf -shmget500 suffers
    (by about 30%) but real-world applications improve by about 2x.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

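A minimal sketch of the two readback paths being weighed here; the names are
taken from the diff below and from the existing sna fallback path, and the
argument lists are abbreviated:

    /* before: double copy - detile the whole bo into the CPU shadow,
     * then copy the requested region back out of the shadow */
    sna_drawable_move_region_to_cpu(drawable, &region, flags);
    fbGetImage(drawable, x, y, w, h, format, planeMask, dst);

    /* after: single copy - map the tiled GPU bo and read the region
     * straight out of it with the (slower per-byte) tiled memcpy */
    src = kgem_bo_map__cpu(&sna->kgem, priv->gpu_bo);
    kgem_bo_sync__cpu_full(&sna->kgem, priv->gpu_bo, FORCE_FULL_SYNC);
    memcpy_from_tiled_x(&sna->kgem, src, dst, ...);
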
diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 3783933..46e383d 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -4028,7 +4028,7 @@ try_upload_tiled_x(PixmapPtr pixmap, RegionRec *region,
 	if (__kgem_bo_is_busy(&sna->kgem, priv->gpu_bo))
 		return false;
 
-	dst = __kgem_bo_map__cpu(&sna->kgem, priv->gpu_bo);
+	dst = kgem_bo_map__cpu(&sna->kgem, priv->gpu_bo);
 	if (dst == NULL)
 		return false;
 
@@ -4048,7 +4048,6 @@ try_upload_tiled_x(PixmapPtr pixmap, RegionRec *region,
 				  box->x2 - box->x1, box->y2 - box->y1);
 		box++;
 	} while (--n);
-	__kgem_bo_unmap__cpu(&sna->kgem, priv->gpu_bo, dst);
 
 	if (!DAMAGE_IS_ALL(priv->gpu_damage)) {
 		if (replaces) {
@@ -14345,6 +14344,62 @@ sna_get_image_blt(DrawablePtr drawable,
 	return ok;
 }
 
+static bool
+sna_get_image_tiled(DrawablePtr drawable,
+		    RegionPtr region,
+		    char *dst,
+		    unsigned flags)
+{
+	PixmapPtr pixmap = get_drawable_pixmap(drawable);
+	struct sna_pixmap *priv = sna_pixmap(pixmap);
+	struct sna *sna = to_sna_from_pixmap(pixmap);
+	char *src;
+
+	if (!sna->kgem.memcpy_from_tiled_x)
+		return false;
+
+	if (flags & MOVE_INPLACE_HINT)
+		return false;
+
+	if (priv == NULL || priv->gpu_bo == NULL)
+		return false;
+
+	if (priv->gpu_bo->tiling != I915_TILING_X)
+		return false;
+
+	if (priv->gpu_bo->scanout)
+		return false;
+
+	if (!sna->kgem.has_llc && priv->gpu_bo->domain != DOMAIN_CPU)
+		return false;
+
+	if (priv->gpu_damage == NULL ||
+	    !(DAMAGE_IS_ALL(priv->gpu_damage) ||
+	      sna_damage_contains_box__no_reduce(priv->gpu_damage,
+						 &region->extents)))
+		return false;
+
+	src = kgem_bo_map__cpu(&sna->kgem, priv->gpu_bo);
+	if (src == NULL)
+		return false;
+
+	DBG(("%s: download through a tiled CPU map\n", __FUNCTION__));
+
+	kgem_bo_sync__cpu_full(&sna->kgem, priv->gpu_bo, FORCE_FULL_SYNC);
+
+	memcpy_from_tiled_x(&sna->kgem, src, dst,
+			    pixmap->drawable.bitsPerPixel,
+			    priv->gpu_bo->pitch,
+			    PixmapBytePad(region->extents.x2 - region->extents.x1,
+					  drawable->depth),
+			    region->extents.x1, region->extents.y1,
+			    0, 0,
+			    region->extents.x2 - region->extents.x1,
+			    region->extents.y2 - region->extents.y1);
+
+	return true;
+}
+
 static void
 sna_get_image(DrawablePtr drawable,
 	      int x, int y, int w, int h,
@@ -14379,6 +14434,9 @@ sna_get_image(DrawablePtr drawable,
 	if (can_blt && sna_get_image_blt(drawable, &region, dst, flags))
 		return;
 
+	if (can_blt && sna_get_image_tiled(drawable, &region, dst, flags))
+		return;
+
 	if (!sna_drawable_move_region_to_cpu(drawable, &region, flags))
 		return;
 
commit 6493c8c65f93ad2554c2512a07ba640e966fd026
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Jun 27 10:45:13 2013 +0100

    sna: Implement memcpy_from_tiled functions (for X-tiling only atm)
    
    To provide symmetry with the ability to write into an X-tiled mapping of
    a bo, we add the memcpy_from_tiled to be able to read back from the same
    bo.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

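All four copiers added below share the same X-tile addressing: a tile is
512 bytes wide by 8 rows (4096 bytes per tile), and tiles are laid out left
to right across the pitch, so a row of tiles spans stride_tiles * tile_size
bytes. As a standalone reference, a minimal sketch of the unswizzled offset
calculation - it mirrors the arithmetic in the diff, just expressed in
bytes rather than pixels:

    /* byte offset of pixel (x, y) in an X-tiled buffer, no swizzle;
     * pitch is the stride in bytes, cpp the bytes per pixel */
    static uint32_t
    x_tiled_offset(uint32_t x, uint32_t y, uint32_t pitch, uint32_t cpp)
    {
    	const uint32_t tile_width = 512, tile_height = 8, tile_size = 4096;
    	const uint32_t stride_tiles = pitch / tile_width;
    	const uint32_t xb = x * cpp; /* x in bytes */

    	return y / tile_height * stride_tiles * tile_size /* row of tiles */
    	     + (y & (tile_height - 1)) * tile_width       /* row within tile */
    	     + xb / tile_width * tile_size                /* tile within row */
    	     + xb % tile_width;                           /* byte within row */
    }
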
diff --git a/src/sna/blt.c b/src/sna/blt.c
index b27c683..4a33093 100644
--- a/src/sna/blt.c
+++ b/src/sna/blt.c
@@ -277,6 +277,70 @@ memcpy_to_tiled_x__swizzle_0(const void *src, void *dst, int bpp,
 	}
 }
 
+fast_memcpy static void
+memcpy_from_tiled_x__swizzle_0(const void *src, void *dst, int bpp,
+			       int32_t src_stride, int32_t dst_stride,
+			       int16_t src_x, int16_t src_y,
+			       int16_t dst_x, int16_t dst_y,
+			       uint16_t width, uint16_t height)
+{
+	const unsigned tile_width = 512;
+	const unsigned tile_height = 8;
+	const unsigned tile_size = 4096;
+
+	const unsigned cpp = bpp / 8;
+	const unsigned stride_tiles = src_stride / tile_width;
+	const unsigned swizzle_pixels = tile_width / cpp;
+	const unsigned tile_pixels = ffs(tile_width / cpp) - 1;
+	const unsigned tile_mask = (1 << tile_pixels) - 1;
+
+	unsigned x, y;
+
+	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
+	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
+
+	dst = (uint8_t *)dst + dst_y * dst_stride + dst_x * cpp;
+
+	for (y = 0; y < height; ++y) {
+		const uint32_t sy = y + src_y;
+		const uint32_t tile_row =
+			(sy / tile_height * stride_tiles * tile_size +
+			 (sy & (tile_height-1)) * tile_width);
+		uint8_t *dst_row = (uint8_t *)dst + dst_stride * y;
+		uint32_t sx = src_x, offset;
+
+		x = width * cpp;
+		if (sx & (swizzle_pixels - 1)) {
+			const uint32_t swizzle_bound_pixels = ALIGN(sx + 1, swizzle_pixels);
+			const uint32_t length = min(src_x + width, swizzle_bound_pixels) - sx;
+			offset = tile_row +
+				(sx >> tile_pixels) * tile_size +
+				(sx & tile_mask) * cpp;
+			memcpy(dst_row, (const char *)src + offset, length * cpp);
+
+			dst_row += length * cpp;
+			x -= length * cpp;
+			sx += length;
+		}
+		while (x >= 512) {
+			assert((sx & tile_mask) == 0);
+			offset = tile_row + (sx >> tile_pixels) * tile_size;
+
+			memcpy(dst_row, (const char *)src + offset, 512);
+
+			dst_row += 512;
+			x -= 512;
+			sx += swizzle_pixels;
+		}
+		if (x) {
+			offset = tile_row +
+				(sx >> tile_pixels) * tile_size +
+				(sx & tile_mask) * cpp;
+			memcpy(dst_row, (const char *)src + offset, x);
+		}
+	}
+}
+
 fast_memcpy static void
 memcpy_to_tiled_x__swizzle_9(const void *src, void *dst, int bpp,
 			     int32_t src_stride, int32_t dst_stride,
@@ -347,6 +411,75 @@ memcpy_to_tiled_x__swizzle_9(const void *src, void *dst, int bpp,
 }
 
 fast_memcpy static void
+memcpy_from_tiled_x__swizzle_9(const void *src, void *dst, int bpp,
+			       int32_t src_stride, int32_t dst_stride,
+			       int16_t src_x, int16_t src_y,
+			       int16_t dst_x, int16_t dst_y,
+			       uint16_t width, uint16_t height)
+{
+	const unsigned tile_width = 512;
+	const unsigned tile_height = 8;
+	const unsigned tile_size = 4096;
+
+	const unsigned cpp = bpp / 8;
+	const unsigned stride_tiles = src_stride / tile_width;
+	const unsigned swizzle_pixels = 64 / cpp;
+	const unsigned tile_pixels = ffs(tile_width / cpp) - 1;
+	const unsigned tile_mask = (1 << tile_pixels) - 1;
+
+	unsigned x, y;
+
+	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
+	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
+
+	dst = (uint8_t *)dst + dst_y * dst_stride + dst_x * cpp;
+
+	for (y = 0; y < height; ++y) {
+		const uint32_t sy = y + src_y;
+		const uint32_t tile_row =
+			(sy / tile_height * stride_tiles * tile_size +
+			 (sy & (tile_height-1)) * tile_width);
+		uint8_t *dst_row = (uint8_t *)dst + dst_stride * y;
+		uint32_t sx = src_x, offset;
+
+		x = width * cpp;
+		if (sx & (swizzle_pixels - 1)) {
+			const uint32_t swizzle_bound_pixels = ALIGN(sx + 1, swizzle_pixels);
+			const uint32_t length = min(src_x + width, swizzle_bound_pixels) - sx;
+			offset = tile_row +
+				(sx >> tile_pixels) * tile_size +
+				(sx & tile_mask) * cpp;
+			offset ^= (offset >> 3) & 64;
+
+			memcpy(dst_row, (const char *)src + offset, length * cpp);
+
+			dst_row += length * cpp;
+			x -= length * cpp;
+			sx += length;
+		}
+		while (x >= 64) {
+			offset = tile_row +
+				(sx >> tile_pixels) * tile_size +
+				(sx & tile_mask) * cpp;
+			offset ^= (offset >> 3) & 64;
+
+			memcpy(dst_row, (const char *)src + offset, 64);
+
+			dst_row += 64;
+			x -= 64;
+			sx += swizzle_pixels;
+		}
+		if (x) {
+			offset = tile_row +
+				(sx >> tile_pixels) * tile_size +
+				(sx & tile_mask) * cpp;
+			offset ^= (offset >> 3) & 64;
+			memcpy(dst_row, (const char *)src + offset, x);
+		}
+	}
+}
+
+fast_memcpy static void
 memcpy_to_tiled_x__swizzle_9_10(const void *src, void *dst, int bpp,
 				int32_t src_stride, int32_t dst_stride,
 				int16_t src_x, int16_t src_y,
@@ -416,6 +549,75 @@ memcpy_to_tiled_x__swizzle_9_10(const void *src, void *dst, int bpp,
 }
 
 fast_memcpy static void
+memcpy_from_tiled_x__swizzle_9_10(const void *src, void *dst, int bpp,
+				  int32_t src_stride, int32_t dst_stride,
+				  int16_t src_x, int16_t src_y,
+				  int16_t dst_x, int16_t dst_y,
+				  uint16_t width, uint16_t height)
+{
+	const unsigned tile_width = 512;
+	const unsigned tile_height = 8;
+	const unsigned tile_size = 4096;
+
+	const unsigned cpp = bpp / 8;
+	const unsigned stride_tiles = src_stride / tile_width;
+	const unsigned swizzle_pixels = 64 / cpp;
+	const unsigned tile_pixels = ffs(tile_width / cpp) - 1;
+	const unsigned tile_mask = (1 << tile_pixels) - 1;
+
+	unsigned x, y;
+
+	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
+	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
+
+	dst = (uint8_t *)dst + dst_y * dst_stride + dst_x * cpp;
+
+	for (y = 0; y < height; ++y) {
+		const uint32_t sy = y + src_y;
+		const uint32_t tile_row =
+			(sy / tile_height * stride_tiles * tile_size +
+			 (sy & (tile_height-1)) * tile_width);
+		uint8_t *dst_row = (uint8_t *)dst + dst_stride * y;
+		uint32_t sx = src_x, offset;
+
+		x = width * cpp;
+		if (sx & (swizzle_pixels - 1)) {
+			const uint32_t swizzle_bound_pixels = ALIGN(sx + 1, swizzle_pixels);
+			const uint32_t length = min(src_x + width, swizzle_bound_pixels) - sx;
+			offset = tile_row +
+				(sx >> tile_pixels) * tile_size +
+				(sx & tile_mask) * cpp;
+			offset ^= ((offset ^ (offset >> 1)) >> 3) & 64;
+
+			memcpy(dst_row, (const char *)src + offset, length * cpp);
+
+			dst_row += length * cpp;
+			x -= length * cpp;
+			sx += length;
+		}
+		while (x >= 64) {
+			offset = tile_row +
+				(sx >> tile_pixels) * tile_size +
+				(sx & tile_mask) * cpp;
+			offset ^= ((offset ^ (offset >> 1)) >> 3) & 64;
+
+			memcpy(dst_row, (const char *)src + offset, 64);
+
+			dst_row += 64;
+			x -= 64;
+			sx += swizzle_pixels;
+		}
+		if (x) {
+			offset = tile_row +
+				(sx >> tile_pixels) * tile_size +
+				(sx & tile_mask) * cpp;
+			offset ^= ((offset ^ (offset >> 1)) >> 3) & 64;
+			memcpy(dst_row, (const char *)src + offset, x);
+		}
+	}
+}
+
+fast_memcpy static void
 memcpy_to_tiled_x__swizzle_9_11(const void *src, void *dst, int bpp,
 				int32_t src_stride, int32_t dst_stride,
 				int16_t src_x, int16_t src_y,
@@ -483,7 +685,75 @@ memcpy_to_tiled_x__swizzle_9_11(const void *src, void *dst, int bpp,
 	}
 }
 
-void choose_memcpy_to_tiled_x(struct kgem *kgem, int swizzling)
+fast_memcpy static void
+memcpy_from_tiled_x__swizzle_9_11(const void *src, void *dst, int bpp,
+				  int32_t src_stride, int32_t dst_stride,
+				  int16_t src_x, int16_t src_y,
+				  int16_t dst_x, int16_t dst_y,
+				  uint16_t width, uint16_t height)
+{
+	const unsigned tile_width = 512;
+	const unsigned tile_height = 8;
+	const unsigned tile_size = 4096;
+
+	const unsigned cpp = bpp / 8;
+	const unsigned stride_tiles = src_stride / tile_width;
+	const unsigned swizzle_pixels = 64 / cpp;
+	const unsigned tile_pixels = ffs(tile_width / cpp) - 1;
+	const unsigned tile_mask = (1 << tile_pixels) - 1;
+
+	unsigned x, y;
+
+	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
+	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
+
+	dst = (uint8_t *)dst + dst_y * dst_stride + dst_x * cpp;
+
+	for (y = 0; y < height; ++y) {
+		const uint32_t sy = y + src_y;
+		const uint32_t tile_row =
+			(sy / tile_height * stride_tiles * tile_size +
+			 (sy & (tile_height-1)) * tile_width);
+		uint8_t *dst_row = (uint8_t *)dst + dst_stride * y;
+		uint32_t sx = src_x, offset;
+
+		x = width * cpp;
+		if (sx & (swizzle_pixels - 1)) {
+			const uint32_t swizzle_bound_pixels = ALIGN(sx + 1, swizzle_pixels);
+			const uint32_t length = min(src_x + width, swizzle_bound_pixels) - sx;
+			offset = tile_row +
+				(sx >> tile_pixels) * tile_size +
+				(sx & tile_mask) * cpp;
+			offset ^= ((offset ^ (offset >> 2)) >> 3) & 64;
+			memcpy(dst_row, (const char *)src + offset, length * cpp);
+
+			dst_row += length * cpp;
+			x -= length * cpp;
+			sx += length;
+		}
+		while (x >= 64) {
+			offset = tile_row +
+				(sx >> tile_pixels) * tile_size +
+				(sx & tile_mask) * cpp;
+			offset ^= ((offset ^ (offset >> 2)) >> 3) & 64;
+
+			memcpy(dst_row, (const char *)src + offset, 64);
+
+			dst_row += 64;
+			x -= 64;
+			sx += swizzle_pixels;
+		}
+		if (x) {
+			offset = tile_row +
+				(sx >> tile_pixels) * tile_size +
+				(sx & tile_mask) * cpp;
+			offset ^= ((offset ^ (offset >> 2)) >> 3) & 64;
+			memcpy(dst_row, (const char *)src + offset, x);
+		}
+	}
+}
+
+void choose_memcpy_tiled_x(struct kgem *kgem, int swizzling)
 {
 	switch (swizzling) {
 	default:
@@ -492,18 +762,22 @@ void choose_memcpy_to_tiled_x(struct kgem *kgem, int swizzling)
 	case I915_BIT_6_SWIZZLE_NONE:
 		DBG(("%s: no swizzling\n", __FUNCTION__));
 		kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__swizzle_0;
+		kgem->memcpy_from_tiled_x = memcpy_from_tiled_x__swizzle_0;
 		break;
 	case I915_BIT_6_SWIZZLE_9:
 		DBG(("%s: 6^9 swizzling\n", __FUNCTION__));
 		kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__swizzle_9;
+		kgem->memcpy_from_tiled_x = memcpy_from_tiled_x__swizzle_9;
 		break;
 	case I915_BIT_6_SWIZZLE_9_10:
 		DBG(("%s: 6^9^10 swizzling\n", __FUNCTION__));
 		kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__swizzle_9_10;
+		kgem->memcpy_from_tiled_x = memcpy_from_tiled_x__swizzle_9_10;
 		break;
 	case I915_BIT_6_SWIZZLE_9_11:
 		DBG(("%s: 6^9^11 swizzling\n", __FUNCTION__));
 		kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__swizzle_9_11;
+		kgem->memcpy_from_tiled_x = memcpy_from_tiled_x__swizzle_9_11;
 		break;
 	}
 }
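The swizzled variants differ only in how bit 6 of each byte offset is XORed
with higher address bits to undo the memory controller's channel interleave:
bit 9 for SWIZZLE_9, bits 9 and 10 for SWIZZLE_9_10, and bits 9 and 11 for
SWIZZLE_9_11. Each is packed into a single shift-and-mask; expanding the
9_10 case as a check:

    /* (offset >> 3) & 64 == bit 9 of offset, shifted down to bit 6
     * (offset >> 4) & 64 == bit 10 of offset, shifted down to bit 6
     * so this flips bit 6 by bit9 ^ bit10, as SWIZZLE_9_10 requires: */
    offset ^= ((offset ^ (offset >> 1)) >> 3) & 64;
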
diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 55c4fe5..5b78c83 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -991,7 +991,7 @@ static void kgem_init_swizzling(struct kgem *kgem)
 	if (drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_GET_TILING, &tiling))
 		goto out;
 
-	choose_memcpy_to_tiled_x(kgem, tiling.swizzle_mode);
+	choose_memcpy_tiled_x(kgem, tiling.swizzle_mode);
 out:
 	gem_close(kgem->fd, tiling.handle);
 }
diff --git a/src/sna/kgem.h b/src/sna/kgem.h
index 91a38f7..d1a391a 100644
--- a/src/sna/kgem.h
+++ b/src/sna/kgem.h
@@ -201,6 +201,11 @@ struct kgem {
 				  int16_t src_x, int16_t src_y,
 				  int16_t dst_x, int16_t dst_y,
 				  uint16_t width, uint16_t height);
+	void (*memcpy_from_tiled_x)(const void *src, void *dst, int bpp,
+				    int32_t src_stride, int32_t dst_stride,
+				    int16_t src_x, int16_t src_y,
+				    int16_t dst_x, int16_t dst_y,
+				    uint16_t width, uint16_t height);
 
 	uint16_t reloc__self[256];
 	uint32_t batch[64*1024-8] page_aligned;
@@ -713,6 +718,21 @@ memcpy_to_tiled_x(struct kgem *kgem,
 				       width, height);
 }
 
-void choose_memcpy_to_tiled_x(struct kgem *kgem, int swizzling);
+static inline void
+memcpy_from_tiled_x(struct kgem *kgem,
+		    const void *src, void *dst, int bpp,
+		    int32_t src_stride, int32_t dst_stride,
+		    int16_t src_x, int16_t src_y,
+		    int16_t dst_x, int16_t dst_y,
+		    uint16_t width, uint16_t height)
+{
+	return kgem->memcpy_from_tiled_x(src, dst, bpp,
+					 src_stride, dst_stride,
+					 src_x, src_y,
+					 dst_x, dst_y,
+					 width, height);
+}
+
+void choose_memcpy_tiled_x(struct kgem *kgem, int swizzling);
 
 #endif /* KGEM_H */

