xf86-video-intel: 6 commits - src/sna/blt.c src/sna/gen2_render.c src/sna/gen3_render.c src/sna/gen4_render.c src/sna/gen5_render.c src/sna/gen6_render.c src/sna/gen7_render.c src/sna/kgem.c src/sna/kgem_debug_gen5.c src/sna/kgem.h src/sna/sna_accel.c src/sna/sna_blt.c src/sna/sna.h src/sna/sna_io.c src/sna/sna_render.c src/sna/sna_render.h src/sna/sna_trapezoids.c src/sna/sna_video.c src/sna/sna_video_textured.c

Chris Wilson ickle at kemper.freedesktop.org
Sun Jan 29 06:55:29 PST 2012


 src/sna/blt.c                |    2 
 src/sna/gen2_render.c        |   21 +
 src/sna/gen3_render.c        |   16 -
 src/sna/gen4_render.c        |    1 
 src/sna/gen5_render.c        |    1 
 src/sna/gen6_render.c        |   19 +
 src/sna/gen7_render.c        |    1 
 src/sna/kgem.c               |  464 ++++++++++++++++++++++++++++---------------
 src/sna/kgem.h               |   69 +++++-
 src/sna/kgem_debug_gen5.c    |    2 
 src/sna/sna.h                |   11 -
 src/sna/sna_accel.c          |  166 +++++++++------
 src/sna/sna_blt.c            |   58 ++---
 src/sna/sna_io.c             |  447 ++++++++++++++++++++++++++++++-----------
 src/sna/sna_render.c         |  188 ++++++++++++-----
 src/sna/sna_render.h         |    1 
 src/sna/sna_trapezoids.c     |    2 
 src/sna/sna_video.c          |    2 
 src/sna/sna_video_textured.c |    2 
 19 files changed, 1010 insertions(+), 463 deletions(-)

New commits:
commit d3fb1e1e89ccf5cefe6add66de4f960ef07cac60
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sun Jan 29 14:20:33 2012 +0000

    sna: Handle GPU creation failure when uploading subtexture
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_render.c b/src/sna/sna_render.c
index abc6325..cbaaafd 100644
--- a/src/sna/sna_render.c
+++ b/src/sna/sna_render.c
@@ -1043,20 +1043,23 @@ sna_render_picture_extract(struct sna *sna,
 		if (!sna_pixmap_move_to_cpu(pixmap, MOVE_READ))
 			return 0;
 	} else {
-		if (texture_is_cpu(pixmap, &box) &&
-		    !move_to_gpu(pixmap, &box)) {
-			bo = kgem_upload_source_image(&sna->kgem,
-						      pixmap->devPrivate.ptr,
-						      &box,
-						      pixmap->devKind,
-						      pixmap->drawable.bitsPerPixel);
-		} else {
+		bool upload = true;
+		if (!texture_is_cpu(pixmap, &box) &&
+		    move_to_gpu(pixmap, &box)) {
 			struct sna_pixmap *priv;
 
 			priv = sna_pixmap_move_to_gpu(pixmap, MOVE_READ);
-			if (priv)
+			if (priv) {
 				src_bo = priv->gpu_bo;
+				upload = false;
+			}
 		}
+		if (upload)
+			bo = kgem_upload_source_image(&sna->kgem,
+						      pixmap->devPrivate.ptr,
+						      &box,
+						      pixmap->devKind,
+						      pixmap->drawable.bitsPerPixel);
 	}
 	if (src_bo) {
 		bo = kgem_create_2d(&sna->kgem, w, h,
diff --git a/src/sna/sna_trapezoids.c b/src/sna/sna_trapezoids.c
index 2fd1eaf..eb6c968 100644
--- a/src/sna/sna_trapezoids.c
+++ b/src/sna/sna_trapezoids.c
@@ -3973,6 +3973,8 @@ trap_mask_converter(PicturePtr picture,
 
 	pixmap = get_drawable_pixmap(picture->pDrawable);
 	priv = sna_pixmap_move_to_gpu(pixmap, MOVE_READ | MOVE_WRITE);
+	if (priv == NULL)
+		return false;
 
 	/* XXX strict adherence to the Render specification */
 	if (picture->polyMode == PolyModePrecise) {
commit 518a99ea34b26aa094f29a4cc1ea5419f63a0e56
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sun Jan 29 14:09:46 2012 +0000

    sna: Always create a GPU bo for copying from an existent source GPU bo
    
    Make sure we prevent the readback of an active source GPU bo by always
    preferring to do the copy on the GPU if the data is already resident.
    This fixes the second regression from e583af9cc (sna: Experiment with
    creating large objects as CPU bo).
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index ce35113..5d0e042 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -3211,7 +3211,7 @@ sna_copy_boxes(DrawablePtr src, DrawablePtr dst, GCPtr gc,
 	}
 
 	/* Try to maintain the data on the GPU */
-	if (dst_priv->gpu_bo == NULL && dst_priv->gpu &&
+	if (dst_priv->gpu_bo == NULL &&
 	    ((dst_priv->cpu_damage == NULL && copy_use_gpu_bo(sna, dst_priv, &region)) ||
 	     (src_priv && (src_priv->gpu_bo != NULL || (src_priv->cpu_bo && kgem_bo_is_busy(src_priv->cpu_bo)))))) {
 		uint32_t tiling = sna_pixmap_choose_tiling(dst_pixmap);
commit 624d9843abda9ca6bd1b004d70a6fdc082ba9653
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sun Jan 29 13:55:20 2012 +0000

    sna: Ignore map status and pick the first inactive bo for reuse
    
    This fixes the performance regression introduced with e583af9cca
    (sna: Experiment with creating large objects as CPU bo), as we ended up
    creating fresh bo and incurring setup and thrashing overhead when we
    already had plenty cached.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index d062a1d..1d7b8e9 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -2501,9 +2501,6 @@ search_inactive:
 			continue;
 		}
 
-		if ((flags & CREATE_CPU_MAP) == 0 && IS_CPU_MAP(bo->map))
-			continue;
-
 		if (bo->tiling != tiling ||
 		    (tiling != I915_TILING_NONE && bo->pitch != pitch)) {
 			if (tiling != gem_set_tiling(kgem->fd,
commit 5c6255ba2f12f04938fd586ca02562ee3cae05af
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sun Jan 29 11:02:38 2012 +0000

    sna: Determine whether to use a partial proxy based on the pitch
    
    On gen4+ devices the maximum render pitch is much larger than what is
    strictly required for the maximum coordinates. This makes it possible to
    use a proxy texture as a subimage into the oversized texture without
    having to blit into a temporary copy for virtually every single bo we
    use.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
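
A condensed sketch of the alignment and offset arithmetic this patch adds to
sna_render_picture_partial() and sna_render_composite_redirect(). Variable
names are shortened; bpp and h stand for the pixmap's bitsPerPixel and the
aligned box height, and the clamping to the drawable size is omitted. This
is illustrative, not a verbatim excerpt of the hunks below:

    int tw, th, ts;    /* tile width (bytes), height (rows), size (bytes) */

    kgem_get_tile_size(&sna->kgem, bo->tiling, &tw, &th, &ts);

    /* Round the box out to an even tile row and a whole tile column */
    box.y1 &= ~(2*th - 1);
    box.y2  = ALIGN(box.y2, 2*th);
    box.x1 &= ~(tw * 8 / bpp - 1);
    box.x2  = ALIGN(box.x2, tw * 8 / bpp);

    /* How many whole tiles across? Each contributes tile_size bytes. */
    offset = box.x1 * bpp / 8 / tw * ts;

    /* The proxy aliases a band of the parent bo and keeps its pitch */
    proxy = kgem_create_proxy(bo, box.y1 * bo->pitch + offset, h * bo->pitch);
    proxy->pitch = bo->pitch;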

diff --git a/src/sna/gen2_render.c b/src/sna/gen2_render.c
index 2a97cea..398988a 100644
--- a/src/sna/gen2_render.c
+++ b/src/sna/gen2_render.c
@@ -55,6 +55,7 @@
 #define PREFER_BLT_COPY 1
 
 #define MAX_3D_SIZE 2048
+#define MAX_3D_PITCH 8192
 
 #define BATCH(v) batch_emit(sna, v)
 #define BATCH_F(v) batch_emit_float(sna, v)
@@ -547,7 +548,7 @@ gen2_get_batch(struct sna *sna)
 
 static void gen2_emit_target(struct sna *sna, const struct sna_composite_op *op)
 {
-	assert(op->dst.bo->pitch >= 8 && op->dst.bo->pitch <= 8192);
+	assert(op->dst.bo->pitch >= 8 && op->dst.bo->pitch <= MAX_3D_PITCH);
 	assert(sna->render_state.gen2.vertex_offset == 0);
 
 	if (sna->render_state.gen2.target == op->dst.bo->unique_id) {
@@ -1736,7 +1737,7 @@ gen2_render_composite(struct sna *sna,
 
 	tmp->op = op;
 	if (too_large(tmp->dst.width, tmp->dst.height) ||
-	    tmp->dst.bo->pitch > 8192) {
+	    tmp->dst.bo->pitch > MAX_3D_PITCH) {
 		if (!sna_render_composite_redirect(sna, tmp,
 						   dst_x, dst_y, width, height))
 			return FALSE;
@@ -2192,7 +2193,7 @@ gen2_render_composite_spans(struct sna *sna,
 
 	tmp->base.op = op;
 	if (too_large(tmp->base.dst.width, tmp->base.dst.height) ||
-	    tmp->base.dst.bo->pitch > 8192) {
+	    tmp->base.dst.bo->pitch > MAX_3D_PITCH) {
 		if (!sna_render_composite_redirect(sna, &tmp->base,
 						   dst_x, dst_y, width, height))
 			return FALSE;
@@ -2388,7 +2389,7 @@ gen2_render_fill_boxes(struct sna *sna,
 	     color->red, color->green, color->blue, color->alpha));
 
 	if (too_large(dst->drawable.width, dst->drawable.height) ||
-	    dst_bo->pitch < 8 || dst_bo->pitch > 8192 ||
+	    dst_bo->pitch < 8 || dst_bo->pitch > MAX_3D_PITCH ||
 	    !gen2_check_dst_format(format)) {
 		DBG(("%s: try blt, too large or incompatible destination\n",
 		     __FUNCTION__));
@@ -2589,7 +2590,7 @@ gen2_render_fill(struct sna *sna, uint8_t alu,
 
 	/* Must use the BLT if we can't RENDER... */
 	if (too_large(dst->drawable.width, dst->drawable.height) ||
-	    dst_bo->pitch < 8 || dst_bo->pitch > 8192)
+	    dst_bo->pitch < 8 || dst_bo->pitch > MAX_3D_PITCH)
 		return sna_blt_fill(sna, alu,
 				    dst_bo, dst->drawable.bitsPerPixel,
 				    color,
@@ -2665,7 +2666,7 @@ gen2_render_fill_one(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo,
 
 	/* Must use the BLT if we can't RENDER... */
 	if (too_large(dst->drawable.width, dst->drawable.height) ||
-	    bo->pitch < 8 || bo->pitch > 8192)
+	    bo->pitch < 8 || bo->pitch > MAX_3D_PITCH)
 		return gen2_render_fill_one_try_blt(sna, dst, bo, color,
 						    x1, y1, x2, y2, alu);
 
@@ -2832,9 +2833,9 @@ gen2_render_copy_boxes(struct sna *sna, uint8_t alu,
 
 	if (src_bo == dst_bo || /* XXX handle overlap using 3D ? */
 	    too_large(src->drawable.width, src->drawable.height) ||
-	    src_bo->pitch > 8192 ||
+	    src_bo->pitch > MAX_3D_PITCH ||
 	    too_large(dst->drawable.width, dst->drawable.height) ||
-	    dst_bo->pitch < 8 || dst_bo->pitch > 8192)
+	    dst_bo->pitch < 8 || dst_bo->pitch > MAX_3D_PITCH)
 		return sna_blt_copy_boxes_fallback(sna, alu,
 						   src, src_bo, src_dx, src_dy,
 						   dst, dst_bo, dst_dx, dst_dy,
@@ -2957,7 +2958,8 @@ gen2_render_copy(struct sna *sna, uint8_t alu,
 	/* Must use the BLT if we can't RENDER... */
 	if (too_large(src->drawable.width, src->drawable.height) ||
 	    too_large(dst->drawable.width, dst->drawable.height) ||
-	    src_bo->pitch > 8192 || dst_bo->pitch < 8 || dst_bo->pitch > 8192) {
+	    src_bo->pitch > MAX_3D_PITCH ||
+	    dst_bo->pitch < 8 || dst_bo->pitch > MAX_3D_PITCH) {
 		if (!sna_blt_compare_depth(&src->drawable, &dst->drawable))
 			return FALSE;
 
@@ -3045,5 +3047,6 @@ Bool gen2_render_init(struct sna *sna)
 	render->flush = gen2_render_flush;
 
 	render->max_3d_size = MAX_3D_SIZE;
+	render->max_3d_pitch = MAX_3D_PITCH;
 	return TRUE;
 }
diff --git a/src/sna/gen3_render.c b/src/sna/gen3_render.c
index 931142d..da90d82 100644
--- a/src/sna/gen3_render.c
+++ b/src/sna/gen3_render.c
@@ -65,6 +65,7 @@ enum {
 };
 
 #define MAX_3D_SIZE 2048
+#define MAX_3D_PITCH 8192
 
 #define OUT_BATCH(v) batch_emit(sna, v)
 #define OUT_BATCH_F(v) batch_emit_float(sna, v)
@@ -143,7 +144,7 @@ static inline uint32_t gen3_buf_tiling(uint32_t tiling)
 static inline Bool
 gen3_check_pitch_3d(struct kgem_bo *bo)
 {
-	return bo->pitch <= 8192;
+	return bo->pitch <= MAX_3D_PITCH;
 }
 
 static uint32_t gen3_get_blend_cntl(int op,
@@ -3826,9 +3827,9 @@ gen3_render_copy_boxes(struct sna *sna, uint8_t alu,
 
 	if (!(alu == GXcopy || alu == GXclear) ||
 	    src_bo == dst_bo || /* XXX handle overlap using 3D ? */
-	    src_bo->pitch > 8192 ||
+	    src_bo->pitch > MAX_3D_PITCH ||
 	    too_large(src->drawable.width, src->drawable.height) ||
-	    dst_bo->pitch > 8192 ||
+	    dst_bo->pitch > MAX_3D_PITCH ||
 	    too_large(dst->drawable.width, dst->drawable.height))
 		return sna_blt_copy_boxes_fallback(sna, alu,
 						   src, src_bo, src_dx, src_dy,
@@ -3959,7 +3960,7 @@ gen3_render_copy(struct sna *sna, uint8_t alu,
 	if (!(alu == GXcopy || alu == GXclear) ||
 	    too_large(src->drawable.width, src->drawable.height) ||
 	    too_large(dst->drawable.width, dst->drawable.height) ||
-	    src_bo->pitch > 8192 || dst_bo->pitch > 8192) {
+	    src_bo->pitch > MAX_3D_PITCH || dst_bo->pitch > MAX_3D_PITCH) {
 		if (!sna_blt_compare_depth(&src->drawable, &dst->drawable))
 			return FALSE;
 
@@ -4083,7 +4084,7 @@ gen3_render_fill_boxes(struct sna *sna,
 	     color->red, color->green, color->blue, color->alpha));
 
 	if (too_large(dst->drawable.width, dst->drawable.height) ||
-	    dst_bo->pitch > 8192 ||
+	    dst_bo->pitch > MAX_3D_PITCH ||
 	    !gen3_check_dst_format(format)) {
 		DBG(("%s: try blt, too large or incompatible destination\n",
 		     __FUNCTION__));
@@ -4265,7 +4266,7 @@ gen3_render_fill(struct sna *sna, uint8_t alu,
 	/* Must use the BLT if we can't RENDER... */
 	if (!(alu == GXcopy || alu == GXclear) ||
 	    too_large(dst->drawable.width, dst->drawable.height) ||
-	    dst_bo->pitch > 8192)
+	    dst_bo->pitch > MAX_3D_PITCH)
 		return sna_blt_fill(sna, alu,
 				    dst_bo, dst->drawable.bitsPerPixel,
 				    color,
@@ -4346,7 +4347,7 @@ gen3_render_fill_one(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo,
 	/* Must use the BLT if we can't RENDER... */
 	if (!(alu == GXcopy || alu == GXclear) ||
 	    too_large(dst->drawable.width, dst->drawable.height) ||
-	    bo->pitch > 8192)
+	    bo->pitch > MAX_3D_PITCH)
 		return gen3_render_fill_one_try_blt(sna, dst, bo, color,
 						    x1, y1, x2, y2, alu);
 
@@ -4424,5 +4425,6 @@ Bool gen3_render_init(struct sna *sna)
 	render->fini = gen3_render_fini;
 
 	render->max_3d_size = MAX_3D_SIZE;
+	render->max_3d_pitch = MAX_3D_PITCH;
 	return TRUE;
 }
diff --git a/src/sna/gen4_render.c b/src/sna/gen4_render.c
index 91d5f49..d9542ea 100644
--- a/src/sna/gen4_render.c
+++ b/src/sna/gen4_render.c
@@ -3220,5 +3220,6 @@ Bool gen4_render_init(struct sna *sna)
 	sna->render.fini = gen4_render_fini;
 
 	sna->render.max_3d_size = 8192;
+	sna->render.max_3d_pitch = 1 << 18;
 	return TRUE;
 }
diff --git a/src/sna/gen5_render.c b/src/sna/gen5_render.c
index 2c6d020..3465121 100644
--- a/src/sna/gen5_render.c
+++ b/src/sna/gen5_render.c
@@ -3702,5 +3702,6 @@ Bool gen5_render_init(struct sna *sna)
 	sna->render.fini = gen5_render_fini;
 
 	sna->render.max_3d_size = MAX_3D_SIZE;
+	sna->render.max_3d_pitch = 1 << 18;
 	return TRUE;
 }
diff --git a/src/sna/gen6_render.c b/src/sna/gen6_render.c
index 41e05d0..0d244f1 100644
--- a/src/sna/gen6_render.c
+++ b/src/sna/gen6_render.c
@@ -4044,5 +4044,6 @@ Bool gen6_render_init(struct sna *sna)
 	sna->render.fini = gen6_render_fini;
 
 	sna->render.max_3d_size = GEN6_MAX_SIZE;
+	sna->render.max_3d_pitch = 1 << 18;
 	return TRUE;
 }
diff --git a/src/sna/gen7_render.c b/src/sna/gen7_render.c
index 282b724..ff04631 100644
--- a/src/sna/gen7_render.c
+++ b/src/sna/gen7_render.c
@@ -4097,5 +4097,6 @@ Bool gen7_render_init(struct sna *sna)
 	sna->render.fini = gen7_render_fini;
 
 	sna->render.max_3d_size = GEN7_MAX_SIZE;
+	sna->render.max_3d_pitch = 1 << 18;
 	return TRUE;
 }
diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 208c8f2..d062a1d 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -704,6 +704,39 @@ static uint32_t kgem_untiled_pitch(struct kgem *kgem,
 	return ALIGN(width, scanout ? 64 : kgem->min_alignment);
 }
 
+void kgem_get_tile_size(struct kgem *kgem, int tiling,
+			int *tile_width, int *tile_height, int *tile_size)
+{
+	if (kgem->gen < 30) {
+		if (tiling) {
+			*tile_width = 512;
+			*tile_height = 16;
+			*tile_size = 2048;
+		} else {
+			*tile_width = 1;
+			*tile_height = 1;
+			*tile_size = 1;
+		}
+	} else switch (tiling) {
+	default:
+	case I915_TILING_NONE:
+		*tile_width = 1;
+		*tile_height = 1;
+		*tile_size = 1;
+		break;
+	case I915_TILING_X:
+		*tile_width = 512;
+		*tile_height = 8;
+		*tile_size = 4096;
+		break;
+	case I915_TILING_Y:
+		*tile_width = kgem->gen <= 30 ? 512 : 128;
+		*tile_height = 32;
+		*tile_size = 4096;
+		break;
+	}
+}
+
 static uint32_t kgem_surface_size(struct kgem *kgem,
 				  bool relaxed_fencing,
 				  bool scanout,
diff --git a/src/sna/kgem.h b/src/sna/kgem.h
index 2631e81..f386967 100644
--- a/src/sna/kgem.h
+++ b/src/sna/kgem.h
@@ -361,6 +361,8 @@ Bool kgem_bo_write(struct kgem *kgem, struct kgem_bo *bo,
 		   const void *data, int length);
 
 int kgem_bo_fenced_size(struct kgem *kgem, struct kgem_bo *bo);
+void kgem_get_tile_size(struct kgem *kgem, int tiling,
+			int *tile_width, int *tile_height, int *tile_size);
 
 static inline int kgem_bo_size(struct kgem_bo *bo)
 {
diff --git a/src/sna/sna_render.c b/src/sna/sna_render.c
index 7077f36..abc6325 100644
--- a/src/sna/sna_render.c
+++ b/src/sna/sna_render.c
@@ -816,10 +816,27 @@ sna_render_picture_partial(struct sna *sna,
 	struct kgem_bo *bo = NULL;
 	PixmapPtr pixmap = get_drawable_pixmap(picture->pDrawable);
 	BoxRec box;
+	int tile_width, tile_height, tile_size;
+	int offset;
 
 	DBG(("%s (%d, %d)x(%d, %d) [dst=(%d, %d)]\n",
 	     __FUNCTION__, x, y, w, h, dst_x, dst_y));
 
+	if (use_cpu_bo(sna, pixmap, &box)) {
+		if (!sna_pixmap_move_to_cpu(pixmap, MOVE_READ))
+			return 0;
+
+		bo = sna_pixmap(pixmap)->cpu_bo;
+	} else {
+		if (!sna_pixmap_force_to_gpu(pixmap, MOVE_READ))
+			return 0;
+
+		bo = sna_pixmap(pixmap)->gpu_bo;
+	}
+
+	if (bo->pitch > sna->render.max_3d_pitch)
+		return 0;
+
 	box.x1 = x;
 	box.y1 = y;
 	box.x2 = x + w;
@@ -855,51 +872,65 @@ sna_render_picture_partial(struct sna *sna,
 		}
 	}
 
-	/* Presume worst case tile-row alignment for Y-tiling */
-	box.y1 = box.y1 & (64 - 1);
-	box.y2 = ALIGN(box.y2, 64);
+	kgem_get_tile_size(&sna->kgem, bo->tiling,
+			   &tile_width, &tile_height, &tile_size);
+
+	/* Ensure we align to an even tile row */
+	box.y1 = box.y1 & ~(2*tile_height - 1);
+	box.y2 = ALIGN(box.y2, 2*tile_height);
+	if (box.y2 > pixmap->drawable.height)
+		box.y2 = pixmap->drawable.height;
+
+	box.x1 = box.x1 & ~(tile_width * 8 / pixmap->drawable.bitsPerPixel - 1);
+	box.x2 = ALIGN(box.x2, tile_width * 8 / pixmap->drawable.bitsPerPixel);
+	if (box.x2 > pixmap->drawable.width)
+		box.x2 = pixmap->drawable.width;
+
 	w = box.x2 - box.x1;
 	h = box.y2 - box.y1;
 	DBG(("%s box=(%d, %d), (%d, %d): (%d, %d)/(%d, %d)\n", __FUNCTION__,
 	     box.x1, box.y1, box.x2, box.y2, w, h,
 	     pixmap->drawable.width, pixmap->drawable.height));
-	if (w <= 0 || h <= 0 || h > sna->render.max_3d_size)
+	if (w <= 0 || h <= 0 ||
+	    w > sna->render.max_3d_size ||
+	    h > sna->render.max_3d_size)
 		return 0;
 
-	memset(&channel->embedded_transform,
-	       0,
-	       sizeof(channel->embedded_transform));
-	channel->embedded_transform.matrix[0][0] = 1 << 16;
-	channel->embedded_transform.matrix[0][2] = 0;
-	channel->embedded_transform.matrix[1][1] = 1 << 16;
-	channel->embedded_transform.matrix[1][2] = -box.y1 << 16;
-	channel->embedded_transform.matrix[2][2] = 1 << 16;
-	if (channel->transform)
+	/* How many tiles across are we? */
+	offset = box.x1 * pixmap->drawable.bitsPerPixel / 8 / tile_width * tile_size;
+	channel->bo = kgem_create_proxy(bo,
+					box.y1 * bo->pitch + offset,
+					h * bo->pitch);
+	if (channel->bo == NULL)
+		return 0;
+
+	channel->bo->pitch = bo->pitch;
+
+	if (channel->transform) {
+		memset(&channel->embedded_transform,
+		       0,
+		       sizeof(channel->embedded_transform));
+		channel->embedded_transform.matrix[0][0] = 1 << 16;
+		channel->embedded_transform.matrix[0][2] = -box.x1 << 16;
+		channel->embedded_transform.matrix[1][1] = 1 << 16;
+		channel->embedded_transform.matrix[1][2] = -box.y1 << 16;
+		channel->embedded_transform.matrix[2][2] = 1 << 16;
 		pixman_transform_multiply(&channel->embedded_transform,
 					  &channel->embedded_transform,
 					  channel->transform);
-	channel->transform = &channel->embedded_transform;
-
-	if (use_cpu_bo(sna, pixmap, &box)) {
-		if (!sna_pixmap_move_to_cpu(pixmap, MOVE_READ))
-			return 0;
-
-		bo = sna_pixmap(pixmap)->cpu_bo;
+		channel->transform = &channel->embedded_transform;
 	} else {
-		if (!sna_pixmap_force_to_gpu(pixmap, MOVE_READ))
-			return 0;
-
-		bo = sna_pixmap(pixmap)->gpu_bo;
+		x -= box.x1;
+		y -= box.y1;
 	}
 
 	channel->offset[0] = x - dst_x;
 	channel->offset[1] = y - dst_y;
-	channel->scale[0] = 1.f/pixmap->drawable.width;
+	channel->scale[0] = 1.f/w;
 	channel->scale[1] = 1.f/h;
-	channel->width  = pixmap->drawable.width;
+	channel->width  = w;
 	channel->height = h;
-	channel->bo = kgem_create_proxy(bo, box.y1 * bo->pitch, h * bo->pitch);
-	return channel->bo != NULL;
+	return 1;
 }
 
 int
@@ -927,8 +958,7 @@ sna_render_picture_extract(struct sna *sna,
 		return -1;
 	}
 
-	if (pixmap->drawable.width < sna->render.max_3d_size &&
-	    sna_render_picture_partial(sna, picture, channel,
+	if (sna_render_picture_partial(sna, picture, channel,
 				       x, y, w, h,
 				       dst_x, dst_y))
 		return 1;
@@ -1527,30 +1557,67 @@ sna_render_composite_redirect(struct sna *sna,
 		return FALSE;
 	}
 
-	if (op->dst.pixmap->drawable.width <= sna->render.max_3d_size) {
-		int y1, y2;
+	if (op->dst.bo->pitch <= sna->render.max_3d_pitch) {
+		int tile_width, tile_height, tile_size;
+		BoxRec box;
+		int w, h;
 
-		assert(op->dst.pixmap->drawable.height > sna->render.max_3d_size);
-		y1 =  y + op->dst.y;
-		y2 =  y1 + height;
-		y1 &= y1 & (64 - 1);
-		y2 = ALIGN(y2, 64);
+		kgem_get_tile_size(&sna->kgem, op->dst.bo->tiling,
+				   &tile_width, &tile_height, &tile_size);
+
+		box.x1 = x;
+		box.x2 = x + width;
+		box.y1 = y;
+		box.y2 = y + height;
+
+		/* Ensure we align to an even tile row */
+		box.y1 = box.y1 & ~(2*tile_height - 1);
+		box.y2 = ALIGN(box.y2, 2*tile_height);
+		if (box.y2 > op->dst.pixmap->drawable.height)
+			box.y2 = op->dst.pixmap->drawable.height;
+
+		box.x1 = box.x1 & ~(tile_width * 8 / op->dst.pixmap->drawable.bitsPerPixel - 1);
+		box.x2 = ALIGN(box.x2, tile_width * 8 / op->dst.pixmap->drawable.bitsPerPixel);
+		if (box.x2 > op->dst.pixmap->drawable.width)
+			box.x2 = op->dst.pixmap->drawable.width;
+
+		w = box.x2 - box.x1;
+		h = box.y2 - box.y1;
+		DBG(("%s box=(%d, %d), (%d, %d): (%d, %d)/(%d, %d)\n", __FUNCTION__,
+		     box.x1, box.y1, box.x2, box.y2, w, h,
+		     op->dst.pixmap->drawable.width,
+		     op->dst.pixmap->drawable.height));
+		if (w <= sna->render.max_3d_size &&
+		    h <= sna->render.max_3d_size) {
+			int offset;
 
-		if (y2 - y1 <= sna->render.max_3d_size) {
 			t->box.x2 = t->box.x1 = op->dst.x;
 			t->box.y2 = t->box.y1 = op->dst.y;
-			t->real_bo = priv->gpu_bo;
+			t->real_bo = op->dst.bo;
 			t->real_damage = op->damage;
 			if (op->damage) {
 				t->damage = sna_damage_create();
 				op->damage = &t->damage;
 			}
 
-			op->dst.bo = kgem_create_proxy(priv->gpu_bo,
-						       y1 * priv->gpu_bo->pitch,
-						       (y2 - y1) * priv->gpu_bo->pitch);
-			op->dst.y += -y1;
-			op->dst.height = y2 - y1;
+			/* How many tiles across are we? */
+			offset = box.x1 * op->dst.pixmap->drawable.bitsPerPixel / 8 / tile_width * tile_size;
+			op->dst.bo = kgem_create_proxy(op->dst.bo,
+						       box.y1 * op->dst.bo->pitch + offset,
+						       h * op->dst.bo->pitch);
+			if (!op->dst.bo) {
+				t->real_bo = NULL;
+				if (t->damage)
+					__sna_damage_destroy(t->damage);
+				return FALSE;
+			}
+
+			op->dst.bo->pitch = t->real_bo->pitch;
+
+			op->dst.x += -box.x1;
+			op->dst.y += -box.y1;
+			op->dst.width  = w;
+			op->dst.height = h;
 			return TRUE;
 		}
 	}
@@ -1583,7 +1650,7 @@ sna_render_composite_redirect(struct sna *sna,
 		return FALSE;
 	}
 
-	t->real_bo = priv->gpu_bo;
+	t->real_bo = op->dst.bo;
 	t->real_damage = op->damage;
 	if (op->damage) {
 		t->damage = sna_damage_create();
diff --git a/src/sna/sna_render.h b/src/sna/sna_render.h
index b23a8a7..5df53dc 100644
--- a/src/sna/sna_render.h
+++ b/src/sna/sna_render.h
@@ -186,6 +186,7 @@ struct sna_copy_op {
 
 struct sna_render {
 	int max_3d_size;
+	int max_3d_pitch;
 
 	Bool (*composite)(struct sna *sna, uint8_t op,
 			  PicturePtr dst, PicturePtr src, PicturePtr mask,
commit 65466f86263b3788b438fe021a12ade371190b01
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sat Jan 28 01:54:47 2012 +0000

    sna: Allow ridiculously large bo, up to half the total GATT
    
    Such large bo place extreme stress on the system; for example, trying to
    mmap a 1GiB bo into the CPU domain currently fails due to a kernel bug. :(
    So if you can avoid the swap thrashing during the upload, the ddx can now
    handle 16k x 16k images on the GPU on gen4+. That is fine until you want
    two such images...
    
    The real complication comes in uploading to (and downloading from) such
    large textures, as they are too large for a single operation with
    automatic detiling via either the BLT or the RENDER ring. We could do
    manual tiling/switching or, as this patch does, tile the transfer in
    chunks small enough to fit into either pipeline.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
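
The chunked transfer itself lives in the sna_io.c changes, which are not
reproduced in this excerpt. The idea, in a hedged sketch (chunk_copy() is a
stand-in for whichever BLT/RENDER path performs each sub-copy, not a real
driver function):

    /* Walk the oversized region in tiles no larger than the pipeline limit,
     * e.g. sna->render.max_3d_size, so each sub-copy stays small enough for
     * automatic detiling by the BLT or RENDER ring.
     */
    static void transfer_in_chunks(int width, int height, int max)
    {
        int x, y, w, h;

        for (y = 0; y < height; y += max) {
            h = height - y;
            if (h > max)
                h = max;

            for (x = 0; x < width; x += max) {
                w = width - x;
                if (w > max)
                    w = max;

                chunk_copy(x, y, w, h);    /* hypothetical per-chunk copy */
            }
        }
    }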

diff --git a/src/sna/blt.c b/src/sna/blt.c
index fb3dd35..65d586c 100644
--- a/src/sna/blt.c
+++ b/src/sna/blt.c
@@ -154,6 +154,8 @@ memcpy_blt(const void *src, void *dst, int bpp,
 	uint8_t *dst_bytes;
 	int byte_width;
 
+	assert(src);
+	assert(dst);
 	assert(width && height);
 	assert(bpp >= 8);
 
diff --git a/src/sna/gen6_render.c b/src/sna/gen6_render.c
index d813d95..41e05d0 100644
--- a/src/sna/gen6_render.c
+++ b/src/sna/gen6_render.c
@@ -1193,6 +1193,7 @@ gen6_bind_bo(struct sna *sna,
 			       bo, domains, 0);
 	ss[2] = ((width - 1)  << GEN6_SURFACE_WIDTH_SHIFT |
 		 (height - 1) << GEN6_SURFACE_HEIGHT_SHIFT);
+	assert(bo->pitch <= (1 << 18));
 	ss[3] = (gen6_tiling_bits(bo->tiling) |
 		 (bo->pitch - 1) << GEN6_SURFACE_PITCH_SHIFT);
 	ss[4] = 0;
@@ -3136,10 +3137,19 @@ fallback:
 		if (!sna_blt_compare_depth(&src->drawable, &dst->drawable))
 			return false;
 
-		return sna_blt_copy_boxes_fallback(sna, alu,
-						   src, src_bo, src_dx, src_dy,
-						   dst, dst_bo, dst_dx, dst_dy,
-						   box, n);
+		if (sna_blt_copy_boxes_fallback(sna, alu,
+						 src, src_bo, src_dx, src_dy,
+						 dst, dst_bo, dst_dx, dst_dy,
+						 box, n))
+			return true;
+
+		return false;
+#if 0
+		return sna_tiling_copy_boxes(sna,
+					     src, src_bo, src_dx, src_dy,
+					     dst, dst_bo, dst_dx, dst_dy,
+					     box, n);
+#endif
 	}
 
 	if (dst->drawable.depth == src->drawable.depth) {
diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 86a4372..208c8f2 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -45,7 +45,7 @@
 #endif
 
 static struct kgem_bo *
-search_linear_cache(struct kgem *kgem, unsigned int size, unsigned flags);
+search_linear_cache(struct kgem *kgem, unsigned int num_pages, unsigned flags);
 
 static inline void _list_del(struct list *list)
 {
@@ -99,7 +99,6 @@ static inline void list_replace(struct list *old,
 #define DBG(x) ErrorF x
 #endif
 
-#define PAGE_SIZE 4096
 #define PAGE_ALIGN(x) ALIGN(x, PAGE_SIZE)
 #define MAX_GTT_VMA_CACHE 512
 #define MAX_CPU_VMA_CACHE INT16_MAX
@@ -120,6 +119,14 @@ struct kgem_partial_bo {
 static struct kgem_bo *__kgem_freed_bo;
 static struct drm_i915_gem_exec_object2 _kgem_dummy_exec;
 
+static inline int bytes(struct kgem_bo *bo)
+{
+	return kgem_bo_size(bo);
+}
+
+#define bucket(B) (B)->size.pages.bucket
+#define num_pages(B) (B)->size.pages.count
+
 #ifndef NDEBUG
 static bool validate_partials(struct kgem *kgem)
 {
@@ -128,10 +135,10 @@ static bool validate_partials(struct kgem *kgem)
 	list_for_each_entry_safe(bo, next, &kgem->partial, base.list) {
 		if (bo->base.list.next == &kgem->partial)
 			return true;
-		if (bo->base.size - bo->used < next->base.size - next->used) {
+		if (bytes(&bo->base) - bo->used < bytes(&next->base) - next->used) {
 			ErrorF("this rem: %d, next rem: %d\n",
-			       bo->base.size - bo->used,
-			       next->base.size - next->used);
+			       bytes(&bo->base) - bo->used,
+			       bytes(&next->base) - next->used);
 			goto err;
 		}
 	}
@@ -140,7 +147,7 @@ static bool validate_partials(struct kgem *kgem)
 err:
 	list_for_each_entry(bo, &kgem->partial, base.list)
 		ErrorF("bo: used=%d / %d, rem=%d\n",
-		       bo->used, bo->base.size, bo->base.size - bo->used);
+		       bo->used, bytes(&bo->base), bytes(&bo->base) - bo->used);
 	return false;
 }
 #else
@@ -312,7 +319,7 @@ Bool kgem_bo_write(struct kgem *kgem, struct kgem_bo *bo,
 	assert(!bo->purged);
 	assert(!kgem_busy(kgem, bo->handle));
 
-	assert(length <= bo->size);
+	assert(length <= bytes(bo));
 	if (gem_write(kgem->fd, bo->handle, 0, length, data))
 		return FALSE;
 
@@ -322,17 +329,13 @@ Bool kgem_bo_write(struct kgem *kgem, struct kgem_bo *bo,
 	return TRUE;
 }
 
-static uint32_t gem_create(int fd, int size)
+static uint32_t gem_create(int fd, int num_pages)
 {
 	struct drm_i915_gem_create create;
 
-#if DEBUG_KGEM
-	assert((size & (PAGE_SIZE-1)) == 0);
-#endif
-
 	VG_CLEAR(create);
 	create.handle = 0;
-	create.size = size;
+	create.size = PAGE_SIZE * num_pages;
 	(void)drmIoctl(fd, DRM_IOCTL_I915_GEM_CREATE, &create);
 
 	return create.handle;
@@ -415,7 +418,7 @@ static void gem_close(int fd, uint32_t handle)
 	(void)drmIoctl(fd, DRM_IOCTL_GEM_CLOSE, &close);
 }
 
-static inline unsigned long __fls(unsigned long word)
+constant inline static unsigned long __fls(unsigned long word)
 {
 	asm("bsr %1,%0"
 	    : "=r" (word)
@@ -423,24 +426,21 @@ static inline unsigned long __fls(unsigned long word)
 	return word;
 }
 
-constant inline static int cache_bucket(int size)
+constant inline static int cache_bucket(int num_pages)
 {
-	uint32_t order = __fls(size / PAGE_SIZE);
-	assert(order < NUM_CACHE_BUCKETS);
-	return order;
+	return __fls(num_pages);
 }
 
 static struct kgem_bo *__kgem_bo_init(struct kgem_bo *bo,
-				      int handle, int size)
+				      int handle, int num_pages)
 {
-	assert(size);
+	assert(num_pages);
 	memset(bo, 0, sizeof(*bo));
 
 	bo->refcnt = 1;
 	bo->handle = handle;
-	bo->size = size;
-	bo->bucket = cache_bucket(size);
-	assert(bo->size < 1 << (12 + bo->bucket + 1));
+	num_pages(bo) = num_pages;
+	bucket(bo) = cache_bucket(num_pages);
 	bo->reusable = true;
 	bo->domain = DOMAIN_CPU;
 	list_init(&bo->request);
@@ -450,7 +450,7 @@ static struct kgem_bo *__kgem_bo_init(struct kgem_bo *bo,
 	return bo;
 }
 
-static struct kgem_bo *__kgem_bo_alloc(int handle, int size)
+static struct kgem_bo *__kgem_bo_alloc(int handle, int num_pages)
 {
 	struct kgem_bo *bo;
 
@@ -463,7 +463,7 @@ static struct kgem_bo *__kgem_bo_alloc(int handle, int size)
 			return NULL;
 	}
 
-	return __kgem_bo_init(bo, handle, size);
+	return __kgem_bo_init(bo, handle, num_pages);
 }
 
 static struct kgem_request _kgem_static_request;
@@ -481,14 +481,14 @@ static struct kgem_request *__kgem_request_alloc(void)
 	return rq;
 }
 
-static struct list *inactive(struct kgem *kgem, int size)
+static struct list *inactive(struct kgem *kgem, int num_pages)
 {
-	return &kgem->inactive[cache_bucket(size)];
+	return &kgem->inactive[cache_bucket(num_pages)];
 }
 
-static struct list *active(struct kgem *kgem, int size, int tiling)
+static struct list *active(struct kgem *kgem, int num_pages, int tiling)
 {
-	return &kgem->active[cache_bucket(size)][tiling];
+	return &kgem->active[cache_bucket(num_pages)][tiling];
 }
 
 static size_t
@@ -575,6 +575,7 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, int gen)
 	list_init(&kgem->partial);
 	list_init(&kgem->requests);
 	list_init(&kgem->flushing);
+	list_init(&kgem->large);
 	for (i = 0; i < ARRAY_SIZE(kgem->inactive); i++)
 		list_init(&kgem->inactive[i]);
 	for (i = 0; i < ARRAY_SIZE(kgem->active); i++) {
@@ -658,11 +659,9 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, int gen)
 		 * disable dual-stream mode */
 		kgem->min_alignment = 64;
 
+	kgem->max_object_size = kgem->aperture_total / 2;
 	kgem->max_cpu_size = kgem->aperture_total / 2;
-	if (kgem->max_cpu_size > MAX_OBJECT_SIZE)
-		kgem->max_cpu_size = MAX_OBJECT_SIZE;
-
-	kgem->max_gpu_size = -1;
+	kgem->max_gpu_size = MAX_CACHE_SIZE;
 	if (gen < 40) {
 		/* If we have to use fences for blitting, we have to make
 		 * sure we can fit them into the aperture.
@@ -677,6 +676,10 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, int gen)
 	DBG(("%s: max object size (tiled=%d, linear=%d)\n",
 	     __FUNCTION__, kgem->max_gpu_size, kgem->max_cpu_size));
 
+	/* Convert the aperture thresholds to pages */
+	kgem->aperture_low /= PAGE_SIZE;
+	kgem->aperture_high /= PAGE_SIZE;
+
 	kgem->fence_max = gem_param(kgem, I915_PARAM_NUM_FENCES_AVAIL) - 2;
 	if ((int)kgem->fence_max < 0)
 		kgem->fence_max = 5; /* minimum safe value for all hw */
@@ -811,7 +814,7 @@ kgem_add_handle(struct kgem *kgem, struct kgem_bo *bo)
 	exec->handle = bo->handle;
 	exec->offset = bo->presumed_offset;
 
-	kgem->aperture += bo->size;
+	kgem->aperture += num_pages(bo);
 
 	return exec;
 }
@@ -875,7 +878,7 @@ static void kgem_bo_release_map(struct kgem *kgem, struct kgem_bo *bo)
 	     bo->handle, kgem->vma[type].count));
 
 	VG(if (type) VALGRIND_FREELIKE_BLOCK(CPU_MAP(bo->map), 0));
-	munmap(CPU_MAP(bo->map), bo->size);
+	munmap(CPU_MAP(bo->map), bytes(bo));
 	bo->map = NULL;
 
 	if (!list_is_empty(&bo->vma)) {
@@ -917,16 +920,22 @@ inline static void kgem_bo_move_to_inactive(struct kgem *kgem,
 	assert(bo->rq == NULL);
 	assert(bo->domain != DOMAIN_GPU);
 
-	list_move(&bo->list, &kgem->inactive[bo->bucket]);
+	if (bucket(bo) >= NUM_CACHE_BUCKETS) {
+		kgem_bo_free(kgem, bo);
+		return;
+	}
+
+	list_move(&bo->list, &kgem->inactive[bucket(bo)]);
 	if (bo->map) {
 		int type = IS_CPU_MAP(bo->map);
-		if (!type && !kgem_bo_is_mappable(kgem, bo)) {
+		if (bucket(bo) >= NUM_CACHE_BUCKETS ||
+		    (!type && !kgem_bo_is_mappable(kgem, bo))) {
 			list_del(&bo->vma);
-			munmap(CPU_MAP(bo->map), bo->size);
+			munmap(CPU_MAP(bo->map), bytes(bo));
 			bo->map = NULL;
 		}
 		if (bo->map) {
-			list_move(&bo->vma, &kgem->vma[type].inactive[bo->bucket]);
+			list_move(&bo->vma, &kgem->vma[type].inactive[bucket(bo)]);
 			kgem->vma[type].count++;
 		}
 	}
@@ -1002,8 +1011,14 @@ static void __kgem_bo_destroy(struct kgem *kgem, struct kgem_bo *bo)
 
 	bo->scanout = bo->flush = false;
 	if (bo->rq) {
+		struct list *cache;
+
 		DBG(("%s: handle=%d -> active\n", __FUNCTION__, bo->handle));
-		list_add(&bo->list, &kgem->active[bo->bucket][bo->tiling]);
+		if (bucket(bo) < NUM_CACHE_BUCKETS)
+			cache = &kgem->active[bucket(bo)][bo->tiling];
+		else
+			cache = &kgem->large;
+		list_add(&bo->list, cache);
 		return;
 	}
 
@@ -1012,10 +1027,17 @@ static void __kgem_bo_destroy(struct kgem *kgem, struct kgem_bo *bo)
 
 	if (bo->needs_flush) {
 		if ((bo->needs_flush = kgem_busy(kgem, bo->handle))) {
+			struct list *cache;
+
 			DBG(("%s: handle=%d -> flushing\n",
 			     __FUNCTION__, bo->handle));
+
 			list_add(&bo->request, &kgem->flushing);
-			list_add(&bo->list, &kgem->active[bo->bucket][bo->tiling]);
+			if (bucket(bo) < NUM_CACHE_BUCKETS)
+				cache = &kgem->active[bucket(bo)][bo->tiling];
+			else
+				cache = &kgem->large;
+			list_add(&bo->list, cache);
 			bo->rq = &_kgem_static_request;
 			return;
 		}
@@ -1231,7 +1253,7 @@ static void kgem_close_inactive(struct kgem *kgem)
 
 static void bubble_sort_partial(struct kgem *kgem, struct kgem_partial_bo *bo)
 {
-	int remain = bo->base.size - bo->used;
+	int remain = bytes(&bo->base) - bo->used;
 
 	while (bo->base.list.prev != &kgem->partial) {
 		struct kgem_partial_bo *p;
@@ -1239,7 +1261,7 @@ static void bubble_sort_partial(struct kgem *kgem, struct kgem_partial_bo *bo)
 		p = list_entry(bo->base.list.prev,
 			       struct kgem_partial_bo,
 			       base.list);
-		if (remain <= p->base.size - p->used)
+		if (remain <= bytes(&p->base) - p->used)
 			break;
 
 		assert(p->base.list.next == &bo->base.list);
@@ -1282,7 +1304,7 @@ static void kgem_finish_partials(struct kgem *kgem)
 		assert(bo->base.rq == kgem->next_request);
 		if (bo->used && bo->need_io) {
 			if (bo->base.refcnt == 1 &&
-			    bo->used < bo->base.size / 2) {
+			    bo->used < bytes(&bo->base) / 2) {
 				struct kgem_bo *shrink;
 
 				shrink = search_linear_cache(kgem,
@@ -1293,10 +1315,10 @@ static void kgem_finish_partials(struct kgem *kgem)
 
 					DBG(("%s: used=%d, shrinking %d to %d, handle %d to %d\n",
 					     __FUNCTION__,
-					     bo->used, bo->base.size, shrink->size,
+					     bo->used, bytes(&bo->base), bytes(shrink),
 					     bo->base.handle, shrink->handle));
 
-					assert(bo->used <= shrink->size);
+					assert(bo->used <= bytes(shrink));
 					gem_write(kgem->fd, shrink->handle,
 						  0, bo->used, bo->mem);
 
@@ -1330,9 +1352,9 @@ static void kgem_finish_partials(struct kgem *kgem)
 			}
 
 			DBG(("%s: handle=%d, uploading %d/%d\n",
-			     __FUNCTION__, bo->base.handle, bo->used, bo->base.size));
+			     __FUNCTION__, bo->base.handle, bo->used, bytes(&bo->base)));
 			assert(!kgem_busy(kgem, bo->base.handle));
-			assert(bo->used <= bo->base.size);
+			assert(bo->used <= bytes(&bo->base));
 			gem_write(kgem->fd, bo->base.handle,
 				  0, bo->used, bo->mem);
 			bo->need_io = 0;
@@ -1616,7 +1638,7 @@ void _kgem_submit(struct kgem *kgem)
 					       i,
 					       kgem->exec[i].handle,
 					       (int)kgem->exec[i].offset,
-					       found ? found->size : -1,
+					       found ? bytes(found) : -1,
 					       found ? found->tiling : -1,
 					       (int)(kgem->exec[i].flags & EXEC_OBJECT_NEEDS_FENCE),
 					       found ? found->purged : -1);
@@ -1690,7 +1712,7 @@ static void kgem_expire_partial(struct kgem *kgem)
 			continue;
 
 		DBG(("%s: discarding unused partial buffer: %d/%d, write? %d\n",
-		     __FUNCTION__, bo->used, bo->base.size, bo->write));
+		     __FUNCTION__, bo->used, bytes(&bo->base), bo->write));
 		list_del(&bo->base.list);
 		kgem_bo_unref(kgem, &bo->base);
 	}
@@ -1773,7 +1795,7 @@ bool kgem_expire_cache(struct kgem *kgem)
 				list_move_tail(&bo->list, &preserve);
 			} else {
 				count++;
-				size += bo->size;
+				size += bytes(bo);
 				kgem_bo_free(kgem, bo);
 				DBG(("%s: expiring %d\n",
 				     __FUNCTION__, bo->handle));
@@ -1834,28 +1856,31 @@ void kgem_cleanup_cache(struct kgem *kgem)
 }
 
 static struct kgem_bo *
-search_linear_cache(struct kgem *kgem, unsigned int size, unsigned flags)
+search_linear_cache(struct kgem *kgem, unsigned int num_pages, unsigned flags)
 {
 	struct kgem_bo *bo, *first = NULL;
 	bool use_active = (flags & CREATE_INACTIVE) == 0;
 	struct list *cache;
 
+	if (num_pages >= MAX_CACHE_SIZE / PAGE_SIZE)
+		return NULL;
+
 	if (!use_active &&
-	    list_is_empty(inactive(kgem, size)) &&
-	    !list_is_empty(active(kgem, size, I915_TILING_NONE)) &&
+	    list_is_empty(inactive(kgem, num_pages)) &&
+	    !list_is_empty(active(kgem, num_pages, I915_TILING_NONE)) &&
 	    !kgem_retire(kgem))
 		return NULL;
 
 	if (!use_active && flags & (CREATE_CPU_MAP | CREATE_GTT_MAP)) {
 		int for_cpu = !!(flags & CREATE_CPU_MAP);
-		cache = &kgem->vma[for_cpu].inactive[cache_bucket(size)];
+		cache = &kgem->vma[for_cpu].inactive[cache_bucket(num_pages)];
 		list_for_each_entry(bo, cache, vma) {
 			assert(IS_CPU_MAP(bo->map) == for_cpu);
-			assert(bo->bucket == cache_bucket(size));
+			assert(bucket(bo) == cache_bucket(num_pages));
 
-			if (size > bo->size) {
+			if (num_pages > num_pages(bo)) {
 				DBG(("inactive too small: %d < %d\n",
-				     bo->size, size));
+				     num_pages(bo), num_pages));
 				continue;
 			}
 
@@ -1874,8 +1899,8 @@ search_linear_cache(struct kgem *kgem, unsigned int size, unsigned flags)
 			bo->tiling = I915_TILING_NONE;
 			bo->pitch = 0;
 			bo->delta = 0;
-			DBG(("  %s: found handle=%d (size=%d) in linear vma cache\n",
-			     __FUNCTION__, bo->handle, bo->size));
+			DBG(("  %s: found handle=%d (num_pages=%d) in linear vma cache\n",
+			     __FUNCTION__, bo->handle, num_pages(bo)));
 			assert(use_active || bo->domain != DOMAIN_GPU);
 			assert(!bo->needs_flush);
 			//assert(!kgem_busy(kgem, bo->handle));
@@ -1883,13 +1908,13 @@ search_linear_cache(struct kgem *kgem, unsigned int size, unsigned flags)
 		}
 	}
 
-	cache = use_active ? active(kgem, size, I915_TILING_NONE) : inactive(kgem, size);
+	cache = use_active ? active(kgem, num_pages, I915_TILING_NONE) : inactive(kgem, num_pages);
 	list_for_each_entry(bo, cache, list) {
 		assert(bo->refcnt == 0);
 		assert(bo->reusable);
 		assert(!!bo->rq == !!use_active);
 
-		if (size > bo->size)
+		if (num_pages > num_pages(bo))
 			continue;
 
 		if (use_active && bo->tiling != I915_TILING_NONE)
@@ -1946,8 +1971,8 @@ search_linear_cache(struct kgem *kgem, unsigned int size, unsigned flags)
 		assert(bo->tiling == I915_TILING_NONE);
 		bo->pitch = 0;
 		bo->delta = 0;
-		DBG(("  %s: found handle=%d (size=%d) in linear %s cache\n",
-		     __FUNCTION__, bo->handle, bo->size,
+		DBG(("  %s: found handle=%d (num_pages=%d) in linear %s cache\n",
+		     __FUNCTION__, bo->handle, num_pages(bo),
 		     use_active ? "active" : "inactive"));
 		assert(use_active || bo->domain != DOMAIN_GPU);
 		assert(!bo->needs_flush || use_active);
@@ -1965,8 +1990,8 @@ search_linear_cache(struct kgem *kgem, unsigned int size, unsigned flags)
 
 		first->pitch = 0;
 		first->delta = 0;
-		DBG(("  %s: found handle=%d (size=%d) in linear %s cache\n",
-		     __FUNCTION__, first->handle, first->size,
+		DBG(("  %s: found handle=%d (num_pages=%d) in linear %s cache\n",
+		     __FUNCTION__, first->handle, num_pages(first),
 		     use_active ? "active" : "inactive"));
 		assert(use_active || first->domain != DOMAIN_GPU);
 		assert(!first->needs_flush || use_active);
@@ -1990,7 +2015,7 @@ struct kgem_bo *kgem_create_for_name(struct kgem *kgem, uint32_t name)
 		return NULL;
 
 	DBG(("%s: new handle=%d\n", __FUNCTION__, open_arg.handle));
-	bo = __kgem_bo_alloc(open_arg.handle, open_arg.size);
+	bo = __kgem_bo_alloc(open_arg.handle, open_arg.size / PAGE_SIZE);
 	if (bo == NULL) {
 		gem_close(kgem->fd, open_arg.handle);
 		return NULL;
@@ -2007,7 +2032,7 @@ struct kgem_bo *kgem_create_linear(struct kgem *kgem, int size)
 
 	DBG(("%s(%d)\n", __FUNCTION__, size));
 
-	size = PAGE_ALIGN(size);
+	size = (size + PAGE_SIZE - 1) / PAGE_SIZE;
 	bo = search_linear_cache(kgem, size, CREATE_INACTIVE);
 	if (bo)
 		return kgem_bo_reference(bo);
@@ -2019,7 +2044,7 @@ struct kgem_bo *kgem_create_linear(struct kgem *kgem, int size)
 	DBG(("%s: new handle=%d\n", __FUNCTION__, handle));
 	bo = __kgem_bo_alloc(handle, size);
 	if (bo == NULL) {
-		gem_close(kgem->fd, size);
+		gem_close(kgem->fd, handle);
 		return NULL;
 	}
 
@@ -2028,8 +2053,6 @@ struct kgem_bo *kgem_create_linear(struct kgem *kgem, int size)
 
 int kgem_choose_tiling(struct kgem *kgem, int tiling, int width, int height, int bpp)
 {
-	uint32_t pitch;
-
 	if (DBG_NO_TILING)
 		return tiling < 0 ? tiling : I915_TILING_NONE;
 
@@ -2058,17 +2081,6 @@ int kgem_choose_tiling(struct kgem *kgem, int tiling, int width, int height, int
 		}
 	}
 
-	/* First check that we can fence the whole object */
-	if (tiling &&
-	    kgem_surface_size(kgem, false, false,
-			      width, height, bpp, tiling,
-			      &pitch) > kgem->max_gpu_size) {
-		DBG(("%s: too large (%dx%d) to be fenced, discarding tiling\n",
-		     __FUNCTION__, width, height));
-		tiling = I915_TILING_NONE;
-		goto done;
-	}
-
 	if (tiling < 0)
 		return tiling;
 
@@ -2125,18 +2137,42 @@ done:
 	return tiling;
 }
 
-bool kgem_can_create_cpu(struct kgem *kgem,
+bool kgem_can_create_2d(struct kgem *kgem,
 			 int width, int height, int depth)
 {
+	int bpp = BitsPerPixel(depth);
 	uint32_t pitch, size;
 
 	if (depth < 8 || kgem->wedged)
 		return false;
 
 	size = kgem_surface_size(kgem, false, false,
-				 width, height, BitsPerPixel(depth),
+				 width, height, bpp,
+				 I915_TILING_X, &pitch);
+	if (size > 0 && size <= kgem->max_object_size)
+		return true;
+
+	size = kgem_surface_size(kgem, false, false,
+				 width, height, bpp,
 				 I915_TILING_NONE, &pitch);
-	return size > 0 && size < kgem->max_cpu_size;
+	if (size > 0 && size <= kgem->max_object_size)
+		return true;
+
+	return false;
+}
+
+bool kgem_can_create_cpu(struct kgem *kgem,
+			 int width, int height, int bpp)
+{
+	uint32_t pitch, size;
+
+	if (bpp < 8 || kgem->wedged)
+		return false;
+
+	size = kgem_surface_size(kgem, false, false,
+				 width, height, bpp,
+				 I915_TILING_NONE, &pitch);
+	return size > 0 && size <= kgem->max_cpu_size;
 }
 
 static bool _kgem_can_create_gpu(struct kgem *kgem,
@@ -2179,7 +2215,7 @@ inline int kgem_bo_fenced_size(struct kgem *kgem, struct kgem_bo *bo)
 		size = 512 * 1024;
 	else
 		size = 1024 * 1024;
-	while (size < bo->size)
+	while (size < bytes(bo))
 		size *= 2;
 
 	return size;
@@ -2213,10 +2249,52 @@ struct kgem_bo *kgem_create_2d(struct kgem *kgem,
 				 kgem->has_relaxed_fencing,
 				 flags & CREATE_SCANOUT,
 				 width, height, bpp, tiling, &pitch);
-	assert(size && size < kgem->max_cpu_size);
-	assert(tiling == I915_TILING_NONE || size < kgem->max_gpu_size);
+	assert(size && size <= kgem->max_object_size);
+	size /= PAGE_SIZE;
 	bucket = cache_bucket(size);
 
+	if (bucket >= NUM_CACHE_BUCKETS) {
+		DBG(("%s: large bo num pages=%d, bucket=%d\n",
+		     __FUNCTION__, size, bucket));
+
+		if (flags & CREATE_INACTIVE)
+			goto create;
+
+		tiled_height = kgem_aligned_height(kgem, height, I915_TILING_Y);
+		untiled_pitch = kgem_untiled_pitch(kgem,
+						   width, bpp,
+						   flags & CREATE_SCANOUT);
+
+		list_for_each_entry(bo, &kgem->large, list) {
+			assert(!bo->purged);
+			assert(bo->refcnt == 0);
+			assert(bo->reusable);
+
+			if (bo->tiling) {
+				if (bo->pitch < pitch) {
+					DBG(("tiled and pitch too small: tiling=%d, (want %d), pitch=%d, need %d\n",
+					     bo->tiling, tiling,
+					     bo->pitch, pitch));
+					continue;
+				}
+			} else
+				bo->pitch = untiled_pitch;
+
+			if (bo->pitch * tiled_height > bytes(bo))
+				continue;
+
+			kgem_bo_remove_from_active(kgem, bo);
+
+			bo->unique_id = kgem_get_unique_id(kgem);
+			bo->delta = 0;
+			DBG(("  1:from active: pitch=%d, tiling=%d, handle=%d, id=%d\n",
+			     bo->pitch, bo->tiling, bo->handle, bo->unique_id));
+			return kgem_bo_reference(bo);
+		}
+
+		goto create;
+	}
+
 	if (flags & (CREATE_CPU_MAP | CREATE_GTT_MAP)) {
 		int for_cpu = !!(flags & CREATE_CPU_MAP);
 		if (kgem->has_llc && tiling == I915_TILING_NONE)
@@ -2227,16 +2305,16 @@ struct kgem_bo *kgem_create_2d(struct kgem *kgem,
 		cache = &kgem->vma[for_cpu].inactive[bucket];
 		do {
 			list_for_each_entry(bo, cache, vma) {
-				assert(bo->bucket == bucket);
+				assert(bucket(bo) == bucket);
 				assert(bo->refcnt == 0);
 				assert(bo->map);
 				assert(IS_CPU_MAP(bo->map) == for_cpu);
 				assert(bo->rq == NULL);
 				assert(list_is_empty(&bo->request));
 
-				if (size > bo->size) {
+				if (size > num_pages(bo)) {
 					DBG(("inactive too small: %d < %d\n",
-					     bo->size, size));
+					     num_pages(bo), size));
 					continue;
 				}
 
@@ -2275,13 +2353,14 @@ struct kgem_bo *kgem_create_2d(struct kgem *kgem,
 	if (retry > 3)
 		retry = 3;
 search_again:
+	assert(bucket < NUM_CACHE_BUCKETS);
 	cache = &kgem->active[bucket][tiling];
 	if (tiling) {
 		tiled_height = kgem_aligned_height(kgem, height, tiling);
 		list_for_each_entry(bo, cache, list) {
 			assert(!bo->purged);
 			assert(bo->refcnt == 0);
-			assert(bo->bucket == bucket);
+			assert(bucket(bo) == bucket);
 			assert(bo->reusable);
 			assert(bo->tiling == tiling);
 
@@ -2292,7 +2371,7 @@ search_again:
 				continue;
 			}
 
-			if (bo->pitch * tiled_height > bo->size)
+			if (bo->pitch * tiled_height > bytes(bo))
 				continue;
 
 			kgem_bo_remove_from_active(kgem, bo);
@@ -2305,13 +2384,13 @@ search_again:
 		}
 	} else {
 		list_for_each_entry(bo, cache, list) {
-			assert(bo->bucket == bucket);
+			assert(bucket(bo) == bucket);
 			assert(!bo->purged);
 			assert(bo->refcnt == 0);
 			assert(bo->reusable);
 			assert(bo->tiling == tiling);
 
-			if (bo->size < size)
+			if (num_pages(bo) < size)
 				continue;
 
 			kgem_bo_remove_from_active(kgem, bo);
@@ -2340,7 +2419,7 @@ search_again:
 							 kgem->has_relaxed_fencing,
 							 flags & CREATE_SCANOUT,
 							 width, height, bpp, tiling, &pitch);
-			cache = active(kgem, tiled_height, i);
+			cache = active(kgem, tiled_height / PAGE_SIZE, i);
 			tiled_height = kgem_aligned_height(kgem, height, i);
 			list_for_each_entry(bo, cache, list) {
 				assert(!bo->purged);
@@ -2357,7 +2436,7 @@ search_again:
 				} else
 					bo->pitch = untiled_pitch;
 
-				if (bo->pitch * tiled_height > bo->size)
+				if (bo->pitch * tiled_height > bytes(bo))
 					continue;
 
 				kgem_bo_remove_from_active(kgem, bo);
@@ -2378,13 +2457,14 @@ skip_active_search:
 		retry = 3;
 search_inactive:
 	/* Now just look for a close match and prefer any currently active */
+	assert(bucket < NUM_CACHE_BUCKETS);
 	cache = &kgem->inactive[bucket];
 	list_for_each_entry_safe(bo, next, cache, list) {
-		assert(bo->bucket == bucket);
+		assert(bucket(bo) == bucket);
 
-		if (size > bo->size) {
+		if (size > num_pages(bo)) {
 			DBG(("inactive too small: %d < %d\n",
-			     bo->size, size));
+			     num_pages(bo), size));
 			continue;
 		}
 
@@ -2439,6 +2519,7 @@ search_inactive:
 		goto search_inactive;
 	}
 
+create:
 	handle = gem_create(kgem->fd, size);
 	if (handle == 0)
 		return NULL;
@@ -2455,7 +2536,7 @@ search_inactive:
 	if (tiling != I915_TILING_NONE)
 		bo->tiling = gem_set_tiling(kgem->fd, handle, tiling, pitch);
 
-	assert(bo->size >= bo->pitch * kgem_aligned_height(kgem, height, bo->tiling));
+	assert(bytes(bo) >= bo->pitch * kgem_aligned_height(kgem, height, bo->tiling));
 
 	DBG(("  new pitch=%d, tiling=%d, handle=%d, id=%d\n",
 	     bo->pitch, bo->tiling, bo->handle, bo->unique_id));
@@ -2470,9 +2551,9 @@ static void _kgem_bo_delete_partial(struct kgem *kgem, struct kgem_bo *bo)
 		return;
 
 	DBG(("%s: size=%d, offset=%d, parent used=%d\n",
-	     __FUNCTION__, bo->size, bo->delta, io->used));
+	     __FUNCTION__, bo->size.bytes, bo->delta, io->used));
 
-	if (bo->delta + bo->size == io->used) {
+	if (bo->delta + bo->size.bytes == io->used) {
 		io->used = bo->delta;
 		bubble_sort_partial(kgem, io);
 	}
@@ -2508,25 +2589,30 @@ bool kgem_check_bo(struct kgem *kgem, ...)
 	va_list ap;
 	struct kgem_bo *bo;
 	int num_exec = 0;
-	int size = 0;
+	int num_pages = 0;
 
 	va_start(ap, kgem);
 	while ((bo = va_arg(ap, struct kgem_bo *))) {
 		if (bo->exec)
 			continue;
 
-		size += bo->size;
+		if (bo->proxy) {
+			bo = bo->proxy;
+			if (bo->exec)
+				continue;
+		}
+		num_pages += num_pages(bo);
 		num_exec++;
 	}
 	va_end(ap);
 
-	if (!size)
+	if (!num_pages)
 		return true;
 
 	if (kgem->aperture > kgem->aperture_low)
 		return false;
 
-	if (size + kgem->aperture > kgem->aperture_high)
+	if (num_pages + kgem->aperture > kgem->aperture_high)
 		return false;
 
 	if (kgem->nexec + num_exec >= KGEM_EXEC_SIZE(kgem))
@@ -2541,11 +2627,13 @@ bool kgem_check_bo_fenced(struct kgem *kgem, ...)
 	struct kgem_bo *bo;
 	int num_fence = 0;
 	int num_exec = 0;
-	int size = 0;
+	int num_pages = 0;
 	int fenced_size = 0;
 
 	va_start(ap, kgem);
 	while ((bo = va_arg(ap, struct kgem_bo *))) {
+		if (bo->proxy)
+			bo = bo->proxy;
 		if (bo->exec) {
 			if (kgem->gen >= 40 || bo->tiling == I915_TILING_NONE)
 				continue;
@@ -2558,7 +2646,7 @@ bool kgem_check_bo_fenced(struct kgem *kgem, ...)
 			continue;
 		}
 
-		size += bo->size;
+		num_pages += num_pages(bo);
 		num_exec++;
 		if (kgem->gen < 40 && bo->tiling) {
 			fenced_size += kgem_bo_fenced_size(kgem, bo);
@@ -2573,13 +2661,13 @@ bool kgem_check_bo_fenced(struct kgem *kgem, ...)
 	if (kgem->nfence + num_fence > kgem->fence_max)
 		return false;
 
-	if (!size)
+	if (!num_pages)
 		return true;
 
 	if (kgem->aperture > kgem->aperture_low)
 		return false;
 
-	if (size + kgem->aperture > kgem->aperture_high)
+	if (num_pages + kgem->aperture > kgem->aperture_high)
 		return false;
 
 	if (kgem->nexec + num_exec >= KGEM_EXEC_SIZE(kgem))
@@ -2698,7 +2786,7 @@ static void kgem_trim_vma_cache(struct kgem *kgem, int type, int bucket)
 		assert(bo->rq == NULL);
 
 		VG(if (type) VALGRIND_FREELIKE_BLOCK(CPU_MAP(bo->map), 0));
-		munmap(CPU_MAP(bo->map), bo->size);
+		munmap(CPU_MAP(bo->map), bytes(bo));
 		bo->map = NULL;
 		list_del(&bo->vma);
 		kgem->vma[type].count--;
@@ -2736,11 +2824,11 @@ void *kgem_bo_map(struct kgem *kgem, struct kgem_bo *bo)
 
 	ptr = bo->map;
 	if (ptr == NULL) {
-		assert(bo->size <= kgem->aperture_mappable / 4);
+		assert(bytes(bo) <= kgem->aperture_mappable / 4);
 
-		kgem_trim_vma_cache(kgem, MAP_GTT, bo->bucket);
+		kgem_trim_vma_cache(kgem, MAP_GTT, bucket(bo));
 
-		ptr = gem_mmap(kgem->fd, bo->handle, bo->size,
+		ptr = gem_mmap(kgem->fd, bo->handle, bytes(bo),
 			       PROT_READ | PROT_WRITE);
 		if (ptr == NULL)
 			return NULL;
@@ -2780,8 +2868,8 @@ void *kgem_bo_map__debug(struct kgem *kgem, struct kgem_bo *bo)
 	if (bo->map)
 		return bo->map;
 
-	kgem_trim_vma_cache(kgem, MAP_GTT, bo->bucket);
-	return bo->map = gem_mmap(kgem->fd, bo->handle, bo->size,
+	kgem_trim_vma_cache(kgem, MAP_GTT, bucket(bo));
+	return bo->map = gem_mmap(kgem->fd, bo->handle, bytes(bo),
 				  PROT_READ | PROT_WRITE);
 }
 
@@ -2789,7 +2877,7 @@ void *kgem_bo_map__cpu(struct kgem *kgem, struct kgem_bo *bo)
 {
 	struct drm_i915_gem_mmap mmap_arg;
 
-	DBG(("%s(handle=%d, size=%d)\n", __FUNCTION__, bo->handle, bo->size));
+	DBG(("%s(handle=%d, size=%d)\n", __FUNCTION__, bo->handle, bytes(bo)));
 	assert(!bo->purged);
 	assert(list_is_empty(&bo->list));
 
@@ -2799,18 +2887,19 @@ void *kgem_bo_map__cpu(struct kgem *kgem, struct kgem_bo *bo)
 	if (bo->map)
 		kgem_bo_release_map(kgem, bo);
 
-	kgem_trim_vma_cache(kgem, MAP_CPU, bo->bucket);
+	kgem_trim_vma_cache(kgem, MAP_CPU, bucket(bo));
 
 	VG_CLEAR(mmap_arg);
 	mmap_arg.handle = bo->handle;
 	mmap_arg.offset = 0;
-	mmap_arg.size = bo->size;
+	mmap_arg.size = bytes(bo);
 	if (drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_MMAP, &mmap_arg)) {
-		assert(0);
+		ErrorF("%s: failed to mmap %d, %d bytes, into CPU domain\n",
+		       __FUNCTION__, bo->handle, bytes(bo));
 		return NULL;
 	}
 
-	VG(VALGRIND_MALLOCLIKE_BLOCK(mmap_arg.addr_ptr, bo->size, 0, 1));
+	VG(VALGRIND_MALLOCLIKE_BLOCK(mmap_arg.addr_ptr, bytes(bo), 0, 1));
 
 	DBG(("%s: caching CPU vma for %d\n", __FUNCTION__, bo->handle));
 	bo->map = MAKE_CPU_MAP(mmap_arg.addr_ptr);
@@ -2876,6 +2965,9 @@ struct kgem_bo *kgem_create_map(struct kgem *kgem,
 	if (!kgem->has_vmap)
 		return NULL;
 
+	if (size >= MAX_CACHE_SIZE)
+		return NULL;
+
 	handle = gem_vmap(kgem->fd, ptr, size, read_only);
 	if (handle == 0)
 		return NULL;
@@ -2972,6 +3064,7 @@ struct kgem_bo *kgem_create_proxy(struct kgem_bo *target,
 	if (bo == NULL)
 		return NULL;
 
+	bo->size.bytes = length;
 	bo->io = target->io;
 	bo->dirty = target->dirty;
 	bo->tiling = target->tiling;
@@ -2982,11 +3075,11 @@ struct kgem_bo *kgem_create_proxy(struct kgem_bo *target,
 	return bo;
 }
 
-static struct kgem_partial_bo *partial_bo_alloc(int size)
+static struct kgem_partial_bo *partial_bo_alloc(int num_pages)
 {
 	struct kgem_partial_bo *bo;
 
-	bo = malloc(sizeof(*bo) + 128 + size);
+	bo = malloc(sizeof(*bo) + 128 + num_pages * PAGE_SIZE);
 	if (bo) {
 		bo->mem = (void *)ALIGN((uintptr_t)bo + sizeof(*bo), 64);
 		bo->mmapped = false;
@@ -3010,20 +3103,20 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 	     !!(flags & KGEM_BUFFER_LAST)));
 	assert(size);
 	/* we should never be asked to create anything TOO large */
-	assert(size < kgem->max_cpu_size);
+	assert(size <= kgem->max_cpu_size);
 
 	list_for_each_entry(bo, &kgem->partial, base.list) {
 		if (flags == KGEM_BUFFER_LAST && bo->write) {
 			/* We can reuse any write buffer which we can fit */
-			if (size <= bo->base.size) {
+			if (size <= bytes(&bo->base)) {
 				if (bo->base.refcnt == 1 && bo->base.exec) {
 					DBG(("%s: reusing write buffer for read of %d bytes? used=%d, total=%d\n",
-					     __FUNCTION__, size, bo->used, bo->base.size));
+					     __FUNCTION__, size, bo->used, bytes(&bo->base)));
 					offset = 0;
 					goto done;
-				} else if (bo->used + size <= bo->base.size) {
+				} else if (bo->used + size <= bytes(&bo->base)) {
 					DBG(("%s: reusing unfinished write buffer for read of %d bytes? used=%d, total=%d\n",
-					     __FUNCTION__, size, bo->used, bo->base.size));
+					     __FUNCTION__, size, bo->used, bytes(&bo->base)));
 					offset = bo->used;
 					goto done;
 				}
@@ -3037,24 +3130,25 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 			continue;
 		}
 
-		if (bo->used + size <= bo->base.size) {
+		if (bo->used + size <= bytes(&bo->base)) {
 			DBG(("%s: reusing partial buffer? used=%d + size=%d, total=%d\n",
-			     __FUNCTION__, bo->used, size, bo->base.size));
+			     __FUNCTION__, bo->used, size, bytes(&bo->base)));
 			offset = bo->used;
 			bo->used += size;
 			goto done;
 		}
 
 		DBG(("%s: too small (%d < %d)\n",
-		     __FUNCTION__, bo->base.size - bo->used, size));
+		     __FUNCTION__, bytes(&bo->base) - bo->used, size));
 		break;
 	}
 
 #if !DBG_NO_MAP_UPLOAD
 	/* Be a little more generous and hope to hold fewer mmappings */
 	alloc = ALIGN(2*size, kgem->partial_buffer_size);
-	if (alloc >= kgem->max_cpu_size)
+	if (alloc > kgem->max_gpu_size)
 		alloc = PAGE_ALIGN(size);
+	alloc /= PAGE_SIZE;
 	if (kgem->has_cpu_bo) {
 		bo = malloc(sizeof(*bo));
 		if (bo == NULL)
@@ -3098,7 +3192,7 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 			bo->base.io = true;
 			bo->mmapped = true;
 
-			alloc = bo->base.size;
+			alloc = num_pages(&bo->base);
 			goto init;
 		} else {
 			bo->base.refcnt = 0; /* for valgrind */
@@ -3107,7 +3201,7 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 		}
 	}
 
-	if (alloc > kgem->aperture_mappable / 4)
+	if (PAGE_SIZE * alloc > kgem->aperture_mappable / 4)
 		flags &= ~KGEM_BUFFER_INPLACE;
 
 	if ((flags & KGEM_BUFFER_WRITE_INPLACE) == KGEM_BUFFER_WRITE_INPLACE) {
@@ -3164,7 +3258,7 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 				bo->mmapped = true;
 				bo->base.refcnt = 1;
 
-				alloc = bo->base.size;
+				alloc = num_pages(&bo->base);
 				goto init;
 			} else {
 				kgem_bo_free(kgem, &bo->base);
@@ -3173,11 +3267,11 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 		}
 	}
 #else
-	alloc = ALIGN(size, 64*1024);
+	alloc = ALIGN(size, 64*1024) / PAGE_SIZE;
 #endif
 	/* Be more parsimonious with pwrite/pread buffers */
 	if ((flags & KGEM_BUFFER_INPLACE) == 0)
-		alloc = PAGE_ALIGN(size);
+		alloc = PAGE_ALIGN(size) / PAGE_SIZE;
 	flags &= ~KGEM_BUFFER_INPLACE;
 
 	old = NULL;
@@ -3188,7 +3282,7 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 	if (old) {
 		DBG(("%s: reusing ordinary handle %d for io\n",
 		     __FUNCTION__, old->handle));
-		alloc = old->size;
+		alloc = num_pages(old);
 		bo = partial_bo_alloc(alloc);
 		if (bo == NULL)
 			return NULL;
@@ -3240,21 +3334,40 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 		}
 
 		bo->mem = kgem_bo_map__cpu(kgem, &bo->base);
-		if (bo->mem == NULL) {
-			kgem_bo_free(kgem, &bo->base);
+		if (bo->mem != NULL) {
+			if (flags & KGEM_BUFFER_WRITE)
+				kgem_bo_sync__cpu(kgem, &bo->base);
+
+			bo->need_io = false;
+			bo->base.io = true;
+			bo->mmapped = true;
+			goto init;
+		}
+
+		DBG(("%s: falling back to new pwrite buffer\n", __FUNCTION__));
+		old = &bo->base;
+		bo = partial_bo_alloc(alloc);
+		if (bo == NULL) {
+			free(old);
 			return NULL;
 		}
 
-		if (flags & KGEM_BUFFER_WRITE)
-			kgem_bo_sync__cpu(kgem, &bo->base);
+		memcpy(&bo->base, old, sizeof(*old));
+		free(old);
+
+		assert(bo->mem);
+		assert(!bo->mmapped);
 
-		bo->need_io = false;
+		list_init(&bo->base.request);
+		list_init(&bo->base.vma);
+		list_init(&bo->base.list);
+		bo->base.refcnt = 1;
+		bo->need_io = flags & KGEM_BUFFER_WRITE;
 		bo->base.io = true;
-		bo->mmapped = true;
 	}
 init:
 	bo->base.reusable = false;
-	assert(bo->base.size == alloc);
+	assert(num_pages(&bo->base) == alloc);
 	assert(!bo->need_io || !bo->base.needs_flush);
 	assert(!bo->need_io || bo->base.domain != DOMAIN_GPU);
 
@@ -3263,12 +3376,12 @@ init:
 	offset = 0;
 
 	list_add(&bo->base.list, &kgem->partial);
-	DBG(("%s(size=%d) new handle=%d\n",
+	DBG(("%s(pages=%d) new handle=%d\n",
 	     __FUNCTION__, alloc, bo->base.handle));
 
 done:
 	/* adjust the position within the list to maintain decreasing order */
-	alloc = bo->base.size - bo->used;
+	alloc = bytes(&bo->base) - bo->used;
 	{
 		struct kgem_partial_bo *p, *first;
 
@@ -3276,9 +3389,9 @@ done:
 					     struct kgem_partial_bo,
 					     base.list);
 		while (&p->base.list != &kgem->partial &&
-		       alloc < p->base.size - p->used) {
+		       alloc < bytes(&p->base) - p->used) {
 			DBG(("%s: this=%d, right=%d\n",
-			     __FUNCTION__, alloc, p->base.size -p->used));
+			     __FUNCTION__, alloc, bytes(&p->base) - p->used));
 			p = list_first_entry(&p->base.list,
 					     struct kgem_partial_bo,
 					     base.list);
@@ -3287,6 +3400,7 @@ done:
 			list_move_tail(&bo->base.list, &p->base.list);
 		assert(validate_partials(kgem));
 	}
+	assert(bo->mem);
 	*ret = (char *)bo->mem + offset;
 	return kgem_create_proxy(&bo->base, offset, size);
 }
@@ -3300,6 +3414,7 @@ struct kgem_bo *kgem_create_buffer_2d(struct kgem *kgem,
 	int stride;
 
 	assert(width > 0 && height > 0);
+	assert(ret != NULL);
 	stride = ALIGN(width, 2) * bpp >> 3;
 	stride = ALIGN(stride, kgem->min_alignment);
 
@@ -3307,8 +3422,12 @@ struct kgem_bo *kgem_create_buffer_2d(struct kgem *kgem,
 	     __FUNCTION__, width, height, bpp, stride));
 
 	bo = kgem_create_buffer(kgem, stride * ALIGN(height, 2), flags, ret);
-	if (bo == NULL)
+	if (bo == NULL) {
+		DBG(("%s: allocation failure for upload buffer\n",
+		     __FUNCTION__));
 		return NULL;
+	}
+	assert(*ret != NULL);
 
 	if (height & 1) {
 		struct kgem_partial_bo *io = (struct kgem_partial_bo *)bo->proxy;
@@ -3319,7 +3438,7 @@ struct kgem_bo *kgem_create_buffer_2d(struct kgem *kgem,
 		 */
 		if (io->used)
 			io->used -= stride;
-		bo->size -= stride;
+		bo->size.bytes -= stride;
 		bubble_sort_partial(kgem, io);
 	}
 
@@ -3357,8 +3476,9 @@ struct kgem_bo *kgem_upload_source_image(struct kgem *kgem,
 void kgem_buffer_read_sync(struct kgem *kgem, struct kgem_bo *_bo)
 {
 	struct kgem_partial_bo *bo;
-	uint32_t offset = _bo->delta, length = _bo->size;
+	uint32_t offset = _bo->delta, length = _bo->size.bytes;
 
+	assert(_bo->io);
 	assert(_bo->exec == NULL);
 	if (_bo->proxy)
 		_bo = _bo->proxy;
@@ -3461,7 +3581,7 @@ kgem_replace_bo(struct kgem *kgem,
 	assert(src->tiling == I915_TILING_NONE);
 
 	size = height * pitch;
-	size = PAGE_ALIGN(size);
+	size = PAGE_ALIGN(size) / PAGE_SIZE;
 
 	dst = search_linear_cache(kgem, size, 0);
 	if (dst == NULL)
diff --git a/src/sna/kgem.h b/src/sna/kgem.h
index 0dc67da..2631e81 100644
--- a/src/sna/kgem.h
+++ b/src/sna/kgem.h
@@ -66,10 +66,16 @@ struct kgem_bo {
 	uint32_t handle;
 	uint32_t presumed_offset;
 	uint32_t delta;
-	uint32_t size:28;
-	uint32_t bucket:4;
-#define MAX_OBJECT_SIZE (1 << 28)
-
+	union {
+		struct {
+			uint32_t count:27;
+#define PAGE_SIZE 4096
+			uint32_t bucket:5;
+#define NUM_CACHE_BUCKETS 16
+#define MAX_CACHE_SIZE (1 << (NUM_CACHE_BUCKETS+12))
+		} pages;
+		uint32_t bytes;
+	} size;
 	uint32_t pitch : 18; /* max 128k */
 	uint32_t tiling : 2;
 	uint32_t reusable : 1;
@@ -100,8 +106,6 @@ enum {
 	NUM_MAP_TYPES,
 };
 
-#define NUM_CACHE_BUCKETS 16
-
 struct kgem {
 	int fd;
 	int wedged;
@@ -117,7 +121,10 @@ struct kgem {
 		KGEM_BLT,
 	} mode, ring;
 
-	struct list flushing, active[NUM_CACHE_BUCKETS][3], inactive[NUM_CACHE_BUCKETS];
+	struct list flushing;
+	struct list large;
+	struct list active[NUM_CACHE_BUCKETS][3];
+	struct list inactive[NUM_CACHE_BUCKETS];
 	struct list partial;
 	struct list requests;
 	struct kgem_request *next_request;
@@ -154,7 +161,7 @@ struct kgem {
 	uint32_t aperture_total, aperture_high, aperture_low, aperture_mappable;
 	uint32_t aperture, aperture_fenced;
 	uint32_t min_alignment;
-	uint32_t max_gpu_size, max_cpu_size;
+	uint32_t max_gpu_size, max_cpu_size, max_object_size;
 	uint32_t partial_buffer_size;
 
 	void (*context_switch)(struct kgem *kgem, int new_mode);
@@ -194,8 +201,9 @@ struct kgem_bo *kgem_upload_source_image(struct kgem *kgem,
 
 int kgem_choose_tiling(struct kgem *kgem,
 		       int tiling, int width, int height, int bpp);
+bool kgem_can_create_2d(struct kgem *kgem, int width, int height, int depth);
 bool kgem_can_create_gpu(struct kgem *kgem, int width, int height, int bpp);
-bool kgem_can_create_cpu(struct kgem *kgem, int width, int height, int depth);
+bool kgem_can_create_cpu(struct kgem *kgem, int width, int height, int bpp);
 
 struct kgem_bo *
 kgem_replace_bo(struct kgem *kgem,
@@ -354,11 +362,46 @@ Bool kgem_bo_write(struct kgem *kgem, struct kgem_bo *bo,
 
 int kgem_bo_fenced_size(struct kgem *kgem, struct kgem_bo *bo);
 
+static inline int kgem_bo_size(struct kgem_bo *bo)
+{
+	assert(!(bo->proxy && bo->io));
+	return PAGE_SIZE * bo->size.pages.count;
+}
+
+static inline int kgem_buffer_size(struct kgem_bo *bo)
+{
+	assert(bo->proxy && bo->io);
+	return bo->size.bytes;
+}
+
+static inline bool kgem_bo_can_blt(struct kgem *kgem,
+				   struct kgem_bo *bo)
+{
+	int pitch;
+
+	if (bo->tiling == I915_TILING_Y) {
+		DBG(("%s: can not blt to handle=%d, tiling=Y\n",
+		     __FUNCTION__, bo->handle));
+		return false;
+	}
+
+	pitch = bo->pitch;
+	if (kgem->gen >= 40 && bo->tiling)
+		pitch /= 4;
+	if (pitch > MAXSHORT) {
+		DBG(("%s: can not blt to handle=%d, adjusted pitch=%d\n",
+		     __FUNCTION__, bo->handle, pitch));
+		return false;
+	}
+
+	return true;
+}
+
 static inline bool kgem_bo_is_mappable(struct kgem *kgem,
 				       struct kgem_bo *bo)
 {
 	DBG_HDR(("%s: domain=%d, offset: %d size: %d\n",
-		 __FUNCTION__, bo->domain, bo->presumed_offset, bo->size));
+		 __FUNCTION__, bo->domain, bo->presumed_offset, kgem_bo_size(bo)));
 
 	if (bo->domain == DOMAIN_GTT)
 		return true;
@@ -371,9 +414,9 @@ static inline bool kgem_bo_is_mappable(struct kgem *kgem,
 		return false;
 
 	if (!bo->presumed_offset)
-		return bo->size <= kgem->aperture_mappable / 4;
+		return kgem_bo_size(bo) <= kgem->aperture_mappable / 4;
 
-	return bo->presumed_offset + bo->size <= kgem->aperture_mappable;
+	return bo->presumed_offset + kgem_bo_size(bo) <= kgem->aperture_mappable;
 }
 
 static inline bool kgem_bo_mapped(struct kgem_bo *bo)
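
A minimal usage sketch, not part of the patch, showing how the two new size
accessors split the work under the size union: ordinary bos now track their
size as a whole number of pages, while proxy/io upload buffers keep an exact
byte length, matching the asserts above. The helper name is hypothetical.

static inline int bo_size_any(struct kgem_bo *bo)
{
	/* proxy/io upload buffers record an exact byte length */
	if (bo->proxy && bo->io)
		return kgem_buffer_size(bo);

	/* everything else is tracked in whole pages */
	return kgem_bo_size(bo);	/* pages.count * PAGE_SIZE */
}
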
diff --git a/src/sna/kgem_debug_gen5.c b/src/sna/kgem_debug_gen5.c
index f21220f..9e7360a 100644
--- a/src/sna/kgem_debug_gen5.c
+++ b/src/sna/kgem_debug_gen5.c
@@ -79,7 +79,7 @@ static void gen5_update_vertex_buffer(struct kgem *kgem, const uint32_t *data)
 	} else {
 		bo = kgem_debug_get_bo_for_reloc_entry(kgem, reloc);
 		base = kgem_bo_map__debug(kgem, bo);
-		size = bo->size;
+		size = kgem_bo_size(bo);
 	}
 	ptr = (char *)base + reloc->delta;
 
diff --git a/src/sna/sna.h b/src/sna/sna.h
index 5910daf..d9ba773 100644
--- a/src/sna/sna.h
+++ b/src/sna/sna.h
@@ -92,7 +92,6 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #define DEBUG_NO_RENDER 0
 #define DEBUG_NO_BLT 0
-#define DEBUG_NO_IO 0
 
 #define DEBUG_FLUSH_BATCH 0
 #define DEBUG_FLUSH_SYNC 0
@@ -647,7 +646,7 @@ void sna_read_boxes(struct sna *sna,
 		    struct kgem_bo *src_bo, int16_t src_dx, int16_t src_dy,
 		    PixmapPtr dst, int16_t dst_dx, int16_t dst_dy,
 		    const BoxRec *box, int n);
-void sna_write_boxes(struct sna *sna, PixmapPtr dst,
+bool sna_write_boxes(struct sna *sna, PixmapPtr dst,
 		     struct kgem_bo *dst_bo, int16_t dst_dx, int16_t dst_dy,
 		     const void *src, int stride, int16_t src_dx, int16_t src_dy,
 		     const BoxRec *box, int n);
@@ -657,10 +656,10 @@ void sna_write_boxes__xor(struct sna *sna, PixmapPtr dst,
 			  const BoxRec *box, int nbox,
 			  uint32_t and, uint32_t or);
 
-struct kgem_bo *sna_replace(struct sna *sna,
-			    PixmapPtr pixmap,
-			    struct kgem_bo *bo,
-			    const void *src, int stride);
+bool sna_replace(struct sna *sna,
+		 PixmapPtr pixmap,
+		 struct kgem_bo **bo,
+		 const void *src, int stride);
 struct kgem_bo *sna_replace__xor(struct sna *sna,
 				 PixmapPtr pixmap,
 				 struct kgem_bo *bo,
diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index f2997d0..ce35113 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -243,9 +243,14 @@ sna_pixmap_alloc_cpu(struct sna *sna,
 	if (priv->ptr)
 		goto done;
 
+	DBG(("%s: pixmap=%ld\n", __FUNCTION__, pixmap->drawable.serialNumber));
 	assert(priv->stride);
 
-	if (sna->kgem.has_cpu_bo || !priv->gpu) {
+	if ((sna->kgem.has_cpu_bo || !priv->gpu) &&
+	    kgem_can_create_cpu(&sna->kgem,
+				pixmap->drawable.width,
+				pixmap->drawable.height,
+				pixmap->drawable.bitsPerPixel)) {
 		DBG(("%s: allocating CPU buffer (%dx%d)\n", __FUNCTION__,
 		     pixmap->drawable.width, pixmap->drawable.height));
 
@@ -270,8 +275,11 @@ sna_pixmap_alloc_cpu(struct sna *sna,
 		}
 	}
 
-	if (priv->ptr == NULL)
+	if (priv->ptr == NULL) {
+		DBG(("%s: allocating ordinary memory for shadow pixels [%d bytes]\n",
+		     __FUNCTION__, priv->stride * pixmap->drawable.height));
 		priv->ptr = malloc(priv->stride * pixmap->drawable.height);
+	}
 
 	assert(priv->ptr);
 done:
@@ -289,7 +297,7 @@ static void sna_pixmap_free_cpu(struct sna *sna, struct sna_pixmap *priv)
 
 	if (priv->cpu_bo) {
 		DBG(("%s: discarding CPU buffer, handle=%d, size=%d\n",
-		     __FUNCTION__, priv->cpu_bo->handle, priv->cpu_bo->size));
+		     __FUNCTION__, priv->cpu_bo->handle, kgem_bo_size(priv->cpu_bo)));
 
 		kgem_bo_destroy(&sna->kgem, priv->cpu_bo);
 		priv->cpu_bo = NULL;
@@ -515,10 +523,10 @@ struct sna_pixmap *_sna_pixmap_attach(PixmapPtr pixmap)
 		break;
 
 	default:
-		if (!kgem_can_create_gpu(&sna->kgem,
-					 pixmap->drawable.width,
-					 pixmap->drawable.height,
-					 pixmap->drawable.bitsPerPixel))
+		if (!kgem_can_create_2d(&sna->kgem,
+					pixmap->drawable.width,
+					pixmap->drawable.height,
+					pixmap->drawable.depth))
 			return NULL;
 		break;
 	}
@@ -669,8 +677,11 @@ static PixmapPtr sna_create_pixmap(ScreenPtr screen,
 	DBG(("%s(%d, %d, %d, usage=%x)\n", __FUNCTION__,
 	     width, height, depth, usage));
 
-	if (!kgem_can_create_cpu(&sna->kgem, width, height, depth))
+	if (!kgem_can_create_2d(&sna->kgem, width, height, depth)) {
+		DBG(("%s: can not use GPU, just creating shadow\n",
+		     __FUNCTION__));
 		return create_pixmap(sna, screen, width, height, depth, usage);
+	}
 
 	if (!sna->have_render)
 		return create_pixmap(sna, screen,
@@ -704,6 +715,8 @@ static PixmapPtr sna_create_pixmap(ScreenPtr screen,
 
 	pad = PixmapBytePad(width, depth);
 	if (pad * height <= 4096) {
+		DBG(("%s: small buffer [%d], attaching to shadow pixmap\n",
+		     __FUNCTION__, pad * height));
 		pixmap = create_pixmap(sna, screen,
 				       width, height, depth, usage);
 		if (pixmap == NullPixmap)
@@ -713,6 +726,9 @@ static PixmapPtr sna_create_pixmap(ScreenPtr screen,
 	} else {
 		struct sna_pixmap *priv;
 
+		DBG(("%s: creating GPU pixmap %dx%d, stride=%d\n",
+		     __FUNCTION__, width, height, pad));
+
 		pixmap = create_pixmap(sna, screen, 0, 0, depth, usage);
 		if (pixmap == NullPixmap)
 			return NullPixmap;
@@ -1609,19 +1625,20 @@ sna_pixmap_move_area_to_gpu(PixmapPtr pixmap, BoxPtr box, unsigned int flags)
 				    box->x1 <= 0 && box->y1 <= 0 &&
 				    box->x2 >= pixmap->drawable.width &&
 				    box->y2 >= pixmap->drawable.height) {
-					priv->gpu_bo =
-						sna_replace(sna, pixmap,
-							    priv->gpu_bo,
-							    pixmap->devPrivate.ptr,
-							    pixmap->devKind);
+					ok = sna_replace(sna, pixmap,
+							 &priv->gpu_bo,
+							 pixmap->devPrivate.ptr,
+							 pixmap->devKind);
 				} else {
-					sna_write_boxes(sna, pixmap,
-							priv->gpu_bo, 0, 0,
-							pixmap->devPrivate.ptr,
-							pixmap->devKind,
-							0, 0,
-							box, n);
+					ok = sna_write_boxes(sna, pixmap,
+							     priv->gpu_bo, 0, 0,
+							     pixmap->devPrivate.ptr,
+							     pixmap->devKind,
+							     0, 0,
+							     box, n);
 				}
+				if (!ok)
+					return false;
 			}
 		}
 
@@ -1637,12 +1654,14 @@ sna_pixmap_move_area_to_gpu(PixmapPtr pixmap, BoxPtr box, unsigned int flags)
 						    pixmap, priv->gpu_bo, 0, 0,
 						    box, 1);
 		if (!ok)
-			sna_write_boxes(sna, pixmap,
-					priv->gpu_bo, 0, 0,
-					pixmap->devPrivate.ptr,
-					pixmap->devKind,
-					0, 0,
-					box, 1);
+			ok = sna_write_boxes(sna, pixmap,
+					     priv->gpu_bo, 0, 0,
+					     pixmap->devPrivate.ptr,
+					     pixmap->devKind,
+					     0, 0,
+					     box, 1);
+		if (!ok)
+			return false;
 
 		sna_damage_subtract(&priv->cpu_damage, &r);
 		priv->undamaged = true;
@@ -1658,12 +1677,14 @@ sna_pixmap_move_area_to_gpu(PixmapPtr pixmap, BoxPtr box, unsigned int flags)
 						    pixmap, priv->gpu_bo, 0, 0,
 						    box, n);
 		if (!ok)
-			sna_write_boxes(sna, pixmap,
-					priv->gpu_bo, 0, 0,
-					pixmap->devPrivate.ptr,
-					pixmap->devKind,
-					0, 0,
-					box, n);
+			ok = sna_write_boxes(sna, pixmap,
+					     priv->gpu_bo, 0, 0,
+					     pixmap->devPrivate.ptr,
+					     pixmap->devKind,
+					     0, 0,
+					     box, n);
+		if (!ok)
+			return false;
 
 		sna_damage_subtract(&priv->cpu_damage, &r);
 		priv->undamaged = true;
@@ -1671,7 +1692,7 @@ sna_pixmap_move_area_to_gpu(PixmapPtr pixmap, BoxPtr box, unsigned int flags)
 	}
 
 done:
-	if (!priv->pinned)
+	if (!priv->pinned && priv->gpu)
 		list_move(&priv->inactive, &sna->active_pixmaps);
 	priv->clear = false;
 	return true;
@@ -1811,7 +1832,7 @@ done:
 
 use_gpu_bo:
 	priv->clear = false;
-	if (!priv->pinned)
+	if (!priv->pinned && priv->gpu)
 		list_move(&priv->inactive,
 			  &to_sna_from_pixmap(pixmap)->active_pixmaps);
 	*damage = NULL;
@@ -1978,6 +1999,17 @@ sna_pixmap_force_to_gpu(PixmapPtr pixmap, unsigned flags)
 	if (!sna_pixmap_move_to_gpu(pixmap, flags))
 		return NULL;
 
+	/* For large bo, try to keep only a single copy around */
+	if (!priv->gpu && priv->ptr) {
+		sna_damage_all(&priv->gpu_damage,
+			       pixmap->drawable.width,
+			       pixmap->drawable.height);
+		sna_damage_destroy(&priv->cpu_damage);
+		priv->undamaged = false;
+		list_del(&priv->list);
+		sna_pixmap_free_cpu(to_sna_from_pixmap(pixmap), priv);
+	}
+
 	return priv;
 }
 
@@ -2070,19 +2102,20 @@ sna_pixmap_move_to_gpu(PixmapPtr pixmap, unsigned flags)
 			if (n == 1 && !priv->pinned &&
 			    (box->x2 - box->x1) >= pixmap->drawable.width &&
 			    (box->y2 - box->y1) >= pixmap->drawable.height) {
-				priv->gpu_bo =
-					sna_replace(sna, pixmap,
-						    priv->gpu_bo,
-						    pixmap->devPrivate.ptr,
-						    pixmap->devKind);
+				ok = sna_replace(sna, pixmap,
+						 &priv->gpu_bo,
+						 pixmap->devPrivate.ptr,
+						 pixmap->devKind);
 			} else {
-				sna_write_boxes(sna, pixmap,
+				ok = sna_write_boxes(sna, pixmap,
 						priv->gpu_bo, 0, 0,
 						pixmap->devPrivate.ptr,
 						pixmap->devKind,
 						0, 0,
 						box, n);
 			}
+			if (!ok)
+				return NULL;
 		}
 	}
 
@@ -2098,7 +2131,7 @@ done:
 	if (DAMAGE_IS_ALL(priv->gpu_damage))
 		priv->undamaged = false;
 active:
-	if (!priv->pinned)
+	if (!priv->pinned && priv->gpu)
 		list_move(&priv->inactive, &sna->active_pixmaps);
 	priv->clear = false;
 	return priv;
@@ -2321,11 +2354,8 @@ sna_put_image_upload_blt(DrawablePtr drawable, GCPtr gc, RegionPtr region,
 	    !priv->pinned && nbox == 1 &&
 	    box->x1 <= 0 && box->y1 <= 0 &&
 	    box->x2 >= pixmap->drawable.width &&
-	    box->y2 >= pixmap->drawable.height) {
-		priv->gpu_bo =
-			sna_replace(sna, pixmap, priv->gpu_bo, bits, stride);
-		return TRUE;
-	}
+	    box->y2 >= pixmap->drawable.height)
+		return sna_replace(sna, pixmap, &priv->gpu_bo, bits, stride);
 
 	get_drawable_deltas(drawable, pixmap, &dx, &dy);
 	x += dx + drawable->x;
@@ -2341,15 +2371,13 @@ sna_put_image_upload_blt(DrawablePtr drawable, GCPtr gc, RegionPtr region,
 		kgem_bo_destroy(&sna->kgem, src_bo);
 	}
 
-	if (!ok && gc->alu == GXcopy) {
-		sna_write_boxes(sna, pixmap,
-				priv->gpu_bo, 0, 0,
-				bits,
-				stride,
-				-x, -y,
-				box, nbox);
-		ok = TRUE;
-	}
+	if (!ok && gc->alu == GXcopy)
+		ok = sna_write_boxes(sna, pixmap,
+				     priv->gpu_bo, 0, 0,
+				     bits,
+				     stride,
+				     -x, -y,
+				     box, nbox);
 
 	return ok;
 }
@@ -3213,7 +3241,7 @@ sna_copy_boxes(DrawablePtr src, DrawablePtr dst, GCPtr gc,
 			}
 		} else {
 			dst_priv->clear = false;
-			if (!dst_priv->pinned)
+			if (!dst_priv->pinned && dst_priv->gpu)
 				list_move(&dst_priv->inactive,
 					  &sna->active_pixmaps);
 		}
@@ -3400,10 +3428,10 @@ sna_copy_boxes(DrawablePtr src, DrawablePtr dst, GCPtr gc,
 				assert(src_dy + box->y1 + dst_pixmap->drawable.height <= src_pixmap->drawable.height);
 				assert(src_dx + box->x1 + dst_pixmap->drawable.width <= src_pixmap->drawable.width);
 
-				dst_priv->gpu_bo =
-					sna_replace(sna, dst_pixmap,
-						    dst_priv->gpu_bo,
-						    bits, stride);
+				if (!sna_replace(sna, dst_pixmap,
+						 &dst_priv->gpu_bo,
+						 bits, stride))
+					goto fallback;
 
 				if (!DAMAGE_IS_ALL(dst_priv->gpu_damage)) {
 					sna_damage_destroy(&dst_priv->cpu_damage);
@@ -3416,12 +3444,13 @@ sna_copy_boxes(DrawablePtr src, DrawablePtr dst, GCPtr gc,
 			} else {
 				DBG(("%s: dst is on the GPU, src is on the CPU, uploading into dst\n",
 				     __FUNCTION__));
-				sna_write_boxes(sna, dst_pixmap,
-						dst_priv->gpu_bo, dst_dx, dst_dy,
-						src_pixmap->devPrivate.ptr,
-						src_pixmap->devKind,
-						src_dx, src_dy,
-						box, n);
+				if (!sna_write_boxes(sna, dst_pixmap,
+						     dst_priv->gpu_bo, dst_dx, dst_dy,
+						     src_pixmap->devPrivate.ptr,
+						     src_pixmap->devKind,
+						     src_dx, src_dy,
+						     box, n))
+					goto fallback;
 
 				if (!DAMAGE_IS_ALL(dst_priv->gpu_damage)) {
 					RegionTranslate(&region, dst_dx, dst_dy);
@@ -11502,7 +11531,7 @@ static void sna_accel_inactive(struct sna *sna)
 		count = bytes = 0;
 		list_for_each_entry(priv, &sna->inactive_clock[1], inactive)
 			if (!priv->pinned)
-				count++, bytes += priv->gpu_bo->size;
+				count++, bytes += kgem_bo_size(priv->gpu_bo);
 
 		DBG(("%s: trimming %d inactive GPU buffers, %d bytes\n",
 		    __FUNCTION__, count, bytes));
@@ -11528,6 +11557,9 @@ static void sna_accel_inactive(struct sna *sna)
 		priv = list_first_entry(&sna->inactive_clock[1],
 					struct sna_pixmap,
 					inactive);
+		assert(priv->gpu);
+		assert(priv->gpu_bo);
+
 		/* XXX Rather than discarding the GPU buffer here, we
 		 * could mark it purgeable and allow the shrinker to
 		 * reap its storage only under memory pressure.
diff --git a/src/sna/sna_blt.c b/src/sna/sna_blt.c
index 535628c..7efbcf9 100644
--- a/src/sna/sna_blt.c
+++ b/src/sna/sna_blt.c
@@ -235,7 +235,7 @@ inline static void sna_blt_fill_one(struct sna *sna,
 
 	assert(x >= 0);
 	assert(y >= 0);
-	assert((y+height) * blt->bo[0]->pitch <= blt->bo[0]->size);
+	assert((y+height) * blt->bo[0]->pitch <= kgem_bo_size(blt->bo[0]));
 
 	if (!kgem_check_batch(kgem, 3))
 		sna_blt_fill_begin(sna, blt);
@@ -358,10 +358,10 @@ static void sna_blt_alpha_fixup_one(struct sna *sna,
 
 	assert(src_x >= 0);
 	assert(src_y >= 0);
-	assert((src_y + height) * blt->bo[0]->pitch <= blt->bo[0]->size);
+	assert((src_y + height) * blt->bo[0]->pitch <= kgem_bo_size(blt->bo[0]));
 	assert(dst_x >= 0);
 	assert(dst_y >= 0);
-	assert((dst_y + height) * blt->bo[1]->pitch <= blt->bo[1]->size);
+	assert((dst_y + height) * blt->bo[1]->pitch <= kgem_bo_size(blt->bo[1]));
 	assert(width > 0);
 	assert(height > 0);
 
@@ -409,10 +409,10 @@ static void sna_blt_copy_one(struct sna *sna,
 
 	assert(src_x >= 0);
 	assert(src_y >= 0);
-	assert((src_y + height) * blt->bo[0]->pitch <= blt->bo[0]->size);
+	assert((src_y + height) * blt->bo[0]->pitch <= kgem_bo_size(blt->bo[0]));
 	assert(dst_x >= 0);
 	assert(dst_y >= 0);
-	assert((dst_y + height) * blt->bo[1]->pitch <= blt->bo[1]->size);
+	assert((dst_y + height) * blt->bo[1]->pitch <= kgem_bo_size(blt->bo[1]));
 	assert(width > 0);
 	assert(height > 0);
 
@@ -787,7 +787,7 @@ inline static void _sna_blt_fill_box(struct sna *sna,
 
 	assert(box->x1 >= 0);
 	assert(box->y1 >= 0);
-	assert(box->y2 * blt->bo[0]->pitch <= blt->bo[0]->size);
+	assert(box->y2 * blt->bo[0]->pitch <= kgem_bo_size(blt->bo[0]));
 
 	if (!kgem_check_batch(kgem, 3))
 		sna_blt_fill_begin(sna, blt);
@@ -1106,7 +1106,7 @@ prepare_blt_copy(struct sna *sna,
 	PixmapPtr src = op->u.blt.src_pixmap;
 	struct sna_pixmap *priv = sna_pixmap(src);
 
-	if (priv->gpu_bo->tiling == I915_TILING_Y)
+	if (!kgem_bo_can_blt(&sna->kgem, priv->gpu_bo))
 		return FALSE;
 
 	if (!kgem_check_bo_fenced(&sna->kgem, priv->gpu_bo, NULL)) {
@@ -1176,9 +1176,8 @@ blt_put_composite(struct sna *sna,
 		data += (src_x - dst_x) * bpp / 8;
 		data += (src_y - dst_y) * pitch;
 
-		dst_priv->gpu_bo =
-			sna_replace(sna, op->dst.pixmap, dst_priv->gpu_bo,
-				    data, pitch);
+		sna_replace(sna, op->dst.pixmap, &dst_priv->gpu_bo,
+			    data, pitch);
 	} else {
 		BoxRec box;
 
@@ -1215,9 +1214,8 @@ fastcall static void blt_put_composite_box(struct sna *sna,
 		data += (box->y1 + op->u.blt.sy) * pitch;
 		data += (box->x1 + op->u.blt.sx) * bpp;
 
-		dst_priv->gpu_bo =
-			sna_replace(sna, op->dst.pixmap, op->dst.bo,
-				    data, pitch);
+		sna_replace(sna, op->dst.pixmap, &dst_priv->gpu_bo,
+			    data, pitch);
 	} else {
 		sna_write_boxes(sna, op->dst.pixmap,
 				op->dst.bo, op->dst.x, op->dst.y,
@@ -1250,9 +1248,8 @@ static void blt_put_composite_boxes(struct sna *sna,
 		data += (box->y1 + op->u.blt.sy) * pitch;
 		data += (box->x1 + op->u.blt.sx) * bpp;
 
-		dst_priv->gpu_bo =
-			sna_replace(sna, op->dst.pixmap, op->dst.bo,
-				    data, pitch);
+		sna_replace(sna, op->dst.pixmap, &dst_priv->gpu_bo,
+			    data, pitch);
 	} else {
 		sna_write_boxes(sna, op->dst.pixmap,
 				op->dst.bo, op->dst.x, op->dst.y,
@@ -1573,9 +1570,13 @@ sna_blt_composite(struct sna *sna,
 
 	tmp->dst.pixmap = get_drawable_pixmap(dst->pDrawable);
 	priv = sna_pixmap_move_to_gpu(tmp->dst.pixmap, MOVE_WRITE | MOVE_READ);
-	if (priv == NULL || priv->gpu_bo->tiling == I915_TILING_Y) {
-		DBG(("%s: dst not on the gpu or using Y-tiling\n",
-		     __FUNCTION__));
+	if (priv == NULL) {
+		DBG(("%s: dst not attached\n", __FUNCTION__));
+		return FALSE;
+	}
+	if (!kgem_bo_can_blt(&sna->kgem, priv->gpu_bo)) {
+		DBG(("%s: can not blit to dst, tiling? %d, pitch? %d\n",
+		     __FUNCTION__, priv->gpu_bo->tiling, priv->gpu_bo->pitch));
 		return FALSE;
 	}
 
@@ -1747,7 +1748,7 @@ bool sna_blt_fill(struct sna *sna, uint8_t alu,
 
 	DBG(("%s(alu=%d, pixel=%x, bpp=%d)\n", __FUNCTION__, alu, pixel, bpp));
 
-	if (bo->tiling == I915_TILING_Y) {
+	if (!kgem_bo_can_blt(&sna->kgem, bo)) {
 		DBG(("%s: rejected due to incompatible Y-tiling\n",
 		     __FUNCTION__));
 		return FALSE;
@@ -1797,10 +1798,10 @@ bool sna_blt_copy(struct sna *sna, uint8_t alu,
 	return FALSE;
 #endif
 
-	if (src->tiling == I915_TILING_Y)
+	if (!kgem_bo_can_blt(&sna->kgem, src))
 		return FALSE;
 
-	if (dst->tiling == I915_TILING_Y)
+	if (!kgem_bo_can_blt(&sna->kgem, dst))
 		return FALSE;
 
 	if (!sna_blt_copy_init(sna, &op->base.u.blt,
@@ -1926,7 +1927,7 @@ Bool sna_blt_fill_boxes(struct sna *sna, uint8_t alu,
 	DBG(("%s (%d, %08x, %d) x %d\n",
 	     __FUNCTION__, bpp, pixel, alu, nbox));
 
-	if (bo->tiling == I915_TILING_Y) {
+	if (!kgem_bo_can_blt(kgem, bo)) {
 		DBG(("%s: fallback -- dst uses Y-tiling\n", __FUNCTION__));
 		return FALSE;
 	}
@@ -2020,7 +2021,7 @@ Bool sna_blt_fill_boxes(struct sna *sna, uint8_t alu,
 
 			assert(box->x1 >= 0);
 			assert(box->y1 >= 0);
-			assert(box->y2 * bo->pitch <= bo->size);
+			assert(box->y2 * bo->pitch <= kgem_bo_size(bo));
 
 			b = kgem->batch + kgem->nbatch;
 			kgem->nbatch += 3;
@@ -2075,8 +2076,13 @@ Bool sna_blt_copy_boxes(struct sna *sna, uint8_t alu,
 	    src_bo->tiling, dst_bo->tiling,
 	    src_bo->pitch, dst_bo->pitch));
 
-	if (src_bo->tiling == I915_TILING_Y || dst_bo->tiling == I915_TILING_Y)
+	if (!kgem_bo_can_blt(kgem, src_bo) || !kgem_bo_can_blt(kgem, dst_bo)) {
+		DBG(("%s: cannot blt to src? %d or dst? %d\n",
+		     __FUNCTION__,
+		     kgem_bo_can_blt(kgem, src_bo),
+		     kgem_bo_can_blt(kgem, dst_bo)));
 		return FALSE;
+	}
 
 	cmd = XY_SRC_COPY_BLT_CMD;
 	if (bpp == 32)
@@ -2087,7 +2093,7 @@ Bool sna_blt_copy_boxes(struct sna *sna, uint8_t alu,
 		cmd |= BLT_SRC_TILED;
 		src_pitch >>= 2;
 	}
-	assert(src_pitch < MAXSHORT);
+	assert(src_pitch <= MAXSHORT);
 
 	br13 = dst_bo->pitch;
 	if (kgem->gen >= 40 && dst_bo->tiling) {
diff --git a/src/sna/sna_io.c b/src/sna/sna_io.c
index f3ca212..14a7901 100644
--- a/src/sna/sna_io.c
+++ b/src/sna/sna_io.c
@@ -44,6 +44,27 @@
 
 /* XXX Need to avoid using GTT fenced access for I915_TILING_Y on 855GM */
 
+static Bool
+box_intersect(BoxPtr a, const BoxRec *b)
+{
+	if (a->x1 < b->x1)
+		a->x1 = b->x1;
+	if (a->x2 > b->x2)
+		a->x2 = b->x2;
+	if (a->y1 < b->y1)
+		a->y1 = b->y1;
+	if (a->y2 > b->y2)
+		a->y2 = b->y2;
+
+	return a->x1 < a->x2 && a->y1 < a->y2;
+}
+
+static inline bool must_tile(struct sna *sna, int width, int height)
+{
+	return (width  > sna->render.max_3d_size ||
+		height > sna->render.max_3d_size);
+}
+
 static void read_boxes_inplace(struct kgem *kgem,
 			       struct kgem_bo *bo, int16_t src_dx, int16_t src_dy,
 			       PixmapPtr pixmap, int16_t dst_dx, int16_t dst_dy,
@@ -105,13 +126,13 @@ void sna_read_boxes(struct sna *sna,
 	for (n = 0; n < nbox; n++) {
 		if (box[n].x1 + src_dx < 0 || box[n].y1 + src_dy < 0 ||
 		    (box[n].x2 + src_dx) * dst->drawable.bitsPerPixel/8 > src_bo->pitch ||
-		    (box[n].y2 + src_dy) * src_bo->pitch > src_bo->size)
+		    (box[n].y2 + src_dy) * src_bo->pitch > kgem_bo_size(src_bo))
 		{
 			FatalError("source out-of-bounds box[%d]=(%d, %d), (%d, %d) + (%d, %d), pitch=%d, size=%d\n", n,
 				   box[n].x1, box[n].y1,
 				   box[n].x2, box[n].y2,
 				   src_dx, src_dy,
-				   src_bo->pitch, src_bo->size);
+				   src_bo->pitch, kgem_bo_size(src_bo));
 		}
 	}
 #endif
@@ -132,7 +153,6 @@ fallback:
 		return;
 	}
 
-	/* Is it worth detiling? */
 	extents = box[0];
 	for (n = 1; n < nbox; n++) {
 		if (box[n].x1 < extents.x1)
@@ -145,11 +165,16 @@ fallback:
 		if (box[n].y2 > extents.y2)
 			extents.y2 = box[n].y2;
 	}
-	if ((extents.y2 - extents.y1) * src_bo->pitch < 4096)
-		goto fallback;
+	if (kgem_bo_is_mappable(kgem, src_bo)) {
+		/* Is it worth detiling? */
+		if ((extents.y2 - extents.y1) * src_bo->pitch < 4096)
+			goto fallback;
+	}
 
 	/* Try to avoid switching rings... */
-	if (src_bo->tiling == I915_TILING_Y || kgem->ring == KGEM_RENDER) {
+	if (kgem->ring == KGEM_RENDER ||
+	    !kgem_bo_can_blt(kgem, src_bo) ||
+	    must_tile(sna, extents.x2 - extents.x1, extents.y2 - extents.y1)) {
 		PixmapRec tmp;
 
 		tmp.drawable.width  = extents.x2 - extents.x1;
@@ -161,38 +186,124 @@ fallback:
 		assert(tmp.drawable.width);
 		assert(tmp.drawable.height);
 
-		dst_bo = kgem_create_buffer_2d(kgem,
-					       tmp.drawable.width,
-					       tmp.drawable.height,
-					       tmp.drawable.bitsPerPixel,
-					       KGEM_BUFFER_LAST,
-					       &ptr);
-		if (!dst_bo)
-			goto fallback;
+		if (must_tile(sna, tmp.drawable.width, tmp.drawable.height)) {
+			BoxRec tile, stack[64], *clipped, *c;
+			int step;
+
+			if (n > ARRAY_SIZE(stack)) {
+				clipped = malloc(sizeof(BoxRec) * n);
+				if (clipped == NULL)
+					goto fallback;
+			} else
+				clipped = stack;
+
+			step = MIN(sna->render.max_3d_size,
+				   8*(MAXSHORT&~63) / dst->drawable.bitsPerPixel);
+			DBG(("%s: tiling download, using %dx%d tiles\n",
+			     __FUNCTION__, step, step));
+
+			for (tile.y1 = extents.y1; tile.y1 < extents.y2; tile.y1 = tile.y2) {
+				tile.y2 = tile.y1 + step;
+				if (tile.y2 > extents.y2)
+					tile.y2 = extents.y2;
+
+				for (tile.x1 = extents.x1; tile.x1 < extents.x2; tile.x1 = tile.x2) {
+					tile.x2 = tile.x1 + step;
+					if (tile.x2 > extents.x2)
+						tile.x2 = extents.x2;
+
+					tmp.drawable.width  = tile.x2 - tile.x1;
+					tmp.drawable.height = tile.y2 - tile.y1;
+
+					c = clipped;
+					for (n = 0; n < nbox; n++) {
+						*c = box[n];
+						if (!box_intersect(c, &tile))
+							continue;
+
+						DBG(("%s: box(%d, %d), (%d, %d), src=(%d, %d), dst=(%d, %d)\n",
+						     __FUNCTION__,
+						     c->x1, c->y1,
+						     c->x2, c->y2,
+						     src_dx, src_dy,
+						     c->x1 - tile.x1,
+						     c->y1 - tile.y1));
+						c++;
+					}
+					if (c == clipped)
+						continue;
+
+					dst_bo = kgem_create_buffer_2d(kgem,
+								       tmp.drawable.width,
+								       tmp.drawable.height,
+								       tmp.drawable.bitsPerPixel,
+								       KGEM_BUFFER_LAST,
+								       &ptr);
+					if (!dst_bo)
+						goto fallback;
+
+					if (!sna->render.copy_boxes(sna, GXcopy,
+								    dst, src_bo, src_dx, src_dy,
+								    &tmp, dst_bo, -tile.x1, -tile.y1,
+								    clipped, c-clipped)) {
+						kgem_bo_destroy(&sna->kgem, dst_bo);
+						goto fallback;
+					}
+
+					kgem_bo_submit(&sna->kgem, dst_bo);
+					kgem_buffer_read_sync(kgem, dst_bo);
+
+					while (c-- != clipped) {
+						memcpy_blt(ptr, dst->devPrivate.ptr, tmp.drawable.bitsPerPixel,
+							   dst_bo->pitch, dst->devKind,
+							   c->x1 - tile.x1,
+							   c->y1 - tile.y1,
+							   c->x1 + dst_dx,
+							   c->y1 + dst_dy,
+							   c->x2 - c->x1,
+							   c->y2 - c->y1);
+					}
+
+					kgem_bo_destroy(&sna->kgem, dst_bo);
+				}
+			}
 
-		if (!sna->render.copy_boxes(sna, GXcopy,
-					    dst, src_bo, src_dx, src_dy,
-					    &tmp, dst_bo, -extents.x1, -extents.y1,
-					    box, nbox)) {
-			kgem_bo_destroy(&sna->kgem, dst_bo);
-			goto fallback;
-		}
+			if (clipped != stack)
+				free(clipped);
+		} else {
+			dst_bo = kgem_create_buffer_2d(kgem,
+						       tmp.drawable.width,
+						       tmp.drawable.height,
+						       tmp.drawable.bitsPerPixel,
+						       KGEM_BUFFER_LAST,
+						       &ptr);
+			if (!dst_bo)
+				goto fallback;
+
+			if (!sna->render.copy_boxes(sna, GXcopy,
+						    dst, src_bo, src_dx, src_dy,
+						    &tmp, dst_bo, -extents.x1, -extents.y1,
+						    box, nbox)) {
+				kgem_bo_destroy(&sna->kgem, dst_bo);
+				goto fallback;
+			}
 
-		kgem_bo_submit(&sna->kgem, dst_bo);
-		kgem_buffer_read_sync(kgem, dst_bo);
+			kgem_bo_submit(&sna->kgem, dst_bo);
+			kgem_buffer_read_sync(kgem, dst_bo);
+
+			for (n = 0; n < nbox; n++) {
+				memcpy_blt(ptr, dst->devPrivate.ptr, tmp.drawable.bitsPerPixel,
+					   dst_bo->pitch, dst->devKind,
+					   box[n].x1 - extents.x1,
+					   box[n].y1 - extents.y1,
+					   box[n].x1 + dst_dx,
+					   box[n].y1 + dst_dy,
+					   box[n].x2 - box[n].x1,
+					   box[n].y2 - box[n].y1);
+			}
 
-		for (n = 0; n < nbox; n++) {
-			memcpy_blt(ptr, dst->devPrivate.ptr, tmp.drawable.bitsPerPixel,
-				   dst_bo->pitch, dst->devKind,
-				   box[n].x1 - extents.x1,
-				   box[n].y1 - extents.y1,
-				   box[n].x1 + dst_dx,
-				   box[n].y1 + dst_dy,
-				   box[n].x2 - box[n].x1,
-				   box[n].y2 - box[n].y1);
+			kgem_bo_destroy(&sna->kgem, dst_bo);
 		}
-
-		kgem_bo_destroy(&sna->kgem, dst_bo);
 		return;
 	}
 
@@ -270,7 +381,7 @@ fallback:
 			assert(tmp_box[n].x1 + src_dx >= 0);
 			assert((tmp_box[n].x2 + src_dx) * dst->drawable.bitsPerPixel/8 <= src_bo->pitch);
 			assert(tmp_box[n].y1 + src_dy >= 0);
-			assert((tmp_box[n].y2 + src_dy) * src_bo->pitch <= src_bo->size);
+			assert((tmp_box[n].y2 + src_dy) * src_bo->pitch <= kgem_bo_size(src_bo));
 
 			b[0] = cmd;
 			b[1] = br13 | pitch;
@@ -299,7 +410,7 @@ fallback:
 		_kgem_set_mode(kgem, KGEM_BLT);
 		tmp_box += nbox_this_time;
 	} while (1);
-	assert(offset == dst_bo->size);
+	assert(offset == kgem_buffer_size(dst_bo));
 
 	kgem_buffer_read_sync(kgem, dst_bo);
 
@@ -331,12 +442,12 @@ fallback:
 
 		src += pitch * height;
 	} while (--nbox);
-	assert(src - (char *)ptr == dst_bo->size);
+	assert(src - (char *)ptr == kgem_buffer_size(dst_bo));
 	kgem_bo_destroy(kgem, dst_bo);
 	sna->blt_state.fill_bo = 0;
 }
 
-static void write_boxes_inplace(struct kgem *kgem,
+static bool write_boxes_inplace(struct kgem *kgem,
 				const void *src, int stride, int bpp, int16_t src_dx, int16_t src_dy,
 				struct kgem_bo *bo, int16_t dst_dx, int16_t dst_dy,
 				const BoxRec *box, int n)
@@ -346,11 +457,14 @@ static void write_boxes_inplace(struct kgem *kgem,
 	DBG(("%s x %d, handle=%d, tiling=%d\n",
 	     __FUNCTION__, n, bo->handle, bo->tiling));
 
+	if (!kgem_bo_is_mappable(kgem, bo))
+		return false;
+
 	kgem_bo_submit(kgem, bo);
 
 	dst = kgem_bo_map(kgem, bo);
 	if (dst == NULL)
-		return;
+		return false;
 
 	assert(dst != src);
 
@@ -364,7 +478,7 @@ static void write_boxes_inplace(struct kgem *kgem,
 		assert(box->x1 + dst_dx >= 0);
 		assert((box->x2 + dst_dx)*bpp <= 8*bo->pitch);
 		assert(box->y1 + dst_dy >= 0);
-		assert((box->y2 + dst_dy)*bo->pitch <= bo->size);
+		assert((box->y2 + dst_dy)*bo->pitch <= kgem_bo_size(bo));
 
 		assert(box->x1 + src_dx >= 0);
 		assert((box->x2 + src_dx)*bpp <= 8*stride);
@@ -377,6 +491,7 @@ static void write_boxes_inplace(struct kgem *kgem,
 			   box->x2 - box->x1, box->y2 - box->y1);
 		box++;
 	} while (--n);
+	return true;
 }
 
 static bool upload_inplace(struct kgem *kgem,
@@ -384,9 +499,6 @@ static bool upload_inplace(struct kgem *kgem,
 			   const BoxRec *box,
 			   int n, int bpp)
 {
-	if (DEBUG_NO_IO)
-		return kgem_bo_is_mappable(kgem, bo);
-
 	/* If we are writing through the GTT, check first if we might be
 	 * able to almagamate a series of small writes into a single
 	 * operation.
@@ -404,13 +516,14 @@ static bool upload_inplace(struct kgem *kgem,
 	return !kgem_bo_map_will_stall(kgem, bo);
 }
 
-void sna_write_boxes(struct sna *sna, PixmapPtr dst,
+bool sna_write_boxes(struct sna *sna, PixmapPtr dst,
 		     struct kgem_bo *dst_bo, int16_t dst_dx, int16_t dst_dy,
 		     const void *src, int stride, int16_t src_dx, int16_t src_dy,
 		     const BoxRec *box, int nbox)
 {
 	struct kgem *kgem = &sna->kgem;
 	struct kgem_bo *src_bo;
+	BoxRec extents;
 	void *ptr;
 	int offset;
 	int n, cmd, br13;
@@ -419,30 +532,30 @@ void sna_write_boxes(struct sna *sna, PixmapPtr dst,
 
 	if (upload_inplace(kgem, dst_bo, box, nbox, dst->drawable.bitsPerPixel)) {
 fallback:
-		write_boxes_inplace(kgem,
-				    src, stride, dst->drawable.bitsPerPixel, src_dx, src_dy,
-				    dst_bo, dst_dx, dst_dy,
-				    box, nbox);
-		return;
+		return write_boxes_inplace(kgem,
+					   src, stride, dst->drawable.bitsPerPixel, src_dx, src_dy,
+					   dst_bo, dst_dx, dst_dy,
+					   box, nbox);
 	}
 
-	/* Try to avoid switching rings... */
-	if (dst_bo->tiling == I915_TILING_Y || kgem->ring == KGEM_RENDER) {
-		PixmapRec tmp;
-		BoxRec extents;
+	extents = box[0];
+	for (n = 1; n < nbox; n++) {
+		if (box[n].x1 < extents.x1)
+			extents.x1 = box[n].x1;
+		if (box[n].x2 > extents.x2)
+			extents.x2 = box[n].x2;
 
-		extents = box[0];
-		for (n = 1; n < nbox; n++) {
-			if (box[n].x1 < extents.x1)
-				extents.x1 = box[n].x1;
-			if (box[n].x2 > extents.x2)
-				extents.x2 = box[n].x2;
+		if (box[n].y1 < extents.y1)
+			extents.y1 = box[n].y1;
+		if (box[n].y2 > extents.y2)
+			extents.y2 = box[n].y2;
+	}
 
-			if (box[n].y1 < extents.y1)
-				extents.y1 = box[n].y1;
-			if (box[n].y2 > extents.y2)
-				extents.y2 = box[n].y2;
-		}
+	/* Try to avoid switching rings... */
+	if (kgem->ring == KGEM_RENDER ||
+	    !kgem_bo_can_blt(kgem, dst_bo) ||
+	    must_tile(sna, extents.x2 - extents.x1, extents.y2 - extents.y1)) {
+		PixmapRec tmp;
 
 		tmp.drawable.width  = extents.x2 - extents.x1;
 		tmp.drawable.height = extents.y2 - extents.y1;
@@ -453,37 +566,130 @@ fallback:
 		assert(tmp.drawable.width);
 		assert(tmp.drawable.height);
 
-		src_bo = kgem_create_buffer_2d(kgem,
-					       tmp.drawable.width,
-					       tmp.drawable.height,
-					       tmp.drawable.bitsPerPixel,
-					       KGEM_BUFFER_WRITE_INPLACE,
-					       &ptr);
-		if (!src_bo)
-			goto fallback;
+		DBG(("%s: upload (%d, %d)x(%d, %d), max %dx%d\n",
+		     __FUNCTION__,
+		     extents.x1, extents.y1,
+		     tmp.drawable.width, tmp.drawable.height,
+		     sna->render.max_3d_size, sna->render.max_3d_size));
+		if (must_tile(sna, tmp.drawable.width, tmp.drawable.height)) {
+			BoxRec tile, stack[64], *clipped, *c;
+			int step;
+
+			step = MIN(sna->render.max_3d_size,
+				   8*(MAXSHORT&~63) / dst->drawable.bitsPerPixel);
+			DBG(("%s: tiling upload, using %dx%d tiles\n",
+			     __FUNCTION__, step, step));
+
+			if (n > ARRAY_SIZE(stack)) {
+				clipped = malloc(sizeof(BoxRec) * n);
+				if (clipped == NULL)
+					goto fallback;
+			} else
+				clipped = stack;
+
+			for (tile.y1 = extents.y1; tile.y1 < extents.y2; tile.y1 = tile.y2) {
+				tile.y2 = tile.y1 + step;
+				if (tile.y2 > extents.y2)
+					tile.y2 = extents.y2;
+
+				for (tile.x1 = extents.x1; tile.x1 < extents.x2; tile.x1 = tile.x2) {
+					tile.x2 = tile.x1 + step;
+					if (tile.x2 > extents.x2)
+						tile.x2 = extents.x2;
+
+					tmp.drawable.width  = tile.x2 - tile.x1;
+					tmp.drawable.height = tile.y2 - tile.y1;
+
+					src_bo = kgem_create_buffer_2d(kgem,
+								       tmp.drawable.width,
+								       tmp.drawable.height,
+								       tmp.drawable.bitsPerPixel,
+								       KGEM_BUFFER_WRITE_INPLACE,
+								       &ptr);
+					if (!src_bo)
+						goto fallback;
+
+					c = clipped;
+					for (n = 0; n < nbox; n++) {
+						*c = box[n];
+						if (!box_intersect(c, &tile))
+							continue;
+
+						DBG(("%s: box(%d, %d), (%d, %d), src=(%d, %d), dst=(%d, %d)\n",
+						     __FUNCTION__,
+						     c->x1, c->y1,
+						     c->x2, c->y2,
+						     src_dx, src_dy,
+						     c->x1 - tile.x1,
+						     c->y1 - tile.y1));
+						memcpy_blt(src, ptr, tmp.drawable.bitsPerPixel,
+							   stride, src_bo->pitch,
+							   c->x1 + src_dx,
+							   c->y1 + src_dy,
+							   c->x1 - tile.x1,
+							   c->y1 - tile.y1,
+							   c->x2 - c->x1,
+							   c->y2 - c->y1);
+						c++;
+					}
+
+					if (c != clipped)
+						n = sna->render.copy_boxes(sna, GXcopy,
+									   &tmp, src_bo, -tile.x1, -tile.y1,
+									   dst, dst_bo, dst_dx, dst_dy,
+									   clipped, c - clipped);
+					else
+						n = 1;
+
+					kgem_bo_destroy(&sna->kgem, src_bo);
+
+					if (!n)
+						goto fallback;
+				}
+			}
 
-		for (n = 0; n < nbox; n++) {
-			memcpy_blt(src, ptr, tmp.drawable.bitsPerPixel,
-				   stride, src_bo->pitch,
-				   box[n].x1 + src_dx,
-				   box[n].y1 + src_dy,
-				   box[n].x1 - extents.x1,
-				   box[n].y1 - extents.y1,
-				   box[n].x2 - box[n].x1,
-				   box[n].y2 - box[n].y1);
-		}
+			if (clipped != stack)
+				free(clipped);
+		} else {
+			src_bo = kgem_create_buffer_2d(kgem,
+						       tmp.drawable.width,
+						       tmp.drawable.height,
+						       tmp.drawable.bitsPerPixel,
+						       KGEM_BUFFER_WRITE_INPLACE,
+						       &ptr);
+			if (!src_bo)
+				goto fallback;
+
+			for (n = 0; n < nbox; n++) {
+				DBG(("%s: box(%d, %d), (%d, %d), src=(%d, %d), dst=(%d, %d)\n",
+				     __FUNCTION__,
+				     box[n].x1, box[n].y1,
+				     box[n].x2, box[n].y2,
+				     src_dx, src_dy,
+				     box[n].x1 - extents.x1,
+				     box[n].y1 - extents.y1));
+				memcpy_blt(src, ptr, tmp.drawable.bitsPerPixel,
+					   stride, src_bo->pitch,
+					   box[n].x1 + src_dx,
+					   box[n].y1 + src_dy,
+					   box[n].x1 - extents.x1,
+					   box[n].y1 - extents.y1,
+					   box[n].x2 - box[n].x1,
+					   box[n].y2 - box[n].y1);
+			}
 
-		n = sna->render.copy_boxes(sna, GXcopy,
-					   &tmp, src_bo, -extents.x1, -extents.y1,
-					   dst, dst_bo, dst_dx, dst_dy,
-					   box, nbox);
+			n = sna->render.copy_boxes(sna, GXcopy,
+						   &tmp, src_bo, -extents.x1, -extents.y1,
+						   dst, dst_bo, dst_dx, dst_dy,
+						   box, nbox);
 
-		kgem_bo_destroy(&sna->kgem, src_bo);
+			kgem_bo_destroy(&sna->kgem, src_bo);
 
-		if (!n)
-			goto fallback;
+			if (!n)
+				goto fallback;
+		}
 
-		return;
+		return true;
 	}
 
 	cmd = XY_SRC_COPY_BLT_CMD;
@@ -586,7 +792,7 @@ fallback:
 			box++;
 			offset += pitch * height;
 		} while (--nbox_this_time);
-		assert(offset == src_bo->size);
+		assert(offset == kgem_buffer_size(src_bo));
 
 		if (nbox) {
 			_kgem_submit(kgem);
@@ -597,6 +803,7 @@ fallback:
 	} while (nbox);
 
 	sna->blt_state.fill_bo = 0;
+	return true;
 }
 
 static void
@@ -823,7 +1030,7 @@ fallback:
 			box++;
 			offset += pitch * height;
 		} while (--nbox_this_time);
-		assert(offset == src_bo->size);
+		assert(offset == kgem_buffer_size(src_bo));
 
 		if (nbox) {
 			_kgem_submit(kgem);
@@ -951,11 +1158,12 @@ indirect_replace(struct sna *sna,
 	return ret;
 }
 
-struct kgem_bo *sna_replace(struct sna *sna,
-			    PixmapPtr pixmap,
-			    struct kgem_bo *bo,
-			    const void *src, int stride)
+bool sna_replace(struct sna *sna,
+		 PixmapPtr pixmap,
+		 struct kgem_bo **_bo,
+		 const void *src, int stride)
 {
+	struct kgem_bo *bo = *_bo;
 	struct kgem *kgem = &sna->kgem;
 	void *dst;
 
@@ -968,7 +1176,7 @@ struct kgem_bo *sna_replace(struct sna *sna,
 
 	if ((!kgem_bo_mapped(bo) || bo->rq) &&
 	    indirect_replace(sna, pixmap, bo, src, stride))
-		return bo;
+		return true;
 
 	if (kgem_bo_is_busy(bo)) {
 		struct kgem_bo *new_bo;
@@ -979,26 +1187,26 @@ struct kgem_bo *sna_replace(struct sna *sna,
 					pixmap->drawable.bitsPerPixel,
 					bo->tiling,
 					CREATE_GTT_MAP | CREATE_INACTIVE);
-		if (new_bo) {
-			kgem_bo_destroy(kgem, bo);
+		if (new_bo)
 			bo = new_bo;
-		}
 	}
 
 	if (bo->tiling == I915_TILING_NONE && bo->pitch == stride) {
-		kgem_bo_write(kgem, bo, src,
-			      (pixmap->drawable.height-1)*stride + pixmap->drawable.width*pixmap->drawable.bitsPerPixel/8);
+		if (!kgem_bo_write(kgem, bo, src,
+				   (pixmap->drawable.height-1)*stride + pixmap->drawable.width*pixmap->drawable.bitsPerPixel/8))
+			goto err;
 	} else {
 		if (kgem_bo_is_mappable(kgem, bo)) {
 			dst = kgem_bo_map(kgem, bo);
-			if (dst) {
-				memcpy_blt(src, dst, pixmap->drawable.bitsPerPixel,
-					   stride, bo->pitch,
-					   0, 0,
-					   0, 0,
-					   pixmap->drawable.width,
-					   pixmap->drawable.height);
-			}
+			if (!dst)
+				goto err;
+
+			memcpy_blt(src, dst, pixmap->drawable.bitsPerPixel,
+				   stride, bo->pitch,
+				   0, 0,
+				   0, 0,
+				   pixmap->drawable.width,
+				   pixmap->drawable.height);
 		} else {
 			BoxRec box;
 
@@ -1006,14 +1214,23 @@ struct kgem_bo *sna_replace(struct sna *sna,
 			box.x2 = pixmap->drawable.width;
 			box.y2 = pixmap->drawable.height;
 
-			sna_write_boxes(sna, pixmap,
-					bo, 0, 0,
-					src, stride, 0, 0,
-					&box, 1);
+			if (!sna_write_boxes(sna, pixmap,
+					     bo, 0, 0,
+					     src, stride, 0, 0,
+					     &box, 1))
+				goto err;
 		}
 	}
 
-	return bo;
+	if (bo != *_bo)
+		kgem_bo_destroy(kgem, *_bo);
+	*_bo = bo;
+	return true;
+
+err:
+	if (bo != *_bo)
+		kgem_bo_destroy(kgem, bo);
+	return false;
 }
 
 struct kgem_bo *sna_replace__xor(struct sna *sna,
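
For reference, the tile step used by the new tiled download/upload paths
above works out as follows (worked through here for 32 bpp; MAXSHORT is
32767):

    step = MIN(max_3d_size, 8*(MAXSHORT & ~63) / bpp)
         = MIN(max_3d_size, 8*32704 / 32)
         = MIN(max_3d_size, 8176) pixels

so each tile fits within the renderer's maximum 3D surface size, and a
full-width tile has a pitch of at most 32704 bytes, below the blitter's
16-bit pitch limit checked by kgem_bo_can_blt().
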
diff --git a/src/sna/sna_render.c b/src/sna/sna_render.c
index f9151e0..7077f36 100644
--- a/src/sna/sna_render.c
+++ b/src/sna/sna_render.c
@@ -696,6 +696,11 @@ static int sna_render_picture_downsample(struct sna *sna,
 	DBG(("%s: creating temporary GPU bo %dx%d\n",
 	     __FUNCTION__, width, height));
 
+	if (!sna_pixmap_force_to_gpu(pixmap, MOVE_READ))
+		return sna_render_picture_fixup(sna, picture, channel,
+						x, y, ow, oh,
+						dst_x, dst_y);
+
 	tmp = screen->CreatePixmap(screen,
 				   width, height,
 				   pixmap->drawable.depth,
@@ -1306,9 +1311,6 @@ do_fixup:
 		return 0;
 	}
 
-	/* XXX Convolution filter? */
-	memset(ptr, 0, channel->bo->size);
-
 	/* Composite in the original format to preserve idiosyncracies */
 	if (picture->format == channel->pict_format)
 		dst = pixman_image_create_bits(picture->format,
@@ -1354,7 +1356,7 @@ do_fixup:
 					       w, h);
 			pixman_image_unref(src);
 		} else {
-			memset(ptr, 0, channel->bo->size);
+			memset(ptr, 0, kgem_buffer_size(channel->bo));
 			dst = src;
 		}
 	}
@@ -1528,7 +1530,7 @@ sna_render_composite_redirect(struct sna *sna,
 	if (op->dst.pixmap->drawable.width <= sna->render.max_3d_size) {
 		int y1, y2;
 
-		assert(op->dst.pixmap.drawable.height > sna->render.max_3d_size);
+		assert(op->dst.pixmap->drawable.height > sna->render.max_3d_size);
 		y1 =  y + op->dst.y;
 		y2 =  y1 + height;
 		y1 &= y1 & (64 - 1);
diff --git a/src/sna/sna_video.c b/src/sna/sna_video.c
index 7b759a7..cec0473 100644
--- a/src/sna/sna_video.c
+++ b/src/sna/sna_video.c
@@ -100,7 +100,7 @@ sna_video_buffer(struct sna *sna,
 		 struct sna_video_frame *frame)
 {
 	/* Free the current buffer if we're going to have to reallocate */
-	if (video->buf && video->buf->size < frame->size)
+	if (video->buf && kgem_bo_size(video->buf) < frame->size)
 		sna_video_free_buffers(sna, video);
 
 	if (video->buf == NULL)
diff --git a/src/sna/sna_video_textured.c b/src/sna/sna_video_textured.c
index d99f884..1aaf972 100644
--- a/src/sna/sna_video_textured.c
+++ b/src/sna/sna_video_textured.c
@@ -271,7 +271,7 @@ sna_video_textured_put_image(ScrnInfoPtr scrn,
 			return BadAlloc;
 		}
 
-		assert(frame.bo->size >= frame.size);
+		assert(kgem_bo_size(frame.bo) >= frame.size);
 	} else {
 		frame.bo = kgem_create_linear(&sna->kgem, frame.size);
 		if (frame.bo == NULL) {
commit 03211f4b0b7e32b6d7dc28e60be72db204b8c8d4
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Jan 27 23:18:05 2012 +0000

    sna: Guard against the upload buffer growing past the maximum bo size
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
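
As a worked illustration of the guard below (the numbers are assumed, not
taken from the driver): with partial_buffer_size = 32 KiB and
max_cpu_size = 64 MiB, a 40 MiB upload would have been speculatively
doubled to an 80 MiB partial buffer; with the guard,
ALIGN(2*size, partial_buffer_size) = 80 MiB >= max_cpu_size, so the
allocation falls back to PAGE_ALIGN(size) = 40 MiB.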

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index a5c47d6..86a4372 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -3010,7 +3010,7 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 	     !!(flags & KGEM_BUFFER_LAST)));
 	assert(size);
 	/* we should never be asked to create anything TOO large */
-	assert(size < kgem->max_cpu_buffer);
+	assert(size < kgem->max_cpu_size);
 
 	list_for_each_entry(bo, &kgem->partial, base.list) {
 		if (flags == KGEM_BUFFER_LAST && bo->write) {
@@ -3053,6 +3053,8 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 #if !DBG_NO_MAP_UPLOAD
 	/* Be a little more generous and hope to hold fewer mmappings */
 	alloc = ALIGN(2*size, kgem->partial_buffer_size);
+	if (alloc >= kgem->max_cpu_size)
+		alloc = PAGE_ALIGN(size);
 	if (kgem->has_cpu_bo) {
 		bo = malloc(sizeof(*bo));
 		if (bo == NULL)

