xf86-video-intel: 7 commits - src/intel_dri.c src/intel.h src/intel_video.c src/legacy/i810 src/sna/blt.c src/sna/gen6_render.c src/sna/gen7_render.c src/sna/kgem.c src/sna/kgem_debug_gen3.c src/sna/kgem_debug_gen4.c src/sna/kgem_debug_gen5.c src/sna/kgem_debug_gen6.c src/sna/kgem_debug_gen7.c src/sna/kgem.h src/sna/sna_accel.c src/sna/sna_glyphs.c src/sna/sna.h src/sna/sna_io.c src/sna/sna_render.c src/sna/sna_render.h

Chris Wilson ickle at kemper.freedesktop.org
Sun Jan 8 14:49:08 PST 2012


 src/intel.h                  |    1 
 src/intel_dri.c              |    1 
 src/intel_video.c            |    1 
 src/legacy/i810/i810.h       |    1 
 src/legacy/i810/i810_dga.c   |    1 
 src/legacy/i810/i810_dri.c   |    1 
 src/legacy/i810/i810_hwmc.c  |    1 
 src/legacy/i810/i810_video.c |    1 
 src/sna/blt.c                |  253 ++++++++++++++++++++---
 src/sna/gen6_render.c        |   98 ++++++++
 src/sna/gen7_render.c        |   98 ++++++++
 src/sna/kgem.c               |  472 +++++++++++++++++++++++--------------------
 src/sna/kgem.h               |   24 +-
 src/sna/kgem_debug_gen3.c    |    6 
 src/sna/kgem_debug_gen4.c    |   23 --
 src/sna/kgem_debug_gen5.c    |   23 --
 src/sna/kgem_debug_gen6.c    |   30 --
 src/sna/kgem_debug_gen7.c    |   30 --
 src/sna/sna.h                |    1 
 src/sna/sna_accel.c          |    2 
 src/sna/sna_glyphs.c         |    6 
 src/sna/sna_io.c             |   46 +++-
 src/sna/sna_render.c         |   13 +
 src/sna/sna_render.h         |    1 
 24 files changed, 747 insertions(+), 387 deletions(-)

New commits:
commit 6c70558ae7298db94724c931d88a730ef0151608
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sun Jan 8 20:48:48 2012 +0000

    sna: mark the cpu bo used for the upload buffer as in CPU domain
    
    For correctness we need to inform GEM of the change of domain for the
    buffer so that it knows to invalidate any caches when it is next used by
    the GPU.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
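
    As a minimal sketch (not part of the patch), the domain change that
    kgem_bo_sync__cpu() performs boils down to the published i915
    set-domain ioctl, roughly:

	#include <string.h>
	#include <stdint.h>
	#include <xf86drm.h>
	#include <i915_drm.h>

	/* Illustrative only: move a bo into the CPU read (and optionally
	 * write) domain so the kernel flushes/invalidates caches as
	 * needed before the GPU next touches the buffer.
	 */
	static int bo_sync_cpu(int fd, uint32_t handle, int write)
	{
		struct drm_i915_gem_set_domain arg;

		memset(&arg, 0, sizeof(arg));
		arg.handle = handle;
		arg.read_domains = I915_GEM_DOMAIN_CPU;
		arg.write_domain = write ? I915_GEM_DOMAIN_CPU : 0;

		return drmIoctl(fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &arg);
	}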

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 643771f..6801c59 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -2883,6 +2883,9 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 			return NULL;
 		}
 
+		if (write)
+			kgem_bo_sync__cpu(kgem, &bo->base);
+
 		bo->need_io = false;
 		bo->base.io = true;
 
commit 9ec31af02922bb016d0dfba07bc60cdca35b36f8
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sun Jan 8 15:52:17 2012 +0000

    sna/io: Combine small uploads into single writes
    
    For a small update, try and amalgamate the upload buffer.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
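
    The spirit of the new cut-off (the exact expression lives in
    upload_inplace() below): if the bounding box of all the boxes to be
    written is small relative to half the CPU cache, stage them in one
    upload buffer and issue a single write rather than dribbling each
    box through a potentially stalling GTT mapping.  As a made-up
    example, a hundred glyph boxes confined to a 512x64 region at 32bpp
    cover 512*64*4 = 128KiB, well under half of a multi-megabyte LLC,
    so they are amalgamated.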

diff --git a/src/sna/sna_io.c b/src/sna/sna_io.c
index 761d324..dbf1774 100644
--- a/src/sna/sna_io.c
+++ b/src/sna/sna_io.c
@@ -305,6 +305,46 @@ static void write_boxes_inplace(struct kgem *kgem,
 	} while (--n);
 }
 
+static bool upload_inplace(struct kgem *kgem,
+			   struct kgem_bo *bo,
+			   const BoxRec *box,
+			   int n, int bpp)
+{
+	if (DEBUG_NO_IO)
+		return true;
+
+	if (unlikely(kgem->wedged))
+		return true;
+
+	/* If we are writing through the GTT, check first if we might be
+	 * able to amalgamate a series of small writes into a single
+	 * operation.
+	 */
+	if (!bo->map) {
+		BoxRec extents;
+
+		extents = box[0];
+		while (--n) {
+			box++;
+			if (box->x1 < extents.x1)
+				extents.x1 = box->x1;
+			if (box->x2 > extents.x2)
+				extents.x2 = box->x2;
+
+			if (box->y1 < extents.y1)
+				extents.y1 = box->y1;
+			if (box->y2 > extents.y2)
+				extents.y2 = box->y2;
+		}
+
+		if ((extents.x2 - extents.x1) * (extents.y2 - extents.y1) * bpp >> 12
+		    < kgem->half_cpu_cache_pages)
+			return false;
+	}
+
+	return !kgem_bo_map_will_stall(kgem, bo);
+}
+
 void sna_write_boxes(struct sna *sna, PixmapPtr dst,
 		     struct kgem_bo *dst_bo, int16_t dst_dx, int16_t dst_dy,
 		     const void *src, int stride, int16_t src_dx, int16_t src_dy,
@@ -318,8 +358,7 @@ void sna_write_boxes(struct sna *sna, PixmapPtr dst,
 
 	DBG(("%s x %d\n", __FUNCTION__, nbox));
 
-	if (DEBUG_NO_IO || kgem->wedged ||
-	    !kgem_bo_map_will_stall(kgem, dst_bo)) {
+	if (upload_inplace(kgem, dst_bo, box, nbox, dst->drawable.bitsPerPixel)) {
 fallback:
 		write_boxes_inplace(kgem,
 				    src, stride, dst->drawable.bitsPerPixel, src_dx, src_dy,
@@ -551,8 +590,7 @@ void sna_write_boxes__xor(struct sna *sna, PixmapPtr dst,
 
 	DBG(("%s x %d\n", __FUNCTION__, nbox));
 
-	if (DEBUG_NO_IO || kgem->wedged ||
-	    !kgem_bo_map_will_stall(kgem, dst_bo)) {
+	if (upload_inplace(kgem, dst_bo, box, nbox, dst->drawable.bitsPerPixel)) {
 fallback:
 		write_boxes_inplace__xor(kgem,
 					 src, stride, dst->drawable.bitsPerPixel, src_dx, src_dy,
commit 4db1bb3fd81b51e74b7f3e90078627d9d96fbefe
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sun Jan 8 14:20:40 2012 +0000

    Removed deprecated xf86PciInfo.h includes
    
    The driver should and does provide its own PCI-IDs.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/intel.h b/src/intel.h
index 5698131..7593731 100644
--- a/src/intel.h
+++ b/src/intel.h
@@ -53,7 +53,6 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #include "xf86_OSproc.h"
 #include "compiler.h"
-#include "xf86PciInfo.h"
 #include "xf86Pci.h"
 #include "xf86Cursor.h"
 #include "xf86xv.h"
diff --git a/src/intel_dri.c b/src/intel_dri.c
index 152313a..df3338f 100644
--- a/src/intel_dri.c
+++ b/src/intel_dri.c
@@ -51,7 +51,6 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "xf86.h"
 #include "xf86_OSproc.h"
 
-#include "xf86PciInfo.h"
 #include "xf86Pci.h"
 #include "xf86drm.h"
 
diff --git a/src/intel_video.c b/src/intel_video.c
index 6d74d51..25f6483 100644
--- a/src/intel_video.c
+++ b/src/intel_video.c
@@ -58,7 +58,6 @@
 #include "xf86.h"
 #include "xf86_OSproc.h"
 #include "compiler.h"
-#include "xf86PciInfo.h"
 #include "xf86Pci.h"
 #include "xf86fbman.h"
 #include "xf86drm.h"
diff --git a/src/legacy/i810/i810.h b/src/legacy/i810/i810.h
index 3a355a3..183c701 100644
--- a/src/legacy/i810/i810.h
+++ b/src/legacy/i810/i810.h
@@ -40,7 +40,6 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #include <stdint.h>
 #include "compiler.h"
-#include "xf86PciInfo.h"
 #include "xf86Pci.h"
 #include "i810_reg.h"
 #include "xaa.h"
diff --git a/src/legacy/i810/i810_dga.c b/src/legacy/i810/i810_dga.c
index d9ddda0..44181c6 100644
--- a/src/legacy/i810/i810_dga.c
+++ b/src/legacy/i810/i810_dga.c
@@ -29,7 +29,6 @@
 #include "xf86.h"
 #include "xf86_OSproc.h"
 #include "xf86Pci.h"
-#include "xf86PciInfo.h"
 #include "xaa.h"
 #include "xaalocal.h"
 #include "i810.h"
diff --git a/src/legacy/i810/i810_dri.c b/src/legacy/i810/i810_dri.c
index 4fc5aab..9129069 100644
--- a/src/legacy/i810/i810_dri.c
+++ b/src/legacy/i810/i810_dri.c
@@ -10,7 +10,6 @@
 #include "xf86.h"
 #include "xf86_OSproc.h"
 
-#include "xf86PciInfo.h"
 #include "xf86Pci.h"
 
 #include "windowstr.h"
diff --git a/src/legacy/i810/i810_hwmc.c b/src/legacy/i810/i810_hwmc.c
index 724e1be..ba50e1e 100644
--- a/src/legacy/i810/i810_hwmc.c
+++ b/src/legacy/i810/i810_hwmc.c
@@ -42,7 +42,6 @@ THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "xf86.h"
 #include "xf86_OSproc.h"
 #include "compiler.h"
-#include "xf86PciInfo.h"
 #include "xf86Pci.h"
 #include "xf86fbman.h"
 #include "regionstr.h"
diff --git a/src/legacy/i810/i810_video.c b/src/legacy/i810/i810_video.c
index 68dc471..a0e6acd 100644
--- a/src/legacy/i810/i810_video.c
+++ b/src/legacy/i810/i810_video.c
@@ -42,7 +42,6 @@ THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "xf86.h"
 #include "xf86_OSproc.h"
 #include "compiler.h"
-#include "xf86PciInfo.h"
 #include "xf86Pci.h"
 #include "xf86fbman.h"
 #include "regionstr.h"
diff --git a/src/sna/sna.h b/src/sna/sna.h
index 6ae4310..4290659 100644
--- a/src/sna/sna.h
+++ b/src/sna/sna.h
@@ -45,7 +45,6 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #include "compiler.h"
 #include <xf86_OSproc.h>
-#include <xf86PciInfo.h>
 #include <xf86Pci.h>
 #include <xf86Cursor.h>
 #include <xf86xv.h>
commit 54232d1a5da51d0f68fe099436bb0a1a2e249954
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sun Jan 8 15:22:58 2012 +0000

    sna: Add ricer stripes to memcpy_xor
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
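
    For reference, a sketch of how the runtime SSE2 check below could
    be expressed with the compiler's own CPU dispatcher where available
    (GCC 4.8+ / recent clang); this is an illustrative alternative, not
    what the patch uses:

	static int have_sse2_builtin(void)
	{
	#if defined(__GNUC__) && \
	    (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8))
		/* The compiler runtime performs cpuid once and caches
		 * the feature bits. */
		return __builtin_cpu_supports("sse2");
	#else
		return 0;	/* fall back to a hand-rolled cpuid path */
	#endif
	}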

diff --git a/src/sna/blt.c b/src/sna/blt.c
index d28ad98..fb3dd35 100644
--- a/src/sna/blt.c
+++ b/src/sna/blt.c
@@ -31,11 +31,118 @@
 
 #include "sna.h"
 
+#if __x86_64__
+#define USE_SSE2 1
+#endif
+
 #if DEBUG_BLT
 #undef DBG
 #define DBG(x) ErrorF x
 #endif
 
+#if USE_SSE2
+#include <xmmintrin.h>
+
+#if __x86_64__
+#define have_sse2() 1
+#else
+enum {
+	MMX = 0x1,
+	MMX_EXTENSIONS = 0x2,
+	SSE = 0x6,
+	SSE2 = 0x8,
+	CMOV = 0x10
+};
+
+#ifdef __GNUC__
+static unsigned int
+detect_cpu_features(void)
+{
+	unsigned int features;
+	unsigned int result = 0;
+
+	char vendor[13];
+	vendor[0] = 0;
+	vendor[12] = 0;
+
+	asm (
+	     "pushf\n"
+	     "pop %%eax\n"
+	     "mov %%eax, %%ecx\n"
+	     "xor $0x00200000, %%eax\n"
+	     "push %%eax\n"
+	     "popf\n"
+	     "pushf\n"
+	     "pop %%eax\n"
+	     "mov $0x0, %%edx\n"
+	     "xor %%ecx, %%eax\n"
+	     "jz 1f\n"
+
+	     "mov $0x00000000, %%eax\n"
+	     "push %%ebx\n"
+	     "cpuid\n"
+	     "mov %%ebx, %%eax\n"
+	     "pop %%ebx\n"
+	     "mov %%eax, %1\n"
+	     "mov %%edx, %2\n"
+	     "mov %%ecx, %3\n"
+	     "mov $0x00000001, %%eax\n"
+	     "push %%ebx\n"
+	     "cpuid\n"
+	     "pop %%ebx\n"
+	     "1:\n"
+	     "mov %%edx, %0\n"
+	     : "=r" (result), "=m" (vendor[0]), "=m" (vendor[4]), "=m" (vendor[8])
+	     :: "%eax", "%ecx", "%edx");
+
+	features = 0;
+	if (result) {
+		/* result now contains the standard feature bits */
+		if (result & (1 << 15))
+			features |= CMOV;
+		if (result & (1 << 23))
+			features |= MMX;
+		if (result & (1 << 25))
+			features |= SSE;
+		if (result & (1 << 26))
+			features |= SSE2;
+	}
+	return features;
+}
+#else
+static unsigned int detect_cpu_features(void) { return 0; }
+#endif
+
+static bool have_sse2(void)
+{
+	static int sse2_present = -1;
+
+	if (sse2_present == -1)
+		sse2_present = detect_cpu_features() & SSE2;
+
+	return sse2_present;
+}
+#endif
+
+static inline __m128i
+xmm_create_mask_32(uint32_t mask)
+{
+	return _mm_set_epi32(mask, mask, mask, mask);
+}
+
+static inline __m128i
+xmm_load_128u(const __m128i *src)
+{
+	return _mm_loadu_si128(src);
+}
+
+static inline void
+xmm_save_128(__m128i *dst, __m128i data)
+{
+	_mm_store_si128(dst, data);
+}
+#endif
+
 void
 memcpy_blt(const void *src, void *dst, int bpp,
 	   int32_t src_stride, int32_t dst_stride,
@@ -136,39 +243,123 @@ memcpy_xor(const void *src, void *dst, int bpp,
 	if (and == 0xffffffff) {
 		switch (bpp) {
 		case 1:
-			do {
-				for (i = 0; i < width; i++)
-					dst_bytes[i] = src_bytes[i] | or;
-
-				src_bytes += src_stride;
-				dst_bytes += dst_stride;
-			} while (--height);
-			break;
-
+			if (width & 1) {
+				do {
+					for (i = 0; i < width; i++)
+						dst_bytes[i] = src_bytes[i] | or;
+
+					src_bytes += src_stride;
+					dst_bytes += dst_stride;
+				} while (--height);
+				break;
+			} else {
+				width /= 2;
+				or |= or << 8;
+			}
 		case 2:
-			do {
-				uint16_t *d = (uint16_t *)dst_bytes;
-				uint16_t *s = (uint16_t *)src_bytes;
-
-				for (i = 0; i < width; i++)
-					d[i] = s[i] | or;
-
-				src_bytes += src_stride;
-				dst_bytes += dst_stride;
-			} while (--height);
-			break;
-
+			if (width & 1) {
+				do {
+					uint16_t *d = (uint16_t *)dst_bytes;
+					uint16_t *s = (uint16_t *)src_bytes;
+
+					for (i = 0; i < width; i++)
+						d[i] = s[i] | or;
+
+					src_bytes += src_stride;
+					dst_bytes += dst_stride;
+				} while (--height);
+				break;
+			} else {
+				width /= 2;
+				or |= or << 16;
+			}
 		case 4:
-			do {
-				uint32_t *d = (uint32_t *)dst_bytes;
-				uint32_t *s = (uint32_t *)src_bytes;
-
-				for (i = 0; i < width; i++)
-					d[i] = s[i] | or;
-
-				src_bytes += src_stride;
-				dst_bytes += dst_stride;
-			} while (--height);
+#if USE_SSE2
+			if (width * 4 == dst_stride && dst_stride == src_stride) {
+				width *= height;
+				height = 1;
+			}
+
+			if (have_sse2()) {
+				do {
+					uint32_t *d = (uint32_t *)dst_bytes;
+					uint32_t *s = (uint32_t *)src_bytes;
+					__m128i mask = xmm_create_mask_32(or);
+
+					i = width;
+					while (i && (uintptr_t)d & 15) {
+						*d++ = *s++ | or;
+						i--;
+					}
+
+					while (i >= 16) {
+						__m128i xmm1, xmm2, xmm3, xmm4;
+
+						xmm1 = xmm_load_128u((__m128i*)s + 0);
+						xmm2 = xmm_load_128u((__m128i*)s + 1);
+						xmm3 = xmm_load_128u((__m128i*)s + 2);
+						xmm4 = xmm_load_128u((__m128i*)s + 3);
+
+						xmm_save_128((__m128i*)d + 0,
+							     _mm_or_si128(xmm1, mask));
+						xmm_save_128((__m128i*)d + 1,
+							     _mm_or_si128(xmm2, mask));
+						xmm_save_128((__m128i*)d + 2,
+							     _mm_or_si128(xmm3, mask));
+						xmm_save_128((__m128i*)d + 3,
+							     _mm_or_si128(xmm4, mask));
+
+						d += 16;
+						s += 16;
+						i -= 16;
+					}
+
+					if (i & 8) {
+						__m128i xmm1, xmm2;
+
+						xmm1 = xmm_load_128u((__m128i*)s + 0);
+						xmm2 = xmm_load_128u((__m128i*)s + 1);
+
+						xmm_save_128((__m128i*)d + 0,
+							     _mm_or_si128(xmm1, mask));
+						xmm_save_128((__m128i*)d + 1,
+							     _mm_or_si128(xmm2, mask));
+						d += 8;
+						s += 8;
+						i -= 8;
+					}
+
+					if (i & 4) {
+						xmm_save_128((__m128i*)d,
+							     _mm_or_si128(xmm_load_128u((__m128i*)s),
+									  mask));
+
+						d += 4;
+						s += 4;
+						i -= 4;
+					}
+
+					while (i) {
+						*d++ = *s++ | or;
+						i--;
+					}
+
+					src_bytes += src_stride;
+					dst_bytes += dst_stride;
+				} while (--height);
+			} else
+#else
+				do {
+					uint32_t *d = (uint32_t *)dst_bytes;
+					uint32_t *s = (uint32_t *)src_bytes;
+
+					for (i = 0; i < width; i++)
+						d[i] = s[i] | or;
+
+					src_bytes += src_stride;
+					dst_bytes += dst_stride;
+				} while (--height);
+#endif
 			break;
 		}
 	} else {
commit c037b4f542a7b21cbaecedec259da3589db10039
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sun Jan 8 08:58:41 2012 +0000

    sna: Tune cache size for cpu bo cache
    
    This helps SNB on cairo-traces that utilize lots of temporary uploads
    (rasterised sources and masks for instance), but comes at a cost of
    regressing others...
    
    In order to counter the regression from increasing the GTT cache size,
    the CPU/GTT vma caches are split and accounted separately.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
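
    The bucketing introduced below indexes the active/inactive caches
    by floor(log2(size / PAGE_SIZE)), so for example a 1-page (4KiB) bo
    lands in bucket 0, a 10-page (40KiB) bo in bucket 3 and a 256-page
    (1MiB) bo in bucket 8.  A portable sketch of the __fls() helper
    (illustrative; the patch uses the bsr instruction directly and,
    like it, assumes size >= PAGE_SIZE):

	static inline unsigned long fls_portable(unsigned long word)
	{
		unsigned long bit = 0;

		while (word >>= 1)	/* index of the highest set bit */
			bit++;

		return bit;
	}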

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 0c1b2b1..643771f 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -94,7 +94,9 @@ static inline void list_replace(struct list *old,
 #endif
 
 #define PAGE_SIZE 4096
-#define MAX_VMA_CACHE 256
+#define MAX_GTT_VMA_CACHE 512
+#define MAX_CPU_VMA_CACHE INT16_MAX
+#define MAP_PRESERVE_TIME 10
 
 #define IS_CPU_MAP(ptr) ((uintptr_t)(ptr) & 1)
 #define CPU_MAP(ptr) ((void*)((uintptr_t)(ptr) & ~1))
@@ -200,8 +202,8 @@ static void *gem_mmap(int fd, uint32_t handle, int size, int prot)
 }
 
 static int __gem_write(int fd, uint32_t handle,
-		     int offset, int length,
-		     const void *src)
+		       int offset, int length,
+		       const void *src)
 {
 	struct drm_i915_gem_pwrite pwrite;
 
@@ -371,7 +373,7 @@ kgem_bo_clear_purgeable(struct kgem *kgem, struct kgem_bo *bo)
 	madv.handle = bo->handle;
 	madv.madv = I915_MADV_WILLNEED;
 	if (drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_MADVISE, &madv) == 0) {
-		bo->purged = 0;
+		bo->purged = !madv.retained;
 		return madv.retained;
 	}
 
@@ -388,14 +390,32 @@ static void gem_close(int fd, uint32_t handle)
 	(void)drmIoctl(fd, DRM_IOCTL_GEM_CLOSE, &close);
 }
 
+static inline unsigned long __fls(unsigned long word)
+{
+	asm("bsr %1,%0"
+	    : "=r" (word)
+	    : "rm" (word));
+	return word;
+}
+
+constant inline static int cache_bucket(int size)
+{
+	uint32_t order = __fls(size / PAGE_SIZE);
+	assert(order < NUM_CACHE_BUCKETS);
+	return order;
+}
+
 static struct kgem_bo *__kgem_bo_init(struct kgem_bo *bo,
 				      int handle, int size)
 {
+	assert(size);
 	memset(bo, 0, sizeof(*bo));
 
 	bo->refcnt = 1;
 	bo->handle = handle;
 	bo->size = size;
+	bo->bucket = cache_bucket(size);
+	assert(bo->size < 1 << (12 + bo->bucket + 1));
 	bo->reusable = true;
 	bo->domain = DOMAIN_CPU;
 	list_init(&bo->request);
@@ -436,30 +456,14 @@ static struct kgem_request *__kgem_request_alloc(void)
 	return rq;
 }
 
-static inline unsigned long __fls(unsigned long word)
-{
-	asm("bsr %1,%0"
-	    : "=r" (word)
-	    : "rm" (word));
-	return word;
-}
-
-static struct list *inactive(struct kgem *kgem,
-			     int size)
+static struct list *inactive(struct kgem *kgem, int size)
 {
-	uint32_t order = __fls(size / PAGE_SIZE);
-	if (order >= ARRAY_SIZE(kgem->inactive))
-		order = ARRAY_SIZE(kgem->inactive)-1;
-	return &kgem->inactive[order];
+	return &kgem->inactive[cache_bucket(size)];
 }
 
-static struct list *active(struct kgem *kgem,
-			     int size)
+static struct list *active(struct kgem *kgem, int size)
 {
-	uint32_t order = __fls(size / PAGE_SIZE);
-	if (order >= ARRAY_SIZE(kgem->active))
-		order = ARRAY_SIZE(kgem->active)-1;
-	return &kgem->active[order];
+	return &kgem->active[cache_bucket(size)];
 }
 
 static size_t
@@ -508,7 +512,7 @@ static int gem_param(struct kgem *kgem, int name)
 void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, int gen)
 {
 	struct drm_i915_gem_get_aperture aperture;
-	unsigned int i;
+	unsigned int i, j;
 
 	memset(kgem, 0, sizeof(*kgem));
 
@@ -527,12 +531,16 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, int gen)
 	list_init(&kgem->partial);
 	list_init(&kgem->requests);
 	list_init(&kgem->flushing);
-	list_init(&kgem->vma_cache);
-	list_init(&kgem->vma_inactive);
 	for (i = 0; i < ARRAY_SIZE(kgem->inactive); i++)
 		list_init(&kgem->inactive[i]);
 	for (i = 0; i < ARRAY_SIZE(kgem->active); i++)
 		list_init(&kgem->active[i]);
+	for (i = 0; i < ARRAY_SIZE(kgem->vma); i++) {
+		for (j = 0; j < ARRAY_SIZE(kgem->vma[i].inactive); j++)
+			list_init(&kgem->vma[i].inactive[j]);
+	}
+	kgem->vma[MAP_GTT].count = -MAX_GTT_VMA_CACHE;
+	kgem->vma[MAP_CPU].count = -MAX_CPU_VMA_CACHE;
 
 	kgem->next_request = __kgem_request_alloc();
 
@@ -572,6 +580,8 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, int gen)
 	kgem->max_object_size = kgem->aperture_mappable / 2;
 	if (kgem->max_object_size > kgem->aperture_low)
 		kgem->max_object_size = kgem->aperture_low;
+	if (kgem->max_object_size > MAX_OBJECT_SIZE)
+		kgem->max_object_size = MAX_OBJECT_SIZE;
 	DBG(("%s: max object size %d\n", __FUNCTION__, kgem->max_object_size));
 
 	kgem->fence_max = gem_param(kgem, I915_PARAM_NUM_FENCES_AVAIL) - 2;
@@ -765,6 +775,21 @@ static void kgem_bo_binding_free(struct kgem *kgem, struct kgem_bo *bo)
 	}
 }
 
+static void kgem_bo_release_map(struct kgem *kgem, struct kgem_bo *bo)
+{
+	int type = IS_CPU_MAP(bo->map);
+
+	DBG(("%s: releasing %s vma for handle=%d, count=%d\n",
+	     __FUNCTION__, type ? "CPU" : "GTT",
+	     bo->handle, kgem->vma[type].count));
+
+	munmap(CPU_MAP(bo->map), bo->size);
+	bo->map = NULL;
+
+	list_del(&bo->vma);
+	kgem->vma[type].count--;
+}
+
 static void kgem_bo_free(struct kgem *kgem, struct kgem_bo *bo)
 {
 	DBG(("%s: handle=%d\n", __FUNCTION__, bo->handle));
@@ -773,14 +798,8 @@ static void kgem_bo_free(struct kgem *kgem, struct kgem_bo *bo)
 
 	kgem_bo_binding_free(kgem, bo);
 
-	if (bo->map) {
-		DBG(("%s: releasing %s vma for handle=%d, count=%d\n",
-		       __FUNCTION__, IS_CPU_MAP(bo->map) ? "CPU" : "GTT",
-		       bo->handle, kgem->vma_count-1));
-		munmap(CPU_MAP(bo->map), bo->size);
-		list_del(&bo->vma);
-		kgem->vma_count--;
-	}
+	if (bo->map)
+		kgem_bo_release_map(kgem, bo);
 	assert(list_is_empty(&bo->vma));
 
 	_list_del(&bo->list);
@@ -799,6 +818,44 @@ static bool is_mmaped_buffer(struct kgem_partial_bo *bo)
 	return bo->mem != bo+1;
 }
 
+inline static void kgem_bo_move_to_inactive(struct kgem *kgem,
+					    struct kgem_bo *bo)
+{
+	assert(!kgem_busy(kgem, bo->handle));
+	assert(!bo->proxy);
+	assert(!bo->io);
+
+	list_move(&bo->list, &kgem->inactive[bo->bucket]);
+	if (bo->map) {
+		int type = IS_CPU_MAP(bo->map);
+		list_move_tail(&bo->vma, &kgem->vma[type].inactive[bo->bucket]);
+		kgem->vma[type].count++;
+	}
+
+	kgem->need_expire = true;
+}
+
+inline static void kgem_bo_remove_from_inactive(struct kgem *kgem,
+						struct kgem_bo *bo)
+{
+	list_del(&bo->list);
+	assert(bo->rq == NULL);
+	if (bo->map) {
+		assert(!list_is_empty(&bo->vma));
+		list_del(&bo->vma);
+		kgem->vma[IS_CPU_MAP(bo->map)].count--;
+	}
+}
+
+inline static void kgem_bo_remove_from_active(struct kgem *kgem,
+					      struct kgem_bo *bo)
+{
+	list_del(&bo->list);
+	if (bo->rq == &_kgem_static_request)
+		list_del(&bo->request);
+	assert(list_is_empty(&bo->vma));
+}
+
 static void __kgem_bo_destroy(struct kgem *kgem, struct kgem_bo *bo)
 {
 	DBG(("%s: handle=%d\n", __FUNCTION__, bo->handle));
@@ -812,12 +869,8 @@ static void __kgem_bo_destroy(struct kgem *kgem, struct kgem_bo *bo)
 		goto destroy;
 
 	if (bo->io) {
-		struct kgem_partial_bo *io = (struct kgem_partial_bo *)bo;
 		struct kgem_bo *base;
 
-		if (is_mmaped_buffer(io))
-			kgem_bo_unmap__cpu(kgem, bo, io->mem);
-
 		base = malloc(sizeof(*base));
 		if (base) {
 			DBG(("%s: transferring io handle=%d to bo\n",
@@ -843,14 +896,15 @@ static void __kgem_bo_destroy(struct kgem *kgem, struct kgem_bo *bo)
 	assert(bo->vmap == false && bo->sync == false);
 	bo->scanout = bo->flush = false;
 
+	assert(list_is_empty(&bo->vma));
 	if (bo->rq) {
 		DBG(("%s: handle=%d -> active\n", __FUNCTION__, bo->handle));
-		list_move(&bo->list, active(kgem, bo->size));
+		list_move(&bo->list, &kgem->active[bo->bucket]);
 	} else if (bo->needs_flush) {
 		DBG(("%s: handle=%d -> flushing\n", __FUNCTION__, bo->handle));
 		assert(list_is_empty(&bo->request));
 		list_add(&bo->request, &kgem->flushing);
-		list_move(&bo->list, active(kgem, bo->size));
+		list_move(&bo->list, &kgem->active[bo->bucket]);
 		bo->rq = &_kgem_static_request;
 	} else {
 		assert(bo->exec == NULL);
@@ -864,10 +918,7 @@ static void __kgem_bo_destroy(struct kgem *kgem, struct kgem_bo *bo)
 		}
 
 		DBG(("%s: handle=%d -> inactive\n", __FUNCTION__, bo->handle));
-		assert(!kgem_busy(kgem, bo->handle));
-		list_move(&bo->list, inactive(kgem, bo->size));
-		if (bo->map)
-			list_move(&bo->vma, &kgem->vma_inactive);
+		kgem_bo_move_to_inactive(kgem, bo);
 		kgem->need_expire = true;
 	}
 
@@ -905,7 +956,7 @@ bool kgem_retire(struct kgem *kgem)
 			bo->needs_flush = false;
 			bo->domain = DOMAIN_NONE;
 			bo->rq = NULL;
-			list_move(&bo->list, inactive(kgem, bo->size));
+			kgem_bo_move_to_inactive(kgem, bo);
 			list_del(&bo->request);
 		} else
 			kgem_bo_free(kgem, bo);
@@ -948,8 +999,7 @@ bool kgem_retire(struct kgem *kgem)
 					} else if(kgem_bo_set_purgeable(kgem, bo)) {
 						DBG(("%s: moving %d to inactive\n",
 						     __FUNCTION__, bo->handle));
-						list_move(&bo->list,
-							  inactive(kgem, bo->size));
+						kgem_bo_move_to_inactive(kgem, bo);
 						retired = true;
 					} else {
 						DBG(("%s: closing %d\n",
@@ -969,7 +1019,7 @@ bool kgem_retire(struct kgem *kgem)
 		if (kgem_bo_set_purgeable(kgem, rq->bo)) {
 			assert(rq->bo->rq == NULL);
 			assert(list_is_empty(&rq->bo->request));
-			list_move(&rq->bo->list, inactive(kgem, rq->bo->size));
+			kgem_bo_move_to_inactive(kgem, rq->bo);
 			retired = true;
 		} else {
 			kgem->need_purge = 1;
@@ -1483,6 +1533,9 @@ bool kgem_expire_cache(struct kgem *kgem)
 
 	idle = true;
 	for (i = 0; i < ARRAY_SIZE(kgem->inactive); i++) {
+		struct list preserve;
+
+		list_init(&preserve);
 		while (!list_is_empty(&kgem->inactive[i])) {
 			bo = list_last_entry(&kgem->inactive[i],
 					     struct kgem_bo, list);
@@ -1492,10 +1545,20 @@ bool kgem_expire_cache(struct kgem *kgem)
 				break;
 			}
 
-			count++;
-			size += bo->size;
-
-			kgem_bo_free(kgem, bo);
+			if (bo->map && bo->delta + MAP_PRESERVE_TIME > expire) {
+				idle = false;
+				list_move_tail(&bo->list, &preserve);
+			} else {
+				count++;
+				size += bo->size;
+				kgem_bo_free(kgem, bo);
+			}
+		}
+		if (!list_is_empty(&preserve)) {
+			preserve.prev->next = kgem->inactive[i].next;
+			kgem->inactive[i].next->prev = preserve.prev;
+			kgem->inactive[i].next = preserve.next;
+			preserve.next->prev = &kgem->inactive[i];
 		}
 	}
 
@@ -1552,15 +1615,24 @@ search_linear_cache(struct kgem *kgem, unsigned int size, unsigned flags)
 	bool use_active = (flags & CREATE_INACTIVE) == 0;
 	struct list *cache;
 
-	if (flags & (CREATE_CPU_MAP | CREATE_GTT_MAP)) {
-		int for_cpu = !!(flags & CREATE_CPU_MAP);
-		assert(for_cpu || use_active == false);
-		list_for_each_entry(bo, &kgem->vma_inactive, vma) {
-			if (IS_CPU_MAP(bo->map) != for_cpu)
-				continue;
+	if (!use_active &&
+	    list_is_empty(inactive(kgem, size)) &&
+	    !list_is_empty(active(kgem, size)) &&
+	    !kgem_retire(kgem))
+		return NULL;
 
-			if (size > bo->size || 2*size < bo->size)
+	if (!use_active && flags & (CREATE_CPU_MAP | CREATE_GTT_MAP)) {
+		int for_cpu = !!(flags & CREATE_CPU_MAP);
+		cache = &kgem->vma[for_cpu].inactive[cache_bucket(size)];
+		list_for_each_entry(bo, cache, vma) {
+			assert(IS_CPU_MAP(bo->map) == for_cpu);
+			assert(bo->bucket == cache_bucket(size));
+
+			if (size > bo->size) {
+				DBG(("inactive too small: %d < %d\n",
+				     bo->size, size));
 				continue;
+			}
 
 			if (bo->purged && !kgem_bo_clear_purgeable(kgem, bo)) {
 				kgem->need_purge |= bo->domain == DOMAIN_GPU;
@@ -1573,10 +1645,7 @@ search_linear_cache(struct kgem *kgem, unsigned int size, unsigned flags)
 					   I915_TILING_NONE, 0) != I915_TILING_NONE)
 				continue;
 
-			list_del(&bo->list);
-			if (bo->rq == &_kgem_static_request)
-				list_del(&bo->request);
-			list_move_tail(&bo->vma, &kgem->vma_cache);
+			kgem_bo_remove_from_inactive(kgem, bo);
 
 			bo->tiling = I915_TILING_NONE;
 			bo->pitch = 0;
@@ -1590,7 +1659,7 @@ search_linear_cache(struct kgem *kgem, unsigned int size, unsigned flags)
 		}
 	}
 
-	cache = use_active ? active(kgem, size): inactive(kgem, size);
+	cache = use_active ? active(kgem, size) : inactive(kgem, size);
 	list_for_each_entry_safe(bo, next, cache, list) {
 		assert(bo->refcnt == 0);
 		assert(bo->reusable);
@@ -1644,13 +1713,10 @@ search_linear_cache(struct kgem *kgem, unsigned int size, unsigned flags)
 				continue;
 		}
 
-		list_del(&bo->list);
-		if (bo->rq == &_kgem_static_request)
-			list_del(&bo->request);
-		if (bo->map) {
-			assert(!list_is_empty(&bo->vma));
-			list_move_tail(&bo->vma, &kgem->vma_cache);
-		}
+		if (use_active)
+			kgem_bo_remove_from_active(kgem, bo);
+		else
+			kgem_bo_remove_from_inactive(kgem, bo);
 
 		bo->tiling = I915_TILING_NONE;
 		bo->pitch = 0;
@@ -1673,22 +1739,14 @@ search_linear_cache(struct kgem *kgem, unsigned int size, unsigned flags)
 					   I915_TILING_NONE, 0) != I915_TILING_NONE)
 				return NULL;
 
-			if (first->map) {
-				munmap(CPU_MAP(first->map), first->size);
-				first->map = NULL;
-
-				list_del(&first->vma);
-				kgem->vma_count--;
-			}
+			if (first->map)
+				kgem_bo_release_map(kgem, first);
 		}
 
-		list_del(&first->list);
-		if (first->rq == &_kgem_static_request)
-			list_del(&first->request);
-		if (first->map) {
-			assert(!list_is_empty(&first->vma));
-			list_move_tail(&first->vma, &kgem->vma_cache);
-		}
+		if (use_active)
+			kgem_bo_remove_from_active(kgem, first);
+		else
+			kgem_bo_remove_from_inactive(kgem, first);
 
 		first->tiling = I915_TILING_NONE;
 		first->pitch = 0;
@@ -1718,7 +1776,7 @@ struct kgem_bo *kgem_create_for_name(struct kgem *kgem, uint32_t name)
 		return NULL;
 
 	DBG(("%s: new handle=%d\n", __FUNCTION__, open_arg.handle));
-	bo = __kgem_bo_alloc(open_arg.handle, 0);
+	bo = __kgem_bo_alloc(open_arg.handle, open_arg.size);
 	if (bo == NULL) {
 		gem_close(kgem->fd, open_arg.handle);
 		return NULL;
@@ -1740,14 +1798,6 @@ struct kgem_bo *kgem_create_linear(struct kgem *kgem, int size)
 	if (bo)
 		return kgem_bo_reference(bo);
 
-	if (!list_is_empty(&kgem->requests)) {
-		if (kgem_retire(kgem)) {
-			bo = search_linear_cache(kgem, size, CREATE_INACTIVE);
-			if (bo)
-				return kgem_bo_reference(bo);
-		}
-	}
-
 	handle = gem_create(kgem->fd, size);
 	if (handle == 0)
 		return NULL;
@@ -1877,11 +1927,11 @@ static bool _kgem_can_create_2d(struct kgem *kgem,
 
 	size = kgem_surface_size(kgem, false, false,
 				 width, height, bpp, tiling, &pitch);
-	if (size == 0 || size > kgem->max_object_size)
+	if (size == 0 || size >= kgem->max_object_size)
 		size = kgem_surface_size(kgem, false, false,
 					 width, height, bpp,
 					 I915_TILING_NONE, &pitch);
-	return size > 0 && size <= kgem->max_object_size;
+	return size > 0 && size < kgem->max_object_size;
 }
 
 #if DEBUG_KGEM
@@ -1954,18 +2004,18 @@ struct kgem_bo *kgem_create_2d(struct kgem *kgem,
 		/* We presume that we will need to upload to this bo,
 		 * and so would prefer to have an active VMA.
 		 */
+		cache = &kgem->vma[for_cpu].inactive[cache_bucket(size)];
 		do {
-			list_for_each_entry(bo, &kgem->vma_inactive, vma) {
+			list_for_each_entry(bo, cache, vma) {
+				assert(bo->bucket == cache_bucket(size));
 				assert(bo->refcnt == 0);
 				assert(bo->map);
+				assert(IS_CPU_MAP(bo->map) == for_cpu);
 				assert(bo->rq == NULL);
 				assert(list_is_empty(&bo->request));
 
-				if (IS_CPU_MAP(bo->map) != for_cpu)
-					continue;
-
-				if (size > bo->size || 2*size < bo->size) {
-					DBG(("inactive vma too small/large: %d < %d\n",
+				if (size > bo->size) {
+					DBG(("inactive too small: %d < %d\n",
 					     bo->size, size));
 					continue;
 				}
@@ -1977,25 +2027,24 @@ struct kgem_bo *kgem_create_2d(struct kgem *kgem,
 					continue;
 				}
 
-				bo->pitch = pitch;
-				list_del(&bo->list);
-
 				if (bo->purged && !kgem_bo_clear_purgeable(kgem, bo)) {
 					kgem_bo_free(kgem, bo);
 					break;
 				}
 
+				bo->pitch = pitch;
 				bo->delta = 0;
 				bo->unique_id = kgem_get_unique_id(kgem);
-				list_move_tail(&bo->vma, &kgem->vma_cache);
-				assert(bo->pitch);
+
+				kgem_bo_remove_from_inactive(kgem, bo);
+
 				DBG(("  from inactive vma: pitch=%d, tiling=%d: handle=%d, id=%d\n",
 				     bo->pitch, bo->tiling, bo->handle, bo->unique_id));
 				assert(bo->reusable);
 				assert(bo->domain != DOMAIN_GPU && !kgem_busy(kgem, bo->handle));
 				return kgem_bo_reference(bo);
 			}
-		} while (!list_is_empty(&kgem->vma_cache) && kgem_retire(kgem));
+		} while (!list_is_empty(cache) && kgem_retire(kgem));
 	}
 
 	if (flags & CREATE_INACTIVE)
@@ -2013,6 +2062,8 @@ search_active: /* Best active match first */
 	list_for_each_entry(bo, cache, list) {
 		uint32_t s;
 
+		assert(bo->bucket == cache_bucket(size));
+
 		if (bo->tiling) {
 			if (bo->pitch < pitch) {
 				DBG(("tiled and pitch too small: tiling=%d, (want %d), pitch=%d, need %d\n",
@@ -2031,10 +2082,6 @@ search_active: /* Best active match first */
 				continue;
 			}
 
-			list_del(&bo->list);
-			if (bo->rq == &_kgem_static_request)
-				list_del(&bo->request);
-
 			if (bo->purged && !kgem_bo_clear_purgeable(kgem, bo)) {
 				kgem->need_purge |= bo->domain == DOMAIN_GPU;
 				kgem_bo_free(kgem, bo);
@@ -2042,6 +2089,8 @@ search_active: /* Best active match first */
 				goto search_active;
 			}
 
+			kgem_bo_remove_from_active(kgem, bo);
+
 			bo->unique_id = kgem_get_unique_id(kgem);
 			bo->delta = 0;
 			DBG(("  1:from active: pitch=%d, tiling=%d, handle=%d, id=%d\n",
@@ -2061,6 +2110,8 @@ search_active: /* Best active match first */
 			kgem->need_purge |= next->domain == DOMAIN_GPU;
 			kgem_bo_free(kgem, next);
 		} else {
+			kgem_bo_remove_from_active(kgem, next);
+
 			next->unique_id = kgem_get_unique_id(kgem);
 			next->delta = 0;
 			DBG(("  2:from active: pitch=%d, tiling=%d, handle=%d, id=%d\n",
@@ -2075,6 +2126,8 @@ skip_active_search:
 	/* Now just look for a close match and prefer any currently active */
 	cache = inactive(kgem, size);
 	list_for_each_entry_safe(bo, next, cache, list) {
+		assert(bo->bucket == cache_bucket(size));
+
 		if (size > bo->size) {
 			DBG(("inactive too small: %d < %d\n",
 			     bo->size, size));
@@ -2090,29 +2143,20 @@ skip_active_search:
 				continue;
 			}
 
-			if (bo->map) {
-				munmap(CPU_MAP(bo->map), bo->size);
-				bo->map = NULL;
-
-				list_del(&bo->vma);
-				kgem->vma_count--;
-			}
+			if (bo->map)
+				kgem_bo_release_map(kgem, bo);
 		}
 
-		bo->pitch = pitch;
-		bo->tiling = tiling;
-
-		list_del(&bo->list);
-		assert(list_is_empty(&bo->request));
-
 		if (bo->purged && !kgem_bo_clear_purgeable(kgem, bo)) {
 			kgem->need_purge |= bo->domain == DOMAIN_GPU;
 			kgem_bo_free(kgem, bo);
 			continue;
 		}
 
-		if (bo->map)
-			list_move_tail(&bo->vma, &kgem->vma_cache);
+		kgem_bo_remove_from_inactive(kgem, bo);
+
+		bo->pitch = pitch;
+		bo->tiling = tiling;
 
 		bo->delta = 0;
 		bo->unique_id = kgem_get_unique_id(kgem);
@@ -2372,9 +2416,14 @@ uint32_t kgem_add_reloc(struct kgem *kgem,
 	return delta;
 }
 
-static void kgem_trim_vma_cache(struct kgem *kgem)
+static void kgem_trim_vma_cache(struct kgem *kgem, int type, int bucket)
 {
-	if (kgem->vma_count > MAX_VMA_CACHE && kgem->need_purge)
+	int i, j;
+
+	if (kgem->vma[type].count <= 0)
+	       return;
+
+	if (kgem->need_purge)
 		kgem_purge_cache(kgem);
 
 	/* vma are limited on a per-process basis to around 64k.
@@ -2384,33 +2433,36 @@ static void kgem_trim_vma_cache(struct kgem *kgem)
 	 * start failing mappings, we keep our own number of open
 	 * vma to within a conservative value.
 	 */
-	while (kgem->vma_count > MAX_VMA_CACHE) {
-		struct kgem_bo *old;
-
-		if (list_is_empty(&kgem->vma_inactive)) {
-			old = list_first_entry(&kgem->vma_cache,
-					       struct kgem_bo,
-					       vma);
-		} else {
-			old = list_last_entry(&kgem->vma_inactive,
-					      struct kgem_bo,
-					      vma);
+	i = 0;
+	while (kgem->vma[type].count > 0) {
+		struct kgem_bo *bo = NULL;
+
+		for (j = 0;
+		     bo == NULL && j < ARRAY_SIZE(kgem->vma[type].inactive);
+		     j++) {
+			struct list *head = &kgem->vma[type].inactive[i++%ARRAY_SIZE(kgem->vma[type].inactive)];
+			if (!list_is_empty(head))
+				bo = list_first_entry(head,
+						      struct kgem_bo,
+						      vma);
 		}
-		DBG(("%s: discarding %s %s vma cache for %d\n",
+		if (bo == NULL)
+			break;
+
+		DBG(("%s: discarding inactive %s vma cache for %d\n",
 		     __FUNCTION__,
-		     list_is_empty(&kgem->vma_inactive) ? "cached" : "inactive",
-		     IS_CPU_MAP(old->map) ? "CPU" : "GTT", old->handle));
-		assert(old->map);
-		munmap(CPU_MAP(old->map), old->size);
-		old->map = NULL;
-		list_del(&old->vma);
-		kgem->vma_count--;
-
-		if (old->rq == NULL && old->refcnt == 0) {
-			DBG(("%s: discarding unused vma bo handle=%d\n",
-			     __FUNCTION__, old->handle));
-			kgem_bo_free(kgem, old);
-}
+		     IS_CPU_MAP(bo->map) ? "CPU" : "GTT", bo->handle));
+		assert(IS_CPU_MAP(bo->map) == type);
+		assert(bo->map);
+		assert(bo->rq == NULL);
+
+		munmap(CPU_MAP(bo->map), bo->size);
+		bo->map = NULL;
+		list_del(&bo->vma);
+		kgem->vma[type].count--;
+
+		if (!bo->purged && !kgem_bo_set_purgeable(kgem, bo))
+			kgem_bo_free(kgem, bo);
 	}
 }
 
@@ -2421,18 +2473,12 @@ void *kgem_bo_map(struct kgem *kgem, struct kgem_bo *bo, int prot)
 	assert(bo->refcnt || bo->exec); /* allow for debugging purposes */
 	assert(!bo->purged);
 
-	if (IS_CPU_MAP(bo->map)) {
-		DBG(("%s: discarding CPU vma cache for %d\n",
-		       __FUNCTION__, bo->handle));
-		munmap(CPU_MAP(bo->map), bo->size);
-		bo->map = NULL;
-		list_del(&bo->vma);
-		kgem->vma_count--;
-	}
+	if (IS_CPU_MAP(bo->map))
+		kgem_bo_release_map(kgem, bo);
 
 	ptr = bo->map;
 	if (ptr == NULL) {
-		kgem_trim_vma_cache(kgem);
+		kgem_trim_vma_cache(kgem, MAP_GTT, bo->bucket);
 
 		ptr = gem_mmap(kgem->fd, bo->handle, bo->size,
 			       PROT_READ | PROT_WRITE);
@@ -2445,10 +2491,7 @@ void *kgem_bo_map(struct kgem *kgem, struct kgem_bo *bo, int prot)
 		 * flush CPU damage to their GPU bo.
 		 */
 		bo->map = ptr;
-		kgem->vma_count++;
-
-		DBG(("%s: caching vma for %d, count=%d\n",
-		     __FUNCTION__, bo->handle, kgem->vma_count));
+		DBG(("%s: caching GTT vma for %d\n", __FUNCTION__, bo->handle));
 	}
 
 	if (bo->domain != DOMAIN_GTT) {
@@ -2473,8 +2516,6 @@ void *kgem_bo_map(struct kgem *kgem, struct kgem_bo *bo, int prot)
 		bo->domain = DOMAIN_GTT;
 	}
 
-	list_move_tail(&bo->vma, &kgem->vma_cache);
-
 	return ptr;
 }
 
@@ -2486,25 +2527,13 @@ void *kgem_bo_map__cpu(struct kgem *kgem, struct kgem_bo *bo)
 	assert(bo->refcnt);
 	assert(!bo->purged);
 
-	if (IS_CPU_MAP(bo->map)) {
-		void *ptr = CPU_MAP(bo->map);
-		list_del(&bo->vma);
-		kgem->vma_count--;
-		bo->map = NULL;
-		VG(VALGRIND_MALLOCLIKE_BLOCK(ptr, bo->size, 0, 1));
-		return ptr;
-	}
+	if (IS_CPU_MAP(bo->map))
+		return CPU_MAP(bo->map);
 
-	if (bo->map) {
-		DBG(("%s: discarding GTT vma cache for %d\n",
-		       __FUNCTION__, bo->handle));
-		munmap(CPU_MAP(bo->map), bo->size);
-		bo->map = NULL;
-		list_del(&bo->vma);
-		kgem->vma_count--;
-	}
+	if (bo->map)
+		kgem_bo_release_map(kgem, bo);
 
-	kgem_trim_vma_cache(kgem);
+	kgem_trim_vma_cache(kgem, MAP_CPU, bo->bucket);
 
 	VG_CLEAR(mmap_arg);
 	mmap_arg.handle = bo->handle;
@@ -2515,38 +2544,11 @@ void *kgem_bo_map__cpu(struct kgem *kgem, struct kgem_bo *bo)
 		return NULL;
 	}
 
-	VG(VALGRIND_MALLOCLIKE_BLOCK(mmap_arg.addr_ptr, bo->size, 0, 1));
+	DBG(("%s: caching CPU vma for %d\n", __FUNCTION__, bo->handle));
+	bo->map = MAKE_CPU_MAP(mmap_arg.addr_ptr);
 	return (void *)(uintptr_t)mmap_arg.addr_ptr;
 }
 
-void kgem_bo_unmap__cpu(struct kgem *kgem, struct kgem_bo *bo, void *ptr)
-{
-	assert(bo->map == NULL);
-	assert(ptr != NULL);
-
-	bo->map = MAKE_CPU_MAP(ptr);
-	list_move(&bo->vma, &kgem->vma_cache);
-	kgem->vma_count++;
-
-	VG(VALGRIND_FREELIKE_BLOCK(ptr, 0));
-}
-
-void kgem_bo_unmap(struct kgem *kgem, struct kgem_bo *bo)
-{
-	if (bo->map == NULL)
-		return;
-
-	DBG(("%s: (debug) releasing vma for handle=%d, count=%d\n",
-	     __FUNCTION__, bo->handle, kgem->vma_count-1));
-	assert(!IS_CPU_MAP(bo->map));
-
-	munmap(CPU_MAP(bo->map), bo->size);
-	bo->map = NULL;
-
-	list_del(&bo->vma);
-	kgem->vma_count--;
-}
-
 uint32_t kgem_bo_flink(struct kgem *kgem, struct kgem_bo *bo)
 {
 	struct drm_gem_flink flink;
@@ -2876,6 +2878,7 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 
 		bo->mem = kgem_bo_map__cpu(kgem, &bo->base);
 		if (bo->mem == NULL) {
+			bo->base.refcnt = 0; /* for valgrind */
 			kgem_bo_free(kgem, &bo->base);
 			return NULL;
 		}
diff --git a/src/sna/kgem.h b/src/sna/kgem.h
index 57ac647..1bc0d9b 100644
--- a/src/sna/kgem.h
+++ b/src/sna/kgem.h
@@ -63,8 +63,10 @@ struct kgem_bo {
 	uint32_t refcnt;
 	uint32_t handle;
 	uint32_t presumed_offset;
-	uint32_t size;
 	uint32_t delta;
+	uint32_t size:28;
+	uint32_t bucket:4;
+#define MAX_OBJECT_SIZE (1 << 28)
 
 	uint32_t pitch : 18; /* max 128k */
 	uint32_t tiling : 2;
@@ -90,6 +92,14 @@ struct kgem_request {
 	struct list buffers;
 };
 
+enum {
+	MAP_GTT = 0,
+	MAP_CPU,
+	NUM_MAP_TYPES,
+};
+
+#define NUM_CACHE_BUCKETS 16
+
 struct kgem {
 	int fd;
 	int wedged;
@@ -105,20 +115,22 @@ struct kgem {
 		KGEM_BLT,
 	} mode, ring;
 
-	struct list flushing, active[16], inactive[16];
+	struct list flushing, active[NUM_CACHE_BUCKETS], inactive[NUM_CACHE_BUCKETS];
 	struct list partial;
 	struct list requests;
-	struct list vma_cache;
-	struct list vma_inactive;
 	struct kgem_request *next_request;
 
+	struct {
+		struct list inactive[NUM_CACHE_BUCKETS];
+		int16_t count;
+	} vma[NUM_MAP_TYPES];
+
 	uint16_t nbatch;
 	uint16_t surface;
 	uint16_t nexec;
 	uint16_t nreloc;
 	uint16_t nfence;
 	uint16_t max_batch_size;
-	uint16_t vma_count;
 
 	uint32_t flush:1;
 	uint32_t sync:1;
@@ -332,10 +344,8 @@ uint32_t kgem_add_reloc(struct kgem *kgem,
 			uint32_t delta);
 
 void *kgem_bo_map(struct kgem *kgem, struct kgem_bo *bo, int prot);
-void kgem_bo_unmap(struct kgem *kgem, struct kgem_bo *bo);
 void *kgem_bo_map__cpu(struct kgem *kgem, struct kgem_bo *bo);
 void kgem_bo_sync__cpu(struct kgem *kgem, struct kgem_bo *bo);
-void kgem_bo_unmap__cpu(struct kgem *kgem, struct kgem_bo *bo, void *ptr);
 uint32_t kgem_bo_flink(struct kgem *kgem, struct kgem_bo *bo);
 
 Bool kgem_bo_write(struct kgem *kgem, struct kgem_bo *bo,
diff --git a/src/sna/kgem_debug_gen3.c b/src/sna/kgem_debug_gen3.c
index 0238b73..213c69f 100644
--- a/src/sna/kgem_debug_gen3.c
+++ b/src/sna/kgem_debug_gen3.c
@@ -101,9 +101,6 @@ static void gen3_update_vertex_buffer_addr(struct kgem *kgem,
 	}
 	ptr = (char *)base + kgem->reloc[i].delta;
 
-	if (state.vb.current)
-		kgem_bo_unmap(kgem, state.vb.current);
-
 	state.vb.current = bo;
 	state.vb.base = base;
 	state.vb.ptr = ptr;
@@ -1612,8 +1609,5 @@ int kgem_gen3_decode_3d(struct kgem *kgem, uint32_t offset)
 
 void kgem_gen3_finish_state(struct kgem *kgem)
 {
-	if (state.vb.current)
-		kgem_bo_unmap(kgem, state.vb.current);
-
 	memset(&state, 0, sizeof(state));
 }
diff --git a/src/sna/kgem_debug_gen4.c b/src/sna/kgem_debug_gen4.c
index 0f91d29..0004ecf 100644
--- a/src/sna/kgem_debug_gen4.c
+++ b/src/sna/kgem_debug_gen4.c
@@ -89,8 +89,6 @@ static void gen4_update_vertex_buffer(struct kgem *kgem, const uint32_t *data)
 	ptr = (char *)base + kgem->reloc[i].delta;
 
 	i = data[0] >> 27;
-	if (state.vb[i].current)
-		kgem_bo_unmap(kgem, state.vb[i].current);
 
 	state.vb[i].current = bo;
 	state.vb[i].base = base;
@@ -415,13 +413,6 @@ get_reloc(struct kgem *kgem,
 
 	return (char *)base + delta;
 }
-
-static void
-put_reloc(struct kgem *kgem, struct reloc *r)
-{
-	if (r->bo != NULL)
-		kgem_bo_unmap(kgem, r->bo);
-}
 #endif
 
 int kgem_gen4_decode_3d(struct kgem *kgem, uint32_t offset)
@@ -691,21 +682,7 @@ int kgem_gen4_decode_3d(struct kgem *kgem, uint32_t offset)
 	return len;
 }
 
-static void finish_vertex_buffers(struct kgem *kgem)
-{
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(state.vb); i++)
-		if (state.vb[i].current)
-			kgem_bo_unmap(kgem, state.vb[i].current);
-}
-
 void kgem_gen4_finish_state(struct kgem *kgem)
 {
-	finish_vertex_buffers(kgem);
-
-	if (state.dynamic_state.current)
-		kgem_bo_unmap(kgem, state.dynamic_state.base);
-
 	memset(&state, 0, sizeof(state));
 }
diff --git a/src/sna/kgem_debug_gen5.c b/src/sna/kgem_debug_gen5.c
index c4f5df1..7912cc9 100644
--- a/src/sna/kgem_debug_gen5.c
+++ b/src/sna/kgem_debug_gen5.c
@@ -84,8 +84,6 @@ static void gen5_update_vertex_buffer(struct kgem *kgem, const uint32_t *data)
 	ptr = (char *)base + reloc->delta;
 
 	i = data[0] >> 27;
-	if (state.vb[i].current)
-		kgem_bo_unmap(kgem, state.vb[i].current);
 
 	state.vb[i].handle = reloc->target_handle;
 	state.vb[i].current = bo;
@@ -389,13 +387,6 @@ get_reloc(struct kgem *kgem,
 
 	return (char *)base + delta;
 }
-
-static void
-put_reloc(struct kgem *kgem, struct reloc *r)
-{
-	if (r->bo != NULL)
-		kgem_bo_umap(kgem, r->bo);
-}
 #endif
 
 int kgem_gen5_decode_3d(struct kgem *kgem, uint32_t offset)
@@ -667,21 +658,7 @@ int kgem_gen5_decode_3d(struct kgem *kgem, uint32_t offset)
 	return len;
 }
 
-static void finish_vertex_buffers(struct kgem *kgem)
-{
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(state.vb); i++)
-		if (state.vb[i].current)
-			kgem_bo_unmap(kgem, state.vb[i].current);
-}
-
 void kgem_gen5_finish_state(struct kgem *kgem)
 {
-	finish_vertex_buffers(kgem);
-
-	if (state.dynamic_state.current)
-		kgem_bo_unmap(kgem,state. dynamic_state.current);
-
 	memset(&state, 0, sizeof(state));
 }
diff --git a/src/sna/kgem_debug_gen6.c b/src/sna/kgem_debug_gen6.c
index 5bcd85d..d23e2d9 100644
--- a/src/sna/kgem_debug_gen6.c
+++ b/src/sna/kgem_debug_gen6.c
@@ -88,8 +88,6 @@ static void gen6_update_vertex_buffer(struct kgem *kgem, const uint32_t *data)
 	ptr = (char *)base + kgem->reloc[i].delta;
 
 	i = data[0] >> 26;
-	if (state.vb[i].current)
-		kgem_bo_unmap(kgem, state.vb[i].current);
 
 	state.vb[i].current = bo;
 	state.vb[i].base = base;
@@ -129,9 +127,6 @@ static void gen6_update_dynamic_buffer(struct kgem *kgem, const uint32_t offset)
 		ptr = NULL;
 	}
 
-	if (state.dynamic_state.current)
-		kgem_bo_unmap(kgem, state.dynamic_state.current);
-
 	state.dynamic_state.current = bo;
 	state.dynamic_state.base = base;
 	state.dynamic_state.ptr = ptr;
@@ -300,22 +295,8 @@ static void primitive_out(struct kgem *kgem, uint32_t *data)
 	}
 }
 
-static void finish_vertex_buffers(struct kgem *kgem)
-{
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(state.vb); i++)
-		if (state.vb[i].current)
-			kgem_bo_unmap(kgem, state.vb[i].current);
-}
-
 static void finish_state(struct kgem *kgem)
 {
-	finish_vertex_buffers(kgem);
-
-	if (state.dynamic_state.current)
-		kgem_bo_unmap(kgem, state.dynamic_state.base);
-
 	memset(&state, 0, sizeof(state));
 }
 
@@ -478,13 +459,6 @@ get_reloc(struct kgem *kgem,
 	return (char *)base + (delta & ~3);
 }
 
-static void
-put_reloc(struct kgem *kgem, struct reloc *r)
-{
-	if (r->bo != NULL)
-		kgem_bo_unmap(kgem, r->bo);
-}
-
 static const char *
 gen6_filter_to_string(uint32_t filter)
 {
@@ -539,8 +513,6 @@ gen6_decode_sampler_state(struct kgem *kgem, const uint32_t *reloc)
 	ErrorF("  Sampler 1:\n");
 	ErrorF("    filter: min=%s, mag=%s\n", min, mag);
 	ErrorF("    wrap: s=%s, t=%s, r=%s\n", s_wrap, t_wrap, r_wrap);
-
-	put_reloc(kgem, &r);
 }
 
 static const char *
@@ -604,8 +576,6 @@ gen6_decode_blend(struct kgem *kgem, const uint32_t *reloc)
 	ErrorF("  Blend (%s): function %s, src=%s, dst=%s\n",
 	       blend->blend0.blend_enable ? "enabled" : "disabled",
 	       func, src, dst);
-
-	put_reloc(kgem, &r);
 }
 
 int kgem_gen6_decode_3d(struct kgem *kgem, uint32_t offset)
diff --git a/src/sna/kgem_debug_gen7.c b/src/sna/kgem_debug_gen7.c
index a33a918..c13e96f 100644
--- a/src/sna/kgem_debug_gen7.c
+++ b/src/sna/kgem_debug_gen7.c
@@ -88,8 +88,6 @@ static void gen7_update_vertex_buffer(struct kgem *kgem, const uint32_t *data)
 	ptr = (char *)base + kgem->reloc[i].delta;
 
 	i = data[0] >> 26;
-	if (state.vb[i].current)
-		kgem_bo_unmap(kgem, state.vb[i].base);
 
 	state.vb[i].current = bo;
 	state.vb[i].base = base;
@@ -129,9 +127,6 @@ static void gen7_update_dynamic_buffer(struct kgem *kgem, const uint32_t offset)
 		ptr = NULL;
 	}
 
-	if (state.dynamic_state.current)
-		kgem_bo_unmap(kgem, state.dynamic_state.base);
-
 	state.dynamic_state.current = bo;
 	state.dynamic_state.base = base;
 	state.dynamic_state.ptr = ptr;
@@ -300,22 +295,8 @@ static void primitive_out(struct kgem *kgem, uint32_t *data)
 	}
 }
 
-static void finish_vertex_buffers(struct kgem *kgem)
-{
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(state.vb); i++)
-		if (state.vb[i].current)
-			kgem_bo_unmap(kgem, state.vb[i].current);
-}
-
 static void finish_state(struct kgem *kgem)
 {
-	finish_vertex_buffers(kgem);
-
-	if (state.dynamic_state.current)
-		kgem_bo_unmap(kgem, state.dynamic_state.base);
-
 	memset(&state, 0, sizeof(state));
 }
 
@@ -478,13 +459,6 @@ get_reloc(struct kgem *kgem,
 	return (char *)base + (delta & ~3);
 }
 
-static void
-put_reloc(struct kgem *kgem, struct reloc *r)
-{
-	if (r->bo != NULL)
-		kgem_bo_unmap(kgem, r->bo);
-}
-
 static const char *
 gen7_filter_to_string(uint32_t filter)
 {
@@ -539,8 +513,6 @@ gen7_decode_sampler_state(struct kgem *kgem, const uint32_t *reloc)
 	ErrorF("  Sampler 1:\n");
 	ErrorF("    filter: min=%s, mag=%s\n", min, mag);
 	ErrorF("    wrap: s=%s, t=%s, r=%s\n", s_wrap, t_wrap, r_wrap);
-
-	put_reloc(kgem, &r);
 }
 
 static const char *
@@ -604,8 +576,6 @@ gen7_decode_blend(struct kgem *kgem, const uint32_t *reloc)
 	ErrorF("  Blend (%s): function %s, src=%s, dst=%s\n",
 	       blend->blend0.blend_enable ? "enabled" : "disabled",
 	       func, src, dst);
-
-	put_reloc(kgem, &r);
 }
 
 int kgem_gen7_decode_3d(struct kgem *kgem, uint32_t offset)
diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 4e42c6d..6b69a6e 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -238,9 +238,7 @@ static void sna_pixmap_free_cpu(struct sna *sna, struct sna_pixmap *priv)
 		DBG(("%s: discarding CPU buffer, handle=%d, size=%d\n",
 		     __FUNCTION__, priv->cpu_bo->handle, priv->cpu_bo->size));
 
-		kgem_bo_unmap__cpu(&sna->kgem, priv->cpu_bo, priv->ptr);
 		kgem_bo_destroy(&sna->kgem, priv->cpu_bo);
-
 		priv->cpu_bo = NULL;
 	} else
 		free(priv->ptr);
commit 26042b2660d87044e1920a1267d9984c00c9566a
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sun Jan 8 11:45:11 2012 +0000

    sna: Bubble sort the partial buffer list back into order after trimming padding
    
    After reducing the used size in the partial buffer, we need to re-sort
    the list to keep it ordered by decreasing amount of available space.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
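
    The invariant being restored, as a sketch (hypothetical helper, not
    part of the patch): for any two consecutive entries a, b on
    kgem->partial with a before b, free(a) >= free(b), where

	/* free space remaining in a partial upload buffer */
	static inline int partial_free(const struct kgem_partial_bo *bo)
	{
		return bo->alloc - bo->used;
	}

    Trimming the padding grows the buffer's free space, so the entry
    may need to bubble towards the head of the list to keep that
    ordering.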

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 08d4db7..0c1b2b1 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -2163,6 +2163,9 @@ static void _kgem_bo_delete_partial(struct kgem *kgem, struct kgem_bo *bo)
 	if (list_is_empty(&io->base.list))
 		return;
 
+	DBG(("%s: size=%d, offset=%d, parent used=%d\n",
+	     __FUNCTION__, bo->size, bo->delta, io->used));
+
 	if (bo->size == io->used) {
 		assert(io->base.exec == NULL);
 		assert(io->base.refcnt >= 2);
@@ -2976,12 +2979,41 @@ struct kgem_bo *kgem_create_buffer_2d(struct kgem *kgem,
 		return NULL;
 
 	if (height & 1) {
+		struct kgem_partial_bo *io = (struct kgem_partial_bo *)bo->proxy;
+		int remain;
+
 		/* Having padded this surface to ensure that accesses to
 		 * the last pair of rows is valid, remove the padding so
 		 * that it can be allocated to other pixmaps.
 		 */
-		((struct kgem_partial_bo *)bo->proxy)->used -= stride;
+		io->used -= stride;
 		bo->size -= stride;
+
+		/* And bubble-sort the partial back into place */
+		remain = io->alloc - io->used;
+		while (io->base.list.prev != &kgem->partial) {
+			struct kgem_partial_bo *p;
+
+			p = list_entry(io->base.list.prev,
+				       struct kgem_partial_bo,
+				       base.list);
+			if (remain <= p->alloc - p->used)
+				break;
+
+			assert(p->base.list.next == &io->base.list);
+			io->base.list.prev = p->base.list.prev;
+			p->base.list.prev->next = &io->base.list;
+			p->base.list.prev = &io->base.list;
+
+			p->base.list.next = io->base.list.next;
+			io->base.list.next->prev = &p->base.list;
+			io->base.list.next = &p->base.list;
+
+			assert(p->base.list.next->prev == &p->base.list);
+			assert(io->base.list.prev->next == &io->base.list);
+		}
+
+		assert(validate_partials(kgem));
 	}
 
 	bo->pitch = stride;
commit 3f7ea44bf19a03ee81b683885c9c2416092254a3
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sun Jan 8 02:27:28 2012 +0000

    sna/gen[67]: Hook into the clear operation for glyph masks
    
    Allow SandyBridge to specialise its clear routine to reduce the number
    of ring switches. It may be interesting to specialise the clear routines
    even further and use the special render clear commands...
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/gen6_render.c b/src/sna/gen6_render.c
index 86bf460..e95cdd6 100644
--- a/src/sna/gen6_render.c
+++ b/src/sna/gen6_render.c
@@ -55,6 +55,7 @@
 #define NO_COPY_BOXES 0
 #define NO_FILL 0
 #define NO_FILL_BOXES 0
+#define NO_CLEAR 0
 
 #define GEN6_MAX_SIZE 8192
 
@@ -3552,6 +3553,102 @@ gen6_render_fill_one(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo,
 	return TRUE;
 }
 
+static Bool
+gen6_render_clear_try_blt(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo)
+{
+	BoxRec box;
+
+	box.x1 = 0;
+	box.y1 = 0;
+	box.x2 = dst->drawable.width;
+	box.y2 = dst->drawable.height;
+
+	return sna_blt_fill_boxes(sna, GXclear,
+				  bo, dst->drawable.bitsPerPixel,
+				  0, &box, 1);
+}
+
+static Bool
+gen6_render_clear(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo)
+{
+	struct sna_composite_op tmp;
+
+#if NO_CLEAR
+	return gen6_render_clear_try_blt(sna, dst, bo);
+#endif
+
+	DBG(("%s: %dx%d\n",
+	     __FUNCTION__,
+	     dst->drawable.width,
+	     dst->drawable.height));
+
+	/* Prefer to use the BLT if already engaged */
+	if (sna->kgem.ring == KGEM_BLT &&
+	    gen6_render_clear_try_blt(sna, dst, bo))
+		return TRUE;
+
+	/* Must use the BLT if we can't RENDER... */
+	if (too_large(dst->drawable.width, dst->drawable.height))
+		return gen6_render_clear_try_blt(sna, dst, bo);
+
+	tmp.op = PictOpClear;
+
+	tmp.dst.pixmap = dst;
+	tmp.dst.width  = dst->drawable.width;
+	tmp.dst.height = dst->drawable.height;
+	tmp.dst.format = sna_format_for_depth(dst->drawable.depth);
+	tmp.dst.bo = bo;
+	tmp.dst.x = tmp.dst.y = 0;
+
+	tmp.src.bo = sna_render_get_solid(sna, 0);
+	tmp.src.filter = SAMPLER_FILTER_NEAREST;
+	tmp.src.repeat = SAMPLER_EXTEND_REPEAT;
+
+	tmp.mask.bo = NULL;
+	tmp.mask.filter = SAMPLER_FILTER_NEAREST;
+	tmp.mask.repeat = SAMPLER_EXTEND_NONE;
+
+	tmp.is_affine = TRUE;
+	tmp.floats_per_vertex = 3;
+	tmp.floats_per_rect = 9;
+	tmp.has_component_alpha = 0;
+	tmp.need_magic_ca_pass = FALSE;
+
+	tmp.u.gen6.wm_kernel = GEN6_WM_KERNEL_NOMASK;
+	tmp.u.gen6.nr_surfaces = 2;
+	tmp.u.gen6.nr_inputs = 1;
+	tmp.u.gen6.ve_id = 1;
+
+	if (!kgem_check_bo(&sna->kgem, bo, NULL))
+		_kgem_submit(&sna->kgem);
+
+	gen6_emit_fill_state(sna, &tmp);
+	gen6_align_vertex(sna, &tmp);
+
+	if (!gen6_get_rectangles(sna, &tmp, 1)) {
+		gen6_emit_fill_state(sna, &tmp);
+		gen6_get_rectangles(sna, &tmp, 1);
+	}
+
+	OUT_VERTEX(dst->drawable.width, dst->drawable.height);
+	OUT_VERTEX_F(1);
+	OUT_VERTEX_F(1);
+
+	OUT_VERTEX(0, dst->drawable.height);
+	OUT_VERTEX_F(0);
+	OUT_VERTEX_F(1);
+
+	OUT_VERTEX(0, 0);
+	OUT_VERTEX_F(0);
+	OUT_VERTEX_F(0);
+
+	gen6_vertex_flush(sna);
+	kgem_bo_destroy(&sna->kgem, tmp.src.bo);
+	_kgem_set_mode(&sna->kgem, KGEM_RENDER);
+
+	return TRUE;
+}
+
 static void gen6_render_flush(struct sna *sna)
 {
 	gen6_vertex_finish(sna, TRUE);
@@ -3658,6 +3755,7 @@ Bool gen6_render_init(struct sna *sna)
 	sna->render.fill_boxes = gen6_render_fill_boxes;
 	sna->render.fill = gen6_render_fill;
 	sna->render.fill_one = gen6_render_fill_one;
+	sna->render.clear = gen6_render_clear;
 
 	sna->render.flush = gen6_render_flush;
 	sna->render.reset = gen6_render_reset;
diff --git a/src/sna/gen7_render.c b/src/sna/gen7_render.c
index dd93ae9..9d17c87 100644
--- a/src/sna/gen7_render.c
+++ b/src/sna/gen7_render.c
@@ -55,6 +55,7 @@
 #define NO_COPY_BOXES 0
 #define NO_FILL 0
 #define NO_FILL_BOXES 0
+#define NO_CLEAR 0
 
 #define GEN7_MAX_SIZE 16384
 
@@ -3608,6 +3609,102 @@ gen7_render_fill_one(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo,
 	return TRUE;
 }
 
+static Bool
+gen7_render_clear_try_blt(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo)
+{
+	BoxRec box;
+
+	box.x1 = 0;
+	box.y1 = 0;
+	box.x2 = dst->drawable.width;
+	box.y2 = dst->drawable.height;
+
+	return sna_blt_fill_boxes(sna, GXclear,
+				  bo, dst->drawable.bitsPerPixel,
+				  0, &box, 1);
+}
+
+static Bool
+gen7_render_clear(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo)
+{
+	struct sna_composite_op tmp;
+
+#if NO_CLEAR
+	return gen7_render_clear_try_blt(sna, dst, bo);
+#endif
+
+	DBG(("%s: %dx%d\n",
+	     __FUNCTION__,
+	     dst->drawable.width,
+	     dst->drawable.height));
+
+	/* Prefer to use the BLT if already engaged */
+	if (sna->kgem.ring == KGEM_BLT &&
+	    gen7_render_clear_try_blt(sna, dst, bo))
+		return TRUE;
+
+	/* Must use the BLT if we can't RENDER... */
+	if (too_large(dst->drawable.width, dst->drawable.height))
+		return gen7_render_clear_try_blt(sna, dst, bo);
+
+	tmp.op = PictOpClear;
+
+	tmp.dst.pixmap = dst;
+	tmp.dst.width  = dst->drawable.width;
+	tmp.dst.height = dst->drawable.height;
+	tmp.dst.format = sna_format_for_depth(dst->drawable.depth);
+	tmp.dst.bo = bo;
+	tmp.dst.x = tmp.dst.y = 0;
+
+	tmp.src.bo = sna_render_get_solid(sna, 0);
+	tmp.src.filter = SAMPLER_FILTER_NEAREST;
+	tmp.src.repeat = SAMPLER_EXTEND_REPEAT;
+
+	tmp.mask.bo = NULL;
+	tmp.mask.filter = SAMPLER_FILTER_NEAREST;
+	tmp.mask.repeat = SAMPLER_EXTEND_NONE;
+
+	tmp.is_affine = TRUE;
+	tmp.floats_per_vertex = 3;
+	tmp.floats_per_rect = 9;
+	tmp.has_component_alpha = 0;
+	tmp.need_magic_ca_pass = FALSE;
+
+	tmp.u.gen7.wm_kernel = GEN6_WM_KERNEL_NOMASK;
+	tmp.u.gen7.nr_surfaces = 2;
+	tmp.u.gen7.nr_inputs = 1;
+	tmp.u.gen7.ve_id = 1;
+
+	if (!kgem_check_bo(&sna->kgem, bo, NULL))
+		_kgem_submit(&sna->kgem);
+
+	gen7_emit_fill_state(sna, &tmp);
+	gen7_align_vertex(sna, &tmp);
+
+	if (!gen7_get_rectangles(sna, &tmp, 1)) {
+		gen7_emit_fill_state(sna, &tmp);
+		gen7_get_rectangles(sna, &tmp, 1);
+	}
+
+	OUT_VERTEX(dst->drawable.width, dst->drawable.height);
+	OUT_VERTEX_F(1);
+	OUT_VERTEX_F(1);
+
+	OUT_VERTEX(0, dst->drawable.height);
+	OUT_VERTEX_F(0);
+	OUT_VERTEX_F(1);
+
+	OUT_VERTEX(0, 0);
+	OUT_VERTEX_F(0);
+	OUT_VERTEX_F(0);
+
+	gen7_vertex_flush(sna);
+	kgem_bo_destroy(&sna->kgem, tmp.src.bo);
+	_kgem_set_mode(&sna->kgem, KGEM_RENDER);
+
+	return TRUE;
+}
+
 static void gen7_render_flush(struct sna *sna)
 {
 	gen7_vertex_finish(sna, TRUE);
@@ -3711,6 +3808,7 @@ Bool gen7_render_init(struct sna *sna)
 	sna->render.fill_boxes = gen7_render_fill_boxes;
 	sna->render.fill = gen7_render_fill;
 	sna->render.fill_one = gen7_render_fill_one;
+	sna->render.clear = gen7_render_clear;
 
 	sna->render.flush = gen7_render_flush;
 	sna->render.reset = gen7_render_reset;
diff --git a/src/sna/sna_glyphs.c b/src/sna/sna_glyphs.c
index 209b199..f64d2f9 100644
--- a/src/sna/sna_glyphs.c
+++ b/src/sna/sna_glyphs.c
@@ -654,11 +654,7 @@ static bool
 clear_pixmap(struct sna *sna, PixmapPtr pixmap)
 {
 	struct sna_pixmap *priv = sna_pixmap(pixmap);
-	return sna->render.fill_one(sna, pixmap, priv->gpu_bo, 0,
-				    0, 0,
-				    pixmap->drawable.width,
-				    pixmap->drawable.height,
-				    GXclear);
+	return sna->render.clear(sna, pixmap, priv->gpu_bo);
 }
 
 static Bool
diff --git a/src/sna/sna_render.c b/src/sna/sna_render.c
index 96c4366..c4d8a58 100644
--- a/src/sna/sna_render.c
+++ b/src/sna/sna_render.c
@@ -197,6 +197,18 @@ no_render_fill_one(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo,
 				  color, &box, 1);
 }
 
+static Bool
+no_render_clear(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo)
+{
+	DBG(("%s: pixmap=%ld %dx%d\n", __FUNCTION__,
+	     dst->drawable.serialNumber,
+	     dst->drawable.width,
+	     dst->drawable.height));
+	return sna->render.fill_one(sna, dst, bo, 0,
+				    0, 0, dst->drawable.width, dst->drawable.height,
+				    GXclear);
+}
+
 static void no_render_reset(struct sna *sna)
 {
 	(void)sna;
@@ -235,6 +247,7 @@ void no_render_init(struct sna *sna)
 	render->fill_boxes = no_render_fill_boxes;
 	render->fill = no_render_fill;
 	render->fill_one = no_render_fill_one;
+	render->clear = no_render_clear;
 
 	render->reset = no_render_reset;
 	render->flush = no_render_flush;
diff --git a/src/sna/sna_render.h b/src/sna/sna_render.h
index 5cd0d7c..2229c18 100644
--- a/src/sna/sna_render.h
+++ b/src/sna/sna_render.h
@@ -219,6 +219,7 @@ struct sna_render {
 			 uint32_t color,
 			 int16_t x1, int16_t y1, int16_t x2, int16_t y2,
 			 uint8_t alu);
+	Bool (*clear)(struct sna *sna, PixmapPtr dst, struct kgem_bo *dst_bo);
 
 	Bool (*copy_boxes)(struct sna *sna, uint8_t alu,
 			   PixmapPtr src, struct kgem_bo *src_bo, int16_t src_dx, int16_t src_dy,

