xf86-video-intel: Branch 'xwayland' - 409 commits - autogen.sh configure.ac m4/.gitignore Makefile.am NEWS src/common.h src/i830_render.c src/i915_render.c src/i965_render.c src/intel_batchbuffer.c src/intel_batchbuffer.h src/intel_display.c src/intel_dri.c src/intel_driver.c src/intel_driver.h src/intel_glamor.c src/intel_glamor.h src/intel.h src/intel_list.h src/intel_module.c src/intel_uxa.c src/intel_video.c src/legacy/i810 src/Makefile.am src/sna/blt.c src/sna/gen2_render.c src/sna/gen3_render.c src/sna/gen4_render.c src/sna/gen5_render.c src/sna/gen6_render.c src/sna/gen7_render.c src/sna/kgem.c src/sna/kgem_debug_gen5.c src/sna/kgem_debug_gen6.c src/sna/kgem_debug_gen7.c src/sna/kgem.h src/sna/Makefile.am src/sna/sna_accel.c src/sna/sna_blt.c src/sna/sna_composite.c src/sna/sna_damage.c src/sna/sna_damage.h src/sna/sna_display.c src/sna/sna_dri.c src/sna/sna_driver.c src/sna/sna_glyphs.c src/sna/sna_gradient.c src/sna/sna.h src/sna/sna_io.c src/sna/sna_render.c src/sna/sna_render.h src/sna/sna_render_inline.h src/sna/sna_stream.c src/sna/sna_tiling.c src/sna/sna_trapezoids.c src/sna/sna_video.c src/sna/sna_video.h src/sna/sna_video_hwmc.c src/sna/sna_video_sprite.c src/sna/sna_video_textured.c uxa/uxa.c uxa/uxa-glyphs.c uxa/uxa.h uxa/uxa-priv.h uxa/uxa-render.c

Kristian Høgsberg krh at kemper.freedesktop.org
Fri Jun 29 06:52:52 PDT 2012


Rebased ref, commits from common ancestor:
commit f6f43a71cc2a5703e3582adc807978944afff302
Author: Kristian Høgsberg <krh at bitplanet.net>
Date:   Fri Jun 29 09:49:56 2012 -0400

    xwayland: Use new DRI2 AuthMagic2
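
For context, the functional change is in the hook's first argument: the old AuthMagic callback received only a drm fd, so the driver had to walk xf86Screens to find the screen owning that fd (the loop deleted below), while AuthMagic2 receives the ScreenPtr directly. A minimal sketch of the two shapes being swapped, using the signatures from this diff (AuthMagic2 is a DRI2InfoRec field on this xwayland branch):

	/* old hook: fd only; the driver must map the fd back to a screen */
	int (*AuthMagic)(int fd, uint32_t magic);

	/* new hook: the screen is passed in, so the per-screen private
	 * (and hence the right drm fd or host connection) is one lookup away */
	int (*AuthMagic2)(ScreenPtr screen, uint32_t magic);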

diff --git a/src/intel_dri.c b/src/intel_dri.c
index 1745aef..fa04e18 100644
--- a/src/intel_dri.c
+++ b/src/intel_dri.c
@@ -1617,22 +1617,14 @@ out_complete:
 }
 
 #ifdef XORG_WAYLAND
-static int intel_auth_magic(int fd, uint32_t magic)
+static int intel_auth_magic2(ScreenPtr screen, uint32_t magic)
 {
-	ScrnInfoPtr scrn;
-	intel_screen_private *intel;
-	int i;
+	ScrnInfoPtr scrn = xf86Screens[screen->myNum];
+	intel_screen_private *intel = intel_get_screen_private(scrn);
 
 	/* Not wayland, go straight to drm */
 	if (!xorgWayland)
-		return drmAuthMagic(fd, magic);
-
-	for (i = 0; i < 1; i++) {
-		scrn = xf86Screens[i];
-		intel = intel_get_screen_private(scrn);
-		if (xwl_screen_get_drm_fd(intel->xwl_screen) == fd)
-			break;
-	}
+		return drmAuthMagic(intel->drmSubFD, magic);
 
 	/* Forward the request to our host */
 	return xwl_drm_authenticate(intel->xwl_screen, magic);
@@ -1724,9 +1716,9 @@ Bool I830DRI2ScreenInit(ScreenPtr screen)
 	}
 #endif
 
-#if DRI2INFOREC_VERSION >= 5 && defined(XORG_WAYLAND)
-	info.version = 5;
-	info.AuthMagic = intel_auth_magic;
+#if defined(XORG_WAYLAND) /* If we have XORG_WAYLAND, we have AuthMagic2 */
+	info.version = 4;
+	info.AuthMagic2 = intel_auth_magic2;
 #endif
 
 	return DRI2ScreenInit(screen, &info);
commit c79bf28e6e351898216cfd1b021e87cc282f78a9
Author: Christopher James Halse Rogers <christopher.halse.rogers at canonical.com>
Date:   Wed May 23 22:08:37 2012 +1000

    xwayland: Adapt to new initialisation sequence

diff --git a/src/intel_driver.c b/src/intel_driver.c
index e414406..7bf8b76 100644
--- a/src/intel_driver.c
+++ b/src/intel_driver.c
@@ -631,15 +631,18 @@ static Bool I830PreInit(ScrnInfoPtr scrn, int flags)
 
 #ifdef XORG_WAYLAND
 	if (xorgWayland) {
-		xf86LoadSubModule(scrn, "xwayland");
-		intel->xwl_screen =
-			xwl_screen_pre_init(scrn, 0, &xwl_driver);
+		intel->xwl_screen = xwl_screen_create();
 		if (!intel->xwl_screen) {
 			xf86DrvMsg(scrn->scrnIndex, X_ERROR,
 				   "Failed to initialize xwayland.\n");
 			return FALSE;
 		}
-
+		if (!xwl_screen_pre_init(scrn, intel->xwl_screen, 
+					 0, &xwl_driver)) {
+			xf86DrvMsg(scrn->scrnIndex, X_ERROR,
+				   "Failed to pre-init xwayland screen\n");
+			xwl_screen_destroy(intel->xwl_screen);
+		}
 		intel->drmSubFD =
 			xwl_screen_get_drm_fd(intel->xwl_screen);
 	}
commit 7df24b385f3b7576cd1c51158b6e4989e838459b
Author: Kristian Høgsberg <krh at bitplanet.net>
Date:   Fri Oct 15 17:58:14 2010 -0400

    Add xwayland support

diff --git a/src/intel_driver.c b/src/intel_driver.c
index 9124fdf..e414406 100644
--- a/src/intel_driver.c
+++ b/src/intel_driver.c
@@ -803,6 +803,11 @@ I830BlockHandler(int i, pointer blockData, pointer pTimeout, pointer pReadmask)
 
 	intel_uxa_block_handler(intel);
 	intel_video_block_handler(intel);
+
+#ifdef XORG_WAYLAND
+	if (intel->xwl_screen)
+		xwl_screen_post_damage(intel->xwl_screen);
+#endif
 }
 
 static Bool
commit e26f82159a7477a9752f7a771aa28d650f4d7b6f
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon May 7 08:55:35 2012 +0100

    sna: Manually execute the timer as TimerForce does not run an inactive timer
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
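
The dix OsTimer convention explains the bug being fixed: a timer callback returns the delay in milliseconds until it should fire again, or 0 to leave the timer disarmed, and TimerForce() only fires a timer that is currently armed. Once the callback has returned 0 the timer is inactive and TimerForce() does nothing, so the block handler below calls sna_timeout() by hand and re-arms via TimerSet() with the returned delay. A minimal sketch of the convention, assuming only the OsTimer API from the server's os.h (tick, state, and the helpers are illustrative names):

	static CARD32 tick(OsTimerPtr timer, CARD32 now, pointer arg)
	{
		struct state *s = arg;

		if (!work_pending(s))
			return 0;		/* returning 0 disarms the timer */

		do_work(s);
		return s->interval_ms;		/* non-zero re-arms after that delay */
	}

	/* arming (and re-arming an inactive timer) goes through TimerSet: */
	s->timer = TimerSet(s->timer, 0, s->interval_ms, tick, s);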

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 3280490..274facb 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -11995,6 +11995,7 @@ static bool sna_accel_do_flush(struct sna *sna)
 		}
 
 		if (sna->timer_ready & (1<<(FLUSH_TIMER))) {
+			DBG(("%s (time=%ld), triggered\n", __FUNCTION__, (long)sna->time));
 			sna->timer_expire[FLUSH_TIMER] =
 				sna->time + sna->vblank_interval;
 			return true;
@@ -12007,6 +12008,7 @@ static bool sna_accel_do_flush(struct sna *sna)
 			sna->timer_ready |= 1 << FLUSH_TIMER;
 			sna->timer_expire[FLUSH_TIMER] =
 				sna->time + sna->vblank_interval / 2;
+			DBG(("%s (time=%ld), starting\n", __FUNCTION__, (long)sna->time));
 		}
 	}
 
@@ -12017,6 +12019,7 @@ static bool sna_accel_do_expire(struct sna *sna)
 {
 	if (sna->timer_active & (1<<(EXPIRE_TIMER))) {
 		if (sna->timer_ready & (1<<(EXPIRE_TIMER))) {
+			DBG(("%s (time=%ld), triggered\n", __FUNCTION__, (long)sna->time));
 			sna->timer_expire[EXPIRE_TIMER] =
 				sna->time + MAX_INACTIVE_TIME * 1000;
 			return true;
@@ -12027,6 +12030,7 @@ static bool sna_accel_do_expire(struct sna *sna)
 			sna->timer_ready |= 1 << EXPIRE_TIMER;
 			sna->timer_expire[EXPIRE_TIMER] =
 				sna->time + MAX_INACTIVE_TIME * 1000;
+			DBG(("%s (time=%ld), starting\n", __FUNCTION__, (long)sna->time));
 		}
 	}
 
@@ -12039,6 +12043,7 @@ static bool sna_accel_do_inactive(struct sna *sna)
 		if (sna->timer_ready & (1<<(INACTIVE_TIMER))) {
 			sna->timer_expire[INACTIVE_TIMER] =
 				sna->time + 120 * 1000;
+			DBG(("%s (time=%ld), triggered\n", __FUNCTION__, (long)sna->time));
 			return true;
 		}
 	} else {
@@ -12047,6 +12052,7 @@ static bool sna_accel_do_inactive(struct sna *sna)
 			sna->timer_ready |= 1 << INACTIVE_TIMER;
 			sna->timer_expire[INACTIVE_TIMER] =
 				sna->time + 120 * 1000;
+			DBG(("%s (time=%ld), starting\n", __FUNCTION__, (long)sna->time));
 		}
 	}
 
@@ -12056,30 +12062,31 @@ static bool sna_accel_do_inactive(struct sna *sna)
 static CARD32 sna_timeout(OsTimerPtr timer, CARD32 now, pointer arg)
 {
 	struct sna *sna = arg;
-	CARD32 next = UINT32_MAX;
+	int32_t next = 0;
 	uint32_t active;
 	int i;
 
+	DBG(("%s: now=%d, active=%08x, ready=%08x\n",
+	     __FUNCTION__, now, sna->timer_active, sna->timer_ready));
 	active = sna->timer_active & ~sna->timer_ready;
 	if (active == 0)
 		return 0;
 
 	for (i = 0; i < NUM_TIMERS; i++) {
 		if (active & (1 << i)) {
-			if (now > sna->timer_expire[i]) {
+			int32_t delta = sna->timer_expire[i] - now;
+			DBG(("%s: timer[%d] expires in %d [%d]\n",
+			     __FUNCTION__, i, delta, sna->timer_expire[i]));
+			if (delta <= 0)
 				sna->timer_ready |= 1 << i;
-			} else {
-				if (sna->timer_expire[i] < next)
-					next = sna->timer_expire[i];
-			}
+			else if (next == 0 || delta < next)
+				next = delta;
 		}
 	}
 
-	if ((sna->timer_active & ~sna->timer_ready) == 0)
-		return 0;
-
-	assert(next != UINT32_MAX);
-	return next - now;
+	DBG(("%s: active=%08x, ready=%08x, next=+%d\n",
+	     __FUNCTION__, sna->timer_active, sna->timer_ready, next));
+	return next;
 }
 
 static bool sna_accel_flush(struct sna *sna)
@@ -12385,8 +12392,15 @@ void sna_accel_block_handler(struct sna *sna)
 	}
 
 	if (sna->timer_ready) {
+		DBG(("%s: evaluating timers, ready=%x\n",
+		     __FUNCTION__, sna->timer_ready));
 		sna->timer_ready = 0;
-		TimerForce(sna->timer);
+		TimerSet(sna->timer, 0,
+			 sna_timeout(sna->timer,
+				     sna->time,
+				     sna),
+			 sna_timeout, sna);
+		assert(sna->timer_ready == 0);
 	}
 }
 
commit fa6d8bf9454ad427319d9386927ffd79cc99e4e9
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sun May 6 17:23:11 2012 +0100

    sna: Replace timerfd with OsTimer
    
    As timerfd is Linux-specific and OsTimer an OS-agnostic abstraction,
    replace the former with the latter. Arguably this has slightly better
    performance characteristics in select-bound loops.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/configure.ac b/configure.ac
index 3d646da..31d3a43 100644
--- a/configure.ac
+++ b/configure.ac
@@ -264,8 +264,6 @@ if test "x$DEBUG" = xfull; then
         CFLAGS="$CFLAGS -O0 -ggdb3"
 fi
 
-AC_CHECK_HEADERS([sys/timerfd.h])
-
 DRIVER_NAME=intel
 AC_SUBST([DRIVER_NAME])
 AC_SUBST([moduledir])
diff --git a/src/sna/sna.h b/src/sna/sna.h
index 3e4b8ec..d7a20b9 100644
--- a/src/sna/sna.h
+++ b/src/sna/sna.h
@@ -225,7 +225,9 @@ struct sna {
 	unsigned watch_flush;
 	unsigned flush;
 
-	int timer[NUM_TIMERS];
+	uint32_t time;
+	OsTimerPtr timer;
+	uint32_t timer_expire[NUM_TIMERS];
 	uint16_t timer_active;
 	uint16_t timer_ready;
 
diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 3e03934..3280490 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -11964,165 +11964,123 @@ static struct sna_pixmap *sna_accel_scanout(struct sna *sna)
 	return priv && priv->gpu_bo ? priv : NULL;
 }
 
-#if HAVE_SYS_TIMERFD_H && !FORCE_FLUSH
-#include <sys/timerfd.h>
-#include <errno.h>
-
-static uint64_t read_timer(int fd)
-{
-	uint64_t count = 0;
-	int ret = read(fd, &count, sizeof(count));
-	return count;
-	(void)ret;
-}
-
-static void sna_accel_drain_timer(struct sna *sna, int id)
+static void sna_accel_disarm_timer(struct sna *sna, int id)
 {
-	if (sna->timer_active & (1<<id))
-		read_timer(sna->timer[id]);
-}
-
-static void _sna_accel_disarm_timer(struct sna *sna, int id)
-{
-	struct itimerspec to;
-
-	DBG(("%s[%d] (time=%ld)\n", __FUNCTION__, id, (long)GetTimeInMillis()));
-
-	memset(&to, 0, sizeof(to));
-	timerfd_settime(sna->timer[id], 0, &to, NULL);
+	DBG(("%s[%d] (time=%ld)\n", __FUNCTION__, id, (long)sna->time));
 	sna->timer_active &= ~(1<<id);
+	sna->timer_ready &= ~(1<<id);
 }
 
-#define return_if_timer_active(id) do {					\
-	if (sna->timer_active & (1<<(id)))				\
-		return (sna->timer_ready & (1<<(id))) && read_timer(sna->timer[id]) > 0;			\
-} while (0)
-
-static Bool sna_accel_do_flush(struct sna *sna)
+static bool sna_accel_do_flush(struct sna *sna)
 {
-	struct itimerspec to;
 	struct sna_pixmap *priv;
 
 	priv = sna_accel_scanout(sna);
 	if (priv == NULL) {
 		DBG(("%s -- no scanout attached\n", __FUNCTION__));
-		return FALSE;
-	}
-
-	if (sna->kgem.flush_now) {
-		sna->kgem.flush_now = 0;
-		if (priv->gpu_bo->exec) {
-			DBG(("%s -- forcing flush\n", __FUNCTION__));
-			sna_accel_drain_timer(sna, FLUSH_TIMER);
-			return TRUE;
-		}
-	}
-
-	return_if_timer_active(FLUSH_TIMER);
-
-	if (priv->cpu_damage == NULL && priv->gpu_bo->exec == NULL) {
-		DBG(("%s -- no pending write to scanout\n", __FUNCTION__));
-		return FALSE;
+		sna_accel_disarm_timer(sna, FLUSH_TIMER);
+		return false;
 	}
 
 	if (sna->flags & SNA_NO_DELAYED_FLUSH)
-		return TRUE;
-
-	if (sna->timer[FLUSH_TIMER] == -1)
-		return TRUE;
-
-	DBG(("%s, starting flush timer, at time=%ld\n",
-	     __FUNCTION__, (long)GetTimeInMillis()));
+		return true;
 
-	/* Initial redraw hopefully before this vblank */
-	to.it_value.tv_sec = 0;
-	to.it_value.tv_nsec = sna->vblank_interval / 2;
+	if (sna->timer_active & (1<<(FLUSH_TIMER))) {
+		if (sna->kgem.flush_now) {
+			sna->kgem.flush_now = 0;
+			if (priv->gpu_bo->exec) {
+				DBG(("%s -- forcing flush\n", __FUNCTION__));
+				sna->timer_ready |= 1 << FLUSH_TIMER;
+			}
+		}
 
-	/* Then periodic updates for every vblank */
-	to.it_interval.tv_sec = 0;
-	to.it_interval.tv_nsec = sna->vblank_interval;
-	timerfd_settime(sna->timer[FLUSH_TIMER], 0, &to, NULL);
+		if (sna->timer_ready & (1<<(FLUSH_TIMER))) {
+			sna->timer_expire[FLUSH_TIMER] =
+				sna->time + sna->vblank_interval;
+			return true;
+		}
+	} else {
+		if (priv->cpu_damage == NULL && priv->gpu_bo->exec == NULL) {
+			DBG(("%s -- no pending write to scanout\n", __FUNCTION__));
+		} else {
+			sna->timer_active |= 1 << FLUSH_TIMER;
+			sna->timer_ready |= 1 << FLUSH_TIMER;
+			sna->timer_expire[FLUSH_TIMER] =
+				sna->time + sna->vblank_interval / 2;
+		}
+	}
 
-	sna->timer_active |= 1 << FLUSH_TIMER;
-	return FALSE;
+	return false;
 }
 
-static Bool sna_accel_do_expire(struct sna *sna)
+static bool sna_accel_do_expire(struct sna *sna)
 {
-	struct itimerspec to;
-
-	return_if_timer_active(EXPIRE_TIMER);
-
-	if (!sna->kgem.need_expire)
-		return FALSE;
-
-	if (sna->timer[EXPIRE_TIMER] == -1)
-		return TRUE;
-
-	to.it_interval.tv_sec = MAX_INACTIVE_TIME;
-	to.it_interval.tv_nsec = 0;
-	to.it_value = to.it_interval;
-	timerfd_settime(sna->timer[EXPIRE_TIMER], 0, &to, NULL);
+	if (sna->timer_active & (1<<(EXPIRE_TIMER))) {
+		if (sna->timer_ready & (1<<(EXPIRE_TIMER))) {
+			sna->timer_expire[EXPIRE_TIMER] =
+				sna->time + MAX_INACTIVE_TIME * 1000;
+			return true;
+		}
+	} else {
+		if (sna->kgem.need_expire) {
+			sna->timer_active |= 1 << EXPIRE_TIMER;
+			sna->timer_ready |= 1 << EXPIRE_TIMER;
+			sna->timer_expire[EXPIRE_TIMER] =
+				sna->time + MAX_INACTIVE_TIME * 1000;
+		}
+	}
 
-	sna->timer_active |= 1 << EXPIRE_TIMER;
-	return FALSE;
+	return false;
 }
 
-static Bool sna_accel_do_inactive(struct sna *sna)
+static bool sna_accel_do_inactive(struct sna *sna)
 {
-	struct itimerspec to;
-
-	return_if_timer_active(INACTIVE_TIMER);
-
-	if (list_is_empty(&sna->active_pixmaps))
-		return FALSE;
-
-	if (sna->timer[INACTIVE_TIMER] == -1)
-		return FALSE;
-
-	/* Periodic expiration after every 2 minutes. */
-	to.it_interval.tv_sec = 120;
-	to.it_interval.tv_nsec = 0;
-	to.it_value = to.it_interval;
-	timerfd_settime(sna->timer[INACTIVE_TIMER], 0, &to, NULL);
+	if (sna->timer_active & (1<<(INACTIVE_TIMER))) {
+		if (sna->timer_ready & (1<<(INACTIVE_TIMER))) {
+			sna->timer_expire[INACTIVE_TIMER] =
+				sna->time + 120 * 1000;
+			return true;
+		}
+	} else {
+		if (!list_is_empty(&sna->active_pixmaps)) {
+			sna->timer_active |= 1 << INACTIVE_TIMER;
+			sna->timer_ready |= 1 << INACTIVE_TIMER;
+			sna->timer_expire[INACTIVE_TIMER] =
+				sna->time + 120 * 1000;
+		}
+	}
 
-	sna->timer_active |= 1 << INACTIVE_TIMER;
-	return FALSE;
+	return false;
 }
 
-static void sna_accel_create_timers(struct sna *sna)
+static CARD32 sna_timeout(OsTimerPtr timer, CARD32 now, pointer arg)
 {
-	int id;
+	struct sna *sna = arg;
+	CARD32 next = UINT32_MAX;
+	uint32_t active;
+	int i;
 
-	/* XXX Can we replace this with OSTimer provided by dix? */
+	active = sna->timer_active & ~sna->timer_ready;
+	if (active == 0)
+		return 0;
 
-#ifdef CLOCK_MONOTONIC_COARSE
-	for (id = 0; id < NUM_FINE_TIMERS; id++)
-		sna->timer[id] = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK);
-	for (; id < NUM_TIMERS; id++) {
-		sna->timer[id] = timerfd_create(CLOCK_MONOTONIC_COARSE, TFD_NONBLOCK);
-		if (sna->timer[id] == -1)
-			sna->timer[id] = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK);
+	for (i = 0; i < NUM_TIMERS; i++) {
+		if (active & (1 << i)) {
+			if (now > sna->timer_expire[i]) {
+				sna->timer_ready |= 1 << i;
+			} else {
+				if (sna->timer_expire[i] < next)
+					next = sna->timer_expire[i];
+			}
+		}
 	}
-#else
-	for (id = 0; id < NUM_TIMERS; id++)
-		sna->timer[id] = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK);
-#endif
-}
-#else
-static void sna_accel_create_timers(struct sna *sna)
-{
-	int id;
 
-	for (id = 0; id < NUM_TIMERS; id++)
-		sna->timer[id] = -1;
+	if ((sna->timer_active & ~sna->timer_ready) == 0)
+		return 0;
+
+	assert(next != UINT32_MAX);
+	return next - now;
 }
-static Bool sna_accel_do_flush(struct sna *sna) { return sna_accel_scanout(sna) != NULL; }
-static Bool sna_accel_do_expire(struct sna *sna) { return sna->kgem.need_expire; }
-static Bool sna_accel_do_inactive(struct sna *sna) { return FALSE; }
-static void sna_accel_drain_timer(struct sna *sna, int id) { }
-static void _sna_accel_disarm_timer(struct sna *sna, int id) { }
-#endif
 
 static bool sna_accel_flush(struct sna *sna)
 {
@@ -12131,14 +12089,14 @@ static bool sna_accel_flush(struct sna *sna)
 	bool busy = priv->cpu_damage || need_throttle;
 
 	DBG(("%s (time=%ld), cpu damage? %p, exec? %d nbatch=%d, busy? %d, need_throttle=%d\n",
-	     __FUNCTION__, (long)GetTimeInMillis(),
+	     __FUNCTION__, (long)sna->time,
 	     priv->cpu_damage,
 	     priv->gpu_bo->exec != NULL,
 	     sna->kgem.nbatch,
 	     sna->kgem.busy, need_throttle));
 
 	if (!sna->kgem.busy && !busy)
-		_sna_accel_disarm_timer(sna, FLUSH_TIMER);
+		sna_accel_disarm_timer(sna, FLUSH_TIMER);
 	sna->kgem.busy = busy;
 
 	if (priv->cpu_damage)
@@ -12152,10 +12110,10 @@ static bool sna_accel_flush(struct sna *sna)
 
 static void sna_accel_expire(struct sna *sna)
 {
-	DBG(("%s (time=%ld)\n", __FUNCTION__, (long)GetTimeInMillis()));
+	DBG(("%s (time=%ld)\n", __FUNCTION__, (long)sna->time));
 
 	if (!kgem_expire_cache(&sna->kgem))
-		_sna_accel_disarm_timer(sna, EXPIRE_TIMER);
+		sna_accel_disarm_timer(sna, EXPIRE_TIMER);
 }
 
 static void sna_accel_inactive(struct sna *sna)
@@ -12163,7 +12121,7 @@ static void sna_accel_inactive(struct sna *sna)
 	struct sna_pixmap *priv;
 	struct list preserve;
 
-	DBG(("%s (time=%ld)\n", __FUNCTION__, (long)GetTimeInMillis()));
+	DBG(("%s (time=%ld)\n", __FUNCTION__, (long)sna->time));
 
 #if DEBUG_ACCEL
 	{
@@ -12252,22 +12210,12 @@ static void sna_accel_inactive(struct sna *sna)
 	if (list_is_empty(&sna->inactive_clock[1]) &&
 	    list_is_empty(&sna->inactive_clock[0]) &&
 	    list_is_empty(&sna->active_pixmaps))
-		_sna_accel_disarm_timer(sna, INACTIVE_TIMER);
-}
-
-static void sna_accel_install_timers(struct sna *sna)
-{
-	int n;
-
-	for (n = 0; n < NUM_TIMERS; n++) {
-		if (sna->timer[n] != -1)
-			AddGeneralSocket(sna->timer[n]);
-	}
+		sna_accel_disarm_timer(sna, INACTIVE_TIMER);
 }
 
 Bool sna_accel_pre_init(struct sna *sna)
 {
-	sna_accel_create_timers(sna);
+	sna->timer = TimerSet(NULL, 0, 0, sna_timeout, sna);
 	return TRUE;
 }
 
@@ -12287,7 +12235,6 @@ Bool sna_accel_init(ScreenPtr screen, struct sna *sna)
 	list_init(&sna->inactive_clock[1]);
 
 	AddGeneralSocket(sna->kgem.fd);
-	sna_accel_install_timers(sna);
 
 	screen->CreateGC = sna_create_gc;
 	screen->GetImage = sna_get_image;
@@ -12413,17 +12360,17 @@ static void sna_accel_throttle(struct sna *sna)
 	if (sna->flags & SNA_NO_THROTTLE)
 		return;
 
-	DBG(("%s (time=%ld)\n", __FUNCTION__, (long)GetTimeInMillis()));
+	DBG(("%s (time=%ld)\n", __FUNCTION__, (long)sna->time));
 
 	kgem_throttle(&sna->kgem);
 }
 
 void sna_accel_block_handler(struct sna *sna)
 {
-	if (sna_accel_do_flush(sna)) {
-		if (sna_accel_flush(sna))
-			sna_accel_throttle(sna);
-	}
+	sna->time = GetTimeInMillis();
+
+	if (sna_accel_do_flush(sna) && sna_accel_flush(sna))
+		sna_accel_throttle(sna);
 
 	if (sna_accel_do_expire(sna))
 		sna_accel_expire(sna);
@@ -12437,31 +12384,20 @@ void sna_accel_block_handler(struct sna *sna)
 		sna->watch_flush = 0;
 	}
 
-	sna->timer_ready = 0;
+	if (sna->timer_ready) {
+		sna->timer_ready = 0;
+		TimerForce(sna->timer);
+	}
 }
 
 void sna_accel_wakeup_handler(struct sna *sna, fd_set *ready)
 {
-	int id, active;
-
 	if (sna->kgem.need_retire)
 		kgem_retire(&sna->kgem);
 	if (sna->kgem.need_purge)
 		kgem_purge_cache(&sna->kgem);
-
-	active = sna->timer_active & ~sna->timer_ready;
-	for (id = 0; id < NUM_TIMERS; id++)
-		if (active & (1 << id) && FD_ISSET(sna->timer[id], ready))
-			sna->timer_ready |= 1 << id;
 }
 
 void sna_accel_free(struct sna *sna)
 {
-	int id;
-
-	for (id = 0; id < NUM_TIMERS; id++)
-		if (sna->timer[id] != -1) {
-			close(sna->timer[id]);
-			sna->timer[id] = -1;
-		}
 }
diff --git a/src/sna/sna_display.c b/src/sna/sna_display.c
index 6287cd9..65d1992 100644
--- a/src/sna/sna_display.c
+++ b/src/sna/sna_display.c
@@ -643,8 +643,8 @@ static void update_flush_interval(struct sna *sna)
 	if (max_vrefresh == 0)
 		max_vrefresh = 40;
 
-	sna->vblank_interval = 1000 * 1000 * 1000 / max_vrefresh; /* Hz -> ns */
-	DBG(("max_vrefresh=%d, vblank_interval=%d ns\n",
+	sna->vblank_interval = 1000 / max_vrefresh; /* Hz -> ms */
+	DBG(("max_vrefresh=%d, vblank_interval=%d ms\n",
 	       max_vrefresh, sna->vblank_interval));
 }
 
commit 495c7446fb5fa0056d388008091a388274a5d7fb
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sun May 6 21:51:27 2012 +0100

    sna: Remove short-circuiting for large font fallbacks
    
    Unlike the fallback for an unhandled depth, we need to ensure that the
    pixmaps are mapped to the CPU before calling the fb routines.
    
    Reported-by: Toralf Förster <toralf.foerster at gmx.de>
    Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=49558
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 1bc34dc..3e03934 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -10796,9 +10796,6 @@ sna_poly_text8(DrawablePtr drawable, GCPtr gc,
 	if (drawable->depth < 8)
 		goto fallback;
 
-	if (sna_font_too_large(gc->font))
-		goto fallback;
-
 	for (i = n = 0; i < count; i++) {
 		if (sna_get_glyph8(gc->font, priv, chars[i], &info[n]))
 			n++;
@@ -10828,8 +10825,11 @@ sna_poly_text8(DrawablePtr drawable, GCPtr gc,
 	if (!ACCEL_POLY_TEXT8)
 		goto force_fallback;
 
+	if (sna_font_too_large(gc->font))
+		goto force_fallback;
+
 	if (!PM_IS_SOLID(drawable, gc->planemask))
-		return false;
+		goto force_fallback;
 
 	if (!gc_is_solid(gc, &fg))
 		goto force_fallback;
@@ -10888,9 +10888,6 @@ sna_poly_text16(DrawablePtr drawable, GCPtr gc,
 	if (drawable->depth < 8)
 		goto fallback;
 
-	if (sna_font_too_large(gc->font))
-		goto fallback;
-
 	for (i = n = 0; i < count; i++) {
 		if (sna_get_glyph16(gc->font, priv, chars[i], &info[n]))
 			n++;
@@ -10920,8 +10917,11 @@ sna_poly_text16(DrawablePtr drawable, GCPtr gc,
 	if (!ACCEL_POLY_TEXT16)
 		goto force_fallback;
 
+	if (sna_font_too_large(gc->font))
+		goto force_fallback;
+
 	if (!PM_IS_SOLID(drawable, gc->planemask))
-		return false;
+		goto force_fallback;
 
 	if (!gc_is_solid(gc, &fg))
 		goto force_fallback;
@@ -10980,9 +10980,6 @@ sna_image_text8(DrawablePtr drawable, GCPtr gc,
 	if (drawable->depth < 8)
 		goto fallback;
 
-	if (sna_font_too_large(gc->font))
-		goto fallback;
-
 	for (i = n = 0; i < count; i++) {
 		if (sna_get_glyph8(gc->font, priv, chars[i], &info[n]))
 			n++;
@@ -11024,6 +11021,9 @@ sna_image_text8(DrawablePtr drawable, GCPtr gc,
 	if (!ACCEL_IMAGE_TEXT8)
 		goto force_fallback;
 
+	if (sna_font_too_large(gc->font))
+		goto force_fallback;
+
 	if (!PM_IS_SOLID(drawable, gc->planemask))
 		goto force_fallback;
 
@@ -11074,9 +11074,6 @@ sna_image_text16(DrawablePtr drawable, GCPtr gc,
 	if (drawable->depth < 8)
 		goto fallback;
 
-	if (sna_font_too_large(gc->font))
-		goto fallback;
-
 	for (i = n = 0; i < count; i++) {
 		if (sna_get_glyph16(gc->font, priv, chars[i], &info[n]))
 			n++;
@@ -11118,6 +11115,9 @@ sna_image_text16(DrawablePtr drawable, GCPtr gc,
 	if (!ACCEL_IMAGE_TEXT16)
 		goto force_fallback;
 
+	if (sna_font_too_large(gc->font))
+		goto force_fallback;
+
 	if (!PM_IS_SOLID(drawable, gc->planemask))
 		goto force_fallback;
 
commit e943db57db0ce3664c8adf4bd9bde53727c7a0bb
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sun May 6 12:40:54 2012 +0100

    sna/gen2+: Fix typo for computing redirected extents for render copy
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
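
The typo had every comparison backwards: the loop tightened the extents toward an intersection of the boxes instead of widening them to the union, so later boxes could fall outside the redirected render target. The corrected pattern, as a self-contained sketch (Box stands in for the server's BoxRec, with x1/y1 the top-left and x2/y2 the bottom-right):

	struct Box { int x1, y1, x2, y2; };

	/* union of n >= 1 boxes: start from box[0] and only ever widen */
	static struct Box box_union(const struct Box *box, int n)
	{
		struct Box extents = box[0];
		int i;

		for (i = 1; i < n; i++) {
			if (box[i].x1 < extents.x1) extents.x1 = box[i].x1;
			if (box[i].y1 < extents.y1) extents.y1 = box[i].y1;
			if (box[i].x2 > extents.x2) extents.x2 = box[i].x2;
			if (box[i].y2 > extents.y2) extents.y2 = box[i].y2;
		}
		return extents;
	}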

diff --git a/src/sna/gen2_render.c b/src/sna/gen2_render.c
index 0ad346e..9717aac 100644
--- a/src/sna/gen2_render.c
+++ b/src/sna/gen2_render.c
@@ -2909,14 +2909,14 @@ fallback:
 		int i;
 
 		for (i = 1; i < n; i++) {
-			if (extents.x1 < box[i].x1)
+			if (box[i].x1 < extents.x1)
 				extents.x1 = box[i].x1;
-			if (extents.y1 < box[i].y1)
+			if (box[i].y1 < extents.y1)
 				extents.y1 = box[i].y1;
 
-			if (extents.x2 > box[i].x2)
+			if (box[i].x2 > extents.x2)
 				extents.x2 = box[i].x2;
-			if (extents.y2 > box[i].y2)
+			if (box[i].y2 > extents.y2)
 				extents.y2 = box[i].y2;
 		}
 		if (!sna_render_composite_redirect(sna, &tmp,
diff --git a/src/sna/gen3_render.c b/src/sna/gen3_render.c
index 680d36f..e73c707 100644
--- a/src/sna/gen3_render.c
+++ b/src/sna/gen3_render.c
@@ -4069,14 +4069,14 @@ fallback_blt:
 		int i;
 
 		for (i = 1; i < n; i++) {
-			if (extents.x1 < box[i].x1)
+			if (box[i].x1 < extents.x1)
 				extents.x1 = box[i].x1;
-			if (extents.y1 < box[i].y1)
+			if (box[i].y1 < extents.y1)
 				extents.y1 = box[i].y1;
 
-			if (extents.x2 > box[i].x2)
+			if (box[i].x2 > extents.x2)
 				extents.x2 = box[i].x2;
-			if (extents.y2 > box[i].y2)
+			if (box[i].y2 > extents.y2)
 				extents.y2 = box[i].y2;
 		}
 		if (!sna_render_composite_redirect(sna, &tmp,
diff --git a/src/sna/gen4_render.c b/src/sna/gen4_render.c
index b8a2459..9cad75e 100644
--- a/src/sna/gen4_render.c
+++ b/src/sna/gen4_render.c
@@ -2655,14 +2655,14 @@ fallback_blt:
 		int i;
 
 		for (i = 1; i < n; i++) {
-			if (extents.x1 < box[i].x1)
+			if (box[i].x1 < extents.x1)
 				extents.x1 = box[i].x1;
-			if (extents.y1 < box[i].y1)
+			if (box[i].y1 < extents.y1)
 				extents.y1 = box[i].y1;
 
-			if (extents.x2 > box[i].x2)
+			if (box[i].x2 > extents.x2)
 				extents.x2 = box[i].x2;
-			if (extents.y2 > box[i].y2)
+			if (box[i].y2 > extents.y2)
 				extents.y2 = box[i].y2;
 		}
 		if (!sna_render_composite_redirect(sna, &tmp,
diff --git a/src/sna/gen5_render.c b/src/sna/gen5_render.c
index 1fb7f65..54d0b22 100644
--- a/src/sna/gen5_render.c
+++ b/src/sna/gen5_render.c
@@ -2962,14 +2962,14 @@ fallback_blt:
 		int i;
 
 		for (i = 1; i < n; i++) {
-			if (extents.x1 < box[i].x1)
+			if (box[i].x1 < extents.x1)
 				extents.x1 = box[i].x1;
-			if (extents.y1 < box[i].y1)
+			if (box[i].y1 < extents.y1)
 				extents.y1 = box[i].y1;
 
-			if (extents.x2 > box[i].x2)
+			if (box[i].x2 > extents.x2)
 				extents.x2 = box[i].x2;
-			if (extents.y2 > box[i].y2)
+			if (box[i].y2 > extents.y2)
 				extents.y2 = box[i].y2;
 		}
 
diff --git a/src/sna/gen6_render.c b/src/sna/gen6_render.c
index 38fb024..55673bf 100644
--- a/src/sna/gen6_render.c
+++ b/src/sna/gen6_render.c
@@ -3339,14 +3339,14 @@ fallback_blt:
 		int i;
 
 		for (i = 1; i < n; i++) {
-			if (extents.x1 < box[i].x1)
+			if (box[i].x1 < extents.x1)
 				extents.x1 = box[i].x1;
-			if (extents.y1 < box[i].y1)
+			if (box[i].y1 < extents.y1)
 				extents.y1 = box[i].y1;
 
-			if (extents.x2 > box[i].x2)
+			if (box[i].x2 > extents.x2)
 				extents.x2 = box[i].x2;
-			if (extents.y2 > box[i].y2)
+			if (box[i].y2 > extents.y2)
 				extents.y2 = box[i].y2;
 		}
 		if (!sna_render_composite_redirect(sna, &tmp,
diff --git a/src/sna/gen7_render.c b/src/sna/gen7_render.c
index 6c9a833..e128f5c 100644
--- a/src/sna/gen7_render.c
+++ b/src/sna/gen7_render.c
@@ -3422,14 +3422,14 @@ fallback_blt:
 		int i;
 
 		for (i = 1; i < n; i++) {
-			if (extents.x1 < box[i].x1)
+			if (box[i].x1 < extents.x1)
 				extents.x1 = box[i].x1;
-			if (extents.y1 < box[i].y1)
+			if (box[i].y1 < extents.y1)
 				extents.y1 = box[i].y1;
 
-			if (extents.x2 > box[i].x2)
+			if (box[i].x2 > extents.x2)
 				extents.x2 = box[i].x2;
-			if (extents.y2 > box[i].y2)
+			if (box[i].y2 > extents.y2)
 				extents.y2 = box[i].y2;
 		}
 		if (!sna_render_composite_redirect(sna, &tmp,
commit 703ce6f9d28bf26d719ffd996d1b37396cd7d367
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri May 4 20:56:37 2012 +0100

    sna: Add a pair of asserts to track down a NULL pointer dereference
    
    Looks like the assumption about the location of the data is invalid;
    an allocation failure, perhaps?
    
    References: https://bugs.freedesktop.org/show_bug.cgi?id=47597
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_render.c b/src/sna/sna_render.c
index dee6608..880e173 100644
--- a/src/sna/sna_render.c
+++ b/src/sna/sna_render.c
@@ -460,6 +460,8 @@ static struct kgem_bo *upload(struct sna *sna,
 			priv->mapped = false;
 		}
 		if (pixmap->devPrivate.ptr == NULL) {
+			assert(priv->ptr);
+			assert(priv->stride);
 			pixmap->devPrivate.ptr = priv->ptr;
 			pixmap->devKind = priv->stride;
 		}
commit b932086623a92a23a1df2eeda3a438b2ed953acc
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri May 4 10:09:46 2012 +0100

    sna/dri: Only track a single pending flip across all pipes
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna.h b/src/sna/sna.h
index e07a115..3e4b8ec 100644
--- a/src/sna/sna.h
+++ b/src/sna/sna.h
@@ -249,7 +249,7 @@ struct sna {
 	} mode;
 
 	struct sna_dri {
-		void *flip_pending[2];
+		void *flip_pending;
 	} dri;
 
 	unsigned int tiling;
diff --git a/src/sna/sna_dri.c b/src/sna/sna_dri.c
index 32602d5..2b97e68 100644
--- a/src/sna/sna_dri.c
+++ b/src/sna/sna_dri.c
@@ -937,6 +937,9 @@ can_flip(struct sna * sna,
 	WindowPtr win = (WindowPtr)draw;
 	PixmapPtr pixmap;
 
+	if (!sna->scrn->vtSema)
+		return FALSE;
+
 	if (draw->type == DRAWABLE_PIXMAP)
 		return FALSE;
 
@@ -1109,7 +1112,7 @@ sna_dri_flip_continue(struct sna *sna,
 
 	info->next_front.name = 0;
 
-	sna->dri.flip_pending[info->pipe] = info;
+	sna->dri.flip_pending = info;
 
 	return TRUE;
 }
@@ -1171,8 +1174,8 @@ static void sna_dri_flip_event(struct sna *sna,
 		break;
 
 	case DRI2_FLIP_THROTTLE:
-		assert(sna->dri.flip_pending[flip->pipe] == flip);
-		sna->dri.flip_pending[flip->pipe] = NULL;
+		assert(sna->dri.flip_pending == flip);
+		sna->dri.flip_pending = NULL;
 
 		if (flip->next_front.name &&
 		    flip->drawable_id &&
@@ -1204,9 +1207,9 @@ static void sna_dri_flip_event(struct sna *sna,
 	case DRI2_ASYNC_FLIP:
 		DBG(("%s: async swap flip completed on pipe %d, pending? %d, new? %d\n",
 		     __FUNCTION__, flip->pipe,
-		     sna->dri.flip_pending[flip->pipe] != NULL,
+		     sna->dri.flip_pending != NULL,
 		     flip->front->name != flip->old_front.name));
-		assert(sna->dri.flip_pending[flip->pipe] == flip);
+		assert(sna->dri.flip_pending == flip);
 
 		if (flip->front->name != flip->next_front.name) {
 			DBG(("%s: async flip continuing\n", __FUNCTION__));
@@ -1240,7 +1243,7 @@ finish_async_flip:
 			flip->next_front.bo = NULL;
 
 			DBG(("%s: async flip completed\n", __FUNCTION__));
-			sna->dri.flip_pending[flip->pipe] = NULL;
+			sna->dri.flip_pending = NULL;
 			sna_dri_frame_event_info_free(flip);
 		}
 		break;
@@ -1326,9 +1329,9 @@ sna_dri_schedule_flip(ClientPtr client, DrawablePtr draw, DRI2BufferPtr front,
 		int type = DRI2_FLIP_THROTTLE;
 
 		DBG(("%s: performing immediate swap on pipe %d, pending? %d\n",
-		     __FUNCTION__, pipe, sna->dri.flip_pending[pipe] != NULL));
+		     __FUNCTION__, pipe, sna->dri.flip_pending != NULL));
 
-		info = sna->dri.flip_pending[pipe];
+		info = sna->dri.flip_pending;
 		if (info) {
 			if (info->drawable_id == draw->id) {
 				DBG(("%s: chaining flip\n", __FUNCTION__));
@@ -1382,7 +1385,7 @@ sna_dri_schedule_flip(ClientPtr client, DrawablePtr draw, DRI2BufferPtr front,
 				       CREATE_EXACT);
 		info->back->name = kgem_bo_flink(&sna->kgem,
 						 get_private(info->back)->bo);
-		sna->dri.flip_pending[info->pipe] = info;
+		sna->dri.flip_pending = info;
 
 		DRI2SwapComplete(info->client, draw, 0, 0, 0,
 				 DRI2_EXCHANGE_COMPLETE,
@@ -1697,16 +1700,17 @@ sna_dri_async_swap(ClientPtr client, DrawablePtr draw,
 	struct sna *sna = to_sna_from_drawable(draw);
 	struct sna_dri_frame_event *info;
 	struct kgem_bo *bo;
-	int name, pipe;
+	int name;
 
 	DBG(("%s()\n", __FUNCTION__));
 
-	pipe = sna_dri_get_pipe(draw);
-	if (pipe == -1) {
-		PixmapPtr pixmap = get_drawable_pixmap(draw);
+	if (!sna->scrn->vtSema) {
+		PixmapPtr pixmap;
 
+exchange:
 		DBG(("%s: unattached, exchange pixmaps\n", __FUNCTION__));
 
+		pixmap = get_drawable_pixmap(draw);
 		set_bo(pixmap, get_private(back)->bo);
 		sna_dri_exchange_attachment(front, back);
 		get_private(back)->pixmap = pixmap;
@@ -1733,10 +1737,14 @@ blit:
 	bo = NULL;
 	name = 0;
 
-	info = sna->dri.flip_pending[pipe];
+	info = sna->dri.flip_pending;
 	if (info == NULL) {
-		DBG(("%s: no pending flip on pipe %d, so updating scanout\n",
-		     __FUNCTION__, pipe));
+		int pipe = sna_dri_get_pipe(draw);
+		if (pipe == -1)
+			goto exchange;
+
+		DBG(("%s: no pending flip, so updating scanout\n",
+		     __FUNCTION__));
 
 		info = calloc(1, sizeof(struct sna_dri_frame_event));
 		if (!info)
@@ -1801,7 +1809,7 @@ blit:
 	info->back->name = name;
 
 	set_bo(sna->front, get_private(info->front)->bo);
-	sna->dri.flip_pending[info->pipe] = info;
+	sna->dri.flip_pending = info;
 
 	DRI2SwapComplete(client, draw, 0, 0, 0,
 			 DRI2_EXCHANGE_COMPLETE, func, data);
commit ac85e8915060ea60c8b17e5d9afec965b25f3323
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri May 4 09:50:19 2012 +0100

    sna: Cache the framebuffer id
    
    Also fix up a weakness of only tracking scanout with a single bit, as we
    used to clear it forcibly after every flip.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 666a23f..6e0c3d0 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -1088,6 +1088,25 @@ inline static void kgem_bo_remove_from_active(struct kgem *kgem,
 	assert(list_is_empty(&bo->vma));
 }
 
+static void kgem_bo_clear_scanout(struct kgem *kgem, struct kgem_bo *bo)
+{
+	if (!bo->scanout)
+		return;
+
+	assert(bo->proxy == NULL);
+
+	DBG(("%s: handle=%d, fb=%d\n", __FUNCTION__, bo->handle, bo->delta));
+	if (bo->delta) {
+		drmModeRmFB(kgem->fd, bo->delta);
+		bo->delta = 0;
+	}
+
+	bo->scanout = false;
+	bo->needs_flush = true;
+	bo->flush = false;
+	bo->reusable = true;
+}
+
 static void __kgem_bo_destroy(struct kgem *kgem, struct kgem_bo *bo)
 {
 	DBG(("%s: handle=%d\n", __FUNCTION__, bo->handle));
@@ -1096,6 +1115,7 @@ static void __kgem_bo_destroy(struct kgem *kgem, struct kgem_bo *bo)
 	assert(bo->refcnt == 0);
 
 	bo->binding.offset = 0;
+	kgem_bo_clear_scanout(kgem, bo);
 
 	if (NO_CACHE)
 		goto destroy;
@@ -4151,18 +4171,6 @@ void kgem_bo_set_binding(struct kgem_bo *bo, uint32_t format, uint16_t offset)
 	}
 }
 
-void kgem_bo_clear_scanout(struct kgem *kgem, struct kgem_bo *bo)
-{
-	bo->needs_flush = true;
-	bo->flush = false;
-
-	if (!bo->scanout)
-		return;
-
-	bo->scanout = false;
-	bo->reusable = true;
-}
-
 struct kgem_bo *
 kgem_replace_bo(struct kgem *kgem,
 		struct kgem_bo *src,
diff --git a/src/sna/kgem.h b/src/sna/kgem.h
index 9eac68e..186eaa0 100644
--- a/src/sna/kgem.h
+++ b/src/sna/kgem.h
@@ -509,8 +509,6 @@ struct kgem_bo *kgem_create_buffer_2d(struct kgem *kgem,
 bool kgem_buffer_is_inplace(struct kgem_bo *bo);
 void kgem_buffer_read_sync(struct kgem *kgem, struct kgem_bo *bo);
 
-void kgem_bo_clear_scanout(struct kgem *kgem, struct kgem_bo *bo);
-
 void kgem_throttle(struct kgem *kgem);
 #define MAX_INACTIVE_TIME 10
 bool kgem_expire_cache(struct kgem *kgem);
diff --git a/src/sna/sna.h b/src/sna/sna.h
index 5272a08..e07a115 100644
--- a/src/sna/sna.h
+++ b/src/sna/sna.h
@@ -318,8 +318,6 @@ extern int sna_page_flip(struct sna *sna,
 
 extern PixmapPtr sna_set_screen_pixmap(struct sna *sna, PixmapPtr pixmap);
 
-void sna_mode_delete_fb(struct sna *sna, uint32_t fb);
-
 constant static inline struct sna *
 to_sna(ScrnInfoPtr scrn)
 {
diff --git a/src/sna/sna_display.c b/src/sna/sna_display.c
index 5275d4a..6287cd9 100644
--- a/src/sna/sna_display.c
+++ b/src/sna/sna_display.c
@@ -151,6 +151,40 @@ int sna_crtc_to_plane(xf86CrtcPtr crtc)
 	return sna_crtc->plane;
 }
 
+static unsigned get_fb(struct sna *sna, struct kgem_bo *bo,
+		       int width, int height)
+{
+	ScrnInfoPtr scrn = sna->scrn;
+	int ret;
+
+	assert(bo->proxy == NULL);
+	if (bo->delta) {
+		DBG(("%s: reusing fb=%d for handle=%d\n",
+		     __FUNCTION__, bo->delta, bo->handle));
+		return bo->delta;
+	}
+
+	DBG(("%s: create fb %dx%d@%d/%d\n",
+	     __FUNCTION__, width, height, scrn->depth, scrn->bitsPerPixel));
+
+	assert(bo->tiling != I915_TILING_Y);
+	ret = drmModeAddFB(sna->kgem.fd,
+			   width, height,
+			   scrn->depth, scrn->bitsPerPixel,
+			   bo->pitch, bo->handle,
+			   &bo->delta);
+	if (ret < 0) {
+		ErrorF("%s: failed to add fb: %dx%d depth=%d, bpp=%d, pitch=%d\n",
+		       __FUNCTION__,
+		       width, height,
+		       scrn->depth, scrn->bitsPerPixel, bo->pitch);
+		return 0;
+	}
+
+	bo->scanout = true;
+	return bo->delta;
+}
+
 static uint32_t gem_create(int fd, int size)
 {
 	struct drm_i915_gem_create create;
@@ -434,10 +468,6 @@ sna_crtc_restore(struct sna *sna)
 	if (!bo)
 		return;
 
-	assert(bo->tiling != I915_TILING_Y);
-	bo->scanout = true;
-	bo->domain = DOMAIN_NONE;
-
 	DBG(("%s: create fb %dx%d@%d/%d\n",
 	     __FUNCTION__,
 	     sna->front->drawable.width,
@@ -446,13 +476,10 @@ sna_crtc_restore(struct sna *sna)
 	     sna->front->drawable.bitsPerPixel));
 
 	sna_mode_remove_fb(sna);
-	if (drmModeAddFB(sna->kgem.fd,
-			 sna->front->drawable.width,
-			 sna->front->drawable.height,
-			 sna->front->drawable.depth,
-			 sna->front->drawable.bitsPerPixel,
-			 bo->pitch, bo->handle,
-			 &sna->mode.fb_id))
+	sna->mode.fb_id = get_fb(sna, bo,
+				 sna->front->drawable.width,
+				 sna->front->drawable.height);
+	if (sna->mode.fb_id == 0)
 		return;
 
 	DBG(("%s: handle %d attached to fb %d\n",
@@ -468,6 +495,7 @@ sna_crtc_restore(struct sna *sna)
 			return;
 	}
 
+	bo->domain = DOMAIN_NONE;
 	scrn->displayWidth = bo->pitch / sna->mode.cpp;
 	sna->mode.fb_pixmap = sna->front->drawable.serialNumber;
 }
@@ -652,28 +680,16 @@ sna_crtc_set_mode_major(xf86CrtcPtr crtc, DisplayModePtr mode,
 		if (!bo)
 			return FALSE;
 
-		DBG(("%s: create fb %dx%d@%d/%d\n",
-		     __FUNCTION__,
-		     scrn->virtualX, scrn->virtualY,
-		     scrn->depth, scrn->bitsPerPixel));
-
-		assert(bo->tiling != I915_TILING_Y);
-		ret = drmModeAddFB(sna->kgem.fd,
-				   scrn->virtualX, scrn->virtualY,
-				   scrn->depth, scrn->bitsPerPixel,
-				   bo->pitch, bo->handle,
-				   &sna_mode->fb_id);
-		if (ret < 0) {
-			ErrorF("%s: failed to add fb: %dx%d depth=%d, bpp=%d, pitch=%d\n",
-			       __FUNCTION__,
-			       scrn->virtualX, scrn->virtualY,
-			       scrn->depth, scrn->bitsPerPixel, bo->pitch);
+		/* recreate the fb in case the size has changed */
+		assert(bo->delta == 0);
+		sna_mode->fb_id = get_fb(sna, bo,
+					 scrn->virtualX, scrn->virtualY);
+		if (sna_mode->fb_id == 0)
 			return FALSE;
-		}
 
 		DBG(("%s: handle %d attached to fb %d\n",
 		     __FUNCTION__, bo->handle, sna_mode->fb_id));
-		bo->scanout = true;
+
 		bo->domain = DOMAIN_NONE;
 		sna_mode->fb_pixmap = sna->front->drawable.serialNumber;
 	}
@@ -773,22 +789,14 @@ sna_crtc_shadow_allocate(xf86CrtcPtr crtc, int width, int height)
 		return NULL;
 	}
 
-	assert(bo->tiling != I915_TILING_Y);
-	if (drmModeAddFB(sna->kgem.fd,
-			 width, height, scrn->depth, scrn->bitsPerPixel,
-			 bo->pitch, bo->handle,
-			 &sna_crtc->shadow_fb_id)) {
-		ErrorF("%s: failed to add rotate  fb: %dx%d depth=%d, bpp=%d, pitch=%d\n",
-		       __FUNCTION__,
-		       width, height,
-		       scrn->depth, scrn->bitsPerPixel, bo->pitch);
+	sna_crtc->shadow_fb_id = get_fb(sna, bo, width, height);
+	if (sna_crtc->shadow_fb_id == 0) {
 		scrn->pScreen->DestroyPixmap(shadow);
 		return NULL;
 	}
 
 	DBG(("%s: attached handle %d to fb %d\n",
 	     __FUNCTION__, bo->handle, sna_crtc->shadow_fb_id));
-	bo->scanout = true;
 	bo->domain = DOMAIN_NONE;
 	return sna_crtc->shadow = shadow;
 }
@@ -813,10 +821,8 @@ sna_crtc_shadow_destroy(xf86CrtcPtr crtc, PixmapPtr pixmap, void *data)
 	DBG(("%s(fb=%d, handle=%d)\n", __FUNCTION__,
 	     sna_crtc->shadow_fb_id, sna_pixmap_get_bo(pixmap)->handle));
 
-	drmModeRmFB(sna->kgem.fd, sna_crtc->shadow_fb_id);
 	sna_crtc->shadow_fb_id = 0;
 
-	kgem_bo_clear_scanout(&sna->kgem, sna_pixmap_get_bo(pixmap));
 	pixmap->drawable.pScreen->DestroyPixmap(pixmap);
 	sna_crtc->shadow = NULL;
 }
@@ -1702,23 +1708,16 @@ sna_crtc_resize(ScrnInfoPtr scrn, int width, int height)
 	if (!bo)
 		goto fail;
 
-	assert(bo->tiling != I915_TILING_Y);
-	if (drmModeAddFB(sna->kgem.fd, width, height,
-			 scrn->depth, scrn->bitsPerPixel,
-			 bo->pitch, bo->handle,
-			 &mode->fb_id)) {
-		ErrorF("%s: failed to add fb: %dx%d depth=%d, bpp=%d, pitch=%d\n",
-		       __FUNCTION__,
-		       width, height,
-		       scrn->depth, scrn->bitsPerPixel, bo->pitch);
+	assert(bo->delta == 0);
+
+	mode->fb_id = get_fb(sna, bo, width, height);
+	if (mode->fb_id == 0)
 		goto fail;
-	}
 
 	DBG(("%s: handle %d, pixmap serial %lu attached to fb %d\n",
 	     __FUNCTION__, bo->handle,
 	     sna->front->drawable.serialNumber, mode->fb_id));
 
-	bo->scanout = true;
 	for (i = 0; i < xf86_config->num_crtc; i++) {
 		xf86CrtcPtr crtc = xf86_config->crtc[i];
 
@@ -1739,17 +1738,12 @@ sna_crtc_resize(ScrnInfoPtr scrn, int width, int height)
 	assert(scrn->pScreen->GetScreenPixmap(scrn->pScreen) == sna->front);
 	assert(scrn->pScreen->GetWindowPixmap(scrn->pScreen->root) == sna->front);
 
-	if (old_fb_id)
-		drmModeRmFB(sna->kgem.fd, old_fb_id);
-	kgem_bo_clear_scanout(&sna->kgem, sna_pixmap_get_bo(old_front));
 	scrn->pScreen->DestroyPixmap(old_front);
 
 	return TRUE;
 
 fail:
 	DBG(("%s: restoring original front pixmap and fb\n", __FUNCTION__));
-	if (old_fb_id != mode->fb_id)
-		drmModeRmFB(sna->kgem.fd, mode->fb_id);
 	mode->fb_id = old_fb_id;
 
 	if (sna->front)
@@ -1843,15 +1837,9 @@ sna_page_flip(struct sna *sna,
 	/*
 	 * Create a new handle for the back buffer
 	 */
-	assert(bo->tiling != I915_TILING_Y);
-	if (drmModeAddFB(sna->kgem.fd, scrn->virtualX, scrn->virtualY,
-			 scrn->depth, scrn->bitsPerPixel,
-			 bo->pitch, bo->handle,
-			 &mode->fb_id)) {
-		ErrorF("%s: failed to add fb: %dx%d depth=%d, bpp=%d, pitch=%d\n",
-		       __FUNCTION__,
-		       scrn->virtualX, scrn->virtualY,
-		       scrn->depth, scrn->bitsPerPixel, bo->pitch);
+	mode->fb_id = get_fb(sna, bo, scrn->virtualX, scrn->virtualY);
+	if (mode->fb_id == 0) {
+		mode->fb_id = *old_fb;
 		return 0;
 	}
 
@@ -1871,23 +1859,14 @@ sna_page_flip(struct sna *sna,
 	 */
 	count = do_page_flip(sna, data, ref_crtc_hw_id);
 	DBG(("%s: page flipped %d crtcs\n", __FUNCTION__, count));
-	if (count) {
-		bo->scanout = true;
+	if (count)
 		bo->domain = DOMAIN_NONE;
-	} else {
-		drmModeRmFB(sna->kgem.fd, mode->fb_id);
+	else
 		mode->fb_id = *old_fb;
-	}
 
 	return count;
 }
 
-void sna_mode_delete_fb(struct sna *sna, uint32_t fb)
-{
-	if (fb)
-		drmModeRmFB(sna->kgem.fd, fb);
-}
-
 static const xf86CrtcConfigFuncsRec sna_crtc_config_funcs = {
 	sna_crtc_resize
 };
@@ -1932,11 +1911,8 @@ sna_mode_remove_fb(struct sna *sna)
 	DBG(("%s: deleting fb id %d for pixmap serial %d\n",
 	     __FUNCTION__, mode->fb_id,mode->fb_pixmap));
 
-	if (mode->fb_id) {
-		drmModeRmFB(sna->kgem.fd, mode->fb_id);
-		mode->fb_id = 0;
-		mode->fb_pixmap = 0;
-	}
+	mode->fb_id = 0;
+	mode->fb_pixmap = 0;
 }
 
 void
@@ -1957,7 +1933,6 @@ sna_mode_fini(struct sna *sna)
 #endif
 
 	sna_mode_remove_fb(sna);
-	kgem_bo_clear_scanout(&sna->kgem, sna_pixmap_get_bo(sna->front));
 
 	/* mode->shadow_fb_id should have been destroyed already */
 }
diff --git a/src/sna/sna_dri.c b/src/sna/sna_dri.c
index d5eefcb..32602d5 100644
--- a/src/sna/sna_dri.c
+++ b/src/sna/sna_dri.c
@@ -336,7 +336,6 @@ static void _sna_dri_destroy_buffer(struct sna *sna, DRI2Buffer2Ptr buffer)
 		}
 
 		private->bo->flush = 0;
-		kgem_bo_clear_scanout(&sna->kgem, private->bo); /* paranoia */
 		kgem_bo_destroy(&sna->kgem, private->bo);
 
 		free(buffer);
@@ -392,7 +391,6 @@ static void set_bo(PixmapPtr pixmap, struct kgem_bo *bo)
 	sna_damage_destroy(&priv->cpu_damage);
 	priv->undamaged = false;
 
-	kgem_bo_clear_scanout(&sna->kgem, priv->gpu_bo); /* paranoia */
 	kgem_bo_destroy(&sna->kgem, priv->gpu_bo);
 	priv->gpu_bo = ref(bo);
 }
@@ -860,18 +858,10 @@ sna_dri_add_frame_event(struct sna_dri_frame_event *info)
 static void
 sna_dri_frame_event_release_bo(struct kgem *kgem, struct kgem_bo *bo)
 {
-	kgem_bo_clear_scanout(kgem, bo);
 	kgem_bo_destroy(kgem, bo);
 }
 
 static void
-sna_dri_frame_event_finish(struct sna_dri_frame_event *info)
-{
-	sna_mode_delete_fb(info->sna, info->old_fb);
-	kgem_bo_clear_scanout(&info->sna->kgem, info->old_front.bo);
-}
-
-static void
 sna_dri_frame_event_info_free(struct sna_dri_frame_event *info)
 {
 	DBG(("%s: del[%p] (%p, %ld)\n", __FUNCTION__,
@@ -1177,13 +1167,10 @@ static void sna_dri_flip_event(struct sna *sna,
 					 flip->event_data);
 		}
 
-		sna_dri_frame_event_finish(flip);
 		sna_dri_frame_event_info_free(flip);
 		break;
 
 	case DRI2_FLIP_THROTTLE:
-		sna_dri_frame_event_finish(flip);
-
 		assert(sna->dri.flip_pending[flip->pipe] == flip);
 		sna->dri.flip_pending[flip->pipe] = NULL;
 
@@ -1221,8 +1208,6 @@ static void sna_dri_flip_event(struct sna *sna,
 		     flip->front->name != flip->old_front.name));
 		assert(sna->dri.flip_pending[flip->pipe] == flip);
 
-		sna_dri_frame_event_finish(flip);
-
 		if (flip->front->name != flip->next_front.name) {
 			DBG(("%s: async flip continuing\n", __FUNCTION__));
 
@@ -1801,7 +1786,6 @@ blit:
 		}
 		info->front->name = info->back->name;
 		get_private(info->front)->bo = get_private(info->back)->bo;
-		__kgem_flush(&sna->kgem, get_private(info->back)->bo);
 	}
 
 	if (bo == NULL) {
diff --git a/src/sna/sna_driver.c b/src/sna/sna_driver.c
index 7b3cfce..150e973 100644
--- a/src/sna/sna_driver.c
+++ b/src/sna/sna_driver.c
@@ -784,9 +784,6 @@ static Bool sna_close_screen(int scrnIndex, ScreenPtr screen)
 
 	sna_mode_remove_fb(sna);
 	if (sna->front) {
-		struct kgem_bo *bo = sna_pixmap_get_bo(sna->front);
-		if (bo)
-			kgem_bo_clear_scanout(&sna->kgem, bo); /* valgrind */
 		screen->DestroyPixmap(sna->front);
 		sna->front = NULL;
 	}
commit 0ee86c1d9159e798bbb7427686e5c6c7e55213c0
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri May 4 09:11:31 2012 +0100

    sna/dri: pageflip unref debugging
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna.h b/src/sna/sna.h
index 956a038..5272a08 100644
--- a/src/sna/sna.h
+++ b/src/sna/sna.h
@@ -130,6 +130,8 @@ struct sna_pixmap {
 	uint32_t stride;
 	uint32_t clear_color;
 
+	uint32_t flush;
+
 #define SOURCE_BIAS 4
 	uint16_t source_count;
 	uint8_t pinned :1;
@@ -138,7 +140,6 @@ struct sna_pixmap {
 	uint8_t undamaged :1;
 	uint8_t create :3;
 	uint8_t header :1;
-	uint8_t flush :1;
 };
 
 struct sna_glyph {
diff --git a/src/sna/sna_dri.c b/src/sna/sna_dri.c
index 1ad2ff1..d5eefcb 100644
--- a/src/sna/sna_dri.c
+++ b/src/sna/sna_dri.c
@@ -163,7 +163,7 @@ static struct kgem_bo *sna_pixmap_set_dri(struct sna *sna,
 	if (priv == NULL)
 		return NULL;
 
-	if (priv->flush)
+	if (priv->flush++)
 		return priv->gpu_bo;
 
 	tiling = color_tiling(sna, &pixmap->drawable);
@@ -181,7 +181,6 @@ static struct kgem_bo *sna_pixmap_set_dri(struct sna *sna,
 
 	/* Don't allow this named buffer to be replaced */
 	priv->pinned = 1;
-	priv->flush = true;
 
 	return priv->gpu_bo;
 }
@@ -317,17 +316,21 @@ static void _sna_dri_destroy_buffer(struct sna *sna, DRI2Buffer2Ptr buffer)
 	if (buffer == NULL)
 		return;
 
+	DBG(("%s: %p [handle=%d] -- refcnt=%d, pixmap=%d\n",
+	     __FUNCTION__, buffer, private->bo->handle, private->refcnt,
+	     private->pixmap ? private->pixmap->drawable.serialNumber : 0));
+
 	if (--private->refcnt == 0) {
 		if (private->pixmap) {
 			ScreenPtr screen = private->pixmap->drawable.pScreen;
 			struct sna_pixmap *priv = sna_pixmap(private->pixmap);
 
 			/* Undo the DRI markings on this pixmap */
-			assert(priv->flush);
-			list_del(&priv->list);
-			sna_accel_watch_flush(sna, -1);
-			priv->pinned = private->pixmap == sna->front;
-			priv->flush = false;
+			if (priv->flush && --priv->flush == 0) {
+				list_del(&priv->list);
+				sna_accel_watch_flush(sna, -1);
+				priv->pinned = private->pixmap == sna->front;
+			}
 
 			screen->DestroyPixmap(private->pixmap);
 		}
@@ -1238,6 +1241,8 @@ static void sna_dri_flip_event(struct sna *sna,
 			flip->next_front.name = flip->front->name;
 			flip->off_delay = 5;
 		} else if (--flip->off_delay) {
+			DBG(("%s: queuing no-flip [delay=%d]\n",
+			     __FUNCTION__, flip->off_delay));
 			/* Just queue a no-op flip to trigger another event */
 			flip->count = sna_page_flip(sna,
 						    get_private(flip->front)->bo,
@@ -1765,6 +1770,13 @@ blit:
 			goto blit;
 		}
 
+		DBG(("%s: referencing (%p:%d, %p:%d)\n",
+		     __FUNCTION__,
+		     front, get_private(front)->refcnt,
+		     back, get_private(back)->refcnt));
+		sna_dri_reference_buffer(front);
+		sna_dri_reference_buffer(back);
+
 		if (!sna_dri_page_flip(sna, info)) {
 			sna_dri_frame_event_info_free(info);
 			goto blit;
@@ -1773,9 +1785,6 @@ blit:
 		info->next_front.name = info->front->name;
 		info->next_front.bo = get_private(info->front)->bo;
 		info->off_delay = 5;
-
-		sna_dri_reference_buffer(front);
-		sna_dri_reference_buffer(back);
 	} else if (info->type != DRI2_ASYNC_FLIP) {
 		/* A normal vsync'ed client is finishing, wait for it
 		 * to unpin the old framebuffer before taking over.
commit 985ff724fde6ac6c3165a8362a0e56472d075cc3
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu May 3 23:52:17 2012 +0100

    sna: Ensure drawables are clipped against the pixmap before migration
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna.h b/src/sna/sna.h
index 424738a..956a038 100644
--- a/src/sna/sna.h
+++ b/src/sna/sna.h
@@ -446,16 +446,7 @@ bool must_check sna_drawable_move_region_to_cpu(DrawablePtr drawable,
 						RegionPtr region,
 						unsigned flags);
 
-static inline bool must_check
-sna_drawable_move_to_cpu(DrawablePtr drawable, unsigned flags)
-{
-	RegionRec region;
-
-	pixman_region_init_rect(&region,
-				drawable->x, drawable->y,
-				drawable->width, drawable->height);
-	return sna_drawable_move_region_to_cpu(drawable, &region, flags);
-}
+bool must_check sna_drawable_move_to_cpu(DrawablePtr drawable, unsigned flags);
 
 static inline bool must_check
 sna_drawable_move_to_gpu(DrawablePtr drawable, unsigned flags)
diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 6430de8..1bc34dc 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -1697,6 +1697,43 @@ out:
 	return true;
 }
 
+bool
+sna_drawable_move_to_cpu(DrawablePtr drawable, unsigned flags)
+{
+	RegionRec region;
+	PixmapPtr pixmap;
+	int16_t dx, dy;
+
+	if (drawable->type == DRAWABLE_PIXMAP)
+		return sna_pixmap_move_to_cpu((PixmapPtr)drawable, flags);
+
+	pixmap = get_window_pixmap((WindowPtr)drawable);
+	get_drawable_deltas(drawable, pixmap, &dx, &dy);
+
+	DBG(("%s: (%d, %d)x(%d, %d) + (%d, %d), flags=%x\n",
+	     __FUNCTION__,
+	     drawable->x, drawable->y,
+	     drawable->width, drawable->height,
+	     dx, dy, flags));
+
+	region.extents.x1 = drawable->x + dx;
+	region.extents.y1 = drawable->y + dy;
+	region.extents.x2 = region.extents.x1 + drawable->width;
+	region.extents.y2 = region.extents.y1 + drawable->height;
+	region.data = NULL;
+
+	if (region.extents.x1 < 0)
+		region.extents.x1 = 0;
+	if (region.extents.y1 < 0)
+		region.extents.y1 = 0;
+	if (region.extents.x2 > pixmap->drawable.width)
+		region.extents.x2 = pixmap->drawable.width;
+	if (region.extents.y2 > pixmap->drawable.height)
+		region.extents.y2 = pixmap->drawable.height;
+
+	return sna_drawable_move_region_to_cpu(&pixmap->drawable, &region, flags);
+}
+
 static bool alu_overwrites(uint8_t alu)
 {
 	switch (alu) {
commit e2cf87cc87535d7407936f34b0e22989c29759ce
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu May 3 23:21:48 2012 +0100

    sna: Compile fix for fresh assertion
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index a116936..666a23f 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -4076,7 +4076,7 @@ void kgem_buffer_read_sync(struct kgem *kgem, struct kgem_bo *_bo)
 	assert(_bo->proxy);
 
 	_bo = _bo->proxy;
-	assert(bo->proxy == NULL);
+	assert(_bo->proxy == NULL);
 	assert(_bo->exec == NULL);
 
 	bo = (struct kgem_partial_bo *)_bo;
commit a543cefc4fa59f4f3ca518cdace08cb0a94273e9
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu May 3 22:33:59 2012 +0100

    sna: Maintain a reference to the chain of proxies
    
    Rather than attempt to flatten the chain to the last link, we may need
    to hold a reference to the intermediate links in case of batch buffer
    submission.
    
    Fixes http://tnsp.org/~ccr/intel-gfx/test.html
    
    Reported-by: Matti Hamalainen <ccr at tnsp.org>
    Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=49436
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
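
The core of the change is in the relocation path: a proxy bo records its byte offset into its target in bo->delta, and since chains of proxies are now kept intact rather than flattened, every intermediate link's offset must be summed while walking to the real buffer. A self-contained sketch of that accumulation (struct bo abbreviates the relevant kgem_bo fields):

	struct bo { struct bo *proxy; unsigned delta; };

	/* resolve a proxy chain: total offset of 'bo' within the real buffer */
	static unsigned resolve_delta(const struct bo *bo, unsigned delta)
	{
		while (bo->proxy) {
			delta += bo->delta;	/* each link contributes its offset */
			bo = bo->proxy;
		}
		return delta;
	}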

diff --git a/src/sna/gen3_render.c b/src/sna/gen3_render.c
index ed2eaf1..680d36f 100644
--- a/src/sna/gen3_render.c
+++ b/src/sna/gen3_render.c
@@ -2381,9 +2381,14 @@ gen3_composite_picture(struct sna *sna,
 		return sna_render_picture_convert(sna, picture, channel, pixmap,
 						  x, y, w, h, dst_x, dst_y);
 
-	if (too_large(pixmap->drawable.width, pixmap->drawable.height))
+	if (too_large(pixmap->drawable.width, pixmap->drawable.height)) {
+		DBG(("%s: pixmap too large (%dx%d), extracting (%d, %d)x(%d,%d)\n",
+		     __FUNCTION__,
+		     pixmap->drawable.width, pixmap->drawable.height,
+		     x, y, w, h));
 		return sna_render_picture_extract(sna, picture, channel,
 						  x, y, w, h, dst_x, dst_y);
+	}
 
 	return sna_render_pixmap_bo(sna, channel, pixmap,
 				    x, y, w, h, dst_x, dst_y);
diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 49fa173..a116936 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -2960,7 +2960,7 @@ bool kgem_check_bo(struct kgem *kgem, ...)
 		if (bo->exec)
 			continue;
 
-		if (bo->proxy) {
+		while (bo->proxy) {
 			bo = bo->proxy;
 			if (bo->exec)
 				continue;
@@ -2989,7 +2989,7 @@ bool kgem_check_bo_fenced(struct kgem *kgem, struct kgem_bo *bo)
 {
 	uint32_t size;
 
-	if (bo->proxy)
+	while (bo->proxy)
 		bo = bo->proxy;
 	if (bo->exec) {
 		if (kgem->gen < 40 &&
@@ -3034,7 +3034,7 @@ bool kgem_check_many_bo_fenced(struct kgem *kgem, ...)
 
 	va_start(ap, kgem);
 	while ((bo = va_arg(ap, struct kgem_bo *))) {
-		if (bo->proxy)
+		while (bo->proxy)
 			bo = bo->proxy;
 		if (bo->exec) {
 			if (kgem->gen >= 40 || bo->tiling == I915_TILING_NONE)
@@ -3098,10 +3098,10 @@ uint32_t kgem_add_reloc(struct kgem *kgem,
 		assert(bo->refcnt);
 		assert(!bo->purged);
 
-		delta += bo->delta;
-		if (bo->proxy) {
-			DBG(("%s: adding proxy for handle=%d\n",
-			     __FUNCTION__, bo->handle));
+		while (bo->proxy) {
+			DBG(("%s: adding proxy [delta=%d] for handle=%d\n",
+			     __FUNCTION__, bo->delta, bo->handle));
+			delta += bo->delta;
 			assert(bo->handle == bo->proxy->handle);
 			/* need to release the cache upon batch submit */
 			list_move(&bo->request, &kgem->next_request->buffers);
@@ -3527,8 +3527,9 @@ struct kgem_bo *kgem_create_proxy(struct kgem_bo *target,
 {
 	struct kgem_bo *bo;
 
-	DBG(("%s: target handle=%d, offset=%d, length=%d, io=%d\n",
-	     __FUNCTION__, target->handle, offset, length, target->io));
+	DBG(("%s: target handle=%d [proxy? %d], offset=%d, length=%d, io=%d\n",
+	     __FUNCTION__, target->handle, target->proxy ? target->proxy->delta : -1,
+	     offset, length, target->io));
 
 	bo = __kgem_bo_alloc(target->handle, length);
 	if (bo == NULL)
@@ -3537,15 +3538,11 @@ struct kgem_bo *kgem_create_proxy(struct kgem_bo *target,
 	bo->reusable = false;
 	bo->size.bytes = length;
 
-	bo->io = target->io;
+	bo->io = target->io && target->proxy == NULL;
 	bo->dirty = target->dirty;
 	bo->tiling = target->tiling;
 	bo->pitch = target->pitch;
 
-	if (target->proxy) {
-		offset += target->delta;
-		target = target->proxy;
-	}
 	bo->proxy = kgem_bo_reference(target);
 	bo->delta = offset;
 	return bo;
@@ -4079,6 +4076,7 @@ void kgem_buffer_read_sync(struct kgem *kgem, struct kgem_bo *_bo)
 	assert(_bo->proxy);
 
 	_bo = _bo->proxy;
+	assert(bo->proxy == NULL);
 	assert(_bo->exec == NULL);
 
 	bo = (struct kgem_partial_bo *)_bo;
diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 3072345..6430de8 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -2138,7 +2138,8 @@ sna_pixmap_create_upload(ScreenPtr screen,
 	int bpp = BitsPerPixel(depth);
 	void *ptr;
 
-	DBG(("%s(%d, %d, %d)\n", __FUNCTION__, width, height, depth));
+	DBG(("%s(%d, %d, %d, flags=%x)\n", __FUNCTION__,
+	     width, height, depth, flags));
 	assert(width);
 	assert(height);
 
diff --git a/src/sna/sna_render.c b/src/sna/sna_render.c
index 3d0f9e9..dee6608 100644
--- a/src/sna/sna_render.c
+++ b/src/sna/sna_render.c
@@ -957,6 +957,10 @@ sna_render_picture_partial(struct sna *sna,
 		kgem_get_tile_size(&sna->kgem, bo->tiling,
 				   &tile_width, &tile_height, &tile_size);
 
+		DBG(("%s: tiling=%d, size=%dx%d, chunk=%d\n",
+		     __FUNCTION__, bo->tiling,
+		     tile_width, tile_height, tile_size));
+
 		/* Ensure we align to an even tile row */
 		box.y1 = box.y1 & ~(2*tile_height - 1);
 		box.y2 = ALIGN(box.y2, 2*tile_height);
@@ -989,8 +993,6 @@ sna_render_picture_partial(struct sna *sna,
 	if (channel->bo == NULL)
 		return 0;
 
-	channel->bo->pitch = bo->pitch;
-
 	if (channel->transform) {
 		memset(&channel->embedded_transform,
 		       0,
diff --git a/src/sna/sna_tiling.c b/src/sna/sna_tiling.c
index d0e5afc..d7e6d40 100644
--- a/src/sna/sna_tiling.c
+++ b/src/sna/sna_tiling.c
@@ -188,6 +188,14 @@ sna_tiling_composite_done(struct sna *sna,
 					if (y2 > y + height)
 						y2 = y + height;
 
+					DBG(("%s: rect[%d] = (%d, %d)x(%d,%d), tile=(%d,%d)x(%d, %d), blt=(%d,%d),(%d,%d), delta=(%d,%d)\n",
+					     __FUNCTION__, n,
+					     r->dst.x, r->dst.y,
+					     r->width, r->height,
+					     x, y, width, height,
+					     x1, y1, x2, y2,
+					     dx, dy));
+
 					if (y2 > y1 && x2 > x1) {
 						struct sna_composite_rectangles rr;
 						rr.src.x = dx + r->src.x;
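
The recurring change in this commit turns the single-step test "if (bo->proxy)" into "while (bo->proxy)": a proxy may now point at another proxy (kgem_create_proxy() no longer flattens chains at creation time), so consumers must walk the whole chain, summing each hop's delta, to reach the buffer that owns the storage. A minimal sketch of the idiom, assuming only the proxy and delta fields of struct kgem_bo:

	#include <stdint.h>

	struct kgem_bo {
		struct kgem_bo *proxy;	/* backing bo, or NULL at the root */
		uint32_t delta;		/* byte offset within the proxy target */
		/* ... */
	};

	/* Walk a proxy chain to the backing bo, accumulating offsets. */
	static struct kgem_bo *resolve_proxy(struct kgem_bo *bo, uint32_t *delta)
	{
		while (bo->proxy) {
			*delta += bo->delta;
			bo = bo->proxy;
		}
		return bo;
	}
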
commit 2181516792fa041d9b36d40bf0a220e7dc2742dd
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu May 3 17:35:24 2012 +0100

    sna: Remove extraneous SCANOUT flags
    
    These are now fixed by obeying a minimum alignment restriction.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_render.c b/src/sna/sna_render.c
index f239555..3d0f9e9 100644
--- a/src/sna/sna_render.c
+++ b/src/sna/sna_render.c
@@ -1846,7 +1846,7 @@ sna_render_composite_redirect(struct sna *sna,
 			    width, height, bpp,
 			    kgem_choose_tiling(&sna->kgem, I915_TILING_X,
 					       width, height, bpp),
-			    CREATE_SCANOUT | CREATE_TEMPORARY);
+			    CREATE_TEMPORARY);
 	if (!bo)
 		return FALSE;
 
diff --git a/src/sna/sna_tiling.c b/src/sna/sna_tiling.c
index dcb4d1d..d0e5afc 100644
--- a/src/sna/sna_tiling.c
+++ b/src/sna/sna_tiling.c
@@ -384,7 +384,7 @@ sna_tiling_fill_boxes(struct sna *sna,
 							       tmp.drawable.width,
 							       tmp.drawable.height,
 							       dst->drawable.bitsPerPixel),
-					    CREATE_SCANOUT | CREATE_TEMPORARY);
+					    CREATE_TEMPORARY);
 			if (bo) {
 				int16_t dx = this.extents.x1;
 				int16_t dy = this.extents.y1;
commit 5a8b17556a5452673677069e88ae5b345519b4ce
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu May 3 17:35:10 2012 +0100

    sna: Fix offset for combining damage
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_damage.h b/src/sna/sna_damage.h
index fb661b2..1196912 100644
--- a/src/sna/sna_damage.h
+++ b/src/sna/sna_damage.h
@@ -36,7 +36,8 @@ static inline void sna_damage_combine(struct sna_damage **l,
 				      struct sna_damage *r,
 				      int dx, int dy)
 {
-	*l = _sna_damage_combine(*l, r, dx, dy);
+	assert(!DAMAGE_IS_ALL(*l));
+	*l = _sna_damage_combine(*l, DAMAGE_PTR(r), dx, dy);
 }
 
 fastcall struct sna_damage *_sna_damage_add(struct sna_damage *damage,
diff --git a/src/sna/sna_render.c b/src/sna/sna_render.c
index ec2aaad..f239555 100644
--- a/src/sna/sna_render.c
+++ b/src/sna/sna_render.c
@@ -1869,6 +1869,7 @@ sna_render_composite_redirect(struct sna *sna,
 	t->real_bo = op->dst.bo;
 	t->real_damage = op->damage;
 	if (op->damage) {
+		assert(!DAMAGE_IS_ALL(op->damage));
 		t->damage = sna_damage_create();
 		op->damage = &t->damage;
 	}
@@ -1899,8 +1900,10 @@ sna_render_composite_redirect_done(struct sna *sna,
 					   &t->box, 1);
 		}
 		if (t->damage) {
+			DBG(("%s: combining damage, offset=(%d, %d)\n",
+			     __FUNCTION__, t->box.x1, t->box.y1));
 			sna_damage_combine(t->real_damage, t->damage,
-					   -t->box.x1, -t->box.y1);
+					   t->box.x1, t->box.y1);
 			__sna_damage_destroy(t->damage);
 		}
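
The sign flip is the substance of the fix: the temporary created by composite redirection has its origin at (box.x1, box.y1) of the real target, so damage recorded in the temporary's coordinate space must be shifted by +box.x1/+box.y1 (not negated) when folded back into the real damage. A minimal sketch of that translation for a single box, using a hypothetical helper over the X server's BoxRec:

	/* Shift a box from temporary-surface coordinates back into
	 * the real destination's coordinate space. */
	static void box_to_real(BoxRec *b, int16_t ox, int16_t oy)
	{
		b->x1 += ox; b->x2 += ox;
		b->y1 += oy; b->y2 += oy;
	}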
 
commit 3e7b00a41e4286e062da205a22bc434be0c8458c
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu May 3 17:34:28 2012 +0100

    sna: Debug option to force particular upload/download paths
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_io.c b/src/sna/sna_io.c
index 7cec3ff..3de164b 100644
--- a/src/sna/sna_io.c
+++ b/src/sna/sna_io.c
@@ -42,6 +42,8 @@
 
 #define PITCH(x, y) ALIGN((x)*(y), 4)
 
+#define FORCE_INPLACE 0
+
 /* XXX Need to avoid using GTT fenced access for I915_TILING_Y on 855GM */
 
 static Bool
@@ -109,6 +111,15 @@ static void read_boxes_inplace(struct kgem *kgem,
 	} while (--n);
 }
 
+static bool download_inplace(struct kgem *kgem, struct kgem_bo *bo)
+{
+	if (FORCE_INPLACE)
+		return FORCE_INPLACE > 0;
+
+	return !kgem_bo_map_will_stall(kgem, bo) ||
+		bo->tiling == I915_TILING_NONE;
+}
+
 void sna_read_boxes(struct sna *sna,
 		    struct kgem_bo *src_bo, int16_t src_dx, int16_t src_dy,
 		    PixmapPtr dst, int16_t dst_dx, int16_t dst_dy,
@@ -150,8 +161,7 @@ void sna_read_boxes(struct sna *sna,
 	 * this path.
 	 */
 
-	if (!kgem_bo_map_will_stall(kgem, src_bo) ||
-	    src_bo->tiling == I915_TILING_NONE) {
+	if (download_inplace(kgem, src_bo)) {
 fallback:
 		read_boxes_inplace(kgem,
 				   src_bo, src_dx, src_dy,
@@ -512,6 +522,9 @@ static bool upload_inplace(struct kgem *kgem,
 			   const BoxRec *box,
 			   int n, int bpp)
 {
+	if (FORCE_INPLACE)
+		return FORCE_INPLACE > 0;
+
 	/* If we are writing through the GTT, check first if we might be
 	 * able to amalgamate a series of small writes into a single
 	 * operation.
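
FORCE_INPLACE is a compile-time tri-state shared by both transfer directions: 0 leaves the runtime heuristics in charge, a positive value pins every transfer to the inplace path, and a negative value pins everything to the indirect path. A standalone sketch of the idiom, with hypothetical names:

	#define FORCE_INPLACE 0	/* 0: heuristic; >0: inplace; <0: indirect */

	static bool choose_inplace(bool heuristic)
	{
		if (FORCE_INPLACE)
			return FORCE_INPLACE > 0;	/* debug override */
		return heuristic;			/* normal decision */
	}
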
commit 63c997c13eff323e71d27f141fc53915ab234d42
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu May 3 16:17:35 2012 +0100

    sna/dri: Balance flush counting
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna.h b/src/sna/sna.h
index 308e329..424738a 100644
--- a/src/sna/sna.h
+++ b/src/sna/sna.h
@@ -129,7 +129,6 @@ struct sna_pixmap {
 
 	uint32_t stride;
 	uint32_t clear_color;
-	unsigned flush;
 
 #define SOURCE_BIAS 4
 	uint16_t source_count;
@@ -139,6 +138,7 @@ struct sna_pixmap {
 	uint8_t undamaged :1;
 	uint8_t create :3;
 	uint8_t header :1;
+	uint8_t flush :1;
 };
 
 struct sna_glyph {
diff --git a/src/sna/sna_dri.c b/src/sna/sna_dri.c
index 1a7b6bd..1ad2ff1 100644
--- a/src/sna/sna_dri.c
+++ b/src/sna/sna_dri.c
@@ -163,7 +163,7 @@ static struct kgem_bo *sna_pixmap_set_dri(struct sna *sna,
 	if (priv == NULL)
 		return NULL;
 
-	if (priv->flush++)
+	if (priv->flush)
 		return priv->gpu_bo;
 
 	tiling = color_tiling(sna, &pixmap->drawable);
@@ -181,6 +181,7 @@ static struct kgem_bo *sna_pixmap_set_dri(struct sna *sna,
 
 	/* Don't allow this named buffer to be replaced */
 	priv->pinned = 1;
+	priv->flush = true;
 
 	return priv->gpu_bo;
 }
@@ -322,12 +323,11 @@ static void _sna_dri_destroy_buffer(struct sna *sna, DRI2Buffer2Ptr buffer)
 			struct sna_pixmap *priv = sna_pixmap(private->pixmap);
 
 			/* Undo the DRI markings on this pixmap */
-			assert(priv->flush > 0);
-			if (--priv->flush == 0) {
-				list_del(&priv->list);
-				sna_accel_watch_flush(sna, -1);
-				priv->pinned = private->pixmap == sna->front;
-			}
+			assert(priv->flush);
+			list_del(&priv->list);
+			sna_accel_watch_flush(sna, -1);
+			priv->pinned = private->pixmap == sna->front;
+			priv->flush = false;
 
 			screen->DestroyPixmap(private->pixmap);
 		}
@@ -649,13 +649,13 @@ sna_dri_copy_region(DrawablePtr draw,
 		     struct kgem_bo *, struct kgem_bo *, bool) = sna_dri_copy;
 
 	if (dst_buffer->attachment == DRI2BufferFrontLeft) {
-		dst = sna_pixmap_set_dri(sna, pixmap);
+		dst = sna_pixmap_get_bo(pixmap);
 		copy = sna_dri_copy_to_front;
 	} else
 		dst = get_private(dst_buffer)->bo;
 
 	if (src_buffer->attachment == DRI2BufferFrontLeft) {
-		src = sna_pixmap_set_dri(sna, pixmap);
+		src = sna_pixmap_get_bo(pixmap);
 		assert(copy == sna_dri_copy);
 		copy = sna_dri_copy_from_front;
 	} else
commit d62f5286453599f25c7268c3075265c7eaf8ed03
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu May 3 14:54:17 2012 +0100

    sna: Minor glyph fallback fixes
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_glyphs.c b/src/sna/sna_glyphs.c
index f4f115e..07b2a94 100644
--- a/src/sna/sna_glyphs.c
+++ b/src/sna/sna_glyphs.c
@@ -1136,6 +1136,8 @@ glyphs_fallback(CARD8 op,
 		mask_image = dst_image;
 		src_x -= x;
 		src_y -= y;
+		x += dst->pDrawable->x;
+		y += dst->pDrawable->y;
 	}
 
 	do {
@@ -1152,7 +1154,7 @@ glyphs_fallback(CARD8 op,
 			glyph_image = sna_glyph(g)->image;
 			if (glyph_image == NULL) {
 				PicturePtr picture;
-				int dx, dy;
+				int gx, gy;
 
 				picture = GlyphPicture(g)[screen];
 				if (picture == NULL)
@@ -1160,11 +1162,11 @@ glyphs_fallback(CARD8 op,
 
 				glyph_image = image_from_pict(picture,
 							      FALSE,
-							      &dx, &dy);
+							      &gx, &gy);
 				if (!glyph_image)
 					goto next_glyph;
 
-				assert(dx == 0 && dy == 0);
+				assert(gx == 0 && gy == 0);
 				sna_glyph(g)->image = glyph_image;
 			}
 
@@ -1191,11 +1193,15 @@ glyphs_fallback(CARD8 op,
 				int xi = x - g->info.x;
 				int yi = y - g->info.y;
 
-				DBG(("%s: glyph+(%d, %d) to dst (%d, %d)x(%d, %d), src (%d, %d) [op=%d]\n",
+				DBG(("%s: glyph+(%d, %d) to dst (%d, %d)x(%d, %d)/[(%d, %d)x(%d, %d)], src (%d, %d) [op=%d]\n",
 				     __FUNCTION__,
 				     dx, dy,
 				     xi, yi,
 				     g->info.width, g->info.height,
+				     dst->pDrawable->x,
+				     dst->pDrawable->y,
+				     dst->pDrawable->width,
+				     dst->pDrawable->height,
 				     src_x + xi,
 				     src_y + yi,
 				     op));
@@ -1261,7 +1267,7 @@ sna_glyphs(CARD8 op,
 	if (REGION_NUM_RECTS(dst->pCompositeClip) == 0)
 		return;
 
-	if (FALLBACK || DEBUG_NO_RENDER)
+	if (FALLBACK || !sna->have_render)
 		goto fallback;
 
 	if (wedged(sna)) {
commit 205fc190269160e4a6dd24e769e0f832006db03a
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu May 3 14:53:29 2012 +0100

    sna: Don't discard GPU buffer if we only want to read back for the operation
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index c84b23e..3072345 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -1315,9 +1315,11 @@ sna_drawable_move_region_to_cpu(DrawablePtr drawable,
 			      pixmap->drawable.width,
 			      pixmap->drawable.height)) {
 		sna_damage_destroy(&priv->gpu_damage);
-		sna_pixmap_free_gpu(sna, priv);
 		priv->undamaged = false;
 
+		if (flags & MOVE_WRITE)
+			sna_pixmap_free_gpu(sna, priv);
+
 		if (pixmap->devPrivate.ptr == NULL &&
 		    !sna_pixmap_alloc_cpu(sna, pixmap, priv, false))
 			return false;
@@ -1458,8 +1460,10 @@ sna_drawable_move_region_to_cpu(DrawablePtr drawable,
 	    !sna_pixmap_alloc_cpu(sna, pixmap, priv, priv->gpu_damage != NULL))
 		return false;
 
-	if (priv->gpu_bo == NULL)
+	if (priv->gpu_bo == NULL) {
+		assert(priv->gpu_damage == NULL);
 		goto done;
+	}
 
 	assert(priv->gpu_bo->proxy == NULL);
 	if (priv->clear) {
commit f948e68b35cce748f296bb9f32febcd250acf731
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu May 3 11:27:44 2012 +0100

    sna: Improve handling of inplace IO for large transfers
    
    If the transfer is large enough to obliterate the caches, then it is
    preferable to do it inplace rather than upload a proxy texture and
    queue a blit. This helps prevent an inconsistency where one layer
    believes the operation should be done inplace only for the IO layer to
    perform an indirect upload.
    
    Testing shows no significant impact upon the cairo-traces, but it does
    prevent x11perf -shmput from exploding.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.h b/src/sna/kgem.h
index 51025ea..9eac68e 100644
--- a/src/sna/kgem.h
+++ b/src/sna/kgem.h
@@ -462,7 +462,7 @@ static inline bool kgem_bo_map_will_stall(struct kgem *kgem, struct kgem_bo *bo)
 	     __FUNCTION__, bo->handle,
 	     bo->domain, bo->presumed_offset, bo->size));
 
-	if (!kgem_bo_is_mappable(kgem, bo))
+	if (!kgem_bo_is_mappable(kgem, bo) && kgem_bo_is_busy(bo))
 		return true;
 
 	if (kgem->wedged)
diff --git a/src/sna/sna_io.c b/src/sna/sna_io.c
index 2539518..7cec3ff 100644
--- a/src/sna/sna_io.c
+++ b/src/sna/sna_io.c
@@ -516,17 +516,16 @@ static bool upload_inplace(struct kgem *kgem,
 	 * able to amalgamate a series of small writes into a single
 	 * operation.
 	 */
-	if (!bo->map) {
+	if (!bo->map || kgem_bo_map_will_stall(kgem, bo)) {
 		unsigned int bytes = 0;
 		while (n--) {
 			bytes += (box->x2 - box->x1) * (box->y2 - box->y1);
 			box++;
 		}
-		if (bytes * bpp >> 12 < kgem->half_cpu_cache_pages)
-			return false;
+		return bytes * bpp >> 12 >= kgem->half_cpu_cache_pages;
 	}
 
-	return !kgem_bo_map_will_stall(kgem, bo);
+	return true;
 }
 
 bool sna_write_boxes(struct sna *sna, PixmapPtr dst,
@@ -570,7 +569,9 @@ fallback:
 	}
 
 	/* Try to avoid switching rings... */
-	if (!can_blt || kgem->ring == KGEM_RENDER ||
+	if (!can_blt ||
+	    kgem->ring == KGEM_RENDER ||
+	    (kgem->has_semaphores && kgem->mode == KGEM_NONE) ||
 	    upload_too_large(sna, extents.x2 - extents.x1, extents.y2 - extents.y1)) {
 		PixmapRec tmp;
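
The kgem.h hunk above is the key enabler: needing to (re)bind a bo into the mappable aperture no longer counts as a stall by itself, only a bo the GPU still has busy does. A minimal sketch of the revised predicate (wedged and cache-domain handling elided):

	static bool map_will_stall(struct kgem *kgem, struct kgem_bo *bo)
	{
		/* An idle bo can be bound and mapped without blocking;
		 * only a busy, unmappable bo forces a wait. */
		return !kgem_bo_is_mappable(kgem, bo) && kgem_bo_is_busy(bo);
	}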
 
commit ce52cb9ca48a32117bb6afa45a3049f8bcefa46b
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon Apr 30 09:24:06 2012 +0100

    sna/gen7: Fix debug printing of primitives
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/gen7_render.c b/src/sna/gen7_render.c
index 327714f..6c9a833 100644
--- a/src/sna/gen7_render.c
+++ b/src/sna/gen7_render.c
@@ -898,7 +898,7 @@ gen7_emit_vertex_elements(struct sna *sna,
 
 	if (op->is_affine) {
 		src_format = GEN7_SURFACEFORMAT_R32G32_FLOAT;
-		w_component = GEN7_VFCOMPONENT_STORE_1_FLT;
+		w_component = GEN7_VFCOMPONENT_STORE_0;
 	} else {
 		src_format = GEN7_SURFACEFORMAT_R32G32B32_FLOAT;
 		w_component = GEN7_VFCOMPONENT_STORE_SRC;
@@ -930,7 +930,7 @@ gen7_emit_vertex_elements(struct sna *sna,
 		  0 << GEN7_VE0_OFFSET_SHIFT); /* offsets vb in bytes */
 	OUT_BATCH(GEN7_VFCOMPONENT_STORE_SRC << GEN7_VE1_VFCOMPONENT_0_SHIFT |
 		  GEN7_VFCOMPONENT_STORE_SRC << GEN7_VE1_VFCOMPONENT_1_SHIFT |
-		  GEN7_VFCOMPONENT_STORE_1_FLT << GEN7_VE1_VFCOMPONENT_2_SHIFT |
+		  GEN7_VFCOMPONENT_STORE_0 << GEN7_VE1_VFCOMPONENT_2_SHIFT |
 		  GEN7_VFCOMPONENT_STORE_1_FLT << GEN7_VE1_VFCOMPONENT_3_SHIFT);
 
 	/* u0, v0, w0 */
diff --git a/src/sna/kgem_debug_gen7.c b/src/sna/kgem_debug_gen7.c
index a7dbbf2..78eae01 100644
--- a/src/sna/kgem_debug_gen7.c
+++ b/src/sna/kgem_debug_gen7.c
@@ -287,8 +287,8 @@ static void primitive_out(struct kgem *kgem, uint32_t *data)
 
 	assert((data[0] & (1<<15)) == 0); /* XXX index buffers */
 
-	for (n = 0; n < data[1]; n++) {
-		int v = data[2] + n;
+	for (n = 0; n < data[2]; n++) {
+		int v = data[3] + n;
 		ErrorF("	[%d:%d] = ", n, v);
 		indirect_vertex_out(kgem, v);
 		ErrorF("\n");
@@ -358,7 +358,7 @@ get_965_depthformat(unsigned int depthformat)
 }
 
 static const char *
-get_965_element_component(uint32_t data, int component)
+get_element_component(uint32_t data, int component)
 {
 	uint32_t component_control = (data >> (16 + (3 - component) * 4)) & 0x7;
 
@@ -387,9 +387,9 @@ get_965_element_component(uint32_t data, int component)
 }
 
 static const char *
-get_965_prim_type(uint32_t data)
+get_prim_type(uint32_t data)
 {
-	uint32_t primtype = (data >> 10) & 0x1f;
+	uint32_t primtype = data & 0x1f;
 
 	switch (primtype) {
 	case 0x01: return "point list";
@@ -656,10 +656,10 @@ int kgem_gen7_decode_3d(struct kgem *kgem, uint32_t offset)
 			i++;
 			kgem_debug_print(data, offset, i, "(%s, %s, %s, %s), "
 				  "dst offset 0x%02x bytes\n",
-				  get_965_element_component(data[i], 0),
-				  get_965_element_component(data[i], 1),
-				  get_965_element_component(data[i], 2),
-				  get_965_element_component(data[i], 3),
+				  get_element_component(data[i], 0),
+				  get_element_component(data[i], 1),
+				  get_element_component(data[i], 2),
+				  get_element_component(data[i], 3),
 				  (data[i] & 0xff) * 4);
 			i++;
 		}
@@ -673,16 +673,16 @@ int kgem_gen7_decode_3d(struct kgem *kgem, uint32_t offset)
 		return len;
 
 	case 0x7b00:
-		assert(len == 6);
-		kgem_debug_print(data, offset, 0,
-			  "3DPRIMITIVE: %s %s\n",
-			  get_965_prim_type(data[0]),
-			  (data[0] & (1 << 15)) ? "random" : "sequential");
-		kgem_debug_print(data, offset, 1, "vertex count\n");
-		kgem_debug_print(data, offset, 2, "start vertex\n");
-		kgem_debug_print(data, offset, 3, "instance count\n");
-		kgem_debug_print(data, offset, 4, "start instance\n");
-		kgem_debug_print(data, offset, 5, "index bias\n");
+		assert(len == 7);
+		kgem_debug_print(data, offset, 0, "3DPRIMITIVE\n");
+		kgem_debug_print(data, offset, 1, "type %s, %s\n",
+			  get_prim_type(data[1]),
+			  (data[1] & (1 << 15)) ? "random" : "sequential");
+		kgem_debug_print(data, offset, 2, "vertex count\n");
+		kgem_debug_print(data, offset, 3, "start vertex\n");
+		kgem_debug_print(data, offset, 4, "instance count\n");
+		kgem_debug_print(data, offset, 5, "start instance\n");
+		kgem_debug_print(data, offset, 6, "index bias\n");
 		primitive_out(kgem, data);
 		return len;
 	}
commit 452f7382630c429fd96e9ad4fe83e513d231647e
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed May 2 21:09:27 2012 +0100

    sna: Avoid reducing damage for synchronisation
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 8b52a40..c84b23e 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -1523,9 +1523,10 @@ sna_drawable_move_region_to_cpu(DrawablePtr drawable,
 		priv->undamaged = true;
 	}
 
-	if (sna_damage_contains_box(priv->gpu_damage,
-				    &region->extents) != PIXMAN_REGION_OUT) {
-		DBG(("%s: region (%dx%d) intersects gpu damage\n",
+	if (priv->gpu_damage &&
+	    (DAMAGE_IS_ALL(priv->gpu_damage) ||
+	     sna_damage_overlaps_box(priv->gpu_damage, &region->extents))) {
+		DBG(("%s: region (%dx%d) overlaps gpu damage\n",
 		     __FUNCTION__,
 		     region->extents.x2 - region->extents.x1,
 		     region->extents.y2 - region->extents.y1));
@@ -1538,9 +1539,18 @@ sna_drawable_move_region_to_cpu(DrawablePtr drawable,
 				       priv->gpu_bo, 0, 0,
 				       pixmap, 0, 0,
 				       &region->extents, 1);
-		} else {
+			goto done;
+		}
+
+		if (sna_damage_contains_box(priv->gpu_damage,
+					    &region->extents) != PIXMAN_REGION_OUT) {
 			RegionRec want, *r = region;
 
+			DBG(("%s: region (%dx%d) intersects gpu damage\n",
+			     __FUNCTION__,
+			     region->extents.x2 - region->extents.x1,
+			     region->extents.y2 - region->extents.y1));
+
 			/* Expand the region to move 32x32 pixel blocks at a
 			 * time, as we assume that we will continue writing
 			 * afterwards and so aim to coalesce subsequent
diff --git a/src/sna/sna_damage.c b/src/sna/sna_damage.c
index f2d682d..b97edbe 100644
--- a/src/sna/sna_damage.c
+++ b/src/sna/sna_damage.c
@@ -984,20 +984,6 @@ static bool box_contains(const BoxRec *a, const BoxRec *b)
 	return true;
 }
 
-static inline Bool sna_damage_maybe_contains_box(const struct sna_damage *damage,
-						 const BoxRec *box)
-{
-	if (box->x2 <= damage->extents.x1 ||
-	    box->x1 >= damage->extents.x2)
-		return FALSE;
-
-	if (box->y2 <= damage->extents.y1 ||
-	    box->y1 >= damage->extents.y2)
-		return FALSE;
-
-	return TRUE;
-}
-
 static struct sna_damage *__sna_damage_subtract(struct sna_damage *damage,
 						RegionPtr region)
 {
@@ -1011,7 +997,7 @@ static struct sna_damage *__sna_damage_subtract(struct sna_damage *damage,
 
 	assert(RegionNotEmpty(region));
 
-	if (!sna_damage_maybe_contains_box(damage, &region->extents))
+	if (!sna_damage_overlaps_box(damage, &region->extents))
 		return damage;
 
 
@@ -1096,7 +1082,7 @@ inline static struct sna_damage *__sna_damage_subtract_box(struct sna_damage *da
 		return NULL;
 	}
 
-	if (!sna_damage_maybe_contains_box(damage, box))
+	if (!sna_damage_overlaps_box(damage, box))
 		return damage;
 
 	if (box_contains(box, &damage->extents)) {
@@ -1189,7 +1175,7 @@ static struct sna_damage *__sna_damage_subtract_boxes(struct sna_damage *damage,
 	extents.y1 += dy;
 	extents.y2 += dy;
 
-	if (!sna_damage_maybe_contains_box(damage, &extents))
+	if (!sna_damage_overlaps_box(damage, &extents))
 		return damage;
 
 	if (n == 1)
@@ -1248,7 +1234,7 @@ static int __sna_damage_contains_box(struct sna_damage *damage,
 	if (damage->mode == DAMAGE_ALL)
 		return PIXMAN_REGION_IN;
 
-	if (!sna_damage_maybe_contains_box(damage, box))
+	if (!sna_damage_overlaps_box(damage, box))
 		return PIXMAN_REGION_OUT;
 
 	ret = pixman_region_contains_rectangle(&damage->region, (BoxPtr)box);
@@ -1297,7 +1283,7 @@ bool _sna_damage_contains_box__no_reduce(const struct sna_damage *damage,
 	if (damage->mode == DAMAGE_SUBTRACT)
 		return false;
 
-	if (!sna_damage_maybe_contains_box(damage, box))
+	if (!sna_damage_overlaps_box(damage, box))
 		return false;
 
 	return pixman_region_contains_rectangle((RegionPtr)&damage->region,
diff --git a/src/sna/sna_damage.h b/src/sna/sna_damage.h
index 422a124..fb661b2 100644
--- a/src/sna/sna_damage.h
+++ b/src/sna/sna_damage.h
@@ -190,6 +190,21 @@ static inline Bool sna_damage_intersect(struct sna_damage *damage,
 	return _sna_damage_intersect(damage, region, result);
 }
 
+static inline bool
+sna_damage_overlaps_box(const struct sna_damage *damage,
+			const BoxRec *box)
+{
+	if (box->x2 <= damage->extents.x1 ||
+	    box->x1 >= damage->extents.x2)
+		return FALSE;
+
+	if (box->y2 <= damage->extents.y1 ||
+	    box->y1 >= damage->extents.y2)
+		return FALSE;
+
+	return TRUE;
+}
+
 int _sna_damage_contains_box(struct sna_damage *damage,
 			     const BoxRec *box);
 static inline int sna_damage_contains_box(struct sna_damage *damage,
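
sna_damage_overlaps_box() is a pure extents test: unlike the containment query, it never forces the accumulated damage to be reduced to a precise region, and a mere overlap answer is all the synchronisation path needs. A sketch of the resulting two-tier query, as a hypothetical wrapper:

	static int damage_query(struct sna_damage *damage, const BoxRec *box)
	{
		/* Cheap pre-filter on the extents: a miss cannot
		 * intersect the damage, so skip the precise test. */
		if (!sna_damage_overlaps_box(damage, box))
			return PIXMAN_REGION_OUT;
		/* Precise answer; may reduce the damage internally. */
		return sna_damage_contains_box(damage, box);
	}
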
commit 57b1ce459136d58d55af7f7667dfc057084eef70
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed May 2 19:09:10 2012 +0100

    sna: Bring back the minimum alignment for G33
    
    The underlying cause is still not fixed. It should be possible to use
    the much laxer alignment for single-stream linear. Still no idea how I
    fail to convince the GPU to drop the depth buffer.
    
    Reported-by: Matti Hamalainen <ccr at tnsp.org>
    References: https://bugs.freedesktop.org/show_bug.cgi?id=49391
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index d519ed6..49fa173 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -600,6 +600,10 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, int gen)
 		/* 865g cannot handle a batch spanning multiple pages */
 		kgem->max_batch_size = PAGE_SIZE / sizeof(uint32_t);
 
+	kgem->min_alignment = 4;
+	if (gen < 40)
+		kgem->min_alignment = 64;
+
 	kgem->half_cpu_cache_pages = cpu_cache_size() >> 13;
 
 	list_init(&kgem->active_partials);
@@ -770,8 +774,8 @@ static uint32_t kgem_untiled_pitch(struct kgem *kgem,
 				   uint32_t width, uint32_t bpp,
 				   bool scanout)
 {
-	width = ALIGN(width, 4) * bpp >> 3;
-	return ALIGN(width, scanout ? 64 : 4);
+	width = ALIGN(width, 2) * bpp >> 3;
+	return ALIGN(width, scanout ? 64 : kgem->min_alignment);
 }
 
 void kgem_get_tile_size(struct kgem *kgem, int tiling,
@@ -832,13 +836,17 @@ static uint32_t kgem_surface_size(struct kgem *kgem,
 			tile_width = 512;
 			tile_height = kgem->gen < 30 ? 16 : 8;
 		} else {
-			tile_width = scanout ? 64 : 4 * bpp >> 3;
-			tile_height = 4;
+			tile_width = 2 * bpp >> 3;
+			tile_width = ALIGN(tile_width,
+					   scanout ? 64 : kgem->min_alignment);
+			tile_height = 2;
 		}
 	} else switch (tiling) {
 	default:
 	case I915_TILING_NONE:
-		tile_width = scanout ? 64 : 4 * bpp >> 3;
+		tile_width = 2 * bpp >> 3;
+		tile_width = ALIGN(tile_width,
+				   scanout ? 64 : kgem->min_alignment);
 		tile_height = 2;
 		break;
 	case I915_TILING_X:
diff --git a/src/sna/kgem.h b/src/sna/kgem.h
index ad2fe84..51025ea 100644
--- a/src/sna/kgem.h
+++ b/src/sna/kgem.h
@@ -143,6 +143,7 @@ struct kgem {
 	uint16_t nfence;
 	uint16_t wait;
 	uint16_t max_batch_size;
+	uint16_t min_alignment;
 
 	uint32_t flush:1;
 	uint32_t need_expire:1;
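
A worked example of the new untiled pitch rule for a non-scanout surface: a 101-pixel-wide, 32 bpp pixmap is first widened to ALIGN(101, 2) = 102 pixels, giving 102 * 32 >> 3 = 408 bytes per row; on gen2/gen3, where min_alignment is 64, the pitch then rounds up to ALIGN(408, 64) = 448 bytes, while on gen4 and later (min_alignment of 4) it stays at 408. Scanout surfaces keep the fixed 64-byte alignment on every generation.
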
commit 54bb5fbbd2edf3c28e1e363af97fd6d0e9b6a98e
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed May 2 12:45:09 2012 +0100

    sna: Always try to operate inplace if we have an LLC gpu bo

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index db2442a..8b52a40 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -904,6 +904,9 @@ static inline bool pixmap_inplace(struct sna *sna,
 	if (priv->mapped)
 		return true;
 
+	if (sna->kgem.has_llc && pixmap != sna->front)
+		return !priv->cpu_bo;
+
 	return (pixmap->devKind * pixmap->drawable.height >> 12) >
 		sna->kgem.half_cpu_cache_pages;
 }
@@ -1264,6 +1267,9 @@ static inline bool region_inplace(struct sna *sna,
 		return false;
 	}
 
+	if (sna->kgem.has_llc && pixmap != sna->front)
+		return !priv->cpu_bo;
+
 	DBG(("%s: (%dx%d), inplace? %d\n",
 	     __FUNCTION__,
 	     region->extents.x2 - region->extents.x1,
commit 3a7e16d4a48dc00f96b02437aa2a049434a9d843
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed May 2 11:31:42 2012 +0100

    sna: Fallback for glyphs too large for XY_TEXT_IMMEDIATE
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 12017bd..db2442a 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -6746,7 +6746,6 @@ spans_fallback:
 		}
 
 		gc->ops = (GCOps *)&sna_gc_ops;
-		assert_pixmap_contains_box(data.pixmap, &data.region.extents);
 		if (data.damage) {
 			if (data.dx | data.dy)
 				pixman_region_translate(&data.region, data.dx, data.dy);
@@ -10622,8 +10621,6 @@ sna_glyph_extents(FontPtr font,
 
 		extents->overallWidth += p->metrics.characterWidth;
 	}
-
-	assert(extents->overallWidth > 0);
 }
 
 static bool sna_set_glyph(CharInfoPtr in, CharInfoPtr out)
@@ -10716,6 +10713,16 @@ inline static bool sna_get_glyph16(FontPtr font, struct sna_font *priv,
 	return sna_set_glyph(ret, *out = p);
 }
 
+static inline bool sna_font_too_large(FontPtr font)
+{
+	int top = max(FONTMAXBOUNDS(font, ascent), FONTASCENT(font));
+	int bot = max(FONTMAXBOUNDS(font, descent), FONTDESCENT(font));
+	int width = max(FONTMAXBOUNDS(font, characterWidth), -FONTMINBOUNDS(font, characterWidth));
+	DBG(("%s: (%d + %d) x %d: %d\n", __FUNCTION__,
+	     top, bot, width, (top + bot) * (width + 7)/8));
+	return (top + bot) * (width + 7)/8 > 124;
+}
+
 static int
 sna_poly_text8(DrawablePtr drawable, GCPtr gc,
 	       int x, int y,
@@ -10731,6 +10738,9 @@ sna_poly_text8(DrawablePtr drawable, GCPtr gc,
 	if (drawable->depth < 8)
 		goto fallback;
 
+	if (sna_font_too_large(gc->font))
+		goto fallback;
+
 	for (i = n = 0; i < count; i++) {
 		if (sna_get_glyph8(gc->font, priv, chars[i], &info[n]))
 			n++;
@@ -10820,6 +10830,9 @@ sna_poly_text16(DrawablePtr drawable, GCPtr gc,
 	if (drawable->depth < 8)
 		goto fallback;
 
+	if (sna_font_too_large(gc->font))
+		goto fallback;
+
 	for (i = n = 0; i < count; i++) {
 		if (sna_get_glyph16(gc->font, priv, chars[i], &info[n]))
 			n++;
@@ -10909,6 +10922,9 @@ sna_image_text8(DrawablePtr drawable, GCPtr gc,
 	if (drawable->depth < 8)
 		goto fallback;
 
+	if (sna_font_too_large(gc->font))
+		goto fallback;
+
 	for (i = n = 0; i < count; i++) {
 		if (sna_get_glyph8(gc->font, priv, chars[i], &info[n]))
 			n++;
@@ -11000,6 +11016,9 @@ sna_image_text16(DrawablePtr drawable, GCPtr gc,
 	if (drawable->depth < 8)
 		goto fallback;
 
+	if (sna_font_too_large(gc->font))
+		goto fallback;
+
 	for (i = n = 0; i < count; i++) {
 		if (sna_get_glyph16(gc->font, priv, chars[i], &info[n]))
 			n++;
@@ -11335,6 +11354,9 @@ sna_image_glyph(DrawablePtr drawable, GCPtr gc,
 	if (!PM_IS_SOLID(drawable, gc->planemask))
 		goto fallback;
 
+	if (sna_font_too_large(gc->font))
+		goto fallback;
+
 	if ((bo = sna_drawable_use_bo(drawable, true,
 				      &region.extents, &damage)) &&
 	    sna_reversed_glyph_blt(drawable, gc, x, y, n, info, base,
@@ -11412,6 +11434,9 @@ sna_poly_glyph(DrawablePtr drawable, GCPtr gc,
 	if (!gc_is_solid(gc, &fg))
 		goto fallback;
 
+	if (sna_font_too_large(gc->font))
+		goto fallback;
+
 	if ((bo = sna_drawable_use_bo(drawable, true,
 				      &region.extents, &damage)) &&
 	    sna_reversed_glyph_blt(drawable, gc, x, y, n, info, base,
commit 53750068d53203313ff377f81533272ec26a0779
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue May 1 12:37:45 2012 +0100

    sna: Only attempt to reuse exported scanout buffers
    
    Yet more mesa w/a.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 4302952..d519ed6 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -4148,13 +4148,13 @@ void kgem_bo_set_binding(struct kgem_bo *bo, uint32_t format, uint16_t offset)
 void kgem_bo_clear_scanout(struct kgem *kgem, struct kgem_bo *bo)
 {
 	bo->needs_flush = true;
-	bo->reusable = true;
 	bo->flush = false;
 
 	if (!bo->scanout)
 		return;
 
 	bo->scanout = false;
+	bo->reusable = true;
 }
 
 struct kgem_bo *
commit ec735bb310752eaed687d88af01b0f95b7edce71
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue May 1 11:15:00 2012 +0100

    sna: Fast-path unclipped glyphs
    
    Avoid the redundant computation of the glyph intersection with the
    drawable bounding box.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_glyphs.c b/src/sna/sna_glyphs.c
index 87371aa..f4f115e 100644
--- a/src/sna/sna_glyphs.c
+++ b/src/sna/sna_glyphs.c
@@ -427,8 +427,12 @@ glyphs_to_dst(struct sna *sna,
 	     __FUNCTION__, op, src_x, src_y, nlist,
 	     list->xOff, list->yOff, dst->pDrawable->x, dst->pDrawable->y));
 
-	rects = REGION_RECTS(dst->pCompositeClip);
-	nrect = REGION_NUM_RECTS(dst->pCompositeClip);
+	if (dst->pCompositeClip->extents.x2 - dst->pCompositeClip->extents.x1 < dst->pDrawable->width ||
+	    dst->pCompositeClip->extents.y2 - dst->pCompositeClip->extents.y1 < dst->pDrawable->height) {
+		rects = REGION_RECTS(dst->pCompositeClip);
+		nrect = REGION_NUM_RECTS(dst->pCompositeClip);
+	} else
+		nrect = 0;
 
 	x = dst->pDrawable->x;
 	y = dst->pDrawable->y;
@@ -476,47 +480,68 @@ glyphs_to_dst(struct sna *sna,
 				glyph_atlas = priv.atlas;
 			}
 
-			for (i = 0; i < nrect; i++) {
+			if (nrect) {
+				for (i = 0; i < nrect; i++) {
+					struct sna_composite_rectangles r;
+					int16_t dx, dy;
+					int16_t x2, y2;
+
+					r.dst.x = x - glyph->info.x;
+					r.dst.y = y - glyph->info.y;
+					x2 = r.dst.x + glyph->info.width;
+					y2 = r.dst.y + glyph->info.height;
+					dx = dy = 0;
+
+					DBG(("%s: glyph=(%d, %d), (%d, %d), clip=(%d, %d), (%d, %d)\n",
+					     __FUNCTION__,
+					     r.dst.x, r.dst.y, x2, y2,
+					     rects[i].x1, rects[i].y1,
+					     rects[i].x2, rects[i].y2));
+					if (rects[i].y1 >= y2)
+						break;
+
+					if (r.dst.x < rects[i].x1)
+						dx = rects[i].x1 - r.dst.x, r.dst.x = rects[i].x1;
+					if (x2 > rects[i].x2)
+						x2 = rects[i].x2;
+					if (r.dst.y < rects[i].y1)
+						dy = rects[i].y1 - r.dst.y, r.dst.y = rects[i].y1;
+					if (y2 > rects[i].y2)
+						y2 = rects[i].y2;
+
+					if (r.dst.x < x2 && r.dst.y < y2) {
+						DBG(("%s: blt=(%d, %d), (%d, %d)\n",
+						     __FUNCTION__, r.dst.x, r.dst.y, x2, y2));
+
+						r.src.x = r.dst.x + src_x;
+						r.src.y = r.dst.y + src_y;
+						r.mask.x = dx + priv.coordinate.x;
+						r.mask.y = dy + priv.coordinate.y;
+						r.width  = x2 - r.dst.x;
+						r.height = y2 - r.dst.y;
+						tmp.blt(sna, &tmp, &r);
+						apply_damage(&tmp, &r);
+					}
+				}
+			} else {
 				struct sna_composite_rectangles r;
-				int16_t dx, dy;
-				int16_t x2, y2;
 
 				r.dst.x = x - glyph->info.x;
 				r.dst.y = y - glyph->info.y;
-				x2 = r.dst.x + glyph->info.width;
-				y2 = r.dst.y + glyph->info.height;
-				dx = dy = 0;
+				r.src.x = r.dst.x + src_x;
+				r.src.y = r.dst.y + src_y;
+				r.mask.x = priv.coordinate.x;
+				r.mask.y = priv.coordinate.y;
+				r.width  = glyph->info.width;
+				r.height = glyph->info.height;
 
-				DBG(("%s: glyph=(%d, %d), (%d, %d), clip=(%d, %d), (%d, %d)\n",
+				DBG(("%s: glyph=(%d, %d)x(%d, %d), unclipped\n",
 				     __FUNCTION__,
-				     r.dst.x, r.dst.y, x2, y2,
-				     rects[i].x1, rects[i].y1,
-				     rects[i].x2, rects[i].y2));
-				if (rects[i].y1 >= y2)
-					break;
+				     r.dst.x, r.dst.y,
+				     r.width, r.height));
 
-				if (r.dst.x < rects[i].x1)
-					dx = rects[i].x1 - r.dst.x, r.dst.x = rects[i].x1;
-				if (x2 > rects[i].x2)
-					x2 = rects[i].x2;
-				if (r.dst.y < rects[i].y1)
-					dy = rects[i].y1 - r.dst.y, r.dst.y = rects[i].y1;
-				if (y2 > rects[i].y2)
-					y2 = rects[i].y2;
-
-				if (r.dst.x < x2 && r.dst.y < y2) {
-					DBG(("%s: blt=(%d, %d), (%d, %d)\n",
-					     __FUNCTION__, r.dst.x, r.dst.y, x2, y2));
-
-					r.src.x = r.dst.x + src_x;
-					r.src.y = r.dst.y + src_y;
-					r.mask.x = dx + priv.coordinate.x;
-					r.mask.y = dy + priv.coordinate.y;
-					r.width  = x2 - r.dst.x;
-					r.height = y2 - r.dst.y;
-					tmp.blt(sna, &tmp, &r);
-					apply_damage(&tmp, &r);
-				}
+				tmp.blt(sna, &tmp, &r);
+				apply_damage(&tmp, &r);
 			}
 
 next_glyph:
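
The fast path hinges on one cheap comparison: if the composite clip's extents span the whole drawable, the already-computed glyph boxes cannot be cut down any further, so nrect stays 0 and the per-rectangle intersection loop is skipped entirely. A minimal sketch of that test (assuming the clip has already been intersected with the drawable bounds, as pCompositeClip is):

	static bool clip_covers_drawable(const BoxRec *clip_extents,
					 const DrawableRec *draw)
	{
		/* Extents at least as large as the drawable cannot
		 * clip anything away. */
		return clip_extents->x2 - clip_extents->x1 >= draw->width &&
		       clip_extents->y2 - clip_extents->y1 >= draw->height;
	}
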
commit 9d1f546ea6c9b57f1b87d0703e1dfc6d3e50ca72
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sun Apr 29 21:59:52 2012 +0100

    legacy/i810: hwmc additionally depends upon building DRI
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/legacy/i810/Makefile.am b/src/legacy/i810/Makefile.am
index a1bdd85..e7fa04f 100644
--- a/src/legacy/i810/Makefile.am
+++ b/src/legacy/i810/Makefile.am
@@ -25,10 +25,10 @@ liblegacy_i810_la_SOURCES +=\
          i810_dri.c \
          i810_dri.h \
 	 $(NULL)
-endif
 
 if XVMC
 liblegacy_i810_la_SOURCES += \
 	i810_hwmc.c \
 	$(NULL)
 endif
+endif
commit df14c459100f99db82e640e7d111619c82869768
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sun Apr 29 21:49:04 2012 +0100

    configure: Version bump for 2.19.0 release

diff --git a/NEWS b/NEWS
index 9d2b15e..cc74879 100644
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,29 @@
+Release 2.19.0 (2012-04-29)
+===========================
+More stability fixes for UXA and support for another variant of IvyBridge.
+Given the severity of the stability fixes, I strongly recommend that everybody
+upgrade to 2.19.0.
+
+* Prevent waiting on scanlines whilst not in control of the VT and therefore
+  whilst referencing foreign CRTC configurations.
+
+* Pixmap (and bo) leak during fallback glyph composition
+
+* Remove broken acceleration for rendering glyphs directly upon the
+  destination pixmap, exposed by cairo-1.12.0 (and coincidentally fix
+  another Pixmap leak upon fallback handling).
+
+* Add support for Ivy Bridge GT2 Server chipset [PCI id 0x016a]
+
+* Remove broken damage flushing with CompositeRectangles
+  https://bugs.freedesktop.org/show_bug.cgi?id=32547
+
+* Fix crash upon server start with multiple monitors
+  https://bugs.freedesktop.org/show_bug.cgi?id=47395
+
+* Fix composition issues resulting from overly aggressive Pixmap reuse
+  https://bugs.freedesktop.org/show_bug.cgi?id=47345
+
 Release 2.18.0 (2012-02-24)
 ===========================
 Time passes, a few more bugs have crept out of the woodwork that are a
diff --git a/configure.ac b/configure.ac
index 0489f64..3d646da 100644
--- a/configure.ac
+++ b/configure.ac
@@ -23,7 +23,7 @@
 # Initialize Autoconf
 AC_PREREQ([2.60])
 AC_INIT([xf86-video-intel],
-        [2.18.0],
+        [2.19.0],
         [https://bugs.freedesktop.org/enter_bug.cgi?product=xorg],
         [xf86-video-intel])
 AC_CONFIG_SRCDIR([Makefile.am])
commit e85134af75e99242b2bc2475f12b67c391d7a710
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sat Apr 28 01:54:43 2012 +0100

    sna: Tune relocation array size
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.h b/src/sna/kgem.h
index 4def6b1..ad2fe84 100644
--- a/src/sna/kgem.h
+++ b/src/sna/kgem.h
@@ -171,7 +171,7 @@ struct kgem {
 
 	uint32_t batch[4*1024];
 	struct drm_i915_gem_exec_object2 exec[256];
-	struct drm_i915_gem_relocation_entry reloc[384];
+	struct drm_i915_gem_relocation_entry reloc[612];
 };
 
 #define KGEM_BATCH_RESERVED 1
commit a77d1f68c2f429ba871fc5776dbce1f8213d7f31
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Apr 27 23:19:56 2012 +0100

    sna: PolyPoint only uses the gc->fgPixel
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index f4d3de4..12017bd 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -5560,8 +5560,7 @@ static Bool
 sna_poly_point_blt(DrawablePtr drawable,
 		   struct kgem_bo *bo,
 		   struct sna_damage **damage,
-		   GCPtr gc, uint32_t pixel,
-		   int mode, int n, DDXPointPtr pt,
+		   GCPtr gc, int mode, int n, DDXPointPtr pt,
 		   bool clipped)
 {
 	PixmapPtr pixmap = get_drawable_pixmap(drawable);
@@ -5574,7 +5573,7 @@ sna_poly_point_blt(DrawablePtr drawable,
 	DBG(("%s: alu=%d, pixel=%08lx, clipped?=%d\n",
 	     __FUNCTION__, gc->alu, gc->fgPixel, clipped));
 
-	if (!sna_fill_init_blt(&fill, sna, pixmap, bo, gc->alu, pixel))
+	if (!sna_fill_init_blt(&fill, sna, pixmap, bo, gc->alu, gc->fgPixel))
 		return FALSE;
 
 	get_drawable_deltas(drawable, pixmap, &dx, &dy);
@@ -5704,7 +5703,6 @@ sna_poly_point(DrawablePtr drawable, GCPtr gc,
 	struct sna *sna = to_sna_from_pixmap(pixmap);
 	RegionRec region;
 	unsigned flags;
-	uint32_t color;
 
 	DBG(("%s(mode=%d, n=%d, pt[0]=(%d, %d)\n",
 	     __FUNCTION__, mode, n, pt[0].x, pt[0].y));
@@ -5729,7 +5727,7 @@ sna_poly_point(DrawablePtr drawable, GCPtr gc,
 		goto fallback;
 	}
 
-	if (PM_IS_SOLID(drawable, gc->planemask) && gc_is_solid(gc, &color)) {
+	if (PM_IS_SOLID(drawable, gc->planemask)) {
 		struct sna_damage **damage;
 		struct kgem_bo *bo;
 
@@ -5738,7 +5736,7 @@ sna_poly_point(DrawablePtr drawable, GCPtr gc,
 
 		if ((bo = sna_drawable_use_bo(drawable, false, &region.extents, &damage)) &&
 		    sna_poly_point_blt(drawable, bo, damage,
-				       gc, color, mode, n, pt, flags & 2))
+				       gc, mode, n, pt, flags & 2))
 			return;
 	}
 
commit 00f02596c1c0013af99ee7e67dc59527160a952c
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Apr 27 22:08:51 2012 +0100

    sna/gen6: Allow ring switching at the start of a batch
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/gen6_render.c b/src/sna/gen6_render.c
index 5bbe5e3..38fb024 100644
--- a/src/sna/gen6_render.c
+++ b/src/sna/gen6_render.c
@@ -57,7 +57,7 @@
 #define NO_FILL_BOXES 0
 #define NO_CLEAR 0
 
-#define NO_RING_SWITCH 1
+#define NO_RING_SWITCH 0
 
 #define GEN6_MAX_SIZE 8192
 
@@ -2332,7 +2332,7 @@ static bool prefer_blt_ring(struct sna *sna)
 
 static bool can_switch_rings(struct sna *sna)
 {
-	return sna->kgem.has_semaphores && !NO_RING_SWITCH;
+	return sna->kgem.mode == KGEM_NONE && sna->kgem.has_semaphores && !NO_RING_SWITCH;
 }
 
 static Bool
@@ -2369,6 +2369,8 @@ try_blt(struct sna *sna,
 	if (can_switch_rings(sna)) {
 		if (sna_picture_is_solid(src, NULL))
 			return TRUE;
+		if (src->pDrawable)
+			return TRUE;
 	}
 
 	return FALSE;
diff --git a/src/sna/gen7_render.c b/src/sna/gen7_render.c
index 2228873..327714f 100644
--- a/src/sna/gen7_render.c
+++ b/src/sna/gen7_render.c
@@ -57,7 +57,7 @@
 #define NO_FILL_BOXES 0
 #define NO_CLEAR 0
 
-#define NO_RING_SWITCH 1
+#define NO_RING_SWITCH 0
 
 #define GEN7_MAX_SIZE 16384
 
@@ -2425,7 +2425,7 @@ static bool prefer_blt_ring(struct sna *sna)
 
 static bool can_switch_rings(struct sna *sna)
 {
-	return sna->kgem.has_semaphores && !NO_RING_SWITCH;
+	return sna->kgem.mode == KGEM_NONE && sna->kgem.has_semaphores && !NO_RING_SWITCH;
 }
 
 static Bool
@@ -2462,6 +2462,8 @@ try_blt(struct sna *sna,
 	if (can_switch_rings(sna)) {
 		if (sna_picture_is_solid(src, NULL))
 			return TRUE;
+		if (src->pDrawable)
+			return TRUE;
 	}
 
 	return FALSE;
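
Gating can_switch_rings() on kgem.mode == KGEM_NONE confines the choice of ring to the gap between batches: once work has been emitted on one ring, switching would mean submitting the current batch and paying a semaphore wait, so the decision is only revisited while the batch is empty. The predicate, as it now stands in both gen6 and gen7:

	static bool can_switch_rings(struct sna *sna)
	{
		/* Only between batches, and only when the kernel can
		 * insert inter-ring semaphores on our behalf. */
		return sna->kgem.mode == KGEM_NONE &&
		       sna->kgem.has_semaphores &&
		       !NO_RING_SWITCH;
	}
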
commit 388f702c197f0821774a6fe9804c60a5c375045c
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Apr 27 22:03:55 2012 +0100

    sna: Tweak semaphores-enabled heuristic
    
    The kernel module now defaults to -1, confusing the test.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 29f0e29..4302952 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -574,7 +574,7 @@ static bool semaphores_enabled(void)
 	if (file) {
 		int value;
 		if (fscanf(file, "%d", &value) == 1)
-			detected = value > 0;
+			detected = value != 0;
 		fclose(file);
 	}
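
The probe reads the i915 semaphores module parameter and, now that the kernel defaults it to -1 (auto), treats any non-zero value as potentially enabled rather than only positive ones. A self-contained sketch of the probe, assuming the usual sysfs path:

	#include <stdio.h>
	#include <stdbool.h>

	static bool semaphores_enabled(void)
	{
		bool detected = false;
		FILE *file = fopen("/sys/module/i915/parameters/semaphores", "r");
		if (file) {
			int value;
			if (fscanf(file, "%d", &value) == 1)
				detected = value != 0; /* -1 (auto) counts too */
			fclose(file);
		}
		return detected;
	}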
 
commit c5ad9f868db12d3e17278664d823c90ba69baa4d
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Apr 27 19:21:44 2012 +0100

    sna: Tweak placement choice for high-overhead operations
    
    Some operations cost more to set up than to transfer data back and forth!
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index a5c1648..f4d3de4 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -1923,6 +1923,7 @@ box_inplace(PixmapPtr pixmap, const BoxRec *box)
 
 static inline struct kgem_bo *
 sna_drawable_use_bo(DrawablePtr drawable,
+		    bool prefer_gpu,
 		    const BoxRec *box,
 		    struct sna_damage ***damage)
 {
@@ -1949,6 +1950,9 @@ sna_drawable_use_bo(DrawablePtr drawable,
 		goto use_cpu_bo;
 	}
 
+	if (!prefer_gpu && priv->gpu_bo && !kgem_bo_is_busy(priv->gpu_bo))
+		goto use_cpu_bo;
+
 	if (DAMAGE_IS_ALL(priv->gpu_damage))
 		goto use_gpu_bo;
 
@@ -1968,6 +1972,12 @@ sna_drawable_use_bo(DrawablePtr drawable,
 			goto use_cpu_bo;
 		}
 
+		if (priv->cpu_damage && !prefer_gpu) {
+			DBG(("%s: prefer cpu",
+			     __FUNCTION__));
+			goto use_cpu_bo;
+		}
+
 		if (!sna_pixmap_move_to_gpu(pixmap, MOVE_WRITE | MOVE_READ))
 			goto use_cpu_bo;
 
@@ -2932,7 +2942,8 @@ sna_put_xybitmap_blt(DrawablePtr drawable, GCPtr gc, RegionPtr region,
 	int n;
 	uint8_t rop = copy_ROP[gc->alu];
 
-	bo = sna_drawable_use_bo(&pixmap->drawable, &region->extents, &damage);
+	bo = sna_drawable_use_bo(&pixmap->drawable, true,
+				 &region->extents, &damage);
 	if (bo == NULL)
 		return false;
 
@@ -3054,7 +3065,8 @@ sna_put_xypixmap_blt(DrawablePtr drawable, GCPtr gc, RegionPtr region,
 	if (gc->alu != GXcopy)
 		return false;
 
-	bo = sna_drawable_use_bo(&pixmap->drawable, &region->extents, &damage);
+	bo = sna_drawable_use_bo(&pixmap->drawable, true,
+				 &region->extents, &damage);
 	if (bo == NULL)
 		return false;
 
@@ -4952,7 +4964,7 @@ sna_fill_spans(DrawablePtr drawable, GCPtr gc, int n,
 	if (!PM_IS_SOLID(drawable, gc->planemask))
 		goto fallback;
 
-	bo = sna_drawable_use_bo(drawable, &region.extents, &damage);
+	bo = sna_drawable_use_bo(drawable, true, &region.extents, &damage);
 	if (bo) {
 		if (gc_is_solid(gc, &color)) {
 			DBG(("%s: trying solid fill [alu=%d, pixel=%08lx] blt paths\n",
@@ -5502,7 +5514,7 @@ sna_copy_plane(DrawablePtr src, DrawablePtr dst, GCPtr gc,
 	if (!PM_IS_SOLID(dst, gc->planemask))
 		goto fallback;
 
-	arg.bo = sna_drawable_use_bo(dst, &region.extents, &arg.damage);
+	arg.bo = sna_drawable_use_bo(dst, true, &region.extents, &arg.damage);
 	if (arg.bo) {
 		if (arg.bo->tiling == I915_TILING_Y) {
 			assert(arg.bo == sna_pixmap_get_bo(pixmap));
@@ -5724,7 +5736,7 @@ sna_poly_point(DrawablePtr drawable, GCPtr gc,
 		DBG(("%s: trying solid fill [%08lx] blt paths\n",
 		     __FUNCTION__, gc->fgPixel));
 
-		if ((bo = sna_drawable_use_bo(drawable, &region.extents, &damage)) &&
+		if ((bo = sna_drawable_use_bo(drawable, false, &region.extents, &damage)) &&
 		    sna_poly_point_blt(drawable, bo, damage,
 				       gc, color, mode, n, pt, flags & 2))
 			return;
@@ -6417,50 +6429,10 @@ sna_poly_line_extents(DrawablePtr drawable, GCPtr gc,
 inline static bool
 _use_zero_spans(DrawablePtr drawable, GCPtr gc, const BoxRec *extents)
 {
-	PixmapPtr pixmap;
-	struct sna_pixmap *priv;
-	BoxRec area;
-	int16_t dx, dy;
-
 	if (USE_ZERO_SPANS)
 		return USE_ZERO_SPANS > 0;
 
-	if (!drawable_gc_inplace_hint(drawable, gc))
-		return TRUE;
-
-	/* XXX check for GPU stalls on the gc (stipple, tile, etc) */
-
-	pixmap = get_drawable_pixmap(drawable);
-	priv = sna_pixmap(pixmap);
-	if (priv == NULL)
-		return FALSE;
-
-	if (DAMAGE_IS_ALL(priv->cpu_damage))
-		return FALSE;
-
-	if (priv->stride == 0 || priv->gpu_bo == NULL)
-		return FALSE;
-
-	if (!kgem_bo_is_busy(priv->gpu_bo))
-		return FALSE;
-
-	if (DAMAGE_IS_ALL(priv->gpu_damage))
-		return TRUE;
-
-	if (priv->gpu_damage == NULL)
-		return FALSE;
-
-	get_drawable_deltas(drawable, pixmap, &dx, &dy);
-	area = *extents;
-	area.x1 += dx;
-	area.x2 += dx;
-	area.y1 += dy;
-	area.y2 += dy;
-	DBG(("%s extents (%d, %d), (%d, %d)\n", __FUNCTION__,
-	     area.x1, area.y1, area.x2, area.y2));
-
-	return sna_damage_contains_box(priv->gpu_damage,
-				       &area) != PIXMAN_REGION_OUT;
+	return !drawable_gc_inplace_hint(drawable, gc);
 }
 
 static bool
@@ -6481,50 +6453,10 @@ use_zero_spans(DrawablePtr drawable, GCPtr gc, const BoxRec *extents)
 inline static bool
 _use_wide_spans(DrawablePtr drawable, GCPtr gc, const BoxRec *extents)
 {
-	PixmapPtr pixmap;
-	struct sna_pixmap *priv;
-	BoxRec area;
-	int16_t dx, dy;
-
 	if (USE_WIDE_SPANS)
 		return USE_WIDE_SPANS > 0;
 
-	if (!drawable_gc_inplace_hint(drawable, gc))
-		return TRUE;
-
-	/* XXX check for GPU stalls on the gc (stipple, tile, etc) */
-
-	pixmap = get_drawable_pixmap(drawable);
-	priv = sna_pixmap(pixmap);
-	if (priv == NULL)
-		return FALSE;
-
-	if (DAMAGE_IS_ALL(priv->cpu_damage))
-		return FALSE;
-
-	if (priv->stride == 0 || priv->gpu_bo == NULL)
-		return FALSE;
-
-	if (!kgem_bo_is_busy(priv->gpu_bo))
-		return FALSE;
-
-	if (DAMAGE_IS_ALL(priv->gpu_damage))
-		return TRUE;
-
-	if (priv->gpu_damage == NULL)
-		return FALSE;
-
-	get_drawable_deltas(drawable, pixmap, &dx, &dy);
-	area = *extents;
-	area.x1 += dx;
-	area.x2 += dx;
-	area.y1 += dy;
-	area.y2 += dy;
-	DBG(("%s extents (%d, %d), (%d, %d)\n", __FUNCTION__,
-	     area.x1, area.y1, area.x2, area.y2));
-
-	return sna_damage_contains_box(priv->gpu_damage,
-				       &area) != PIXMAN_REGION_OUT;
+	return !drawable_gc_inplace_hint(drawable, gc);
 }
 
 static bool
@@ -6605,7 +6537,7 @@ sna_poly_line(DrawablePtr drawable, GCPtr gc,
 		     __FUNCTION__, (unsigned)color));
 
 		if (data.flags & 4) {
-			data.bo = sna_drawable_use_bo(drawable,
+			data.bo = sna_drawable_use_bo(drawable, true,
 						      &data.region.extents,
 						      &data.damage);
 			if (data.bo &&
@@ -6616,8 +6548,8 @@ sna_poly_line(DrawablePtr drawable, GCPtr gc,
 					      data.flags & 2))
 				return;
 		} else { /* !rectilinear */
-			if (use_zero_spans(drawable, gc, &data.region.extents) &&
-			    (data.bo = sna_drawable_use_bo(drawable,
+			if ((data.bo = sna_drawable_use_bo(drawable,
+							   use_zero_spans(drawable, gc, &data.region.extents),
 							   &data.region.extents,
 							   &data.damage)) &&
 			    sna_poly_zero_line_blt(drawable,
@@ -6630,7 +6562,8 @@ sna_poly_line(DrawablePtr drawable, GCPtr gc,
 		}
 	} else if (data.flags & 4) {
 		/* Try converting these to a set of rectangles instead */
-		data.bo = sna_drawable_use_bo(drawable, &data.region.extents, &data.damage);
+		data.bo = sna_drawable_use_bo(drawable, true,
+					      &data.region.extents, &data.damage);
 		if (data.bo) {
 			DDXPointRec p1, p2;
 			xRectangle *rect;
@@ -6701,8 +6634,9 @@ sna_poly_line(DrawablePtr drawable, GCPtr gc,
 	}
 
 spans_fallback:
-	if (use_wide_spans(drawable, gc, &data.region.extents) &&
-	    (data.bo = sna_drawable_use_bo(drawable, &data.region.extents, &data.damage))) {
+	if ((data.bo = sna_drawable_use_bo(drawable,
+					   use_wide_spans(drawable, gc, &data.region.extents),
+					   &data.region.extents, &data.damage))) {
 		DBG(("%s: converting line into spans\n", __FUNCTION__));
 		get_drawable_deltas(drawable, data.pixmap, &data.dx, &data.dy);
 		sna_gc(gc)->priv = &data;
@@ -7570,12 +7504,6 @@ sna_poly_segment(DrawablePtr drawable, GCPtr gc, int n, xSegment *seg)
 	if (!PM_IS_SOLID(drawable, gc->planemask))
 		goto fallback;
 
-	data.bo = sna_drawable_use_bo(drawable,
-				      &data.region.extents,
-				      &data.damage);
-	if (data.bo == NULL)
-		goto fallback;
-
 	if (gc->lineStyle != LineSolid || gc->lineWidth > 1)
 		goto spans_fallback;
 	if (gc_is_solid(gc, &color)) {
@@ -7583,14 +7511,20 @@ sna_poly_segment(DrawablePtr drawable, GCPtr gc, int n, xSegment *seg)
 		     __FUNCTION__, (unsigned)color, data.flags));
 
 		if (data.flags & 4) {
-			if (sna_poly_segment_blt(drawable,
+			if ((data.bo = sna_drawable_use_bo(drawable, true,
+							   &data.region.extents,
+							   &data.damage)) &&
+			     sna_poly_segment_blt(drawable,
 						 data.bo, data.damage,
 						 gc, color, n, seg,
 						 &data.region.extents,
 						 data.flags & 2))
 				return;
 		} else {
-			if (use_zero_spans(drawable, gc, &data.region.extents) &&
+			if ((data.bo = sna_drawable_use_bo(drawable,
+							   use_zero_spans(drawable, gc, &data.region.extents),
+							   &data.region.extents,
+							   &data.damage)) &&
 			    sna_poly_zero_segment_blt(drawable,
 						      data.bo, data.damage,
 						      gc, n, seg,
@@ -7603,6 +7537,12 @@ sna_poly_segment(DrawablePtr drawable, GCPtr gc, int n, xSegment *seg)
 		xRectangle *rect;
 		int i;
 
+		data.bo = sna_drawable_use_bo(drawable, true,
+					      &data.region.extents,
+					      &data.damage);
+		if (data.bo == NULL)
+			goto fallback;
+
 		DBG(("%s: converting to rectangles\n", __FUNCTION__));
 
 		rect = malloc (n * sizeof (xRectangle));
@@ -7660,7 +7600,10 @@ sna_poly_segment(DrawablePtr drawable, GCPtr gc, int n, xSegment *seg)
 	}
 
 spans_fallback:
-	if (use_wide_spans(drawable, gc, &data.region.extents)) {
+	if ((data.bo = sna_drawable_use_bo(drawable,
+					   use_wide_spans(drawable, gc, &data.region.extents),
+					   &data.region.extents,
+					   &data.damage))) {
 		void (*line)(DrawablePtr, GCPtr, int, int, DDXPointPtr);
 		int i;
 
@@ -8280,7 +8223,8 @@ sna_poly_rectangle(DrawablePtr drawable, GCPtr gc, int n, xRectangle *r)
 	    PM_IS_SOLID(drawable, gc->planemask)) {
 		DBG(("%s: trying blt solid fill [%08lx] paths\n",
 		     __FUNCTION__, gc->fgPixel));
-		if ((bo = sna_drawable_use_bo(drawable, &region.extents, &damage)) &&
+		if ((bo = sna_drawable_use_bo(drawable, true,
+					      &region.extents, &damage)) &&
 		    sna_poly_rectangle_blt(drawable, bo, damage,
 					   gc, n, r, &region.extents, flags&2))
 			return;
@@ -8288,7 +8232,8 @@ sna_poly_rectangle(DrawablePtr drawable, GCPtr gc, int n, xRectangle *r)
 		/* Not a trivial outline, but we still maybe able to break it
 		 * down into simpler operations that we can accelerate.
 		 */
-		if (sna_drawable_use_bo(drawable, &region.extents, &damage)) {
+		if (sna_drawable_use_bo(drawable, true,
+					&region.extents, &damage)) {
 			miPolyRectangle(drawable, gc, n, r);
 			return;
 		}
@@ -8408,8 +8353,8 @@ sna_poly_arc(DrawablePtr drawable, GCPtr gc, int n, xArc *arc)
 	if (!PM_IS_SOLID(drawable, gc->planemask))
 		goto fallback;
 
-	if (use_wide_spans(drawable, gc, &data.region.extents) &&
-	    (data.bo = sna_drawable_use_bo(drawable,
+	if ((data.bo = sna_drawable_use_bo(drawable,
+					   use_wide_spans(drawable, gc, &data.region.extents),
 					   &data.region.extents, &data.damage))) {
 		uint32_t color;
 
@@ -8761,8 +8706,8 @@ sna_poly_fill_polygon(DrawablePtr draw, GCPtr gc,
 	if (!PM_IS_SOLID(draw, gc->planemask))
 		goto fallback;
 
-	if (use_wide_spans(draw, gc, &data.region.extents) &&
-	    (data.bo = sna_drawable_use_bo(draw,
+	if ((data.bo = sna_drawable_use_bo(draw,
+					   use_wide_spans(draw, gc, &data.region.extents),
 					   &data.region.extents,
 					   &data.damage))) {
 		uint32_t color;
@@ -10174,12 +10119,15 @@ sna_poly_fill_rect(DrawablePtr draw, GCPtr gc, int n, xRectangle *rect)
 		}
 	}
 
+	bo = sna_drawable_use_bo(draw, true, &region.extents, &damage);
+	if (bo == NULL)
+		goto fallback;
+
 	if (gc_is_solid(gc, &color)) {
 		DBG(("%s: solid fill [%08x], testing for blt\n",
 		     __FUNCTION__, color));
 
-		if ((bo = sna_drawable_use_bo(draw, &region.extents, &damage)) &&
-		    sna_poly_fill_rect_blt(draw,
+		if (sna_poly_fill_rect_blt(draw,
 					   bo, damage,
 					   gc, color, n, rect,
 					   &region.extents, flags & 2))
@@ -10187,16 +10135,14 @@ sna_poly_fill_rect(DrawablePtr draw, GCPtr gc, int n, xRectangle *rect)
 	} else if (gc->fillStyle == FillTiled) {
 		DBG(("%s: tiled fill, testing for blt\n", __FUNCTION__));
 
-		if ((bo = sna_drawable_use_bo(draw, &region.extents, &damage)) &&
-		    sna_poly_fill_rect_tiled_blt(draw, bo, damage,
+		if (sna_poly_fill_rect_tiled_blt(draw, bo, damage,
 						 gc, n, rect,
 						 &region.extents, flags & 2))
 			return;
 	} else {
 		DBG(("%s: stippled fill, testing for blt\n", __FUNCTION__));
 
-		if ((bo = sna_drawable_use_bo(draw, &region.extents, &damage)) &&
-		    sna_poly_fill_rect_stippled_blt(draw, bo, damage,
+		if (sna_poly_fill_rect_stippled_blt(draw, bo, damage,
 						    gc, n, rect,
 						    &region.extents, flags & 2))
 			return;
@@ -10330,8 +10276,7 @@ sna_poly_fill_arc(DrawablePtr draw, GCPtr gc, int n, xArc *arc)
 	if (!PM_IS_SOLID(draw, gc->planemask))
 		goto fallback;
 
-	if (use_wide_spans(draw, gc, &data.region.extents) &&
-	    (data.bo = sna_drawable_use_bo(draw,
+	if ((data.bo = sna_drawable_use_bo(draw, true,
 					   &data.region.extents,
 					   &data.damage))) {
 		uint32_t color;
@@ -10482,7 +10427,7 @@ sna_glyph_blt(DrawablePtr drawable, GCPtr gc,
 		return false;
 	}
 
-	bo = sna_drawable_use_bo(drawable, &clip->extents, &damage);
+	bo = sna_drawable_use_bo(drawable, true, &clip->extents, &damage);
 	if (bo == NULL)
 		return false;
 
@@ -11392,7 +11337,8 @@ sna_image_glyph(DrawablePtr drawable, GCPtr gc,
 	if (!PM_IS_SOLID(drawable, gc->planemask))
 		goto fallback;
 
-	if ((bo = sna_drawable_use_bo(drawable, &region.extents, &damage)) &&
+	if ((bo = sna_drawable_use_bo(drawable, true,
+				      &region.extents, &damage)) &&
 	    sna_reversed_glyph_blt(drawable, gc, x, y, n, info, base,
 				   bo, damage, &region,
 				   gc->fgPixel, gc->bgPixel, false))
@@ -11468,7 +11414,8 @@ sna_poly_glyph(DrawablePtr drawable, GCPtr gc,
 	if (!gc_is_solid(gc, &fg))
 		goto fallback;
 
-	if ((bo = sna_drawable_use_bo(drawable, &region.extents, &damage)) &&
+	if ((bo = sna_drawable_use_bo(drawable, true,
+				      &region.extents, &damage)) &&
 	    sna_reversed_glyph_blt(drawable, gc, x, y, n, info, base,
 				   bo, damage, &region, fg, -1, true))
 		goto out;
@@ -11505,7 +11452,7 @@ sna_push_pixels_solid_blt(GCPtr gc,
 	int n;
 	uint8_t rop = copy_ROP[gc->alu];
 
-	bo = sna_drawable_use_bo(drawable, &region->extents, &damage);
+	bo = sna_drawable_use_bo(drawable, true, &region->extents, &damage);
 	if (bo == NULL)
 		return false;
 
commit f13c6376ba9e2d4a699bb19bbb8eb92e773a577a
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Apr 27 16:50:28 2012 +0100

    sna: Micro-optimise common case of checking a single fenced bo
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
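
Most call sites check exactly one bo, so they now use a direct single-bo function instead of paying for va_start/va_arg and the NULL sentinel on every call; multi-bo checks move to the new kgem_check_many_bo_fenced(). The two call-site shapes, as they appear in the diff below:

	/* single bo: no sentinel */
	if (!kgem_check_bo_fenced(&sna->kgem, bo))
		_kgem_submit(&sna->kgem);

	/* several bo: varargs variant, NULL-terminated */
	if (!kgem_check_many_bo_fenced(&sna->kgem, src, dst, NULL))
		_kgem_submit(&sna->kgem);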

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 4c4aa7c..29f0e29 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -2977,7 +2977,45 @@ bool kgem_check_bo(struct kgem *kgem, ...)
 	return true;
 }
 
-bool kgem_check_bo_fenced(struct kgem *kgem, ...)
+bool kgem_check_bo_fenced(struct kgem *kgem, struct kgem_bo *bo)
+{
+	uint32_t size;
+
+	if (bo->proxy)
+		bo = bo->proxy;
+	if (bo->exec) {
+		if (kgem->gen < 40 &&
+		    bo->tiling != I915_TILING_NONE &&
+		    (bo->exec->flags & EXEC_OBJECT_NEEDS_FENCE) == 0) {
+			if (kgem->nfence >= kgem->fence_max)
+				return false;
+
+			size = kgem->aperture_fenced;
+			size += kgem_bo_fenced_size(kgem, bo);
+			if (size > kgem->aperture_mappable)
+				return false;
+		}
+
+		return true;
+	}
+
+	if (kgem->aperture > kgem->aperture_low)
+		return false;
+
+	if (kgem->nexec >= KGEM_EXEC_SIZE(kgem) - 1)
+		return false;
+
+	if (kgem->gen < 40 &&
+	    bo->tiling != I915_TILING_NONE &&
+	    kgem->nfence >= kgem->fence_max)
+		return false;
+
+	size = kgem->aperture;
+	size += num_pages(bo);
+	return size <= kgem->aperture_high;
+}
+
+bool kgem_check_many_bo_fenced(struct kgem *kgem, ...)
 {
 	va_list ap;
 	struct kgem_bo *bo;
@@ -4165,7 +4203,7 @@ kgem_replace_bo(struct kgem *kgem,
 	kgem_set_mode(kgem, KGEM_BLT);
 	if (!kgem_check_batch(kgem, 8) ||
 	    !kgem_check_reloc(kgem, 2) ||
-	    !kgem_check_bo_fenced(kgem, src, dst, NULL)) {
+	    !kgem_check_many_bo_fenced(kgem, src, dst, NULL)) {
 		_kgem_submit(kgem);
 		_kgem_set_mode(kgem, KGEM_BLT);
 	}
diff --git a/src/sna/kgem.h b/src/sna/kgem.h
index 1235b83..4def6b1 100644
--- a/src/sna/kgem.h
+++ b/src/sna/kgem.h
@@ -344,7 +344,8 @@ static inline void kgem_advance_batch(struct kgem *kgem, int num_dwords)
 }
 
 bool kgem_check_bo(struct kgem *kgem, ...) __attribute__((sentinel(0)));
-bool kgem_check_bo_fenced(struct kgem *kgem, ...) __attribute__((sentinel(0)));
+bool kgem_check_bo_fenced(struct kgem *kgem, struct kgem_bo *bo);
+bool kgem_check_many_bo_fenced(struct kgem *kgem, ...) __attribute__((sentinel(0)));
 
 void _kgem_add_bo(struct kgem *kgem, struct kgem_bo *bo);
 static inline void kgem_add_bo(struct kgem *kgem, struct kgem_bo *bo)
diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index dc084f0..a5c1648 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -2975,7 +2975,7 @@ sna_put_xybitmap_blt(DrawablePtr drawable, GCPtr gc, RegionPtr region,
 		void *ptr;
 
 		if (!kgem_check_batch(&sna->kgem, 8) ||
-		    !kgem_check_bo_fenced(&sna->kgem, bo, NULL) ||
+		    !kgem_check_bo_fenced(&sna->kgem, bo) ||
 		    !kgem_check_reloc(&sna->kgem, 2)) {
 			_kgem_submit(&sna->kgem);
 			_kgem_set_mode(&sna->kgem, KGEM_BLT);
@@ -3103,7 +3103,7 @@ sna_put_xypixmap_blt(DrawablePtr drawable, GCPtr gc, RegionPtr region,
 			void *ptr;
 
 			if (!kgem_check_batch(&sna->kgem, 12) ||
-			    !kgem_check_bo_fenced(&sna->kgem, bo, NULL) ||
+			    !kgem_check_bo_fenced(&sna->kgem, bo) ||
 			    !kgem_check_reloc(&sna->kgem, 2)) {
 				_kgem_submit(&sna->kgem);
 				_kgem_set_mode(&sna->kgem, KGEM_BLT);
@@ -5117,7 +5117,7 @@ sna_copy_bitmap_blt(DrawablePtr _bitmap, DrawablePtr drawable, GCPtr gc,
 		if (src_stride <= 128) {
 			src_stride = ALIGN(src_stride, 8) / 4;
 			if (!kgem_check_batch(&sna->kgem, 7+src_stride) ||
-			    !kgem_check_bo_fenced(&sna->kgem, arg->bo, NULL) ||
+			    !kgem_check_bo_fenced(&sna->kgem, arg->bo) ||
 			    !kgem_check_reloc(&sna->kgem, 1)) {
 				_kgem_submit(&sna->kgem);
 				_kgem_set_mode(&sna->kgem, KGEM_BLT);
@@ -5159,7 +5159,7 @@ sna_copy_bitmap_blt(DrawablePtr _bitmap, DrawablePtr drawable, GCPtr gc,
 			void *ptr;
 
 			if (!kgem_check_batch(&sna->kgem, 8) ||
-			    !kgem_check_bo_fenced(&sna->kgem, arg->bo, NULL) ||
+			    !kgem_check_bo_fenced(&sna->kgem, arg->bo) ||
 			    !kgem_check_reloc(&sna->kgem, 2)) {
 				_kgem_submit(&sna->kgem);
 				_kgem_set_mode(&sna->kgem, KGEM_BLT);
@@ -5277,7 +5277,7 @@ sna_copy_plane_blt(DrawablePtr source, DrawablePtr drawable, GCPtr gc,
 		     sx, sy, bx1, bx2));
 
 		if (!kgem_check_batch(&sna->kgem, 8) ||
-		    !kgem_check_bo_fenced(&sna->kgem, arg->bo, NULL) ||
+		    !kgem_check_bo_fenced(&sna->kgem, arg->bo) ||
 		    !kgem_check_reloc(&sna->kgem, 2)) {
 			_kgem_submit(&sna->kgem);
 			_kgem_set_mode(&sna->kgem, KGEM_BLT);
@@ -9195,7 +9195,7 @@ sna_poly_fill_rect_stippled_8x8_blt(DrawablePtr drawable,
 			     __FUNCTION__, r->x + dx, r->y + dy, r->width, r->height));
 
 			if (!kgem_check_batch(&sna->kgem, 9) ||
-			    !kgem_check_bo_fenced(&sna->kgem, bo, NULL) ||
+			    !kgem_check_bo_fenced(&sna->kgem, bo) ||
 			    !kgem_check_reloc(&sna->kgem, 1)) {
 				_kgem_submit(&sna->kgem);
 				_kgem_set_mode(&sna->kgem, KGEM_BLT);
@@ -9243,7 +9243,7 @@ sna_poly_fill_rect_stippled_8x8_blt(DrawablePtr drawable,
 					uint32_t *b;
 
 					if (!kgem_check_batch(&sna->kgem, 9) ||
-					    !kgem_check_bo_fenced(&sna->kgem, bo, NULL) ||
+					    !kgem_check_bo_fenced(&sna->kgem, bo) ||
 					    !kgem_check_reloc(&sna->kgem, 1)) {
 						_kgem_submit(&sna->kgem);
 						_kgem_set_mode(&sna->kgem, KGEM_BLT);
@@ -9293,7 +9293,7 @@ sna_poly_fill_rect_stippled_8x8_blt(DrawablePtr drawable,
 						uint32_t *b;
 
 						if (!kgem_check_batch(&sna->kgem, 9) ||
-						    !kgem_check_bo_fenced(&sna->kgem, bo, NULL) ||
+						    !kgem_check_bo_fenced(&sna->kgem, bo) ||
 						    !kgem_check_reloc(&sna->kgem, 1)) {
 							_kgem_submit(&sna->kgem);
 							_kgem_set_mode(&sna->kgem, KGEM_BLT);
@@ -9433,7 +9433,7 @@ sna_poly_fill_rect_stippled_1_blt(DrawablePtr drawable,
 			if (src_stride <= 128) {
 				src_stride = ALIGN(src_stride, 8) / 4;
 				if (!kgem_check_batch(&sna->kgem, 7+src_stride) ||
-				    !kgem_check_bo_fenced(&sna->kgem, bo, NULL) ||
+				    !kgem_check_bo_fenced(&sna->kgem, bo) ||
 				    !kgem_check_reloc(&sna->kgem, 1)) {
 					_kgem_submit(&sna->kgem);
 					_kgem_set_mode(&sna->kgem, KGEM_BLT);
@@ -9475,7 +9475,7 @@ sna_poly_fill_rect_stippled_1_blt(DrawablePtr drawable,
 				void *ptr;
 
 				if (!kgem_check_batch(&sna->kgem, 8) ||
-				    !kgem_check_bo_fenced(&sna->kgem, bo, NULL) ||
+				    !kgem_check_bo_fenced(&sna->kgem, bo) ||
 				    !kgem_check_reloc(&sna->kgem, 2)) {
 					_kgem_submit(&sna->kgem);
 					_kgem_set_mode(&sna->kgem, KGEM_BLT);
@@ -9575,7 +9575,7 @@ sna_poly_fill_rect_stippled_1_blt(DrawablePtr drawable,
 				if (src_stride <= 128) {
 					src_stride = ALIGN(src_stride, 8) / 4;
 					if (!kgem_check_batch(&sna->kgem, 7+src_stride) ||
-					    !kgem_check_bo_fenced(&sna->kgem, bo, NULL) ||
+					    !kgem_check_bo_fenced(&sna->kgem, bo) ||
 					    !kgem_check_reloc(&sna->kgem, 1)) {
 						_kgem_submit(&sna->kgem);
 						_kgem_set_mode(&sna->kgem, KGEM_BLT);
@@ -9614,7 +9614,7 @@ sna_poly_fill_rect_stippled_1_blt(DrawablePtr drawable,
 					} while (--bh);
 				} else {
 					if (!kgem_check_batch(&sna->kgem, 8) ||
-					    !kgem_check_bo_fenced(&sna->kgem, bo, NULL) ||
+					    !kgem_check_bo_fenced(&sna->kgem, bo) ||
 					    !kgem_check_reloc(&sna->kgem, 2)) {
 						_kgem_submit(&sna->kgem);
 						_kgem_set_mode(&sna->kgem, KGEM_BLT);
@@ -9715,7 +9715,7 @@ sna_poly_fill_rect_stippled_1_blt(DrawablePtr drawable,
 					if (src_stride <= 128) {
 						src_stride = ALIGN(src_stride, 8) / 4;
 						if (!kgem_check_batch(&sna->kgem, 7+src_stride) ||
-						    !kgem_check_bo_fenced(&sna->kgem, bo, NULL) ||
+						    !kgem_check_bo_fenced(&sna->kgem, bo) ||
 						    !kgem_check_reloc(&sna->kgem, 1)) {
 							_kgem_submit(&sna->kgem);
 							_kgem_set_mode(&sna->kgem, KGEM_BLT);
@@ -9754,7 +9754,7 @@ sna_poly_fill_rect_stippled_1_blt(DrawablePtr drawable,
 						} while (--bh);
 					} else {
 						if (!kgem_check_batch(&sna->kgem, 8) ||
-						    !kgem_check_bo_fenced(&sna->kgem, bo, NULL) ||
+						    !kgem_check_bo_fenced(&sna->kgem, bo) ||
 						    !kgem_check_reloc(&sna->kgem, 2)) {
 							_kgem_submit(&sna->kgem);
 							_kgem_set_mode(&sna->kgem, KGEM_BLT);
@@ -9856,7 +9856,7 @@ sna_poly_fill_rect_stippled_n_box(struct sna *sna,
 			len = bw*bh;
 			len = ALIGN(len, 8) / 4;
 			if (!kgem_check_batch(&sna->kgem, 7+len) ||
-			    !kgem_check_bo_fenced(&sna->kgem, bo, NULL) ||
+			    !kgem_check_bo_fenced(&sna->kgem, bo) ||
 			    !kgem_check_reloc(&sna->kgem, 1)) {
 				_kgem_submit(&sna->kgem);
 				_kgem_set_mode(&sna->kgem, KGEM_BLT);
@@ -10512,7 +10512,7 @@ sna_glyph_blt(DrawablePtr drawable, GCPtr gc,
 
 	kgem_set_mode(&sna->kgem, KGEM_BLT);
 	if (!kgem_check_batch(&sna->kgem, 16) ||
-	    !kgem_check_bo_fenced(&sna->kgem, bo, NULL) ||
+	    !kgem_check_bo_fenced(&sna->kgem, bo) ||
 	    !kgem_check_reloc(&sna->kgem, 1)) {
 		_kgem_submit(&sna->kgem);
 		_kgem_set_mode(&sna->kgem, KGEM_BLT);
@@ -11180,7 +11180,7 @@ sna_reversed_glyph_blt(DrawablePtr drawable, GCPtr gc,
 
 	kgem_set_mode(&sna->kgem, KGEM_BLT);
 	if (!kgem_check_batch(&sna->kgem, 16) ||
-	    !kgem_check_bo_fenced(&sna->kgem, bo, NULL) ||
+	    !kgem_check_bo_fenced(&sna->kgem, bo) ||
 	    !kgem_check_reloc(&sna->kgem, 1)) {
 		_kgem_submit(&sna->kgem);
 		_kgem_set_mode(&sna->kgem, KGEM_BLT);
@@ -11549,7 +11549,7 @@ sna_push_pixels_solid_blt(GCPtr gc,
 		void *ptr;
 
 		if (!kgem_check_batch(&sna->kgem, 8) ||
-		    !kgem_check_bo_fenced(&sna->kgem, bo, NULL) ||
+		    !kgem_check_bo_fenced(&sna->kgem, bo) ||
 		    !kgem_check_reloc(&sna->kgem, 2)) {
 			_kgem_submit(&sna->kgem);
 			_kgem_set_mode(&sna->kgem, KGEM_BLT);
diff --git a/src/sna/sna_blt.c b/src/sna/sna_blt.c
index a81a145..82c61df 100644
--- a/src/sna/sna_blt.c
+++ b/src/sna/sna_blt.c
@@ -151,10 +151,10 @@ static bool sna_blt_fill_init(struct sna *sna,
 	blt->bpp = bpp;
 
 	kgem_set_mode(kgem, KGEM_BLT);
-	if (!kgem_check_bo_fenced(kgem, bo, NULL) ||
+	if (!kgem_check_bo_fenced(kgem, bo) ||
 	    !kgem_check_batch(kgem, 12)) {
 		_kgem_submit(kgem);
-		assert(kgem_check_bo_fenced(kgem, bo, NULL));
+		assert(kgem_check_bo_fenced(kgem, bo));
 		_kgem_set_mode(kgem, KGEM_BLT);
 	}
 
@@ -293,9 +293,9 @@ static Bool sna_blt_copy_init(struct sna *sna,
 	}
 
 	kgem_set_mode(kgem, KGEM_BLT);
-	if (!kgem_check_bo_fenced(kgem, src, dst, NULL)) {
+	if (!kgem_check_many_bo_fenced(kgem, src, dst, NULL)) {
 		_kgem_submit(kgem);
-		if (!kgem_check_bo_fenced(kgem, src, dst, NULL))
+		if (!kgem_check_many_bo_fenced(kgem, src, dst, NULL))
 			return FALSE;
 		_kgem_set_mode(kgem, KGEM_BLT);
 	}
@@ -345,9 +345,9 @@ static Bool sna_blt_alpha_fixup_init(struct sna *sna,
 	blt->pixel = alpha;
 
 	kgem_set_mode(kgem, KGEM_BLT);
-	if (!kgem_check_bo_fenced(kgem, src, dst, NULL)) {
+	if (!kgem_check_many_bo_fenced(kgem, src, dst, NULL)) {
 		_kgem_submit(kgem);
-		if (!kgem_check_bo_fenced(kgem, src, dst, NULL))
+		if (!kgem_check_many_bo_fenced(kgem, src, dst, NULL))
 			return FALSE;
 		_kgem_set_mode(kgem, KGEM_BLT);
 	}
@@ -1103,10 +1103,10 @@ prepare_blt_copy(struct sna *sna,
 	if (!kgem_bo_can_blt(&sna->kgem, priv->gpu_bo))
 		return FALSE;
 
-	if (!kgem_check_bo_fenced(&sna->kgem, op->dst.bo, priv->gpu_bo, NULL)) {
+	if (!kgem_check_many_bo_fenced(&sna->kgem, op->dst.bo, priv->gpu_bo, NULL)) {
 		_kgem_submit(&sna->kgem);
-		if (!kgem_check_bo_fenced(&sna->kgem,
-					  op->dst.bo, priv->gpu_bo, NULL))
+		if (!kgem_check_many_bo_fenced(&sna->kgem,
+					       op->dst.bo, priv->gpu_bo, NULL))
 			return FALSE;
 		_kgem_set_mode(&sna->kgem, KGEM_BLT);
 	}
@@ -1577,9 +1577,9 @@ sna_blt_composite(struct sna *sna,
 	if (width && height)
 		reduce_damage(tmp, dst_x, dst_y, width, height);
 
-	if (!kgem_check_bo_fenced(&sna->kgem, priv->gpu_bo, NULL)) {
+	if (!kgem_check_bo_fenced(&sna->kgem, priv->gpu_bo)) {
 		_kgem_submit(&sna->kgem);
-		assert(kgem_check_bo_fenced(&sna->kgem, priv->gpu_bo, NULL));
+		assert(kgem_check_bo_fenced(&sna->kgem, priv->gpu_bo));
 		_kgem_set_mode(&sna->kgem, KGEM_BLT);
 	}
 
@@ -1884,9 +1884,9 @@ static bool sna_blt_fill_box(struct sna *sna, uint8_t alu,
 	kgem_set_mode(kgem, KGEM_BLT);
 	if (!kgem_check_batch(kgem, 6) ||
 	    !kgem_check_reloc(kgem, 1) ||
-	    !kgem_check_bo_fenced(kgem, bo, NULL)) {
+	    !kgem_check_bo_fenced(kgem, bo)) {
 		_kgem_submit(kgem);
-		assert(kgem_check_bo_fenced(&sna->kgem, bo, NULL));
+		assert(kgem_check_bo_fenced(&sna->kgem, bo));
 		_kgem_set_mode(kgem, KGEM_BLT);
 	}
 
@@ -1957,10 +1957,10 @@ Bool sna_blt_fill_boxes(struct sna *sna, uint8_t alu,
 	}
 
 	kgem_set_mode(kgem, KGEM_BLT);
-	if (!kgem_check_bo_fenced(kgem, bo, NULL) ||
+	if (!kgem_check_bo_fenced(kgem, bo) ||
 	    !kgem_check_batch(kgem, 12)) {
 		_kgem_submit(kgem);
-		assert(kgem_check_bo_fenced(&sna->kgem, bo, NULL));
+		assert(kgem_check_bo_fenced(&sna->kgem, bo));
 		_kgem_set_mode(kgem, KGEM_BLT);
 	}
 
@@ -2122,9 +2122,9 @@ Bool sna_blt_copy_boxes(struct sna *sna, uint8_t alu,
 	kgem_set_mode(kgem, KGEM_BLT);
 	if (!kgem_check_batch(kgem, 8) ||
 	    !kgem_check_reloc(kgem, 2) ||
-	    !kgem_check_bo_fenced(kgem, dst_bo, src_bo, NULL)) {
+	    !kgem_check_many_bo_fenced(kgem, dst_bo, src_bo, NULL)) {
 		_kgem_submit(kgem);
-		if (!kgem_check_bo_fenced(kgem, dst_bo, src_bo, NULL))
+		if (!kgem_check_many_bo_fenced(kgem, dst_bo, src_bo, NULL))
 			return sna_tiling_blt_copy_boxes(sna, alu,
 							 src_bo, src_dx, src_dy,
 							 dst_bo, dst_dx, dst_dy,
diff --git a/src/sna/sna_io.c b/src/sna/sna_io.c
index 02a5c75..2539518 100644
--- a/src/sna/sna_io.c
+++ b/src/sna/sna_io.c
@@ -360,7 +360,7 @@ fallback:
 	if (kgem->nexec + 2 > KGEM_EXEC_SIZE(kgem) ||
 	    kgem->nreloc + 2 > KGEM_RELOC_SIZE(kgem) ||
 	    !kgem_check_batch(kgem, 8) ||
-	    !kgem_check_bo_fenced(kgem, dst_bo, src_bo, NULL)) {
+	    !kgem_check_many_bo_fenced(kgem, dst_bo, src_bo, NULL)) {
 		_kgem_submit(kgem);
 		_kgem_set_mode(kgem, KGEM_BLT);
 	}
@@ -732,7 +732,7 @@ tile:
 	if (kgem->nexec + 2 > KGEM_EXEC_SIZE(kgem) ||
 	    kgem->nreloc + 2 > KGEM_RELOC_SIZE(kgem) ||
 	    !kgem_check_batch(kgem, 8) ||
-	    !kgem_check_bo_fenced(kgem, dst_bo, NULL)) {
+	    !kgem_check_bo_fenced(kgem, dst_bo)) {
 		_kgem_submit(kgem);
 		_kgem_set_mode(kgem, KGEM_BLT);
 	}
@@ -969,7 +969,7 @@ fallback:
 	if (kgem->nexec + 2 > KGEM_EXEC_SIZE(kgem) ||
 	    kgem->nreloc + 2 > KGEM_RELOC_SIZE(kgem) ||
 	    !kgem_check_batch(kgem, 8) ||
-	    !kgem_check_bo_fenced(kgem, dst_bo, NULL)) {
+	    !kgem_check_bo_fenced(kgem, dst_bo)) {
 		_kgem_submit(kgem);
 		_kgem_set_mode(kgem, KGEM_BLT);
 	}
commit 7a2a2ab3c7f07db101e9c74582d6feec1eefdd1c
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Apr 25 17:15:37 2012 +0100

    sna: Fixup broken assertion
    
    It is valid for the cpu_bo to be NULL, as we may be choosing to free the
    large shadow pixel buffer instead.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 382c671..dc084f0 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -2256,7 +2256,7 @@ sna_pixmap_force_to_gpu(PixmapPtr pixmap, unsigned flags)
 		sna_damage_destroy(&priv->cpu_damage);
 		priv->undamaged = false;
 		list_del(&priv->list);
-		assert(!priv->cpu_bo->sync);
+		assert(priv->cpu_bo == NULL || !priv->cpu_bo->sync);
 		sna_pixmap_free_cpu(to_sna_from_pixmap(pixmap), priv);
 	}
 
commit 2d684d5e7f8c442b577352cf2e75c9abbcc27423
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Apr 25 16:04:33 2012 +0100

    sna/gen7: Add CS stall before changing WM binding table
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
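
    The key change: the emit function now reports whether it actually wrote
    state, so the caller knows when a CS stall is still owed. A minimal
    sketch of that idiom (hypothetical names), which
    gen7_emit_binding_table() below follows:

        #include <stdbool.h>
        #include <stdint.h>

        struct state_cache { uint16_t last_offset; };

        extern void program_binding_table(uint16_t offset); /* hypothetical */

        static bool emit_cached(struct state_cache *cache, uint16_t offset)
        {
                if (cache->last_offset == offset)
                        return false;          /* redundant: nothing emitted */

                program_binding_table(offset); /* writes into the batch */
                cache->last_offset = offset;
                return true;                   /* caller may need a CS stall */
        }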

diff --git a/src/sna/gen7_render.c b/src/sna/gen7_render.c
index e2741c4..2228873 100644
--- a/src/sna/gen7_render.c
+++ b/src/sna/gen7_render.c
@@ -837,11 +837,11 @@ gen7_emit_wm(struct sna *sna, unsigned int kernel, int nr_surfaces, int nr_input
 	OUT_BATCH(0); /* kernel 2 */
 }
 
-static void
+static bool
 gen7_emit_binding_table(struct sna *sna, uint16_t offset)
 {
 	if (sna->render_state.gen7.surface_table == offset)
-		return;
+		return false;
 
 	/* Binding table pointers */
 	assert(is_aligned(4*offset, 32));
@@ -849,6 +849,7 @@ gen7_emit_binding_table(struct sna *sna, uint16_t offset)
 	OUT_BATCH(offset*4);
 
 	sna->render_state.gen7.surface_table = offset;
+	return true;
 }
 
 static void
@@ -970,11 +971,7 @@ gen7_emit_state(struct sna *sna,
 		const struct sna_composite_op *op,
 		uint16_t wm_binding_table)
 {
-	if (kgem_bo_is_dirty(op->src.bo) || kgem_bo_is_dirty(op->mask.bo)) {
-		gen7_emit_flush(sna);
-		kgem_clear_dirty(&sna->kgem);
-		kgem_bo_mark_dirty(op->dst.bo);
-	}
+	bool need_stall = false;
 
 	gen7_emit_cc(sna,
 		     gen7_get_blend(op->op,
@@ -1001,8 +998,22 @@ gen7_emit_state(struct sna *sna,
 		     op->u.gen7.nr_inputs);
 	gen7_emit_vertex_elements(sna, op);
 
-	gen7_emit_binding_table(sna, wm_binding_table);
+	need_stall |= gen7_emit_binding_table(sna, wm_binding_table);
 	gen7_emit_drawing_rectangle(sna, op);
+
+	if (kgem_bo_is_dirty(op->src.bo) || kgem_bo_is_dirty(op->mask.bo)) {
+		gen7_emit_flush(sna);
+		kgem_clear_dirty(&sna->kgem);
+		kgem_bo_mark_dirty(op->dst.bo);
+		need_stall = false;
+	}
+	if (need_stall) {
+		OUT_BATCH(GEN7_PIPE_CONTROL | (4 - 2));
+		OUT_BATCH(GEN7_PIPE_CONTROL_CS_STALL |
+			  GEN7_PIPE_CONTROL_STALL_AT_SCOREBOARD);
+		OUT_BATCH(0);
+		OUT_BATCH(0);
+	}
 }
 
 static void gen7_magic_ca_pass(struct sna *sna,
commit 1fbc3fa0d9c0430f9680f63f896244d6e59dddb2
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Apr 25 15:04:01 2012 +0100

    sna/gen7: Apply more recent improvements from SNB perf tuning
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
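
    One of the imported tunings is a fast path for axis-aligned affine
    sources: when matrix[0][1] == matrix[1][0] == 0, each texture
    coordinate depends on a single input axis, so the per-vertex transform
    call collapses to a scale and offset. A minimal sketch of the
    arithmetic that gen7_emit_composite_primitive_simple_source() inlines
    (names hypothetical):

        #include <stdint.h>

        /* matrix row 0 involves only x; row 1 involves only y */
        static float tex_u(int16_t x, float xx, float x0, float sx)
        {
                return (x * xx + x0) * sx;
        }

        static float tex_v(int16_t y, float yy, float y0, float sy)
        {
                return (y * yy + y0) * sy;
        }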

diff --git a/src/sna/gen7_render.c b/src/sna/gen7_render.c
index 7dff02f..e2741c4 100644
--- a/src/sna/gen7_render.c
+++ b/src/sna/gen7_render.c
@@ -57,6 +57,8 @@
 #define NO_FILL_BOXES 0
 #define NO_CLEAR 0
 
+#define NO_RING_SWITCH 1
+
 #define GEN7_MAX_SIZE 16384
 
 /* XXX Todo
@@ -967,7 +969,6 @@ static void
 gen7_emit_state(struct sna *sna,
 		const struct sna_composite_op *op,
 		uint16_t wm_binding_table)
-
 {
 	if (kgem_bo_is_dirty(op->src.bo) || kgem_bo_is_dirty(op->mask.bo)) {
 		gen7_emit_flush(sna);
@@ -1353,18 +1354,13 @@ gen7_emit_composite_primitive_solid(struct sna *sna,
 	dst.p.x = r->dst.x + r->width;
 	dst.p.y = r->dst.y + r->height;
 	v[0] = dst.f;
-	v[1] = 1.;
-	v[2] = 1.;
-
 	dst.p.x = r->dst.x;
 	v[3] = dst.f;
-	v[4] = 0.;
-	v[5] = 1.;
-
 	dst.p.y = r->dst.y;
 	v[6] = dst.f;
-	v[7] = 0.;
-	v[8] = 0.;
+
+	v[5] = v[2] = v[1] = 1.;
+	v[8] = v[7] = v[4] = 0.;
 }
 
 fastcall static void
@@ -1397,6 +1393,44 @@ gen7_emit_composite_primitive_identity_source(struct sna *sna,
 }
 
 fastcall static void
+gen7_emit_composite_primitive_simple_source(struct sna *sna,
+					    const struct sna_composite_op *op,
+					    const struct sna_composite_rectangles *r)
+{
+	float *v;
+	union {
+		struct sna_coordinate p;
+		float f;
+	} dst;
+
+	float xx = op->src.transform->matrix[0][0];
+	float x0 = op->src.transform->matrix[0][2];
+	float yy = op->src.transform->matrix[1][1];
+	float y0 = op->src.transform->matrix[1][2];
+	float sx = op->src.scale[0];
+	float sy = op->src.scale[1];
+	int16_t tx = op->src.offset[0];
+	int16_t ty = op->src.offset[1];
+
+	v = sna->render.vertices + sna->render.vertex_used;
+	sna->render.vertex_used += 3*3;
+
+	dst.p.x = r->dst.x + r->width;
+	dst.p.y = r->dst.y + r->height;
+	v[0] = dst.f;
+	v[1] = ((r->src.x + r->width + tx) * xx + x0) * sx;
+	v[5] = v[2] = ((r->src.y + r->height + ty) * yy + y0) * sy;
+
+	dst.p.x = r->dst.x;
+	v[3] = dst.f;
+	v[7] = v[4] = ((r->src.x + tx) * xx + x0) * sx;
+
+	dst.p.y = r->dst.y;
+	v[6] = dst.f;
+	v[8] = ((r->src.y + ty) * yy + y0) * sy;
+}
+
+fastcall static void
 gen7_emit_composite_primitive_affine_source(struct sna *sna,
 					    const struct sna_composite_op *op,
 					    const struct sna_composite_rectangles *r)
@@ -1486,141 +1520,63 @@ gen7_emit_composite_primitive_identity_source_mask(struct sna *sna,
 	v[14] = msk_y * op->mask.scale[1];
 }
 
-fastcall static void
-gen7_emit_composite_primitive(struct sna *sna,
-			      const struct sna_composite_op *op,
-			      const struct sna_composite_rectangles *r)
+inline static void
+gen7_emit_composite_texcoord(struct sna *sna,
+			     const struct sna_composite_channel *channel,
+			     int16_t x, int16_t y)
 {
-	float src_x[3], src_y[3], src_w[3], mask_x[3], mask_y[3], mask_w[3];
-	Bool is_affine = op->is_affine;
-	const float *src_sf = op->src.scale;
-	const float *mask_sf = op->mask.scale;
-
-	if (is_affine) {
-		sna_get_transformed_coordinates(r->src.x + op->src.offset[0],
-						r->src.y + op->src.offset[1],
-						op->src.transform,
-						&src_x[0],
-						&src_y[0]);
-
-		sna_get_transformed_coordinates(r->src.x + op->src.offset[0],
-						r->src.y + op->src.offset[1] + r->height,
-						op->src.transform,
-						&src_x[1],
-						&src_y[1]);
-
-		sna_get_transformed_coordinates(r->src.x + op->src.offset[0] + r->width,
-						r->src.y + op->src.offset[1] + r->height,
-						op->src.transform,
-						&src_x[2],
-						&src_y[2]);
-	} else {
-		if (!sna_get_transformed_coordinates_3d(r->src.x + op->src.offset[0],
-							r->src.y + op->src.offset[1],
-							op->src.transform,
-							&src_x[0],
-							&src_y[0],
-							&src_w[0]))
-			return;
-
-		if (!sna_get_transformed_coordinates_3d(r->src.x + op->src.offset[0],
-							r->src.y + op->src.offset[1] + r->height,
-							op->src.transform,
-							&src_x[1],
-							&src_y[1],
-							&src_w[1]))
-			return;
-
-		if (!sna_get_transformed_coordinates_3d(r->src.x + op->src.offset[0] + r->width,
-							r->src.y + op->src.offset[1] + r->height,
-							op->src.transform,
-							&src_x[2],
-							&src_y[2],
-							&src_w[2]))
-			return;
-	}
+	x += channel->offset[0];
+	y += channel->offset[1];
 
-	if (op->mask.bo) {
-		if (is_affine) {
-			sna_get_transformed_coordinates(r->mask.x + op->mask.offset[0],
-							r->mask.y + op->mask.offset[1],
-							op->mask.transform,
-							&mask_x[0],
-							&mask_y[0]);
-
-			sna_get_transformed_coordinates(r->mask.x + op->mask.offset[0],
-							r->mask.y + op->mask.offset[1] + r->height,
-							op->mask.transform,
-							&mask_x[1],
-							&mask_y[1]);
-
-			sna_get_transformed_coordinates(r->mask.x + op->mask.offset[0] + r->width,
-							r->mask.y + op->mask.offset[1] + r->height,
-							op->mask.transform,
-							&mask_x[2],
-							&mask_y[2]);
-		} else {
-			if (!sna_get_transformed_coordinates_3d(r->mask.x + op->mask.offset[0],
-								r->mask.y + op->mask.offset[1],
-								op->mask.transform,
-								&mask_x[0],
-								&mask_y[0],
-								&mask_w[0]))
-				return;
-
-			if (!sna_get_transformed_coordinates_3d(r->mask.x + op->mask.offset[0],
-								r->mask.y + op->mask.offset[1] + r->height,
-								op->mask.transform,
-								&mask_x[1],
-								&mask_y[1],
-								&mask_w[1]))
-				return;
-
-			if (!sna_get_transformed_coordinates_3d(r->mask.x + op->mask.offset[0] + r->width,
-								r->mask.y + op->mask.offset[1] + r->height,
-								op->mask.transform,
-								&mask_x[2],
-								&mask_y[2],
-								&mask_w[2]))
-				return;
-		}
-	}
+	if (channel->is_affine) {
+		float s, t;
 
-	OUT_VERTEX(r->dst.x + r->width, r->dst.y + r->height);
-	OUT_VERTEX_F(src_x[2] * src_sf[0]);
-	OUT_VERTEX_F(src_y[2] * src_sf[1]);
-	if (!is_affine)
-		OUT_VERTEX_F(src_w[2]);
-	if (op->mask.bo) {
-		OUT_VERTEX_F(mask_x[2] * mask_sf[0]);
-		OUT_VERTEX_F(mask_y[2] * mask_sf[1]);
-		if (!is_affine)
-			OUT_VERTEX_F(mask_w[2]);
-	}
+		sna_get_transformed_coordinates(x, y,
+						channel->transform,
+						&s, &t);
+		OUT_VERTEX_F(s * channel->scale[0]);
+		OUT_VERTEX_F(t * channel->scale[1]);
+	} else {
+		float s, t, w;
 
-	OUT_VERTEX(r->dst.x, r->dst.y + r->height);
-	OUT_VERTEX_F(src_x[1] * src_sf[0]);
-	OUT_VERTEX_F(src_y[1] * src_sf[1]);
-	if (!is_affine)
-		OUT_VERTEX_F(src_w[1]);
-	if (op->mask.bo) {
-		OUT_VERTEX_F(mask_x[1] * mask_sf[0]);
-		OUT_VERTEX_F(mask_y[1] * mask_sf[1]);
-		if (!is_affine)
-			OUT_VERTEX_F(mask_w[1]);
+		sna_get_transformed_coordinates_3d(x, y,
+						   channel->transform,
+						   &s, &t, &w);
+		OUT_VERTEX_F(s * channel->scale[0]);
+		OUT_VERTEX_F(t * channel->scale[1]);
+		OUT_VERTEX_F(w);
 	}
+}
 
-	OUT_VERTEX(r->dst.x, r->dst.y);
-	OUT_VERTEX_F(src_x[0] * src_sf[0]);
-	OUT_VERTEX_F(src_y[0] * src_sf[1]);
-	if (!is_affine)
-		OUT_VERTEX_F(src_w[0]);
-	if (op->mask.bo) {
-		OUT_VERTEX_F(mask_x[0] * mask_sf[0]);
-		OUT_VERTEX_F(mask_y[0] * mask_sf[1]);
-		if (!is_affine)
-			OUT_VERTEX_F(mask_w[0]);
-	}
+static void
+gen7_emit_composite_vertex(struct sna *sna,
+			   const struct sna_composite_op *op,
+			   int16_t srcX, int16_t srcY,
+			   int16_t mskX, int16_t mskY,
+			   int16_t dstX, int16_t dstY)
+{
+	OUT_VERTEX(dstX, dstY);
+	gen7_emit_composite_texcoord(sna, &op->src, srcX, srcY);
+	gen7_emit_composite_texcoord(sna, &op->mask, mskX, mskY);
+}
+
+fastcall static void
+gen7_emit_composite_primitive(struct sna *sna,
+			      const struct sna_composite_op *op,
+			      const struct sna_composite_rectangles *r)
+{
+	gen7_emit_composite_vertex(sna, op,
+				   r->src.x + r->width,  r->src.y + r->height,
+				   r->mask.x + r->width, r->mask.y + r->height,
+				   r->dst.x + r->width, r->dst.y + r->height);
+	gen7_emit_composite_vertex(sna, op,
+				   r->src.x,  r->src.y + r->height,
+				   r->mask.x, r->mask.y + r->height,
+				   r->dst.x,  r->dst.y + r->height);
+	gen7_emit_composite_vertex(sna, op,
+				   r->src.x,  r->src.y,
+				   r->mask.x, r->mask.y,
+				   r->dst.x,  r->dst.y);
 }
 
 static void gen7_emit_vertex_buffer(struct sna *sna,
@@ -2064,7 +2020,7 @@ gen7_render_video(struct sna *sna,
 	tmp.dst.pixmap = pixmap;
 	tmp.dst.width  = pixmap->drawable.width;
 	tmp.dst.height = pixmap->drawable.height;
-	tmp.dst.format = sna_format_for_depth(pixmap->drawable.depth);
+	tmp.dst.format = sna_render_format_for_depth(pixmap->drawable.depth);
 	tmp.dst.bo = priv->gpu_bo;
 
 	tmp.src.bo = frame->bo;
@@ -2163,6 +2119,7 @@ gen7_composite_solid_init(struct sna *sna,
 	channel->repeat = RepeatNormal;
 	channel->is_affine = TRUE;
 	channel->is_solid  = TRUE;
+	channel->is_opaque = (color >> 24) == 0xff;
 	channel->transform = NULL;
 	channel->width  = 1;
 	channel->height = 1;
@@ -2366,7 +2323,7 @@ gen7_composite_picture(struct sna *sna,
 		channel->transform = picture->transform;
 
 	channel->card_format = gen7_get_card_format(picture->format);
-	if (channel->card_format == -1)
+	if (channel->card_format == (unsigned)-1)
 		return sna_render_picture_convert(sna, picture, channel, pixmap,
 						  x, y, w, h, dst_x, dst_y);
 
@@ -2411,9 +2368,6 @@ gen7_composite_set_target(struct sna *sna, struct sna_composite_op *op, PictureP
 {
 	struct sna_pixmap *priv;
 
-	if (!gen7_check_dst_format(dst->format))
-		return FALSE;
-
 	op->dst.pixmap = get_drawable_pixmap(dst->pDrawable);
 	op->dst.width  = op->dst.pixmap->drawable.width;
 	op->dst.height = op->dst.pixmap->drawable.height;
@@ -2453,12 +2407,22 @@ gen7_composite_set_target(struct sna *sna, struct sna_composite_op *op, PictureP
 	return TRUE;
 }
 
+static bool prefer_blt_ring(struct sna *sna)
+{
+	return sna->kgem.ring != KGEM_RENDER;
+}
+
+static bool can_switch_rings(struct sna *sna)
+{
+	return sna->kgem.has_semaphores && !NO_RING_SWITCH;
+}
+
 static Bool
 try_blt(struct sna *sna,
 	PicturePtr dst, PicturePtr src,
 	int width, int height)
 {
-	if (sna->kgem.ring != KGEM_RENDER) {
+	if (prefer_blt_ring(sna)) {
 		DBG(("%s: already performing BLT\n", __FUNCTION__));
 		return TRUE;
 	}
@@ -2484,11 +2448,16 @@ try_blt(struct sna *sna,
 		return TRUE;
 	}
 
+	if (can_switch_rings(sna)) {
+		if (sna_picture_is_solid(src, NULL))
+			return TRUE;
+	}
+
 	return FALSE;
 }
 
 static bool
-is_gradient(PicturePtr picture)
+check_gradient(PicturePtr picture)
 {
 	if (picture->pDrawable)
 		return FALSE;
@@ -2542,9 +2511,10 @@ source_fallback(PicturePtr p, PixmapPtr pixmap)
 	if (sna_picture_is_solid(p, NULL))
 		return false;
 
-	if (is_gradient(p) ||
-	    !gen7_check_repeat(p) ||
-	    !gen7_check_format(p->format))
+	if (p->pSourcePict)
+		return check_gradient(p);
+
+	if (!gen7_check_repeat(p) || !gen7_check_format(p->format))
 		return true;
 
 	if (pixmap && source_is_busy(pixmap))
@@ -2578,7 +2548,7 @@ gen7_composite_fallback(struct sna *sna,
 
 	if (mask) {
 		mask_pixmap = mask->pDrawable ? get_drawable_pixmap(mask->pDrawable) : NULL;
-		mask_fallback = source_fallback(src, mask_pixmap);
+		mask_fallback = source_fallback(mask, mask_pixmap);
 	} else {
 		mask_pixmap = NULL;
 		mask_fallback = false;
@@ -2743,6 +2713,8 @@ gen7_render_composite(struct sna *sna,
 					    width, height,
 					    tmp);
 
+	if (op == PictOpClear)
+		op = PictOpSrc;
 	tmp->op = op;
 	if (!gen7_composite_set_target(sna, tmp, dst))
 		return FALSE;
@@ -2842,12 +2814,21 @@ gen7_render_composite(struct sna *sna,
 
 		tmp->floats_per_vertex = 5 + 2 * !tmp->is_affine;
 	} else {
-		if (tmp->src.is_solid)
+		if (tmp->src.is_solid) {
 			tmp->prim_emit = gen7_emit_composite_primitive_solid;
-		else if (tmp->src.transform == NULL)
+			if (tmp->src.is_opaque && op == PictOpOver)
+				tmp->op = PictOpSrc;
+		} else if (tmp->src.transform == NULL)
 			tmp->prim_emit = gen7_emit_composite_primitive_identity_source;
-		else if (tmp->src.is_affine)
-			tmp->prim_emit = gen7_emit_composite_primitive_affine_source;
+		else if (tmp->src.is_affine) {
+			if (tmp->src.transform->matrix[0][1] == 0 &&
+			    tmp->src.transform->matrix[1][0] == 0) {
+				tmp->src.scale[0] /= tmp->src.transform->matrix[2][2];
+				tmp->src.scale[1] /= tmp->src.transform->matrix[2][2];
+				tmp->prim_emit = gen7_emit_composite_primitive_simple_source;
+			} else
+				tmp->prim_emit = gen7_emit_composite_primitive_affine_source;
+		}
 
 		tmp->floats_per_vertex = 3 + !tmp->is_affine;
 	}
@@ -2920,32 +2901,6 @@ gen7_composite_alpha_gradient_init(struct sna *sna,
 }
 
 inline static void
-gen7_emit_composite_texcoord(struct sna *sna,
-			     const struct sna_composite_channel *channel,
-			     int16_t x, int16_t y)
-{
-	float t[3];
-
-	if (channel->is_affine) {
-		sna_get_transformed_coordinates(x + channel->offset[0],
-						y + channel->offset[1],
-						channel->transform,
-						&t[0], &t[1]);
-		OUT_VERTEX_F(t[0] * channel->scale[0]);
-		OUT_VERTEX_F(t[1] * channel->scale[1]);
-	} else {
-		t[0] = t[1] = 0; t[2] = 1;
-		sna_get_transformed_coordinates_3d(x + channel->offset[0],
-						   y + channel->offset[1],
-						   channel->transform,
-						   &t[0], &t[1], &t[2]);
-		OUT_VERTEX_F(t[0] * channel->scale[0]);
-		OUT_VERTEX_F(t[1] * channel->scale[1]);
-		OUT_VERTEX_F(t[2]);
-	}
-}
-
-inline static void
 gen7_emit_composite_texcoord_affine(struct sna *sna,
 				    const struct sna_composite_channel *channel,
 				    int16_t x, int16_t y)
@@ -3344,7 +3299,7 @@ static inline bool prefer_blt_copy(struct sna *sna,
 				   PixmapPtr src, struct kgem_bo *src_bo,
 				   PixmapPtr dst, struct kgem_bo *dst_bo)
 {
-	return (sna->kgem.ring != KGEM_RENDER ||
+	return (sna->kgem.ring == KGEM_BLT ||
 		prefer_blt_bo(sna, src, src_bo) ||
 		prefer_blt_bo(sna, dst, dst_bo));
 }
@@ -3667,7 +3622,7 @@ fallback:
 	if (!gen7_check_format(op->base.src.pict_format))
 		goto fallback;
 
-	op->base.op = alu == GXcopy ? PictOpSrc : PictOpClear;
+	op->base.op = PictOpSrc;
 
 	op->base.dst.pixmap = dst;
 	op->base.dst.width  = dst->drawable.width;
@@ -3751,7 +3706,9 @@ gen7_emit_fill_state(struct sna *sna, const struct sna_composite_op *op)
 static inline bool prefer_blt_fill(struct sna *sna,
 				   struct kgem_bo *bo)
 {
-	return sna->kgem.ring != KGEM_RENDER || untiled_tlb_miss(bo);
+	return (can_switch_rings(sna) ||
+		prefer_blt_ring(sna) ||
+		untiled_tlb_miss(bo));
 }
 
 static Bool
@@ -3810,9 +3767,10 @@ gen7_render_fill_boxes(struct sna *sna,
 	return FALSE;
 #endif
 
-	if (op == PictOpClear)
+	if (op == PictOpClear) {
 		pixel = 0;
-	else if (!sna_get_pixel_from_rgba(&pixel,
+		op = PictOpSrc;
+	} else if (!sna_get_pixel_from_rgba(&pixel,
 				     color->red,
 				     color->green,
 				     color->blue,
@@ -3824,8 +3782,6 @@ gen7_render_fill_boxes(struct sna *sna,
 	     __FUNCTION__, pixel, n,
 	     box[0].x1, box[0].y1, box[0].x2, box[0].y2));
 
-	memset(&tmp, 0, sizeof(tmp));
-
 	tmp.op = op;
 
 	tmp.dst.pixmap = dst;
@@ -3833,14 +3789,21 @@ gen7_render_fill_boxes(struct sna *sna,
 	tmp.dst.height = dst->drawable.height;
 	tmp.dst.format = format;
 	tmp.dst.bo = dst_bo;
+	tmp.dst.x = tmp.dst.y = 0;
 
 	tmp.src.bo = sna_render_get_solid(sna, pixel);
 	tmp.src.filter = SAMPLER_FILTER_NEAREST;
 	tmp.src.repeat = SAMPLER_EXTEND_REPEAT;
 
+	tmp.mask.bo = NULL;
+	tmp.mask.filter = SAMPLER_FILTER_NEAREST;
+	tmp.mask.repeat = SAMPLER_EXTEND_NONE;
+
 	tmp.is_affine = TRUE;
 	tmp.floats_per_vertex = 3;
 	tmp.floats_per_rect = 9;
+	tmp.has_component_alpha = FALSE;
+	tmp.need_magic_ca_pass = FALSE;
 
 	tmp.u.gen7.wm_kernel = GEN7_WM_KERNEL_NOMASK;
 	tmp.u.gen7.nr_surfaces = 2;
@@ -4004,7 +3967,7 @@ gen7_render_fill(struct sna *sna, uint8_t alu,
 	if (alu == GXclear)
 		color = 0;
 
-	op->base.op = color == 0 ? PictOpClear : PictOpSrc;
+	op->base.op = PictOpSrc;
 
 	op->base.dst.pixmap = dst;
 	op->base.dst.width  = dst->drawable.width;
@@ -4097,7 +4060,7 @@ gen7_render_fill_one(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo,
 	if (alu == GXclear)
 		color = 0;
 
-	tmp.op = color == 0 ? PictOpClear : PictOpSrc;
+	tmp.op = PictOpSrc;
 
 	tmp.dst.pixmap = dst;
 	tmp.dst.width  = dst->drawable.width;
@@ -4195,7 +4158,7 @@ gen7_render_clear(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo)
 	if (too_large(dst->drawable.width, dst->drawable.height))
 		return gen7_render_clear_try_blt(sna, dst, bo);
 
-	tmp.op = PictOpClear;
+	tmp.op = PictOpSrc;
 
 	tmp.dst.pixmap = dst;
 	tmp.dst.width  = dst->drawable.width;
commit 124c11f90766395bfcb65e13bd49bddd06a05180
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Apr 25 11:09:35 2012 +0100

    sna: Do not automagically convert GTT mappings on untiled scanout to CPU
    
    The likelihood of an untiled mapping of the scanout is slim, except for
    gen3 with large desktops, and there it should never be in the CPU
    domain...
    
    The issue is that we may perform an operation "in place", yet incoherent
    with the display engine, and never flush the CPU cache, resulting in
    render corruption. In theory at least!
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index d97f559..4c4aa7c 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -3167,7 +3167,7 @@ void *kgem_bo_map(struct kgem *kgem, struct kgem_bo *bo)
 	assert(bo->exec == NULL);
 	assert(list_is_empty(&bo->list));
 
-	if (bo->tiling == I915_TILING_NONE &&
+	if (bo->tiling == I915_TILING_NONE && !bo->scanout &&
 	    (kgem->has_llc || bo->domain == DOMAIN_CPU)) {
 		DBG(("%s: converting request for GTT map into CPU map\n",
 		     __FUNCTION__));
@@ -3274,6 +3274,7 @@ void *kgem_bo_map__cpu(struct kgem *kgem, struct kgem_bo *bo)
 	DBG(("%s(handle=%d, size=%d)\n", __FUNCTION__, bo->handle, bytes(bo)));
 	assert(!bo->purged);
 	assert(list_is_empty(&bo->list));
+	assert(!bo->scanout);
 
 	if (IS_CPU_MAP(bo->map))
 		return MAP(bo->map);
commit fca776746134afb5abaa5245d3ed2b61f4620f69
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Apr 25 11:16:30 2012 +0100

    sna: Clear the domain tracking after attaching the bo to scanout
    
    This is basically to make sure we don't continue treating it as CPU
    coherent.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_display.c b/src/sna/sna_display.c
index cb051b0..5275d4a 100644
--- a/src/sna/sna_display.c
+++ b/src/sna/sna_display.c
@@ -436,6 +436,7 @@ sna_crtc_restore(struct sna *sna)
 
 	assert(bo->tiling != I915_TILING_Y);
 	bo->scanout = true;
+	bo->domain = DOMAIN_NONE;
 
 	DBG(("%s: create fb %dx%d@%d/%d\n",
 	     __FUNCTION__,
@@ -673,6 +674,7 @@ sna_crtc_set_mode_major(xf86CrtcPtr crtc, DisplayModePtr mode,
 		DBG(("%s: handle %d attached to fb %d\n",
 		     __FUNCTION__, bo->handle, sna_mode->fb_id));
 		bo->scanout = true;
+		bo->domain = DOMAIN_NONE;
 		sna_mode->fb_pixmap = sna->front->drawable.serialNumber;
 	}
 
@@ -787,6 +789,7 @@ sna_crtc_shadow_allocate(xf86CrtcPtr crtc, int width, int height)
 	DBG(("%s: attached handle %d to fb %d\n",
 	     __FUNCTION__, bo->handle, sna_crtc->shadow_fb_id));
 	bo->scanout = true;
+	bo->domain = DOMAIN_NONE;
 	return sna_crtc->shadow = shadow;
 }
 
@@ -1725,6 +1728,7 @@ sna_crtc_resize(ScrnInfoPtr scrn, int width, int height)
 		if (!sna_crtc_apply(crtc))
 			goto fail;
 	}
+	bo->domain = DOMAIN_NONE;
 
 	scrn->virtualX = width;
 	scrn->virtualY = height;
@@ -1869,6 +1873,7 @@ sna_page_flip(struct sna *sna,
 	DBG(("%s: page flipped %d crtcs\n", __FUNCTION__, count));
 	if (count) {
 		bo->scanout = true;
+		bo->domain = DOMAIN_NONE;
 	} else {
 		drmModeRmFB(sna->kgem.fd, mode->fb_id);
 		mode->fb_id = *old_fb;
commit eba0cd23f09cbb3cd0476698cef178acade12af8
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon Apr 23 11:09:37 2012 +0100

    sna/dri: Always clear the scanout when destroying dri2 buffers
    
    As we may end up holding onto and releasing the Screen pixmap last, we
    may also be responsible for flushing the last reference to the scanout.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_dri.c b/src/sna/sna_dri.c
index f4d55e0..1a7b6bd 100644
--- a/src/sna/sna_dri.c
+++ b/src/sna/sna_dri.c
@@ -333,6 +333,7 @@ static void _sna_dri_destroy_buffer(struct sna *sna, DRI2Buffer2Ptr buffer)
 		}
 
 		private->bo->flush = 0;
+		kgem_bo_clear_scanout(&sna->kgem, private->bo); /* paranoia */
 		kgem_bo_destroy(&sna->kgem, private->bo);
 
 		free(buffer);
@@ -388,6 +389,7 @@ static void set_bo(PixmapPtr pixmap, struct kgem_bo *bo)
 	sna_damage_destroy(&priv->cpu_damage);
 	priv->undamaged = false;
 
+	kgem_bo_clear_scanout(&sna->kgem, priv->gpu_bo); /* paranoia */
 	kgem_bo_destroy(&sna->kgem, priv->gpu_bo);
 	priv->gpu_bo = ref(bo);
 }
commit 45fce6802e9cf00a526bb772b03f5aab64c1648f
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Apr 20 17:15:37 2012 +0100

    sna: Remove the assertions that the cached upload buffers are active
    
    These were added to track down some corruption, but the assertions
    themselves are incorrect, just very rarely hit. The upload buffer may
    genuinely be cached if we abort the render operation after uploading the
    source data, leaving the proxy not coupled to any request.
    
    References: https://bugs.freedesktop.org/show_bug.cgi?id=48400
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 06bd2c0..382c671 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -1943,7 +1943,6 @@ sna_drawable_use_bo(DrawablePtr drawable,
 	}
 
 	if (priv->gpu_bo && priv->gpu_bo->proxy) {
-		assert(priv->gpu_bo->proxy->rq);
 		kgem_bo_destroy(&to_sna_from_pixmap(pixmap)->kgem,
 				priv->gpu_bo);
 		priv->gpu_bo = NULL;
@@ -2290,7 +2289,6 @@ sna_pixmap_move_to_gpu(PixmapPtr pixmap, unsigned flags)
 
 	if (flags & MOVE_WRITE && priv->gpu_bo && priv->gpu_bo->proxy) {
 		DBG(("%s: discarding cached upload buffer\n", __FUNCTION__));
-		assert(priv->gpu_bo->proxy->rq);
 		kgem_bo_destroy(&sna->kgem, priv->gpu_bo);
 		priv->gpu_bo = NULL;
 	}
@@ -2744,7 +2742,6 @@ sna_put_zpixmap_blt(DrawablePtr drawable, GCPtr gc, RegionPtr region,
 
 	if (priv->gpu_bo && priv->gpu_bo->proxy) {
 		DBG(("%s: discarding cached upload buffer\n", __FUNCTION__));
-		assert(priv->gpu_bo->proxy->rq);
 		kgem_bo_destroy(&sna->kgem, priv->gpu_bo);
 		priv->gpu_bo = NULL;
 	}
@@ -3540,7 +3537,6 @@ sna_copy_boxes(DrawablePtr src, DrawablePtr dst, GCPtr gc,
 
 	if (dst_priv->gpu_bo && dst_priv->gpu_bo->proxy) {
 		DBG(("%s: discarding cached upload\n", __FUNCTION__));
-		assert(dst_priv->gpu_bo->proxy->rq);
 		kgem_bo_destroy(&sna->kgem, dst_priv->gpu_bo);
 		dst_priv->gpu_bo = NULL;
 	}
@@ -8383,9 +8379,10 @@ sna_poly_arc(DrawablePtr drawable, GCPtr gc, int n, xArc *arc)
 	if (data.flags == 0)
 		return;
 
-	DBG(("%s: extents=(%d, %d), (%d, %d)\n", __FUNCTION__,
+	DBG(("%s: extents=(%d, %d), (%d, %d), flags=%x\n", __FUNCTION__,
 	     data.region.extents.x1, data.region.extents.y1,
-	     data.region.extents.x2, data.region.extents.y2));
+	     data.region.extents.x2, data.region.extents.y2,
+	     data.flags));
 
 	data.region.data = NULL;
 
commit 797e7e78754bb4513e3bd573f251369a1e3950f5
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Apr 20 13:21:40 2012 +0100

    sna: Always clear the mmapped domains when reusing partial upload buffers
    
    We need to make sure that we invalidate the caches appropriately on
    reuse. Mildly paranoid, but strictly required by the spec.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
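
    For context, a standalone sketch of the set-domain call that the new
    kgem_bo_sync__gtt() wraps; it blocks until the GPU is finished with
    the bo and invalidates any stale cached view (include paths assumed
    from libdrm, error handling elided):

        #include <string.h>
        #include <stdint.h>
        #include <xf86drm.h>
        #include <i915_drm.h>

        static int set_gtt_domain(int fd, uint32_t handle)
        {
                struct drm_i915_gem_set_domain set_domain;

                memset(&set_domain, 0, sizeof(set_domain));
                set_domain.handle = handle;
                set_domain.read_domains = I915_GEM_DOMAIN_GTT;
                set_domain.write_domain = I915_GEM_DOMAIN_GTT;

                /* waits for rendering, then marks the bo GTT-coherent */
                return drmIoctl(fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &set_domain);
        }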

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 72b6ad7..d97f559 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -3414,6 +3414,29 @@ void kgem_bo_sync__cpu(struct kgem *kgem, struct kgem_bo *bo)
 	}
 }
 
+void kgem_bo_sync__gtt(struct kgem *kgem, struct kgem_bo *bo)
+{
+	assert(bo->proxy == NULL);
+	kgem_bo_submit(kgem, bo);
+
+	if (bo->domain != DOMAIN_GTT) {
+		struct drm_i915_gem_set_domain set_domain;
+
+		DBG(("%s: sync: needs_flush? %d, domain? %d, busy? %d\n", __FUNCTION__,
+		     bo->needs_flush, bo->domain, kgem_busy(kgem, bo->handle)));
+
+		VG_CLEAR(set_domain);
+		set_domain.handle = bo->handle;
+		set_domain.read_domains = I915_GEM_DOMAIN_GTT;
+		set_domain.write_domain = I915_GEM_DOMAIN_GTT;
+
+		if (drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &set_domain) == 0) {
+			kgem_bo_retire(kgem, bo);
+			bo->domain = DOMAIN_GTT;
+		}
+	}
+}
+
 void kgem_bo_set_sync(struct kgem *kgem, struct kgem_bo *bo)
 {
 	assert(!bo->reusable);
@@ -3424,7 +3447,6 @@ void kgem_bo_set_sync(struct kgem *kgem, struct kgem_bo *bo)
 
 void kgem_sync(struct kgem *kgem)
 {
-	struct drm_i915_gem_set_domain set_domain;
 	struct kgem_request *rq;
 	struct kgem_bo *bo;
 
@@ -3437,14 +3459,7 @@ void kgem_sync(struct kgem *kgem)
 	if (rq == kgem->next_request)
 		_kgem_submit(kgem);
 
-	VG_CLEAR(set_domain);
-	set_domain.handle = rq->bo->handle;
-	set_domain.read_domains = I915_GEM_DOMAIN_GTT;
-	set_domain.write_domain = I915_GEM_DOMAIN_GTT;
-
-	drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &set_domain);
-	kgem_retire(kgem);
-
+	kgem_bo_sync__gtt(kgem, rq->bo);
 	list_for_each_entry(bo, &kgem->sync_list, list)
 		kgem_bo_sync__cpu(kgem, bo);
 
@@ -3599,8 +3614,12 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 			bo->used = size;
 			list_move(&bo->base.list, &kgem->active_partials);
 
-			if (bo->base.vmap)
-				kgem_bo_sync__cpu(kgem, &bo->base);
+			if (bo->mmapped) {
+				if (IS_CPU_MAP(bo->base.map))
+					kgem_bo_sync__cpu(kgem, &bo->base);
+				else
+					kgem_bo_sync__gtt(kgem, &bo->base);
+			}
 
 			goto done;
 		} while (kgem_retire(kgem));
diff --git a/src/sna/kgem.h b/src/sna/kgem.h
index 913e1a9..1235b83 100644
--- a/src/sna/kgem.h
+++ b/src/sna/kgem.h
@@ -365,6 +365,7 @@ uint32_t kgem_add_reloc(struct kgem *kgem,
 
 void *kgem_bo_map(struct kgem *kgem, struct kgem_bo *bo);
 void *kgem_bo_map__gtt(struct kgem *kgem, struct kgem_bo *bo);
+void kgem_bo_sync__gtt(struct kgem *kgem, struct kgem_bo *bo);
 void *kgem_bo_map__debug(struct kgem *kgem, struct kgem_bo *bo);
 void *kgem_bo_map__cpu(struct kgem *kgem, struct kgem_bo *bo);
 void kgem_bo_sync__cpu(struct kgem *kgem, struct kgem_bo *bo);
commit ad8f17def56ab61d83e4c20fa0fb3014fa1c661c
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Apr 19 10:34:23 2012 +0100

    sna: Discard proxy upload buffer if we choose to render to it
    
    Even if we try to avoid treating an upload buffer as a real GPU target,
    we may still choose to migrate the buffer to the GPU in order to keep
    other buffers on the GPU. In that case, we do want to create a real GPU
    bo.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 1c43fb7..06bd2c0 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -2288,6 +2288,13 @@ sna_pixmap_move_to_gpu(PixmapPtr pixmap, unsigned flags)
 		goto active;
 	}
 
+	if (flags & MOVE_WRITE && priv->gpu_bo && priv->gpu_bo->proxy) {
+		DBG(("%s: discarding cached upload buffer\n", __FUNCTION__));
+		assert(priv->gpu_bo->proxy->rq);
+		kgem_bo_destroy(&sna->kgem, priv->gpu_bo);
+		priv->gpu_bo = NULL;
+	}
+
 	if ((flags & MOVE_READ) == 0)
 		sna_damage_destroy(&priv->cpu_damage);
 
commit b9020dde2f4a162ce514973a67f7433bd608f356
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Apr 19 09:09:32 2012 +0100

    sna: Don't consider upload proxies as being on the GPU for render targets
    
    The upload proxy is a fake buffer that we do not want to render to, as
    then the damage tracking becomes extremely confused and the buffer
    itself is not optimised for persistent rendering. We assert that we do not
    use it as a render target, and this patch adds the check so that we
    avoid treating the proxy as a valid target when choosing the render
    path.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_render_inline.h b/src/sna/sna_render_inline.h
index 03e1969..b9acea1 100644
--- a/src/sna/sna_render_inline.h
+++ b/src/sna/sna_render_inline.h
@@ -75,7 +75,7 @@ is_gpu(DrawablePtr drawable)
 	if (priv == NULL || priv->clear)
 		return false;
 
-	if (priv->gpu_damage || (priv->gpu_bo && kgem_bo_is_busy(priv->gpu_bo)))
+	if (priv->gpu_damage || (priv->gpu_bo && kgem_bo_is_busy(priv->gpu_bo) && !priv->gpu_bo->proxy))
 		return true;
 
 	return priv->cpu_bo && kgem_bo_is_busy(priv->cpu_bo);
commit 74982b6e793490728951c7167fbe4a806dcc20d7
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Apr 18 11:39:43 2012 +0100

    sna: Increase the render target alignment to 4 pixels on gen4+ as well
    
    Reported-and-tested-by: Toralf Förster <toralf.foerster at gmx.de>
    Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=48865
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 3d722d0..72b6ad7 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -838,13 +838,8 @@ static uint32_t kgem_surface_size(struct kgem *kgem,
 	} else switch (tiling) {
 	default:
 	case I915_TILING_NONE:
-		if (kgem->gen < 40) {
-			tile_width = scanout ? 64 : 4 * bpp >> 3;
-			tile_height = 4;
-		} else {
-			tile_width = scanout ? 64 : 2 * bpp >> 3;
-			tile_height = 2;
-		}
+		tile_width = scanout ? 64 : 4 * bpp >> 3;
+		tile_height = 2;
 		break;
 	case I915_TILING_X:
 		tile_width = 512;
@@ -898,7 +893,7 @@ static uint32_t kgem_aligned_height(struct kgem *kgem,
 	} else switch (tiling) {
 	default:
 	case I915_TILING_NONE:
-		tile_height = kgem->gen < 40 ? 4 : 2;
+		tile_height = 2;
 		break;
 	case I915_TILING_X:
 		tile_height = 8;
@@ -2881,7 +2876,7 @@ struct kgem_bo *kgem_create_cpu_2d(struct kgem *kgem,
 
 		stride = ALIGN(width, 2) * bpp >> 3;
 		stride = ALIGN(stride, 4);
-		size = ALIGN(height, kgem->gen < 40 ? 4 : 2) * stride;
+		size = ALIGN(height, 2) * stride;
 
 		assert(size >= PAGE_SIZE);
 
commit cedf695c130cb10561d3193c380e2cfcc23d78f4
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Apr 17 20:08:14 2012 +0100

    sna/dri: Decouple the frame event info after attaching along error paths
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
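
    The underlying rule: before the frame-event info has been linked into
    the event list (and has taken buffer references), a plain free()
    suffices; after that point every error path must unwind through the
    full teardown helper. A minimal sketch of the idiom, with hypothetical
    names:

        #include <stdbool.h>
        #include <stdlib.h>

        struct info { int linked; };

        static bool hook_up(struct info *i)    { i->linked = 1; return true; }
        static bool risky_step(struct info *i) { (void)i; return false; }
        static void teardown(struct info *i)   { /* unlink, drop refs */ free(i); }

        static bool schedule(void)
        {
                struct info *info = calloc(1, sizeof(*info));
                if (!info)
                        return false;

                if (!hook_up(info)) {
                        free(info);     /* not linked yet: plain free is safe */
                        return false;
                }
                if (!risky_step(info)) {
                        teardown(info); /* linked: must use the full helper */
                        return false;
                }
                return true;
        }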

diff --git a/src/sna/sna_dri.c b/src/sna/sna_dri.c
index afec831..f4d55e0 100644
--- a/src/sna/sna_dri.c
+++ b/src/sna/sna_dri.c
@@ -1372,15 +1372,15 @@ sna_dri_schedule_flip(ClientPtr client, DrawablePtr draw, DRI2BufferPtr front,
 			return FALSE;
 		}
 
+		sna_dri_reference_buffer(front);
+		sna_dri_reference_buffer(back);
+
 		if (!sna_dri_page_flip(sna, info)) {
 			DBG(("%s: failed to queue page flip\n", __FUNCTION__));
-			free(info);
+			sna_dri_frame_event_info_free(info);
 			return FALSE;
 		}
 
-		sna_dri_reference_buffer(front);
-		sna_dri_reference_buffer(back);
-
 		get_private(info->back)->bo =
 			kgem_create_2d(&sna->kgem,
 				       draw->width,
@@ -1426,7 +1426,7 @@ sna_dri_schedule_flip(ClientPtr client, DrawablePtr draw, DRI2BufferPtr front,
 			vbl.request.type |= DRM_VBLANK_SECONDARY;
 		vbl.request.sequence = 0;
 		if (drmWaitVBlank(sna->kgem.fd, &vbl)) {
-			free(info);
+			sna_dri_frame_event_info_free(info);
 			return FALSE;
 		}
 
@@ -1482,7 +1482,7 @@ sna_dri_schedule_flip(ClientPtr client, DrawablePtr draw, DRI2BufferPtr front,
 		vbl.request.sequence -= 1;
 		vbl.request.signal = (unsigned long)info;
 		if (drmWaitVBlank(sna->kgem.fd, &vbl)) {
-			free(info);
+			sna_dri_frame_event_info_free(info);
 			return FALSE;
 		}
 
@@ -1610,9 +1610,8 @@ sna_dri_schedule_swap(ClientPtr client, DrawablePtr draw, DRI2BufferPtr front,
 	if (pipe > 0)
 		vbl.request.type |= DRM_VBLANK_SECONDARY;
 	vbl.request.sequence = 0;
-	if (drmWaitVBlank(sna->kgem.fd, &vbl)) {
+	if (drmWaitVBlank(sna->kgem.fd, &vbl))
 		goto blit_fallback;
-	}
 
 	current_msc = vbl.reply.sequence;
 
@@ -1677,9 +1676,8 @@ sna_dri_schedule_swap(ClientPtr client, DrawablePtr draw, DRI2BufferPtr front,
 		vbl.request.sequence += divisor;
 
 	vbl.request.signal = (unsigned long)info;
-	if (drmWaitVBlank(sna->kgem.fd, &vbl)) {
+	if (drmWaitVBlank(sna->kgem.fd, &vbl))
 		goto blit_fallback;
-	}
 
 	*target_msc = vbl.reply.sequence;
 	info->frame = *target_msc;
@@ -1762,12 +1760,11 @@ blit:
 		if (!sna_dri_add_frame_event(info)) {
 			DBG(("%s: failed to hook up frame event\n", __FUNCTION__));
 			free(info);
-			info = NULL;
 			goto blit;
 		}
 
 		if (!sna_dri_page_flip(sna, info)) {
-			free(info);
+			sna_dri_frame_event_info_free(info);
 			goto blit;
 		}
 
@@ -1935,7 +1932,6 @@ sna_dri_schedule_wait_msc(ClientPtr client, DrawablePtr draw, CARD64 target_msc,
 	if (!sna_dri_add_frame_event(info)) {
 		DBG(("%s: failed to hook up frame event\n", __FUNCTION__));
 		free(info);
-		info = NULL;
 		goto out_complete;
 	}
 
@@ -1959,7 +1955,7 @@ sna_dri_schedule_wait_msc(ClientPtr client, DrawablePtr draw, CARD64 target_msc,
 					   strerror(errno));
 				limit--;
 			}
-			goto out_complete;
+			goto out_free_info;
 		}
 
 		info->frame = vbl.reply.sequence;
@@ -1996,15 +1992,16 @@ sna_dri_schedule_wait_msc(ClientPtr client, DrawablePtr draw, CARD64 target_msc,
 				   strerror(errno));
 			limit--;
 		}
-		goto out_complete;
+		goto out_free_info;
 	}
 
 	info->frame = vbl.reply.sequence;
 	DRI2BlockClient(client, draw);
 	return TRUE;
 
+out_free_info:
+	sna_dri_frame_event_info_free(info);
 out_complete:
-	free(info);
 	DRI2WaitMSCComplete(client, draw, target_msc, 0, 0);
 	return TRUE;
 }
commit 46e994a5334d91ef4c7f62b6b0acdbbc17144fc5
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Apr 17 17:54:58 2012 +0100

    Don't issue a scanline wait while VT switched
    
    Be paranoid and check that we own the VT before emitting a scanline
    wait. If we attempt to wait on a fb/pipe that we do not own, we may
    issue an illegal command and cause a lockup.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/intel_dri.c b/src/intel_dri.c
index c4e029d..1745aef 100644
--- a/src/intel_dri.c
+++ b/src/intel_dri.c
@@ -558,7 +558,8 @@ I830DRI2CopyRegion(DrawablePtr drawable, RegionPtr pRegion,
 	ValidateGC(dst, gc);
 
 	/* Wait for the scanline to be outside the region to be copied */
-	if (pixmap_is_scanout(get_drawable_pixmap(dst)) &&
+	if (scrn->vtSema &&
+	    pixmap_is_scanout(get_drawable_pixmap(dst)) &&
 	    intel->swapbuffers_wait && INTEL_INFO(intel)->gen < 60) {
 		BoxPtr box;
 		BoxRec crtcbox;
diff --git a/src/intel_video.c b/src/intel_video.c
index 25f6483..0834bb2 100644
--- a/src/intel_video.c
+++ b/src/intel_video.c
@@ -1285,7 +1285,7 @@ intel_wait_for_scanline(ScrnInfoPtr scrn, PixmapPtr pixmap,
 	int y1, y2;
 
 	pipe = -1;
-	if (pixmap_is_scanout(pixmap))
+	if (scrn->vtSema && pixmap_is_scanout(pixmap))
 		pipe = intel_crtc_to_pipe(crtc);
 	if (pipe < 0)
 		return;
diff --git a/src/sna/sna_display.c b/src/sna/sna_display.c
index ef3b0f9..cb051b0 100644
--- a/src/sna/sna_display.c
+++ b/src/sna/sna_display.c
@@ -2018,6 +2018,10 @@ sna_covering_crtc(ScrnInfoPtr scrn,
 	int best_coverage, c;
 	BoxRec best_crtc_box;
 
+	/* If we do not own the VT, we do not own the CRTC either */
+	if (!scrn->vtSema)
+		return NULL;
+
 	DBG(("%s for box=(%d, %d), (%d, %d)\n",
 	     __FUNCTION__, box->x1, box->y1, box->x2, box->y2));
 
commit 4bdd362ab75d0649ba294d46dc866c426619387f
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon Apr 16 20:57:03 2012 +0100

    sna: Don't assert exported buffers are not busy
    
    As we do not fully control these buffers, we cannot truly say when they
    are idle, we can only trust that the split between us and the compositor
    doesn't lead to much corruption.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 9b080f1..3d722d0 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -312,7 +312,7 @@ static void kgem_bo_retire(struct kgem *kgem, struct kgem_bo *bo)
 {
 	DBG(("%s: handle=%d, domain=%d\n",
 	     __FUNCTION__, bo->handle, bo->domain));
-	assert(!kgem_busy(kgem, bo->handle));
+	assert(bo->flush || !kgem_busy(kgem, bo->handle));
 
 	if (bo->rq)
 		kgem_retire(kgem);
@@ -332,7 +332,7 @@ Bool kgem_bo_write(struct kgem *kgem, struct kgem_bo *bo,
 {
 	assert(bo->refcnt);
 	assert(!bo->purged);
-	assert(!kgem_busy(kgem, bo->handle));
+	assert(bo->flush || !kgem_busy(kgem, bo->handle));
 	assert(bo->proxy == NULL);
 
 	assert(length <= bytes(bo));
@@ -1033,13 +1033,13 @@ static void kgem_bo_free(struct kgem *kgem, struct kgem_bo *bo)
 inline static void kgem_bo_move_to_inactive(struct kgem *kgem,
 					    struct kgem_bo *bo)
 {
+	assert(bo->reusable);
+	assert(bo->rq == NULL);
+	assert(bo->domain != DOMAIN_GPU);
 	assert(!kgem_busy(kgem, bo->handle));
 	assert(!bo->proxy);
 	assert(!bo->io);
 	assert(!bo->needs_flush);
-	assert(bo->rq == NULL);
-	assert(bo->domain != DOMAIN_GPU);
-	assert(bo->reusable);
 	assert(list_is_empty(&bo->vma));
 
 	if (bucket(bo) >= NUM_CACHE_BUCKETS) {
@@ -1098,6 +1098,7 @@ static void __kgem_bo_destroy(struct kgem *kgem, struct kgem_bo *bo)
 		goto destroy;
 
 	if (bo->vmap) {
+		assert(!bo->flush);
 		DBG(("%s: handle=%d is vmapped, tracking until free\n",
 		     __FUNCTION__, bo->handle));
 		if (bo->rq == NULL) {
@@ -2279,6 +2280,7 @@ struct kgem_bo *kgem_create_for_name(struct kgem *kgem, uint32_t name)
 	}
 
 	bo->reusable = false;
+	bo->flush = true;
 	return bo;
 }
 
@@ -2807,8 +2809,7 @@ search_inactive:
 		assert(bo->refcnt == 0);
 		assert(bo->reusable);
 		assert((flags & CREATE_INACTIVE) == 0 || bo->domain != DOMAIN_GPU);
-		assert((flags & CREATE_INACTIVE) == 0 ||
-		       !kgem_busy(kgem, bo->handle));
+		assert((flags & CREATE_INACTIVE) == 0 || !kgem_busy(kgem, bo->handle));
 		assert(bo->pitch*kgem_aligned_height(kgem, height, bo->tiling) <= kgem_bo_size(bo));
 		bo->refcnt = 1;
 		return bo;
@@ -3206,8 +3207,8 @@ void *kgem_bo_map(struct kgem *kgem, struct kgem_bo *bo)
 	if (bo->domain != DOMAIN_GTT) {
 		struct drm_i915_gem_set_domain set_domain;
 
-		DBG(("%s: sync: needs_flush? %d, domain? %d\n", __FUNCTION__,
-		     bo->needs_flush, bo->domain));
+		DBG(("%s: sync: needs_flush? %d, domain? %d, busy? %d\n", __FUNCTION__,
+		     bo->needs_flush, bo->domain, kgem_busy(kgem, bo->handle)));
 
 		/* XXX use PROT_READ to avoid the write flush? */
 
commit f783ec383298ece9dcb8711954b509f0f170a8eb
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sat Apr 14 19:03:25 2012 +0100

    uxa: Fix leak of glyph mask for unhandled glyph composition
    
    ==1401== 7,344 bytes in 34 blocks are possibly lost in loss record 570 of 587
    ==1401==    at 0x4027034: calloc (in /usr/lib/valgrind/vgpreload_memcheck-amd64-linux.so)
    ==1401==    by 0x8BE5150: drm_intel_gem_bo_alloc_internal (intel_bufmgr_gem.c:689)
    ==1401==    by 0x899FC04: intel_uxa_create_pixmap (intel_uxa.c:1077)
    ==1401==    by 0x89C2C41: uxa_glyphs (uxa-glyphs.c:254)
    ==1401==    by 0x21F05E: damageGlyphs (damage.c:647)
    ==1401==    by 0x218E06: ProcRenderCompositeGlyphs (render.c:1434)
    ==1401==    by 0x15AA40: Dispatch (dispatch.c:439)
    ==1401==    by 0x1499E9: main (main.c:287)
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/uxa/uxa-glyphs.c b/uxa/uxa-glyphs.c
index b754f4e..921b99c 100644
--- a/uxa/uxa-glyphs.c
+++ b/uxa/uxa-glyphs.c
@@ -812,8 +812,10 @@ uxa_glyphs_via_mask(CARD8 op,
 				if (!uxa_pixmap_is_offscreen(src_pixmap) ||
 				    !uxa_screen->info->prepare_composite(PictOpAdd,
 									 this_atlas, NULL, mask,
-									 src_pixmap, NULL, pixmap))
+									 src_pixmap, NULL, pixmap)) {
+					FreePicture(mask, 0);
 					return -1;
+				}
 
 				glyph_atlas = this_atlas;
 			}
commit e9a39f7fe69ad5e39788bae83713587d517e609d
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sat Apr 14 18:42:23 2012 +0100

    sna: Avoid leaking the plane resources when determining sprite planes
    
    Fixes the tiny, one-off leak:
    
    ==1407== 8 bytes in 1 blocks are definitely lost in loss record 48 of 527
    ==1407==    at 0x402894D: malloc (in /usr/lib/valgrind/vgpreload_memcheck-amd64-linux.so)
    ==1407==    by 0x8580BE8: drmMalloc (xf86drm.c:147)
    ==1407==    by 0x8583D54: drmAllocCpy (xf86drmMode.c:73)
    ==1407==    by 0x8585265: drmModeGetPlaneResources (xf86drmMode.c:955)
    ==1407==    by 0x8A1BCE9: sna_video_sprite_setup (sna_video_sprite.c:367)
    ==1407==    by 0x8A1A0A3: sna_video_init (sna_video.c:523)
    ==1407==    by 0x89FD4E0: sna_screen_init (sna_driver.c:935)
    ==1407==    by 0x15AD80: AddScreen (dispatch.c:3909)
    ==1407==    by 0x19A2DB: InitOutput (xf86Init.c:817)
    ==1407==    by 0x14981C: main (main.c:204)
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
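
    The fix sidesteps drmModeGetPlaneResources() entirely: issuing the
    ioctl with a zeroed struct asks the kernel to fill in only the counts,
    so nothing is allocated and nothing can leak. A minimal standalone
    sketch of that probe (include paths assumed from libdrm):

        #include <string.h>
        #include <stdint.h>
        #include <xf86drm.h>
        #include <drm_mode.h>

        static int probe_plane_count(int fd, uint32_t *count)
        {
                struct drm_mode_get_plane_res r;

                memset(&r, 0, sizeof(r));  /* no array pointers supplied... */
                if (drmIoctl(fd, DRM_IOCTL_MODE_GETPLANERESOURCES, &r))
                        return -1;

                *count = r.count_planes;   /* ...so only the count is returned */
                return 0;
        }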

diff --git a/src/sna/sna_video_sprite.c b/src/sna/sna_video_sprite.c
index 82db122..0e5f3ab 100644
--- a/src/sna/sna_video_sprite.c
+++ b/src/sna/sna_video_sprite.c
@@ -361,21 +361,21 @@ XF86VideoAdaptorPtr sna_video_sprite_setup(struct sna *sna,
 					   ScreenPtr screen)
 {
 	XF86VideoAdaptorPtr adaptor;
+	struct drm_mode_get_plane_res r;
 	struct sna_video *video;
-	drmModePlaneRes *plane_resources;
 
-	plane_resources = drmModeGetPlaneResources(sna->kgem.fd);
-	if (!plane_resources)
+	memset(&r, 0, sizeof(struct drm_mode_get_plane_res));
+	if (drmIoctl(sna->kgem.fd, DRM_IOCTL_MODE_GETPLANERESOURCES, &r))
+		return NULL;
+	if (r.count_planes == 0)
 		return NULL;
 
 	adaptor = calloc(1,
 			 sizeof(XF86VideoAdaptorRec) +
 			 sizeof(struct sna_video) +
 			 sizeof(DevUnion));
-	if (!adaptor) {
-		free(plane_resources);
+	if (!adaptor)
 		return NULL;
-	}
 
 	adaptor->type = XvWindowMask | XvInputMask | XvImageMask;
 	adaptor->flags = VIDEO_OVERLAID_IMAGES /*| VIDEO_CLIP_TO_VIEWPORT */ ;
@@ -428,7 +428,5 @@ XF86VideoAdaptorPtr sna_video_sprite_setup(struct sna *sna,
 
 	xvColorKey = MAKE_ATOM("XV_COLORKEY");
 
-	free(plane_resources);
-
 	return adaptor;
 }
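
Worth spelling out why the raw ioctl fixes the leak: with a zeroed request
structure the kernel only reports the plane count and copies no IDs back, so
nothing is allocated on the caller's behalf and the early-return paths have
nothing to free. A minimal sketch of the probe (assuming only libdrm's
drmIoctl wrapper):

    struct drm_mode_get_plane_res r;

    memset(&r, 0, sizeof(r));   /* zero counts: kernel fills counts only */
    if (drmIoctl(fd, DRM_IOCTL_MODE_GETPLANERESOURCES, &r))
        return NULL;            /* query failed; nothing to clean up */
    if (r.count_planes == 0)
        return NULL;            /* no sprite planes; nothing to clean up */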
commit ac33a3e0d977c6c3bc9c4724b34f67c71963cd8c
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sat Apr 14 12:06:51 2012 +0100

    sna: Align texture subsurfaces to 2x2 texture samples
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_render.c b/src/sna/sna_render.c
index 66e078a..ec2aaad 100644
--- a/src/sna/sna_render.c
+++ b/src/sna/sna_render.c
@@ -839,8 +839,15 @@ sna_render_pixmap_partial(struct sna *sna,
 		box.x2 = ALIGN(box.x2, tile_width * 8 / pixmap->drawable.bitsPerPixel);
 
 		offset = box.x1 * pixmap->drawable.bitsPerPixel / 8 / tile_width * tile_size;
-	} else
+	} else {
+		box.y1 = box.y1 & ~1;
+		box.y2 = ALIGN(box.y2, 2);
+
+		box.x1 = box.x1 & ~1;
+		box.x2 = ALIGN(box.x2, 2);
+
 		offset = box.x1 * pixmap->drawable.bitsPerPixel / 8;
+	}
 
 	if (box.x2 > pixmap->drawable.width)
 		box.x2 = pixmap->drawable.width;
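
The rounding idiom used in both branches, spelled out (a sketch; ALIGN is
presumed to be the driver's usual power-of-two round-up macro):

    #define ALIGN(v, a) (((v) + (a) - 1) & ~((a) - 1))

    box.x1 &= ~1;               /* round the left/top edge down */
    box.x2 = ALIGN(box.x2, 2);  /* round the right/bottom edge up */

so every sampled subsurface starts and ends on a 2x2 boundary; the 4x4
variant in the next commit uses ~3 and ALIGN(, 4) in the same way.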
commit 7268f69350f5c060c334d7f8d6aed0808d59b5a7
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sat Apr 14 12:04:23 2012 +0100

    sna: Align redirect subsurfaces to 2x2 or 4x4 render spans
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_render.c b/src/sna/sna_render.c
index 8af80f2..66e078a 100644
--- a/src/sna/sna_render.c
+++ b/src/sna/sna_render.c
@@ -1767,8 +1767,23 @@ sna_render_composite_redirect(struct sna *sna,
 			box.x2 = ALIGN(box.x2, tile_width * 8 / op->dst.pixmap->drawable.bitsPerPixel);
 
 			offset = box.x1 * op->dst.pixmap->drawable.bitsPerPixel / 8 / tile_width * tile_size;
-		} else
+		} else {
+			if (sna->kgem.gen < 40) {
+				box.y1 = box.y1 & ~3;
+				box.y2 = ALIGN(box.y2, 4);
+
+				box.x1 = box.x1 & ~3;
+				box.x2 = ALIGN(box.x2, 4);
+			} else {
+				box.y1 = box.y1 & ~1;
+				box.y2 = ALIGN(box.y2, 2);
+
+				box.x1 = box.x1 & ~1;
+				box.x2 = ALIGN(box.x2, 2);
+			}
+
 			offset = box.x1 * op->dst.pixmap->drawable.bitsPerPixel / 8;
+		}
 
 		if (box.y2 > op->dst.pixmap->drawable.height)
 			box.y2 = op->dst.pixmap->drawable.height;
commit c6ac5620ad158b3464f33003b678126d11b43603
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sat Apr 14 11:59:31 2012 +0100

    sna: Align render target sizes on gen2/3 to 4x4 render spans
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index f1b0376..9b080f1 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -686,12 +686,6 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, int gen)
 	DBG(("%s: partial buffer size=%d [%d KiB]\n", __FUNCTION__,
 	     kgem->partial_buffer_size, kgem->partial_buffer_size / 1024));
 
-	kgem->min_alignment = 4;
-	if (gen < 60)
-		/* XXX workaround an issue where we appear to fail to
-		 * disable dual-stream mode */
-		kgem->min_alignment = 64;
-
 	kgem->max_object_size = 2 * aperture.aper_size / 3;
 	kgem->max_gpu_size = kgem->max_object_size;
 	if (!kgem->has_llc)
@@ -776,8 +770,8 @@ static uint32_t kgem_untiled_pitch(struct kgem *kgem,
 				   uint32_t width, uint32_t bpp,
 				   bool scanout)
 {
-	width = width * bpp >> 3;
-	return ALIGN(width, scanout ? 64 : kgem->min_alignment);
+	width = ALIGN(width, 4) * bpp >> 3;
+	return ALIGN(width, scanout ? 64 : 4);
 }
 
 void kgem_get_tile_size(struct kgem *kgem, int tiling,
@@ -838,14 +832,19 @@ static uint32_t kgem_surface_size(struct kgem *kgem,
 			tile_width = 512;
 			tile_height = kgem->gen < 30 ? 16 : 8;
 		} else {
-			tile_width = scanout ? 64 : kgem->min_alignment;
-			tile_height = 2;
+			tile_width = scanout ? 64 : 4 * bpp >> 3;
+			tile_height = 4;
 		}
 	} else switch (tiling) {
 	default:
 	case I915_TILING_NONE:
-		tile_width = scanout ? 64 : kgem->min_alignment;
-		tile_height = 2;
+		if (kgem->gen < 40) {
+			tile_width = scanout ? 64 : 4 * bpp >> 3;
+			tile_height = 4;
+		} else {
+			tile_width = scanout ? 64 : 2 * bpp >> 3;
+			tile_height = 2;
+		}
 		break;
 	case I915_TILING_X:
 		tile_width = 512;
@@ -899,7 +898,7 @@ static uint32_t kgem_aligned_height(struct kgem *kgem,
 	} else switch (tiling) {
 	default:
 	case I915_TILING_NONE:
-		tile_height = 2;
+		tile_height = kgem->gen < 40 ? 4 : 2;
 		break;
 	case I915_TILING_X:
 		tile_height = 8;
@@ -2881,7 +2880,7 @@ struct kgem_bo *kgem_create_cpu_2d(struct kgem *kgem,
 
 		stride = ALIGN(width, 2) * bpp >> 3;
 		stride = ALIGN(stride, 4);
-		size = ALIGN(height, 2) * stride;
+		size = ALIGN(height, kgem->gen < 40 ? 4 : 2) * stride;
 
 		assert(size >= PAGE_SIZE);
 
diff --git a/src/sna/kgem.h b/src/sna/kgem.h
index e52645c..913e1a9 100644
--- a/src/sna/kgem.h
+++ b/src/sna/kgem.h
@@ -161,7 +161,6 @@ struct kgem {
 	uint16_t half_cpu_cache_pages;
 	uint32_t aperture_total, aperture_high, aperture_low, aperture_mappable;
 	uint32_t aperture, aperture_fenced;
-	uint32_t min_alignment;
 	uint32_t max_upload_tile_size, max_copy_tile_size;
 	uint32_t max_gpu_size, max_cpu_size;
 	uint32_t large_object_size, max_object_size;
commit 1f96f42d1f3dc8e0828f05bf07fadecd57827e6a
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Apr 13 16:37:43 2012 +0100

    sna: Avoid using TILING_Y for large objects on gen2/3
    
    References: https://bugs.freedesktop.org/show_bug.cgi?id=48636
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 8fcdd14..1c43fb7 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -458,6 +458,16 @@ static inline uint32_t default_tiling(PixmapPtr pixmap)
 		     __FUNCTION__));
 		sna_damage_destroy(&priv->gpu_damage);
 		priv->undamaged = false;
+
+		/* Only on later generations was the render pipeline
+		 * more flexible than the BLT. So on gen2/3, prefer to
+		 * keep large objects accessible through the BLT.
+		 */
+		if (sna->kgem.gen < 40 &&
+		    (pixmap->drawable.width > sna->render.max_3d_size ||
+		     pixmap->drawable.height > sna->render.max_3d_size))
+			return I915_TILING_X;
+
 		return I915_TILING_Y;
 	}
 
commit 26408ad7ca5d111a56a2cf40d9a8e29043eb6892
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Apr 13 15:12:36 2012 +0100

    sna: Relax bogus assertion
    
    The bo may be considered unmappable because it is currently bound outside
    the mappable region, which is precisely the condition we are attempting to
    rectify by mapping it into the GTT domain.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 2d09666..f1b0376 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -3186,7 +3186,7 @@ void *kgem_bo_map(struct kgem *kgem, struct kgem_bo *bo)
 
 	ptr = bo->map;
 	if (ptr == NULL) {
-		assert(kgem_bo_is_mappable(kgem, bo));
+		assert(kgem_bo_size(bo) <= kgem->aperture_mappable / 2);
 
 		kgem_trim_vma_cache(kgem, MAP_GTT, bucket(bo));
 
commit 4c166bc6aab0d6bb33927fc2e7ebd7baff283f2f
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Apr 13 15:07:13 2012 +0100

    sna: Limit the buffer reuse for mappable uploads to only those with mmaps
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index f0c971e..2d09666 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -2156,6 +2156,9 @@ search_linear_cache(struct kgem *kgem, unsigned int num_pages, unsigned flags)
 			//assert(!kgem_busy(kgem, bo->handle));
 			return bo;
 		}
+
+		if (flags & CREATE_EXACT)
+			return NULL;
 	}
 
 	cache = use_active ? active(kgem, num_pages, I915_TILING_NONE) : inactive(kgem, num_pages);
@@ -3683,7 +3686,7 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 		 * devices like gen2 or with relatively slow gpu like i3.
 		 */
 		old = search_linear_cache(kgem, alloc,
-					  CREATE_INACTIVE | CREATE_GTT_MAP);
+					  CREATE_EXACT | CREATE_INACTIVE | CREATE_GTT_MAP);
 #if HAVE_I915_GEM_BUFFER_INFO
 		if (old) {
 			struct drm_i915_gem_buffer_info info;
@@ -3705,7 +3708,7 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 #endif
 		if (old == NULL)
 			old = search_linear_cache(kgem, NUM_PAGES(size),
-						  CREATE_INACTIVE | CREATE_GTT_MAP);
+						  CREATE_EXACT | CREATE_INACTIVE | CREATE_GTT_MAP);
 		if (old) {
 			DBG(("%s: reusing handle=%d for buffer\n",
 			     __FUNCTION__, old->handle));
commit c6ac70a89a72f004309a67708bdd6da0cf2650c2
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Apr 13 13:51:57 2012 +0100

    sna: Remove the conflicting assertion during GTT map
    
    Reported-by: Clemens Eisserer <linuxhippy at gmail.com>
    References: https://bugs.freedesktop.org/show_bug.cgi?id=48636
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 14a0067..f0c971e 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -3183,7 +3183,7 @@ void *kgem_bo_map(struct kgem *kgem, struct kgem_bo *bo)
 
 	ptr = bo->map;
 	if (ptr == NULL) {
-		assert(bytes(bo) <= kgem->aperture_mappable / 4);
+		assert(kgem_bo_is_mappable(kgem, bo));
 
 		kgem_trim_vma_cache(kgem, MAP_GTT, bucket(bo));
 
commit 405a9fa547d49b4bb699b0ba4fc61698f131beae
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Apr 13 09:56:39 2012 +0100

    sna: Don't use miSpan code for wide-spans by default, too expensive
    
    Only use the fall-forward miSpans code when it prevents a readback.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index b3507ad..8fcdd14 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -60,7 +60,7 @@
 #define FORCE_FLUSH 0
 
 #define USE_INPLACE 1
-#define USE_WIDE_SPANS 1 /* -1 force CPU, 1 force GPU */
+#define USE_WIDE_SPANS 0 /* -1 force CPU, 1 force GPU */
 #define USE_ZERO_SPANS 1 /* -1 force CPU, 1 force GPU */
 #define USE_SHM_VMAP 0
 #define PREFER_VMAP 0
commit 21a759334a6dcacf7d906409beadb1053dc64a85
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Apr 13 09:38:57 2012 +0100

    sna: Draw dashed PolyLines twice, once for the fgPixel, once for the bgPixel
    
    As the fast paths only setup state upfront, we were missing the state
    changes required between dash-on/off. Take advantage of that each pixel
    is only drawn once to batch the state changes and run the
    miZeroDashLines twice.
    
    A future task would be to use a custom line drawing routine...
    
    Fixes regression from ec1267df746512c2e262ef0bd9e9527bc5efe6f4.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 590cc11..b3507ad 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -4275,6 +4275,18 @@ sna_fill_spans__fill(DrawablePtr drawable,
 }
 
 static void
+sna_fill_spans__dash(DrawablePtr drawable,
+		     GCPtr gc, int n,
+		     DDXPointPtr pt, int *width, int sorted)
+{
+	struct sna_fill_spans *data = sna_gc(gc)->priv;
+	struct sna_fill_op *op = data->op;
+
+	if (op->base.u.blt.pixel == gc->fgPixel)
+		sna_fill_spans__fill(drawable, gc, n, pt, width, sorted);
+}
+
+static void
 sna_fill_spans__fill_offset(DrawablePtr drawable,
 			    GCPtr gc, int n,
 			    DDXPointPtr pt, int *width, int sorted)
@@ -4306,6 +4318,18 @@ sna_fill_spans__fill_offset(DrawablePtr drawable,
 }
 
 static void
+sna_fill_spans__dash_offset(DrawablePtr drawable,
+			    GCPtr gc, int n,
+			    DDXPointPtr pt, int *width, int sorted)
+{
+	struct sna_fill_spans *data = sna_gc(gc)->priv;
+	struct sna_fill_op *op = data->op;
+
+	if (op->base.u.blt.pixel == gc->fgPixel)
+		sna_fill_spans__fill_offset(drawable, gc, n, pt, width, sorted);
+}
+
+static void
 sna_fill_spans__fill_clip_extents(DrawablePtr drawable,
 				  GCPtr gc, int n,
 				  DDXPointPtr pt, int *width, int sorted)
@@ -4349,6 +4373,18 @@ sna_fill_spans__fill_clip_extents(DrawablePtr drawable,
 }
 
 static void
+sna_fill_spans__dash_clip_extents(DrawablePtr drawable,
+				  GCPtr gc, int n,
+				  DDXPointPtr pt, int *width, int sorted)
+{
+	struct sna_fill_spans *data = sna_gc(gc)->priv;
+	struct sna_fill_op *op = data->op;
+
+	if (op->base.u.blt.pixel == gc->fgPixel)
+		sna_fill_spans__fill_clip_extents(drawable, gc, n, pt, width, sorted);
+}
+
+static void
 sna_fill_spans__fill_clip_boxes(DrawablePtr drawable,
 				GCPtr gc, int n,
 				DDXPointPtr pt, int *width, int sorted)
@@ -4422,6 +4458,18 @@ sna_fill_spans__fill_clip_boxes(DrawablePtr drawable,
 		op->boxes(data->sna, op, box, b - box);
 }
 
+static void
+sna_fill_spans__dash_clip_boxes(DrawablePtr drawable,
+				GCPtr gc, int n,
+				DDXPointPtr pt, int *width, int sorted)
+{
+	struct sna_fill_spans *data = sna_gc(gc)->priv;
+	struct sna_fill_op *op = data->op;
+
+	if (op->base.u.blt.pixel == gc->fgPixel)
+		sna_fill_spans__fill_clip_boxes(drawable, gc, n, pt, width, sorted);
+}
+
 static Bool
 sna_fill_spans_blt(DrawablePtr drawable,
 		   struct kgem_bo *bo, struct sna_damage **damage,
@@ -6646,44 +6694,78 @@ spans_fallback:
 		get_drawable_deltas(drawable, data.pixmap, &data.dx, &data.dy);
 		sna_gc(gc)->priv = &data;
 
-		if (gc->lineWidth == 0 &&
-		    gc_is_solid(gc, &color)) {
+		if (gc->lineWidth == 0 && gc_is_solid(gc, &color)) {
 			struct sna_fill_op fill;
 
-			if (!sna_fill_init_blt(&fill,
-					       data.sna, data.pixmap,
-					       data.bo, gc->alu, color))
-				goto fallback;
+			if (gc->lineStyle == LineSolid) {
+				if (!sna_fill_init_blt(&fill,
+						       data.sna, data.pixmap,
+						       data.bo, gc->alu, color))
+					goto fallback;
 
-			data.op = &fill;
+				data.op = &fill;
 
-			if ((data.flags & 2) == 0) {
-				if (data.dx | data.dy)
-					sna_gc_ops__tmp.FillSpans = sna_fill_spans__fill_offset;
-				else
-					sna_gc_ops__tmp.FillSpans = sna_fill_spans__fill;
-			} else {
-				region_maybe_clip(&data.region,
-						  gc->pCompositeClip);
-				if (!RegionNotEmpty(&data.region))
-					return;
+				if ((data.flags & 2) == 0) {
+					if (data.dx | data.dy)
+						sna_gc_ops__tmp.FillSpans = sna_fill_spans__fill_offset;
+					else
+						sna_gc_ops__tmp.FillSpans = sna_fill_spans__fill;
+				} else {
+					region_maybe_clip(&data.region,
+							  gc->pCompositeClip);
+					if (!RegionNotEmpty(&data.region))
+						return;
 
-				if (region_is_singular(&data.region))
-					sna_gc_ops__tmp.FillSpans = sna_fill_spans__fill_clip_extents;
-				else
-					sna_gc_ops__tmp.FillSpans = sna_fill_spans__fill_clip_boxes;
-			}
-			assert(gc->miTranslate);
+					if (region_is_singular(&data.region))
+						sna_gc_ops__tmp.FillSpans = sna_fill_spans__fill_clip_extents;
+					else
+						sna_gc_ops__tmp.FillSpans = sna_fill_spans__fill_clip_boxes;
+				}
+				assert(gc->miTranslate);
 
-			gc->ops = &sna_gc_ops__tmp;
-			if (gc->lineStyle == LineSolid) {
+				gc->ops = &sna_gc_ops__tmp;
 				DBG(("%s: miZeroLine (solid fill)\n", __FUNCTION__));
 				miZeroLine(drawable, gc, mode, n, pt);
+				fill.done(data.sna, &fill);
 			} else {
-				DBG(("%s: miZeroDashLine (solid fill)\n", __FUNCTION__));
+				data.op = &fill;
+
+				if ((data.flags & 2) == 0) {
+					if (data.dx | data.dy)
+						sna_gc_ops__tmp.FillSpans = sna_fill_spans__dash_offset;
+					else
+						sna_gc_ops__tmp.FillSpans = sna_fill_spans__dash;
+				} else {
+					region_maybe_clip(&data.region,
+							  gc->pCompositeClip);
+					if (!RegionNotEmpty(&data.region))
+						return;
+
+					if (region_is_singular(&data.region))
+						sna_gc_ops__tmp.FillSpans = sna_fill_spans__dash_clip_extents;
+					else
+						sna_gc_ops__tmp.FillSpans = sna_fill_spans__dash_clip_boxes;
+				}
+				assert(gc->miTranslate);
+
+				gc->ops = &sna_gc_ops__tmp;
+				DBG(("%s: miZeroLine (solid dash)\n", __FUNCTION__));
+				if (!sna_fill_init_blt(&fill,
+						       data.sna, data.pixmap,
+						       data.bo, gc->alu, color))
+					goto fallback;
+
 				miZeroDashLine(drawable, gc, mode, n, pt);
+				fill.done(data.sna, &fill);
+
+				if (sna_fill_init_blt(&fill,
+						       data.sna, data.pixmap,
+						       data.bo, gc->alu,
+						       gc->bgPixel)) {
+					miZeroDashLine(drawable, gc, mode, n, pt);
+					fill.done(data.sna, &fill);
+				}
 			}
-			fill.done(data.sna, &fill);
 		} else {
 			/* Note that the WideDash functions alternate
 			 * between filling using fgPixel and bgPixel
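
In outline, the two passes look like this (a sketch under the assumption,
suggested by the dash hooks above, that miZeroDashLine flips gc->fgPixel
between the on and off colours as it walks the dash pattern, so each pass
draws only the phase whose colour matches the installed fill):

    if (sna_fill_init_blt(&fill, sna, pixmap, bo, gc->alu, gc->fgPixel)) {
        miZeroDashLine(drawable, gc, mode, n, pt);  /* dash-on phase */
        fill.done(sna, &fill);
    }
    if (sna_fill_init_blt(&fill, sna, pixmap, bo, gc->alu, gc->bgPixel)) {
        miZeroDashLine(drawable, gc, mode, n, pt);  /* dash-off phase */
        fill.done(sna, &fill);
    }

Each pass sets up its fill state exactly once, which is what the fast paths
require.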
commit ba6f45aca54241c392802b8e5cbffbfe0d65ebb9
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Apr 12 22:46:22 2012 +0100

    sna: Restore CPU domain for vmapped buffers when reusing
    
    For a vmapped upload buffer, we need to notify the kernel (and thereby
    the GPU) to invalidate the sampler and flush its caches when we reuse an
    idle buffer.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 6ea4d48..14a0067 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -3600,6 +3600,10 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 			offset = 0;
 			bo->used = size;
 			list_move(&bo->base.list, &kgem->active_partials);
+
+			if (bo->base.vmap)
+				kgem_bo_sync__cpu(kgem, &bo->base);
+
 			goto done;
 		} while (kgem_retire(kgem));
 	}
commit fef43bea7891948a2b79f80d5ce258f7486e7581
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Apr 12 22:23:12 2012 +0100

    sna: Revert use of mmap64()
    
    As this just causes mayhem on a 64-bit platform. Doomed if you do, doomed
    if you don't. :(
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 109f3b7..6ea4d48 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -220,7 +220,7 @@ static void *gem_mmap(int fd, uint32_t handle, int size, int prot)
 		return NULL;
 	}
 
-	ptr = mmap64(0, size, prot, MAP_SHARED, fd, mmap_arg.offset);
+	ptr = mmap(0, size, prot, MAP_SHARED, fd, mmap_arg.offset);
 	if (ptr == MAP_FAILED) {
 		assert(0);
 		ptr = NULL;
commit 18f54191a1c14accc84b8d9e54eb9c2579a64ceb
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Apr 12 15:37:25 2012 +0100

    sna: Declare AC_SYS_LARGEFILE for mmap64
    
    In order to use the full 32-bits of mmap address space on small
    platforms we need to use mmap64().
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/configure.ac b/configure.ac
index c8a7531..0489f64 100644
--- a/configure.ac
+++ b/configure.ac
@@ -52,6 +52,7 @@ m4_ifndef([XORG_DRIVER_CHECK_EXT],
 # Initialize libtool
 AC_DISABLE_STATIC
 AC_PROG_LIBTOOL
+AC_SYS_LARGEFILE
 
 # Are we in a git checkout?
 dot_git=no
diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 6ea4d48..109f3b7 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -220,7 +220,7 @@ static void *gem_mmap(int fd, uint32_t handle, int size, int prot)
 		return NULL;
 	}
 
-	ptr = mmap(0, size, prot, MAP_SHARED, fd, mmap_arg.offset);
+	ptr = mmap64(0, size, prot, MAP_SHARED, fd, mmap_arg.offset);
 	if (ptr == MAP_FAILED) {
 		assert(0);
 		ptr = NULL;
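
Read together with the revert above, the intent is presumably this:
AC_SYS_LARGEFILE makes configure define the large-file macros in config.h
(e.g. _FILE_OFFSET_BITS=64 on 32-bit platforms), so the plain mmap() call
already takes a 64-bit offset there, and no explicit mmap64() is needed on
any platform. A sketch of the assumption:

    #include "config.h"     /* AC_SYS_LARGEFILE: _FILE_OFFSET_BITS=64 */
    #include <sys/mman.h>

    /* off_t is 64-bit even on 32-bit builds, so a GTT offset above
     * 4GiB reaches the kernel intact through plain mmap(). */
    ptr = mmap(0, size, prot, MAP_SHARED, fd, mmap_arg.offset);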
commit 09d31518d7a99f13a8e190c9f9c9e549139c45ce
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Apr 10 20:59:36 2012 +0100

    sna: Check ioctl return from set-domain
    
    Let's not assume it succeeds and so avoid altering our bookkeeping along
    failure paths.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 2a8b3cd..6ea4d48 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -3213,10 +3213,10 @@ void *kgem_bo_map(struct kgem *kgem, struct kgem_bo *bo)
 		set_domain.handle = bo->handle;
 		set_domain.read_domains = I915_GEM_DOMAIN_GTT;
 		set_domain.write_domain = I915_GEM_DOMAIN_GTT;
-		drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &set_domain);
-
-		kgem_bo_retire(kgem, bo);
-		bo->domain = DOMAIN_GTT;
+		if (drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &set_domain) == 0) {
+			kgem_bo_retire(kgem, bo);
+			bo->domain = DOMAIN_GTT;
+		}
 	}
 
 	return ptr;
@@ -3409,10 +3409,10 @@ void kgem_bo_sync__cpu(struct kgem *kgem, struct kgem_bo *bo)
 		set_domain.read_domains = I915_GEM_DOMAIN_CPU;
 		set_domain.write_domain = I915_GEM_DOMAIN_CPU;
 
-		drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &set_domain);
-
-		kgem_bo_retire(kgem, bo);
-		bo->domain = DOMAIN_CPU;
+		if (drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &set_domain) == 0) {
+			kgem_bo_retire(kgem, bo);
+			bo->domain = DOMAIN_CPU;
+		}
 	}
 }
 
@@ -4033,11 +4033,15 @@ void kgem_buffer_read_sync(struct kgem *kgem, struct kgem_bo *_bo)
 		set_domain.read_domains =
 			IS_CPU_MAP(bo->base.map) ? I915_GEM_DOMAIN_CPU : I915_GEM_DOMAIN_GTT;
 
-		drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &set_domain);
+		if (drmIoctl(kgem->fd,
+			     DRM_IOCTL_I915_GEM_SET_DOMAIN, &set_domain))
+			return;
 	} else {
-		gem_read(kgem->fd,
-			 bo->base.handle, (char *)bo->mem+offset,
-			 offset, length);
+		if (gem_read(kgem->fd,
+			     bo->base.handle, (char *)bo->mem+offset,
+			     offset, length))
+			return;
+
 		kgem_bo_map__cpu(kgem, &bo->base);
 	}
 	kgem_bo_retire(kgem, &bo->base);
commit 693d2c83a507729d0e5872b48304aa6286771b7e
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Apr 10 15:23:54 2012 +0100

    sna: Fix shadowed variable
    
    sna_accel.c: In function 'sna_pixmap_move_area_to_gpu':
    sna_accel.c:1751:12: warning: declaration of 'flags' shadows a parameter
    [-Wshadow]
    sna_accel.c:1731:72: warning: shadowed declaration is here [-Wshadow]
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index b6adc60..590cc11 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -1748,20 +1748,20 @@ sna_pixmap_move_area_to_gpu(PixmapPtr pixmap, BoxPtr box, unsigned int flags)
 	}
 
 	if (priv->gpu_bo == NULL) {
-		unsigned flags;
+		unsigned create;
 
-		flags = 0;
+		create = 0;
 		if (priv->cpu_damage)
-			flags |= CREATE_INACTIVE;
+			create |= CREATE_INACTIVE;
 		if (pixmap->usage_hint == SNA_CREATE_FB)
-			flags |= CREATE_EXACT | CREATE_SCANOUT;
+			create |= CREATE_EXACT | CREATE_SCANOUT;
 
 		priv->gpu_bo = kgem_create_2d(&sna->kgem,
 					      pixmap->drawable.width,
 					      pixmap->drawable.height,
 					      pixmap->drawable.bitsPerPixel,
 					      sna_pixmap_choose_tiling(pixmap),
-					      flags);
+					      create);
 		if (priv->gpu_bo == NULL)
 			return false;
 
commit 6d376dbf0f00da0674a5f12d9a45b58043012863
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Apr 10 15:19:19 2012 +0100

    sna: Fix typo and use the right pointer for kgem_bo_destroy
    
    Useless warnings in xorg headers ftl.
    
    References: https://bugs.freedesktop.org/show_bug.cgi?id=48400
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index ab2997d..b6adc60 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -1934,7 +1934,8 @@ sna_drawable_use_bo(DrawablePtr drawable,
 
 	if (priv->gpu_bo && priv->gpu_bo->proxy) {
 		assert(priv->gpu_bo->proxy->rq);
-		kgem_bo_destroy(to_sna_from_pixmap(pixmap), priv->gpu_bo);
+		kgem_bo_destroy(&to_sna_from_pixmap(pixmap)->kgem,
+				priv->gpu_bo);
 		priv->gpu_bo = NULL;
 		goto use_cpu_bo;
 	}
@@ -2727,7 +2728,7 @@ sna_put_zpixmap_blt(DrawablePtr drawable, GCPtr gc, RegionPtr region,
 	if (priv->gpu_bo && priv->gpu_bo->proxy) {
 		DBG(("%s: discarding cached upload buffer\n", __FUNCTION__));
 		assert(priv->gpu_bo->proxy->rq);
-		kgem_bo_destroy(sna, priv->gpu_bo);
+		kgem_bo_destroy(&sna->kgem, priv->gpu_bo);
 		priv->gpu_bo = NULL;
 	}
 
commit b98cf6745c73844d51ffe0f22e3fa55661a8b93c
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Apr 10 14:43:28 2012 +0100

    sna: Add missing alloc failure check for creating tile source
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 083cd3c..ab2997d 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -8767,6 +8767,9 @@ sna_pixmap_get_source_bo(PixmapPtr pixmap)
 					       pixmap->drawable.bitsPerPixel,
 					       KGEM_BUFFER_WRITE_INPLACE,
 					       &ptr);
+		if (upload == NULL)
+			return NULL;
+
 		memcpy_blt(pixmap->devPrivate.ptr, ptr,
 			   pixmap->drawable.bitsPerPixel,
 			   pixmap->devKind, upload->pitch,
commit faa93f14b06e1e3443c2a58e60ff363e4b7a90e4
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Apr 10 12:06:56 2012 +0100

    sna: Release the freed bo cache upon expire
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 581e3c8..2a8b3cd 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -1968,6 +1968,12 @@ bool kgem_expire_cache(struct kgem *kgem)
 	bool idle;
 	unsigned int i;
 
+	while (__kgem_freed_bo) {
+		bo = __kgem_freed_bo;
+		__kgem_freed_bo = *(struct kgem_bo **)bo;
+		free(bo);
+	}
+
 	kgem_retire(kgem);
 	if (kgem->wedged)
 		kgem_cleanup(kgem);
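
The cache being drained at the top of kgem_expire_cache() is an intrusive
free list: the first pointer-sized bytes of each dead allocation store the
link to the next one, so keeping the cache costs no additional memory. A
minimal sketch of the idiom (the push side is assumed from the pop loop
above):

    static struct kgem_bo *__kgem_freed_bo;

    static void freed_bo_push(struct kgem_bo *bo)
    {
        /* reuse the dead object's own storage as the list link */
        *(struct kgem_bo **)bo = __kgem_freed_bo;
        __kgem_freed_bo = bo;
    }

    static void freed_bo_drain(void)
    {
        while (__kgem_freed_bo) {
            struct kgem_bo *bo = __kgem_freed_bo;
            __kgem_freed_bo = *(struct kgem_bo **)bo;
            free(bo);
        }
    }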
commit 2a71b8debbbbf55d6356ff6391a807e7a97513ae
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon Apr 9 22:54:51 2012 +0100

    sna: Check for an inactive partial buffer to reuse after retiring requests
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 470cefb..581e3c8 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -3575,7 +3575,7 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 	}
 
 	if (flags & KGEM_BUFFER_WRITE) {
-		list_for_each_entry_reverse(bo, &kgem->inactive_partials, base.list) {
+		do list_for_each_entry_reverse(bo, &kgem->inactive_partials, base.list) {
 			assert(bo->base.io);
 			assert(bo->base.refcnt == 1);
 
@@ -3595,7 +3595,7 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 			bo->used = size;
 			list_move(&bo->base.list, &kgem->active_partials);
 			goto done;
-		}
+		} while (kgem_retire(kgem));
 	}
 
 #if !DBG_NO_MAP_UPLOAD
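
The do/while turns a single scan into a retry loop: if nothing on the
inactive list fits, kgem_retire() asks the kernel which requests have
completed (possibly moving buffers onto that list) and the scan runs again,
stopping once retiring makes no further progress. Schematically (bo_fits()
stands in for the size and flag checks elided here):

    do list_for_each_entry_reverse(bo, &kgem->inactive_partials, base.list) {
        if (bo_fits(bo, size))
            goto reuse;             /* recycle this idle buffer */
    } while (kgem_retire(kgem));    /* retire completed work, rescan */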
commit 62a6eb3cfe2ddfd350235e52c03349919b4351eb
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon Apr 9 22:54:15 2012 +0100

    sna: Release partial buffers during cache expiration
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 6dd1871..470cefb 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -1925,6 +1925,7 @@ void kgem_throttle(struct kgem *kgem)
 
 static void kgem_expire_partial(struct kgem *kgem)
 {
+	kgem_retire_partials(kgem);
 	while (!list_is_empty(&kgem->inactive_partials)) {
 		struct kgem_partial_bo *bo =
 			list_first_entry(&kgem->inactive_partials,
commit 118e980f4aba46aa1e78fc3dfc0beab0439548f0
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon Apr 9 22:53:31 2012 +0100

    sna: Repeat expire whilst there remain outstanding requests
    
    Do not allow the cache expiration to finish if we are still running
    requests.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index dde9d1d..6dd1871 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -1979,7 +1979,7 @@ bool kgem_expire_cache(struct kgem *kgem)
 	time(&now);
 	expire = 0;
 
-	idle = true;
+	idle = !kgem->need_retire;
 	for (i = 0; i < ARRAY_SIZE(kgem->inactive); i++) {
 		idle &= list_is_empty(&kgem->inactive[i]);
 		list_for_each_entry(bo, &kgem->inactive[i], list) {
@@ -1999,7 +1999,7 @@ bool kgem_expire_cache(struct kgem *kgem)
 	if (expire == 0)
 		return true;
 
-	idle = true;
+	idle = !kgem->need_retire;
 	for (i = 0; i < ARRAY_SIZE(kgem->inactive); i++) {
 		struct list preserve;
 
commit 0b1324fbef61b10d9ea687a6e3ada905ba29f9cf
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Apr 10 11:14:50 2012 +0100

    sna: Only move the bo into the read domain for readback
    
    And mark it as currently in no domain afterwards, so that if we reuse
    it, it will be appropriately moved later.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index c8be7c8..dde9d1d 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -3997,7 +3997,6 @@ void kgem_buffer_read_sync(struct kgem *kgem, struct kgem_bo *_bo)
 {
 	struct kgem_partial_bo *bo;
 	uint32_t offset = _bo->delta, length = _bo->size.bytes;
-	int domain;
 
 	assert(_bo->io);
 	assert(_bo->exec == &_kgem_dummy_exec);
@@ -4023,15 +4022,9 @@ void kgem_buffer_read_sync(struct kgem *kgem, struct kgem_bo *_bo)
 
 		VG_CLEAR(set_domain);
 		set_domain.handle = bo->base.handle;
-		if (IS_CPU_MAP(bo->base.map)) {
-			set_domain.read_domains = I915_GEM_DOMAIN_CPU;
-			set_domain.write_domain = I915_GEM_DOMAIN_CPU;
-			domain = DOMAIN_CPU;
-		} else {
-			set_domain.read_domains = I915_GEM_DOMAIN_GTT;
-			set_domain.write_domain = I915_GEM_DOMAIN_GTT;
-			domain = DOMAIN_GTT;
-		}
+		set_domain.write_domain = 0;
+		set_domain.read_domains =
+			IS_CPU_MAP(bo->base.map) ? I915_GEM_DOMAIN_CPU : I915_GEM_DOMAIN_GTT;
 
 		drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &set_domain);
 	} else {
@@ -4039,11 +4032,9 @@ void kgem_buffer_read_sync(struct kgem *kgem, struct kgem_bo *_bo)
 			 bo->base.handle, (char *)bo->mem+offset,
 			 offset, length);
 		kgem_bo_map__cpu(kgem, &bo->base);
-		domain = DOMAIN_NONE;
 	}
-	kgem_retire(kgem);
-	assert(bo->base.rq == NULL);
-	bo->base.domain = domain;
+	kgem_bo_retire(kgem, &bo->base);
+	bo->base.domain = DOMAIN_NONE;
 }
 
 uint32_t kgem_bo_get_binding(struct kgem_bo *bo, uint32_t format)
commit 861ce4facbe1f6fe030c161b924aaf9074833ba7
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon Apr 9 14:09:42 2012 +0100

    sna: Add assertions around proxy list handling
    
    References: https://bugs.freedesktop.org/show_bug.cgi?id=48400
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index f23bf0c..c8be7c8 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -320,6 +320,7 @@ static void kgem_bo_retire(struct kgem *kgem, struct kgem_bo *bo)
 	if (bo->exec == NULL) {
 		DBG(("%s: retiring bo handle=%d (needed flush? %d), rq? %d\n",
 		     __FUNCTION__, bo->handle, bo->needs_flush, bo->rq != NULL));
+		assert(list_is_empty(&bo->vma));
 		bo->rq = NULL;
 		list_del(&bo->request);
 		bo->needs_flush = bo->flush;
@@ -1040,6 +1041,7 @@ inline static void kgem_bo_move_to_inactive(struct kgem *kgem,
 	assert(bo->rq == NULL);
 	assert(bo->domain != DOMAIN_GPU);
 	assert(bo->reusable);
+	assert(list_is_empty(&bo->vma));
 
 	if (bucket(bo) >= NUM_CACHE_BUCKETS) {
 		kgem_bo_free(kgem, bo);
@@ -1051,12 +1053,11 @@ inline static void kgem_bo_move_to_inactive(struct kgem *kgem,
 		int type = IS_CPU_MAP(bo->map);
 		if (bucket(bo) >= NUM_CACHE_BUCKETS ||
 		    (!type && !kgem_bo_is_mappable(kgem, bo))) {
-			list_del(&bo->vma);
 			munmap(MAP(bo->map), bytes(bo));
 			bo->map = NULL;
 		}
 		if (bo->map) {
-			list_move(&bo->vma, &kgem->vma[type].inactive[bucket(bo)]);
+			list_add(&bo->vma, &kgem->vma[type].inactive[bucket(bo)]);
 			kgem->vma[type].count++;
 		}
 	}
commit 0fd9f7038c1b4f5a36fd95a3dd191f52b7108887
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon Apr 9 10:48:08 2012 +0100

    sna/traps: Use a temporary variable for the write pointer
    
    To avoid accumulating the write offset for wide spans, we need to reset
    the destination pointer between spans.
    
    Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=48332
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_trapezoids.c b/src/sna/sna_trapezoids.c
index f7a2111..120e755 100644
--- a/src/sna/sna_trapezoids.c
+++ b/src/sna/sna_trapezoids.c
@@ -1476,55 +1476,61 @@ inplace_row(struct active_list *active, uint8_t *row, int width)
 		} else
 			FAST_SAMPLES_X_TO_INT_FRAC(right->x.quo, rix, rfx);
 		if (lix == rix) {
-			if (rfx != lfx)
+			if (rfx != lfx) {
+				assert(lix < width);
 				row[lix] += (rfx-lfx) * 256 / FAST_SAMPLES_X;
+			}
 		} else {
+			assert(lix < width);
 			if (lfx == 0)
 				row[lix] = 0xff;
 			else
 				row[lix] += 256 - lfx * 256 / FAST_SAMPLES_X;
 
-			if (rfx)
+			assert(rix <= width);
+			if (rfx) {
+				assert(rix < width);
 				row[rix] += rfx * 256 / FAST_SAMPLES_X;
+			}
 
 			if (rix > ++lix) {
+				uint8_t *r = row + lix;
 				rix -= lix;
-				row += lix;
 #if 0
 				if (rix == 1)
 					*row = 0xff;
 				else
 					memset(row, 0xff, rix);
 #else
-				if ((uintptr_t)row & 1 && rix) {
-					*row++ = 0xff;
+				if ((uintptr_t)r & 1 && rix) {
+					*r++ = 0xff;
 					rix--;
 				}
-				if ((uintptr_t)row & 2 && rix >= 2) {
-					*(uint16_t *)row = 0xffff;
-					row += 2;
+				if ((uintptr_t)r & 2 && rix >= 2) {
+					*(uint16_t *)r = 0xffff;
+					r += 2;
 					rix -= 2;
 				}
-				if ((uintptr_t)row & 4 && rix >= 4) {
-					*(uint32_t *)row = 0xffffffff;
-					row += 4;
+				if ((uintptr_t)r & 4 && rix >= 4) {
+					*(uint32_t *)r = 0xffffffff;
+					r += 4;
 					rix -= 4;
 				}
 				while (rix >= 8) {
-					*(uint64_t *)row = 0xffffffffffffffff;
-					row += 8;
+					*(uint64_t *)r = 0xffffffffffffffff;
+					r += 8;
 					rix -= 8;
 				}
 				if (rix & 4) {
-					*(uint32_t *)row = 0xffffffff;
-					row += 4;
+					*(uint32_t *)r = 0xffffffff;
+					r += 4;
 				}
 				if (rix & 2) {
-					*(uint16_t *)row = 0xffff;
-					row += 2;
+					*(uint16_t *)r = 0xffff;
+					r += 2;
 				}
 				if (rix & 1)
-					*row = 0xff;
+					*r = 0xff;
 #endif
 			}
 		}
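
The bug, reduced to its essence: the old code advanced row itself while
filling a span, so the next span's indices were applied to an already-shifted
base, writing past the intended pixels. Filling through a temporary cursor
keeps the scanline base fixed:

    uint8_t *r = row + lix;     /* per-span write cursor */
    while (rix >= 8) {
        *(uint64_t *)r = 0xffffffffffffffff;
        r += 8;
        rix -= 8;
    }
    /* row still points at x = 0, ready for the next span */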
commit 47db6d42b260e87c194ea69cce078210b0e93828
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon Apr 9 10:42:18 2012 +0100

    sna/traps: Assert that the inplace row is contained before writing
    
    References: https://bugs.freedesktop.org/show_bug.cgi?id=48332
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_trapezoids.c b/src/sna/sna_trapezoids.c
index dbf581e..f7a2111 100644
--- a/src/sna/sna_trapezoids.c
+++ b/src/sna/sna_trapezoids.c
@@ -1786,6 +1786,7 @@ tor_inplace(struct tor *converter, PixmapPtr scratch, int mono, uint8_t *buf)
 
 				inplace_subrow(active, ptr, width, &min, &max);
 			}
+			assert(min >= 0 && max <= width);
 			memset(row, 0, min);
 			if (max > min)
 				inplace_end_subrows(active, row+min, (int8_t*)ptr+min, max-min);
commit 7ed1ff6428264d2f792c1c36d803e322d886a451
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sun Apr 8 17:16:03 2012 +0100

    sna: Add some assertions for misuse of proxies
    
    References: https://bugs.freedesktop.org/show_bug.cgi?id=48400
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index de02e9d..f23bf0c 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -332,6 +332,7 @@ Bool kgem_bo_write(struct kgem *kgem, struct kgem_bo *bo,
 	assert(bo->refcnt);
 	assert(!bo->purged);
 	assert(!kgem_busy(kgem, bo->handle));
+	assert(bo->proxy == NULL);
 
 	assert(length <= bytes(bo));
 	if (gem_write(kgem->fd, bo->handle, 0, length, data))
@@ -3156,6 +3157,7 @@ void *kgem_bo_map(struct kgem *kgem, struct kgem_bo *bo)
 	     bo->handle, bo->presumed_offset, bo->tiling, bo->map, bo->domain));
 
 	assert(!bo->purged);
+	assert(bo->proxy == NULL);
 	assert(bo->exec == NULL);
 	assert(list_is_empty(&bo->list));
 
@@ -3385,6 +3387,7 @@ struct kgem_bo *kgem_create_map(struct kgem *kgem,
 
 void kgem_bo_sync__cpu(struct kgem *kgem, struct kgem_bo *bo)
 {
+	assert(bo->proxy == NULL);
 	kgem_bo_submit(kgem, bo);
 
 	if (bo->domain != DOMAIN_CPU) {
@@ -4037,7 +4040,8 @@ void kgem_buffer_read_sync(struct kgem *kgem, struct kgem_bo *_bo)
 		kgem_bo_map__cpu(kgem, &bo->base);
 		domain = DOMAIN_NONE;
 	}
-	kgem_bo_retire(kgem, &bo->base);
+	kgem_retire(kgem);
+	assert(bo->base.rq == NULL);
 	bo->base.domain = domain;
 }
 
diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 7425a51..083cd3c 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -1933,6 +1933,7 @@ sna_drawable_use_bo(DrawablePtr drawable,
 	}
 
 	if (priv->gpu_bo && priv->gpu_bo->proxy) {
+		assert(priv->gpu_bo->proxy->rq);
 		kgem_bo_destroy(to_sna_from_pixmap(pixmap), priv->gpu_bo);
 		priv->gpu_bo = NULL;
 		goto use_cpu_bo;
@@ -2725,6 +2726,7 @@ sna_put_zpixmap_blt(DrawablePtr drawable, GCPtr gc, RegionPtr region,
 
 	if (priv->gpu_bo && priv->gpu_bo->proxy) {
 		DBG(("%s: discarding cached upload buffer\n", __FUNCTION__));
+		assert(priv->gpu_bo->proxy->rq);
 		kgem_bo_destroy(sna, priv->gpu_bo);
 		priv->gpu_bo = NULL;
 	}
@@ -3520,6 +3522,7 @@ sna_copy_boxes(DrawablePtr src, DrawablePtr dst, GCPtr gc,
 
 	if (dst_priv->gpu_bo && dst_priv->gpu_bo->proxy) {
 		DBG(("%s: discarding cached upload\n", __FUNCTION__));
+		assert(dst_priv->gpu_bo->proxy->rq);
 		kgem_bo_destroy(&sna->kgem, dst_priv->gpu_bo);
 		dst_priv->gpu_bo = NULL;
 	}
commit 4fb01d3e756c4fda716b6cc76028af7fdecc9cbc
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sun Apr 8 14:25:49 2012 +0100

    sna: Compress adjoining spans during FillSpans
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 8f02380..7425a51 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -4255,8 +4255,15 @@ sna_fill_spans__fill(DrawablePtr drawable,
 			b->y2 = b->y1 + 1;
 			DBG(("%s: (%d, %d), (%d, %d)\n",
 			     __FUNCTION__, b->x1, b->y1, b->x2, b->y2));
-			if (b->x2 > b->x1)
-				b++;
+			if (b->x2 > b->x1) {
+				if (b != box &&
+				    b->y1 == b[-1].y2 &&
+				    b->x1 == b[-1].x1 &&
+				    b->x2 == b[-1].x2)
+					b[-1].y2 = b->y2;
+				else
+					b++;
+			}
 		} while (--nbox);
 		if (b != box)
 			op->boxes(data->sna, op, box, b - box);
@@ -4322,7 +4329,12 @@ sna_fill_spans__fill_clip_extents(DrawablePtr drawable,
 				b->x1 += data->dx; b->x2 += data->dx;
 				b->y1 += data->dy; b->y2 += data->dy;
 			}
-			if (++b == last_box) {
+			if (b != box &&
+			    b->y1 == b[-1].y2 &&
+			    b->x1 == b[-1].x1 &&
+			    b->x2 == b[-1].x2) {
+				b[-1].y2 = b->y2;
+			} else if (++b == last_box) {
 				op->boxes(data->sna, op, box, last_box - box);
 				b = box;
 			}
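
The coalescing rule on its own: a freshly emitted span merges into the
previous box when both cover the same x-extent and the new one starts exactly
where the previous ended vertically. As a standalone predicate (a sketch, not
a helper in the driver):

    static inline Bool
    spans_mergeable(const BoxRec *prev, const BoxRec *next)
    {
        return next->y1 == prev->y2 &&  /* vertically contiguous */
               next->x1 == prev->x1 &&
               next->x2 == prev->x2;    /* identical x-extents */
    }

When it holds, extending prev->y2 replaces emitting a new box, so stacks of
one-pixel-high spans collapse into tall rectangles before reaching
op->boxes().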
commit 417a11d0b0ccf3c0cb13991ed7d24a98bd8b7ea5
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sun Apr 8 13:51:13 2012 +0100

    sna: Remove the duplicated check for use-bo? in PolySegments
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 61710ad..8f02380 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -7458,6 +7458,13 @@ sna_poly_segment(DrawablePtr drawable, GCPtr gc, int n, xSegment *seg)
 	     data.flags & 4));
 	if (!PM_IS_SOLID(drawable, gc->planemask))
 		goto fallback;
+
+	data.bo = sna_drawable_use_bo(drawable,
+				      &data.region.extents,
+				      &data.damage);
+	if (data.bo == NULL)
+		goto fallback;
+
 	if (gc->lineStyle != LineSolid || gc->lineWidth > 1)
 		goto spans_fallback;
 	if (gc_is_solid(gc, &color)) {
@@ -7465,10 +7472,7 @@ sna_poly_segment(DrawablePtr drawable, GCPtr gc, int n, xSegment *seg)
 		     __FUNCTION__, (unsigned)color, data.flags));
 
 		if (data.flags & 4) {
-			if ((data.bo = sna_drawable_use_bo(drawable,
-							   &data.region.extents,
-							   &data.damage)) &&
-			    sna_poly_segment_blt(drawable,
+			if (sna_poly_segment_blt(drawable,
 						 data.bo, data.damage,
 						 gc, color, n, seg,
 						 &data.region.extents,
@@ -7476,9 +7480,6 @@ sna_poly_segment(DrawablePtr drawable, GCPtr gc, int n, xSegment *seg)
 				return;
 		} else {
 			if (use_zero_spans(drawable, gc, &data.region.extents) &&
-			    (data.bo = sna_drawable_use_bo(drawable,
-							   &data.region.extents,
-							   &data.damage)) &&
 			    sna_poly_zero_segment_blt(drawable,
 						      data.bo, data.damage,
 						      gc, n, seg,
@@ -7488,70 +7489,67 @@ sna_poly_segment(DrawablePtr drawable, GCPtr gc, int n, xSegment *seg)
 		}
 	} else if (data.flags & 4) {
 		/* Try converting these to a set of rectangles instead */
-		if ((data.bo = sna_drawable_use_bo(drawable, &data.region.extents, &data.damage))) {
-			xRectangle *rect;
-			int i;
-
-			DBG(("%s: converting to rectagnles\n", __FUNCTION__));
+		xRectangle *rect;
+		int i;
 
-			rect = malloc (n * sizeof (xRectangle));
-			if (rect == NULL)
-				return;
+		DBG(("%s: converting to rectagnles\n", __FUNCTION__));
 
-			for (i = 0; i < n; i++) {
-				if (seg[i].x1 < seg[i].x2) {
-					rect[i].x = seg[i].x1;
-					rect[i].width = seg[i].x2 - seg[i].x1 + 1;
-				} else if (seg[i].x1 > seg[i].x2) {
-					rect[i].x = seg[i].x2;
-					rect[i].width = seg[i].x1 - seg[i].x2 + 1;
-				} else {
-					rect[i].x = seg[i].x1;
-					rect[i].width = 1;
-				}
-				if (seg[i].y1 < seg[i].y2) {
-					rect[i].y = seg[i].y1;
-					rect[i].height = seg[i].y2 - seg[i].y1 + 1;
-				} else if (seg[i].x1 > seg[i].y2) {
-					rect[i].y = seg[i].y2;
-					rect[i].height = seg[i].y1 - seg[i].y2 + 1;
-				} else {
-					rect[i].y = seg[i].y1;
-					rect[i].height = 1;
-				}
+		rect = malloc (n * sizeof (xRectangle));
+		if (rect == NULL)
+			return;
 
-				/* don't paint last pixel */
-				if (gc->capStyle == CapNotLast) {
-					if (seg[i].x1 == seg[i].x2)
-						rect[i].height--;
-					else
-						rect[i].width--;
-				}
+		for (i = 0; i < n; i++) {
+			if (seg[i].x1 < seg[i].x2) {
+				rect[i].x = seg[i].x1;
+				rect[i].width = seg[i].x2 - seg[i].x1 + 1;
+			} else if (seg[i].x1 > seg[i].x2) {
+				rect[i].x = seg[i].x2;
+				rect[i].width = seg[i].x1 - seg[i].x2 + 1;
+			} else {
+				rect[i].x = seg[i].x1;
+				rect[i].width = 1;
 			}
-
-			if (gc->fillStyle == FillTiled) {
-				i = sna_poly_fill_rect_tiled_blt(drawable,
-								 data.bo, data.damage,
-								 gc, n, rect,
-								 &data.region.extents,
-								 data.flags & 2);
+			if (seg[i].y1 < seg[i].y2) {
+				rect[i].y = seg[i].y1;
+				rect[i].height = seg[i].y2 - seg[i].y1 + 1;
+			} else if (seg[i].x1 > seg[i].y2) {
+				rect[i].y = seg[i].y2;
+				rect[i].height = seg[i].y1 - seg[i].y2 + 1;
 			} else {
-				i = sna_poly_fill_rect_stippled_blt(drawable,
-								    data.bo, data.damage,
-								    gc, n, rect,
-								    &data.region.extents,
-								    data.flags & 2);
+				rect[i].y = seg[i].y1;
+				rect[i].height = 1;
 			}
-			free (rect);
 
-			if (i)
-				return;
+			/* don't paint last pixel */
+			if (gc->capStyle == CapNotLast) {
+				if (seg[i].x1 == seg[i].x2)
+					rect[i].height--;
+				else
+					rect[i].width--;
+			}
+		}
+
+		if (gc->fillStyle == FillTiled) {
+			i = sna_poly_fill_rect_tiled_blt(drawable,
+							 data.bo, data.damage,
+							 gc, n, rect,
+							 &data.region.extents,
+							 data.flags & 2);
+		} else {
+			i = sna_poly_fill_rect_stippled_blt(drawable,
+							    data.bo, data.damage,
+							    gc, n, rect,
+							    &data.region.extents,
+							    data.flags & 2);
 		}
+		free (rect);
+
+		if (i)
+			return;
 	}
 
 spans_fallback:
-	if (use_wide_spans(drawable, gc, &data.region.extents) &&
-	    (data.bo = sna_drawable_use_bo(drawable, &data.region.extents, &data.damage))) {
+	if (use_wide_spans(drawable, gc, &data.region.extents)) {
 		void (*line)(DrawablePtr, GCPtr, int, int, DDXPointPtr);
 		int i;
 
commit e4ab411a3be2edabaccae2556345cd548afd49b5
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sun Apr 8 13:39:47 2012 +0100

    sna: Correct partial-write flag for PolySegments fallback
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 6afd582..61710ad 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -7642,7 +7642,7 @@ fallback:
 		goto out;
 	if (!sna_drawable_move_region_to_cpu(drawable, &data.region,
 					     drawable_gc_flags(drawable, gc,
-							       !(data.flags & 4 && n > 1))))
+							       !(data.flags & 4 && n == 1))))
 		goto out;
 
 	/* Install FillSpans in case we hit a fallback path in fbPolySegment */
commit 84740bdf1669f5869300282f25d1c82da6d64553
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sun Apr 8 13:18:56 2012 +0100

    sna/gen3: Reset accumulated constants for each composite
    
    In particular the glyph routines require the composite setup to
    reinitialise state between glyph runs. This affects anything trying to
    use glyphs without a mask with a gradient source, for example.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/gen3_render.c b/src/sna/gen3_render.c
index 529884c..ed2eaf1 100644
--- a/src/sna/gen3_render.c
+++ b/src/sna/gen3_render.c
@@ -2437,7 +2437,7 @@ try_blt(struct sna *sna,
 
 static void
 gen3_align_vertex(struct sna *sna,
-		  struct sna_composite_op *op)
+		  const struct sna_composite_op *op)
 {
 	if (op->floats_per_vertex != sna->render_state.gen3.last_floats_per_vertex) {
 		if (sna->render.vertex_size - sna->render.vertex_used < 2*op->floats_per_rect)
@@ -2450,6 +2450,7 @@ gen3_align_vertex(struct sna *sna,
 		     (sna->render.vertex_used + op->floats_per_vertex - 1) / op->floats_per_vertex));
 		sna->render.vertex_index = (sna->render.vertex_used + op->floats_per_vertex - 1) / op->floats_per_vertex;
 		sna->render.vertex_used = sna->render.vertex_index * op->floats_per_vertex;
+		assert(sna->render.vertex_used < sna->render.vertex_size - op->floats_per_rect);
 		sna->render_state.gen3.last_floats_per_vertex = op->floats_per_vertex;
 	}
 }
@@ -2824,6 +2825,7 @@ gen3_render_composite(struct sna *sna,
 			return FALSE;
 	}
 
+	tmp->u.gen3.num_constants = 0;
 	tmp->src.u.gen3.type = SHADER_TEXTURE;
 	tmp->src.is_affine = TRUE;
 	DBG(("%s: preparing source\n", __FUNCTION__));
commit 1861aa47b5db35a562da476e86172d3f194eaa52
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sun Apr 8 10:09:42 2012 +0100

    sna: Release cached upload buffers when reusing a write buffer for readback
    
    References: https://bugs.freedesktop.org/show_bug.cgi?id=48400
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 27fa165..de02e9d 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -1234,6 +1234,24 @@ static void bubble_sort_partial(struct list *head, struct kgem_partial_bo *bo)
 	}
 }
 
+static void kgem_partial_buffer_release(struct kgem *kgem,
+					struct kgem_partial_bo *bo)
+{
+	while (!list_is_empty(&bo->base.vma)) {
+		struct kgem_bo *cached;
+
+		cached = list_first_entry(&bo->base.vma, struct kgem_bo, vma);
+		assert(cached->proxy == &bo->base);
+		list_del(&cached->vma);
+
+		assert(*(struct kgem_bo **)cached->map == cached);
+		*(struct kgem_bo **)cached->map = NULL;
+		cached->map = NULL;
+
+		kgem_bo_destroy(kgem, cached);
+	}
+}
+
 static void kgem_retire_partials(struct kgem *kgem)
 {
 	struct kgem_partial_bo *bo, *next;
@@ -1247,19 +1265,7 @@ static void kgem_retire_partials(struct kgem *kgem)
 
 		DBG(("%s: releasing upload cache for handle=%d? %d\n",
 		     __FUNCTION__, bo->base.handle, !list_is_empty(&bo->base.vma)));
-		while (!list_is_empty(&bo->base.vma)) {
-			struct kgem_bo *cached;
-
-			cached = list_first_entry(&bo->base.vma, struct kgem_bo, vma);
-			assert(cached->proxy == &bo->base);
-			list_del(&cached->vma);
-
-			assert(*(struct kgem_bo **)cached->map == cached);
-			*(struct kgem_bo **)cached->map = NULL;
-			cached->map = NULL;
-
-			kgem_bo_destroy(kgem, cached);
-		}
+		kgem_partial_buffer_release(kgem, bo);
 
 		assert(bo->base.refcnt > 0);
 		if (bo->base.refcnt != 1)
@@ -3519,6 +3525,7 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 			     __FUNCTION__, size, bo->used, bytes(&bo->base)));
 			gem_write(kgem->fd, bo->base.handle,
 				  0, bo->used, bo->mem);
+			kgem_partial_buffer_release(kgem, bo);
 			bo->need_io = 0;
 			bo->write = 0;
 			offset = 0;
@@ -3991,8 +3998,9 @@ void kgem_buffer_read_sync(struct kgem *kgem, struct kgem_bo *_bo)
 	assert(_bo->io);
 	assert(_bo->exec == &_kgem_dummy_exec);
 	assert(_bo->rq == NULL);
-	if (_bo->proxy)
-		_bo = _bo->proxy;
+	assert(_bo->proxy);
+
+	_bo = _bo->proxy;
 	assert(_bo->exec == NULL);
 
 	bo = (struct kgem_partial_bo *)_bo;
commit 39440d13c3fd7af8098ab194aaab218801a900d6
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sat Apr 7 10:01:01 2012 +0100

    sna/gradient: Compute the absolute delta between color stops
    
    Otherwise we do not detect gradients that start from white!
    
    Reported-by: Clemens Eisserer <linuxhippy at gmail.com>
    Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=48407
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_gradient.c b/src/sna/sna_gradient.c
index 943cbf9..32d26c8 100644
--- a/src/sna/sna_gradient.c
+++ b/src/sna/sna_gradient.c
@@ -44,39 +44,48 @@ sna_gradient_sample_width(PictGradient *gradient)
 {
 	int n, width;
 
-	width = 2;
+	width = 0;
 	for (n = 1; n < gradient->nstops; n++) {
 		xFixed dx = gradient->stops[n].x - gradient->stops[n-1].x;
-		uint16_t delta, max;
-		int ramp;
+		int delta, max, ramp;
 
 		if (dx == 0)
 			return 1024;
 
 		max = gradient->stops[n].color.red -
 			gradient->stops[n-1].color.red;
+		if (max < 0)
+			max = -max;
 
 		delta = gradient->stops[n].color.green -
 			gradient->stops[n-1].color.green;
+		if (delta < 0)
+			delta = -delta;
 		if (delta > max)
 			max = delta;
 
 		delta = gradient->stops[n].color.blue -
 			gradient->stops[n-1].color.blue;
+		if (delta < 0)
+			delta = -delta;
 		if (delta > max)
 			max = delta;
 
 		delta = gradient->stops[n].color.alpha -
 			gradient->stops[n-1].color.alpha;
+		if (delta < 0)
+			delta = -delta;
 		if (delta > max)
 			max = delta;
 
-		ramp = 128 * max / dx;
+		ramp = 256 * max / dx;
 		if (ramp > width)
 			width = ramp;
 	}
 
-	width *= gradient->nstops-1;
+	if (width == 0)
+		return 1;
+
 	width = (width + 7) & -8;
 	return min(width, 1024);
 }
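
Why the absolute value matters, worked through (reconstructing the failure
from the types above): for a stop running from white to black, red goes from
0xffff to 0 and the raw difference is -65535; stored into the old uint16_t it
wrapped to 1, the computed ramp was essentially flat, and far too few texels
were allocated for the gradient. With a signed int the sign is folded away
explicitly:

    int delta = stops[n].color.red - stops[n-1].color.red;
    if (delta < 0)
        delta = -delta;     /* magnitude of the channel change */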
commit 5ab2a916d6e19c846a26e9ff703208a38ed34c3a
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Apr 6 21:10:50 2012 +0100

    sna/video: Only wait upon the scanout pixmap
    
    Caught by the addition of the assertion.
    
    Reported-by: Jiri Slaby <jirislaby at gmail.com>
    References: https://bugs.freedesktop.org/show_bug.cgi?id=47597
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_video_textured.c b/src/sna/sna_video_textured.c
index b740b6a..4975f55 100644
--- a/src/sna/sna_video_textured.c
+++ b/src/sna/sna_video_textured.c
@@ -287,7 +287,8 @@ sna_video_textured_put_image(ScrnInfoPtr scrn,
 		}
 	}
 
-	if (crtc && video->SyncToVblank != 0)
+	if (crtc && video->SyncToVblank != 0 &&
+	    pixmap == sna->front && !sna->shadow)
 		flush = sna_wait_for_scanline(sna, pixmap, crtc,
 					      &clip->extents);
 
commit 05c34e97d7d9da6bd881530a62436a1d82735885
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Apr 6 15:38:02 2012 +0100

    sna: Correct the damage offset for redirected rendering
    
    Reported-by: Clemens Eisserer <linuxhippy at gmail.com>
    Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=48385
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_damage.c b/src/sna/sna_damage.c
index 9c646d8..f2d682d 100644
--- a/src/sna/sna_damage.c
+++ b/src/sna/sna_damage.c
@@ -945,8 +945,11 @@ struct sna_damage *_sna_damage_is_all(struct sna_damage *damage,
 {
 	DBG(("%s(%d, %d)%s?\n", __FUNCTION__, width, height,
 	     damage->dirty ? "*" : ""));
-	assert(damage->mode == DAMAGE_ADD);
+	DBG(("%s: (%d, %d), (%d, %d)\n", __FUNCTION__,
+	     damage->extents.x1, damage->extents.y1,
+	     damage->extents.x2, damage->extents.y2));
 
+	assert(damage->mode == DAMAGE_ADD);
 	assert(damage->extents.x1 == 0 &&
 	       damage->extents.y1 == 0 &&
 	       damage->extents.x2 == width &&
@@ -962,10 +965,6 @@ struct sna_damage *_sna_damage_is_all(struct sna_damage *damage,
 		return damage;
 	}
 
-	DBG(("%s: (%d, %d), (%d, %d)\n", __FUNCTION__,
-	     damage->extents.x1, damage->extents.y1,
-	     damage->extents.x2, damage->extents.y2));
-
 	assert(damage->extents.x1 == 0 &&
 	       damage->extents.y1 == 0 &&
 	       damage->extents.x2 == width &&
diff --git a/src/sna/sna_render.c b/src/sna/sna_render.c
index d774a34..8af80f2 100644
--- a/src/sna/sna_render.c
+++ b/src/sna/sna_render.c
@@ -1878,8 +1878,7 @@ sna_render_composite_redirect_done(struct sna *sna,
 		}
 		if (t->damage) {
 			sna_damage_combine(t->real_damage, t->damage,
-					   t->box.x1 - op->dst.x,
-					   t->box.y1 - op->dst.y);
+					   -t->box.x1, -t->box.y1);
 			__sna_damage_destroy(t->damage);
 		}
 
commit 75dd66cc75a173c0fb02db41f93dad6776ed596d
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Apr 6 15:14:45 2012 +0100

    sna/glyphs: Prefer a temporary upload mask for large glyph masks
    
    If the required temporary mask is larger than the 3D pipeline can
    handle, just render to a CPU buffer rather than redirect every glyph
    composition.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_glyphs.c b/src/sna/sna_glyphs.c
index 235528c..87371aa 100644
--- a/src/sna/sna_glyphs.c
+++ b/src/sna/sna_glyphs.c
@@ -662,6 +662,13 @@ clear_pixmap(struct sna *sna, PixmapPtr pixmap)
 	return sna->render.clear(sna, pixmap, priv->gpu_bo);
 }
 
+static bool
+too_large(struct sna *sna, int width, int height)
+{
+	return (width > sna->render.max_3d_size ||
+		height > sna->render.max_3d_size);
+}
+
 static Bool
 glyphs_via_mask(struct sna *sna,
 		CARD8 op,
@@ -724,7 +731,8 @@ glyphs_via_mask(struct sna *sna,
 
 	component_alpha = NeedsComponent(format->format);
 	if (!NO_SMALL_MASK &&
-	    (uint32_t)width * height * format->depth < 8 * 4096) {
+	    ((uint32_t)width * height * format->depth < 8 * 4096 ||
+	     too_large(sna, width, height))) {
 		pixman_image_t *mask_image;
 		int s;
 
commit 80cc21bc34a1759fd84f38babc050be0ae0cb7c1
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Apr 6 14:27:15 2012 +0100

    sna: Release the upload cache when overwriting with PutImage
    
    Reported-by: Clemens Eisserer <linuxhippy at gmail.com>
    Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=48359
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/gen3_render.c b/src/sna/gen3_render.c
index 0d229cd..529884c 100644
--- a/src/sna/gen3_render.c
+++ b/src/sna/gen3_render.c
@@ -2499,6 +2499,7 @@ gen3_composite_set_target(struct sna *sna,
 			kgem_bo_destroy(&sna->kgem, priv->gpu_bo);
 			priv->gpu_bo = bo;
 		}
+		assert(priv->gpu_bo->proxy == NULL);
 
 		op->dst.bo = priv->gpu_bo;
 		op->damage = &priv->gpu_damage;
diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index f1d4d00..6afd582 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -1032,6 +1032,8 @@ skip_inplace_map:
 		goto done;
 	}
 
+	assert(priv->gpu_bo == NULL || priv->gpu_bo->proxy == NULL);
+
 	if (flags & MOVE_INPLACE_HINT &&
 	    priv->stride && priv->gpu_bo &&
 	    !kgem_bo_map_will_stall(&sna->kgem, priv->gpu_bo) &&
@@ -1772,6 +1774,7 @@ sna_pixmap_move_area_to_gpu(PixmapPtr pixmap, BoxPtr box, unsigned int flags)
 			goto done;
 		}
 	}
+	assert(priv->gpu_bo->proxy == NULL);
 
 	if ((flags & MOVE_READ) == 0)
 		sna_damage_subtract_box(&priv->cpu_damage, box);
@@ -1929,6 +1932,12 @@ sna_drawable_use_bo(DrawablePtr drawable,
 		return NULL;
 	}
 
+	if (priv->gpu_bo && priv->gpu_bo->proxy) {
+		kgem_bo_destroy(to_sna_from_pixmap(pixmap), priv->gpu_bo);
+		priv->gpu_bo = NULL;
+		goto use_cpu_bo;
+	}
+
 	if (DAMAGE_IS_ALL(priv->gpu_damage))
 		goto use_gpu_bo;
 
@@ -2320,7 +2329,8 @@ sna_pixmap_move_to_gpu(PixmapPtr pixmap, unsigned flags)
 		goto done;
 
 	if (priv->gpu_bo->proxy) {
-		assert((flags & MOVE_WRITE) ==0);
+		DBG(("%s: reusing cached upload\n", __FUNCTION__));
+		assert((flags & MOVE_WRITE) == 0);
 		goto done;
 	}
 
@@ -2604,6 +2614,7 @@ sna_put_image_upload_blt(DrawablePtr drawable, GCPtr gc, RegionPtr region,
 		return FALSE;
 
 	assert(priv->gpu_bo);
+	assert(priv->gpu_bo->proxy == NULL);
 
 	if (!priv->pinned && nbox == 1 &&
 	    box->x1 <= 0 && box->y1 <= 0 &&
@@ -2712,6 +2723,12 @@ sna_put_zpixmap_blt(DrawablePtr drawable, GCPtr gc, RegionPtr region,
 		return true;
 	}
 
+	if (priv->gpu_bo && priv->gpu_bo->proxy) {
+		DBG(("%s: discarding cached upload buffer\n", __FUNCTION__));
+		kgem_bo_destroy(sna, priv->gpu_bo);
+		priv->gpu_bo = NULL;
+	}
+
 	if (priv->cpu_bo) {
 		/* If the GPU is currently accessing the CPU pixmap, then
 		 * we will need to wait for that to finish before we can
@@ -3307,6 +3324,7 @@ sna_self_copy_boxes(DrawablePtr src, DrawablePtr dst, GCPtr gc,
 		goto fallback;
 
 	if (priv->gpu_bo) {
+		assert(priv->gpu_bo->proxy == NULL);
 		if (!sna_pixmap_move_to_gpu(pixmap, MOVE_WRITE | MOVE_READ)) {
 			DBG(("%s: fallback - not a pure copy and failed to move dst to GPU\n",
 			     __FUNCTION__));
@@ -3501,6 +3519,7 @@ sna_copy_boxes(DrawablePtr src, DrawablePtr dst, GCPtr gc,
 	}
 
 	if (dst_priv->gpu_bo && dst_priv->gpu_bo->proxy) {
+		DBG(("%s: discarding cached upload\n", __FUNCTION__));
 		kgem_bo_destroy(&sna->kgem, dst_priv->gpu_bo);
 		dst_priv->gpu_bo = NULL;
 	}
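
Each of the hunks above applies the same rule: a cached upload is a read-only proxy, so it must be discarded before the pixmap is written. A condensed sketch of that rule, with a simplified bo type (illustrative, not the driver's struct kgem_bo):

    #include <stdlib.h>

    struct bo {
        struct bo *proxy;  /* non-NULL for read-only upload snapshots */
    };

    /* Drop any cached upload so the next move-to-gpu allocates a real,
     * writable GPU buffer instead of scribbling on shared upload pages. */
    static void discard_cached_upload(struct bo **gpu_bo)
    {
        if (*gpu_bo && (*gpu_bo)->proxy) {
            free(*gpu_bo);  /* stands in for kgem_bo_destroy() */
            *gpu_bo = NULL;
        }
    }
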
commit 91aba5166f050bc74b59d7ce3641a1d06ab9aa31
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Apr 6 09:24:36 2012 +0100

    sna: Use a sentinel value to prevent accessing beyond the end of the y_buckets
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_trapezoids.c b/src/sna/sna_trapezoids.c
index 9000f82..dbf581e 100644
--- a/src/sna/sna_trapezoids.c
+++ b/src/sna/sna_trapezoids.c
@@ -650,12 +650,13 @@ polygon_init(struct polygon *polygon,
 			goto bail_no_mem;
 	}
 
-	if (num_buckets > ARRAY_SIZE(polygon->y_buckets_embedded)) {
-		polygon->y_buckets = malloc(num_buckets*sizeof(struct edge *));
+	if (num_buckets >= ARRAY_SIZE(polygon->y_buckets_embedded)) {
+		polygon->y_buckets = malloc((1+num_buckets)*sizeof(struct edge *));
 		if (unlikely(NULL == polygon->y_buckets))
 			goto bail_no_mem;
 	}
 	memset(polygon->y_buckets, 0, num_buckets * sizeof(struct edge *));
+	polygon->y_buckets[num_buckets] = (void *)-1;
 
 	polygon->ymin = ymin;
 	polygon->ymax = ymax;
@@ -1363,7 +1364,7 @@ tor_render(struct sna *sna,
 			if (active->head.next == &active->tail) {
 				active->min_height = INT_MAX;
 				active->is_vertical = 1;
-				for (; j < h && !polygon->y_buckets[j]; j++)
+				for (; !polygon->y_buckets[j]; j++)
 					;
 				__DBG(("%s: no new edges and no exisiting edges, skipping, %d -> %d\n",
 				       __FUNCTION__, i, j));
@@ -1386,8 +1387,7 @@ tor_render(struct sna *sna,
 			assert(active->is_vertical);
 			nonzero_row(active, coverages);
 
-			while (j < h &&
-			       polygon->y_buckets[j] == NULL &&
+			while (polygon->y_buckets[j] == NULL &&
 			       active->min_height >= 2*FAST_SAMPLES_Y)
 			{
 				active->min_height -= FAST_SAMPLES_Y;
@@ -1713,6 +1713,9 @@ tor_inplace(struct tor *converter, PixmapPtr scratch, int mono, uint8_t *buf)
 
 	__DBG(("%s: mono=%d, buf=%d\n", __FUNCTION__, mono, buf));
 	assert(!mono);
+	assert(converter->ymin == 0);
+	assert(converter->xmin == 0);
+	assert(scratch->drawable.depth == 8);
 
 	/* Render each pixel row. */
 	for (i = 0; i < h; i = j) {
@@ -1727,7 +1730,7 @@ tor_inplace(struct tor *converter, PixmapPtr scratch, int mono, uint8_t *buf)
 			if (active->head.next == &active->tail) {
 				active->min_height = INT_MAX;
 				active->is_vertical = 1;
-				for (; j < h && !polygon->y_buckets[j]; j++)
+				for (; !polygon->y_buckets[j]; j++)
 					;
 				__DBG(("%s: no new edges and no exisiting edges, skipping, %d -> %d\n",
 				       __FUNCTION__, i, j));
@@ -1754,8 +1757,7 @@ tor_inplace(struct tor *converter, PixmapPtr scratch, int mono, uint8_t *buf)
 			if (row != ptr)
 				memcpy(row, ptr, width);
 
-			while (j < h &&
-			       polygon->y_buckets[j] == NULL &&
+			while (polygon->y_buckets[j] == NULL &&
 			       active->min_height >= 2*FAST_SAMPLES_Y)
 			{
 				active->min_height -= FAST_SAMPLES_Y;
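
The sentinel trades one extra array slot for the removal of a bounds check in the hot scan loops: (void *)-1 is non-NULL, so "for (; !y_buckets[j]; j++)" is guaranteed to stop at the end of the array without comparing j against h on every iteration. A stand-alone sketch of the idiom, simplified from polygon_init above:

    #include <stdlib.h>
    #include <string.h>

    struct edge;

    /* Allocate one slot beyond num_buckets and plant a non-NULL sentinel
     * there so that scans for the next occupied bucket self-terminate. */
    static struct edge **alloc_y_buckets(size_t num_buckets)
    {
        struct edge **b = malloc((num_buckets + 1) * sizeof(*b));
        if (b != NULL) {
            memset(b, 0, num_buckets * sizeof(*b));
            b[num_buckets] = (struct edge *)-1;  /* the sentinel */
        }
        return b;
    }
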
commit 975b4afe86cc9787b97c00ae88ea7889155be7e8
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Apr 6 09:12:08 2012 +0100

    sna: Remove redundant check from tor_inplace()
    
    We only execute full-steps for vertical edges so we do not need the
    second check.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_trapezoids.c b/src/sna/sna_trapezoids.c
index 2b4b2db..9000f82 100644
--- a/src/sna/sna_trapezoids.c
+++ b/src/sna/sna_trapezoids.c
@@ -1383,22 +1383,21 @@ tor_render(struct sna *sna,
 		       active->min_height,
 		       active->is_vertical));
 		if (do_full_step) {
+			assert(active->is_vertical);
 			nonzero_row(active, coverages);
 
-			if (active->is_vertical) {
-				while (j < h &&
-				       polygon->y_buckets[j] == NULL &&
-				       active->min_height >= 2*FAST_SAMPLES_Y)
-				{
-					active->min_height -= FAST_SAMPLES_Y;
-					j++;
-				}
-				if (j != i + 1)
-					step_edges(active, j - (i + 1));
-
-				__DBG(("%s: vertical edges, full step (%d, %d)\n",
-				       __FUNCTION__,  i, j));
+			while (j < h &&
+			       polygon->y_buckets[j] == NULL &&
+			       active->min_height >= 2*FAST_SAMPLES_Y)
+			{
+				active->min_height -= FAST_SAMPLES_Y;
+				j++;
 			}
+			if (j != i + 1)
+				step_edges(active, j - (i + 1));
+
+			__DBG(("%s: vertical edges, full step (%d, %d)\n",
+			       __FUNCTION__,  i, j));
 		} else {
 			grid_scaled_y_t suby;
 
@@ -1748,27 +1747,27 @@ tor_inplace(struct tor *converter, PixmapPtr scratch, int mono, uint8_t *buf)
 		       active->min_height,
 		       active->is_vertical));
 		if (do_full_step) {
+			assert(active->is_vertical);
+
 			memset(ptr, 0, width);
 			inplace_row(active, ptr, width);
 			if (row != ptr)
 				memcpy(row, ptr, width);
 
-			if (active->is_vertical) {
-				while (j < h &&
-				       polygon->y_buckets[j] == NULL &&
-				       active->min_height >= 2*FAST_SAMPLES_Y)
-				{
-					active->min_height -= FAST_SAMPLES_Y;
-					row += stride;
-					memcpy(row, ptr, width);
-					j++;
-				}
-				if (j != i + 1)
-					step_edges(active, j - (i + 1));
-
-				__DBG(("%s: vertical edges, full step (%d, %d)\n",
-				       __FUNCTION__,  i, j));
+			while (j < h &&
+			       polygon->y_buckets[j] == NULL &&
+			       active->min_height >= 2*FAST_SAMPLES_Y)
+			{
+				active->min_height -= FAST_SAMPLES_Y;
+				row += stride;
+				memcpy(row, ptr, width);
+				j++;
 			}
+			if (j != i + 1)
+				step_edges(active, j - (i + 1));
+
+			__DBG(("%s: vertical edges, full step (%d, %d)\n",
+			       __FUNCTION__,  i, j));
 		} else {
 			grid_scaled_y_t suby;
 			int min = width, max = 0;
commit 4b2fc04f159e6675f27a07d012cca683bf1b3fb4
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Apr 4 11:13:27 2012 +0100

    sna: Only engage the GPU detiler for multiple rows
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_io.c b/src/sna/sna_io.c
index 4f5a634..02a5c75 100644
--- a/src/sna/sna_io.c
+++ b/src/sna/sna_io.c
@@ -178,7 +178,7 @@ fallback:
 	}
 	if (kgem_bo_is_mappable(kgem, src_bo)) {
 		/* Is it worth detiling? */
-		if ((extents.y2 - extents.y1) * src_bo->pitch < 4096)
+		if ((extents.y2 - extents.y1 - 1) * src_bo->pitch < 4096)
 			goto fallback;
 	}
 
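With a single row the amended expression evaluates to zero and the mappable fallback always wins; the GPU detiler only pays for its setup once several rows must be untiled. The predicate, restated stand-alone with the threshold from the diff:

    #include <stdbool.h>

    /* Worth engaging the GPU to detile?  Only when more than one row is
     * involved and the volume of data beats the fixed 4KiB threshold. */
    static bool worth_gpu_detile(int y1, int y2, unsigned pitch)
    {
        int rows = y2 - y1;
        return rows > 1 && (unsigned)(rows - 1) * pitch >= 4096;
    }
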
commit 3f7a2c205e745a119bfed4dcd2c55a9559a06154
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Apr 3 19:47:15 2012 +0100

    sna/gen3: Don't force use of the render pipeline just for vmap
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/gen3_render.c b/src/sna/gen3_render.c
index f77521f..0d229cd 100644
--- a/src/sna/gen3_render.c
+++ b/src/sna/gen3_render.c
@@ -2404,10 +2404,6 @@ source_use_blt(struct sna *sna, PicturePtr picture)
 	if (too_large(picture->pDrawable->width, picture->pDrawable->height))
 		return true;
 
-	/* If we can sample directly from user-space, do so */
-	if (sna->kgem.has_vmap)
-		return false;
-
 	return is_cpu(picture->pDrawable) || is_dirty(picture->pDrawable);
 }
 
commit d9b60258ba67a627e7ae91938045d0ab066369ec
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Apr 3 19:04:29 2012 +0100

    sna/gen3: Fix pre-multiplication of mask value
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/gen3_render.c b/src/sna/gen3_render.c
index ca3d141..f77521f 100644
--- a/src/sna/gen3_render.c
+++ b/src/sna/gen3_render.c
@@ -2364,7 +2364,6 @@ gen3_composite_picture(struct sna *sna,
 	x += dx + picture->pDrawable->x;
 	y += dy + picture->pDrawable->y;
 
-	channel->is_affine = sna_transform_is_affine(picture->transform);
 	if (sna_transform_is_integer_translation(picture->transform, &dx, &dy)) {
 		DBG(("%s: integer translation (%d, %d), removing\n",
 		     __FUNCTION__, dx, dy));
@@ -2372,8 +2371,10 @@ gen3_composite_picture(struct sna *sna,
 		y += dy;
 		channel->transform = NULL;
 		channel->filter = PictFilterNearest;
-	} else
+	} else {
 		channel->transform = picture->transform;
+		channel->is_affine = sna_transform_is_affine(picture->transform);
+	}
 
 	if (!gen3_composite_channel_set_format(channel, picture->format) &&
 	    !gen3_composite_channel_set_xformat(picture, channel, x, y, w, h))
@@ -2912,13 +2913,13 @@ gen3_render_composite(struct sna *sna,
 					v = multa(tmp->src.u.gen3.mode,
 						  tmp->mask.u.gen3.mode,
 						  24);
-					v != multa(tmp->src.u.gen3.mode,
+					v |= multa(tmp->src.u.gen3.mode,
 						   tmp->mask.u.gen3.mode,
 						   16);
-					v != multa(tmp->src.u.gen3.mode,
+					v |= multa(tmp->src.u.gen3.mode,
 						   tmp->mask.u.gen3.mode,
 						   8);
-					v != multa(tmp->src.u.gen3.mode,
+					v |= multa(tmp->src.u.gen3.mode,
 						   tmp->mask.u.gen3.mode,
 						   0);
 
@@ -2986,10 +2987,11 @@ gen3_render_composite(struct sna *sna,
 		tmp->floats_per_vertex += tmp->src.is_affine ? 2 : 4;
 	if (!is_constant_ps(tmp->mask.u.gen3.type))
 		tmp->floats_per_vertex += tmp->mask.is_affine ? 2 : 4;
-	DBG(("%s: floats_per_vertex = 2 + %d + %d = %d\n", __FUNCTION__,
+	DBG(("%s: floats_per_vertex = 2 + %d + %d = %d [specialised emitter? %d]\n", __FUNCTION__,
 	     !is_constant_ps(tmp->src.u.gen3.type) ? tmp->src.is_affine ? 2 : 4 : 0,
 	     !is_constant_ps(tmp->mask.u.gen3.type) ? tmp->mask.is_affine ? 2 : 4 : 0,
-	     tmp->floats_per_vertex));
+	     tmp->floats_per_vertex,
+	     tmp->prim_emit != gen3_emit_composite_primitive));
 	tmp->floats_per_rect = 3 * tmp->floats_per_vertex;
 
 	tmp->blt   = gen3_render_composite_blt;
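
The underlying bug was that "v != multa(...)" parses as a comparison whose result is discarded, not as "v |= ...", so only the alpha byte of the combined constant ever reached v. The corrected multa() builds on mul_8_8(), the usual rounding byte multiply: adding the high byte of t back in compensates for shifting by 8 (dividing by 256) when the true divisor is 255, so full alpha stays full. A quick stand-alone check:

    #include <assert.h>
    #include <stdint.h>

    /* a*b/255 with rounding, without a division. */
    static inline uint8_t mul_8_8(uint8_t a, uint8_t b)
    {
        uint16_t t = a * (uint16_t)b + 0x7f;
        return ((t >> 8) + t) >> 8;
    }

    int main(void)
    {
        assert(mul_8_8(0xff, 0xff) == 0xff);  /* opaque stays opaque */
        assert(mul_8_8(0xff, 0x00) == 0x00);
        assert(mul_8_8(0x80, 0x80) == 0x40);  /* ~(1/2 * 1/2) */
        return 0;
    }
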
commit 43ddc0d8e528646bfc2aadefcfe7ede46de101ca
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Apr 3 12:34:24 2012 +0100

    sna/gen3: Convert the clear-color from picture->format to a8r8g8b8
    
    The shaders treat colours as an argb value, however the clear color is
    stored in the pixmap's native format (a8, r5g6b5, x8r8g8b8 etc). So
    before using the value of the clear color as a solid we need to convert
    it into the a8r8g8b8 format.
    
    Reported-by: Clemens Eisserer <linuxhippy at gmail.com>
    Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=48204
    Reported-by: Paul Neumann <paul104x at yahoo.de>
    Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=47308
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/gen3_render.c b/src/sna/gen3_render.c
index 0478709..ca3d141 100644
--- a/src/sna/gen3_render.c
+++ b/src/sna/gen3_render.c
@@ -2080,7 +2080,7 @@ gen3_init_solid(struct sna_composite_channel *channel, uint32_t color)
 		channel->u.gen3.type = SHADER_WHITE;
 
 	channel->bo = NULL;
-	channel->is_opaque = (color & 0xff000000) == 0xff000000;
+	channel->is_opaque = (color >> 24) == 0xff;
 	channel->is_affine = 1;
 	channel->alpha_fixup = 0;
 	channel->rb_reversed = 0;
@@ -2303,6 +2303,8 @@ gen3_composite_picture(struct sna *sna,
 
 		switch (source->type) {
 		case SourcePictTypeSolidFill:
+			DBG(("%s: solid fill [%08x], format %x\n",
+			     __FUNCTION__, source->solidFill.color, picture->format));
 			ret = gen3_init_solid(channel, source->solidFill.color);
 			break;
 
@@ -2334,11 +2336,15 @@ gen3_composite_picture(struct sna *sna,
 						x, y, w, h, dst_x, dst_y);
 	}
 
-	if (sna_picture_is_solid(picture, &color))
+	if (sna_picture_is_solid(picture, &color)) {
+		DBG(("%s: solid drawable [%08x]\n", __FUNCTION__, color));
 		return gen3_init_solid(channel, color);
+	}
 
-	if (sna_picture_is_clear(picture, x, y, w, h, &color))
-		return gen3_init_solid(channel, color);
+	if (sna_picture_is_clear(picture, x, y, w, h, &color)) {
+		DBG(("%s: clear drawable [%08x]\n", __FUNCTION__, color));
+		return gen3_init_solid(channel, color_convert(color, picture->format, PICT_a8r8g8b8));
+	}
 
 	if (!gen3_check_repeat(picture))
 		return sna_render_picture_fixup(sna, picture, channel,
@@ -2517,11 +2523,16 @@ gen3_composite_set_target(struct sna *sna,
 	return TRUE;
 }
 
+static inline uint8_t
+mul_8_8(uint8_t a, uint8_t b)
+{
+    uint16_t t = a * (uint16_t)b + 0x7f;
+    return ((t >> 8) + t) >> 8;
+}
+
 static inline uint8_t multa(uint32_t s, uint32_t m, int shift)
 {
-	s = (s >> shift) & 0xff;
-	m >>= 24;
-	return (s * m) >> 8;
+	return mul_8_8((s >> shift) & 0xff, m >> 24) << shift;
 }
 
 static inline bool is_constant_ps(uint32_t type)
@@ -2894,33 +2905,31 @@ gen3_render_composite(struct sna *sna,
 			} else {
 				if (tmp->mask.is_opaque) {
 					tmp->mask.u.gen3.type = SHADER_NONE;
-					tmp->has_component_alpha = FALSE;
 				} else if (is_constant_ps(tmp->src.u.gen3.type) &&
 					   is_constant_ps(tmp->mask.u.gen3.type)) {
-					uint32_t a,r,g,b;
+					uint32_t v;
 
-					a = multa(tmp->src.u.gen3.mode,
+					v = multa(tmp->src.u.gen3.mode,
 						  tmp->mask.u.gen3.mode,
 						  24);
-					r = multa(tmp->src.u.gen3.mode,
-						  tmp->mask.u.gen3.mode,
-						  16);
-					g = multa(tmp->src.u.gen3.mode,
-						  tmp->mask.u.gen3.mode,
-						  8);
-					b = multa(tmp->src.u.gen3.mode,
-						  tmp->mask.u.gen3.mode,
-						  0);
+					v != multa(tmp->src.u.gen3.mode,
+						   tmp->mask.u.gen3.mode,
+						   16);
+					v != multa(tmp->src.u.gen3.mode,
+						   tmp->mask.u.gen3.mode,
+						   8);
+					v != multa(tmp->src.u.gen3.mode,
+						   tmp->mask.u.gen3.mode,
+						   0);
 
 					DBG(("%s: combining constant source/mask: %x x %x -> %x\n",
 					     __FUNCTION__,
 					     tmp->src.u.gen3.mode,
 					     tmp->mask.u.gen3.mode,
-					     a << 24 | r << 16 | g << 8 | b));
+					     v));
 
 					tmp->src.u.gen3.type = SHADER_CONSTANT;
-					tmp->src.u.gen3.mode =
-						a << 24 | r << 16 | g << 8 | b;
+					tmp->src.u.gen3.mode = v;
 
 					tmp->mask.u.gen3.type = SHADER_NONE;
 				}
diff --git a/src/sna/sna_blt.c b/src/sna/sna_blt.c
index e7a6182..a81a145 100644
--- a/src/sna/sna_blt.c
+++ b/src/sna/sna_blt.c
@@ -476,13 +476,13 @@ static void sna_blt_copy_one(struct sna *sna,
 	kgem->nbatch += 8;
 }
 
-static Bool
-get_rgba_from_pixel(uint32_t pixel,
-		    uint16_t *red,
-		    uint16_t *green,
-		    uint16_t *blue,
-		    uint16_t *alpha,
-		    uint32_t format)
+Bool
+sna_get_rgba_from_pixel(uint32_t pixel,
+			uint16_t *red,
+			uint16_t *green,
+			uint16_t *blue,
+			uint16_t *alpha,
+			uint32_t format)
 {
 	int rbits, bbits, gbits, abits;
 	int rshift, bshift, gshift, ashift;
@@ -607,31 +607,6 @@ _sna_get_pixel_from_rgba(uint32_t * pixel,
 	return TRUE;
 }
 
-static uint32_t
-color_convert(uint32_t pixel,
-	      uint32_t src_format,
-	      uint32_t dst_format)
-{
-	DBG(("%s: src=%08x [%08x]\n", __FUNCTION__, pixel, src_format));
-
-	if (src_format != dst_format) {
-		uint16_t red, green, blue, alpha;
-
-		if (!get_rgba_from_pixel(pixel,
-					 &red, &green, &blue, &alpha,
-					 src_format))
-			return 0;
-
-		if (!sna_get_pixel_from_rgba(&pixel,
-					     red, green, blue, alpha,
-					     dst_format))
-			return 0;
-	}
-
-	DBG(("%s: dst=%08x [%08x]\n", __FUNCTION__, pixel, dst_format));
-	return pixel;
-}
-
 uint32_t
 sna_rgba_for_color(uint32_t color, int depth)
 {
diff --git a/src/sna/sna_render.h b/src/sna/sna_render.h
index 73ef568..9c49281 100644
--- a/src/sna/sna_render.h
+++ b/src/sna/sna_render.h
@@ -485,6 +485,12 @@ sna_render_get_gradient(struct sna *sna,
 
 uint32_t sna_rgba_for_color(uint32_t color, int depth);
 uint32_t sna_rgba_to_color(uint32_t rgba, uint32_t format);
+Bool sna_get_rgba_from_pixel(uint32_t pixel,
+			     uint16_t *red,
+			     uint16_t *green,
+			     uint16_t *blue,
+			     uint16_t *alpha,
+			     uint32_t format);
 Bool sna_picture_is_solid(PicturePtr picture, uint32_t *color);
 
 void no_render_init(struct sna *sna);
diff --git a/src/sna/sna_render_inline.h b/src/sna/sna_render_inline.h
index 956e2aa..03e1969 100644
--- a/src/sna/sna_render_inline.h
+++ b/src/sna/sna_render_inline.h
@@ -190,4 +190,29 @@ sna_render_reduce_damage(struct sna_composite_op *op,
 	}
 }
 
+inline static uint32_t
+color_convert(uint32_t pixel,
+	      uint32_t src_format,
+	      uint32_t dst_format)
+{
+	DBG(("%s: src=%08x [%08x]\n", __FUNCTION__, pixel, src_format));
+
+	if (src_format != dst_format) {
+		uint16_t red, green, blue, alpha;
+
+		if (!sna_get_rgba_from_pixel(pixel,
+					     &red, &green, &blue, &alpha,
+					     src_format))
+			return 0;
+
+		if (!sna_get_pixel_from_rgba(&pixel,
+					     red, green, blue, alpha,
+					     dst_format))
+			return 0;
+	}
+
+	DBG(("%s: dst=%08x [%08x]\n", __FUNCTION__, pixel, dst_format));
+	return pixel;
+}
+
 #endif /* SNA_RENDER_INLINE_H */
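
The conversion itself is a plain unpack/repack through 16-bit channels. Two reduced examples of what it must get right, sketched stand-alone (the driver's generic path goes through sna_get_rgba_from_pixel / sna_get_pixel_from_rgba instead):

    #include <stdint.h>

    /* a8 -> a8r8g8b8: the eight bits are alpha and belong in the top
     * byte; reinterpreting the raw value would put them in blue. */
    static uint32_t a8_to_a8r8g8b8(uint8_t pixel)
    {
        return (uint32_t)pixel << 24;
    }

    /* r5g6b5 -> a8r8g8b8: widen each field by replicating its top bits
     * so that full intensity still maps to 0xff. */
    static uint32_t r5g6b5_to_a8r8g8b8(uint16_t p)
    {
        uint32_t r = (p >> 11) & 0x1f, g = (p >> 5) & 0x3f, b = p & 0x1f;
        r = (r << 3) | (r >> 2);
        g = (g << 2) | (g >> 4);
        b = (b << 3) | (b >> 2);
        return 0xff000000 | (r << 16) | (g << 8) | b;
    }
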
commit 50dfce83763efd03d39496050bda46c2d12ada7f
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Apr 3 09:24:06 2012 +0100

    sna: Apply CoordMode when computing point extents
    
    Reported-by: Patrick Truebe <eko-priv at gmx.net>
    Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=48220
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 6eee977..f1d4d00 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -5556,9 +5556,19 @@ sna_poly_point_extents(DrawablePtr drawable, GCPtr gc,
 
 	box.x2 = box.x1 = pt->x;
 	box.y2 = box.y1 = pt->y;
-	while (--n) {
-		pt++;
-		box_add_pt(&box, pt->x, pt->y);
+	if (mode == CoordModePrevious) {
+		DDXPointRec last = *pt++;
+		while (--n) {
+			last.x += pt->x;
+			last.y += pt->y;
+			pt++;
+			box_add_pt(&box, last.x, last.y);
+		}
+	} else {
+		while (--n) {
+			++pt;
+			box_add_pt(&box, pt->x, pt->y);
+		}
 	}
 	box.x2++;
 	box.y2++;
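
With CoordModePrevious each point after the first is a delta from its predecessor, so the running absolute position has to be accumulated before it can grow the bounding box; the old loop treated the deltas as absolute coordinates. A stand-alone reduction of the corrected logic, with minimal types:

    struct pt { int x, y; };
    struct box { int x1, y1, x2, y2; };

    static struct box point_extents_previous(const struct pt *pt, int n)
    {
        struct box b = { pt->x, pt->y, pt->x, pt->y };
        struct pt last = *pt++;

        while (--n) {
            last.x += pt->x;  /* accumulate the relative step */
            last.y += pt->y;
            pt++;
            if (last.x < b.x1) b.x1 = last.x;
            else if (last.x > b.x2) b.x2 = last.x;
            if (last.y < b.y1) b.y1 = last.y;
            else if (last.y > b.y2) b.y2 = last.y;
        }
        b.x2++;  /* extents are exclusive on the high edge */
        b.y2++;
        return b;
    }
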
commit d2c7f0888e7b4599404d7f471d43d74d0dde68bc
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Apr 3 09:23:49 2012 +0100

    sna: Debugging fill spans and their clipping
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 32eae52..6eee977 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -4291,14 +4291,18 @@ sna_fill_spans__fill_clip_extents(DrawablePtr drawable,
 	     extents->x2, extents->y2));
 
 	while (n--) {
+		DBG(("%s: [%d] pt=(%d, %d), width=%d\n",
+		     __FUNCTION__, n, pt->x, pt->y, *width));
 		*(DDXPointRec *)b = *pt++;
 		b->x2 = b->x1 + (int)*width++;
 		b->y2 = b->y1 + 1;
 		if (box_intersect(b, extents)) {
-			b->x1 += data->dx;
-			b->x2 += data->dx;
-			b->y1 += data->dy;
-			b->y2 += data->dy;
+			DBG(("%s: [%d] clipped=(%d, %d), (%d, %d)\n",
+			     __FUNCTION__, n, b->x1, b->y1, b->x2, b->y2));
+			if (data->dx|data->dy) {
+				b->x1 += data->dx; b->x2 += data->dx;
+				b->y1 += data->dy; b->y2 += data->dy;
+			}
 			if (++b == last_box) {
 				op->boxes(data->sna, op, box, last_box - box);
 				b = box;
@@ -8578,6 +8582,9 @@ sna_poly_fill_polygon(DrawablePtr draw, GCPtr gc,
 	      (gc->fillStyle == FillTiled && gc->tileIsPixel)),
 	     gc->fillStyle, gc->tileIsPixel,
 	     gc->alu));
+	DBG(("%s: draw=%d, offset=(%d, %d), size=%dx%d\n",
+	     __FUNCTION__, draw->serialNumber,
+	     draw->x, draw->y, draw->width, draw->height));
 
 	data.flags = sna_poly_point_extents(draw, gc, mode, n, pt,
 					    &data.region.extents);
commit 4ef588957e99477b720f4a39b5ec5019d82daadb
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon Apr 2 16:16:24 2012 +0100

    sna: Use the solid spans fast paths for dashed zero-width lines as well
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 9829ada..32eae52 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -4499,10 +4499,10 @@ no_damage_clipped:
 				b->y2 = b->y1 + 1;
 
 				if (box_intersect(b, &clip.extents)) {
-					b->x1 += dx;
-					b->x2 += dx;
-					b->y1 += dy;
-					b->y2 += dy;
+					if (dx|dy) {
+						b->x1 += dx; b->x2 += dx;
+						b->y1 += dy; b->y2 += dy;
+					}
 					if (++b == last_box) {
 						fill.boxes(sna, &fill, box, last_box - box);
 						b = box;
@@ -6598,7 +6598,6 @@ spans_fallback:
 		sna_gc(gc)->priv = &data;
 
 		if (gc->lineWidth == 0 &&
-		    gc->lineStyle == LineSolid &&
 		    gc_is_solid(gc, &color)) {
 			struct sna_fill_op fill;
 
@@ -6628,12 +6627,19 @@ spans_fallback:
 			assert(gc->miTranslate);
 
 			gc->ops = &sna_gc_ops__tmp;
-			miZeroLine(drawable, gc, mode, n, pt);
+			if (gc->lineStyle == LineSolid) {
+				DBG(("%s: miZeroLine (solid fill)\n", __FUNCTION__));
+				miZeroLine(drawable, gc, mode, n, pt);
+			} else {
+				DBG(("%s: miZeroDashLine (solid fill)\n", __FUNCTION__));
+				miZeroDashLine(drawable, gc, mode, n, pt);
+			}
 			fill.done(data.sna, &fill);
 		} else {
-			/* Note that the WideDash functions alternate between filling
-			 * using fgPixel and bgPixel so we need to reset state between
-			 * FillSpans.
+			/* Note that the WideDash functions alternate
+			 * between filling using fgPixel and bgPixel
+			 * so we need to reset state between FillSpans and
+			 * cannot use the fill fast paths.
 			 */
 			sna_gc_ops__tmp.FillSpans = sna_fill_spans__gpu;
 			gc->ops = &sna_gc_ops__tmp;
commit ef07e90c097fc2f2e7887b7fa12ec4d0c323d22e
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon Apr 2 13:51:37 2012 +0100

    sna/gen4: Remove the accidental debugging hack from the last commit
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/gen4_render.c b/src/sna/gen4_render.c
index 1868e2f..b8a2459 100644
--- a/src/sna/gen4_render.c
+++ b/src/sna/gen4_render.c
@@ -1932,7 +1932,7 @@ gen4_composite_linear_init(struct sna *sna,
 
 	channel->embedded_transform.matrix[0][0] = pixman_double_to_fixed(dx);
 	channel->embedded_transform.matrix[0][1] = pixman_double_to_fixed(dy);
-	channel->embedded_transform.matrix[0][2] = -pixman_double_to_fixed(dx*(x0+dst_x-x) + dy*(y0+dst_y-y) - .5/sf);
+	channel->embedded_transform.matrix[0][2] = -pixman_double_to_fixed(dx*(x0+dst_x-x) + dy*(y0+dst_y-y));
 
 	channel->embedded_transform.matrix[1][0] = 0;
 	channel->embedded_transform.matrix[1][1] = 0;
commit 93e7be96da2f4ca31f7001bf1d65b86d81a776c4
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon Apr 2 13:40:22 2012 +0100

    sna/gen3+: Fix sampling of borders around gradients
    
    Incurs a slight loss of precision for the internal gradient, but that is
    much preferable to the artefacts around the borders with RepeatNone.
    
    Reported-by: Clemens Eisserer <linuxhippy at gmail.com>
    Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=45016
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/gen3_render.c b/src/sna/gen3_render.c
index e798096..0478709 100644
--- a/src/sna/gen3_render.c
+++ b/src/sna/gen3_render.c
@@ -2137,7 +2137,7 @@ static Bool gen3_gradient_setup(struct sna *sna,
 
 	channel->pict_format = PICT_a8r8g8b8;
 	channel->card_format = MAPSURF_32BIT | MT_32BIT_ARGB8888;
-	channel->filter = PictFilterBilinear;
+	channel->filter = PictFilterNearest;
 	channel->is_affine = sna_transform_is_affine(picture->transform);
 	if (sna_transform_is_integer_translation(picture->transform, &dx, &dy)) {
 		DBG(("%s: integer translation (%d, %d), removing\n",
diff --git a/src/sna/gen4_render.c b/src/sna/gen4_render.c
index 2e78a92..1868e2f 100644
--- a/src/sna/gen4_render.c
+++ b/src/sna/gen4_render.c
@@ -1871,7 +1871,7 @@ gen4_composite_linear_init(struct sna *sna,
 	if (!channel->bo)
 		return 0;
 
-	channel->filter = PictFilterBilinear;
+	channel->filter = PictFilterNearest;
 	channel->repeat = picture->repeat ? picture->repeatType : RepeatNone;
 	channel->width  = channel->bo->pitch / 4;
 	channel->height = 1;
@@ -1932,7 +1932,7 @@ gen4_composite_linear_init(struct sna *sna,
 
 	channel->embedded_transform.matrix[0][0] = pixman_double_to_fixed(dx);
 	channel->embedded_transform.matrix[0][1] = pixman_double_to_fixed(dy);
-	channel->embedded_transform.matrix[0][2] = -pixman_double_to_fixed(dx*(x0+dst_x-x) + dy*(y0+dst_y-y));
+	channel->embedded_transform.matrix[0][2] = -pixman_double_to_fixed(dx*(x0+dst_x-x) + dy*(y0+dst_y-y) - .5/sf);
 
 	channel->embedded_transform.matrix[1][0] = 0;
 	channel->embedded_transform.matrix[1][1] = 0;
diff --git a/src/sna/gen5_render.c b/src/sna/gen5_render.c
index c27accd..1fb7f65 100644
--- a/src/sna/gen5_render.c
+++ b/src/sna/gen5_render.c
@@ -1904,7 +1904,7 @@ gen5_composite_linear_init(struct sna *sna,
 	if (!channel->bo)
 		return 0;
 
-	channel->filter = PictFilterBilinear;
+	channel->filter = PictFilterNearest;
 	channel->repeat = picture->repeat ? picture->repeatType : RepeatNone;
 	channel->width  = channel->bo->pitch / 4;
 	channel->height = 1;
diff --git a/src/sna/gen6_render.c b/src/sna/gen6_render.c
index 3be9195..5bbe5e3 100644
--- a/src/sna/gen6_render.c
+++ b/src/sna/gen6_render.c
@@ -2080,7 +2080,7 @@ gen6_composite_linear_init(struct sna *sna,
 	if (!channel->bo)
 		return 0;
 
-	channel->filter = PictFilterBilinear;
+	channel->filter = PictFilterNearest;
 	channel->repeat = picture->repeat ? picture->repeatType : RepeatNone;
 	channel->width  = channel->bo->pitch / 4;
 	channel->height = 1;
diff --git a/src/sna/gen7_render.c b/src/sna/gen7_render.c
index 6917c21..7dff02f 100644
--- a/src/sna/gen7_render.c
+++ b/src/sna/gen7_render.c
@@ -2209,7 +2209,7 @@ gen7_composite_linear_init(struct sna *sna,
 	if (!channel->bo)
 		return 0;
 
-	channel->filter = PictFilterBilinear;
+	channel->filter = PictFilterNearest;
 	channel->repeat = picture->repeat ? picture->repeatType : RepeatNone;
 	channel->width  = channel->bo->pitch / 4;
 	channel->height = 1;
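
The artefact arises because, with RepeatNone, a bilinear fetch within half a texel of either end of the one-dimensional gradient ramp blends against the transparent border colour. A nearest fetch always returns a real ramp entry, at the cost of quantising the gradient to its precomputed steps; a sketch of the distinction (illustrative, not the shader's actual sampling):

    #include <stdint.h>

    /* Nearest sampling of a width-w ramp: every t in [0, 1) maps to a
     * stored entry, so no sample can bleed into the RepeatNone border. */
    static uint32_t sample_ramp_nearest(const uint32_t *ramp, int w, double t)
    {
        int i = (int)(t * w);
        if (i < 0) i = 0;
        if (i > w - 1) i = w - 1;
        return ramp[i];
    }
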
commit c9bbb69e468469e8f59c743a916df3b3884182af
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sun Apr 1 19:43:15 2012 +0100

    sna: Apply composite offset to damage for spans fast paths
    
    Reported-by: Jiri Slaby <jirislaby at gmail.com>
    References: https://bugs.freedesktop.org/show_bug.cgi?id=47597
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 20d9166..9829ada 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -6665,8 +6665,12 @@ spans_fallback:
 
 		gc->ops = (GCOps *)&sna_gc_ops;
 		assert_pixmap_contains_box(data.pixmap, &data.region.extents);
-		if (data.damage)
+		if (data.damage) {
+			if (data.dx | data.dy)
+				pixman_region_translate(&data.region, data.dx, data.dy);
+			assert_pixmap_contains_box(data.pixmap, &data.region.extents);
 			sna_damage_add(data.damage, &data.region);
+		}
 		RegionUninit(&data.region);
 		return;
 	}
@@ -7579,8 +7583,12 @@ spans_fallback:
 		}
 
 		gc->ops = (GCOps *)&sna_gc_ops;
-		if (data.damage)
+		if (data.damage) {
+			if (data.dx | data.dy)
+				pixman_region_translate(&data.region, data.dx, data.dy);
+			assert_pixmap_contains_box(data.pixmap, &data.region.extents);
 			sna_damage_add(data.damage, &data.region);
+		}
 		RegionUninit(&data.region);
 		return;
 	}
@@ -8299,8 +8307,12 @@ sna_poly_arc(DrawablePtr drawable, GCPtr gc, int n, xArc *arc)
 			gc->ops = (GCOps *)&sna_gc_ops;
 
 			fill.done(data.sna, &fill);
-			if (data.damage)
+			if (data.damage) {
+				if (data.dx | data.dy)
+					pixman_region_translate(&data.region, data.dx, data.dy);
+				assert_pixmap_contains_box(data.pixmap, &data.region.extents);
 				sna_damage_add(data.damage, &data.region);
+			}
 			RegionUninit(&data.region);
 			return;
 		}
@@ -8645,8 +8657,12 @@ sna_poly_fill_polygon(DrawablePtr draw, GCPtr gc,
 		}
 
 		gc->ops = (GCOps *)&sna_gc_ops;
-		if (data.damage)
+		if (data.damage) {
+			if (data.dx | data.dy)
+				pixman_region_translate(&data.region, data.dx, data.dy);
+			assert_pixmap_contains_box(data.pixmap, &data.region.extents);
 			sna_damage_add(data.damage, &data.region);
+		}
 		RegionUninit(&data.region);
 		return;
 	}
@@ -10207,8 +10223,12 @@ sna_poly_fill_arc(DrawablePtr draw, GCPtr gc, int n, xArc *arc)
 		}
 
 		gc->ops = (GCOps *)&sna_gc_ops;
-		if (data.damage)
+		if (data.damage) {
+			if (data.dx | data.dy)
+				pixman_region_translate(&data.region, data.dx, data.dy);
+			assert_pixmap_contains_box(data.pixmap, &data.region.extents);
 			sna_damage_add(data.damage, &data.region);
+		}
 		RegionUninit(&data.region);
 		return;
 	}
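
All five hunks fix the same coordinate mismatch: the spans were generated in drawable coordinates, but damage is tracked per-pixmap, so the region must be shifted by the drawable's offset within its backing pixmap before it is recorded. The recurring tail reduces to:

    #include <pixman.h>

    /* Shift a drawable-relative region into pixmap coordinates; dx/dy
     * are the composite offsets of the drawable inside the pixmap. */
    static void region_to_pixmap_coords(pixman_region16_t *region,
                                        int16_t dx, int16_t dy)
    {
        if (dx | dy)
            pixman_region_translate(region, dx, dy);
    }
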
commit 5c8ba07964c58c92e7e1a38575df6f135c4beaaa
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sun Apr 1 16:36:16 2012 +0100

    sna: Fix assertion to look at bbox of all boxes/points
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 47a627a..20d9166 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -233,6 +233,8 @@ static void _assert_pixmap_contains_boxes(PixmapPtr pixmap, BoxPtr box, int n, i
 
 	extents = *box;
 	while (--n) {
+		++box;
+
 		if (box->x1 < extents.x1)
 			extents.x1 = box->x1;
 		if (box->x2 > extents.x2)
@@ -258,6 +260,8 @@ static void _assert_pixmap_contains_points(PixmapPtr pixmap, DDXPointRec *pt, in
 	extents.x2 = extents.x1 = pt->x;
 	extents.y2 = extents.y1 = pt->y;
 	while (--n) {
+		++pt;
+
 		if (pt->x < extents.x1)
 			extents.x1 = pt->x;
 		else if (pt->x > extents.x2)
commit e51fc3f4ef23be62c8c9ea33dbc69422486af0dd
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sun Apr 1 09:54:43 2012 +0100

    sna: Assert that drawing boxes are within bounds
    
    More sanity checks required.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index bf76948..47a627a 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -227,6 +227,54 @@ static void _assert_pixmap_contains_box(PixmapPtr pixmap, BoxPtr box, const char
 	}
 }
 
+static void _assert_pixmap_contains_boxes(PixmapPtr pixmap, BoxPtr box, int n, int dx, int dy, const char *function)
+{
+	BoxRec extents;
+
+	extents = *box;
+	while (--n) {
+		if (box->x1 < extents.x1)
+			extents.x1 = box->x1;
+		if (box->x2 > extents.x2)
+			extents.x2 = box->x2;
+
+		if (box->y1 < extents.y1)
+			extents.y1 = box->y1;
+		if (box->y2 > extents.y2)
+			extents.y2 = box->y2;
+	}
+	extents.x1 += dx;
+	extents.x2 += dx;
+	extents.y1 += dy;
+	extents.y2 += dy;
+	_assert_pixmap_contains_box(pixmap, &extents, function);
+}
+
+
+static void _assert_pixmap_contains_points(PixmapPtr pixmap, DDXPointRec *pt, int n, int dx, int dy, const char *function)
+{
+	BoxRec extents;
+
+	extents.x2 = extents.x1 = pt->x;
+	extents.y2 = extents.y1 = pt->y;
+	while (--n) {
+		if (pt->x < extents.x1)
+			extents.x1 = pt->x;
+		else if (pt->x > extents.x2)
+			extents.x2 = pt->x;
+
+		if (pt->y < extents.y1)
+			extents.y1 = pt->y;
+		else if (pt->y > extents.y2)
+			extents.y2 = pt->y;
+	}
+	extents.x1 += dx;
+	extents.x2 += dx + 1;
+	extents.y1 += dy;
+	extents.y2 += dy + 1;
+	_assert_pixmap_contains_box(pixmap, &extents, function);
+}
+
 static void _assert_drawable_contains_box(DrawablePtr drawable, const BoxRec *box, const char *function)
 {
 	if (box->x1 < drawable->x ||
@@ -244,8 +292,12 @@ static void _assert_drawable_contains_box(DrawablePtr drawable, const BoxRec *bo
 }
 #define assert_pixmap_contains_box(p, b) _assert_pixmap_contains_box(p, b, __FUNCTION__)
 #define assert_drawable_contains_box(d, b) _assert_drawable_contains_box(d, b, __FUNCTION__)
+#define assert_pixmap_contains_boxes(p, b, n, x, y) _assert_pixmap_contains_boxes(p, b, n, x, y, __FUNCTION__)
+#define assert_pixmap_contains_points(p, pt, n, x, y) _assert_pixmap_contains_points(p, pt, n, x, y, __FUNCTION__)
 #else
 #define assert_pixmap_contains_box(p, b)
+#define assert_pixmap_contains_boxes(p, b, n, x, y)
+#define assert_pixmap_contains_points(p, pt, n, x, y)
 #define assert_drawable_contains_box(d, b)
 #endif
 
@@ -4403,12 +4455,14 @@ damage:
 		b->y2 = b->y1 + 1;
 
 		if (++b == last_box) {
+			assert_pixmap_contains_boxes(pixmap, box, last_box-box, 0, 0);
 			fill.boxes(sna, &fill, box, last_box - box);
 			sna_damage_add_boxes(damage, box, last_box - box, 0, 0);
 			b = box;
 		}
 	} while (--n);
 	if (b != box) {
+		assert_pixmap_contains_boxes(pixmap, box, b-box, 0, 0);
 		fill.boxes(sna, &fill, box, b - box);
 		sna_damage_add_boxes(damage, box, b - box, 0, 0);
 	}
@@ -4549,6 +4603,7 @@ damage_clipped:
 					b->y1 += dy;
 					b->y2 += dy;
 					if (++b == last_box) {
+						assert_pixmap_contains_boxes(pixmap, box, b-box, 0, 0);
 						fill.boxes(sna, &fill, box, last_box - box);
 						sna_damage_add_boxes(damage, box, b - box, 0, 0);
 						b = box;
@@ -4609,6 +4664,7 @@ damage_clipped:
 					b->y1 = y + dy;
 					b->y2 = b->y1 + 1;
 					if (++b == last_box) {
+						assert_pixmap_contains_boxes(pixmap, box, last_box-box, 0, 0);
 						fill.boxes(sna, &fill, box, last_box - box);
 						sna_damage_add_boxes(damage, box, last_box - box, 0, 0);
 						b = box;
@@ -4618,6 +4674,7 @@ damage_clipped:
 			RegionUninit(&clip);
 		}
 		if (b != box) {
+			assert_pixmap_contains_boxes(pixmap, box, b-box, 0, 0);
 			fill.boxes(sna, &fill, box, b - box);
 			sna_damage_add_boxes(damage, box, b - box, 0, 0);
 		}
@@ -4922,6 +4979,7 @@ sna_copy_bitmap_blt(DrawablePtr _bitmap, DrawablePtr drawable, GCPtr gc,
 		return;
 
 	get_drawable_deltas(drawable, pixmap, &dx, &dy);
+	assert_pixmap_contains_boxes(pixmap, box, n, dx, dy);
 	if (arg->damage)
 		sna_damage_add_boxes(arg->damage, box, n, dx, dy);
 
@@ -5082,6 +5140,7 @@ sna_copy_plane_blt(DrawablePtr source, DrawablePtr drawable, GCPtr gc,
 	sy += dy;
 
 	get_drawable_deltas(drawable, dst_pixmap, &dx, &dy);
+	assert_pixmap_contains_boxes(dst_pixmap, box, n, dx, dy);
 	if (arg->damage)
 		sna_damage_add_boxes(arg->damage, box, n, dx, dy);
 
@@ -5411,6 +5470,7 @@ sna_poly_point_blt(DrawablePtr drawable,
 		last.x += dx;
 		last.y += dy;
 
+		assert_pixmap_contains_points(pixmap, pt, n, last.x, last.y);
 		sna_damage_add_points(damage, pt, n, last.x, last.y);
 		do {
 			unsigned nbox = n;
@@ -5457,6 +5517,7 @@ sna_poly_point_blt(DrawablePtr drawable,
 				b->x2 = b->x1 + 1;
 				b->y2 = b->y1 + 1;
 				if (++b == last_box){
+					assert_pixmap_contains_boxes(pixmap, box, last_box-box, 0, 0);
 					fill.boxes(sna, &fill, box, last_box - box);
 					if (damage)
 						sna_damage_add_boxes(damage, box, last_box-box, 0, 0);
@@ -5465,6 +5526,7 @@ sna_poly_point_blt(DrawablePtr drawable,
 			}
 		}
 		if (b != box){
+			assert_pixmap_contains_boxes(pixmap, box, b-box, 0, 0);
 			fill.boxes(sna, &fill, box, b - box);
 			if (damage)
 				sna_damage_add_boxes(damage, box, b-box, 0, 0);
@@ -5916,6 +5978,7 @@ done:
 	return true;
 
 damage:
+	assert_pixmap_contains_boxes(pixmap, box, b-box, 0, 0);
 	sna_damage_add_boxes(damage, box, b-box, 0, 0);
 no_damage:
 	fill.boxes(sna, &fill, box, b-box);
@@ -5930,6 +5993,7 @@ no_damage_offset:
 			bb->y1 += dy;
 			bb->y2 += dy;
 		} while (++bb != b);
+		assert_pixmap_contains_boxes(pixmap, box, b-box, 0, 0);
 		fill.boxes(sna, &fill, box, b - box);
 	}
 	goto *ret;
@@ -5943,6 +6007,7 @@ damage_offset:
 			bb->y1 += dy;
 			bb->y2 += dy;
 		} while (++bb != b);
+		assert_pixmap_contains_boxes(pixmap, box, b-box, 0, 0);
 		fill.boxes(sna, &fill, box, b - box);
 		sna_damage_add_boxes(damage, box, b - box, 0, 0);
 	}
@@ -6014,6 +6079,7 @@ sna_poly_line_blt(DrawablePtr drawable,
 			     __FUNCTION__,
 			     b->x1, b->y1, b->x2, b->y2));
 			if (++b == last_box) {
+				assert_pixmap_contains_boxes(pixmap, boxes, last_box-boxes, 0, 0);
 				fill.boxes(sna, &fill, boxes, last_box - boxes);
 				if (damage)
 					sna_damage_add_boxes(damage, boxes, last_box - boxes, 0, 0);
@@ -6075,6 +6141,7 @@ sna_poly_line_blt(DrawablePtr drawable,
 					b->y1 += dy;
 					b->y2 += dy;
 					if (++b == last_box) {
+						assert_pixmap_contains_boxes(pixmap, boxes, last_box-boxes, 0, 0);
 						fill.boxes(sna, &fill, boxes, last_box - boxes);
 						if (damage)
 							sna_damage_add_boxes(damage, boxes, last_box - boxes, 0, 0);
@@ -6139,6 +6206,7 @@ sna_poly_line_blt(DrawablePtr drawable,
 						b->y1 += dy;
 						b->y2 += dy;
 						if (++b == last_box) {
+							assert_pixmap_contains_boxes(pixmap, boxes, last_box-boxes, 0, 0);
 							fill.boxes(sna, &fill, boxes, last_box-boxes);
 							if (damage)
 								sna_damage_add_boxes(damage, boxes, last_box-boxes, 0, 0);
@@ -6153,6 +6221,7 @@ sna_poly_line_blt(DrawablePtr drawable,
 		RegionUninit(&clip);
 	}
 	if (b != boxes) {
+		assert_pixmap_contains_boxes(pixmap, boxes, b-boxes, 0, 0);
 		fill.boxes(sna, &fill, boxes, b - boxes);
 		if (damage)
 			sna_damage_add_boxes(damage, boxes, b - boxes, 0, 0);
@@ -6591,6 +6660,7 @@ spans_fallback:
 		}
 
 		gc->ops = (GCOps *)&sna_gc_ops;
+		assert_pixmap_contains_box(data.pixmap, &data.region.extents);
 		if (data.damage)
 			sna_damage_add(data.damage, &data.region);
 		RegionUninit(&data.region);
commit f18060740ed4558952b65a2d007b029bc62f0b41
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Mar 30 22:51:21 2012 +0100

    sna: Minimise the risk of hotplug hangs by checking fb before vsync
    
    Every time we issue a MI_WAIT_FOR_EVENT on a scan-line from userspace we
    run the risk of that pipe being disabled before we submit a batch. As the
    pipe is then disabled or configured differently, we encounter an
    indefinite wait and trigger a GPU hang.
    
    To minimise the risk of a hotplug event being detected and submitting a
    vsynced batch prior to noticing the removal of the pipe, perform an
    explicit query of the current CRTC and delete the wait if we spot that
    our framebuffer is no longer attached. This is about as good as we can
    achieve without extra help from the kernel.
    
    Reported-by: Francis Leblanc <Francis.Leblanc-Lebeau at verint.com>
    Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=45413 (and others)
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.h b/src/sna/kgem.h
index 27e0e04..e52645c 100644
--- a/src/sna/kgem.h
+++ b/src/sna/kgem.h
@@ -141,6 +141,7 @@ struct kgem {
 	uint16_t nexec;
 	uint16_t nreloc;
 	uint16_t nfence;
+	uint16_t wait;
 	uint16_t max_batch_size;
 
 	uint32_t flush:1;
diff --git a/src/sna/sna.h b/src/sna/sna.h
index 7a1e2f6..308e329 100644
--- a/src/sna/sna.h
+++ b/src/sna/sna.h
@@ -363,6 +363,7 @@ extern xf86CrtcPtr sna_covering_crtc(ScrnInfoPtr scrn,
 
 extern bool sna_wait_for_scanline(struct sna *sna, PixmapPtr pixmap,
 				  xf86CrtcPtr crtc, const BoxRec *clip);
+extern bool sna_crtc_is_bound(struct sna *sna, xf86CrtcPtr crtc);
 
 Bool sna_dri_open(struct sna *sna, ScreenPtr pScreen);
 void sna_dri_wakeup(struct sna *sna);
diff --git a/src/sna/sna_display.c b/src/sna/sna_display.c
index f413ac1..ef3b0f9 100644
--- a/src/sna/sna_display.c
+++ b/src/sna/sna_display.c
@@ -2102,6 +2102,7 @@ static void sna_emit_wait_for_scanline_gen6(struct sna *sna,
 	b[1] = pipe;
 	b[2] = y2 - 1;
 	b[3] = MI_WAIT_FOR_EVENT | event;
+	sna->kgem.wait = sna->kgem.nbatch + 3;
 	kgem_advance_batch(&sna->kgem, 4);
 }
 
@@ -2131,6 +2132,7 @@ static void sna_emit_wait_for_scanline_gen4(struct sna *sna,
 	b[2] = b[0] = MI_LOAD_SCAN_LINES_INCL | pipe << 20;
 	b[3] = b[1] = (y1 << 16) | (y2-1);
 	b[4] = MI_WAIT_FOR_EVENT | event;
+	sna->kgem.wait = sna->kgem.nbatch + 4;
 	kgem_advance_batch(&sna->kgem, 5);
 }
 
@@ -2158,6 +2160,7 @@ static void sna_emit_wait_for_scanline_gen2(struct sna *sna,
 		b[4] = MI_WAIT_FOR_EVENT | MI_WAIT_FOR_PIPEA_SCAN_LINE_WINDOW;
 	else
 		b[4] = MI_WAIT_FOR_EVENT | MI_WAIT_FOR_PIPEB_SCAN_LINE_WINDOW;
+	sna->kgem.wait = sna->kgem.nbatch + 4;
 	kgem_advance_batch(&sna->kgem, 5);
 }
 
@@ -2171,21 +2174,15 @@ sna_wait_for_scanline(struct sna *sna,
 	Bool full_height;
 	int y1, y2, pipe;
 
+	assert(crtc);
+	assert(sna_crtc_on(crtc));
+	assert(pixmap_is_scanout(pixmap));
+
 	/* XXX WAIT_EVENT is still causing hangs on SNB */
 	if (sna->kgem.gen >= 60)
 		return false;
 
-	if (!pixmap_is_scanout(pixmap))
-		return false;
-
-	if (crtc == NULL) {
-		crtc = sna_covering_crtc(sna->scrn, clip, NULL, &crtc_box);
-		if (crtc == NULL)
-			return false;
-	} else
-		sna_crtc_box(crtc, &crtc_box);
-	assert(sna_crtc_on(crtc));
-
+	sna_crtc_box(crtc, &crtc_box);
 	if (crtc->transform_in_use) {
 		box = *clip;
 		pixman_f_transform_bounds(&crtc->f_framebuffer_to_crtc, &box);
@@ -2227,3 +2224,14 @@ sna_wait_for_scanline(struct sna *sna,
 
 	return true;
 }
+
+bool sna_crtc_is_bound(struct sna *sna, xf86CrtcPtr crtc)
+{
+	struct drm_mode_crtc mode;
+
+	mode.crtc_id = crtc_id(crtc->driver_private);
+	if (drmIoctl(sna->kgem.fd, DRM_IOCTL_MODE_GETCRTC, &mode))
+		return false;
+
+	return mode.mode_valid && sna->mode.fb_id == mode.fb_id;
+}
diff --git a/src/sna/sna_dri.c b/src/sna/sna_dri.c
index 95ec07e..afec831 100644
--- a/src/sna/sna_dri.c
+++ b/src/sna/sna_dri.c
@@ -400,6 +400,7 @@ sna_dri_copy_to_front(struct sna *sna, DrawablePtr draw, RegionPtr region,
 	PixmapPtr pixmap = get_drawable_pixmap(draw);
 	pixman_region16_t clip;
 	bool flush = false;
+	xf86CrtcPtr crtc;
 	BoxRec box, *boxes;
 	int16_t dx, dy;
 	int n;
@@ -442,9 +443,15 @@ sna_dri_copy_to_front(struct sna *sna, DrawablePtr draw, RegionPtr region,
 			return;
 		}
 
-		if (pixmap == sna->front && sync)
-			flush = sna_wait_for_scanline(sna, pixmap, NULL,
-						      &region->extents);
+		if (pixmap == sna->front && sync) {
+			BoxRec crtc_box;
+
+			crtc = sna_covering_crtc(sna->scrn, &region->extents,
+						 NULL, &crtc_box);
+			if (crtc)
+				flush = sna_wait_for_scanline(sna, pixmap, crtc,
+							      &region->extents);
+		}
 
 		get_drawable_deltas(draw, pixmap, &dx, &dy);
 	}
@@ -482,8 +489,11 @@ sna_dri_copy_to_front(struct sna *sna, DrawablePtr draw, RegionPtr region,
 			       boxes, n);
 
 	DBG(("%s: flushing? %d\n", __FUNCTION__, flush));
-	if (flush) /* STAT! */
+	if (flush) { /* STAT! */
+		if (!sna_crtc_is_bound(sna, crtc))
+			sna->kgem.batch[sna->kgem.wait] = 0;
 		kgem_submit(&sna->kgem);
+	}
 
 	pixman_region_translate(region, dx, dy);
 	DamageRegionAppend(&pixmap->drawable, region);
diff --git a/src/sna/sna_video_textured.c b/src/sna/sna_video_textured.c
index a71751c..b740b6a 100644
--- a/src/sna/sna_video_textured.c
+++ b/src/sna/sna_video_textured.c
@@ -306,8 +306,11 @@ sna_video_textured_put_image(ScrnInfoPtr scrn,
 	/* Push the frame to the GPU as soon as possible so
 	 * we can hit the next vsync.
 	 */
-	if (flush)
+	if (flush) {
+		if (!sna_crtc_is_bound(sna, crtc))
+			sna->kgem.batch[sna->kgem.wait] = 0;
 		kgem_submit(&sna->kgem);
+	}
 
 	return ret;
 }
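
The scheme has two halves: each emitter records the batch index of its MI_WAIT_FOR_EVENT dword in kgem.wait, and just before submission the driver re-queries the CRTC and, if the framebuffer is no longer bound, rewrites that dword to 0 (MI_NOOP) so the batch executes without waiting. The cancel step, reduced to its essentials:

    #include <stdbool.h>
    #include <stdint.h>

    struct batch {
        uint32_t *cmds;
        unsigned  wait;  /* index of the MI_WAIT_FOR_EVENT dword */
    };

    /* MI_NOOP encodes as 0, so a recorded wait can be cancelled in
     * place at any time before the batch is handed to the kernel. */
    static void cancel_scanline_wait(struct batch *b, bool crtc_still_bound)
    {
        if (!crtc_still_bound)
            b->cmds[b->wait] = 0;
    }
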
commit cde6339cc170d2fd7ca3a417aedaf6efccb14f30
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Mar 30 21:27:29 2012 +0100

    sna: Separate out scanline waiting for gen4
    
    So that we do not set a gen4 bit on gen2 and apply the old workaround of
    trimming y2 instead.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_display.c b/src/sna/sna_display.c
index ecfe670..f413ac1 100644
--- a/src/sna/sna_display.c
+++ b/src/sna/sna_display.c
@@ -2105,44 +2105,62 @@ static void sna_emit_wait_for_scanline_gen6(struct sna *sna,
 	kgem_advance_batch(&sna->kgem, 4);
 }
 
-static void sna_emit_wait_for_scanline_gen2(struct sna *sna,
+static void sna_emit_wait_for_scanline_gen4(struct sna *sna,
 					    int pipe, int y1, int y2,
 					    bool full_height)
 {
 	uint32_t event;
 	uint32_t *b;
 
-	/*
-	 * Pre-965 doesn't have SVBLANK, so we need a bit
-	 * of extra time for the blitter to start up and
-	 * do its job for a full height blit
-	 */
 	if (pipe == 0) {
-		pipe = MI_LOAD_SCAN_LINES_DISPLAY_PIPEA;
-		event = MI_WAIT_FOR_PIPEA_SCAN_LINE_WINDOW;
 		if (full_height)
 			event = MI_WAIT_FOR_PIPEA_SVBLANK;
+		else
+			event = MI_WAIT_FOR_PIPEA_SCAN_LINE_WINDOW;
 	} else {
-		pipe = MI_LOAD_SCAN_LINES_DISPLAY_PIPEB;
-		event = MI_WAIT_FOR_PIPEB_SCAN_LINE_WINDOW;
 		if (full_height)
 			event = MI_WAIT_FOR_PIPEB_SVBLANK;
+		else
+			event = MI_WAIT_FOR_PIPEB_SCAN_LINE_WINDOW;
 	}
 
-	if (sna->kgem.mode == KGEM_NONE)
-		kgem_set_mode(&sna->kgem, KGEM_BLT);
-
+	kgem_set_mode(&sna->kgem, KGEM_BLT);
 	b = kgem_get_batch(&sna->kgem, 5);
 	/* The documentation says that the LOAD_SCAN_LINES command
 	 * always comes in pairs. Don't ask me why. */
-	b[0] = MI_LOAD_SCAN_LINES_INCL | pipe;
-	b[1] = (y1 << 16) | (y2-1);
-	b[2] = MI_LOAD_SCAN_LINES_INCL | pipe;
-	b[3] = (y1 << 16) | (y2-1);
+	b[2] = b[0] = MI_LOAD_SCAN_LINES_INCL | pipe << 20;
+	b[3] = b[1] = (y1 << 16) | (y2-1);
 	b[4] = MI_WAIT_FOR_EVENT | event;
 	kgem_advance_batch(&sna->kgem, 5);
 }
 
+static void sna_emit_wait_for_scanline_gen2(struct sna *sna,
+					    int pipe, int y1, int y2,
+					    bool full_height)
+{
+	uint32_t *b;
+
+	/*
+	 * Pre-965 doesn't have SVBLANK, so we need a bit
+	 * of extra time for the blitter to start up and
+	 * do its job for a full height blit
+	 */
+	if (full_height)
+		y2 -= 2;
+
+	kgem_set_mode(&sna->kgem, KGEM_BLT);
+	b = kgem_get_batch(&sna->kgem, 5);
+	/* The documentation says that the LOAD_SCAN_LINES command
+	 * always comes in pairs. Don't ask me why. */
+	b[2] = b[0] = MI_LOAD_SCAN_LINES_INCL | pipe << 20;
+	b[3] = b[1] = (y1 << 16) | (y2-1);
+	if (pipe == 0)
+		b[4] = MI_WAIT_FOR_EVENT | MI_WAIT_FOR_PIPEA_SCAN_LINE_WINDOW;
+	else
+		b[4] = MI_WAIT_FOR_EVENT | MI_WAIT_FOR_PIPEB_SCAN_LINE_WINDOW;
+	kgem_advance_batch(&sna->kgem, 5);
+}
+
 bool
 sna_wait_for_scanline(struct sna *sna,
 		      PixmapPtr pixmap,
@@ -2202,6 +2220,8 @@ sna_wait_for_scanline(struct sna *sna,
 
 	if (sna->kgem.gen >= 60)
 		sna_emit_wait_for_scanline_gen6(sna, pipe, y1, y2, full_height);
+	else if (sna->kgem.gen >= 40)
+		sna_emit_wait_for_scanline_gen4(sna, pipe, y1, y2, full_height);
 	else
 		sna_emit_wait_for_scanline_gen2(sna, pipe, y1, y2, full_height);
 
commit b19825a188626947080de84d363af2fe23d7d852
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Mar 30 20:45:55 2012 +0100

    sna/traps: Align the pointer+index
    
    It's the location of the pixels within the row that matters for
    alignment!
    
    Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=47418
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
    Tested-by: Magnus Kessler <Magnus.Kessler at gmx.net>

diff --git a/src/sna/sna_trapezoids.c b/src/sna/sna_trapezoids.c
index b6c5b65..2b4b2db 100644
--- a/src/sna/sna_trapezoids.c
+++ b/src/sna/sna_trapezoids.c
@@ -1490,42 +1490,42 @@ inplace_row(struct active_list *active, uint8_t *row, int width)
 
 			if (rix > ++lix) {
 				rix -= lix;
+				row += lix;
 #if 0
 				if (rix == 1)
-					row[lix] = 0xff;
+					*row = 0xff;
 				else
-					memset(row+lix, 0xff, rix);
+					memset(row, 0xff, rix);
 #else
 				if ((uintptr_t)row & 1 && rix) {
-					row[lix] = 0xff;
-					lix++;
+					*row++ = 0xff;
 					rix--;
 				}
 				if ((uintptr_t)row & 2 && rix >= 2) {
-					*(uint16_t *)(row+lix) = 0xffff;
-					lix += 2;
+					*(uint16_t *)row = 0xffff;
+					row += 2;
 					rix -= 2;
 				}
 				if ((uintptr_t)row & 4 && rix >= 4) {
-					*(uint32_t *)(row+lix) = 0xffffffff;
-					lix += 4;
+					*(uint32_t *)row = 0xffffffff;
+					row += 4;
 					rix -= 4;
 				}
 				while (rix >= 8) {
-					*(uint64_t *)(row+lix) = 0xffffffffffffffff;
-					lix += 8;
+					*(uint64_t *)row = 0xffffffffffffffff;
+					row += 8;
 					rix -= 8;
 				}
 				if (rix & 4) {
-					*(uint32_t *)(row+lix) = 0xffffffff;
-					lix += 4;
+					*(uint32_t *)row = 0xffffffff;
+					row += 4;
 				}
 				if (rix & 2) {
-					*(uint16_t *)(row+lix) = 0xffff;
-					lix += 2;
+					*(uint16_t *)row = 0xffff;
+					row += 2;
 				}
 				if (rix & 1)
-					row[lix] = 0xff;
+					*row = 0xff;
 #endif
 			}
 		}
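
Put together, the fast path ramps up through 1-, 2- and 4-byte stores until the destination pointer itself is 8-byte aligned, streams 64-bit stores, then mops up the tail; testing the pointer rather than the index is exactly what this fix and its follow-up correct. A stand-alone version of the fill:

    #include <stdint.h>

    /* Fill n bytes at row with 0xff using only naturally aligned
     * stores: align the pointer first, then go as wide as possible. */
    static void fill_ff(uint8_t *row, unsigned n)
    {
        if ((uintptr_t)row & 1 && n) {
            *row++ = 0xff; n--;
        }
        if ((uintptr_t)row & 2 && n >= 2) {
            *(uint16_t *)row = 0xffff; row += 2; n -= 2;
        }
        if ((uintptr_t)row & 4 && n >= 4) {
            *(uint32_t *)row = 0xffffffff; row += 4; n -= 4;
        }
        while (n >= 8) {
            *(uint64_t *)row = ~(uint64_t)0; row += 8; n -= 8;
        }
        if (n & 4) { *(uint32_t *)row = 0xffffffff; row += 4; }
        if (n & 2) { *(uint16_t *)row = 0xffff; row += 2; }
        if (n & 1) *row = 0xff;
    }
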
commit 1c6d84eeab8288f08ab89fc556801144e71a592e
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Mar 30 19:09:30 2012 +0100

    sna/traps: Align the pointer not the indices
    
    Magnus found that inplace_row was still crashing on his setup when it
    tried to perform an 8-byte aligned write to an unaligned pointer. This
    time it looks like the row pointer itself was not 8-byte aligned, so
    instead of assuming alignment and fixing up the indices, ensure that
    (row + index) itself results in an 8-byte aligned value.
    
    Reported-by: Magnus Kessler <Magnus.Kessler at gmx.net>
    Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=47418
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_trapezoids.c b/src/sna/sna_trapezoids.c
index 4067757..b6c5b65 100644
--- a/src/sna/sna_trapezoids.c
+++ b/src/sna/sna_trapezoids.c
@@ -1496,17 +1496,17 @@ inplace_row(struct active_list *active, uint8_t *row, int width)
 				else
 					memset(row+lix, 0xff, rix);
 #else
-				if (lix & 1 && rix) {
+				if ((uintptr_t)row & 1 && rix) {
 					row[lix] = 0xff;
 					lix++;
 					rix--;
 				}
-				if (lix & 2 && rix >= 2) {
+				if ((uintptr_t)row & 2 && rix >= 2) {
 					*(uint16_t *)(row+lix) = 0xffff;
 					lix += 2;
 					rix -= 2;
 				}
-				if (lix & 4 && rix >= 4) {
+				if ((uintptr_t)row & 4 && rix >= 4) {
 					*(uint32_t *)(row+lix) = 0xffffffff;
 					lix += 4;
 					rix -= 4;
commit 389d004c75b5c1b9784f7430f3b355ba879d2091
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Mar 30 12:47:21 2012 +0100

    uxa: Remove broken render glyphs-to-dst
    
    Reported-by: Vincent Untz <vuntz at gnome.org>
    Reported-by: Robert Bradford <robert.bradford at intel.com>
    Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=48045
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/uxa/uxa-glyphs.c b/uxa/uxa-glyphs.c
index 6172f2f..b754f4e 100644
--- a/uxa/uxa-glyphs.c
+++ b/uxa/uxa-glyphs.c
@@ -663,190 +663,6 @@ uxa_glyph_cache(ScreenPtr screen, GlyphPtr glyph, int *out_x, int *out_y)
 	return cache->picture;
 }
 
-static int
-uxa_glyphs_to_dst(CARD8 op,
-		  PicturePtr pSrc,
-		  PicturePtr pDst,
-		  INT16 src_x, INT16 src_y,
-		  INT16 xDst, INT16 yDst,
-		  int nlist, GlyphListPtr list, GlyphPtr * glyphs,
-		  BoxPtr extents)
-{
-	ScreenPtr screen = pDst->pDrawable->pScreen;
-	uxa_screen_t *uxa_screen = uxa_get_screen(screen);
-	PixmapPtr src_pixmap, dst_pixmap;
-	PicturePtr localSrc, glyph_atlas;
-	int x, y, n;
-	BoxRec box;
-
-	if (uxa_screen->info->check_composite_texture &&
-	    uxa_screen->info->check_composite_texture(screen, pSrc)) {
-		if (pSrc->pDrawable) {
-			int src_off_x, src_off_y;
-
-			src_pixmap = uxa_get_offscreen_pixmap(pSrc->pDrawable, &src_off_x, &src_off_y);
-			if (src_pixmap == NULL)
-				return -1;
-
-			src_x += pSrc->pDrawable->x + src_off_x;
-			src_y += pSrc->pDrawable->y + src_off_y;
-		} else {
-			src_pixmap = NULL;
-		}
-		localSrc = pSrc;
-	} else {
-		int width, height;
-
-		if (extents == NULL) {
-			uxa_glyph_extents(nlist, list, glyphs, &box);
-			extents = &box;
-		}
-
-		width  = extents->x2 - extents->x1;
-		height = extents->y2 - extents->y1;
-		if (width == 0 || height == 0)
-			return 0;
-
-		if (pSrc->pDrawable) {
-			int src_off_x, src_off_y;
-
-			src_off_x = extents->x1 - xDst;
-			src_off_y = extents->y1 - yDst;
-			localSrc = uxa_acquire_drawable(screen, pSrc,
-							src_x + src_off_x, src_y + src_off_y,
-							width, height,
-							&src_x, &src_y);
-			if (uxa_screen->info->check_composite_texture &&
-			    !uxa_screen->info->check_composite_texture(screen, localSrc)) {
-				if (localSrc != pSrc)
-					FreePicture(localSrc, 0);
-				return -1;
-			}
-
-			src_pixmap = uxa_get_offscreen_pixmap(localSrc->pDrawable, &src_off_x, &src_off_y);
-			if (src_pixmap == NULL) {
-				if (localSrc != pSrc)
-					FreePicture(localSrc, 0);
-				return -1;
-			}
-
-			src_x += localSrc->pDrawable->x + src_off_x;
-			src_y += localSrc->pDrawable->y + src_off_y;
-		} else {
-			localSrc = uxa_acquire_pattern(screen, pSrc,
-						       PICT_a8r8g8b8, x, y, width, height);
-			if (!localSrc)
-				return 1;
-
-			src_pixmap = uxa_get_drawable_pixmap(localSrc->pDrawable);
-			if (src_pixmap == NULL) {
-				FreePicture(localSrc, 0);
-				return -1;
-			}
-
-			src_x = src_y = 0;
-		}
-	}
-
-	dst_pixmap = uxa_get_offscreen_pixmap(pDst->pDrawable, &x, &y);
-	x += xDst + pDst->pDrawable->x - list->xOff;
-	y += yDst + pDst->pDrawable->y - list->yOff;
-
-	glyph_atlas = NULL;
-	while (nlist--) {
-		x += list->xOff;
-		y += list->yOff;
-		n = list->len;
-		while (n--) {
-			GlyphPtr glyph = *glyphs++;
-			PicturePtr this_atlas;
-			int mask_x, mask_y, nrect;
-			struct uxa_glyph *priv;
-			BoxPtr rects;
-
-			if (glyph->info.width == 0 || glyph->info.height == 0)
-				goto next_glyph;
-
-			priv = uxa_glyph_get_private(glyph);
-			if (priv != NULL) {
-				mask_x = priv->x;
-				mask_y = priv->y;
-				this_atlas = priv->cache->picture;
-			} else {
-				if (glyph_atlas) {
-					uxa_screen->info->done_composite(dst_pixmap);
-					glyph_atlas = NULL;
-				}
-				this_atlas = uxa_glyph_cache(screen, glyph, &mask_x, &mask_y);
-				if (this_atlas == NULL) {
-					/* no cache for this glyph */
-					this_atlas = GlyphPicture(glyph)[screen->myNum];
-					mask_x = mask_y = 0;
-				}
-			}
-
-			if (this_atlas != glyph_atlas) {
-				PixmapPtr mask_pixmap;
-
-				if (glyph_atlas)
-					uxa_screen->info->done_composite(dst_pixmap);
-
-				mask_pixmap =
-					uxa_get_drawable_pixmap(this_atlas->pDrawable);
-				if (!uxa_pixmap_is_offscreen(mask_pixmap) ||
-				    !uxa_screen->info->prepare_composite(op,
-									 localSrc, this_atlas, pDst,
-									 src_pixmap, mask_pixmap, dst_pixmap))
-					return -1;
-
-				glyph_atlas = this_atlas;
-			}
-
-			rects = REGION_RECTS(pDst->pCompositeClip);
-			nrect = REGION_NUM_RECTS(pDst->pCompositeClip);
-			while (nrect--) {
-				int x1 = x - glyph->info.x, dx = 0;
-				int y1 = y - glyph->info.y, dy = 0;
-				int x2 = x1 + glyph->info.width;
-				int y2 = y1 + glyph->info.height;
-
-				if (rects->y1 >= y2)
-					break;
-
-				if (x1 < rects->x1)
-					dx = rects->x1 - x1, x1 = rects->x1;
-				if (x2 > rects->x2)
-					x2 = rects->x2;
-				if (y1 < rects->y1)
-					dy = rects->y1 - y1, y1 = rects->y1;
-				if (y2 > rects->y2)
-					y2 = rects->y2;
-
-				if (x1 < x2 && y1 < y2) {
-					uxa_screen->info->composite(dst_pixmap,
-								    x1 + src_x,  y1 + src_y,
-								    dx + mask_x, dy + mask_y,
-								    x1, y1,
-								    x2 - x1, y2 - y1);
-				}
-				rects++;
-			}
-
-next_glyph:
-			x += glyph->info.xOff;
-			y += glyph->info.yOff;
-		}
-		list++;
-	}
-	if (glyph_atlas)
-		uxa_screen->info->done_composite(dst_pixmap);
-
-	if (localSrc != pSrc)
-		FreePicture(localSrc, 0);
-
-	return 0;
-}
-
 static void
 uxa_clear_pixmap(ScreenPtr screen,
 		 uxa_screen_t *uxa_screen,
@@ -894,37 +710,30 @@ uxa_glyphs_via_mask(CARD8 op,
 		    PicturePtr pDst,
 		    PictFormatPtr maskFormat,
 		    INT16 xSrc, INT16 ySrc,
-		    INT16 xDst, INT16 yDst,
-		    int nlist, GlyphListPtr list, GlyphPtr * glyphs,
-		    BoxPtr extents)
+		    int nlist, GlyphListPtr list, GlyphPtr * glyphs)
 {
 	ScreenPtr screen = pDst->pDrawable->pScreen;
 	uxa_screen_t *uxa_screen = uxa_get_screen(screen);
 	CARD32 component_alpha;
 	PixmapPtr pixmap;
 	PicturePtr glyph_atlas, mask;
+	int xDst = list->xOff, yDst = list->yOff;
 	int x, y, width, height;
 	int dst_off_x, dst_off_y;
 	int n, error;
 	BoxRec box;
 
-	if (!extents) {
-		uxa_glyph_extents(nlist, list, glyphs, &box);
+	uxa_glyph_extents(nlist, list, glyphs, &box);
+	if (box.x2 <= box.x1 || box.y2 <= box.y1)
+		return 0;
 
-		if (box.x2 <= box.x1 || box.y2 <= box.y1)
-			return 0;
+	dst_off_x = box.x1;
+	dst_off_y = box.y1;
 
-		extents = &box;
-		dst_off_x = box.x1;
-		dst_off_y = box.y1;
-	} else {
-		dst_off_x = dst_off_y = 0;
-	}
-
-	width  = extents->x2 - extents->x1;
-	height = extents->y2 - extents->y1;
-	x = -extents->x1;
-	y = -extents->y1;
+	width  = box.x2 - box.x1;
+	height = box.y2 - box.y1;
+	x = -box.x1;
+	y = -box.y1;
 
 	if (maskFormat->depth == 1) {
 		PictFormatPtr a8Format =
@@ -1061,11 +870,6 @@ uxa_glyphs(CARD8 op,
 {
 	ScreenPtr screen = pDst->pDrawable->pScreen;
 	uxa_screen_t *uxa_screen = uxa_get_screen(screen);
-	int xDst = list->xOff, yDst = list->yOff;
-	BoxRec extents = { 0, 0, 0, 0 };
-	Bool have_extents = FALSE;
-	int width, height, ret;
-	PicturePtr localDst = pDst;
 
 	if (uxa_screen->info->flags & UXA_USE_GLAMOR) {
 		int ok;
@@ -1128,112 +932,12 @@ fallback:
 		}
 	}
 
-	if (!maskFormat &&
-	    uxa_screen->info->check_composite_target &&
-	    !uxa_screen->info->check_composite_target(uxa_get_drawable_pixmap(pDst->pDrawable))) {
-		int depth = pDst->pDrawable->depth;
-		PixmapPtr pixmap;
-		int x, y, error;
-		GCPtr gc;
-
-		pixmap = uxa_get_drawable_pixmap(pDst->pDrawable);
-		if (uxa_screen->info->check_copy &&
-		    !uxa_screen->info->check_copy(pixmap, pixmap, GXcopy, FB_ALLONES))
-			goto fallback;
-
-		uxa_glyph_extents(nlist, list, glyphs, &extents);
-
-		/* clip against dst bounds */
-		if (extents.x1 < 0)
-			extents.x1 = 0;
-		if (extents.y1 < 0)
-			extents.y1 = 0;
-		if (extents.x2 > pDst->pDrawable->width)
-			extents.x2 = pDst->pDrawable->width;
-		if (extents.y2 > pDst->pDrawable->height)
-			extents.y2 = pDst->pDrawable->height;
-
-		if (extents.x2 <= extents.x1 || extents.y2 <= extents.y1)
-			return;
-		width  = extents.x2 - extents.x1;
-		height = extents.y2 - extents.y1;
-		x = -extents.x1;
-		y = -extents.y1;
-		have_extents = TRUE;
-
-		xDst += x;
-		yDst += y;
-
-		pixmap = screen->CreatePixmap(screen,
-					      width, height, depth,
-					      CREATE_PIXMAP_USAGE_SCRATCH);
-		if (!pixmap)
-			return;
-
-		if (!uxa_pixmap_is_offscreen(pixmap)) {
-			screen->DestroyPixmap(pixmap);
-			goto fallback;
-		}
-
-		gc = GetScratchGC(depth, screen);
-		if (!gc) {
-			screen->DestroyPixmap(pixmap);
-			return;
-		}
-
-		ValidateGC(&pixmap->drawable, gc);
-		gc->ops->CopyArea(pDst->pDrawable, &pixmap->drawable, gc,
-				  extents.x1, extents.y1,
-				  width, height,
-				  0, 0);
-		FreeScratchGC(gc);
-
-		localDst = CreatePicture(0, &pixmap->drawable,
-					 PictureMatchFormat(screen, depth, pDst->format),
-					 0, 0, serverClient, &error);
-		screen->DestroyPixmap(pixmap);
-
-		if (!localDst)
-			return;
-
-		ValidatePicture(localDst);
-	}
-
-	if (maskFormat) {
-		ret = uxa_glyphs_via_mask(op,
-					  pSrc, localDst, maskFormat,
-					  xSrc, ySrc,
-					  xDst, yDst,
-					  nlist, list, glyphs,
-					  have_extents ? &extents : NULL);
-	} else {
-		ret = uxa_glyphs_to_dst(op,
-					pSrc, localDst,
-					xSrc, ySrc,
-					xDst, yDst,
-					nlist, list, glyphs,
-					have_extents ? &extents : NULL);
-	}
-	if (ret) {
-		if (localDst != pDst)
-			FreePicture(localDst, 0);
-
+	if (!maskFormat)
 		goto fallback;
-	}
 
-	if (localDst != pDst) {
-		GCPtr gc;
-
-		gc = GetScratchGC(pDst->pDrawable->depth, screen);
-		if (gc) {
-			ValidateGC(pDst->pDrawable, gc);
-			gc->ops->CopyArea(localDst->pDrawable, pDst->pDrawable, gc,
-					  0, 0,
-					  width, height,
-					  extents.x1, extents.y1);
-			FreeScratchGC(gc);
-		}
-
-		FreePicture(localDst, 0);
-	}
+	if (uxa_glyphs_via_mask(op,
+				pSrc, pDst, maskFormat,
+				xSrc, ySrc,
+				nlist, list, glyphs))
+		goto fallback;
 }
commit 074ce2b0c0e655abcad561ee35c22ad51f956c7e
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Mar 30 10:21:26 2012 +0100

    sna/gen7: Allow per-device specific maxima
    
    As the maximum thread count and URB size differ between the different
    incarnations of the GT units, be a little more flexible when programming
    those maxima.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/gen7_render.c b/src/sna/gen7_render.c
index 1491167..6917c21 100644
--- a/src/sna/gen7_render.c
+++ b/src/sna/gen7_render.c
@@ -67,6 +67,31 @@
 
 #define is_aligned(x, y) (((x) & ((y) - 1)) == 0)
 
+struct gt_info {
+	int max_vs_threads;
+	int max_gs_threads;
+	int max_wm_threads;
+	struct {
+		int size;
+		int max_vs_entries;
+		int max_gs_entries;
+	} urb;
+};
+
+static const struct gt_info gt1_info = {
+	.max_vs_threads = 36,
+	.max_gs_threads = 36,
+	.max_wm_threads = 86,
+	.urb = { 128, 512, 192 },
+};
+
+static const struct gt_info gt2_info = {
+	.max_vs_threads = 128,
+	.max_gs_threads = 128,
+	.max_wm_threads = 86,
+	.urb = { 256, 704, 320 },
+};
+
 static const uint32_t ps_kernel_nomask_affine[][4] = {
 #include "exa_wm_src_affine.g7b"
 #include "exa_wm_src_sample_argb.g7b"
@@ -427,7 +452,7 @@ gen7_emit_urb(struct sna *sna)
 
 	/* num of VS entries must be divisible by 8 if size < 9 */
 	OUT_BATCH(GEN7_3DSTATE_URB_VS | (2 - 2));
-	OUT_BATCH((32 << GEN7_URB_ENTRY_NUMBER_SHIFT) | /* at least 32 */
+	OUT_BATCH((sna->render_state.gen7.info->urb.max_vs_entries << GEN7_URB_ENTRY_NUMBER_SHIFT) |
 		  (2 - 1) << GEN7_URB_ENTRY_SIZE_SHIFT |
 		  (1 << GEN7_URB_STARTING_ADDRESS_SHIFT));
 
@@ -802,7 +827,7 @@ gen7_emit_wm(struct sna *sna, unsigned int kernel, int nr_surfaces, int nr_input
 	OUT_BATCH(1 << GEN7_PS_SAMPLER_COUNT_SHIFT |
 		  nr_surfaces << GEN7_PS_BINDING_TABLE_ENTRY_COUNT_SHIFT);
 	OUT_BATCH(0); /* scratch address */
-	OUT_BATCH((86 - 1) << GEN7_PS_MAX_THREADS_SHIFT |
+	OUT_BATCH((sna->render_state.gen7.info->max_wm_threads - 1) << GEN7_PS_MAX_THREADS_SHIFT |
 		  GEN7_PS_ATTRIBUTE_ENABLE |
 		  GEN7_PS_16_DISPATCH_ENABLE);
 	OUT_BATCH(6 << GEN7_PS_DISPATCH_START_GRF_SHIFT_0);
@@ -4292,6 +4317,10 @@ static Bool gen7_render_setup(struct sna *sna)
 	struct gen7_sampler_state *ss;
 	int i, j, k, l, m;
 
+	state->info = &gt1_info;
+	if (DEVICE_ID(sna->PciInfo) & 0x20)
+		state->info = &gt2_info;
+
 	sna_static_stream_init(&general);
 
 	/* Zero pad the start. If you see an offset of 0x0 in the batchbuffer
diff --git a/src/sna/sna.h b/src/sna/sna.h
index 8ad8efc..7a1e2f6 100644
--- a/src/sna/sna.h
+++ b/src/sna/sna.h
@@ -257,7 +257,6 @@ struct sna {
 #define SNA_TILING_3D		0x4
 #define SNA_TILING_ALL (~0)
 
-	int Chipset;
 	EntityInfoPtr pEnt;
 	struct pci_device *PciInfo;
 	struct intel_chipset chipset;
diff --git a/src/sna/sna_render.h b/src/sna/sna_render.h
index a83f78e..73ef568 100644
--- a/src/sna/sna_render.h
+++ b/src/sna/sna_render.h
@@ -428,6 +428,7 @@ enum {
 };
 
 struct gen7_render_state {
+	const struct gt_info *info;
 	struct kgem_bo *general_bo;
 
 	uint32_t vs_state;
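
The selection is keyed off a single bit of the PCI device ID; a minimal
sketch of the logic added above (the helper name gen7_get_gt_info is
hypothetical, the values are the gt1_info/gt2_info tables from the diff):

    /* Sketch: bit 0x20 of the Ivybridge device ID distinguishes GT2
     * parts (e.g. 0x0162 desktop GT2) from GT1 (e.g. 0x0152). */
    static const struct gt_info *gen7_get_gt_info(uint16_t device_id)
    {
        static const struct gt_info gt1 = {
            .max_vs_threads = 36,
            .max_gs_threads = 36,
            .max_wm_threads = 86,
            .urb = { 128, 512, 192 },
        };
        static const struct gt_info gt2 = {
            .max_vs_threads = 128,
            .max_gs_threads = 128,
            .max_wm_threads = 86,
            .urb = { 256, 704, 320 },
        };

        return (device_id & 0x20) ? &gt2 : &gt1;
    }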
commit fd2ebbc52d0c6c51eb22c815f7a625c701a95919
Author: Eugeni Dodonov <eugeni.dodonov at intel.com>
Date:   Thu Mar 29 21:08:29 2012 -0300

    Add support for Ivy Bridge GT2 Server chipset
    
    Sometimes known as Bromlow.
    
    Signed-off-by: Eugeni Dodonov <eugeni.dodonov at intel.com>
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/intel_driver.h b/src/intel_driver.h
index e9d6d9e..98973e5 100644
--- a/src/intel_driver.h
+++ b/src/intel_driver.h
@@ -190,6 +190,7 @@
 #define PCI_CHIP_IVYBRIDGE_D_GT1	0x0152
 #define PCI_CHIP_IVYBRIDGE_D_GT2	0x0162
 #define PCI_CHIP_IVYBRIDGE_S_GT1	0x015a
+#define PCI_CHIP_IVYBRIDGE_S_GT2	0x016a
 
 #endif
 
diff --git a/src/intel_module.c b/src/intel_module.c
index 66343f2..25e6fd9 100644
--- a/src/intel_module.c
+++ b/src/intel_module.c
@@ -143,6 +143,7 @@ static const SymTabRec _intel_chipsets[] = {
 	{PCI_CHIP_IVYBRIDGE_D_GT1,		"Ivybridge Desktop (GT1)" },
 	{PCI_CHIP_IVYBRIDGE_D_GT2,		"Ivybridge Desktop (GT2)" },
 	{PCI_CHIP_IVYBRIDGE_S_GT1,		"Ivybridge Server" },
+	{PCI_CHIP_IVYBRIDGE_S_GT2,		"Ivybridge Server (GT2)" },
 	{-1,					NULL}
 };
 #define NUM_CHIPSETS (sizeof(_intel_chipsets) / sizeof(_intel_chipsets[0]))
@@ -211,6 +212,7 @@ static const struct pci_id_match intel_device_match[] = {
 	INTEL_DEVICE_MATCH (PCI_CHIP_IVYBRIDGE_D_GT1, &intel_ivybridge_info ),
 	INTEL_DEVICE_MATCH (PCI_CHIP_IVYBRIDGE_D_GT2, &intel_ivybridge_info ),
 	INTEL_DEVICE_MATCH (PCI_CHIP_IVYBRIDGE_S_GT1, &intel_ivybridge_info ),
+	INTEL_DEVICE_MATCH (PCI_CHIP_IVYBRIDGE_S_GT2, &intel_ivybridge_info ),
 
 	{ 0, 0, 0 },
 };
commit d686c767382db7d2abd0f91490ae9c72577ef944
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Mar 28 18:59:26 2012 +0100

    sna: Add video sprite support for ILK+
    
    Based on the work by Jesse Barnes.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/Makefile.am b/src/sna/Makefile.am
index 70afd53..911a857 100644
--- a/src/sna/Makefile.am
+++ b/src/sna/Makefile.am
@@ -60,6 +60,7 @@ libsna_la_SOURCES = \
 	sna_video.c \
 	sna_video.h \
 	sna_video_overlay.c \
+	sna_video_sprite.c \
 	sna_video_textured.c \
 	gen2_render.c \
 	gen2_render.h \
diff --git a/src/sna/sna.h b/src/sna/sna.h
index 949c18f..8ad8efc 100644
--- a/src/sna/sna.h
+++ b/src/sna/sna.h
@@ -371,6 +371,7 @@ void sna_dri_close(struct sna *sna, ScreenPtr pScreen);
 
 extern Bool sna_crtc_on(xf86CrtcPtr crtc);
 int sna_crtc_to_pipe(xf86CrtcPtr crtc);
+int sna_crtc_to_plane(xf86CrtcPtr crtc);
 
 CARD32 sna_format_for_depth(int depth);
 CARD32 sna_render_format_for_depth(int depth);
diff --git a/src/sna/sna_display.c b/src/sna/sna_display.c
index a5d69dd..ecfe670 100644
--- a/src/sna/sna_display.c
+++ b/src/sna/sna_display.c
@@ -60,6 +60,7 @@ struct sna_crtc {
 	int num;
 	int id;
 	int pipe;
+	int plane;
 	int active;
 	struct list link;
 };
@@ -144,6 +145,12 @@ int sna_crtc_to_pipe(xf86CrtcPtr crtc)
 	return sna_crtc->pipe;
 }
 
+int sna_crtc_to_plane(xf86CrtcPtr crtc)
+{
+	struct sna_crtc *sna_crtc = crtc->driver_private;
+	return sna_crtc->plane;
+}
+
 static uint32_t gem_create(int fd, int size)
 {
 	struct drm_i915_gem_create create;
@@ -852,6 +859,37 @@ static const xf86CrtcFuncsRec sna_crtc_funcs = {
 	.destroy = sna_crtc_destroy,
 };
 
+static uint32_t
+sna_crtc_find_plane(struct sna *sna, int pipe)
+{
+	drmModePlaneRes *resources;
+	uint32_t id = 0;
+	int i;
+
+	resources = drmModeGetPlaneResources(sna->kgem.fd);
+	if (!resources) {
+		xf86DrvMsg(sna->scrn->scrnIndex, X_ERROR,
+			   "failed to get plane resources: %s\n",
+			   strerror(errno));
+		return 0;
+	}
+
+	for (i = 0; id == 0 && i < resources->count_planes; i++) {
+		drmModePlane *p;
+
+		p = drmModeGetPlane(sna->kgem.fd, resources->planes[i]);
+		if (p) {
+			if (p->possible_crtcs & (1 << pipe))
+				id = p->plane_id;
+
+			drmModeFreePlane(p);
+		}
+	}
+
+	free(resources);
+	return id;
+}
+
 static void
 sna_crtc_init(ScrnInfoPtr scrn, struct sna_mode *mode, int num)
 {
@@ -878,6 +916,7 @@ sna_crtc_init(ScrnInfoPtr scrn, struct sna_mode *mode, int num)
 		 DRM_IOCTL_I915_GET_PIPE_FROM_CRTC_ID,
 		 &get_pipe);
 	sna_crtc->pipe = get_pipe.pipe;
+	sna_crtc->plane = sna_crtc_find_plane(sna, sna_crtc->pipe);
 
 	if (xf86IsEntityShared(scrn->entityList[0]) &&
 	    scrn->confScreen->device->screen != sna_crtc->pipe) {
diff --git a/src/sna/sna_video.c b/src/sna/sna_video.c
index 56cf260..c80ccfb 100644
--- a/src/sna/sna_video.c
+++ b/src/sna/sna_video.c
@@ -83,12 +83,17 @@ void sna_video_free_buffers(struct sna *sna, struct sna_video *video)
 
 	for (i = 0; i < ARRAY_SIZE(video->old_buf); i++) {
 		if (video->old_buf[i]) {
+			if (video->old_buf[i]->unique_id)
+				drmModeRmFB(sna->kgem.fd,
+						video->old_buf[i]->unique_id);
 			kgem_bo_destroy(&sna->kgem, video->old_buf[i]);
 			video->old_buf[i] = NULL;
 		}
 	}
 
 	if (video->buf) {
+		if (video->buf->unique_id)
+			drmModeRmFB(sna->kgem.fd, video->buf->unique_id);
 		kgem_bo_destroy(&sna->kgem, video->buf);
 		video->buf = NULL;
 	}
@@ -440,6 +445,11 @@ sna_video_copy_data(struct sna *sna,
 	if (frame->bo == NULL)
 		return FALSE;
 
+	DBG(("%s: handle=%d, size=%dx%d, rotation=%d\n",
+	     __FUNCTION__, frame->bo->handle, frame->width, frame->height,
+	     video->rotation));
+	DBG(("%s: top=%d, left=%d\n", __FUNCTION__, frame->top, frame->left));
+
 	/* In the common case, we can simply do the upload in a single pwrite */
 	if (video->rotation == RR_Rotate_0) {
 		if (is_planar_fourcc(frame->id)) {
@@ -472,8 +482,8 @@ sna_video_copy_data(struct sna *sna,
 		}
 	}
 
-	/* copy data */
-	dst = kgem_bo_map(&sna->kgem, frame->bo);
+	/* copy data, must use GTT so that we keep the overlay uncached */
+	dst = kgem_bo_map__gtt(&sna->kgem, frame->bo);
 	if (dst == NULL)
 		return FALSE;
 
@@ -510,7 +520,9 @@ void sna_video_init(struct sna *sna, ScreenPtr screen)
 	 * supported hardware.
 	 */
 	textured = sna_video_textured_setup(sna, screen);
-	overlay = sna_video_overlay_setup(sna, screen);
+	overlay = sna_video_sprite_setup(sna, screen);
+	if (overlay == NULL)
+		overlay = sna_video_overlay_setup(sna, screen);
 
 	if (overlay && prefer_overlay)
 		adaptors[num_adaptors++] = overlay;
diff --git a/src/sna/sna_video.h b/src/sna/sna_video.h
index 47ddab0..687fbe1 100644
--- a/src/sna/sna_video.h
+++ b/src/sna/sna_video.h
@@ -51,6 +51,7 @@ struct sna_video {
 	uint32_t gamma5;
 
 	int color_key;
+	int color_key_changed;
 
 	/** YUV data buffers */
 	struct kgem_bo *old_buf[2];
@@ -58,6 +59,7 @@ struct sna_video {
 
 	Bool textured;
 	Rotation rotation;
+	int plane;
 
 	int SyncToVblank;	/* -1: auto, 0: off, 1: on */
 };
@@ -78,10 +80,9 @@ struct sna_video_frame {
 };
 
 void sna_video_init(struct sna *sna, ScreenPtr screen);
-XF86VideoAdaptorPtr sna_video_overlay_setup(struct sna *sna,
-					    ScreenPtr screen);
-XF86VideoAdaptorPtr sna_video_textured_setup(struct sna *sna,
-					     ScreenPtr screen);
+XF86VideoAdaptorPtr sna_video_overlay_setup(struct sna *sna, ScreenPtr screen);
+XF86VideoAdaptorPtr sna_video_sprite_setup(struct sna *sna, ScreenPtr screen);
+XF86VideoAdaptorPtr sna_video_textured_setup(struct sna *sna, ScreenPtr screen);
 
 #define FOURCC_XVMC     (('C' << 24) + ('M' << 16) + ('V' << 8) + 'X')
 
diff --git a/src/sna/sna_video_sprite.c b/src/sna/sna_video_sprite.c
new file mode 100644
index 0000000..82db122
--- /dev/null
+++ b/src/sna/sna_video_sprite.c
@@ -0,0 +1,434 @@
+/***************************************************************************
+
+ Copyright 2000-2011 Intel Corporation.  All Rights Reserved.
+
+ Permission is hereby granted, free of charge, to any person obtaining a
+ copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sub license, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial portions
+ of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ IN NO EVENT SHALL INTEL, AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "sna.h"
+#include "sna_video.h"
+
+#include <xf86xv.h>
+#include <X11/extensions/Xv.h>
+#include <fourcc.h>
+#include <drm_fourcc.h>
+#include <i915_drm.h>
+
+#if DEBUG_VIDEO_OVERLAY
+#undef DBG
+#define DBG(x) ErrorF x
+#endif
+
+#define IMAGE_MAX_WIDTH		2048
+#define IMAGE_MAX_HEIGHT	2048
+
+#define MAKE_ATOM(a) MakeAtom(a, sizeof(a) - 1, TRUE)
+
+static Atom xvColorKey;
+
+static XF86VideoFormatRec xv_formats[] = {
+	{15, TrueColor}, {16, TrueColor}, {24, TrueColor}
+};
+static XF86ImageRec xv_images[] = { XVIMAGE_YUY2, XVIMAGE_UYVY, };
+static const XF86VideoEncodingRec xv_dummy_encoding[] = {
+	{ 0, "XV_IMAGE", IMAGE_MAX_WIDTH, IMAGE_MAX_HEIGHT, {1, 1} }
+};
+static XF86AttributeRec attribs[] = {
+	{XvSettable | XvGettable, 0, 0xffffff, "XV_COLORKEY"},
+};
+
+static void sna_video_sprite_off(struct sna *sna, struct sna_video *video)
+{
+	if (video->plane == 0)
+		return;
+
+	if (drmModeSetPlane(sna->kgem.fd,
+			    video->plane, 0, 0, 0,
+			    0, 0, 0, 0,
+			    0, 0, 0, 0))
+		xf86DrvMsg(sna->scrn->scrnIndex, X_ERROR,
+			   "failed to disable plane\n");
+
+	video->plane = 0;
+}
+
+static void sna_video_sprite_stop(ScrnInfoPtr scrn, pointer data, Bool shutdown)
+{
+	return sna_video_sprite_off(to_sna(scrn), data);
+}
+
+static int sna_video_sprite_set_attr(ScrnInfoPtr scrn,
+				     Atom attribute, INT32 value,
+				     pointer data)
+{
+	struct sna_video *video = data;
+
+	if (attribute == xvColorKey) {
+		video->color_key_changed = TRUE;
+		video->color_key = value;
+		DBG(("COLORKEY = %d\n", value));
+	} else
+		return BadMatch;
+
+	return Success;
+}
+
+static int sna_video_sprite_get_attr(ScrnInfoPtr scrn,
+				     Atom attribute, INT32 *value,
+				     pointer data)
+{
+	struct sna_video *video = data;
+
+	if (attribute == xvColorKey)
+		*value = video->color_key;
+	else
+		return BadMatch;
+
+	return Success;
+}
+
+static void sna_video_sprite_best_size(ScrnInfoPtr scrn, Bool motion,
+				       short vid_w, short vid_h,
+				       short drw_w, short drw_h,
+				       unsigned int *p_w, unsigned int *p_h,
+				       pointer data)
+{
+	*p_w = vid_w;
+	*p_h = vid_h;
+}
+
+static void
+update_dst_box_to_crtc_coords(struct sna *sna, xf86CrtcPtr crtc, BoxPtr dstBox)
+{
+	ScrnInfoPtr scrn = sna->scrn;
+	int tmp;
+
+	switch (crtc->rotation & 0xf) {
+	case RR_Rotate_0:
+		dstBox->x1 -= crtc->x;
+		dstBox->x2 -= crtc->x;
+		dstBox->y1 -= crtc->y;
+		dstBox->y2 -= crtc->y;
+		break;
+
+	case RR_Rotate_90:
+		tmp = dstBox->x1;
+		dstBox->x1 = dstBox->y1 - crtc->x;
+		dstBox->y1 = scrn->virtualX - tmp - crtc->y;
+		tmp = dstBox->x2;
+		dstBox->x2 = dstBox->y2 - crtc->x;
+		dstBox->y2 = scrn->virtualX - tmp - crtc->y;
+		tmp = dstBox->y1;
+		dstBox->y1 = dstBox->y2;
+		dstBox->y2 = tmp;
+		break;
+
+	case RR_Rotate_180:
+		tmp = dstBox->x1;
+		dstBox->x1 = scrn->virtualX - dstBox->x2 - crtc->x;
+		dstBox->x2 = scrn->virtualX - tmp - crtc->x;
+		tmp = dstBox->y1;
+		dstBox->y1 = scrn->virtualY - dstBox->y2 - crtc->y;
+		dstBox->y2 = scrn->virtualY - tmp - crtc->y;
+		break;
+
+	case RR_Rotate_270:
+		tmp = dstBox->x1;
+		dstBox->x1 = scrn->virtualY - dstBox->y1 - crtc->x;
+		dstBox->y1 = tmp - crtc->y;
+		tmp = dstBox->x2;
+		dstBox->x2 = scrn->virtualY - dstBox->y2 - crtc->x;
+		dstBox->y2 = tmp - crtc->y;
+		tmp = dstBox->x1;
+		dstBox->x1 = dstBox->x2;
+		dstBox->x2 = tmp;
+		break;
+	}
+}
+
+static Bool
+sna_video_sprite_show(struct sna *sna,
+		      struct sna_video *video,
+		      struct sna_video_frame *frame,
+		      xf86CrtcPtr crtc,
+		      BoxPtr dstBox)
+{
+	int plane = sna_crtc_to_plane(crtc);
+
+	update_dst_box_to_crtc_coords(sna, crtc, dstBox);
+	if (crtc->rotation & (RR_Rotate_90 | RR_Rotate_270)) {
+		int tmp = frame->width;
+		frame->width = frame->height;
+		frame->height = tmp;
+	}
+
+#if defined(DRM_I915_SET_SPRITE_DESTKEY)
+	if (video->color_key_changed || video->plane != plane) {
+		struct drm_intel_set_sprite_destkey set;
+
+		DBG(("%s: updating color key: %x\n",
+		     __FUNCTION__, video->color_key));
+
+		set.plane_id = plane;
+		set.value = video->color_key;
+
+		if (drmCommandWrite(sna->kgem.fd,
+				    DRM_I915_SET_SPRITE_DESTKEY,
+				    &set, sizeof(set)))
+			xf86DrvMsg(sna->scrn->scrnIndex, X_ERROR,
+				   "failed to update color key\n");
+
+		video->color_key_changed = FALSE;
+	}
+#endif
+
+	if (frame->bo->unique_id == 0) {
+		uint32_t offsets[4], pitches[4], handles[4];
+		uint32_t pixel_format;
+
+		switch (frame->id) {
+		case FOURCC_UYVY:
+			pixel_format = DRM_FORMAT_UYVY;
+			break;
+		case FOURCC_YUY2:
+		default:
+			pixel_format = DRM_FORMAT_YUYV;
+			break;
+		}
+
+		handles[0] = frame->bo->handle;
+		pitches[0] = frame->pitch[0];
+		offsets[0] = 0;
+
+		DBG(("%s: creating new fb for handle=%d\n",
+		     __FUNCTION__, frame->bo->handle));
+
+		if (drmModeAddFB2(sna->kgem.fd,
+				  frame->width, frame->height, pixel_format,
+				  handles, pitches, offsets,
+				  &frame->bo->unique_id, 0)) {
+			xf86DrvMsg(sna->scrn->scrnIndex,
+				   X_ERROR, "failed to add fb\n");
+			return false;
+		}
+	}
+
+	DBG(("%s: updating plane=%d, handle=%d [fb %d], dst=(%d,%d)x(%d,%d)\n",
+	     __FUNCTION__, plane, frame->bo->handle, frame->bo->unique_id,
+	     dstBox->x1, dstBox->y1,
+	     dstBox->x2 - dstBox->x1, dstBox->y2 - dstBox->y1));
+	if (drmModeSetPlane(sna->kgem.fd,
+			    plane, sna_crtc_id(crtc), frame->bo->unique_id, 0,
+			    dstBox->x1, dstBox->y1,
+			    dstBox->x2 - dstBox->x1, dstBox->y2 - dstBox->y1,
+			    0, 0, frame->width << 16, frame->height << 16))
+		return false;
+
+	video->plane = plane;
+	return true;
+}
+
+static int sna_video_sprite_put_image(ScrnInfoPtr scrn,
+				      short src_x, short src_y,
+				      short drw_x, short drw_y,
+				      short src_w, short src_h,
+				      short drw_w, short drw_h,
+				      int id, unsigned char *buf,
+				      short width, short height,
+				      Bool sync, RegionPtr clip, pointer data,
+				      DrawablePtr drawable)
+{
+	struct sna *sna = to_sna(scrn);
+	struct sna_video *video = data;
+	struct sna_video_frame frame;
+	xf86CrtcPtr crtc;
+	BoxRec dst_box;
+
+	sna_video_frame_init(sna, video, id, width, height, &frame);
+
+	if (!sna_video_clip_helper(scrn, video, &frame, &crtc, &dst_box,
+				   src_x, src_y, drw_x, drw_y,
+				   src_w, src_h, drw_w, drw_h,
+				   clip))
+		return Success;
+
+	if (!crtc || !sna_crtc_to_plane(crtc)) {
+		/* If the video isn't visible on any CRTC, turn it off */
+		sna_video_sprite_off(sna, video);
+		return Success;
+	}
+
+	/* sprites can't handle rotation natively, store it for the copy func */
+	video->rotation = crtc->rotation;
+
+	frame.bo = sna_video_buffer(sna, video, &frame);
+	if (frame.bo == NULL) {
+		DBG(("%s: failed to allocate video bo\n", __FUNCTION__));
+		return BadAlloc;
+	}
+
+	if (!sna_video_copy_data(sna, video, &frame, buf)) {
+		DBG(("%s: failed to copy video data\n", __FUNCTION__));
+		return BadAlloc;
+	}
+
+	if (!sna_video_sprite_show(sna, video, &frame, crtc, &dst_box)) {
+		DBG(("%s: failed to show video frame\n", __FUNCTION__));
+		return BadAlloc;
+	}
+
+	sna_video_buffer_fini(sna, video);
+
+	if (!REGION_EQUAL(scrn->pScreen, &video->clip, clip)) {
+		REGION_COPY(scrn->pScreen, &video->clip, clip);
+		xf86XVFillKeyHelperDrawable(drawable, video->color_key, clip);
+	}
+
+	return Success;
+}
+
+static int sna_video_sprite_query_attrs(ScrnInfoPtr scrn, int id,
+					unsigned short *w, unsigned short *h,
+					int *pitches, int *offsets)
+{
+	int size;
+
+	if (*w > IMAGE_MAX_WIDTH)
+		*w = IMAGE_MAX_WIDTH;
+	if (*h > IMAGE_MAX_HEIGHT)
+		*h = IMAGE_MAX_HEIGHT;
+
+	*w = (*w + 1) & ~1;
+	if (offsets)
+		offsets[0] = 0;
+
+	switch (id) {
+	case FOURCC_YUY2:
+	default:
+		size = *w << 1;
+		if (pitches)
+			pitches[0] = size;
+		size *= *h;
+		break;
+	}
+
+	return size;
+}
+
+static int sna_video_sprite_color_key(struct sna *sna)
+{
+	ScrnInfoPtr scrn = sna->scrn;
+	int color_key;
+
+	if (xf86GetOptValInteger(sna->Options, OPTION_VIDEO_KEY,
+				 &color_key)) {
+	} else if (xf86GetOptValInteger(sna->Options, OPTION_COLOR_KEY,
+					&color_key)) {
+	} else {
+		color_key =
+		    (1 << scrn->offset.red) |
+		    (1 << scrn->offset.green) |
+		    (((scrn->mask.blue >> scrn->offset.blue) - 1) << scrn->offset.blue);
+	}
+
+	return color_key & ((1 << scrn->depth) - 1);
+}
+
+XF86VideoAdaptorPtr sna_video_sprite_setup(struct sna *sna,
+					   ScreenPtr screen)
+{
+	XF86VideoAdaptorPtr adaptor;
+	struct sna_video *video;
+	drmModePlaneRes *plane_resources;
+
+	plane_resources = drmModeGetPlaneResources(sna->kgem.fd);
+	if (!plane_resources)
+		return NULL;
+
+	adaptor = calloc(1,
+			 sizeof(XF86VideoAdaptorRec) +
+			 sizeof(struct sna_video) +
+			 sizeof(DevUnion));
+	if (!adaptor) {
+		free(plane_resources);
+		return NULL;
+	}
+
+	adaptor->type = XvWindowMask | XvInputMask | XvImageMask;
+	adaptor->flags = VIDEO_OVERLAID_IMAGES /*| VIDEO_CLIP_TO_VIEWPORT */ ;
+	adaptor->name = "Intel(R) Video Sprite";
+	adaptor->nEncodings = ARRAY_SIZE(xv_dummy_encoding);
+	adaptor->pEncodings = xnfalloc(sizeof(xv_dummy_encoding));
+	memcpy(adaptor->pEncodings, xv_dummy_encoding, sizeof(xv_dummy_encoding));
+	adaptor->nFormats = ARRAY_SIZE(xv_formats);
+	adaptor->pFormats = xv_formats;
+	adaptor->nPorts = 1;
+	adaptor->pPortPrivates = (DevUnion *)&adaptor[1];
+
+	video = (struct sna_video *)&adaptor->pPortPrivates[1];
+	adaptor->pPortPrivates[0].ptr = video;
+
+	adaptor->nAttributes = ARRAY_SIZE(attribs);
+	adaptor->pAttributes = attribs;
+
+	adaptor->nImages = ARRAY_SIZE(xv_images);
+	adaptor->pImages = xv_images;
+
+	adaptor->PutVideo = NULL;
+	adaptor->PutStill = NULL;
+	adaptor->GetVideo = NULL;
+	adaptor->GetStill = NULL;
+	adaptor->StopVideo = sna_video_sprite_stop;
+	adaptor->SetPortAttribute = sna_video_sprite_set_attr;
+	adaptor->GetPortAttribute = sna_video_sprite_get_attr;
+	adaptor->QueryBestSize = sna_video_sprite_best_size;
+	adaptor->PutImage = sna_video_sprite_put_image;
+	adaptor->QueryImageAttributes = sna_video_sprite_query_attrs;
+
+	video->textured = FALSE;
+	video->color_key = sna_video_sprite_color_key(sna);
+	video->color_key_changed = TRUE;
+	video->brightness = -19;	/* (255/219) * -16 */
+	video->contrast = 75;	/* 255/219 * 64 */
+	video->saturation = 146;	/* 128/112 * 128 */
+	video->desired_crtc = NULL;
+	video->gamma5 = 0xc0c0c0;
+	video->gamma4 = 0x808080;
+	video->gamma3 = 0x404040;
+	video->gamma2 = 0x202020;
+	video->gamma1 = 0x101010;
+	video->gamma0 = 0x080808;
+
+	video->rotation = RR_Rotate_0;
+
+	REGION_NULL(screen, &video->clip);
+
+	xvColorKey = MAKE_ATOM("XV_COLORKEY");
+
+	free(plane_resources);
+
+	return adaptor;
+}
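
At its core the sprite path wraps the YUV bo in a DRM framebuffer once and
then points the plane at it with a single call; a condensed sketch of that
sequence (error handling elided, the function name is hypothetical; source
coordinates are in 16.16 fixed point, as drmModeSetPlane() expects):

    /* Sketch: display one packed-YUV frame on an overlay plane. */
    static bool show_frame_on_plane(int fd, uint32_t plane_id, uint32_t crtc_id,
                                    uint32_t bo_handle, uint32_t pitch,
                                    int width, int height,
                                    const BoxRec *dst, uint32_t *fb_id)
    {
        uint32_t handles[4] = { bo_handle };
        uint32_t pitches[4] = { pitch };
        uint32_t offsets[4] = { 0 };

        /* Wrap the bo in a framebuffer once; reuse it thereafter. */
        if (*fb_id == 0 &&
            drmModeAddFB2(fd, width, height, DRM_FORMAT_YUYV,
                          handles, pitches, offsets, fb_id, 0))
            return false;

        /* crtc coordinates are integers, source coordinates are 16.16 */
        return drmModeSetPlane(fd, plane_id, crtc_id, *fb_id, 0,
                               dst->x1, dst->y1,
                               dst->x2 - dst->x1, dst->y2 - dst->y1,
                               0, 0, width << 16, height << 16) == 0;
    }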
commit 0286994c3f6a173d87e06ef16a50ae33768f8b57
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Mar 28 22:07:10 2012 +0100

    sna: Fix up 32-bit overflow for maximum object size calculation
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 964c6e9..27fa165 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -84,7 +84,7 @@ search_linear_cache(struct kgem *kgem, unsigned int num_pages, unsigned flags);
 #if defined(USE_VMAP) && !defined(I915_PARAM_HAS_VMAP)
 #define DRM_I915_GEM_VMAP       0x2c
 #define DRM_IOCTL_I915_GEM_VMAP DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_GEM_VMAP, struct drm_i915_gem_vmap)
-#define I915_PARAM_HAS_VMAP              18
+#define I915_PARAM_HAS_VMAP              19
 struct drm_i915_gem_vmap {
 	uint64_t user_ptr;
 	uint32_t user_size;
@@ -690,7 +690,7 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, int gen)
 		 * disable dual-stream mode */
 		kgem->min_alignment = 64;
 
-	kgem->max_object_size = 2 * kgem->aperture_total / 3;
+	kgem->max_object_size = 2 * aperture.aper_size / 3;
 	kgem->max_gpu_size = kgem->max_object_size;
 	if (!kgem->has_llc)
 		kgem->max_gpu_size = MAX_CACHE_SIZE;
@@ -741,9 +741,11 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, int gen)
 	} else
 		kgem->max_cpu_size = 0;
 
+	DBG(("%s: maximum object size=%d\n",
+	     __FUNCTION__, kgem->max_object_size));
 	DBG(("%s: large object threshold=%d\n",
 	     __FUNCTION__, kgem->large_object_size));
-	DBG(("%s: max object size (gpu=%d, cpu=%d, tile upload=%d, copy=%d)\n",
+	DBG(("%s: max object sizes (gpu=%d, cpu=%d, tile upload=%d, copy=%d)\n",
 	     __FUNCTION__,
 	     kgem->max_gpu_size, kgem->max_cpu_size,
 	     kgem->max_upload_tile_size, kgem->max_copy_tile_size));
@@ -2392,11 +2394,16 @@ unsigned kgem_can_create_2d(struct kgem *kgem,
 	uint32_t pitch, size;
 	unsigned flags = 0;
 
-	if (depth < 8)
+	if (depth < 8) {
+		DBG(("%s: unhandled depth %d\n", __FUNCTION__, depth));
 		return 0;
+	}
 
-	if (width > MAXSHORT || height > MAXSHORT)
+	if (width > MAXSHORT || height > MAXSHORT) {
+		DBG(("%s: unhandled size %dx%d\n",
+		     __FUNCTION__, width, height));
 		return 0;
+	}
 
 	size = kgem_surface_size(kgem, false, false,
 				 width, height, bpp,
@@ -2405,8 +2412,11 @@ unsigned kgem_can_create_2d(struct kgem *kgem,
 		flags |= KGEM_CAN_CREATE_CPU | KGEM_CAN_CREATE_GPU;
 	if (size > kgem->large_object_size)
 		flags |= KGEM_CAN_CREATE_LARGE;
-	if (size > kgem->max_object_size)
+	if (size > kgem->max_object_size) {
+		DBG(("%s: too large (untiled) %d > %d\n",
+		     __FUNCTION__, size, kgem->max_object_size));
 		return 0;
+	}
 
 	size = kgem_surface_size(kgem, false, false,
 				 width, height, bpp,
@@ -2417,8 +2427,11 @@ unsigned kgem_can_create_2d(struct kgem *kgem,
 		flags |= KGEM_CAN_CREATE_GPU;
 	if (size > kgem->large_object_size)
 		flags |= KGEM_CAN_CREATE_LARGE;
-	if (size > kgem->max_object_size)
+	if (size > kgem->max_object_size) {
+		DBG(("%s: too large (tiled) %d > %d\n",
+		     __FUNCTION__, size, kgem->max_object_size));
 		return 0;
+	}
 
 	return flags;
 }
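
The overflow being fixed is the classic 2*x/3 pattern: if the aperture size
is held in a 32-bit integer, the intermediate 2*x wraps once the aperture
reaches 2GiB. Multiplying on the 64-bit aper_size field first avoids it; a
minimal illustration (the types here are assumed for the sake of example):

    uint32_t aperture_total = 3u << 30;                /* 3 GiB aperture   */
    uint32_t bad  = 2 * aperture_total / 3;            /* wraps: ~0.67 GiB */
    uint64_t good = 2 * (uint64_t)aperture_total / 3;  /* 2 GiB, correct   */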
commit 47a610fa29164549bb806989f32e1f65676412a8
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Mar 27 10:42:59 2012 +0100

    sna/traps: Prefer to try mono spans on the GPU before trying inplace CPU
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_trapezoids.c b/src/sna/sna_trapezoids.c
index be46789..4067757 100644
--- a/src/sna/sna_trapezoids.c
+++ b/src/sna/sna_trapezoids.c
@@ -3092,12 +3092,14 @@ composite_unaligned_boxes(struct sna *sna,
 
 	switch (op) {
 	case PictOpAdd:
+	case PictOpOver:
 		if (priv->clear && priv->clear_color == 0)
 			op = PictOpSrc;
 		break;
 	case PictOpIn:
 		if (priv->clear && priv->clear_color == 0)
 			return true;
+		break;
 	}
 
 	memset(&tmp, 0, sizeof(tmp));
@@ -4479,7 +4481,7 @@ sna_composite_trapezoids(CARD8 op,
 
 	/* scan through for fast rectangles */
 	rectilinear = pixel_aligned = true;
-	if (maskFormat ? maskFormat->depth == 1 : dst->polyEdge == PolyEdgeSharp) {
+	if (is_mono(dst, maskFormat)) {
 		for (n = 0; n < ntrap && rectilinear; n++) {
 			int lx1 = pixman_fixed_to_int(traps[n].left.p1.x + pixman_fixed_1_minus_e/2);
 			int lx2 = pixman_fixed_to_int(traps[n].left.p2.x + pixman_fixed_1_minus_e/2);
@@ -4531,6 +4533,13 @@ sna_composite_trapezoids(CARD8 op,
 		}
 		flags |= COMPOSITE_SPANS_RECTILINEAR;
 	}
+
+	if (is_mono(dst, maskFormat) &&
+	    mono_trapezoids_span_converter(op, src, dst,
+					   xSrc, ySrc,
+					   ntrap, traps))
+		return;
+
 	if (trapezoid_spans_maybe_inplace(op, src, dst, maskFormat)) {
 		flags |= COMPOSITE_SPANS_INPLACE_HINT;
 		if (trapezoid_span_inplace(op, src, dst, maskFormat,
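
The is_mono() predicate used above merely names the test the old code
spelled out inline (see the removed line in the earlier hunk); presumably
something along the lines of:

    /* Sketch: trapezoids are rendered mono (sharp, unantialiased) when
     * the mask is 1bpp, or, lacking a mask, when the destination asks
     * for sharp polygon edges. */
    static inline bool is_mono(PicturePtr dst, PictFormatPtr mask)
    {
        return mask ? mask->depth == 1 : dst->polyEdge == PolyEdgeSharp;
    }

Note also that the first hunk quietly fixes an unintended switch
fall-through by adding the missing break after the PictOpIn case.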
commit 12122f6fafd6c759f14bbb6564ef97818e6ba0db
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Mar 27 10:42:21 2012 +0100

    sna/traps: Fix the width of the left-hand edge of an unaligned box
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_trapezoids.c b/src/sna/sna_trapezoids.c
index 9ab5ae2..be46789 100644
--- a/src/sna/sna_trapezoids.c
+++ b/src/sna/sna_trapezoids.c
@@ -2759,7 +2759,7 @@ composite_unaligned_trap_row(struct sna *sna,
 	} else {
 		if (pixman_fixed_frac(trap->left.p1.x)) {
 			box.x1 = x1;
-			box.x2 = x1++;
+			box.x2 = ++x1;
 
 			opacity = covered;
 			opacity *= SAMPLES_X - grid_coverage(SAMPLES_X, trap->left.p1.x);
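
The one-character fix swaps a post-increment for a pre-increment: with
box.x2 = x1++ the box receives the old value of x1 and ends up zero pixels
wide, so the partial left-hand edge was never drawn. A minimal illustration:

    int x1 = 5;
    BoxRec box;

    box.x1 = x1;
    box.x2 = x1++;   /* old bug: box is (5,5), zero width     */

    x1 = 5;
    box.x1 = x1;
    box.x2 = ++x1;   /* fixed:   box is (5,6), one pixel wide */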
commit 2194925837d29674cbb553b9e5f86ccd57f336f6
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Mar 27 10:41:37 2012 +0100

    sna/gen6: Reduce opaque solid OVER to SRC for render composite
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/gen6_render.c b/src/sna/gen6_render.c
index 6f1b55a..3be9195 100644
--- a/src/sna/gen6_render.c
+++ b/src/sna/gen6_render.c
@@ -2033,6 +2033,7 @@ gen6_composite_solid_init(struct sna *sna,
 	channel->repeat = RepeatNormal;
 	channel->is_affine = TRUE;
 	channel->is_solid  = TRUE;
+	channel->is_opaque = (color >> 24) == 0xff;
 	channel->transform = NULL;
 	channel->width  = 1;
 	channel->height = 1;
@@ -2251,7 +2252,7 @@ gen6_composite_picture(struct sna *sna,
 				    x, y, w, h, dst_x, dst_y);
 }
 
-static void gen6_composite_channel_convert(struct sna_composite_channel *channel)
+inline static void gen6_composite_channel_convert(struct sna_composite_channel *channel)
 {
 	channel->repeat = gen6_repeat(channel->repeat);
 	channel->filter = gen6_filter(channel->filter);
@@ -2735,6 +2736,8 @@ gen6_render_composite(struct sna *sna,
 			DBG(("%s: choosing gen6_emit_composite_primitive_solid\n",
 			     __FUNCTION__));
 			tmp->prim_emit = gen6_emit_composite_primitive_solid;
+			if (tmp->src.is_opaque && op == PictOpOver)
+				tmp->op = PictOpSrc;
 		} else if (tmp->src.transform == NULL) {
 			DBG(("%s: choosing gen6_emit_composite_primitive_identity_source\n",
 			     __FUNCTION__));
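
The reduction follows directly from the Porter-Duff definition of OVER on
premultiplied pixels:

    OVER:  dst' = src + (1 - alpha_src) * dst
    alpha_src = 1  =>  dst' = src   (OVER degenerates to SRC)

so for an opaque solid the destination term vanishes and the hardware can
skip reading back the render target.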
commit 447535f2ab53de11a7e2e8ba073696b14dd93f87
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon Mar 26 16:03:42 2012 +0100

    sna: Remove bogus assertion of no references to in-flight upload buffers
    
    As we may hold a cached reference to an upload buffer whilst it is
    in-flight, the assertion that there are no such references to a buffer
    being reused is no longer true. Those cached references will be released
    as soon as we retire the buffer during the readback, and so we are free
    to reuse such an upload buffer for immediate readback.
    
    Reported-by: Jiri Slaby <jirislaby at gmail.com>
    References: https://bugs.freedesktop.org/show_bug.cgi?id=47597
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 26abdd0..964c6e9 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -3502,7 +3502,6 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 		if (flags == KGEM_BUFFER_LAST &&
 		    bo->write == KGEM_BUFFER_WRITE &&
 		    !bo->mmapped && size <= bytes(&bo->base)) {
-			assert(bo->base.refcnt == 1);
 			DBG(("%s: reusing write buffer for read of %d bytes? used=%d, total=%d\n",
 			     __FUNCTION__, size, bo->used, bytes(&bo->base)));
 			gem_write(kgem->fd, bo->base.handle,
commit ddccf5a50cf741709e6a59f13287e4206d8e757d
Author: Paulo Zanoni <paulo.r.zanoni at intel.com>
Date:   Tue Mar 20 11:53:21 2012 -0300

    Avoid duplicated code with intel_output_create_ranged_atom
    
    The same change is applied to both intel_display.c and sna_display.c.
    
    Signed-off-by: Paulo Zanoni <paulo.r.zanoni at intel.com>
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/intel_display.c b/src/intel_display.c
index 11d0e2b..abdc372 100644
--- a/src/intel_display.c
+++ b/src/intel_display.c
@@ -992,6 +992,33 @@ intel_property_ignore(drmModePropertyPtr prop)
 	return FALSE;
 }
 
+static void
+intel_output_create_ranged_atom(xf86OutputPtr output, Atom *atom,
+				const char *name, INT32 min, INT32 max,
+				uint64_t value, Bool immutable)
+{
+	int err;
+	INT32 atom_range[2];
+
+	atom_range[0] = min;
+	atom_range[1] = max;
+
+	*atom = MakeAtom(name, strlen(name), TRUE);
+
+	err = RRConfigureOutputProperty(output->randr_output, *atom, FALSE,
+					TRUE, immutable, 2, atom_range);
+	if (err != 0)
+		xf86DrvMsg(output->scrn->scrnIndex, X_ERROR,
+			   "RRConfigureOutputProperty error, %d\n", err);
+
+	err = RRChangeOutputProperty(output->randr_output, *atom, XA_INTEGER,
+				     32, PropModeReplace, 1, &value, FALSE,
+				     TRUE);
+	if (err != 0)
+		xf86DrvMsg(output->scrn->scrnIndex, X_ERROR,
+			   "RRChangeOutputProperty error, %d\n", err);
+}
+
 #define BACKLIGHT_NAME             "Backlight"
 #define BACKLIGHT_DEPRECATED_NAME  "BACKLIGHT"
 static Atom backlight_atom, backlight_deprecated_atom;
@@ -1031,30 +1058,18 @@ intel_output_create_resources(xf86OutputPtr output)
 		drmModePropertyPtr drmmode_prop = p->mode_prop;
 
 		if (drmmode_prop->flags & DRM_MODE_PROP_RANGE) {
-			INT32 range[2];
-
 			p->num_atoms = 1;
 			p->atoms = calloc(p->num_atoms, sizeof(Atom));
 			if (!p->atoms)
 				continue;
 
-			p->atoms[0] = MakeAtom(drmmode_prop->name, strlen(drmmode_prop->name), TRUE);
-			range[0] = drmmode_prop->values[0];
-			range[1] = drmmode_prop->values[1];
-			err = RRConfigureOutputProperty(output->randr_output, p->atoms[0],
-							FALSE, TRUE,
-							drmmode_prop->flags & DRM_MODE_PROP_IMMUTABLE ? TRUE : FALSE,
-							2, range);
-			if (err != 0) {
-				xf86DrvMsg(output->scrn->scrnIndex, X_ERROR,
-					   "RRConfigureOutputProperty error, %d\n", err);
-			}
-			err = RRChangeOutputProperty(output->randr_output, p->atoms[0],
-						     XA_INTEGER, 32, PropModeReplace, 1, &p->value, FALSE, TRUE);
-			if (err != 0) {
-				xf86DrvMsg(output->scrn->scrnIndex, X_ERROR,
-					   "RRChangeOutputProperty error, %d\n", err);
-			}
+			intel_output_create_ranged_atom(output, &p->atoms[0],
+							drmmode_prop->name,
+							drmmode_prop->values[0],
+							drmmode_prop->values[1],
+							p->value,
+							drmmode_prop->flags & DRM_MODE_PROP_IMMUTABLE ? TRUE : FALSE);
+
 		} else if (drmmode_prop->flags & DRM_MODE_PROP_ENUM) {
 			p->num_atoms = drmmode_prop->count_enums + 1;
 			p->atoms = calloc(p->num_atoms, sizeof(Atom));
@@ -1090,50 +1105,21 @@ intel_output_create_resources(xf86OutputPtr output)
 	}
 
 	if (intel_output->backlight_iface) {
-		INT32 data, backlight_range[2];
-
 		/* Set up the backlight property, which takes effect
 		 * immediately and accepts values only within the
 		 * backlight_range.
 		 */
-		backlight_atom = MakeAtom(BACKLIGHT_NAME, sizeof(BACKLIGHT_NAME) - 1, TRUE);
-		backlight_deprecated_atom = MakeAtom(BACKLIGHT_DEPRECATED_NAME,
-						     sizeof(BACKLIGHT_DEPRECATED_NAME) - 1, TRUE);
-
-		backlight_range[0] = 0;
-		backlight_range[1] = intel_output->backlight_max;
-		err = RRConfigureOutputProperty(output->randr_output,
-					       	backlight_atom,
-						FALSE, TRUE, FALSE,
-					       	2, backlight_range);
-		if (err != 0) {
-			xf86DrvMsg(output->scrn->scrnIndex, X_ERROR,
-				   "RRConfigureOutputProperty error, %d\n", err);
-		}
-		err = RRConfigureOutputProperty(output->randr_output,
-					       	backlight_deprecated_atom,
-						FALSE, TRUE, FALSE,
-					       	2, backlight_range);
-		if (err != 0) {
-			xf86DrvMsg(output->scrn->scrnIndex, X_ERROR,
-				   "RRConfigureOutputProperty error, %d\n", err);
-		}
-		/* Set the current value of the backlight property */
-		data = intel_output->backlight_active_level;
-		err = RRChangeOutputProperty(output->randr_output, backlight_atom,
-					     XA_INTEGER, 32, PropModeReplace, 1, &data,
-					     FALSE, TRUE);
-		if (err != 0) {
-			xf86DrvMsg(output->scrn->scrnIndex, X_ERROR,
-				   "RRChangeOutputProperty error, %d\n", err);
-		}
-		err = RRChangeOutputProperty(output->randr_output, backlight_deprecated_atom,
-					     XA_INTEGER, 32, PropModeReplace, 1, &data,
-					     FALSE, TRUE);
-		if (err != 0) {
-			xf86DrvMsg(output->scrn->scrnIndex, X_ERROR,
-				   "RRChangeOutputProperty error, %d\n", err);
-		}
+		intel_output_create_ranged_atom(output, &backlight_atom,
+					BACKLIGHT_NAME, 0,
+					intel_output->backlight_max,
+					intel_output->backlight_active_level,
+					FALSE);
+		intel_output_create_ranged_atom(output,
+					&backlight_deprecated_atom,
+					BACKLIGHT_DEPRECATED_NAME, 0,
+					intel_output->backlight_max,
+					intel_output->backlight_active_level,
+					FALSE);
 	}
 }
 
diff --git a/src/sna/sna_display.c b/src/sna/sna_display.c
index 9401ca4..a5d69dd 100644
--- a/src/sna/sna_display.c
+++ b/src/sna/sna_display.c
@@ -1205,6 +1205,33 @@ sna_property_ignore(drmModePropertyPtr prop)
 	return FALSE;
 }
 
+static void
+sna_output_create_ranged_atom(xf86OutputPtr output, Atom *atom,
+			      const char *name, INT32 min, INT32 max,
+			      uint64_t value, Bool immutable)
+{
+	int err;
+	INT32 atom_range[2];
+
+	atom_range[0] = min;
+	atom_range[1] = max;
+
+	*atom = MakeAtom(name, strlen(name), TRUE);
+
+	err = RRConfigureOutputProperty(output->randr_output, *atom, FALSE,
+					TRUE, immutable, 2, atom_range);
+	if (err != 0)
+		xf86DrvMsg(output->scrn->scrnIndex, X_ERROR,
+			   "RRConfigureOutputProperty error, %d\n", err);
+
+	err = RRChangeOutputProperty(output->randr_output, *atom, XA_INTEGER,
+				     32, PropModeReplace, 1, &value, FALSE,
+				     TRUE);
+	if (err != 0)
+		xf86DrvMsg(output->scrn->scrnIndex, X_ERROR,
+			   "RRChangeOutputProperty error, %d\n", err);
+}
+
 #define BACKLIGHT_NAME             "Backlight"
 #define BACKLIGHT_DEPRECATED_NAME  "BACKLIGHT"
 static Atom backlight_atom, backlight_deprecated_atom;
@@ -1244,30 +1271,18 @@ sna_output_create_resources(xf86OutputPtr output)
 		drmModePropertyPtr drmmode_prop = p->mode_prop;
 
 		if (drmmode_prop->flags & DRM_MODE_PROP_RANGE) {
-			INT32 range[2];
-
 			p->num_atoms = 1;
 			p->atoms = calloc(p->num_atoms, sizeof(Atom));
 			if (!p->atoms)
 				continue;
 
-			p->atoms[0] = MakeAtom(drmmode_prop->name, strlen(drmmode_prop->name), TRUE);
-			range[0] = drmmode_prop->values[0];
-			range[1] = drmmode_prop->values[1];
-			err = RRConfigureOutputProperty(output->randr_output, p->atoms[0],
-							FALSE, TRUE,
-							drmmode_prop->flags & DRM_MODE_PROP_IMMUTABLE ? TRUE : FALSE,
-							2, range);
-			if (err != 0) {
-				xf86DrvMsg(output->scrn->scrnIndex, X_ERROR,
-					   "RRConfigureOutputProperty error, %d\n", err);
-			}
-			err = RRChangeOutputProperty(output->randr_output, p->atoms[0],
-						     XA_INTEGER, 32, PropModeReplace, 1, &p->value, FALSE, TRUE);
-			if (err != 0) {
-				xf86DrvMsg(output->scrn->scrnIndex, X_ERROR,
-					   "RRChangeOutputProperty error, %d\n", err);
-			}
+			sna_output_create_ranged_atom(output, &p->atoms[0],
+						      drmmode_prop->name,
+						      drmmode_prop->values[0],
+						      drmmode_prop->values[1],
+						      p->value,
+						      drmmode_prop->flags & DRM_MODE_PROP_IMMUTABLE ? TRUE : FALSE);
+
 		} else if (drmmode_prop->flags & DRM_MODE_PROP_ENUM) {
 			p->num_atoms = drmmode_prop->count_enums + 1;
 			p->atoms = calloc(p->num_atoms, sizeof(Atom));
@@ -1303,50 +1318,21 @@ sna_output_create_resources(xf86OutputPtr output)
 	}
 
 	if (sna_output->backlight_iface) {
-		INT32 data, backlight_range[2];
-
 		/* Set up the backlight property, which takes effect
 		 * immediately and accepts values only within the
 		 * backlight_range.
 		 */
-		backlight_atom = MakeAtom(BACKLIGHT_NAME, sizeof(BACKLIGHT_NAME) - 1, TRUE);
-		backlight_deprecated_atom = MakeAtom(BACKLIGHT_DEPRECATED_NAME,
-						     sizeof(BACKLIGHT_DEPRECATED_NAME) - 1, TRUE);
-
-		backlight_range[0] = 0;
-		backlight_range[1] = sna_output->backlight_max;
-		err = RRConfigureOutputProperty(output->randr_output,
-					       	backlight_atom,
-						FALSE, TRUE, FALSE,
-					       	2, backlight_range);
-		if (err != 0) {
-			xf86DrvMsg(output->scrn->scrnIndex, X_ERROR,
-				   "RRConfigureOutputProperty error, %d\n", err);
-		}
-		err = RRConfigureOutputProperty(output->randr_output,
-					       	backlight_deprecated_atom,
-						FALSE, TRUE, FALSE,
-					       	2, backlight_range);
-		if (err != 0) {
-			xf86DrvMsg(output->scrn->scrnIndex, X_ERROR,
-				   "RRConfigureOutputProperty error, %d\n", err);
-		}
-		/* Set the current value of the backlight property */
-		data = sna_output->backlight_active_level;
-		err = RRChangeOutputProperty(output->randr_output, backlight_atom,
-					     XA_INTEGER, 32, PropModeReplace, 1, &data,
-					     FALSE, TRUE);
-		if (err != 0) {
-			xf86DrvMsg(output->scrn->scrnIndex, X_ERROR,
-				   "RRChangeOutputProperty error, %d\n", err);
-		}
-		err = RRChangeOutputProperty(output->randr_output, backlight_deprecated_atom,
-					     XA_INTEGER, 32, PropModeReplace, 1, &data,
-					     FALSE, TRUE);
-		if (err != 0) {
-			xf86DrvMsg(output->scrn->scrnIndex, X_ERROR,
-				   "RRChangeOutputProperty error, %d\n", err);
-		}
+		sna_output_create_ranged_atom(output, &backlight_atom,
+					BACKLIGHT_NAME, 0,
+					sna_output->backlight_max,
+					sna_output->backlight_active_level,
+					FALSE);
+		sna_output_create_ranged_atom(output,
+					&backlight_deprecated_atom,
+					BACKLIGHT_DEPRECATED_NAME, 0,
+					sna_output->backlight_max,
+					sna_output->backlight_active_level,
+					FALSE);
 	}
 }
 
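
With the MakeAtom/RRConfigureOutputProperty/RRChangeOutputProperty triple
folded into one helper, registering any further ranged property reduces to
a single call; a usage sketch (the property name is hypothetical):

    static Atom example_atom;

    sna_output_create_ranged_atom(output, &example_atom,
                                  "EXAMPLE_RANGE",   /* hypothetical  */
                                  0, 100,            /* min, max      */
                                  50,                /* current value */
                                  FALSE);            /* not immutable */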
commit bbee45ed78e4d2af942d168084d881fbdb70e4c5
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sun Mar 25 21:25:15 2012 +0100

    sna/gen2+: Approximate expensive gradients when using imprecise rendering
    
    If we lack the ability to use a shader to compute the gradient
    per-pixel, we need to use pixman to render a fallback texture. We can
    reduce the size of this texture and upsample it when compositing,
    cutting the cost with a hopefully imperceptible loss of quality.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
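
The mechanism in the diff below: the gradient is rendered with pixman into a
half-size w2 x h2 buffer (the pixman transform scales it down), and the GPU
sampler then stretches that texture back over the full w x h area. The
transform entries are plain 16.16 fixed point, e.g. for w = 200, w2 = 100:

    t.matrix[0][0] = (w << 16) / w2;   /* (200 << 16) / 100 = 131072 = 2.0 in 16.16 */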

diff --git a/src/sna/gen2_render.c b/src/sna/gen2_render.c
index 6907dd6..0ad346e 100644
--- a/src/sna/gen2_render.c
+++ b/src/sna/gen2_render.c
@@ -1310,7 +1310,8 @@ gen2_composite_picture(struct sna *sna,
 		       struct sna_composite_channel *channel,
 		       int x, int y,
 		       int w, int h,
-		       int dst_x, int dst_y)
+		       int dst_x, int dst_y,
+		       bool precise)
 {
 	PixmapPtr pixmap;
 	uint32_t color;
@@ -1343,6 +1344,8 @@ gen2_composite_picture(struct sna *sna,
 	}
 
 	if (picture->pDrawable == NULL) {
+		int ret;
+
 		if (picture->pSourcePict->type == SourcePictTypeLinear)
 			return gen2_composite_linear_init(sna, picture, channel,
 							  x, y,
@@ -1351,8 +1354,14 @@ gen2_composite_picture(struct sna *sna,
 
 		DBG(("%s -- fallback, unhandled source %d\n",
 		     __FUNCTION__, picture->pSourcePict->type));
-		return sna_render_picture_fixup(sna, picture, channel,
-						x, y, w, h, dst_x, dst_y);
+		ret = -1;
+		if (!precise)
+			ret = sna_render_picture_approximate_gradient(sna, picture, channel,
+								      x, y, w, h, dst_x, dst_y);
+		if (ret == -1)
+			ret = sna_render_picture_fixup(sna, picture, channel,
+						       x, y, w, h, dst_x, dst_y);
+		return ret;
 	}
 
 	if (picture->alphaMap) {
@@ -1765,7 +1774,8 @@ gen2_render_composite(struct sna *sna,
 	switch (gen2_composite_picture(sna, src, &tmp->src,
 				       src_x, src_y,
 				       width, height,
-				       dst_x, dst_y)) {
+				       dst_x, dst_y,
+				       dst->polyMode == PolyModePrecise)) {
 	case -1:
 		goto cleanup_dst;
 	case 0:
@@ -1781,7 +1791,8 @@ gen2_render_composite(struct sna *sna,
 			switch (gen2_composite_picture(sna, mask, &tmp->mask,
 						       mask_x, mask_y,
 						       width,  height,
-						       dst_x,  dst_y)) {
+						       dst_x,  dst_y,
+						       dst->polyMode == PolyModePrecise)) {
 			case -1:
 				goto cleanup_src;
 			case 0:
@@ -2229,7 +2240,8 @@ gen2_render_composite_spans(struct sna *sna,
 	switch (gen2_composite_picture(sna, src, &tmp->base.src,
 				       src_x, src_y,
 				       width, height,
-				       dst_x, dst_y)) {
+				       dst_x, dst_y,
+				       dst->polyMode == PolyModePrecise)) {
 	case -1:
 		goto cleanup_dst;
 	case 0:
diff --git a/src/sna/gen3_render.c b/src/sna/gen3_render.c
index b236761..e798096 100644
--- a/src/sna/gen3_render.c
+++ b/src/sna/gen3_render.c
@@ -2194,7 +2194,7 @@ gen3_init_linear(struct sna *sna,
 	op->u.gen3.constants[n++] = 0;
 
 	if (!gen3_gradient_setup(sna, picture, channel, ox, oy))
-		return 0;
+		return -1;
 
 	channel->u.gen3.type = SHADER_LINEAR;
 	op->u.gen3.num_constants = n;
@@ -2250,7 +2250,7 @@ gen3_init_radial(struct sna *sna,
 	}
 
 	if (!gen3_gradient_setup(sna, picture, channel, ox, oy))
-		return 0;
+		return -1;
 
 	channel->u.gen3.type = SHADER_RADIAL;
 	op->u.gen3.num_constants = n;
@@ -2285,7 +2285,8 @@ gen3_composite_picture(struct sna *sna,
 		       struct sna_composite_channel *channel,
 		       int16_t x, int16_t y,
 		       int16_t w, int16_t h,
-		       int16_t dst_x, int16_t dst_y)
+		       int16_t dst_x, int16_t dst_y,
+		       bool precise)
 {
 	PixmapPtr pixmap;
 	uint32_t color;
@@ -2298,7 +2299,7 @@ gen3_composite_picture(struct sna *sna,
 
 	if (picture->pDrawable == NULL) {
 		SourcePict *source = picture->pSourcePict;
-		int ret = 0;
+		int ret = -1;
 
 		switch (source->type) {
 		case SourcePictTypeSolidFill:
@@ -2316,9 +2317,14 @@ gen3_composite_picture(struct sna *sna,
 			break;
 		}
 
-		if (ret == 0)
-			ret = sna_render_picture_fixup(sna, picture, channel,
-						       x, y, w, h, dst_x, dst_y);
+		if (ret == -1) {
+			if (!precise)
+				ret = sna_render_picture_approximate_gradient(sna, picture, channel,
+									      x, y, w, h, dst_x, dst_y);
+			if (ret == -1)
+				ret = sna_render_picture_fixup(sna, picture, channel,
+							       x, y, w, h, dst_x, dst_y);
+		}
 		return ret;
 	}
 
@@ -2815,7 +2821,8 @@ gen3_render_composite(struct sna *sna,
 	switch (gen3_composite_picture(sna, src, tmp, &tmp->src,
 				       src_x, src_y,
 				       width, height,
-				       dst_x, dst_y)) {
+				       dst_x, dst_y,
+				       dst->polyMode == PolyModePrecise)) {
 	case -1:
 		goto cleanup_dst;
 	case 0:
@@ -2840,7 +2847,8 @@ gen3_render_composite(struct sna *sna,
 			switch (gen3_composite_picture(sna, mask, tmp, &tmp->mask,
 						       mask_x, mask_y,
 						       width,  height,
-						       dst_x,  dst_y)) {
+						       dst_x,  dst_y,
+						       dst->polyMode == PolyModePrecise)) {
 			case -1:
 				goto cleanup_src;
 			case 0:
@@ -3379,7 +3387,8 @@ gen3_render_composite_spans(struct sna *sna,
 	switch (gen3_composite_picture(sna, src, &tmp->base, &tmp->base.src,
 				       src_x, src_y,
 				       width, height,
-				       dst_x, dst_y)) {
+				       dst_x, dst_y,
+				       dst->polyMode == PolyModePrecise)) {
 	case -1:
 		goto cleanup_dst;
 	case 0:
diff --git a/src/sna/gen4_render.c b/src/sna/gen4_render.c
index def5d19..2e78a92 100644
--- a/src/sna/gen4_render.c
+++ b/src/sna/gen4_render.c
@@ -1957,7 +1957,8 @@ gen4_composite_picture(struct sna *sna,
 		       struct sna_composite_channel *channel,
 		       int x, int y,
 		       int w, int h,
-		       int dst_x, int dst_y)
+		       int dst_x, int dst_y,
+		       bool precise)
 {
 	PixmapPtr pixmap;
 	uint32_t color;
@@ -1973,6 +1974,8 @@ gen4_composite_picture(struct sna *sna,
 		return gen4_composite_solid_init(sna, channel, color);
 
 	if (picture->pDrawable == NULL) {
+		int ret;
+
 		if (picture->pSourcePict->type == SourcePictTypeLinear)
 			return gen4_composite_linear_init(sna, picture, channel,
 							  x, y,
@@ -1980,8 +1983,14 @@ gen4_composite_picture(struct sna *sna,
 							  dst_x, dst_y);
 
 		DBG(("%s -- fixup, gradient\n", __FUNCTION__));
-		return sna_render_picture_fixup(sna, picture, channel,
-						x, y, w, h, dst_x, dst_y);
+		ret = -1;
+		if (!precise)
+			ret = sna_render_picture_approximate_gradient(sna, picture, channel,
+								      x, y, w, h, dst_x, dst_y);
+		if (ret == -1)
+			ret = sna_render_picture_fixup(sna, picture, channel,
+						       x, y, w, h, dst_x, dst_y);
+		return ret;
 	}
 
 	if (picture->alphaMap) {
@@ -2404,7 +2413,8 @@ gen4_render_composite(struct sna *sna,
 	switch (gen4_composite_picture(sna, src, &tmp->src,
 				       src_x, src_y,
 				       width, height,
-				       dst_x, dst_y)) {
+				       dst_x, dst_y,
+				       dst->polyMode == PolyModePrecise)) {
 	case -1:
 		DBG(("%s: failed to prepare source\n", __FUNCTION__));
 		goto cleanup_dst;
@@ -2449,7 +2459,8 @@ gen4_render_composite(struct sna *sna,
 			switch (gen4_composite_picture(sna, mask, &tmp->mask,
 						       msk_x, msk_y,
 						       width, height,
-						       dst_x, dst_y)) {
+						       dst_x, dst_y,
+						       dst->polyMode == PolyModePrecise)) {
 			case -1:
 				DBG(("%s: failed to prepare mask\n", __FUNCTION__));
 				goto cleanup_src;
diff --git a/src/sna/gen5_render.c b/src/sna/gen5_render.c
index 565d22a..c27accd 100644
--- a/src/sna/gen5_render.c
+++ b/src/sna/gen5_render.c
@@ -1990,7 +1990,8 @@ gen5_composite_picture(struct sna *sna,
 		       struct sna_composite_channel *channel,
 		       int x, int y,
 		       int w, int h,
-		       int dst_x, int dst_y)
+		       int dst_x, int dst_y,
+		       bool precise)
 {
 	PixmapPtr pixmap;
 	uint32_t color;
@@ -2006,6 +2007,8 @@ gen5_composite_picture(struct sna *sna,
 		return gen5_composite_solid_init(sna, channel, color);
 
 	if (picture->pDrawable == NULL) {
+		int ret;
+
 		if (picture->pSourcePict->type == SourcePictTypeLinear)
 			return gen5_composite_linear_init(sna, picture, channel,
 							  x, y,
@@ -2013,8 +2016,14 @@ gen5_composite_picture(struct sna *sna,
 							  dst_x, dst_y);
 
 		DBG(("%s -- fixup, gradient\n", __FUNCTION__));
-		return sna_render_picture_fixup(sna, picture, channel,
-						x, y, w, h, dst_x, dst_y);
+		ret = -1;
+		if (!precise)
+			ret = sna_render_picture_approximate_gradient(sna, picture, channel,
+								      x, y, w, h, dst_x, dst_y);
+		if (ret == -1)
+			ret = sna_render_picture_fixup(sna, picture, channel,
+						       x, y, w, h, dst_x, dst_y);
+		return ret;
 	}
 
 	if (picture->alphaMap) {
@@ -2444,7 +2453,8 @@ gen5_render_composite(struct sna *sna,
 	switch (gen5_composite_picture(sna, src, &tmp->src,
 				       src_x, src_y,
 				       width, height,
-				       dst_x, dst_y)) {
+				       dst_x, dst_y,
+				       dst->polyMode == PolyModePrecise)) {
 	case -1:
 		DBG(("%s: failed to prepare source picture\n", __FUNCTION__));
 		goto cleanup_dst;
@@ -2488,7 +2498,8 @@ gen5_render_composite(struct sna *sna,
 			switch (gen5_composite_picture(sna, mask, &tmp->mask,
 						       msk_x, msk_y,
 						       width, height,
-						       dst_x, dst_y)) {
+						       dst_x, dst_y,
+						       dst->polyMode == PolyModePrecise)) {
 			case -1:
 				DBG(("%s: failed to prepare mask picture\n", __FUNCTION__));
 				goto cleanup_src;
@@ -2800,7 +2811,8 @@ gen5_render_composite_spans(struct sna *sna,
 	switch (gen5_composite_picture(sna, src, &tmp->base.src,
 				       src_x, src_y,
 				       width, height,
-				       dst_x, dst_y)) {
+				       dst_x, dst_y,
+				       dst->polyMode == PolyModePrecise)) {
 	case -1:
 		goto cleanup_dst;
 	case 0:
diff --git a/src/sna/gen6_render.c b/src/sna/gen6_render.c
index fde0776..6f1b55a 100644
--- a/src/sna/gen6_render.c
+++ b/src/sna/gen6_render.c
@@ -2165,7 +2165,8 @@ gen6_composite_picture(struct sna *sna,
 		       struct sna_composite_channel *channel,
 		       int x, int y,
 		       int w, int h,
-		       int dst_x, int dst_y)
+		       int dst_x, int dst_y,
+		       bool precise)
 {
 	PixmapPtr pixmap;
 	uint32_t color;
@@ -2181,6 +2182,8 @@ gen6_composite_picture(struct sna *sna,
 		return gen6_composite_solid_init(sna, channel, color);
 
 	if (picture->pDrawable == NULL) {
+		int ret;
+
 		if (picture->pSourcePict->type == SourcePictTypeLinear)
 			return gen6_composite_linear_init(sna, picture, channel,
 							  x, y,
@@ -2188,8 +2191,14 @@ gen6_composite_picture(struct sna *sna,
 							  dst_x, dst_y);
 
 		DBG(("%s -- fixup, gradient\n", __FUNCTION__));
-		return sna_render_picture_fixup(sna, picture, channel,
-						x, y, w, h, dst_x, dst_y);
+		ret = -1;
+		if (!precise)
+			ret = sna_render_picture_approximate_gradient(sna, picture, channel,
+								      x, y, w, h, dst_x, dst_y);
+		if (ret == -1)
+			ret = sna_render_picture_fixup(sna, picture, channel,
+						       x, y, w, h, dst_x, dst_y);
+		return ret;
 	}
 
 	if (picture->alphaMap) {
@@ -2646,7 +2655,8 @@ gen6_render_composite(struct sna *sna,
 	switch (gen6_composite_picture(sna, src, &tmp->src,
 				       src_x, src_y,
 				       width, height,
-				       dst_x, dst_y)) {
+				       dst_x, dst_y,
+				       dst->polyMode == PolyModePrecise)) {
 	case -1:
 		goto cleanup_dst;
 	case 0:
@@ -2702,7 +2712,8 @@ gen6_render_composite(struct sna *sna,
 			switch (gen6_composite_picture(sna, mask, &tmp->mask,
 						       msk_x, msk_y,
 						       width, height,
-						       dst_x, dst_y)) {
+						       dst_x, dst_y,
+						       dst->polyMode == PolyModePrecise)) {
 			case -1:
 				goto cleanup_src;
 			case 0:
@@ -3089,7 +3100,8 @@ gen6_render_composite_spans(struct sna *sna,
 	switch (gen6_composite_picture(sna, src, &tmp->base.src,
 				       src_x, src_y,
 				       width, height,
-				       dst_x, dst_y)) {
+				       dst_x, dst_y,
+				       dst->polyMode == PolyModePrecise)) {
 	case -1:
 		goto cleanup_dst;
 	case 0:
diff --git a/src/sna/gen7_render.c b/src/sna/gen7_render.c
index 36ea8a1..1491167 100644
--- a/src/sna/gen7_render.c
+++ b/src/sna/gen7_render.c
@@ -2270,7 +2270,8 @@ gen7_composite_picture(struct sna *sna,
 		       struct sna_composite_channel *channel,
 		       int x, int y,
 		       int w, int h,
-		       int dst_x, int dst_y)
+		       int dst_x, int dst_y,
+		       bool precise)
 {
 	PixmapPtr pixmap;
 	uint32_t color;
@@ -2286,6 +2287,8 @@ gen7_composite_picture(struct sna *sna,
 		return gen7_composite_solid_init(sna, channel, color);
 
 	if (picture->pDrawable == NULL) {
+		int ret;
+
 		if (picture->pSourcePict->type == SourcePictTypeLinear)
 			return gen7_composite_linear_init(sna, picture, channel,
 							  x, y,
@@ -2293,8 +2296,14 @@ gen7_composite_picture(struct sna *sna,
 							  dst_x, dst_y);
 
 		DBG(("%s -- fixup, gradient\n", __FUNCTION__));
-		return sna_render_picture_fixup(sna, picture, channel,
-						x, y, w, h, dst_x, dst_y);
+		ret = -1;
+		if (!precise)
+			ret = sna_render_picture_approximate_gradient(sna, picture, channel,
+								      x, y, w, h, dst_x, dst_y);
+		if (ret == -1)
+			ret = sna_render_picture_fixup(sna, picture, channel,
+						       x, y, w, h, dst_x, dst_y);
+		return ret;
 	}
 
 	if (picture->alphaMap) {
@@ -2732,7 +2741,8 @@ gen7_render_composite(struct sna *sna,
 	switch (gen7_composite_picture(sna, src, &tmp->src,
 				       src_x, src_y,
 				       width, height,
-				       dst_x, dst_y)) {
+				       dst_x, dst_y,
+				       dst->polyMode == PolyModePrecise)) {
 	case -1:
 		goto cleanup_dst;
 	case 0:
@@ -2788,7 +2798,8 @@ gen7_render_composite(struct sna *sna,
 			switch (gen7_composite_picture(sna, mask, &tmp->mask,
 						       msk_x, msk_y,
 						       width, height,
-						       dst_x, dst_y)) {
+						       dst_x, dst_y,
+						       dst->polyMode == PolyModePrecise)) {
 			case -1:
 				goto cleanup_src;
 			case 0:
@@ -3185,7 +3196,8 @@ gen7_render_composite_spans(struct sna *sna,
 	switch (gen7_composite_picture(sna, src, &tmp->base.src,
 				       src_x, src_y,
 				       width, height,
-				       dst_x, dst_y)) {
+				       dst_x, dst_y,
+				       dst->polyMode == PolyModePrecise)) {
 	case -1:
 		goto cleanup_dst;
 	case 0:
diff --git a/src/sna/sna_render.c b/src/sna/sna_render.c
index 542cdb9..d774a34 100644
--- a/src/sna/sna_render.c
+++ b/src/sna/sna_render.c
@@ -1369,6 +1369,92 @@ sna_render_picture_flatten(struct sna *sna,
 }
 
 int
+sna_render_picture_approximate_gradient(struct sna *sna,
+					PicturePtr picture,
+					struct sna_composite_channel *channel,
+					int16_t x, int16_t y,
+					int16_t w, int16_t h,
+					int16_t dst_x, int16_t dst_y)
+{
+	pixman_image_t *dst, *src;
+	pixman_transform_t t;
+	int w2 = w/2, h2 = h/2;
+	int dx, dy;
+	void *ptr;
+
+#if NO_FIXUP
+	return -1;
+#endif
+
+	DBG(("%s: (%d, %d)x(%d, %d)\n", __FUNCTION__, x, y, w, h));
+
+	if (w2 == 0 || h2 == 0) {
+		DBG(("%s: fallback - unknown bounds\n", __FUNCTION__));
+		return -1;
+	}
+	if (w2 > sna->render.max_3d_size || h2 > sna->render.max_3d_size) {
+		DBG(("%s: fallback - too large (%dx%d)\n", __FUNCTION__, w, h));
+		return -1;
+	}
+
+	channel->pict_format = PIXMAN_a8r8g8b8;
+	channel->bo = kgem_create_buffer_2d(&sna->kgem,
+					    w2, h2, 32,
+					    KGEM_BUFFER_WRITE_INPLACE,
+					    &ptr);
+	if (!channel->bo) {
+		DBG(("%s: failed to create upload buffer, using clear\n",
+		     __FUNCTION__));
+		return 0;
+	}
+
+	dst = pixman_image_create_bits(PIXMAN_a8r8g8b8,
+				       w2, h2, ptr, channel->bo->pitch);
+	if (!dst) {
+		kgem_bo_destroy(&sna->kgem, channel->bo);
+		return 0;
+	}
+
+	src = image_from_pict(picture, FALSE, &dx, &dy);
+	if (src == NULL) {
+		pixman_image_unref(dst);
+		kgem_bo_destroy(&sna->kgem, channel->bo);
+		return 0;
+	}
+
+	memset(&t, 0, sizeof(t));
+	t.matrix[0][0] = (w << 16) / w2;
+	t.matrix[1][1] = (h << 16) / h2;
+	t.matrix[2][2] = 1 << 16;
+	if (picture->transform)
+		pixman_transform_multiply(&t, picture->transform, &t);
+	pixman_image_set_transform(src, &t);
+
+	pixman_image_composite(PictOpSrc, src, NULL, dst,
+			       x + dx, y + dy,
+			       0, 0,
+			       0, 0,
+			       w2, h2);
+	free_pixman_pict(picture, src);
+	pixman_image_unref(dst);
+
+	channel->width  = w2;
+	channel->height = h2;
+
+	channel->filter = PictFilterNearest;
+	channel->repeat = RepeatNone;
+	channel->is_affine = TRUE;
+
+	channel->scale[0] = 1.f/w;
+	channel->scale[1] = 1.f/h;
+	channel->offset[0] = -dst_x;
+	channel->offset[1] = -dst_y;
+	channel->transform = NULL;
+
+	return 1;
+}
+
+int
 sna_render_picture_fixup(struct sna *sna,
 			 PicturePtr picture,
 			 struct sna_composite_channel *channel,
diff --git a/src/sna/sna_render.h b/src/sna/sna_render.h
index 8d3d9e4..a83f78e 100644
--- a/src/sna/sna_render.h
+++ b/src/sna/sna_render.h
@@ -615,6 +615,14 @@ sna_render_picture_extract(struct sna *sna,
 			   int16_t dst_x, int16_t dst_y);
 
 int
+sna_render_picture_approximate_gradient(struct sna *sna,
+					PicturePtr picture,
+					struct sna_composite_channel *channel,
+					int16_t x, int16_t y,
+					int16_t w, int16_t h,
+					int16_t dst_x, int16_t dst_y);
+
+int
 sna_render_picture_fixup(struct sna *sna,
 			 PicturePtr picture,
 			 struct sna_composite_channel *channel,
commit f2cce24d5ae6a65059b201908615accdc96ca76e
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Mar 23 14:56:06 2012 +0000

    uxa: Remove hook for CompositeRectangles
    
    It was broken and not flushing damage correctly. With the
    improvements made to the kernel, it is no longer a significant advantage
    per se and not worth its additional complexity.
    
    Reported-by: Tilman Sauerbeck <tilman at code-monkey.de>
    Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=32547
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/uxa/uxa-priv.h b/uxa/uxa-priv.h
index 0de45f5..b24ec4f 100644
--- a/uxa/uxa-priv.h
+++ b/uxa/uxa-priv.h
@@ -123,7 +123,6 @@ typedef struct {
 	BitmapToRegionProcPtr SavedBitmapToRegion;
 #ifdef RENDER
 	CompositeProcPtr SavedComposite;
-	CompositeRectsProcPtr SavedCompositeRects;
 	TrianglesProcPtr SavedTriangles;
 	GlyphsProcPtr SavedGlyphs;
 	TrapezoidsProcPtr SavedTrapezoids;
diff --git a/uxa/uxa-render.c b/uxa/uxa-render.c
index 877b286..1e88c5d 100644
--- a/uxa/uxa-render.c
+++ b/uxa/uxa-render.c
@@ -947,195 +947,6 @@ uxa_acquire_mask(ScreenPtr screen,
 				    out_x, out_y);
 }
 
-static Bool
-_pixman_region_init_rectangles(pixman_region16_t *region,
-			       int num_rects,
-			       xRectangle *rects,
-			       int tx, int ty)
-{
-	pixman_box16_t stack_boxes[64], *boxes = stack_boxes;
-	pixman_bool_t ret;
-	int i;
-
-	if (num_rects > sizeof(stack_boxes) / sizeof(stack_boxes[0])) {
-		boxes = malloc(sizeof(pixman_box16_t) * num_rects);
-		if (boxes == NULL)
-			return FALSE;
-	}
-
-	for (i = 0; i < num_rects; i++) {
-		boxes[i].x1 = rects[i].x + tx;
-		boxes[i].y1 = rects[i].y + ty;
-		boxes[i].x2 = rects[i].x + tx + rects[i].width;
-		boxes[i].y2 = rects[i].y + ty + rects[i].height;
-	}
-
-	ret = pixman_region_init_rects(region, boxes, num_rects);
-
-	if (boxes != stack_boxes)
-		free(boxes);
-
-	return ret;
-}
-
-void
-uxa_solid_rects (CARD8		op,
-		 PicturePtr	dst,
-		 xRenderColor  *color,
-		 int		num_rects,
-		 xRectangle    *rects)
-{
-	ScreenPtr screen = dst->pDrawable->pScreen;
-	uxa_screen_t *uxa_screen = uxa_get_screen(screen);
-	PixmapPtr dst_pixmap, src_pixmap = NULL;
-	pixman_region16_t region;
-	pixman_box16_t *boxes, *extents;
-	PicturePtr src;
-	int dst_x, dst_y;
-	int num_boxes;
-
-	if (!pixman_region_not_empty(dst->pCompositeClip))
-		return;
-
-	if (uxa_screen->info->flags & UXA_USE_GLAMOR) {
-		int ok;
-
-		uxa_picture_prepare_access(dst, UXA_GLAMOR_ACCESS_RW);
-		ok = glamor_composite_rects_nf(op, dst, color,
-					       num_rects, rects);
-		uxa_picture_finish_access(dst, UXA_GLAMOR_ACCESS_RW);
-
-		if (!ok)
-			goto fallback;
-
-		return;
-	}
-
-	if (dst->alphaMap)
-		goto fallback;
-
-	dst_pixmap = uxa_get_offscreen_pixmap(dst->pDrawable, &dst_x, &dst_y);
-	if (!dst_pixmap)
-		goto fallback;
-
-	if (!_pixman_region_init_rectangles(&region,
-					    num_rects, rects,
-					    dst->pDrawable->x, dst->pDrawable->y))
-		goto fallback;
-
-	if (!pixman_region_intersect(&region, &region, dst->pCompositeClip)) {
-		pixman_region_fini(&region);
-		return;
-	}
-
-	pixman_region_translate(&region, dst_x, dst_y);
-	boxes = pixman_region_rectangles(&region, &num_boxes);
-	extents = pixman_region_extents (&region);
-
-	if (op == PictOpClear)
-		color->red = color->green = color->blue = color->alpha = 0;
-	if (color->alpha >= 0xff00 && op == PictOpOver) {
-		color->alpha = 0xffff;
-		op = PictOpSrc;
-	}
-
-	/* Using GEM, the relocation costs outweigh the advantages of the blitter */
-	if (num_boxes == 1 && (op == PictOpSrc || op == PictOpClear)) {
-		CARD32 pixel;
-
-try_solid:
-		if (uxa_screen->info->check_solid &&
-		    !uxa_screen->info->check_solid(&dst_pixmap->drawable, GXcopy, FB_ALLONES))
-			goto err_region;
-
-		if (!uxa_get_pixel_from_rgba(&pixel,
-					     color->red,
-					     color->green,
-					     color->blue,
-					     color->alpha,
-					     dst->format))
-			goto err_region;
-
-		if (!uxa_screen->info->prepare_solid(dst_pixmap, GXcopy, FB_ALLONES, pixel))
-			goto err_region;
-
-		while (num_boxes--) {
-			uxa_screen->info->solid(dst_pixmap,
-						boxes->x1, boxes->y1,
-						boxes->x2, boxes->y2);
-			boxes++;
-		}
-
-		uxa_screen->info->done_solid(dst_pixmap);
-	} else {
-		int error;
-
-		src = CreateSolidPicture(0, color, &error);
-		if (!src)
-			goto err_region;
-
-		if (!uxa_screen->info->check_composite(op, src, NULL, dst,
-						       extents->x2 - extents->x1,
-						       extents->y2 - extents->y1)) {
-			if (op == PictOpSrc || op == PictOpClear) {
-				FreePicture(src, 0);
-				goto try_solid;
-			}
-
-			goto err_src;
-		}
-
-		if (!uxa_screen->info->check_composite_texture ||
-		    !uxa_screen->info->check_composite_texture(screen, src)) {
-			PicturePtr solid;
-			int src_off_x, src_off_y;
-
-			solid = uxa_acquire_solid(screen, src->pSourcePict);
-			if (!solid)
-				goto err_src;
-			FreePicture(src, 0);
-
-			src = solid;
-			src_pixmap = uxa_get_offscreen_pixmap(src->pDrawable,
-							      &src_off_x, &src_off_y);
-			if (!src_pixmap)
-				goto err_src;
-		}
-
-		if (!uxa_screen->info->prepare_composite(op, src, NULL, dst, src_pixmap, NULL, dst_pixmap))
-			goto err_src;
-
-		while (num_boxes--) {
-			uxa_screen->info->composite(dst_pixmap,
-						    0, 0, 0, 0,
-						    boxes->x1,
-						    boxes->y1,
-						    boxes->x2 - boxes->x1,
-						    boxes->y2 - boxes->y1);
-			boxes++;
-		}
-
-		uxa_screen->info->done_composite(dst_pixmap);
-		FreePicture(src, 0);
-	}
-
-	/* XXX xserver-1.8: CompositeRects is not tracked by Damage, so we must
-	 * manually append the damaged regions ourselves.
-	 */
-	pixman_region_translate(&region, -dst_x, -dst_y);
-	DamageRegionAppend(dst->pDrawable, &region);
-
-	pixman_region_fini(&region);
-	return;
-
-err_src:
-	FreePicture(src, 0);
-err_region:
-	pixman_region_fini(&region);
-fallback:
-	uxa_screen->SavedCompositeRects(op, dst, color, num_rects, rects);
-}
-
 static int
 uxa_try_driver_composite(CARD8 op,
 			 PicturePtr pSrc,
diff --git a/uxa/uxa.c b/uxa/uxa.c
index eb2ae03..b4a1da6 100644
--- a/uxa/uxa.c
+++ b/uxa/uxa.c
@@ -407,7 +407,6 @@ static Bool uxa_close_screen(int i, ScreenPtr pScreen)
 #ifdef RENDER
 	if (ps) {
 		ps->Composite = uxa_screen->SavedComposite;
-		ps->CompositeRects = uxa_screen->SavedCompositeRects;
 		ps->Glyphs = uxa_screen->SavedGlyphs;
 		ps->Trapezoids = uxa_screen->SavedTrapezoids;
 		ps->AddTraps = uxa_screen->SavedAddTraps;
@@ -536,9 +535,6 @@ Bool uxa_driver_init(ScreenPtr screen, uxa_driver_t * uxa_driver)
 			uxa_screen->SavedComposite = ps->Composite;
 			ps->Composite = uxa_composite;
 
-			uxa_screen->SavedCompositeRects = ps->CompositeRects;
-			ps->CompositeRects = uxa_solid_rects;
-
 			uxa_screen->SavedGlyphs = ps->Glyphs;
 			ps->Glyphs = uxa_glyphs;
 
commit 816041a927a9d799b60bcf9ad1536e5ecc56d901
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Mar 23 11:05:55 2012 +0000

    configure: Stop the debug build erroring out if it cannot find valgrind
    
    Another case where I passed an empty string believing that would be
    sufficient to replace the error path...
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/configure.ac b/configure.ac
index 63101bf..c8a7531 100644
--- a/configure.ac
+++ b/configure.ac
@@ -253,8 +253,10 @@ if test "x$DEBUG" = xno; then
 fi
 if test "x$DEBUG" != xno; then
 	AC_DEFINE(HAS_EXTRA_DEBUG,1,[Enable additional debugging])
-	PKG_CHECK_MODULES(VALGRIND, [valgrind],
-			  AC_DEFINE([HAVE_VALGRIND], 1, [Use valgrind intrinsics to suppress false warings]),)
+	PKG_CHECK_MODULES(VALGRIND, [valgrind], have_valgrind=yes, have_valgrind=no)
+	if test x$have_valgrind = xyes; then
+		AC_DEFINE([HAVE_VALGRIND], 1, [Use valgrind intrinsics to suppress false warnings])
+	fi
 fi
 if test "x$DEBUG" = xfull; then
 	AC_DEFINE(HAS_DEBUG_FULL,1,[Enable all debugging])
commit e42810a253edf8040c2abd8e7d927dfeca3adfae
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Mar 22 23:47:56 2012 +0000

    sna: Make the fallback debugging messages more consistent
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 424c8ad..bf76948 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -147,22 +147,22 @@ static void __sna_fallback_flush(DrawablePtr d)
 static int sna_font_key;
 
 static const uint8_t copy_ROP[] = {
-	ROP_0,                  /* GXclear */
-	ROP_DSa,                /* GXand */
-	ROP_SDna,               /* GXandReverse */
-	ROP_S,                  /* GXcopy */
-	ROP_DSna,               /* GXandInverted */
-	ROP_D,                  /* GXnoop */
-	ROP_DSx,                /* GXxor */
-	ROP_DSo,                /* GXor */
-	ROP_DSon,               /* GXnor */
-	ROP_DSxn,               /* GXequiv */
-	ROP_Dn,                 /* GXinvert */
-	ROP_SDno,               /* GXorReverse */
-	ROP_Sn,                 /* GXcopyInverted */
-	ROP_DSno,               /* GXorInverted */
-	ROP_DSan,               /* GXnand */
-	ROP_1                   /* GXset */
+	ROP_0,		/* GXclear */
+	ROP_DSa,	/* GXand */
+	ROP_SDna,	/* GXandReverse */
+	ROP_S,		/* GXcopy */
+	ROP_DSna,	/* GXandInverted */
+	ROP_D,		/* GXnoop */
+	ROP_DSx,	/* GXxor */
+	ROP_DSo,	/* GXor */
+	ROP_DSon,	/* GXnor */
+	ROP_DSxn,	/* GXequiv */
+	ROP_Dn,		/* GXinvert */
+	ROP_SDno,	/* GXorReverse */
+	ROP_Sn,		/* GXcopyInverted */
+	ROP_DSno,	/* GXorInverted */
+	ROP_DSan,	/* GXnand */
+	ROP_1		/* GXset */
 };
 static const uint8_t fill_ROP[] = {
 	ROP_0,
@@ -2850,8 +2850,11 @@ sna_put_xybitmap_blt(DrawablePtr drawable, GCPtr gc, RegionPtr region,
 		DBG(("%s: converting bo from Y-tiling\n", __FUNCTION__));
 		assert(bo == sna_pixmap_get_bo(pixmap));
 		bo = sna_pixmap_change_tiling(pixmap, I915_TILING_X);
-		if (bo == NULL)
+		if (bo == NULL) {
+			DBG(("%s: fallback -- unable to change tiling\n",
+			     __FUNCTION__));
 			return false;
+		}
 	}
 
 	assert_pixmap_contains_box(pixmap, RegionExtents(region));
@@ -2969,8 +2972,11 @@ sna_put_xypixmap_blt(DrawablePtr drawable, GCPtr gc, RegionPtr region,
 		DBG(("%s: converting bo from Y-tiling\n", __FUNCTION__));
 		assert(bo == sna_pixmap_get_bo(pixmap));
 		bo = sna_pixmap_change_tiling(pixmap, I915_TILING_X);
-		if (bo == NULL)
+		if (bo == NULL) {
+			DBG(("%s: fallback -- unable to change tiling\n",
+			     __FUNCTION__));
 			return false;
+		}
 	}
 
 	assert_pixmap_contains_box(pixmap, RegionExtents(region));
@@ -3857,7 +3863,7 @@ sna_copy_area(DrawablePtr src, DrawablePtr dst, GCPtr gc,
 	    !PM_IS_SOLID(dst, gc->planemask)) {
 		RegionRec region, *ret;
 
-		DBG(("%s: -- fallback, wedged=%d, solid=%d [%x]\n",
+		DBG(("%s: fallback -- wedged=%d, solid=%d [%x]\n",
 		     __FUNCTION__, sna->kgem.wedged,
 		     PM_IS_SOLID(dst, gc->planemask),
 		     (unsigned)gc->planemask));
@@ -5338,8 +5344,11 @@ sna_copy_plane(DrawablePtr src, DrawablePtr dst, GCPtr gc,
 		if (arg.bo->tiling == I915_TILING_Y) {
 			assert(arg.bo == sna_pixmap_get_bo(pixmap));
 			arg.bo = sna_pixmap_change_tiling(pixmap, I915_TILING_X);
-			if (arg.bo == NULL)
+			if (arg.bo == NULL) {
+				DBG(("%s: fallback -- unable to change tiling\n",
+				     __FUNCTION__));
 				goto fallback;
+			}
 		}
 		RegionUninit(&region);
 		return miDoCopy(src, dst, gc,
@@ -9784,10 +9793,13 @@ sna_poly_fill_rect_stippled_blt(DrawablePtr drawable,
 
 		DBG(("%s: converting bo from Y-tiling\n", __FUNCTION__));
 		/* This is cheating, but only the gpu_bo can be tiled */
-		assert(bo == sna_pixmap(pixmap)->gpu_bo);
+		assert(bo == sna_pixmap_get_bo(pixmap));
 		bo = sna_pixmap_change_tiling(pixmap, I915_TILING_X);
-		if (bo == NULL)
+		if (bo == NULL) {
+			DBG(("%s: fallback -- unable to change tiling\n",
+			     __FUNCTION__));
 			return false;
+		}
 	}
 
 	if (!sna_drawable_move_to_cpu(&stipple->drawable, MOVE_READ))
@@ -10217,7 +10229,7 @@ sna_glyph_blt(DrawablePtr drawable, GCPtr gc,
 	     __FUNCTION__, _x, _y, _n, fg, bg, rop));
 
 	if (wedged(sna)) {
-		DBG(("%s -- fallback, wedged\n", __FUNCTION__));
+		DBG(("%s: fallback -- wedged\n", __FUNCTION__));
 		return false;
 	}
 
@@ -10230,7 +10242,8 @@ sna_glyph_blt(DrawablePtr drawable, GCPtr gc,
 		assert(bo == sna_pixmap_get_bo(pixmap));
 		bo = sna_pixmap_change_tiling(pixmap, I915_TILING_X);
 		if (bo == NULL) {
-			DBG(("%s -- fallback, dst uses Y-tiling\n", __FUNCTION__));
+			DBG(("%s: fallback -- unable to change tiling\n",
+			     __FUNCTION__));
 			return false;
 		}
 	}
@@ -10897,7 +10910,7 @@ sna_reversed_glyph_blt(DrawablePtr drawable, GCPtr gc,
 		assert(bo == sna_pixmap_get_bo(pixmap));
 		bo = sna_pixmap_change_tiling(pixmap, I915_TILING_X);
 		if (bo == NULL) {
-			DBG(("%s -- fallback, unable to change tiling\n",
+			DBG(("%s: fallback -- unable to change tiling\n",
 			     __FUNCTION__));
 			return false;
 		}
@@ -11251,8 +11264,11 @@ sna_push_pixels_solid_blt(GCPtr gc,
 		DBG(("%s: converting bo from Y-tiling\n", __FUNCTION__));
 		assert(bo == sna_pixmap_get_bo(pixmap));
 		bo = sna_pixmap_change_tiling(pixmap, I915_TILING_X);
-		if (bo == NULL)
+		if (bo == NULL) {
+			DBG(("%s: fallback -- unable to change tiling\n",
+			     __FUNCTION__));
 			return false;
+		}
 	}
 
 	get_drawable_deltas(drawable, pixmap, &dx, &dy);
commit abfa4426e597e1159603b409e35b3bc1b16bf214
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Mar 22 23:43:04 2012 +0000

    sna: Update the target bo after changing from Y tiling
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index f6bbeab..424c8ad 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -10894,8 +10894,11 @@ sna_reversed_glyph_blt(DrawablePtr drawable, GCPtr gc,
 
 	if (bo->tiling == I915_TILING_Y) {
 		DBG(("%s: converting bo from Y-tiling\n", __FUNCTION__));
-		if (!sna_pixmap_change_tiling(pixmap, I915_TILING_X)) {
-			DBG(("%s -- fallback, dst uses Y-tiling\n", __FUNCTION__));
+		assert(bo == sna_pixmap_get_bo(pixmap));
+		bo = sna_pixmap_change_tiling(pixmap, I915_TILING_X);
+		if (bo == NULL) {
+			DBG(("%s -- fallback, unable to change tiling\n",
+			     __FUNCTION__));
 			return false;
 		}
 	}
commit 02098ad0d931060ab94dcdc3861a8ba91330303d
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Mar 22 21:57:14 2012 +0000

    sna: Correctly test for clear glyphs when searching for ones to skip
    
    With xterm, it is quite common for it to redraw itself using lots of
    spaces, so it is efficient for us to elide those clear glyphs and
    only draw the backing boxes. However, we were only checking the first 8
    pixels in each line because of a missing pointer increment.
    
    Fixes absent '=' characters when using a compositor and ImageText.
    
    Reported-by: Jiri Slaby <jirislaby at gmail.com>
    Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=47735
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
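
For reference, a minimal sketch of the scan this commit repairs (illustrative
names, not the driver's own helpers). Without advancing the pointer, the inner
loop re-read the first byte of each row, so only the leading 8 pixels per line
were ever tested:

    #include <stdint.h>
    #include <stdbool.h>

    /* Returns true if a 1bpp glyph image is entirely clear. w8 is the
     * glyph width in bytes, stride the bytes per row; the per-byte
     * advance below is exactly what the missing g++ used to skip.
     */
    static bool glyph_is_clear(const uint8_t *g, int w8, int h, int stride)
    {
        int i;

        while (h--) {
            for (i = 0; i < w8; i++)
                if (g[i])
                    return false;
            g += stride;
        }
        return true;
    }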

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index b8f5059..f6bbeab 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -10964,10 +10964,15 @@ sna_reversed_glyph_blt(DrawablePtr drawable, GCPtr gc,
 			x1 = x + c->metrics.leftSideBearing;
 			y1 = y - c->metrics.ascent;
 
-			if (x1 >= extents->x2 || y1 >= extents->y2)
-				goto skip;
-			if (x1 + w <= extents->x1 || y1 + h <= extents->y1)
+			if (x1 >= extents->x2 || y1 >= extents->y2 ||
+			    x1 + w <= extents->x1 || y1 + h <= extents->y1) {
+				DBG(("%s: glyph is clipped (%d, %d)x(%d,%d) against extents (%d, %d), (%d, %d)\n",
+				     __FUNCTION__,
+				     x1, y1, w, h,
+				     extents->x1, extents->y1,
+				     extents->x2, extents->y2));
 				goto skip;
+			}
 
 			if (!transparent) {
 				int clear = 1, j = h;
@@ -10976,12 +10981,15 @@ sna_reversed_glyph_blt(DrawablePtr drawable, GCPtr gc,
 				do {
 					i = w8;
 					do {
-						clear = *g == 0;
+						clear = *g++ == 0;
 					} while (clear && --i);
 					g += stride - w8;
 				} while (clear && --j);
-				if (clear)
+				if (clear) {
+					DBG(("%s: skipping clear glyph for ImageGlyph\n",
+					     __FUNCTION__));
 					goto skip;
+				}
 			}
 
 			if (!kgem_check_batch(&sna->kgem, 3+len)) {
commit 7dab0616633e0e8099f1a79dcfabb82ab1ddad86
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Mar 22 11:21:43 2012 +0000

    sna: Adjust the damage region for the composite offset
    
    References: https://bugs.freedesktop.org/show_bug.cgi?id=47597
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
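
The subtlety is coordinate spaces: CPU damage is tracked in pixmap
coordinates, while the composite region arrives in drawable coordinates. A
minimal sketch of the translate/subtract/translate-back pattern, using plain
pixman in place of the driver's sna_damage_subtract:

    #include <stdint.h>
    #include <pixman.h>

    /* Shift the region from drawable space into pixmap space by (dx, dy),
     * subtract it from the damage, then shift it back so the caller still
     * sees drawable coordinates.
     */
    static void subtract_damage(pixman_region16_t *damage,
                                pixman_region16_t *region,
                                int16_t dx, int16_t dy)
    {
        if (dx | dy)
            pixman_region_translate(region, dx, dy);

        pixman_region_subtract(damage, damage, region);

        if (dx | dy)
            pixman_region_translate(region, -dx, -dy);
    }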

diff --git a/src/sna/sna_composite.c b/src/sna/sna_composite.c
index e5031c0..d281776 100644
--- a/src/sna/sna_composite.c
+++ b/src/sna/sna_composite.c
@@ -487,9 +487,19 @@ sna_composite(CARD8 op,
 	     get_drawable_dx(dst->pDrawable),
 	     get_drawable_dy(dst->pDrawable)));
 
-	if (op <= PictOpSrc)
+	if (op <= PictOpSrc) {
+		int16_t x, y;
+
+		get_drawable_deltas(dst->pDrawable, pixmap, &x, &y);
+		if (x|y)
+			pixman_region_translate(&region, x, y);
+
 		sna_damage_subtract(&priv->cpu_damage, &region);
 
+		if (x|y)
+			pixman_region_translate(&region, -x, -y);
+	}
+
 	memset(&tmp, 0, sizeof(tmp));
 	if (!sna->render.composite(sna,
 				   op, src, mask, dst,
commit b3f5d57d6498919441bad88a64ff461e8e67fd5d
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Mar 22 09:22:52 2012 +0000

    sna: Force fallbacks if the destination is unattached
    
    Since the removal of the ability to create a backing pixmap after the
    creation of its parent, it is no longer practical to attempt
    rendering with the GPU to unattached pixmaps. So having made the
    decision never to render to that pixmap, perform the test explicitly
    along the render paths.
    
    This fixes a segmentation fault introduced in 8a303f195 (sna: Remove
    existing damage before overwriting with a composite op) which assumed
    the existence of a backing pixmap along a render path.
    
    Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=47700
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_composite.c b/src/sna/sna_composite.c
index b098fcc..e5031c0 100644
--- a/src/sna/sna_composite.c
+++ b/src/sna/sna_composite.c
@@ -413,7 +413,9 @@ sna_composite(CARD8 op,
 	      INT16 dst_x,  INT16 dst_y,
 	      CARD16 width, CARD16 height)
 {
-	struct sna *sna = to_sna_from_drawable(dst->pDrawable);
+	PixmapPtr pixmap = get_drawable_pixmap(dst->pDrawable);
+	struct sna *sna = to_sna_from_pixmap(pixmap);
+	struct sna_pixmap *priv;
 	struct sna_composite_op tmp;
 	unsigned flags;
 	RegionRec region;
@@ -462,8 +464,14 @@ sna_composite(CARD8 op,
 		goto fallback;
 	}
 
-	if (too_small(dst->pDrawable) &&
-	    !picture_is_gpu(src) && !picture_is_gpu(mask)) {
+	priv = sna_pixmap(pixmap);
+	if (priv == NULL) {
+		DBG(("%s: fallback as destination is unattached\n",
+		     __FUNCTION__));
+		goto fallback;
+	}
+
+	if (too_small(priv) && !picture_is_gpu(src) && !picture_is_gpu(mask)) {
 		DBG(("%s: fallback due to too small\n", __FUNCTION__));
 		goto fallback;
 	}
@@ -479,10 +487,8 @@ sna_composite(CARD8 op,
 	     get_drawable_dx(dst->pDrawable),
 	     get_drawable_dy(dst->pDrawable)));
 
-	if (op <= PictOpSrc) {
-		struct sna_pixmap *priv = sna_pixmap_from_drawable(dst->pDrawable);
+	if (op <= PictOpSrc)
 		sna_damage_subtract(&priv->cpu_damage, &region);
-	}
 
 	memset(&tmp, 0, sizeof(tmp));
 	if (!sna->render.composite(sna,
@@ -752,17 +758,17 @@ sna_composite_rectangles(CARD8		 op,
 
 	boxes = pixman_region_rectangles(&region, &num_boxes);
 
-	if (too_small(dst->pDrawable)) {
-		DBG(("%s: fallback, dst is too small\n", __FUNCTION__));
-		goto fallback;
-	}
-
 	priv = sna_pixmap(pixmap);
 	if (priv == NULL) {
 		DBG(("%s: fallback, not attached\n", __FUNCTION__));
 		goto fallback;
 	}
 
+	if (too_small(priv)) {
+		DBG(("%s: fallback, dst is too small\n", __FUNCTION__));
+		goto fallback;
+	}
+
 	/* If we going to be overwriting any CPU damage with a subsequent
 	 * operation, then we may as well delete it without moving it
 	 * first to the GPU.
diff --git a/src/sna/sna_glyphs.c b/src/sna/sna_glyphs.c
index 1c536c8..235528c 100644
--- a/src/sna/sna_glyphs.c
+++ b/src/sna/sna_glyphs.c
@@ -1217,7 +1217,9 @@ sna_glyphs(CARD8 op,
 	   INT16 src_x, INT16 src_y,
 	   int nlist, GlyphListPtr list, GlyphPtr *glyphs)
 {
-	struct sna *sna = to_sna_from_drawable(dst->pDrawable);
+	PixmapPtr pixmap = get_drawable_pixmap(dst->pDrawable);
+	struct sna *sna = to_sna_from_pixmap(pixmap);
+	struct sna_pixmap *priv;
 	PictFormatPtr _mask;
 
 	DBG(("%s(op=%d, nlist=%d, src=(%d, %d))\n",
@@ -1234,14 +1236,20 @@ sna_glyphs(CARD8 op,
 		goto fallback;
 	}
 
-	if (too_small(dst->pDrawable) && !picture_is_gpu(src)) {
-		DBG(("%s: fallback -- too small (%dx%d)\n",
-		     __FUNCTION__, dst->pDrawable->width, dst->pDrawable->height));
+	if (dst->alphaMap) {
+		DBG(("%s: fallback -- dst alpha map\n", __FUNCTION__));
 		goto fallback;
 	}
 
-	if (dst->alphaMap) {
-		DBG(("%s: fallback -- dst alpha map\n", __FUNCTION__));
+	priv = sna_pixmap(pixmap);
+	if (priv == NULL) {
+		DBG(("%s: fallback -- destination unattached\n", __FUNCTION__));
+		goto fallback;
+	}
+
+	if (too_small(priv) && !picture_is_gpu(src)) {
+		DBG(("%s: fallback -- too small (%dx%d)\n",
+		     __FUNCTION__, dst->pDrawable->width, dst->pDrawable->height));
 		goto fallback;
 	}
 
diff --git a/src/sna/sna_render_inline.h b/src/sna/sna_render_inline.h
index 88d0130..956e2aa 100644
--- a/src/sna/sna_render_inline.h
+++ b/src/sna/sna_render_inline.h
@@ -101,13 +101,10 @@ is_dirty(DrawablePtr drawable)
 	return priv == NULL || kgem_bo_is_dirty(priv->gpu_bo);
 }
 
-static inline Bool
-too_small(DrawablePtr drawable)
+static inline bool
+too_small(struct sna_pixmap *priv)
 {
-	struct sna_pixmap *priv = sna_pixmap_from_drawable(drawable);
-
-	if (priv == NULL)
-		return true;
+	assert(priv);
 
 	if (priv->gpu_damage)
 		return false;
diff --git a/src/sna/sna_trapezoids.c b/src/sna/sna_trapezoids.c
index 7048cae..9ab5ae2 100644
--- a/src/sna/sna_trapezoids.c
+++ b/src/sna/sna_trapezoids.c
@@ -4435,7 +4435,9 @@ sna_composite_trapezoids(CARD8 op,
 			 INT16 xSrc, INT16 ySrc,
 			 int ntrap, xTrapezoid *traps)
 {
-	struct sna *sna = to_sna_from_drawable(dst->pDrawable);
+	PixmapPtr pixmap = get_drawable_pixmap(dst->pDrawable);
+	struct sna *sna = to_sna_from_pixmap(pixmap);
+	struct sna_pixmap *priv;
 	bool rectilinear, pixel_aligned;
 	unsigned flags;
 	int n;
@@ -4461,7 +4463,13 @@ sna_composite_trapezoids(CARD8 op,
 		goto fallback;
 	}
 
-	if (too_small(dst->pDrawable) && !picture_is_gpu(src)) {
+	priv = sna_pixmap(pixmap);
+	if (priv == NULL) {
+		DBG(("%s: fallback -- dst is unattached\n", __FUNCTION__));
+		goto fallback;
+	}
+
+	if (too_small(priv) && !picture_is_gpu(src)) {
 		DBG(("%s: fallback -- dst is too small, %dx%d\n",
 		     __FUNCTION__,
 		     dst->pDrawable->width,
commit e2917f818eb07417e514c70d0abf498a9c291185
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Mar 21 13:31:03 2012 +0000

    sna: Assert that the bo created is large enough
    
    Double check that the maximum access size computed from the bo
    parameters is within the allocated size for the bo.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
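
In plain form, the invariant being asserted is pitch * aligned_height <=
allocated size: the hardware may touch pitch bytes for every row of the
tile-aligned height. For example, a 1024x768 X-tiled 32bpp surface has a
4096-byte pitch and a height that is already a multiple of 8 rows, so
4096 * 768 = 3 MiB must fit in the allocation. A sketch (kgem derives the
aligned height from the tiling mode):

    #include <assert.h>
    #include <stddef.h>

    /* Every addressable byte of a (pitch, aligned_height) surface must
     * lie inside the bo's allocation.
     */
    static void assert_bo_fits(size_t pitch, size_t aligned_height,
                               size_t allocated_size)
    {
        assert(pitch * aligned_height <= allocated_size);
    }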

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index c022900..26abdd0 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -2509,6 +2509,7 @@ struct kgem_bo *kgem_create_2d(struct kgem *kgem,
 			bo->delta = 0;
 			DBG(("  1:from active: pitch=%d, tiling=%d, handle=%d, id=%d\n",
 			     bo->pitch, bo->tiling, bo->handle, bo->unique_id));
+			assert(bo->pitch*kgem_aligned_height(kgem, height, bo->tiling) <= kgem_bo_size(bo));
 			bo->refcnt = 1;
 			return bo;
 		}
@@ -2561,6 +2562,7 @@ struct kgem_bo *kgem_create_2d(struct kgem *kgem,
 				     bo->pitch, bo->tiling, bo->handle, bo->unique_id));
 				assert(bo->reusable);
 				assert(bo->domain != DOMAIN_GPU && !kgem_busy(kgem, bo->handle));
+				assert(bo->pitch*kgem_aligned_height(kgem, height, bo->tiling) <= kgem_bo_size(bo));
 				bo->refcnt = 1;
 				return bo;
 			}
@@ -2615,6 +2617,7 @@ search_again:
 			bo->delta = 0;
 			DBG(("  1:from active: pitch=%d, tiling=%d, handle=%d, id=%d\n",
 			     bo->pitch, bo->tiling, bo->handle, bo->unique_id));
+			assert(bo->pitch*kgem_aligned_height(kgem, height, bo->tiling) <= kgem_bo_size(bo));
 			bo->refcnt = 1;
 			return bo;
 		}
@@ -2636,6 +2639,7 @@ search_again:
 			bo->delta = 0;
 			DBG(("  1:from active: pitch=%d, tiling=%d, handle=%d, id=%d\n",
 			     bo->pitch, bo->tiling, bo->handle, bo->unique_id));
+			assert(bo->pitch*kgem_aligned_height(kgem, height, bo->tiling) <= kgem_bo_size(bo));
 			bo->refcnt = 1;
 			return bo;
 		}
@@ -2669,6 +2673,7 @@ search_again:
 					bo->delta = 0;
 					DBG(("  1:from active: pitch=%d, tiling=%d, handle=%d, id=%d\n",
 					     bo->pitch, bo->tiling, bo->handle, bo->unique_id));
+					assert(bo->pitch*kgem_aligned_height(kgem, height, bo->tiling) <= kgem_bo_size(bo));
 					bo->refcnt = 1;
 					return bo;
 				}
@@ -2715,6 +2720,7 @@ search_again:
 				bo->delta = 0;
 				DBG(("  1:from active: pitch=%d, tiling=%d, handle=%d, id=%d\n",
 				     bo->pitch, bo->tiling, bo->handle, bo->unique_id));
+				assert(bo->pitch*kgem_aligned_height(kgem, height, bo->tiling) <= kgem_bo_size(bo));
 				bo->refcnt = 1;
 				return bo;
 			}
@@ -2773,6 +2779,7 @@ search_inactive:
 		assert((flags & CREATE_INACTIVE) == 0 || bo->domain != DOMAIN_GPU);
 		assert((flags & CREATE_INACTIVE) == 0 ||
 		       !kgem_busy(kgem, bo->handle));
+		assert(bo->pitch*kgem_aligned_height(kgem, height, bo->tiling) <= kgem_bo_size(bo));
 		bo->refcnt = 1;
 		return bo;
 	}
commit ad587bc9a53f10b6c4f91723b09c116420d7fde9
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Mar 21 13:30:45 2012 +0000

    sna: Assert that the tiled blt is correctly clipped
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 33e08be..b8f5059 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -8697,6 +8697,11 @@ sna_poly_fill_rect_tiled_blt(DrawablePtr drawable,
 			if (tile_y < 0)
 				tile_y += tile_height;
 
+			assert(r.x + dx >= 0);
+			assert(r.y + dy >= 0);
+			assert(r.x + dx + r.width  <= pixmap->drawable.width);
+			assert(r.y + dy + r.height <= pixmap->drawable.height);
+
 			r.y += dy;
 			do {
 				int16_t width = r.width;
commit ba111177f777fd0dcf196536de314adf948db966
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Mar 21 09:06:10 2012 +0000

    sna: Fallback to inplace upload if forced to tile the indirect replacement
    
    Reported-by: Clemens Eisserer <linuxhippy at gmail.com>
    Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=47629
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_io.c b/src/sna/sna_io.c
index 62a8962..4f5a634 100644
--- a/src/sna/sna_io.c
+++ b/src/sna/sna_io.c
@@ -1072,6 +1072,7 @@ indirect_replace(struct sna *sna,
 {
 	struct kgem *kgem = &sna->kgem;
 	struct kgem_bo *src_bo;
+	BoxRec box;
 	void *ptr;
 	bool ret;
 
@@ -1083,103 +1084,37 @@ indirect_replace(struct sna *sna,
 	if ((int)pixmap->devKind * pixmap->drawable.height >> 12 > kgem->half_cpu_cache_pages)
 		return false;
 
-	if (kgem->ring == KGEM_RENDER || !kgem_bo_can_blt(kgem, bo)) {
-		BoxRec box;
-
-		assert(!must_tile(sna,
-				  pixmap->drawable.width,
-				  pixmap->drawable.height));
-
-		src_bo = kgem_create_buffer_2d(kgem,
-					       pixmap->drawable.width,
-					       pixmap->drawable.height,
-					       pixmap->drawable.bitsPerPixel,
-					       KGEM_BUFFER_WRITE_INPLACE,
-					       &ptr);
-		if (!src_bo)
-			return false;
-
-		memcpy_blt(src, ptr, pixmap->drawable.bitsPerPixel,
-			   stride, src_bo->pitch,
-			   0, 0,
-			   0, 0,
-			   pixmap->drawable.width,
-			   pixmap->drawable.height);
-
-		box.x1 = box.y1 = 0;
-		box.x2 = pixmap->drawable.width;
-		box.y2 = pixmap->drawable.height;
-
-		ret = sna->render.copy_boxes(sna, GXcopy,
-					     pixmap, src_bo, 0, 0,
-					     pixmap, bo, 0, 0,
-					     &box, 1);
-	} else {
-		uint32_t cmd, br13, *b;
-		int pitch;
-
-		pitch = pixmap->drawable.width * pixmap->drawable.bitsPerPixel;
-		pitch = ALIGN(pitch, 32) >> 3;
+	if (!kgem_bo_can_blt(kgem, bo) &&
+	    must_tile(sna, pixmap->drawable.width, pixmap->drawable.height))
+		return false;
 
-		src_bo = kgem_create_buffer(kgem,
-					    pitch * pixmap->drawable.height,
-					    KGEM_BUFFER_WRITE_INPLACE,
-					    &ptr);
-		if (!src_bo)
-			return false;
+	src_bo = kgem_create_buffer_2d(kgem,
+				       pixmap->drawable.width,
+				       pixmap->drawable.height,
+				       pixmap->drawable.bitsPerPixel,
+				       KGEM_BUFFER_WRITE_INPLACE,
+				       &ptr);
+	if (!src_bo)
+		return false;
 
-		memcpy_blt(src, ptr, pixmap->drawable.bitsPerPixel,
-			   stride, pitch,
-			   0, 0,
-			   0, 0,
-			   pixmap->drawable.width,
-			   pixmap->drawable.height);
-
-		cmd = XY_SRC_COPY_BLT_CMD;
-		br13 = bo->pitch;
-		if (kgem->gen >= 40 && bo->tiling) {
-			cmd |= BLT_DST_TILED;
-			br13 >>= 2;
-		}
-		br13 |= 0xcc << 16;
-		switch (pixmap->drawable.bitsPerPixel) {
-		default:
-		case 32: cmd |= BLT_WRITE_ALPHA | BLT_WRITE_RGB;
-			 br13 |= 1 << 25; /* RGB8888 */
-		case 16: br13 |= 1 << 24; /* RGB565 */
-		case 8: break;
-		}
+	memcpy_blt(src, ptr, pixmap->drawable.bitsPerPixel,
+		   stride, src_bo->pitch,
+		   0, 0,
+		   0, 0,
+		   pixmap->drawable.width,
+		   pixmap->drawable.height);
 
-		kgem_set_mode(kgem, KGEM_BLT);
-		if (kgem->nexec + 2 > KGEM_EXEC_SIZE(kgem) ||
-		    kgem->nreloc + 2 > KGEM_RELOC_SIZE(kgem) ||
-		    !kgem_check_batch(kgem, 8) ||
-		    !kgem_check_bo_fenced(kgem, bo, NULL)) {
-			_kgem_submit(kgem);
-			_kgem_set_mode(kgem, KGEM_BLT);
-		}
+	box.x1 = box.y1 = 0;
+	box.x2 = pixmap->drawable.width;
+	box.y2 = pixmap->drawable.height;
 
-		b = kgem->batch + kgem->nbatch;
-		b[0] = cmd;
-		b[1] = br13;
-		b[2] = 0;
-		b[3] = pixmap->drawable.height << 16 | pixmap->drawable.width;
-		b[4] = kgem_add_reloc(kgem, kgem->nbatch + 4, bo,
-				      I915_GEM_DOMAIN_RENDER << 16 |
-				      I915_GEM_DOMAIN_RENDER |
-				      KGEM_RELOC_FENCED,
-				      0);
-		b[5] = 0;
-		b[6] = pitch;
-		b[7] = kgem_add_reloc(kgem, kgem->nbatch + 7, src_bo,
-				      I915_GEM_DOMAIN_RENDER << 16 |
-				      KGEM_RELOC_FENCED,
-				      0);
-		kgem->nbatch += 8;
-		ret = true;
-	}
+	ret = sna->render.copy_boxes(sna, GXcopy,
+				     pixmap, src_bo, 0, 0,
+				     pixmap, bo, 0, 0,
+				     &box, 1);
 
 	kgem_bo_destroy(kgem, src_bo);
+
 	return ret;
 }
 
commit beae5c80cd35802921658022921b14005f0f29e8
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Mar 20 22:57:50 2012 +0000

    sna: Tidy an assertion when handling tiled copies
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index bb09214..c022900 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -1095,6 +1095,8 @@ static void __kgem_bo_destroy(struct kgem *kgem, struct kgem_bo *bo)
 		goto destroy;
 
 	if (bo->vmap) {
+		DBG(("%s: handle=%d is vmapped, tracking until free\n",
+		     __FUNCTION__, bo->handle));
 		if (bo->rq == NULL) {
 			if (bo->needs_flush && kgem_busy(kgem, bo->handle)) {
 				list_add(&bo->request, &kgem->flushing);
diff --git a/src/sna/sna_tiling.c b/src/sna/sna_tiling.c
index 493e313..dcb4d1d 100644
--- a/src/sna/sna_tiling.c
+++ b/src/sna/sna_tiling.c
@@ -389,8 +389,8 @@ sna_tiling_fill_boxes(struct sna *sna,
 				int16_t dx = this.extents.x1;
 				int16_t dy = this.extents.y1;
 
+				assert(kgem_bo_can_blt(&sna->kgem, bo));
 				assert(bo->pitch <= 8192);
-				assert(bo->tiling != I915_TILING_Y);
 
 				if (!sna->render.copy_boxes(sna, GXcopy,
 							     dst, dst_bo, 0, 0,
commit c69265439b91f83f59f232bb9762aebb49885d81
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Mar 21 01:01:47 2012 +0000

    sna: Remove the short-circuiting of move-to-cpu for read if no damage
    
    The danger now is that we may have either discarded the shadow pixels or
    have replaced them with a GTT mapping, either way undesirable and so we
    should reconstruct the shadow mapping.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna.h b/src/sna/sna.h
index 55c5f5a..949c18f 100644
--- a/src/sna/sna.h
+++ b/src/sna/sna.h
@@ -435,7 +435,7 @@ static inline bool must_check sna_pixmap_move_to_cpu(PixmapPtr pixmap, unsigned
 {
 	if (flags == MOVE_READ) {
 		struct sna_pixmap *priv = sna_pixmap(pixmap);
-		if (priv == NULL || priv->gpu_damage == NULL)
+		if (priv == NULL)
 			return true;
 	}
 
commit 144db688db25dfe452294fcc593fbcd0b1d355d1
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Mar 20 21:36:26 2012 +0000

    sna: Decouple the private data after UnrealizeFont
    
    As the font is kept around and reused after UnrealizeFont, we need to
    nullify the pointer to our private data in order to prevent the later
    use-after-free.
    
    Reported-by: Peter Jordan
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
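
A sketch of the resulting pattern (FontSetPrivate is the server API used in
the diff; the private struct and helper name here are illustrative): the
FontRec outlives UnrealizeFont, so a stale private pointer would be
dereferenced on the font's next use, after its data had been freed:

    /* Assumes the usual xserver font headers (fontstruct.h et al). */
    static Bool sketch_unrealize_font(FontPtr font, int key)
    {
        struct sketch_font_priv *priv = FontGetPrivate(font, key);

        if (priv) {
            free(priv);
            FontSetPrivate(font, key, NULL); /* decouple before reuse */
        }
        return TRUE;
    }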

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index c7f041e..33e08be 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -10183,6 +10183,7 @@ sna_unrealize_font(ScreenPtr screen, FontPtr font)
 		for (n = 0; n < 256; n++)
 			free(priv->glyphs16[n]);
 		free(priv);
+		FontSetPrivate(font, sna_font_key, NULL);
 	}
 
 	return TRUE;
commit 1c3c49d125ce325f379fe306dc31e9b2069fe0a6
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Mar 20 11:08:44 2012 +0000

    sna/traps: Remove bogus assertion
    
    As we only enter that path for singular unbounded boxes, we are
    guaranteed to fill the entire trapezoid extents and so do not need the
    unbounded fixup the assertion was fretting about.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_trapezoids.c b/src/sna/sna_trapezoids.c
index 4752e48..7048cae 100644
--- a/src/sna/sna_trapezoids.c
+++ b/src/sna/sna_trapezoids.c
@@ -3099,7 +3099,6 @@ composite_unaligned_boxes(struct sna *sna,
 		if (priv->clear && priv->clear_color == 0)
 			return true;
 	}
-	assert((priv->clear && priv->clear_color == 0) || operator_is_bounded(op));
 
 	memset(&tmp, 0, sizeof(tmp));
 	if (!sna->render.composite_spans(sna, op, src, dst,
commit fcc14e01bde93d4c0adda7398688f9326e0b0fef
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon Mar 19 15:51:43 2012 +0000

    uxa: Defer the call to EnterVT till after outputs are initialised
    
    Apparently we need to do this, or else we never perform the VT switch.
    However, we cannot do it too early, especially not before we have
    finished initialising the outputs.
    
    Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=47395
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/intel_driver.c b/src/intel_driver.c
index f454f92..9124fdf 100644
--- a/src/intel_driver.c
+++ b/src/intel_driver.c
@@ -1069,13 +1069,6 @@ I830ScreenInit(int scrnIndex, ScreenPtr screen, int argc, char **argv)
 			   "Hardware cursor initialization failed\n");
 	}
 
-	/* Must force it before EnterVT, so we are in control of VT and
-	 * later memory should be bound when allocating, e.g rotate_mem */
-	scrn->vtSema = TRUE;
-
-	if (!I830EnterVT(scrnIndex, 0))
-		return FALSE;
-
 	intel->BlockHandler = screen->BlockHandler;
 	screen->BlockHandler = I830BlockHandler;
 
@@ -1148,7 +1141,11 @@ I830ScreenInit(int scrnIndex, ScreenPtr screen, int argc, char **argv)
 	I830UeventInit(scrn);
 #endif
 
-	return TRUE;
+	/* Must force it before EnterVT, so we are in control of VT and
+	 * later memory should be bound when allocating, e.g rotate_mem */
+	scrn->vtSema = TRUE;
+
+	return I830EnterVT(scrnIndex, 0);
 }
 
 static void i830AdjustFrame(int scrnIndex, int x, int y, int flags)
commit 36de1def6ae926d635efa497f9ad45b3539b2c6d
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon Mar 19 14:38:28 2012 +0000

    sna: Declare videoRam correctly on gen2 devices
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
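
For context: kgem's gen is the generation times ten (30 is gen3, 45 is
gen4.5), and the GTT aperture moved from PCI BAR 0 on gen2 to BAR 2 from gen3
onwards, which is what the new helper selects. A self-contained sketch with a
simplified pci type:

    #include <stddef.h>

    struct fake_pci_device { struct { size_t size; } regions[6]; };

    /* gen2 exposes the aperture in BAR 0; gen3+ moved it to BAR 2.
     * gen is generation*10, so "< 30" means pre-gen3.
     */
    static size_t aperture_size(const struct fake_pci_device *dev, int gen)
    {
        return dev->regions[gen < 30 ? 0 : 2].size;
    }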

diff --git a/src/sna/sna_driver.c b/src/sna/sna_driver.c
index 7cf3ef1..7b3cfce 100644
--- a/src/sna/sna_driver.c
+++ b/src/sna/sna_driver.c
@@ -820,20 +820,25 @@ sna_register_all_privates(void)
 	return TRUE;
 }
 
+static size_t
+agp_aperture_size(struct pci_device *dev, int gen)
+{
+	return dev->regions[gen < 30 ? 0 : 2].size;
+}
+
 static Bool
 sna_screen_init(int scrnIndex, ScreenPtr screen, int argc, char **argv)
 {
 	ScrnInfoPtr scrn = xf86Screens[screen->myNum];
 	struct sna *sna = to_sna(scrn);
 	VisualPtr visual;
-	struct pci_device *const device = sna->PciInfo;
 
 	DBG(("%s\n", __FUNCTION__));
 
 	if (!sna_register_all_privates())
 		return FALSE;
 
-	scrn->videoRam = device->regions[2].size / 1024;
+	scrn->videoRam = agp_aperture_size(sna->PciInfo, sna->kgem.gen) / 1024;
 
 	miClearVisualTypes();
 	if (!miSetVisualTypes(scrn->depth,
commit 26d110dd292061cde9b322080fe1ffb5bf651a26
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon Mar 19 08:25:22 2012 +0000

    sna/dri: Mark the drawable as damaged for the off-screen immediate exchange
    
    In some cases off-screen content is still visible, for example under a rotation.
    As such xrandr -o left; glxgears -fullscreen was broken.
    
    Reported-by: Phillip Haddad <phillip.haddad at gmail.com>
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_dri.c b/src/sna/sna_dri.c
index e2eed1d..95ec07e 100644
--- a/src/sna/sna_dri.c
+++ b/src/sna/sna_dri.c
@@ -1294,12 +1294,24 @@ sna_dri_schedule_flip(ClientPtr client, DrawablePtr draw, DRI2BufferPtr front,
 	/* Drawable not displayed... just complete the swap */
 	pipe = sna_dri_get_pipe(draw);
 	if (pipe == -1) {
+		RegionRec region;
+
 		DBG(("%s: off-screen, immediate update\n", __FUNCTION__));
 
 		sna_dri_exchange_attachment(front, back);
 		get_private(back)->pixmap = get_private(front)->pixmap;
 		get_private(front)->pixmap = NULL;
 		set_bo(get_private(back)->pixmap, get_private(back)->bo);
+
+		/* XXX can we query whether we need to process damage? */
+		region.extents.x1 = draw->x;
+		region.extents.y1 = draw->y;
+		region.extents.x2 = draw->x + draw->width;
+		region.extents.y2 = draw->y + draw->height;
+		region.data = NULL;
+		DamageRegionAppend(draw, &region);
+		DamageRegionProcessPending(draw);
+
 		DRI2SwapComplete(client, draw, 0, 0, 0,
 				 DRI2_EXCHANGE_COMPLETE, func, data);
 		return TRUE;
commit 591bcba254e2ba386abd415cfec3e3d3e1446434
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sun Mar 18 13:23:26 2012 +0000

    sna/traps: Remove separate edge->vertical flag
    
    Mark vertical edges with dy==0 to reduce the structure size and the
    memory load during edge walking.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
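
A sketch of the per-scanline stepper after this change (fields mirror the
diff): verticality is implied by dy == 0 rather than stored in a separate
flag, so each edge is a little smaller and the hot loop reads one field less:

    struct quorem { int quo, rem; };
    struct sketch_edge { struct quorem x, dxdy; int dy; };

    /* Vertical edges (dy == 0) keep x constant; others advance x by the
     * quotient/remainder decomposition of dx/dy.
     */
    static void step_edge(struct sketch_edge *e)
    {
        if (e->dy) {
            e->x.quo += e->dxdy.quo;
            e->x.rem += e->dxdy.rem;
            if (e->x.rem >= 0) {
                ++e->x.quo;
                e->x.rem -= e->dy;
            }
        }
    }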

diff --git a/src/sna/sna_trapezoids.c b/src/sna/sna_trapezoids.c
index f2caf9a..4752e48 100644
--- a/src/sna/sna_trapezoids.c
+++ b/src/sna/sna_trapezoids.c
@@ -190,7 +190,6 @@ struct edge {
 	struct edge *next, *prev;
 
 	int dir;
-	int vertical;
 
 	grid_scaled_y_t height_left;
 
@@ -713,13 +712,12 @@ polygon_add_edge(struct polygon *polygon,
 	e->height_left = ybot - ytop;
 
 	if (dx == 0) {
-		e->vertical = true;
 		e->x.quo = x1;
 		e->x.rem = 0;
+		e->dy = 0;
 		e->dxdy.quo = 0;
 		e->dxdy.rem = 0;
 	} else {
-		e->vertical = false;
 		e->dxdy = floored_divrem(dx, dy);
 		if (ytop == y1) {
 			e->x.quo = x1;
@@ -776,13 +774,12 @@ polygon_add_line(struct polygon *polygon,
 	e->height_left = bot - top;
 
 	if (dx == 0) {
-		e->vertical = true;
 		e->x.quo = p1->x;
 		e->x.rem = -dy;
 		e->dxdy.quo = 0;
 		e->dxdy.rem = 0;
+		e->dy = 0;
 	} else {
-		e->vertical = false;
 		e->dxdy = floored_divrem(dx, dy);
 		if (top == p1->y) {
 			e->x.quo = p1->x;
@@ -819,16 +816,16 @@ polygon_add_line(struct polygon *polygon,
 static void
 active_list_reset(struct active_list *active)
 {
-	active->head.vertical = 1;
 	active->head.height_left = INT_MAX;
 	active->head.x.quo = INT_MIN;
+	active->head.dy = 0;
 	active->head.prev = NULL;
 	active->head.next = &active->tail;
 	active->tail.prev = &active->head;
 	active->tail.next = NULL;
 	active->tail.x.quo = INT_MAX;
 	active->tail.height_left = INT_MAX;
-	active->tail.vertical = 1;
+	active->tail.dy = 0;
 	active->min_height = INT_MAX;
 	active->is_vertical = 1;
 }
@@ -934,7 +931,7 @@ can_full_step(struct active_list *active)
 		for (e = active->head.next; &active->tail != e; e = e->next) {
 			if (e->height_left < min_height)
 				min_height = e->height_left;
-			is_vertical &= e->vertical;
+			is_vertical &= e->dy == 0;
 		}
 
 		active->is_vertical = is_vertical;
@@ -971,7 +968,7 @@ fill_buckets(struct active_list *active,
 		*b = edge;
 		if (edge->height_left < min_height)
 			min_height = edge->height_left;
-		is_vertical &= edge->vertical;
+		is_vertical &= edge->dy == 0;
 		edge = next;
 	}
 
@@ -1002,7 +999,7 @@ nonzero_subrow(struct active_list *active, struct cell_list *coverages)
 			xstart = edge->x.quo;
 
 		if (--edge->height_left) {
-			if (!edge->vertical) {
+			if (edge->dy) {
 				edge->x.quo += edge->dxdy.quo;
 				edge->x.rem += edge->dxdy.rem;
 				if (edge->x.rem >= 0) {
@@ -1595,7 +1592,7 @@ inplace_subrow(struct active_list *active, int8_t *row,
 		}
 
 		if (--edge->height_left) {
-			if (!edge->vertical) {
+			if (edge->dy) {
 				edge->x.quo += edge->dxdy.quo;
 				edge->x.rem += edge->dxdy.rem;
 				if (edge->x.rem >= 0) {
@@ -1805,7 +1802,6 @@ struct mono_edge {
 
 	int32_t height_left;
 	int32_t dir;
-	int32_t vertical;
 
 	int32_t dy;
 	struct quorem x;
@@ -1925,14 +1921,12 @@ mono_add_line(struct mono *mono,
 	dy = p2->y - p1->y;
 
 	if (dx == 0) {
-		e->vertical = TRUE;
 		e->x.quo = p1->x;
 		e->x.rem = 0;
 		e->dxdy.quo = 0;
 		e->dxdy.rem = 0;
 		e->dy = 0;
 	} else {
-		e->vertical = FALSE;
 		e->dxdy = floored_muldivrem (dx, pixman_fixed_1, dy);
 		e->dy = dy;
 
@@ -2079,7 +2073,7 @@ mono_merge_edges(struct mono *c, struct mono_edge *edges)
 	DBG_MONO_EDGES(edges);
 
 	for (e = edges; c->is_vertical && e; e = e->next)
-		c->is_vertical = e->vertical;
+		c->is_vertical = e->dy == 0;
 
 	c->head.next = mono_merge_unsorted_edges(c->head.next, edges);
 }
@@ -2137,11 +2131,13 @@ mono_row(struct mono *c, int16_t y, int16_t h)
 		int16_t xend = I(edge->x.quo);
 
 		if (--edge->height_left) {
-			edge->x.quo += edge->dxdy.quo;
-			edge->x.rem += edge->dxdy.rem;
-			if (edge->x.rem >= 0) {
-				++edge->x.quo;
-				edge->x.rem -= edge->dy;
+			if (edge->dy) {
+				edge->x.quo += edge->dxdy.quo;
+				edge->x.rem += edge->dxdy.rem;
+				if (edge->x.rem >= 0) {
+					++edge->x.quo;
+					edge->x.rem -= edge->dy;
+				}
 			}
 
 			if (edge->x.quo < prev_x) {
@@ -2185,7 +2181,7 @@ mono_init(struct mono *c, int num_edges)
 	if (!mono_polygon_init(&c->polygon, &c->clip.extents, num_edges))
 		return false;
 
-	c->head.vertical = 1;
+	c->head.dy = 0;
 	c->head.height_left = INT_MAX;
 	c->head.x.quo = INT16_MIN << 16;
 	c->head.prev = NULL;
@@ -2194,7 +2190,7 @@ mono_init(struct mono *c, int num_edges)
 	c->tail.next = NULL;
 	c->tail.x.quo = INT16_MAX << 16;
 	c->tail.height_left = INT_MAX;
-	c->tail.vertical = 1;
+	c->tail.dy = 0;
 
 	c->is_vertical = 1;
 
commit c4773ba12b4b49fcdc602406559592c125161449
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sun Mar 18 11:10:14 2012 +0000

    sna/gen3: Improve clear-to-solid reduction
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/gen3_render.c b/src/sna/gen3_render.c
index 8c88722..b236761 100644
--- a/src/sna/gen3_render.c
+++ b/src/sna/gen3_render.c
@@ -2258,6 +2258,27 @@ gen3_init_radial(struct sna *sna,
 }
 
 static Bool
+sna_picture_is_clear(PicturePtr picture,
+		     int x, int y, int w, int h,
+		     uint32_t *color)
+{
+	struct sna_pixmap *priv;
+
+	if (!picture->pDrawable)
+		return FALSE;
+
+	priv = sna_pixmap(get_drawable_pixmap(picture->pDrawable));
+	if (priv == NULL || !priv->clear)
+		return FALSE;
+
+	if (!source_is_covered(picture, x, y, w, h))
+		return FALSE;
+
+	*color = priv->clear_color;
+	return TRUE;
+}
+
+static Bool
 gen3_composite_picture(struct sna *sna,
 		       PicturePtr picture,
 		       struct sna_composite_op *op,
@@ -2310,6 +2331,9 @@ gen3_composite_picture(struct sna *sna,
 	if (sna_picture_is_solid(picture, &color))
 		return gen3_init_solid(channel, color);
 
+	if (sna_picture_is_clear(picture, x, y, w, h, &color))
+		return gen3_init_solid(channel, color);
+
 	if (!gen3_check_repeat(picture))
 		return sna_render_picture_fixup(sna, picture, channel,
 						x, y, w, h, dst_x, dst_y);
commit b80350c606b8f8106c34b38d7f82b66047c30e3c
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sun Mar 18 10:55:17 2012 +0000

    sna/gen3: Do not force tiling for large pixmaps
    
    As the extraction routine is now smarter and can construct
    subsurfaces without copying, we no longer need to force tiling.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 8a2222c..bb09214 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -2300,17 +2300,11 @@ int kgem_choose_tiling(struct kgem *kgem, int tiling, int width, int height, int
 		return tiling < 0 ? tiling : I915_TILING_NONE;
 
 	if (kgem->gen < 40) {
-		if (tiling) {
-			if (width * bpp > 8192 * 8) {
-				DBG(("%s: pitch too large for tliing [%d]\n",
-				     __FUNCTION__, width*bpp/8));
-				tiling = I915_TILING_NONE;
-				goto done;
-			} else if ((width|height) > 2048) {
-				DBG(("%s: large buffer (%dx%d), forcing TILING_X\n",
-				     __FUNCTION__, width, height));
-				tiling = -I915_TILING_X;
-			}
+		if (tiling && width * bpp > 8192 * 8) {
+			DBG(("%s: pitch too large for tliing [%d]\n",
+			     __FUNCTION__, width*bpp/8));
+			tiling = I915_TILING_NONE;
+			goto done;
 		}
 	} else {
 		if (width*bpp > (MAXSHORT-512) * 8) {
commit 2c97fd847bdd4b199115edcfa476266284de4bce
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sun Mar 18 10:55:06 2012 +0000

    sna/gen3: Prevent copy-fallback if we cannot blit
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/gen3_render.c b/src/sna/gen3_render.c
index 67c8956..8c88722 100644
--- a/src/sna/gen3_render.c
+++ b/src/sna/gen3_render.c
@@ -3987,6 +3987,10 @@ gen3_render_copy_boxes(struct sna *sna, uint8_t alu,
 	    src_bo->pitch > MAX_3D_PITCH ||
 	    too_large(src->drawable.width, src->drawable.height)) {
 fallback_blt:
+		if (!kgem_bo_can_blt(&sna->kgem, src_bo) ||
+		    !kgem_bo_can_blt(&sna->kgem, dst_bo))
+			return FALSE;
+
 		return sna_blt_copy_boxes_fallback(sna, alu,
 						   src, src_bo, src_dx, src_dy,
 						   dst, dst_bo, dst_dx, dst_dy,
commit 48209ba1315c0a9d041b3e657c2a0a226aed2456
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sun Mar 18 10:55:27 2012 +0000

    sna: Fixup the cpu shadow mappings before uploading the box
    
    On the off-chance we arrive here with a pointer to the GTT mapping.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_render.c b/src/sna/sna_render.c
index 8be3e72..542cdb9 100644
--- a/src/sna/sna_render.c
+++ b/src/sna/sna_render.c
@@ -442,6 +442,7 @@ static struct kgem_bo *upload(struct sna *sna,
 			      PixmapPtr pixmap,
 			      BoxPtr box)
 {
+	struct sna_pixmap *priv;
 	struct kgem_bo *bo;
 
 	DBG(("%s: box=(%d, %d), (%d, %d), pixmap=%dx%d\n",
@@ -451,6 +452,19 @@ static struct kgem_bo *upload(struct sna *sna,
 	assert(box->x2 <= pixmap->drawable.width);
 	assert(box->y2 <= pixmap->drawable.height);
 
+	priv = sna_pixmap(pixmap);
+	if (priv) {
+		/* As we know this box is on the CPU just fixup the shadow */
+		if (priv->mapped) {
+			pixmap->devPrivate.ptr = NULL;
+			priv->mapped = false;
+		}
+		if (pixmap->devPrivate.ptr == NULL) {
+			pixmap->devPrivate.ptr = priv->ptr;
+			pixmap->devKind = priv->stride;
+		}
+	}
+
 	bo = kgem_upload_source_image(&sna->kgem,
 				      pixmap->devPrivate.ptr, box,
 				      pixmap->devKind,
@@ -463,13 +477,11 @@ static struct kgem_bo *upload(struct sna *sna,
 		channel->scale[0] = 1.f/channel->width;
 		channel->scale[1] = 1.f/channel->height;
 
-		if (pixmap->usage_hint == 0 &&
+		if (priv &&
+		    pixmap->usage_hint == 0 &&
 		    channel->width  == pixmap->drawable.width &&
-		    channel->height == pixmap->drawable.height) {
-			struct sna_pixmap *priv = sna_pixmap(pixmap);
-			if (priv)
-				kgem_proxy_bo_attach(bo, &priv->gpu_bo);
-		}
+		    channel->height == pixmap->drawable.height)
+			kgem_proxy_bo_attach(bo, &priv->gpu_bo);
 	}
 
 	return bo;
commit bae9ebeac057f29eb540d879c7e07292e22cb4af
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sun Mar 18 09:45:27 2012 +0000

    sna/traps: Apply some more operator and unbounded reductions
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
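
The reductions follow from Render algebra over a destination known to be all
zeroes: OVER and ADD of src onto 0 both yield src, so they degrade to SRC,
while IN multiplies src by dst's zero alpha and yields 0, so nothing needs
drawing at all. A hedged sketch merging the switches from the two call sites
(the unaligned-box path only promotes ADD; the span converters also promote
OVER):

    #include <stdbool.h>
    /* PictOp* values come from the X Render protocol headers. */

    static int reduce_op_on_clear_dst(int op, bool *noop)
    {
        *noop = false;
        switch (op) {
        case PictOpOver:  /* src + (1 - src.a) * 0 == src */
        case PictOpAdd:   /* src + 0 == src */
            return PictOpSrc;
        case PictOpIn:    /* src * 0 == 0: dest stays clear */
            *noop = true;
            return op;
        default:
            return op;
        }
    }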

diff --git a/src/sna/sna_trapezoids.c b/src/sna/sna_trapezoids.c
index 5594023..f2caf9a 100644
--- a/src/sna/sna_trapezoids.c
+++ b/src/sna/sna_trapezoids.c
@@ -2698,6 +2698,8 @@ composite_unaligned_box(struct sna *sna,
 			float opacity,
 			pixman_region16_t *clip)
 {
+	assert(opacity != 0.);
+
 	if (clip) {
 		pixman_region16_t region;
 
@@ -3017,6 +3019,7 @@ composite_unaligned_boxes(struct sna *sna,
 {
 	BoxRec extents;
 	struct sna_composite_spans_op tmp;
+	struct sna_pixmap *priv;
 	pixman_region16_t clip, *c;
 	int dst_x, dst_y;
 	int dx, dy, n;
@@ -3030,7 +3033,8 @@ composite_unaligned_boxes(struct sna *sna,
 	if (ntrap > 1 && maskFormat)
 		return false;
 
-	if (!sna->render.composite_spans)
+	priv = sna_pixmap(get_drawable_pixmap(dst->pDrawable));
+	if (priv == NULL || !sna->render.composite_spans)
 		return composite_unaligned_boxes_fallback(op, src, dst, src_x, src_y, ntrap, traps);
 
 	dst_x = extents.x1 = pixman_fixed_to_int(traps[0].left.p1.x);
@@ -3090,6 +3094,17 @@ composite_unaligned_boxes(struct sna *sna,
 	     src_x + extents.x1 - dst_x - dx,
 	     src_y + extents.y1 - dst_y - dy));
 
+	switch (op) {
+	case PictOpAdd:
+		if (priv->clear && priv->clear_color == 0)
+			op = PictOpSrc;
+		break;
+	case PictOpIn:
+		if (priv->clear && priv->clear_color == 0)
+			return true;
+	}
+	assert((priv->clear && priv->clear_color == 0) || operator_is_bounded(op));
+
 	memset(&tmp, 0, sizeof(tmp));
 	if (!sna->render.composite_spans(sna, op, src, dst,
 					 src_x + extents.x1 - dst_x - dx,
@@ -3321,6 +3336,7 @@ trapezoid_span_converter(CARD8 op, PicturePtr src, PicturePtr dst,
 	BoxRec extents;
 	pixman_region16_t clip;
 	int16_t dst_x, dst_y;
+	bool was_clear;
 	int dx, dy, n;
 
 	if (NO_SCAN_CONVERTER)
@@ -3388,6 +3404,19 @@ trapezoid_span_converter(CARD8 op, PicturePtr src, PicturePtr dst,
 	     src_x + extents.x1 - dst_x - dx,
 	     src_y + extents.y1 - dst_y - dy));
 
+	was_clear = sna_drawable_is_clear(dst->pDrawable);
+	switch (op) {
+	case PictOpAdd:
+	case PictOpOver:
+		if (was_clear)
+			op = PictOpSrc;
+		break;
+	case PictOpIn:
+		if (was_clear)
+			return true;
+		break;
+	}
+
 	memset(&tmp, 0, sizeof(tmp));
 	if (!sna->render.composite_spans(sna, op, src, dst,
 					 src_x + extents.x1 - dst_x - dx,
@@ -3422,7 +3451,7 @@ trapezoid_span_converter(CARD8 op, PicturePtr src, PicturePtr dst,
 
 	tor_render(sna, &tor, &tmp, &clip,
 		   choose_span(&tmp, dst, maskFormat, op, &clip),
-		   maskFormat && !operator_is_bounded(op));
+		   !was_clear && maskFormat && !operator_is_bounded(op));
 
 skip:
 	tor_fini(&tor);
@@ -5150,6 +5179,7 @@ triangles_span_converter(CARD8 op, PicturePtr src, PicturePtr dst,
 	pixman_region16_t clip;
 	int16_t dst_x, dst_y;
 	int dx, dy, n;
+	bool was_clear;
 
 	if (NO_SCAN_CONVERTER)
 		return false;
@@ -5216,6 +5246,8 @@ triangles_span_converter(CARD8 op, PicturePtr src, PicturePtr dst,
 	     src_x + extents.x1 - dst_x - dx,
 	     src_y + extents.y1 - dst_y - dy));
 
+	was_clear = sna_drawable_is_clear(dst->pDrawable);
+
 	memset(&tmp, 0, sizeof(tmp));
 	if (!sna->render.composite_spans(sna, op, src, dst,
 					 src_x + extents.x1 - dst_x - dx,
@@ -5248,7 +5280,7 @@ triangles_span_converter(CARD8 op, PicturePtr src, PicturePtr dst,
 
 	tor_render(sna, &tor, &tmp, &clip,
 		   choose_span(&tmp, dst, maskFormat, op, &clip),
-		   maskFormat && !operator_is_bounded(op));
+		   !was_clear && maskFormat && !operator_is_bounded(op));
 
 skip:
 	tor_fini(&tor);
@@ -5508,6 +5540,7 @@ tristrip_span_converter(CARD8 op, PicturePtr src, PicturePtr dst,
 	int16_t dst_x, dst_y;
 	int dx, dy;
 	int cw, ccw, n;
+	bool was_clear;
 
 	if (NO_SCAN_CONVERTER)
 		return false;
@@ -5569,6 +5602,8 @@ tristrip_span_converter(CARD8 op, PicturePtr src, PicturePtr dst,
 	     src_x + extents.x1 - dst_x - dx,
 	     src_y + extents.y1 - dst_y - dy));
 
+	was_clear = sna_drawable_is_clear(dst->pDrawable);
+
 	memset(&tmp, 0, sizeof(tmp));
 	if (!sna->render.composite_spans(sna, op, src, dst,
 					 src_x + extents.x1 - dst_x - dx,
@@ -5611,7 +5646,7 @@ tristrip_span_converter(CARD8 op, PicturePtr src, PicturePtr dst,
 
 	tor_render(sna, &tor, &tmp, &clip,
 		   choose_span(&tmp, dst, maskFormat, op, &clip),
-		   maskFormat && !operator_is_bounded(op));
+		   !was_clear && maskFormat && !operator_is_bounded(op));
 
 skip:
 	tor_fini(&tor);
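
The reductions above are Porter-Duff algebra against a destination known to be cleared to zero: ADD and OVER collapse to SRC, and IN becomes a no-op. A minimal standalone sketch of the same reasoning, using a one-channel 8-bit pixel model (names illustrative, not the driver's API):

    #include <assert.h>
    #include <stdint.h>

    enum op { OP_SRC, OP_OVER, OP_ADD, OP_IN };

    /* One colour channel with 8-bit alpha, enough to check:
     *   ADD:  min(dst + src, 255)    == src on dst == 0
     *   OVER: src + dst*(255-a)/255  == src on dst == 0
     *   IN:   src * dst_alpha / 255  == 0   on dst == 0
     */
    static uint8_t apply(enum op op, uint8_t src, uint8_t a, uint8_t dst)
    {
        switch (op) {
        case OP_SRC:  return src;
        case OP_ADD:  return src + dst > 255 ? 255 : src + dst;
        case OP_OVER: return src + dst * (255 - a) / 255;
        case OP_IN:   return src * dst / 255; /* dst doubles as its alpha */
        }
        return 0;
    }

    int main(void)
    {
        unsigned s;
        for (s = 0; s <= 255; s++) {
            assert(apply(OP_ADD,  s, s, 0) == apply(OP_SRC, s, s, 0));
            assert(apply(OP_OVER, s, s, 0) == apply(OP_SRC, s, s, 0));
            assert(apply(OP_IN,   s, s, 0) == 0); /* destination unchanged */
        }
        return 0;
    }
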
commit b8f2ecedb77bcf44aabfa9b8eaa94464c1083053
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sat Mar 17 21:49:56 2012 +0000

    sna/gen[345]: Convert CPU mappings to GTT for vertices on submit
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/gen3_render.c b/src/sna/gen3_render.c
index 0991a98..67c8956 100644
--- a/src/sna/gen3_render.c
+++ b/src/sna/gen3_render.c
@@ -1680,14 +1680,21 @@ static void gen3_vertex_close(struct sna *sna)
 
 	bo = sna->render.vbo;
 	if (bo) {
-		if (IS_CPU_MAP(bo->map) ||
-		    sna->render.vertex_size - sna->render.vertex_used < 64) {
-			DBG(("%s: discarding vbo (was CPU mapped)\n",
-			     __FUNCTION__));
+		if (sna->render.vertex_size - sna->render.vertex_used < 64) {
+			DBG(("%s: discarding full vbo\n", __FUNCTION__));
 			sna->render.vbo = NULL;
 			sna->render.vertices = sna->render.vertex_data;
 			sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data);
 			free_bo = bo;
+		} else if (IS_CPU_MAP(bo->map)) {
+			DBG(("%s: converting CPU map to GTT\n", __FUNCTION__));
+			sna->render.vertices = kgem_bo_map__gtt(&sna->kgem, bo);
+			if (sna->render.vertices == NULL) {
+				sna->render.vbo = NULL;
+				sna->render.vertices = sna->render.vertex_data;
+				sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data);
+				free_bo = bo;
+			}
 		}
 	} else {
 		if (sna->kgem.nbatch + sna->render.vertex_used <= sna->kgem.surface) {
@@ -1950,6 +1957,15 @@ gen3_render_reset(struct sna *sna)
 	state->last_floats_per_vertex = 0;
 	state->last_vertex_offset = 0;
 	state->vertex_offset = 0;
+
+	if (sna->render.vbo &&
+	    !kgem_bo_is_mappable(&sna->kgem, sna->render.vbo)) {
+		DBG(("%s: discarding unmappable vbo\n", __FUNCTION__));
+		kgem_bo_destroy(&sna->kgem, sna->render.vbo);
+		sna->render.vbo = NULL;
+		sna->render.vertices = sna->render.vertex_data;
+		sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data);
+	}
 }
 
 static void
diff --git a/src/sna/gen4_render.c b/src/sna/gen4_render.c
index a69852e..def5d19 100644
--- a/src/sna/gen4_render.c
+++ b/src/sna/gen4_render.c
@@ -419,9 +419,14 @@ static int gen4_vertex_finish(struct sna *sna)
 
 static void gen4_vertex_close(struct sna *sna)
 {
-	struct kgem_bo *bo;
+	struct kgem_bo *bo, *free_bo = NULL;
 	unsigned int i, delta = 0;
 
+	assert(sna->render_state.gen4.vertex_offset == 0);
+
+	DBG(("%s: used=%d, vbo active? %d\n",
+	     __FUNCTION__, sna->render.vertex_used, sna->render.vbo != NULL));
+
 	if (!sna->render.vertex_used) {
 		assert(sna->render.vbo == NULL);
 		assert(sna->render.vertices == sna->render.vertex_data);
@@ -429,10 +434,26 @@ static void gen4_vertex_close(struct sna *sna)
 		return;
 	}
 
-	DBG(("%s: used=%d\n", __FUNCTION__, sna->render.vertex_used));
-
 	bo = sna->render.vbo;
-	if (bo == NULL) {
+	if (bo) {
+		if (sna->render.vertex_size - sna->render.vertex_used < 64) {
+			DBG(("%s: discarding full vbo\n", __FUNCTION__));
+			sna->render.vbo = NULL;
+			sna->render.vertices = sna->render.vertex_data;
+			sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data);
+			free_bo = bo;
+		} else if (IS_CPU_MAP(bo->map)) {
+			DBG(("%s: converting CPU map to GTT\n", __FUNCTION__));
+			sna->render.vertices =
+				kgem_bo_map__gtt(&sna->kgem, sna->render.vbo);
+			if (sna->render.vertices == NULL) {
+				sna->render.vbo = NULL;
+				sna->render.vertices = sna->render.vertex_data;
+				sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data);
+				free_bo = bo;
+			}
+		}
+	} else {
 		if (sna->kgem.nbatch + sna->render.vertex_used <= sna->kgem.surface) {
 			DBG(("%s: copy to batch: %d @ %d\n", __FUNCTION__,
 			     sna->render.vertex_used, sna->kgem.nbatch));
@@ -449,10 +470,11 @@ static void gen4_vertex_close(struct sna *sna)
 						 sna->render.vertex_data,
 						 4*sna->render.vertex_used)) {
 				kgem_bo_destroy(&sna->kgem, bo);
-				goto reset;
+				bo = NULL;
 			}
 			DBG(("%s: new vbo: %d\n", __FUNCTION__,
 			     sna->render.vertex_used));
+			free_bo = bo;
 		}
 	}
 
@@ -471,17 +493,13 @@ static void gen4_vertex_close(struct sna *sna)
 		}
 	}
 
-	if (bo)
-		kgem_bo_destroy(&sna->kgem, bo);
-
-reset:
-	sna->render.vertex_used = 0;
-	sna->render.vertex_index = 0;
-	sna->render_state.gen4.vb_id = 0;
+	if (sna->render.vbo == NULL) {
+		sna->render.vertex_used = 0;
+		sna->render.vertex_index = 0;
+	}
 
-	sna->render.vbo = NULL;
-	sna->render.vertices = sna->render.vertex_data;
-	sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data);
+	if (free_bo)
+		kgem_bo_destroy(&sna->kgem, free_bo);
 }
 
 
@@ -3207,6 +3225,15 @@ static void gen4_render_reset(struct sna *sna)
 	sna->render_state.gen4.drawrect_offset = -1;
 	sna->render_state.gen4.drawrect_limit = -1;
 	sna->render_state.gen4.surface_table = -1;
+
+	if (sna->render.vbo &&
+	    !kgem_bo_is_mappable(&sna->kgem, sna->render.vbo)) {
+		DBG(("%s: discarding unmappable vbo\n", __FUNCTION__));
+		kgem_bo_destroy(&sna->kgem, sna->render.vbo);
+		sna->render.vbo = NULL;
+		sna->render.vertices = sna->render.vertex_data;
+		sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data);
+	}
 }
 
 static void gen4_render_fini(struct sna *sna)
diff --git a/src/sna/gen5_render.c b/src/sna/gen5_render.c
index 01604ef..565d22a 100644
--- a/src/sna/gen5_render.c
+++ b/src/sna/gen5_render.c
@@ -428,14 +428,22 @@ static void gen5_vertex_close(struct sna *sna)
 
 	bo = sna->render.vbo;
 	if (bo) {
-		if (IS_CPU_MAP(bo->map) ||
-		    sna->render.vertex_size - sna->render.vertex_used < 64) {
-			DBG(("%s: discarding vbo (was CPU mapped)\n",
-			     __FUNCTION__));
+		if (sna->render.vertex_size - sna->render.vertex_used < 64) {
+			DBG(("%s: discarding full vbo\n", __FUNCTION__));
 			sna->render.vbo = NULL;
 			sna->render.vertices = sna->render.vertex_data;
 			sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data);
 			free_bo = bo;
+		} else if (IS_CPU_MAP(bo->map)) {
+			DBG(("%s: converting CPU map to GTT\n", __FUNCTION__));
+			sna->render.vertices =
+				kgem_bo_map__gtt(&sna->kgem, sna->render.vbo);
+			if (sna->render.vertices == NULL) {
+				sna->render.vbo = NULL;
+				sna->render.vertices = sna->render.vertex_data;
+				sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data);
+				free_bo = bo;
+			}
 		}
 	} else {
 		if (sna->kgem.nbatch + sna->render.vertex_used <= sna->kgem.surface) {
@@ -3655,6 +3663,15 @@ static void gen5_render_reset(struct sna *sna)
 	sna->render_state.gen5.drawrect_offset = -1;
 	sna->render_state.gen5.drawrect_limit = -1;
 	sna->render_state.gen5.surface_table = -1;
+
+	if (sna->render.vbo &&
+	    !kgem_bo_is_mappable(&sna->kgem, sna->render.vbo)) {
+		DBG(("%s: discarding unmappable vbo\n", __FUNCTION__));
+		kgem_bo_destroy(&sna->kgem, sna->render.vbo);
+		sna->render.vbo = NULL;
+		sna->render.vertices = sna->render.vertex_data;
+		sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data);
+	}
 }
 
 static void gen5_render_fini(struct sna *sna)
diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index daca7af..8a2222c 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -3190,6 +3190,43 @@ void *kgem_bo_map(struct kgem *kgem, struct kgem_bo *bo)
 	return ptr;
 }
 
+void *kgem_bo_map__gtt(struct kgem *kgem, struct kgem_bo *bo)
+{
+	void *ptr;
+
+	DBG(("%s: handle=%d, offset=%d, tiling=%d, map=%p, domain=%d\n", __FUNCTION__,
+	     bo->handle, bo->presumed_offset, bo->tiling, bo->map, bo->domain));
+
+	assert(!bo->purged);
+	assert(bo->exec == NULL);
+	assert(list_is_empty(&bo->list));
+
+	if (IS_CPU_MAP(bo->map))
+		kgem_bo_release_map(kgem, bo);
+
+	ptr = bo->map;
+	if (ptr == NULL) {
+		assert(bytes(bo) <= kgem->aperture_mappable / 4);
+
+		kgem_trim_vma_cache(kgem, MAP_GTT, bucket(bo));
+
+		ptr = gem_mmap(kgem->fd, bo->handle, bytes(bo),
+			       PROT_READ | PROT_WRITE);
+		if (ptr == NULL)
+			return NULL;
+
+		/* Cache this mapping to avoid the overhead of an
+		 * excruciatingly slow GTT pagefault. This is more an
+		 * issue with compositing managers which need to frequently
+		 * flush CPU damage to their GPU bo.
+		 */
+		bo->map = ptr;
+		DBG(("%s: caching GTT vma for %d\n", __FUNCTION__, bo->handle));
+	}
+
+	return ptr;
+}
+
 void *kgem_bo_map__debug(struct kgem *kgem, struct kgem_bo *bo)
 {
 	if (bo->map)
diff --git a/src/sna/kgem.h b/src/sna/kgem.h
index 98534d9..27e0e04 100644
--- a/src/sna/kgem.h
+++ b/src/sna/kgem.h
@@ -364,6 +364,7 @@ uint32_t kgem_add_reloc(struct kgem *kgem,
 			uint32_t delta);
 
 void *kgem_bo_map(struct kgem *kgem, struct kgem_bo *bo);
+void *kgem_bo_map__gtt(struct kgem *kgem, struct kgem_bo *bo);
 void *kgem_bo_map__debug(struct kgem *kgem, struct kgem_bo *bo);
 void *kgem_bo_map__cpu(struct kgem *kgem, struct kgem_bo *bo);
 void kgem_bo_sync__cpu(struct kgem *kgem, struct kgem_bo *bo);
@@ -425,9 +426,6 @@ static inline bool kgem_bo_is_mappable(struct kgem *kgem,
 	if (bo->domain == DOMAIN_GTT)
 		return true;
 
-	if (IS_GTT_MAP(bo->map))
-		return true;
-
 	if (kgem->gen < 40 && bo->tiling &&
 	    bo->presumed_offset & (kgem_bo_fenced_size(kgem, bo) - 1))
 		return false;
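
For reference, the new kgem_bo_map__gtt() differs from kgem_bo_map() in that it never hands back a CPU mapping: it always takes (and caches) a GTT view, so a partially filled vbo can survive the submit without leaving a snooped CPU pointer behind. A minimal usage sketch within the driver's types (the bo is assumed to be an idle vertex buffer):

    /* Swap a lingering CPU map for a GTT one; on failure the
     * caller discards the vbo and reverts to the embedded
     * vertex_data staging array, as in the hunks above.
     */
    float *vertices = kgem_bo_map__gtt(&sna->kgem, bo);
    if (vertices == NULL) {
        /* unmappable: drop the vbo, fall back to vertex_data */
    }
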
commit 1d52f24a1a5ddfaced0784f544ca97943e921c68
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sat Mar 17 23:57:46 2012 +0000

    sna/traps: Upon reducing an ADD to a SRC, we need to apply the pending clear
    
    Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=47444
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_trapezoids.c b/src/sna/sna_trapezoids.c
index 3a93450..5594023 100644
--- a/src/sna/sna_trapezoids.c
+++ b/src/sna/sna_trapezoids.c
@@ -4077,6 +4077,7 @@ trapezoid_span_inplace(CARD8 op, PicturePtr src, PicturePtr dst,
 	struct sna_pixmap *priv;
 	RegionRec region;
 	uint32_t color;
+	bool unbounded;
 	int16_t dst_x, dst_y;
 	int dx, dy;
 	int n;
@@ -4125,19 +4126,27 @@ trapezoid_span_inplace(CARD8 op, PicturePtr src, PicturePtr dst,
 		return false;
 	}
 
+	unbounded = false;
 	switch (op) {
 	case PictOpAdd:
-		if (priv->clear && priv->clear_color == 0)
+		if (priv->clear && priv->clear_color == 0) {
+			unbounded = true;
 			op = PictOpSrc;
+		}
 		if ((color >> 24) == 0)
 			return true;
 		break;
 	case PictOpIn:
 		if (priv->clear && priv->clear_color == 0)
 			return true;
+		if (priv->clear && priv->clear_color == 0xff)
+			op = PictOpSrc;
 		if ((color >> 24) == 0)
 			return true;
+		unbounded = true;
+		break;
 	case PictOpSrc:
+		unbounded = !(priv->clear && priv->clear_color == 0);
 		break;
 	default:
 		DBG(("%s: fallback -- can not perform op [%d] in place\n",
@@ -4237,7 +4246,7 @@ trapezoid_span_inplace(CARD8 op, PicturePtr src, PicturePtr dst,
 	inplace.opacity = color >> 24;
 
 	tor_render(NULL, &tor, (void*)&inplace,
-		   dst->pCompositeClip, span, op == PictOpIn);
+		   dst->pCompositeClip, span, unbounded);
 
 	tor_fini(&tor);
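
The unbounded flag is the subtle part: SRC, unlike ADD, also writes the pixels the trapezoids do not cover, so once ADD is reduced the span walk must clear the uncovered area explicitly. A condensed restatement of the bookkeeping above (simplified; the source-colour early-outs are omitted):

    #include <stdbool.h>

    enum pict_op { PICT_OP_SRC, PICT_OP_ADD, PICT_OP_IN };

    /* Returns whether the walk must be unbounded, i.e. also touch
     * the uncovered pixels, for a destination cleared to clear_color.
     */
    static bool needs_unbounded(enum pict_op *op, bool clear,
                                unsigned clear_color)
    {
        switch (*op) {
        case PICT_OP_ADD:
            if (clear && clear_color == 0) {
                *op = PICT_OP_SRC; /* ADD over zeroes is SRC */
                return true;       /* but SRC must write the zeroes */
            }
            return false;
        case PICT_OP_IN:
            return true;  /* IN darkens outside the shape as well */
        case PICT_OP_SRC:
            return !(clear && clear_color == 0);
        }
        return false;
    }
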
 
commit c1cce12743217a843cffd0217d52038f15da29b3
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sat Mar 17 09:21:00 2012 +0000

    sna/traps: Align indices for unrolled memset in inplace_row()
    
    The compiler presumes that the uint64_t write is naturally aligned and
    so may emit code that crashes with an unaligned move. To work around
    this, make sure the write really is aligned.
    
    References: https://bugs.freedesktop.org/show_bug.cgi?id=47418
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_trapezoids.c b/src/sna/sna_trapezoids.c
index 31a661e..3a93450 100644
--- a/src/sna/sna_trapezoids.c
+++ b/src/sna/sna_trapezoids.c
@@ -1499,6 +1499,21 @@ inplace_row(struct active_list *active, uint8_t *row, int width)
 				else
 					memset(row+lix, 0xff, rix);
 #else
+				if (lix & 1 && rix) {
+					row[lix] = 0xff;
+					lix++;
+					rix--;
+				}
+				if (lix & 2 && rix >= 2) {
+					*(uint16_t *)(row+lix) = 0xffff;
+					lix += 2;
+					rix -= 2;
+				}
+				if (lix & 4 && rix >= 4) {
+					*(uint32_t *)(row+lix) = 0xffffffff;
+					lix += 4;
+					rix -= 4;
+				}
 				while (rix >= 8) {
 					*(uint64_t *)(row+lix) = 0xffffffffffffffff;
 					lix += 8;
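
The same head-peeling idiom in isolation, as a self-contained routine (the function name is illustrative; like the fix above it relies on the row buffer itself being 8-byte aligned, which the driver's scratch allocation provides):

    #include <stdint.h>

    /* Fill row[i..i+len) with 0xff using only naturally aligned
     * stores: peel 1-, 2- and 4-byte heads until i is a multiple
     * of 8, stream uint64_t writes, then mop up the tail.
     */
    static void fill_ff_aligned(uint8_t *row, unsigned i, unsigned len)
    {
        if (i & 1 && len) {
            row[i] = 0xff;
            i++; len--;
        }
        if (i & 2 && len >= 2) {
            *(uint16_t *)(row + i) = 0xffff;
            i += 2; len -= 2;
        }
        if (i & 4 && len >= 4) {
            *(uint32_t *)(row + i) = 0xffffffff;
            i += 4; len -= 4;
        }
        while (len >= 8) {
            *(uint64_t *)(row + i) = 0xffffffffffffffffull;
            i += 8; len -= 8;
        }
        while (len--)
            row[i++] = 0xff;
    }
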
commit aaedc91353421a4324c9a44e3b7be3b052844b25
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sat Mar 17 00:05:47 2012 +0000

    sna/traps: Tune inplace_end_subrows()
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_trapezoids.c b/src/sna/sna_trapezoids.c
index 52be00b..31a661e 100644
--- a/src/sna/sna_trapezoids.c
+++ b/src/sna/sna_trapezoids.c
@@ -1617,51 +1617,62 @@ inplace_end_subrows(struct active_list *active, uint8_t *row,
 {
 	int cover = 0;
 
-	while (width > 4) {
+	while (width >= 4) {
 		uint32_t dw;
 		int v;
 
 		dw = *(uint32_t *)buf;
 		buf += 4;
 
-		if (dw == 0){
+		if (dw == 0) {
 			v = cover * 256 / (FAST_SAMPLES_X * FAST_SAMPLES_Y);
 			v -= v >> 8;
 			v |= v << 8;
 			dw = v | v << 16;
-		} else if (dw) {
+		} else {
 			cover += (int8_t)(dw & 0xff);
-			assert(cover >= 0);
-			v = cover * 256 / (FAST_SAMPLES_X * FAST_SAMPLES_Y);
-			v -= v >> 8;
-			dw >>= 8;
-			dw |= v << 24;
+			if (cover) {
+				assert(cover > 0);
+				v = cover * 256 / (FAST_SAMPLES_X * FAST_SAMPLES_Y);
+				v -= v >> 8;
+				dw >>= 8;
+				dw |= v << 24;
+			} else
+				dw >>= 8;
 
 			cover += (int8_t)(dw & 0xff);
-			assert(cover >= 0);
-			v = cover * 256 / (FAST_SAMPLES_X * FAST_SAMPLES_Y);
-			v -= v >> 8;
-			dw >>= 8;
-			dw |= v << 24;
+			if (cover) {
+				assert(cover > 0);
+				v = cover * 256 / (FAST_SAMPLES_X * FAST_SAMPLES_Y);
+				v -= v >> 8;
+				dw >>= 8;
+				dw |= v << 24;
+			} else
+				dw >>= 8;
 
 			cover += (int8_t)(dw & 0xff);
-			assert(cover >= 0);
-			v = cover * 256 / (FAST_SAMPLES_X * FAST_SAMPLES_Y);
-			v -= v >> 8;
-			dw >>= 8;
-			dw |= v << 24;
+			if (cover) {
+				assert(cover > 0);
+				v = cover * 256 / (FAST_SAMPLES_X * FAST_SAMPLES_Y);
+				v -= v >> 8;
+				dw >>= 8;
+				dw |= v << 24;
+			} else
+				dw >>= 8;
 
 			cover += (int8_t)(dw & 0xff);
-			assert(cover >= 0);
-			v = cover * 256 / (FAST_SAMPLES_X * FAST_SAMPLES_Y);
-			v -= v >> 8;
-			dw >>= 8;
-			dw |= v << 24;
+			if (cover) {
+				assert(cover > 0);
+				v = cover * 256 / (FAST_SAMPLES_X * FAST_SAMPLES_Y);
+				v -= v >> 8;
+				dw >>= 8;
+				dw |= v << 24;
+			} else
+				dw >>= 8;
 		}
 
 		*(uint32_t *)row = dw;
 		row += 4;
-
 		width -= 4;
 	}
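
The v = cover * 256 / (FAST_SAMPLES_X * FAST_SAMPLES_Y); v -= v >> 8 sequence maps coverage in [0, SAMPLES] onto [0, 255] without a divide by 255: the scaled value lands in [0, 256], and subtracting v >> 8 only affects the single overflow value 256, folding it down to 255. A standalone check (sample counts are illustrative):

    #include <assert.h>

    #define FAST_SAMPLES_X 4 /* illustrative; fixed at build time */
    #define FAST_SAMPLES_Y 4
    #define FAST_SAMPLES_XY (FAST_SAMPLES_X * FAST_SAMPLES_Y)

    static int coverage_to_alpha(int cover)
    {
        int v = cover * 256 / FAST_SAMPLES_XY;
        v -= v >> 8; /* 256 -> 255; every smaller value is untouched */
        return v;
    }

    int main(void)
    {
        int c;
        assert(coverage_to_alpha(0) == 0);
        assert(coverage_to_alpha(FAST_SAMPLES_XY) == 255);
        for (c = 0; c <= FAST_SAMPLES_XY; c++)
            assert(coverage_to_alpha(c) >= 0 &&
                   coverage_to_alpha(c) <= 255);
        return 0;
    }
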
 
commit 6ebb68d1496abd929f1030808c8e60efeb7610fa
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Mar 16 17:19:11 2012 +0000

    sna/dri: Mesa expects the 8-bit stencil buffer to have 2 bytes per pixel
    
    The separate stencil buffer is full of lies, so why worry about one more?
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_dri.c b/src/sna/sna_dri.c
index 283b8db..e2eed1d 100644
--- a/src/sna/sna_dri.c
+++ b/src/sna/sna_dri.c
@@ -255,12 +255,11 @@ sna_dri_create_buffer(DrawablePtr drawable,
 		 * W fencing.
 		 */
 		bpp = format ? format : drawable->bitsPerPixel;
+		bpp *= 2;
 		bo = kgem_create_2d(&sna->kgem,
 				    ALIGN(drawable->width, 64),
 				    ALIGN((drawable->height + 1) / 2, 64),
-				    2*bpp,
-				    I915_TILING_NONE,
-				    CREATE_EXACT);
+				    bpp, I915_TILING_NONE, CREATE_EXACT);
 		break;
 
 	case DRI2BufferDepth:
commit 8b9efcd4bbabf9c3249bf96fa890c2b49e765ed4
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Mar 16 14:51:53 2012 +0000

    sna/dri: Improve handling of non-front attachments for CopyRegion
    
    Confusion reigns between using the drawable's backing pixmap for the
    front buffer and a fake pixmap for the auxiliary buffers.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_dri.c b/src/sna/sna_dri.c
index ccaf40f..283b8db 100644
--- a/src/sna/sna_dri.c
+++ b/src/sna/sna_dri.c
@@ -394,9 +394,9 @@ static void set_bo(PixmapPtr pixmap, struct kgem_bo *bo)
 }
 
 static void
-sna_dri_copy(struct sna *sna, DrawablePtr draw, RegionPtr region,
-	     struct kgem_bo *dst_bo,struct kgem_bo *src_bo,
-	     bool sync)
+sna_dri_copy_to_front(struct sna *sna, DrawablePtr draw, RegionPtr region,
+		      struct kgem_bo *dst_bo, struct kgem_bo *src_bo,
+		      bool sync)
 {
 	PixmapPtr pixmap = get_drawable_pixmap(draw);
 	pixman_region16_t clip;
@@ -477,18 +477,8 @@ sna_dri_copy(struct sna *sna, DrawablePtr draw, RegionPtr region,
 		boxes = &box;
 		n = 1;
 	}
-	/* It's important that this copy gets submitted before the
-	 * direct rendering client submits rendering for the next
-	 * frame, but we don't actually need to submit right now.  The
-	 * client will wait for the DRI2CopyRegion reply or the swap
-	 * buffer event before rendering, and we'll hit the flush
-	 * callback chain before those messages are sent.  We submit
-	 * our batch buffers from the flush callback chain so we know
-	 * that will happen before the client tries to render
-	 * again.
-	 */
 	sna->render.copy_boxes(sna, GXcopy,
-			       pixmap, src_bo, -draw->x, -draw->y,
+			       (PixmapPtr)draw, src_bo, -draw->x, -draw->y,
 			       pixmap, dst_bo, dx, dy,
 			       boxes, n);
 
@@ -505,6 +495,137 @@ sna_dri_copy(struct sna *sna, DrawablePtr draw, RegionPtr region,
 }
 
 static void
+sna_dri_copy_from_front(struct sna *sna, DrawablePtr draw, RegionPtr region,
+			struct kgem_bo *dst_bo, struct kgem_bo *src_bo,
+			bool sync)
+{
+	PixmapPtr pixmap = get_drawable_pixmap(draw);
+	pixman_region16_t clip;
+	BoxRec box, *boxes;
+	int16_t dx, dy;
+	int n;
+
+	box.x1 = draw->x;
+	box.y1 = draw->y;
+	box.x2 = draw->x + draw->width;
+	box.y2 = draw->y + draw->height;
+
+	if (region) {
+		pixman_region_translate(region, draw->x, draw->y);
+		pixman_region_init_rects(&clip, &box, 1);
+		pixman_region_intersect(&clip, &clip, region);
+		region = &clip;
+
+		if (!pixman_region_not_empty(region)) {
+			DBG(("%s: all clipped\n", __FUNCTION__));
+			return;
+		}
+	}
+
+	dx = dy = 0;
+	if (draw->type != DRAWABLE_PIXMAP) {
+		WindowPtr win = (WindowPtr)draw;
+
+		DBG(("%s: draw=(%d, %d), delta=(%d, %d), clip.extents=(%d, %d), (%d, %d)\n",
+		     __FUNCTION__, draw->x, draw->y,
+		     get_drawable_dx(draw), get_drawable_dy(draw),
+		     win->clipList.extents.x1, win->clipList.extents.y1,
+		     win->clipList.extents.x2, win->clipList.extents.y2));
+
+		if (region == NULL) {
+			pixman_region_init_rects(&clip, &box, 1);
+			region = &clip;
+		}
+
+		pixman_region_intersect(region, &win->clipList, region);
+		if (!pixman_region_not_empty(region)) {
+			DBG(("%s: all clipped\n", __FUNCTION__));
+			return;
+		}
+
+		get_drawable_deltas(draw, pixmap, &dx, &dy);
+	}
+
+	if (sna->kgem.gen >= 60)
+		kgem_set_mode(&sna->kgem, KGEM_RENDER);
+
+	if (region) {
+		boxes = REGION_RECTS(region);
+		n = REGION_NUM_RECTS(region);
+		assert(n);
+	} else {
+		pixman_region_init_rects(&clip, &box, 1);
+		region = &clip;
+		boxes = &box;
+		n = 1;
+	}
+	sna->render.copy_boxes(sna, GXcopy,
+			       pixmap, src_bo, dx, dy,
+			       (PixmapPtr)draw, dst_bo, -draw->x, -draw->y,
+			       boxes, n);
+
+	if (region == &clip)
+		pixman_region_fini(&clip);
+}
+
+static void
+sna_dri_copy(struct sna *sna, DrawablePtr draw, RegionPtr region,
+	     struct kgem_bo *dst_bo, struct kgem_bo *src_bo,
+	     bool sync)
+{
+	pixman_region16_t clip;
+	BoxRec box, *boxes;
+	int n;
+
+	box.x1 = 0;
+	box.y1 = 0;
+	box.x2 = draw->width;
+	box.y2 = draw->height;
+
+	if (region) {
+		pixman_region_init_rects(&clip, &box, 1);
+		pixman_region_intersect(&clip, &clip, region);
+		region = &clip;
+
+		if (!pixman_region_not_empty(region)) {
+			DBG(("%s: all clipped\n", __FUNCTION__));
+			return;
+		}
+
+		boxes = REGION_RECTS(region);
+		n = REGION_NUM_RECTS(region);
+		assert(n);
+	} else {
+		boxes = &box;
+		n = 1;
+	}
+
+	if (sna->kgem.gen >= 60) {
+		/* Sandybridge introduced a separate ring which it uses to
+		 * perform blits. Switching rendering between rings incurs
+		 * a stall as we wait upon the old ring to finish and
+		 * flush its render cache before we can proceed on with
+		 * the operation on the new ring.
+		 *
+		 * As this buffer, we presume, has just been written to by
+		 * the DRI client using the RENDER ring, we want to perform
+		 * our operation on the same ring, and ideally on the same
+		 * ring as we will flip from (which should be the RENDER ring
+		 * as well).
+		 */
+		kgem_set_mode(&sna->kgem, KGEM_RENDER);
+	}
+
+	sna->render.copy_boxes(sna, GXcopy,
+			       (PixmapPtr)draw, src_bo, 0, 0,
+			       (PixmapPtr)draw, dst_bo, 0, 0,
+			       boxes, n);
+
+	if (region == &clip)
+		pixman_region_fini(&clip);
+}
+
+static void
 sna_dri_copy_region(DrawablePtr draw,
 		    RegionPtr region,
 		    DRI2BufferPtr dst_buffer,
@@ -513,15 +634,20 @@ sna_dri_copy_region(DrawablePtr draw,
 	PixmapPtr pixmap = get_drawable_pixmap(draw);
 	struct sna *sna = to_sna_from_pixmap(pixmap);
 	struct kgem_bo *src, *dst;
+	void (*copy)(struct sna *, DrawablePtr, RegionPtr,
+		     struct kgem_bo *, struct kgem_bo *, bool) = sna_dri_copy;
 
-	if (dst_buffer->attachment == DRI2BufferFrontLeft)
+	if (dst_buffer->attachment == DRI2BufferFrontLeft) {
 		dst = sna_pixmap_set_dri(sna, pixmap);
-	else
+		copy = sna_dri_copy_to_front;
+	} else
 		dst = get_private(dst_buffer)->bo;
 
-	if (src_buffer->attachment == DRI2BufferFrontLeft)
+	if (src_buffer->attachment == DRI2BufferFrontLeft) {
 		src = sna_pixmap_set_dri(sna, pixmap);
-	else
+		assert(copy == sna_dri_copy);
+		copy = sna_dri_copy_from_front;
+	} else
 		src = get_private(src_buffer)->bo;
 
 	assert(dst != NULL);
@@ -534,13 +660,13 @@ sna_dri_copy_region(DrawablePtr draw,
 	DBG(("%s: src -- attachment=%d, name=%d, handle=%d\n",
 	     __FUNCTION__,
 	     src_buffer->attachment, src_buffer->name, src->handle));
-	DBG(("%s: clip (%d, %d), (%d, %d) x %d\n",
+	DBG(("%s: region (%d, %d), (%d, %d) x %d\n",
 	     __FUNCTION__,
 	     region->extents.x1, region->extents.y1,
 	     region->extents.x2, region->extents.y2,
 	     REGION_NUM_RECTS(region)));
 
-	sna_dri_copy(sna, draw, region, dst, src, false);
+	copy(sna, draw, region, dst, src, false);
 }
 
 #if DRI2INFOREC_VERSION >= 4
@@ -916,12 +1042,12 @@ static void sna_dri_vblank_handle(int fd,
 			info->old_front.bo = NULL;
 			return;
 		}
-		/* else fall through to exchange/blit */
+		/* else fall through to blit */
 	case DRI2_SWAP:
-		sna_dri_copy(sna, draw, NULL,
-			     get_private(info->front)->bo,
-			     get_private(info->back)->bo,
-			     true);
+		sna_dri_copy_to_front(sna, draw, NULL,
+				      get_private(info->front)->bo,
+				      get_private(info->back)->bo,
+				      true);
 	case DRI2_SWAP_THROTTLE:
 		DBG(("%s: %d complete, frame=%d tv=%d.%06d\n",
 		     __FUNCTION__, info->type, frame, tv_sec, tv_usec));
@@ -1451,10 +1577,10 @@ sna_dri_schedule_swap(ClientPtr client, DrawablePtr draw, DRI2BufferPtr front,
 			 DRI2SwapComplete(client, draw, 0, 0, 0, DRI2_BLIT_COMPLETE, func, data);
 		 }
 
-		 sna_dri_copy(sna, draw, NULL,
-			      get_private(front)->bo,
-			      get_private(back)->bo,
-			      true);
+		 sna_dri_copy_to_front(sna, draw, NULL,
+				       get_private(front)->bo,
+				       get_private(back)->bo,
+				       true);
 		 return TRUE;
 	}
 
@@ -1540,10 +1666,10 @@ sna_dri_schedule_swap(ClientPtr client, DrawablePtr draw, DRI2BufferPtr front,
 
 blit_fallback:
 	DBG(("%s -- blit\n", __FUNCTION__));
-	sna_dri_copy(sna, draw, NULL,
-		     get_private(front)->bo,
-		     get_private(back)->bo,
-		     true);
+	sna_dri_copy_to_front(sna, draw, NULL,
+			      get_private(front)->bo,
+			      get_private(back)->bo,
+			      false);
 	if (info)
 		sna_dri_frame_event_info_free(info);
 	DRI2SwapComplete(client, draw, 0, 0, 0, DRI2_BLIT_COMPLETE, func, data);
@@ -1584,10 +1710,10 @@ sna_dri_async_swap(ClientPtr client, DrawablePtr draw,
 blit:
 		DBG(("%s: unable to flip, so blit\n", __FUNCTION__));
 
-		sna_dri_copy(sna, draw, NULL,
-			     get_private(front)->bo,
-			     get_private(back)->bo,
-			     false);
+		sna_dri_copy_to_front(sna, draw, NULL,
+				      get_private(front)->bo,
+				      get_private(back)->bo,
+				      false);
 		DRI2SwapComplete(client, draw, 0, 0, 0,
 				 DRI2_BLIT_COMPLETE, func, data);
 		return FALSE;
commit f0bcac6a1574a979b92b07c6f132dbd92602be90
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Mar 16 13:07:00 2012 +0000

    sna/gen3: Micro-optimise solid span emission
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/gen3_render.c b/src/sna/gen3_render.c
index c567d6b..0991a98 100644
--- a/src/sna/gen3_render.c
+++ b/src/sna/gen3_render.c
@@ -1803,7 +1803,7 @@ inline static int gen3_get_rectangles(struct sna *sna,
 
 start:
 	rem = vertex_space(sna);
-	if (op->floats_per_rect > rem) {
+	if (unlikely(op->floats_per_rect > rem)) {
 		DBG(("flushing vbo for %s: %d < %d\n",
 		     __FUNCTION__, rem, op->floats_per_rect));
 		rem = gen3_get_rectangles__flush(sna, op);
@@ -3194,6 +3194,33 @@ gen3_emit_composite_spans_primitive(struct sna *sna,
 }
 
 fastcall static void
+gen3_render_composite_spans_constant_box(struct sna *sna,
+					 const struct sna_composite_spans_op *op,
+					 const BoxRec *box, float opacity)
+{
+	float *v;
+	DBG(("%s: src=+(%d, %d), opacity=%f, dst=+(%d, %d), box=(%d, %d) x (%d, %d)\n",
+	     __FUNCTION__,
+	     op->base.src.offset[0], op->base.src.offset[1],
+	     opacity,
+	     op->base.dst.x, op->base.dst.y,
+	     box->x1, box->y1,
+	     box->x2 - box->x1,
+	     box->y2 - box->y1));
+
+	gen3_get_rectangles(sna, &op->base, 1);
+
+	v = sna->render.vertices + sna->render.vertex_used;
+	sna->render.vertex_used += 9;
+
+	v[0] = box->x2;
+	v[6] = v[3] = box->x1;
+	v[4] = v[1] = box->y2;
+	v[7] = box->y1;
+	v[8] = v[5] = v[2] = opacity;
+}
+
+fastcall static void
 gen3_render_composite_spans_box(struct sna *sna,
 				const struct sna_composite_spans_op *op,
 				const BoxRec *box, float opacity)
@@ -3328,6 +3355,9 @@ gen3_render_composite_spans(struct sna *sna,
 		tmp->base.mask.u.gen3.type = SHADER_OPACITY;
 
 	no_offset = tmp->base.dst.x == 0 && tmp->base.dst.y == 0;
+	tmp->box   = gen3_render_composite_spans_box;
+	tmp->boxes = gen3_render_composite_spans_boxes;
+	tmp->done  = gen3_render_composite_spans_done;
 	tmp->prim_emit = gen3_emit_composite_spans_primitive;
 	switch (tmp->base.src.u.gen3.type) {
 	case SHADER_NONE:
@@ -3338,7 +3368,11 @@ gen3_render_composite_spans(struct sna *sna,
 	case SHADER_BLACK:
 	case SHADER_WHITE:
 	case SHADER_CONSTANT:
-		tmp->prim_emit = no_offset ? gen3_emit_composite_spans_primitive_constant_no_offset : gen3_emit_composite_spans_primitive_constant;
+		if (no_offset) {
+			tmp->box = gen3_render_composite_spans_constant_box;
+			tmp->prim_emit = gen3_emit_composite_spans_primitive_constant_no_offset;
+		} else
+			tmp->prim_emit = gen3_emit_composite_spans_primitive_constant;
 		break;
 	case SHADER_LINEAR:
 	case SHADER_RADIAL:
@@ -3364,10 +3398,6 @@ gen3_render_composite_spans(struct sna *sna,
 		tmp->base.mask.u.gen3.type == SHADER_OPACITY;
 	tmp->base.floats_per_rect = 3 * tmp->base.floats_per_vertex;
 
-	tmp->box   = gen3_render_composite_spans_box;
-	tmp->boxes = gen3_render_composite_spans_boxes;
-	tmp->done  = gen3_render_composite_spans_done;
-
 	if (!kgem_check_bo(&sna->kgem,
 			   tmp->base.dst.bo, tmp->base.src.bo,
 			   NULL)) {
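
The nine floats per box come from the RECTLIST primitive: only three corners are emitted, each as an (x, y, opacity) triplet, and the hardware infers the fourth. The layout, annotated (mirroring the grouped assignments above):

    /* One constant-opacity rectangle = 3 vertices * 3 floats:
     *   v[0..2]  (x2, y2, opacity)   bottom-right
     *   v[3..5]  (x1, y2, opacity)   bottom-left
     *   v[6..8]  (x1, y1, opacity)   top-left
     * RECTLIST derives the fourth corner from these three, so the
     * grouped stores are just the unrolled form of this table.
     */
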
commit d3f4bb2b6dc794f9936523ec17ff78cddb5a7d72
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Mar 16 12:59:13 2012 +0000

    sna/traps: Micro-optimise span emission
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_trapezoids.c b/src/sna/sna_trapezoids.c
index 4dab0f9..52be00b 100644
--- a/src/sna/sna_trapezoids.c
+++ b/src/sna/sna_trapezoids.c
@@ -1141,6 +1141,18 @@ tor_blt_span(struct sna *sna,
 }
 
 static void
+tor_blt_span__no_damage(struct sna *sna,
+			struct sna_composite_spans_op *op,
+			pixman_region16_t *clip,
+			const BoxRec *box,
+			int coverage)
+{
+	__DBG(("%s: %d -> %d @ %d\n", __FUNCTION__, box->x1, box->x2, coverage));
+
+	op->box(sna, op, box, AREA_TO_ALPHA(coverage));
+}
+
+static void
 tor_blt_span_clipped(struct sna *sna,
 		     struct sna_composite_spans_op *op,
 		     pixman_region16_t *clip,
@@ -3101,7 +3113,8 @@ project_trapezoid_onto_grid(const xTrapezoid *in,
 }
 
 static span_func_t
-choose_span(PicturePtr dst,
+choose_span(struct sna_composite_spans_op *tmp,
+	    PicturePtr dst,
 	    PictFormatPtr maskFormat,
 	    uint8_t op,
 	    RegionPtr clip)
@@ -3120,9 +3133,12 @@ choose_span(PicturePtr dst,
 				span = tor_blt_span_mono_clipped;
 		}
 	} else {
-		span = tor_blt_span;
 		if (REGION_NUM_RECTS(clip) > 1)
 			span = tor_blt_span_clipped;
+		else if (tmp->base.damage == NULL)
+			span = tor_blt_span__no_damage;
+		else
+			span = tor_blt_span;
 	}
 
 	return span;
@@ -3379,7 +3395,7 @@ trapezoid_span_converter(CARD8 op, PicturePtr src, PicturePtr dst,
 	}
 
 	tor_render(sna, &tor, &tmp, &clip,
-		   choose_span(dst, maskFormat, op, &clip),
+		   choose_span(&tmp, dst, maskFormat, op, &clip),
 		   maskFormat && !operator_is_bounded(op));
 
 skip:
@@ -4669,7 +4685,7 @@ trap_span_converter(PicturePtr dst,
 	}
 
 	tor_render(sna, &tor, &tmp, clip,
-		   choose_span(dst, NULL, PictOpAdd, clip), false);
+		   choose_span(&tmp, dst, NULL, PictOpAdd, clip), false);
 
 skip:
 	tor_fini(&tor);
@@ -5196,7 +5212,7 @@ triangles_span_converter(CARD8 op, PicturePtr src, PicturePtr dst,
 	}
 
 	tor_render(sna, &tor, &tmp, &clip,
-		   choose_span(dst, maskFormat, op, &clip),
+		   choose_span(&tmp, dst, maskFormat, op, &clip),
 		   maskFormat && !operator_is_bounded(op));
 
 skip:
@@ -5559,7 +5575,7 @@ tristrip_span_converter(CARD8 op, PicturePtr src, PicturePtr dst,
 	assert(tor.polygon->num_edges <= 2*count);
 
 	tor_render(sna, &tor, &tmp, &clip,
-		   choose_span(dst, maskFormat, op, &clip),
+		   choose_span(&tmp, dst, maskFormat, op, &clip),
 		   maskFormat && !operator_is_bounded(op));
 
 skip:
commit 914271010439535d4a77821c7c6a0110156bcb4a
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Mar 16 12:37:25 2012 +0000

    sna/traps: Tune nonzero_row
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_trapezoids.c b/src/sna/sna_trapezoids.c
index c1d42a7..4dab0f9 100644
--- a/src/sna/sna_trapezoids.c
+++ b/src/sna/sna_trapezoids.c
@@ -591,20 +591,31 @@ cell_list_add_subspan(struct cell_list *cells,
 		cell->uncovered_area += 2*(fx1-fx2);
 }
 
-static void
-cell_list_render_edge(struct cell_list *cells, struct edge *edge, int sign)
+inline static void
+cell_list_add_span(struct cell_list *cells,
+		   grid_scaled_x_t x1,
+		   grid_scaled_x_t x2)
 {
 	struct cell *cell;
-	grid_scaled_x_t fx;
-	int ix;
+	int ix1, fx1;
+	int ix2, fx2;
+
+	FAST_SAMPLES_X_TO_INT_FRAC(x1, ix1, fx1);
+	FAST_SAMPLES_X_TO_INT_FRAC(x2, ix2, fx2);
+
+	__DBG(("%s: x1=%d (%d+%d), x2=%d (%d+%d)\n", __FUNCTION__,
+	       x1, ix1, fx1, x2, ix2, fx2));
 
-	FAST_SAMPLES_X_TO_INT_FRAC(edge->x.quo, ix, fx);
+	cell = cell_list_find(cells, ix1);
+	if (ix1 != ix2) {
+		cell->uncovered_area += 2*fx1;
+		cell->covered_height += FAST_SAMPLES_Y;
 
-	/* We always know that ix1 is >= the cell list cursor in this
-	 * case due to the no-intersections precondition.  */
-	cell = cell_list_find(cells, ix);
-	cell->covered_height += sign*FAST_SAMPLES_Y;
-	cell->uncovered_area += sign*2*fx*FAST_SAMPLES_Y;
+		cell = cell_list_find(cells, ix2);
+		cell->uncovered_area -= 2*fx2;
+		cell->covered_height -= FAST_SAMPLES_Y;
+	} else
+		cell->uncovered_area += 2*(fx1-fx2);
 }
 
 static void
@@ -1054,9 +1065,7 @@ nonzero_row(struct active_list *active, struct cell_list *coverages)
 			right = right->next;
 		} while (1);
 
-		cell_list_render_edge(coverages, left, +1);
-		cell_list_render_edge(coverages, right, -1);
-
+		cell_list_add_span(coverages, left->x.quo, right->x.quo);
 		left = right->next;
 	}
 }
commit d59ccaf5aa81378f926e4ef10e44831de552a39d
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Mar 16 09:28:24 2012 +0000

    sna/traps: Make the inline u8 arithmetic more robust
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_trapezoids.c b/src/sna/sna_trapezoids.c
index dca9d1f..c1d42a7 100644
--- a/src/sna/sna_trapezoids.c
+++ b/src/sna/sna_trapezoids.c
@@ -3553,6 +3553,19 @@ struct inplace {
 	uint8_t opacity;
 };
 
+static inline uint8_t
+mul_8_8(uint8_t a, uint8_t b)
+{
+    uint16_t t = a * (uint16_t)b + 0x7f;
+    return ((t >> 8) + t) >> 8;
+}
+
+static uint8_t coverage_opacity(int coverage, uint8_t opacity)
+{
+	coverage = coverage * 256 / FAST_SAMPLES_XY;
+	return mul_8_8(coverage - (coverage >> 8), opacity);
+}
+
 static void
 tor_blt_src(struct sna *sna,
 	    struct sna_composite_spans_op *op,
@@ -3564,7 +3577,7 @@ tor_blt_src(struct sna *sna,
 	uint8_t *ptr = in->ptr;
 	int h, w;
 
-	coverage = coverage * in->opacity / FAST_SAMPLES_XY;
+	coverage = coverage_opacity(coverage, in->opacity);
 
 	ptr += box->y1 * in->stride + box->x1;
 
@@ -3613,19 +3626,22 @@ tor_blt_in(struct sna *sna,
 	uint8_t *ptr = in->ptr;
 	int h, w, i;
 
-	coverage = coverage * in->opacity / FAST_SAMPLES_XY;
 	if (coverage == 0) {
 		tor_blt_src(sna, op, clip, box, 0);
 		return;
 	}
 
+	coverage = coverage_opacity(coverage, in->opacity);
+	if (coverage == 0xff)
+		return;
+
 	ptr += box->y1 * in->stride + box->x1;
 
 	h = box->y2 - box->y1;
 	w = box->x2 - box->x1;
 	do {
 		for (i = 0; i < w; i++)
-			ptr[i] = (ptr[i] * coverage) >> 8;
+			ptr[i] = mul_8_8(ptr[i], coverage);
 		ptr += in->stride;
 	} while (--h);
 }
@@ -3660,10 +3676,15 @@ tor_blt_add(struct sna *sna,
 	uint8_t *ptr = in->ptr;
 	int h, w, v, i;
 
-	coverage = coverage * in->opacity / FAST_SAMPLES_XY;
 	if (coverage == 0)
 		return;
 
+	coverage = coverage_opacity(coverage, in->opacity);
+	if (coverage == 0xff) {
+		tor_blt_src(sna, op, clip, box, 0xff);
+		return;
+	}
+
 	ptr += box->y1 * in->stride + box->x1;
 
 	h = box->y2 - box->y1;
@@ -4057,10 +4078,14 @@ trapezoid_span_inplace(CARD8 op, PicturePtr src, PicturePtr dst,
 	case PictOpAdd:
 		if (priv->clear && priv->clear_color == 0)
 			op = PictOpSrc;
+		if ((color >> 24) == 0)
+			return true;
 		break;
 	case PictOpIn:
 		if (priv->clear && priv->clear_color == 0)
 			return true;
+		if ((color >> 24) == 0)
+			return true;
 	case PictOpSrc:
 		break;
 	default:
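
mul_8_8() is the usual normalised byte multiply: with t = a*b + 0x7f, the ((t >> 8) + t) >> 8 step divides by 255 (not 256) to within rounding, and is exact whenever either operand is 0 or 255. That is the robustness gained here: the old (x * coverage) >> 8 divides by 256 and so darkens even at full coverage. A quick standalone check:

    #include <assert.h>
    #include <stdint.h>

    static inline uint8_t mul_8_8(uint8_t a, uint8_t b)
    {
        uint16_t t = a * (uint16_t)b + 0x7f;
        return ((t >> 8) + t) >> 8;
    }

    int main(void)
    {
        unsigned a, b;
        for (a = 0; a < 256; a++) {
            assert(mul_8_8(a, 0) == 0);   /* exact endpoints */
            assert(mul_8_8(a, 255) == a);
        }
        for (a = 0; a < 256; a++)
            for (b = 0; b < 256; b++) {
                int want = ((int)a * b + 127) / 255; /* rounded a*b/255 */
                int got = mul_8_8(a, b);
                assert(got >= want - 1 && got <= want + 1);
            }
        return 0;
    }
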
commit 6416a7ccc64aef15096736d40e1bd44058c28004
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Mar 16 09:18:29 2012 +0000

    sna/traps: Remove the old paths for mono inplace traps
    
    Dead code elimination.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_trapezoids.c b/src/sna/sna_trapezoids.c
index 907ece0..dca9d1f 100644
--- a/src/sna/sna_trapezoids.c
+++ b/src/sna/sna_trapezoids.c
@@ -3603,28 +3603,6 @@ tor_blt_src_clipped(struct sna *sna,
 }
 
 static void
-tor_blt_src_mono(struct sna *sna,
-		 struct sna_composite_spans_op *op,
-		 pixman_region16_t *clip,
-		 const BoxRec *box,
-		 int coverage)
-{
-	tor_blt_src(sna, op, clip, box,
-		    coverage < FAST_SAMPLES_XY/2 ? 0 : FAST_SAMPLES_XY);
-}
-
-static void
-tor_blt_src_clipped_mono(struct sna *sna,
-			 struct sna_composite_spans_op *op,
-			 pixman_region16_t *clip,
-			 const BoxRec *box,
-			 int coverage)
-{
-	tor_blt_src_clipped(sna, op, clip, box,
-			    coverage < FAST_SAMPLES_XY/2 ? 0 : FAST_SAMPLES_XY);
-}
-
-static void
 tor_blt_in(struct sna *sna,
 	   struct sna_composite_spans_op *op,
 	   pixman_region16_t *clip,
@@ -3672,28 +3650,6 @@ tor_blt_in_clipped(struct sna *sna,
 }
 
 static void
-tor_blt_in_mono(struct sna *sna,
-		struct sna_composite_spans_op *op,
-		pixman_region16_t *clip,
-		const BoxRec *box,
-		int coverage)
-{
-	tor_blt_in(sna, op, clip, box,
-		   coverage < FAST_SAMPLES_XY/2 ? 0 : FAST_SAMPLES_XY);
-}
-
-static void
-tor_blt_in_clipped_mono(struct sna *sna,
-			struct sna_composite_spans_op *op,
-			pixman_region16_t *clip,
-			const BoxRec *box,
-			int coverage)
-{
-	tor_blt_in_clipped(sna, op, clip, box,
-			   coverage < FAST_SAMPLES_XY/2 ? 0 : FAST_SAMPLES_XY);
-}
-
-static void
 tor_blt_add(struct sna *sna,
 	    struct sna_composite_spans_op *op,
 	    pixman_region16_t *clip,
@@ -3745,28 +3701,6 @@ tor_blt_add_clipped(struct sna *sna,
 	pixman_region_fini(&region);
 }
 
-static void
-tor_blt_add_mono(struct sna *sna,
-		 struct sna_composite_spans_op *op,
-		 pixman_region16_t *clip,
-		 const BoxRec *box,
-		 int coverage)
-{
-	if (coverage >= FAST_SAMPLES_XY/2)
-		tor_blt_add(sna, op, clip, box, FAST_SAMPLES_XY);
-}
-
-static void
-tor_blt_add_clipped_mono(struct sna *sna,
-			 struct sna_composite_spans_op *op,
-			 pixman_region16_t *clip,
-			 const BoxRec *box,
-			 int coverage)
-{
-	if (coverage >= FAST_SAMPLES_XY/2)
-		tor_blt_add_clipped(sna, op, clip, box, FAST_SAMPLES_XY);
-}
-
 struct mono_inplace_composite {
 	pixman_image_t *src, *dst;
 	int dx, dy;
@@ -4196,41 +4130,21 @@ trapezoid_span_inplace(CARD8 op, PicturePtr src, PicturePtr dst,
 	}
 
 	if (op == PictOpSrc) {
-		if (dst->pCompositeClip->data) {
-			if (maskFormat ? maskFormat->depth < 8 : dst->polyEdge == PolyEdgeSharp)
-				span = tor_blt_src_clipped_mono;
-			else
-				span = tor_blt_src_clipped;
-		} else {
-			if (maskFormat ? maskFormat->depth < 8 : dst->polyEdge == PolyEdgeSharp)
-				span = tor_blt_src_mono;
-			else
-				span = tor_blt_src;
-		}
+		if (dst->pCompositeClip->data)
+			span = tor_blt_src_clipped;
+		else
+			span = tor_blt_src;
 	} else if (op == PictOpIn) {
-		if (dst->pCompositeClip->data) {
-			if (maskFormat ? maskFormat->depth < 8 : dst->polyEdge == PolyEdgeSharp)
-				span = tor_blt_in_clipped_mono;
-			else
-				span = tor_blt_in_clipped;
-		} else {
-			if (maskFormat ? maskFormat->depth < 8 : dst->polyEdge == PolyEdgeSharp)
-				span = tor_blt_in_mono;
-			else
-				span = tor_blt_in;
-		}
+		if (dst->pCompositeClip->data)
+			span = tor_blt_in_clipped;
+		else
+			span = tor_blt_in;
 	} else {
-		if (dst->pCompositeClip->data) {
-			if (maskFormat ? maskFormat->depth < 8 : dst->polyEdge == PolyEdgeSharp)
-				span = tor_blt_add_clipped_mono;
-			else
-				span = tor_blt_add_clipped;
-		} else {
-			if (maskFormat ? maskFormat->depth < 8 : dst->polyEdge == PolyEdgeSharp)
-				span = tor_blt_add_mono;
-			else
-				span = tor_blt_add;
-		}
+		assert(op == PictOpAdd);
+		if (dst->pCompositeClip->data)
+			span = tor_blt_add_clipped;
+		else
+			span = tor_blt_add;
 	}
 
 	DBG(("%s: move-to-cpu\n", __FUNCTION__));
commit cc33e83ac01c94479b44b0dcb78b2d26e4dd4d12
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Mar 15 21:33:00 2012 +0000

    sna: Correct the order of screen private teardown
    
    So that we do not keep a stale reference to the last cached pixmap
    across server regeneration (or shutdown).
    
    Reported-by: Thierry Reding <thierry.reding at avionic-design.de>
    Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=47357
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 22ee1e6..c7f041e 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -12082,6 +12082,10 @@ void sna_accel_watch_flush(struct sna *sna, int enable)
 
 void sna_accel_close(struct sna *sna)
 {
+	sna_composite_close(sna);
+	sna_gradients_close(sna);
+	sna_glyphs_close(sna);
+
 	if (sna->freed_pixmap) {
 		assert(sna->freed_pixmap->refcnt == 1);
 		free(sna_pixmap(sna->freed_pixmap));
@@ -12089,10 +12093,6 @@ void sna_accel_close(struct sna *sna)
 		sna->freed_pixmap = NULL;
 	}
 
-	sna_composite_close(sna);
-	sna_gradients_close(sna);
-	sna_glyphs_close(sna);
-
 	DeleteCallback(&FlushCallback, sna_accel_flush_callback, sna);
 
 	kgem_cleanup_cache(&sna->kgem);
commit 417f49a5c046a2dcfb03d37ee25f98409de5a766
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Mar 15 13:20:35 2012 +0000

    sna: Prefer the CPU bo for uploads if last access was not through the shadow
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 03ac400..22ee1e6 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -1745,7 +1745,7 @@ sna_pixmap_move_area_to_gpu(PixmapPtr pixmap, BoxPtr box, unsigned int flags)
 		if (n) {
 			Bool ok = FALSE;
 
-			if (use_cpu_bo_for_xfer(priv))
+			if (pixmap->devPrivate.ptr == NULL || use_cpu_bo_for_xfer(priv))
 				ok = sna->render.copy_boxes(sna, GXcopy,
 							    pixmap, priv->cpu_bo, 0, 0,
 							    pixmap, priv->gpu_bo, 0, 0,
@@ -1783,7 +1783,7 @@ sna_pixmap_move_area_to_gpu(PixmapPtr pixmap, BoxPtr box, unsigned int flags)
 	} else if (DAMAGE_IS_ALL(priv->cpu_damage) ||
 		   sna_damage_contains_box__no_reduce(priv->cpu_damage, box)) {
 		Bool ok = FALSE;
-		if (use_cpu_bo_for_xfer(priv))
+		if (pixmap->devPrivate.ptr == NULL || use_cpu_bo_for_xfer(priv))
 			ok = sna->render.copy_boxes(sna, GXcopy,
 						    pixmap, priv->cpu_bo, 0, 0,
 						    pixmap, priv->gpu_bo, 0, 0,
@@ -1812,7 +1812,7 @@ sna_pixmap_move_area_to_gpu(PixmapPtr pixmap, BoxPtr box, unsigned int flags)
 
 		box = REGION_RECTS(&i);
 		ok = FALSE;
-		if (use_cpu_bo_for_xfer(priv))
+		if (pixmap->devPrivate.ptr == NULL || use_cpu_bo_for_xfer(priv))
 			ok = sna->render.copy_boxes(sna, GXcopy,
 						    pixmap, priv->cpu_bo, 0, 0,
 						    pixmap, priv->gpu_bo, 0, 0,
@@ -2283,7 +2283,7 @@ sna_pixmap_move_to_gpu(PixmapPtr pixmap, unsigned flags)
 		DBG(("%s: uploading %d damage boxes\n", __FUNCTION__, n));
 
 		ok = FALSE;
-		if (use_cpu_bo_for_xfer(priv))
+		if (pixmap->devPrivate.ptr == NULL || use_cpu_bo_for_xfer(priv))
 			ok = sna->render.copy_boxes(sna, GXcopy,
 						    pixmap, priv->cpu_bo, 0, 0,
 						    pixmap, priv->gpu_bo, 0, 0,
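
The new predicate reads: if the shadow pointer is currently detached (devPrivate.ptr == NULL), the last access went through the CPU bo, so prefer blitting from it rather than re-attaching the shadow for a CPU upload. As a sketch (hypothetical helper name):

    /* Decide how to upload CPU damage to the GPU bo. */
    static inline bool upload_prefers_cpu_bo(struct sna_pixmap *priv,
                                             PixmapPtr pixmap)
    {
        return pixmap->devPrivate.ptr == NULL ||
               use_cpu_bo_for_xfer(priv);
    }
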
commit 0591e7660bccfca20c6ce24f0e2368f5570bad31
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Mar 15 13:18:43 2012 +0000

    sna: Fixup the shadow pixmap pointer for move-to-gpu
    
    If we choose not to use the CPU bo for the upload (because we fear the
    subsequent synchronisation cost), we need to fix up the shadow pointer
    before dereferencing it.
    
    On the move-to-cpu side, the fixup is already performed as we will need
    to access the shadow pixels for the subsequent drawing operation.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 63afaaa..03ac400 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -1751,6 +1751,11 @@ sna_pixmap_move_area_to_gpu(PixmapPtr pixmap, BoxPtr box, unsigned int flags)
 							    pixmap, priv->gpu_bo, 0, 0,
 							    box, n);
 			if (!ok) {
+				if (pixmap->devPrivate.ptr == NULL) {
+					assert(priv->stride && priv->ptr);
+					pixmap->devPrivate.ptr = priv->ptr;
+					pixmap->devKind = priv->stride;
+				}
 				if (n == 1 && !priv->pinned &&
 				    box->x1 <= 0 && box->y1 <= 0 &&
 				    box->x2 >= pixmap->drawable.width &&
@@ -1783,13 +1788,19 @@ sna_pixmap_move_area_to_gpu(PixmapPtr pixmap, BoxPtr box, unsigned int flags)
 						    pixmap, priv->cpu_bo, 0, 0,
 						    pixmap, priv->gpu_bo, 0, 0,
 						    box, 1);
-		if (!ok)
+		if (!ok) {
+			if (pixmap->devPrivate.ptr == NULL) {
+				assert(priv->stride && priv->ptr);
+				pixmap->devPrivate.ptr = priv->ptr;
+				pixmap->devKind = priv->stride;
+			}
 			ok = sna_write_boxes(sna, pixmap,
 					     priv->gpu_bo, 0, 0,
 					     pixmap->devPrivate.ptr,
 					     pixmap->devKind,
 					     0, 0,
 					     box, 1);
+		}
 		if (!ok)
 			return false;
 
@@ -1806,13 +1817,19 @@ sna_pixmap_move_area_to_gpu(PixmapPtr pixmap, BoxPtr box, unsigned int flags)
 						    pixmap, priv->cpu_bo, 0, 0,
 						    pixmap, priv->gpu_bo, 0, 0,
 						    box, n);
-		if (!ok)
+		if (!ok) {
+			if (pixmap->devPrivate.ptr == NULL) {
+				assert(priv->stride && priv->ptr);
+				pixmap->devPrivate.ptr = priv->ptr;
+				pixmap->devKind = priv->stride;
+			}
 			ok = sna_write_boxes(sna, pixmap,
 					     priv->gpu_bo, 0, 0,
 					     pixmap->devPrivate.ptr,
 					     pixmap->devKind,
 					     0, 0,
 					     box, n);
+		}
 		if (!ok)
 			return false;
 
@@ -2272,7 +2289,11 @@ sna_pixmap_move_to_gpu(PixmapPtr pixmap, unsigned flags)
 						    pixmap, priv->gpu_bo, 0, 0,
 						    box, n);
 		if (!ok) {
-			assert(pixmap->devPrivate.ptr != NULL);
+			if (pixmap->devPrivate.ptr == NULL) {
+				assert(priv->stride && priv->ptr);
+				pixmap->devPrivate.ptr = priv->ptr;
+				pixmap->devKind = priv->stride;
+			}
 			if (n == 1 && !priv->pinned &&
 			    (box->x2 - box->x1) >= pixmap->drawable.width &&
 			    (box->y2 - box->y1) >= pixmap->drawable.height) {
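
The same four-line fixup now guards every software fallback in the function; as a sketch, it could live in a small helper (name illustrative):

    /* Re-attach the shadow pointer before any CPU dereference;
     * it may have been left detached to avoid synchronisation costs.
     */
    static inline void sna_pixmap_fixup_shadow(PixmapPtr pixmap,
                                               struct sna_pixmap *priv)
    {
        if (pixmap->devPrivate.ptr == NULL) {
            assert(priv->stride && priv->ptr);
            pixmap->devPrivate.ptr = priv->ptr;
            pixmap->devKind = priv->stride;
        }
    }
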
commit 70af962a1b348ce3d1ded00562699355e13b0266
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sat Mar 10 10:03:23 2012 +0000

    uxa: Simplify allocation of backing pixmap
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/intel_uxa.c b/src/intel_uxa.c
index 446befd..0b1a369 100644
--- a/src/intel_uxa.c
+++ b/src/intel_uxa.c
@@ -640,6 +640,9 @@ void intel_set_pixmap_bo(PixmapPtr pixmap, dri_bo * bo)
 
 		dri_bo_unreference(priv->bo);
 		list_del(&priv->batch);
+
+		free(priv);
+		priv = NULL;
 	}
 
 	if (bo != NULL) {
@@ -647,13 +650,11 @@ void intel_set_pixmap_bo(PixmapPtr pixmap, dri_bo * bo)
 		uint32_t swizzle_mode;
 		int ret;
 
-		if (priv == NULL) {
-			priv = calloc(1, sizeof (struct intel_pixmap));
-			if (priv == NULL)
-				goto BAIL;
+		priv = calloc(1, sizeof (struct intel_pixmap));
+		if (priv == NULL)
+			goto BAIL;
 
-			list_init(&priv->batch);
-		}
+		list_init(&priv->batch);
 
 		dri_bo_reference(bo);
 		priv->bo = bo;
@@ -668,11 +669,6 @@ void intel_set_pixmap_bo(PixmapPtr pixmap, dri_bo * bo)
 		priv->tiling = tiling;
 		priv->busy = -1;
 		priv->offscreen = 1;
-	} else {
-		if (priv != NULL) {
-			free(priv);
-			priv = NULL;
-		}
 	}
 
   BAIL:
commit de8ebd213871a19b8131b381d54e45c21325a5b0
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sat Mar 10 09:31:24 2012 +0000

    uxa/i915: Remove broken CA pass, fall back to magic 2-pass composite helper
    
    The backend failed to handle all the corner cases, so remove the
    complication.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/i915_render.c b/src/i915_render.c
index ea20322..c6d5ed7 100644
--- a/src/i915_render.c
+++ b/src/i915_render.c
@@ -694,7 +694,6 @@ i915_prepare_composite(int op, PicturePtr source_picture,
 	if (!intel_get_aperture_space(scrn, bo_table, ARRAY_SIZE(bo_table)))
 		return FALSE;
 
-	intel->needs_render_ca_pass = FALSE;
 	if (mask_picture != NULL && mask_picture->componentAlpha &&
 	    PICT_FORMAT_RGB(mask_picture->format)) {
 		/* Check if it's component alpha that relies on a source alpha
@@ -702,12 +701,8 @@ i915_prepare_composite(int op, PicturePtr source_picture,
 		 * into the single source value that we get to blend with.
 		 */
 		if (i915_blend_op[op].src_alpha &&
-		    (i915_blend_op[op].src_blend != BLENDFACT_ZERO)) {
-			if (op != PictOpOver)
-				return FALSE;
-
-			intel->needs_render_ca_pass = TRUE;
-		}
+		    (i915_blend_op[op].src_blend != BLENDFACT_ZERO))
+			return FALSE;
 	}
 
 	intel->transform[0] = NULL;
@@ -944,18 +939,12 @@ static void i915_emit_composite_setup(ScrnInfoPtr scrn)
 					   TEXCOORDFMT_2D : TEXCOORDFMT_4D);
 		}
 
-		if (intel->needs_render_ca_pass) {
-			OUT_BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 | I1_LOAD_S(2) | 0);
-			OUT_BATCH(ss2);
-		} else {
-			OUT_BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 | I1_LOAD_S(2) | I1_LOAD_S(6) | 1);
-			OUT_BATCH(ss2);
-			OUT_BATCH(i915_get_blend_cntl(op, mask_picture, dest_picture->format));
-		}
+		OUT_BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 | I1_LOAD_S(2) | I1_LOAD_S(6) | 1);
+		OUT_BATCH(ss2);
+		OUT_BATCH(i915_get_blend_cntl(op, mask_picture, dest_picture->format));
 	}
 
-	if (! intel->needs_render_ca_pass)
-		i915_composite_emit_shader(intel, op);
+	i915_composite_emit_shader(intel, op);
 }
 
 void
@@ -1000,14 +989,6 @@ i915_composite(PixmapPtr dest, int srcX, int srcY, int maskX, int maskY,
 	}
 
 	if (intel->prim_offset == 0) {
-		if (intel->needs_render_ca_pass) {
-			OUT_BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 | I1_LOAD_S(6) | 0);
-			OUT_BATCH(i915_get_blend_cntl(PictOpOutReverse,
-						      intel->render_mask_picture,
-						      intel->render_dest_picture->format));
-			i915_composite_emit_shader(intel, PictOpOutReverse);
-		}
-
 		intel->prim_offset = intel->batch_used;
 		OUT_BATCH(PRIM3D_RECTLIST | PRIM3D_INDIRECT_SEQUENTIAL);
 		OUT_BATCH(intel->vertex_index);
@@ -1032,16 +1013,6 @@ i915_vertex_flush(intel_screen_private *intel)
 	intel->batch_ptr[intel->prim_offset] |= intel->vertex_count;
 	intel->prim_offset = 0;
 
-	if (intel->needs_render_ca_pass) {
-		OUT_BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 | I1_LOAD_S(6) | 0);
-		OUT_BATCH(i915_get_blend_cntl(PictOpAdd,
-					      intel->render_mask_picture,
-					      intel->render_dest_picture->format));
-		i915_composite_emit_shader(intel, PictOpAdd);
-		OUT_BATCH(PRIM3D_RECTLIST | PRIM3D_INDIRECT_SEQUENTIAL | intel->vertex_count);
-		OUT_BATCH(intel->vertex_index);
-	}
-
 	intel->vertex_index += intel->vertex_count;
 	intel->vertex_count = 0;
 }
diff --git a/src/intel.h b/src/intel.h
index 2520831..34ca650 100644
--- a/src/intel.h
+++ b/src/intel.h
@@ -272,7 +272,6 @@ typedef struct intel_screen_private {
 	Bool needs_3d_invariant;
 	Bool needs_render_state_emit;
 	Bool needs_render_vertex_emit;
-	Bool needs_render_ca_pass;
 
 	/* i830 render accel state */
 	uint32_t render_dest_format;
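
The 2-pass helper works because component-alpha OVER factors into two operations the hardware can express: dst = dst * (1 - mask*src.a), an OutReverse with the CA mask, followed by dst += src * mask, an Add. A per-channel check with unit-interval floats (values illustrative):

    #include <assert.h>
    #include <math.h>

    int main(void)
    {
        double src = 0.6, src_a = 0.8, mask = 0.5, dst = 0.3;

        /* component-alpha OVER, one colour channel */
        double direct = src * mask + dst * (1.0 - mask * src_a);

        double pass = dst * (1.0 - mask * src_a); /* PictOpOutReverse */
        pass += src * mask;                       /* PictOpAdd */

        assert(fabs(direct - pass) < 1e-12);
        return 0;
    }
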
commit 7445948d10e51e686291587ff4a72de59f82b358
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sat Mar 10 09:28:27 2012 +0000

    uxa: Remove unused render_mask_solid members
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/intel.h b/src/intel.h
index 1a27cf9..2520831 100644
--- a/src/intel.h
+++ b/src/intel.h
@@ -268,9 +268,7 @@ typedef struct intel_screen_private {
 	PixmapPtr render_source, render_mask, render_dest;
 	PicturePtr render_source_picture, render_mask_picture, render_dest_picture;
 	CARD32 render_source_solid;
-	CARD32 render_mask_solid;
 	Bool render_source_is_solid;
-	Bool render_mask_is_solid;
 	Bool needs_3d_invariant;
 	Bool needs_render_state_emit;
 	Bool needs_render_vertex_emit;
commit 296a073c8ca5d83ec678e502c644db9237751bd1
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sat Mar 10 09:27:36 2012 +0000

    uxa: Remove unused tracking of the current render target
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/i915_render.c b/src/i915_render.c
index 9d8b8ac..ea20322 100644
--- a/src/i915_render.c
+++ b/src/i915_render.c
@@ -899,7 +899,7 @@ static void i915_emit_composite_setup(ScrnInfoPtr scrn)
 	 * XXX However for reasons unfathomed, correct rendering in KDE requires
 	 * at least a MI_FLUSH | INHIBIT_RENDER_CACHE_FLUSH here.
 	 */
-	if (1 || dest != intel->render_current_dest) {
+	if (1) {
 		uint32_t tiling_bits;
 
 		if (intel_pixmap_tiled(dest)) {
@@ -927,8 +927,6 @@ static void i915_emit_composite_setup(ScrnInfoPtr scrn)
 			  DRAW_XMAX(dest->drawable.width - 1));
 		/* yorig, xorig (relate to color buffer?) */
 		OUT_BATCH(0x00000000);
-
-		intel->render_current_dest = dest;
 	}
 
 	{
@@ -1052,6 +1050,5 @@ void
 i915_batch_commit_notify(intel_screen_private *intel)
 {
 	intel->needs_render_state_emit = TRUE;
-	intel->render_current_dest = NULL;
 	intel->last_floats_per_vertex = 0;
 }
diff --git a/src/intel.h b/src/intel.h
index be4e57f..1a27cf9 100644
--- a/src/intel.h
+++ b/src/intel.h
@@ -269,7 +269,6 @@ typedef struct intel_screen_private {
 	PicturePtr render_source_picture, render_mask_picture, render_dest_picture;
 	CARD32 render_source_solid;
 	CARD32 render_mask_solid;
-	PixmapPtr render_current_dest;
 	Bool render_source_is_solid;
 	Bool render_mask_is_solid;
 	Bool needs_3d_invariant;
diff --git a/src/intel_uxa.c b/src/intel_uxa.c
index c0e1183..446befd 100644
--- a/src/intel_uxa.c
+++ b/src/intel_uxa.c
@@ -628,12 +628,9 @@ dri_bo *intel_get_pixmap_bo(PixmapPtr pixmap)
 
 void intel_set_pixmap_bo(PixmapPtr pixmap, dri_bo * bo)
 {
-	ScrnInfoPtr scrn = xf86Screens[pixmap->drawable.pScreen->myNum];
-	intel_screen_private *intel = intel_get_screen_private(scrn);
 	struct intel_pixmap *priv;
 
 	priv = intel_get_pixmap_private(pixmap);
-
 	if (priv == NULL && bo == NULL)
 	    return;
 
@@ -643,9 +640,6 @@ void intel_set_pixmap_bo(PixmapPtr pixmap, dri_bo * bo)
 
 		dri_bo_unreference(priv->bo);
 		list_del(&priv->batch);
-
-		if (intel->render_current_dest == pixmap)
-		    intel->render_current_dest = NULL;
 	}
 
 	if (bo != NULL) {
@@ -1261,7 +1255,6 @@ Bool intel_uxa_init(ScreenPtr screen)
 	intel->uxa_driver->uxa_major = 1;
 	intel->uxa_driver->uxa_minor = 0;
 
-	intel->render_current_dest = NULL;
 	intel->prim_offset = 0;
 	intel->vertex_count = 0;
 	intel->vertex_offset = 0;
commit e40615b051c10a7702bbc1424d8ac2575b34eff5
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sat Mar 10 09:26:10 2012 +0000

    uxa: Simplify flush tracking
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/i830_render.c b/src/i830_render.c
index 3f3d2ef..c12e87b 100644
--- a/src/i830_render.c
+++ b/src/i830_render.c
@@ -563,8 +563,7 @@ i830_prepare_composite(int op, PicturePtr source_picture,
 		intel->s8_blendctl = blendctl;
 	}
 
-	if(intel_pixmap_is_dirty(source) ||
-	   (mask && intel_pixmap_is_dirty(mask)))
+	if (intel_pixmap_is_dirty(source) || intel_pixmap_is_dirty(mask))
 		intel_batch_emit_flush(scrn);
 
 	intel->needs_render_state_emit = TRUE;
diff --git a/src/i915_render.c b/src/i915_render.c
index 6210035..9d8b8ac 100644
--- a/src/i915_render.c
+++ b/src/i915_render.c
@@ -743,11 +743,7 @@ i915_prepare_composite(int op, PicturePtr source_picture,
 
 	intel->i915_render_state.op = op;
 
-	/* BUF_INFO is an implicit flush */
-	if (dest != intel->render_current_dest)
-		intel_batch_do_flush(scrn);
-	else if((source && intel_pixmap_is_dirty(source)) ||
-		(mask && intel_pixmap_is_dirty(mask)))
+	if (intel_pixmap_is_dirty(source) || intel_pixmap_is_dirty(mask))
 		intel_batch_emit_flush(scrn);
 
 	intel->needs_render_state_emit = TRUE;
@@ -906,8 +902,6 @@ static void i915_emit_composite_setup(ScrnInfoPtr scrn)
 	if (1 || dest != intel->render_current_dest) {
 		uint32_t tiling_bits;
 
-		intel_batch_do_flush(scrn);
-
 		if (intel_pixmap_tiled(dest)) {
 			tiling_bits = BUF_3D_TILED_SURFACE;
 			if (intel_get_pixmap_private(dest)->tiling
diff --git a/src/i965_render.c b/src/i965_render.c
index 8907139..b981ecc 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -2035,8 +2035,7 @@ i965_prepare_composite(int op, PicturePtr source_picture,
 	}
 
 	/* Flush any pending writes prior to relocating the textures. */
-	if (intel_pixmap_is_dirty(source) ||
-	    (mask && intel_pixmap_is_dirty(mask)))
+	if (intel_pixmap_is_dirty(source) || intel_pixmap_is_dirty(mask))
 		intel_batch_emit_flush(scrn);
 
 	composite_op->op = op;
diff --git a/src/intel.h b/src/intel.h
index f328d31..be4e57f 100644
--- a/src/intel.h
+++ b/src/intel.h
@@ -82,12 +82,12 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 struct intel_pixmap {
 	dri_bo *bo;
 
-	struct list flush, batch;
+	struct list batch;
 
 	uint16_t stride;
 	uint8_t tiling;
 	int8_t busy :2;
-	int8_t batch_write :1;
+	int8_t dirty :1;
 	int8_t offscreen :1;
 	int8_t pinned :1;
 };
@@ -121,7 +121,7 @@ static inline void intel_set_pixmap_private(PixmapPtr pixmap, struct intel_pixma
 
 static inline Bool intel_pixmap_is_dirty(PixmapPtr pixmap)
 {
-	return !list_is_empty(&intel_get_pixmap_private(pixmap)->flush);
+	return pixmap && intel_get_pixmap_private(pixmap)->dirty;
 }
 
 static inline Bool intel_pixmap_tiled(PixmapPtr pixmap)
@@ -188,7 +188,6 @@ typedef struct intel_screen_private {
 	/** Ending batch_used that was verified by intel_start_batch_atomic() */
 	int batch_atomic_limit;
 	struct list batch_pixmaps;
-	struct list flush_pixmaps;
 	drm_intel_bo *wa_scratch_bo;
 	OsTimerPtr cache_expire;
 
diff --git a/src/intel_batchbuffer.c b/src/intel_batchbuffer.c
index 8e54d3a..2719c38 100644
--- a/src/intel_batchbuffer.c
+++ b/src/intel_batchbuffer.c
@@ -114,17 +114,15 @@ void intel_batch_teardown(ScrnInfoPtr scrn)
 
 	while (!list_is_empty(&intel->batch_pixmaps))
 		list_del(intel->batch_pixmaps.next);
-
-	while (!list_is_empty(&intel->flush_pixmaps))
-		list_del(intel->flush_pixmaps.next);
 }
 
-void intel_batch_do_flush(ScrnInfoPtr scrn)
+static void intel_batch_do_flush(ScrnInfoPtr scrn)
 {
 	intel_screen_private *intel = intel_get_screen_private(scrn);
+	struct intel_pixmap *priv;
 
-	while (!list_is_empty(&intel->flush_pixmaps))
-		list_del(intel->flush_pixmaps.next);
+	list_for_each_entry(priv, &intel->batch_pixmaps, batch)
+		priv->dirty = 0;
 }
 
 static void intel_emit_post_sync_nonzero_flush(ScrnInfoPtr scrn)
@@ -268,13 +266,10 @@ void intel_batch_submit(ScrnInfoPtr scrn)
 					 batch);
 
 		entry->busy = -1;
-		entry->batch_write = 0;
+		entry->dirty = 0;
 		list_del(&entry->batch);
 	}
 
-	while (!list_is_empty(&intel->flush_pixmaps))
-		list_del(intel->flush_pixmaps.next);
-
 	if (intel->debug_flush & DEBUG_FLUSH_WAIT)
 		drm_intel_bo_wait_rendering(intel->batch_bo);
 
diff --git a/src/intel_batchbuffer.h b/src/intel_batchbuffer.h
index f5f118e..b2bb390 100644
--- a/src/intel_batchbuffer.h
+++ b/src/intel_batchbuffer.h
@@ -36,7 +36,6 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 void intel_batch_init(ScrnInfoPtr scrn);
 void intel_batch_teardown(ScrnInfoPtr scrn);
 void intel_batch_emit_flush(ScrnInfoPtr scrn);
-void intel_batch_do_flush(ScrnInfoPtr scrn);
 void intel_batch_submit(ScrnInfoPtr scrn);
 
 static inline int intel_batch_space(intel_screen_private *intel)
@@ -132,10 +131,8 @@ intel_batch_mark_pixmap_domains(intel_screen_private *intel,
 
 	if (list_is_empty(&priv->batch))
 		list_add(&priv->batch, &intel->batch_pixmaps);
-	if (write_domain && list_is_empty(&priv->flush))
-		list_add(&priv->flush, &intel->flush_pixmaps);
 
-	priv->batch_write |= write_domain != 0;
+	priv->dirty |= write_domain != 0;
 	priv->busy = 1;
 
 	intel->needs_flush |= write_domain != 0;
diff --git a/src/intel_driver.c b/src/intel_driver.c
index 9e2aa59..f454f92 100644
--- a/src/intel_driver.c
+++ b/src/intel_driver.c
@@ -412,7 +412,6 @@ static int intel_init_bufmgr(intel_screen_private *intel)
 	drm_intel_bufmgr_gem_enable_fenced_relocs(intel->bufmgr);
 
 	list_init(&intel->batch_pixmaps);
-	list_init(&intel->flush_pixmaps);
 
 	if ((INTEL_INFO(intel)->gen == 60)) {
 		intel->wa_scratch_bo =
diff --git a/src/intel_uxa.c b/src/intel_uxa.c
index 7fb5a96..c0e1183 100644
--- a/src/intel_uxa.c
+++ b/src/intel_uxa.c
@@ -643,7 +643,6 @@ void intel_set_pixmap_bo(PixmapPtr pixmap, dri_bo * bo)
 
 		dri_bo_unreference(priv->bo);
 		list_del(&priv->batch);
-		list_del(&priv->flush);
 
 		if (intel->render_current_dest == pixmap)
 		    intel->render_current_dest = NULL;
@@ -660,7 +659,6 @@ void intel_set_pixmap_bo(PixmapPtr pixmap, dri_bo * bo)
 				goto BAIL;
 
 			list_init(&priv->batch);
-			list_init(&priv->flush);
 		}
 
 		dri_bo_reference(bo);
@@ -710,8 +708,7 @@ static Bool intel_uxa_prepare_access(PixmapPtr pixmap, uxa_access_t access)
 
 	/* When falling back to swrast, flush all pending operations */
 	intel_glamor_flush(intel);
-	if (!list_is_empty(&priv->batch) &&
-	    (access == UXA_ACCESS_RW || priv->batch_write))
+	if (access == UXA_ACCESS_RW || priv->dirty)
 		intel_batch_submit(scrn);
 
 	assert(bo->size <= intel->max_gtt_map_size);
@@ -1105,7 +1102,6 @@ intel_uxa_create_pixmap(ScreenPtr screen, int w, int h, int depth,
 		priv->offscreen = 1;
 
 		list_init(&priv->batch);
-		list_init(&priv->flush);
 		intel_set_pixmap_private(pixmap, priv);
 
 		screen->ModifyPixmapHeader(pixmap, w, h, 0, 0, stride, NULL);
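
For reference: after this commit the flush bookkeeping reduces to a single
bit per pixmap. Writes set priv->dirty in intel_batch_mark_pixmap_domains(),
and a flush clears it for everything on the batch list instead of
maintaining a parallel flush_pixmaps list. A minimal sketch of the scheme
(field and list names as in the hunks above; the wrapper names here are
shortened for illustration):

static void mark_pixmap_domains(intel_screen_private *intel,
				struct intel_pixmap *priv,
				uint32_t write_domain)
{
	/* first use in this batch: track it for end-of-batch cleanup */
	if (list_is_empty(&priv->batch))
		list_add(&priv->batch, &intel->batch_pixmaps);

	priv->dirty |= write_domain != 0;
	priv->busy = 1;
}

static void batch_flushed(intel_screen_private *intel)
{
	struct intel_pixmap *priv;

	/* a flush makes all pending writes visible, so nothing stays dirty */
	list_for_each_entry(priv, &intel->batch_pixmaps, batch)
		priv->dirty = 0;
}

intel_pixmap_is_dirty() correspondingly collapses to a NULL-safe bit test,
which is why the prepare_composite paths above can drop their "mask &&"
guards.
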
commit 92aafe43b7760fd03e37803c3715abed768ce3a4
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sat Mar 10 09:10:16 2012 +0000

    uxa: Kill the complicated in-flight tracking
    
    Reference leak hunting.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/intel.h b/src/intel.h
index 1c68af0..f328d31 100644
--- a/src/intel.h
+++ b/src/intel.h
@@ -82,7 +82,7 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 struct intel_pixmap {
 	dri_bo *bo;
 
-	struct list flush, batch, in_flight;
+	struct list flush, batch;
 
 	uint16_t stride;
 	uint8_t tiling;
@@ -189,7 +189,6 @@ typedef struct intel_screen_private {
 	int batch_atomic_limit;
 	struct list batch_pixmaps;
 	struct list flush_pixmaps;
-	struct list in_flight;
 	drm_intel_bo *wa_scratch_bo;
 	OsTimerPtr cache_expire;
 
diff --git a/src/intel_batchbuffer.c b/src/intel_batchbuffer.c
index 2b8fbb6..8e54d3a 100644
--- a/src/intel_batchbuffer.c
+++ b/src/intel_batchbuffer.c
@@ -117,18 +117,6 @@ void intel_batch_teardown(ScrnInfoPtr scrn)
 
 	while (!list_is_empty(&intel->flush_pixmaps))
 		list_del(intel->flush_pixmaps.next);
-
-	while (!list_is_empty(&intel->in_flight)) {
-		struct intel_pixmap *entry;
-
-		entry = list_first_entry(&intel->in_flight,
-					 struct intel_pixmap,
-					 in_flight);
-
-		dri_bo_unreference(entry->bo);
-		list_del(&entry->in_flight);
-		free(entry);
-	}
 }
 
 void intel_batch_do_flush(ScrnInfoPtr scrn)
@@ -287,18 +275,6 @@ void intel_batch_submit(ScrnInfoPtr scrn)
 	while (!list_is_empty(&intel->flush_pixmaps))
 		list_del(intel->flush_pixmaps.next);
 
-	while (!list_is_empty(&intel->in_flight)) {
-		struct intel_pixmap *entry;
-
-		entry = list_first_entry(&intel->in_flight,
-					 struct intel_pixmap,
-					 in_flight);
-
-		dri_bo_unreference(entry->bo);
-		list_del(&entry->in_flight);
-		free(entry);
-	}
-
 	if (intel->debug_flush & DEBUG_FLUSH_WAIT)
 		drm_intel_bo_wait_rendering(intel->batch_bo);
 
diff --git a/src/intel_driver.c b/src/intel_driver.c
index da340e3..9e2aa59 100644
--- a/src/intel_driver.c
+++ b/src/intel_driver.c
@@ -413,7 +413,6 @@ static int intel_init_bufmgr(intel_screen_private *intel)
 
 	list_init(&intel->batch_pixmaps);
 	list_init(&intel->flush_pixmaps);
-	list_init(&intel->in_flight);
 
 	if ((INTEL_INFO(intel)->gen == 60)) {
 		intel->wa_scratch_bo =
diff --git a/src/intel_uxa.c b/src/intel_uxa.c
index ed4f375..7fb5a96 100644
--- a/src/intel_uxa.c
+++ b/src/intel_uxa.c
@@ -641,16 +641,9 @@ void intel_set_pixmap_bo(PixmapPtr pixmap, dri_bo * bo)
 		if (priv->bo == bo)
 			return;
 
-		if (list_is_empty(&priv->batch)) {
-			dri_bo_unreference(priv->bo);
-		} else if (!drm_intel_bo_is_reusable(priv->bo)) {
-			dri_bo_unreference(priv->bo);
-			list_del(&priv->batch);
-			list_del(&priv->flush);
-		} else {
-			list_add(&priv->in_flight, &intel->in_flight);
-			priv = NULL;
-		}
+		dri_bo_unreference(priv->bo);
+		list_del(&priv->batch);
+		list_del(&priv->flush);
 
 		if (intel->render_current_dest == pixmap)
 		    intel->render_current_dest = NULL;
@@ -1088,45 +1081,6 @@ intel_uxa_create_pixmap(ScreenPtr screen, int w, int h, int depth,
 		if (size > intel->max_bo_size || stride >= KB(32))
 			goto fallback_pixmap;
 
-		/* Perform a preliminary search for an in-flight bo */
-		if (usage != UXA_CREATE_PIXMAP_FOR_MAP) {
-			int aligned_h;
-
-			if (tiling == I915_TILING_X)
-				aligned_h = ALIGN(h, 8);
-			else if (tiling == I915_TILING_Y)
-				aligned_h = ALIGN(h, 32);
-			else
-				aligned_h = ALIGN(h, 2);
-
-			list_for_each_entry(priv, &intel->in_flight, in_flight) {
-				if (priv->tiling != tiling)
-					continue;
-
-				if (tiling == I915_TILING_NONE) {
-				    if (priv->bo->size < size)
-					    continue;
-
-					priv->stride = stride;
-				} else {
-					if (priv->stride < stride ||
-					    priv->bo->size < priv->stride * aligned_h)
-						continue;
-
-					stride = priv->stride;
-				}
-
-				list_del(&priv->in_flight);
-				intel_set_pixmap_private(pixmap, priv);
-
-				screen->ModifyPixmapHeader(pixmap, w, h, 0, 0, stride, NULL);
-
-				if (!intel_glamor_create_textured_pixmap(pixmap))
-					goto fallback_glamor;
-				return pixmap;
-			}
-		}
-
 		priv = calloc(1, sizeof (struct intel_pixmap));
 		if (priv == NULL)
 			goto fallback_pixmap;
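
For context: the machinery deleted here parked a still-busy but reusable bo
on the in_flight list whenever its pixmap was handed a new bo, so that a
later intel_uxa_create_pixmap() of compatible size and tiling could adopt
it. Dropping that trades the opportunistic reuse for unambiguous ownership
(each bo now has a single owner and at most the batch list to sit on),
which is what the "reference leak hunting" note is after.
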
commit 6889e2cbc502466936622613018bfab18abd98ba
Author: Thierry Reding <thierry.reding at avionic-design.de>
Date:   Thu Mar 15 13:10:20 2012 +0100

    configure: Keep passed-in CFLAGS for DRI tests
    
    When the user passes extra CFLAGS and CPPFLAGS to the configure script,
    they should be kept when performing subsequent checks with additional
    flags. This is required to properly build in cross-compilation setups
    where the user may pass in flags like --sysroot in order to pick up the
    cross-built dependencies.
    
    Signed-off-by: Thierry Reding <thierry.reding at avionic-design.de>

diff --git a/configure.ac b/configure.ac
index 06c40e8..63101bf 100644
--- a/configure.ac
+++ b/configure.ac
@@ -191,8 +191,8 @@ sdkdir=`$PKG_CONFIG --variable=sdkdir xorg-server`
 if test "x$enable_dri" != "xno"; then
         save_CFLAGS="$CFLAGS"
         save_CPPFLAGS="$CPPFLAGS"
-        CFLAGS="$XORG_CFLAGS $DRI_CFLAGS $DRM_CFLAGS"
-        CPPFLAGS="$XORG_CFLAGS $DRI_CFLAGS $DRM_CFLAGS"
+        CFLAGS="$CFLAGS $XORG_CFLAGS $DRI_CFLAGS $DRM_CFLAGS"
+        CPPFLAGS="$CPPFLAGS $XORG_CFLAGS $DRI_CFLAGS $DRM_CFLAGS"
         AC_CHECK_HEADERS([dri.h sarea.h dristruct.h],, [DRI=no],
                 [/* for dri.h */
                  #include <xf86str.h>
commit 854f7a89f39a5042adb118d72143a110ae1fcee0
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Mar 15 12:38:22 2012 +0000

    sna/traps: dst IN WHITE does not reduce to SRC!
    
    I was getting too carried away with my reductions. However, IN over a
    clear surface is a no-op, though unlikely!
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_trapezoids.c b/src/sna/sna_trapezoids.c
index e73805d..907ece0 100644
--- a/src/sna/sna_trapezoids.c
+++ b/src/sna/sna_trapezoids.c
@@ -4071,7 +4071,6 @@ trapezoid_span_inplace(CARD8 op, PicturePtr src, PicturePtr dst,
 	struct sna_pixmap *priv;
 	RegionRec region;
 	uint32_t color;
-	bool unbounded;
 	int16_t dst_x, dst_y;
 	int dx, dy;
 	int n;
@@ -4120,17 +4119,14 @@ trapezoid_span_inplace(CARD8 op, PicturePtr src, PicturePtr dst,
 		return false;
 	}
 
-	unbounded = false;
 	switch (op) {
-	case PictOpIn:
-		unbounded = true;
-		if (priv->clear && priv->clear_color == 0xff)
-			op = PictOpSrc;
-		break;
 	case PictOpAdd:
 		if (priv->clear && priv->clear_color == 0)
 			op = PictOpSrc;
 		break;
+	case PictOpIn:
+		if (priv->clear && priv->clear_color == 0)
+			return true;
 	case PictOpSrc:
 		break;
 	default:
@@ -4251,7 +4247,7 @@ trapezoid_span_inplace(CARD8 op, PicturePtr src, PicturePtr dst,
 	inplace.opacity = color >> 24;
 
 	tor_render(NULL, &tor, (void*)&inplace,
-		   dst->pCompositeClip, span, unbounded);
+		   dst->pCompositeClip, span, op == PictOpIn);
 
 	tor_fini(&tor);
 
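Sketching the per-pixel algebra behind the revert, with c the trapezoid
coverage, s the source and alpha_d the destination alpha: IN is an unbounded
operator, so uncovered pixels see a transparent source, giving

	d' = c*(s*alpha_d) + (1 - c)*(0*alpha_d) = c*s*alpha_d

Over solid white (alpha_d = 1) that is d' = c*s, whereas the bounded SRC it
had been reduced to computes d' = c*s + (1 - c)*d and so leaves d untouched
wherever c = 0; the two disagree everywhere outside the traps. Over a clear
destination (d = 0, alpha_d = 0) every term vanishes and d' = 0 = d, the
no-op that the new PictOpIn case returns early on.
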
commit f7325d75a6da85a0ea9460b3f8f986d1264c20e2
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Mar 15 10:31:51 2012 +0000

    sna/traps: Fix off-by-one for filling vertical segments in tor_inplace
    
    If the last solid portion was exactly 4 pixels wide, we would miss
    filling in the mask.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_trapezoids.c b/src/sna/sna_trapezoids.c
index 8ecd18a..e73805d 100644
--- a/src/sna/sna_trapezoids.c
+++ b/src/sna/sna_trapezoids.c
@@ -1437,7 +1437,7 @@ inplace_row(struct active_list *active, uint8_t *row, int width)
 			}
 
 			winding += right->dir;
-			if (0 == winding)
+			if (0 == winding && right->x.quo != right->next->x.quo)
 				break;
 
 			right = right->next;
@@ -1445,7 +1445,7 @@ inplace_row(struct active_list *active, uint8_t *row, int width)
 
 		if (left->x.quo < 0) {
 			lix = lfx = 0;
-		} else if (left->x.quo > width * FAST_SAMPLES_X) {
+		} else if (left->x.quo >= width * FAST_SAMPLES_X) {
 			lix = width;
 			lfx = 0;
 		} else
@@ -1453,7 +1453,7 @@ inplace_row(struct active_list *active, uint8_t *row, int width)
 
 		if (right->x.quo < 0) {
 			rix = rfx = 0;
-		} else if (right->x.quo > width * FAST_SAMPLES_X) {
+		} else if (right->x.quo >= width * FAST_SAMPLES_X) {
 			rix = width;
 			rfx = 0;
 		} else
@@ -1478,12 +1478,14 @@ inplace_row(struct active_list *active, uint8_t *row, int width)
 				else
 					memset(row+lix, 0xff, rix);
 #else
-				while (rix && lix & 3)
-					row[lix++] = 0xff, rix--;
-				while (rix > 4) {
+				while (rix >= 8) {
+					*(uint64_t *)(row+lix) = 0xffffffffffffffff;
+					lix += 8;
+					rix -= 8;
+				}
+				if (rix & 4) {
 					*(uint32_t *)(row+lix) = 0xffffffff;
 					lix += 4;
-					rix -= 4;
 				}
 				if (rix & 2) {
 					*(uint16_t *)(row+lix) = 0xffff;
@@ -1533,16 +1535,16 @@ inplace_subrow(struct active_list *active, int8_t *row,
 							*min = ix;
 
 						row[ix++] += FAST_SAMPLES_X - fx;
-						if (ix < width)
+						if (fx && ix < width)
 							row[ix] += fx;
 					}
 
 					xstart = edge->x.quo;
 					if (xstart < FAST_SAMPLES_X * width) {
 						FAST_SAMPLES_X_TO_INT_FRAC(xstart, ix, fx);
-						row[ix++] -= FAST_SAMPLES_X - fx;
-						if (ix < width)
-							row[ix] -= fx;
+						row[ix] -= FAST_SAMPLES_X - fx;
+						if (fx && ix + 1< width)
+							row[++ix] -= fx;
 
 						if (ix > *max)
 							*max = ix;
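
The arithmetic of the fix: in the old tail, whenever the residual byte count
rix was a multiple of four, "while (rix > 4)" stopped with rix == 4 and the
subsequent "rix & 2" and "rix & 1" tests both saw zero bits, so the last
four bytes of the mask were never written. Draining 8-byte runs with
">= 8" and then testing each of the 4-, 2- and 1-byte tails with "&" covers
every residue. The inplace_subrow() hunks add the matching "fx &&" guards,
so an edge landing exactly on a pixel boundary no longer advances ix (and
with it the recorded *max) one cell too far.
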
commit aa1ad5f6cf6d3da212e38232bbe8d82342d488f2
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Mar 14 23:43:20 2012 +0000

    sna/damage: Handle a reduced damage rather than assert
    
    As we may reduce a damage to empty along the migration paths and not
    detect that reduced damage till later, handle those scenarios rather
    than asserting.
    
    References: https://bugs.freedesktop.org/show_bug.cgi?id=42426
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_damage.c b/src/sna/sna_damage.c
index f52ecac..9c646d8 100644
--- a/src/sna/sna_damage.c
+++ b/src/sna/sna_damage.c
@@ -1005,12 +1005,16 @@ static struct sna_damage *__sna_damage_subtract(struct sna_damage *damage,
 	if (damage == NULL)
 		return NULL;
 
+	if (!RegionNotEmpty(&damage->region)) {
+		__sna_damage_destroy(damage);
+		return NULL;
+	}
+
 	assert(RegionNotEmpty(region));
 
 	if (!sna_damage_maybe_contains_box(damage, &region->extents))
 		return damage;
 
-	assert(RegionNotEmpty(&damage->region));
 
 	if (region_is_singular(region) &&
 	    box_contains(&region->extents, &damage->extents)) {
@@ -1028,15 +1032,12 @@ static struct sna_damage *__sna_damage_subtract(struct sna_damage *damage,
 	}
 
 	if (damage->mode != DAMAGE_SUBTRACT) {
-		if (damage->dirty)
+		if (damage->dirty) {
 			__sna_damage_reduce(damage);
-
-		if (pixman_region_equal(region, &damage->region)) {
-			__sna_damage_destroy(damage);
-			return NULL;
+			assert(RegionNotEmpty(&damage->region));
 		}
 
-		if (!pixman_region_not_empty(&damage->region)) {
+		if (pixman_region_equal(region, &damage->region)) {
 			__sna_damage_destroy(damage);
 			return NULL;
 		}
@@ -1091,23 +1092,23 @@ inline static struct sna_damage *__sna_damage_subtract_box(struct sna_damage *da
 	if (damage == NULL)
 		return NULL;
 
+	if (!RegionNotEmpty(&damage->region)) {
+		__sna_damage_destroy(damage);
+		return NULL;
+	}
+
 	if (!sna_damage_maybe_contains_box(damage, box))
 		return damage;
 
-	assert(RegionNotEmpty(&damage->region));
-
 	if (box_contains(box, &damage->extents)) {
 		__sna_damage_destroy(damage);
 		return NULL;
 	}
 
 	if (damage->mode != DAMAGE_SUBTRACT) {
-		if (damage->dirty)
+		if (damage->dirty) {
 			__sna_damage_reduce(damage);
-
-		if (!pixman_region_not_empty(&damage->region)) {
-			__sna_damage_destroy(damage);
-			return NULL;
+			assert(RegionNotEmpty(&damage->region));
 		}
 
 		if (region_is_singular(&damage->region)) {
@@ -1163,6 +1164,11 @@ static struct sna_damage *__sna_damage_subtract_boxes(struct sna_damage *damage,
 	if (damage == NULL)
 		return NULL;
 
+	if (!RegionNotEmpty(&damage->region)) {
+		__sna_damage_destroy(damage);
+		return NULL;
+	}
+
 	assert(n);
 
 	extents = box[0];
@@ -1191,12 +1197,9 @@ static struct sna_damage *__sna_damage_subtract_boxes(struct sna_damage *damage,
 		return __sna_damage_subtract_box(damage, &extents);
 
 	if (damage->mode != DAMAGE_SUBTRACT) {
-		if (damage->dirty)
+		if (damage->dirty) {
 			__sna_damage_reduce(damage);
-
-		if (!pixman_region_not_empty(&damage->region)) {
-			__sna_damage_destroy(damage);
-			return NULL;
+			assert(RegionNotEmpty(&damage->region));
 		}
 
 		damage->mode = DAMAGE_SUBTRACT;
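
The same preamble now opens all three subtract variants: a damage whose
region is already empty tracks nothing, so it is destroyed and NULL returned
instead of asserting that the case cannot occur. The assert moves inside the
dirty branch, where __sna_damage_reduce() has just recomputed the region and
emptiness really would indicate a bug, rather than a stale reduction
inherited from the migration paths.
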
commit 412d030d08d741304dbbf849554bf71bfe678ad6
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Mar 14 23:08:31 2012 +0000

    sna: Treat unmapped but CPU-mappable bo as available for mapping
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.h b/src/sna/kgem.h
index 7e5ffac..98534d9 100644
--- a/src/sna/kgem.h
+++ b/src/sna/kgem.h
@@ -443,7 +443,7 @@ static inline bool kgem_bo_mapped(struct kgem_bo *bo)
 	DBG_HDR(("%s: map=%p, tiling=%d\n", __FUNCTION__, bo->map, bo->tiling));
 
 	if (bo->map == NULL)
-		return false;
+		return bo->tiling == I915_TILING_NONE && bo->domain == DOMAIN_CPU;
 
 	return IS_CPU_MAP(bo->map) == !bo->tiling;
 }
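
In effect the predicate becomes "mapped, or trivially mappable": a bo with
no mapping yet now reports true when it is linear and already in the CPU
domain, presumably because a fresh CPU mmap of such a bo involves no domain
transition, so callers keep it on the cheap CPU-mapping path. Tiled or
GPU-domain bos still report false.
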
commit 4d657144c45b836098c3c7fb4f9b58a5cbea615b
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Mar 14 23:05:58 2012 +0000

    sna: Disable tiling for single row pixmaps (unless required for hw limits)
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index a1ed6ca..daca7af 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -2330,6 +2330,12 @@ int kgem_choose_tiling(struct kgem *kgem, int tiling, int width, int height, int
 	if (tiling < 0)
 		return tiling;
 
+	if (tiling && height == 1) {
+		DBG(("%s: disabling tiling [%d] for single row\n",
+		     __FUNCTION__,height));
+		tiling = I915_TILING_NONE;
+		goto done;
+	}
 	if (tiling == I915_TILING_Y && height <= 16) {
 		DBG(("%s: too short [%d] for TILING_Y\n",
 		     __FUNCTION__,height));
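
The rationale: a tile spans several rows (8 for X-tiling, 32 for Y, the same
constants the removed aligned_h computation used earlier in this log), so a
height-1 pixmap gains no 2D locality from tiling and would only be padded
out to a full tile height. The "goto done" still runs the remaining checks
in kgem_choose_tiling(), which is presumably where the "unless required for
hw limits" caveat applies, re-enabling tiling for rows too wide for an
untiled surface.
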
commit e3b6eb29dfccea59f24e28c0e1ae1b8bafb24330
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Mar 14 21:30:13 2012 +0000

    sna/traps: Explicitly create an unattached pixmap for fallback
    
    References: https://bugs.freedesktop.org/show_bug.cgi?id=42426
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna.h b/src/sna/sna.h
index 6ec4748..55c5f5a 100644
--- a/src/sna/sna.h
+++ b/src/sna/sna.h
@@ -419,6 +419,8 @@ static inline Bool pixmap_is_scanout(PixmapPtr pixmap)
 PixmapPtr sna_pixmap_create_upload(ScreenPtr screen,
 				   int width, int height, int depth,
 				   unsigned flags);
+PixmapPtr sna_pixmap_create_unattached(ScreenPtr screen,
+				       int width, int height, int depth);
 
 struct sna_pixmap *sna_pixmap_move_to_gpu(PixmapPtr pixmap, unsigned flags);
 struct sna_pixmap *sna_pixmap_force_to_gpu(PixmapPtr pixmap, unsigned flags);
diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index a5da2ca..63afaaa 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -623,6 +623,15 @@ sna_pixmap_create_shm(ScreenPtr screen,
 	return pixmap;
 }
 
+PixmapPtr
+sna_pixmap_create_unattached(ScreenPtr screen,
+			     int width, int height, int depth)
+{
+	return create_pixmap(to_sna_from_screen(screen),
+			     screen, width, height, depth,
+			     CREATE_PIXMAP_USAGE_SCRATCH);
+}
+
 static PixmapPtr
 sna_pixmap_create_scratch(ScreenPtr screen,
 			  int width, int height, int depth,
diff --git a/src/sna/sna_trapezoids.c b/src/sna/sna_trapezoids.c
index f0f48e8..8ecd18a 100644
--- a/src/sna/sna_trapezoids.c
+++ b/src/sna/sna_trapezoids.c
@@ -2464,9 +2464,8 @@ trapezoids_fallback(CARD8 op, PicturePtr src, PicturePtr dst,
 				pixman_image_unref(image);
 			}
 		} else {
-			scratch = screen->CreatePixmap(screen,
-						       width, height, depth,
-						       CREATE_PIXMAP_USAGE_SCRATCH);
+			scratch = sna_pixmap_create_unattached(screen,
+							       width, height, depth);
 			if (!scratch)
 				return;
 
commit f4b0d177393579188789c3730643b2af2bbed093
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Mar 14 20:48:56 2012 +0000

    sna: Avoid using kgem_bo_reference() internally
    
    So that we can keep the assertion to track the refcnt elsewhere.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 761218f..a1ed6ca 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -2275,8 +2275,10 @@ struct kgem_bo *kgem_create_linear(struct kgem *kgem, int size, unsigned flags)
 
 	size = (size + PAGE_SIZE - 1) / PAGE_SIZE;
 	bo = search_linear_cache(kgem, size, CREATE_INACTIVE | flags);
-	if (bo)
-		return kgem_bo_reference(bo);
+	if (bo) {
+		bo->refcnt = 1;
+		return bo;
+	}
 
 	handle = gem_create(kgem->fd, size);
 	if (handle == 0)
@@ -2505,7 +2507,8 @@ struct kgem_bo *kgem_create_2d(struct kgem *kgem,
 			bo->delta = 0;
 			DBG(("  1:from active: pitch=%d, tiling=%d, handle=%d, id=%d\n",
 			     bo->pitch, bo->tiling, bo->handle, bo->unique_id));
-			return kgem_bo_reference(bo);
+			bo->refcnt = 1;
+			return bo;
 		}
 
 		goto create;
@@ -2556,7 +2559,8 @@ struct kgem_bo *kgem_create_2d(struct kgem *kgem,
 				     bo->pitch, bo->tiling, bo->handle, bo->unique_id));
 				assert(bo->reusable);
 				assert(bo->domain != DOMAIN_GPU && !kgem_busy(kgem, bo->handle));
-				return kgem_bo_reference(bo);
+				bo->refcnt = 1;
+				return bo;
 			}
 		} while (!list_is_empty(cache) && kgem_retire(kgem));
 	}
@@ -2609,7 +2613,8 @@ search_again:
 			bo->delta = 0;
 			DBG(("  1:from active: pitch=%d, tiling=%d, handle=%d, id=%d\n",
 			     bo->pitch, bo->tiling, bo->handle, bo->unique_id));
-			return kgem_bo_reference(bo);
+			bo->refcnt = 1;
+			return bo;
 		}
 	} else {
 		list_for_each_entry(bo, cache, list) {
@@ -2629,7 +2634,8 @@ search_again:
 			bo->delta = 0;
 			DBG(("  1:from active: pitch=%d, tiling=%d, handle=%d, id=%d\n",
 			     bo->pitch, bo->tiling, bo->handle, bo->unique_id));
-			return kgem_bo_reference(bo);
+			bo->refcnt = 1;
+			return bo;
 		}
 	}
 
@@ -2661,7 +2667,8 @@ search_again:
 					bo->delta = 0;
 					DBG(("  1:from active: pitch=%d, tiling=%d, handle=%d, id=%d\n",
 					     bo->pitch, bo->tiling, bo->handle, bo->unique_id));
-					return kgem_bo_reference(bo);
+					bo->refcnt = 1;
+					return bo;
 				}
 			}
 		}
@@ -2706,7 +2713,8 @@ search_again:
 				bo->delta = 0;
 				DBG(("  1:from active: pitch=%d, tiling=%d, handle=%d, id=%d\n",
 				     bo->pitch, bo->tiling, bo->handle, bo->unique_id));
-				return kgem_bo_reference(bo);
+				bo->refcnt = 1;
+				return bo;
 			}
 		}
 	}
@@ -2763,7 +2771,8 @@ search_inactive:
 		assert((flags & CREATE_INACTIVE) == 0 || bo->domain != DOMAIN_GPU);
 		assert((flags & CREATE_INACTIVE) == 0 ||
 		       !kgem_busy(kgem, bo->handle));
-		return kgem_bo_reference(bo);
+		bo->refcnt = 1;
+		return bo;
 	}
 
 	if (flags & CREATE_INACTIVE && !list_is_empty(&kgem->requests)) {
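
This pairs with the assert(bo->refcnt) added to kgem_bo_reference() in
c5b72861 below: a bo resting in one of the caches holds no references, so
resurrecting it through kgem_bo_reference() would both trip that assert and
blur the distinction between taking the first reference and taking an extra
one. A minimal sketch of the resulting convention (resurrect() is an
illustrative name, not a function in the patch):

static inline struct kgem_bo *kgem_bo_reference(struct kgem_bo *bo)
{
	assert(bo->refcnt);	/* legal only on an already-owned bo */
	bo->refcnt++;
	return bo;
}

static struct kgem_bo *resurrect(struct kgem_bo *bo)
{
	/* leaving a cache list: install the first owner directly */
	assert(bo->refcnt == 0);
	bo->refcnt = 1;
	return bo;
}
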
commit 915742d86504f0e13c3cd67c7839dc3367521d7e
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Mar 14 20:46:59 2012 +0000

    sna: Elide no-op image glyphs
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index e957b1b..a5da2ca 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -10163,7 +10163,8 @@ sna_glyph_blt(DrawablePtr drawable, GCPtr gc,
 	      int _x, int _y, unsigned int _n,
 	      CharInfoPtr *_info,
 	      RegionRec *clip,
-	      uint32_t fg, uint32_t bg)
+	      uint32_t fg, uint32_t bg,
+	      bool transparent)
 {
 	PixmapPtr pixmap = get_drawable_pixmap(drawable);
 	struct sna *sna = to_sna_from_pixmap(pixmap);
@@ -10174,7 +10175,7 @@ sna_glyph_blt(DrawablePtr drawable, GCPtr gc,
 	int16_t dx, dy;
 	uint32_t br00;
 
-	uint8_t rop = bg == -1 ? copy_ROP[gc->alu] : ROP_S;
+	uint8_t rop = transparent ? copy_ROP[gc->alu] : ROP_S;
 
 	DBG(("%s (%d, %d) x %d, fg=%08x, bg=%08x alu=%02x\n",
 	     __FUNCTION__, _x, _y, _n, fg, bg, rop));
@@ -10206,7 +10207,7 @@ sna_glyph_blt(DrawablePtr drawable, GCPtr gc,
 	extents = REGION_RECTS(clip);
 	last_extents = extents + REGION_NUM_RECTS(clip);
 
-	if (bg != -1) /* emulate miImageGlyphBlt */
+	if (!transparent) /* emulate miImageGlyphBlt */
 		sna_blt_fill_boxes(sna, GXcopy,
 				   bo, drawable->bitsPerPixel,
 				   bg, extents, REGION_NUM_RECTS(clip));
@@ -10225,7 +10226,7 @@ sna_glyph_blt(DrawablePtr drawable, GCPtr gc,
 		b[0] |= BLT_DST_TILED;
 		b[1] >>= 2;
 	}
-	b[1] |= 1 << 30 | (bg == -1) << 29 | blt_depth(drawable->depth) << 24 | rop << 16;
+	b[1] |= 1 << 30 | transparent << 29 | blt_depth(drawable->depth) << 24 | rop << 16;
 	b[2] = extents->y1 << 16 | extents->x1;
 	b[3] = extents->y2 << 16 | extents->x2;
 	b[4] = kgem_add_reloc(&sna->kgem, sna->kgem.nbatch + 4, bo,
@@ -10256,6 +10257,9 @@ sna_glyph_blt(DrawablePtr drawable, GCPtr gc,
 			if (c->bits == (void *)1)
 				goto skip;
 
+			if (!transparent && c->bits == (void *)2)
+				goto skip;
+
 			len = (w8 * h + 7) >> 3 << 1;
 			DBG(("%s glyph: (%d, %d) x (%d[%d], %d), len=%d\n" ,__FUNCTION__,
 			     x,y, w, w8, h, len));
@@ -10268,6 +10272,7 @@ sna_glyph_blt(DrawablePtr drawable, GCPtr gc,
 			if (x1 + w <= extents->x1 || y1 + h <= extents->y1)
 				goto skip;
 
+
 			if (!kgem_check_batch(&sna->kgem, 3+len)) {
 				_kgem_submit(&sna->kgem);
 				_kgem_set_mode(&sna->kgem, KGEM_BLT);
@@ -10279,7 +10284,7 @@ sna_glyph_blt(DrawablePtr drawable, GCPtr gc,
 					b[0] |= BLT_DST_TILED;
 					b[1] >>= 2;
 				}
-				b[1] |= 1 << 30 | (bg == -1) << 29 | blt_depth(drawable->depth) << 24 | rop << 16;
+				b[1] |= 1 << 30 | transparent << 29 | blt_depth(drawable->depth) << 24 | rop << 16;
 				b[2] = extents->y1 << 16 | extents->x1;
 				b[3] = extents->y2 << 16 | extents->x2;
 				b[4] = kgem_add_reloc(&sna->kgem, sna->kgem.nbatch + 4, bo,
@@ -10299,7 +10304,9 @@ sna_glyph_blt(DrawablePtr drawable, GCPtr gc,
 			b[0] = br00 | (1 + len);
 			b[1] = (uint16_t)y1 << 16 | (uint16_t)x1;
 			b[2] = (uint16_t)(y1+h) << 16 | (uint16_t)(x1+w);
-			 {
+			if (c->bits == (void *)2) {
+				memset(b+3, 0, len*4);
+			} else {
 				uint64_t *src = (uint64_t *)c->bits;
 				uint64_t *dst = (uint64_t *)(b + 3);
 				do  {
@@ -10384,6 +10391,7 @@ static bool sna_set_glyph(CharInfoPtr in, CharInfoPtr out)
 	int h = GLYPHHEIGHTPIXELS(in);
 	int stride = GLYPHWIDTHBYTESPADDED(in);
 	uint8_t *dst, *src;
+	int clear = 1;
 
 	out->metrics = in->metrics;
 
@@ -10405,11 +10413,17 @@ static bool sna_set_glyph(CharInfoPtr in, CharInfoPtr out)
 	do {
 		int i = w;
 		do {
+			clear &= *src == 0;
 			*dst++ = byte_reverse(*src++);
 		} while (--i);
 		src += stride;
 	} while (--h);
 
+	if (clear) {
+		free(out->bits);
+		out->bits = (void *)2;
+	}
+
 	return true;
 }
 
@@ -10511,7 +10525,7 @@ sna_poly_text8(DrawablePtr drawable, GCPtr gc,
 	if (!gc_is_solid(gc, &fg))
 		goto force_fallback;
 
-	if (!sna_glyph_blt(drawable, gc, x, y, n, info, &region, fg, -1)) {
+	if (!sna_glyph_blt(drawable, gc, x, y, n, info, &region, fg, -1, true)) {
 force_fallback:
 		DBG(("%s: fallback\n", __FUNCTION__));
 		gc->font->get_glyphs(gc->font, count, (unsigned char *)chars,
@@ -10600,7 +10614,7 @@ sna_poly_text16(DrawablePtr drawable, GCPtr gc,
 	if (!gc_is_solid(gc, &fg))
 		goto force_fallback;
 
-	if (!sna_glyph_blt(drawable, gc, x, y, n, info, &region, fg, -1)) {
+	if (!sna_glyph_blt(drawable, gc, x, y, n, info, &region, fg, -1, true)) {
 force_fallback:
 		DBG(("%s: fallback\n", __FUNCTION__));
 		gc->font->get_glyphs(gc->font, count, (unsigned char *)chars,
@@ -10698,7 +10712,8 @@ sna_image_text8(DrawablePtr drawable, GCPtr gc,
 	if (!PM_IS_SOLID(drawable, gc->planemask))
 		goto force_fallback;
 
-	if (!sna_glyph_blt(drawable, gc, x, y, n, info, &region, gc->fgPixel, gc->bgPixel)) {
+	if (!sna_glyph_blt(drawable, gc, x, y, n, info, &region,
+			   gc->fgPixel, gc->bgPixel, false)) {
 force_fallback:
 		DBG(("%s: fallback\n", __FUNCTION__));
 		gc->font->get_glyphs(gc->font, count, (unsigned char *)chars,
@@ -10788,7 +10803,8 @@ sna_image_text16(DrawablePtr drawable, GCPtr gc,
 	if (!PM_IS_SOLID(drawable, gc->planemask))
 		goto force_fallback;
 
-	if (!sna_glyph_blt(drawable, gc, x, y, n, info, &region, gc->fgPixel, gc->bgPixel)) {
+	if (!sna_glyph_blt(drawable, gc, x, y, n, info, &region,
+			   gc->fgPixel, gc->bgPixel, false)) {
 force_fallback:
 		DBG(("%s: fallback\n", __FUNCTION__));
 		gc->font->get_glyphs(gc->font, count, (unsigned char *)chars,
@@ -10830,14 +10846,15 @@ sna_reversed_glyph_blt(DrawablePtr drawable, GCPtr gc,
 		       struct kgem_bo *bo,
 		       struct sna_damage **damage,
 		       RegionPtr clip,
-		       uint32_t fg, uint32_t bg)
+		       uint32_t fg, uint32_t bg,
+		       bool transparent)
 {
 	PixmapPtr pixmap = get_drawable_pixmap(drawable);
 	struct sna *sna = to_sna_from_pixmap(pixmap);
 	const BoxRec *extents, *last_extents;
 	uint32_t *b;
 	int16_t dx, dy;
-	uint8_t rop = bg == -1 ? copy_ROP[gc->alu] : ROP_S;
+	uint8_t rop = transparent ? copy_ROP[gc->alu] : ROP_S;
 
 	if (bo->tiling == I915_TILING_Y) {
 		DBG(("%s: converting bo from Y-tiling\n", __FUNCTION__));
@@ -10855,7 +10872,7 @@ sna_reversed_glyph_blt(DrawablePtr drawable, GCPtr gc,
 	extents = REGION_RECTS(clip);
 	last_extents = extents + REGION_NUM_RECTS(clip);
 
-	if (bg != -1) /* emulate miImageGlyphBlt */
+	if (!transparent) /* emulate miImageGlyphBlt */
 		sna_blt_fill_boxes(sna, GXcopy,
 				   bo, drawable->bitsPerPixel,
 				   bg, extents, REGION_NUM_RECTS(clip));
@@ -10874,7 +10891,7 @@ sna_reversed_glyph_blt(DrawablePtr drawable, GCPtr gc,
 		b[0] |= BLT_DST_TILED;
 		b[1] >>= 2;
 	}
-	b[1] |= 1 << 30 | (bg == -1) << 29 | blt_depth(drawable->depth) << 24 | rop << 16;
+	b[1] |= 1 << 30 | transparent << 29 | blt_depth(drawable->depth) << 24 | rop << 16;
 	b[2] = extents->y1 << 16 | extents->x1;
 	b[3] = extents->y2 << 16 | extents->x2;
 	b[4] = kgem_add_reloc(&sna->kgem, sna->kgem.nbatch + 4, bo,
@@ -10916,6 +10933,21 @@ sna_reversed_glyph_blt(DrawablePtr drawable, GCPtr gc,
 			if (x1 + w <= extents->x1 || y1 + h <= extents->y1)
 				goto skip;
 
+			if (!transparent) {
+				int clear = 1, j = h;
+				uint8_t *g = glyph;
+
+				do {
+					i = w8;
+					do {
+						clear = *g == 0;
+					} while (clear && --i);
+					g += stride - w8;
+				} while (clear && --j);
+				if (clear)
+					goto skip;
+			}
+
 			if (!kgem_check_batch(&sna->kgem, 3+len)) {
 				_kgem_submit(&sna->kgem);
 				_kgem_set_mode(&sna->kgem, KGEM_BLT);
@@ -10927,7 +10959,7 @@ sna_reversed_glyph_blt(DrawablePtr drawable, GCPtr gc,
 					b[0] |= BLT_DST_TILED;
 					b[1] >>= 2;
 				}
-				b[1] |= 1 << 30 | (bg == -1) << 29 | blt_depth(drawable->depth) << 24 | rop << 16;
+				b[1] |= 1 << 30 | transparent << 29 | blt_depth(drawable->depth) << 24 | rop << 16;
 				b[2] = extents->y1 << 16 | extents->x1;
 				b[3] = extents->y2 << 16 | extents->x2;
 				b[4] = kgem_add_reloc(&sna->kgem, sna->kgem.nbatch + 4,
@@ -11054,7 +11086,7 @@ sna_image_glyph(DrawablePtr drawable, GCPtr gc,
 	if ((bo = sna_drawable_use_bo(drawable, &region.extents, &damage)) &&
 	    sna_reversed_glyph_blt(drawable, gc, x, y, n, info, base,
 				   bo, damage, &region,
-				   gc->fgPixel, gc->bgPixel))
+				   gc->fgPixel, gc->bgPixel, false))
 		goto out;
 
 fallback:
@@ -11129,7 +11161,7 @@ sna_poly_glyph(DrawablePtr drawable, GCPtr gc,
 
 	if ((bo = sna_drawable_use_bo(drawable, &region.extents, &damage)) &&
 	    sna_reversed_glyph_blt(drawable, gc, x, y, n, info, base,
-				   bo, damage, &region, fg, -1))
+				   bo, damage, &region, fg, -1, true))
 		goto out;
 
 fallback:
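
After this commit the bits pointer of a cached glyph encodes three states:
a real (byte-reversed) bitmap, the pre-existing (void *)1 sentinel for a
glyph that cannot be rendered, and the new (void *)2 sentinel that
sna_set_glyph() installs once it notices every source byte is zero (freeing
the copied bits in the process). A hypothetical helper summarising the skip
rules from the hunks above:

static bool glyph_can_skip(CharInfoPtr c, bool transparent)
{
	if (c->bits == (void *)1)	/* unrenderable: always skip */
		return true;

	/* All-zero glyph: for image text the background fill has already
	 * painted the cell, so there is nothing left to blit; transparent
	 * text instead emits it as a zeroed payload (the memset above). */
	if (!transparent && c->bits == (void *)2)
		return true;

	return false;
}
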
commit 884ba6c0bfa70e301427d7f7e83d545930969931
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Mar 14 20:19:30 2012 +0000

    sna: Don't mark cached upload buffers for inactivity expiration
    
    As these do not follow the normal rules of damage tracking, we have to
    be careful not to force migration.
    
    References: https://bugs.freedesktop.org/show_bug.cgi?id=42426
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index ce13982..e957b1b 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -2073,7 +2073,8 @@ static inline struct sna_pixmap *
 sna_pixmap_mark_active(struct sna *sna, struct sna_pixmap *priv)
 {
 	assert(priv->gpu_bo);
-	if (!priv->pinned && (priv->create & KGEM_CAN_CREATE_LARGE) == 0)
+	if (!priv->pinned && priv->gpu_bo->proxy == NULL &&
+	    (priv->create & KGEM_CAN_CREATE_LARGE) == 0)
 		list_move(&priv->inactive, &sna->active_pixmaps);
 	priv->clear = false;
 	return priv;
commit c5b72861f15eebaddd0e5bb41ed17f51612b80f7
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Mar 14 19:56:42 2012 +0000

    sna: Add a couple of asserts for inactive_partial reference counting
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 72751f3..761218f 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -1257,6 +1257,7 @@ static void kgem_retire_partials(struct kgem *kgem)
 			kgem_bo_destroy(kgem, cached);
 		}
 
+		assert(bo->base.refcnt > 0);
 		if (bo->base.refcnt != 1)
 			continue;
 
diff --git a/src/sna/kgem.h b/src/sna/kgem.h
index 52e5e9c..7e5ffac 100644
--- a/src/sna/kgem.h
+++ b/src/sna/kgem.h
@@ -270,6 +270,7 @@ static inline void kgem_bo_flush(struct kgem *kgem, struct kgem_bo *bo)
 
 static inline struct kgem_bo *kgem_bo_reference(struct kgem_bo *bo)
 {
+	assert(bo->refcnt);
 	bo->refcnt++;
 	return bo;
 }
commit eccee243fd746079bbf6c78cdd2d55b0b3d3924a
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Mar 14 11:51:39 2012 +0000

    sna/traps: Use a more direct fallback path for the CPU
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_trapezoids.c b/src/sna/sna_trapezoids.c
index f05324f..f0f48e8 100644
--- a/src/sna/sna_trapezoids.c
+++ b/src/sna/sna_trapezoids.c
@@ -2423,44 +2423,63 @@ trapezoids_fallback(CARD8 op, PicturePtr src, PicturePtr dst,
 
 		DBG(("%s: mask (%dx%d) depth=%d, format=%08x\n",
 		     __FUNCTION__, width, height, depth, format));
-		scratch = sna_pixmap_create_upload(screen,
-						   width, height, 8,
-						   KGEM_BUFFER_WRITE);
-		if (!scratch)
-			return;
+		if (is_gpu(dst->pDrawable) || picture_is_gpu(src)) {
+			scratch = sna_pixmap_create_upload(screen,
+							   width, height, 8,
+							   KGEM_BUFFER_WRITE);
+			if (!scratch)
+				return;
 
-		if (depth < 8) {
-			image = pixman_image_create_bits(format, width, height,
-							 NULL, 0);
+			if (depth < 8) {
+				image = pixman_image_create_bits(format, width, height,
+								 NULL, 0);
+			} else {
+				memset(scratch->devPrivate.ptr, 0, scratch->devKind*height);
+				image = pixman_image_create_bits(format, width, height,
+								 scratch->devPrivate.ptr,
+								 scratch->devKind);
+			}
+			if (image) {
+				for (; ntrap; ntrap--, traps++)
+					pixman_rasterize_trapezoid(image,
+								   (pixman_trapezoid_t *)traps,
+								   -bounds.x1, -bounds.y1);
+				if (depth < 8) {
+					pixman_image_t *a8;
+
+					a8 = pixman_image_create_bits(PIXMAN_a8, width, height,
+								      scratch->devPrivate.ptr,
+								      scratch->devKind);
+					if (a8) {
+						pixman_image_composite(PIXMAN_OP_SRC,
+								       image, NULL, a8,
+								       0, 0,
+								       0, 0,
+								       0, 0,
+								       width, height);
+						pixman_image_unref (a8);
+					}
+				}
+
+				pixman_image_unref(image);
+			}
 		} else {
+			scratch = screen->CreatePixmap(screen,
+						       width, height, depth,
+						       CREATE_PIXMAP_USAGE_SCRATCH);
+			if (!scratch)
+				return;
+
 			memset(scratch->devPrivate.ptr, 0, scratch->devKind*height);
 			image = pixman_image_create_bits(format, width, height,
 							 scratch->devPrivate.ptr,
 							 scratch->devKind);
-		}
-		if (image) {
-			for (; ntrap; ntrap--, traps++)
-				pixman_rasterize_trapezoid(image,
-							   (pixman_trapezoid_t *)traps,
-							   -bounds.x1, -bounds.y1);
-			if (depth < 8) {
-				pixman_image_t *a8;
-
-				a8 = pixman_image_create_bits(PIXMAN_a8, width, height,
-							      scratch->devPrivate.ptr,
-							      scratch->devKind);
-				if (a8) {
-					pixman_image_composite(PIXMAN_OP_SRC,
-							       image, NULL, a8,
-							       0, 0,
-							       0, 0,
-							       0, 0,
-							       width, height);
-					pixman_image_unref (a8);
-				}
+			if (image) {
+				for (; ntrap; ntrap--, traps++)
+					pixman_rasterize_trapezoid(image,
+								   (pixman_trapezoid_t *)traps,
+								   -bounds.x1, -bounds.y1);
 			}
-
-			pixman_image_unref(image);
 		}
 
 		mask = CreatePicture(0, &scratch->drawable,
commit 4943428b235560af64a5a17f80f87e9f6d729723
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Mar 14 11:46:18 2012 +0000

    sna/traps: Rasterise using pixman inplace where appropriate
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_trapezoids.c b/src/sna/sna_trapezoids.c
index 933f580..f05324f 100644
--- a/src/sna/sna_trapezoids.c
+++ b/src/sna/sna_trapezoids.c
@@ -2302,6 +2302,79 @@ trapezoids_bounds(int n, const xTrapezoid *t, BoxPtr box)
 	box->y2 = pixman_fixed_integer_ceil(y2);
 }
 
+static bool
+is_mono(PicturePtr dst, PictFormatPtr mask)
+{
+	return mask ? mask->depth < 8 : dst->polyEdge==PolyEdgeSharp;
+}
+
+static bool
+trapezoids_inplace_fallback(CARD8 op,
+			    PicturePtr src, PicturePtr dst, PictFormatPtr mask,
+			    int ntrap, xTrapezoid *traps)
+{
+	pixman_image_t *image;
+	BoxRec box;
+	uint32_t color;
+	int dx, dy;
+
+	if (op != PictOpAdd)
+		return false;
+
+	if (is_mono(dst, mask)) {
+		if (dst->format != PICT_a1)
+			return false;
+	} else {
+		if (dst->format != PICT_a8)
+			return false;
+	}
+
+	if (!sna_picture_is_solid(src, &color) || (color >> 24) != 0xff) {
+		DBG(("%s: not an opaque solid source\n", __FUNCTION__));
+		return false;
+	}
+
+	box.x1 = dst->pDrawable->x;
+	box.y1 = dst->pDrawable->y;
+	box.x2 = dst->pDrawable->width;
+	box.y2 = dst->pDrawable->height;
+	if (pixman_region_contains_rectangle(dst->pCompositeClip,
+					     &box) != PIXMAN_REGION_IN) {
+		DBG(("%s: requires clipping, drawable (%d,%d), (%d, %d), clip (%d, %d), (%d, %d)\n", __FUNCTION__,
+		     box.x1, box.y1, box.x2, box.y2,
+		     dst->pCompositeClip->extents.x1,
+		     dst->pCompositeClip->extents.y1,
+		     dst->pCompositeClip->extents.x2,
+		     dst->pCompositeClip->extents.y2));
+		return false;
+	}
+
+	if (is_gpu(dst->pDrawable)) {
+		DBG(("%s: not performing inplace as dst is already on the GPU\n",
+		     __FUNCTION__));
+		return false;
+	}
+
+	DBG(("%s\n", __FUNCTION__));
+
+	image = NULL;
+	if (sna_drawable_move_to_cpu(dst->pDrawable, MOVE_READ | MOVE_WRITE))
+		image = image_from_pict(dst, FALSE, &dx, &dy);
+	if (image) {
+		dx += dst->pDrawable->x;
+		dy += dst->pDrawable->y;
+
+		for (; ntrap; ntrap--, traps++)
+			pixman_rasterize_trapezoid(image,
+						   (pixman_trapezoid_t *)traps,
+						   dx, dy);
+
+		pixman_image_unref(image);
+	}
+
+	return true;
+}
+
 static void
 trapezoids_fallback(CARD8 op, PicturePtr src, PicturePtr dst,
 		    PictFormatPtr maskFormat, INT16 xSrc, INT16 ySrc,
@@ -2998,12 +3071,6 @@ project_trapezoid_onto_grid(const xTrapezoid *in,
 	return xTrapezoidValid(out);
 }
 
-static bool
-is_mono(PicturePtr dst, PictFormatPtr mask)
-{
-	return mask ? mask->depth < 8 : dst->polyEdge==PolyEdgeSharp;
-}
-
 static span_func_t
 choose_span(PicturePtr dst,
 	    PictFormatPtr maskFormat,
@@ -4448,6 +4515,9 @@ fallback:
 				    xSrc, ySrc, ntrap, traps))
 		return;
 
+	if (trapezoids_inplace_fallback(op, src, dst, maskFormat, ntrap, traps))
+		return;
+
 	DBG(("%s: fallback mask=%08x, ntrap=%d\n", __FUNCTION__,
 	     maskFormat ? (unsigned)maskFormat->format : 0, ntrap));
 	trapezoids_fallback(op, src, dst, maskFormat,
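
Summarising the gate: the new path runs only for PictOpAdd of an opaque
solid source onto an a8 destination (a1 for sharp mono edges), with the
whole drawable inside the composite clip and the destination not already
resident on the GPU. Under those constraints pixman_rasterize_trapezoid()
can accumulate coverage directly into the shadow pixels, skipping the
allocate-mask-and-composite round trip of trapezoids_fallback().
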
commit 87cbfa20db7b0a10622e1d92ac3e4de73a55f698
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Mar 14 11:26:03 2012 +0000

    sna: Remove existing damage before overwriting with a composite op
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_composite.c b/src/sna/sna_composite.c
index a610e7c..b098fcc 100644
--- a/src/sna/sna_composite.c
+++ b/src/sna/sna_composite.c
@@ -479,6 +479,11 @@ sna_composite(CARD8 op,
 	     get_drawable_dx(dst->pDrawable),
 	     get_drawable_dy(dst->pDrawable)));
 
+	if (op <= PictOpSrc) {
+		struct sna_pixmap *priv = sna_pixmap_from_drawable(dst->pDrawable);
+		sna_damage_subtract(&priv->cpu_damage, &region);
+	}
+
 	memset(&tmp, 0, sizeof(tmp));
 	if (!sna->render.composite(sna,
 				   op, src, mask, dst,
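
The rationale: PictOpClear and PictOpSrc (the only operators ordered at or
below PictOpSrc) replace the destination outright within the computed
region, so any CPU damage inside it would be overwritten anyway;
subtracting it up front avoids migrating soon-to-be-discarded pixels to the
GPU.
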
commit 8d272c6ff08a531d1022dfda69db5fdea86513d1
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Mar 14 10:30:47 2012 +0000

    sna/gen3: Look harder to see if we can indeed use the BLT for composite
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/gen3_render.c b/src/sna/gen3_render.c
index d3ed2ef..c567d6b 100644
--- a/src/sna/gen3_render.c
+++ b/src/sna/gen3_render.c
@@ -2744,15 +2744,13 @@ gen3_render_composite(struct sna *sna,
 					    width,  height,
 					    tmp);
 
-	memset(&tmp->u.gen3, 0, sizeof(tmp->u.gen3));
-
 	if (!gen3_composite_set_target(sna, tmp, dst)) {
 		DBG(("%s: unable to set render target\n",
 		     __FUNCTION__));
 		return FALSE;
 	}
 
-	if (mask == NULL && sna->kgem.mode == KGEM_BLT  &&
+	if (mask == NULL && sna->kgem.mode != KGEM_RENDER &&
 	    sna_blt_composite(sna, op,
 			      src, dst,
 			      src_x, src_y,
commit df7b9cb7ee99687b93a4357c48a5facd41060dc7
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Mar 13 22:00:25 2012 +0000

    sna: Reuse the cached upload as a source GPU bo
    
    References: https://bugs.freedesktop.org/show_bug.cgi?id=42426
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index b0b55a4..ce13982 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -2236,6 +2236,11 @@ sna_pixmap_move_to_gpu(PixmapPtr pixmap, unsigned flags)
 	if (priv->cpu_damage == NULL)
 		goto done;
 
+	if (priv->gpu_bo->proxy) {
+		assert((flags & MOVE_WRITE) ==0);
+		goto done;
+	}
+
 	if (priv->mapped) {
 		assert(priv->stride);
 		pixmap->devPrivate.ptr = priv->ptr;
commit 144149c684b72fc7ed238e849d968e7afde4276c
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Mar 13 21:29:57 2012 +0000

    sna: Defer the release of the upload buffer cache till retirement
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index cee4513..72751f3 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -1238,7 +1238,26 @@ static void kgem_retire_partials(struct kgem *kgem)
 		assert(next->base.list.prev == &bo->base.list);
 		assert(bo->base.io);
 
-		if (bo->base.refcnt != 1 || bo->base.rq)
+		if (bo->base.rq)
+			continue;
+
+		DBG(("%s: releasing upload cache for handle=%d? %d\n",
+		     __FUNCTION__, bo->base.handle, !list_is_empty(&bo->base.vma)));
+		while (!list_is_empty(&bo->base.vma)) {
+			struct kgem_bo *cached;
+
+			cached = list_first_entry(&bo->base.vma, struct kgem_bo, vma);
+			assert(cached->proxy == &bo->base);
+			list_del(&cached->vma);
+
+			assert(*(struct kgem_bo **)cached->map == cached);
+			*(struct kgem_bo **)cached->map = NULL;
+			cached->map = NULL;
+
+			kgem_bo_destroy(kgem, cached);
+		}
+
+		if (bo->base.refcnt != 1)
 			continue;
 
 		DBG(("%s: handle=%d, used %d/%d\n", __FUNCTION__,
@@ -1256,6 +1275,8 @@ static void kgem_retire_partials(struct kgem *kgem)
 		bo->base.needs_flush = false;
 		bo->used = 0;
 
+		DBG(("%s: transferring partial handle=%d to inactive\n",
+		     __FUNCTION__, bo->base.handle));
 		list_move_tail(&bo->base.list, &kgem->inactive_partials);
 		bubble_sort_partial(&kgem->inactive_partials, bo);
 	}
@@ -1381,9 +1402,7 @@ static void kgem_commit(struct kgem *kgem)
 {
 	struct kgem_request *rq = kgem->next_request;
 	struct kgem_bo *bo, *next;
-	struct list release;
 
-	list_init(&release);
 	list_for_each_entry_safe(bo, next, &rq->buffers, request) {
 		assert(next->request.prev == &bo->request);
 
@@ -1392,7 +1411,7 @@ static void kgem_commit(struct kgem *kgem)
 		     bo->dirty, bo->needs_flush, (unsigned)bo->exec->offset));
 
 		assert(!bo->purged);
-		assert(bo->proxy || bo->rq == rq);
+		assert(bo->rq == rq || (bo->proxy->rq == rq));
 
 		bo->presumed_offset = bo->exec->offset;
 		bo->exec = NULL;
@@ -1411,20 +1430,8 @@ static void kgem_commit(struct kgem *kgem)
 			list_del(&bo->request);
 			bo->rq = NULL;
 			bo->exec = &_kgem_dummy_exec;
-			if (bo->map)
-				list_add_tail(&bo->request, &release);
 		}
 	}
-	while (!list_is_empty(&release)) {
-		bo = list_first_entry(&release, struct kgem_bo, request);
-		DBG(("%s: releasing upload cache, handle=%d\n",
-		     __FUNCTION__, bo->handle));
-		list_del(&bo->request);
-		assert(*(struct kgem_bo **)bo->map == bo);
-		*(struct kgem_bo **)bo->map = NULL;
-		bo->map = NULL;
-		kgem_bo_destroy(kgem, bo);
-	}
 
 	if (rq == &_kgem_static_request) {
 		struct drm_i915_gem_set_domain set_domain;
@@ -2869,11 +2876,12 @@ void _kgem_bo_destroy(struct kgem *kgem, struct kgem_bo *bo)
 	     __FUNCTION__, bo->handle, bo->proxy != NULL));
 
 	if (bo->proxy) {
+		_list_del(&bo->vma);
+		_list_del(&bo->request);
 		if (bo->io && (bo->exec == NULL || bo->proxy->rq == NULL))
 			_kgem_bo_delete_partial(kgem, bo);
 		kgem_bo_unref(kgem, bo->proxy);
 		kgem_bo_binding_free(kgem, bo);
-		_list_del(&bo->request);
 		free(bo);
 		return;
 	}
@@ -3468,6 +3476,11 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 			}
 		}
 
+		if (bo->used && bo->base.rq == NULL && bo->base.refcnt == 1) {
+			bo->used = 0;
+			bubble_sort_partial(&kgem->active_partials, bo);
+		}
+
 		if (bo->used + size <= bytes(&bo->base)) {
 			DBG(("%s: reusing partial buffer? used=%d + size=%d, total=%d\n",
 			     __FUNCTION__, bo->used, size, bytes(&bo->base)));
@@ -3895,6 +3908,8 @@ void kgem_proxy_bo_attach(struct kgem_bo *bo,
 {
 	DBG(("%s: handle=%d\n", __FUNCTION__, bo->handle));
 	assert(bo->map == NULL);
+	assert(bo->proxy);
+	list_add(&bo->vma, &bo->proxy->vma);
 	bo->map = ptr;
 	*ptr = kgem_bo_reference(bo);
 }
diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index c9016a1..b0b55a4 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -11822,6 +11822,7 @@ static void sna_accel_inactive(struct sna *sna)
 					inactive);
 		assert((priv->create & KGEM_CAN_CREATE_LARGE) == 0);
 		assert(priv->gpu_bo);
+		assert(!priv->gpu_bo->proxy);
 
 		/* XXX Rather than discarding the GPU buffer here, we
 		 * could mark it purgeable and allow the shrinker to
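
The shape of the change: proxies handed out via kgem_proxy_bo_attach() now
link themselves onto their parent's vma list, and rather than kgem_commit()
releasing the upload cache as soon as the batch is submitted,
kgem_retire_partials() walks that list once the backing buffer's request
has actually retired, unhooking each cached proxy and clearing its map
pointer. That keeps a cached upload usable as a GPU source until the
hardware is provably done with it, which the proxy-reuse commits elsewhere
in this series depend on.
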
commit e9f1f935f27fcdfb698c9537a23c310e2a7f23bb
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Mar 13 20:01:53 2012 +0000

    sna: Destroy the cached upload buffer before copying into it
    
    As we discard the buffer after the next batch, we will lose the contents
    of the pixmap. Instead discard the cache, and treat it as a normal
    shadow again.
    
    References: https://bugs.freedesktop.org/show_bug.cgi?id=42426
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index d5420a3..c9016a1 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -3402,7 +3402,10 @@ sna_copy_boxes(DrawablePtr src, DrawablePtr dst, GCPtr gc,
 		dst_priv->clear = false;
 	}
 
-	assert(dst_priv->gpu_bo == NULL || dst_priv->gpu_bo->proxy == NULL);
+	if (dst_priv->gpu_bo && dst_priv->gpu_bo->proxy) {
+		kgem_bo_destroy(&sna->kgem, dst_priv->gpu_bo);
+		dst_priv->gpu_bo = NULL;
+	}
 
 	/* Try to maintain the data on the GPU */
 	if (dst_priv->gpu_bo == NULL &&
commit 393fdf784681eda022a4bed0e497d6b732d45b59
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Mar 13 16:29:41 2012 +0000

    sna: Refactor source upload-to-gpu
    
    This was originally split into separate functions for a flexibility
    that nothing used, so merge the duplicated code into a common function.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_render.c b/src/sna/sna_render.c
index b5d314b..8be3e72 100644
--- a/src/sna/sna_render.c
+++ b/src/sna/sna_render.c
@@ -354,122 +354,88 @@ use_cpu_bo(struct sna *sna, PixmapPtr pixmap, const BoxRec *box)
 	return priv->cpu_bo;
 }
 
-static Bool
+static struct kgem_bo *
 move_to_gpu(PixmapPtr pixmap, const BoxRec *box)
 {
 	struct sna_pixmap *priv;
 	int count, w, h;
+	bool migrate = false;
 
 	if (DBG_FORCE_UPLOAD > 0)
-		return FALSE;
+		return NULL;
+
+	priv = sna_pixmap(pixmap);
+	if (priv == NULL) {
+		DBG(("%s: not migrating unattached pixmap\n",
+		     __FUNCTION__));
+		return NULL;
+	}
+
+	if (priv->gpu_bo) {
+		if (priv->cpu_damage &&
+		    sna_damage_contains_box(priv->cpu_damage,
+					    box) != PIXMAN_REGION_OUT) {
+			if (!sna_pixmap_move_to_gpu(pixmap, MOVE_READ))
+				return NULL;
+		}
+
+		return priv->gpu_bo;
+	}
 
 	if (pixmap->usage_hint) {
 		DBG(("%s: not migrating pixmap due to usage_hint=%d\n",
 		     __FUNCTION__, pixmap->usage_hint));
-		return FALSE;
+		return NULL;
 	}
 
 	if (DBG_FORCE_UPLOAD < 0)
-		return TRUE;
+		migrate = true;
 
 	w = box->x2 - box->x1;
 	h = box->y2 - box->y1;
 	if (w == pixmap->drawable.width && h == pixmap->drawable.height) {
-		bool upload;
-
-		priv = sna_pixmap(pixmap);
-		if (!priv) {
-			DBG(("%s: not migrating unattached pixmap\n",
-			     __FUNCTION__));
-			return false;
-		}
-
-		upload = true;
+		migrate = true;
 		if ((priv->create & KGEM_CAN_CREATE_GPU) == 0 ||
 		    kgem_choose_tiling(&to_sna_from_pixmap(pixmap)->kgem,
 				       I915_TILING_X,
 				       pixmap->drawable.width,
 				       pixmap->drawable.height,
 				       pixmap->drawable.bitsPerPixel) == I915_TILING_NONE)
-			upload = priv->source_count++ > SOURCE_BIAS;
+			migrate = priv->source_count++ > SOURCE_BIAS;
 
 		DBG(("%s: migrating whole pixmap (%dx%d) for source (%d,%d),(%d,%d), count %d? %d\n",
 		     __FUNCTION__,
 		     pixmap->drawable.width, pixmap->drawable.height,
 		     box->x1, box->y1, box->x2, box->y2, priv->source_count,
-		     upload));
-		return upload;
+		     migrate));
+	} else {
+		/* ignore tiny fractions */
+		if (64*w*h > pixmap->drawable.width * pixmap->drawable.height) {
+			count = priv->source_count++;
+			if ((priv->create & KGEM_CAN_CREATE_GPU) == 0 ||
+			    kgem_choose_tiling(&to_sna_from_pixmap(pixmap)->kgem,
+					       I915_TILING_X,
+					       pixmap->drawable.width,
+					       pixmap->drawable.height,
+					       pixmap->drawable.bitsPerPixel) == I915_TILING_NONE)
+				count -= SOURCE_BIAS;
+
+			DBG(("%s: migrate box (%d, %d), (%d, %d)? source count=%d, fraction=%d/%d [%d]\n",
+			     __FUNCTION__,
+			     box->x1, box->y1, box->x2, box->y2,
+			     count, w*h,
+			     pixmap->drawable.width * pixmap->drawable.height,
+			     pixmap->drawable.width * pixmap->drawable.height / (w*h)));
+
+			migrate =  count*w*h > pixmap->drawable.width * pixmap->drawable.height;
+		}
 	}
 
-	/* ignore tiny fractions */
-	if (64*w*h < pixmap->drawable.width * pixmap->drawable.height)
-		return FALSE;
-
-	priv = sna_pixmap(pixmap);
-	if (!priv)
-		return FALSE;
-
-	count = priv->source_count++;
-	if ((priv->create & KGEM_CAN_CREATE_GPU) == 0 ||
-	    kgem_choose_tiling(&to_sna_from_pixmap(pixmap)->kgem,
-			       I915_TILING_X,
-			       pixmap->drawable.width,
-			       pixmap->drawable.height,
-			       pixmap->drawable.bitsPerPixel) == I915_TILING_NONE)
-		count -= SOURCE_BIAS;
-
-	DBG(("%s: migrate box (%d, %d), (%d, %d)? source count=%d, fraction=%d/%d [%d]\n",
-	     __FUNCTION__,
-	     box->x1, box->y1, box->x2, box->y2,
-	     count, w*h,
-	     pixmap->drawable.width * pixmap->drawable.height,
-	     pixmap->drawable.width * pixmap->drawable.height / (w*h)));
-
-	return count*w*h > pixmap->drawable.width * pixmap->drawable.height;
-}
-
-static Bool
-_texture_is_cpu(PixmapPtr pixmap, const BoxRec *box)
-{
-	struct sna_pixmap *priv = sna_pixmap(pixmap);
-
-	if (priv == NULL)
-		return TRUE;
-
-	if (priv->gpu_bo == NULL)
-		return TRUE;
-
-	if (!priv->cpu_damage)
-		return FALSE;
-
-	if (DAMAGE_IS_ALL(priv->cpu_damage))
-		return TRUE;
-
-	if (sna_damage_contains_box__no_reduce(priv->cpu_damage, box))
-		return TRUE;
-
-	if (sna_damage_contains_box(priv->gpu_damage, box) != PIXMAN_REGION_OUT)
-		return FALSE;
-
-	return sna_damage_contains_box(priv->cpu_damage, box) != PIXMAN_REGION_OUT;
-}
+	if (migrate && !sna_pixmap_force_to_gpu(pixmap, MOVE_READ))
+		return NULL;
 
-#if DEBUG_RENDER
-static Bool
-texture_is_cpu(PixmapPtr pixmap, const BoxRec *box)
-{
-	Bool ret = _texture_is_cpu(pixmap, box);
-	ErrorF("%s(pixmap=%p, box=((%d, %d), (%d, %d)) = %d\n",
-	       __FUNCTION__, pixmap, box->x1, box->y1, box->x2, box->y2, ret);
-	return ret;
-}
-#else
-static Bool
-texture_is_cpu(PixmapPtr pixmap, const BoxRec *box)
-{
-	return _texture_is_cpu(pixmap, box);
+	return priv->gpu_bo;
 }
-#endif
 
 static struct kgem_bo *upload(struct sna *sna,
 			      struct sna_composite_channel *channel,
@@ -600,16 +566,13 @@ sna_render_pixmap_bo(struct sna *sna,
 	if (bo) {
 		bo = kgem_bo_reference(bo);
 	} else {
-		if (!texture_is_cpu(pixmap, &box) || move_to_gpu(pixmap, &box)) {
-			priv = sna_pixmap_force_to_gpu(pixmap, MOVE_READ);
-			if (priv)
-				bo = kgem_bo_reference(priv->gpu_bo);
-		}
+		bo = move_to_gpu(pixmap, &box);
 		if (bo == NULL) {
 			DBG(("%s: uploading CPU box (%d, %d), (%d, %d)\n",
 			     __FUNCTION__, box.x1, box.y1, box.x2, box.y2));
 			bo = upload(sna, channel, pixmap, &box);
-		}
+		} else
+			bo = kgem_bo_reference(bo);
 	}
 
 	channel->bo = bo;
@@ -1146,18 +1109,8 @@ sna_render_picture_extract(struct sna *sna,
 		if (!sna_pixmap_move_to_cpu(pixmap, MOVE_READ))
 			return 0;
 	} else {
-		bool upload = true;
-		if (!texture_is_cpu(pixmap, &box) ||
-		    move_to_gpu(pixmap, &box)) {
-			struct sna_pixmap *priv;
-
-			priv = sna_pixmap_force_to_gpu(pixmap, MOVE_READ);
-			if (priv) {
-				src_bo = priv->gpu_bo;
-				upload = false;
-			}
-		}
-		if (upload) {
+		src_bo = move_to_gpu(pixmap, &box);
+		if (src_bo == NULL) {
 			bo = kgem_upload_source_image(&sna->kgem,
 						      pixmap->devPrivate.ptr,
 						      &box,
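
Net effect of the refactor: move_to_gpu() now returns the GPU bo itself,
with NULL meaning "upload instead", rather than a Bool recommendation. It
absorbs both the old texture_is_cpu() test and the sna_pixmap_force_to_gpu()
call, so both callers collapse to a single NULL check followed by either a
reference or an upload.
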
commit 40c8dbe8a7e41a81b174ccd6480019eb658e70a2
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Mar 13 15:47:27 2012 +0000

    sna/gen6: Remove the double application of the render offset
    
    Cut'n'paste error from an older generation.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/gen6_render.c b/src/sna/gen6_render.c
index 8a8cdd8..fde0776 100644
--- a/src/sna/gen6_render.c
+++ b/src/sna/gen6_render.c
@@ -1466,26 +1466,17 @@ gen6_emit_composite_primitive(struct sna *sna,
 			      const struct sna_composite_rectangles *r)
 {
 	gen6_emit_composite_vertex(sna, op,
-				   r->src.x + r->width,
-				   r->src.y + r->height,
-				   r->mask.x + r->width,
-				   r->mask.y + r->height,
-				   op->dst.x + r->dst.x + r->width,
-				   op->dst.y + r->dst.y + r->height);
+				   r->src.x + r->width,  r->src.y + r->height,
+				   r->mask.x + r->width, r->mask.y + r->height,
+				   r->dst.x + r->width, r->dst.y + r->height);
 	gen6_emit_composite_vertex(sna, op,
-				   r->src.x,
-				   r->src.y + r->height,
-				   r->mask.x,
-				   r->mask.y + r->height,
-				   op->dst.x + r->dst.x,
-				   op->dst.y + r->dst.y + r->height);
+				   r->src.x,  r->src.y + r->height,
+				   r->mask.x, r->mask.y + r->height,
+				   r->dst.x,  r->dst.y + r->height);
 	gen6_emit_composite_vertex(sna, op,
-				   r->src.x,
-				   r->src.y,
-				   r->mask.x,
-				   r->mask.y,
-				   op->dst.x + r->dst.x,
-				   op->dst.y + r->dst.y);
+				   r->src.x,  r->src.y,
+				   r->mask.x, r->mask.y,
+				   r->dst.x,  r->dst.y);
 }
 
 static void gen6_emit_vertex_buffer(struct sna *sna,
commit 82b13f29b01f99e9aaade2d239ba2333b28ea230
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Mar 13 12:04:04 2012 +0000

    sna: Only use the cpu bo for xfer between CPU and GPU if either is busy
    
    The synchronisation costs overwhelm any benefit from offloading the
    copy, unless we are currently streaming the updates anyway.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index dc330ca..d5420a3 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -857,6 +857,14 @@ sna_pixmap_create_mappable_gpu(PixmapPtr pixmap)
 	return priv->gpu_bo && kgem_bo_is_mappable(&sna->kgem, priv->gpu_bo);
 }
 
+static bool use_cpu_bo_for_xfer(struct sna_pixmap *priv)
+{
+	if (priv->cpu_bo == NULL)
+		return FALSE;
+
+	return kgem_bo_is_busy(priv->gpu_bo) || kgem_bo_is_busy(priv->cpu_bo);
+}
+
 bool
 _sna_pixmap_move_to_cpu(PixmapPtr pixmap, unsigned int flags)
 {
@@ -1028,16 +1036,12 @@ skip_inplace_map:
 
 		n = sna_damage_get_boxes(priv->gpu_damage, &box);
 		if (n) {
-			struct kgem_bo *dst_bo;
 			Bool ok = FALSE;
 
-			dst_bo = NULL;
-			if (sna->kgem.gen >= 30)
-				dst_bo = priv->cpu_bo;
-			if (dst_bo)
+			if (sna->kgem.gen >= 30 && use_cpu_bo_for_xfer(priv))
 				ok = sna->render.copy_boxes(sna, GXcopy,
 							    pixmap, priv->gpu_bo, 0, 0,
-							    pixmap, dst_bo, 0, 0,
+							    pixmap, priv->cpu_bo, 0, 0,
 							    box, n);
 			if (!ok)
 				sna_read_boxes(sna,
@@ -1421,7 +1425,7 @@ sna_drawable_move_region_to_cpu(DrawablePtr drawable,
 			assert(pixmap_contains_damage(pixmap, priv->gpu_damage));
 
 			ok = FALSE;
-			if (priv->cpu_bo && sna->kgem.gen >= 30)
+			if (sna->kgem.gen >= 30 && use_cpu_bo_for_xfer(priv))
 				ok = sna->render.copy_boxes(sna, GXcopy,
 							    pixmap, priv->gpu_bo, 0, 0,
 							    pixmap, priv->cpu_bo, 0, 0,
@@ -1496,7 +1500,7 @@ sna_drawable_move_region_to_cpu(DrawablePtr drawable,
 				if (n) {
 					Bool ok = FALSE;
 
-					if (priv->cpu_bo && sna->kgem.gen >= 30)
+					if (sna->kgem.gen >= 30 && use_cpu_bo_for_xfer(priv))
 						ok = sna->render.copy_boxes(sna, GXcopy,
 									    pixmap, priv->gpu_bo, 0, 0,
 									    pixmap, priv->cpu_bo, 0, 0,
@@ -1516,10 +1520,9 @@ sna_drawable_move_region_to_cpu(DrawablePtr drawable,
 								      &r->extents)) {
 				BoxPtr box = REGION_RECTS(r);
 				int n = REGION_NUM_RECTS(r);
-				Bool ok;
+				Bool ok = FALSE;
 
-				ok = FALSE;
-				if (priv->cpu_bo && sna->kgem.gen >= 30)
+				if (sna->kgem.gen >= 30 && use_cpu_bo_for_xfer(priv))
 					ok = sna->render.copy_boxes(sna, GXcopy,
 								    pixmap, priv->gpu_bo, 0, 0,
 								    pixmap, priv->cpu_bo, 0, 0,
@@ -1539,10 +1542,9 @@ sna_drawable_move_region_to_cpu(DrawablePtr drawable,
 				if (sna_damage_intersect(priv->gpu_damage, r, &need)) {
 					BoxPtr box = REGION_RECTS(&need);
 					int n = REGION_NUM_RECTS(&need);
-					Bool ok;
+					Bool ok = FALSE;
 
-					ok = FALSE;
-					if (priv->cpu_bo && sna->kgem.gen >= 30)
+					if (sna->kgem.gen >= 30 && use_cpu_bo_for_xfer(priv))
 						ok = sna->render.copy_boxes(sna, GXcopy,
 									    pixmap, priv->gpu_bo, 0, 0,
 									    pixmap, priv->cpu_bo, 0, 0,
@@ -1732,10 +1734,9 @@ sna_pixmap_move_area_to_gpu(PixmapPtr pixmap, BoxPtr box, unsigned int flags)
 
 		n = sna_damage_get_boxes(priv->cpu_damage, &box);
 		if (n) {
-			Bool ok;
+			Bool ok = FALSE;
 
-			ok = FALSE;
-			if (priv->cpu_bo)
+			if (use_cpu_bo_for_xfer(priv))
 				ok = sna->render.copy_boxes(sna, GXcopy,
 							    pixmap, priv->cpu_bo, 0, 0,
 							    pixmap, priv->gpu_bo, 0, 0,
@@ -1768,7 +1769,7 @@ sna_pixmap_move_area_to_gpu(PixmapPtr pixmap, BoxPtr box, unsigned int flags)
 	} else if (DAMAGE_IS_ALL(priv->cpu_damage) ||
 		   sna_damage_contains_box__no_reduce(priv->cpu_damage, box)) {
 		Bool ok = FALSE;
-		if (priv->cpu_bo)
+		if (use_cpu_bo_for_xfer(priv))
 			ok = sna->render.copy_boxes(sna, GXcopy,
 						    pixmap, priv->cpu_bo, 0, 0,
 						    pixmap, priv->gpu_bo, 0, 0,
@@ -1791,7 +1792,7 @@ sna_pixmap_move_area_to_gpu(PixmapPtr pixmap, BoxPtr box, unsigned int flags)
 
 		box = REGION_RECTS(&i);
 		ok = FALSE;
-		if (priv->cpu_bo)
+		if (use_cpu_bo_for_xfer(priv))
 			ok = sna->render.copy_boxes(sna, GXcopy,
 						    pixmap, priv->cpu_bo, 0, 0,
 						    pixmap, priv->gpu_bo, 0, 0,
@@ -2250,7 +2251,7 @@ sna_pixmap_move_to_gpu(PixmapPtr pixmap, unsigned flags)
 		DBG(("%s: uploading %d damage boxes\n", __FUNCTION__, n));
 
 		ok = FALSE;
-		if (priv->cpu_bo)
+		if (use_cpu_bo_for_xfer(priv))
 			ok = sna->render.copy_boxes(sna, GXcopy,
 						    pixmap, priv->cpu_bo, 0, 0,
 						    pixmap, priv->gpu_bo, 0, 0,
commit f540ba0c840abd990adcf8fbe17617a8580288ba
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon Mar 12 23:28:51 2012 +0000

    sna: Reduce OVER with a clear pixmap to a BLT
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
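
In Porter-Duff terms OVER computes dst' = src + (1 - alpha_src) * dst; when
every destination pixel is known to be zero (a pending clear with
clear_color == 0), the second term vanishes and OVER, like ADD, collapses to
SRC. A minimal illustrative sketch of the reduction (the helper name is
hypothetical; the real logic is inlined in sna_blt_composite below):

	/* Sketch: reduce composite ops against a known-clear (all-zero) dst. */
	static CARD8 reduce_op_for_clear_dst(CARD8 op, Bool was_clear)
	{
		if (!was_clear)
			return op;

		switch (op) {
		case PictOpOver:	/* src + (1 - a_src) * 0 == src */
		case PictOpAdd:		/* src + 0 == src */
			return PictOpSrc;
		default:
			return op;
		}
	}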

diff --git a/src/sna/sna.h b/src/sna/sna.h
index 441b24e..6ec4748 100644
--- a/src/sna/sna.h
+++ b/src/sna/sna.h
@@ -460,6 +460,13 @@ sna_drawable_move_to_gpu(DrawablePtr drawable, unsigned flags)
 	return sna_pixmap_move_to_gpu(get_drawable_pixmap(drawable), flags) != NULL;
 }
 
+static inline bool
+sna_drawable_is_clear(DrawablePtr d)
+{
+	struct sna_pixmap *priv = sna_pixmap(get_drawable_pixmap(d));
+	return priv && priv->clear && priv->clear_color == 0;
+}
+
 static inline struct kgem_bo *sna_pixmap_get_bo(PixmapPtr pixmap)
 {
 	return sna_pixmap(pixmap)->gpu_bo;
diff --git a/src/sna/sna_blt.c b/src/sna/sna_blt.c
index eb8dbf8..e7a6182 100644
--- a/src/sna/sna_blt.c
+++ b/src/sna/sna_blt.c
@@ -1556,6 +1556,7 @@ sna_blt_composite(struct sna *sna,
 	struct sna_pixmap *priv;
 	int16_t tx, ty;
 	uint32_t alpha_fixup;
+	bool was_clear;
 	Bool ret;
 
 #if DEBUG_NO_BLT || NO_BLT_COMPOSITE
@@ -1576,6 +1577,7 @@ sna_blt_composite(struct sna *sna,
 		return FALSE;
 	}
 
+	was_clear = sna_drawable_is_clear(dst->pDrawable);
 	tmp->dst.pixmap = get_drawable_pixmap(dst->pDrawable);
 	priv = sna_pixmap_move_to_gpu(tmp->dst.pixmap, MOVE_WRITE | MOVE_READ);
 	if (priv == NULL) {
@@ -1606,16 +1608,22 @@ sna_blt_composite(struct sna *sna,
 		_kgem_set_mode(&sna->kgem, KGEM_BLT);
 	}
 
-	if (op == PictOpClear)
+	if (op == PictOpClear) {
+clear:
+		if (was_clear)
+			return TRUE;
 		return prepare_blt_clear(sna, tmp);
+	}
 
 	if (is_solid(src)) {
 		if (op == PictOpOver && is_opaque_solid(src))
 			op = PictOpSrc;
 		if (op == PictOpAdd && is_white(src))
 			op = PictOpSrc;
+		if (was_clear && (op == PictOpAdd || op == PictOpOver))
+			op = PictOpSrc;
 		if (op == PictOpOutReverse && is_opaque_solid(src))
-			return prepare_blt_clear(sna, tmp);
+			goto clear;
 
 		if (op != PictOpSrc) {
 			DBG(("%s: unsupported op [%d] for blitting\n",
diff --git a/src/sna/sna_trapezoids.c b/src/sna/sna_trapezoids.c
index d2f16f2..933f580 100644
--- a/src/sna/sna_trapezoids.c
+++ b/src/sna/sna_trapezoids.c
@@ -3033,13 +3033,6 @@ choose_span(PicturePtr dst,
 }
 
 static bool
-sna_drawable_is_clear(DrawablePtr d)
-{
-	struct sna_pixmap *priv = sna_pixmap(get_drawable_pixmap(d));
-	return priv && priv->clear && priv->clear_color == 0;
-}
-
-static bool
 mono_trapezoids_span_converter(CARD8 op, PicturePtr src, PicturePtr dst,
 			       INT16 src_x, INT16 src_y,
 			       int ntrap, xTrapezoid *traps)
commit cc3c1100902b5c78b861efb9c9cfce689ab2c3b4
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon Mar 12 23:05:17 2012 +0000

    sna: Reuse the same upload buffer for the duration of the batch
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
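
The mechanism: when an upload happens to cover the whole pixmap, the proxy
bo is attached to priv->gpu_bo via the new kgem_proxy_bo_attach(), so later
source lookups within the same batch reuse the upload instead of copying
again; kgem_commit() then detaches and destroys the cached proxies once the
batch has flushed. A condensed sketch of the attach path, where
covers_whole_pixmap() is a hypothetical stand-in for the inline size checks
in the patch:

	struct sna_pixmap *priv = sna_pixmap(pixmap);
	struct kgem_bo *bo;

	bo = kgem_upload_source_image(&sna->kgem, pixmap->devPrivate.ptr,
				      &box, pixmap->devKind,
				      pixmap->drawable.bitsPerPixel);
	if (bo && priv && pixmap->usage_hint == 0 &&
	    covers_whole_pixmap(pixmap, &box))
		kgem_proxy_bo_attach(bo, &priv->gpu_bo);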

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index db579d0..cee4513 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -1381,7 +1381,9 @@ static void kgem_commit(struct kgem *kgem)
 {
 	struct kgem_request *rq = kgem->next_request;
 	struct kgem_bo *bo, *next;
+	struct list release;
 
+	list_init(&release);
 	list_for_each_entry_safe(bo, next, &rq->buffers, request) {
 		assert(next->request.prev == &bo->request);
 
@@ -1408,8 +1410,21 @@ static void kgem_commit(struct kgem *kgem)
 			/* proxies are not used for domain tracking */
 			list_del(&bo->request);
 			bo->rq = NULL;
+			bo->exec = &_kgem_dummy_exec;
+			if (bo->map)
+				list_add_tail(&bo->request, &release);
 		}
 	}
+	while (!list_is_empty(&release)) {
+		bo = list_first_entry(&release, struct kgem_bo, request);
+		DBG(("%s: releasing upload cache, handle=%d\n",
+		     __FUNCTION__, bo->handle));
+		list_del(&bo->request);
+		assert(*(struct kgem_bo **)bo->map == bo);
+		*(struct kgem_bo **)bo->map = NULL;
+		bo->map = NULL;
+		kgem_bo_destroy(kgem, bo);
+	}
 
 	if (rq == &_kgem_static_request) {
 		struct drm_i915_gem_set_domain set_domain;
@@ -2842,7 +2857,7 @@ static void _kgem_bo_delete_partial(struct kgem *kgem, struct kgem_bo *bo)
 	DBG(("%s: size=%d, offset=%d, parent used=%d\n",
 	     __FUNCTION__, bo->size.bytes, bo->delta, io->used));
 
-	if (bo->delta + bo->size.bytes == io->used) {
+	if (ALIGN(bo->delta + bo->size.bytes, 64) == io->used) {
 		io->used = bo->delta;
 		bubble_sort_partial(&kgem->active_partials, io);
 	}
@@ -2850,9 +2865,11 @@ static void _kgem_bo_delete_partial(struct kgem *kgem, struct kgem_bo *bo)
 
 void _kgem_bo_destroy(struct kgem *kgem, struct kgem_bo *bo)
 {
+	DBG(("%s: handle=%d, proxy? %d\n",
+	     __FUNCTION__, bo->handle, bo->proxy != NULL));
+
 	if (bo->proxy) {
-		assert(bo->map == NULL);
-		if (bo->io && bo->exec == NULL)
+		if (bo->io && (bo->exec == NULL || bo->proxy->rq == NULL))
 			_kgem_bo_delete_partial(kgem, bo);
 		kgem_bo_unref(kgem, bo->proxy);
 		kgem_bo_binding_free(kgem, bo);
@@ -3873,6 +3890,15 @@ struct kgem_bo *kgem_upload_source_image(struct kgem *kgem,
 	return bo;
 }
 
+void kgem_proxy_bo_attach(struct kgem_bo *bo,
+			  struct kgem_bo **ptr)
+{
+	DBG(("%s: handle=%d\n", __FUNCTION__, bo->handle));
+	assert(bo->map == NULL);
+	bo->map = ptr;
+	*ptr = kgem_bo_reference(bo);
+}
+
 void kgem_buffer_read_sync(struct kgem *kgem, struct kgem_bo *_bo)
 {
 	struct kgem_partial_bo *bo;
@@ -3880,7 +3906,8 @@ void kgem_buffer_read_sync(struct kgem *kgem, struct kgem_bo *_bo)
 	int domain;
 
 	assert(_bo->io);
-	assert(_bo->exec == NULL);
+	assert(_bo->exec == &_kgem_dummy_exec);
+	assert(_bo->rq == NULL);
 	if (_bo->proxy)
 		_bo = _bo->proxy;
 	assert(_bo->exec == NULL);
diff --git a/src/sna/kgem.h b/src/sna/kgem.h
index dff8bb2..52e5e9c 100644
--- a/src/sna/kgem.h
+++ b/src/sna/kgem.h
@@ -200,6 +200,7 @@ struct kgem_bo *kgem_upload_source_image(struct kgem *kgem,
 					 const void *data,
 					 BoxPtr box,
 					 int stride, int bpp);
+void kgem_proxy_bo_attach(struct kgem_bo *bo, struct kgem_bo **ptr);
 
 int kgem_choose_tiling(struct kgem *kgem,
 		       int tiling, int width, int height, int bpp);
diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 1b0ac3c..dc330ca 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -1064,8 +1064,10 @@ skip_inplace_map:
 	}
 
 done:
-	if (flags & MOVE_WRITE)
+	if (flags & MOVE_WRITE) {
 		priv->source_count = SOURCE_BIAS;
+		assert(priv->gpu_bo == NULL || priv->gpu_bo->proxy == NULL);
+	}
 
 	if ((flags & MOVE_ASYNC_HINT) == 0 && priv->cpu_bo) {
 		DBG(("%s: syncing CPU bo\n", __FUNCTION__));
@@ -1260,6 +1262,7 @@ sna_drawable_move_region_to_cpu(DrawablePtr drawable,
 
 		if (priv->stride && priv->gpu_bo &&
 		    region_inplace(sna, pixmap, region, priv)) {
+			assert(priv->gpu_bo->proxy == NULL);
 			if (sync_will_stall(priv->gpu_bo) &&
 			    priv->gpu_bo->exec == NULL)
 				kgem_retire(&sna->kgem);
@@ -1371,6 +1374,7 @@ sna_drawable_move_region_to_cpu(DrawablePtr drawable,
 	if (priv->gpu_bo == NULL)
 		goto done;
 
+	assert(priv->gpu_bo->proxy == NULL);
 	if (priv->clear) {
 		int n = REGION_NUM_RECTS(region);
 		BoxPtr box = REGION_RECTS(region);
@@ -1581,8 +1585,10 @@ done:
 		RegionTranslate(region, -dx, -dy);
 
 out:
-	if (flags & MOVE_WRITE)
+	if (flags & MOVE_WRITE) {
 		priv->source_count = SOURCE_BIAS;
+		assert(priv->gpu_bo == NULL || priv->gpu_bo->proxy == NULL);
+	}
 	if (priv->cpu_bo) {
 		DBG(("%s: syncing cpu bo\n", __FUNCTION__));
 		kgem_bo_sync__cpu(&sna->kgem, priv->cpu_bo);
@@ -1942,6 +1948,7 @@ done:
 
 	DBG(("%s: using GPU bo with damage? %d\n",
 	     __FUNCTION__, *damage != NULL));
+	assert(priv->gpu_bo->proxy == NULL);
 	return priv->gpu_bo;
 
 use_gpu_bo:
@@ -1951,6 +1958,7 @@ use_gpu_bo:
 			  &to_sna_from_pixmap(pixmap)->active_pixmaps);
 	*damage = NULL;
 	DBG(("%s: using whole GPU bo\n", __FUNCTION__));
+	assert(priv->gpu_bo->proxy == NULL);
 	return priv->gpu_bo;
 
 use_cpu_bo:
@@ -2083,6 +2091,7 @@ sna_pixmap_force_to_gpu(PixmapPtr pixmap, unsigned flags)
 
 	if (DAMAGE_IS_ALL(priv->gpu_damage)) {
 		DBG(("%s: GPU all-damaged\n", __FUNCTION__));
+		assert(!priv->gpu_bo->proxy || (flags & MOVE_WRITE) == 0);
 		return sna_pixmap_mark_active(to_sna_from_pixmap(pixmap), priv);
 	}
 
@@ -2285,6 +2294,7 @@ done:
 		}
 	}
 active:
+	assert(!priv->gpu_bo->proxy || (flags & MOVE_WRITE) == 0);
 	return sna_pixmap_mark_active(sna, priv);
 }
 
@@ -2532,6 +2542,8 @@ static bool upload_inplace(struct sna *sna,
 		return false;
 
 	if (priv->gpu_bo) {
+		assert(priv->gpu_bo->proxy == NULL);
+
 		if (!kgem_bo_map_will_stall(&sna->kgem, priv->gpu_bo))
 			return true;
 
@@ -3389,6 +3401,8 @@ sna_copy_boxes(DrawablePtr src, DrawablePtr dst, GCPtr gc,
 		dst_priv->clear = false;
 	}
 
+	assert(dst_priv->gpu_bo == NULL || dst_priv->gpu_bo->proxy == NULL);
+
 	/* Try to maintain the data on the GPU */
 	if (dst_priv->gpu_bo == NULL &&
 	    ((dst_priv->cpu_damage == NULL && copy_use_gpu_bo(sna, dst_priv, &region)) ||
diff --git a/src/sna/sna_render.c b/src/sna/sna_render.c
index 09a33c1..b5d314b 100644
--- a/src/sna/sna_render.c
+++ b/src/sna/sna_render.c
@@ -496,6 +496,14 @@ static struct kgem_bo *upload(struct sna *sna,
 		channel->offset[1] -= box->y1;
 		channel->scale[0] = 1.f/channel->width;
 		channel->scale[1] = 1.f/channel->height;
+
+		if (pixmap->usage_hint == 0 &&
+		    channel->width  == pixmap->drawable.width &&
+		    channel->height == pixmap->drawable.height) {
+			struct sna_pixmap *priv = sna_pixmap(pixmap);
+			if (priv)
+				kgem_proxy_bo_attach(bo, &priv->gpu_bo);
+		}
 	}
 
 	return bo;
@@ -526,7 +534,8 @@ sna_render_pixmap_bo(struct sna *sna,
 	priv = sna_pixmap(pixmap);
 	if (priv) {
 		if (priv->gpu_bo &&
-		    (DAMAGE_IS_ALL(priv->gpu_damage) || !priv->cpu_damage)) {
+		    (DAMAGE_IS_ALL(priv->gpu_damage) || !priv->cpu_damage ||
+		     priv->gpu_bo->proxy)) {
 			channel->bo = kgem_bo_reference(priv->gpu_bo);
 			return 1;
 		}
@@ -1148,12 +1157,20 @@ sna_render_picture_extract(struct sna *sna,
 				upload = false;
 			}
 		}
-		if (upload)
+		if (upload) {
 			bo = kgem_upload_source_image(&sna->kgem,
 						      pixmap->devPrivate.ptr,
 						      &box,
 						      pixmap->devKind,
 						      pixmap->drawable.bitsPerPixel);
+			if (pixmap->usage_hint == 0 &&
+			    box.x2 - box.x1 == pixmap->drawable.width &&
+			    box.y2 - box.y1 == pixmap->drawable.height) {
+				struct sna_pixmap *priv = sna_pixmap(pixmap);
+				if (priv)
+					kgem_proxy_bo_attach(bo, &priv->gpu_bo);
+			}
+		}
 	}
 	if (src_bo) {
 		bo = kgem_create_2d(&sna->kgem, w, h,
commit 48f1367212d9344bd6617fca3fed0090b7dc4933
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon Mar 12 21:09:19 2012 +0000

    sna: Prefer to render very thin trapezoids inplace
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_render.h b/src/sna/sna_render.h
index e6015af..8d3d9e4 100644
--- a/src/sna/sna_render.h
+++ b/src/sna/sna_render.h
@@ -204,6 +204,7 @@ struct sna_render {
 				unsigned flags,
 				struct sna_composite_spans_op *tmp);
 #define COMPOSITE_SPANS_RECTILINEAR 0x1
+#define COMPOSITE_SPANS_INPLACE_HINT 0x2
 
 	Bool (*video)(struct sna *sna,
 		      struct sna_video *video,
diff --git a/src/sna/sna_render_inline.h b/src/sna/sna_render_inline.h
index a523fed..88d0130 100644
--- a/src/sna/sna_render_inline.h
+++ b/src/sna/sna_render_inline.h
@@ -85,7 +85,13 @@ static inline Bool
 is_cpu(DrawablePtr drawable)
 {
 	struct sna_pixmap *priv = sna_pixmap_from_drawable(drawable);
-	return !priv || priv->cpu_damage != NULL;
+	if (priv == NULL || priv->clear || DAMAGE_IS_ALL(priv->cpu_damage))
+		return true;
+
+	if (priv->gpu_damage || (priv->cpu_bo && kgem_bo_is_busy(priv->cpu_bo)))
+		return false;
+
+	return true;
 }
 
 static inline Bool
diff --git a/src/sna/sna_trapezoids.c b/src/sna/sna_trapezoids.c
index 23a65ef..d2f16f2 100644
--- a/src/sna/sna_trapezoids.c
+++ b/src/sna/sna_trapezoids.c
@@ -3180,7 +3180,8 @@ mono_trapezoids_span_converter(CARD8 op, PicturePtr src, PicturePtr dst,
 
 static bool
 trapezoid_span_converter(CARD8 op, PicturePtr src, PicturePtr dst,
-			 PictFormatPtr maskFormat, INT16 src_x, INT16 src_y,
+			 PictFormatPtr maskFormat, unsigned int flags,
+			 INT16 src_x, INT16 src_y,
 			 int ntrap, xTrapezoid *traps)
 {
 	struct sna *sna;
@@ -3263,8 +3264,7 @@ trapezoid_span_converter(CARD8 op, PicturePtr src, PicturePtr dst,
 					 extents.x1,  extents.y1,
 					 extents.x2 - extents.x1,
 					 extents.y2 - extents.y1,
-					 0,
-					 &tmp)) {
+					 flags, &tmp)) {
 		DBG(("%s: fallback -- composite spans render op not supported\n",
 		     __FUNCTION__));
 		return false;
@@ -3776,6 +3776,40 @@ mono_inplace_composite_boxes(struct sna *sna,
 }
 
 static bool
+trapezoid_spans_maybe_inplace(CARD8 op, PicturePtr src, PicturePtr dst,
+			      PictFormatPtr maskFormat)
+{
+	if (NO_SCAN_CONVERTER)
+		return false;
+
+	if (dst->polyMode == PolyModePrecise && !is_mono(dst, maskFormat))
+		return false;
+	if (dst->alphaMap)
+		return false;
+
+	if (is_mono(dst, maskFormat))
+		goto out;
+
+	if (!sna_picture_is_solid(src, NULL))
+		return false;
+
+	if (dst->format != PICT_a8)
+		return false;
+
+	switch (op) {
+	case PictOpIn:
+	case PictOpAdd:
+	case PictOpSrc:
+		break;
+	default:
+		return false;
+	}
+
+out:
+	return is_cpu(dst->pDrawable) ? true : dst->pDrawable->width <= TOR_INPLACE_SIZE;
+}
+
+static bool
 trapezoid_span_mono_inplace(CARD8 op,
 			    PicturePtr src,
 			    PicturePtr dst,
@@ -4304,6 +4338,7 @@ sna_composite_trapezoids(CARD8 op,
 {
 	struct sna *sna = to_sna_from_drawable(dst->pDrawable);
 	bool rectilinear, pixel_aligned;
+	unsigned flags;
 	int n;
 
 	DBG(("%s(op=%d, src=(%d, %d), mask=%08x, ntrap=%d)\n", __FUNCTION__,
@@ -4370,8 +4405,9 @@ sna_composite_trapezoids(CARD8 op,
 		}
 	}
 
-	DBG(("%s: rectlinear? %d, pixel-aligned? %d\n",
+	DBG(("%s: rectilinear? %d, pixel-aligned? %d\n",
 	     __FUNCTION__, rectilinear, pixel_aligned));
+	flags = 0;
 	if (rectilinear) {
 		if (pixel_aligned) {
 			if (composite_aligned_boxes(sna, op, src, dst,
@@ -4386,9 +4422,17 @@ sna_composite_trapezoids(CARD8 op,
 						      ntrap, traps))
 				return;
 		}
+		flags |= COMPOSITE_SPANS_RECTILINEAR;
+	}
+	if (trapezoid_spans_maybe_inplace(op, src, dst, maskFormat)) {
+		flags |= COMPOSITE_SPANS_INPLACE_HINT;
+		if (trapezoid_span_inplace(op, src, dst, maskFormat,
+					   xSrc, ySrc, ntrap, traps,
+					   false))
+			return;
 	}
 
-	if (trapezoid_span_converter(op, src, dst, maskFormat,
+	if (trapezoid_span_converter(op, src, dst, maskFormat, flags,
 				     xSrc, ySrc, ntrap, traps))
 		return;
 
commit dc5ca6a8cece2fc03447c7921b1c660c5728a603
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon Mar 12 20:34:42 2012 +0000

    sna: Always reset the source counter after rendering to it with the CPU
    
    The goal is to avoid moving to the GPU too early for a frequently
    modified CPU buffer.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
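
source_count is the migration heuristic: each time the pixmap is sampled as
a render source, move_to_gpu() bumps the counter, and only once it exceeds
SOURCE_BIAS is the whole pixmap promoted to a GPU bo. Resetting it on every
CPU write keeps a frequently redrawn software buffer from being promoted
too early. In outline (names as in the driver):

	if (flags & MOVE_WRITE)
		priv->source_count = SOURCE_BIAS;	/* CPU dirtied it: restart */

	/* ... later, when the pixmap is sampled as a composite source ... */
	upload = priv->source_count++ > SOURCE_BIAS;	/* promote after repeats */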

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 2e3df8a..1b0ac3c 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -1061,11 +1061,12 @@ skip_inplace_map:
 
 		if (priv->flush)
 			list_move(&priv->list, &sna->dirty_pixmaps);
-
-		priv->source_count = SOURCE_BIAS;
 	}
 
 done:
+	if (flags & MOVE_WRITE)
+		priv->source_count = SOURCE_BIAS;
+
 	if ((flags & MOVE_ASYNC_HINT) == 0 && priv->cpu_bo) {
 		DBG(("%s: syncing CPU bo\n", __FUNCTION__));
 		kgem_bo_sync__cpu(&sna->kgem, priv->cpu_bo);
@@ -1580,6 +1581,8 @@ done:
 		RegionTranslate(region, -dx, -dy);
 
 out:
+	if (flags & MOVE_WRITE)
+		priv->source_count = SOURCE_BIAS;
 	if (priv->cpu_bo) {
 		DBG(("%s: syncing cpu bo\n", __FUNCTION__));
 		kgem_bo_sync__cpu(&sna->kgem, priv->cpu_bo);
diff --git a/src/sna/sna_render.c b/src/sna/sna_render.c
index 11549b4..09a33c1 100644
--- a/src/sna/sna_render.c
+++ b/src/sna/sna_render.c
@@ -393,10 +393,10 @@ move_to_gpu(PixmapPtr pixmap, const BoxRec *box)
 				       pixmap->drawable.bitsPerPixel) == I915_TILING_NONE)
 			upload = priv->source_count++ > SOURCE_BIAS;
 
-		DBG(("%s: migrating whole pixmap (%dx%d) for source (%d,%d),(%d,%d)? %d\n",
+		DBG(("%s: migrating whole pixmap (%dx%d) for source (%d,%d),(%d,%d), count %d? %d\n",
 		     __FUNCTION__,
 		     pixmap->drawable.width, pixmap->drawable.height,
-		     box->x1, box->y1, box->x2, box->y2,
+		     box->x1, box->y1, box->x2, box->y2, priv->source_count,
 		     upload));
 		return upload;
 	}
commit 5e4358cf74de1670ed44d86d958e85b28ab0dbe3
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon Mar 12 20:25:50 2012 +0000

    sna: After move-to-gpu signals yes, force the GPU bo creation
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_render.c b/src/sna/sna_render.c
index d1e3500..11549b4 100644
--- a/src/sna/sna_render.c
+++ b/src/sna/sna_render.c
@@ -591,22 +591,16 @@ sna_render_pixmap_bo(struct sna *sna,
 	if (bo) {
 		bo = kgem_bo_reference(bo);
 	} else {
-		if (texture_is_cpu(pixmap, &box) && !move_to_gpu(pixmap, &box)) {
+		if (!texture_is_cpu(pixmap, &box) || move_to_gpu(pixmap, &box)) {
+			priv = sna_pixmap_force_to_gpu(pixmap, MOVE_READ);
+			if (priv)
+				bo = kgem_bo_reference(priv->gpu_bo);
+		}
+		if (bo == NULL) {
 			DBG(("%s: uploading CPU box (%d, %d), (%d, %d)\n",
 			     __FUNCTION__, box.x1, box.y1, box.x2, box.y2));
 			bo = upload(sna, channel, pixmap, &box);
 		}
-
-		if (bo == NULL) {
-			priv = sna_pixmap_move_to_gpu(pixmap, MOVE_READ);
-			if (priv) {
-				bo = kgem_bo_reference(priv->gpu_bo);
-			} else {
-				DBG(("%s: failed to upload pixmap to gpu, uploading CPU box (%d, %d), (%d, %d) instead\n",
-				     __FUNCTION__, box.x1, box.y1, box.x2, box.y2));
-				bo = upload(sna, channel, pixmap, &box);
-			}
-		}
 	}
 
 	channel->bo = bo;
@@ -1148,7 +1142,7 @@ sna_render_picture_extract(struct sna *sna,
 		    move_to_gpu(pixmap, &box)) {
 			struct sna_pixmap *priv;
 
-			priv = sna_pixmap_move_to_gpu(pixmap, MOVE_READ);
+			priv = sna_pixmap_force_to_gpu(pixmap, MOVE_READ);
 			if (priv) {
 				src_bo = priv->gpu_bo;
 				upload = false;
commit 7f590c2e60ad3a040bbd65a3c4899152b52fd562
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon Mar 12 20:09:05 2012 +0000

    sna/trapezoids: Reduce mono ADD/OVER against a clear background to a SRC
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_trapezoids.c b/src/sna/sna_trapezoids.c
index f936b4b..23a65ef 100644
--- a/src/sna/sna_trapezoids.c
+++ b/src/sna/sna_trapezoids.c
@@ -3849,6 +3849,7 @@ trapezoid_span_mono_inplace(CARD8 op,
 
 	if (sna_picture_is_solid(src, &inplace.fill.color) &&
 	    (op == PictOpSrc || op == PictOpClear ||
+	     (was_clear && (op == PictOpOver || op == PictOpAdd)) ||
 	     (op == PictOpOver && inplace.fill.color >> 24 == 0xff))) {
 		PixmapPtr pixmap;
 		int16_t dx, dy;
commit 2bc841e7c2caf44fb3c729f62ba4609d2080d667
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon Mar 12 19:49:30 2012 +0000

    sna: Treat backing pixmaps no differently from their forward-facing cousins
    
    Another fix for the large buffers overhaul.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 3619101..2e3df8a 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -395,13 +395,6 @@ static inline uint32_t default_tiling(PixmapPtr pixmap)
 	if (sna->kgem.gen == 21)
 		return I915_TILING_X;
 
-	if (pixmap->usage_hint == CREATE_PIXMAP_USAGE_BACKING_PIXMAP) {
-		/* Treat this like a window, and require accelerated
-		 * scrolling i.e. overlapped blits.
-		 */
-		return I915_TILING_X;
-	}
-
 	if (sna_damage_is_all(&priv->cpu_damage,
 			      pixmap->drawable.width,
 			      pixmap->drawable.height)) {
@@ -765,6 +758,8 @@ static PixmapPtr sna_create_pixmap(ScreenPtr screen,
 
 	if (usage == CREATE_PIXMAP_USAGE_GLYPH_PICTURE)
 		flags &= ~KGEM_CAN_CREATE_GPU;
+	if (usage == CREATE_PIXMAP_USAGE_BACKING_PIXMAP)
+		usage = 0;
 
 force_create:
 	pad = PixmapBytePad(width, depth);
commit 41f182d09621bce1e35a32898a4306544d1b24b2
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon Mar 12 19:45:35 2012 +0000

    sna/display: Only flush pending output when installing a new scanout
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_display.c b/src/sna/sna_display.c
index 676125d..9401ca4 100644
--- a/src/sna/sna_display.c
+++ b/src/sna/sna_display.c
@@ -634,8 +634,10 @@ sna_crtc_set_mode_major(xf86CrtcPtr crtc, DisplayModePtr mode,
 	     sna_mode->fb_pixmap,
 	     sna->front->drawable.serialNumber));
 
-	if (sna_mode->fb_pixmap != sna->front->drawable.serialNumber)
+	if (sna_mode->fb_pixmap != sna->front->drawable.serialNumber) {
+		kgem_submit(&sna->kgem);
 		sna_mode_remove_fb(sna);
+	}
 
 	if (sna_mode->fb_id == 0) {
 		struct kgem_bo *bo = sna_pixmap_pin(sna->front);
@@ -677,8 +679,6 @@ sna_crtc_set_mode_major(xf86CrtcPtr crtc, DisplayModePtr mode,
 	crtc->y = y;
 	crtc->rotation = rotation;
 
-	kgem_submit(&sna->kgem);
-
 	mode_to_kmode(&sna_crtc->kmode, mode);
 	if (!sna_crtc_apply(crtc)) {
 		crtc->x = saved_x;
commit 75c3e1a28359ceec2a8ccfa92042cf6108395f15
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon Mar 12 11:50:54 2012 +0000

    sna/trapezoids: Further improve the clipping criteria for inplace traps
    
    Not only must we defend against the span starting too far to the right,
    but also against the span terminating too far to the left.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
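
Schematically, a coverage span [xstart, xend) in subpixel units has to be
clamped at both ends against the row of width pixels before any coverage is
accumulated; the patch interleaves this with the winding bookkeeping of
inplace_subrow(). An illustrative clamp, not the literal code:

	if (xend <= xstart)
		return;				/* empty after the left clip */
	if (xstart >= FAST_SAMPLES_X * width)
		return;				/* starts beyond the right edge */
	if (xend > FAST_SAMPLES_X * width)
		xend = FAST_SAMPLES_X * width;	/* terminate within the clip */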

diff --git a/src/sna/sna_trapezoids.c b/src/sna/sna_trapezoids.c
index e76f918..f936b4b 100644
--- a/src/sna/sna_trapezoids.c
+++ b/src/sna/sna_trapezoids.c
@@ -1520,37 +1520,40 @@ inplace_subrow(struct active_list *active, int8_t *row,
 
 		winding += edge->dir;
 		if (0 == winding) {
-			if (edge->x.quo >= FAST_SAMPLES_X * width) {
-				*max = width;
-			} else if (edge->next->x.quo != edge->x.quo) {
-				grid_scaled_x_t fx;
-				int ix;
-
-				xstart = edge->x.quo;
-				FAST_SAMPLES_X_TO_INT_FRAC(xstart, ix, fx);
-				row[ix++] -= FAST_SAMPLES_X - fx;
-				if (ix < width)
-					row[ix] -= fx;
-
-				if (ix > *max)
-					*max = ix;
-
-				xstart = INT_MIN;
+			if (edge->next->x.quo != edge->x.quo) {
+				if (edge->x.quo <= xstart) {
+					xstart = INT_MIN;
+				} else  {
+					grid_scaled_x_t fx;
+					int ix;
+
+					if (xstart < FAST_SAMPLES_X * width) {
+						FAST_SAMPLES_X_TO_INT_FRAC(xstart, ix, fx);
+						if (ix < *min)
+							*min = ix;
+
+						row[ix++] += FAST_SAMPLES_X - fx;
+						if (ix < width)
+							row[ix] += fx;
+					}
+
+					xstart = edge->x.quo;
+					if (xstart < FAST_SAMPLES_X * width) {
+						FAST_SAMPLES_X_TO_INT_FRAC(xstart, ix, fx);
+						row[ix++] -= FAST_SAMPLES_X - fx;
+						if (ix < width)
+							row[ix] -= fx;
+
+						if (ix > *max)
+							*max = ix;
+
+						xstart = INT_MIN;
+					} else
+						*max = width;
+				}
 			}
 		} else if (xstart < 0) {
 			xstart = MAX(edge->x.quo, 0);
-			if (xstart < FAST_SAMPLES_X * width) {
-				grid_scaled_x_t fx;
-				int ix;
-
-				FAST_SAMPLES_X_TO_INT_FRAC(xstart, ix, fx);
-				if (ix < *min)
-					*min = ix;
-
-				row[ix++] += FAST_SAMPLES_X - fx;
-				if (ix < width)
-					row[ix] += fx;
-			}
 		}
 
 		if (--edge->height_left) {
commit da4d7893ed2d2327a4802e6e09e570ad12510bc6
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon Mar 12 10:49:46 2012 +0000

    sna/trapezoids: Add paranoia to ensure that the span starts within the clip
    
    Reported-by: Clemens Eisserer <linuxhippy at gmail.com>
    Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=47226
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_trapezoids.c b/src/sna/sna_trapezoids.c
index 3e2802e..e76f918 100644
--- a/src/sna/sna_trapezoids.c
+++ b/src/sna/sna_trapezoids.c
@@ -1538,16 +1538,19 @@ inplace_subrow(struct active_list *active, int8_t *row,
 				xstart = INT_MIN;
 			}
 		} else if (xstart < 0) {
-			grid_scaled_x_t fx;
-			int ix;
-
 			xstart = MAX(edge->x.quo, 0);
-			FAST_SAMPLES_X_TO_INT_FRAC(xstart, ix, fx);
-			if (ix < *min)
-				*min = ix;
+			if (xstart < FAST_SAMPLES_X * width) {
+				grid_scaled_x_t fx;
+				int ix;
 
-			row[ix++] += FAST_SAMPLES_X - fx;
-			row[ix] += fx;
+				FAST_SAMPLES_X_TO_INT_FRAC(xstart, ix, fx);
+				if (ix < *min)
+					*min = ix;
+
+				row[ix++] += FAST_SAMPLES_X - fx;
+				if (ix < width)
+					row[ix] += fx;
+			}
 		}
 
 		if (--edge->height_left) {
commit 254ed150288ba2fa55b207f704380a695cd45cd3
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sun Mar 11 19:45:55 2012 +0000

    sna: Make the maximum BLT pitch assertions consistent
    
    The maximum permissible BLT pitch value is 32767, so make the assertions
    match...
    
    Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=47206
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
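
MAXSHORT is 32767 and a pitch of exactly 32767 is legal, so the previous
assert(pitch < MAXSHORT) tripped on the last valid value; the boundary
belongs on <=. As a hypothetical predicate:

	/* 32767 is the largest pitch the BLT engine accepts. */
	static inline Bool blt_pitch_ok(uint32_t pitch)
	{
		return pitch <= MAXSHORT;
	}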

diff --git a/src/sna/sna_blt.c b/src/sna/sna_blt.c
index d70e30e..eb8dbf8 100644
--- a/src/sna/sna_blt.c
+++ b/src/sna/sna_blt.c
@@ -118,6 +118,7 @@ static bool sna_blt_fill_init(struct sna *sna,
 {
 	struct kgem *kgem = &sna->kgem;
 
+	assert(kgem_bo_can_blt (kgem, bo));
 	assert(bo->tiling != I915_TILING_Y);
 	blt->bo[0] = bo;
 
@@ -127,7 +128,7 @@ static bool sna_blt_fill_init(struct sna *sna,
 		blt->cmd |= BLT_DST_TILED;
 		blt->br13 >>= 2;
 	}
-	assert(blt->br13 < MAXSHORT);
+	assert(blt->br13 <= MAXSHORT);
 
 	if (alu == GXclear)
 		pixel = 0;
@@ -258,6 +259,9 @@ static Bool sna_blt_copy_init(struct sna *sna,
 {
 	struct kgem *kgem = &sna->kgem;
 
+	assert(kgem_bo_can_blt (kgem, src));
+	assert(kgem_bo_can_blt (kgem, dst));
+
 	blt->bo[0] = src;
 	blt->bo[1] = dst;
 
@@ -270,14 +274,14 @@ static Bool sna_blt_copy_init(struct sna *sna,
 		blt->cmd |= BLT_SRC_TILED;
 		blt->pitch[0] >>= 2;
 	}
-	assert(blt->pitch[0] < MAXSHORT);
+	assert(blt->pitch[0] <= MAXSHORT);
 
 	blt->pitch[1] = dst->pitch;
 	if (kgem->gen >= 40 && dst->tiling) {
 		blt->cmd |= BLT_DST_TILED;
 		blt->pitch[1] >>= 2;
 	}
-	assert(blt->pitch[1] < MAXSHORT);
+	assert(blt->pitch[1] <= MAXSHORT);
 
 	blt->overwrites = alu == GXcopy || alu == GXclear || alu == GXset;
 	blt->br13 = (copy_ROP[alu] << 16) | blt->pitch[1];
@@ -308,6 +312,9 @@ static Bool sna_blt_alpha_fixup_init(struct sna *sna,
 {
 	struct kgem *kgem = &sna->kgem;
 
+	assert(kgem_bo_can_blt (kgem, src));
+	assert(kgem_bo_can_blt (kgem, dst));
+
 	blt->bo[0] = src;
 	blt->bo[1] = dst;
 
@@ -317,14 +324,14 @@ static Bool sna_blt_alpha_fixup_init(struct sna *sna,
 		blt->cmd |= BLT_SRC_TILED;
 		blt->pitch[0] >>= 2;
 	}
-	assert(blt->pitch[0] < MAXSHORT);
+	assert(blt->pitch[0] <= MAXSHORT);
 
 	blt->pitch[1] = dst->pitch;
 	if (kgem->gen >= 40 && dst->tiling) {
 		blt->cmd |= BLT_DST_TILED;
 		blt->pitch[1] >>= 2;
 	}
-	assert(blt->pitch[1] < MAXSHORT);
+	assert(blt->pitch[1] <= MAXSHORT);
 
 	blt->overwrites = 1;
 	blt->br13 = (0xfc << 16) | blt->pitch[1];
@@ -1829,6 +1836,8 @@ static bool sna_blt_fill_box(struct sna *sna, uint8_t alu,
 	uint32_t br13, cmd, *b;
 	bool overwrites;
 
+	assert(kgem_bo_can_blt (kgem, bo));
+
 	DBG(("%s: box=((%d, %d), (%d, %d))\n", __FUNCTION__,
 	     box->x1, box->y1, box->x2, box->y2));
 
@@ -1841,7 +1850,7 @@ static bool sna_blt_fill_box(struct sna *sna, uint8_t alu,
 		cmd |= BLT_DST_TILED;
 		br13 >>= 2;
 	}
-	assert(br13 < MAXSHORT);
+	assert(br13 <= MAXSHORT);
 
 	br13 |= fill_ROP[alu] << 16;
 	switch (bpp) {
@@ -1954,7 +1963,7 @@ Bool sna_blt_fill_boxes(struct sna *sna, uint8_t alu,
 		cmd |= 1 << 11;
 		br13 >>= 2;
 	}
-	assert(br13 < MAXSHORT);
+	assert(br13 <= MAXSHORT);
 
 	br13 |= 1<<31 | fill_ROP[alu] << 16;
 	switch (bpp) {
@@ -2105,7 +2114,7 @@ Bool sna_blt_copy_boxes(struct sna *sna, uint8_t alu,
 		cmd |= BLT_DST_TILED;
 		br13 >>= 2;
 	}
-	assert(br13 < MAXSHORT);
+	assert(br13 <= MAXSHORT);
 
 	br13 |= copy_ROP[alu] << 16;
 	switch (bpp) {
commit d93053ee23ec80664553b47eece5a9ccfe557f94
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Mar 9 22:44:16 2012 +0000

    sna: Feed fallback mono trapezoids through the mono rasteriser
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
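
When the destination has to be mapped for CPU access anyway, the mono
rasteriser can write spans straight into it: solid sources under SRC, CLEAR
or opaque OVER become per-span pixman_fill() calls, while everything else is
routed through pixman_image_composite(). Condensed from the dispatch in
trapezoid_span_mono_inplace() below:

	uint32_t color;

	if (sna_picture_is_solid(src, &color) &&
	    (op == PictOpSrc || op == PictOpClear ||
	     (op == PictOpOver && color >> 24 == 0xff))) {
		mono.op.box   = mono_inplace_fill_box;		/* pixman_fill spans */
		mono.op.boxes = mono_inplace_fill_boxes;
	} else {
		mono.op.box   = mono_inplace_composite_box;	/* pixman composite */
		mono.op.boxes = mono_inplace_composite_boxes;
	}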

diff --git a/src/sna/sna_blt.c b/src/sna/sna_blt.c
index 8420730..d70e30e 100644
--- a/src/sna/sna_blt.c
+++ b/src/sna/sna_blt.c
@@ -631,6 +631,12 @@ sna_rgba_for_color(uint32_t color, int depth)
 	return color_convert(color, sna_format_for_depth(depth), PICT_a8r8g8b8);
 }
 
+uint32_t
+sna_rgba_to_color(uint32_t rgba, uint32_t format)
+{
+	return color_convert(rgba, PICT_a8r8g8b8, format);
+}
+
 static uint32_t
 get_pixel(PicturePtr picture)
 {
diff --git a/src/sna/sna_render.h b/src/sna/sna_render.h
index 71a6fc5..e6015af 100644
--- a/src/sna/sna_render.h
+++ b/src/sna/sna_render.h
@@ -482,6 +482,7 @@ sna_render_get_gradient(struct sna *sna,
 			PictGradient *pattern);
 
 uint32_t sna_rgba_for_color(uint32_t color, int depth);
+uint32_t sna_rgba_to_color(uint32_t rgba, uint32_t format);
 Bool sna_picture_is_solid(PicturePtr picture, uint32_t *color);
 
 void no_render_init(struct sna *sna);
diff --git a/src/sna/sna_trapezoids.c b/src/sna/sna_trapezoids.c
index 4493331..3e2802e 100644
--- a/src/sna/sna_trapezoids.c
+++ b/src/sna/sna_trapezoids.c
@@ -3681,6 +3681,262 @@ tor_blt_add_clipped_mono(struct sna *sna,
 		tor_blt_add_clipped(sna, op, clip, box, FAST_SAMPLES_XY);
 }
 
+struct mono_inplace_composite {
+	pixman_image_t *src, *dst;
+	int dx, dy;
+	int sx, sy;
+	int op;
+};
+struct mono_inplace_fill {
+	uint32_t *data, stride;
+	uint32_t color;
+	int bpp;
+};
+
+fastcall static void
+mono_inplace_fill_box(struct sna *sna,
+		      const struct sna_composite_op *op,
+		      const BoxRec *box)
+{
+	struct mono_inplace_fill *fill = op->priv;
+
+	DBG(("(%s: (%d, %d)x(%d, %d):%08x\n",
+	     __FUNCTION__,
+	     box->x1, box->y1,
+	     box->x2 - box->x1,
+	     box->y2 - box->y1,
+	     fill->color));
+	pixman_fill(fill->data, fill->stride, fill->bpp,
+		    box->x1, box->y1,
+		    box->x2 - box->x1,
+		    box->y2 - box->y1,
+		    fill->color);
+}
+
+static void
+mono_inplace_fill_boxes(struct sna *sna,
+			const struct sna_composite_op *op,
+			const BoxRec *box, int nbox)
+{
+	struct mono_inplace_fill *fill = op->priv;
+
+	do {
+		DBG(("(%s: (%d, %d)x(%d, %d):%08x\n",
+		     __FUNCTION__,
+		     box->x1, box->y1,
+		     box->x2 - box->x1,
+		     box->y2 - box->y1,
+		     fill->color));
+		pixman_fill(fill->data, fill->stride, fill->bpp,
+			    box->x1, box->y1,
+			    box->x2 - box->x1,
+			    box->y2 - box->y1,
+			    fill->color);
+		box++;
+	} while (--nbox);
+}
+
+fastcall static void
+mono_inplace_composite_box(struct sna *sna,
+			   const struct sna_composite_op *op,
+			   const BoxRec *box)
+{
+	struct mono_inplace_composite *c = op->priv;
+
+	pixman_image_composite(c->op, c->src, NULL, c->dst,
+			       box->x1 + c->sx, box->y1 + c->sy,
+			       0, 0,
+			       box->x1 + c->dx, box->y1 + c->dy,
+			       box->x2 - box->x1,
+			       box->y2 - box->y1);
+}
+
+static void
+mono_inplace_composite_boxes(struct sna *sna,
+			     const struct sna_composite_op *op,
+			     const BoxRec *box, int nbox)
+{
+	struct mono_inplace_composite *c = op->priv;
+
+	do {
+		pixman_image_composite(c->op, c->src, NULL, c->dst,
+				       box->x1 + c->sx, box->y1 + c->sy,
+				       0, 0,
+				       box->x1 + c->dx, box->y1 + c->dy,
+				       box->x2 - box->x1,
+				       box->y2 - box->y1);
+		box++;
+	} while (--nbox);
+}
+
+static bool
+trapezoid_span_mono_inplace(CARD8 op,
+			    PicturePtr src,
+			    PicturePtr dst,
+			    INT16 src_x, INT16 src_y,
+			    int ntrap, xTrapezoid *traps)
+{
+	struct mono mono;
+	union {
+		struct mono_inplace_fill fill;
+		struct mono_inplace_composite composite;
+	} inplace;
+	int was_clear;
+	int x, y, n;
+
+	trapezoids_bounds(ntrap, traps, &mono.clip.extents);
+	if (mono.clip.extents.y1 >= mono.clip.extents.y2 ||
+	    mono.clip.extents.x1 >= mono.clip.extents.x2)
+		return true;
+
+	DBG(("%s: extents (%d, %d), (%d, %d)\n",
+	     __FUNCTION__,
+	     mono.clip.extents.x1, mono.clip.extents.y1,
+	     mono.clip.extents.x2, mono.clip.extents.y2));
+
+	if (!sna_compute_composite_region(&mono.clip,
+					  src, NULL, dst,
+					  src_x, src_y,
+					  0, 0,
+					  mono.clip.extents.x1, mono.clip.extents.y1,
+					  mono.clip.extents.x2 - mono.clip.extents.x1,
+					  mono.clip.extents.y2 - mono.clip.extents.y1)) {
+		DBG(("%s: trapezoids do not intersect drawable clips\n",
+		     __FUNCTION__)) ;
+		return true;
+	}
+
+	DBG(("%s: clipped extents (%d, %d), (%d, %d)\n",
+	     __FUNCTION__,
+	     mono.clip.extents.x1, mono.clip.extents.y1,
+	     mono.clip.extents.x2, mono.clip.extents.y2));
+
+	was_clear = sna_drawable_is_clear(dst->pDrawable);
+	if (!sna_drawable_move_region_to_cpu(dst->pDrawable, &mono.clip,
+					     MOVE_WRITE | MOVE_READ))
+		return true;
+
+	mono.sna = to_sna_from_drawable(dst->pDrawable);
+	if (!mono_init(&mono, 2*ntrap))
+		return false;
+
+	mono.op.damage = NULL;
+
+	x = dst->pDrawable->x;
+	y = dst->pDrawable->y;
+
+	for (n = 0; n < ntrap; n++) {
+		if (!xTrapezoidValid(&traps[n]))
+			continue;
+
+		if (pixman_fixed_to_int(traps[n].top) + y >= mono.clip.extents.y2 ||
+		    pixman_fixed_to_int(traps[n].bottom) + y < mono.clip.extents.y1)
+			continue;
+
+		mono_add_line(&mono, x, y,
+			      traps[n].top, traps[n].bottom,
+			      &traps[n].left.p1, &traps[n].left.p2, 1);
+		mono_add_line(&mono, x, y,
+			      traps[n].top, traps[n].bottom,
+			      &traps[n].right.p1, &traps[n].right.p2, -1);
+	}
+
+	if (sna_picture_is_solid(src, &inplace.fill.color) &&
+	    (op == PictOpSrc || op == PictOpClear ||
+	     (op == PictOpOver && inplace.fill.color >> 24 == 0xff))) {
+		PixmapPtr pixmap;
+		int16_t dx, dy;
+		uint8_t *ptr;
+
+unbounded_pass:
+		pixmap = get_drawable_pixmap(dst->pDrawable);
+		get_drawable_deltas(dst->pDrawable, pixmap, &dx, &dy);
+
+		ptr = pixmap->devPrivate.ptr;
+		ptr += dy * pixmap->devKind + dx * pixmap->drawable.bitsPerPixel / 8;
+		inplace.fill.data = (uint32_t *)ptr;
+		inplace.fill.stride = pixmap->devKind / sizeof(uint32_t);
+		inplace.fill.bpp = pixmap->drawable.bitsPerPixel;
+
+		if (op == PictOpClear)
+			inplace.fill.color = 0;
+		else if (dst->format != PICT_a8r8g8b8)
+			inplace.fill.color = sna_rgba_to_color(inplace.fill.color, dst->format);
+
+		DBG(("%s: fill %x\n", __FUNCTION__, inplace.fill.color));
+
+		mono.op.priv = &inplace.fill;
+		mono.op.box = mono_inplace_fill_box;
+		mono.op.boxes = mono_inplace_fill_boxes;
+
+		op = 0;
+	} else {
+		inplace.composite.dst = image_from_pict(dst, FALSE,
+							&inplace.composite.dx,
+							&inplace.composite.dy);
+		inplace.composite.src = image_from_pict(src, FALSE,
+							&inplace.composite.sx,
+							&inplace.composite.sy);
+		inplace.composite.sx +=
+			src_x - pixman_fixed_to_int(traps[0].left.p1.x),
+		inplace.composite.sy +=
+			src_y - pixman_fixed_to_int(traps[0].left.p1.y),
+		inplace.composite.op = op;
+
+		mono.op.priv = &inplace.composite;
+		mono.op.box = mono_inplace_composite_box;
+		mono.op.boxes = mono_inplace_composite_boxes;
+	}
+	mono_render(&mono);
+	mono_fini(&mono);
+
+	if (op) {
+		free_pixman_pict(src, inplace.composite.src);
+		free_pixman_pict(dst, inplace.composite.dst);
+
+		if (!was_clear && !operator_is_bounded(op)) {
+			xPointFixed p1, p2;
+
+			DBG(("%s: unbounded fixup\n", __FUNCTION__));
+
+			if (!mono_init(&mono, 2+2*ntrap))
+				return false;
+
+			p1.y = mono.clip.extents.y1 * pixman_fixed_1;
+			p2.y = mono.clip.extents.y2 * pixman_fixed_1;
+
+			p1.x = mono.clip.extents.x1 * pixman_fixed_1;
+			p2.x = mono.clip.extents.x1 * pixman_fixed_1;
+			mono_add_line(&mono, 0, 0, p1.y, p2.y, &p1, &p2, -1);
+
+			p1.x = mono.clip.extents.x2 * pixman_fixed_1;
+			p2.x = mono.clip.extents.x2 * pixman_fixed_1;
+			mono_add_line(&mono, 0, 0, p1.y, p2.y, &p1, &p2, 1);
+
+			for (n = 0; n < ntrap; n++) {
+				if (!xTrapezoidValid(&traps[n]))
+					continue;
+
+				if (pixman_fixed_to_int(traps[n].top) + x >= mono.clip.extents.y2 ||
+				    pixman_fixed_to_int(traps[n].bottom) + y < mono.clip.extents.y1)
+					continue;
+
+				mono_add_line(&mono, x, y,
+					      traps[n].top, traps[n].bottom,
+					      &traps[n].left.p1, &traps[n].left.p2, 1);
+				mono_add_line(&mono, x, y,
+					      traps[n].top, traps[n].bottom,
+					      &traps[n].right.p1, &traps[n].right.p2, -1);
+			}
+
+			op = PictOpClear;
+			goto unbounded_pass;
+		}
+	}
+
+	return true;
+}
+
 static bool
 trapezoid_span_inplace(CARD8 op, PicturePtr src, PicturePtr dst,
 		       PictFormatPtr maskFormat, INT16 src_x, INT16 src_y,
@@ -3713,7 +3969,24 @@ trapezoid_span_inplace(CARD8 op, PicturePtr src, PicturePtr dst,
 		return false;
 	}
 
-	if (dst->format != PICT_a8 || !sna_picture_is_solid(src, &color)) {
+	if (!fallback && is_gpu(dst->pDrawable)) {
+		DBG(("%s: fallback -- can not perform operation in place, destination busy\n",
+		     __FUNCTION__));
+
+		return false;
+	}
+
+	if (is_mono(dst, maskFormat))
+		return trapezoid_span_mono_inplace(op, src, dst,
+						   src_x, src_y, ntrap, traps);
+
+	if (!sna_picture_is_solid(src, &color)) {
+		DBG(("%s: fallback -- can not perform operation in place, requires solid source\n",
+		     __FUNCTION__));
+		return false;
+	}
+
+	if (dst->format != PICT_a8) {
 		DBG(("%s: fallback -- can not perform operation in place, format=%x\n",
 		     __FUNCTION__, dst->format));
 		return false;
@@ -3744,8 +4017,6 @@ trapezoid_span_inplace(CARD8 op, PicturePtr src, PicturePtr dst,
 		     __FUNCTION__, op));
 		return false;
 	}
-	if (!fallback && is_gpu(dst->pDrawable))
-		return false;
 
 	DBG(("%s: format=%x, op=%d, color=%x\n",
 	     __FUNCTION__, dst->format, op, color));
commit 99ff40eb017bbcce0075b03dc8319492b7ff3fa0
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Mar 9 20:02:44 2012 +0000

    sna/traps: Add a fast path for narrow masks
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
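
For masks no wider than TOR_INPLACE_SIZE (128) pixels, the scan converter
accumulates one row of coverage at a time and resolves it straight into the
mask, bypassing the generic span machinery; a stack buffer is used when the
scratch pixmap was created for inplace (write-combining) writes, which is
what the usage_hint repurposing in sna_pixmap_create_upload() records. The
call site in trapezoid_mask_converter() reduces to:

	if (extents.x2 <= TOR_INPLACE_SIZE) {
		uint8_t buf[TOR_INPLACE_SIZE];
		tor_inplace(&tor, scratch, is_mono(dst, maskFormat),
			    scratch->usage_hint ? NULL : buf);
	} else {
		tor_render(NULL, &tor, scratch->devPrivate.ptr,
			   (void *)(intptr_t)scratch->devKind,
			   is_mono(dst, maskFormat) ? tor_blt_mask_mono
						    : tor_blt_mask,
			   true);
	}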

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 5773d66..db579d0 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -3785,6 +3785,12 @@ done:
 	return kgem_create_proxy(&bo->base, offset, size);
 }
 
+bool kgem_buffer_is_inplace(struct kgem_bo *_bo)
+{
+	struct kgem_partial_bo *bo = (struct kgem_partial_bo *)_bo->proxy;
+	return bo->write & KGEM_BUFFER_WRITE_INPLACE;
+}
+
 struct kgem_bo *kgem_create_buffer_2d(struct kgem *kgem,
 				      int width, int height, int bpp,
 				      uint32_t flags,
diff --git a/src/sna/kgem.h b/src/sna/kgem.h
index 6c31f33..dff8bb2 100644
--- a/src/sna/kgem.h
+++ b/src/sna/kgem.h
@@ -503,6 +503,7 @@ struct kgem_bo *kgem_create_buffer_2d(struct kgem *kgem,
 				      int width, int height, int bpp,
 				      uint32_t flags,
 				      void **ret);
+bool kgem_buffer_is_inplace(struct kgem_bo *bo);
 void kgem_buffer_read_sync(struct kgem *kgem, struct kgem_bo *bo);
 
 void kgem_bo_clear_scanout(struct kgem *kgem, struct kgem_bo *bo);
diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 3429438..3619101 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -2007,17 +2007,10 @@ sna_pixmap_create_upload(ScreenPtr screen,
 		pixmap = sna->freed_pixmap;
 		sna->freed_pixmap = NULL;
 
-		pixmap->usage_hint = CREATE_PIXMAP_USAGE_SCRATCH;
 		pixmap->drawable.serialNumber = NEXT_SERIAL_NUMBER;
 		pixmap->refcnt = 1;
-
-		DBG(("%s: serial=%ld, usage=%d\n",
-		     __FUNCTION__,
-		     pixmap->drawable.serialNumber,
-		     pixmap->usage_hint));
 	} else {
-		pixmap = create_pixmap(sna, screen, 0, 0, depth,
-				       CREATE_PIXMAP_USAGE_SCRATCH);
+		pixmap = create_pixmap(sna, screen, 0, 0, depth, 0);
 		if (!pixmap)
 			return NullPixmap;
 
@@ -2035,8 +2028,7 @@ sna_pixmap_create_upload(ScreenPtr screen,
 
 	priv->gpu_bo = kgem_create_buffer_2d(&sna->kgem,
 					     width, height, bpp,
-					     flags,
-					     &ptr);
+					     flags, &ptr);
 	if (!priv->gpu_bo) {
 		free(priv);
 		fbDestroyPixmap(pixmap);
@@ -2058,6 +2050,15 @@ sna_pixmap_create_upload(ScreenPtr screen,
 	pixmap->devKind = priv->gpu_bo->pitch;
 	pixmap->devPrivate.ptr = ptr;
 
+	pixmap->usage_hint = 0;
+	if (!kgem_buffer_is_inplace(priv->gpu_bo))
+		pixmap->usage_hint = 1;
+
+	DBG(("%s: serial=%ld, usage=%d\n",
+	     __FUNCTION__,
+	     pixmap->drawable.serialNumber,
+	     pixmap->usage_hint));
+
 	return pixmap;
 }
 
diff --git a/src/sna/sna_trapezoids.c b/src/sna/sna_trapezoids.c
index 8c6cf34..4493331 100644
--- a/src/sna/sna_trapezoids.c
+++ b/src/sna/sna_trapezoids.c
@@ -1409,6 +1409,342 @@ tor_render(struct sna *sna,
 	}
 }
 
+static void
+inplace_row(struct active_list *active, uint8_t *row, int width)
+{
+	struct edge *left = active->head.next;
+
+	assert(active->is_vertical);
+
+	while (&active->tail != left) {
+		struct edge *right;
+		int winding = left->dir;
+		grid_scaled_x_t lfx, rfx;
+		int lix, rix;
+
+		left->height_left -= FAST_SAMPLES_Y;
+		if (!left->height_left) {
+			left->prev->next = left->next;
+			left->next->prev = left->prev;
+		}
+
+		right = left->next;
+		do {
+			right->height_left -= FAST_SAMPLES_Y;
+			if (!right->height_left) {
+				right->prev->next = right->next;
+				right->next->prev = right->prev;
+			}
+
+			winding += right->dir;
+			if (0 == winding)
+				break;
+
+			right = right->next;
+		} while (1);
+
+		if (left->x.quo < 0) {
+			lix = lfx = 0;
+		} else if (left->x.quo > width * FAST_SAMPLES_X) {
+			lix = width;
+			lfx = 0;
+		} else
+			FAST_SAMPLES_X_TO_INT_FRAC(left->x.quo, lix, lfx);
+
+		if (right->x.quo < 0) {
+			rix = rfx = 0;
+		} else if (right->x.quo > width * FAST_SAMPLES_X) {
+			rix = width;
+			rfx = 0;
+		} else
+			FAST_SAMPLES_X_TO_INT_FRAC(right->x.quo, rix, rfx);
+		if (lix == rix) {
+			if (rfx != lfx)
+				row[lix] += (rfx-lfx) * 256 / FAST_SAMPLES_X;
+		} else {
+			if (lfx == 0)
+				row[lix] = 0xff;
+			else
+				row[lix] += 256 - lfx * 256 / FAST_SAMPLES_X;
+
+			if (rfx)
+				row[rix] += rfx * 256 / FAST_SAMPLES_X;
+
+			if (rix > ++lix) {
+				rix -= lix;
+#if 0
+				if (rix == 1)
+					row[lix] = 0xff;
+				else
+					memset(row+lix, 0xff, rix);
+#else
+				while (rix && lix & 3)
+					row[lix++] = 0xff, rix--;
+				while (rix > 4) {
+					*(uint32_t *)(row+lix) = 0xffffffff;
+					lix += 4;
+					rix -= 4;
+				}
+				if (rix & 2) {
+					*(uint16_t *)(row+lix) = 0xffff;
+					lix += 2;
+				}
+				if (rix & 1)
+					row[lix] = 0xff;
+#endif
+			}
+		}
+
+		left = right->next;
+	}
+}
+
+static inline uint8_t clip255(int x)
+{
+	if (x > 255)
+		return 255;
+
+	return x;
+}
+
+inline static void
+inplace_subrow(struct active_list *active, int8_t *row,
+	       int width, int *min, int *max)
+{
+	struct edge *edge = active->head.next;
+	grid_scaled_x_t prev_x = INT_MIN;
+	int winding = 0, xstart = INT_MIN;
+
+	while (&active->tail != edge) {
+		struct edge *next = edge->next;
+
+		winding += edge->dir;
+		if (0 == winding) {
+			if (edge->x.quo >= FAST_SAMPLES_X * width) {
+				*max = width;
+			} else if (edge->next->x.quo != edge->x.quo) {
+				grid_scaled_x_t fx;
+				int ix;
+
+				xstart = edge->x.quo;
+				FAST_SAMPLES_X_TO_INT_FRAC(xstart, ix, fx);
+				row[ix++] -= FAST_SAMPLES_X - fx;
+				if (ix < width)
+					row[ix] -= fx;
+
+				if (ix > *max)
+					*max = ix;
+
+				xstart = INT_MIN;
+			}
+		} else if (xstart < 0) {
+			grid_scaled_x_t fx;
+			int ix;
+
+			xstart = MAX(edge->x.quo, 0);
+			FAST_SAMPLES_X_TO_INT_FRAC(xstart, ix, fx);
+			if (ix < *min)
+				*min = ix;
+
+			row[ix++] += FAST_SAMPLES_X - fx;
+			row[ix] += fx;
+		}
+
+		if (--edge->height_left) {
+			if (!edge->vertical) {
+				edge->x.quo += edge->dxdy.quo;
+				edge->x.rem += edge->dxdy.rem;
+				if (edge->x.rem >= 0) {
+					++edge->x.quo;
+					edge->x.rem -= edge->dy;
+				}
+			}
+
+			if (edge->x.quo < prev_x) {
+				struct edge *pos = edge->prev;
+				pos->next = next;
+				next->prev = pos;
+				do {
+					pos = pos->prev;
+				} while (edge->x.quo < pos->x.quo);
+				pos->next->prev = edge;
+				edge->next = pos->next;
+				edge->prev = pos;
+				pos->next = edge;
+			} else
+				prev_x = edge->x.quo;
+		} else {
+			edge->prev->next = next;
+			next->prev = edge->prev;
+		}
+
+		edge = next;
+	}
+}
+
+inline static void
+inplace_end_subrows(struct active_list *active, uint8_t *row,
+		    int8_t *buf, int width)
+{
+	int cover = 0;
+
+	while (width > 4) {
+		uint32_t dw;
+		int v;
+
+		dw = *(uint32_t *)buf;
+		buf += 4;
+
+		if (dw == 0){
+			v = cover * 256 / (FAST_SAMPLES_X * FAST_SAMPLES_Y);
+			v -= v >> 8;
+			v |= v << 8;
+			dw = v | v << 16;
+		} else if (dw) {
+			cover += (int8_t)(dw & 0xff);
+			assert(cover >= 0);
+			v = cover * 256 / (FAST_SAMPLES_X * FAST_SAMPLES_Y);
+			v -= v >> 8;
+			dw >>= 8;
+			dw |= v << 24;
+
+			cover += (int8_t)(dw & 0xff);
+			assert(cover >= 0);
+			v = cover * 256 / (FAST_SAMPLES_X * FAST_SAMPLES_Y);
+			v -= v >> 8;
+			dw >>= 8;
+			dw |= v << 24;
+
+			cover += (int8_t)(dw & 0xff);
+			assert(cover >= 0);
+			v = cover * 256 / (FAST_SAMPLES_X * FAST_SAMPLES_Y);
+			v -= v >> 8;
+			dw >>= 8;
+			dw |= v << 24;
+
+			cover += (int8_t)(dw & 0xff);
+			assert(cover >= 0);
+			v = cover * 256 / (FAST_SAMPLES_X * FAST_SAMPLES_Y);
+			v -= v >> 8;
+			dw >>= 8;
+			dw |= v << 24;
+		}
+
+		*(uint32_t *)row = dw;
+		row += 4;
+
+		width -= 4;
+	}
+
+	while (width--) {
+		int v;
+
+		cover += *buf++;
+		assert(cover >= 0);
+
+		v = cover * 256 / (FAST_SAMPLES_X * FAST_SAMPLES_Y);
+		v -= v >> 8;
+		*row++ = v;
+	}
+}
+
+#define TOR_INPLACE_SIZE 128
+static void
+tor_inplace(struct tor *converter, PixmapPtr scratch, int mono, uint8_t *buf)
+{
+	int i, j, h = converter->ymax;
+	struct polygon *polygon = converter->polygon;
+	struct active_list *active = converter->active;
+	struct edge *buckets[FAST_SAMPLES_Y] = { 0 };
+	uint8_t *row = scratch->devPrivate.ptr;
+	int stride = scratch->devKind;
+	int width = scratch->drawable.width;
+
+	__DBG(("%s: mono=%d, buf=%d\n", __FUNCTION__, mono, buf));
+	assert(!mono);
+
+	/* Render each pixel row. */
+	for (i = 0; i < h; i = j) {
+		int do_full_step = 0;
+		void *ptr = buf ?: row;
+
+		j = i + 1;
+
+		/* Determine if we can ignore this row or use the full pixel
+		 * stepper. */
+		if (!polygon->y_buckets[i]) {
+			if (active->head.next == &active->tail) {
+				active->min_height = INT_MAX;
+				active->is_vertical = 1;
+				for (; j < h && !polygon->y_buckets[j]; j++)
+					;
+				__DBG(("%s: no new edges and no existing edges, skipping, %d -> %d\n",
+				       __FUNCTION__, i, j));
+
+				memset(row, 0, stride*(j-i));
+				row += stride*(j-i);
+				continue;
+			}
+
+			do_full_step = can_full_step(active);
+		}
+
+		__DBG(("%s: y=%d [%d], do_full_step=%d, new edges=%d, min_height=%d, vertical=%d\n",
+		       __FUNCTION__,
+		       i, i+ymin, do_full_step,
+		       polygon->y_buckets[i] != NULL,
+		       active->min_height,
+		       active->is_vertical));
+		if (do_full_step) {
+			memset(ptr, 0, width);
+			inplace_row(active, ptr, width);
+			if (row != ptr)
+				memcpy(row, ptr, width);
+
+			if (active->is_vertical) {
+				while (j < h &&
+				       polygon->y_buckets[j] == NULL &&
+				       active->min_height >= 2*FAST_SAMPLES_Y)
+				{
+					active->min_height -= FAST_SAMPLES_Y;
+					row += stride;
+					memcpy(row, ptr, width);
+					j++;
+				}
+				if (j != i + 1)
+					step_edges(active, j - (i + 1));
+
+				__DBG(("%s: vertical edges, full step (%d, %d)\n",
+				       __FUNCTION__,  i, j));
+			}
+		} else {
+			grid_scaled_y_t suby;
+			int min = width, max = 0;
+
+			fill_buckets(active, polygon->y_buckets[i], buckets);
+
+			/* Subsample this row. */
+			memset(ptr, 0, width);
+			for (suby = 0; suby < FAST_SAMPLES_Y; suby++) {
+				if (buckets[suby]) {
+					merge_edges(active, buckets[suby]);
+					buckets[suby] = NULL;
+				}
+
+				inplace_subrow(active, ptr, width, &min, &max);
+			}
+			memset(row, 0, min);
+			if (max > min)
+				inplace_end_subrows(active, row+min, (int8_t*)ptr+min, max-min);
+			if (max < width)
+				memset(row+max, 0, width-max);
+		}
+
+		active->min_height -= FAST_SAMPLES_Y;
+		row += stride;
+	}
+}
+
 struct mono_edge {
 	struct mono_edge *next, *prev;
 
@@ -1936,7 +2272,7 @@ trapezoids_bounds(int n, const xTrapezoid *t, BoxPtr box)
 		if (((x2 - t->right.p1.x) | (x2 - t->right.p2.x)) < 0) {
 			if (pixman_fixed_floor(t->right.p1.x) == pixman_fixed_floor(t->right.p2.x)) {
 				x2 = pixman_fixed_ceil(t->right.p1.x);
-			} else  {
+			} else {
 				if (t->right.p1.y == t->top)
 					fx1 = t->right.p1.x;
 				else
@@ -3007,7 +3343,6 @@ trapezoid_mask_converter(CARD8 op, PicturePtr src, PicturePtr dst,
 			 int ntrap, xTrapezoid *traps)
 {
 	struct tor tor;
-	span_func_t span;
 	ScreenPtr screen = dst->pDrawable->pScreen;
 	PixmapPtr scratch;
 	PicturePtr mask;
@@ -3041,8 +3376,8 @@ trapezoid_mask_converter(CARD8 op, PicturePtr src, PicturePtr dst,
 	if (extents.y1 >= extents.y2 || extents.x1 >= extents.x2)
 		return true;
 
-	DBG(("%s: extents (%d, %d), (%d, %d)\n",
-	     __FUNCTION__, extents.x1, extents.y1, extents.x2, extents.y2));
+	DBG(("%s: ntraps=%d, extents (%d, %d), (%d, %d)\n",
+	     __FUNCTION__, ntrap, extents.x1, extents.y1, extents.x2, extents.y2));
 
 	if (!sna_compute_composite_extents(&extents,
 					   src, NULL, dst,
@@ -3096,15 +3431,18 @@ trapezoid_mask_converter(CARD8 op, PicturePtr src, PicturePtr dst,
 		tor_add_edge(&tor, &t, &t.right, -1);
 	}
 
-	if (maskFormat ? maskFormat->depth < 8 : dst->polyEdge == PolyEdgeSharp)
-		span = tor_blt_mask_mono;
-	else
-		span = tor_blt_mask;
-
-	tor_render(NULL, &tor,
-		   scratch->devPrivate.ptr,
-		   (void *)(intptr_t)scratch->devKind,
-		   span, true);
+	if (extents.x2 <= TOR_INPLACE_SIZE) {
+		uint8_t buf[TOR_INPLACE_SIZE];
+		tor_inplace(&tor, scratch, is_mono(dst, maskFormat),
+			    scratch->usage_hint ? NULL : buf);
+	} else {
+		tor_render(NULL, &tor,
+			   scratch->devPrivate.ptr,
+			   (void *)(intptr_t)scratch->devKind,
+			   is_mono(dst, maskFormat) ? tor_blt_mask_mono : tor_blt_mask,
+			   true);
+	}
+	tor_fini(&tor);
 
 	mask = CreatePicture(0, &scratch->drawable,
 			     PictureMatchFormat(screen, 8, PICT_a8),
@@ -3119,7 +3457,6 @@ trapezoid_mask_converter(CARD8 op, PicturePtr src, PicturePtr dst,
 				 extents.x2, extents.y2);
 		FreePicture(mask, 0);
 	}
-	tor_fini(&tor);
 
 	return true;
 }
@@ -3535,7 +3872,6 @@ trapezoid_span_fallback(CARD8 op, PicturePtr src, PicturePtr dst,
 			int ntrap, xTrapezoid *traps)
 {
 	struct tor tor;
-	span_func_t span;
 	ScreenPtr screen = dst->pDrawable->pScreen;
 	PixmapPtr scratch;
 	PicturePtr mask;
@@ -3569,8 +3905,8 @@ trapezoid_span_fallback(CARD8 op, PicturePtr src, PicturePtr dst,
 	if (extents.y1 >= extents.y2 || extents.x1 >= extents.x2)
 		return true;
 
-	DBG(("%s: extents (%d, %d), (%d, %d)\n",
-	     __FUNCTION__, extents.x1, extents.y1, extents.x2, extents.y2));
+	DBG(("%s: ntraps=%d, extents (%d, %d), (%d, %d)\n",
+	     __FUNCTION__, ntrap, extents.x1, extents.y1, extents.x2, extents.y2));
 
 	if (!sna_compute_composite_extents(&extents,
 					   src, NULL, dst,
@@ -3624,15 +3960,16 @@ trapezoid_span_fallback(CARD8 op, PicturePtr src, PicturePtr dst,
 		tor_add_edge(&tor, &t, &t.right, -1);
 	}
 
-	if (maskFormat ? maskFormat->depth < 8 : dst->polyEdge == PolyEdgeSharp)
-		span = tor_blt_mask_mono;
-	else
-		span = tor_blt_mask;
-
-	tor_render(NULL, &tor,
-		   scratch->devPrivate.ptr,
-		   (void *)(intptr_t)scratch->devKind,
-		   span, true);
+	if (extents.x2 <= TOR_INPLACE_SIZE) {
+		tor_inplace(&tor, scratch, is_mono(dst, maskFormat), NULL);
+	} else {
+		tor_render(NULL, &tor,
+			   scratch->devPrivate.ptr,
+			   (void *)(intptr_t)scratch->devKind,
+			   is_mono(dst, maskFormat) ? tor_blt_mask_mono : tor_blt_mask,
+			   true);
+	}
+	tor_fini(&tor);
 
 	mask = CreatePicture(0, &scratch->drawable,
 			     PictureMatchFormat(screen, 8, PICT_a8),
@@ -3675,7 +4012,6 @@ trapezoid_span_fallback(CARD8 op, PicturePtr src, PicturePtr dst,
 done:
 		FreePicture(mask, 0);
 	}
-	tor_fini(&tor);
 
 	return true;
 }
commit de38e5900ee1c9ca8722606c2fcb4fa69b6b4819
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Mar 9 12:19:33 2012 +0000

    sna: Handle partial reads with a pending clear
    
    Skip the filling of the whole pixmap if we have a small read and we
    know the GPU bo is clear. Also choose to operate inplace on the GPU bo
    if we meet the usual criteria.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 5aad88b..3429438 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -1257,13 +1257,6 @@ sna_drawable_move_region_to_cpu(DrawablePtr drawable,
 		return _sna_pixmap_move_to_cpu(pixmap, flags);
 	}
 
-	if (priv->clear) {
-		DBG(("%s: pending clear, moving whole pixmap\n", __FUNCTION__));
-		if (dx | dy)
-			RegionTranslate(region, -dx, -dy);
-		return _sna_pixmap_move_to_cpu(pixmap, flags | MOVE_READ);
-	}
-
 	if ((flags & MOVE_READ) == 0) {
 		DBG(("%s: no read, checking to see if we can stream the write into the GPU bo\n",
 		     __FUNCTION__));
@@ -1295,6 +1288,7 @@ sna_drawable_move_region_to_cpu(DrawablePtr drawable,
 					sna_damage_add(&priv->gpu_damage,
 						       region);
 
+				priv->clear = false;
 				return true;
 			}
 		}
@@ -1333,6 +1327,7 @@ sna_drawable_move_region_to_cpu(DrawablePtr drawable,
 			} else
 				sna_damage_add(&priv->gpu_damage, region);
 
+			priv->clear = false;
 			return true;
 		}
 	}
@@ -1354,12 +1349,20 @@ sna_drawable_move_region_to_cpu(DrawablePtr drawable,
 			pixmap->devKind = priv->gpu_bo->pitch;
 			if (!DAMAGE_IS_ALL(priv->gpu_damage))
 				sna_damage_add(&priv->gpu_damage, region);
+			priv->clear = false;
 			return true;
 		}
 
 		priv->mapped = false;
 	}
 
+	if (priv->clear && flags & MOVE_WRITE) {
+		DBG(("%s: pending clear, moving whole pixmap for partial write\n", __FUNCTION__));
+		if (dx | dy)
+			RegionTranslate(region, -dx, -dy);
+		return _sna_pixmap_move_to_cpu(pixmap, flags | MOVE_READ);
+	}
+
 	if (priv->mapped) {
 		pixmap->devPrivate.ptr = NULL;
 		priv->mapped = false;
@@ -1372,6 +1375,35 @@ sna_drawable_move_region_to_cpu(DrawablePtr drawable,
 	if (priv->gpu_bo == NULL)
 		goto done;
 
+	if (priv->clear) {
+		int n = REGION_NUM_RECTS(region);
+		BoxPtr box = REGION_RECTS(region);
+
+		DBG(("%s: pending clear, doing partial fill\n", __FUNCTION__));
+		if (priv->cpu_bo) {
+			DBG(("%s: syncing CPU bo\n", __FUNCTION__));
+			kgem_bo_sync__cpu(&sna->kgem, priv->cpu_bo);
+		}
+
+		do {
+			pixman_fill(pixmap->devPrivate.ptr,
+				    pixmap->devKind/sizeof(uint32_t),
+				    pixmap->drawable.bitsPerPixel,
+				    box->x1, box->y1,
+				    box->x2 - box->x1,
+				    box->y2 - box->y1,
+				    priv->clear_color);
+			box++;
+		} while (--n);
+
+		if (region->extents.x2 - region->extents.x1 > 1 ||
+		    region->extents.y2 - region->extents.y1 > 1) {
+			sna_damage_subtract(&priv->gpu_damage, region);
+			priv->clear = false;
+		}
+		goto done;
+	}
+
 	if ((flags & MOVE_READ) == 0) {
 		assert(flags & MOVE_WRITE);
 		sna_damage_subtract(&priv->gpu_damage, region);
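
The partial fill is just pixman_fill() applied to each box of the region
with the recorded clear colour. A minimal standalone sketch of that loop,
with the driver's pixmap fields reduced to plain parameters (names here are
illustrative, not the driver's own):

#include <stdint.h>
#include <pixman.h>

struct box { int16_t x1, y1, x2, y2; };

/* Realise a pending clear only for the boxes actually being read,
 * instead of migrating and filling the whole pixmap.  n >= 1. */
static void
fill_boxes_with_clear(uint32_t *bits, int stride_bytes, int bpp,
		      uint32_t clear_color, const struct box *box, int n)
{
	do {
		pixman_fill(bits,
			    stride_bytes / sizeof(uint32_t), bpp,
			    box->x1, box->y1,
			    box->x2 - box->x1,
			    box->y2 - box->y1,
			    clear_color);
		box++;
	} while (--n);
}
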
commit 9773d1d6c0bdeb5a0e924b378653acc82fb47806
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Mar 9 10:37:34 2012 +0000

    sna/traps: Apply some simple but common operator reductions for clipmasks
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_trapezoids.c b/src/sna/sna_trapezoids.c
index e28c669..8c6cf34 100644
--- a/src/sna/sna_trapezoids.c
+++ b/src/sna/sna_trapezoids.c
@@ -3354,8 +3354,10 @@ trapezoid_span_inplace(CARD8 op, PicturePtr src, PicturePtr dst,
 	struct inplace inplace;
 	span_func_t span;
 	PixmapPtr pixmap;
+	struct sna_pixmap *priv;
 	RegionRec region;
 	uint32_t color;
+	bool unbounded;
 	int16_t dst_x, dst_y;
 	int dx, dy;
 	int n;
@@ -3380,18 +3382,33 @@ trapezoid_span_inplace(CARD8 op, PicturePtr src, PicturePtr dst,
 		return false;
 	}
 
+	pixmap = get_drawable_pixmap(dst->pDrawable);
+	priv = sna_pixmap(pixmap);
+	if (priv == NULL) {
+		DBG(("%s: fallback -- unattached\n", __FUNCTION__));
+		return false;
+	}
+
+	unbounded = false;
 	switch (op) {
 	case PictOpIn:
+		unbounded = true;
+		if (priv->clear && priv->clear_color == 0xff)
+			op = PictOpSrc;
+		break;
 	case PictOpAdd:
+		if (priv->clear && priv->clear_color == 0)
+			op = PictOpSrc;
+		break;
 	case PictOpSrc:
-		if (!fallback && is_gpu(dst->pDrawable))
-			return false;
 		break;
 	default:
 		DBG(("%s: fallback -- can not perform op [%d] in place\n",
 		     __FUNCTION__, op));
 		return false;
 	}
+	if (!fallback && is_gpu(dst->pDrawable))
+		return false;
 
 	DBG(("%s: format=%x, op=%d, color=%x\n",
 	     __FUNCTION__, dst->format, op, color));
@@ -3497,7 +3514,6 @@ trapezoid_span_inplace(CARD8 op, PicturePtr src, PicturePtr dst,
 					     op == PictOpSrc ? MOVE_WRITE : MOVE_WRITE | MOVE_READ))
 		return true;
 
-	pixmap = get_drawable_pixmap(dst->pDrawable);
 	get_drawable_deltas(dst->pDrawable, pixmap, &dst_x, &dst_y);
 
 	inplace.ptr = pixmap->devPrivate.ptr;
@@ -3506,7 +3522,7 @@ trapezoid_span_inplace(CARD8 op, PicturePtr src, PicturePtr dst,
 	inplace.opacity = color >> 24;
 
 	tor_render(NULL, &tor, (void*)&inplace,
-		   dst->pCompositeClip, span, op == PictOpIn);
+		   dst->pCompositeClip, span, unbounded);
 
 	tor_fini(&tor);
 
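
The reductions above hold because the destination is known to be uniform:
IN computes dst' = src * dst, which is just src when dst is 0xff everywhere,
and ADD computes dst' = src + dst, which is src when dst is 0 everywhere.
A sketch of the reduction with stand-in operator codes (the driver uses the
PictOp* values):

#include <stdbool.h>
#include <stdint.h>

enum op { OP_SRC, OP_IN, OP_ADD };	/* stand-ins for PictOp* */

static enum op
reduce_op_for_clear(enum op op, bool clear, uint32_t clear_color)
{
	if (clear) {
		if (op == OP_IN && clear_color == 0xff)
			return OP_SRC;	/* src * 1 == src */
		if (op == OP_ADD && clear_color == 0)
			return OP_SRC;	/* src + 0 == src */
	}
	return op;
}
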
commit 3f40460d0c6d12a6075711e8cd784e4f9fabccad
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Mar 9 09:43:46 2012 +0000

    sna/dri: Only deliver a delayed flip if the drawable is still on the root
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_dri.c b/src/sna/sna_dri.c
index 92132d6..ccaf40f 100644
--- a/src/sna/sna_dri.c
+++ b/src/sna/sna_dri.c
@@ -1053,7 +1053,8 @@ static void sna_dri_flip_event(struct sna *sna,
 				      flip->drawable_id,
 				      serverClient,
 				      M_ANY, DixWriteAccess) == Success) {
-			if (!sna_dri_flip_continue(sna, drawable, flip)) {
+			if (can_flip(sna, drawable, flip->front, flip->back) &&
+			    !sna_dri_flip_continue(sna, drawable, flip)) {
 				DRI2SwapComplete(flip->client, drawable,
 						 0, 0, 0,
 						 DRI2_BLIT_COMPLETE,
commit 527d0d0b82ac17a159785c4ad6698fd3f0654d36
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Mar 9 09:43:24 2012 +0000

    sna/traps: Remove some dead code
    
    This function was never used in this implementation, so remove it.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_trapezoids.c b/src/sna/sna_trapezoids.c
index 727b4d2..e28c669 100644
--- a/src/sna/sna_trapezoids.c
+++ b/src/sna/sna_trapezoids.c
@@ -483,15 +483,6 @@ cell_list_rewind(struct cell_list *cells)
 	cells->cursor = &cells->head;
 }
 
-/* Rewind the cell list if its cursor has been advanced past x. */
-inline static void
-cell_list_maybe_rewind(struct cell_list *cells, int x)
-{
-	struct cell *tail = cells->cursor;
-	if (tail->x > x)
-		cell_list_rewind (cells);
-}
-
 static void
 cell_list_init(struct cell_list *cells)
 {
commit e202a37e85edff370a5d4877682f4e4463f7c29c
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Mar 9 00:37:32 2012 +0000

    sna: Emit an INFO when compiled with debugging enabled
    
    It is useful to know and to receive confirmation that you have
    successfully compiled and executed the driver with debugging enabled.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_driver.c b/src/sna/sna_driver.c
index ddacfd1..7cf3ef1 100644
--- a/src/sna/sna_driver.c
+++ b/src/sna/sna_driver.c
@@ -1074,6 +1074,10 @@ void sna_init_scrn(ScrnInfoPtr scrn, int entity_num)
 	xf86DrvMsg(scrn->scrnIndex, X_INFO,
 		   "SNA compiled: %s\n", BUILDER_DESCRIPTION);
 #endif
+#if HAVE_EXTRA_DEBUG
+	xf86DrvMsg(scrn->scrnIndex, X_INFO,
+		   "SNA compiled with debugging enabled\n");
+#endif
 
 	DBG(("%s\n", __FUNCTION__));
 	DBG(("pixman version: %s\n", pixman_version_string()));
commit 7f30c7c440d46f9398a5349239a0bf732790e6f4
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Mar 8 18:07:22 2012 +0000

    sna/traps: Fix the initialisation of the error term for vertical mono edges
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_trapezoids.c b/src/sna/sna_trapezoids.c
index 8e735ef..727b4d2 100644
--- a/src/sna/sna_trapezoids.c
+++ b/src/sna/sna_trapezoids.c
@@ -776,7 +776,7 @@ polygon_add_line(struct polygon *polygon,
 	if (dx == 0) {
 		e->vertical = true;
 		e->x.quo = p1->x;
-		e->x.rem = 0;
+		e->x.rem = -dy;
 		e->dxdy.quo = 0;
 		e->dxdy.rem = 0;
 	} else {
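
For context: this rasteriser shares its structure with cairo's tor scan
converter, where an edge's x intercept is kept as a quotient plus a
remainder biased into [-dy, 0) so the advance step can test against zero.
Initialising a vertical edge's remainder to 0 instead of -dy left it
outside that convention. A sketch of the biased stepping (a reading of the
convention, not a copy of the driver code):

#include <stdint.h>

struct quorem { int32_t quo, rem; };

/* x is represented as x.quo + x.rem/dy with rem biased into [-dy, 0). */
static void
edge_advance(struct quorem *x, const struct quorem *dxdy, int32_t dy)
{
	x->quo += dxdy->quo;
	x->rem += dxdy->rem;
	if (x->rem >= 0) {	/* remainder overflowed the bias range */
		++x->quo;
		x->rem -= dy;
	}
}
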
commit 15040d00fba09862448f889d24cd678cb6cd7765
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Mar 8 18:06:51 2012 +0000

    sna/traps: Unroll insertion sort
    
    As the compiler cannot know the loop is bounded by a sentinel, manually
    unroll it.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_trapezoids.c b/src/sna/sna_trapezoids.c
index 33ea3bb..8e735ef 100644
--- a/src/sna/sna_trapezoids.c
+++ b/src/sna/sna_trapezoids.c
@@ -556,6 +556,14 @@ cell_list_find(struct cell_list *cells, int x)
 			break;
 
 		tail = tail->next;
+		if (tail->next->x > x)
+			break;
+
+		tail = tail->next;
+		if (tail->next->x > x)
+			break;
+
+		tail = tail->next;
 	} while (1);
 
 	if (tail->x != x)
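
The list is terminated by a sentinel whose x exceeds any real coordinate,
so the walk needs no bounds check and the unrolling simply repeats the
body. A standalone sketch of the pattern:

struct cell { int x; struct cell *next; };

/* Find the last cell with cell->x <= x in a sorted list terminated by
 * a sentinel.  Repeating the body trades code size for fewer
 * loop-control branches, which the compiler cannot remove by itself. */
static struct cell *
find_insertion_point(struct cell *head, int x)
{
	struct cell *tail = head;

	do {
		if (tail->next->x > x)
			break;
		tail = tail->next;

		if (tail->next->x > x)
			break;
		tail = tail->next;

		if (tail->next->x > x)
			break;
		tail = tail->next;
	} while (1);

	return tail;
}
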
commit 4b086eedba12de2bf1fa7be0c544db9f17afdabf
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Mar 8 17:13:39 2012 +0000

    sna/gen6: Replace the memset with explicit initialisation
    
    The profiles told me to kill it...
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/gen6_render.c b/src/sna/gen6_render.c
index 764b629..8a8cdd8 100644
--- a/src/sna/gen6_render.c
+++ b/src/sna/gen6_render.c
@@ -3705,8 +3705,6 @@ gen6_render_fill_boxes(struct sna *sna,
 	     __FUNCTION__, pixel, n,
 	     box[0].x1, box[0].y1, box[0].x2, box[0].y2));
 
-	memset(&tmp, 0, sizeof(tmp));
-
 	tmp.op = op;
 
 	tmp.dst.pixmap = dst;
@@ -3714,16 +3712,21 @@ gen6_render_fill_boxes(struct sna *sna,
 	tmp.dst.height = dst->drawable.height;
 	tmp.dst.format = format;
 	tmp.dst.bo = dst_bo;
+	tmp.dst.x = tmp.dst.y = 0;
 
 	tmp.src.bo = sna_render_get_solid(sna, pixel);
 	tmp.src.filter = SAMPLER_FILTER_NEAREST;
 	tmp.src.repeat = SAMPLER_EXTEND_REPEAT;
 
 	tmp.mask.bo = NULL;
+	tmp.mask.filter = SAMPLER_FILTER_NEAREST;
+	tmp.mask.repeat = SAMPLER_EXTEND_NONE;
 
 	tmp.is_affine = TRUE;
 	tmp.floats_per_vertex = 3;
 	tmp.floats_per_rect = 9;
+	tmp.has_component_alpha = FALSE;
+	tmp.need_magic_ca_pass = FALSE;
 
 	tmp.u.gen6.wm_kernel = GEN6_WM_KERNEL_NOMASK;
 	tmp.u.gen6.nr_surfaces = 2;
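
The memset zeroed the whole composite-op struct on every fill and showed
up in profiles; the replacement writes only the fields this path reads.
A trimmed illustration of the trade (the struct is a stand-in for the
driver's sna_composite_op):

struct fill_op {
	int op;
	int has_component_alpha;
	int need_magic_ca_pass;
	char other_state[512];	/* untouched on the fill path */
};

static void
fill_op_init(struct fill_op *tmp, int op)
{
	/* was: memset(tmp, 0, sizeof(*tmp)); */
	tmp->op = op;
	tmp->has_component_alpha = 0;
	tmp->need_magic_ca_pass = 0;
}
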
commit bd3fa4ba3dca228adff3a0ef03706507d3a80d22
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Mar 8 13:41:58 2012 +0000

    sna: Fix handling of large glyphs following large and shared buffer work
    
    Part of the large buffer handling was to move the decision about
    whether to create a GPU bo for a pixmap to creation time. The single
    instance where we change our minds later involves large glyphs, which
    we choose not to cache.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna.h b/src/sna/sna.h
index 8340345..441b24e 100644
--- a/src/sna/sna.h
+++ b/src/sna/sna.h
@@ -684,7 +684,6 @@ memcpy_xor(const void *src, void *dst, int bpp,
 
 #define SNA_CREATE_FB 0x10
 #define SNA_CREATE_SCRATCH 0x11
-#define SNA_CREATE_GLYPH 0x12
 
 inline static bool is_power_of_two(unsigned x)
 {
diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 08ee537..5aad88b 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -764,7 +764,7 @@ static PixmapPtr sna_create_pixmap(ScreenPtr screen,
 	}
 
 	if (usage == CREATE_PIXMAP_USAGE_GLYPH_PICTURE)
-		goto fallback;
+		flags &= ~KGEM_CAN_CREATE_GPU;
 
 force_create:
 	pad = PixmapBytePad(width, depth);
diff --git a/src/sna/sna_glyphs.c b/src/sna/sna_glyphs.c
index 2733a1a..1c536c8 100644
--- a/src/sna/sna_glyphs.c
+++ b/src/sna/sna_glyphs.c
@@ -315,7 +315,7 @@ glyph_cache(ScreenPtr screen,
 		PixmapPtr pixmap = (PixmapPtr)glyph_picture->pDrawable;
 		assert(glyph_picture->pDrawable->type == DRAWABLE_PIXMAP);
 		if (pixmap->drawable.depth >= 8) {
-			pixmap->usage_hint = SNA_CREATE_GLYPH;
+			pixmap->usage_hint = 0;
 			sna_pixmap_force_to_gpu(pixmap, MOVE_READ);
 		}
 		return FALSE;
diff --git a/src/sna/sna_render.c b/src/sna/sna_render.c
index 421c7ff..d1e3500 100644
--- a/src/sna/sna_render.c
+++ b/src/sna/sna_render.c
@@ -378,8 +378,11 @@ move_to_gpu(PixmapPtr pixmap, const BoxRec *box)
 		bool upload;
 
 		priv = sna_pixmap(pixmap);
-		if (!priv)
+		if (!priv) {
+			DBG(("%s: not migrating unattached pixmap\n",
+			     __FUNCTION__));
 			return false;
+		}
 
 		upload = true;
 		if ((priv->create & KGEM_CAN_CREATE_GPU) == 0 ||
commit 894e2f15d7a4c1a056f1aeb44b3d12d4efbf5b29
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Mar 8 12:27:23 2012 +0000

    sna: Fix reversed logic for CREATE_NO_RETIRE
    
    If the flag is set, we cannot retire, not the other way around!
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index cfa46cf..5773d66 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -2060,7 +2060,7 @@ search_linear_cache(struct kgem *kgem, unsigned int num_pages, unsigned flags)
 		DBG(("%s: inactive and cache bucket empty\n",
 		     __FUNCTION__));
 
-		if ((flags & CREATE_NO_RETIRE) == 0) {
+		if (flags & CREATE_NO_RETIRE) {
 			DBG(("%s: can not retire\n", __FUNCTION__));
 			return NULL;
 		}
commit bf395bfb2f203e24525679b3e717df9c8d29a59e
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Mar 8 12:13:36 2012 +0000

    sna: Avoid NULL dereference in DBG
    
    Only print out the details of the allocated CPU bo if we actually
    allocated it.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index a1798a5..08ee537 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -313,10 +313,10 @@ sna_pixmap_alloc_cpu(struct sna *sna,
 						  pixmap->drawable.height,
 						  pixmap->drawable.bitsPerPixel,
 						  from_gpu ? 0 : CREATE_CPU_MAP | CREATE_INACTIVE);
-		DBG(("%s: allocated CPU handle=%d\n", __FUNCTION__,
-		     priv->cpu_bo->handle));
-
 		if (priv->cpu_bo) {
+			DBG(("%s: allocated CPU handle=%d\n", __FUNCTION__,
+			     priv->cpu_bo->handle));
+
 			priv->ptr = kgem_bo_map__cpu(&sna->kgem, priv->cpu_bo);
 			priv->stride = priv->cpu_bo->pitch;
 		}
commit 7003a4ea61d5c181bfb824af3116d12c9a190f80
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Mar 8 12:10:24 2012 +0000

    sna: Force the creation of a backing pixmap for scanout
    
    Ordinarily, if the GPU is wedged, we just want to create a shadow
    buffer; except that we must still allow a bo to be created for
    attaching to the scanout.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 2dff2ed..cfa46cf 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -2365,7 +2365,7 @@ unsigned kgem_can_create_2d(struct kgem *kgem,
 	uint32_t pitch, size;
 	unsigned flags = 0;
 
-	if (depth < 8 || kgem->wedged)
+	if (depth < 8)
 		return 0;
 
 	if (width > MAXSHORT || height > MAXSHORT)
diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 2769d4b..a1798a5 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -730,6 +730,14 @@ static PixmapPtr sna_create_pixmap(ScreenPtr screen,
 	}
 	assert(width && height);
 
+	if (wedged(sna)) {
+		if (usage == SNA_CREATE_FB) {
+			flags = KGEM_CAN_CREATE_GPU;
+			goto force_create;
+		}
+		goto fallback;
+	}
+
 	flags = kgem_can_create_2d(&sna->kgem, width, height, depth);
 	if (flags == 0) {
 		DBG(("%s: can not use GPU, just creating shadow\n",
@@ -758,6 +766,7 @@ static PixmapPtr sna_create_pixmap(ScreenPtr screen,
 	if (usage == CREATE_PIXMAP_USAGE_GLYPH_PICTURE)
 		goto fallback;
 
+force_create:
 	pad = PixmapBytePad(width, depth);
 	if (pad * height <= 4096) {
 		DBG(("%s: small buffer [%d], attaching to shadow pixmap\n",
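
The policy in brief: when the GPU is wedged every pixmap falls back to a
CPU shadow, except the scanout, which must still get a GPU bo. A compact,
illustrative restatement (the flag is a stand-in for KGEM_CAN_CREATE_GPU):

#define CAN_CREATE_GPU	0x1	/* stand-in for KGEM_CAN_CREATE_GPU */

static unsigned
creation_flags(int wedged, int is_scanout, unsigned kgem_flags)
{
	if (!wedged)
		return kgem_flags;

	/* wedged: only the scanout is forced onto the GPU */
	return is_scanout ? CAN_CREATE_GPU : 0;
}
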
commit aacc36faf3b9f21b08303fcdc247c48146e8e13e
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Mar 8 11:59:02 2012 +0000

    sna: Mark the pixmap for writing when creating the screen resources
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 419d1c6..2769d4b 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -2051,6 +2051,13 @@ sna_pixmap_force_to_gpu(PixmapPtr pixmap, unsigned flags)
 		struct sna *sna = to_sna_from_pixmap(pixmap);
 		unsigned mode;
 
+		DBG(("%s: forcing creation of  gpu bo (%dx%d@%d, flags=%x)\n",
+		     __FUNCTION__,
+		     pixmap->drawable.width,
+		     pixmap->drawable.height,
+		     pixmap->drawable.bitsPerPixel,
+		     priv->create));
+
 		mode = 0;
 		if (priv->cpu_damage && !priv->cpu_bo)
 			mode |= CREATE_INACTIVE;
diff --git a/src/sna/sna_driver.c b/src/sna/sna_driver.c
index e53b75f..ddacfd1 100644
--- a/src/sna/sna_driver.c
+++ b/src/sna/sna_driver.c
@@ -198,7 +198,7 @@ static Bool sna_create_screen_resources(ScreenPtr screen)
 		return FALSE;
 	}
 
-	if (!sna_pixmap_force_to_gpu(sna->front, MOVE_READ)) {
+	if (!sna_pixmap_force_to_gpu(sna->front, MOVE_WRITE)) {
 		xf86DrvMsg(screen->myNum, X_ERROR,
 			   "[intel] Failed to allocate video resources for front buffer %dx%d at depth %d\n",
 			   screen->width,
commit 133eb8f9f47212e62724380911337b1fa2fa8d48
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Mar 8 11:13:34 2012 +0000

    intel: Fix typo s/asert/assert/
    
    The joy of conditional compiles masked this compilation failure when
    testing.
    
    Reported-by: Reinhard Karcher <reinhard.karcher at gmx.net>
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/intel_list.h b/src/intel_list.h
index cbadebf..cfaa1ad 100644
--- a/src/intel_list.h
+++ b/src/intel_list.h
@@ -207,7 +207,7 @@ list_append(struct list *entry, struct list *head)
 static inline void
 __list_del(struct list *prev, struct list *next)
 {
-	asert(next->prev == prev->next);
+	assert(next->prev == prev->next);
 	next->prev = prev;
 	prev->next = next;
 }
commit 616c63aed04bebc78b717cb5100b5ac44da8299e
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Mar 8 11:04:05 2012 +0000

    sna/gen2+: Use the reduced operator from CompositeRectangles
    
    Do not attempt to further reduce the operator locally in each backend as
    the reduction is already performed in the upper layer.
    
    References: https://bugs.freedesktop.org/show_bug.cgi?id=42606
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/gen2_render.c b/src/sna/gen2_render.c
index 597d5f3..6907dd6 100644
--- a/src/sna/gen2_render.c
+++ b/src/sna/gen2_render.c
@@ -2348,30 +2348,24 @@ gen2_render_fill_boxes_try_blt(struct sna *sna,
 			       PixmapPtr dst, struct kgem_bo *dst_bo,
 			       const BoxRec *box, int n)
 {
-	uint8_t alu = GXcopy;
+	uint8_t alu;
 	uint32_t pixel;
 
-	if (!sna_get_pixel_from_rgba(&pixel,
-				     color->red,
-				     color->green,
-				     color->blue,
-				     color->alpha,
-				     format))
+	if (op > PictOpSrc)
 		return FALSE;
 
 	if (op == PictOpClear) {
 		alu = GXclear;
 		pixel = 0;
-		op = PictOpSrc;
-	}
-
-	if (op == PictOpOver) {
-		if ((pixel & 0xff000000) == 0xff000000)
-			op = PictOpSrc;
-	}
-
-	if (op != PictOpSrc)
+	} else if (!sna_get_pixel_from_rgba(&pixel,
+					    color->red,
+					    color->green,
+					    color->blue,
+					    color->alpha,
+					    format))
 		return FALSE;
+	else
+		alu = GXcopy;
 
 	return sna_blt_fill_boxes(sna, alu,
 				  dst_bo, dst->drawable.bitsPerPixel,
diff --git a/src/sna/gen3_render.c b/src/sna/gen3_render.c
index 2a18631..d3ed2ef 100644
--- a/src/sna/gen3_render.c
+++ b/src/sna/gen3_render.c
@@ -4153,7 +4153,7 @@ gen3_render_fill_boxes_try_blt(struct sna *sna,
 			       PixmapPtr dst, struct kgem_bo *dst_bo,
 			       const BoxRec *box, int n)
 {
-	uint8_t alu = GXcopy;
+	uint8_t alu;
 	uint32_t pixel;
 
 	if (dst_bo->tiling == I915_TILING_Y) {
@@ -4162,36 +4162,21 @@ gen3_render_fill_boxes_try_blt(struct sna *sna,
 		return FALSE;
 	}
 
-	if (color->alpha >= 0xff00) {
-		if (op == PictOpOver)
-			op = PictOpSrc;
-		else if (op == PictOpOutReverse)
-			op = PictOpClear;
-		else if (op == PictOpAdd &&
-			 (color->red & color->green & color->blue) >= 0xff00)
-			op = PictOpSrc;
-	}
+	if (op > PictOpSrc)
+		return FALSE;
 
-	pixel = 0;
 	if (op == PictOpClear) {
 		alu = GXclear;
-	} else if (op == PictOpSrc) {
-		if (color->alpha <= 0x00ff)
-			alu = GXclear;
-		else if (!sna_get_pixel_from_rgba(&pixel,
-						  color->red,
-						  color->green,
-						  color->blue,
-						  color->alpha,
-						  format)) {
-			DBG(("%s: unknown format %x\n", __FUNCTION__,
-			     (uint32_t)format));
-			return FALSE;
-		}
-	} else {
-		DBG(("%s: unhandle op %d\n", __FUNCTION__, alu));
+		pixel = 0;
+	} else if (!sna_get_pixel_from_rgba(&pixel,
+					    color->red,
+					    color->green,
+					    color->blue,
+					    color->alpha,
+					    format))
 		return FALSE;
-	}
+	else
+		alu = GXcopy;
 
 	return sna_blt_fill_boxes(sna, alu,
 				  dst_bo, dst->drawable.bitsPerPixel,
diff --git a/src/sna/gen4_render.c b/src/sna/gen4_render.c
index 02454b2..a69852e 100644
--- a/src/sna/gen4_render.c
+++ b/src/sna/gen4_render.c
@@ -2901,29 +2901,24 @@ gen4_render_fill_boxes(struct sna *sna,
 		return FALSE;
 	}
 
-	if (prefer_blt(sna) ||
-	    too_large(dst->drawable.width, dst->drawable.height) ||
-	    !gen4_check_dst_format(format)) {
+	if (op <= PictOpSrc &&
+	    (prefer_blt(sna) ||
+	     too_large(dst->drawable.width, dst->drawable.height) ||
+	     !gen4_check_dst_format(format))) {
 		uint8_t alu = -1;
 
-		if (op == PictOpClear || (op == PictOpOutReverse && color->alpha >= 0xff00))
+		pixel = 0;
+		if (op == PictOpClear)
 			alu = GXclear;
-
-		if (op == PictOpSrc || (op == PictOpOver && color->alpha >= 0xff00)) {
+		else if (sna_get_pixel_from_rgba(&pixel,
+						 color->red,
+						 color->green,
+						 color->blue,
+						 color->alpha,
+						 format))
 			alu = GXcopy;
-			if (color->alpha <= 0x00ff)
-				alu = GXclear;
-		}
 
-		pixel = 0;
-		if ((alu == GXclear ||
-		     (alu == GXcopy &&
-		      sna_get_pixel_from_rgba(&pixel,
-					      color->red,
-					      color->green,
-					      color->blue,
-					      color->alpha,
-					      format))) &&
+		if (alu != -1 &&
 		    sna_blt_fill_boxes(sna, alu,
 				       dst_bo, dst->drawable.bitsPerPixel,
 				       pixel, box, n))
diff --git a/src/sna/gen5_render.c b/src/sna/gen5_render.c
index 6763edf..01604ef 100644
--- a/src/sna/gen5_render.c
+++ b/src/sna/gen5_render.c
@@ -3241,29 +3241,24 @@ gen5_render_fill_boxes(struct sna *sna,
 		return FALSE;
 	}
 
-	if (prefer_blt_fill(sna) ||
-	    too_large(dst->drawable.width, dst->drawable.height) ||
-	    !gen5_check_dst_format(format)) {
+	if (op <= PictOpSrc &&
+	    (prefer_blt_fill(sna) ||
+	     too_large(dst->drawable.width, dst->drawable.height) ||
+	     !gen5_check_dst_format(format))) {
 		uint8_t alu = -1;
 
-		if (op == PictOpClear || (op == PictOpOutReverse && color->alpha >= 0xff00))
+		pixel = 0;
+		if (op == PictOpClear)
 			alu = GXclear;
-
-		if (op == PictOpSrc || (op == PictOpOver && color->alpha >= 0xff00)) {
+		else if (sna_get_pixel_from_rgba(&pixel,
+						 color->red,
+						 color->green,
+						 color->blue,
+						 color->alpha,
+						 format))
 			alu = GXcopy;
-			if (color->alpha <= 0x00ff)
-				alu = GXclear;
-		}
 
-		pixel = 0;
-		if ((alu == GXclear ||
-		     (alu == GXcopy &&
-		      sna_get_pixel_from_rgba(&pixel,
-					      color->red,
-					      color->green,
-					      color->blue,
-					      color->alpha,
-					      format))) &&
+		if (alu != -1 &&
 		    sna_blt_fill_boxes(sna, alu,
 				       dst_bo, dst->drawable.bitsPerPixel,
 				       pixel, box, n))
diff --git a/src/sna/gen6_render.c b/src/sna/gen6_render.c
index 390cb0a..764b629 100644
--- a/src/sna/gen6_render.c
+++ b/src/sna/gen6_render.c
@@ -3655,29 +3655,24 @@ gen6_render_fill_boxes(struct sna *sna,
 		return FALSE;
 	}
 
-	if (prefer_blt_fill(sna, dst_bo) ||
-	    too_large(dst->drawable.width, dst->drawable.height) ||
-	    !gen6_check_dst_format(format)) {
+	if (op <= PictOpSrc &&
+	    (prefer_blt_fill(sna, dst_bo) ||
+	     too_large(dst->drawable.width, dst->drawable.height) ||
+	     !gen6_check_dst_format(format))) {
 		uint8_t alu = -1;
 
-		if (op == PictOpClear || (op == PictOpOutReverse && color->alpha >= 0xff00))
+		pixel = 0;
+		if (op == PictOpClear)
 			alu = GXclear;
-
-		if (op == PictOpSrc || (op == PictOpOver && color->alpha >= 0xff00)) {
+		else if (sna_get_pixel_from_rgba(&pixel,
+						 color->red,
+						 color->green,
+						 color->blue,
+						 color->alpha,
+						 format))
 			alu = GXcopy;
-			if (color->alpha <= 0x00ff)
-				alu = GXclear;
-		}
 
-		pixel = 0;
-		if ((alu == GXclear ||
-		     (alu == GXcopy &&
-		      sna_get_pixel_from_rgba(&pixel,
-					      color->red,
-					      color->green,
-					      color->blue,
-					      color->alpha,
-					      format))) &&
+		if (alu != -1 &&
 		    sna_blt_fill_boxes(sna, alu,
 				       dst_bo, dst->drawable.bitsPerPixel,
 				       pixel, box, n))
diff --git a/src/sna/gen7_render.c b/src/sna/gen7_render.c
index 2b3f67b..36ea8a1 100644
--- a/src/sna/gen7_render.c
+++ b/src/sna/gen7_render.c
@@ -3738,29 +3738,24 @@ gen7_render_fill_boxes(struct sna *sna,
 		return FALSE;
 	}
 
-	if (prefer_blt_fill(sna, dst_bo) ||
-	    too_large(dst->drawable.width, dst->drawable.height) ||
-	    !gen7_check_dst_format(format)) {
+	if (op <= PictOpSrc &&
+	    (prefer_blt_fill(sna, dst_bo) ||
+	     too_large(dst->drawable.width, dst->drawable.height) ||
+	     !gen7_check_dst_format(format))) {
 		uint8_t alu = -1;
 
-		if (op == PictOpClear || (op == PictOpOutReverse && color->alpha >= 0xff00))
+		pixel = 0;
+		if (op == PictOpClear)
 			alu = GXclear;
-
-		if (op == PictOpSrc || (op == PictOpOver && color->alpha >= 0xff00)) {
+		else if (sna_get_pixel_from_rgba(&pixel,
+						 color->red,
+						 color->green,
+						 color->blue,
+						 color->alpha,
+						 format))
 			alu = GXcopy;
-			if (color->alpha <= 0x00ff)
-				alu = GXclear;
-		}
 
-		pixel = 0;
-		if ((alu == GXclear ||
-		     (alu == GXcopy &&
-		      sna_get_pixel_from_rgba(&pixel,
-					      color->red,
-					      color->green,
-					      color->blue,
-					      color->alpha,
-					      format))) &&
+		if (alu != -1 &&
 		    sna_blt_fill_boxes(sna, alu,
 				       dst_bo, dst->drawable.bitsPerPixel,
 				       pixel, box, n))
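
With the reduction already performed by sna_composite_rectangles, every
backend's fill_boxes collapses to the same two-way mapping. A sketch with
stand-in codes (got_pixel stands in for the result of
sna_get_pixel_from_rgba):

#include <stdbool.h>
#include <stdint.h>

enum op  { OP_CLEAR, OP_SRC, OP_OVER /* , ... */ };
enum alu { ALU_INVALID = -1, ALU_CLEAR, ALU_COPY };

static enum alu
fill_alu(enum op op, bool got_pixel, uint32_t *pixel)
{
	if (op > OP_SRC)
		return ALU_INVALID;	/* leave it to the render path */

	if (op == OP_CLEAR) {
		*pixel = 0;
		return ALU_CLEAR;
	}

	/* OP_SRC: usable only if the colour converted to a pixel */
	return got_pixel ? ALU_COPY : ALU_INVALID;
}
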
commit 0b1f4e6e6b3476cee7bbd1f9a5f61d9ee92690af
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Mar 8 10:51:05 2012 +0000

    sna: Tidy marking pixmap->clear for CompositeRectangles
    
    Reduce the two unsightly checks into one.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_composite.c b/src/sna/sna_composite.c
index 38ff99d..a610e7c 100644
--- a/src/sna/sna_composite.c
+++ b/src/sna/sna_composite.c
@@ -781,45 +781,29 @@ sna_composite_rectangles(CARD8		 op,
 	/* Clearing a pixmap after creation is a common operation, so take
 	 * advantage and reduce further damage operations.
 	 */
+	if (region.data == NULL &&
+	    region.extents.x2 - region.extents.x1 == pixmap->drawable.width &&
+	    region.extents.y2 - region.extents.y1 == pixmap->drawable.height) {
+		sna_damage_all(&priv->gpu_damage,
+			       pixmap->drawable.width, pixmap->drawable.height);
+		priv->undamaged = false;
+		if (op <= PictOpSrc) {
+			priv->clear = true;
+			priv->clear_color = 0;
+			if (op == PictOpSrc)
+				sna_get_pixel_from_rgba(&priv->clear_color,
+							color->red,
+							color->green,
+							color->blue,
+							color->alpha,
+							dst->format);
+			DBG(("%s: marking clear [%08x]\n",
+			     __FUNCTION__, priv->clear_color));
+		}
+	}
 	if (!DAMAGE_IS_ALL(priv->gpu_damage)) {
 		assert_pixmap_contains_box(pixmap, RegionExtents(&region));
-
-		if (region.data == NULL &&
-		    region.extents.x2 - region.extents.x1 == pixmap->drawable.width &&
-		    region.extents.y2 - region.extents.y1 == pixmap->drawable.height) {
-			sna_damage_all(&priv->gpu_damage,
-				       pixmap->drawable.width, pixmap->drawable.height);
-			priv->undamaged = false;
-			if (op <= PictOpSrc) {
-				priv->clear = true;
-				priv->clear_color = 0;
-				if (op == PictOpSrc)
-					sna_get_pixel_from_rgba(&priv->clear_color,
-								color->red,
-								color->green,
-								color->blue,
-								color->alpha,
-								dst->format);
-				DBG(("%s: marking clear [%08x]\n",
-				     __FUNCTION__, priv->clear_color));
-			}
-		} else
-			sna_damage_add(&priv->gpu_damage, &region);
-	} else if (op <= PictOpSrc &&
-		   region.data == NULL &&
-		   region.extents.x2 - region.extents.x1 == pixmap->drawable.width &&
-		   region.extents.y2 - region.extents.y1 == pixmap->drawable.height) {
-		priv->clear = true;
-		priv->clear_color = 0;
-		if (op == PictOpSrc)
-			sna_get_pixel_from_rgba(&priv->clear_color,
-						color->red,
-						color->green,
-						color->blue,
-						color->alpha,
-						dst->format);
-		DBG(("%s: marking clear [%08x]\n",
-		     __FUNCTION__, priv->clear_color));
+		sna_damage_add(&priv->gpu_damage, &region);
 	}
 
 	goto done;
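
The hoisted test is simply: a single rectangle covering the whole pixmap,
with an operator already reduced to Clear or Src, leaves every pixel at one
known value. A sketch of the predicate:

struct rect { int x1, y1, x2, y2; };

/* A fill marks the pixmap "clear" when one rectangle covers it
 * entirely; the colour is then known everywhere and later partial
 * reads can be short-circuited (see the partial-read commit above). */
static int
fill_covers_pixmap(const struct rect *extents, int region_has_data,
		   int width, int height)
{
	return !region_has_data &&
	       extents->x2 - extents->x1 == width &&
	       extents->y2 - extents->y1 == height;
}
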
commit dafcf4d2881f1477941cbefe2de5a2ee2cf6645d
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Mar 8 10:50:32 2012 +0000

    sna: Add some assertions around pixmap creation for render operations
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index e961c2c..419d1c6 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -2023,6 +2023,7 @@ sna_pixmap_create_upload(ScreenPtr screen,
 static inline struct sna_pixmap *
 sna_pixmap_mark_active(struct sna *sna, struct sna_pixmap *priv)
 {
+	assert(priv->gpu_bo);
 	if (!priv->pinned && (priv->create & KGEM_CAN_CREATE_LARGE) == 0)
 		list_move(&priv->inactive, &sna->active_pixmaps);
 	priv->clear = false;
@@ -2051,7 +2052,7 @@ sna_pixmap_force_to_gpu(PixmapPtr pixmap, unsigned flags)
 		unsigned mode;
 
 		mode = 0;
-		if (priv->cpu_damage)
+		if (priv->cpu_damage && !priv->cpu_bo)
 			mode |= CREATE_INACTIVE;
 		if (pixmap->usage_hint == SNA_CREATE_FB)
 			mode |= CREATE_EXACT | CREATE_SCANOUT;
@@ -2132,14 +2133,25 @@ sna_pixmap_move_to_gpu(PixmapPtr pixmap, unsigned flags)
 	sna_damage_reduce(&priv->cpu_damage);
 	DBG(("%s: CPU damage? %d\n", __FUNCTION__, priv->cpu_damage != NULL));
 	if (priv->gpu_bo == NULL) {
-		if (!wedged(sna) && priv->create & KGEM_CAN_CREATE_GPU)
+		DBG(("%s: creating GPU bo (%dx%d@%d), create=%x\n",
+		     __FUNCTION__,
+		     pixmap->drawable.width,
+		     pixmap->drawable.height,
+		     pixmap->drawable.bitsPerPixel,
+		     priv->create));
+		assert(!priv->mapped);
+		if (!wedged(sna) && priv->create & KGEM_CAN_CREATE_GPU) {
+			assert(pixmap->drawable.width > 0);
+			assert(pixmap->drawable.height > 0);
+			assert(pixmap->drawable.bitsPerPixel >= 8);
 			priv->gpu_bo =
 				kgem_create_2d(&sna->kgem,
 					       pixmap->drawable.width,
 					       pixmap->drawable.height,
 					       pixmap->drawable.bitsPerPixel,
 					       sna_pixmap_choose_tiling(pixmap),
-					       priv->cpu_damage ? CREATE_GTT_MAP | CREATE_INACTIVE : 0);
+					       (priv->cpu_damage && priv->cpu_bo == NULL) ? CREATE_GTT_MAP | CREATE_INACTIVE : 0);
+		}
 		if (priv->gpu_bo == NULL) {
 			DBG(("%s: not creating GPU bo\n", __FUNCTION__));
 			assert(list_is_empty(&priv->list));
@@ -2179,6 +2191,7 @@ sna_pixmap_move_to_gpu(PixmapPtr pixmap, unsigned flags)
 		Bool ok;
 
 		assert(pixmap_contains_damage(pixmap, priv->cpu_damage));
+		DBG(("%s: uploading %d damage boxes\n", __FUNCTION__, n));
 
 		ok = FALSE;
 		if (priv->cpu_bo)
@@ -2225,7 +2238,7 @@ done:
 		}
 	}
 active:
-	return sna_pixmap_mark_active(to_sna_from_pixmap(pixmap), priv);
+	return sna_pixmap_mark_active(sna, priv);
 }
 
 static bool must_check sna_validate_pixmap(DrawablePtr draw, PixmapPtr pixmap)
commit 6307edb1c34c5dcfe52a8a3066a0adbe1f80369b
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Mar 8 09:54:16 2012 +0000

    sna: Discard unbound partial buffers
    
    Instead of keeping a virgin partial buffer around on the inactive list,
    just transfer it to the global bo cache (in actuality, destroy it,
    since it is just a kmalloc with no pages bound).
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index b036d26..2dff2ed 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -1246,13 +1246,12 @@ static void kgem_retire_partials(struct kgem *kgem)
 
 		assert(bo->base.refcnt == 1);
 		assert(bo->base.exec == NULL);
-		if (!bo->mmapped) {
+		if (!bo->mmapped || bo->base.presumed_offset == 0) {
 			list_del(&bo->base.list);
 			kgem_bo_unref(kgem, &bo->base);
 			continue;
 		}
 
-		assert(kgem->has_llc || !IS_CPU_MAP(bo->base.map));
 		bo->base.dirty = false;
 		bo->base.needs_flush = false;
 		bo->used = 0;
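
The retire test reduces to two conditions: the buffer must be mmapped, and
a zero presumed offset is taken as the sign that it was never bound to the
GPU. A sketch of the predicate (field names follow the diff, the helper
itself is illustrative):

#include <stdbool.h>
#include <stdint.h>

/* Keep a retired partial buffer only if reusing it saves real work:
 * an unbound buffer is just a kmalloc with no pages attached. */
static bool
partial_worth_keeping(bool mmapped, uint64_t presumed_offset)
{
	return mmapped && presumed_offset != 0;
}
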
commit fa7d9ab65dd77b01aff83ab918a3125d0370dcf1
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Mar 8 09:42:58 2012 +0000

    sna: Preserve the offset alignment when trimming unused rows from partials
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 8e3de97..b036d26 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -2062,7 +2062,7 @@ search_linear_cache(struct kgem *kgem, unsigned int num_pages, unsigned flags)
 		     __FUNCTION__));
 
 		if ((flags & CREATE_NO_RETIRE) == 0) {
-			DBG(("%s: can not retire\n"));
+			DBG(("%s: can not retire\n", __FUNCTION__));
 			return NULL;
 		}
 
@@ -3340,7 +3340,7 @@ void kgem_sync(struct kgem *kgem)
 	list_for_each_entry(bo, &kgem->sync_list, list)
 		kgem_bo_sync__cpu(kgem, bo);
 
-	assert (kgem->sync == NULL);
+	assert(kgem->sync == NULL);
 }
 
 void kgem_clear_dirty(struct kgem *kgem)
@@ -3812,15 +3812,23 @@ struct kgem_bo *kgem_create_buffer_2d(struct kgem *kgem,
 
 	if (height & 1) {
 		struct kgem_partial_bo *io = (struct kgem_partial_bo *)bo->proxy;
+		int min;
+
+		assert(io->used);
 
 		/* Having padded this surface to ensure that accesses to
 		 * the last pair of rows is valid, remove the padding so
 		 * that it can be allocated to other pixmaps.
 		 */
-		if (io->used)
-			io->used -= stride;
+		min = bo->delta + height * stride;
+		min = ALIGN(min, 64);
+		if (io->used != min) {
+			DBG(("%s: trimming partial buffer from %d to %d\n",
+			     __FUNCTION__, io->used, min));
+			io->used = min;
+			bubble_sort_partial(&kgem->active_partials, io);
+		}
 		bo->size.bytes -= stride;
-		bubble_sort_partial(&kgem->active_partials, io);
 	}
 
 	bo->pitch = stride;
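
The trim now recomputes the in-use size from the surface itself and rounds
it back up to the allocator's 64-byte granularity, rather than blindly
subtracting one stride. A sketch of the computation:

#define ALIGN_UP(v, a)	(((v) + (a) - 1) & ~((a) - 1))

/* Drop the padding row added for odd-height surfaces, but keep the
 * in-use size 64-byte aligned so later suballocations stay aligned. */
static int
trimmed_used(int delta, int height, int stride)
{
	return ALIGN_UP(delta + height * stride, 64);
}
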
commit 14846dfa9104133349fd624e523a84302bfc86bc
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Mar 8 02:09:25 2012 +0000

    sna: Be careful not to reduce operators for superluminal colors
    
    wine-1.4 is one such example of a crazy application.
    
    References: https://bugs.freedesktop.org/show_bug.cgi?id=42606
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_composite.c b/src/sna/sna_composite.c
index 225008f..38ff99d 100644
--- a/src/sna/sna_composite.c
+++ b/src/sna/sna_composite.c
@@ -644,7 +644,7 @@ sna_composite_rectangles(CARD8		 op,
 		return;
 	}
 
-	if (color->alpha <= 0x00ff) {
+	if ((color->red|color->green|color->blue|color->alpha) <= 0x00ff) {
 		switch (op) {
 		case PictOpOver:
 		case PictOpOutReverse:
@@ -661,6 +661,22 @@ sna_composite_rectangles(CARD8		 op,
 			op = PictOpOverReverse;
 			break;
 		}
+	}
+	if (color->alpha <= 0x00ff) {
+		switch (op) {
+		case PictOpOver:
+		case PictOpOutReverse:
+			return;
+		case  PictOpInReverse:
+			op = PictOpClear;
+			break;
+		case  PictOpAtopReverse:
+			op = PictOpOut;
+			break;
+		case  PictOpXor:
+			op = PictOpOverReverse;
+			break;
+		}
 	} else if (color->alpha >= 0xff00) {
 		switch (op) {
 		case PictOpOver:
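
With premultiplied colour every channel should satisfy channel <= alpha;
wine-1.4 hands over colours where that fails ("superluminal"). A source may
only be treated as nothing-to-draw when every channel is negligible, not
merely alpha, hence the widened test:

#include <stdbool.h>
#include <stdint.h>

/* In premultiplied colour, alpha ~ 0 normally implies the pixel is
 * ~ 0 -- unless the channels are out of range.  Only reduce when all
 * four components are negligible. */
static bool
source_is_negligible(uint16_t red, uint16_t green,
		     uint16_t blue, uint16_t alpha)
{
	return (red | green | blue | alpha) <= 0x00ff;
}
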
commit e9041174156f6596251f00bbedd69d5653237172
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Mar 8 08:54:24 2012 +0000

    sna/dri: Use a counter for the number of DRI drawables attached to a pixmap
    
    The root pixmap, for instance, may have unique DRI2Drawables for each
    inferior window. We only want to clear the flush flag on the last
    release, so we need to keep a count of how many DRI drawables remain
    attached rather than a solitary flag.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna.h b/src/sna/sna.h
index 119244d..8340345 100644
--- a/src/sna/sna.h
+++ b/src/sna/sna.h
@@ -129,12 +129,12 @@ struct sna_pixmap {
 
 	uint32_t stride;
 	uint32_t clear_color;
+	unsigned flush;
 
 #define SOURCE_BIAS 4
 	uint16_t source_count;
 	uint8_t pinned :1;
 	uint8_t mapped :1;
-	uint8_t flush :1;
 	uint8_t clear :1;
 	uint8_t undamaged :1;
 	uint8_t create :3;
diff --git a/src/sna/sna_dri.c b/src/sna/sna_dri.c
index 3909b84..92132d6 100644
--- a/src/sna/sna_dri.c
+++ b/src/sna/sna_dri.c
@@ -163,7 +163,7 @@ static struct kgem_bo *sna_pixmap_set_dri(struct sna *sna,
 	if (priv == NULL)
 		return NULL;
 
-	if (priv->flush)
+	if (priv->flush++)
 		return priv->gpu_bo;
 
 	tiling = color_tiling(sna, &pixmap->drawable);
@@ -177,7 +177,6 @@ static struct kgem_bo *sna_pixmap_set_dri(struct sna *sna,
 	 *
 	 * As we don't track which Client, we flush for all.
 	 */
-	priv->flush = 1;
 	sna_accel_watch_flush(sna, 1);
 
 	/* Don't allow this named buffer to be replaced */
@@ -324,10 +323,12 @@ static void _sna_dri_destroy_buffer(struct sna *sna, DRI2Buffer2Ptr buffer)
 			struct sna_pixmap *priv = sna_pixmap(private->pixmap);
 
 			/* Undo the DRI markings on this pixmap */
-			list_del(&priv->list);
-			sna_accel_watch_flush(sna, -1);
-			priv->pinned = private->pixmap == sna->front;
-			priv->flush = 0;
+			assert(priv->flush > 0);
+			if (--priv->flush == 0) {
+				list_del(&priv->list);
+				sna_accel_watch_flush(sna, -1);
+				priv->pinned = private->pixmap == sna->front;
+			}
 
 			screen->DestroyPixmap(private->pixmap);
 		}
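
The flag-to-counter change is an ordinary reference count; a minimal
sketch, with the watch_flush bookkeeping reduced to the two boolean
transitions:

#include <assert.h>

struct dri_pixmap { unsigned flush; };

/* Returns true on the first attach: start flushing this pixmap. */
static int
dri_pixmap_attach(struct dri_pixmap *p)
{
	return p->flush++ == 0;
}

/* Returns true on the last detach: stop flushing and unpin. */
static int
dri_pixmap_detach(struct dri_pixmap *p)
{
	assert(p->flush > 0);
	return --p->flush == 0;
}
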
commit 7a571d780138ad444ef1d2d9fd2d8afb3c00e9cf
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Mar 7 15:52:41 2012 +0000

    sna/gen2+: Prefer not to fall back if the source is busy
    
    If we try to perform the operation while there are outstanding
    operations on the source pixmaps, we will stall waiting for them to
    complete.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/gen2_render.c b/src/sna/gen2_render.c
index 57bb835..597d5f3 100644
--- a/src/sna/gen2_render.c
+++ b/src/sna/gen2_render.c
@@ -1512,16 +1512,37 @@ need_upload(PicturePtr p)
 }
 
 static bool
-source_fallback(PicturePtr p)
+source_is_busy(PixmapPtr pixmap)
+{
+	struct sna_pixmap *priv = sna_pixmap(pixmap);
+	if (priv == NULL)
+		return false;
+
+	if (priv->clear)
+		return false;
+
+	if (priv->gpu_bo && kgem_bo_is_busy(priv->gpu_bo))
+		return true;
+
+	if (priv->cpu_bo && kgem_bo_is_busy(priv->cpu_bo))
+		return true;
+
+	return priv->gpu_damage && !priv->cpu_damage;
+}
+
+static bool
+source_fallback(PicturePtr p, PixmapPtr pixmap)
 {
 	if (sna_picture_is_solid(p, NULL))
 		return false;
 
-	return (has_alphamap(p) ||
-		is_unhandled_gradient(p) ||
-		!gen2_check_filter(p) ||
-		!gen2_check_repeat(p) ||
-		need_upload(p));
+	if (is_unhandled_gradient(p) || !gen2_check_repeat(p))
+		return true;
+
+	if (pixmap && source_is_busy(pixmap))
+		return false;
+
+	return has_alphamap(p) || !gen2_check_filter(p) || need_upload(p);
 }
 
 static bool
@@ -1534,6 +1555,7 @@ gen2_composite_fallback(struct sna *sna,
 	PixmapPtr src_pixmap;
 	PixmapPtr mask_pixmap;
 	PixmapPtr dst_pixmap;
+	bool src_fallback, mask_fallback;
 
 	if (!gen2_check_dst_format(dst->format)) {
 		DBG(("%s: unknown destination format: %d\n",
@@ -1542,18 +1564,27 @@ gen2_composite_fallback(struct sna *sna,
 	}
 
 	dst_pixmap = get_drawable_pixmap(dst->pDrawable);
+
 	src_pixmap = src->pDrawable ? get_drawable_pixmap(src->pDrawable) : NULL;
-	mask_pixmap = (mask && mask->pDrawable) ? get_drawable_pixmap(mask->pDrawable) : NULL;
+	src_fallback = source_fallback(src, src_pixmap);
+
+	if (mask) {
+		mask_pixmap = mask->pDrawable ? get_drawable_pixmap(mask->pDrawable) : NULL;
+		mask_fallback = source_fallback(mask, mask_pixmap);
+	} else {
+		mask_pixmap = NULL;
+		mask_fallback = false;
+	}
 
 	/* If we are using the destination as a source and need to
 	 * readback in order to upload the source, do it all
 	 * on the cpu.
 	 */
-	if (src_pixmap == dst_pixmap && source_fallback(src)) {
+	if (src_pixmap == dst_pixmap && src_fallback) {
 		DBG(("%s: src is dst and will fallback\n",__FUNCTION__));
 		return TRUE;
 	}
-	if (mask_pixmap == dst_pixmap && source_fallback(mask)) {
+	if (mask_pixmap == dst_pixmap && mask_fallback) {
 		DBG(("%s: mask is dst and will fallback\n",__FUNCTION__));
 		return TRUE;
 	}
@@ -1566,34 +1597,28 @@ gen2_composite_fallback(struct sna *sna,
 		return FALSE;
 	}
 
-	if (src_pixmap && !source_fallback(src)) {
-		priv = sna_pixmap(src_pixmap);
-		if (priv && priv->gpu_damage && !priv->cpu_damage) {
-			DBG(("%s: src is already on the GPU, try to use GPU\n",
-			     __FUNCTION__));
-			return FALSE;
-		}
+	if (src_pixmap && !src_fallback) {
+		DBG(("%s: src is already on the GPU, try to use GPU\n",
+		     __FUNCTION__));
+		return FALSE;
 	}
-	if (mask_pixmap && !source_fallback(mask)) {
-		priv = sna_pixmap(mask_pixmap);
-		if (priv && priv->gpu_damage && !priv->cpu_damage) {
-			DBG(("%s: mask is already on the GPU, try to use GPU\n",
-			     __FUNCTION__));
-			return FALSE;
-		}
+	if (mask_pixmap && !mask_fallback) {
+		DBG(("%s: mask is already on the GPU, try to use GPU\n",
+		     __FUNCTION__));
+		return FALSE;
 	}
 
 	/* However if the dst is not on the GPU and we need to
 	 * render one of the sources using the CPU, we may
 	 * as well do the entire operation in place on the CPU.
 	 */
-	if (source_fallback(src)) {
+	if (src_fallback) {
 		DBG(("%s: dst is on the CPU and src will fallback\n",
 		     __FUNCTION__));
 		return TRUE;
 	}
 
-	if (mask && source_fallback(mask)) {
+	if (mask && mask_fallback) {
 		DBG(("%s: dst is on the CPU and mask will fallback\n",
 		     __FUNCTION__));
 		return TRUE;
diff --git a/src/sna/gen3_render.c b/src/sna/gen3_render.c
index 78c7ea0..2a18631 100644
--- a/src/sna/gen3_render.c
+++ b/src/sna/gen3_render.c
@@ -2511,16 +2511,37 @@ need_upload(PicturePtr p)
 }
 
 static bool
-source_fallback(PicturePtr p)
+source_is_busy(PixmapPtr pixmap)
+{
+	struct sna_pixmap *priv = sna_pixmap(pixmap);
+	if (priv == NULL)
+		return false;
+
+	if (priv->clear)
+		return false;
+
+	if (priv->gpu_bo && kgem_bo_is_busy(priv->gpu_bo))
+		return true;
+
+	if (priv->cpu_bo && kgem_bo_is_busy(priv->cpu_bo))
+		return true;
+
+	return priv->gpu_damage && !priv->cpu_damage;
+}
+
+static bool
+source_fallback(PicturePtr p, PixmapPtr pixmap)
 {
 	if (sna_picture_is_solid(p, NULL))
 		return false;
 
-	return (has_alphamap(p) ||
-		!gen3_check_xformat(p) ||
-		!gen3_check_filter(p) ||
-		!gen3_check_repeat(p) ||
-		need_upload(p));
+	if (!gen3_check_xformat(p) || !gen3_check_repeat(p))
+		return true;
+
+	if (pixmap && source_is_busy(pixmap))
+		return false;
+
+	return has_alphamap(p) || !gen3_check_filter(p) || need_upload(p);
 }
 
 static bool
@@ -2534,6 +2555,7 @@ gen3_composite_fallback(struct sna *sna,
 	PixmapPtr src_pixmap;
 	PixmapPtr mask_pixmap;
 	PixmapPtr dst_pixmap;
+	bool src_fallback, mask_fallback;
 
 	if (!gen3_check_dst_format(dst->format)) {
 		DBG(("%s: unknown destination format: %d\n",
@@ -2542,18 +2564,27 @@ gen3_composite_fallback(struct sna *sna,
 	}
 
 	dst_pixmap = get_drawable_pixmap(dst->pDrawable);
+
 	src_pixmap = src->pDrawable ? get_drawable_pixmap(src->pDrawable) : NULL;
-	mask_pixmap = (mask && mask->pDrawable) ? get_drawable_pixmap(mask->pDrawable) : NULL;
+	src_fallback = source_fallback(src, src_pixmap);
+
+	if (mask) {
+		mask_pixmap = mask->pDrawable ? get_drawable_pixmap(mask->pDrawable) : NULL;
+		mask_fallback = source_fallback(mask, mask_pixmap);
+	} else {
+		mask_pixmap = NULL;
+		mask_fallback = false;
+	}
 
 	/* If we are using the destination as a source and need to
 	 * readback in order to upload the source, do it all
 	 * on the cpu.
 	 */
-	if (src_pixmap == dst_pixmap && source_fallback(src)) {
+	if (src_pixmap == dst_pixmap && src_fallback) {
 		DBG(("%s: src is dst and will fallback\n",__FUNCTION__));
 		return TRUE;
 	}
-	if (mask_pixmap == dst_pixmap && source_fallback(mask)) {
+	if (mask_pixmap == dst_pixmap && mask_fallback) {
 		DBG(("%s: mask is dst and will fallback\n",__FUNCTION__));
 		return TRUE;
 	}
@@ -2575,38 +2606,28 @@ gen3_composite_fallback(struct sna *sna,
 		return FALSE;
 	}
 
-	if (src_pixmap && !source_fallback(src)) {
-		priv = sna_pixmap(src_pixmap);
-		if (priv &&
-		    ((priv->gpu_damage && !priv->cpu_damage) ||
-		     (priv->cpu_bo && priv->cpu_bo->domain != DOMAIN_CPU))) {
-			DBG(("%s: src is already on the GPU, try to use GPU\n",
-			     __FUNCTION__));
-			return FALSE;
-		}
+	if (src_pixmap && !src_fallback) {
+		DBG(("%s: src is already on the GPU, try to use GPU\n",
+		     __FUNCTION__));
+		return FALSE;
 	}
-	if (mask_pixmap && !source_fallback(mask)) {
-		priv = sna_pixmap(mask_pixmap);
-		if (priv &&
-		    ((priv->gpu_damage && !priv->cpu_damage) ||
-		     (priv->cpu_bo && priv->cpu_bo->domain != DOMAIN_CPU))) {
-			DBG(("%s: mask is already on the GPU, try to use GPU\n",
-			     __FUNCTION__));
-			return FALSE;
-		}
+	if (mask_pixmap && !mask_fallback) {
+		DBG(("%s: mask is already on the GPU, try to use GPU\n",
+		     __FUNCTION__));
+		return FALSE;
 	}
 
 	/* However if the dst is not on the GPU and we need to
 	 * render one of the sources using the CPU, we may
 	 * as well do the entire operation in place on the CPU.
 	 */
-	if (source_fallback(src)) {
+	if (src_fallback) {
 		DBG(("%s: dst is on the CPU and src will fallback\n",
 		     __FUNCTION__));
 		return TRUE;
 	}
 
-	if (mask && source_fallback(mask)) {
+	if (mask && mask_fallback) {
 		DBG(("%s: dst is on the CPU and mask will fallback\n",
 		     __FUNCTION__));
 		return TRUE;
diff --git a/src/sna/gen4_render.c b/src/sna/gen4_render.c
index c3a82a3..02454b2 100644
--- a/src/sna/gen4_render.c
+++ b/src/sna/gen4_render.c
@@ -2122,11 +2122,8 @@ try_blt(struct sna *sna,
 }
 
 static bool
-is_gradient(PicturePtr picture)
+check_gradient(PicturePtr picture)
 {
-	if (picture->pDrawable)
-		return FALSE;
-
 	switch (picture->pSourcePict->type) {
 	case SourcePictTypeSolidFill:
 	case SourcePictTypeLinear:
@@ -2155,17 +2152,38 @@ need_upload(PicturePtr p)
 }
 
 static bool
-source_fallback(PicturePtr p)
+source_is_busy(PixmapPtr pixmap)
+{
+	struct sna_pixmap *priv = sna_pixmap(pixmap);
+	if (priv == NULL)
+		return false;
+
+	if (priv->clear)
+		return false;
+
+	if (priv->gpu_bo && kgem_bo_is_busy(priv->gpu_bo))
+		return true;
+
+	return priv->gpu_damage && !priv->cpu_damage;
+}
+
+static bool
+source_fallback(PicturePtr p, PixmapPtr pixmap)
 {
 	if (sna_picture_is_solid(p, NULL))
 		return false;
 
-	return (has_alphamap(p) ||
-		is_gradient(p) ||
-		!gen4_check_filter(p) ||
-		!gen4_check_repeat(p) ||
-		!gen4_check_format(p->format) ||
-		need_upload(p));
+	if (p->pSourcePict)
+		return check_gradient(p);
+
+	if (!gen4_check_repeat(p) || !gen4_check_format(p->format))
+		return true;
+
+	/* soft errors: prefer to upload/compute rather than readback */
+	if (pixmap && source_is_busy(pixmap))
+		return false;
+
+	return has_alphamap(p) || !gen4_check_filter(p) || need_upload(p);
 }
 
 static bool
@@ -2178,6 +2196,7 @@ gen4_composite_fallback(struct sna *sna,
 	PixmapPtr src_pixmap;
 	PixmapPtr mask_pixmap;
 	PixmapPtr dst_pixmap;
+	bool src_fallback, mask_fallback;
 
 	if (!gen4_check_dst_format(dst->format)) {
 		DBG(("%s: unknown destination format: %d\n",
@@ -2186,18 +2205,27 @@ gen4_composite_fallback(struct sna *sna,
 	}
 
 	dst_pixmap = get_drawable_pixmap(dst->pDrawable);
+
 	src_pixmap = src->pDrawable ? get_drawable_pixmap(src->pDrawable) : NULL;
-	mask_pixmap = (mask && mask->pDrawable) ? get_drawable_pixmap(mask->pDrawable) : NULL;
+	src_fallback = source_fallback(src, src_pixmap);
+
+	if (mask) {
+		mask_pixmap = mask->pDrawable ? get_drawable_pixmap(mask->pDrawable) : NULL;
+		mask_fallback = source_fallback(mask, mask_pixmap);
+	} else {
+		mask_pixmap = NULL;
+		mask_fallback = false;
+	}
 
 	/* If we are using the destination as a source and need to
 	 * readback in order to upload the source, do it all
 	 * on the cpu.
 	 */
-	if (src_pixmap == dst_pixmap && source_fallback(src)) {
+	if (src_pixmap == dst_pixmap && src_fallback) {
 		DBG(("%s: src is dst and will fallback\n",__FUNCTION__));
 		return TRUE;
 	}
-	if (mask_pixmap == dst_pixmap && source_fallback(mask)) {
+	if (mask_pixmap == dst_pixmap && mask_fallback) {
 		DBG(("%s: mask is dst and will fallback\n",__FUNCTION__));
 		return TRUE;
 	}
@@ -2210,34 +2238,28 @@ gen4_composite_fallback(struct sna *sna,
 		return FALSE;
 	}
 
-	if (src_pixmap && !source_fallback(src)) {
-		priv = sna_pixmap(src_pixmap);
-		if (priv && priv->gpu_damage && !priv->cpu_damage) {
-			DBG(("%s: src is already on the GPU, try to use GPU\n",
-			     __FUNCTION__));
-			return FALSE;
-		}
+	if (!src_fallback) {
+		DBG(("%s: src is already on the GPU, try to use GPU\n",
+		     __FUNCTION__));
+		return FALSE;
 	}
-	if (mask_pixmap && !source_fallback(mask)) {
-		priv = sna_pixmap(mask_pixmap);
-		if (priv && priv->gpu_damage && !priv->cpu_damage) {
-			DBG(("%s: mask is already on the GPU, try to use GPU\n",
-			     __FUNCTION__));
-			return FALSE;
-		}
+	if (mask && !mask_fallback) {
+		DBG(("%s: mask is already on the GPU, try to use GPU\n",
+		     __FUNCTION__));
+		return FALSE;
 	}
 
 	/* However if the dst is not on the GPU and we need to
 	 * render one of the sources using the CPU, we may
 	 * as well do the entire operation in place on the CPU.
 	 */
-	if (source_fallback(src)) {
+	if (src_fallback) {
 		DBG(("%s: dst is on the CPU and src will fallback\n",
 		     __FUNCTION__));
 		return TRUE;
 	}
 
-	if (mask && source_fallback(mask)) {
+	if (mask && mask_fallback) {
 		DBG(("%s: dst is on the CPU and mask will fallback\n",
 		     __FUNCTION__));
 		return TRUE;
diff --git a/src/sna/gen5_render.c b/src/sna/gen5_render.c
index bce5a3c..6763edf 100644
--- a/src/sna/gen5_render.c
+++ b/src/sna/gen5_render.c
@@ -2198,17 +2198,39 @@ need_upload(PicturePtr p)
 }
 
 static bool
-source_fallback(PicturePtr p)
+source_is_busy(PixmapPtr pixmap)
+{
+	struct sna_pixmap *priv = sna_pixmap(pixmap);
+	if (priv == NULL)
+		return false;
+
+	if (priv->clear)
+		return false;
+
+	if (priv->gpu_bo && kgem_bo_is_busy(priv->gpu_bo))
+		return true;
+
+	if (priv->cpu_bo && kgem_bo_is_busy(priv->cpu_bo))
+		return true;
+
+	return priv->gpu_damage && !priv->cpu_damage;
+}
+
+static bool
+source_fallback(PicturePtr p, PixmapPtr pixmap)
 {
 	if (sna_picture_is_solid(p, NULL))
 		return false;
 
-	return (has_alphamap(p) ||
-		is_gradient(p) ||
-		!gen5_check_filter(p) ||
-		!gen5_check_repeat(p) ||
-		!gen5_check_format(p->format) ||
-		need_upload(p));
+	if (is_gradient(p) ||
+	    !gen5_check_repeat(p) ||
+	    !gen5_check_format(p->format))
+		return true;
+
+	if (pixmap && source_is_busy(pixmap))
+		return false;
+
+	return has_alphamap(p) || !gen5_check_filter(p) || need_upload(p);
 }
 
 static bool
@@ -2221,6 +2243,7 @@ gen5_composite_fallback(struct sna *sna,
 	PixmapPtr src_pixmap;
 	PixmapPtr mask_pixmap;
 	PixmapPtr dst_pixmap;
+	bool src_fallback, mask_fallback;
 
 	if (!gen5_check_dst_format(dst->format)) {
 		DBG(("%s: unknown destination format: %d\n",
@@ -2229,18 +2252,27 @@ gen5_composite_fallback(struct sna *sna,
 	}
 
 	dst_pixmap = get_drawable_pixmap(dst->pDrawable);
+
 	src_pixmap = src->pDrawable ? get_drawable_pixmap(src->pDrawable) : NULL;
-	mask_pixmap = (mask && mask->pDrawable) ? get_drawable_pixmap(mask->pDrawable) : NULL;
+	src_fallback = source_fallback(src, src_pixmap);
+
+	if (mask) {
+		mask_pixmap = mask->pDrawable ? get_drawable_pixmap(mask->pDrawable) : NULL;
+		mask_fallback = source_fallback(mask, mask_pixmap);
+	} else {
+		mask_pixmap = NULL;
+		mask_fallback = false;
+	}
 
 	/* If we are using the destination as a source and need to
 	 * readback in order to upload the source, do it all
 	 * on the cpu.
 	 */
-	if (src_pixmap == dst_pixmap && source_fallback(src)) {
+	if (src_pixmap == dst_pixmap && src_fallback) {
 		DBG(("%s: src is dst and will fallback\n",__FUNCTION__));
 		return TRUE;
 	}
-	if (mask_pixmap == dst_pixmap && source_fallback(mask)) {
+	if (mask_pixmap == dst_pixmap && mask_fallback) {
 		DBG(("%s: mask is dst and will fallback\n",__FUNCTION__));
 		return TRUE;
 	}
@@ -2253,34 +2285,28 @@ gen5_composite_fallback(struct sna *sna,
 		return FALSE;
 	}
 
-	if (src_pixmap && !source_fallback(src)) {
-		priv = sna_pixmap(src_pixmap);
-		if (priv && priv->gpu_damage && !priv->cpu_damage) {
-			DBG(("%s: src is already on the GPU, try to use GPU\n",
-			     __FUNCTION__));
-			return FALSE;
-		}
+	if (src_pixmap && !src_fallback) {
+		DBG(("%s: src is already on the GPU, try to use GPU\n",
+		     __FUNCTION__));
+		return FALSE;
 	}
-	if (mask_pixmap && !source_fallback(mask)) {
-		priv = sna_pixmap(mask_pixmap);
-		if (priv && priv->gpu_damage && !priv->cpu_damage) {
-			DBG(("%s: mask is already on the GPU, try to use GPU\n",
-			     __FUNCTION__));
-			return FALSE;
-		}
+	if (mask_pixmap && !mask_fallback) {
+		DBG(("%s: mask is already on the GPU, try to use GPU\n",
+		     __FUNCTION__));
+		return FALSE;
 	}
 
 	/* However if the dst is not on the GPU and we need to
 	 * render one of the sources using the CPU, we may
 	 * as well do the entire operation in place on the CPU.
 	 */
-	if (source_fallback(src)) {
+	if (src_fallback) {
 		DBG(("%s: dst is on the CPU and src will fallback\n",
 		     __FUNCTION__));
 		return TRUE;
 	}
 
-	if (mask && source_fallback(mask)) {
+	if (mask && mask_fallback) {
 		DBG(("%s: dst is on the CPU and mask will fallback\n",
 		     __FUNCTION__));
 		return TRUE;
diff --git a/src/sna/gen6_render.c b/src/sna/gen6_render.c
index 9eb4221..390cb0a 100644
--- a/src/sna/gen6_render.c
+++ b/src/sna/gen6_render.c
@@ -2374,7 +2374,7 @@ try_blt(struct sna *sna,
 }
 
 static bool
-is_gradient(PicturePtr picture)
+check_gradient(PicturePtr picture)
 {
 	if (picture->pDrawable)
 		return FALSE;
@@ -2407,17 +2407,37 @@ need_upload(PicturePtr p)
 }
 
 static bool
-source_fallback(PicturePtr p)
+source_is_busy(PixmapPtr pixmap)
+{
+	struct sna_pixmap *priv = sna_pixmap(pixmap);
+	if (priv == NULL || priv->clear)
+		return false;
+
+	if (priv->gpu_bo && kgem_bo_is_busy(priv->gpu_bo))
+		return true;
+
+	if (priv->cpu_bo && kgem_bo_is_busy(priv->cpu_bo))
+		return true;
+
+	return priv->gpu_damage && !priv->cpu_damage;
+}
+
+static bool
+source_fallback(PicturePtr p, PixmapPtr pixmap)
 {
 	if (sna_picture_is_solid(p, NULL))
 		return false;
 
-	return (has_alphamap(p) ||
-		is_gradient(p) ||
-		!gen6_check_filter(p) ||
-		!gen6_check_repeat(p) ||
-		!gen6_check_format(p->format) ||
-		need_upload(p));
+	if (p->pSourcePict)
+		return check_gradient(p);
+
+	if (!gen6_check_repeat(p) || !gen6_check_format(p->format))
+		return true;
+
+	if (pixmap && source_is_busy(pixmap))
+		return false;
+
+	return has_alphamap(p) || !gen6_check_filter(p) || need_upload(p);
 }
 
 static bool
@@ -2430,6 +2450,7 @@ gen6_composite_fallback(struct sna *sna,
 	PixmapPtr src_pixmap;
 	PixmapPtr mask_pixmap;
 	PixmapPtr dst_pixmap;
+	bool src_fallback, mask_fallback;
 
 	if (!gen6_check_dst_format(dst->format)) {
 		DBG(("%s: unknown destination format: %d\n",
@@ -2438,18 +2459,27 @@ gen6_composite_fallback(struct sna *sna,
 	}
 
 	dst_pixmap = get_drawable_pixmap(dst->pDrawable);
+
 	src_pixmap = src->pDrawable ? get_drawable_pixmap(src->pDrawable) : NULL;
-	mask_pixmap = (mask && mask->pDrawable) ? get_drawable_pixmap(mask->pDrawable) : NULL;
+	src_fallback = source_fallback(src, src_pixmap);
+
+	if (mask) {
+		mask_pixmap = mask->pDrawable ? get_drawable_pixmap(mask->pDrawable) : NULL;
+		mask_fallback = source_fallback(mask, mask_pixmap);
+	} else {
+		mask_pixmap = NULL;
+		mask_fallback = false;
+	}
 
 	/* If we are using the destination as a source and need to
 	 * readback in order to upload the source, do it all
 	 * on the cpu.
 	 */
-	if (src_pixmap == dst_pixmap && source_fallback(src)) {
+	if (src_pixmap == dst_pixmap && src_fallback) {
 		DBG(("%s: src is dst and will fallback\n",__FUNCTION__));
 		return TRUE;
 	}
-	if (mask_pixmap == dst_pixmap && source_fallback(mask)) {
+	if (mask_pixmap == dst_pixmap && mask_fallback) {
 		DBG(("%s: mask is dst and will fallback\n",__FUNCTION__));
 		return TRUE;
 	}
@@ -2464,34 +2494,28 @@ gen6_composite_fallback(struct sna *sna,
 		return FALSE;
 	}
 
-	if (src_pixmap && !source_fallback(src)) {
-		priv = sna_pixmap(src_pixmap);
-		if (priv && priv->gpu_damage && !priv->cpu_damage) {
-			DBG(("%s: src is already on the GPU, try to use GPU\n",
-			     __FUNCTION__));
-			return FALSE;
-		}
+	if (src_pixmap && !src_fallback) {
+		DBG(("%s: src is already on the GPU, try to use GPU\n",
+		     __FUNCTION__));
+		return FALSE;
 	}
-	if (mask_pixmap && !source_fallback(mask)) {
-		priv = sna_pixmap(mask_pixmap);
-		if (priv && priv->gpu_damage && !priv->cpu_damage) {
-			DBG(("%s: mask is already on the GPU, try to use GPU\n",
-			     __FUNCTION__));
-			return FALSE;
-		}
+	if (mask_pixmap && !mask_fallback) {
+		DBG(("%s: mask is already on the GPU, try to use GPU\n",
+		     __FUNCTION__));
+		return FALSE;
 	}
 
 	/* However if the dst is not on the GPU and we need to
 	 * render one of the sources using the CPU, we may
 	 * as well do the entire operation in place on the CPU.
 	 */
-	if (source_fallback(src)) {
+	if (src_fallback) {
 		DBG(("%s: dst is on the CPU and src will fallback\n",
 		     __FUNCTION__));
 		return TRUE;
 	}
 
-	if (mask && source_fallback(mask)) {
+	if (mask && mask_fallback) {
 		DBG(("%s: dst is on the CPU and mask will fallback\n",
 		     __FUNCTION__));
 		return TRUE;
diff --git a/src/sna/gen7_render.c b/src/sna/gen7_render.c
index 5829ae3..2b3f67b 100644
--- a/src/sna/gen7_render.c
+++ b/src/sna/gen7_render.c
@@ -2487,17 +2487,36 @@ need_upload(PicturePtr p)
 }
 
 static bool
-source_fallback(PicturePtr p)
+source_is_busy(PixmapPtr pixmap)
+{
+	struct sna_pixmap *priv = sna_pixmap(pixmap);
+	if (priv == NULL || priv->clear)
+		return false;
+
+	if (priv->gpu_bo && kgem_bo_is_busy(priv->gpu_bo))
+		return true;
+
+	if (priv->cpu_bo && kgem_bo_is_busy(priv->cpu_bo))
+		return true;
+
+	return priv->gpu_damage && !priv->cpu_damage;
+}
+
+static bool
+source_fallback(PicturePtr p, PixmapPtr pixmap)
 {
 	if (sna_picture_is_solid(p, NULL))
 		return false;
 
-	return (has_alphamap(p) ||
-		is_gradient(p) ||
-		!gen7_check_filter(p) ||
-		!gen7_check_repeat(p) ||
-		!gen7_check_format(p->format) ||
-		need_upload(p));
+	if (is_gradient(p) ||
+	    !gen7_check_repeat(p) ||
+	    !gen7_check_format(p->format))
+		return true;
+
+	if (pixmap && source_is_busy(pixmap))
+		return false;
+
+	return has_alphamap(p) || !gen7_check_filter(p) || need_upload(p);
 }
 
 static bool
@@ -2510,6 +2529,7 @@ gen7_composite_fallback(struct sna *sna,
 	PixmapPtr src_pixmap;
 	PixmapPtr mask_pixmap;
 	PixmapPtr dst_pixmap;
+	bool src_fallback, mask_fallback;
 
 	if (!gen7_check_dst_format(dst->format)) {
 		DBG(("%s: unknown destination format: %d\n",
@@ -2518,18 +2538,27 @@ gen7_composite_fallback(struct sna *sna,
 	}
 
 	dst_pixmap = get_drawable_pixmap(dst->pDrawable);
+
 	src_pixmap = src->pDrawable ? get_drawable_pixmap(src->pDrawable) : NULL;
-	mask_pixmap = (mask && mask->pDrawable) ? get_drawable_pixmap(mask->pDrawable) : NULL;
+	src_fallback = source_fallback(src, src_pixmap);
+
+	if (mask) {
+		mask_pixmap = mask->pDrawable ? get_drawable_pixmap(mask->pDrawable) : NULL;
+		mask_fallback = source_fallback(mask, mask_pixmap);
+	} else {
+		mask_pixmap = NULL;
+		mask_fallback = false;
+	}
 
 	/* If we are using the destination as a source and need to
 	 * readback in order to upload the source, do it all
 	 * on the cpu.
 	 */
-	if (src_pixmap == dst_pixmap && source_fallback(src)) {
+	if (src_pixmap == dst_pixmap && src_fallback) {
 		DBG(("%s: src is dst and will fallback\n",__FUNCTION__));
 		return TRUE;
 	}
-	if (mask_pixmap == dst_pixmap && source_fallback(mask)) {
+	if (mask_pixmap == dst_pixmap && mask_fallback) {
 		DBG(("%s: mask is dst and will fallback\n",__FUNCTION__));
 		return TRUE;
 	}
@@ -2544,34 +2573,28 @@ gen7_composite_fallback(struct sna *sna,
 		return FALSE;
 	}
 
-	if (src_pixmap && !source_fallback(src)) {
-		priv = sna_pixmap(src_pixmap);
-		if (priv && priv->gpu_damage && !priv->cpu_damage) {
-			DBG(("%s: src is already on the GPU, try to use GPU\n",
-			     __FUNCTION__));
-			return FALSE;
-		}
+	if (src_pixmap && !src_fallback) {
+		DBG(("%s: src is already on the GPU, try to use GPU\n",
+		     __FUNCTION__));
+		return FALSE;
 	}
-	if (mask_pixmap && !source_fallback(mask)) {
-		priv = sna_pixmap(mask_pixmap);
-		if (priv && priv->gpu_damage && !priv->cpu_damage) {
-			DBG(("%s: mask is already on the GPU, try to use GPU\n",
-			     __FUNCTION__));
-			return FALSE;
-		}
+	if (mask_pixmap && !mask_fallback) {
+		DBG(("%s: mask is already on the GPU, try to use GPU\n",
+		     __FUNCTION__));
+		return FALSE;
 	}
 
 	/* However if the dst is not on the GPU and we need to
 	 * render one of the sources using the CPU, we may
 	 * as well do the entire operation in place on the CPU.
 	 */
-	if (source_fallback(src)) {
+	if (src_fallback) {
 		DBG(("%s: dst is on the CPU and src will fallback\n",
 		     __FUNCTION__));
 		return TRUE;
 	}
 
-	if (mask && source_fallback(mask)) {
+	if (mask && mask_fallback) {
 		DBG(("%s: dst is on the CPU and mask will fallback\n",
 		     __FUNCTION__));
 		return TRUE;
diff --git a/src/sna/kgem.h b/src/sna/kgem.h
index 96d945e..6c31f33 100644
--- a/src/sna/kgem.h
+++ b/src/sna/kgem.h
@@ -450,7 +450,6 @@ static inline bool kgem_bo_is_busy(struct kgem_bo *bo)
 {
 	DBG_HDR(("%s: domain: %d exec? %d, rq? %d\n",
 		 __FUNCTION__, bo->domain, bo->exec != NULL, bo->rq != NULL));
-	assert(bo->proxy == NULL);
 	return bo->rq;
 }
 
diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index ce3afae..e961c2c 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -871,10 +871,10 @@ _sna_pixmap_move_to_cpu(PixmapPtr pixmap, unsigned int flags)
 		return true;
 	}
 
-	DBG(("%s: gpu_bo=%d, gpu_damage=%p\n",
+	DBG(("%s: gpu_bo=%d, gpu_damage=%p, cpu_damage=%p, is-clear?=%d\n",
 	     __FUNCTION__,
 	     priv->gpu_bo ? priv->gpu_bo->handle : 0,
-	     priv->gpu_damage));
+	     priv->gpu_damage, priv->cpu_damage, priv->clear));
 
 	if ((flags & MOVE_READ) == 0) {
 		assert(flags & MOVE_WRITE);
diff --git a/src/sna/sna_render.c b/src/sna/sna_render.c
index 74c04af..421c7ff 100644
--- a/src/sna/sna_render.c
+++ b/src/sna/sna_render.c
@@ -1418,7 +1418,7 @@ sna_render_picture_fixup(struct sna *sna,
 
 	if (picture->alphaMap) {
 		DBG(("%s: alphamap\n", __FUNCTION__));
-		if ((is_gpu(picture->pDrawable) || is_gpu(picture->alphaMap->pDrawable))) {
+		if (is_gpu(picture->pDrawable) || is_gpu(picture->alphaMap->pDrawable)) {
 			return sna_render_picture_flatten(sna, picture, channel,
 							  x, y, w, h, dst_x, dst_y);
 		}
@@ -1428,7 +1428,7 @@ sna_render_picture_fixup(struct sna *sna,
 
 	if (picture->filter == PictFilterConvolution) {
 		DBG(("%s: convolution\n", __FUNCTION__));
-		if (picture->pDrawable && is_gpu(picture->pDrawable)) {
+		if (is_gpu(picture->pDrawable)) {
 			return sna_render_picture_convolve(sna, picture, channel,
 							   x, y, w, h, dst_x, dst_y);
 		}
diff --git a/src/sna/sna_render_inline.h b/src/sna/sna_render_inline.h
index 6c8f66a..a523fed 100644
--- a/src/sna/sna_render_inline.h
+++ b/src/sna/sna_render_inline.h
@@ -72,10 +72,10 @@ is_gpu(DrawablePtr drawable)
 {
 	struct sna_pixmap *priv = sna_pixmap_from_drawable(drawable);
 
-	if (priv == NULL)
+	if (priv == NULL || priv->clear)
 		return false;
 
-	if (priv->gpu_damage)
+	if (priv->gpu_damage || (priv->gpu_bo && kgem_bo_is_busy(priv->gpu_bo)))
 		return true;
 
 	return priv->cpu_bo && kgem_bo_is_busy(priv->cpu_bo);
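
The same rework is applied to gen6 and gen7 above; read as a single decision ladder it amounts to the following (a condensed sketch only, using the server's Bool and ignoring the format/filter/repeat vetting and the NULL-pixmap details the real code handles):

static Bool
composite_falls_back(Bool dst_is_gpu,
		     Bool src_is_dst, Bool src_fallback,
		     Bool mask_is_dst, Bool mask_fallback)
{
	if (src_is_dst && src_fallback)
		return TRUE;	/* would read back dst just to upload src */
	if (mask_is_dst && mask_fallback)
		return TRUE;
	if (dst_is_gpu)
		return FALSE;	/* dst already resident: stay on the GPU */
	if (!src_fallback)
		return FALSE;	/* a GPU-friendly (or busy) source wins */
	if (!mask_fallback)
		return FALSE;
	return TRUE;		/* everything prefers the CPU */
}
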
commit 3ccd87f88919ca9fd2bffa250f7d78903430b4e6
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Mar 8 00:57:12 2012 +0000

    sna: Do not reset partial buffers if they are not attached to the current batch
    
    As we may be holding on to them as an active mapping whilst they are
    executing, resetting the used counter back to zero in this case can cause
    corruption.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index e4dcbb2..8e3de97 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -1464,10 +1464,6 @@ static void kgem_finish_partials(struct kgem *kgem)
 		assert(bo->base.refcnt >= 1);
 
 		if (!bo->base.exec) {
-			if (bo->base.refcnt == 1 && bo->used) {
-				bo->used = 0;
-				bubble_sort_partial(&kgem->active_partials, bo);
-			}
 			DBG(("%s: skipping unattached handle=%d, used=%d\n",
 			     __FUNCTION__, bo->base.handle, bo->used));
 			continue;
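
The failure mode being closed is subtle enough to spell out. In kgem_finish_partials(), a partial buffer that is not attached to the batch being submitted may still back a live mapping that an earlier, still-executing batch is reading; an illustrative fragment (comments only, mirroring the hunk above, not new driver code):

	if (!bo->base.exec) {
		/* Resetting bo->used to 0 here would mark the in-flight
		 * bytes as free, so the next upload could land on top of
		 * data the GPU has yet to read -- hence the buffer is now
		 * simply skipped, untouched. */
		continue;
	}
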
commit d114dcc901738641100687196c404f9a51e59c18
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Mar 7 15:45:21 2012 +0000

    sna: Convolution filter fixes
    
    A couple of typos made the convolution filter explode rather than
    convolve.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/gen4_render.c b/src/sna/gen4_render.c
index 404e12b..c3a82a3 100644
--- a/src/sna/gen4_render.c
+++ b/src/sna/gen4_render.c
@@ -681,7 +681,10 @@ static uint32_t gen4_check_filter(PicturePtr picture)
 	case PictFilterBilinear:
 		return TRUE;
 	default:
-		DBG(("%s: unknown filter: %d\n", __FUNCTION__, picture->filter));
+		DBG(("%s: unknown filter: %s [%d]\n",
+		     __FUNCTION__,
+		     PictureGetFilterName(picture->filter),
+		     picture->filter));
 		return FALSE;
 	}
 }
diff --git a/src/sna/sna_render.c b/src/sna/sna_render.c
index a345962..74c04af 100644
--- a/src/sna/sna_render.c
+++ b/src/sna/sna_render.c
@@ -1234,15 +1234,18 @@ sna_render_picture_convolve(struct sna *sna,
 	PixmapPtr pixmap;
 	PicturePtr tmp;
 	pixman_fixed_t *params = picture->filter_params;
-	int x_off = (params[0] - pixman_fixed_1) >> 1;
-	int y_off = (params[1] - pixman_fixed_1) >> 1;
+	int x_off = -pixman_fixed_to_int((params[0] - pixman_fixed_1) >> 1);
+	int y_off = -pixman_fixed_to_int((params[1] - pixman_fixed_1) >> 1);
 	int cw = pixman_fixed_to_int(params[0]);
 	int ch = pixman_fixed_to_int(params[1]);
 	int i, j, error, depth;
+	struct kgem_bo *bo;
 
 	/* Lame multi-pass accumulation implementation of a general convolution
 	 * that works everywhere.
 	 */
+	DBG(("%s: origin=(%d,%d) kernel=%dx%d, size=%dx%d\n",
+	     __FUNCTION__, x_off, y_off, cw, ch, w, h));
 
 	assert(picture->pDrawable);
 	assert(picture->filter == PictFilterConvolution);
@@ -1267,8 +1270,10 @@ sna_render_picture_convolve(struct sna *sna,
 	if (tmp == NULL)
 		return 0;
 
-	if (!sna->render.fill_one(sna, pixmap, sna_pixmap_get_bo(pixmap), 0,
-				  0, 0, w, h, GXclear)) {
+	ValidatePicture(tmp);
+
+	bo = sna_pixmap_get_bo(pixmap);
+	if (!sna->render.clear(sna, pixmap, bo)) {
 		FreePicture(tmp, 0);
 		return 0;
 	}
@@ -1282,6 +1287,8 @@ sna_render_picture_convolve(struct sna *sna,
 
 			color.alpha = *params++;
 			color.red = color.green = color.blue = 0;
+			DBG(("%s: (%d, %d), alpha=%x\n",
+			     __FUNCTION__, i,j, color.alpha));
 
 			if (color.alpha <= 0x00ff)
 				continue;
@@ -1291,7 +1298,7 @@ sna_render_picture_convolve(struct sna *sna,
 				sna_composite(PictOpAdd, picture, alpha, tmp,
 					      x, y,
 					      0, 0,
-					      x_off-i, y_off-j,
+					      x_off+i, y_off+j,
 					      w, h);
 				FreePicture(alpha, 0);
 			}
@@ -1309,7 +1316,7 @@ sna_render_picture_convolve(struct sna *sna,
 	channel->scale[1] = 1.f / h;
 	channel->offset[0] = -dst_x;
 	channel->offset[1] = -dst_y;
-	channel->bo = kgem_bo_reference(sna_pixmap_get_bo(pixmap));
+	channel->bo = kgem_bo_reference(bo); /* transfer ownership */
 	FreePicture(tmp, 0);
 
 	return 1;
@@ -1423,7 +1430,7 @@ sna_render_picture_fixup(struct sna *sna,
 		DBG(("%s: convolution\n", __FUNCTION__));
 		if (picture->pDrawable && is_gpu(picture->pDrawable)) {
 			return sna_render_picture_convolve(sna, picture, channel,
-							   x, y, w, y, dst_x, dst_y);
+							   x, y, w, h, dst_x, dst_y);
 		}
 
 		goto do_fixup;
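
For reference, the multi-pass scheme these fixes repair performs one composite per kernel weight. A minimal CPU-side restatement of the same arithmetic (a hypothetical helper over single-channel floats; the driver additionally skips near-zero weights and accumulates via an a8 solid alpha picture):

#include <string.h>

/* tmp(X,Y) = sum over (i,j) of k(i,j) * src(X - x_off - i, Y - y_off - j),
 * with x_off = -(cw-1)/2 and y_off = -(ch-1)/2 centring the kernel --
 * exactly what the PictOpAdd loop above accumulates on the GPU. */
static void
convolve_reference(const float *src, float *tmp, int w, int h,
		   const float *kernel, int cw, int ch)
{
	int x_off = -(cw - 1) / 2, y_off = -(ch - 1) / 2;
	int i, j, X, Y;

	memset(tmp, 0, sizeof(*tmp) * w * h);
	for (j = 0; j < ch; j++)
		for (i = 0; i < cw; i++)
			for (Y = 0; Y < h; Y++)
				for (X = 0; X < w; X++) {
					int sx = X - x_off - i;
					int sy = Y - y_off - j;
					if (sx >= 0 && sx < w &&
					    sy >= 0 && sy < h)
						tmp[Y*w + X] +=
							kernel[j*cw + i] *
							src[sy*w + sx];
				}
}
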
commit c4e4153c2ab62765fda5c1aca053578a2fe3fba7
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Mar 7 17:49:01 2012 +0000

    sna: Avoid recursive calls to kgem_retire_partials()
    
    Whilst iterating the partial list and uploading the buffers, we need to
    avoid triggering a recursive call into retire should we attempt to shrink a
    buffer. Such a recursive call will modify the list beneath us so that we
    chase a stale pointer and wreak havoc with memory corruption.
    
    Reported-by: Clemens Eisserer <linuxhippy at gmail.com>
    Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=47061
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/intel_list.h b/src/intel_list.h
index 366b9e8..cbadebf 100644
--- a/src/intel_list.h
+++ b/src/intel_list.h
@@ -207,8 +207,9 @@ list_append(struct list *entry, struct list *head)
 static inline void
 __list_del(struct list *prev, struct list *next)
 {
-    next->prev = prev;
-    prev->next = next;
+	assert(next->prev == prev->next);
+	next->prev = prev;
+	prev->next = next;
 }
 
 static inline void
diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 279face..e4dcbb2 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -121,22 +121,47 @@ static bool validate_partials(struct kgem *kgem)
 
 	list_for_each_entry_safe(bo, next, &kgem->active_partials, base.list) {
 		assert(next->base.list.prev == &bo->base.list);
+		assert(bo->base.refcnt >= 1);
 		assert(bo->base.io);
-		if (bo->base.list.next == &kgem->active_partials)
-			return true;
+
+		if (&next->base.list == &kgem->active_partials)
+			break;
+
 		if (bytes(&bo->base) - bo->used < bytes(&next->base) - next->used) {
-			ErrorF("this rem: %d, next rem: %d\n",
+			ErrorF("active error: this rem: %d, next rem: %d\n",
 			       bytes(&bo->base) - bo->used,
 			       bytes(&next->base) - next->used);
 			goto err;
 		}
 	}
+
+	list_for_each_entry_safe(bo, next, &kgem->inactive_partials, base.list) {
+		assert(next->base.list.prev == &bo->base.list);
+		assert(bo->base.io);
+		assert(bo->base.refcnt == 1);
+
+		if (&next->base.list == &kgem->inactive_partials)
+			break;
+
+		if (bytes(&bo->base) - bo->used < bytes(&next->base) - next->used) {
+			ErrorF("inactive error: this rem: %d, next rem: %d\n",
+			       bytes(&bo->base) - bo->used,
+			       bytes(&next->base) - next->used);
+			goto err;
+		}
+	}
+
 	return true;
 
 err:
+	ErrorF("active partials:\n");
 	list_for_each_entry(bo, &kgem->active_partials, base.list)
-		ErrorF("bo: used=%d / %d, rem=%d\n",
-		       bo->used, bytes(&bo->base), bytes(&bo->base) - bo->used);
+		ErrorF("bo handle=%d: used=%d / %d, rem=%d\n",
+		       bo->base.handle, bo->used, bytes(&bo->base), bytes(&bo->base) - bo->used);
+	ErrorF("inactive partials:\n");
+	list_for_each_entry(bo, &kgem->inactive_partials, base.list)
+		ErrorF("bo handle=%d: used=%d / %d, rem=%d\n",
+		       bo->base.handle, bo->used, bytes(&bo->base), bytes(&bo->base) - bo->used);
 	return false;
 }
 #else
@@ -1220,7 +1245,13 @@ static void kgem_retire_partials(struct kgem *kgem)
 		     bo->base.handle, bo->used, bytes(&bo->base)));
 
 		assert(bo->base.refcnt == 1);
-		assert(bo->mmapped);
+		assert(bo->base.exec == NULL);
+		if (!bo->mmapped) {
+			list_del(&bo->base.list);
+			kgem_bo_unref(kgem, &bo->base);
+			continue;
+		}
+
 		assert(kgem->has_llc || !IS_CPU_MAP(bo->base.map));
 		bo->base.dirty = false;
 		bo->base.needs_flush = false;
@@ -1229,6 +1260,7 @@ static void kgem_retire_partials(struct kgem *kgem)
 		list_move_tail(&bo->base.list, &kgem->inactive_partials);
 		bubble_sort_partial(&kgem->inactive_partials, bo);
 	}
+	assert(validate_partials(kgem));
 }
 
 bool kgem_retire(struct kgem *kgem)
@@ -1423,12 +1455,23 @@ static void kgem_finish_partials(struct kgem *kgem)
 	struct kgem_partial_bo *bo, *next;
 
 	list_for_each_entry_safe(bo, next, &kgem->active_partials, base.list) {
+		DBG(("%s: partial handle=%d, used=%d, exec?=%d, write=%d, mmapped=%d\n",
+		     __FUNCTION__, bo->base.handle, bo->used, bo->base.exec!=NULL,
+		     bo->write, bo->mmapped));
+
 		assert(next->base.list.prev == &bo->base.list);
 		assert(bo->base.io);
 		assert(bo->base.refcnt >= 1);
 
-		if (!bo->base.exec)
+		if (!bo->base.exec) {
+			if (bo->base.refcnt == 1 && bo->used) {
+				bo->used = 0;
+				bubble_sort_partial(&kgem->active_partials, bo);
+			}
+			DBG(("%s: skipping unattached handle=%d, used=%d\n",
+			     __FUNCTION__, bo->base.handle, bo->used));
 			continue;
+		}
 
 		if (!bo->write) {
 			assert(bo->base.exec || bo->base.refcnt > 1);
@@ -1458,12 +1501,14 @@ static void kgem_finish_partials(struct kgem *kgem)
 		assert(bo->base.rq == kgem->next_request);
 		assert(bo->base.domain != DOMAIN_GPU);
 
-		if (bo->base.refcnt == 1 && bo->used < bytes(&bo->base) / 2) {
+		if (bo->base.refcnt == 1 &&
+		    bo->base.size.pages.count > 1 &&
+		    bo->used < bytes(&bo->base) / 2) {
 			struct kgem_bo *shrink;
 
 			shrink = search_linear_cache(kgem,
 						     PAGE_ALIGN(bo->used),
-						     CREATE_INACTIVE);
+						     CREATE_INACTIVE | CREATE_NO_RETIRE);
 			if (shrink) {
 				int n;
 
@@ -1513,6 +1558,8 @@ static void kgem_finish_partials(struct kgem *kgem)
 		bo->need_io = 0;
 
 decouple:
+		DBG(("%s: releasing handle=%d\n",
+		     __FUNCTION__, bo->base.handle));
 		list_del(&bo->base.list);
 		kgem_bo_unref(kgem, &bo->base);
 	}
@@ -2018,6 +2065,11 @@ search_linear_cache(struct kgem *kgem, unsigned int num_pages, unsigned flags)
 		DBG(("%s: inactive and cache bucket empty\n",
 		     __FUNCTION__));
 
+		if (flags & CREATE_NO_RETIRE) {
+			DBG(("%s: can not retire\n", __FUNCTION__));
+			return NULL;
+		}
+
 		if (!kgem->need_retire || !kgem_retire(kgem)) {
 			DBG(("%s: nothing retired\n", __FUNCTION__));
 			return NULL;
diff --git a/src/sna/kgem.h b/src/sna/kgem.h
index 9abb72a..96d945e 100644
--- a/src/sna/kgem.h
+++ b/src/sna/kgem.h
@@ -222,6 +222,7 @@ enum {
 	CREATE_GTT_MAP = 0x8,
 	CREATE_SCANOUT = 0x10,
 	CREATE_TEMPORARY = 0x20,
+	CREATE_NO_RETIRE = 0x40,
 };
 struct kgem_bo *kgem_create_2d(struct kgem *kgem,
 			       int width,
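
The hazard here is the classic stale-pointer problem with "safe" list iteration: list_for_each_entry_safe() caches the next entry before the body runs, so a nested retire that edits the list invalidates the cached pointer. A fragment sketching how the new flag is used at the shrink site (mirroring the kgem.c hunk above, not new code):

	list_for_each_entry_safe(bo, next, &kgem->active_partials, base.list) {
		struct kgem_bo *shrink;

		/* CREATE_NO_RETIRE makes the nested cache lookup fail
		 * cleanly instead of recursing into kgem_retire() and
		 * mutating the list under our feet. */
		shrink = search_linear_cache(kgem, PAGE_ALIGN(bo->used),
					     CREATE_INACTIVE | CREATE_NO_RETIRE);
		if (shrink == NULL)
			continue; /* nothing idle without retiring; keep bo */

		/* reuse 'shrink' exactly as in the diff above */
	}
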
commit bfc37c4c55aade6fe6450f2c1eff0ce771e69796
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Mar 7 13:17:21 2012 +0000

    sna: Restore checking for all-clipped-out for CompositeRectangles
    
    In the refactoring to avoid repeatedly applying the singular
    pCompositeClip, the check for the all-clipped state was lost.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_composite.c b/src/sna/sna_composite.c
index 7f1d096..225008f 100644
--- a/src/sna/sna_composite.c
+++ b/src/sna/sna_composite.c
@@ -596,11 +596,9 @@ _pixman_region_init_clipped_rectangles(pixman_region16_t *region,
 			j++;
 	}
 
-	ret = TRUE;
+	ret = FALSE;
 	if (j)
 	    ret = pixman_region_init_rects(region, boxes, j);
-	else
-	    pixman_region_init(region);
 
 	if (boxes != stack_boxes)
 		free(boxes);
@@ -609,7 +607,7 @@ _pixman_region_init_clipped_rectangles(pixman_region16_t *region,
 	     __FUNCTION__, num_rects,
 	     region->extents.x1, region->extents.y1,
 	     region->extents.x2, region->extents.y2,
-	     pixman_region_n_rects(region)));
+	     j));
 	return ret;
 }
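
With this fix a FALSE return also covers the fully-clipped case, so callers can bail out before doing any rendering work. A hypothetical caller (argument list abbreviated) showing the intended contract:

	pixman_region16_t region;

	/* FALSE now means allocation failure *or* that no rectangle
	 * survived the clip; either way there is nothing to composite. */
	if (!_pixman_region_init_clipped_rectangles(&region, ...))
		return;

	/* composite against 'region' here */
	pixman_region_fini(&region);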
 
commit 5f4e279ebe0ed546ec9a85a888241acaf50ba1d7
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Mar 7 10:47:42 2012 +0000

    sna/gen2: Fix transformation of linear gradients
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/gen2_render.c b/src/sna/gen2_render.c
index c3a16cb..57bb835 100644
--- a/src/sna/gen2_render.c
+++ b/src/sna/gen2_render.c
@@ -1189,33 +1189,30 @@ gen2_composite_linear_init(struct sna *sna,
 		struct pixman_f_vector p1, p2;
 		struct pixman_f_transform m, inv;
 
+		pixman_f_transform_from_pixman_transform(&m, picture->transform);
 		DBG(("%s: transform = [%f %f %f, %f %f %f, %f %f %f]\n",
 		     __FUNCTION__,
-		     pixman_fixed_to_double(picture->transform->matrix[0][0]),
-		     pixman_fixed_to_double(picture->transform->matrix[0][1]),
-		     pixman_fixed_to_double(picture->transform->matrix[0][2]),
-		     pixman_fixed_to_double(picture->transform->matrix[1][0]),
-		     pixman_fixed_to_double(picture->transform->matrix[1][1]),
-		     pixman_fixed_to_double(picture->transform->matrix[1][2]),
-		     pixman_fixed_to_double(picture->transform->matrix[2][0]),
-		     pixman_fixed_to_double(picture->transform->matrix[2][1]),
-		     pixman_fixed_to_double(picture->transform->matrix[2][2])));
-
-		pixman_f_transform_from_pixman_transform(&m,
-							 picture->transform);
+		     m.m[0][0], m.m[0][1], m.m[0][2],
+		     m.m[1][0], m.m[1][1], m.m[1][2],
+		     m.m[2][0], m.m[2][1], m.m[2][2]));
 		if (!pixman_f_transform_invert(&inv, &m))
 			return 0;
 
-		p1.v[0] = linear->p1.x;
-		p1.v[1] = linear->p1.y;
-		p1.v[2] = pixman_fixed_1;
+		p1.v[0] = pixman_fixed_to_double(linear->p1.x);
+		p1.v[1] = pixman_fixed_to_double(linear->p1.y);
+		p1.v[2] = 1.;
 		pixman_f_transform_point(&inv, &p1);
 
-		p2.v[0] = linear->p2.x;
-		p2.v[1] = linear->p2.y;
-		p2.v[2] = pixman_fixed_1;
+		p2.v[0] = pixman_fixed_to_double(linear->p2.x);
+		p2.v[1] = pixman_fixed_to_double(linear->p2.y);
+		p2.v[2] = 1.;
 		pixman_f_transform_point(&inv, &p2);
 
+		DBG(("%s: untransformed: p1=(%f, %f, %f), p2=(%f, %f, %f)\n",
+		     __FUNCTION__,
+		     p1.v[0], p1.v[1], p1.v[2],
+		     p2.v[0], p2.v[1], p2.v[2]));
+
 		dx = p2.v[0] - p1.v[0];
 		dy = p2.v[1] - p1.v[1];
 
@@ -1229,7 +1226,7 @@ gen2_composite_linear_init(struct sna *sna,
 
 	channel->u.gen2.linear_dx = dx;
 	channel->u.gen2.linear_dy = dy;
-	channel->u.gen2.linear_offset = -dx*(x0+x-dst_x) + -dy*(y0+y-dst_y);
+	channel->u.gen2.linear_offset = -dx*(x0+dst_x-x) + -dy*(y0+dst_y-y);
 
 	DBG(("%s: dx=%f, dy=%f, offset=%f\n",
 	     __FUNCTION__, dx, dy, channel->u.gen2.linear_offset));
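
The root cause was feeding raw pixman_fixed_t bit patterns into the floating-point inverse transform (plus a sign slip in the offset). A self-contained sketch of the corrected endpoint pull-back against the pixman-1 API (a hypothetical helper, for illustration only):

#include <pixman.h>

/* Map a gradient endpoint back through the picture transform: convert
 * the fixed-point coordinates to doubles *first*, then apply the
 * inverted matrix, as the hunk above now does. */
static int
pullback_endpoint(const struct pixman_transform *t,
		  pixman_fixed_t px, pixman_fixed_t py,
		  struct pixman_f_vector *out)
{
	struct pixman_f_transform m, inv;

	pixman_f_transform_from_pixman_transform(&m, t);
	if (!pixman_f_transform_invert(&inv, &m))
		return 0;

	out->v[0] = pixman_fixed_to_double(px);	/* not the raw bits */
	out->v[1] = pixman_fixed_to_double(py);
	out->v[2] = 1.;
	return pixman_f_transform_point(&inv, out);
}
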
commit 2feeae0b64c28487a4915f7ab9a9dd7677ed046e
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Mar 7 10:43:24 2012 +0000

    sna/gen4: Hook in the poor-man's linear gradient
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/gen4_render.c b/src/sna/gen4_render.c
index 97af7fc..404e12b 100644
--- a/src/sna/gen4_render.c
+++ b/src/sna/gen4_render.c
@@ -1816,6 +1816,120 @@ gen4_composite_solid_init(struct sna *sna,
 	return channel->bo != NULL;
 }
 
+static Bool
+gen4_composite_linear_init(struct sna *sna,
+			   PicturePtr picture,
+			   struct sna_composite_channel *channel,
+			   int x, int y,
+			   int w, int h,
+			   int dst_x, int dst_y)
+{
+	PictLinearGradient *linear =
+		(PictLinearGradient *)picture->pSourcePict;
+	pixman_fixed_t tx, ty;
+	float x0, y0, sf;
+	float dx, dy;
+
+	DBG(("%s: p1=(%f, %f), p2=(%f, %f), src=(%d, %d), dst=(%d, %d), size=(%d, %d)\n",
+	     __FUNCTION__,
+	     pixman_fixed_to_double(linear->p1.x), pixman_fixed_to_double(linear->p1.y),
+	     pixman_fixed_to_double(linear->p2.x), pixman_fixed_to_double(linear->p2.y),
+	     x, y, dst_x, dst_y, w, h));
+
+	if (linear->p2.x == linear->p1.x && linear->p2.y == linear->p1.y)
+		return 0;
+
+	if (!sna_transform_is_affine(picture->transform)) {
+		DBG(("%s: fallback due to projective transform\n",
+		     __FUNCTION__));
+		return sna_render_picture_fixup(sna, picture, channel,
+						x, y, w, h, dst_x, dst_y);
+	}
+
+	channel->bo = sna_render_get_gradient(sna, (PictGradient *)linear);
+	if (!channel->bo)
+		return 0;
+
+	channel->filter = PictFilterBilinear;
+	channel->repeat = picture->repeat ? picture->repeatType : RepeatNone;
+	channel->width  = channel->bo->pitch / 4;
+	channel->height = 1;
+	channel->pict_format = PICT_a8r8g8b8;
+
+	channel->scale[0]  = channel->scale[1]  = 1;
+	channel->offset[0] = channel->offset[1] = 0;
+
+	if (sna_transform_is_translation(picture->transform, &tx, &ty)) {
+		dx = pixman_fixed_to_double(linear->p2.x - linear->p1.x);
+		dy = pixman_fixed_to_double(linear->p2.y - linear->p1.y);
+
+		x0 = pixman_fixed_to_double(linear->p1.x);
+		y0 = pixman_fixed_to_double(linear->p1.y);
+
+		if (tx | ty) {
+			x0 -= pixman_fixed_to_double(tx);
+			y0 -= pixman_fixed_to_double(ty);
+		}
+	} else {
+		struct pixman_f_vector p1, p2;
+		struct pixman_f_transform m, inv;
+
+		pixman_f_transform_from_pixman_transform(&m, picture->transform);
+		DBG(("%s: transform = [%f %f %f, %f %f %f, %f %f %f]\n",
+		     __FUNCTION__,
+		     m.m[0][0], m.m[0][1], m.m[0][2],
+		     m.m[1][0], m.m[1][1], m.m[1][2],
+		     m.m[2][0], m.m[2][1], m.m[2][2]));
+		if (!pixman_f_transform_invert(&inv, &m))
+			return 0;
+
+		p1.v[0] = pixman_fixed_to_double(linear->p1.x);
+		p1.v[1] = pixman_fixed_to_double(linear->p1.y);
+		p1.v[2] = 1.;
+		pixman_f_transform_point(&inv, &p1);
+
+		p2.v[0] = pixman_fixed_to_double(linear->p2.x);
+		p2.v[1] = pixman_fixed_to_double(linear->p2.y);
+		p2.v[2] = 1.;
+		pixman_f_transform_point(&inv, &p2);
+
+		DBG(("%s: untransformed: p1=(%f, %f, %f), p2=(%f, %f, %f)\n",
+		     __FUNCTION__,
+		     p1.v[0], p1.v[1], p1.v[2],
+		     p2.v[0], p2.v[1], p2.v[2]));
+
+		dx = p2.v[0] - p1.v[0];
+		dy = p2.v[1] - p1.v[1];
+
+		x0 = p1.v[0];
+		y0 = p1.v[1];
+	}
+
+	sf = dx*dx + dy*dy;
+	dx /= sf;
+	dy /= sf;
+
+	channel->embedded_transform.matrix[0][0] = pixman_double_to_fixed(dx);
+	channel->embedded_transform.matrix[0][1] = pixman_double_to_fixed(dy);
+	channel->embedded_transform.matrix[0][2] = -pixman_double_to_fixed(dx*(x0+dst_x-x) + dy*(y0+dst_y-y));
+
+	channel->embedded_transform.matrix[1][0] = 0;
+	channel->embedded_transform.matrix[1][1] = 0;
+	channel->embedded_transform.matrix[1][2] = pixman_double_to_fixed(.5);
+
+	channel->embedded_transform.matrix[2][0] = 0;
+	channel->embedded_transform.matrix[2][1] = 0;
+	channel->embedded_transform.matrix[2][2] = pixman_fixed_1;
+
+	channel->transform = &channel->embedded_transform;
+	channel->is_affine = 1;
+
+	DBG(("%s: dx=%f, dy=%f, offset=%f\n",
+	     __FUNCTION__, dx, dy, -dx*(x0-x+dst_x) + -dy*(y0-y+dst_y)));
+
+	return channel->bo != NULL;
+}
+
 static int
 gen4_composite_picture(struct sna *sna,
 		       PicturePtr picture,
@@ -1838,7 +1952,13 @@ gen4_composite_picture(struct sna *sna,
 		return gen4_composite_solid_init(sna, channel, color);
 
 	if (picture->pDrawable == NULL) {
-		DBG(("%s: procedural source fixup\n", __FUNCTION__));
+		if (picture->pSourcePict->type == SourcePictTypeLinear)
+			return gen4_composite_linear_init(sna, picture, channel,
+							  x, y,
+							  w, h,
+							  dst_x, dst_y);
+
+		DBG(("%s -- fixup, gradient\n", __FUNCTION__));
 		return sna_render_picture_fixup(sna, picture, channel,
 						x, y, w, h, dst_x, dst_y);
 	}
@@ -2004,7 +2124,13 @@ is_gradient(PicturePtr picture)
 	if (picture->pDrawable)
 		return FALSE;
 
-	return picture->pSourcePict->type != SourcePictTypeSolidFill;
+	switch (picture->pSourcePict->type) {
+	case SourcePictTypeSolidFill:
+	case SourcePictTypeLinear:
+		return FALSE;
+	default:
+		return TRUE;
+	}
 }
 
 static bool
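
All of these "poor-man's" implementations (gen4 here, and the gen5/gen7/gen6 commits that follow) build the same embedded transform: the first matrix row projects each destination pixel onto the gradient axis, and the constant .5 second row pins sampling to the centre of the 1-texel-high ramp. A minimal sketch of the projection it encodes (hypothetical helper; the real matrix also folds in the dst/src coordinate offset):

/* s = d . (p - p1) / |d|^2, with d = p2 - p1; s is then the x texture
 * coordinate into the cached gradient ramp. */
static float
linear_gradient_coord(float px, float py,
		      float x0, float y0,	/* p1, pulled back */
		      float dx, float dy)	/* p2 - p1, pulled back */
{
	float sf = dx*dx + dy*dy;		/* |d|^2 */
	return (dx * (px - x0) + dy * (py - y0)) / sf;
}
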
commit 842c6166bbee1598817beeae4c21012b2881ccbe
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Mar 7 10:43:24 2012 +0000

    sna/gen5: Hook in the poor-man's linear gradient
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/gen5_render.c b/src/sna/gen5_render.c
index 18325b5..bce5a3c 100644
--- a/src/sna/gen5_render.c
+++ b/src/sna/gen5_render.c
@@ -1862,6 +1862,120 @@ gen5_composite_solid_init(struct sna *sna,
 	return channel->bo != NULL;
 }
 
+static Bool
+gen5_composite_linear_init(struct sna *sna,
+			   PicturePtr picture,
+			   struct sna_composite_channel *channel,
+			   int x, int y,
+			   int w, int h,
+			   int dst_x, int dst_y)
+{
+	PictLinearGradient *linear =
+		(PictLinearGradient *)picture->pSourcePict;
+	pixman_fixed_t tx, ty;
+	float x0, y0, sf;
+	float dx, dy;
+
+	DBG(("%s: p1=(%f, %f), p2=(%f, %f), src=(%d, %d), dst=(%d, %d), size=(%d, %d)\n",
+	     __FUNCTION__,
+	     pixman_fixed_to_double(linear->p1.x), pixman_fixed_to_double(linear->p1.y),
+	     pixman_fixed_to_double(linear->p2.x), pixman_fixed_to_double(linear->p2.y),
+	     x, y, dst_x, dst_y, w, h));
+
+	if (linear->p2.x == linear->p1.x && linear->p2.y == linear->p1.y)
+		return 0;
+
+	if (!sna_transform_is_affine(picture->transform)) {
+		DBG(("%s: fallback due to projective transform\n",
+		     __FUNCTION__));
+		return sna_render_picture_fixup(sna, picture, channel,
+						x, y, w, h, dst_x, dst_y);
+	}
+
+	channel->bo = sna_render_get_gradient(sna, (PictGradient *)linear);
+	if (!channel->bo)
+		return 0;
+
+	channel->filter = PictFilterBilinear;
+	channel->repeat = picture->repeat ? picture->repeatType : RepeatNone;
+	channel->width  = channel->bo->pitch / 4;
+	channel->height = 1;
+	channel->pict_format = PICT_a8r8g8b8;
+
+	channel->scale[0]  = channel->scale[1]  = 1;
+	channel->offset[0] = channel->offset[1] = 0;
+
+	if (sna_transform_is_translation(picture->transform, &tx, &ty)) {
+		dx = pixman_fixed_to_double(linear->p2.x - linear->p1.x);
+		dy = pixman_fixed_to_double(linear->p2.y - linear->p1.y);
+
+		x0 = pixman_fixed_to_double(linear->p1.x);
+		y0 = pixman_fixed_to_double(linear->p1.y);
+
+		if (tx | ty) {
+			x0 -= pixman_fixed_to_double(tx);
+			y0 -= pixman_fixed_to_double(ty);
+		}
+	} else {
+		struct pixman_f_vector p1, p2;
+		struct pixman_f_transform m, inv;
+
+		pixman_f_transform_from_pixman_transform(&m, picture->transform);
+		DBG(("%s: transform = [%f %f %f, %f %f %f, %f %f %f]\n",
+		     __FUNCTION__,
+		     m.m[0][0], m.m[0][1], m.m[0][2],
+		     m.m[1][0], m.m[1][1], m.m[1][2],
+		     m.m[2][0], m.m[2][1], m.m[2][2]));
+		if (!pixman_f_transform_invert(&inv, &m))
+			return 0;
+
+		p1.v[0] = pixman_fixed_to_double(linear->p1.x);
+		p1.v[1] = pixman_fixed_to_double(linear->p1.y);
+		p1.v[2] = 1.;
+		pixman_f_transform_point(&inv, &p1);
+
+		p2.v[0] = pixman_fixed_to_double(linear->p2.x);
+		p2.v[1] = pixman_fixed_to_double(linear->p2.y);
+		p2.v[2] = 1.;
+		pixman_f_transform_point(&inv, &p2);
+
+		DBG(("%s: untransformed: p1=(%f, %f, %f), p2=(%f, %f, %f)\n",
+		     __FUNCTION__,
+		     p1.v[0], p1.v[1], p1.v[2],
+		     p2.v[0], p2.v[1], p2.v[2]));
+
+		dx = p2.v[0] - p1.v[0];
+		dy = p2.v[1] - p1.v[1];
+
+		x0 = p1.v[0];
+		y0 = p1.v[1];
+	}
+
+	sf = dx*dx + dy*dy;
+	dx /= sf;
+	dy /= sf;
+
+	channel->embedded_transform.matrix[0][0] = pixman_double_to_fixed(dx);
+	channel->embedded_transform.matrix[0][1] = pixman_double_to_fixed(dy);
+	channel->embedded_transform.matrix[0][2] = -pixman_double_to_fixed(dx*(x0+dst_x-x) + dy*(y0+dst_y-y));
+
+	channel->embedded_transform.matrix[1][0] = 0;
+	channel->embedded_transform.matrix[1][1] = 0;
+	channel->embedded_transform.matrix[1][2] = pixman_double_to_fixed(.5);
+
+	channel->embedded_transform.matrix[2][0] = 0;
+	channel->embedded_transform.matrix[2][1] = 0;
+	channel->embedded_transform.matrix[2][2] = pixman_fixed_1;
+
+	channel->transform = &channel->embedded_transform;
+	channel->is_affine = 1;
+
+	DBG(("%s: dx=%f, dy=%f, offset=%f\n",
+	     __FUNCTION__, dx, dy, -dx*(x0-x+dst_x) + -dy*(y0-y+dst_y)));
+
+	return channel->bo != NULL;
+}
+
 static int
 gen5_composite_picture(struct sna *sna,
 		       PicturePtr picture,
@@ -1883,9 +1997,17 @@ gen5_composite_picture(struct sna *sna,
 	if (sna_picture_is_solid(picture, &color))
 		return gen5_composite_solid_init(sna, channel, color);
 
-	if (picture->pDrawable == NULL)
+	if (picture->pDrawable == NULL) {
+		if (picture->pSourcePict->type == SourcePictTypeLinear)
+			return gen5_composite_linear_init(sna, picture, channel,
+							  x, y,
+							  w, h,
+							  dst_x, dst_y);
+
+		DBG(("%s -- fixup, gradient\n", __FUNCTION__));
 		return sna_render_picture_fixup(sna, picture, channel,
 						x, y, w, h, dst_x, dst_y);
+	}
 
 	if (picture->alphaMap) {
 		DBG(("%s -- fallback, alphamap\n", __FUNCTION__));
@@ -2048,7 +2170,13 @@ is_gradient(PicturePtr picture)
 	if (picture->pDrawable)
 		return FALSE;
 
-	return picture->pSourcePict->type != SourcePictTypeSolidFill;
+	switch (picture->pSourcePict->type) {
+	case SourcePictTypeSolidFill:
+	case SourcePictTypeLinear:
+		return FALSE;
+	default:
+		return TRUE;
+	}
 }
 
 static bool
commit 0d49ae0c79484589ff84f4411b4b96b351a7524d
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Mar 7 10:43:24 2012 +0000

    sna/gen7: Hook in the poor-man's linear gradient
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/gen7_render.c b/src/sna/gen7_render.c
index a401d94..5829ae3 100644
--- a/src/sna/gen7_render.c
+++ b/src/sna/gen7_render.c
@@ -2150,6 +2150,120 @@ gen7_composite_solid_init(struct sna *sna,
 	return channel->bo != NULL;
 }
 
+static Bool
+gen7_composite_linear_init(struct sna *sna,
+			   PicturePtr picture,
+			   struct sna_composite_channel *channel,
+			   int x, int y,
+			   int w, int h,
+			   int dst_x, int dst_y)
+{
+	PictLinearGradient *linear =
+		(PictLinearGradient *)picture->pSourcePict;
+	pixman_fixed_t tx, ty;
+	float x0, y0, sf;
+	float dx, dy;
+
+	DBG(("%s: p1=(%f, %f), p2=(%f, %f), src=(%d, %d), dst=(%d, %d), size=(%d, %d)\n",
+	     __FUNCTION__,
+	     pixman_fixed_to_double(linear->p1.x), pixman_fixed_to_double(linear->p1.y),
+	     pixman_fixed_to_double(linear->p2.x), pixman_fixed_to_double(linear->p2.y),
+	     x, y, dst_x, dst_y, w, h));
+
+	if (linear->p2.x == linear->p1.x && linear->p2.y == linear->p1.y)
+		return 0;
+
+	if (!sna_transform_is_affine(picture->transform)) {
+		DBG(("%s: fallback due to projective transform\n",
+		     __FUNCTION__));
+		return sna_render_picture_fixup(sna, picture, channel,
+						x, y, w, h, dst_x, dst_y);
+	}
+
+	channel->bo = sna_render_get_gradient(sna, (PictGradient *)linear);
+	if (!channel->bo)
+		return 0;
+
+	channel->filter = PictFilterBilinear;
+	channel->repeat = picture->repeat ? picture->repeatType : RepeatNone;
+	channel->width  = channel->bo->pitch / 4;
+	channel->height = 1;
+	channel->pict_format = PICT_a8r8g8b8;
+
+	channel->scale[0]  = channel->scale[1]  = 1;
+	channel->offset[0] = channel->offset[1] = 0;
+
+	if (sna_transform_is_translation(picture->transform, &tx, &ty)) {
+		dx = pixman_fixed_to_double(linear->p2.x - linear->p1.x);
+		dy = pixman_fixed_to_double(linear->p2.y - linear->p1.y);
+
+		x0 = pixman_fixed_to_double(linear->p1.x);
+		y0 = pixman_fixed_to_double(linear->p1.y);
+
+		if (tx | ty) {
+			x0 -= pixman_fixed_to_double(tx);
+			y0 -= pixman_fixed_to_double(ty);
+		}
+	} else {
+		struct pixman_f_vector p1, p2;
+		struct pixman_f_transform m, inv;
+
+		pixman_f_transform_from_pixman_transform(&m, picture->transform);
+		DBG(("%s: transform = [%f %f %f, %f %f %f, %f %f %f]\n",
+		     __FUNCTION__,
+		     m.m[0][0], m.m[0][1], m.m[0][2],
+		     m.m[1][0], m.m[1][1], m.m[1][2],
+		     m.m[2][0], m.m[2][1], m.m[2][2]));
+		if (!pixman_f_transform_invert(&inv, &m))
+			return 0;
+
+		p1.v[0] = pixman_fixed_to_double(linear->p1.x);
+		p1.v[1] = pixman_fixed_to_double(linear->p1.y);
+		p1.v[2] = 1.;
+		pixman_f_transform_point(&inv, &p1);
+
+		p2.v[0] = pixman_fixed_to_double(linear->p2.x);
+		p2.v[1] = pixman_fixed_to_double(linear->p2.y);
+		p2.v[2] = 1.;
+		pixman_f_transform_point(&inv, &p2);
+
+		DBG(("%s: untransformed: p1=(%f, %f, %f), p2=(%f, %f, %f)\n",
+		     __FUNCTION__,
+		     p1.v[0], p1.v[1], p1.v[2],
+		     p2.v[0], p2.v[1], p2.v[2]));
+
+		dx = p2.v[0] - p1.v[0];
+		dy = p2.v[1] - p1.v[1];
+
+		x0 = p1.v[0];
+		y0 = p1.v[1];
+	}
+
+	sf = dx*dx + dy*dy;
+	dx /= sf;
+	dy /= sf;
+
+	channel->embedded_transform.matrix[0][0] = pixman_double_to_fixed(dx);
+	channel->embedded_transform.matrix[0][1] = pixman_double_to_fixed(dy);
+	channel->embedded_transform.matrix[0][2] = -pixman_double_to_fixed(dx*(x0+dst_x-x) + dy*(y0+dst_y-y));
+
+	channel->embedded_transform.matrix[1][0] = 0;
+	channel->embedded_transform.matrix[1][1] = 0;
+	channel->embedded_transform.matrix[1][2] = pixman_double_to_fixed(.5);
+
+	channel->embedded_transform.matrix[2][0] = 0;
+	channel->embedded_transform.matrix[2][1] = 0;
+	channel->embedded_transform.matrix[2][2] = pixman_fixed_1;
+
+	channel->transform = &channel->embedded_transform;
+	channel->is_affine = 1;
+
+	DBG(("%s: dx=%f, dy=%f, offset=%f\n",
+	     __FUNCTION__, dx, dy, -dx*(x0-x+dst_x) + -dy*(y0-y+dst_y)));
+
+	return channel->bo != NULL;
+}
+
 static int
 gen7_composite_picture(struct sna *sna,
 		       PicturePtr picture,
@@ -2171,9 +2285,17 @@ gen7_composite_picture(struct sna *sna,
 	if (sna_picture_is_solid(picture, &color))
 		return gen7_composite_solid_init(sna, channel, color);
 
-	if (picture->pDrawable == NULL)
+	if (picture->pDrawable == NULL) {
+		if (picture->pSourcePict->type == SourcePictTypeLinear)
+			return gen7_composite_linear_init(sna, picture, channel,
+							  x, y,
+							  w, h,
+							  dst_x, dst_y);
+
+		DBG(("%s -- fixup, gradient\n", __FUNCTION__));
 		return sna_render_picture_fixup(sna, picture, channel,
 						x, y, w, h, dst_x, dst_y);
+	}
 
 	if (picture->alphaMap) {
 		DBG(("%s -- fallback, alphamap\n", __FUNCTION__));
@@ -2337,7 +2459,13 @@ is_gradient(PicturePtr picture)
 	if (picture->pDrawable)
 		return FALSE;
 
-	return picture->pSourcePict->type != SourcePictTypeSolidFill;
+	switch (picture->pSourcePict->type) {
+	case SourcePictTypeSolidFill:
+	case SourcePictTypeLinear:
+		return FALSE;
+	default:
+		return TRUE;
+	}
 }
 
 static bool
commit 474e63d280993393522a6c537617cd5c7b10e088
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Mar 7 10:40:50 2012 +0000

    sna/gen6: Add poor-man's linear implementation
    
    Still no JIT, but in the meantime we can at least cache the gradient ramps.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/gen6_render.c b/src/sna/gen6_render.c
index 71c0046..9eb4221 100644
--- a/src/sna/gen6_render.c
+++ b/src/sna/gen6_render.c
@@ -1420,141 +1420,72 @@ gen6_emit_composite_primitive_identity_source_mask(struct sna *sna,
 	v[14] = msk_y * op->mask.scale[1];
 }
 
-fastcall static void
-gen6_emit_composite_primitive(struct sna *sna,
-			      const struct sna_composite_op *op,
-			      const struct sna_composite_rectangles *r)
+inline static void
+gen6_emit_composite_texcoord(struct sna *sna,
+			     const struct sna_composite_channel *channel,
+			     int16_t x, int16_t y)
 {
-	float src_x[3], src_y[3], src_w[3], mask_x[3], mask_y[3], mask_w[3];
-	Bool is_affine = op->is_affine;
-	const float *src_sf = op->src.scale;
-	const float *mask_sf = op->mask.scale;
-
-	if (is_affine) {
-		sna_get_transformed_coordinates(r->src.x + op->src.offset[0],
-						r->src.y + op->src.offset[1],
-						op->src.transform,
-						&src_x[0],
-						&src_y[0]);
-
-		sna_get_transformed_coordinates(r->src.x + op->src.offset[0],
-						r->src.y + op->src.offset[1] + r->height,
-						op->src.transform,
-						&src_x[1],
-						&src_y[1]);
-
-		sna_get_transformed_coordinates(r->src.x + op->src.offset[0] + r->width,
-						r->src.y + op->src.offset[1] + r->height,
-						op->src.transform,
-						&src_x[2],
-						&src_y[2]);
-	} else {
-		if (!sna_get_transformed_coordinates_3d(r->src.x + op->src.offset[0],
-							r->src.y + op->src.offset[1],
-							op->src.transform,
-							&src_x[0],
-							&src_y[0],
-							&src_w[0]))
-			return;
-
-		if (!sna_get_transformed_coordinates_3d(r->src.x + op->src.offset[0],
-							r->src.y + op->src.offset[1] + r->height,
-							op->src.transform,
-							&src_x[1],
-							&src_y[1],
-							&src_w[1]))
-			return;
-
-		if (!sna_get_transformed_coordinates_3d(r->src.x + op->src.offset[0] + r->width,
-							r->src.y + op->src.offset[1] + r->height,
-							op->src.transform,
-							&src_x[2],
-							&src_y[2],
-							&src_w[2]))
-			return;
-	}
+	x += channel->offset[0];
+	y += channel->offset[1];
 
-	if (op->mask.bo) {
-		if (is_affine) {
-			sna_get_transformed_coordinates(r->mask.x + op->mask.offset[0],
-							r->mask.y + op->mask.offset[1],
-							op->mask.transform,
-							&mask_x[0],
-							&mask_y[0]);
-
-			sna_get_transformed_coordinates(r->mask.x + op->mask.offset[0],
-							r->mask.y + op->mask.offset[1] + r->height,
-							op->mask.transform,
-							&mask_x[1],
-							&mask_y[1]);
-
-			sna_get_transformed_coordinates(r->mask.x + op->mask.offset[0] + r->width,
-							r->mask.y + op->mask.offset[1] + r->height,
-							op->mask.transform,
-							&mask_x[2],
-							&mask_y[2]);
-		} else {
-			if (!sna_get_transformed_coordinates_3d(r->mask.x + op->mask.offset[0],
-								r->mask.y + op->mask.offset[1],
-								op->mask.transform,
-								&mask_x[0],
-								&mask_y[0],
-								&mask_w[0]))
-				return;
-
-			if (!sna_get_transformed_coordinates_3d(r->mask.x + op->mask.offset[0],
-								r->mask.y + op->mask.offset[1] + r->height,
-								op->mask.transform,
-								&mask_x[1],
-								&mask_y[1],
-								&mask_w[1]))
-				return;
-
-			if (!sna_get_transformed_coordinates_3d(r->mask.x + op->mask.offset[0] + r->width,
-								r->mask.y + op->mask.offset[1] + r->height,
-								op->mask.transform,
-								&mask_x[2],
-								&mask_y[2],
-								&mask_w[2]))
-				return;
-		}
-	}
+	if (channel->is_affine) {
+		float s, t;
 
-	OUT_VERTEX(r->dst.x + r->width, r->dst.y + r->height);
-	OUT_VERTEX_F(src_x[2] * src_sf[0]);
-	OUT_VERTEX_F(src_y[2] * src_sf[1]);
-	if (!is_affine)
-		OUT_VERTEX_F(src_w[2]);
-	if (op->mask.bo) {
-		OUT_VERTEX_F(mask_x[2] * mask_sf[0]);
-		OUT_VERTEX_F(mask_y[2] * mask_sf[1]);
-		if (!is_affine)
-			OUT_VERTEX_F(mask_w[2]);
-	}
+		sna_get_transformed_coordinates(x, y,
+						channel->transform,
+						&s, &t);
+		OUT_VERTEX_F(s * channel->scale[0]);
+		OUT_VERTEX_F(t * channel->scale[1]);
+	} else {
+		float s, t, w;
 
-	OUT_VERTEX(r->dst.x, r->dst.y + r->height);
-	OUT_VERTEX_F(src_x[1] * src_sf[0]);
-	OUT_VERTEX_F(src_y[1] * src_sf[1]);
-	if (!is_affine)
-		OUT_VERTEX_F(src_w[1]);
-	if (op->mask.bo) {
-		OUT_VERTEX_F(mask_x[1] * mask_sf[0]);
-		OUT_VERTEX_F(mask_y[1] * mask_sf[1]);
-		if (!is_affine)
-			OUT_VERTEX_F(mask_w[1]);
+		sna_get_transformed_coordinates_3d(x, y,
+						   channel->transform,
+						   &s, &t, &w);
+		OUT_VERTEX_F(s * channel->scale[0]);
+		OUT_VERTEX_F(t * channel->scale[1]);
+		OUT_VERTEX_F(w);
 	}
+}
 
-	OUT_VERTEX(r->dst.x, r->dst.y);
-	OUT_VERTEX_F(src_x[0] * src_sf[0]);
-	OUT_VERTEX_F(src_y[0] * src_sf[1]);
-	if (!is_affine)
-		OUT_VERTEX_F(src_w[0]);
-	if (op->mask.bo) {
-		OUT_VERTEX_F(mask_x[0] * mask_sf[0]);
-		OUT_VERTEX_F(mask_y[0] * mask_sf[1]);
-		if (!is_affine)
-			OUT_VERTEX_F(mask_w[0]);
-	}
+static void
+gen6_emit_composite_vertex(struct sna *sna,
+			   const struct sna_composite_op *op,
+			   int16_t srcX, int16_t srcY,
+			   int16_t mskX, int16_t mskY,
+			   int16_t dstX, int16_t dstY)
+{
+	OUT_VERTEX(dstX, dstY);
+	gen6_emit_composite_texcoord(sna, &op->src, srcX, srcY);
+	gen6_emit_composite_texcoord(sna, &op->mask, mskX, mskY);
+}
+
+fastcall static void
+gen6_emit_composite_primitive(struct sna *sna,
+			      const struct sna_composite_op *op,
+			      const struct sna_composite_rectangles *r)
+{
+	gen6_emit_composite_vertex(sna, op,
+				   r->src.x + r->width,
+				   r->src.y + r->height,
+				   r->mask.x + r->width,
+				   r->mask.y + r->height,
+				   op->dst.x + r->dst.x + r->width,
+				   op->dst.y + r->dst.y + r->height);
+	gen6_emit_composite_vertex(sna, op,
+				   r->src.x,
+				   r->src.y + r->height,
+				   r->mask.x,
+				   r->mask.y + r->height,
+				   op->dst.x + r->dst.x,
+				   op->dst.y + r->dst.y + r->height);
+	gen6_emit_composite_vertex(sna, op,
+				   r->src.x,
+				   r->src.y,
+				   r->mask.x,
+				   r->mask.y,
+				   op->dst.x + r->dst.x,
+				   op->dst.y + r->dst.y);
 }
 
 static void gen6_emit_vertex_buffer(struct sna *sna,
@@ -2123,6 +2054,120 @@ gen6_composite_solid_init(struct sna *sna,
 	return channel->bo != NULL;
 }
 
+static Bool
+gen6_composite_linear_init(struct sna *sna,
+			   PicturePtr picture,
+			   struct sna_composite_channel *channel,
+			   int x, int y,
+			   int w, int h,
+			   int dst_x, int dst_y)
+{
+	PictLinearGradient *linear =
+		(PictLinearGradient *)picture->pSourcePict;
+	pixman_fixed_t tx, ty;
+	float x0, y0, sf;
+	float dx, dy;
+
+	DBG(("%s: p1=(%f, %f), p2=(%f, %f), src=(%d, %d), dst=(%d, %d), size=(%d, %d)\n",
+	     __FUNCTION__,
+	     pixman_fixed_to_double(linear->p1.x), pixman_fixed_to_double(linear->p1.y),
+	     pixman_fixed_to_double(linear->p2.x), pixman_fixed_to_double(linear->p2.y),
+	     x, y, dst_x, dst_y, w, h));
+
+	if (linear->p2.x == linear->p1.x && linear->p2.y == linear->p1.y)
+		return 0;
+
+	if (!sna_transform_is_affine(picture->transform)) {
+		DBG(("%s: fallback due to projective transform\n",
+		     __FUNCTION__));
+		return sna_render_picture_fixup(sna, picture, channel,
+						x, y, w, h, dst_x, dst_y);
+	}
+
+	channel->bo = sna_render_get_gradient(sna, (PictGradient *)linear);
+	if (!channel->bo)
+		return 0;
+
+	channel->filter = PictFilterBilinear;
+	channel->repeat = picture->repeat ? picture->repeatType : RepeatNone;
+	channel->width  = channel->bo->pitch / 4;
+	channel->height = 1;
+	channel->pict_format = PICT_a8r8g8b8;
+
+	channel->scale[0]  = channel->scale[1]  = 1;
+	channel->offset[0] = channel->offset[1] = 0;
+
+	if (sna_transform_is_translation(picture->transform, &tx, &ty)) {
+		dx = pixman_fixed_to_double(linear->p2.x - linear->p1.x);
+		dy = pixman_fixed_to_double(linear->p2.y - linear->p1.y);
+
+		x0 = pixman_fixed_to_double(linear->p1.x);
+		y0 = pixman_fixed_to_double(linear->p1.y);
+
+		if (tx | ty) {
+			x0 -= pixman_fixed_to_double(tx);
+			y0 -= pixman_fixed_to_double(ty);
+		}
+	} else {
+		struct pixman_f_vector p1, p2;
+		struct pixman_f_transform m, inv;
+
+		pixman_f_transform_from_pixman_transform(&m, picture->transform);
+		DBG(("%s: transform = [%f %f %f, %f %f %f, %f %f %f]\n",
+		     __FUNCTION__,
+		     m.m[0][0], m.m[0][1], m.m[0][2],
+		     m.m[1][0], m.m[1][1], m.m[1][2],
+		     m.m[2][0], m.m[2][1], m.m[2][2]));
+		if (!pixman_f_transform_invert(&inv, &m))
+			return 0;
+
+		p1.v[0] = pixman_fixed_to_double(linear->p1.x);
+		p1.v[1] = pixman_fixed_to_double(linear->p1.y);
+		p1.v[2] = 1.;
+		pixman_f_transform_point(&inv, &p1);
+
+		p2.v[0] = pixman_fixed_to_double(linear->p2.x);
+		p2.v[1] = pixman_fixed_to_double(linear->p2.y);
+		p2.v[2] = 1.;
+		pixman_f_transform_point(&inv, &p2);
+
+		DBG(("%s: untransformed: p1=(%f, %f, %f), p2=(%f, %f, %f)\n",
+		     __FUNCTION__,
+		     p1.v[0], p1.v[1], p1.v[2],
+		     p2.v[0], p2.v[1], p2.v[2]));
+
+		dx = p2.v[0] - p1.v[0];
+		dy = p2.v[1] - p1.v[1];
+
+		x0 = p1.v[0];
+		y0 = p1.v[1];
+	}
+
+	sf = dx*dx + dy*dy;
+	dx /= sf;
+	dy /= sf;
+
+	channel->embedded_transform.matrix[0][0] = pixman_double_to_fixed(dx);
+	channel->embedded_transform.matrix[0][1] = pixman_double_to_fixed(dy);
+	channel->embedded_transform.matrix[0][2] = -pixman_double_to_fixed(dx*(x0+dst_x-x) + dy*(y0+dst_y-y));
+
+	channel->embedded_transform.matrix[1][0] = 0;
+	channel->embedded_transform.matrix[1][1] = 0;
+	channel->embedded_transform.matrix[1][2] = pixman_double_to_fixed(.5);
+
+	channel->embedded_transform.matrix[2][0] = 0;
+	channel->embedded_transform.matrix[2][1] = 0;
+	channel->embedded_transform.matrix[2][2] = pixman_fixed_1;
+
+	channel->transform = &channel->embedded_transform;
+	channel->is_affine = 1;
+
+	DBG(("%s: dx=%f, dy=%f, offset=%f\n",
+	     __FUNCTION__, dx, dy, -dx*(x0-x+dst_x) + -dy*(y0-y+dst_y)));
+
+	return channel->bo != NULL;
+}
+
 static int
 gen6_composite_picture(struct sna *sna,
 		       PicturePtr picture,
@@ -2144,12 +2189,20 @@ gen6_composite_picture(struct sna *sna,
 	if (sna_picture_is_solid(picture, &color))
 		return gen6_composite_solid_init(sna, channel, color);
 
-	if (picture->pDrawable == NULL)
+	if (picture->pDrawable == NULL) {
+		if (picture->pSourcePict->type == SourcePictTypeLinear)
+			return gen6_composite_linear_init(sna, picture, channel,
+							  x, y,
+							  w, h,
+							  dst_x, dst_y);
+
+		DBG(("%s -- fixup, gradient\n", __FUNCTION__));
 		return sna_render_picture_fixup(sna, picture, channel,
 						x, y, w, h, dst_x, dst_y);
+	}
 
 	if (picture->alphaMap) {
-		DBG(("%s -- fallback, alphamap\n", __FUNCTION__));
+		DBG(("%s -- fixup, alphamap\n", __FUNCTION__));
 		return sna_render_picture_fixup(sna, picture, channel,
 						x, y, w, h, dst_x, dst_y);
 	}
@@ -2326,7 +2379,13 @@ is_gradient(PicturePtr picture)
 	if (picture->pDrawable)
 		return FALSE;
 
-	return picture->pSourcePict->type != SourcePictTypeSolidFill;
+	switch (picture->pSourcePict->type) {
+	case SourcePictTypeSolidFill:
+	case SourcePictTypeLinear:
+		return FALSE;
+	default:
+		return TRUE;
+	}
 }
 
 static bool
@@ -2740,32 +2799,6 @@ gen6_composite_alpha_gradient_init(struct sna *sna,
 }
 
 inline static void
-gen6_emit_composite_texcoord(struct sna *sna,
-			     const struct sna_composite_channel *channel,
-			     int16_t x, int16_t y)
-{
-	float t[3];
-
-	if (channel->is_affine) {
-		sna_get_transformed_coordinates(x + channel->offset[0],
-						y + channel->offset[1],
-						channel->transform,
-						&t[0], &t[1]);
-		OUT_VERTEX_F(t[0] * channel->scale[0]);
-		OUT_VERTEX_F(t[1] * channel->scale[1]);
-	} else {
-		t[0] = t[1] = 0; t[2] = 1;
-		sna_get_transformed_coordinates_3d(x + channel->offset[0],
-						   y + channel->offset[1],
-						   channel->transform,
-						   &t[0], &t[1], &t[2]);
-		OUT_VERTEX_F(t[0] * channel->scale[0]);
-		OUT_VERTEX_F(t[1] * channel->scale[1]);
-		OUT_VERTEX_F(t[2]);
-	}
-}
-
-inline static void
 gen6_emit_composite_texcoord_affine(struct sna *sna,
 				    const struct sna_composite_channel *channel,
 				    int16_t x, int16_t y)
commit 2baf355ba1e37497595ec5048382475f863b476e
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Mar 6 12:17:03 2012 +0000

    sna: Remove the 2-step damage flush
    
    The idea was to reduce the number of unnecessary flushes by checking for
    outgoing damage (could be refined further by inspecting the reply/event
    callback for a XDamageNotifyEvent). However, it does not flush
    sufficiently for the compositors' liking. As it doesn't appear to restore
    performance to near uncomposited levels anyway, remove the complication.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index a8737b5..ce3afae 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -11459,31 +11459,20 @@ static Bool sna_change_window_attributes(WindowPtr win, unsigned long mask)
 }
 
 static void
-sna_accel_reply_callback(CallbackListPtr *list,
-			 pointer user_data, pointer call_data)
-{
-	struct sna *sna = user_data;
-
-	if (sna->flush)
-		return;
-
-	/* Assume each callback corresponds to a new request. The use
-	 * of continuation WriteToClients in the server is relatively rare,
-	 * and we err on the side of safety.
-	 */
-	sna->flush = (sna->kgem.flush || sna->kgem.sync ||
-		      !list_is_empty(&sna->dirty_pixmaps));
-}
-
-static void
 sna_accel_flush_callback(CallbackListPtr *list,
 			 pointer user_data, pointer call_data)
 {
 	struct sna *sna = user_data;
 	struct list preserve;
 
-	if (!sna->flush)
-		return;
+	/* XXX we should be able to reduce the frequency of flushes further
+	 * by checking for outgoing damage events or sync replies. Tricky,
+	 * and doesn't appear to mitigate the performance loss.
+	 */
+	if (!(sna->kgem.flush ||
+	      sna->kgem.sync ||
+	      !list_is_empty(&sna->dirty_pixmaps)))
+	    return;
 
 	DBG(("%s: need_sync=%d, need_flush=%d, dirty? %d\n", __FUNCTION__,
 	     sna->kgem.sync!=NULL, sna->kgem.flush, !list_is_empty(&sna->dirty_pixmaps)));
@@ -11934,8 +11923,7 @@ void sna_accel_watch_flush(struct sna *sna, int enable)
 	if (sna->watch_flush == 0) {
 		DBG(("%s: installing watchers\n", __FUNCTION__));
 		assert(enable > 0);
-		if (!AddCallback(&ReplyCallback, sna_accel_reply_callback, sna) ||
-		    !AddCallback(&FlushCallback, sna_accel_flush_callback, sna)) {
+		if (!AddCallback(&FlushCallback, sna_accel_flush_callback, sna)) {
 			xf86DrvMsg(sna->scrn->scrnIndex, X_ERROR,
 				   "Failed to attach ourselves to the flush callbacks, expect missing synchronisation with DRI clients (e.g a compositor)\n");
 		}
@@ -11959,7 +11947,6 @@ void sna_accel_close(struct sna *sna)
 	sna_glyphs_close(sna);
 
 	DeleteCallback(&FlushCallback, sna_accel_flush_callback, sna);
-	DeleteCallback(&ReplyCallback, sna_accel_reply_callback, sna);
 
 	kgem_cleanup_cache(&sna->kgem);
 }
@@ -11989,7 +11976,6 @@ void sna_accel_block_handler(struct sna *sna)
 
 	if (sna->flush == 0 && sna->watch_flush == 1) {
 		DBG(("%s: removing watchers\n", __FUNCTION__));
-		DeleteCallback(&ReplyCallback, sna_accel_reply_callback, sna);
 		DeleteCallback(&FlushCallback, sna_accel_flush_callback, sna);
 		sna->watch_flush = 0;
 	}
commit d84c7337151051d8b8e8034da1fd3ec0c0f27f94
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon Mar 5 22:55:57 2012 +0000

    sna: Defer the FlushCallback removal until after the next flush
    
    Try to reduce the amount of Add/Delete ping-pong, in particular around
    the recreation of the DRI2 attachment to the scanout after pageflipping.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 1dc0b99..a8737b5 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -11482,7 +11482,6 @@ sna_accel_flush_callback(CallbackListPtr *list,
 	struct sna *sna = user_data;
 	struct list preserve;
 
-	assert(sna->watch_flush);
 	if (!sna->flush)
 		return;
 
@@ -11929,22 +11928,21 @@ Bool sna_accel_create(struct sna *sna)
 
 void sna_accel_watch_flush(struct sna *sna, int enable)
 {
+	DBG(("%s: enable=%d\n", __FUNCTION__, enable));
+	assert(enable);
+
 	if (sna->watch_flush == 0) {
+		DBG(("%s: installing watchers\n", __FUNCTION__));
 		assert(enable > 0);
 		if (!AddCallback(&ReplyCallback, sna_accel_reply_callback, sna) ||
-		    AddCallback(&FlushCallback, sna_accel_flush_callback, sna)) {
+		    !AddCallback(&FlushCallback, sna_accel_flush_callback, sna)) {
 			xf86DrvMsg(sna->scrn->scrnIndex, X_ERROR,
 				   "Failed to attach ourselves to the flush callbacks, expect missing synchronisation with DRI clients (e.g a compositor)\n");
 		}
+		sna->watch_flush++;
 	}
 
 	sna->watch_flush += enable;
-
-	if (sna->watch_flush == 0) {
-		assert(enable < 0);
-		DeleteCallback(&ReplyCallback, sna_accel_reply_callback, sna);
-		DeleteCallback(&FlushCallback, sna_accel_flush_callback, sna);
-	}
 }
 
 void sna_accel_close(struct sna *sna)
@@ -11989,6 +11987,13 @@ void sna_accel_block_handler(struct sna *sna)
 	if (sna_accel_do_inactive(sna))
 		sna_accel_inactive(sna);
 
+	if (sna->flush == 0 && sna->watch_flush == 1) {
+		DBG(("%s: removing watchers\n", __FUNCTION__));
+		DeleteCallback(&ReplyCallback, sna_accel_reply_callback, sna);
+		DeleteCallback(&FlushCallback, sna_accel_flush_callback, sna);
+		sna->watch_flush = 0;
+	}
+
 	sna->timer_ready = 0;
 }
 
commit 0e9afebe60550389bda61edd89a5e1d10fab363b
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon Mar 5 22:46:20 2012 +0000

    sna: Only install the flush callback for the duration of the foreign buffer
    
    After we are no longer sharing the bo with foreign clients, we no longer
    need to keep flushing before every X_Reply, and so we can remove the
    callbacks and avoid the overhead of having to check every time.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna.h b/src/sna/sna.h
index 1196cce..119244d 100644
--- a/src/sna/sna.h
+++ b/src/sna/sna.h
@@ -221,6 +221,7 @@ struct sna {
 #define SNA_NO_THROTTLE		0x1
 #define SNA_NO_DELAYED_FLUSH	0x2
 
+	unsigned watch_flush;
 	unsigned flush;
 
 	int timer[NUM_TIMERS];
@@ -560,6 +561,7 @@ Bool sna_accel_pre_init(struct sna *sna);
 Bool sna_accel_init(ScreenPtr screen, struct sna *sna);
 void sna_accel_block_handler(struct sna *sna);
 void sna_accel_wakeup_handler(struct sna *sna, fd_set *ready);
+void sna_accel_watch_flush(struct sna *sna, int enable);
 void sna_accel_close(struct sna *sna);
 void sna_accel_free(struct sna *sna);
 
diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 460fbb3..1dc0b99 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -345,8 +345,10 @@ static void sna_pixmap_free_cpu(struct sna *sna, struct sna_pixmap *priv)
 	if (priv->cpu_bo) {
 		DBG(("%s: discarding CPU buffer, handle=%d, size=%d\n",
 		     __FUNCTION__, priv->cpu_bo->handle, kgem_bo_size(priv->cpu_bo)));
-		if (priv->cpu_bo->sync)
+		if (priv->cpu_bo->sync) {
 			kgem_bo_sync__cpu(&sna->kgem, priv->cpu_bo);
+			sna_accel_watch_flush(sna, -1);
+		}
 		kgem_bo_destroy(&sna->kgem, priv->cpu_bo);
 		priv->cpu_bo = NULL;
 	} else
@@ -617,6 +619,7 @@ sna_pixmap_create_shm(ScreenPtr screen,
 					      bpp, pitch, addr);
 	}
 	kgem_bo_set_sync(&sna->kgem, priv->cpu_bo);
+	sna_accel_watch_flush(sna, 1);
 	priv->cpu_bo->pitch = pitch;
 
 	priv->header = true;
@@ -11479,6 +11482,7 @@ sna_accel_flush_callback(CallbackListPtr *list,
 	struct sna *sna = user_data;
 	struct list preserve;
 
+	assert(sna->watch_flush);
 	if (!sna->flush)
 		return;
 
@@ -11828,11 +11832,6 @@ Bool sna_accel_init(ScreenPtr screen, struct sna *sna)
 {
 	const char *backend;
 
-	if (!AddCallback(&ReplyCallback, sna_accel_reply_callback, sna))
-		return FALSE;
-	if (!AddCallback(&FlushCallback, sna_accel_flush_callback, sna))
-		return FALSE;
-
 	sna_font_key = AllocateFontPrivateIndex();
 	screen->RealizeFont = sna_realize_font;
 	screen->UnrealizeFont = sna_unrealize_font;
@@ -11928,6 +11927,26 @@ Bool sna_accel_create(struct sna *sna)
 	return TRUE;
 }
 
+void sna_accel_watch_flush(struct sna *sna, int enable)
+{
+	if (sna->watch_flush == 0) {
+		assert(enable > 0);
+		if (!AddCallback(&ReplyCallback, sna_accel_reply_callback, sna) ||
+		    AddCallback(&FlushCallback, sna_accel_flush_callback, sna)) {
+			xf86DrvMsg(sna->scrn->scrnIndex, X_ERROR,
+				   "Failed to attach ourselves to the flush callbacks, expect missing synchronisation with DRI clients (e.g a compositor)\n");
+		}
+	}
+
+	sna->watch_flush += enable;
+
+	if (sna->watch_flush == 0) {
+		assert(enable < 0);
+		DeleteCallback(&ReplyCallback, sna_accel_reply_callback, sna);
+		DeleteCallback(&FlushCallback, sna_accel_flush_callback, sna);
+	}
+}
+
 void sna_accel_close(struct sna *sna)
 {
 	if (sna->freed_pixmap) {
diff --git a/src/sna/sna_dri.c b/src/sna/sna_dri.c
index fe3d1cf..3909b84 100644
--- a/src/sna/sna_dri.c
+++ b/src/sna/sna_dri.c
@@ -178,6 +178,7 @@ static struct kgem_bo *sna_pixmap_set_dri(struct sna *sna,
 	 * As we don't track which Client, we flush for all.
 	 */
 	priv->flush = 1;
+	sna_accel_watch_flush(sna, 1);
 
 	/* Don't allow this named buffer to be replaced */
 	priv->pinned = 1;
@@ -324,6 +325,7 @@ static void _sna_dri_destroy_buffer(struct sna *sna, DRI2Buffer2Ptr buffer)
 
 			/* Undo the DRI markings on this pixmap */
 			list_del(&priv->list);
+			sna_accel_watch_flush(sna, -1);
 			priv->pinned = private->pixmap == sna->front;
 			priv->flush = 0;
 
commit 4cab5bfbc116e99b785eb3a077dd88985fe43f17
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon Mar 5 22:29:38 2012 +0000

    sna: Check for flush at the start of every WriteToClient
    
    The goal is simply to avoid the flush before going to sleep when we have
    no pending events. That is, we only want to flush when we know there will
    be at least one X_Reply sent to a Client. (Preferably, it would be a
    Damage reply!) We can safely assume that every WriteToClient marks the
    beginning of a new reply added to the Client output queue, and thus know
    that upon the next flush event we will be emitting a Reply and so need to
    submit our batches.
    
    Second attempt to fix a438e4ac.
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
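
For context, the hook uses the server's global callback lists; a minimal
sketch of the registration and the (now simplified) callback, assuming the
standard dix callback signature:

    static void
    sna_accel_reply_callback(CallbackListPtr *list,
                             pointer user_data, pointer call_data)
    {
        struct sna *sna = user_data;

        /* Treat every invocation as the start of a new reply. */
        if (sna->flush)
            return;

        sna->flush = (sna->kgem.flush || sna->kgem.sync ||
                      !list_is_empty(&sna->dirty_pixmaps));
    }

    /* registered once at init (or on demand, per the previous commit) */
    AddCallback(&ReplyCallback, sna_accel_reply_callback, sna);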

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 709f29d..460fbb3 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -11460,11 +11460,14 @@ sna_accel_reply_callback(CallbackListPtr *list,
 			 pointer user_data, pointer call_data)
 {
 	struct sna *sna = user_data;
-	ReplyInfoRec *info = call_data;
 
-	if (sna->flush || !info->startOfReply)
+	if (sna->flush)
 		return;
 
+	/* Assume each callback corresponds to a new request. The use
+	 * of continuation WriteToClients in the server is relatively rare,
+	 * and we err on the side of safety.
+	 */
 	sna->flush = (sna->kgem.flush || sna->kgem.sync ||
 		      !list_is_empty(&sna->dirty_pixmaps));
 }
commit 28333cae76f16f38030e5874cfac754744482b12
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sun Mar 4 22:23:39 2012 +0000

    sna/trapezoids: Elide empty cells
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_trapezoids.c b/src/sna/sna_trapezoids.c
index 2d8b3b9..33ea3bb 100644
--- a/src/sna/sna_trapezoids.c
+++ b/src/sna/sna_trapezoids.c
@@ -1249,18 +1249,19 @@ tor_blt(struct sna *sna,
 		       cell->x, cell->covered_height, cell->uncovered_area,
 		       cover, xmax));
 
-		box.x2 = x;
-		if (box.x2 > box.x1 && (unbounded || cover)) {
-			__DBG(("%s: span (%d, %d)x(%d, %d) @ %d\n", __FUNCTION__,
-			       box.x1, box.y1,
-			       box.x2 - box.x1,
-			       box.y2 - box.y1,
-			       cover));
-			span(sna, op, clip, &box, cover);
+		if (cell->covered_height || cell->uncovered_area) {
+			box.x2 = x;
+			if (box.x2 > box.x1 && (unbounded || cover)) {
+				__DBG(("%s: span (%d, %d)x(%d, %d) @ %d\n", __FUNCTION__,
+				       box.x1, box.y1,
+				       box.x2 - box.x1,
+				       box.y2 - box.y1,
+				       cover));
+				span(sna, op, clip, &box, cover);
+			}
+			box.x1 = box.x2;
+			cover += cell->covered_height*FAST_SAMPLES_X*2;
 		}
-		box.x1 = box.x2;
-
-		cover += cell->covered_height*FAST_SAMPLES_X*2;
 
 		if (cell->uncovered_area) {
 			int area = cover - cell->uncovered_area;
commit bee8a5528267a6ca674f046f94740bb71f08de61
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon Mar 5 21:05:34 2012 +0000

    sna/composite: Skip clipping the rectangle region against the singular clip
    
    As we will already have taken it into account when constructing the
    region from the rectangles.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
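
For context: a pixman region whose data pointer is NULL is "singular",
consisting of exactly one box equal to its extents. A sketch of the
resulting fast path, assuming that representation:

    /* Only intersect against clips with more than one box; a singular
     * clip was already applied while converting rects to boxes. */
    if (dst->pCompositeClip->data &&
        (!pixman_region_intersect(&region, &region, dst->pCompositeClip) ||
         region_is_empty(&region))) {
        pixman_region_fini(&region);
        return;
    }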

diff --git a/src/sna/sna_composite.c b/src/sna/sna_composite.c
index 55a496e..7f1d096 100644
--- a/src/sna/sna_composite.c
+++ b/src/sna/sna_composite.c
@@ -563,7 +563,7 @@ _pixman_region_init_clipped_rectangles(pixman_region16_t *region,
 				       unsigned int num_rects,
 				       xRectangle *rects,
 				       int tx, int ty,
-				       int maxx, int maxy)
+				       BoxPtr extents)
 {
 	pixman_box16_t stack_boxes[64], *boxes = stack_boxes;
 	pixman_bool_t ret;
@@ -576,25 +576,21 @@ _pixman_region_init_clipped_rectangles(pixman_region16_t *region,
 	}
 
 	for (i = j = 0; i < num_rects; i++) {
-		boxes[j].x1 = rects[i].x;
-		if (boxes[j].x1 < 0)
-			boxes[j].x1 = 0;
-		boxes[j].x1 += tx;
-
-		boxes[j].y1 = rects[i].y;
-		if (boxes[j].y1 < 0)
-			boxes[j].y1 = 0;
-		boxes[j].y1 += ty;
-
-		boxes[j].x2 = bound(rects[i].x, rects[i].width);
-		if (boxes[j].x2 > maxx)
-			boxes[j].x2 = maxx;
-		boxes[j].x2 += tx;
-
-		boxes[j].y2 = bound(rects[i].y, rects[i].height);
-		if (boxes[j].y2 > maxy)
-			boxes[j].y2 = maxy;
-		boxes[j].y2 += ty;
+		boxes[j].x1 = rects[i].x + tx;
+		if (boxes[j].x1 < extents->x1)
+			boxes[j].x1 = extents->x1;
+
+		boxes[j].y1 = rects[i].y + ty;
+		if (boxes[j].y1 < extents->y1)
+			boxes[j].y1 = extents->y1;
+
+		boxes[j].x2 = bound(rects[i].x + tx, rects[i].width);
+		if (boxes[j].x2 > extents->x2)
+			boxes[j].x2 = extents->x2;
+
+		boxes[j].y2 = bound(rects[i].y + ty, rects[i].height);
+		if (boxes[j].y2 > extents->y2)
+			boxes[j].y2 = extents->y2;
 
 		if (boxes[j].x2 > boxes[j].x1 && boxes[j].y2 > boxes[j].y1)
 			j++;
@@ -689,8 +685,9 @@ sna_composite_rectangles(CARD8		 op,
 
 	if (!_pixman_region_init_clipped_rectangles(&region,
 						    num_rects, rects,
-						    dst->pDrawable->x, dst->pDrawable->y,
-						    dst->pDrawable->width, dst->pDrawable->height))
+						    dst->pDrawable->x,
+						    dst->pDrawable->y,
+						    &dst->pCompositeClip->extents))
 	{
 		DBG(("%s: allocation failed for region\n", __FUNCTION__));
 		return;
@@ -702,8 +699,9 @@ sna_composite_rectangles(CARD8		 op,
 	     RegionExtents(&region)->x2, RegionExtents(&region)->y2,
 	     RegionNumRects(&region)));
 
-	if (!pixman_region_intersect(&region, &region, dst->pCompositeClip) ||
-	    region_is_empty(&region)) {
+	if (dst->pCompositeClip->data &&
+	    (!pixman_region_intersect(&region, &region, dst->pCompositeClip) ||
+	     region_is_empty(&region))) {
 		DBG(("%s: zero-intersection between rectangles and clip\n",
 		     __FUNCTION__));
 		pixman_region_fini(&region);
commit b46f9107c2765ac9e90051763136c882b8e57ab8
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon Mar 5 21:04:25 2012 +0000

    sna: Flush dirty CPU damage before notifying the compositor
    
    Fixes regression from a438e4ac (sna: Revamp vmap support)
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 595b834..709f29d 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -11465,7 +11465,8 @@ sna_accel_reply_callback(CallbackListPtr *list,
 	if (sna->flush || !info->startOfReply)
 		return;
 
-	sna->flush = sna->kgem.flush || sna->kgem.sync;
+	sna->flush = (sna->kgem.flush || sna->kgem.sync ||
+		      !list_is_empty(&sna->dirty_pixmaps));
 }
 
 static void
commit 34a00bfb7033846036ba6ed4c3e80cf7d7ccdf5f
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sun Mar 4 19:12:29 2012 +0000

    sna: Add some assertions to partial buffer list tracking
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index f913369..279face 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -1425,6 +1425,7 @@ static void kgem_finish_partials(struct kgem *kgem)
 	list_for_each_entry_safe(bo, next, &kgem->active_partials, base.list) {
 		assert(next->base.list.prev == &bo->base.list);
 		assert(bo->base.io);
+		assert(bo->base.refcnt >= 1);
 
 		if (!bo->base.exec)
 			continue;
@@ -3366,6 +3367,9 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 		flags &= ~KGEM_BUFFER_INPLACE;
 
 	list_for_each_entry(bo, &kgem->active_partials, base.list) {
+		assert(bo->base.io);
+		assert(bo->base.refcnt >= 1);
+
 		/* We can reuse any write buffer which we can fit */
 		if (flags == KGEM_BUFFER_LAST &&
 		    bo->write == KGEM_BUFFER_WRITE &&
@@ -3415,6 +3419,9 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 
 	if (flags & KGEM_BUFFER_WRITE) {
 		list_for_each_entry_reverse(bo, &kgem->inactive_partials, base.list) {
+			assert(bo->base.io);
+			assert(bo->base.refcnt == 1);
+
 			if (size > bytes(&bo->base))
 				continue;
 
@@ -3691,6 +3698,7 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 init:
 	bo->base.reusable = false;
 	assert(num_pages(&bo->base) == alloc);
+	assert(bo->base.io);
 	assert(!bo->need_io || !bo->base.needs_flush);
 	assert(!bo->need_io || bo->base.domain != DOMAIN_GPU);
 
commit edbe0fa3e0891bfb9001824bcfda5fc29f2507f2
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sun Mar 4 15:48:33 2012 +0000

    sna: Fix assertion for checking inactive shadow buffers
    
    We may have an ordinary malloc with no CPU bo attached, so check before
    dereferencing.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index fc44b47..595b834 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -11766,7 +11766,7 @@ static void sna_accel_inactive(struct sna *sna)
 			sna_damage_destroy(&priv->cpu_damage);
 			list_del(&priv->list);
 
-			assert(!priv->cpu_bo->sync);
+			assert(priv->cpu_bo == NULL || !priv->cpu_bo->sync);
 			sna_pixmap_free_cpu(sna, priv);
 			priv->undamaged = false;
 
commit 39f862bb17e4724e45bb809b8ac4db050cdbbcac
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Mar 2 23:31:24 2012 +0000

    sna: Encourage promotion of snooped CPU bo to real GPU bo
    
    This fixes the performance regression of fishietank on gen2. As
    the texture atlas is too large to be tiled, one might presume that it
    has the same performance characteristics as the snooped linear CPU
    buffer. It does not. Therefore, if we attempt to reuse a vmap bo, we
    promote it to a full GPU bo. This hopefully keeps the benefit of
    avoiding the copy for single-shot sources, while still giving us the
    benefit of avoiding the clflushes.
    
    On the plus side, it does prove that gen2 handles snoopable memory from
    both the blitter and the sampler!
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
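
The heuristic reduces to a couple of tests in use_cpu_bo(); a condensed
sketch using the names from the diff below (SOURCE_BIAS is the pre-existing
reuse threshold):

    /* A snooped (vmap) CPU bo that keeps being sampled: returning NULL
     * forces the caller to upload it into a real GPU bo instead. */
    if (priv->cpu_bo->vmap && priv->source_count > SOURCE_BIAS)
        return NULL;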

diff --git a/src/sna/sna_render.c b/src/sna/sna_render.c
index 0266ea4..a345962 100644
--- a/src/sna/sna_render.c
+++ b/src/sna/sna_render.c
@@ -323,7 +323,7 @@ use_cpu_bo(struct sna *sna, PixmapPtr pixmap, const BoxRec *box)
 		}
 
 		if (priv->gpu_bo->tiling != I915_TILING_NONE &&
-		    priv->cpu_bo->pitch >= 4096) {
+		    (priv->cpu_bo->vmap || priv->cpu_bo->pitch >= 4096)) {
 			DBG(("%s: GPU bo exists and is tiled [%d], upload\n",
 			     __FUNCTION__, priv->gpu_bo->tiling));
 			return NULL;
@@ -332,21 +332,23 @@ use_cpu_bo(struct sna *sna, PixmapPtr pixmap, const BoxRec *box)
 		int w = box->x2 - box->x1;
 		int h = box->y2 - box->y1;
 
-		if ((priv->create & KGEM_CAN_CREATE_GPU) == 0)
-			goto done;
+		if (priv->cpu_bo->vmap && priv->source_count > SOURCE_BIAS) {
+			DBG(("%s: promoting snooped CPU bo due to reuse\n",
+			     __FUNCTION__));
+			return NULL;
+		}
 
-		if (priv->source_count*w*h >= pixmap->drawable.width * pixmap->drawable.height &&
-		    I915_TILING_NONE != kgem_choose_tiling(&sna->kgem, I915_TILING_X,
-							   pixmap->drawable.width,
-							   pixmap->drawable.height,
-							   pixmap->drawable.bitsPerPixel)) {
+		if (priv->source_count++*w*h >= (int)pixmap->drawable.width * pixmap->drawable.height &&
+		     I915_TILING_NONE != kgem_choose_tiling(&sna->kgem, I915_TILING_X,
+							    pixmap->drawable.width,
+							    pixmap->drawable.height,
+							    pixmap->drawable.bitsPerPixel)) {
 			DBG(("%s: pitch (%d) requires tiling\n",
 			     __FUNCTION__, priv->cpu_bo->pitch));
 			return NULL;
 		}
 	}
 
-done:
 	DBG(("%s for box=(%d, %d), (%d, %d)\n",
 	     __FUNCTION__, box->x1, box->y1, box->x2, box->y2));
 	return priv->cpu_bo;
@@ -528,7 +530,7 @@ sna_render_pixmap_bo(struct sna *sna,
 
 		if (priv->cpu_bo &&
 		    (DAMAGE_IS_ALL(priv->cpu_damage) || !priv->gpu_damage) &&
-		    priv->cpu_bo->pitch < 4096) {
+		    !priv->cpu_bo->vmap && priv->cpu_bo->pitch < 4096) {
 			channel->bo = kgem_bo_reference(priv->cpu_bo);
 			return 1;
 		}
commit 618d0e531d6735a22a46f70b2c410234b53bc462
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Mar 2 20:18:32 2012 +0000

    sna: Align allocations with partial buffers to 64 bytes.
    
    A magic number required by so many functions of the GPU. In this
    particular case, it is likely that the offset of a texture in the
    GTT must have a minimum alignment of 64 bytes.
    
    Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=46415
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
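
A minimal sketch of the change, assuming the usual power-of-two ALIGN
helper (the real macro lives elsewhere in kgem):

    #define ALIGN(v, a) (((v) + (a) - 1) & ~((a) - 1))

    /* Round the bytes consumed up to the next 64-byte boundary so the
     * following sub-allocation satisfies the GTT texture-offset rule. */
    bo->used = ALIGN(bo->used, 64);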

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index cc80278..f913369 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -3391,6 +3391,7 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 				     __FUNCTION__, bo->write, flags));
 				continue;
 			}
+			assert(bo->mmapped || bo->need_io);
 		} else {
 			if (bo->write & KGEM_BUFFER_WRITE) {
 				DBG(("%s: skip write %x buffer, need %x\n",
@@ -3548,6 +3549,9 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 			list_init(&bo->base.list);
 			free(old);
 
+			assert(bo->base.tiling == I915_TILING_NONE);
+			assert(num_pages(&bo->base) >= NUM_PAGES(size));
+
 			bo->mem = kgem_bo_map(kgem, &bo->base);
 			if (bo->mem) {
 				bo->need_io = false;
@@ -3564,11 +3568,11 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 		}
 	}
 #else
-	alloc = ALIGN(size, 64*1024) / PAGE_SIZE;
+	flags &= ~KGEM_BUFFER_INPLACE;
 #endif
 	/* Be more parsimonious with pwrite/pread buffers */
 	if ((flags & KGEM_BUFFER_INPLACE) == 0)
-		alloc = PAGE_ALIGN(size) / PAGE_SIZE;
+		alloc = NUM_PAGES(size);
 	flags &= ~KGEM_BUFFER_INPLACE;
 
 	if (kgem->has_vmap) {
@@ -3700,6 +3704,7 @@ init:
 	     __FUNCTION__, alloc, bo->base.handle));
 
 done:
+	bo->used = ALIGN(bo->used, 64);
 	/* adjust the position within the list to maintain decreasing order */
 	alloc = bytes(&bo->base) - bo->used;
 	{
commit 1ccb1d69a1f12cd9daa58da7151a79818ecaa9b7
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Mar 2 18:18:48 2012 +0000

    sna: Silence an assertion failure during shutdown
    
    Clear the scanout flag on the front buffer during teardown to silence
    the debugger.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_driver.c b/src/sna/sna_driver.c
index bcd1191..e53b75f 100644
--- a/src/sna/sna_driver.c
+++ b/src/sna/sna_driver.c
@@ -784,6 +784,9 @@ static Bool sna_close_screen(int scrnIndex, ScreenPtr screen)
 
 	sna_mode_remove_fb(sna);
 	if (sna->front) {
+		struct kgem_bo *bo = sna_pixmap_get_bo(sna->front);
+		if (bo)
+			kgem_bo_clear_scanout(&sna->kgem, bo); /* valgrind */
 		screen->DestroyPixmap(sna->front);
 		sna->front = NULL;
 	}
commit bbf8b6b365c5e8e318d32a9442d4c352afe930f9
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Mar 2 18:11:56 2012 +0000

    sna: And fix compilation for last commit
    
    I skipped a GCC warning about the implicit function declaration, which
    of course resulted in a silent death at runtime. Oops.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_composite.c b/src/sna/sna_composite.c
index 5b81596..55a496e 100644
--- a/src/sna/sna_composite.c
+++ b/src/sna/sna_composite.c
@@ -740,15 +740,18 @@ sna_composite_rectangles(CARD8		 op,
 		goto fallback;
 	}
 
+	priv = sna_pixmap(pixmap);
+	if (priv == NULL) {
+		DBG(("%s: fallback, not attached\n", __FUNCTION__));
+		goto fallback;
+	}
+
 	/* If we going to be overwriting any CPU damage with a subsequent
 	 * operation, then we may as well delete it without moving it
 	 * first to the GPU.
 	 */
-	if (op <= PictOpSrc) {
-		priv = sna_pixmap_attach(pixmap);
-		if (priv)
-			sna_damage_subtract(&priv->cpu_damage, &region);
-	}
+	if (op <= PictOpSrc)
+		sna_damage_subtract(&priv->cpu_damage, &region);
 
 	priv = sna_pixmap_move_to_gpu(pixmap, MOVE_READ | MOVE_WRITE);
 	if (priv == NULL) {
commit 1c4e2ac2c1c584f98ffc51eb95ebcfbc8f8d659a
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Mar 2 17:36:50 2012 +0000

    sna: Prevent backing pixmaps being created later
    
    We used to allow the backing pixmap to be created later in order to
    accommodate ShmPixmaps and ShmPutImage. However, they are now correctly
    handled upfront if we choose to accelerate those paths, and so all
    choices over whether to attach to a pixmap are made during creation and
    are invariant.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna.h b/src/sna/sna.h
index c772d7d..1196cce 100644
--- a/src/sna/sna.h
+++ b/src/sna/sna.h
@@ -415,18 +415,6 @@ static inline Bool pixmap_is_scanout(PixmapPtr pixmap)
 	return pixmap == screen->GetScreenPixmap(screen);
 }
 
-struct sna_pixmap *_sna_pixmap_attach(PixmapPtr pixmap);
-inline static struct sna_pixmap *sna_pixmap_attach(PixmapPtr pixmap)
-{
-	struct sna_pixmap *priv;
-
-	priv = sna_pixmap(pixmap);
-	if (priv)
-		return priv;
-
-	return _sna_pixmap_attach(pixmap);
-}
-
 PixmapPtr sna_pixmap_create_upload(ScreenPtr screen,
 				   int width, int height, int depth,
 				   unsigned flags);
diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index c427808..fc44b47 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -523,8 +523,7 @@ _sna_pixmap_reset(PixmapPtr pixmap)
 	return _sna_pixmap_init(priv, pixmap);
 }
 
-static struct sna_pixmap *__sna_pixmap_attach(struct sna *sna,
-					      PixmapPtr pixmap)
+static struct sna_pixmap *sna_pixmap_attach(struct sna *sna, PixmapPtr pixmap)
 {
 	struct sna_pixmap *priv;
 
@@ -536,50 +535,6 @@ static struct sna_pixmap *__sna_pixmap_attach(struct sna *sna,
 	return _sna_pixmap_init(priv, pixmap);
 }
 
-struct sna_pixmap *_sna_pixmap_attach(PixmapPtr pixmap)
-{
-	struct sna *sna = to_sna_from_pixmap(pixmap);
-	struct sna_pixmap *priv;
-
-	DBG(("%s: serial=%ld, %dx%d, usage=%d\n",
-	     __FUNCTION__,
-	     pixmap->drawable.serialNumber,
-	     pixmap->drawable.width,
-	     pixmap->drawable.height,
-	     pixmap->usage_hint));
-
-	switch (pixmap->usage_hint) {
-	case CREATE_PIXMAP_USAGE_GLYPH_PICTURE:
-		DBG(("%s: not attaching due to crazy usage: %d\n",
-		     __FUNCTION__, pixmap->usage_hint));
-		return NULL;
-
-	case SNA_CREATE_FB:
-		/* We assume that the Screen pixmap will be pre-validated */
-		break;
-
-	default:
-		if (!kgem_can_create_2d(&sna->kgem,
-					pixmap->drawable.width,
-					pixmap->drawable.height,
-					pixmap->drawable.depth))
-			return NULL;
-		break;
-	}
-
-	priv = __sna_pixmap_attach(sna, pixmap);
-	if (priv == NULL)
-		return NULL;
-
-	DBG(("%s: created priv and marking all cpu damaged\n", __FUNCTION__));
-
-	sna_damage_all(&priv->cpu_damage,
-		       pixmap->drawable.width,
-		       pixmap->drawable.height);
-
-	return priv;
-}
-
 static inline PixmapPtr
 create_pixmap(struct sna *sna, ScreenPtr screen,
 	      int width, int height, int depth,
@@ -647,7 +602,7 @@ sna_pixmap_create_shm(ScreenPtr screen,
 		pixmap->drawable.depth = depth;
 		pixmap->drawable.bitsPerPixel = bpp;
 
-		priv = __sna_pixmap_attach(sna, pixmap);
+		priv = sna_pixmap_attach(sna, pixmap);
 		if (!priv) {
 			fbDestroyPixmap(pixmap);
 			return NullPixmap;
@@ -729,7 +684,7 @@ sna_pixmap_create_scratch(ScreenPtr screen,
 		pixmap->drawable.depth = depth;
 		pixmap->drawable.bitsPerPixel = bpp;
 
-		priv = __sna_pixmap_attach(sna, pixmap);
+		priv = sna_pixmap_attach(sna, pixmap);
 		if (!priv) {
 			fbDestroyPixmap(pixmap);
 			return NullPixmap;
@@ -770,9 +725,7 @@ static PixmapPtr sna_create_pixmap(ScreenPtr screen,
 		usage = -1;
 		goto fallback;
 	}
-
-	if (!sna->have_render)
-		goto fallback;
+	assert(width && height);
 
 	flags = kgem_can_create_2d(&sna->kgem, width, height, depth);
 	if (flags == 0) {
@@ -811,7 +764,7 @@ static PixmapPtr sna_create_pixmap(ScreenPtr screen,
 		if (pixmap == NullPixmap)
 			return NullPixmap;
 
-		__sna_pixmap_attach(sna, pixmap);
+		sna_pixmap_attach(sna, pixmap);
 	} else {
 		struct sna_pixmap *priv;
 
@@ -827,7 +780,7 @@ static PixmapPtr sna_create_pixmap(ScreenPtr screen,
 		pixmap->devKind = pad;
 		pixmap->devPrivate.ptr = NULL;
 
-		priv = __sna_pixmap_attach(sna, pixmap);
+		priv = sna_pixmap_attach(sna, pixmap);
 		if (priv == NULL) {
 			free(pixmap);
 			goto fallback;
@@ -2080,7 +2033,7 @@ sna_pixmap_force_to_gpu(PixmapPtr pixmap, unsigned flags)
 
 	DBG(("%s(pixmap=%p)\n", __FUNCTION__, pixmap));
 
-	priv = sna_pixmap_attach(pixmap);
+	priv = sna_pixmap(pixmap);
 	if (priv == NULL)
 		return NULL;
 
diff --git a/src/sna/sna_blt.c b/src/sna/sna_blt.c
index 8c51a77..8420730 100644
--- a/src/sna/sna_blt.c
+++ b/src/sna/sna_blt.c
@@ -1395,7 +1395,7 @@ prepare_blt_put(struct sna *sna,
 	op->done = nop_done;
 
 	src_bo = NULL;
-	priv = _sna_pixmap_attach(src);
+	priv = sna_pixmap(src);
 	if (priv)
 		src_bo = priv->cpu_bo;
 	if (src_bo) {
diff --git a/src/sna/sna_render.c b/src/sna/sna_render.c
index 572d6ea..0266ea4 100644
--- a/src/sna/sna_render.c
+++ b/src/sna/sna_render.c
@@ -375,7 +375,7 @@ move_to_gpu(PixmapPtr pixmap, const BoxRec *box)
 	if (w == pixmap->drawable.width && h == pixmap->drawable.height) {
 		bool upload;
 
-		priv = sna_pixmap_attach(pixmap);
+		priv = sna_pixmap(pixmap);
 		if (!priv)
 			return false;
 
@@ -400,7 +400,7 @@ move_to_gpu(PixmapPtr pixmap, const BoxRec *box)
 	if (64*w*h < pixmap->drawable.width * pixmap->drawable.height)
 		return FALSE;
 
-	priv = sna_pixmap_attach(pixmap);
+	priv = sna_pixmap(pixmap);
 	if (!priv)
 		return FALSE;
 
commit 6f286faa6826492ce1c7ce099bba8bdca3470ba4
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Mar 2 16:06:59 2012 +0000

    sna: Disable vmap on 965gm
    
    The sampler just dies if it encounters a snoopable page, for no apparent
    reason. Whilst I encountered the bug on Crestline, disable it for the
    rest of gen4 just to be safe.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 1ec9fb4..cc80278 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -599,6 +599,8 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, int gen)
 #if defined(USE_VMAP)
 	if (!DBG_NO_VMAP)
 		kgem->has_vmap = gem_param(kgem, I915_PARAM_HAS_VMAP) > 0;
+	if (gen == 40)
+		kgem->has_vmap = false; /* sampler dies with snoopable memory */
 #endif
 	DBG(("%s: using vmap=%d\n", __FUNCTION__, kgem->has_vmap));
 
commit fb210329af27a6e9639073925d2a792900e40416
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Mar 2 14:34:23 2012 +0000

    sna: Pass usage hint for creating linear buffers
    
    As we wish to immediately map the vertex buffers, it is beneficial to
    first search the linear cache for an existing mapping to reuse.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
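
A usage sketch of the extended interface, as it appears in the diffs below;
on LLC hardware the GTT hint is transparently demoted to a CPU map:

    /* Request a vertex buffer we can map immediately for writing. */
    sna->render.vbo = kgem_create_linear(&sna->kgem, 256*1024, CREATE_GTT_MAP);
    if (sna->render.vbo)
        sna->render.vertices = kgem_bo_map(&sna->kgem, sna->render.vbo);

    /* Plain allocations pass 0 and keep the old behaviour. */
    bo = kgem_create_linear(&sna->kgem, 4*sna->render.vertex_used, 0);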

diff --git a/src/sna/gen3_render.c b/src/sna/gen3_render.c
index bd1eddd..78c7ea0 100644
--- a/src/sna/gen3_render.c
+++ b/src/sna/gen3_render.c
@@ -1642,7 +1642,8 @@ static int gen3_vertex_finish(struct sna *sna)
 	}
 
 	sna->render.vertices = NULL;
-	sna->render.vbo = kgem_create_linear(&sna->kgem, 256*1024);
+	sna->render.vbo = kgem_create_linear(&sna->kgem,
+					     256*1024, CREATE_GTT_MAP);
 	if (sna->render.vbo)
 		sna->render.vertices = kgem_bo_map(&sna->kgem, sna->render.vbo);
 	if (sna->render.vertices == NULL) {
@@ -1702,7 +1703,7 @@ static void gen3_vertex_close(struct sna *sna)
 			DBG(("%s: new vbo: %d\n", __FUNCTION__,
 			     sna->render.vertex_used));
 			bo = kgem_create_linear(&sna->kgem,
-						4*sna->render.vertex_used);
+						4*sna->render.vertex_used, 0);
 			if (bo)
 				kgem_bo_write(&sna->kgem, bo,
 					      sna->render.vertex_data,
diff --git a/src/sna/gen4_render.c b/src/sna/gen4_render.c
index 6ba59ee..97af7fc 100644
--- a/src/sna/gen4_render.c
+++ b/src/sna/gen4_render.c
@@ -398,7 +398,8 @@ static int gen4_vertex_finish(struct sna *sna)
 	}
 
 	sna->render.vertices = NULL;
-	sna->render.vbo = kgem_create_linear(&sna->kgem, 256*1024);
+	sna->render.vbo = kgem_create_linear(&sna->kgem,
+					     256*1024, CREATE_GTT_MAP);
 	if (sna->render.vbo)
 		sna->render.vertices = kgem_bo_map(&sna->kgem, sna->render.vbo);
 	if (sna->render.vertices == NULL) {
@@ -442,7 +443,8 @@ static void gen4_vertex_close(struct sna *sna)
 			bo = NULL;
 			sna->kgem.nbatch += sna->render.vertex_used;
 		} else {
-			bo = kgem_create_linear(&sna->kgem, 4*sna->render.vertex_used);
+			bo = kgem_create_linear(&sna->kgem,
+						4*sna->render.vertex_used, 0);
 			if (bo && !kgem_bo_write(&sna->kgem, bo,
 						 sna->render.vertex_data,
 						 4*sna->render.vertex_used)) {
diff --git a/src/sna/gen5_render.c b/src/sna/gen5_render.c
index bccd343..18325b5 100644
--- a/src/sna/gen5_render.c
+++ b/src/sna/gen5_render.c
@@ -390,7 +390,8 @@ static int gen5_vertex_finish(struct sna *sna)
 	}
 
 	sna->render.vertices = NULL;
-	sna->render.vbo = kgem_create_linear(&sna->kgem, 256*1024);
+	sna->render.vbo = kgem_create_linear(&sna->kgem,
+					     256*1024, CREATE_GTT_MAP);
 	if (sna->render.vbo)
 		sna->render.vertices = kgem_bo_map(&sna->kgem, sna->render.vbo);
 	if (sna->render.vertices == NULL) {
@@ -447,7 +448,8 @@ static void gen5_vertex_close(struct sna *sna)
 			bo = NULL;
 			sna->kgem.nbatch += sna->render.vertex_used;
 		} else {
-			bo = kgem_create_linear(&sna->kgem, 4*sna->render.vertex_used);
+			bo = kgem_create_linear(&sna->kgem,
+						4*sna->render.vertex_used, 0);
 			if (bo && !kgem_bo_write(&sna->kgem, bo,
 						 sna->render.vertex_data,
 						 4*sna->render.vertex_used)) {
diff --git a/src/sna/gen6_render.c b/src/sna/gen6_render.c
index 439fb52..71c0046 100644
--- a/src/sna/gen6_render.c
+++ b/src/sna/gen6_render.c
@@ -964,9 +964,10 @@ static int gen6_vertex_finish(struct sna *sna)
 	}
 
 	sna->render.vertices = NULL;
-	sna->render.vbo = kgem_create_linear(&sna->kgem, 256*1024);
+	sna->render.vbo = kgem_create_linear(&sna->kgem,
+					     256*1024, CREATE_GTT_MAP);
 	if (sna->render.vbo)
-		sna->render.vertices = kgem_bo_map__cpu(&sna->kgem, sna->render.vbo);
+		sna->render.vertices = kgem_bo_map(&sna->kgem, sna->render.vbo);
 	if (sna->render.vertices == NULL) {
 		kgem_bo_destroy(&sna->kgem, sna->render.vbo);
 		sna->render.vbo = NULL;
@@ -1024,7 +1025,8 @@ static void gen6_vertex_close(struct sna *sna)
 			bo = NULL;
 			sna->kgem.nbatch += sna->render.vertex_used;
 		} else {
-			bo = kgem_create_linear(&sna->kgem, 4*sna->render.vertex_used);
+			bo = kgem_create_linear(&sna->kgem,
+						4*sna->render.vertex_used, 0);
 			if (bo && !kgem_bo_write(&sna->kgem, bo,
 						 sna->render.vertex_data,
 						 4*sna->render.vertex_used)) {
diff --git a/src/sna/gen7_render.c b/src/sna/gen7_render.c
index e3d9757..a401d94 100644
--- a/src/sna/gen7_render.c
+++ b/src/sna/gen7_render.c
@@ -1065,9 +1065,10 @@ static int gen7_vertex_finish(struct sna *sna)
 	}
 
 	sna->render.vertices = NULL;
-	sna->render.vbo = kgem_create_linear(&sna->kgem, 256*1024);
+	sna->render.vbo = kgem_create_linear(&sna->kgem,
+					     256*1024, CREATE_GTT_MAP);
 	if (sna->render.vbo)
-		sna->render.vertices = kgem_bo_map__cpu(&sna->kgem, sna->render.vbo);
+		sna->render.vertices = kgem_bo_map(&sna->kgem, sna->render.vbo);
 	if (sna->render.vertices == NULL) {
 		kgem_bo_destroy(&sna->kgem, sna->render.vbo);
 		sna->render.vbo = NULL;
@@ -1121,7 +1122,8 @@ static void gen7_vertex_close(struct sna *sna)
 			bo = NULL;
 			sna->kgem.nbatch += sna->render.vertex_used;
 		} else {
-			bo = kgem_create_linear(&sna->kgem, 4*sna->render.vertex_used);
+			bo = kgem_create_linear(&sna->kgem,
+						4*sna->render.vertex_used, 0);
 			if (bo && !kgem_bo_write(&sna->kgem, bo,
 						 sna->render.vertex_data,
 						 4*sna->render.vertex_used)) {
diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 5776a4f..1ec9fb4 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -1687,7 +1687,7 @@ void _kgem_submit(struct kgem *kgem)
 		size = compact_batch_surface(kgem);
 	else
 		size = kgem->nbatch * sizeof(kgem->batch[0]);
-	rq->bo = kgem_create_linear(kgem, size);
+	rq->bo = kgem_create_linear(kgem, size, 0);
 	if (rq->bo) {
 		uint32_t handle = rq->bo->handle;
 		int i;
@@ -2188,15 +2188,20 @@ struct kgem_bo *kgem_create_for_name(struct kgem *kgem, uint32_t name)
 	return bo;
 }
 
-struct kgem_bo *kgem_create_linear(struct kgem *kgem, int size)
+struct kgem_bo *kgem_create_linear(struct kgem *kgem, int size, unsigned flags)
 {
 	struct kgem_bo *bo;
 	uint32_t handle;
 
 	DBG(("%s(%d)\n", __FUNCTION__, size));
 
+	if (flags & CREATE_GTT_MAP && kgem->has_llc) {
+		flags &= ~CREATE_GTT_MAP;
+		flags |= CREATE_CPU_MAP;
+	}
+
 	size = (size + PAGE_SIZE - 1) / PAGE_SIZE;
-	bo = search_linear_cache(kgem, size, CREATE_INACTIVE);
+	bo = search_linear_cache(kgem, size, CREATE_INACTIVE | flags);
 	if (bo)
 		return kgem_bo_reference(bo);
 
diff --git a/src/sna/kgem.h b/src/sna/kgem.h
index 446ac68..9abb72a 100644
--- a/src/sna/kgem.h
+++ b/src/sna/kgem.h
@@ -192,7 +192,7 @@ struct kgem_bo *kgem_create_map(struct kgem *kgem,
 
 struct kgem_bo *kgem_create_for_name(struct kgem *kgem, uint32_t name);
 
-struct kgem_bo *kgem_create_linear(struct kgem *kgem, int size);
+struct kgem_bo *kgem_create_linear(struct kgem *kgem, int size, unsigned flags);
 struct kgem_bo *kgem_create_proxy(struct kgem_bo *target,
 				  int offset, int length);
 
diff --git a/src/sna/sna_gradient.c b/src/sna/sna_gradient.c
index 96841dd..943cbf9 100644
--- a/src/sna/sna_gradient.c
+++ b/src/sna/sna_gradient.c
@@ -168,7 +168,7 @@ sna_render_get_gradient(struct sna *sna,
 	     width/2, pixman_image_get_data(image)[width/2],
 	     width-1, pixman_image_get_data(image)[width-1]));
 
-	bo = kgem_create_linear(&sna->kgem, width*4);
+	bo = kgem_create_linear(&sna->kgem, width*4, 0);
 	if (!bo) {
 		pixman_image_unref(image);
 		return NULL;
@@ -248,7 +248,7 @@ sna_render_finish_solid(struct sna *sna, bool force)
 
 	DBG(("sna_render_finish_solid reset\n"));
 
-	cache->cache_bo = kgem_create_linear(&sna->kgem, sizeof(cache->color));
+	cache->cache_bo = kgem_create_linear(&sna->kgem, sizeof(cache->color), 0);
 	cache->bo[0] = kgem_create_proxy(cache->cache_bo, 0, sizeof(uint32_t));
 	cache->bo[0]->pitch = 4;
 	if (force)
@@ -316,7 +316,7 @@ static Bool sna_alpha_cache_init(struct sna *sna)
 
 	DBG(("%s\n", __FUNCTION__));
 
-	cache->cache_bo = kgem_create_linear(&sna->kgem, sizeof(color));
+	cache->cache_bo = kgem_create_linear(&sna->kgem, sizeof(color), 0);
 	if (!cache->cache_bo)
 		return FALSE;
 
@@ -338,7 +338,7 @@ static Bool sna_solid_cache_init(struct sna *sna)
 	DBG(("%s\n", __FUNCTION__));
 
 	cache->cache_bo =
-		kgem_create_linear(&sna->kgem, sizeof(cache->color));
+		kgem_create_linear(&sna->kgem, sizeof(cache->color), 0);
 	if (!cache->cache_bo)
 		return FALSE;
 
diff --git a/src/sna/sna_stream.c b/src/sna/sna_stream.c
index d6d817d..7f05d21 100644
--- a/src/sna/sna_stream.c
+++ b/src/sna/sna_stream.c
@@ -87,7 +87,7 @@ struct kgem_bo *sna_static_stream_fini(struct sna *sna,
 
 	DBG(("uploaded %d bytes of static state\n", stream->used));
 
-	bo = kgem_create_linear(&sna->kgem, stream->used);
+	bo = kgem_create_linear(&sna->kgem, stream->used, 0);
 	if (bo && !kgem_bo_write(&sna->kgem, bo, stream->data, stream->used)) {
 		kgem_bo_destroy(&sna->kgem, bo);
 		return NULL;
diff --git a/src/sna/sna_video.c b/src/sna/sna_video.c
index ebc3860..56cf260 100644
--- a/src/sna/sna_video.c
+++ b/src/sna/sna_video.c
@@ -104,7 +104,8 @@ sna_video_buffer(struct sna *sna,
 		sna_video_free_buffers(sna, video);
 
 	if (video->buf == NULL)
-		video->buf = kgem_create_linear(&sna->kgem, frame->size);
+		video->buf = kgem_create_linear(&sna->kgem, frame->size,
+						CREATE_GTT_MAP);
 
 	return video->buf;
 }
diff --git a/src/sna/sna_video_textured.c b/src/sna/sna_video_textured.c
index 1aaf972..a71751c 100644
--- a/src/sna/sna_video_textured.c
+++ b/src/sna/sna_video_textured.c
@@ -273,7 +273,8 @@ sna_video_textured_put_image(ScrnInfoPtr scrn,
 
 		assert(kgem_bo_size(frame.bo) >= frame.size);
 	} else {
-		frame.bo = kgem_create_linear(&sna->kgem, frame.size);
+		frame.bo = kgem_create_linear(&sna->kgem, frame.size,
+					      CREATE_GTT_MAP);
 		if (frame.bo == NULL) {
 			DBG(("%s: failed to allocate bo\n", __FUNCTION__));
 			return BadAlloc;
commit a929e9f19f725ac3c656cf29aac8eb1e9a11fa59
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Mar 2 10:01:07 2012 +0000

    sna: Only discard the inplace flag for LLC partial buffers
    
    KGEM_BUFFER_WRITE_INPLACE is WRITE | INPLACE and so the typo prevented
    uploading of partial data through the pwrite paths.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
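
To see why the typo mattered, consider the flag relationship the message
describes; the values here are illustrative only (the real definitions live
in kgem.h):

    #define KGEM_BUFFER_WRITE         0x1   /* illustrative values */
    #define KGEM_BUFFER_INPLACE       0x2
    #define KGEM_BUFFER_WRITE_INPLACE (KGEM_BUFFER_WRITE | KGEM_BUFFER_INPLACE)

    flags &= ~KGEM_BUFFER_WRITE_INPLACE; /* bug: also clears WRITE   */
    flags &= ~KGEM_BUFFER_INPLACE;       /* fix: clears only INPLACE */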

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index e4ff6a7..5776a4f 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -3356,7 +3356,7 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 	assert(size <= kgem->max_object_size);
 
 	if (kgem->has_llc)
-		flags &= ~KGEM_BUFFER_WRITE_INPLACE;
+		flags &= ~KGEM_BUFFER_INPLACE;
 
 	list_for_each_entry(bo, &kgem->active_partials, base.list) {
 		/* We can reuse any write buffer which we can fit */
commit 757d82b214e243f7b3b6b79de48e0e1cba3889ea
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Mar 2 09:47:10 2012 +0000

    sna: Be careful not to discard the clear operation for move-region-to-cpu
    
    When we move only a region to the CPU and detect a pending clear, we
    transform the operation into a move of the whole pixmap. In such
    situations, we only have a partial damage area and so need to OR in
    MOVE_READ to prevent the pending clear of the whole pixmap from being
    discarded.
    
    References: https://bugs.freedesktop.org/show_bug.cgi?id=46792
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
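
The interaction being guarded against is visible in the first hunk below: a
write-only move discards any pending clear. A condensed sketch:

    /* Write-only moves may discard the pending clear ... */
    if ((flags & MOVE_READ) == 0) {
        sna_damage_destroy(&priv->gpu_damage);
        priv->clear = false;
    }

    /* ... so when widening a partial-region move to the whole pixmap,
     * force MOVE_READ to keep the clear alive for the undamaged area. */
    return _sna_pixmap_move_to_cpu(pixmap, flags | MOVE_READ);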

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 52e75c7..c427808 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -922,6 +922,8 @@ _sna_pixmap_move_to_cpu(PixmapPtr pixmap, unsigned int flags)
 
 	if ((flags & MOVE_READ) == 0) {
 		assert(flags & MOVE_WRITE);
+		DBG(("%s: no readbck, discarding gpu damage [%d], pending clear[%d]\n",
+		     __FUNCTION__, priv->gpu_damage != NULL, priv->clear));
 		sna_damage_destroy(&priv->gpu_damage);
 		priv->clear = false;
 
@@ -1273,9 +1275,6 @@ sna_drawable_move_region_to_cpu(DrawablePtr drawable,
 		goto out;
 	}
 
-	if (priv->clear)
-		return _sna_pixmap_move_to_cpu(pixmap, flags);
-
 	if (priv->gpu_bo == NULL &&
 	    (priv->create & KGEM_CAN_CREATE_GPU) == 0 &&
 	    flags & MOVE_WRITE)
@@ -1293,6 +1292,13 @@ sna_drawable_move_region_to_cpu(DrawablePtr drawable,
 		return _sna_pixmap_move_to_cpu(pixmap, flags);
 	}
 
+	if (priv->clear) {
+		DBG(("%s: pending clear, moving whole pixmap\n", __FUNCTION__));
+		if (dx | dy)
+			RegionTranslate(region, -dx, -dy);
+		return _sna_pixmap_move_to_cpu(pixmap, flags | MOVE_READ);
+	}
+
 	if ((flags & MOVE_READ) == 0) {
 		DBG(("%s: no read, checking to see if we can stream the write into the GPU bo\n",
 		     __FUNCTION__));
commit ea00eac3575a1201da555b0ebe4934ea6334aa1b
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Mar 1 17:54:51 2012 +0000

    sna/gen5: Help the compiler avoid an uncached read
    
    Debug builds are excruciatingly slow as the compiler doesn't store the
    temporary in a register but uses an uncached readback instead. Maybe
    this will help...
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
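
A sketch of the issue: the vertex array may live in write-combining
(GTT-mapped) memory, where reading back a just-written element is uncached
and very slow. Chaining the assignments keeps the common value in a
register:

    /* slow in debug builds: v[2] is read back from WC memory */
    v[2] = (sy + r->height) * sf[1];
    v[5] = v[2];

    /* better: the shared value never leaves the register */
    v[5] = v[2] = (sy + r->height) * sf[1];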

diff --git a/src/sna/gen5_render.c b/src/sna/gen5_render.c
index bcba0d8..bccd343 100644
--- a/src/sna/gen5_render.c
+++ b/src/sna/gen5_render.c
@@ -849,16 +849,14 @@ gen5_emit_composite_primitive_identity_source(struct sna *sna,
 	dst.p.y = r->dst.y + r->height;
 	v[0] = dst.f;
 	v[1] = (sx + r->width) * sf[0];
-	v[2] = (sy + r->height) * sf[1];
+	v[5] = v[2] = (sy + r->height) * sf[1];
 
 	dst.p.x = r->dst.x;
 	v[3] = dst.f;
-	v[4] = sx * sf[0];
-	v[5] = v[2];
+	v[7] = v[4] = sx * sf[0];
 
 	dst.p.y = r->dst.y;
 	v[6] = dst.f;
-	v[7] = v[4];
 	v[8] = sy * sf[1];
 }
 
commit 73761b9e74638c8e8ab97e448bc53fdabceac30f
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Mar 1 14:52:39 2012 +0000

    sna: Split storage of inactive partials
    
    As we now attempt to retain partial buffers after execution, we can
    end up with lots of inactive buffers sitting on the partial buffer list.
    In any one batch, we wish to minimise the number of buffers used, so we
    keep all the inactive buffers on a separate list and only pull from them
    as required.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
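
A condensed sketch of the new life cycle, using the list helpers from the
diffs below; buffers now migrate between the two lists instead of lingering
on one:

    /* on retire: an idle, unreferenced partial becomes inactive */
    list_move_tail(&bo->base.list, &kgem->inactive_partials);

    /* on allocation: recycle the largest-fitting inactive partial */
    list_for_each_entry_reverse(bo, &kgem->inactive_partials, base.list) {
        if (size <= bytes(&bo->base)) {
            bo->used = size;
            list_move(&bo->base.list, &kgem->active_partials);
            break;
        }
    }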

diff --git a/src/intel_list.h b/src/intel_list.h
index 8e8c612..366b9e8 100644
--- a/src/intel_list.h
+++ b/src/intel_list.h
@@ -214,6 +214,8 @@ __list_del(struct list *prev, struct list *next)
 static inline void
 _list_del(struct list *entry)
 {
+    assert(entry->prev->next == entry);
+    assert(entry->next->prev == entry);
     __list_del(entry->prev, entry->next);
 }
 
@@ -327,6 +329,11 @@ list_is_empty(struct list *head)
 	 &pos->member != (head);					\
 	 pos = __container_of(pos->member.next, pos, member))
 
+#define list_for_each_entry_reverse(pos, head, member)				\
+    for (pos = __container_of((head)->prev, pos, member);		\
+	 &pos->member != (head);					\
+	 pos = __container_of(pos->member.prev, pos, member))
+
 /**
  * Loop through the list, keeping a backup pointer to the element. This
  * macro allows for the deletion of a list element while looping through the
@@ -353,6 +360,8 @@ list_add_tail(struct list *entry, struct list *head)
 static inline void
 _list_del(struct list *entry)
 {
+    assert(entry->prev->next == entry);
+    assert(entry->next->prev == entry);
     __list_del(entry->prev, entry->next);
 }
 
@@ -382,6 +391,11 @@ static inline void list_move_tail(struct list *list, struct list *head)
 #define list_last_entry(ptr, type, member) \
     list_entry((ptr)->prev, type, member)
 
+#define list_for_each_entry_reverse(pos, head, member)				\
+    for (pos = __container_of((head)->prev, pos, member);		\
+	 &pos->member != (head);					\
+	 pos = __container_of(pos->member.prev, pos, member))
+
 #endif
 
 #undef container_of
diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 4fcdf37..e4ff6a7 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -119,8 +119,10 @@ static bool validate_partials(struct kgem *kgem)
 {
 	struct kgem_partial_bo *bo, *next;
 
-	list_for_each_entry_safe(bo, next, &kgem->partial, base.list) {
-		if (bo->base.list.next == &kgem->partial)
+	list_for_each_entry_safe(bo, next, &kgem->active_partials, base.list) {
+		assert(next->base.list.prev == &bo->base.list);
+		assert(bo->base.io);
+		if (bo->base.list.next == &kgem->active_partials)
 			return true;
 		if (bytes(&bo->base) - bo->used < bytes(&next->base) - next->used) {
 			ErrorF("this rem: %d, next rem: %d\n",
@@ -132,7 +134,7 @@ static bool validate_partials(struct kgem *kgem)
 	return true;
 
 err:
-	list_for_each_entry(bo, &kgem->partial, base.list)
+	list_for_each_entry(bo, &kgem->active_partials, base.list)
 		ErrorF("bo: used=%d / %d, rem=%d\n",
 		       bo->used, bytes(&bo->base), bytes(&bo->base) - bo->used);
 	return false;
@@ -573,7 +575,8 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, int gen)
 
 	kgem->half_cpu_cache_pages = cpu_cache_size() >> 13;
 
-	list_init(&kgem->partial);
+	list_init(&kgem->active_partials);
+	list_init(&kgem->inactive_partials);
 	list_init(&kgem->requests);
 	list_init(&kgem->flushing);
 	list_init(&kgem->sync_list);
@@ -1173,11 +1176,11 @@ static void kgem_bo_unref(struct kgem *kgem, struct kgem_bo *bo)
 		__kgem_bo_destroy(kgem, bo);
 }
 
-static void bubble_sort_partial(struct kgem *kgem, struct kgem_partial_bo *bo)
+static void bubble_sort_partial(struct list *head, struct kgem_partial_bo *bo)
 {
 	int remain = bytes(&bo->base) - bo->used;
 
-	while (bo->base.list.prev != &kgem->partial) {
+	while (bo->base.list.prev != head) {
 		struct kgem_partial_bo *p;
 
 		p = list_entry(bo->base.list.prev,
@@ -1204,21 +1207,25 @@ static void kgem_retire_partials(struct kgem *kgem)
 {
 	struct kgem_partial_bo *bo, *next;
 
-	list_for_each_entry_safe(bo, next, &kgem->partial, base.list) {
-		if (bo->used == 0 || !bo->mmapped)
-			continue;
+	list_for_each_entry_safe(bo, next, &kgem->active_partials, base.list) {
+		assert(next->base.list.prev == &bo->base.list);
+		assert(bo->base.io);
+
 		if (bo->base.refcnt != 1 || bo->base.rq)
 			continue;
 
 		DBG(("%s: handle=%d, used %d/%d\n", __FUNCTION__,
 		     bo->base.handle, bo->used, bytes(&bo->base)));
 
+		assert(bo->base.refcnt == 1);
+		assert(bo->mmapped);
 		assert(kgem->has_llc || !IS_CPU_MAP(bo->base.map));
 		bo->base.dirty = false;
 		bo->base.needs_flush = false;
 		bo->used = 0;
 
-		bubble_sort_partial(kgem, bo);
+		list_move_tail(&bo->base.list, &kgem->inactive_partials);
+		bubble_sort_partial(&kgem->inactive_partials, bo);
 	}
 }
 
@@ -1343,6 +1350,8 @@ static void kgem_commit(struct kgem *kgem)
 	struct kgem_bo *bo, *next;
 
 	list_for_each_entry_safe(bo, next, &rq->buffers, request) {
+		assert(next->request.prev == &bo->request);
+
 		DBG(("%s: release handle=%d (proxy? %d), dirty? %d flush? %d -> offset=%x\n",
 		     __FUNCTION__, bo->handle, bo->proxy != NULL,
 		     bo->dirty, bo->needs_flush, (unsigned)bo->exec->offset));
@@ -1411,15 +1420,18 @@ static void kgem_finish_partials(struct kgem *kgem)
 {
 	struct kgem_partial_bo *bo, *next;
 
-	list_for_each_entry_safe(bo, next, &kgem->partial, base.list) {
+	list_for_each_entry_safe(bo, next, &kgem->active_partials, base.list) {
+		assert(next->base.list.prev == &bo->base.list);
+		assert(bo->base.io);
+
+		if (!bo->base.exec)
+			continue;
+
 		if (!bo->write) {
 			assert(bo->base.exec || bo->base.refcnt > 1);
 			goto decouple;
 		}
 
-		if (!bo->base.exec)
-			continue;
-
 		if (bo->mmapped) {
 			assert(!bo->need_io);
 			if (kgem->has_llc || !IS_CPU_MAP(bo->base.map)) {
@@ -1439,9 +1451,9 @@ static void kgem_finish_partials(struct kgem *kgem)
 			goto decouple;
 		}
 
+		assert(bo->need_io);
 		assert(bo->base.rq == kgem->next_request);
 		assert(bo->base.domain != DOMAIN_GPU);
-		assert(bo->need_io);
 
 		if (bo->base.refcnt == 1 && bo->used < bytes(&bo->base) / 2) {
 			struct kgem_bo *shrink;
@@ -1485,8 +1497,7 @@ static void kgem_finish_partials(struct kgem *kgem)
 				bo->base.needs_flush = false;
 				bo->used = 0;
 
-				bubble_sort_partial(kgem, bo);
-				continue;
+				goto decouple;
 			}
 		}
 
@@ -1508,16 +1519,6 @@ decouple:
 
 static void kgem_cleanup(struct kgem *kgem)
 {
-	while (!list_is_empty(&kgem->partial)) {
-		struct kgem_bo *bo;
-
-		bo = list_first_entry(&kgem->partial,
-				      struct kgem_bo,
-				      list);
-		list_del(&bo->list);
-		kgem_bo_unref(kgem, bo);
-	}
-
 	while (!list_is_empty(&kgem->requests)) {
 		struct kgem_request *rq;
 
@@ -1844,14 +1845,17 @@ void kgem_throttle(struct kgem *kgem)
 
 static void kgem_expire_partial(struct kgem *kgem)
 {
-	struct kgem_partial_bo *bo, *next;
-
-	list_for_each_entry_safe(bo, next, &kgem->partial, base.list) {
-		if (bo->base.refcnt > 1 || bo->base.rq)
-			continue;
-
-		DBG(("%s: discarding unused partial buffer: %d/%d, write? %d\n",
-		     __FUNCTION__, bo->used, bytes(&bo->base), bo->write));
+	while (!list_is_empty(&kgem->inactive_partials)) {
+		struct kgem_partial_bo *bo =
+			list_first_entry(&kgem->inactive_partials,
+					 struct kgem_partial_bo,
+					 base.list);
+
+		DBG(("%s: discarding unused partial buffer: %d, last write? %d\n",
+		     __FUNCTION__, bytes(&bo->base), bo->write));
+		assert(bo->base.list.prev == &kgem->inactive_partials);
+		assert(bo->base.io);
+		assert(bo->base.refcnt == 1);
 		list_del(&bo->base.list);
 		kgem_bo_unref(kgem, &bo->base);
 	}
@@ -2785,7 +2789,7 @@ static void _kgem_bo_delete_partial(struct kgem *kgem, struct kgem_bo *bo)
 
 	if (bo->delta + bo->size.bytes == io->used) {
 		io->used = bo->delta;
-		bubble_sort_partial(kgem, io);
+		bubble_sort_partial(&kgem->active_partials, io);
 	}
 }
 
@@ -3210,10 +3214,16 @@ struct kgem_bo *kgem_create_map(struct kgem *kgem,
 	return bo;
 }
 #else
+static uint32_t gem_vmap(int fd, void *ptr, int size, int read_only)
+{
+	assert(0);
+	return 0;
+}
 struct kgem_bo *kgem_create_map(struct kgem *kgem,
 				void *ptr, uint32_t size,
 				bool read_only)
 {
+	assert(0);
 	return 0;
 }
 #endif
@@ -3243,6 +3253,7 @@ void kgem_bo_sync__cpu(struct kgem *kgem, struct kgem_bo *bo)
 void kgem_bo_set_sync(struct kgem *kgem, struct kgem_bo *bo)
 {
 	assert(!bo->reusable);
+	assert(list_is_empty(&bo->list));
 	list_add(&bo->list, &kgem->sync_list);
 	bo->sync = true;
 }
@@ -3342,17 +3353,16 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 	     !!(flags & KGEM_BUFFER_LAST)));
 	assert(size);
 	/* we should never be asked to create anything TOO large */
-	assert(size <= kgem->max_cpu_size);
+	assert(size <= kgem->max_object_size);
 
 	if (kgem->has_llc)
 		flags &= ~KGEM_BUFFER_WRITE_INPLACE;
 
-	list_for_each_entry(bo, &kgem->partial, base.list) {
+	list_for_each_entry(bo, &kgem->active_partials, base.list) {
 		/* We can reuse any write buffer which we can fit */
 		if (flags == KGEM_BUFFER_LAST &&
 		    bo->write == KGEM_BUFFER_WRITE &&
-		    bo->base.exec && !bo->mmapped &&
-		    size <= bytes(&bo->base)) {
+		    !bo->mmapped && size <= bytes(&bo->base)) {
 			assert(bo->base.refcnt == 1);
 			DBG(("%s: reusing write buffer for read of %d bytes? used=%d, total=%d\n",
 			     __FUNCTION__, size, bo->used, bytes(&bo->base)));
@@ -3362,7 +3372,7 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 			bo->write = 0;
 			offset = 0;
 			bo->used = size;
-			bubble_sort_partial(kgem, bo);
+			bubble_sort_partial(&kgem->active_partials, bo);
 			goto done;
 		}
 
@@ -3395,6 +3405,27 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 		break;
 	}
 
+	if (flags & KGEM_BUFFER_WRITE) {
+		list_for_each_entry_reverse(bo, &kgem->inactive_partials, base.list) {
+			if (size > bytes(&bo->base))
+				continue;
+
+			if (((bo->write & ~flags) & KGEM_BUFFER_INPLACE) &&
+			    !bo->base.vmap) {
+				DBG(("%s: skip write %x buffer, need %x\n",
+				     __FUNCTION__, bo->write, flags));
+				continue;
+			}
+
+			DBG(("%s: reusing inactive partial buffer? size=%d, total=%d\n",
+			     __FUNCTION__, size, bytes(&bo->base)));
+			offset = 0;
+			bo->used = size;
+			list_move(&bo->base.list, &kgem->active_partials);
+			goto done;
+		}
+	}
+
 #if !DBG_NO_MAP_UPLOAD
 	/* Be a little more generous and hope to hold fewer mmappings */
 	alloc = ALIGN(2*size, kgem->partial_buffer_size);
@@ -3656,7 +3687,8 @@ init:
 	bo->write = flags & KGEM_BUFFER_WRITE_INPLACE;
 	offset = 0;
 
-	list_add(&bo->base.list, &kgem->partial);
+	assert(list_is_empty(&bo->base.list));
+	list_add(&bo->base.list, &kgem->active_partials);
 	DBG(("%s(pages=%d) new handle=%d\n",
 	     __FUNCTION__, alloc, bo->base.handle));
 
@@ -3669,7 +3701,7 @@ done:
 		first = p = list_first_entry(&bo->base.list,
 					     struct kgem_partial_bo,
 					     base.list);
-		while (&p->base.list != &kgem->partial &&
+		while (&p->base.list != &kgem->active_partials &&
 		       alloc < bytes(&p->base) - p->used) {
 			DBG(("%s: this=%d, right=%d\n",
 			     __FUNCTION__, alloc, bytes(&p->base) -p->used));
@@ -3720,7 +3752,7 @@ struct kgem_bo *kgem_create_buffer_2d(struct kgem *kgem,
 		if (io->used)
 			io->used -= stride;
 		bo->size.bytes -= stride;
-		bubble_sort_partial(kgem, io);
+		bubble_sort_partial(&kgem->active_partials, io);
 	}
 
 	bo->pitch = stride;
diff --git a/src/sna/kgem.h b/src/sna/kgem.h
index 58316dc..446ac68 100644
--- a/src/sna/kgem.h
+++ b/src/sna/kgem.h
@@ -125,7 +125,7 @@ struct kgem {
 	struct list large;
 	struct list active[NUM_CACHE_BUCKETS][3];
 	struct list inactive[NUM_CACHE_BUCKETS];
-	struct list partial;
+	struct list active_partials, inactive_partials;
 	struct list requests;
 	struct list sync_list;
 	struct kgem_request *next_request;
commit 8d0d2007aa5dcc8790cc3a7557c1dff6fe374cbb
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Feb 28 19:15:34 2012 +0000

    sna: Revamp vmap support
    
    Dust off the kernel patches and update to reflect the changes made to
    support LLC CPU bo, in particular to support the unsynchronized shadow
    buffers.
    
    However, due to the forced synchronisation required for strict client
    coherency, we prefer not to use the vmap for shared pixmaps unless we are
    already busy (i.e. sync afterwards rather than before in the hope that
    we can squash a few operations into one). Being able to block the reply
    to the client until the request is actually complete and so avoid the
    sync remains a dream.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
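
For context, the mapping pointer now carries a tag in its low bits
(mappings are at least page-aligned, so the bits are spare). A decoding
sketch based on the macros added below:

    /* bit 0 set: CPU mmap; bits 0+1 set: malloc'ed vmap shadow;
     * no bits set: GTT mmap. MAP() strips the tag. */
    if (IS_VMAP_MAP(bo->map))
        free(MAP(bo->map));              /* vmap memory came from malloc */
    else if (bo->map)
        munmap(MAP(bo->map), bytes(bo)); /* a real kernel mapping */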

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 3ba2ec3..4fcdf37 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -76,8 +76,23 @@ search_linear_cache(struct kgem *kgem, unsigned int num_pages, unsigned flags);
 #define MAX_CPU_VMA_CACHE INT16_MAX
 #define MAP_PRESERVE_TIME 10
 
-#define CPU_MAP(ptr) ((void*)((uintptr_t)(ptr) & ~1))
+#define MAP(ptr) ((void*)((uintptr_t)(ptr) & ~3))
 #define MAKE_CPU_MAP(ptr) ((void*)((uintptr_t)(ptr) | 1))
+#define MAKE_VMAP_MAP(ptr) ((void*)((uintptr_t)(ptr) | 3))
+#define IS_VMAP_MAP(ptr) ((uintptr_t)(ptr) & 2)
+
+#if defined(USE_VMAP) && !defined(I915_PARAM_HAS_VMAP)
+#define DRM_I915_GEM_VMAP       0x2c
+#define DRM_IOCTL_I915_GEM_VMAP DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_GEM_VMAP, struct drm_i915_gem_vmap)
+#define I915_PARAM_HAS_VMAP              18
+struct drm_i915_gem_vmap {
+	uint64_t user_ptr;
+	uint32_t user_size;
+	uint32_t flags;
+#define I915_VMAP_READ_ONLY 0x1
+	uint32_t handle;
+};
+#endif
 
 struct kgem_partial_bo {
 	struct kgem_bo base;
@@ -561,6 +576,7 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, int gen)
 	list_init(&kgem->partial);
 	list_init(&kgem->requests);
 	list_init(&kgem->flushing);
+	list_init(&kgem->sync_list);
 	list_init(&kgem->large);
 	for (i = 0; i < ARRAY_SIZE(kgem->inactive); i++)
 		list_init(&kgem->inactive[i]);
@@ -577,7 +593,7 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, int gen)
 
 	kgem->next_request = __kgem_request_alloc();
 
-#if defined(USE_VMAP) && defined(I915_PARAM_HAS_VMAP)
+#if defined(USE_VMAP)
 	if (!DBG_NO_VMAP)
 		kgem->has_vmap = gem_param(kgem, I915_PARAM_HAS_VMAP) > 0;
 #endif
@@ -605,9 +621,8 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, int gen)
 		}
 		kgem->has_llc = has_llc;
 	}
-	kgem->has_cpu_bo = kgem->has_llc;
-	DBG(("%s: cpu bo enabled %d: llc? %d\n", __FUNCTION__,
-	     kgem->has_cpu_bo, kgem->has_llc));
+	DBG(("%s: cpu bo enabled %d: llc? %d, vmap? %d\n", __FUNCTION__,
+	     kgem->has_llc | kgem->has_vmap, kgem->has_llc, kgem->has_vmap));
 
 	kgem->has_semaphores = false;
 	if (gen >= 60 && semaphores_enabled())
@@ -688,10 +703,13 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, int gen)
 		kgem->max_upload_tile_size = half_gpu_max;
 
 	kgem->large_object_size = MAX_CACHE_SIZE;
-	if (kgem->large_object_size > kgem->max_cpu_size)
-		kgem->large_object_size = kgem->max_cpu_size;
 	if (kgem->large_object_size > kgem->max_gpu_size)
 		kgem->large_object_size = kgem->max_gpu_size;
+	if (kgem->has_llc | kgem->has_vmap) {
+		if (kgem->large_object_size > kgem->max_cpu_size)
+			kgem->large_object_size = kgem->max_cpu_size;
+	} else
+		kgem->max_cpu_size = 0;
 
 	DBG(("%s: large object thresold=%d\n",
 	     __FUNCTION__, kgem->large_object_size));
@@ -887,8 +905,10 @@ void _kgem_add_bo(struct kgem *kgem, struct kgem_bo *bo)
 
 	/* XXX is it worth working around gcc here? */
 	kgem->flush |= bo->flush;
-	kgem->sync |= bo->sync;
 	kgem->scanout |= bo->scanout;
+
+	if (bo->sync)
+		kgem->sync = kgem->next_request;
 }
 
 static uint32_t kgem_end_batch(struct kgem *kgem)
@@ -932,12 +952,14 @@ static void kgem_bo_release_map(struct kgem *kgem, struct kgem_bo *bo)
 {
 	int type = IS_CPU_MAP(bo->map);
 
+	assert(!IS_VMAP_MAP(bo->map));
+
 	DBG(("%s: releasing %s vma for handle=%d, count=%d\n",
 	     __FUNCTION__, type ? "CPU" : "GTT",
 	     bo->handle, kgem->vma[type].count));
 
-	VG(if (type) VALGRIND_FREELIKE_BLOCK(CPU_MAP(bo->map), 0));
-	munmap(CPU_MAP(bo->map), bytes(bo));
+	VG(if (type) VALGRIND_FREELIKE_BLOCK(MAP(bo->map), 0));
+	munmap(MAP(bo->map), bytes(bo));
 	bo->map = NULL;
 
 	if (!list_is_empty(&bo->vma)) {
@@ -951,9 +973,15 @@ static void kgem_bo_free(struct kgem *kgem, struct kgem_bo *bo)
 	DBG(("%s: handle=%d\n", __FUNCTION__, bo->handle));
 	assert(bo->refcnt == 0);
 	assert(bo->exec == NULL);
+	assert(!bo->vmap || bo->rq == NULL);
 
 	kgem_bo_binding_free(kgem, bo);
 
+	if (IS_VMAP_MAP(bo->map)) {
+		assert(bo->rq == NULL);
+		free(MAP(bo->map));
+		bo->map = NULL;
+	}
 	if (bo->map)
 		kgem_bo_release_map(kgem, bo);
 	assert(list_is_empty(&bo->vma));
@@ -978,6 +1006,7 @@ inline static void kgem_bo_move_to_inactive(struct kgem *kgem,
 	assert(!bo->needs_flush);
 	assert(bo->rq == NULL);
 	assert(bo->domain != DOMAIN_GPU);
+	assert(bo->reusable);
 
 	if (bucket(bo) >= NUM_CACHE_BUCKETS) {
 		kgem_bo_free(kgem, bo);
@@ -990,7 +1019,7 @@ inline static void kgem_bo_move_to_inactive(struct kgem *kgem,
 		if (bucket(bo) >= NUM_CACHE_BUCKETS ||
 		    (!type && !kgem_bo_is_mappable(kgem, bo))) {
 			list_del(&bo->vma);
-			munmap(CPU_MAP(bo->map), bytes(bo));
+			munmap(MAP(bo->map), bytes(bo));
 			bo->map = NULL;
 		}
 		if (bo->map) {
@@ -1035,6 +1064,19 @@ static void __kgem_bo_destroy(struct kgem *kgem, struct kgem_bo *bo)
 	if (NO_CACHE)
 		goto destroy;
 
+	if (bo->vmap) {
+		if (bo->rq == NULL) {
+			if (bo->needs_flush && kgem_busy(kgem, bo->handle)) {
+				list_add(&bo->request, &kgem->flushing);
+				bo->rq = &_kgem_static_request;
+			} else
+				kgem_bo_free(kgem, bo);
+		} else {
+			assert(!bo->sync);
+		}
+		return;
+	}
+
 	if (bo->io) {
 		struct kgem_bo *base;
 
@@ -1065,7 +1107,7 @@ static void __kgem_bo_destroy(struct kgem *kgem, struct kgem_bo *bo)
 
 	assert(list_is_empty(&bo->vma));
 	assert(list_is_empty(&bo->list));
-	assert(bo->vmap == false && bo->sync == false);
+	assert(bo->vmap == false);
 	assert(bo->io == false);
 	assert(bo->scanout == false);
 	assert(bo->flush == false);
@@ -1197,7 +1239,7 @@ bool kgem_retire(struct kgem *kgem)
 
 		DBG(("%s: moving %d from flush to inactive\n",
 		     __FUNCTION__, bo->handle));
-		if (kgem_bo_set_purgeable(kgem, bo)) {
+		if (bo->reusable && kgem_bo_set_purgeable(kgem, bo)) {
 			bo->needs_flush = false;
 			bo->domain = DOMAIN_NONE;
 			bo->rq = NULL;
@@ -1278,6 +1320,9 @@ bool kgem_retire(struct kgem *kgem)
 			kgem_bo_free(kgem, rq->bo);
 		}
 
+		if (kgem->sync == rq)
+			kgem->sync = NULL;
+
 		_list_del(&rq->list);
 		free(rq);
 	}
@@ -1308,7 +1353,7 @@ static void kgem_commit(struct kgem *kgem)
 		bo->presumed_offset = bo->exec->offset;
 		bo->exec = NULL;
 
-		if (!bo->refcnt && !bo->reusable) {
+		if (!bo->refcnt && !bo->reusable && !bo->vmap) {
 			kgem_bo_free(kgem, bo);
 			continue;
 		}
@@ -1708,8 +1753,10 @@ void _kgem_submit(struct kgem *kgem)
 #if !NDEBUG
 			if (ret < 0) {
 				int i;
-				ErrorF("batch (end=%d, size=%d) submit failed: %d\n",
-				       batch_end, size, errno);
+
+				ErrorF("batch[%d/%d]: %d %d %d, nreloc=%d, nexec=%d, nfence=%d, aperture=%d: errno=%d\n",
+				       kgem->mode, kgem->ring, batch_end, kgem->nbatch, kgem->surface,
+				       kgem->nreloc, kgem->nexec, kgem->nfence, kgem->aperture, errno);
 
 				i = open("/tmp/batchbuffer", O_WRONLY | O_CREAT | O_APPEND, 0666);
 				if (i != -1) {
@@ -1746,8 +1793,7 @@ void _kgem_submit(struct kgem *kgem)
 					       kgem->reloc[i].write_domain,
 					       (int)kgem->reloc[i].presumed_offset);
 				}
-				FatalError("SNA: failed to submit batchbuffer: ret=%d\n",
-					   errno);
+				FatalError("SNA: failed to submit batchbuffer\n");
 			}
 #endif
 
@@ -2594,6 +2640,7 @@ search_inactive:
 	cache = &kgem->inactive[bucket];
 	list_for_each_entry_safe(bo, next, cache, list) {
 		assert(bucket(bo) == bucket);
+		assert(bo->reusable);
 
 		if (size > num_pages(bo)) {
 			DBG(("inactive too small: %d < %d\n",
@@ -2673,6 +2720,59 @@ create:
 	return bo;
 }
 
+struct kgem_bo *kgem_create_cpu_2d(struct kgem *kgem,
+				   int width,
+				   int height,
+				   int bpp,
+				   uint32_t flags)
+{
+	struct kgem_bo *bo;
+
+	DBG(("%s(%dx%d, bpp=%d)\n", __FUNCTION__, width, height, bpp));
+
+	if (kgem->has_llc) {
+		bo = kgem_create_2d(kgem, width, height, bpp,
+				    I915_TILING_NONE, flags);
+		if (bo == NULL)
+			return bo;
+
+		if (kgem_bo_map__cpu(kgem, bo) == NULL) {
+			_kgem_bo_destroy(kgem, bo);
+			return NULL;
+		}
+
+		return bo;
+	}
+
+	if (kgem->has_vmap) {
+		int stride, size;
+		void *ptr;
+
+		stride = ALIGN(width, 2) * bpp >> 3;
+		stride = ALIGN(stride, 4);
+		size = ALIGN(height, 2) * stride;
+
+		assert(size >= PAGE_SIZE);
+
+		/* XXX */
+		//if (posix_memalign(&ptr, 64, ALIGN(size, 64)))
+		if (posix_memalign(&ptr, PAGE_SIZE, ALIGN(size, PAGE_SIZE)))
+			return NULL;
+
+		bo = kgem_create_map(kgem, ptr, size, false);
+		if (bo == NULL) {
+			free(ptr);
+			return NULL;
+		}
+
+		bo->map = MAKE_VMAP_MAP(ptr);
+		bo->pitch = stride;
+		return bo;
+	}
+
+	return NULL;
+}
+
 static void _kgem_bo_delete_partial(struct kgem *kgem, struct kgem_bo *bo)
 {
 	struct kgem_partial_bo *io = (struct kgem_partial_bo *)bo->proxy;
@@ -2702,9 +2802,6 @@ void _kgem_bo_destroy(struct kgem *kgem, struct kgem_bo *bo)
 		return;
 	}
 
-	if (bo->vmap)
-		kgem_bo_sync__cpu(kgem, bo);
-
 	__kgem_bo_destroy(kgem, bo);
 }
 
@@ -2915,8 +3012,8 @@ static void kgem_trim_vma_cache(struct kgem *kgem, int type, int bucket)
 		assert(bo->map);
 		assert(bo->rq == NULL);
 
-		VG(if (type) VALGRIND_FREELIKE_BLOCK(CPU_MAP(bo->map), 0));
-		munmap(CPU_MAP(bo->map), bytes(bo));
+		VG(if (type) VALGRIND_FREELIKE_BLOCK(MAP(bo->map), 0));
+		munmap(MAP(bo->map), bytes(bo));
 		bo->map = NULL;
 		list_del(&bo->vma);
 		kgem->vma[type].count--;
@@ -2996,7 +3093,7 @@ void *kgem_bo_map(struct kgem *kgem, struct kgem_bo *bo)
 void *kgem_bo_map__debug(struct kgem *kgem, struct kgem_bo *bo)
 {
 	if (bo->map)
-		return CPU_MAP(bo->map);
+		return MAP(bo->map);
 
 	kgem_trim_vma_cache(kgem, MAP_GTT, bucket(bo));
 	return bo->map = gem_mmap(kgem->fd, bo->handle, bytes(bo),
@@ -3012,7 +3109,7 @@ void *kgem_bo_map__cpu(struct kgem *kgem, struct kgem_bo *bo)
 	assert(list_is_empty(&bo->list));
 
 	if (IS_CPU_MAP(bo->map))
-		return CPU_MAP(bo->map);
+		return MAP(bo->map);
 
 	if (bo->map)
 		kgem_bo_release_map(kgem, bo);
@@ -3067,7 +3164,7 @@ uint32_t kgem_bo_flink(struct kgem *kgem, struct kgem_bo *bo)
 	return flink.name;
 }
 
-#if defined(USE_VMAP) && defined(I915_PARAM_HAS_VMAP)
+#if defined(USE_VMAP)
 static uint32_t gem_vmap(int fd, void *ptr, int size, int read_only)
 {
 	struct drm_i915_gem_vmap vmap;
@@ -3095,14 +3192,11 @@ struct kgem_bo *kgem_create_map(struct kgem *kgem,
 	if (!kgem->has_vmap)
 		return NULL;
 
-	if (size >= MAX_CACHE_SIZE)
-		return NULL;
-
 	handle = gem_vmap(kgem->fd, ptr, size, read_only);
 	if (handle == 0)
 		return NULL;
 
-	bo = __kgem_bo_alloc(handle, size);
+	bo = __kgem_bo_alloc(handle, NUM_PAGES(size));
 	if (bo == NULL) {
 		gem_close(kgem->fd, handle);
 		return NULL;
@@ -3110,10 +3204,9 @@ struct kgem_bo *kgem_create_map(struct kgem *kgem,
 
 	bo->reusable = false;
 	bo->vmap = true;
-	bo->sync = true;
 
-	DBG(("%s(ptr=%p, size=%d, read_only=%d) => handle=%d\n",
-	     __FUNCTION__, ptr, size, read_only, handle));
+	DBG(("%s(ptr=%p, size=%d, pages=%d, read_only=%d) => handle=%d\n",
+	     __FUNCTION__, ptr, size, NUM_PAGES(size), read_only, handle));
 	return bo;
 }
 #else
@@ -3129,7 +3222,6 @@ void kgem_bo_sync__cpu(struct kgem *kgem, struct kgem_bo *bo)
 {
 	kgem_bo_submit(kgem, bo);
 
-	/* XXX assumes bo is snoopable */
 	if (bo->domain != DOMAIN_CPU) {
 		struct drm_i915_gem_set_domain set_domain;
 
@@ -3148,28 +3240,40 @@ void kgem_bo_sync__cpu(struct kgem *kgem, struct kgem_bo *bo)
 	}
 }
 
+void kgem_bo_set_sync(struct kgem *kgem, struct kgem_bo *bo)
+{
+	assert(!bo->reusable);
+	list_add(&bo->list, &kgem->sync_list);
+	bo->sync = true;
+}
+
 void kgem_sync(struct kgem *kgem)
 {
+	struct drm_i915_gem_set_domain set_domain;
+	struct kgem_request *rq;
+	struct kgem_bo *bo;
+
 	DBG(("%s\n", __FUNCTION__));
 
-	if (!list_is_empty(&kgem->requests)) {
-		struct drm_i915_gem_set_domain set_domain;
-		struct kgem_request *rq;
+	rq = kgem->sync;
+	if (rq == NULL)
+		return;
 
-		rq = list_first_entry(&kgem->requests,
-				      struct kgem_request,
-				      list);
+	if (rq == kgem->next_request)
+		_kgem_submit(kgem);
 
-		VG_CLEAR(set_domain);
-		set_domain.handle = rq->bo->handle;
-		set_domain.read_domains = I915_GEM_DOMAIN_GTT;
-		set_domain.write_domain = I915_GEM_DOMAIN_GTT;
+	VG_CLEAR(set_domain);
+	set_domain.handle = rq->bo->handle;
+	set_domain.read_domains = I915_GEM_DOMAIN_GTT;
+	set_domain.write_domain = I915_GEM_DOMAIN_GTT;
 
-		drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &set_domain);
-		kgem_retire(kgem);
-	}
+	drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &set_domain);
+	kgem_retire(kgem);
+
+	list_for_each_entry(bo, &kgem->sync_list, list)
+		kgem_bo_sync__cpu(kgem, bo);
 
-	kgem->sync = false;
+	assert (kgem->sync == NULL);
 }
 
 void kgem_clear_dirty(struct kgem *kgem)
@@ -3262,11 +3366,20 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 			goto done;
 		}
 
-		if ((bo->write & KGEM_BUFFER_WRITE) != (flags & KGEM_BUFFER_WRITE) ||
-		    (bo->write & ~flags) & KGEM_BUFFER_INPLACE) {
-			DBG(("%s: skip write %x buffer, need %x\n",
-			     __FUNCTION__, bo->write, flags));
-			continue;
+		if (flags & KGEM_BUFFER_WRITE) {
+			if ((bo->write & KGEM_BUFFER_WRITE) == 0 ||
+			    (((bo->write & ~flags) & KGEM_BUFFER_INPLACE) &&
+			     !bo->base.vmap)) {
+				DBG(("%s: skip write %x buffer, need %x\n",
+				     __FUNCTION__, bo->write, flags));
+				continue;
+			}
+		} else {
+			if (bo->write & KGEM_BUFFER_WRITE) {
+				DBG(("%s: skip write %x buffer, need %x\n",
+				     __FUNCTION__, bo->write, flags));
+				continue;
+			}
 		}
 
 		if (bo->used + size <= bytes(&bo->base)) {
@@ -3290,7 +3403,7 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 	if (alloc > MAX_CACHE_SIZE)
 		alloc = PAGE_ALIGN(size);
 	alloc /= PAGE_SIZE;
-	if (kgem->has_cpu_bo) {
+	if (kgem->has_llc) {
 		bo = malloc(sizeof(*bo));
 		if (bo == NULL)
 			return NULL;
@@ -3420,6 +3533,29 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 		alloc = PAGE_ALIGN(size) / PAGE_SIZE;
 	flags &= ~KGEM_BUFFER_INPLACE;
 
+	if (kgem->has_vmap) {
+		bo = partial_bo_alloc(alloc);
+		if (bo) {
+			if (!__kgem_bo_init(&bo->base,
+					    gem_vmap(kgem->fd, bo->mem,
+						     alloc * PAGE_SIZE, false),
+					    alloc)) {
+				free(bo);
+				return NULL;
+			}
+
+			DBG(("%s: created vmap handle=%d for buffer\n",
+			     __FUNCTION__, bo->base.handle));
+
+			bo->need_io = false;
+			bo->base.io = true;
+			bo->base.vmap = true;
+			bo->mmapped = true;
+
+			goto init;
+		}
+	}
+
 	old = NULL;
 	if ((flags & KGEM_BUFFER_WRITE) == 0)
 		old = search_linear_cache(kgem, alloc, 0);
@@ -3561,7 +3697,7 @@ struct kgem_bo *kgem_create_buffer_2d(struct kgem *kgem,
 	assert(width > 0 && height > 0);
 	assert(ret != NULL);
 	stride = ALIGN(width, 2) * bpp >> 3;
-	stride = ALIGN(stride, kgem->min_alignment);
+	stride = ALIGN(stride, 4);
 
 	DBG(("%s: %dx%d, %d bpp, stride=%d\n",
 	     __FUNCTION__, width, height, bpp, stride));
diff --git a/src/sna/kgem.h b/src/sna/kgem.h
index 30303ce..58316dc 100644
--- a/src/sna/kgem.h
+++ b/src/sna/kgem.h
@@ -127,7 +127,9 @@ struct kgem {
 	struct list inactive[NUM_CACHE_BUCKETS];
 	struct list partial;
 	struct list requests;
+	struct list sync_list;
 	struct kgem_request *next_request;
+	struct kgem_request *sync;
 
 	struct {
 		struct list inactive[NUM_CACHE_BUCKETS];
@@ -142,7 +144,6 @@ struct kgem {
 	uint16_t max_batch_size;
 
 	uint32_t flush:1;
-	uint32_t sync:1;
 	uint32_t need_expire:1;
 	uint32_t need_purge:1;
 	uint32_t need_retire:1;
@@ -154,7 +155,6 @@ struct kgem {
 	uint32_t has_relaxed_fencing :1;
 	uint32_t has_semaphores :1;
 	uint32_t has_llc :1;
-	uint32_t has_cpu_bo :1;
 
 	uint16_t fence_max;
 	uint16_t half_cpu_cache_pages;
@@ -229,6 +229,11 @@ struct kgem_bo *kgem_create_2d(struct kgem *kgem,
 			       int bpp,
 			       int tiling,
 			       uint32_t flags);
+struct kgem_bo *kgem_create_cpu_2d(struct kgem *kgem,
+				   int width,
+				   int height,
+				   int bpp,
+				   uint32_t flags);
 
 uint32_t kgem_bo_get_binding(struct kgem_bo *bo, uint32_t format);
 void kgem_bo_set_binding(struct kgem_bo *bo, uint32_t format, uint16_t offset);
@@ -359,6 +364,7 @@ void *kgem_bo_map(struct kgem *kgem, struct kgem_bo *bo);
 void *kgem_bo_map__debug(struct kgem *kgem, struct kgem_bo *bo);
 void *kgem_bo_map__cpu(struct kgem *kgem, struct kgem_bo *bo);
 void kgem_bo_sync__cpu(struct kgem *kgem, struct kgem_bo *bo);
+void kgem_bo_set_sync(struct kgem *kgem, struct kgem_bo *bo);
 uint32_t kgem_bo_flink(struct kgem *kgem, struct kgem_bo *bo);
 
 Bool kgem_bo_write(struct kgem *kgem, struct kgem_bo *bo,
diff --git a/src/sna/sna.h b/src/sna/sna.h
index 00fc80a..c772d7d 100644
--- a/src/sna/sna.h
+++ b/src/sna/sna.h
@@ -115,11 +115,6 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "sna_damage.h"
 #include "sna_render.h"
 
-#ifndef CREATE_PIXMAP_USAGE_SCRATCH_HEADER
-#define FAKE_CREATE_PIXMAP_USAGE_SCRATCH_HEADER 1
-#define CREATE_PIXMAP_USAGE_SCRATCH_HEADER (unsigned)-1
-#endif
-
 #define SNA_CURSOR_X			64
 #define SNA_CURSOR_Y			SNA_CURSOR_X
 
@@ -226,13 +221,14 @@ struct sna {
 #define SNA_NO_THROTTLE		0x1
 #define SNA_NO_DELAYED_FLUSH	0x2
 
+	unsigned flush;
+
 	int timer[NUM_TIMERS];
 	uint16_t timer_active;
 	uint16_t timer_ready;
 
 	int vblank_interval;
 
-	struct list deferred_free;
 	struct list dirty_pixmaps;
 	struct list active_pixmaps;
 	struct list inactive_clock[2];
diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 8fa59a6..52e75c7 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -44,6 +44,7 @@
 #include <fbpict.h>
 #endif
 #include <miline.h>
+#include <shmint.h>
 
 #include <sys/time.h>
 #include <sys/mman.h>
@@ -61,7 +62,8 @@
 #define USE_INPLACE 1
 #define USE_WIDE_SPANS 1 /* -1 force CPU, 1 force GPU */
 #define USE_ZERO_SPANS 1 /* -1 force CPU, 1 force GPU */
-#define USE_BO_FOR_SCRATCH_PIXMAP 1
+#define USE_SHM_VMAP 0
+#define PREFER_VMAP 0
 
 #define MIGRATE_ALL 0
 
@@ -302,27 +304,21 @@ sna_pixmap_alloc_cpu(struct sna *sna,
 	DBG(("%s: pixmap=%ld\n", __FUNCTION__, pixmap->drawable.serialNumber));
 	assert(priv->stride);
 
-	if ((sna->kgem.has_cpu_bo || (priv->create & KGEM_CAN_CREATE_GPU) == 0) &&
-	    (priv->create & KGEM_CAN_CREATE_CPU)) {
+	if (priv->create & KGEM_CAN_CREATE_CPU) {
 		DBG(("%s: allocating CPU buffer (%dx%d)\n", __FUNCTION__,
 		     pixmap->drawable.width, pixmap->drawable.height));
 
-		priv->cpu_bo = kgem_create_2d(&sna->kgem,
-					      pixmap->drawable.width,
-					      pixmap->drawable.height,
-					      pixmap->drawable.bitsPerPixel,
-					      I915_TILING_NONE,
-					      from_gpu ? 0 : CREATE_CPU_MAP | CREATE_INACTIVE);
+		priv->cpu_bo = kgem_create_cpu_2d(&sna->kgem,
+						  pixmap->drawable.width,
+						  pixmap->drawable.height,
+						  pixmap->drawable.bitsPerPixel,
+						  from_gpu ? 0 : CREATE_CPU_MAP | CREATE_INACTIVE);
 		DBG(("%s: allocated CPU handle=%d\n", __FUNCTION__,
 		     priv->cpu_bo->handle));
 
 		if (priv->cpu_bo) {
 			priv->ptr = kgem_bo_map__cpu(&sna->kgem, priv->cpu_bo);
-			if (priv->ptr == NULL) {
-				kgem_bo_destroy(&sna->kgem, priv->cpu_bo);
-				priv->cpu_bo = NULL;
-			} else
-				priv->stride = priv->cpu_bo->pitch;
+			priv->stride = priv->cpu_bo->pitch;
 		}
 	}
 
@@ -349,7 +345,8 @@ static void sna_pixmap_free_cpu(struct sna *sna, struct sna_pixmap *priv)
 	if (priv->cpu_bo) {
 		DBG(("%s: discarding CPU buffer, handle=%d, size=%d\n",
 		     __FUNCTION__, priv->cpu_bo->handle, kgem_bo_size(priv->cpu_bo)));
-
+		if (priv->cpu_bo->sync)
+			kgem_bo_sync__cpu(&sna->kgem, priv->cpu_bo);
 		kgem_bo_destroy(&sna->kgem, priv->cpu_bo);
 		priv->cpu_bo = NULL;
 	} else
@@ -377,14 +374,6 @@ static Bool sna_destroy_private(PixmapPtr pixmap, struct sna_pixmap *priv)
 	if (priv->ptr)
 		sna_pixmap_free_cpu(sna, priv);
 
-	if (priv->cpu_bo) {
-		if (priv->cpu_bo->vmap && kgem_bo_is_busy(priv->cpu_bo)) {
-			list_add_tail(&priv->list, &sna->deferred_free);
-			return false;
-		}
-		kgem_bo_destroy(&sna->kgem, priv->cpu_bo);
-	}
-
 	if (!sna->freed_pixmap && priv->header) {
 		sna->freed_pixmap = pixmap;
 		assert(priv->ptr == NULL);
@@ -524,7 +513,6 @@ _sna_pixmap_reset(PixmapPtr pixmap)
 
 	assert(pixmap->drawable.type == DRAWABLE_PIXMAP);
 	assert(pixmap->drawable.class == 0);
-	assert(pixmap->drawable.id == 0);
 	assert(pixmap->drawable.x == 0);
 	assert(pixmap->drawable.y == 0);
 
@@ -561,11 +549,6 @@ struct sna_pixmap *_sna_pixmap_attach(PixmapPtr pixmap)
 	     pixmap->usage_hint));
 
 	switch (pixmap->usage_hint) {
-	case CREATE_PIXMAP_USAGE_SCRATCH_HEADER:
-#if !FAKE_CREATE_PIXMAP_USAGE_SCRATCH_HEADER
-		if (sna->kgem.has_vmap)
-			break;
-#endif
 	case CREATE_PIXMAP_USAGE_GLYPH_PICTURE:
 		DBG(("%s: not attaching due to crazy usage: %d\n",
 		     __FUNCTION__, pixmap->usage_hint));
@@ -594,15 +577,6 @@ struct sna_pixmap *_sna_pixmap_attach(PixmapPtr pixmap)
 		       pixmap->drawable.width,
 		       pixmap->drawable.height);
 
-	if (pixmap->usage_hint == CREATE_PIXMAP_USAGE_SCRATCH_HEADER) {
-		priv->cpu_bo = kgem_create_map(&sna->kgem,
-					       pixmap->devPrivate.ptr,
-					       pixmap_size(pixmap),
-					       0);
-		if (priv->cpu_bo)
-			priv->cpu_bo->pitch = pixmap->devKind;
-	}
-
 	return priv;
 }
 
@@ -630,6 +604,75 @@ create_pixmap(struct sna *sna, ScreenPtr screen,
 }
 
 static PixmapPtr
+sna_pixmap_create_shm(ScreenPtr screen,
+		      int width, int height, int depth,
+		      char *addr)
+{
+	struct sna *sna = to_sna_from_screen(screen);
+	int bpp = BitsPerPixel(depth);
+	int pitch = PixmapBytePad(width, depth);
+	struct sna_pixmap *priv;
+	PixmapPtr pixmap;
+
+	DBG(("%s(%d, %d, %d)\n", __FUNCTION__,
+	     width, height, depth));
+
+	if (sna->freed_pixmap) {
+		pixmap = sna->freed_pixmap;
+		sna->freed_pixmap = NULL;
+
+		pixmap->usage_hint = -1;
+		pixmap->refcnt = 1;
+
+		pixmap->drawable.width = width;
+		pixmap->drawable.height = height;
+		pixmap->drawable.depth = depth;
+		pixmap->drawable.bitsPerPixel = bpp;
+		pixmap->drawable.serialNumber = NEXT_SERIAL_NUMBER;
+
+		DBG(("%s: serial=%ld, %dx%d\n",
+		     __FUNCTION__,
+		     pixmap->drawable.serialNumber,
+		     pixmap->drawable.width,
+		     pixmap->drawable.height));
+
+		priv = _sna_pixmap_reset(pixmap);
+	} else {
+		pixmap = create_pixmap(sna, screen, 0, 0, depth, -1);
+		if (pixmap == NullPixmap)
+			return NullPixmap;
+
+		pixmap->drawable.width = width;
+		pixmap->drawable.height = height;
+		pixmap->drawable.depth = depth;
+		pixmap->drawable.bitsPerPixel = bpp;
+
+		priv = __sna_pixmap_attach(sna, pixmap);
+		if (!priv) {
+			fbDestroyPixmap(pixmap);
+			return NullPixmap;
+		}
+	}
+
+	priv->cpu_bo = kgem_create_map(&sna->kgem, addr, pitch*height, false);
+	if (priv->cpu_bo == NULL) {
+		free(priv);
+		fbDestroyPixmap(pixmap);
+		return GetScratchPixmapHeader(screen, width, height, depth,
+					      bpp, pitch, addr);
+	}
+	kgem_bo_set_sync(&sna->kgem, priv->cpu_bo);
+	priv->cpu_bo->pitch = pitch;
+
+	priv->header = true;
+	sna_damage_all(&priv->cpu_damage, width, height);
+
+	pixmap->devKind = pitch;
+	pixmap->devPrivate.ptr = addr;
+	return pixmap;
+}
+
+static PixmapPtr
 sna_pixmap_create_scratch(ScreenPtr screen,
 			  int width, int height, int depth,
 			  uint32_t tiling)
@@ -723,6 +766,11 @@ static PixmapPtr sna_create_pixmap(ScreenPtr screen,
 	DBG(("%s(%d, %d, %d, usage=%x)\n", __FUNCTION__,
 	     width, height, depth, usage));
 
+	if ((width|height) == 0) {
+		usage = -1;
+		goto fallback;
+	}
+
 	if (!sna->have_render)
 		goto fallback;
 
@@ -733,11 +781,6 @@ static PixmapPtr sna_create_pixmap(ScreenPtr screen,
 		goto fallback;
 	}
 
-#if FAKE_CREATE_PIXMAP_USAGE_SCRATCH_HEADER
-	if (width == 0 || height == 0)
-		goto fallback;
-#endif
-
 	if (usage == CREATE_PIXMAP_USAGE_SCRATCH) {
 		if (flags & KGEM_CAN_CREATE_GPU)
 			return sna_pixmap_create_scratch(screen,
@@ -919,14 +962,16 @@ _sna_pixmap_move_to_cpu(PixmapPtr pixmap, unsigned int flags)
 			sna_damage_destroy(&priv->cpu_damage);
 			priv->undamaged = false;
 			list_del(&priv->list);
-			if (priv->cpu_bo)
+			if (priv->cpu_bo) {
+				assert(!priv->cpu_bo->sync);
 				sna_pixmap_free_cpu(sna, priv);
+			}
 
 			return true;
 		}
 
 skip_inplace_map:
-		if (priv->cpu_bo && kgem_bo_is_busy(priv->cpu_bo)) {
+		if (priv->cpu_bo && !priv->cpu_bo->sync && kgem_bo_is_busy(priv->cpu_bo)) {
 			if (priv->cpu_bo->exec == NULL)
 				kgem_retire(&sna->kgem);
 
@@ -941,6 +986,7 @@ skip_inplace_map:
 					list_del(&priv->list);
 					priv->undamaged = false;
 				}
+				assert(!priv->cpu_bo->sync);
 				sna_pixmap_free_cpu(sna, priv);
 			}
 		}
@@ -982,7 +1028,7 @@ skip_inplace_map:
 	}
 
 	if (priv->clear) {
-		if (priv->cpu_bo && kgem_bo_is_busy(priv->cpu_bo))
+		if (priv->cpu_bo && !priv->cpu_bo->sync && kgem_bo_is_busy(priv->cpu_bo))
 			sna_pixmap_free_cpu(sna, priv);
 		sna_damage_destroy(&priv->gpu_damage);
 		priv->undamaged = true;
@@ -1282,7 +1328,7 @@ sna_drawable_move_region_to_cpu(DrawablePtr drawable,
 			}
 		}
 
-		if (priv->cpu_bo && !priv->cpu_bo->vmap) {
+		if (priv->cpu_bo && !priv->cpu_bo->sync) {
 			if (sync_will_stall(priv->cpu_bo) && priv->cpu_bo->exec == NULL)
 				kgem_retire(&sna->kgem);
 			if (sync_will_stall(priv->cpu_bo)) {
@@ -1372,7 +1418,7 @@ sna_drawable_move_region_to_cpu(DrawablePtr drawable,
 			assert(pixmap_contains_damage(pixmap, priv->gpu_damage));
 
 			ok = FALSE;
-			if (priv->cpu_bo && sna->kgem.gen >= 60)
+			if (priv->cpu_bo && sna->kgem.gen >= 30)
 				ok = sna->render.copy_boxes(sna, GXcopy,
 							    pixmap, priv->gpu_bo, 0, 0,
 							    pixmap, priv->cpu_bo, 0, 0,
@@ -1910,6 +1956,9 @@ use_cpu_bo:
 	if (priv->cpu_bo == NULL)
 		return NULL;
 
+	if (priv->cpu_bo->sync && !kgem_bo_is_busy(priv->cpu_bo))
+		return NULL;
+
 	/* Continue to use the shadow pixmap once mapped */
 	if (pixmap->devPrivate.ptr) {
 		/* But only if we do not need to sync the CPU bo */
@@ -2084,6 +2133,7 @@ sna_pixmap_force_to_gpu(PixmapPtr pixmap, unsigned flags)
 		sna_damage_destroy(&priv->cpu_damage);
 		priv->undamaged = false;
 		list_del(&priv->list);
+		assert(!priv->cpu_bo->sync);
 		sna_pixmap_free_cpu(to_sna_from_pixmap(pixmap), priv);
 	}
 
@@ -2098,8 +2148,6 @@ sna_pixmap_move_to_gpu(PixmapPtr pixmap, unsigned flags)
 	BoxPtr box;
 	int n;
 
-	assert(pixmap->usage_hint != CREATE_PIXMAP_USAGE_SCRATCH_HEADER);
-
 	DBG(("%s(pixmap=%ld, usage=%d)\n",
 	     __FUNCTION__, pixmap->drawable.serialNumber, pixmap->usage_hint));
 
@@ -2209,8 +2257,10 @@ done:
 			      pixmap->drawable.height);
 	if (DAMAGE_IS_ALL(priv->gpu_damage)) {
 		priv->undamaged = false;
-		if (priv->ptr)
+		if (priv->ptr) {
+			assert(!priv->cpu_bo->sync);
 			sna_pixmap_free_cpu(sna, priv);
+		}
 	}
 active:
 	return sna_pixmap_mark_active(to_sna_from_pixmap(pixmap), priv);
@@ -2410,8 +2460,6 @@ sna_put_image_upload_blt(DrawablePtr drawable, GCPtr gc, RegionPtr region,
 	PixmapPtr pixmap = get_drawable_pixmap(drawable);
 	struct sna *sna = to_sna_from_pixmap(pixmap);
 	struct sna_pixmap *priv = sna_pixmap(pixmap);
-	struct kgem_bo *src_bo;
-	Bool ok = FALSE;
 	BoxPtr box;
 	int nbox;
 	int16_t dx, dy;
@@ -2423,14 +2471,16 @@ sna_put_image_upload_blt(DrawablePtr drawable, GCPtr gc, RegionPtr region,
 	     __FUNCTION__, nbox,
 	     box->x1, box->y1, box->x2, box->y2));
 
+	if (gc->alu != GXcopy)
+		return FALSE;
+
 	if (priv->gpu_bo == NULL &&
 	    !sna_pixmap_create_mappable_gpu(pixmap))
 		return FALSE;
 
 	assert(priv->gpu_bo);
 
-	if (gc->alu == GXcopy &&
-	    !priv->pinned && nbox == 1 &&
+	if (!priv->pinned && nbox == 1 &&
 	    box->x1 <= 0 && box->y1 <= 0 &&
 	    box->x2 >= pixmap->drawable.width &&
 	    box->y2 >= pixmap->drawable.height)
@@ -2440,25 +2490,10 @@ sna_put_image_upload_blt(DrawablePtr drawable, GCPtr gc, RegionPtr region,
 	x += dx + drawable->x;
 	y += dy + drawable->y;
 
-	src_bo = kgem_create_map(&sna->kgem, bits, stride*h, 1);
-	if (src_bo) {
-		src_bo->pitch = stride;
-		ok = sna->render.copy_boxes(sna, gc->alu,
-					    pixmap, src_bo, -x, -y,
-					    pixmap, priv->gpu_bo, 0, 0,
-					    box, nbox);
-		kgem_bo_destroy(&sna->kgem, src_bo);
-	}
-
-	if (!ok && gc->alu == GXcopy)
-		ok = sna_write_boxes(sna, pixmap,
-				     priv->gpu_bo, 0, 0,
-				     bits,
-				     stride,
-				     -x, -y,
-				     box, nbox);
-
-	return ok;
+	return sna_write_boxes(sna, pixmap,
+			       priv->gpu_bo, 0, 0,
+			       bits, stride, -x, -y,
+			       box, nbox);
 }
 
 static bool upload_inplace(struct sna *sna,
@@ -2561,7 +2596,7 @@ sna_put_zpixmap_blt(DrawablePtr drawable, GCPtr gc, RegionPtr region,
 		if (sync_will_stall(priv->cpu_bo) && priv->cpu_bo->exec == NULL)
 			kgem_retire(&sna->kgem);
 		if (sync_will_stall(priv->cpu_bo)) {
-			if (priv->cpu_bo->vmap) {
+			if (priv->cpu_bo->sync) {
 				if (sna_put_image_upload_blt(drawable, gc, region,
 							     x, y, w, h, bits, stride)) {
 					if (!DAMAGE_IS_ALL(priv->gpu_damage)) {
@@ -2603,6 +2638,7 @@ sna_put_zpixmap_blt(DrawablePtr drawable, GCPtr gc, RegionPtr region,
 					list_del(&priv->list);
 					priv->undamaged = false;
 				}
+				assert(!priv->cpu_bo->sync);
 				sna_pixmap_free_cpu(sna, priv);
 			}
 		}
@@ -3236,6 +3272,22 @@ static bool copy_use_gpu_bo(struct sna *sna,
 	return kgem_bo_is_busy(priv->cpu_bo);
 }
 
+static bool
+copy_use_cpu_bo(struct sna_pixmap *priv, struct kgem_bo *dst_bo)
+{
+	if (priv == NULL || priv->cpu_bo == NULL)
+		return false;
+
+	if (PREFER_VMAP) {
+		return true;
+	} else {
+		if (kgem_bo_is_busy(priv->cpu_bo) || kgem_bo_is_busy(dst_bo))
+			return true;
+
+		return !priv->cpu_bo->sync;
+	}
+}
+
 static void
 sna_copy_boxes(DrawablePtr src, DrawablePtr dst, GCPtr gc,
 	       BoxPtr box, int n,
@@ -3433,7 +3485,7 @@ sna_copy_boxes(DrawablePtr src, DrawablePtr dst, GCPtr gc,
 					RegionTranslate(&region, -dst_dx, -dst_dy);
 				}
 			}
-		} else if (src_priv && src_priv->cpu_bo) {
+		} else if (copy_use_cpu_bo(src_priv, dst_priv->gpu_bo)) {
 			if (!sna->render.copy_boxes(sna, alu,
 						    src_pixmap, src_priv->cpu_bo, src_dx, src_dy,
 						    dst_pixmap, dst_priv->gpu_bo, dst_dx, dst_dy,
@@ -11445,17 +11497,30 @@ static Bool sna_change_window_attributes(WindowPtr win, unsigned long mask)
 }
 
 static void
+sna_accel_reply_callback(CallbackListPtr *list,
+			 pointer user_data, pointer call_data)
+{
+	struct sna *sna = user_data;
+	ReplyInfoRec *info = call_data;
+
+	if (sna->flush || !info->startOfReply)
+		return;
+
+	sna->flush = sna->kgem.flush || sna->kgem.sync;
+}
+
+static void
 sna_accel_flush_callback(CallbackListPtr *list,
 			 pointer user_data, pointer call_data)
 {
 	struct sna *sna = user_data;
 	struct list preserve;
 
-	if ((sna->kgem.sync|sna->kgem.flush) == 0 &&
-	    list_is_empty(&sna->dirty_pixmaps))
+	if (!sna->flush)
 		return;
 
-	DBG(("%s\n", __FUNCTION__));
+	DBG(("%s: need_sync=%d, need_flush=%d, dirty? %d\n", __FUNCTION__,
+	     sna->kgem.sync!=NULL, sna->kgem.flush, !list_is_empty(&sna->dirty_pixmaps)));
 
 	/* flush any pending damage from shadow copies to tfp clients */
 	list_init(&preserve);
@@ -11476,35 +11541,9 @@ sna_accel_flush_callback(CallbackListPtr *list,
 	kgem_submit(&sna->kgem);
 	sna->kgem.flush_now = 0;
 
-	if (sna->kgem.sync) {
-		kgem_sync(&sna->kgem);
+	kgem_sync(&sna->kgem);
 
-		while (!list_is_empty(&sna->deferred_free)) {
-			struct sna_pixmap *priv =
-				list_first_entry(&sna->deferred_free,
-						 struct sna_pixmap,
-						 list);
-			list_del(&priv->list);
-			kgem_bo_destroy(&sna->kgem, priv->cpu_bo);
-			fbDestroyPixmap(priv->pixmap);
-			free(priv);
-		}
-	}
-}
-
-static void sna_deferred_free(struct sna *sna)
-{
-	struct sna_pixmap *priv, *next;
-
-	list_for_each_entry_safe(priv, next, &sna->deferred_free, list) {
-		if (kgem_bo_is_busy(priv->cpu_bo))
-			continue;
-
-		list_del(&priv->list);
-		kgem_bo_destroy(&sna->kgem, priv->cpu_bo);
-		fbDestroyPixmap(priv->pixmap);
-		free(priv);
-	}
+	sna->flush = false;
 }
 
 static struct sna_pixmap *sna_accel_scanout(struct sna *sna)
@@ -11768,6 +11807,7 @@ static void sna_accel_inactive(struct sna *sna)
 			sna_damage_destroy(&priv->cpu_damage);
 			list_del(&priv->list);
 
+			assert(!priv->cpu_bo->sync);
 			sna_pixmap_free_cpu(sna, priv);
 			priv->undamaged = false;
 
@@ -11819,10 +11859,14 @@ Bool sna_accel_pre_init(struct sna *sna)
 	return TRUE;
 }
 
+static ShmFuncs shm_funcs = { sna_pixmap_create_shm, NULL };
+
 Bool sna_accel_init(ScreenPtr screen, struct sna *sna)
 {
 	const char *backend;
 
+	if (!AddCallback(&ReplyCallback, sna_accel_reply_callback, sna))
+		return FALSE;
 	if (!AddCallback(&FlushCallback, sna_accel_flush_callback, sna))
 		return FALSE;
 
@@ -11830,7 +11874,6 @@ Bool sna_accel_init(ScreenPtr screen, struct sna *sna)
 	screen->RealizeFont = sna_realize_font;
 	screen->UnrealizeFont = sna_unrealize_font;
 
-	list_init(&sna->deferred_free);
 	list_init(&sna->dirty_pixmaps);
 	list_init(&sna->active_pixmaps);
 	list_init(&sna->inactive_clock[0]);
@@ -11866,6 +11909,9 @@ Bool sna_accel_init(ScreenPtr screen, struct sna *sna)
 	}
 #endif
 
+	if (USE_SHM_VMAP && sna->kgem.has_vmap)
+		ShmRegisterFuncs(screen, &shm_funcs);
+
 	backend = "no";
 	sna->have_render = false;
 	sna->default_tiling = I915_TILING_X;
@@ -11933,6 +11979,7 @@ void sna_accel_close(struct sna *sna)
 	sna_glyphs_close(sna);
 
 	DeleteCallback(&FlushCallback, sna_accel_flush_callback, sna);
+	DeleteCallback(&ReplyCallback, sna_accel_reply_callback, sna);
 
 	kgem_cleanup_cache(&sna->kgem);
 }
@@ -11976,8 +12023,6 @@ void sna_accel_wakeup_handler(struct sna *sna, fd_set *ready)
 	for (id = 0; id < NUM_TIMERS; id++)
 		if (active & (1 << id) && FD_ISSET(sna->timer[id], ready))
 			sna->timer_ready |= 1 << id;
-
-	sna_deferred_free(sna);
 }
 
 void sna_accel_free(struct sna *sna)
diff --git a/src/sna/sna_blt.c b/src/sna/sna_blt.c
index a9ec899..8c51a77 100644
--- a/src/sna/sna_blt.c
+++ b/src/sna/sna_blt.c
@@ -907,7 +907,7 @@ prepare_blt_clear(struct sna *sna,
 {
 	DBG(("%s\n", __FUNCTION__));
 
-	op->blt   = blt_composite_fill;
+	op->blt = blt_composite_fill;
 	if (op->dst.x|op->dst.y) {
 		op->box   = blt_composite_fill_box;
 		op->boxes = blt_composite_fill_boxes;
@@ -915,7 +915,7 @@ prepare_blt_clear(struct sna *sna,
 		op->box   = blt_composite_fill_box_no_offset;
 		op->boxes = blt_composite_fill_boxes_no_offset;
 	}
-	op->done  = nop_done;
+	op->done = nop_done;
 
 	return sna_blt_fill_init(sna, &op->u.blt,
 				 op->dst.bo,
@@ -930,7 +930,7 @@ prepare_blt_fill(struct sna *sna,
 {
 	DBG(("%s\n", __FUNCTION__));
 
-	op->blt   = blt_composite_fill;
+	op->blt = blt_composite_fill;
 	if (op->dst.x|op->dst.y) {
 		op->box   = blt_composite_fill_box;
 		op->boxes = blt_composite_fill_boxes;
@@ -938,7 +938,7 @@ prepare_blt_fill(struct sna *sna,
 		op->box   = blt_composite_fill_box_no_offset;
 		op->boxes = blt_composite_fill_boxes_no_offset;
 	}
-	op->done  = nop_done;
+	op->done = nop_done;
 
 	return sna_blt_fill_init(sna, &op->u.blt, op->dst.bo,
 				 op->dst.pixmap->drawable.bitsPerPixel,
@@ -1126,9 +1126,9 @@ prepare_blt_copy(struct sna *sna,
 	DBG(("%s\n", __FUNCTION__));
 
 	if (sna->kgem.gen >= 60)
-		op->done  = gen6_blt_copy_done;
+		op->done = gen6_blt_copy_done;
 	else
-		op->done  = nop_done;
+		op->done = nop_done;
 
 	if (alpha_fixup) {
 		op->blt   = blt_composite_copy_with_alpha;
@@ -1153,14 +1153,6 @@ prepare_blt_copy(struct sna *sna,
 	}
 }
 
-static void blt_vmap_done(struct sna *sna, const struct sna_composite_op *op)
-{
-	struct kgem_bo *bo = (struct kgem_bo *)op->u.blt.src_pixmap;
-
-	if (bo)
-		kgem_bo_destroy(&sna->kgem, bo);
-}
-
 fastcall static void
 blt_put_composite(struct sna *sna,
 		  const struct sna_composite_op *op,
@@ -1395,26 +1387,18 @@ prepare_blt_put(struct sna *sna,
 		uint32_t alpha_fixup)
 {
 	PixmapPtr src = op->u.blt.src_pixmap;
-	struct sna_pixmap *priv = sna_pixmap_attach(src);
-	struct kgem_bo *src_bo = NULL;
-	struct kgem_bo *free_bo = NULL;
+	struct sna_pixmap *priv;
+	struct kgem_bo *src_bo;
 
 	DBG(("%s\n", __FUNCTION__));
 
-	if (priv) {
+	op->done = nop_done;
+
+	src_bo = NULL;
+	priv = _sna_pixmap_attach(src);
+	if (priv)
 		src_bo = priv->cpu_bo;
-	} else {
-		src_bo = kgem_create_map(&sna->kgem,
-					 src->devPrivate.ptr,
-					 pixmap_size(src),
-					 0);
-		free_bo = src_bo;
-	}
 	if (src_bo) {
-		op->u.blt.src_pixmap = (void *)free_bo;
-		op->done = blt_vmap_done;
-
-		src_bo->pitch = src->devKind;
 		if (alpha_fixup) {
 			op->blt   = blt_composite_copy_with_alpha;
 			op->box   = blt_composite_copy_box_with_alpha;
@@ -1435,12 +1419,15 @@ prepare_blt_put(struct sna *sna,
 						 GXcopy);
 		}
 	} else {
-		if (alpha_fixup)
-			return FALSE; /* XXX */
-
 		if (!sna_pixmap_move_to_cpu(src, MOVE_READ))
 			return FALSE;
 
+		assert(src->devKind);
+		assert(src->devPrivate.ptr);
+
+		if (alpha_fixup)
+			return FALSE; /* XXX */
+
 		if (alpha_fixup) {
 			op->u.blt.pixel = alpha_fixup;
 			op->blt   = blt_put_composite_with_alpha;
@@ -1451,7 +1438,6 @@ prepare_blt_put(struct sna *sna,
 			op->box   = blt_put_composite_box;
 			op->boxes = blt_put_composite_boxes;
 		}
-		op->done  = nop_done;
 	}
 
 	return TRUE;
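
The net effect of the vmap rework above: a client-owned allocation (an
SHM segment or the posix_memalign'd pages from kgem_create_cpu_2d) is
wrapped as a GEM vmap bo, tracked on kgem->sync_list, and kgem_sync()
later stalls on the GPU before the CPU dereferences those pages again.
A minimal sketch of the intended call order, using only the entry
points from the diff (the helper name is invented for the
illustration; error handling and pixmap plumbing are elided):

    /* Assumes the driver-internal kgem.h declarations shown above. */
    static struct kgem_bo *wrap_client_memory(struct kgem *kgem,
                                              void *addr,
                                              int pitch, int height)
    {
            struct kgem_bo *bo;

            bo = kgem_create_map(kgem, addr, pitch * height, false);
            if (bo == NULL)
                    return NULL;

            bo->pitch = pitch;
            kgem_bo_set_sync(kgem, bo);  /* adds bo to kgem->sync_list */
            return bo;
    }

    /* ...the GPU renders from/to bo... then, before the client may
     * touch addr again (the ReplyCallback-driven flush path):
     *
     *      kgem_sync(kgem);  waits on the tracked request, then runs
     *                        kgem_bo_sync__cpu() over sync_list
     */
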
commit 46de691ed9bd4cc68b177af6d55288f36d229424
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Mar 1 11:10:03 2012 +0000

    sna: Discard use of inplace GTT uploads on LLC architectures
    
    As the buffer is cache-coherent, we can read as well as write to any
    partial buffer, so the distinction is irrelevant.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index ee81c1b..3ba2ec3 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -1171,7 +1171,6 @@ static void kgem_retire_partials(struct kgem *kgem)
 		DBG(("%s: handle=%d, used %d/%d\n", __FUNCTION__,
 		     bo->base.handle, bo->used, bytes(&bo->base)));
 
-		assert(bo->write & KGEM_BUFFER_WRITE_INPLACE);
 		assert(kgem->has_llc || !IS_CPU_MAP(bo->base.map));
 		bo->base.dirty = false;
 		bo->base.needs_flush = false;
@@ -1377,7 +1376,6 @@ static void kgem_finish_partials(struct kgem *kgem)
 			continue;
 
 		if (bo->mmapped) {
-			assert(bo->write & KGEM_BUFFER_WRITE_INPLACE);
 			assert(!bo->need_io);
 			if (kgem->has_llc || !IS_CPU_MAP(bo->base.map)) {
 				DBG(("%s: retaining partial upload buffer (%d/%d)\n",
@@ -3242,10 +3240,14 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 	/* we should never be asked to create anything TOO large */
 	assert(size <= kgem->max_cpu_size);
 
+	if (kgem->has_llc)
+		flags &= ~KGEM_BUFFER_WRITE_INPLACE;
+
 	list_for_each_entry(bo, &kgem->partial, base.list) {
 		/* We can reuse any write buffer which we can fit */
 		if (flags == KGEM_BUFFER_LAST &&
-		    bo->write == KGEM_BUFFER_WRITE && bo->base.exec &&
+		    bo->write == KGEM_BUFFER_WRITE &&
+		    bo->base.exec && !bo->mmapped &&
 		    size <= bytes(&bo->base)) {
 			assert(bo->base.refcnt == 1);
 			DBG(("%s: reusing write buffer for read of %d bytes? used=%d, total=%d\n",
commit df20aa35d3c4e6207e0d5b2b05d8a892eb36f5ec
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Feb 28 13:37:14 2012 +0000

    sna: Sort the partial buffers after stealing a write buffer
    
    It will be decoupled and not used again, but this keeps the sanity
    checks happy.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 3dd7863..ee81c1b 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -3254,7 +3254,9 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 				  0, bo->used, bo->mem);
 			bo->need_io = 0;
 			bo->write = 0;
-			offset = bo->used = 0;
+			offset = 0;
+			bo->used = size;
+			bubble_sort_partial(kgem, bo);
 			goto done;
 		}
 
commit bcb284ae6258cc5ee56373a5790cc4adba476ff4
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Feb 28 10:42:19 2012 +0000

    sna/gen3: Tweak glyph rendering fast paths
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/gen3_render.c b/src/sna/gen3_render.c
index 8f597cf..bd1eddd 100644
--- a/src/sna/gen3_render.c
+++ b/src/sna/gen3_render.c
@@ -524,6 +524,31 @@ gen3_emit_composite_primitive_identity_source(struct sna *sna,
 }
 
 fastcall static void
+gen3_emit_composite_primitive_identity_source_no_offset(struct sna *sna,
+							const struct sna_composite_op *op,
+							const struct sna_composite_rectangles *r)
+{
+	float w = r->width;
+	float h = r->height;
+	float *v;
+
+	v = sna->render.vertices + sna->render.vertex_used;
+	sna->render.vertex_used += 12;
+
+	v[8] = v[4] = r->dst.x;
+	v[9] = r->dst.y;
+
+	v[0] = v[4] + w;
+	v[5] = v[1] = v[9] + h;
+
+	v[10] = v[6] = r->src.x * op->src.scale[0];
+	v[11] = r->src.y * op->src.scale[1];
+
+	v[2] = v[6] + w * op->src.scale[0];
+	v[7] = v[3] = v[11] + h * op->src.scale[1];
+}
+
+fastcall static void
 gen3_emit_composite_primitive_affine_source(struct sna *sna,
 					    const struct sna_composite_op *op,
 					    const struct sna_composite_rectangles *r)
@@ -584,6 +609,31 @@ gen3_emit_composite_primitive_constant_identity_mask(struct sna *sna,
 }
 
 fastcall static void
+gen3_emit_composite_primitive_constant_identity_mask_no_offset(struct sna *sna,
+							       const struct sna_composite_op *op,
+							       const struct sna_composite_rectangles *r)
+{
+	float w = r->width;
+	float h = r->height;
+	float *v;
+
+	v = sna->render.vertices + sna->render.vertex_used;
+	sna->render.vertex_used += 12;
+
+	v[8] = v[4] = r->dst.x;
+	v[9] = r->dst.y;
+
+	v[0] = v[4] + w;
+	v[5] = v[1] = v[9] + h;
+
+	v[10] = v[6] = r->mask.x * op->mask.scale[0];
+	v[11] = r->mask.y * op->mask.scale[1];
+
+	v[2] = v[6] + w * op->mask.scale[0];
+	v[7] = v[3] = v[11] + h * op->mask.scale[1];
+}
+
+fastcall static void
 gen3_emit_composite_primitive_identity_source_mask(struct sna *sna,
 						   const struct sna_composite_op *op,
 						   const struct sna_composite_rectangles *r)
@@ -2831,17 +2881,23 @@ gen3_render_composite(struct sna *sna,
 				tmp->prim_emit = gen3_emit_composite_primitive_affine_gradient;
 			break;
 		case SHADER_TEXTURE:
-			if (tmp->src.transform == NULL)
-				tmp->prim_emit = gen3_emit_composite_primitive_identity_source;
-			else if (tmp->src.is_affine)
+			if (tmp->src.transform == NULL) {
+				if ((tmp->src.offset[0]|tmp->src.offset[1]|tmp->dst.x|tmp->dst.y) == 0)
+					tmp->prim_emit = gen3_emit_composite_primitive_identity_source_no_offset;
+				else
+					tmp->prim_emit = gen3_emit_composite_primitive_identity_source;
+			} else if (tmp->src.is_affine)
 				tmp->prim_emit = gen3_emit_composite_primitive_affine_source;
 			break;
 		}
 	} else if (tmp->mask.u.gen3.type == SHADER_TEXTURE) {
 		if (tmp->mask.transform == NULL) {
-			if (is_constant_ps(tmp->src.u.gen3.type))
-				tmp->prim_emit = gen3_emit_composite_primitive_constant_identity_mask;
-			else if (tmp->src.transform == NULL)
+			if (is_constant_ps(tmp->src.u.gen3.type)) {
+				if ((tmp->mask.offset[0]|tmp->mask.offset[1]|tmp->dst.x|tmp->dst.y) == 0)
+					tmp->prim_emit = gen3_emit_composite_primitive_constant_identity_mask_no_offset;
+				else
+					tmp->prim_emit = gen3_emit_composite_primitive_constant_identity_mask;
+			} else if (tmp->src.transform == NULL)
 				tmp->prim_emit = gen3_emit_composite_primitive_identity_source_mask;
 			else if (tmp->src.is_affine)
 				tmp->prim_emit = gen3_emit_composite_primitive_affine_source_mask;
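
Note how cheaply the fast path is selected: the four offsets are OR'd
together and compared against zero once. A standalone check of that
screening test (the function name is invented for the illustration):

    #include <assert.h>

    /* Any nonzero offset makes the OR nonzero, so a single compare
     * decides whether the no-offset emitter, which skips the offset
     * additions per rectangle, is safe to use. */
    static int use_no_offset_path(int off_x, int off_y,
                                  int dst_x, int dst_y)
    {
            return (off_x | off_y | dst_x | dst_y) == 0;
    }

    int main(void)
    {
            assert(use_no_offset_path(0, 0, 0, 0));
            assert(!use_no_offset_path(0, 8, 0, 0));
            return 0;
    }
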
commit 368213b8e0df97329f56361a7e560ab444a3fceb
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon Feb 27 16:29:38 2012 +0000

    uxa/gen3: Remove special casing of solid pictures
    
    Fixes use of alpha-groups and opacity masks in cairo.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/i915_render.c b/src/i915_render.c
index 87d2336..6210035 100644
--- a/src/i915_render.c
+++ b/src/i915_render.c
@@ -257,11 +257,8 @@ i915_check_composite_texture(ScreenPtr screen, PicturePtr picture)
 		return FALSE;
 	}
 
-	if (picture->pSourcePict) {
-		SourcePict *source = picture->pSourcePict;
-		if (source->type == SourcePictTypeSolidFill)
-			return TRUE;
-	}
+	if (picture->pSourcePict)
+		return FALSE;
 
 	if (picture->pDrawable) {
 		int w, h, i;
@@ -387,23 +384,6 @@ static Bool i915_texture_setup(PicturePtr picture, PixmapPtr pixmap, int unit)
 }
 
 static void
-i915_emit_composite_primitive_constant(intel_screen_private *intel,
-				       int srcX, int srcY,
-				       int maskX, int maskY,
-				       int dstX, int dstY,
-				       int w, int h)
-{
-	OUT_VERTEX(dstX + w);
-	OUT_VERTEX(dstY + h);
-
-	OUT_VERTEX(dstX);
-	OUT_VERTEX(dstY + h);
-
-	OUT_VERTEX(dstX);
-	OUT_VERTEX(dstY);
-}
-
-static void
 i915_emit_composite_primitive_identity_source(intel_screen_private *intel,
 					      int srcX, int srcY,
 					      int maskX, int maskY,
@@ -470,29 +450,6 @@ i915_emit_composite_primitive_affine_source(intel_screen_private *intel,
 }
 
 static void
-i915_emit_composite_primitive_constant_identity_mask(intel_screen_private *intel,
-						     int srcX, int srcY,
-						     int maskX, int maskY,
-						     int dstX, int dstY,
-						     int w, int h)
-{
-	OUT_VERTEX(dstX + w);
-	OUT_VERTEX(dstY + h);
-	OUT_VERTEX((maskX + w) * intel->scale_units[0][0]);
-	OUT_VERTEX((maskY + h) * intel->scale_units[0][1]);
-
-	OUT_VERTEX(dstX);
-	OUT_VERTEX(dstY + h);
-	OUT_VERTEX(maskX * intel->scale_units[0][0]);
-	OUT_VERTEX((maskY + h) * intel->scale_units[0][1]);
-
-	OUT_VERTEX(dstX);
-	OUT_VERTEX(dstY);
-	OUT_VERTEX(maskX * intel->scale_units[0][0]);
-	OUT_VERTEX(maskY * intel->scale_units[0][1]);
-}
-
-static void
 i915_emit_composite_primitive_identity_source_mask(intel_screen_private *intel,
 						   int srcX, int srcY,
 						   int maskX, int maskY,
@@ -536,63 +493,61 @@ i915_emit_composite_primitive(intel_screen_private *intel,
 
 	per_vertex = 2;		/* dest x/y */
 
-	if (! intel->render_source_is_solid) {
-		src_unit = tex_unit++;
-
-		is_affine_src = intel_transform_is_affine(intel->transform[src_unit]);
-		if (is_affine_src) {
-			if (!intel_get_transformed_coordinates(srcX, srcY,
-							      intel->
-							      transform[src_unit],
-							      &src_x[0],
-							      &src_y[0]))
-				return;
-
-			if (!intel_get_transformed_coordinates(srcX, srcY + h,
-							      intel->
-							      transform[src_unit],
-							      &src_x[1],
-							      &src_y[1]))
-				return;
-
-			if (!intel_get_transformed_coordinates(srcX + w, srcY + h,
-							      intel->
-							      transform[src_unit],
-							      &src_x[2],
-							      &src_y[2]))
-				return;
-
-			per_vertex += 2;	/* src x/y */
-		} else {
-			if (!intel_get_transformed_coordinates_3d(srcX, srcY,
-								 intel->
-								 transform[src_unit],
-								 &src_x[0],
-								 &src_y[0],
-								 &src_w[0]))
-				return;
-
-			if (!intel_get_transformed_coordinates_3d(srcX, srcY + h,
-								 intel->
-								 transform[src_unit],
-								 &src_x[1],
-								 &src_y[1],
-								 &src_w[1]))
-				return;
-
-			if (!intel_get_transformed_coordinates_3d(srcX + w, srcY + h,
-								 intel->
-								 transform[src_unit],
-								 &src_x[2],
-								 &src_y[2],
-								 &src_w[2]))
-				return;
-
-			per_vertex += 4;	/* src x/y/z/w */
-		}
+	src_unit = tex_unit++;
+
+	is_affine_src = intel_transform_is_affine(intel->transform[src_unit]);
+	if (is_affine_src) {
+		if (!intel_get_transformed_coordinates(srcX, srcY,
+						      intel->
+						      transform[src_unit],
+						      &src_x[0],
+						      &src_y[0]))
+			return;
+
+		if (!intel_get_transformed_coordinates(srcX, srcY + h,
+						      intel->
+						      transform[src_unit],
+						      &src_x[1],
+						      &src_y[1]))
+			return;
+
+		if (!intel_get_transformed_coordinates(srcX + w, srcY + h,
+						      intel->
+						      transform[src_unit],
+						      &src_x[2],
+						      &src_y[2]))
+			return;
+
+		per_vertex += 2;	/* src x/y */
+	} else {
+		if (!intel_get_transformed_coordinates_3d(srcX, srcY,
+							 intel->
+							 transform[src_unit],
+							 &src_x[0],
+							 &src_y[0],
+							 &src_w[0]))
+			return;
+
+		if (!intel_get_transformed_coordinates_3d(srcX, srcY + h,
+							 intel->
+							 transform[src_unit],
+							 &src_x[1],
+							 &src_y[1],
+							 &src_w[1]))
+			return;
+
+		if (!intel_get_transformed_coordinates_3d(srcX + w, srcY + h,
+							 intel->
+							 transform[src_unit],
+							 &src_x[2],
+							 &src_y[2],
+							 &src_w[2]))
+			return;
+
+		per_vertex += 4;	/* src x/y/z/w */
 	}
 
-	if (intel->render_mask && ! intel->render_mask_is_solid) {
+	if (intel->render_mask) {
 		mask_unit = tex_unit++;
 
 		is_affine_mask = intel_transform_is_affine(intel->transform[mask_unit]);
@@ -650,15 +605,13 @@ i915_emit_composite_primitive(intel_screen_private *intel,
 
 	OUT_VERTEX(dstX + w);
 	OUT_VERTEX(dstY + h);
-	if (! intel->render_source_is_solid) {
-	    OUT_VERTEX(src_x[2] * intel->scale_units[src_unit][0]);
-	    OUT_VERTEX(src_y[2] * intel->scale_units[src_unit][1]);
-	    if (!is_affine_src) {
+	OUT_VERTEX(src_x[2] * intel->scale_units[src_unit][0]);
+	OUT_VERTEX(src_y[2] * intel->scale_units[src_unit][1]);
+	if (!is_affine_src) {
 		OUT_VERTEX(0.0);
 		OUT_VERTEX(src_w[2]);
-	    }
 	}
-	if (intel->render_mask && ! intel->render_mask_is_solid) {
+	if (intel->render_mask) {
 		OUT_VERTEX(mask_x[2] * intel->scale_units[mask_unit][0]);
 		OUT_VERTEX(mask_y[2] * intel->scale_units[mask_unit][1]);
 		if (!is_affine_mask) {
@@ -669,15 +622,13 @@ i915_emit_composite_primitive(intel_screen_private *intel,
 
 	OUT_VERTEX(dstX);
 	OUT_VERTEX(dstY + h);
-	if (! intel->render_source_is_solid) {
-	    OUT_VERTEX(src_x[1] * intel->scale_units[src_unit][0]);
-	    OUT_VERTEX(src_y[1] * intel->scale_units[src_unit][1]);
-	    if (!is_affine_src) {
+	OUT_VERTEX(src_x[1] * intel->scale_units[src_unit][0]);
+	OUT_VERTEX(src_y[1] * intel->scale_units[src_unit][1]);
+	if (!is_affine_src) {
 		OUT_VERTEX(0.0);
 		OUT_VERTEX(src_w[1]);
-	    }
 	}
-	if (intel->render_mask && ! intel->render_mask_is_solid) {
+	if (intel->render_mask) {
 		OUT_VERTEX(mask_x[1] * intel->scale_units[mask_unit][0]);
 		OUT_VERTEX(mask_y[1] * intel->scale_units[mask_unit][1]);
 		if (!is_affine_mask) {
@@ -688,15 +639,13 @@ i915_emit_composite_primitive(intel_screen_private *intel,
 
 	OUT_VERTEX(dstX);
 	OUT_VERTEX(dstY);
-	if (! intel->render_source_is_solid) {
-	    OUT_VERTEX(src_x[0] * intel->scale_units[src_unit][0]);
-	    OUT_VERTEX(src_y[0] * intel->scale_units[src_unit][1]);
-	    if (!is_affine_src) {
+	OUT_VERTEX(src_x[0] * intel->scale_units[src_unit][0]);
+	OUT_VERTEX(src_y[0] * intel->scale_units[src_unit][1]);
+	if (!is_affine_src) {
 		OUT_VERTEX(0.0);
 		OUT_VERTEX(src_w[0]);
-	    }
 	}
-	if (intel->render_mask && ! intel->render_mask_is_solid) {
+	if (intel->render_mask) {
 		OUT_VERTEX(mask_x[0] * intel->scale_units[mask_unit][0]);
 		OUT_VERTEX(mask_y[0] * intel->scale_units[mask_unit][1]);
 		if (!is_affine_mask) {
@@ -729,29 +678,11 @@ i915_prepare_composite(int op, PicturePtr source_picture,
 	intel->render_dest_picture = dest_picture;
 	intel->render_dest = dest;
 
-	intel->render_source_is_solid = FALSE;
-	if (source_picture->pSourcePict) {
-		SourcePict *source = source_picture->pSourcePict;
-		if (source->type == SourcePictTypeSolidFill) {
-			intel->render_source_is_solid = TRUE;
-			intel->render_source_solid = source->solidFill.color;
-		}
-	}
-	if (!intel->render_source_is_solid && !intel_check_pitch_3d(source))
+	if (!intel_check_pitch_3d(source))
 		return FALSE;
 
-	intel->render_mask_is_solid = FALSE;
-	if (mask) {
-		if (mask_picture->pSourcePict) {
-			SourcePict *source = mask_picture->pSourcePict;
-			if (source->type == SourcePictTypeSolidFill) {
-				intel->render_mask_is_solid = TRUE;
-				intel->render_mask_solid = source->solidFill.color;
-			}
-		}
-		if (!intel->render_mask_is_solid && !intel_check_pitch_3d(mask))
-			return FALSE;
-	}
+	if (mask && !intel_check_pitch_3d(mask))
+		return FALSE;
 
 	if (!intel_check_pitch_3d(dest))
 		return FALSE;
@@ -787,31 +718,27 @@ i915_prepare_composite(int op, PicturePtr source_picture,
 	intel->scale_units[1][1] = -1;
 
 	floats_per_vertex = 2;		/* dest x/y */
-	if (! intel->render_source_is_solid) {
-		if (!i915_texture_setup(source_picture, source, tex_unit++)) {
-			intel_debug_fallback(scrn, "fail to setup src texture\n");
-			return FALSE;
-		}
-
-		if (intel_transform_is_affine(source_picture->transform))
-			floats_per_vertex += 2;	/* src x/y */
-		else
-			floats_per_vertex += 4;	/* src x/y/z/w */
+	if (!i915_texture_setup(source_picture, source, tex_unit++)) {
+		intel_debug_fallback(scrn, "fail to setup src texture\n");
+		return FALSE;
 	}
 
-	if (mask != NULL) {
-		if (! intel->render_mask_is_solid) {
-			if (!i915_texture_setup(mask_picture, mask, tex_unit++)) {
-				intel_debug_fallback(scrn,
-						"fail to setup mask texture\n");
-				return FALSE;
-			}
+	if (intel_transform_is_affine(source_picture->transform))
+		floats_per_vertex += 2;	/* src x/y */
+	else
+		floats_per_vertex += 4;	/* src x/y/z/w */
 
-			if (intel_transform_is_affine(mask_picture->transform))
-				floats_per_vertex += 2;	/* mask x/y */
-			else
-				floats_per_vertex += 4;	/* mask x/y/z/w */
+	if (mask != NULL) {
+		if (!i915_texture_setup(mask_picture, mask, tex_unit++)) {
+			intel_debug_fallback(scrn,
+					     "fail to setup mask texture\n");
+			return FALSE;
 		}
+
+		if (intel_transform_is_affine(mask_picture->transform))
+			floats_per_vertex += 2;	/* mask x/y */
+		else
+			floats_per_vertex += 4;	/* mask x/y/z/w */
 	}
 
 	intel->i915_render_state.op = op;
@@ -827,17 +754,13 @@ i915_prepare_composite(int op, PicturePtr source_picture,
 
 	intel->prim_emit = i915_emit_composite_primitive;
 	if (!mask) {
-		if (intel->render_source_is_solid)
-			intel->prim_emit = i915_emit_composite_primitive_constant;
-		else if (intel->transform[0] == NULL)
+		if (intel->transform[0] == NULL)
 			intel->prim_emit = i915_emit_composite_primitive_identity_source;
 		else if (intel_transform_is_affine(intel->transform[0]))
 			intel->prim_emit = i915_emit_composite_primitive_affine_source;
 	} else {
 		if (intel->transform[0] == NULL) {
-			if (intel->render_source_is_solid)
-				intel->prim_emit = i915_emit_composite_primitive_constant_identity_mask;
-			else if (intel->transform[1] == NULL)
+			if (intel->transform[1] == NULL)
 				intel->prim_emit = i915_emit_composite_primitive_identity_source_mask;
 		}
 	}
@@ -856,39 +779,25 @@ i915_composite_emit_shader(intel_screen_private *intel, CARD8 op)
 	PicturePtr mask_picture = intel->render_mask_picture;
 	PixmapPtr mask = intel->render_mask;
 	int src_reg, mask_reg;
-	Bool is_solid_src, is_solid_mask;
 	Bool dest_is_alpha = PIXMAN_FORMAT_RGB(intel->render_dest_picture->format) == 0;
-	int tex_unit, t;
 	FS_LOCALS();
 
-	is_solid_src = intel->render_source_is_solid;
-	is_solid_mask = intel->render_mask_is_solid;
-
 	FS_BEGIN();
 
 	/* Declare the registers necessary for our program.  */
-	t = 0;
-	if (is_solid_src) {
-		i915_fs_dcl(FS_T8);
-		src_reg = FS_T8;
-	} else {
-		i915_fs_dcl(FS_T0);
-		i915_fs_dcl(FS_S0);
-		t++;
-	}
+	i915_fs_dcl(FS_T0);
+	i915_fs_dcl(FS_S0);
 	if (!mask) {
 		/* No mask, so load directly to output color */
-		if (! is_solid_src) {
-			if (dest_is_alpha)
-				src_reg = FS_R0;
-			else
-				src_reg = FS_OC;
+		if (dest_is_alpha)
+			src_reg = FS_R0;
+		else
+			src_reg = FS_OC;
 
-			if (intel_transform_is_affine(intel->transform[0]))
-				i915_fs_texld(src_reg, FS_S0, FS_T0);
-			else
-				i915_fs_texldp(src_reg, FS_S0, FS_T0);
-		}
+		if (intel_transform_is_affine(intel->transform[0]))
+			i915_fs_texld(src_reg, FS_S0, FS_T0);
+		else
+			i915_fs_texldp(src_reg, FS_S0, FS_T0);
 
 		if (src_reg != FS_OC) {
 			if (dest_is_alpha)
@@ -897,35 +806,24 @@ i915_composite_emit_shader(intel_screen_private *intel, CARD8 op)
 				i915_fs_mov(FS_OC, i915_fs_operand_reg(src_reg));
 		}
 	} else {
-		if (is_solid_mask) {
-			i915_fs_dcl(FS_T9);
-			mask_reg = FS_T9;
-		} else {
-			i915_fs_dcl(FS_T0 + t);
-			i915_fs_dcl(FS_S0 + t);
-		}
+		i915_fs_dcl(FS_T1);
+		i915_fs_dcl(FS_S1);
 
-		tex_unit = 0;
-		if (! is_solid_src) {
-			/* Load the source_picture texel */
-			if (intel_transform_is_affine(intel->transform[tex_unit]))
-				i915_fs_texld(FS_R0, FS_S0, FS_T0);
-			else
-				i915_fs_texldp(FS_R0, FS_S0, FS_T0);
+		/* Load the source_picture texel */
+		if (intel_transform_is_affine(intel->transform[0]))
+			i915_fs_texld(FS_R0, FS_S0, FS_T0);
+		else
+			i915_fs_texldp(FS_R0, FS_S0, FS_T0);
 
-			src_reg = FS_R0;
-			tex_unit++;
-		}
+		src_reg = FS_R0;
 
-		if (! is_solid_mask) {
-			/* Load the mask_picture texel */
-			if (intel_transform_is_affine(intel->transform[tex_unit]))
-				i915_fs_texld(FS_R1, FS_S0 + t, FS_T0 + t);
-			else
-				i915_fs_texldp(FS_R1, FS_S0 + t, FS_T0 + t);
+		/* Load the mask_picture texel */
+		if (intel_transform_is_affine(intel->transform[1]))
+			i915_fs_texld(FS_R1, FS_S1, FS_T1);
+		else
+			i915_fs_texldp(FS_R1, FS_S1, FS_T1);
 
-			mask_reg = FS_R1;
-		}
+		mask_reg = FS_R1;
 
 		if (dest_is_alpha) {
 			i915_fs_mul(FS_OC,
@@ -972,7 +870,6 @@ static void i915_emit_composite_setup(ScrnInfoPtr scrn)
 	PicturePtr dest_picture = intel->render_dest_picture;
 	PixmapPtr mask = intel->render_mask;
 	PixmapPtr dest = intel->render_dest;
-	Bool is_solid_src, is_solid_mask;
 	int tex_count, t;
 
 	intel->needs_render_state_emit = FALSE;
@@ -980,12 +877,7 @@ static void i915_emit_composite_setup(ScrnInfoPtr scrn)
 	IntelEmitInvarientState(scrn);
 	intel->last_3d = LAST_3D_RENDER;
 
-	is_solid_src = intel->render_source_is_solid;
-	is_solid_mask = intel->render_mask_is_solid;
-
-	tex_count = 0;
-	tex_count += ! is_solid_src;
-	tex_count += mask && ! is_solid_mask;
+	tex_count = 1 + (mask != NULL);
 
 	assert(intel->in_batch_atomic);
 
@@ -1007,15 +899,6 @@ static void i915_emit_composite_setup(ScrnInfoPtr scrn)
 	    }
 	}
 
-	if (is_solid_src) {
-	    OUT_BATCH (_3DSTATE_DFLT_DIFFUSE_CMD);
-	    OUT_BATCH (intel->render_source_solid);
-	}
-	if (mask && is_solid_mask) {
-	    OUT_BATCH (_3DSTATE_DFLT_SPEC_CMD);
-	    OUT_BATCH (intel->render_mask_solid);
-	}
-
 	/* BUF_INFO is an implicit flush, so avoid if the target has not changed.
 	 * XXX However for reasons unfathomed, correct rendering in KDE requires
 	 * at least a MI_FLUSH | INHIBIT_RENDER_CACHE_FLUSH here.
@@ -1058,20 +941,15 @@ static void i915_emit_composite_setup(ScrnInfoPtr scrn)
 		uint32_t ss2;
 
 		ss2 = ~0;
-		t = 0;
-		if (! is_solid_src) {
-		    ss2 &= ~S2_TEXCOORD_FMT(t, TEXCOORDFMT_NOT_PRESENT);
-		    ss2 |= S2_TEXCOORD_FMT(t,
-					   intel_transform_is_affine(intel->transform[t]) ?
-					   TEXCOORDFMT_2D : TEXCOORDFMT_4D);
-		    t++;
-		}
-		if (mask && ! is_solid_mask) {
-		    ss2 &= ~S2_TEXCOORD_FMT(t, TEXCOORDFMT_NOT_PRESENT);
-		    ss2 |= S2_TEXCOORD_FMT(t,
-					   intel_transform_is_affine(intel->transform[t]) ?
+		ss2 &= ~S2_TEXCOORD_FMT(0, TEXCOORDFMT_NOT_PRESENT);
+		ss2 |= S2_TEXCOORD_FMT(0,
+				       intel_transform_is_affine(intel->transform[0]) ?
+				       TEXCOORDFMT_2D : TEXCOORDFMT_4D);
+		if (mask) {
+		    ss2 &= ~S2_TEXCOORD_FMT(1, TEXCOORDFMT_NOT_PRESENT);
+		    ss2 |= S2_TEXCOORD_FMT(1,
+					   intel_transform_is_affine(intel->transform[1]) ?
 					   TEXCOORDFMT_2D : TEXCOORDFMT_4D);
-		    t++;
 		}
 
 		if (intel->needs_render_ca_pass) {
commit c0a5daa028eb74d1ca891b6605404600c11655d6
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon Feb 27 13:58:58 2012 +0000

    sna/gen2: Initialise channel.is_opaque for fills
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/gen2_render.c b/src/sna/gen2_render.c
index 64b4e7c..c3a16cb 100644
--- a/src/sna/gen2_render.c
+++ b/src/sna/gen2_render.c
@@ -1116,10 +1116,7 @@ gen2_composite_solid_init(struct sna *sna,
 {
 	channel->filter = PictFilterNearest;
 	channel->repeat = RepeatNormal;
-	channel->is_affine = TRUE;
 	channel->is_solid  = TRUE;
-	channel->is_linear = FALSE;
-	channel->transform = NULL;
 	channel->width  = 1;
 	channel->height = 1;
 	channel->pict_format = PICT_a8r8g8b8;
@@ -1169,11 +1166,7 @@ gen2_composite_linear_init(struct sna *sna,
 
 	channel->filter = PictFilterNearest;
 	channel->repeat = picture->repeat ? picture->repeatType : RepeatNone;
-	channel->is_affine = TRUE;
-	channel->is_opaque = FALSE;
-	channel->is_solid  = FALSE;
 	channel->is_linear = TRUE;
-	channel->transform = NULL;
 	channel->width  = channel->bo->pitch / 4;
 	channel->height = 1;
 	channel->pict_format = PICT_a8r8g8b8;
@@ -1331,6 +1324,9 @@ gen2_composite_picture(struct sna *sna,
 
 	channel->is_solid = FALSE;
 	channel->is_linear = FALSE;
+	channel->is_opaque = FALSE;
+	channel->is_affine = TRUE;
+	channel->transform = NULL;
 
 	if (sna_picture_is_solid(picture, &color))
 		return gen2_composite_solid_init(sna, channel, color);
@@ -1726,7 +1722,7 @@ gen2_render_composite(struct sna *sna,
 		return FALSE;
 	}
 
-	if (mask == NULL && sna->kgem.mode == KGEM_BLT  &&
+	if (mask == NULL && sna->kgem.mode == KGEM_BLT &&
 	    sna_blt_composite(sna, op,
 			      src, dst,
 			      src_x, src_y,
diff --git a/src/sna/sna_blt.c b/src/sna/sna_blt.c
index a672c46..a9ec899 100644
--- a/src/sna/sna_blt.c
+++ b/src/sna/sna_blt.c
@@ -667,7 +667,7 @@ is_solid(PicturePtr picture)
 	}
 
 	if (picture->pDrawable) {
-		if (picture->pDrawable->width == 1 &&
+		if (picture->pDrawable->width  == 1 &&
 		    picture->pDrawable->height == 1 &&
 		    picture->repeat)
 			return TRUE;
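
The gen2 fix is a default-then-override pattern: every per-channel flag
is reset once in gen2_composite_picture(), and the specialised
initialisers now set only what differs, so a stale is_opaque from an
earlier composite can no longer leak into a fill. A toy model (the
struct and helpers are invented for the sketch):

    struct toy_channel {
            int is_solid, is_linear, is_opaque, is_affine;
            void *transform;
    };

    static void channel_defaults(struct toy_channel *c)
    {
            c->is_solid  = 0;
            c->is_linear = 0;
            c->is_opaque = 0;       /* previously left stale for fills */
            c->is_affine = 1;
            c->transform = 0;
    }

    static void solid_init(struct toy_channel *c)
    {
            channel_defaults(c);    /* defaults first... */
            c->is_solid = 1;        /* ...then only the difference */
    }
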
commit 1798eda4a70ca76861b08f8d596fa37eca800c49
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon Feb 27 12:28:22 2012 +0000

    Revert "meh"
    
    This reverts commit 4adb6967a84af8a04769c2d936a41f4a49ed1428.
    
    Oops, this debugging commit was not intended to be pushed along with the
    bugfix. :(
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 27b9327..3dd7863 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -1376,10 +1376,15 @@ static void kgem_finish_partials(struct kgem *kgem)
 		if (!bo->base.exec)
 			continue;
 
-		if (bo->write & KGEM_BUFFER_WRITE_INPLACE) {
-			DBG(("%s: retaining partial upload buffer (%d/%d)\n",
-			     __FUNCTION__, bo->used, bytes(&bo->base)));
-			continue;
+		if (bo->mmapped) {
+			assert(bo->write & KGEM_BUFFER_WRITE_INPLACE);
+			assert(!bo->need_io);
+			if (kgem->has_llc || !IS_CPU_MAP(bo->base.map)) {
+				DBG(("%s: retaining partial upload buffer (%d/%d)\n",
+				     __FUNCTION__, bo->used, bytes(&bo->base)));
+				continue;
+			}
+			goto decouple;
 		}
 
 		if (!bo->used) {
commit 710c8b1b8b728bc7b1cadd72c4b14e66a3f29a60
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon Feb 27 12:06:50 2012 +0000

    sna: Upload the ordinary partial buffers!
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 48c131b..27b9327 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -1393,6 +1393,7 @@ static void kgem_finish_partials(struct kgem *kgem)
 
 		assert(bo->base.rq == kgem->next_request);
 		assert(bo->base.domain != DOMAIN_GPU);
+		assert(bo->need_io);
 
 		if (bo->base.refcnt == 1 && bo->used < bytes(&bo->base) / 2) {
 			struct kgem_bo *shrink;
@@ -1439,16 +1440,16 @@ static void kgem_finish_partials(struct kgem *kgem)
 				bubble_sort_partial(kgem, bo);
 				continue;
 			}
-
-			DBG(("%s: handle=%d, uploading %d/%d\n",
-			     __FUNCTION__, bo->base.handle, bo->used, bytes(&bo->base)));
-			assert(!kgem_busy(kgem, bo->base.handle));
-			assert(bo->used <= bytes(&bo->base));
-			gem_write(kgem->fd, bo->base.handle,
-				  0, bo->used, bo->mem);
-			bo->need_io = 0;
 		}
 
+		DBG(("%s: handle=%d, uploading %d/%d\n",
+		     __FUNCTION__, bo->base.handle, bo->used, bytes(&bo->base)));
+		assert(!kgem_busy(kgem, bo->base.handle));
+		assert(bo->used <= bytes(&bo->base));
+		gem_write(kgem->fd, bo->base.handle,
+			  0, bo->used, bo->mem);
+		bo->need_io = 0;
+
 decouple:
 		list_del(&bo->base.list);
 		kgem_bo_unref(kgem, &bo->base);
commit ba390c9ed2387e79b721c4fe103cb1e3c14ab545
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon Feb 27 11:36:35 2012 +0000

    meh

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 2097994..48c131b 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -1376,15 +1376,10 @@ static void kgem_finish_partials(struct kgem *kgem)
 		if (!bo->base.exec)
 			continue;
 
-		if (bo->mmapped) {
-			assert(bo->write & KGEM_BUFFER_WRITE_INPLACE);
-			assert(!bo->need_io);
-			if (kgem->has_llc || !IS_CPU_MAP(bo->base.map)) {
-				DBG(("%s: retaining partial upload buffer (%d/%d)\n",
-				     __FUNCTION__, bo->used, bytes(&bo->base)));
-				continue;
-			}
-			goto decouple;
+		if (bo->write & KGEM_BUFFER_WRITE_INPLACE) {
+			DBG(("%s: retaining partial upload buffer (%d/%d)\n",
+			     __FUNCTION__, bo->used, bytes(&bo->base)));
+			continue;
 		}
 
 		if (!bo->used) {
@@ -3471,8 +3466,6 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 			}
 			DBG(("%s: created handle=%d for buffer\n",
 			     __FUNCTION__, bo->base.handle));
-
-			bo->base.domain = DOMAIN_CPU;
 		}
 
 		bo->mem = kgem_bo_map__cpu(kgem, &bo->base);
commit 9da8867da35649abd01e6d990577cc2bb8f7d718
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sun Feb 26 22:34:30 2012 +0000

    sna: Avoid reusing mmapped partial write buffers for readback
    
    An artefact of retaining the mmapped partial buffers is that doing so
    magnified the effect of stealing them for readback, causing extra
    writes on non-llc platforms.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 4c70ad9..2097994 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -1378,11 +1378,13 @@ static void kgem_finish_partials(struct kgem *kgem)
 
 		if (bo->mmapped) {
 			assert(bo->write & KGEM_BUFFER_WRITE_INPLACE);
+			assert(!bo->need_io);
 			if (kgem->has_llc || !IS_CPU_MAP(bo->base.map)) {
 				DBG(("%s: retaining partial upload buffer (%d/%d)\n",
 				     __FUNCTION__, bo->used, bytes(&bo->base)));
 				continue;
 			}
+			goto decouple;
 		}
 
 		if (!bo->used) {
@@ -1395,55 +1397,52 @@ static void kgem_finish_partials(struct kgem *kgem)
 		}
 
 		assert(bo->base.rq == kgem->next_request);
-		if (bo->used && bo->need_io) {
-			assert(bo->base.domain != DOMAIN_GPU);
-
-			if (bo->base.refcnt == 1 &&
-			    bo->used < bytes(&bo->base) / 2) {
-				struct kgem_bo *shrink;
-
-				shrink = search_linear_cache(kgem,
-							     PAGE_ALIGN(bo->used),
-							     CREATE_INACTIVE);
-				if (shrink) {
-					int n;
-
-					DBG(("%s: used=%d, shrinking %d to %d, handle %d to %d\n",
-					     __FUNCTION__,
-					     bo->used, bytes(&bo->base), bytes(shrink),
-					     bo->base.handle, shrink->handle));
-
-					assert(bo->used <= bytes(shrink));
-					gem_write(kgem->fd, shrink->handle,
-						  0, bo->used, bo->mem);
-
-					for (n = 0; n < kgem->nreloc; n++) {
-						if (kgem->reloc[n].target_handle == bo->base.handle) {
-							kgem->reloc[n].target_handle = shrink->handle;
-							kgem->reloc[n].presumed_offset = shrink->presumed_offset;
-							kgem->batch[kgem->reloc[n].offset/sizeof(kgem->batch[0])] =
-								kgem->reloc[n].delta + shrink->presumed_offset;
-						}
+		assert(bo->base.domain != DOMAIN_GPU);
+
+		if (bo->base.refcnt == 1 && bo->used < bytes(&bo->base) / 2) {
+			struct kgem_bo *shrink;
+
+			shrink = search_linear_cache(kgem,
+						     PAGE_ALIGN(bo->used),
+						     CREATE_INACTIVE);
+			if (shrink) {
+				int n;
+
+				DBG(("%s: used=%d, shrinking %d to %d, handle %d to %d\n",
+				     __FUNCTION__,
+				     bo->used, bytes(&bo->base), bytes(shrink),
+				     bo->base.handle, shrink->handle));
+
+				assert(bo->used <= bytes(shrink));
+				gem_write(kgem->fd, shrink->handle,
+					  0, bo->used, bo->mem);
+
+				for (n = 0; n < kgem->nreloc; n++) {
+					if (kgem->reloc[n].target_handle == bo->base.handle) {
+						kgem->reloc[n].target_handle = shrink->handle;
+						kgem->reloc[n].presumed_offset = shrink->presumed_offset;
+						kgem->batch[kgem->reloc[n].offset/sizeof(kgem->batch[0])] =
+							kgem->reloc[n].delta + shrink->presumed_offset;
 					}
-
-					bo->base.exec->handle = shrink->handle;
-					bo->base.exec->offset = shrink->presumed_offset;
-					shrink->exec = bo->base.exec;
-					shrink->rq = bo->base.rq;
-					list_replace(&bo->base.request,
-						     &shrink->request);
-					list_init(&bo->base.request);
-					shrink->needs_flush = bo->base.dirty;
-
-					bo->base.exec = NULL;
-					bo->base.rq = NULL;
-					bo->base.dirty = false;
-					bo->base.needs_flush = false;
-					bo->used = 0;
-
-					bubble_sort_partial(kgem, bo);
-					continue;
 				}
+
+				bo->base.exec->handle = shrink->handle;
+				bo->base.exec->offset = shrink->presumed_offset;
+				shrink->exec = bo->base.exec;
+				shrink->rq = bo->base.rq;
+				list_replace(&bo->base.request,
+					     &shrink->request);
+				list_init(&bo->base.request);
+				shrink->needs_flush = bo->base.dirty;
+
+				bo->base.exec = NULL;
+				bo->base.rq = NULL;
+				bo->base.dirty = false;
+				bo->base.needs_flush = false;
+				bo->used = 0;
+
+				bubble_sort_partial(kgem, bo);
+				continue;
 			}
 
 			DBG(("%s: handle=%d, uploading %d/%d\n",
@@ -3243,29 +3242,19 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 	assert(size <= kgem->max_cpu_size);
 
 	list_for_each_entry(bo, &kgem->partial, base.list) {
-		if (flags == KGEM_BUFFER_LAST && bo->write) {
-			/* We can reuse any write buffer which we can fit */
-			if (size <= bytes(&bo->base)) {
-				if (bo->base.refcnt == 1 && bo->base.exec) {
-					DBG(("%s: reusing write buffer for read of %d bytes? used=%d, total=%d\n",
-					     __FUNCTION__, size, bo->used, bytes(&bo->base)));
-					gem_write(kgem->fd, bo->base.handle,
-						  0, bo->used, bo->mem);
-					bo->need_io = 0;
-					bo->write = 0;
-					offset = 0;
-					goto done;
-				} else if (bo->used + size <= bytes(&bo->base)) {
-					DBG(("%s: reusing unfinished write buffer for read of %d bytes? used=%d, total=%d\n",
-					     __FUNCTION__, size, bo->used, bytes(&bo->base)));
-					gem_write(kgem->fd, bo->base.handle,
-						  0, bo->used, bo->mem);
-					bo->need_io = 0;
-					bo->write = 0;
-					offset = bo->used;
-					goto done;
-				}
-			}
+		/* We can reuse any write buffer which we can fit */
+		if (flags == KGEM_BUFFER_LAST &&
+		    bo->write == KGEM_BUFFER_WRITE && bo->base.exec &&
+		    size <= bytes(&bo->base)) {
+			assert(bo->base.refcnt == 1);
+			DBG(("%s: reusing write buffer for read of %d bytes? used=%d, total=%d\n",
+			     __FUNCTION__, size, bo->used, bytes(&bo->base)));
+			gem_write(kgem->fd, bo->base.handle,
+				  0, bo->used, bo->mem);
+			bo->need_io = 0;
+			bo->write = 0;
+			offset = bo->used = 0;
+			goto done;
 		}
 
 		if ((bo->write & KGEM_BUFFER_WRITE) != (flags & KGEM_BUFFER_WRITE) ||
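
Taken together, the hunks above implement a single policy: at submit
time an mmapped partial buffer is retained for reuse only while its
mapping stays coherent (LLC machines, or a GTT map); a CPU map on a
non-LLC machine is decoupled; and buffers staged through need_io are
flushed with a pwrite. A minimal sketch of that decision, using
hypothetical helpers rather than the driver's actual code:

    /* Sketch only: mirrors the retain/decouple/upload choice above. */
    enum partial_action { RETAIN, DECOUPLE, UPLOAD };

    static enum partial_action
    finish_partial(int mmapped, int cpu_mapped, int has_llc)
    {
            if (mmapped) {
                    /* in-place writes: keep iff the map stays coherent */
                    return (has_llc || !cpu_mapped) ? RETAIN : DECOUPLE;
            }
            /* staged writes (need_io): pwrite the data, then recycle */
            return UPLOAD;
    }
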
commit cfc763918759e746a8df5df75516d29948e2f093
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sat Feb 25 10:59:14 2012 +0000

    sna: Retain unfinished partial buffers between batches
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 4051892..4c70ad9 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -1131,6 +1131,56 @@ static void kgem_bo_unref(struct kgem *kgem, struct kgem_bo *bo)
 		__kgem_bo_destroy(kgem, bo);
 }
 
+static void bubble_sort_partial(struct kgem *kgem, struct kgem_partial_bo *bo)
+{
+	int remain = bytes(&bo->base) - bo->used;
+
+	while (bo->base.list.prev != &kgem->partial) {
+		struct kgem_partial_bo *p;
+
+		p = list_entry(bo->base.list.prev,
+			       struct kgem_partial_bo,
+			       base.list);
+		if (remain <= bytes(&p->base) - p->used)
+			break;
+
+		assert(p->base.list.next == &bo->base.list);
+		bo->base.list.prev = p->base.list.prev;
+		p->base.list.prev->next = &bo->base.list;
+		p->base.list.prev = &bo->base.list;
+
+		p->base.list.next = bo->base.list.next;
+		bo->base.list.next->prev = &p->base.list;
+		bo->base.list.next = &p->base.list;
+
+		assert(p->base.list.next->prev == &p->base.list);
+		assert(bo->base.list.prev->next == &bo->base.list);
+	}
+}
+
+static void kgem_retire_partials(struct kgem *kgem)
+{
+	struct kgem_partial_bo *bo, *next;
+
+	list_for_each_entry_safe(bo, next, &kgem->partial, base.list) {
+		if (bo->used == 0 || !bo->mmapped)
+			continue;
+		if (bo->base.refcnt != 1 || bo->base.rq)
+			continue;
+
+		DBG(("%s: handle=%d, used %d/%d\n", __FUNCTION__,
+		     bo->base.handle, bo->used, bytes(&bo->base)));
+
+		assert(bo->write & KGEM_BUFFER_WRITE_INPLACE);
+		assert(kgem->has_llc || !IS_CPU_MAP(bo->base.map));
+		bo->base.dirty = false;
+		bo->base.needs_flush = false;
+		bo->used = 0;
+
+		bubble_sort_partial(kgem, bo);
+	}
+}
+
 bool kgem_retire(struct kgem *kgem)
 {
 	struct kgem_bo *bo, *next;
@@ -1233,6 +1283,8 @@ bool kgem_retire(struct kgem *kgem)
 		free(rq);
 	}
 
+	kgem_retire_partials(kgem);
+
 	kgem->need_retire = !list_is_empty(&kgem->requests);
 	DBG(("%s -- need_retire=%d\n", __FUNCTION__, kgem->need_retire));
 
@@ -1311,33 +1363,6 @@ static void kgem_close_inactive(struct kgem *kgem)
 		kgem_close_list(kgem, &kgem->inactive[i]);
 }
 
-static void bubble_sort_partial(struct kgem *kgem, struct kgem_partial_bo *bo)
-{
-	int remain = bytes(&bo->base) - bo->used;
-
-	while (bo->base.list.prev != &kgem->partial) {
-		struct kgem_partial_bo *p;
-
-		p = list_entry(bo->base.list.prev,
-			       struct kgem_partial_bo,
-			       base.list);
-		if (remain <= bytes(&p->base) - p->used)
-			break;
-
-		assert(p->base.list.next == &bo->base.list);
-		bo->base.list.prev = p->base.list.prev;
-		p->base.list.prev->next = &bo->base.list;
-		p->base.list.prev = &bo->base.list;
-
-		p->base.list.next = bo->base.list.next;
-		bo->base.list.next->prev = &p->base.list;
-		bo->base.list.next = &p->base.list;
-
-		assert(p->base.list.next->prev == &p->base.list);
-		assert(bo->base.list.prev->next == &bo->base.list);
-	}
-}
-
 static void kgem_finish_partials(struct kgem *kgem)
 {
 	struct kgem_partial_bo *bo, *next;
@@ -1348,10 +1373,18 @@ static void kgem_finish_partials(struct kgem *kgem)
 			goto decouple;
 		}
 
-		assert(bo->base.domain != DOMAIN_GPU);
 		if (!bo->base.exec)
 			continue;
 
+		if (bo->mmapped) {
+			assert(bo->write & KGEM_BUFFER_WRITE_INPLACE);
+			if (kgem->has_llc || !IS_CPU_MAP(bo->base.map)) {
+				DBG(("%s: retaining partial upload buffer (%d/%d)\n",
+				     __FUNCTION__, bo->used, bytes(&bo->base)));
+				continue;
+			}
+		}
+
 		if (!bo->used) {
 			/* Unless we replace the handle in the execbuffer,
 			 * then this bo will become active. So decouple it
@@ -1363,6 +1396,8 @@ static void kgem_finish_partials(struct kgem *kgem)
 
 		assert(bo->base.rq == kgem->next_request);
 		if (bo->used && bo->need_io) {
+			assert(bo->base.domain != DOMAIN_GPU);
+
 			if (bo->base.refcnt == 1 &&
 			    bo->used < bytes(&bo->base) / 2) {
 				struct kgem_bo *shrink;
@@ -1768,7 +1803,7 @@ static void kgem_expire_partial(struct kgem *kgem)
 	struct kgem_partial_bo *bo, *next;
 
 	list_for_each_entry_safe(bo, next, &kgem->partial, base.list) {
-		if (bo->base.refcnt > 1 || bo->base.exec)
+		if (bo->base.refcnt > 1 || bo->base.rq)
 			continue;
 
 		DBG(("%s: discarding unused partial buffer: %d/%d, write? %d\n",
@@ -3214,11 +3249,19 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 				if (bo->base.refcnt == 1 && bo->base.exec) {
 					DBG(("%s: reusing write buffer for read of %d bytes? used=%d, total=%d\n",
 					     __FUNCTION__, size, bo->used, bytes(&bo->base)));
+					gem_write(kgem->fd, bo->base.handle,
+						  0, bo->used, bo->mem);
+					bo->need_io = 0;
+					bo->write = 0;
 					offset = 0;
 					goto done;
 				} else if (bo->used + size <= bytes(&bo->base)) {
 					DBG(("%s: reusing unfinished write buffer for read of %d bytes? used=%d, total=%d\n",
 					     __FUNCTION__, size, bo->used, bytes(&bo->base)));
+					gem_write(kgem->fd, bo->base.handle,
+						  0, bo->used, bo->mem);
+					bo->need_io = 0;
+					bo->write = 0;
 					offset = bo->used;
 					goto done;
 				}
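
Note that bubble_sort_partial() above is not a general-purpose sort:
the partial list is kept ordered by available space, largest first, and
the function is only called after a single buffer's used count has been
reset, so only that one entry can be out of place, and only in the
direction of the head. One bubble pass restores the invariant. The same
idea on a plain array, as a hedged sketch:

    /* Sketch: bubble entry i towards the head after its free space
     * grew, keeping the array ordered by free space, largest first. */
    struct pbuf { int size, used; };

    static void bubble_towards_head(struct pbuf *p, int i)
    {
            while (i > 0 &&
                   p[i].size - p[i].used > p[i - 1].size - p[i - 1].used) {
                    struct pbuf tmp = p[i];
                    p[i] = p[i - 1];
                    p[i - 1] = tmp;
                    i--;
            }
    }
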
commit 9e5154335483a85166aa84a27a370d750cdf321f
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sat Feb 25 09:32:20 2012 +0000

    sna/gen3+: Keep the vertex buffer resident between batches
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/gen3_render.c b/src/sna/gen3_render.c
index b50d067..8f597cf 100644
--- a/src/sna/gen3_render.c
+++ b/src/sna/gen3_render.c
@@ -1612,20 +1612,33 @@ static int gen3_vertex_finish(struct sna *sna)
 
 static void gen3_vertex_close(struct sna *sna)
 {
-	struct kgem_bo *bo;
-	int delta = 0;
+	struct kgem_bo *bo, *free_bo = NULL;
+	unsigned int delta = 0;
+
+	assert(sna->render_state.gen3.vertex_offset == 0);
 
-	if (!sna->render.vertex_used) {
+	DBG(("%s: used=%d, vbo active? %d\n",
+	     __FUNCTION__, sna->render.vertex_used, sna->render.vbo != NULL));
+
+	if (sna->render.vertex_used == 0) {
 		assert(sna->render.vbo == NULL);
 		assert(sna->render.vertices == sna->render.vertex_data);
 		assert(sna->render.vertex_size == ARRAY_SIZE(sna->render.vertex_data));
 		return;
 	}
 
-	DBG(("%s: used=%d\n", __FUNCTION__, sna->render.vertex_used));
-
 	bo = sna->render.vbo;
-	if (bo == NULL) {
+	if (bo) {
+		if (IS_CPU_MAP(bo->map) ||
+		    sna->render.vertex_size - sna->render.vertex_used < 64) {
+			DBG(("%s: discarding vbo (was CPU mapped)\n",
+			     __FUNCTION__));
+			sna->render.vbo = NULL;
+			sna->render.vertices = sna->render.vertex_data;
+			sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data);
+			free_bo = bo;
+		}
+	} else {
 		if (sna->kgem.nbatch + sna->render.vertex_used <= sna->kgem.surface) {
 			DBG(("%s: copy to batch: %d @ %d\n", __FUNCTION__,
 			     sna->render.vertex_used, sna->kgem.nbatch));
@@ -1636,36 +1649,37 @@ static void gen3_vertex_close(struct sna *sna)
 			bo = NULL;
 			sna->kgem.nbatch += sna->render.vertex_used;
 		} else {
-			bo = kgem_create_linear(&sna->kgem,
-						4*sna->render.vertex_used);
-			if (bo && !kgem_bo_write(&sna->kgem, bo,
-						 sna->render.vertex_data,
-						 4*sna->render.vertex_used)) {
-				kgem_bo_destroy(&sna->kgem, bo);
-				goto reset;
-			}
 			DBG(("%s: new vbo: %d\n", __FUNCTION__,
 			     sna->render.vertex_used));
+			bo = kgem_create_linear(&sna->kgem,
+						4*sna->render.vertex_used);
+			if (bo)
+				kgem_bo_write(&sna->kgem, bo,
+					      sna->render.vertex_data,
+					      4*sna->render.vertex_used);
+			free_bo = bo;
 		}
 	}
 
 	DBG(("%s: reloc = %d\n", __FUNCTION__,
 	     sna->render.vertex_reloc[0]));
 
-	sna->kgem.batch[sna->render.vertex_reloc[0]] =
-		kgem_add_reloc(&sna->kgem, sna->render.vertex_reloc[0],
-			       bo, I915_GEM_DOMAIN_VERTEX << 16, delta);
-	if (bo)
-		kgem_bo_destroy(&sna->kgem, bo);
+	if (sna->render.vertex_reloc[0]) {
+		sna->kgem.batch[sna->render.vertex_reloc[0]] =
+			kgem_add_reloc(&sna->kgem, sna->render.vertex_reloc[0],
+				       bo, I915_GEM_DOMAIN_VERTEX << 16, delta);
+		sna->render.vertex_reloc[0] = 0;
+	}
 
-reset:
-	sna->render.vertex_reloc[0] = 0;
-	sna->render.vertex_used = 0;
-	sna->render.vertex_index = 0;
+	if (sna->render.vbo == NULL) {
+		sna->render.vertex_used = 0;
+		sna->render.vertex_index = 0;
+		assert(sna->render.vertices == sna->render.vertex_data);
+		assert(sna->render.vertex_size == ARRAY_SIZE(sna->render.vertex_data));
+	}
 
-	sna->render.vbo = NULL;
-	sna->render.vertices = sna->render.vertex_data;
-	sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data);
+	if (free_bo)
+		kgem_bo_destroy(&sna->kgem, free_bo);
 }
 
 static bool gen3_rectangle_begin(struct sna *sna,
@@ -1885,10 +1899,23 @@ gen3_render_reset(struct sna *sna)
 	state->last_floats_per_vertex = 0;
 	state->last_vertex_offset = 0;
 	state->vertex_offset = 0;
+}
 
-	assert(sna->render.vertex_used == 0);
-	assert(sna->render.vertex_index == 0);
-	assert(sna->render.vertex_reloc[0] == 0);
+static void
+gen3_render_retire(struct kgem *kgem)
+{
+	struct sna *sna;
+
+	sna = container_of(kgem, struct sna, kgem);
+	if (!kgem->need_retire && kgem->nbatch == 0 && sna->render.vbo) {
+		DBG(("%s: discarding vbo\n", __FUNCTION__));
+		kgem_bo_destroy(kgem, sna->render.vbo);
+		sna->render.vbo = NULL;
+		sna->render.vertices = sna->render.vertex_data;
+		sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data);
+		sna->render.vertex_used = 0;
+		sna->render.vertex_index = 0;
+	}
 }
 
 static Bool gen3_composite_channel_set_format(struct sna_composite_channel *channel,
@@ -4466,5 +4493,7 @@ Bool gen3_render_init(struct sna *sna)
 
 	render->max_3d_size = MAX_3D_SIZE;
 	render->max_3d_pitch = MAX_3D_PITCH;
+
+	sna->kgem.retire = gen3_render_retire;
 	return TRUE;
 }
diff --git a/src/sna/gen5_render.c b/src/sna/gen5_render.c
index a80ce0a..bcba0d8 100644
--- a/src/sna/gen5_render.c
+++ b/src/sna/gen5_render.c
@@ -410,11 +410,14 @@ static int gen5_vertex_finish(struct sna *sna)
 
 static void gen5_vertex_close(struct sna *sna)
 {
-	struct kgem_bo *bo;
+	struct kgem_bo *bo, *free_bo = NULL;
 	unsigned int i, delta = 0;
 
 	assert(sna->render_state.gen5.vertex_offset == 0);
 
+	DBG(("%s: used=%d, vbo active? %d\n",
+	     __FUNCTION__, sna->render.vertex_used, sna->render.vbo != NULL));
+
 	if (!sna->render.vertex_used) {
 		assert(sna->render.vbo == NULL);
 		assert(sna->render.vertices == sna->render.vertex_data);
@@ -422,10 +425,18 @@ static void gen5_vertex_close(struct sna *sna)
 		return;
 	}
 
-	DBG(("%s: used=%d\n", __FUNCTION__, sna->render.vertex_used));
-
 	bo = sna->render.vbo;
-	if (bo == NULL) {
+	if (bo) {
+		if (IS_CPU_MAP(bo->map) ||
+		    sna->render.vertex_size - sna->render.vertex_used < 64) {
+			DBG(("%s: discarding vbo (was CPU mapped)\n",
+			     __FUNCTION__));
+			sna->render.vbo = NULL;
+			sna->render.vertices = sna->render.vertex_data;
+			sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data);
+			free_bo = bo;
+		}
+	} else {
 		if (sna->kgem.nbatch + sna->render.vertex_used <= sna->kgem.surface) {
 			DBG(("%s: copy to batch: %d @ %d\n", __FUNCTION__,
 			     sna->render.vertex_used, sna->kgem.nbatch));
@@ -441,10 +452,11 @@ static void gen5_vertex_close(struct sna *sna)
 						 sna->render.vertex_data,
 						 4*sna->render.vertex_used)) {
 				kgem_bo_destroy(&sna->kgem, bo);
-				goto reset;
+				bo = NULL;
 			}
 			DBG(("%s: new vbo: %d\n", __FUNCTION__,
 			     sna->render.vertex_used));
+			free_bo = bo;
 		}
 	}
 
@@ -469,17 +481,13 @@ static void gen5_vertex_close(struct sna *sna)
 		}
 	}
 
-	if (bo)
-		kgem_bo_destroy(&sna->kgem, bo);
-
-reset:
-	sna->render.vertex_used = 0;
-	sna->render.vertex_index = 0;
-	sna->render_state.gen5.vb_id = 0;
+	if (sna->render.vbo == NULL) {
+		sna->render.vertex_used = 0;
+		sna->render.vertex_index = 0;
+	}
 
-	sna->render.vbo = NULL;
-	sna->render.vertices = sna->render.vertex_data;
-	sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data);
+	if (free_bo)
+		kgem_bo_destroy(&sna->kgem, free_bo);
 }
 
 static uint32_t gen5_get_blend(int op,
@@ -3470,6 +3478,23 @@ gen5_render_context_switch(struct kgem *kgem,
 	}
 }
 
+static void
+gen5_render_retire(struct kgem *kgem)
+{
+	struct sna *sna;
+
+	sna = container_of(kgem, struct sna, kgem);
+	if (!kgem->need_retire && kgem->nbatch == 0 && sna->render.vbo) {
+		DBG(("%s: discarding vbo\n", __FUNCTION__));
+		kgem_bo_destroy(kgem, sna->render.vbo);
+		sna->render.vbo = NULL;
+		sna->render.vertices = sna->render.vertex_data;
+		sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data);
+		sna->render.vertex_used = 0;
+		sna->render.vertex_index = 0;
+	}
+}
+
 static void gen5_render_reset(struct sna *sna)
 {
 	sna->render_state.gen5.needs_invariant = TRUE;
@@ -3730,6 +3755,7 @@ Bool gen5_render_init(struct sna *sna)
 		return FALSE;
 
 	sna->kgem.context_switch = gen5_render_context_switch;
+	sna->kgem.retire = gen5_render_retire;
 
 	sna->render.composite = gen5_render_composite;
 #if !NO_COMPOSITE_SPANS
diff --git a/src/sna/gen6_render.c b/src/sna/gen6_render.c
index b69b3a2..439fb52 100644
--- a/src/sna/gen6_render.c
+++ b/src/sna/gen6_render.c
@@ -989,9 +989,14 @@ static int gen6_vertex_finish(struct sna *sna)
 
 static void gen6_vertex_close(struct sna *sna)
 {
-	struct kgem_bo *bo;
+	struct kgem_bo *bo, *free_bo = NULL;
 	unsigned int i, delta = 0;
 
+	assert(sna->render_state.gen6.vertex_offset == 0);
+
+	DBG(("%s: used=%d, vbo active? %d\n",
+	     __FUNCTION__, sna->render.vertex_used, sna->render.vbo != NULL));
+
 	if (!sna->render.vertex_used) {
 		assert(sna->render.vbo == NULL);
 		assert(sna->render.vertices == sna->render.vertex_data);
@@ -999,13 +1004,16 @@ static void gen6_vertex_close(struct sna *sna)
 		return;
 	}
 
-	DBG(("%s: used=%d / %d\n", __FUNCTION__,
-	     sna->render.vertex_used, sna->render.vertex_size));
-
 	bo = sna->render.vbo;
-	if (bo == NULL) {
-		assert(sna->render.vertices == sna->render.vertex_data);
-		assert(sna->render.vertex_used < ARRAY_SIZE(sna->render.vertex_data));
+	if (bo) {
+		if (sna->render.vertex_size - sna->render.vertex_used < 64) {
+			DBG(("%s: discarding vbo (full)\n", __FUNCTION__));
+			sna->render.vbo = NULL;
+			sna->render.vertices = sna->render.vertex_data;
+			sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data);
+			free_bo = bo;
+		}
+	} else {
 		if (sna->kgem.nbatch + sna->render.vertex_used <= sna->kgem.surface) {
 			DBG(("%s: copy to batch: %d @ %d\n", __FUNCTION__,
 			     sna->render.vertex_used, sna->kgem.nbatch));
@@ -1021,10 +1029,11 @@ static void gen6_vertex_close(struct sna *sna)
 						 sna->render.vertex_data,
 						 4*sna->render.vertex_used)) {
 				kgem_bo_destroy(&sna->kgem, bo);
-				goto reset;
+				bo = NULL;
 			}
 			DBG(("%s: new vbo: %d\n", __FUNCTION__,
 			     sna->render.vertex_used));
+			free_bo = bo;
 		}
 	}
 
@@ -1049,17 +1058,15 @@ static void gen6_vertex_close(struct sna *sna)
 		}
 	}
 
-	if (bo)
-		kgem_bo_destroy(&sna->kgem, bo);
-
-reset:
-	sna->render.vertex_used = 0;
-	sna->render.vertex_index = 0;
-	sna->render_state.gen6.vb_id = 0;
+	if (sna->render.vbo == NULL) {
+		sna->render.vertex_used = 0;
+		sna->render.vertex_index = 0;
+		assert(sna->render.vertices == sna->render.vertex_data);
+		assert(sna->render.vertex_size == ARRAY_SIZE(sna->render.vertex_data));
+	}
 
-	sna->render.vbo = NULL;
-	sna->render.vertices = sna->render.vertex_data;
-	sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data);
+	if (free_bo)
+		kgem_bo_destroy(&sna->kgem, free_bo);
 }
 
 typedef struct gen6_surface_state_padded {
@@ -4095,8 +4102,21 @@ gen6_render_context_switch(struct kgem *kgem,
 static void
 gen6_render_retire(struct kgem *kgem)
 {
+	struct sna *sna;
+
 	if (kgem->ring && (kgem->has_semaphores || !kgem->need_retire))
 		kgem->ring = kgem->mode;
+
+	sna = container_of(kgem, struct sna, kgem);
+	if (!kgem->need_retire && kgem->nbatch == 0 && sna->render.vbo) {
+		DBG(("%s: discarding vbo\n", __FUNCTION__));
+		kgem_bo_destroy(kgem, sna->render.vbo);
+		sna->render.vbo = NULL;
+		sna->render.vertices = sna->render.vertex_data;
+		sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data);
+		sna->render.vertex_used = 0;
+		sna->render.vertex_index = 0;
+	}
 }
 
 static void gen6_render_reset(struct sna *sna)
diff --git a/src/sna/gen7_render.c b/src/sna/gen7_render.c
index 0d913f6..e3d9757 100644
--- a/src/sna/gen7_render.c
+++ b/src/sna/gen7_render.c
@@ -1086,9 +1086,14 @@ static int gen7_vertex_finish(struct sna *sna)
 
 static void gen7_vertex_close(struct sna *sna)
 {
-	struct kgem_bo *bo;
+	struct kgem_bo *bo, *free_bo = NULL;
 	unsigned int i, delta = 0;
 
+	assert(sna->render_state.gen7.vertex_offset == 0);
+
+	DBG(("%s: used=%d, vbo active? %d\n",
+	     __FUNCTION__, sna->render.vertex_used, sna->render.vbo != NULL));
+
 	if (!sna->render.vertex_used) {
 		assert(sna->render.vbo == NULL);
 		assert(sna->render.vertices == sna->render.vertex_data);
@@ -1096,11 +1101,16 @@ static void gen7_vertex_close(struct sna *sna)
 		return;
 	}
 
-	DBG(("%s: used=%d / %d\n", __FUNCTION__,
-	     sna->render.vertex_used, sna->render.vertex_size));
-
 	bo = sna->render.vbo;
-	if (bo == NULL) {
+	if (bo) {
+		if (sna->render.vertex_size - sna->render.vertex_used < 64) {
+			DBG(("%s: discarding vbo (full)\n", __FUNCTION__));
+			sna->render.vbo = NULL;
+			sna->render.vertices = sna->render.vertex_data;
+			sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data);
+			free_bo = bo;
+		}
+	} else {
 		if (sna->kgem.nbatch + sna->render.vertex_used <= sna->kgem.surface) {
 			DBG(("%s: copy to batch: %d @ %d\n", __FUNCTION__,
 			     sna->render.vertex_used, sna->kgem.nbatch));
@@ -1116,10 +1126,11 @@ static void gen7_vertex_close(struct sna *sna)
 						 sna->render.vertex_data,
 						 4*sna->render.vertex_used)) {
 				kgem_bo_destroy(&sna->kgem, bo);
-				goto reset;
+				bo = NULL;
 			}
 			DBG(("%s: new vbo: %d\n", __FUNCTION__,
 			     sna->render.vertex_used));
+			free_bo = bo;
 		}
 	}
 
@@ -1144,17 +1155,13 @@ static void gen7_vertex_close(struct sna *sna)
 		}
 	}
 
-	if (bo)
-		kgem_bo_destroy(&sna->kgem, bo);
-
-reset:
-	sna->render.vertex_used = 0;
-	sna->render.vertex_index = 0;
-	sna->render_state.gen7.vb_id = 0;
+	if (sna->render.vbo == NULL) {
+		sna->render.vertex_used = 0;
+		sna->render.vertex_index = 0;
+	}
 
-	sna->render.vbo = NULL;
-	sna->render.vertices = sna->render.vertex_data;
-	sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data);
+	if (free_bo)
+		kgem_bo_destroy(&sna->kgem, free_bo);
 }
 
 static void null_create(struct sna_static_stream *stream)
@@ -4080,8 +4087,21 @@ gen7_render_context_switch(struct kgem *kgem,
 static void
 gen7_render_retire(struct kgem *kgem)
 {
+	struct sna *sna;
+
 	if (kgem->ring && (kgem->has_semaphores || !kgem->need_retire))
 		kgem->ring = kgem->mode;
+
+	sna = container_of(kgem, struct sna, kgem);
+	if (!kgem->need_retire && kgem->nbatch == 0 && sna->render.vbo) {
+		DBG(("%s: discarding vbo\n", __FUNCTION__));
+		kgem_bo_destroy(kgem, sna->render.vbo);
+		sna->render.vbo = NULL;
+		sna->render.vertices = sna->render.vertex_data;
+		sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data);
+		sna->render.vertex_used = 0;
+		sna->render.vertex_index = 0;
+	}
 }
 
 static void gen7_render_reset(struct sna *sna)
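
The same shape repeats across gen3/5/6/7: vertex_close() keeps the vbo
unless it is CPU mapped or nearly full, and a kgem->retire hook frees it
once the GPU is demonstrably idle. Consolidated from the per-gen hooks
above ("genX" is a placeholder, not a function in the driver):

    static void
    genX_render_retire(struct kgem *kgem)
    {
            struct sna *sna = container_of(kgem, struct sna, kgem);

            /* idle: nothing left to retire and an empty batch */
            if (kgem->need_retire || kgem->nbatch || sna->render.vbo == NULL)
                    return;

            kgem_bo_destroy(kgem, sna->render.vbo);
            sna->render.vbo = NULL;
            sna->render.vertices = sna->render.vertex_data;
            sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data);
            sna->render.vertex_used = 0;
            sna->render.vertex_index = 0;
    }
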
commit ca888a417ed7450eb168fa28f63366a3035bea3f
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sat Feb 25 11:07:16 2012 +0000

    sna: Ensure we trigger a retire for search_linear_cache
    
    Bos used for batch buffers are handled differently and are not tracked
    through the active cache, so we failed to notice when we might be able
    to run retire and recover a suitable buffer for reuse. So simply always
    run retire when we might need to create a new linear buffer.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index d73fc30..4051892 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -1922,22 +1922,32 @@ search_linear_cache(struct kgem *kgem, unsigned int num_pages, unsigned flags)
 	bool use_active = (flags & CREATE_INACTIVE) == 0;
 	struct list *cache;
 
+	DBG(("%s: num_pages=%d, flags=%x, use_active? %d\n",
+	     __FUNCTION__, num_pages, flags, use_active));
+
 	if (num_pages >= MAX_CACHE_SIZE / PAGE_SIZE)
 		return NULL;
 
 	if (!use_active && list_is_empty(inactive(kgem, num_pages))) {
-		if (list_is_empty(active(kgem, num_pages, I915_TILING_NONE)))
-			return NULL;
+		DBG(("%s: inactive and cache bucket empty\n",
+		     __FUNCTION__));
 
-		if (!kgem_retire(kgem))
+		if (!kgem->need_retire || !kgem_retire(kgem)) {
+			DBG(("%s: nothing retired\n", __FUNCTION__));
 			return NULL;
+		}
 
-		if (list_is_empty(inactive(kgem, num_pages)))
+		if (list_is_empty(inactive(kgem, num_pages))) {
+			DBG(("%s: active cache bucket still empty after retire\n",
+			     __FUNCTION__));
 			return NULL;
+		}
 	}
 
 	if (!use_active && flags & (CREATE_CPU_MAP | CREATE_GTT_MAP)) {
 		int for_cpu = !!(flags & CREATE_CPU_MAP);
+		DBG(("%s: searching for inactive %s map\n",
+		     __FUNCTION__, for_cpu ? "cpu" : "gtt"));
 		cache = &kgem->vma[for_cpu].inactive[cache_bucket(num_pages)];
 		list_for_each_entry(bo, cache, vma) {
 			assert(IS_CPU_MAP(bo->map) == for_cpu);
@@ -2111,7 +2121,7 @@ struct kgem_bo *kgem_create_linear(struct kgem *kgem, int size)
 	if (handle == 0)
 		return NULL;
 
-	DBG(("%s: new handle=%d\n", __FUNCTION__, handle));
+	DBG(("%s: new handle=%d, num_pages=%d\n", __FUNCTION__, handle, size));
 	bo = __kgem_bo_alloc(handle, size);
 	if (bo == NULL) {
 		gem_close(kgem->fd, handle);
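
The ordering that matters here: a miss in the inactive cache is only
conclusive once kgem_retire() has had a chance to move completed
buffers back onto it, and batch bos in particular never pass through
the active cache. A hedged sketch of the lookup, with made-up helper
names:

    /* Sketch: lookup_inactive() stands in for the real cache walk. */
    static struct kgem_bo *
    find_linear(struct kgem *kgem, int num_pages)
    {
            struct kgem_bo *bo = lookup_inactive(kgem, num_pages);

            if (bo == NULL && kgem->need_retire && kgem_retire(kgem)) {
                    /* retire may have returned a suitable bo to the cache */
                    bo = lookup_inactive(kgem, num_pages);
            }
            return bo;
    }
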
commit 057b283504e79e0833b8f3ec4366dc41f0c63a4b
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sat Feb 25 00:43:30 2012 +0000

    sna: Skip a tiled bo when searching the cache for a linear mmap
    
    If we change tiling on a bo, we are effectively discarding the cached
    mmap, so it is preferable to look for another.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 007dc04..d73fc30 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -1982,7 +1982,9 @@ search_linear_cache(struct kgem *kgem, unsigned int num_pages, unsigned flags)
 		if (num_pages > num_pages(bo))
 			continue;
 
-		if (use_active && bo->tiling != I915_TILING_NONE)
+		if (use_active &&
+		    kgem->gen <= 40 &&
+		    bo->tiling != I915_TILING_NONE)
 			continue;
 
 		if (bo->purged && !kgem_bo_clear_purgeable(kgem, bo)) {
@@ -1991,7 +1993,10 @@ search_linear_cache(struct kgem *kgem, unsigned int num_pages, unsigned flags)
 		}
 
 		if (I915_TILING_NONE != bo->tiling) {
-			if (use_active)
+			if (flags & (CREATE_CPU_MAP | CREATE_GTT_MAP))
+				continue;
+
+			if (first)
 				continue;
 
 			if (gem_set_tiling(kgem->fd, bo->handle,
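
In short: once the caller has asked for a CPU or GTT map, a tiled
candidate is worthless, since changing its tiling would invalidate any
cached mmap, so the search now skips such bos rather than re-tiling
them. A tiny sketch of the filter (the CREATE_* values are stand-ins
for the flags in kgem.h):

    #define CREATE_CPU_MAP 0x1
    #define CREATE_GTT_MAP 0x2

    /* Nonzero if this candidate is worth considering further. */
    static int
    consider_candidate(int is_tiled, unsigned flags)
    {
            if (!is_tiled)
                    return 1;
            /* re-tiling would discard any cached mmap */
            return (flags & (CREATE_CPU_MAP | CREATE_GTT_MAP)) == 0;
    }
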
commit b2d55a1f3f79d3c577f525453a38a39ca7107698
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Feb 24 21:40:44 2012 +0000

    legacy: Rename XF86DRI to HAVE_DRI1 to avoid conflicts with xorg-server.h
    
    We use XF86DRI as a user-configurable option to control whether to
    build DRI support for i810, but the name is also used internally within
    xorg, and there exists a public define in xorg-server.h which overrides
    our configure option. So rename our define to HAVE_DRI1 to avoid the
    conflict.
    
    Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=46590
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/configure.ac b/configure.ac
index 58c99f0..06c40e8 100644
--- a/configure.ac
+++ b/configure.ac
@@ -216,7 +216,7 @@ AC_MSG_RESULT([${DRI-yes}])
 
 AM_CONDITIONAL(DRI, test x$DRI != xno)
 if test "x$DRI" != "xno"; then
-        AC_DEFINE(XF86DRI,1,[Enable DRI driver support])
+        AC_DEFINE(HAVE_DRI1,1,[Enable DRI driver support])
 else
         DRI_CFLAGS=""
         DRI_LIBS=""
diff --git a/src/legacy/i810/i810.h b/src/legacy/i810/i810.h
index 183c701..2c0b53e 100644
--- a/src/legacy/i810/i810.h
+++ b/src/legacy/i810/i810.h
@@ -51,7 +51,7 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "xorg-server.h"
 #include <pciaccess.h>
 
-#ifdef XF86DRI
+#ifdef HAVE_DRI1
 #include "xf86drm.h"
 #include "sarea.h"
 #define _XF86DRI_SERVER_
@@ -209,7 +209,7 @@ typedef struct _I810Rec {
    Bool directRenderingDisabled;        /* DRI disabled in PreInit */
    Bool directRenderingEnabled;		/* false if XF86DRI not defined. */
 
-#ifdef XF86DRI
+#ifdef HAVE_DRI1
    int LockHeld;
    DRIInfoPtr pDRIInfo;
    int drmSubFD;
@@ -248,7 +248,7 @@ typedef struct _I810Rec {
 #define I810_SELECT_BACK	1
 #define I810_SELECT_DEPTH	2
 
-#ifdef XF86DRI
+#ifdef HAVE_DRI1
 extern Bool I810DRIScreenInit(ScreenPtr pScreen);
 extern void I810DRICloseScreen(ScreenPtr pScreen);
 extern Bool I810DRIFinishScreenInit(ScreenPtr pScreen);
diff --git a/src/legacy/i810/i810_accel.c b/src/legacy/i810/i810_accel.c
index 9aa3e42..6b57dbb 100644
--- a/src/legacy/i810/i810_accel.c
+++ b/src/legacy/i810/i810_accel.c
@@ -213,7 +213,7 @@ I810WaitLpRing(ScrnInfoPtr pScrn, int n, int timeout_millis)
 		start);
 	 I810PrintErrorState(pScrn);
 	 ErrorF("space: %d wanted %d\n", ring->space, n);
-#ifdef XF86DRI
+#ifdef HAVE_DRI1
 	 if (pI810->directRenderingEnabled) {
 	    DRIUnlock(screenInfo.screens[pScrn->scrnIndex]);
 	    DRICloseScreen(screenInfo.screens[pScrn->scrnIndex]);
@@ -245,7 +245,7 @@ I810Sync(ScrnInfoPtr pScrn)
    if (I810_DEBUG & (DEBUG_VERBOSE_ACCEL | DEBUG_VERBOSE_SYNC))
       ErrorF("I810Sync\n");
 
-#ifdef XF86DRI
+#ifdef HAVE_DRI1
    /* VT switching tries to do this.  
     */
    if (!pI810->LockHeld && pI810->directRenderingEnabled) {
diff --git a/src/legacy/i810/i810_driver.c b/src/legacy/i810/i810_driver.c
index d8f7c45..02da574 100644
--- a/src/legacy/i810/i810_driver.c
+++ b/src/legacy/i810/i810_driver.c
@@ -71,7 +71,7 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #include "i810.h"
 
-#ifdef XF86DRI
+#ifdef HAVE_DRI1
 #include "dri.h"
 #endif
 
@@ -132,7 +132,7 @@ int I810_DEBUG = (0
       );
 #endif
 
-#ifdef XF86DRI
+#ifdef HAVE_DRI1
 static int i810_pitches[] = {
    512,
    1024,
@@ -352,7 +352,7 @@ I810PreInit(ScrnInfoPtr pScrn, int flags)
       }
    }
    
-#ifdef XF86DRI
+#ifdef HAVE_DRI1
    pI810->directRenderingDisabled =
      !xf86ReturnOptValBool(pI810->Options, OPTION_DRI, TRUE);
 
@@ -551,7 +551,7 @@ I810PreInit(ScrnInfoPtr pScrn, int flags)
 
    i = xf86ValidateModes(pScrn, pScrn->monitor->Modes,
 			 pScrn->display->modes, clockRanges,
-#ifndef XF86DRI
+#ifndef HAVE_DRI1
 			 0, 320, 1600, 64 * pScrn->bitsPerPixel,
 #else
 			 i810_pitches, 0, 0, 64 * pScrn->bitsPerPixel,
@@ -607,7 +607,7 @@ I810PreInit(ScrnInfoPtr pScrn, int flags)
    pI810->allowPageFlip=FALSE;
    enable = xf86ReturnOptValBool(pI810->Options, OPTION_PAGEFLIP, FALSE);   
 
-#ifdef XF86DRI
+#ifdef HAVE_DRI1
    if (!pI810->directRenderingDisabled) {
      pI810->allowPageFlip = enable;
      if (pI810->allowPageFlip == TRUE)
@@ -645,7 +645,7 @@ I810PreInit(ScrnInfoPtr pScrn, int flags)
       pI810->numSurfaces = 0;
    }
 
-#ifdef XF86DRI
+#ifdef HAVE_DRI1
    /* Load the dri module if requested. */
    if (xf86ReturnOptValBool(pI810->Options, OPTION_DRI, FALSE)) {
       xf86LoadSubModule(pScrn, "dri");
@@ -1351,7 +1351,7 @@ I810ModeInit(ScrnInfoPtr pScrn, DisplayModePtr mode)
    if (!I810SetMode(pScrn, mode))
       return FALSE;
 
-#ifdef XF86DRI
+#ifdef HAVE_DRI1
    if (pI810->directRenderingEnabled) {
       DRILock(screenInfo.screens[pScrn->scrnIndex], 0);
       pI810->LockHeld = 1;
@@ -1360,7 +1360,7 @@ I810ModeInit(ScrnInfoPtr pScrn, DisplayModePtr mode)
 
    DoRestore(pScrn, &hwp->ModeReg, &pI810->ModeReg, FALSE);
 
-#ifdef XF86DRI
+#ifdef HAVE_DRI1
    if (pI810->directRenderingEnabled) {
       DRIUnlock(screenInfo.screens[pScrn->scrnIndex]);
       pI810->LockHeld = 0;
@@ -1610,7 +1610,7 @@ I810ScreenInit(int scrnIndex, ScreenPtr pScreen, int argc, char **argv)
     * memory.  Wonder if this is going to be a problem...
     */
 
-#ifdef XF86DRI
+#ifdef HAVE_DRI1
    /*
     * Setup DRI after visuals have been established, but before fbScreenInit
     * is called.   fbScreenInit will eventually call into the drivers
@@ -1676,7 +1676,7 @@ I810ScreenInit(int scrnIndex, ScreenPtr pScreen, int argc, char **argv)
 
    xf86SetBlackWhitePixels(pScreen);
 
-#ifdef XF86DRI
+#ifdef HAVE_DRI1
    if (pI810->LpRing->mem.Start == 0 && pI810->directRenderingEnabled) {
       pI810->directRenderingEnabled = FALSE;
       I810DRICloseScreen(pScreen);
@@ -1753,7 +1753,7 @@ I810ScreenInit(int scrnIndex, ScreenPtr pScreen, int argc, char **argv)
 
    I810InitVideo(pScreen);
 
-#ifdef XF86DRI
+#ifdef HAVE_DRI1
    if (pI810->directRenderingEnabled) {
       /* Now that mi, fb, drm and others have done their thing,
        * complete the DRI setup.
@@ -1803,7 +1803,7 @@ I810SwitchMode(int scrnIndex, DisplayModePtr mode, int flags)
  * If lockups on mode switch are still seen revisit this code. (EE)
  */
 
-# ifdef XF86DRI
+# ifdef HAVE_DRI1
    if (pI810->directRenderingEnabled) {
       if (I810_DEBUG & DEBUG_VERBOSE_DRI)
 	 ErrorF("calling dri lock\n");
@@ -1818,7 +1818,7 @@ I810SwitchMode(int scrnIndex, DisplayModePtr mode, int flags)
    }
    I810Restore(pScrn);
 
-# ifdef XF86DRI
+# ifdef HAVE_DRI1
    if (pI810->directRenderingEnabled) {
        if (!I810DRILeave(pScrn))
 	   return FALSE;
@@ -1892,7 +1892,7 @@ I810EnterVT(int scrnIndex, int flags)
 {
    ScrnInfoPtr pScrn = xf86Screens[scrnIndex];
 
-#ifdef XF86DRI
+#ifdef HAVE_DRI1
    I810Ptr pI810 = I810PTR(pScrn);
 #endif
 
@@ -1902,7 +1902,7 @@ I810EnterVT(int scrnIndex, int flags)
    if (!I810BindGARTMemory(pScrn)) {
       return FALSE;
    }
-#ifdef XF86DRI
+#ifdef HAVE_DRI1
    if (!I810DRIEnter(pScrn)) {
       return FALSE;
    }
@@ -1930,7 +1930,7 @@ I810LeaveVT(int scrnIndex, int flags)
    if (I810_DEBUG & DEBUG_VERBOSE_DRI)
       ErrorF("\n\n\nLeave VT\n");
 
-#ifdef XF86DRI
+#ifdef HAVE_DRI1
    if (pI810->directRenderingEnabled) {
       if (I810_DEBUG & DEBUG_VERBOSE_DRI)
 	 ErrorF("calling dri lock\n");
@@ -1948,7 +1948,7 @@ I810LeaveVT(int scrnIndex, int flags)
 
    if (!I810UnbindGARTMemory(pScrn))
       return;
-#ifdef XF86DRI
+#ifdef HAVE_DRI1
    if (!I810DRILeave(pScrn))
       return;
 #endif
@@ -1973,7 +1973,7 @@ I810CloseScreen(int scrnIndex, ScreenPtr pScreen)
       I810Restore(pScrn);
       vgaHWLock(hwp);
    }
-#ifdef XF86DRI
+#ifdef HAVE_DRI1
    if (pI810->directRenderingEnabled) {
       I810DRICloseScreen(pScreen);
       pI810->directRenderingEnabled = FALSE;
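
The failure mode is easy to reproduce in miniature: a guard keyed on a
name that a system header may also define stops reflecting the
configure result. A self-contained illustration (the defines below are
stand-ins, not the real headers):

    #include <stdio.h>

    #define XF86DRI 1     /* stand-in: xorg-server.h may define this */

    /* the old guard: true even when configure disabled DRI */
    #ifdef XF86DRI
    #define OLD_GUARD 1
    #else
    #define OLD_GUARD 0
    #endif

    /* the new private guard is only ever set by our configure.ac */
    #ifdef HAVE_DRI1
    #define NEW_GUARD 1
    #else
    #define NEW_GUARD 0
    #endif

    int main(void)
    {
            printf("old: %d, new: %d\n", OLD_GUARD, NEW_GUARD);
            return 0;
    }
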
commit 612de7136a597afbfae04b44a7e6306423514bfe
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Feb 24 21:36:30 2012 +0000

    legacy: Delete unused XF86DRI_DEVEL #define
    
    References: https://bugs.freedesktop.org/show_bug.cgi?id=46590
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/configure.ac b/configure.ac
index 584b32c..58c99f0 100644
--- a/configure.ac
+++ b/configure.ac
@@ -217,7 +217,6 @@ AC_MSG_RESULT([${DRI-yes}])
 AM_CONDITIONAL(DRI, test x$DRI != xno)
 if test "x$DRI" != "xno"; then
         AC_DEFINE(XF86DRI,1,[Enable DRI driver support])
-        AC_DEFINE(XF86DRI_DEVEL,1,[Enable developmental DRI driver support])
 else
         DRI_CFLAGS=""
         DRI_LIBS=""
commit 50c8cef1c8c73ecfbacf96e61f09cf036dfa0b67
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Feb 24 11:21:49 2012 +0000

    configure, NEWS: Bump version to 2.18.0 for release
    
    Another quarter, a bit late as I was debugging a few regressions,
    another release.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/NEWS b/NEWS
index 2bd476b..9d2b15e 100644
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,29 @@
+Release 2.18.0 (2012-02-24)
+===========================
+Time passes, and a few more bugs have crept out of the woodwork that are
+a compelling reason to update.
+
+Bugs fixed in this release (compared to 2.17.0)
+-----------------------------------------------
+
+* Limit maximum object size so that all of the source, mask and
+  destination can be mapped into the aperture simultaneously by basing the
+  limit on the mappable aperture size rather than the size of the total
+  GATT.
+
+* Incorrect clipping of polygons
+  https://bugs.freedesktop.org/show_bug.cgi?id=43649
+  Regression from 2.15.901
+
+* Limit number of VMA cached to avoid hitting the per-process VMA limit
+  There is still a residual bug in that we seem to have far too many
+  objects floating around in the first place, and that still leads to
+  exhaustion of system limits.
+  https://bugs.freedesktop.org/show_bug.cgi?id=43075
+  https://bugs.freedesktop.org/show_bug.cgi?id=40066
+
+* Latency in processing user-input during continuous rendering
+
 Release 2.17.0 (2011-11-16)
 ==============================
 A few months have passed, and we have accumulated a surprising number of
diff --git a/configure.ac b/configure.ac
index 67a513d..584b32c 100644
--- a/configure.ac
+++ b/configure.ac
@@ -23,7 +23,7 @@
 # Initialize Autoconf
 AC_PREREQ([2.60])
 AC_INIT([xf86-video-intel],
-        [2.17.0],
+        [2.18.0],
         [https://bugs.freedesktop.org/enter_bug.cgi?product=xorg],
         [xf86-video-intel])
 AC_CONFIG_SRCDIR([Makefile.am])
commit 9236513ba720fda9f93e82ea912f0b602cec269c
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Feb 24 11:14:26 2012 +0000

    uxa: Add an option to disable the bo cache
    
    If you are suffering from regular X crashes and rendering corruption
    with a flood of ENOSPC or even EFILE reported in the Xorg.log, try
    adding this snippet to your xorg.conf:
    
    Section "Driver"
      Option "BufferCache" "False"
    EndSection
    
    References: https://bugs.freedesktop.org/show_bug.cgi?id=39552
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/intel_driver.c b/src/intel_driver.c
index 2f1314f..da340e3 100644
--- a/src/intel_driver.c
+++ b/src/intel_driver.c
@@ -110,6 +110,7 @@ typedef enum {
    OPTION_DEBUG_WAIT,
    OPTION_HOTPLUG,
    OPTION_RELAXED_FENCING,
+   OPTION_BUFFER_CACHE,
 } I830Opts;
 
 static OptionInfoRec I830Options[] = {
@@ -131,6 +132,7 @@ static OptionInfoRec I830Options[] = {
    {OPTION_DEBUG_WAIT, "DebugWait", OPTV_BOOLEAN, {0}, FALSE},
    {OPTION_HOTPLUG,	"HotPlug",	OPTV_BOOLEAN,	{0},	TRUE},
    {OPTION_RELAXED_FENCING,	"RelaxedFencing",	OPTV_BOOLEAN,	{0},	TRUE},
+   {OPTION_BUFFER_CACHE,	"BufferCache",	OPTV_BOOLEAN,	{0},	TRUE},
    {-1,			NULL,		OPTV_NONE,	{0},	FALSE}
 };
 /* *INDENT-ON* */
@@ -404,7 +406,8 @@ static int intel_init_bufmgr(intel_screen_private *intel)
 	if (!intel->bufmgr)
 		return FALSE;
 
-	drm_intel_bufmgr_gem_enable_reuse(intel->bufmgr);
+	if (xf86ReturnOptValBool(intel->Options, OPTION_BUFFER_CACHE, TRUE))
+		drm_intel_bufmgr_gem_enable_reuse(intel->bufmgr);
 	drm_intel_bufmgr_gem_set_vma_cache_size(intel->bufmgr, 512);
 	drm_intel_bufmgr_gem_enable_fenced_relocs(intel->bufmgr);
 
commit 61642b941556cfb124b302f1d5be435e953bd5c5
Author: Gaetan Nadon <memsize at videotron.ca>
Date:   Sat Feb 18 13:49:02 2012 -0500

    Revert "Update autotools configuration"
    
    This reverts commit 9184af921bc2f332fcb6c9b47001414378eab8e2.
    
    All X.Org modules must be able to be configured with autoconf 2.60.
    In addition, version 2.63 has GPL licensing issues which prevent
    some vendors from releasing software based on it.
    
    AM_SILENT_RULES is already handled by XORG_DEFAULT_OPTIONS.
    
    All X.Org modules must be able to be configured with libtool 1.5.
    
    AM_MAINTAINER_MODE's default value is already "enabled".
    
    We use the same autogen script for all x.org modules.
    There are proposals for changes which should be reviewed and eventually
    applied to all modules together.
    
    The lt*.m4 patterns are already included in the root .gitignore file.
    This can be proposed as a change to all modules, but it involves
    changing the toplevel .gitignore, the m4/.gitignore, the ACLOCAL_AMFLAGS
    and the AC_CONFIG_MACRO_DIR together.
    
    For more information on project-wide configuration guidelines,
    consult http://www.x.org/wiki/ModularDevelopersGuide
    and http://www.x.org/wiki/NewModuleGuidelines.
    
    Acked-by: Matthieu Herrb <matthieu.herrb at laas.fr>
    Signed-off-by: Gaetan Nadon <memsize at videotron.ca>

diff --git a/Makefile.am b/Makefile.am
index 48c3477..b3d37b2 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -18,7 +18,6 @@
 #  IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 #  CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
-ACLOCAL_AMFLAGS = -I m4 ${ACLOCAL_FLAGS}
 
 SUBDIRS = man
 
diff --git a/autogen.sh b/autogen.sh
index 30d679f..904cd67 100755
--- a/autogen.sh
+++ b/autogen.sh
@@ -1,6 +1,12 @@
 #! /bin/sh
 
-test -n "$srcdir" || srcdir=`dirname "$0"`
-test -n "$srcdir" || srcdir=.
-autoreconf --force --install --verbose "$srcdir"
-test -n "$NOCONFIGURE" || "$srcdir/configure" "$@"
+srcdir=`dirname $0`
+test -z "$srcdir" && srcdir=.
+
+ORIGDIR=`pwd`
+cd $srcdir
+
+autoreconf -v --install || exit 1
+cd $ORIGDIR || exit $?
+
+$srcdir/configure --enable-maintainer-mode "$@"
diff --git a/configure.ac b/configure.ac
index 83afdb5..67a513d 100644
--- a/configure.ac
+++ b/configure.ac
@@ -21,24 +21,18 @@
 # Process this file with autoconf to produce a configure script
 
 # Initialize Autoconf
-AC_PREREQ([2.63])
+AC_PREREQ([2.60])
 AC_INIT([xf86-video-intel],
         [2.17.0],
         [https://bugs.freedesktop.org/enter_bug.cgi?product=xorg],
         [xf86-video-intel])
 AC_CONFIG_SRCDIR([Makefile.am])
 AC_CONFIG_HEADERS([config.h])
-AC_CONFIG_AUX_DIR([build-aux])
-AC_CONFIG_MACRO_DIR([m4])
+AC_CONFIG_AUX_DIR(.)
 
 # Initialize Automake
-AM_INIT_AUTOMAKE([1.10 foreign dist-bzip2])
-AM_MAINTAINER_MODE([enable])
-
-# Support silent build rules, requires at least automake-1.11. Disable
-# by either passing --disable-silent-rules to configure or passing V=1
-# to make
-m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
+AM_INIT_AUTOMAKE([foreign dist-bzip2])
+AM_MAINTAINER_MODE
 
 # Require X.Org macros 1.8 or later for MAN_SUBSTS set by XORG_MANPAGE_SECTIONS
 m4_ifndef([XORG_MACROS_VERSION],
@@ -56,8 +50,8 @@ m4_ifndef([XORG_DRIVER_CHECK_EXT],
   depending on your distribution, try package 'xserver-xorg-dev' or 'xorg-x11-server-devel'])])
 
 # Initialize libtool
-LT_PREREQ([2.2])
-LT_INIT([disable-static])
+AC_DISABLE_STATIC
+AC_PROG_LIBTOOL
 
 # Are we in a git checkout?
 dot_git=no
diff --git a/m4/.gitignore b/m4/.gitignore
deleted file mode 100644
index 464ba5c..0000000
--- a/m4/.gitignore
+++ /dev/null
@@ -1,5 +0,0 @@
-libtool.m4
-lt~obsolete.m4
-ltoptions.m4
-ltsugar.m4
-ltversion.m4
commit 098485f8932c1427855015154d7a00ba59866cfa
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Feb 23 12:04:09 2012 +0000

    sna/gen3: Silence the compiler complaining with DBG enabled
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/gen3_render.c b/src/sna/gen3_render.c
index 6828a16..b50d067 100644
--- a/src/sna/gen3_render.c
+++ b/src/sna/gen3_render.c
@@ -1732,7 +1732,7 @@ inline static int gen3_get_rectangles(struct sna *sna,
 	int rem;
 
 	DBG(("%s: want=%d, rem=%d\n",
-	     __FUNCTION__, want*op->floats_per_rect, rem));
+	     __FUNCTION__, want*op->floats_per_rect, vertex_space(sna)));
 
 	assert(sna->render.vertex_index * op->floats_per_vertex == sna->render.vertex_used);
 
@@ -1742,12 +1742,12 @@ start:
 		DBG(("flushing vbo for %s: %d < %d\n",
 		     __FUNCTION__, rem, op->floats_per_rect));
 		rem = gen3_get_rectangles__flush(sna, op);
-		if (rem == 0)
+		if (unlikely(rem == 0))
 			goto flush;
 	}
 
-	if (sna->render_state.gen3.vertex_offset == 0 &&
-	    !gen3_rectangle_begin(sna, op))
+	if (unlikely(sna->render_state.gen3.vertex_offset == 0 &&
+		     !gen3_rectangle_begin(sna, op)))
 		goto flush;
 
 	if (want > 1 && want * op->floats_per_rect > rem)
commit 49f4546d15a3a5233a5671e962cf471f36d8fc2a
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Feb 23 10:40:57 2012 +0000

    sna/gen4: Refactor get_rectangles() to re-emit state after a flush
    
    Condense the work performed by each caller into the callee.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/gen4_render.c b/src/sna/gen4_render.c
index cccdf4c..6ba59ee 100644
--- a/src/sna/gen4_render.c
+++ b/src/sna/gen4_render.c
@@ -1116,19 +1116,12 @@ static bool gen4_rectangle_begin(struct sna *sna,
 	int id = op->u.gen4.ve_id;
 	int ndwords;
 
-	ndwords = 0;
+	/* 7xpipelined pointers + 6xprimitive + 1xflush */
+	ndwords = op->need_magic_ca_pass? 20 : 6;
 	if (FLUSH_EVERY_VERTEX)
 		ndwords += 1;
 	if ((sna->render_state.gen4.vb_id & (1 << id)) == 0)
 		ndwords += 5;
-	if (sna->render_state.gen4.vertex_offset == 0)
-		ndwords += 6;
-	if (ndwords == 0)
-		return true;
-
-	if (op->need_magic_ca_pass)
-		/* 7xpipelined pointers + 6xprimitive + 1xflush */
-		ndwords += 14;
 
 	if (!kgem_check_batch(&sna->kgem, ndwords))
 		return false;
@@ -1159,19 +1152,23 @@ static int gen4_get_rectangles__flush(struct sna *sna,
 
 inline static int gen4_get_rectangles(struct sna *sna,
 				      const struct sna_composite_op *op,
-				      int want)
+				      int want,
+				      void (*emit_state)(struct sna *sna, const struct sna_composite_op *op))
 {
-	int rem = vertex_space(sna);
+	int rem;
 
+start:
+	rem = vertex_space(sna);
 	if (rem < 3*op->floats_per_vertex) {
 		DBG(("flushing vbo for %s: %d < %d\n",
 		     __FUNCTION__, rem, 3*op->floats_per_vertex));
 		rem = gen4_get_rectangles__flush(sna, op);
-		if (rem == 0)
+		if (unlikely(rem == 0))
 			goto flush;
 	}
 
-	if (!gen4_rectangle_begin(sna, op))
+	if (unlikely(sna->render_state.gen4.vertex_offset == 0 &&
+		     !gen4_rectangle_begin(sna, op)))
 		goto flush;
 
 	if (want > 1 && want * op->floats_per_vertex*3 > rem)
@@ -1186,7 +1183,8 @@ flush:
 		gen4_magic_ca_pass(sna, op);
 	}
 	_kgem_submit(&sna->kgem);
-	return 0;
+	emit_state(sna, op);
+	goto start;
 }
 
 static uint32_t *gen4_composite_get_binding_table(struct sna *sna,
@@ -1541,11 +1539,7 @@ gen4_render_composite_blt(struct sna *sna,
 			gen4_bind_surfaces(sna, op);
 	}
 
-	if (!gen4_get_rectangles(sna, op, 1)) {
-		gen4_bind_surfaces(sna, op);
-		gen4_get_rectangles(sna, op, 1);
-	}
-
+	gen4_get_rectangles(sna, op, 1, gen4_bind_surfaces);
 	op->prim_emit(sna, op, r);
 
 	/* XXX are the shaders fubar? */
@@ -1629,9 +1623,9 @@ static uint32_t gen4_bind_video_source(struct sna *sna,
 }
 
 static void gen4_video_bind_surfaces(struct sna *sna,
-				     const struct sna_composite_op *op,
-				     struct sna_video_frame *frame)
+				     const struct sna_composite_op *op)
 {
+	struct sna_video_frame *frame = op->priv;
 	uint32_t src_surf_format;
 	uint32_t src_surf_base[6];
 	int src_width[6];
@@ -1732,13 +1726,14 @@ gen4_render_video(struct sna *sna,
 	tmp.is_affine = TRUE;
 	tmp.floats_per_vertex = 3;
 	tmp.u.gen4.ve_id = 1;
+	tmp.priv = frame;
 
 	if (!kgem_check_bo(&sna->kgem, tmp.dst.bo, frame->bo, NULL)) {
 		kgem_submit(&sna->kgem);
 		assert(kgem_check_bo(&sna->kgem, tmp.dst.bo, frame->bo, NULL));
 	}
 
-	gen4_video_bind_surfaces(sna, &tmp, frame);
+	gen4_video_bind_surfaces(sna, &tmp);
 	gen4_align_vertex(sna, &tmp);
 
 	/* Set up the offset for translating from the given region (in screen
@@ -1769,10 +1764,7 @@ gen4_render_video(struct sna *sna,
 		r.y1 = box->y1 + pix_yoff;
 		r.y2 = box->y2 + pix_yoff;
 
-		if (!gen4_get_rectangles(sna, &tmp, 1)) {
-			gen4_video_bind_surfaces(sna, &tmp, frame);
-			gen4_get_rectangles(sna, &tmp, 1);
-		}
+		gen4_get_rectangles(sna, &tmp, 1, gen4_video_bind_surfaces);
 
 		OUT_VERTEX(r.x2, r.y2);
 		OUT_VERTEX_F((box->x2 - dxo) * src_scale_x);
@@ -2393,10 +2385,7 @@ gen4_render_copy_one(struct sna *sna,
 		     int w, int h,
 		     int dx, int dy)
 {
-	if (!gen4_get_rectangles(sna, op, 1)) {
-		gen4_copy_bind_surfaces(sna, op);
-		gen4_get_rectangles(sna, op, 1);
-	}
+	gen4_get_rectangles(sna, op, 1, gen4_copy_bind_surfaces);
 
 	OUT_VERTEX(dx+w, dy+h);
 	OUT_VERTEX_F((sx+w)*op->src.scale[0]);
@@ -2725,10 +2714,7 @@ gen4_render_fill_rectangle(struct sna *sna,
 			   const struct sna_composite_op *op,
 			   int x, int y, int w, int h)
 {
-	if (!gen4_get_rectangles(sna, op, 1)) {
-		gen4_fill_bind_surfaces(sna, op);
-		gen4_get_rectangles(sna, op, 1);
-	}
+	gen4_get_rectangles(sna, op, 1, gen4_fill_bind_surfaces);
 
 	OUT_VERTEX(x+w, y+h);
 	OUT_VERTEX_F(1);
commit 004142dc158885364d797640e34ae23b204e52a5
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Feb 23 10:40:57 2012 +0000

    sna/gen7: Refactor get_rectangles() to re-emit state after a flush
    
    Condense the work performed by each caller into the callee.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/gen7_render.c b/src/sna/gen7_render.c
index d039a48..0d913f6 100644
--- a/src/sna/gen7_render.c
+++ b/src/sna/gen7_render.c
@@ -1664,20 +1664,23 @@ static int gen7_get_rectangles__flush(struct sna *sna,
 
 inline static int gen7_get_rectangles(struct sna *sna,
 				      const struct sna_composite_op *op,
-				      int want)
+				      int want,
+				      void (*emit_state)(struct sna *sna, const struct sna_composite_op *op))
 {
 	int rem = vertex_space(sna);
 
+start:
+	rem = vertex_space(sna);
 	if (rem < op->floats_per_rect) {
 		DBG(("flushing vbo for %s: %d < %d\n",
 		     __FUNCTION__, rem, op->floats_per_rect));
 		rem = gen7_get_rectangles__flush(sna, op);
-		if (rem == 0)
+		if (unlikely(rem == 0))
 			goto flush;
 	}
 
-	if (sna->render_state.gen7.vertex_offset == 0 &&
-	    !gen7_rectangle_begin(sna, op))
+	if (unlikely(sna->render_state.gen7.vertex_offset == 0 &&
+		     !gen7_rectangle_begin(sna, op)))
 		goto flush;
 
 	if (want > 1 && want * op->floats_per_rect > rem)
@@ -1692,7 +1695,8 @@ flush:
 		gen7_magic_ca_pass(sna, op);
 	}
 	_kgem_submit(&sna->kgem);
-	return 0;
+	emit_state(sna, op);
+	goto start;
 }
 
 inline static uint32_t *gen7_composite_get_binding_table(struct sna *sna,
@@ -1801,11 +1805,7 @@ gen7_render_composite_blt(struct sna *sna,
 			  const struct sna_composite_op *op,
 			  const struct sna_composite_rectangles *r)
 {
-	if (unlikely(!gen7_get_rectangles(sna, op, 1))) {
-		gen7_emit_composite_state(sna, op);
-		gen7_get_rectangles(sna, op, 1);
-	}
-
+	gen7_get_rectangles(sna, op, 1, gen7_emit_composite_state);
 	op->prim_emit(sna, op, r);
 }
 
@@ -1816,10 +1816,7 @@ gen7_render_composite_box(struct sna *sna,
 {
 	struct sna_composite_rectangles r;
 
-	if (unlikely(!gen7_get_rectangles(sna, op, 1))) {
-		gen7_emit_composite_state(sna, op);
-		gen7_get_rectangles(sna, op, 1);
-	}
+	gen7_get_rectangles(sna, op, 1, gen7_emit_composite_state);
 
 	DBG(("  %s: (%d, %d), (%d, %d)\n",
 	     __FUNCTION__,
@@ -1842,12 +1839,12 @@ gen7_render_composite_boxes(struct sna *sna,
 	DBG(("composite_boxes(%d)\n", nbox));
 
 	do {
-		int nbox_this_time = gen7_get_rectangles(sna, op, nbox);
-		if (unlikely(nbox_this_time == 0)) {
-			gen7_emit_composite_state(sna, op);
-			nbox_this_time = gen7_get_rectangles(sna, op, nbox);
-		}
+		int nbox_this_time;
+
+		nbox_this_time = gen7_get_rectangles(sna, op, nbox,
+						     gen7_emit_composite_state);
 		nbox -= nbox_this_time;
+
 		do {
 			struct sna_composite_rectangles r;
 
@@ -1934,9 +1931,9 @@ static uint32_t gen7_bind_video_source(struct sna *sna,
 }
 
 static void gen7_emit_video_state(struct sna *sna,
-				  struct sna_composite_op *op,
-				  struct sna_video_frame *frame)
+				  const struct sna_composite_op *op)
 {
+	struct sna_video_frame *frame = op->priv;
 	uint32_t src_surf_format;
 	uint32_t src_surf_base[6];
 	int src_width[6];
@@ -2055,6 +2052,7 @@ gen7_render_video(struct sna *sna,
 	}
 	tmp.u.gen7.nr_inputs = 1;
 	tmp.u.gen7.ve_id = 1;
+	tmp.priv = frame;
 
 	kgem_set_mode(&sna->kgem, KGEM_RENDER);
 	if (!kgem_check_bo(&sna->kgem, tmp.dst.bo, frame->bo, NULL)) {
@@ -2063,7 +2061,7 @@ gen7_render_video(struct sna *sna,
 		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
 	}
 
-	gen7_emit_video_state(sna, &tmp, frame);
+	gen7_emit_video_state(sna, &tmp);
 	gen7_align_vertex(sna, &tmp);
 
 	/* Set up the offset for translating from the given region (in screen
@@ -2094,10 +2092,7 @@ gen7_render_video(struct sna *sna,
 		r.y1 = box->y1 + pix_yoff;
 		r.y2 = box->y2 + pix_yoff;
 
-		if (unlikely(!gen7_get_rectangles(sna, &tmp, 1))) {
-			gen7_emit_video_state(sna, &tmp, frame);
-			gen7_get_rectangles(sna, &tmp, 1);
-		}
+		gen7_get_rectangles(sna, &tmp, 1, gen7_emit_video_state);
 
 		OUT_VERTEX(r.x2, r.y2);
 		OUT_VERTEX_F((box->x2 - dxo) * src_scale_x);
@@ -2941,11 +2936,7 @@ gen7_render_composite_spans_box(struct sna *sna,
 	     box->x2 - box->x1,
 	     box->y2 - box->y1));
 
-	if (unlikely(gen7_get_rectangles(sna, &op->base, 1) == 0)) {
-		gen7_emit_composite_state(sna, &op->base);
-		gen7_get_rectangles(sna, &op->base, 1);
-	}
-
+	gen7_get_rectangles(sna, &op->base, 1, gen7_emit_composite_state);
 	op->prim_emit(sna, op, box, opacity);
 }
 
@@ -2964,11 +2955,8 @@ gen7_render_composite_spans_boxes(struct sna *sna,
 	do {
 		int nbox_this_time;
 
-		nbox_this_time = gen7_get_rectangles(sna, &op->base, nbox);
-		if (unlikely(nbox_this_time == 0)) {
-			gen7_emit_composite_state(sna, &op->base);
-			nbox_this_time = gen7_get_rectangles(sna, &op->base, nbox);
-		}
+		nbox_this_time = gen7_get_rectangles(sna, &op->base, nbox,
+						     gen7_emit_composite_state);
 		nbox -= nbox_this_time;
 
 		do {
@@ -3356,11 +3344,10 @@ fallback_blt:
 
 	do {
 		float *v;
-		int n_this_time = gen7_get_rectangles(sna, &tmp, n);
-		if (unlikely(n_this_time == 0)) {
-			gen7_emit_copy_state(sna, &tmp);
-			n_this_time = gen7_get_rectangles(sna, &tmp, n);
-		}
+		int n_this_time;
+
+		n_this_time = gen7_get_rectangles(sna, &tmp, n,
+						  gen7_emit_copy_state);
 		n -= n_this_time;
 
 		v = sna->render.vertices + sna->render.vertex_used;
@@ -3410,10 +3397,7 @@ gen7_render_copy_blt(struct sna *sna,
 		     int16_t w,  int16_t h,
 		     int16_t dx, int16_t dy)
 {
-	if (unlikely(!gen7_get_rectangles(sna, &op->base, 1))) {
-		gen7_emit_copy_state(sna, &op->base);
-		gen7_get_rectangles(sna, &op->base, 1);
-	}
+	gen7_get_rectangles(sna, &op->base, 1, gen7_emit_copy_state);
 
 	OUT_VERTEX(dx+w, dy+h);
 	OUT_VERTEX_F((sx+w)*op->base.src.scale[0]);
@@ -3680,12 +3664,12 @@ gen7_render_fill_boxes(struct sna *sna,
 	gen7_align_vertex(sna, &tmp);
 
 	do {
-		int n_this_time = gen7_get_rectangles(sna, &tmp, n);
-		if (unlikely(n_this_time == 0)) {
-			gen7_emit_fill_state(sna, &tmp);
-			n_this_time = gen7_get_rectangles(sna, &tmp, n);
-		}
+		int n_this_time;
+
+		n_this_time = gen7_get_rectangles(sna, &tmp, n,
+						  gen7_emit_fill_state);
 		n -= n_this_time;
+
 		do {
 			DBG(("	(%d, %d), (%d, %d)\n",
 			     box->x1, box->y1, box->x2, box->y2));
@@ -3717,10 +3701,7 @@ gen7_render_fill_op_blt(struct sna *sna,
 {
 	DBG(("%s: (%d, %d)x(%d, %d)\n", __FUNCTION__, x, y, w, h));
 
-	if (unlikely(!gen7_get_rectangles(sna, &op->base, 1))) {
-		gen7_emit_fill_state(sna, &op->base);
-		gen7_get_rectangles(sna, &op->base, 1);
-	}
+	gen7_get_rectangles(sna, &op->base, 1, gen7_emit_fill_state);
 
 	OUT_VERTEX(x+w, y+h);
 	OUT_VERTEX_F(1);
@@ -3743,10 +3724,7 @@ gen7_render_fill_op_box(struct sna *sna,
 	DBG(("%s: (%d, %d),(%d, %d)\n", __FUNCTION__,
 	     box->x1, box->y1, box->x2, box->y2));
 
-	if (unlikely(!gen7_get_rectangles(sna, &op->base, 1))) {
-		gen7_emit_fill_state(sna, &op->base);
-		gen7_get_rectangles(sna, &op->base, 1);
-	}
+	gen7_get_rectangles(sna, &op->base, 1, gen7_emit_fill_state);
 
 	OUT_VERTEX(box->x2, box->y2);
 	OUT_VERTEX_F(1);
@@ -3771,11 +3749,10 @@ gen7_render_fill_op_boxes(struct sna *sna,
 	     box->x1, box->y1, box->x2, box->y2, nbox));
 
 	do {
-		int nbox_this_time = gen7_get_rectangles(sna, &op->base, nbox);
-		if (unlikely(nbox_this_time == 0)) {
-			gen7_emit_fill_state(sna, &op->base);
-			nbox_this_time = gen7_get_rectangles(sna, &op->base, nbox);
-		}
+		int nbox_this_time;
+
+		nbox_this_time = gen7_get_rectangles(sna, &op->base, nbox,
+						     gen7_emit_fill_state);
 		nbox -= nbox_this_time;
 
 		do {
@@ -3967,10 +3944,7 @@ gen7_render_fill_one(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo,
 	gen7_emit_fill_state(sna, &tmp);
 	gen7_align_vertex(sna, &tmp);
 
-	if (unlikely(!gen7_get_rectangles(sna, &tmp, 1))) {
-		gen7_emit_fill_state(sna, &tmp);
-		gen7_get_rectangles(sna, &tmp, 1);
-	}
+	gen7_get_rectangles(sna, &tmp, 1, gen7_emit_fill_state);
 
 	DBG(("	(%d, %d), (%d, %d)\n", x1, y1, x2, y2));
 	OUT_VERTEX(x2, y2);
@@ -4065,10 +4039,7 @@ gen7_render_clear(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo)
 	gen7_emit_fill_state(sna, &tmp);
 	gen7_align_vertex(sna, &tmp);
 
-	if (unlikely(!gen7_get_rectangles(sna, &tmp, 1))) {
-		gen7_emit_fill_state(sna, &tmp);
-		gen7_get_rectangles(sna, &tmp, 1);
-	}
+	gen7_get_rectangles(sna, &tmp, 1, gen7_emit_fill_state);
 
 	OUT_VERTEX(dst->drawable.width, dst->drawable.height);
 	OUT_VERTEX_F(1);
commit 482a0aa5040a4bf44ac72205419de2f0d3b4f423
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Feb 23 10:40:57 2012 +0000

    sna/gen5: Refactor get_rectangles() to re-emit state after a flush
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
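
For illustration, a self-contained toy of the new callee protocol. The
context struct, sizes and printf below are invented stand-ins rather than
driver code; only the shape of the control flow mirrors the patch: on an
exhausted vbo the callee submits the batch, re-emits state through the
supplied callback and retries, so it never returns 0 and every caller can
shed its retry block.

/*
 * Toy model only: struct ctx and the sizes are invented, not kgem state.
 * It demonstrates the new protocol in which get_rectangles() never
 * returns 0.
 */
#include <stdio.h>

struct ctx { int space; int state_emitted; };

static void emit_state(struct ctx *c) { c->state_emitted = 1; }

static void submit(struct ctx *c)
{
	c->space = 8;		/* fresh vbo after the batch is submitted */
	c->state_emitted = 0;	/* a new batch needs its state re-emitted */
}

static int get_rectangles(struct ctx *c, int want,
			  void (*emit)(struct ctx *))
{
start:
	if (c->space == 0) {	/* vbo exhausted: the old code returned 0 here */
		submit(c);
		emit(c);
		goto start;
	}
	if (want > c->space)
		want = c->space;
	c->space -= want;
	return want;		/* always non-zero under the new protocol */
}

int main(void)
{
	struct ctx c = { 3, 1 };
	int n = 10;

	while (n) {
		int got = get_rectangles(&c, n, emit_state);
		n -= got;
		printf("emitted %d rects (state_emitted=%d)\n",
		       got, c.state_emitted);
	}
	return 0;
}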

diff --git a/src/sna/gen5_render.c b/src/sna/gen5_render.c
index 8d6a3e9..a80ce0a 100644
--- a/src/sna/gen5_render.c
+++ b/src/sna/gen5_render.c
@@ -1130,13 +1130,9 @@ static bool gen5_rectangle_begin(struct sna *sna,
 
 	assert((unsigned)id <= 3);
 
-	ndwords = 0;
+	ndwords = op->need_magic_ca_pass ? 20 : 6;
 	if ((sna->render_state.gen5.vb_id & (1 << id)) == 0)
 		ndwords += 5;
-	if (sna->render_state.gen5.vertex_offset == 0)
-		ndwords += op->need_magic_ca_pass ? 20 : 6;
-	if (ndwords == 0)
-		return true;
 
 	if (!kgem_check_batch(&sna->kgem, ndwords))
 		return false;
@@ -1167,19 +1163,24 @@ static int gen5_get_rectangles__flush(struct sna *sna,
 
 inline static int gen5_get_rectangles(struct sna *sna,
 				      const struct sna_composite_op *op,
-				      int want)
+				      int want,
+				      void (*emit_state)(struct sna *sna,
+							 const struct sna_composite_op *op))
 {
-	int rem = vertex_space(sna);
+	int rem;
 
+start:
+	rem = vertex_space(sna);
 	if (rem < op->floats_per_rect) {
 		DBG(("flushing vbo for %s: %d < %d\n",
 		     __FUNCTION__, rem, op->floats_per_rect));
 		rem = gen5_get_rectangles__flush(sna, op);
-		if (rem == 0)
+		if (unlikely (rem == 0))
 			goto flush;
 	}
 
-	if (!gen5_rectangle_begin(sna, op))
+	if (unlikely(sna->render_state.gen5.vertex_offset == 0 &&
+		     !gen5_rectangle_begin(sna, op)))
 		goto flush;
 
 	if (want * op->floats_per_rect > rem)
@@ -1194,7 +1195,8 @@ flush:
 		gen5_magic_ca_pass(sna, op);
 	}
 	_kgem_submit(&sna->kgem);
-	return 0;
+	emit_state(sna, op);
+	goto start;
 }
 
 static uint32_t *
@@ -1562,11 +1564,7 @@ gen5_render_composite_blt(struct sna *sna,
 	     r->dst.x, r->dst.y, op->dst.x, op->dst.y,
 	     r->width, r->height));
 
-	if (!gen5_get_rectangles(sna, op, 1)) {
-		gen5_bind_surfaces(sna, op);
-		gen5_get_rectangles(sna, op, 1);
-	}
-
+	gen5_get_rectangles(sna, op, 1, gen5_bind_surfaces);
 	op->prim_emit(sna, op, r);
 }
 
@@ -1581,10 +1579,7 @@ gen5_render_composite_box(struct sna *sna,
 	     __FUNCTION__,
 	     box->x1, box->y1, box->x2, box->y2));
 
-	if (!gen5_get_rectangles(sna, op, 1)) {
-		gen5_bind_surfaces(sna, op);
-		gen5_get_rectangles(sna, op, 1);
-	}
+	gen5_get_rectangles(sna, op, 1, gen5_bind_surfaces);
 
 	r.dst.x = box->x1;
 	r.dst.y = box->y1;
@@ -1608,11 +1603,10 @@ gen5_render_composite_boxes(struct sna *sna,
 	     op->mask.width, op->mask.height));
 
 	do {
-		int nbox_this_time = gen5_get_rectangles(sna, op, nbox);
-		if (nbox_this_time == 0) {
-			gen5_bind_surfaces(sna, op);
-			nbox_this_time = gen5_get_rectangles(sna, op, nbox);
-		}
+		int nbox_this_time;
+
+		nbox_this_time = gen5_get_rectangles(sna, op, nbox,
+						     gen5_bind_surfaces);
 		nbox -= nbox_this_time;
 
 		do {
@@ -1669,9 +1663,9 @@ static uint32_t gen5_bind_video_source(struct sna *sna,
 }
 
 static void gen5_video_bind_surfaces(struct sna *sna,
-				     struct sna_composite_op *op,
-				     struct sna_video_frame *frame)
+				     const struct sna_composite_op *op)
 {
+	struct sna_video_frame *frame = op->priv;
 	uint32_t src_surf_format;
 	uint32_t src_surf_base[6];
 	int src_width[6];
@@ -1775,13 +1769,14 @@ gen5_render_video(struct sna *sna,
 	tmp.is_affine = TRUE;
 	tmp.floats_per_vertex = 3;
 	tmp.floats_per_rect = 9;
+	tmp.priv = frame;
 
 	if (!kgem_check_bo(&sna->kgem, tmp.dst.bo, frame->bo, NULL)) {
 		kgem_submit(&sna->kgem);
 		assert(kgem_check_bo(&sna->kgem, tmp.dst.bo, frame->bo, NULL));
 	}
 
-	gen5_video_bind_surfaces(sna, &tmp, frame);
+	gen5_video_bind_surfaces(sna, &tmp);
 	gen5_align_vertex(sna, &tmp);
 
 	/* Set up the offset for translating from the given region (in screen
@@ -1812,10 +1807,7 @@ gen5_render_video(struct sna *sna,
 		r.y1 = box->y1 + pix_yoff;
 		r.y2 = box->y2 + pix_yoff;
 
-		if (!gen5_get_rectangles(sna, &tmp, 1)) {
-			gen5_video_bind_surfaces(sna, &tmp, frame);
-			gen5_get_rectangles(sna, &tmp, 1);
-		}
+		gen5_get_rectangles(sna, &tmp, 1, gen5_video_bind_surfaces);
 
 		OUT_VERTEX(r.x2, r.y2);
 		OUT_VERTEX_F((box->x2 - dxo) * src_scale_x);
@@ -2549,11 +2541,7 @@ gen5_render_composite_spans_box(struct sna *sna,
 	     box->x2 - box->x1,
 	     box->y2 - box->y1));
 
-	if (gen5_get_rectangles(sna, &op->base, 1) == 0) {
-		gen5_bind_surfaces(sna, &op->base);
-		gen5_get_rectangles(sna, &op->base, 1);
-	}
-
+	gen5_get_rectangles(sna, &op->base, 1, gen5_bind_surfaces);
 	op->prim_emit(sna, op, box, opacity);
 }
 
@@ -2572,11 +2560,8 @@ gen5_render_composite_spans_boxes(struct sna *sna,
 	do {
 		int nbox_this_time;
 
-		nbox_this_time = gen5_get_rectangles(sna, &op->base, nbox);
-		if (nbox_this_time == 0) {
-			gen5_bind_surfaces(sna, &op->base);
-			nbox_this_time = gen5_get_rectangles(sna, &op->base, nbox);
-		}
+		nbox_this_time = gen5_get_rectangles(sna, &op->base, nbox,
+						     gen5_bind_surfaces);
 		nbox -= nbox_this_time;
 
 		do {
@@ -2871,11 +2856,10 @@ fallback_blt:
 	gen5_align_vertex(sna, &tmp);
 
 	do {
-		int n_this_time = gen5_get_rectangles(sna, &tmp, n);
-		if (n_this_time == 0) {
-			gen5_copy_bind_surfaces(sna, &tmp);
-			n_this_time = gen5_get_rectangles(sna, &tmp, n);
-		}
+		int n_this_time;
+
+		n_this_time = gen5_get_rectangles(sna, &tmp, n,
+						  gen5_copy_bind_surfaces);
 		n -= n_this_time;
 
 		do {
@@ -2926,10 +2910,7 @@ gen5_render_copy_blt(struct sna *sna,
 	DBG(("%s: src=(%d, %d), dst=(%d, %d), size=(%d, %d)\n", __FUNCTION__,
 	     sx, sy, dx, dy, w, h));
 
-	if (!gen5_get_rectangles(sna, &op->base, 1)) {
-		gen5_copy_bind_surfaces(sna, &op->base);
-		gen5_get_rectangles(sna, &op->base, 1);
-	}
+	gen5_get_rectangles(sna, &op->base, 1, gen5_copy_bind_surfaces);
 
 	OUT_VERTEX(dx+w, dy+h);
 	OUT_VERTEX_F((sx+w)*op->base.src.scale[0]);
@@ -3173,12 +3154,12 @@ gen5_render_fill_boxes(struct sna *sna,
 	gen5_align_vertex(sna, &tmp);
 
 	do {
-		int n_this_time = gen5_get_rectangles(sna, &tmp, n);
-		if (n_this_time == 0) {
-			gen5_fill_bind_surfaces(sna, &tmp);
-			n_this_time = gen5_get_rectangles(sna, &tmp, n);
-		}
+		int n_this_time;
+
+		n_this_time = gen5_get_rectangles(sna, &tmp, n,
+						  gen5_fill_bind_surfaces);
 		n -= n_this_time;
+
 		do {
 			DBG(("	(%d, %d), (%d, %d)\n",
 			     box->x1, box->y1, box->x2, box->y2));
@@ -3210,10 +3191,7 @@ gen5_render_fill_op_blt(struct sna *sna,
 {
 	DBG(("%s (%d, %d)x(%d, %d)\n", __FUNCTION__, x,y,w,h));
 
-	if (!gen5_get_rectangles(sna, &op->base, 1)) {
-		gen5_fill_bind_surfaces(sna, &op->base);
-		gen5_get_rectangles(sna, &op->base, 1);
-	}
+	gen5_get_rectangles(sna, &op->base, 1, gen5_fill_bind_surfaces);
 
 	OUT_VERTEX(x+w, y+h);
 	OUT_VERTEX_F(1);
@@ -3236,10 +3214,7 @@ gen5_render_fill_op_box(struct sna *sna,
 	DBG(("%s: (%d, %d),(%d, %d)\n", __FUNCTION__,
 	     box->x1, box->y1, box->x2, box->y2));
 
-	if (!gen5_get_rectangles(sna, &op->base, 1)) {
-		gen5_fill_bind_surfaces(sna, &op->base);
-		gen5_get_rectangles(sna, &op->base, 1);
-	}
+	gen5_get_rectangles(sna, &op->base, 1, gen5_fill_bind_surfaces);
 
 	OUT_VERTEX(box->x2, box->y2);
 	OUT_VERTEX_F(1);
@@ -3264,11 +3239,10 @@ gen5_render_fill_op_boxes(struct sna *sna,
 	     box->x1, box->y1, box->x2, box->y2, nbox));
 
 	do {
-		int nbox_this_time = gen5_get_rectangles(sna, &op->base, nbox);
-		if (nbox_this_time == 0) {
-			gen5_fill_bind_surfaces(sna, &op->base);
-			nbox_this_time = gen5_get_rectangles(sna, &op->base, nbox);
-		}
+		int nbox_this_time;
+
+		nbox_this_time = gen5_get_rectangles(sna, &op->base, nbox,
+						     gen5_fill_bind_surfaces);
 		nbox -= nbox_this_time;
 
 		do {
@@ -3452,10 +3426,7 @@ gen5_render_fill_one(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo,
 	gen5_fill_bind_surfaces(sna, &tmp);
 	gen5_align_vertex(sna, &tmp);
 
-	if (!gen5_get_rectangles(sna, &tmp, 1)) {
-		gen5_fill_bind_surfaces(sna, &tmp);
-		gen5_get_rectangles(sna, &tmp, 1);
-	}
+	gen5_get_rectangles(sna, &tmp, 1, gen5_fill_bind_surfaces);
 
 	DBG(("	(%d, %d), (%d, %d)\n", x1, y1, x2, y2));
 	OUT_VERTEX(x2, y2);
commit d89ff3ff34173e397032c2059e31f9392e2ea70f
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Feb 23 10:40:57 2012 +0000

    sna/gen6: Refactor get_rectangles() to re-emit state after a flush
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/gen6_render.c b/src/sna/gen6_render.c
index e975328..b69b3a2 100644
--- a/src/sna/gen6_render.c
+++ b/src/sna/gen6_render.c
@@ -1630,20 +1630,23 @@ static int gen6_get_rectangles__flush(struct sna *sna,
 
 inline static int gen6_get_rectangles(struct sna *sna,
 				      const struct sna_composite_op *op,
-				      int want)
+				      int want,
+				      void (*emit_state)(struct sna *, const struct sna_composite_op *op))
 {
-	int rem = vertex_space(sna);
+	int rem;
 
+start:
+	rem = vertex_space(sna);
 	if (rem < op->floats_per_rect) {
 		DBG(("flushing vbo for %s: %d < %d\n",
 		     __FUNCTION__, rem, op->floats_per_rect));
 		rem = gen6_get_rectangles__flush(sna, op);
-		if (rem == 0)
+		if (unlikely(rem == 0))
 			goto flush;
 	}
 
-	if (sna->render_state.gen6.vertex_offset == 0 &&
-	    !gen6_rectangle_begin(sna, op))
+	if (unlikely(sna->render_state.gen6.vertex_offset == 0 &&
+		     !gen6_rectangle_begin(sna, op)))
 		goto flush;
 
 	if (want > 1 && want * op->floats_per_rect > rem)
@@ -1659,7 +1662,8 @@ flush:
 		gen6_magic_ca_pass(sna, op);
 	}
 	_kgem_submit(&sna->kgem);
-	return 0;
+	emit_state(sna, op);
+	goto start;
 }
 
 inline static uint32_t *gen6_composite_get_binding_table(struct sna *sna,
@@ -1772,11 +1776,7 @@ gen6_render_composite_blt(struct sna *sna,
 			  const struct sna_composite_op *op,
 			  const struct sna_composite_rectangles *r)
 {
-	if (unlikely(!gen6_get_rectangles(sna, op, 1))) {
-		gen6_emit_composite_state(sna, op);
-		gen6_get_rectangles(sna, op, 1);
-	}
-
+	gen6_get_rectangles(sna, op, 1, gen6_emit_composite_state);
 	op->prim_emit(sna, op, r);
 }
 
@@ -1787,10 +1787,7 @@ gen6_render_composite_box(struct sna *sna,
 {
 	struct sna_composite_rectangles r;
 
-	if (unlikely(!gen6_get_rectangles(sna, op, 1))) {
-		gen6_emit_composite_state(sna, op);
-		gen6_get_rectangles(sna, op, 1);
-	}
+	gen6_get_rectangles(sna, op, 1, gen6_emit_composite_state);
 
 	DBG(("  %s: (%d, %d), (%d, %d)\n",
 	     __FUNCTION__,
@@ -1813,13 +1810,12 @@ gen6_render_composite_boxes(struct sna *sna,
 	DBG(("composite_boxes(%d)\n", nbox));
 
 	do {
-		int nbox_this_time = gen6_get_rectangles(sna, op, nbox);
-		if (unlikely(nbox_this_time == 0)) {
-			gen6_emit_composite_state(sna, op);
-			nbox_this_time = gen6_get_rectangles(sna, op, nbox);
-			assert(nbox_this_time);
-		}
+		int nbox_this_time;
+
+		nbox_this_time = gen6_get_rectangles(sna, op, nbox,
+						     gen6_emit_composite_state);
 		nbox -= nbox_this_time;
+
 		do {
 			struct sna_composite_rectangles r;
 
@@ -1906,9 +1902,9 @@ static uint32_t gen6_bind_video_source(struct sna *sna,
 }
 
 static void gen6_emit_video_state(struct sna *sna,
-				  struct sna_composite_op *op,
-				  struct sna_video_frame *frame)
+				  const struct sna_composite_op *op)
 {
+	struct sna_video_frame *frame = op->priv;
 	uint32_t src_surf_format;
 	uint32_t src_surf_base[6];
 	int src_width[6];
@@ -2029,6 +2025,7 @@ gen6_render_video(struct sna *sna,
 	}
 	tmp.u.gen6.nr_inputs = 1;
 	tmp.u.gen6.ve_id = 1;
+	tmp.priv = frame;
 
 	kgem_set_mode(&sna->kgem, KGEM_RENDER);
 	if (!kgem_check_bo(&sna->kgem, tmp.dst.bo, frame->bo, NULL)) {
@@ -2037,7 +2034,7 @@ gen6_render_video(struct sna *sna,
 		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
 	}
 
-	gen6_emit_video_state(sna, &tmp, frame);
+	gen6_emit_video_state(sna, &tmp);
 	gen6_align_vertex(sna, &tmp);
 
 	/* Set up the offset for translating from the given region (in screen
@@ -2068,10 +2065,7 @@ gen6_render_video(struct sna *sna,
 		r.y1 = box->y1 + pix_yoff;
 		r.y2 = box->y2 + pix_yoff;
 
-		if (unlikely(!gen6_get_rectangles(sna, &tmp, 1))) {
-			gen6_emit_video_state(sna, &tmp, frame);
-			gen6_get_rectangles(sna, &tmp, 1);
-		}
+		gen6_get_rectangles(sna, &tmp, 1, gen6_emit_video_state);
 
 		OUT_VERTEX(r.x2, r.y2);
 		OUT_VERTEX_F((box->x2 - dxo) * src_scale_x);
@@ -2949,11 +2943,7 @@ gen6_render_composite_spans_box(struct sna *sna,
 	     box->x2 - box->x1,
 	     box->y2 - box->y1));
 
-	if (unlikely(gen6_get_rectangles(sna, &op->base, 1) == 0)) {
-		gen6_emit_composite_state(sna, &op->base);
-		gen6_get_rectangles(sna, &op->base, 1);
-	}
-
+	gen6_get_rectangles(sna, &op->base, 1, gen6_emit_composite_state);
 	op->prim_emit(sna, op, box, opacity);
 }
 
@@ -2972,12 +2962,8 @@ gen6_render_composite_spans_boxes(struct sna *sna,
 	do {
 		int nbox_this_time;
 
-		nbox_this_time = gen6_get_rectangles(sna, &op->base, nbox);
-		if (unlikely(nbox_this_time == 0)) {
-			gen6_emit_composite_state(sna, &op->base);
-			nbox_this_time = gen6_get_rectangles(sna, &op->base, nbox);
-			assert(nbox_this_time);
-		}
+		nbox_this_time = gen6_get_rectangles(sna, &op->base, nbox,
+						     gen6_emit_composite_state);
 		nbox -= nbox_this_time;
 
 		do {
@@ -3371,11 +3357,8 @@ fallback_blt:
 
 	do {
 		float *v;
-		int n_this_time = gen6_get_rectangles(sna, &tmp, n);
-		if (unlikely(n_this_time == 0)) {
-			gen6_emit_copy_state(sna, &tmp);
-			n_this_time = gen6_get_rectangles(sna, &tmp, n);
-		}
+		int n_this_time = gen6_get_rectangles(sna, &tmp, n,
+						      gen6_emit_copy_state);
 		n -= n_this_time;
 
 		v = sna->render.vertices + sna->render.vertex_used;
@@ -3425,10 +3408,7 @@ gen6_render_copy_blt(struct sna *sna,
 		     int16_t w,  int16_t h,
 		     int16_t dx, int16_t dy)
 {
-	if (unlikely(!gen6_get_rectangles(sna, &op->base, 1))) {
-		gen6_emit_copy_state(sna, &op->base);
-		gen6_get_rectangles(sna, &op->base, 1);
-	}
+	gen6_get_rectangles(sna, &op->base, 1, gen6_emit_copy_state);
 
 	OUT_VERTEX(dx+w, dy+h);
 	OUT_VERTEX_F((sx+w)*op->base.src.scale[0]);
@@ -3698,11 +3678,8 @@ gen6_render_fill_boxes(struct sna *sna,
 	gen6_align_vertex(sna, &tmp);
 
 	do {
-		int n_this_time = gen6_get_rectangles(sna, &tmp, n);
-		if (unlikely(n_this_time == 0)) {
-			gen6_emit_fill_state(sna, &tmp);
-			n_this_time = gen6_get_rectangles(sna, &tmp, n);
-		}
+		int n_this_time = gen6_get_rectangles(sna, &tmp, n,
+						      gen6_emit_fill_state);
 		n -= n_this_time;
 		do {
 			DBG(("	(%d, %d), (%d, %d)\n",
@@ -3735,10 +3712,7 @@ gen6_render_op_fill_blt(struct sna *sna,
 {
 	DBG(("%s: (%d, %d)x(%d, %d)\n", __FUNCTION__, x, y, w, h));
 
-	if (unlikely(!gen6_get_rectangles(sna, &op->base, 1))) {
-		gen6_emit_fill_state(sna, &op->base);
-		gen6_get_rectangles(sna, &op->base, 1);
-	}
+	gen6_get_rectangles(sna, &op->base, 1, gen6_emit_fill_state);
 
 	OUT_VERTEX(x+w, y+h);
 	OUT_VERTEX_F(1);
@@ -3761,10 +3735,7 @@ gen6_render_op_fill_box(struct sna *sna,
 	DBG(("%s: (%d, %d),(%d, %d)\n", __FUNCTION__,
 	     box->x1, box->y1, box->x2, box->y2));
 
-	if (unlikely(!gen6_get_rectangles(sna, &op->base, 1))) {
-		gen6_emit_fill_state(sna, &op->base);
-		gen6_get_rectangles(sna, &op->base, 1);
-	}
+	gen6_get_rectangles(sna, &op->base, 1, gen6_emit_fill_state);
 
 	OUT_VERTEX(box->x2, box->y2);
 	OUT_VERTEX_F(1);
@@ -3789,11 +3760,10 @@ gen6_render_op_fill_boxes(struct sna *sna,
 	     box->x1, box->y1, box->x2, box->y2, nbox));
 
 	do {
-		int nbox_this_time = gen6_get_rectangles(sna, &op->base, nbox);
-		if (unlikely(nbox_this_time == 0)) {
-			gen6_emit_fill_state(sna, &op->base);
-			nbox_this_time = gen6_get_rectangles(sna, &op->base, nbox);
-		}
+		int nbox_this_time;
+
+		nbox_this_time = gen6_get_rectangles(sna, &op->base, nbox,
+						     gen6_emit_fill_state);
 		nbox -= nbox_this_time;
 
 		do {
@@ -3987,10 +3957,7 @@ gen6_render_fill_one(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo,
 	gen6_emit_fill_state(sna, &tmp);
 	gen6_align_vertex(sna, &tmp);
 
-	if (unlikely(!gen6_get_rectangles(sna, &tmp, 1))) {
-		gen6_emit_fill_state(sna, &tmp);
-		gen6_get_rectangles(sna, &tmp, 1);
-	}
+	gen6_get_rectangles(sna, &tmp, 1, gen6_emit_fill_state);
 
 	DBG(("	(%d, %d), (%d, %d)\n", x1, y1, x2, y2));
 	OUT_VERTEX(x2, y2);
@@ -4085,10 +4052,7 @@ gen6_render_clear(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo)
 	gen6_emit_fill_state(sna, &tmp);
 	gen6_align_vertex(sna, &tmp);
 
-	if (unlikely(!gen6_get_rectangles(sna, &tmp, 1))) {
-		gen6_emit_fill_state(sna, &tmp);
-		gen6_get_rectangles(sna, &tmp, 1);
-	}
+	gen6_get_rectangles(sna, &tmp, 1, gen6_emit_fill_state);
 
 	OUT_VERTEX(dst->drawable.width, dst->drawable.height);
 	OUT_VERTEX_F(1);
diff --git a/src/sna/sna_render.h b/src/sna/sna_render.h
index 7243042..71a6fc5 100644
--- a/src/sna/sna_render.h
+++ b/src/sna/sna_render.h
@@ -134,9 +134,9 @@ struct sna_composite_op {
 			int nr_inputs;
 			int ve_id;
 		} gen7;
-
-		void *priv;
 	} u;
+
+	void *priv;
 };
 
 struct sna_composite_spans_op {
diff --git a/src/sna/sna_tiling.c b/src/sna/sna_tiling.c
index 17ecaea..493e313 100644
--- a/src/sna/sna_tiling.c
+++ b/src/sna/sna_tiling.c
@@ -89,7 +89,7 @@ sna_tiling_composite_blt(struct sna *sna,
 			 const struct sna_composite_op *op,
 			 const struct sna_composite_rectangles *r)
 {
-	sna_tiling_composite_add_rect(op->u.priv, r);
+	sna_tiling_composite_add_rect(op->priv, r);
 	(void)sna;
 }
 
@@ -107,7 +107,7 @@ sna_tiling_composite_box(struct sna *sna,
 	r.width  = box->x2 - box->x1;
 	r.height = box->y2 - box->y1;
 
-	sna_tiling_composite_add_rect(op->u.priv, &r);
+	sna_tiling_composite_add_rect(op->priv, &r);
 	(void)sna;
 }
 
@@ -126,7 +126,7 @@ sna_tiling_composite_boxes(struct sna *sna,
 		r.width  = box->x2 - box->x1;
 		r.height = box->y2 - box->y1;
 
-		sna_tiling_composite_add_rect(op->u.priv, &r);
+		sna_tiling_composite_add_rect(op->priv, &r);
 		box++;
 	}
 	(void)sna;
@@ -136,7 +136,7 @@ static void
 sna_tiling_composite_done(struct sna *sna,
 			  const struct sna_composite_op *op)
 {
-	struct sna_tile_state *tile = op->u.priv;
+	struct sna_tile_state *tile = op->priv;
 	struct sna_composite_op tmp;
 	int x, y, n, step;
 
@@ -312,7 +312,7 @@ sna_tiling_composite(uint32_t op,
 	tmp->boxes = sna_tiling_composite_boxes;
 	tmp->done  = sna_tiling_composite_done;
 
-	tmp->u.priv = tile;
+	tmp->priv = tile;
 	return TRUE;
 }
 
commit 13b43a54fa6e15946f1c6f1dd6253d35170c84f7
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Feb 23 10:29:38 2012 +0000

    sna/gen3: Refactor get_rectangles() to emit composite state and retry
    
    As gen3 only uses a single state emission block and uniformly calls
    get_rectangles(), we can move the caller's retry protocol into the
    callee.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/gen3_render.c b/src/sna/gen3_render.c
index cb85cb9..6828a16 100644
--- a/src/sna/gen3_render.c
+++ b/src/sna/gen3_render.c
@@ -1729,12 +1729,15 @@ inline static int gen3_get_rectangles(struct sna *sna,
 				      const struct sna_composite_op *op,
 				      int want)
 {
-	int rem = vertex_space(sna);
+	int rem;
 
 	DBG(("%s: want=%d, rem=%d\n",
 	     __FUNCTION__, want*op->floats_per_rect, rem));
 
 	assert(sna->render.vertex_index * op->floats_per_vertex == sna->render.vertex_used);
+
+start:
+	rem = vertex_space(sna);
 	if (op->floats_per_rect > rem) {
 		DBG(("flushing vbo for %s: %d < %d\n",
 		     __FUNCTION__, rem, op->floats_per_rect));
@@ -1762,7 +1765,8 @@ flush:
 		gen3_magic_ca_pass(sna, op);
 	}
 	_kgem_submit(&sna->kgem);
-	return 0;
+	gen3_emit_composite_state(sna, op);
+	goto start;
 }
 
 fastcall static void
@@ -1776,10 +1780,7 @@ gen3_render_composite_blt(struct sna *sna,
 	     r->dst.x, r->dst.y, op->dst.x, op->dst.y,
 	     r->width, r->height));
 
-	if (!gen3_get_rectangles(sna, op, 1)) {
-		gen3_emit_composite_state(sna, op);
-		gen3_get_rectangles(sna, op, 1);
-	}
+	gen3_get_rectangles(sna, op, 1);
 
 	op->prim_emit(sna, op, r);
 }
@@ -1797,10 +1798,7 @@ gen3_render_composite_box(struct sna *sna,
 	     op->mask.offset[0], op->mask.offset[1],
 	     op->dst.x, op->dst.y));
 
-	if (!gen3_get_rectangles(sna, op, 1)) {
-		gen3_emit_composite_state(sna, op);
-		gen3_get_rectangles(sna, op, 1);
-	}
+	gen3_get_rectangles(sna, op, 1);
 
 	r.dst.x  = box->x1;
 	r.dst.y  = box->y1;
@@ -1826,10 +1824,6 @@ gen3_render_composite_boxes(struct sna *sna,
 		int nbox_this_time;
 
 		nbox_this_time = gen3_get_rectangles(sna, op, nbox);
-		if (nbox_this_time == 0) {
-			gen3_emit_composite_state(sna, op);
-			nbox_this_time = gen3_get_rectangles(sna, op, nbox);
-		}
 		nbox -= nbox_this_time;
 
 		do {
@@ -3110,11 +3104,7 @@ gen3_render_composite_spans_box(struct sna *sna,
 	     box->x2 - box->x1,
 	     box->y2 - box->y1));
 
-	if (!gen3_get_rectangles(sna, &op->base, 1)) {
-		gen3_emit_composite_state(sna, &op->base);
-		gen3_get_rectangles(sna, &op->base, 1);
-	}
-
+	gen3_get_rectangles(sna, &op->base, 1);
 	op->prim_emit(sna, op, box, opacity);
 }
 
@@ -3134,10 +3124,6 @@ gen3_render_composite_spans_boxes(struct sna *sna,
 		int nbox_this_time;
 
 		nbox_this_time = gen3_get_rectangles(sna, &op->base, nbox);
-		if (nbox_this_time == 0) {
-			gen3_emit_composite_state(sna, &op->base);
-			nbox_this_time = gen3_get_rectangles(sna, &op->base, nbox);
-		}
 		nbox -= nbox_this_time;
 
 		do {
@@ -3918,10 +3904,6 @@ fallback_blt:
 		int n_this_time;
 
 		n_this_time = gen3_get_rectangles(sna, &tmp, n);
-		if (n_this_time == 0) {
-			gen3_emit_composite_state(sna, &tmp);
-			n_this_time = gen3_get_rectangles(sna, &tmp, n);
-		}
 		n -= n_this_time;
 
 		do {
@@ -3966,10 +3948,7 @@ gen3_render_copy_blt(struct sna *sna,
 		     int16_t w, int16_t h,
 		     int16_t dx, int16_t dy)
 {
-	if (!gen3_get_rectangles(sna, &op->base, 1)) {
-		gen3_emit_composite_state(sna, &op->base);
-		gen3_get_rectangles(sna, &op->base, 1);
-	}
+	gen3_get_rectangles(sna, &op->base, 1);
 
 	OUT_VERTEX(dx+w);
 	OUT_VERTEX(dy+h);
@@ -4215,11 +4194,9 @@ gen3_render_fill_boxes(struct sna *sna,
 	gen3_align_vertex(sna, &tmp);
 
 	do {
-		int n_this_time = gen3_get_rectangles(sna, &tmp, n);
-		if (n_this_time == 0) {
-			gen3_emit_composite_state(sna, &tmp);
-			n_this_time = gen3_get_rectangles(sna, &tmp, n);
-		}
+		int n_this_time;
+
+		n_this_time = gen3_get_rectangles(sna, &tmp, n);
 		n -= n_this_time;
 
 		do {
@@ -4244,10 +4221,7 @@ gen3_render_fill_op_blt(struct sna *sna,
 			const struct sna_fill_op *op,
 			int16_t x, int16_t y, int16_t w, int16_t h)
 {
-	if (!gen3_get_rectangles(sna, &op->base, 1)) {
-		gen3_emit_composite_state(sna, &op->base);
-		gen3_get_rectangles(sna, &op->base, 1);
-	}
+	gen3_get_rectangles(sna, &op->base, 1);
 
 	OUT_VERTEX(x+w);
 	OUT_VERTEX(y+h);
@@ -4262,10 +4236,7 @@ gen3_render_fill_op_box(struct sna *sna,
 			const struct sna_fill_op *op,
 			const BoxRec *box)
 {
-	if (!gen3_get_rectangles(sna, &op->base, 1)) {
-		gen3_emit_composite_state(sna, &op->base);
-		gen3_get_rectangles(sna, &op->base, 1);
-	}
+	gen3_get_rectangles(sna, &op->base, 1);
 
 	OUT_VERTEX(box->x2);
 	OUT_VERTEX(box->y2);
@@ -4285,11 +4256,9 @@ gen3_render_fill_op_boxes(struct sna *sna,
 	     box->x1, box->y1, box->x2, box->y2, nbox));
 
 	do {
-		int nbox_this_time = gen3_get_rectangles(sna, &op->base, nbox);
-		if (nbox_this_time == 0) {
-			gen3_emit_composite_state(sna, &op->base);
-			nbox_this_time = gen3_get_rectangles(sna, &op->base, nbox);
-		}
+		int nbox_this_time;
+
+		nbox_this_time = gen3_get_rectangles(sna, &op->base, nbox);
 		nbox -= nbox_this_time;
 
 		do {
commit 91e35ac7bc171cd5577bbe16301cefa4bc78bedc
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Feb 23 10:17:34 2012 +0000

    sna/gen3+: Force a batch flush when we run out of CA vbo
    
    As we prematurely end the batch if we bail on extending the vbo for CA
    glyphs, we need to force the flush.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/gen3_render.c b/src/sna/gen3_render.c
index 5253a8c..cb85cb9 100644
--- a/src/sna/gen3_render.c
+++ b/src/sna/gen3_render.c
@@ -1761,6 +1761,7 @@ flush:
 		gen3_vertex_flush(sna);
 		gen3_magic_ca_pass(sna, op);
 	}
+	_kgem_submit(&sna->kgem);
 	return 0;
 }
 
@@ -3109,7 +3110,7 @@ gen3_render_composite_spans_box(struct sna *sna,
 	     box->x2 - box->x1,
 	     box->y2 - box->y1));
 
-	if (gen3_get_rectangles(sna, &op->base, 1) == 0) {
+	if (!gen3_get_rectangles(sna, &op->base, 1)) {
 		gen3_emit_composite_state(sna, &op->base);
 		gen3_get_rectangles(sna, &op->base, 1);
 	}
diff --git a/src/sna/gen4_render.c b/src/sna/gen4_render.c
index 4bc2938..cccdf4c 100644
--- a/src/sna/gen4_render.c
+++ b/src/sna/gen4_render.c
@@ -1185,6 +1185,7 @@ flush:
 		gen4_vertex_flush(sna);
 		gen4_magic_ca_pass(sna, op);
 	}
+	_kgem_submit(&sna->kgem);
 	return 0;
 }
 
diff --git a/src/sna/gen5_render.c b/src/sna/gen5_render.c
index ac55b09..8d6a3e9 100644
--- a/src/sna/gen5_render.c
+++ b/src/sna/gen5_render.c
@@ -1193,6 +1193,7 @@ flush:
 		gen5_vertex_flush(sna);
 		gen5_magic_ca_pass(sna, op);
 	}
+	_kgem_submit(&sna->kgem);
 	return 0;
 }
 
diff --git a/src/sna/gen6_render.c b/src/sna/gen6_render.c
index 035da78..e975328 100644
--- a/src/sna/gen6_render.c
+++ b/src/sna/gen6_render.c
@@ -1658,6 +1658,7 @@ flush:
 		gen6_vertex_flush(sna);
 		gen6_magic_ca_pass(sna, op);
 	}
+	_kgem_submit(&sna->kgem);
 	return 0;
 }
 
@@ -1772,7 +1773,6 @@ gen6_render_composite_blt(struct sna *sna,
 			  const struct sna_composite_rectangles *r)
 {
 	if (unlikely(!gen6_get_rectangles(sna, op, 1))) {
-		_kgem_submit(&sna->kgem);
 		gen6_emit_composite_state(sna, op);
 		gen6_get_rectangles(sna, op, 1);
 	}
@@ -1788,7 +1788,6 @@ gen6_render_composite_box(struct sna *sna,
 	struct sna_composite_rectangles r;
 
 	if (unlikely(!gen6_get_rectangles(sna, op, 1))) {
-		_kgem_submit(&sna->kgem);
 		gen6_emit_composite_state(sna, op);
 		gen6_get_rectangles(sna, op, 1);
 	}
@@ -1816,7 +1815,6 @@ gen6_render_composite_boxes(struct sna *sna,
 	do {
 		int nbox_this_time = gen6_get_rectangles(sna, op, nbox);
 		if (unlikely(nbox_this_time == 0)) {
-			_kgem_submit(&sna->kgem);
 			gen6_emit_composite_state(sna, op);
 			nbox_this_time = gen6_get_rectangles(sna, op, nbox);
 			assert(nbox_this_time);
@@ -2071,7 +2069,6 @@ gen6_render_video(struct sna *sna,
 		r.y2 = box->y2 + pix_yoff;
 
 		if (unlikely(!gen6_get_rectangles(sna, &tmp, 1))) {
-			_kgem_submit(&sna->kgem);
 			gen6_emit_video_state(sna, &tmp, frame);
 			gen6_get_rectangles(sna, &tmp, 1);
 		}
@@ -2953,7 +2950,6 @@ gen6_render_composite_spans_box(struct sna *sna,
 	     box->y2 - box->y1));
 
 	if (unlikely(gen6_get_rectangles(sna, &op->base, 1) == 0)) {
-		_kgem_submit(&sna->kgem);
 		gen6_emit_composite_state(sna, &op->base);
 		gen6_get_rectangles(sna, &op->base, 1);
 	}
@@ -2978,7 +2974,6 @@ gen6_render_composite_spans_boxes(struct sna *sna,
 
 		nbox_this_time = gen6_get_rectangles(sna, &op->base, nbox);
 		if (unlikely(nbox_this_time == 0)) {
-			_kgem_submit(&sna->kgem);
 			gen6_emit_composite_state(sna, &op->base);
 			nbox_this_time = gen6_get_rectangles(sna, &op->base, nbox);
 			assert(nbox_this_time);
@@ -3378,7 +3373,6 @@ fallback_blt:
 		float *v;
 		int n_this_time = gen6_get_rectangles(sna, &tmp, n);
 		if (unlikely(n_this_time == 0)) {
-			_kgem_submit(&sna->kgem);
 			gen6_emit_copy_state(sna, &tmp);
 			n_this_time = gen6_get_rectangles(sna, &tmp, n);
 		}
@@ -3432,7 +3426,6 @@ gen6_render_copy_blt(struct sna *sna,
 		     int16_t dx, int16_t dy)
 {
 	if (unlikely(!gen6_get_rectangles(sna, &op->base, 1))) {
-		_kgem_submit(&sna->kgem);
 		gen6_emit_copy_state(sna, &op->base);
 		gen6_get_rectangles(sna, &op->base, 1);
 	}
@@ -3707,7 +3700,6 @@ gen6_render_fill_boxes(struct sna *sna,
 	do {
 		int n_this_time = gen6_get_rectangles(sna, &tmp, n);
 		if (unlikely(n_this_time == 0)) {
-			_kgem_submit(&sna->kgem);
 			gen6_emit_fill_state(sna, &tmp);
 			n_this_time = gen6_get_rectangles(sna, &tmp, n);
 		}
@@ -3744,7 +3736,6 @@ gen6_render_op_fill_blt(struct sna *sna,
 	DBG(("%s: (%d, %d)x(%d, %d)\n", __FUNCTION__, x, y, w, h));
 
 	if (unlikely(!gen6_get_rectangles(sna, &op->base, 1))) {
-		_kgem_submit(&sna->kgem);
 		gen6_emit_fill_state(sna, &op->base);
 		gen6_get_rectangles(sna, &op->base, 1);
 	}
@@ -3771,7 +3762,6 @@ gen6_render_op_fill_box(struct sna *sna,
 	     box->x1, box->y1, box->x2, box->y2));
 
 	if (unlikely(!gen6_get_rectangles(sna, &op->base, 1))) {
-		_kgem_submit(&sna->kgem);
 		gen6_emit_fill_state(sna, &op->base);
 		gen6_get_rectangles(sna, &op->base, 1);
 	}
@@ -3801,7 +3791,6 @@ gen6_render_op_fill_boxes(struct sna *sna,
 	do {
 		int nbox_this_time = gen6_get_rectangles(sna, &op->base, nbox);
 		if (unlikely(nbox_this_time == 0)) {
-			_kgem_submit(&sna->kgem);
 			gen6_emit_fill_state(sna, &op->base);
 			nbox_this_time = gen6_get_rectangles(sna, &op->base, nbox);
 		}
@@ -3999,7 +3988,6 @@ gen6_render_fill_one(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo,
 	gen6_align_vertex(sna, &tmp);
 
 	if (unlikely(!gen6_get_rectangles(sna, &tmp, 1))) {
-		_kgem_submit(&sna->kgem);
 		gen6_emit_fill_state(sna, &tmp);
 		gen6_get_rectangles(sna, &tmp, 1);
 	}
@@ -4098,7 +4086,6 @@ gen6_render_clear(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo)
 	gen6_align_vertex(sna, &tmp);
 
 	if (unlikely(!gen6_get_rectangles(sna, &tmp, 1))) {
-		_kgem_submit(&sna->kgem);
 		gen6_emit_fill_state(sna, &tmp);
 		gen6_get_rectangles(sna, &tmp, 1);
 	}
diff --git a/src/sna/gen7_render.c b/src/sna/gen7_render.c
index c872c63..d039a48 100644
--- a/src/sna/gen7_render.c
+++ b/src/sna/gen7_render.c
@@ -1691,6 +1691,7 @@ flush:
 		gen7_vertex_flush(sna);
 		gen7_magic_ca_pass(sna, op);
 	}
+	_kgem_submit(&sna->kgem);
 	return 0;
 }
 
@@ -1801,7 +1802,6 @@ gen7_render_composite_blt(struct sna *sna,
 			  const struct sna_composite_rectangles *r)
 {
 	if (unlikely(!gen7_get_rectangles(sna, op, 1))) {
-		_kgem_submit(&sna->kgem);
 		gen7_emit_composite_state(sna, op);
 		gen7_get_rectangles(sna, op, 1);
 	}
@@ -1817,7 +1817,6 @@ gen7_render_composite_box(struct sna *sna,
 	struct sna_composite_rectangles r;
 
 	if (unlikely(!gen7_get_rectangles(sna, op, 1))) {
-		_kgem_submit(&sna->kgem);
 		gen7_emit_composite_state(sna, op);
 		gen7_get_rectangles(sna, op, 1);
 	}
@@ -1845,7 +1844,6 @@ gen7_render_composite_boxes(struct sna *sna,
 	do {
 		int nbox_this_time = gen7_get_rectangles(sna, op, nbox);
 		if (unlikely(nbox_this_time == 0)) {
-			_kgem_submit(&sna->kgem);
 			gen7_emit_composite_state(sna, op);
 			nbox_this_time = gen7_get_rectangles(sna, op, nbox);
 		}
@@ -2097,7 +2095,6 @@ gen7_render_video(struct sna *sna,
 		r.y2 = box->y2 + pix_yoff;
 
 		if (unlikely(!gen7_get_rectangles(sna, &tmp, 1))) {
-			_kgem_submit(&sna->kgem);
 			gen7_emit_video_state(sna, &tmp, frame);
 			gen7_get_rectangles(sna, &tmp, 1);
 		}
@@ -2945,7 +2942,6 @@ gen7_render_composite_spans_box(struct sna *sna,
 	     box->y2 - box->y1));
 
 	if (unlikely(gen7_get_rectangles(sna, &op->base, 1) == 0)) {
-		_kgem_submit(&sna->kgem);
 		gen7_emit_composite_state(sna, &op->base);
 		gen7_get_rectangles(sna, &op->base, 1);
 	}
@@ -2970,7 +2966,6 @@ gen7_render_composite_spans_boxes(struct sna *sna,
 
 		nbox_this_time = gen7_get_rectangles(sna, &op->base, nbox);
 		if (unlikely(nbox_this_time == 0)) {
-			_kgem_submit(&sna->kgem);
 			gen7_emit_composite_state(sna, &op->base);
 			nbox_this_time = gen7_get_rectangles(sna, &op->base, nbox);
 		}
@@ -3363,7 +3358,6 @@ fallback_blt:
 		float *v;
 		int n_this_time = gen7_get_rectangles(sna, &tmp, n);
 		if (unlikely(n_this_time == 0)) {
-			_kgem_submit(&sna->kgem);
 			gen7_emit_copy_state(sna, &tmp);
 			n_this_time = gen7_get_rectangles(sna, &tmp, n);
 		}
@@ -3417,7 +3411,6 @@ gen7_render_copy_blt(struct sna *sna,
 		     int16_t dx, int16_t dy)
 {
 	if (unlikely(!gen7_get_rectangles(sna, &op->base, 1))) {
-		_kgem_submit(&sna->kgem);
 		gen7_emit_copy_state(sna, &op->base);
 		gen7_get_rectangles(sna, &op->base, 1);
 	}
@@ -3689,7 +3682,6 @@ gen7_render_fill_boxes(struct sna *sna,
 	do {
 		int n_this_time = gen7_get_rectangles(sna, &tmp, n);
 		if (unlikely(n_this_time == 0)) {
-			_kgem_submit(&sna->kgem);
 			gen7_emit_fill_state(sna, &tmp);
 			n_this_time = gen7_get_rectangles(sna, &tmp, n);
 		}
@@ -3726,7 +3718,6 @@ gen7_render_fill_op_blt(struct sna *sna,
 	DBG(("%s: (%d, %d)x(%d, %d)\n", __FUNCTION__, x, y, w, h));
 
 	if (unlikely(!gen7_get_rectangles(sna, &op->base, 1))) {
-		_kgem_submit(&sna->kgem);
 		gen7_emit_fill_state(sna, &op->base);
 		gen7_get_rectangles(sna, &op->base, 1);
 	}
@@ -3753,7 +3744,6 @@ gen7_render_fill_op_box(struct sna *sna,
 	     box->x1, box->y1, box->x2, box->y2));
 
 	if (unlikely(!gen7_get_rectangles(sna, &op->base, 1))) {
-		_kgem_submit(&sna->kgem);
 		gen7_emit_fill_state(sna, &op->base);
 		gen7_get_rectangles(sna, &op->base, 1);
 	}
@@ -3783,7 +3773,6 @@ gen7_render_fill_op_boxes(struct sna *sna,
 	do {
 		int nbox_this_time = gen7_get_rectangles(sna, &op->base, nbox);
 		if (unlikely(nbox_this_time == 0)) {
-			_kgem_submit(&sna->kgem);
 			gen7_emit_fill_state(sna, &op->base);
 			nbox_this_time = gen7_get_rectangles(sna, &op->base, nbox);
 		}
@@ -3979,7 +3968,6 @@ gen7_render_fill_one(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo,
 	gen7_align_vertex(sna, &tmp);
 
 	if (unlikely(!gen7_get_rectangles(sna, &tmp, 1))) {
-		_kgem_submit(&sna->kgem);
 		gen7_emit_fill_state(sna, &tmp);
 		gen7_get_rectangles(sna, &tmp, 1);
 	}
@@ -4078,7 +4066,6 @@ gen7_render_clear(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo)
 	gen7_align_vertex(sna, &tmp);
 
 	if (unlikely(!gen7_get_rectangles(sna, &tmp, 1))) {
-		_kgem_submit(&sna->kgem);
 		gen7_emit_fill_state(sna, &tmp);
 		gen7_get_rectangles(sna, &tmp, 1);
 	}
commit 2a6d2d38b263574bc17067ee6da320d3a081e74d
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Feb 23 00:33:16 2012 +0000

    sna: Use a CPU mapping if the bo is already in the CPU domain
    
    The heuristic of using the mapping only before the first use in an
    execbuffer was suboptimal, and it was broken outright by the change
    in bo initialisation.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
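
For illustration, a standalone sketch of the fixed predicate. The enums
and trimmed structs below are stand-ins for the real kgem types; only the
test itself mirrors the one-line change: prefer a CPU mmap for a linear
bo when the chip has LLC or the bo already sits in the CPU domain,
instead of the broken comparison against presumed_offset.

/*
 * Stand-in types for illustration; only prefer_cpu_map() reflects the
 * logic of the patch.
 */
#include <stdbool.h>
#include <stdio.h>

enum { DOMAIN_NONE, DOMAIN_CPU, DOMAIN_GTT };
enum { TILING_NONE, TILING_X, TILING_Y };

struct bo { int tiling; int domain; };
struct kgem { bool has_llc; };

static bool prefer_cpu_map(const struct kgem *kgem, const struct bo *bo)
{
	/* The old test compared domain against presumed_offset; the fix
	 * compares against the CPU domain proper. */
	return bo->tiling == TILING_NONE &&
	       (kgem->has_llc || bo->domain == DOMAIN_CPU);
}

int main(void)
{
	struct kgem kgem = { .has_llc = false };
	struct bo bo = { .tiling = TILING_NONE, .domain = DOMAIN_CPU };

	printf("CPU map? %s\n", prefer_cpu_map(&kgem, &bo) ? "yes" : "no");
	return 0;
}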

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 5ded904..007dc04 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -2893,7 +2893,7 @@ void *kgem_bo_map(struct kgem *kgem, struct kgem_bo *bo)
 	assert(list_is_empty(&bo->list));
 
 	if (bo->tiling == I915_TILING_NONE &&
-	    (kgem->has_llc || bo->domain == bo->presumed_offset)) {
+	    (kgem->has_llc || bo->domain == DOMAIN_CPU)) {
 		DBG(("%s: converting request for GTT map into CPU map\n",
 		     __FUNCTION__));
 		ptr = kgem_bo_map__cpu(kgem, bo);
commit d3913bf53ed9e8c95720ea3bd7c6a71a8a8de0fe
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Feb 22 18:33:09 2012 +0000

    sna/gen4: Fix vertex flushing across batch flushing
    
    Due to the w/a for its buggy shaders, gen4 is sufficiently different
    that backporting the simple patch from gen5 was prone to failure. We
    need to check that the vertices have not already been flushed prior to
    flushing again.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
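
For illustration, a compact model of the convention this patch adopts,
with trimmed stand-in types: vertex_offset doubles as an "open primitive"
flag, gen4_vertex_flush() may now assert it is non-zero, and every caller
guards the flush because a batch submission may already have cleared it.

/*
 * Stand-in types for illustration; only the vertex_offset protocol
 * mirrors the driver.
 */
#include <assert.h>
#include <stdio.h>

struct render_state { unsigned vertex_offset; };

static void vertex_flush(struct render_state *rs)
{
	assert(rs->vertex_offset);	/* the fix turns an early return into an assert */
	/* ...patch the primitive's vertex count into the batch... */
	rs->vertex_offset = 0;
}

static void composite_done(struct render_state *rs)
{
	/* Callers guard the flush: the batch may already have been
	 * flushed, clearing vertex_offset, in which case there is
	 * nothing left to do. */
	if (rs->vertex_offset)
		vertex_flush(rs);
}

int main(void)
{
	struct render_state open = { 42 }, flushed = { 0 };

	composite_done(&open);		/* closes the open primitive */
	composite_done(&flushed);	/* no-op: already flushed */
	printf("%u %u\n", open.vertex_offset, flushed.vertex_offset);
	return 0;
}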

diff --git a/src/sna/gen4_render.c b/src/sna/gen4_render.c
index f6a47a0..4bc2938 100644
--- a/src/sna/gen4_render.c
+++ b/src/sna/gen4_render.c
@@ -351,8 +351,7 @@ static void gen4_magic_ca_pass(struct sna *sna,
 
 static void gen4_vertex_flush(struct sna *sna)
 {
-	if (sna->render_state.gen4.vertex_offset == 0)
-		return;
+	assert(sna->render_state.gen4.vertex_offset);
 
 	DBG(("%s[%x] = %d\n", __FUNCTION__,
 	     4*sna->render_state.gen4.vertex_offset,
@@ -1182,8 +1181,10 @@ inline static int gen4_get_rectangles(struct sna *sna,
 	return want;
 
 flush:
-	gen4_vertex_flush(sna);
-	gen4_magic_ca_pass(sna, op);
+	if (sna->render_state.gen4.vertex_offset) {
+		gen4_vertex_flush(sna);
+		gen4_magic_ca_pass(sna, op);
+	}
 	return 0;
 }
 
@@ -1202,14 +1203,6 @@ static uint32_t *gen4_composite_get_binding_table(struct sna *sna,
 }
 
 static void
-gen4_emit_sip(struct sna *sna)
-{
-	/* Set system instruction pointer */
-	OUT_BATCH(GEN4_STATE_SIP | 0);
-	OUT_BATCH(0);
-}
-
-static void
 gen4_emit_urb(struct sna *sna)
 {
 	int urb_vs_start, urb_vs_size;
@@ -1282,7 +1275,6 @@ gen4_emit_invariant(struct sna *sna)
 	else
 		OUT_BATCH(GEN4_PIPELINE_SELECT | PIPELINE_SELECT_3D);
 
-	gen4_emit_sip(sna);
 	gen4_emit_state_base_address(sna);
 
 	sna->render_state.gen4.needs_invariant = FALSE;
@@ -1803,7 +1795,8 @@ gen4_render_video(struct sna *sna,
 	}
 	priv->clear = false;
 
-	gen4_vertex_flush(sna);
+	if (sna->render_state.gen4.vertex_offset)
+		gen4_vertex_flush(sna);
 	return TRUE;
 }
 
@@ -1920,8 +1913,10 @@ gen4_render_composite_done(struct sna *sna,
 {
 	DBG(("%s()\n", __FUNCTION__));
 
-	gen4_vertex_flush(sna);
-	gen4_magic_ca_pass(sna, op);
+	if (sna->render_state.gen4.vertex_offset) {
+		gen4_vertex_flush(sna);
+		gen4_magic_ca_pass(sna, op);
+	}
 
 	if (op->mask.bo)
 		kgem_bo_destroy(&sna->kgem, op->mask.bo);
@@ -2240,7 +2235,7 @@ gen4_render_composite(struct sna *sna,
 	if (too_large(tmp->dst.width, tmp->dst.height) &&
 	    !sna_render_composite_redirect(sna, tmp,
 					   dst_x, dst_y, width, height))
-			return FALSE;
+		return FALSE;
 
 	switch (gen4_composite_picture(sna, src, &tmp->src,
 				       src_x, src_y,
@@ -2600,7 +2595,8 @@ gen4_render_copy_blt(struct sna *sna,
 static void
 gen4_render_copy_done(struct sna *sna, const struct sna_copy_op *op)
 {
-	gen4_vertex_flush(sna);
+	if (sna->render_state.gen4.vertex_offset)
+		gen4_vertex_flush(sna);
 }
 
 static Bool
@@ -2765,7 +2761,9 @@ gen4_render_fill_boxes(struct sna *sna,
 		return FALSE;
 	}
 
-	if (prefer_blt(sna) || too_large(dst->drawable.width, dst->drawable.height)) {
+	if (prefer_blt(sna) ||
+	    too_large(dst->drawable.width, dst->drawable.height) ||
+	    !gen4_check_dst_format(format)) {
 		uint8_t alu = -1;
 
 		if (op == PictOpClear || (op == PictOpOutReverse && color->alpha >= 0xff00))
@@ -2806,11 +2804,11 @@ gen4_render_fill_boxes(struct sna *sna,
 	if (op == PictOpClear)
 		pixel = 0;
 	else if (!sna_get_pixel_from_rgba(&pixel,
-				     color->red,
-				     color->green,
-				     color->blue,
-				     color->alpha,
-				     PICT_a8r8g8b8))
+					  color->red,
+					  color->green,
+					  color->blue,
+					  color->alpha,
+					  PICT_a8r8g8b8))
 		return FALSE;
 
 	DBG(("%s(%08x x %d)\n", __FUNCTION__, pixel, n));
@@ -2888,7 +2886,8 @@ gen4_render_fill_op_boxes(struct sna *sna,
 static void
 gen4_render_fill_op_done(struct sna *sna, const struct sna_fill_op *op)
 {
-	gen4_vertex_flush(sna);
+	if (sna->render_state.gen4.vertex_offset)
+		gen4_vertex_flush(sna);
 	kgem_bo_destroy(&sna->kgem, op->base.src.bo);
 }
 
@@ -2949,7 +2948,7 @@ gen4_render_fill(struct sna *sna, uint8_t alu,
 	op->base.u.gen4.wm_kernel = WM_KERNEL;
 	op->base.u.gen4.ve_id = 1;
 
-	if (!kgem_check_bo(&sna->kgem, dst_bo, NULL))  {
+	if (!kgem_check_bo(&sna->kgem, dst_bo, NULL)) {
 		kgem_submit(&sna->kgem);
 		assert(kgem_check_bo(&sna->kgem, dst_bo, NULL));
 	}
@@ -3048,7 +3047,8 @@ gen4_render_fill_one(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo,
 
 	gen4_render_fill_rectangle(sna, &tmp, x1, y1, x2 - x1, y2 - y1);
 
-	gen4_vertex_flush(sna);
+	if (sna->render_state.gen4.vertex_offset)
+		gen4_vertex_flush(sna);
 	kgem_bo_destroy(&sna->kgem, tmp.src.bo);
 
 	return TRUE;
@@ -3113,7 +3113,6 @@ static uint32_t gen4_create_sf_state(struct sna_static_stream *stream,
 	sf_state->thread4.max_threads = SF_MAX_THREADS - 1;
 	sf_state->thread4.urb_entry_allocation_size = URB_SF_ENTRY_SIZE - 1;
 	sf_state->thread4.nr_urb_entries = URB_SF_ENTRIES;
-	sf_state->thread4.stats_enable = 1;
 	sf_state->sf5.viewport_transform = FALSE;	/* skip viewport */
 	sf_state->sf6.cull_mode = GEN4_CULLMODE_NONE;
 	sf_state->sf6.scissor = 0;
diff --git a/src/sna/gen5_render.c b/src/sna/gen5_render.c
index 64be24b..ac55b09 100644
--- a/src/sna/gen5_render.c
+++ b/src/sna/gen5_render.c
@@ -1549,7 +1549,6 @@ static void gen5_bind_surfaces(struct sna *sna,
 	gen5_emit_state(sna, op, offset);
 }
 
-
 fastcall static void
 gen5_render_composite_blt(struct sna *sna,
 			  const struct sna_composite_op *op,
@@ -2273,11 +2272,10 @@ gen5_render_composite(struct sna *sna,
 
 	sna_render_reduce_damage(tmp, dst_x, dst_y, width, height);
 
-	if (too_large(tmp->dst.width, tmp->dst.height)) {
-		if (!sna_render_composite_redirect(sna, tmp,
-						   dst_x, dst_y, width, height))
-			return FALSE;
-	}
+	if (too_large(tmp->dst.width, tmp->dst.height) &&
+	    !sna_render_composite_redirect(sna, tmp,
+					   dst_x, dst_y, width, height))
+		return FALSE;
 
 	DBG(("%s: preparing source\n", __FUNCTION__));
 	switch (gen5_composite_picture(sna, src, &tmp->src,
commit c53a3e5d8796e61a17b2892e151e5bb6789cde65
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Feb 22 17:00:38 2012 +0000

    sna/blt: Avoid clobbering the composite state if we fail to setup the BLT
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
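
The pattern in miniature, using invented types rather than the sna
structures: validate into locals first and publish into the shared op
only after nothing can fail, so a FALSE return leaves the composite
state untouched and the render paths can still run.

/*
 * Illustrative stand-in types; only the "locals first, publish on
 * success" discipline mirrors the patch.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct op_state { void *src_pixmap; int sx, sy; };

static bool validate(void *pixmap, int x, int y)
{
	return pixmap != NULL && x >= 0 && y >= 0;	/* placeholder checks */
}

static bool setup_blt(struct op_state *tmp, void *pixmap, int x, int y)
{
	if (!validate(pixmap, x, y))
		return false;	/* *tmp untouched: fallback paths stay usable */

	tmp->src_pixmap = pixmap;
	tmp->sx = x;
	tmp->sy = y;
	return true;
}

int main(void)
{
	struct op_state op = { NULL, -1, -1 };
	int dummy;

	if (!setup_blt(&op, NULL, 5, 5))	/* fails: op left intact */
		printf("fallback, sx still %d\n", op.sx);
	if (setup_blt(&op, &dummy, 5, 5))
		printf("blt ready, sx now %d\n", op.sx);
	return 0;
}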

diff --git a/src/sna/gen3_render.c b/src/sna/gen3_render.c
index 03b80c5..5253a8c 100644
--- a/src/sna/gen3_render.c
+++ b/src/sna/gen3_render.c
@@ -2282,7 +2282,7 @@ source_use_blt(struct sna *sna, PicturePtr picture)
 	if (sna->kgem.has_vmap)
 		return false;
 
-	return is_cpu(picture->pDrawable);
+	return is_cpu(picture->pDrawable) || is_dirty(picture->pDrawable);
 }
 
 static Bool
diff --git a/src/sna/gen5_render.c b/src/sna/gen5_render.c
index ff9e462..64be24b 100644
--- a/src/sna/gen5_render.c
+++ b/src/sna/gen5_render.c
@@ -2012,7 +2012,7 @@ picture_is_cpu(PicturePtr picture)
 	if (too_large(picture->pDrawable->width, picture->pDrawable->height))
 		return TRUE;
 
-	return is_cpu(picture->pDrawable);
+	return is_cpu(picture->pDrawable) || is_dirty(picture->pDrawable);
 }
 
 static Bool
@@ -2377,19 +2377,6 @@ gen5_render_composite(struct sna *sna,
 			goto cleanup_mask;
 	}
 
-	if (kgem_bo_is_dirty(tmp->src.bo) || kgem_bo_is_dirty(tmp->mask.bo)) {
-		if (mask == NULL &&
-		    tmp->redirect.real_bo == NULL &&
-		    sna_blt_composite(sna, op,
-				      src, dst,
-				      src_x, src_y,
-				      dst_x, dst_y,
-				      width, height, tmp)) {
-			kgem_bo_destroy(&sna->kgem, tmp->src.bo);
-			return TRUE;
-		}
-	}
-
 	gen5_bind_surfaces(sna, tmp);
 	gen5_align_vertex(sna, tmp);
 	return TRUE;
diff --git a/src/sna/sna_blt.c b/src/sna/sna_blt.c
index a7ea95c..a672c46 100644
--- a/src/sna/sna_blt.c
+++ b/src/sna/sna_blt.c
@@ -1552,8 +1552,8 @@ sna_blt_composite(struct sna *sna,
 		  int16_t width, int16_t height,
 		  struct sna_composite_op *tmp)
 {
-	struct sna_blt_state *blt = &tmp->u.blt;
 	PictFormat src_format = src->format;
+	PixmapPtr src_pixmap;
 	struct sna_pixmap *priv;
 	int16_t tx, ty;
 	uint32_t alpha_fixup;
@@ -1688,30 +1688,31 @@ sna_blt_composite(struct sna *sna,
 		return FALSE;
 	}
 
-	blt->src_pixmap = get_drawable_pixmap(src->pDrawable);
-	get_drawable_deltas(src->pDrawable, blt->src_pixmap, &tx, &ty);
+	src_pixmap = get_drawable_pixmap(src->pDrawable);
+	get_drawable_deltas(src->pDrawable, src_pixmap, &tx, &ty);
 	x += tx + src->pDrawable->x;
 	y += ty + src->pDrawable->y;
 	if (x < 0 || y < 0 ||
-	    x + width > blt->src_pixmap->drawable.width ||
-	    y + height > blt->src_pixmap->drawable.height) {
+	    x + width  > src_pixmap->drawable.width ||
+	    y + height > src_pixmap->drawable.height) {
 		DBG(("%s: source extends outside (%d, %d), (%d, %d) of valid pixmap %dx%d\n",
 		     __FUNCTION__,
-		     x, y, x+width, y+width, blt->src_pixmap->drawable.width, blt->src_pixmap->drawable.height));
+		     x, y, x+width, y+width, src_pixmap->drawable.width, src_pixmap->drawable.height));
 		return FALSE;
 	}
 
+	tmp->u.blt.src_pixmap = src_pixmap;
 	tmp->u.blt.sx = x - dst_x;
 	tmp->u.blt.sy = y - dst_y;
 	DBG(("%s: blt dst offset (%d, %d), source offset (%d, %d), with alpha fixup? %x\n",
 	     __FUNCTION__,
 	     tmp->dst.x, tmp->dst.y, tmp->u.blt.sx, tmp->u.blt.sy, alpha_fixup));
 
-	if (has_gpu_area(blt->src_pixmap, x, y, width, height))
+	if (has_gpu_area(src_pixmap, x, y, width, height))
 		ret = prepare_blt_copy(sna, tmp, alpha_fixup);
-	else if (has_cpu_area(blt->src_pixmap, x, y, width, height))
+	else if (has_cpu_area(src_pixmap, x, y, width, height))
 		ret = prepare_blt_put(sna, tmp, alpha_fixup);
-	else if (sna_pixmap_move_to_gpu(blt->src_pixmap, MOVE_READ))
+	else if (sna_pixmap_move_to_gpu(src_pixmap, MOVE_READ))
 		ret = prepare_blt_copy(sna, tmp, alpha_fixup);
 	else
 		ret = prepare_blt_put(sna, tmp, alpha_fixup);
diff --git a/src/sna/sna_render_inline.h b/src/sna/sna_render_inline.h
index 2805a01..6c8f66a 100644
--- a/src/sna/sna_render_inline.h
+++ b/src/sna/sna_render_inline.h
@@ -89,6 +89,13 @@ is_cpu(DrawablePtr drawable)
 }
 
 static inline Bool
+is_dirty(DrawablePtr drawable)
+{
+	struct sna_pixmap *priv = sna_pixmap_from_drawable(drawable);
+	return priv == NULL || kgem_bo_is_dirty(priv->gpu_bo);
+}
+
+static inline Bool
 too_small(DrawablePtr drawable)
 {
 	struct sna_pixmap *priv = sna_pixmap_from_drawable(drawable);
commit 92c20fa586f79e8cfe761ba97a51261d96821ac5
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Feb 22 14:36:35 2012 +0000

    sna/gen3+: Flush the vertices during vertex-finish
    
    Do this only when actually finishing the vbo, which is the tricky
    part of the recent CA glyph bugs.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
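
A toy of the ordering rule being enforced, with a stand-in struct: any
open primitive must be closed against the old vbo before a new one is
swapped in, because the vertex count written back by the flush refers to
vertices in the old buffer.

/*
 * Stand-in struct for illustration; only the flush-before-swap ordering
 * mirrors the patch.
 */
#include <stdio.h>

struct render { int *vbo; unsigned vertex_offset; unsigned index, start; };

static void vertex_flush(struct render *r)
{
	/* In the driver this writes (index - start) back into the batch
	 * at vertex_offset; here we just report and mark it closed. */
	printf("closed primitive with %u vertices\n", r->index - r->start);
	r->vertex_offset = 0;
}

static void vertex_finish(struct render *r, int *new_vbo)
{
	if (r->vertex_offset)	/* the fix: close it against the old vbo */
		vertex_flush(r);
	r->vbo = new_vbo;	/* only now is it safe to swap buffers */
}

int main(void)
{
	int old_vbo[64], new_vbo[64];
	struct render r = { old_vbo, 10, 9, 3 };

	vertex_finish(&r, new_vbo);
	return 0;
}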

diff --git a/src/sna/gen3_render.c b/src/sna/gen3_render.c
index 3c3de12..03b80c5 100644
--- a/src/sna/gen3_render.c
+++ b/src/sna/gen3_render.c
@@ -1574,6 +1574,9 @@ static int gen3_vertex_finish(struct sna *sna)
 
 	bo = sna->render.vbo;
 	if (bo) {
+		if (sna->render_state.gen3.vertex_offset)
+			gen3_vertex_flush(sna);
+
 		DBG(("%s: reloc = %d\n", __FUNCTION__,
 		     sna->render.vertex_reloc[0]));
 
@@ -1736,20 +1739,13 @@ inline static int gen3_get_rectangles(struct sna *sna,
 		DBG(("flushing vbo for %s: %d < %d\n",
 		     __FUNCTION__, rem, op->floats_per_rect));
 		rem = gen3_get_rectangles__flush(sna, op);
-		if (rem == 0) {
-			if (sna->render_state.gen3.vertex_offset) {
-				gen3_vertex_flush(sna);
-				gen3_magic_ca_pass(sna, op);
-			}
-			return 0;
-		}
+		if (rem == 0)
+			goto flush;
 	}
 
 	if (sna->render_state.gen3.vertex_offset == 0 &&
-	    !gen3_rectangle_begin(sna, op)) {
-		DBG(("%s: flushing batch\n", __FUNCTION__));
-		return 0;
-	}
+	    !gen3_rectangle_begin(sna, op))
+		goto flush;
 
 	if (want > 1 && want * op->floats_per_rect > rem)
 		want = rem / op->floats_per_rect;
@@ -1758,6 +1754,14 @@ inline static int gen3_get_rectangles(struct sna *sna,
 	assert(want);
 	assert(sna->render.vertex_index * op->floats_per_vertex <= sna->render.vertex_size);
 	return want;
+
+flush:
+	DBG(("%s: flushing batch\n", __FUNCTION__));
+	if (sna->render_state.gen3.vertex_offset) {
+		gen3_vertex_flush(sna);
+		gen3_magic_ca_pass(sna, op);
+	}
+	return 0;
 }
 
 fastcall static void
diff --git a/src/sna/gen4_render.c b/src/sna/gen4_render.c
index c9f10a8..f6a47a0 100644
--- a/src/sna/gen4_render.c
+++ b/src/sna/gen4_render.c
@@ -64,7 +64,8 @@
 
 #if FLUSH_EVERY_VERTEX
 #define FLUSH(OP) do { \
-	gen4_vertex_flush(sna, OP); \
+	gen4_vertex_flush(sna); \
+	gen4_magic_ca_pass(sna, OP); \
 	OUT_BATCH(MI_FLUSH | MI_INHIBIT_RENDER_CACHE_FLUSH); \
 } while (0)
 #else
@@ -348,8 +349,7 @@ static void gen4_magic_ca_pass(struct sna *sna,
 	state->last_primitive = sna->kgem.nbatch;
 }
 
-static void gen4_vertex_flush(struct sna *sna,
-			      const struct sna_composite_op *op)
+static void gen4_vertex_flush(struct sna *sna)
 {
 	if (sna->render_state.gen4.vertex_offset == 0)
 		return;
@@ -360,8 +360,6 @@ static void gen4_vertex_flush(struct sna *sna,
 	sna->kgem.batch[sna->render_state.gen4.vertex_offset] =
 		sna->render.vertex_index - sna->render.vertex_start;
 	sna->render_state.gen4.vertex_offset = 0;
-
-	gen4_magic_ca_pass(sna, op);
 }
 
 static int gen4_vertex_finish(struct sna *sna)
@@ -375,6 +373,9 @@ static int gen4_vertex_finish(struct sna *sna)
 
 	bo = sna->render.vbo;
 	if (bo) {
+		if (sna->render_state.gen4.vertex_offset)
+			gen4_vertex_flush(sna);
+
 		for (i = 0; i < ARRAY_SIZE(sna->render.vertex_reloc); i++) {
 			if (sna->render.vertex_reloc[i]) {
 				DBG(("%s: reloc[%d] = %d\n", __FUNCTION__,
@@ -1167,20 +1168,23 @@ inline static int gen4_get_rectangles(struct sna *sna,
 		DBG(("flushing vbo for %s: %d < %d\n",
 		     __FUNCTION__, rem, 3*op->floats_per_vertex));
 		rem = gen4_get_rectangles__flush(sna, op);
-		if (rem == 0) {
-			gen4_vertex_flush(sna, op);
-			return 0;
-		}
+		if (rem == 0)
+			goto flush;
 	}
 
 	if (!gen4_rectangle_begin(sna, op))
-		return 0;
+		goto flush;
 
 	if (want > 1 && want * op->floats_per_vertex*3 > rem)
 		want = rem / (3*op->floats_per_vertex);
 
 	sna->render.vertex_index += 3*want;
 	return want;
+
+flush:
+	gen4_vertex_flush(sna);
+	gen4_magic_ca_pass(sna, op);
+	return 0;
 }
 
 static uint32_t *gen4_composite_get_binding_table(struct sna *sna,
@@ -1799,7 +1803,7 @@ gen4_render_video(struct sna *sna,
 	}
 	priv->clear = false;
 
-	gen4_vertex_flush(sna, &tmp);
+	gen4_vertex_flush(sna);
 	return TRUE;
 }
 
@@ -1916,7 +1920,8 @@ gen4_render_composite_done(struct sna *sna,
 {
 	DBG(("%s()\n", __FUNCTION__));
 
-	gen4_vertex_flush(sna, op);
+	gen4_vertex_flush(sna);
+	gen4_magic_ca_pass(sna, op);
 
 	if (op->mask.bo)
 		kgem_bo_destroy(&sna->kgem, op->mask.bo);
@@ -2595,7 +2600,7 @@ gen4_render_copy_blt(struct sna *sna,
 static void
 gen4_render_copy_done(struct sna *sna, const struct sna_copy_op *op)
 {
-	gen4_vertex_flush(sna, &op->base);
+	gen4_vertex_flush(sna);
 }
 
 static Bool
@@ -2883,7 +2888,7 @@ gen4_render_fill_op_boxes(struct sna *sna,
 static void
 gen4_render_fill_op_done(struct sna *sna, const struct sna_fill_op *op)
 {
-	gen4_vertex_flush(sna, &op->base);
+	gen4_vertex_flush(sna);
 	kgem_bo_destroy(&sna->kgem, op->base.src.bo);
 }
 
@@ -3043,7 +3048,7 @@ gen4_render_fill_one(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo,
 
 	gen4_render_fill_rectangle(sna, &tmp, x1, y1, x2 - x1, y2 - y1);
 
-	gen4_vertex_flush(sna, &tmp);
+	gen4_vertex_flush(sna);
 	kgem_bo_destroy(&sna->kgem, tmp.src.bo);
 
 	return TRUE;
diff --git a/src/sna/gen5_render.c b/src/sna/gen5_render.c
index 251eb39..ff9e462 100644
--- a/src/sna/gen5_render.c
+++ b/src/sna/gen5_render.c
@@ -66,8 +66,8 @@
 #define URB_CS_ENTRY_SIZE     1
 #define URB_CS_ENTRIES	      0
 
-#define URB_VS_ENTRY_SIZE     1	// each 512-bit row
-#define URB_VS_ENTRIES	      8	// we needs at least 8 entries
+#define URB_VS_ENTRY_SIZE     1
+#define URB_VS_ENTRIES	      128 /* minimum of 8 */
 
 #define URB_GS_ENTRY_SIZE     0
 #define URB_GS_ENTRIES	      0
@@ -76,7 +76,7 @@
 #define URB_CLIP_ENTRIES      0
 
 #define URB_SF_ENTRY_SIZE     2
-#define URB_SF_ENTRIES	      1
+#define URB_SF_ENTRIES	      32
 
 /*
  * this program computes dA/dx and dA/dy for the texture coordinates along
@@ -358,6 +358,9 @@ static int gen5_vertex_finish(struct sna *sna)
 
 	bo = sna->render.vbo;
 	if (bo) {
+		if (sna->render_state.gen5.vertex_offset)
+			gen5_vertex_flush(sna);
+
 		for (i = 0; i < ARRAY_SIZE(sna->render.vertex_reloc); i++) {
 			if (sna->render.vertex_reloc[i]) {
 				DBG(("%s: reloc[%d] = %d\n", __FUNCTION__,
@@ -410,6 +413,8 @@ static void gen5_vertex_close(struct sna *sna)
 	struct kgem_bo *bo;
 	unsigned int i, delta = 0;
 
+	assert(sna->render_state.gen5.vertex_offset == 0);
+
 	if (!sna->render.vertex_used) {
 		assert(sna->render.vbo == NULL);
 		assert(sna->render.vertices == sna->render.vertex_data);
@@ -421,7 +426,6 @@ static void gen5_vertex_close(struct sna *sna)
 
 	bo = sna->render.vbo;
 	if (bo == NULL) {
-
 		if (sna->kgem.nbatch + sna->render.vertex_used <= sna->kgem.surface) {
 			DBG(("%s: copy to batch: %d @ %d\n", __FUNCTION__,
 			     sna->render.vertex_used, sna->kgem.nbatch));
@@ -1082,6 +1086,8 @@ static void gen5_emit_vertex_buffer(struct sna *sna,
 {
 	int id = op->u.gen5.ve_id;
 
+	assert((unsigned)id <= 3);
+
 	OUT_BATCH(GEN5_3DSTATE_VERTEX_BUFFERS | 3);
 	OUT_BATCH((id << VB0_BUFFER_INDEX_SHIFT) | VB0_VERTEXDATA |
 		  (4*op->floats_per_vertex << VB0_BUFFER_PITCH_SHIFT));
@@ -1122,6 +1128,8 @@ static bool gen5_rectangle_begin(struct sna *sna,
 	int id = op->u.gen5.ve_id;
 	int ndwords;
 
+	assert((unsigned)id <= 3);
+
 	ndwords = 0;
 	if ((sna->render_state.gen5.vb_id & (1 << id)) == 0)
 		ndwords += 5;
@@ -1167,23 +1175,25 @@ inline static int gen5_get_rectangles(struct sna *sna,
 		DBG(("flushing vbo for %s: %d < %d\n",
 		     __FUNCTION__, rem, op->floats_per_rect));
 		rem = gen5_get_rectangles__flush(sna, op);
-		if (rem == 0) {
-			if (sna->render_state.gen5.vertex_offset) {
-				gen5_vertex_flush(sna);
-				gen5_magic_ca_pass(sna, op);
-			}
-			return 0;
-		}
+		if (rem == 0)
+			goto flush;
 	}
 
 	if (!gen5_rectangle_begin(sna, op))
-		return 0;
+		goto flush;
 
 	if (want * op->floats_per_rect > rem)
 		want = rem / op->floats_per_rect;
 
 	sna->render.vertex_index += 3*want;
 	return want;
+
+flush:
+	if (sna->render_state.gen5.vertex_offset) {
+		gen5_vertex_flush(sna);
+		gen5_magic_ca_pass(sna, op);
+	}
+	return 0;
 }
 
 static uint32_t *
@@ -1414,8 +1424,9 @@ gen5_emit_vertex_elements(struct sna *sna,
 	int selem = is_affine ? 2 : 3;
 	uint32_t w_component;
 	uint32_t src_format;
-	int id = op->u.gen5.ve_id;;
+	int id = op->u.gen5.ve_id;
 
+	assert((unsigned)id <= 3);
 	if (!DBG_NO_STATE_CACHE && render->ve_id == id)
 		return;
 
@@ -3554,7 +3565,6 @@ static uint32_t gen5_create_sf_state(struct sna_static_stream *stream,
 	sf_state->thread4.max_threads = SF_MAX_THREADS - 1;
 	sf_state->thread4.urb_entry_allocation_size = URB_SF_ENTRY_SIZE - 1;
 	sf_state->thread4.nr_urb_entries = URB_SF_ENTRIES;
-	sf_state->thread4.stats_enable = 1;
 	sf_state->sf5.viewport_transform = FALSE;	/* skip viewport */
 	sf_state->sf6.cull_mode = GEN5_CULLMODE_NONE;
 	sf_state->sf6.scissor = 0;
diff --git a/src/sna/gen6_render.c b/src/sna/gen6_render.c
index e2c78e1..035da78 100644
--- a/src/sna/gen6_render.c
+++ b/src/sna/gen6_render.c
@@ -932,6 +932,9 @@ static int gen6_vertex_finish(struct sna *sna)
 
 	bo = sna->render.vbo;
 	if (bo) {
+		if (sna->render_state.gen6.vertex_offset)
+			gen6_vertex_flush(sna);
+
 		for (i = 0; i < ARRAY_SIZE(sna->render.vertex_reloc); i++) {
 			if (sna->render.vertex_reloc[i]) {
 				DBG(("%s: reloc[%d] = %d\n", __FUNCTION__,
@@ -1635,18 +1638,13 @@ inline static int gen6_get_rectangles(struct sna *sna,
 		DBG(("flushing vbo for %s: %d < %d\n",
 		     __FUNCTION__, rem, op->floats_per_rect));
 		rem = gen6_get_rectangles__flush(sna, op);
-		if (rem == 0) {
-			if (sna->render_state.gen6.vertex_offset) {
-				gen6_vertex_flush(sna);
-				gen6_magic_ca_pass(sna, op);
-			}
-			return 0;
-		}
+		if (rem == 0)
+			goto flush;
 	}
 
 	if (sna->render_state.gen6.vertex_offset == 0 &&
 	    !gen6_rectangle_begin(sna, op))
-		return 0;
+		goto flush;
 
 	if (want > 1 && want * op->floats_per_rect > rem)
 		want = rem / op->floats_per_rect;
@@ -1654,6 +1652,13 @@ inline static int gen6_get_rectangles(struct sna *sna,
 	assert(want > 0);
 	sna->render.vertex_index += 3*want;
 	return want;
+
+flush:
+	if (sna->render_state.gen6.vertex_offset) {
+		gen6_vertex_flush(sna);
+		gen6_magic_ca_pass(sna, op);
+	}
+	return 0;
 }
 
 inline static uint32_t *gen6_composite_get_binding_table(struct sna *sna,
diff --git a/src/sna/gen7_render.c b/src/sna/gen7_render.c
index faeedf0..c872c63 100644
--- a/src/sna/gen7_render.c
+++ b/src/sna/gen7_render.c
@@ -1033,6 +1033,9 @@ static int gen7_vertex_finish(struct sna *sna)
 
 	bo = sna->render.vbo;
 	if (bo) {
+		if (sna->render_state.gen7.vertex_offset)
+			gen7_vertex_flush(sna);
+
 		for (i = 0; i < ARRAY_SIZE(sna->render.vertex_reloc); i++) {
 			if (sna->render.vertex_reloc[i]) {
 				DBG(("%s: reloc[%d] = %d\n", __FUNCTION__,
@@ -1669,24 +1672,26 @@ inline static int gen7_get_rectangles(struct sna *sna,
 		DBG(("flushing vbo for %s: %d < %d\n",
 		     __FUNCTION__, rem, op->floats_per_rect));
 		rem = gen7_get_rectangles__flush(sna, op);
-		if (rem == 0) {
-			if (sna->render_state.gen7.vertex_offset) {
-				gen7_vertex_flush(sna);
-				gen7_magic_ca_pass(sna, op);
-			}
-			return 0;
-		}
+		if (rem == 0)
+			goto flush;
 	}
 
 	if (sna->render_state.gen7.vertex_offset == 0 &&
 	    !gen7_rectangle_begin(sna, op))
-		return 0;
+		goto flush;
 
 	if (want > 1 && want * op->floats_per_rect > rem)
 		want = rem / op->floats_per_rect;
 
 	sna->render.vertex_index += 3*want;
 	return want;
+
+flush:
+	if (sna->render_state.gen7.vertex_offset) {
+		gen7_vertex_flush(sna);
+		gen7_magic_ca_pass(sna, op);
+	}
+	return 0;
 }
 
 inline static uint32_t *gen7_composite_get_binding_table(struct sna *sna,
diff --git a/src/sna/sna_trapezoids.c b/src/sna/sna_trapezoids.c
index f8df2a4..2d8b3b9 100644
--- a/src/sna/sna_trapezoids.c
+++ b/src/sna/sna_trapezoids.c
@@ -2216,18 +2216,23 @@ composite_unaligned_box(struct sna *sna,
 			float opacity,
 			pixman_region16_t *clip)
 {
-	pixman_region16_t region;
+	if (clip) {
+		pixman_region16_t region;
 
-	pixman_region_init_rects(&region, box, 1);
-	RegionIntersect(&region, &region, clip);
-	if (REGION_NUM_RECTS(&region)) {
-		tmp->boxes(sna, tmp,
-			  REGION_RECTS(&region),
-			  REGION_NUM_RECTS(&region),
-			  opacity);
-		apply_damage(&tmp->base, &region);
+		pixman_region_init_rects(&region, box, 1);
+		RegionIntersect(&region, &region, clip);
+		if (REGION_NUM_RECTS(&region)) {
+			tmp->boxes(sna, tmp,
+				   REGION_RECTS(&region),
+				   REGION_NUM_RECTS(&region),
+				   opacity);
+			apply_damage(&tmp->base, &region);
+		}
+		pixman_region_fini(&region);
+	} else {
+		tmp->box(sna, tmp, box, opacity);
+		apply_damage_box(&tmp->base, box);
 	}
-	pixman_region_fini(&region);
 }
 
 static void
@@ -2244,17 +2249,19 @@ composite_unaligned_trap_row(struct sna *sna,
 	if (covered == 0)
 		return;
 
-	if (y2 > clip->extents.y2)
-		y2 = clip->extents.y2;
-	if (y1 < clip->extents.y1)
-		y1 = clip->extents.y1;
-	if (y1 >= y2)
-		return;
-
 	x1 = dx + pixman_fixed_to_int(trap->left.p1.x);
 	x2 = dx + pixman_fixed_to_int(trap->right.p1.x);
-	if (x2 < clip->extents.x1 || x1 > clip->extents.x2)
-		return;
+	if (clip) {
+		if (y2 > clip->extents.y2)
+			y2 = clip->extents.y2;
+		if (y1 < clip->extents.y1)
+			y1 = clip->extents.y1;
+		if (y1 >= y2)
+			return;
+
+		if (x2 < clip->extents.x1 || x1 > clip->extents.x2)
+			return;
+	}
 
 	box.y1 = y1;
 	box.y2 = y2;
@@ -2528,7 +2535,7 @@ composite_unaligned_boxes(struct sna *sna,
 {
 	BoxRec extents;
 	struct sna_composite_spans_op tmp;
-	pixman_region16_t clip;
+	pixman_region16_t clip, *c;
 	int dst_x, dst_y;
 	int dx, dy, n;
 
@@ -2584,6 +2591,11 @@ composite_unaligned_boxes(struct sna *sna,
 		return true;
 	}
 
+	c = NULL;
+	if (extents.x2 - extents.x1 > clip.extents.x2 - clip.extents.x1 ||
+	    extents.y2 - extents.y1 > clip.extents.y2 - clip.extents.y1)
+		c = &clip;
+
 	extents = *RegionExtents(&clip);
 	dx = dst->pDrawable->x;
 	dy = dst->pDrawable->y;
@@ -2611,7 +2623,7 @@ composite_unaligned_boxes(struct sna *sna,
 	}
 
 	for (n = 0; n < ntrap; n++)
-		composite_unaligned_trap(sna, &tmp, &traps[n], dx, dy, &clip);
+		composite_unaligned_trap(sna, &tmp, &traps[n], dx, dy, c);
 	tmp.done(sna, &tmp);
 
 	REGION_UNINIT(NULL, &clip);
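
The trapezoid hunks above share one idea: decide once, from the operation and
clip extents, whether per-box clipping can ever be needed, and pass clip == NULL
when it cannot, so every box takes the unclipped fast path and skips the pixman
region intersection. A sketch of an equivalent containment test (clip_if_needed
is a hypothetical helper for illustration; the hunk above uses a width/height
comparison instead):

	/* Return NULL when 'op' lies wholly within a rectangular clip,
	 * so callees may emit boxes without intersecting regions. */
	static pixman_region16_t *
	clip_if_needed(const BoxRec *op, pixman_region16_t *clip)
	{
		if (pixman_region_n_rects(clip) == 1 &&
		    op->x1 >= clip->extents.x1 && op->x2 <= clip->extents.x2 &&
		    op->y1 >= clip->extents.y1 && op->y2 <= clip->extents.y2)
			return NULL;	/* no box can spill past the clip */
		return clip;
	}
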
commit 6729bcf98015b73e37677800cdea587fcf7bd277
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Feb 22 13:44:24 2012 +0000

    sna: Clear the flush flag on release of scanout/dri
    
    Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=46445
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index f79deff..5ded904 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -3665,6 +3665,7 @@ void kgem_bo_clear_scanout(struct kgem *kgem, struct kgem_bo *bo)
 {
 	bo->needs_flush = true;
 	bo->reusable = true;
+	bo->flush = false;
 
 	if (!bo->scanout)
 		return;
commit 5c075114e309b78fb25844996935bc0b7d1a948d
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Feb 22 13:17:01 2012 +0000

    sna/trapezoids: Only the inplace PictOpIn is unbounded
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_trapezoids.c b/src/sna/sna_trapezoids.c
index fafb16f..f8df2a4 100644
--- a/src/sna/sna_trapezoids.c
+++ b/src/sna/sna_trapezoids.c
@@ -3494,7 +3494,7 @@ trapezoid_span_inplace(CARD8 op, PicturePtr src, PicturePtr dst,
 	inplace.opacity = color >> 24;
 
 	tor_render(NULL, &tor, (void*)&inplace,
-		   dst->pCompositeClip, span, true);
+		   dst->pCompositeClip, span, op == PictOpIn);
 
 	tor_fini(&tor);
 
commit 3b99d636e9a0e4c5d39d1f28aaa8900f5554e042
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Feb 22 09:15:25 2012 +0000

    sna: Ensure we restore the shadow pixels before uploading CPU data
    
    Reported-by: Joe Nahmias <joe at nahmias.net>
    Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=46425
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 9447cf7..8fa59a6 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -1618,8 +1618,12 @@ sna_pixmap_move_area_to_gpu(PixmapPtr pixmap, BoxPtr box, unsigned int flags)
 
 	if (sna_damage_is_all(&priv->gpu_damage,
 			      pixmap->drawable.width,
-			      pixmap->drawable.height))
+			      pixmap->drawable.height)) {
+		sna_damage_destroy(&priv->cpu_damage);
+		priv->undamaged = false;
+		list_del(&priv->list);
 		goto done;
+	}
 
 	if (priv->gpu_bo == NULL) {
 		unsigned flags;
@@ -1659,10 +1663,13 @@ sna_pixmap_move_area_to_gpu(PixmapPtr pixmap, BoxPtr box, unsigned int flags)
 	}
 
 	if (priv->mapped) {
+		pixmap->devPrivate.ptr = NULL;
+		priv->mapped = false;
+	}
+	if (pixmap->devPrivate.ptr == NULL) {
 		assert(priv->stride);
 		pixmap->devPrivate.ptr = priv->ptr;
 		pixmap->devKind = priv->stride;
-		priv->mapped = false;
 	}
 	assert(pixmap->devPrivate.ptr != NULL);
 
commit cc02419da4033993230b60b64d13a308b9069846
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Feb 21 23:27:33 2012 +0000

    sna/gen5: Remove CA glyph workaround
    
    The root cause has been found and destroyed, so the w/a is now
    redundant.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_glyphs.c b/src/sna/sna_glyphs.c
index 91273c9..2733a1a 100644
--- a/src/sna/sna_glyphs.c
+++ b/src/sna/sna_glyphs.c
@@ -1248,10 +1248,6 @@ sna_glyphs(CARD8 op,
 	_mask = mask;
 	/* XXX discard the mask for non-overlapping glyphs? */
 
-	/* XXX more shader breakage?: CA to dst is fubar on ilk */
-	if (sna->kgem.gen == 50 && !_mask)
-		_mask = list[0].format;
-
 	if (!_mask ||
 	    (((nlist == 1 && list->len == 1) || op == PictOpAdd) &&
 	     dst->format == (_mask->depth << 24 | _mask->format))) {
commit d1f352cf14d213ed326c8fae599e2cc4de63194c
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Feb 21 21:26:29 2012 +0000

    sna/gen3+: Re-emit composite state after flushing CA vertices
    
    Reported-by: Clemens Eisserer <linuxhippy at gmail.com>
    Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=42891
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
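
A note on the mechanism these gen3-gen7 fixes revolve around: component-alpha
(CA) compositing is rendered in two passes on this hardware. The vertices are
emitted once for the first pass and then replayed with a second blend state
(the "magic CA pass"), so any flush of the vertex buffer must be paired with
that replay. A minimal sketch of the pairing that recurs throughout these
hunks (gen3_flush_ca is a hypothetical name; the real calls appear inline):

	/* Flush pending vertices and, if a CA operation is in flight,
	 * replay them with the second-pass blend state. The vertex_offset
	 * test avoids emitting a second pass when nothing was flushed. */
	static void gen3_flush_ca(struct sna *sna,
				  const struct sna_composite_op *op)
	{
		if (sna->render_state.gen3.vertex_offset) {
			gen3_vertex_flush(sna);       /* complete pass one */
			gen3_magic_ca_pass(sna, op);  /* replay as pass two */
		}
	}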

diff --git a/src/sna/gen3_render.c b/src/sna/gen3_render.c
index 189653d..3c3de12 100644
--- a/src/sna/gen3_render.c
+++ b/src/sna/gen3_render.c
@@ -1566,8 +1566,7 @@ static void gen3_vertex_flush(struct sna *sna)
 	sna->render_state.gen3.vertex_offset = 0;
 }
 
-static int gen3_vertex_finish(struct sna *sna,
-			      const struct sna_composite_op *op)
+static int gen3_vertex_finish(struct sna *sna)
 {
 	struct kgem_bo *bo;
 
@@ -1575,11 +1574,6 @@ static int gen3_vertex_finish(struct sna *sna,
 
 	bo = sna->render.vbo;
 	if (bo) {
-		if (sna->render_state.gen3.vertex_offset) {
-			gen3_vertex_flush(sna);
-			gen3_magic_ca_pass(sna, op);
-		}
-
 		DBG(("%s: reloc = %d\n", __FUNCTION__,
 		     sna->render.vertex_reloc[0]));
 
@@ -1722,7 +1716,10 @@ static int gen3_get_rectangles__flush(struct sna *sna,
 	if (sna->kgem.nreloc > KGEM_RELOC_SIZE(&sna->kgem) - 1)
 		return 0;
 
-	return gen3_vertex_finish(sna, op);
+	if (op->need_magic_ca_pass && sna->render.vbo)
+		return 0;
+
+	return gen3_vertex_finish(sna);
 }
 
 inline static int gen3_get_rectangles(struct sna *sna,
@@ -2318,7 +2315,7 @@ gen3_align_vertex(struct sna *sna,
 {
 	if (op->floats_per_vertex != sna->render_state.gen3.last_floats_per_vertex) {
 		if (sna->render.vertex_size - sna->render.vertex_used < 2*op->floats_per_rect)
-			gen3_vertex_finish(sna, op);
+			gen3_vertex_finish(sna);
 
 		DBG(("aligning vertex: was %d, now %d floats per vertex, %d->%d\n",
 		     sna->render_state.gen3.last_floats_per_vertex,
diff --git a/src/sna/gen4_render.c b/src/sna/gen4_render.c
index 24de833..c9f10a8 100644
--- a/src/sna/gen4_render.c
+++ b/src/sna/gen4_render.c
@@ -364,8 +364,7 @@ static void gen4_vertex_flush(struct sna *sna,
 	gen4_magic_ca_pass(sna, op);
 }
 
-static int gen4_vertex_finish(struct sna *sna,
-			      const struct sna_composite_op *op)
+static int gen4_vertex_finish(struct sna *sna)
 {
 	struct kgem_bo *bo;
 	unsigned int i;
@@ -376,8 +375,6 @@ static int gen4_vertex_finish(struct sna *sna,
 
 	bo = sna->render.vbo;
 	if (bo) {
-		gen4_vertex_flush(sna, op);
-
 		for (i = 0; i < ARRAY_SIZE(sna->render.vertex_reloc); i++) {
 			if (sna->render.vertex_reloc[i]) {
 				DBG(("%s: reloc[%d] = %d\n", __FUNCTION__,
@@ -1154,7 +1151,10 @@ static int gen4_get_rectangles__flush(struct sna *sna,
 	if (sna->kgem.nreloc > KGEM_RELOC_SIZE(&sna->kgem) - 1)
 		return 0;
 
-	return gen4_vertex_finish(sna, op);
+	if (op->need_magic_ca_pass && sna->render.vbo)
+		return 0;
+
+	return gen4_vertex_finish(sna);
 }
 
 inline static int gen4_get_rectangles(struct sna *sna,
@@ -1306,7 +1306,7 @@ gen4_align_vertex(struct sna *sna, const struct sna_composite_op *op)
 {
 	if (op->floats_per_vertex != sna->render_state.gen4.floats_per_vertex) {
 		if (sna->render.vertex_size - sna->render.vertex_used < 6*op->floats_per_vertex)
-			gen4_vertex_finish(sna, op);
+			gen4_vertex_finish(sna);
 
 		DBG(("aligning vertex: was %d, now %d floats per vertex, %d->%d\n",
 		     sna->render_state.gen4.floats_per_vertex,
diff --git a/src/sna/gen5_render.c b/src/sna/gen5_render.c
index d4b6313..251eb39 100644
--- a/src/sna/gen5_render.c
+++ b/src/sna/gen5_render.c
@@ -347,8 +347,7 @@ static void gen5_vertex_flush(struct sna *sna)
 	sna->render_state.gen5.vertex_offset = 0;
 }
 
-static int gen5_vertex_finish(struct sna *sna,
-			      const struct sna_composite_op *op)
+static int gen5_vertex_finish(struct sna *sna)
 {
 	struct kgem_bo *bo;
 	unsigned int i;
@@ -359,11 +358,6 @@ static int gen5_vertex_finish(struct sna *sna,
 
 	bo = sna->render.vbo;
 	if (bo) {
-		if (sna->render_state.gen5.vertex_offset) {
-			gen5_vertex_flush(sna);
-			gen5_magic_ca_pass(sna, op);
-		}
-
 		for (i = 0; i < ARRAY_SIZE(sna->render.vertex_reloc); i++) {
 			if (sna->render.vertex_reloc[i]) {
 				DBG(("%s: reloc[%d] = %d\n", __FUNCTION__,
@@ -1157,7 +1151,10 @@ static int gen5_get_rectangles__flush(struct sna *sna,
 	if (sna->kgem.nreloc > KGEM_RELOC_SIZE(&sna->kgem) - 2)
 		return 0;
 
-	return gen5_vertex_finish(sna, op);
+	if (op->need_magic_ca_pass && sna->render.vbo)
+		return 0;
+
+	return gen5_vertex_finish(sna);
 }
 
 inline static int gen5_get_rectangles(struct sna *sna,
@@ -1312,7 +1309,7 @@ gen5_align_vertex(struct sna *sna, const struct sna_composite_op *op)
 {
 	if (op->floats_per_vertex != sna->render_state.gen5.floats_per_vertex) {
 		if (sna->render.vertex_size - sna->render.vertex_used < 2*op->floats_per_rect)
-			gen5_vertex_finish(sna, op);
+			gen5_vertex_finish(sna);
 
 		DBG(("aligning vertex: was %d, now %d floats per vertex, %d->%d\n",
 		     sna->render_state.gen5.floats_per_vertex,
diff --git a/src/sna/gen6_render.c b/src/sna/gen6_render.c
index 56a0f4a..e2c78e1 100644
--- a/src/sna/gen6_render.c
+++ b/src/sna/gen6_render.c
@@ -919,8 +919,7 @@ static void gen6_vertex_flush(struct sna *sna)
 	sna->render_state.gen6.vertex_offset = 0;
 }
 
-static int gen6_vertex_finish(struct sna *sna,
-			      const struct sna_composite_op *op)
+static int gen6_vertex_finish(struct sna *sna)
 {
 	struct kgem_bo *bo;
 	unsigned int i;
@@ -933,11 +932,6 @@ static int gen6_vertex_finish(struct sna *sna,
 
 	bo = sna->render.vbo;
 	if (bo) {
-		if (sna->render_state.gen6.vertex_offset) {
-			gen6_vertex_flush(sna);
-			gen6_magic_ca_pass(sna, op);
-		}
-
 		for (i = 0; i < ARRAY_SIZE(sna->render.vertex_reloc); i++) {
 			if (sna->render.vertex_reloc[i]) {
 				DBG(("%s: reloc[%d] = %d\n", __FUNCTION__,
@@ -1625,7 +1619,10 @@ static int gen6_get_rectangles__flush(struct sna *sna,
 	if (sna->kgem.nreloc > KGEM_RELOC_SIZE(&sna->kgem) - 2)
 		return 0;
 
-	return gen6_vertex_finish(sna, op);
+	if (op->need_magic_ca_pass && sna->render.vbo)
+		return 0;
+
+	return gen6_vertex_finish(sna);
 }
 
 inline static int gen6_get_rectangles(struct sna *sna,
@@ -1751,7 +1748,7 @@ gen6_align_vertex(struct sna *sna, const struct sna_composite_op *op)
 	if (op->floats_per_vertex != sna->render_state.gen6.floats_per_vertex) {
 		if (sna->render.vertex_size - sna->render.vertex_used < 2*op->floats_per_rect)
 			/* XXX propagate failure */
-			gen6_vertex_finish(sna, op);
+			gen6_vertex_finish(sna);
 
 		DBG(("aligning vertex: was %d, now %d floats per vertex, %d->%d\n",
 		     sna->render_state.gen6.floats_per_vertex,
diff --git a/src/sna/gen7_render.c b/src/sna/gen7_render.c
index fce19fe..faeedf0 100644
--- a/src/sna/gen7_render.c
+++ b/src/sna/gen7_render.c
@@ -1022,8 +1022,7 @@ static void gen7_vertex_flush(struct sna *sna)
 	sna->render_state.gen7.vertex_offset = 0;
 }
 
-static int gen7_vertex_finish(struct sna *sna,
-			      const struct sna_composite_op *op)
+static int gen7_vertex_finish(struct sna *sna)
 {
 	struct kgem_bo *bo;
 	unsigned int i;
@@ -1034,11 +1033,6 @@ static int gen7_vertex_finish(struct sna *sna,
 
 	bo = sna->render.vbo;
 	if (bo) {
-		if (sna->render_state.gen7.vertex_offset) {
-			gen7_vertex_flush(sna);
-			gen7_magic_ca_pass(sna, op);
-		}
-
 		for (i = 0; i < ARRAY_SIZE(sna->render.vertex_reloc); i++) {
 			if (sna->render.vertex_reloc[i]) {
 				DBG(("%s: reloc[%d] = %d\n", __FUNCTION__,
@@ -1659,7 +1653,10 @@ static int gen7_get_rectangles__flush(struct sna *sna,
 	if (sna->kgem.nreloc > KGEM_RELOC_SIZE(&sna->kgem) - 2)
 		return 0;
 
-	return gen7_vertex_finish(sna, op);
+	if (op->need_magic_ca_pass && sna->render.vbo)
+		return 0;
+
+	return gen7_vertex_finish(sna);
 }
 
 inline static int gen7_get_rectangles(struct sna *sna,
@@ -1780,7 +1777,7 @@ gen7_align_vertex(struct sna *sna, const struct sna_composite_op *op)
 {
 	if (op->floats_per_vertex != sna->render_state.gen7.floats_per_vertex) {
 		if (sna->render.vertex_size - sna->render.vertex_used < 2*op->floats_per_rect)
-			gen7_vertex_finish(sna, op);
+			gen7_vertex_finish(sna);
 
 		DBG(("aligning vertex: was %d, now %d floats per vertex, %d->%d\n",
 		     sna->render_state.gen7.floats_per_vertex,
commit 442a184829fcd4d26a9c98c748aa59c31fdc320e
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Feb 21 21:26:29 2012 +0000

    sna/gen3+: Only flush the vertices after checking for end-of-batch
    
    Or upon actually closing the vertex buffer.
    
    However, the underlying issue remains: we are failing to re-emit
    the first pass for CA text after flushing the vertex buffer (and so
    emitting the second pass for the flushed vertices).
    
    Reported-by: Clemens Eisserer <linuxhippy at gmail.com>
    References: https://bugs.freedesktop.org/show_bug.cgi?id=42891
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/gen3_render.c b/src/sna/gen3_render.c
index 97e5839..189653d 100644
--- a/src/sna/gen3_render.c
+++ b/src/sna/gen3_render.c
@@ -1566,7 +1566,8 @@ static void gen3_vertex_flush(struct sna *sna)
 	sna->render_state.gen3.vertex_offset = 0;
 }
 
-static int gen3_vertex_finish(struct sna *sna)
+static int gen3_vertex_finish(struct sna *sna,
+			      const struct sna_composite_op *op)
 {
 	struct kgem_bo *bo;
 
@@ -1574,6 +1575,11 @@ static int gen3_vertex_finish(struct sna *sna)
 
 	bo = sna->render.vbo;
 	if (bo) {
+		if (sna->render_state.gen3.vertex_offset) {
+			gen3_vertex_flush(sna);
+			gen3_magic_ca_pass(sna, op);
+		}
+
 		DBG(("%s: reloc = %d\n", __FUNCTION__,
 		     sna->render.vertex_reloc[0]));
 
@@ -1709,11 +1715,6 @@ static bool gen3_rectangle_begin(struct sna *sna,
 static int gen3_get_rectangles__flush(struct sna *sna,
 				      const struct sna_composite_op *op)
 {
-	if (sna->render_state.gen3.vertex_offset) {
-		gen3_vertex_flush(sna);
-		gen3_magic_ca_pass(sna, op);
-	}
-
 	if (!kgem_check_batch(&sna->kgem, op->need_magic_ca_pass ? 105: 5))
 		return 0;
 	if (sna->kgem.nexec > KGEM_EXEC_SIZE(&sna->kgem) - 2)
@@ -1721,7 +1722,7 @@ static int gen3_get_rectangles__flush(struct sna *sna,
 	if (sna->kgem.nreloc > KGEM_RELOC_SIZE(&sna->kgem) - 1)
 		return 0;
 
-	return gen3_vertex_finish(sna);
+	return gen3_vertex_finish(sna, op);
 }
 
 inline static int gen3_get_rectangles(struct sna *sna,
@@ -1738,8 +1739,13 @@ inline static int gen3_get_rectangles(struct sna *sna,
 		DBG(("flushing vbo for %s: %d < %d\n",
 		     __FUNCTION__, rem, op->floats_per_rect));
 		rem = gen3_get_rectangles__flush(sna, op);
-		if (rem == 0)
+		if (rem == 0) {
+			if (sna->render_state.gen3.vertex_offset) {
+				gen3_vertex_flush(sna);
+				gen3_magic_ca_pass(sna, op);
+			}
 			return 0;
+		}
 	}
 
 	if (sna->render_state.gen3.vertex_offset == 0 &&
@@ -2312,7 +2318,7 @@ gen3_align_vertex(struct sna *sna,
 {
 	if (op->floats_per_vertex != sna->render_state.gen3.last_floats_per_vertex) {
 		if (sna->render.vertex_size - sna->render.vertex_used < 2*op->floats_per_rect)
-			gen3_vertex_finish(sna);
+			gen3_vertex_finish(sna, op);
 
 		DBG(("aligning vertex: was %d, now %d floats per vertex, %d->%d\n",
 		     sna->render_state.gen3.last_floats_per_vertex,
diff --git a/src/sna/gen4_render.c b/src/sna/gen4_render.c
index 6e7d4be..24de833 100644
--- a/src/sna/gen4_render.c
+++ b/src/sna/gen4_render.c
@@ -364,7 +364,8 @@ static void gen4_vertex_flush(struct sna *sna,
 	gen4_magic_ca_pass(sna, op);
 }
 
-static int gen4_vertex_finish(struct sna *sna)
+static int gen4_vertex_finish(struct sna *sna,
+			      const struct sna_composite_op *op)
 {
 	struct kgem_bo *bo;
 	unsigned int i;
@@ -375,6 +376,8 @@ static int gen4_vertex_finish(struct sna *sna)
 
 	bo = sna->render.vbo;
 	if (bo) {
+		gen4_vertex_flush(sna, op);
+
 		for (i = 0; i < ARRAY_SIZE(sna->render.vertex_reloc); i++) {
 			if (sna->render.vertex_reloc[i]) {
 				DBG(("%s: reloc[%d] = %d\n", __FUNCTION__,
@@ -1144,8 +1147,6 @@ static bool gen4_rectangle_begin(struct sna *sna,
 static int gen4_get_rectangles__flush(struct sna *sna,
 				      const struct sna_composite_op *op)
 {
-	gen4_vertex_flush(sna, op);
-
 	if (!kgem_check_batch(&sna->kgem, 25))
 		return 0;
 	if (sna->kgem.nexec > KGEM_EXEC_SIZE(&sna->kgem) - 1)
@@ -1153,7 +1154,7 @@ static int gen4_get_rectangles__flush(struct sna *sna,
 	if (sna->kgem.nreloc > KGEM_RELOC_SIZE(&sna->kgem) - 1)
 		return 0;
 
-	return gen4_vertex_finish(sna);
+	return gen4_vertex_finish(sna, op);
 }
 
 inline static int gen4_get_rectangles(struct sna *sna,
@@ -1166,8 +1167,10 @@ inline static int gen4_get_rectangles(struct sna *sna,
 		DBG(("flushing vbo for %s: %d < %d\n",
 		     __FUNCTION__, rem, 3*op->floats_per_vertex));
 		rem = gen4_get_rectangles__flush(sna, op);
-		if (rem == 0)
+		if (rem == 0) {
+			gen4_vertex_flush(sna, op);
 			return 0;
+		}
 	}
 
 	if (!gen4_rectangle_begin(sna, op))
@@ -1303,7 +1306,7 @@ gen4_align_vertex(struct sna *sna, const struct sna_composite_op *op)
 {
 	if (op->floats_per_vertex != sna->render_state.gen4.floats_per_vertex) {
 		if (sna->render.vertex_size - sna->render.vertex_used < 6*op->floats_per_vertex)
-			gen4_vertex_finish(sna);
+			gen4_vertex_finish(sna, op);
 
 		DBG(("aligning vertex: was %d, now %d floats per vertex, %d->%d\n",
 		     sna->render_state.gen4.floats_per_vertex,
diff --git a/src/sna/gen5_render.c b/src/sna/gen5_render.c
index 7ac993c..d4b6313 100644
--- a/src/sna/gen5_render.c
+++ b/src/sna/gen5_render.c
@@ -347,7 +347,8 @@ static void gen5_vertex_flush(struct sna *sna)
 	sna->render_state.gen5.vertex_offset = 0;
 }
 
-static int gen5_vertex_finish(struct sna *sna)
+static int gen5_vertex_finish(struct sna *sna,
+			      const struct sna_composite_op *op)
 {
 	struct kgem_bo *bo;
 	unsigned int i;
@@ -358,6 +359,11 @@ static int gen5_vertex_finish(struct sna *sna)
 
 	bo = sna->render.vbo;
 	if (bo) {
+		if (sna->render_state.gen5.vertex_offset) {
+			gen5_vertex_flush(sna);
+			gen5_magic_ca_pass(sna, op);
+		}
+
 		for (i = 0; i < ARRAY_SIZE(sna->render.vertex_reloc); i++) {
 			if (sna->render.vertex_reloc[i]) {
 				DBG(("%s: reloc[%d] = %d\n", __FUNCTION__,
@@ -1144,11 +1150,6 @@ static bool gen5_rectangle_begin(struct sna *sna,
 static int gen5_get_rectangles__flush(struct sna *sna,
 				      const struct sna_composite_op *op)
 {
-	if (sna->render_state.gen5.vertex_offset) {
-		gen5_vertex_flush(sna);
-		gen5_magic_ca_pass(sna, op);
-	}
-
 	if (!kgem_check_batch(&sna->kgem, op->need_magic_ca_pass ? 20 : 6))
 		return 0;
 	if (sna->kgem.nexec > KGEM_EXEC_SIZE(&sna->kgem) - 1)
@@ -1156,7 +1157,7 @@ static int gen5_get_rectangles__flush(struct sna *sna,
 	if (sna->kgem.nreloc > KGEM_RELOC_SIZE(&sna->kgem) - 2)
 		return 0;
 
-	return gen5_vertex_finish(sna);
+	return gen5_vertex_finish(sna, op);
 }
 
 inline static int gen5_get_rectangles(struct sna *sna,
@@ -1169,8 +1170,13 @@ inline static int gen5_get_rectangles(struct sna *sna,
 		DBG(("flushing vbo for %s: %d < %d\n",
 		     __FUNCTION__, rem, op->floats_per_rect));
 		rem = gen5_get_rectangles__flush(sna, op);
-		if (rem == 0)
+		if (rem == 0) {
+			if (sna->render_state.gen5.vertex_offset) {
+				gen5_vertex_flush(sna);
+				gen5_magic_ca_pass(sna, op);
+			}
 			return 0;
+		}
 	}
 
 	if (!gen5_rectangle_begin(sna, op))
@@ -1306,7 +1312,7 @@ gen5_align_vertex(struct sna *sna, const struct sna_composite_op *op)
 {
 	if (op->floats_per_vertex != sna->render_state.gen5.floats_per_vertex) {
 		if (sna->render.vertex_size - sna->render.vertex_used < 2*op->floats_per_rect)
-			gen5_vertex_finish(sna);
+			gen5_vertex_finish(sna, op);
 
 		DBG(("aligning vertex: was %d, now %d floats per vertex, %d->%d\n",
 		     sna->render_state.gen5.floats_per_vertex,
diff --git a/src/sna/gen6_render.c b/src/sna/gen6_render.c
index 1e99709..56a0f4a 100644
--- a/src/sna/gen6_render.c
+++ b/src/sna/gen6_render.c
@@ -919,7 +919,8 @@ static void gen6_vertex_flush(struct sna *sna)
 	sna->render_state.gen6.vertex_offset = 0;
 }
 
-static int gen6_vertex_finish(struct sna *sna)
+static int gen6_vertex_finish(struct sna *sna,
+			      const struct sna_composite_op *op)
 {
 	struct kgem_bo *bo;
 	unsigned int i;
@@ -932,6 +933,11 @@ static int gen6_vertex_finish(struct sna *sna)
 
 	bo = sna->render.vbo;
 	if (bo) {
+		if (sna->render_state.gen6.vertex_offset) {
+			gen6_vertex_flush(sna);
+			gen6_magic_ca_pass(sna, op);
+		}
+
 		for (i = 0; i < ARRAY_SIZE(sna->render.vertex_reloc); i++) {
 			if (sna->render.vertex_reloc[i]) {
 				DBG(("%s: reloc[%d] = %d\n", __FUNCTION__,
@@ -1612,11 +1618,6 @@ static bool gen6_rectangle_begin(struct sna *sna,
 static int gen6_get_rectangles__flush(struct sna *sna,
 				      const struct sna_composite_op *op)
 {
-	if (sna->render_state.gen6.vertex_offset) {
-		gen6_vertex_flush(sna);
-		gen6_magic_ca_pass(sna, op);
-	}
-
 	if (!kgem_check_batch(&sna->kgem, op->need_magic_ca_pass ? 65 : 5))
 		return 0;
 	if (sna->kgem.nexec > KGEM_EXEC_SIZE(&sna->kgem) - 1)
@@ -1624,7 +1625,7 @@ static int gen6_get_rectangles__flush(struct sna *sna,
 	if (sna->kgem.nreloc > KGEM_RELOC_SIZE(&sna->kgem) - 2)
 		return 0;
 
-	return gen6_vertex_finish(sna);
+	return gen6_vertex_finish(sna, op);
 }
 
 inline static int gen6_get_rectangles(struct sna *sna,
@@ -1637,8 +1638,13 @@ inline static int gen6_get_rectangles(struct sna *sna,
 		DBG(("flushing vbo for %s: %d < %d\n",
 		     __FUNCTION__, rem, op->floats_per_rect));
 		rem = gen6_get_rectangles__flush(sna, op);
-		if (rem == 0)
+		if (rem == 0) {
+			if (sna->render_state.gen6.vertex_offset) {
+				gen6_vertex_flush(sna);
+				gen6_magic_ca_pass(sna, op);
+			}
 			return 0;
+		}
 	}
 
 	if (sna->render_state.gen6.vertex_offset == 0 &&
@@ -1745,7 +1751,7 @@ gen6_align_vertex(struct sna *sna, const struct sna_composite_op *op)
 	if (op->floats_per_vertex != sna->render_state.gen6.floats_per_vertex) {
 		if (sna->render.vertex_size - sna->render.vertex_used < 2*op->floats_per_rect)
 			/* XXX propagate failure */
-			gen6_vertex_finish(sna);
+			gen6_vertex_finish(sna, op);
 
 		DBG(("aligning vertex: was %d, now %d floats per vertex, %d->%d\n",
 		     sna->render_state.gen6.floats_per_vertex,
diff --git a/src/sna/gen7_render.c b/src/sna/gen7_render.c
index 5294547..fce19fe 100644
--- a/src/sna/gen7_render.c
+++ b/src/sna/gen7_render.c
@@ -1022,7 +1022,8 @@ static void gen7_vertex_flush(struct sna *sna)
 	sna->render_state.gen7.vertex_offset = 0;
 }
 
-static int gen7_vertex_finish(struct sna *sna)
+static int gen7_vertex_finish(struct sna *sna,
+			      const struct sna_composite_op *op)
 {
 	struct kgem_bo *bo;
 	unsigned int i;
@@ -1033,6 +1034,11 @@ static int gen7_vertex_finish(struct sna *sna)
 
 	bo = sna->render.vbo;
 	if (bo) {
+		if (sna->render_state.gen7.vertex_offset) {
+			gen7_vertex_flush(sna);
+			gen7_magic_ca_pass(sna, op);
+		}
+
 		for (i = 0; i < ARRAY_SIZE(sna->render.vertex_reloc); i++) {
 			if (sna->render.vertex_reloc[i]) {
 				DBG(("%s: reloc[%d] = %d\n", __FUNCTION__,
@@ -1646,11 +1652,6 @@ static bool gen7_rectangle_begin(struct sna *sna,
 static int gen7_get_rectangles__flush(struct sna *sna,
 				      const struct sna_composite_op *op)
 {
-	if (sna->render_state.gen7.vertex_offset) {
-		gen7_vertex_flush(sna);
-		gen7_magic_ca_pass(sna, op);
-	}
-
 	if (!kgem_check_batch(&sna->kgem, op->need_magic_ca_pass ? 65 : 6))
 		return 0;
 	if (sna->kgem.nexec > KGEM_EXEC_SIZE(&sna->kgem) - 1)
@@ -1658,7 +1659,7 @@ static int gen7_get_rectangles__flush(struct sna *sna,
 	if (sna->kgem.nreloc > KGEM_RELOC_SIZE(&sna->kgem) - 2)
 		return 0;
 
-	return gen7_vertex_finish(sna);
+	return gen7_vertex_finish(sna, op);
 }
 
 inline static int gen7_get_rectangles(struct sna *sna,
@@ -1671,8 +1672,13 @@ inline static int gen7_get_rectangles(struct sna *sna,
 		DBG(("flushing vbo for %s: %d < %d\n",
 		     __FUNCTION__, rem, op->floats_per_rect));
 		rem = gen7_get_rectangles__flush(sna, op);
-		if (rem == 0)
+		if (rem == 0) {
+			if (sna->render_state.gen7.vertex_offset) {
+				gen7_vertex_flush(sna);
+				gen7_magic_ca_pass(sna, op);
+			}
 			return 0;
+		}
 	}
 
 	if (sna->render_state.gen7.vertex_offset == 0 &&
@@ -1774,7 +1780,7 @@ gen7_align_vertex(struct sna *sna, const struct sna_composite_op *op)
 {
 	if (op->floats_per_vertex != sna->render_state.gen7.floats_per_vertex) {
 		if (sna->render.vertex_size - sna->render.vertex_used < 2*op->floats_per_rect)
-			gen7_vertex_finish(sna);
+			gen7_vertex_finish(sna, op);
 
 		DBG(("aligning vertex: was %d, now %d floats per vertex, %d->%d\n",
 		     sna->render_state.gen7.floats_per_vertex,
diff --git a/src/sna/sna_display.c b/src/sna/sna_display.c
index ae5bec6..676125d 100644
--- a/src/sna/sna_display.c
+++ b/src/sna/sna_display.c
@@ -1927,6 +1927,7 @@ sna_mode_fini(struct sna *sna)
 #endif
 
 	sna_mode_remove_fb(sna);
+	kgem_bo_clear_scanout(&sna->kgem, sna_pixmap_get_bo(sna->front));
 
 	/* mode->shadow_fb_id should have been destroyed already */
 }
commit 31313d0d00e66d1f170653aa6a410a15a721e4db
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Feb 21 13:16:43 2012 +0000

    sna: Clear the scanout flag after releasing the scanout pixmap
    
    In the future, this will be a good place to restore the cache level of
    the bo as well.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index e7c4987..f79deff 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -1067,8 +1067,9 @@ static void __kgem_bo_destroy(struct kgem *kgem, struct kgem_bo *bo)
 	assert(list_is_empty(&bo->list));
 	assert(bo->vmap == false && bo->sync == false);
 	assert(bo->io == false);
+	assert(bo->scanout == false);
+	assert(bo->flush == false);
 
-	bo->scanout = bo->flush = false;
 	if (bo->rq) {
 		struct list *cache;
 
@@ -3660,6 +3661,17 @@ void kgem_bo_set_binding(struct kgem_bo *bo, uint32_t format, uint16_t offset)
 	}
 }
 
+void kgem_bo_clear_scanout(struct kgem *kgem, struct kgem_bo *bo)
+{
+	bo->needs_flush = true;
+	bo->reusable = true;
+
+	if (!bo->scanout)
+		return;
+
+	bo->scanout = false;
+}
+
 struct kgem_bo *
 kgem_replace_bo(struct kgem *kgem,
 		struct kgem_bo *src,
diff --git a/src/sna/kgem.h b/src/sna/kgem.h
index f3a7b94..30303ce 100644
--- a/src/sna/kgem.h
+++ b/src/sna/kgem.h
@@ -499,6 +499,8 @@ struct kgem_bo *kgem_create_buffer_2d(struct kgem *kgem,
 				      void **ret);
 void kgem_buffer_read_sync(struct kgem *kgem, struct kgem_bo *bo);
 
+void kgem_bo_clear_scanout(struct kgem *kgem, struct kgem_bo *bo);
+
 void kgem_throttle(struct kgem *kgem);
 #define MAX_INACTIVE_TIME 10
 bool kgem_expire_cache(struct kgem *kgem);
diff --git a/src/sna/sna_display.c b/src/sna/sna_display.c
index b2779aa..ae5bec6 100644
--- a/src/sna/sna_display.c
+++ b/src/sna/sna_display.c
@@ -648,7 +648,6 @@ sna_crtc_set_mode_major(xf86CrtcPtr crtc, DisplayModePtr mode,
 		     scrn->depth, scrn->bitsPerPixel));
 
 		assert(bo->tiling != I915_TILING_Y);
-		bo->scanout = true;
 		ret = drmModeAddFB(sna->kgem.fd,
 				   scrn->virtualX, scrn->virtualY,
 				   scrn->depth, scrn->bitsPerPixel,
@@ -664,6 +663,7 @@ sna_crtc_set_mode_major(xf86CrtcPtr crtc, DisplayModePtr mode,
 
 		DBG(("%s: handle %d attached to fb %d\n",
 		     __FUNCTION__, bo->handle, sna_mode->fb_id));
+		bo->scanout = true;
 		sna_mode->fb_pixmap = sna->front->drawable.serialNumber;
 	}
 
@@ -765,7 +765,6 @@ sna_crtc_shadow_allocate(xf86CrtcPtr crtc, int width, int height)
 	}
 
 	assert(bo->tiling != I915_TILING_Y);
-	bo->scanout = true;
 	if (drmModeAddFB(sna->kgem.fd,
 			 width, height, scrn->depth, scrn->bitsPerPixel,
 			 bo->pitch, bo->handle,
@@ -780,6 +779,7 @@ sna_crtc_shadow_allocate(xf86CrtcPtr crtc, int width, int height)
 
 	DBG(("%s: attached handle %d to fb %d\n",
 	     __FUNCTION__, bo->handle, sna_crtc->shadow_fb_id));
+	bo->scanout = true;
 	return sna_crtc->shadow = shadow;
 }
 
@@ -806,6 +806,7 @@ sna_crtc_shadow_destroy(xf86CrtcPtr crtc, PixmapPtr pixmap, void *data)
 	drmModeRmFB(sna->kgem.fd, sna_crtc->shadow_fb_id);
 	sna_crtc->shadow_fb_id = 0;
 
+	kgem_bo_clear_scanout(&sna->kgem, sna_pixmap_get_bo(pixmap));
 	pixmap->drawable.pScreen->DestroyPixmap(pixmap);
 	sna_crtc->shadow = NULL;
 }
@@ -1674,7 +1675,6 @@ sna_crtc_resize(ScrnInfoPtr scrn, int width, int height)
 		goto fail;
 
 	assert(bo->tiling != I915_TILING_Y);
-	bo->scanout = true;
 	if (drmModeAddFB(sna->kgem.fd, width, height,
 			 scrn->depth, scrn->bitsPerPixel,
 			 bo->pitch, bo->handle,
@@ -1690,6 +1690,7 @@ sna_crtc_resize(ScrnInfoPtr scrn, int width, int height)
 	     __FUNCTION__, bo->handle,
 	     sna->front->drawable.serialNumber, mode->fb_id));
 
+	bo->scanout = true;
 	for (i = 0; i < xf86_config->num_crtc; i++) {
 		xf86CrtcPtr crtc = xf86_config->crtc[i];
 
@@ -1711,7 +1712,7 @@ sna_crtc_resize(ScrnInfoPtr scrn, int width, int height)
 
 	if (old_fb_id)
 		drmModeRmFB(sna->kgem.fd, old_fb_id);
-	sna_pixmap_get_bo(old_front)->needs_flush = true;
+	kgem_bo_clear_scanout(&sna->kgem, sna_pixmap_get_bo(old_front));
 	scrn->pScreen->DestroyPixmap(old_front);
 
 	return TRUE;
@@ -1842,13 +1843,7 @@ sna_page_flip(struct sna *sna,
 	count = do_page_flip(sna, data, ref_crtc_hw_id);
 	DBG(("%s: page flipped %d crtcs\n", __FUNCTION__, count));
 	if (count) {
-		/* Although the kernel performs an implicit flush upon
-		 * page-flipping, marking the bo as requiring a flush
-		 * here ensures that the buffer goes into the active cache
-		 * upon release.
-		 */
-		bo->needs_flush = true;
-		bo->reusable = true;
+		bo->scanout = true;
 	} else {
 		drmModeRmFB(sna->kgem.fd, mode->fb_id);
 		mode->fb_id = *old_fb;
diff --git a/src/sna/sna_dri.c b/src/sna/sna_dri.c
index 1def833..fe3d1cf 100644
--- a/src/sna/sna_dri.c
+++ b/src/sna/sna_dri.c
@@ -717,12 +717,18 @@ sna_dri_add_frame_event(struct sna_dri_frame_event *info)
 static void
 sna_dri_frame_event_release_bo(struct kgem *kgem, struct kgem_bo *bo)
 {
-	bo->needs_flush = true; /* has been used externally, reset domains */
-	bo->reusable = true; /* No longer in use by an external client */
+	kgem_bo_clear_scanout(kgem, bo);
 	kgem_bo_destroy(kgem, bo);
 }
 
 static void
+sna_dri_frame_event_finish(struct sna_dri_frame_event *info)
+{
+	sna_mode_delete_fb(info->sna, info->old_fb);
+	kgem_bo_clear_scanout(&info->sna->kgem, info->old_front.bo);
+}
+
+static void
 sna_dri_frame_event_info_free(struct sna_dri_frame_event *info)
 {
 	DBG(("%s: del[%p] (%p, %ld)\n", __FUNCTION__,
@@ -1028,12 +1034,12 @@ static void sna_dri_flip_event(struct sna *sna,
 					 flip->event_data);
 		}
 
-		sna_mode_delete_fb(flip->sna, flip->old_fb);
+		sna_dri_frame_event_finish(flip);
 		sna_dri_frame_event_info_free(flip);
 		break;
 
 	case DRI2_FLIP_THROTTLE:
-		sna_mode_delete_fb(sna, flip->old_fb);
+		sna_dri_frame_event_finish(flip);
 
 		assert(sna->dri.flip_pending[flip->pipe] == flip);
 		sna->dri.flip_pending[flip->pipe] = NULL;
@@ -1071,7 +1077,7 @@ static void sna_dri_flip_event(struct sna *sna,
 		     flip->front->name != flip->old_front.name));
 		assert(sna->dri.flip_pending[flip->pipe] == flip);
 
-		sna_mode_delete_fb(flip->sna, flip->old_fb);
+		sna_dri_frame_event_finish(flip);
 
 		if (flip->front->name != flip->next_front.name) {
 			DBG(("%s: async flip continuing\n", __FUNCTION__));
commit c8255e7d8a24693e288376e5003b7a90a6904968
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Feb 21 16:33:26 2012 +0000

    sna/dri: Queue a flush on the back DRI2 buffer when enqueuing a flip
    
    As we may wait upon the bo having finished rendering before we can
    execute the flip, flushing the render cache as early as possible is
    beneficial.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
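
In other words, the flip cannot be executed until the back buffer's rendering
has been flushed and retired, so flushing at enqueue time overlaps that work
with the wait for vblank instead of paying for it right before the flip. In
sketch form (queue_flip_back is a hypothetical wrapper around the one-line
change in the hunk below):

	/* Submit the back buffer's pending rendering now; by the time
	 * the vblank arrives, the flush has long since completed. */
	static void queue_flip_back(struct sna *sna, struct kgem_bo *back_bo)
	{
		__kgem_flush(&sna->kgem, back_bo);
		/* ... then arm the page flip for the next vblank ... */
	}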

diff --git a/src/sna/sna_dri.c b/src/sna/sna_dri.c
index 78d9a6e..1def833 100644
--- a/src/sna/sna_dri.c
+++ b/src/sna/sna_dri.c
@@ -1636,6 +1636,7 @@ blit:
 		}
 		info->front->name = info->back->name;
 		get_private(info->front)->bo = get_private(info->back)->bo;
+		__kgem_flush(&sna->kgem, get_private(info->back)->bo);
 	}
 
 	if (bo == NULL) {
commit a414857713a9d4c49122a3e5a5d70ce955197a70
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Feb 21 14:49:30 2012 +0000

    uxa: Prevent laggy applications by throttling after rendering
    
    Before blocking and waiting for further input, we need to make sure that
    we have not developed too large a queue of outstanding rendering. As we
    render to the front-buffer with no natural throttling and allow X
    clients to render as fast as they wish, it is entirely possible for a
    large queue of outstanding rendering to develop. For an example,
    watch firefox rendering the FishIE Tank demo and notice the delay that
    can build up before the tooltips appear.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
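
DRM_I915_GEM_THROTTLE blocks until the GPU has caught up to a recent point in
its request stream, bounding how far ahead of the display a client can run.
Note the ordering in the hunk below: flush first (submit the outstanding
batch), then throttle (wait on it). A standalone sketch, assuming a valid
i915 DRM fd:

	#include <xf86drm.h>
	#include <i915_drm.h>

	/* Bound the queue of outstanding rendering before the server goes
	 * back to sleep in its block handler. */
	static void throttle(int drm_fd)
	{
		drmCommandNone(drm_fd, DRM_I915_GEM_THROTTLE);
	}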

diff --git a/src/intel_uxa.c b/src/intel_uxa.c
index 0cb8df3..ed4f375 100644
--- a/src/intel_uxa.c
+++ b/src/intel_uxa.c
@@ -32,6 +32,7 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #endif
 
 #include <xf86.h>
+#include <xf86drm.h>
 #include <xaarop.h>
 #include <string.h>
 #include <errno.h>
@@ -1001,6 +1002,11 @@ static void intel_flush_rendering(intel_screen_private *intel)
 	intel->needs_flush = 0;
 }
 
+static void intel_throttle(intel_screen_private *intel)
+{
+	drmCommandNone(intel->drmSubFD, DRM_I915_GEM_THROTTLE);
+}
+
 void intel_uxa_block_handler(intel_screen_private *intel)
 {
 	if (intel->shadow_damage &&
@@ -1015,6 +1021,7 @@ void intel_uxa_block_handler(intel_screen_private *intel)
 	 */
 	intel_glamor_flush(intel);
 	intel_flush_rendering(intel);
+	intel_throttle(intel);
 }
 
 static PixmapPtr
commit ea0ee27ac1ac5a9711f60407421a5f05eef0e4cb
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Feb 21 13:31:16 2012 +0000

    sna: Mark the pixmap as active for the force-to-gpu short-circuit
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 3b417cc..9447cf7 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -2002,6 +2002,15 @@ sna_pixmap_create_upload(ScreenPtr screen,
 	return pixmap;
 }
 
+static inline struct sna_pixmap *
+sna_pixmap_mark_active(struct sna *sna, struct sna_pixmap *priv)
+{
+	if (!priv->pinned && (priv->create & KGEM_CAN_CREATE_LARGE) == 0)
+		list_move(&priv->inactive, &sna->active_pixmaps);
+	priv->clear = false;
+	return priv;
+}
+
 struct sna_pixmap *
 sna_pixmap_force_to_gpu(PixmapPtr pixmap, unsigned flags)
 {
@@ -2015,7 +2024,7 @@ sna_pixmap_force_to_gpu(PixmapPtr pixmap, unsigned flags)
 
 	if (DAMAGE_IS_ALL(priv->gpu_damage)) {
 		DBG(("%s: GPU all-damaged\n", __FUNCTION__));
-		return priv;
+		return sna_pixmap_mark_active(to_sna_from_pixmap(pixmap), priv);
 	}
 
 	/* Unlike move-to-gpu, we ignore wedged and always create the GPU bo */
@@ -2197,10 +2206,7 @@ done:
 			sna_pixmap_free_cpu(sna, priv);
 	}
 active:
-	if (!priv->pinned && (priv->create & KGEM_CAN_CREATE_LARGE) == 0)
-		list_move(&priv->inactive, &sna->active_pixmaps);
-	priv->clear = false;
-	return priv;
+	return sna_pixmap_mark_active(to_sna_from_pixmap(pixmap), priv);
 }
 
 static bool must_check sna_validate_pixmap(DrawablePtr draw, PixmapPtr pixmap)
commit a5bff56d63ad0ce0cc5a49dcfecf0a9c99d389e8
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Feb 21 13:17:35 2012 +0000

    sna: Skip the CPU synchronization when marking a pixmap as inactive
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
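
The new MOVE_ASYNC_HINT flag lets housekeeping callers skip the blocking
kgem_bo_sync__cpu() call, which is only required when the pixels are about to
be touched. The call-site distinction, following the hunks below:

	/* Interactive path: pixel access follows, so the CPU bo is synced. */
	if (!sna_pixmap_move_to_cpu(pixmap, MOVE_READ | MOVE_WRITE))
		return;

	/* Housekeeping path (inactive-cache eviction): nothing reads the
	 * pixels now, so skip the sync and avoid stalling on the GPU. */
	if (!sna_pixmap_move_to_cpu(pixmap,
				    MOVE_READ | MOVE_WRITE | MOVE_ASYNC_HINT))
		list_add(&priv->inactive, &preserve);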

diff --git a/src/sna/sna.h b/src/sna/sna.h
index 575c213..00fc80a 100644
--- a/src/sna/sna.h
+++ b/src/sna/sna.h
@@ -442,6 +442,7 @@ struct kgem_bo *sna_pixmap_change_tiling(PixmapPtr pixmap, uint32_t tiling);
 #define MOVE_WRITE 0x1
 #define MOVE_READ 0x2
 #define MOVE_INPLACE_HINT 0x4
+#define MOVE_ASYNC_HINT 0x8
 bool must_check _sna_pixmap_move_to_cpu(PixmapPtr pixmap, unsigned flags);
 static inline bool must_check sna_pixmap_move_to_cpu(PixmapPtr pixmap, unsigned flags)
 {
diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 72c3907..3b417cc 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -1058,7 +1058,7 @@ skip_inplace_map:
 	}
 
 done:
-	if (priv->cpu_bo) {
+	if ((flags & MOVE_ASYNC_HINT) == 0 && priv->cpu_bo) {
 		DBG(("%s: syncing CPU bo\n", __FUNCTION__));
 		kgem_bo_sync__cpu(&sna->kgem, priv->cpu_bo);
 	}
@@ -11763,7 +11763,7 @@ static void sna_accel_inactive(struct sna *sna)
 			DBG(("%s: discarding inactive GPU bo handle=%d\n",
 			     __FUNCTION__, priv->gpu_bo->handle));
 			if (!sna_pixmap_move_to_cpu(priv->pixmap,
-						    MOVE_READ | MOVE_WRITE))
+						    MOVE_READ | MOVE_WRITE | MOVE_ASYNC_HINT))
 				list_add(&priv->inactive, &preserve);
 		}
 	}
commit 0093e1c223563a7da1bbd5310e571d884739bad3
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Feb 21 13:11:32 2012 +0000

    sna/dri: Ensure that we reattach to the DRI2 front buffer after modeswitch
    
    If we change the Screen pixmap due to a change of mode, we lose the
    flag that we've attached a DRI2 buffer to it. So the next time we try to
    copy from/to it, we reassert its DRI2 status.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
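
The repair works because sna_pixmap_set_dri() short-circuits when the pixmap
is already attached (priv->flush set) and otherwise re-asserts the DRI2 state,
so it is cheap to call on every copy. The hunks below switch copy_region over
to it; in outline:

	/* Resolve front-left through sna_pixmap_set_dri() rather than
	 * sna_pixmap_get_bo(): a screen pixmap replaced by a modeswitch
	 * regains its DRI2 flush flag on the next copy. */
	if (dst_buffer->attachment == DRI2BufferFrontLeft)
		dst = sna_pixmap_set_dri(sna, pixmap);
	else
		dst = get_private(dst_buffer)->bo;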

diff --git a/src/sna/sna_dri.c b/src/sna/sna_dri.c
index 58a3e3c..78d9a6e 100644
--- a/src/sna/sna_dri.c
+++ b/src/sna/sna_dri.c
@@ -164,7 +164,7 @@ static struct kgem_bo *sna_pixmap_set_dri(struct sna *sna,
 		return NULL;
 
 	if (priv->flush)
-		return ref(priv->gpu_bo);
+		return priv->gpu_bo;
 
 	tiling = color_tiling(sna, &pixmap->drawable);
 	if (tiling < 0)
@@ -182,7 +182,7 @@ static struct kgem_bo *sna_pixmap_set_dri(struct sna *sna,
 	/* Don't allow this named buffer to be replaced */
 	priv->pinned = 1;
 
-	return ref(priv->gpu_bo);
+	return priv->gpu_bo;
 }
 
 static DRI2Buffer2Ptr
@@ -201,16 +201,15 @@ sna_dri_create_buffer(DrawablePtr drawable,
 	     __FUNCTION__, attachment, format,
 	     drawable->width, drawable->height));
 
-	buffer = calloc(1, sizeof *buffer + sizeof *private);
-	if (buffer == NULL)
-		return NULL;
-
 	pixmap = NULL;
-	bo = NULL;
 	switch (attachment) {
 	case DRI2BufferFrontLeft:
 		pixmap = get_drawable_pixmap(drawable);
 		bo = sna_pixmap_set_dri(sna, pixmap);
+		if (bo == NULL)
+			return NULL;
+
+		bo = ref(bo);
 		bpp = pixmap->drawable.bitsPerPixel;
 		DBG(("%s: attaching to front buffer %dx%d [%p:%d]\n",
 		     __FUNCTION__,
@@ -276,9 +275,13 @@ sna_dri_create_buffer(DrawablePtr drawable,
 		break;
 
 	default:
-		break;
+		return NULL;
 	}
 	if (bo == NULL)
+		return NULL;
+
+	buffer = calloc(1, sizeof *buffer + sizeof *private);
+	if (buffer == NULL)
 		goto err;
 
 	private = get_private(buffer);
@@ -293,10 +296,8 @@ sna_dri_create_buffer(DrawablePtr drawable,
 	private->pixmap = pixmap;
 	private->bo = bo;
 
-	if (buffer->name == 0) {
-		kgem_bo_destroy(&sna->kgem, bo);
+	if (buffer->name == 0)
 		goto err;
-	}
 
 	if (pixmap)
 		pixmap->refcnt++;
@@ -304,6 +305,7 @@ sna_dri_create_buffer(DrawablePtr drawable,
 	return buffer;
 
 err:
+	kgem_bo_destroy(&sna->kgem, bo);
 	free(buffer);
 	return NULL;
 }
@@ -505,22 +507,27 @@ sna_dri_copy_region(DrawablePtr draw,
 		    DRI2BufferPtr dst_buffer,
 		    DRI2BufferPtr src_buffer)
 {
+	PixmapPtr pixmap = get_drawable_pixmap(draw);
+	struct sna *sna = to_sna_from_pixmap(pixmap);
 	struct kgem_bo *src, *dst;
 
 	if (dst_buffer->attachment == DRI2BufferFrontLeft)
-		dst = sna_pixmap_get_bo(get_drawable_pixmap(draw));
+		dst = sna_pixmap_set_dri(sna, pixmap);
 	else
 		dst = get_private(dst_buffer)->bo;
 
 	if (src_buffer->attachment == DRI2BufferFrontLeft)
-		src = sna_pixmap_get_bo(get_drawable_pixmap(draw));
+		src = sna_pixmap_set_dri(sna, pixmap);
 	else
 		src = get_private(src_buffer)->bo;
 
+	assert(dst != NULL);
+	assert(src != NULL);
+
 	DBG(("%s: dst -- attachment=%d, name=%d, handle=%d [screen=%d]\n",
 	     __FUNCTION__,
 	     dst_buffer->attachment, dst_buffer->name, dst->handle,
-	     sna_pixmap_get_bo(to_sna_from_drawable(draw)->front)->handle));
+	     sna_pixmap_get_bo(sna->front)->handle));
 	DBG(("%s: src -- attachment=%d, name=%d, handle=%d\n",
 	     __FUNCTION__,
 	     src_buffer->attachment, src_buffer->name, src->handle));
@@ -529,10 +536,8 @@ sna_dri_copy_region(DrawablePtr draw,
 	     region->extents.x1, region->extents.y1,
 	     region->extents.x2, region->extents.y2,
 	     REGION_NUM_RECTS(region)));
-	assert(dst != NULL);
-	assert(src != NULL);
 
-	sna_dri_copy(to_sna_from_drawable(draw), draw, region, dst, src, false);
+	sna_dri_copy(sna, draw, region, dst, src, false);
 }
 
 #if DRI2INFOREC_VERSION >= 4
commit 1f8c9b2c09b913a678cf3ecb457c3870bb8dcff5
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Feb 21 11:42:31 2012 +0000

    sna/dri: Improve error handling of failing to create a DRI2 pixmap
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_dri.c b/src/sna/sna_dri.c
index 96324e6..58a3e3c 100644
--- a/src/sna/sna_dri.c
+++ b/src/sna/sna_dri.c
@@ -210,7 +210,6 @@ sna_dri_create_buffer(DrawablePtr drawable,
 	switch (attachment) {
 	case DRI2BufferFrontLeft:
 		pixmap = get_drawable_pixmap(drawable);
-		pixmap->refcnt++;
 		bo = sna_pixmap_set_dri(sna, pixmap);
 		bpp = pixmap->drawable.bitsPerPixel;
 		DBG(("%s: attaching to front buffer %dx%d [%p:%d]\n",
@@ -295,14 +294,13 @@ sna_dri_create_buffer(DrawablePtr drawable,
 	private->bo = bo;
 
 	if (buffer->name == 0) {
-		/* failed to name buffer */
-		if (pixmap)
-			pixmap->drawable.pScreen->DestroyPixmap(pixmap);
-		else
-			kgem_bo_destroy(&sna->kgem, bo);
+		kgem_bo_destroy(&sna->kgem, bo);
 		goto err;
 	}
 
+	if (pixmap)
+		pixmap->refcnt++;
+
 	return buffer;
 
 err:
@@ -531,6 +529,8 @@ sna_dri_copy_region(DrawablePtr draw,
 	     region->extents.x1, region->extents.y1,
 	     region->extents.x2, region->extents.y2,
 	     REGION_NUM_RECTS(region)));
+	assert(dst != NULL);
+	assert(src != NULL);
 
 	sna_dri_copy(to_sna_from_drawable(draw), draw, region, dst, src, false);
 }
commit 2a91083cb20877123f8ade9dbb9cb86260a8994c
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Feb 21 10:55:46 2012 +0000

    sna: Short-circuit repeated calls to force-to-gpu
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 215dbca..72c3907 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -2013,6 +2013,11 @@ sna_pixmap_force_to_gpu(PixmapPtr pixmap, unsigned flags)
 	if (priv == NULL)
 		return NULL;
 
+	if (DAMAGE_IS_ALL(priv->gpu_damage)) {
+		DBG(("%s: GPU all-damaged\n", __FUNCTION__));
+		return priv;
+	}
+
 	/* Unlike move-to-gpu, we ignore wedged and always create the GPU bo */
 	if (priv->gpu_bo == NULL) {
 		struct sna *sna = to_sna_from_pixmap(pixmap);
commit 93415b17ef8fe69f17b70c38a90d359d378a6bcf
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Feb 21 10:43:11 2012 +0000

    uxa: Silence compiler warning for const arguments
    
    i965_video.c: In function 'gen6_create_cc_state':
    i965_video.c:1374:12: warning: passing argument 4 of
    'intel_bo_alloc_for_data' discards 'const' qualifier from pointer target
    type [enabled by default]
    
    Repeated ad nauseam.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/intel.h b/src/intel.h
index 2fcd064..1c68af0 100644
--- a/src/intel.h
+++ b/src/intel.h
@@ -609,7 +609,7 @@ intel_emit_reloc(drm_intel_bo * bo, uint32_t offset,
 static inline drm_intel_bo *intel_bo_alloc_for_data(intel_screen_private *intel,
 						    const void *data,
 						    unsigned int size,
-						    char *name)
+						    const char *name)
 {
 	drm_intel_bo *bo;
 
commit 37b3132b63d898945e39bc7adccff36e0888c284
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Feb 21 10:39:48 2012 +0000

    uxa: Remove DPRINTF stubs
    
    It wasn't being used for anything non-trivial and was throwing compiler
    warnings, so remove it.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/common.h b/src/common.h
index f9f2300..06b2192 100644
--- a/src/common.h
+++ b/src/common.h
@@ -51,20 +51,6 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define PFX __FILE__,__LINE__,__FUNCTION__
 #define FUNCTION_NAME __FUNCTION__
 
-#ifdef I830DEBUG
-#define MARKER() ErrorF("\n### %s:%d: >>> %s <<< ###\n\n", \
-			 __FILE__, __LINE__,__FUNCTION__)
-#define DPRINTF I830DPRINTF
-#else /* #ifdef I830DEBUG */
-#define MARKER()
-#define DPRINTF I830DPRINTF_stub
-static inline void
-I830DPRINTF_stub(const char *filename, int line, const char *function,
-		 const char *fmt, ...)
-{
-}
-#endif /* #ifdef I830DEBUG */
-
 #define KB(x) ((x) * 1024)
 #define MB(x) ((x) * KB(1024))
 
@@ -82,9 +68,6 @@ extern void intel_init_scrn(ScrnInfoPtr scrn);
 /* Symbol lists shared by the i810 and i830 parts. */
 extern int I830EntityIndex;
 
-extern void I830DPRINTF_stub(const char *filename, int line,
-			     const char *function, const char *fmt, ...);
-
 #ifdef _I830_H_
 #define PrintErrorState i830_dump_error_state
 #define WaitRingFunc I830WaitLpRing
diff --git a/src/intel_driver.c b/src/intel_driver.c
index 4da6ba9..2f1314f 100644
--- a/src/intel_driver.c
+++ b/src/intel_driver.c
@@ -142,23 +142,6 @@ static Bool I830EnterVT(int scrnIndex, int flags);
 /* temporary */
 extern void xf86SetCursor(ScreenPtr screen, CursorPtr pCurs, int x, int y);
 
-#ifdef I830DEBUG
-void
-I830DPRINTF(const char *filename, int line, const char *function,
-	    const char *fmt, ...)
-{
-	va_list ap;
-
-	ErrorF("\n##############################################\n"
-	       "*** In function %s, on line %d, in file %s ***\n",
-	       function, line, filename);
-	va_start(ap, fmt);
-	VErrorF(fmt, ap);
-	va_end(ap);
-	ErrorF("##############################################\n\n");
-}
-#endif /* #ifdef I830DEBUG */
-
 /* Export I830 options to i830 driver where necessary */
 const OptionInfoRec *intel_uxa_available_options(int chipid, int busid)
 {
@@ -174,8 +157,6 @@ I830LoadPalette(ScrnInfoPtr scrn, int numColors, int *indices,
 	int p;
 	uint16_t lut_r[256], lut_g[256], lut_b[256];
 
-	DPRINTF(PFX, "I830LoadPalette: numColors: %d\n", numColors);
-
 	for (p = 0; p < xf86_config->num_crtc; p++) {
 		xf86CrtcPtr crtc = xf86_config->crtc[p];
 
@@ -1037,9 +1018,6 @@ I830ScreenInit(int scrnIndex, ScreenPtr screen, int argc, char **argv)
 	if (!miSetPixmapDepths())
 		return FALSE;
 
-	DPRINTF(PFX, "assert( if(!I830EnterVT(scrnIndex, 0)) )\n");
-
-	DPRINTF(PFX, "assert( if(!fbScreenInit(screen, ...) )\n");
 	if (!fbScreenInit(screen, NULL,
 			  scrn->virtualX, scrn->virtualY,
 			  scrn->xDpi, scrn->yDpi,
@@ -1113,11 +1091,9 @@ I830ScreenInit(int scrnIndex, ScreenPtr screen, int argc, char **argv)
 	if (!xf86CrtcScreenInit(screen))
 		return FALSE;
 
-	DPRINTF(PFX, "assert( if(!miCreateDefColormap(screen)) )\n");
 	if (!miCreateDefColormap(screen))
 		return FALSE;
 
-	DPRINTF(PFX, "assert( if(!xf86HandleColormaps(screen, ...)) )\n");
 	if (!xf86HandleColormaps(screen, 256, 8, I830LoadPalette, NULL,
 				 CMAP_RELOAD_ON_MODE_SWITCH |
 				 CMAP_PALETTED_TRUECOLOR)) {
@@ -1199,8 +1175,6 @@ static void I830LeaveVT(int scrnIndex, int flags)
 	intel_screen_private *intel = intel_get_screen_private(scrn);
 	int ret;
 
-	DPRINTF(PFX, "Leave VT\n");
-
 	xf86RotateFreeShadow(scrn);
 
 	xf86_hide_cursors(scrn);
@@ -1220,8 +1194,6 @@ static Bool I830EnterVT(int scrnIndex, int flags)
 	intel_screen_private *intel = intel_get_screen_private(scrn);
 	int ret;
 
-	DPRINTF(PFX, "Enter VT\n");
-
 	ret = drmSetMaster(intel->drmSubFD);
 	if (ret) {
 		xf86DrvMsg(scrn->scrnIndex, X_WARNING,
@@ -1355,9 +1327,6 @@ static Bool I830PMEvent(int scrnIndex, pmEvent event, Bool undo)
 	ScrnInfoPtr scrn = xf86Screens[scrnIndex];
 	intel_screen_private *intel = intel_get_screen_private(scrn);
 
-	DPRINTF(PFX, "Enter VT, event %d, undo: %s\n", event,
-		BOOLTOSTRING(undo));
-
 	switch (event) {
 	case XF86_APM_SYS_SUSPEND:
 	case XF86_APM_CRITICAL_SUSPEND:	/*do we want to delay a critical suspend? */
commit c5473d5b7788476bd7e7d4a292215114dab3f627
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Feb 21 10:21:56 2012 +0000

    sna/dri: Update for AsyncSwap interface changes
    
    We now need to return TRUE/FALSE depending on whether we need to
    invalidate the drawable.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
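
The contract after this change: return TRUE when the swap was satisfied by
exchanging buffers (the drawable's backing changed, so DRI2 must invalidate
it) and FALSE when it fell back to a blit (contents copied in place, nothing
to invalidate). A skeleton of the shape, with placeholder predicates standing
in for the real exchange/flip logic:

	static Bool
	async_swap(ClientPtr client, DrawablePtr draw,
		   DRI2BufferPtr front, DRI2BufferPtr back,
		   DRI2SwapEventPtr func, void *data)
	{
		if (can_exchange_or_flip(draw, front, back)) {
			/* buffers swapped underneath the client */
			DRI2SwapComplete(client, draw, 0, 0, 0,
					 DRI2_EXCHANGE_COMPLETE, func, data);
			return TRUE;	/* drawable must be invalidated */
		}

		blit_front_from_back(draw, front, back);
		DRI2SwapComplete(client, draw, 0, 0, 0,
				 DRI2_BLIT_COMPLETE, func, data);
		return FALSE;		/* copied in place */
	}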

diff --git a/src/sna/sna_dri.c b/src/sna/sna_dri.c
index 2a25cbd..96324e6 100644
--- a/src/sna/sna_dri.c
+++ b/src/sna/sna_dri.c
@@ -1537,7 +1537,7 @@ blit_fallback:
 }
 
 #if DRI2INFOREC_VERSION >= 7
-static void
+static Bool
 sna_dri_async_swap(ClientPtr client, DrawablePtr draw,
 		   DRI2BufferPtr front, DRI2BufferPtr back,
 		   DRI2SwapEventPtr func, void *data)
@@ -1553,6 +1553,8 @@ sna_dri_async_swap(ClientPtr client, DrawablePtr draw,
 	if (pipe == -1) {
 		PixmapPtr pixmap = get_drawable_pixmap(draw);
 
+		DBG(("%s: unattached, exchange pixmaps\n", __FUNCTION__));
+
 		set_bo(pixmap, get_private(back)->bo);
 		sna_dri_exchange_attachment(front, back);
 		get_private(back)->pixmap = pixmap;
@@ -1560,18 +1562,20 @@ sna_dri_async_swap(ClientPtr client, DrawablePtr draw,
 
 		DRI2SwapComplete(client, draw, 0, 0, 0,
 				 DRI2_EXCHANGE_COMPLETE, func, data);
-		return;
+		return TRUE;
 	}
 
 	if (!can_flip(sna, draw, front, back)) {
 blit:
+		DBG(("%s: unable to flip, so blit\n", __FUNCTION__));
+
 		sna_dri_copy(sna, draw, NULL,
 			     get_private(front)->bo,
 			     get_private(back)->bo,
 			     false);
 		DRI2SwapComplete(client, draw, 0, 0, 0,
 				 DRI2_BLIT_COMPLETE, func, data);
-		return;
+		return FALSE;
 	}
 
 	bo = NULL;
@@ -1611,13 +1615,13 @@ blit:
 
 		sna_dri_reference_buffer(front);
 		sna_dri_reference_buffer(back);
-
 	} else if (info->type != DRI2_ASYNC_FLIP) {
 		/* A normal vsync'ed client is finishing, wait for it
 		 * to unpin the old framebuffer before taking over.
 		 */
 		goto blit;
 	} else {
+		DBG(("%s: pending flip, chaining next\n", __FUNCTION__));
 		if (info->next_front.name == info->front->name) {
 			name = info->cache.name;
 			bo = info->cache.bo;
@@ -1629,16 +1633,16 @@ blit:
 		get_private(info->front)->bo = get_private(info->back)->bo;
 	}
 
-	if (bo == NULL)
+	if (bo == NULL) {
+		DBG(("%s: creating new back buffer\n", __FUNCTION__));
 		bo = kgem_create_2d(&sna->kgem,
 				    draw->width,
 				    draw->height,
 				    draw->bitsPerPixel,
 				    I915_TILING_X, CREATE_EXACT);
-	get_private(info->back)->bo = bo;
-
-	if (name == 0)
 		name = kgem_bo_flink(&sna->kgem, bo);
+	}
+	get_private(info->back)->bo = bo;
 	info->back->name = name;
 
 	set_bo(sna->front, get_private(info->front)->bo);
@@ -1646,6 +1650,7 @@ blit:
 
 	DRI2SwapComplete(client, draw, 0, 0, 0,
 			 DRI2_EXCHANGE_COMPLETE, func, data);
+	return TRUE;
 }
 #endif
 
commit 738e5f896409f14ea03bf0a6f427155c8e5dca00
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Feb 21 09:29:41 2012 +0000

    sna: Fix use of RegionInit() for singular regions
    
    For a singular region, we want to use an nboxes value of 0, not 1;
    fortunately, if you pass in a box, RegionInit() ignores the value of
    nboxes. RegionInit() is a most peculiar API!
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_glyphs.c b/src/sna/sna_glyphs.c
index bef1774..91273c9 100644
--- a/src/sna/sna_glyphs.c
+++ b/src/sna/sna_glyphs.c
@@ -1026,7 +1026,7 @@ glyphs_fallback(CARD8 op,
 	DBG(("%s: (%d, %d), (%d, %d)\n",
 	     __FUNCTION__, box.x1, box.y1, box.x2, box.y2));
 
-	RegionInit(&region, &box, 1);
+	RegionInit(&region, &box, 0);
 	RegionTranslate(&region, dst->pDrawable->x, dst->pDrawable->y);
 	if (dst->pCompositeClip)
 		RegionIntersect(&region, &region, dst->pCompositeClip);
diff --git a/src/sna/sna_video.c b/src/sna/sna_video.c
index cec0473..ebc3860 100644
--- a/src/sna/sna_video.c
+++ b/src/sna/sna_video.c
@@ -157,7 +157,7 @@ sna_video_clip_helper(ScrnInfoPtr scrn,
 
 	/* For textured video, we don't actually want to clip at all. */
 	if (crtc && !video->textured) {
-		RegionInit(&crtc_region_local, &crtc_box, 1);
+		RegionInit(&crtc_region_local, &crtc_box, 0);
 		crtc_region = &crtc_region_local;
 		RegionIntersect(crtc_region, crtc_region, reg);
 	}
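
For reference, the calling convention this relies on, in a minimal sketch
(the box coordinates here are made up):

	BoxRec box = { .x1 = 0, .y1 = 0, .x2 = 64, .y2 = 64 };
	RegionRec region;

	/* With a non-NULL box, RegionInit() builds a one-box region and
	 * ignores nboxes entirely; passing 0 merely documents the intent
	 * better than a misleading 1. */
	RegionInit(&region, &box, 0);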
commit 90400e43f189d3b8a4a87531c5d8ecc1fa2bc123
Author: Zhigang Gong <zhigang.gong at linux.intel.com>
Date:   Fri Feb 17 19:50:52 2012 +0800

    uxa/glamor/dri: Should fixup the drawable pixmap.
    
    Two fixes in this commit: first, we only need to check the
    front left buffer; the other attachments do not need to be
    checked. Second, we should fix up the pixmap's drawable,
    not the original drawable.
    
    Signed-off-by: Zhigang Gong <zhigang.gong at linux.intel.com>
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/intel_dri.c b/src/intel_dri.c
index 9db52ca..c4e029d 100644
--- a/src/intel_dri.c
+++ b/src/intel_dri.c
@@ -285,14 +285,18 @@ I830DRI2CreateBuffers(DrawablePtr drawable, unsigned int *attachments,
 		pixmap = NULL;
 		if (attachments[i] == DRI2BufferFrontLeft) {
 			pixmap = get_front_buffer(drawable);
+
+			if (pixmap && intel_get_pixmap_private(pixmap) == NULL) {
+				is_glamor_pixmap = TRUE;
+				drawable = &pixmap->drawable;
+				pixmap = NULL;
+			}
 		} else if (attachments[i] == DRI2BufferStencil && pDepthPixmap) {
 			pixmap = pDepthPixmap;
 			pixmap->refcnt++;
 		}
 
-		is_glamor_pixmap = pixmap && (intel_get_pixmap_private(pixmap) == NULL);
-
-		if (pixmap == NULL || is_glamor_pixmap) {
+		if (pixmap == NULL) {
 			unsigned int hint = INTEL_CREATE_PIXMAP_DRI2;
 
 			if (intel->tiling & INTEL_TILING_3D) {
@@ -403,11 +407,17 @@ I830DRI2CreateBuffer(DrawablePtr drawable, unsigned int attachment,
 	}
 
 	pixmap = NULL;
-	if (attachment == DRI2BufferFrontLeft)
+	if (attachment == DRI2BufferFrontLeft) {
 		pixmap = get_front_buffer(drawable);
 
-	is_glamor_pixmap = pixmap && (intel_get_pixmap_private(pixmap) == NULL);
-	if (pixmap == NULL || is_glamor_pixmap) {
+		if (pixmap && intel_get_pixmap_private(pixmap) == NULL) {
+			is_glamor_pixmap = TRUE;
+			drawable = &pixmap->drawable;
+			pixmap = NULL;
+		}
+	}
+
+	if (pixmap == NULL) {
 		unsigned int hint = INTEL_CREATE_PIXMAP_DRI2;
 		int pixmap_width = drawable->width;
 		int pixmap_height = drawable->height;
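
The core of the change, condensed and annotated (a sketch, not the full
function): only the front-left attachment can resolve to a glamor-managed
pixmap, and when it does, subsequent allocation must be sized from the
pixmap's own drawable rather than the client's original drawable.

	if (attachment == DRI2BufferFrontLeft) {
		pixmap = get_front_buffer(drawable);
		if (pixmap && intel_get_pixmap_private(pixmap) == NULL) {
			is_glamor_pixmap = TRUE;
			drawable = &pixmap->drawable; /* fix up the drawable */
			pixmap = NULL;                /* force fresh allocation */
		}
	}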
commit 233767b24b84edf94340a2dd188f50227e5e1d65
Author: Zhigang Gong <zhigang.gong at linux.intel.com>
Date:   Fri Feb 17 19:50:51 2012 +0800

    uxa/glamor/dri: Enable the pageflip support on glamor.
    
    To support easy buffer exchange at the glamor layer, glamor
    added a new API, glamor_egl_exchange_buffers(), to exchange
    two pixmaps' EGL images, fbos and textures without
    recreating any of them. This simple method requires that
    there are two pixmaps, however. One exceptional case is:
    if we are using triple buffering when page flipping, we
    will have an extra back_buffer which doesn't have a pixmap
    attached to it. Then, each time we set that buffer to a
    pixmap, we have to call create_egl_textured_pixmap to
    create the corresponding EGL image, fbo and texture
    for it. This is not efficient.
    
    To fix this issue, this commit introduces a new back_pixmap
    in the intel structure to hold the back buffer and its
    corresponding glamor resources. Then we only need to do the
    lightweight buffer exchange at both the DDX and glamor layers.
    
    The new back pixmap is similar to the screen pixmap and
    needs to be handled carefully at close-screen time. As the
    glamor data structure is per-screen data and is released
    in its close-screen method, glamor's close-screen method
    must clean up both the screen pixmap's and the back
    pixmap's glamor resources. The screen pixmap is easy to
    get, but there was no good way to store the back pixmap.
    
    So glamor adds a new API, glamor_egl_create_textured_screen_ext(),
    to pass the back pixmap's pointer to the glamor layer.
    
    This commit makes us depend on glamor commit 4e58c4f,
    and we increased the required glamor version from 0.3.0 to 0.3.1.
    
    Signed-off-by: Zhigang Gong <zhigang.gong at linux.intel.com>
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/configure.ac b/configure.ac
index ae04b01..83afdb5 100644
--- a/configure.ac
+++ b/configure.ac
@@ -158,7 +158,7 @@ AC_ARG_ENABLE(glamor,
 AC_MSG_RESULT([$GLAMOR])
 AM_CONDITIONAL(GLAMOR, test x$GLAMOR != xno)
 if test "x$GLAMOR" != "xno"; then
-	PKG_CHECK_MODULES(LIBGLAMOR, [glamor >= 0.3.0])
+	PKG_CHECK_MODULES(LIBGLAMOR, [glamor >= 0.3.1])
 	PKG_CHECK_MODULES(LIBGLAMOR_EGL, [glamor-egl])
 	AC_DEFINE(USE_GLAMOR, 1, [Enable glamor acceleration])
 fi
diff --git a/src/intel.h b/src/intel.h
index 6b1a424..2fcd064 100644
--- a/src/intel.h
+++ b/src/intel.h
@@ -166,6 +166,7 @@ typedef struct intel_screen_private {
 
 	void *modes;
 	drm_intel_bo *front_buffer, *back_buffer;
+	PixmapPtr back_pixmap;
 	unsigned int back_name;
 	long front_pitch, front_tiling;
 	void *shadow_buffer;
diff --git a/src/intel_display.c b/src/intel_display.c
index d525ffa..11d0e2b 100644
--- a/src/intel_display.c
+++ b/src/intel_display.c
@@ -1369,6 +1369,7 @@ intel_xf86crtc_resize(ScrnInfoPtr scrn, int width, int height)
 	int	    i, old_width, old_height, old_pitch;
 	unsigned long pitch;
 	uint32_t tiling;
+	ScreenPtr screen;
 
 	if (scrn->virtualX == width && scrn->virtualY == height)
 		return TRUE;
@@ -1382,6 +1383,12 @@ intel_xf86crtc_resize(ScrnInfoPtr scrn, int width, int height)
 	old_fb_id = mode->fb_id;
 	old_front = intel->front_buffer;
 
+	if (intel->back_pixmap) {
+		screen = intel->back_pixmap->drawable.pScreen;
+		screen->DestroyPixmap(intel->back_pixmap);
+		intel->back_pixmap = NULL;
+	}
+
 	if (intel->back_buffer) {
 		drm_intel_bo_unreference(intel->back_buffer);
 		intel->back_buffer = NULL;
diff --git a/src/intel_dri.c b/src/intel_dri.c
index db65cc7..9db52ca 100644
--- a/src/intel_dri.c
+++ b/src/intel_dri.c
@@ -71,6 +71,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "dri2.h"
 
 #include "intel_glamor.h"
+#include "uxa.h"
 
 typedef struct {
 	int refcnt;
@@ -841,7 +842,7 @@ i830_dri2_del_frame_event(DrawablePtr drawable, DRI2FrameEventPtr info)
 }
 
 static struct intel_pixmap *
-intel_exchange_pixmap_buffers(PixmapPtr front, PixmapPtr back)
+intel_exchange_pixmap_buffers(struct intel_screen_private *intel, PixmapPtr front, PixmapPtr back)
 {
 	struct intel_pixmap *new_front, *new_back;
 
@@ -852,6 +853,7 @@ intel_exchange_pixmap_buffers(PixmapPtr front, PixmapPtr back)
 	new_front->busy = 1;
 	new_back->busy = -1;
 
+	intel_glamor_exchange_buffers(intel, front, back);
 	return new_front;
 }
 
@@ -871,13 +873,46 @@ I830DRI2ExchangeBuffers(struct intel_screen_private *intel, DRI2BufferPtr front,
 	back->name = tmp;
 
 	/* Swap pixmap bos */
-	new_front = intel_exchange_pixmap_buffers(front_priv->pixmap,
+	new_front = intel_exchange_pixmap_buffers(intel,
+						  front_priv->pixmap,
 						  back_priv->pixmap);
 	dri_bo_unreference (intel->front_buffer);
 	intel->front_buffer = new_front->bo;
 	dri_bo_reference (intel->front_buffer);
 }
 
+static PixmapPtr
+intel_glamor_create_back_pixmap(ScreenPtr screen,
+				PixmapPtr front_pixmap,
+				drm_intel_bo *back_bo)
+{
+	PixmapPtr back_pixmap;
+
+	back_pixmap = screen->CreatePixmap(screen,
+					   0,
+					   0,
+				           front_pixmap->drawable.depth,
+				           0);
+	if (back_pixmap == NULL)
+		return NULL;
+
+	screen->ModifyPixmapHeader(back_pixmap,
+				   front_pixmap->drawable.width,
+				   front_pixmap->drawable.height,
+				   0, 0,
+				   front_pixmap->devKind,
+				   0);
+	intel_set_pixmap_bo(back_pixmap, back_bo);
+	if (!intel_glamor_create_textured_pixmap(back_pixmap)) {
+		ScrnInfoPtr scrn = xf86Screens[screen->myNum];
+		xf86DrvMsg(scrn->scrnIndex, X_WARNING,
+			   "Failed to create textured back pixmap.\n");
+		screen->DestroyPixmap(back_pixmap);
+		return NULL;
+	}
+	return back_pixmap;
+}
+
 /*
  * Our internal swap routine takes care of actually exchanging, blitting, or
  * flipping buffers as necessary.
@@ -909,6 +944,10 @@ I830DRI2ScheduleFlip(struct intel_screen_private *intel,
 	}
 
 	if (intel->back_buffer == NULL) {
+		I830DRI2BufferPrivatePtr priv;
+		PixmapPtr front_pixmap, back_pixmap;
+		ScreenPtr screen;
+
 		new_back = drm_intel_bo_alloc(intel->bufmgr, "front buffer",
 					      intel->front_buffer->size, 0);
 		if (new_back == NULL)
@@ -925,6 +964,21 @@ I830DRI2ScheduleFlip(struct intel_screen_private *intel,
 
 		drm_intel_bo_disable_reuse(new_back);
 		dri_bo_flink(new_back, &intel->back_name);
+
+		if ((intel->uxa_flags & UXA_USE_GLAMOR)) {
+			screen = draw->pScreen;
+			priv = info->front->driverPrivate;
+			front_pixmap = priv->pixmap;
+
+			back_pixmap = intel_glamor_create_back_pixmap(screen,
+								      front_pixmap,
+								      new_back);
+			if (back_pixmap == NULL) {
+				drm_intel_bo_unreference(new_back);
+				return FALSE;
+			}
+			intel->back_pixmap = back_pixmap;
+		}
 	} else {
 		new_back = intel->back_buffer;
 		intel->back_buffer = NULL;
@@ -938,12 +992,17 @@ I830DRI2ScheduleFlip(struct intel_screen_private *intel,
 	info->type = DRI2_SWAP_CHAIN;
 	intel->pending_flip[info->pipe] = info;
 
-	/* Exchange the current front-buffer with the fresh bo */
-	intel->back_buffer = intel->front_buffer;
-	drm_intel_bo_reference(intel->back_buffer);
-
 	priv = info->front->driverPrivate;
-	intel_set_pixmap_bo(priv->pixmap, new_back);
+
+	/* Exchange the current front-buffer with the fresh bo */
+	if (!(intel->uxa_flags & UXA_USE_GLAMOR)) {
+		intel->back_buffer = intel->front_buffer;
+		drm_intel_bo_reference(intel->back_buffer);
+		intel_set_pixmap_bo(priv->pixmap, new_back);
+	}
+	else
+		intel_exchange_pixmap_buffers(intel, priv->pixmap,
+					      intel->back_pixmap);
 
 	tmp_name = info->front->name;
 	info->front->name = intel->back_name;
diff --git a/src/intel_driver.c b/src/intel_driver.c
index e0f0a41..4da6ba9 100644
--- a/src/intel_driver.c
+++ b/src/intel_driver.c
@@ -1268,6 +1268,11 @@ static Bool I830CloseScreen(int scrnIndex, ScreenPtr screen)
 		intel->uxa_driver = NULL;
 	}
 
+	if (intel->back_pixmap) {
+		screen->DestroyPixmap(intel->back_pixmap);
+		intel->back_pixmap = NULL;
+	}
+
 	if (intel->back_buffer) {
 		drm_intel_bo_unreference(intel->back_buffer);
 		intel->back_buffer = NULL;
diff --git a/src/intel_glamor.c b/src/intel_glamor.c
index c468d34..a868157 100644
--- a/src/intel_glamor.c
+++ b/src/intel_glamor.c
@@ -40,6 +40,16 @@
 #include "intel_glamor.h"
 #include "uxa.h"
 
+void
+intel_glamor_exchange_buffers(struct intel_screen_private *intel,
+			      PixmapPtr src,
+			      PixmapPtr dst)
+{
+	if (!(intel->uxa_flags & UXA_USE_GLAMOR))
+		return;
+	glamor_egl_exchange_buffers(src, dst);
+}
+
 Bool
 intel_glamor_create_screen_resources(ScreenPtr screen)
 {
@@ -52,9 +62,10 @@ intel_glamor_create_screen_resources(ScreenPtr screen)
 	if (!glamor_glyphs_init(screen))
 		return FALSE;
 
-	if (!glamor_egl_create_textured_screen(screen,
-					       intel->front_buffer->handle,
-					       intel->front_pitch))
+	if (!glamor_egl_create_textured_screen_ext(screen,
+						   intel->front_buffer->handle,
+						   intel->front_pitch,
+						   &intel->back_pixmap))
 		return FALSE;
 
 	return TRUE;
@@ -70,7 +81,7 @@ intel_glamor_pre_init(ScrnInfoPtr scrn)
 	/* Load glamor module */
 	if ((glamor_module = xf86LoadSubModule(scrn, GLAMOR_EGL_MODULE_NAME))) {
 		version = xf86GetModuleVersion(glamor_module);
-		if (version < MODULE_VERSION_NUMERIC(0,3,0)) {
+		if (version < MODULE_VERSION_NUMERIC(0,3,1)) {
 			xf86DrvMsg(scrn->scrnIndex, X_ERROR,
 			"Incompatible glamor version, required >= 0.3.0.\n");
 		} else {
diff --git a/src/intel_glamor.h b/src/intel_glamor.h
index 3065132..46692bc 100644
--- a/src/intel_glamor.h
+++ b/src/intel_glamor.h
@@ -44,7 +44,7 @@ Bool intel_glamor_create_textured_pixmap(PixmapPtr pixmap);
 void intel_glamor_destroy_pixmap(PixmapPtr pixmap);
 PixmapPtr intel_glamor_create_pixmap(ScreenPtr screen, int w, int h,
 				     int depth, unsigned int usage);
-
+void intel_glamor_exchange_buffers(struct intel_screen_private *intel, PixmapPtr src, PixmapPtr dst);
 #else
 
 static inline Bool intel_glamor_pre_init(ScrnInfoPtr scrn) { return TRUE; }
@@ -61,6 +61,7 @@ static inline void intel_glamor_destroy_pixmap(PixmapPtr pixmap) { }
 static inline PixmapPtr intel_glamor_create_pixmap(ScreenPtr screen, int w, int h,
 						   int depth, unsigned int usage) { return NULL; }
 
+static inline void intel_glamor_exchange_buffers(struct intel_screen_private *intel, PixmapPtr src, PixmapPtr dst) {}
 #endif
 
 #endif /* INTEL_GLAMOR_H */
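
Condensed, the glamor flip path added above looks like this (error
handling elided; names as in the patch):

	/* Allocate the back pixmap once, wrapping the spare bo with its
	 * own EGL image/fbo/texture... */
	if (intel->back_pixmap == NULL)
		intel->back_pixmap =
			intel_glamor_create_back_pixmap(screen,
							priv->pixmap,
							new_back);

	/* ...then every flip is a lightweight exchange: swap the DDX
	 * pixmap privates and let glamor swap the EGL resources instead
	 * of recreating them. */
	intel_exchange_pixmap_buffers(intel, priv->pixmap,
				      intel->back_pixmap);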
commit 2b042121c223076eeda7f442a3fd7e7ee75207d8
Author: Zhigang Gong <zhigang.gong at linux.intel.com>
Date:   Fri Feb 17 19:50:50 2012 +0800

    uxa/dri: Refine the pageflip processing.
    
    Add a new element, back_name, to the intel structure to track
    the back bo's flink name and thereby avoid calling flink every
    time. Also, in I830DRI2ExchangeBuffers, after finishing the BO
    exchange between info's front and back pixmaps, the function
    set the new front bo on the screen pixmap. But the screen
    pixmap should be the same as the front's pixmap, so this is a
    duplicate operation and can be removed.
    
    Signed-off-by: Zhigang Gong <zhigang.gong at linux.intel.com>

diff --git a/src/intel.h b/src/intel.h
index 83982f1..6b1a424 100644
--- a/src/intel.h
+++ b/src/intel.h
@@ -166,6 +166,7 @@ typedef struct intel_screen_private {
 
 	void *modes;
 	drm_intel_bo *front_buffer, *back_buffer;
+	unsigned int back_name;
 	long front_pitch, front_tiling;
 	void *shadow_buffer;
 	int shadow_stride;
diff --git a/src/intel_dri.c b/src/intel_dri.c
index d2a02c9..db65cc7 100644
--- a/src/intel_dri.c
+++ b/src/intel_dri.c
@@ -840,14 +840,27 @@ i830_dri2_del_frame_event(DrawablePtr drawable, DRI2FrameEventPtr info)
 	free(info);
 }
 
+static struct intel_pixmap *
+intel_exchange_pixmap_buffers(PixmapPtr front, PixmapPtr back)
+{
+	struct intel_pixmap *new_front, *new_back;
+
+	new_front = intel_get_pixmap_private(back);
+	new_back = intel_get_pixmap_private(front);
+	intel_set_pixmap_private(front, new_front);
+	intel_set_pixmap_private(back, new_back);
+	new_front->busy = 1;
+	new_back->busy = -1;
+
+	return new_front;
+}
+
 static void
-I830DRI2ExchangeBuffers(DrawablePtr draw, DRI2BufferPtr front, DRI2BufferPtr back)
+I830DRI2ExchangeBuffers(struct intel_screen_private *intel, DRI2BufferPtr front, DRI2BufferPtr back)
 {
 	I830DRI2BufferPrivatePtr front_priv, back_priv;
-	struct intel_pixmap *front_intel, *back_intel;
-	ScreenPtr screen;
-	intel_screen_private *intel;
 	int tmp;
+	struct intel_pixmap *new_front;
 
 	front_priv = front->driverPrivate;
 	back_priv = back->driverPrivate;
@@ -858,21 +871,11 @@ I830DRI2ExchangeBuffers(DrawablePtr draw, DRI2BufferPtr front, DRI2BufferPtr bac
 	back->name = tmp;
 
 	/* Swap pixmap bos */
-	front_intel = intel_get_pixmap_private(front_priv->pixmap);
-	back_intel = intel_get_pixmap_private(back_priv->pixmap);
-	intel_set_pixmap_private(front_priv->pixmap, back_intel);
-	intel_set_pixmap_private(back_priv->pixmap, front_intel);
-
-	screen = draw->pScreen;
-	intel = intel_get_screen_private(xf86Screens[screen->myNum]);
-
+	new_front = intel_exchange_pixmap_buffers(front_priv->pixmap,
+						  back_priv->pixmap);
 	dri_bo_unreference (intel->front_buffer);
-	intel->front_buffer = back_intel->bo;
+	intel->front_buffer = new_front->bo;
 	dri_bo_reference (intel->front_buffer);
-
-	intel_set_pixmap_private(screen->GetScreenPixmap(screen), back_intel);
-	back_intel->busy = 1;
-	front_intel->busy = -1;
 }
 
 /*
@@ -886,6 +889,7 @@ I830DRI2ScheduleFlip(struct intel_screen_private *intel,
 {
 	I830DRI2BufferPrivatePtr priv = info->back->driverPrivate;
 	drm_intel_bo *new_back, *old_back;
+	int tmp_name;
 
 	if (!intel->use_triple_buffer) {
 		if (!intel_do_pageflip(intel,
@@ -894,7 +898,7 @@ I830DRI2ScheduleFlip(struct intel_screen_private *intel,
 			return FALSE;
 
 		info->type = DRI2_SWAP;
-		I830DRI2ExchangeBuffers(draw, info->front, info->back);
+		I830DRI2ExchangeBuffers(intel, info->front, info->back);
 		return TRUE;
 	}
 
@@ -920,6 +924,7 @@ I830DRI2ScheduleFlip(struct intel_screen_private *intel,
 		}
 
 		drm_intel_bo_disable_reuse(new_back);
+		dri_bo_flink(new_back, &intel->back_name);
 	} else {
 		new_back = intel->back_buffer;
 		intel->back_buffer = NULL;
@@ -939,10 +944,13 @@ I830DRI2ScheduleFlip(struct intel_screen_private *intel,
 
 	priv = info->front->driverPrivate;
 	intel_set_pixmap_bo(priv->pixmap, new_back);
-	dri_bo_flink(new_back, &info->front->name);
+
+	tmp_name = info->front->name;
+	info->front->name = intel->back_name;
+	intel->back_name = tmp_name;
 
 	/* Then flip DRI2 pointers and update the screen pixmap */
-	I830DRI2ExchangeBuffers(draw, info->front, info->back);
+	I830DRI2ExchangeBuffers(intel, info->front, info->back);
 	DRI2SwapComplete(info->client, draw, 0, 0, 0,
 			 DRI2_EXCHANGE_COMPLETE,
 			 info->event_complete,
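
The caching pattern, extracted from the patch: the flink ioctl now happens
once at allocation, and the steady-state flip merely swaps two cached names.

	/* once, when the spare back bo is created: */
	drm_intel_bo_disable_reuse(new_back);
	dri_bo_flink(new_back, &intel->back_name);

	/* per flip, instead of another flink round trip: */
	tmp_name = info->front->name;
	info->front->name = intel->back_name;
	intel->back_name = tmp_name;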
commit 3efb0ace8e7e3fe01be3793c487176589019fc56
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Feb 21 08:09:52 2012 +0000

    sna: Split up/down edge walking in order to handle endpoint clipping
    
    In order to prevent walking upwards off the top of the pixmap when
    rendering a clipped vertical edge, we need to tweak the boundary
    conditions for the vertical edge walker.
    
    Reported-by: Clemens Eisserer <linuxhippy at gmail.com>
    References: https://bugs.freedesktop.org/show_bug.cgi?id=46261
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index c8fcc75..215dbca 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -5669,35 +5669,48 @@ X_continue2:
 				}
 
 				b->x2 = b->x1 = x;
-				b->y1 = y;
-				while (length--) {
-					e += e1;
-					y += sdy;
-					if (e >= 0) {
-						b->y2 = y;
-						if (b->y2 < b->y1) {
-							int16_t t = b->y1;
-							b->y1 = b->y2;
-							b->y2 = t;
+				if (sdy < 0) {
+					b->y2 = y + 1;
+					while (length--) {
+						e += e1;
+						y--;
+						if (e >= 0) {
+							b->y1 = y;
+							b->x2++;
+							if (++b == last_box) {
+								ret = &&Y_up_continue;
+								goto *jump;
+Y_up_continue:
+								b = box;
+							}
+							e += e3;
+							b->x2 = b->x1 = ++x;
+							b->y2 = y;
 						}
-						b->x2++;
-						if (++b == last_box) {
-							ret = &&Y_continue;
-							goto *jump;
-Y_continue:
-							b = box;
+					}
+
+					b->y1 = y;
+				} else {
+					b->y1 = y;
+					while (length--) {
+						e += e1;
+						y++;
+						if (e >= 0) {
+							b->y2 = y;
+							b->x2++;
+							if (++b == last_box) {
+								ret = &&Y_down_continue;
+								goto *jump;
+Y_down_continue:
+								b = box;
+							}
+							e += e3;
+							b->x2 = b->x1 = ++x;
+							b->y1 = y;
 						}
-						e += e3;
-						b->x2 = b->x1 = ++x;
-						b->y1 = y;
 					}
-				}
 
-				b->y2 = y + sdy;
-				if (b->y2 < b->y1) {
-					int16_t t = b->y1;
-					b->y1 = b->y2;
-					b->y2 = t;
+					b->y2 = ++y;
 				}
 				b->x2++;
 				if (++b == last_box) {
@@ -6949,35 +6962,48 @@ X_continue2:
 				}
 
 				b->x2 = b->x1 = x1;
-				b->y1 = y1;
-				while (--length) {
-					e += e1;
-					y1 += sdy;
-					if (e >= 0) {
-						b->y2 = y1;
-						if (b->y2 < b->y1) {
-							int16_t t = b->y1;
-							b->y1 = b->y2;
-							b->y2 = t;
+				if (sdy < 0) {
+					b->y2 = y1 + 1;
+					while (--length) {
+						e += e1;
+						y1--;
+						if (e >= 0) {
+							b->y1 = y1;
+							b->x2++;
+							if (++b == last_box) {
+								ret = &&Y_up_continue;
+								goto *jump;
+Y_up_continue:
+								b = box;
+							}
+							e += e3;
+							b->x2 = b->x1 = ++x1;
+							b->y2 = y1;
 						}
-						b->x2++;
-						if (++b == last_box) {
-							ret = &&Y_continue;
-							goto *jump;
-Y_continue:
-							b = box;
+					}
+
+					b->y1 = y1;
+				} else {
+					b->y1 = y1;
+					while (--length) {
+						e += e1;
+						y1++;
+						if (e >= 0) {
+							b->y2 = y1;
+							b->x2++;
+							if (++b == last_box) {
+								ret = &&Y_down_continue;
+								goto *jump;
+Y_down_continue:
+								b = box;
+							}
+							e += e3;
+							b->x2 = b->x1 = ++x1;
+							b->y1 = y1;
 						}
-						e += e3;
-						b->x2 = b->x1 = ++x1;
-						b->y1 = y1;
 					}
-				}
 
-				b->y2 = y1 + sdy;
-				if (b->y2 < b->y1) {
-					int16_t t = b->y1;
-					b->y1 = b->y2;
-					b->y2 = t;
+					b->y2 = ++y1;
 				}
 				b->x2++;
 				if (++b == last_box) {
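
The boundary convention behind the split, in outline: boxes are half-open
in y ([y1, y2)), so each walking direction keeps its own "open" end. The
old single loop stepped y by sdy and swapped y1/y2 afterwards, which for
an upward, clipped edge could place a box one row above the clip; keeping
the ends distinct avoids that. A fragmentary sketch of the shape:

	if (sdy < 0) {
		b->y2 = y + 1;	/* fixed bottom; y walks upwards */
		/* ... emit a box per run, re-anchoring b->y2 = y ... */
		b->y1 = y;	/* final top, bounded by the clipped length */
	} else {
		b->y1 = y;	/* fixed top; y walks downwards */
		/* ... emit a box per run, re-anchoring b->y1 = y ... */
		b->y2 = ++y;	/* final bottom */
	}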
commit e7362267f757b756469290f020e92a28446186b6
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon Feb 20 23:50:42 2012 +0000

    sna: Restore the shadow pixels when reducing CPU damage to all
    
    Reported-by: Joe Nahmias <joe at nahmias.net>
    References: https://bugs.freedesktop.org/show_bug.cgi?id=46346
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index aab85d8..c8fcc75 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -1215,8 +1215,17 @@ sna_drawable_move_region_to_cpu(DrawablePtr drawable,
 
 	if (sna_damage_is_all(&priv->cpu_damage,
 			      pixmap->drawable.width,
-			      pixmap->drawable.height))
+			      pixmap->drawable.height)) {
+		sna_damage_destroy(&priv->gpu_damage);
+		sna_pixmap_free_gpu(sna, priv);
+		priv->undamaged = false;
+
+		if (pixmap->devPrivate.ptr == NULL &&
+		    !sna_pixmap_alloc_cpu(sna, pixmap, priv, false))
+			return false;
+
 		goto out;
+	}
 
 	if (priv->clear)
 		return _sna_pixmap_move_to_cpu(pixmap, flags);
commit d2c760cff21fe4e808da66450f37b336cf8be619
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon Feb 20 16:30:53 2012 +0000

    sna: gen4+ suffer no penalty for changing tiling
    
    On gen4 and later, the tiling/fence constraints are fairly lax,
    requiring only page alignment of the object and its size, and so we
    can switch tiling modes without incurring a GPU stall on an active bo.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index f107f14..e7c4987 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -2403,15 +2403,28 @@ search_again:
 			assert(bo->reusable);
 			assert(bo->tiling == tiling);
 
-			if (bo->pitch < pitch) {
-				DBG(("tiled and pitch too small: tiling=%d, (want %d), pitch=%d, need %d\n",
-				     bo->tiling, tiling,
-				     bo->pitch, pitch));
-				continue;
-			}
+			if (kgem->gen < 40) {
+				if (bo->pitch < pitch) {
+					DBG(("tiled and pitch too small: tiling=%d, (want %d), pitch=%d, need %d\n",
+					     bo->tiling, tiling,
+					     bo->pitch, pitch));
+					continue;
+				}
 
-			if (bo->pitch * tiled_height > bytes(bo))
-				continue;
+				if (bo->pitch * tiled_height > bytes(bo))
+					continue;
+			} else {
+				if (num_pages(bo) < size)
+					continue;
+
+				if (bo->pitch != pitch) {
+					gem_set_tiling(kgem->fd,
+						       bo->handle,
+						       tiling, pitch);
+
+					bo->pitch = pitch;
+				}
+			}
 
 			kgem_bo_remove_from_active(kgem, bo);
 
@@ -2444,6 +2457,38 @@ search_again:
 	}
 
 	if (--retry && flags & CREATE_EXACT) {
+		if (kgem->gen >= 40) {
+			for (i = I915_TILING_NONE; i <= I915_TILING_Y; i++) {
+				if (i == tiling)
+					continue;
+
+				cache = &kgem->active[bucket][i];
+				list_for_each_entry(bo, cache, list) {
+					assert(!bo->purged);
+					assert(bo->refcnt == 0);
+					assert(bo->reusable);
+
+					if (num_pages(bo) < size)
+						continue;
+
+					if (tiling != gem_set_tiling(kgem->fd,
+								     bo->handle,
+								     tiling, pitch))
+						continue;
+
+					kgem_bo_remove_from_active(kgem, bo);
+
+					bo->unique_id = kgem_get_unique_id(kgem);
+					bo->pitch = pitch;
+					bo->tiling = tiling;
+					bo->delta = 0;
+					DBG(("  1:from active: pitch=%d, tiling=%d, handle=%d, id=%d\n",
+					     bo->pitch, bo->tiling, bo->handle, bo->unique_id));
+					return kgem_bo_reference(bo);
+				}
+			}
+		}
+
 		bucket++;
 		goto search_again;
 	}
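
Condensed, the gen4+ path in the active-cache search now reads:

	/* On gen4+ an active bo of sufficient size can simply be retiled
	 * in place via the set-tiling ioctl; older generations must keep
	 * scanning for an exact pitch match instead. */
	if (kgem->gen >= 40 && num_pages(bo) >= size) {
		if (bo->pitch != pitch) {
			gem_set_tiling(kgem->fd, bo->handle, tiling, pitch);
			bo->pitch = pitch;
		}
		/* bo is now usable for the new request */
	}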
commit a2eaa4b1ef5f7bc021ddd20a5b634ec1ad56056b
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon Feb 20 13:38:11 2012 +0000

    sna: Move sync'ing of CPU bo after allocation to first write
    
    The idea was that we could afford to allocate an active CPU bo to
    copy into using the GPU, and sync later, just before we need to
    write to the shadow pixels. Having the sync inside the allocation
    function potentially causes an unwanted stall.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index ab44cf3..aab85d8 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -321,10 +321,8 @@ sna_pixmap_alloc_cpu(struct sna *sna,
 			if (priv->ptr == NULL) {
 				kgem_bo_destroy(&sna->kgem, priv->cpu_bo);
 				priv->cpu_bo = NULL;
-			} else {
-				kgem_bo_sync__cpu(&sna->kgem, priv->cpu_bo);
+			} else
 				priv->stride = priv->cpu_bo->pitch;
-			}
 		}
 	}
 
@@ -2599,6 +2597,11 @@ sna_put_zpixmap_blt(DrawablePtr drawable, GCPtr gc, RegionPtr region,
 		DBG(("%s: applying clear [%08x]\n",
 		     __FUNCTION__, priv->clear_color));
 
+		if (priv->cpu_bo) {
+			DBG(("%s: syncing CPU bo\n", __FUNCTION__));
+			kgem_bo_sync__cpu(&sna->kgem, priv->cpu_bo);
+		}
+
 		pixman_fill(pixmap->devPrivate.ptr,
 			    pixmap->devKind/sizeof(uint32_t),
 			    pixmap->drawable.bitsPerPixel,
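
The resulting pattern: allocation leaves the CPU bo unsynchronised, and
the potentially-stalling kgem_bo_sync__cpu() call is deferred to the
first point that actually writes through the shadow pointer, e.g.
immediately before the pixman_fill() of the clear colour above.

	if (priv->cpu_bo)	/* first CPU write, not allocation */
		kgem_bo_sync__cpu(&sna->kgem, priv->cpu_bo);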
commit ed117fe558e8e2c9e8c7dffa67e8bdb458edd322
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon Feb 20 12:25:31 2012 +0000

    sna: Trim clipped lines to end within bounds
    
    References: https://bugs.freedesktop.org/show_bug.cgi?id=46261
    References: https://bugs.freedesktop.org/show_bug.cgi?id=45673
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 2088019..ab44cf3 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -5425,8 +5425,7 @@ sna_poly_zero_line_blt(DrawablePtr drawable,
 
 	PixmapPtr pixmap = get_drawable_pixmap(drawable);
 	struct sna *sna = to_sna_from_pixmap(pixmap);
-	int x2, y2, xstart, ystart;
-	int oc2, pt2_clipped = 0;
+	int x2, y2, xstart, ystart, oc2;
 	unsigned int bias = miGetZeroLineBias(drawable->pScreen);
 	bool degenerate = true;
 	struct sna_fill_op fill;
@@ -5471,9 +5470,7 @@ sna_poly_zero_line_blt(DrawablePtr drawable,
 		x2 = xstart;
 		y2 = ystart;
 		oc2 = 0;
-		MIOUTCODES(oc2, x2, y2,
-			   extents->x1, extents->y1,
-			   extents->x2, extents->y2);
+		OUTCODES(oc2, x2, y2, extents);
 
 		while (--n) {
 			int16_t sdx, sdy;
@@ -5504,9 +5501,7 @@ sna_poly_zero_line_blt(DrawablePtr drawable,
 			degenerate = false;
 
 			oc2 = 0;
-			MIOUTCODES(oc2, x2, y2,
-				   extents->x1, extents->y1,
-				   extents->x2, extents->y2);
+			OUTCODES(oc2, x2, y2, extents);
 			if (oc1 & oc2)
 				continue;
 
@@ -5554,13 +5549,12 @@ rectangle_continue:
 
 				x = x1;
 				y = y1;
-				pt2_clipped = 0;
 
 				if (oc1 | oc2) {
-					int pt1_clipped;
+					int pt1_clipped, pt2_clipped;
 
 					if (miZeroClipLine(extents->x1, extents->y1,
-							   extents->x2, extents->y2,
+							   extents->x2-1, extents->y2-1,
 							   &x, &y, &x2_clipped, &y2_clipped,
 							   adx, ady,
 							   &pt1_clipped, &pt2_clipped,
@@ -5568,12 +5562,6 @@ rectangle_continue:
 						continue;
 
 					length = abs(x2_clipped - x);
-
-					/* if we've clipped the endpoint, always draw the full length
-					 * of the segment, because then the capstyle doesn't matter
-					 */
-					if (pt2_clipped)
-						length++;
 					if (length == 0)
 						continue;
 
@@ -5636,13 +5624,12 @@ X_continue2:
 
 				x = x1;
 				y = y1;
-				pt2_clipped = 0;
 
 				if (oc1 | oc2) {
-					int pt1_clipped;
+					int pt1_clipped, pt2_clipped;
 
 					if (miZeroClipLine(extents->x1, extents->y1,
-							   extents->x2, extents->y2,
+							   extents->x2-1, extents->y2-1,
 							   &x, &y, &x2_clipped, &y2_clipped,
 							   adx, ady,
 							   &pt1_clipped, &pt2_clipped,
@@ -5650,12 +5637,6 @@ X_continue2:
 						continue;
 
 					length = abs(y2_clipped - y);
-
-					/* if we've clipped the endpoint, always draw the full length
-					 * of the segment, because then the capstyle doesn't matter
-					 */
-					if (pt2_clipped)
-						length++;
 					if (length == 0)
 						continue;
 
@@ -6807,17 +6788,9 @@ sna_poly_zero_segment_blt(DrawablePtr drawable,
 				continue;
 
 			oc1 = 0;
-			MIOUTCODES(oc1, x1, y1,
-				   extents->x1,
-				   extents->y1,
-				   extents->x2,
-				   extents->y2);
+			OUTCODES(oc1, x1, y1, extents);
 			oc2 = 0;
-			MIOUTCODES(oc2, x2, y2,
-				   extents->x1,
-				   extents->y1,
-				   extents->x2,
-				   extents->y2);
+			OUTCODES(oc2, x2, y2, extents);
 			if (oc1 & oc2)
 				continue;
 
@@ -6865,10 +6838,8 @@ rectangle_continue:
 					int pt1_clipped, pt2_clipped;
 					int x = x1, y = y1;
 
-					if (miZeroClipLine(extents->x1,
-							   extents->y1,
-							   extents->x2,
-							   extents->y2,
+					if (miZeroClipLine(extents->x1, extents->y1,
+							   extents->x2-1, extents->y2-1,
 							   &x1, &y1, &x2, &y2,
 							   adx, ady,
 							   &pt1_clipped, &pt2_clipped,
@@ -6937,10 +6908,8 @@ X_continue2:
 					int pt1_clipped, pt2_clipped;
 					int x = x1, y = y1;
 
-					if (miZeroClipLine(extents->x1,
-							   extents->y1,
-							   extents->x2,
-							   extents->y2,
+					if (miZeroClipLine(extents->x1, extents->y1,
+							   extents->x2-1, extents->y2-1,
 							   &x1, &y1, &x2, &y2,
 							   adx, ady,
 							   &pt1_clipped, &pt2_clipped,
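
The -1 here converts between conventions: region extents are half-open
(x2/y2 are one past the last pixel), while miZeroClipLine() evidently
treats its bounds as inclusive pixel coordinates, so passing the raw
extents let clipped endpoints land one pixel outside the drawable.

	int xmax = extents->x2 - 1;	/* last addressable column */
	int ymax = extents->y2 - 1;	/* last addressable row */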
commit 5973c8982b49d41a0f3964884e3af9a64dc162cc
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon Feb 20 12:09:19 2012 +0000

    sna: When reversing line-drawing direction, use the clipped endpoint
    
    Make sure we take the clipping into account if we choose to reverse the
    draw direction (to maintain left-to-right inside the box emission).
    
    References: https://bugs.freedesktop.org/show_bug.cgi?id=46261
    References: https://bugs.freedesktop.org/show_bug.cgi?id=45673
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 23fc9ad..2088019 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -5451,11 +5451,11 @@ sna_poly_zero_line_blt(DrawablePtr drawable,
 	}
 
 	jump = _jump[(damage != NULL) | !!(dx|dy) << 1];
-	DBG(("%s: [clipped] extents=(%d, %d), (%d, %d), delta=(%d, %d)\n",
-	     __FUNCTION__,
+	DBG(("%s: [clipped=%x] extents=(%d, %d), (%d, %d), delta=(%d, %d), damage=%p\n",
+	     __FUNCTION__, clipped,
 	     clip.extents.x1, clip.extents.y1,
 	     clip.extents.x2, clip.extents.y2,
-	     dx, dy));
+	     dx, dy, damage));
 
 	extents = REGION_RECTS(&clip);
 	last_extents = extents + REGION_NUM_RECTS(&clip);
@@ -5514,7 +5514,7 @@ sna_poly_zero_line_blt(DrawablePtr drawable,
 				       adx, ady, sdx, sdy,
 				       1, 1, octant);
 
-			DBG(("%s: adx=(%d, %d), sdx=(%d, %d), oc1=%d, oc2=%d\n",
+			DBG(("%s: adx=(%d, %d), sdx=(%d, %d), oc1=%x, oc2=%x\n",
 			     __FUNCTION__, adx, ady, sdx, sdy, oc1, oc2));
 			if (adx == 0 || ady == 0) {
 				if (x1 <= x2) {
@@ -5542,11 +5542,13 @@ rectangle_continue:
 					b = box;
 				}
 			} else if (adx >= ady) {
+				int x2_clipped = x2, y2_clipped = y2;
+
 				/* X-major segment */
 				e1 = ady << 1;
 				e2 = e1 - (adx << 1);
 				e  = e1 - adx;
-				length = adx;	/* don't draw endpoint in main loop */
+				length = adx;
 
 				FIXUP_ERROR(e, octant, bias);
 
@@ -5555,7 +5557,6 @@ rectangle_continue:
 				pt2_clipped = 0;
 
 				if (oc1 | oc2) {
-					int x2_clipped = x2, y2_clipped = y2;
 					int pt1_clipped;
 
 					if (miZeroClipLine(extents->x1, extents->y1,
@@ -5587,8 +5588,8 @@ rectangle_continue:
 				e  = e - e1;
 
 				if (sdx < 0) {
-					x = x2;
-					y = y2;
+					x = x2_clipped;
+					y = y2_clipped;
 					sdy = -sdy;
 				}
 
@@ -5622,11 +5623,13 @@ X_continue2:
 					b = box;
 				}
 			} else {
+				int x2_clipped = x2, y2_clipped = y2;
+
 				/* Y-major segment */
 				e1 = adx << 1;
 				e2 = e1 - (ady << 1);
 				e  = e1 - ady;
-				length  = ady;	/* don't draw endpoint in main loop */
+				length  = ady;
 
 				SetYMajorOctant(octant);
 				FIXUP_ERROR(e, octant, bias);
@@ -5636,7 +5639,6 @@ X_continue2:
 				pt2_clipped = 0;
 
 				if (oc1 | oc2) {
-					int x2_clipped = x2, y2_clipped = y2;
 					int pt1_clipped;
 
 					if (miZeroClipLine(extents->x1, extents->y1,
@@ -5668,8 +5670,8 @@ X_continue2:
 				e  = e - e1;
 
 				if (sdx < 0) {
-					x = x2;
-					y = y2;
+					x = x2_clipped;
+					y = y2_clipped;
 					sdy = -sdy;
 				}
 
commit 12b587538706e3280e5803f13bfa5b4d64244235
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon Feb 20 09:52:37 2012 +0000

    sna/dri: Ensure the domain tracking is reset when releasing bo used for swaps
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_dri.c b/src/sna/sna_dri.c
index 0689fc5..2a25cbd 100644
--- a/src/sna/sna_dri.c
+++ b/src/sna/sna_dri.c
@@ -710,6 +710,14 @@ sna_dri_add_frame_event(struct sna_dri_frame_event *info)
 }
 
 static void
+sna_dri_frame_event_release_bo(struct kgem *kgem, struct kgem_bo *bo)
+{
+	bo->needs_flush = true; /* has been used externally, reset domains */
+	bo->reusable = true; /* No longer in use by an external client */
+	kgem_bo_destroy(kgem, bo);
+}
+
+static void
 sna_dri_frame_event_info_free(struct sna_dri_frame_event *info)
 {
 	DBG(("%s: del[%p] (%p, %ld)\n", __FUNCTION__,
@@ -721,18 +729,17 @@ sna_dri_frame_event_info_free(struct sna_dri_frame_event *info)
 	_sna_dri_destroy_buffer(info->sna, info->front);
 	_sna_dri_destroy_buffer(info->sna, info->back);
 
-	if (info->old_front.bo) {
-		info->old_front.bo->reusable = true;
-		kgem_bo_destroy(&info->sna->kgem, info->old_front.bo);
-	}
-	if (info->next_front.bo) {
-		info->next_front.bo->reusable = true;
-		kgem_bo_destroy(&info->sna->kgem, info->next_front.bo);
-	}
-	if (info->cache.bo) {
-		info->cache.bo->reusable = true;
-		kgem_bo_destroy(&info->sna->kgem, info->cache.bo);
-	}
+	if (info->old_front.bo)
+		sna_dri_frame_event_release_bo(&info->sna->kgem,
+					       info->old_front.bo);
+
+	if (info->next_front.bo)
+		sna_dri_frame_event_release_bo(&info->sna->kgem,
+					       info->next_front.bo);
+
+	if (info->cache.bo)
+		sna_dri_frame_event_release_bo(&info->sna->kgem,
+					       info->cache.bo);
 
 	free(info);
 }
commit 09c1da1cfa5409766072ad1d937687c2b417d0d1
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sun Feb 19 17:50:56 2012 +0000

    sna: Correct tile sizes for Y-tiling on i915g
    
    128-byte Y-tiling wasn't introduced until the 945.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index f20a7bb..f107f14 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -731,11 +731,16 @@ static uint32_t kgem_untiled_pitch(struct kgem *kgem,
 void kgem_get_tile_size(struct kgem *kgem, int tiling,
 			int *tile_width, int *tile_height, int *tile_size)
 {
-	if (kgem->gen < 30) {
+	if (kgem->gen <= 30) {
 		if (tiling) {
 			*tile_width = 512;
-			*tile_height = 16;
-			*tile_size = 2048;
+			if (kgem->gen < 30) {
+				*tile_height = 16;
+				*tile_size = 2048;
+			} else {
+				*tile_height = 8;
+				*tile_size = 4096;
+			}
 		} else {
 			*tile_width = 1;
 			*tile_height = 1;
@@ -754,7 +759,7 @@ void kgem_get_tile_size(struct kgem *kgem, int tiling,
 		*tile_size = 4096;
 		break;
 	case I915_TILING_Y:
-		*tile_width = kgem->gen <= 30 ? 512 : 128;
+		*tile_width = 128;
 		*tile_height = 32;
 		*tile_size = 4096;
 		break;
@@ -776,10 +781,10 @@ static uint32_t kgem_surface_size(struct kgem *kgem,
 	assert(width <= MAXSHORT);
 	assert(height <= MAXSHORT);
 
-	if (kgem->gen < 30) {
+	if (kgem->gen <= 30) {
 		if (tiling) {
 			tile_width = 512;
-			tile_height = 16;
+			tile_height = kgem->gen < 30 ? 16 : 8;
 		} else {
 			tile_width = scanout ? 64 : kgem->min_alignment;
 			tile_height = 2;
@@ -795,7 +800,7 @@ static uint32_t kgem_surface_size(struct kgem *kgem,
 		tile_height = 8;
 		break;
 	case I915_TILING_Y:
-		tile_width = kgem->gen <= 30 ? 512 : 128;
+		tile_width = 128;
 		tile_height = 32;
 		break;
 	}
@@ -837,8 +842,8 @@ static uint32_t kgem_aligned_height(struct kgem *kgem,
 {
 	uint32_t tile_height;
 
-	if (kgem->gen < 30) {
-		tile_height = tiling ? 16 : 2;
+	if (kgem->gen <= 30) {
+		tile_height = tiling ? kgem->gen < 30 ? 16 : 8 : 1;
 	} else switch (tiling) {
 	default:
 	case I915_TILING_NONE:
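
The tile geometry this encodes, per generation (width in bytes, height in
rows; gen numbers follow kgem's gen-times-ten convention):

	/* gen2 (< 30):   tiled     512 x 16, 2 KiB tiles
	 * gen3 (== 30):  tiled     512 x  8, 4 KiB tiles
	 * gen3.5+ (> 30): X-tiled  512 x  8, 4 KiB tiles
	 *                 Y-tiled  128 x 32, 4 KiB tiles
	 *
	 * i.e. the narrow 128-byte Y tile only exists from the 945
	 * onwards; on the original i915 a tile is still 512 bytes wide. */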
commit 7d24f414d1df6753eec537ad8242d10cc800e99d
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Feb 16 22:01:41 2012 +0000

    sna/trapezoids: Presume that Precise mono rasterisation adheres to the spec
    
    References: https://bugs.freedesktop.org/show_bug.cgi?id=46156
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_trapezoids.c b/src/sna/sna_trapezoids.c
index b02f8f7..fafb16f 100644
--- a/src/sna/sna_trapezoids.c
+++ b/src/sna/sna_trapezoids.c
@@ -3007,7 +3007,7 @@ trapezoid_mask_converter(CARD8 op, PicturePtr src, PicturePtr dst,
 	if (NO_SCAN_CONVERTER)
 		return false;
 
-	if (dst->polyMode == PolyModePrecise) {
+	if (dst->polyMode == PolyModePrecise && !is_mono(dst, maskFormat)) {
 		DBG(("%s: fallback -- precise rasterisation requested\n",
 		     __FUNCTION__));
 		return false;
@@ -3351,7 +3351,7 @@ trapezoid_span_inplace(CARD8 op, PicturePtr src, PicturePtr dst,
 	if (NO_SCAN_CONVERTER)
 		return false;
 
-	if (dst->polyMode == PolyModePrecise) {
+	if (dst->polyMode == PolyModePrecise && !is_mono(dst, maskFormat)) {
 		DBG(("%s: fallback -- precise rasterisation requested\n",
 		     __FUNCTION__));
 		return false;
@@ -3519,7 +3519,7 @@ trapezoid_span_fallback(CARD8 op, PicturePtr src, PicturePtr dst,
 	if (NO_SCAN_CONVERTER)
 		return false;
 
-	if (dst->polyMode == PolyModePrecise) {
+	if (dst->polyMode == PolyModePrecise && !is_mono(dst, maskFormat)) {
 		DBG(("%s: fallback -- precise rasterisation requested\n",
 		     __FUNCTION__));
 		return false;
@@ -4008,7 +4008,8 @@ trap_mask_converter(PicturePtr picture,
 		return false;
 
 	/* XXX strict adherence to the Render specification */
-	if (picture->polyMode == PolyModePrecise) {
+	if (picture->polyMode == PolyModePrecise &&
+	    picture->polyEdge != PolyEdgeSharp) {
 		DBG(("%s: fallback -- precise rasterisation requested\n",
 		     __FUNCTION__));
 		return false;
@@ -4515,7 +4516,7 @@ triangles_mask_converter(CARD8 op, PicturePtr src, PicturePtr dst,
 	if (NO_SCAN_CONVERTER)
 		return false;
 
-	if (dst->polyMode == PolyModePrecise) {
+	if (dst->polyMode == PolyModePrecise && !is_mono(dst, maskFormat)) {
 		DBG(("%s: fallback -- precise rasterisation requested\n",
 		     __FUNCTION__));
 		return false;
@@ -4748,7 +4749,7 @@ tristrip_span_converter(CARD8 op, PicturePtr src, PicturePtr dst,
 		return false;
 
 	/* XXX strict adherence to the Render specification */
-	if (dst->polyMode == PolyModePrecise) {
+	if (dst->polyMode == PolyModePrecise && !is_mono(dst, maskFormat)) {
 		DBG(("%s: fallback -- precise rasterisation requested\n",
 		     __FUNCTION__));
 		return false;
commit cbf661459a6fb5d6e887d1f0301e8c814e77d0b0
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Feb 16 11:22:23 2012 +0000

    sna: Upconvert fallback trapezoids to a8
    
    Since the hardware only handles a8 without tricky emulation, and pixman
    insists on using a1 for sharp trapezoids, we need to ensure that we
    convert the a1 mask to a8 for our trapezoidal mask.
    
    More worryingly, this path should never be hit...
    
    References: https://bugs.freedesktop.org/show_bug.cgi?id=46156
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_trapezoids.c b/src/sna/sna_trapezoids.c
index 28c8a67..b02f8f7 100644
--- a/src/sna/sna_trapezoids.c
+++ b/src/sna/sna_trapezoids.c
@@ -2009,20 +2009,41 @@ trapezoids_fallback(CARD8 op, PicturePtr src, PicturePtr dst,
 		DBG(("%s: mask (%dx%d) depth=%d, format=%08x\n",
 		     __FUNCTION__, width, height, depth, format));
 		scratch = sna_pixmap_create_upload(screen,
-						   width, height, depth,
+						   width, height, 8,
 						   KGEM_BUFFER_WRITE);
 		if (!scratch)
 			return;
 
-		memset(scratch->devPrivate.ptr, 0, scratch->devKind*height);
-		image = pixman_image_create_bits(format, width, height,
-						 scratch->devPrivate.ptr,
-						 scratch->devKind);
+		if (depth < 8) {
+			image = pixman_image_create_bits(format, width, height,
+							 NULL, 0);
+		} else {
+			memset(scratch->devPrivate.ptr, 0, scratch->devKind*height);
+			image = pixman_image_create_bits(format, width, height,
+							 scratch->devPrivate.ptr,
+							 scratch->devKind);
+		}
 		if (image) {
 			for (; ntrap; ntrap--, traps++)
 				pixman_rasterize_trapezoid(image,
 							   (pixman_trapezoid_t *)traps,
 							   -bounds.x1, -bounds.y1);
+			if (depth < 8) {
+				pixman_image_t *a8;
+
+				a8 = pixman_image_create_bits(PIXMAN_a8, width, height,
+							      scratch->devPrivate.ptr,
+							      scratch->devKind);
+				if (a8) {
+					pixman_image_composite(PIXMAN_OP_SRC,
+							       image, NULL, a8,
+							       0, 0,
+							       0, 0,
+							       0, 0,
+							       width, height);
+					pixman_image_unref (a8);
+				}
+			}
 
 			pixman_image_unref(image);
 		}
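
A standalone illustration of the upconversion step (plain pixman, not
driver code; upconvert_a1_to_a8() is a made-up helper name, and dst_stride
is in bytes per pixman convention):

	#include <pixman.h>

	static pixman_image_t *
	upconvert_a1_to_a8(pixman_image_t *a1, int width, int height,
			   uint32_t *dst, int dst_stride)
	{
		pixman_image_t *a8;

		a8 = pixman_image_create_bits(PIXMAN_a8, width, height,
					      dst, dst_stride);
		if (a8 == NULL)
			return NULL;

		/* OP_SRC overwrites every destination pixel, widening
		 * each 1-bit alpha sample to 8 bits. */
		pixman_image_composite(PIXMAN_OP_SRC,
				       a1, NULL, a8,
				       0, 0, 0, 0, 0, 0,
				       width, height);
		return a8;
	}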
commit 7faac5f35ef2e439354d27df05c76cab90d98ef6
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Feb 15 17:01:18 2012 +0000

    sna/dri: Mark bo as reusable after completion of a flip-event
    
    After the flip chain is completed, any residual buffers are no longer in
    use and so available for reuse.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_dri.c b/src/sna/sna_dri.c
index c83d580..0689fc5 100644
--- a/src/sna/sna_dri.c
+++ b/src/sna/sna_dri.c
@@ -721,12 +721,18 @@ sna_dri_frame_event_info_free(struct sna_dri_frame_event *info)
 	_sna_dri_destroy_buffer(info->sna, info->front);
 	_sna_dri_destroy_buffer(info->sna, info->back);
 
-	if (info->old_front.bo)
+	if (info->old_front.bo) {
+		info->old_front.bo->reusable = true;
 		kgem_bo_destroy(&info->sna->kgem, info->old_front.bo);
-	if (info->next_front.bo)
+	}
+	if (info->next_front.bo) {
+		info->next_front.bo->reusable = true;
 		kgem_bo_destroy(&info->sna->kgem, info->next_front.bo);
-	if (info->cache.bo)
+	}
+	if (info->cache.bo) {
+		info->cache.bo->reusable = true;
 		kgem_bo_destroy(&info->sna->kgem, info->cache.bo);
+	}
 
 	free(info);
 }
commit 9c1de13135fbeef032f1aded9f66541a06cf9de4
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Feb 15 16:08:23 2012 +0000

    sna/dri: Don't attempt to change tiling if it is a no-op
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/intel_module.c b/src/intel_module.c
index d0367e5..66343f2 100644
--- a/src/intel_module.c
+++ b/src/intel_module.c
@@ -286,10 +286,8 @@ static Bool has_kernel_mode_setting(struct pci_device *dev)
 		 dev->domain, dev->bus, dev->dev, dev->func);
 
 	ret = drmCheckModesettingSupported(id);
-	if (ret) {
-		if (xf86LoadKernelModule("i915"))
-			ret = drmCheckModesettingSupported(id);
-	}
+	if (ret && xf86LoadKernelModule("i915"))
+		ret = drmCheckModesettingSupported(id);
 	/* Be nice to the user and load fbcon too */
 	if (!ret)
 		(void)xf86LoadKernelModule("fbcon");
diff --git a/src/sna/sna_dri.c b/src/sna/sna_dri.c
index d0e19cf..c83d580 100644
--- a/src/sna/sna_dri.c
+++ b/src/sna/sna_dri.c
@@ -157,7 +157,7 @@ static struct kgem_bo *sna_pixmap_set_dri(struct sna *sna,
 					  PixmapPtr pixmap)
 {
 	struct sna_pixmap *priv;
-	uint32_t tiling;
+	int tiling;
 
 	priv = sna_pixmap_force_to_gpu(pixmap, MOVE_READ | MOVE_WRITE);
 	if (priv == NULL)
@@ -167,6 +167,8 @@ static struct kgem_bo *sna_pixmap_set_dri(struct sna *sna,
 		return ref(priv->gpu_bo);
 
 	tiling = color_tiling(sna, &pixmap->drawable);
+	if (tiling < 0)
+		tiling = -tiling;
 	if (priv->gpu_bo->tiling != tiling)
 		sna_pixmap_change_tiling(pixmap, tiling);
 
commit 3b162a20a420ca91f3ded7a4f98af7f5732ef896
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Feb 15 11:58:42 2012 +0000

    Be paranoid about the definition of container_of
    
    Replace any existing definition with a correct version, since there are
    broken container_of macros floating around the xorg includes.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/intel_list.h b/src/intel_list.h
index 9187766..8e8c612 100644
--- a/src/intel_list.h
+++ b/src/intel_list.h
@@ -267,24 +267,6 @@ list_is_empty(struct list *head)
 }
 
 /**
- * Returns a pointer to the container of this list element.
- *
- * Example:
- * struct foo* f;
- * f = container_of(&foo->entry, struct foo, entry);
- * assert(f == foo);
- *
- * @param ptr Pointer to the struct list.
- * @param type Data type of the list element.
- * @param member Member name of the struct list field in the list element.
- * @return A pointer to the data struct containing the list head.
- */
-#ifndef container_of
-#define container_of(ptr, type, member) \
-	((type *)((char *)(ptr) - (char *) &((type *)0)->member))
-#endif
-
-/**
  * Alias of container_of
  */
 #define list_entry(ptr, type, member) \
@@ -397,14 +379,14 @@ static inline void list_move_tail(struct list *list, struct list *head)
 	list_add_tail(list, head);
 }
 
-#undef container_of
-#define container_of(ptr, type, member) \
-	((type *)((char *)(ptr) - (char *) &((type *)0)->member))
-
 #define list_last_entry(ptr, type, member) \
     list_entry((ptr)->prev, type, member)
 
 #endif
 
+#undef container_of
+#define container_of(ptr, type, member) \
+	((type *)((char *)(ptr) - (char *) &((type *)0)->member))
+
 #endif /* _INTEL_LIST_H_ */
 
commit 6d6016bd7ef9339f101c8667fa38d477343139ba
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon Feb 13 00:48:15 2012 +0000

    Add a missing macro for old xorg/list.h
    
    list_last_entry() needs to be defined if we are including the xorg
    list.h as opposed to our standalone variant.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/intel_list.h b/src/intel_list.h
index 2595a29..9187766 100644
--- a/src/intel_list.h
+++ b/src/intel_list.h
@@ -401,6 +401,9 @@ static inline void list_move_tail(struct list *list, struct list *head)
 #define container_of(ptr, type, member) \
 	((type *)((char *)(ptr) - (char *) &((type *)0)->member))
 
+#define list_last_entry(ptr, type, member) \
+    list_entry((ptr)->prev, type, member)
+
 #endif
 
 #endif /* _INTEL_LIST_H_ */
commit b3bfa0813f1b4c0284a0468a479c7bc124e59aeb
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sat Feb 11 20:54:18 2012 +0000

    Include a local copy of list.h
    
    In 1.11.903, list.h was renamed to xorg-list.h, with a corresponding
    change to all structures. As we carried local fixes and extended
    functionality in list.h, just create our own list.h, with a bit of
    handwaving to protect us during the brief existence of xorg/include/list.h.
    
    Reported-by: Armin K <krejzi at email.com>
    Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=45938
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/Makefile.am b/src/Makefile.am
index a632543..448a354 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -41,6 +41,7 @@ endif
 NULL:=#
 
 intel_drv_la_SOURCES = \
+	intel_list.h \
 	intel_module.c \
 	$(NULL)
 
diff --git a/src/intel.h b/src/intel.h
index 775afb6..83982f1 100644
--- a/src/intel.h
+++ b/src/intel.h
@@ -68,100 +68,12 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "i915_drm.h"
 
 #include "intel_driver.h"
+#include "intel_list.h"
 
 #if HAVE_UDEV
 #include <libudev.h>
 #endif
 
-/* XXX
- * The X server gained an *almost* identical implementation in 1.9.
- *
- * Remove this duplicate code either in 2.16 (when we can depend upon 1.9)
- * or the drivers are merged back into the xserver tree, whichever happens
- * earlier.
- */
-
-#ifndef _LIST_H_
-/* classic doubly-link circular list */
-struct list {
-	struct list *next, *prev;
-};
-
-static void
-list_init(struct list *list)
-{
-	list->next = list->prev = list;
-}
-
-static inline void
-__list_add(struct list *entry,
-	    struct list *prev,
-	    struct list *next)
-{
-	next->prev = entry;
-	entry->next = next;
-	entry->prev = prev;
-	prev->next = entry;
-}
-
-static inline void
-list_add(struct list *entry, struct list *head)
-{
-	__list_add(entry, head, head->next);
-}
-
-static inline void
-__list_del(struct list *prev, struct list *next)
-{
-	next->prev = prev;
-	prev->next = next;
-}
-
-static inline void
-list_del(struct list *entry)
-{
-	__list_del(entry->prev, entry->next);
-	list_init(entry);
-}
-
-static inline Bool
-list_is_empty(struct list *head)
-{
-	return head->next == head;
-}
-#endif
-
-/* XXX work around a broken define in list.h currently [ickle 20100713] */
-#undef container_of
-
-#ifndef container_of
-#define container_of(ptr, type, member) \
-	((type *)((char *)(ptr) - (char *) &((type *)0)->member))
-#endif
-
-#ifndef list_entry
-#define list_entry(ptr, type, member) \
-	container_of(ptr, type, member)
-#endif
-
-#ifndef list_first_entry
-#define list_first_entry(ptr, type, member) \
-	list_entry((ptr)->next, type, member)
-#endif
-
-#ifndef list_foreach
-#define list_foreach(pos, head)			\
-	for (pos = (head)->next; pos != (head);	pos = pos->next)
-#endif
-
-/* XXX list.h from xserver-1.9 uses a GCC-ism to avoid having to pass type */
-#ifndef list_foreach_entry
-#define list_foreach_entry(pos, type, head, member)		\
-	for (pos = list_entry((head)->next, type, member);\
-	     &pos->member != (head);					\
-	     pos = list_entry(pos->member.next, type, member))
-#endif
-
 /* remain compatible to xorg-server 1.6 */
 #ifndef MONITOR_EDID_COMPLETE_RAWDATA
 #define MONITOR_EDID_COMPLETE_RAWDATA EDID_COMPLETE_RAWDATA
diff --git a/src/intel_list.h b/src/intel_list.h
new file mode 100644
index 0000000..2595a29
--- /dev/null
+++ b/src/intel_list.h
@@ -0,0 +1,407 @@
+/*
+ * Copyright © 2010-2012 Intel Corporation
+ * Copyright © 2010 Francisco Jerez <currojerez at riseup.net>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+
+#ifndef _INTEL_LIST_H_
+#define _INTEL_LIST_H_
+
+#include <xorgVersion.h>
+
+#if XORG_VERSION_CURRENT < XORG_VERSION_NUMERIC(1,9,0,0,0) || XORG_VERSION_CURRENT >= XORG_VERSION_NUMERIC(1,11,99,903,0)
+
+#include <stdbool.h>
+
+/**
+ * @file Classic doubly-link circular list implementation.
+ * For real usage examples of the linked list, see the file test/list.c
+ *
+ * Example:
+ * We need to keep a list of struct foo in the parent struct bar, i.e. what
+ * we want is something like this.
+ *
+ *     struct bar {
+ *          ...
+ *          struct foo *list_of_foos; -----> struct foo {}, struct foo {}, struct foo{}
+ *          ...
+ *     }
+ *
+ * We need one list head in bar and a list element in all list_of_foos (both are of
+ * data type 'struct list').
+ *
+ *     struct bar {
+ *          ...
+ *          struct list list_of_foos;
+ *          ...
+ *     }
+ *
+ *     struct foo {
+ *          ...
+ *          struct list entry;
+ *          ...
+ *     }
+ *
+ * Now we initialize the list head:
+ *
+ *     struct bar bar;
+ *     ...
+ *     list_init(&bar.list_of_foos);
+ *
+ * Then we create the first element and add it to this list:
+ *
+ *     struct foo *foo = malloc(...);
+ *     ....
+ *     list_add(&foo->entry, &bar.list_of_foos);
+ *
+ * Repeat the above for each element you want to add to the list. Deleting
+ * works with the element itself.
+ *      list_del(&foo->entry);
+ *      free(foo);
+ *
+ * Note: calling list_del(&bar.list_of_foos) will set bar.list_of_foos to an empty
+ * list again.
+ *
+ * Looping through the list requires a 'struct foo' as iterator and the
+ * name of the field the subnodes use.
+ *
+ * struct foo *iterator;
+ * list_for_each_entry(iterator, &bar.list_of_foos, entry) {
+ *      if (iterator->something == ...)
+ *             ...
+ * }
+ *
+ * Note: You must not call list_del() on the iterator if you continue the
+ * loop. You need to run the safe for-each loop instead:
+ *
+ * struct foo *iterator, *next;
+ * list_for_each_entry_safe(iterator, next, &bar.list_of_foos, entry) {
+ *      if (...)
+ *              list_del(&iterator->entry);
+ * }
+ *
+ */
+
+/**
+ * The linkage struct for list nodes. This struct must be part of your
+ * to-be-linked struct. struct list is required for both the head of the
+ * list and for each list node.
+ *
+ * Position and name of the struct list field is irrelevant.
+ * There are no requirements that elements of a list are of the same type.
+ * There are no requirements for a list head, any struct list can be a list
+ * head.
+ */
+struct list {
+    struct list *next, *prev;
+};
+
+/**
+ * Initialize the list as an empty list.
+ *
+ * Example:
+ * list_init(&bar->list_of_foos);
+ *
+ * @param The list to initialized.
+ */
+static void
+list_init(struct list *list)
+{
+    list->next = list->prev = list;
+}
+
+static inline void
+__list_add(struct list *entry,
+	    struct list *prev,
+	    struct list *next)
+{
+    next->prev = entry;
+    entry->next = next;
+    entry->prev = prev;
+    prev->next = entry;
+}
+
+/**
+ * Insert a new element after the given list head. The new element does not
+ * need to be initialised as empty list.
+ * The list changes from:
+ *      head → some element → ...
+ * to
+ *      head → new element → older element → ...
+ *
+ * Example:
+ * struct foo *newfoo = malloc(...);
+ * list_add(&newfoo->entry, &bar->list_of_foos);
+ *
+ * @param entry The new element to prepend to the list.
+ * @param head The existing list.
+ */
+static inline void
+list_add(struct list *entry, struct list *head)
+{
+    __list_add(entry, head, head->next);
+}
+
+static inline void
+list_add_tail(struct list *entry, struct list *head)
+{
+    __list_add(entry, head->prev, head);
+}
+
+static inline void list_replace(struct list *old,
+				struct list *new)
+{
+	new->next = old->next;
+	new->next->prev = new;
+	new->prev = old->prev;
+	new->prev->next = new;
+}
+
+#define list_last_entry(ptr, type, member) \
+    list_entry((ptr)->prev, type, member)
+
+#define list_for_each(pos, head)				\
+    for (pos = (head)->next; pos != (head); pos = pos->next)
+
+/**
+ * Append a new element to the end of the list given with this list head.
+ *
+ * The list changes from:
+ *      head → some element → ... → lastelement
+ * to
+ *      head → some element → ... → lastelement → new element
+ *
+ * Example:
+ * struct foo *newfoo = malloc(...);
+ * list_append(&newfoo->entry, &bar->list_of_foos);
+ *
+ * @param entry The new element to prepend to the list.
+ * @param head The existing list.
+ */
+static inline void
+list_append(struct list *entry, struct list *head)
+{
+    __list_add(entry, head->prev, head);
+}
+
+
+static inline void
+__list_del(struct list *prev, struct list *next)
+{
+    next->prev = prev;
+    prev->next = next;
+}
+
+static inline void
+_list_del(struct list *entry)
+{
+    __list_del(entry->prev, entry->next);
+}
+
+/**
+ * Remove the element from the list it is in. Using this function will reset
+ * the pointers to/from this element so it is removed from the list. It does
+ * NOT free the element itself or manipulate it otherwise.
+ *
+ * Using list_del() on a pure list head (like in the example at the top of
+ * this file) will NOT remove the first element from
+ * the list but rather reset the list to an empty list.
+ *
+ * Example:
+ * list_del(&foo->entry);
+ *
+ * @param entry The element to remove.
+ */
+static inline void
+list_del(struct list *entry)
+{
+    _list_del(entry);
+    list_init(entry);
+}
+
+static inline void list_move(struct list *list, struct list *head)
+{
+	if (list->prev != head) {
+		_list_del(list);
+		list_add(list, head);
+	}
+}
+
+static inline void list_move_tail(struct list *list, struct list *head)
+{
+	_list_del(list);
+	list_add_tail(list, head);
+}
+
+/**
+ * Check if the list is empty.
+ *
+ * Example:
+ * list_is_empty(&bar->list_of_foos);
+ *
+ * @return True if the list is empty, False otherwise.
+ */
+static inline bool
+list_is_empty(struct list *head)
+{
+    return head->next == head;
+}
+
+/**
+ * Returns a pointer to the container of this list element.
+ *
+ * Example:
+ * struct foo* f;
+ * f = container_of(&foo->entry, struct foo, entry);
+ * assert(f == foo);
+ *
+ * @param ptr Pointer to the struct list.
+ * @param type Data type of the list element.
+ * @param member Member name of the struct list field in the list element.
+ * @return A pointer to the data struct that contains this list member.
+ */
+#ifndef container_of
+#define container_of(ptr, type, member) \
+	((type *)((char *)(ptr) - (char *) &((type *)0)->member))
+#endif
+
+/**
+ * Alias of container_of
+ */
+#define list_entry(ptr, type, member) \
+    container_of(ptr, type, member)
+
+/**
+ * Retrieve the first list entry for the given list pointer.
+ *
+ * Example:
+ * struct foo *first;
+ * first = list_first_entry(&bar->list_of_foos, struct foo, entry);
+ *
+ * @param ptr The list head
+ * @param type Data type of the list element to retrieve
+ * @param member Member name of the struct list field in the list element.
+ * @return A pointer to the first list element.
+ */
+#define list_first_entry(ptr, type, member) \
+    list_entry((ptr)->next, type, member)
+
+/**
+ * Retrieve the last list entry for the given list pointer.
+ *
+ * Example:
+ * struct foo *last;
+ * last = list_last_entry(&bar->list_of_foos, struct foo, entry);
+ *
+ * @param ptr The list head
+ * @param type Data type of the list element to retrieve
+ * @param member Member name of the struct list field in the list element.
+ * @return A pointer to the last list element.
+ */
+#define list_last_entry(ptr, type, member) \
+    list_entry((ptr)->prev, type, member)
+
+#define __container_of(ptr, sample, member)				\
+    (void *)((char *)(ptr)						\
+	     - ((char *)&(sample)->member - (char *)(sample)))
+/**
+ * Loop through the list given by head, setting pos to each struct in the
+ * list in turn.
+ *
+ * Example:
+ * struct foo *iterator;
+ * list_for_each_entry(iterator, &bar->list_of_foos, entry) {
+ *      [modify iterator]
+ * }
+ *
+ * This macro is not safe for node deletion. Use list_for_each_entry_safe
+ * instead.
+ *
+ * @param pos Iterator variable of the type of the list elements.
+ * @param head List head
+ * @param member Member name of the struct list in the list elements.
+ *
+ */
+#define list_for_each_entry(pos, head, member)				\
+    for (pos = __container_of((head)->next, pos, member);		\
+	 &pos->member != (head);					\
+	 pos = __container_of(pos->member.next, pos, member))
+
+/**
+ * Loop through the list, keeping a backup pointer to the next element. This
+ * macro allows for the deletion of the current list element while looping
+ * through the list.
+ *
+ * See list_for_each_entry for more details.
+ */
+#define list_for_each_entry_safe(pos, tmp, head, member)		\
+    for (pos = __container_of((head)->next, pos, member),		\
+	 tmp = __container_of(pos->member.next, pos, member);		\
+	 &pos->member != (head);					\
+	 pos = tmp, tmp = __container_of(pos->member.next, tmp, member))
+
+#else
+
+#include <list.h>
+
+static inline void
+list_add_tail(struct list *entry, struct list *head)
+{
+    __list_add(entry, head->prev, head);
+}
+
+static inline void
+_list_del(struct list *entry)
+{
+    __list_del(entry->prev, entry->next);
+}
+
+static inline void list_replace(struct list *old,
+				struct list *new)
+{
+	new->next = old->next;
+	new->next->prev = new;
+	new->prev = old->prev;
+	new->prev->next = new;
+}
+
+static inline void list_move(struct list *list, struct list *head)
+{
+	if (list->prev != head) {
+		_list_del(list);
+		list_add(list, head);
+	}
+}
+
+static inline void list_move_tail(struct list *list, struct list *head)
+{
+	_list_del(list);
+	list_add_tail(list, head);
+}
+
+#undef container_of
+#define container_of(ptr, type, member) \
+	((type *)((char *)(ptr) - (char *) &((type *)0)->member))
+
+#endif
+
+#endif /* _INTEL_LIST_H_ */
+
diff --git a/src/intel_uxa.c b/src/intel_uxa.c
index a11846d..0cb8df3 100644
--- a/src/intel_uxa.c
+++ b/src/intel_uxa.c
@@ -1092,9 +1092,7 @@ intel_uxa_create_pixmap(ScreenPtr screen, int w, int h, int depth,
 			else
 				aligned_h = ALIGN(h, 2);
 
-			list_foreach_entry(priv, struct intel_pixmap,
-					   &intel->in_flight,
-					   in_flight) {
+			list_for_each_entry(priv, &intel->in_flight, in_flight) {
 				if (priv->tiling != tiling)
 					continue;
 
diff --git a/src/sna/Makefile.am b/src/sna/Makefile.am
index 2809617..70afd53 100644
--- a/src/sna/Makefile.am
+++ b/src/sna/Makefile.am
@@ -18,18 +18,19 @@
 #  IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 #  CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
-AM_CFLAGS = @CWARNFLAGS@ @XORG_CFLAGS@ \
-	    @UDEV_CFLAGS@ \
-	    @DRM_CFLAGS@ \
-	    @DRI_CFLAGS@ \
-	    -I$(top_srcdir)/src \
-	    -I$(top_srcdir)/src/render_program
+AM_CFLAGS = \
+	@CWARNFLAGS@ \
+	-I$(top_srcdir)/src \
+	-I$(top_srcdir)/src/render_program \
+	@XORG_CFLAGS@ \
+	@UDEV_CFLAGS@ \
+	@DRM_CFLAGS@ \
+	@DRI_CFLAGS@ \
+	$(NULL)
 
 noinst_LTLIBRARIES = libsna.la
 libsna_la_LIBADD = @UDEV_LIBS@ -lm @DRM_LIBS@
 
-NULL:=#
-
 libsna_la_SOURCES = \
 	blt.c \
 	compiler.h \
diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 607f1c4..f20a7bb 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -51,40 +51,6 @@
 static struct kgem_bo *
 search_linear_cache(struct kgem *kgem, unsigned int num_pages, unsigned flags);
 
-static inline void _list_del(struct list *list)
-{
-	assert(list->prev->next == list);
-	assert(list->next->prev == list);
-	__list_del(list->prev, list->next);
-}
-
-static inline void list_move(struct list *list, struct list *head)
-{
-	_list_del(list);
-	list_add(list, head);
-}
-
-static inline void list_move_tail(struct list *list, struct list *head)
-{
-	_list_del(list);
-	list_add_tail(list, head);
-}
-
-static inline void list_replace(struct list *old,
-				struct list *new)
-{
-	new->next = old->next;
-	new->next->prev = new;
-	new->prev = old->prev;
-	new->prev->next = new;
-}
-
-#define list_last_entry(ptr, type, member) \
-    list_entry((ptr)->prev, type, member)
-
-#define list_for_each(pos, head)				\
-    for (pos = (head)->next; pos != (head); pos = pos->next)
-
 
 #define DBG_NO_HW 0
 #define DBG_NO_TILING 0
diff --git a/src/sna/sna.h b/src/sna/sna.h
index 73e2490..575c213 100644
--- a/src/sna/sna.h
+++ b/src/sna/sna.h
@@ -44,6 +44,7 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include <stdint.h>
 
 #include "compiler.h"
+
 #include <xf86_OSproc.h>
 #include <xf86Pci.h>
 #include <xf86Cursor.h>
@@ -109,15 +110,11 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define TEST_RENDER (TEST_ALL || 0)
 
 #include "intel_driver.h"
+#include "intel_list.h"
 #include "kgem.h"
 #include "sna_damage.h"
 #include "sna_render.h"
 
-static inline void list_add_tail(struct list *new, struct list *head)
-{
-	__list_add(new, head->prev, head);
-}
-
 #ifndef CREATE_PIXMAP_USAGE_SCRATCH_HEADER
 #define FAKE_CREATE_PIXMAP_USAGE_SCRATCH_HEADER 1
 #define CREATE_PIXMAP_USAGE_SCRATCH_HEADER (unsigned)-1
diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 2a4aaec..23fc9ad 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -815,14 +815,6 @@ static Bool sna_destroy_pixmap(PixmapPtr pixmap)
 	return fbDestroyPixmap(pixmap);
 }
 
-static inline void list_move(struct list *list, struct list *head)
-{
-	if (list->prev != head) {
-		__list_del(list->prev, list->next);
-		list_add(list, head);
-	}
-}
-
 static inline bool pixmap_inplace(struct sna *sna,
 				  PixmapPtr pixmap,
 				  struct sna_pixmap *priv)
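
A minimal, self-contained sketch of how the intel_list.h API above fits
together (illustrative only, not part of the driver; struct foo and its
entry member follow the naming used in the header's own comments):

#include <assert.h>
#include <stdlib.h>
#include "intel_list.h"

struct foo {
	int value;
	struct list entry;	/* linkage embedded in the node */
};

int main(void)
{
	struct list head;
	struct foo *pos, *next;
	int i;

	list_init(&head);

	for (i = 0; i < 3; i++) {
		struct foo *f = malloc(sizeof(*f));	/* error handling elided */
		f->value = i;
		list_add(&f->entry, &head);	/* prepends: 2, 1, 0 */
	}

	assert(!list_is_empty(&head));
	assert(list_first_entry(&head, struct foo, entry)->value == 2);

	/* The safe variant allows unlinking and freeing each node. */
	list_for_each_entry_safe(pos, next, &head, entry) {
		list_del(&pos->entry);
		free(pos);
	}
	assert(list_is_empty(&head));

	return 0;
}
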
commit 12fca2ea562f08268e75250ddeec464774a4d221
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sat Feb 11 13:31:12 2012 +0000

    sna/gen6: Prefer the render ring for copies
    
    Slower for fills, but on the current stack faster for copies, both large
    and small. Hopefully, when we write some good shaders for SNB, we will
    not only improve performance for copies but also make fills faster on
    the render ring than the blt?
    
    As the BLT copy routine is GPU bound for copywinpix10, and the RENDER
    copy routine is CPU bound and faster, I believe that we have reached the
    potential of the BLT ring and not yet saturated the GPU using the render
    copy.
    
    Note that we still do not casually switch rings, so the actual routine
    chosen will still be selected by the preceding operations, and so is
    unlikely to have any effect in practice during, for example, cairo-traces.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/gen6_render.c b/src/sna/gen6_render.c
index 146a2d1..1e99709 100644
--- a/src/sna/gen6_render.c
+++ b/src/sna/gen6_render.c
@@ -3167,7 +3167,7 @@ static inline bool prefer_blt_copy(struct sna *sna,
 				   PixmapPtr src, struct kgem_bo *src_bo,
 				   PixmapPtr dst, struct kgem_bo *dst_bo)
 {
-	return (sna->kgem.ring != KGEM_RENDER ||
+	return (sna->kgem.ring == KGEM_BLT ||
 		prefer_blt_bo(sna, src, src_bo) ||
 		prefer_blt_bo(sna, dst, dst_bo));
 }
commit f54834d5446d40b3109d90c699fbc209e9a2fc8a
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sat Feb 11 11:02:53 2012 +0000

    sna/gen6: Suppress the CS stall for the first command in the batch
    
    The batch emission serves as a full stall, so we do not need to incur a
    second before our first rendering.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/gen6_render.c b/src/sna/gen6_render.c
index 1476ff7..146a2d1 100644
--- a/src/sna/gen6_render.c
+++ b/src/sna/gen6_render.c
@@ -718,11 +718,13 @@ gen6_emit_drawing_rectangle(struct sna *sna,
 	 * BEFORE the pipe-control with a post-sync op and no write-cache
 	 * flushes.
 	 */
-	OUT_BATCH(GEN6_PIPE_CONTROL | (4 - 2));
-	OUT_BATCH(GEN6_PIPE_CONTROL_CS_STALL |
-		  GEN6_PIPE_CONTROL_STALL_AT_SCOREBOARD);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
+	if (!sna->render_state.gen6.first_state_packet) {
+		OUT_BATCH(GEN6_PIPE_CONTROL | (4 - 2));
+		OUT_BATCH(GEN6_PIPE_CONTROL_CS_STALL |
+			  GEN6_PIPE_CONTROL_STALL_AT_SCOREBOARD);
+		OUT_BATCH(0);
+		OUT_BATCH(0);
+	}
 
 	OUT_BATCH(GEN6_PIPE_CONTROL | (4 - 2));
 	OUT_BATCH(GEN6_PIPE_CONTROL_WRITE_TIME);
@@ -868,6 +870,7 @@ gen6_emit_state(struct sna *sna,
 		OUT_BATCH(0);
 		OUT_BATCH(0);
 	}
+	sna->render_state.gen6.first_state_packet = false;
 }
 
 static void gen6_magic_ca_pass(struct sna *sna,
@@ -4140,6 +4143,7 @@ gen6_render_retire(struct kgem *kgem)
 static void gen6_render_reset(struct sna *sna)
 {
 	sna->render_state.gen6.needs_invariant = TRUE;
+	sna->render_state.gen6.first_state_packet = true;
 	sna->render_state.gen6.vb_id = 0;
 	sna->render_state.gen6.ve_id = -1;
 	sna->render_state.gen6.last_primitive = -1;
diff --git a/src/sna/sna_render.h b/src/sna/sna_render.h
index a689315..7243042 100644
--- a/src/sna/sna_render.h
+++ b/src/sna/sna_render.h
@@ -405,6 +405,7 @@ struct gen6_render_state {
 	uint16_t surface_table;
 
 	Bool needs_invariant;
+	Bool first_state_packet;
 };
 
 enum {
commit e665801b5af7db59c7e436907d8514620b514fdb
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sat Feb 11 10:58:05 2012 +0000

    sna/gen7: Mention the depth-stall required before changing VS state
    
    Because one day we may actually start using VS! Copied from the addition
    of the w/a to Mesa by Kenneth Graunke.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/gen7_render.c b/src/sna/gen7_render.c
index 5740a42..5294547 100644
--- a/src/sna/gen7_render.c
+++ b/src/sna/gen7_render.c
@@ -476,6 +476,15 @@ gen7_emit_state_base_address(struct sna *sna)
 static void
 gen7_disable_vs(struct sna *sna)
 {
+	/* For future reference:
+	 * A PIPE_CONTROL with post-sync op set to 1 and a depth stall needs
+	 * to be emitted just prior to changing VS state, i.e. 3DSTATE_VS,
+	 * 3DSTATE_URB_VS, 3DSTATE_CONSTANT_VS,
+	 * 3DSTATE_BINDING_TABLE_POINTER_VS, 3DSTATE_SAMPLER_STATE_POINTER_VS.
+	 *
+	 * Here we are saved by the full flush incurred when emitting
+	 * the batchbuffer.
+	 */
 	OUT_BATCH(GEN7_3DSTATE_VS | (6 - 2));
 	OUT_BATCH(0); /* no VS kernel */
 	OUT_BATCH(0);
commit 66f80e836a42316c45df0c596a3a0b61a04f5b52
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Feb 9 14:16:17 2012 +0000

    sna: Fix retire after readback
    
    Upon reading, we encounter a serialisation point and so can retire all
    requests. However, kgem_bo_retire() wasn't correctly detecting that
    barrier and so we continued using GPU detiling, thinking the target
    was still busy.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 1c23320..607f1c4 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -306,7 +306,7 @@ static void kgem_bo_retire(struct kgem *kgem, struct kgem_bo *bo)
 	     __FUNCTION__, bo->handle, bo->domain));
 	assert(!kgem_busy(kgem, bo->handle));
 
-	if (bo->domain == DOMAIN_GPU)
+	if (bo->rq)
 		kgem_retire(kgem);
 
 	if (bo->exec == NULL) {
@@ -3561,6 +3561,7 @@ void kgem_buffer_read_sync(struct kgem *kgem, struct kgem_bo *_bo)
 {
 	struct kgem_partial_bo *bo;
 	uint32_t offset = _bo->delta, length = _bo->size.bytes;
+	int domain;
 
 	assert(_bo->io);
 	assert(_bo->exec == NULL);
@@ -3587,11 +3588,11 @@ void kgem_buffer_read_sync(struct kgem *kgem, struct kgem_bo *_bo)
 		if (IS_CPU_MAP(bo->base.map)) {
 			set_domain.read_domains = I915_GEM_DOMAIN_CPU;
 			set_domain.write_domain = I915_GEM_DOMAIN_CPU;
-			bo->base.domain = DOMAIN_CPU;
+			domain = DOMAIN_CPU;
 		} else {
 			set_domain.read_domains = I915_GEM_DOMAIN_GTT;
 			set_domain.write_domain = I915_GEM_DOMAIN_GTT;
-			bo->base.domain = DOMAIN_GTT;
+			domain = DOMAIN_GTT;
 		}
 
 		drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &set_domain);
@@ -3600,9 +3601,10 @@ void kgem_buffer_read_sync(struct kgem *kgem, struct kgem_bo *_bo)
 			 bo->base.handle, (char *)bo->mem+offset,
 			 offset, length);
 		kgem_bo_map__cpu(kgem, &bo->base);
-		bo->base.domain = DOMAIN_NONE;
+		domain = DOMAIN_NONE;
 	}
 	kgem_bo_retire(kgem, &bo->base);
+	bo->base.domain = domain;
 }
 
 uint32_t kgem_bo_get_binding(struct kgem_bo *bo, uint32_t format)
commit bf17e0b8239588e750d3e1bf54cb12b1ffa4e9f7
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Feb 8 13:15:46 2012 +0000

    sna/gen2+: Force upload rather than perform source transformations on the CPU
    
    If both the source and destination are on the CPU, then the thinking was
    it would be quicker to operate on those on the CPU rather than copy both
    to the GPU and then perform the operation. This turns out to be a false
    assumption if transformation is involved -- something to be reconsidered
    if pixman should ever be improved.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/gen2_render.c b/src/sna/gen2_render.c
index ed48ce6..64b4e7c 100644
--- a/src/sna/gen2_render.c
+++ b/src/sna/gen2_render.c
@@ -1507,9 +1507,15 @@ has_alphamap(PicturePtr p)
 }
 
 static bool
+untransformed(PicturePtr p)
+{
+	return !p->transform || pixman_transform_is_int_translate(p->transform);
+}
+
+static bool
 need_upload(PicturePtr p)
 {
-	return p->pDrawable && unattached(p->pDrawable);
+	return p->pDrawable && unattached(p->pDrawable) && untransformed(p);
 }
 
 static bool
diff --git a/src/sna/gen3_render.c b/src/sna/gen3_render.c
index fc006ac..97e5839 100644
--- a/src/sna/gen3_render.c
+++ b/src/sna/gen3_render.c
@@ -2419,9 +2419,15 @@ has_alphamap(PicturePtr p)
 }
 
 static bool
+untransformed(PicturePtr p)
+{
+	return !p->transform || pixman_transform_is_int_translate(p->transform);
+}
+
+static bool
 need_upload(PicturePtr p)
 {
-	return p->pDrawable && unattached(p->pDrawable);
+	return p->pDrawable && unattached(p->pDrawable) && untransformed(p);
 }
 
 static bool
diff --git a/src/sna/gen4_render.c b/src/sna/gen4_render.c
index 6246538..6e7d4be 100644
--- a/src/sna/gen4_render.c
+++ b/src/sna/gen4_render.c
@@ -2016,9 +2016,15 @@ has_alphamap(PicturePtr p)
 }
 
 static bool
+untransformed(PicturePtr p)
+{
+	return !p->transform || pixman_transform_is_int_translate(p->transform);
+}
+
+static bool
 need_upload(PicturePtr p)
 {
-	return p->pDrawable && unattached(p->pDrawable);
+	return p->pDrawable && unattached(p->pDrawable) && untransformed(p);
 }
 
 static bool
diff --git a/src/sna/gen5_render.c b/src/sna/gen5_render.c
index b9c7a92..7ac993c 100644
--- a/src/sna/gen5_render.c
+++ b/src/sna/gen5_render.c
@@ -2044,9 +2044,15 @@ has_alphamap(PicturePtr p)
 }
 
 static bool
+untransformed(PicturePtr p)
+{
+	return !p->transform || pixman_transform_is_int_translate(p->transform);
+}
+
+static bool
 need_upload(PicturePtr p)
 {
-	return p->pDrawable && unattached(p->pDrawable);
+	return p->pDrawable && unattached(p->pDrawable) && untransformed(p);
 }
 
 static bool
diff --git a/src/sna/gen6_render.c b/src/sna/gen6_render.c
index 17789e9..1476ff7 100644
--- a/src/sna/gen6_render.c
+++ b/src/sna/gen6_render.c
@@ -2325,9 +2325,15 @@ has_alphamap(PicturePtr p)
 }
 
 static bool
+untransformed(PicturePtr p)
+{
+	return !p->transform || pixman_transform_is_int_translate(p->transform);
+}
+
+static bool
 need_upload(PicturePtr p)
 {
-	return p->pDrawable && unattached(p->pDrawable);
+	return p->pDrawable && unattached(p->pDrawable) && untransformed(p);
 }
 
 static bool
diff --git a/src/sna/gen7_render.c b/src/sna/gen7_render.c
index 9757405..5740a42 100644
--- a/src/sna/gen7_render.c
+++ b/src/sna/gen7_render.c
@@ -2329,9 +2329,15 @@ has_alphamap(PicturePtr p)
 }
 
 static bool
+untransformed(PicturePtr p)
+{
+	return !p->transform || pixman_transform_is_int_translate(p->transform);
+}
+
+static bool
 need_upload(PicturePtr p)
 {
-	return p->pDrawable && unattached(p->pDrawable);
+	return p->pDrawable && unattached(p->pDrawable) && untransformed(p);
 }
 
 static bool
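
For illustration, a sketch of what the new untransformed() test accepts,
written against the public pixman API (a standalone example, not driver
code):

#include <pixman.h>

static void untransformed_example(void)
{
	struct pixman_transform t;

	/* A pure integer translation still counts as untransformed... */
	pixman_transform_init_translate(&t,
					pixman_int_to_fixed(5),
					pixman_int_to_fixed(-3));
	/* pixman_transform_is_int_translate(&t) -> TRUE */

	/* ...while a 2x scale does not, so such a source is now uploaded
	 * to the GPU rather than transformed on the CPU. */
	pixman_transform_init_scale(&t,
				    pixman_double_to_fixed(2.0),
				    pixman_double_to_fixed(2.0));
	/* pixman_transform_is_int_translate(&t) -> FALSE */
}
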
commit 12e150786c62da43ef1050b6051fbde41371430d
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Feb 8 09:13:27 2012 +0000

    sna: Limit max CPU bo size to prevent aperture thrashing on upload
    
    Copying between two objects that consume more than the available GATT
    space is a painful experience due to the forced use of an intermediary
    and eviction on every batch. The tiled upload paths are in comparison
    remarkably efficient, so favour their use when handling extremely large
    buffers.
    
    This reverses the previous idea in that we now prefer large GPU bo
    rather than large CPU bo, as the render pipeline is far more flexible
    for handling those than the blitter is for handling the CPU bo (at least
    for gen4+).
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 94b6c18..1c23320 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -575,6 +575,7 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, int gen)
 {
 	struct drm_i915_gem_get_aperture aperture;
 	size_t totalram;
+	unsigned half_gpu_max;
 	unsigned int i, j;
 
 	memset(kgem, 0, sizeof(*kgem));
@@ -679,7 +680,6 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, int gen)
 		kgem->min_alignment = 64;
 
 	kgem->max_object_size = 2 * kgem->aperture_total / 3;
-	kgem->max_cpu_size = kgem->max_object_size;
 	kgem->max_gpu_size = kgem->max_object_size;
 	if (!kgem->has_llc)
 		kgem->max_gpu_size = MAX_CACHE_SIZE;
@@ -691,16 +691,6 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, int gen)
 		if (kgem->max_gpu_size > kgem->aperture_low)
 			kgem->max_gpu_size = kgem->aperture_low;
 	}
-	if (kgem->max_gpu_size > kgem->max_cpu_size)
-		kgem->max_gpu_size = kgem->max_cpu_size;
-
-	kgem->max_upload_tile_size = kgem->aperture_mappable / 2;
-	if (kgem->max_upload_tile_size > kgem->max_gpu_size / 2)
-		kgem->max_upload_tile_size = kgem->max_gpu_size / 2;
-
-	kgem->max_copy_tile_size = (MAX_CACHE_SIZE + 1)/2;
-	if (kgem->max_copy_tile_size > kgem->max_gpu_size / 2)
-		kgem->max_copy_tile_size = kgem->max_gpu_size / 2;
 
 	totalram = total_ram_size();
 	if (totalram == 0) {
@@ -708,14 +698,32 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, int gen)
 		     __FUNCTION__));
 		totalram = kgem->aperture_total;
 	}
+	DBG(("%s: total ram=%ld\n", __FUNCTION__, (long)totalram));
 	if (kgem->max_object_size > totalram / 2)
 		kgem->max_object_size = totalram / 2;
-	if (kgem->max_cpu_size > totalram / 2)
-		kgem->max_cpu_size = totalram / 2;
 	if (kgem->max_gpu_size > totalram / 4)
 		kgem->max_gpu_size = totalram / 4;
 
+	half_gpu_max = kgem->max_gpu_size / 2;
+	if (kgem->gen >= 40)
+		kgem->max_cpu_size = half_gpu_max;
+	else
+		kgem->max_cpu_size = kgem->max_object_size;
+
+	kgem->max_copy_tile_size = (MAX_CACHE_SIZE + 1)/2;
+	if (kgem->max_copy_tile_size > half_gpu_max)
+		kgem->max_copy_tile_size = half_gpu_max;
+
+	if (kgem->has_llc)
+		kgem->max_upload_tile_size = kgem->max_copy_tile_size;
+	else
+		kgem->max_upload_tile_size = kgem->aperture_mappable / 4;
+	if (kgem->max_upload_tile_size > half_gpu_max)
+		kgem->max_upload_tile_size = half_gpu_max;
+
 	kgem->large_object_size = MAX_CACHE_SIZE;
+	if (kgem->large_object_size > kgem->max_cpu_size)
+		kgem->large_object_size = kgem->max_cpu_size;
 	if (kgem->large_object_size > kgem->max_gpu_size)
 		kgem->large_object_size = kgem->max_gpu_size;
 
commit 879c4982093cb6b82ab5e199c74743d3538a53ce
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Feb 7 23:45:37 2012 +0000

    sna: Check that we successfully retired an active linear buffer
    
    If we go to the trouble of running retire before searching, we may as
    well check that we retired something before proceeding to check all the
    inactive lists.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 0c2f547..94b6c18 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -1945,11 +1945,16 @@ search_linear_cache(struct kgem *kgem, unsigned int num_pages, unsigned flags)
 	if (num_pages >= MAX_CACHE_SIZE / PAGE_SIZE)
 		return NULL;
 
-	if (!use_active &&
-	    list_is_empty(inactive(kgem, num_pages)) &&
-	    !list_is_empty(active(kgem, num_pages, I915_TILING_NONE)) &&
-	    !kgem_retire(kgem))
-		return NULL;
+	if (!use_active && list_is_empty(inactive(kgem, num_pages))) {
+		if (list_is_empty(active(kgem, num_pages, I915_TILING_NONE)))
+			return NULL;
+
+		if (!kgem_retire(kgem))
+			return NULL;
+
+		if (list_is_empty(inactive(kgem, num_pages)))
+			return NULL;
+	}
 
 	if (!use_active && flags & (CREATE_CPU_MAP | CREATE_GTT_MAP)) {
 		int for_cpu = !!(flags & CREATE_CPU_MAP);
commit 2086b35ff0e4cada67efe812608b7fe18267f187
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Feb 7 21:56:29 2012 +0000

    sna: Relax must-be-blittable rules for gen4+
    
    The render pipeline is actually more flexible than the blitter for
    dealing with large surfaces and so the BLT is no longer the limiting
    factor on gen4+.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index e80eaae..0c2f547 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -799,6 +799,9 @@ static uint32_t kgem_surface_size(struct kgem *kgem,
 	uint32_t tile_width, tile_height;
 	uint32_t size;
 
+	assert(width <= MAXSHORT);
+	assert(height <= MAXSHORT);
+
 	if (kgem->gen < 30) {
 		if (tiling) {
 			tile_width = 512;
@@ -823,32 +826,26 @@ static uint32_t kgem_surface_size(struct kgem *kgem,
 		break;
 	}
 
-	/* If it is too wide for the blitter, don't even bother.  */
 	*pitch = ALIGN(width * bpp / 8, tile_width);
-	if (kgem->gen < 40) {
-		if (tiling != I915_TILING_NONE) {
-			if (*pitch > 8192)
-				return 0;
-			for (size = tile_width; size < *pitch; size <<= 1)
-				;
-			*pitch = size;
-		} else {
-			if (*pitch >= 32768)
-				return 0;
-		}
+	height = ALIGN(height, tile_height);
+	if (kgem->gen >= 40)
+		return PAGE_ALIGN(*pitch * height);
+
+	/* If it is too wide for the blitter, don't even bother.  */
+	if (tiling != I915_TILING_NONE) {
+		if (*pitch > 8192)
+			return 0;
+
+		for (size = tile_width; size < *pitch; size <<= 1)
+			;
+		*pitch = size;
 	} else {
-		int limit = 32768;
-		if (tiling)
-			limit *= 4;
-		if (*pitch >= limit)
+		if (*pitch >= 32768)
 			return 0;
 	}
-	height = ALIGN(height, tile_height);
-	if (height >= 65536)
-		return 0;
 
 	size = *pitch * height;
-	if (relaxed_fencing || tiling == I915_TILING_NONE || kgem->gen >= 40)
+	if (relaxed_fencing || tiling == I915_TILING_NONE)
 		return PAGE_ALIGN(size);
 
 	/*  We need to allocate a pot fence region for a tiled buffer. */
@@ -2233,6 +2230,9 @@ unsigned kgem_can_create_2d(struct kgem *kgem,
 	if (depth < 8 || kgem->wedged)
 		return 0;
 
+	if (width > MAXSHORT || height > MAXSHORT)
+		return 0;
+
 	size = kgem_surface_size(kgem, false, false,
 				 width, height, bpp,
 				 I915_TILING_NONE, &pitch);
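
Following the simplified gen4+ path above with assumed numbers -- a
1920x1080, 32bpp X-tiled surface, taking tile_width = 512 bytes and
tile_height = 8 rows as the tile geometry (an assumption for this sketch):

    pitch  = ALIGN(1920 * 32 / 8, 512) = ALIGN(7680, 512) = 7680
    height = ALIGN(1080, 8)            = 1080
    size   = PAGE_ALIGN(7680 * 1080)   = 8294400 bytes (2025 pages)

On gen4+ the function now returns at that point, so the blitter pitch
limits below only constrain the pre-gen4 path.
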
commit eb6058d0189d5fa9c6ddd39fcf0f72aefcf56e5e
Author: Zhigang Gong <zhigang.gong at linux.intel.com>
Date:   Thu Feb 2 11:30:57 2012 +0800

    uxa/glamor: Use a macro to specify module name.
    
    This depends upon glamor commit b5f8d, just after the 0.3.0 tag.
    
    Signed-off-by: Zhigang Gong <zhigang.gong at linux.intel.com>
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/intel_glamor.c b/src/intel_glamor.c
index 262a06a..c468d34 100644
--- a/src/intel_glamor.c
+++ b/src/intel_glamor.c
@@ -68,7 +68,7 @@ intel_glamor_pre_init(ScrnInfoPtr scrn)
 	CARD32 version;
 
 	/* Load glamor module */
-	if ((glamor_module = xf86LoadSubModule(scrn, "glamor_egl"))) {
+	if ((glamor_module = xf86LoadSubModule(scrn, GLAMOR_EGL_MODULE_NAME))) {
 		version = xf86GetModuleVersion(glamor_module);
 		if (version < MODULE_VERSION_NUMERIC(0,3,0)) {
 			xf86DrvMsg(scrn->scrnIndex, X_ERROR,
commit 1fbf1388ecf27c7187615cdf4d227831c6aa46db
Author: Zhigang Gong <zhigang.gong at linux.intel.com>
Date:   Wed Feb 1 19:47:28 2012 +0800

    uxa/glamor: Refine CloseScreen and InitScreen process.
    
    The previous version called glamor_egl_close_screen and
    glamor_egl_free_screen manually, which does not align with the
    standard process. Glamor now follows the standard method:
    
    The glamor layer and the glamor egl layer both have their own
    internal CloseScreen hooks. The correct sequence is: first
    I830CloseScreen is registered, then glamor_egl_close_screen,
    and finally glamor_close_screen. So we move intel_glamor_init
    out of intel_uxa_init and into I830ScreenInit, just after the
    registration of I830CloseScreen.
    
    As the glamor interfaces have changed, we need to check the
    glamor version when loading the glamor egl module, to make
    sure we are loading the right one. If that fails, we switch
    back to the UXA path.
    
    This depends upon glamor commit 1bc8bf tagged with version 0.3.0.
    
    Signed-off-by: Zhigang Gong <zhigang.gong at linux.intel.com>
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/configure.ac b/configure.ac
index 0fb3270..ae04b01 100644
--- a/configure.ac
+++ b/configure.ac
@@ -158,7 +158,7 @@ AC_ARG_ENABLE(glamor,
 AC_MSG_RESULT([$GLAMOR])
 AM_CONDITIONAL(GLAMOR, test x$GLAMOR != xno)
 if test "x$GLAMOR" != "xno"; then
-	PKG_CHECK_MODULES(LIBGLAMOR, [glamor])
+	PKG_CHECK_MODULES(LIBGLAMOR, [glamor >= 0.3.0])
 	PKG_CHECK_MODULES(LIBGLAMOR_EGL, [glamor-egl])
 	AC_DEFINE(USE_GLAMOR, 1, [Enable glamor acceleration])
 fi
diff --git a/src/intel_driver.c b/src/intel_driver.c
index 3e52da4..e0f0a41 100644
--- a/src/intel_driver.c
+++ b/src/intel_driver.c
@@ -1109,6 +1109,7 @@ I830ScreenInit(int scrnIndex, ScreenPtr screen, int argc, char **argv)
 	intel->CreateScreenResources = screen->CreateScreenResources;
 	screen->CreateScreenResources = i830CreateScreenResources;
 
+	intel_glamor_init(screen);
 	if (!xf86CrtcScreenInit(screen))
 		return FALSE;
 
@@ -1182,8 +1183,6 @@ static void I830FreeScreen(int scrnIndex, int flags)
 	ScrnInfoPtr scrn = xf86Screens[scrnIndex];
 	intel_screen_private *intel = intel_get_screen_private(scrn);
 
-	intel_glamor_free_screen(scrnIndex, flags);
-
 	if (intel) {
 		intel_mode_fini(intel);
 		intel_close_drm_master(intel);
diff --git a/src/intel_glamor.c b/src/intel_glamor.c
index e96daa6..262a06a 100644
--- a/src/intel_glamor.c
+++ b/src/intel_glamor.c
@@ -51,30 +51,40 @@ intel_glamor_create_screen_resources(ScreenPtr screen)
 
 	if (!glamor_glyphs_init(screen))
 		return FALSE;
+
 	if (!glamor_egl_create_textured_screen(screen,
 					       intel->front_buffer->handle,
 					       intel->front_pitch))
 		return FALSE;
+
 	return TRUE;
 }
 
 Bool
 intel_glamor_pre_init(ScrnInfoPtr scrn)
 {
-	intel_screen_private *intel;
-	intel = intel_get_screen_private(scrn);
+	intel_screen_private *intel = intel_get_screen_private(scrn);
+	pointer glamor_module;
+	CARD32 version;
 
 	/* Load glamor module */
-	if (xf86LoadSubModule(scrn, "glamor_egl") &&
-	    glamor_egl_init(scrn, intel->drmSubFD)) {
-		xf86DrvMsg(scrn->scrnIndex, X_INFO,
-			   "glamor detected, initialising\n");
-		intel->uxa_flags |= UXA_USE_GLAMOR;
-	} else {
+	if ((glamor_module = xf86LoadSubModule(scrn, "glamor_egl"))) {
+		version = xf86GetModuleVersion(glamor_module);
+		if (version < MODULE_VERSION_NUMERIC(0,3,0)) {
+			xf86DrvMsg(scrn->scrnIndex, X_ERROR,
+			"Incompatible glamor version, required >= 0.3.0.\n");
+		} else {
+			if (glamor_egl_init(scrn, intel->drmSubFD)) {
+				xf86DrvMsg(scrn->scrnIndex, X_INFO,
+					   "glamor detected, initialising egl layer.\n");
+				intel->uxa_flags = UXA_GLAMOR_EGL_INITIALIZED;
+			} else
+				xf86DrvMsg(scrn->scrnIndex, X_WARNING,
+					   "glamor detected, failed to initialize egl.\n");
+		}
+	} else
 		xf86DrvMsg(scrn->scrnIndex, X_WARNING,
 			   "glamor not available\n");
-		intel->uxa_flags &= ~UXA_USE_GLAMOR;
-	}
 
 	return TRUE;
 }
@@ -83,7 +93,13 @@ PixmapPtr
 intel_glamor_create_pixmap(ScreenPtr screen, int w, int h,
 			   int depth, unsigned int usage)
 {
-	return glamor_create_pixmap(screen, w, h, depth, usage);
+	ScrnInfoPtr scrn = xf86Screens[screen->myNum];
+	intel_screen_private *intel = intel_get_screen_private(scrn);
+
+	if (intel->uxa_flags & UXA_USE_GLAMOR)
+		return glamor_create_pixmap(screen, w, h, depth, usage);
+	else
+		return NULL;
 }
 
 Bool
@@ -145,30 +161,29 @@ intel_glamor_finish_access(PixmapPtr pixmap, uxa_access_t access)
 	return;
 }
 
-
 Bool
 intel_glamor_init(ScreenPtr screen)
 {
 	ScrnInfoPtr scrn = xf86Screens[screen->myNum];
 	intel_screen_private *intel = intel_get_screen_private(scrn);
 
-	if ((intel->uxa_flags & UXA_USE_GLAMOR) == 0)
-		return TRUE;
+	if ((intel->uxa_flags & UXA_GLAMOR_EGL_INITIALIZED) == 0)
+		goto fail;
 
-	if (!glamor_init(screen, GLAMOR_INVERTED_Y_AXIS)) {
+	if (!glamor_init(screen, GLAMOR_INVERTED_Y_AXIS | GLAMOR_USE_EGL_SCREEN)) {
 		xf86DrvMsg(scrn->scrnIndex, X_ERROR,
-			   "Failed to initialize glamor\n");
+			   "Failed to initialize glamor.\n");
 		goto fail;
 	}
 
 	if (!glamor_egl_init_textured_pixmap(screen)) {
 		xf86DrvMsg(scrn->scrnIndex, X_ERROR,
-			   "Failed to initialize textured pixmap.\n");
+			   "Failed to initialize textured pixmap of screen for glamor.\n");
 		goto fail;
 	}
 
 	intel->uxa_driver->flags |= UXA_USE_GLAMOR;
-	intel->uxa_flags = intel->uxa_driver->flags;
+	intel->uxa_flags |= intel->uxa_driver->flags;
 
 	intel->uxa_driver->finish_access = intel_glamor_finish_access;
 
@@ -177,8 +192,8 @@ intel_glamor_init(ScreenPtr screen)
 	return TRUE;
 
   fail:
-	xf86DrvMsg(scrn->scrnIndex, X_WARNING,
-		   "Use standard UXA acceleration.");
+	xf86DrvMsg(scrn->scrnIndex, X_INFO,
+		   "Use standard UXA acceleration.\n");
 	return FALSE;
 }
 
@@ -196,21 +211,10 @@ Bool
 intel_glamor_close_screen(ScreenPtr screen)
 {
 	ScrnInfoPtr scrn = xf86Screens[screen->myNum];
-	intel_screen_private * intel;
-
-	intel = intel_get_screen_private(scrn);
-	if (intel && (intel->uxa_flags & UXA_USE_GLAMOR))
-		return glamor_egl_close_screen(screen);
-	return TRUE;
-}
+	intel_screen_private *intel = intel_get_screen_private(scrn);
 
-void
-intel_glamor_free_screen(int scrnIndex, int flags)
-{
-	ScrnInfoPtr scrn = xf86Screens[scrnIndex];
-	intel_screen_private * intel;
+	if (intel->uxa_flags & UXA_USE_GLAMOR)
+		intel->uxa_flags &= ~UXA_USE_GLAMOR;
 
-	intel = intel_get_screen_private(scrn);
-	if (intel && (intel->uxa_flags & UXA_USE_GLAMOR))
-		glamor_egl_free_screen(scrnIndex, GLAMOR_EGL_EXTERNAL_BUFFER);
+	return TRUE;
 }
diff --git a/src/intel_uxa.c b/src/intel_uxa.c
index f04a2ef..a11846d 100644
--- a/src/intel_uxa.c
+++ b/src/intel_uxa.c
@@ -1391,7 +1391,5 @@ Bool intel_uxa_init(ScreenPtr screen)
 	uxa_set_fallback_debug(screen, intel->fallback_debug);
 	uxa_set_force_fallback(screen, intel->force_fallback);
 
-	intel_glamor_init(screen);
-
 	return TRUE;
 }
diff --git a/uxa/uxa.h b/uxa/uxa.h
index 66b5f1e..b8569f0 100644
--- a/uxa/uxa.h
+++ b/uxa/uxa.h
@@ -548,12 +548,18 @@ typedef struct _UxaDriver {
 /**
  * UXA_USE_GLAMOR indicates to use glamor acceleration to perform rendering.
  * And if glamor fails to accelerate the rendering, then we fall back to
- * using the CPU to do the rendering.
+ * using the CPU to do the rendering. This flag will be set only when glamor
+ * has been initialized successfully.
+ * Note: in the ddx CloseScreen, this bit needs to be cleared.
  */
 #define UXA_USE_GLAMOR			(1 << 3)
 
-/** @} */
+/* UXA_GLAMOR_EGL_INITIALIZED indicates that the glamor egl layer was
+ * initialized successfully. The UXA layer does not use this flag; the ddx
+ * needs to check it before calling glamor_init. */
+#define UXA_GLAMOR_EGL_INITIALIZED	(1 << 4)
 
+/** @} */
 /** @name UXA CreatePixmap hint flags
  * @{
  */
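
The ordering described in the commit message relies on the standard X
server CloseScreen wrapping idiom; a minimal sketch of that idiom follows
(simplified names and state, not the actual driver code):

static CloseScreenProcPtr saved_close_screen;

/* Each layer saves the current hook and installs its own, so on teardown
 * the hooks run in reverse order of registration: glamor_close_screen,
 * then glamor_egl_close_screen, then I830CloseScreen. */
static Bool layer_close_screen(int index, ScreenPtr screen)
{
	/* ... tear down this layer's state first ... */
	screen->CloseScreen = saved_close_screen;	/* unwrap */
	return screen->CloseScreen(index, screen);	/* chain down */
}

static void layer_wrap_close_screen(ScreenPtr screen)
{
	saved_close_screen = screen->CloseScreen;
	screen->CloseScreen = layer_close_screen;
}
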
commit 71d102012a9270a7c33f9ca1dcc1eabf44c55edb
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Feb 7 20:16:48 2012 +0000

    sna/gen[4-7]: Fix erroneous scale factor for partial large bo render copies
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/gen4_render.c b/src/sna/gen4_render.c
index 700a271..6246538 100644
--- a/src/sna/gen4_render.c
+++ b/src/sna/gen4_render.c
@@ -2550,8 +2550,6 @@ fallback_blt:
 	gen4_copy_bind_surfaces(sna, &tmp);
 	gen4_align_vertex(sna, &tmp);
 
-	tmp.src.scale[0] = 1.f/src->drawable.width;
-	tmp.src.scale[1] = 1.f/src->drawable.height;
 	do {
 		gen4_render_copy_one(sna, &tmp,
 				     box->x1 + src_dx, box->y1 + src_dy,
diff --git a/src/sna/gen6_render.c b/src/sna/gen6_render.c
index 213b80b..17789e9 100644
--- a/src/sna/gen6_render.c
+++ b/src/sna/gen6_render.c
@@ -3253,7 +3253,7 @@ fallback_blt:
 	if (!gen6_check_format(tmp.src.pict_format))
 		goto fallback_blt;
 
-	tmp.op = PictOpSrc;
+	tmp.op = alu == GXcopy ? PictOpSrc : PictOpClear;
 	tmp.dst.pixmap = dst;
 	tmp.dst.width  = dst->drawable.width;
 	tmp.dst.height = dst->drawable.height;
@@ -3357,8 +3357,6 @@ fallback_blt:
 	gen6_emit_copy_state(sna, &tmp);
 	gen6_align_vertex(sna, &tmp);
 
-	tmp.src.scale[0] = 1.f / src->drawable.width;
-	tmp.src.scale[1] = 1.f / src->drawable.height;
 	do {
 		float *v;
 		int n_this_time = gen6_get_rectangles(sna, &tmp, n);
diff --git a/src/sna/gen7_render.c b/src/sna/gen7_render.c
index 731d952..9757405 100644
--- a/src/sna/gen7_render.c
+++ b/src/sna/gen7_render.c
@@ -3271,8 +3271,7 @@ fallback_blt:
 
 	tmp.src.filter = SAMPLER_FILTER_NEAREST;
 	tmp.src.repeat = SAMPLER_EXTEND_NONE;
-	tmp.src.card_format =
-		gen7_get_card_format(tmp.src.pict_format);
+	tmp.src.card_format = gen7_get_card_format(tmp.src.pict_format);
 	if (too_large(src->drawable.width, src->drawable.height)) {
 		BoxRec extents = box[0];
 		int i;
@@ -3337,8 +3336,6 @@ fallback_blt:
 	gen7_emit_copy_state(sna, &tmp);
 	gen7_align_vertex(sna, &tmp);
 
-	tmp.src.scale[0] = 1.f / src->drawable.width;
-	tmp.src.scale[1] = 1.f / src->drawable.height;
 	do {
 		float *v;
 		int n_this_time = gen7_get_rectangles(sna, &tmp, n);
diff --git a/src/sna/sna_render.c b/src/sna/sna_render.c
index 676f5c7..572d6ea 100644
--- a/src/sna/sna_render.c
+++ b/src/sna/sna_render.c
@@ -831,13 +831,13 @@ sna_render_pixmap_partial(struct sna *sna,
 	box.y1 = y;
 	box.x2 = x + w;
 	box.y2 = y + h;
+	DBG(("%s: unaligned box (%d, %d), (%d, %d)\n",
+	     __FUNCTION__, box.x1, box.y1, box.x2, box.y2));
 
 	if (box.x1 < 0)
 		box.x1 = 0;
 	if (box.y1 < 0)
 		box.y1 = 0;
-	DBG(("%s: unaligned box (%d, %d), (%d, %d)\n",
-	     __FUNCTION__, box.x1, box.y1, box.x2, box.y2));
 
 	if (bo->tiling) {
 		int tile_width, tile_height, tile_size;
diff --git a/src/sna/sna_tiling.c b/src/sna/sna_tiling.c
index d491b3b..17ecaea 100644
--- a/src/sna/sna_tiling.c
+++ b/src/sna/sna_tiling.c
@@ -599,20 +599,6 @@ sna_tiling_copy_boxes(struct sna *sna, uint8_t alu,
 			if (tile.x2 > extents.x2)
 				tile.x2 = extents.x2;
 
-			p.drawable.width  = tile.x2 - tile.x1;
-			p.drawable.height = tile.y2 - tile.y1;
-
-			DBG(("%s: tile (%d, %d), (%d, %d)\n",
-			     __FUNCTION__, tile.x1, tile.y1, tile.x2, tile.y2));
-
-			tmp_bo = kgem_create_2d(&sna->kgem,
-						p.drawable.width,
-						p.drawable.height,
-						p.drawable.bitsPerPixel,
-						tiling, CREATE_TEMPORARY);
-			if (!tmp_bo)
-				goto tiled_error;
-
 			c = clipped;
 			for (i = 0; i < n; i++) {
 				*c = box[i];
@@ -628,17 +614,31 @@ sna_tiling_copy_boxes(struct sna *sna, uint8_t alu,
 				     c->y1 - tile.y1));
 				c++;
 			}
+			if (c == clipped)
+				continue;
+
+			p.drawable.width  = tile.x2 - tile.x1;
+			p.drawable.height = tile.y2 - tile.y1;
+
+			DBG(("%s: tile (%d, %d), (%d, %d)\n",
+			     __FUNCTION__, tile.x1, tile.y1, tile.x2, tile.y2));
+
+			tmp_bo = kgem_create_2d(&sna->kgem,
+						p.drawable.width,
+						p.drawable.height,
+						p.drawable.bitsPerPixel,
+						tiling, CREATE_TEMPORARY);
+			if (!tmp_bo)
+				goto tiled_error;
 
-			if (c == clipped ||
-			    (sna->render.copy_boxes(sna, GXcopy,
+			i = (sna->render.copy_boxes(sna, GXcopy,
 						    src, src_bo, src_dx, src_dy,
 						    &p, tmp_bo, -tile.x1, -tile.y1,
 						    clipped, c - clipped) &&
 			     sna->render.copy_boxes(sna, alu,
 						    &p, tmp_bo, -tile.x1, -tile.y1,
 						    dst, dst_bo, dst_dx, dst_dy,
-						    clipped, c - clipped)))
-				i = 1;
+						    clipped, c - clipped));
 
 			kgem_bo_destroy(&sna->kgem, tmp_bo);
 
commit 20cd2787e90b7bde163c0221095347a8f37024b8
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Feb 7 15:26:50 2012 +0000

    sna: Apply offsets correctly for partial src/dst in large copy boxes
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/gen2_render.c b/src/sna/gen2_render.c
index 3db9ce7..ed48ce6 100644
--- a/src/sna/gen2_render.c
+++ b/src/sna/gen2_render.c
@@ -2890,7 +2890,8 @@ fallback:
 				extents.y2 = box[i].y2;
 		}
 		if (!sna_render_composite_redirect(sna, &tmp,
-						   extents.x1, extents.y1,
+						   extents.x1 + dst_dx,
+						   extents.y1 + dst_dy,
 						   extents.x2 - extents.x1,
 						   extents.y2 - extents.y1))
 			goto fallback_tiled;
diff --git a/src/sna/gen3_render.c b/src/sna/gen3_render.c
index 0f0345c..fc006ac 100644
--- a/src/sna/gen3_render.c
+++ b/src/sna/gen3_render.c
@@ -3879,7 +3879,8 @@ fallback_blt:
 				extents.y2 = box[i].y2;
 		}
 		if (!sna_render_composite_redirect(sna, &tmp,
-						   extents.x1, extents.y1,
+						   extents.x1 + dst_dx,
+						   extents.y1 + dst_dy,
 						   extents.x2 - extents.x1,
 						   extents.y2 - extents.y1))
 			goto fallback_tiled;
diff --git a/src/sna/gen4_render.c b/src/sna/gen4_render.c
index 459b428..700a271 100644
--- a/src/sna/gen4_render.c
+++ b/src/sna/gen4_render.c
@@ -2485,7 +2485,8 @@ fallback_blt:
 				extents.y2 = box[i].y2;
 		}
 		if (!sna_render_composite_redirect(sna, &tmp,
-						   extents.x1, extents.y1,
+						   extents.x1 + dst_dx,
+						   extents.y1 + dst_dy,
 						   extents.x2 - extents.x1,
 						   extents.y2 - extents.y1))
 			goto fallback_tiled;
diff --git a/src/sna/gen5_render.c b/src/sna/gen5_render.c
index 7246ed6..b9c7a92 100644
--- a/src/sna/gen5_render.c
+++ b/src/sna/gen5_render.c
@@ -2801,7 +2801,8 @@ fallback_blt:
 		}
 
 		if (!sna_render_composite_redirect(sna, &tmp,
-						   extents.x1, extents.y1,
+						   extents.x1 + dst_dx,
+						   extents.y1 + dst_dy,
 						   extents.x2 - extents.x1,
 						   extents.y2 - extents.y1))
 			goto fallback_tiled;
diff --git a/src/sna/gen6_render.c b/src/sna/gen6_render.c
index 59372f8..213b80b 100644
--- a/src/sna/gen6_render.c
+++ b/src/sna/gen6_render.c
@@ -3278,7 +3278,8 @@ fallback_blt:
 				extents.y2 = box[i].y2;
 		}
 		if (!sna_render_composite_redirect(sna, &tmp,
-						   extents.x1, extents.y1,
+						   extents.x1 + dst_dx,
+						   extents.y1 + dst_dy,
 						   extents.x2 - extents.x1,
 						   extents.y2 - extents.y1))
 			goto fallback_tiled;
diff --git a/src/sna/gen7_render.c b/src/sna/gen7_render.c
index 8a5d95c..731d952 100644
--- a/src/sna/gen7_render.c
+++ b/src/sna/gen7_render.c
@@ -3262,7 +3262,8 @@ fallback_blt:
 				extents.y2 = box[i].y2;
 		}
 		if (!sna_render_composite_redirect(sna, &tmp,
-						   extents.x1, extents.y1,
+						   extents.x1 + dst_dx,
+						   extents.y1 + dst_dy,
 						   extents.x2 - extents.x1,
 						   extents.y2 - extents.y1))
 			goto fallback_tiled;
diff --git a/src/sna/sna_render.c b/src/sna/sna_render.c
index 1625b91..676f5c7 100644
--- a/src/sna/sna_render.c
+++ b/src/sna/sna_render.c
@@ -836,10 +836,6 @@ sna_render_pixmap_partial(struct sna *sna,
 		box.x1 = 0;
 	if (box.y1 < 0)
 		box.y1 = 0;
-	if (box.x2 > pixmap->drawable.width)
-		box.x2 = pixmap->drawable.width;
-	if (box.y2 > pixmap->drawable.height)
-		box.y2 = pixmap->drawable.height;
 	DBG(("%s: unaligned box (%d, %d), (%d, %d)\n",
 	     __FUNCTION__, box.x1, box.y1, box.x2, box.y2));
 
@@ -854,19 +850,20 @@ sna_render_pixmap_partial(struct sna *sna,
 		/* Ensure we align to an even tile row */
 		box.y1 = box.y1 & ~(2*tile_height - 1);
 		box.y2 = ALIGN(box.y2, 2*tile_height);
-		if (box.y2 > pixmap->drawable.height)
-			box.y2 = pixmap->drawable.height;
 
 		assert(tile_width * 8 >= pixmap->drawable.bitsPerPixel);
 		box.x1 = box.x1 & ~(tile_width * 8 / pixmap->drawable.bitsPerPixel - 1);
 		box.x2 = ALIGN(box.x2, tile_width * 8 / pixmap->drawable.bitsPerPixel);
-		if (box.x2 > pixmap->drawable.width)
-			box.x2 = pixmap->drawable.width;
 
 		offset = box.x1 * pixmap->drawable.bitsPerPixel / 8 / tile_width * tile_size;
 	} else
 		offset = box.x1 * pixmap->drawable.bitsPerPixel / 8;
 
+	if (box.x2 > pixmap->drawable.width)
+		box.x2 = pixmap->drawable.width;
+	if (box.y2 > pixmap->drawable.height)
+		box.y2 = pixmap->drawable.height;
+
 	w = box.x2 - box.x1;
 	h = box.y2 - box.y1;
 	DBG(("%s box=(%d, %d), (%d, %d): (%d, %d)/(%d, %d)\n", __FUNCTION__,
@@ -889,8 +886,8 @@ sna_render_pixmap_partial(struct sna *sna,
 
 	channel->bo->pitch = bo->pitch;
 
-	channel->offset[0] = x - box.x1;
-	channel->offset[1] = y - box.y1;
+	channel->offset[0] = -box.x1;
+	channel->offset[1] = -box.y1;
 	channel->scale[0] = 1.f/w;
 	channel->scale[1] = 1.f/h;
 	channel->width  = w;
commit e91b0f92b2032451d6eb8fedc9de6244c974ce21
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Feb 7 15:25:48 2012 +0000

    sna/tiling: Request Y-tiles if we know we cannot BLT to either the src or dst
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_tiling.c b/src/sna/sna_tiling.c
index c305da0..d491b3b 100644
--- a/src/sna/sna_tiling.c
+++ b/src/sna/sna_tiling.c
@@ -548,7 +548,7 @@ sna_tiling_copy_boxes(struct sna *sna, uint8_t alu,
 {
 	BoxRec extents, tile, stack[64], *clipped, *c;
 	PixmapRec p;
-	int i, step;
+	int i, step, tiling;
 	Bool ret = FALSE;
 
 	extents = box[0];
@@ -568,8 +568,13 @@ sna_tiling_copy_boxes(struct sna *sna, uint8_t alu,
 	while (step * step * 4 > sna->kgem.max_upload_tile_size)
 		step /= 2;
 
-	DBG(("%s: tiling copy, using %dx%d tiles\n",
-	     __FUNCTION__, step, step));
+	tiling = I915_TILING_X;
+	if (!kgem_bo_can_blt(&sna->kgem, src_bo) ||
+	    !kgem_bo_can_blt(&sna->kgem, dst_bo))
+		tiling = I915_TILING_Y;
+
+	DBG(("%s: tiling copy, using %dx%d %c tiles\n",
+	     __FUNCTION__, step, step, tiling == I915_TILING_X ? 'X' : 'Y'));
 
 	if (n > ARRAY_SIZE(stack)) {
 		clipped = malloc(sizeof(BoxRec) * n);
@@ -597,12 +602,14 @@ sna_tiling_copy_boxes(struct sna *sna, uint8_t alu,
 			p.drawable.width  = tile.x2 - tile.x1;
 			p.drawable.height = tile.y2 - tile.y1;
 
+			DBG(("%s: tile (%d, %d), (%d, %d)\n",
+			     __FUNCTION__, tile.x1, tile.y1, tile.x2, tile.y2));
+
 			tmp_bo = kgem_create_2d(&sna->kgem,
 						p.drawable.width,
 						p.drawable.height,
 						p.drawable.bitsPerPixel,
-						I915_TILING_X,
-					       	CREATE_TEMPORARY);
+						tiling, CREATE_TEMPORARY);
 			if (!tmp_bo)
 				goto tiled_error;
 
commit 2e55851c0844463d28f04c45f8983fb12557b00b
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Feb 7 14:35:59 2012 +0000

    sna: Mark up the temporary allocations
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_tiling.c b/src/sna/sna_tiling.c
index c6e898b..c305da0 100644
--- a/src/sna/sna_tiling.c
+++ b/src/sna/sna_tiling.c
@@ -384,7 +384,7 @@ sna_tiling_fill_boxes(struct sna *sna,
 							       tmp.drawable.width,
 							       tmp.drawable.height,
 							       dst->drawable.bitsPerPixel),
-					    CREATE_SCANOUT);
+					    CREATE_SCANOUT | CREATE_TEMPORARY);
 			if (bo) {
 				int16_t dx = this.extents.x1;
 				int16_t dy = this.extents.y1;
@@ -489,7 +489,7 @@ Bool sna_tiling_blt_copy_boxes(struct sna *sna, uint8_t alu,
 					    kgem_choose_tiling(&sna->kgem,
 							       I915_TILING_X,
 							       w, h, bpp),
-					    0);
+					    CREATE_TEMPORARY);
 			if (bo) {
 				int16_t dx = this.extents.x1;
 				int16_t dy = this.extents.y1;
@@ -601,7 +601,8 @@ sna_tiling_copy_boxes(struct sna *sna, uint8_t alu,
 						p.drawable.width,
 						p.drawable.height,
 						p.drawable.bitsPerPixel,
-						I915_TILING_X, 0);
+						I915_TILING_X,
+					       	CREATE_TEMPORARY);
 			if (!tmp_bo)
 				goto tiled_error;
 
commit 2e1b974d941dafe0ec8b01e15fb534efa67b73c6
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Feb 7 13:37:52 2012 +0000

    sna: Set the damage for render->copy_boxes to NULL before use
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/gen2_render.c b/src/sna/gen2_render.c
index 1e0c680..3db9ce7 100644
--- a/src/sna/gen2_render.c
+++ b/src/sna/gen2_render.c
@@ -2870,6 +2870,7 @@ fallback:
 	tmp.dst.format = sna_format_for_depth(dst->drawable.depth);
 	tmp.dst.bo = dst_bo;
 	tmp.dst.x = tmp.dst.y = 0;
+	tmp.damage = NULL;
 
 	sna_render_composite_redirect_init(&tmp);
 	if (too_large(tmp.dst.width, tmp.dst.height) ||
diff --git a/src/sna/gen3_render.c b/src/sna/gen3_render.c
index b9afdb9..0f0345c 100644
--- a/src/sna/gen3_render.c
+++ b/src/sna/gen3_render.c
@@ -3859,6 +3859,7 @@ fallback_blt:
 	tmp.dst.format = sna_format_for_depth(dst->drawable.depth);
 	tmp.dst.bo = dst_bo;
 	tmp.dst.x = tmp.dst.y = 0;
+	tmp.damage = NULL;
 
 	sna_render_composite_redirect_init(&tmp);
 	if (too_large(tmp.dst.width, tmp.dst.height) ||
diff --git a/src/sna/gen4_render.c b/src/sna/gen4_render.c
index 530251c..459b428 100644
--- a/src/sna/gen4_render.c
+++ b/src/sna/gen4_render.c
@@ -2466,6 +2466,7 @@ fallback_blt:
 	tmp.dst.height = dst->drawable.height;
 	tmp.dst.x = tmp.dst.y = 0;
 	tmp.dst.bo = dst_bo;
+	tmp.damage = NULL;
 
 	sna_render_composite_redirect_init(&tmp);
 	if (too_large(tmp.dst.width, tmp.dst.height)) {
diff --git a/src/sna/gen5_render.c b/src/sna/gen5_render.c
index 487d903..7246ed6 100644
--- a/src/sna/gen5_render.c
+++ b/src/sna/gen5_render.c
@@ -2781,6 +2781,7 @@ fallback_blt:
 	tmp.dst.height = dst->drawable.height;
 	tmp.dst.x = tmp.dst.y = 0;
 	tmp.dst.bo = dst_bo;
+	tmp.damage = NULL;
 
 	sna_render_composite_redirect_init(&tmp);
 	if (too_large(tmp.dst.width, tmp.dst.height)) {
diff --git a/src/sna/gen6_render.c b/src/sna/gen6_render.c
index d337b30..59372f8 100644
--- a/src/sna/gen6_render.c
+++ b/src/sna/gen6_render.c
@@ -3259,6 +3259,7 @@ fallback_blt:
 	tmp.dst.height = dst->drawable.height;
 	tmp.dst.bo = dst_bo;
 	tmp.dst.x = tmp.dst.y = 0;
+	tmp.damage = NULL;
 
 	sna_render_composite_redirect_init(&tmp);
 	if (too_large(tmp.dst.width, tmp.dst.height)) {
diff --git a/src/sna/gen7_render.c b/src/sna/gen7_render.c
index 2e9948d..8a5d95c 100644
--- a/src/sna/gen7_render.c
+++ b/src/sna/gen7_render.c
@@ -3243,6 +3243,7 @@ fallback_blt:
 	tmp.dst.height = dst->drawable.height;
 	tmp.dst.bo = dst_bo;
 	tmp.dst.x = tmp.dst.y = 0;
+	tmp.damage = NULL;
 
 	sna_render_composite_redirect_init(&tmp);
 	if (too_large(tmp.dst.width, tmp.dst.height)) {
commit 68c7529b937103001642e828eecb34f297c78ea4
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Feb 7 13:32:20 2012 +0000

    sna: Handle tile alignment for untiled large bo more carefully
    
    We ended up trying to align the upper bound to zero as the integer
    division of the tile width by the pixel size was zero.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/gen6_render.c b/src/sna/gen6_render.c
index c4d2301..d337b30 100644
--- a/src/sna/gen6_render.c
+++ b/src/sna/gen6_render.c
@@ -3285,8 +3285,7 @@ fallback_blt:
 
 	tmp.src.filter = SAMPLER_FILTER_NEAREST;
 	tmp.src.repeat = SAMPLER_EXTEND_NONE;
-	tmp.src.card_format =
-		gen6_get_card_format(tmp.src.pict_format);
+	tmp.src.card_format = gen6_get_card_format(tmp.src.pict_format);
 	if (too_large(src->drawable.width, src->drawable.height)) {
 		BoxRec extents = box[0];
 		int i;
@@ -3307,8 +3306,10 @@ fallback_blt:
 					       extents.x1 + src_dx,
 					       extents.y1 + src_dy,
 					       extents.x2 - extents.x1,
-					       extents.y2 - extents.y1))
+					       extents.y2 - extents.y1)) {
+			DBG(("%s: unable to extract partial pixmap\n", __FUNCTION__));
 			goto fallback_tiled_dst;
+		}
 	} else {
 		tmp.src.bo = kgem_bo_reference(src_bo);
 		tmp.src.width  = src->drawable.width;
@@ -3336,8 +3337,11 @@ fallback_blt:
 	kgem_set_mode(&sna->kgem, KGEM_RENDER);
 	if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL)) {
 		kgem_submit(&sna->kgem);
-		if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL))
+		if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL)) {
+			DBG(("%s: too large for a single operation\n",
+			     __FUNCTION__));
 			goto fallback_tiled_src;
+		}
 		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
 	}
 
diff --git a/src/sna/sna_render.c b/src/sna/sna_render.c
index 3309157..1625b91 100644
--- a/src/sna/sna_render.c
+++ b/src/sna/sna_render.c
@@ -819,10 +819,10 @@ sna_render_pixmap_partial(struct sna *sna,
 			  int16_t w, int16_t h)
 {
 	BoxRec box;
-	int tile_width, tile_height, tile_size;
 	int offset;
 
-	DBG(("%s (%d, %d)x(%d, %d)\n", __FUNCTION__, x, y, w, h));
+	DBG(("%s (%d, %d)x(%d, %d), pitch %d, max %d\n",
+	     __FUNCTION__, x, y, w, h, bo->pitch, sna->render.max_3d_pitch));
 
 	if (bo->pitch > sna->render.max_3d_pitch)
 		return false;
@@ -840,20 +840,32 @@ sna_render_pixmap_partial(struct sna *sna,
 		box.x2 = pixmap->drawable.width;
 	if (box.y2 > pixmap->drawable.height)
 		box.y2 = pixmap->drawable.height;
+	DBG(("%s: unaligned box (%d, %d), (%d, %d)\n",
+	     __FUNCTION__, box.x1, box.y1, box.x2, box.y2));
 
-	kgem_get_tile_size(&sna->kgem, bo->tiling,
-			   &tile_width, &tile_height, &tile_size);
+	if (bo->tiling) {
+		int tile_width, tile_height, tile_size;
 
-	/* Ensure we align to an even tile row */
-	box.y1 = box.y1 & ~(2*tile_height - 1);
-	box.y2 = ALIGN(box.y2, 2*tile_height);
-	if (box.y2 > pixmap->drawable.height)
-		box.y2 = pixmap->drawable.height;
+		kgem_get_tile_size(&sna->kgem, bo->tiling,
+				   &tile_width, &tile_height, &tile_size);
+		DBG(("%s: tile size for tiling %d: %dx%d, size=%d\n",
+		     __FUNCTION__, bo->tiling, tile_width, tile_height, tile_size));
 
-	box.x1 = box.x1 & ~(tile_width * 8 / pixmap->drawable.bitsPerPixel - 1);
-	box.x2 = ALIGN(box.x2, tile_width * 8 / pixmap->drawable.bitsPerPixel);
-	if (box.x2 > pixmap->drawable.width)
-		box.x2 = pixmap->drawable.width;
+		/* Ensure we align to an even tile row */
+		box.y1 = box.y1 & ~(2*tile_height - 1);
+		box.y2 = ALIGN(box.y2, 2*tile_height);
+		if (box.y2 > pixmap->drawable.height)
+			box.y2 = pixmap->drawable.height;
+
+		assert(tile_width * 8 >= pixmap->drawable.bitsPerPixel);
+		box.x1 = box.x1 & ~(tile_width * 8 / pixmap->drawable.bitsPerPixel - 1);
+		box.x2 = ALIGN(box.x2, tile_width * 8 / pixmap->drawable.bitsPerPixel);
+		if (box.x2 > pixmap->drawable.width)
+			box.x2 = pixmap->drawable.width;
+
+		offset = box.x1 * pixmap->drawable.bitsPerPixel / 8 / tile_width * tile_size;
+	} else
+		offset = box.x1 * pixmap->drawable.bitsPerPixel / 8;
 
 	w = box.x2 - box.x1;
 	h = box.y2 - box.y1;
@@ -862,11 +874,13 @@ sna_render_pixmap_partial(struct sna *sna,
 	     pixmap->drawable.width, pixmap->drawable.height));
 	if (w <= 0 || h <= 0 ||
 	    w > sna->render.max_3d_size ||
-	    h > sna->render.max_3d_size)
+	    h > sna->render.max_3d_size) {
+		DBG(("%s: box too large (%dx%d) for 3D pipeline (max %d)\n",
+		    __FUNCTION__, w, h, sna->render.max_3d_size));
 		return false;
+	}
 
 	/* How many tiles across are we? */
-	offset = box.x1 * pixmap->drawable.bitsPerPixel / 8 / tile_width * tile_size;
 	channel->bo = kgem_create_proxy(bo,
 					box.y1 * bo->pitch + offset,
 					h * bo->pitch);
@@ -895,27 +909,11 @@ sna_render_picture_partial(struct sna *sna,
 	struct kgem_bo *bo = NULL;
 	PixmapPtr pixmap = get_drawable_pixmap(picture->pDrawable);
 	BoxRec box;
-	int tile_width, tile_height, tile_size;
 	int offset;
 
 	DBG(("%s (%d, %d)x(%d, %d) [dst=(%d, %d)]\n",
 	     __FUNCTION__, x, y, w, h, dst_x, dst_y));
 
-	if (use_cpu_bo(sna, pixmap, &box)) {
-		if (!sna_pixmap_move_to_cpu(pixmap, MOVE_READ))
-			return 0;
-
-		bo = sna_pixmap(pixmap)->cpu_bo;
-	} else {
-		if (!sna_pixmap_force_to_gpu(pixmap, MOVE_READ))
-			return 0;
-
-		bo = sna_pixmap(pixmap)->gpu_bo;
-	}
-
-	if (bo->pitch > sna->render.max_3d_pitch)
-		return 0;
-
 	box.x1 = x;
 	box.y1 = y;
 	box.x2 = x + w;
@@ -951,19 +949,41 @@ sna_render_picture_partial(struct sna *sna,
 		}
 	}
 
-	kgem_get_tile_size(&sna->kgem, bo->tiling,
-			   &tile_width, &tile_height, &tile_size);
+	if (use_cpu_bo(sna, pixmap, &box)) {
+		if (!sna_pixmap_move_to_cpu(pixmap, MOVE_READ))
+			return 0;
 
-	/* Ensure we align to an even tile row */
-	box.y1 = box.y1 & ~(2*tile_height - 1);
-	box.y2 = ALIGN(box.y2, 2*tile_height);
-	if (box.y2 > pixmap->drawable.height)
-		box.y2 = pixmap->drawable.height;
+		bo = sna_pixmap(pixmap)->cpu_bo;
+	} else {
+		if (!sna_pixmap_force_to_gpu(pixmap, MOVE_READ))
+			return 0;
 
-	box.x1 = box.x1 & ~(tile_width * 8 / pixmap->drawable.bitsPerPixel - 1);
-	box.x2 = ALIGN(box.x2, tile_width * 8 / pixmap->drawable.bitsPerPixel);
-	if (box.x2 > pixmap->drawable.width)
-		box.x2 = pixmap->drawable.width;
+		bo = sna_pixmap(pixmap)->gpu_bo;
+	}
+
+	if (bo->pitch > sna->render.max_3d_pitch)
+		return 0;
+
+	if (bo->tiling) {
+		int tile_width, tile_height, tile_size;
+
+		kgem_get_tile_size(&sna->kgem, bo->tiling,
+				   &tile_width, &tile_height, &tile_size);
+
+		/* Ensure we align to an even tile row */
+		box.y1 = box.y1 & ~(2*tile_height - 1);
+		box.y2 = ALIGN(box.y2, 2*tile_height);
+		if (box.y2 > pixmap->drawable.height)
+			box.y2 = pixmap->drawable.height;
+
+		box.x1 = box.x1 & ~(tile_width * 8 / pixmap->drawable.bitsPerPixel - 1);
+		box.x2 = ALIGN(box.x2, tile_width * 8 / pixmap->drawable.bitsPerPixel);
+		if (box.x2 > pixmap->drawable.width)
+			box.x2 = pixmap->drawable.width;
+
+		offset = box.x1 * pixmap->drawable.bitsPerPixel / 8 / tile_width * tile_size;
+	} else
+		offset = box.x1 * pixmap->drawable.bitsPerPixel / 8;
 
 	w = box.x2 - box.x1;
 	h = box.y2 - box.y1;
@@ -976,7 +996,6 @@ sna_render_picture_partial(struct sna *sna,
 		return 0;
 
 	/* How many tiles across are we? */
-	offset = box.x1 * pixmap->drawable.bitsPerPixel / 8 / tile_width * tile_size;
 	channel->bo = kgem_create_proxy(bo,
 					box.y1 * bo->pitch + offset,
 					h * bo->pitch);
@@ -1638,53 +1657,63 @@ sna_render_composite_redirect(struct sna *sna,
 	return FALSE;
 #endif
 
-	DBG(("%s: target too large (%dx%d), copying to temporary %dx%d\n",
-	     __FUNCTION__, op->dst.width, op->dst.height, width,height));
+	DBG(("%s: target too large (%dx%d), copying to temporary %dx%d, max %d\n",
+	     __FUNCTION__,
+	     op->dst.width, op->dst.height,
+	     width, height,
+	     sna->render.max_3d_size));
 
 	if (!width || !height)
 		return FALSE;
 
-	if (width  > sna->render.max_3d_pitch ||
-	    height > sna->render.max_3d_pitch)
+	if (width  > sna->render.max_3d_size ||
+	    height > sna->render.max_3d_size)
 		return FALSE;
 
 	if (op->dst.bo->pitch <= sna->render.max_3d_pitch) {
-		int tile_width, tile_height, tile_size;
 		BoxRec box;
-		int w, h;
+		int w, h, offset;
 
 		DBG(("%s: dst pitch (%d) fits within render pipeline (%d)\n",
 		     __FUNCTION__, op->dst.bo->pitch, sna->render.max_3d_pitch));
 
-		kgem_get_tile_size(&sna->kgem, op->dst.bo->tiling,
-				   &tile_width, &tile_height, &tile_size);
-
 		box.x1 = x;
 		box.x2 = x + width;
 		box.y1 = y;
 		box.y2 = y + height;
 
 		/* Ensure we align to an even tile row */
-		box.y1 = box.y1 & ~(2*tile_height - 1);
-		box.y2 = ALIGN(box.y2, 2*tile_height);
+		if (op->dst.bo->tiling) {
+			int tile_width, tile_height, tile_size;
+
+			kgem_get_tile_size(&sna->kgem, op->dst.bo->tiling,
+					   &tile_width, &tile_height, &tile_size);
+
+			box.y1 = box.y1 & ~(2*tile_height - 1);
+			box.y2 = ALIGN(box.y2, 2*tile_height);
+
+			box.x1 = box.x1 & ~(tile_width * 8 / op->dst.pixmap->drawable.bitsPerPixel - 1);
+			box.x2 = ALIGN(box.x2, tile_width * 8 / op->dst.pixmap->drawable.bitsPerPixel);
+
+			offset = box.x1 * op->dst.pixmap->drawable.bitsPerPixel / 8 / tile_width * tile_size;
+		} else
+			offset = box.x1 * op->dst.pixmap->drawable.bitsPerPixel / 8;
+
 		if (box.y2 > op->dst.pixmap->drawable.height)
 			box.y2 = op->dst.pixmap->drawable.height;
 
-		box.x1 = box.x1 & ~(tile_width * 8 / op->dst.pixmap->drawable.bitsPerPixel - 1);
-		box.x2 = ALIGN(box.x2, tile_width * 8 / op->dst.pixmap->drawable.bitsPerPixel);
 		if (box.x2 > op->dst.pixmap->drawable.width)
 			box.x2 = op->dst.pixmap->drawable.width;
 
 		w = box.x2 - box.x1;
 		h = box.y2 - box.y1;
-		DBG(("%s box=(%d, %d), (%d, %d): (%d, %d)/(%d, %d)\n", __FUNCTION__,
+		DBG(("%s box=(%d, %d), (%d, %d): (%d, %d)/(%d, %d), max %d\n", __FUNCTION__,
 		     box.x1, box.y1, box.x2, box.y2, w, h,
 		     op->dst.pixmap->drawable.width,
-		     op->dst.pixmap->drawable.height));
+		     op->dst.pixmap->drawable.height,
+		     sna->render.max_3d_size));
 		if (w <= sna->render.max_3d_size &&
 		    h <= sna->render.max_3d_size) {
-			int offset;
-
 			t->box.x2 = t->box.x1 = op->dst.x;
 			t->box.y2 = t->box.y1 = op->dst.y;
 			t->real_bo = op->dst.bo;
@@ -1695,7 +1724,6 @@ sna_render_composite_redirect(struct sna *sna,
 			}
 
 			/* How many tiles across are we? */
-			offset = box.x1 * op->dst.pixmap->drawable.bitsPerPixel / 8 / tile_width * tile_size;
 			op->dst.bo = kgem_create_proxy(op->dst.bo,
 						       box.y1 * op->dst.bo->pitch + offset,
 						       h * op->dst.bo->pitch);
commit f544374e35861c9bfa69ce4ba2a9871ec3100721
Author: Zhigang Gong <zhigang.gong at linux.intel.com>
Date:   Tue Feb 7 09:40:54 2012 +0800

    uxa/glamor/dri: Fix a typo when fixing up the glamor pixmap.
    
    We should modify the old pixmap's header, not the new one, which
    was already destroyed.
    
    Signed-off-by: Zhigang Gong <zhigang.gong at linux.intel.com>
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/intel_dri.c b/src/intel_dri.c
index 2e43fcf..d2a02c9 100644
--- a/src/intel_dri.c
+++ b/src/intel_dri.c
@@ -179,7 +179,7 @@ static PixmapPtr fixup_glamor(DrawablePtr drawable, PixmapPtr pixmap)
 		xf86DrvMsg(scrn->scrnIndex, X_WARNING,
 			   "Failed to get DRI drawable for glamor pixmap.\n");
 
-	screen->ModifyPixmapHeader(pixmap,
+	screen->ModifyPixmapHeader(old,
 				   drawable->width,
 				   drawable->height,
 				   0, 0,
commit 3927cc58a7f88d829490c3d3a78645124b4e1f64
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon Feb 6 20:41:53 2012 +0000

    sna: Use the proper sna_picture_is_solid() test
    
    Rather than the specialised routines that assumed pDrawable was
    non-NULL, which was no longer true after f30be6f743.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
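
For context, a NULL-safe solid test has to consider drawable-less source
pictures (solid fills, gradients) before it can inspect pDrawable. A
minimal sketch of the shape of such a test, written against the server's
picturestr.h definitions rather than the driver's actual
sna_picture_is_solid():

    #include "picturestr.h"

    static Bool
    picture_is_solid(PicturePtr picture)
    {
            if (picture->pDrawable == NULL) {
                    /* drawable-less source pictures: only solid fills */
                    return picture->pSourcePict->type == SourcePictTypeSolidFill;
            }

            /* a repeating 1x1 pixmap behaves as a solid colour */
            return (picture->pDrawable->width == 1 &&
                    picture->pDrawable->height == 1 &&
                    picture->repeat);
    }

The driver's real helper also returns the solid colour through its second
argument; this sketch only answers the predicate.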

diff --git a/src/sna/gen2_render.c b/src/sna/gen2_render.c
index 606ecfe..1e0c680 100644
--- a/src/sna/gen2_render.c
+++ b/src/sna/gen2_render.c
@@ -1486,14 +1486,6 @@ try_blt(struct sna *sna,
 }
 
 static bool
-is_solid(PicturePtr picture)
-{
-	return  picture->pDrawable->width == 1 &&
-		picture->pDrawable->height == 1 &&
-		picture->repeat;
-}
-
-static bool
 is_unhandled_gradient(PicturePtr picture)
 {
 	if (picture->pDrawable)
@@ -1523,7 +1515,7 @@ need_upload(PicturePtr p)
 static bool
 source_fallback(PicturePtr p)
 {
-	if (is_solid(p))
+	if (sna_picture_is_solid(p, NULL))
 		return false;
 
 	return (has_alphamap(p) ||
diff --git a/src/sna/gen3_render.c b/src/sna/gen3_render.c
index de6a3ad..b9afdb9 100644
--- a/src/sna/gen3_render.c
+++ b/src/sna/gen3_render.c
@@ -2413,14 +2413,6 @@ static inline bool is_constant_ps(uint32_t type)
 }
 
 static bool
-is_solid(PicturePtr picture)
-{
-	return  picture->pDrawable->width == 1 &&
-		picture->pDrawable->height == 1 &&
-		picture->repeat;
-}
-
-static bool
 has_alphamap(PicturePtr p)
 {
 	return p->alphaMap != NULL;
@@ -2435,7 +2427,7 @@ need_upload(PicturePtr p)
 static bool
 source_fallback(PicturePtr p)
 {
-	if (is_solid(p))
+	if (sna_picture_is_solid(p, NULL))
 		return false;
 
 	return (has_alphamap(p) ||
diff --git a/src/sna/gen4_render.c b/src/sna/gen4_render.c
index 3cb0fbb..530251c 100644
--- a/src/sna/gen4_render.c
+++ b/src/sna/gen4_render.c
@@ -2001,14 +2001,6 @@ try_blt(struct sna *sna,
 }
 
 static bool
-is_solid(PicturePtr picture)
-{
-	return  picture->pDrawable->width == 1 &&
-		picture->pDrawable->height == 1 &&
-		picture->repeat;
-}
-
-static bool
 is_gradient(PicturePtr picture)
 {
 	if (picture->pDrawable)
@@ -2032,7 +2024,7 @@ need_upload(PicturePtr p)
 static bool
 source_fallback(PicturePtr p)
 {
-	if (is_solid(p))
+	if (sna_picture_is_solid(p, NULL))
 		return false;
 
 	return (has_alphamap(p) ||
diff --git a/src/sna/gen5_render.c b/src/sna/gen5_render.c
index f15342d..487d903 100644
--- a/src/sna/gen5_render.c
+++ b/src/sna/gen5_render.c
@@ -2029,14 +2029,6 @@ try_blt(struct sna *sna,
 }
 
 static bool
-is_solid(PicturePtr picture)
-{
-	return  picture->pDrawable->width == 1 &&
-		picture->pDrawable->height == 1 &&
-		picture->repeat;
-}
-
-static bool
 is_gradient(PicturePtr picture)
 {
 	if (picture->pDrawable)
@@ -2060,7 +2052,7 @@ need_upload(PicturePtr p)
 static bool
 source_fallback(PicturePtr p)
 {
-	if (is_solid(p))
+	if (sna_picture_is_solid(p, NULL))
 		return false;
 
 	return (has_alphamap(p) ||
diff --git a/src/sna/gen6_render.c b/src/sna/gen6_render.c
index 06c33c4..c4d2301 100644
--- a/src/sna/gen6_render.c
+++ b/src/sna/gen6_render.c
@@ -2270,12 +2270,6 @@ static bool can_switch_rings(struct sna *sna)
 	return sna->kgem.has_semaphores && !NO_RING_SWITCH;
 }
 
-static bool
-is_solid(PicturePtr picture)
-{
-	return sna_picture_is_solid(picture, NULL);
-}
-
 static Bool
 try_blt(struct sna *sna,
 	PicturePtr dst, PicturePtr src,
@@ -2308,7 +2302,7 @@ try_blt(struct sna *sna,
 	}
 
 	if (can_switch_rings(sna)) {
-		if (is_solid(src))
+		if (sna_picture_is_solid(src, NULL))
 			return TRUE;
 	}
 
@@ -2339,7 +2333,7 @@ need_upload(PicturePtr p)
 static bool
 source_fallback(PicturePtr p)
 {
-	if (is_solid(p))
+	if (sna_picture_is_solid(p, NULL))
 		return false;
 
 	return (has_alphamap(p) ||
diff --git a/src/sna/gen7_render.c b/src/sna/gen7_render.c
index 5a867b1..2e9948d 100644
--- a/src/sna/gen7_render.c
+++ b/src/sna/gen7_render.c
@@ -2314,14 +2314,6 @@ try_blt(struct sna *sna,
 }
 
 static bool
-is_solid(PicturePtr picture)
-{
-	return  picture->pDrawable->width == 1 &&
-		picture->pDrawable->height == 1 &&
-		picture->repeat;
-}
-
-static bool
 is_gradient(PicturePtr picture)
 {
 	if (picture->pDrawable)
@@ -2345,7 +2337,7 @@ need_upload(PicturePtr p)
 static bool
 source_fallback(PicturePtr p)
 {
-	if (is_solid(p))
+	if (sna_picture_is_solid(p, NULL))
 		return false;
 
 	return (has_alphamap(p) ||
commit 9a83333f3e65db1b9a11a72fd1ad7179e6626654
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon Feb 6 20:37:54 2012 +0000

    sna: Search all active buckets for a temporary allocation
    
    Reduce the need to create a new object when we only need the
    allocation for a single operation.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
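
The effect of the new flag on the cache search, as a standalone sketch;
the value of NUM_CACHE_BUCKETS and the helper name are illustrative, the
policy mirrors the hunk below:

    #define NUM_CACHE_BUCKETS 16
    #define CREATE_TEMPORARY 0x20

    static int active_search_retries(int bucket, unsigned flags)
    {
            int retry = NUM_CACHE_BUCKETS - bucket; /* every larger bucket */

            /* long-lived buffers must not be grossly over-allocated, so
             * cap the search; a temporary is recycled soon and may take
             * any bo that fits */
            if (retry > 3 && (flags & CREATE_TEMPORARY) == 0)
                    retry = 3;

            return retry;
    }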

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 259666e..e80eaae 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -2291,13 +2291,14 @@ struct kgem_bo *kgem_create_2d(struct kgem *kgem,
 	if (tiling < 0)
 		tiling = -tiling, flags |= CREATE_EXACT;
 
-	DBG(("%s(%dx%d, bpp=%d, tiling=%d, exact=%d, inactive=%d, cpu-mapping=%d, gtt-mapping=%d, scanout?=%d)\n", __FUNCTION__,
+	DBG(("%s(%dx%d, bpp=%d, tiling=%d, exact=%d, inactive=%d, cpu-mapping=%d, gtt-mapping=%d, scanout?=%d, temp?=%d)\n", __FUNCTION__,
 	     width, height, bpp, tiling,
 	     !!(flags & CREATE_EXACT),
 	     !!(flags & CREATE_INACTIVE),
 	     !!(flags & CREATE_CPU_MAP),
 	     !!(flags & CREATE_GTT_MAP),
-	     !!(flags & CREATE_SCANOUT)));
+	     !!(flags & CREATE_SCANOUT),
+	     !!(flags & CREATE_TEMPORARY)));
 
 	size = kgem_surface_size(kgem,
 				 kgem->has_relaxed_fencing,
@@ -2404,7 +2405,7 @@ struct kgem_bo *kgem_create_2d(struct kgem *kgem,
 
 	/* Best active match */
 	retry = NUM_CACHE_BUCKETS - bucket;
-	if (retry > 3)
+	if (retry > 3 && (flags & CREATE_TEMPORARY) == 0)
 		retry = 3;
 search_again:
 	assert(bucket < NUM_CACHE_BUCKETS);
diff --git a/src/sna/kgem.h b/src/sna/kgem.h
index b6930e0..f3a7b94 100644
--- a/src/sna/kgem.h
+++ b/src/sna/kgem.h
@@ -221,6 +221,7 @@ enum {
 	CREATE_CPU_MAP = 0x4,
 	CREATE_GTT_MAP = 0x8,
 	CREATE_SCANOUT = 0x10,
+	CREATE_TEMPORARY = 0x20,
 };
 struct kgem_bo *kgem_create_2d(struct kgem *kgem,
 			       int width,
diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index a8bfe40..2a4aaec 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -700,7 +700,7 @@ sna_pixmap_create_scratch(ScreenPtr screen,
 
 	priv->gpu_bo = kgem_create_2d(&sna->kgem,
 				      width, height, bpp, tiling,
-				      0);
+				      CREATE_TEMPORARY);
 	if (priv->gpu_bo == NULL) {
 		free(priv);
 		fbDestroyPixmap(pixmap);
@@ -5522,7 +5522,7 @@ sna_poly_zero_line_blt(DrawablePtr drawable,
 				       adx, ady, sdx, sdy,
 				       1, 1, octant);
 
-			DBG(("%s: adx=(%d, %d), sdx=(%d, %d), oc1=%d, oc2\n",
+			DBG(("%s: adx=(%d, %d), sdx=(%d, %d), oc1=%d, oc2=%d\n",
 			     __FUNCTION__, adx, ady, sdx, sdy, oc1, oc2));
 			if (adx == 0 || ady == 0) {
 				if (x1 <= x2) {
diff --git a/src/sna/sna_render.c b/src/sna/sna_render.c
index c2b9e79..3309157 100644
--- a/src/sna/sna_render.c
+++ b/src/sna/sna_render.c
@@ -1146,7 +1146,7 @@ sna_render_picture_extract(struct sna *sna,
 				    kgem_choose_tiling(&sna->kgem,
 						       I915_TILING_X, w, h,
 						       pixmap->drawable.bitsPerPixel),
-				    0);
+				    CREATE_TEMPORARY);
 		if (bo) {
 			PixmapRec tmp;
 
@@ -1725,7 +1725,7 @@ sna_render_composite_redirect(struct sna *sna,
 			    width, height, bpp,
 			    kgem_choose_tiling(&sna->kgem, I915_TILING_X,
 					       width, height, bpp),
-			    CREATE_SCANOUT);
+			    CREATE_SCANOUT | CREATE_TEMPORARY);
 	if (!bo)
 		return FALSE;
 
commit 62a331237bd9c521766c8c6eafc79a5d2cc7e487
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon Feb 6 18:08:19 2012 +0000

    sna: Use the clipped end-point for recomputing segment length after clipping
    
    References: https://bugs.freedesktop.org/show_bug.cgi?id=45673
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
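
The bug in miniature: once a segment has been clipped, the remaining run
must be measured against the clipped end-point, not the original one. A
self-contained illustration with hypothetical coordinates:

    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
            /* a run from y=10 towards y2=100, clipped at y2_clipped=40 */
            int y = 10, y2 = 100, y2_clipped = 40;

            printf("unclipped %d vs clipped %d\n",
                   abs(y2 - y), abs(y2_clipped - y)); /* 90 vs 30 */
            return 0;
    }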

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 8732ed4..a8bfe40 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -5476,9 +5476,6 @@ sna_poly_zero_line_blt(DrawablePtr drawable,
 		xstart = pt->x + drawable->x;
 		ystart = pt->y + drawable->y;
 
-		/* x2, y2, oc2 copied to x1, y1, oc1 at top of loop to simplify
-		 * iteration logic
-		 */
 		x2 = xstart;
 		y2 = ystart;
 		oc2 = 0;
@@ -5525,8 +5522,8 @@ sna_poly_zero_line_blt(DrawablePtr drawable,
 				       adx, ady, sdx, sdy,
 				       1, 1, octant);
 
-			DBG(("%s: adx=(%d, %d), sdx=(%d, %d)\n",
-			     __FUNCTION__, adx, ady, sdx, sdy));
+			DBG(("%s: adx=(%d, %d), sdx=(%d, %d), oc1=%d, oc2\n",
+			     __FUNCTION__, adx, ady, sdx, sdy, oc1, oc2));
 			if (adx == 0 || ady == 0) {
 				if (x1 <= x2) {
 					b->x1 = x1;
@@ -5658,7 +5655,7 @@ X_continue2:
 							   octant, bias, oc1, oc2) == -1)
 						continue;
 
-					length = abs(y2 - y);
+					length = abs(y2_clipped - y);
 
 					/* if we've clipped the endpoint, always draw the full length
 					 * of the segment, because then the capstyle doesn't matter
commit 800a92f4023e14b798d8c487399be25689100631
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon Feb 6 15:59:21 2012 +0000

    sna/gen2+: Exclude solids from being classed as requiring an upload
    
    We treat any pixmap that is not attached to either a CPU or GPU bo as
    requiring the pixel data to be uploaded to the GPU before we can
    composite. Normally this is true, except for the solid cache.
    
    References: https://bugs.freedesktop.org/show_bug.cgi?id=45672
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
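
Restated in miniature, the ordering is what matters: test for a solid
before the upload heuristic ever goes looking for a backing bo. A
condensed form of the gen2 hunk below, omitting the other format checks:

    /* solid-cache pictures have no CPU or GPU bo attached, yet need no
     * upload, so they must short-circuit the fallback test */
    static bool source_fallback(PicturePtr p)
    {
            if (is_solid(p))
                    return false;

            return has_alphamap(p) || need_upload(p);
    }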

diff --git a/src/sna/gen2_render.c b/src/sna/gen2_render.c
index 97b558d..606ecfe 100644
--- a/src/sna/gen2_render.c
+++ b/src/sna/gen2_render.c
@@ -1523,6 +1523,9 @@ need_upload(PicturePtr p)
 static bool
 source_fallback(PicturePtr p)
 {
+	if (is_solid(p))
+		return false;
+
 	return (has_alphamap(p) ||
 		is_unhandled_gradient(p) ||
 		!gen2_check_filter(p) ||
@@ -1572,7 +1575,7 @@ gen2_composite_fallback(struct sna *sna,
 		return FALSE;
 	}
 
-	if (src_pixmap && !is_solid(src) && !source_fallback(src)) {
+	if (src_pixmap && !source_fallback(src)) {
 		priv = sna_pixmap(src_pixmap);
 		if (priv && priv->gpu_damage && !priv->cpu_damage) {
 			DBG(("%s: src is already on the GPU, try to use GPU\n",
@@ -1580,7 +1583,7 @@ gen2_composite_fallback(struct sna *sna,
 			return FALSE;
 		}
 	}
-	if (mask_pixmap && !is_solid(mask) && !source_fallback(mask)) {
+	if (mask_pixmap && !source_fallback(mask)) {
 		priv = sna_pixmap(mask_pixmap);
 		if (priv && priv->gpu_damage && !priv->cpu_damage) {
 			DBG(("%s: mask is already on the GPU, try to use GPU\n",
diff --git a/src/sna/gen3_render.c b/src/sna/gen3_render.c
index d5f5617..de6a3ad 100644
--- a/src/sna/gen3_render.c
+++ b/src/sna/gen3_render.c
@@ -2435,6 +2435,9 @@ need_upload(PicturePtr p)
 static bool
 source_fallback(PicturePtr p)
 {
+	if (is_solid(p))
+		return false;
+
 	return (has_alphamap(p) ||
 		!gen3_check_xformat(p) ||
 		!gen3_check_filter(p) ||
@@ -2494,7 +2497,7 @@ gen3_composite_fallback(struct sna *sna,
 		return FALSE;
 	}
 
-	if (src_pixmap && !is_solid(src) && !source_fallback(src)) {
+	if (src_pixmap && !source_fallback(src)) {
 		priv = sna_pixmap(src_pixmap);
 		if (priv &&
 		    ((priv->gpu_damage && !priv->cpu_damage) ||
@@ -2504,7 +2507,7 @@ gen3_composite_fallback(struct sna *sna,
 			return FALSE;
 		}
 	}
-	if (mask_pixmap && !is_solid(mask) && !source_fallback(mask)) {
+	if (mask_pixmap && !source_fallback(mask)) {
 		priv = sna_pixmap(mask_pixmap);
 		if (priv &&
 		    ((priv->gpu_damage && !priv->cpu_damage) ||
diff --git a/src/sna/gen4_render.c b/src/sna/gen4_render.c
index b3a64d9..3cb0fbb 100644
--- a/src/sna/gen4_render.c
+++ b/src/sna/gen4_render.c
@@ -2032,6 +2032,9 @@ need_upload(PicturePtr p)
 static bool
 source_fallback(PicturePtr p)
 {
+	if (is_solid(p))
+		return false;
+
 	return (has_alphamap(p) ||
 		is_gradient(p) ||
 		!gen4_check_filter(p) ||
@@ -2082,7 +2085,7 @@ gen4_composite_fallback(struct sna *sna,
 		return FALSE;
 	}
 
-	if (src_pixmap && !is_solid(src) && !source_fallback(src)) {
+	if (src_pixmap && !source_fallback(src)) {
 		priv = sna_pixmap(src_pixmap);
 		if (priv && priv->gpu_damage && !priv->cpu_damage) {
 			DBG(("%s: src is already on the GPU, try to use GPU\n",
@@ -2090,7 +2093,7 @@ gen4_composite_fallback(struct sna *sna,
 			return FALSE;
 		}
 	}
-	if (mask_pixmap && !is_solid(mask) && !source_fallback(mask)) {
+	if (mask_pixmap && !source_fallback(mask)) {
 		priv = sna_pixmap(mask_pixmap);
 		if (priv && priv->gpu_damage && !priv->cpu_damage) {
 			DBG(("%s: mask is already on the GPU, try to use GPU\n",
diff --git a/src/sna/gen5_render.c b/src/sna/gen5_render.c
index 933c51f..f15342d 100644
--- a/src/sna/gen5_render.c
+++ b/src/sna/gen5_render.c
@@ -2060,6 +2060,9 @@ need_upload(PicturePtr p)
 static bool
 source_fallback(PicturePtr p)
 {
+	if (is_solid(p))
+		return false;
+
 	return (has_alphamap(p) ||
 		is_gradient(p) ||
 		!gen5_check_filter(p) ||
@@ -2110,7 +2113,7 @@ gen5_composite_fallback(struct sna *sna,
 		return FALSE;
 	}
 
-	if (src_pixmap && !is_solid(src) && !source_fallback(src)) {
+	if (src_pixmap && !source_fallback(src)) {
 		priv = sna_pixmap(src_pixmap);
 		if (priv && priv->gpu_damage && !priv->cpu_damage) {
 			DBG(("%s: src is already on the GPU, try to use GPU\n",
@@ -2118,7 +2121,7 @@ gen5_composite_fallback(struct sna *sna,
 			return FALSE;
 		}
 	}
-	if (mask_pixmap && !is_solid(mask) && !source_fallback(mask)) {
+	if (mask_pixmap && !source_fallback(mask)) {
 		priv = sna_pixmap(mask_pixmap);
 		if (priv && priv->gpu_damage && !priv->cpu_damage) {
 			DBG(("%s: mask is already on the GPU, try to use GPU\n",
diff --git a/src/sna/gen6_render.c b/src/sna/gen6_render.c
index d73fda8..06c33c4 100644
--- a/src/sna/gen6_render.c
+++ b/src/sna/gen6_render.c
@@ -2339,6 +2339,9 @@ need_upload(PicturePtr p)
 static bool
 source_fallback(PicturePtr p)
 {
+	if (is_solid(p))
+		return false;
+
 	return (has_alphamap(p) ||
 		is_gradient(p) ||
 		!gen6_check_filter(p) ||
@@ -2391,7 +2394,7 @@ gen6_composite_fallback(struct sna *sna,
 		return FALSE;
 	}
 
-	if (src_pixmap && !is_solid(src) && !source_fallback(src)) {
+	if (src_pixmap && !source_fallback(src)) {
 		priv = sna_pixmap(src_pixmap);
 		if (priv && priv->gpu_damage && !priv->cpu_damage) {
 			DBG(("%s: src is already on the GPU, try to use GPU\n",
@@ -2399,7 +2402,7 @@ gen6_composite_fallback(struct sna *sna,
 			return FALSE;
 		}
 	}
-	if (mask_pixmap && !is_solid(mask) && !source_fallback(mask)) {
+	if (mask_pixmap && !source_fallback(mask)) {
 		priv = sna_pixmap(mask_pixmap);
 		if (priv && priv->gpu_damage && !priv->cpu_damage) {
 			DBG(("%s: mask is already on the GPU, try to use GPU\n",
diff --git a/src/sna/gen7_render.c b/src/sna/gen7_render.c
index 5385a47..5a867b1 100644
--- a/src/sna/gen7_render.c
+++ b/src/sna/gen7_render.c
@@ -2345,6 +2345,9 @@ need_upload(PicturePtr p)
 static bool
 source_fallback(PicturePtr p)
 {
+	if (is_solid(p))
+		return false;
+
 	return (has_alphamap(p) ||
 		is_gradient(p) ||
 		!gen7_check_filter(p) ||
@@ -2397,7 +2400,7 @@ gen7_composite_fallback(struct sna *sna,
 		return FALSE;
 	}
 
-	if (src_pixmap && !is_solid(src) && !source_fallback(src)) {
+	if (src_pixmap && !source_fallback(src)) {
 		priv = sna_pixmap(src_pixmap);
 		if (priv && priv->gpu_damage && !priv->cpu_damage) {
 			DBG(("%s: src is already on the GPU, try to use GPU\n",
@@ -2405,7 +2408,7 @@ gen7_composite_fallback(struct sna *sna,
 			return FALSE;
 		}
 	}
-	if (mask_pixmap && !is_solid(mask) && !source_fallback(mask)) {
+	if (mask_pixmap && !source_fallback(mask)) {
 		priv = sna_pixmap(mask_pixmap);
 		if (priv && priv->gpu_damage && !priv->cpu_damage) {
 			DBG(("%s: mask is already on the GPU, try to use GPU\n",
commit 84bca5babb12f3d169352712c67fa5814d2563df
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon Feb 6 09:50:03 2012 +0000

    sna: If we have a CPU bo, do not assert we have shadow pixels
    
    When transferring damage to the GPU, on SNB it is not necessarily true
    that we have a shadow pixmap; we may instead have drawn onto an unmapped
    CPU bo and now simply need to copy from that bo onto the GPU. Move the
    assertion onto the path where it truly matters.
    
    Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=45672
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index b6f6772..8732ed4 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -2143,7 +2143,6 @@ sna_pixmap_move_to_gpu(PixmapPtr pixmap, unsigned flags)
 		pixmap->devKind = priv->stride;
 		priv->mapped = false;
 	}
-	assert(pixmap->devPrivate.ptr != NULL);
 
 	n = sna_damage_get_boxes(priv->cpu_damage, &box);
 	if (n) {
@@ -2158,6 +2157,7 @@ sna_pixmap_move_to_gpu(PixmapPtr pixmap, unsigned flags)
 						    pixmap, priv->gpu_bo, 0, 0,
 						    box, n);
 		if (!ok) {
+			assert(pixmap->devPrivate.ptr != NULL);
 			if (n == 1 && !priv->pinned &&
 			    (box->x2 - box->x1) >= pixmap->drawable.width &&
 			    (box->y2 - box->y1) >= pixmap->drawable.height) {
commit a6bb7e7041e4b67f7be72673fa660275c4d16842
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon Feb 6 09:19:56 2012 +0000

    sna: Disable use of xvmc for SNB+
    
    Not yet implemented, so don't bother setting it up only to fail.
    
    References: https://bugs.freedesktop.org/show_bug.cgi?id=44874
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_video_hwmc.c b/src/sna/sna_video_hwmc.c
index 1f36096..2baa939 100644
--- a/src/sna/sna_video_hwmc.c
+++ b/src/sna/sna_video_hwmc.c
@@ -199,6 +199,10 @@ Bool sna_video_xvmc_setup(struct sna *sna,
 	if (sna->kgem.gen < 31)
 		return FALSE;
 
+	/* Not implemented */
+	if (sna->kgem.gen >= 60)
+		return FALSE;
+
 	pAdapt = calloc(1, sizeof(XF86MCAdaptorRec));
 	if (!pAdapt)
 		return FALSE;
commit 5fd1c3675a50c9cfacf359ea7e7b60ace497d7a0
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sat Feb 4 20:13:07 2012 +0000

    sna: Discard the redundant clear of the unbounded area if already clear
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_trapezoids.c b/src/sna/sna_trapezoids.c
index c2a4190..28c8a67 100644
--- a/src/sna/sna_trapezoids.c
+++ b/src/sna/sna_trapezoids.c
@@ -2658,6 +2658,13 @@ choose_span(PicturePtr dst,
 }
 
 static bool
+sna_drawable_is_clear(DrawablePtr d)
+{
+	struct sna_pixmap *priv = sna_pixmap(get_drawable_pixmap(d));
+	return priv && priv->clear && priv->clear_color == 0;
+}
+
+static bool
 mono_trapezoids_span_converter(CARD8 op, PicturePtr src, PicturePtr dst,
 			       INT16 src_x, INT16 src_y,
 			       int ntrap, xTrapezoid *traps)
@@ -2666,6 +2673,7 @@ mono_trapezoids_span_converter(CARD8 op, PicturePtr src, PicturePtr dst,
 	BoxRec extents;
 	int16_t dst_x, dst_y;
 	int16_t dx, dy;
+	bool was_clear;
 	int n;
 
 	if (NO_SCAN_CONVERTER)
@@ -2709,6 +2717,8 @@ mono_trapezoids_span_converter(CARD8 op, PicturePtr src, PicturePtr dst,
 	if (!mono_init(&mono, 2*ntrap))
 		return false;
 
+	was_clear = sna_drawable_is_clear(dst->pDrawable);
+
 	for (n = 0; n < ntrap; n++) {
 		if (!xTrapezoidValid(&traps[n]))
 			continue;
@@ -2741,7 +2751,7 @@ mono_trapezoids_span_converter(CARD8 op, PicturePtr src, PicturePtr dst,
 	mono.op.done(mono.sna, &mono.op);
 	mono_fini(&mono);
 
-	if (!operator_is_bounded(op)) {
+	if (!was_clear && !operator_is_bounded(op)) {
 		xPointFixed p1, p2;
 
 		if (!mono_init(&mono, 2+2*ntrap))
@@ -4223,6 +4233,7 @@ mono_triangles_span_converter(CARD8 op, PicturePtr src, PicturePtr dst,
 	BoxRec extents;
 	int16_t dst_x, dst_y;
 	int16_t dx, dy;
+	bool was_clear;
 	int n;
 
 	mono.sna = to_sna_from_drawable(dst->pDrawable);
@@ -4261,6 +4272,8 @@ mono_triangles_span_converter(CARD8 op, PicturePtr src, PicturePtr dst,
 	     src_x + mono.clip.extents.x1 - dst_x - dx,
 	     src_y + mono.clip.extents.y1 - dst_y - dy));
 
+	was_clear = sna_drawable_is_clear(dst->pDrawable);
+
 	if (mono_init(&mono, 3*count))
 		return false;
 
@@ -4289,7 +4302,7 @@ mono_triangles_span_converter(CARD8 op, PicturePtr src, PicturePtr dst,
 		mono.op.done(mono.sna, &mono.op);
 	}
 
-	if (!operator_is_bounded(op)) {
+	if (!was_clear && !operator_is_bounded(op)) {
 		xPointFixed p1, p2;
 
 		if (!mono_init(&mono, 2+3*count))
commit 634b75d74adb3ae0eec3a4f2697ac54bbf0ea5b1
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sat Feb 4 20:07:49 2012 +0000

    sna: Always pass the clear colour for PictOpClear
    
    Having made that optimisation for Composite, and then assumed that it
    always holds in the backends, we failed to clear the unbounded area
    outside of a trapezoid: we passed in the original colour, so the
    operation was optimised away as a continuation.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna.h b/src/sna/sna.h
index 9df8cfd..73e2490 100644
--- a/src/sna/sna.h
+++ b/src/sna/sna.h
@@ -274,6 +274,7 @@ struct sna {
 	void *WakeupData;
 	CloseScreenProcPtr CloseScreen;
 
+	PicturePtr clear;
 	struct {
 		uint32_t fill_bo;
 		uint32_t fill_pixel;
diff --git a/src/sna/sna_composite.c b/src/sna/sna_composite.c
index 8eb0c94..5b81596 100644
--- a/src/sna/sna_composite.c
+++ b/src/sna/sna_composite.c
@@ -43,20 +43,18 @@
 
 #define BOUND(v)	(INT16) ((v) < MINSHORT ? MINSHORT : (v) > MAXSHORT ? MAXSHORT : (v))
 
-static PicturePtr clear;
-
 Bool sna_composite_create(struct sna *sna)
 {
 	xRenderColor color ={ 0 };
 	int error;
 
-	clear = CreateSolidPicture(0, &color, &error);
-	return clear != NULL;
+	sna->clear = CreateSolidPicture(0, &color, &error);
+	return sna->clear != NULL;
 }
 
 void sna_composite_close(struct sna *sna)
 {
-	FreePicture(clear, 0);
+	FreePicture(sna->clear, 0);
 }
 
 static inline bool
@@ -436,7 +434,7 @@ sna_composite(CARD8 op,
 	if (op == PictOpClear) {
 		DBG(("%s: discarding source and mask for clear\n", __FUNCTION__));
 		mask = NULL;
-		src = clear;
+		src = sna->clear;
 	}
 
 	if (mask && sna_composite_mask_is_opaque(mask)) {
diff --git a/src/sna/sna_trapezoids.c b/src/sna/sna_trapezoids.c
index eb6c968..c2a4190 100644
--- a/src/sna/sna_trapezoids.c
+++ b/src/sna/sna_trapezoids.c
@@ -2776,7 +2776,7 @@ mono_trapezoids_span_converter(CARD8 op, PicturePtr src, PicturePtr dst,
 		memset(&mono.op, 0, sizeof(mono.op));
 		if (mono.sna->render.composite(mono.sna,
 					       PictOpClear,
-					       src, NULL, dst,
+					       mono.sna->clear, NULL, dst,
 					       0, 0,
 					       0, 0,
 					       mono.clip.extents.x1,  mono.clip.extents.y1,
@@ -4321,7 +4321,7 @@ mono_triangles_span_converter(CARD8 op, PicturePtr src, PicturePtr dst,
 		memset(&mono.op, 0, sizeof(mono.op));
 		if (mono.sna->render.composite(mono.sna,
 					       PictOpClear,
-					       src, NULL, dst,
+					       mono.sna->clear, NULL, dst,
 					       0, 0,
 					       0, 0,
 					       mono.clip.extents.x1,  mono.clip.extents.y1,
commit 9bf874d8d2c7ec25099fa6bf851efb2901d8a4da
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sat Feb 4 16:33:34 2012 +0000

    sna/gen6: Reduce PictOpClear to PictOpSrc (with blending disabled)
    
    The advantage of PictOpSrc is that it writes its results directly to
    memory bypassing the blend unit.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
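
Why the substitution is safe, per the Porter-Duff definitions: Clear
writes zero regardless of either operand, while Src writes the source
regardless of the destination, so a transparent-black source under Src
reproduces Clear exactly. A toy per-pixel model:

    #include <stdint.h>

    /* Clear ignores both operands; Src ignores only dst */
    static uint32_t op_clear(uint32_t src, uint32_t dst)
    { (void)src; (void)dst; return 0; }

    static uint32_t op_src(uint32_t src, uint32_t dst)
    { (void)dst; return src; }

    /* op_src(0, dst) == op_clear(src, dst) for all src and dst, but
     * the Src form never reads dst and needs no blend unit */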

diff --git a/src/sna/gen6_render.c b/src/sna/gen6_render.c
index 08f9668..d73fda8 100644
--- a/src/sna/gen6_render.c
+++ b/src/sna/gen6_render.c
@@ -579,17 +579,6 @@ gen6_emit_cc(struct sna *sna,
 	if (render->blend == blend)
 		return op <= PictOpSrc;
 
-	if (op == PictOpClear) {
-		uint32_t src;
-
-		/* We can emulate a clear using src, which is beneficial if
-		 * the blend unit is already disabled.
-		 */
-		src = BLEND_OFFSET(GEN6_BLENDFACTOR_ONE, GEN6_BLENDFACTOR_ZERO);
-		if (render->blend == src)
-			return true;
-	}
-
 	OUT_BATCH(GEN6_3DSTATE_CC_STATE_POINTERS | (4 - 2));
 	OUT_BATCH((render->cc_blend + blend) | 1);
 	if (render->blend == (unsigned)-1) {
@@ -716,8 +705,8 @@ gen6_emit_drawing_rectangle(struct sna *sna,
 	assert(!too_large(op->dst.x, op->dst.y));
 	assert(!too_large(op->dst.width, op->dst.height));
 
-	if  (sna->render_state.gen6.drawrect_limit  == limit &&
-	     sna->render_state.gen6.drawrect_offset == offset)
+	if (sna->render_state.gen6.drawrect_limit  == limit &&
+	    sna->render_state.gen6.drawrect_offset == offset)
 		return false;
 
 	/* [DevSNB-C+{W/A}] Before any depth stall flush (including those
@@ -932,6 +921,8 @@ static int gen6_vertex_finish(struct sna *sna)
 	struct kgem_bo *bo;
 	unsigned int i;
 
+	DBG(("%s: used=%d / %d\n", __FUNCTION__,
+	     sna->render.vertex_used, sna->render.vertex_size));
 	assert(sna->render.vertex_used);
 
 	/* Note: we only need dword alignment (currently) */
@@ -978,6 +969,10 @@ static int gen6_vertex_finish(struct sna *sna)
 
 	kgem_bo_sync__cpu(&sna->kgem, sna->render.vbo);
 	if (sna->render.vertex_used) {
+		DBG(("%s: copying initial buffer x %d to handle=%d\n",
+		     __FUNCTION__,
+		     sna->render.vertex_used,
+		     sna->render.vbo->handle));
 		memcpy(sna->render.vertices,
 		       sna->render.vertex_data,
 		       sizeof(float)*sna->render.vertex_used);
@@ -1003,6 +998,8 @@ static void gen6_vertex_close(struct sna *sna)
 
 	bo = sna->render.vbo;
 	if (bo == NULL) {
+		assert(sna->render.vertices == sna->render.vertex_data);
+		assert(sna->render.vertex_used < ARRAY_SIZE(sna->render.vertex_data));
 		if (sna->kgem.nbatch + sna->render.vertex_used <= sna->kgem.surface) {
 			DBG(("%s: copy to batch: %d @ %d\n", __FUNCTION__,
 			     sna->render.vertex_used, sna->kgem.nbatch));
@@ -1230,24 +1227,24 @@ gen6_emit_composite_primitive_solid(struct sna *sna,
 		float f;
 	} dst;
 
+	DBG(("%s: [%d+9] = (%d, %d)x(%d, %d)\n", __FUNCTION__,
+	     sna->render.vertex_used, r->dst.x, r->dst.y, r->width, r->height));
+
 	v = sna->render.vertices + sna->render.vertex_used;
 	sna->render.vertex_used += 9;
+	assert(sna->render.vertex_used <= sna->render.vertex_size);
+	assert(!too_large(r->dst.x + r->width, r->dst.y + r->height));
 
 	dst.p.x = r->dst.x + r->width;
 	dst.p.y = r->dst.y + r->height;
 	v[0] = dst.f;
-	v[1] = 1.;
-	v[2] = 1.;
-
 	dst.p.x = r->dst.x;
 	v[3] = dst.f;
-	v[4] = 0.;
-	v[5] = 1.;
-
 	dst.p.y = r->dst.y;
 	v[6] = dst.f;
-	v[7] = 0.;
-	v[8] = 0.;
+
+	v[5] = v[2] = v[1] = 1.;
+	v[8] = v[7] = v[4] = 0.;
 }
 
 fastcall static void
@@ -1280,6 +1277,45 @@ gen6_emit_composite_primitive_identity_source(struct sna *sna,
 }
 
 fastcall static void
+gen6_emit_composite_primitive_simple_source(struct sna *sna,
+					    const struct sna_composite_op *op,
+					    const struct sna_composite_rectangles *r)
+{
+	float *v;
+	union {
+		struct sna_coordinate p;
+		float f;
+	} dst;
+
+	float xx = op->src.transform->matrix[0][0];
+	float x0 = op->src.transform->matrix[0][2];
+	float yy = op->src.transform->matrix[1][1];
+	float y0 = op->src.transform->matrix[1][2];
+	float sx = op->src.scale[0];
+	float sy = op->src.scale[1];
+	int16_t tx = op->src.offset[0];
+	int16_t ty = op->src.offset[1];
+
+	v = sna->render.vertices + sna->render.vertex_used;
+	sna->render.vertex_used += 3*3;
+
+	dst.p.x = r->dst.x + r->width;
+	dst.p.y = r->dst.y + r->height;
+	v[0] = dst.f;
+	v[1] = ((r->src.x + r->width + tx) * xx + x0) * sx;
+	v[5] = v[2] = ((r->src.y + r->height + ty) * yy + y0) * sy;
+
+	dst.p.x = r->dst.x;
+	v[3] = dst.f;
+	v[7] = v[4] = ((r->src.x + tx) * xx + x0) * sx;
+
+	dst.p.y = r->dst.y;
+	v[6] = dst.f;
+	v[8] = ((r->src.y + ty) * yy + y0) * sy;
+}
+
+
+fastcall static void
 gen6_emit_composite_primitive_affine_source(struct sna *sna,
 					    const struct sna_composite_op *op,
 					    const struct sna_composite_rectangles *r)
@@ -1525,6 +1561,10 @@ static void gen6_emit_vertex_buffer(struct sna *sna,
 static void gen6_emit_primitive(struct sna *sna)
 {
 	if (sna->kgem.nbatch == sna->render_state.gen6.last_primitive) {
+		DBG(("%s: continuing previous primitive, start=%d, index=%d\n",
+		     __FUNCTION__,
+		     sna->render.vertex_start,
+		     sna->render.vertex_index));
 		sna->render_state.gen6.vertex_offset = sna->kgem.nbatch - 5;
 		return;
 	}
@@ -1541,6 +1581,8 @@ static void gen6_emit_primitive(struct sna *sna)
 	OUT_BATCH(0);	/* start instance location */
 	OUT_BATCH(0);	/* index buffer offset, ignored */
 	sna->render.vertex_start = sna->render.vertex_index;
+	DBG(("%s: started new primitive: index=%d\n",
+	     __FUNCTION__, sna->render.vertex_start));
 
 	sna->render_state.gen6.last_primitive = sna->kgem.nbatch;
 }
@@ -1603,6 +1645,7 @@ inline static int gen6_get_rectangles(struct sna *sna,
 	if (want > 1 && want * op->floats_per_rect > rem)
 		want = rem / op->floats_per_rect;
 
+	assert(want > 0);
 	sna->render.vertex_index += 3*want;
 	return want;
 }
@@ -2156,6 +2199,8 @@ static void gen6_composite_channel_convert(struct sna_composite_channel *channel
 static void gen6_render_composite_done(struct sna *sna,
 				       const struct sna_composite_op *op)
 {
+	DBG(("%s\n", __FUNCTION__));
+
 	if (sna->render_state.gen6.vertex_offset) {
 		gen6_vertex_flush(sna);
 		gen6_magic_ca_pass(sna, op);
@@ -2488,6 +2533,8 @@ gen6_render_composite(struct sna *sna,
 					    width, height,
 					    tmp);
 
+	if (op == PictOpClear)
+		op = PictOpSrc;
 	tmp->op = op;
 	if (!gen6_composite_set_target(sna, tmp, dst))
 		return FALSE;
@@ -2585,12 +2632,28 @@ gen6_render_composite(struct sna *sna,
 
 		tmp->floats_per_vertex = 5 + 2 * !tmp->is_affine;
 	} else {
-		if (tmp->src.is_solid)
+		if (tmp->src.is_solid) {
+			DBG(("%s: choosing gen6_emit_composite_primitive_solid\n",
+			     __FUNCTION__));
 			tmp->prim_emit = gen6_emit_composite_primitive_solid;
-		else if (tmp->src.transform == NULL)
+		} else if (tmp->src.transform == NULL) {
+			DBG(("%s: choosing gen6_emit_composite_primitive_identity_source\n",
+			     __FUNCTION__));
 			tmp->prim_emit = gen6_emit_composite_primitive_identity_source;
-		else if (tmp->src.is_affine)
-			tmp->prim_emit = gen6_emit_composite_primitive_affine_source;
+		} else if (tmp->src.is_affine) {
+			if (tmp->src.transform->matrix[0][1] == 0 &&
+			    tmp->src.transform->matrix[1][0] == 0) {
+				tmp->src.scale[0] /= tmp->src.transform->matrix[2][2];
+				tmp->src.scale[1] /= tmp->src.transform->matrix[2][2];
+				DBG(("%s: choosing gen6_emit_composite_primitive_simple_source\n",
+				     __FUNCTION__));
+				tmp->prim_emit = gen6_emit_composite_primitive_simple_source;
+			} else {
+				DBG(("%s: choosing gen6_emit_composite_primitive_affine_source\n",
+				     __FUNCTION__));
+				tmp->prim_emit = gen6_emit_composite_primitive_affine_source;
+			}
+		}
 
 		tmp->floats_per_vertex = 3 + !tmp->is_affine;
 	}
@@ -2923,11 +2986,11 @@ fastcall static void
 gen6_render_composite_spans_done(struct sna *sna,
 				 const struct sna_composite_spans_op *op)
 {
+	DBG(("%s()\n", __FUNCTION__));
+
 	if (sna->render_state.gen6.vertex_offset)
 		gen6_vertex_flush(sna);
 
-	DBG(("%s()\n", __FUNCTION__));
-
 	if (op->base.src.bo)
 		kgem_bo_destroy(&sna->kgem, op->base.src.bo);
 
@@ -3193,8 +3256,7 @@ fallback_blt:
 	if (!gen6_check_format(tmp.src.pict_format))
 		goto fallback_blt;
 
-	tmp.op = alu == GXcopy ? PictOpSrc : PictOpClear;
-
+	tmp.op = PictOpSrc;
 	tmp.dst.pixmap = dst;
 	tmp.dst.width  = dst->drawable.width;
 	tmp.dst.height = dst->drawable.height;
@@ -3373,6 +3435,8 @@ gen6_render_copy_blt(struct sna *sna,
 static void
 gen6_render_copy_done(struct sna *sna, const struct sna_copy_op *op)
 {
+	DBG(("%s()\n", __FUNCTION__));
+
 	if (sna->render_state.gen6.vertex_offset)
 		gen6_vertex_flush(sna);
 }
@@ -3428,7 +3492,7 @@ fallback:
 	if (!gen6_check_format(op->base.src.pict_format))
 		goto fallback;
 
-	op->base.op = alu == GXcopy ? PictOpSrc : PictOpClear;
+	op->base.op = PictOpSrc;
 
 	op->base.dst.pixmap = dst;
 	op->base.dst.width  = dst->drawable.width;
@@ -3574,9 +3638,10 @@ gen6_render_fill_boxes(struct sna *sna,
 	return FALSE;
 #endif
 
-	if (op == PictOpClear)
+	if (op == PictOpClear) {
 		pixel = 0;
-	else if (!sna_get_pixel_from_rgba(&pixel,
+		op = PictOpSrc;
+	} else if (!sna_get_pixel_from_rgba(&pixel,
 				     color->red,
 				     color->green,
 				     color->blue,
@@ -3744,6 +3809,8 @@ gen6_render_op_fill_boxes(struct sna *sna,
 static void
 gen6_render_op_fill_done(struct sna *sna, const struct sna_fill_op *op)
 {
+	DBG(("%s()\n", __FUNCTION__));
+
 	if (sna->render_state.gen6.vertex_offset)
 		gen6_vertex_flush(sna);
 	kgem_bo_destroy(&sna->kgem, op->base.src.bo);
@@ -3781,7 +3848,7 @@ gen6_render_fill(struct sna *sna, uint8_t alu,
 	if (alu == GXclear)
 		color = 0;
 
-	op->base.op = color == 0 ? PictOpClear : PictOpSrc;
+	op->base.op = PictOpSrc;
 
 	op->base.dst.pixmap = dst;
 	op->base.dst.width  = dst->drawable.width;
@@ -3874,7 +3941,7 @@ gen6_render_fill_one(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo,
 	if (alu == GXclear)
 		color = 0;
 
-	tmp.op = color == 0 ? PictOpClear : PictOpSrc;
+	tmp.op = PictOpSrc;
 
 	tmp.dst.pixmap = dst;
 	tmp.dst.width  = dst->drawable.width;
@@ -3976,7 +4043,7 @@ gen6_render_clear(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo)
 	if (too_large(dst->drawable.width, dst->drawable.height))
 		return gen6_render_clear_try_blt(sna, dst, bo);
 
-	tmp.op = PictOpClear;
+	tmp.op = PictOpSrc;
 
 	tmp.dst.pixmap = dst;
 	tmp.dst.width  = dst->drawable.width;
diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 757bad1..259666e 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -2917,7 +2917,7 @@ void *kgem_bo_map(struct kgem *kgem, struct kgem_bo *bo)
 void *kgem_bo_map__debug(struct kgem *kgem, struct kgem_bo *bo)
 {
 	if (bo->map)
-		return bo->map;
+		return CPU_MAP(bo->map);
 
 	kgem_trim_vma_cache(kgem, MAP_GTT, bucket(bo));
 	return bo->map = gem_mmap(kgem->fd, bo->handle, bytes(bo),
diff --git a/src/sna/kgem_debug_gen6.c b/src/sna/kgem_debug_gen6.c
index f748da6..72ed299 100644
--- a/src/sna/kgem_debug_gen6.c
+++ b/src/sna/kgem_debug_gen6.c
@@ -42,7 +42,6 @@
 static struct state {
 	struct vertex_buffer {
 		int handle;
-		void *base;
 		const char *ptr;
 		int pitch;
 
@@ -67,7 +66,7 @@ static void gen6_update_vertex_buffer(struct kgem *kgem, const uint32_t *data)
 {
 	uint32_t reloc = sizeof(uint32_t) * (&data[1] - kgem->batch);
 	struct kgem_bo *bo = NULL;
-	void *base, *ptr;
+	void *base;
 	int i;
 
 	for (i = 0; i < kgem->nreloc; i++)
@@ -85,13 +84,12 @@ static void gen6_update_vertex_buffer(struct kgem *kgem, const uint32_t *data)
 		assert(&bo->request != &kgem->next_request->buffers);
 		base = kgem_bo_map__debug(kgem, bo);
 	}
-	ptr = (char *)base + kgem->reloc[i].delta;
 
+	base = (char *)base + kgem->reloc[i].delta;
 	i = data[0] >> 26;
 
 	state.vb[i].current = bo;
-	state.vb[i].base = base;
-	state.vb[i].ptr = ptr;
+	state.vb[i].ptr = base;
 	state.vb[i].pitch = data[0] & 0x7ff;
 }
 
@@ -268,10 +266,8 @@ static void indirect_vertex_out(struct kgem *kgem, uint32_t v)
 		const struct vertex_buffer *vb = &state.vb[ve->buffer];
 		const void *ptr = vb->ptr + v * vb->pitch + ve->offset;
 
-		if (!ve->valid)
-			continue;
-
-		ve_out(ve, ptr);
+		if (ve->valid)
+			ve_out(ve, ptr);
 
 		while (++i <= state.num_ve && !state.ve[i].valid)
 			;
diff --git a/src/sna/sna_composite.c b/src/sna/sna_composite.c
index 0faad84..8eb0c94 100644
--- a/src/sna/sna_composite.c
+++ b/src/sna/sna_composite.c
@@ -792,6 +792,21 @@ sna_composite_rectangles(CARD8		 op,
 			}
 		} else
 			sna_damage_add(&priv->gpu_damage, &region);
+	} else if (op <= PictOpSrc &&
+		   region.data == NULL &&
+		   region.extents.x2 - region.extents.x1 == pixmap->drawable.width &&
+		   region.extents.y2 - region.extents.y1 == pixmap->drawable.height) {
+		priv->clear = true;
+		priv->clear_color = 0;
+		if (op == PictOpSrc)
+			sna_get_pixel_from_rgba(&priv->clear_color,
+						color->red,
+						color->green,
+						color->blue,
+						color->alpha,
+						dst->format);
+		DBG(("%s: marking clear [%08x]\n",
+		     __FUNCTION__, priv->clear_color));
 	}
 
 	goto done;
commit da14f031d875b41aebed04632eca564731378695
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sat Feb 4 12:06:22 2012 +0000

    sna: Check if the damage reduces to all before performing the migration
    
    An assert exposed a situation where we had accumulated an unreduced
    damage-all, so we took the slow path only to discover later that the
    damage was in fact all-encompassing and our checks had been needless.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
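
The distinction, in a toy model with invented names: a flag-only test
(cf. DAMAGE_IS_ALL) can miss damage that already covers everything but
has not yet been coalesced; reducing first catches it:

    enum damage_mode { DAMAGE_BOXES, DAMAGE_ALL };

    struct damage {
            enum damage_mode mode;
            int covered_w, covered_h; /* extent of accumulated boxes */
    };

    static void damage_reduce(struct damage *d, int w, int h)
    {
            if (d->covered_w >= w && d->covered_h >= h)
                    d->mode = DAMAGE_ALL; /* coalesce to the all state */
    }

    static int damage_is_all(struct damage *d, int w, int h)
    {
            damage_reduce(d, w, h); /* reduce first, then test */
            return d->mode == DAMAGE_ALL;
    }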

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 1d2b999..b6f6772 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -416,6 +416,8 @@ static inline uint32_t default_tiling(PixmapPtr pixmap)
 	if (sna_damage_is_all(&priv->cpu_damage,
 			      pixmap->drawable.width,
 			      pixmap->drawable.height)) {
+		DBG(("%s: entire source is damaged, using Y-tiling\n",
+		     __FUNCTION__));
 		sna_damage_destroy(&priv->gpu_damage);
 		priv->undamaged = false;
 		return I915_TILING_Y;
@@ -1221,7 +1223,9 @@ sna_drawable_move_region_to_cpu(DrawablePtr drawable,
 		return true;
 	}
 
-	if (DAMAGE_IS_ALL(priv->cpu_damage))
+	if (sna_damage_is_all(&priv->cpu_damage,
+			      pixmap->drawable.width,
+			      pixmap->drawable.height))
 		goto out;
 
 	if (priv->clear)
@@ -1245,6 +1249,8 @@ sna_drawable_move_region_to_cpu(DrawablePtr drawable,
 	}
 
 	if ((flags & MOVE_READ) == 0) {
+		DBG(("%s: no read, checking to see if we can stream the write into the GPU bo\n",
+		     __FUNCTION__));
 		assert(flags & MOVE_WRITE);
 
 		if (priv->stride && priv->gpu_bo &&
@@ -1611,7 +1617,9 @@ sna_pixmap_move_area_to_gpu(PixmapPtr pixmap, BoxPtr box, unsigned int flags)
 
 	assert_pixmap_contains_box(pixmap, box);
 
-	if (DAMAGE_IS_ALL(priv->gpu_damage))
+	if (sna_damage_is_all(&priv->gpu_damage,
+			      pixmap->drawable.width,
+			      pixmap->drawable.height))
 		goto done;
 
 	if (priv->gpu_bo == NULL) {
@@ -2081,7 +2089,9 @@ sna_pixmap_move_to_gpu(PixmapPtr pixmap, unsigned flags)
 		return NULL;
 	}
 
-	if (DAMAGE_IS_ALL(priv->gpu_damage)) {
+	if (sna_damage_is_all(&priv->gpu_damage,
+			      pixmap->drawable.width,
+			      pixmap->drawable.height)) {
 		DBG(("%s: already all-damaged\n", __FUNCTION__));
 		goto active;
 	}
commit c4357e72470c8a3be9a37dba5fa3e076c8e592fa
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Feb 3 20:06:43 2012 +0000

    sna: Reduce the downsample tile size to accommodate alignment
    
    If we need to enlarge the sampled tile due to tiling alignments, the
    resulting sample can become larger than we can accommodate through the 3D
    pipeline, resulting in FAIL.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
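
A worked instance of the shrink loop below, under assumed limits
(max_3d_size 8192, 32 bits per pixel, max_copy_tile_size 16MiB):

    #include <stdio.h>

    int main(void)
    {
            int size = 8192 - 4096 / 32; /* 8064: room for tile alignment */

            while (size * size * 4 > 16 * 1024 * 1024)
                    size /= 2;           /* 8064 -> 4032 -> 2016 */

            printf("downsample tile size: %d\n", size); /* 2016 */
            return 0;
    }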

diff --git a/src/sna/sna_render.c b/src/sna/sna_render.c
index bc8b2de..c2b9e79 100644
--- a/src/sna/sna_render.c
+++ b/src/sna/sna_render.c
@@ -622,7 +622,7 @@ static int sna_render_picture_downsample(struct sna *sna,
 	struct sna_pixmap *priv;
 	pixman_transform_t t;
 	PixmapPtr tmp;
-	int width, height;
+	int width, height, size;
 	int sx, sy, ox, oy, ow, oh;
 	int error, ret = 0;
 	BoxRec box, b;
@@ -743,8 +743,13 @@ static int sna_render_picture_downsample(struct sna *sna,
 	ValidatePicture(tmp_dst);
 	ValidatePicture(tmp_src);
 
-	w = sna->render.max_3d_size / sx - 2 * sx;
-	h = sna->render.max_3d_size / sy - 2 * sy;
+	/* Use a small size to accommodate enlargement through tile alignment */
+	size = sna->render.max_3d_size - 4096 / pixmap->drawable.bitsPerPixel;
+	while (size * size * 4 > sna->kgem.max_copy_tile_size)
+		size /= 2;
+
+	w = size / sx - 2 * sx;
+	h = size / sy - 2 * sy;
 	DBG(("%s %d:%d downsampling using %dx%d GPU tiles\n",
 	     __FUNCTION__, (width + w-1)/w, (height + h-1)/h, w, h));
 
diff --git a/src/sna/sna_tiling.c b/src/sna/sna_tiling.c
index 00e111c..c6e898b 100644
--- a/src/sna/sna_tiling.c
+++ b/src/sna/sna_tiling.c
@@ -142,7 +142,8 @@ sna_tiling_composite_done(struct sna *sna,
 
 	/* Use a small step to accommodate enlargement through tile alignment */
 	step = sna->render.max_3d_size;
-	if (tile->dst_x & (8*512 / tile->dst->pDrawable->bitsPerPixel - 1))
+	if (tile->dst_x & (8*512 / tile->dst->pDrawable->bitsPerPixel - 1) ||
+	    tile->dst_y & 63)
 		step /= 2;
 	while (step * step * 4 > sna->kgem.max_copy_tile_size)
 		step /= 2;
@@ -330,7 +331,11 @@ sna_tiling_fill_boxes(struct sna *sna,
 
 	pixman_region_init_rects(&region, box, n);
 
+	/* Use a small step to accommodate enlargement through tile alignment */
 	step = sna->render.max_3d_size;
+	if (region.extents.x1 & (8*512 / dst->drawable.bitsPerPixel - 1) ||
+	    region.extents.y1 & 63)
+		step /= 2;
 	while (step * step * 4 > sna->kgem.max_copy_tile_size)
 		step /= 2;
 
@@ -443,7 +448,10 @@ Bool sna_tiling_blt_copy_boxes(struct sna *sna, uint8_t alu,
 
 	pixman_region_init_rects(&region, box, nbox);
 
+	/* Use a small step to accommodate enlargement through tile alignment */
 	step = sna->render.max_3d_size;
+	if (region.extents.x1 & (8*512 / bpp - 1) || region.extents.y1 & 63)
+		step /= 2;
 	while (step * step * 4 > sna->kgem.max_copy_tile_size)
 		step /= 2;
 
commit 9691e3b8852305a3835f5fb523688fd61f974405
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Feb 3 19:30:24 2012 +0000

    sna: Apply redirection for the render copy into large pixmaps
    
    If the pixmap is larger than the 3D pipeline limits, but the operation's
    extents fit within them, we may be able to create a proxy target that
    transforms the operation into one satisfying the render pipeline's
    constraints.
    
    This fixes the infinite recursion hit with partially displayed extremely
    large images.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
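
The proxy target only needs to cover the bounding extents of the boxes
being copied. A minimal, self-contained sketch of that bounding union,
with a locally defined BoxRec:

    typedef struct { short x1, y1, x2, y2; } BoxRec;

    static BoxRec box_extents(const BoxRec *box, int n)
    {
            BoxRec extents = box[0];
            int i;

            /* grow the extents to enclose every box */
            for (i = 1; i < n; i++) {
                    if (box[i].x1 < extents.x1) extents.x1 = box[i].x1;
                    if (box[i].y1 < extents.y1) extents.y1 = box[i].y1;
                    if (box[i].x2 > extents.x2) extents.x2 = box[i].x2;
                    if (box[i].y2 > extents.y2) extents.y2 = box[i].y2;
            }
            return extents;
    }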

diff --git a/src/sna/gen2_render.c b/src/sna/gen2_render.c
index 7250d66..97b558d 100644
--- a/src/sna/gen2_render.c
+++ b/src/sna/gen2_render.c
@@ -2852,9 +2852,7 @@ gen2_render_copy_boxes(struct sna *sna, uint8_t alu,
 
 	if (src_bo == dst_bo || /* XXX handle overlap using 3D ? */
 	    too_large(src->drawable.width, src->drawable.height) ||
-	    src_bo->pitch > MAX_3D_PITCH ||
-	    too_large(dst->drawable.width, dst->drawable.height) ||
-	    dst_bo->pitch < 8 || dst_bo->pitch > MAX_3D_PITCH) {
+	    src_bo->pitch > MAX_3D_PITCH || dst_bo->pitch < 8) {
 fallback:
 		return sna_blt_copy_boxes_fallback(sna, alu,
 						   src, src_bo, src_dx, src_dy,
@@ -2876,10 +2874,39 @@ fallback:
 	tmp.dst.height = dst->drawable.height;
 	tmp.dst.format = sna_format_for_depth(dst->drawable.depth);
 	tmp.dst.bo = dst_bo;
+	tmp.dst.x = tmp.dst.y = 0;
+
+	sna_render_composite_redirect_init(&tmp);
+	if (too_large(tmp.dst.width, tmp.dst.height) ||
+	    dst_bo->pitch > MAX_3D_PITCH) {
+		BoxRec extents = box[0];
+		int i;
+
+		for (i = 1; i < n; i++) {
+			if (extents.x1 < box[i].x1)
+				extents.x1 = box[i].x1;
+			if (extents.y1 < box[i].y1)
+				extents.y1 = box[i].y1;
+
+			if (extents.x2 > box[i].x2)
+				extents.x2 = box[i].x2;
+			if (extents.y2 > box[i].y2)
+				extents.y2 = box[i].y2;
+		}
+		if (!sna_render_composite_redirect(sna, &tmp,
+						   extents.x1, extents.y1,
+						   extents.x2 - extents.x1,
+						   extents.y2 - extents.y1))
+			goto fallback_tiled;
+	}
 
 	tmp.floats_per_vertex = 4;
 	tmp.floats_per_rect = 12;
 
+	dst_dx += tmp.dst.x;
+	dst_dy += tmp.dst.y;
+	tmp.dst.x = tmp.dst.y = 0;
+
 	gen2_render_copy_setup_source(&tmp.src, src, src_bo);
 	gen2_emit_copy_state(sna, &tmp);
 	do {
@@ -2917,7 +2944,14 @@ fallback:
 	} while (n);
 
 	gen2_vertex_flush(sna, &tmp);
+	sna_render_composite_redirect_done(sna, &tmp);
 	return TRUE;
+
+fallback_tiled:
+	return sna_tiling_copy_boxes(sna, alu,
+				     src, src_bo, src_dx, src_dy,
+				     dst, dst_bo, dst_dx, dst_dy,
+				     box, n);
 }
 
 static void
diff --git a/src/sna/gen3_render.c b/src/sna/gen3_render.c
index 784d399..d5f5617 100644
--- a/src/sna/gen3_render.c
+++ b/src/sna/gen3_render.c
@@ -3841,10 +3841,8 @@ gen3_render_copy_boxes(struct sna *sna, uint8_t alu,
 	if (!(alu == GXcopy || alu == GXclear) ||
 	    src_bo == dst_bo || /* XXX handle overlap using 3D ? */
 	    src_bo->pitch > MAX_3D_PITCH ||
-	    too_large(src->drawable.width, src->drawable.height) ||
-	    dst_bo->pitch > MAX_3D_PITCH ||
-	    too_large(dst->drawable.width, dst->drawable.height)) {
-fallback:
+	    too_large(src->drawable.width, src->drawable.height)) {
+fallback_blt:
 		return sna_blt_copy_boxes_fallback(sna, alu,
 						   src, src_bo, src_dx, src_dy,
 						   dst, dst_bo, dst_dx, dst_dy,
@@ -3854,7 +3852,7 @@ fallback:
 	if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL)) {
 		kgem_submit(&sna->kgem);
 		if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL))
-			goto fallback;
+			goto fallback_blt;
 	}
 
 	memset(&tmp, 0, sizeof(tmp));
@@ -3865,6 +3863,31 @@ fallback:
 	tmp.dst.height = dst->drawable.height;
 	tmp.dst.format = sna_format_for_depth(dst->drawable.depth);
 	tmp.dst.bo = dst_bo;
+	tmp.dst.x = tmp.dst.y = 0;
+
+	sna_render_composite_redirect_init(&tmp);
+	if (too_large(tmp.dst.width, tmp.dst.height) ||
+	    dst_bo->pitch > MAX_3D_PITCH) {
+		BoxRec extents = box[0];
+		int i;
+
+		for (i = 1; i < n; i++) {
+			if (extents.x1 < box[i].x1)
+				extents.x1 = box[i].x1;
+			if (extents.y1 < box[i].y1)
+				extents.y1 = box[i].y1;
+
+			if (extents.x2 > box[i].x2)
+				extents.x2 = box[i].x2;
+			if (extents.y2 > box[i].y2)
+				extents.y2 = box[i].y2;
+		}
+		if (!sna_render_composite_redirect(sna, &tmp,
+						   extents.x1, extents.y1,
+						   extents.x2 - extents.x1,
+						   extents.y2 - extents.y1))
+			goto fallback_tiled;
+	}
 
 	gen3_render_copy_setup_source(&tmp.src, src, src_bo);
 
@@ -3873,6 +3896,10 @@ fallback:
 	tmp.mask.bo = NULL;
 	tmp.mask.u.gen3.type = SHADER_NONE;
 
+	dst_dx += tmp.dst.x;
+	dst_dy += tmp.dst.y;
+	tmp.dst.x = tmp.dst.y = 0;
+
 	gen3_emit_composite_state(sna, &tmp);
 	gen3_align_vertex(sna, &tmp);
 
@@ -3911,7 +3938,14 @@ fallback:
 	} while (n);
 
 	gen3_vertex_flush(sna);
+	sna_render_composite_redirect_done(sna, &tmp);
 	return TRUE;
+
+fallback_tiled:
+	return sna_tiling_copy_boxes(sna, alu,
+				     src, src_bo, src_dx, src_dy,
+				     dst, dst_bo, dst_dx, dst_dy,
+				     box, n);
 }
 
 static void
diff --git a/src/sna/gen4_render.c b/src/sna/gen4_render.c
index ffdcbb7..b3a64d9 100644
--- a/src/sna/gen4_render.c
+++ b/src/sna/gen4_render.c
@@ -289,6 +289,13 @@ gen4_emit_pipelined_pointers(struct sna *sna,
 #define OUT_VERTEX(x,y) vertex_emit_2s(sna, x,y)
 #define OUT_VERTEX_F(v) vertex_emit(sna, v)
 
+#define GEN4_MAX_3D_SIZE 8192
+
+static inline bool too_large(int width, int height)
+{
+	return width > GEN4_MAX_3D_SIZE || height > GEN4_MAX_3D_SIZE;
+}
+
 static int
 gen4_choose_composite_kernel(int op, Bool has_mask, Bool is_ca, Bool is_affine)
 {
@@ -1884,7 +1891,7 @@ gen4_composite_picture(struct sna *sna,
 		return sna_render_picture_convert(sna, picture, channel, pixmap,
 						  x, y, w, h, dst_x, dst_y);
 
-	if (pixmap->drawable.width > 8192 || pixmap->drawable.height > 8192)
+	if (too_large(pixmap->drawable.width, pixmap->drawable.height))
 		return sna_render_picture_extract(sna, picture, channel,
 						  x, y, w, h, dst_x, dst_y);
 
@@ -1983,7 +1990,7 @@ try_blt(struct sna *sna,
 		return TRUE;
 	}
 
-	if (width > 8192 || height > 8192) {
+	if (too_large(width, height)) {
 		DBG(("%s: operation too large for 3D pipe (%d, %d)\n",
 		     __FUNCTION__, width, height));
 		return TRUE;
@@ -2221,11 +2228,10 @@ gen4_render_composite(struct sna *sna,
 		return FALSE;
 	sna_render_reduce_damage(tmp, dst_x, dst_y, width, height);
 
-	if (tmp->dst.width > 8192 || tmp->dst.height > 8192) {
-		if (!sna_render_composite_redirect(sna, tmp,
-						   dst_x, dst_y, width, height))
+	if (too_large(tmp->dst.width, tmp->dst.height) &&
+	    !sna_render_composite_redirect(sna, tmp,
+					   dst_x, dst_y, width, height))
 			return FALSE;
-	}
 
 	switch (gen4_composite_picture(sna, src, &tmp->src,
 				       src_x, src_y,
@@ -2432,10 +2438,8 @@ gen4_render_copy_boxes(struct sna *sna, uint8_t alu,
 			       box, n))
 		return TRUE;
 
-	if (!(alu == GXcopy || alu == GXclear) || src_bo == dst_bo ||
-	    src->drawable.width > 8192 || src->drawable.height > 8192 ||
-	    dst->drawable.width > 8192 || dst->drawable.height > 8192) {
-fallback:
+	if (!(alu == GXcopy || alu == GXclear) || src_bo == dst_bo) {
+fallback_blt:
 		if (!sna_blt_compare_depth(&src->drawable, &dst->drawable))
 			return FALSE;
 
@@ -2458,24 +2462,73 @@ fallback:
 		tmp.src.pict_format = sna_format_for_depth(src->drawable.depth);
 	}
 	if (!gen4_check_format(tmp.src.pict_format))
-		goto fallback;
+		goto fallback_blt;
 
 	tmp.op = alu == GXcopy ? PictOpSrc : PictOpClear;
 
 	tmp.dst.pixmap = dst;
 	tmp.dst.width  = dst->drawable.width;
 	tmp.dst.height = dst->drawable.height;
+	tmp.dst.x = tmp.dst.y = 0;
 	tmp.dst.bo = dst_bo;
-	tmp.dst.x = dst_dx;
-	tmp.dst.y = dst_dy;
 
-	tmp.src.bo = src_bo;
+	sna_render_composite_redirect_init(&tmp);
+	if (too_large(tmp.dst.width, tmp.dst.height)) {
+		BoxRec extents = box[0];
+		int i;
+
+		for (i = 1; i < n; i++) {
+			if (extents.x1 < box[i].x1)
+				extents.x1 = box[i].x1;
+			if (extents.y1 < box[i].y1)
+				extents.y1 = box[i].y1;
+
+			if (extents.x2 > box[i].x2)
+				extents.x2 = box[i].x2;
+			if (extents.y2 > box[i].y2)
+				extents.y2 = box[i].y2;
+		}
+		if (!sna_render_composite_redirect(sna, &tmp,
+						   extents.x1, extents.y1,
+						   extents.x2 - extents.x1,
+						   extents.y2 - extents.y1))
+			goto fallback_tiled;
+	}
+
 	tmp.src.filter = SAMPLER_FILTER_NEAREST;
 	tmp.src.repeat = SAMPLER_EXTEND_NONE;
-	tmp.src.card_format =
-		gen4_get_card_format(tmp.src.pict_format),
-	tmp.src.width  = src->drawable.width;
-	tmp.src.height = src->drawable.height;
+	tmp.src.card_format = gen4_get_card_format(tmp.src.pict_format);
+	if (too_large(src->drawable.width, src->drawable.height)) {
+		BoxRec extents = box[0];
+		int i;
+
+		for (i = 1; i < n; i++) {
+			if (extents.x1 < box[i].x1)
+				extents.x1 = box[i].x1;
+			if (extents.y1 < box[i].y1)
+				extents.y1 = box[i].y1;
+
+			if (extents.x2 > box[i].x2)
+				extents.x2 = box[i].x2;
+			if (extents.y2 > box[i].y2)
+				extents.y2 = box[i].y2;
+		}
+
+		if (!sna_render_pixmap_partial(sna, src, src_bo, &tmp.src,
+					       extents.x1 + src_dx,
+					       extents.y1 + src_dy,
+					       extents.x2 - extents.x1,
+					       extents.y2 - extents.y1)) {
+			goto fallback_tiled_dst;
+		}
+	} else {
+		tmp.src.bo = kgem_bo_reference(src_bo);
+		tmp.src.width  = src->drawable.width;
+		tmp.src.height = src->drawable.height;
+		tmp.src.offset[0] = tmp.src.offset[1] = 0;
+		tmp.src.scale[0] = 1.f/src->drawable.width;
+		tmp.src.scale[1] = 1.f/src->drawable.height;
+	}
 
 	tmp.mask.bo = NULL;
 
@@ -2487,9 +2540,16 @@ fallback:
 	if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL)) {
 		kgem_submit(&sna->kgem);
 		if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL))
-			goto fallback;
+			goto fallback_tiled_src;
 	}
 
+	dst_dx += tmp.dst.x;
+	dst_dy += tmp.dst.y;
+	tmp.dst.x = tmp.dst.y = 0;
+
+	src_dx += tmp.src.offset[0];
+	src_dy += tmp.src.offset[1];
+
 	gen4_copy_bind_surfaces(sna, &tmp);
 	gen4_align_vertex(sna, &tmp);
 
@@ -2499,10 +2559,23 @@ fallback:
 		gen4_render_copy_one(sna, &tmp,
 				     box->x1 + src_dx, box->y1 + src_dy,
 				     box->x2 - box->x1, box->y2 - box->y1,
-				     box->x1, box->y1);
+				     box->x1 + dst_dx, box->y1 + dst_dy);
 		box++;
 	} while (--n);
+	sna_render_composite_redirect_done(sna, &tmp);
+	kgem_bo_destroy(&sna->kgem, tmp.src.bo);
 	return TRUE;
+
+fallback_tiled_src:
+	kgem_bo_destroy(&sna->kgem, tmp.src.bo);
+fallback_tiled_dst:
+	if (tmp.redirect.real_bo)
+		kgem_bo_destroy(&sna->kgem, tmp.dst.bo);
+fallback_tiled:
+	return sna_tiling_copy_boxes(sna, alu,
+				     src, src_bo, src_dx, src_dy,
+				     dst, dst_bo, dst_dx, dst_dy,
+				     box, n);
 }
 
 static void
@@ -2552,8 +2625,8 @@ gen4_render_copy(struct sna *sna, uint8_t alu,
 		return TRUE;
 
 	if (!(alu == GXcopy || alu == GXclear) || src_bo == dst_bo ||
-	    src->drawable.width > 8192 || src->drawable.height > 8192 ||
-	    dst->drawable.width > 8192 || dst->drawable.height > 8192) {
+	    too_large(src->drawable.width, src->drawable.height) ||
+	    too_large(dst->drawable.width, dst->drawable.height)) {
 fallback:
 		if (!sna_blt_compare_depth(&src->drawable, &dst->drawable))
 			return FALSE;
@@ -2683,10 +2756,7 @@ gen4_render_fill_boxes(struct sna *sna,
 		return FALSE;
 	}
 
-	if (prefer_blt(sna) ||
-	    dst->drawable.width > 8192 ||
-	    dst->drawable.height > 8192 ||
-	    !gen4_check_dst_format(format)) {
+	if (prefer_blt(sna) || too_large(dst->drawable.width, dst->drawable.height)) {
 		uint8_t alu = -1;
 
 		if (op == PictOpClear || (op == PictOpOutReverse && color->alpha >= 0xff00))
@@ -2715,7 +2785,7 @@ gen4_render_fill_boxes(struct sna *sna,
 		if (!gen4_check_dst_format(format))
 			return FALSE;
 
-		if (dst->drawable.width > 8192 || dst->drawable.height > 8192)
+		if (too_large(dst->drawable.width, dst->drawable.height))
 			return sna_tiling_fill_boxes(sna, op, format, color,
 						     dst, dst_bo, box, n);
 	}
@@ -2834,7 +2904,7 @@ gen4_render_fill(struct sna *sna, uint8_t alu,
 		return TRUE;
 
 	if (!(alu == GXcopy || alu == GXclear) ||
-	    dst->drawable.width > 8192 || dst->drawable.height > 8192)
+	    too_large(dst->drawable.width, dst->drawable.height))
 		return sna_blt_fill(sna, alu,
 				    dst_bo, dst->drawable.bitsPerPixel,
 				    color,
@@ -2925,7 +2995,7 @@ gen4_render_fill_one(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo,
 
 	/* Must use the BLT if we can't RENDER... */
 	if (!(alu == GXcopy || alu == GXclear) ||
-	    dst->drawable.width > 8192 || dst->drawable.height > 8192)
+	    too_large(dst->drawable.width, dst->drawable.height))
 		return FALSE;
 
 	if (alu == GXclear)
@@ -3251,7 +3321,7 @@ Bool gen4_render_init(struct sna *sna)
 	sna->render.reset = gen4_render_reset;
 	sna->render.fini = gen4_render_fini;
 
-	sna->render.max_3d_size = 8192;
+	sna->render.max_3d_size = GEN4_MAX_3D_SIZE;
 	sna->render.max_3d_pitch = 1 << 18;
 	return TRUE;
 }
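
Each copy-boxes path touched by this commit (gen4 above, gen5/gen6/gen7 and
the tiled fallback below) opens with the same bounding-box accumulation
before deciding whether to redirect the destination or extract a partial
source. Reduced to a standalone helper -- the name is ours, purely for
illustration -- it is:

static BoxRec boxes_extents(const BoxRec *box, int n)
{
	BoxRec extents = box[0];
	int i;

	/* Union: the min corner may only shrink, the max corner only grow. */
	for (i = 1; i < n; i++) {
		if (box[i].x1 < extents.x1)
			extents.x1 = box[i].x1;
		if (box[i].y1 < extents.y1)
			extents.y1 = box[i].y1;
		if (box[i].x2 > extents.x2)
			extents.x2 = box[i].x2;
		if (box[i].y2 > extents.y2)
			extents.y2 = box[i].y2;
	}
	return extents;
}

Computing the intersection here instead would shrink the redirected target
until boxes fall outside it, so the comparisons must widen, never narrow.
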
diff --git a/src/sna/gen5_render.c b/src/sna/gen5_render.c
index 03dc8c9..933c51f 100644
--- a/src/sna/gen5_render.c
+++ b/src/sna/gen5_render.c
@@ -1378,6 +1378,9 @@ gen5_emit_drawing_rectangle(struct sna *sna, const struct sna_composite_op *op)
 	uint32_t limit = (op->dst.height - 1) << 16 | (op->dst.width - 1);
 	uint32_t offset = (uint16_t)op->dst.y << 16 | (uint16_t)op->dst.x;
 
+	assert(!too_large(op->dst.x, op->dst.y));
+	assert(!too_large(op->dst.width, op->dst.height));
+
 	if (!DBG_NO_STATE_CACHE &&
 	    sna->render_state.gen5.drawrect_limit == limit &&
 	    sna->render_state.gen5.drawrect_offset == offset)
@@ -2731,20 +2734,6 @@ gen5_copy_bind_surfaces(struct sna *sna,
 	gen5_emit_state(sna, op, offset);
 }
 
-static inline bool untiled_tlb_miss(struct kgem_bo *bo)
-{
-	return bo->tiling == I915_TILING_NONE && bo->pitch >= 4096;
-}
-
-static inline bool prefer_blt_copy(struct sna *sna,
-				   struct kgem_bo *src_bo,
-				   struct kgem_bo *dst_bo)
-{
-	return (sna->kgem.ring != KGEM_RENDER ||
-		untiled_tlb_miss(src_bo) ||
-		untiled_tlb_miss(dst_bo));
-}
-
 static Bool
 gen5_render_copy_boxes(struct sna *sna, uint8_t alu,
 		       PixmapPtr src, struct kgem_bo *src_bo, int16_t src_dx, int16_t src_dy,
@@ -2753,8 +2742,7 @@ gen5_render_copy_boxes(struct sna *sna, uint8_t alu,
 {
 	struct sna_composite_op tmp;
 
-	if (prefer_blt_copy(sna, src_bo, dst_bo) &&
-	    sna_blt_compare_depth(&src->drawable, &dst->drawable) &&
+	if (sna_blt_compare_depth(&src->drawable, &dst->drawable) &&
 	    sna_blt_copy_boxes(sna, alu,
 			       src_bo, src_dx, src_dy,
 			       dst_bo, dst_dx, dst_dy,
@@ -2762,12 +2750,10 @@ gen5_render_copy_boxes(struct sna *sna, uint8_t alu,
 			       box, n))
 		return TRUE;
 
-	if (!(alu == GXcopy || alu == GXclear) || src_bo == dst_bo ||
-	    too_large(src->drawable.width, src->drawable.height) ||
-	    too_large(dst->drawable.width, dst->drawable.height)) {
-fallback:
-	    if (!sna_blt_compare_depth(&src->drawable, &dst->drawable))
-		    return FALSE;
+	if (!(alu == GXcopy || alu == GXclear) || src_bo == dst_bo) {
+fallback_blt:
+		if (!sna_blt_compare_depth(&src->drawable, &dst->drawable))
+			return FALSE;
 
 		return sna_blt_copy_boxes_fallback(sna, alu,
 						   src, src_bo, src_dx, src_dy,
@@ -2787,7 +2773,7 @@ fallback:
 	if (!gen5_check_format(tmp.src.pict_format)) {
 		DBG(("%s: unsupported source format, %x, use BLT\n",
 		     __FUNCTION__, tmp.src.pict_format));
-		goto fallback;
+		goto fallback_blt;
 	}
 
 	DBG(("%s (%d, %d)->(%d, %d) x %d\n",
@@ -2798,17 +2784,66 @@ fallback:
 	tmp.dst.pixmap = dst;
 	tmp.dst.width  = dst->drawable.width;
 	tmp.dst.height = dst->drawable.height;
+	tmp.dst.x = tmp.dst.y = 0;
 	tmp.dst.bo = dst_bo;
-	tmp.dst.x = dst_dx;
-	tmp.dst.y = dst_dy;
 
-	tmp.src.bo = src_bo;
+	sna_render_composite_redirect_init(&tmp);
+	if (too_large(tmp.dst.width, tmp.dst.height)) {
+		BoxRec extents = box[0];
+		int i;
+
+		for (i = 1; i < n; i++) {
+			if (extents.x1 > box[i].x1)
+				extents.x1 = box[i].x1;
+			if (extents.y1 > box[i].y1)
+				extents.y1 = box[i].y1;
+
+			if (extents.x2 < box[i].x2)
+				extents.x2 = box[i].x2;
+			if (extents.y2 < box[i].y2)
+				extents.y2 = box[i].y2;
+		}
+
+		if (!sna_render_composite_redirect(sna, &tmp,
+						   extents.x1, extents.y1,
+						   extents.x2 - extents.x1,
+						   extents.y2 - extents.y1))
+			goto fallback_tiled;
+	}
+
 	tmp.src.filter = SAMPLER_FILTER_NEAREST;
 	tmp.src.repeat = SAMPLER_EXTEND_NONE;
-	tmp.src.card_format =
-		gen5_get_card_format(tmp.src.pict_format);
-	tmp.src.width  = src->drawable.width;
-	tmp.src.height = src->drawable.height;
+	tmp.src.card_format = gen5_get_card_format(tmp.src.pict_format);
+	if (too_large(src->drawable.width, src->drawable.height)) {
+		BoxRec extents = box[0];
+		int i;
+
+		for (i = 1; i < n; i++) {
+			if (extents.x1 > box[i].x1)
+				extents.x1 = box[i].x1;
+			if (extents.y1 > box[i].y1)
+				extents.y1 = box[i].y1;
+
+			if (extents.x2 < box[i].x2)
+				extents.x2 = box[i].x2;
+			if (extents.y2 < box[i].y2)
+				extents.y2 = box[i].y2;
+		}
+
+		if (!sna_render_pixmap_partial(sna, src, src_bo, &tmp.src,
+					       extents.x1 + src_dx,
+					       extents.y1 + src_dy,
+					       extents.x2 - extents.x1,
+					       extents.y2 - extents.y1))
+			goto fallback_tiled_dst;
+	} else {
+		tmp.src.bo = kgem_bo_reference(src_bo);
+		tmp.src.width  = src->drawable.width;
+		tmp.src.height = src->drawable.height;
+		tmp.src.offset[0] = tmp.src.offset[1] = 0;
+		tmp.src.scale[0] = 1.f/src->drawable.width;
+		tmp.src.scale[1] = 1.f/src->drawable.height;
+	}
 
 	tmp.is_affine = TRUE;
 	tmp.floats_per_vertex = 3;
@@ -2819,24 +2854,19 @@ fallback:
 	if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL)) {
 		kgem_submit(&sna->kgem);
 		if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL))
-			goto fallback;
+			goto fallback_tiled_src;
 	}
 
-	if (kgem_bo_is_dirty(src_bo)) {
-		if (sna_blt_compare_depth(&src->drawable, &dst->drawable) &&
-		    sna_blt_copy_boxes(sna, alu,
-				       src_bo, src_dx, src_dy,
-				       dst_bo, dst_dx, dst_dy,
-				       dst->drawable.bitsPerPixel,
-				       box, n))
-			return TRUE;
-	}
+	dst_dx += tmp.dst.x;
+	dst_dy += tmp.dst.y;
+	tmp.dst.x = tmp.dst.y = 0;
+
+	src_dx += tmp.src.offset[0];
+	src_dy += tmp.src.offset[1];
 
 	gen5_copy_bind_surfaces(sna, &tmp);
 	gen5_align_vertex(sna, &tmp);
 
-	tmp.src.scale[0] = 1.f/src->drawable.width;
-	tmp.src.scale[1] = 1.f/src->drawable.height;
 	do {
 		int n_this_time = gen5_get_rectangles(sna, &tmp, n);
 		if (n_this_time == 0) {
@@ -2850,15 +2880,15 @@ fallback:
 			     box->x1 + src_dx, box->y1 + src_dy,
 			     box->x1 + dst_dx, box->y1 + dst_dy,
 			     box->x2 - box->x1, box->y2 - box->y1));
-			OUT_VERTEX(box->x2, box->y2);
+			OUT_VERTEX(box->x2 + dst_dx, box->y2 + dst_dy);
 			OUT_VERTEX_F((box->x2 + src_dx) * tmp.src.scale[0]);
 			OUT_VERTEX_F((box->y2 + src_dy) * tmp.src.scale[1]);
 
-			OUT_VERTEX(box->x1, box->y2);
+			OUT_VERTEX(box->x1 + dst_dx, box->y2 + dst_dy);
 			OUT_VERTEX_F((box->x1 + src_dx) * tmp.src.scale[0]);
 			OUT_VERTEX_F((box->y2 + src_dy) * tmp.src.scale[1]);
 
-			OUT_VERTEX(box->x1, box->y1);
+			OUT_VERTEX(box->x1 + dst_dx, box->y1 + dst_dy);
 			OUT_VERTEX_F((box->x1 + src_dx) * tmp.src.scale[0]);
 			OUT_VERTEX_F((box->y1 + src_dy) * tmp.src.scale[1]);
 
@@ -2867,7 +2897,20 @@ fallback:
 	} while (n);
 
 	gen5_vertex_flush(sna);
+	sna_render_composite_redirect_done(sna, &tmp);
+	kgem_bo_destroy(&sna->kgem, tmp.src.bo);
 	return TRUE;
+
+fallback_tiled_src:
+	kgem_bo_destroy(&sna->kgem, tmp.src.bo);
+fallback_tiled_dst:
+	if (tmp.redirect.real_bo)
+		kgem_bo_destroy(&sna->kgem, tmp.dst.bo);
+fallback_tiled:
+	return sna_tiling_copy_boxes(sna, alu,
+				     src, src_bo, src_dx, src_dy,
+				     dst, dst_bo, dst_dx, dst_dy,
+				     box, n);
 }
 
 static void
@@ -2916,8 +2959,7 @@ gen5_render_copy(struct sna *sna, uint8_t alu,
 {
 	DBG(("%s (alu=%d)\n", __FUNCTION__, alu));
 
-	if (prefer_blt_copy(sna, src_bo, dst_bo) &&
-	    sna_blt_compare_depth(&src->drawable, &dst->drawable) &&
+	if (sna_blt_compare_depth(&src->drawable, &dst->drawable) &&
 	    sna_blt_copy(sna, alu,
 			 src_bo, dst_bo,
 			 dst->drawable.bitsPerPixel,
diff --git a/src/sna/gen6_render.c b/src/sna/gen6_render.c
index 9f799ef..08f9668 100644
--- a/src/sna/gen6_render.c
+++ b/src/sna/gen6_render.c
@@ -229,6 +229,11 @@ static const struct formatinfo {
 #define OUT_VERTEX(x,y) vertex_emit_2s(sna, x,y)
 #define OUT_VERTEX_F(v) vertex_emit(sna, v)
 
+static inline bool too_large(int width, int height)
+{
+	return width > GEN6_MAX_SIZE || height > GEN6_MAX_SIZE;
+}
+
 static uint32_t gen6_get_blend(int op,
 			       bool has_component_alpha,
 			       uint32_t dst_format)
@@ -708,6 +713,9 @@ gen6_emit_drawing_rectangle(struct sna *sna,
 	uint32_t limit = (op->dst.height - 1) << 16 | (op->dst.width - 1);
 	uint32_t offset = (uint16_t)op->dst.y << 16 | (uint16_t)op->dst.x;
 
+	assert(!too_large(op->dst.x, op->dst.y));
+	assert(!too_large(op->dst.width, op->dst.height));
+
 	if  (sna->render_state.gen6.drawrect_limit  == limit &&
 	     sna->render_state.gen6.drawrect_offset == offset)
 		return false;
@@ -2061,11 +2069,6 @@ gen6_composite_solid_init(struct sna *sna,
 	return channel->bo != NULL;
 }
 
-static inline bool too_large(int width, int height)
-{
-	return width > GEN6_MAX_SIZE || height > GEN6_MAX_SIZE;
-}
-
 static int
 gen6_composite_picture(struct sna *sna,
 		       PicturePtr picture,
@@ -3082,13 +3085,22 @@ static inline bool untiled_tlb_miss(struct kgem_bo *bo)
 	return bo->tiling == I915_TILING_NONE && bo->pitch >= 4096;
 }
 
+static bool prefer_blt_bo(struct sna *sna,
+			  PixmapPtr pixmap,
+			  struct kgem_bo *bo)
+{
+	return (too_large(pixmap->drawable.width, pixmap->drawable.height) ||
+		untiled_tlb_miss(bo)) &&
+		kgem_bo_can_blt(&sna->kgem, bo);
+}
+
 static inline bool prefer_blt_copy(struct sna *sna,
-				   struct kgem_bo *src_bo,
-				   struct kgem_bo *dst_bo)
+				   PixmapPtr src, struct kgem_bo *src_bo,
+				   PixmapPtr dst, struct kgem_bo *dst_bo)
 {
-	return (prefer_blt_ring(sna) ||
-		untiled_tlb_miss(src_bo) ||
-		untiled_tlb_miss(dst_bo));
+	return (sna->kgem.ring != KGEM_RENDER ||
+		prefer_blt_bo(sna, src, src_bo) ||
+		prefer_blt_bo(sna, dst, dst_bo));
 }
 
 static inline bool
@@ -3148,7 +3160,7 @@ gen6_render_copy_boxes(struct sna *sna, uint8_t alu,
 		      dst_bo, dst_dx, dst_dy,
 		      box, n)));
 
-	if (prefer_blt_copy(sna, src_bo, dst_bo) &&
+	if (prefer_blt_copy(sna, src, src_bo, dst, dst_bo) &&
 	    sna_blt_compare_depth(&src->drawable, &dst->drawable) &&
 	    sna_blt_copy_boxes(sna, alu,
 			       src_bo, src_dx, src_dy,
@@ -3160,26 +3172,15 @@ gen6_render_copy_boxes(struct sna *sna, uint8_t alu,
 	if (!(alu == GXcopy || alu == GXclear) ||
 	    overlaps(src_bo, src_dx, src_dy,
 		     dst_bo, dst_dx, dst_dy,
-		     box, n) ||
-	    too_large(src->drawable.width, src->drawable.height) ||
-	    too_large(dst->drawable.width, dst->drawable.height)) {
-fallback:
+		     box, n)) {
+fallback_blt:
 		if (!sna_blt_compare_depth(&src->drawable, &dst->drawable))
 			return false;
 
-		if (sna_blt_copy_boxes_fallback(sna, alu,
+		return sna_blt_copy_boxes_fallback(sna, alu,
 						 src, src_bo, src_dx, src_dy,
 						 dst, dst_bo, dst_dx, dst_dy,
-						 box, n))
-			return true;
-
-		return false;
-#if 0
-		return sna_tiling_copy_boxes(sna,
-					     src, src_bo, src_dx, src_dy,
-					     dst, dst_bo, dst_dx, dst_dy,
-					     box, n);
-#endif
+						 box, n);
 	}
 
 	if (dst->drawable.depth == src->drawable.depth) {
@@ -3190,25 +3191,73 @@ fallback:
 		tmp.src.pict_format = sna_format_for_depth(src->drawable.depth);
 	}
 	if (!gen6_check_format(tmp.src.pict_format))
-		goto fallback;
+		goto fallback_blt;
 
 	tmp.op = alu == GXcopy ? PictOpSrc : PictOpClear;
 
 	tmp.dst.pixmap = dst;
-	tmp.dst.x = tmp.dst.y = 0;
 	tmp.dst.width  = dst->drawable.width;
 	tmp.dst.height = dst->drawable.height;
 	tmp.dst.bo = dst_bo;
-	tmp.dst.x = dst_dx;
-	tmp.dst.y = dst_dy;
+	tmp.dst.x = tmp.dst.y = 0;
+
+	sna_render_composite_redirect_init(&tmp);
+	if (too_large(tmp.dst.width, tmp.dst.height)) {
+		BoxRec extents = box[0];
+		int i;
+
+		for (i = 1; i < n; i++) {
+			if (extents.x1 > box[i].x1)
+				extents.x1 = box[i].x1;
+			if (extents.y1 > box[i].y1)
+				extents.y1 = box[i].y1;
+
+			if (extents.x2 < box[i].x2)
+				extents.x2 = box[i].x2;
+			if (extents.y2 < box[i].y2)
+				extents.y2 = box[i].y2;
+		}
+		if (!sna_render_composite_redirect(sna, &tmp,
+						   extents.x1, extents.y1,
+						   extents.x2 - extents.x1,
+						   extents.y2 - extents.y1))
+			goto fallback_tiled;
+	}
 
-	tmp.src.bo = src_bo;
 	tmp.src.filter = SAMPLER_FILTER_NEAREST;
 	tmp.src.repeat = SAMPLER_EXTEND_NONE;
 	tmp.src.card_format =
 		gen6_get_card_format(tmp.src.pict_format);
-	tmp.src.width  = src->drawable.width;
-	tmp.src.height = src->drawable.height;
+	if (too_large(src->drawable.width, src->drawable.height)) {
+		BoxRec extents = box[0];
+		int i;
+
+		for (i = 1; i < n; i++) {
+			if (extents.x1 > box[i].x1)
+				extents.x1 = box[i].x1;
+			if (extents.y1 > box[i].y1)
+				extents.y1 = box[i].y1;
+
+			if (extents.x2 < box[i].x2)
+				extents.x2 = box[i].x2;
+			if (extents.y2 < box[i].y2)
+				extents.y2 = box[i].y2;
+		}
+
+		if (!sna_render_pixmap_partial(sna, src, src_bo, &tmp.src,
+					       extents.x1 + src_dx,
+					       extents.y1 + src_dy,
+					       extents.x2 - extents.x1,
+					       extents.y2 - extents.y1))
+			goto fallback_tiled_dst;
+	} else {
+		tmp.src.bo = kgem_bo_reference(src_bo);
+		tmp.src.width  = src->drawable.width;
+		tmp.src.height = src->drawable.height;
+		tmp.src.offset[0] = tmp.src.offset[1] = 0;
+		tmp.src.scale[0] = 1.f/src->drawable.width;
+		tmp.src.scale[1] = 1.f/src->drawable.height;
+	}
 
 	tmp.mask.bo = NULL;
 	tmp.mask.filter = SAMPLER_FILTER_NEAREST;
@@ -3229,10 +3278,17 @@ fallback:
 	if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL)) {
 		kgem_submit(&sna->kgem);
 		if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL))
-			goto fallback;
+			goto fallback_tiled_src;
 		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
 	}
 
+	dst_dx += tmp.dst.x;
+	dst_dy += tmp.dst.y;
+	tmp.dst.x = tmp.dst.y = 0;
+
+	src_dx += tmp.src.offset[0];
+	src_dy += tmp.src.offset[1];
+
 	gen6_emit_copy_state(sna, &tmp);
 	gen6_align_vertex(sna, &tmp);
 
@@ -3256,9 +3312,9 @@ fallback:
 			     box->x1 + src_dx, box->y1 + src_dy,
 			     box->x1 + dst_dx, box->y1 + dst_dy,
 			     box->x2 - box->x1, box->y2 - box->y1));
-			v[0] = pack_2s(box->x2, box->y2);
-			v[3] = pack_2s(box->x1, box->y2);
-			v[6] = pack_2s(box->x1, box->y1);
+			v[0] = pack_2s(box->x2 + dst_dx, box->y2 + dst_dy);
+			v[3] = pack_2s(box->x1 + dst_dx, box->y2 + dst_dy);
+			v[6] = pack_2s(box->x1 + dst_dx, box->y1 + dst_dy);
 
 			v[1] = (box->x2 + src_dx) * tmp.src.scale[0];
 			v[7] = v[4] = (box->x1 + src_dx) * tmp.src.scale[0];
@@ -3272,7 +3328,20 @@ fallback:
 	} while (n);
 
 	gen6_vertex_flush(sna);
+	sna_render_composite_redirect_done(sna, &tmp);
+	kgem_bo_destroy(&sna->kgem, tmp.src.bo);
 	return TRUE;
+
+fallback_tiled_src:
+	kgem_bo_destroy(&sna->kgem, tmp.src.bo);
+fallback_tiled_dst:
+	if (tmp.redirect.real_bo)
+		kgem_bo_destroy(&sna->kgem, tmp.dst.bo);
+fallback_tiled:
+	return sna_tiling_copy_boxes(sna, alu,
+				     src, src_bo, src_dx, src_dy,
+				     dst, dst_bo, dst_dx, dst_dy,
+				     box, n);
 }
 
 static void
@@ -3329,7 +3398,7 @@ gen6_render_copy(struct sna *sna, uint8_t alu,
 	     src->drawable.width, src->drawable.height,
 	     dst->drawable.width, dst->drawable.height));
 
-	if (prefer_blt_copy(sna, src_bo, dst_bo) &&
+	if (prefer_blt_copy(sna, src, src_bo, dst, dst_bo) &&
 	    sna_blt_compare_depth(&src->drawable, &dst->drawable) &&
 	    sna_blt_copy(sna, alu,
 			 src_bo, dst_bo,
diff --git a/src/sna/gen7_render.c b/src/sna/gen7_render.c
index e2486c6..5385a47 100644
--- a/src/sna/gen7_render.c
+++ b/src/sna/gen7_render.c
@@ -235,6 +235,11 @@ static const struct formatinfo {
 #define OUT_VERTEX(x,y) vertex_emit_2s(sna, x,y)
 #define OUT_VERTEX_F(v) vertex_emit(sna, v)
 
+static inline bool too_large(int width, int height)
+{
+	return width > GEN7_MAX_SIZE || height > GEN7_MAX_SIZE;
+}
+
 static uint32_t gen7_get_blend(int op,
 			       Bool has_component_alpha,
 			       uint32_t dst_format)
@@ -817,6 +822,9 @@ gen7_emit_drawing_rectangle(struct sna *sna,
 	uint32_t limit = (op->dst.height - 1) << 16 | (op->dst.width - 1);
 	uint32_t offset = (uint16_t)op->dst.y << 16 | (uint16_t)op->dst.x;
 
+	assert(!too_large(op->dst.x, op->dst.y));
+	assert(!too_large(op->dst.width, op->dst.height));
+
 	if (sna->render_state.gen7.drawrect_limit == limit &&
 	    sna->render_state.gen7.drawrect_offset == offset)
 		return;
@@ -2124,11 +2132,6 @@ gen7_composite_solid_init(struct sna *sna,
 	return channel->bo != NULL;
 }
 
-static inline bool too_large(int width, int height)
-{
-	return width > GEN7_MAX_SIZE || height > GEN7_MAX_SIZE;
-}
-
 static int
 gen7_composite_picture(struct sna *sna,
 		       PicturePtr picture,
@@ -3130,13 +3133,22 @@ static inline bool untiled_tlb_miss(struct kgem_bo *bo)
 	return bo->tiling == I915_TILING_NONE && bo->pitch >= 4096;
 }
 
+static bool prefer_blt_bo(struct sna *sna,
+			  PixmapPtr pixmap,
+			  struct kgem_bo *bo)
+{
+	return (too_large(pixmap->drawable.width, pixmap->drawable.height) ||
+		untiled_tlb_miss(bo)) &&
+		kgem_bo_can_blt(&sna->kgem, bo);
+}
+
 static inline bool prefer_blt_copy(struct sna *sna,
-				   struct kgem_bo *src_bo,
-				   struct kgem_bo *dst_bo)
+				   PixmapPtr src, struct kgem_bo *src_bo,
+				   PixmapPtr dst, struct kgem_bo *dst_bo)
 {
 	return (sna->kgem.ring != KGEM_RENDER ||
-		untiled_tlb_miss(src_bo) ||
-		untiled_tlb_miss(dst_bo));
+		prefer_blt_bo(sna, src, src_bo) ||
+		prefer_blt_bo(sna, dst, dst_bo));
 }
 
 static inline bool
@@ -3196,7 +3208,7 @@ gen7_render_copy_boxes(struct sna *sna, uint8_t alu,
 		      dst_bo, dst_dx, dst_dy,
 		      box, n)));
 
-	if (prefer_blt_copy(sna, src_bo, dst_bo) &&
+	if (prefer_blt_copy(sna, src, src_bo, dst, dst_bo) &&
 	    sna_blt_compare_depth(&src->drawable, &dst->drawable) &&
 	    sna_blt_copy_boxes(sna, alu,
 			       src_bo, src_dx, src_dy,
@@ -3208,17 +3220,15 @@ gen7_render_copy_boxes(struct sna *sna, uint8_t alu,
 	if (!(alu == GXcopy || alu == GXclear) ||
 	    overlaps(src_bo, src_dx, src_dy,
 		     dst_bo, dst_dx, dst_dy,
-		     box, n) ||
-	    too_large(src->drawable.width, src->drawable.height) ||
-	    too_large(dst->drawable.width, dst->drawable.height)) {
-fallback:
+		     box, n)) {
+fallback_blt:
 		if (!sna_blt_compare_depth(&src->drawable, &dst->drawable))
-			return FALSE;
+			return false;
 
 		return sna_blt_copy_boxes_fallback(sna, alu,
 						   src, src_bo, src_dx, src_dy,
 						   dst, dst_bo, dst_dx, dst_dy,
 						   box, n);
 	}
 
 	if (dst->drawable.depth == src->drawable.depth) {
@@ -3229,25 +3239,73 @@ fallback:
 		tmp.src.pict_format = sna_format_for_depth(src->drawable.depth);
 	}
 	if (!gen7_check_format(tmp.src.pict_format))
-		goto fallback;
+		goto fallback_blt;
 
 	tmp.op = alu == GXcopy ? PictOpSrc : PictOpClear;
 
 	tmp.dst.pixmap = dst;
-	tmp.dst.x = tmp.dst.y = 0;
 	tmp.dst.width  = dst->drawable.width;
 	tmp.dst.height = dst->drawable.height;
 	tmp.dst.bo = dst_bo;
-	tmp.dst.x = dst_dx;
-	tmp.dst.y = dst_dy;
+	tmp.dst.x = tmp.dst.y = 0;
+
+	sna_render_composite_redirect_init(&tmp);
+	if (too_large(tmp.dst.width, tmp.dst.height)) {
+		BoxRec extents = box[0];
+		int i;
+
+		for (i = 1; i < n; i++) {
+			if (extents.x1 > box[i].x1)
+				extents.x1 = box[i].x1;
+			if (extents.y1 > box[i].y1)
+				extents.y1 = box[i].y1;
+
+			if (extents.x2 < box[i].x2)
+				extents.x2 = box[i].x2;
+			if (extents.y2 < box[i].y2)
+				extents.y2 = box[i].y2;
+		}
+		if (!sna_render_composite_redirect(sna, &tmp,
+						   extents.x1, extents.y1,
+						   extents.x2 - extents.x1,
+						   extents.y2 - extents.y1))
+			goto fallback_tiled;
+	}
 
-	tmp.src.bo = src_bo;
 	tmp.src.filter = SAMPLER_FILTER_NEAREST;
 	tmp.src.repeat = SAMPLER_EXTEND_NONE;
 	tmp.src.card_format =
 		gen7_get_card_format(tmp.src.pict_format);
-	tmp.src.width  = src->drawable.width;
-	tmp.src.height = src->drawable.height;
+	if (too_large(src->drawable.width, src->drawable.height)) {
+		BoxRec extents = box[0];
+		int i;
+
+		for (i = 1; i < n; i++) {
+			if (extents.x1 > box[i].x1)
+				extents.x1 = box[i].x1;
+			if (extents.y1 > box[i].y1)
+				extents.y1 = box[i].y1;
+
+			if (extents.x2 < box[i].x2)
+				extents.x2 = box[i].x2;
+			if (extents.y2 < box[i].y2)
+				extents.y2 = box[i].y2;
+		}
+
+		if (!sna_render_pixmap_partial(sna, src, src_bo, &tmp.src,
+					       extents.x1 + src_dx,
+					       extents.y1 + src_dy,
+					       extents.x2 - extents.x1,
+					       extents.y2 - extents.y1))
+			goto fallback_tiled_dst;
+	} else {
+		tmp.src.bo = kgem_bo_reference(src_bo);
+		tmp.src.width  = src->drawable.width;
+		tmp.src.height = src->drawable.height;
+		tmp.src.offset[0] = tmp.src.offset[1] = 0;
+		tmp.src.scale[0] = 1.f/src->drawable.width;
+		tmp.src.scale[1] = 1.f/src->drawable.height;
+	}
 
 	tmp.mask.bo = NULL;
 	tmp.mask.filter = SAMPLER_FILTER_NEAREST;
@@ -3259,7 +3317,7 @@ fallback:
 	tmp.has_component_alpha = 0;
 	tmp.need_magic_ca_pass = 0;
 
 	tmp.u.gen7.wm_kernel = GEN7_WM_KERNEL_NOMASK;
 	tmp.u.gen7.nr_surfaces = 2;
 	tmp.u.gen7.nr_inputs = 1;
 	tmp.u.gen7.ve_id = 1;
@@ -3268,10 +3326,17 @@ fallback:
 	if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL)) {
 		kgem_submit(&sna->kgem);
 		if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL))
-			goto fallback;
+			goto fallback_tiled_src;
 		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
 	}
 
+	dst_dx += tmp.dst.x;
+	dst_dy += tmp.dst.y;
+	tmp.dst.x = tmp.dst.y = 0;
+
+	src_dx += tmp.src.offset[0];
+	src_dy += tmp.src.offset[1];
+
 	gen7_emit_copy_state(sna, &tmp);
 	gen7_align_vertex(sna, &tmp);
 
@@ -3295,9 +3360,9 @@ fallback:
 			     box->x1 + src_dx, box->y1 + src_dy,
 			     box->x1 + dst_dx, box->y1 + dst_dy,
 			     box->x2 - box->x1, box->y2 - box->y1));
-			v[0] = pack_2s(box->x2, box->y2);
-			v[3] = pack_2s(box->x1, box->y2);
-			v[6] = pack_2s(box->x1, box->y1);
+			v[0] = pack_2s(box->x2 + dst_dx, box->y2 + dst_dy);
+			v[3] = pack_2s(box->x1 + dst_dx, box->y2 + dst_dy);
+			v[6] = pack_2s(box->x1 + dst_dx, box->y1 + dst_dy);
 
 			v[1] = (box->x2 + src_dx) * tmp.src.scale[0];
 			v[7] = v[4] = (box->x1 + src_dx) * tmp.src.scale[0];
@@ -3311,7 +3376,20 @@ fallback:
 	} while (n);
 
 	gen7_vertex_flush(sna);
+	sna_render_composite_redirect_done(sna, &tmp);
+	kgem_bo_destroy(&sna->kgem, tmp.src.bo);
 	return TRUE;
+
+fallback_tiled_src:
+	kgem_bo_destroy(&sna->kgem, tmp.src.bo);
+fallback_tiled_dst:
+	if (tmp.redirect.real_bo)
+		kgem_bo_destroy(&sna->kgem, tmp.dst.bo);
+fallback_tiled:
+	return sna_tiling_copy_boxes(sna, alu,
+				     src, src_bo, src_dx, src_dy,
+				     dst, dst_bo, dst_dx, dst_dy,
+				     box, n);
 }
 
 static void
@@ -3368,7 +3446,7 @@ gen7_render_copy(struct sna *sna, uint8_t alu,
 	     src->drawable.width, src->drawable.height,
 	     dst->drawable.width, dst->drawable.height));
 
-	if (prefer_blt_copy(sna, src_bo, dst_bo) &&
+	if (prefer_blt_copy(sna, src, src_bo, dst, dst_bo) &&
 	    sna_blt_compare_depth(&src->drawable, &dst->drawable) &&
 	    sna_blt_copy(sna, alu,
 			 src_bo, dst_bo,
diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index d2580e6..757bad1 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -678,9 +678,9 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, int gen)
 		 * disable dual-stream mode */
 		kgem->min_alignment = 64;
 
-	kgem->max_object_size = kgem->aperture_total / 2;
-	kgem->max_cpu_size = kgem->aperture_total / 2;
-	kgem->max_gpu_size = kgem->aperture_total / 2;
+	kgem->max_object_size = 2 * kgem->aperture_total / 3;
+	kgem->max_cpu_size = kgem->max_object_size;
+	kgem->max_gpu_size = kgem->max_object_size;
 	if (!kgem->has_llc)
 		kgem->max_gpu_size = MAX_CACHE_SIZE;
 	if (gen < 40) {
diff --git a/src/sna/kgem.h b/src/sna/kgem.h
index 974a716..b6930e0 100644
--- a/src/sna/kgem.h
+++ b/src/sna/kgem.h
@@ -379,18 +379,10 @@ static inline int kgem_buffer_size(struct kgem_bo *bo)
 	return bo->size.bytes;
 }
 
-static inline bool kgem_bo_can_blt(struct kgem *kgem,
-				   struct kgem_bo *bo)
+static inline bool kgem_bo_blt_pitch_is_ok(struct kgem *kgem,
+					   struct kgem_bo *bo)
 {
-	int pitch;
-
-	if (bo->tiling == I915_TILING_Y) {
-		DBG(("%s: can not blt to handle=%d, tiling=Y\n",
-		     __FUNCTION__, bo->handle));
-		return false;
-	}
-
-	pitch = bo->pitch;
+	int pitch = bo->pitch;
 	if (kgem->gen >= 40 && bo->tiling)
 		pitch /= 4;
 	if (pitch > MAXSHORT) {
@@ -402,6 +394,18 @@ static inline bool kgem_bo_can_blt(struct kgem *kgem,
 	return true;
 }
 
+static inline bool kgem_bo_can_blt(struct kgem *kgem,
+				   struct kgem_bo *bo)
+{
+	if (bo->tiling == I915_TILING_Y) {
+		DBG(("%s: can not blt to handle=%d, tiling=Y\n",
+		     __FUNCTION__, bo->handle));
+		return false;
+	}
+
+	return kgem_bo_blt_pitch_is_ok(kgem, bo);
+}
+
 static inline bool kgem_bo_is_mappable(struct kgem *kgem,
 				       struct kgem_bo *bo)
 {
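
The refactor above splits the two reasons a bo may be unusable by the
blitter: Y-tiling, and a pitch that overflows the BLT command's 16-bit
field. A sketch of how a caller can compose the two predicates (the helper
name is hypothetical; the real consumer is sna_blt_copy_boxes_fallback
below):

static bool can_blt_after_retiling(struct kgem *kgem, struct kgem_bo *bo)
{
	/* Not Y-tiled and the pitch fits: the BLT can use it as-is. */
	if (kgem_bo_can_blt(kgem, bo))
		return true;

	/* A Y-tiled bo whose pitch is otherwise acceptable can first be
	 * converted to X-tiling and then blitted. */
	return bo->tiling == I915_TILING_Y &&
	       kgem_bo_blt_pitch_is_ok(kgem, bo);
}
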
diff --git a/src/sna/sna_blt.c b/src/sna/sna_blt.c
index 9f51028..a7ea95c 100644
--- a/src/sna/sna_blt.c
+++ b/src/sna/sna_blt.c
@@ -2140,10 +2140,10 @@ Bool sna_blt_copy_boxes(struct sna *sna, uint8_t alu,
 	    !kgem_check_bo_fenced(kgem, dst_bo, src_bo, NULL)) {
 		_kgem_submit(kgem);
 		if (!kgem_check_bo_fenced(kgem, dst_bo, src_bo, NULL))
-			return sna_tiling_copy_boxes(sna, alu,
-						     src_bo, src_dx, src_dy,
-						     dst_bo, dst_dx, dst_dy,
-						     bpp, box, nbox);
+			return sna_tiling_blt_copy_boxes(sna, alu,
+							 src_bo, src_dx, src_dy,
+							 dst_bo, dst_dx, dst_dy,
+							 bpp, box, nbox);
 		_kgem_set_mode(kgem, KGEM_BLT);
 	}
 
@@ -2244,7 +2244,8 @@ Bool sna_blt_copy_boxes_fallback(struct sna *sna, uint8_t alu,
 	if (src_bo == dst_bo) {
 		DBG(("%s: dst == src\n", __FUNCTION__));
 
-		if (src_bo->tiling == I915_TILING_Y) {
+		if (src_bo->tiling == I915_TILING_Y &&
+		    kgem_bo_blt_pitch_is_ok(&sna->kgem, src_bo)) {
 			struct kgem_bo *bo;
 
 			DBG(("%s: src is Y-tiled\n", __FUNCTION__));
@@ -2287,7 +2288,8 @@ Bool sna_blt_copy_boxes_fallback(struct sna *sna, uint8_t alu,
 				dst_bo = src_bo = bo;
 		}
 	} else {
-		if (src_bo->tiling == I915_TILING_Y) {
+		if (src_bo->tiling == I915_TILING_Y &&
+		    kgem_bo_blt_pitch_is_ok(&sna->kgem, src_bo)) {
 			DBG(("%s: src is y-tiled\n", __FUNCTION__));
 			assert(src_bo == sna_pixmap(src)->gpu_bo);
 			src_bo = sna_pixmap_change_tiling(src, I915_TILING_X);
@@ -2298,7 +2300,8 @@ Bool sna_blt_copy_boxes_fallback(struct sna *sna, uint8_t alu,
 			}
 		}
 
-		if (dst_bo->tiling == I915_TILING_Y) {
+		if (dst_bo->tiling == I915_TILING_Y &&
+		    kgem_bo_blt_pitch_is_ok(&sna->kgem, dst_bo)) {
 			DBG(("%s: dst is y-tiled\n", __FUNCTION__));
 			assert(dst_bo == sna_pixmap(dst)->gpu_bo);
 			dst_bo = sna_pixmap_change_tiling(dst, I915_TILING_X);
diff --git a/src/sna/sna_io.c b/src/sna/sna_io.c
index eb5df9d..62a8962 100644
--- a/src/sna/sna_io.c
+++ b/src/sna/sna_io.c
@@ -593,7 +593,7 @@ fallback:
 			int step;
 
 tile:
-			step = MIN(sna->render.max_3d_size,
+			step = MIN(sna->render.max_3d_size - 4096 / dst->drawable.bitsPerPixel,
 				   8*(MAXSHORT&~63) / dst->drawable.bitsPerPixel);
 			while (step * step * 4 > sna->kgem.max_upload_tile_size)
 				step /= 2;
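
To put numbers on the step expression above, a standalone evaluation for a
32bpp destination (the 16MiB cap stands in for max_upload_tile_size, which
kgem really derives from the aperture):

#include <stdio.h>

#define MAXSHORT 32767
#define MIN(a,b) ((a) <= (b) ? (a) : (b))

int main(void)
{
	int bpp = 32, max_3d_size = 8192;
	long max_upload_tile_size = 16 * 1024 * 1024;	/* assumed value */
	int step = MIN(max_3d_size - 4096 / bpp,	/* 8192 - 128 = 8064 */
		       8*(MAXSHORT&~63) / bpp);		/* 8176 */

	while ((long)step * step * 4 > max_upload_tile_size)
		step /= 2;	/* 8064 -> 4032 -> 2016 */

	printf("step = %d\n", step);	/* 2016 with the assumed cap */
	return 0;
}

The small subtraction keeps each tile comfortably under the 3D limit once
alignment padding is added on top.
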
diff --git a/src/sna/sna_render.c b/src/sna/sna_render.c
index fc6e6df..bc8b2de 100644
--- a/src/sna/sna_render.c
+++ b/src/sna/sna_render.c
@@ -805,6 +805,80 @@ cleanup_tmp:
 	return ret;
 }
 
+bool
+sna_render_pixmap_partial(struct sna *sna,
+			  PixmapPtr pixmap,
+			  struct kgem_bo *bo,
+			  struct sna_composite_channel *channel,
+			  int16_t x, int16_t y,
+			  int16_t w, int16_t h)
+{
+	BoxRec box;
+	int tile_width, tile_height, tile_size;
+	int offset;
+
+	DBG(("%s (%d, %d)x(%d, %d)\n", __FUNCTION__, x, y, w, h));
+
+	if (bo->pitch > sna->render.max_3d_pitch)
+		return false;
+
+	box.x1 = x;
+	box.y1 = y;
+	box.x2 = x + w;
+	box.y2 = y + h;
+
+	if (box.x1 < 0)
+		box.x1 = 0;
+	if (box.y1 < 0)
+		box.y1 = 0;
+	if (box.x2 > pixmap->drawable.width)
+		box.x2 = pixmap->drawable.width;
+	if (box.y2 > pixmap->drawable.height)
+		box.y2 = pixmap->drawable.height;
+
+	kgem_get_tile_size(&sna->kgem, bo->tiling,
+			   &tile_width, &tile_height, &tile_size);
+
+	/* Ensure we align to an even tile row */
+	box.y1 = box.y1 & ~(2*tile_height - 1);
+	box.y2 = ALIGN(box.y2, 2*tile_height);
+	if (box.y2 > pixmap->drawable.height)
+		box.y2 = pixmap->drawable.height;
+
+	box.x1 = box.x1 & ~(tile_width * 8 / pixmap->drawable.bitsPerPixel - 1);
+	box.x2 = ALIGN(box.x2, tile_width * 8 / pixmap->drawable.bitsPerPixel);
+	if (box.x2 > pixmap->drawable.width)
+		box.x2 = pixmap->drawable.width;
+
+	w = box.x2 - box.x1;
+	h = box.y2 - box.y1;
+	DBG(("%s box=(%d, %d), (%d, %d): (%d, %d)/(%d, %d)\n", __FUNCTION__,
+	     box.x1, box.y1, box.x2, box.y2, w, h,
+	     pixmap->drawable.width, pixmap->drawable.height));
+	if (w <= 0 || h <= 0 ||
+	    w > sna->render.max_3d_size ||
+	    h > sna->render.max_3d_size)
+		return false;
+
+	/* How many tiles across are we? */
+	offset = box.x1 * pixmap->drawable.bitsPerPixel / 8 / tile_width * tile_size;
+	channel->bo = kgem_create_proxy(bo,
+					box.y1 * bo->pitch + offset,
+					h * bo->pitch);
+	if (channel->bo == NULL)
+		return false;
+
+	channel->bo->pitch = bo->pitch;
+
+	channel->offset[0] = x - box.x1;
+	channel->offset[1] = y - box.y1;
+	channel->scale[0] = 1.f/w;
+	channel->scale[1] = 1.f/h;
+	channel->width  = w;
+	channel->height = h;
+	return true;
+}
+
 static int
 sna_render_picture_partial(struct sna *sna,
 			   PicturePtr picture,
@@ -1068,13 +1142,25 @@ sna_render_picture_extract(struct sna *sna,
 						       I915_TILING_X, w, h,
 						       pixmap->drawable.bitsPerPixel),
 				    0);
-		if (bo && !sna_blt_copy_boxes(sna, GXcopy,
-					src_bo, 0, 0,
-					bo, -box.x1, -box.y1,
-					pixmap->drawable.bitsPerPixel,
-					&box, 1)) {
-			kgem_bo_destroy(&sna->kgem, bo);
-			bo = NULL;
+		if (bo) {
+			PixmapRec tmp;
+
+			tmp.drawable.width  = w;
+			tmp.drawable.height = h;
+			tmp.drawable.depth  = pixmap->drawable.depth;
+			tmp.drawable.bitsPerPixel = pixmap->drawable.bitsPerPixel;
+			tmp.devPrivate.ptr = NULL;
+
+			assert(tmp.drawable.width);
+			assert(tmp.drawable.height);
+
+			if (!sna->render.copy_boxes(sna, GXcopy,
+						    pixmap, src_bo, 0, 0,
+						    &tmp, bo, -box.x1, -box.y1,
+						    &box, 1)) {
+				kgem_bo_destroy(&sna->kgem, bo);
+				bo = NULL;
+			}
 		}
 	}
 
@@ -1541,7 +1627,6 @@ sna_render_composite_redirect(struct sna *sna,
 {
 	struct sna_composite_redirect *t = &op->redirect;
 	int bpp = op->dst.pixmap->drawable.bitsPerPixel;
-	struct sna_pixmap *priv;
 	struct kgem_bo *bo;
 
 #if NO_REDIRECT
@@ -1554,11 +1639,9 @@ sna_render_composite_redirect(struct sna *sna,
 	if (!width || !height)
 		return FALSE;
 
-	priv = sna_pixmap_force_to_gpu(op->dst.pixmap, MOVE_READ | MOVE_WRITE);
-	if (priv == NULL) {
-		DBG(("%s: fallback -- no GPU bo attached\n", __FUNCTION__));
+	if (width  > sna->render.max_3d_size ||
+	    height > sna->render.max_3d_size)
 		return FALSE;
-	}
 
 	if (op->dst.bo->pitch <= sna->render.max_3d_pitch) {
 		int tile_width, tile_height, tile_size;
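
To make the proxy placement in sna_render_pixmap_partial concrete, the same
arithmetic as a standalone computation (the X-tiling geometry of 512-byte by
8-row, 4KiB tiles, the 8192-byte pitch and the box coordinates are assumed
example values):

#include <stdio.h>

int main(void)
{
	int tile_width = 512, tile_size = 4096;	/* one X tile: 512B x 8 rows */
	int bpp = 32, pitch = 8192;
	int x1 = 256, y1 = 32;	/* already aligned: 128-pixel, 16-row units */

	/* "How many tiles across are we?" */
	int offset = x1 * bpp / 8 / tile_width * tile_size;

	/* 32*8192 + 2*4096 = 270336 bytes: the proxy window starts on an
	 * exact tile boundary while sharing the parent bo's pitch. */
	printf("proxy offset = %d bytes\n", y1 * pitch + offset);
	return 0;
}
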
diff --git a/src/sna/sna_render.h b/src/sna/sna_render.h
index 94c2744..a689315 100644
--- a/src/sna/sna_render.h
+++ b/src/sna/sna_render.h
@@ -507,10 +507,16 @@ Bool sna_tiling_fill_boxes(struct sna *sna,
 			   const xRenderColor *color,
 			   PixmapPtr dst, struct kgem_bo *dst_bo,
 			   const BoxRec *box, int n);
+
 Bool sna_tiling_copy_boxes(struct sna *sna, uint8_t alu,
-			   struct kgem_bo *src_bo, int16_t src_dx, int16_t src_dy,
-			   struct kgem_bo *dst_bo, int16_t dst_dx, int16_t dst_dy,
-			   int bpp, const BoxRec *box, int nbox);
+			   PixmapPtr src, struct kgem_bo *src_bo, int16_t src_dx, int16_t src_dy,
+			   PixmapPtr dst, struct kgem_bo *dst_bo, int16_t dst_dx, int16_t dst_dy,
+			   const BoxRec *box, int n);
+
+Bool sna_tiling_blt_copy_boxes(struct sna *sna, uint8_t alu,
+			       struct kgem_bo *src_bo, int16_t src_dx, int16_t src_dy,
+			       struct kgem_bo *dst_bo, int16_t dst_dx, int16_t dst_dy,
+			       int bpp, const BoxRec *box, int nbox);
 
 Bool sna_blt_composite(struct sna *sna,
 		       uint32_t op,
@@ -589,6 +595,14 @@ sna_render_pixmap_bo(struct sna *sna,
 		     int16_t w, int16_t h,
 		     int16_t dst_x, int16_t dst_y);
 
+bool
+sna_render_pixmap_partial(struct sna *sna,
+			  PixmapPtr pixmap,
+			  struct kgem_bo *bo,
+			  struct sna_composite_channel *channel,
+			  int16_t x, int16_t y,
+			  int16_t w, int16_t h);
+
 int
 sna_render_picture_extract(struct sna *sna,
 			   PicturePtr picture,
@@ -614,6 +628,13 @@ sna_render_picture_convert(struct sna *sna,
 			   int16_t w, int16_t h,
 			   int16_t dst_x, int16_t dst_y);
 
+inline static void sna_render_composite_redirect_init(struct sna_composite_op *op)
+{
+	struct sna_composite_redirect *t = &op->redirect;
+	t->real_bo = NULL;
+	t->damage = NULL;
+}
+
 Bool
 sna_render_composite_redirect(struct sna *sna,
 			      struct sna_composite_op *op,
diff --git a/src/sna/sna_tiling.c b/src/sna/sna_tiling.c
index 702192a..00e111c 100644
--- a/src/sna/sna_tiling.c
+++ b/src/sna/sna_tiling.c
@@ -421,10 +421,10 @@ done:
 	return ret;
 }
 
-Bool sna_tiling_copy_boxes(struct sna *sna, uint8_t alu,
-			   struct kgem_bo *src_bo, int16_t src_dx, int16_t src_dy,
-			   struct kgem_bo *dst_bo, int16_t dst_dx, int16_t dst_dy,
-			   int bpp, const BoxRec *box, int nbox)
+Bool sna_tiling_blt_copy_boxes(struct sna *sna, uint8_t alu,
+			       struct kgem_bo *src_bo, int16_t src_dx, int16_t src_dy,
+			       struct kgem_bo *dst_bo, int16_t dst_dx, int16_t dst_dy,
+			       int bpp, const BoxRec *box, int nbox)
 {
 	RegionRec region, tile, this;
 	struct kgem_bo *bo;
@@ -516,3 +516,125 @@ done:
 	pixman_region_fini(&region);
 	return ret;
 }
+
+static Bool
+box_intersect(BoxPtr a, const BoxRec *b)
+{
+	if (a->x1 < b->x1)
+		a->x1 = b->x1;
+	if (a->x2 > b->x2)
+		a->x2 = b->x2;
+	if (a->y1 < b->y1)
+		a->y1 = b->y1;
+	if (a->y2 > b->y2)
+		a->y2 = b->y2;
+
+	return a->x1 < a->x2 && a->y1 < a->y2;
+}
+
+Bool
+sna_tiling_copy_boxes(struct sna *sna, uint8_t alu,
+		      PixmapPtr src, struct kgem_bo *src_bo, int16_t src_dx, int16_t src_dy,
+		      PixmapPtr dst, struct kgem_bo *dst_bo, int16_t dst_dx, int16_t dst_dy,
+		      const BoxRec *box, int n)
+{
+	BoxRec extents, tile, stack[64], *clipped, *c;
+	PixmapRec p;
+	int i, step;
+	Bool ret = FALSE;
+
+	extents = box[0];
+	for (i = 1; i < n; i++) {
+		if (extents.x1 > box[i].x1)
+			extents.x1 = box[i].x1;
+		if (extents.y1 > box[i].y1)
+			extents.y1 = box[i].y1;
+
+		if (extents.x2 < box[i].x2)
+			extents.x2 = box[i].x2;
+		if (extents.y2 < box[i].y2)
+			extents.y2 = box[i].y2;
+	}
+
+	step = sna->render.max_3d_size - 4096 / dst->drawable.bitsPerPixel;
+	while (step * step * 4 > sna->kgem.max_upload_tile_size)
+		step /= 2;
+
+	DBG(("%s: tiling copy, using %dx%d tiles\n",
+	     __FUNCTION__, step, step));
+
+	if (n > ARRAY_SIZE(stack)) {
+		clipped = malloc(sizeof(BoxRec) * n);
+		if (clipped == NULL)
+			goto tiled_error;
+	} else
+		clipped = stack;
+
+	p.drawable.depth = src->drawable.depth;
+	p.drawable.bitsPerPixel = src->drawable.bitsPerPixel;
+	p.devPrivate.ptr = NULL;
+
+	for (tile.y1 = extents.y1; tile.y1 < extents.y2; tile.y1 = tile.y2) {
+		tile.y2 = tile.y1 + step;
+		if (tile.y2 > extents.y2)
+			tile.y2 = extents.y2;
+
+		for (tile.x1 = extents.x1; tile.x1 < extents.x2; tile.x1 = tile.x2) {
+			struct kgem_bo *tmp_bo;
+
+			tile.x2 = tile.x1 + step;
+			if (tile.x2 > extents.x2)
+				tile.x2 = extents.x2;
+
+			p.drawable.width  = tile.x2 - tile.x1;
+			p.drawable.height = tile.y2 - tile.y1;
+
+			tmp_bo = kgem_create_2d(&sna->kgem,
+						p.drawable.width,
+						p.drawable.height,
+						p.drawable.bitsPerPixel,
+						I915_TILING_X, 0);
+			if (!tmp_bo)
+				goto tiled_error;
+
+			c = clipped;
+			for (i = 0; i < n; i++) {
+				*c = box[i];
+				if (!box_intersect(c, &tile))
+					continue;
+
+				DBG(("%s: box(%d, %d), (%d, %d), src=(%d, %d), dst=(%d, %d)\n",
+				     __FUNCTION__,
+				     c->x1, c->y1,
+				     c->x2, c->y2,
+				     src_dx, src_dy,
+				     c->x1 - tile.x1,
+				     c->y1 - tile.y1));
+				c++;
+			}
+
+			if (c == clipped ||
+			    (sna->render.copy_boxes(sna, GXcopy,
+						    src, src_bo, src_dx, src_dy,
+						    &p, tmp_bo, -tile.x1, -tile.y1,
+						    clipped, c - clipped) &&
+			     sna->render.copy_boxes(sna, alu,
+						    &p, tmp_bo, -tile.x1, -tile.y1,
+						    dst, dst_bo, dst_dx, dst_dy,
+						    clipped, c - clipped)))
+				i = 1;
+			else
+				i = 0;
+
+			kgem_bo_destroy(&sna->kgem, tmp_bo);
+
+			if (!i)
+				goto tiled_error;
+		}
+	}
+
+	ret = TRUE;
+tiled_error:
+	if (clipped != stack)
+		free(clipped);
+
+	return ret;
+}
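
The clipping step inside the tile loop above reduces to the following
standalone demonstration (tile and box values invented for illustration;
box_intersect is copied from this commit):

#include <stdio.h>

typedef struct { int x1, y1, x2, y2; } BoxRec;

static int box_intersect(BoxRec *a, const BoxRec *b)
{
	if (a->x1 < b->x1) a->x1 = b->x1;
	if (a->x2 > b->x2) a->x2 = b->x2;
	if (a->y1 < b->y1) a->y1 = b->y1;
	if (a->y2 > b->y2) a->y2 = b->y2;
	return a->x1 < a->x2 && a->y1 < a->y2;
}

int main(void)
{
	BoxRec tile = { 0, 0, 2016, 2016 };
	BoxRec box[] = { { 100, 100, 300, 300 }, { 1900, 0, 2300, 50 } };
	BoxRec clipped[2], *c = clipped;
	unsigned i;

	for (i = 0; i < sizeof(box)/sizeof(box[0]); i++) {
		*c = box[i];
		if (!box_intersect(c, &tile))
			continue;	/* box misses this tile entirely */
		printf("kept (%d,%d)-(%d,%d)\n", c->x1, c->y1, c->x2, c->y2);
		c++;
	}
	/* first box kept whole, second clipped to (1900,0)-(2016,50) */
	return 0;
}
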
commit 876eec027a54b413d1d65c4241967ef5f3d801de
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Feb 2 15:23:03 2012 +0000

    sna: Add a couple of sanity checks that the CPU drawable is on the CPU
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 71d3f06..1d2b999 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -1070,7 +1070,8 @@ done:
 		DBG(("%s: syncing CPU bo\n", __FUNCTION__));
 		kgem_bo_sync__cpu(&sna->kgem, priv->cpu_bo);
 	}
-
+	assert(pixmap->devPrivate.ptr);
+	assert(pixmap->devKind);
 	return true;
 }
 
@@ -1534,6 +1535,8 @@ out:
 		DBG(("%s: syncing cpu bo\n", __FUNCTION__));
 		kgem_bo_sync__cpu(&sna->kgem, priv->cpu_bo);
 	}
+	assert(pixmap->devPrivate.ptr);
+	assert(pixmap->devKind);
 	return true;
 }
 
commit cde41fdfad241fba5bc12fcd8a0373bf2a4ac28a
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Feb 1 19:10:41 2012 +0000

    sna/gen6: Ring switching outweighs the benefits for cairo-traces
    
    At the moment, the jury is still out on whether freely switching rings
    for fills is a Good Idea. So make it easier to turn it on and off for
    testing.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/gen6_render.c b/src/sna/gen6_render.c
index ec5412a..9f799ef 100644
--- a/src/sna/gen6_render.c
+++ b/src/sna/gen6_render.c
@@ -57,6 +57,8 @@
 #define NO_FILL_BOXES 0
 #define NO_CLEAR 0
 
+#define NO_RING_SWITCH 1
+
 #define GEN6_MAX_SIZE 8192
 
 static const uint32_t ps_kernel_nomask_affine[][4] = {
@@ -2215,6 +2217,11 @@ static bool prefer_blt_ring(struct sna *sna)
 	return sna->kgem.ring != KGEM_RENDER;
 }
 
+static bool can_switch_rings(struct sna *sna)
+{
+	return sna->kgem.has_semaphores && !NO_RING_SWITCH;
+}
+
 static bool
 is_solid(PicturePtr picture)
 {
@@ -2252,7 +2259,7 @@ try_blt(struct sna *sna,
 		return TRUE;
 	}
 
-	if (sna->kgem.has_semaphores) {
+	if (can_switch_rings(sna)) {
 		if (is_solid(src))
 			return TRUE;
 	}
@@ -3432,7 +3439,9 @@ gen6_emit_fill_state(struct sna *sna, const struct sna_composite_op *op)
 static inline bool prefer_blt_fill(struct sna *sna,
 				   struct kgem_bo *bo)
 {
-	return sna->kgem.has_semaphores || prefer_blt_ring(sna) || untiled_tlb_miss(bo);
+	return (can_switch_rings(sna) ||
+		prefer_blt_ring(sna) ||
+		untiled_tlb_miss(bo));
 }
 
 static Bool
commit 7ee4928c519000d738806c05704ae4d7b2c58e52
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Feb 1 14:52:56 2012 +0000

    sna: Search again for a just-large-enough mapping for inplace uploads
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
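
The retry keys on the exact page count rather than the bucketed allocation
size; NUM_PAGES is the rounding macro this commit introduces. A few worked
values (4096-byte pages):

#include <stdio.h>

#define PAGE_SIZE 4096
#define NUM_PAGES(x) (((x) + PAGE_SIZE-1) / PAGE_SIZE)

int main(void)
{
	printf("%d\n", NUM_PAGES(5000));	/* 2 */
	printf("%d\n", NUM_PAGES(8192));	/* 2 */
	printf("%d\n", NUM_PAGES(8193));	/* 3 */
	return 0;
}

The first pass searches with the bucket-rounded size; when that fails,
retrying with the exact page count lets a just-large-enough inactive buffer
be reused instead of allocating afresh.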

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index b446558..d2580e6 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -104,6 +104,8 @@ static inline void list_replace(struct list *old,
 #endif
 
 #define PAGE_ALIGN(x) ALIGN(x, PAGE_SIZE)
+#define NUM_PAGES(x) (((x) + PAGE_SIZE-1) / PAGE_SIZE)
+
 #define MAX_GTT_VMA_CACHE 512
 #define MAX_CPU_VMA_CACHE INT16_MAX
 #define MAP_PRESERVE_TIME 10
@@ -3215,6 +3217,8 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 			old = search_linear_cache(kgem, alloc, CREATE_CPU_MAP);
 		if (old == NULL)
 			old = search_linear_cache(kgem, alloc, CREATE_INACTIVE | CREATE_CPU_MAP);
+		if (old == NULL)
+			old = search_linear_cache(kgem, NUM_PAGES(size), CREATE_INACTIVE | CREATE_CPU_MAP);
 		if (old) {
 			DBG(("%s: reusing handle=%d for buffer\n",
 			     __FUNCTION__, old->handle));
@@ -3290,6 +3294,9 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 			}
 		}
 #endif
+		if (old == NULL)
+			old = search_linear_cache(kgem, NUM_PAGES(size),
+						  CREATE_INACTIVE | CREATE_GTT_MAP);
 		if (old) {
 			DBG(("%s: reusing handle=%d for buffer\n",
 			     __FUNCTION__, old->handle));
commit 139a9352801a7cd4c5105e65924f9be6d6583d70
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Feb 1 01:33:52 2012 +0000

    sna: Add debugging code to verify damage extents of fallback paths
    
    After using the CPU, upload the damage and read back the pixels from the
    GPU bo and verify that the two are equivalent.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
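
Condensed to its essence, the verification below (__sna_fallback_flush)
performs a row-by-row comparison between the CPU image and a fresh GPU
readback; a simplified standalone sketch of that inner loop:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void compare_images(const char *cpu, int cpu_stride,
			   const char *gpu, int gpu_stride,
			   int width_bytes, int height)
{
	int i, j;

	for (i = 0; i < height; i++) {
		if (memcmp(cpu, gpu, width_bytes)) {
			/* Walk to the first differing byte for the report. */
			for (j = 0; cpu[j] == gpu[j]; j++)
				;
			fprintf(stderr, "mismatch in row %d at byte %d\n", i, j);
			abort();
		}
		cpu += cpu_stride;	/* the two images may differ in pitch */
		gpu += gpu_stride;
	}
}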

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 40748ec..71d3f06 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -80,12 +80,68 @@
 #define ACCEL_POLY_FILL_ARC 1
 #define ACCEL_POLY_TEXT8 1
 #define ACCEL_POLY_TEXT16 1
+#define ACCEL_POLY_GLYPH 1
 #define ACCEL_IMAGE_TEXT8 1
 #define ACCEL_IMAGE_TEXT16 1
 #define ACCEL_IMAGE_GLYPH 1
-#define ACCEL_POLY_GLYPH 1
 #define ACCEL_PUSH_PIXELS 1
 
+#if 0
+static void __sna_fallback_flush(DrawablePtr d)
+{
+	PixmapPtr pixmap = get_drawable_pixmap(d);
+	struct sna *sna = to_sna_from_pixmap(pixmap);
+	struct sna_pixmap *priv;
+	BoxRec box;
+	PixmapPtr tmp;
+	int i, j;
+	char *src, *dst;
+
+	DBG(("%s: uploading CPU damage...\n", __FUNCTION__));
+	priv = sna_pixmap_move_to_gpu(pixmap, MOVE_READ);
+	if (priv == NULL)
+		return;
+
+	DBG(("%s: downloading GPU damage...\n", __FUNCTION__));
+	if (!sna_pixmap_move_to_cpu(pixmap, MOVE_READ))
+		return;
+
+	box.x1 = box.y1 = 0;
+	box.x2 = pixmap->drawable.width;
+	box.y2 = pixmap->drawable.height;
+
+	tmp = fbCreatePixmap(pixmap->drawable.pScreen,
+			     pixmap->drawable.width,
+			     pixmap->drawable.height,
+			     pixmap->drawable.depth,
+			     0);
+
+	DBG(("%s: comparing with direct read...\n", __FUNCTION__));
+	sna_read_boxes(sna,
+		       priv->gpu_bo, 0, 0,
+		       tmp, 0, 0,
+		       &box, 1);
+
+	src = pixmap->devPrivate.ptr;
+	dst = tmp->devPrivate.ptr;
+	for (i = 0; i < tmp->drawable.height; i++) {
+		if (memcmp(src, dst, tmp->drawable.width * tmp->drawable.bitsPerPixel >> 3)) {
+			for (j = 0; src[j] == dst[j]; j++)
+				;
+			ErrorF("mismatch at (%d, %d)\n",
+			       8*j / tmp->drawable.bitsPerPixel, i);
+			abort();
+		}
+		src += pixmap->devKind;
+		dst += tmp->devKind;
+	}
+	fbDestroyPixmap(tmp);
+}
+#define FALLBACK_FLUSH(d) __sna_fallback_flush(d)
+#else
+#define FALLBACK_FLUSH(d)
+#endif
+
 static int sna_font_key;
 
 static const uint8_t copy_ROP[] = {
@@ -1453,8 +1509,9 @@ sna_drawable_move_region_to_cpu(DrawablePtr drawable,
 	}
 
 done:
-	if (flags & MOVE_WRITE && !DAMAGE_IS_ALL(priv->cpu_damage)) {
+	if (flags & MOVE_WRITE) {
 		DBG(("%s: applying cpu damage\n", __FUNCTION__));
+		assert(!DAMAGE_IS_ALL(priv->cpu_damage));
 		assert_pixmap_contains_box(pixmap, RegionExtents(region));
 		sna_damage_add(&priv->cpu_damage, region);
 		if (priv->gpu_bo &&
@@ -2982,6 +3039,7 @@ fallback:
 	DBG(("%s: fbPutImage(%d, %d, %d, %d)\n",
 	     __FUNCTION__, x, y, w, h));
 	fbPutImage(drawable, gc, depth, x, y, w, h, left, format, bits);
+	FALLBACK_FLUSH(drawable);
 out:
 	RegionUninit(&region);
 }
@@ -3694,6 +3752,7 @@ sna_copy_area(DrawablePtr src, DrawablePtr dst, GCPtr gc,
 				  src_x, src_y,
 				  width, height,
 				  dst_x, dst_y);
+		FALLBACK_FLUSH(dst);
 out:
 		RegionUninit(&region);
 		return ret;
@@ -4625,6 +4684,7 @@ fallback:
 
 	DBG(("%s: fbFillSpans\n", __FUNCTION__));
 	fbFillSpans(drawable, gc, n, pt, width, sorted);
+	FALLBACK_FLUSH(drawable);
 out:
 	RegionUninit(&region);
 }
@@ -4663,6 +4723,7 @@ fallback:
 
 	DBG(("%s: fbSetSpans\n", __FUNCTION__));
 	fbSetSpans(drawable, gc, src, pt, width, n, sorted);
+	FALLBACK_FLUSH(drawable);
 out:
 	RegionUninit(&region);
 }
@@ -5137,6 +5198,7 @@ fallback:
 	DBG(("%s: fbCopyPlane(%d, %d, %d, %d, %d,%d) %x\n",
 	     __FUNCTION__, src_x, src_y, w, h, dst_x, dst_y, (unsigned)bit));
 	ret = fbCopyPlane(src, dst, gc, src_x, src_y, w, h, dst_x, dst_y, bit);
+	FALLBACK_FLUSH(dst);
 out:
 	RegionUninit(&region);
 	return ret;
@@ -5336,6 +5398,7 @@ fallback:
 
 	DBG(("%s: fbPolyPoint\n", __FUNCTION__));
 	fbPolyPoint(drawable, gc, mode, n, pt);
+	FALLBACK_FLUSH(drawable);
 out:
 	RegionUninit(&region);
 }
@@ -6389,6 +6452,7 @@ fallback:
 
 	DBG(("%s: fbPolyLine\n", __FUNCTION__));
 	fbPolyLine(drawable, gc, mode, n, pt);
+	FALLBACK_FLUSH(drawable);
 
 	gc->ops = (GCOps *)&sna_gc_ops;
 out:
@@ -7301,6 +7365,7 @@ fallback:
 
 	DBG(("%s: fbPolySegment\n", __FUNCTION__));
 	fbPolySegment(drawable, gc, n, seg);
+	FALLBACK_FLUSH(drawable);
 
 	gc->ops = (GCOps *)&sna_gc_ops;
 out:
@@ -7850,6 +7915,7 @@ fallback:
 
 	DBG(("%s: fbPolyRectangle\n", __FUNCTION__));
 	fbPolyRectangle(drawable, gc, n, r);
+	FALLBACK_FLUSH(drawable);
 out:
 	RegionUninit(&region);
 }
@@ -8027,6 +8093,7 @@ fallback:
 
 	DBG(("%s -- fbPolyArc\n", __FUNCTION__));
 	fbPolyArc(drawable, gc, n, arc);
+	FALLBACK_FLUSH(drawable);
 
 	gc->ops = (GCOps *)&sna_gc_ops;
 out:
@@ -9792,6 +9859,7 @@ fallback:
 			}
 		} while (--n);
 	}
+	FALLBACK_FLUSH(draw);
 out:
 	RegionUninit(&region);
 }
@@ -10332,6 +10400,7 @@ force_fallback:
 		DBG(("%s: fallback -- fbPolyGlyphBlt\n", __FUNCTION__));
 		fbPolyGlyphBlt(drawable, gc, x, y, n,
 			       info, FONTGLYPHS(gc->font));
+		FALLBACK_FLUSH(drawable);
 	}
 out:
 	RegionUninit(&region);
@@ -10420,6 +10489,7 @@ force_fallback:
 		DBG(("%s: fallback -- fbPolyGlyphBlt\n", __FUNCTION__));
 		fbPolyGlyphBlt(drawable, gc, x, y, n,
 			       info, FONTGLYPHS(gc->font));
+		FALLBACK_FLUSH(drawable);
 	}
 out:
 	RegionUninit(&region);
@@ -10517,6 +10587,7 @@ force_fallback:
 		DBG(("%s: fallback -- fbImageGlyphBlt\n", __FUNCTION__));
 		fbImageGlyphBlt(drawable, gc, x, y, n,
 				info, FONTGLYPHS(gc->font));
+		FALLBACK_FLUSH(drawable);
 	}
 out:
 	RegionUninit(&region);
@@ -10607,6 +10678,7 @@ force_fallback:
 		DBG(("%s: fallback -- fbImageGlyphBlt\n", __FUNCTION__));
 		fbImageGlyphBlt(drawable, gc, x, y, n,
 				info, FONTGLYPHS(gc->font));
+		FALLBACK_FLUSH(drawable);
 	}
 out:
 	RegionUninit(&region);
@@ -10869,6 +10941,7 @@ fallback:
 
 	DBG(("%s: fallback -- fbImageGlyphBlt\n", __FUNCTION__));
 	fbImageGlyphBlt(drawable, gc, x, y, n, info, base);
+	FALLBACK_FLUSH(drawable);
 
 out:
 	RegionUninit(&region);
@@ -10943,6 +11016,7 @@ fallback:
 
 	DBG(("%s: fallback -- fbPolyGlyphBlt\n", __FUNCTION__));
 	fbPolyGlyphBlt(drawable, gc, x, y, n, info, base);
+	FALLBACK_FLUSH(drawable);
 
 out:
 	RegionUninit(&region);
@@ -11123,6 +11197,7 @@ sna_push_pixels(GCPtr gc, PixmapPtr bitmap, DrawablePtr drawable,
 	DBG(("%s: fallback, fbPushPixels(%d, %d, %d %d)\n",
 	     __FUNCTION__, w, h, x, y));
 	fbPushPixels(gc, bitmap, drawable, w, h, x, y);
+	FALLBACK_FLUSH(drawable);
 out:
 	RegionUninit(&region);
 }
diff --git a/src/sna/sna_damage.c b/src/sna/sna_damage.c
index ab825aa..f52ecac 100644
--- a/src/sna/sna_damage.c
+++ b/src/sna/sna_damage.c
@@ -514,8 +514,7 @@ static void __sna_damage_reduce(struct sna_damage *damage)
 		damage->extents = region->extents;
 	}
 
-	if (free_boxes)
-		free(boxes);
+	free(free_boxes);
 
 done:
 	damage->mode = DAMAGE_ADD;
@@ -1048,6 +1047,7 @@ static struct sna_damage *__sna_damage_subtract(struct sna_damage *damage,
 					       &damage->region,
 					       region);
 			damage->extents = damage->region.extents;
+			assert(pixman_region_not_empty(&damage->region));
 			return damage;
 		}
 
commit 0e9d1518d1fc187fb4df99c00e4d05cac142f5ce
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Feb 1 01:27:43 2012 +0000

    sna: Fill extents for ImageGlyphs
    
    The spec says to fill the characters' boxes, which is what the hardware
    does. The implementation fills the extents instead. rxvt expects the
    former, emacs the latter. Overdraw is a nuisance, but less than leaving
    glyphs behind...
    
    Reported-by: walch.martin at web.de
    Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=45438
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
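
A sketch of the new background box in terms of the server's ExtentInfoRec
(hypothetical helper, using the MIN/MAX macros this commit adds to sna.h):

static BoxRec image_glyph_box(int x, int y, const ExtentInfoRec *e)
{
	BoxRec b;

	/* Span the full character advance, not just the ink extents... */
	b.x1 = x + MIN(0, e->overallLeft);
	b.x2 = x + MAX(e->overallWidth, e->overallRight);

	/* ...and the font's ascent/descent, matching the hardware fill. */
	b.y1 = y - e->fontAscent;
	b.y2 = y + e->fontDescent;

	return b;
}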

diff --git a/src/sna/sna.h b/src/sna/sna.h
index a0ea54f..9df8cfd 100644
--- a/src/sna/sna.h
+++ b/src/sna/sna.h
@@ -359,7 +359,8 @@ to_sna_from_kgem(struct kgem *kgem)
 #define ARRAY_SIZE(x) (sizeof(x) / sizeof(x[0]))
 #endif
 #define ALIGN(i,m)	(((i) + (m) - 1) & ~((m) - 1))
-#define MIN(a,b)	((a) < (b) ? (a) : (b))
+#define MIN(a,b)	((a) <= (b) ? (a) : (b))
+#define MAX(a,b)	((a) >= (b) ? (a) : (b))
 
 extern xf86CrtcPtr sna_covering_crtc(ScrnInfoPtr scrn,
 				     const BoxRec *box,
diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 4148bdb..40748ec 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -10011,6 +10011,11 @@ sna_glyph_blt(DrawablePtr drawable, GCPtr gc,
 	extents = REGION_RECTS(clip);
 	last_extents = extents + REGION_NUM_RECTS(clip);
 
+	if (bg != -1) /* emulate miImageGlyphBlt */
+		sna_blt_fill_boxes(sna, GXcopy,
+				   bo, drawable->bitsPerPixel,
+				   bg, extents, REGION_NUM_RECTS(clip));
+
 	kgem_set_mode(&sna->kgem, KGEM_BLT);
 	if (!kgem_check_batch(&sna->kgem, 16) ||
 	    !kgem_check_bo_fenced(&sna->kgem, bo, NULL) ||
@@ -10174,6 +10179,8 @@ sna_glyph_extents(FontPtr font,
 
 		extents->overallWidth += p->metrics.characterWidth;
 	}
+
+	assert(extents->overallWidth > 0);
 }
 
 static bool sna_set_glyph(CharInfoPtr in, CharInfoPtr out)
@@ -10458,10 +10465,17 @@ sna_image_text8(DrawablePtr drawable, GCPtr gc,
 		return;
 
 	sna_glyph_extents(gc->font, info, n, &extents);
-	region.extents.x1 = x + extents.overallLeft;
-	region.extents.y1 = y - extents.overallAscent;
-	region.extents.x2 = x + extents.overallRight;
-	region.extents.y2 = y + extents.overallDescent;
+	region.extents.x1 = x + MIN(0, extents.overallLeft);
+	region.extents.y1 = y - extents.fontAscent;
+	region.extents.x2 = x + MAX(extents.overallWidth, extents.overallRight);
+	region.extents.y2 = y + extents.fontDescent;
+
+	DBG(("%s: count=%ld/%d, extents=(left=%d, right=%d, width=%d, ascent=%d, descent=%d), box=(%d, %d), (%d, %d)\n",
+	     __FUNCTION__, n, count,
+	     extents.overallLeft, extents.overallRight, extents.overallWidth,
+	     extents.fontAscent, extents.fontDescent,
+	     region.extents.x1, region.extents.y1,
+	     region.extents.x2, region.extents.y2));
 
 	translate_box(&region.extents, drawable);
 	clip_box(&region.extents, gc);
@@ -10473,6 +10487,11 @@ sna_image_text8(DrawablePtr drawable, GCPtr gc,
 	if (!RegionNotEmpty(&region))
 		return;
 
+	DBG(("%s: clipped extents (%d, %d), (%d, %d)\n",
+	     __FUNCTION__,
+	     region.extents.x1, region.extents.y1,
+	     region.extents.x2, region.extents.y2));
+
 	if (FORCE_FALLBACK)
 		goto force_fallback;
 
@@ -10535,10 +10554,17 @@ sna_image_text16(DrawablePtr drawable, GCPtr gc,
 		return;
 
 	sna_glyph_extents(gc->font, info, n, &extents);
-	region.extents.x1 = x + extents.overallLeft;
-	region.extents.y1 = y - extents.overallAscent;
-	region.extents.x2 = x + extents.overallRight;
-	region.extents.y2 = y + extents.overallDescent;
+	region.extents.x1 = x + MIN(0, extents.overallLeft);
+	region.extents.y1 = y - extents.fontAscent;
+	region.extents.x2 = x + MAX(extents.overallWidth, extents.overallRight);
+	region.extents.y2 = y + extents.fontDescent;
+
+	DBG(("%s: count=%ld/%d, extents=(left=%d, right=%d, width=%d, ascent=%d, descent=%d), box=(%d, %d), (%d, %d)\n",
+	     __FUNCTION__, n, count,
+	     extents.overallLeft, extents.overallRight, extents.overallWidth,
+	     extents.fontAscent, extents.fontDescent,
+	     region.extents.x1, region.extents.y1,
+	     region.extents.x2, region.extents.y2));
 
 	translate_box(&region.extents, drawable);
 	clip_box(&region.extents, gc);
@@ -10550,6 +10576,11 @@ sna_image_text16(DrawablePtr drawable, GCPtr gc,
 	if (!RegionNotEmpty(&region))
 		return;
 
+	DBG(("%s: clipped extents (%d, %d), (%d, %d)\n",
+	     __FUNCTION__,
+	     region.extents.x1, region.extents.y1,
+	     region.extents.x2, region.extents.y2));
+
 	if (FORCE_FALLBACK)
 		goto force_fallback;
 
@@ -10625,6 +10656,11 @@ sna_reversed_glyph_blt(DrawablePtr drawable, GCPtr gc,
 	extents = REGION_RECTS(clip);
 	last_extents = extents + REGION_NUM_RECTS(clip);
 
+	if (bg != -1) /* emulate miImageGlyphBlt */
+		sna_blt_fill_boxes(sna, GXcopy,
+				   bo, drawable->bitsPerPixel,
+				   bg, extents, REGION_NUM_RECTS(clip));
+
 	kgem_set_mode(&sna->kgem, KGEM_BLT);
 	if (!kgem_check_batch(&sna->kgem, 16) ||
 	    !kgem_check_bo_fenced(&sna->kgem, bo, NULL) ||
@@ -10775,11 +10811,18 @@ sna_image_glyph(DrawablePtr drawable, GCPtr gc,
 	if (n == 0)
 		return;
 
-	QueryGlyphExtents(gc->font, info, n, &extents);
-	region.extents.x1 = x + extents.overallLeft;
-	region.extents.y1 = y - extents.overallAscent;
-	region.extents.x2 = x + extents.overallRight;
-	region.extents.y2 = y + extents.overallDescent;
+	sna_glyph_extents(gc->font, info, n, &extents);
+	region.extents.x1 = x + MIN(0, extents.overallLeft);
+	region.extents.y1 = y - extents.fontAscent;
+	region.extents.x2 = x + MAX(extents.overallWidth, extents.overallRight);
+	region.extents.y2 = y + extents.fontDescent;
+
+	DBG(("%s: count=%d, extents=(left=%d, right=%d, width=%d, ascent=%d, descent=%d), box=(%d, %d), (%d, %d)\n",
+	     __FUNCTION__, n,
+	     extents.overallLeft, extents.overallRight, extents.overallWidth,
+	     extents.fontAscent, extents.fontDescent,
+	     region.extents.x1, region.extents.y1,
+	     region.extents.x2, region.extents.y2));
 
 	translate_box(&region.extents, drawable);
 	clip_box(&region.extents, gc);
@@ -10847,7 +10890,7 @@ sna_poly_glyph(DrawablePtr drawable, GCPtr gc,
 	if (n == 0)
 		return;
 
-	QueryGlyphExtents(gc->font, info, n, &extents);
+	sna_glyph_extents(gc->font, info, n, &extents);
 	region.extents.x1 = x + extents.overallLeft;
 	region.extents.y1 = y - extents.overallAscent;
 	region.extents.x2 = x + extents.overallRight;
commit 1454559985e45e6af601c1a3e7f1d4a13280ed3e
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Jan 31 22:38:46 2012 +0000

    sna: PolyGlyph supports all of fill/tile/stipple rules
    
    The hw routines only directly support a solid fill, so fall back for the
    interesting cases. An alternative would be to investigate using the
    miPolyGlyph routine to convert the weird fills into spans in order to
    fallback. Sounds cheaper to fallback, so wait for an actual use case.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
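
A sketch of the gate this adds in front of the BLT glyph path (hypothetical
wrapper around the driver's existing PM_IS_SOLID and gc_is_solid helpers):

static Bool glyphs_can_use_blt(DrawablePtr drawable, GCPtr gc, uint32_t *fg)
{
	/* The BLT routine renders glyphs with a single solid foreground
	 * through a solid planemask; tiles, stipples and partial
	 * planemasks must take the software fallback. */
	return PM_IS_SOLID(drawable, gc->planemask) && gc_is_solid(gc, fg);
}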

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index cccdd59..b446558 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -719,11 +719,10 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, int gen)
 
 	DBG(("%s: large object thresold=%d\n",
 	     __FUNCTION__, kgem->large_object_size));
-	DBG(("%s: max object size (gpu=%d, cpu=%d, tile=%d)\n",
+	DBG(("%s: max object size (gpu=%d, cpu=%d, tile upload=%d, copy=%d)\n",
 	     __FUNCTION__,
-	     kgem->max_gpu_size,
-	     kgem->max_cpu_size,
-	     kgem->max_tile_size));
+	     kgem->max_gpu_size, kgem->max_cpu_size,
+	     kgem->max_upload_tile_size, kgem->max_copy_tile_size));
 
 	/* Convert the aperture thresholds to pages */
 	kgem->aperture_low /= PAGE_SIZE;
diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 3115bd7..4148bdb 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -9968,7 +9968,7 @@ sna_glyph_blt(DrawablePtr drawable, GCPtr gc,
 	      int _x, int _y, unsigned int _n,
 	      CharInfoPtr *_info,
 	      RegionRec *clip,
-	      bool transparent)
+	      uint32_t fg, uint32_t bg)
 {
 	PixmapPtr pixmap = get_drawable_pixmap(drawable);
 	struct sna *sna = to_sna_from_pixmap(pixmap);
@@ -9979,10 +9979,10 @@ sna_glyph_blt(DrawablePtr drawable, GCPtr gc,
 	int16_t dx, dy;
 	uint32_t br00;
 
-	uint8_t rop = transparent ? copy_ROP[gc->alu] : ROP_S;
+	uint8_t rop = bg == -1 ? copy_ROP[gc->alu] : ROP_S;
 
-	DBG(("%s (%d, %d) x %d, transparent? %d, alu=%d\n",
-	     __FUNCTION__, _x, _y, _n, transparent, rop));
+	DBG(("%s (%d, %d) x %d, fg=%08x, bg=%08x alu=%02x\n",
+	     __FUNCTION__, _x, _y, _n, fg, bg, rop));
 
 	if (wedged(sna)) {
 		DBG(("%s -- fallback, wedged\n", __FUNCTION__));
@@ -10025,7 +10025,7 @@ sna_glyph_blt(DrawablePtr drawable, GCPtr gc,
 		b[0] |= BLT_DST_TILED;
 		b[1] >>= 2;
 	}
-	b[1] |= 1 << 30 | transparent << 29 | blt_depth(drawable->depth) << 24 | rop << 16;
+	b[1] |= 1 << 30 | (bg == -1) << 29 | blt_depth(drawable->depth) << 24 | rop << 16;
 	b[2] = extents->y1 << 16 | extents->x1;
 	b[3] = extents->y2 << 16 | extents->x2;
 	b[4] = kgem_add_reloc(&sna->kgem, sna->kgem.nbatch + 4, bo,
@@ -10033,8 +10033,8 @@ sna_glyph_blt(DrawablePtr drawable, GCPtr gc,
 			      I915_GEM_DOMAIN_RENDER |
 			      KGEM_RELOC_FENCED,
 			      0);
-	b[5] = gc->bgPixel;
-	b[6] = gc->fgPixel;
+	b[5] = bg;
+	b[6] = fg;
 	b[7] = 0;
 	sna->kgem.nbatch += 8;
 
@@ -10079,7 +10079,7 @@ sna_glyph_blt(DrawablePtr drawable, GCPtr gc,
 					b[0] |= BLT_DST_TILED;
 					b[1] >>= 2;
 				}
-				b[1] |= 1 << 30 | transparent << 29 | blt_depth(drawable->depth) << 24 | rop << 16;
+				b[1] |= 1 << 30 | (bg == -1) << 29 | blt_depth(drawable->depth) << 24 | rop << 16;
 				b[2] = extents->y1 << 16 | extents->x1;
 				b[3] = extents->y2 << 16 | extents->x2;
 				b[4] = kgem_add_reloc(&sna->kgem, sna->kgem.nbatch + 4, bo,
@@ -10087,8 +10087,8 @@ sna_glyph_blt(DrawablePtr drawable, GCPtr gc,
 						      I915_GEM_DOMAIN_RENDER |
 						      KGEM_RELOC_FENCED,
 						      0);
-				b[5] = gc->bgPixel;
-				b[6] = gc->fgPixel;
+				b[5] = bg;
+				b[6] = fg;
 				b[7] = 0;
 				sna->kgem.nbatch += 8;
 			}
@@ -10269,6 +10269,7 @@ sna_poly_text8(DrawablePtr drawable, GCPtr gc,
 	ExtentInfoRec extents;
 	RegionRec region;
 	long unsigned i, n;
+	uint32_t fg;
 
 	if (drawable->depth < 8)
 		goto fallback;
@@ -10302,7 +10303,13 @@ sna_poly_text8(DrawablePtr drawable, GCPtr gc,
 	if (!ACCEL_POLY_TEXT8)
 		goto force_fallback;
 
-	if (!sna_glyph_blt(drawable, gc, x, y, n, info, &region, true)) {
+	if (!PM_IS_SOLID(drawable, gc->planemask))
+		return false;
+
+	if (!gc_is_solid(gc, &fg))
+		goto force_fallback;
+
+	if (!sna_glyph_blt(drawable, gc, x, y, n, info, &region, fg, -1)) {
 force_fallback:
 		DBG(("%s: fallback\n", __FUNCTION__));
 		gc->font->get_glyphs(gc->font, count, (unsigned char *)chars,
@@ -10350,6 +10357,7 @@ sna_poly_text16(DrawablePtr drawable, GCPtr gc,
 	ExtentInfoRec extents;
 	RegionRec region;
 	long unsigned i, n;
+	uint32_t fg;
 
 	if (drawable->depth < 8)
 		goto fallback;
@@ -10383,7 +10391,13 @@ sna_poly_text16(DrawablePtr drawable, GCPtr gc,
 	if (!ACCEL_POLY_TEXT16)
 		goto force_fallback;
 
-	if (!sna_glyph_blt(drawable, gc, x, y, n, info, &region, true)) {
+	if (!PM_IS_SOLID(drawable, gc->planemask))
+		return false;
+
+	if (!gc_is_solid(gc, &fg))
+		goto force_fallback;
+
+	if (!sna_glyph_blt(drawable, gc, x, y, n, info, &region, fg, -1)) {
 force_fallback:
 		DBG(("%s: fallback\n", __FUNCTION__));
 		gc->font->get_glyphs(gc->font, count, (unsigned char *)chars,
@@ -10465,7 +10479,10 @@ sna_image_text8(DrawablePtr drawable, GCPtr gc,
 	if (!ACCEL_IMAGE_TEXT8)
 		goto force_fallback;
 
-	if (!sna_glyph_blt(drawable, gc, x, y, n, info, &region, false)) {
+	if (!PM_IS_SOLID(drawable, gc->planemask))
+		goto force_fallback;
+
+	if (!sna_glyph_blt(drawable, gc, x, y, n, info, &region, gc->fgPixel, gc->bgPixel)) {
 force_fallback:
 		DBG(("%s: fallback\n", __FUNCTION__));
 		gc->font->get_glyphs(gc->font, count, (unsigned char *)chars,
@@ -10539,7 +10556,10 @@ sna_image_text16(DrawablePtr drawable, GCPtr gc,
 	if (!ACCEL_IMAGE_TEXT16)
 		goto force_fallback;
 
-	if (!sna_glyph_blt(drawable, gc, x, y, n, info, &region, false)) {
+	if (!PM_IS_SOLID(drawable, gc->planemask))
+		goto force_fallback;
+
+	if (!sna_glyph_blt(drawable, gc, x, y, n, info, &region, gc->fgPixel, gc->bgPixel)) {
 force_fallback:
 		DBG(("%s: fallback\n", __FUNCTION__));
 		gc->font->get_glyphs(gc->font, count, (unsigned char *)chars,
@@ -10580,14 +10600,14 @@ sna_reversed_glyph_blt(DrawablePtr drawable, GCPtr gc,
 		       struct kgem_bo *bo,
 		       struct sna_damage **damage,
 		       RegionPtr clip,
-		       bool transparent)
+		       uint32_t fg, uint32_t bg)
 {
 	PixmapPtr pixmap = get_drawable_pixmap(drawable);
 	struct sna *sna = to_sna_from_pixmap(pixmap);
 	const BoxRec *extents, *last_extents;
 	uint32_t *b;
 	int16_t dx, dy;
-	uint8_t rop = transparent ? copy_ROP[gc->alu] : ROP_S;
+	uint8_t rop = bg == -1 ? copy_ROP[gc->alu] : ROP_S;
 
 	if (bo->tiling == I915_TILING_Y) {
 		DBG(("%s: converting bo from Y-tiling\n", __FUNCTION__));
@@ -10619,7 +10639,7 @@ sna_reversed_glyph_blt(DrawablePtr drawable, GCPtr gc,
 		b[0] |= BLT_DST_TILED;
 		b[1] >>= 2;
 	}
-	b[1] |= 1 << 30 | transparent << 29 | blt_depth(drawable->depth) << 24 | rop << 16;
+	b[1] |= 1 << 30 | (bg == -1) << 29 | blt_depth(drawable->depth) << 24 | rop << 16;
 	b[2] = extents->y1 << 16 | extents->x1;
 	b[3] = extents->y2 << 16 | extents->x2;
 	b[4] = kgem_add_reloc(&sna->kgem, sna->kgem.nbatch + 4, bo,
@@ -10627,8 +10647,8 @@ sna_reversed_glyph_blt(DrawablePtr drawable, GCPtr gc,
 			      I915_GEM_DOMAIN_RENDER |
 			      KGEM_RELOC_FENCED,
 			      0);
-	b[5] = gc->bgPixel;
-	b[6] = gc->fgPixel;
+	b[5] = bg;
+	b[6] = fg;
 	b[7] = 0;
 	sna->kgem.nbatch += 8;
 
@@ -10672,7 +10692,7 @@ sna_reversed_glyph_blt(DrawablePtr drawable, GCPtr gc,
 					b[0] |= BLT_DST_TILED;
 					b[1] >>= 2;
 				}
-				b[1] |= 1 << 30 | transparent << 29 | blt_depth(drawable->depth) << 24 | rop << 16;
+				b[1] |= 1 << 30 | (bg == -1) << 29 | blt_depth(drawable->depth) << 24 | rop << 16;
 				b[2] = extents->y1 << 16 | extents->x1;
 				b[3] = extents->y2 << 16 | extents->x2;
 				b[4] = kgem_add_reloc(&sna->kgem, sna->kgem.nbatch + 4,
@@ -10681,8 +10701,8 @@ sna_reversed_glyph_blt(DrawablePtr drawable, GCPtr gc,
 						      I915_GEM_DOMAIN_RENDER |
 						      KGEM_RELOC_FENCED,
 						      0);
-				b[5] = gc->bgPixel;
-				b[6] = gc->fgPixel;
+				b[5] = bg;
+				b[6] = fg;
 				b[7] = 0;
 				sna->kgem.nbatch += 8;
 			}
@@ -10786,9 +10806,13 @@ sna_image_glyph(DrawablePtr drawable, GCPtr gc,
 		goto fallback;
 	}
 
+	if (!PM_IS_SOLID(drawable, gc->planemask))
+		goto fallback;
+
 	if ((bo = sna_drawable_use_bo(drawable, &region.extents, &damage)) &&
 	    sna_reversed_glyph_blt(drawable, gc, x, y, n, info, base,
-				   bo, damage, &region, false))
+				   bo, damage, &region,
+				   gc->fgPixel, gc->bgPixel))
 		goto out;
 
 fallback:
@@ -10818,6 +10842,7 @@ sna_poly_glyph(DrawablePtr drawable, GCPtr gc,
 	RegionRec region;
 	struct sna_damage **damage;
 	struct kgem_bo *bo;
+	uint32_t fg;
 
 	if (n == 0)
 		return;
@@ -10853,9 +10878,15 @@ sna_poly_glyph(DrawablePtr drawable, GCPtr gc,
 		goto fallback;
 	}
 
+	if (!PM_IS_SOLID(drawable, gc->planemask))
+		goto fallback;
+
+	if (!gc_is_solid(gc, &fg))
+		goto fallback;
+
 	if ((bo = sna_drawable_use_bo(drawable, &region.extents, &damage)) &&
 	    sna_reversed_glyph_blt(drawable, gc, x, y, n, info, base,
-				   bo, damage, &region, true))
+				   bo, damage, &region, fg, -1))
 		goto out;
 
 fallback:
commit 7eca05aeeb5b716069868723210d2753061ddc70
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Jan 31 19:19:13 2012 +0000

    sna/gen6: Prefer to do fills using the BLT
    
    Using the BLT is substantially faster than the current shaders for solid
    fill. The downside is that it invokes more ring switching.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
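
    A condensed sketch of the heuristic the hunks below converge on, using
    names from the patch: prefer the blitter whenever a ring switch is
    cheap (hardware semaphores) or unnecessary, or when an untiled target
    would thrash the TLB:

    static bool gen6_want_blt_fill(struct sna *sna, struct kgem_bo *bo)
    {
        return sna->kgem.has_semaphores ||
               sna->kgem.ring != KGEM_RENDER ||
               untiled_tlb_miss(bo);
    }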

diff --git a/src/sna/gen6_render.c b/src/sna/gen6_render.c
index 93410b6..ec5412a 100644
--- a/src/sna/gen6_render.c
+++ b/src/sna/gen6_render.c
@@ -2210,12 +2210,23 @@ gen6_composite_set_target(struct sna *sna,
 	return TRUE;
 }
 
+static bool prefer_blt_ring(struct sna *sna)
+{
+	return sna->kgem.ring != KGEM_RENDER;
+}
+
+static bool
+is_solid(PicturePtr picture)
+{
+	return sna_picture_is_solid(picture, NULL);
+}
+
 static Bool
 try_blt(struct sna *sna,
 	PicturePtr dst, PicturePtr src,
 	int width, int height)
 {
-	if (sna->kgem.ring != KGEM_RENDER) {
+	if (prefer_blt_ring(sna)) {
 		DBG(("%s: already performing BLT\n", __FUNCTION__));
 		return TRUE;
 	}
@@ -2241,15 +2252,12 @@ try_blt(struct sna *sna,
 		return TRUE;
 	}
 
-	return FALSE;
-}
+	if (sna->kgem.has_semaphores) {
+		if (is_solid(src))
+			return TRUE;
+	}
 
-static bool
-is_solid(PicturePtr picture)
-{
-	return  picture->pDrawable->width == 1 &&
-		picture->pDrawable->height == 1 &&
-		picture->repeat;
+	return FALSE;
 }
 
 static bool
@@ -3071,7 +3079,7 @@ static inline bool prefer_blt_copy(struct sna *sna,
 				   struct kgem_bo *src_bo,
 				   struct kgem_bo *dst_bo)
 {
-	return (sna->kgem.ring != KGEM_RENDER ||
+	return (prefer_blt_ring(sna) ||
 		untiled_tlb_miss(src_bo) ||
 		untiled_tlb_miss(dst_bo));
 }
@@ -3424,7 +3432,7 @@ gen6_emit_fill_state(struct sna *sna, const struct sna_composite_op *op)
 static inline bool prefer_blt_fill(struct sna *sna,
 				   struct kgem_bo *bo)
 {
-	return sna->kgem.ring != KGEM_RENDER || untiled_tlb_miss(bo);
+	return sna->kgem.has_semaphores || prefer_blt_ring(sna) || untiled_tlb_miss(bo);
 }
 
 static Bool
commit deec2fc9304c32c18b27899961067f8ac1deda8c
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Jan 31 20:29:58 2012 +0000

    sna/gen5: Always prefer to emit solid fills using the BLT
    
    As the BLT is far, far faster than using a shader.
    
    Improves cairo-demos/chart from 6 to 13 fps.
    
    Reported-by: Michael Larabel <Michael at phoronix.com>
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
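
    The solid-source test this leans on reduces to a 1x1-repeat check; a
    self-contained sketch of the idea (sna_picture_is_solid() in sna_blt.c
    below additionally extracts the colour):

    static bool picture_is_solid_1x1(PicturePtr picture)
    {
        /* A 1x1 repeating picture collapses to a single colour,
         * which the blitter can emit directly as a solid fill. */
        return picture->pDrawable &&
               picture->pDrawable->width == 1 &&
               picture->pDrawable->height == 1 &&
               picture->repeat;
    }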

diff --git a/src/sna/gen5_render.c b/src/sna/gen5_render.c
index dc1e720..03dc8c9 100644
--- a/src/sna/gen5_render.c
+++ b/src/sna/gen5_render.c
@@ -1992,12 +1992,6 @@ picture_is_cpu(PicturePtr picture)
 	if (!picture->pDrawable)
 		return FALSE;
 
-	/* If it is a solid, try to use the render paths */
-	if (picture->pDrawable->width == 1 &&
-	    picture->pDrawable->height == 1 &&
-	    picture->repeat)
-		return FALSE;
-
 	if (too_large(picture->pDrawable->width, picture->pDrawable->height))
 		return TRUE;
 
@@ -2009,7 +2003,7 @@ try_blt(struct sna *sna,
 	PicturePtr dst, PicturePtr src,
 	int width, int height)
 {
-	if (sna->kgem.mode == KGEM_BLT) {
+	if (sna->kgem.mode != KGEM_RENDER) {
 		DBG(("%s: already performing BLT\n", __FUNCTION__));
 		return TRUE;
 	}
@@ -2023,6 +2017,10 @@ try_blt(struct sna *sna,
 	if (too_large(dst->pDrawable->width, dst->pDrawable->height))
 		return TRUE;
 
+	/* The blitter is much faster for solids */
+	if (sna_picture_is_solid(src, NULL))
+		return TRUE;
+
 	/* is the source picture only in cpu memory e.g. a shm pixmap? */
 	return picture_is_cpu(src);
 }
@@ -2733,13 +2731,18 @@ gen5_copy_bind_surfaces(struct sna *sna,
 	gen5_emit_state(sna, op, offset);
 }
 
+static inline bool untiled_tlb_miss(struct kgem_bo *bo)
+{
+	return bo->tiling == I915_TILING_NONE && bo->pitch >= 4096;
+}
+
 static inline bool prefer_blt_copy(struct sna *sna,
 				   struct kgem_bo *src_bo,
 				   struct kgem_bo *dst_bo)
 {
-	return (src_bo->tiling == I915_TILING_NONE ||
-		dst_bo->tiling == I915_TILING_NONE ||
-		sna->kgem.mode == KGEM_BLT);
+	return (sna->kgem.ring != KGEM_RENDER ||
+		untiled_tlb_miss(src_bo) ||
+		untiled_tlb_miss(dst_bo));
 }
 
 static Bool
diff --git a/src/sna/sna_blt.c b/src/sna/sna_blt.c
index b6749ec..9f51028 100644
--- a/src/sna/sna_blt.c
+++ b/src/sna/sna_blt.c
@@ -682,7 +682,8 @@ sna_picture_is_solid(PicturePtr picture, uint32_t *color)
 	if (!is_solid(picture))
 		return FALSE;
 
-	*color = get_solid_color(picture, PICT_a8r8g8b8);
+	if (color)
+		*color = get_solid_color(picture, PICT_a8r8g8b8);
 	return TRUE;
 }
 
commit afda43e781fd869e25335b018f637f2153da4742
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Jan 31 10:51:02 2012 +0000

    sna: Split the tiling limits between upload and copying
    
    The kernel has a bug that prevents pwriting buffers larger than the
    aperture. Whilst waiting for the fix, limit the upload where possible to
    fit within that constraint.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
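
    Both limits are consumed the same way: a square tile is halved until
    it fits the byte budget. A minimal sketch of that clamp, assuming
    32bpp (hence the factor of 4):

    static int clamp_tile_step(int step, uint32_t max_tile_bytes)
    {
        /* Halve the tile side until a step x step 32bpp tile fits. */
        while ((uint32_t)step * step * 4 > max_tile_bytes)
            step /= 2;
        return step;
    }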

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 311bac4..cccdd59 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -692,9 +692,13 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, int gen)
 	if (kgem->max_gpu_size > kgem->max_cpu_size)
 		kgem->max_gpu_size = kgem->max_cpu_size;
 
-	kgem->max_tile_size = MAX_CACHE_SIZE;
-	if (kgem->max_tile_size > kgem->max_gpu_size / 2)
-		kgem->max_tile_size = kgem->max_gpu_size / 2;
+	kgem->max_upload_tile_size = kgem->aperture_mappable / 2;
+	if (kgem->max_upload_tile_size > kgem->max_gpu_size / 2)
+		kgem->max_upload_tile_size = kgem->max_gpu_size / 2;
+
+	kgem->max_copy_tile_size = (MAX_CACHE_SIZE + 1)/2;
+	if (kgem->max_copy_tile_size > kgem->max_gpu_size / 2)
+		kgem->max_copy_tile_size = kgem->max_gpu_size / 2;
 
 	totalram = total_ram_size();
 	if (totalram == 0) {
@@ -3197,9 +3201,9 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 #if !DBG_NO_MAP_UPLOAD
 	/* Be a little more generous and hope to hold fewer mmappings */
 	alloc = ALIGN(2*size, kgem->partial_buffer_size);
-	if (alloc > kgem->max_tile_size)
+	if (alloc > MAX_CACHE_SIZE)
 		alloc = ALIGN(size, kgem->partial_buffer_size);
-	if (alloc > kgem->max_tile_size)
+	if (alloc > MAX_CACHE_SIZE)
 		alloc = PAGE_ALIGN(size);
 	alloc /= PAGE_SIZE;
 	if (kgem->has_cpu_bo) {
diff --git a/src/sna/kgem.h b/src/sna/kgem.h
index 87dc386..974a716 100644
--- a/src/sna/kgem.h
+++ b/src/sna/kgem.h
@@ -161,7 +161,8 @@ struct kgem {
 	uint32_t aperture_total, aperture_high, aperture_low, aperture_mappable;
 	uint32_t aperture, aperture_fenced;
 	uint32_t min_alignment;
-	uint32_t max_tile_size, max_gpu_size, max_cpu_size;
+	uint32_t max_upload_tile_size, max_copy_tile_size;
+	uint32_t max_gpu_size, max_cpu_size;
 	uint32_t large_object_size, max_object_size;
 	uint32_t partial_buffer_size;
 
diff --git a/src/sna/sna_io.c b/src/sna/sna_io.c
index f4278be..eb5df9d 100644
--- a/src/sna/sna_io.c
+++ b/src/sna/sna_io.c
@@ -61,7 +61,7 @@ box_intersect(BoxPtr a, const BoxRec *b)
 
 static inline bool upload_too_large(struct sna *sna, int width, int height)
 {
-	return width * height * 4 > sna->kgem.max_tile_size;
+	return width * height * 4 > sna->kgem.max_upload_tile_size;
 }
 
 static inline bool must_tile(struct sna *sna, int width, int height)
@@ -209,7 +209,7 @@ fallback:
 
 			step = MIN(sna->render.max_3d_size,
 				   8*(MAXSHORT&~63) / dst->drawable.bitsPerPixel);
-			while (step * step * 4 > sna->kgem.max_tile_size)
+			while (step * step * 4 > sna->kgem.max_upload_tile_size)
 				step /= 2;
 
 			DBG(("%s: tiling download, using %dx%d tiles\n",
@@ -595,7 +595,7 @@ fallback:
 tile:
 			step = MIN(sna->render.max_3d_size,
 				   8*(MAXSHORT&~63) / dst->drawable.bitsPerPixel);
-			while (step * step * 4 > sna->kgem.max_tile_size)
+			while (step * step * 4 > sna->kgem.max_upload_tile_size)
 				step /= 2;
 
 			DBG(("%s: tiling upload, using %dx%d tiles\n",
diff --git a/src/sna/sna_tiling.c b/src/sna/sna_tiling.c
index a3bf19d..702192a 100644
--- a/src/sna/sna_tiling.c
+++ b/src/sna/sna_tiling.c
@@ -144,7 +144,7 @@ sna_tiling_composite_done(struct sna *sna,
 	step = sna->render.max_3d_size;
 	if (tile->dst_x & (8*512 / tile->dst->pDrawable->bitsPerPixel - 1))
 		step /= 2;
-	while (step * step * 4 > sna->kgem.max_tile_size)
+	while (step * step * 4 > sna->kgem.max_copy_tile_size)
 		step /= 2;
 
 	DBG(("%s -- %dx%d, count=%d, step size=%d\n", __FUNCTION__,
@@ -331,7 +331,7 @@ sna_tiling_fill_boxes(struct sna *sna,
 	pixman_region_init_rects(&region, box, n);
 
 	step = sna->render.max_3d_size;
-	while (step * step * 4 > sna->kgem.max_tile_size)
+	while (step * step * 4 > sna->kgem.max_copy_tile_size)
 		step /= 2;
 
 	DBG(("%s (op=%d, format=%x, color=(%04x,%04x,%04x, %04x), tile.size=%d, box=%dx[(%d, %d), (%d, %d)])\n",
@@ -444,7 +444,7 @@ Bool sna_tiling_copy_boxes(struct sna *sna, uint8_t alu,
 	pixman_region_init_rects(&region, box, nbox);
 
 	step = sna->render.max_3d_size;
-	while (step * step * 4 > sna->kgem.max_tile_size)
+	while (step * step * 4 > sna->kgem.max_copy_tile_size)
 		step /= 2;
 
 	DBG(("%s (alu=%d), tile.size=%d, box=%dx[(%d, %d), (%d, %d)])\n",
commit e94063c07407bf2d5c67083e726bf1361d3afc44
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Jan 31 10:29:02 2012 +0000

    sna: Avoid converting requested Y to X tiling for large pitches on gen4+
    
    The only strong requirement is that to utilize large pitches, the object
    must be tiled. Having it as X tiling is a pure convenience to facilitate
    use of the blitter. A DRI client may want to keep using Y tiling
    instead.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
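
    This relies on kgem_choose_tiling()'s sign convention: a positive
    tiling value is a preference the function may rewrite, while a
    negative value requests that exact mode (assumed semantics, per the
    hunk below). A sketch of preserving the caller's request:

    static int force_requested_tiling(int tiling)
    {
        /* Negate a positive preference so a requested Y stays Y;
         * an unspecified tiling (0) is still forced to X. */
        if (tiling > 0)
            return -tiling;
        if (tiling == 0)
            return -I915_TILING_X;
        return tiling;
    }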

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 0b2e1d6..311bac4 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -2151,7 +2151,10 @@ int kgem_choose_tiling(struct kgem *kgem, int tiling, int width, int height, int
 		if (width*bpp > (MAXSHORT-512) * 8) {
 			DBG(("%s: large pitch [%d], forcing TILING_X\n",
 			     __FUNCTION__, width*bpp/8));
-			tiling = -I915_TILING_X;
+			if (tiling > 0)
+				tiling = -tiling;
+			else if (tiling == 0)
+				tiling = -I915_TILING_X;
 		} else if (tiling && (width|height) > 8192) {
 			DBG(("%s: large tiled buffer [%dx%d], forcing TILING_X\n",
 			     __FUNCTION__, width, height));
commit c7f08cb3113ea31beea30260ca3530e8be88c826
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Jan 31 10:21:05 2012 +0000

    sna/dri: We need to reduce tiling on older gen if we cannot fence
    
    Only apply the architectural limits to enable bo creation for DRI buffers.
    
    Reported-by: Alban Browaeys <prahal at yahoo.com>
    Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=45414
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_dri.c b/src/sna/sna_dri.c
index 2245c03..d0e19cf 100644
--- a/src/sna/sna_dri.c
+++ b/src/sna/sna_dri.c
@@ -127,19 +127,37 @@ static inline struct kgem_bo *ref(struct kgem_bo *bo)
 /* Prefer to enable TILING_Y if this buffer will never be a
  * candidate for pageflipping
  */
-static bool color_use_tiling_y(struct sna *sna, DrawablePtr drawable)
+static uint32_t color_tiling(struct sna *sna, DrawablePtr drawable)
 {
-	if (!COLOR_PREFER_TILING_Y)
-		return false;
+	uint32_t tiling;
 
-	return (drawable->width != sna->front->drawable.width ||
-		drawable->height != sna->front->drawable.height);
+	if (COLOR_PREFER_TILING_Y &&
+	    (drawable->width  != sna->front->drawable.width ||
+	     drawable->height != sna->front->drawable.height))
+		tiling = I915_TILING_Y;
+	else
+		tiling = I915_TILING_X;
+
+	return kgem_choose_tiling(&sna->kgem, -tiling,
+				  drawable->width,
+				  drawable->height,
+				  drawable->bitsPerPixel);
+}
+
+static uint32_t other_tiling(struct sna *sna, DrawablePtr drawable)
+{
+	/* XXX Can mix color X / depth Y? */
+	return kgem_choose_tiling(&sna->kgem, -I915_TILING_Y,
+				  drawable->width,
+				  drawable->height,
+				  drawable->bitsPerPixel);
 }
 
 static struct kgem_bo *sna_pixmap_set_dri(struct sna *sna,
 					  PixmapPtr pixmap)
 {
 	struct sna_pixmap *priv;
+	uint32_t tiling;
 
 	priv = sna_pixmap_force_to_gpu(pixmap, MOVE_READ | MOVE_WRITE);
 	if (priv == NULL)
@@ -148,9 +166,9 @@ static struct kgem_bo *sna_pixmap_set_dri(struct sna *sna,
 	if (priv->flush)
 		return ref(priv->gpu_bo);
 
-	if (priv->gpu_bo->tiling != I915_TILING_Y &&
-	    color_use_tiling_y(sna, &pixmap->drawable))
-		sna_pixmap_change_tiling(pixmap, I915_TILING_Y);
+	tiling = color_tiling(sna, &pixmap->drawable);
+	if (priv->gpu_bo->tiling != tiling)
+		sna_pixmap_change_tiling(pixmap, tiling);
 
 	/* We need to submit any modifications to and reads from this
 	 * buffer before we send any reply to the Client.
@@ -209,7 +227,7 @@ sna_dri_create_buffer(DrawablePtr drawable,
 				    drawable->width,
 				    drawable->height,
 				    drawable->bitsPerPixel,
-				    color_use_tiling_y(sna, drawable) ?  I915_TILING_Y : I915_TILING_X,
+				    color_tiling(sna, drawable),
 				    CREATE_EXACT);
 		break;
 
@@ -252,8 +270,7 @@ sna_dri_create_buffer(DrawablePtr drawable,
 		bpp = format ? format : drawable->bitsPerPixel,
 		bo = kgem_create_2d(&sna->kgem,
 				    drawable->width, drawable->height, bpp,
-				    //sna->kgem.gen >= 40 ? I915_TILING_Y : I915_TILING_X,
-				    I915_TILING_Y,
+				    other_tiling(sna, drawable),
 				    CREATE_EXACT);
 		break;
 
commit ba3c00c75191e2e20d64dc7e57e065e4a63b0926
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Jan 31 00:35:42 2012 +0000

    sna: Trim tile sizes to fit into bo cache
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index ca7eafa..0b2e1d6 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -692,13 +692,14 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, int gen)
 	if (kgem->max_gpu_size > kgem->max_cpu_size)
 		kgem->max_gpu_size = kgem->max_cpu_size;
 
-	kgem->max_tile_size = kgem->aperture_total / 4;
+	kgem->max_tile_size = MAX_CACHE_SIZE;
 	if (kgem->max_tile_size > kgem->max_gpu_size / 2)
 		kgem->max_tile_size = kgem->max_gpu_size / 2;
 
 	totalram = total_ram_size();
 	if (totalram == 0) {
-		DBG(("%s: total ram size unknown, assuming maximum of total aperture\n"));
+		DBG(("%s: total ram size unknown, assuming maximum of total aperture\n",
+		     __FUNCTION__));
 		totalram = kgem->aperture_total;
 	}
 	if (kgem->max_object_size > totalram / 2)
@@ -3193,7 +3194,9 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 #if !DBG_NO_MAP_UPLOAD
 	/* Be a little more generous and hope to hold fewer mmappings */
 	alloc = ALIGN(2*size, kgem->partial_buffer_size);
-	if (alloc > kgem->max_gpu_size)
+	if (alloc > kgem->max_tile_size)
+		alloc = ALIGN(size, kgem->partial_buffer_size);
+	if (alloc > kgem->max_tile_size)
 		alloc = PAGE_ALIGN(size);
 	alloc /= PAGE_SIZE;
 	if (kgem->has_cpu_bo) {
commit f40176cc03b1f4ab3cb0de2b70ea204887c8b05e
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Jan 31 00:09:42 2012 +0000

    sna: Check that the intermediate IO buffer can also be used for blitting
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
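
    The per-box test added below guards the blitter's 16-bit row
    addressing; a sketch of the constraint, with the box width measured
    in bits against a byte limit scaled by 8:

    static bool box_fits_blt(const BoxRec *box, int bpp)
    {
        /* Rows wider than (MAXSHORT - 4) bytes cannot be addressed
         * by a BLT copy, so such boxes must take the tiled path. */
        return (box->x2 - box->x1) * bpp < 8 * (MAXSHORT - 4);
    }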

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index e35be97..3115bd7 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -716,8 +716,8 @@ static PixmapPtr sna_create_pixmap(ScreenPtr screen,
 	} else {
 		struct sna_pixmap *priv;
 
-		DBG(("%s: creating GPU pixmap %dx%d, stride=%d\n",
-		     __FUNCTION__, width, height, pad));
+		DBG(("%s: creating GPU pixmap %dx%d, stride=%d, flags=%x\n",
+		     __FUNCTION__, width, height, pad, flags));
 
 		pixmap = create_pixmap(sna, screen, 0, 0, depth, usage);
 		if (pixmap == NullPixmap)
diff --git a/src/sna/sna_io.c b/src/sna/sna_io.c
index 4f86f8d..f4278be 100644
--- a/src/sna/sna_io.c
+++ b/src/sna/sna_io.c
@@ -59,11 +59,16 @@ box_intersect(BoxPtr a, const BoxRec *b)
 	return a->x1 < a->x2 && a->y1 < a->y2;
 }
 
+static inline bool upload_too_large(struct sna *sna, int width, int height)
+{
+	return width * height * 4 > sna->kgem.max_tile_size;
+}
+
 static inline bool must_tile(struct sna *sna, int width, int height)
 {
 	return (width  > sna->render.max_3d_size ||
 		height > sna->render.max_3d_size ||
-		width * height * 4 > sna->kgem.max_tile_size);
+		upload_too_large(sna, width, height));
 }
 
 static void read_boxes_inplace(struct kgem *kgem,
@@ -118,6 +123,7 @@ void sna_read_boxes(struct sna *sna,
 	void *ptr;
 	int src_pitch, cpp, offset;
 	int n, cmd, br13;
+	bool can_blt;
 
 	DBG(("%s x %d, src=(handle=%d, offset=(%d,%d)), dst=(size=(%d, %d), offset=(%d,%d))\n",
 	     __FUNCTION__, nbox, src_bo->handle, src_dx, src_dy,
@@ -154,6 +160,7 @@ fallback:
 		return;
 	}
 
+	can_blt = kgem_bo_can_blt(kgem, src_bo);
 	extents = box[0];
 	for (n = 1; n < nbox; n++) {
 		if (box[n].x1 < extents.x1)
@@ -161,6 +168,9 @@ fallback:
 		if (box[n].x2 > extents.x2)
 			extents.x2 = box[n].x2;
 
+		if (can_blt)
+			can_blt = (box[n].x2 - box[n].x1) * dst->drawable.bitsPerPixel < 8 * (MAXSHORT - 4);
+
 		if (box[n].y1 < extents.y1)
 			extents.y1 = box[n].y1;
 		if (box[n].y2 > extents.y2)
@@ -173,9 +183,8 @@ fallback:
 	}
 
 	/* Try to avoid switching rings... */
-	if (kgem->ring == KGEM_RENDER ||
-	    !kgem_bo_can_blt(kgem, src_bo) ||
-	    must_tile(sna, extents.x2 - extents.x1, extents.y2 - extents.y1)) {
+	if (!can_blt || kgem->ring == KGEM_RENDER ||
+	    upload_too_large(sna, extents.x2 - extents.x1, extents.y2 - extents.y1)) {
 		PixmapRec tmp;
 
 		tmp.drawable.width  = extents.x2 - extents.x1;
@@ -531,6 +540,7 @@ bool sna_write_boxes(struct sna *sna, PixmapPtr dst,
 	void *ptr;
 	int offset;
 	int n, cmd, br13;
+	bool can_blt;
 
 	DBG(("%s x %d\n", __FUNCTION__, nbox));
 
@@ -542,6 +552,7 @@ fallback:
 					   box, nbox);
 	}
 
+	can_blt = kgem_bo_can_blt(kgem, dst_bo);
 	extents = box[0];
 	for (n = 1; n < nbox; n++) {
 		if (box[n].x1 < extents.x1)
@@ -549,6 +560,9 @@ fallback:
 		if (box[n].x2 > extents.x2)
 			extents.x2 = box[n].x2;
 
+		if (can_blt)
+			can_blt = (box[n].x2 - box[n].x1) * dst->drawable.bitsPerPixel < 8 * (MAXSHORT - 4);
+
 		if (box[n].y1 < extents.y1)
 			extents.y1 = box[n].y1;
 		if (box[n].y2 > extents.y2)
@@ -556,9 +570,8 @@ fallback:
 	}
 
 	/* Try to avoid switching rings... */
-	if (kgem->ring == KGEM_RENDER ||
-	    !kgem_bo_can_blt(kgem, dst_bo) ||
-	    must_tile(sna, extents.x2 - extents.x1, extents.y2 - extents.y1)) {
+	if (!can_blt || kgem->ring == KGEM_RENDER ||
+	    upload_too_large(sna, extents.x2 - extents.x1, extents.y2 - extents.y1)) {
 		PixmapRec tmp;
 
 		tmp.drawable.width  = extents.x2 - extents.x1;
@@ -579,6 +592,7 @@ fallback:
 			BoxRec tile, stack[64], *clipped, *c;
 			int step;
 
+tile:
 			step = MIN(sna->render.max_3d_size,
 				   8*(MAXSHORT&~63) / dst->drawable.bitsPerPixel);
 			while (step * step * 4 > sna->kgem.max_tile_size)
@@ -693,7 +707,7 @@ fallback:
 			kgem_bo_destroy(&sna->kgem, src_bo);
 
 			if (!n)
-				goto fallback;
+				goto tile;
 		}
 
 		return true;
@@ -1069,9 +1083,13 @@ indirect_replace(struct sna *sna,
 	if ((int)pixmap->devKind * pixmap->drawable.height >> 12 > kgem->half_cpu_cache_pages)
 		return false;
 
-	if (bo->tiling == I915_TILING_Y || kgem->ring == KGEM_RENDER) {
+	if (kgem->ring == KGEM_RENDER || !kgem_bo_can_blt(kgem, bo)) {
 		BoxRec box;
 
+		assert(!must_tile(sna,
+				  pixmap->drawable.width,
+				  pixmap->drawable.height));
+
 		src_bo = kgem_create_buffer_2d(kgem,
 					       pixmap->drawable.width,
 					       pixmap->drawable.height,
commit 5424ad87f13ed06dc305ec08940563073f660781
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon Jan 30 23:49:18 2012 +0000

    sna: Discard the cleared GPU buffer upon PutImage to the CPU buffer
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
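
    The clear colour is applied with pixman_fill(); note in the hunk
    below that the stride argument is in uint32_t units, not bytes. A
    minimal sketch of the call:

    #include <pixman.h>

    static void apply_clear_color(uint32_t *bits, int byte_stride, int bpp,
                                  int width, int height, uint32_t color)
    {
        /* pixman_fill() takes the stride in 32-bit words. */
        pixman_fill(bits, byte_stride / (int)sizeof(uint32_t), bpp,
                    0, 0, width, height, color);
    }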

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 26fd1ab..e35be97 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -2533,6 +2533,26 @@ sna_put_zpixmap_blt(DrawablePtr drawable, GCPtr gc, RegionPtr region,
 	    !sna_pixmap_alloc_cpu(sna, pixmap, priv, false))
 		return true;
 
+	if (priv->clear) {
+		DBG(("%s: applying clear [%08x]\n",
+		     __FUNCTION__, priv->clear_color));
+
+		pixman_fill(pixmap->devPrivate.ptr,
+			    pixmap->devKind/sizeof(uint32_t),
+			    pixmap->drawable.bitsPerPixel,
+			    0, 0,
+			    pixmap->drawable.width,
+			    pixmap->drawable.height,
+			    priv->clear_color);
+
+		sna_damage_all(&priv->cpu_damage,
+			       pixmap->drawable.width,
+			       pixmap->drawable.height);
+		sna_pixmap_free_gpu(sna, priv);
+		priv->undamaged = false;
+		priv->clear = false;
+	}
+
 	if (!DAMAGE_IS_ALL(priv->cpu_damage)) {
 		DBG(("%s: marking damage\n", __FUNCTION__));
 		if (region_subsumes_drawable(region, &pixmap->drawable)) {
commit 4884c11b5332c252eca2801a4ef9e48075c21176
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon Jan 30 13:43:59 2012 +0000

    sna: Track large objects and limit prefer-gpu hint to small objects
    
    As the GATT is sized irrespective of the actual RAM size, we need to be
    careful not to be too generous when allocating GPU bo and their shadows.
    So first of all we limit default render targets to those small enough to
    fit comfortably in RAM alongside others, and secondly we try to keep
    only a single copy of large objects in memory.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
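
    kgem_can_create_2d() now returns a flag word instead of a boolean; a
    sketch of one way a caller might consume it (the hunks below show the
    driver's actual uses; the helper name here is illustrative):

    static bool pixmap_wants_gpu(struct kgem *kgem,
                                 int width, int height, int depth)
    {
        /* Zero means no bo of any kind fits; the CPU/GPU bits say
         * which pools are viable; LARGE marks objects that exceed
         * the bo cache and so should not linger on inactive lists. */
        unsigned flags = kgem_can_create_2d(kgem, width, height, depth);
        return (flags & KGEM_CAN_CREATE_GPU) &&
               (flags & KGEM_CAN_CREATE_LARGE) == 0;
    }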

diff --git a/configure.ac b/configure.ac
index 50b5643..0fb3270 100644
--- a/configure.ac
+++ b/configure.ac
@@ -132,6 +132,7 @@ required_pixman_version=0.24
 if test "x$SNA" != "xno"; then
 	required_xorg_xserver_version=1.10
 	AC_DEFINE(USE_SNA, 1, [Enable SNA support])
+	AC_CHECK_HEADERS([sys/sysinfo.h])
 fi
 AC_MSG_RESULT([$SNA])
 
diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 5ab5c83..ca7eafa 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -44,6 +44,10 @@
 #include <memcheck.h>
 #endif
 
+#if HAVE_SYS_SYSINFO_H
+#include <sys/sysinfo.h>
+#endif
+
 static struct kgem_bo *
 search_linear_cache(struct kgem *kgem, unsigned int num_pages, unsigned flags);
 
@@ -498,6 +502,18 @@ agp_aperture_size(struct pci_device *dev, int gen)
 }
 
 static size_t
+total_ram_size(void)
+{
+#if HAVE_SYS_SYSINFO_H
+	struct sysinfo info;
+	if (sysinfo(&info) == 0)
+		return info.totalram * info.mem_unit;
+#endif
+
+	return 0;
+}
+
+static size_t
 cpu_cache_size(void)
 {
 	FILE *file = fopen("/proc/cpuinfo", "r");
@@ -556,6 +572,7 @@ static bool semaphores_enabled(void)
 void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, int gen)
 {
 	struct drm_i915_gem_get_aperture aperture;
+	size_t totalram;
 	unsigned int i, j;
 
 	memset(kgem, 0, sizeof(*kgem));
@@ -679,6 +696,24 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, int gen)
 	if (kgem->max_tile_size > kgem->max_gpu_size / 2)
 		kgem->max_tile_size = kgem->max_gpu_size / 2;
 
+	totalram = total_ram_size();
+	if (totalram == 0) {
+		DBG(("%s: total ram size unknown, assuming maximum of total aperture\n"));
+		totalram = kgem->aperture_total;
+	}
+	if (kgem->max_object_size > totalram / 2)
+		kgem->max_object_size = totalram / 2;
+	if (kgem->max_cpu_size > totalram / 2)
+		kgem->max_cpu_size = totalram / 2;
+	if (kgem->max_gpu_size > totalram / 4)
+		kgem->max_gpu_size = totalram / 4;
+
+	kgem->large_object_size = MAX_CACHE_SIZE;
+	if (kgem->large_object_size > kgem->max_gpu_size)
+		kgem->large_object_size = kgem->max_gpu_size;
+
+	DBG(("%s: large object thresold=%d\n",
+	     __FUNCTION__, kgem->large_object_size));
 	DBG(("%s: max object size (gpu=%d, cpu=%d, tile=%d)\n",
 	     __FUNCTION__,
 	     kgem->max_gpu_size,
@@ -2179,83 +2214,40 @@ done:
 	return tiling;
 }
 
-bool kgem_can_create_2d(struct kgem *kgem,
-			 int width, int height, int depth)
+unsigned kgem_can_create_2d(struct kgem *kgem,
+			    int width, int height, int depth)
 {
 	int bpp = BitsPerPixel(depth);
 	uint32_t pitch, size;
+	unsigned flags = 0;
 
 	if (depth < 8 || kgem->wedged)
-		return false;
-
-	size = kgem_surface_size(kgem, false, false,
-				 width, height, bpp,
-				 I915_TILING_X, &pitch);
-	if (size > 0 && size <= kgem->max_object_size)
-		return true;
-
-	size = kgem_surface_size(kgem, false, false,
-				 width, height, bpp,
-				 I915_TILING_NONE, &pitch);
-	if (size > 0 && size <= kgem->max_object_size)
-		return true;
-
-	return false;
-}
-
-bool kgem_can_create_cpu(struct kgem *kgem,
-			 int width, int height, int bpp)
-{
-	uint32_t pitch, size;
-
-	if (bpp < 8 || kgem->wedged)
-		return false;
+		return 0;
 
 	size = kgem_surface_size(kgem, false, false,
 				 width, height, bpp,
 				 I915_TILING_NONE, &pitch);
-	DBG(("%s? %d, cpu size %d, max %d\n",
-	     __FUNCTION__,
-	     size > 0 && size <= kgem->max_cpu_size,
-	     size, kgem->max_cpu_size));
-	return size > 0 && size <= kgem->max_cpu_size;
-}
-
-static bool _kgem_can_create_gpu(struct kgem *kgem,
-				 int width, int height, int bpp)
-{
-	uint32_t pitch, size;
-
-	if (bpp < 8 || kgem->wedged)
-		return false;
+	if (size > 0 && size <= kgem->max_cpu_size)
+		flags |= KGEM_CAN_CREATE_CPU | KGEM_CAN_CREATE_GPU;
+	if (size > kgem->large_object_size)
+		flags |= KGEM_CAN_CREATE_LARGE;
+	if (size > kgem->max_object_size)
+		return 0;
 
 	size = kgem_surface_size(kgem, false, false,
 				 width, height, bpp,
-				 kgem_choose_tiling(kgem,
-						    I915_TILING_X,
+				 kgem_choose_tiling(kgem, I915_TILING_X,
 						    width, height, bpp),
 				 &pitch);
-	DBG(("%s? %d, gpu size %d, max %d\n",
-	     __FUNCTION__,
-	     size > 0 && size < kgem->max_gpu_size,
-	     size, kgem->max_gpu_size));
-	return size > 0 && size < kgem->max_gpu_size;
-}
+	if (size > 0 && size <= kgem->max_gpu_size)
+		flags |= KGEM_CAN_CREATE_GPU;
+	if (size > kgem->large_object_size)
+		flags |= KGEM_CAN_CREATE_LARGE;
+	if (size > kgem->max_object_size)
+		return 0;
 
-#if DEBUG_KGEM
-bool kgem_can_create_gpu(struct kgem *kgem, int width, int height, int bpp)
-{
-	bool ret = _kgem_can_create_gpu(kgem, width, height, bpp);
-	DBG(("%s(%dx%d, bpp=%d) = %d\n", __FUNCTION__,
-	     width, height, bpp, ret));
-	return ret;
-}
-#else
-bool kgem_can_create_gpu(struct kgem *kgem, int width, int height, int bpp)
-{
-	return _kgem_can_create_gpu(kgem, width, height, bpp);
+	return flags;
 }
-#endif
 
 inline int kgem_bo_fenced_size(struct kgem *kgem, struct kgem_bo *bo)
 {
diff --git a/src/sna/kgem.h b/src/sna/kgem.h
index fea2d45..87dc386 100644
--- a/src/sna/kgem.h
+++ b/src/sna/kgem.h
@@ -161,7 +161,8 @@ struct kgem {
 	uint32_t aperture_total, aperture_high, aperture_low, aperture_mappable;
 	uint32_t aperture, aperture_fenced;
 	uint32_t min_alignment;
-	uint32_t max_tile_size, max_gpu_size, max_cpu_size, max_object_size;
+	uint32_t max_tile_size, max_gpu_size, max_cpu_size;
+	uint32_t large_object_size, max_object_size;
 	uint32_t partial_buffer_size;
 
 	void (*context_switch)(struct kgem *kgem, int new_mode);
@@ -201,9 +202,10 @@ struct kgem_bo *kgem_upload_source_image(struct kgem *kgem,
 
 int kgem_choose_tiling(struct kgem *kgem,
 		       int tiling, int width, int height, int bpp);
-bool kgem_can_create_2d(struct kgem *kgem, int width, int height, int depth);
-bool kgem_can_create_gpu(struct kgem *kgem, int width, int height, int bpp);
-bool kgem_can_create_cpu(struct kgem *kgem, int width, int height, int bpp);
+unsigned kgem_can_create_2d(struct kgem *kgem, int width, int height, int depth);
+#define KGEM_CAN_CREATE_GPU	0x1
+#define KGEM_CAN_CREATE_CPU	0x2
+#define KGEM_CAN_CREATE_LARGE	0x4
 
 struct kgem_bo *
 kgem_replace_bo(struct kgem *kgem,
diff --git a/src/sna/sna.h b/src/sna/sna.h
index d9ba773..a0ea54f 100644
--- a/src/sna/sna.h
+++ b/src/sna/sna.h
@@ -145,7 +145,7 @@ struct sna_pixmap {
 	uint8_t flush :1;
 	uint8_t clear :1;
 	uint8_t undamaged :1;
-	uint8_t gpu :1;
+	uint8_t create :3;
 	uint8_t header :1;
 };
 
diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 5d0e042..26fd1ab 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -246,11 +246,8 @@ sna_pixmap_alloc_cpu(struct sna *sna,
 	DBG(("%s: pixmap=%ld\n", __FUNCTION__, pixmap->drawable.serialNumber));
 	assert(priv->stride);
 
-	if ((sna->kgem.has_cpu_bo || !priv->gpu) &&
-	    kgem_can_create_cpu(&sna->kgem,
-				pixmap->drawable.width,
-				pixmap->drawable.height,
-				pixmap->drawable.bitsPerPixel)) {
+	if ((sna->kgem.has_cpu_bo || (priv->create & KGEM_CAN_CREATE_GPU) == 0) &&
+	    (priv->create & KGEM_CAN_CREATE_CPU)) {
 		DBG(("%s: allocating CPU buffer (%dx%d)\n", __FUNCTION__,
 		     pixmap->drawable.width, pixmap->drawable.height));
 
@@ -589,15 +586,7 @@ sna_pixmap_create_scratch(ScreenPtr screen,
 	DBG(("%s(%d, %d, %d, tiling=%d)\n", __FUNCTION__,
 	     width, height, depth, tiling));
 
-	if (depth < 8)
-		return create_pixmap(sna, screen, width, height, depth,
-				     CREATE_PIXMAP_USAGE_SCRATCH);
-
 	bpp = BitsPerPixel(depth);
-	if (!kgem_can_create_gpu(&sna->kgem, width, height, bpp))
-		return create_pixmap(sna, screen, width, height, depth,
-				     CREATE_PIXMAP_USAGE_SCRATCH);
-
 	if (tiling == I915_TILING_Y && !sna->have_render)
 		tiling = I915_TILING_X;
 
@@ -672,46 +661,47 @@ static PixmapPtr sna_create_pixmap(ScreenPtr screen,
 {
 	struct sna *sna = to_sna_from_screen(screen);
 	PixmapPtr pixmap;
+	unsigned flags;
 	int pad;
 
 	DBG(("%s(%d, %d, %d, usage=%x)\n", __FUNCTION__,
 	     width, height, depth, usage));
 
-	if (!kgem_can_create_2d(&sna->kgem, width, height, depth)) {
+	if (!sna->have_render)
+		goto fallback;
+
+	flags = kgem_can_create_2d(&sna->kgem, width, height, depth);
+	if (flags == 0) {
 		DBG(("%s: can not use GPU, just creating shadow\n",
 		     __FUNCTION__));
-		return create_pixmap(sna, screen, width, height, depth, usage);
+		goto fallback;
 	}
 
-	if (!sna->have_render)
-		return create_pixmap(sna, screen,
-				     width, height, depth,
-				     usage);
-
 #if FAKE_CREATE_PIXMAP_USAGE_SCRATCH_HEADER
 	if (width == 0 || height == 0)
-		return create_pixmap(sna, screen, width, height, depth,
-				     CREATE_PIXMAP_USAGE_SCRATCH_HEADER);
+		goto fallback;
 #endif
 
-	if (usage == CREATE_PIXMAP_USAGE_SCRATCH)
-#if USE_BO_FOR_SCRATCH_PIXMAP
-		return sna_pixmap_create_scratch(screen,
-						 width, height, depth,
-						 I915_TILING_X);
-#else
-	return create_pixmap(sna, screen,
-			     width, height, depth,
-			     usage);
-#endif
+	if (usage == CREATE_PIXMAP_USAGE_SCRATCH) {
+		if (flags & KGEM_CAN_CREATE_GPU)
+			return sna_pixmap_create_scratch(screen,
+							 width, height, depth,
+							 I915_TILING_X);
+		else
+			goto fallback;
+	}
 
-	if (usage == SNA_CREATE_SCRATCH)
-		return sna_pixmap_create_scratch(screen,
-						 width, height, depth,
-						 I915_TILING_Y);
+	if (usage == SNA_CREATE_SCRATCH) {
+		if (flags & KGEM_CAN_CREATE_GPU)
+			return sna_pixmap_create_scratch(screen,
+							 width, height, depth,
+							 I915_TILING_Y);
+		else
+			goto fallback;
+	}
 
 	if (usage == CREATE_PIXMAP_USAGE_GLYPH_PICTURE)
-		return create_pixmap(sna, screen, width, height, depth, usage);
+		goto fallback;
 
 	pad = PixmapBytePad(width, depth);
 	if (pad * height <= 4096) {
@@ -741,17 +731,17 @@ static PixmapPtr sna_create_pixmap(ScreenPtr screen,
 		priv = __sna_pixmap_attach(sna, pixmap);
 		if (priv == NULL) {
 			free(pixmap);
-			return create_pixmap(sna, screen,
-					     width, height, depth, usage);
+			goto fallback;
 		}
 
 		priv->stride = pad;
-		priv->gpu = kgem_can_create_gpu(&sna->kgem,
-						width, height,
-						pixmap->drawable.bitsPerPixel);
+		priv->create = flags;
 	}
 
 	return pixmap;
+
+fallback:
+	return create_pixmap(sna, screen, width, height, depth, usage);
 }
 
 static Bool sna_destroy_pixmap(PixmapPtr pixmap)
@@ -844,7 +834,8 @@ _sna_pixmap_move_to_cpu(PixmapPtr pixmap, unsigned int flags)
 		sna_damage_destroy(&priv->gpu_damage);
 		priv->clear = false;
 
-		if (priv->gpu && pixmap_inplace(sna, pixmap, priv)) {
+		if (priv->create & KGEM_CAN_CREATE_GPU &&
+		    pixmap_inplace(sna, pixmap, priv)) {
 			DBG(("%s: write inplace\n", __FUNCTION__));
 			if (priv->gpu_bo) {
 				if (kgem_bo_is_busy(priv->gpu_bo) &&
@@ -1004,7 +995,7 @@ skip_inplace_map:
 		priv->undamaged = true;
 	}
 
-	if (flags & MOVE_WRITE) {
+	if (flags & MOVE_WRITE || priv->create & KGEM_CAN_CREATE_LARGE) {
 		DBG(("%s: marking as damaged\n", __FUNCTION__));
 		sna_damage_all(&priv->cpu_damage,
 			       pixmap->drawable.width,
@@ -1179,7 +1170,9 @@ sna_drawable_move_region_to_cpu(DrawablePtr drawable,
 	if (priv->clear)
 		return _sna_pixmap_move_to_cpu(pixmap, flags);
 
-	if (priv->gpu_bo == NULL && !priv->gpu && flags & MOVE_WRITE)
+	if (priv->gpu_bo == NULL &&
+	    (priv->create & KGEM_CAN_CREATE_GPU) == 0 &&
+	    flags & MOVE_WRITE)
 		return _sna_pixmap_move_to_cpu(pixmap, flags);
 
 	get_drawable_deltas(drawable, pixmap, &dx, &dy);
@@ -1692,7 +1685,7 @@ sna_pixmap_move_area_to_gpu(PixmapPtr pixmap, BoxPtr box, unsigned int flags)
 	}
 
 done:
-	if (!priv->pinned && priv->gpu)
+	if (!priv->pinned && (priv->create & KGEM_CAN_CREATE_LARGE) == 0)
 		list_move(&priv->inactive, &sna->active_pixmaps);
 	priv->clear = false;
 	return true;
@@ -1733,7 +1726,7 @@ sna_drawable_use_bo(DrawablePtr drawable,
 		goto use_cpu_bo;
 
 	if (priv->gpu_bo == NULL) {
-		if (!priv->gpu) {
+		if ((priv->create & KGEM_CAN_CREATE_GPU) == 0) {
 			DBG(("%s: untiled, will not force allocation\n",
 			     __FUNCTION__));
 			goto use_cpu_bo;
@@ -1832,7 +1825,7 @@ done:
 
 use_gpu_bo:
 	priv->clear = false;
-	if (!priv->pinned && priv->gpu)
+	if (!priv->pinned && (priv->create & KGEM_CAN_CREATE_LARGE) == 0)
 		list_move(&priv->inactive,
 			  &to_sna_from_pixmap(pixmap)->active_pixmaps);
 	*damage = NULL;
@@ -1883,10 +1876,6 @@ sna_pixmap_create_upload(ScreenPtr screen,
 	DBG(("%s(%d, %d, %d)\n", __FUNCTION__, width, height, depth));
 	assert(width);
 	assert(height);
-	if (!sna->have_render ||
-	    !kgem_can_create_gpu(&sna->kgem, width, height, bpp))
-		return create_pixmap(sna, screen, width, height, depth,
-				     CREATE_PIXMAP_USAGE_SCRATCH);
 
 	if (sna->freed_pixmap) {
 		pixmap = sna->freed_pixmap;
@@ -2000,7 +1989,7 @@ sna_pixmap_force_to_gpu(PixmapPtr pixmap, unsigned flags)
 		return NULL;
 
 	/* For large bo, try to keep only a single copy around */
-	if (!priv->gpu && priv->ptr) {
+	if (priv->create & KGEM_CAN_CREATE_LARGE && priv->ptr) {
 		sna_damage_all(&priv->gpu_damage,
 			       pixmap->drawable.width,
 			       pixmap->drawable.height);
@@ -2043,7 +2032,7 @@ sna_pixmap_move_to_gpu(PixmapPtr pixmap, unsigned flags)
 	sna_damage_reduce(&priv->cpu_damage);
 	DBG(("%s: CPU damage? %d\n", __FUNCTION__, priv->cpu_damage != NULL));
 	if (priv->gpu_bo == NULL) {
-		if (!wedged(sna) && priv->gpu)
+		if (!wedged(sna) && priv->create & KGEM_CAN_CREATE_GPU)
 			priv->gpu_bo =
 				kgem_create_2d(&sna->kgem,
 					       pixmap->drawable.width,
@@ -2128,10 +2117,13 @@ done:
 	sna_damage_reduce_all(&priv->gpu_damage,
 			      pixmap->drawable.width,
 			      pixmap->drawable.height);
-	if (DAMAGE_IS_ALL(priv->gpu_damage))
+	if (DAMAGE_IS_ALL(priv->gpu_damage)) {
 		priv->undamaged = false;
+		if (priv->ptr)
+			sna_pixmap_free_cpu(sna, priv);
+	}
 active:
-	if (!priv->pinned && priv->gpu)
+	if (!priv->pinned && (priv->create & KGEM_CAN_CREATE_LARGE) == 0)
 		list_move(&priv->inactive, &sna->active_pixmaps);
 	priv->clear = false;
 	return priv;
@@ -2984,7 +2976,7 @@ move_to_gpu(PixmapPtr pixmap, struct sna_pixmap *priv,
 	if (priv->gpu_bo)
 		return TRUE;
 
-	if (!priv->gpu)
+	if ((priv->create & KGEM_CAN_CREATE_GPU) == 0)
 		return FALSE;
 
 	if (priv->cpu_bo) {
@@ -3241,7 +3233,8 @@ sna_copy_boxes(DrawablePtr src, DrawablePtr dst, GCPtr gc,
 			}
 		} else {
 			dst_priv->clear = false;
-			if (!dst_priv->pinned && dst_priv->gpu)
+			if (!dst_priv->pinned &&
+			    (dst_priv->create & KGEM_CAN_CREATE_LARGE) == 0)
 				list_move(&dst_priv->inactive,
 					  &sna->active_pixmaps);
 		}
@@ -11557,7 +11550,7 @@ static void sna_accel_inactive(struct sna *sna)
 		priv = list_first_entry(&sna->inactive_clock[1],
 					struct sna_pixmap,
 					inactive);
-		assert(priv->gpu);
+		assert((priv->create & KGEM_CAN_CREATE_LARGE) == 0);
 		assert(priv->gpu_bo);
 
 		/* XXX Rather than discarding the GPU buffer here, we
diff --git a/src/sna/sna_render.c b/src/sna/sna_render.c
index 43e8642..fc6e6df 100644
--- a/src/sna/sna_render.c
+++ b/src/sna/sna_render.c
@@ -332,7 +332,7 @@ use_cpu_bo(struct sna *sna, PixmapPtr pixmap, const BoxRec *box)
 		int w = box->x2 - box->x1;
 		int h = box->y2 - box->y1;
 
-		if (!priv->gpu)
+		if ((priv->create & KGEM_CAN_CREATE_GPU) == 0)
 			goto done;
 
 		if (priv->source_count*w*h >= pixmap->drawable.width * pixmap->drawable.height &&
@@ -380,7 +380,7 @@ move_to_gpu(PixmapPtr pixmap, const BoxRec *box)
 			return false;
 
 		upload = true;
-		if (!priv->gpu ||
+		if ((priv->create & KGEM_CAN_CREATE_GPU) == 0 ||
 		    kgem_choose_tiling(&to_sna_from_pixmap(pixmap)->kgem,
 				       I915_TILING_X,
 				       pixmap->drawable.width,
@@ -405,7 +405,7 @@ move_to_gpu(PixmapPtr pixmap, const BoxRec *box)
 		return FALSE;
 
 	count = priv->source_count++;
-	if (!priv->gpu ||
+	if ((priv->create & KGEM_CAN_CREATE_GPU) == 0 ||
 	    kgem_choose_tiling(&to_sna_from_pixmap(pixmap)->kgem,
 			       I915_TILING_X,
 			       pixmap->drawable.width,
diff --git a/src/sna/sna_render_inline.h b/src/sna/sna_render_inline.h
index 489f215..2805a01 100644
--- a/src/sna/sna_render_inline.h
+++ b/src/sna/sna_render_inline.h
@@ -102,7 +102,7 @@ too_small(DrawablePtr drawable)
 	if (priv->cpu_bo && kgem_bo_is_busy(priv->cpu_bo))
 		return false;
 
-	return !priv->gpu;
+	return (priv->create & KGEM_CAN_CREATE_GPU) == 0;
 }
 
 static inline Bool
commit daa068a1dda3e7a86a8d24283bf47b477904f93c
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon Jan 30 15:11:34 2012 +0000

    sna: Update the partial buffer allocation size when reusing an old mapping
    
    Whilst the old mapping is guaranteed to be larger than the requested
    allocation size, keeping track of the actual size allows for better
    packing of future buffers. The code also performs a sanity check that
    the buffer is the size we claim it to be...
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index d97a6ac..5ab5c83 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -3365,6 +3365,7 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 		if (old) {
 			DBG(("%s: reusing cpu map handle=%d for buffer\n",
 			     __FUNCTION__, old->handle));
+			alloc = num_pages(old);
 
 			memcpy(&bo->base, old, sizeof(*old));
 			if (old->rq)
commit b92bf9d8aac18d8cf62fd9c9dda84f73c49f3508
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon Jan 30 11:48:59 2012 +0000

    sna: Allow the creation of render targets larger than the maximum bo cache
    
    Given that we now handle uploads to and from bo that are larger than the
    aperture and that usage of such large bo is rare and so unlikely to
    benefit from caching, allow them to be created as render targets and
    destroyed as soon as they become inactive.
    
    In principle, this finally enables GPU acceleration of ocitysmap on gen4+,
    but due to the large cost of creating and destroying large bo it is
    disabled on systems that require clflushing. It is, however, a
    pre-requisite for exploiting the enhanced capabilities of IvyBridge.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 30c46fb..d97a6ac 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -661,7 +661,9 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, int gen)
 
 	kgem->max_object_size = kgem->aperture_total / 2;
 	kgem->max_cpu_size = kgem->aperture_total / 2;
-	kgem->max_gpu_size = MAX_CACHE_SIZE;
+	kgem->max_gpu_size = kgem->aperture_total / 2;
+	if (!kgem->has_llc)
+		kgem->max_gpu_size = MAX_CACHE_SIZE;
 	if (gen < 40) {
 		/* If we have to use fences for blitting, we have to make
 		 * sure we can fit them into the aperture.
@@ -672,8 +674,9 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, int gen)
 	}
 	if (kgem->max_gpu_size > kgem->max_cpu_size)
 		kgem->max_gpu_size = kgem->max_cpu_size;
+
 	kgem->max_tile_size = kgem->aperture_total / 4;
-	if (kgem->max_tile_size < kgem->max_gpu_size / 2)
+	if (kgem->max_tile_size > kgem->max_gpu_size / 2)
 		kgem->max_tile_size = kgem->max_gpu_size / 2;
 
 	DBG(("%s: max object size (gpu=%d, cpu=%d, tile=%d)\n",
commit dd03f5fedd00fedf1d65606a5496f14e60b63c0b
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon Jan 30 11:41:07 2012 +0000

    sna: Decrease tiling step size in case we need to enlarge the box later
    
    We can juggle rendering into large bo on gen4 by redirecting the
    rendering through a proxy that is tile aligned, and so the render target
    may be slightly larger than the tiling step size. As that is then larger
    than the maximum 3D pipeline size, the trick fails and we need to resort
    to a temporary render target with copies in and out. In this case, check
    that the tile is aligned to the most pessimistic tiling width and reduce
    the step size to accommodate the enlargement.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
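
    Putting the two clamps in the hunk below together, with a worked
    value: at 32bpp the most pessimistic tile width is 8*512/32 = 128
    pixels, so a dst_x not aligned to 128 halves the step up front to
    leave room for the enlargement. A sketch, with names from the hunk:

    static int tiling_step(struct sna *sna, int dst_x, int bpp)
    {
        int step = sna->render.max_3d_size;
        if (dst_x & (8*512 / bpp - 1))    /* e.g. not 128px-aligned */
            step /= 2;
        while (step * step * 4 > sna->kgem.max_tile_size)
            step /= 2;
        return step;
    }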

diff --git a/src/sna/sna_render.c b/src/sna/sna_render.c
index 9a98990..43e8642 100644
--- a/src/sna/sna_render.c
+++ b/src/sna/sna_render.c
@@ -1565,6 +1565,9 @@ sna_render_composite_redirect(struct sna *sna,
 		BoxRec box;
 		int w, h;
 
+		DBG(("%s: dst pitch (%d) fits within render pipeline (%d)\n",
+		     __FUNCTION__, op->dst.bo->pitch, sna->render.max_3d_pitch));
+
 		kgem_get_tile_size(&sna->kgem, op->dst.bo->tiling,
 				   &tile_width, &tile_height, &tile_size);
 
@@ -1615,10 +1618,11 @@ sna_render_composite_redirect(struct sna *sna,
 				return FALSE;
 			}
 
+			assert(op->dst.bo != t->real_bo);
 			op->dst.bo->pitch = t->real_bo->pitch;
 
-			op->dst.x += -box.x1;
-			op->dst.y += -box.y1;
+			op->dst.x -= box.x1;
+			op->dst.y -= box.y1;
 			op->dst.width  = w;
 			op->dst.height = h;
 			return TRUE;
@@ -1675,6 +1679,8 @@ sna_render_composite_redirect_done(struct sna *sna,
 	const struct sna_composite_redirect *t = &op->redirect;
 
 	if (t->real_bo) {
+		assert(op->dst.bo != t->real_bo);
+
 		if (t->box.x2 > t->box.x1) {
 			DBG(("%s: copying temporary to dst\n", __FUNCTION__));
 			sna_blt_copy_boxes(sna, GXcopy,
diff --git a/src/sna/sna_tiling.c b/src/sna/sna_tiling.c
index 0bc4539..a3bf19d 100644
--- a/src/sna/sna_tiling.c
+++ b/src/sna/sna_tiling.c
@@ -140,12 +140,15 @@ sna_tiling_composite_done(struct sna *sna,
 	struct sna_composite_op tmp;
 	int x, y, n, step;
 
+	/* Use a small step to accommodate enlargement through tile alignment */
 	step = sna->render.max_3d_size;
+	if (tile->dst_x & (8*512 / tile->dst->pDrawable->bitsPerPixel - 1))
+		step /= 2;
 	while (step * step * 4 > sna->kgem.max_tile_size)
 		step /= 2;
 
-	DBG(("%s -- %dx%d, count=%d\n", __FUNCTION__,
-	     tile->width, tile->height, tile->rect_count));
+	DBG(("%s -- %dx%d, count=%d, step size=%d\n", __FUNCTION__,
+	     tile->width, tile->height, tile->rect_count, step));
 
 	if (tile->rect_count == 0)
 		goto done;
commit d93381fc08f961ee4822bc02c24a0b5fc97fe95d
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon Jan 30 11:40:02 2012 +0000

    sna: Allow creation of proxies to proxies
    
    Just update the offset of the new bo by the offset of the existing
    proxy.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
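
    The invariant after this change: a proxy always references the real
    bo directly, with its delta accumulated at creation time. The patch
    handles the single level that can occur; a sketch of the general
    flattening (helper name illustrative):

    static void attach_to_real_bo(struct kgem_bo *bo,
                                  struct kgem_bo *target, int offset)
    {
        /* Walk down any proxy chain, accumulating byte offsets,
         * so the new proxy points at the backing bo itself. */
        while (target->proxy) {
            offset += target->delta;
            target = target->proxy;
        }
        bo->proxy = kgem_bo_reference(target);
        bo->delta = offset;
    }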

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 64f729b..30c46fb 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -3105,20 +3105,25 @@ struct kgem_bo *kgem_create_proxy(struct kgem_bo *target,
 
 	DBG(("%s: target handle=%d, offset=%d, length=%d, io=%d\n",
 	     __FUNCTION__, target->handle, offset, length, target->io));
-	assert(target->proxy == NULL);
 
 	bo = __kgem_bo_alloc(target->handle, length);
 	if (bo == NULL)
 		return NULL;
 
+	bo->reusable = false;
 	bo->size.bytes = length;
+
 	bo->io = target->io;
 	bo->dirty = target->dirty;
 	bo->tiling = target->tiling;
 	bo->pitch = target->pitch;
+
+	if (target->proxy) {
+		offset += target->delta;
+		target = target->proxy;
+	}
 	bo->proxy = kgem_bo_reference(target);
 	bo->delta = offset;
-	bo->reusable = false;
 	return bo;
 }
 
commit 684f9470dbb977a61e855e980bac4e7db8538181
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon Jan 30 11:38:36 2012 +0000

    sna: Base prefer-gpu hint on default tiling choice
    
    As tiling increases the maximum usable pitch on gen4+, we can
    accommodate wider pixmaps on the GPU.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 7019638..64f729b 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -2211,6 +2211,10 @@ bool kgem_can_create_cpu(struct kgem *kgem,
 	size = kgem_surface_size(kgem, false, false,
 				 width, height, bpp,
 				 I915_TILING_NONE, &pitch);
+	DBG(("%s? %d, cpu size %d, max %d\n",
+	     __FUNCTION__,
+	     size > 0 && size <= kgem->max_cpu_size,
+	     size, kgem->max_cpu_size));
 	return size > 0 && size <= kgem->max_cpu_size;
 }
 
@@ -2223,8 +2227,15 @@ static bool _kgem_can_create_gpu(struct kgem *kgem,
 		return false;
 
 	size = kgem_surface_size(kgem, false, false,
-				 width, height, bpp, I915_TILING_NONE,
+				 width, height, bpp,
+				 kgem_choose_tiling(kgem,
+						    I915_TILING_X,
+						    width, height, bpp),
 				 &pitch);
+	DBG(("%s? %d, gpu size %d, max %d\n",
+	     __FUNCTION__,
+	     size > 0 && size < kgem->max_gpu_size,
+	     size, kgem->max_gpu_size));
 	return size > 0 && size < kgem->max_gpu_size;
 }
 
commit 99d0a53a14d0b28eb209e6220d41500131f79d43
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sun Jan 29 19:36:55 2012 +0000

    sna: Detect batch overflow and fall back rather than risk an ENOSPC
    
    Having noticed that eog was failing to perform an 8k x 8k copy with
    compiz running on a 965gm, it was time the checks for batch overflow
    were implemented.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
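
    The pattern applied throughout the hunks below, in general form: if
    the bos do not fit in the current batch, flush and retry once; a
    second failure means the operation can never fit in a single batch,
    so take the software fallback rather than risk ENOSPC at execbuffer
    time (helper name illustrative):

    static bool fit_or_submit(struct sna *sna,
                              struct kgem_bo *dst, struct kgem_bo *src)
    {
        if (kgem_check_bo(&sna->kgem, dst, src, NULL))
            return true;
        kgem_submit(&sna->kgem);
        return kgem_check_bo(&sna->kgem, dst, src, NULL);
    }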

diff --git a/src/sna/gen2_render.c b/src/sna/gen2_render.c
index 398988a..7250d66 100644
--- a/src/sna/gen2_render.c
+++ b/src/sna/gen2_render.c
@@ -1824,12 +1824,20 @@ gen2_render_composite(struct sna *sna,
 
 	if (!kgem_check_bo(&sna->kgem,
 			   tmp->dst.bo, tmp->src.bo, tmp->mask.bo,
-			   NULL))
+			   NULL)) {
 		kgem_submit(&sna->kgem);
+		if (!kgem_check_bo(&sna->kgem,
+				   tmp->dst.bo, tmp->src.bo, tmp->mask.bo,
+				   NULL))
+			goto cleanup_mask;
+	}
 
 	gen2_emit_composite_state(sna, tmp);
 	return TRUE;
 
+cleanup_mask:
+	if (tmp->mask.bo)
+		kgem_bo_destroy(&sna->kgem, tmp->mask.bo);
 cleanup_src:
 	if (tmp->src.bo)
 		kgem_bo_destroy(&sna->kgem, tmp->src.bo);
@@ -2235,12 +2243,20 @@ gen2_render_composite_spans(struct sna *sna,
 
 	if (!kgem_check_bo(&sna->kgem,
 			   tmp->base.dst.bo, tmp->base.src.bo,
-			   NULL))
+			   NULL)) {
 		kgem_submit(&sna->kgem);
+		if (!kgem_check_bo(&sna->kgem,
+				   tmp->base.dst.bo, tmp->base.src.bo,
+				   NULL))
+			goto cleanup_src;
+	}
 
 	gen2_emit_composite_spans_state(sna, tmp);
 	return TRUE;
 
+cleanup_src:
+	if (tmp->base.src.bo)
+		kgem_bo_destroy(&sna->kgem, tmp->base.src.bo);
 cleanup_dst:
 	if (tmp->base.redirect.real_bo)
 		kgem_bo_destroy(&sna->kgem, tmp->base.dst.bo);
@@ -2435,8 +2451,10 @@ gen2_render_fill_boxes(struct sna *sna,
 	tmp.floats_per_vertex = 2;
 	tmp.floats_per_rect = 6;
 
-	if (!kgem_check_bo(&sna->kgem, dst_bo, NULL))
+	if (!kgem_check_bo(&sna->kgem, dst_bo, NULL)) {
 		kgem_submit(&sna->kgem);
+		assert(kgem_check_bo(&sna->kgem, dst_bo, NULL));
+	}
 
 	gen2_emit_fill_composite_state(sna, &tmp, pixel);
 
@@ -2675,6 +2693,7 @@ gen2_render_fill_one(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo,
 		if (gen2_render_fill_one_try_blt(sna, dst, bo, color,
 						 x1, y1, x2, y2, alu))
 			return TRUE;
+		assert(kgem_check_bo(&sna->kgem, bo, NULL));
 	}
 
 	tmp.op = alu;
@@ -2835,14 +2854,19 @@ gen2_render_copy_boxes(struct sna *sna, uint8_t alu,
 	    too_large(src->drawable.width, src->drawable.height) ||
 	    src_bo->pitch > MAX_3D_PITCH ||
 	    too_large(dst->drawable.width, dst->drawable.height) ||
-	    dst_bo->pitch < 8 || dst_bo->pitch > MAX_3D_PITCH)
+	    dst_bo->pitch < 8 || dst_bo->pitch > MAX_3D_PITCH) {
+fallback:
 		return sna_blt_copy_boxes_fallback(sna, alu,
 						   src, src_bo, src_dx, src_dy,
 						   dst, dst_bo, dst_dx, dst_dy,
 						   box, n);
+	}
 
-	if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL))
+	if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL)) {
 		kgem_submit(&sna->kgem);
+		if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL))
+			goto fallback;
+	}
 
 	memset(&tmp, 0, sizeof(tmp));
 	tmp.op = alu;
@@ -2960,6 +2984,7 @@ gen2_render_copy(struct sna *sna, uint8_t alu,
 	    too_large(dst->drawable.width, dst->drawable.height) ||
 	    src_bo->pitch > MAX_3D_PITCH ||
 	    dst_bo->pitch < 8 || dst_bo->pitch > MAX_3D_PITCH) {
+fallback:
 		if (!sna_blt_compare_depth(&src->drawable, &dst->drawable))
 			return FALSE;
 
@@ -2982,8 +3007,11 @@ gen2_render_copy(struct sna *sna, uint8_t alu,
 	tmp->base.floats_per_vertex = 4;
 	tmp->base.floats_per_rect = 12;
 
-	if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL))
+	if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL)) {
 		kgem_submit(&sna->kgem);
+		if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL))
+			goto fallback;
+	}
 
 	tmp->blt  = gen2_render_copy_blt;
 	tmp->done = gen2_render_copy_done;
diff --git a/src/sna/gen3_render.c b/src/sna/gen3_render.c
index da90d82..784d399 100644
--- a/src/sna/gen3_render.c
+++ b/src/sna/gen3_render.c
@@ -2836,8 +2836,13 @@ gen3_render_composite(struct sna *sna,
 
 	if (!kgem_check_bo(&sna->kgem,
 			   tmp->dst.bo, tmp->src.bo, tmp->mask.bo,
-			   NULL))
+			   NULL)) {
 		kgem_submit(&sna->kgem);
+		if (!kgem_check_bo(&sna->kgem,
+				   tmp->dst.bo, tmp->src.bo, tmp->mask.bo,
+				   NULL))
+			goto cleanup_mask;
+	}
 
 	gen3_emit_composite_state(sna, tmp);
 	gen3_align_vertex(sna, tmp);
@@ -3267,13 +3272,21 @@ gen3_render_composite_spans(struct sna *sna,
 
 	if (!kgem_check_bo(&sna->kgem,
 			   tmp->base.dst.bo, tmp->base.src.bo,
-			   NULL))
+			   NULL)) {
 		kgem_submit(&sna->kgem);
+		if (!kgem_check_bo(&sna->kgem,
+				   tmp->base.dst.bo, tmp->base.src.bo,
+				   NULL))
+			goto cleanup_src;
+	}
 
 	gen3_emit_composite_state(sna, &tmp->base);
 	gen3_align_vertex(sna, &tmp->base);
 	return TRUE;
 
+cleanup_src:
+	if (tmp->base.src.bo)
+		kgem_bo_destroy(&sna->kgem, tmp->base.src.bo);
 cleanup_dst:
 	if (tmp->base.redirect.real_bo)
 		kgem_bo_destroy(&sna->kgem, tmp->base.dst.bo);
@@ -3830,14 +3843,19 @@ gen3_render_copy_boxes(struct sna *sna, uint8_t alu,
 	    src_bo->pitch > MAX_3D_PITCH ||
 	    too_large(src->drawable.width, src->drawable.height) ||
 	    dst_bo->pitch > MAX_3D_PITCH ||
-	    too_large(dst->drawable.width, dst->drawable.height))
+	    too_large(dst->drawable.width, dst->drawable.height)) {
+fallback:
 		return sna_blt_copy_boxes_fallback(sna, alu,
 						   src, src_bo, src_dx, src_dy,
 						   dst, dst_bo, dst_dx, dst_dy,
 						   box, n);
+	}
 
-	if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL))
+	if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL)) {
 		kgem_submit(&sna->kgem);
+		if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL))
+			goto fallback;
+	}
 
 	memset(&tmp, 0, sizeof(tmp));
 	tmp.op = alu == GXcopy ? PictOpSrc : PictOpClear;
@@ -3961,6 +3979,7 @@ gen3_render_copy(struct sna *sna, uint8_t alu,
 	    too_large(src->drawable.width, src->drawable.height) ||
 	    too_large(dst->drawable.width, dst->drawable.height) ||
 	    src_bo->pitch > MAX_3D_PITCH || dst_bo->pitch > MAX_3D_PITCH) {
+fallback:
 		if (!sna_blt_compare_depth(&src->drawable, &dst->drawable))
 			return FALSE;
 
@@ -3984,8 +4003,11 @@ gen3_render_copy(struct sna *sna, uint8_t alu,
 	tmp->base.mask.bo = NULL;
 	tmp->base.mask.u.gen3.type = SHADER_NONE;
 
-	if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL))
+	if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL)) {
 		kgem_submit(&sna->kgem);
+		if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL))
+			goto fallback;
+	}
 
 	tmp->blt  = gen3_render_copy_blt;
 	tmp->done = gen3_render_copy_done;
@@ -4139,8 +4161,10 @@ gen3_render_fill_boxes(struct sna *sna,
 	tmp.mask.u.gen3.type = SHADER_NONE;
 	tmp.u.gen3.num_constants = 0;
 
-	if (!kgem_check_bo(&sna->kgem, dst_bo, NULL))
+	if (!kgem_check_bo(&sna->kgem, dst_bo, NULL)) {
 		kgem_submit(&sna->kgem);
+		assert(kgem_check_bo(&sna->kgem, dst_bo, NULL));
+	}
 
 	gen3_emit_composite_state(sna, &tmp);
 	gen3_align_vertex(sna, &tmp);
@@ -4293,8 +4317,10 @@ gen3_render_fill(struct sna *sna, uint8_t alu,
 	tmp->base.mask.u.gen3.type = SHADER_NONE;
 	tmp->base.u.gen3.num_constants = 0;
 
-	if (!kgem_check_bo(&sna->kgem, dst_bo, NULL))
+	if (!kgem_check_bo(&sna->kgem, dst_bo, NULL)) {
 		kgem_submit(&sna->kgem);
+		assert(kgem_check_bo(&sna->kgem, dst_bo, NULL));
+	}
 
 	tmp->blt   = gen3_render_fill_op_blt;
 	tmp->box   = gen3_render_fill_op_box;
diff --git a/src/sna/gen4_render.c b/src/sna/gen4_render.c
index d9542ea..ffdcbb7 100644
--- a/src/sna/gen4_render.c
+++ b/src/sna/gen4_render.c
@@ -556,7 +556,7 @@ static Bool gen4_check_dst_format(PictFormat format)
 	case PICT_x4r4g4b4:
 		return TRUE;
 	default:
-		DBG(("%s: unhandled format: %x\n", __FUNCTION__, format));
+		DBG(("%s: unhandled format: %x\n", __FUNCTION__, (int)format));
 		return FALSE;
 	}
 }
@@ -1726,8 +1726,10 @@ gen4_render_video(struct sna *sna,
 	tmp.floats_per_vertex = 3;
 	tmp.u.gen4.ve_id = 1;
 
-	if (!kgem_check_bo(&sna->kgem, tmp.dst.bo, frame->bo, NULL))
+	if (!kgem_check_bo(&sna->kgem, tmp.dst.bo, frame->bo, NULL)) {
 		kgem_submit(&sna->kgem);
+		assert(kgem_check_bo(&sna->kgem, tmp.dst.bo, frame->bo, NULL));
+	}
 
 	gen4_video_bind_surfaces(sna, &tmp, frame);
 	gen4_align_vertex(sna, &tmp);
@@ -2319,13 +2321,21 @@ gen4_render_composite(struct sna *sna,
 
 	if (!kgem_check_bo(&sna->kgem,
 			   tmp->dst.bo, tmp->src.bo, tmp->mask.bo,
-			   NULL))
+			   NULL)) {
 		kgem_submit(&sna->kgem);
+		if (!kgem_check_bo(&sna->kgem,
+				     tmp->dst.bo, tmp->src.bo, tmp->mask.bo,
+				     NULL))
+			goto cleanup_mask;
+	}
 
 	gen4_bind_surfaces(sna, tmp);
 	gen4_align_vertex(sna, tmp);
 	return TRUE;
 
+cleanup_mask:
+	if (tmp->mask.bo)
+		kgem_bo_destroy(&sna->kgem, tmp->mask.bo);
 cleanup_src:
 	if (tmp->src.bo)
 		kgem_bo_destroy(&sna->kgem, tmp->src.bo);
@@ -2400,6 +2410,8 @@ gen4_render_copy_boxes(struct sna *sna, uint8_t alu,
 {
 	struct sna_composite_op tmp;
 
+	DBG(("%s x %d\n", __FUNCTION__, n));
+
 #if NO_COPY_BOXES
 	if (!sna_blt_compare_depth(&src->drawable, &dst->drawable))
 		return FALSE;
@@ -2472,8 +2484,11 @@ fallback:
 	tmp.u.gen4.wm_kernel = WM_KERNEL;
 	tmp.u.gen4.ve_id = 1;
 
-	if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL))
+	if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL)) {
 		kgem_submit(&sna->kgem);
+		if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL))
+			goto fallback;
+	}
 
 	gen4_copy_bind_surfaces(sna, &tmp);
 	gen4_align_vertex(sna, &tmp);
@@ -2512,6 +2527,12 @@ gen4_render_copy(struct sna *sna, uint8_t alu,
 		 PixmapPtr dst, struct kgem_bo *dst_bo,
 		 struct sna_copy_op *op)
 {
+	DBG(("%s: src=%ld, dst=%ld, alu=%d\n",
+	     __FUNCTION__,
+	     src->drawable.serialNumber,
+	     dst->drawable.serialNumber,
+	     alu));
+
 #if NO_COPY
 	if (!sna_blt_compare_depth(&src->drawable, &dst->drawable))
 		return FALSE;
@@ -2575,8 +2596,11 @@ fallback:
 	op->base.u.gen4.wm_kernel = WM_KERNEL;
 	op->base.u.gen4.ve_id = 1;
 
-	if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL))
+	if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL)) {
 		kgem_submit(&sna->kgem);
+		if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL))
+			goto fallback;
+	}
 
 	gen4_copy_bind_surfaces(sna, &op->base);
 	gen4_align_vertex(sna, &op->base);
@@ -2731,8 +2755,10 @@ gen4_render_fill_boxes(struct sna *sna,
 	tmp.u.gen4.wm_kernel = WM_KERNEL;
 	tmp.u.gen4.ve_id = 1;
 
-	if (!kgem_check_bo(&sna->kgem, dst_bo, NULL))
+	if (!kgem_check_bo(&sna->kgem, dst_bo, NULL)) {
 		kgem_submit(&sna->kgem);
+		assert(kgem_check_bo(&sna->kgem, dst_bo, NULL));
+	}
 
 	gen4_fill_bind_surfaces(sna, &tmp);
 	gen4_align_vertex(sna, &tmp);
@@ -2844,8 +2870,10 @@ gen4_render_fill(struct sna *sna, uint8_t alu,
 	op->base.u.gen4.wm_kernel = WM_KERNEL;
 	op->base.u.gen4.ve_id = 1;
 
-	if (!kgem_check_bo(&sna->kgem, dst_bo, NULL))
+	if (!kgem_check_bo(&sna->kgem, dst_bo, NULL))  {
 		kgem_submit(&sna->kgem);
+		assert(kgem_check_bo(&sna->kgem, dst_bo, NULL));
+	}
 
 	gen4_fill_bind_surfaces(sna, &op->base);
 	gen4_align_vertex(sna, &op->base);
@@ -2884,6 +2912,8 @@ gen4_render_fill_one(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo,
 {
 	struct sna_composite_op tmp;
 
+	DBG(("%s: color=%08x\n", __FUNCTION__, color));
+
 #if NO_FILL_ONE
 	return gen4_render_fill_one_try_blt(sna, dst, bo, color,
 					    x1, y1, x2, y2, alu);
@@ -2929,8 +2959,10 @@ gen4_render_fill_one(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo,
 	tmp.u.gen4.wm_kernel = WM_KERNEL;
 	tmp.u.gen4.ve_id = 1;
 
-	if (!kgem_check_bo(&sna->kgem, bo, NULL))
+	if (!kgem_check_bo(&sna->kgem, bo, NULL)) {
 		_kgem_submit(&sna->kgem);
+		assert(kgem_check_bo(&sna->kgem, bo, NULL));
+	}
 
 	gen4_fill_bind_surfaces(sna, &tmp);
 	gen4_align_vertex(sna, &tmp);
diff --git a/src/sna/gen5_render.c b/src/sna/gen5_render.c
index 3465121..dc1e720 100644
--- a/src/sna/gen5_render.c
+++ b/src/sna/gen5_render.c
@@ -557,7 +557,7 @@ static Bool gen5_check_dst_format(PictFormat format)
 	case PICT_x4r4g4b4:
 		return TRUE;
 	default:
-		DBG(("%s: unhandled format: %x\n", __FUNCTION__, format));
+		DBG(("%s: unhandled format: %x\n", __FUNCTION__, (int)format));
 		return FALSE;
 	}
 }
@@ -1759,8 +1759,10 @@ gen5_render_video(struct sna *sna,
 	tmp.floats_per_vertex = 3;
 	tmp.floats_per_rect = 9;
 
-	if (!kgem_check_bo(&sna->kgem, tmp.dst.bo, frame->bo, NULL))
+	if (!kgem_check_bo(&sna->kgem, tmp.dst.bo, frame->bo, NULL)) {
 		kgem_submit(&sna->kgem);
+		assert(kgem_check_bo(&sna->kgem, tmp.dst.bo, frame->bo, NULL));
+	}
 
 	gen5_video_bind_surfaces(sna, &tmp, frame);
 	gen5_align_vertex(sna, &tmp);
@@ -2352,8 +2354,12 @@ gen5_render_composite(struct sna *sna,
 	tmp->done  = gen5_render_composite_done;
 
 	if (!kgem_check_bo(&sna->kgem,
-			   tmp->dst.bo, tmp->src.bo, tmp->mask.bo, NULL))
+			   tmp->dst.bo, tmp->src.bo, tmp->mask.bo, NULL)) {
 		kgem_submit(&sna->kgem);
+		if (!kgem_check_bo(&sna->kgem,
+				   tmp->dst.bo, tmp->src.bo, tmp->mask.bo, NULL))
+			goto cleanup_mask;
+	}
 
 	if (kgem_bo_is_dirty(tmp->src.bo) || kgem_bo_is_dirty(tmp->mask.bo)) {
 		if (mask == NULL &&
@@ -2372,6 +2378,9 @@ gen5_render_composite(struct sna *sna,
 	gen5_align_vertex(sna, tmp);
 	return TRUE;
 
+cleanup_mask:
+	if (tmp->mask.bo)
+		kgem_bo_destroy(&sna->kgem, tmp->mask.bo);
 cleanup_src:
 	if (tmp->src.bo)
 		kgem_bo_destroy(&sna->kgem, tmp->src.bo);
@@ -2671,13 +2680,21 @@ gen5_render_composite_spans(struct sna *sna,
 
 	if (!kgem_check_bo(&sna->kgem,
 			   tmp->base.dst.bo, tmp->base.src.bo,
-			   NULL))
+			   NULL))  {
 		kgem_submit(&sna->kgem);
+		if (!kgem_check_bo(&sna->kgem,
+				   tmp->base.dst.bo, tmp->base.src.bo,
+				   NULL))
+			goto cleanup_src;
+	}
 
 	gen5_bind_surfaces(sna, &tmp->base);
 	gen5_align_vertex(sna, &tmp->base);
 	return TRUE;
 
+cleanup_src:
+	if (tmp->base.src.bo)
+		kgem_bo_destroy(&sna->kgem, tmp->base.src.bo);
 cleanup_dst:
 	if (tmp->base.redirect.real_bo)
 		kgem_bo_destroy(&sna->kgem, tmp->base.dst.bo);
@@ -2796,8 +2813,11 @@ fallback:
 	tmp.u.gen5.wm_kernel = WM_KERNEL;
 	tmp.u.gen5.ve_id = 1;
 
-	if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL))
+	if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL)) {
 		kgem_submit(&sna->kgem);
+		if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL))
+			goto fallback;
+	}
 
 	if (kgem_bo_is_dirty(src_bo)) {
 		if (sna_blt_compare_depth(&src->drawable, &dst->drawable) &&
@@ -2946,8 +2966,11 @@ fallback:
 	op->base.u.gen5.wm_kernel = WM_KERNEL;
 	op->base.u.gen5.ve_id = 1;
 
-	if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL))
+	if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL))  {
 		kgem_submit(&sna->kgem);
+		if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL))
+			goto fallback;
+	}
 
 	if (kgem_bo_is_dirty(src_bo)) {
 		if (sna_blt_compare_depth(&src->drawable, &dst->drawable) &&
@@ -3093,8 +3116,10 @@ gen5_render_fill_boxes(struct sna *sna,
 	tmp.u.gen5.wm_kernel = WM_KERNEL;
 	tmp.u.gen5.ve_id = 1;
 
-	if (!kgem_check_bo(&sna->kgem, dst_bo, NULL))
+	if (!kgem_check_bo(&sna->kgem, dst_bo, NULL)) {
 		kgem_submit(&sna->kgem);
+		assert(kgem_check_bo(&sna->kgem, dst_bo, NULL));
+	}
 
 	gen5_fill_bind_surfaces(sna, &tmp);
 	gen5_align_vertex(sna, &tmp);
@@ -3280,8 +3305,10 @@ gen5_render_fill(struct sna *sna, uint8_t alu,
 	op->base.u.gen5.wm_kernel = WM_KERNEL;
 	op->base.u.gen5.ve_id = 1;
 
-	if (!kgem_check_bo(&sna->kgem, dst_bo, NULL))
+	if (!kgem_check_bo(&sna->kgem, dst_bo, NULL)) {
 		kgem_submit(&sna->kgem);
+		assert(kgem_check_bo(&sna->kgem, dst_bo, NULL));
+	}
 
 	gen5_fill_bind_surfaces(sna, &op->base);
 	gen5_align_vertex(sna, &op->base);
@@ -3369,8 +3396,10 @@ gen5_render_fill_one(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo,
 	tmp.u.gen5.wm_kernel = WM_KERNEL;
 	tmp.u.gen5.ve_id = 1;
 
-	if (!kgem_check_bo(&sna->kgem, bo, NULL))
+	if (!kgem_check_bo(&sna->kgem, bo, NULL)) {
 		_kgem_submit(&sna->kgem);
+		assert(kgem_check_bo(&sna->kgem, bo, NULL));
+	}
 
 	gen5_fill_bind_surfaces(sna, &tmp);
 	gen5_align_vertex(sna, &tmp);
diff --git a/src/sna/gen6_render.c b/src/sna/gen6_render.c
index 0d244f1..93410b6 100644
--- a/src/sna/gen6_render.c
+++ b/src/sna/gen6_render.c
@@ -1971,6 +1971,7 @@ gen6_render_video(struct sna *sna,
 	kgem_set_mode(&sna->kgem, KGEM_RENDER);
 	if (!kgem_check_bo(&sna->kgem, tmp.dst.bo, frame->bo, NULL)) {
 		kgem_submit(&sna->kgem);
+		assert(kgem_check_bo(&sna->kgem, tmp.dst.bo, frame->bo, NULL));
 		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
 	}
 
@@ -2596,6 +2597,10 @@ gen6_render_composite(struct sna *sna,
 			   tmp->dst.bo, tmp->src.bo, tmp->mask.bo,
 			   NULL)) {
 		kgem_submit(&sna->kgem);
+		if (!kgem_check_bo(&sna->kgem,
+				   tmp->dst.bo, tmp->src.bo, tmp->mask.bo,
+				   NULL))
+			goto cleanup_mask;
 		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
 	}
 
@@ -2603,6 +2608,9 @@ gen6_render_composite(struct sna *sna,
 	gen6_align_vertex(sna, tmp);
 	return TRUE;
 
+cleanup_mask:
+	if (tmp->mask.bo)
+		kgem_bo_destroy(&sna->kgem, tmp->mask.bo);
 cleanup_src:
 	if (tmp->src.bo)
 		kgem_bo_destroy(&sna->kgem, tmp->src.bo);
@@ -3000,6 +3008,10 @@ gen6_render_composite_spans(struct sna *sna,
 			   tmp->base.dst.bo, tmp->base.src.bo,
 			   NULL)) {
 		kgem_submit(&sna->kgem);
+		if (!kgem_check_bo(&sna->kgem,
+				   tmp->base.dst.bo, tmp->base.src.bo,
+				   NULL))
+			goto cleanup_src;
 		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
 	}
 
@@ -3007,6 +3019,9 @@ gen6_render_composite_spans(struct sna *sna,
 	gen6_align_vertex(sna, &tmp->base);
 	return TRUE;
 
+cleanup_src:
+	if (tmp->base.src.bo)
+		kgem_bo_destroy(&sna->kgem, tmp->base.src.bo);
 cleanup_dst:
 	if (tmp->base.redirect.real_bo)
 		kgem_bo_destroy(&sna->kgem, tmp->base.dst.bo);
@@ -3198,6 +3213,8 @@ fallback:
 	kgem_set_mode(&sna->kgem, KGEM_RENDER);
 	if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL)) {
 		kgem_submit(&sna->kgem);
+		if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL))
+			goto fallback;
 		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
 	}
 
@@ -3358,6 +3375,8 @@ fallback:
 	kgem_set_mode(&sna->kgem, KGEM_RENDER);
 	if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL)) {
 		kgem_submit(&sna->kgem);
+		if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL))
+			goto fallback;
 		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
 	}
 
@@ -3508,8 +3527,10 @@ gen6_render_fill_boxes(struct sna *sna,
 	tmp.u.gen6.nr_inputs = 1;
 	tmp.u.gen6.ve_id = 1;
 
-	if (!kgem_check_bo(&sna->kgem, dst_bo, NULL))
+	if (!kgem_check_bo(&sna->kgem, dst_bo, NULL)) {
 		kgem_submit(&sna->kgem);
+		assert(kgem_check_bo(&sna->kgem, dst_bo, NULL));
+	}
 
 	gen6_emit_fill_state(sna, &tmp);
 	gen6_align_vertex(sna, &tmp);
@@ -3705,8 +3726,10 @@ gen6_render_fill(struct sna *sna, uint8_t alu,
 	op->base.u.gen6.nr_inputs = 1;
 	op->base.u.gen6.ve_id = 1;
 
-	if (!kgem_check_bo(&sna->kgem, dst_bo, NULL))
+	if (!kgem_check_bo(&sna->kgem, dst_bo, NULL)) {
 		kgem_submit(&sna->kgem);
+		assert(kgem_check_bo(&sna->kgem, dst_bo, NULL));
+	}
 
 	gen6_emit_fill_state(sna, &op->base);
 	gen6_align_vertex(sna, &op->base);
@@ -3796,8 +3819,10 @@ gen6_render_fill_one(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo,
 	tmp.u.gen6.nr_inputs = 1;
 	tmp.u.gen6.ve_id = 1;
 
-	if (!kgem_check_bo(&sna->kgem, bo, NULL))
+	if (!kgem_check_bo(&sna->kgem, bo, NULL)) {
 		_kgem_submit(&sna->kgem);
+		assert(kgem_check_bo(&sna->kgem, bo, NULL));
+	}
 
 	gen6_emit_fill_state(sna, &tmp);
 	gen6_align_vertex(sna, &tmp);
@@ -3893,8 +3918,10 @@ gen6_render_clear(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo)
 	tmp.u.gen6.nr_inputs = 1;
 	tmp.u.gen6.ve_id = 1;
 
-	if (!kgem_check_bo(&sna->kgem, bo, NULL))
+	if (!kgem_check_bo(&sna->kgem, bo, NULL)) {
 		_kgem_submit(&sna->kgem);
+		assert(kgem_check_bo(&sna->kgem, bo, NULL));
+	}
 
 	gen6_emit_fill_state(sna, &tmp);
 	gen6_align_vertex(sna, &tmp);
diff --git a/src/sna/gen7_render.c b/src/sna/gen7_render.c
index ff04631..e2486c6 100644
--- a/src/sna/gen7_render.c
+++ b/src/sna/gen7_render.c
@@ -2036,6 +2036,7 @@ gen7_render_video(struct sna *sna,
 	kgem_set_mode(&sna->kgem, KGEM_RENDER);
 	if (!kgem_check_bo(&sna->kgem, tmp.dst.bo, frame->bo, NULL)) {
 		kgem_submit(&sna->kgem);
+		assert(kgem_check_bo(&sna->kgem, tmp.dst.bo, frame->bo, NULL));
 		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
 	}
 
@@ -2662,6 +2663,10 @@ gen7_render_composite(struct sna *sna,
 			   tmp->dst.bo, tmp->src.bo, tmp->mask.bo,
 			   NULL)) {
 		kgem_submit(&sna->kgem);
+		if (!kgem_check_bo(&sna->kgem,
+				   tmp->dst.bo, tmp->src.bo, tmp->mask.bo,
+				   NULL))
+			goto cleanup_mask;
 		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
 	}
 
@@ -2669,6 +2674,9 @@ gen7_render_composite(struct sna *sna,
 	gen7_align_vertex(sna, tmp);
 	return TRUE;
 
+cleanup_mask:
+	if (tmp->mask.bo)
+		kgem_bo_destroy(&sna->kgem, tmp->mask.bo);
 cleanup_src:
 	if (tmp->src.bo)
 		kgem_bo_destroy(&sna->kgem, tmp->src.bo);
@@ -3065,6 +3073,10 @@ gen7_render_composite_spans(struct sna *sna,
 			   tmp->base.dst.bo, tmp->base.src.bo,
 			   NULL)) {
 		kgem_submit(&sna->kgem);
+		if (!kgem_check_bo(&sna->kgem,
+				   tmp->base.dst.bo, tmp->base.src.bo,
+				   NULL))
+			goto cleanup_src;
 		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
 	}
 
@@ -3072,6 +3084,9 @@ gen7_render_composite_spans(struct sna *sna,
 	gen7_align_vertex(sna, &tmp->base);
 	return TRUE;
 
+cleanup_src:
+	if (tmp->base.src.bo)
+		kgem_bo_destroy(&sna->kgem, tmp->base.src.bo);
 cleanup_dst:
 	if (tmp->base.redirect.real_bo)
 		kgem_bo_destroy(&sna->kgem, tmp->base.dst.bo);
@@ -3252,6 +3267,8 @@ fallback:
 	kgem_set_mode(&sna->kgem, KGEM_RENDER);
 	if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL)) {
 		kgem_submit(&sna->kgem);
+		if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL))
+			goto fallback;
 		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
 	}
 
@@ -3412,6 +3429,8 @@ fallback:
 	kgem_set_mode(&sna->kgem, KGEM_RENDER);
 	if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL)) {
 		kgem_submit(&sna->kgem);
+		if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL))
+			goto fallback;
 		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
 	}
 
@@ -3564,8 +3583,10 @@ gen7_render_fill_boxes(struct sna *sna,
 	tmp.u.gen7.nr_inputs = 1;
 	tmp.u.gen7.ve_id = 1;
 
-	if (!kgem_check_bo(&sna->kgem, dst_bo, NULL))
+	if (!kgem_check_bo(&sna->kgem, dst_bo, NULL)) {
 		kgem_submit(&sna->kgem);
+		assert(kgem_check_bo(&sna->kgem, dst_bo, NULL));
+	}
 
 	gen7_emit_fill_state(sna, &tmp);
 	gen7_align_vertex(sna, &tmp);
@@ -3761,8 +3782,10 @@ gen7_render_fill(struct sna *sna, uint8_t alu,
 	op->base.u.gen7.nr_inputs = 1;
 	op->base.u.gen7.ve_id = 1;
 
-	if (!kgem_check_bo(&sna->kgem, dst_bo, NULL))
+	if (!kgem_check_bo(&sna->kgem, dst_bo, NULL)) {
 		kgem_submit(&sna->kgem);
+		assert(kgem_check_bo(&sna->kgem, dst_bo, NULL));
+	}
 
 	gen7_emit_fill_state(sna, &op->base);
 	gen7_align_vertex(sna, &op->base);
@@ -3852,8 +3875,10 @@ gen7_render_fill_one(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo,
 	tmp.u.gen7.nr_inputs = 1;
 	tmp.u.gen7.ve_id = 1;
 
-	if (!kgem_check_bo(&sna->kgem, bo, NULL))
+	if (!kgem_check_bo(&sna->kgem, bo, NULL)) {
 		_kgem_submit(&sna->kgem);
+		assert(kgem_check_bo(&sna->kgem, bo, NULL));
+	}
 
 	gen7_emit_fill_state(sna, &tmp);
 	gen7_align_vertex(sna, &tmp);
@@ -3949,8 +3974,10 @@ gen7_render_clear(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo)
 	tmp.u.gen7.nr_inputs = 1;
 	tmp.u.gen7.ve_id = 1;
 
-	if (!kgem_check_bo(&sna->kgem, bo, NULL))
+	if (!kgem_check_bo(&sna->kgem, bo, NULL)) {
 		_kgem_submit(&sna->kgem);
+		assert(kgem_check_bo(&sna->kgem, bo, NULL));
+	}
 
 	gen7_emit_fill_state(sna, &tmp);
 	gen7_align_vertex(sna, &tmp);
commit 73bbb3c3aad3ed63026fe14470e5632172cc199f
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sun Jan 29 19:21:54 2012 +0000

    sna: Add a tiled fallback for large BLT copies
    
    If we are attempting to copy between two large bo, larger than we can
    fit into the aperture, break the copy into smaller steps and use an
    intermediary.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
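
    The fallback walks the copy region in step x step tiles and routes
    each tile through a freshly allocated staging bo that does fit the
    aperture: src -> staging, then staging -> dst. A condensed sketch of
    the per-tile step (clipping and error handling omitted; the full
    version is in the sna_tiling.c hunk below):

	bo = kgem_create_2d(&sna->kgem, w, h, bpp,
			    kgem_choose_tiling(&sna->kgem, I915_TILING_X,
					       w, h, bpp),
			    0);
	if (bo) {
		/* stage the tile, then write it out */
		sna_blt_copy_boxes(sna, alu,
				   src_bo, src_dx, src_dy,
				   bo, -dx, -dy,
				   bpp, boxes, nbox);
		sna_blt_copy_boxes(sna, alu,
				   bo, -dx, -dy,
				   dst_bo, dst_dx, dst_dy,
				   bpp, boxes, nbox);
		kgem_bo_destroy(&sna->kgem, bo);
	}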

diff --git a/src/sna/sna_blt.c b/src/sna/sna_blt.c
index 7efbcf9..b6749ec 100644
--- a/src/sna/sna_blt.c
+++ b/src/sna/sna_blt.c
@@ -153,6 +153,7 @@ static bool sna_blt_fill_init(struct sna *sna,
 	if (!kgem_check_bo_fenced(kgem, bo, NULL) ||
 	    !kgem_check_batch(kgem, 12)) {
 		_kgem_submit(kgem);
+		assert(kgem_check_bo_fenced(kgem, bo, NULL));
 		_kgem_set_mode(kgem, KGEM_BLT);
 	}
 
@@ -290,6 +291,8 @@ static Bool sna_blt_copy_init(struct sna *sna,
 	kgem_set_mode(kgem, KGEM_BLT);
 	if (!kgem_check_bo_fenced(kgem, src, dst, NULL)) {
 		_kgem_submit(kgem);
+		if (!kgem_check_bo_fenced(kgem, src, dst, NULL))
+			return FALSE;
 		_kgem_set_mode(kgem, KGEM_BLT);
 	}
 
@@ -337,6 +340,8 @@ static Bool sna_blt_alpha_fixup_init(struct sna *sna,
 	kgem_set_mode(kgem, KGEM_BLT);
 	if (!kgem_check_bo_fenced(kgem, src, dst, NULL)) {
 		_kgem_submit(kgem);
+		if (!kgem_check_bo_fenced(kgem, src, dst, NULL))
+			return FALSE;
 		_kgem_set_mode(kgem, KGEM_BLT);
 	}
 
@@ -1109,8 +1114,11 @@ prepare_blt_copy(struct sna *sna,
 	if (!kgem_bo_can_blt(&sna->kgem, priv->gpu_bo))
 		return FALSE;
 
-	if (!kgem_check_bo_fenced(&sna->kgem, priv->gpu_bo, NULL)) {
+	if (!kgem_check_bo_fenced(&sna->kgem, op->dst.bo, priv->gpu_bo, NULL)) {
 		_kgem_submit(&sna->kgem);
+		if (!kgem_check_bo_fenced(&sna->kgem,
+					  op->dst.bo, priv->gpu_bo, NULL))
+			return FALSE;
 		_kgem_set_mode(&sna->kgem, KGEM_BLT);
 	}
 
@@ -1594,6 +1602,7 @@ sna_blt_composite(struct sna *sna,
 
 	if (!kgem_check_bo_fenced(&sna->kgem, priv->gpu_bo, NULL)) {
 		_kgem_submit(&sna->kgem);
+		assert(kgem_check_bo_fenced(&sna->kgem, priv->gpu_bo, NULL));
 		_kgem_set_mode(&sna->kgem, KGEM_BLT);
 	}
 
@@ -1891,6 +1900,7 @@ static bool sna_blt_fill_box(struct sna *sna, uint8_t alu,
 	    !kgem_check_reloc(kgem, 1) ||
 	    !kgem_check_bo_fenced(kgem, bo, NULL)) {
 		_kgem_submit(kgem);
+		assert(kgem_check_bo_fenced(&sna->kgem, bo, NULL));
 		_kgem_set_mode(kgem, KGEM_BLT);
 	}
 
@@ -1964,6 +1974,7 @@ Bool sna_blt_fill_boxes(struct sna *sna, uint8_t alu,
 	if (!kgem_check_bo_fenced(kgem, bo, NULL) ||
 	    !kgem_check_batch(kgem, 12)) {
 		_kgem_submit(kgem);
+		assert(kgem_check_bo_fenced(&sna->kgem, bo, NULL));
 		_kgem_set_mode(kgem, KGEM_BLT);
 	}
 
@@ -2127,6 +2138,11 @@ Bool sna_blt_copy_boxes(struct sna *sna, uint8_t alu,
 	    !kgem_check_reloc(kgem, 2) ||
 	    !kgem_check_bo_fenced(kgem, dst_bo, src_bo, NULL)) {
 		_kgem_submit(kgem);
+		if (!kgem_check_bo_fenced(kgem, dst_bo, src_bo, NULL))
+			return sna_tiling_copy_boxes(sna, alu,
+						     src_bo, src_dx, src_dy,
+						     dst_bo, dst_dx, dst_dy,
+						     bpp, box, nbox);
 		_kgem_set_mode(kgem, KGEM_BLT);
 	}
 
diff --git a/src/sna/sna_render.h b/src/sna/sna_render.h
index 5df53dc..94c2744 100644
--- a/src/sna/sna_render.h
+++ b/src/sna/sna_render.h
@@ -507,6 +507,10 @@ Bool sna_tiling_fill_boxes(struct sna *sna,
 			   const xRenderColor *color,
 			   PixmapPtr dst, struct kgem_bo *dst_bo,
 			   const BoxRec *box, int n);
+Bool sna_tiling_copy_boxes(struct sna *sna, uint8_t alu,
+			   struct kgem_bo *src_bo, int16_t src_dx, int16_t src_dy,
+			   struct kgem_bo *dst_bo, int16_t dst_dx, int16_t dst_dy,
+			   int bpp, const BoxRec *box, int nbox);
 
 Bool sna_blt_composite(struct sna *sna,
 		       uint32_t op,
diff --git a/src/sna/sna_tiling.c b/src/sna/sna_tiling.c
index eac664e..0bc4539 100644
--- a/src/sna/sna_tiling.c
+++ b/src/sna/sna_tiling.c
@@ -417,3 +417,99 @@ done:
 	pixman_region_fini(&region);
 	return ret;
 }
+
+Bool sna_tiling_copy_boxes(struct sna *sna, uint8_t alu,
+			   struct kgem_bo *src_bo, int16_t src_dx, int16_t src_dy,
+			   struct kgem_bo *dst_bo, int16_t dst_dx, int16_t dst_dy,
+			   int bpp, const BoxRec *box, int nbox)
+{
+	RegionRec region, tile, this;
+	struct kgem_bo *bo;
+	int step;
+	Bool ret = FALSE;
+
+	if (!kgem_bo_can_blt(&sna->kgem, src_bo) ||
+	    !kgem_bo_can_blt(&sna->kgem, dst_bo)) {
+		/* XXX */
+		DBG(("%s: tiling blt fail: src?=%d, dst?=%d\n",
+		     __FUNCTION__,
+		     kgem_bo_can_blt(&sna->kgem, src_bo),
+		     kgem_bo_can_blt(&sna->kgem, dst_bo)));
+		return FALSE;
+	}
+
+	pixman_region_init_rects(&region, box, nbox);
+
+	step = sna->render.max_3d_size;
+	while (step * step * 4 > sna->kgem.max_tile_size)
+		step /= 2;
+
+	DBG(("%s (alu=%d), tile.size=%d, box=%dx[(%d, %d), (%d, %d)])\n",
+	     __FUNCTION__, alu, step, nbox,
+	     region.extents.x1, region.extents.y1,
+	     region.extents.x2, region.extents.y2));
+
+	for (tile.extents.y1 = tile.extents.y2 = region.extents.y1;
+	     tile.extents.y2 < region.extents.y2;
+	     tile.extents.y1 = tile.extents.y2) {
+		tile.extents.y2 = tile.extents.y1 + step;
+		if (tile.extents.y2 > region.extents.y2)
+			tile.extents.y2 = region.extents.y2;
+
+		for (tile.extents.x1 = tile.extents.x2 = region.extents.x1;
+		     tile.extents.x2 < region.extents.x2;
+		     tile.extents.x1 = tile.extents.x2) {
+			int w, h;
+
+			tile.extents.x2 = tile.extents.x1 + step;
+			if (tile.extents.x2 > region.extents.x2)
+				tile.extents.x2 = region.extents.x2;
+
+			tile.data = NULL;
+
+			RegionNull(&this);
+			RegionIntersect(&this, &region, &tile);
+			if (!RegionNotEmpty(&this))
+				continue;
+
+			w = this.extents.x2 - this.extents.x1;
+			h = this.extents.y2 - this.extents.y1;
+			bo = kgem_create_2d(&sna->kgem, w, h, bpp,
+					    kgem_choose_tiling(&sna->kgem,
+							       I915_TILING_X,
+							       w, h, bpp),
+					    0);
+			if (bo) {
+				int16_t dx = this.extents.x1;
+				int16_t dy = this.extents.y1;
+
+				assert(bo->pitch <= 8192);
+				assert(bo->tiling != I915_TILING_Y);
+
+				if (!sna_blt_copy_boxes(sna, alu,
+							src_bo, src_dx, src_dy,
+							bo, -dx, -dy,
+							bpp, REGION_RECTS(&this), REGION_NUM_RECTS(&this)))
+					goto err;
+
+				if (!sna_blt_copy_boxes(sna, alu,
+							bo, -dx, -dy,
+							dst_bo, dst_dx, dst_dy,
+							bpp, REGION_RECTS(&this), REGION_NUM_RECTS(&this)))
+					goto err;
+
+				kgem_bo_destroy(&sna->kgem, bo);
+			}
+			RegionUninit(&this);
+		}
+	}
+
+	ret = TRUE;
+	goto done;
+err:
+	kgem_bo_destroy(&sna->kgem, bo);
+	RegionUninit(&this);
+done:
+	pixman_region_fini(&region);
+	return ret;
+}
commit 4811de6ca5b59012c784d3ce45b2e9f587cb2f0f
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sun Jan 29 18:07:14 2012 +0000

    sna: Limit the tile size for uploading into large pixmaps
    
    As we may have a constrained aperture, we need to be careful not to
    exceed our resource limits when uploading the pixel data. (For example,
    fitting two of the maximum bo into a single batch may fail due to
    fragmentation of the GATT.) So be cautious and use more tiles to reduce
    the size of each individual batch.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
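
    A worked example with illustrative numbers (assuming the aperture
    term dominates the clamp): with a 256MiB GATT, max_tile_size =
    256MiB / 4 = 64MiB, and with max_3d_size = 8192 the tile edge is
    halved until a full tile fits that budget:

	step = 8192;	/* 8192 * 8192 * 4 = 256MiB > 64MiB, halve */
	step = 4096;	/* 4096 * 4096 * 4 =  64MiB, fits, stop   */

    so the upload proceeds in 4096x4096 chunks rather than one
    8192x8192 batch that could fail on a fragmented GATT.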

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index fff2d19..7019638 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -672,9 +672,15 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, int gen)
 	}
 	if (kgem->max_gpu_size > kgem->max_cpu_size)
 		kgem->max_gpu_size = kgem->max_cpu_size;
-
-	DBG(("%s: max object size (tiled=%d, linear=%d)\n",
-	     __FUNCTION__, kgem->max_gpu_size, kgem->max_cpu_size));
+	kgem->max_tile_size = kgem->aperture_total / 4;
+	if (kgem->max_tile_size < kgem->max_gpu_size / 2)
+		kgem->max_tile_size = kgem->max_gpu_size / 2;
+
+	DBG(("%s: max object size (gpu=%d, cpu=%d, tile=%d)\n",
+	     __FUNCTION__,
+	     kgem->max_gpu_size,
+	     kgem->max_cpu_size,
+	     kgem->max_tile_size));
 
 	/* Convert the aperture thresholds to pages */
 	kgem->aperture_low /= PAGE_SIZE;
diff --git a/src/sna/kgem.h b/src/sna/kgem.h
index f386967..fea2d45 100644
--- a/src/sna/kgem.h
+++ b/src/sna/kgem.h
@@ -161,7 +161,7 @@ struct kgem {
 	uint32_t aperture_total, aperture_high, aperture_low, aperture_mappable;
 	uint32_t aperture, aperture_fenced;
 	uint32_t min_alignment;
-	uint32_t max_gpu_size, max_cpu_size, max_object_size;
+	uint32_t max_tile_size, max_gpu_size, max_cpu_size, max_object_size;
 	uint32_t partial_buffer_size;
 
 	void (*context_switch)(struct kgem *kgem, int new_mode);
diff --git a/src/sna/sna_io.c b/src/sna/sna_io.c
index 14a7901..4f86f8d 100644
--- a/src/sna/sna_io.c
+++ b/src/sna/sna_io.c
@@ -62,7 +62,8 @@ box_intersect(BoxPtr a, const BoxRec *b)
 static inline bool must_tile(struct sna *sna, int width, int height)
 {
 	return (width  > sna->render.max_3d_size ||
-		height > sna->render.max_3d_size);
+		height > sna->render.max_3d_size ||
+		width * height * 4 > sna->kgem.max_tile_size);
 }
 
 static void read_boxes_inplace(struct kgem *kgem,
@@ -199,6 +200,9 @@ fallback:
 
 			step = MIN(sna->render.max_3d_size,
 				   8*(MAXSHORT&~63) / dst->drawable.bitsPerPixel);
+			while (step * step * 4 > sna->kgem.max_tile_size)
+				step /= 2;
+
 			DBG(("%s: tiling download, using %dx%d tiles\n",
 			     __FUNCTION__, step, step));
 
@@ -577,6 +581,9 @@ fallback:
 
 			step = MIN(sna->render.max_3d_size,
 				   8*(MAXSHORT&~63) / dst->drawable.bitsPerPixel);
+			while (step * step * 4 > sna->kgem.max_tile_size)
+				step /= 2;
+
 			DBG(("%s: tiling upload, using %dx%d tiles\n",
 			     __FUNCTION__, step, step));
 
diff --git a/src/sna/sna_tiling.c b/src/sna/sna_tiling.c
index 52572bc..eac664e 100644
--- a/src/sna/sna_tiling.c
+++ b/src/sna/sna_tiling.c
@@ -138,7 +138,11 @@ sna_tiling_composite_done(struct sna *sna,
 {
 	struct sna_tile_state *tile = op->u.priv;
 	struct sna_composite_op tmp;
-	int x, y, n, step = sna->render.max_3d_size;
+	int x, y, n, step;
+
+	step = sna->render.max_3d_size;
+	while (step * step * 4 > sna->kgem.max_tile_size)
+		step /= 2;
 
 	DBG(("%s -- %dx%d, count=%d\n", __FUNCTION__,
 	     tile->width, tile->height, tile->rect_count));
@@ -318,21 +322,26 @@ sna_tiling_fill_boxes(struct sna *sna,
 {
 	RegionRec region, tile, this;
 	struct kgem_bo *bo;
+	int step;
 	Bool ret = FALSE;
 
 	pixman_region_init_rects(&region, box, n);
 
+	step = sna->render.max_3d_size;
+	while (step * step * 4 > sna->kgem.max_tile_size)
+		step /= 2;
+
 	DBG(("%s (op=%d, format=%x, color=(%04x,%04x,%04x, %04x), tile.size=%d, box=%dx[(%d, %d), (%d, %d)])\n",
 	     __FUNCTION__, op, (int)format,
 	     color->red, color->green, color->blue, color->alpha,
-	     sna->render.max_3d_size, n,
+	     step, n,
 	     region.extents.x1, region.extents.y1,
 	     region.extents.x2, region.extents.y2));
 
 	for (tile.extents.y1 = tile.extents.y2 = region.extents.y1;
 	     tile.extents.y2 < region.extents.y2;
 	     tile.extents.y1 = tile.extents.y2) {
-		tile.extents.y2 = tile.extents.y1 + sna->render.max_3d_size;
+		tile.extents.y2 = tile.extents.y1 + step;
 		if (tile.extents.y2 > region.extents.y2)
 			tile.extents.y2 = region.extents.y2;
 
@@ -341,7 +350,7 @@ sna_tiling_fill_boxes(struct sna *sna,
 		     tile.extents.x1 = tile.extents.x2) {
 			PixmapRec tmp;
 
-			tile.extents.x2 = tile.extents.x1 + sna->render.max_3d_size;
+			tile.extents.x2 = tile.extents.x1 + step;
 			if (tile.extents.x2 > region.extents.x2)
 				tile.extents.x2 = region.extents.x2;
 
commit 84df675b16697d581dce9b46aea249294520b3bc
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sun Jan 29 15:43:42 2012 +0000

    sna: Fix the "trivial" fix to improve error handling
    
    The logic was just backwards and we tried to upload a shadowless GPU
    pixmap.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
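
    Spelled out, the corrected predicate takes the GPU path when the
    texture is not CPU-resident or when migrating it is worthwhile, and
    only falls back to uploading the CPU shadow otherwise; the inverted
    '&&' sent GPU-only pixmaps, which have no shadow, down the upload
    path. A condensed sketch of the fixed flow (calls as in the diff):

	bool upload = true;
	if (!texture_is_cpu(pixmap, &box) || move_to_gpu(pixmap, &box)) {
		struct sna_pixmap *priv;

		priv = sna_pixmap_move_to_gpu(pixmap, MOVE_READ);
		if (priv) {
			src_bo = priv->gpu_bo;	/* reuse the resident copy */
			upload = false;
		}
	}
	if (upload)	/* texture resides in the CPU shadow */
		bo = kgem_upload_source_image(&sna->kgem,
					      pixmap->devPrivate.ptr, &box,
					      pixmap->devKind,
					      pixmap->drawable.bitsPerPixel);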

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 1d7b8e9..fff2d19 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -3490,6 +3490,12 @@ struct kgem_bo *kgem_upload_source_image(struct kgem *kgem,
 	DBG(("%s : (%d, %d), (%d, %d), stride=%d, bpp=%d\n",
 	     __FUNCTION__, box->x1, box->y1, box->x2, box->y2, stride, bpp));
 
+	assert(data);
+	assert(width > 0);
+	assert(height > 0);
+	assert(stride);
+	assert(bpp);
+
 	bo = kgem_create_buffer_2d(kgem,
 				   width, height, bpp,
 				   KGEM_BUFFER_WRITE_INPLACE, &dst);
diff --git a/src/sna/sna_render.c b/src/sna/sna_render.c
index cbaaafd..9a98990 100644
--- a/src/sna/sna_render.c
+++ b/src/sna/sna_render.c
@@ -1044,7 +1044,7 @@ sna_render_picture_extract(struct sna *sna,
 			return 0;
 	} else {
 		bool upload = true;
-		if (!texture_is_cpu(pixmap, &box) &&
+		if (!texture_is_cpu(pixmap, &box) ||
 		    move_to_gpu(pixmap, &box)) {
 			struct sna_pixmap *priv;
 
commit 9954e17770315c71c5b315c7448127f0024dc3a5
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sun Jan 29 14:20:33 2012 +0000

    sna: Handle GPU creation failure when uploading subtexture
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_render.c b/src/sna/sna_render.c
index abc6325..cbaaafd 100644
--- a/src/sna/sna_render.c
+++ b/src/sna/sna_render.c
@@ -1043,20 +1043,23 @@ sna_render_picture_extract(struct sna *sna,
 		if (!sna_pixmap_move_to_cpu(pixmap, MOVE_READ))
 			return 0;
 	} else {
-		if (texture_is_cpu(pixmap, &box) &&
-		    !move_to_gpu(pixmap, &box)) {
-			bo = kgem_upload_source_image(&sna->kgem,
-						      pixmap->devPrivate.ptr,
-						      &box,
-						      pixmap->devKind,
-						      pixmap->drawable.bitsPerPixel);
-		} else {
+		bool upload = true;
+		if (!texture_is_cpu(pixmap, &box) &&
+		    move_to_gpu(pixmap, &box)) {
 			struct sna_pixmap *priv;
 
 			priv = sna_pixmap_move_to_gpu(pixmap, MOVE_READ);
-			if (priv)
+			if (priv) {
 				src_bo = priv->gpu_bo;
+				upload = false;
+			}
 		}
+		if (upload)
+			bo = kgem_upload_source_image(&sna->kgem,
+						      pixmap->devPrivate.ptr,
+						      &box,
+						      pixmap->devKind,
+						      pixmap->drawable.bitsPerPixel);
 	}
 	if (src_bo) {
 		bo = kgem_create_2d(&sna->kgem, w, h,
diff --git a/src/sna/sna_trapezoids.c b/src/sna/sna_trapezoids.c
index 2fd1eaf..eb6c968 100644
--- a/src/sna/sna_trapezoids.c
+++ b/src/sna/sna_trapezoids.c
@@ -3973,6 +3973,8 @@ trap_mask_converter(PicturePtr picture,
 
 	pixmap = get_drawable_pixmap(picture->pDrawable);
 	priv = sna_pixmap_move_to_gpu(pixmap, MOVE_READ | MOVE_WRITE);
+	if (priv == NULL)
+		return false;
 
 	/* XXX strict adherence to the Render specification */
 	if (picture->polyMode == PolyModePrecise) {
commit db3509c1511c499819cba915c102d49b741f746a
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sun Jan 29 14:09:46 2012 +0000

    sna: Always create a GPU bo for copying from an existing source GPU bo
    
    Make sure we prevent the readback of an active source GPU bo by always
    preferring to do the copy on the GPU if the data is already resident.
    This fixes the second regression from e583af9cc (sna: Experiment with
    creating large objects as CPU bo).
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index ce35113..5d0e042 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -3211,7 +3211,7 @@ sna_copy_boxes(DrawablePtr src, DrawablePtr dst, GCPtr gc,
 	}
 
 	/* Try to maintain the data on the GPU */
-	if (dst_priv->gpu_bo == NULL && dst_priv->gpu &&
+	if (dst_priv->gpu_bo == NULL &&
 	    ((dst_priv->cpu_damage == NULL && copy_use_gpu_bo(sna, dst_priv, &region)) ||
 	     (src_priv && (src_priv->gpu_bo != NULL || (src_priv->cpu_bo && kgem_bo_is_busy(src_priv->cpu_bo)))))) {
 		uint32_t tiling = sna_pixmap_choose_tiling(dst_pixmap);
commit 6ac8f44c668136811e39eac90f84a8e207a7503f
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sun Jan 29 13:55:20 2012 +0000

    sna: Ignore map status and pick the first inactive bo for reuse
    
    This fixes the performance regression introduced with e583af9cca
    (sna: Experiment with creating large objects as CPU bo), as we ended
    up creating fresh bo and incurring setup and thrashing overhead when
    we already had plenty cached.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index d062a1d..1d7b8e9 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -2501,9 +2501,6 @@ search_inactive:
 			continue;
 		}
 
-		if ((flags & CREATE_CPU_MAP) == 0 && IS_CPU_MAP(bo->map))
-			continue;
-
 		if (bo->tiling != tiling ||
 		    (tiling != I915_TILING_NONE && bo->pitch != pitch)) {
 			if (tiling != gem_set_tiling(kgem->fd,
commit 9a20948dacc36cf0913528afa36c9e3a38f3daeb
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sun Jan 29 11:02:38 2012 +0000

    sna: Determine whether to use a partial proxy based on the pitch
    
    On gen4+ devices the maximum render pitch is much larger than is simply
    required for the maximum coordinates. This makes it possible to use a
    proxy texture as a subimage view into the oversized bo without having
    to blit into a temporary copy for virtually every single bo we use.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
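
    A concrete illustration (all numbers assumed): extracting a window
    starting at (1000, 1000) from a 32bpp X-tiled bo on gen4+, where an
    X tile is 512 bytes by 8 rows (4096 bytes). The box is aligned down
    to an even tile row and a whole tile column, and the proxy begins
    that many whole tiles into the parent bo:

	/* tile_width = 512 bytes, tile_height = 8, tile_size = 4096 */
	box.y1 = 1000 & ~(2*8 - 1);		/* -> 992: even tile row */
	box.x1 = 1000 & ~(512*8/32 - 1);	/* -> 896: 128px column  */
	offset = box.x1 * 32/8 / 512 * 4096;	/* 7 tiles -> 28672 bytes */
	channel->bo = kgem_create_proxy(bo,
					box.y1 * bo->pitch + offset,
					h * bo->pitch);
	channel->bo->pitch = bo->pitch;	/* proxy keeps the parent pitch */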

diff --git a/src/sna/gen2_render.c b/src/sna/gen2_render.c
index 2a97cea..398988a 100644
--- a/src/sna/gen2_render.c
+++ b/src/sna/gen2_render.c
@@ -55,6 +55,7 @@
 #define PREFER_BLT_COPY 1
 
 #define MAX_3D_SIZE 2048
+#define MAX_3D_PITCH 8192
 
 #define BATCH(v) batch_emit(sna, v)
 #define BATCH_F(v) batch_emit_float(sna, v)
@@ -547,7 +548,7 @@ gen2_get_batch(struct sna *sna)
 
 static void gen2_emit_target(struct sna *sna, const struct sna_composite_op *op)
 {
-	assert(op->dst.bo->pitch >= 8 && op->dst.bo->pitch <= 8192);
+	assert(op->dst.bo->pitch >= 8 && op->dst.bo->pitch <= MAX_3D_PITCH);
 	assert(sna->render_state.gen2.vertex_offset == 0);
 
 	if (sna->render_state.gen2.target == op->dst.bo->unique_id) {
@@ -1736,7 +1737,7 @@ gen2_render_composite(struct sna *sna,
 
 	tmp->op = op;
 	if (too_large(tmp->dst.width, tmp->dst.height) ||
-	    tmp->dst.bo->pitch > 8192) {
+	    tmp->dst.bo->pitch > MAX_3D_PITCH) {
 		if (!sna_render_composite_redirect(sna, tmp,
 						   dst_x, dst_y, width, height))
 			return FALSE;
@@ -2192,7 +2193,7 @@ gen2_render_composite_spans(struct sna *sna,
 
 	tmp->base.op = op;
 	if (too_large(tmp->base.dst.width, tmp->base.dst.height) ||
-	    tmp->base.dst.bo->pitch > 8192) {
+	    tmp->base.dst.bo->pitch > MAX_3D_PITCH) {
 		if (!sna_render_composite_redirect(sna, &tmp->base,
 						   dst_x, dst_y, width, height))
 			return FALSE;
@@ -2388,7 +2389,7 @@ gen2_render_fill_boxes(struct sna *sna,
 	     color->red, color->green, color->blue, color->alpha));
 
 	if (too_large(dst->drawable.width, dst->drawable.height) ||
-	    dst_bo->pitch < 8 || dst_bo->pitch > 8192 ||
+	    dst_bo->pitch < 8 || dst_bo->pitch > MAX_3D_PITCH ||
 	    !gen2_check_dst_format(format)) {
 		DBG(("%s: try blt, too large or incompatible destination\n",
 		     __FUNCTION__));
@@ -2589,7 +2590,7 @@ gen2_render_fill(struct sna *sna, uint8_t alu,
 
 	/* Must use the BLT if we can't RENDER... */
 	if (too_large(dst->drawable.width, dst->drawable.height) ||
-	    dst_bo->pitch < 8 || dst_bo->pitch > 8192)
+	    dst_bo->pitch < 8 || dst_bo->pitch > MAX_3D_PITCH)
 		return sna_blt_fill(sna, alu,
 				    dst_bo, dst->drawable.bitsPerPixel,
 				    color,
@@ -2665,7 +2666,7 @@ gen2_render_fill_one(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo,
 
 	/* Must use the BLT if we can't RENDER... */
 	if (too_large(dst->drawable.width, dst->drawable.height) ||
-	    bo->pitch < 8 || bo->pitch > 8192)
+	    bo->pitch < 8 || bo->pitch > MAX_3D_PITCH)
 		return gen2_render_fill_one_try_blt(sna, dst, bo, color,
 						    x1, y1, x2, y2, alu);
 
@@ -2832,9 +2833,9 @@ gen2_render_copy_boxes(struct sna *sna, uint8_t alu,
 
 	if (src_bo == dst_bo || /* XXX handle overlap using 3D ? */
 	    too_large(src->drawable.width, src->drawable.height) ||
-	    src_bo->pitch > 8192 ||
+	    src_bo->pitch > MAX_3D_PITCH ||
 	    too_large(dst->drawable.width, dst->drawable.height) ||
-	    dst_bo->pitch < 8 || dst_bo->pitch > 8192)
+	    dst_bo->pitch < 8 || dst_bo->pitch > MAX_3D_PITCH)
 		return sna_blt_copy_boxes_fallback(sna, alu,
 						   src, src_bo, src_dx, src_dy,
 						   dst, dst_bo, dst_dx, dst_dy,
@@ -2957,7 +2958,8 @@ gen2_render_copy(struct sna *sna, uint8_t alu,
 	/* Must use the BLT if we can't RENDER... */
 	if (too_large(src->drawable.width, src->drawable.height) ||
 	    too_large(dst->drawable.width, dst->drawable.height) ||
-	    src_bo->pitch > 8192 || dst_bo->pitch < 8 || dst_bo->pitch > 8192) {
+	    src_bo->pitch > MAX_3D_PITCH ||
+	    dst_bo->pitch < 8 || dst_bo->pitch > MAX_3D_PITCH) {
 		if (!sna_blt_compare_depth(&src->drawable, &dst->drawable))
 			return FALSE;
 
@@ -3045,5 +3047,6 @@ Bool gen2_render_init(struct sna *sna)
 	render->flush = gen2_render_flush;
 
 	render->max_3d_size = MAX_3D_SIZE;
+	render->max_3d_pitch = MAX_3D_PITCH;
 	return TRUE;
 }
diff --git a/src/sna/gen3_render.c b/src/sna/gen3_render.c
index 931142d..da90d82 100644
--- a/src/sna/gen3_render.c
+++ b/src/sna/gen3_render.c
@@ -65,6 +65,7 @@ enum {
 };
 
 #define MAX_3D_SIZE 2048
+#define MAX_3D_PITCH 8192
 
 #define OUT_BATCH(v) batch_emit(sna, v)
 #define OUT_BATCH_F(v) batch_emit_float(sna, v)
@@ -143,7 +144,7 @@ static inline uint32_t gen3_buf_tiling(uint32_t tiling)
 static inline Bool
 gen3_check_pitch_3d(struct kgem_bo *bo)
 {
-	return bo->pitch <= 8192;
+	return bo->pitch <= MAX_3D_PITCH;
 }
 
 static uint32_t gen3_get_blend_cntl(int op,
@@ -3826,9 +3827,9 @@ gen3_render_copy_boxes(struct sna *sna, uint8_t alu,
 
 	if (!(alu == GXcopy || alu == GXclear) ||
 	    src_bo == dst_bo || /* XXX handle overlap using 3D ? */
-	    src_bo->pitch > 8192 ||
+	    src_bo->pitch > MAX_3D_PITCH ||
 	    too_large(src->drawable.width, src->drawable.height) ||
-	    dst_bo->pitch > 8192 ||
+	    dst_bo->pitch > MAX_3D_PITCH ||
 	    too_large(dst->drawable.width, dst->drawable.height))
 		return sna_blt_copy_boxes_fallback(sna, alu,
 						   src, src_bo, src_dx, src_dy,
@@ -3959,7 +3960,7 @@ gen3_render_copy(struct sna *sna, uint8_t alu,
 	if (!(alu == GXcopy || alu == GXclear) ||
 	    too_large(src->drawable.width, src->drawable.height) ||
 	    too_large(dst->drawable.width, dst->drawable.height) ||
-	    src_bo->pitch > 8192 || dst_bo->pitch > 8192) {
+	    src_bo->pitch > MAX_3D_PITCH || dst_bo->pitch > MAX_3D_PITCH) {
 		if (!sna_blt_compare_depth(&src->drawable, &dst->drawable))
 			return FALSE;
 
@@ -4083,7 +4084,7 @@ gen3_render_fill_boxes(struct sna *sna,
 	     color->red, color->green, color->blue, color->alpha));
 
 	if (too_large(dst->drawable.width, dst->drawable.height) ||
-	    dst_bo->pitch > 8192 ||
+	    dst_bo->pitch > MAX_3D_PITCH ||
 	    !gen3_check_dst_format(format)) {
 		DBG(("%s: try blt, too large or incompatible destination\n",
 		     __FUNCTION__));
@@ -4265,7 +4266,7 @@ gen3_render_fill(struct sna *sna, uint8_t alu,
 	/* Must use the BLT if we can't RENDER... */
 	if (!(alu == GXcopy || alu == GXclear) ||
 	    too_large(dst->drawable.width, dst->drawable.height) ||
-	    dst_bo->pitch > 8192)
+	    dst_bo->pitch > MAX_3D_PITCH)
 		return sna_blt_fill(sna, alu,
 				    dst_bo, dst->drawable.bitsPerPixel,
 				    color,
@@ -4346,7 +4347,7 @@ gen3_render_fill_one(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo,
 	/* Must use the BLT if we can't RENDER... */
 	if (!(alu == GXcopy || alu == GXclear) ||
 	    too_large(dst->drawable.width, dst->drawable.height) ||
-	    bo->pitch > 8192)
+	    bo->pitch > MAX_3D_PITCH)
 		return gen3_render_fill_one_try_blt(sna, dst, bo, color,
 						    x1, y1, x2, y2, alu);
 
@@ -4424,5 +4425,6 @@ Bool gen3_render_init(struct sna *sna)
 	render->fini = gen3_render_fini;
 
 	render->max_3d_size = MAX_3D_SIZE;
+	render->max_3d_pitch = MAX_3D_PITCH;
 	return TRUE;
 }
diff --git a/src/sna/gen4_render.c b/src/sna/gen4_render.c
index 91d5f49..d9542ea 100644
--- a/src/sna/gen4_render.c
+++ b/src/sna/gen4_render.c
@@ -3220,5 +3220,6 @@ Bool gen4_render_init(struct sna *sna)
 	sna->render.fini = gen4_render_fini;
 
 	sna->render.max_3d_size = 8192;
+	sna->render.max_3d_pitch = 1 << 18;
 	return TRUE;
 }
diff --git a/src/sna/gen5_render.c b/src/sna/gen5_render.c
index 2c6d020..3465121 100644
--- a/src/sna/gen5_render.c
+++ b/src/sna/gen5_render.c
@@ -3702,5 +3702,6 @@ Bool gen5_render_init(struct sna *sna)
 	sna->render.fini = gen5_render_fini;
 
 	sna->render.max_3d_size = MAX_3D_SIZE;
+	sna->render.max_3d_pitch = 1 << 18;
 	return TRUE;
 }
diff --git a/src/sna/gen6_render.c b/src/sna/gen6_render.c
index 41e05d0..0d244f1 100644
--- a/src/sna/gen6_render.c
+++ b/src/sna/gen6_render.c
@@ -4044,5 +4044,6 @@ Bool gen6_render_init(struct sna *sna)
 	sna->render.fini = gen6_render_fini;
 
 	sna->render.max_3d_size = GEN6_MAX_SIZE;
+	sna->render.max_3d_pitch = 1 << 18;
 	return TRUE;
 }
diff --git a/src/sna/gen7_render.c b/src/sna/gen7_render.c
index 282b724..ff04631 100644
--- a/src/sna/gen7_render.c
+++ b/src/sna/gen7_render.c
@@ -4097,5 +4097,6 @@ Bool gen7_render_init(struct sna *sna)
 	sna->render.fini = gen7_render_fini;
 
 	sna->render.max_3d_size = GEN7_MAX_SIZE;
+	sna->render.max_3d_pitch = 1 << 18;
 	return TRUE;
 }
diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 208c8f2..d062a1d 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -704,6 +704,39 @@ static uint32_t kgem_untiled_pitch(struct kgem *kgem,
 	return ALIGN(width, scanout ? 64 : kgem->min_alignment);
 }
 
+void kgem_get_tile_size(struct kgem *kgem, int tiling,
+			int *tile_width, int *tile_height, int *tile_size)
+{
+	if (kgem->gen < 30) {
+		if (tiling) {
+			*tile_width = 512;
+			*tile_height = 16;
+			*tile_size = 2048;
+		} else {
+			*tile_width = 1;
+			*tile_height = 1;
+			*tile_size = 1;
+		}
+	} else switch (tiling) {
+	default:
+	case I915_TILING_NONE:
+		*tile_width = 1;
+		*tile_height = 1;
+		*tile_size = 1;
+		break;
+	case I915_TILING_X:
+		*tile_width = 512;
+		*tile_height = 8;
+		*tile_size = 4096;
+		break;
+	case I915_TILING_Y:
+		*tile_width = kgem->gen <= 30 ? 512 : 128;
+		*tile_height = 32;
+		*tile_size = 4096;
+		break;
+	}
+}
+
 static uint32_t kgem_surface_size(struct kgem *kgem,
 				  bool relaxed_fencing,
 				  bool scanout,
diff --git a/src/sna/kgem.h b/src/sna/kgem.h
index 2631e81..f386967 100644
--- a/src/sna/kgem.h
+++ b/src/sna/kgem.h
@@ -361,6 +361,8 @@ Bool kgem_bo_write(struct kgem *kgem, struct kgem_bo *bo,
 		   const void *data, int length);
 
 int kgem_bo_fenced_size(struct kgem *kgem, struct kgem_bo *bo);
+void kgem_get_tile_size(struct kgem *kgem, int tiling,
+			int *tile_width, int *tile_height, int *tile_size);
 
 static inline int kgem_bo_size(struct kgem_bo *bo)
 {
diff --git a/src/sna/sna_render.c b/src/sna/sna_render.c
index 7077f36..abc6325 100644
--- a/src/sna/sna_render.c
+++ b/src/sna/sna_render.c
@@ -816,10 +816,27 @@ sna_render_picture_partial(struct sna *sna,
 	struct kgem_bo *bo = NULL;
 	PixmapPtr pixmap = get_drawable_pixmap(picture->pDrawable);
 	BoxRec box;
+	int tile_width, tile_height, tile_size;
+	int offset;
 
 	DBG(("%s (%d, %d)x(%d, %d) [dst=(%d, %d)]\n",
 	     __FUNCTION__, x, y, w, h, dst_x, dst_y));
 
+	if (use_cpu_bo(sna, pixmap, &box)) {
+		if (!sna_pixmap_move_to_cpu(pixmap, MOVE_READ))
+			return 0;
+
+		bo = sna_pixmap(pixmap)->cpu_bo;
+	} else {
+		if (!sna_pixmap_force_to_gpu(pixmap, MOVE_READ))
+			return 0;
+
+		bo = sna_pixmap(pixmap)->gpu_bo;
+	}
+
+	if (bo->pitch > sna->render.max_3d_pitch)
+		return 0;
+
 	box.x1 = x;
 	box.y1 = y;
 	box.x2 = x + w;
@@ -855,51 +872,65 @@ sna_render_picture_partial(struct sna *sna,
 		}
 	}
 
-	/* Presume worst case tile-row alignment for Y-tiling */
-	box.y1 = box.y1 & (64 - 1);
-	box.y2 = ALIGN(box.y2, 64);
+	kgem_get_tile_size(&sna->kgem, bo->tiling,
+			   &tile_width, &tile_height, &tile_size);
+
+	/* Ensure we align to an even tile row */
+	box.y1 = box.y1 & ~(2*tile_height - 1);
+	box.y2 = ALIGN(box.y2, 2*tile_height);
+	if (box.y2 > pixmap->drawable.height)
+		box.y2 = pixmap->drawable.height;
+
+	box.x1 = box.x1 & ~(tile_width * 8 / pixmap->drawable.bitsPerPixel - 1);
+	box.x2 = ALIGN(box.x2, tile_width * 8 / pixmap->drawable.bitsPerPixel);
+	if (box.x2 > pixmap->drawable.width)
+		box.x2 = pixmap->drawable.width;
+
 	w = box.x2 - box.x1;
 	h = box.y2 - box.y1;
 	DBG(("%s box=(%d, %d), (%d, %d): (%d, %d)/(%d, %d)\n", __FUNCTION__,
 	     box.x1, box.y1, box.x2, box.y2, w, h,
 	     pixmap->drawable.width, pixmap->drawable.height));
-	if (w <= 0 || h <= 0 || h > sna->render.max_3d_size)
+	if (w <= 0 || h <= 0 ||
+	    w > sna->render.max_3d_size ||
+	    h > sna->render.max_3d_size)
 		return 0;
 
-	memset(&channel->embedded_transform,
-	       0,
-	       sizeof(channel->embedded_transform));
-	channel->embedded_transform.matrix[0][0] = 1 << 16;
-	channel->embedded_transform.matrix[0][2] = 0;
-	channel->embedded_transform.matrix[1][1] = 1 << 16;
-	channel->embedded_transform.matrix[1][2] = -box.y1 << 16;
-	channel->embedded_transform.matrix[2][2] = 1 << 16;
-	if (channel->transform)
+	/* How many tiles across are we? */
+	offset = box.x1 * pixmap->drawable.bitsPerPixel / 8 / tile_width * tile_size;
+	channel->bo = kgem_create_proxy(bo,
+					box.y1 * bo->pitch + offset,
+					h * bo->pitch);
+	if (channel->bo == NULL)
+		return 0;
+
+	channel->bo->pitch = bo->pitch;
+
+	if (channel->transform) {
+		memset(&channel->embedded_transform,
+		       0,
+		       sizeof(channel->embedded_transform));
+		channel->embedded_transform.matrix[0][0] = 1 << 16;
+		channel->embedded_transform.matrix[0][2] = -box.x1 << 16;
+		channel->embedded_transform.matrix[1][1] = 1 << 16;
+		channel->embedded_transform.matrix[1][2] = -box.y1 << 16;
+		channel->embedded_transform.matrix[2][2] = 1 << 16;
 		pixman_transform_multiply(&channel->embedded_transform,
 					  &channel->embedded_transform,
 					  channel->transform);
-	channel->transform = &channel->embedded_transform;
-
-	if (use_cpu_bo(sna, pixmap, &box)) {
-		if (!sna_pixmap_move_to_cpu(pixmap, MOVE_READ))
-			return 0;
-
-		bo = sna_pixmap(pixmap)->cpu_bo;
+		channel->transform = &channel->embedded_transform;
 	} else {
-		if (!sna_pixmap_force_to_gpu(pixmap, MOVE_READ))
-			return 0;
-
-		bo = sna_pixmap(pixmap)->gpu_bo;
+		x -= box.x1;
+		y -= box.y1;
 	}
 
 	channel->offset[0] = x - dst_x;
 	channel->offset[1] = y - dst_y;
-	channel->scale[0] = 1.f/pixmap->drawable.width;
+	channel->scale[0] = 1.f/w;
 	channel->scale[1] = 1.f/h;
-	channel->width  = pixmap->drawable.width;
+	channel->width  = w;
 	channel->height = h;
-	channel->bo = kgem_create_proxy(bo, box.y1 * bo->pitch, h * bo->pitch);
-	return channel->bo != NULL;
+	return 1;
 }
 
 int
@@ -927,8 +958,7 @@ sna_render_picture_extract(struct sna *sna,
 		return -1;
 	}
 
-	if (pixmap->drawable.width < sna->render.max_3d_size &&
-	    sna_render_picture_partial(sna, picture, channel,
+	if (sna_render_picture_partial(sna, picture, channel,
 				       x, y, w, h,
 				       dst_x, dst_y))
 		return 1;
@@ -1527,30 +1557,67 @@ sna_render_composite_redirect(struct sna *sna,
 		return FALSE;
 	}
 
-	if (op->dst.pixmap->drawable.width <= sna->render.max_3d_size) {
-		int y1, y2;
+	if (op->dst.bo->pitch <= sna->render.max_3d_pitch) {
+		int tile_width, tile_height, tile_size;
+		BoxRec box;
+		int w, h;
 
-		assert(op->dst.pixmap->drawable.height > sna->render.max_3d_size);
-		y1 =  y + op->dst.y;
-		y2 =  y1 + height;
-		y1 &= y1 & (64 - 1);
-		y2 = ALIGN(y2, 64);
+		kgem_get_tile_size(&sna->kgem, op->dst.bo->tiling,
+				   &tile_width, &tile_height, &tile_size);
+
+		box.x1 = x;
+		box.x2 = x + width;
+		box.y1 = y;
+		box.y2 = y + height;
+
+		/* Ensure we align to an even tile row */
+		box.y1 = box.y1 & ~(2*tile_height - 1);
+		box.y2 = ALIGN(box.y2, 2*tile_height);
+		if (box.y2 > op->dst.pixmap->drawable.height)
+			box.y2 = op->dst.pixmap->drawable.height;
+
+		box.x1 = box.x1 & ~(tile_width * 8 / op->dst.pixmap->drawable.bitsPerPixel - 1);
+		box.x2 = ALIGN(box.x2, tile_width * 8 / op->dst.pixmap->drawable.bitsPerPixel);
+		if (box.x2 > op->dst.pixmap->drawable.width)
+			box.x2 = op->dst.pixmap->drawable.width;
+
+		w = box.x2 - box.x1;
+		h = box.y2 - box.y1;
+		DBG(("%s box=(%d, %d), (%d, %d): (%d, %d)/(%d, %d)\n", __FUNCTION__,
+		     box.x1, box.y1, box.x2, box.y2, w, h,
+		     op->dst.pixmap->drawable.width,
+		     op->dst.pixmap->drawable.height));
+		if (w <= sna->render.max_3d_size &&
+		    h <= sna->render.max_3d_size) {
+			int offset;
 
-		if (y2 - y1 <= sna->render.max_3d_size) {
 			t->box.x2 = t->box.x1 = op->dst.x;
 			t->box.y2 = t->box.y1 = op->dst.y;
-			t->real_bo = priv->gpu_bo;
+			t->real_bo = op->dst.bo;
 			t->real_damage = op->damage;
 			if (op->damage) {
 				t->damage = sna_damage_create();
 				op->damage = &t->damage;
 			}
 
-			op->dst.bo = kgem_create_proxy(priv->gpu_bo,
-						       y1 * priv->gpu_bo->pitch,
-						       (y2 - y1) * priv->gpu_bo->pitch);
-			op->dst.y += -y1;
-			op->dst.height = y2 - y1;
+			/* How many tiles across are we? */
+			offset = box.x1 * op->dst.pixmap->drawable.bitsPerPixel / 8 / tile_width * tile_size;
+			op->dst.bo = kgem_create_proxy(op->dst.bo,
+						       box.y1 * op->dst.bo->pitch + offset,
+						       h * op->dst.bo->pitch);
+			if (!op->dst.bo) {
+				t->real_bo = NULL;
+				if (t->damage)
+					__sna_damage_destroy(t->damage);
+				return FALSE;
+			}
+
+			op->dst.bo->pitch = t->real_bo->pitch;
+
+			op->dst.x += -box.x1;
+			op->dst.y += -box.y1;
+			op->dst.width  = w;
+			op->dst.height = h;
 			return TRUE;
 		}
 	}
@@ -1583,7 +1650,7 @@ sna_render_composite_redirect(struct sna *sna,
 		return FALSE;
 	}
 
-	t->real_bo = priv->gpu_bo;
+	t->real_bo = op->dst.bo;
 	t->real_damage = op->damage;
 	if (op->damage) {
 		t->damage = sna_damage_create();
diff --git a/src/sna/sna_render.h b/src/sna/sna_render.h
index b23a8a7..5df53dc 100644
--- a/src/sna/sna_render.h
+++ b/src/sna/sna_render.h
@@ -186,6 +186,7 @@ struct sna_copy_op {
 
 struct sna_render {
 	int max_3d_size;
+	int max_3d_pitch;
 
 	Bool (*composite)(struct sna *sna, uint8_t op,
 			  PicturePtr dst, PicturePtr src, PicturePtr mask,
commit 779dd7b25bc0dd67f47f093663ecbee37902852c
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Sat Jan 28 01:54:47 2012 +0000

    sna: Allow ridiculously large bo, up to half the total GATT
    
    Such large bo place extreme stress on the system; for example, trying
    to mmap a 1GiB bo into the CPU domain currently fails due to a kernel
    bug. :(
    So if you can avoid the swap thrashing during the upload, the ddx can now
    handle 16k x 16k images on gen4+ on the GPU. That is fine until you want
    two such images...
    
    The real complication comes in uploading to (and downloading from)
    such large textures, as they are too large for a single operation with
    automatic detiling via either the BLT or the RENDER ring. We could do
    manual tiling/switching or, as this patch does, tile the transfer in
    chunks small enough to fit into either pipeline.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
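
    For the oversized transfers this enables, the driver's read/write
    paths (sna_io.c) walk the image in step x step chunks, each small
    enough for automatic detiling on either ring. A minimal sketch of
    that loop shape (the chunk helper is hypothetical; the step formula
    is the driver's):

	/* largest square chunk both rings can detile (bpp = bits/px) */
	step = MIN(sna->render.max_3d_size,
		   8*(MAXSHORT & ~63) / bpp);

	for (y = 0; y < height; y += step)
		for (x = 0; x < width; x += step)
			transfer_chunk(sna, x, y,	/* hypothetical */
				       MIN(step, width - x),
				       MIN(step, height - y));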

diff --git a/src/sna/blt.c b/src/sna/blt.c
index fb3dd35..65d586c 100644
--- a/src/sna/blt.c
+++ b/src/sna/blt.c
@@ -154,6 +154,8 @@ memcpy_blt(const void *src, void *dst, int bpp,
 	uint8_t *dst_bytes;
 	int byte_width;
 
+	assert(src);
+	assert(dst);
 	assert(width && height);
 	assert(bpp >= 8);
 
diff --git a/src/sna/gen6_render.c b/src/sna/gen6_render.c
index d813d95..41e05d0 100644
--- a/src/sna/gen6_render.c
+++ b/src/sna/gen6_render.c
@@ -1193,6 +1193,7 @@ gen6_bind_bo(struct sna *sna,
 			       bo, domains, 0);
 	ss[2] = ((width - 1)  << GEN6_SURFACE_WIDTH_SHIFT |
 		 (height - 1) << GEN6_SURFACE_HEIGHT_SHIFT);
+	assert(bo->pitch <= (1 << 18));
 	ss[3] = (gen6_tiling_bits(bo->tiling) |
 		 (bo->pitch - 1) << GEN6_SURFACE_PITCH_SHIFT);
 	ss[4] = 0;
@@ -3136,10 +3137,19 @@ fallback:
 		if (!sna_blt_compare_depth(&src->drawable, &dst->drawable))
 			return false;
 
-		return sna_blt_copy_boxes_fallback(sna, alu,
-						   src, src_bo, src_dx, src_dy,
-						   dst, dst_bo, dst_dx, dst_dy,
-						   box, n);
+		if (sna_blt_copy_boxes_fallback(sna, alu,
+						 src, src_bo, src_dx, src_dy,
+						 dst, dst_bo, dst_dx, dst_dy,
+						 box, n))
+			return true;
+
+		return false;
+#if 0
+		return sna_tiling_copy_boxes(sna,
+					     src, src_bo, src_dx, src_dy,
+					     dst, dst_bo, dst_dx, dst_dy,
+					     box, n);
+#endif
 	}
 
 	if (dst->drawable.depth == src->drawable.depth) {
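
The new assert in gen6_bind_bo() guards the pitch encoding: surface state stores pitch - 1 in a fixed-width field, and pitch <= 1 << 18 guarantees the value round-trips (kgem_bo itself keeps pitch in an 18-bit bitfield, as the kgem.h hunk further below shows). A tiny sketch of that encode/decode; the 18-bit mask is inferred from the assert:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint32_t pitch = 1 << 18;                /* maximum legal pitch */
        uint32_t field = (pitch - 1) & 0x3ffff;  /* pitch - 1, 18 bits */

        assert(field + 1 == pitch);              /* decodes back exactly */
        return 0;
    }
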
diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 86a4372..208c8f2 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -45,7 +45,7 @@
 #endif
 
 static struct kgem_bo *
-search_linear_cache(struct kgem *kgem, unsigned int size, unsigned flags);
+search_linear_cache(struct kgem *kgem, unsigned int num_pages, unsigned flags);
 
 static inline void _list_del(struct list *list)
 {
@@ -99,7 +99,6 @@ static inline void list_replace(struct list *old,
 #define DBG(x) ErrorF x
 #endif
 
-#define PAGE_SIZE 4096
 #define PAGE_ALIGN(x) ALIGN(x, PAGE_SIZE)
 #define MAX_GTT_VMA_CACHE 512
 #define MAX_CPU_VMA_CACHE INT16_MAX
@@ -120,6 +119,14 @@ struct kgem_partial_bo {
 static struct kgem_bo *__kgem_freed_bo;
 static struct drm_i915_gem_exec_object2 _kgem_dummy_exec;
 
+static inline int bytes(struct kgem_bo *bo)
+{
+	return kgem_bo_size(bo);
+}
+
+#define bucket(B) (B)->size.pages.bucket
+#define num_pages(B) (B)->size.pages.count
+
 #ifndef NDEBUG
 static bool validate_partials(struct kgem *kgem)
 {
@@ -128,10 +135,10 @@ static bool validate_partials(struct kgem *kgem)
 	list_for_each_entry_safe(bo, next, &kgem->partial, base.list) {
 		if (bo->base.list.next == &kgem->partial)
 			return true;
-		if (bo->base.size - bo->used < next->base.size - next->used) {
+		if (bytes(&bo->base) - bo->used < bytes(&next->base) - next->used) {
 			ErrorF("this rem: %d, next rem: %d\n",
-			       bo->base.size - bo->used,
-			       next->base.size - next->used);
+			       bytes(&bo->base) - bo->used,
+			       bytes(&next->base) - next->used);
 			goto err;
 		}
 	}
@@ -140,7 +147,7 @@ static bool validate_partials(struct kgem *kgem)
 err:
 	list_for_each_entry(bo, &kgem->partial, base.list)
 		ErrorF("bo: used=%d / %d, rem=%d\n",
-		       bo->used, bo->base.size, bo->base.size - bo->used);
+		       bo->used, bytes(&bo->base), bytes(&bo->base) - bo->used);
 	return false;
 }
 #else
@@ -312,7 +319,7 @@ Bool kgem_bo_write(struct kgem *kgem, struct kgem_bo *bo,
 	assert(!bo->purged);
 	assert(!kgem_busy(kgem, bo->handle));
 
-	assert(length <= bo->size);
+	assert(length <= bytes(bo));
 	if (gem_write(kgem->fd, bo->handle, 0, length, data))
 		return FALSE;
 
@@ -322,17 +329,13 @@ Bool kgem_bo_write(struct kgem *kgem, struct kgem_bo *bo,
 	return TRUE;
 }
 
-static uint32_t gem_create(int fd, int size)
+static uint32_t gem_create(int fd, int num_pages)
 {
 	struct drm_i915_gem_create create;
 
-#if DEBUG_KGEM
-	assert((size & (PAGE_SIZE-1)) == 0);
-#endif
-
 	VG_CLEAR(create);
 	create.handle = 0;
-	create.size = size;
+	create.size = PAGE_SIZE * num_pages;
 	(void)drmIoctl(fd, DRM_IOCTL_I915_GEM_CREATE, &create);
 
 	return create.handle;
@@ -415,7 +418,7 @@ static void gem_close(int fd, uint32_t handle)
 	(void)drmIoctl(fd, DRM_IOCTL_GEM_CLOSE, &close);
 }
 
-static inline unsigned long __fls(unsigned long word)
+constant inline static unsigned long __fls(unsigned long word)
 {
 	asm("bsr %1,%0"
 	    : "=r" (word)
@@ -423,24 +426,21 @@ static inline unsigned long __fls(unsigned long word)
 	return word;
 }
 
-constant inline static int cache_bucket(int size)
+constant inline static int cache_bucket(int num_pages)
 {
-	uint32_t order = __fls(size / PAGE_SIZE);
-	assert(order < NUM_CACHE_BUCKETS);
-	return order;
+	return __fls(num_pages);
 }
 
 static struct kgem_bo *__kgem_bo_init(struct kgem_bo *bo,
-				      int handle, int size)
+				      int handle, int num_pages)
 {
-	assert(size);
+	assert(num_pages);
 	memset(bo, 0, sizeof(*bo));
 
 	bo->refcnt = 1;
 	bo->handle = handle;
-	bo->size = size;
-	bo->bucket = cache_bucket(size);
-	assert(bo->size < 1 << (12 + bo->bucket + 1));
+	num_pages(bo) = num_pages;
+	bucket(bo) = cache_bucket(num_pages);
 	bo->reusable = true;
 	bo->domain = DOMAIN_CPU;
 	list_init(&bo->request);
@@ -450,7 +450,7 @@ static struct kgem_bo *__kgem_bo_init(struct kgem_bo *bo,
 	return bo;
 }
 
-static struct kgem_bo *__kgem_bo_alloc(int handle, int size)
+static struct kgem_bo *__kgem_bo_alloc(int handle, int num_pages)
 {
 	struct kgem_bo *bo;
 
@@ -463,7 +463,7 @@ static struct kgem_bo *__kgem_bo_alloc(int handle, int size)
 			return NULL;
 	}
 
-	return __kgem_bo_init(bo, handle, size);
+	return __kgem_bo_init(bo, handle, num_pages);
 }
 
 static struct kgem_request _kgem_static_request;
@@ -481,14 +481,14 @@ static struct kgem_request *__kgem_request_alloc(void)
 	return rq;
 }
 
-static struct list *inactive(struct kgem *kgem, int size)
+static struct list *inactive(struct kgem *kgem, int num_pages)
 {
-	return &kgem->inactive[cache_bucket(size)];
+	return &kgem->inactive[cache_bucket(num_pages)];
 }
 
-static struct list *active(struct kgem *kgem, int size, int tiling)
+static struct list *active(struct kgem *kgem, int num_pages, int tiling)
 {
-	return &kgem->active[cache_bucket(size)][tiling];
+	return &kgem->active[cache_bucket(num_pages)][tiling];
 }
 
 static size_t
@@ -575,6 +575,7 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, int gen)
 	list_init(&kgem->partial);
 	list_init(&kgem->requests);
 	list_init(&kgem->flushing);
+	list_init(&kgem->large);
 	for (i = 0; i < ARRAY_SIZE(kgem->inactive); i++)
 		list_init(&kgem->inactive[i]);
 	for (i = 0; i < ARRAY_SIZE(kgem->active); i++) {
@@ -658,11 +659,9 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, int gen)
 		 * disable dual-stream mode */
 		kgem->min_alignment = 64;
 
+	kgem->max_object_size = kgem->aperture_total / 2;
 	kgem->max_cpu_size = kgem->aperture_total / 2;
-	if (kgem->max_cpu_size > MAX_OBJECT_SIZE)
-		kgem->max_cpu_size = MAX_OBJECT_SIZE;
-
-	kgem->max_gpu_size = -1;
+	kgem->max_gpu_size = MAX_CACHE_SIZE;
 	if (gen < 40) {
 		/* If we have to use fences for blitting, we have to make
 		 * sure we can fit them into the aperture.
@@ -677,6 +676,10 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, int gen)
 	DBG(("%s: max object size (tiled=%d, linear=%d)\n",
 	     __FUNCTION__, kgem->max_gpu_size, kgem->max_cpu_size));
 
+	/* Convert the aperture thresholds to pages */
+	kgem->aperture_low /= PAGE_SIZE;
+	kgem->aperture_high /= PAGE_SIZE;
+
 	kgem->fence_max = gem_param(kgem, I915_PARAM_NUM_FENCES_AVAIL) - 2;
 	if ((int)kgem->fence_max < 0)
 		kgem->fence_max = 5; /* minimum safe value for all hw */
@@ -811,7 +814,7 @@ kgem_add_handle(struct kgem *kgem, struct kgem_bo *bo)
 	exec->handle = bo->handle;
 	exec->offset = bo->presumed_offset;
 
-	kgem->aperture += bo->size;
+	kgem->aperture += num_pages(bo);
 
 	return exec;
 }
@@ -875,7 +878,7 @@ static void kgem_bo_release_map(struct kgem *kgem, struct kgem_bo *bo)
 	     bo->handle, kgem->vma[type].count));
 
 	VG(if (type) VALGRIND_FREELIKE_BLOCK(CPU_MAP(bo->map), 0));
-	munmap(CPU_MAP(bo->map), bo->size);
+	munmap(CPU_MAP(bo->map), bytes(bo));
 	bo->map = NULL;
 
 	if (!list_is_empty(&bo->vma)) {
@@ -917,16 +920,22 @@ inline static void kgem_bo_move_to_inactive(struct kgem *kgem,
 	assert(bo->rq == NULL);
 	assert(bo->domain != DOMAIN_GPU);
 
-	list_move(&bo->list, &kgem->inactive[bo->bucket]);
+	if (bucket(bo) >= NUM_CACHE_BUCKETS) {
+		kgem_bo_free(kgem, bo);
+		return;
+	}
+
+	list_move(&bo->list, &kgem->inactive[bucket(bo)]);
 	if (bo->map) {
 		int type = IS_CPU_MAP(bo->map);
-		if (!type && !kgem_bo_is_mappable(kgem, bo)) {
+		if (bucket(bo) >= NUM_CACHE_BUCKETS ||
+		    (!type && !kgem_bo_is_mappable(kgem, bo))) {
 			list_del(&bo->vma);
-			munmap(CPU_MAP(bo->map), bo->size);
+			munmap(CPU_MAP(bo->map), bytes(bo));
 			bo->map = NULL;
 		}
 		if (bo->map) {
-			list_move(&bo->vma, &kgem->vma[type].inactive[bo->bucket]);
+			list_move(&bo->vma, &kgem->vma[type].inactive[bucket(bo)]);
 			kgem->vma[type].count++;
 		}
 	}
@@ -1002,8 +1011,14 @@ static void __kgem_bo_destroy(struct kgem *kgem, struct kgem_bo *bo)
 
 	bo->scanout = bo->flush = false;
 	if (bo->rq) {
+		struct list *cache;
+
 		DBG(("%s: handle=%d -> active\n", __FUNCTION__, bo->handle));
-		list_add(&bo->list, &kgem->active[bo->bucket][bo->tiling]);
+		if (bucket(bo) < NUM_CACHE_BUCKETS)
+			cache = &kgem->active[bucket(bo)][bo->tiling];
+		else
+			cache = &kgem->large;
+		list_add(&bo->list, cache);
 		return;
 	}
 
@@ -1012,10 +1027,17 @@ static void __kgem_bo_destroy(struct kgem *kgem, struct kgem_bo *bo)
 
 	if (bo->needs_flush) {
 		if ((bo->needs_flush = kgem_busy(kgem, bo->handle))) {
+			struct list *cache;
+
 			DBG(("%s: handle=%d -> flushing\n",
 			     __FUNCTION__, bo->handle));
+
 			list_add(&bo->request, &kgem->flushing);
-			list_add(&bo->list, &kgem->active[bo->bucket][bo->tiling]);
+			if (bucket(bo) < NUM_CACHE_BUCKETS)
+				cache = &kgem->active[bucket(bo)][bo->tiling];
+			else
+				cache = &kgem->large;
+			list_add(&bo->list, cache);
 			bo->rq = &_kgem_static_request;
 			return;
 		}
@@ -1231,7 +1253,7 @@ static void kgem_close_inactive(struct kgem *kgem)
 
 static void bubble_sort_partial(struct kgem *kgem, struct kgem_partial_bo *bo)
 {
-	int remain = bo->base.size - bo->used;
+	int remain = bytes(&bo->base) - bo->used;
 
 	while (bo->base.list.prev != &kgem->partial) {
 		struct kgem_partial_bo *p;
@@ -1239,7 +1261,7 @@ static void bubble_sort_partial(struct kgem *kgem, struct kgem_partial_bo *bo)
 		p = list_entry(bo->base.list.prev,
 			       struct kgem_partial_bo,
 			       base.list);
-		if (remain <= p->base.size - p->used)
+		if (remain <= bytes(&p->base) - p->used)
 			break;
 
 		assert(p->base.list.next == &bo->base.list);
@@ -1282,7 +1304,7 @@ static void kgem_finish_partials(struct kgem *kgem)
 		assert(bo->base.rq == kgem->next_request);
 		if (bo->used && bo->need_io) {
 			if (bo->base.refcnt == 1 &&
-			    bo->used < bo->base.size / 2) {
+			    bo->used < bytes(&bo->base) / 2) {
 				struct kgem_bo *shrink;
 
 				shrink = search_linear_cache(kgem,
@@ -1293,10 +1315,10 @@ static void kgem_finish_partials(struct kgem *kgem)
 
 					DBG(("%s: used=%d, shrinking %d to %d, handle %d to %d\n",
 					     __FUNCTION__,
-					     bo->used, bo->base.size, shrink->size,
+					     bo->used, bytes(&bo->base), bytes(shrink),
 					     bo->base.handle, shrink->handle));
 
-					assert(bo->used <= shrink->size);
+					assert(bo->used <= bytes(shrink));
 					gem_write(kgem->fd, shrink->handle,
 						  0, bo->used, bo->mem);
 
@@ -1330,9 +1352,9 @@ static void kgem_finish_partials(struct kgem *kgem)
 			}
 
 			DBG(("%s: handle=%d, uploading %d/%d\n",
-			     __FUNCTION__, bo->base.handle, bo->used, bo->base.size));
+			     __FUNCTION__, bo->base.handle, bo->used, bytes(&bo->base)));
 			assert(!kgem_busy(kgem, bo->base.handle));
-			assert(bo->used <= bo->base.size);
+			assert(bo->used <= bytes(&bo->base));
 			gem_write(kgem->fd, bo->base.handle,
 				  0, bo->used, bo->mem);
 			bo->need_io = 0;
@@ -1616,7 +1638,7 @@ void _kgem_submit(struct kgem *kgem)
 					       i,
 					       kgem->exec[i].handle,
 					       (int)kgem->exec[i].offset,
-					       found ? found->size : -1,
+					       found ? bytes(found) : -1,
 					       found ? found->tiling : -1,
 					       (int)(kgem->exec[i].flags & EXEC_OBJECT_NEEDS_FENCE),
 					       found ? found->purged : -1);
@@ -1690,7 +1712,7 @@ static void kgem_expire_partial(struct kgem *kgem)
 			continue;
 
 		DBG(("%s: discarding unused partial buffer: %d/%d, write? %d\n",
-		     __FUNCTION__, bo->used, bo->base.size, bo->write));
+		     __FUNCTION__, bo->used, bytes(&bo->base), bo->write));
 		list_del(&bo->base.list);
 		kgem_bo_unref(kgem, &bo->base);
 	}
@@ -1773,7 +1795,7 @@ bool kgem_expire_cache(struct kgem *kgem)
 				list_move_tail(&bo->list, &preserve);
 			} else {
 				count++;
-				size += bo->size;
+				size += bytes(bo);
 				kgem_bo_free(kgem, bo);
 				DBG(("%s: expiring %d\n",
 				     __FUNCTION__, bo->handle));
@@ -1834,28 +1856,31 @@ void kgem_cleanup_cache(struct kgem *kgem)
 }
 
 static struct kgem_bo *
-search_linear_cache(struct kgem *kgem, unsigned int size, unsigned flags)
+search_linear_cache(struct kgem *kgem, unsigned int num_pages, unsigned flags)
 {
 	struct kgem_bo *bo, *first = NULL;
 	bool use_active = (flags & CREATE_INACTIVE) == 0;
 	struct list *cache;
 
+	if (num_pages >= MAX_CACHE_SIZE / PAGE_SIZE)
+		return NULL;
+
 	if (!use_active &&
-	    list_is_empty(inactive(kgem, size)) &&
-	    !list_is_empty(active(kgem, size, I915_TILING_NONE)) &&
+	    list_is_empty(inactive(kgem, num_pages)) &&
+	    !list_is_empty(active(kgem, num_pages, I915_TILING_NONE)) &&
 	    !kgem_retire(kgem))
 		return NULL;
 
 	if (!use_active && flags & (CREATE_CPU_MAP | CREATE_GTT_MAP)) {
 		int for_cpu = !!(flags & CREATE_CPU_MAP);
-		cache = &kgem->vma[for_cpu].inactive[cache_bucket(size)];
+		cache = &kgem->vma[for_cpu].inactive[cache_bucket(num_pages)];
 		list_for_each_entry(bo, cache, vma) {
 			assert(IS_CPU_MAP(bo->map) == for_cpu);
-			assert(bo->bucket == cache_bucket(size));
+			assert(bucket(bo) == cache_bucket(num_pages));
 
-			if (size > bo->size) {
+			if (num_pages > num_pages(bo)) {
 				DBG(("inactive too small: %d < %d\n",
-				     bo->size, size));
+				     num_pages(bo), num_pages));
 				continue;
 			}
 
@@ -1874,8 +1899,8 @@ search_linear_cache(struct kgem *kgem, unsigned int size, unsigned flags)
 			bo->tiling = I915_TILING_NONE;
 			bo->pitch = 0;
 			bo->delta = 0;
-			DBG(("  %s: found handle=%d (size=%d) in linear vma cache\n",
-			     __FUNCTION__, bo->handle, bo->size));
+			DBG(("  %s: found handle=%d (num_pages=%d) in linear vma cache\n",
+			     __FUNCTION__, bo->handle, num_pages(bo)));
 			assert(use_active || bo->domain != DOMAIN_GPU);
 			assert(!bo->needs_flush);
 			//assert(!kgem_busy(kgem, bo->handle));
@@ -1883,13 +1908,13 @@ search_linear_cache(struct kgem *kgem, unsigned int size, unsigned flags)
 		}
 	}
 
-	cache = use_active ? active(kgem, size, I915_TILING_NONE) : inactive(kgem, size);
+	cache = use_active ? active(kgem, num_pages, I915_TILING_NONE) : inactive(kgem, num_pages);
 	list_for_each_entry(bo, cache, list) {
 		assert(bo->refcnt == 0);
 		assert(bo->reusable);
 		assert(!!bo->rq == !!use_active);
 
-		if (size > bo->size)
+		if (num_pages > num_pages(bo))
 			continue;
 
 		if (use_active && bo->tiling != I915_TILING_NONE)
@@ -1946,8 +1971,8 @@ search_linear_cache(struct kgem *kgem, unsigned int size, unsigned flags)
 		assert(bo->tiling == I915_TILING_NONE);
 		bo->pitch = 0;
 		bo->delta = 0;
-		DBG(("  %s: found handle=%d (size=%d) in linear %s cache\n",
-		     __FUNCTION__, bo->handle, bo->size,
+		DBG(("  %s: found handle=%d (num_pages=%d) in linear %s cache\n",
+		     __FUNCTION__, bo->handle, num_pages(bo),
 		     use_active ? "active" : "inactive"));
 		assert(use_active || bo->domain != DOMAIN_GPU);
 		assert(!bo->needs_flush || use_active);
@@ -1965,8 +1990,8 @@ search_linear_cache(struct kgem *kgem, unsigned int size, unsigned flags)
 
 		first->pitch = 0;
 		first->delta = 0;
-		DBG(("  %s: found handle=%d (size=%d) in linear %s cache\n",
-		     __FUNCTION__, first->handle, first->size,
+		DBG(("  %s: found handle=%d (num_pages=%d) in linear %s cache\n",
+		     __FUNCTION__, first->handle, num_pages(first),
 		     use_active ? "active" : "inactive"));
 		assert(use_active || first->domain != DOMAIN_GPU);
 		assert(!first->needs_flush || use_active);
@@ -1990,7 +2015,7 @@ struct kgem_bo *kgem_create_for_name(struct kgem *kgem, uint32_t name)
 		return NULL;
 
 	DBG(("%s: new handle=%d\n", __FUNCTION__, open_arg.handle));
-	bo = __kgem_bo_alloc(open_arg.handle, open_arg.size);
+	bo = __kgem_bo_alloc(open_arg.handle, open_arg.size / PAGE_SIZE);
 	if (bo == NULL) {
 		gem_close(kgem->fd, open_arg.handle);
 		return NULL;
@@ -2007,7 +2032,7 @@ struct kgem_bo *kgem_create_linear(struct kgem *kgem, int size)
 
 	DBG(("%s(%d)\n", __FUNCTION__, size));
 
-	size = PAGE_ALIGN(size);
+	size = (size + PAGE_SIZE - 1) / PAGE_SIZE;
 	bo = search_linear_cache(kgem, size, CREATE_INACTIVE);
 	if (bo)
 		return kgem_bo_reference(bo);
@@ -2019,7 +2044,7 @@ struct kgem_bo *kgem_create_linear(struct kgem *kgem, int size)
 	DBG(("%s: new handle=%d\n", __FUNCTION__, handle));
 	bo = __kgem_bo_alloc(handle, size);
 	if (bo == NULL) {
-		gem_close(kgem->fd, size);
+		gem_close(kgem->fd, handle);
 		return NULL;
 	}
 
@@ -2028,8 +2053,6 @@ struct kgem_bo *kgem_create_linear(struct kgem *kgem, int size)
 
 int kgem_choose_tiling(struct kgem *kgem, int tiling, int width, int height, int bpp)
 {
-	uint32_t pitch;
-
 	if (DBG_NO_TILING)
 		return tiling < 0 ? tiling : I915_TILING_NONE;
 
@@ -2058,17 +2081,6 @@ int kgem_choose_tiling(struct kgem *kgem, int tiling, int width, int height, int
 		}
 	}
 
-	/* First check that we can fence the whole object */
-	if (tiling &&
-	    kgem_surface_size(kgem, false, false,
-			      width, height, bpp, tiling,
-			      &pitch) > kgem->max_gpu_size) {
-		DBG(("%s: too large (%dx%d) to be fenced, discarding tiling\n",
-		     __FUNCTION__, width, height));
-		tiling = I915_TILING_NONE;
-		goto done;
-	}
-
 	if (tiling < 0)
 		return tiling;
 
@@ -2125,18 +2137,42 @@ done:
 	return tiling;
 }
 
-bool kgem_can_create_cpu(struct kgem *kgem,
+bool kgem_can_create_2d(struct kgem *kgem,
 			 int width, int height, int depth)
 {
+	int bpp = BitsPerPixel(depth);
 	uint32_t pitch, size;
 
 	if (depth < 8 || kgem->wedged)
 		return false;
 
 	size = kgem_surface_size(kgem, false, false,
-				 width, height, BitsPerPixel(depth),
+				 width, height, bpp,
+				 I915_TILING_X, &pitch);
+	if (size > 0 && size <= kgem->max_object_size)
+		return true;
+
+	size = kgem_surface_size(kgem, false, false,
+				 width, height, bpp,
 				 I915_TILING_NONE, &pitch);
-	return size > 0 && size < kgem->max_cpu_size;
+	if (size > 0 && size <= kgem->max_object_size)
+		return true;
+
+	return false;
+}
+
+bool kgem_can_create_cpu(struct kgem *kgem,
+			 int width, int height, int bpp)
+{
+	uint32_t pitch, size;
+
+	if (bpp < 8 || kgem->wedged)
+		return false;
+
+	size = kgem_surface_size(kgem, false, false,
+				 width, height, bpp,
+				 I915_TILING_NONE, &pitch);
+	return size > 0 && size <= kgem->max_cpu_size;
 }
 
 static bool _kgem_can_create_gpu(struct kgem *kgem,
@@ -2179,7 +2215,7 @@ inline int kgem_bo_fenced_size(struct kgem *kgem, struct kgem_bo *bo)
 		size = 512 * 1024;
 	else
 		size = 1024 * 1024;
-	while (size < bo->size)
+	while (size < bytes(bo))
 		size *= 2;
 
 	return size;
@@ -2213,10 +2249,52 @@ struct kgem_bo *kgem_create_2d(struct kgem *kgem,
 				 kgem->has_relaxed_fencing,
 				 flags & CREATE_SCANOUT,
 				 width, height, bpp, tiling, &pitch);
-	assert(size && size < kgem->max_cpu_size);
-	assert(tiling == I915_TILING_NONE || size < kgem->max_gpu_size);
+	assert(size && size <= kgem->max_object_size);
+	size /= PAGE_SIZE;
 	bucket = cache_bucket(size);
 
+	if (bucket >= NUM_CACHE_BUCKETS) {
+		DBG(("%s: large bo num pages=%d, bucket=%d\n",
+		     __FUNCTION__, size, bucket));
+
+		if (flags & CREATE_INACTIVE)
+			goto create;
+
+		tiled_height = kgem_aligned_height(kgem, height, I915_TILING_Y);
+		untiled_pitch = kgem_untiled_pitch(kgem,
+						   width, bpp,
+						   flags & CREATE_SCANOUT);
+
+		list_for_each_entry(bo, &kgem->large, list) {
+			assert(!bo->purged);
+			assert(bo->refcnt == 0);
+			assert(bo->reusable);
+
+			if (bo->tiling) {
+				if (bo->pitch < pitch) {
+					DBG(("tiled and pitch too small: tiling=%d, (want %d), pitch=%d, need %d\n",
+					     bo->tiling, tiling,
+					     bo->pitch, pitch));
+					continue;
+				}
+			} else
+				bo->pitch = untiled_pitch;
+
+			if (bo->pitch * tiled_height > bytes(bo))
+				continue;
+
+			kgem_bo_remove_from_active(kgem, bo);
+
+			bo->unique_id = kgem_get_unique_id(kgem);
+			bo->delta = 0;
+			DBG(("  1:from active: pitch=%d, tiling=%d, handle=%d, id=%d\n",
+			     bo->pitch, bo->tiling, bo->handle, bo->unique_id));
+			return kgem_bo_reference(bo);
+		}
+
+		goto create;
+	}
+
 	if (flags & (CREATE_CPU_MAP | CREATE_GTT_MAP)) {
 		int for_cpu = !!(flags & CREATE_CPU_MAP);
 		if (kgem->has_llc && tiling == I915_TILING_NONE)
@@ -2227,16 +2305,16 @@ struct kgem_bo *kgem_create_2d(struct kgem *kgem,
 		cache = &kgem->vma[for_cpu].inactive[bucket];
 		do {
 			list_for_each_entry(bo, cache, vma) {
-				assert(bo->bucket == bucket);
+				assert(bucket(bo) == bucket);
 				assert(bo->refcnt == 0);
 				assert(bo->map);
 				assert(IS_CPU_MAP(bo->map) == for_cpu);
 				assert(bo->rq == NULL);
 				assert(list_is_empty(&bo->request));
 
-				if (size > bo->size) {
+				if (size > num_pages(bo)) {
 					DBG(("inactive too small: %d < %d\n",
-					     bo->size, size));
+					     num_pages(bo), size));
 					continue;
 				}
 
@@ -2275,13 +2353,14 @@ struct kgem_bo *kgem_create_2d(struct kgem *kgem,
 	if (retry > 3)
 		retry = 3;
 search_again:
+	assert(bucket < NUM_CACHE_BUCKETS);
 	cache = &kgem->active[bucket][tiling];
 	if (tiling) {
 		tiled_height = kgem_aligned_height(kgem, height, tiling);
 		list_for_each_entry(bo, cache, list) {
 			assert(!bo->purged);
 			assert(bo->refcnt == 0);
-			assert(bo->bucket == bucket);
+			assert(bucket(bo) == bucket);
 			assert(bo->reusable);
 			assert(bo->tiling == tiling);
 
@@ -2292,7 +2371,7 @@ search_again:
 				continue;
 			}
 
-			if (bo->pitch * tiled_height > bo->size)
+			if (bo->pitch * tiled_height > bytes(bo))
 				continue;
 
 			kgem_bo_remove_from_active(kgem, bo);
@@ -2305,13 +2384,13 @@ search_again:
 		}
 	} else {
 		list_for_each_entry(bo, cache, list) {
-			assert(bo->bucket == bucket);
+			assert(bucket(bo) == bucket);
 			assert(!bo->purged);
 			assert(bo->refcnt == 0);
 			assert(bo->reusable);
 			assert(bo->tiling == tiling);
 
-			if (bo->size < size)
+			if (num_pages(bo) < size)
 				continue;
 
 			kgem_bo_remove_from_active(kgem, bo);
@@ -2340,7 +2419,7 @@ search_again:
 							 kgem->has_relaxed_fencing,
 							 flags & CREATE_SCANOUT,
 							 width, height, bpp, tiling, &pitch);
-			cache = active(kgem, tiled_height, i);
+			cache = active(kgem, tiled_height / PAGE_SIZE, i);
 			tiled_height = kgem_aligned_height(kgem, height, i);
 			list_for_each_entry(bo, cache, list) {
 				assert(!bo->purged);
@@ -2357,7 +2436,7 @@ search_again:
 				} else
 					bo->pitch = untiled_pitch;
 
-				if (bo->pitch * tiled_height > bo->size)
+				if (bo->pitch * tiled_height > bytes(bo))
 					continue;
 
 				kgem_bo_remove_from_active(kgem, bo);
@@ -2378,13 +2457,14 @@ skip_active_search:
 		retry = 3;
 search_inactive:
 	/* Now just look for a close match and prefer any currently active */
+	assert(bucket < NUM_CACHE_BUCKETS);
 	cache = &kgem->inactive[bucket];
 	list_for_each_entry_safe(bo, next, cache, list) {
-		assert(bo->bucket == bucket);
+		assert(bucket(bo) == bucket);
 
-		if (size > bo->size) {
+		if (size > num_pages(bo)) {
 			DBG(("inactive too small: %d < %d\n",
-			     bo->size, size));
+			     num_pages(bo), size));
 			continue;
 		}
 
@@ -2439,6 +2519,7 @@ search_inactive:
 		goto search_inactive;
 	}
 
+create:
 	handle = gem_create(kgem->fd, size);
 	if (handle == 0)
 		return NULL;
@@ -2455,7 +2536,7 @@ search_inactive:
 	if (tiling != I915_TILING_NONE)
 		bo->tiling = gem_set_tiling(kgem->fd, handle, tiling, pitch);
 
-	assert(bo->size >= bo->pitch * kgem_aligned_height(kgem, height, bo->tiling));
+	assert(bytes(bo) >= bo->pitch * kgem_aligned_height(kgem, height, bo->tiling));
 
 	DBG(("  new pitch=%d, tiling=%d, handle=%d, id=%d\n",
 	     bo->pitch, bo->tiling, bo->handle, bo->unique_id));
@@ -2470,9 +2551,9 @@ static void _kgem_bo_delete_partial(struct kgem *kgem, struct kgem_bo *bo)
 		return;
 
 	DBG(("%s: size=%d, offset=%d, parent used=%d\n",
-	     __FUNCTION__, bo->size, bo->delta, io->used));
+	     __FUNCTION__, bo->size.bytes, bo->delta, io->used));
 
-	if (bo->delta + bo->size == io->used) {
+	if (bo->delta + bo->size.bytes == io->used) {
 		io->used = bo->delta;
 		bubble_sort_partial(kgem, io);
 	}
@@ -2508,25 +2589,30 @@ bool kgem_check_bo(struct kgem *kgem, ...)
 	va_list ap;
 	struct kgem_bo *bo;
 	int num_exec = 0;
-	int size = 0;
+	int num_pages = 0;
 
 	va_start(ap, kgem);
 	while ((bo = va_arg(ap, struct kgem_bo *))) {
 		if (bo->exec)
 			continue;
 
-		size += bo->size;
+		if (bo->proxy) {
+			bo = bo->proxy;
+			if (bo->exec)
+				continue;
+		}
+		num_pages += num_pages(bo);
 		num_exec++;
 	}
 	va_end(ap);
 
-	if (!size)
+	if (!num_pages)
 		return true;
 
 	if (kgem->aperture > kgem->aperture_low)
 		return false;
 
-	if (size + kgem->aperture > kgem->aperture_high)
+	if (num_pages + kgem->aperture > kgem->aperture_high)
 		return false;
 
 	if (kgem->nexec + num_exec >= KGEM_EXEC_SIZE(kgem))
@@ -2541,11 +2627,13 @@ bool kgem_check_bo_fenced(struct kgem *kgem, ...)
 	struct kgem_bo *bo;
 	int num_fence = 0;
 	int num_exec = 0;
-	int size = 0;
+	int num_pages = 0;
 	int fenced_size = 0;
 
 	va_start(ap, kgem);
 	while ((bo = va_arg(ap, struct kgem_bo *))) {
+		if (bo->proxy)
+			bo = bo->proxy;
 		if (bo->exec) {
 			if (kgem->gen >= 40 || bo->tiling == I915_TILING_NONE)
 				continue;
@@ -2558,7 +2646,7 @@ bool kgem_check_bo_fenced(struct kgem *kgem, ...)
 			continue;
 		}
 
-		size += bo->size;
+		num_pages += num_pages(bo);
 		num_exec++;
 		if (kgem->gen < 40 && bo->tiling) {
 			fenced_size += kgem_bo_fenced_size(kgem, bo);
@@ -2573,13 +2661,13 @@ bool kgem_check_bo_fenced(struct kgem *kgem, ...)
 	if (kgem->nfence + num_fence > kgem->fence_max)
 		return false;
 
-	if (!size)
+	if (!num_pages)
 		return true;
 
 	if (kgem->aperture > kgem->aperture_low)
 		return false;
 
-	if (size + kgem->aperture > kgem->aperture_high)
+	if (num_pages + kgem->aperture > kgem->aperture_high)
 		return false;
 
 	if (kgem->nexec + num_exec >= KGEM_EXEC_SIZE(kgem))
@@ -2698,7 +2786,7 @@ static void kgem_trim_vma_cache(struct kgem *kgem, int type, int bucket)
 		assert(bo->rq == NULL);
 
 		VG(if (type) VALGRIND_FREELIKE_BLOCK(CPU_MAP(bo->map), 0));
-		munmap(CPU_MAP(bo->map), bo->size);
+		munmap(CPU_MAP(bo->map), bytes(bo));
 		bo->map = NULL;
 		list_del(&bo->vma);
 		kgem->vma[type].count--;
@@ -2736,11 +2824,11 @@ void *kgem_bo_map(struct kgem *kgem, struct kgem_bo *bo)
 
 	ptr = bo->map;
 	if (ptr == NULL) {
-		assert(bo->size <= kgem->aperture_mappable / 4);
+		assert(bytes(bo) <= kgem->aperture_mappable / 4);
 
-		kgem_trim_vma_cache(kgem, MAP_GTT, bo->bucket);
+		kgem_trim_vma_cache(kgem, MAP_GTT, bucket(bo));
 
-		ptr = gem_mmap(kgem->fd, bo->handle, bo->size,
+		ptr = gem_mmap(kgem->fd, bo->handle, bytes(bo),
 			       PROT_READ | PROT_WRITE);
 		if (ptr == NULL)
 			return NULL;
@@ -2780,8 +2868,8 @@ void *kgem_bo_map__debug(struct kgem *kgem, struct kgem_bo *bo)
 	if (bo->map)
 		return bo->map;
 
-	kgem_trim_vma_cache(kgem, MAP_GTT, bo->bucket);
-	return bo->map = gem_mmap(kgem->fd, bo->handle, bo->size,
+	kgem_trim_vma_cache(kgem, MAP_GTT, bucket(bo));
+	return bo->map = gem_mmap(kgem->fd, bo->handle, bytes(bo),
 				  PROT_READ | PROT_WRITE);
 }
 
@@ -2789,7 +2877,7 @@ void *kgem_bo_map__cpu(struct kgem *kgem, struct kgem_bo *bo)
 {
 	struct drm_i915_gem_mmap mmap_arg;
 
-	DBG(("%s(handle=%d, size=%d)\n", __FUNCTION__, bo->handle, bo->size));
+	DBG(("%s(handle=%d, size=%d)\n", __FUNCTION__, bo->handle, bytes(bo)));
 	assert(!bo->purged);
 	assert(list_is_empty(&bo->list));
 
@@ -2799,18 +2887,19 @@ void *kgem_bo_map__cpu(struct kgem *kgem, struct kgem_bo *bo)
 	if (bo->map)
 		kgem_bo_release_map(kgem, bo);
 
-	kgem_trim_vma_cache(kgem, MAP_CPU, bo->bucket);
+	kgem_trim_vma_cache(kgem, MAP_CPU, bucket(bo));
 
 	VG_CLEAR(mmap_arg);
 	mmap_arg.handle = bo->handle;
 	mmap_arg.offset = 0;
-	mmap_arg.size = bo->size;
+	mmap_arg.size = bytes(bo);
 	if (drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_MMAP, &mmap_arg)) {
-		assert(0);
+		ErrorF("%s: failed to mmap %d, %d bytes, into CPU domain\n",
+		       __FUNCTION__, bo->handle, bytes(bo));
 		return NULL;
 	}
 
-	VG(VALGRIND_MALLOCLIKE_BLOCK(mmap_arg.addr_ptr, bo->size, 0, 1));
+	VG(VALGRIND_MALLOCLIKE_BLOCK(mmap_arg.addr_ptr, bytes(bo), 0, 1));
 
 	DBG(("%s: caching CPU vma for %d\n", __FUNCTION__, bo->handle));
 	bo->map = MAKE_CPU_MAP(mmap_arg.addr_ptr);
@@ -2876,6 +2965,9 @@ struct kgem_bo *kgem_create_map(struct kgem *kgem,
 	if (!kgem->has_vmap)
 		return NULL;
 
+	if (size >= MAX_CACHE_SIZE)
+		return NULL;
+
 	handle = gem_vmap(kgem->fd, ptr, size, read_only);
 	if (handle == 0)
 		return NULL;
@@ -2972,6 +3064,7 @@ struct kgem_bo *kgem_create_proxy(struct kgem_bo *target,
 	if (bo == NULL)
 		return NULL;
 
+	bo->size.bytes = length;
 	bo->io = target->io;
 	bo->dirty = target->dirty;
 	bo->tiling = target->tiling;
@@ -2982,11 +3075,11 @@ struct kgem_bo *kgem_create_proxy(struct kgem_bo *target,
 	return bo;
 }
 
-static struct kgem_partial_bo *partial_bo_alloc(int size)
+static struct kgem_partial_bo *partial_bo_alloc(int num_pages)
 {
 	struct kgem_partial_bo *bo;
 
-	bo = malloc(sizeof(*bo) + 128 + size);
+	bo = malloc(sizeof(*bo) + 128 + num_pages * PAGE_SIZE);
 	if (bo) {
 		bo->mem = (void *)ALIGN((uintptr_t)bo + sizeof(*bo), 64);
 		bo->mmapped = false;
@@ -3010,20 +3103,20 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 	     !!(flags & KGEM_BUFFER_LAST)));
 	assert(size);
 	/* we should never be asked to create anything TOO large */
-	assert(size < kgem->max_cpu_size);
+	assert(size <= kgem->max_cpu_size);
 
 	list_for_each_entry(bo, &kgem->partial, base.list) {
 		if (flags == KGEM_BUFFER_LAST && bo->write) {
 			/* We can reuse any write buffer which we can fit */
-			if (size <= bo->base.size) {
+			if (size <= bytes(&bo->base)) {
 				if (bo->base.refcnt == 1 && bo->base.exec) {
 					DBG(("%s: reusing write buffer for read of %d bytes? used=%d, total=%d\n",
-					     __FUNCTION__, size, bo->used, bo->base.size));
+					     __FUNCTION__, size, bo->used, bytes(&bo->base)));
 					offset = 0;
 					goto done;
-				} else if (bo->used + size <= bo->base.size) {
+				} else if (bo->used + size <= bytes(&bo->base)) {
 					DBG(("%s: reusing unfinished write buffer for read of %d bytes? used=%d, total=%d\n",
-					     __FUNCTION__, size, bo->used, bo->base.size));
+					     __FUNCTION__, size, bo->used, bytes(&bo->base)));
 					offset = bo->used;
 					goto done;
 				}
@@ -3037,24 +3130,25 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 			continue;
 		}
 
-		if (bo->used + size <= bo->base.size) {
+		if (bo->used + size <= bytes(&bo->base)) {
 			DBG(("%s: reusing partial buffer? used=%d + size=%d, total=%d\n",
-			     __FUNCTION__, bo->used, size, bo->base.size));
+			     __FUNCTION__, bo->used, size, bytes(&bo->base)));
 			offset = bo->used;
 			bo->used += size;
 			goto done;
 		}
 
 		DBG(("%s: too small (%d < %d)\n",
-		     __FUNCTION__, bo->base.size - bo->used, size));
+		     __FUNCTION__, bytes(&bo->base) - bo->used, size));
 		break;
 	}
 
 #if !DBG_NO_MAP_UPLOAD
 	/* Be a little more generous and hope to hold fewer mmappings */
 	alloc = ALIGN(2*size, kgem->partial_buffer_size);
-	if (alloc >= kgem->max_cpu_size)
+	if (alloc > kgem->max_gpu_size)
 		alloc = PAGE_ALIGN(size);
+	alloc /= PAGE_SIZE;
 	if (kgem->has_cpu_bo) {
 		bo = malloc(sizeof(*bo));
 		if (bo == NULL)
@@ -3098,7 +3192,7 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 			bo->base.io = true;
 			bo->mmapped = true;
 
-			alloc = bo->base.size;
+			alloc = num_pages(&bo->base);
 			goto init;
 		} else {
 			bo->base.refcnt = 0; /* for valgrind */
@@ -3107,7 +3201,7 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 		}
 	}
 
-	if (alloc > kgem->aperture_mappable / 4)
+	if (PAGE_SIZE * alloc > kgem->aperture_mappable / 4)
 		flags &= ~KGEM_BUFFER_INPLACE;
 
 	if ((flags & KGEM_BUFFER_WRITE_INPLACE) == KGEM_BUFFER_WRITE_INPLACE) {
@@ -3164,7 +3258,7 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 				bo->mmapped = true;
 				bo->base.refcnt = 1;
 
-				alloc = bo->base.size;
+				alloc = num_pages(&bo->base);
 				goto init;
 			} else {
 				kgem_bo_free(kgem, &bo->base);
@@ -3173,11 +3267,11 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 		}
 	}
 #else
-	alloc = ALIGN(size, 64*1024);
+	alloc = ALIGN(size, 64*1024) / PAGE_SIZE;
 #endif
 	/* Be more parsimonious with pwrite/pread buffers */
 	if ((flags & KGEM_BUFFER_INPLACE) == 0)
-		alloc = PAGE_ALIGN(size);
+		alloc = PAGE_ALIGN(size) / PAGE_SIZE;
 	flags &= ~KGEM_BUFFER_INPLACE;
 
 	old = NULL;
@@ -3188,7 +3282,7 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 	if (old) {
 		DBG(("%s: reusing ordinary handle %d for io\n",
 		     __FUNCTION__, old->handle));
-		alloc = old->size;
+		alloc = num_pages(old);
 		bo = partial_bo_alloc(alloc);
 		if (bo == NULL)
 			return NULL;
@@ -3240,21 +3334,40 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 		}
 
 		bo->mem = kgem_bo_map__cpu(kgem, &bo->base);
-		if (bo->mem == NULL) {
-			kgem_bo_free(kgem, &bo->base);
+		if (bo->mem != NULL) {
+			if (flags & KGEM_BUFFER_WRITE)
+				kgem_bo_sync__cpu(kgem, &bo->base);
+
+			bo->need_io = false;
+			bo->base.io = true;
+			bo->mmapped = true;
+			goto init;
+		}
+
+		DBG(("%s: falling back to new pwrite buffer\n", __FUNCTION__));
+		old = &bo->base;
+		bo = partial_bo_alloc(alloc);
+		if (bo == NULL) {
+			free(old);
 			return NULL;
 		}
 
-		if (flags & KGEM_BUFFER_WRITE)
-			kgem_bo_sync__cpu(kgem, &bo->base);
+		memcpy(&bo->base, old, sizeof(*old));
+		free(old);
+
+		assert(bo->mem);
+		assert(!bo->mmapped);
 
-		bo->need_io = false;
+		list_init(&bo->base.request);
+		list_init(&bo->base.vma);
+		list_init(&bo->base.list);
+		bo->base.refcnt = 1;
+		bo->need_io = flags & KGEM_BUFFER_WRITE;
 		bo->base.io = true;
-		bo->mmapped = true;
 	}
 init:
 	bo->base.reusable = false;
-	assert(bo->base.size == alloc);
+	assert(num_pages(&bo->base) == alloc);
 	assert(!bo->need_io || !bo->base.needs_flush);
 	assert(!bo->need_io || bo->base.domain != DOMAIN_GPU);
 
@@ -3263,12 +3376,12 @@ init:
 	offset = 0;
 
 	list_add(&bo->base.list, &kgem->partial);
-	DBG(("%s(size=%d) new handle=%d\n",
+	DBG(("%s(pages=%d) new handle=%d\n",
 	     __FUNCTION__, alloc, bo->base.handle));
 
 done:
 	/* adjust the position within the list to maintain decreasing order */
-	alloc = bo->base.size - bo->used;
+	alloc = bytes(&bo->base) - bo->used;
 	{
 		struct kgem_partial_bo *p, *first;
 
@@ -3276,9 +3389,9 @@ done:
 					     struct kgem_partial_bo,
 					     base.list);
 		while (&p->base.list != &kgem->partial &&
-		       alloc < p->base.size - p->used) {
+		       alloc < bytes(&p->base) - p->used) {
 			DBG(("%s: this=%d, right=%d\n",
-			     __FUNCTION__, alloc, p->base.size -p->used));
+			     __FUNCTION__, alloc, bytes(&p->base) - p->used));
 			p = list_first_entry(&p->base.list,
 					     struct kgem_partial_bo,
 					     base.list);
@@ -3287,6 +3400,7 @@ done:
 			list_move_tail(&bo->base.list, &p->base.list);
 		assert(validate_partials(kgem));
 	}
+	assert(bo->mem);
 	*ret = (char *)bo->mem + offset;
 	return kgem_create_proxy(&bo->base, offset, size);
 }
@@ -3300,6 +3414,7 @@ struct kgem_bo *kgem_create_buffer_2d(struct kgem *kgem,
 	int stride;
 
 	assert(width > 0 && height > 0);
+	assert(ret != NULL);
 	stride = ALIGN(width, 2) * bpp >> 3;
 	stride = ALIGN(stride, kgem->min_alignment);
 
@@ -3307,8 +3422,12 @@ struct kgem_bo *kgem_create_buffer_2d(struct kgem *kgem,
 	     __FUNCTION__, width, height, bpp, stride));
 
 	bo = kgem_create_buffer(kgem, stride * ALIGN(height, 2), flags, ret);
-	if (bo == NULL)
+	if (bo == NULL) {
+		DBG(("%s: allocation failure for upload buffer\n",
+		     __FUNCTION__));
 		return NULL;
+	}
+	assert(*ret != NULL);
 
 	if (height & 1) {
 		struct kgem_partial_bo *io = (struct kgem_partial_bo *)bo->proxy;
@@ -3319,7 +3438,7 @@ struct kgem_bo *kgem_create_buffer_2d(struct kgem *kgem,
 		 */
 		if (io->used)
 			io->used -= stride;
-		bo->size -= stride;
+		bo->size.bytes -= stride;
 		bubble_sort_partial(kgem, io);
 	}
 
@@ -3357,8 +3476,9 @@ struct kgem_bo *kgem_upload_source_image(struct kgem *kgem,
 void kgem_buffer_read_sync(struct kgem *kgem, struct kgem_bo *_bo)
 {
 	struct kgem_partial_bo *bo;
-	uint32_t offset = _bo->delta, length = _bo->size;
+	uint32_t offset = _bo->delta, length = _bo->size.bytes;
 
+	assert(_bo->io);
 	assert(_bo->exec == NULL);
 	if (_bo->proxy)
 		_bo = _bo->proxy;
@@ -3461,7 +3581,7 @@ kgem_replace_bo(struct kgem *kgem,
 	assert(src->tiling == I915_TILING_NONE);
 
 	size = height * pitch;
-	size = PAGE_ALIGN(size);
+	size = PAGE_ALIGN(size) / PAGE_SIZE;
 
 	dst = search_linear_cache(kgem, size, 0);
 	if (dst == NULL)
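
With sizes now tracked in pages, the cache bucket is simply floor(log2(num_pages)); any bo whose bucket reaches NUM_CACHE_BUCKETS (16) bypasses the bucketed caches and lives on the new kgem->large list instead. A portable sketch of the bucketing, using a GCC builtin in place of the driver's inline bsr assembly:

    #include <stdio.h>

    static int cache_bucket(unsigned long num_pages)
    {
        /* index of the highest set bit == floor(log2(num_pages)) */
        return 8 * sizeof(num_pages) - 1 - __builtin_clzl(num_pages);
    }

    int main(void)
    {
        unsigned long pages;

        /* 1, 16, 256, 4096, 65536 pages -> buckets 0, 4, 8, 12, 16 */
        for (pages = 1; pages <= 1ul << 16; pages <<= 4)
            printf("%6lu pages -> bucket %d\n",
                   pages, cache_bucket(pages));
        return 0;
    }
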
diff --git a/src/sna/kgem.h b/src/sna/kgem.h
index 0dc67da..2631e81 100644
--- a/src/sna/kgem.h
+++ b/src/sna/kgem.h
@@ -66,10 +66,16 @@ struct kgem_bo {
 	uint32_t handle;
 	uint32_t presumed_offset;
 	uint32_t delta;
-	uint32_t size:28;
-	uint32_t bucket:4;
-#define MAX_OBJECT_SIZE (1 << 28)
-
+	union {
+		struct {
+			uint32_t count:27;
+#define PAGE_SIZE 4096
+			uint32_t bucket:5;
+#define NUM_CACHE_BUCKETS 16
+#define MAX_CACHE_SIZE (1 << (NUM_CACHE_BUCKETS+12))
+		} pages;
+		uint32_t bytes;
+	} size;
 	uint32_t pitch : 18; /* max 128k */
 	uint32_t tiling : 2;
 	uint32_t reusable : 1;
@@ -100,8 +106,6 @@ enum {
 	NUM_MAP_TYPES,
 };
 
-#define NUM_CACHE_BUCKETS 16
-
 struct kgem {
 	int fd;
 	int wedged;
@@ -117,7 +121,10 @@ struct kgem {
 		KGEM_BLT,
 	} mode, ring;
 
-	struct list flushing, active[NUM_CACHE_BUCKETS][3], inactive[NUM_CACHE_BUCKETS];
+	struct list flushing;
+	struct list large;
+	struct list active[NUM_CACHE_BUCKETS][3];
+	struct list inactive[NUM_CACHE_BUCKETS];
 	struct list partial;
 	struct list requests;
 	struct kgem_request *next_request;
@@ -154,7 +161,7 @@ struct kgem {
 	uint32_t aperture_total, aperture_high, aperture_low, aperture_mappable;
 	uint32_t aperture, aperture_fenced;
 	uint32_t min_alignment;
-	uint32_t max_gpu_size, max_cpu_size;
+	uint32_t max_gpu_size, max_cpu_size, max_object_size;
 	uint32_t partial_buffer_size;
 
 	void (*context_switch)(struct kgem *kgem, int new_mode);
@@ -194,8 +201,9 @@ struct kgem_bo *kgem_upload_source_image(struct kgem *kgem,
 
 int kgem_choose_tiling(struct kgem *kgem,
 		       int tiling, int width, int height, int bpp);
+bool kgem_can_create_2d(struct kgem *kgem, int width, int height, int depth);
 bool kgem_can_create_gpu(struct kgem *kgem, int width, int height, int bpp);
-bool kgem_can_create_cpu(struct kgem *kgem, int width, int height, int depth);
+bool kgem_can_create_cpu(struct kgem *kgem, int width, int height, int bpp);
 
 struct kgem_bo *
 kgem_replace_bo(struct kgem *kgem,
@@ -354,11 +362,46 @@ Bool kgem_bo_write(struct kgem *kgem, struct kgem_bo *bo,
 
 int kgem_bo_fenced_size(struct kgem *kgem, struct kgem_bo *bo);
 
+static inline int kgem_bo_size(struct kgem_bo *bo)
+{
+	assert(!(bo->proxy && bo->io));
+	return PAGE_SIZE * bo->size.pages.count;
+}
+
+static inline int kgem_buffer_size(struct kgem_bo *bo)
+{
+	assert(bo->proxy && bo->io);
+	return bo->size.bytes;
+}
+
+static inline bool kgem_bo_can_blt(struct kgem *kgem,
+				   struct kgem_bo *bo)
+{
+	int pitch;
+
+	if (bo->tiling == I915_TILING_Y) {
+		DBG(("%s: can not blt to handle=%d, tiling=Y\n",
+		     __FUNCTION__, bo->handle));
+		return false;
+	}
+
+	pitch = bo->pitch;
+	if (kgem->gen >= 40 && bo->tiling)
+		pitch /= 4;
+	if (pitch > MAXSHORT) {
+		DBG(("%s: can not blt to handle=%d, adjusted pitch=%d\n",
+		     __FUNCTION__, bo->handle, pitch));
+		return false;
+	}
+
+	return true;
+}
+
 static inline bool kgem_bo_is_mappable(struct kgem *kgem,
 				       struct kgem_bo *bo)
 {
 	DBG_HDR(("%s: domain=%d, offset: %d size: %d\n",
-		 __FUNCTION__, bo->domain, bo->presumed_offset, bo->size));
+		 __FUNCTION__, bo->domain, bo->presumed_offset, kgem_bo_size(bo)));
 
 	if (bo->domain == DOMAIN_GTT)
 		return true;
@@ -371,9 +414,9 @@ static inline bool kgem_bo_is_mappable(struct kgem *kgem,
 		return false;
 
 	if (!bo->presumed_offset)
-		return bo->size <= kgem->aperture_mappable / 4;
+		return kgem_bo_size(bo) <= kgem->aperture_mappable / 4;
 
-	return bo->presumed_offset + bo->size <= kgem->aperture_mappable;
+	return bo->presumed_offset + kgem_bo_size(bo) <= kgem->aperture_mappable;
 }
 
 static inline bool kgem_bo_mapped(struct kgem_bo *bo)
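
The union above is the heart of this commit: ordinary bos store their size as a page count plus cache bucket, while io proxies reuse the same 32 bits as an exact byte count, with kgem_bo_size()/kgem_buffer_size() asserting which interpretation is valid. A condensed, self-contained sketch of the scheme; names are prefixed to avoid clashing with the real headers and the demo values are illustrative:

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define SKETCH_PAGE_SIZE 4096

    struct sketch_bo {
        union {
            struct {
                uint32_t count:27;
                uint32_t bucket:5;
            } pages;
            uint32_t bytes;
        } size;
        bool proxy, io;
    };

    static int sketch_bo_size(struct sketch_bo *bo)
    {
        assert(!(bo->proxy && bo->io));   /* pages view is valid */
        return SKETCH_PAGE_SIZE * bo->size.pages.count;
    }

    static int sketch_buffer_size(struct sketch_bo *bo)
    {
        assert(bo->proxy && bo->io);      /* bytes view is valid */
        return bo->size.bytes;
    }

    int main(void)
    {
        struct sketch_bo gpu = { .size.pages.count = 256 };
        struct sketch_bo upload = { .size.bytes = 300000,
                                    .proxy = true, .io = true };

        printf("bo: %d bytes, upload proxy: %d bytes\n",
               sketch_bo_size(&gpu), sketch_buffer_size(&upload));
        return 0;
    }
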
diff --git a/src/sna/kgem_debug_gen5.c b/src/sna/kgem_debug_gen5.c
index f21220f..9e7360a 100644
--- a/src/sna/kgem_debug_gen5.c
+++ b/src/sna/kgem_debug_gen5.c
@@ -79,7 +79,7 @@ static void gen5_update_vertex_buffer(struct kgem *kgem, const uint32_t *data)
 	} else {
 		bo = kgem_debug_get_bo_for_reloc_entry(kgem, reloc);
 		base = kgem_bo_map__debug(kgem, bo);
-		size = bo->size;
+		size = kgem_bo_size(bo);
 	}
 	ptr = (char *)base + reloc->delta;
 
diff --git a/src/sna/sna.h b/src/sna/sna.h
index 5910daf..d9ba773 100644
--- a/src/sna/sna.h
+++ b/src/sna/sna.h
@@ -92,7 +92,6 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #define DEBUG_NO_RENDER 0
 #define DEBUG_NO_BLT 0
-#define DEBUG_NO_IO 0
 
 #define DEBUG_FLUSH_BATCH 0
 #define DEBUG_FLUSH_SYNC 0
@@ -647,7 +646,7 @@ void sna_read_boxes(struct sna *sna,
 		    struct kgem_bo *src_bo, int16_t src_dx, int16_t src_dy,
 		    PixmapPtr dst, int16_t dst_dx, int16_t dst_dy,
 		    const BoxRec *box, int n);
-void sna_write_boxes(struct sna *sna, PixmapPtr dst,
+bool sna_write_boxes(struct sna *sna, PixmapPtr dst,
 		     struct kgem_bo *dst_bo, int16_t dst_dx, int16_t dst_dy,
 		     const void *src, int stride, int16_t src_dx, int16_t src_dy,
 		     const BoxRec *box, int n);
@@ -657,10 +656,10 @@ void sna_write_boxes__xor(struct sna *sna, PixmapPtr dst,
 			  const BoxRec *box, int nbox,
 			  uint32_t and, uint32_t or);
 
-struct kgem_bo *sna_replace(struct sna *sna,
-			    PixmapPtr pixmap,
-			    struct kgem_bo *bo,
-			    const void *src, int stride);
+bool sna_replace(struct sna *sna,
+		 PixmapPtr pixmap,
+		 struct kgem_bo **bo,
+		 const void *src, int stride);
 struct kgem_bo *sna_replace__xor(struct sna *sna,
 				 PixmapPtr pixmap,
 				 struct kgem_bo *bo,
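
sna_replace() changing from returning a bo to taking struct kgem_bo ** and returning a bool is what lets the callers in sna_accel.c below fall back cleanly: on failure the caller's bo pointer is left untouched. A small sketch of that in/out-pointer pattern, with hypothetical alloc_buffer()/upload() helpers standing in for the driver internals:

    #include <stdbool.h>
    #include <stdlib.h>

    struct buffer { char *data; };

    static struct buffer *alloc_buffer(void)
    {
        return calloc(1, sizeof(struct buffer));
    }

    static bool upload(struct buffer *b)
    {
        return b != NULL;   /* stand-in for the real copy */
    }

    static bool replace(struct buffer **bo)
    {
        struct buffer *fresh = alloc_buffer();

        if (!fresh || !upload(fresh)) {
            free(fresh);
            return false;       /* *bo is untouched on failure */
        }

        free(*bo);              /* retire the old buffer */
        *bo = fresh;            /* caller now sees the new one */
        return true;
    }

    int main(void)
    {
        struct buffer *bo = alloc_buffer();

        if (!replace(&bo)) {
            /* fall back, e.g. to the CPU path */
        }
        free(bo);
        return 0;
    }
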
diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index f2997d0..ce35113 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -243,9 +243,14 @@ sna_pixmap_alloc_cpu(struct sna *sna,
 	if (priv->ptr)
 		goto done;
 
+	DBG(("%s: pixmap=%ld\n", __FUNCTION__, pixmap->drawable.serialNumber));
 	assert(priv->stride);
 
-	if (sna->kgem.has_cpu_bo || !priv->gpu) {
+	if ((sna->kgem.has_cpu_bo || !priv->gpu) &&
+	    kgem_can_create_cpu(&sna->kgem,
+				pixmap->drawable.width,
+				pixmap->drawable.height,
+				pixmap->drawable.bitsPerPixel)) {
 		DBG(("%s: allocating CPU buffer (%dx%d)\n", __FUNCTION__,
 		     pixmap->drawable.width, pixmap->drawable.height));
 
@@ -270,8 +275,11 @@ sna_pixmap_alloc_cpu(struct sna *sna,
 		}
 	}
 
-	if (priv->ptr == NULL)
+	if (priv->ptr == NULL) {
+		DBG(("%s: allocating ordinary memory for shadow pixels [%d bytes]\n",
+		     __FUNCTION__, priv->stride * pixmap->drawable.height));
 		priv->ptr = malloc(priv->stride * pixmap->drawable.height);
+	}
 
 	assert(priv->ptr);
 done:
@@ -289,7 +297,7 @@ static void sna_pixmap_free_cpu(struct sna *sna, struct sna_pixmap *priv)
 
 	if (priv->cpu_bo) {
 		DBG(("%s: discarding CPU buffer, handle=%d, size=%d\n",
-		     __FUNCTION__, priv->cpu_bo->handle, priv->cpu_bo->size));
+		     __FUNCTION__, priv->cpu_bo->handle, kgem_bo_size(priv->cpu_bo)));
 
 		kgem_bo_destroy(&sna->kgem, priv->cpu_bo);
 		priv->cpu_bo = NULL;
@@ -515,10 +523,10 @@ struct sna_pixmap *_sna_pixmap_attach(PixmapPtr pixmap)
 		break;
 
 	default:
-		if (!kgem_can_create_gpu(&sna->kgem,
-					 pixmap->drawable.width,
-					 pixmap->drawable.height,
-					 pixmap->drawable.bitsPerPixel))
+		if (!kgem_can_create_2d(&sna->kgem,
+					pixmap->drawable.width,
+					pixmap->drawable.height,
+					pixmap->drawable.depth))
 			return NULL;
 		break;
 	}
@@ -669,8 +677,11 @@ static PixmapPtr sna_create_pixmap(ScreenPtr screen,
 	DBG(("%s(%d, %d, %d, usage=%x)\n", __FUNCTION__,
 	     width, height, depth, usage));
 
-	if (!kgem_can_create_cpu(&sna->kgem, width, height, depth))
+	if (!kgem_can_create_2d(&sna->kgem, width, height, depth)) {
+		DBG(("%s: can not use GPU, just creating shadow\n",
+		     __FUNCTION__));
 		return create_pixmap(sna, screen, width, height, depth, usage);
+	}
 
 	if (!sna->have_render)
 		return create_pixmap(sna, screen,
@@ -704,6 +715,8 @@ static PixmapPtr sna_create_pixmap(ScreenPtr screen,
 
 	pad = PixmapBytePad(width, depth);
 	if (pad * height <= 4096) {
+		DBG(("%s: small buffer [%d], attaching to shadow pixmap\n",
+		     __FUNCTION__, pad * height));
 		pixmap = create_pixmap(sna, screen,
 				       width, height, depth, usage);
 		if (pixmap == NullPixmap)
@@ -713,6 +726,9 @@ static PixmapPtr sna_create_pixmap(ScreenPtr screen,
 	} else {
 		struct sna_pixmap *priv;
 
+		DBG(("%s: creating GPU pixmap %dx%d, stride=%d\n",
+		     __FUNCTION__, width, height, pad));
+
 		pixmap = create_pixmap(sna, screen, 0, 0, depth, usage);
 		if (pixmap == NullPixmap)
 			return NullPixmap;
@@ -1609,19 +1625,20 @@ sna_pixmap_move_area_to_gpu(PixmapPtr pixmap, BoxPtr box, unsigned int flags)
 				    box->x1 <= 0 && box->y1 <= 0 &&
 				    box->x2 >= pixmap->drawable.width &&
 				    box->y2 >= pixmap->drawable.height) {
-					priv->gpu_bo =
-						sna_replace(sna, pixmap,
-							    priv->gpu_bo,
-							    pixmap->devPrivate.ptr,
-							    pixmap->devKind);
+					ok = sna_replace(sna, pixmap,
+							 &priv->gpu_bo,
+							 pixmap->devPrivate.ptr,
+							 pixmap->devKind);
 				} else {
-					sna_write_boxes(sna, pixmap,
-							priv->gpu_bo, 0, 0,
-							pixmap->devPrivate.ptr,
-							pixmap->devKind,
-							0, 0,
-							box, n);
+					ok = sna_write_boxes(sna, pixmap,
+							     priv->gpu_bo, 0, 0,
+							     pixmap->devPrivate.ptr,
+							     pixmap->devKind,
+							     0, 0,
+							     box, n);
 				}
+				if (!ok)
+					return false;
 			}
 		}
 
@@ -1637,12 +1654,14 @@ sna_pixmap_move_area_to_gpu(PixmapPtr pixmap, BoxPtr box, unsigned int flags)
 						    pixmap, priv->gpu_bo, 0, 0,
 						    box, 1);
 		if (!ok)
-			sna_write_boxes(sna, pixmap,
-					priv->gpu_bo, 0, 0,
-					pixmap->devPrivate.ptr,
-					pixmap->devKind,
-					0, 0,
-					box, 1);
+			ok = sna_write_boxes(sna, pixmap,
+					     priv->gpu_bo, 0, 0,
+					     pixmap->devPrivate.ptr,
+					     pixmap->devKind,
+					     0, 0,
+					     box, 1);
+		if (!ok)
+			return false;
 
 		sna_damage_subtract(&priv->cpu_damage, &r);
 		priv->undamaged = true;
@@ -1658,12 +1677,14 @@ sna_pixmap_move_area_to_gpu(PixmapPtr pixmap, BoxPtr box, unsigned int flags)
 						    pixmap, priv->gpu_bo, 0, 0,
 						    box, n);
 		if (!ok)
-			sna_write_boxes(sna, pixmap,
-					priv->gpu_bo, 0, 0,
-					pixmap->devPrivate.ptr,
-					pixmap->devKind,
-					0, 0,
-					box, n);
+			ok = sna_write_boxes(sna, pixmap,
+					     priv->gpu_bo, 0, 0,
+					     pixmap->devPrivate.ptr,
+					     pixmap->devKind,
+					     0, 0,
+					     box, n);
+		if (!ok)
+			return false;
 
 		sna_damage_subtract(&priv->cpu_damage, &r);
 		priv->undamaged = true;
@@ -1671,7 +1692,7 @@ sna_pixmap_move_area_to_gpu(PixmapPtr pixmap, BoxPtr box, unsigned int flags)
 	}
 
 done:
-	if (!priv->pinned)
+	if (!priv->pinned && priv->gpu)
 		list_move(&priv->inactive, &sna->active_pixmaps);
 	priv->clear = false;
 	return true;
@@ -1811,7 +1832,7 @@ done:
 
 use_gpu_bo:
 	priv->clear = false;
-	if (!priv->pinned)
+	if (!priv->pinned && priv->gpu)
 		list_move(&priv->inactive,
 			  &to_sna_from_pixmap(pixmap)->active_pixmaps);
 	*damage = NULL;
@@ -1978,6 +1999,17 @@ sna_pixmap_force_to_gpu(PixmapPtr pixmap, unsigned flags)
 	if (!sna_pixmap_move_to_gpu(pixmap, flags))
 		return NULL;
 
+	/* For large bo, try to keep only a single copy around */
+	if (!priv->gpu && priv->ptr) {
+		sna_damage_all(&priv->gpu_damage,
+			       pixmap->drawable.width,
+			       pixmap->drawable.height);
+		sna_damage_destroy(&priv->cpu_damage);
+		priv->undamaged = false;
+		list_del(&priv->list);
+		sna_pixmap_free_cpu(to_sna_from_pixmap(pixmap), priv);
+	}
+
 	return priv;
 }
 
@@ -2070,19 +2102,20 @@ sna_pixmap_move_to_gpu(PixmapPtr pixmap, unsigned flags)
 			if (n == 1 && !priv->pinned &&
 			    (box->x2 - box->x1) >= pixmap->drawable.width &&
 			    (box->y2 - box->y1) >= pixmap->drawable.height) {
-				priv->gpu_bo =
-					sna_replace(sna, pixmap,
-						    priv->gpu_bo,
-						    pixmap->devPrivate.ptr,
-						    pixmap->devKind);
+				ok = sna_replace(sna, pixmap,
+						 &priv->gpu_bo,
+						 pixmap->devPrivate.ptr,
+						 pixmap->devKind);
 			} else {
-				sna_write_boxes(sna, pixmap,
+				ok = sna_write_boxes(sna, pixmap,
 						priv->gpu_bo, 0, 0,
 						pixmap->devPrivate.ptr,
 						pixmap->devKind,
 						0, 0,
 						box, n);
 			}
+			if (!ok)
+				return NULL;
 		}
 	}
 
@@ -2098,7 +2131,7 @@ done:
 	if (DAMAGE_IS_ALL(priv->gpu_damage))
 		priv->undamaged = false;
 active:
-	if (!priv->pinned)
+	if (!priv->pinned && priv->gpu)
 		list_move(&priv->inactive, &sna->active_pixmaps);
 	priv->clear = false;
 	return priv;
@@ -2321,11 +2354,8 @@ sna_put_image_upload_blt(DrawablePtr drawable, GCPtr gc, RegionPtr region,
 	    !priv->pinned && nbox == 1 &&
 	    box->x1 <= 0 && box->y1 <= 0 &&
 	    box->x2 >= pixmap->drawable.width &&
-	    box->y2 >= pixmap->drawable.height) {
-		priv->gpu_bo =
-			sna_replace(sna, pixmap, priv->gpu_bo, bits, stride);
-		return TRUE;
-	}
+	    box->y2 >= pixmap->drawable.height)
+		return sna_replace(sna, pixmap, &priv->gpu_bo, bits, stride);
 
 	get_drawable_deltas(drawable, pixmap, &dx, &dy);
 	x += dx + drawable->x;
@@ -2341,15 +2371,13 @@ sna_put_image_upload_blt(DrawablePtr drawable, GCPtr gc, RegionPtr region,
 		kgem_bo_destroy(&sna->kgem, src_bo);
 	}
 
-	if (!ok && gc->alu == GXcopy) {
-		sna_write_boxes(sna, pixmap,
-				priv->gpu_bo, 0, 0,
-				bits,
-				stride,
-				-x, -y,
-				box, nbox);
-		ok = TRUE;
-	}
+	if (!ok && gc->alu == GXcopy)
+		ok = sna_write_boxes(sna, pixmap,
+				     priv->gpu_bo, 0, 0,
+				     bits,
+				     stride,
+				     -x, -y,
+				     box, nbox);
 
 	return ok;
 }
@@ -3213,7 +3241,7 @@ sna_copy_boxes(DrawablePtr src, DrawablePtr dst, GCPtr gc,
 			}
 		} else {
 			dst_priv->clear = false;
-			if (!dst_priv->pinned)
+			if (!dst_priv->pinned && dst_priv->gpu)
 				list_move(&dst_priv->inactive,
 					  &sna->active_pixmaps);
 		}
@@ -3400,10 +3428,10 @@ sna_copy_boxes(DrawablePtr src, DrawablePtr dst, GCPtr gc,
 				assert(src_dy + box->y1 + dst_pixmap->drawable.height <= src_pixmap->drawable.height);
 				assert(src_dx + box->x1 + dst_pixmap->drawable.width <= src_pixmap->drawable.width);
 
-				dst_priv->gpu_bo =
-					sna_replace(sna, dst_pixmap,
-						    dst_priv->gpu_bo,
-						    bits, stride);
+				if (!sna_replace(sna, dst_pixmap,
+						 &dst_priv->gpu_bo,
+						 bits, stride))
+					goto fallback;
 
 				if (!DAMAGE_IS_ALL(dst_priv->gpu_damage)) {
 					sna_damage_destroy(&dst_priv->cpu_damage);
@@ -3416,12 +3444,13 @@ sna_copy_boxes(DrawablePtr src, DrawablePtr dst, GCPtr gc,
 			} else {
 				DBG(("%s: dst is on the GPU, src is on the CPU, uploading into dst\n",
 				     __FUNCTION__));
-				sna_write_boxes(sna, dst_pixmap,
-						dst_priv->gpu_bo, dst_dx, dst_dy,
-						src_pixmap->devPrivate.ptr,
-						src_pixmap->devKind,
-						src_dx, src_dy,
-						box, n);
+				if (!sna_write_boxes(sna, dst_pixmap,
+						     dst_priv->gpu_bo, dst_dx, dst_dy,
+						     src_pixmap->devPrivate.ptr,
+						     src_pixmap->devKind,
+						     src_dx, src_dy,
+						     box, n))
+					goto fallback;
 
 				if (!DAMAGE_IS_ALL(dst_priv->gpu_damage)) {
 					RegionTranslate(&region, dst_dx, dst_dy);
@@ -11502,7 +11531,7 @@ static void sna_accel_inactive(struct sna *sna)
 		count = bytes = 0;
 		list_for_each_entry(priv, &sna->inactive_clock[1], inactive)
 			if (!priv->pinned)
-				count++, bytes += priv->gpu_bo->size;
+				count++, bytes += kgem_bo_size(priv->gpu_bo);
 
 		DBG(("%s: trimming %d inactive GPU buffers, %d bytes\n",
 		    __FUNCTION__, count, bytes));
@@ -11528,6 +11557,9 @@ static void sna_accel_inactive(struct sna *sna)
 		priv = list_first_entry(&sna->inactive_clock[1],
 					struct sna_pixmap,
 					inactive);
+		assert(priv->gpu);
+		assert(priv->gpu_bo);
+
 		/* XXX Rather than discarding the GPU buffer here, we
 		 * could mark it purgeable and allow the shrinker to
 		 * reap its storage only under memory pressure.
diff --git a/src/sna/sna_blt.c b/src/sna/sna_blt.c
index 535628c..7efbcf9 100644
--- a/src/sna/sna_blt.c
+++ b/src/sna/sna_blt.c
@@ -235,7 +235,7 @@ inline static void sna_blt_fill_one(struct sna *sna,
 
 	assert(x >= 0);
 	assert(y >= 0);
-	assert((y+height) * blt->bo[0]->pitch <= blt->bo[0]->size);
+	assert((y+height) * blt->bo[0]->pitch <= kgem_bo_size(blt->bo[0]));
 
 	if (!kgem_check_batch(kgem, 3))
 		sna_blt_fill_begin(sna, blt);
@@ -358,10 +358,10 @@ static void sna_blt_alpha_fixup_one(struct sna *sna,
 
 	assert(src_x >= 0);
 	assert(src_y >= 0);
-	assert((src_y + height) * blt->bo[0]->pitch <= blt->bo[0]->size);
+	assert((src_y + height) * blt->bo[0]->pitch <= kgem_bo_size(blt->bo[0]));
 	assert(dst_x >= 0);
 	assert(dst_y >= 0);
-	assert((dst_y + height) * blt->bo[1]->pitch <= blt->bo[1]->size);
+	assert((dst_y + height) * blt->bo[1]->pitch <= kgem_bo_size(blt->bo[1]));
 	assert(width > 0);
 	assert(height > 0);
 
@@ -409,10 +409,10 @@ static void sna_blt_copy_one(struct sna *sna,
 
 	assert(src_x >= 0);
 	assert(src_y >= 0);
-	assert((src_y + height) * blt->bo[0]->pitch <= blt->bo[0]->size);
+	assert((src_y + height) * blt->bo[0]->pitch <= kgem_bo_size(blt->bo[0]));
 	assert(dst_x >= 0);
 	assert(dst_y >= 0);
-	assert((dst_y + height) * blt->bo[1]->pitch <= blt->bo[1]->size);
+	assert((dst_y + height) * blt->bo[1]->pitch <= kgem_bo_size(blt->bo[1]));
 	assert(width > 0);
 	assert(height > 0);
 
@@ -787,7 +787,7 @@ inline static void _sna_blt_fill_box(struct sna *sna,
 
 	assert(box->x1 >= 0);
 	assert(box->y1 >= 0);
-	assert(box->y2 * blt->bo[0]->pitch <= blt->bo[0]->size);
+	assert(box->y2 * blt->bo[0]->pitch <= kgem_bo_size(blt->bo[0]));
 
 	if (!kgem_check_batch(kgem, 3))
 		sna_blt_fill_begin(sna, blt);
@@ -1106,7 +1106,7 @@ prepare_blt_copy(struct sna *sna,
 	PixmapPtr src = op->u.blt.src_pixmap;
 	struct sna_pixmap *priv = sna_pixmap(src);
 
-	if (priv->gpu_bo->tiling == I915_TILING_Y)
+	if (!kgem_bo_can_blt(&sna->kgem, priv->gpu_bo))
 		return FALSE;
 
 	if (!kgem_check_bo_fenced(&sna->kgem, priv->gpu_bo, NULL)) {
@@ -1176,9 +1176,8 @@ blt_put_composite(struct sna *sna,
 		data += (src_x - dst_x) * bpp / 8;
 		data += (src_y - dst_y) * pitch;
 
-		dst_priv->gpu_bo =
-			sna_replace(sna, op->dst.pixmap, dst_priv->gpu_bo,
-				    data, pitch);
+		sna_replace(sna, op->dst.pixmap, &dst_priv->gpu_bo,
+			    data, pitch);
 	} else {
 		BoxRec box;
 
@@ -1215,9 +1214,8 @@ fastcall static void blt_put_composite_box(struct sna *sna,
 		data += (box->y1 + op->u.blt.sy) * pitch;
 		data += (box->x1 + op->u.blt.sx) * bpp;
 
-		dst_priv->gpu_bo =
-			sna_replace(sna, op->dst.pixmap, op->dst.bo,
-				    data, pitch);
+		sna_replace(sna, op->dst.pixmap, &dst_priv->gpu_bo,
+			    data, pitch);
 	} else {
 		sna_write_boxes(sna, op->dst.pixmap,
 				op->dst.bo, op->dst.x, op->dst.y,
@@ -1250,9 +1248,8 @@ static void blt_put_composite_boxes(struct sna *sna,
 		data += (box->y1 + op->u.blt.sy) * pitch;
 		data += (box->x1 + op->u.blt.sx) * bpp;
 
-		dst_priv->gpu_bo =
-			sna_replace(sna, op->dst.pixmap, op->dst.bo,
-				    data, pitch);
+		sna_replace(sna, op->dst.pixmap, &dst_priv->gpu_bo,
+			    data, pitch);
 	} else {
 		sna_write_boxes(sna, op->dst.pixmap,
 				op->dst.bo, op->dst.x, op->dst.y,
@@ -1573,9 +1570,13 @@ sna_blt_composite(struct sna *sna,
 
 	tmp->dst.pixmap = get_drawable_pixmap(dst->pDrawable);
 	priv = sna_pixmap_move_to_gpu(tmp->dst.pixmap, MOVE_WRITE | MOVE_READ);
-	if (priv == NULL || priv->gpu_bo->tiling == I915_TILING_Y) {
-		DBG(("%s: dst not on the gpu or using Y-tiling\n",
-		     __FUNCTION__));
+	if (priv == NULL) {
+		DBG(("%s: dst not attached\n", __FUNCTION__));
+		return FALSE;
+	}
+	if (!kgem_bo_can_blt(&sna->kgem, priv->gpu_bo)) {
+		DBG(("%s: can not blit to dst, tiling? %d, pitch? %d\n",
+		     __FUNCTION__, priv->gpu_bo->tiling, priv->gpu_bo->pitch));
 		return FALSE;
 	}
 
@@ -1747,7 +1748,7 @@ bool sna_blt_fill(struct sna *sna, uint8_t alu,
 
 	DBG(("%s(alu=%d, pixel=%x, bpp=%d)\n", __FUNCTION__, alu, pixel, bpp));
 
-	if (bo->tiling == I915_TILING_Y) {
+	if (!kgem_bo_can_blt(&sna->kgem, bo)) {
 		DBG(("%s: rejected due to incompatible Y-tiling\n",
 		     __FUNCTION__));
 		return FALSE;
@@ -1797,10 +1798,10 @@ bool sna_blt_copy(struct sna *sna, uint8_t alu,
 	return FALSE;
 #endif
 
-	if (src->tiling == I915_TILING_Y)
+	if (!kgem_bo_can_blt(&sna->kgem, src))
 		return FALSE;
 
-	if (dst->tiling == I915_TILING_Y)
+	if (!kgem_bo_can_blt(&sna->kgem, dst))
 		return FALSE;
 
 	if (!sna_blt_copy_init(sna, &op->base.u.blt,
@@ -1926,7 +1927,7 @@ Bool sna_blt_fill_boxes(struct sna *sna, uint8_t alu,
 	DBG(("%s (%d, %08x, %d) x %d\n",
 	     __FUNCTION__, bpp, pixel, alu, nbox));
 
-	if (bo->tiling == I915_TILING_Y) {
+	if (!kgem_bo_can_blt(kgem, bo)) {
 		DBG(("%s: fallback -- dst uses Y-tiling\n", __FUNCTION__));
 		return FALSE;
 	}
@@ -2020,7 +2021,7 @@ Bool sna_blt_fill_boxes(struct sna *sna, uint8_t alu,
 
 			assert(box->x1 >= 0);
 			assert(box->y1 >= 0);
-			assert(box->y2 * bo->pitch <= bo->size);
+			assert(box->y2 * bo->pitch <= kgem_bo_size(bo));
 
 			b = kgem->batch + kgem->nbatch;
 			kgem->nbatch += 3;
@@ -2075,8 +2076,13 @@ Bool sna_blt_copy_boxes(struct sna *sna, uint8_t alu,
 	    src_bo->tiling, dst_bo->tiling,
 	    src_bo->pitch, dst_bo->pitch));
 
-	if (src_bo->tiling == I915_TILING_Y || dst_bo->tiling == I915_TILING_Y)
+	if (!kgem_bo_can_blt(kgem, src_bo) || !kgem_bo_can_blt(kgem, dst_bo)) {
+		DBG(("%s: cannot blt to src? %d or dst? %d\n",
+		     __FUNCTION__,
+		     kgem_bo_can_blt(kgem, src_bo),
+		     kgem_bo_can_blt(kgem, dst_bo)));
 		return FALSE;
+	}
 
 	cmd = XY_SRC_COPY_BLT_CMD;
 	if (bpp == 32)
@@ -2087,7 +2093,7 @@ Bool sna_blt_copy_boxes(struct sna *sna, uint8_t alu,
 		cmd |= BLT_SRC_TILED;
 		src_pitch >>= 2;
 	}
-	assert(src_pitch < MAXSHORT);
+	assert(src_pitch <= MAXSHORT);
 
 	br13 = dst_bo->pitch;
 	if (kgem->gen >= 40 && dst_bo->tiling) {
diff --git a/src/sna/sna_io.c b/src/sna/sna_io.c
index f3ca212..14a7901 100644
--- a/src/sna/sna_io.c
+++ b/src/sna/sna_io.c
@@ -44,6 +44,27 @@
 
 /* XXX Need to avoid using GTT fenced access for I915_TILING_Y on 855GM */
 
+static Bool
+box_intersect(BoxPtr a, const BoxRec *b)
+{
+	if (a->x1 < b->x1)
+		a->x1 = b->x1;
+	if (a->x2 > b->x2)
+		a->x2 = b->x2;
+	if (a->y1 < b->y1)
+		a->y1 = b->y1;
+	if (a->y2 > b->y2)
+		a->y2 = b->y2;
+
+	return a->x1 < a->x2 && a->y1 < a->y2;
+}
+
+static inline bool must_tile(struct sna *sna, int width, int height)
+{
+	return (width  > sna->render.max_3d_size ||
+		height > sna->render.max_3d_size);
+}
+
 static void read_boxes_inplace(struct kgem *kgem,
 			       struct kgem_bo *bo, int16_t src_dx, int16_t src_dy,
 			       PixmapPtr pixmap, int16_t dst_dx, int16_t dst_dy,
@@ -105,13 +126,13 @@ void sna_read_boxes(struct sna *sna,
 	for (n = 0; n < nbox; n++) {
 		if (box[n].x1 + src_dx < 0 || box[n].y1 + src_dy < 0 ||
 		    (box[n].x2 + src_dx) * dst->drawable.bitsPerPixel/8 > src_bo->pitch ||
-		    (box[n].y2 + src_dy) * src_bo->pitch > src_bo->size)
+		    (box[n].y2 + src_dy) * src_bo->pitch > kgem_bo_size(src_bo))
 		{
 			FatalError("source out-of-bounds box[%d]=(%d, %d), (%d, %d) + (%d, %d), pitch=%d, size=%d\n", n,
 				   box[n].x1, box[n].y1,
 				   box[n].x2, box[n].y2,
 				   src_dx, src_dy,
-				   src_bo->pitch, src_bo->size);
+				   src_bo->pitch, kgem_bo_size(src_bo));
 		}
 	}
 #endif
@@ -132,7 +153,6 @@ fallback:
 		return;
 	}
 
-	/* Is it worth detiling? */
 	extents = box[0];
 	for (n = 1; n < nbox; n++) {
 		if (box[n].x1 < extents.x1)
@@ -145,11 +165,16 @@ fallback:
 		if (box[n].y2 > extents.y2)
 			extents.y2 = box[n].y2;
 	}
-	if ((extents.y2 - extents.y1) * src_bo->pitch < 4096)
-		goto fallback;
+	if (kgem_bo_is_mappable(kgem, src_bo)) {
+		/* Is it worth detiling? */
+		if ((extents.y2 - extents.y1) * src_bo->pitch < 4096)
+			goto fallback;
+	}
 
 	/* Try to avoid switching rings... */
-	if (src_bo->tiling == I915_TILING_Y || kgem->ring == KGEM_RENDER) {
+	if (kgem->ring == KGEM_RENDER ||
+	    !kgem_bo_can_blt(kgem, src_bo) ||
+	    must_tile(sna, extents.x2 - extents.x1, extents.y2 - extents.y1)) {
 		PixmapRec tmp;
 
 		tmp.drawable.width  = extents.x2 - extents.x1;
@@ -161,38 +186,124 @@ fallback:
 		assert(tmp.drawable.width);
 		assert(tmp.drawable.height);
 
-		dst_bo = kgem_create_buffer_2d(kgem,
-					       tmp.drawable.width,
-					       tmp.drawable.height,
-					       tmp.drawable.bitsPerPixel,
-					       KGEM_BUFFER_LAST,
-					       &ptr);
-		if (!dst_bo)
-			goto fallback;
+		if (must_tile(sna, tmp.drawable.width, tmp.drawable.height)) {
+			BoxRec tile, stack[64], *clipped, *c;
+			int step;
+
+			if (n > ARRAY_SIZE(stack)) {
+				clipped = malloc(sizeof(BoxRec) * n);
+				if (clipped == NULL)
+					goto fallback;
+			} else
+				clipped = stack;
+
+			step = MIN(sna->render.max_3d_size,
+				   8*(MAXSHORT&~63) / dst->drawable.bitsPerPixel);
+			DBG(("%s: tiling download, using %dx%d tiles\n",
+			     __FUNCTION__, step, step));
+
+			for (tile.y1 = extents.y1; tile.y1 < extents.y2; tile.y1 = tile.y2) {
+				tile.y2 = tile.y1 + step;
+				if (tile.y2 > extents.y2)
+					tile.y2 = extents.y2;
+
+				for (tile.x1 = extents.x1; tile.x1 < extents.x2; tile.x1 = tile.x2) {
+					tile.x2 = tile.x1 + step;
+					if (tile.x2 > extents.x2)
+						tile.x2 = extents.x2;
+
+					tmp.drawable.width  = tile.x2 - tile.x1;
+					tmp.drawable.height = tile.y2 - tile.y1;
+
+					c = clipped;
+					for (n = 0; n < nbox; n++) {
+						*c = box[n];
+						if (!box_intersect(c, &tile))
+							continue;
+
+						DBG(("%s: box(%d, %d), (%d, %d), src=(%d, %d), dst=(%d, %d)\n",
+						     __FUNCTION__,
+						     c->x1, c->y1,
+						     c->x2, c->y2,
+						     src_dx, src_dy,
+						     c->x1 - tile.x1,
+						     c->y1 - tile.y1));
+						c++;
+					}
+					if (c == clipped)
+						continue;
+
+					dst_bo = kgem_create_buffer_2d(kgem,
+								       tmp.drawable.width,
+								       tmp.drawable.height,
+								       tmp.drawable.bitsPerPixel,
+								       KGEM_BUFFER_LAST,
+								       &ptr);
+					if (!dst_bo)
+						goto fallback;
+
+					if (!sna->render.copy_boxes(sna, GXcopy,
+								    dst, src_bo, src_dx, src_dy,
+								    &tmp, dst_bo, -tile.x1, -tile.y1,
+								    clipped, c-clipped)) {
+						kgem_bo_destroy(&sna->kgem, dst_bo);
+						goto fallback;
+					}
+
+					kgem_bo_submit(&sna->kgem, dst_bo);
+					kgem_buffer_read_sync(kgem, dst_bo);
+
+					while (c-- != clipped) {
+						memcpy_blt(ptr, dst->devPrivate.ptr, tmp.drawable.bitsPerPixel,
+							   dst_bo->pitch, dst->devKind,
+							   c->x1 - tile.x1,
+							   c->y1 - tile.y1,
+							   c->x1 + dst_dx,
+							   c->y1 + dst_dy,
+							   c->x2 - c->x1,
+							   c->y2 - c->y1);
+					}
+
+					kgem_bo_destroy(&sna->kgem, dst_bo);
+				}
+			}
 
-		if (!sna->render.copy_boxes(sna, GXcopy,
-					    dst, src_bo, src_dx, src_dy,
-					    &tmp, dst_bo, -extents.x1, -extents.y1,
-					    box, nbox)) {
-			kgem_bo_destroy(&sna->kgem, dst_bo);
-			goto fallback;
-		}
+			if (clipped != stack)
+				free(clipped);
+		} else {
+			dst_bo = kgem_create_buffer_2d(kgem,
+						       tmp.drawable.width,
+						       tmp.drawable.height,
+						       tmp.drawable.bitsPerPixel,
+						       KGEM_BUFFER_LAST,
+						       &ptr);
+			if (!dst_bo)
+				goto fallback;
+
+			if (!sna->render.copy_boxes(sna, GXcopy,
+						    dst, src_bo, src_dx, src_dy,
+						    &tmp, dst_bo, -extents.x1, -extents.y1,
+						    box, nbox)) {
+				kgem_bo_destroy(&sna->kgem, dst_bo);
+				goto fallback;
+			}
 
-		kgem_bo_submit(&sna->kgem, dst_bo);
-		kgem_buffer_read_sync(kgem, dst_bo);
+			kgem_bo_submit(&sna->kgem, dst_bo);
+			kgem_buffer_read_sync(kgem, dst_bo);
+
+			for (n = 0; n < nbox; n++) {
+				memcpy_blt(ptr, dst->devPrivate.ptr, tmp.drawable.bitsPerPixel,
+					   dst_bo->pitch, dst->devKind,
+					   box[n].x1 - extents.x1,
+					   box[n].y1 - extents.y1,
+					   box[n].x1 + dst_dx,
+					   box[n].y1 + dst_dy,
+					   box[n].x2 - box[n].x1,
+					   box[n].y2 - box[n].y1);
+			}
 
-		for (n = 0; n < nbox; n++) {
-			memcpy_blt(ptr, dst->devPrivate.ptr, tmp.drawable.bitsPerPixel,
-				   dst_bo->pitch, dst->devKind,
-				   box[n].x1 - extents.x1,
-				   box[n].y1 - extents.y1,
-				   box[n].x1 + dst_dx,
-				   box[n].y1 + dst_dy,
-				   box[n].x2 - box[n].x1,
-				   box[n].y2 - box[n].y1);
+			kgem_bo_destroy(&sna->kgem, dst_bo);
 		}
-
-		kgem_bo_destroy(&sna->kgem, dst_bo);
 		return;
 	}
 
@@ -270,7 +381,7 @@ fallback:
 			assert(tmp_box[n].x1 + src_dx >= 0);
 			assert((tmp_box[n].x2 + src_dx) * dst->drawable.bitsPerPixel/8 <= src_bo->pitch);
 			assert(tmp_box[n].y1 + src_dy >= 0);
-			assert((tmp_box[n].y2 + src_dy) * src_bo->pitch <= src_bo->size);
+			assert((tmp_box[n].y2 + src_dy) * src_bo->pitch <= kgem_bo_size(src_bo));
 
 			b[0] = cmd;
 			b[1] = br13 | pitch;
@@ -299,7 +410,7 @@ fallback:
 		_kgem_set_mode(kgem, KGEM_BLT);
 		tmp_box += nbox_this_time;
 	} while (1);
-	assert(offset == dst_bo->size);
+	assert(offset == kgem_buffer_size(dst_bo));
 
 	kgem_buffer_read_sync(kgem, dst_bo);
 
@@ -331,12 +442,12 @@ fallback:
 
 		src += pitch * height;
 	} while (--nbox);
-	assert(src - (char *)ptr == dst_bo->size);
+	assert(src - (char *)ptr == kgem_buffer_size(dst_bo));
 	kgem_bo_destroy(kgem, dst_bo);
 	sna->blt_state.fill_bo = 0;
 }
 
-static void write_boxes_inplace(struct kgem *kgem,
+static bool write_boxes_inplace(struct kgem *kgem,
 				const void *src, int stride, int bpp, int16_t src_dx, int16_t src_dy,
 				struct kgem_bo *bo, int16_t dst_dx, int16_t dst_dy,
 				const BoxRec *box, int n)
@@ -346,11 +457,14 @@ static void write_boxes_inplace(struct kgem *kgem,
 	DBG(("%s x %d, handle=%d, tiling=%d\n",
 	     __FUNCTION__, n, bo->handle, bo->tiling));
 
+	if (!kgem_bo_is_mappable(kgem, bo))
+		return false;
+
 	kgem_bo_submit(kgem, bo);
 
 	dst = kgem_bo_map(kgem, bo);
 	if (dst == NULL)
-		return;
+		return false;
 
 	assert(dst != src);
 
@@ -364,7 +478,7 @@ static void write_boxes_inplace(struct kgem *kgem,
 		assert(box->x1 + dst_dx >= 0);
 		assert((box->x2 + dst_dx)*bpp <= 8*bo->pitch);
 		assert(box->y1 + dst_dy >= 0);
-		assert((box->y2 + dst_dy)*bo->pitch <= bo->size);
+		assert((box->y2 + dst_dy)*bo->pitch <= kgem_bo_size(bo));
 
 		assert(box->x1 + src_dx >= 0);
 		assert((box->x2 + src_dx)*bpp <= 8*stride);
@@ -377,6 +491,7 @@ static void write_boxes_inplace(struct kgem *kgem,
 			   box->x2 - box->x1, box->y2 - box->y1);
 		box++;
 	} while (--n);
+	return true;
 }
 
 static bool upload_inplace(struct kgem *kgem,
@@ -384,9 +499,6 @@ static bool upload_inplace(struct kgem *kgem,
 			   const BoxRec *box,
 			   int n, int bpp)
 {
-	if (DEBUG_NO_IO)
-		return kgem_bo_is_mappable(kgem, bo);
-
 	/* If we are writing through the GTT, check first if we might be
 	 * able to amalgamate a series of small writes into a single
 	 * operation.
@@ -404,13 +516,14 @@ static bool upload_inplace(struct kgem *kgem,
 	return !kgem_bo_map_will_stall(kgem, bo);
 }
 
-void sna_write_boxes(struct sna *sna, PixmapPtr dst,
+bool sna_write_boxes(struct sna *sna, PixmapPtr dst,
 		     struct kgem_bo *dst_bo, int16_t dst_dx, int16_t dst_dy,
 		     const void *src, int stride, int16_t src_dx, int16_t src_dy,
 		     const BoxRec *box, int nbox)
 {
 	struct kgem *kgem = &sna->kgem;
 	struct kgem_bo *src_bo;
+	BoxRec extents;
 	void *ptr;
 	int offset;
 	int n, cmd, br13;
@@ -419,30 +532,30 @@ void sna_write_boxes(struct sna *sna, PixmapPtr dst,
 
 	if (upload_inplace(kgem, dst_bo, box, nbox, dst->drawable.bitsPerPixel)) {
 fallback:
-		write_boxes_inplace(kgem,
-				    src, stride, dst->drawable.bitsPerPixel, src_dx, src_dy,
-				    dst_bo, dst_dx, dst_dy,
-				    box, nbox);
-		return;
+		return write_boxes_inplace(kgem,
+					   src, stride, dst->drawable.bitsPerPixel, src_dx, src_dy,
+					   dst_bo, dst_dx, dst_dy,
+					   box, nbox);
 	}
 
-	/* Try to avoid switching rings... */
-	if (dst_bo->tiling == I915_TILING_Y || kgem->ring == KGEM_RENDER) {
-		PixmapRec tmp;
-		BoxRec extents;
+	extents = box[0];
+	for (n = 1; n < nbox; n++) {
+		if (box[n].x1 < extents.x1)
+			extents.x1 = box[n].x1;
+		if (box[n].x2 > extents.x2)
+			extents.x2 = box[n].x2;
 
-		extents = box[0];
-		for (n = 1; n < nbox; n++) {
-			if (box[n].x1 < extents.x1)
-				extents.x1 = box[n].x1;
-			if (box[n].x2 > extents.x2)
-				extents.x2 = box[n].x2;
+		if (box[n].y1 < extents.y1)
+			extents.y1 = box[n].y1;
+		if (box[n].y2 > extents.y2)
+			extents.y2 = box[n].y2;
+	}
 
-			if (box[n].y1 < extents.y1)
-				extents.y1 = box[n].y1;
-			if (box[n].y2 > extents.y2)
-				extents.y2 = box[n].y2;
-		}
+	/* Try to avoid switching rings... */
+	if (kgem->ring == KGEM_RENDER ||
+	    !kgem_bo_can_blt(kgem, dst_bo) ||
+	    must_tile(sna, extents.x2 - extents.x1, extents.y2 - extents.y1)) {
+		PixmapRec tmp;
 
 		tmp.drawable.width  = extents.x2 - extents.x1;
 		tmp.drawable.height = extents.y2 - extents.y1;
@@ -453,37 +566,130 @@ fallback:
 		assert(tmp.drawable.width);
 		assert(tmp.drawable.height);
 
-		src_bo = kgem_create_buffer_2d(kgem,
-					       tmp.drawable.width,
-					       tmp.drawable.height,
-					       tmp.drawable.bitsPerPixel,
-					       KGEM_BUFFER_WRITE_INPLACE,
-					       &ptr);
-		if (!src_bo)
-			goto fallback;
+		DBG(("%s: upload (%d, %d)x(%d, %d), max %dx%d\n",
+		     __FUNCTION__,
+		     extents.x1, extents.y1,
+		     tmp.drawable.width, tmp.drawable.height,
+		     sna->render.max_3d_size, sna->render.max_3d_size));
+		if (must_tile(sna, tmp.drawable.width, tmp.drawable.height)) {
+			BoxRec tile, stack[64], *clipped, *c;
+			int step;
+
+			step = MIN(sna->render.max_3d_size,
+				   8*(MAXSHORT&~63) / dst->drawable.bitsPerPixel);
+			DBG(("%s: tiling upload, using %dx%d tiles\n",
+			     __FUNCTION__, step, step));
+
+			if (n > ARRAY_SIZE(stack)) {
+				clipped = malloc(sizeof(BoxRec) * n);
+				if (clipped == NULL)
+					goto fallback;
+			} else
+				clipped = stack;
+
+			for (tile.y1 = extents.y1; tile.y1 < extents.y2; tile.y1 = tile.y2) {
+				tile.y2 = tile.y1 + step;
+				if (tile.y2 > extents.y2)
+					tile.y2 = extents.y2;
+
+				for (tile.x1 = extents.x1; tile.x1 < extents.x2; tile.x1 = tile.x2) {
+					tile.x2 = tile.x1 + step;
+					if (tile.x2 > extents.x2)
+						tile.x2 = extents.x2;
+
+					tmp.drawable.width  = tile.x2 - tile.x1;
+					tmp.drawable.height = tile.y2 - tile.y1;
+
+					src_bo = kgem_create_buffer_2d(kgem,
+								       tmp.drawable.width,
+								       tmp.drawable.height,
+								       tmp.drawable.bitsPerPixel,
+								       KGEM_BUFFER_WRITE_INPLACE,
+								       &ptr);
+					if (!src_bo)
+						goto fallback;
+
+					c = clipped;
+					for (n = 0; n < nbox; n++) {
+						*c = box[n];
+						if (!box_intersect(c, &tile))
+							continue;
+
+						DBG(("%s: box(%d, %d), (%d, %d), src=(%d, %d), dst=(%d, %d)\n",
+						     __FUNCTION__,
+						     c->x1, c->y1,
+						     c->x2, c->y2,
+						     src_dx, src_dy,
+						     c->x1 - tile.x1,
+						     c->y1 - tile.y1));
+						memcpy_blt(src, ptr, tmp.drawable.bitsPerPixel,
+							   stride, src_bo->pitch,
+							   c->x1 + src_dx,
+							   c->y1 + src_dy,
+							   c->x1 - tile.x1,
+							   c->y1 - tile.y1,
+							   c->x2 - c->x1,
+							   c->y2 - c->y1);
+						c++;
+					}
+
+					if (c != clipped)
+						n = sna->render.copy_boxes(sna, GXcopy,
+									   &tmp, src_bo, -tile.x1, -tile.y1,
+									   dst, dst_bo, dst_dx, dst_dy,
+									   clipped, c - clipped);
+					else
+						n = 1;
+
+					kgem_bo_destroy(&sna->kgem, src_bo);
+
+					if (!n)
+						goto fallback;
+				}
+			}
 
-		for (n = 0; n < nbox; n++) {
-			memcpy_blt(src, ptr, tmp.drawable.bitsPerPixel,
-				   stride, src_bo->pitch,
-				   box[n].x1 + src_dx,
-				   box[n].y1 + src_dy,
-				   box[n].x1 - extents.x1,
-				   box[n].y1 - extents.y1,
-				   box[n].x2 - box[n].x1,
-				   box[n].y2 - box[n].y1);
-		}
+			if (clipped != stack)
+				free(clipped);
+		} else {
+			src_bo = kgem_create_buffer_2d(kgem,
+						       tmp.drawable.width,
+						       tmp.drawable.height,
+						       tmp.drawable.bitsPerPixel,
+						       KGEM_BUFFER_WRITE_INPLACE,
+						       &ptr);
+			if (!src_bo)
+				goto fallback;
+
+			for (n = 0; n < nbox; n++) {
+				DBG(("%s: box(%d, %d), (%d, %d), src=(%d, %d), dst=(%d, %d)\n",
+				     __FUNCTION__,
+				     box[n].x1, box[n].y1,
+				     box[n].x2, box[n].y2,
+				     src_dx, src_dy,
+				     box[n].x1 - extents.x1,
+				     box[n].y1 - extents.y1));
+				memcpy_blt(src, ptr, tmp.drawable.bitsPerPixel,
+					   stride, src_bo->pitch,
+					   box[n].x1 + src_dx,
+					   box[n].y1 + src_dy,
+					   box[n].x1 - extents.x1,
+					   box[n].y1 - extents.y1,
+					   box[n].x2 - box[n].x1,
+					   box[n].y2 - box[n].y1);
+			}
 
-		n = sna->render.copy_boxes(sna, GXcopy,
-					   &tmp, src_bo, -extents.x1, -extents.y1,
-					   dst, dst_bo, dst_dx, dst_dy,
-					   box, nbox);
+			n = sna->render.copy_boxes(sna, GXcopy,
+						   &tmp, src_bo, -extents.x1, -extents.y1,
+						   dst, dst_bo, dst_dx, dst_dy,
+						   box, nbox);
 
-		kgem_bo_destroy(&sna->kgem, src_bo);
+			kgem_bo_destroy(&sna->kgem, src_bo);
 
-		if (!n)
-			goto fallback;
+			if (!n)
+				goto fallback;
+		}
 
-		return;
+		return true;
 	}
 
 	cmd = XY_SRC_COPY_BLT_CMD;
@@ -586,7 +792,7 @@ fallback:
 			box++;
 			offset += pitch * height;
 		} while (--nbox_this_time);
-		assert(offset == src_bo->size);
+		assert(offset == kgem_buffer_size(src_bo));
 
 		if (nbox) {
 			_kgem_submit(kgem);
@@ -597,6 +803,7 @@ fallback:
 	} while (nbox);
 
 	sna->blt_state.fill_bo = 0;
+	return true;
 }
 
 static void
@@ -823,7 +1030,7 @@ fallback:
 			box++;
 			offset += pitch * height;
 		} while (--nbox_this_time);
-		assert(offset == src_bo->size);
+		assert(offset == kgem_buffer_size(src_bo));
 
 		if (nbox) {
 			_kgem_submit(kgem);
@@ -951,11 +1158,12 @@ indirect_replace(struct sna *sna,
 	return ret;
 }
 
-struct kgem_bo *sna_replace(struct sna *sna,
-			    PixmapPtr pixmap,
-			    struct kgem_bo *bo,
-			    const void *src, int stride)
+bool sna_replace(struct sna *sna,
+		 PixmapPtr pixmap,
+		 struct kgem_bo **_bo,
+		 const void *src, int stride)
 {
+	struct kgem_bo *bo = *_bo;
 	struct kgem *kgem = &sna->kgem;
 	void *dst;
 
@@ -968,7 +1176,7 @@ struct kgem_bo *sna_replace(struct sna *sna,
 
 	if ((!kgem_bo_mapped(bo) || bo->rq) &&
 	    indirect_replace(sna, pixmap, bo, src, stride))
-		return bo;
+		return true;
 
 	if (kgem_bo_is_busy(bo)) {
 		struct kgem_bo *new_bo;
@@ -979,26 +1187,26 @@ struct kgem_bo *sna_replace(struct sna *sna,
 					pixmap->drawable.bitsPerPixel,
 					bo->tiling,
 					CREATE_GTT_MAP | CREATE_INACTIVE);
-		if (new_bo) {
-			kgem_bo_destroy(kgem, bo);
+		if (new_bo)
 			bo = new_bo;
-		}
 	}
 
 	if (bo->tiling == I915_TILING_NONE && bo->pitch == stride) {
-		kgem_bo_write(kgem, bo, src,
-			      (pixmap->drawable.height-1)*stride + pixmap->drawable.width*pixmap->drawable.bitsPerPixel/8);
+		if (!kgem_bo_write(kgem, bo, src,
+				   (pixmap->drawable.height-1)*stride + pixmap->drawable.width*pixmap->drawable.bitsPerPixel/8))
+			goto err;
 	} else {
 		if (kgem_bo_is_mappable(kgem, bo)) {
 			dst = kgem_bo_map(kgem, bo);
-			if (dst) {
-				memcpy_blt(src, dst, pixmap->drawable.bitsPerPixel,
-					   stride, bo->pitch,
-					   0, 0,
-					   0, 0,
-					   pixmap->drawable.width,
-					   pixmap->drawable.height);
-			}
+			if (!dst)
+				goto err;
+
+			memcpy_blt(src, dst, pixmap->drawable.bitsPerPixel,
+				   stride, bo->pitch,
+				   0, 0,
+				   0, 0,
+				   pixmap->drawable.width,
+				   pixmap->drawable.height);
 		} else {
 			BoxRec box;
 
@@ -1006,14 +1214,23 @@ struct kgem_bo *sna_replace(struct sna *sna,
 			box.x2 = pixmap->drawable.width;
 			box.y2 = pixmap->drawable.height;
 
-			sna_write_boxes(sna, pixmap,
-					bo, 0, 0,
-					src, stride, 0, 0,
-					&box, 1);
+			if (!sna_write_boxes(sna, pixmap,
+					     bo, 0, 0,
+					     src, stride, 0, 0,
+					     &box, 1))
+				goto err;
 		}
 	}
 
-	return bo;
+	if (bo != *_bo)
+		kgem_bo_destroy(kgem, *_bo);
+	*_bo = bo;
+	return true;
+
+err:
+	if (bo != *_bo)
+		kgem_bo_destroy(kgem, bo);
+	return false;
 }
 
 struct kgem_bo *sna_replace__xor(struct sna *sna,
diff --git a/src/sna/sna_render.c b/src/sna/sna_render.c
index f9151e0..7077f36 100644
--- a/src/sna/sna_render.c
+++ b/src/sna/sna_render.c
@@ -696,6 +696,11 @@ static int sna_render_picture_downsample(struct sna *sna,
 	DBG(("%s: creating temporary GPU bo %dx%d\n",
 	     __FUNCTION__, width, height));
 
+	if (!sna_pixmap_force_to_gpu(pixmap, MOVE_READ))
+		return sna_render_picture_fixup(sna, picture, channel,
+						x, y, ow, oh,
+						dst_x, dst_y);
+
 	tmp = screen->CreatePixmap(screen,
 				   width, height,
 				   pixmap->drawable.depth,
@@ -1306,9 +1311,6 @@ do_fixup:
 		return 0;
 	}
 
-	/* XXX Convolution filter? */
-	memset(ptr, 0, channel->bo->size);
-
 	/* Composite in the original format to preserve idiosyncrasies */
 	if (picture->format == channel->pict_format)
 		dst = pixman_image_create_bits(picture->format,
@@ -1354,7 +1356,7 @@ do_fixup:
 					       w, h);
 			pixman_image_unref(src);
 		} else {
-			memset(ptr, 0, channel->bo->size);
+			memset(ptr, 0, kgem_buffer_size(channel->bo));
 			dst = src;
 		}
 	}
@@ -1528,7 +1530,7 @@ sna_render_composite_redirect(struct sna *sna,
 	if (op->dst.pixmap->drawable.width <= sna->render.max_3d_size) {
 		int y1, y2;
 
-		assert(op->dst.pixmap.drawable.height > sna->render.max_3d_size);
+		assert(op->dst.pixmap->drawable.height > sna->render.max_3d_size);
 		y1 =  y + op->dst.y;
 		y2 =  y1 + height;
 		y1 &= y1 & (64 - 1);
diff --git a/src/sna/sna_video.c b/src/sna/sna_video.c
index 7b759a7..cec0473 100644
--- a/src/sna/sna_video.c
+++ b/src/sna/sna_video.c
@@ -100,7 +100,7 @@ sna_video_buffer(struct sna *sna,
 		 struct sna_video_frame *frame)
 {
 	/* Free the current buffer if we're going to have to reallocate */
-	if (video->buf && video->buf->size < frame->size)
+	if (video->buf && kgem_bo_size(video->buf) < frame->size)
 		sna_video_free_buffers(sna, video);
 
 	if (video->buf == NULL)
diff --git a/src/sna/sna_video_textured.c b/src/sna/sna_video_textured.c
index d99f884..1aaf972 100644
--- a/src/sna/sna_video_textured.c
+++ b/src/sna/sna_video_textured.c
@@ -271,7 +271,7 @@ sna_video_textured_put_image(ScrnInfoPtr scrn,
 			return BadAlloc;
 		}
 
-		assert(frame.bo->size >= frame.size);
+		assert(kgem_bo_size(frame.bo) >= frame.size);
 	} else {
 		frame.bo = kgem_create_linear(&sna->kgem, frame.size);
 		if (frame.bo == NULL) {
commit 88933a827bacad35cbece03f2810056c6386a045
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Jan 27 23:18:05 2012 +0000

    sna: Guard against the upload buffer growing past the maximum bo size
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
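
A standalone sketch of the sizing rule being added (the sizes and macros
below are assumptions for illustration, not the driver's values): the
over-allocation that hopes to amortise future uploads is only applied
while the result stays under the maximum bo size; otherwise the request
falls back to a tight page-aligned allocation.

/* Illustrative clamp; ALIGN/PAGE_ALIGN mirror the usual power-of-two
 * rounding macros, and all sizes are made-up examples. */
#include <assert.h>
#include <stdio.h>

#define ALIGN(v, a)   (((v) + (a) - 1) & ~((a) - 1))
#define PAGE_ALIGN(v) ALIGN(v, 4096)

int main(void)
{
	unsigned partial_buffer_size = 32 << 10;  /* assumed granularity */
	unsigned max_cpu_size = 64 << 20;         /* assumed maximum bo size */
	unsigned size = 48 << 20;                 /* a large upload request */

	/* Be generous and over-allocate, hoping to reuse the buffer... */
	unsigned alloc = ALIGN(2 * size, partial_buffer_size);

	/* ...unless that would grow past the largest bo we may create. */
	if (alloc >= max_cpu_size)
		alloc = PAGE_ALIGN(size);

	assert(alloc < max_cpu_size);
	printf("request=%u -> alloc=%u\n", size, alloc);
	return 0;
}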

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index a5c47d6..86a4372 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -3010,7 +3010,7 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 	     !!(flags & KGEM_BUFFER_LAST)));
 	assert(size);
 	/* we should never be asked to create anything TOO large */
-	assert(size < kgem->max_cpu_buffer);
+	assert(size < kgem->max_cpu_size);
 
 	list_for_each_entry(bo, &kgem->partial, base.list) {
 		if (flags == KGEM_BUFFER_LAST && bo->write) {
@@ -3053,6 +3053,8 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 #if !DBG_NO_MAP_UPLOAD
 	/* Be a little more generous and hope to hold fewer mmappings */
 	alloc = ALIGN(2*size, kgem->partial_buffer_size);
+	if (alloc >= kgem->max_cpu_size)
+		alloc = PAGE_ALIGN(size);
 	if (kgem->has_cpu_bo) {
 		bo = malloc(sizeof(*bo));
 		if (bo == NULL)
commit 11d489814f56e1f91f5d2c84c5363e7dfc7eef67
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Jan 27 22:18:30 2012 +0000

    sna: Limit inplace upload buffers to maximum mappable size
    
    References: https://bugs.freedesktop.org/show_bug.cgi?id=45323
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
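
A hedged sketch of the demotion policy (the quarter-of-the-aperture
threshold comes from the hunk below; the flag values and names here are
illustrative): an inplace upload writes through a GTT mapping, so a
buffer that would pin too much of the mappable aperture loses its
INPLACE flag and is uploaded by pwrite instead.

/* Illustrative flag demotion, not the driver's definitions. */
#include <stdio.h>

#define KGEM_BUFFER_WRITE   0x1
#define KGEM_BUFFER_INPLACE 0x2

static unsigned limit_inplace(unsigned flags, unsigned alloc,
			      unsigned aperture_mappable)
{
	/* Writing through the GTT needs the bo resident in the mappable
	 * aperture; binding more than a quarter of it risks eviction
	 * stalls, so fall back to pwrite for oversized buffers. */
	if (alloc > aperture_mappable / 4)
		flags &= ~KGEM_BUFFER_INPLACE;
	return flags;
}

int main(void)
{
	unsigned flags = KGEM_BUFFER_WRITE | KGEM_BUFFER_INPLACE;

	flags = limit_inplace(flags, 128 << 20, 256 << 20);
	printf("inplace still set? %s\n",
	       flags & KGEM_BUFFER_INPLACE ? "yes" : "no");
	return 0;
}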

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 95b67cf..a5c47d6 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -3001,6 +3001,7 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 {
 	struct kgem_partial_bo *bo;
 	unsigned offset, alloc;
+	struct kgem_bo *old;
 
 	DBG(("%s: size=%d, flags=%x [write?=%d, inplace?=%d, last?=%d]\n",
 	     __FUNCTION__, size, flags,
@@ -3008,6 +3009,8 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 	     !!(flags & KGEM_BUFFER_INPLACE),
 	     !!(flags & KGEM_BUFFER_LAST)));
 	assert(size);
+	/* we should never be asked to create anything TOO large */
+	assert(size < kgem->max_cpu_buffer);
 
 	list_for_each_entry(bo, &kgem->partial, base.list) {
 		if (flags == KGEM_BUFFER_LAST && bo->write) {
@@ -3047,14 +3050,10 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 		break;
 	}
 
-	/* Be a little more generous and hope to hold fewer mmappings */
-	bo = NULL;
-
 #if !DBG_NO_MAP_UPLOAD
-	alloc = ALIGN(size, kgem->partial_buffer_size);
+	/* Be a little more generous and hope to hold fewer mmappings */
+	alloc = ALIGN(2*size, kgem->partial_buffer_size);
 	if (kgem->has_cpu_bo) {
-		struct kgem_bo *old;
-
 		bo = malloc(sizeof(*bo));
 		if (bo == NULL)
 			return NULL;
@@ -3098,14 +3097,18 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 			bo->mmapped = true;
 
 			alloc = bo->base.size;
+			goto init;
 		} else {
 			bo->base.refcnt = 0; /* for valgrind */
 			kgem_bo_free(kgem, &bo->base);
 			bo = NULL;
 		}
-	} else if ((flags & KGEM_BUFFER_WRITE_INPLACE) == KGEM_BUFFER_WRITE_INPLACE) {
-		struct kgem_bo *old;
+	}
+
+	if (alloc > kgem->aperture_mappable / 4)
+		flags &= ~KGEM_BUFFER_INPLACE;
 
+	if ((flags & KGEM_BUFFER_WRITE_INPLACE) == KGEM_BUFFER_WRITE_INPLACE) {
 		/* The issue with using a GTT upload buffer is that we may
 		 * cause eviction-stalls in order to free up some GTT space.
 		 * An is-mappable? ioctl could help us detect when we are
@@ -3160,6 +3163,7 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 				bo->base.refcnt = 1;
 
 				alloc = bo->base.size;
+				goto init;
 			} else {
 				kgem_bo_free(kgem, &bo->base);
 				bo = NULL;
@@ -3169,88 +3173,84 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 #else
 	alloc = ALIGN(size, 64*1024);
 #endif
+	/* Be more parsimonious with pwrite/pread buffers */
+	if ((flags & KGEM_BUFFER_INPLACE) == 0)
+		alloc = PAGE_ALIGN(size);
+	flags &= ~KGEM_BUFFER_INPLACE;
+
+	old = NULL;
+	if ((flags & KGEM_BUFFER_WRITE) == 0)
+		old = search_linear_cache(kgem, alloc, 0);
+	if (old == NULL)
+		old = search_linear_cache(kgem, alloc, CREATE_INACTIVE);
+	if (old) {
+		DBG(("%s: reusing ordinary handle %d for io\n",
+		     __FUNCTION__, old->handle));
+		alloc = old->size;
+		bo = partial_bo_alloc(alloc);
+		if (bo == NULL)
+			return NULL;
 
-	if (bo == NULL) {
-		struct kgem_bo *old;
-
-		/* Be more parsimonious with pwrite/pread buffers */
-		if ((flags & KGEM_BUFFER_INPLACE) == 0)
-			alloc = PAGE_ALIGN(size);
-		flags &= ~KGEM_BUFFER_INPLACE;
+		memcpy(&bo->base, old, sizeof(*old));
+		if (old->rq)
+			list_replace(&old->request,
+				     &bo->base.request);
+		else
+			list_init(&bo->base.request);
+		list_replace(&old->vma, &bo->base.vma);
+		list_init(&bo->base.list);
+		free(old);
+		bo->base.refcnt = 1;
+
+		bo->need_io = flags & KGEM_BUFFER_WRITE;
+		bo->base.io = true;
+	} else {
+		bo = malloc(sizeof(*bo));
+		if (bo == NULL)
+			return NULL;
 
-		old = NULL;
-		if ((flags & KGEM_BUFFER_WRITE) == 0)
-			old = search_linear_cache(kgem, alloc, 0);
-		if (old == NULL)
-			old = search_linear_cache(kgem, alloc, CREATE_INACTIVE);
+		old = search_linear_cache(kgem, alloc,
+					  CREATE_INACTIVE | CREATE_CPU_MAP);
 		if (old) {
-			DBG(("%s: reusing ordinary handle %d for io\n",
+			DBG(("%s: reusing cpu map handle=%d for buffer\n",
 			     __FUNCTION__, old->handle));
-			alloc = old->size;
-			bo = partial_bo_alloc(alloc);
-			if (bo == NULL)
-				return NULL;
 
 			memcpy(&bo->base, old, sizeof(*old));
 			if (old->rq)
-				list_replace(&old->request,
-					     &bo->base.request);
+				list_replace(&old->request, &bo->base.request);
 			else
 				list_init(&bo->base.request);
 			list_replace(&old->vma, &bo->base.vma);
 			list_init(&bo->base.list);
 			free(old);
 			bo->base.refcnt = 1;
-
-			bo->need_io = flags & KGEM_BUFFER_WRITE;
-			bo->base.io = true;
 		} else {
-			bo = malloc(sizeof(*bo));
-			if (bo == NULL)
-				return NULL;
-
-			old = search_linear_cache(kgem, alloc,
-						  CREATE_INACTIVE | CREATE_CPU_MAP);
-			if (old) {
-				DBG(("%s: reusing cpu map handle=%d for buffer\n",
-				     __FUNCTION__, old->handle));
-
-				memcpy(&bo->base, old, sizeof(*old));
-				if (old->rq)
-					list_replace(&old->request, &bo->base.request);
-				else
-					list_init(&bo->base.request);
-				list_replace(&old->vma, &bo->base.vma);
-				list_init(&bo->base.list);
-				free(old);
-				bo->base.refcnt = 1;
-			} else {
-				if (!__kgem_bo_init(&bo->base,
-						    gem_create(kgem->fd, alloc),
-						    alloc)) {
-					free(bo);
-					return NULL;
-				}
-				DBG(("%s: created handle=%d for buffer\n",
-				     __FUNCTION__, bo->base.handle));
-
-				bo->base.domain = DOMAIN_CPU;
-			}
-
-			bo->mem = kgem_bo_map__cpu(kgem, &bo->base);
-			if (bo->mem == NULL) {
-				kgem_bo_free(kgem, &bo->base);
+			if (!__kgem_bo_init(&bo->base,
+					    gem_create(kgem->fd, alloc),
+					    alloc)) {
+				free(bo);
 				return NULL;
 			}
+			DBG(("%s: created handle=%d for buffer\n",
+			     __FUNCTION__, bo->base.handle));
 
-			if (flags & KGEM_BUFFER_WRITE)
-				kgem_bo_sync__cpu(kgem, &bo->base);
+			bo->base.domain = DOMAIN_CPU;
+		}
 
-			bo->need_io = false;
-			bo->base.io = true;
-			bo->mmapped = true;
+		bo->mem = kgem_bo_map__cpu(kgem, &bo->base);
+		if (bo->mem == NULL) {
+			kgem_bo_free(kgem, &bo->base);
+			return NULL;
 		}
+
+		if (flags & KGEM_BUFFER_WRITE)
+			kgem_bo_sync__cpu(kgem, &bo->base);
+
+		bo->need_io = false;
+		bo->base.io = true;
+		bo->mmapped = true;
 	}
+init:
 	bo->base.reusable = false;
 	assert(bo->base.size == alloc);
 	assert(!bo->need_io || !bo->base.needs_flush);
commit 15499185a9b07dc1117a89f2940881cfe7867f81
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Jan 27 20:28:44 2012 +0000

    sna/video: Ensure the video pixmap is on the GPU
    
    The presumption that the pixmap is the scanout, and so will always be
    pinned, is false if there is a shadow or when running under a
    compositor. In those cases, the pixmap may be idle and so the GPU bo
    reaped. This was compounded by the fact that the video path did not
    mark the pixmap as busy. So whilst watching a video under xfce4 with
    compositing enabled (it has to be a non-GL compositor), the video
    would suddenly stall.
    
    Reported-by: Paul Neumann <paul104x at yahoo.de>
    Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=45279
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna.h b/src/sna/sna.h
index bc14967..5910daf 100644
--- a/src/sna/sna.h
+++ b/src/sna/sna.h
@@ -476,13 +476,6 @@ sna_drawable_move_to_gpu(DrawablePtr drawable, unsigned flags)
 	return sna_pixmap_move_to_gpu(get_drawable_pixmap(drawable), flags) != NULL;
 }
 
-static inline Bool
-sna_pixmap_is_gpu(PixmapPtr pixmap)
-{
-	struct sna_pixmap *priv = pixmap ? sna_pixmap(pixmap) : NULL;
-	return priv && priv->gpu_bo;
-}
-
 static inline struct kgem_bo *sna_pixmap_get_bo(PixmapPtr pixmap)
 {
 	return sna_pixmap(pixmap)->gpu_bo;
diff --git a/src/sna/sna_video_textured.c b/src/sna/sna_video_textured.c
index 6f69135..d99f884 100644
--- a/src/sna/sna_video_textured.c
+++ b/src/sna/sna_video_textured.c
@@ -240,7 +240,7 @@ sna_video_textured_put_image(ScrnInfoPtr scrn,
 		return BadAlloc;
 	}
 
-	if (!sna_pixmap_is_gpu(pixmap)) {
+	if (!sna_pixmap_force_to_gpu(pixmap, MOVE_READ | MOVE_WRITE)) {
 		DBG(("%s: attempting to render to a non-GPU pixmap\n",
 		     __FUNCTION__));
 		return BadAlloc;
commit ba2e7b0803d667624a8b28291fa952671ed9b4c6
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Jan 27 20:12:49 2012 +0000

    sna: Use a proxy rather than a temporary bo for too-tall but thin targets
    
    If the render target is thin enough to fit within the 3D pipeline, but is
    too tall, we can fudge the address of the origin and coordinates to fit
    within the constraints of the pipeline.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
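
A minimal standalone sketch of the windowing arithmetic this message
describes (TILE_ROW and every name below are illustrative assumptions,
not the driver's API; the hunk's "y1 &= y1 & (64 - 1)" appears intended
as a round-down to tile-row alignment, which the sketch spells out):
clamp the y-range of interest to tile rows, refuse if the window is
still taller than the pipeline allows, and otherwise address the bo at
a row-aligned byte offset.

#include <assert.h>
#include <stdio.h>

#define TILE_ROW 64	/* assumed worst-case tile-row alignment */

/* Given the rows [y1, y2) we want to render into, return the byte
 * offset of the aligned window and its height in rows, or -1 if the
 * target is too tall even after windowing. */
static long window_offset(int y1, int y2, int pitch, int max_3d_size,
			  int *height)
{
	y1 &= ~(TILE_ROW - 1);				/* round start down */
	y2 = (y2 + TILE_ROW - 1) & ~(TILE_ROW - 1);	/* round end up */

	if (y2 - y1 > max_3d_size)	/* window still too tall: give up */
		return -1;

	*height = y2 - y1;
	return (long)y1 * pitch;	/* origin fudge: byte offset into the bo */
}

int main(void)
{
	int height;
	long offset = window_offset(8192, 8700, 256, 8192, &height);

	assert(offset >= 0);
	printf("proxy offset=%ld bytes, window=%d rows\n", offset, height);
	return 0;
}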

diff --git a/src/sna/sna_damage.c b/src/sna/sna_damage.c
index d15ec60..ab825aa 100644
--- a/src/sna/sna_damage.c
+++ b/src/sna/sna_damage.c
@@ -187,6 +187,11 @@ static struct sna_damage *_sna_damage_create(void)
 	return damage;
 }
 
+struct sna_damage *sna_damage_create(void)
+{
+	return _sna_damage_create();
+}
+
 static bool _sna_damage_create_boxes(struct sna_damage *damage,
 				     int count)
 {
@@ -1395,6 +1400,21 @@ int _sna_damage_get_boxes(struct sna_damage *damage, BoxPtr *boxes)
 }
 #endif
 
+struct sna_damage *_sna_damage_combine(struct sna_damage *l,
+				       struct sna_damage *r,
+				       int dx, int dy)
+{
+	if (r->dirty)
+		__sna_damage_reduce(r);
+
+	if (pixman_region_not_empty(&r->region)) {
+		pixman_region_translate(&r->region, dx, dy);
+		l = __sna_damage_add(l, &r->region);
+	}
+
+	return l;
+}
+
 void __sna_damage_destroy(struct sna_damage *damage)
 {
 	free_list(&damage->embedded_box.list);
diff --git a/src/sna/sna_damage.h b/src/sna/sna_damage.h
index 228aba0..422a124 100644
--- a/src/sna/sna_damage.h
+++ b/src/sna/sna_damage.h
@@ -27,6 +27,18 @@ struct sna_damage {
 #define DAMAGE_MARK_ALL(ptr) ((struct sna_damage *)(((uintptr_t)(ptr))|1))
 #define DAMAGE_PTR(ptr) ((struct sna_damage *)(((uintptr_t)(ptr))&~1))
 
+struct sna_damage *sna_damage_create(void);
+
+struct sna_damage *_sna_damage_combine(struct sna_damage *l,
+				       struct sna_damage *r,
+				       int dx, int dy);
+static inline void sna_damage_combine(struct sna_damage **l,
+				      struct sna_damage *r,
+				      int dx, int dy)
+{
+	*l = _sna_damage_combine(*l, r, dx, dy);
+}
+
 fastcall struct sna_damage *_sna_damage_add(struct sna_damage *damage,
 					    RegionPtr region);
 static inline void sna_damage_add(struct sna_damage **damage,
diff --git a/src/sna/sna_render.c b/src/sna/sna_render.c
index 513b489..f9151e0 100644
--- a/src/sna/sna_render.c
+++ b/src/sna/sna_render.c
@@ -1519,14 +1519,39 @@ sna_render_composite_redirect(struct sna *sna,
 	if (!width || !height)
 		return FALSE;
 
-	priv = sna_pixmap(op->dst.pixmap);
-	if (priv->gpu_bo == NULL) {
+	priv = sna_pixmap_force_to_gpu(op->dst.pixmap, MOVE_READ | MOVE_WRITE);
+	if (priv == NULL) {
 		DBG(("%s: fallback -- no GPU bo attached\n", __FUNCTION__));
 		return FALSE;
 	}
 
-	if (!sna_pixmap_move_to_gpu(op->dst.pixmap, MOVE_READ | MOVE_WRITE))
-		return FALSE;
+	if (op->dst.pixmap->drawable.width <= sna->render.max_3d_size) {
+		int y1, y2;
+
+		assert(op->dst.pixmap.drawable.height > sna->render.max_3d_size);
+		y1 =  y + op->dst.y;
+		y2 =  y1 + height;
+		y1 &= y1 & (64 - 1);
+		y2 = ALIGN(y2, 64);
+
+		if (y2 - y1 <= sna->render.max_3d_size) {
+			t->box.x2 = t->box.x1 = op->dst.x;
+			t->box.y2 = t->box.y1 = op->dst.y;
+			t->real_bo = priv->gpu_bo;
+			t->real_damage = op->damage;
+			if (op->damage) {
+				t->damage = sna_damage_create();
+				op->damage = &t->damage;
+			}
+
+			op->dst.bo = kgem_create_proxy(priv->gpu_bo,
+						       y1 * priv->gpu_bo->pitch,
+						       (y2 - y1) * priv->gpu_bo->pitch);
+			op->dst.y += -y1;
+			op->dst.height = y2 - y1;
+			return TRUE;
+		}
+	}
 
 	/* We can process the operation in a single pass,
 	 * but the target is too large for the 3D pipeline.
@@ -1557,12 +1582,17 @@ sna_render_composite_redirect(struct sna *sna,
 	}
 
 	t->real_bo = priv->gpu_bo;
+	t->real_damage = op->damage;
+	if (op->damage) {
+		t->damage = sna_damage_create();
+		op->damage = &t->damage;
+	}
+
 	op->dst.bo = bo;
 	op->dst.x = -x;
 	op->dst.y = -y;
 	op->dst.width  = width;
 	op->dst.height = height;
-	op->damage = NULL;
 	return TRUE;
 }
 
@@ -1573,13 +1603,20 @@ sna_render_composite_redirect_done(struct sna *sna,
 	const struct sna_composite_redirect *t = &op->redirect;
 
 	if (t->real_bo) {
-		DBG(("%s: copying temporary to dst\n", __FUNCTION__));
-
-		sna_blt_copy_boxes(sna, GXcopy,
-				   op->dst.bo, -t->box.x1, -t->box.y1,
-				   t->real_bo, 0, 0,
-				   op->dst.pixmap->drawable.bitsPerPixel,
-				   &t->box, 1);
+		if (t->box.x2 > t->box.x1) {
+			DBG(("%s: copying temporary to dst\n", __FUNCTION__));
+			sna_blt_copy_boxes(sna, GXcopy,
+					   op->dst.bo, -t->box.x1, -t->box.y1,
+					   t->real_bo, 0, 0,
+					   op->dst.pixmap->drawable.bitsPerPixel,
+					   &t->box, 1);
+		}
+		if (t->damage) {
+			sna_damage_combine(t->real_damage, t->damage,
+					   t->box.x1 - op->dst.x,
+					   t->box.y1 - op->dst.y);
+			__sna_damage_destroy(t->damage);
+		}
 
 		kgem_bo_destroy(&sna->kgem, op->dst.bo);
 	}
diff --git a/src/sna/sna_render.h b/src/sna/sna_render.h
index c4711f4..b23a8a7 100644
--- a/src/sna/sna_render.h
+++ b/src/sna/sna_render.h
@@ -86,6 +86,7 @@ struct sna_composite_op {
 
 	struct sna_composite_redirect {
 		struct kgem_bo *real_bo;
+		struct sna_damage **real_damage, *damage;
 		BoxRec box;
 	} redirect;
 
commit 20bd415c441accc5424fa64cd0c2135edca62093
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Jan 27 19:34:39 2012 +0000

    sna: Experiment with a partial source
    
    If the source is thin enough that the pitch is within the sampler's
    constraints, and the sample size is small enough, just fudge the origin
    of the bo so that it can be sampled.
    
    This avoids having to create a temporary bo and use the BLT to extract
    it, and helps, for example, firefox-asteroids, which uses a 64x11200
    texture atlas.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
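
As a hedged sketch of the compensation this requires (pixman-style 16.16
fixed point; the struct and names are illustrative, not the driver's):
shifting the bo origin down by y1 rows must be paired with a -y1
translation folded into the sampler transform, so the same texels are
fetched from the now-shorter proxy.

#include <stdio.h>

struct matrix { long m[3][3]; };	/* 16.16 fixed point, as pixman uses */

static void translate_up(struct matrix *t, int y1)
{
	/* Identity except for a -y1 translation in y. */
	t->m[0][0] = 1 << 16; t->m[0][1] = 0;       t->m[0][2] = 0;
	t->m[1][0] = 0;       t->m[1][1] = 1 << 16; t->m[1][2] = -((long)y1 << 16);
	t->m[2][0] = 0;       t->m[2][1] = 0;       t->m[2][2] = 1 << 16;
}

int main(void)
{
	struct matrix t;
	int y1 = 4096, pitch = 256;	/* assumed sample window and pitch */

	translate_up(&t, y1);
	printf("proxy offset=%d bytes, ty=%ld (16.16)\n", y1 * pitch, t.m[1][2]);
	return 0;
}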

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 4df29d2..95b67cf 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -2967,7 +2967,6 @@ struct kgem_bo *kgem_create_proxy(struct kgem_bo *target,
 	DBG(("%s: target handle=%d, offset=%d, length=%d, io=%d\n",
 	     __FUNCTION__, target->handle, offset, length, target->io));
 	assert(target->proxy == NULL);
-	assert(target->tiling == I915_TILING_NONE);
 
 	bo = __kgem_bo_alloc(target->handle, length);
 	if (bo == NULL)
@@ -2975,9 +2974,11 @@ struct kgem_bo *kgem_create_proxy(struct kgem_bo *target,
 
 	bo->io = target->io;
 	bo->dirty = target->dirty;
-	bo->reusable = false;
+	bo->tiling = target->tiling;
+	bo->pitch = target->pitch;
 	bo->proxy = kgem_bo_reference(target);
 	bo->delta = offset;
+	bo->reusable = false;
 	return bo;
 }
 
diff --git a/src/sna/sna_render.c b/src/sna/sna_render.c
index 9d7857c..513b489 100644
--- a/src/sna/sna_render.c
+++ b/src/sna/sna_render.c
@@ -800,6 +800,103 @@ cleanup_tmp:
 	return ret;
 }
 
+static int
+sna_render_picture_partial(struct sna *sna,
+			   PicturePtr picture,
+			   struct sna_composite_channel *channel,
+			   int16_t x, int16_t y,
+			   int16_t w, int16_t h,
+			   int16_t dst_x, int16_t dst_y)
+{
+	struct kgem_bo *bo = NULL;
+	PixmapPtr pixmap = get_drawable_pixmap(picture->pDrawable);
+	BoxRec box;
+
+	DBG(("%s (%d, %d)x(%d, %d) [dst=(%d, %d)]\n",
+	     __FUNCTION__, x, y, w, h, dst_x, dst_y));
+
+	box.x1 = x;
+	box.y1 = y;
+	box.x2 = x + w;
+	box.y2 = y + h;
+	if (channel->transform)
+		pixman_transform_bounds(channel->transform, &box);
+
+	DBG(("%s sample=(%d, %d), (%d, %d): (%d, %d)/(%d, %d), repeat=%d\n", __FUNCTION__,
+	     box.x1, box.y1, box.x2, box.y2, w, h,
+	     pixmap->drawable.width, pixmap->drawable.height,
+	     channel->repeat));
+
+	if (channel->repeat == RepeatNone || channel->repeat == RepeatPad) {
+		if (box.x1 < 0)
+			box.x1 = 0;
+		if (box.y1 < 0)
+			box.y1 = 0;
+		if (box.x2 > pixmap->drawable.width)
+			box.x2 = pixmap->drawable.width;
+		if (box.y2 > pixmap->drawable.height)
+			box.y2 = pixmap->drawable.height;
+	} else {
+		if (box.x1 < 0 ||
+		    box.y1 < 0 ||
+		    box.x2 > pixmap->drawable.width ||
+		    box.y2 > pixmap->drawable.height) {
+			box.x1 = box.y1 = 0;
+			box.x2 = pixmap->drawable.width;
+			box.y2 = pixmap->drawable.height;
+
+			if (!channel->is_affine)
+				return 0;
+		}
+	}
+
+	/* Presume worst case tile-row alignment for Y-tiling */
+	box.y1 = box.y1 & (64 - 1);
+	box.y2 = ALIGN(box.y2, 64);
+	w = box.x2 - box.x1;
+	h = box.y2 - box.y1;
+	DBG(("%s box=(%d, %d), (%d, %d): (%d, %d)/(%d, %d)\n", __FUNCTION__,
+	     box.x1, box.y1, box.x2, box.y2, w, h,
+	     pixmap->drawable.width, pixmap->drawable.height));
+	if (w <= 0 || h <= 0 || h > sna->render.max_3d_size)
+		return 0;
+
+	memset(&channel->embedded_transform,
+	       0,
+	       sizeof(channel->embedded_transform));
+	channel->embedded_transform.matrix[0][0] = 1 << 16;
+	channel->embedded_transform.matrix[0][2] = 0;
+	channel->embedded_transform.matrix[1][1] = 1 << 16;
+	channel->embedded_transform.matrix[1][2] = -box.y1 << 16;
+	channel->embedded_transform.matrix[2][2] = 1 << 16;
+	if (channel->transform)
+		pixman_transform_multiply(&channel->embedded_transform,
+					  &channel->embedded_transform,
+					  channel->transform);
+	channel->transform = &channel->embedded_transform;
+
+	if (use_cpu_bo(sna, pixmap, &box)) {
+		if (!sna_pixmap_move_to_cpu(pixmap, MOVE_READ))
+			return 0;
+
+		bo = sna_pixmap(pixmap)->cpu_bo;
+	} else {
+		if (!sna_pixmap_force_to_gpu(pixmap, MOVE_READ))
+			return 0;
+
+		bo = sna_pixmap(pixmap)->gpu_bo;
+	}
+
+	channel->offset[0] = x - dst_x;
+	channel->offset[1] = y - dst_y;
+	channel->scale[0] = 1.f/pixmap->drawable.width;
+	channel->scale[1] = 1.f/h;
+	channel->width  = pixmap->drawable.width;
+	channel->height = h;
+	channel->bo = kgem_create_proxy(bo, box.y1 * bo->pitch, h * bo->pitch);
+	return channel->bo != NULL;
+}
+
 int
 sna_render_picture_extract(struct sna *sna,
 			   PicturePtr picture,
@@ -825,6 +922,12 @@ sna_render_picture_extract(struct sna *sna,
 		return -1;
 	}
 
+	if (pixmap->drawable.width < sna->render.max_3d_size &&
+	    sna_render_picture_partial(sna, picture, channel,
+				       x, y, w, h,
+				       dst_x, dst_y))
+		return 1;
+
 	ow = w;
 	oh = h;
 
commit 3feed86fc361d289da2bee71752146a7b66f0962
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Jan 27 18:37:39 2012 +0000

    sna: Mark diagonal lines as partial write
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index abcf559..f2997d0 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -6337,7 +6337,7 @@ fallback:
 		goto out;
 	if (!sna_drawable_move_region_to_cpu(drawable, &data.region,
 					     drawable_gc_flags(drawable, gc,
-							       n > 2)))
+							       !(data.flags & 4 && n == 2))))
 		goto out;
 
 	/* Install FillSpans in case we hit a fallback path in fbPolyLine */
@@ -7249,7 +7249,7 @@ fallback:
 		goto out;
 	if (!sna_drawable_move_region_to_cpu(drawable, &data.region,
 					     drawable_gc_flags(drawable, gc,
-							       n > 1)))
+							       !(data.flags & 4 && n > 1))))
 		goto out;
 
 	/* Install FillSpans in case we hit a fallback path in fbPolySegment */
commit 0d9aa0e72727f172b31e8577e1fecda2cdb1bd5a
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Jan 27 18:24:01 2012 +0000

    sna/video: Add some DBG messages to track the error paths
    
    References: https://bugs.freedesktop.org/show_bug.cgi?id=45279
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_video_textured.c b/src/sna/sna_video_textured.c
index 09f1551..6f69135 100644
--- a/src/sna/sna_video_textured.c
+++ b/src/sna/sna_video_textured.c
@@ -235,11 +235,16 @@ sna_video_textured_put_image(ScrnInfoPtr scrn,
 	     drw_x, drw_y, drw_w, drw_h,
 	     id, width, height, sync));
 
-	if (buf == 0)
+	if (buf == 0) {
+		DBG(("%s: garbage video buffer\n", __FUNCTION__));
 		return BadAlloc;
+	}
 
-	if (!sna_pixmap_is_gpu(pixmap))
+	if (!sna_pixmap_is_gpu(pixmap)) {
+		DBG(("%s: attempting to render to a non-GPU pixmap\n",
+		     __FUNCTION__));
 		return BadAlloc;
+	}
 
 	sna_video_frame_init(sna, video, id, width, height, &frame);
 
@@ -261,16 +266,21 @@ sna_video_textured_put_image(ScrnInfoPtr scrn,
 		}
 
 		frame.bo = kgem_create_for_name(&sna->kgem, *(uint32_t*)buf);
-		if (frame.bo == NULL)
+		if (frame.bo == NULL) {
+			DBG(("%s: failed to open bo\n", __FUNCTION__));
 			return BadAlloc;
+		}
 
 		assert(frame.bo->size >= frame.size);
 	} else {
 		frame.bo = kgem_create_linear(&sna->kgem, frame.size);
-		if (frame.bo == NULL)
+		if (frame.bo == NULL) {
+			DBG(("%s: failed to allocate bo\n", __FUNCTION__));
 			return BadAlloc;
+		}
 
 		if (!sna_video_copy_data(sna, video, &frame, buf)) {
+			DBG(("%s: failed to copy frame\n", __FUNCTION__));
 			kgem_bo_destroy(&sna->kgem, frame.bo);
 			return BadAlloc;
 		}
@@ -281,13 +291,14 @@ sna_video_textured_put_image(ScrnInfoPtr scrn,
 					      &clip->extents);
 
 	ret = Success;
-	if (sna->render.video(sna, video, &frame, clip,
+	if (!sna->render.video(sna, video, &frame, clip,
 			      src_w, src_h,
 			      drw_w, drw_h,
-			      pixmap))
-		DamageDamageRegion(drawable, clip);
-	else
+			      pixmap)) {
+		DBG(("%s: failed to render video\n", __FUNCTION__));
 		ret = BadAlloc;
+	} else
+		DamageDamageRegion(drawable, clip);
 
 	kgem_bo_destroy(&sna->kgem, frame.bo);
 
commit 7b1d1010f36eb62a42b6dff4681226ac54a427e8
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Jan 27 15:45:17 2012 +0000

    sna: Consolidate routines to choose the destination bo
    
    Combine into a single function the two very similar routines that
    decided whether we should render into the GPU bo, the CPU bo or the
    shadow pixmap.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
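
A sketch of the consolidated shape (purely illustrative types, not the
driver's): one chooser returns the bo to render with and, through an
out-parameter, the damage tracker to update; returning NULL sends the
caller down the software fallback. This replaces the old pair of
use-gpu/use-cpu boolean helpers that each caller had to try in turn.

#include <stddef.h>
#include <stdio.h>

struct bo { const char *name; };
struct damage { int dummy; };

struct pixmap {
	struct bo *gpu_bo, *cpu_bo;
	struct damage *gpu_damage, *cpu_damage;
	int all_damage_on_gpu, all_damage_on_cpu;
};

static struct bo *use_bo(struct pixmap *p, struct damage ***damage)
{
	if (p->all_damage_on_gpu) {	/* everything already on the GPU */
		*damage = NULL;
		return p->gpu_bo;
	}
	if (p->all_damage_on_cpu && p->cpu_bo) {  /* prefer the CPU bo */
		*damage = &p->cpu_damage;
		return p->cpu_bo;
	}
	if (p->gpu_bo) {	/* mixed damage: track further GPU damage */
		*damage = &p->gpu_damage;
		return p->gpu_bo;
	}
	return NULL;		/* no usable bo: caller takes the fallback */
}

int main(void)
{
	struct bo gpu = { "gpu" };
	struct pixmap p = { &gpu, NULL, NULL, NULL, 1, 0 };
	struct damage **damage;
	struct bo *bo = use_bo(&p, &damage);

	printf("using %s bo, damage tracking %s\n",
	       bo ? bo->name : "no", damage ? "on" : "off");
	return 0;
}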

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 840012f..abcf559 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -1684,50 +1684,48 @@ box_inplace(PixmapPtr pixmap, const BoxRec *box)
 	return ((box->x2 - box->x1) * (box->y2 - box->y1) * pixmap->drawable.bitsPerPixel >> 15) >= sna->kgem.half_cpu_cache_pages;
 }
 
-static inline Bool
-_sna_drawable_use_gpu_bo(DrawablePtr drawable,
-			 const BoxRec *box,
-			 struct sna_damage ***damage)
+static inline struct kgem_bo *
+sna_drawable_use_bo(DrawablePtr drawable,
+		    const BoxRec *box,
+		    struct sna_damage ***damage)
 {
 	PixmapPtr pixmap = get_drawable_pixmap(drawable);
 	struct sna_pixmap *priv = sna_pixmap(pixmap);
 	BoxRec extents;
 	int16_t dx, dy;
+	int ret;
+
+	DBG(("%s((%d, %d), (%d, %d))...\n", __FUNCTION__,
+	     box->x1, box->y1, box->x2, box->y2));
 
 	assert_drawable_contains_box(drawable, box);
 
 	if (priv == NULL) {
 		DBG(("%s: not attached\n", __FUNCTION__));
-		return FALSE;
+		return NULL;
 	}
 
-	if (DAMAGE_IS_ALL(priv->gpu_damage)) {
-		*damage = NULL;
-		if (!priv->pinned)
-			list_move(&priv->inactive,
-				  &to_sna_from_pixmap(pixmap)->active_pixmaps);
-		priv->clear = false;
-		return TRUE;
-	}
+	if (DAMAGE_IS_ALL(priv->gpu_damage))
+		goto use_gpu_bo;
 
 	if (DAMAGE_IS_ALL(priv->cpu_damage))
-		return FALSE;
+		goto use_cpu_bo;
 
 	if (priv->gpu_bo == NULL) {
 		if (!priv->gpu) {
 			DBG(("%s: untiled, will not force allocation\n",
 			     __FUNCTION__));
-			return FALSE;
+			goto use_cpu_bo;
 		}
 
 		if (priv->cpu_damage && !box_inplace(pixmap, box)) {
 			DBG(("%s: damaged with a small operation, will not force allocation\n",
 			     __FUNCTION__));
-			return FALSE;
+			goto use_cpu_bo;
 		}
 
 		if (!sna_pixmap_move_to_gpu(pixmap, MOVE_WRITE | MOVE_READ))
-			return FALSE;
+			goto use_cpu_bo;
 
 		DBG(("%s: allocated GPU bo for operation\n", __FUNCTION__));
 		goto done;
@@ -1745,34 +1743,39 @@ _sna_drawable_use_gpu_bo(DrawablePtr drawable,
 	     extents.x1, extents.y1, extents.x2, extents.y2));
 
 	if (priv->gpu_damage) {
-		int ret = sna_damage_contains_box(priv->gpu_damage, &extents);
+		if (!priv->cpu_damage) {
+			if (sna_damage_contains_box__no_reduce(priv->gpu_damage,
+							       &extents)) {
+				DBG(("%s: region wholly contained within GPU damage\n",
+				     __FUNCTION__));
+				goto use_gpu_bo;
+			} else {
+				DBG(("%s: partial GPU damage with no CPU damage, continuing to use GPU\n",
+				     __FUNCTION__));
+				goto move_to_gpu;
+			}
+		}
+
+		ret = sna_damage_contains_box(priv->gpu_damage, &extents);
 		if (ret == PIXMAN_REGION_IN) {
 			DBG(("%s: region wholly contained within GPU damage\n",
 			     __FUNCTION__));
-			*damage = NULL;
-			return TRUE;
+			goto use_gpu_bo;
 		}
 
-		if (ret != PIXMAN_REGION_OUT &&
-		    (priv->cpu_bo || kgem_bo_is_busy(priv->gpu_bo))) {
-			DBG(("%s: region partially contained within busy GPU damage\n",
+		if (ret != PIXMAN_REGION_OUT) {
+			DBG(("%s: region partially contained within GPU damage\n",
 			     __FUNCTION__));
 			goto move_to_gpu;
 		}
 	}
 
-	if (priv->cpu_bo && kgem_bo_is_busy(priv->cpu_bo)) {
-		DBG(("%s: busy CPU bo, prefer to use GPU\n",
-		     __FUNCTION__));
-		goto move_to_gpu;
-	}
-
 	if (priv->cpu_damage) {
-		int ret = sna_damage_contains_box(priv->cpu_damage, &extents);
+		ret = sna_damage_contains_box(priv->cpu_damage, &extents);
 		if (ret == PIXMAN_REGION_IN) {
 			DBG(("%s: region wholly contained within CPU damage\n",
 			     __FUNCTION__));
-			return FALSE;
+			goto use_cpu_bo;
 		}
 
 		if (box_inplace(pixmap, box)) {
@@ -1780,11 +1783,10 @@ _sna_drawable_use_gpu_bo(DrawablePtr drawable,
 			goto move_to_gpu;
 		}
 
-		if (ret != PIXMAN_REGION_OUT &&
-		    (priv->cpu_bo || !kgem_bo_is_busy(priv->gpu_bo))) {
-			DBG(("%s: region partially contained within idle CPU damage\n",
+		if (ret != PIXMAN_REGION_OUT) {
+			DBG(("%s: region partially contained within CPU damage\n",
 			     __FUNCTION__));
-			return FALSE;
+			goto use_cpu_bo;
 		}
 	}
 
@@ -1792,81 +1794,58 @@ move_to_gpu:
 	if (!sna_pixmap_move_area_to_gpu(pixmap, &extents,
 					 MOVE_READ | MOVE_WRITE)) {
 		DBG(("%s: failed to move-to-gpu, fallback\n", __FUNCTION__));
-		return FALSE;
+		goto use_cpu_bo;
 	}
 
 done:
-	*damage = DAMAGE_IS_ALL(priv->gpu_damage) ? NULL : &priv->gpu_damage;
-	return TRUE;
-}
-
-static inline Bool
-sna_drawable_use_gpu_bo(DrawablePtr drawable,
-			const BoxRec *box,
-			struct sna_damage ***damage)
-{
-	Bool ret;
-
-	DBG(("%s((%d, %d), (%d, %d))...\n", __FUNCTION__,
-	     box->x1, box->y1, box->x2, box->y2));
-
-	ret = _sna_drawable_use_gpu_bo(drawable, box, damage);
-
-	DBG(("%s((%d, %d), (%d, %d)) = %d\n", __FUNCTION__,
-	     box->x1, box->y1, box->x2, box->y2, ret));
-
-	return ret;
-}
+	if (sna_damage_is_all(&priv->gpu_damage,
+			      pixmap->drawable.width,
+			      pixmap->drawable.height))
+		*damage = NULL;
+	else
+		*damage = &priv->gpu_damage;
 
-static inline Bool
-_sna_drawable_use_cpu_bo(DrawablePtr drawable,
-			 const BoxRec *box,
-			 struct sna_damage ***damage)
-{
-	PixmapPtr pixmap = get_drawable_pixmap(drawable);
-	struct sna_pixmap *priv = sna_pixmap(pixmap);
-	struct sna *sna = to_sna_from_pixmap(pixmap);
-	BoxRec extents;
-	int16_t dx, dy;
+	DBG(("%s: using GPU bo with damage? %d\n",
+	     __FUNCTION__, *damage != NULL));
+	return priv->gpu_bo;
 
-	assert_drawable_contains_box(drawable, box);
+use_gpu_bo:
+	priv->clear = false;
+	if (!priv->pinned)
+		list_move(&priv->inactive,
+			  &to_sna_from_pixmap(pixmap)->active_pixmaps);
+	*damage = NULL;
+	DBG(("%s: using whole GPU bo\n", __FUNCTION__));
+	return priv->gpu_bo;
+
+use_cpu_bo:
+	if (priv->cpu_bo == NULL)
+		return NULL;
 
-	if (priv == NULL || priv->cpu_bo == NULL)
-		return FALSE;
+	/* Continue to use the shadow pixmap once mapped */
+	if (pixmap->devPrivate.ptr) {
+		/* But only if we do not need to sync the CPU bo */
+		if (!kgem_bo_is_busy(priv->cpu_bo))
+			return NULL;
 
-	if (!sna->kgem.has_llc && priv->cpu_bo->domain == DOMAIN_CPU)
-		return FALSE;
+		/* Both CPU and GPU are busy, prefer to use the GPU */
+		if (priv->gpu_bo && kgem_bo_is_busy(priv->gpu_bo))
+			goto move_to_gpu;
 
-	if (DAMAGE_IS_ALL(priv->cpu_damage)) {
-		*damage = NULL;
-		return TRUE;
+		priv->mapped = false;
+		pixmap->devPrivate.ptr = NULL;
 	}
 
-	get_drawable_deltas(drawable, pixmap, &dx, &dy);
-
-	extents = *box;
-	extents.x1 += dx;
-	extents.x2 += dx;
-	extents.y1 += dy;
-	extents.y2 += dy;
-
-	*damage = &priv->cpu_damage;
-	if (priv->cpu_damage &&
-	     sna_damage_contains_box__no_reduce(priv->cpu_damage, &extents))
+	if (sna_damage_is_all(&priv->cpu_damage,
+			      pixmap->drawable.width,
+			      pixmap->drawable.height))
 		*damage = NULL;
+	else
+		*damage = &priv->cpu_damage;
 
-	return TRUE;
-}
-
-static inline Bool
-sna_drawable_use_cpu_bo(DrawablePtr drawable,
-			const BoxRec *box,
-			struct sna_damage ***damage)
-{
-	Bool ret = _sna_drawable_use_cpu_bo(drawable, box, damage);
-	DBG(("%s((%d, %d), (%d, %d)) = %d\n", __FUNCTION__,
-	     box->x1, box->y1, box->x2, box->y2, ret));
-	return ret;
+	DBG(("%s: using CPU bo with damage? %d\n",
+	     __FUNCTION__, *damage != NULL));
+	return priv->cpu_bo;
 }
 
 PixmapPtr
@@ -2617,21 +2596,22 @@ sna_put_xybitmap_blt(DrawablePtr drawable, GCPtr gc, RegionPtr region,
 {
 	PixmapPtr pixmap = get_drawable_pixmap(drawable);
 	struct sna *sna = to_sna_from_pixmap(pixmap);
-	struct sna_pixmap *priv = sna_pixmap(pixmap);
 	struct sna_damage **damage;
+	struct kgem_bo *bo;
 	BoxRec *box;
 	int16_t dx, dy;
 	int n;
 	uint8_t rop = copy_ROP[gc->alu];
 
-	if (!sna_drawable_use_gpu_bo(&pixmap->drawable,
-				     &region->extents,
-				     &damage))
+	bo = sna_drawable_use_bo(&pixmap->drawable, &region->extents, &damage);
+	if (bo == NULL)
 		return false;
 
-	if (priv->gpu_bo->tiling == I915_TILING_Y) {
+	if (bo->tiling == I915_TILING_Y) {
 		DBG(("%s: converting bo from Y-tiling\n", __FUNCTION__));
-		if (!sna_pixmap_change_tiling(pixmap, I915_TILING_X))
+		assert(bo == sna_pixmap_get_bo(pixmap));
+		bo = sna_pixmap_change_tiling(pixmap, I915_TILING_X);
+		if (bo == NULL)
 			return false;
 	}
 
@@ -2663,7 +2643,7 @@ sna_put_xybitmap_blt(DrawablePtr drawable, GCPtr gc, RegionPtr region,
 		void *ptr;
 
 		if (!kgem_check_batch(&sna->kgem, 8) ||
-		    !kgem_check_bo_fenced(&sna->kgem, priv->gpu_bo, NULL) ||
+		    !kgem_check_bo_fenced(&sna->kgem, bo, NULL) ||
 		    !kgem_check_reloc(&sna->kgem, 2)) {
 			_kgem_submit(&sna->kgem);
 			_kgem_set_mode(&sna->kgem, KGEM_BLT);
@@ -2696,8 +2676,8 @@ sna_put_xybitmap_blt(DrawablePtr drawable, GCPtr gc, RegionPtr region,
 		if (drawable->bitsPerPixel == 32)
 			b[0] |= 3 << 20;
 		b[0] |= ((box->x1 - x) & 7) << 17;
-		b[1] = priv->gpu_bo->pitch;
-		if (sna->kgem.gen >= 40 && priv->gpu_bo->tiling) {
+		b[1] = bo->pitch;
+		if (sna->kgem.gen >= 40 && bo->tiling) {
 			b[0] |= BLT_DST_TILED;
 			b[1] >>= 2;
 		}
@@ -2705,8 +2685,7 @@ sna_put_xybitmap_blt(DrawablePtr drawable, GCPtr gc, RegionPtr region,
 		b[1] |= rop << 16;
 		b[2] = box->y1 << 16 | box->x1;
 		b[3] = box->y2 << 16 | box->x2;
-		b[4] = kgem_add_reloc(&sna->kgem, sna->kgem.nbatch + 4,
-				      priv->gpu_bo,
+		b[4] = kgem_add_reloc(&sna->kgem, sna->kgem.nbatch + 4, bo,
 				      I915_GEM_DOMAIN_RENDER << 16 |
 				      I915_GEM_DOMAIN_RENDER |
 				      KGEM_RELOC_FENCED,
@@ -2735,7 +2714,6 @@ sna_put_xypixmap_blt(DrawablePtr drawable, GCPtr gc, RegionPtr region,
 {
 	PixmapPtr pixmap = get_drawable_pixmap(drawable);
 	struct sna *sna = to_sna_from_pixmap(pixmap);
-	struct sna_pixmap *priv = sna_pixmap(pixmap);
 	struct sna_damage **damage;
 	struct kgem_bo *bo;
 	int16_t dx, dy;
@@ -2744,18 +2722,16 @@ sna_put_xypixmap_blt(DrawablePtr drawable, GCPtr gc, RegionPtr region,
 	if (gc->alu != GXcopy)
 		return false;
 
-	if (!sna_drawable_use_gpu_bo(&pixmap->drawable,
-				     &region->extents,
-				     &damage))
+	bo = sna_drawable_use_bo(&pixmap->drawable, &region->extents, &damage);
+	if (bo == NULL)
 		return false;
 
-	bo = priv->gpu_bo;
 	if (bo->tiling == I915_TILING_Y) {
 		DBG(("%s: converting bo from Y-tiling\n", __FUNCTION__));
-		if (!sna_pixmap_change_tiling(pixmap, I915_TILING_X))
+		assert(bo == sna_pixmap_get_bo(pixmap));
+		bo = sna_pixmap_change_tiling(pixmap, I915_TILING_X);
+		if (bo == NULL)
 			return false;
-
-		bo = priv->gpu_bo;
 	}
 
 	assert_pixmap_contains_box(pixmap, RegionExtents(region));
@@ -4511,6 +4487,8 @@ sna_fill_spans(DrawablePtr drawable, GCPtr gc, int n,
 {
 	PixmapPtr pixmap = get_drawable_pixmap(drawable);
 	struct sna *sna = to_sna_from_pixmap(pixmap);
+	struct sna_damage **damage;
+	struct kgem_bo *bo;
 	RegionRec region;
 	unsigned flags;
 	uint32_t color;
@@ -4543,32 +4521,18 @@ sna_fill_spans(DrawablePtr drawable, GCPtr gc, int n,
 	if (!PM_IS_SOLID(drawable, gc->planemask))
 		goto fallback;
 
-	if (gc_is_solid(gc, &color)) {
-		struct sna_pixmap *priv = sna_pixmap(pixmap);
-		struct sna_damage **damage;
-
-		DBG(("%s: trying solid fill [alu=%d, pixel=%08lx] blt paths\n",
-		     __FUNCTION__, gc->alu, gc->fgPixel));
-
-		if (sna_drawable_use_gpu_bo(drawable, &region.extents, &damage) &&
-		    sna_fill_spans_blt(drawable,
-				       priv->gpu_bo, damage,
-				       gc, color, n, pt, width, sorted,
-				       &region.extents, flags & 2))
-			return;
-
-		if (sna_drawable_use_cpu_bo(drawable, &region.extents, &damage) &&
-		    sna_fill_spans_blt(drawable,
-				       priv->cpu_bo, damage,
-				       gc, color, n, pt, width, sorted,
-				       &region.extents, flags & 2))
-			return;
-	} else {
-		struct sna_pixmap *priv = sna_pixmap(pixmap);
-		struct sna_damage **damage;
+	bo = sna_drawable_use_bo(drawable, &region.extents, &damage);
+	if (bo) {
+		if (gc_is_solid(gc, &color)) {
+			DBG(("%s: trying solid fill [alu=%d, pixel=%08lx] blt paths\n",
+			     __FUNCTION__, gc->alu, gc->fgPixel));
 
-		/* Try converting these to a set of rectangles instead */
-		if (sna_drawable_use_gpu_bo(drawable, &region.extents, &damage)) {
+			sna_fill_spans_blt(drawable,
+					   bo, damage,
+					   gc, color, n, pt, width, sorted,
+					   &region.extents, flags & 2);
+		} else {
+			/* Try converting these to a set of rectangles instead */
 			xRectangle *rect;
 			int i;
 
@@ -4587,12 +4551,12 @@ sna_fill_spans(DrawablePtr drawable, GCPtr gc, int n,
 
 			if (gc->fillStyle == FillTiled) {
 				i = sna_poly_fill_rect_tiled_blt(drawable,
-								 priv->gpu_bo, damage,
+								 bo, damage,
 								 gc, n, rect,
 								 &region.extents, flags & 2);
 			} else {
 				i = sna_poly_fill_rect_stippled_blt(drawable,
-								    priv->gpu_bo, damage,
+								    bo, damage,
 								    gc, n, rect,
 								    &region.extents, flags & 2);
 			}
@@ -4661,6 +4625,11 @@ out:
 	RegionUninit(&region);
 }
 
+struct sna_copy_plane {
+	struct sna_damage **damage;
+	struct kgem_bo *bo;
+};
+
 static void
 sna_copy_bitmap_blt(DrawablePtr _bitmap, DrawablePtr drawable, GCPtr gc,
 		    BoxPtr box, int n,
@@ -4670,7 +4639,7 @@ sna_copy_bitmap_blt(DrawablePtr _bitmap, DrawablePtr drawable, GCPtr gc,
 {
 	PixmapPtr pixmap = get_drawable_pixmap(drawable);
 	struct sna *sna = to_sna_from_pixmap(pixmap);
-	struct sna_pixmap *priv = sna_pixmap(pixmap);
+	struct sna_copy_plane *arg = closure;
 	PixmapPtr bitmap = (PixmapPtr)_bitmap;
 	uint32_t br00, br13;
 	int16_t dx, dy;
@@ -4681,12 +4650,12 @@ sna_copy_bitmap_blt(DrawablePtr _bitmap, DrawablePtr drawable, GCPtr gc,
 		return;
 
 	get_drawable_deltas(drawable, pixmap, &dx, &dy);
-	if (closure)
-		sna_damage_add_boxes(closure, box, n, dx, dy);
+	if (arg->damage)
+		sna_damage_add_boxes(arg->damage, box, n, dx, dy);
 
 	br00 = 3 << 20;
-	br13 = priv->gpu_bo->pitch;
-	if (sna->kgem.gen >= 40 && priv->gpu_bo->tiling) {
+	br13 = arg->bo->pitch;
+	if (sna->kgem.gen >= 40 && arg->bo->tiling) {
 		br00 |= BLT_DST_TILED;
 		br13 >>= 2;
 	}
@@ -4714,7 +4683,7 @@ sna_copy_bitmap_blt(DrawablePtr _bitmap, DrawablePtr drawable, GCPtr gc,
 		if (src_stride <= 128) {
 			src_stride = ALIGN(src_stride, 8) / 4;
 			if (!kgem_check_batch(&sna->kgem, 7+src_stride) ||
-			    !kgem_check_bo_fenced(&sna->kgem, priv->gpu_bo, NULL) ||
+			    !kgem_check_bo_fenced(&sna->kgem, arg->bo, NULL) ||
 			    !kgem_check_reloc(&sna->kgem, 1)) {
 				_kgem_submit(&sna->kgem);
 				_kgem_set_mode(&sna->kgem, KGEM_BLT);
@@ -4727,7 +4696,7 @@ sna_copy_bitmap_blt(DrawablePtr _bitmap, DrawablePtr drawable, GCPtr gc,
 			b[2] = (box->y1 + dy) << 16 | (box->x1 + dx);
 			b[3] = (box->y2 + dy) << 16 | (box->x2 + dx);
 			b[4] = kgem_add_reloc(&sna->kgem, sna->kgem.nbatch + 4,
-					      priv->gpu_bo,
+					      arg->bo,
 					      I915_GEM_DOMAIN_RENDER << 16 |
 					      I915_GEM_DOMAIN_RENDER |
 					      KGEM_RELOC_FENCED,
@@ -4756,7 +4725,7 @@ sna_copy_bitmap_blt(DrawablePtr _bitmap, DrawablePtr drawable, GCPtr gc,
 			void *ptr;
 
 			if (!kgem_check_batch(&sna->kgem, 8) ||
-			    !kgem_check_bo_fenced(&sna->kgem, priv->gpu_bo, NULL) ||
+			    !kgem_check_bo_fenced(&sna->kgem, arg->bo, NULL) ||
 			    !kgem_check_reloc(&sna->kgem, 2)) {
 				_kgem_submit(&sna->kgem);
 				_kgem_set_mode(&sna->kgem, KGEM_BLT);
@@ -4776,7 +4745,7 @@ sna_copy_bitmap_blt(DrawablePtr _bitmap, DrawablePtr drawable, GCPtr gc,
 			b[2] = (box->y1 + dy) << 16 | (box->x1 + dx);
 			b[3] = (box->y2 + dy) << 16 | (box->x2 + dx);
 			b[4] = kgem_add_reloc(&sna->kgem, sna->kgem.nbatch + 4,
-					      priv->gpu_bo,
+					      arg->bo,
 					      I915_GEM_DOMAIN_RENDER << 16 |
 					      I915_GEM_DOMAIN_RENDER |
 					      KGEM_RELOC_FENCED,
@@ -4824,8 +4793,8 @@ sna_copy_plane_blt(DrawablePtr source, DrawablePtr drawable, GCPtr gc,
 {
 	PixmapPtr dst_pixmap = get_drawable_pixmap(drawable);
 	PixmapPtr src_pixmap = get_drawable_pixmap(source);
-	struct sna_pixmap *priv = sna_pixmap(dst_pixmap);
 	struct sna *sna = to_sna_from_pixmap(dst_pixmap);
+	struct sna_copy_plane *arg = closure;
 	int16_t dx, dy;
 	int bit = ffs(bitplane) - 1;
 	uint32_t br00, br13;
@@ -4841,14 +4810,14 @@ sna_copy_plane_blt(DrawablePtr source, DrawablePtr drawable, GCPtr gc,
 	sy += dy;
 
 	get_drawable_deltas(drawable, dst_pixmap, &dx, &dy);
-	if (closure)
-		sna_damage_add_boxes(closure, box, n, dx, dy);
+	if (arg->damage)
+		sna_damage_add_boxes(arg->damage, box, n, dx, dy);
 
 	br00 = XY_MONO_SRC_COPY;
 	if (drawable->bitsPerPixel == 32)
 		br00 |= 3 << 20;
-	br13 = priv->gpu_bo->pitch;
-	if (sna->kgem.gen >= 40 && priv->gpu_bo->tiling) {
+	br13 = arg->bo->pitch;
+	if (sna->kgem.gen >= 40 && arg->bo->tiling) {
 		br00 |= BLT_DST_TILED;
 		br13 >>= 2;
 	}
@@ -4873,7 +4842,7 @@ sna_copy_plane_blt(DrawablePtr source, DrawablePtr drawable, GCPtr gc,
 		     sx, sy, bx1, bx2));
 
 		if (!kgem_check_batch(&sna->kgem, 8) ||
-		    !kgem_check_bo_fenced(&sna->kgem, priv->gpu_bo, NULL) ||
+		    !kgem_check_bo_fenced(&sna->kgem, arg->bo, NULL) ||
 		    !kgem_check_reloc(&sna->kgem, 2)) {
 			_kgem_submit(&sna->kgem);
 			_kgem_set_mode(&sna->kgem, KGEM_BLT);
@@ -4997,7 +4966,7 @@ sna_copy_plane_blt(DrawablePtr source, DrawablePtr drawable, GCPtr gc,
 		b[2] = (box->y1 + dy) << 16 | (box->x1 + dx);
 		b[3] = (box->y2 + dy) << 16 | (box->x2 + dx);
 		b[4] = kgem_add_reloc(&sna->kgem, sna->kgem.nbatch + 4,
-				      priv->gpu_bo,
+				      arg->bo,
 				      I915_GEM_DOMAIN_RENDER << 16 |
 				      I915_GEM_DOMAIN_RENDER |
 				      KGEM_RELOC_FENCED,
@@ -5029,7 +4998,7 @@ sna_copy_plane(DrawablePtr src, DrawablePtr dst, GCPtr gc,
 	PixmapPtr pixmap = get_drawable_pixmap(dst);
 	struct sna *sna = to_sna_from_pixmap(pixmap);
 	RegionRec region, *ret = NULL;
-	struct sna_damage **damage;
+	struct sna_copy_plane arg;
 
 	DBG(("%s: src=(%d, %d), dst=(%d, %d), size=%dx%d\n", __FUNCTION__,
 	     src_x, src_y, dst_x, dst_y, w, h));
@@ -5098,18 +5067,21 @@ sna_copy_plane(DrawablePtr src, DrawablePtr dst, GCPtr gc,
 	if (!PM_IS_SOLID(dst, gc->planemask))
 		goto fallback;
 
-	if (sna_drawable_use_gpu_bo(dst, &region.extents, &damage)) {
-		struct sna_pixmap *priv = sna_pixmap(pixmap);
-		if (priv->gpu_bo->tiling != I915_TILING_Y ||
-		    sna_pixmap_change_tiling(pixmap, I915_TILING_X)) {
-			RegionUninit(&region);
-			return miDoCopy(src, dst, gc,
-					src_x, src_y,
-					w, h,
-					dst_x, dst_y,
-					src->depth == 1 ? sna_copy_bitmap_blt :sna_copy_plane_blt,
-					(Pixel)bit, damage);
+	arg.bo = sna_drawable_use_bo(dst, &region.extents, &arg.damage);
+	if (arg.bo) {
+		if (arg.bo->tiling == I915_TILING_Y) {
+			assert(arg.bo == sna_pixmap_get_bo(pixmap));
+			arg.bo = sna_pixmap_change_tiling(pixmap, I915_TILING_X);
+			if (arg.bo == NULL)
+				goto fallback;
 		}
+		RegionUninit(&region);
+		return miDoCopy(src, dst, gc,
+				src_x, src_y,
+				w, h,
+				dst_x, dst_y,
+				src->depth == 1 ? sna_copy_bitmap_blt : sna_copy_plane_blt,
+				(Pixel)bit, &arg);
 	}
 
 fallback:
@@ -5294,21 +5266,14 @@ sna_poly_point(DrawablePtr drawable, GCPtr gc,
 	}
 
 	if (PM_IS_SOLID(drawable, gc->planemask) && gc_is_solid(gc, &color)) {
-		struct sna_pixmap *priv = sna_pixmap(pixmap);
 		struct sna_damage **damage;
+		struct kgem_bo *bo;
 
 		DBG(("%s: trying solid fill [%08lx] blt paths\n",
 		     __FUNCTION__, gc->fgPixel));
 
-		if (sna_drawable_use_gpu_bo(drawable, &region.extents, &damage) &&
-		    sna_poly_point_blt(drawable,
-				       priv->gpu_bo, damage,
-				       gc, color, mode, n, pt, flags & 2))
-			return;
-
-		if (sna_drawable_use_cpu_bo(drawable, &region.extents, &damage) &&
-		    sna_poly_point_blt(drawable,
-				       priv->cpu_bo, damage,
+		if ((bo = sna_drawable_use_bo(drawable, &region.extents, &damage)) &&
+		    sna_poly_point_blt(drawable, bo, damage,
 				       gc, color, mode, n, pt, flags & 2))
 			return;
 	}
@@ -6187,32 +6152,23 @@ sna_poly_line(DrawablePtr drawable, GCPtr gc,
 		     __FUNCTION__, (unsigned)color));
 
 		if (data.flags & 4) {
-			if (sna_drawable_use_gpu_bo(drawable,
-						    &data.region.extents,
-						    &data.damage) &&
-			    sna_poly_line_blt(drawable,
-					      priv->gpu_bo, data.damage,
-					      gc, color, mode, n, pt,
-					      &data.region.extents,
-					      data.flags & 2))
-				return;
-
-			if (sna_drawable_use_cpu_bo(drawable,
-						    &data.region.extents,
-						    &data.damage) &&
+			data.bo = sna_drawable_use_bo(drawable,
+						      &data.region.extents,
+						      &data.damage);
+			if (data.bo &&
 			    sna_poly_line_blt(drawable,
-					      priv->cpu_bo, data.damage,
+					      data.bo, data.damage,
 					      gc, color, mode, n, pt,
 					      &data.region.extents,
 					      data.flags & 2))
 				return;
 		} else { /* !rectilinear */
 			if (use_zero_spans(drawable, gc, &data.region.extents) &&
-			    sna_drawable_use_gpu_bo(drawable,
-						    &data.region.extents,
-						    &data.damage) &&
+			    (data.bo = sna_drawable_use_bo(drawable,
+							   &data.region.extents,
+							   &data.damage)) &&
 			    sna_poly_zero_line_blt(drawable,
-						   priv->gpu_bo, data.damage,
+						   data.bo, data.damage,
 						   gc, mode, n, pt,
 						   &data.region.extents,
 						   data.flags & 2))
@@ -6221,7 +6177,8 @@ sna_poly_line(DrawablePtr drawable, GCPtr gc,
 		}
 	} else if (data.flags & 4) {
 		/* Try converting these to a set of rectangles instead */
-		if (sna_drawable_use_gpu_bo(drawable, &data.region.extents, &data.damage)) {
+		data.bo = sna_drawable_use_bo(drawable, &data.region.extents, &data.damage);
+		if (data.bo) {
 			DDXPointRec p1, p2;
 			xRectangle *rect;
 			int i;
@@ -6272,13 +6229,13 @@ sna_poly_line(DrawablePtr drawable, GCPtr gc,
 
 			if (gc->fillStyle == FillTiled) {
 				i = sna_poly_fill_rect_tiled_blt(drawable,
-								 priv->gpu_bo, data.damage,
+								 data.bo, data.damage,
 								 gc, n - 1, rect + 1,
 								 &data.region.extents,
 								 data.flags & 2);
 			} else {
 				i = sna_poly_fill_rect_stippled_blt(drawable,
-								    priv->gpu_bo, data.damage,
+								    data.bo, data.damage,
 								    gc, n - 1, rect + 1,
 								    &data.region.extents,
 								    data.flags & 2);
@@ -6292,7 +6249,7 @@ sna_poly_line(DrawablePtr drawable, GCPtr gc,
 
 spans_fallback:
 	if (use_wide_spans(drawable, gc, &data.region.extents) &&
-	    sna_drawable_use_gpu_bo(drawable, &data.region.extents, &data.damage)) {
+	    (data.bo = sna_drawable_use_bo(drawable, &data.region.extents, &data.damage))) {
 		DBG(("%s: converting line into spans\n", __FUNCTION__));
 		get_drawable_deltas(drawable, data.pixmap, &data.dx, &data.dy);
 		sna_gc(gc)->priv = &data;
@@ -6304,7 +6261,7 @@ spans_fallback:
 
 			if (!sna_fill_init_blt(&fill,
 					       data.sna, data.pixmap,
-					       priv->gpu_bo, gc->alu, color))
+					       data.bo, gc->alu, color))
 				goto fallback;
 
 			data.op = &fill;
@@ -6335,7 +6292,6 @@ spans_fallback:
 			 * using fgPixel and bgPixel so we need to reset state between
 			 * FillSpans.
 			 */
-			data.bo = priv->gpu_bo;
 			sna_gc_ops__tmp.FillSpans = sna_fill_spans__gpu;
 			gc->ops = &sna_gc_ops__tmp;
 
@@ -7120,32 +7076,22 @@ sna_poly_segment(DrawablePtr drawable, GCPtr gc, int n, xSegment *seg)
 		     __FUNCTION__, (unsigned)color, data.flags));
 
 		if (data.flags & 4) {
-			if (sna_drawable_use_gpu_bo(drawable,
-						    &data.region.extents,
-						    &data.damage) &&
-			    sna_poly_segment_blt(drawable,
-						 priv->gpu_bo, data.damage,
-						 gc, color, n, seg,
-						 &data.region.extents,
-						 data.flags & 2))
-				return;
-
-			if (sna_drawable_use_cpu_bo(drawable,
-						    &data.region.extents,
-						    &data.damage) &&
+			if ((data.bo = sna_drawable_use_bo(drawable,
+							   &data.region.extents,
+							   &data.damage)) &&
 			    sna_poly_segment_blt(drawable,
-						 priv->cpu_bo, data.damage,
+						 data.bo, data.damage,
 						 gc, color, n, seg,
 						 &data.region.extents,
 						 data.flags & 2))
 				return;
 		} else {
 			if (use_zero_spans(drawable, gc, &data.region.extents) &&
-			    sna_drawable_use_gpu_bo(drawable,
-						    &data.region.extents,
-						    &data.damage) &&
+			    (data.bo = sna_drawable_use_bo(drawable,
+							   &data.region.extents,
+							   &data.damage)) &&
 			    sna_poly_zero_segment_blt(drawable,
-						      priv->gpu_bo, data.damage,
+						      data.bo, data.damage,
 						      gc, n, seg,
 						      &data.region.extents,
 						      data.flags & 2))
@@ -7153,7 +7099,7 @@ sna_poly_segment(DrawablePtr drawable, GCPtr gc, int n, xSegment *seg)
 		}
 	} else if (data.flags & 4) {
 		/* Try converting these to a set of rectangles instead */
-		if (sna_drawable_use_gpu_bo(drawable, &data.region.extents, &data.damage)) {
+		if ((data.bo = sna_drawable_use_bo(drawable, &data.region.extents, &data.damage))) {
 			xRectangle *rect;
 			int i;
 
@@ -7196,13 +7142,13 @@ sna_poly_segment(DrawablePtr drawable, GCPtr gc, int n, xSegment *seg)
 
 			if (gc->fillStyle == FillTiled) {
 				i = sna_poly_fill_rect_tiled_blt(drawable,
-								 priv->gpu_bo, data.damage,
+								 data.bo, data.damage,
 								 gc, n, rect,
 								 &data.region.extents,
 								 data.flags & 2);
 			} else {
 				i = sna_poly_fill_rect_stippled_blt(drawable,
-								    priv->gpu_bo, data.damage,
+								    data.bo, data.damage,
 								    gc, n, rect,
 								    &data.region.extents,
 								    data.flags & 2);
@@ -7216,7 +7162,7 @@ sna_poly_segment(DrawablePtr drawable, GCPtr gc, int n, xSegment *seg)
 
 spans_fallback:
 	if (use_wide_spans(drawable, gc, &data.region.extents) &&
-	    sna_drawable_use_gpu_bo(drawable, &data.region.extents, &data.damage)) {
+	    (data.bo = sna_drawable_use_bo(drawable, &data.region.extents, &data.damage))) {
 		void (*line)(DrawablePtr, GCPtr, int, int, DDXPointPtr);
 		int i;
 
@@ -7249,7 +7195,7 @@ spans_fallback:
 
 			if (!sna_fill_init_blt(&fill,
 					       data.sna, data.pixmap,
-					       priv->gpu_bo, gc->alu, color))
+					       data.bo, gc->alu, color))
 				goto fallback;
 
 			data.op = &fill;
@@ -7278,7 +7224,6 @@ spans_fallback:
 
 			fill.done(data.sna, &fill);
 		} else {
-			data.bo = priv->gpu_bo;
 			sna_gc_ops__tmp.FillSpans = sna_fill_spans__gpu;
 			gc->ops = &sna_gc_ops__tmp;
 
@@ -7793,6 +7738,7 @@ sna_poly_rectangle(DrawablePtr drawable, GCPtr gc, int n, xRectangle *r)
 	PixmapPtr pixmap = get_drawable_pixmap(drawable);
 	struct sna *sna = to_sna_from_pixmap(pixmap);
 	struct sna_damage **damage;
+	struct kgem_bo *bo;
 	RegionRec region;
 	unsigned flags;
 
@@ -7829,25 +7775,17 @@ sna_poly_rectangle(DrawablePtr drawable, GCPtr gc, int n, xRectangle *r)
 	if (gc->lineStyle == LineSolid &&
 	    gc->joinStyle == JoinMiter &&
 	    PM_IS_SOLID(drawable, gc->planemask)) {
-		struct sna_pixmap *priv = sna_pixmap(pixmap);
-
 		DBG(("%s: trying blt solid fill [%08lx] paths\n",
 		     __FUNCTION__, gc->fgPixel));
-
-		if (sna_drawable_use_gpu_bo(drawable, &region.extents, &damage) &&
-		    sna_poly_rectangle_blt(drawable, priv->gpu_bo, damage,
-					   gc, n, r, &region.extents, flags&2))
-			return;
-
-		if (sna_drawable_use_cpu_bo(drawable, &region.extents, &damage) &&
-		    sna_poly_rectangle_blt(drawable, priv->cpu_bo, damage,
+		if ((bo = sna_drawable_use_bo(drawable, &region.extents, &damage)) &&
+		    sna_poly_rectangle_blt(drawable, bo, damage,
 					   gc, n, r, &region.extents, flags&2))
 			return;
 	} else {
 		/* Not a trivial outline, but we still maybe able to break it
 		 * down into simpler operations that we can accelerate.
 		 */
-		if (sna_drawable_use_gpu_bo(drawable, &region.extents, &damage)) {
+		if (sna_drawable_use_bo(drawable, &region.extents, &damage)) {
 			miPolyRectangle(drawable, gc, n, r);
 			return;
 		}
@@ -7966,8 +7904,8 @@ sna_poly_arc(DrawablePtr drawable, GCPtr gc, int n, xArc *arc)
 		goto fallback;
 
 	if (use_wide_spans(drawable, gc, &data.region.extents) &&
-	    sna_drawable_use_gpu_bo(drawable,
-				    &data.region.extents, &data.damage)) {
+	    (data.bo = sna_drawable_use_bo(drawable,
+					   &data.region.extents, &data.damage))) {
 		uint32_t color;
 
 		DBG(("%s: converting arcs into spans\n", __FUNCTION__));
@@ -7978,7 +7916,7 @@ sna_poly_arc(DrawablePtr drawable, GCPtr gc, int n, xArc *arc)
 
 			if (!sna_fill_init_blt(&fill,
 					       data.sna, data.pixmap,
-					       priv->gpu_bo, gc->alu, color))
+					       data.bo, gc->alu, color))
 				goto fallback;
 
 			data.op = &fill;
@@ -8311,9 +8249,9 @@ sna_poly_fill_polygon(DrawablePtr draw, GCPtr gc,
 		goto fallback;
 
 	if (use_wide_spans(draw, gc, &data.region.extents) &&
-	    sna_drawable_use_gpu_bo(draw,
-				    &data.region.extents,
-				    &data.damage)) {
+	    (data.bo = sna_drawable_use_bo(draw,
+					   &data.region.extents,
+					   &data.damage))) {
 		uint32_t color;
 
 		sna_gc(gc)->priv = &data;
@@ -8324,7 +8262,7 @@ sna_poly_fill_polygon(DrawablePtr draw, GCPtr gc,
 
 			if (!sna_fill_init_blt(&fill,
 					       data.sna, data.pixmap,
-					       priv->gpu_bo, gc->alu, color))
+					       data.bo, gc->alu, color))
 				goto fallback;
 
 			data.op = &fill;
@@ -8351,7 +8289,6 @@ sna_poly_fill_polygon(DrawablePtr draw, GCPtr gc,
 			miFillPolygon(draw, gc, shape, mode, n, pt);
 			fill.done(data.sna, &fill);
 		} else {
-			data.bo = priv->gpu_bo;
 			sna_gc_ops__tmp.FillSpans = sna_fill_spans__gpu;
 			gc->ops = &sna_gc_ops__tmp;
 
@@ -9654,6 +9591,7 @@ sna_poly_fill_rect(DrawablePtr draw, GCPtr gc, int n, xRectangle *rect)
 	struct sna *sna = to_sna_from_pixmap(pixmap);
 	struct sna_pixmap *priv = sna_pixmap(pixmap);
 	struct sna_damage **damage;
+	struct kgem_bo *bo;
 	RegionRec region;
 	unsigned flags;
 	uint32_t color;
@@ -9712,48 +9650,25 @@ sna_poly_fill_rect(DrawablePtr draw, GCPtr gc, int n, xRectangle *rect)
 		DBG(("%s: solid fill [%08x], testing for blt\n",
 		     __FUNCTION__, color));
 
-		if (sna_drawable_use_gpu_bo(draw, &region.extents, &damage) &&
+		if ((bo = sna_drawable_use_bo(draw, &region.extents, &damage)) &&
 		    sna_poly_fill_rect_blt(draw,
-					   priv->gpu_bo, damage,
-					   gc, color, n, rect,
-					   &region.extents, flags & 2))
-			return;
-
-		if (sna_drawable_use_cpu_bo(draw, &region.extents, &damage) &&
-		    sna_poly_fill_rect_blt(draw,
-					   priv->cpu_bo, damage,
+					   bo, damage,
 					   gc, color, n, rect,
 					   &region.extents, flags & 2))
 			return;
 	} else if (gc->fillStyle == FillTiled) {
 		DBG(("%s: tiled fill, testing for blt\n", __FUNCTION__));
 
-		if (sna_drawable_use_gpu_bo(draw, &region.extents, &damage) &&
-		    sna_poly_fill_rect_tiled_blt(draw,
-						 priv->gpu_bo, damage,
-						 gc, n, rect,
-						 &region.extents, flags & 2))
-			return;
-
-		if (sna_drawable_use_cpu_bo(draw, &region.extents, &damage) &&
-		    sna_poly_fill_rect_tiled_blt(draw,
-						 priv->cpu_bo, damage,
+		if ((bo = sna_drawable_use_bo(draw, &region.extents, &damage)) &&
+		    sna_poly_fill_rect_tiled_blt(draw, bo, damage,
 						 gc, n, rect,
 						 &region.extents, flags & 2))
 			return;
 	} else {
 		DBG(("%s: stippled fill, testing for blt\n", __FUNCTION__));
 
-		if (sna_drawable_use_gpu_bo(draw, &region.extents, &damage) &&
-		    sna_poly_fill_rect_stippled_blt(draw,
-						    priv->gpu_bo, damage,
-						    gc, n, rect,
-						    &region.extents, flags & 2))
-			return;
-
-		if (sna_drawable_use_cpu_bo(draw, &region.extents, &damage) &&
-		    sna_poly_fill_rect_stippled_blt(draw,
-						    priv->cpu_bo, damage,
+		if ((bo = sna_drawable_use_bo(draw, &region.extents, &damage)) &&
+		    sna_poly_fill_rect_stippled_blt(draw, bo, damage,
 						    gc, n, rect,
 						    &region.extents, flags & 2))
 			return;
@@ -9887,9 +9802,9 @@ sna_poly_fill_arc(DrawablePtr draw, GCPtr gc, int n, xArc *arc)
 		goto fallback;
 
 	if (use_wide_spans(draw, gc, &data.region.extents) &&
-	    sna_drawable_use_gpu_bo(draw,
-				    &data.region.extents,
-				    &data.damage)) {
+	    (data.bo = sna_drawable_use_bo(draw,
+					   &data.region.extents,
+					   &data.damage))) {
 		uint32_t color;
 
 		get_drawable_deltas(draw, data.pixmap, &data.dx, &data.dy);
@@ -9900,7 +9815,7 @@ sna_poly_fill_arc(DrawablePtr draw, GCPtr gc, int n, xArc *arc)
 
 			if (!sna_fill_init_blt(&fill,
 					       data.sna, data.pixmap,
-					       priv->gpu_bo, gc->alu, color))
+					       data.bo, gc->alu, color))
 				goto fallback;
 
 			data.op = &fill;
@@ -9927,7 +9842,6 @@ sna_poly_fill_arc(DrawablePtr draw, GCPtr gc, int n, xArc *arc)
 			miPolyFillArc(draw, gc, n, arc);
 			fill.done(data.sna, &fill);
 		} else {
-			data.bo = priv->gpu_bo;
 			sna_gc_ops__tmp.FillSpans = sna_fill_spans__gpu;
 			gc->ops = &sna_gc_ops__tmp;
 
@@ -10016,7 +9930,6 @@ sna_glyph_blt(DrawablePtr drawable, GCPtr gc,
 {
 	PixmapPtr pixmap = get_drawable_pixmap(drawable);
 	struct sna *sna = to_sna_from_pixmap(pixmap);
-	struct sna_pixmap *priv = sna_pixmap(pixmap);
 	struct kgem_bo *bo;
 	struct sna_damage **damage;
 	const BoxRec *extents, *last_extents;
@@ -10034,12 +9947,13 @@ sna_glyph_blt(DrawablePtr drawable, GCPtr gc,
 		return false;
 	}
 
-	if (!sna_drawable_use_gpu_bo(drawable, &clip->extents, &damage))
+	bo = sna_drawable_use_bo(drawable, &clip->extents, &damage);
+	if (bo == NULL)
 		return false;
 
-	bo = priv->gpu_bo;
 	if (bo->tiling == I915_TILING_Y) {
 		DBG(("%s: converting bo from Y-tiling\n", __FUNCTION__));
+		assert(bo == sna_pixmap_get_bo(pixmap));
 		bo = sna_pixmap_change_tiling(pixmap, I915_TILING_X);
 		if (bo == NULL) {
 			DBG(("%s -- fallback, dst uses Y-tiling\n", __FUNCTION__));
@@ -10621,19 +10535,19 @@ static bool
 sna_reversed_glyph_blt(DrawablePtr drawable, GCPtr gc,
 		       int _x, int _y, unsigned int _n,
 		       CharInfoPtr *_info, pointer _base,
+		       struct kgem_bo *bo,
 		       struct sna_damage **damage,
 		       RegionPtr clip,
 		       bool transparent)
 {
 	PixmapPtr pixmap = get_drawable_pixmap(drawable);
 	struct sna *sna = to_sna_from_pixmap(pixmap);
-	struct sna_pixmap *priv = sna_pixmap(pixmap);
 	const BoxRec *extents, *last_extents;
 	uint32_t *b;
 	int16_t dx, dy;
 	uint8_t rop = transparent ? copy_ROP[gc->alu] : ROP_S;
 
-	if (priv->gpu_bo->tiling == I915_TILING_Y) {
+	if (bo->tiling == I915_TILING_Y) {
 		DBG(("%s: converting bo from Y-tiling\n", __FUNCTION__));
 		if (!sna_pixmap_change_tiling(pixmap, I915_TILING_X)) {
 			DBG(("%s -- fallback, dst uses Y-tiling\n", __FUNCTION__));
@@ -10651,23 +10565,22 @@ sna_reversed_glyph_blt(DrawablePtr drawable, GCPtr gc,
 
 	kgem_set_mode(&sna->kgem, KGEM_BLT);
 	if (!kgem_check_batch(&sna->kgem, 16) ||
-	    !kgem_check_bo_fenced(&sna->kgem, priv->gpu_bo, NULL) ||
+	    !kgem_check_bo_fenced(&sna->kgem, bo, NULL) ||
 	    !kgem_check_reloc(&sna->kgem, 1)) {
 		_kgem_submit(&sna->kgem);
 		_kgem_set_mode(&sna->kgem, KGEM_BLT);
 	}
 	b = sna->kgem.batch + sna->kgem.nbatch;
 	b[0] = XY_SETUP_BLT | 1 << 20;
-	b[1] = priv->gpu_bo->pitch;
-	if (sna->kgem.gen >= 40 && priv->gpu_bo->tiling) {
+	b[1] = bo->pitch;
+	if (sna->kgem.gen >= 40 && bo->tiling) {
 		b[0] |= BLT_DST_TILED;
 		b[1] >>= 2;
 	}
 	b[1] |= 1 << 30 | transparent << 29 | blt_depth(drawable->depth) << 24 | rop << 16;
 	b[2] = extents->y1 << 16 | extents->x1;
 	b[3] = extents->y2 << 16 | extents->x2;
-	b[4] = kgem_add_reloc(&sna->kgem, sna->kgem.nbatch + 4,
-			      priv->gpu_bo,
+	b[4] = kgem_add_reloc(&sna->kgem, sna->kgem.nbatch + 4, bo,
 			      I915_GEM_DOMAIN_RENDER << 16 |
 			      I915_GEM_DOMAIN_RENDER |
 			      KGEM_RELOC_FENCED,
@@ -10712,8 +10625,8 @@ sna_reversed_glyph_blt(DrawablePtr drawable, GCPtr gc,
 
 				b = sna->kgem.batch + sna->kgem.nbatch;
 				b[0] = XY_SETUP_BLT | 1 << 20;
-				b[1] = priv->gpu_bo->pitch;
-				if (sna->kgem.gen >= 40 && priv->gpu_bo->tiling) {
+				b[1] = bo->pitch;
+				if (sna->kgem.gen >= 40 && bo->tiling) {
 					b[0] |= BLT_DST_TILED;
 					b[1] >>= 2;
 				}
@@ -10721,7 +10634,7 @@ sna_reversed_glyph_blt(DrawablePtr drawable, GCPtr gc,
 				b[2] = extents->y1 << 16 | extents->x1;
 				b[3] = extents->y2 << 16 | extents->x2;
 				b[4] = kgem_add_reloc(&sna->kgem, sna->kgem.nbatch + 4,
-						      priv->gpu_bo,
+						      bo,
 						      I915_GEM_DOMAIN_RENDER << 16 |
 						      I915_GEM_DOMAIN_RENDER |
 						      KGEM_RELOC_FENCED,
@@ -10736,7 +10649,7 @@ sna_reversed_glyph_blt(DrawablePtr drawable, GCPtr gc,
 			sna->kgem.nbatch += 3 + len;
 
 			b[0] = XY_TEXT_IMMEDIATE_BLT | (1 + len);
-			if (priv->gpu_bo->tiling && sna->kgem.gen >= 40)
+			if (bo->tiling && sna->kgem.gen >= 40)
 				b[0] |= BLT_DST_TILED;
 			b[1] = (uint16_t)y1 << 16 | (uint16_t)x1;
 			b[2] = (uint16_t)(y1+h) << 16 | (uint16_t)(x1+w);
@@ -10795,6 +10708,7 @@ sna_image_glyph(DrawablePtr drawable, GCPtr gc,
 	ExtentInfoRec extents;
 	RegionRec region;
 	struct sna_damage **damage;
+	struct kgem_bo *bo;
 
 	if (n == 0)
 		return;
@@ -10830,9 +10744,9 @@ sna_image_glyph(DrawablePtr drawable, GCPtr gc,
 		goto fallback;
 	}
 
-	if (sna_drawable_use_gpu_bo(drawable, &region.extents, &damage) &&
+	if ((bo = sna_drawable_use_bo(drawable, &region.extents, &damage)) &&
 	    sna_reversed_glyph_blt(drawable, gc, x, y, n, info, base,
-				   damage, &region, false))
+				   bo, damage, &region, false))
 		goto out;
 
 fallback:
@@ -10861,6 +10775,7 @@ sna_poly_glyph(DrawablePtr drawable, GCPtr gc,
 	ExtentInfoRec extents;
 	RegionRec region;
 	struct sna_damage **damage;
+	struct kgem_bo *bo;
 
 	if (n == 0)
 		return;
@@ -10896,9 +10811,9 @@ sna_poly_glyph(DrawablePtr drawable, GCPtr gc,
 		goto fallback;
 	}
 
-	if (sna_drawable_use_gpu_bo(drawable, &region.extents, &damage) &&
+	if ((bo = sna_drawable_use_bo(drawable, &region.extents, &damage)) &&
 	    sna_reversed_glyph_blt(drawable, gc, x, y, n, info, base,
-				   damage, &region, true))
+				   bo, damage, &region, true))
 		goto out;
 
 fallback:
@@ -10925,19 +10840,22 @@ sna_push_pixels_solid_blt(GCPtr gc,
 {
 	PixmapPtr pixmap = get_drawable_pixmap(drawable);
 	struct sna *sna = to_sna_from_pixmap(pixmap);
-	struct sna_pixmap *priv = sna_pixmap(pixmap);
 	struct sna_damage **damage;
+	struct kgem_bo *bo;
 	BoxRec *box;
 	int16_t dx, dy;
 	int n;
 	uint8_t rop = copy_ROP[gc->alu];
 
-	if (!sna_drawable_use_gpu_bo(drawable, &region->extents, &damage))
+	bo = sna_drawable_use_bo(drawable, &region->extents, &damage);
+	if (bo == NULL)
 		return false;
 
-	if (priv->gpu_bo->tiling == I915_TILING_Y) {
+	if (bo->tiling == I915_TILING_Y) {
 		DBG(("%s: converting bo from Y-tiling\n", __FUNCTION__));
-		if (!sna_pixmap_change_tiling(pixmap, I915_TILING_X))
+		assert(bo == sna_pixmap_get_bo(pixmap));
+		bo = sna_pixmap_change_tiling(pixmap, I915_TILING_X);
+		if (bo == NULL)
 			return false;
 	}
 
@@ -10970,7 +10888,7 @@ sna_push_pixels_solid_blt(GCPtr gc,
 		void *ptr;
 
 		if (!kgem_check_batch(&sna->kgem, 8) ||
-		    !kgem_check_bo_fenced(&sna->kgem, priv->gpu_bo, NULL) ||
+		    !kgem_check_bo_fenced(&sna->kgem, bo, NULL) ||
 		    !kgem_check_reloc(&sna->kgem, 2)) {
 			_kgem_submit(&sna->kgem);
 			_kgem_set_mode(&sna->kgem, KGEM_BLT);
@@ -11004,8 +10922,8 @@ sna_push_pixels_solid_blt(GCPtr gc,
 		if (drawable->bitsPerPixel == 32)
 			b[0] |= 3 << 20;
 		b[0] |= ((box->x1 - region->extents.x1) & 7) << 17;
-		b[1] = priv->gpu_bo->pitch;
-		if (sna->kgem.gen >= 40 && priv->gpu_bo->tiling) {
+		b[1] = bo->pitch;
+		if (sna->kgem.gen >= 40 && bo->tiling) {
 			b[0] |= BLT_DST_TILED;
 			b[1] >>= 2;
 		}
@@ -11014,8 +10932,7 @@ sna_push_pixels_solid_blt(GCPtr gc,
 		b[1] |= rop << 16;
 		b[2] = box->y1 << 16 | box->x1;
 		b[3] = box->y2 << 16 | box->x2;
-		b[4] = kgem_add_reloc(&sna->kgem, sna->kgem.nbatch + 4,
-				      priv->gpu_bo,
+		b[4] = kgem_add_reloc(&sna->kgem, sna->kgem.nbatch + 4, bo,
 				      I915_GEM_DOMAIN_RENDER << 16 |
 				      I915_GEM_DOMAIN_RENDER |
 				      KGEM_RELOC_FENCED,
@@ -11221,7 +11138,7 @@ sna_get_image(DrawablePtr drawable,
 		PixmapPtr pixmap = get_drawable_pixmap(drawable);
 		int16_t dx, dy;
 
-		DBG(("%s: copy box (%d, %d), (%d, %d), origin (%d, %d)\n",
+		DBG(("%s: copy box (%d, %d), (%d, %d)\n",
 		     __FUNCTION__,
 		     region.extents.x1, region.extents.y1,
 		     region.extents.x2, region.extents.y2));
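
Taken together, the hunks above replace the paired
sna_drawable_use_gpu_bo()/sna_drawable_use_cpu_bo() probes with a single
sna_drawable_use_bo() lookup. A condensed sketch of the new calling
idiom, with names as in the diff and error handling elided:

	struct sna_damage **damage;
	struct kgem_bo *bo;

	/* One probe returns whichever bo (GPU or CPU) the operation
	 * should target, plus the matching damage tracker, or NULL to
	 * request the software fallback. */
	bo = sna_drawable_use_bo(drawable, &region.extents, &damage);
	if (bo == NULL)
		goto fallback;

	/* The BLT paths cannot address Y-tiled targets; note that
	 * sna_pixmap_change_tiling() now returns the replacement bo
	 * (or NULL) rather than a Bool. */
	if (bo->tiling == I915_TILING_Y) {
		bo = sna_pixmap_change_tiling(pixmap, I915_TILING_X);
		if (bo == NULL)
			goto fallback;
	}
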
commit 6ac597ccc0cde61ec6142d939d70e0292165ebab
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Jan 27 18:05:51 2012 +0000

    sna: Ensure that we have a source bo for tiled fills
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
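
The sna_pixmap_get_source_bo() helper added below always hands back a
referenced bo (either a freshly created upload buffer or a
kgem_bo_reference() of the pixmap's own bo), so the caller owns a
reference on every path. A minimal usage sketch under that assumption:

	struct kgem_bo *tile_bo = sna_pixmap_get_source_bo(tile);
	if (tile_bo == NULL)
		return FALSE;	/* no usable source, take the fallback */

	if (!sna_copy_init_blt(&copy, sna, tile, tile_bo, pixmap, bo, alu)) {
		kgem_bo_destroy(&sna->kgem, tile_bo);	/* balance the ref */
		return FALSE;
	}
	/* ... emit the tiled copies ... */
	copy.done(sna, &copy);
	kgem_bo_destroy(&sna->kgem, tile_bo);		/* balance the ref */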

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 3c8f2be..840012f 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -2016,8 +2016,10 @@ sna_pixmap_move_to_gpu(PixmapPtr pixmap, unsigned flags)
 	     __FUNCTION__, pixmap->drawable.serialNumber, pixmap->usage_hint));
 
 	priv = sna_pixmap(pixmap);
-	if (priv == NULL)
+	if (priv == NULL) {
+		DBG(("%s: not attached\n", __FUNCTION__));
 		return NULL;
+	}
 
 	if (DAMAGE_IS_ALL(priv->gpu_damage)) {
 		DBG(("%s: already all-damaged\n", __FUNCTION__));
@@ -2039,6 +2041,7 @@ sna_pixmap_move_to_gpu(PixmapPtr pixmap, unsigned flags)
 					       sna_pixmap_choose_tiling(pixmap),
 					       priv->cpu_damage ? CREATE_GTT_MAP | CREATE_INACTIVE : 0);
 		if (priv->gpu_bo == NULL) {
+			DBG(("%s: not creating GPU bo\n", __FUNCTION__));
 			assert(list_is_empty(&priv->list));
 			return NULL;
 		}
@@ -8391,6 +8394,45 @@ out:
 	RegionUninit(&data.region);
 }
 
+static struct kgem_bo *
+sna_pixmap_get_source_bo(PixmapPtr pixmap)
+{
+	struct sna_pixmap *priv = sna_pixmap(pixmap);
+
+	if (priv == NULL) {
+		struct kgem_bo *upload;
+		struct sna *sna = to_sna_from_pixmap(pixmap);
+		void *ptr;
+
+		upload = kgem_create_buffer_2d(&sna->kgem,
+					       pixmap->drawable.width,
+					       pixmap->drawable.height,
+					       pixmap->drawable.bitsPerPixel,
+					       KGEM_BUFFER_WRITE_INPLACE,
+					       &ptr);
+		memcpy_blt(pixmap->devPrivate.ptr, ptr,
+			   pixmap->drawable.bitsPerPixel,
+			   pixmap->devKind, upload->pitch,
+			   0, 0,
+			   0, 0,
+			   pixmap->drawable.width,
+			   pixmap->drawable.height);
+
+		return upload;
+	}
+
+	if (priv->gpu_damage && !sna_pixmap_move_to_gpu(pixmap, MOVE_READ))
+		return NULL;
+
+	if (priv->cpu_damage && priv->cpu_bo)
+		return kgem_bo_reference(priv->cpu_bo);
+
+	if (!sna_pixmap_force_to_gpu(pixmap, MOVE_READ))
+		return NULL;
+
+	return kgem_bo_reference(priv->gpu_bo);
+}
+
 static Bool
 sna_poly_fill_rect_tiled_blt(DrawablePtr drawable,
 			     struct kgem_bo *bo,
@@ -8401,6 +8443,7 @@ sna_poly_fill_rect_tiled_blt(DrawablePtr drawable,
 	PixmapPtr pixmap = get_drawable_pixmap(drawable);
 	struct sna *sna = to_sna_from_pixmap(pixmap);
 	PixmapPtr tile = gc->tile.pixmap;
+	struct kgem_bo *tile_bo;
 	const DDXPointRec * const origin = &gc->patOrg;
 	struct sna_copy_op copy;
 	CARD32 alu = gc->alu;
@@ -8412,25 +8455,30 @@ sna_poly_fill_rect_tiled_blt(DrawablePtr drawable,
 
 	tile_width = tile->drawable.width;
 	tile_height = tile->drawable.height;
-	if ((tile_width | tile_height) == 1)
+	if ((tile_width | tile_height) == 1) {
+		DBG(("%s: single pixel tile pixmap, converting to solid fill\n",
+		     __FUNCTION__));
 		return sna_poly_fill_rect_blt(drawable, bo, damage,
 					      gc, get_pixel(tile),
 					      n, rect,
 					      extents, clipped);
+	}
 
 	/* XXX [248]x[238] tiling can be reduced to a pattern fill.
 	 * Also we can do the lg2 reduction for BLT and use repeat modes for
 	 * RENDER.
 	 */
 
-	if (!sna_pixmap_move_to_gpu(tile, MOVE_READ))
+	tile_bo = sna_pixmap_get_source_bo(tile);
+	if (tile_bo == NULL) {
+		DBG(("%s: unable to move tile to GPU, fallback\n",
+		     __FUNCTION__));
 		return FALSE;
+	}
 
-	if (!sna_copy_init_blt(&copy, sna,
-			       tile, sna_pixmap_get_bo(tile),
-			       pixmap, bo,
-			       alu)) {
+	if (!sna_copy_init_blt(&copy, sna, tile, tile_bo, pixmap, bo, alu)) {
 		DBG(("%s: unsupported blt\n", __FUNCTION__));
+		kgem_bo_destroy(&sna->kgem, tile_bo);
 		return FALSE;
 	}
 
@@ -8620,6 +8668,7 @@ sna_poly_fill_rect_tiled_blt(DrawablePtr drawable,
 	}
 done:
 	copy.done(sna, &copy);
+	kgem_bo_destroy(&sna->kgem, tile_bo);
 	return TRUE;
 }
 
commit e38755cf7303f51cd114b615a5a56d40e608b521
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Jan 27 14:08:57 2012 +0000

    sna/glyphs: Check that we attached to the cache pixmaps upon creation
    
    If the hw is wedged, then the pixmap creation routines will return an
    ordinary unattached pixmap. The code presumed that it would only return
    a pixmap with an attached bo, and so would segfault as it chased the
    invalid pointer when the server was restarted after a GPU hang.
    Considering that we already checked that the GPU wasn't wedged before we
    started, this is just mild paranoia, but it is on a run-once piece of
    code.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_glyphs.c b/src/sna/sna_glyphs.c
index ee40707..bef1774 100644
--- a/src/sna/sna_glyphs.c
+++ b/src/sna/sna_glyphs.c
@@ -157,12 +157,15 @@ static Bool realize_glyph_caches(struct sna *sna)
 
 	for (i = 0; i < ARRAY_SIZE(formats); i++) {
 		struct sna_glyph_cache *cache = &sna->render.glyph[i];
+		struct sna_pixmap *priv;
 		PixmapPtr pixmap;
-		PicturePtr picture;
+		PicturePtr picture = NULL;
+		PictFormatPtr pPictFormat;
 		CARD32 component_alpha;
 		int depth = PIXMAN_FORMAT_DEPTH(formats[i]);
 		int error;
-		PictFormatPtr pPictFormat = PictureMatchFormat(screen, depth, formats[i]);
+
+		pPictFormat = PictureMatchFormat(screen, depth, formats[i]);
 		if (!pPictFormat)
 			goto bail;
 
@@ -175,16 +178,18 @@ static Bool realize_glyph_caches(struct sna *sna)
 		if (!pixmap)
 			goto bail;
 
-		/* Prevent the cache from ever being paged out */
-		sna_pixmap(pixmap)->pinned = true;
+		priv = sna_pixmap(pixmap);
+		if (priv != NULL) {
+			/* Prevent the cache from ever being paged out */
+			priv->pinned = true;
 
-		component_alpha = NeedsComponent(pPictFormat->format);
-		picture = CreatePicture(0, &pixmap->drawable, pPictFormat,
-					CPComponentAlpha, &component_alpha,
-					serverClient, &error);
+			component_alpha = NeedsComponent(pPictFormat->format);
+			picture = CreatePicture(0, &pixmap->drawable, pPictFormat,
+						CPComponentAlpha, &component_alpha,
+						serverClient, &error);
+		}
 
 		screen->DestroyPixmap(pixmap);
-
 		if (!picture)
 			goto bail;
 
commit ce20ecd44d0e6a34b96650c186c1c18dd6944813
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Jan 27 10:26:11 2012 +0000

    sna/video: Add some more DBG breadcrumbs to the textured PutImage
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_video_textured.c b/src/sna/sna_video_textured.c
index fcef820..09f1551 100644
--- a/src/sna/sna_video_textured.c
+++ b/src/sna/sna_video_textured.c
@@ -229,6 +229,12 @@ sna_video_textured_put_image(ScrnInfoPtr scrn,
 	Bool flush = false;
 	Bool ret;
 
+	DBG(("%s: src=(%d, %d),(%d, %d), dst=(%d, %d),(%d, %d), id=%d, size=%dx%d, sync?=%d\n",
+	     __FUNCTION__,
+	     src_x, src_y, src_w, src_h,
+	     drw_x, drw_y, drw_w, drw_h,
+	     id, width, height, sync));
+
 	if (buf == 0)
 		return BadAlloc;
 
@@ -245,6 +251,9 @@ sna_video_textured_put_image(ScrnInfoPtr scrn,
 		return Success;
 
 	if (xvmc_passthrough(id)) {
+		DBG(("%s: using passthrough, name=%d\n",
+		     __FUNCTION__, *(uint32_t *)buf));
+
 		if (sna->kgem.gen < 31) {
 			/* XXX: i915 is not support and needs some
 			 * serious care.  grep for KMS in i915_hwmc.c */
@@ -254,6 +263,8 @@ sna_video_textured_put_image(ScrnInfoPtr scrn,
 		frame.bo = kgem_create_for_name(&sna->kgem, *(uint32_t*)buf);
 		if (frame.bo == NULL)
 			return BadAlloc;
+
+		assert(frame.bo->size >= frame.size);
 	} else {
 		frame.bo = kgem_create_linear(&sna->kgem, frame.size);
 		if (frame.bo == NULL)
@@ -270,16 +281,16 @@ sna_video_textured_put_image(ScrnInfoPtr scrn,
 					      &clip->extents);
 
 	ret = Success;
-	if (!sna->render.video(sna, video, &frame, clip,
-			       src_w, src_h,
-			       drw_w, drw_h,
-			       pixmap))
+	if (sna->render.video(sna, video, &frame, clip,
+			      src_w, src_h,
+			      drw_w, drw_h,
+			      pixmap))
+		DamageDamageRegion(drawable, clip);
+	else
 		ret = BadAlloc;
 
 	kgem_bo_destroy(&sna->kgem, frame.bo);
 
-	DamageDamageRegion(drawable, clip);
-
 	/* Push the frame to the GPU as soon as possible so
 	 * we can hit the next vsync.
 	 */
commit 5b1006aae8449dad387ca281ffded80b9da53a12
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Jan 27 00:37:51 2012 +0000

    sna/video: Simplify the gen2/915gm check
    
    And make the later check in put image match.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_video_hwmc.c b/src/sna/sna_video_hwmc.c
index 3da7d3a..1f36096 100644
--- a/src/sna/sna_video_hwmc.c
+++ b/src/sna/sna_video_hwmc.c
@@ -196,10 +196,7 @@ Bool sna_video_xvmc_setup(struct sna *sna,
 	char buf[64];
 
 	/* Needs KMS support. */
-	if (IS_I915G(sna) || IS_I915GM(sna))
-		return FALSE;
-
-	if (IS_GEN2(sna))
+	if (sna->kgem.gen < 31)
 		return FALSE;
 
 	pAdapt = calloc(1, sizeof(XF86MCAdaptorRec));
diff --git a/src/sna/sna_video_textured.c b/src/sna/sna_video_textured.c
index a72d335..fcef820 100644
--- a/src/sna/sna_video_textured.c
+++ b/src/sna/sna_video_textured.c
@@ -245,7 +245,7 @@ sna_video_textured_put_image(ScrnInfoPtr scrn,
 		return Success;
 
 	if (xvmc_passthrough(id)) {
-		if (sna->kgem.gen == 30) {
+		if (sna->kgem.gen < 31) {
 			/* XXX: i915 is not support and needs some
 			 * serious care.  grep for KMS in i915_hwmc.c */
 			return BadAlloc;
commit ebeec12563c3a57c40dbe710b3627b482aada7fe
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Jan 26 23:14:14 2012 +0000

    sna: Remove extraneous clipping from GetImage
    
    The spec says that the requested areas must be wholly contained within
    the valid BorderClip for a Window, or within the Pixmap, or else a
    BadMatch is thrown. Rely on this behaviour and do not perform the
    clipping ourselves.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index cad8d66..3c8f2be 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -11154,42 +11154,15 @@ sna_get_image(DrawablePtr drawable,
 	      char *dst)
 {
 	RegionRec region;
-	DDXPointRec origin;
 
 	DBG(("%s (%d, %d, %d, %d)\n", __FUNCTION__, x, y, w, h));
 
-	/* XXX should be clipped already according to the spec... */
-	origin.x = region.extents.x1 = x + drawable->x;
-	origin.y = region.extents.y1 = y + drawable->y;
+	region.extents.x1 = x + drawable->x;
+	region.extents.y1 = y + drawable->y;
 	region.extents.x2 = region.extents.x1 + w;
 	region.extents.y2 = region.extents.y1 + h;
 	region.data = NULL;
 
-	if (drawable->type == DRAWABLE_PIXMAP) {
-		if (region.extents.x1 < drawable->x)
-			region.extents.x1 = drawable->x;
-		if (region.extents.x2 > drawable->x + drawable->width)
-			region.extents.x2 = drawable->x + drawable->width;
-
-		if (region.extents.y1 < drawable->y)
-			region.extents.y1 = drawable->y;
-		if (region.extents.y2 > drawable->y + drawable->height)
-			region.extents.y2 = drawable->y + drawable->height;
-	} else {
-		pixman_box16_t *c = &((WindowPtr)drawable)->borderClip.extents;
-		pixman_box16_t *r = &region.extents;
-
-		if (r->x1 < c->x1)
-			r->x1 = c->x1;
-		if (r->x2 > c->x2)
-			r->x2 = c->x2;
-
-		if (r->y1 < c->y1)
-			r->y1 = c->y1;
-		if (r->y2 > c->y2)
-			r->y2 = c->y2;
-	}
-
 	if (!sna_drawable_move_region_to_cpu(drawable, &region, MOVE_READ))
 		return;
 
@@ -11202,17 +11175,13 @@ sna_get_image(DrawablePtr drawable,
 		DBG(("%s: copy box (%d, %d), (%d, %d), origin (%d, %d)\n",
 		     __FUNCTION__,
 		     region.extents.x1, region.extents.y1,
-		     region.extents.x2, region.extents.y2,
-		     origin.x, origin.y));
+		     region.extents.x2, region.extents.y2));
 		get_drawable_deltas(drawable, pixmap, &dx, &dy);
 		memcpy_blt(pixmap->devPrivate.ptr, dst, drawable->bitsPerPixel,
 			   pixmap->devKind, PixmapBytePad(w, drawable->depth),
 			   region.extents.x1 + dx,
 			   region.extents.y1 + dy,
-			   region.extents.x1 - origin.x,
-			   region.extents.y1 - origin.y,
-			   region.extents.x2 - region.extents.x1,
-			   region.extents.y2 - region.extents.y1);
+			   0, 0, w, h);
 	} else
 		fbGetImage(drawable, x, y, w, h, format, mask, dst);
 }
commit e86a941ead5f8068d2a81d629c97d8432e9129af
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Jan 26 22:55:04 2012 +0000

    sna: Avoid fbBlt for the easy GetImage cases
    
    From (i5-2520m):
      60000 trep @   0.6145 msec (  1630.0/sec): GetImage 500x500 square
    To:
      60000 trep @   0.4949 msec (  2020.0/sec): GetImage 500x500 square
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
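
The guard on the new fast path reads naturally as a predicate; a sketch
(the helper name is invented for illustration):

	static bool sna_get_image_can_memcpy(DrawablePtr drawable,
					     unsigned int format,
					     unsigned long plane_mask)
	{
		/* Only plain ZPixmap reads of whole bytes, with a
		 * planemask that preserves every bit, can be served by
		 * a single memcpy_blt(). */
		return format == ZPixmap &&
		       drawable->bitsPerPixel >= 8 &&
		       PM_IS_SOLID(drawable, plane_mask);
	}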

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 7fe06de..cad8d66 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -11154,11 +11154,13 @@ sna_get_image(DrawablePtr drawable,
 	      char *dst)
 {
 	RegionRec region;
+	DDXPointRec origin;
 
 	DBG(("%s (%d, %d, %d, %d)\n", __FUNCTION__, x, y, w, h));
 
-	region.extents.x1 = x + drawable->x;
-	region.extents.y1 = y + drawable->y;
+	/* XXX should be clipped already according to the spec... */
+	origin.x = region.extents.x1 = x + drawable->x;
+	origin.y = region.extents.y1 = y + drawable->y;
 	region.extents.x2 = region.extents.x1 + w;
 	region.extents.y2 = region.extents.y1 + h;
 	region.data = NULL;
@@ -11173,14 +11175,46 @@ sna_get_image(DrawablePtr drawable,
 			region.extents.y1 = drawable->y;
 		if (region.extents.y2 > drawable->y + drawable->height)
 			region.extents.y2 = drawable->y + drawable->height;
-	} else
-		RegionIntersect(&region, &region,
-				&((WindowPtr)drawable)->borderClip);
+	} else {
+		pixman_box16_t *c = &((WindowPtr)drawable)->borderClip.extents;
+		pixman_box16_t *r = &region.extents;
+
+		if (r->x1 < c->x1)
+			r->x1 = c->x1;
+		if (r->x2 > c->x2)
+			r->x2 = c->x2;
+
+		if (r->y1 < c->y1)
+			r->y1 = c->y1;
+		if (r->y2 > c->y2)
+			r->y2 = c->y2;
+	}
 
 	if (!sna_drawable_move_region_to_cpu(drawable, &region, MOVE_READ))
 		return;
 
-	fbGetImage(drawable, x, y, w, h, format, mask, dst);
+	if (format == ZPixmap &&
+	    drawable->bitsPerPixel >= 8 &&
+	    PM_IS_SOLID(drawable, mask)) {
+		PixmapPtr pixmap = get_drawable_pixmap(drawable);
+		int16_t dx, dy;
+
+		DBG(("%s: copy box (%d, %d), (%d, %d), origin (%d, %d)\n",
+		     __FUNCTION__,
+		     region.extents.x1, region.extents.y1,
+		     region.extents.x2, region.extents.y2,
+		     origin.x, origin.y));
+		get_drawable_deltas(drawable, pixmap, &dx, &dy);
+		memcpy_blt(pixmap->devPrivate.ptr, dst, drawable->bitsPerPixel,
+			   pixmap->devKind, PixmapBytePad(w, drawable->depth),
+			   region.extents.x1 + dx,
+			   region.extents.y1 + dy,
+			   region.extents.x1 - origin.x,
+			   region.extents.y1 - origin.y,
+			   region.extents.x2 - region.extents.x1,
+			   region.extents.y2 - region.extents.y1);
+	} else
+		fbGetImage(drawable, x, y, w, h, format, mask, dst);
 }
 
 static void
commit 55d326833fa7088988a3eb26662404af661672a7
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Jan 26 16:05:48 2012 +0000

    sna/gen2+: Include being unattached in the list of source fallbacks
    
    If the source is not attached to a buffer (be it a GPU bo or a CPU bo),
    a temporary upload buffer would be required, and so it is not worth
    forcing the target to the GPU in that case (should the target not
    already be on the GPU).
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
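
The same need_upload()/unattached() pair is duplicated into each gen
backend below. Schematically, the intended effect on the composite
fallback decision looks like this (the names in the sketch are
illustrative, not the driver's):

	if (source_fallback(src) && !dst_is_on_gpu) {
		/* An unattached source would need a temporary upload
		 * buffer anyway, so it no longer argues for pulling
		 * the whole operation onto the GPU. */
		composite_on_cpu();
	} else {
		composite_on_gpu();	/* uploading the source if needed */
	}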

diff --git a/src/sna/gen2_render.c b/src/sna/gen2_render.c
index eb8d4ef..2a97cea 100644
--- a/src/sna/gen2_render.c
+++ b/src/sna/gen2_render.c
@@ -1514,12 +1514,19 @@ has_alphamap(PicturePtr p)
 }
 
 static bool
+need_upload(PicturePtr p)
+{
+	return p->pDrawable && unattached(p->pDrawable);
+}
+
+static bool
 source_fallback(PicturePtr p)
 {
 	return (has_alphamap(p) ||
 		is_unhandled_gradient(p) ||
 		!gen2_check_filter(p) ||
-		!gen2_check_repeat(p));
+		!gen2_check_repeat(p) ||
+		need_upload(p));
 }
 
 static bool
diff --git a/src/sna/gen3_render.c b/src/sna/gen3_render.c
index af83966..931142d 100644
--- a/src/sna/gen3_render.c
+++ b/src/sna/gen3_render.c
@@ -2426,9 +2426,19 @@ has_alphamap(PicturePtr p)
 }
 
 static bool
+need_upload(PicturePtr p)
+{
+	return p->pDrawable && unattached(p->pDrawable);
+}
+
+static bool
 source_fallback(PicturePtr p)
 {
-	return has_alphamap(p) || !gen3_check_xformat(p) || !gen3_check_filter(p) || !gen3_check_repeat(p);
+	return (has_alphamap(p) ||
+		!gen3_check_xformat(p) ||
+		!gen3_check_filter(p) ||
+		!gen3_check_repeat(p) ||
+		need_upload(p));
 }
 
 static bool
diff --git a/src/sna/gen4_render.c b/src/sna/gen4_render.c
index c798ce5..91d5f49 100644
--- a/src/sna/gen4_render.c
+++ b/src/sna/gen4_render.c
@@ -2015,9 +2015,20 @@ has_alphamap(PicturePtr p)
 }
 
 static bool
+need_upload(PicturePtr p)
+{
+	return p->pDrawable && unattached(p->pDrawable);
+}
+
+static bool
 source_fallback(PicturePtr p)
 {
-	return has_alphamap(p) || is_gradient(p) || !gen4_check_filter(p) || !gen4_check_repeat(p) || !gen4_check_format(p->format);
+	return (has_alphamap(p) ||
+		is_gradient(p) ||
+		!gen4_check_filter(p) ||
+		!gen4_check_repeat(p) ||
+		!gen4_check_format(p->format) ||
+		need_upload(p));
 }
 
 static bool
diff --git a/src/sna/gen5_render.c b/src/sna/gen5_render.c
index 47c4e96..2c6d020 100644
--- a/src/sna/gen5_render.c
+++ b/src/sna/gen5_render.c
@@ -2049,13 +2049,20 @@ has_alphamap(PicturePtr p)
 }
 
 static bool
+need_upload(PicturePtr p)
+{
+	return p->pDrawable && unattached(p->pDrawable);
+}
+
+static bool
 source_fallback(PicturePtr p)
 {
 	return (has_alphamap(p) ||
 		is_gradient(p) ||
 		!gen5_check_filter(p) ||
 		!gen5_check_repeat(p) ||
-		!gen5_check_format(p->format));
+		!gen5_check_format(p->format) ||
+		need_upload(p));
 }
 
 static bool
diff --git a/src/sna/gen6_render.c b/src/sna/gen6_render.c
index c3bc2e7..d813d95 100644
--- a/src/sna/gen6_render.c
+++ b/src/sna/gen6_render.c
@@ -2266,9 +2266,20 @@ has_alphamap(PicturePtr p)
 }
 
 static bool
+need_upload(PicturePtr p)
+{
+	return p->pDrawable && unattached(p->pDrawable);
+}
+
+static bool
 source_fallback(PicturePtr p)
 {
-	return has_alphamap(p) || is_gradient(p) || !gen6_check_filter(p) || !gen6_check_repeat(p) || !gen6_check_format(p->format);
+	return (has_alphamap(p) ||
+		is_gradient(p) ||
+		!gen6_check_filter(p) ||
+		!gen6_check_repeat(p) ||
+		!gen6_check_format(p->format) ||
+		need_upload(p));
 }
 
 static bool
diff --git a/src/sna/gen7_render.c b/src/sna/gen7_render.c
index 21d8c99..282b724 100644
--- a/src/sna/gen7_render.c
+++ b/src/sna/gen7_render.c
@@ -2333,9 +2333,20 @@ has_alphamap(PicturePtr p)
 }
 
 static bool
+need_upload(PicturePtr p)
+{
+	return p->pDrawable && unattached(p->pDrawable);
+}
+
+static bool
 source_fallback(PicturePtr p)
 {
-	return has_alphamap(p) || is_gradient(p) || !gen7_check_filter(p) || !gen7_check_repeat(p) || !gen7_check_format(p->format);
+	return (has_alphamap(p) ||
+		is_gradient(p) ||
+		!gen7_check_filter(p) ||
+		!gen7_check_repeat(p) ||
+		!gen7_check_format(p->format) ||
+		need_upload(p));
 }
 
 static bool
diff --git a/src/sna/sna_render_inline.h b/src/sna/sna_render_inline.h
index c9d2b5f..489f215 100644
--- a/src/sna/sna_render_inline.h
+++ b/src/sna/sna_render_inline.h
@@ -106,6 +106,17 @@ too_small(DrawablePtr drawable)
 }
 
 static inline Bool
+unattached(DrawablePtr drawable)
+{
+	struct sna_pixmap *priv = sna_pixmap_from_drawable(drawable);
+
+	if (priv == NULL)
+		return true;
+
+	return priv->gpu_bo == NULL && priv->cpu_bo == NULL;
+}
+
+static inline Bool
 picture_is_gpu(PicturePtr picture)
 {
 	if (!picture || !picture->pDrawable)
commit 62054692d84c2ce4dfdfeb55fee6410a6671252e
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Jan 26 13:59:24 2012 +0000

    sna: Allow gen4+ to use larger GPU bo
    
    As the blitter on gen4+ does not require fence registers, it is not
    restricted to keeping large objects within the mappable aperture. And
    as we do not need to operate on such large GPU bo in place, we can
    relax the restriction on the maximum bo size that gen4+ may allocate
    for use with the GPU.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
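
A worked example of the new limits (the aperture sizes are illustrative
assumptions, not measured values):

	/* Assume aperture_mappable = 256MiB, aperture_total = 512MiB,
	 * aperture_low = 64MiB, and MAX_OBJECT_SIZE larger than all.
	 *
	 * max_cpu_size = aperture_total / 2 = 256MiB
	 *
	 * gen < 40 (blits require fence registers):
	 *	max_gpu_size = min(aperture_mappable / 2, aperture_low)
	 *		     = min(128MiB, 64MiB) = 64MiB
	 *
	 * gen >= 40 (no fences needed for the blitter):
	 *	max_gpu_size starts unbounded (-1) and is only clamped
	 *	to max_cpu_size, i.e. 256MiB here.
	 */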

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 6cd86e6..4df29d2 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -658,15 +658,22 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, int gen)
 		 * disable dual-stream mode */
 		kgem->min_alignment = 64;
 
-	kgem->max_gpu_size = kgem->aperture_mappable / 2;
-	if (kgem->max_gpu_size > kgem->aperture_low)
-		kgem->max_gpu_size = kgem->aperture_low;
-	if (kgem->max_gpu_size > MAX_OBJECT_SIZE)
-		kgem->max_gpu_size = MAX_OBJECT_SIZE;
-
 	kgem->max_cpu_size = kgem->aperture_total / 2;
 	if (kgem->max_cpu_size > MAX_OBJECT_SIZE)
 		kgem->max_cpu_size = MAX_OBJECT_SIZE;
+
+	kgem->max_gpu_size = -1;
+	if (gen < 40) {
+		/* If we have to use fences for blitting, we have to make
+		 * sure we can fit them into the aperture.
+		 */
+		kgem->max_gpu_size = kgem->aperture_mappable / 2;
+		if (kgem->max_gpu_size > kgem->aperture_low)
+			kgem->max_gpu_size = kgem->aperture_low;
+	}
+	if (kgem->max_gpu_size > kgem->max_cpu_size)
+		kgem->max_gpu_size = kgem->max_cpu_size;
+
 	DBG(("%s: max object size (tiled=%d, linear=%d)\n",
 	     __FUNCTION__, kgem->max_gpu_size, kgem->max_cpu_size));
 
@@ -2729,6 +2736,8 @@ void *kgem_bo_map(struct kgem *kgem, struct kgem_bo *bo)
 
 	ptr = bo->map;
 	if (ptr == NULL) {
+		assert(bo->size <= kgem->aperture_mappable / 4);
+
 		kgem_trim_vma_cache(kgem, MAP_GTT, bo->bucket);
 
 		ptr = gem_mmap(kgem->fd, bo->handle, bo->size,
diff --git a/src/sna/kgem.h b/src/sna/kgem.h
index 0cc4fd3..0dc67da 100644
--- a/src/sna/kgem.h
+++ b/src/sna/kgem.h
@@ -363,10 +363,16 @@ static inline bool kgem_bo_is_mappable(struct kgem *kgem,
 	if (bo->domain == DOMAIN_GTT)
 		return true;
 
+	if (IS_GTT_MAP(bo->map))
+		return true;
+
 	if (kgem->gen < 40 && bo->tiling &&
 	    bo->presumed_offset & (kgem_bo_fenced_size(kgem, bo) - 1))
 		return false;
 
+	if (!bo->presumed_offset)
+		return bo->size <= kgem->aperture_mappable / 4;
+
 	return bo->presumed_offset + bo->size <= kgem->aperture_mappable;
 }
 
commit 72d5b6eca269cba285b32b1ac6c207be553be66c
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Jan 26 14:34:36 2012 +0000

    sna: Prevent mapping through the GTT for large bo
    
    If the bo is larger than a quarter of the aperture, it is unlikely that
    we will be able to evict enough contiguous space in the GATT to
    accommodate it. So don't attempt to map such buffers; use the indirect
    access paths instead.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
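
Restated as a standalone check (a sketch only; the real test in
kgem_bo_is_mappable(), see the kgem.h hunk above, also considers the
current domain, existing GTT maps and fence alignment):

	static inline bool bo_gtt_map_plausible(struct kgem *kgem,
						struct kgem_bo *bo)
	{
		if (bo->presumed_offset)
			return bo->presumed_offset + bo->size <=
				kgem->aperture_mappable;

		/* No known offset: only claim mappability when evicting
		 * a contiguous quarter of the mappable aperture is
		 * plausible. */
		return bo->size <= kgem->aperture_mappable / 4;
	}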

diff --git a/src/sna/kgem.h b/src/sna/kgem.h
index db4f061..0cc4fd3 100644
--- a/src/sna/kgem.h
+++ b/src/sna/kgem.h
@@ -394,13 +394,19 @@ static inline bool kgem_bo_map_will_stall(struct kgem *kgem, struct kgem_bo *bo)
 	     __FUNCTION__, bo->handle,
 	     bo->domain, bo->presumed_offset, bo->size));
 
+	if (!kgem_bo_is_mappable(kgem, bo))
+		return true;
+
+	if (kgem->wedged)
+		return false;
+
 	if (kgem_bo_is_busy(bo))
 		return true;
 
 	if (bo->presumed_offset == 0)
 		return !list_is_empty(&kgem->requests);
 
-	return !kgem_bo_is_mappable(kgem, bo);
+	return false;
 }
 
 static inline bool kgem_bo_is_dirty(struct kgem_bo *bo)
diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index b3b968c..7fe06de 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -797,7 +797,7 @@ sna_pixmap_create_mappable_gpu(PixmapPtr pixmap)
 			       sna_pixmap_choose_tiling(pixmap),
 			       CREATE_GTT_MAP | CREATE_INACTIVE);
 
-	return priv->gpu_bo != NULL;
+	return priv->gpu_bo && kgem_bo_is_mappable(&sna->kgem, priv->gpu_bo);
 }
 
 bool
@@ -835,7 +835,8 @@ _sna_pixmap_move_to_cpu(PixmapPtr pixmap, unsigned int flags)
 				    priv->gpu_bo->exec == NULL)
 					kgem_retire(&sna->kgem);
 
-				if (kgem_bo_is_busy(priv->gpu_bo)) {
+				if (kgem_bo_map_will_stall(&sna->kgem,
+							   priv->gpu_bo)) {
 					if (priv->pinned)
 						goto skip_inplace_map;
 
@@ -897,7 +898,7 @@ skip_inplace_map:
 
 	if (flags & MOVE_INPLACE_HINT &&
 	    priv->stride && priv->gpu_bo &&
-	    !kgem_bo_is_busy(priv->gpu_bo) &&
+	    !kgem_bo_map_will_stall(&sna->kgem, priv->gpu_bo) &&
 	    pixmap_inplace(sna, pixmap, priv) &&
 	    sna_pixmap_move_to_gpu(pixmap, flags)) {
 		assert(flags & MOVE_WRITE);
@@ -1250,7 +1251,7 @@ sna_drawable_move_region_to_cpu(DrawablePtr drawable,
 
 	if (flags & MOVE_INPLACE_HINT &&
 	    priv->stride && priv->gpu_bo &&
-	    !kgem_bo_is_busy(priv->gpu_bo) &&
+	    !kgem_bo_map_will_stall(&sna->kgem, priv->gpu_bo) &&
 	    region_inplace(sna, pixmap, region, priv) &&
 	    sna_pixmap_move_area_to_gpu(pixmap, &region->extents, flags)) {
 		assert(flags & MOVE_WRITE);
diff --git a/src/sna/sna_io.c b/src/sna/sna_io.c
index a2e7a59..f3ca212 100644
--- a/src/sna/sna_io.c
+++ b/src/sna/sna_io.c
@@ -122,8 +122,7 @@ void sna_read_boxes(struct sna *sna,
 	 * this path.
 	 */
 
-	if (DEBUG_NO_IO || kgem->wedged ||
-	    !kgem_bo_map_will_stall(kgem, src_bo) ||
+	if (!kgem_bo_map_will_stall(kgem, src_bo) ||
 	    src_bo->tiling == I915_TILING_NONE) {
 fallback:
 		read_boxes_inplace(kgem,
@@ -386,10 +385,7 @@ static bool upload_inplace(struct kgem *kgem,
 			   int n, int bpp)
 {
 	if (DEBUG_NO_IO)
-		return true;
-
-	if (unlikely(kgem->wedged))
-		return true;
+		return kgem_bo_is_mappable(kgem, bo);
 
 	/* If we are writing through the GTT, check first if we might be
 	 * able to almagamate a series of small writes into a single
@@ -993,14 +989,27 @@ struct kgem_bo *sna_replace(struct sna *sna,
 		kgem_bo_write(kgem, bo, src,
 			      (pixmap->drawable.height-1)*stride + pixmap->drawable.width*pixmap->drawable.bitsPerPixel/8);
 	} else {
-		dst = kgem_bo_map(kgem, bo);
-		if (dst) {
-			memcpy_blt(src, dst, pixmap->drawable.bitsPerPixel,
-				   stride, bo->pitch,
-				   0, 0,
-				   0, 0,
-				   pixmap->drawable.width,
-				   pixmap->drawable.height);
+		if (kgem_bo_is_mappable(kgem, bo)) {
+			dst = kgem_bo_map(kgem, bo);
+			if (dst) {
+				memcpy_blt(src, dst, pixmap->drawable.bitsPerPixel,
+					   stride, bo->pitch,
+					   0, 0,
+					   0, 0,
+					   pixmap->drawable.width,
+					   pixmap->drawable.height);
+			}
+		} else {
+			BoxRec box;
+
+			box.x1 = box.y1 = 0;
+			box.x2 = pixmap->drawable.width;
+			box.y2 = pixmap->drawable.height;
+
+			sna_write_boxes(sna, pixmap,
+					bo, 0, 0,
+					src, stride, 0, 0,
+					&box, 1);
 		}
 	}
 
@@ -1038,15 +1047,29 @@ struct kgem_bo *sna_replace__xor(struct sna *sna,
 		}
 	}
 
-	dst = kgem_bo_map(kgem, bo);
-	if (dst) {
-		memcpy_xor(src, dst, pixmap->drawable.bitsPerPixel,
-			   stride, bo->pitch,
-			   0, 0,
-			   0, 0,
-			   pixmap->drawable.width,
-			   pixmap->drawable.height,
-			   and, or);
+	if (kgem_bo_is_mappable(kgem, bo)) {
+		dst = kgem_bo_map(kgem, bo);
+		if (dst) {
+			memcpy_xor(src, dst, pixmap->drawable.bitsPerPixel,
+				   stride, bo->pitch,
+				   0, 0,
+				   0, 0,
+				   pixmap->drawable.width,
+				   pixmap->drawable.height,
+				   and, or);
+		}
+	} else {
+		BoxRec box;
+
+		box.x1 = box.y1 = 0;
+		box.x2 = pixmap->drawable.width;
+		box.y2 = pixmap->drawable.height;
+
+		sna_write_boxes__xor(sna, pixmap,
+				     bo, 0, 0,
+				     src, stride, 0, 0,
+				     &box, 1,
+				     and, or);
 	}
 
 	return bo;
commit 96dd58f1e05fd89c6311740f15322a1627c162ae
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Jan 26 13:35:15 2012 +0000

    sna: Add FORCE_FALLBACK debugging hook for PutImage
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 759e0fe..b3b968c 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -2907,6 +2907,9 @@ sna_put_image(DrawablePtr drawable, GCPtr gc, int depth,
 
 	RegionTranslate(&region, dx, dy);
 
+	if (FORCE_FALLBACK)
+		goto fallback;
+
 	if (wedged(sna))
 		goto fallback;
 
commit 9f870510bc43b7f51b0d186ea62d152fe03b0dd3
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Jan 26 11:20:03 2012 +0000

    sna/gen3: Use cpu bo if already in use
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/gen3_render.c b/src/sna/gen3_render.c
index 95a79b2..af83966 100644
--- a/src/sna/gen3_render.c
+++ b/src/sna/gen3_render.c
@@ -2337,32 +2337,42 @@ gen3_composite_set_target(struct sna *sna,
 	op->dst.height = op->dst.pixmap->drawable.height;
 	priv = sna_pixmap(op->dst.pixmap);
 
-	priv = sna_pixmap_force_to_gpu(op->dst.pixmap, MOVE_READ | MOVE_WRITE);
-	if (priv == NULL)
-		return FALSE;
+	op->dst.bo = NULL;
+	priv = sna_pixmap(op->dst.pixmap);
+	if (priv &&
+	    priv->gpu_bo == NULL &&
+	    priv->cpu_bo && priv->cpu_bo->domain != DOMAIN_CPU) {
+		op->dst.bo = priv->cpu_bo;
+		op->damage = &priv->cpu_damage;
+	}
+	if (op->dst.bo == NULL) {
+		priv = sna_pixmap_force_to_gpu(op->dst.pixmap, MOVE_READ | MOVE_WRITE);
+		if (priv == NULL)
+			return FALSE;
 
-	/* For single-stream mode there should be no minimum alignment
-	 * required, except that the width must be at least 2 elements.
-	 */
-	if (priv->gpu_bo->pitch < 2*op->dst.pixmap->drawable.bitsPerPixel) {
-		struct kgem_bo *bo;
+		/* For single-stream mode there should be no minimum alignment
+		 * required, except that the width must be at least 2 elements.
+		 */
+		if (priv->gpu_bo->pitch < 2*op->dst.pixmap->drawable.bitsPerPixel) {
+			struct kgem_bo *bo;
 
-		if (priv->pinned)
-			return FALSE;
+			if (priv->pinned)
+				return FALSE;
 
-		bo = kgem_replace_bo(&sna->kgem, priv->gpu_bo,
-				     op->dst.width, op->dst.height,
-				     2*op->dst.pixmap->drawable.bitsPerPixel,
-				     op->dst.pixmap->drawable.bitsPerPixel);
-		if (bo == NULL)
-			return FALSE;
+			bo = kgem_replace_bo(&sna->kgem, priv->gpu_bo,
+					     op->dst.width, op->dst.height,
+					     2*op->dst.pixmap->drawable.bitsPerPixel,
+					     op->dst.pixmap->drawable.bitsPerPixel);
+			if (bo == NULL)
+				return FALSE;
 
-		kgem_bo_destroy(&sna->kgem, priv->gpu_bo);
-		priv->gpu_bo = bo;
-	}
+			kgem_bo_destroy(&sna->kgem, priv->gpu_bo);
+			priv->gpu_bo = bo;
+		}
 
-	op->dst.bo = priv->gpu_bo;
-	op->damage = &priv->gpu_damage;
+		op->dst.bo = priv->gpu_bo;
+		op->damage = &priv->gpu_damage;
+	}
 	if (sna_damage_is_all(op->damage, op->dst.width, op->dst.height))
 		op->damage = NULL;
 
@@ -2475,7 +2485,9 @@ gen3_composite_fallback(struct sna *sna,
 
 	if (src_pixmap && !is_solid(src) && !source_fallback(src)) {
 		priv = sna_pixmap(src_pixmap);
-		if (priv && priv->gpu_damage && !priv->cpu_damage) {
+		if (priv &&
+		    ((priv->gpu_damage && !priv->cpu_damage) ||
+		     (priv->cpu_bo && priv->cpu_bo->domain != DOMAIN_CPU))) {
 			DBG(("%s: src is already on the GPU, try to use GPU\n",
 			     __FUNCTION__));
 			return FALSE;
@@ -2483,7 +2495,9 @@ gen3_composite_fallback(struct sna *sna,
 	}
 	if (mask_pixmap && !is_solid(mask) && !source_fallback(mask)) {
 		priv = sna_pixmap(mask_pixmap);
-		if (priv && priv->gpu_damage && !priv->cpu_damage) {
+		if (priv &&
+		    ((priv->gpu_damage && !priv->cpu_damage) ||
+		     (priv->cpu_bo && priv->cpu_bo->domain != DOMAIN_CPU))) {
 			DBG(("%s: mask is already on the GPU, try to use GPU\n",
 			     __FUNCTION__));
 			return FALSE;
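
The target selection added above reduces to one predicate. A condensed sketch, with hypothetical struct fields mirroring sna_pixmap rather than the driver's real definitions:

    #include <stddef.h>

    enum domain { DOMAIN_NONE, DOMAIN_CPU, DOMAIN_GPU };
    struct bo { enum domain domain; };
    struct pix { struct bo *gpu_bo, *cpu_bo; };

    /* Reuse the CPU bo as the render target only when no GPU bo exists and
     * the CPU bo has already left the CPU domain (the GPU is using it). */
    static struct bo *pick_render_target(const struct pix *p)
    {
        if (p->gpu_bo == NULL && p->cpu_bo && p->cpu_bo->domain != DOMAIN_CPU)
            return p->cpu_bo;
        return p->gpu_bo; /* NULL here means: force allocation of a GPU bo */
    }
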
commit 8c6846f95525785655408735a17309c4ac674587
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Jan 26 10:47:01 2012 +0000

    sna: Search the buckets above the desired size in the bo cache
    
    It is preferable to reuse a slightly larger bo rather than to create a
    fresh one and map it into the aperture. So search the bucket above us as
    well.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index a2fcefc..6cd86e6 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -2189,7 +2189,7 @@ struct kgem_bo *kgem_create_2d(struct kgem *kgem,
 	struct kgem_bo *bo, *next;
 	uint32_t pitch, untiled_pitch, tiled_height, size;
 	uint32_t handle;
-	int i;
+	int i, bucket, retry;
 
 	if (tiling < 0)
 		tiling = -tiling, flags |= CREATE_EXACT;
@@ -2208,6 +2208,7 @@ struct kgem_bo *kgem_create_2d(struct kgem *kgem,
 				 width, height, bpp, tiling, &pitch);
 	assert(size && size < kgem->max_cpu_size);
 	assert(tiling == I915_TILING_NONE || size < kgem->max_gpu_size);
+	bucket = cache_bucket(size);
 
 	if (flags & (CREATE_CPU_MAP | CREATE_GTT_MAP)) {
 		int for_cpu = !!(flags & CREATE_CPU_MAP);
@@ -2216,10 +2217,10 @@ struct kgem_bo *kgem_create_2d(struct kgem *kgem,
 		/* We presume that we will need to upload to this bo,
 		 * and so would prefer to have an active VMA.
 		 */
-		cache = &kgem->vma[for_cpu].inactive[cache_bucket(size)];
+		cache = &kgem->vma[for_cpu].inactive[bucket];
 		do {
 			list_for_each_entry(bo, cache, vma) {
-				assert(bo->bucket == cache_bucket(size));
+				assert(bo->bucket == bucket);
 				assert(bo->refcnt == 0);
 				assert(bo->map);
 				assert(IS_CPU_MAP(bo->map) == for_cpu);
@@ -2263,13 +2264,17 @@ struct kgem_bo *kgem_create_2d(struct kgem *kgem,
 		goto skip_active_search;
 
 	/* Best active match */
-	cache = active(kgem, size, tiling);
+	retry = NUM_CACHE_BUCKETS - bucket;
+	if (retry > 3)
+		retry = 3;
+search_again:
+	cache = &kgem->active[bucket][tiling];
 	if (tiling) {
 		tiled_height = kgem_aligned_height(kgem, height, tiling);
 		list_for_each_entry(bo, cache, list) {
-			assert(bo->bucket == cache_bucket(size));
 			assert(!bo->purged);
 			assert(bo->refcnt == 0);
+			assert(bo->bucket == bucket);
 			assert(bo->reusable);
 			assert(bo->tiling == tiling);
 
@@ -2280,7 +2285,6 @@ struct kgem_bo *kgem_create_2d(struct kgem *kgem,
 				continue;
 			}
 
-
 			if (bo->pitch * tiled_height > bo->size)
 				continue;
 
@@ -2294,7 +2298,7 @@ struct kgem_bo *kgem_create_2d(struct kgem *kgem,
 		}
 	} else {
 		list_for_each_entry(bo, cache, list) {
-			assert(bo->bucket == cache_bucket(size));
+			assert(bo->bucket == bucket);
 			assert(!bo->purged);
 			assert(bo->refcnt == 0);
 			assert(bo->reusable);
@@ -2314,6 +2318,11 @@ struct kgem_bo *kgem_create_2d(struct kgem *kgem,
 		}
 	}
 
+	if (--retry && flags & CREATE_EXACT) {
+		bucket++;
+		goto search_again;
+	}
+
 	if ((flags & CREATE_EXACT) == 0) { /* allow an active near-miss? */
 		untiled_pitch = kgem_untiled_pitch(kgem,
 						   width, bpp,
@@ -2356,10 +2365,15 @@ struct kgem_bo *kgem_create_2d(struct kgem *kgem,
 	}
 
 skip_active_search:
+	bucket = cache_bucket(size);
+	retry = NUM_CACHE_BUCKETS - bucket;
+	if (retry > 3)
+		retry = 3;
+search_inactive:
 	/* Now just look for a close match and prefer any currently active */
-	cache = inactive(kgem, size);
+	cache = &kgem->inactive[bucket];
 	list_for_each_entry_safe(bo, next, cache, list) {
-		assert(bo->bucket == cache_bucket(size));
+		assert(bo->bucket == bucket);
 
 		if (size > bo->size) {
 			DBG(("inactive too small: %d < %d\n",
@@ -2409,10 +2423,15 @@ skip_active_search:
 	if (flags & CREATE_INACTIVE && !list_is_empty(&kgem->requests)) {
 		if (kgem_retire(kgem)) {
 			flags &= ~CREATE_INACTIVE;
-			goto skip_active_search;
+			goto search_inactive;
 		}
 	}
 
+	if (--retry) {
+		bucket++;
+		goto search_inactive;
+	}
+
 	handle = gem_create(kgem->fd, size);
 	if (handle == 0)
 		return NULL;
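
A compact sketch of the bucket-retry search above; the per-bucket list handling is simplified to a fixed array and the names are illustrative, not the kgem API:

    #include <stddef.h>

    #define NUM_BUCKETS 16
    #define PER_BUCKET   8

    struct bo { size_t size; int busy; };

    static struct bo *search_cache(struct bo *cache[NUM_BUCKETS][PER_BUCKET],
                                   int bucket, size_t size)
    {
        int retry = NUM_BUCKETS - bucket;
        if (retry > 3)
            retry = 3; /* ideal bucket plus at most two above it */

        while (retry-- > 0 && bucket < NUM_BUCKETS) {
            for (int i = 0; i < PER_BUCKET; i++) {
                struct bo *bo = cache[bucket][i];
                if (bo && !bo->busy && bo->size >= size)
                    return bo; /* slightly larger beats a fresh allocation */
            }
            bucket++; /* near miss: try the next bucket up */
        }
        return NULL; /* caller falls through to gem_create() */
    }
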
commit e7e72b4d6ca65593a85b8e22d344b96b42c04d4a
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Jan 26 12:42:12 2012 +0000

    sna: Apply any previous transformation when downsampling
    
    In order to handle rotations and fractional offsets produced by the act
    of downsampling, we need to compute the full affine transformation and
    apply it to the vertices rather than attempt to fudge it with an integer
    offset.
    
    References: https://bugs.freedesktop.org/show_bug.cgi?id=45086
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 6eae248..a2fcefc 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -3323,67 +3323,6 @@ struct kgem_bo *kgem_upload_source_image(struct kgem *kgem,
 	return bo;
 }
 
-struct kgem_bo *kgem_upload_source_image_halved(struct kgem *kgem,
-						pixman_format_code_t format,
-						const void *data,
-						int x, int y,
-						int width, int height,
-						int stride, int bpp)
-{
-	struct kgem_bo *bo;
-	pixman_image_t *src_image, *dst_image;
-	pixman_transform_t t;
-	int w, h;
-	void *dst;
-
-	DBG(("%s : (%d, %d), (%d, %d), stride=%d, bpp=%d\n",
-	     __FUNCTION__, x, y, width, height, stride, bpp));
-
-	w = (width + 1) / 2;
-	h = (height + 1) / 2;
-
-	bo = kgem_create_buffer_2d(kgem, w, h, bpp,
-				   KGEM_BUFFER_WRITE_INPLACE,
-				   &dst);
-	if (bo == NULL)
-		return NULL;
-
-	dst_image = pixman_image_create_bits(format, w, h, dst, bo->pitch);
-	if (dst_image == NULL)
-		goto cleanup_bo;
-
-	src_image = pixman_image_create_bits(format, width, height,
-					     (uint32_t*)data, stride);
-	if (src_image == NULL)
-		goto cleanup_dst;
-
-	memset(&t, 0, sizeof(t));
-	t.matrix[0][0] = 2 << 16;
-	t.matrix[1][1] = 2 << 16;
-	t.matrix[2][2] = 1 << 16;
-	pixman_image_set_transform(src_image, &t);
-	pixman_image_set_filter(src_image, PIXMAN_FILTER_BILINEAR, NULL, 0);
-	pixman_image_set_repeat(src_image, PIXMAN_REPEAT_PAD);
-
-	pixman_image_composite(PIXMAN_OP_SRC,
-			       src_image, NULL, dst_image,
-			       x, y,
-			       0, 0,
-			       0, 0,
-			       w, h);
-
-	pixman_image_unref(src_image);
-	pixman_image_unref(dst_image);
-
-	return bo;
-
-cleanup_dst:
-	pixman_image_unref(dst_image);
-cleanup_bo:
-	kgem_bo_destroy(kgem, bo);
-	return NULL;
-}
-
 void kgem_buffer_read_sync(struct kgem *kgem, struct kgem_bo *_bo)
 {
 	struct kgem_partial_bo *bo;
diff --git a/src/sna/kgem.h b/src/sna/kgem.h
index fd3aa9d..db4f061 100644
--- a/src/sna/kgem.h
+++ b/src/sna/kgem.h
@@ -191,12 +191,6 @@ struct kgem_bo *kgem_upload_source_image(struct kgem *kgem,
 					 const void *data,
 					 BoxPtr box,
 					 int stride, int bpp);
-struct kgem_bo *kgem_upload_source_image_halved(struct kgem *kgem,
-						pixman_format_code_t format,
-						const void *data,
-						int x, int y,
-						int width, int height,
-						int stride, int bpp);
 
 int kgem_choose_tiling(struct kgem *kgem,
 		       int tiling, int width, int height, int bpp);
diff --git a/src/sna/sna_render.c b/src/sna/sna_render.c
index 28b93a2..9d7857c 100644
--- a/src/sna/sna_render.c
+++ b/src/sna/sna_render.c
@@ -615,15 +615,17 @@ static int sna_render_picture_downsample(struct sna *sna,
 					 int16_t w, int16_t h,
 					 int16_t dst_x, int16_t dst_y)
 {
-	struct kgem_bo *bo = NULL;
 	PixmapPtr pixmap = get_drawable_pixmap(picture->pDrawable);
-	int16_t ox, oy, ow, oh;
-	BoxRec box;
-
-	assert(w && h);
-
-	DBG(("%s (%d, %d)x(%d, %d) [dst=(%d, %d)]\n",
-	     __FUNCTION__, x, y, w, h, dst_x, dst_y));
+	ScreenPtr screen = pixmap->drawable.pScreen;
+	PicturePtr tmp_src, tmp_dst;
+	PictFormatPtr format;
+	struct sna_pixmap *priv;
+	pixman_transform_t t;
+	PixmapPtr tmp;
+	int width, height;
+	int sx, sy, ox, oy, ow, oh;
+	int error, ret = 0;
+	BoxRec box, b;
 
 	ow = w;
 	oh = h;
@@ -645,12 +647,6 @@ static int sna_render_picture_downsample(struct sna *sna,
 		oy = v.vector[1] / v.vector[2];
 	}
 
-	/* Align the origin to an even pixel so that the sampling of
-	 * partial images is stable.
-	 */
-	box.x1 &= ~1;
-	box.y1 &= ~1;
-
 	if (channel->repeat == RepeatNone || channel->repeat == RepeatPad) {
 		if (box.x1 < 0)
 			box.x1 = 0;
@@ -684,184 +680,124 @@ static int sna_render_picture_downsample(struct sna *sna,
 
 	w = box.x2 - box.x1;
 	h = box.y2 - box.y1;
-	DBG(("%s: sample area (%d, %d), (%d, %d): %dx%d\n",
-	     __FUNCTION__, box.x1, box.y1, box.x2, box.y2, w, h));
-	assert(w && h);
-	if (w > 2*sna->render.max_3d_size || h > 2*sna->render.max_3d_size) {
-		DBG(("%s: sample size too large for pixman downscaling\n",
-		     __FUNCTION__));
-		goto fixup;
-	}
 
-	if (texture_is_cpu(pixmap, &box) && !move_to_gpu(pixmap, &box)) {
-		DBG(("%s: uploading partial texture\n", __FUNCTION__));
-		bo = kgem_upload_source_image_halved(&sna->kgem,
-						     picture->format,
-						     pixmap->devPrivate.ptr,
-						     box.x1, box.y1, w, h,
-						     pixmap->devKind,
-						     pixmap->drawable.bitsPerPixel);
-		if (!bo) {
-			DBG(("%s: failed to upload source image, using clear\n",
-			     __FUNCTION__));
-			return 0;
-		}
-	} else {
-		ScreenPtr screen = pixmap->drawable.pScreen;
-		PicturePtr tmp_src, tmp_dst;
-		PictFormatPtr format;
-		struct sna_pixmap *priv;
-		pixman_transform_t t;
-		PixmapPtr tmp;
-		int error, i, j, ww, hh, ni, nj;
-
-		if (!sna_pixmap_move_to_gpu(pixmap, MOVE_READ))
-			goto fixup;
-
-		tmp = screen->CreatePixmap(screen,
-					   (w+1)/2, (h+1)/2,
-					   pixmap->drawable.depth,
-					   SNA_CREATE_SCRATCH);
-		if (!tmp)
-			goto fixup;
-
-		priv = sna_pixmap(tmp);
-		if (!priv) {
-			screen->DestroyPixmap(tmp);
-			goto fixup;
-		}
+	DBG(("%s: sample (%d, %d), (%d, %d)\n",
+	     __FUNCTION__, box.x1, box.y1, box.x2, box.y2));
 
-		format = PictureMatchFormat(screen,
-					    pixmap->drawable.depth,
-					    picture->format);
+	sx = (w + sna->render.max_3d_size - 1) / sna->render.max_3d_size;
+	sy = (h + sna->render.max_3d_size - 1) / sna->render.max_3d_size;
 
-		tmp_dst = CreatePicture(0, &tmp->drawable, format, 0, NULL,
-					serverClient, &error);
-		if (!tmp_dst) {
-			screen->DestroyPixmap(tmp);
-			goto fixup;
-		}
+	DBG(("%s: scaling (%d, %d) down by %dx%d\n",
+	     __FUNCTION__, w, h, sx, sy));
 
-		tmp_src = CreatePicture(0, &pixmap->drawable, format, 0, NULL,
-					serverClient, &error);
-		if (!tmp_src) {
-			FreePicture(tmp_dst, 0);
-			screen->DestroyPixmap(tmp);
-			goto fixup;
-		}
+	width  = w / sx;
+	height = h / sy;
 
-		tmp_src->repeat = true;
-		tmp_src->repeatType = RepeatPad;
-		tmp_src->filter = PictFilterBilinear;
-		memset(&t, 0, sizeof(t));
-		t.matrix[0][0] = 2 << 16;
-		t.matrix[1][1] = 2 << 16;
-		t.matrix[2][2] = 1 << 16;
-		tmp_src->transform = &t;
-
-		ValidatePicture(tmp_dst);
-		ValidatePicture(tmp_src);
-
-		if (w > sna->render.max_3d_size) {
-			ww = (w+3)/4;
-			nj = 2;
-		} else {
-			ww = (w+1)/2;
-			nj = 1;
-		}
+	DBG(("%s: creating temporary GPU bo %dx%d\n",
+	     __FUNCTION__, width, height));
 
-		if (h > sna->render.max_3d_size) {
-			hh = (h+3)/4;
-			ni = 2;
-		} else {
-			hh = (h+1)/2;
-			ni = 1;
-		}
+	tmp = screen->CreatePixmap(screen,
+				   width, height,
+				   pixmap->drawable.depth,
+				   SNA_CREATE_SCRATCH);
+	if (!tmp)
+		return 0;
 
-		DBG(("%s %d:%d downsampling using %dx%d GPU tiles\n",
-		     __FUNCTION__, nj, ni, ww, hh));
-
-		for (i = 0; i < ni; i++) {
-			BoxRec b;
-
-			b.y1 = hh*i;
-			if (i == ni - 1)
-				b.y2 = (h+1)/2;
-			else
-				b.y2 = b.y1 + hh;
-
-			for (j = 0; j < nj; j++) {
-				struct sna_composite_op op;
-
-				b.x1 = ww*j;
-				if (j == nj - 1)
-					b.x2 = (w+1)/2;
-				else
-					b.x2 = b.x1 + ww;
-
-				DBG(("%s: tile %d:%d, box=(%d,%d), (%d, %d)\n",
-				     __FUNCTION__, i, j, b.x1, b.y1, b.x2, b.y2));
-
-				memset(&op, 0, sizeof(op));
-				if (!sna->render.composite(sna,
-							   PictOpSrc,
-							   tmp_src, NULL, tmp_dst,
-							   box.x1/2 + b.x1, box.y1/2 + b.y1,
-							   0, 0,
-							   b.x1, b.y1,
-							   b.x2 - b.x1, b.y2 - b.y1,
-							   &op)) {
-					tmp_src->transform = NULL;
-					FreePicture(tmp_src, 0);
-					FreePicture(tmp_dst, 0);
-					screen->DestroyPixmap(tmp);
-					goto fixup;
-				}
-
-				op.boxes(sna, &op, &b, 1);
-				op.done(sna, &op);
-			}
+	priv = sna_pixmap(tmp);
+	if (!priv)
+		goto cleanup_tmp;
+
+	format = PictureMatchFormat(screen,
+				    pixmap->drawable.depth,
+				    picture->format);
+
+	tmp_dst = CreatePicture(0, &tmp->drawable, format, 0, NULL,
+				serverClient, &error);
+	if (!tmp_dst)
+		goto cleanup_tmp;
+
+	tmp_src = CreatePicture(0, &pixmap->drawable, format, 0, NULL,
+				serverClient, &error);
+	if (!tmp_src)
+		goto cleanup_dst;
+
+	tmp_src->repeat = 1;
+	tmp_src->repeatType = RepeatPad;
+	/* Prefer to use nearest as it helps reduce artefacts from
+	 * interpolating and filtering twice.
+	 */
+	tmp_src->filter = PictFilterNearest;
+	memset(&t, 0, sizeof(t));
+	t.matrix[0][0] = (w << 16) / width;
+	t.matrix[0][2] = box.x1 << 16;
+	t.matrix[1][1] = (h << 16) / height;
+	t.matrix[1][2] = box.y1 << 16;
+	t.matrix[2][2] = 1 << 16;
+	tmp_src->transform = &t;
+
+	ValidatePicture(tmp_dst);
+	ValidatePicture(tmp_src);
+
+	w = sna->render.max_3d_size / sx - 2 * sx;
+	h = sna->render.max_3d_size / sy - 2 * sy;
+	DBG(("%s %d:%d downsampling using %dx%d GPU tiles\n",
+	     __FUNCTION__, (width + w-1)/w, (height + h-1)/h, w, h));
+
+	for (b.y1 = 0; b.y1 < height; b.y1 = b.y2) {
+		b.y2 = b.y1 + h;
+		if (b.y2 > height)
+			b.y2 = height;
+
+		for (b.x1 = 0; b.x1 < width; b.x1 = b.x2) {
+			struct sna_composite_op op;
+
+			b.x2 = b.x1 + w;
+			if (b.x2 > width)
+				b.x2 = width;
+
+			DBG(("%s: tile (%d, %d), (%d, %d)\n",
+			     __FUNCTION__, b.x1, b.y1, b.x2, b.y2));
+
+			memset(&op, 0, sizeof(op));
+			if (!sna->render.composite(sna,
+						   PictOpSrc,
+						   tmp_src, NULL, tmp_dst,
+						   b.x1, b.y1,
+						   0, 0,
+						   b.x1, b.y1,
+						   b.x2 - b.x1, b.y2 - b.y1,
+						   &op))
+				goto cleanup_src;
+
+			op.boxes(sna, &op, &b, 1);
+			op.done(sna, &op);
 		}
-
-		bo = kgem_bo_reference(priv->gpu_bo);
-
-		tmp_src->transform = NULL;
-		FreePicture(tmp_src, 0);
-		FreePicture(tmp_dst, 0);
-		screen->DestroyPixmap(tmp);
 	}
 
-	if (ox == x && oy == y) {
-		x = y = 0;
-	} else if (channel->transform) {
-		pixman_vector_t v;
-		pixman_transform_t m;
-
-		v.vector[0] = (ox - box.x1) << 16;
-		v.vector[1] = (oy - box.y1) << 16;
-		v.vector[2] = 1 << 16;
-		pixman_transform_invert(&m, channel->transform);
-		pixman_transform_point(&m, &v);
-		x = v.vector[0] / v.vector[2];
-		y = v.vector[1] / v.vector[2];
-	} else {
-		x = ox - box.x1;
-		y = oy - box.y1;
-	}
+	pixman_transform_invert(&channel->embedded_transform, &t);
+	if (channel->transform)
+		pixman_transform_multiply(&channel->embedded_transform,
+					  &channel->embedded_transform,
+					  channel->transform);
+	channel->transform = &channel->embedded_transform;
 
 	channel->offset[0] = x - dst_x;
 	channel->offset[1] = y - dst_y;
-	channel->scale[0] = 1.f/w;
-	channel->scale[1] = 1.f/h;
-	channel->width  = (w + 1) / 2;
-	channel->height = (h + 1) / 2;
-	channel->bo = bo;
-	return 1;
-
-fixup:
-	return sna_render_picture_fixup(sna, picture, channel,
-					x, y, w, h,
-					dst_x, dst_y);
+	channel->scale[0] = 1.f/width;
+	channel->scale[1] = 1.f/height;
+	channel->width  = width;
+	channel->height = height;
+	channel->bo = kgem_bo_reference(priv->gpu_bo);
+
+	ret = 1;
+cleanup_src:
+	tmp_src->transform = NULL;
+	FreePicture(tmp_src, 0);
+cleanup_dst:
+	FreePicture(tmp_dst, 0);
+cleanup_tmp:
+	screen->DestroyPixmap(tmp);
+	return ret;
 }
 
 int
@@ -965,7 +901,10 @@ sna_render_picture_extract(struct sna *sna,
 	}
 
 	src_bo = use_cpu_bo(sna, pixmap, &box);
-	if (src_bo == NULL) {
+	if (src_bo) {
+		if (!sna_pixmap_move_to_cpu(pixmap, MOVE_READ))
+			return 0;
+	} else {
 		if (texture_is_cpu(pixmap, &box) &&
 		    !move_to_gpu(pixmap, &box)) {
 			bo = kgem_upload_source_image(&sna->kgem,
@@ -981,7 +920,6 @@ sna_render_picture_extract(struct sna *sna,
 				src_bo = priv->gpu_bo;
 		}
 	}
-
 	if (src_bo) {
 		bo = kgem_create_2d(&sna->kgem, w, h,
 				    pixmap->drawable.bitsPerPixel,
diff --git a/src/sna/sna_render.h b/src/sna/sna_render.h
index 4346196..c4711f4 100644
--- a/src/sna/sna_render.h
+++ b/src/sna/sna_render.h
@@ -57,6 +57,8 @@ struct sna_composite_op {
 		int16_t offset[2];
 		float scale[2];
 
+		pixman_transform_t embedded_transform;
+
 		union {
 			struct {
 				uint32_t pixel;
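
The heart of the change above is folding the downsample scale and origin into a single pixman transform and composing it with any pre-existing picture transform, instead of fudging an integer offset. A standalone sketch using the real pixman entry points (the parameter names are illustrative; (w, h) is the sampled extent, (width, height) the downsampled size, (x1, y1) the sample origin):

    #include <string.h>
    #include <pixman.h>

    static void fold_downsample_transform(int w, int h, int width, int height,
                                          int x1, int y1,
                                          const pixman_transform_t *prev,
                                          pixman_transform_t *out)
    {
        pixman_transform_t t;

        memset(&t, 0, sizeof(t));
        t.matrix[0][0] = (w << 16) / width;  /* x scale, 16.16 fixed point */
        t.matrix[0][2] = x1 << 16;           /* x origin */
        t.matrix[1][1] = (h << 16) / height; /* y scale */
        t.matrix[1][2] = y1 << 16;           /* y origin */
        t.matrix[2][2] = 1 << 16;

        /* Invert (tile -> source) and fold in the picture's own transform,
         * as done for channel->embedded_transform above. */
        pixman_transform_invert(out, &t);
        if (prev)
            pixman_transform_multiply(out, out, prev);
    }
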
commit c31ccd9496f3f50d312d1003507f2a4f9a09dbb0
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Jan 26 12:41:34 2012 +0000

    sna: Tweak aperture thresholds for batch flushing
    
    In order to more easily accommodate operations on large source CPU bo.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 1bcda22..6eae248 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -634,7 +634,7 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, int gen)
 
 	kgem->aperture_total = aperture.aper_size;
 	kgem->aperture_high = aperture.aper_size * 3/4;
-	kgem->aperture_low = aperture.aper_size * 1/4;
+	kgem->aperture_low = aperture.aper_size * 1/3;
 	DBG(("%s: aperture low=%d [%d], high=%d [%d]\n", __FUNCTION__,
 	     kgem->aperture_low, kgem->aperture_low / (1024*1024),
 	     kgem->aperture_high, kgem->aperture_high / (1024*1024)));
@@ -2484,9 +2484,6 @@ bool kgem_check_bo(struct kgem *kgem, ...)
 	int num_exec = 0;
 	int size = 0;
 
-	if (kgem->aperture > kgem->aperture_low)
-		return false;
-
 	va_start(ap, kgem);
 	while ((bo = va_arg(ap, struct kgem_bo *))) {
 		if (bo->exec)
@@ -2497,6 +2494,12 @@ bool kgem_check_bo(struct kgem *kgem, ...)
 	}
 	va_end(ap);
 
+	if (!size)
+		return true;
+
+	if (kgem->aperture > kgem->aperture_low)
+		return false;
+
 	if (size + kgem->aperture > kgem->aperture_high)
 		return false;
 
@@ -2515,9 +2518,6 @@ bool kgem_check_bo_fenced(struct kgem *kgem, ...)
 	int size = 0;
 	int fenced_size = 0;
 
-	if (unlikely (kgem->aperture > kgem->aperture_low))
-		return false;
-
 	va_start(ap, kgem);
 	while ((bo = va_arg(ap, struct kgem_bo *))) {
 		if (bo->exec) {
@@ -2544,13 +2544,19 @@ bool kgem_check_bo_fenced(struct kgem *kgem, ...)
 	if (fenced_size + kgem->aperture_fenced > kgem->aperture_mappable)
 		return false;
 
-	if (size + kgem->aperture > kgem->aperture_high)
+	if (kgem->nfence + num_fence > kgem->fence_max)
 		return false;
 
-	if (kgem->nexec + num_exec >= KGEM_EXEC_SIZE(kgem))
+	if (!size)
+		return true;
+
+	if (kgem->aperture > kgem->aperture_low)
 		return false;
 
-	if (kgem->nfence + num_fence >= kgem->fence_max)
+	if (size + kgem->aperture > kgem->aperture_high)
+		return false;
+
+	if (kgem->nexec + num_exec >= KGEM_EXEC_SIZE(kgem))
 		return false;
 
 	return true;
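
The reordering above makes the admission test trivially true for a no-op addition and keeps the aperture watermarks last. A distilled sketch (the field names mirror kgem but are hypothetical here):

    #include <stdbool.h>
    #include <stddef.h>

    struct aper {
        unsigned nfence, fence_max;
        size_t aperture, aperture_low, aperture_high;
    };

    static bool can_add(const struct aper *k, unsigned num_fence, size_t size)
    {
        if (k->nfence + num_fence > k->fence_max)
            return false;
        if (size == 0)
            return true;  /* every bo is already in the batch */
        if (k->aperture > k->aperture_low)
            return false; /* aperture filling up: flush before adding more */
        return size + k->aperture <= k->aperture_high;
    }
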
commit b8a72e2e727c791993c7c8737151c5087743f360
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Jan 25 23:21:36 2012 +0000

    sna: Use the cpu bo where possible as the source for texture extraction
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_render.c b/src/sna/sna_render.c
index d1cb60b..28b93a2 100644
--- a/src/sna/sna_render.c
+++ b/src/sna/sna_render.c
@@ -295,7 +295,7 @@ use_cpu_bo(struct sna *sna, PixmapPtr pixmap, const BoxRec *box)
 	if (DBG_NO_CPU_BO)
 		return NULL;
 
-	priv = sna_pixmap_attach(pixmap);
+	priv = sna_pixmap(pixmap);
 	if (priv == NULL || priv->cpu_bo == NULL) {
 		DBG(("%s: no cpu bo\n", __FUNCTION__));
 		return NULL;
@@ -332,7 +332,7 @@ use_cpu_bo(struct sna *sna, PixmapPtr pixmap, const BoxRec *box)
 		int w = box->x2 - box->x1;
 		int h = box->y2 - box->y1;
 
-		if (pixmap->usage_hint)
+		if (!priv->gpu)
 			goto done;
 
 		if (priv->source_count*w*h >= pixmap->drawable.width * pixmap->drawable.height &&
@@ -349,7 +349,7 @@ use_cpu_bo(struct sna *sna, PixmapPtr pixmap, const BoxRec *box)
 done:
 	DBG(("%s for box=(%d, %d), (%d, %d)\n",
 	     __FUNCTION__, box->x1, box->y1, box->x2, box->y2));
-	return kgem_bo_reference(priv->cpu_bo);
+	return priv->cpu_bo;
 }
 
 static Bool
@@ -583,23 +583,25 @@ sna_render_pixmap_bo(struct sna *sna,
 	     pixmap->drawable.width, pixmap->drawable.height));
 
 	bo = use_cpu_bo(sna, pixmap, &box);
-	if (bo == NULL &&
-	    texture_is_cpu(pixmap, &box) &&
-	    !move_to_gpu(pixmap, &box)) {
-		DBG(("%s: uploading CPU box (%d, %d), (%d, %d)\n",
-		     __FUNCTION__, box.x1, box.y1, box.x2, box.y2));
-		bo = upload(sna, channel, pixmap, &box);
-	}
-
-	if (bo == NULL) {
-		priv = sna_pixmap_move_to_gpu(pixmap, MOVE_READ);
-		if (priv) {
-			bo = kgem_bo_reference(priv->gpu_bo);
-		} else {
-			DBG(("%s: failed to upload pixmap to gpu, uploading CPU box (%d, %d), (%d, %d) instead\n",
+	if (bo) {
+		bo = kgem_bo_reference(bo);
+	} else {
+		if (texture_is_cpu(pixmap, &box) && !move_to_gpu(pixmap, &box)) {
+			DBG(("%s: uploading CPU box (%d, %d), (%d, %d)\n",
 			     __FUNCTION__, box.x1, box.y1, box.x2, box.y2));
 			bo = upload(sna, channel, pixmap, &box);
 		}
+
+		if (bo == NULL) {
+			priv = sna_pixmap_move_to_gpu(pixmap, MOVE_READ);
+			if (priv) {
+				bo = kgem_bo_reference(priv->gpu_bo);
+			} else {
+				DBG(("%s: failed to upload pixmap to gpu, uploading CPU box (%d, %d), (%d, %d) instead\n",
+				     __FUNCTION__, box.x1, box.y1, box.x2, box.y2));
+				bo = upload(sna, channel, pixmap, &box);
+			}
+		}
 	}
 
 	channel->bo = bo;
@@ -870,7 +872,7 @@ sna_render_picture_extract(struct sna *sna,
 			   int16_t w, int16_t h,
 			   int16_t dst_x, int16_t dst_y)
 {
-	struct kgem_bo *bo = NULL;
+	struct kgem_bo *bo = NULL, *src_bo;
 	PixmapPtr pixmap = get_drawable_pixmap(picture->pDrawable);
 	int16_t ox, oy, ow, oh;
 	BoxRec box;
@@ -962,49 +964,48 @@ sna_render_picture_extract(struct sna *sna,
 						     dst_x, dst_y);
 	}
 
-	if (texture_is_cpu(pixmap, &box) && !move_to_gpu(pixmap, &box)) {
-		bo = kgem_upload_source_image(&sna->kgem,
-					      pixmap->devPrivate.ptr,
-					      &box,
-					      pixmap->devKind,
-					      pixmap->drawable.bitsPerPixel);
-		if (bo == NULL) {
-			DBG(("%s: failed to upload source image, using clear\n",
-			     __FUNCTION__));
-			return 0;
-		}
-	} else {
-		if (!sna_pixmap_move_to_gpu(pixmap, MOVE_READ)) {
-			DBG(("%s: falback -- pixmap is not on the GPU\n",
-			     __FUNCTION__));
-			return sna_render_picture_fixup(sna, picture, channel,
-							x, y, ow, oh, dst_x, dst_y);
+	src_bo = use_cpu_bo(sna, pixmap, &box);
+	if (src_bo == NULL) {
+		if (texture_is_cpu(pixmap, &box) &&
+		    !move_to_gpu(pixmap, &box)) {
+			bo = kgem_upload_source_image(&sna->kgem,
+						      pixmap->devPrivate.ptr,
+						      &box,
+						      pixmap->devKind,
+						      pixmap->drawable.bitsPerPixel);
+		} else {
+			struct sna_pixmap *priv;
+
+			priv = sna_pixmap_move_to_gpu(pixmap, MOVE_READ);
+			if (priv)
+				src_bo = priv->gpu_bo;
 		}
+	}
 
+	if (src_bo) {
 		bo = kgem_create_2d(&sna->kgem, w, h,
 				    pixmap->drawable.bitsPerPixel,
 				    kgem_choose_tiling(&sna->kgem,
 						       I915_TILING_X, w, h,
 						       pixmap->drawable.bitsPerPixel),
 				    0);
-		if (!bo) {
-			DBG(("%s: failed to create bo, using clear\n",
-			     __FUNCTION__));
-			return 0;
-		}
-
-		if (!sna_blt_copy_boxes(sna, GXcopy,
-					sna_pixmap_get_bo(pixmap), 0, 0,
+		if (bo && !sna_blt_copy_boxes(sna, GXcopy,
+					src_bo, 0, 0,
 					bo, -box.x1, -box.y1,
 					pixmap->drawable.bitsPerPixel,
 					&box, 1)) {
-			DBG(("%s: fallback -- unable to copy boxes\n",
-			     __FUNCTION__));
-			return sna_render_picture_fixup(sna, picture, channel,
-							x, y, ow, oh, dst_x, dst_y);
+			kgem_bo_destroy(&sna->kgem, bo);
+			bo = NULL;
 		}
 	}
 
+	if (bo == NULL) {
+		DBG(("%s: fallback -- pixmap is not on the GPU\n",
+		     __FUNCTION__));
+		return sna_render_picture_fixup(sna, picture, channel,
+						x, y, ow, oh, dst_x, dst_y);
+	}
+
 	if (ox == x && oy == y) {
 		x = y = 0;
 	} else if (channel->transform) {
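
The fallback chain above reduces to: prefer an already-allocated CPU bo, else upload a CPU-resident texture directly, else migrate the pixmap to the GPU. A hedged outline; the helpers below are trivial stand-ins for use_cpu_bo(), kgem_upload_source_image() and sna_pixmap_move_to_gpu():

    #include <stddef.h>

    struct bo;
    struct pix { struct bo *cpu_bo, *gpu_bo; };

    static struct bo *use_cpu_bo(struct pix *p)   { return p->cpu_bo; }
    static struct bo *upload_image(struct pix *p) { (void)p; return NULL; }
    static int        move_to_gpu(struct pix *p)  { return p->gpu_bo != NULL; }

    static struct bo *pick_extract_source(struct pix *p, int texture_on_cpu)
    {
        struct bo *src = use_cpu_bo(p);
        if (src)
            return src;                /* cheapest: reuse the CPU bo */
        if (texture_on_cpu)
            return upload_image(p);    /* one-shot linear upload */
        return move_to_gpu(p) ? p->gpu_bo : NULL; /* else migrate and copy */
    }
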
commit de931486a481fc23d0624b57ec0042da3e900c5f
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Jan 25 23:04:50 2012 +0000

    sna: Experiment with creating large objects as CPU bo
    
    Even on non-LLC systems, if we can prevent the migration of such
    objects, we can still benefit immensely from being able to map them into
    the GTT as required.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 0955a5d..1bcda22 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -632,6 +632,7 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, int gen)
 	aperture.aper_size = 64*1024*1024;
 	(void)drmIoctl(fd, DRM_IOCTL_I915_GEM_GET_APERTURE, &aperture);
 
+	kgem->aperture_total = aperture.aper_size;
 	kgem->aperture_high = aperture.aper_size * 3/4;
 	kgem->aperture_low = aperture.aper_size * 1/4;
 	DBG(("%s: aperture low=%d [%d], high=%d [%d]\n", __FUNCTION__,
@@ -657,12 +658,17 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, int gen)
 		 * disable dual-stream mode */
 		kgem->min_alignment = 64;
 
-	kgem->max_object_size = kgem->aperture_mappable / 2;
-	if (kgem->max_object_size > kgem->aperture_low)
-		kgem->max_object_size = kgem->aperture_low;
-	if (kgem->max_object_size > MAX_OBJECT_SIZE)
-		kgem->max_object_size = MAX_OBJECT_SIZE;
-	DBG(("%s: max object size %d\n", __FUNCTION__, kgem->max_object_size));
+	kgem->max_gpu_size = kgem->aperture_mappable / 2;
+	if (kgem->max_gpu_size > kgem->aperture_low)
+		kgem->max_gpu_size = kgem->aperture_low;
+	if (kgem->max_gpu_size > MAX_OBJECT_SIZE)
+		kgem->max_gpu_size = MAX_OBJECT_SIZE;
+
+	kgem->max_cpu_size = kgem->aperture_total / 2;
+	if (kgem->max_cpu_size > MAX_OBJECT_SIZE)
+		kgem->max_cpu_size = MAX_OBJECT_SIZE;
+	DBG(("%s: max object size (tiled=%d, linear=%d)\n",
+	     __FUNCTION__, kgem->max_gpu_size, kgem->max_cpu_size));
 
 	kgem->fence_max = gem_param(kgem, I915_PARAM_NUM_FENCES_AVAIL) - 2;
 	if ((int)kgem->fence_max < 0)
@@ -979,6 +985,9 @@ static void __kgem_bo_destroy(struct kgem *kgem, struct kgem_bo *bo)
 		goto destroy;
 	}
 
+	if (!kgem->has_llc && IS_CPU_MAP(bo->map) && bo->domain != DOMAIN_CPU)
+		kgem_bo_release_map(kgem, bo);
+
 	assert(list_is_empty(&bo->vma));
 	assert(list_is_empty(&bo->list));
 	assert(bo->vmap == false && bo->sync == false);
@@ -1010,6 +1019,10 @@ static void __kgem_bo_destroy(struct kgem *kgem, struct kgem_bo *bo)
 	if (!IS_CPU_MAP(bo->map)) {
 		if (!kgem_bo_set_purgeable(kgem, bo))
 			goto destroy;
+
+		if (!kgem->has_llc && bo->domain == DOMAIN_CPU)
+			goto destroy;
+
 		DBG(("%s: handle=%d, purged\n",
 		     __FUNCTION__, bo->handle));
 	}
@@ -1121,8 +1134,11 @@ bool kgem_retire(struct kgem *kgem)
 		if (kgem_bo_set_purgeable(kgem, rq->bo)) {
 			kgem_bo_move_to_inactive(kgem, rq->bo);
 			retired = true;
-		} else
+		} else {
+			DBG(("%s: closing %d\n",
+			     __FUNCTION__, rq->bo->handle));
 			kgem_bo_free(kgem, rq->bo);
+		}
 
 		_list_del(&rq->list);
 		free(rq);
@@ -1679,9 +1695,13 @@ void kgem_purge_cache(struct kgem *kgem)
 	int i;
 
 	for (i = 0; i < ARRAY_SIZE(kgem->inactive); i++) {
-		list_for_each_entry_safe(bo, next, &kgem->inactive[i], list)
-			if (!kgem_bo_is_retained(kgem, bo))
+		list_for_each_entry_safe(bo, next, &kgem->inactive[i], list) {
+			if (!kgem_bo_is_retained(kgem, bo)) {
+				DBG(("%s: purging %d\n",
+				     __FUNCTION__, bo->handle));
 				kgem_bo_free(kgem, bo);
+			}
+		}
 	}
 
 	kgem->need_purge = false;
@@ -1748,6 +1768,8 @@ bool kgem_expire_cache(struct kgem *kgem)
 				count++;
 				size += bo->size;
 				kgem_bo_free(kgem, bo);
+				DBG(("%s: expiring %d\n",
+				     __FUNCTION__, bo->handle));
 			}
 		}
 		if (!list_is_empty(&preserve)) {
@@ -2033,7 +2055,7 @@ int kgem_choose_tiling(struct kgem *kgem, int tiling, int width, int height, int
 	if (tiling &&
 	    kgem_surface_size(kgem, false, false,
 			      width, height, bpp, tiling,
-			      &pitch) > kgem->max_object_size) {
+			      &pitch) > kgem->max_gpu_size) {
 		DBG(("%s: too large (%dx%d) to be fenced, discarding tiling\n",
 		     __FUNCTION__, width, height));
 		tiling = I915_TILING_NONE;
@@ -2096,43 +2118,46 @@ done:
 	return tiling;
 }
 
-static bool _kgem_can_create_2d(struct kgem *kgem,
-				int width, int height, int bpp, int tiling)
+bool kgem_can_create_cpu(struct kgem *kgem,
+			 int width, int height, int depth)
 {
 	uint32_t pitch, size;
 
-	if (bpp < 8)
+	if (depth < 8 || kgem->wedged)
 		return false;
 
-	if (tiling >= 0 && kgem->wedged)
-		return false;
+	size = kgem_surface_size(kgem, false, false,
+				 width, height, BitsPerPixel(depth),
+				 I915_TILING_NONE, &pitch);
+	return size > 0 && size < kgem->max_cpu_size;
+}
 
-	if (tiling < 0)
-		tiling = -tiling;
+static bool _kgem_can_create_gpu(struct kgem *kgem,
+				 int width, int height, int bpp)
+{
+	uint32_t pitch, size;
+
+	if (bpp < 8 || kgem->wedged)
+		return false;
 
 	size = kgem_surface_size(kgem, false, false,
-				 width, height, bpp, tiling, &pitch);
-	if (size == 0 || size >= kgem->max_object_size)
-		size = kgem_surface_size(kgem, false, false,
-					 width, height, bpp,
-					 I915_TILING_NONE, &pitch);
-	return size > 0 && size < kgem->max_object_size;
+				 width, height, bpp, I915_TILING_NONE,
+				 &pitch);
+	return size > 0 && size < kgem->max_gpu_size;
 }
 
 #if DEBUG_KGEM
-bool kgem_can_create_2d(struct kgem *kgem,
-			int width, int height, int bpp, int tiling)
+bool kgem_can_create_gpu(struct kgem *kgem, int width, int height, int bpp)
 {
-	bool ret = _kgem_can_create_2d(kgem, width, height, bpp, tiling);
-	DBG(("%s(%dx%d, bpp=%d, tiling=%d) = %d\n", __FUNCTION__,
-	     width, height, bpp, tiling, ret));
+	bool ret = _kgem_can_create_gpu(kgem, width, height, bpp);
+	DBG(("%s(%dx%d, bpp=%d) = %d\n", __FUNCTION__,
+	     width, height, bpp, ret));
 	return ret;
 }
 #else
-bool kgem_can_create_2d(struct kgem *kgem,
-			int width, int height, int bpp, int tiling)
+bool kgem_can_create_gpu(struct kgem *kgem, int width, int height, int bpp)
 {
-	return _kgem_can_create_2d(kgem, width, height, bpp, tiling);
+	return _kgem_can_create_gpu(kgem, width, height, bpp);
 }
 #endif
 
@@ -2177,12 +2202,12 @@ struct kgem_bo *kgem_create_2d(struct kgem *kgem,
 	     !!(flags & CREATE_GTT_MAP),
 	     !!(flags & CREATE_SCANOUT)));
 
-	assert(_kgem_can_create_2d(kgem, width, height, bpp, flags & CREATE_EXACT ? -tiling : tiling));
 	size = kgem_surface_size(kgem,
 				 kgem->has_relaxed_fencing,
 				 flags & CREATE_SCANOUT,
 				 width, height, bpp, tiling, &pitch);
-	assert(size && size <= kgem->max_object_size);
+	assert(size && size < kgem->max_cpu_size);
+	assert(tiling == I915_TILING_NONE || size < kgem->max_gpu_size);
 
 	if (flags & (CREATE_CPU_MAP | CREATE_GTT_MAP)) {
 		int for_cpu = !!(flags & CREATE_CPU_MAP);
@@ -2342,6 +2367,9 @@ skip_active_search:
 			continue;
 		}
 
+		if ((flags & CREATE_CPU_MAP) == 0 && IS_CPU_MAP(bo->map))
+			continue;
+
 		if (bo->tiling != tiling ||
 		    (tiling != I915_TILING_NONE && bo->pitch != pitch)) {
 			if (tiling != gem_set_tiling(kgem->fd,
@@ -2643,8 +2671,11 @@ static void kgem_trim_vma_cache(struct kgem *kgem, int type, int bucket)
 		list_del(&bo->vma);
 		kgem->vma[type].count--;
 
-		if (!bo->purged && !kgem_bo_set_purgeable(kgem, bo))
+		if (!bo->purged && !kgem_bo_set_purgeable(kgem, bo)) {
+			DBG(("%s: freeing unpurgeable old mapping\n",
+			     __FUNCTION__));
 			kgem_bo_free(kgem, bo);
+		}
 	}
 }
 
diff --git a/src/sna/kgem.h b/src/sna/kgem.h
index 652c2d7..fd3aa9d 100644
--- a/src/sna/kgem.h
+++ b/src/sna/kgem.h
@@ -151,10 +151,10 @@ struct kgem {
 
 	uint16_t fence_max;
 	uint16_t half_cpu_cache_pages;
-	uint32_t aperture_high, aperture_low, aperture;
-	uint32_t aperture_fenced, aperture_mappable;
+	uint32_t aperture_total, aperture_high, aperture_low, aperture_mappable;
+	uint32_t aperture, aperture_fenced;
 	uint32_t min_alignment;
-	uint32_t max_object_size;
+	uint32_t max_gpu_size, max_cpu_size;
 	uint32_t partial_buffer_size;
 
 	void (*context_switch)(struct kgem *kgem, int new_mode);
@@ -200,8 +200,8 @@ struct kgem_bo *kgem_upload_source_image_halved(struct kgem *kgem,
 
 int kgem_choose_tiling(struct kgem *kgem,
 		       int tiling, int width, int height, int bpp);
-bool kgem_can_create_2d(struct kgem *kgem,
-			int width, int height, int bpp, int tiling);
+bool kgem_can_create_gpu(struct kgem *kgem, int width, int height, int bpp);
+bool kgem_can_create_cpu(struct kgem *kgem, int width, int height, int depth);
 
 struct kgem_bo *
 kgem_replace_bo(struct kgem *kgem,
diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index b28134c..759e0fe 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -245,7 +245,7 @@ sna_pixmap_alloc_cpu(struct sna *sna,
 
 	assert(priv->stride);
 
-	if (sna->kgem.has_cpu_bo) {
+	if (sna->kgem.has_cpu_bo || !priv->gpu) {
 		DBG(("%s: allocating CPU buffer (%dx%d)\n", __FUNCTION__,
 		     pixmap->drawable.width, pixmap->drawable.height));
 
@@ -515,11 +515,10 @@ struct sna_pixmap *_sna_pixmap_attach(PixmapPtr pixmap)
 		break;
 
 	default:
-		if (!kgem_can_create_2d(&sna->kgem,
-					pixmap->drawable.width,
-					pixmap->drawable.height,
-					pixmap->drawable.bitsPerPixel,
-					I915_TILING_NONE))
+		if (!kgem_can_create_gpu(&sna->kgem,
+					 pixmap->drawable.width,
+					 pixmap->drawable.height,
+					 pixmap->drawable.bitsPerPixel))
 			return NULL;
 		break;
 	}
@@ -586,6 +585,11 @@ sna_pixmap_create_scratch(ScreenPtr screen,
 		return create_pixmap(sna, screen, width, height, depth,
 				     CREATE_PIXMAP_USAGE_SCRATCH);
 
+	bpp = BitsPerPixel(depth);
+	if (!kgem_can_create_gpu(&sna->kgem, width, height, bpp))
+		return create_pixmap(sna, screen, width, height, depth,
+				     CREATE_PIXMAP_USAGE_SCRATCH);
+
 	if (tiling == I915_TILING_Y && !sna->have_render)
 		tiling = I915_TILING_X;
 
@@ -594,11 +598,7 @@ sna_pixmap_create_scratch(ScreenPtr screen,
 	     height > sna->render.max_3d_size))
 		tiling = I915_TILING_X;
 
-	bpp = BitsPerPixel(depth);
 	tiling = kgem_choose_tiling(&sna->kgem, tiling, width, height, bpp);
-	if (!kgem_can_create_2d(&sna->kgem, width, height, bpp, tiling))
-		return create_pixmap(sna, screen, width, height, depth,
-				     CREATE_PIXMAP_USAGE_SCRATCH);
 
 	/* you promise never to access this via the cpu... */
 	if (sna->freed_pixmap) {
@@ -669,7 +669,10 @@ static PixmapPtr sna_create_pixmap(ScreenPtr screen,
 	DBG(("%s(%d, %d, %d, usage=%x)\n", __FUNCTION__,
 	     width, height, depth, usage));
 
-	if (depth < 8 || wedged(sna) || !sna->have_render)
+	if (!kgem_can_create_cpu(&sna->kgem, width, height, depth))
+		return create_pixmap(sna, screen, width, height, depth, usage);
+
+	if (!sna->have_render)
 		return create_pixmap(sna, screen,
 				     width, height, depth,
 				     usage);
@@ -696,13 +699,11 @@ static PixmapPtr sna_create_pixmap(ScreenPtr screen,
 						 width, height, depth,
 						 I915_TILING_Y);
 
-	if (usage == CREATE_PIXMAP_USAGE_GLYPH_PICTURE ||
-	    !kgem_can_create_2d(&sna->kgem, width, height,
-				BitsPerPixel(depth), I915_TILING_NONE))
+	if (usage == CREATE_PIXMAP_USAGE_GLYPH_PICTURE)
 		return create_pixmap(sna, screen, width, height, depth, usage);
 
 	pad = PixmapBytePad(width, depth);
-	if (pad*height <= 4096) {
+	if (pad * height <= 4096) {
 		pixmap = create_pixmap(sna, screen,
 				       width, height, depth, usage);
 		if (pixmap == NullPixmap)
@@ -729,7 +730,9 @@ static PixmapPtr sna_create_pixmap(ScreenPtr screen,
 		}
 
 		priv->stride = pad;
-		priv->gpu = true;
+		priv->gpu = kgem_can_create_gpu(&sna->kgem,
+						width, height,
+						pixmap->drawable.bitsPerPixel);
 	}
 
 	return pixmap;
@@ -1821,6 +1824,7 @@ _sna_drawable_use_cpu_bo(DrawablePtr drawable,
 {
 	PixmapPtr pixmap = get_drawable_pixmap(drawable);
 	struct sna_pixmap *priv = sna_pixmap(pixmap);
+	struct sna *sna = to_sna_from_pixmap(pixmap);
 	BoxRec extents;
 	int16_t dx, dy;
 
@@ -1829,6 +1833,9 @@ _sna_drawable_use_cpu_bo(DrawablePtr drawable,
 	if (priv == NULL || priv->cpu_bo == NULL)
 		return FALSE;
 
+	if (!sna->kgem.has_llc && priv->cpu_bo->domain == DOMAIN_CPU)
+		return FALSE;
+
 	if (DAMAGE_IS_ALL(priv->cpu_damage)) {
 		*damage = NULL;
 		return TRUE;
@@ -1876,9 +1883,7 @@ sna_pixmap_create_upload(ScreenPtr screen,
 	assert(width);
 	assert(height);
 	if (!sna->have_render ||
-	    !kgem_can_create_2d(&sna->kgem,
-				width, height, bpp,
-				I915_TILING_NONE))
+	    !kgem_can_create_gpu(&sna->kgem, width, height, bpp))
 		return create_pixmap(sna, screen, width, height, depth,
 				     CREATE_PIXMAP_USAGE_SCRATCH);
 
@@ -2024,7 +2029,7 @@ sna_pixmap_move_to_gpu(PixmapPtr pixmap, unsigned flags)
 	sna_damage_reduce(&priv->cpu_damage);
 	DBG(("%s: CPU damage? %d\n", __FUNCTION__, priv->cpu_damage != NULL));
 	if (priv->gpu_bo == NULL) {
-		if (!wedged(sna))
+		if (!wedged(sna) && priv->gpu)
 			priv->gpu_bo =
 				kgem_create_2d(&sna->kgem,
 					       pixmap->drawable.width,
@@ -3195,24 +3200,19 @@ sna_copy_boxes(DrawablePtr src, DrawablePtr dst, GCPtr gc,
 	}
 
 	/* Try to maintain the data on the GPU */
-	if (dst_priv->gpu_bo == NULL &&
+	if (dst_priv->gpu_bo == NULL && dst_priv->gpu &&
 	    ((dst_priv->cpu_damage == NULL && copy_use_gpu_bo(sna, dst_priv, &region)) ||
 	     (src_priv && (src_priv->gpu_bo != NULL || (src_priv->cpu_bo && kgem_bo_is_busy(src_priv->cpu_bo)))))) {
 		uint32_t tiling = sna_pixmap_choose_tiling(dst_pixmap);
 
 		DBG(("%s: create dst GPU bo for upload\n", __FUNCTION__));
 
-		if (kgem_can_create_2d(&sna->kgem,
+		dst_priv->gpu_bo =
+			kgem_create_2d(&sna->kgem,
 				       dst_pixmap->drawable.width,
 				       dst_pixmap->drawable.height,
 				       dst_pixmap->drawable.bitsPerPixel,
-				       tiling))
-			dst_priv->gpu_bo =
-				kgem_create_2d(&sna->kgem,
-					       dst_pixmap->drawable.width,
-					       dst_pixmap->drawable.height,
-					       dst_pixmap->drawable.bitsPerPixel,
-					       tiling, 0);
+				       tiling, 0);
 	}
 
 	if (dst_priv->gpu_bo) {
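
The size limits introduced above split along mapping requirements: tiled (GPU) surfaces must fit comfortably in the mappable aperture, while linear surfaces served through a CPU bo are bounded only by the total aperture. A sketch of the computation, mirroring the kgem_init hunk (MAX_OBJECT_SIZE is an illustrative cap, not the driver's value):

    #include <stddef.h>

    #define MAX_OBJECT_SIZE (128u * 1024 * 1024) /* illustrative cap */

    static void compute_limits(size_t aperture_total,
                               size_t aperture_mappable,
                               size_t aperture_low,
                               size_t *max_gpu, size_t *max_cpu)
    {
        *max_gpu = aperture_mappable / 2;
        if (*max_gpu > aperture_low)
            *max_gpu = aperture_low;
        if (*max_gpu > MAX_OBJECT_SIZE)
            *max_gpu = MAX_OBJECT_SIZE;

        *max_cpu = aperture_total / 2;
        if (*max_cpu > MAX_OBJECT_SIZE)
            *max_cpu = MAX_OBJECT_SIZE;
    }
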
commit 4a5617ddd35af4ece5ad79a45f3041924d5a4fc3
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Jan 25 20:13:27 2012 +0000

    sna: Apply the same migration flags for the dst alphamap as for the dst pixmap
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_composite.c b/src/sna/sna_composite.c
index 6fb2d27..0faad84 100644
--- a/src/sna/sna_composite.c
+++ b/src/sna/sna_composite.c
@@ -519,8 +519,7 @@ fallback:
 	if (!sna_drawable_move_region_to_cpu(dst->pDrawable, &region, flags))
 		goto out;
 	if (dst->alphaMap &&
-	    !sna_drawable_move_to_cpu(dst->alphaMap->pDrawable,
-				      MOVE_WRITE | MOVE_READ))
+	    !sna_drawable_move_to_cpu(dst->alphaMap->pDrawable, flags))
 		goto out;
 	if (src->pDrawable) {
 		if (!sna_drawable_move_to_cpu(src->pDrawable,
commit 1ce99854b27edb2d8c5665dba1f0de434c562f0c
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Jan 25 20:12:55 2012 +0000

    sna: Correct offset for moving drawable regions to the CPU
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna.h b/src/sna/sna.h
index d8f96e0..bc14967 100644
--- a/src/sna/sna.h
+++ b/src/sna/sna.h
@@ -465,7 +465,8 @@ sna_drawable_move_to_cpu(DrawablePtr drawable, unsigned flags)
 	RegionRec region;
 
 	pixman_region_init_rect(&region,
-				0, 0, drawable->width, drawable->height);
+				drawable->x, drawable->y,
+				drawable->width, drawable->height);
 	return sna_drawable_move_region_to_cpu(drawable, &region, flags);
 }
 
commit 06f5396f805e52ee088b4b06478eeb980f446b51
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Jan 25 20:12:27 2012 +0000

    sna/gen2+: Do not force use of GPU if the target is simply cleared
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/gen2_render.c b/src/sna/gen2_render.c
index d75a412..eb8d4ef 100644
--- a/src/sna/gen2_render.c
+++ b/src/sna/gen2_render.c
@@ -1558,7 +1558,7 @@ gen2_composite_fallback(struct sna *sna,
 
 	/* If anything is on the GPU, push everything out to the GPU */
 	priv = sna_pixmap(dst_pixmap);
-	if (priv && priv->gpu_damage) {
+	if (priv && priv->gpu_damage && !priv->clear) {
 		DBG(("%s: dst is already on the GPU, try to use GPU\n",
 		     __FUNCTION__));
 		return FALSE;
diff --git a/src/sna/gen3_render.c b/src/sna/gen3_render.c
index 618f694..95a79b2 100644
--- a/src/sna/gen3_render.c
+++ b/src/sna/gen3_render.c
@@ -2467,7 +2467,7 @@ gen3_composite_fallback(struct sna *sna,
 
 	/* If anything is on the GPU, push everything out to the GPU */
 	priv = sna_pixmap(dst_pixmap);
-	if (priv && priv->gpu_damage) {
+	if (priv && priv->gpu_damage && !priv->clear) {
 		DBG(("%s: dst is already on the GPU, try to use GPU\n",
 		     __FUNCTION__));
 		return FALSE;
diff --git a/src/sna/gen4_render.c b/src/sna/gen4_render.c
index c1ceb33..c798ce5 100644
--- a/src/sna/gen4_render.c
+++ b/src/sna/gen4_render.c
@@ -2056,7 +2056,7 @@ gen4_composite_fallback(struct sna *sna,
 
 	/* If anything is on the GPU, push everything out to the GPU */
 	priv = sna_pixmap(dst_pixmap);
-	if (priv && priv->gpu_damage) {
+	if (priv && priv->gpu_damage && !priv->clear) {
 		DBG(("%s: dst is already on the GPU, try to use GPU\n",
 		     __FUNCTION__));
 		return FALSE;
diff --git a/src/sna/gen5_render.c b/src/sna/gen5_render.c
index 6308d10..47c4e96 100644
--- a/src/sna/gen5_render.c
+++ b/src/sna/gen5_render.c
@@ -2094,7 +2094,7 @@ gen5_composite_fallback(struct sna *sna,
 
 	/* If anything is on the GPU, push everything out to the GPU */
 	priv = sna_pixmap(dst_pixmap);
-	if (priv && priv->gpu_damage) {
+	if (priv && priv->gpu_damage && !priv->clear) {
 		DBG(("%s: dst is already on the GPU, try to use GPU\n",
 		     __FUNCTION__));
 		return FALSE;
diff --git a/src/sna/gen6_render.c b/src/sna/gen6_render.c
index 5cbdd74..c3bc2e7 100644
--- a/src/sna/gen6_render.c
+++ b/src/sna/gen6_render.c
@@ -2307,7 +2307,9 @@ gen6_composite_fallback(struct sna *sna,
 
 	/* If anything is on the GPU, push everything out to the GPU */
 	priv = sna_pixmap(dst_pixmap);
-	if (priv && (priv->gpu_damage || (priv->cpu_bo && kgem_bo_is_busy(priv->cpu_bo)))) {
+	if (priv &&
+	    ((priv->gpu_damage && !priv->clear) ||
+	     (priv->cpu_bo && kgem_bo_is_busy(priv->cpu_bo)))) {
 		DBG(("%s: dst is already on the GPU, try to use GPU\n",
 		     __FUNCTION__));
 		return FALSE;
diff --git a/src/sna/gen7_render.c b/src/sna/gen7_render.c
index ee546e1..21d8c99 100644
--- a/src/sna/gen7_render.c
+++ b/src/sna/gen7_render.c
@@ -2374,7 +2374,9 @@ gen7_composite_fallback(struct sna *sna,
 
 	/* If anything is on the GPU, push everything out to the GPU */
 	priv = sna_pixmap(dst_pixmap);
-	if (priv && (priv->gpu_damage || (priv->cpu_bo && kgem_bo_is_busy(priv->cpu_bo)))) {
+	if (priv &&
+	    ((priv->gpu_damage && !priv->clear) ||
+	     (priv->cpu_bo && kgem_bo_is_busy(priv->cpu_bo)))) {
 		DBG(("%s: dst is already on the GPU, try to use GPU\n",
 		     __FUNCTION__));
 		return FALSE;
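
The common thread across the gen2 through gen7 hunks is a single predicate change: GPU damage only argues against a CPU fallback when it is real rendering, not a pending clear. Distilled, with hypothetical fields mirroring sna_pixmap:

    struct pix { void *gpu_damage; int clear; };

    /* True when the destination genuinely lives on the GPU and compositing
     * there avoids a readback; a merely cleared pixmap no longer counts. */
    static int dst_prefers_gpu(const struct pix *p)
    {
        return p && p->gpu_damage && !p->clear;
    }
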
commit 1652ea3c94bb86e3a6d94a0037559e5ae1830086
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Jan 25 20:11:21 2012 +0000

    sna: Map freshly created, unbound bo through the CPU
    
    Take advantage of the fact that we will have to clflush the unbound bo
    before the GPU uses it anyway, and populate it in place.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 1e78c1a..0955a5d 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -865,8 +865,10 @@ static void kgem_bo_release_map(struct kgem *kgem, struct kgem_bo *bo)
 	munmap(CPU_MAP(bo->map), bo->size);
 	bo->map = NULL;
 
-	list_del(&bo->vma);
-	kgem->vma[type].count--;
+	if (!list_is_empty(&bo->vma)) {
+		list_del(&bo->vma);
+		kgem->vma[type].count--;
+	}
 }
 
 static void kgem_bo_free(struct kgem *kgem, struct kgem_bo *bo)
@@ -2393,6 +2395,7 @@ skip_active_search:
 		return NULL;
 	}
 
+	bo->domain = DOMAIN_CPU;
 	bo->unique_id = kgem_get_unique_id(kgem);
 	bo->pitch = pitch;
 	if (tiling != I915_TILING_NONE)
@@ -2598,6 +2601,8 @@ static void kgem_trim_vma_cache(struct kgem *kgem, int type, int bucket)
 {
 	int i, j;
 
+	DBG(("%s: type=%d, count=%d (bucket: %d)\n",
+	     __FUNCTION__, type, kgem->vma[type].count, bucket));
 	if (kgem->vma[type].count <= 0)
 	       return;
 
@@ -2647,14 +2652,17 @@ void *kgem_bo_map(struct kgem *kgem, struct kgem_bo *bo)
 {
 	void *ptr;
 
-	DBG(("%s: handle=%d, tiling=%d, map=%p, domain=%d\n", __FUNCTION__,
-	     bo->handle, bo->tiling, bo->map, bo->domain));
+	DBG(("%s: handle=%d, offset=%d, tiling=%d, map=%p, domain=%d\n", __FUNCTION__,
+	     bo->handle, bo->presumed_offset, bo->tiling, bo->map, bo->domain));
 
 	assert(!bo->purged);
 	assert(bo->exec == NULL);
 	assert(list_is_empty(&bo->list));
 
-	if (kgem->has_llc && bo->tiling == I915_TILING_NONE) {
+	if (bo->tiling == I915_TILING_NONE &&
+	    (kgem->has_llc || bo->domain == bo->presumed_offset)) {
+		DBG(("%s: converting request for GTT map into CPU map\n",
+		     __FUNCTION__));
 		ptr = kgem_bo_map__cpu(kgem, bo);
 		kgem_bo_sync__cpu(kgem, bo);
 		return ptr;
@@ -3110,6 +3118,8 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 		if (old == NULL)
 			old = search_linear_cache(kgem, alloc, CREATE_INACTIVE);
 		if (old) {
+			DBG(("%s: reusing ordinary handle %d for io\n",
+			     __FUNCTION__, old->handle));
 			alloc = old->size;
 			bo = partial_bo_alloc(alloc);
 			if (bo == NULL)
@@ -3125,20 +3135,55 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 			list_init(&bo->base.list);
 			free(old);
 			bo->base.refcnt = 1;
+
+			bo->need_io = flags & KGEM_BUFFER_WRITE;
+			bo->base.io = true;
 		} else {
-			bo = partial_bo_alloc(alloc);
+			bo = malloc(sizeof(*bo));
 			if (bo == NULL)
 				return NULL;
 
-			if (!__kgem_bo_init(&bo->base,
-					    gem_create(kgem->fd, alloc),
-					    alloc)) {
-				free(bo);
+			old = search_linear_cache(kgem, alloc,
+						  CREATE_INACTIVE | CREATE_CPU_MAP);
+			if (old) {
+				DBG(("%s: reusing cpu map handle=%d for buffer\n",
+				     __FUNCTION__, old->handle));
+
+				memcpy(&bo->base, old, sizeof(*old));
+				if (old->rq)
+					list_replace(&old->request, &bo->base.request);
+				else
+					list_init(&bo->base.request);
+				list_replace(&old->vma, &bo->base.vma);
+				list_init(&bo->base.list);
+				free(old);
+				bo->base.refcnt = 1;
+			} else {
+				if (!__kgem_bo_init(&bo->base,
+						    gem_create(kgem->fd, alloc),
+						    alloc)) {
+					free(bo);
+					return NULL;
+				}
+				DBG(("%s: created handle=%d for buffer\n",
+				     __FUNCTION__, bo->base.handle));
+
+				bo->base.domain = DOMAIN_CPU;
+			}
+
+			bo->mem = kgem_bo_map__cpu(kgem, &bo->base);
+			if (bo->mem == NULL) {
+				kgem_bo_free(kgem, &bo->base);
 				return NULL;
 			}
+
+			if (flags & KGEM_BUFFER_WRITE)
+				kgem_bo_sync__cpu(kgem, &bo->base);
+
+			bo->need_io = false;
+			bo->base.io = true;
+			bo->mmapped = true;
 		}
-		bo->need_io = flags & KGEM_BUFFER_WRITE;
-		bo->base.io = true;
 	}
 	bo->base.reusable = false;
 	assert(bo->base.size == alloc);
@@ -3343,8 +3388,8 @@ void kgem_buffer_read_sync(struct kgem *kgem, struct kgem_bo *_bo)
 		gem_read(kgem->fd,
 			 bo->base.handle, (char *)bo->mem+offset,
 			 offset, length);
+		kgem_bo_map__cpu(kgem, &bo->base);
 		bo->base.domain = DOMAIN_NONE;
-		bo->base.reusable = false; /* in the CPU cache now */
 	}
 	kgem_bo_retire(kgem, &bo->base);
 }
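
The map-selection rule in the kgem_bo_map hunk: an untiled bo that either lives on an LLC system or is still unbound in the CPU domain can be served with a cheap CPU map in place of a GTT map. The diff writes the unbound test as bo->domain == bo->presumed_offset; the sketch below uses an explicit CPU-domain check as an assumption about the intent:

    #include <stdbool.h>

    enum domain { DOMAIN_NONE, DOMAIN_CPU, DOMAIN_GTT };
    struct bo { int tiling /* 0 == I915_TILING_NONE */; enum domain domain; };

    static bool prefer_cpu_map(const struct bo *bo, bool has_llc)
    {
        /* Untiled and either coherent (LLC) or not yet bound: a CPU map
         * avoids consuming mappable GTT space. */
        return bo->tiling == 0 && (has_llc || bo->domain == DOMAIN_CPU);
    }
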
commit 571533e2175da0640135a6a038fa72970be539cf
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Jan 25 16:45:43 2012 +0000

    sna: GetImage is allowed to read a window's border
    
    We need to adjust the clip to include the border pixels when migrating
    damage from the backing pixmap. This also requires relaxing the
    constraint that a read must be within the drawable.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index deb39bf..b28134c 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -1143,7 +1143,9 @@ sna_drawable_move_region_to_cpu(DrawablePtr drawable,
 	     RegionExtents(region)->x2, RegionExtents(region)->y2,
 	     flags));
 
-	assert_drawable_contains_box(drawable, &region->extents);
+	if (flags & MOVE_WRITE) {
+		assert_drawable_contains_box(drawable, &region->extents);
+	}
 
 	priv = sna_pixmap(pixmap);
 	if (priv == NULL) {
@@ -11157,15 +11159,19 @@ sna_get_image(DrawablePtr drawable,
 	region.extents.y2 = region.extents.y1 + h;
 	region.data = NULL;
 
-	if (region.extents.x1 < drawable->x)
-		region.extents.x1 = drawable->x;
-	if (region.extents.x2 > drawable->x + drawable->width)
-		region.extents.x2 = drawable->x + drawable->width;
+	if (drawable->type == DRAWABLE_PIXMAP) {
+		if (region.extents.x1 < drawable->x)
+			region.extents.x1 = drawable->x;
+		if (region.extents.x2 > drawable->x + drawable->width)
+			region.extents.x2 = drawable->x + drawable->width;
 
-	if (region.extents.y1 < drawable->y)
-		region.extents.y1 = drawable->y;
-	if (region.extents.y2 > drawable->y + drawable->height)
-		region.extents.y2 = drawable->y + drawable->height;
+		if (region.extents.y1 < drawable->y)
+			region.extents.y1 = drawable->y;
+		if (region.extents.y2 > drawable->y + drawable->height)
+			region.extents.y2 = drawable->y + drawable->height;
+	} else
+		RegionIntersect(&region, &region,
+				&((WindowPtr)drawable)->borderClip);
 
 	if (!sna_drawable_move_region_to_cpu(drawable, &region, MOVE_READ))
 		return;
commit 2b6c49ecda6e8432719e136ea2261a031bd4b691
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Jan 25 12:49:24 2012 +0000

    sna: Round up buffer allocations when downsampling
    
    The pathological cases are nx1 and 1xm, where halving rounds down to an
    illegal allocation request of 0 bytes.
    
    One such example is
      wolframalpha.com: x = (200 + x) / 100
    which generates an approximately 8500x1 image and so needs downscaling
    to fit in the render pipeline on all but IvyBridge. Bring on Ivy!
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index d85e930..1e78c1a 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -3251,20 +3251,22 @@ struct kgem_bo *kgem_upload_source_image_halved(struct kgem *kgem,
 	struct kgem_bo *bo;
 	pixman_image_t *src_image, *dst_image;
 	pixman_transform_t t;
+	int w, h;
 	void *dst;
 
 	DBG(("%s : (%d, %d), (%d, %d), stride=%d, bpp=%d\n",
 	     __FUNCTION__, x, y, width, height, stride, bpp));
 
-	bo = kgem_create_buffer_2d(kgem,
-				   width/2, height/2, bpp,
+	w = (width + 1) / 2;
+	h = (height + 1) / 2;
+
+	bo = kgem_create_buffer_2d(kgem, w, h, bpp,
 				   KGEM_BUFFER_WRITE_INPLACE,
 				   &dst);
 	if (bo == NULL)
 		return NULL;
 
-	dst_image = pixman_image_create_bits(format, width/2, height/2,
-					     dst, bo->pitch);
+	dst_image = pixman_image_create_bits(format, w, h, dst, bo->pitch);
 	if (dst_image == NULL)
 		goto cleanup_bo;
 
@@ -3286,7 +3288,7 @@ struct kgem_bo *kgem_upload_source_image_halved(struct kgem *kgem,
 			       x, y,
 			       0, 0,
 			       0, 0,
-			       width/2, height/2);
+			       w, h);
 
 	pixman_image_unref(src_image);
 	pixman_image_unref(dst_image);
diff --git a/src/sna/sna_render.c b/src/sna/sna_render.c
index af81735..d1cb60b 100644
--- a/src/sna/sna_render.c
+++ b/src/sna/sna_render.c
@@ -717,7 +717,8 @@ static int sna_render_picture_downsample(struct sna *sna,
 			goto fixup;
 
 		tmp = screen->CreatePixmap(screen,
-					   w/2, h/2, pixmap->drawable.depth,
+					   (w+1)/2, (h+1)/2,
+					   pixmap->drawable.depth,
 					   SNA_CREATE_SCRATCH);
 		if (!tmp)
 			goto fixup;
@@ -760,18 +761,18 @@ static int sna_render_picture_downsample(struct sna *sna,
 		ValidatePicture(tmp_src);
 
 		if (w > sna->render.max_3d_size) {
-			ww = w/4;
+			ww = (w+3)/4;
 			nj = 2;
 		} else {
-			ww = w/2;
+			ww = (w+1)/2;
 			nj = 1;
 		}
 
 		if (h > sna->render.max_3d_size) {
-			hh = h/4;
+			hh = (h+3)/4;
 			ni = 2;
 		} else {
-			hh = h/2;
+			hh = (h+1)/2;
 			ni = 1;
 		}
 
@@ -783,7 +784,7 @@ static int sna_render_picture_downsample(struct sna *sna,
 
 			b.y1 = hh*i;
 			if (i == ni - 1)
-				b.y2 = h/2;
+				b.y2 = (h+1)/2;
 			else
 				b.y2 = b.y1 + hh;
 
@@ -792,7 +793,7 @@ static int sna_render_picture_downsample(struct sna *sna,
 
 				b.x1 = ww*j;
 				if (j == nj - 1)
-					b.x2 = w/2;
+					b.x2 = (w+1)/2;
 				else
 					b.x2 = b.x1 + ww;
 
@@ -850,8 +851,8 @@ static int sna_render_picture_downsample(struct sna *sna,
 	channel->offset[1] = y - dst_y;
 	channel->scale[0] = 1.f/w;
 	channel->scale[1] = 1.f/h;
-	channel->width  = w / 2;
-	channel->height = h / 2;
+	channel->width  = (w + 1) / 2;
+	channel->height = (h + 1) / 2;
 	channel->bo = bo;
 	return 1;
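
The fix is the standard round-up-on-halving idiom, which guarantees a non-zero result for any non-zero input:

    /* (x + 1) / 2 == ceil(x / 2) for non-negative x:
     * 1 -> 1 (not 0), 8500 -> 4250. */
    static inline int half_round_up(int x)
    {
        return (x + 1) / 2;
    }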
 
commit ddc49477b4a46660031a230010a48d9bd6f6b9e3
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Jan 25 11:22:38 2012 +0000

    sna: Silence compiler warning for a potential uninitialised return on error
    
    sna_accel.c: In function 'sna_copy_plane':
    sna_accel.c:5022:21: warning: 'ret' may be used uninitialized in this
    function [-Wuninitialized]
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

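The warning arises because a goto can reach the function exit before the
assignment at the fallback label. A reduced sketch of the pattern and of the
fix, with hypothetical names:

    #include <stddef.h>

    static void *lookup(int skip)
    {
            void *ret = NULL;  /* the fix: initialise at the declaration */

            if (skip)
                    goto out;  /* this path used to reach 'out' with
                                * ret unset */

            ret = &ret;        /* the normal path computes a real value */
    out:
            return ret;
    }

    int main(void)
    {
            return lookup(1) == NULL ? 0 : 1;
    }
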
diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index ee1f373..deb39bf 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -5019,7 +5019,7 @@ sna_copy_plane(DrawablePtr src, DrawablePtr dst, GCPtr gc,
 {
 	PixmapPtr pixmap = get_drawable_pixmap(dst);
 	struct sna *sna = to_sna_from_pixmap(pixmap);
-	RegionRec region, *ret;
+	RegionRec region, *ret = NULL;
 	struct sna_damage **damage;
 
 	DBG(("%s: src=(%d, %d), dst=(%d, %d), size=%dx%d\n", __FUNCTION__,
@@ -5105,7 +5105,6 @@ sna_copy_plane(DrawablePtr src, DrawablePtr dst, GCPtr gc,
 
 fallback:
 	DBG(("%s: fallback\n", __FUNCTION__));
-	ret = NULL;
 	if (!sna_gc_move_to_cpu(gc, dst))
 		goto out;
 	if (!sna_drawable_move_region_to_cpu(dst, &region,
commit 050a9a5a233c4d7c6f7a3b0604ca7f419126b070
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Jan 25 09:36:20 2012 +0000

    sna: Run the miHandleExposures for no-op CopyPlane
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 03dceaf..ee1f373 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -5026,13 +5026,10 @@ sna_copy_plane(DrawablePtr src, DrawablePtr dst, GCPtr gc,
 	     src_x, src_y, dst_x, dst_y, w, h));
 
 	if (gc->planemask == 0)
-		return NULL;
+		goto empty;
 
 	if (src->bitsPerPixel == 1 && (bit&1) == 0)
-		return miHandleExposures(src, dst, gc,
-					 src_x, src_y,
-					 w, h,
-					 dst_x, dst_y, bit);
+		goto empty;
 
 	region.extents.x1 = dst_x + dst->x;
 	region.extents.y1 = dst_y + dst->y;
@@ -5067,7 +5064,7 @@ sna_copy_plane(DrawablePtr src, DrawablePtr dst, GCPtr gc,
 	     region.extents.x1, region.extents.y1,
 	     region.extents.x2, region.extents.y2));
 	if (!RegionNotEmpty(&region))
-		return NULL;
+		goto empty;
 
 	RegionTranslate(&region,
 			src_x - dst_x - dst->x + src->x,
@@ -5121,6 +5118,11 @@ fallback:
 out:
 	RegionUninit(&region);
 	return ret;
+empty:
+	return miHandleExposures(src, dst, gc,
+				 src_x, src_y,
+				 w, h,
+				 dst_x, dst_y, bit);
 }
 
 static Bool
commit 74e27d75d36ead6e095f48249fbfd20f84bc40bc
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Jan 25 02:42:25 2012 +0000

    sna: Handle self-copies for CopyPlane
    
    Prepare the source first: this has the dual benefit of letting us
    decide how best to proceed with the op (on the CPU or GPU) and of
    preventing modification of the damage after we have chosen our
    preferred path.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

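The round trip in the diff below, translating the region into source space,
migrating it, then translating it back, is the heart of the change. A
self-contained stand-in using a plain box (illustrative names, not the
server's RegionRec API):

    #include <assert.h>

    struct box { int x1, y1, x2, y2; };

    static void box_translate(struct box *b, int dx, int dy)
    {
            b->x1 += dx; b->y1 += dy;
            b->x2 += dx; b->y2 += dy;
    }

    int main(void)
    {
            struct box region = { 10, 10, 20, 20 }; /* destination space */
            int dx = -3, dy = 5;                    /* dst -> src shift */

            box_translate(&region, dx, dy);
            /* ... migrate the source region to the CPU here (MOVE_READ) ... */
            box_translate(&region, -dx, -dy);

            /* later clipping still sees destination coordinates */
            assert(region.x1 == 10 && region.y2 == 20);
            return 0;
    }
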
diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index f52766b..03dceaf 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -4827,8 +4827,6 @@ sna_copy_plane_blt(DrawablePtr source, DrawablePtr drawable, GCPtr gc,
 	if (n == 0)
 		return;
 
-	if (!sna_pixmap_move_to_cpu(src_pixmap, MOVE_READ))
-		return;
 	get_drawable_deltas(source, src_pixmap, &dx, &dy);
 	sx += dx;
 	sy += dy;
@@ -5027,6 +5025,9 @@ sna_copy_plane(DrawablePtr src, DrawablePtr dst, GCPtr gc,
 	DBG(("%s: src=(%d, %d), dst=(%d, %d), size=%dx%d\n", __FUNCTION__,
 	     src_x, src_y, dst_x, dst_y, w, h));
 
+	if (gc->planemask == 0)
+		return NULL;
+
 	if (src->bitsPerPixel == 1 && (bit&1) == 0)
 		return miHandleExposures(src, dst, gc,
 					 src_x, src_y,
@@ -5068,6 +5069,17 @@ sna_copy_plane(DrawablePtr src, DrawablePtr dst, GCPtr gc,
 	if (!RegionNotEmpty(&region))
 		return NULL;
 
+	RegionTranslate(&region,
+			src_x - dst_x - dst->x + src->x,
+			src_y - dst_y - dst->y + src->y);
+
+	if (!sna_drawable_move_region_to_cpu(src, &region, MOVE_READ))
+		goto out;
+
+	RegionTranslate(&region,
+			-(src_x - dst_x - dst->x + src->x),
+			-(src_y - dst_y - dst->y + src->y));
+
 	if (FORCE_FALLBACK)
 		goto fallback;
 
@@ -5077,6 +5089,9 @@ sna_copy_plane(DrawablePtr src, DrawablePtr dst, GCPtr gc,
 	if (wedged(sna))
 		goto fallback;
 
+	if (!PM_IS_SOLID(dst, gc->planemask))
+		goto fallback;
+
 	if (sna_drawable_use_gpu_bo(dst, &region.extents, &damage)) {
 		struct sna_pixmap *priv = sna_pixmap(pixmap);
 		if (priv->gpu_bo->tiling != I915_TILING_Y ||
@@ -5096,17 +5111,10 @@ fallback:
 	ret = NULL;
 	if (!sna_gc_move_to_cpu(gc, dst))
 		goto out;
-
 	if (!sna_drawable_move_region_to_cpu(dst, &region,
 					     MOVE_READ | MOVE_WRITE))
 		goto out;
 
-	RegionTranslate(&region,
-			src_x - dst_x - dst->x + src->x,
-			src_y - dst_y - dst->y + src->y);
-	if (!sna_drawable_move_region_to_cpu(src, &region, MOVE_READ))
-		goto out;
-
 	DBG(("%s: fbCopyPlane(%d, %d, %d, %d, %d,%d) %x\n",
 	     __FUNCTION__, src_x, src_y, w, h, dst_x, dst_y, (unsigned)bit));
 	ret = fbCopyPlane(src, dst, gc, src_x, src_y, w, h, dst_x, dst_y, bit);
commit 0bafc2f4c4ff7ef0745b8a6d093e1cb7d9a37595
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Jan 25 01:42:56 2012 +0000

    sna: Only shrink partial buffers that are being written to
    
    Ignore inactive and mmapped buffers.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

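In outline, the shrink attempt moves inside the write-back test, so only
buffers that actually need an upload are considered. A reduced sketch with
stand-in fields and helpers (not the kgem structures):

    #include <stdbool.h>
    #include <stdio.h>

    struct buf { bool need_io; int used, size, refcnt; };

    static bool try_shrink(struct buf *b)
    {
            printf("shrink %d down to %d\n", b->size, b->used);
            return true;    /* pretend a smaller bo was found */
    }

    static void finish_partial(struct buf *b)
    {
            if (!(b->used && b->need_io))
                    return; /* inactive or mmapped: leave alone */

            if (b->refcnt == 1 && b->used < b->size / 2 && try_shrink(b))
                    return; /* the shrunk copy replaces the upload */

            printf("upload %d bytes\n", b->used);
    }

    int main(void)
    {
            struct buf mmapped = { false, 4096, 8192, 1 };
            struct buf partial = { true, 1024, 8192, 1 };

            finish_partial(&mmapped); /* prints nothing */
            finish_partial(&partial); /* shrink attempt instead of upload */
            return 0;
    }
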
diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index b979d3c..d85e930 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -1255,54 +1255,55 @@ static void kgem_finish_partials(struct kgem *kgem)
 		}
 
 		assert(bo->base.rq == kgem->next_request);
-		if (bo->base.refcnt == 1 && bo->used < bo->base.size / 2) {
-			struct kgem_bo *shrink;
-
-			shrink = search_linear_cache(kgem,
-						     PAGE_ALIGN(bo->used),
-						     CREATE_INACTIVE);
-			if (shrink) {
-				int n;
-
-				DBG(("%s: used=%d, shrinking %d to %d, handle %d to %d\n",
-				     __FUNCTION__,
-				     bo->used, bo->base.size, shrink->size,
-				     bo->base.handle, shrink->handle));
-
-				assert(bo->used <= shrink->size);
-				gem_write(kgem->fd, shrink->handle,
-					  0, bo->used, bo->mem);
-
-				for (n = 0; n < kgem->nreloc; n++) {
-					if (kgem->reloc[n].target_handle == bo->base.handle) {
-						kgem->reloc[n].target_handle = shrink->handle;
-						kgem->reloc[n].presumed_offset = shrink->presumed_offset;
-						kgem->batch[kgem->reloc[n].offset/sizeof(kgem->batch[0])] =
-							kgem->reloc[n].delta + shrink->presumed_offset;
+		if (bo->used && bo->need_io) {
+			if (bo->base.refcnt == 1 &&
+			    bo->used < bo->base.size / 2) {
+				struct kgem_bo *shrink;
+
+				shrink = search_linear_cache(kgem,
+							     PAGE_ALIGN(bo->used),
+							     CREATE_INACTIVE);
+				if (shrink) {
+					int n;
+
+					DBG(("%s: used=%d, shrinking %d to %d, handle %d to %d\n",
+					     __FUNCTION__,
+					     bo->used, bo->base.size, shrink->size,
+					     bo->base.handle, shrink->handle));
+
+					assert(bo->used <= shrink->size);
+					gem_write(kgem->fd, shrink->handle,
+						  0, bo->used, bo->mem);
+
+					for (n = 0; n < kgem->nreloc; n++) {
+						if (kgem->reloc[n].target_handle == bo->base.handle) {
+							kgem->reloc[n].target_handle = shrink->handle;
+							kgem->reloc[n].presumed_offset = shrink->presumed_offset;
+							kgem->batch[kgem->reloc[n].offset/sizeof(kgem->batch[0])] =
+								kgem->reloc[n].delta + shrink->presumed_offset;
+						}
 					}
-				}
-
-				bo->base.exec->handle = shrink->handle;
-				bo->base.exec->offset = shrink->presumed_offset;
-				shrink->exec = bo->base.exec;
-				shrink->rq = bo->base.rq;
-				list_replace(&bo->base.request,
-					     &shrink->request);
-				list_init(&bo->base.request);
-				shrink->needs_flush = bo->base.dirty;
-
-				bo->base.exec = NULL;
-				bo->base.rq = NULL;
-				bo->base.dirty = false;
-				bo->base.needs_flush = false;
-				bo->used = 0;
 
-				bubble_sort_partial(kgem, bo);
-				continue;
+					bo->base.exec->handle = shrink->handle;
+					bo->base.exec->offset = shrink->presumed_offset;
+					shrink->exec = bo->base.exec;
+					shrink->rq = bo->base.rq;
+					list_replace(&bo->base.request,
+						     &shrink->request);
+					list_init(&bo->base.request);
+					shrink->needs_flush = bo->base.dirty;
+
+					bo->base.exec = NULL;
+					bo->base.rq = NULL;
+					bo->base.dirty = false;
+					bo->base.needs_flush = false;
+					bo->used = 0;
+
+					bubble_sort_partial(kgem, bo);
+					continue;
+				}
 			}
-		}
 
-		if (bo->used && bo->need_io) {
 			DBG(("%s: handle=%d, uploading %d/%d\n",
 			     __FUNCTION__, bo->base.handle, bo->used, bo->base.size));
 			assert(!kgem_busy(kgem, bo->base.handle));
commit fcaf9f5ce4df6f4fe7285dec02dbf9e8decac956
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Jan 25 01:36:27 2012 +0000

    sna: Apply source clipping to sna_copy_plane()
    
    Ensure that the migration region is within bounds for both the source
    and destination pixmaps.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

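The source clip box constructed in the diff below looks baroque, but the
algebra collapses: src->x - (src->x + src_x) + (dst->x + dst_x) is just
(dst->x + dst_x) - src_x, the destination drawing origin pulled back by the
source offset. A worked check with arbitrary values:

    #include <assert.h>

    int main(void)
    {
            int src_wx = 7, src_x = 3;  /* src window origin, copy offset */
            int dst_wx = 11, dst_x = 5; /* dst window origin, copy offset */

            int x1 = src_wx - (src_wx + src_x) + (dst_wx + dst_x);
            assert(x1 == (dst_wx + dst_x) - src_x); /* both are 13 */
            return 0;
    }
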
diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 2d83868..f52766b 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -5038,7 +5038,33 @@ sna_copy_plane(DrawablePtr src, DrawablePtr dst, GCPtr gc,
 	region.extents.x2 = region.extents.x1 + w;
 	region.extents.y2 = region.extents.y1 + h;
 	region.data = NULL;
-	region_maybe_clip(&region, gc->pCompositeClip);
+	RegionIntersect(&region, &region, gc->pCompositeClip);
+
+	DBG(("%s: dst extents (%d, %d), (%d, %d)\n",
+	     __FUNCTION__,
+	     region.extents.x1, region.extents.y1,
+	     region.extents.x2, region.extents.y2));
+
+	{
+		RegionRec clip;
+
+		clip.extents.x1 = src->x - (src->x + src_x) + (dst->x + dst_x);
+		clip.extents.y1 = src->y - (src->y + src_y) + (dst->y + dst_y);
+		clip.extents.x2 = clip.extents.x1 + src->width;
+		clip.extents.y2 = clip.extents.y1 + src->height;
+		clip.data = NULL;
+
+		DBG(("%s: src extents (%d, %d), (%d, %d)\n",
+		     __FUNCTION__,
+		     clip.extents.x1, clip.extents.y1,
+		     clip.extents.x2, clip.extents.y2));
+
+		RegionIntersect(&region, &region, &clip);
+	}
+	DBG(("%s: dst^src extents (%d, %d), (%d, %d)\n",
+	     __FUNCTION__,
+	     region.extents.x1, region.extents.y1,
+	     region.extents.x2, region.extents.y2));
 	if (!RegionNotEmpty(&region))
 		return NULL;
 
commit 30669f46b6e8775a486a4b5f6930c4179215ce13
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Jan 25 01:31:34 2012 +0000

    sna: Set the source clip for CopyArea fallback correctly
    
    The source window is (src->x, src->y)x(src->width, src->height) in
    pixmap space. However, we then need to clip the destination region
    against it, and so must translate it from source coordinates into
    destination coordinates.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

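Expanding the old and new expressions shows the error: the old form
subtracted an extra src->x (and src->y), which is harmless only when the
source drawable sits at the pixmap origin. For the x coordinate:

    old:  -(src_x - dst_x - dst->x + src->x)
              = dst->x + dst_x - src_x - src->x
    new:  src->x - (src->x + src_x) + (dst->x + dst_x)
              = dst->x + dst_x - src_x
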
diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index f86bb62..2d83868 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -3629,8 +3629,8 @@ sna_copy_area(DrawablePtr src, DrawablePtr dst, GCPtr gc,
 		{
 			RegionRec clip;
 
-			clip.extents.x1 = -(src_x - dst_x - dst->x + src->x);
-			clip.extents.y1 = -(src_y - dst_y - dst->y + src->y);
+			clip.extents.x1 = src->x - (src->x + src_x) + (dst->x + dst_x);
+			clip.extents.y1 = src->y - (src->y + src_y) + (dst->y + dst_y);
 			clip.extents.x2 = clip.extents.x1 + src->width;
 			clip.extents.y2 = clip.extents.y1 + src->height;
 			clip.data = NULL;
commit 9c8cd1b13b69bfd9734a158a4627cef3398e30b4
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Jan 25 01:18:11 2012 +0000

    sna: Print source and destination regions for CopyArea fallback for DBG
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 613c1ed..f86bb62 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -3597,8 +3597,13 @@ sna_copy_area(DrawablePtr src, DrawablePtr dst, GCPtr gc,
 {
 	struct sna *sna = to_sna_from_drawable(dst);
 
-	DBG(("%s: src=(%d, %d)x(%d, %d) -> dst=(%d, %d)\n",
-	     __FUNCTION__, src_x, src_y, width, height, dst_x, dst_y));
+	if (gc->planemask == 0)
+		return NULL;
+
+	DBG(("%s: src=(%d, %d)x(%d, %d)+(%d, %d) -> dst=(%d, %d)+(%d, %d)\n",
+	     __FUNCTION__,
+	     src_x, src_y, width, height, src->x, src->y,
+	     dst_x, dst_y, dst->x, dst->y));
 
 	if (FORCE_FALLBACK || !ACCEL_COPY_AREA || wedged(sna) ||
 	    !PM_IS_SOLID(dst, gc->planemask)) {
@@ -3616,6 +3621,11 @@ sna_copy_area(DrawablePtr src, DrawablePtr dst, GCPtr gc,
 		region.data = NULL;
 		RegionIntersect(&region, &region, gc->pCompositeClip);
 
+		DBG(("%s: dst extents (%d, %d), (%d, %d)\n",
+		     __FUNCTION__,
+		     region.extents.x1, region.extents.y1,
+		     region.extents.x2, region.extents.y2));
+
 		{
 			RegionRec clip;
 
@@ -3625,8 +3635,18 @@ sna_copy_area(DrawablePtr src, DrawablePtr dst, GCPtr gc,
 			clip.extents.y2 = clip.extents.y1 + src->height;
 			clip.data = NULL;
 
+			DBG(("%s: src extents (%d, %d), (%d, %d)\n",
+			     __FUNCTION__,
+			     clip.extents.x1, clip.extents.y1,
+			     clip.extents.x2, clip.extents.y2));
+
 			RegionIntersect(&region, &region, &clip);
 		}
+		DBG(("%s: dst^src extents (%d, %d), (%d, %d)\n",
+		     __FUNCTION__,
+		     region.extents.x1, region.extents.y1,
+		     region.extents.x2, region.extents.y2));
+
 		if (!RegionNotEmpty(&region))
 			return NULL;
 
commit 8a026c378ab061f871a5736170d2a2f735bb8171
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Jan 25 01:17:49 2012 +0000

    sna: Clip GetImage to drawable so that damage migration is within bounds
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

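Clamping the request to the drawable is plain interval intersection, done
once per axis. A minimal standalone sketch (hypothetical helper, not the sna
API):

    #include <assert.h>

    /* Clamp [*x1, *x2) to the drawable's [pos, pos + len). */
    static void clamp_axis(int *x1, int *x2, int pos, int len)
    {
            if (*x1 < pos)
                    *x1 = pos;
            if (*x2 > pos + len)
                    *x2 = pos + len;
    }

    int main(void)
    {
            int x1 = -4, x2 = 130;
            clamp_axis(&x1, &x2, 0, 100); /* drawable at x=0, width 100 */
            assert(x1 == 0 && x2 == 100);
            return 0;
    }
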
diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 31a9079..613c1ed 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -1143,7 +1143,7 @@ sna_drawable_move_region_to_cpu(DrawablePtr drawable,
 	     RegionExtents(region)->x2, RegionExtents(region)->y2,
 	     flags));
 
-	assert_pixmap_contains_box(pixmap, &region->extents);
+	assert_drawable_contains_box(drawable, &region->extents);
 
 	priv = sna_pixmap(pixmap);
 	if (priv == NULL) {
@@ -11102,6 +11102,16 @@ sna_get_image(DrawablePtr drawable,
 	region.extents.y2 = region.extents.y1 + h;
 	region.data = NULL;
 
+	if (region.extents.x1 < drawable->x)
+		region.extents.x1 = drawable->x;
+	if (region.extents.x2 > drawable->x + drawable->width)
+		region.extents.x2 = drawable->x + drawable->width;
+
+	if (region.extents.y1 < drawable->y)
+		region.extents.y1 = drawable->y;
+	if (region.extents.y2 > drawable->y + drawable->height)
+		region.extents.y2 = drawable->y + drawable->height;
+
 	if (!sna_drawable_move_region_to_cpu(drawable, &region, MOVE_READ))
 		return;
 
commit a06e806b2ebf29b3002ce2096b98550538e05852
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Jan 25 01:16:34 2012 +0000

    sna: Clear GPU damage first when moving a clear pixmap to the CPU
    
    This allows us to discard any busy GPU or CPU bo when we know we are
    going to clear the shadow pixmap afterwards.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 1a13465..31a9079 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -922,6 +922,13 @@ skip_inplace_map:
 		priv->mapped = false;
 	}
 
+	if (priv->clear) {
+		if (priv->cpu_bo && kgem_bo_is_busy(priv->cpu_bo))
+			sna_pixmap_free_cpu(sna, priv);
+		sna_damage_destroy(&priv->gpu_damage);
+		priv->undamaged = true;
+	}
+
 	if (pixmap->devPrivate.ptr == NULL &&
 	    !sna_pixmap_alloc_cpu(sna, pixmap, priv, priv->gpu_damage != NULL))
 		return false;
@@ -943,8 +950,6 @@ skip_inplace_map:
 			    pixmap->drawable.height,
 			    priv->clear_color);
 
-		sna_damage_destroy(&priv->gpu_damage);
-		priv->undamaged = true;
 		priv->clear = false;
 	}
 
commit f04604d80e3f0cdf03acd46cc92723a2c7b655f4
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Jan 24 22:11:20 2012 +0000

    sna: Reduce number of reads required to inspect timers
    
    By using the readiness information that select() provides at wakeup,
    we avoid read()ing every timer fd just to learn which ones fired.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

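The wakeup handler folds select()'s result into a per-timer bitmask and the
timer checks consult the mask before reading. A reduced sketch of that
bookkeeping (the field names follow the diff below, the rest is scaffolding):

    #include <stdint.h>
    #include <sys/select.h>

    #define NUM_TIMERS 3

    static uint16_t collect_ready(int timer[NUM_TIMERS],
                                  uint16_t active, uint16_t ready,
                                  fd_set *set)
    {
            int id;

            /* only poll timers that are armed and not already known ready */
            for (id = 0; id < NUM_TIMERS; id++)
                    if ((active & ~ready) & (1 << id) &&
                        FD_ISSET(timer[id], set))
                            ready |= 1 << id;
            return ready;
    }

    int main(void)
    {
            int timer[NUM_TIMERS] = { 3, 4, 5 }; /* pretend timer fds */
            fd_set set;

            FD_ZERO(&set);
            FD_SET(4, &set); /* select reported: timer 1 fired */

            return collect_ready(timer, 0x7, 0, &set) == 1 << 1 ? 0 : 1;
    }
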
diff --git a/src/sna/sna.h b/src/sna/sna.h
index cabf04c..d8f96e0 100644
--- a/src/sna/sna.h
+++ b/src/sna/sna.h
@@ -231,7 +231,8 @@ struct sna {
 #define SNA_NO_DELAYED_FLUSH	0x2
 
 	int timer[NUM_TIMERS];
-	int timer_active;
+	uint16_t timer_active;
+	uint16_t timer_ready;
 
 	int vblank_interval;
 
@@ -581,7 +582,7 @@ static inline uint32_t pixmap_size(PixmapPtr pixmap)
 Bool sna_accel_pre_init(struct sna *sna);
 Bool sna_accel_init(ScreenPtr screen, struct sna *sna);
 void sna_accel_block_handler(struct sna *sna);
-void sna_accel_wakeup_handler(struct sna *sna);
+void sna_accel_wakeup_handler(struct sna *sna, fd_set *ready);
 void sna_accel_close(struct sna *sna);
 void sna_accel_free(struct sna *sna);
 
diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 46016e0..1a13465 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -11276,7 +11276,7 @@ static void _sna_accel_disarm_timer(struct sna *sna, int id)
 
 #define return_if_timer_active(id) do {					\
 	if (sna->timer_active & (1<<(id)))				\
-		return read_timer(sna->timer[id]) > 0;			\
+		return (sna->timer_ready & (1<<(id))) && read_timer(sna->timer[id]) > 0;			\
 } while (0)
 
 static Bool sna_accel_do_flush(struct sna *sna)
@@ -11687,15 +11687,24 @@ void sna_accel_block_handler(struct sna *sna)
 
 	if (sna_accel_do_inactive(sna))
 		sna_accel_inactive(sna);
+
+	sna->timer_ready = 0;
 }
 
-void sna_accel_wakeup_handler(struct sna *sna)
+void sna_accel_wakeup_handler(struct sna *sna, fd_set *ready)
 {
+	int id, active;
+
 	if (sna->kgem.need_retire)
 		kgem_retire(&sna->kgem);
 	if (sna->kgem.need_purge)
 		kgem_purge_cache(&sna->kgem);
 
+	active = sna->timer_active & ~sna->timer_ready;
+	for (id = 0; id < NUM_TIMERS; id++)
+		if (active & (1 << id) && FD_ISSET(sna->timer[id], ready))
+			sna->timer_ready |= 1 << id;
+
 	sna_deferred_free(sna);
 }
 
diff --git a/src/sna/sna_driver.c b/src/sna/sna_driver.c
index 770a5bd..bcd1191 100644
--- a/src/sna/sna_driver.c
+++ b/src/sna/sna_driver.c
@@ -606,7 +606,7 @@ sna_wakeup_handler(int i, pointer data, unsigned long result, pointer read_mask)
 
 	sna->WakeupHandler(i, sna->WakeupData, result, read_mask);
 
-	sna_accel_wakeup_handler(sna);
+	sna_accel_wakeup_handler(sna, read_mask);
 
 	if (FD_ISSET(sna->kgem.fd, (fd_set*)read_mask))
 		sna_dri_wakeup(sna);
@@ -761,7 +761,6 @@ static Bool sna_close_screen(int scrnIndex, ScreenPtr screen)
 #endif
 
 	/* drain the event queues */
-	sna_accel_wakeup_handler(sna);
 	if (sna_dri_has_pending_events(sna))
 		sna_dri_wakeup(sna);
 
commit 9b93914586597f9002712a04e3a08e1aad49345f
Author: Kristian Høgsberg <krh at bitplanet.net>
Date:   Fri Oct 15 17:58:14 2010 -0400

    Add xwayland support

diff --git a/configure.ac b/configure.ac
index 63beb64..50b5643 100644
--- a/configure.ac
+++ b/configure.ac
@@ -172,7 +172,6 @@ if test "x$VMAP" = xyes; then
 	AC_DEFINE(USE_VMAP,1,[Assume VMAP support])
 fi
 
-
 AC_ARG_ENABLE(debug,
 	      AS_HELP_STRING([--enable-debug],
 			     [Enables internal debugging [default=no]]),
diff --git a/src/intel.h b/src/intel.h
index 7593731..775afb6 100644
--- a/src/intel.h
+++ b/src/intel.h
@@ -441,10 +441,14 @@ typedef struct intel_screen_private {
 	 */
 	Bool fallback_debug;
 	unsigned debug_flush;
+
 #if HAVE_UDEV
 	struct udev_monitor *uevent_monitor;
 	InputHandlerProc uevent_handler;
 #endif
+
+	struct xwl_screen *xwl_screen;
+
 } intel_screen_private;
 
 enum {
diff --git a/src/intel_dri.c b/src/intel_dri.c
index 0c6d45c..2e43fcf 100644
--- a/src/intel_dri.c
+++ b/src/intel_dri.c
@@ -50,6 +50,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #include "xf86.h"
 #include "xf86_OSproc.h"
+#include "xf86Priv.h"
 
 #include "xf86Pci.h"
 #include "xf86drm.h"
@@ -58,6 +59,10 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "shadow.h"
 #include "fb.h"
 
+#ifdef XORG_WAYLAND
+#include <xwayland.h>
+#endif
+
 #include "intel.h"
 #include "i830_reg.h"
 
@@ -1533,6 +1538,29 @@ out_complete:
 	return TRUE;
 }
 
+#ifdef XORG_WAYLAND
+static int intel_auth_magic(int fd, uint32_t magic)
+{
+	ScrnInfoPtr scrn;
+	intel_screen_private *intel;
+	int i;
+
+	/* Not wayland, go straight to drm */
+	if (!xorgWayland)
+		return drmAuthMagic(fd, magic);
+
+	for (i = 0; i < 1; i++) {
+		scrn = xf86Screens[i];
+		intel = intel_get_screen_private(scrn);
+		if (xwl_screen_get_drm_fd(intel->xwl_screen) == fd)
+			break;
+	}
+
+	/* Forward the request to our host */
+	return xwl_drm_authenticate(intel->xwl_screen, magic);
+}
+#endif
+
 static int dri2_server_generation;
 #endif
 
@@ -1618,6 +1646,11 @@ Bool I830DRI2ScreenInit(ScreenPtr screen)
 	}
 #endif
 
+#if DRI2INFOREC_VERSION >= 5 && defined(XORG_WAYLAND)
+	info.version = 5;
+	info.AuthMagic = intel_auth_magic;
+#endif
+
 	return DRI2ScreenInit(screen, &info);
 }
 
diff --git a/src/intel_driver.c b/src/intel_driver.c
index 9d1c4e8..3e52da4 100644
--- a/src/intel_driver.c
+++ b/src/intel_driver.c
@@ -49,6 +49,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "xf86_OSproc.h"
 #include "xf86cmap.h"
 #include "xf86drm.h"
+#include "xf86Priv.h"
 #include "compiler.h"
 #include "mibstore.h"
 #include "mipointer.h"
@@ -65,6 +66,10 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "intel.h"
 #include "intel_video.h"
 
+#ifdef XORG_WAYLAND
+#include <xwayland.h>
+#endif
+
 #ifdef INTEL_XVMC
 #define _INTEL_XVMC_SERVER_
 #include "intel_hwmc.h"
@@ -241,6 +246,11 @@ static Bool i830CreateScreenResources(ScreenPtr screen)
 	if (!(*screen->CreateScreenResources) (screen))
 		return FALSE;
 
+#ifdef XORG_WAYLAND
+	if (intel->xwl_screen)
+		xwl_screen_init(intel->xwl_screen, screen);
+#endif
+
 	return intel_uxa_create_screen_resources(screen);
 }
 
@@ -532,6 +542,27 @@ static Bool can_accelerate_blt(struct intel_screen_private *intel)
 	return TRUE;
 }
 
+#ifdef XORG_WAYLAND
+static int intel_create_window_buffer(struct xwl_window *xwl_window,
+				      PixmapPtr pixmap)
+{
+	uint32_t name;
+	dri_bo *bo;
+
+	bo = intel_get_pixmap_bo(pixmap);
+	if (bo == NULL || dri_bo_flink(bo, &name) != 0)
+		return BadDrawable;
+
+	return xwl_create_window_buffer_drm(xwl_window, pixmap, name);
+}
+
+static struct xwl_driver xwl_driver = {
+	.version = 1,
+	.use_drm = 1,
+	.create_window_buffer = intel_create_window_buffer
+};
+#endif
+
 /**
  * This is called before ScreenInit to do any required probing of screen
  * configuration.
@@ -579,10 +610,6 @@ static Bool I830PreInit(ScrnInfoPtr scrn, int flags)
 
 	intel->PciInfo = xf86GetPciInfoForEntity(intel->pEnt->index);
 
-	if (!intel_open_drm_master(scrn))
-		xf86DrvMsg(scrn->scrnIndex, X_ERROR,
-			   "Failed to become DRM master.\n");
-
 	scrn->monitor = scrn->confScreen->monitor;
 	scrn->progClock = TRUE;
 	scrn->rgbBits = 8;
@@ -620,6 +647,26 @@ static Bool I830PreInit(ScrnInfoPtr scrn, int flags)
 	intel_check_chipset_option(scrn);
 	intel_check_dri_option(scrn);
 
+#ifdef XORG_WAYLAND
+	if (xorgWayland) {
+		xf86LoadSubModule(scrn, "xwayland");
+		intel->xwl_screen =
+			xwl_screen_pre_init(scrn, 0, &xwl_driver);
+		if (!intel->xwl_screen) {
+			xf86DrvMsg(scrn->scrnIndex, X_ERROR,
+				   "Failed to initialize xwayland.\n");
+			return FALSE;
+		}
+
+		intel->drmSubFD =
+			xwl_screen_get_drm_fd(intel->xwl_screen);
+	}
+#endif
+
+	if (!intel->xwl_screen && !intel_open_drm_master(scrn))
+		xf86DrvMsg(scrn->scrnIndex, X_ERROR,
+			   "Failed to become DRM master.\n");
+
 	if (!intel_init_bufmgr(intel)) {
 		PreInitCleanup(scrn);
 		return FALSE;
@@ -680,6 +727,9 @@ static Bool I830PreInit(ScrnInfoPtr scrn, int flags)
 	xf86DrvMsg(scrn->scrnIndex, X_CONFIG, "Triple buffering? %s\n",
 		   intel->use_triple_buffer ? "enabled" : "disabled");
 
+	if (!intel->xwl_screen)
+		intel->swapbuffers_wait = TRUE;
+
 	xf86DrvMsg(scrn->scrnIndex, X_CONFIG, "Framebuffer %s\n",
 		   intel->tiling & INTEL_TILING_FB ? "tiled" : "linear");
 	xf86DrvMsg(scrn->scrnIndex, X_CONFIG, "Pixmaps %s\n",
@@ -691,7 +741,8 @@ static Bool I830PreInit(ScrnInfoPtr scrn, int flags)
 
 	I830XvInit(scrn);
 
-	if (!intel_mode_pre_init(scrn, intel->drmSubFD, intel->cpp)) {
+	if (!intel->xwl_screen &&
+	    !intel_mode_pre_init(scrn, intel->drmSubFD, intel->cpp)) {
 		PreInitCleanup(scrn);
 		return FALSE;
 	}
@@ -825,9 +876,16 @@ intel_flush_callback(CallbackListPtr *list,
 		     pointer user_data, pointer call_data)
 {
 	ScrnInfoPtr scrn = user_data;
+	intel_screen_private *intel = intel_get_screen_private(scrn);
+
 	if (scrn->vtSema) {
 		intel_batch_submit(scrn);
 		intel_glamor_flush(intel_get_screen_private(scrn));
+
+#ifdef XORG_WAYLAND
+		if (intel->xwl_screen)
+			xwl_screen_post_damage(intel->xwl_screen);
+#endif
 	}
 }
 
@@ -1220,7 +1278,8 @@ static Bool I830CloseScreen(int scrnIndex, ScreenPtr screen)
 		if (!intel->use_shadow)
 			intel_set_pixmap_bo(screen->GetScreenPixmap(screen),
 					    NULL);
-		intel_mode_remove_fb(intel);
+		if (!intel->xwl_screen)
+			intel_mode_remove_fb(intel);
 		drm_intel_bo_unreference(intel->front_buffer);
 		intel->front_buffer = NULL;
 	}
diff --git a/src/intel_module.c b/src/intel_module.c
index 6e2af57..d0367e5 100644
--- a/src/intel_module.c
+++ b/src/intel_module.c
@@ -32,6 +32,7 @@
 #include <xf86_OSproc.h>
 #include <xf86cmap.h>
 #include <xf86drmMode.h>
+#include "xf86Priv.h"
 
 #include <xorgVersion.h>
 
@@ -263,6 +264,11 @@ static Bool intel_driver_func(ScrnInfoPtr pScrn,
 #else
 		(*flag) = HW_IO | HW_MMIO;
 #endif
+
+#ifdef XORG_WAYLAND
+		if (xorgWayland)
+			(*flag) = HW_SKIP_CONSOLE;
+#endif
 		return TRUE;
 	default:
 		/* Unknown or deprecated function */
