xf86-video-ati: Branch 'master' - 14 commits

Dave Airlie airlied at kemper.freedesktop.org
Mon Sep 7 18:30:07 PDT 2009


 src/r600_exa.c                 |  602 +++++++++++++++++++++++++++++------------
 src/r600_state.h               |   54 +++
 src/r600_textured_videofuncs.c |   96 ++++--
 src/r6xx_accel.c               |  291 +++++++++++++++++--
 src/radeon.h                   |    5 
 src/radeon_dri2.c              |    4 
 src/radeon_exa.c               |    8 
 src/radeon_kms.c               |    4 
 8 files changed, 824 insertions(+), 240 deletions(-)

New commits:
commit 6990f2ac6478bf92929a4400ef84fb2142699204
Merge: 917f2d7... 853f4c3...
Author: Dave Airlie <airlied at redhat.com>
Date:   Tue Sep 8 11:26:32 2009 +1000

    Merge branch 'r6xx-cs'

commit 853f4c3d1ea8f975ab2855f18d3ae336a4095091
Author: Dave Airlie <airlied at redhat.com>
Date:   Tue Sep 8 11:25:39 2009 +1000

    r600: more alignment fixups + vb map/unmap
    
    I'm not so sure the vb map/unmap is a good idea: I think
    it pretty much locksteps the CPU and GPU, so we should
    work out whether we really need to flush this often, since
    Mesa doesn't have to and we are just doing 3D ops.

diff --git a/src/r600_exa.c b/src/r600_exa.c
index db4c0b1..c143b69 100644
--- a/src/r600_exa.c
+++ b/src/r600_exa.c
@@ -780,8 +780,7 @@ R600PrepareCopy(PixmapPtr pSrc,   PixmapPtr pDst,
 		radeon_bo_unref(accel_state->copy_area_bo);
 		accel_state->copy_area_bo = NULL;
 	    }
-	    accel_state->copy_area_bo = radeon_bo_open(info->bufmgr, 0, size,
-						       4096,
+	    accel_state->copy_area_bo = radeon_bo_open(info->bufmgr, 0, size, 0,
 						       RADEON_GEM_DOMAIN_VRAM,
 						       0);
 	    if (accel_state->copy_area_bo == NULL) {
@@ -789,7 +788,7 @@ R600PrepareCopy(PixmapPtr pSrc,   PixmapPtr pDst,
 		return FALSE;
 	    }
 	    radeon_cs_space_add_persistent_bo(info->cs, accel_state->copy_area_bo,
-					      RADEON_GEM_DOMAIN_VRAM, 0);
+					      0, RADEON_GEM_DOMAIN_VRAM);
 	    if (radeon_cs_space_check(info->cs)) {
 		radeon_bo_unref(accel_state->copy_area_bo);
 		accel_state->copy_area_bo = NULL;
@@ -2157,7 +2156,7 @@ R600AllocShaders(ScrnInfoPtr pScrn, ScreenPtr pScreen)
 #ifdef XF86DRM_MODE
 #if (EXA_VERSION_MAJOR == 2 && EXA_VERSION_MINOR >= 4)
     if (info->cs) {
-	accel_state->shaders_bo = radeon_bo_open(info->bufmgr, 0, size, 4096,
+	accel_state->shaders_bo = radeon_bo_open(info->bufmgr, 0, size, 0,
 						 RADEON_GEM_DOMAIN_VRAM, 0);
 	if (accel_state->shaders_bo == NULL) {
 	    ErrorF("Allocating shader failed\n");
diff --git a/src/r6xx_accel.c b/src/r6xx_accel.c
index 6346e52..985595e 100644
--- a/src/r6xx_accel.c
+++ b/src/r6xx_accel.c
@@ -50,8 +50,7 @@ void r600_cs_flush_indirect(ScrnInfoPtr pScrn)
 
     if (info->accel_state->vb_bo) {
 	radeon_bo_unmap(info->accel_state->vb_bo);
-	radeon_bo_ref(info->accel_state->vb_bo);
-	info->accel_state->vb_bo = NULL;
+	info->accel_state->vb_ptr = NULL;
     }
 
     radeon_cs_emit(info->cs);
@@ -1161,15 +1160,17 @@ r600_vb_get(ScrnInfoPtr pScrn)
 	if (accel_state->vb_bo == NULL) {
 	    accel_state->vb_mc_addr = 0;
 	    accel_state->vb_bo = radeon_bo_open(info->bufmgr, 0, 16 * 1024,
-						4096, RADEON_GEM_DOMAIN_GTT, 0);
+						0, RADEON_GEM_DOMAIN_GTT, 0);
 	    if (accel_state->vb_bo == NULL)
 		return FALSE;
+	    accel_state->vb_total = 16 * 1024;
+	}
+	if (!accel_state->vb_ptr) {
 	    ret = radeon_bo_map(accel_state->vb_bo, 1);
 	    if (ret) {
 		FatalError("failed to vb %d\n", ret);
 		return FALSE;
 	    }
-	    accel_state->vb_total = 16 * 1024;
 	    accel_state->vb_ptr = accel_state->vb_bo->ptr;
 	}
     } else
commit 8f4196e88855f10762254fca9e0a0988e7b5562f
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Mon Aug 31 19:41:59 2009 -0400

    r6xx/r7xx: various CS fixes from Dave

diff --git a/src/r600_exa.c b/src/r600_exa.c
index cfe041f..db4c0b1 100644
--- a/src/r600_exa.c
+++ b/src/r600_exa.c
@@ -720,6 +720,7 @@ R600PrepareCopy(PixmapPtr pSrc,   PixmapPtr pDst,
 
     accel_state->dst_pitch = exaGetPixmapPitch(pDst) / (pDst->drawable.bitsPerPixel / 8);
     accel_state->src_pitch[0] = exaGetPixmapPitch(pSrc) / (pSrc->drawable.bitsPerPixel / 8);
+    accel_state->same_surface = FALSE;
 
 #if defined(XF86DRM_MODE)
     if (info->cs) {
@@ -728,11 +729,15 @@ R600PrepareCopy(PixmapPtr pSrc,   PixmapPtr pDst,
 	accel_state->src_bo[0] = radeon_get_pixmap_bo(pSrc);
 	accel_state->src_bo[1] = NULL;
 	accel_state->dst_bo = radeon_get_pixmap_bo(pDst);
+	if (accel_state->dst_bo == accel_state->src_bo[0])
+	    accel_state->same_surface = TRUE;
     } else
 #endif
     {
 	accel_state->src_mc_addr[0] = exaGetPixmapOffset(pSrc) + info->fbLocation + pScrn->fbOffset;
 	accel_state->dst_mc_addr = exaGetPixmapOffset(pDst) + info->fbLocation + pScrn->fbOffset;
+	if (exaGetPixmapOffset(pSrc) == exaGetPixmapOffset(pDst))
+	    accel_state->same_surface = TRUE;
     }
 
     accel_state->src_width[0] = pSrc->drawable.width;
@@ -766,9 +771,8 @@ R600PrepareCopy(PixmapPtr pSrc,   PixmapPtr pDst,
     accel_state->rop = rop;
     accel_state->planemask = planemask;
 
-    if (exaGetPixmapOffset(pSrc) == exaGetPixmapOffset(pDst)) {
+    if (accel_state->same_surface == TRUE) {
 	unsigned long size = pDst->drawable.height * accel_state->dst_pitch * pDst->drawable.bitsPerPixel/8;
-	accel_state->same_surface = TRUE;
 
 #if defined(XF86DRM_MODE)
 	if (info->cs) {
@@ -802,9 +806,7 @@ R600PrepareCopy(PixmapPtr pSrc,   PixmapPtr pDst,
 	    }
 	    accel_state->copy_area = exaOffscreenAlloc(pDst->drawable.pScreen, size, 256, TRUE, NULL, NULL);
 	}
-    } else {
-	accel_state->same_surface = FALSE;
-
+    } else
 	R600DoPrepareCopy(pScrn,
 			  accel_state->src_pitch[0], pSrc->drawable.width, pSrc->drawable.height,
 			  accel_state->src_mc_addr[0], accel_state->src_bo[0], pSrc->drawable.bitsPerPixel,
@@ -812,8 +814,6 @@ R600PrepareCopy(PixmapPtr pSrc,   PixmapPtr pDst,
 			  accel_state->dst_mc_addr, accel_state->dst_bo, pDst->drawable.bitsPerPixel,
 			  rop, planemask);
 
-    }
-
     return TRUE;
 }
 
@@ -853,8 +853,8 @@ R600OverlapCopy(PixmapPtr pDst,
     }
 #endif
 
-    if (is_overlap(srcX, srcX + w, srcY, srcY + h,
-		   dstX, dstX + w, dstY, dstY + h)) {
+    if (is_overlap(srcX, srcX + (w - 1), srcY, srcY + (h - 1),
+		   dstX, dstX + (w - 1), dstY, dstY + (h - 1))) {
         /* Calculate height/width of non-overlapping area */
         hchunk = (srcX < dstX) ? (dstX - srcX) : (srcX - dstX);
         vchunk = (srcY < dstY) ? (dstY - srcY) : (srcY - dstY);
@@ -1008,7 +1008,8 @@ R600Copy(PixmapPtr pDst,
 #endif
 
     if (accel_state->same_surface &&
-	is_overlap(srcX, srcX + w, srcY, srcY + h, dstX, dstX + w, dstY, dstY + h)) {
+	is_overlap(srcX, srcX + (w - 1), srcY, srcY + (h - 1),
+		   dstX, dstX + (w - 1), dstY, dstY + (h - 1))) {
 	if (accel_state->copy_area) {
 	    uint32_t pitch = exaGetPixmapPitch(pDst) / (pDst->drawable.bitsPerPixel / 8);
 	    uint32_t orig_offset, tmp_offset;
diff --git a/src/r6xx_accel.c b/src/r6xx_accel.c
index 7c7f469..6346e52 100644
--- a/src/r6xx_accel.c
+++ b/src/r6xx_accel.c
@@ -48,9 +48,11 @@ void r600_cs_flush_indirect(ScrnInfoPtr pScrn)
     if (!info->cs->cdw)
 	return;
 
-    if (info->accel_state->vb_bo)
+    if (info->accel_state->vb_bo) {
 	radeon_bo_unmap(info->accel_state->vb_bo);
-    info->accel_state->vb_bo = NULL;
+	radeon_bo_ref(info->accel_state->vb_bo);
+	info->accel_state->vb_bo = NULL;
+    }
 
     radeon_cs_emit(info->cs);
     radeon_cs_erase(info->cs);
commit e87f0f50f31a59ca1f60d4582d4a57ed00854fb7
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Wed Aug 26 02:13:38 2009 -0400

    r6xx/r7xx: set EXA_HANDLES_PIXMAPS

diff --git a/src/r600_exa.c b/src/r600_exa.c
index f2136ae..cfe041f 100644
--- a/src/r600_exa.c
+++ b/src/r600_exa.c
@@ -128,7 +128,7 @@ R600PrepareSolid(PixmapPtr pPix, int alu, Pixel pm, Pixel fg)
     uint32_t a, r, g, b;
     float ps_alu_consts[4];
 
-    return FALSE;
+    //return FALSE;
 
     if (pPix->drawable.bitsPerPixel == 24)
         RADEON_FALLBACK(("24bpp unsupported\n"));
@@ -707,7 +707,7 @@ R600PrepareCopy(PixmapPtr pSrc,   PixmapPtr pDst,
     RADEONInfoPtr info = RADEONPTR(pScrn);
     struct radeon_accel_state *accel_state = info->accel_state;
 
-    return FALSE;
+    //return FALSE;
 
     if (pSrc->drawable.bitsPerPixel == 24)
         RADEON_FALLBACK(("24bpp unsupported\n"));
@@ -1569,7 +1569,7 @@ static Bool R600PrepareComposite(int op, PicturePtr pSrcPicture,
     cb_config_t cb_conf;
     shader_config_t vs_conf, ps_conf;
 
-    return FALSE;
+    //return FALSE;
     /* return FALSE; */
 
     if (pDst->drawable.bitsPerPixel < 8 || pSrc->drawable.bitsPerPixel < 8)
@@ -2325,6 +2325,17 @@ R600DrawInit(ScreenPtr pScreen)
 #ifdef EXA_SUPPORTS_PREPARE_AUX
     info->accel_state->exa->flags |= EXA_SUPPORTS_PREPARE_AUX;
 #endif
+
+#ifdef XF86DRM_MODE
+#ifdef EXA_HANDLES_PIXMAPS
+    if (info->cs) {
+	info->accel_state->exa->flags |= EXA_HANDLES_PIXMAPS;
+//#ifdef EXA_MIXED_PIXMAPS
+//	info->accel_state->exa->flags |= EXA_MIXED_PIXMAPS;
+//#endif
+    }
+#endif
+#endif
     info->accel_state->exa->pixmapOffsetAlign = 256;
     info->accel_state->exa->pixmapPitchAlign = 256;
 
commit 3212c26b90c0f6f1a7248b4da3ed985a9c2e9381
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Wed Aug 26 01:42:10 2009 -0400

    r6xx/r7xx: more WIP

diff --git a/src/r600_exa.c b/src/r600_exa.c
index 462bbb8..f2136ae 100644
--- a/src/r600_exa.c
+++ b/src/r600_exa.c
@@ -352,11 +352,6 @@ R600DoneSolid(PixmapPtr pPix)
     CLEAR (draw_conf);
     CLEAR (vtx_res);
 
-#ifdef XF86DRM_MODE
-    if (info->cs)
-	radeon_bo_unmap(accel_state->vb_bo);
-#endif
-
     if (accel_state->vb_index == 0) {
         R600IBDiscard(pScrn, accel_state->ib);
         r600_vb_discard(pScrn);
@@ -405,11 +400,6 @@ R600DoneSolid(PixmapPtr pPix)
 			accel_state->dst_bo, RADEON_GEM_DOMAIN_VRAM, 0);
 
     R600CPFlushIndirect(pScrn, accel_state->ib);
-
-    accel_state->src_bo[0] = NULL;
-    accel_state->src_bo[1] = NULL;
-    accel_state->dst_bo = NULL;
-    accel_state->vb_bo = NULL;
 }
 
 static void
@@ -453,6 +443,10 @@ R600DoPrepareCopy(ScrnInfoPtr pScrn,
     r600_cp_start(pScrn);
 
     /* Init */
+#if defined(XF86DRM_MODE)
+    if (info->cs)
+	accel_state->XInited3D = FALSE;
+#endif
     start_3d(pScrn, accel_state->ib);
 
     set_default_state(pScrn, accel_state->ib);
@@ -619,11 +613,6 @@ R600DoCopy(ScrnInfoPtr pScrn)
     CLEAR (draw_conf);
     CLEAR (vtx_res);
 
-#ifdef XF86DRM_MODE
-    if (info->cs)
-	radeon_bo_unmap(accel_state->vb_bo);
-#endif
-
     if (accel_state->vb_index == 0) {
         R600IBDiscard(pScrn, accel_state->ib);
         r600_vb_discard(pScrn);
@@ -1092,10 +1081,7 @@ R600DoneCopy(PixmapPtr pDst)
 	    exaOffscreenFree(pDst->drawable.pScreen, accel_state->copy_area);
 	accel_state->copy_area = NULL;
     }
-    accel_state->src_bo[0] = NULL;
-    accel_state->src_bo[1] = NULL;
-    accel_state->dst_bo = NULL;
-    accel_state->vb_bo = NULL;
+
 }
 
 
@@ -1611,6 +1597,8 @@ static Bool R600PrepareComposite(int op, PicturePtr pSrcPicture,
     if (info->cs) {
 	accel_state->dst_mc_addr = 0;
 	accel_state->dst_bo = radeon_get_pixmap_bo(pDst);
+	accel_state->src_bo[0] = NULL;
+	accel_state->src_bo[1] = NULL;
     } else
 #endif
 	accel_state->dst_mc_addr = exaGetPixmapOffset(pDst) + info->fbLocation + pScrn->fbOffset;
@@ -1633,6 +1621,10 @@ static Bool R600PrepareComposite(int op, PicturePtr pSrcPicture,
     r600_cp_start(pScrn);
 
     /* Init */
+#if defined(XF86DRM_MODE)
+    if (info->cs)
+	accel_state->XInited3D = FALSE;
+#endif
     start_3d(pScrn, accel_state->ib);
 
     set_default_state(pScrn, accel_state->ib);
@@ -1714,7 +1706,7 @@ static Bool R600PrepareComposite(int op, PicturePtr pSrcPicture,
     ps_conf.bo                  = accel_state->shaders_bo;
     ps_setup                    (pScrn, accel_state->ib, &ps_conf);
 
-    BEGIN_BATCH(12);
+    BEGIN_BATCH(9);
     EREG(accel_state->ib, CB_SHADER_MASK,                      (0xf << OUTPUT0_ENABLE_shift));
 
     blendcntl = R600GetBlendCntl(op, pMaskPicture, pDstPicture->format);
@@ -1893,11 +1885,6 @@ static void R600DoneComposite(PixmapPtr pDst)
     CLEAR (draw_conf);
     CLEAR (vtx_res);
 
-#ifdef XF86DRM_MODE
-    if (info->cs)
-	radeon_bo_unmap(accel_state->vb_bo);
-#endif
-
     if (accel_state->vb_index == 0) {
         R600IBDiscard(pScrn, accel_state->ib);
         r600_vb_discard(pScrn);
@@ -1954,10 +1941,6 @@ static void R600DoneComposite(PixmapPtr pDst)
 
     R600CPFlushIndirect(pScrn, accel_state->ib);
 
-    accel_state->src_bo[0] = NULL;
-    accel_state->src_bo[1] = NULL;
-    accel_state->dst_bo = NULL;
-    accel_state->vb_bo = NULL;
 }
 
 Bool
diff --git a/src/r600_textured_videofuncs.c b/src/r600_textured_videofuncs.c
index a6e2559..b6f7b39 100644
--- a/src/r600_textured_videofuncs.c
+++ b/src/r600_textured_videofuncs.c
@@ -65,11 +65,6 @@ R600DoneTexturedVideo(ScrnInfoPtr pScrn)
     CLEAR (draw_conf);
     CLEAR (vtx_res);
 
-#ifdef XF86DRM_MODE
-    if (info->cs)
-	radeon_bo_unmap(accel_state->vb_bo);
-#endif
-
     if (accel_state->vb_index == 0) {
         R600IBDiscard(pScrn, accel_state->ib);
         r600_vb_discard(pScrn);
@@ -117,9 +112,6 @@ R600DoneTexturedVideo(ScrnInfoPtr pScrn)
 			accel_state->dst_bo, 0, RADEON_GEM_DOMAIN_VRAM);
 
     R600CPFlushIndirect(pScrn, accel_state->ib);
-    accel_state->dst_bo = NULL;
-    accel_state->src_bo[0] = NULL;
-    accel_state->src_bo[1] = NULL;
 }
 
 void
@@ -261,6 +253,10 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
     r600_cp_start(pScrn);
 
     /* Init */
+#if defined(XF86DRM_MODE)
+    if (info->cs)
+	accel_state->XInited3D = FALSE;
+#endif
     start_3d(pScrn, accel_state->ib);
 
     set_default_state(pScrn, accel_state->ib);
diff --git a/src/r6xx_accel.c b/src/r6xx_accel.c
index 8a2b1ae..7c7f469 100644
--- a/src/r6xx_accel.c
+++ b/src/r6xx_accel.c
@@ -39,6 +39,7 @@
 
 #include "radeon_drm.h"
 
+#if defined(XF86DRM_MODE)
 void r600_cs_flush_indirect(ScrnInfoPtr pScrn)
 {
     RADEONInfoPtr info = RADEONPTR(pScrn);
@@ -46,6 +47,11 @@ void r600_cs_flush_indirect(ScrnInfoPtr pScrn)
 
     if (!info->cs->cdw)
 	return;
+
+    if (info->accel_state->vb_bo)
+	radeon_bo_unmap(info->accel_state->vb_bo);
+    info->accel_state->vb_bo = NULL;
+
     radeon_cs_emit(info->cs);
     radeon_cs_erase(info->cs);
 
@@ -53,6 +59,7 @@ void r600_cs_flush_indirect(ScrnInfoPtr pScrn)
     if (ret)
 	ErrorF("space check failed in flush\n");
 }
+#endif
 
 /* Flush the indirect buffer to the kernel for submission to the card */
 void R600CPFlushIndirect(ScrnInfoPtr pScrn, drmBufPtr ib)
@@ -98,6 +105,9 @@ void R600IBDiscard(ScrnInfoPtr pScrn, drmBufPtr ib)
     int ret;
     RADEONInfoPtr info = RADEONPTR(pScrn);
     if (info->cs) {
+	if (info->accel_state->vb_bo)
+	    radeon_bo_unmap(info->accel_state->vb_bo);
+	info->accel_state->vb_bo = NULL;
 	if (CS_FULL(info->cs)) {
 	    r600_cs_flush_indirect(pScrn);
 	    return;
commit 5a08e68cc254fb255e631b456e331c32456ef0e7
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Tue Aug 25 19:24:41 2009 -0400

    r6xx/r7xx: fix some define problems in Xv code

diff --git a/src/r600_exa.c b/src/r600_exa.c
index 09eb6ee..462bbb8 100644
--- a/src/r600_exa.c
+++ b/src/r600_exa.c
@@ -128,7 +128,7 @@ R600PrepareSolid(PixmapPtr pPix, int alu, Pixel pm, Pixel fg)
     uint32_t a, r, g, b;
     float ps_alu_consts[4];
 
-    //return FALSE;
+    return FALSE;
 
     if (pPix->drawable.bitsPerPixel == 24)
         RADEON_FALLBACK(("24bpp unsupported\n"));
diff --git a/src/r600_textured_videofuncs.c b/src/r600_textured_videofuncs.c
index 10d6f4f..a6e2559 100644
--- a/src/r600_textured_videofuncs.c
+++ b/src/r600_textured_videofuncs.c
@@ -228,7 +228,7 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
     CLEAR (vs_conf);
     CLEAR (ps_conf);
 
-#if defined(ACCEL_CP) && defined(XF86DRM_MODE)
+#if defined(XF86DRM_MODE)
     if (info->cs) {
 	accel_state->dst_mc_addr = 0;
 	accel_state->src_mc_addr[0] = 0;
@@ -269,7 +269,7 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
     set_screen_scissor(pScrn, accel_state->ib, 0, 0, pPixmap->drawable.width, pPixmap->drawable.height);
     set_window_scissor(pScrn, accel_state->ib, 0, 0, pPixmap->drawable.width, pPixmap->drawable.height);
 
-#if defined(ACCEL_CP) && defined(XF86DRM_MODE)
+#if defined(XF86DRM_MODE)
     if (info->cs) {
 	accel_state->vs_mc_addr = accel_state->xv_vs_offset;
 	accel_state->ps_mc_addr = accel_state->xv_ps_offset;
commit 9aa214e125b7927d62b9fe124a851d0373c24d7e
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Tue Aug 25 18:45:49 2009 -0400

    r6xx/r7xx: fix reloc for vtx buffer

diff --git a/src/r6xx_accel.c b/src/r6xx_accel.c
index 6e4c8ea..8a2b1ae 100644
--- a/src/r6xx_accel.c
+++ b/src/r6xx_accel.c
@@ -494,7 +494,7 @@ set_vtx_resource(ScrnInfoPtr pScrn, drmBufPtr ib, vtx_resource_t *res)
     E32(ib, 0);							// 4: n/a
     E32(ib, 0);							// 5: n/a
     E32(ib, SQ_TEX_VTX_VALID_BUFFER << SQ_VTX_CONSTANT_WORD6_0__TYPE_shift);	// 6: TYPE
-    RELOC_BATCH(res->bo, RADEON_GEM_DOMAIN_VRAM, 0);
+    RELOC_BATCH(res->bo, RADEON_GEM_DOMAIN_GTT, 0);
     END_BATCH();
 }
 
commit bba51187055932ecd466f5f817428d6c773747b9
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Tue Aug 25 18:37:15 2009 -0400

    R6xx/r7xx: unmap vb bo when done

diff --git a/src/r600_exa.c b/src/r600_exa.c
index 1d7802a..09eb6ee 100644
--- a/src/r600_exa.c
+++ b/src/r600_exa.c
@@ -352,6 +352,11 @@ R600DoneSolid(PixmapPtr pPix)
     CLEAR (draw_conf);
     CLEAR (vtx_res);
 
+#ifdef XF86DRM_MODE
+    if (info->cs)
+	radeon_bo_unmap(accel_state->vb_bo);
+#endif
+
     if (accel_state->vb_index == 0) {
         R600IBDiscard(pScrn, accel_state->ib);
         r600_vb_discard(pScrn);
@@ -614,6 +619,11 @@ R600DoCopy(ScrnInfoPtr pScrn)
     CLEAR (draw_conf);
     CLEAR (vtx_res);
 
+#ifdef XF86DRM_MODE
+    if (info->cs)
+	radeon_bo_unmap(accel_state->vb_bo);
+#endif
+
     if (accel_state->vb_index == 0) {
         R600IBDiscard(pScrn, accel_state->ib);
         r600_vb_discard(pScrn);
@@ -1883,6 +1893,11 @@ static void R600DoneComposite(PixmapPtr pDst)
     CLEAR (draw_conf);
     CLEAR (vtx_res);
 
+#ifdef XF86DRM_MODE
+    if (info->cs)
+	radeon_bo_unmap(accel_state->vb_bo);
+#endif
+
     if (accel_state->vb_index == 0) {
         R600IBDiscard(pScrn, accel_state->ib);
         r600_vb_discard(pScrn);
diff --git a/src/r600_state.h b/src/r600_state.h
index 6ca88cf..cb039d4 100644
--- a/src/r600_state.h
+++ b/src/r600_state.h
@@ -188,9 +188,12 @@ do {					\
 	radeon_cs_end(info->cs, __FILE__, __func__, __LINE__);	\
 } while(0)
 #define RELOC_BATCH(bo, rd, wd)					\
-do {								\
-    if (info->cs)							\
-	OUT_RING_RELOC((bo), (rd), (wd));				\
+do {							\
+    if (info->cs) {							\
+	int _ret;							\
+	_ret = radeon_cs_write_reloc(info->cs, (bo), (rd), (wd), 0);	\
+	if (_ret) ErrorF("reloc emit failure %d (%s %d)\n", _ret, __func__, __LINE__); \
+    }									\
 } while(0)
 #define E32(ib, dword)                                                  \
 do {                                                                    \
diff --git a/src/r600_textured_videofuncs.c b/src/r600_textured_videofuncs.c
index 631a40c..10d6f4f 100644
--- a/src/r600_textured_videofuncs.c
+++ b/src/r600_textured_videofuncs.c
@@ -65,6 +65,11 @@ R600DoneTexturedVideo(ScrnInfoPtr pScrn)
     CLEAR (draw_conf);
     CLEAR (vtx_res);
 
+#ifdef XF86DRM_MODE
+    if (info->cs)
+	radeon_bo_unmap(accel_state->vb_bo);
+#endif
+
     if (accel_state->vb_index == 0) {
         R600IBDiscard(pScrn, accel_state->ib);
         r600_vb_discard(pScrn);
commit 599adfc1f5e6d708be7ad30f4871de3046775727
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Tue Aug 25 18:13:14 2009 -0400

    r6xx/r7xx: fix flipped domains

diff --git a/src/r600_exa.c b/src/r600_exa.c
index 4d09d6e..1d7802a 100644
--- a/src/r600_exa.c
+++ b/src/r600_exa.c
@@ -658,7 +658,7 @@ R600DoCopy(ScrnInfoPtr pScrn)
     /* sync dst surface */
     cp_set_surface_sync(pScrn, accel_state->ib, (CB_ACTION_ENA_bit | CB0_DEST_BASE_ENA_bit),
 			accel_state->dst_size, accel_state->dst_mc_addr,
-			accel_state->dst_bo, 0, RADEON_GEM_DOMAIN_VRAM);
+			accel_state->dst_bo, RADEON_GEM_DOMAIN_VRAM, 0);
 
     R600CPFlushIndirect(pScrn, accel_state->ib);
 }
commit 2e83cca8d7efaf1a6836cfb9ea5893fd9d70175f
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Tue Aug 25 18:05:43 2009 -0400

    r6xx/r7xx: more cs exa wip

diff --git a/src/r600_exa.c b/src/r600_exa.c
index b6a1a15..4d09d6e 100644
--- a/src/r600_exa.c
+++ b/src/r600_exa.c
@@ -167,8 +167,6 @@ R600PrepareSolid(PixmapPtr pPix, int alu, Pixel pm, Pixel fg)
 #endif
 
     r600_cp_start(pScrn);
-    if (!r600_vb_get(pScrn))
-	RADEON_FALLBACK(("Can't get VB\n"));
 
     /* Init */
 #if defined(XF86DRM_MODE)
@@ -325,8 +323,6 @@ R600Solid(PixmapPtr pPix, int x1, int y1, int x2, int y2)
     if (((accel_state->vb_index + 3) * 8) > accel_state->vb_total) {
         R600DoneSolid(pPix);
 	r600_cp_start(pScrn);
-	if (!r600_vb_get(pScrn))
-	    return;
     }
 
     vb = (pointer)((char*)accel_state->vb_ptr+accel_state->vb_index*8);
@@ -408,6 +404,7 @@ R600DoneSolid(PixmapPtr pPix)
     accel_state->src_bo[0] = NULL;
     accel_state->src_bo[1] = NULL;
     accel_state->dst_bo = NULL;
+    accel_state->vb_bo = NULL;
 }
 
 static void
@@ -449,8 +446,6 @@ R600DoPrepareCopy(ScrnInfoPtr pScrn,
     accel_state->dst_bo = dst_bo;
 
     r600_cp_start(pScrn);
-    if (!r600_vb_get(pScrn))
-	return;
 
     /* Init */
     start_3d(pScrn, accel_state->ib);
@@ -681,8 +676,6 @@ R600AppendCopyVertex(ScrnInfoPtr pScrn,
     if (((accel_state->vb_index + 3) * 16) > accel_state->vb_total) {
         R600DoCopy(pScrn);
 	r600_cp_start(pScrn);
-	if (!r600_vb_get(pScrn))
-	    return;
     }
 
     vb = (pointer)((char*)accel_state->vb_ptr+accel_state->vb_index*16);
@@ -1092,6 +1085,7 @@ R600DoneCopy(PixmapPtr pDst)
     accel_state->src_bo[0] = NULL;
     accel_state->src_bo[1] = NULL;
     accel_state->dst_bo = NULL;
+    accel_state->vb_bo = NULL;
 }
 
 
@@ -1627,8 +1621,6 @@ static Bool R600PrepareComposite(int op, PicturePtr pSrcPicture,
     CLEAR (ps_conf);
 
     r600_cp_start(pScrn);
-    if (!r600_vb_get(pScrn))
-	RADEON_FALLBACK(("Can't get VB\n"));
 
     /* Init */
     start_3d(pScrn, accel_state->ib);
@@ -1818,8 +1810,6 @@ static void R600Composite(PixmapPtr pDst,
         if (((accel_state->vb_index + 3) * 24) > accel_state->vb_total) {
             R600DoneComposite(pDst);
 	    r600_cp_start(pScrn);
-	    if (!r600_vb_get(pScrn))
-		return;
         }
 
         vb = (pointer)((char*)accel_state->vb_ptr+accel_state->vb_index*24);
@@ -1858,8 +1848,6 @@ static void R600Composite(PixmapPtr pDst,
         if (((accel_state->vb_index + 3) * 16) > accel_state->vb_total) {
             R600DoneComposite(pDst);
 	    r600_cp_start(pScrn);
-	    if (!r600_vb_get(pScrn))
-		return;
         }
 
         vb = (pointer)((char*)accel_state->vb_ptr+accel_state->vb_index*16);
@@ -1954,6 +1942,7 @@ static void R600DoneComposite(PixmapPtr pDst)
     accel_state->src_bo[0] = NULL;
     accel_state->src_bo[1] = NULL;
     accel_state->dst_bo = NULL;
+    accel_state->vb_bo = NULL;
 }
 
 Bool
@@ -2374,6 +2363,11 @@ R600DrawInit(ScreenPtr pScreen)
 
     info->accel_state->XInited3D = FALSE;
     info->accel_state->copy_area = NULL;
+    info->accel_state->src_bo[0] = NULL;
+    info->accel_state->src_bo[1] = NULL;
+    info->accel_state->dst_bo = NULL;
+    info->accel_state->copy_area_bo = NULL;
+    info->accel_state->vb_bo = NULL;
 
     if (!R600AllocShaders(pScrn, pScreen))
 	return FALSE;
diff --git a/src/r600_textured_videofuncs.c b/src/r600_textured_videofuncs.c
index 9cbfea4..631a40c 100644
--- a/src/r600_textured_videofuncs.c
+++ b/src/r600_textured_videofuncs.c
@@ -254,8 +254,6 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 #endif
 
     r600_cp_start(pScrn);
-    if (!r600_vb_get(pScrn))
-	return;
 
     /* Init */
     start_3d(pScrn, accel_state->ib);
@@ -587,8 +585,7 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 
         if (((accel_state->vb_index + 3) * 16) > accel_state->vb_total) {
             R600DoneTexturedVideo(pScrn);
-            accel_state->ib = RADEONCPGetBuffer(pScrn);
-            r600_vb_get(pScrn);
+	    r600_cp_start(pScrn);
         }
 
         vb = (pointer)((char*)accel_state->vb_ptr+accel_state->vb_index*16);
diff --git a/src/r6xx_accel.c b/src/r6xx_accel.c
index 4aa4650..6e4c8ea 100644
--- a/src/r6xx_accel.c
+++ b/src/r6xx_accel.c
@@ -1188,7 +1188,7 @@ r600_cp_start(ScrnInfoPtr pScrn)
 #if defined(XF86DRM_MODE)
     if (info->cs) {
 	if (!r600_vb_get(pScrn))
-	    return FALSE;
+	    return -1;
 	radeon_cs_space_reset_bos(info->cs);
 	radeon_cs_space_add_persistent_bo(info->cs, accel_state->shaders_bo,
 					  RADEON_GEM_DOMAIN_VRAM, 0);
@@ -1198,10 +1198,12 @@ r600_cp_start(ScrnInfoPtr pScrn)
 	if (accel_state->src_bo[1])
 	    radeon_cs_space_add_persistent_bo(info->cs, accel_state->src_bo[1],
 					      RADEON_GEM_DOMAIN_VRAM, 0);
-	radeon_cs_space_add_persistent_bo(info->cs, accel_state->dst_bo,
-					  RADEON_GEM_DOMAIN_VRAM, 0);
-	radeon_cs_space_add_persistent_bo(info->cs, accel_state->vb_bo,
-					  RADEON_GEM_DOMAIN_GTT, 0);
+	if (accel_state->dst_bo)
+	    radeon_cs_space_add_persistent_bo(info->cs, accel_state->dst_bo,
+					      RADEON_GEM_DOMAIN_VRAM, 0);
+	if (accel_state->vb_bo)
+	    radeon_cs_space_add_persistent_bo(info->cs, accel_state->vb_bo,
+					      RADEON_GEM_DOMAIN_GTT, 0);
 	if (accel_state->copy_area_bo)
 	    radeon_cs_space_add_persistent_bo(info->cs,
 					      accel_state->copy_area_bo,
commit 65852de027989c105246fa4e4eed432f29525a22
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Tue Aug 25 17:29:42 2009 -0400

    r6xx/r7xx EXA: WIP

diff --git a/src/r600_exa.c b/src/r600_exa.c
index 03d3d8c..b6a1a15 100644
--- a/src/r600_exa.c
+++ b/src/r600_exa.c
@@ -128,6 +128,8 @@ R600PrepareSolid(PixmapPtr pPix, int alu, Pixel pm, Pixel fg)
     uint32_t a, r, g, b;
     float ps_alu_consts[4];
 
+    //return FALSE;
+
     if (pPix->drawable.bitsPerPixel == 24)
         RADEON_FALLBACK(("24bpp unsupported\n"));
     if (!R600CheckBPP(pPix->drawable.bitsPerPixel))
@@ -713,6 +715,8 @@ R600PrepareCopy(PixmapPtr pSrc,   PixmapPtr pDst,
     RADEONInfoPtr info = RADEONPTR(pScrn);
     struct radeon_accel_state *accel_state = info->accel_state;
 
+    return FALSE;
+
     if (pSrc->drawable.bitsPerPixel == 24)
         RADEON_FALLBACK(("24bpp unsupported\n"));
     if (pDst->drawable.bitsPerPixel == 24)
@@ -1575,6 +1579,7 @@ static Bool R600PrepareComposite(int op, PicturePtr pSrcPicture,
     cb_config_t cb_conf;
     shader_config_t vs_conf, ps_conf;
 
+    return FALSE;
     /* return FALSE; */
 
     if (pDst->drawable.bitsPerPixel < 8 || pSrc->drawable.bitsPerPixel < 8)
diff --git a/src/radeon_kms.c b/src/radeon_kms.c
index cd398c6..faa0cfd 100644
--- a/src/radeon_kms.c
+++ b/src/radeon_kms.c
@@ -180,7 +180,7 @@ static Bool RADEONPreInitAccel_KMS(ScrnInfoPtr pScrn)
 	xf86DrvMsg(pScrn->scrnIndex, X_ERROR, "Unable to allocate accel_state rec!\n");
 	return FALSE;
     }
-
+#if 0
     if (info->ChipFamily >= CHIP_FAMILY_R600) {
 	xf86DrvMsg(pScrn->scrnIndex, X_INFO,
 		   "Using shadowfb for KMS on R600+\n");
@@ -189,7 +189,7 @@ static Bool RADEONPreInitAccel_KMS(ScrnInfoPtr pScrn)
 	    info->r600_shadow_fb = FALSE;
 	return TRUE;
     }
-
+#endif
 
     if ((info->ChipFamily == CHIP_FAMILY_RS100) ||
 	(info->ChipFamily == CHIP_FAMILY_RS200) ||
commit 69ec7a35e2a0a3d802ec093a6aab2d7ed2cc88be
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Tue Aug 25 16:14:02 2009 -0400

    r6xx/r7xx: first pass at kms accel support
    
    Adapted from various patches from Dave and Jerome.

diff --git a/src/r600_exa.c b/src/r600_exa.c
index 555748b..03d3d8c 100644
--- a/src/r600_exa.c
+++ b/src/r600_exa.c
@@ -133,7 +133,15 @@ R600PrepareSolid(PixmapPtr pPix, int alu, Pixel pm, Pixel fg)
     if (!R600CheckBPP(pPix->drawable.bitsPerPixel))
         RADEON_FALLBACK(("R600CheckDatatype failed\n"));
 
-    accel_state->dst_mc_addr = exaGetPixmapOffset(pPix) + info->fbLocation + pScrn->fbOffset;
+#if defined(XF86DRM_MODE)
+    if (info->cs) {
+	accel_state->dst_mc_addr = 0;
+	accel_state->dst_bo = radeon_get_pixmap_bo(pPix);
+	accel_state->src_bo[0] = NULL;
+	accel_state->src_bo[1] = NULL;
+    } else
+#endif
+	accel_state->dst_mc_addr = exaGetPixmapOffset(pPix) + info->fbLocation + pScrn->fbOffset;
     accel_state->dst_size = exaGetPixmapPitch(pPix) * pPix->drawable.height;
     accel_state->dst_pitch = exaGetPixmapPitch(pPix) / (pPix->drawable.bitsPerPixel / 8);
 
@@ -156,10 +164,15 @@ R600PrepareSolid(PixmapPtr pPix, int alu, Pixel pm, Pixel fg)
 	   pPix->drawable.bitsPerPixel, exaGetPixmapPitch(pPix));
 #endif
 
-    accel_state->ib = RADEONCPGetBuffer(pScrn);
-    r600_vb_get(pScrn);
+    r600_cp_start(pScrn);
+    if (!r600_vb_get(pScrn))
+	RADEON_FALLBACK(("Can't get VB\n"));
 
     /* Init */
+#if defined(XF86DRM_MODE)
+    if (info->cs)
+	accel_state->XInited3D = FALSE;
+#endif
     start_3d(pScrn, accel_state->ib);
 
     set_default_state(pScrn, accel_state->ib);
@@ -168,10 +181,18 @@ R600PrepareSolid(PixmapPtr pPix, int alu, Pixel pm, Pixel fg)
     set_screen_scissor(pScrn, accel_state->ib, 0, 0, pPix->drawable.width, pPix->drawable.height);
     set_window_scissor(pScrn, accel_state->ib, 0, 0, pPix->drawable.width, pPix->drawable.height);
 
-    accel_state->vs_mc_addr = info->fbLocation + pScrn->fbOffset + accel_state->shaders->offset +
-	accel_state->solid_vs_offset;
-    accel_state->ps_mc_addr = info->fbLocation + pScrn->fbOffset + accel_state->shaders->offset +
-	accel_state->solid_ps_offset;
+#if defined(XF86DRM_MODE)
+    if (info->cs) {
+	accel_state->vs_mc_addr = accel_state->solid_vs_offset;
+	accel_state->ps_mc_addr = accel_state->solid_ps_offset;
+    } else
+#endif
+    {
+	accel_state->vs_mc_addr = info->fbLocation + pScrn->fbOffset + accel_state->shaders->offset +
+	    accel_state->solid_vs_offset;
+	accel_state->ps_mc_addr = info->fbLocation + pScrn->fbOffset + accel_state->shaders->offset +
+	    accel_state->solid_ps_offset;
+    }
     accel_state->vs_size = 512;
     accel_state->ps_size = 512;
 
@@ -179,16 +200,19 @@ R600PrepareSolid(PixmapPtr pPix, int alu, Pixel pm, Pixel fg)
 
     /* flush SQ cache */
     cp_set_surface_sync(pScrn, accel_state->ib, SH_ACTION_ENA_bit,
-			accel_state->vs_size, accel_state->vs_mc_addr);
+			accel_state->vs_size, accel_state->vs_mc_addr,
+			accel_state->shaders_bo, RADEON_GEM_DOMAIN_VRAM, 0);
 
     vs_conf.shader_addr         = accel_state->vs_mc_addr;
     vs_conf.num_gprs            = 2;
     vs_conf.stack_size          = 0;
+    vs_conf.bo                  = accel_state->shaders_bo;
     vs_setup                    (pScrn, accel_state->ib, &vs_conf);
 
     /* flush SQ cache */
     cp_set_surface_sync(pScrn, accel_state->ib, SH_ACTION_ENA_bit,
-			accel_state->ps_size, accel_state->ps_mc_addr);
+			accel_state->ps_size, accel_state->ps_mc_addr,
+			accel_state->shaders_bo, RADEON_GEM_DOMAIN_VRAM, 0);
 
     ps_conf.shader_addr         = accel_state->ps_mc_addr;
     ps_conf.num_gprs            = 1;
@@ -196,6 +220,7 @@ R600PrepareSolid(PixmapPtr pPix, int alu, Pixel pm, Pixel fg)
     ps_conf.uncached_first_inst = 1;
     ps_conf.clamp_consts        = 0;
     ps_conf.export_mode         = 2;
+    ps_conf.bo                  = accel_state->shaders_bo;
     ps_setup                    (pScrn, accel_state->ib, &ps_conf);
 
     /* Render setup */
@@ -216,6 +241,7 @@ R600PrepareSolid(PixmapPtr pPix, int alu, Pixel pm, Pixel fg)
     cb_conf.w = accel_state->dst_pitch;
     cb_conf.h = pPix->drawable.height;
     cb_conf.base = accel_state->dst_mc_addr;
+    cb_conf.bo = accel_state->dst_bo;
 
     if (pPix->drawable.bitsPerPixel == 8) {
 	cb_conf.format = COLOR_8;
@@ -296,8 +322,9 @@ R600Solid(PixmapPtr pPix, int x1, int y1, int x2, int y2)
 
     if (((accel_state->vb_index + 3) * 8) > accel_state->vb_total) {
         R600DoneSolid(pPix);
-        accel_state->ib = RADEONCPGetBuffer(pScrn);
-        r600_vb_get(pScrn);
+	r600_cp_start(pScrn);
+	if (!r600_vb_get(pScrn))
+	    return;
     }
 
     vb = (pointer)((char*)accel_state->vb_ptr+accel_state->vb_index*8);
@@ -342,10 +369,12 @@ R600DoneSolid(PixmapPtr pPix)
 	(info->ChipFamily == CHIP_FAMILY_RS880) ||
 	(info->ChipFamily == CHIP_FAMILY_RV710))
 	cp_set_surface_sync(pScrn, accel_state->ib, TC_ACTION_ENA_bit,
-			    accel_state->vb_size, accel_state->vb_mc_addr);
+			    accel_state->vb_size, accel_state->vb_mc_addr,
+			    accel_state->vb_bo, RADEON_GEM_DOMAIN_GTT, 0);
     else
 	cp_set_surface_sync(pScrn, accel_state->ib, VC_ACTION_ENA_bit,
-			    accel_state->vb_size, accel_state->vb_mc_addr);
+			    accel_state->vb_size, accel_state->vb_mc_addr,
+			    accel_state->vb_bo, RADEON_GEM_DOMAIN_GTT, 0);
 
     /* Vertex buffer setup */
     vtx_res.id              = SQ_VTX_RESOURCE_vs;
@@ -353,6 +382,7 @@ R600DoneSolid(PixmapPtr pPix)
     vtx_res.vtx_num_entries = accel_state->vb_size / 4;
     vtx_res.mem_req_size    = 1;
     vtx_res.vb_addr         = accel_state->vb_mc_addr;
+    vtx_res.bo              = accel_state->vb_bo;
     set_vtx_resource        (pScrn, accel_state->ib, &vtx_res);
 
     /* Draw */
@@ -368,15 +398,22 @@ R600DoneSolid(PixmapPtr pPix)
 
     /* sync dst surface */
     cp_set_surface_sync(pScrn, accel_state->ib, (CB_ACTION_ENA_bit | CB0_DEST_BASE_ENA_bit),
-			accel_state->dst_size, accel_state->dst_mc_addr);
+			accel_state->dst_size, accel_state->dst_mc_addr,
+			accel_state->dst_bo, RADEON_GEM_DOMAIN_VRAM, 0);
 
     R600CPFlushIndirect(pScrn, accel_state->ib);
+
+    accel_state->src_bo[0] = NULL;
+    accel_state->src_bo[1] = NULL;
+    accel_state->dst_bo = NULL;
 }
 
 static void
 R600DoPrepareCopy(ScrnInfoPtr pScrn,
-		  int src_pitch, int src_width, int src_height, uint32_t src_offset, int src_bpp,
-		  int dst_pitch, int dst_width, int dst_height, uint32_t dst_offset, int dst_bpp,
+		  int src_pitch, int src_width, int src_height,
+		  uint32_t src_offset, struct radeon_bo *src_bo, int src_bpp,
+		  int dst_pitch, int dst_width, int dst_height,
+		  uint32_t dst_offset, struct radeon_bo *dst_bo, int dst_bpp,
 		  int rop, Pixel planemask)
 {
     RADEONInfoPtr info = RADEONPTR(pScrn);
@@ -393,8 +430,25 @@ R600DoPrepareCopy(ScrnInfoPtr pScrn,
     CLEAR (vs_conf);
     CLEAR (ps_conf);
 
-    accel_state->ib = RADEONCPGetBuffer(pScrn);
-    r600_vb_get(pScrn);
+    accel_state->src_size[0] = src_pitch * src_height * (src_bpp/8);
+    accel_state->src_mc_addr[0] = src_offset;
+    accel_state->src_pitch[0] = src_pitch;
+    accel_state->src_width[0] = src_width;
+    accel_state->src_height[0] = src_height;
+    accel_state->src_bpp[0] = src_bpp;
+    accel_state->src_bo[0] = src_bo;
+    accel_state->src_bo[1] = NULL;
+
+    accel_state->dst_size = dst_pitch * dst_height * (dst_bpp/8);
+    accel_state->dst_mc_addr = dst_offset;
+    accel_state->dst_pitch = dst_pitch;
+    accel_state->dst_height = dst_height;
+    accel_state->dst_bpp = dst_bpp;
+    accel_state->dst_bo = dst_bo;
+
+    r600_cp_start(pScrn);
+    if (!r600_vb_get(pScrn))
+	return;
 
     /* Init */
     start_3d(pScrn, accel_state->ib);
@@ -405,10 +459,18 @@ R600DoPrepareCopy(ScrnInfoPtr pScrn,
     set_screen_scissor(pScrn, accel_state->ib, 0, 0, dst_width, dst_height);
     set_window_scissor(pScrn, accel_state->ib, 0, 0, dst_width, dst_height);
 
-    accel_state->vs_mc_addr = info->fbLocation + pScrn->fbOffset + accel_state->shaders->offset +
-	accel_state->copy_vs_offset;
-    accel_state->ps_mc_addr = info->fbLocation + pScrn->fbOffset + accel_state->shaders->offset +
-	accel_state->copy_ps_offset;
+#if defined(XF86DRM_MODE)
+    if (info->cs) {
+	accel_state->vs_mc_addr = accel_state->copy_vs_offset;
+	accel_state->ps_mc_addr = accel_state->copy_ps_offset;
+    } else
+#endif
+    {
+	accel_state->vs_mc_addr = info->fbLocation + pScrn->fbOffset + accel_state->shaders->offset +
+	    accel_state->copy_vs_offset;
+	accel_state->ps_mc_addr = info->fbLocation + pScrn->fbOffset + accel_state->shaders->offset +
+	    accel_state->copy_ps_offset;
+    }
     accel_state->vs_size = 512;
     accel_state->ps_size = 512;
 
@@ -416,16 +478,19 @@ R600DoPrepareCopy(ScrnInfoPtr pScrn,
 
     /* flush SQ cache */
     cp_set_surface_sync(pScrn, accel_state->ib, SH_ACTION_ENA_bit,
-			accel_state->vs_size, accel_state->vs_mc_addr);
+			accel_state->vs_size, accel_state->vs_mc_addr,
+			accel_state->shaders_bo, RADEON_GEM_DOMAIN_VRAM, 0);
 
     vs_conf.shader_addr         = accel_state->vs_mc_addr;
     vs_conf.num_gprs            = 2;
     vs_conf.stack_size          = 0;
+    vs_conf.bo                  = accel_state->shaders_bo;
     vs_setup                    (pScrn, accel_state->ib, &vs_conf);
 
     /* flush SQ cache */
     cp_set_surface_sync(pScrn, accel_state->ib, SH_ACTION_ENA_bit,
-			accel_state->ps_size, accel_state->ps_mc_addr);
+			accel_state->ps_size, accel_state->ps_mc_addr,
+			accel_state->shaders_bo, RADEON_GEM_DOMAIN_VRAM, 0);
 
     ps_conf.shader_addr         = accel_state->ps_mc_addr;
     ps_conf.num_gprs            = 1;
@@ -433,18 +498,13 @@ R600DoPrepareCopy(ScrnInfoPtr pScrn,
     ps_conf.uncached_first_inst = 1;
     ps_conf.clamp_consts        = 0;
     ps_conf.export_mode         = 2;
+    ps_conf.bo                  = accel_state->shaders_bo;
     ps_setup                    (pScrn, accel_state->ib, &ps_conf);
 
-    accel_state->src_size[0] = src_pitch * src_height * (src_bpp/8);
-    accel_state->src_mc_addr[0] = src_offset;
-    accel_state->src_pitch[0] = src_pitch;
-    accel_state->src_width[0] = src_width;
-    accel_state->src_height[0] = src_height;
-    accel_state->src_bpp[0] = src_bpp;
-
     /* flush texture cache */
     cp_set_surface_sync(pScrn, accel_state->ib, TC_ACTION_ENA_bit,
-			accel_state->src_size[0], accel_state->src_mc_addr[0]);
+			accel_state->src_size[0], accel_state->src_mc_addr[0],
+			accel_state->src_bo[0], RADEON_GEM_DOMAIN_VRAM, 0);
 
     /* Texture */
     tex_res.id                  = 0;
@@ -455,6 +515,8 @@ R600DoPrepareCopy(ScrnInfoPtr pScrn,
     tex_res.dim                 = SQ_TEX_DIM_2D;
     tex_res.base                = accel_state->src_mc_addr[0];
     tex_res.mip_base            = accel_state->src_mc_addr[0];
+    tex_res.bo                  = accel_state->src_bo[0];
+    tex_res.mip_bo              = accel_state->src_bo[0];
     if (src_bpp == 8) {
 	tex_res.format              = FMT_8;
 	tex_res.dst_sel_x           = SQ_SEL_1; /* R */
@@ -506,16 +568,11 @@ R600DoPrepareCopy(ScrnInfoPtr pScrn,
     EREG(accel_state->ib, CB_COLOR_CONTROL,                    RADEON_ROP[rop]);
     END_BATCH();
 
-    accel_state->dst_size = dst_pitch * dst_height * (dst_bpp/8);
-    accel_state->dst_mc_addr = dst_offset;
-    accel_state->dst_pitch = dst_pitch;
-    accel_state->dst_height = dst_height;
-    accel_state->dst_bpp = dst_bpp;
-
     cb_conf.id = 0;
     cb_conf.w = accel_state->dst_pitch;
     cb_conf.h = dst_height;
     cb_conf.base = accel_state->dst_mc_addr;
+    cb_conf.bo = accel_state->dst_bo;
     if (dst_bpp == 8) {
 	cb_conf.format = COLOR_8;
 	cb_conf.comp_swap = 3; /* A */
@@ -575,10 +632,12 @@ R600DoCopy(ScrnInfoPtr pScrn)
 	(info->ChipFamily == CHIP_FAMILY_RS880) ||
 	(info->ChipFamily == CHIP_FAMILY_RV710))
 	cp_set_surface_sync(pScrn, accel_state->ib, TC_ACTION_ENA_bit,
-			    accel_state->vb_size, accel_state->vb_mc_addr);
+			    accel_state->vb_size, accel_state->vb_mc_addr,
+			    accel_state->vb_bo, RADEON_GEM_DOMAIN_GTT, 0);
     else
 	cp_set_surface_sync(pScrn, accel_state->ib, VC_ACTION_ENA_bit,
-			    accel_state->vb_size, accel_state->vb_mc_addr);
+			    accel_state->vb_size, accel_state->vb_mc_addr,
+			    accel_state->vb_bo, RADEON_GEM_DOMAIN_GTT, 0);
 
     /* Vertex buffer setup */
     vtx_res.id              = SQ_VTX_RESOURCE_vs;
@@ -586,6 +645,7 @@ R600DoCopy(ScrnInfoPtr pScrn)
     vtx_res.vtx_num_entries = accel_state->vb_size / 4;
     vtx_res.mem_req_size    = 1;
     vtx_res.vb_addr         = accel_state->vb_mc_addr;
+    vtx_res.bo              = accel_state->vb_bo;
     set_vtx_resource        (pScrn, accel_state->ib, &vtx_res);
 
     draw_conf.prim_type          = DI_PT_RECTLIST;
@@ -600,7 +660,8 @@ R600DoCopy(ScrnInfoPtr pScrn)
 
     /* sync dst surface */
     cp_set_surface_sync(pScrn, accel_state->ib, (CB_ACTION_ENA_bit | CB0_DEST_BASE_ENA_bit),
-			accel_state->dst_size, accel_state->dst_mc_addr);
+			accel_state->dst_size, accel_state->dst_mc_addr,
+			accel_state->dst_bo, 0, RADEON_GEM_DOMAIN_VRAM);
 
     R600CPFlushIndirect(pScrn, accel_state->ib);
 }
@@ -617,8 +678,9 @@ R600AppendCopyVertex(ScrnInfoPtr pScrn,
 
     if (((accel_state->vb_index + 3) * 16) > accel_state->vb_total) {
         R600DoCopy(pScrn);
-        accel_state->ib = RADEONCPGetBuffer(pScrn);
-        r600_vb_get(pScrn);
+	r600_cp_start(pScrn);
+	if (!r600_vb_get(pScrn))
+	    return;
     }
 
     vb = (pointer)((char*)accel_state->vb_ptr+accel_state->vb_index*16);
@@ -663,8 +725,19 @@ R600PrepareCopy(PixmapPtr pSrc,   PixmapPtr pDst,
     accel_state->dst_pitch = exaGetPixmapPitch(pDst) / (pDst->drawable.bitsPerPixel / 8);
     accel_state->src_pitch[0] = exaGetPixmapPitch(pSrc) / (pSrc->drawable.bitsPerPixel / 8);
 
-    accel_state->src_mc_addr[0] = exaGetPixmapOffset(pSrc) + info->fbLocation + pScrn->fbOffset;
-    accel_state->dst_mc_addr = exaGetPixmapOffset(pDst) + info->fbLocation + pScrn->fbOffset;
+#if defined(XF86DRM_MODE)
+    if (info->cs) {
+	accel_state->src_mc_addr[0] = 0;
+	accel_state->dst_mc_addr = 0;
+	accel_state->src_bo[0] = radeon_get_pixmap_bo(pSrc);
+	accel_state->src_bo[1] = NULL;
+	accel_state->dst_bo = radeon_get_pixmap_bo(pDst);
+    } else
+#endif
+    {
+	accel_state->src_mc_addr[0] = exaGetPixmapOffset(pSrc) + info->fbLocation + pScrn->fbOffset;
+	accel_state->dst_mc_addr = exaGetPixmapOffset(pDst) + info->fbLocation + pScrn->fbOffset;
+    }
 
     accel_state->src_width[0] = pSrc->drawable.width;
     accel_state->src_height[0] = pSrc->drawable.height;
@@ -701,19 +774,46 @@ R600PrepareCopy(PixmapPtr pSrc,   PixmapPtr pDst,
 	unsigned long size = pDst->drawable.height * accel_state->dst_pitch * pDst->drawable.bitsPerPixel/8;
 	accel_state->same_surface = TRUE;
 
-	if (accel_state->copy_area) {
-	    exaOffscreenFree(pDst->drawable.pScreen, accel_state->copy_area);
-	    accel_state->copy_area = NULL;
+#if defined(XF86DRM_MODE)
+	if (info->cs) {
+	    if (accel_state->copy_area_bo) {
+		radeon_bo_unref(accel_state->copy_area_bo);
+		accel_state->copy_area_bo = NULL;
+	    }
+	    accel_state->copy_area_bo = radeon_bo_open(info->bufmgr, 0, size,
+						       4096,
+						       RADEON_GEM_DOMAIN_VRAM,
+						       0);
+	    if (accel_state->copy_area_bo == NULL) {
+		R600IBDiscard(pScrn, accel_state->ib);
+		return FALSE;
+	    }
+	    radeon_cs_space_add_persistent_bo(info->cs, accel_state->copy_area_bo,
+					      RADEON_GEM_DOMAIN_VRAM, 0);
+	    if (radeon_cs_space_check(info->cs)) {
+		radeon_bo_unref(accel_state->copy_area_bo);
+		accel_state->copy_area_bo = NULL;
+		R600IBDiscard(pScrn, accel_state->ib);
+		return FALSE;
+	    }
+	    accel_state->copy_area = (void*)accel_state->copy_area_bo;
+	} else
+#endif
+	{
+	    if (accel_state->copy_area) {
+		exaOffscreenFree(pDst->drawable.pScreen, accel_state->copy_area);
+		accel_state->copy_area = NULL;
+	    }
+	    accel_state->copy_area = exaOffscreenAlloc(pDst->drawable.pScreen, size, 256, TRUE, NULL, NULL);
 	}
-	accel_state->copy_area = exaOffscreenAlloc(pDst->drawable.pScreen, size, 256, TRUE, NULL, NULL);
     } else {
 	accel_state->same_surface = FALSE;
 
 	R600DoPrepareCopy(pScrn,
 			  accel_state->src_pitch[0], pSrc->drawable.width, pSrc->drawable.height,
-			  accel_state->src_mc_addr[0], pSrc->drawable.bitsPerPixel,
+			  accel_state->src_mc_addr[0], accel_state->src_bo[0], pSrc->drawable.bitsPerPixel,
 			  accel_state->dst_pitch, pDst->drawable.width, pDst->drawable.height,
-			  accel_state->dst_mc_addr, pDst->drawable.bitsPerPixel,
+			  accel_state->dst_mc_addr, accel_state->dst_bo, pDst->drawable.bitsPerPixel,
 			  rop, planemask);
 
     }
@@ -745,6 +845,17 @@ R600OverlapCopy(PixmapPtr pDst,
     uint32_t dst_pitch = exaGetPixmapPitch(pDst) / (pDst->drawable.bitsPerPixel / 8);
     uint32_t dst_offset = exaGetPixmapOffset(pDst) + info->fbLocation + pScrn->fbOffset;
     int i, hchunk, vchunk;
+    struct radeon_bo *dst_bo = NULL;
+
+#if defined(XF86DRM_MODE)
+    if (info->cs) {
+	dst_offset = 0;
+	dst_bo = radeon_get_pixmap_bo(pDst);
+	radeon_cs_space_add_persistent_bo(info->cs, dst_bo,
+					  RADEON_GEM_DOMAIN_VRAM, 0);
+	radeon_cs_space_check(info->cs);
+    }
+#endif
 
     if (is_overlap(srcX, srcX + w, srcY, srcY + h,
 		   dstX, dstX + w, dstY, dstY + h)) {
@@ -760,8 +871,10 @@ R600OverlapCopy(PixmapPtr pDst,
             if ((w / hchunk) <= (h / vchunk)) { /* reduce to horizontal  */
                 if (srcY > dstY ) { /* diagonal up */
                     R600DoPrepareCopy(pScrn,
-                                      dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
-                                      dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
+                                      dst_pitch, pDst->drawable.width, pDst->drawable.height,
+				      dst_offset, dst_bo, pDst->drawable.bitsPerPixel,
+                                      dst_pitch, pDst->drawable.width, pDst->drawable.height,
+				      dst_offset, dst_bo, pDst->drawable.bitsPerPixel,
                                       accel_state->rop, accel_state->planemask);
                     R600AppendCopyVertex(pScrn, srcX, srcY, dstX, dstY, w, vchunk);
                     R600DoCopy(pScrn);
@@ -770,8 +883,10 @@ R600OverlapCopy(PixmapPtr pDst,
                     dstY = dstY + vchunk;
                 } else { /* diagonal down */
                     R600DoPrepareCopy(pScrn,
-                                      dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
-                                      dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
+                                      dst_pitch, pDst->drawable.width, pDst->drawable.height,
+				      dst_offset, dst_bo, pDst->drawable.bitsPerPixel,
+                                      dst_pitch, pDst->drawable.width, pDst->drawable.height,
+				      dst_offset, dst_bo, pDst->drawable.bitsPerPixel,
                                       accel_state->rop, accel_state->planemask);
                     R600AppendCopyVertex(pScrn, srcX, srcY + h - vchunk, dstX, dstY + h - vchunk, w, vchunk);
                     R600DoCopy(pScrn);
@@ -781,8 +896,10 @@ R600OverlapCopy(PixmapPtr pDst,
             } else { /* reduce to vertical */
                 if (srcX > dstX ) { /* diagonal left */
                     R600DoPrepareCopy(pScrn,
-                                      dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
-                                      dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
+                                      dst_pitch, pDst->drawable.width, pDst->drawable.height,
+				      dst_offset, dst_bo, pDst->drawable.bitsPerPixel,
+                                      dst_pitch, pDst->drawable.width, pDst->drawable.height,
+				      dst_offset, dst_bo, pDst->drawable.bitsPerPixel,
                                       accel_state->rop, accel_state->planemask);
                     R600AppendCopyVertex(pScrn, srcX, srcY, dstX, dstY, hchunk, h);
                     R600DoCopy(pScrn);
@@ -791,8 +908,10 @@ R600OverlapCopy(PixmapPtr pDst,
                     dstX = dstX + hchunk;
                 } else { /* diagonal right */
                     R600DoPrepareCopy(pScrn,
-                                      dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
-                                      dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
+                                      dst_pitch, pDst->drawable.width, pDst->drawable.height,
+				      dst_offset, dst_bo, pDst->drawable.bitsPerPixel,
+                                      dst_pitch, pDst->drawable.width, pDst->drawable.height,
+				      dst_offset, dst_bo, pDst->drawable.bitsPerPixel,
                                       accel_state->rop, accel_state->planemask);
                     R600AppendCopyVertex(pScrn, srcX + w - hchunk, srcY, dstX + w - hchunk, dstY, hchunk, h);
                     R600DoCopy(pScrn);
@@ -807,8 +926,10 @@ R600OverlapCopy(PixmapPtr pDst,
 		/* copy right to left */
 		for (i = w; i > 0; i -= hchunk) {
 		    R600DoPrepareCopy(pScrn,
-				      dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
-				      dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
+				      dst_pitch, pDst->drawable.width, pDst->drawable.height,
+				      dst_offset, dst_bo, pDst->drawable.bitsPerPixel,
+				      dst_pitch, pDst->drawable.width, pDst->drawable.height,
+				      dst_offset, dst_bo, pDst->drawable.bitsPerPixel,
 				      accel_state->rop, accel_state->planemask);
 		    R600AppendCopyVertex(pScrn, srcX + i - hchunk, srcY, dstX + i - hchunk, dstY, hchunk, h);
 		    R600DoCopy(pScrn);
@@ -817,8 +938,10 @@ R600OverlapCopy(PixmapPtr pDst,
 		/* copy left to right */
 		for (i = 0; i < w; i += hchunk) {
 		    R600DoPrepareCopy(pScrn,
-				      dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
-				      dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
+				      dst_pitch, pDst->drawable.width, pDst->drawable.height,
+				      dst_offset, dst_bo, pDst->drawable.bitsPerPixel,
+				      dst_pitch, pDst->drawable.width, pDst->drawable.height,
+				      dst_offset, dst_bo, pDst->drawable.bitsPerPixel,
 				      accel_state->rop, accel_state->planemask);
 
 		    R600AppendCopyVertex(pScrn, srcX + i, srcY, dstX + i, dstY, hchunk, h);
@@ -830,8 +953,10 @@ R600OverlapCopy(PixmapPtr pDst,
 		/* copy top to bottom */
                 for (i = 0; i < h; i += vchunk) {
                     R600DoPrepareCopy(pScrn,
-                                      dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
-                                      dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
+                                      dst_pitch, pDst->drawable.width, pDst->drawable.height,
+				      dst_offset, dst_bo, pDst->drawable.bitsPerPixel,
+                                      dst_pitch, pDst->drawable.width, pDst->drawable.height,
+				      dst_offset, dst_bo, pDst->drawable.bitsPerPixel,
                                       accel_state->rop, accel_state->planemask);
 
                     if (vchunk > h - i) vchunk = h - i;
@@ -842,8 +967,10 @@ R600OverlapCopy(PixmapPtr pDst,
 		/* copy bottom to top */
                 for (i = h; i > 0; i -= vchunk) {
                     R600DoPrepareCopy(pScrn,
-                                      dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
-                                      dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
+                                      dst_pitch, pDst->drawable.width, pDst->drawable.height,
+				      dst_offset, dst_bo, pDst->drawable.bitsPerPixel,
+                                      dst_pitch, pDst->drawable.width, pDst->drawable.height,
+				      dst_offset, dst_bo, pDst->drawable.bitsPerPixel,
                                       accel_state->rop, accel_state->planemask);
 
                     if (vchunk > i) vchunk = i;
@@ -854,8 +981,10 @@ R600OverlapCopy(PixmapPtr pDst,
 	}
     } else {
 	R600DoPrepareCopy(pScrn,
-			  dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
-			  dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
+			  dst_pitch, pDst->drawable.width, pDst->drawable.height,
+			  dst_offset, dst_bo, pDst->drawable.bitsPerPixel,
+			  dst_pitch, pDst->drawable.width, pDst->drawable.height,
+			  dst_offset, dst_bo, pDst->drawable.bitsPerPixel,
 			  accel_state->rop, accel_state->planemask);
 
 	R600AppendCopyVertex(pScrn, srcX, srcY, dstX, dstY, w, h);
@@ -872,27 +1001,45 @@ R600Copy(PixmapPtr pDst,
     ScrnInfoPtr pScrn = xf86Screens[pDst->drawable.pScreen->myNum];
     RADEONInfoPtr info = RADEONPTR(pScrn);
     struct radeon_accel_state *accel_state = info->accel_state;
+    struct radeon_bo *bo = NULL;
 
     if (accel_state->same_surface && (srcX == dstX) && (srcY == dstY))
 	return;
 
-    if (accel_state->same_surface && is_overlap(srcX, srcX + w, srcY, srcY + h, dstX, dstX + w, dstY, dstY + h)) {
+#if defined(XF86DRM_MODE)
+    if (info->cs)
+	bo = radeon_get_pixmap_bo(pDst);
+#endif
+
+    if (accel_state->same_surface &&
+	is_overlap(srcX, srcX + w, srcY, srcY + h, dstX, dstX + w, dstY, dstY + h)) {
 	if (accel_state->copy_area) {
 	    uint32_t pitch = exaGetPixmapPitch(pDst) / (pDst->drawable.bitsPerPixel / 8);
 	    uint32_t orig_offset, tmp_offset;
 
-	    tmp_offset = accel_state->copy_area->offset + info->fbLocation + pScrn->fbOffset;
-	    orig_offset = exaGetPixmapOffset(pDst) + info->fbLocation + pScrn->fbOffset;
-
+#if defined(XF86DRM_MODE)
+	    if (info->cs) {
+		tmp_offset = 0;
+		orig_offset = 0;
+	    } else
+#endif
+	    {
+		tmp_offset = accel_state->copy_area->offset + info->fbLocation + pScrn->fbOffset;
+		orig_offset = exaGetPixmapOffset(pDst) + info->fbLocation + pScrn->fbOffset;
+	    }
 	    R600DoPrepareCopy(pScrn,
-			      pitch, pDst->drawable.width, pDst->drawable.height, orig_offset, pDst->drawable.bitsPerPixel,
-			      pitch, pDst->drawable.width, pDst->drawable.height, tmp_offset, pDst->drawable.bitsPerPixel,
+			      pitch, pDst->drawable.width, pDst->drawable.height,
+			      orig_offset, bo, pDst->drawable.bitsPerPixel,
+			      pitch, pDst->drawable.width, pDst->drawable.height,
+			      tmp_offset, accel_state->copy_area_bo, pDst->drawable.bitsPerPixel,
 			      accel_state->rop, accel_state->planemask);
 	    R600AppendCopyVertex(pScrn, srcX, srcY, dstX, dstY, w, h);
 	    R600DoCopy(pScrn);
 	    R600DoPrepareCopy(pScrn,
-			      pitch, pDst->drawable.width, pDst->drawable.height, tmp_offset, pDst->drawable.bitsPerPixel,
-			      pitch, pDst->drawable.width, pDst->drawable.height, orig_offset, pDst->drawable.bitsPerPixel,
+			      pitch, pDst->drawable.width, pDst->drawable.height,
+			      tmp_offset, accel_state->copy_area_bo, pDst->drawable.bitsPerPixel,
+			      pitch, pDst->drawable.width, pDst->drawable.height,
+			      orig_offset, bo, pDst->drawable.bitsPerPixel,
 			      accel_state->rop, accel_state->planemask);
 	    R600AppendCopyVertex(pScrn, dstX, dstY, dstX, dstY, w, h);
 	    R600DoCopy(pScrn);
@@ -900,11 +1047,20 @@ R600Copy(PixmapPtr pDst,
 	    R600OverlapCopy(pDst, srcX, srcY, dstX, dstY, w, h);
     } else if (accel_state->same_surface) {
 	uint32_t pitch = exaGetPixmapPitch(pDst) / (pDst->drawable.bitsPerPixel / 8);
-	uint32_t offset = exaGetPixmapOffset(pDst) + info->fbLocation + pScrn->fbOffset;
+	uint32_t offset;
+
+#if defined(XF86DRM_MODE)
+	    if (info->cs)
+		offset = 0;
+	    else
+#endif
+		offset = exaGetPixmapOffset(pDst) + info->fbLocation + pScrn->fbOffset;
 
 	R600DoPrepareCopy(pScrn,
-			  pitch, pDst->drawable.width, pDst->drawable.height, offset, pDst->drawable.bitsPerPixel,
-			  pitch, pDst->drawable.width, pDst->drawable.height, offset, pDst->drawable.bitsPerPixel,
+			  pitch, pDst->drawable.width, pDst->drawable.height,
+			  offset, bo, pDst->drawable.bitsPerPixel,
+			  pitch, pDst->drawable.width, pDst->drawable.height,
+			  offset, bo, pDst->drawable.bitsPerPixel,
 			  accel_state->rop, accel_state->planemask);
 	R600AppendCopyVertex(pScrn, srcX, srcY, dstX, dstY, w, h);
 	R600DoCopy(pScrn);
@@ -925,10 +1081,13 @@ R600DoneCopy(PixmapPtr pDst)
 	R600DoCopy(pScrn);
 
     if (accel_state->copy_area) {
-	exaOffscreenFree(pDst->drawable.pScreen, accel_state->copy_area);
+	if (!info->cs)
+	    exaOffscreenFree(pDst->drawable.pScreen, accel_state->copy_area);
 	accel_state->copy_area = NULL;
     }
-
+    accel_state->src_bo[0] = NULL;
+    accel_state->src_bo[1] = NULL;
+    accel_state->dst_bo = NULL;
 }
 
 
@@ -1103,7 +1262,16 @@ static Bool R600TextureSetup(PicturePtr pPict, PixmapPtr pPix,
     CLEAR (tex_res);
     CLEAR (tex_samp);
 
-    accel_state->src_mc_addr[unit] = exaGetPixmapOffset(pPix) + info->fbLocation + pScrn->fbOffset;
+#if defined(XF86DRM_MODE)
+    if (info->cs) {
+	accel_state->src_mc_addr[unit] = 0;
+	accel_state->src_bo[unit] = radeon_get_pixmap_bo(pPix);
+	radeon_cs_space_add_persistent_bo(info->cs, accel_state->src_bo[unit],
+					  RADEON_GEM_DOMAIN_VRAM, 0);
+	radeon_cs_space_check(info->cs);
+    } else
+#endif
+	accel_state->src_mc_addr[unit] = exaGetPixmapOffset(pPix) + info->fbLocation + pScrn->fbOffset;
     accel_state->src_pitch[unit] = exaGetPixmapPitch(pPix) / (pPix->drawable.bitsPerPixel / 8);
     accel_state->src_size[unit] = exaGetPixmapPitch(pPix) * pPix->drawable.height;
 
@@ -1122,7 +1290,8 @@ static Bool R600TextureSetup(PicturePtr pPict, PixmapPtr pPix,
 
     /* flush texture cache */
     cp_set_surface_sync(pScrn, accel_state->ib, TC_ACTION_ENA_bit,
-			accel_state->src_size[unit], accel_state->src_mc_addr[unit]);
+			accel_state->src_size[unit], accel_state->src_mc_addr[unit],
+			accel_state->src_bo[unit], RADEON_GEM_DOMAIN_VRAM, 0);
 
     /* Texture */
     tex_res.id                  = unit;
@@ -1134,6 +1303,8 @@ static Bool R600TextureSetup(PicturePtr pPict, PixmapPtr pPix,
     tex_res.base                = accel_state->src_mc_addr[unit];
     tex_res.mip_base            = accel_state->src_mc_addr[unit];
     tex_res.format              = R600TexFormats[i].card_fmt;
+    tex_res.bo                  = accel_state->src_bo[unit];
+    tex_res.mip_bo              = accel_state->src_bo[unit];
     tex_res.request_size        = 1;
 
     /* component swizzles */
@@ -1406,6 +1577,9 @@ static Bool R600PrepareComposite(int op, PicturePtr pSrcPicture,
 
     /* return FALSE; */
 
+    if (pDst->drawable.bitsPerPixel < 8 || pSrc->drawable.bitsPerPixel < 8)
+	return FALSE;
+
     if (pMask) {
 	accel_state->msk_pic = pMaskPicture;
 	if (pMaskPicture->componentAlpha) {
@@ -1424,7 +1598,13 @@ static Bool R600PrepareComposite(int op, PicturePtr pSrcPicture,
 	accel_state->src_alpha = FALSE;
     }
 
-    accel_state->dst_mc_addr = exaGetPixmapOffset(pDst) + info->fbLocation + pScrn->fbOffset;
+#if defined(XF86DRM_MODE)
+    if (info->cs) {
+	accel_state->dst_mc_addr = 0;
+	accel_state->dst_bo = radeon_get_pixmap_bo(pDst);
+    } else
+#endif
+	accel_state->dst_mc_addr = exaGetPixmapOffset(pDst) + info->fbLocation + pScrn->fbOffset;
     accel_state->dst_pitch = exaGetPixmapPitch(pDst) / (pDst->drawable.bitsPerPixel / 8);
     accel_state->dst_size = exaGetPixmapPitch(pDst) * pDst->drawable.height;
 
@@ -1441,8 +1621,9 @@ static Bool R600PrepareComposite(int op, PicturePtr pSrcPicture,
     CLEAR (vs_conf);
     CLEAR (ps_conf);
 
-    accel_state->ib = RADEONCPGetBuffer(pScrn);
-    r600_vb_get(pScrn);
+    r600_cp_start(pScrn);
+    if (!r600_vb_get(pScrn))
+	RADEON_FALLBACK(("Can't get VB\n"));
 
     /* Init */
     start_3d(pScrn, accel_state->ib);
@@ -1470,16 +1651,31 @@ static Bool R600PrepareComposite(int op, PicturePtr pSrcPicture,
 
     if (pMask) {
 	set_bool_consts(pScrn, accel_state->ib, SQ_BOOL_CONST_vs, (1 << 0));
-	accel_state->ps_mc_addr = info->fbLocation + pScrn->fbOffset + accel_state->shaders->offset +
-	    accel_state->comp_mask_ps_offset;
+#if defined(XF86DRM_MODE)
+	if (info->cs)
+	    accel_state->ps_mc_addr = accel_state->comp_mask_ps_offset;
+	else
+#endif
+	    accel_state->ps_mc_addr = info->fbLocation + pScrn->fbOffset + accel_state->shaders->offset +
+		accel_state->comp_mask_ps_offset;
     } else {
 	set_bool_consts(pScrn, accel_state->ib, SQ_BOOL_CONST_vs, (0 << 0));
-	accel_state->ps_mc_addr = info->fbLocation + pScrn->fbOffset + accel_state->shaders->offset +
+#if defined(XF86DRM_MODE)
+	if (info->cs)
+	    accel_state->ps_mc_addr = accel_state->comp_ps_offset;
+	else
+#endif
+	    accel_state->ps_mc_addr = info->fbLocation + pScrn->fbOffset + accel_state->shaders->offset +
 	    accel_state->comp_ps_offset;
     }
 
-    accel_state->vs_mc_addr = info->fbLocation + pScrn->fbOffset + accel_state->shaders->offset +
-	accel_state->comp_vs_offset;
+#if defined(XF86DRM_MODE)
+    if (info->cs)
+	accel_state->vs_mc_addr = accel_state->comp_vs_offset;
+    else
+#endif
+	accel_state->vs_mc_addr = info->fbLocation + pScrn->fbOffset + accel_state->shaders->offset +
+	    accel_state->comp_vs_offset;
 
     accel_state->vs_size = 512;
     accel_state->ps_size = 512;
@@ -1488,16 +1684,19 @@ static Bool R600PrepareComposite(int op, PicturePtr pSrcPicture,
 
     /* flush SQ cache */
     cp_set_surface_sync(pScrn, accel_state->ib, SH_ACTION_ENA_bit,
-			accel_state->vs_size, accel_state->vs_mc_addr);
+			accel_state->vs_size, accel_state->vs_mc_addr,
+			accel_state->shaders_bo, RADEON_GEM_DOMAIN_VRAM, 0);
 
     vs_conf.shader_addr         = accel_state->vs_mc_addr;
     vs_conf.num_gprs            = 3;
     vs_conf.stack_size          = 1;
+    vs_conf.bo                  = accel_state->shaders_bo;
     vs_setup                    (pScrn, accel_state->ib, &vs_conf);
 
     /* flush SQ cache */
     cp_set_surface_sync(pScrn, accel_state->ib, SH_ACTION_ENA_bit,
-			accel_state->ps_size, accel_state->ps_mc_addr);
+			accel_state->ps_size, accel_state->ps_mc_addr,
+			accel_state->shaders_bo, RADEON_GEM_DOMAIN_VRAM, 0);
 
     ps_conf.shader_addr         = accel_state->ps_mc_addr;
     ps_conf.num_gprs            = 3;
@@ -1505,6 +1704,7 @@ static Bool R600PrepareComposite(int op, PicturePtr pSrcPicture,
     ps_conf.uncached_first_inst = 1;
     ps_conf.clamp_consts        = 0;
     ps_conf.export_mode         = 2;
+    ps_conf.bo                  = accel_state->shaders_bo;
     ps_setup                    (pScrn, accel_state->ib, &ps_conf);
 
     BEGIN_BATCH(12);
@@ -1529,6 +1729,7 @@ static Bool R600PrepareComposite(int op, PicturePtr pSrcPicture,
     cb_conf.h = pDst->drawable.height;
     cb_conf.base = accel_state->dst_mc_addr;
     cb_conf.format = dst_format;
+    cb_conf.bo = accel_state->dst_bo;
 
     switch (pDstPicture->format) {
     case PICT_a8r8g8b8:
@@ -1611,8 +1812,9 @@ static void R600Composite(PixmapPtr pDst,
 
         if (((accel_state->vb_index + 3) * 24) > accel_state->vb_total) {
             R600DoneComposite(pDst);
-            accel_state->ib = RADEONCPGetBuffer(pScrn);
-            r600_vb_get(pScrn);
+	    r600_cp_start(pScrn);
+	    if (!r600_vb_get(pScrn))
+		return;
         }
 
         vb = (pointer)((char*)accel_state->vb_ptr+accel_state->vb_index*24);
@@ -1650,8 +1852,9 @@ static void R600Composite(PixmapPtr pDst,
     } else {
         if (((accel_state->vb_index + 3) * 16) > accel_state->vb_total) {
             R600DoneComposite(pDst);
-            accel_state->ib = RADEONCPGetBuffer(pScrn);
-            r600_vb_get(pScrn);
+	    r600_cp_start(pScrn);
+	    if (!r600_vb_get(pScrn))
+		return;
         }
 
         vb = (pointer)((char*)accel_state->vb_ptr+accel_state->vb_index*16);
@@ -1701,6 +1904,7 @@ static void R600DoneComposite(PixmapPtr pDst)
 	vtx_res.vtx_num_entries = accel_state->vb_size / 4;
 	vtx_res.mem_req_size    = 1;
 	vtx_res.vb_addr         = accel_state->vb_mc_addr;
+	vtx_res.bo              = accel_state->vb_bo;
     } else {
 	accel_state->vb_size = accel_state->vb_index * 16;
 	vtx_res.id              = SQ_VTX_RESOURCE_vs;
@@ -1708,6 +1912,7 @@ static void R600DoneComposite(PixmapPtr pDst)
 	vtx_res.vtx_num_entries = accel_state->vb_size / 4;
 	vtx_res.mem_req_size    = 1;
 	vtx_res.vb_addr         = accel_state->vb_mc_addr;
+	vtx_res.bo              = accel_state->vb_bo;
     }
     /* flush vertex cache */
     if ((info->ChipFamily == CHIP_FAMILY_RV610) ||
@@ -1716,12 +1921,14 @@ static void R600DoneComposite(PixmapPtr pDst)
 	(info->ChipFamily == CHIP_FAMILY_RS880) ||
 	(info->ChipFamily == CHIP_FAMILY_RV710))
 	cp_set_surface_sync(pScrn, accel_state->ib, TC_ACTION_ENA_bit,
-			    accel_state->vb_size, accel_state->vb_mc_addr);
+			    accel_state->vb_size, accel_state->vb_mc_addr,
+			    accel_state->vb_bo, RADEON_GEM_DOMAIN_GTT, 0);
     else
 	cp_set_surface_sync(pScrn, accel_state->ib, VC_ACTION_ENA_bit,
-			    accel_state->vb_size, accel_state->vb_mc_addr);
+			    accel_state->vb_size, accel_state->vb_mc_addr,
+			    accel_state->vb_bo, RADEON_GEM_DOMAIN_GTT, 0);
 
-    set_vtx_resource        (pScrn, accel_state->ib, &vtx_res);
+    set_vtx_resource(pScrn, accel_state->ib, &vtx_res);
 
     draw_conf.prim_type          = DI_PT_RECTLIST;
     draw_conf.vgt_draw_initiator = DI_SRC_SEL_AUTO_INDEX;
@@ -1734,9 +1941,14 @@ static void R600DoneComposite(PixmapPtr pDst)
     wait_3d_idle_clean(pScrn, accel_state->ib);
 
     cp_set_surface_sync(pScrn, accel_state->ib, (CB_ACTION_ENA_bit | CB0_DEST_BASE_ENA_bit),
-			accel_state->dst_size, accel_state->dst_mc_addr);
+			accel_state->dst_size, accel_state->dst_mc_addr,
+			accel_state->dst_bo, RADEON_GEM_DOMAIN_VRAM, 0);
 
     R600CPFlushIndirect(pScrn, accel_state->ib);
+
+    accel_state->src_bo[0] = NULL;
+    accel_state->src_bo[1] = NULL;
+    accel_state->dst_bo = NULL;
 }
 
 Bool
@@ -1753,6 +1965,7 @@ R600CopyToVRAM(ScrnInfoPtr pScrn,
     int scratch_offset = 0, hpass, temph;
     char *dst;
     drmBufPtr scratch;
+    struct radeon_bo *bo = NULL;
 
     if (dst_pitch & 7)
 	return FALSE;
@@ -1795,8 +2008,10 @@ R600CopyToVRAM(ScrnInfoPtr pScrn,
 	}
 	/* blit from scratch to vram */
 	R600DoPrepareCopy(pScrn,
-			  scratch_pitch, w, oldhpass, offset, bpp,
-			  dst_pitch, dst_width, dst_height, dst_mc_addr, bpp,
+			  scratch_pitch, w, oldhpass,
+			  offset, bo, bpp,
+			  dst_pitch, dst_width, dst_height,
+			  dst_mc_addr, bo, bpp,
 			  3, 0xffffffff);
 	R600AppendCopyVertex(pScrn, 0, 0, x, y, w, oldhpass);
 	R600DoCopy(pScrn);
@@ -1842,6 +2057,7 @@ R600DownloadFromScreen(PixmapPtr pSrc, int x, int y, int w, int h,
     uint32_t scratch_pitch = scratch_pitch_bytes / (bpp / 8);
     int wpass = w * (bpp/8);
     drmBufPtr scratch;
+    struct radeon_bo *bo = NULL;
 
     /* RV740 seems to be particularly problematic with small xfers */
     if ((info->ChipFamily == CHIP_FAMILY_RV740) && (w < 32 || h < 32))
@@ -1859,8 +2075,10 @@ R600DownloadFromScreen(PixmapPtr pSrc, int x, int y, int w, int h,
 
     /* blit from vram to scratch */
     R600DoPrepareCopy(pScrn,
-		      src_pitch, src_width, src_height, src_mc_addr, bpp,
-		      scratch_pitch, src_width, hpass, scratch_mc_addr, bpp,
+		      src_pitch, src_width, src_height,
+		      src_mc_addr, bo, bpp,
+		      scratch_pitch, src_width, hpass,
+		      scratch_mc_addr, bo, bpp,
 		      3, 0xffffffff);
     R600AppendCopyVertex(pScrn, x, y, 0, 0, w, hpass);
     R600DoCopy(pScrn);
@@ -1876,8 +2094,10 @@ R600DownloadFromScreen(PixmapPtr pSrc, int x, int y, int w, int h,
 	    scratch_offset = scratch->total/2 - scratch_offset;
 	    /* blit from vram to scratch */
 	    R600DoPrepareCopy(pScrn,
-			      src_pitch, src_width, src_height, src_mc_addr, bpp,
-			      scratch_pitch, src_width, hpass, scratch_mc_addr + scratch_offset, bpp,
+			      src_pitch, src_width, src_height,
+			      src_mc_addr, bo, bpp,
+			      scratch_pitch, src_width, hpass,
+			      scratch_mc_addr + scratch_offset, bo, bpp,
 			      3, 0xffffffff);
 	    R600AppendCopyVertex(pScrn, x, y, 0, 0, w, hpass);
 	    R600DoCopy(pScrn);
@@ -1919,7 +2139,12 @@ R600Sync(ScreenPtr pScreen, int marker)
     struct radeon_accel_state *accel_state = info->accel_state;
 
     if (accel_state->exaMarkerSynced != marker) {
-	RADEONWaitForIdleCP(pScrn);
+#ifdef XF86DRM_MODE
+#if (EXA_VERSION_MAJOR == 2 && EXA_VERSION_MINOR >= 4)
+	if (!info->cs)
+#endif
+#endif
+	    RADEONWaitForIdleCP(pScrn);
 	accel_state->exaMarkerSynced = marker;
     }
 
@@ -1936,11 +2161,27 @@ R600AllocShaders(ScrnInfoPtr pScrn, ScreenPtr pScreen)
 
     accel_state->shaders = NULL;
 
-    accel_state->shaders = exaOffscreenAlloc(pScreen, size, 256,
-					     TRUE, NULL, NULL);
+#ifdef XF86DRM_MODE
+#if (EXA_VERSION_MAJOR == 2 && EXA_VERSION_MINOR >= 4)
+    if (info->cs) {
+	accel_state->shaders_bo = radeon_bo_open(info->bufmgr, 0, size, 4096,
+						 RADEON_GEM_DOMAIN_VRAM, 0);
+	if (accel_state->shaders_bo == NULL) {
+	    ErrorF("Allocating shader failed\n");
+	    return FALSE;
+	}
+	return TRUE;
+    } else
+#endif
+#endif
+    {
+	accel_state->shaders = exaOffscreenAlloc(pScreen, size, 256,
+						 TRUE, NULL, NULL);
+
+	if (accel_state->shaders == NULL)
+	    return FALSE;
+    }
 
-    if (accel_state->shaders == NULL)
-	return FALSE;
     return TRUE;
 }
 
@@ -1951,8 +2192,21 @@ R600LoadShaders(ScrnInfoPtr pScrn)
     struct radeon_accel_state *accel_state = info->accel_state;
     RADEONChipFamily ChipSet = info->ChipFamily;
     uint32_t *shader;
-
-    shader = (pointer)((char *)info->FB + accel_state->shaders->offset);
+#ifdef XF86DRM_MODE
+#if (EXA_VERSION_MAJOR == 2 && EXA_VERSION_MINOR >= 4)
+    int ret;
+
+    if (info->cs) {
+	ret = radeon_bo_map(accel_state->shaders_bo, 1);
+	if (ret) {
+	    FatalError("failed to map shader %d\n", ret);
+	    return FALSE;
+	}
+	shader = accel_state->shaders_bo->ptr;
+    } else
+#endif
+#endif
+	shader = (pointer)((char *)info->FB + accel_state->shaders->offset);
 
     /*  solid vs --------------------------------------- */
     accel_state->solid_vs_offset = 0;
@@ -1990,6 +2244,14 @@ R600LoadShaders(ScrnInfoPtr pScrn)
     accel_state->xv_ps_offset = 4096;
     R600_xv_ps(ChipSet, shader + accel_state->xv_ps_offset / 4);
 
+#ifdef XF86DRM_MODE
+#if (EXA_VERSION_MAJOR == 2 && EXA_VERSION_MINOR >= 4)
+    if (info->cs) {
+	radeon_bo_unmap(accel_state->shaders_bo);
+    }
+#endif
+#endif
+
     return TRUE;
 }
 
@@ -2018,7 +2280,6 @@ R600FinishAccess(PixmapPtr pPix, int index)
 
 }
 
-
 Bool
 R600DrawInit(ScreenPtr pScreen)
 {
@@ -2044,13 +2305,28 @@ R600DrawInit(ScreenPtr pScreen)
     info->accel_state->exa->MarkSync = R600MarkSync;
     info->accel_state->exa->WaitMarker = R600Sync;
 
-    info->accel_state->exa->PrepareAccess = R600PrepareAccess;
-    info->accel_state->exa->FinishAccess = R600FinishAccess;
-
-    /* AGP seems to have problems with gart transfers */
-    if (info->accelDFS) {
-	info->accel_state->exa->UploadToScreen = R600UploadToScreen;
-	info->accel_state->exa->DownloadFromScreen = R600DownloadFromScreen;
+#ifdef XF86DRM_MODE
+#if (EXA_VERSION_MAJOR == 2 && EXA_VERSION_MINOR >= 4)
+    if (info->cs) {
+	info->accel_state->exa->CreatePixmap = RADEONEXACreatePixmap;
+	info->accel_state->exa->DestroyPixmap = RADEONEXADestroyPixmap;
+	info->accel_state->exa->PixmapIsOffscreen = RADEONEXAPixmapIsOffscreen;
+	info->accel_state->exa->PrepareAccess = RADEONPrepareAccess_CS;
+	info->accel_state->exa->FinishAccess = RADEONFinishAccess_CS;
+	info->accel_state->exa->UploadToScreen = NULL;
+	info->accel_state->exa->DownloadFromScreen = NULL;
+    } else
+#endif
+#endif
+    {
+	info->accel_state->exa->PrepareAccess = R600PrepareAccess;
+	info->accel_state->exa->FinishAccess = R600FinishAccess;
+
+	/* AGP seems to have problems with gart transfers */
+	if (info->accelDFS) {
+	    info->accel_state->exa->UploadToScreen = R600UploadToScreen;
+	    info->accel_state->exa->DownloadFromScreen = R600DownloadFromScreen;
+	}
     }
 
     info->accel_state->exa->flags = EXA_OFFSCREEN_PIXMAPS;
@@ -2083,8 +2359,13 @@ R600DrawInit(ScreenPtr pScreen)
 	return FALSE;
     }
 
-    if (!info->gartLocation)
-	return FALSE;
+#ifdef XF86DRM_MODE
+#if (EXA_VERSION_MAJOR == 2 && EXA_VERSION_MINOR >= 4)
+    if (!info->cs)
+#endif
+#endif
+	if (!info->gartLocation)
+	    return FALSE;
 
     info->accel_state->XInited3D = FALSE;
     info->accel_state->copy_area = NULL;
diff --git a/src/r600_state.h b/src/r600_state.h
index 10b1022..6ca88cf 100644
--- a/src/r600_state.h
+++ b/src/r600_state.h
@@ -50,6 +50,7 @@ typedef struct {
     int round_mode;
     int tile_compact;
     int source_format;
+    struct radeon_bo *bo;
 } cb_config_t;
 
 /* Depth buffer */
@@ -63,6 +64,7 @@ typedef struct {
     int tile_surface_en;
     int tile_compact;
     int zrange_precision;
+    struct radeon_bo *bo;
 } db_config_t;
 
 /* Shader */
@@ -79,6 +81,7 @@ typedef struct {
     int clamp_consts;
     int export_mode;
     int uncached_first_inst;
+    struct radeon_bo *bo;
 } shader_config_t;
 
 /* Vertex buffer / vtx resource */
@@ -94,6 +97,7 @@ typedef struct {
     int srf_mode_all;
     int endian;
     int mem_req_size;
+    struct radeon_bo *bo;
 } vtx_resource_t;
 
 /* Texture resource */
@@ -129,6 +133,8 @@ typedef struct {
     int mpeg_clamp;
     int perf_modulation;
     int interlaced;
+    struct radeon_bo *bo;
+    struct radeon_bo *mip_bo;
 } tex_resource_t;
 
 /* Texture sampler */
@@ -170,15 +176,43 @@ typedef struct {
     uint32_t num_indices;
 } draw_config_t;
 
+#if defined(XF86DRM_MODE)
+#define BEGIN_BATCH(n)				\
+do {					\
+    if (info->cs)			\
+	radeon_ddx_cs_start(pScrn, (n), __FILE__, __func__, __LINE__);	\
+} while(0)
+#define END_BATCH()				\
+do {					\
+    if (info->cs)			\
+	radeon_cs_end(info->cs, __FILE__, __func__, __LINE__);	\
+} while(0)
+#define RELOC_BATCH(bo, rd, wd)					\
+do {								\
+    if (info->cs)							\
+	OUT_RING_RELOC((bo), (rd), (wd));				\
+} while(0)
+#define E32(ib, dword)                                                  \
+do {                                                                    \
+    if (info->cs)							\
+	radeon_cs_write_dword(info->cs, (dword));			\
+    else {								\
+	uint32_t *ib_head = (pointer)(char*)(ib)->address;		\
+	ib_head[(ib)->used >> 2] = (dword);				\
+	(ib)->used += 4;						\
+    }									\
+} while (0)
+#else
 #define BEGIN_BATCH(n) do {} while(0)
 #define END_BATCH() do {} while(0)
-
+#define RELOC_BATCH(bo, rd, wd) do {} while(0) /* no-op without XF86DRM_MODE; args match the CS variant */
 #define E32(ib, dword)                                                  \
 do {                                                                    \
     uint32_t *ib_head = (pointer)(char*)(ib)->address;			\
     ib_head[(ib)->used >> 2] = (dword);					\
     (ib)->used += 4;							\
 } while (0)
+#endif
 
 #define EFLOAT(ib, val)							\
 do {								        \
@@ -246,7 +280,8 @@ start_3d(ScrnInfoPtr pScrn, drmBufPtr ib);
 void
 set_render_target(ScrnInfoPtr pScrn, drmBufPtr ib, cb_config_t *cb_conf);
 void
-cp_set_surface_sync(ScrnInfoPtr pScrn, drmBufPtr ib, uint32_t sync_type, uint32_t size, uint64_t mc_addr);
+cp_set_surface_sync(ScrnInfoPtr pScrn, drmBufPtr ib, uint32_t sync_type, uint32_t size, uint64_t mc_addr,
+		    struct radeon_bo *bo, uint32_t rdomains, uint32_t wdomain);
 void
 cp_wait_vline_sync(ScrnInfoPtr pScrn, drmBufPtr ib, PixmapPtr pPix, int crtc, int start, int stop);
 void
@@ -282,9 +317,18 @@ draw_immd(ScrnInfoPtr pScrn, drmBufPtr ib, draw_config_t *draw_conf, uint32_t *i
 void
 draw_auto(ScrnInfoPtr pScrn, drmBufPtr ib, draw_config_t *draw_conf);
 
-void
+Bool
 r600_vb_get(ScrnInfoPtr pScrn);
 void
 r600_vb_discard(ScrnInfoPtr pScrn);
+int
+r600_cp_start(ScrnInfoPtr pScrn);
+
+extern Bool RADEONPrepareAccess_CS(PixmapPtr pPix, int index);
+extern void RADEONFinishAccess_CS(PixmapPtr pPix, int index);
+extern void *RADEONEXACreatePixmap(ScreenPtr pScreen, int size, int align);
+extern void RADEONEXADestroyPixmap(ScreenPtr pScreen, void *driverPriv);
+extern struct radeon_bo *radeon_get_pixmap_bo(PixmapPtr pPix);
+extern Bool RADEONEXAPixmapIsOffscreen(PixmapPtr pPix);
 
 #endif
diff --git a/src/r600_textured_videofuncs.c b/src/r600_textured_videofuncs.c
index 6739616..9cbfea4 100644
--- a/src/r600_textured_videofuncs.c
+++ b/src/r600_textured_videofuncs.c
@@ -80,10 +80,12 @@ R600DoneTexturedVideo(ScrnInfoPtr pScrn)
 	(info->ChipFamily == CHIP_FAMILY_RS880) ||
 	(info->ChipFamily == CHIP_FAMILY_RV710))
 	cp_set_surface_sync(pScrn, accel_state->ib, TC_ACTION_ENA_bit,
-			    accel_state->vb_size, accel_state->vb_mc_addr);
+			    accel_state->vb_size, accel_state->vb_mc_addr,
+			    accel_state->vb_bo, RADEON_GEM_DOMAIN_GTT, 0);
     else
 	cp_set_surface_sync(pScrn, accel_state->ib, VC_ACTION_ENA_bit,
-			    accel_state->vb_size, accel_state->vb_mc_addr);
+			    accel_state->vb_size, accel_state->vb_mc_addr,
+			    accel_state->vb_bo, RADEON_GEM_DOMAIN_GTT, 0);
 
     /* Vertex buffer setup */
     vtx_res.id              = SQ_VTX_RESOURCE_vs;
@@ -91,6 +93,7 @@ R600DoneTexturedVideo(ScrnInfoPtr pScrn)
     vtx_res.vtx_num_entries = accel_state->vb_size / 4;
     vtx_res.mem_req_size    = 1;
     vtx_res.vb_addr         = accel_state->vb_mc_addr;
+    vtx_res.bo              = accel_state->vb_bo;
     set_vtx_resource        (pScrn, accel_state->ib, &vtx_res);
 
     draw_conf.prim_type          = DI_PT_RECTLIST;
@@ -105,9 +108,13 @@ R600DoneTexturedVideo(ScrnInfoPtr pScrn)
 
     /* sync destination surface */
     cp_set_surface_sync(pScrn, accel_state->ib, (CB_ACTION_ENA_bit | CB0_DEST_BASE_ENA_bit),
-			accel_state->dst_size, accel_state->dst_mc_addr);
+			accel_state->dst_size, accel_state->dst_mc_addr,
+			accel_state->dst_bo, 0, RADEON_GEM_DOMAIN_VRAM);
 
     R600CPFlushIndirect(pScrn, accel_state->ib);
+    accel_state->dst_bo = NULL;
+    accel_state->src_bo[0] = NULL;
+    accel_state->src_bo[1] = NULL;
 }
 
 void
@@ -216,6 +223,19 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
     CLEAR (vs_conf);
     CLEAR (ps_conf);
 
+#if defined(ACCEL_CP) && defined(XF86DRM_MODE)
+    if (info->cs) {
+	accel_state->dst_mc_addr = 0;
+	accel_state->src_mc_addr[0] = 0;
+	accel_state->src_bo[0] = pPriv->src_bo;
+	accel_state->src_bo[1] = NULL;
+	accel_state->dst_bo = radeon_get_pixmap_bo(pPixmap);
+    } else
+#endif
+    {
+	accel_state->dst_mc_addr = exaGetPixmapOffset(pPixmap) + info->fbLocation + pScrn->fbOffset;
+	accel_state->src_mc_addr[0] = pPriv->src_offset + info->fbLocation + pScrn->fbOffset;
+    }
     accel_state->dst_pitch = exaGetPixmapPitch(pPixmap) / (pPixmap->drawable.bitsPerPixel / 8);
     accel_state->src_pitch[0] = pPriv->src_pitch;
 
@@ -233,8 +253,9 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
     dstyoff = 0;
 #endif
 
-    accel_state->ib = RADEONCPGetBuffer(pScrn);
-    r600_vb_get(pScrn);
+    r600_cp_start(pScrn);
+    if (!r600_vb_get(pScrn))
+	return;
 
     /* Init */
     start_3d(pScrn, accel_state->ib);
@@ -245,11 +266,18 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
     set_screen_scissor(pScrn, accel_state->ib, 0, 0, pPixmap->drawable.width, pPixmap->drawable.height);
     set_window_scissor(pScrn, accel_state->ib, 0, 0, pPixmap->drawable.width, pPixmap->drawable.height);
 
-    accel_state->vs_mc_addr = info->fbLocation + pScrn->fbOffset + accel_state->shaders->offset +
-	accel_state->xv_vs_offset;
-
-    accel_state->ps_mc_addr = info->fbLocation + pScrn->fbOffset + accel_state->shaders->offset +
-	accel_state->xv_ps_offset;
+#if defined(ACCEL_CP) && defined(XF86DRM_MODE)
+    if (info->cs) {
+	accel_state->vs_mc_addr = accel_state->xv_vs_offset;
+	accel_state->ps_mc_addr = accel_state->xv_ps_offset;
+    } else
+#endif
+    {
+	accel_state->vs_mc_addr = info->fbLocation + pScrn->fbOffset + accel_state->shaders->offset +
+	    accel_state->xv_vs_offset;
+	accel_state->ps_mc_addr = info->fbLocation + pScrn->fbOffset + accel_state->shaders->offset +
+	    accel_state->xv_ps_offset;
+    }
 
     /* PS bool constant */
     switch(pPriv->id) {
@@ -271,16 +299,19 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 
     /* flush SQ cache */
     cp_set_surface_sync(pScrn, accel_state->ib, SH_ACTION_ENA_bit,
-			accel_state->vs_size, accel_state->vs_mc_addr);
+			accel_state->vs_size, accel_state->vs_mc_addr,
+			accel_state->shaders_bo, RADEON_GEM_DOMAIN_VRAM, 0);
 
     vs_conf.shader_addr         = accel_state->vs_mc_addr;
     vs_conf.num_gprs            = 2;
     vs_conf.stack_size          = 0;
+    vs_conf.bo                  = accel_state->shaders_bo;
     vs_setup                    (pScrn, accel_state->ib, &vs_conf);
 
     /* flush SQ cache */
     cp_set_surface_sync(pScrn, accel_state->ib, SH_ACTION_ENA_bit,
-			accel_state->ps_size, accel_state->ps_mc_addr);
+			accel_state->ps_size, accel_state->ps_mc_addr,
+			accel_state->shaders_bo, RADEON_GEM_DOMAIN_VRAM, 0);
 
     ps_conf.shader_addr         = accel_state->ps_mc_addr;
     ps_conf.num_gprs            = 3;
@@ -288,6 +319,7 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
     ps_conf.uncached_first_inst = 1;
     ps_conf.clamp_consts        = 0;
     ps_conf.export_mode         = 2;
+    ps_conf.bo                  = accel_state->shaders_bo;
     ps_setup                    (pScrn, accel_state->ib, &ps_conf);
 
     /* PS alu constants */
@@ -298,12 +330,12 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
     switch(pPriv->id) {
     case FOURCC_YV12:
     case FOURCC_I420:
-	accel_state->src_mc_addr[0] = pPriv->src_offset + info->fbLocation + pScrn->fbOffset;
 	accel_state->src_size[0] = accel_state->src_pitch[0] * pPriv->h;
 
 	/* flush texture cache */
 	cp_set_surface_sync(pScrn, accel_state->ib, TC_ACTION_ENA_bit, accel_state->src_size[0],
-			    accel_state->src_mc_addr[0]);
+			    accel_state->src_mc_addr[0],
+			    accel_state->src_bo[0], RADEON_GEM_DOMAIN_VRAM, 0);
 
 	/* Y texture */
 	tex_res.id                  = 0;
@@ -314,6 +346,8 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 	tex_res.dim                 = SQ_TEX_DIM_2D;
 	tex_res.base                = accel_state->src_mc_addr[0];
 	tex_res.mip_base            = accel_state->src_mc_addr[0];
+	tex_res.bo                  = accel_state->src_bo[0];
+	tex_res.mip_bo              = accel_state->src_bo[0];
 
 	tex_res.format              = FMT_8;
 	tex_res.dst_sel_x           = SQ_SEL_X; /* Y */
@@ -345,7 +379,8 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 	/* U or V texture */
 	cp_set_surface_sync(pScrn, accel_state->ib, TC_ACTION_ENA_bit,
 			    accel_state->src_size[0] / 4,
-			    accel_state->src_mc_addr[0] + pPriv->planev_offset);
+			    accel_state->src_mc_addr[0] + pPriv->planev_offset,
+			    accel_state->src_bo[0], RADEON_GEM_DOMAIN_VRAM, 0);
 
 	tex_res.id                  = 1;
 	tex_res.format              = FMT_8;
@@ -369,7 +404,8 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 	/* U or V texture */
 	cp_set_surface_sync(pScrn, accel_state->ib, TC_ACTION_ENA_bit,
 			    accel_state->src_size[0] / 4,
-			    accel_state->src_mc_addr[0] + pPriv->planeu_offset);
+			    accel_state->src_mc_addr[0] + pPriv->planeu_offset,
+			    accel_state->src_bo[0], RADEON_GEM_DOMAIN_VRAM, 0);
 
 	tex_res.id                  = 2;
 	tex_res.format              = FMT_8;
@@ -393,12 +429,12 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
     case FOURCC_UYVY:
     case FOURCC_YUY2:
     default:
-	accel_state->src_mc_addr[0] = pPriv->src_offset + info->fbLocation + pScrn->fbOffset;
 	accel_state->src_size[0] = accel_state->src_pitch[0] * pPriv->h;
 
 	/* flush texture cache */
 	cp_set_surface_sync(pScrn, accel_state->ib, TC_ACTION_ENA_bit, accel_state->src_size[0],
-			    accel_state->src_mc_addr[0]);
+			    accel_state->src_mc_addr[0],
+			    accel_state->src_bo[0], RADEON_GEM_DOMAIN_VRAM, 0);
 
 	/* Y texture */
 	tex_res.id                  = 0;
@@ -409,6 +445,8 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 	tex_res.dim                 = SQ_TEX_DIM_2D;
 	tex_res.base                = accel_state->src_mc_addr[0];
 	tex_res.mip_base            = accel_state->src_mc_addr[0];
+	tex_res.bo                  = accel_state->src_bo[0];
+	tex_res.mip_bo              = accel_state->src_bo[0];
 
 	tex_res.format              = FMT_8_8;
 	if (pPriv->id == FOURCC_UYVY)
@@ -474,12 +512,10 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
     END_BATCH();
 
     cb_conf.id = 0;
-
-    accel_state->dst_mc_addr = exaGetPixmapOffset(pPixmap) + info->fbLocation + pScrn->fbOffset;
-
     cb_conf.w = accel_state->dst_pitch;
     cb_conf.h = pPixmap->drawable.height;
     cb_conf.base = accel_state->dst_mc_addr;
+    cb_conf.bo = accel_state->dst_bo;
 
     switch (pPixmap->drawable.bitsPerPixel) {
     case 16:
diff --git a/src/r6xx_accel.c b/src/r6xx_accel.c
index 059c3cc..4aa4650 100644
--- a/src/r6xx_accel.c
+++ b/src/r6xx_accel.c
@@ -39,6 +39,21 @@
 
 #include "radeon_drm.h"
 
+void r600_cs_flush_indirect(ScrnInfoPtr pScrn)
+{
+    RADEONInfoPtr info = RADEONPTR(pScrn);
+    int ret;
+
+    if (!info->cs->cdw)
+	return;
+    radeon_cs_emit(info->cs);
+    radeon_cs_erase(info->cs);
+
+    ret = radeon_cs_space_check(info->cs);
+    if (ret)
+	ErrorF("space check failed in flush\n");
+}
+
 /* Flush the indirect buffer to the kernel for submission to the card */
 void R600CPFlushIndirect(ScrnInfoPtr pScrn, drmBufPtr ib)
 {
@@ -47,13 +62,20 @@ void R600CPFlushIndirect(ScrnInfoPtr pScrn, drmBufPtr ib)
     int                start  = 0;
     drm_radeon_indirect_t  indirect;
 
+#if defined(XF86DRM_MODE)
+    if (info->cs) {
+	r600_cs_flush_indirect(pScrn);
+	return;
+    }
+#endif
+
     if (!buffer) return;
 
     //xf86DrvMsg(pScrn->scrnIndex, X_INFO, "Flushing buffer %d\n",
     //       buffer->idx);
 
     while (buffer->used & 0x3c){
-	BEGIN_BATCH();
+	BEGIN_BATCH(1);
         E32(buffer, CP_PACKET2()); /* fill up to multiple of 16 dwords */
 	END_BATCH();
     }
@@ -72,6 +94,20 @@ void R600CPFlushIndirect(ScrnInfoPtr pScrn, drmBufPtr ib)
 
 void R600IBDiscard(ScrnInfoPtr pScrn, drmBufPtr ib)
 {
+#if defined(XF86DRM_MODE)
+    int ret;
+    RADEONInfoPtr info = RADEONPTR(pScrn);
+    if (info->cs) {
+	if (CS_FULL(info->cs)) {
+	    r600_cs_flush_indirect(pScrn);
+	    return;
+	}
+	radeon_cs_erase(info->cs);
+	ret = radeon_cs_space_check(info->cs);
+	if (ret)
+	    ErrorF("space check failed in flush\n");
+    }
+#endif
     if (!ib) return;
 
     ib->used = 0;
@@ -81,6 +117,7 @@ void R600IBDiscard(ScrnInfoPtr pScrn, drmBufPtr ib)
 void
 wait_3d_idle_clean(ScrnInfoPtr pScrn, drmBufPtr ib)
 {
+    RADEONInfoPtr info = RADEONPTR(pScrn);
 
     //flush caches, don't generate timestamp
     BEGIN_BATCH(5);
@@ -95,6 +132,8 @@ wait_3d_idle_clean(ScrnInfoPtr pScrn, drmBufPtr ib)
 void
 wait_3d_idle(ScrnInfoPtr pScrn, drmBufPtr ib)
 {
+    RADEONInfoPtr info = RADEONPTR(pScrn);
+
     BEGIN_BATCH(3);
     EREG(ib, WAIT_UNTIL,                          WAIT_3D_IDLE_bit);
     END_BATCH();
@@ -212,19 +251,19 @@ set_render_target(ScrnInfoPtr pScrn, drmBufPtr ib, cb_config_t *cb_conf)
     h = (cb_conf->h + 7) & ~7;
     slice = ((cb_conf->w * h) / 64) - 1;
 
-    if ((info->ChipFamily > CHIP_FAMILY_R600) &&
-	(info->ChipFamily < CHIP_FAMILY_RV770))
-	BEGIN_BATCH(23);
-    else
-	BEGIN_BATCH(21);
+    BEGIN_BATCH(3 + 2);
     EREG(ib, (CB_COLOR0_BASE + (4 * cb_conf->id)), (cb_conf->base >> 8));
+    RELOC_BATCH(cb_conf->bo, RADEON_GEM_DOMAIN_VRAM, 0);
+    END_BATCH();
 
     // rv6xx workaround
     if ((info->ChipFamily > CHIP_FAMILY_R600) &&
 	(info->ChipFamily < CHIP_FAMILY_RV770)) {
+	BEGIN_BATCH(20);
 	PACK3(ib, IT_SURFACE_BASE_UPDATE, 1);
 	E32(ib, (2 << cb_conf->id));
-    }
+    } else
+	BEGIN_BATCH(18);
 
     // pitch only for ARRAY_LINEAR_GENERAL, other tiling modes require addrlib
     EREG(ib, (CB_COLOR0_SIZE + (4 * cb_conf->id)), ((pitch << PITCH_TILE_MAX_shift)	|
@@ -240,20 +279,23 @@ set_render_target(ScrnInfoPtr pScrn, drmBufPtr ib, cb_config_t *cb_conf)
 }
 
 void
-cp_set_surface_sync(ScrnInfoPtr pScrn, drmBufPtr ib, uint32_t sync_type, uint32_t size, uint64_t mc_addr)
+cp_set_surface_sync(ScrnInfoPtr pScrn, drmBufPtr ib, uint32_t sync_type, uint32_t size, uint64_t mc_addr,
+		    struct radeon_bo *bo, uint32_t rdomains, uint32_t wdomain)
 {
+    RADEONInfoPtr info = RADEONPTR(pScrn);
     uint32_t cp_coher_size;
     if (size == 0xffffffff)
 	cp_coher_size = 0xffffffff;
     else
 	cp_coher_size = ((size + 255) >> 8);
 
-    BEGIN_BATCH(5);
+    BEGIN_BATCH(5 + 2);
     PACK3(ib, IT_SURFACE_SYNC, 4);
     E32(ib, sync_type);
     E32(ib, cp_coher_size);
     E32(ib, (mc_addr >> 8));
     E32(ib, 10); /* poll interval */
+    RELOC_BATCH(bo, rdomains, wdomain);
     END_BATCH();
 }
 
@@ -266,6 +308,12 @@ void cp_wait_vline_sync(ScrnInfoPtr pScrn, drmBufPtr ib, PixmapPtr pPix,
     uint32_t offset;
     RADEONCrtcPrivatePtr radeon_crtc;
 
+    //XXX FIXME: no vline wait is emitted when submitting through info->cs
+#if defined(XF86DRM_MODE)
+    if (info->cs)
+	return;
+#endif
+
     if ((crtc < 0) || (crtc > 1))
         return;
 
@@ -314,6 +362,7 @@ void cp_wait_vline_sync(ScrnInfoPtr pScrn, drmBufPtr ib, PixmapPtr pPix,
 void
 fs_setup(ScrnInfoPtr pScrn, drmBufPtr ib, shader_config_t *fs_conf)
 {
+    RADEONInfoPtr info = RADEONPTR(pScrn);
     uint32_t sq_pgm_resources;
 
     sq_pgm_resources = ((fs_conf->num_gprs << NUM_GPRS_shift) |
@@ -322,8 +371,12 @@ fs_setup(ScrnInfoPtr pScrn, drmBufPtr ib, shader_config_t *fs_conf)
     if (fs_conf->dx10_clamp)
 	sq_pgm_resources |= SQ_PGM_RESOURCES_FS__DX10_CLAMP_bit;
 
-    BEGIN_BATCH(9);
+    BEGIN_BATCH(3 + 2);
     EREG(ib, SQ_PGM_START_FS, fs_conf->shader_addr >> 8);
+    RELOC_BATCH(fs_conf->bo, RADEON_GEM_DOMAIN_VRAM, 0);
+    END_BATCH();
+
+    BEGIN_BATCH(6);
     EREG(ib, SQ_PGM_RESOURCES_FS, sq_pgm_resources);
     EREG(ib, SQ_PGM_CF_OFFSET_FS, 0);
     END_BATCH();
@@ -332,6 +385,7 @@ fs_setup(ScrnInfoPtr pScrn, drmBufPtr ib, shader_config_t *fs_conf)
 void
 vs_setup(ScrnInfoPtr pScrn, drmBufPtr ib, shader_config_t *vs_conf)
 {
+    RADEONInfoPtr info = RADEONPTR(pScrn);
     uint32_t sq_pgm_resources;
 
     sq_pgm_resources = ((vs_conf->num_gprs << NUM_GPRS_shift) |
@@ -344,8 +398,12 @@ vs_setup(ScrnInfoPtr pScrn, drmBufPtr ib, shader_config_t *vs_conf)
     if (vs_conf->uncached_first_inst)
 	sq_pgm_resources |= UNCACHED_FIRST_INST_bit;
 
-    BEGIN_BATCH(9);
+    BEGIN_BATCH(3 + 2);
     EREG(ib, SQ_PGM_START_VS, vs_conf->shader_addr >> 8);
+    RELOC_BATCH(vs_conf->bo, RADEON_GEM_DOMAIN_VRAM, 0);
+    END_BATCH();
+
+    BEGIN_BATCH(6);
     EREG(ib, SQ_PGM_RESOURCES_VS, sq_pgm_resources);
     EREG(ib, SQ_PGM_CF_OFFSET_VS, 0);
     END_BATCH();
@@ -354,6 +412,7 @@ vs_setup(ScrnInfoPtr pScrn, drmBufPtr ib, shader_config_t *vs_conf)
 void
 ps_setup(ScrnInfoPtr pScrn, drmBufPtr ib, shader_config_t *ps_conf)
 {
+    RADEONInfoPtr info = RADEONPTR(pScrn);
     uint32_t sq_pgm_resources;
 
     sq_pgm_resources = ((ps_conf->num_gprs << NUM_GPRS_shift) |
@@ -368,8 +427,12 @@ ps_setup(ScrnInfoPtr pScrn, drmBufPtr ib, shader_config_t *ps_conf)
     if (ps_conf->clamp_consts)
 	sq_pgm_resources |= CLAMP_CONSTS_bit;
 
-    BEGIN_BATCH(12);
+    BEGIN_BATCH(3 + 2);
     EREG(ib, SQ_PGM_START_PS, ps_conf->shader_addr >> 8);
+    RELOC_BATCH(ps_conf->bo, RADEON_GEM_DOMAIN_VRAM, 0);
+    END_BATCH();
+
+    BEGIN_BATCH(9);
     EREG(ib, SQ_PGM_RESOURCES_PS, sq_pgm_resources);
     EREG(ib, SQ_PGM_EXPORTS_PS, ps_conf->export_mode);
     EREG(ib, SQ_PGM_CF_OFFSET_PS, 0);
@@ -379,10 +442,11 @@ ps_setup(ScrnInfoPtr pScrn, drmBufPtr ib, shader_config_t *ps_conf)
 void
 set_alu_consts(ScrnInfoPtr pScrn, drmBufPtr ib, int offset, int count, float *const_buf)
 {
+    RADEONInfoPtr info = RADEONPTR(pScrn);
     int i;
     const int countreg = count * (SQ_ALU_CONSTANT_offset >> 2);
 
-    BEGIN_BATCH(2 + count_reg);
+    BEGIN_BATCH(2 + countreg);
     PACK0(ib, SQ_ALU_CONSTANT + offset * SQ_ALU_CONSTANT_offset, countreg);
     for (i = 0; i < countreg; i++)
 	EFLOAT(ib, const_buf[i]);
@@ -392,6 +456,7 @@ set_alu_consts(ScrnInfoPtr pScrn, drmBufPtr ib, int offset, int count, float *co
 void
 set_bool_consts(ScrnInfoPtr pScrn, drmBufPtr ib, int offset, uint32_t val)
 {
+    RADEONInfoPtr info = RADEONPTR(pScrn);
     /* bool register order is: ps, vs, gs; one register each
      * 1 bits per bool; 32 bools each for ps, vs, gs.
      */
@@ -403,6 +468,7 @@ set_bool_consts(ScrnInfoPtr pScrn, drmBufPtr ib, int offset, uint32_t val)
 void
 set_vtx_resource(ScrnInfoPtr pScrn, drmBufPtr ib, vtx_resource_t *res)
 {
+    RADEONInfoPtr info = RADEONPTR(pScrn);
     uint32_t sq_vtx_constant_word2;
 
     sq_vtx_constant_word2 = ((((res->vb_addr) >> 32) & BASE_ADDRESS_HI_mask) |
@@ -419,7 +485,7 @@ set_vtx_resource(ScrnInfoPtr pScrn, drmBufPtr ib, vtx_resource_t *res)
     if (res->srf_mode_all)
 	    sq_vtx_constant_word2 |= SQ_VTX_CONSTANT_WORD2_0__SRF_MODE_ALL_bit;
 
-    BEGIN_BATCH(9);
+    BEGIN_BATCH(9 + 2);
     PACK0(ib, SQ_VTX_RESOURCE + res->id * SQ_VTX_RESOURCE_offset, 7);
     E32(ib, res->vb_addr & 0xffffffff);				// 0: BASE_ADDRESS
     E32(ib, (res->vtx_num_entries << 2) - 1);			// 1: SIZE
@@ -428,12 +494,14 @@ set_vtx_resource(ScrnInfoPtr pScrn, drmBufPtr ib, vtx_resource_t *res)
     E32(ib, 0);							// 4: n/a
     E32(ib, 0);							// 5: n/a
     E32(ib, SQ_TEX_VTX_VALID_BUFFER << SQ_VTX_CONSTANT_WORD6_0__TYPE_shift);	// 6: TYPE
+    RELOC_BATCH(res->bo, RADEON_GEM_DOMAIN_VRAM, 0);
     END_BATCH();
 }
 
 void
 set_tex_resource(ScrnInfoPtr pScrn, drmBufPtr ib, tex_resource_t *tex_res)
 {
+    RADEONInfoPtr info = RADEONPTR(pScrn);
     uint32_t sq_tex_resource_word0, sq_tex_resource_word1, sq_tex_resource_word4;
     uint32_t sq_tex_resource_word5, sq_tex_resource_word6;
 
@@ -483,7 +551,7 @@ set_tex_resource(ScrnInfoPtr pScrn, drmBufPtr ib, tex_resource_t *tex_res)
     if (tex_res->interlaced)
 	sq_tex_resource_word6 |= INTERLACED_bit;
 
-    BEGIN_BATCH(9);
+    BEGIN_BATCH(9 + 4);
     PACK0(ib, SQ_TEX_RESOURCE + tex_res->id * SQ_TEX_RESOURCE_offset, 7);
     E32(ib, sq_tex_resource_word0);
     E32(ib, sq_tex_resource_word1);
@@ -492,12 +560,15 @@ set_tex_resource(ScrnInfoPtr pScrn, drmBufPtr ib, tex_resource_t *tex_res)
     E32(ib, sq_tex_resource_word4);
     E32(ib, sq_tex_resource_word5);
     E32(ib, sq_tex_resource_word6);
+    RELOC_BATCH(tex_res->bo, RADEON_GEM_DOMAIN_VRAM, 0);
+    RELOC_BATCH(tex_res->mip_bo, RADEON_GEM_DOMAIN_VRAM, 0);
     END_BATCH();
 }
 
 void
 set_tex_sampler (ScrnInfoPtr pScrn, drmBufPtr ib, tex_sampler_t *s)
 {
+    RADEONInfoPtr info = RADEONPTR(pScrn);
     uint32_t sq_tex_sampler_word0, sq_tex_sampler_word1, sq_tex_sampler_word2;
 
     sq_tex_sampler_word0 = ((s->clamp_x       << SQ_TEX_SAMPLER_WORD0_0__CLAMP_X_shift)		|
@@ -549,6 +620,8 @@ set_tex_sampler (ScrnInfoPtr pScrn, drmBufPtr ib, tex_sampler_t *s)
 void
 set_screen_scissor(ScrnInfoPtr pScrn, drmBufPtr ib, int x1, int y1, int x2, int y2)
 {
+    RADEONInfoPtr info = RADEONPTR(pScrn);
+
     BEGIN_BATCH(6);
     EREG(ib, PA_SC_SCREEN_SCISSOR_TL,              ((x1 << PA_SC_SCREEN_SCISSOR_TL__TL_X_shift) |
 						    (y1 << PA_SC_SCREEN_SCISSOR_TL__TL_Y_shift)));
@@ -560,6 +633,7 @@ set_screen_scissor(ScrnInfoPtr pScrn, drmBufPtr ib, int x1, int y1, int x2, int
 void
 set_vport_scissor(ScrnInfoPtr pScrn, drmBufPtr ib, int id, int x1, int y1, int x2, int y2)
 {
+    RADEONInfoPtr info = RADEONPTR(pScrn);
 
     BEGIN_BATCH(6);
     EREG(ib, PA_SC_VPORT_SCISSOR_0_TL +
@@ -575,6 +649,7 @@ set_vport_scissor(ScrnInfoPtr pScrn, drmBufPtr ib, int id, int x1, int y1, int x
 void
 set_generic_scissor(ScrnInfoPtr pScrn, drmBufPtr ib, int x1, int y1, int x2, int y2)
 {
+    RADEONInfoPtr info = RADEONPTR(pScrn);
 
     BEGIN_BATCH(6);
     EREG(ib, PA_SC_GENERIC_SCISSOR_TL,            ((x1 << PA_SC_GENERIC_SCISSOR_TL__TL_X_shift) |
@@ -588,6 +663,8 @@ set_generic_scissor(ScrnInfoPtr pScrn, drmBufPtr ib, int x1, int y1, int x2, int
 void
 set_window_scissor(ScrnInfoPtr pScrn, drmBufPtr ib, int x1, int y1, int x2, int y2)
 {
+    RADEONInfoPtr info = RADEONPTR(pScrn);
+
     BEGIN_BATCH(6);
     EREG(ib, PA_SC_WINDOW_SCISSOR_TL,             ((x1 << PA_SC_WINDOW_SCISSOR_TL__TL_X_shift) |
 						   (y1 << PA_SC_WINDOW_SCISSOR_TL__TL_Y_shift) |
@@ -600,6 +677,8 @@ set_window_scissor(ScrnInfoPtr pScrn, drmBufPtr ib, int x1, int y1, int x2, int
 void
 set_clip_rect(ScrnInfoPtr pScrn, drmBufPtr ib, int id, int x1, int y1, int x2, int y2)
 {
+    RADEONInfoPtr info = RADEONPTR(pScrn);
+
     BEGIN_BATCH(6);
     EREG(ib, PA_SC_CLIPRECT_0_TL +
 	 id * PA_SC_CLIPRECT_0_TL_offset,     ((x1 << PA_SC_CLIPRECT_0_TL__TL_X_shift) |
@@ -966,6 +1045,7 @@ set_default_state(ScrnInfoPtr pScrn, drmBufPtr ib)
     END_BATCH();
 
     // clear FS
+    fs_conf.bo = accel_state->shaders_bo;
     fs_setup(pScrn, ib, &fs_conf);
 
     // VGT
@@ -1006,6 +1086,7 @@ set_default_state(ScrnInfoPtr pScrn, drmBufPtr ib)
 void
 draw_immd(ScrnInfoPtr pScrn, drmBufPtr ib, draw_config_t *draw_conf, uint32_t *indices)
 {
+    RADEONInfoPtr info = RADEONPTR(pScrn);
     uint32_t i, count;
 
     // calculate num of packets
@@ -1043,6 +1124,8 @@ draw_immd(ScrnInfoPtr pScrn, drmBufPtr ib, draw_config_t *draw_conf, uint32_t *i
 void
 draw_auto(ScrnInfoPtr pScrn, drmBufPtr ib, draw_config_t *draw_conf)
 {
+    RADEONInfoPtr info = RADEONPTR(pScrn);
+
     BEGIN_BATCH(10);
     EREG(ib, VGT_PRIMITIVE_TYPE, draw_conf->prim_type);
     PACK3(ib, IT_INDEX_TYPE, 1);
@@ -1055,22 +1138,82 @@ draw_auto(ScrnInfoPtr pScrn, drmBufPtr ib, draw_config_t *draw_conf)
     END_BATCH();
 }
 
-void
+Bool
 r600_vb_get(ScrnInfoPtr pScrn)
 {
     RADEONInfoPtr info = RADEONPTR(pScrn);
     struct radeon_accel_state *accel_state = info->accel_state;
-
-    accel_state->vb_mc_addr = info->gartLocation + info->dri->bufStart +
-                              (accel_state->ib->idx * accel_state->ib->total) +
-                              (accel_state->ib->total / 2);
-    accel_state->vb_total = (accel_state->ib->total / 2);
-    accel_state->vb_ptr = (pointer)((char*)accel_state->ib->address +
-                                           (accel_state->ib->total / 2));
+#if defined(XF86DRM_MODE)
+    int ret;
+    if (info->cs) {
+	if (accel_state->vb_bo == NULL) {
+	    accel_state->vb_mc_addr = 0;
+	    accel_state->vb_bo = radeon_bo_open(info->bufmgr, 0, 16 * 1024,
+						4096, RADEON_GEM_DOMAIN_GTT, 0);
+	    if (accel_state->vb_bo == NULL)
+		return FALSE;
+	    ret = radeon_bo_map(accel_state->vb_bo, 1);
+	    if (ret) {
+		FatalError("failed to vb %d\n", ret);
+		return FALSE;
+	    }
+	    accel_state->vb_total = 16 * 1024;
+	    accel_state->vb_ptr = accel_state->vb_bo->ptr;
+	}
+    } else
+#endif
+    {
+	accel_state->vb_mc_addr = info->gartLocation + info->dri->bufStart +
+	    (accel_state->ib->idx*accel_state->ib->total)+
+	    (accel_state->ib->total / 2);
+	accel_state->vb_total = (accel_state->ib->total / 2);
+	accel_state->vb_ptr = (pointer)((char*)accel_state->ib->address +
+					(accel_state->ib->total / 2));
+    }
     accel_state->vb_index = 0;
+    return TRUE;
 }
 
 void
 r600_vb_discard(ScrnInfoPtr pScrn)
 {
 }
+
+int
+r600_cp_start(ScrnInfoPtr pScrn) /* returns 0 on success, -1 on failure */
+{
+    RADEONInfoPtr info = RADEONPTR(pScrn);
+    struct radeon_accel_state *accel_state = info->accel_state;
+
+#if defined(XF86DRM_MODE)
+    if (info->cs) {
+	if (!r600_vb_get(pScrn))
+	    return -1; /* not FALSE: FALSE is 0, which means success here */
+	radeon_cs_space_reset_bos(info->cs);
+	radeon_cs_space_add_persistent_bo(info->cs, accel_state->shaders_bo,
+					  RADEON_GEM_DOMAIN_VRAM, 0);
+	if (accel_state->src_bo[0])
+	    radeon_cs_space_add_persistent_bo(info->cs, accel_state->src_bo[0],
+					      RADEON_GEM_DOMAIN_VRAM, 0);
+	if (accel_state->src_bo[1])
+	    radeon_cs_space_add_persistent_bo(info->cs, accel_state->src_bo[1],
+					      RADEON_GEM_DOMAIN_VRAM, 0);
+	radeon_cs_space_add_persistent_bo(info->cs, accel_state->dst_bo,
+					  RADEON_GEM_DOMAIN_VRAM, 0);
+	radeon_cs_space_add_persistent_bo(info->cs, accel_state->vb_bo,
+					  RADEON_GEM_DOMAIN_GTT, 0);
+	if (accel_state->copy_area_bo)
+	    radeon_cs_space_add_persistent_bo(info->cs,
+					      accel_state->copy_area_bo,
+					      RADEON_GEM_DOMAIN_VRAM, 0);
+	radeon_cs_space_check(info->cs);
+    } else
+#endif
+    {
+	accel_state->ib = RADEONCPGetBuffer(pScrn); /* legacy path: grab an indirect buffer first */
+	if (!r600_vb_get(pScrn)) {
+	    return -1;
+	}
+    }
+    return 0;
+}
diff --git a/src/radeon.h b/src/radeon.h
index 3a3631e..7fdd8f5 100644
--- a/src/radeon.h
+++ b/src/radeon.h
@@ -696,9 +696,11 @@ struct radeon_accel_state {
     int               vb_total;
     void              *vb_ptr;
     uint32_t          vb_size;
+    struct radeon_bo  *vb_bo;
 
     // shader storage
     ExaOffscreenArea  *shaders;
+    struct radeon_bo  *shaders_bo;
     uint32_t          solid_vs_offset;
     uint32_t          solid_ps_offset;
     uint32_t          copy_vs_offset;
@@ -710,12 +712,14 @@ struct radeon_accel_state {
     uint32_t          xv_ps_offset;
 
     //size/addr stuff
+    struct radeon_bo  *src_bo[2];
     uint32_t          src_size[2];
     uint64_t          src_mc_addr[2];
     uint32_t          src_pitch[2];
     uint32_t          src_width[2];
     uint32_t          src_height[2];
     uint32_t          src_bpp[2];
+    struct radeon_bo  *dst_bo;
     uint32_t          dst_size;
     uint64_t          dst_mc_addr;
     uint32_t          dst_pitch;
@@ -731,6 +735,7 @@ struct radeon_accel_state {
 
     // copy
     ExaOffscreenArea  *copy_area;
+    struct radeon_bo  *copy_area_bo;
     Bool              same_surface;
     int               rop;
     uint32_t          planemask;
diff --git a/src/radeon_dri2.c b/src/radeon_dri2.c
index b52f965..efc6bde 100644
--- a/src/radeon_dri2.c
+++ b/src/radeon_dri2.c
@@ -333,7 +333,9 @@ radeon_dri2_screen_init(ScreenPtr pScreen)
         return FALSE;
     }
 
-    if ( (info->ChipFamily >= CHIP_FAMILY_R300) ) {
+    if ( (info->ChipFamily >= CHIP_FAMILY_R600) ) {
+        dri2_info.driverName = R600_DRIVER_NAME;
+    } else if ( (info->ChipFamily >= CHIP_FAMILY_R300) ) {
         dri2_info.driverName = R300_DRIVER_NAME;
     } else if ( info->ChipFamily >= CHIP_FAMILY_R200 ) {
         dri2_info.driverName = R200_DRIVER_NAME;
diff --git a/src/radeon_exa.c b/src/radeon_exa.c
index 3f3c9ba..56e87a9 100644
--- a/src/radeon_exa.c
+++ b/src/radeon_exa.c
@@ -336,7 +336,7 @@ static void RADEONFinishAccess_BE(PixmapPtr pPix, int index)
 #endif /* X_BYTE_ORDER == X_BIG_ENDIAN */
 
 #ifdef XF86DRM_MODE
-static Bool RADEONPrepareAccess_CS(PixmapPtr pPix, int index)
+Bool RADEONPrepareAccess_CS(PixmapPtr pPix, int index)
 {
     ScrnInfoPtr pScrn = xf86Screens[pPix->drawable.pScreen->myNum];
     struct radeon_exa_pixmap_priv *driver_priv;
@@ -364,7 +364,7 @@ static Bool RADEONPrepareAccess_CS(PixmapPtr pPix, int index)
     return TRUE;
 }
 
-static void RADEONFinishAccess_CS(PixmapPtr pPix, int index)
+void RADEONFinishAccess_CS(PixmapPtr pPix, int index)
 {
     struct radeon_exa_pixmap_priv *driver_priv;
 
@@ -456,7 +456,7 @@ void *RADEONEXACreatePixmap2(ScreenPtr pScreen, int width, int height,
     return new_priv;
 }
 
-static void RADEONEXADestroyPixmap(ScreenPtr pScreen, void *driverPriv)
+void RADEONEXADestroyPixmap(ScreenPtr pScreen, void *driverPriv)
 {
     struct radeon_exa_pixmap_priv *driver_priv = driverPriv;
 
@@ -489,7 +489,7 @@ void radeon_set_pixmap_bo(PixmapPtr pPix, struct radeon_bo *bo)
     }
 }
 
-static Bool RADEONEXAPixmapIsOffscreen(PixmapPtr pPix)
+Bool RADEONEXAPixmapIsOffscreen(PixmapPtr pPix)
 {
     struct radeon_exa_pixmap_priv *driver_priv;
 
commit 9cf965bbc977f0523437c0ecf1d7363b17de2468
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Tue Aug 25 12:47:34 2009 -0400

    R6xx/r7xx: add begin/end batch macros

diff --git a/src/r600_exa.c b/src/r600_exa.c
index 0a9a0c6..555748b 100644
--- a/src/r600_exa.c
+++ b/src/r600_exa.c
@@ -207,8 +207,10 @@ R600PrepareSolid(PixmapPtr pPix, int alu, Pixel pm, Pixel fg)
 	pmask |= 1; /* R */
     if (pm & 0xff000000)
 	pmask |= 8; /* A */
+    BEGIN_BATCH(6);
     EREG(accel_state->ib, CB_SHADER_MASK,                      (pmask << OUTPUT0_ENABLE_shift));
     EREG(accel_state->ib, CB_COLOR_CONTROL,                    RADEON_ROP[alu]);
+    END_BATCH();
 
     cb_conf.id = 0;
     cb_conf.w = accel_state->dst_pitch;
@@ -231,6 +233,7 @@ R600PrepareSolid(PixmapPtr pPix, int alu, Pixel pm, Pixel fg)
 
     /* Interpolator setup */
     /* one unused export from VS (VS_EXPORT_COUNT is zero based, count minus one) */
+    BEGIN_BATCH(18);
     EREG(accel_state->ib, SPI_VS_OUT_CONFIG, (0 << VS_EXPORT_COUNT_shift));
     EREG(accel_state->ib, SPI_VS_OUT_ID_0, (0 << SEMANTIC_0_shift));
 
@@ -245,6 +248,7 @@ R600PrepareSolid(PixmapPtr pPix, int alu, Pixel pm, Pixel fg)
 								  FLAT_SHADE_bit		|
 								  SEL_CENTROID_bit));
     EREG(accel_state->ib, SPI_INTERP_CONTROL_0,                FLAT_SHADE_ENA_bit);
+    END_BATCH();
 
     /* PS alu constants */
     if (pPix->drawable.bitsPerPixel == 16) {
@@ -497,8 +501,10 @@ R600DoPrepareCopy(ScrnInfoPtr pScrn,
 	pmask |= 1; /* R */
     if (planemask & 0xff000000)
 	pmask |= 8; /* A */
+    BEGIN_BATCH(6);
     EREG(accel_state->ib, CB_SHADER_MASK,                      (pmask << OUTPUT0_ENABLE_shift));
     EREG(accel_state->ib, CB_COLOR_CONTROL,                    RADEON_ROP[rop]);
+    END_BATCH();
 
     accel_state->dst_size = dst_pitch * dst_height * (dst_bpp/8);
     accel_state->dst_mc_addr = dst_offset;
@@ -526,6 +532,7 @@ R600DoPrepareCopy(ScrnInfoPtr pScrn,
 
     /* Interpolator setup */
     /* export tex coord from VS */
+    BEGIN_BATCH(18);
     EREG(accel_state->ib, SPI_VS_OUT_CONFIG, ((1 - 1) << VS_EXPORT_COUNT_shift));
     EREG(accel_state->ib, SPI_VS_OUT_ID_0, (0 << SEMANTIC_0_shift));
 
@@ -539,6 +546,7 @@ R600DoPrepareCopy(ScrnInfoPtr pScrn,
 								(0x01 << DEFAULT_VAL_shift)	|
 								SEL_CENTROID_bit));
     EREG(accel_state->ib, SPI_INTERP_CONTROL_0,                0);
+    END_BATCH();
 }
 
 static void
@@ -1499,6 +1507,7 @@ static Bool R600PrepareComposite(int op, PicturePtr pSrcPicture,
     ps_conf.export_mode         = 2;
     ps_setup                    (pScrn, accel_state->ib, &ps_conf);
 
+    BEGIN_BATCH(12);
     EREG(accel_state->ib, CB_SHADER_MASK,                      (0xf << OUTPUT0_ENABLE_shift));
 
     blendcntl = R600GetBlendCntl(op, pMaskPicture, pDstPicture->format);
@@ -1513,6 +1522,7 @@ static Bool R600PrepareComposite(int op, PicturePtr pSrcPicture,
 								    PER_MRT_BLEND_bit));
 	EREG(accel_state->ib, CB_BLEND0_CONTROL,                   blendcntl);
     }
+    END_BATCH();
 
     cb_conf.id = 0;
     cb_conf.w = accel_state->dst_pitch;
@@ -1540,6 +1550,7 @@ static Bool R600PrepareComposite(int op, PicturePtr pSrcPicture,
     set_render_target(pScrn, accel_state->ib, &cb_conf);
 
     /* Interpolator setup */
+    BEGIN_BATCH(21);
     if (pMask) {
 	/* export 2 tex coords from VS */
 	EREG(accel_state->ib, SPI_VS_OUT_CONFIG, ((2 - 1) << VS_EXPORT_COUNT_shift));
@@ -1566,6 +1577,7 @@ static Bool R600PrepareComposite(int op, PicturePtr pSrcPicture,
 								(0x01 << DEFAULT_VAL_shift)	|
 								SEL_CENTROID_bit));
     EREG(accel_state->ib, SPI_INTERP_CONTROL_0,                0);
+    END_BATCH();
 
     return TRUE;
 }
diff --git a/src/r600_state.h b/src/r600_state.h
index 8f20e42..10b1022 100644
--- a/src/r600_state.h
+++ b/src/r600_state.h
@@ -170,6 +170,9 @@ typedef struct {
     uint32_t num_indices;
 } draw_config_t;
 
+#define BEGIN_BATCH(n) do {} while(0)
+#define END_BATCH() do {} while(0)
+
 #define E32(ib, dword)                                                  \
 do {                                                                    \
     uint32_t *ib_head = (pointer)(char*)(ib)->address;			\
diff --git a/src/r600_textured_videofuncs.c b/src/r600_textured_videofuncs.c
index 4502ab3..6739616 100644
--- a/src/r600_textured_videofuncs.c
+++ b/src/r600_textured_videofuncs.c
@@ -468,8 +468,10 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
     }
 
     /* Render setup */
+    BEGIN_BATCH(6);
     EREG(accel_state->ib, CB_SHADER_MASK,                      (0x0f << OUTPUT0_ENABLE_shift));
     EREG(accel_state->ib, CB_COLOR_CONTROL,                    (0xcc << ROP3_shift)); /* copy */
+    END_BATCH();
 
     cb_conf.id = 0;
 
@@ -503,6 +505,7 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 
     /* Interpolator setup */
     /* export tex coords from VS */
+    BEGIN_BATCH(18);
     EREG(accel_state->ib, SPI_VS_OUT_CONFIG, ((1 - 1) << VS_EXPORT_COUNT_shift));
     EREG(accel_state->ib, SPI_VS_OUT_ID_0, (0 << SEMANTIC_0_shift));
 
@@ -514,7 +517,7 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 								(0x03 << DEFAULT_VAL_shift)	|
 								SEL_CENTROID_bit));
     EREG(accel_state->ib, SPI_INTERP_CONTROL_0,                0);
-
+    END_BATCH();
 
     vs_alu_consts[0] = 1.0 / pPriv->w;
     vs_alu_consts[1] = 1.0 / pPriv->h;
diff --git a/src/r6xx_accel.c b/src/r6xx_accel.c
index 55188a4..059c3cc 100644
--- a/src/r6xx_accel.c
+++ b/src/r6xx_accel.c
@@ -53,7 +53,9 @@ void R600CPFlushIndirect(ScrnInfoPtr pScrn, drmBufPtr ib)
     //       buffer->idx);
 
     while (buffer->used & 0x3c){
+	BEGIN_BATCH(1); /* exactly one padding dword is emitted per iteration */
         E32(buffer, CP_PACKET2()); /* fill up to multiple of 16 dwords */
+	END_BATCH();
     }
 
     //ErrorF("buffer bytes: %d\n", buffer->used);
@@ -81,19 +83,21 @@ wait_3d_idle_clean(ScrnInfoPtr pScrn, drmBufPtr ib)
 {
 
     //flush caches, don't generate timestamp
+    BEGIN_BATCH(5);
     PACK3(ib, IT_EVENT_WRITE, 1);
     E32(ib, CACHE_FLUSH_AND_INV_EVENT);
     // wait for 3D idle clean
     EREG(ib, WAIT_UNTIL,                          (WAIT_3D_IDLE_bit |
 						   WAIT_3D_IDLECLEAN_bit));
+    END_BATCH();
 }
 
 void
 wait_3d_idle(ScrnInfoPtr pScrn, drmBufPtr ib)
 {
-
+    BEGIN_BATCH(3);
     EREG(ib, WAIT_UNTIL,                          WAIT_3D_IDLE_bit);
-
+    END_BATCH();
 }
 
 void
@@ -102,13 +106,16 @@ start_3d(ScrnInfoPtr pScrn, drmBufPtr ib)
     RADEONInfoPtr info = RADEONPTR(pScrn);
 
     if (info->ChipFamily < CHIP_FAMILY_RV770) {
+	BEGIN_BATCH(5);
 	PACK3(ib, IT_START_3D_CMDBUF, 1);
 	E32(ib, 0);
-    }
+    } else
+	BEGIN_BATCH(3);
 
     PACK3(ib, IT_CONTEXT_CONTROL, 2);
     E32(ib, 0x80000000);
     E32(ib, 0x80000000);
+    END_BATCH();
 
     wait_3d_idle_clean (pScrn, ib);
 }
@@ -158,6 +165,7 @@ sq_setup(ScrnInfoPtr pScrn, drmBufPtr ib, sq_config_t *sq_conf)
     sq_stack_resource_mgmt_2 = ((sq_conf->num_gs_stack_entries << NUM_GS_STACK_ENTRIES_shift) |
 				(sq_conf->num_es_stack_entries << NUM_ES_STACK_ENTRIES_shift));
 
+    BEGIN_BATCH(8);
     PACK0(ib, SQ_CONFIG, 6);
     E32(ib, sq_config);
     E32(ib, sq_gpr_resource_mgmt_1);
@@ -165,7 +173,7 @@ sq_setup(ScrnInfoPtr pScrn, drmBufPtr ib, sq_config_t *sq_conf)
     E32(ib, sq_thread_resource_mgmt);
     E32(ib, sq_stack_resource_mgmt_1);
     E32(ib, sq_stack_resource_mgmt_2);
-
+    END_BATCH();
 }
 
 void
@@ -204,6 +212,11 @@ set_render_target(ScrnInfoPtr pScrn, drmBufPtr ib, cb_config_t *cb_conf)
     h = (cb_conf->h + 7) & ~7;
     slice = ((cb_conf->w * h) / 64) - 1;
 
+    if ((info->ChipFamily > CHIP_FAMILY_R600) &&
+	(info->ChipFamily < CHIP_FAMILY_RV770))
+	BEGIN_BATCH(23);
+    else
+	BEGIN_BATCH(21);
     EREG(ib, (CB_COLOR0_BASE + (4 * cb_conf->id)), (cb_conf->base >> 8));
 
     // rv6xx workaround
@@ -223,6 +236,7 @@ set_render_target(ScrnInfoPtr pScrn, drmBufPtr ib, cb_config_t *cb_conf)
     EREG(ib, (CB_COLOR0_FRAG + (4 * cb_conf->id)), (0     >> 8));	// FMASK per-tile data base/256
     EREG(ib, (CB_COLOR0_MASK + (4 * cb_conf->id)), ((0    << CMASK_BLOCK_MAX_shift)	|
 						    (0    << FMASK_TILE_MAX_shift)));
+    END_BATCH();
 }
 
 void
@@ -234,11 +248,13 @@ cp_set_surface_sync(ScrnInfoPtr pScrn, drmBufPtr ib, uint32_t sync_type, uint32_
     else
 	cp_coher_size = ((size + 255) >> 8);
 
+    BEGIN_BATCH(5);
     PACK3(ib, IT_SURFACE_SYNC, 4);
     E32(ib, sync_type);
     E32(ib, cp_coher_size);
     E32(ib, (mc_addr >> 8));
     E32(ib, 10); /* poll interval */
+    END_BATCH();
 }
 
 /* inserts a wait for vline in the command stream */
@@ -278,6 +294,7 @@ void cp_wait_vline_sync(ScrnInfoPtr pScrn, drmBufPtr ib, PixmapPtr pPix,
 
     radeon_crtc = xf86_config->crtc[crtc]->driver_private;
 
+    BEGIN_BATCH(10);
     /* set the VLINE range */
     EREG(ib, AVIVO_D1MODE_VLINE_START_END + radeon_crtc->crtc_offset,
          (start << AVIVO_D1MODE_VLINE_START_SHIFT) |
@@ -291,6 +308,7 @@ void cp_wait_vline_sync(ScrnInfoPtr pScrn, drmBufPtr ib, PixmapPtr pPix,
     E32(ib, 0);                          // Ref value
     E32(ib, AVIVO_D1MODE_VLINE_STAT);    // Mask
     E32(ib, 10);                         // Wait interval
+    END_BATCH();
 }
 
 void
@@ -304,9 +322,11 @@ fs_setup(ScrnInfoPtr pScrn, drmBufPtr ib, shader_config_t *fs_conf)
     if (fs_conf->dx10_clamp)
 	sq_pgm_resources |= SQ_PGM_RESOURCES_FS__DX10_CLAMP_bit;
 
+    BEGIN_BATCH(9);
     EREG(ib, SQ_PGM_START_FS, fs_conf->shader_addr >> 8);
     EREG(ib, SQ_PGM_RESOURCES_FS, sq_pgm_resources);
     EREG(ib, SQ_PGM_CF_OFFSET_FS, 0);
+    END_BATCH();
 }
 
 void
@@ -324,9 +344,11 @@ vs_setup(ScrnInfoPtr pScrn, drmBufPtr ib, shader_config_t *vs_conf)
     if (vs_conf->uncached_first_inst)
 	sq_pgm_resources |= UNCACHED_FIRST_INST_bit;
 
+    BEGIN_BATCH(9);
     EREG(ib, SQ_PGM_START_VS, vs_conf->shader_addr >> 8);
     EREG(ib, SQ_PGM_RESOURCES_VS, sq_pgm_resources);
     EREG(ib, SQ_PGM_CF_OFFSET_VS, 0);
+    END_BATCH();
 }
 
 void
@@ -346,10 +368,12 @@ ps_setup(ScrnInfoPtr pScrn, drmBufPtr ib, shader_config_t *ps_conf)
     if (ps_conf->clamp_consts)
 	sq_pgm_resources |= CLAMP_CONSTS_bit;
 
+    BEGIN_BATCH(12);
     EREG(ib, SQ_PGM_START_PS, ps_conf->shader_addr >> 8);
     EREG(ib, SQ_PGM_RESOURCES_PS, sq_pgm_resources);
     EREG(ib, SQ_PGM_EXPORTS_PS, ps_conf->export_mode);
     EREG(ib, SQ_PGM_CF_OFFSET_PS, 0);
+    END_BATCH();
 }
 
 void
@@ -358,9 +382,11 @@ set_alu_consts(ScrnInfoPtr pScrn, drmBufPtr ib, int offset, int count, float *co
     int i;
     const int countreg = count * (SQ_ALU_CONSTANT_offset >> 2);
 
+    BEGIN_BATCH(2 + countreg); /* 2-dword PACK0 header + countreg payload dwords */
     PACK0(ib, SQ_ALU_CONSTANT + offset * SQ_ALU_CONSTANT_offset, countreg);
     for (i = 0; i < countreg; i++)
 	EFLOAT(ib, const_buf[i]);
+    END_BATCH();
 }
 
 void
@@ -369,7 +395,9 @@ set_bool_consts(ScrnInfoPtr pScrn, drmBufPtr ib, int offset, uint32_t val)
     /* bool register order is: ps, vs, gs; one register each
      * 1 bits per bool; 32 bools each for ps, vs, gs.
      */
+    BEGIN_BATCH(3);
     EREG(ib, SQ_BOOL_CONST + offset * SQ_BOOL_CONST_offset, val);
+    END_BATCH();
 }
 
 void
@@ -391,6 +419,7 @@ set_vtx_resource(ScrnInfoPtr pScrn, drmBufPtr ib, vtx_resource_t *res)
     if (res->srf_mode_all)
 	    sq_vtx_constant_word2 |= SQ_VTX_CONSTANT_WORD2_0__SRF_MODE_ALL_bit;
 
+    BEGIN_BATCH(9);
     PACK0(ib, SQ_VTX_RESOURCE + res->id * SQ_VTX_RESOURCE_offset, 7);
     E32(ib, res->vb_addr & 0xffffffff);				// 0: BASE_ADDRESS
     E32(ib, (res->vtx_num_entries << 2) - 1);			// 1: SIZE
@@ -399,6 +428,7 @@ set_vtx_resource(ScrnInfoPtr pScrn, drmBufPtr ib, vtx_resource_t *res)
     E32(ib, 0);							// 4: n/a
     E32(ib, 0);							// 5: n/a
     E32(ib, SQ_TEX_VTX_VALID_BUFFER << SQ_VTX_CONSTANT_WORD6_0__TYPE_shift);	// 6: TYPE
+    END_BATCH();
 }
 
 void
@@ -453,6 +483,7 @@ set_tex_resource(ScrnInfoPtr pScrn, drmBufPtr ib, tex_resource_t *tex_res)
     if (tex_res->interlaced)
 	sq_tex_resource_word6 |= INTERLACED_bit;
 
+    BEGIN_BATCH(9);
     PACK0(ib, SQ_TEX_RESOURCE + tex_res->id * SQ_TEX_RESOURCE_offset, 7);
     E32(ib, sq_tex_resource_word0);
     E32(ib, sq_tex_resource_word1);
@@ -461,6 +492,7 @@ set_tex_resource(ScrnInfoPtr pScrn, drmBufPtr ib, tex_resource_t *tex_res)
     E32(ib, sq_tex_resource_word4);
     E32(ib, sq_tex_resource_word5);
     E32(ib, sq_tex_resource_word6);
+    END_BATCH();
 }
 
 void
@@ -505,27 +537,31 @@ set_tex_sampler (ScrnInfoPtr pScrn, drmBufPtr ib, tex_sampler_t *s)
     if (s->type)
 	sq_tex_sampler_word2 |= SQ_TEX_SAMPLER_WORD2_0__TYPE_bit;
 
+    BEGIN_BATCH(5);
     PACK0(ib, SQ_TEX_SAMPLER_WORD + s->id * SQ_TEX_SAMPLER_WORD_offset, 3);
     E32(ib, sq_tex_sampler_word0);
     E32(ib, sq_tex_sampler_word1);
     E32(ib, sq_tex_sampler_word2);
+    END_BATCH();
 }
 
 //XXX deal with clip offsets in clip setup
 void
 set_screen_scissor(ScrnInfoPtr pScrn, drmBufPtr ib, int x1, int y1, int x2, int y2)
 {
-
+    BEGIN_BATCH(6);
     EREG(ib, PA_SC_SCREEN_SCISSOR_TL,              ((x1 << PA_SC_SCREEN_SCISSOR_TL__TL_X_shift) |
 						    (y1 << PA_SC_SCREEN_SCISSOR_TL__TL_Y_shift)));
     EREG(ib, PA_SC_SCREEN_SCISSOR_BR,              ((x2 << PA_SC_SCREEN_SCISSOR_BR__BR_X_shift) |
 						    (y2 << PA_SC_SCREEN_SCISSOR_BR__BR_Y_shift)));
+    END_BATCH();
 }
 
 void
 set_vport_scissor(ScrnInfoPtr pScrn, drmBufPtr ib, int id, int x1, int y1, int x2, int y2)
 {
 
+    BEGIN_BATCH(6);
     EREG(ib, PA_SC_VPORT_SCISSOR_0_TL +
 	 id * PA_SC_VPORT_SCISSOR_0_TL_offset, ((x1 << PA_SC_VPORT_SCISSOR_0_TL__TL_X_shift) |
 						(y1 << PA_SC_VPORT_SCISSOR_0_TL__TL_Y_shift) |
@@ -533,40 +569,45 @@ set_vport_scissor(ScrnInfoPtr pScrn, drmBufPtr ib, int id, int x1, int y1, int x
     EREG(ib, PA_SC_VPORT_SCISSOR_0_BR +
 	 id * PA_SC_VPORT_SCISSOR_0_BR_offset, ((x2 << PA_SC_VPORT_SCISSOR_0_BR__BR_X_shift) |
 						(y2 << PA_SC_VPORT_SCISSOR_0_BR__BR_Y_shift)));
+    END_BATCH();
 }
 
 void
 set_generic_scissor(ScrnInfoPtr pScrn, drmBufPtr ib, int x1, int y1, int x2, int y2)
 {
 
+    BEGIN_BATCH(6);
     EREG(ib, PA_SC_GENERIC_SCISSOR_TL,            ((x1 << PA_SC_GENERIC_SCISSOR_TL__TL_X_shift) |
 						   (y1 << PA_SC_GENERIC_SCISSOR_TL__TL_Y_shift) |
 						   WINDOW_OFFSET_DISABLE_bit));
     EREG(ib, PA_SC_GENERIC_SCISSOR_BR,            ((x2 << PA_SC_GENERIC_SCISSOR_BR__BR_X_shift) |
 						   (y2 << PA_SC_GENERIC_SCISSOR_TL__TL_Y_shift)));
+    END_BATCH();
 }
 
 void
 set_window_scissor(ScrnInfoPtr pScrn, drmBufPtr ib, int x1, int y1, int x2, int y2)
 {
-
+    BEGIN_BATCH(6);
     EREG(ib, PA_SC_WINDOW_SCISSOR_TL,             ((x1 << PA_SC_WINDOW_SCISSOR_TL__TL_X_shift) |
 						   (y1 << PA_SC_WINDOW_SCISSOR_TL__TL_Y_shift) |
 						   WINDOW_OFFSET_DISABLE_bit));
     EREG(ib, PA_SC_WINDOW_SCISSOR_BR,             ((x2 << PA_SC_WINDOW_SCISSOR_BR__BR_X_shift) |
 						   (y2 << PA_SC_WINDOW_SCISSOR_BR__BR_Y_shift)));
+    END_BATCH();
 }
 
 void
 set_clip_rect(ScrnInfoPtr pScrn, drmBufPtr ib, int id, int x1, int y1, int x2, int y2)
 {
-
+    BEGIN_BATCH(6);
     EREG(ib, PA_SC_CLIPRECT_0_TL +
 	 id * PA_SC_CLIPRECT_0_TL_offset,     ((x1 << PA_SC_CLIPRECT_0_TL__TL_X_shift) |
 					       (y1 << PA_SC_CLIPRECT_0_TL__TL_Y_shift)));
     EREG(ib, PA_SC_CLIPRECT_0_BR +
 	 id * PA_SC_CLIPRECT_0_BR_offset,     ((x2 << PA_SC_CLIPRECT_0_BR__BR_X_shift) |
 					       (y2 << PA_SC_CLIPRECT_0_BR__BR_Y_shift)));
+    END_BATCH();
 }
 
 /*
@@ -594,6 +635,7 @@ set_default_state(ScrnInfoPtr pScrn, drmBufPtr ib)
     wait_3d_idle(pScrn, ib);
 
     // ASIC specific setup, see drm
+    BEGIN_BATCH(15);
     if (info->ChipFamily < CHIP_FAMILY_RV770) {
 	EREG(ib, TA_CNTL_AUX,                     (( 3 << GRADIENT_CREDIT_shift)		|
 						   (28 << TD_FIFO_CREDIT_shift)));
@@ -619,6 +661,7 @@ set_default_state(ScrnInfoPtr pScrn, drmBufPtr ib)
 						    (4  << DEPTH_CACHELINE_FREE_shift)	|
 						    0));
     }
+    END_BATCH();
 
     // SQ
     sq_conf.ps_prio = 0;
@@ -744,6 +787,7 @@ set_default_state(ScrnInfoPtr pScrn, drmBufPtr ib)
 
     sq_setup(pScrn, ib, &sq_conf);
 
+    BEGIN_BATCH(59);
     EREG(ib, SQ_VTX_BASE_VTX_LOC,                 0);
     EREG(ib, SQ_VTX_START_INST_LOC,               0);
 
@@ -790,28 +834,29 @@ set_default_state(ScrnInfoPtr pScrn, drmBufPtr ib)
     E32(ib, 0x00000000);
     E32(ib, 0x00000000);
     E32(ib, 0x00000000);
+    END_BATCH();
 
     if (info->ChipFamily < CHIP_FAMILY_RV770) {
+	BEGIN_BATCH(11);
 	PACK0(ib, CB_FOG_RED, 3);
 	E32(ib, 0x00000000);
 	E32(ib, 0x00000000);
 	E32(ib, 0x00000000);
+	PACK0(ib, CB_CLEAR_RED, 4);
+	EFLOAT(ib, 1.0);						/* WTF? */
+	EFLOAT(ib, 0.0);
+	EFLOAT(ib, 1.0);
+	EFLOAT(ib, 1.0);
+	END_BATCH();
     }
 
+    BEGIN_BATCH(18);
     PACK0(ib, CB_CLRCMP_CONTROL, 4);
     E32(ib, 1 << CLRCMP_FCN_SEL_shift);				// CB_CLRCMP_CONTROL: use CLRCMP_FCN_SRC
     E32(ib, 0);							// CB_CLRCMP_SRC
     E32(ib, 0);							// CB_CLRCMP_DST
     E32(ib, 0);							// CB_CLRCMP_MSK
 
-
-    if (info->ChipFamily < CHIP_FAMILY_RV770) {
-	PACK0(ib, CB_CLEAR_RED, 4);
-	EFLOAT(ib, 1.0);						/* WTF? */
-	EFLOAT(ib, 0.0);
-	EFLOAT(ib, 1.0);
-	EFLOAT(ib, 1.0);
-    }
     EREG(ib, CB_TARGET_MASK,                      (0x0f << TARGET0_ENABLE_shift));
     EREG(ib, R7xx_CB_SHADER_CONTROL,              (RT0_ENABLE_bit));
 
@@ -821,23 +866,29 @@ set_default_state(ScrnInfoPtr pScrn, drmBufPtr ib)
 						   (0 << WINDOW_Y_OFFSET_shift)));
 
     EREG(ib, PA_SC_CLIPRECT_RULE,                 CLIP_RULE_mask);
+    END_BATCH();
 
     /* clip boolean is set to always visible -> doesn't matter */
     for (i = 0; i < PA_SC_CLIPRECT_0_TL_num; i++)
 	set_clip_rect (pScrn, ib, i, 0, 0, 8192, 8192);
 
+    BEGIN_BATCH(3);
     if (info->ChipFamily < CHIP_FAMILY_RV770)
 	EREG(ib, R7xx_PA_SC_EDGERULE,             0x00000000);
     else
 	EREG(ib, R7xx_PA_SC_EDGERULE,             0xAAAAAAAA);
+    END_BATCH();
 
     for (i = 0; i < PA_SC_VPORT_SCISSOR_0_TL_num; i++) {
 	set_vport_scissor (pScrn, ib, i, 0, 0, 8192, 8192);
+	BEGIN_BATCH(4);
 	PACK0(ib, PA_SC_VPORT_ZMIN_0 + i * PA_SC_VPORT_ZMIN_0_offset, 2);
 	EFLOAT(ib, 0.0);
 	EFLOAT(ib, 1.0);
+	END_BATCH();
     }
 
+    BEGIN_BATCH(15);
     if (info->ChipFamily < CHIP_FAMILY_RV770)
 	EREG(ib, PA_SC_MODE_CNTL,                 (WALK_ORDER_ENABLE_bit | FORCE_EOV_CNTDWN_ENABLE_bit));
     else
@@ -852,13 +903,17 @@ set_default_state(ScrnInfoPtr pScrn, drmBufPtr ib)
     EREG(ib, PA_SC_LINE_CNTL,                     0);
     EREG(ib, PA_SC_AA_CONFIG,                     0);
     EREG(ib, PA_SC_AA_MASK,                       0xFFFFFFFF);
+    END_BATCH();
 
     //XXX: double check this
     if (info->ChipFamily > CHIP_FAMILY_R600) {
+	BEGIN_BATCH(6);
 	EREG(ib, PA_SC_AA_SAMPLE_LOCS_MCTX,       0);
 	EREG(ib, PA_SC_AA_SAMPLE_LOCS_8S_WD1_M,   0);
+	END_BATCH();
     }
 
+    BEGIN_BATCH(83);
     EREG(ib, PA_SC_LINE_STIPPLE,                  0);
     EREG(ib, PA_SC_MPASS_PS_CNTL,                 0);
 
@@ -908,17 +963,18 @@ set_default_state(ScrnInfoPtr pScrn, drmBufPtr ib)
     EREG(ib, SPI_FOG_CNTL,                        0);
     EREG(ib, SPI_FOG_FUNC_SCALE,                  0);
     EREG(ib, SPI_FOG_FUNC_BIAS,                   0);
+    END_BATCH();
 
     // clear FS
     fs_setup(pScrn, ib, &fs_conf);
 
     // VGT
+    BEGIN_BATCH(75);
     EREG(ib, VGT_MAX_VTX_INDX,                    2048); /* XXX set to a reasonably large number of indices */
     EREG(ib, VGT_MIN_VTX_INDX,                    0);
     EREG(ib, VGT_INDX_OFFSET,                     0);
     EREG(ib, VGT_INSTANCE_STEP_RATE_0,            0);
     EREG(ib, VGT_INSTANCE_STEP_RATE_1,            0);
-
     EREG(ib, VGT_MULTI_PRIM_IB_RESET_INDX,        0);
     EREG(ib, VGT_OUTPUT_PATH_CNTL,                0);
     EREG(ib, VGT_GS_MODE,                         0);
@@ -939,7 +995,7 @@ set_default_state(ScrnInfoPtr pScrn, drmBufPtr ib)
     EREG(ib, VGT_REUSE_OFF,                       0);
     EREG(ib, VGT_VTX_CNT_EN,                      0);
     EREG(ib, VGT_STRMOUT_BUFFER_EN,               0);
-
+    END_BATCH();
 }
 
 
@@ -952,12 +1008,6 @@ draw_immd(ScrnInfoPtr pScrn, drmBufPtr ib, draw_config_t *draw_conf, uint32_t *i
 {
     uint32_t i, count;
 
-    EREG(ib, VGT_PRIMITIVE_TYPE, draw_conf->prim_type);
-    PACK3(ib, IT_INDEX_TYPE, 1);
-    E32(ib, draw_conf->index_type);
-    PACK3(ib, IT_NUM_INSTANCES, 1);
-    E32(ib, draw_conf->num_instances);
-
     // calculate num of packets
     count = 2;
     if (draw_conf->index_type == DI_INDEX_SIZE_16_BIT)
@@ -965,6 +1015,13 @@ draw_immd(ScrnInfoPtr pScrn, drmBufPtr ib, draw_config_t *draw_conf, uint32_t *i
     else
 	count += draw_conf->num_indices;
 
+    BEGIN_BATCH(8 + count);
+    EREG(ib, VGT_PRIMITIVE_TYPE, draw_conf->prim_type);
+    PACK3(ib, IT_INDEX_TYPE, 1);
+    E32(ib, draw_conf->index_type);
+    PACK3(ib, IT_NUM_INSTANCES, 1);
+    E32(ib, draw_conf->num_instances);
+
     PACK3(ib, IT_DRAW_INDEX_IMMD, count);
     E32(ib, draw_conf->num_indices);
     E32(ib, draw_conf->vgt_draw_initiator);
@@ -980,12 +1037,13 @@ draw_immd(ScrnInfoPtr pScrn, drmBufPtr ib, draw_config_t *draw_conf, uint32_t *i
 	for (i = 0; i < draw_conf->num_indices; i++)
 	    E32(ib, indices[i]);
     }
+    END_BATCH();
 }
 
 void
 draw_auto(ScrnInfoPtr pScrn, drmBufPtr ib, draw_config_t *draw_conf)
 {
-
+    BEGIN_BATCH(10);
     EREG(ib, VGT_PRIMITIVE_TYPE, draw_conf->prim_type);
     PACK3(ib, IT_INDEX_TYPE, 1);
     E32(ib, draw_conf->index_type);
@@ -994,6 +1052,7 @@ draw_auto(ScrnInfoPtr pScrn, drmBufPtr ib, draw_config_t *draw_conf)
     PACK3(ib, IT_DRAW_INDEX_AUTO, 2);
     E32(ib, draw_conf->num_indices);
     E32(ib, draw_conf->vgt_draw_initiator);
+    END_BATCH();
 }
 
 void
commit b6368cc572c79bce9a9366242c727c13cab3f006
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Tue Aug 25 12:14:33 2009 -0400

    r6xx/r7xx: move more common state to default state setup

diff --git a/src/r600_exa.c b/src/r600_exa.c
index 3e77515..0a9a0c6 100644
--- a/src/r600_exa.c
+++ b/src/r600_exa.c
@@ -164,10 +164,6 @@ R600PrepareSolid(PixmapPtr pPix, int alu, Pixel pm, Pixel fg)
 
     set_default_state(pScrn, accel_state->ib);
 
-    /* Scissor / viewport */
-    EREG(accel_state->ib, PA_CL_VTE_CNTL,                      VTX_XY_FMT_bit);
-    EREG(accel_state->ib, PA_CL_CLIP_CNTL,                     CLIP_DISABLE_bit);
-
     set_generic_scissor(pScrn, accel_state->ib, 0, 0, pPix->drawable.width, pPix->drawable.height);
     set_screen_scissor(pScrn, accel_state->ib, 0, 0, pPix->drawable.width, pPix->drawable.height);
     set_window_scissor(pScrn, accel_state->ib, 0, 0, pPix->drawable.width, pPix->drawable.height);
@@ -212,7 +208,6 @@ R600PrepareSolid(PixmapPtr pPix, int alu, Pixel pm, Pixel fg)
     if (pm & 0xff000000)
 	pmask |= 8; /* A */
     EREG(accel_state->ib, CB_SHADER_MASK,                      (pmask << OUTPUT0_ENABLE_shift));
-    EREG(accel_state->ib, R7xx_CB_SHADER_CONTROL,              (RT0_ENABLE_bit));
     EREG(accel_state->ib, CB_COLOR_CONTROL,                    RADEON_ROP[alu]);
 
     cb_conf.id = 0;
@@ -234,12 +229,6 @@ R600PrepareSolid(PixmapPtr pPix, int alu, Pixel pm, Pixel fg)
     cb_conf.blend_clamp = 1;
     set_render_target(pScrn, accel_state->ib, &cb_conf);
 
-    EREG(accel_state->ib, PA_SU_SC_MODE_CNTL,                  (FACE_bit			|
-								(POLYMODE_PTYPE__TRIANGLES << POLYMODE_FRONT_PTYPE_shift)	|
-								(POLYMODE_PTYPE__TRIANGLES << POLYMODE_BACK_PTYPE_shift)));
-    EREG(accel_state->ib, DB_SHADER_CONTROL,                   ((1 << Z_ORDER_shift)		| /* EARLY_Z_THEN_LATE_Z */
-								DUAL_EXPORT_ENABLE_bit)); /* Only useful if no depth export */
-
     /* Interpolator setup */
     /* one unused export from VS (VS_EXPORT_COUNT is zero based, count minus one) */
     EREG(accel_state->ib, SPI_VS_OUT_CONFIG, (0 << VS_EXPORT_COUNT_shift));
@@ -408,10 +397,6 @@ R600DoPrepareCopy(ScrnInfoPtr pScrn,
 
     set_default_state(pScrn, accel_state->ib);
 
-    /* Scissor / viewport */
-    EREG(accel_state->ib, PA_CL_VTE_CNTL,                      VTX_XY_FMT_bit);
-    EREG(accel_state->ib, PA_CL_CLIP_CNTL,                     CLIP_DISABLE_bit);
-
     set_generic_scissor(pScrn, accel_state->ib, 0, 0, dst_width, dst_height);
     set_screen_scissor(pScrn, accel_state->ib, 0, 0, dst_width, dst_height);
     set_window_scissor(pScrn, accel_state->ib, 0, 0, dst_width, dst_height);
@@ -513,7 +498,6 @@ R600DoPrepareCopy(ScrnInfoPtr pScrn,
     if (planemask & 0xff000000)
 	pmask |= 8; /* A */
     EREG(accel_state->ib, CB_SHADER_MASK,                      (pmask << OUTPUT0_ENABLE_shift));
-    EREG(accel_state->ib, R7xx_CB_SHADER_CONTROL,              (RT0_ENABLE_bit));
     EREG(accel_state->ib, CB_COLOR_CONTROL,                    RADEON_ROP[rop]);
 
     accel_state->dst_size = dst_pitch * dst_height * (dst_bpp/8);
@@ -540,12 +524,6 @@ R600DoPrepareCopy(ScrnInfoPtr pScrn,
     cb_conf.blend_clamp = 1;
     set_render_target(pScrn, accel_state->ib, &cb_conf);
 
-    EREG(accel_state->ib, PA_SU_SC_MODE_CNTL,                  (FACE_bit			|
-								(POLYMODE_PTYPE__TRIANGLES << POLYMODE_FRONT_PTYPE_shift)	|
-								(POLYMODE_PTYPE__TRIANGLES << POLYMODE_BACK_PTYPE_shift)));
-    EREG(accel_state->ib, DB_SHADER_CONTROL,                   ((1 << Z_ORDER_shift)		| /* EARLY_Z_THEN_LATE_Z */
-								DUAL_EXPORT_ENABLE_bit)); /* Only useful if no depth export */
-
     /* Interpolator setup */
     /* export tex coord from VS */
     EREG(accel_state->ib, SPI_VS_OUT_CONFIG, ((1 - 1) << VS_EXPORT_COUNT_shift));
@@ -1463,10 +1441,6 @@ static Bool R600PrepareComposite(int op, PicturePtr pSrcPicture,
 
     set_default_state(pScrn, accel_state->ib);
 
-    /* Scissor / viewport */
-    EREG(accel_state->ib, PA_CL_VTE_CNTL,                      VTX_XY_FMT_bit);
-    EREG(accel_state->ib, PA_CL_CLIP_CNTL,                     CLIP_DISABLE_bit);
-
     set_generic_scissor(pScrn, accel_state->ib, 0, 0, pDst->drawable.width, pDst->drawable.height);
     set_screen_scissor(pScrn, accel_state->ib, 0, 0, pDst->drawable.width, pDst->drawable.height);
     set_window_scissor(pScrn, accel_state->ib, 0, 0, pDst->drawable.width, pDst->drawable.height);
@@ -1526,7 +1500,6 @@ static Bool R600PrepareComposite(int op, PicturePtr pSrcPicture,
     ps_setup                    (pScrn, accel_state->ib, &ps_conf);
 
     EREG(accel_state->ib, CB_SHADER_MASK,                      (0xf << OUTPUT0_ENABLE_shift));
-    EREG(accel_state->ib, R7xx_CB_SHADER_CONTROL,              (RT0_ENABLE_bit));
 
     blendcntl = R600GetBlendCntl(op, pMaskPicture, pDstPicture->format);
 
@@ -1566,12 +1539,6 @@ static Bool R600PrepareComposite(int op, PicturePtr pSrcPicture,
     cb_conf.blend_clamp = 1;
     set_render_target(pScrn, accel_state->ib, &cb_conf);
 
-    EREG(accel_state->ib, PA_SU_SC_MODE_CNTL,                  (FACE_bit			|
-								(POLYMODE_PTYPE__TRIANGLES << POLYMODE_FRONT_PTYPE_shift)	|
-								(POLYMODE_PTYPE__TRIANGLES << POLYMODE_BACK_PTYPE_shift)));
-    EREG(accel_state->ib, DB_SHADER_CONTROL,                   ((1 << Z_ORDER_shift)		| /* EARLY_Z_THEN_LATE_Z */
-								DUAL_EXPORT_ENABLE_bit)); /* Only useful if no depth export */
-
     /* Interpolator setup */
     if (pMask) {
 	/* export 2 tex coords from VS */
diff --git a/src/r600_textured_videofuncs.c b/src/r600_textured_videofuncs.c
index 5dc79c9..4502ab3 100644
--- a/src/r600_textured_videofuncs.c
+++ b/src/r600_textured_videofuncs.c
@@ -241,10 +241,6 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 
     set_default_state(pScrn, accel_state->ib);
 
-    /* Scissor / viewport */
-    EREG(accel_state->ib, PA_CL_VTE_CNTL,                      VTX_XY_FMT_bit);
-    EREG(accel_state->ib, PA_CL_CLIP_CNTL,                     CLIP_DISABLE_bit);
-
     set_generic_scissor(pScrn, accel_state->ib, 0, 0, pPixmap->drawable.width, pPixmap->drawable.height);
     set_screen_scissor(pScrn, accel_state->ib, 0, 0, pPixmap->drawable.width, pPixmap->drawable.height);
     set_window_scissor(pScrn, accel_state->ib, 0, 0, pPixmap->drawable.width, pPixmap->drawable.height);
@@ -473,7 +469,6 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 
     /* Render setup */
     EREG(accel_state->ib, CB_SHADER_MASK,                      (0x0f << OUTPUT0_ENABLE_shift));
-    EREG(accel_state->ib, R7xx_CB_SHADER_CONTROL,              (RT0_ENABLE_bit));
     EREG(accel_state->ib, CB_COLOR_CONTROL,                    (0xcc << ROP3_shift)); /* copy */
 
     cb_conf.id = 0;
@@ -506,12 +501,6 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
     cb_conf.blend_clamp = 1;
     set_render_target(pScrn, accel_state->ib, &cb_conf);
 
-    EREG(accel_state->ib, PA_SU_SC_MODE_CNTL,                  (FACE_bit			|
-								(POLYMODE_PTYPE__TRIANGLES << POLYMODE_FRONT_PTYPE_shift)	|
-								(POLYMODE_PTYPE__TRIANGLES << POLYMODE_BACK_PTYPE_shift)));
-    EREG(accel_state->ib, DB_SHADER_CONTROL,                   ((1 << Z_ORDER_shift)		| /* EARLY_Z_THEN_LATE_Z */
-								DUAL_EXPORT_ENABLE_bit)); /* Only useful if no depth export */
-
     /* Interpolator setup */
     /* export tex coords from VS */
     EREG(accel_state->ib, SPI_VS_OUT_CONFIG, ((1 - 1) << VS_EXPORT_COUNT_shift));
diff --git a/src/r6xx_accel.c b/src/r6xx_accel.c
index 0457f7d..55188a4 100644
--- a/src/r6xx_accel.c
+++ b/src/r6xx_accel.c
@@ -775,6 +775,11 @@ set_default_state(ScrnInfoPtr pScrn, drmBufPtr ib)
 						   (2 << ALPHA_TO_MASK_OFFSET2_shift)	|
 						   (2 << ALPHA_TO_MASK_OFFSET3_shift)));
 
+
+    EREG(ib, DB_SHADER_CONTROL, ((1 << Z_ORDER_shift) | /* EARLY_Z_THEN_LATE_Z */
+				 DUAL_EXPORT_ENABLE_bit)); /* Only useful if no depth export */
+
+
     // SX
     EREG(ib, SX_ALPHA_TEST_CONTROL,               0);
     EREG(ib, SX_ALPHA_REF,                        0);
@@ -808,6 +813,8 @@ set_default_state(ScrnInfoPtr pScrn, drmBufPtr ib)
 	EFLOAT(ib, 1.0);
     }
     EREG(ib, CB_TARGET_MASK,                      (0x0f << TARGET0_ENABLE_shift));
+    EREG(ib, R7xx_CB_SHADER_CONTROL,              (RT0_ENABLE_bit));
+
 
     // SC
     EREG(ib, PA_SC_WINDOW_OFFSET,                 ((0 << WINDOW_X_OFFSET_shift) |
@@ -837,6 +844,11 @@ set_default_state(ScrnInfoPtr pScrn, drmBufPtr ib)
 	EREG(ib, PA_SC_MODE_CNTL,                 (FORCE_EOV_CNTDWN_ENABLE_bit | FORCE_EOV_REZ_ENABLE_bit |
 						   0x00500000)); /* ? */
 
+    EREG(ib, PA_SU_SC_MODE_CNTL, (FACE_bit |
+				  (POLYMODE_PTYPE__TRIANGLES << POLYMODE_FRONT_PTYPE_shift) |
+				  (POLYMODE_PTYPE__TRIANGLES << POLYMODE_BACK_PTYPE_shift)));
+
+
     EREG(ib, PA_SC_LINE_CNTL,                     0);
     EREG(ib, PA_SC_AA_CONFIG,                     0);
     EREG(ib, PA_SC_AA_MASK,                       0xFFFFFFFF);
@@ -867,6 +879,10 @@ set_default_state(ScrnInfoPtr pScrn, drmBufPtr ib)
     EFLOAT(ib, 1.0);						// PA_CL_GB_HORZ_CLIP_ADJ
     EFLOAT(ib, 1.0);						// PA_CL_GB_HORZ_DISC_ADJ
 
+    /* Scissor / viewport */
+    EREG(ib, PA_CL_VTE_CNTL,                      VTX_XY_FMT_bit);
+    EREG(ib, PA_CL_CLIP_CNTL,                     CLIP_DISABLE_bit);
+
     // SU
     EREG(ib, PA_SU_SC_MODE_CNTL,                  FACE_bit);
     EREG(ib, PA_SU_POINT_SIZE,                    0);


More information about the xorg-commit mailing list