xf86-video-ati: Branch 'master' - 16 commits

Alex Deucher agd5f at kemper.freedesktop.org
Fri Apr 17 07:36:57 PDT 2009


 man/radeon.man                   |    4 
 src/r600_shader.c                |  440 ---
 src/r600_textured_videofuncs.c   |  106 
 src/radeon.h                     |    5 
 src/radeon_accelfuncs.c          |    5 
 src/radeon_commonfuncs.c         |    5 
 src/radeon_exa_funcs.c           |    5 
 src/radeon_textured_video.c      |  208 +
 src/radeon_textured_videofuncs.c | 5547 +++++++++++++++++++++++++--------------
 src/radeon_video.c               |   19 
 src/radeon_video.h               |   20 
 11 files changed, 4014 insertions(+), 2350 deletions(-)

New commits:
commit db177c70ce88af19c8e05eb916a15f3e124876eb
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Fri Apr 17 01:05:15 2009 -0400

    Update Xv info in man page

diff --git a/man/radeon.man b/man/radeon.man
index 7ca65e9..9b0d292 100644
--- a/man/radeon.man
+++ b/man/radeon.man
@@ -594,7 +594,9 @@ XV_BICUBIC is used to control whether textured adapter should apply
 a bicubic filter to smooth the output. It has three values: 'off'(0), 'on'(1)
 and 'auto'(2). 'off' means never apply the filter, 'on' means always apply
 the filter and 'auto' means apply the filter only if the X and Y
-sizes are scaled to more than double, this to avoid blurred output.
+sizes are scaled to more than double to avoid blurred output.  Bicubic
+filtering is not currently compatible with other Xv attributes like hue,
+contrast, and brightness, and must be disabled to use those attributes.
 The default is
 .B 'auto'(2).
 
commit 00266177bed2dc8693df497ca3ec19f2dc4adc05
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Thu Apr 16 20:33:28 2009 -0400

    R3xx/R5xx: only apply Xv attributes if bicubic is disabled
    
    Provides consistent output

diff --git a/src/radeon_textured_video.c b/src/radeon_textured_video.c
index 2318a62..5c46712 100644
--- a/src/radeon_textured_video.c
+++ b/src/radeon_textured_video.c
@@ -356,8 +356,10 @@ RADEONPutImageTextured(ScrnInfoPtr pScrn,
 
     /* Bicubic filter setup */
     pPriv->bicubic_enabled = (pPriv->bicubic_state != BICUBIC_OFF);
-    if (!(IS_R300_3D || IS_R500_3D))
+    if (!(IS_R300_3D || IS_R500_3D)) {
 	pPriv->bicubic_enabled = FALSE;
+	pPriv->bicubic_state = BICUBIC_OFF;
+    }
     if (pPriv->bicubic_enabled && (pPriv->bicubic_state == BICUBIC_AUTO)) {
 	/*
 	 * Applying the bicubic filter with a scale of less than 200%
@@ -372,7 +374,7 @@ RADEONPutImageTextured(ScrnInfoPtr pScrn,
     case FOURCC_I420:
 	srcPitch = (width + 3) & ~3;
 	srcPitch2 = ((width >> 1) + 3) & ~3;
-        if (pPriv->bicubic_enabled) {
+        if (pPriv->bicubic_state != BICUBIC_OFF) {
 	    dstPitch = ((dst_width << 1) + 15) & ~15;
 	    dstPitch = (dstPitch + 63) & ~63;
 	} else {
@@ -502,7 +504,7 @@ RADEONPutImageTextured(ScrnInfoPtr pScrn,
 				     srcPitch, srcPitch2, pPriv->src_pitch,
 				     width, height);
 	    }
-	} else if (pPriv->bicubic_enabled) {
+	} else if (pPriv->bicubic_state != BICUBIC_OFF) {
 	    top &= ~1;
 	    nlines = ((((y2 + 0xffff) >> 16) + 1) & ~1) - top;
 	    s2offset = srcPitch * height;
diff --git a/src/radeon_textured_videofuncs.c b/src/radeon_textured_videofuncs.c
index 0ac247a..caf8dce 100644
--- a/src/radeon_textured_videofuncs.c
+++ b/src/radeon_textured_videofuncs.c
@@ -1051,9 +1051,9 @@ FUNC_NAME(R300DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 	colorpitch |= R300_COLORTILE;
 
 
-    if (!pPriv->bicubic_enabled && (pPriv->id == FOURCC_I420 || pPriv->id == FOURCC_YV12)) {
+    if (((pPriv->bicubic_state == BICUBIC_OFF)) &&
+	(pPriv->id == FOURCC_I420 || pPriv->id == FOURCC_YV12))
 	isplanar = TRUE;
-    }
 
     if (isplanar) {
 	txformat1 = R300_TX_FORMAT_X8 | R300_TX_FORMAT_CACHE_HALF_REGION_0;
@@ -1064,7 +1064,7 @@ FUNC_NAME(R300DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 	else
 	    txformat1 = R300_TX_FORMAT_VYUY422;
 
-	if (pPriv->bicubic_enabled)
+	if (pPriv->bicubic_state != BICUBIC_OFF)
 	    txformat1 |= R300_TX_FORMAT_YUV_TO_RGB_CLAMP;
 
 	/* pitch is in pixels */
@@ -1250,661 +1250,422 @@ FUNC_NAME(R300DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
     FINISH_ACCEL();
 
     /* setup pixel shader */
-    if (pPriv->bicubic_enabled) {
-	BEGIN_ACCEL(79);
-
-	/* 4 components: 2 for tex0 and 2 for tex1 */
-	OUT_ACCEL_REG(R300_RS_COUNT, ((4 << R300_RS_COUNT_IT_COUNT_SHIFT) |
-				      R300_RS_COUNT_HIRES_EN));
-
-	/* R300_INST_COUNT_RS - highest RS instruction used */
-	OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(1) | R300_TX_OFFSET_RS(6));
-
-	/* Pixel stack frame size. */
-	OUT_ACCEL_REG(R300_US_PIXSIZE, 5);
-
-	/* Indirection levels */
-	OUT_ACCEL_REG(R300_US_CONFIG, ((2 << R300_NLEVEL_SHIFT) |
-				       R300_FIRST_TEX));
-
-	/* Set nodes. */
-	OUT_ACCEL_REG(R300_US_CODE_OFFSET, (R300_ALU_CODE_OFFSET(0) |
-					    R300_ALU_CODE_SIZE(14) |
-					    R300_TEX_CODE_OFFSET(0) |
-					    R300_TEX_CODE_SIZE(6)));
-
-	/* Nodes are allocated highest first, but executed lowest first */
-	OUT_ACCEL_REG(R300_US_CODE_ADDR_0, 0);
-	OUT_ACCEL_REG(R300_US_CODE_ADDR_1, (R300_ALU_START(0) |
-					    R300_ALU_SIZE(0) |
-					    R300_TEX_START(0) |
-					    R300_TEX_SIZE(0)));
-	OUT_ACCEL_REG(R300_US_CODE_ADDR_2, (R300_ALU_START(1) |
-					    R300_ALU_SIZE(9) |
-					    R300_TEX_START(1) |
-					    R300_TEX_SIZE(0)));
-	OUT_ACCEL_REG(R300_US_CODE_ADDR_3, (R300_ALU_START(11) |
-					    R300_ALU_SIZE(2) |
-					    R300_TEX_START(2) |
-					    R300_TEX_SIZE(3) |
-					    R300_RGBA_OUT));
-
-	/* ** BICUBIC FP ** */
-
-	/* texcoord0 => temp0
-	 * texcoord1 => temp1 */
-
-	// first node
-	/* TEX temp2, temp1.rrr0, tex1, 1D */
-	OUT_ACCEL_REG(R300_US_TEX_INST(0), (R300_TEX_INST(R300_TEX_INST_LD) |
-					    R300_TEX_ID(1) |
-					    R300_TEX_SRC_ADDR(1) |
-					    R300_TEX_DST_ADDR(2)));
-
-	/* MOV temp1.r, temp1.ggg0 */
-	OUT_ACCEL_REG(R300_US_ALU_RGB_INST(0), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
-						R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_GGG) |
-						R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
-						R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0)));
-	OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(0), (R300_ALU_RGB_ADDR0(1) |
-						R300_ALU_RGB_ADDRD(1) |
-						R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R)));
-	OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(0), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
-						  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
-						  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
-						  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
-	OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(0), (R300_ALU_ALPHA_ADDRD(1) |
-						  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
-
-
-	// second node
-	/* TEX temp1, temp1, tex1, 1D */
-	OUT_ACCEL_REG(R300_US_TEX_INST(1), (R300_TEX_INST(R300_TEX_INST_LD) |
-					    R300_TEX_ID(1) |
-					    R300_TEX_SRC_ADDR(1) |
-					    R300_TEX_DST_ADDR(1)));
-
-	/* MUL temp3.rg, temp2.ggg0, const0.rgb0 */
-	OUT_ACCEL_REG(R300_US_ALU_RGB_INST(1), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
-						R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_GGG) |
-						R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
-						R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0)));
-	OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(1), (R300_ALU_RGB_ADDR0(2) |
-						R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(0)) |
-						R300_ALU_RGB_ADDRD(3) |
-						R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
-	OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(1), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
-						  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
-						  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
-						  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
-	OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(1), (R300_ALU_ALPHA_ADDRD(3) |
-						  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
-
-
-	/* MUL temp2.rg, temp2.rrr0, const0.rgb */
-	OUT_ACCEL_REG(R300_US_ALU_RGB_INST(2), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
-						R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RRR) |
-						R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
-						R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0)));
-	OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(2), (R300_ALU_RGB_ADDR0(2) |
-						R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(0)) |
-						R300_ALU_RGB_ADDRD(2) |
-						R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
-	OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(2), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
-						  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
-						  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
-						  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
-	OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(2), (R300_ALU_ALPHA_ADDRD(2) |
-						  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
-
-	/* MAD temp4.rg, temp1.ggg0, const1.rgb, temp3.rgb0 */
-	OUT_ACCEL_REG(R300_US_ALU_RGB_INST(3), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
-						R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_GGG) |
-						R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
-						R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
-	OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(3), (R300_ALU_RGB_ADDR0(1) |
-						R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(1)) |
-						R300_ALU_RGB_ADDR2(3) |
-						R300_ALU_RGB_ADDRD(4) |
-						R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
-	OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(3), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
-						  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
-						  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
-						  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
-	OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(3), (R300_ALU_ALPHA_ADDRD(4) |
-						  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
-
-	/* MAD temp5.rg, temp1.ggg0, const1.rgb, temp2.rgb0 */
-	OUT_ACCEL_REG(R300_US_ALU_RGB_INST(4), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
-						R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_GGG) |
-						R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
-						R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
-	OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(4), (R300_ALU_RGB_ADDR0(1) |
-						R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(1)) |
-						R300_ALU_RGB_ADDR2(2) |
-						R300_ALU_RGB_ADDRD(5) |
-						R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
-	OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(4), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
-						  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
-						  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
-						  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
-	OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(4), (R300_ALU_ALPHA_ADDRD(5) |
-						  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
-
-	/* MAD temp3.rg, temp1.rrr0, const1.rgb, temp3.rgb0 */
-	OUT_ACCEL_REG(R300_US_ALU_RGB_INST(5), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
-						R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RRR) |
-						R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
-						R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
-	OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(5), (R300_ALU_RGB_ADDR0(1) |
-						R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(1)) |
-						R300_ALU_RGB_ADDR2(3) |
-						R300_ALU_RGB_ADDRD(3) |
-						R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
-	OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(5), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
-						  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
-						  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
-						  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
-	OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(5), (R300_ALU_ALPHA_ADDRD(3) |
-						  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
-
-	/* MAD temp1.rg, temp1.rrr0, const1.rgb, temp2.rgb0 */
-	OUT_ACCEL_REG(R300_US_ALU_RGB_INST(6), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
-						R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RRR) |
-						R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
-						R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
-	OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(6), (R300_ALU_RGB_ADDR0(1) |
-						R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(1)) |
-						R300_ALU_RGB_ADDR2(2) |
-						R300_ALU_RGB_ADDRD(1) |
-						R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
-	OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(6), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
-						  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
-						  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
-						  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
-	OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(6), (R300_ALU_ALPHA_ADDRD(1) |
-						  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
-
-	/* ADD temp1.rg, temp0.rgb0, temp1.rgb0 */
-	OUT_ACCEL_REG(R300_US_ALU_RGB_INST(7), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
-						R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
-						R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
-						R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
-	OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(7), (R300_ALU_RGB_ADDR0(0) |
-						R300_ALU_RGB_ADDR2(1) |
-						R300_ALU_RGB_ADDRD(1) |
-						R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
-	OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(7), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
-						  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
-						  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
-						  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
-	OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(7), (R300_ALU_ALPHA_ADDRD(1) |
-						  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
-
-	/* ADD temp2.rg, temp0.rgb0, temp3.rgb0 */
-	OUT_ACCEL_REG(R300_US_ALU_RGB_INST(8), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
-						R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
-						R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
-						R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
-	OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(8), (R300_ALU_RGB_ADDR0(0) |
-						R300_ALU_RGB_ADDR2(3) |
-						R300_ALU_RGB_ADDRD(2) |
-						R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
-	OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(8), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
-						  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
-						  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
-						  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
-	OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(8), (R300_ALU_ALPHA_ADDRD(2) |
-						  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
-
-	/* ADD temp3.rg, temp0.rgb0, temp5.rgb0 */
-	OUT_ACCEL_REG(R300_US_ALU_RGB_INST(9), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
-						R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
-						R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
-						R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
-	OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(9), (R300_ALU_RGB_ADDR0(0) |
-						R300_ALU_RGB_ADDR2(5) |
-						R300_ALU_RGB_ADDRD(3) |
-						R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
-	OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(9), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
-						  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
-						  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
-						  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
-	OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(9), (R300_ALU_ALPHA_ADDRD(3) |
-						  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
-
-	/* ADD temp0.rg, temp0.rgb0, temp4.rgb0 */
-	OUT_ACCEL_REG(R300_US_ALU_RGB_INST(10), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
-						 R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
-						 R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
-						 R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
-	OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(10), (R300_ALU_RGB_ADDR0(0) |
-						 R300_ALU_RGB_ADDR2(4) |
-						 R300_ALU_RGB_ADDRD(0) |
-						 R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
-	OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(10), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
-						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
-						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
-						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
-	OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(10), (R300_ALU_ALPHA_ADDRD(0) |
-						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
-
-
-	// third node
-	/* TEX temp4, temp1.rg--, tex0, 1D */
-	OUT_ACCEL_REG(R300_US_TEX_INST(2), (R300_TEX_INST(R300_TEX_INST_LD) |
-					    R300_TEX_ID(0) |
-					    R300_TEX_SRC_ADDR(1) |
-					    R300_TEX_DST_ADDR(4)));
-
-	/* TEX temp3, temp3.rg--, tex0, 1D */
-	OUT_ACCEL_REG(R300_US_TEX_INST(3), (R300_TEX_INST(R300_TEX_INST_LD) |
-					    R300_TEX_ID(0) |
-					    R300_TEX_SRC_ADDR(3) |
-					    R300_TEX_DST_ADDR(3)));
-
-	/* TEX temp5, temp2.rg--, tex0, 1D */
-	OUT_ACCEL_REG(R300_US_TEX_INST(4), (R300_TEX_INST(R300_TEX_INST_LD) |
-					    R300_TEX_ID(0) |
-					    R300_TEX_SRC_ADDR(2) |
-					    R300_TEX_DST_ADDR(5)));
-
-	/* TEX temp0, temp0.rg--, tex0, 1D */
-	OUT_ACCEL_REG(R300_US_TEX_INST(5), (R300_TEX_INST(R300_TEX_INST_LD) |
-					    R300_TEX_ID(0) |
-					    R300_TEX_SRC_ADDR(0) |
-					    R300_TEX_DST_ADDR(0)));
-
-	/* LRP temp3, temp1.bbbb, temp4, temp3 ->
-	 * - PRESUB temps, temp4 - temp3
-	 * - MAD temp3, temp1.bbbb, temps, temp3 */
-	OUT_ACCEL_REG(R300_US_ALU_RGB_INST(11), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
-						 R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC2_BBB) |
-						 R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRCP_RGB) |
-						 R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) |
-						 R300_ALU_RGB_SRCP_OP(R300_ALU_RGB_SRCP_OP_RGB1_MINUS_RGB0)));
-	OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(11), (R300_ALU_RGB_ADDR0(3) |
-						 R300_ALU_RGB_ADDR1(4) |
-						 R300_ALU_RGB_ADDR2(1) |
-						 R300_ALU_RGB_ADDRD(3) |
-						 R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
-	OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(11), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
-						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC2_B) |
-						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_SRCP_A) |
-						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_SRC0_A)));
-	OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(11), (R300_ALU_ALPHA_ADDR0(3) |
-						   R300_ALU_ALPHA_ADDR1(4) |
-						   R300_ALU_ALPHA_ADDR2(1) |
-						   R300_ALU_ALPHA_ADDRD(3) |
-						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_A)));
-
-	/* LRP temp0, temp1.bbbb, temp5, temp0 ->
-	 * - PRESUB temps, temp5 - temp0
-	 * - MAD temp0, temp1.bbbb, temps, temp0 */
-	OUT_ACCEL_REG(R300_US_ALU_RGB_INST(12), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
-						 R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC2_BBB) |
-						 R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRCP_RGB) |
-						 R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) |
-						 R300_ALU_RGB_SRCP_OP(R300_ALU_RGB_SRCP_OP_RGB1_MINUS_RGB0) |
-						 R300_ALU_RGB_INSERT_NOP));
-	OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(12), (R300_ALU_RGB_ADDR0(0) |
-						 R300_ALU_RGB_ADDR1(5) |
-						 R300_ALU_RGB_ADDR2(1) |
-						 R300_ALU_RGB_ADDRD(0) |
-						 R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
-	OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(12), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
-						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC2_B) |
-						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_SRCP_A) |
-						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_SRC0_A)));
-	OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(12), (R300_ALU_ALPHA_ADDR0(0) |
-						   R300_ALU_ALPHA_ADDR1(5) |
-						   R300_ALU_ALPHA_ADDR2(1) |
-						   R300_ALU_ALPHA_ADDRD(0) |
-						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_A)));
-
-	/* LRP output, temp2.bbbb, temp3, temp0 ->
-	 * - PRESUB temps, temp3 - temp0
-	 * - MAD output, temp2.bbbb, temps, temp0 */
-	OUT_ACCEL_REG(R300_US_ALU_RGB_INST(13), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
-						 R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC2_BBB) |
-						 R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRCP_RGB) |
-						 R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) |
-						 R300_ALU_RGB_SRCP_OP(R300_ALU_RGB_SRCP_OP_RGB1_MINUS_RGB0)));
-	OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(13), (R300_ALU_RGB_ADDR0(0) |
-						 R300_ALU_RGB_ADDR1(3) |
-						 R300_ALU_RGB_ADDR2(2) |
-						 R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_RGB)));
-	OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(13), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
-						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC2_B) |
-						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_SRCP_A) |
-						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_SRC0_A)));
-	OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(13), (R300_ALU_ALPHA_ADDR0(0) |
-						   R300_ALU_ALPHA_ADDR1(3) |
-						   R300_ALU_ALPHA_ADDR2(2) |
-						   R300_ALU_ALPHA_OMASK(R300_ALU_ALPHA_MASK_A)));
-
-	/* Shader constants. */
-	OUT_ACCEL_REG(R300_US_ALU_CONST_R(0), F_TO_24(1.0/(float)pPriv->w));
-	OUT_ACCEL_REG(R300_US_ALU_CONST_G(0), 0);
-	OUT_ACCEL_REG(R300_US_ALU_CONST_B(0), 0);
-	OUT_ACCEL_REG(R300_US_ALU_CONST_A(0), 0);
-
-	OUT_ACCEL_REG(R300_US_ALU_CONST_R(1), 0);
-	OUT_ACCEL_REG(R300_US_ALU_CONST_G(1), F_TO_24(1.0/(float)pPriv->h));
-	OUT_ACCEL_REG(R300_US_ALU_CONST_B(1), 0);
-	OUT_ACCEL_REG(R300_US_ALU_CONST_A(1), 0);
-
-	FINISH_ACCEL();
-    } else if (isplanar) {
-	/*
-	 * y' = y - .0625
-	 * u' = u - .5
-	 * v' = v - .5;
-	 *
-	 * r = 1.1643 * y' + 0.0     * u' + 1.5958  * v'
-	 * g = 1.1643 * y' - 0.39173 * u' - 0.81290 * v'
-	 * b = 1.1643 * y' + 2.017   * u' + 0.0     * v'
-	 *
-	 * DP3 might look like the straightforward solution
-	 * but we'd need to move the texture yuv values in
-	 * the same reg for this to work. Therefore use MADs.
-	 * Brightness just adds to the off constant.
-	 * Contrast is multiplication of luminance.
-	 * Saturation and hue change the u and v coeffs.
-	 * Default values (before adjustments - depend on colorspace):
-	 * yco = 1.1643
-	 * uco = 0, -0.39173, 2.017
-	 * vco = 1.5958, -0.8129, 0
-	 * off = -0.0625 * yco + -0.5 * uco[r] + -0.5 * vco[r],
-	 *       -0.0625 * yco + -0.5 * uco[g] + -0.5 * vco[g],
-	 *       -0.0625 * yco + -0.5 * uco[b] + -0.5 * vco[b],
-	 *
-	 * temp = MAD(yco, yuv.yyyy, off)
-	 * temp = MAD(uco, yuv.uuuu, temp)
-	 * result = MAD(vco, yuv.vvvv, temp)
-	 */
-	/* TODO: don't recalc consts always */
-	const float Loff = -0.0627;
-	const float Coff = -0.502;
-	float uvcosf, uvsinf;
-	float yco;
-	float uco[3], vco[3], off[3];
-	float bright, cont, gamma;
-	int ref = pPriv->transform_index;
-	Bool needgamma = FALSE;
+    if (pPriv->bicubic_state != BICUBIC_OFF) {
+	if (pPriv->bicubic_enabled) {
+	    BEGIN_ACCEL(79);
+
+	    /* 4 components: 2 for tex0 and 2 for tex1 */
+	    OUT_ACCEL_REG(R300_RS_COUNT, ((4 << R300_RS_COUNT_IT_COUNT_SHIFT) |
+					  R300_RS_COUNT_HIRES_EN));
+
+	    /* R300_INST_COUNT_RS - highest RS instruction used */
+	    OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(1) | R300_TX_OFFSET_RS(6));
+
+	    /* Pixel stack frame size. */
+	    OUT_ACCEL_REG(R300_US_PIXSIZE, 5);
+
+	    /* Indirection levels */
+	    OUT_ACCEL_REG(R300_US_CONFIG, ((2 << R300_NLEVEL_SHIFT) |
+					   R300_FIRST_TEX));
+
+	    /* Set nodes. */
+	    OUT_ACCEL_REG(R300_US_CODE_OFFSET, (R300_ALU_CODE_OFFSET(0) |
+						R300_ALU_CODE_SIZE(14) |
+						R300_TEX_CODE_OFFSET(0) |
+						R300_TEX_CODE_SIZE(6)));
+
+	    /* Nodes are allocated highest first, but executed lowest first */
+	    OUT_ACCEL_REG(R300_US_CODE_ADDR_0, 0);
+	    OUT_ACCEL_REG(R300_US_CODE_ADDR_1, (R300_ALU_START(0) |
+						R300_ALU_SIZE(0) |
+						R300_TEX_START(0) |
+						R300_TEX_SIZE(0)));
+	    OUT_ACCEL_REG(R300_US_CODE_ADDR_2, (R300_ALU_START(1) |
+						R300_ALU_SIZE(9) |
+						R300_TEX_START(1) |
+						R300_TEX_SIZE(0)));
+	    OUT_ACCEL_REG(R300_US_CODE_ADDR_3, (R300_ALU_START(11) |
+						R300_ALU_SIZE(2) |
+						R300_TEX_START(2) |
+						R300_TEX_SIZE(3) |
+						R300_RGBA_OUT));
+
+	    /* ** BICUBIC FP ** */
+
+	    /* texcoord0 => temp0
+	     * texcoord1 => temp1 */
+
+	    // first node
+	    /* TEX temp2, temp1.rrr0, tex1, 1D */
+	    OUT_ACCEL_REG(R300_US_TEX_INST(0), (R300_TEX_INST(R300_TEX_INST_LD) |
+						R300_TEX_ID(1) |
+						R300_TEX_SRC_ADDR(1) |
+						R300_TEX_DST_ADDR(2)));
+
+	    /* MOV temp1.r, temp1.ggg0 */
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(0), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
+						    R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_GGG) |
+						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
+						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0)));
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(0), (R300_ALU_RGB_ADDR0(1) |
+						    R300_ALU_RGB_ADDRD(1) |
+						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R)));
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(0), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
+						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
+						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
+						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(0), (R300_ALU_ALPHA_ADDRD(1) |
+						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
 
-	cont = RTFContrast(pPriv->contrast);
-	bright = RTFBrightness(pPriv->brightness);
-	gamma = (float)pPriv->gamma / 1000.0;
-	uvcosf = RTFSaturation(pPriv->saturation) * cos(RTFHue(pPriv->hue));
-	uvsinf = RTFSaturation(pPriv->saturation) * sin(RTFHue(pPriv->hue));
-	/* overlay video also does pre-gamma contrast/sat adjust, should we? */
 
-	yco = trans[ref].RefLuma * cont;
-	uco[0] = -trans[ref].RefRCr * uvsinf;
-	uco[1] = trans[ref].RefGCb * uvcosf - trans[ref].RefGCr * uvsinf;
-	uco[2] = trans[ref].RefBCb * uvcosf;
-	vco[0] = trans[ref].RefRCr * uvcosf;
-	vco[1] = trans[ref].RefGCb * uvsinf + trans[ref].RefGCr * uvcosf;
-	vco[2] = trans[ref].RefBCb * uvsinf;
-	off[0] = Loff * yco + Coff * (uco[0] + vco[0]) + bright;
-	off[1] = Loff * yco + Coff * (uco[1] + vco[1]) + bright;
-	off[2] = Loff * yco + Coff * (uco[2] + vco[2]) + bright;
+	    // second node
+	    /* TEX temp1, temp1, tex1, 1D */
+	    OUT_ACCEL_REG(R300_US_TEX_INST(1), (R300_TEX_INST(R300_TEX_INST_LD) |
+						R300_TEX_ID(1) |
+						R300_TEX_SRC_ADDR(1) |
+						R300_TEX_DST_ADDR(1)));
+
+	    /* MUL temp3.rg, temp2.ggg0, const0.rgb0 */
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(1), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
+						    R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_GGG) |
+						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
+						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0)));
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(1), (R300_ALU_RGB_ADDR0(2) |
+						    R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(0)) |
+						    R300_ALU_RGB_ADDRD(3) |
+						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(1), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
+						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
+						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
+						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(1), (R300_ALU_ALPHA_ADDRD(3) |
+						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
 
-	if (gamma != 1.0) {
-	    needgamma = TRUE;
-	    /* note: gamma correction is out = in ^ gamma;
-	       gpu can only do LG2/EX2 therefore we transform into
-	       in ^ gamma = 2 ^ (log2(in) * gamma).
-	       Lots of scalar ops, unfortunately (better solution?) -
-	       without gamma that's 3 inst, with gamma it's 10...
-	       could use different gamma factors per channel,
-	       if that's of any use. */
-	}
 
-	BEGIN_ACCEL(needgamma ? 28 + 33 : 33);
-	/* 2 components: same 2 for tex0/1/2 */
-	OUT_ACCEL_REG(R300_RS_COUNT,
-		      ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) |
-		       R300_RS_COUNT_HIRES_EN));
-	/* R300_INST_COUNT_RS - highest RS instruction used */
-	OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0) | R300_TX_OFFSET_RS(6));
-
-	OUT_ACCEL_REG(R300_US_PIXSIZE, 2); /* highest temp used */
-
-	/* Indirection levels */
-	OUT_ACCEL_REG(R300_US_CONFIG, ((0 << R300_NLEVEL_SHIFT) |
-				       R300_FIRST_TEX));
-
-	OUT_ACCEL_REG(R300_US_CODE_OFFSET, (R300_ALU_CODE_OFFSET(0) |
-					    R300_ALU_CODE_SIZE(needgamma ? 7 + 3 : 3) |
-					    R300_TEX_CODE_OFFSET(0) |
-					    R300_TEX_CODE_SIZE(3)));
-
-	OUT_ACCEL_REG(R300_US_CODE_ADDR_3, (R300_ALU_START(0) |
-					    R300_ALU_SIZE(needgamma ? 7 + 2 : 2) |
-					    R300_TEX_START(0) |
-					    R300_TEX_SIZE(2) |
-					    R300_RGBA_OUT));
-
-	/* tex inst */
-	OUT_ACCEL_REG(R300_US_TEX_INST_0, (R300_TEX_SRC_ADDR(0) |
-					   R300_TEX_DST_ADDR(2) |
-					   R300_TEX_ID(0) |
-					   R300_TEX_INST(R300_TEX_INST_LD)));
-	OUT_ACCEL_REG(R300_US_TEX_INST_1, (R300_TEX_SRC_ADDR(0) |
-					   R300_TEX_DST_ADDR(1) |
-					   R300_TEX_ID(1) |
-					   R300_TEX_INST(R300_TEX_INST_LD)));
-	OUT_ACCEL_REG(R300_US_TEX_INST_2, (R300_TEX_SRC_ADDR(0) |
-					   R300_TEX_DST_ADDR(0) |
-					   R300_TEX_ID(2) |
-					   R300_TEX_INST(R300_TEX_INST_LD)));
-
-	/* ALU inst */
-	/* MAD temp2.rgb, const0.aaa, temp2.rgb, const0.rgb */
-	OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(0), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(0)) |
-						R300_ALU_RGB_ADDR1(2) |
-						R300_ALU_RGB_ADDR2(0) |
-						R300_ALU_RGB_ADDRD(2) |
-						R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
-	OUT_ACCEL_REG(R300_US_ALU_RGB_INST(0), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_AAA) |
-						R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
-						R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
-						R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
-						R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) |
-						R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
-						R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
-						R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
-	/* alpha nop, but need to set up alpha source for rgb usage */
-	OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(0), (R300_ALU_ALPHA_ADDR0(R300_ALU_ALPHA_CONST(0)) |
-						  R300_ALU_ALPHA_ADDR1(2) |
-						  R300_ALU_ALPHA_ADDR2(0) |
-						  R300_ALU_ALPHA_ADDRD(2) |
-						  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
-	OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(0), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
-						  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
-						  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
-						  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
-
-	/* MAD temp2.rgb, const1.rgb, temp1.rgb, temp2.rgb */
-	OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(1), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(1)) |
-						R300_ALU_RGB_ADDR1(1) |
-						R300_ALU_RGB_ADDR2(2) |
-						R300_ALU_RGB_ADDRD(2) |
-						R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
-	OUT_ACCEL_REG(R300_US_ALU_RGB_INST(1), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
-						R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
-						R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
-						R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
-						R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB) |
-						R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
-						R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
-						R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
-	/* alpha nop */
-	OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(1), (R300_ALU_ALPHA_ADDRD(2) |
-						  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
-	OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(1), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
-						  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
-						  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
-						  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
-
-	/* MAD result.rgb, const2.rgb, temp0.rgb, temp2.rgb */
-	OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(2), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(2)) |
-						R300_ALU_RGB_ADDR1(0) |
-						R300_ALU_RGB_ADDR2(2) |
-						R300_ALU_RGB_ADDRD(0) |
-						R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB) |
-						(needgamma ? 0 : R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_RGB))));
-	OUT_ACCEL_REG(R300_US_ALU_RGB_INST(2), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
-						R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
-						R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
-						R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
-						R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB) |
-						R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
-						R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
-						R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE) |
-						R300_ALU_RGB_CLAMP));
-	/* write alpha 1 */
-	OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(2), (R300_ALU_ALPHA_ADDRD(0) |
-						  R300_ALU_ALPHA_OMASK(R300_ALU_ALPHA_MASK_A) |
-						  R300_ALU_ALPHA_TARGET_A));
-	OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(2), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
-						  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
-						  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
-						  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_1_0)));
-
-	if (needgamma) {
-	    /* rgb temp0.r = op_sop, set up src0 reg */
-	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(3), (R300_ALU_RGB_ADDR0(0) |
-						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R)));
-	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(3),
-			  R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
-			  R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
-	    /* alpha lg2 temp0, temp0.r */
-	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(3), (R300_ALU_ALPHA_ADDRD(0) |
-						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
-	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(3), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_LN2) |
-						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_R) |
+	    /* MUL temp2.rg, temp2.rrr0, const0.rgb */
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(2), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
+						    R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RRR) |
+						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
+						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0)));
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(2), (R300_ALU_RGB_ADDR0(2) |
+						    R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(0)) |
+						    R300_ALU_RGB_ADDRD(2) |
+						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(2), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
+						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
 						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
 						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
-
-	    /* rgb temp0.g = op_sop, set up src0 reg */
-	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(4), (R300_ALU_RGB_ADDR0(0) |
-						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_G)));
-	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(4),
-			  R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
-			  R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
-	    /* alpha lg2 temp0, temp0.g */
-	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(4), (R300_ALU_ALPHA_ADDRD(0) |
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(2), (R300_ALU_ALPHA_ADDRD(2) |
 						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
-	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(4), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_LN2) |
-						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_G) |
+
+	    /* MAD temp4.rg, temp1.ggg0, const1.rgb, temp3.rgb0 */
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(3), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
+						    R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_GGG) |
+						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
+						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(3), (R300_ALU_RGB_ADDR0(1) |
+						    R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(1)) |
+						    R300_ALU_RGB_ADDR2(3) |
+						    R300_ALU_RGB_ADDRD(4) |
+						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(3), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
+						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
 						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
 						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
-
-	    /* rgb temp0.b = op_sop, set up src0 reg */
-	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(5), (R300_ALU_RGB_ADDR0(0) |
-						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_B)));
-	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(5),
-			  R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
-			  R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
-	    /* alpha lg2 temp0, temp0.b */
-	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(5), (R300_ALU_ALPHA_ADDRD(0) |
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(3), (R300_ALU_ALPHA_ADDRD(4) |
 						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
-	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(5), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_LN2) |
-						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_B) |
+
+	    /* MAD temp5.rg, temp1.ggg0, const1.rgb, temp2.rgb0 */
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(4), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
+						    R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_GGG) |
+						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
+						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(4), (R300_ALU_RGB_ADDR0(1) |
+						    R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(1)) |
+						    R300_ALU_RGB_ADDR2(2) |
+						    R300_ALU_RGB_ADDRD(5) |
+						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(4), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
+						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
 						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
 						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(4), (R300_ALU_ALPHA_ADDRD(5) |
+						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
 
-	    /* MUL const1, temp1, temp0 */
-	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(6), (R300_ALU_RGB_ADDR0(0) |
-						    R300_ALU_RGB_ADDR1(0) |
-						    R300_ALU_RGB_ADDR2(0) |
-						    R300_ALU_RGB_ADDRD(0) |
-						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
-	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(6), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
-						    R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
-						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC0_AAA) |
-						    R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
-						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0) |
-						    R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
-						    R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
-						    R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
-	    /* alpha nop, but set up const1 */
-	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(6), (R300_ALU_ALPHA_ADDRD(0) |
-						      R300_ALU_ALPHA_ADDR0(R300_ALU_ALPHA_CONST(1)) |
+	    /* MAD temp3.rg, temp1.rrr0, const1.rgb, temp3.rgb0 */
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(5), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
+						    R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RRR) |
+						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
+						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(5), (R300_ALU_RGB_ADDR0(1) |
+						    R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(1)) |
+						    R300_ALU_RGB_ADDR2(3) |
+						    R300_ALU_RGB_ADDRD(3) |
+						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(5), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
+						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
+						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
+						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(5), (R300_ALU_ALPHA_ADDRD(3) |
 						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+
+	    /* MAD temp1.rg, temp1.rrr0, const1.rgb, temp2.rgb0 */
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(6), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
+						    R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RRR) |
+						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
+						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(6), (R300_ALU_RGB_ADDR0(1) |
+						    R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(1)) |
+						    R300_ALU_RGB_ADDR2(2) |
+						    R300_ALU_RGB_ADDRD(1) |
+						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
 	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(6), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
 						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
 						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
 						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(6), (R300_ALU_ALPHA_ADDRD(1) |
+						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
 
-	    /* rgb out0.r = op_sop, set up src0 reg */
+	    /* ADD temp1.rg, temp0.rgb0, temp1.rgb0 */
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(7), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
+						    R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
+						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
+						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
 	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(7), (R300_ALU_RGB_ADDR0(0) |
-						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R) |
-						    R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_R)));
-	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(7),
-			  R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
-			  R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
-	    /* alpha ex2 temp0, temp0.r */
-	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(7), (R300_ALU_ALPHA_ADDRD(0) |
-						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
-	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(7), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_EX2) |
-						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_R) |
+						    R300_ALU_RGB_ADDR2(1) |
+						    R300_ALU_RGB_ADDRD(1) |
+						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(7), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
+						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
 						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
 						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(7), (R300_ALU_ALPHA_ADDRD(1) |
+						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
 
-	    /* rgb out0.g = op_sop, set up src0 reg */
+	    /* ADD temp2.rg, temp0.rgb0, temp3.rgb0 */
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(8), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
+						    R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
+						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
+						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
 	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(8), (R300_ALU_RGB_ADDR0(0) |
-						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_G) |
-						    R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_G)));
-	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(8),
-			  R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
-			  R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
-	    /* alpha ex2 temp0, temp0.g */
-	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(8), (R300_ALU_ALPHA_ADDRD(0) |
-						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
-	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(8), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_EX2) |
-						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_G) |
+						    R300_ALU_RGB_ADDR2(3) |
+						    R300_ALU_RGB_ADDRD(2) |
+						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(8), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
+						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
 						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
 						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(8), (R300_ALU_ALPHA_ADDRD(2) |
+						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
 
-	    /* rgb out0.b = op_sop, set up src0 reg */
+	    /* ADD temp3.rg, temp0.rgb0, temp5.rgb0 */
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(9), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
+						    R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
+						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
+						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
 	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(9), (R300_ALU_RGB_ADDR0(0) |
-						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_B) |
-						    R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_B)));
-	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(9),
-			  R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
-			  R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
-	    /* alpha ex2 temp0, temp0.b */
-	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(9), (R300_ALU_ALPHA_ADDRD(0) |
-						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
-	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(9), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_EX2) |
-						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_B) |
+						    R300_ALU_RGB_ADDR2(5) |
+						    R300_ALU_RGB_ADDRD(3) |
+						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(9), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
+						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
 						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
 						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
-	}
-
-	/* Shader constants. */
-	/* constant 0: off, yco */
-	OUT_ACCEL_REG(R300_US_ALU_CONST_R(0), F_TO_24(off[0]));
-	OUT_ACCEL_REG(R300_US_ALU_CONST_G(0), F_TO_24(off[1]));
-	OUT_ACCEL_REG(R300_US_ALU_CONST_B(0), F_TO_24(off[2]));
-	OUT_ACCEL_REG(R300_US_ALU_CONST_A(0), F_TO_24(yco));
-	/* constant 1: uco */
-	OUT_ACCEL_REG(R300_US_ALU_CONST_R(1), F_TO_24(uco[0]));
-	OUT_ACCEL_REG(R300_US_ALU_CONST_G(1), F_TO_24(uco[1]));
-	OUT_ACCEL_REG(R300_US_ALU_CONST_B(1), F_TO_24(uco[2]));
-	OUT_ACCEL_REG(R300_US_ALU_CONST_A(1), F_TO_24(gamma));
-	/* constant 2: vco */
-	OUT_ACCEL_REG(R300_US_ALU_CONST_R(2), F_TO_24(vco[0]));
-	OUT_ACCEL_REG(R300_US_ALU_CONST_G(2), F_TO_24(vco[1]));
-	OUT_ACCEL_REG(R300_US_ALU_CONST_B(2), F_TO_24(vco[2]));
-	OUT_ACCEL_REG(R300_US_ALU_CONST_A(2), F_TO_24(0.0));
-
-	FINISH_ACCEL();
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(9), (R300_ALU_ALPHA_ADDRD(3) |
+						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
 
+	    /* ADD temp0.rg, temp0.rgb0, temp4.rgb0 */
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(10), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
+						     R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
+						     R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
+						     R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(10), (R300_ALU_RGB_ADDR0(0) |
+						     R300_ALU_RGB_ADDR2(4) |
+						     R300_ALU_RGB_ADDRD(0) |
+						     R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(10), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
+						       R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
+						       R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
+						       R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(10), (R300_ALU_ALPHA_ADDRD(0) |
+						       R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+
+
+	    // third node
+	    /* TEX temp4, temp1.rg--, tex0, 1D */
+	    OUT_ACCEL_REG(R300_US_TEX_INST(2), (R300_TEX_INST(R300_TEX_INST_LD) |
+						R300_TEX_ID(0) |
+						R300_TEX_SRC_ADDR(1) |
+						R300_TEX_DST_ADDR(4)));
+
+	    /* TEX temp3, temp3.rg--, tex0, 1D */
+	    OUT_ACCEL_REG(R300_US_TEX_INST(3), (R300_TEX_INST(R300_TEX_INST_LD) |
+						R300_TEX_ID(0) |
+						R300_TEX_SRC_ADDR(3) |
+						R300_TEX_DST_ADDR(3)));
+
+	    /* TEX temp5, temp2.rg--, tex0, 1D */
+	    OUT_ACCEL_REG(R300_US_TEX_INST(4), (R300_TEX_INST(R300_TEX_INST_LD) |
+						R300_TEX_ID(0) |
+						R300_TEX_SRC_ADDR(2) |
+						R300_TEX_DST_ADDR(5)));
+
+	    /* TEX temp0, temp0.rg--, tex0, 1D */
+	    OUT_ACCEL_REG(R300_US_TEX_INST(5), (R300_TEX_INST(R300_TEX_INST_LD) |
+						R300_TEX_ID(0) |
+						R300_TEX_SRC_ADDR(0) |
+						R300_TEX_DST_ADDR(0)));
+
+	    /* LRP temp3, temp1.bbbb, temp4, temp3 ->
+	     * - PRESUB temps, temp4 - temp3
+	     * - MAD temp3, temp1.bbbb, temps, temp3 */
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(11), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
+						     R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC2_BBB) |
+						     R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRCP_RGB) |
+						     R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) |
+						     R300_ALU_RGB_SRCP_OP(R300_ALU_RGB_SRCP_OP_RGB1_MINUS_RGB0)));
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(11), (R300_ALU_RGB_ADDR0(3) |
+						     R300_ALU_RGB_ADDR1(4) |
+						     R300_ALU_RGB_ADDR2(1) |
+						     R300_ALU_RGB_ADDRD(3) |
+						     R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(11), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
+						       R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC2_B) |
+						       R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_SRCP_A) |
+						       R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_SRC0_A)));
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(11), (R300_ALU_ALPHA_ADDR0(3) |
+						       R300_ALU_ALPHA_ADDR1(4) |
+						       R300_ALU_ALPHA_ADDR2(1) |
+						       R300_ALU_ALPHA_ADDRD(3) |
+						       R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_A)));
+
+	    /* LRP temp0, temp1.bbbb, temp5, temp0 ->
+	     * - PRESUB temps, temp5 - temp0
+	     * - MAD temp0, temp1.bbbb, temps, temp0 */
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(12), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
+						     R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC2_BBB) |
+						     R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRCP_RGB) |
+						     R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) |
+						     R300_ALU_RGB_SRCP_OP(R300_ALU_RGB_SRCP_OP_RGB1_MINUS_RGB0) |
+						     R300_ALU_RGB_INSERT_NOP));
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(12), (R300_ALU_RGB_ADDR0(0) |
+						     R300_ALU_RGB_ADDR1(5) |
+						     R300_ALU_RGB_ADDR2(1) |
+						     R300_ALU_RGB_ADDRD(0) |
+						     R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(12), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
+						       R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC2_B) |
+						       R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_SRCP_A) |
+						       R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_SRC0_A)));
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(12), (R300_ALU_ALPHA_ADDR0(0) |
+						       R300_ALU_ALPHA_ADDR1(5) |
+						       R300_ALU_ALPHA_ADDR2(1) |
+						       R300_ALU_ALPHA_ADDRD(0) |
+						       R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_A)));
+
+	    /* LRP output, temp2.bbbb, temp3, temp0 ->
+	     * - PRESUB temps, temp3 - temp0
+	     * - MAD output, temp2.bbbb, temps, temp0 */
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(13), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
+						     R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC2_BBB) |
+						     R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRCP_RGB) |
+						     R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) |
+						     R300_ALU_RGB_SRCP_OP(R300_ALU_RGB_SRCP_OP_RGB1_MINUS_RGB0)));
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(13), (R300_ALU_RGB_ADDR0(0) |
+						     R300_ALU_RGB_ADDR1(3) |
+						     R300_ALU_RGB_ADDR2(2) |
+						     R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_RGB)));
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(13), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
+						       R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC2_B) |
+						       R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_SRCP_A) |
+						       R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_SRC0_A)));
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(13), (R300_ALU_ALPHA_ADDR0(0) |
+						       R300_ALU_ALPHA_ADDR1(3) |
+						       R300_ALU_ALPHA_ADDR2(2) |
+						       R300_ALU_ALPHA_OMASK(R300_ALU_ALPHA_MASK_A)));
+
+	    /* Shader constants. */
+	    OUT_ACCEL_REG(R300_US_ALU_CONST_R(0), F_TO_24(1.0/(float)pPriv->w));
+	    OUT_ACCEL_REG(R300_US_ALU_CONST_G(0), 0);
+	    OUT_ACCEL_REG(R300_US_ALU_CONST_B(0), 0);
+	    OUT_ACCEL_REG(R300_US_ALU_CONST_A(0), 0);
+
+	    OUT_ACCEL_REG(R300_US_ALU_CONST_R(1), 0);
+	    OUT_ACCEL_REG(R300_US_ALU_CONST_G(1), F_TO_24(1.0/(float)pPriv->h));
+	    OUT_ACCEL_REG(R300_US_ALU_CONST_B(1), 0);
+	    OUT_ACCEL_REG(R300_US_ALU_CONST_A(1), 0);
+
+	    FINISH_ACCEL();
+	} else {
+	    BEGIN_ACCEL(11);
+	    /* 2 components: 2 for tex0 */
+	    OUT_ACCEL_REG(R300_RS_COUNT,
+                          ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) |
+                           R300_RS_COUNT_HIRES_EN));
+	    /* R300_INST_COUNT_RS - highest RS instruction used */
+	    OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0) | R300_TX_OFFSET_RS(6));
+
+	    OUT_ACCEL_REG(R300_US_PIXSIZE, 0); /* highest temp used */
+
+	    /* Indirection levels */
+	    OUT_ACCEL_REG(R300_US_CONFIG, ((0 << R300_NLEVEL_SHIFT) |
+					   R300_FIRST_TEX));
+
+	    OUT_ACCEL_REG(R300_US_CODE_OFFSET, (R300_ALU_CODE_OFFSET(0) |
+						R300_ALU_CODE_SIZE(1) |
+						R300_TEX_CODE_OFFSET(0) |
+						R300_TEX_CODE_SIZE(1)));
+
+	    OUT_ACCEL_REG(R300_US_CODE_ADDR_3, (R300_ALU_START(0) |
+						R300_ALU_SIZE(0) |
+						R300_TEX_START(0) |
+						R300_TEX_SIZE(0) |
+						R300_RGBA_OUT));
+
+	    /* tex inst */
+	    OUT_ACCEL_REG(R300_US_TEX_INST_0, (R300_TEX_SRC_ADDR(0) |
+					       R300_TEX_DST_ADDR(0) |
+					       R300_TEX_ID(0) |
+					       R300_TEX_INST(R300_TEX_INST_LD)));
+
+	    /* ALU inst */
+	    /* RGB */
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR_0, (R300_ALU_RGB_ADDR0(0) |
+                                                   R300_ALU_RGB_ADDR1(0) |
+                                                   R300_ALU_RGB_ADDR2(0) |
+                                                   R300_ALU_RGB_ADDRD(0) |
+                                                   R300_ALU_RGB_OMASK((R300_ALU_RGB_MASK_R |
+								       R300_ALU_RGB_MASK_G |
+								       R300_ALU_RGB_MASK_B)) |
+                                                   R300_ALU_RGB_TARGET_A));
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST_0, (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
+                                                   R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
+                                                   R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
+						   R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
+                                                   R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0) |
+                                                   R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
+                                                   R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
+                                                   R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE) |
+                                                   R300_ALU_RGB_CLAMP));
+	    /* Alpha */
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR_0, (R300_ALU_ALPHA_ADDR0(0) |
+						     R300_ALU_ALPHA_ADDR1(0) |
+						     R300_ALU_ALPHA_ADDR2(0) |
+						     R300_ALU_ALPHA_ADDRD(0) |
+						     R300_ALU_ALPHA_OMASK(R300_ALU_ALPHA_MASK_A) |
+						     R300_ALU_ALPHA_TARGET_A |
+						     R300_ALU_ALPHA_OMASK_W(R300_ALU_ALPHA_MASK_NONE)));
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST_0, (R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_A) |
+						     R300_ALU_ALPHA_MOD_A(R300_ALU_ALPHA_MOD_NOP) |
+						     R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_1_0) |
+						     R300_ALU_ALPHA_MOD_B(R300_ALU_ALPHA_MOD_NOP) |
+						     R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0) |
+						     R300_ALU_ALPHA_MOD_C(R300_ALU_ALPHA_MOD_NOP) |
+						     R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
+						     R300_ALU_ALPHA_OMOD(R300_ALU_ALPHA_OMOD_NONE) |
+						     R300_ALU_ALPHA_CLAMP));
+	    FINISH_ACCEL();
+	}
     } else {
 	/*
 	 * y' = y - .0625
@@ -1972,220 +1733,446 @@ FUNC_NAME(R300DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 	       if that's of any use. */
 	}
 
-	BEGIN_ACCEL(needgamma ? 28 + 33 : 33);
-	/* 2 components */
-	OUT_ACCEL_REG(R300_RS_COUNT,
-		      ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) |
-		       R300_RS_COUNT_HIRES_EN));
-	/* R300_INST_COUNT_RS - highest RS instruction used */
-	OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0) | R300_TX_OFFSET_RS(6));
-
-	OUT_ACCEL_REG(R300_US_PIXSIZE, 1); /* highest temp used */
-
-	/* Indirection levels */
-	OUT_ACCEL_REG(R300_US_CONFIG, ((0 << R300_NLEVEL_SHIFT) |
-				       R300_FIRST_TEX));
-
-	OUT_ACCEL_REG(R300_US_CODE_OFFSET, (R300_ALU_CODE_OFFSET(0) |
-					    R300_ALU_CODE_SIZE(needgamma ? 7 + 3 : 3) |
-					    R300_TEX_CODE_OFFSET(0) |
-					    R300_TEX_CODE_SIZE(1)));
-
-	OUT_ACCEL_REG(R300_US_CODE_ADDR_3, (R300_ALU_START(0) |
-					    R300_ALU_SIZE(needgamma ? 7 + 2 : 2) |
-					    R300_TEX_START(0) |
-					    R300_TEX_SIZE(0) |
-					    R300_RGBA_OUT));
-
-	/* tex inst */
-	OUT_ACCEL_REG(R300_US_TEX_INST_0, (R300_TEX_SRC_ADDR(0) |
-					   R300_TEX_DST_ADDR(0) |
-					   R300_TEX_ID(0) |
-					   R300_TEX_INST(R300_TEX_INST_LD)));
-
-	/* ALU inst */
-	/* MAD temp1.rgb, const0.aaa, temp0.ggg, const0.rgb */
-	OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(0), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(0)) |
-						R300_ALU_RGB_ADDR1(0) |
-						R300_ALU_RGB_ADDR2(0) |
-						R300_ALU_RGB_ADDRD(1) |
-						R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
-	OUT_ACCEL_REG(R300_US_ALU_RGB_INST(0), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_AAA) |
-						R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
-						R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_GGG) |
-						R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
-						R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) |
-						R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
-						R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
-						R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
-	/* alpha nop, but need to set up alpha source for rgb usage */
-	OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(0), (R300_ALU_ALPHA_ADDR0(R300_ALU_ALPHA_CONST(0)) |
-						  R300_ALU_ALPHA_ADDR1(0) |
-						  R300_ALU_ALPHA_ADDR2(0) |
-						  R300_ALU_ALPHA_ADDRD(0) |
-						  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
-	OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(0), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
-						  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
-						  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
-						  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
-
-	/* MAD temp1.rgb, const1.rgb, temp0.bbb, temp1.rgb */
-	OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(1), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(1)) |
-						R300_ALU_RGB_ADDR1(0) |
-						R300_ALU_RGB_ADDR2(1) |
-						R300_ALU_RGB_ADDRD(1) |
-						R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
-	OUT_ACCEL_REG(R300_US_ALU_RGB_INST(1), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
-						R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
-						R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_BBB) |
-						R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
-						R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB) |
-						R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
-						R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
-						R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
-	/* alpha nop */
-	OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(1), (R300_ALU_ALPHA_ADDRD(0) |
-						  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
-	OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(1), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
-						  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
-						  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
-						  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
-
-	/* MAD result.rgb, const2.rgb, temp0.rrr, temp1.rgb */
-	OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(2), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(2)) |
-						R300_ALU_RGB_ADDR1(0) |
-						R300_ALU_RGB_ADDR2(1) |
-						R300_ALU_RGB_ADDRD(0) |
-						R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB) |
-						(needgamma ? 0 : R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_RGB))));
-	OUT_ACCEL_REG(R300_US_ALU_RGB_INST(2), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
-						R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
-						R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RRR) |
-						R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
-						R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB) |
-						R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
-						R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
-						R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE) |
-						R300_ALU_RGB_CLAMP));
-	/* write alpha 1 */
-	OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(2), (R300_ALU_ALPHA_ADDRD(0) |
-						  R300_ALU_ALPHA_OMASK(R300_ALU_ALPHA_MASK_A) |
-						  R300_ALU_ALPHA_TARGET_A));
-	OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(2), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
-						  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
-						  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
-						  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_1_0)));
-
-	if (needgamma) {
-	    /* rgb temp0.r = op_sop, set up src0 reg */
-	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(3), (R300_ALU_RGB_ADDR0(0) |
-						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R)));
-	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(3),
-			  R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
-			  R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
-	    /* alpha lg2 temp0, temp0.r */
-	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(3), (R300_ALU_ALPHA_ADDRD(0) |
+	if (isplanar) {
+	    BEGIN_ACCEL(needgamma ? 28 + 33 : 33);
+	    /* 2 components: same 2 for tex0/1/2 */
+	    OUT_ACCEL_REG(R300_RS_COUNT,
+			  ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) |
+			   R300_RS_COUNT_HIRES_EN));
+	    /* R300_INST_COUNT_RS - highest RS instruction used */
+	    OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0) | R300_TX_OFFSET_RS(6));
+
+	    OUT_ACCEL_REG(R300_US_PIXSIZE, 2); /* highest temp used */
+
+	    /* Indirection levels */
+	    OUT_ACCEL_REG(R300_US_CONFIG, ((0 << R300_NLEVEL_SHIFT) |
+					   R300_FIRST_TEX));
+
+	    OUT_ACCEL_REG(R300_US_CODE_OFFSET, (R300_ALU_CODE_OFFSET(0) |
+						R300_ALU_CODE_SIZE(needgamma ? 7 + 3 : 3) |
+						R300_TEX_CODE_OFFSET(0) |
+						R300_TEX_CODE_SIZE(3)));
+
+	    OUT_ACCEL_REG(R300_US_CODE_ADDR_3, (R300_ALU_START(0) |
+						R300_ALU_SIZE(needgamma ? 7 + 2 : 2) |
+						R300_TEX_START(0) |
+						R300_TEX_SIZE(2) |
+						R300_RGBA_OUT));
+
+	    /* tex inst */
+	    OUT_ACCEL_REG(R300_US_TEX_INST_0, (R300_TEX_SRC_ADDR(0) |
+					       R300_TEX_DST_ADDR(2) |
+					       R300_TEX_ID(0) |
+					       R300_TEX_INST(R300_TEX_INST_LD)));
+	    OUT_ACCEL_REG(R300_US_TEX_INST_1, (R300_TEX_SRC_ADDR(0) |
+					       R300_TEX_DST_ADDR(1) |
+					       R300_TEX_ID(1) |
+					       R300_TEX_INST(R300_TEX_INST_LD)));
+	    OUT_ACCEL_REG(R300_US_TEX_INST_2, (R300_TEX_SRC_ADDR(0) |
+					       R300_TEX_DST_ADDR(0) |
+					       R300_TEX_ID(2) |
+					       R300_TEX_INST(R300_TEX_INST_LD)));
+
+	    /* ALU inst */
+	    /* MAD temp2.rgb, const0.aaa, temp2.rgb, const0.rgb */
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(0), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(0)) |
+						    R300_ALU_RGB_ADDR1(2) |
+						    R300_ALU_RGB_ADDR2(0) |
+						    R300_ALU_RGB_ADDRD(2) |
+						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(0), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_AAA) |
+						    R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
+						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
+						    R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
+						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) |
+						    R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
+						    R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
+						    R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
+	    /* alpha nop, but need to set up alpha source for rgb usage */
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(0), (R300_ALU_ALPHA_ADDR0(R300_ALU_ALPHA_CONST(0)) |
+						      R300_ALU_ALPHA_ADDR1(2) |
+						      R300_ALU_ALPHA_ADDR2(0) |
+						      R300_ALU_ALPHA_ADDRD(2) |
 						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
-	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(3), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_LN2) |
-						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_R) |
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(0), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
+						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
 						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
 						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
 
-	    /* rgb temp0.g = op_sop, set up src0 reg */
-	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(4), (R300_ALU_RGB_ADDR0(0) |
-						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_G)));
-	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(4),
-			  R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
-			  R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
-	    /* alpha lg2 temp0, temp0.g */
-	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(4), (R300_ALU_ALPHA_ADDRD(0) |
+	    /* MAD temp2.rgb, const1.rgb, temp1.rgb, temp2.rgb */
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(1), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(1)) |
+						    R300_ALU_RGB_ADDR1(1) |
+						    R300_ALU_RGB_ADDR2(2) |
+						    R300_ALU_RGB_ADDRD(2) |
+						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(1), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
+						    R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
+						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
+						    R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
+						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB) |
+						    R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
+						    R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
+						    R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
+	    /* alpha nop */
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(1), (R300_ALU_ALPHA_ADDRD(2) |
 						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
-	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(4), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_LN2) |
-						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_G) |
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(1), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
+						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
 						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
 						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
 
-	    /* rgb temp0.b = op_sop, set up src0 reg */
-	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(5), (R300_ALU_RGB_ADDR0(0) |
-						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_B)));
-	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(5),
-			  R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
-			  R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
-	    /* alpha lg2 temp0, temp0.b */
-	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(5), (R300_ALU_ALPHA_ADDRD(0) |
-						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
-	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(5), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_LN2) |
-						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_B) |
+	    /* MAD result.rgb, const2.rgb, temp0.rgb, temp2.rgb */
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(2), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(2)) |
+						    R300_ALU_RGB_ADDR1(0) |
+						    R300_ALU_RGB_ADDR2(2) |
+						    R300_ALU_RGB_ADDRD(0) |
+						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB) |
+						    (needgamma ? 0 : R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_RGB))));
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(2), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
+						    R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
+						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
+						    R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
+						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB) |
+						    R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
+						    R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
+						    R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE) |
+						    R300_ALU_RGB_CLAMP));
+	    /* write alpha 1 */
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(2), (R300_ALU_ALPHA_ADDRD(0) |
+						      R300_ALU_ALPHA_OMASK(R300_ALU_ALPHA_MASK_A) |
+						      R300_ALU_ALPHA_TARGET_A));
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(2), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
+						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
 						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
-						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
-
-	    /* MUL const1, temp1, temp0 */
-	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(6), (R300_ALU_RGB_ADDR0(0) |
+						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_1_0)));
+
+	    if (needgamma) {
+		/* rgb temp0.r = op_sop, set up src0 reg */
+		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(3), (R300_ALU_RGB_ADDR0(0) |
+							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R)));
+		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(3),
+			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
+			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
+		/* alpha lg2 temp0, temp0.r */
+		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(3), (R300_ALU_ALPHA_ADDRD(0) |
+							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(3), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_LN2) |
+							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_R) |
+							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
+							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+
+		/* rgb temp0.g = op_sop, set up src0 reg */
+		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(4), (R300_ALU_RGB_ADDR0(0) |
+							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_G)));
+		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(4),
+			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
+			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
+		/* alpha lg2 temp0, temp0.g */
+		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(4), (R300_ALU_ALPHA_ADDRD(0) |
+							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(4), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_LN2) |
+							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_G) |
+							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
+							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+
+		/* rgb temp0.b = op_sop, set up src0 reg */
+		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(5), (R300_ALU_RGB_ADDR0(0) |
+							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_B)));
+		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(5),
+			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
+			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
+		/* alpha lg2 temp0, temp0.b */
+		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(5), (R300_ALU_ALPHA_ADDRD(0) |
+							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(5), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_LN2) |
+							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_B) |
+							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
+							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+
+		/* MUL const1, temp1, temp0 */
+		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(6), (R300_ALU_RGB_ADDR0(0) |
+							R300_ALU_RGB_ADDR1(0) |
+							R300_ALU_RGB_ADDR2(0) |
+							R300_ALU_RGB_ADDRD(0) |
+							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
+		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(6), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
+							R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
+							R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC0_AAA) |
+							R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
+							R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0) |
+							R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
+							R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
+							R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
+		/* alpha nop, but set up const1 */
+		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(6), (R300_ALU_ALPHA_ADDRD(0) |
+							  R300_ALU_ALPHA_ADDR0(R300_ALU_ALPHA_CONST(1)) |
+							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(6), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
+							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
+							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
+							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+
+		/* rgb out0.r = op_sop, set up src0 reg */
+		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(7), (R300_ALU_RGB_ADDR0(0) |
+							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R) |
+							R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_R)));
+		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(7),
+			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
+			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
+		/* alpha ex2 temp0, temp0.r */
+		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(7), (R300_ALU_ALPHA_ADDRD(0) |
+							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(7), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_EX2) |
+							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_R) |
+							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
+							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+
+		/* rgb out0.g = op_sop, set up src0 reg */
+		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(8), (R300_ALU_RGB_ADDR0(0) |
+							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_G) |
+							R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_G)));
+		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(8),
+			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
+			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
+		/* alpha ex2 temp0, temp0.g */
+		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(8), (R300_ALU_ALPHA_ADDRD(0) |
+							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(8), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_EX2) |
+							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_G) |
+							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
+							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+
+		/* rgb out0.b = op_sop, set up src0 reg */
+		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(9), (R300_ALU_RGB_ADDR0(0) |
+							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_B) |
+							R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_B)));
+		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(9),
+			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
+			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
+		/* alpha ex2 temp0, temp0.b */
+		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(9), (R300_ALU_ALPHA_ADDRD(0) |
+							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(9), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_EX2) |
+							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_B) |
+							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
+							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+	    }
+	} else {
+	    BEGIN_ACCEL(needgamma ? 28 + 33 : 33);
+	    /* 2 components */
+	    OUT_ACCEL_REG(R300_RS_COUNT,
+			  ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) |
+			   R300_RS_COUNT_HIRES_EN));
+	    /* R300_INST_COUNT_RS - highest RS instruction used */
+	    OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0) | R300_TX_OFFSET_RS(6));
+
+	    OUT_ACCEL_REG(R300_US_PIXSIZE, 1); /* highest temp used */
+
+	    /* Indirection levels */
+	    OUT_ACCEL_REG(R300_US_CONFIG, ((0 << R300_NLEVEL_SHIFT) |
+					   R300_FIRST_TEX));
+
+	    OUT_ACCEL_REG(R300_US_CODE_OFFSET, (R300_ALU_CODE_OFFSET(0) |
+						R300_ALU_CODE_SIZE(needgamma ? 7 + 3 : 3) |
+						R300_TEX_CODE_OFFSET(0) |
+						R300_TEX_CODE_SIZE(1)));
+
+	    OUT_ACCEL_REG(R300_US_CODE_ADDR_3, (R300_ALU_START(0) |
+						R300_ALU_SIZE(needgamma ? 7 + 2 : 2) |
+						R300_TEX_START(0) |
+						R300_TEX_SIZE(0) |
+						R300_RGBA_OUT));
+
+	    /* tex inst */
+	    OUT_ACCEL_REG(R300_US_TEX_INST_0, (R300_TEX_SRC_ADDR(0) |
+					       R300_TEX_DST_ADDR(0) |
+					       R300_TEX_ID(0) |
+					       R300_TEX_INST(R300_TEX_INST_LD)));
+
+	    /* ALU inst */
+	    /* MAD temp1.rgb, const0.aaa, temp0.ggg, const0.rgb */
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(0), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(0)) |
 						    R300_ALU_RGB_ADDR1(0) |
 						    R300_ALU_RGB_ADDR2(0) |
-						    R300_ALU_RGB_ADDRD(0) |
+						    R300_ALU_RGB_ADDRD(1) |
 						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
-	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(6), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(0), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_AAA) |
 						    R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
-						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC0_AAA) |
+						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_GGG) |
 						    R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
-						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0) |
+						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) |
 						    R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
 						    R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
 						    R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
-	    /* alpha nop, but set up const1 */
-	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(6), (R300_ALU_ALPHA_ADDRD(0) |
-						      R300_ALU_ALPHA_ADDR0(R300_ALU_ALPHA_CONST(1)) |
+	    /* alpha nop, but need to set up alpha source for rgb usage */
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(0), (R300_ALU_ALPHA_ADDR0(R300_ALU_ALPHA_CONST(0)) |
+						      R300_ALU_ALPHA_ADDR1(0) |
+						      R300_ALU_ALPHA_ADDR2(0) |
+						      R300_ALU_ALPHA_ADDRD(0) |
 						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
-	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(6), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(0), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
 						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
 						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
 						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
 
-	    /* rgb out0.r = op_sop, set up src0 reg */
-	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(7), (R300_ALU_RGB_ADDR0(0) |
-						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R) |
-						    R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_R)));
-	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(7),
-			  R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
-			  R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
-	    /* alpha ex2 temp0, temp0.r */
-	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(7), (R300_ALU_ALPHA_ADDRD(0) |
-						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
-	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(7), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_EX2) |
-						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_R) |
-						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
-						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
-
-	    /* rgb out0.g = op_sop, set up src0 reg */
-	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(8), (R300_ALU_RGB_ADDR0(0) |
-						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_G) |
-						    R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_G)));
-	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(8),
-			  R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
-			  R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
-	    /* alpha ex2 temp0, temp0.g */
-	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(8), (R300_ALU_ALPHA_ADDRD(0) |
+	    /* MAD temp1.rgb, const1.rgb, temp0.bbb, temp1.rgb */
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(1), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(1)) |
+						    R300_ALU_RGB_ADDR1(0) |
+						    R300_ALU_RGB_ADDR2(1) |
+						    R300_ALU_RGB_ADDRD(1) |
+						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(1), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
+						    R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
+						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_BBB) |
+						    R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
+						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB) |
+						    R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
+						    R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
+						    R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
+	    /* alpha nop */
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(1), (R300_ALU_ALPHA_ADDRD(0) |
 						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
-	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(8), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_EX2) |
-						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_G) |
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(1), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
+						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
 						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
 						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
 
-	    /* rgb out0.b = op_sop, set up src0 reg */
-	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(9), (R300_ALU_RGB_ADDR0(0) |
-						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_B) |
-						    R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_B)));
-	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(9),
-			  R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
-			  R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
-	    /* alpha ex2 temp0, temp0.b */
-	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(9), (R300_ALU_ALPHA_ADDRD(0) |
-						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
-	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(9), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_EX2) |
-						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_B) |
+	    /* MAD result.rgb, const2.rgb, temp0.rrr, temp1.rgb */
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(2), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(2)) |
+						    R300_ALU_RGB_ADDR1(0) |
+						    R300_ALU_RGB_ADDR2(1) |
+						    R300_ALU_RGB_ADDRD(0) |
+						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB) |
+						    (needgamma ? 0 : R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_RGB))));
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(2), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
+						    R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
+						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RRR) |
+						    R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
+						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB) |
+						    R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
+						    R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
+						    R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE) |
+						    R300_ALU_RGB_CLAMP));
+	    /* write alpha 1 */
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(2), (R300_ALU_ALPHA_ADDRD(0) |
+						      R300_ALU_ALPHA_OMASK(R300_ALU_ALPHA_MASK_A) |
+						      R300_ALU_ALPHA_TARGET_A));
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(2), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
+						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
 						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
-						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_1_0)));
+
+	    if (needgamma) {
+		/* rgb temp0.r = op_sop, set up src0 reg */
+		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(3), (R300_ALU_RGB_ADDR0(0) |
+							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R)));
+		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(3),
+			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
+			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
+		/* alpha lg2 temp0, temp0.r */
+		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(3), (R300_ALU_ALPHA_ADDRD(0) |
+							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(3), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_LN2) |
+							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_R) |
+							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
+							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+
+		/* rgb temp0.g = op_sop, set up src0 reg */
+		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(4), (R300_ALU_RGB_ADDR0(0) |
+							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_G)));
+		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(4),
+			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
+			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
+		/* alpha lg2 temp0, temp0.g */
+		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(4), (R300_ALU_ALPHA_ADDRD(0) |
+							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(4), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_LN2) |
+							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_G) |
+							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
+							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+
+		/* rgb temp0.b = op_sop, set up src0 reg */
+		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(5), (R300_ALU_RGB_ADDR0(0) |
+							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_B)));
+		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(5),
+			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
+			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
+		/* alpha lg2 temp0, temp0.b */
+		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(5), (R300_ALU_ALPHA_ADDRD(0) |
+							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(5), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_LN2) |
+							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_B) |
+							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
+							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+
+		/* MUL const1, temp1, temp0 */
+		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(6), (R300_ALU_RGB_ADDR0(0) |
+							R300_ALU_RGB_ADDR1(0) |
+							R300_ALU_RGB_ADDR2(0) |
+							R300_ALU_RGB_ADDRD(0) |
+							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
+		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(6), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
+							R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
+							R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC0_AAA) |
+							R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
+							R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0) |
+							R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
+							R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
+							R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
+		/* alpha nop, but set up const1 */
+		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(6), (R300_ALU_ALPHA_ADDRD(0) |
+							  R300_ALU_ALPHA_ADDR0(R300_ALU_ALPHA_CONST(1)) |
+							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(6), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
+							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
+							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
+							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+
+		/* rgb out0.r = op_sop, set up src0 reg */
+		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(7), (R300_ALU_RGB_ADDR0(0) |
+							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R) |
+							R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_R)));
+		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(7),
+			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
+			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
+		/* alpha ex2 temp0, temp0.r */
+		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(7), (R300_ALU_ALPHA_ADDRD(0) |
+							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(7), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_EX2) |
+							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_R) |
+							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
+							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+
+		/* rgb out0.g = op_sop, set up src0 reg */
+		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(8), (R300_ALU_RGB_ADDR0(0) |
+							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_G) |
+							R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_G)));
+		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(8),
+			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
+			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
+		/* alpha ex2 temp0, temp0.g */
+		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(8), (R300_ALU_ALPHA_ADDRD(0) |
+							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(8), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_EX2) |
+							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_G) |
+							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
+							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+
+		/* rgb out0.b = op_sop, set up src0 reg */
+		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(9), (R300_ALU_RGB_ADDR0(0) |
+							R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_B) |
+							R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_B)));
+		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(9),
+			      R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
+			      R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
+		/* alpha ex2 temp0, temp0.b */
+		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(9), (R300_ALU_ALPHA_ADDRD(0) |
+							  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(9), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_EX2) |
+							  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_B) |
+							  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
+							  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+	    }
 	}
 
 	/* Shader constants. */
@@ -2491,9 +2478,9 @@ FUNC_NAME(R500DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
     if (RADEONTilingEnabled(pScrn, pPixmap))
 	colorpitch |= R300_COLORTILE;
 
-    if (!pPriv->bicubic_enabled && (pPriv->id == FOURCC_I420 || pPriv->id == FOURCC_YV12)) {
+    if (((pPriv->bicubic_state == BICUBIC_OFF)) &&
+        (pPriv->id == FOURCC_I420 || pPriv->id == FOURCC_YV12))
 	isplanar = TRUE;
-    }
 
     if (isplanar) {
 	txformat1 = R300_TX_FORMAT_X8;
@@ -2504,7 +2491,7 @@ FUNC_NAME(R500DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 	else
 	    txformat1 = R300_TX_FORMAT_VYUY422;
 
-	if (pPriv->bicubic_enabled)
+	if (pPriv->bicubic_state != BICUBIC_OFF)
 	    txformat1 |= R300_TX_FORMAT_YUV_TO_RGB_CLAMP;
 
 	/* pitch is in pixels */
@@ -2694,805 +2681,574 @@ FUNC_NAME(R500DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
     FINISH_ACCEL();
 
     /* setup pixel shader */
-    if (pPriv->bicubic_enabled) {
-	BEGIN_ACCEL(7);
-
-	/* 4 components: 2 for tex0 and 2 for tex1 */
-	OUT_ACCEL_REG(R300_RS_COUNT,
-		      ((4 << R300_RS_COUNT_IT_COUNT_SHIFT) |
-		       R300_RS_COUNT_HIRES_EN));
-
-	/* R300_INST_COUNT_RS - highest RS instruction used */
-	OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(1) | R300_TX_OFFSET_RS(6));
-
-	/* Pixel stack frame size. */
-	OUT_ACCEL_REG(R300_US_PIXSIZE, 5);
-
-	/* FP length. */
-	OUT_ACCEL_REG(R500_US_CODE_ADDR, (R500_US_CODE_START_ADDR(0) |
-					  R500_US_CODE_END_ADDR(13)));
-	OUT_ACCEL_REG(R500_US_CODE_RANGE, (R500_US_CODE_RANGE_ADDR(0) |
-					   R500_US_CODE_RANGE_SIZE(13)));
-
-	/* Prepare for FP emission. */
-	OUT_ACCEL_REG(R500_US_CODE_OFFSET, 0);
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_INST_INDEX(0));
-	FINISH_ACCEL();
-
-	BEGIN_ACCEL(89);
-	/* Pixel shader.
-	 * I've gone ahead and annotated each instruction, since this
-	 * thing is MASSIVE. :3
-	 * Note: In order to avoid buggies with temps and multiple
-	 * inputs, all temps are offset by 2. temp0 -> register2. */
-
-	/* TEX temp2, input1.xxxx, tex1, 1D */
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
-					       R500_INST_RGB_WMASK_R |
-					       R500_INST_RGB_WMASK_G |
-					       R500_INST_RGB_WMASK_B));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(1) |
-					       R500_TEX_INST_LD |
-					       R500_TEX_IGNORE_UNCOVERED));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(1) |
-					       R500_TEX_SRC_S_SWIZ_R |
-					       R500_TEX_SRC_T_SWIZ_R |
-					       R500_TEX_SRC_R_SWIZ_R |
-					       R500_TEX_SRC_Q_SWIZ_R |
-					       R500_TEX_DST_ADDR(2) |
-					       R500_TEX_DST_R_SWIZ_R |
-					       R500_TEX_DST_G_SWIZ_G |
-					       R500_TEX_DST_B_SWIZ_B |
-					       R500_TEX_DST_A_SWIZ_A));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
-
-	/* TEX temp5, input1.yyyy, tex1, 1D */
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
-					       R500_INST_TEX_SEM_WAIT |
-					       R500_INST_RGB_WMASK_R |
-					       R500_INST_RGB_WMASK_G |
-					       R500_INST_RGB_WMASK_B));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(1) |
-					       R500_TEX_INST_LD |
-					       R500_TEX_SEM_ACQUIRE |
-					       R500_TEX_IGNORE_UNCOVERED));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(1) |
-					       R500_TEX_SRC_S_SWIZ_G |
-					       R500_TEX_SRC_T_SWIZ_G |
-					       R500_TEX_SRC_R_SWIZ_G |
-					       R500_TEX_SRC_Q_SWIZ_G |
-					       R500_TEX_DST_ADDR(5) |
-					       R500_TEX_DST_R_SWIZ_R |
-					       R500_TEX_DST_G_SWIZ_G |
-					       R500_TEX_DST_B_SWIZ_B |
-					       R500_TEX_DST_A_SWIZ_A));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
-
-	/* MUL temp4, const0.x0x0, temp2.yyxx */
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
-					       R500_INST_TEX_SEM_WAIT |
-					       R500_INST_RGB_WMASK_R |
-					       R500_INST_RGB_WMASK_G |
-					       R500_INST_RGB_WMASK_B |
-					       R500_INST_ALPHA_WMASK));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
-					       R500_RGB_ADDR0_CONST |
-					       R500_RGB_ADDR1(2)));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
-					       R500_ALPHA_ADDR0_CONST |
-					       R500_ALPHA_ADDR1(2)));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
-					       R500_ALU_RGB_R_SWIZ_A_R |
-					       R500_ALU_RGB_G_SWIZ_A_0 |
-					       R500_ALU_RGB_B_SWIZ_A_R |
-					       R500_ALU_RGB_SEL_B_SRC1 |
-					       R500_ALU_RGB_R_SWIZ_B_G |
-					       R500_ALU_RGB_G_SWIZ_B_G |
-					       R500_ALU_RGB_B_SWIZ_B_R));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(4) |
-					       R500_ALPHA_OP_MAD |
-					       R500_ALPHA_SEL_A_SRC0 |
-					       R500_ALPHA_SWIZ_A_0 |
-					       R500_ALPHA_SEL_B_SRC1 |
-					       R500_ALPHA_SWIZ_B_R));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(4) |
-					       R500_ALU_RGBA_OP_MAD |
-					       R500_ALU_RGBA_R_SWIZ_0 |
-					       R500_ALU_RGBA_G_SWIZ_0 |
-					       R500_ALU_RGBA_B_SWIZ_0 |
-					       R500_ALU_RGBA_A_SWIZ_0));
-
-	/* MAD temp3, const0.0y0y, temp5.xxxx, temp4 */
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
-					       R500_INST_RGB_WMASK_R |
-					       R500_INST_RGB_WMASK_G |
-					       R500_INST_RGB_WMASK_B |
-					       R500_INST_ALPHA_WMASK));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
-					       R500_RGB_ADDR0_CONST |
-					       R500_RGB_ADDR1(5) |
-					       R500_RGB_ADDR2(4)));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
-					       R500_ALPHA_ADDR0_CONST |
-					       R500_ALPHA_ADDR1(5) |
-					       R500_ALPHA_ADDR2(4)));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
-					       R500_ALU_RGB_R_SWIZ_A_0 |
-					       R500_ALU_RGB_G_SWIZ_A_G |
-					       R500_ALU_RGB_B_SWIZ_A_0 |
-					       R500_ALU_RGB_SEL_B_SRC1 |
-					       R500_ALU_RGB_R_SWIZ_B_R |
-					       R500_ALU_RGB_G_SWIZ_B_R |
-					       R500_ALU_RGB_B_SWIZ_B_R));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(3) |
-					       R500_ALPHA_OP_MAD |
-					       R500_ALPHA_SEL_A_SRC0 |
-					       R500_ALPHA_SWIZ_A_G |
-					       R500_ALPHA_SEL_B_SRC1 |
-					       R500_ALPHA_SWIZ_B_R));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(3) |
-					       R500_ALU_RGBA_OP_MAD |
-					       R500_ALU_RGBA_SEL_C_SRC2 |
-					       R500_ALU_RGBA_R_SWIZ_R |
-					       R500_ALU_RGBA_G_SWIZ_G |
-					       R500_ALU_RGBA_B_SWIZ_B |
-					       R500_ALU_RGBA_A_SWIZ_A));
-
-	/* ADD temp3, temp3, input0.xyxy */
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
-					       R500_INST_RGB_WMASK_R |
-					       R500_INST_RGB_WMASK_G |
-					       R500_INST_RGB_WMASK_B |
-					       R500_INST_ALPHA_WMASK));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR1(3) |
-					       R500_RGB_ADDR2(0)));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR1(3) |
-					       R500_ALPHA_ADDR2(0)));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_R_SWIZ_A_1 |
-					       R500_ALU_RGB_G_SWIZ_A_1 |
-					       R500_ALU_RGB_B_SWIZ_A_1 |
-					       R500_ALU_RGB_SEL_B_SRC1 |
-					       R500_ALU_RGB_R_SWIZ_B_R |
-					       R500_ALU_RGB_G_SWIZ_B_G |
-					       R500_ALU_RGB_B_SWIZ_B_B));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(3) |
-					       R500_ALPHA_OP_MAD |
-					       R500_ALPHA_SWIZ_A_1 |
-					       R500_ALPHA_SEL_B_SRC1 |
-					       R500_ALPHA_SWIZ_B_A));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(3) |
-					       R500_ALU_RGBA_OP_MAD |
-					       R500_ALU_RGBA_SEL_C_SRC2 |
-					       R500_ALU_RGBA_R_SWIZ_R |
-					       R500_ALU_RGBA_G_SWIZ_G |
-					       R500_ALU_RGBA_B_SWIZ_R |
-					       R500_ALU_RGBA_A_SWIZ_G));
-
-	/* TEX temp1, temp3.zwxy, tex0, 2D */
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
-					       R500_INST_RGB_WMASK_R |
-					       R500_INST_RGB_WMASK_G |
-					       R500_INST_RGB_WMASK_B |
-					       R500_INST_ALPHA_WMASK));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
-					       R500_TEX_INST_LD |
-					       R500_TEX_IGNORE_UNCOVERED));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(3) |
-					       R500_TEX_SRC_S_SWIZ_B |
-					       R500_TEX_SRC_T_SWIZ_A |
-					       R500_TEX_SRC_R_SWIZ_R |
-					       R500_TEX_SRC_Q_SWIZ_G |
-					       R500_TEX_DST_ADDR(1) |
-					       R500_TEX_DST_R_SWIZ_R |
-					       R500_TEX_DST_G_SWIZ_G |
-					       R500_TEX_DST_B_SWIZ_B |
-					       R500_TEX_DST_A_SWIZ_A));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
-
-	/* TEX temp3, temp3.xyzw, tex0, 2D */
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
-					       R500_INST_TEX_SEM_WAIT |
-					       R500_INST_RGB_WMASK_R |
-					       R500_INST_RGB_WMASK_G |
-					       R500_INST_RGB_WMASK_B |
-					       R500_INST_ALPHA_WMASK));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
-					       R500_TEX_INST_LD |
-					       R500_TEX_SEM_ACQUIRE |
-					       R500_TEX_IGNORE_UNCOVERED));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(3) |
-					       R500_TEX_SRC_S_SWIZ_R |
-					       R500_TEX_SRC_T_SWIZ_G |
-					       R500_TEX_SRC_R_SWIZ_B |
-					       R500_TEX_SRC_Q_SWIZ_A |
-					       R500_TEX_DST_ADDR(3) |
-					       R500_TEX_DST_R_SWIZ_R |
-					       R500_TEX_DST_G_SWIZ_G |
-					       R500_TEX_DST_B_SWIZ_B |
-					       R500_TEX_DST_A_SWIZ_A));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
-
-	/* MAD temp4, const0.0y0y, temp5.yyyy, temp4 */
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
-					       R500_INST_RGB_WMASK_R |
-					       R500_INST_RGB_WMASK_G |
-					       R500_INST_RGB_WMASK_B |
-					       R500_INST_ALPHA_WMASK));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
-					       R500_RGB_ADDR0_CONST |
-					       R500_RGB_ADDR1(5) |
-					       R500_RGB_ADDR2(4)));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
-					       R500_ALPHA_ADDR0_CONST |
-					       R500_ALPHA_ADDR1(5) |
-					       R500_ALPHA_ADDR2(4)));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
-					       R500_ALU_RGB_R_SWIZ_A_0 |
-					       R500_ALU_RGB_G_SWIZ_A_G |
-					       R500_ALU_RGB_B_SWIZ_A_0 |
-					       R500_ALU_RGB_SEL_B_SRC1 |
-					       R500_ALU_RGB_R_SWIZ_B_G |
-					       R500_ALU_RGB_G_SWIZ_B_G |
-					       R500_ALU_RGB_B_SWIZ_B_G));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(4) |
-					       R500_ALPHA_OP_MAD |
-					       R500_ALPHA_SEL_A_SRC0 |
-					       R500_ALPHA_SWIZ_A_G |
-					       R500_ALPHA_SEL_B_SRC1 |
-					       R500_ALPHA_SWIZ_B_G));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(4) |
-					       R500_ALU_RGBA_OP_MAD |
-					       R500_ALU_RGBA_SEL_C_SRC2 |
-					       R500_ALU_RGBA_R_SWIZ_R |
-					       R500_ALU_RGBA_G_SWIZ_G |
-					       R500_ALU_RGBA_B_SWIZ_B |
-					       R500_ALU_RGBA_A_SWIZ_A));
-
-	/* ADD temp0, temp4, input0.xyxy */
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
-					       R500_INST_RGB_WMASK_R |
-					       R500_INST_RGB_WMASK_G |
-					       R500_INST_RGB_WMASK_B |
-					       R500_INST_ALPHA_WMASK));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR1(4) |
-					       R500_RGB_ADDR2(0)));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR1(4) |
-					       R500_ALPHA_ADDR2(0)));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_R_SWIZ_A_1 |
-					       R500_ALU_RGB_G_SWIZ_A_1 |
-					       R500_ALU_RGB_B_SWIZ_A_1 |
-					       R500_ALU_RGB_SEL_B_SRC1 |
-					       R500_ALU_RGB_R_SWIZ_B_R |
-					       R500_ALU_RGB_G_SWIZ_B_G |
-					       R500_ALU_RGB_B_SWIZ_B_B));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(0) |
-					       R500_ALPHA_OP_MAD |
-					       R500_ALPHA_SWIZ_A_1 |
-					       R500_ALPHA_SEL_B_SRC1 |
-					       R500_ALPHA_SWIZ_B_A));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(0) |
-					       R500_ALU_RGBA_OP_MAD |
-					       R500_ALU_RGBA_SEL_C_SRC2 |
-					       R500_ALU_RGBA_R_SWIZ_R |
-					       R500_ALU_RGBA_G_SWIZ_G |
-					       R500_ALU_RGBA_B_SWIZ_R |
-					       R500_ALU_RGBA_A_SWIZ_G));
-
-	/* TEX temp4, temp0.zwzw, tex0, 2D */
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
-					       R500_INST_TEX_SEM_WAIT |
-					       R500_INST_RGB_WMASK_R |
-					       R500_INST_RGB_WMASK_G |
-					       R500_INST_RGB_WMASK_B |
-					       R500_INST_ALPHA_WMASK));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
-					       R500_TEX_INST_LD |
-					       R500_TEX_IGNORE_UNCOVERED));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
-					       R500_TEX_SRC_S_SWIZ_B |
-					       R500_TEX_SRC_T_SWIZ_A |
-					       R500_TEX_SRC_R_SWIZ_B |
-					       R500_TEX_SRC_Q_SWIZ_A |
-					       R500_TEX_DST_ADDR(4) |
-					       R500_TEX_DST_R_SWIZ_R |
-					       R500_TEX_DST_G_SWIZ_G |
-					       R500_TEX_DST_B_SWIZ_B |
-					       R500_TEX_DST_A_SWIZ_A));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
-
-	/* TEX temp0, temp0.xyzw, tex0, 2D */
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
-					       R500_INST_TEX_SEM_WAIT |
-					       R500_INST_RGB_WMASK_R |
-					       R500_INST_RGB_WMASK_G |
-					       R500_INST_RGB_WMASK_B |
-					       R500_INST_ALPHA_WMASK));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
-					       R500_TEX_INST_LD |
-					       R500_TEX_SEM_ACQUIRE |
-					       R500_TEX_IGNORE_UNCOVERED));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
-					       R500_TEX_SRC_S_SWIZ_R |
-					       R500_TEX_SRC_T_SWIZ_G |
-					       R500_TEX_SRC_R_SWIZ_B |
-					       R500_TEX_SRC_Q_SWIZ_A |
-					       R500_TEX_DST_ADDR(0) |
-					       R500_TEX_DST_R_SWIZ_R |
-					       R500_TEX_DST_G_SWIZ_G |
-					       R500_TEX_DST_B_SWIZ_B |
-					       R500_TEX_DST_A_SWIZ_A));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
-
-	/* LRP temp3, temp2.zzzz, temp1, temp3 ->
-	 * - PRESUB temps, temp1 - temp3
-	 * - MAD temp2.zzzz, temps, temp3 */
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
-					       R500_INST_RGB_WMASK_R |
-					       R500_INST_RGB_WMASK_G |
-					       R500_INST_RGB_WMASK_B |
-					       R500_INST_ALPHA_WMASK));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(3) |
-					       R500_RGB_SRCP_OP_RGB1_MINUS_RGB0 |
-					       R500_RGB_ADDR1(1) |
-					       R500_RGB_ADDR2(2)));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(3) |
-					       R500_ALPHA_SRCP_OP_A1_MINUS_A0 |
-					       R500_ALPHA_ADDR1(1) |
-					       R500_ALPHA_ADDR2(2)));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC2 |
-					       R500_ALU_RGB_R_SWIZ_A_B |
-					       R500_ALU_RGB_G_SWIZ_A_B |
-					       R500_ALU_RGB_B_SWIZ_A_B |
-					       R500_ALU_RGB_SEL_B_SRCP |
-					       R500_ALU_RGB_R_SWIZ_B_R |
-					       R500_ALU_RGB_G_SWIZ_B_G |
-					       R500_ALU_RGB_B_SWIZ_B_B));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(3) |
-					       R500_ALPHA_OP_MAD |
-					       R500_ALPHA_SEL_A_SRC2 |
-					       R500_ALPHA_SWIZ_A_B |
-					       R500_ALPHA_SEL_B_SRCP |
-					       R500_ALPHA_SWIZ_B_A));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(3) |
-					       R500_ALU_RGBA_OP_MAD |
-					       R500_ALU_RGBA_SEL_C_SRC0 |
-					       R500_ALU_RGBA_R_SWIZ_R |
-					       R500_ALU_RGBA_G_SWIZ_G |
-					       R500_ALU_RGBA_B_SWIZ_B |
-					       R500_ALU_RGBA_A_SWIZ_A));
-
-	/* LRP temp0, temp2.zzzz, temp4, temp0 ->
-	 * - PRESUB temps, temp4 - temp1
-	 * - MAD temp2.zzzz, temps, temp0 */
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
-					       R500_INST_TEX_SEM_WAIT |
-					       R500_INST_RGB_WMASK_R |
-					       R500_INST_RGB_WMASK_G |
-					       R500_INST_RGB_WMASK_B |
-					       R500_INST_ALPHA_WMASK));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
-					       R500_RGB_SRCP_OP_RGB1_MINUS_RGB0 |
-					       R500_RGB_ADDR1(4) |
-					       R500_RGB_ADDR2(2)));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
-					       R500_ALPHA_SRCP_OP_A1_MINUS_A0 |
-					       R500_ALPHA_ADDR1(4) |
-					       R500_ALPHA_ADDR2(2)));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC2 |
-					       R500_ALU_RGB_R_SWIZ_A_B |
-					       R500_ALU_RGB_G_SWIZ_A_B |
-					       R500_ALU_RGB_B_SWIZ_A_B |
-					       R500_ALU_RGB_SEL_B_SRCP |
-					       R500_ALU_RGB_R_SWIZ_B_R |
-					       R500_ALU_RGB_G_SWIZ_B_G |
-					       R500_ALU_RGB_B_SWIZ_B_B));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(0) |
-					       R500_ALPHA_OP_MAD |
-					       R500_ALPHA_SEL_A_SRC2 |
-					       R500_ALPHA_SWIZ_A_B |
-					       R500_ALPHA_SEL_B_SRCP |
-					       R500_ALPHA_SWIZ_B_A));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(0) |
-					       R500_ALU_RGBA_OP_MAD |
-					       R500_ALU_RGBA_SEL_C_SRC0 |
-					       R500_ALU_RGBA_R_SWIZ_R |
-					       R500_ALU_RGBA_G_SWIZ_G |
-					       R500_ALU_RGBA_B_SWIZ_B |
-					       R500_ALU_RGBA_A_SWIZ_A));
-
-	/* LRP output, temp5.zzzz, temp3, temp0 ->
-	 * - PRESUB temps, temp3 - temp0
-	 * - MAD temp5.zzzz, temps, temp0 */
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_OUT |
-					       R500_INST_LAST |
-					       R500_INST_TEX_SEM_WAIT |
-					       R500_INST_RGB_WMASK_R |
-					       R500_INST_RGB_WMASK_G |
-					       R500_INST_RGB_WMASK_B |
-					       R500_INST_ALPHA_WMASK |
-					       R500_INST_RGB_OMASK_R |
-					       R500_INST_RGB_OMASK_G |
-					       R500_INST_RGB_OMASK_B |
-					       R500_INST_ALPHA_OMASK));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
-					       R500_RGB_SRCP_OP_RGB1_MINUS_RGB0 |
-					       R500_RGB_ADDR1(3) |
-					       R500_RGB_ADDR2(5)));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
-					       R500_ALPHA_SRCP_OP_A1_MINUS_A0 |
-					       R500_ALPHA_ADDR1(3) |
-					       R500_ALPHA_ADDR2(5)));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC2 |
-					       R500_ALU_RGB_R_SWIZ_A_B |
-					       R500_ALU_RGB_G_SWIZ_A_B |
-					       R500_ALU_RGB_B_SWIZ_A_B |
-					       R500_ALU_RGB_SEL_B_SRCP |
-					       R500_ALU_RGB_R_SWIZ_B_R |
-					       R500_ALU_RGB_G_SWIZ_B_G |
-					       R500_ALU_RGB_B_SWIZ_B_B));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(0) |
-					       R500_ALPHA_OP_MAD |
-					       R500_ALPHA_SEL_A_SRC2 |
-					       R500_ALPHA_SWIZ_A_B |
-					       R500_ALPHA_SEL_B_SRCP |
-					       R500_ALPHA_SWIZ_B_A));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(0) |
-					       R500_ALU_RGBA_OP_MAD |
-					       R500_ALU_RGBA_SEL_C_SRC0 |
-					       R500_ALU_RGBA_R_SWIZ_R |
-					       R500_ALU_RGBA_G_SWIZ_G |
-					       R500_ALU_RGBA_B_SWIZ_B |
-					       R500_ALU_RGBA_A_SWIZ_A));
-
-	/* Shader constants. */
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_CONST_INDEX(0));
-
-	/* const0 = {1 / texture[0].width, 1 / texture[0].height, 0, 0} */
-	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, (1.0/(float)pPriv->w));
-	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, (1.0/(float)pPriv->h));
-	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, 0x0);
-	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, 0x0);
-
-	FINISH_ACCEL();
-
-    } else if (isplanar) {
-	/*
-	 * y' = y - .0625
-	 * u' = u - .5
-	 * v' = v - .5;
-	 *
-	 * r = 1.1643 * y' + 0.0     * u' + 1.5958  * v'
-	 * g = 1.1643 * y' - 0.39173 * u' - 0.81290 * v'
-	 * b = 1.1643 * y' + 2.017   * u' + 0.0     * v'
-	 *
-	 * DP3 might look like the straightforward solution
-	 * but we'd need to move the texture yuv values in
-	 * the same reg for this to work. Therefore use MADs.
-	 * Brightness just adds to the off constant.
-	 * Contrast is multiplication of luminance.
-	 * Saturation and hue change the u and v coeffs.
-	 * Default values (before adjustments - depend on colorspace):
-	 * yco = 1.1643
-	 * uco = 0, -0.39173, 2.017
-	 * vco = 1.5958, -0.8129, 0
-	 * off = -0.0625 * yco + -0.5 * uco[r] + -0.5 * vco[r],
-	 *       -0.0625 * yco + -0.5 * uco[g] + -0.5 * vco[g],
-	 *       -0.0625 * yco + -0.5 * uco[b] + -0.5 * vco[b],
-	 *
-	 * temp = MAD(yco, yuv.yyyy, off)
-	 * temp = MAD(uco, yuv.uuuu, temp)
-	 * result = MAD(vco, yuv.vvvv, temp)
-	 */
-	/* TODO: don't recalc consts always */
-	const float Loff = -0.0627;
-	const float Coff = -0.502;
-	float uvcosf, uvsinf;
-	float yco;
-	float uco[3], vco[3], off[3];
-	float bright, cont, gamma;
-	int ref = pPriv->transform_index;
-	Bool needgamma = FALSE;
-
-	cont = RTFContrast(pPriv->contrast);
-	bright = RTFBrightness(pPriv->brightness);
-	gamma = (float)pPriv->gamma / 1000.0;
-	uvcosf = RTFSaturation(pPriv->saturation) * cos(RTFHue(pPriv->hue));
-	uvsinf = RTFSaturation(pPriv->saturation) * sin(RTFHue(pPriv->hue));
-	/* overlay video also does pre-gamma contrast/sat adjust, should we? */
-
-	yco = trans[ref].RefLuma * cont;
-	uco[0] = -trans[ref].RefRCr * uvsinf;
-	uco[1] = trans[ref].RefGCb * uvcosf - trans[ref].RefGCr * uvsinf;
-	uco[2] = trans[ref].RefBCb * uvcosf;
-	vco[0] = trans[ref].RefRCr * uvcosf;
-	vco[1] = trans[ref].RefGCb * uvsinf + trans[ref].RefGCr * uvcosf;
-	vco[2] = trans[ref].RefBCb * uvsinf;
-	off[0] = Loff * yco + Coff * (uco[0] + vco[0]) + bright;
-	off[1] = Loff * yco + Coff * (uco[1] + vco[1]) + bright;
-	off[2] = Loff * yco + Coff * (uco[2] + vco[2]) + bright;
-
-	//XXX gamma
+    if (pPriv->bicubic_state != BICUBIC_OFF) {
+	if (pPriv->bicubic_enabled) {
+	    BEGIN_ACCEL(7);
 
-	if (gamma != 1.0) {
-	    needgamma = TRUE;
-	    /* note: gamma correction is out = in ^ gamma;
-	       gpu can only do LG2/EX2 therefore we transform into
-	       in ^ gamma = 2 ^ (log2(in) * gamma).
-	       Lots of scalar ops, unfortunately (better solution?) -
-	       without gamma that's 3 inst, with gamma it's 10...
-	       could use different gamma factors per channel,
-	       if that's of any use. */
+	    /* 4 components: 2 for tex0 and 2 for tex1 */
+	    OUT_ACCEL_REG(R300_RS_COUNT,
+			  ((4 << R300_RS_COUNT_IT_COUNT_SHIFT) |
+			   R300_RS_COUNT_HIRES_EN));
+
+	    /* R300_INST_COUNT_RS - highest RS instruction used */
+	    OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(1) | R300_TX_OFFSET_RS(6));
+
+	    /* Pixel stack frame size. */
+	    OUT_ACCEL_REG(R300_US_PIXSIZE, 5);
+
+	    /* FP length. */
+	    OUT_ACCEL_REG(R500_US_CODE_ADDR, (R500_US_CODE_START_ADDR(0) |
+					      R500_US_CODE_END_ADDR(13)));
+	    OUT_ACCEL_REG(R500_US_CODE_RANGE, (R500_US_CODE_RANGE_ADDR(0) |
+					       R500_US_CODE_RANGE_SIZE(13)));
+
+	    /* Prepare for FP emission. */
+	    OUT_ACCEL_REG(R500_US_CODE_OFFSET, 0);
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_INST_INDEX(0));
+	    FINISH_ACCEL();
+
+	    BEGIN_ACCEL(89);
+	    /* Pixel shader.
+	     * I've gone ahead and annotated each instruction, since this
+	     * thing is MASSIVE. :3
+	     * Note: In order to avoid buggies with temps and multiple
+	     * inputs, all temps are offset by 2. temp0 -> register2. */
+
+	    /* TEX temp2, input1.xxxx, tex1, 1D */
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
+						   R500_INST_RGB_WMASK_R |
+						   R500_INST_RGB_WMASK_G |
+						   R500_INST_RGB_WMASK_B));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(1) |
+						   R500_TEX_INST_LD |
+						   R500_TEX_IGNORE_UNCOVERED));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(1) |
+						   R500_TEX_SRC_S_SWIZ_R |
+						   R500_TEX_SRC_T_SWIZ_R |
+						   R500_TEX_SRC_R_SWIZ_R |
+						   R500_TEX_SRC_Q_SWIZ_R |
+						   R500_TEX_DST_ADDR(2) |
+						   R500_TEX_DST_R_SWIZ_R |
+						   R500_TEX_DST_G_SWIZ_G |
+						   R500_TEX_DST_B_SWIZ_B |
+						   R500_TEX_DST_A_SWIZ_A));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
+
+	    /* TEX temp5, input1.yyyy, tex1, 1D */
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
+						   R500_INST_TEX_SEM_WAIT |
+						   R500_INST_RGB_WMASK_R |
+						   R500_INST_RGB_WMASK_G |
+						   R500_INST_RGB_WMASK_B));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(1) |
+						   R500_TEX_INST_LD |
+						   R500_TEX_SEM_ACQUIRE |
+						   R500_TEX_IGNORE_UNCOVERED));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(1) |
+						   R500_TEX_SRC_S_SWIZ_G |
+						   R500_TEX_SRC_T_SWIZ_G |
+						   R500_TEX_SRC_R_SWIZ_G |
+						   R500_TEX_SRC_Q_SWIZ_G |
+						   R500_TEX_DST_ADDR(5) |
+						   R500_TEX_DST_R_SWIZ_R |
+						   R500_TEX_DST_G_SWIZ_G |
+						   R500_TEX_DST_B_SWIZ_B |
+						   R500_TEX_DST_A_SWIZ_A));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
+
+	    /* MUL temp4, const0.x0x0, temp2.yyxx */
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
+						   R500_INST_TEX_SEM_WAIT |
+						   R500_INST_RGB_WMASK_R |
+						   R500_INST_RGB_WMASK_G |
+						   R500_INST_RGB_WMASK_B |
+						   R500_INST_ALPHA_WMASK));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
+						   R500_RGB_ADDR0_CONST |
+						   R500_RGB_ADDR1(2)));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
+						   R500_ALPHA_ADDR0_CONST |
+						   R500_ALPHA_ADDR1(2)));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
+						   R500_ALU_RGB_R_SWIZ_A_R |
+						   R500_ALU_RGB_G_SWIZ_A_0 |
+						   R500_ALU_RGB_B_SWIZ_A_R |
+						   R500_ALU_RGB_SEL_B_SRC1 |
+						   R500_ALU_RGB_R_SWIZ_B_G |
+						   R500_ALU_RGB_G_SWIZ_B_G |
+						   R500_ALU_RGB_B_SWIZ_B_R));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(4) |
+						   R500_ALPHA_OP_MAD |
+						   R500_ALPHA_SEL_A_SRC0 |
+						   R500_ALPHA_SWIZ_A_0 |
+						   R500_ALPHA_SEL_B_SRC1 |
+						   R500_ALPHA_SWIZ_B_R));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(4) |
+						   R500_ALU_RGBA_OP_MAD |
+						   R500_ALU_RGBA_R_SWIZ_0 |
+						   R500_ALU_RGBA_G_SWIZ_0 |
+						   R500_ALU_RGBA_B_SWIZ_0 |
+						   R500_ALU_RGBA_A_SWIZ_0));
+
+	    /* MAD temp3, const0.0y0y, temp5.xxxx, temp4 */
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
+						   R500_INST_RGB_WMASK_R |
+						   R500_INST_RGB_WMASK_G |
+						   R500_INST_RGB_WMASK_B |
+						   R500_INST_ALPHA_WMASK));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
+						   R500_RGB_ADDR0_CONST |
+						   R500_RGB_ADDR1(5) |
+						   R500_RGB_ADDR2(4)));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
+						   R500_ALPHA_ADDR0_CONST |
+						   R500_ALPHA_ADDR1(5) |
+						   R500_ALPHA_ADDR2(4)));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
+						   R500_ALU_RGB_R_SWIZ_A_0 |
+						   R500_ALU_RGB_G_SWIZ_A_G |
+						   R500_ALU_RGB_B_SWIZ_A_0 |
+						   R500_ALU_RGB_SEL_B_SRC1 |
+						   R500_ALU_RGB_R_SWIZ_B_R |
+						   R500_ALU_RGB_G_SWIZ_B_R |
+						   R500_ALU_RGB_B_SWIZ_B_R));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(3) |
+						   R500_ALPHA_OP_MAD |
+						   R500_ALPHA_SEL_A_SRC0 |
+						   R500_ALPHA_SWIZ_A_G |
+						   R500_ALPHA_SEL_B_SRC1 |
+						   R500_ALPHA_SWIZ_B_R));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(3) |
+						   R500_ALU_RGBA_OP_MAD |
+						   R500_ALU_RGBA_SEL_C_SRC2 |
+						   R500_ALU_RGBA_R_SWIZ_R |
+						   R500_ALU_RGBA_G_SWIZ_G |
+						   R500_ALU_RGBA_B_SWIZ_B |
+						   R500_ALU_RGBA_A_SWIZ_A));
+
+	    /* ADD temp3, temp3, input0.xyxy */
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
+						   R500_INST_RGB_WMASK_R |
+						   R500_INST_RGB_WMASK_G |
+						   R500_INST_RGB_WMASK_B |
+						   R500_INST_ALPHA_WMASK));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR1(3) |
+						   R500_RGB_ADDR2(0)));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR1(3) |
+						   R500_ALPHA_ADDR2(0)));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_R_SWIZ_A_1 |
+						   R500_ALU_RGB_G_SWIZ_A_1 |
+						   R500_ALU_RGB_B_SWIZ_A_1 |
+						   R500_ALU_RGB_SEL_B_SRC1 |
+						   R500_ALU_RGB_R_SWIZ_B_R |
+						   R500_ALU_RGB_G_SWIZ_B_G |
+						   R500_ALU_RGB_B_SWIZ_B_B));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(3) |
+						   R500_ALPHA_OP_MAD |
+						   R500_ALPHA_SWIZ_A_1 |
+						   R500_ALPHA_SEL_B_SRC1 |
+						   R500_ALPHA_SWIZ_B_A));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(3) |
+						   R500_ALU_RGBA_OP_MAD |
+						   R500_ALU_RGBA_SEL_C_SRC2 |
+						   R500_ALU_RGBA_R_SWIZ_R |
+						   R500_ALU_RGBA_G_SWIZ_G |
+						   R500_ALU_RGBA_B_SWIZ_R |
+						   R500_ALU_RGBA_A_SWIZ_G));
+
+	    /* TEX temp1, temp3.zwxy, tex0, 2D */
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
+						   R500_INST_RGB_WMASK_R |
+						   R500_INST_RGB_WMASK_G |
+						   R500_INST_RGB_WMASK_B |
+						   R500_INST_ALPHA_WMASK));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
+						   R500_TEX_INST_LD |
+						   R500_TEX_IGNORE_UNCOVERED));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(3) |
+						   R500_TEX_SRC_S_SWIZ_B |
+						   R500_TEX_SRC_T_SWIZ_A |
+						   R500_TEX_SRC_R_SWIZ_R |
+						   R500_TEX_SRC_Q_SWIZ_G |
+						   R500_TEX_DST_ADDR(1) |
+						   R500_TEX_DST_R_SWIZ_R |
+						   R500_TEX_DST_G_SWIZ_G |
+						   R500_TEX_DST_B_SWIZ_B |
+						   R500_TEX_DST_A_SWIZ_A));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
+
+	    /* TEX temp3, temp3.xyzw, tex0, 2D */
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
+						   R500_INST_TEX_SEM_WAIT |
+						   R500_INST_RGB_WMASK_R |
+						   R500_INST_RGB_WMASK_G |
+						   R500_INST_RGB_WMASK_B |
+						   R500_INST_ALPHA_WMASK));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
+						   R500_TEX_INST_LD |
+						   R500_TEX_SEM_ACQUIRE |
+						   R500_TEX_IGNORE_UNCOVERED));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(3) |
+						   R500_TEX_SRC_S_SWIZ_R |
+						   R500_TEX_SRC_T_SWIZ_G |
+						   R500_TEX_SRC_R_SWIZ_B |
+						   R500_TEX_SRC_Q_SWIZ_A |
+						   R500_TEX_DST_ADDR(3) |
+						   R500_TEX_DST_R_SWIZ_R |
+						   R500_TEX_DST_G_SWIZ_G |
+						   R500_TEX_DST_B_SWIZ_B |
+						   R500_TEX_DST_A_SWIZ_A));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
+
+	    /* MAD temp4, const0.0y0y, temp5.yyyy, temp4 */
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
+						   R500_INST_RGB_WMASK_R |
+						   R500_INST_RGB_WMASK_G |
+						   R500_INST_RGB_WMASK_B |
+						   R500_INST_ALPHA_WMASK));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
+						   R500_RGB_ADDR0_CONST |
+						   R500_RGB_ADDR1(5) |
+						   R500_RGB_ADDR2(4)));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
+						   R500_ALPHA_ADDR0_CONST |
+						   R500_ALPHA_ADDR1(5) |
+						   R500_ALPHA_ADDR2(4)));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
+						   R500_ALU_RGB_R_SWIZ_A_0 |
+						   R500_ALU_RGB_G_SWIZ_A_G |
+						   R500_ALU_RGB_B_SWIZ_A_0 |
+						   R500_ALU_RGB_SEL_B_SRC1 |
+						   R500_ALU_RGB_R_SWIZ_B_G |
+						   R500_ALU_RGB_G_SWIZ_B_G |
+						   R500_ALU_RGB_B_SWIZ_B_G));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(4) |
+						   R500_ALPHA_OP_MAD |
+						   R500_ALPHA_SEL_A_SRC0 |
+						   R500_ALPHA_SWIZ_A_G |
+						   R500_ALPHA_SEL_B_SRC1 |
+						   R500_ALPHA_SWIZ_B_G));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(4) |
+						   R500_ALU_RGBA_OP_MAD |
+						   R500_ALU_RGBA_SEL_C_SRC2 |
+						   R500_ALU_RGBA_R_SWIZ_R |
+						   R500_ALU_RGBA_G_SWIZ_G |
+						   R500_ALU_RGBA_B_SWIZ_B |
+						   R500_ALU_RGBA_A_SWIZ_A));
+
+	    /* ADD temp0, temp4, input0.xyxy */
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
+						   R500_INST_RGB_WMASK_R |
+						   R500_INST_RGB_WMASK_G |
+						   R500_INST_RGB_WMASK_B |
+						   R500_INST_ALPHA_WMASK));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR1(4) |
+						   R500_RGB_ADDR2(0)));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR1(4) |
+						   R500_ALPHA_ADDR2(0)));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_R_SWIZ_A_1 |
+						   R500_ALU_RGB_G_SWIZ_A_1 |
+						   R500_ALU_RGB_B_SWIZ_A_1 |
+						   R500_ALU_RGB_SEL_B_SRC1 |
+						   R500_ALU_RGB_R_SWIZ_B_R |
+						   R500_ALU_RGB_G_SWIZ_B_G |
+						   R500_ALU_RGB_B_SWIZ_B_B));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(0) |
+						   R500_ALPHA_OP_MAD |
+						   R500_ALPHA_SWIZ_A_1 |
+						   R500_ALPHA_SEL_B_SRC1 |
+						   R500_ALPHA_SWIZ_B_A));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(0) |
+						   R500_ALU_RGBA_OP_MAD |
+						   R500_ALU_RGBA_SEL_C_SRC2 |
+						   R500_ALU_RGBA_R_SWIZ_R |
+						   R500_ALU_RGBA_G_SWIZ_G |
+						   R500_ALU_RGBA_B_SWIZ_R |
+						   R500_ALU_RGBA_A_SWIZ_G));
+
+	    /* TEX temp4, temp0.zwzw, tex0, 2D */
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
+						   R500_INST_TEX_SEM_WAIT |
+						   R500_INST_RGB_WMASK_R |
+						   R500_INST_RGB_WMASK_G |
+						   R500_INST_RGB_WMASK_B |
+						   R500_INST_ALPHA_WMASK));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
+						   R500_TEX_INST_LD |
+						   R500_TEX_IGNORE_UNCOVERED));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
+						   R500_TEX_SRC_S_SWIZ_B |
+						   R500_TEX_SRC_T_SWIZ_A |
+						   R500_TEX_SRC_R_SWIZ_B |
+						   R500_TEX_SRC_Q_SWIZ_A |
+						   R500_TEX_DST_ADDR(4) |
+						   R500_TEX_DST_R_SWIZ_R |
+						   R500_TEX_DST_G_SWIZ_G |
+						   R500_TEX_DST_B_SWIZ_B |
+						   R500_TEX_DST_A_SWIZ_A));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
+
+	    /* TEX temp0, temp0.xyzw, tex0, 2D */
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
+						   R500_INST_TEX_SEM_WAIT |
+						   R500_INST_RGB_WMASK_R |
+						   R500_INST_RGB_WMASK_G |
+						   R500_INST_RGB_WMASK_B |
+						   R500_INST_ALPHA_WMASK));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
+						   R500_TEX_INST_LD |
+						   R500_TEX_SEM_ACQUIRE |
+						   R500_TEX_IGNORE_UNCOVERED));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
+						   R500_TEX_SRC_S_SWIZ_R |
+						   R500_TEX_SRC_T_SWIZ_G |
+						   R500_TEX_SRC_R_SWIZ_B |
+						   R500_TEX_SRC_Q_SWIZ_A |
+						   R500_TEX_DST_ADDR(0) |
+						   R500_TEX_DST_R_SWIZ_R |
+						   R500_TEX_DST_G_SWIZ_G |
+						   R500_TEX_DST_B_SWIZ_B |
+						   R500_TEX_DST_A_SWIZ_A));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
+
+	    /* LRP temp3, temp2.zzzz, temp1, temp3 ->
+	     * - PRESUB temps, temp1 - temp3
+	     * - MAD temp2.zzzz, temps, temp3 */
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
+						   R500_INST_RGB_WMASK_R |
+						   R500_INST_RGB_WMASK_G |
+						   R500_INST_RGB_WMASK_B |
+						   R500_INST_ALPHA_WMASK));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(3) |
+						   R500_RGB_SRCP_OP_RGB1_MINUS_RGB0 |
+						   R500_RGB_ADDR1(1) |
+						   R500_RGB_ADDR2(2)));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(3) |
+						   R500_ALPHA_SRCP_OP_A1_MINUS_A0 |
+						   R500_ALPHA_ADDR1(1) |
+						   R500_ALPHA_ADDR2(2)));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC2 |
+						   R500_ALU_RGB_R_SWIZ_A_B |
+						   R500_ALU_RGB_G_SWIZ_A_B |
+						   R500_ALU_RGB_B_SWIZ_A_B |
+						   R500_ALU_RGB_SEL_B_SRCP |
+						   R500_ALU_RGB_R_SWIZ_B_R |
+						   R500_ALU_RGB_G_SWIZ_B_G |
+						   R500_ALU_RGB_B_SWIZ_B_B));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(3) |
+						   R500_ALPHA_OP_MAD |
+						   R500_ALPHA_SEL_A_SRC2 |
+						   R500_ALPHA_SWIZ_A_B |
+						   R500_ALPHA_SEL_B_SRCP |
+						   R500_ALPHA_SWIZ_B_A));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(3) |
+						   R500_ALU_RGBA_OP_MAD |
+						   R500_ALU_RGBA_SEL_C_SRC0 |
+						   R500_ALU_RGBA_R_SWIZ_R |
+						   R500_ALU_RGBA_G_SWIZ_G |
+						   R500_ALU_RGBA_B_SWIZ_B |
+						   R500_ALU_RGBA_A_SWIZ_A));
+
+	    /* LRP temp0, temp2.zzzz, temp4, temp0 ->
+	     * - PRESUB temps, temp4 - temp0
+	     * - MAD temp2.zzzz, temps, temp0 */
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
+						   R500_INST_TEX_SEM_WAIT |
+						   R500_INST_RGB_WMASK_R |
+						   R500_INST_RGB_WMASK_G |
+						   R500_INST_RGB_WMASK_B |
+						   R500_INST_ALPHA_WMASK));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
+						   R500_RGB_SRCP_OP_RGB1_MINUS_RGB0 |
+						   R500_RGB_ADDR1(4) |
+						   R500_RGB_ADDR2(2)));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
+						   R500_ALPHA_SRCP_OP_A1_MINUS_A0 |
+						   R500_ALPHA_ADDR1(4) |
+						   R500_ALPHA_ADDR2(2)));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC2 |
+						   R500_ALU_RGB_R_SWIZ_A_B |
+						   R500_ALU_RGB_G_SWIZ_A_B |
+						   R500_ALU_RGB_B_SWIZ_A_B |
+						   R500_ALU_RGB_SEL_B_SRCP |
+						   R500_ALU_RGB_R_SWIZ_B_R |
+						   R500_ALU_RGB_G_SWIZ_B_G |
+						   R500_ALU_RGB_B_SWIZ_B_B));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(0) |
+						   R500_ALPHA_OP_MAD |
+						   R500_ALPHA_SEL_A_SRC2 |
+						   R500_ALPHA_SWIZ_A_B |
+						   R500_ALPHA_SEL_B_SRCP |
+						   R500_ALPHA_SWIZ_B_A));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(0) |
+						   R500_ALU_RGBA_OP_MAD |
+						   R500_ALU_RGBA_SEL_C_SRC0 |
+						   R500_ALU_RGBA_R_SWIZ_R |
+						   R500_ALU_RGBA_G_SWIZ_G |
+						   R500_ALU_RGBA_B_SWIZ_B |
+						   R500_ALU_RGBA_A_SWIZ_A));
+
+	    /* LRP output, temp5.zzzz, temp3, temp0 ->
+	     * - PRESUB temps, temp3 - temp0
+	     * - MAD temp5.zzzz, temps, temp0 */
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_OUT |
+						   R500_INST_LAST |
+						   R500_INST_TEX_SEM_WAIT |
+						   R500_INST_RGB_WMASK_R |
+						   R500_INST_RGB_WMASK_G |
+						   R500_INST_RGB_WMASK_B |
+						   R500_INST_ALPHA_WMASK |
+						   R500_INST_RGB_OMASK_R |
+						   R500_INST_RGB_OMASK_G |
+						   R500_INST_RGB_OMASK_B |
+						   R500_INST_ALPHA_OMASK));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
+						   R500_RGB_SRCP_OP_RGB1_MINUS_RGB0 |
+						   R500_RGB_ADDR1(3) |
+						   R500_RGB_ADDR2(5)));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
+						   R500_ALPHA_SRCP_OP_A1_MINUS_A0 |
+						   R500_ALPHA_ADDR1(3) |
+						   R500_ALPHA_ADDR2(5)));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC2 |
+						   R500_ALU_RGB_R_SWIZ_A_B |
+						   R500_ALU_RGB_G_SWIZ_A_B |
+						   R500_ALU_RGB_B_SWIZ_A_B |
+						   R500_ALU_RGB_SEL_B_SRCP |
+						   R500_ALU_RGB_R_SWIZ_B_R |
+						   R500_ALU_RGB_G_SWIZ_B_G |
+						   R500_ALU_RGB_B_SWIZ_B_B));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(0) |
+						   R500_ALPHA_OP_MAD |
+						   R500_ALPHA_SEL_A_SRC2 |
+						   R500_ALPHA_SWIZ_A_B |
+						   R500_ALPHA_SEL_B_SRCP |
+						   R500_ALPHA_SWIZ_B_A));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(0) |
+						   R500_ALU_RGBA_OP_MAD |
+						   R500_ALU_RGBA_SEL_C_SRC0 |
+						   R500_ALU_RGBA_R_SWIZ_R |
+						   R500_ALU_RGBA_G_SWIZ_G |
+						   R500_ALU_RGBA_B_SWIZ_B |
+						   R500_ALU_RGBA_A_SWIZ_A));
+
+	    /* Shader constants. */
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_CONST_INDEX(0));
+
+	    /* const0 = {1 / texture[0].width, 1 / texture[0].height, 0, 0} */
+	    OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, (1.0/(float)pPriv->w));
+	    OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, (1.0/(float)pPriv->h));
+	    OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, 0x0);
+	    OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, 0x0);
+
+	    FINISH_ACCEL();
+	} else {
+	    BEGIN_ACCEL(19);
+	    /* 2 components: 2 for tex0 */
+	    OUT_ACCEL_REG(R300_RS_COUNT,
+			  ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) |
+			   R300_RS_COUNT_HIRES_EN));
+
+	    /* R300_INST_COUNT_RS - highest RS instruction used */
+	    OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0) | R300_TX_OFFSET_RS(6));
+
+	    /* Pixel stack frame size. */
+	    OUT_ACCEL_REG(R300_US_PIXSIZE, 0); /* highest temp used */
+
+	    /* FP length. */
+	    OUT_ACCEL_REG(R500_US_CODE_ADDR, (R500_US_CODE_START_ADDR(0) |
+					      R500_US_CODE_END_ADDR(1)));
+	    OUT_ACCEL_REG(R500_US_CODE_RANGE, (R500_US_CODE_RANGE_ADDR(0) |
+					       R500_US_CODE_RANGE_SIZE(1)));
+
+	    /* Prepare for FP emission. */
+	    OUT_ACCEL_REG(R500_US_CODE_OFFSET, 0);
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_INST_INDEX(0));
+
+	    /* tex inst */
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
+						   R500_INST_TEX_SEM_WAIT |
+						   R500_INST_RGB_WMASK_R |
+						   R500_INST_RGB_WMASK_G |
+						   R500_INST_RGB_WMASK_B |
+						   R500_INST_ALPHA_WMASK |
+						   R500_INST_RGB_CLAMP |
+						   R500_INST_ALPHA_CLAMP));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
+						   R500_TEX_INST_LD |
+						   R500_TEX_SEM_ACQUIRE |
+						   R500_TEX_IGNORE_UNCOVERED));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
+						   R500_TEX_SRC_S_SWIZ_R |
+						   R500_TEX_SRC_T_SWIZ_G |
+						   R500_TEX_DST_ADDR(0) |
+						   R500_TEX_DST_R_SWIZ_R |
+						   R500_TEX_DST_G_SWIZ_G |
+						   R500_TEX_DST_B_SWIZ_B |
+						   R500_TEX_DST_A_SWIZ_A));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_DX_ADDR(0) |
+						   R500_DX_S_SWIZ_R |
+						   R500_DX_T_SWIZ_R |
+						   R500_DX_R_SWIZ_R |
+						   R500_DX_Q_SWIZ_R |
+						   R500_DY_ADDR(0) |
+						   R500_DY_S_SWIZ_R |
+						   R500_DY_T_SWIZ_R |
+						   R500_DY_R_SWIZ_R |
+						   R500_DY_Q_SWIZ_R));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
+
+	    /* ALU inst */
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_OUT |
+						   R500_INST_TEX_SEM_WAIT |
+						   R500_INST_LAST |
+						   R500_INST_RGB_OMASK_R |
+						   R500_INST_RGB_OMASK_G |
+						   R500_INST_RGB_OMASK_B |
+						   R500_INST_ALPHA_OMASK |
+						   R500_INST_RGB_CLAMP |
+						   R500_INST_ALPHA_CLAMP));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
+						   R500_RGB_ADDR1(0) |
+						   R500_RGB_ADDR1_CONST |
+						   R500_RGB_ADDR2(0) |
+						   R500_RGB_ADDR2_CONST));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
+						   R500_ALPHA_ADDR1(0) |
+						   R500_ALPHA_ADDR1_CONST |
+						   R500_ALPHA_ADDR2(0) |
+						   R500_ALPHA_ADDR2_CONST));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
+						   R500_ALU_RGB_R_SWIZ_A_R |
+						   R500_ALU_RGB_G_SWIZ_A_G |
+						   R500_ALU_RGB_B_SWIZ_A_B |
+						   R500_ALU_RGB_SEL_B_SRC0 |
+						   R500_ALU_RGB_R_SWIZ_B_1 |
+						   R500_ALU_RGB_B_SWIZ_B_1 |
+						   R500_ALU_RGB_G_SWIZ_B_1));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD |
+						   R500_ALPHA_SWIZ_A_A |
+						   R500_ALPHA_SWIZ_B_1));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD |
+						   R500_ALU_RGBA_R_SWIZ_0 |
+						   R500_ALU_RGBA_G_SWIZ_0 |
+						   R500_ALU_RGBA_B_SWIZ_0 |
+						   R500_ALU_RGBA_A_SWIZ_0));
+	    FINISH_ACCEL();
 	}
-
-	BEGIN_ACCEL(56);
-	/* 2 components: 2 for tex0 */
-	OUT_ACCEL_REG(R300_RS_COUNT,
-		      ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) |
-		       R300_RS_COUNT_HIRES_EN));
-
-	/* R300_INST_COUNT_RS - highest RS instruction used */
-	OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0) | R300_TX_OFFSET_RS(6));
-
-	/* Pixel stack frame size. */
-	OUT_ACCEL_REG(R300_US_PIXSIZE, 2); /* highest temp used */
-
-	/* FP length. */
-	OUT_ACCEL_REG(R500_US_CODE_ADDR, (R500_US_CODE_START_ADDR(0) |
-					  R500_US_CODE_END_ADDR(5)));
-	OUT_ACCEL_REG(R500_US_CODE_RANGE, (R500_US_CODE_RANGE_ADDR(0) |
-					   R500_US_CODE_RANGE_SIZE(5)));
-
-	/* Prepare for FP emission. */
-	OUT_ACCEL_REG(R500_US_CODE_OFFSET, 0);
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_INST_INDEX(0));
-
-	/* tex inst */
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
-					       R500_INST_TEX_SEM_WAIT |
-					       R500_INST_RGB_WMASK_R |
-					       R500_INST_RGB_WMASK_G |
-					       R500_INST_RGB_WMASK_B |
-					       R500_INST_ALPHA_WMASK |
-					       R500_INST_RGB_CLAMP |
-					       R500_INST_ALPHA_CLAMP));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
-					       R500_TEX_INST_LD |
-					       R500_TEX_IGNORE_UNCOVERED));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
-					       R500_TEX_SRC_S_SWIZ_R |
-					       R500_TEX_SRC_T_SWIZ_G |
-					       R500_TEX_DST_ADDR(2) |
-					       R500_TEX_DST_R_SWIZ_R |
-					       R500_TEX_DST_G_SWIZ_G |
-					       R500_TEX_DST_B_SWIZ_B |
-					       R500_TEX_DST_A_SWIZ_A));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_DX_ADDR(0) |
-					       R500_DX_S_SWIZ_R |
-					       R500_DX_T_SWIZ_R |
-					       R500_DX_R_SWIZ_R |
-					       R500_DX_Q_SWIZ_R |
-					       R500_DY_ADDR(0) |
-					       R500_DY_S_SWIZ_R |
-					       R500_DY_T_SWIZ_R |
-					       R500_DY_R_SWIZ_R |
-					       R500_DY_Q_SWIZ_R));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
-
-	/* tex inst */
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
-					       R500_INST_TEX_SEM_WAIT |
-					       R500_INST_RGB_WMASK_R |
-					       R500_INST_RGB_WMASK_G |
-					       R500_INST_RGB_WMASK_B |
-					       R500_INST_ALPHA_WMASK |
-					       R500_INST_RGB_CLAMP |
-					       R500_INST_ALPHA_CLAMP));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(1) |
-					       R500_TEX_INST_LD |
-					       R500_TEX_IGNORE_UNCOVERED));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
-					       R500_TEX_SRC_S_SWIZ_R |
-					       R500_TEX_SRC_T_SWIZ_G |
-					       R500_TEX_DST_ADDR(1) |
-					       R500_TEX_DST_R_SWIZ_R |
-					       R500_TEX_DST_G_SWIZ_G |
-					       R500_TEX_DST_B_SWIZ_B |
-					       R500_TEX_DST_A_SWIZ_A));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_DX_ADDR(0) |
-					       R500_DX_S_SWIZ_R |
-					       R500_DX_T_SWIZ_R |
-					       R500_DX_R_SWIZ_R |
-					       R500_DX_Q_SWIZ_R |
-					       R500_DY_ADDR(0) |
-					       R500_DY_S_SWIZ_R |
-					       R500_DY_T_SWIZ_R |
-					       R500_DY_R_SWIZ_R |
-					       R500_DY_Q_SWIZ_R));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
-
-	/* tex inst */
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
-					       R500_INST_TEX_SEM_WAIT |
-					       R500_INST_RGB_WMASK_R |
-					       R500_INST_RGB_WMASK_G |
-					       R500_INST_RGB_WMASK_B |
-					       R500_INST_ALPHA_WMASK |
-					       R500_INST_RGB_CLAMP |
-					       R500_INST_ALPHA_CLAMP));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(2) |
-					       R500_TEX_INST_LD |
-					       R500_TEX_SEM_ACQUIRE |
-					       R500_TEX_IGNORE_UNCOVERED));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
-					       R500_TEX_SRC_S_SWIZ_R |
-					       R500_TEX_SRC_T_SWIZ_G |
-					       R500_TEX_DST_ADDR(0) |
-					       R500_TEX_DST_R_SWIZ_R |
-					       R500_TEX_DST_G_SWIZ_G |
-					       R500_TEX_DST_B_SWIZ_B |
-					       R500_TEX_DST_A_SWIZ_A));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_DX_ADDR(0) |
-					       R500_DX_S_SWIZ_R |
-					       R500_DX_T_SWIZ_R |
-					       R500_DX_R_SWIZ_R |
-					       R500_DX_Q_SWIZ_R |
-					       R500_DY_ADDR(0) |
-					       R500_DY_S_SWIZ_R |
-					       R500_DY_T_SWIZ_R |
-					       R500_DY_R_SWIZ_R |
-					       R500_DY_Q_SWIZ_R));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
-
-	/* ALU inst */
-	/* MAD temp2.rgb, const0.aaa, temp2.rgb, const0.rgb */
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
-					       R500_INST_TEX_SEM_WAIT |
-					       R500_INST_RGB_WMASK_R |
-					       R500_INST_RGB_WMASK_G |
-					       R500_INST_RGB_WMASK_B |
-					       R500_INST_ALPHA_WMASK));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
-					       R500_RGB_ADDR0_CONST |
-					       R500_RGB_ADDR1(2) |
-					       R500_RGB_ADDR2(0) |
-					       R500_RGB_ADDR2_CONST));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
-					       R500_ALPHA_ADDR0_CONST |
-					       R500_ALPHA_ADDR1(2) |
-					       R500_ALPHA_ADDR2(0) |
-					       R500_ALPHA_ADDR2_CONST));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
-					       R500_ALU_RGB_R_SWIZ_A_A |
-					       R500_ALU_RGB_G_SWIZ_A_A |
-					       R500_ALU_RGB_B_SWIZ_A_A |
-					       R500_ALU_RGB_SEL_B_SRC1 |
-					       R500_ALU_RGB_R_SWIZ_B_R |
-					       R500_ALU_RGB_B_SWIZ_B_G |
-					       R500_ALU_RGB_G_SWIZ_B_B));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD |
-					       R500_ALPHA_ADDRD(2) |
-					       R500_ALPHA_SWIZ_A_0 |
-					       R500_ALPHA_SWIZ_B_0));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD |
-					       R500_ALU_RGBA_ADDRD(2) |
-					       R500_ALU_RGBA_SEL_C_SRC0 |
-					       R500_ALU_RGBA_R_SWIZ_R |
-					       R500_ALU_RGBA_G_SWIZ_G |
-					       R500_ALU_RGBA_B_SWIZ_B |
-					       R500_ALU_RGBA_ALPHA_SEL_C_SRC0 |
-					       R500_ALU_RGBA_A_SWIZ_0));
-
-	/* MAD temp2.rgb, const1.rgb, temp1.rgb, temp2.rgb */
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
-					       R500_INST_TEX_SEM_WAIT |
-					       R500_INST_RGB_WMASK_R |
-					       R500_INST_RGB_WMASK_G |
-					       R500_INST_RGB_WMASK_B |
-					       R500_INST_ALPHA_WMASK));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(1) |
-					       R500_RGB_ADDR0_CONST |
-					       R500_RGB_ADDR1(1) |
-					       R500_RGB_ADDR2(2)));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(1) |
-					       R500_ALPHA_ADDR0_CONST |
-					       R500_ALPHA_ADDR1(1) |
-					       R500_ALPHA_ADDR2(2)));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
-					       R500_ALU_RGB_R_SWIZ_A_R |
-					       R500_ALU_RGB_G_SWIZ_A_G |
-					       R500_ALU_RGB_B_SWIZ_A_B |
-					       R500_ALU_RGB_SEL_B_SRC1 |
-					       R500_ALU_RGB_R_SWIZ_B_R |
-					       R500_ALU_RGB_B_SWIZ_B_G |
-					       R500_ALU_RGB_G_SWIZ_B_B));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD |
-					       R500_ALPHA_ADDRD(2) |
-					       R500_ALPHA_SWIZ_A_0 |
-					       R500_ALPHA_SWIZ_B_0));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD |
-					       R500_ALU_RGBA_ADDRD(2) |
-					       R500_ALU_RGBA_SEL_C_SRC2 |
-					       R500_ALU_RGBA_R_SWIZ_R |
-					       R500_ALU_RGBA_G_SWIZ_G |
-					       R500_ALU_RGBA_B_SWIZ_B |
-					       R500_ALU_RGBA_ALPHA_SEL_C_SRC0 |
-					       R500_ALU_RGBA_A_SWIZ_0));
-
-	/* MAD result.rgb, const2.rgb, temp0.rgb, temp2.rgb */
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_OUT |
-					       R500_INST_TEX_SEM_WAIT |
-					       R500_INST_LAST |
-					       R500_INST_RGB_OMASK_R |
-					       R500_INST_RGB_OMASK_G |
-					       R500_INST_RGB_OMASK_B |
-					       R500_INST_ALPHA_OMASK |
-					       R500_INST_RGB_CLAMP |
-					       R500_INST_ALPHA_CLAMP));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(2) |
-					       R500_RGB_ADDR0_CONST |
-					       R500_RGB_ADDR1(0) |
-					       R500_RGB_ADDR2(2)));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(2) |
-					       R500_ALPHA_ADDR0_CONST |
-					       R500_ALPHA_ADDR1(0) |
-					       R500_ALPHA_ADDR2(2)));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
-					       R500_ALU_RGB_R_SWIZ_A_R |
-					       R500_ALU_RGB_G_SWIZ_A_G |
-					       R500_ALU_RGB_B_SWIZ_A_B |
-					       R500_ALU_RGB_SEL_B_SRC1 |
-					       R500_ALU_RGB_R_SWIZ_B_R |
-					       R500_ALU_RGB_B_SWIZ_B_G |
-					       R500_ALU_RGB_G_SWIZ_B_B));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD |
-					       R500_ALPHA_ADDRD(0) |
-					       R500_ALPHA_SWIZ_A_0 |
-					       R500_ALPHA_SWIZ_B_0));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD |
-					       R500_ALU_RGBA_ADDRD(0) |
-					       R500_ALU_RGBA_SEL_C_SRC2 |
-					       R500_ALU_RGBA_R_SWIZ_R |
-					       R500_ALU_RGBA_G_SWIZ_G |
-					       R500_ALU_RGBA_B_SWIZ_B |
-					       R500_ALU_RGBA_ALPHA_SEL_C_SRC0 |
-					       R500_ALU_RGBA_A_SWIZ_1));
-
-	/* Shader constants. */
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_CONST_INDEX(0));
-
-	/* constant 0: off, yco */
-	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, off[0]);
-	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, off[1]);
-	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, off[2]);
-	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, yco);
-	/* constant 1: uco */
-	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, uco[0]);
-	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, uco[1]);
-	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, uco[2]);
-	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, gamma);
-	/* constant 2: vco */
-	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, vco[0]);
-	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, vco[1]);
-	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, vco[2]);
-	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, 0.0);
-
-	FINISH_ACCEL();
-
     } else {
 	/*
 	 * y' = y - .0625
@@ -3562,175 +3318,414 @@ FUNC_NAME(R500DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 	       if that's of any use. */
 	}
 
-	BEGIN_ACCEL(44);
-	/* 2 components: 2 for tex0/1/2 */
-	OUT_ACCEL_REG(R300_RS_COUNT,
-		      ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) |
-		       R300_RS_COUNT_HIRES_EN));
-
-	/* R300_INST_COUNT_RS - highest RS instruction used */
-	OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0) | R300_TX_OFFSET_RS(6));
-
-	/* Pixel stack frame size. */
-	OUT_ACCEL_REG(R300_US_PIXSIZE, 1); /* highest temp used */
-
-	/* FP length. */
-	OUT_ACCEL_REG(R500_US_CODE_ADDR, (R500_US_CODE_START_ADDR(0) |
-					  R500_US_CODE_END_ADDR(3)));
-	OUT_ACCEL_REG(R500_US_CODE_RANGE, (R500_US_CODE_RANGE_ADDR(0) |
-					   R500_US_CODE_RANGE_SIZE(3)));
-
-	/* Prepare for FP emission. */
-	OUT_ACCEL_REG(R500_US_CODE_OFFSET, 0);
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_INST_INDEX(0));
-
-	/* tex inst */
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
-					       R500_INST_TEX_SEM_WAIT |
-					       R500_INST_RGB_WMASK_R |
-					       R500_INST_RGB_WMASK_G |
-					       R500_INST_RGB_WMASK_B |
-					       R500_INST_ALPHA_WMASK |
-					       R500_INST_RGB_CLAMP |
-					       R500_INST_ALPHA_CLAMP));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
-					       R500_TEX_INST_LD |
-					       R500_TEX_SEM_ACQUIRE |
-					       R500_TEX_IGNORE_UNCOVERED));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
-					       R500_TEX_SRC_S_SWIZ_R |
-					       R500_TEX_SRC_T_SWIZ_G |
-					       R500_TEX_DST_ADDR(0) |
-					       R500_TEX_DST_R_SWIZ_R |
-					       R500_TEX_DST_G_SWIZ_G |
-					       R500_TEX_DST_B_SWIZ_B |
-					       R500_TEX_DST_A_SWIZ_A));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_DX_ADDR(0) |
-					       R500_DX_S_SWIZ_R |
-					       R500_DX_T_SWIZ_R |
-					       R500_DX_R_SWIZ_R |
-					       R500_DX_Q_SWIZ_R |
-					       R500_DY_ADDR(0) |
-					       R500_DY_S_SWIZ_R |
-					       R500_DY_T_SWIZ_R |
-					       R500_DY_R_SWIZ_R |
-					       R500_DY_Q_SWIZ_R));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
-
-	/* ALU inst */
-	/* MAD temp1.rgb, const0.aaa, temp0.ggg, const0.rgb */
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
-					       R500_INST_TEX_SEM_WAIT |
-					       R500_INST_RGB_WMASK_R |
-					       R500_INST_RGB_WMASK_G |
-					       R500_INST_RGB_WMASK_B |
-					       R500_INST_ALPHA_WMASK));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
-					       R500_RGB_ADDR0_CONST |
-					       R500_RGB_ADDR1(0) |
-					       R500_RGB_ADDR2(0) |
-					       R500_RGB_ADDR2_CONST));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
-					       R500_ALPHA_ADDR0_CONST |
-					       R500_ALPHA_ADDR1(0) |
-					       R500_ALPHA_ADDR2(0) |
-					       R500_ALPHA_ADDR2_CONST));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
-					       R500_ALU_RGB_R_SWIZ_A_A |
-					       R500_ALU_RGB_G_SWIZ_A_A |
-					       R500_ALU_RGB_B_SWIZ_A_A |
-					       R500_ALU_RGB_SEL_B_SRC1 |
-					       R500_ALU_RGB_R_SWIZ_B_G |
-					       R500_ALU_RGB_B_SWIZ_B_G |
-					       R500_ALU_RGB_G_SWIZ_B_G));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD |
-					       R500_ALPHA_ADDRD(1) |
-					       R500_ALPHA_SWIZ_A_0 |
-					       R500_ALPHA_SWIZ_B_0));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD |
-					       R500_ALU_RGBA_ADDRD(1) |
-					       R500_ALU_RGBA_SEL_C_SRC0 |
-					       R500_ALU_RGBA_R_SWIZ_R |
-					       R500_ALU_RGBA_G_SWIZ_G |
-					       R500_ALU_RGBA_B_SWIZ_B |
-					       R500_ALU_RGBA_ALPHA_SEL_C_SRC0 |
-					       R500_ALU_RGBA_A_SWIZ_0));
-
-	/* MAD temp1.rgb, const1.rgb, temp0.bbb, temp1.rgb */
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
-					       R500_INST_TEX_SEM_WAIT |
-					       R500_INST_RGB_WMASK_R |
-					       R500_INST_RGB_WMASK_G |
-					       R500_INST_RGB_WMASK_B |
-					       R500_INST_ALPHA_WMASK));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(1) |
-					       R500_RGB_ADDR0_CONST |
-					       R500_RGB_ADDR1(0) |
-					       R500_RGB_ADDR2(1)));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(1) |
-					       R500_ALPHA_ADDR0_CONST |
-					       R500_ALPHA_ADDR1(0) |
-					       R500_ALPHA_ADDR2(1)));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
-					       R500_ALU_RGB_R_SWIZ_A_R |
-					       R500_ALU_RGB_G_SWIZ_A_G |
-					       R500_ALU_RGB_B_SWIZ_A_B |
-					       R500_ALU_RGB_SEL_B_SRC1 |
-					       R500_ALU_RGB_R_SWIZ_B_B |
-					       R500_ALU_RGB_B_SWIZ_B_B |
-					       R500_ALU_RGB_G_SWIZ_B_B));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD |
-					       R500_ALPHA_ADDRD(1) |
-					       R500_ALPHA_SWIZ_A_0 |
-					       R500_ALPHA_SWIZ_B_0));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD |
-					       R500_ALU_RGBA_ADDRD(1) |
-					       R500_ALU_RGBA_SEL_C_SRC2 |
-					       R500_ALU_RGBA_R_SWIZ_R |
-					       R500_ALU_RGBA_G_SWIZ_G |
-					       R500_ALU_RGBA_B_SWIZ_B |
-					       R500_ALU_RGBA_ALPHA_SEL_C_SRC0 |
-					       R500_ALU_RGBA_A_SWIZ_0));
-
-	/* MAD result.rgb, const2.rgb, temp0.rrr, temp1.rgb */
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_OUT |
-					       R500_INST_TEX_SEM_WAIT |
-					       R500_INST_LAST |
-					       R500_INST_RGB_OMASK_R |
-					       R500_INST_RGB_OMASK_G |
-					       R500_INST_RGB_OMASK_B |
-					       R500_INST_ALPHA_OMASK |
-					       R500_INST_RGB_CLAMP |
-					       R500_INST_ALPHA_CLAMP));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(2) |
-					       R500_RGB_ADDR0_CONST |
-					       R500_RGB_ADDR1(0) |
-					       R500_RGB_ADDR2(1)));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(1) |
-					       R500_ALPHA_ADDR0_CONST |
-					       R500_ALPHA_ADDR1(0) |
-					       R500_ALPHA_ADDR2(1)));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
-					       R500_ALU_RGB_R_SWIZ_A_R |
-					       R500_ALU_RGB_G_SWIZ_A_G |
-					       R500_ALU_RGB_B_SWIZ_A_B |
-					       R500_ALU_RGB_SEL_B_SRC1 |
-					       R500_ALU_RGB_R_SWIZ_B_R |
-					       R500_ALU_RGB_B_SWIZ_B_R |
-					       R500_ALU_RGB_G_SWIZ_B_R));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD |
-					       R500_ALPHA_ADDRD(1) |
-					       R500_ALPHA_SWIZ_A_0 |
-					       R500_ALPHA_SWIZ_B_0));
-	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD |
-					       R500_ALU_RGBA_ADDRD(1) |
-					       R500_ALU_RGBA_SEL_C_SRC2 |
-					       R500_ALU_RGBA_R_SWIZ_R |
-					       R500_ALU_RGBA_G_SWIZ_G |
-					       R500_ALU_RGBA_B_SWIZ_B |
-					       R500_ALU_RGBA_ALPHA_SEL_C_SRC0 |
-					       R500_ALU_RGBA_A_SWIZ_1));
+	if (isplanar) {
+	    BEGIN_ACCEL(56);
+	    /* 2 components: 2 for tex0 */
+	    OUT_ACCEL_REG(R300_RS_COUNT,
+			  ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) |
+			   R300_RS_COUNT_HIRES_EN));
+
+	    /* R300_INST_COUNT_RS - highest RS instruction used */
+	    OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0) | R300_TX_OFFSET_RS(6));
+
+	    /* Pixel stack frame size. */
+	    OUT_ACCEL_REG(R300_US_PIXSIZE, 2); /* highest temp used */
+
+	    /* FP length. */
+	    OUT_ACCEL_REG(R500_US_CODE_ADDR, (R500_US_CODE_START_ADDR(0) |
+					      R500_US_CODE_END_ADDR(5)));
+	    OUT_ACCEL_REG(R500_US_CODE_RANGE, (R500_US_CODE_RANGE_ADDR(0) |
+					       R500_US_CODE_RANGE_SIZE(5)));
+
+	    /* Prepare for FP emission. */
+	    OUT_ACCEL_REG(R500_US_CODE_OFFSET, 0);
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_INST_INDEX(0));
+
+	    /* tex inst */
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
+						   R500_INST_TEX_SEM_WAIT |
+						   R500_INST_RGB_WMASK_R |
+						   R500_INST_RGB_WMASK_G |
+						   R500_INST_RGB_WMASK_B |
+						   R500_INST_ALPHA_WMASK |
+						   R500_INST_RGB_CLAMP |
+						   R500_INST_ALPHA_CLAMP));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
+						   R500_TEX_INST_LD |
+						   R500_TEX_IGNORE_UNCOVERED));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
+						   R500_TEX_SRC_S_SWIZ_R |
+						   R500_TEX_SRC_T_SWIZ_G |
+						   R500_TEX_DST_ADDR(2) |
+						   R500_TEX_DST_R_SWIZ_R |
+						   R500_TEX_DST_G_SWIZ_G |
+						   R500_TEX_DST_B_SWIZ_B |
+						   R500_TEX_DST_A_SWIZ_A));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_DX_ADDR(0) |
+						   R500_DX_S_SWIZ_R |
+						   R500_DX_T_SWIZ_R |
+						   R500_DX_R_SWIZ_R |
+						   R500_DX_Q_SWIZ_R |
+						   R500_DY_ADDR(0) |
+						   R500_DY_S_SWIZ_R |
+						   R500_DY_T_SWIZ_R |
+						   R500_DY_R_SWIZ_R |
+						   R500_DY_Q_SWIZ_R));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
+
+	    /* tex inst */
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
+						   R500_INST_TEX_SEM_WAIT |
+						   R500_INST_RGB_WMASK_R |
+						   R500_INST_RGB_WMASK_G |
+						   R500_INST_RGB_WMASK_B |
+						   R500_INST_ALPHA_WMASK |
+						   R500_INST_RGB_CLAMP |
+						   R500_INST_ALPHA_CLAMP));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(1) |
+						   R500_TEX_INST_LD |
+						   R500_TEX_IGNORE_UNCOVERED));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
+						   R500_TEX_SRC_S_SWIZ_R |
+						   R500_TEX_SRC_T_SWIZ_G |
+						   R500_TEX_DST_ADDR(1) |
+						   R500_TEX_DST_R_SWIZ_R |
+						   R500_TEX_DST_G_SWIZ_G |
+						   R500_TEX_DST_B_SWIZ_B |
+						   R500_TEX_DST_A_SWIZ_A));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_DX_ADDR(0) |
+						   R500_DX_S_SWIZ_R |
+						   R500_DX_T_SWIZ_R |
+						   R500_DX_R_SWIZ_R |
+						   R500_DX_Q_SWIZ_R |
+						   R500_DY_ADDR(0) |
+						   R500_DY_S_SWIZ_R |
+						   R500_DY_T_SWIZ_R |
+						   R500_DY_R_SWIZ_R |
+						   R500_DY_Q_SWIZ_R));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
+
+	    /* tex inst */
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
+						   R500_INST_TEX_SEM_WAIT |
+						   R500_INST_RGB_WMASK_R |
+						   R500_INST_RGB_WMASK_G |
+						   R500_INST_RGB_WMASK_B |
+						   R500_INST_ALPHA_WMASK |
+						   R500_INST_RGB_CLAMP |
+						   R500_INST_ALPHA_CLAMP));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(2) |
+						   R500_TEX_INST_LD |
+						   R500_TEX_SEM_ACQUIRE |
+						   R500_TEX_IGNORE_UNCOVERED));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
+						   R500_TEX_SRC_S_SWIZ_R |
+						   R500_TEX_SRC_T_SWIZ_G |
+						   R500_TEX_DST_ADDR(0) |
+						   R500_TEX_DST_R_SWIZ_R |
+						   R500_TEX_DST_G_SWIZ_G |
+						   R500_TEX_DST_B_SWIZ_B |
+						   R500_TEX_DST_A_SWIZ_A));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_DX_ADDR(0) |
+						   R500_DX_S_SWIZ_R |
+						   R500_DX_T_SWIZ_R |
+						   R500_DX_R_SWIZ_R |
+						   R500_DX_Q_SWIZ_R |
+						   R500_DY_ADDR(0) |
+						   R500_DY_S_SWIZ_R |
+						   R500_DY_T_SWIZ_R |
+						   R500_DY_R_SWIZ_R |
+						   R500_DY_Q_SWIZ_R));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
+
+	    /* ALU inst */
+	    /* MAD temp2.rgb, const0.aaa, temp2.rgb, const0.rgb */
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
+						   R500_INST_TEX_SEM_WAIT |
+						   R500_INST_RGB_WMASK_R |
+						   R500_INST_RGB_WMASK_G |
+						   R500_INST_RGB_WMASK_B |
+						   R500_INST_ALPHA_WMASK));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
+						   R500_RGB_ADDR0_CONST |
+						   R500_RGB_ADDR1(2) |
+						   R500_RGB_ADDR2(0) |
+						   R500_RGB_ADDR2_CONST));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
+						   R500_ALPHA_ADDR0_CONST |
+						   R500_ALPHA_ADDR1(2) |
+						   R500_ALPHA_ADDR2(0) |
+						   R500_ALPHA_ADDR2_CONST));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
+						   R500_ALU_RGB_R_SWIZ_A_A |
+						   R500_ALU_RGB_G_SWIZ_A_A |
+						   R500_ALU_RGB_B_SWIZ_A_A |
+						   R500_ALU_RGB_SEL_B_SRC1 |
+						   R500_ALU_RGB_R_SWIZ_B_R |
+						   R500_ALU_RGB_B_SWIZ_B_G |
+						   R500_ALU_RGB_G_SWIZ_B_B));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD |
+						   R500_ALPHA_ADDRD(2) |
+						   R500_ALPHA_SWIZ_A_0 |
+						   R500_ALPHA_SWIZ_B_0));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD |
+						   R500_ALU_RGBA_ADDRD(2) |
+						   R500_ALU_RGBA_SEL_C_SRC0 |
+						   R500_ALU_RGBA_R_SWIZ_R |
+						   R500_ALU_RGBA_G_SWIZ_G |
+						   R500_ALU_RGBA_B_SWIZ_B |
+						   R500_ALU_RGBA_ALPHA_SEL_C_SRC0 |
+						   R500_ALU_RGBA_A_SWIZ_0));
+
+	    /* MAD temp2.rgb, const1.rgb, temp1.rgb, temp2.rgb */
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
+						   R500_INST_TEX_SEM_WAIT |
+						   R500_INST_RGB_WMASK_R |
+						   R500_INST_RGB_WMASK_G |
+						   R500_INST_RGB_WMASK_B |
+						   R500_INST_ALPHA_WMASK));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(1) |
+						   R500_RGB_ADDR0_CONST |
+						   R500_RGB_ADDR1(1) |
+						   R500_RGB_ADDR2(2)));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(1) |
+						   R500_ALPHA_ADDR0_CONST |
+						   R500_ALPHA_ADDR1(1) |
+						   R500_ALPHA_ADDR2(2)));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
+						   R500_ALU_RGB_R_SWIZ_A_R |
+						   R500_ALU_RGB_G_SWIZ_A_G |
+						   R500_ALU_RGB_B_SWIZ_A_B |
+						   R500_ALU_RGB_SEL_B_SRC1 |
+						   R500_ALU_RGB_R_SWIZ_B_R |
+						   R500_ALU_RGB_B_SWIZ_B_G |
+						   R500_ALU_RGB_G_SWIZ_B_B));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD |
+						   R500_ALPHA_ADDRD(2) |
+						   R500_ALPHA_SWIZ_A_0 |
+						   R500_ALPHA_SWIZ_B_0));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD |
+						   R500_ALU_RGBA_ADDRD(2) |
+						   R500_ALU_RGBA_SEL_C_SRC2 |
+						   R500_ALU_RGBA_R_SWIZ_R |
+						   R500_ALU_RGBA_G_SWIZ_G |
+						   R500_ALU_RGBA_B_SWIZ_B |
+						   R500_ALU_RGBA_ALPHA_SEL_C_SRC0 |
+						   R500_ALU_RGBA_A_SWIZ_0));
+
+	    /* MAD result.rgb, const2.rgb, temp0.rgb, temp2.rgb */
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_OUT |
+						   R500_INST_TEX_SEM_WAIT |
+						   R500_INST_LAST |
+						   R500_INST_RGB_OMASK_R |
+						   R500_INST_RGB_OMASK_G |
+						   R500_INST_RGB_OMASK_B |
+						   R500_INST_ALPHA_OMASK |
+						   R500_INST_RGB_CLAMP |
+						   R500_INST_ALPHA_CLAMP));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(2) |
+						   R500_RGB_ADDR0_CONST |
+						   R500_RGB_ADDR1(0) |
+						   R500_RGB_ADDR2(2)));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(2) |
+						   R500_ALPHA_ADDR0_CONST |
+						   R500_ALPHA_ADDR1(0) |
+						   R500_ALPHA_ADDR2(2)));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
+						   R500_ALU_RGB_R_SWIZ_A_R |
+						   R500_ALU_RGB_G_SWIZ_A_G |
+						   R500_ALU_RGB_B_SWIZ_A_B |
+						   R500_ALU_RGB_SEL_B_SRC1 |
+						   R500_ALU_RGB_R_SWIZ_B_R |
+						   R500_ALU_RGB_B_SWIZ_B_G |
+						   R500_ALU_RGB_G_SWIZ_B_B));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD |
+						   R500_ALPHA_ADDRD(0) |
+						   R500_ALPHA_SWIZ_A_0 |
+						   R500_ALPHA_SWIZ_B_0));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD |
+						   R500_ALU_RGBA_ADDRD(0) |
+						   R500_ALU_RGBA_SEL_C_SRC2 |
+						   R500_ALU_RGBA_R_SWIZ_R |
+						   R500_ALU_RGBA_G_SWIZ_G |
+						   R500_ALU_RGBA_B_SWIZ_B |
+						   R500_ALU_RGBA_ALPHA_SEL_C_SRC0 |
+						   R500_ALU_RGBA_A_SWIZ_1));
+
+	} else {
+	    BEGIN_ACCEL(44);
+	    /* 2 components: 2 for tex0/1/2 */
+	    OUT_ACCEL_REG(R300_RS_COUNT,
+			  ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) |
+			   R300_RS_COUNT_HIRES_EN));
+
+	    /* R300_INST_COUNT_RS - highest RS instruction used */
+	    OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0) | R300_TX_OFFSET_RS(6));
+
+	    /* Pixel stack frame size. */
+	    OUT_ACCEL_REG(R300_US_PIXSIZE, 1); /* highest temp used */
+
+	    /* FP length. */
+	    OUT_ACCEL_REG(R500_US_CODE_ADDR, (R500_US_CODE_START_ADDR(0) |
+					      R500_US_CODE_END_ADDR(3)));
+	    OUT_ACCEL_REG(R500_US_CODE_RANGE, (R500_US_CODE_RANGE_ADDR(0) |
+					       R500_US_CODE_RANGE_SIZE(3)));
+
+	    /* Prepare for FP emission. */
+	    OUT_ACCEL_REG(R500_US_CODE_OFFSET, 0);
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_INST_INDEX(0));
+
+	    /* tex inst */
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
+						   R500_INST_TEX_SEM_WAIT |
+						   R500_INST_RGB_WMASK_R |
+						   R500_INST_RGB_WMASK_G |
+						   R500_INST_RGB_WMASK_B |
+						   R500_INST_ALPHA_WMASK |
+						   R500_INST_RGB_CLAMP |
+						   R500_INST_ALPHA_CLAMP));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
+						   R500_TEX_INST_LD |
+						   R500_TEX_SEM_ACQUIRE |
+						   R500_TEX_IGNORE_UNCOVERED));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
+						   R500_TEX_SRC_S_SWIZ_R |
+						   R500_TEX_SRC_T_SWIZ_G |
+						   R500_TEX_DST_ADDR(0) |
+						   R500_TEX_DST_R_SWIZ_R |
+						   R500_TEX_DST_G_SWIZ_G |
+						   R500_TEX_DST_B_SWIZ_B |
+						   R500_TEX_DST_A_SWIZ_A));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_DX_ADDR(0) |
+						   R500_DX_S_SWIZ_R |
+						   R500_DX_T_SWIZ_R |
+						   R500_DX_R_SWIZ_R |
+						   R500_DX_Q_SWIZ_R |
+						   R500_DY_ADDR(0) |
+						   R500_DY_S_SWIZ_R |
+						   R500_DY_T_SWIZ_R |
+						   R500_DY_R_SWIZ_R |
+						   R500_DY_Q_SWIZ_R));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
+
+	    /* ALU inst */
+	    /* MAD temp1.rgb, const0.aaa, temp0.ggg, const0.rgb */
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
+						   R500_INST_TEX_SEM_WAIT |
+						   R500_INST_RGB_WMASK_R |
+						   R500_INST_RGB_WMASK_G |
+						   R500_INST_RGB_WMASK_B |
+						   R500_INST_ALPHA_WMASK));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
+						   R500_RGB_ADDR0_CONST |
+						   R500_RGB_ADDR1(0) |
+						   R500_RGB_ADDR2(0) |
+						   R500_RGB_ADDR2_CONST));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
+						   R500_ALPHA_ADDR0_CONST |
+						   R500_ALPHA_ADDR1(0) |
+						   R500_ALPHA_ADDR2(0) |
+						   R500_ALPHA_ADDR2_CONST));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
+						   R500_ALU_RGB_R_SWIZ_A_A |
+						   R500_ALU_RGB_G_SWIZ_A_A |
+						   R500_ALU_RGB_B_SWIZ_A_A |
+						   R500_ALU_RGB_SEL_B_SRC1 |
+						   R500_ALU_RGB_R_SWIZ_B_G |
+						   R500_ALU_RGB_B_SWIZ_B_G |
+						   R500_ALU_RGB_G_SWIZ_B_G));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD |
+						   R500_ALPHA_ADDRD(1) |
+						   R500_ALPHA_SWIZ_A_0 |
+						   R500_ALPHA_SWIZ_B_0));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD |
+						   R500_ALU_RGBA_ADDRD(1) |
+						   R500_ALU_RGBA_SEL_C_SRC0 |
+						   R500_ALU_RGBA_R_SWIZ_R |
+						   R500_ALU_RGBA_G_SWIZ_G |
+						   R500_ALU_RGBA_B_SWIZ_B |
+						   R500_ALU_RGBA_ALPHA_SEL_C_SRC0 |
+						   R500_ALU_RGBA_A_SWIZ_0));
+
+	    /* MAD temp1.rgb, const1.rgb, temp0.bbb, temp1.rgb */
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
+						   R500_INST_TEX_SEM_WAIT |
+						   R500_INST_RGB_WMASK_R |
+						   R500_INST_RGB_WMASK_G |
+						   R500_INST_RGB_WMASK_B |
+						   R500_INST_ALPHA_WMASK));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(1) |
+						   R500_RGB_ADDR0_CONST |
+						   R500_RGB_ADDR1(0) |
+						   R500_RGB_ADDR2(1)));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(1) |
+						   R500_ALPHA_ADDR0_CONST |
+						   R500_ALPHA_ADDR1(0) |
+						   R500_ALPHA_ADDR2(1)));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
+						   R500_ALU_RGB_R_SWIZ_A_R |
+						   R500_ALU_RGB_G_SWIZ_A_G |
+						   R500_ALU_RGB_B_SWIZ_A_B |
+						   R500_ALU_RGB_SEL_B_SRC1 |
+						   R500_ALU_RGB_R_SWIZ_B_B |
+						   R500_ALU_RGB_B_SWIZ_B_B |
+						   R500_ALU_RGB_G_SWIZ_B_B));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD |
+						   R500_ALPHA_ADDRD(1) |
+						   R500_ALPHA_SWIZ_A_0 |
+						   R500_ALPHA_SWIZ_B_0));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD |
+						   R500_ALU_RGBA_ADDRD(1) |
+						   R500_ALU_RGBA_SEL_C_SRC2 |
+						   R500_ALU_RGBA_R_SWIZ_R |
+						   R500_ALU_RGBA_G_SWIZ_G |
+						   R500_ALU_RGBA_B_SWIZ_B |
+						   R500_ALU_RGBA_ALPHA_SEL_C_SRC0 |
+						   R500_ALU_RGBA_A_SWIZ_0));
+
+	    /* MAD result.rgb, const2.rgb, temp0.rrr, temp1.rgb */
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_OUT |
+						   R500_INST_TEX_SEM_WAIT |
+						   R500_INST_LAST |
+						   R500_INST_RGB_OMASK_R |
+						   R500_INST_RGB_OMASK_G |
+						   R500_INST_RGB_OMASK_B |
+						   R500_INST_ALPHA_OMASK |
+						   R500_INST_RGB_CLAMP |
+						   R500_INST_ALPHA_CLAMP));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(2) |
+						   R500_RGB_ADDR0_CONST |
+						   R500_RGB_ADDR1(0) |
+						   R500_RGB_ADDR2(1)));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(1) |
+						   R500_ALPHA_ADDR0_CONST |
+						   R500_ALPHA_ADDR1(0) |
+						   R500_ALPHA_ADDR2(1)));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
+						   R500_ALU_RGB_R_SWIZ_A_R |
+						   R500_ALU_RGB_G_SWIZ_A_G |
+						   R500_ALU_RGB_B_SWIZ_A_B |
+						   R500_ALU_RGB_SEL_B_SRC1 |
+						   R500_ALU_RGB_R_SWIZ_B_R |
+						   R500_ALU_RGB_B_SWIZ_B_R |
+						   R500_ALU_RGB_G_SWIZ_B_R));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD |
+						   R500_ALPHA_ADDRD(1) |
+						   R500_ALPHA_SWIZ_A_0 |
+						   R500_ALPHA_SWIZ_B_0));
+	    OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD |
+						   R500_ALU_RGBA_ADDRD(1) |
+						   R500_ALU_RGBA_SEL_C_SRC2 |
+						   R500_ALU_RGBA_R_SWIZ_R |
+						   R500_ALU_RGBA_G_SWIZ_G |
+						   R500_ALU_RGBA_B_SWIZ_B |
+						   R500_ALU_RGBA_ALPHA_SEL_C_SRC0 |
+						   R500_ALU_RGBA_A_SWIZ_1));
+	}
 
 	/* Shader constants. */
 	OUT_ACCEL_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_CONST_INDEX(0));
commit 85323a7f84381fef7fad20c7f7ec601637af9aa7
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Thu Apr 16 11:10:51 2009 -0400

    R3xx/R4xx: set tex caching for Y texture when doing planar rendering
    
    Doesn't affect performance, but docs indicate it's the right
    thing to do.

diff --git a/src/radeon_textured_videofuncs.c b/src/radeon_textured_videofuncs.c
index e449204..0ac247a 100644
--- a/src/radeon_textured_videofuncs.c
+++ b/src/radeon_textured_videofuncs.c
@@ -1091,7 +1091,10 @@ FUNC_NAME(R300DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
     OUT_ACCEL_REG(R300_TX_FILTER0_0, txfilter);
     OUT_ACCEL_REG(R300_TX_FILTER1_0, 0);
     OUT_ACCEL_REG(R300_TX_FORMAT0_0, txformat0);
-    OUT_ACCEL_REG(R300_TX_FORMAT1_0, txformat1);
+    if (isplanar)
+	OUT_ACCEL_REG(R300_TX_FORMAT1_0, txformat1 | R300_TX_FORMAT_CACHE_HALF_REGION_0);
+    else
+	OUT_ACCEL_REG(R300_TX_FORMAT1_0, txformat1);
     OUT_ACCEL_REG(R300_TX_FORMAT2_0, txpitch);
     OUT_ACCEL_REG(R300_TX_OFFSET_0, txoffset);
     FINISH_ACCEL();
commit 5ea5df22c038fc8f00984acc760e9d8c962bf902
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Wed Apr 15 20:26:34 2009 -0400

    Tex vid: remove remnants of XV_HWPLANAR
    
    No longer needed, as bicubic is the only thing that uses
    the old csc code.

diff --git a/src/radeon_textured_video.c b/src/radeon_textured_video.c
index 532b600..2318a62 100644
--- a/src/radeon_textured_video.c
+++ b/src/radeon_textured_video.c
@@ -367,26 +367,19 @@ RADEONPutImageTextured(ScrnInfoPtr pScrn,
 	    pPriv->bicubic_enabled = FALSE;
     }
 
-    pPriv->planar_hw = pPriv->planar_state;
-    if (pPriv->bicubic_enabled || IS_R600_3D)
-	pPriv->planar_hw = 0;
-
-    if (info->ChipFamily < CHIP_FAMILY_R300)
-	pPriv->planar_hw = 1;
-
     switch(id) {
     case FOURCC_YV12:
     case FOURCC_I420:
 	srcPitch = (width + 3) & ~3;
 	srcPitch2 = ((width >> 1) + 3) & ~3;
-        if (pPriv->planar_hw) {
+        if (pPriv->bicubic_enabled) {
+	    dstPitch = ((dst_width << 1) + 15) & ~15;
+	    dstPitch = (dstPitch + 63) & ~63;
+	} else {
 	    dstPitch = (dst_width + 15) & ~15;
 	    dstPitch = (dstPitch + 63) & ~63;
 	    dstPitch2 = ((dst_width >> 1) + 15) & ~15;
 	    dstPitch2 = (dstPitch2 + 63) & ~63;
-	} else {
-	    dstPitch = ((dst_width << 1) + 15) & ~15;
-	    dstPitch = (dstPitch + 63) & ~63;
 	}
 	break;
     case FOURCC_UYVY:
@@ -509,8 +502,24 @@ RADEONPutImageTextured(ScrnInfoPtr pScrn,
 				     srcPitch, srcPitch2, pPriv->src_pitch,
 				     width, height);
 	    }
-	}
-        else if (pPriv->planar_hw) {
+	} else if (pPriv->bicubic_enabled) {
+	    top &= ~1;
+	    nlines = ((((y2 + 0xffff) >> 16) + 1) & ~1) - top;
+	    s2offset = srcPitch * height;
+	    s3offset = (srcPitch2 * (height >> 1)) + s2offset;
+	    pPriv->src_addr += left << 1;
+	    tmp = ((top >> 1) * srcPitch2) + (left >> 1);
+	    s2offset += tmp;
+	    s3offset += tmp;
+	    if (id == FOURCC_I420) {
+		tmp = s2offset;
+		s2offset = s3offset;
+		s3offset = tmp;
+	    }
+	    RADEONCopyMungedData(pScrn, buf + (top * srcPitch) + left,
+				 buf + s2offset, buf + s3offset, pPriv->src_addr,
+				 srcPitch, srcPitch2, dstPitch, nlines, npixels);
+	} else {
 	    top &= ~1;
 	    s2offset = srcPitch * ((height + 1) & ~1);
 	    s3offset = s2offset + srcPitch2 * ((height + 1) >> 1);
@@ -532,23 +541,6 @@ RADEONPutImageTextured(ScrnInfoPtr pScrn,
 		srcPitch2, dstPitch2, (nlines + 1) >> 1, npixels >> 1, 1);
 	    RADEONCopyData(pScrn, buf + s3offset, pPriv->src_addr + d3line + (left >> 1),
 		srcPitch2, dstPitch2, (nlines + 1) >> 1, npixels >> 1, 1);
-	} else {
-	    top &= ~1;
-	    nlines = ((((y2 + 0xffff) >> 16) + 1) & ~1) - top;
-	    s2offset = srcPitch * height;
-	    s3offset = (srcPitch2 * (height >> 1)) + s2offset;
-	    pPriv->src_addr += left << 1;
-	    tmp = ((top >> 1) * srcPitch2) + (left >> 1);
-	    s2offset += tmp;
-	    s3offset += tmp;
-	    if (id == FOURCC_I420) {
-		tmp = s2offset;
-		s2offset = s3offset;
-		s3offset = tmp;
-	    }
-	    RADEONCopyMungedData(pScrn, buf + (top * srcPitch) + left,
-				 buf + s2offset, buf + s3offset, pPriv->src_addr,
-				 srcPitch, srcPitch2, dstPitch, nlines, npixels);
 	}
 	break;
     case FOURCC_UYVY:
@@ -681,13 +673,12 @@ static XF86AttributeRec Attributes_r200[NUM_ATTRIBUTES_R200+1] =
     {0, 0, 0, NULL}
 };
 
-#define NUM_ATTRIBUTES_R300 9
+#define NUM_ATTRIBUTES_R300 8
 
 static XF86AttributeRec Attributes_r300[NUM_ATTRIBUTES_R300+1] =
 {
     {XvSettable | XvGettable, 0, 2, "XV_BICUBIC"},
     {XvSettable | XvGettable, 0, 1, "XV_VSYNC"},
-    {XvSettable | XvGettable, 0, 1, "XV_HWPLANAR"},
     {XvSettable | XvGettable, -1000, 1000, "XV_BRIGHTNESS"},
     {XvSettable | XvGettable, -1000, 1000, "XV_CONTRAST"},
     {XvSettable | XvGettable, -1000, 1000, "XV_SATURATION"},
@@ -697,13 +688,12 @@ static XF86AttributeRec Attributes_r300[NUM_ATTRIBUTES_R300+1] =
     {0, 0, 0, NULL}
 };
 
-#define NUM_ATTRIBUTES_R500 8
+#define NUM_ATTRIBUTES_R500 7
 
 static XF86AttributeRec Attributes_r500[NUM_ATTRIBUTES_R500+1] =
 {
     {XvSettable | XvGettable, 0, 2, "XV_BICUBIC"},
     {XvSettable | XvGettable, 0, 1, "XV_VSYNC"},
-    {XvSettable | XvGettable, 0, 1, "XV_HWPLANAR"},
     {XvSettable | XvGettable, -1000, 1000, "XV_BRIGHTNESS"},
     {XvSettable | XvGettable, -1000, 1000, "XV_CONTRAST"},
     {XvSettable | XvGettable, -1000, 1000, "XV_SATURATION"},
@@ -727,7 +717,6 @@ static XF86AttributeRec Attributes_r600[NUM_ATTRIBUTES_R600+1] =
 
 static Atom xvBicubic;
 static Atom xvVSync;
-static Atom xvHWPlanar;
 static Atom xvBrightness, xvContrast, xvSaturation, xvHue;
 static Atom xvGamma, xvColorspace;
 
@@ -756,8 +745,6 @@ RADEONGetTexPortAttribute(ScrnInfoPtr  pScrn,
 	*value = pPriv->bicubic_state;
     else if (attribute == xvVSync)
 	*value = pPriv->vsync;
-    else if (attribute == xvHWPlanar)
-	*value = pPriv->planar_state;
     else if (attribute == xvBrightness)
 	*value = pPriv->brightness;
     else if (attribute == xvContrast)
@@ -791,10 +778,6 @@ RADEONSetTexPortAttribute(ScrnInfoPtr  pScrn,
 	pPriv->bicubic_state = ClipValue (value, 0, 2);
     else if (attribute == xvVSync)
 	pPriv->vsync = ClipValue (value, 0, 1);
-    else if (attribute == xvHWPlanar)
-	pPriv->planar_state = ClipValue (value, 0, 1);
-    else if (attribute == xvHWPlanar)
-	pPriv->planar_state = ClipValue (value, 0, 1);
     else if (attribute == xvBrightness)
 	pPriv->brightness = ClipValue (value, -1000, 1000);
     else if (attribute == xvContrast)
@@ -830,7 +813,6 @@ RADEONSetupImageTexturedVideo(ScreenPtr pScreen)
 
     xvBicubic         = MAKE_ATOM("XV_BICUBIC");
     xvVSync           = MAKE_ATOM("XV_VSYNC");
-    xvHWPlanar        = MAKE_ATOM("XV_HWPLANAR");
     xvBrightness      = MAKE_ATOM("XV_BRIGHTNESS");
     xvContrast        = MAKE_ATOM("XV_CONTRAST");
     xvSaturation      = MAKE_ATOM("XV_SATURATION");
@@ -899,7 +881,6 @@ RADEONSetupImageTexturedVideo(ScreenPtr pScreen)
 	pPriv->doubleBuffer = 0;
 	pPriv->bicubic_state = BICUBIC_AUTO;
 	pPriv->vsync = TRUE;
-	pPriv->planar_state = 1;
 	pPriv->brightness = 0;
 	pPriv->contrast = 0;
 	pPriv->saturation = 0;
diff --git a/src/radeon_textured_videofuncs.c b/src/radeon_textured_videofuncs.c
index db943e3..e449204 100644
--- a/src/radeon_textured_videofuncs.c
+++ b/src/radeon_textured_videofuncs.c
@@ -1050,7 +1050,8 @@ FUNC_NAME(R300DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
     if (RADEONTilingEnabled(pScrn, pPixmap))
 	colorpitch |= R300_COLORTILE;
 
-    if (pPriv->planar_hw && (pPriv->id == FOURCC_I420 || pPriv->id == FOURCC_YV12)) {
+
+    if (!pPriv->bicubic_enabled && (pPriv->id == FOURCC_I420 || pPriv->id == FOURCC_YV12)) {
 	isplanar = TRUE;
     }
 
@@ -2487,7 +2488,7 @@ FUNC_NAME(R500DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
     if (RADEONTilingEnabled(pScrn, pPixmap))
 	colorpitch |= R300_COLORTILE;
 
-    if (pPriv->planar_hw && (pPriv->id == FOURCC_I420 || pPriv->id == FOURCC_YV12)) {
+    if (!pPriv->bicubic_enabled && (pPriv->id == FOURCC_I420 || pPriv->id == FOURCC_YV12)) {
 	isplanar = TRUE;
     }
 
diff --git a/src/radeon_video.h b/src/radeon_video.h
index 0f8342a..3f8f5e0 100644
--- a/src/radeon_video.h
+++ b/src/radeon_video.h
@@ -90,8 +90,6 @@ typedef struct {
    void         *video_memory;
    int           video_offset;
 
-   Bool          planar_hw;
-   Bool          planar_state;
    int           planeu_offset;
    int           planev_offset;
 
commit 9091b3f5f13dbea83ffd89679dac600e9f280bb2
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Wed Apr 15 20:07:42 2009 -0400

    R3xx/R4xx: fix up planar shader
    
    We were overwriting the coord fetch address with the first
    tex fetch.  It seemed to work anyway, by luck I guess.  Reorder
    the fetches so that temp0 is written last.

diff --git a/src/radeon_textured_videofuncs.c b/src/radeon_textured_videofuncs.c
index b74763f..db943e3 100644
--- a/src/radeon_textured_videofuncs.c
+++ b/src/radeon_textured_videofuncs.c
@@ -1685,7 +1685,7 @@ FUNC_NAME(R300DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 
 	/* tex inst */
 	OUT_ACCEL_REG(R300_US_TEX_INST_0, (R300_TEX_SRC_ADDR(0) |
-					   R300_TEX_DST_ADDR(0) |
+					   R300_TEX_DST_ADDR(2) |
 					   R300_TEX_ID(0) |
 					   R300_TEX_INST(R300_TEX_INST_LD)));
 	OUT_ACCEL_REG(R300_US_TEX_INST_1, (R300_TEX_SRC_ADDR(0) |
@@ -1693,16 +1693,16 @@ FUNC_NAME(R300DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 					   R300_TEX_ID(1) |
 					   R300_TEX_INST(R300_TEX_INST_LD)));
 	OUT_ACCEL_REG(R300_US_TEX_INST_2, (R300_TEX_SRC_ADDR(0) |
-					   R300_TEX_DST_ADDR(2) |
+					   R300_TEX_DST_ADDR(0) |
 					   R300_TEX_ID(2) |
 					   R300_TEX_INST(R300_TEX_INST_LD)));
 
 	/* ALU inst */
-	/* MAD temp0.rgb, const0.aaa, temp0.rgb, const0.rgb */
+	/* MAD temp2.rgb, const0.aaa, temp2.rgb, const0.rgb */
 	OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(0), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(0)) |
-						R300_ALU_RGB_ADDR1(0) |
+						R300_ALU_RGB_ADDR1(2) |
 						R300_ALU_RGB_ADDR2(0) |
-						R300_ALU_RGB_ADDRD(0) |
+						R300_ALU_RGB_ADDRD(2) |
 						R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
 	OUT_ACCEL_REG(R300_US_ALU_RGB_INST(0), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_AAA) |
 						R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
@@ -1714,20 +1714,20 @@ FUNC_NAME(R300DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 						R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
 	/* alpha nop, but need to set up alpha source for rgb usage */
 	OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(0), (R300_ALU_ALPHA_ADDR0(R300_ALU_ALPHA_CONST(0)) |
-						  R300_ALU_ALPHA_ADDR1(0) |
+						  R300_ALU_ALPHA_ADDR1(2) |
 						  R300_ALU_ALPHA_ADDR2(0) |
-						  R300_ALU_ALPHA_ADDRD(0) |
+						  R300_ALU_ALPHA_ADDRD(2) |
 						  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
 	OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(0), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
 						  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
 						  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
 						  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
 
-	/* MAD temp0.rgb, const1.rgb, temp1.rgb, temp0.rgb */
+	/* MAD temp2.rgb, const1.rgb, temp1.rgb, temp2.rgb */
 	OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(1), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(1)) |
 						R300_ALU_RGB_ADDR1(1) |
-						R300_ALU_RGB_ADDR2(0) |
-						R300_ALU_RGB_ADDRD(0) |
+						R300_ALU_RGB_ADDR2(2) |
+						R300_ALU_RGB_ADDRD(2) |
 						R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
 	OUT_ACCEL_REG(R300_US_ALU_RGB_INST(1), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
 						R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
@@ -1738,17 +1738,17 @@ FUNC_NAME(R300DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 						R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
 						R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
 	/* alpha nop */
-	OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(1), (R300_ALU_ALPHA_ADDRD(0) |
+	OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(1), (R300_ALU_ALPHA_ADDRD(2) |
 						  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
 	OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(1), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
 						  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
 						  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
 						  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
 
-	/* MAD result.rgb, const2.rgb, temp2.rgb, temp0.rgb */
+	/* MAD result.rgb, const2.rgb, temp0.rgb, temp2.rgb */
 	OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(2), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(2)) |
-						R300_ALU_RGB_ADDR1(2) |
-						R300_ALU_RGB_ADDR2(0) |
+						R300_ALU_RGB_ADDR1(0) |
+						R300_ALU_RGB_ADDR2(2) |
 						R300_ALU_RGB_ADDRD(0) |
 						R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB) |
 						(needgamma ? 0 : R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_RGB))));
commit 14c13faeb9f9b7717a25fcc1ca97d46cc6ee0031
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Wed Apr 15 19:53:12 2009 -0400

    R5xx: add shader-based csc
    
    - native planar support
    - Xv attributes

diff --git a/src/radeon_textured_video.c b/src/radeon_textured_video.c
index 3626e8d..532b600 100644
--- a/src/radeon_textured_video.c
+++ b/src/radeon_textured_video.c
@@ -368,7 +368,7 @@ RADEONPutImageTextured(ScrnInfoPtr pScrn,
     }
 
     pPriv->planar_hw = pPriv->planar_state;
-    if (pPriv->bicubic_enabled || (IS_R600_3D || IS_R500_3D))
+    if (pPriv->bicubic_enabled || IS_R600_3D)
 	pPriv->planar_hw = 0;
 
     if (info->ChipFamily < CHIP_FAMILY_R300)
@@ -697,12 +697,18 @@ static XF86AttributeRec Attributes_r300[NUM_ATTRIBUTES_R300+1] =
     {0, 0, 0, NULL}
 };
 
-#define NUM_ATTRIBUTES_R500 2
+#define NUM_ATTRIBUTES_R500 8
 
 static XF86AttributeRec Attributes_r500[NUM_ATTRIBUTES_R500+1] =
 {
     {XvSettable | XvGettable, 0, 2, "XV_BICUBIC"},
     {XvSettable | XvGettable, 0, 1, "XV_VSYNC"},
+    {XvSettable | XvGettable, 0, 1, "XV_HWPLANAR"},
+    {XvSettable | XvGettable, -1000, 1000, "XV_BRIGHTNESS"},
+    {XvSettable | XvGettable, -1000, 1000, "XV_CONTRAST"},
+    {XvSettable | XvGettable, -1000, 1000, "XV_SATURATION"},
+    {XvSettable | XvGettable, -1000, 1000, "XV_HUE"},
+    {XvSettable | XvGettable, 0, 1, "XV_COLORSPACE"},
     {0, 0, 0, NULL}
 };
 
diff --git a/src/radeon_textured_videofuncs.c b/src/radeon_textured_videofuncs.c
index 41ed7fa..b74763f 100644
--- a/src/radeon_textured_videofuncs.c
+++ b/src/radeon_textured_videofuncs.c
@@ -2492,7 +2492,7 @@ FUNC_NAME(R500DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
     }
 
     if (isplanar) {
-	txformat1 = R300_TX_FORMAT_X8 | R300_TX_FORMAT_CACHE_HALF_REGION_0;
+	txformat1 = R300_TX_FORMAT_X8;
 	txpitch = pPriv->src_pitch;
     } else {
 	if (pPriv->id == FOURCC_UYVY)
@@ -2500,7 +2500,8 @@ FUNC_NAME(R500DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 	else
 	    txformat1 = R300_TX_FORMAT_VYUY422;
 
-	txformat1 |= R300_TX_FORMAT_YUV_TO_RGB_CLAMP;
+	if (pPriv->bicubic_enabled)
+	    txformat1 |= R300_TX_FORMAT_YUV_TO_RGB_CLAMP;
 
 	/* pitch is in pixels */
 	txpitch = pPriv->src_pitch / 2;
@@ -2555,13 +2556,13 @@ FUNC_NAME(R500DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 	OUT_ACCEL_REG(R300_TX_FILTER0_1, txfilter | (1 << R300_TX_ID_SHIFT));
 	OUT_ACCEL_REG(R300_TX_FILTER1_1, 0);
 	OUT_ACCEL_REG(R300_TX_FORMAT0_1, txformat0);
-	OUT_ACCEL_REG(R300_TX_FORMAT1_1, R300_TX_FORMAT_X8 | R300_TX_FORMAT_CACHE_FOURTH_REGION_2);
+	OUT_ACCEL_REG(R300_TX_FORMAT1_1, R300_TX_FORMAT_X8);
 	OUT_ACCEL_REG(R300_TX_FORMAT2_1, txpitch);
 	OUT_ACCEL_REG(R300_TX_OFFSET_1, txoffset + pPriv->planeu_offset);
 	OUT_ACCEL_REG(R300_TX_FILTER0_2, txfilter | (2 << R300_TX_ID_SHIFT));
 	OUT_ACCEL_REG(R300_TX_FILTER1_2, 0);
 	OUT_ACCEL_REG(R300_TX_FORMAT0_2, txformat0);
-	OUT_ACCEL_REG(R300_TX_FORMAT1_2, R300_TX_FORMAT_X8 | R300_TX_FORMAT_CACHE_FOURTH_REGION_3);
+	OUT_ACCEL_REG(R300_TX_FORMAT1_2, R300_TX_FORMAT_X8);
 	OUT_ACCEL_REG(R300_TX_FORMAT2_2, txpitch);
 	OUT_ACCEL_REG(R300_TX_OFFSET_2, txoffset + pPriv->planev_offset);
 	FINISH_ACCEL();
@@ -3162,8 +3163,76 @@ FUNC_NAME(R500DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 
 	FINISH_ACCEL();
 
-    } else {
-	BEGIN_ACCEL(19);
+    } else if (isplanar) {
+	/*
+	 * y' = y - .0625
+	 * u' = u - .5
+	 * v' = v - .5;
+	 *
+	 * r = 1.1643 * y' + 0.0     * u' + 1.5958  * v'
+	 * g = 1.1643 * y' - 0.39173 * u' - 0.81290 * v'
+	 * b = 1.1643 * y' + 2.017   * u' + 0.0     * v'
+	 *
+	 * DP3 might look like the straightforward solution
+	 * but we'd need to move the texture yuv values in
+	 * the same reg for this to work. Therefore use MADs.
+	 * Brightness just adds to the off constant.
+	 * Contrast is multiplication of luminance.
+	 * Saturation and hue change the u and v coeffs.
+	 * Default values (before adjustments - depend on colorspace):
+	 * yco = 1.1643
+	 * uco = 0, -0.39173, 2.017
+	 * vco = 1.5958, -0.8129, 0
+	 * off = -0.0625 * yco + -0.5 * uco[r] + -0.5 * vco[r],
+	 *       -0.0625 * yco + -0.5 * uco[g] + -0.5 * vco[g],
+	 *       -0.0625 * yco + -0.5 * uco[b] + -0.5 * vco[b],
+	 *
+	 * temp = MAD(yco, yuv.yyyy, off)
+	 * temp = MAD(uco, yuv.uuuu, temp)
+	 * result = MAD(vco, yuv.vvvv, temp)
+	 */
+	/* TODO: don't recalc consts always */
+	const float Loff = -0.0627;
+	const float Coff = -0.502;
+	float uvcosf, uvsinf;
+	float yco;
+	float uco[3], vco[3], off[3];
+	float bright, cont, gamma;
+	int ref = pPriv->transform_index;
+	Bool needgamma = FALSE;
+
+	cont = RTFContrast(pPriv->contrast);
+	bright = RTFBrightness(pPriv->brightness);
+	gamma = (float)pPriv->gamma / 1000.0;
+	uvcosf = RTFSaturation(pPriv->saturation) * cos(RTFHue(pPriv->hue));
+	uvsinf = RTFSaturation(pPriv->saturation) * sin(RTFHue(pPriv->hue));
+	/* overlay video also does pre-gamma contrast/sat adjust, should we? */
+
+	yco = trans[ref].RefLuma * cont;
+	uco[0] = -trans[ref].RefRCr * uvsinf;
+	uco[1] = trans[ref].RefGCb * uvcosf - trans[ref].RefGCr * uvsinf;
+	uco[2] = trans[ref].RefBCb * uvcosf;
+	vco[0] = trans[ref].RefRCr * uvcosf;
+	vco[1] = trans[ref].RefGCb * uvsinf + trans[ref].RefGCr * uvcosf;
+	vco[2] = trans[ref].RefBCb * uvsinf;
+	off[0] = Loff * yco + Coff * (uco[0] + vco[0]) + bright;
+	off[1] = Loff * yco + Coff * (uco[1] + vco[1]) + bright;
+	off[2] = Loff * yco + Coff * (uco[2] + vco[2]) + bright;
+
+	//XXX gamma
+
+	if (gamma != 1.0) {
+	    needgamma = TRUE;
+	    /* note: gamma correction is out = in ^ gamma;
+	       gpu can only do LG2/EX2 therefore we transform into
+	       in ^ gamma = 2 ^ (log2(in) * gamma).
+	       Lots of scalar ops, unfortunately (better solution?) -
+	       without gamma that's 3 inst, with gamma it's 10...
+	       could use different gamma factors per channel,
+	       if that's of any use. */
+	}
+
+	BEGIN_ACCEL(56);
 	/* 2 components: 2 for tex0 */
 	OUT_ACCEL_REG(R300_RS_COUNT,
 		      ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) |
@@ -3173,13 +3242,13 @@ FUNC_NAME(R500DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 	OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0) | R300_TX_OFFSET_RS(6));
 
 	/* Pixel stack frame size. */
-	OUT_ACCEL_REG(R300_US_PIXSIZE, 0); /* highest temp used */
+	OUT_ACCEL_REG(R300_US_PIXSIZE, 2); /* highest temp used */
 
 	/* FP length. */
 	OUT_ACCEL_REG(R500_US_CODE_ADDR, (R500_US_CODE_START_ADDR(0) |
-					  R500_US_CODE_END_ADDR(1)));
+					  R500_US_CODE_END_ADDR(5)));
 	OUT_ACCEL_REG(R500_US_CODE_RANGE, (R500_US_CODE_RANGE_ADDR(0) |
-					   R500_US_CODE_RANGE_SIZE(1)));
+					   R500_US_CODE_RANGE_SIZE(5)));
 
 	/* Prepare for FP emission. */
 	OUT_ACCEL_REG(R500_US_CODE_OFFSET, 0);
@@ -3196,6 +3265,72 @@ FUNC_NAME(R500DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 					       R500_INST_ALPHA_CLAMP));
 	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
 					       R500_TEX_INST_LD |
+					       R500_TEX_IGNORE_UNCOVERED));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
+					       R500_TEX_SRC_S_SWIZ_R |
+					       R500_TEX_SRC_T_SWIZ_G |
+					       R500_TEX_DST_ADDR(2) |
+					       R500_TEX_DST_R_SWIZ_R |
+					       R500_TEX_DST_G_SWIZ_G |
+					       R500_TEX_DST_B_SWIZ_B |
+					       R500_TEX_DST_A_SWIZ_A));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_DX_ADDR(0) |
+					       R500_DX_S_SWIZ_R |
+					       R500_DX_T_SWIZ_R |
+					       R500_DX_R_SWIZ_R |
+					       R500_DX_Q_SWIZ_R |
+					       R500_DY_ADDR(0) |
+					       R500_DY_S_SWIZ_R |
+					       R500_DY_T_SWIZ_R |
+					       R500_DY_R_SWIZ_R |
+					       R500_DY_Q_SWIZ_R));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
+
+	/* tex inst */
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
+					       R500_INST_TEX_SEM_WAIT |
+					       R500_INST_RGB_WMASK_R |
+					       R500_INST_RGB_WMASK_G |
+					       R500_INST_RGB_WMASK_B |
+					       R500_INST_ALPHA_WMASK |
+					       R500_INST_RGB_CLAMP |
+					       R500_INST_ALPHA_CLAMP));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(1) |
+					       R500_TEX_INST_LD |
+					       R500_TEX_IGNORE_UNCOVERED));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
+					       R500_TEX_SRC_S_SWIZ_R |
+					       R500_TEX_SRC_T_SWIZ_G |
+					       R500_TEX_DST_ADDR(1) |
+					       R500_TEX_DST_R_SWIZ_R |
+					       R500_TEX_DST_G_SWIZ_G |
+					       R500_TEX_DST_B_SWIZ_B |
+					       R500_TEX_DST_A_SWIZ_A));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_DX_ADDR(0) |
+					       R500_DX_S_SWIZ_R |
+					       R500_DX_T_SWIZ_R |
+					       R500_DX_R_SWIZ_R |
+					       R500_DX_Q_SWIZ_R |
+					       R500_DY_ADDR(0) |
+					       R500_DY_S_SWIZ_R |
+					       R500_DY_T_SWIZ_R |
+					       R500_DY_R_SWIZ_R |
+					       R500_DY_Q_SWIZ_R));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
+
+	/* tex inst */
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
+					       R500_INST_TEX_SEM_WAIT |
+					       R500_INST_RGB_WMASK_R |
+					       R500_INST_RGB_WMASK_G |
+					       R500_INST_RGB_WMASK_B |
+					       R500_INST_ALPHA_WMASK |
+					       R500_INST_RGB_CLAMP |
+					       R500_INST_ALPHA_CLAMP));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(2) |
+					       R500_TEX_INST_LD |
 					       R500_TEX_SEM_ACQUIRE |
 					       R500_TEX_IGNORE_UNCOVERED));
 	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
@@ -3220,6 +3355,81 @@ FUNC_NAME(R500DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
 
 	/* ALU inst */
+	/* MAD temp2.rgb, const0.aaa, temp2.rgb, const0.rgb */
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
+					       R500_INST_TEX_SEM_WAIT |
+					       R500_INST_RGB_WMASK_R |
+					       R500_INST_RGB_WMASK_G |
+					       R500_INST_RGB_WMASK_B |
+					       R500_INST_ALPHA_WMASK));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
+					       R500_RGB_ADDR0_CONST |
+					       R500_RGB_ADDR1(2) |
+					       R500_RGB_ADDR2(0) |
+					       R500_RGB_ADDR2_CONST));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
+					       R500_ALPHA_ADDR0_CONST |
+					       R500_ALPHA_ADDR1(2) |
+					       R500_ALPHA_ADDR2(0) |
+					       R500_ALPHA_ADDR2_CONST));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
+					       R500_ALU_RGB_R_SWIZ_A_A |
+					       R500_ALU_RGB_G_SWIZ_A_A |
+					       R500_ALU_RGB_B_SWIZ_A_A |
+					       R500_ALU_RGB_SEL_B_SRC1 |
+					       R500_ALU_RGB_R_SWIZ_B_R |
+					       R500_ALU_RGB_B_SWIZ_B_G |
+					       R500_ALU_RGB_G_SWIZ_B_B));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD |
+					       R500_ALPHA_ADDRD(2) |
+					       R500_ALPHA_SWIZ_A_0 |
+					       R500_ALPHA_SWIZ_B_0));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD |
+					       R500_ALU_RGBA_ADDRD(2) |
+					       R500_ALU_RGBA_SEL_C_SRC0 |
+					       R500_ALU_RGBA_R_SWIZ_R |
+					       R500_ALU_RGBA_G_SWIZ_G |
+					       R500_ALU_RGBA_B_SWIZ_B |
+					       R500_ALU_RGBA_ALPHA_SEL_C_SRC0 |
+					       R500_ALU_RGBA_A_SWIZ_0));
+
+	/* MAD temp2.rgb, const1.rgb, temp1.rgb, temp2.rgb */
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
+					       R500_INST_TEX_SEM_WAIT |
+					       R500_INST_RGB_WMASK_R |
+					       R500_INST_RGB_WMASK_G |
+					       R500_INST_RGB_WMASK_B |
+					       R500_INST_ALPHA_WMASK));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(1) |
+					       R500_RGB_ADDR0_CONST |
+					       R500_RGB_ADDR1(1) |
+					       R500_RGB_ADDR2(2)));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(1) |
+					       R500_ALPHA_ADDR0_CONST |
+					       R500_ALPHA_ADDR1(1) |
+					       R500_ALPHA_ADDR2(2)));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
+					       R500_ALU_RGB_R_SWIZ_A_R |
+					       R500_ALU_RGB_G_SWIZ_A_G |
+					       R500_ALU_RGB_B_SWIZ_A_B |
+					       R500_ALU_RGB_SEL_B_SRC1 |
+					       R500_ALU_RGB_R_SWIZ_B_R |
+					       R500_ALU_RGB_B_SWIZ_B_G |
+					       R500_ALU_RGB_G_SWIZ_B_B));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD |
+					       R500_ALPHA_ADDRD(2) |
+					       R500_ALPHA_SWIZ_A_0 |
+					       R500_ALPHA_SWIZ_B_0));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD |
+					       R500_ALU_RGBA_ADDRD(2) |
+					       R500_ALU_RGBA_SEL_C_SRC2 |
+					       R500_ALU_RGBA_R_SWIZ_R |
+					       R500_ALU_RGBA_G_SWIZ_G |
+					       R500_ALU_RGBA_B_SWIZ_B |
+					       R500_ALU_RGBA_ALPHA_SEL_C_SRC0 |
+					       R500_ALU_RGBA_A_SWIZ_0));
+
+	/* MAD result.rgb, const2.rgb, temp0.rgb, temp2.rgb */
 	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_OUT |
 					       R500_INST_TEX_SEM_WAIT |
 					       R500_INST_LAST |
@@ -3229,32 +3439,314 @@ FUNC_NAME(R500DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 					       R500_INST_ALPHA_OMASK |
 					       R500_INST_RGB_CLAMP |
 					       R500_INST_ALPHA_CLAMP));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(2) |
+					       R500_RGB_ADDR0_CONST |
+					       R500_RGB_ADDR1(0) |
+					       R500_RGB_ADDR2(2)));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(2) |
+					       R500_ALPHA_ADDR0_CONST |
+					       R500_ALPHA_ADDR1(0) |
+					       R500_ALPHA_ADDR2(2)));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
+					       R500_ALU_RGB_R_SWIZ_A_R |
+					       R500_ALU_RGB_G_SWIZ_A_G |
+					       R500_ALU_RGB_B_SWIZ_A_B |
+					       R500_ALU_RGB_SEL_B_SRC1 |
+					       R500_ALU_RGB_R_SWIZ_B_R |
+					       R500_ALU_RGB_B_SWIZ_B_G |
+					       R500_ALU_RGB_G_SWIZ_B_B));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD |
+					       R500_ALPHA_ADDRD(0) |
+					       R500_ALPHA_SWIZ_A_0 |
+					       R500_ALPHA_SWIZ_B_0));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD |
+					       R500_ALU_RGBA_ADDRD(0) |
+					       R500_ALU_RGBA_SEL_C_SRC2 |
+					       R500_ALU_RGBA_R_SWIZ_R |
+					       R500_ALU_RGBA_G_SWIZ_G |
+					       R500_ALU_RGBA_B_SWIZ_B |
+					       R500_ALU_RGBA_ALPHA_SEL_C_SRC0 |
+					       R500_ALU_RGBA_A_SWIZ_1));
+
+	/* Shader constants. */
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_CONST_INDEX(0));
+
+	/* constant 0: off, yco */
+	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, off[0]);
+	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, off[1]);
+	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, off[2]);
+	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, yco);
+	/* constant 1: uco */
+	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, uco[0]);
+	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, uco[1]);
+	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, uco[2]);
+	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, gamma);
+	/* constant 2: vco */
+	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, vco[0]);
+	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, vco[1]);
+	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, vco[2]);
+	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, 0.0);
+
+	FINISH_ACCEL();
+
+    } else {
+	/*
+	 * y' = y - .0625
+	 * u' = u - .5
+	 * v' = v - .5;
+	 *
+	 * r = 1.1643 * y' + 0.0     * u' + 1.5958  * v'
+	 * g = 1.1643 * y' - 0.39173 * u' - 0.81290 * v'
+	 * b = 1.1643 * y' + 2.017   * u' + 0.0     * v'
+	 *
+	 * DP3 might look like the straightforward solution
+	 * but we'd need to move the texture yuv values in
+	 * the same reg for this to work. Therefore use MADs.
+	 * Brightness just adds to the off constant.
+	 * Contrast is multiplication of luminance.
+	 * Saturation and hue change the u and v coeffs.
+	 * Default values (before adjustments - depend on colorspace):
+	 * yco = 1.1643
+	 * uco = 0, -0.39173, 2.017
+	 * vco = 1.5958, -0.8129, 0
+	 * off = -0.0625 * yco + -0.5 * uco[r] + -0.5 * vco[r],
+	 *       -0.0625 * yco + -0.5 * uco[g] + -0.5 * vco[g],
+	 *       -0.0625 * yco + -0.5 * uco[b] + -0.5 * vco[b],
+	 *
+	 * temp = MAD(yco, yuv.yyyy, off)
+	 * temp = MAD(uco, yuv.uuuu, temp)
+	 * result = MAD(vco, yuv.vvvv, temp)
+	 */
+	/* TODO: don't recalc consts always */
+	const float Loff = -0.0627;
+	const float Coff = -0.502;
+	float uvcosf, uvsinf;
+	float yco;
+	float uco[3], vco[3], off[3];
+	float bright, cont, gamma;
+	int ref = pPriv->transform_index;
+	Bool needgamma = FALSE;
+
+	cont = RTFContrast(pPriv->contrast);
+	bright = RTFBrightness(pPriv->brightness);
+	gamma = (float)pPriv->gamma / 1000.0;
+	uvcosf = RTFSaturation(pPriv->saturation) * cos(RTFHue(pPriv->hue));
+	uvsinf = RTFSaturation(pPriv->saturation) * sin(RTFHue(pPriv->hue));
+	/* overlay video also does pre-gamma contrast/sat adjust, should we? */
+
+	yco = trans[ref].RefLuma * cont;
+	uco[0] = -trans[ref].RefRCr * uvsinf;
+	uco[1] = trans[ref].RefGCb * uvcosf - trans[ref].RefGCr * uvsinf;
+	uco[2] = trans[ref].RefBCb * uvcosf;
+	vco[0] = trans[ref].RefRCr * uvcosf;
+	vco[1] = trans[ref].RefGCb * uvsinf + trans[ref].RefGCr * uvcosf;
+	vco[2] = trans[ref].RefBCb * uvsinf;
+	off[0] = Loff * yco + Coff * (uco[0] + vco[0]) + bright;
+	off[1] = Loff * yco + Coff * (uco[1] + vco[1]) + bright;
+	off[2] = Loff * yco + Coff * (uco[2] + vco[2]) + bright;
+
+	//XXX gamma
+
+	if (gamma != 1.0) {
+	    needgamma = TRUE;
+	    /* note: gamma correction is out = in ^ gamma;
+	       gpu can only do LG2/EX2 therefore we transform into
+	       in ^ gamma = 2 ^ (log2(in) * gamma).
+	       Lots of scalar ops, unfortunately (better solution?) -
+	       without gamma that's 3 inst, with gamma it's 10...
+	       could use different gamma factors per channel,
+	       if that's of any use. */
+	}
+
+	BEGIN_ACCEL(44);
+	/* 2 components: 2 for tex0/1/2 */
+	OUT_ACCEL_REG(R300_RS_COUNT,
+		      ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) |
+		       R300_RS_COUNT_HIRES_EN));
+
+	/* R300_INST_COUNT_RS - highest RS instruction used */
+	OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0) | R300_TX_OFFSET_RS(6));
+
+	/* Pixel stack frame size. */
+	OUT_ACCEL_REG(R300_US_PIXSIZE, 1); /* highest temp used */
+
+	/* FP length. */
+	OUT_ACCEL_REG(R500_US_CODE_ADDR, (R500_US_CODE_START_ADDR(0) |
+					  R500_US_CODE_END_ADDR(3)));
+	OUT_ACCEL_REG(R500_US_CODE_RANGE, (R500_US_CODE_RANGE_ADDR(0) |
+					   R500_US_CODE_RANGE_SIZE(3)));
+
+	/* Prepare for FP emission. */
+	OUT_ACCEL_REG(R500_US_CODE_OFFSET, 0);
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_INST_INDEX(0));
+
+	/* tex inst */
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
+					       R500_INST_TEX_SEM_WAIT |
+					       R500_INST_RGB_WMASK_R |
+					       R500_INST_RGB_WMASK_G |
+					       R500_INST_RGB_WMASK_B |
+					       R500_INST_ALPHA_WMASK |
+					       R500_INST_RGB_CLAMP |
+					       R500_INST_ALPHA_CLAMP));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
+					       R500_TEX_INST_LD |
+					       R500_TEX_SEM_ACQUIRE |
+					       R500_TEX_IGNORE_UNCOVERED));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
+					       R500_TEX_SRC_S_SWIZ_R |
+					       R500_TEX_SRC_T_SWIZ_G |
+					       R500_TEX_DST_ADDR(0) |
+					       R500_TEX_DST_R_SWIZ_R |
+					       R500_TEX_DST_G_SWIZ_G |
+					       R500_TEX_DST_B_SWIZ_B |
+					       R500_TEX_DST_A_SWIZ_A));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_DX_ADDR(0) |
+					       R500_DX_S_SWIZ_R |
+					       R500_DX_T_SWIZ_R |
+					       R500_DX_R_SWIZ_R |
+					       R500_DX_Q_SWIZ_R |
+					       R500_DY_ADDR(0) |
+					       R500_DY_S_SWIZ_R |
+					       R500_DY_T_SWIZ_R |
+					       R500_DY_R_SWIZ_R |
+					       R500_DY_Q_SWIZ_R));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
+
+	/* ALU inst */
+	/* MAD temp1.rgb, const0.aaa, temp0.ggg, const0.rgb */
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
+					       R500_INST_TEX_SEM_WAIT |
+					       R500_INST_RGB_WMASK_R |
+					       R500_INST_RGB_WMASK_G |
+					       R500_INST_RGB_WMASK_B |
+					       R500_INST_ALPHA_WMASK));
 	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
+					       R500_RGB_ADDR0_CONST |
 					       R500_RGB_ADDR1(0) |
-					       R500_RGB_ADDR1_CONST |
 					       R500_RGB_ADDR2(0) |
 					       R500_RGB_ADDR2_CONST));
 	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
+					       R500_ALPHA_ADDR0_CONST |
 					       R500_ALPHA_ADDR1(0) |
-					       R500_ALPHA_ADDR1_CONST |
 					       R500_ALPHA_ADDR2(0) |
 					       R500_ALPHA_ADDR2_CONST));
 	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
+					       R500_ALU_RGB_R_SWIZ_A_A |
+					       R500_ALU_RGB_G_SWIZ_A_A |
+					       R500_ALU_RGB_B_SWIZ_A_A |
+					       R500_ALU_RGB_SEL_B_SRC1 |
+					       R500_ALU_RGB_R_SWIZ_B_G |
+					       R500_ALU_RGB_B_SWIZ_B_G |
+					       R500_ALU_RGB_G_SWIZ_B_G));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD |
+					       R500_ALPHA_ADDRD(1) |
+					       R500_ALPHA_SWIZ_A_0 |
+					       R500_ALPHA_SWIZ_B_0));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD |
+					       R500_ALU_RGBA_ADDRD(1) |
+					       R500_ALU_RGBA_SEL_C_SRC0 |
+					       R500_ALU_RGBA_R_SWIZ_R |
+					       R500_ALU_RGBA_G_SWIZ_G |
+					       R500_ALU_RGBA_B_SWIZ_B |
+					       R500_ALU_RGBA_ALPHA_SEL_C_SRC0 |
+					       R500_ALU_RGBA_A_SWIZ_0));
+
+	/* MAD temp1.rgb, const1.rgb, temp0.bbb, temp1.rgb */
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
+					       R500_INST_TEX_SEM_WAIT |
+					       R500_INST_RGB_WMASK_R |
+					       R500_INST_RGB_WMASK_G |
+					       R500_INST_RGB_WMASK_B |
+					       R500_INST_ALPHA_WMASK));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(1) |
+					       R500_RGB_ADDR0_CONST |
+					       R500_RGB_ADDR1(0) |
+					       R500_RGB_ADDR2(1)));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(1) |
+					       R500_ALPHA_ADDR0_CONST |
+					       R500_ALPHA_ADDR1(0) |
+					       R500_ALPHA_ADDR2(1)));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
 					       R500_ALU_RGB_R_SWIZ_A_R |
 					       R500_ALU_RGB_G_SWIZ_A_G |
 					       R500_ALU_RGB_B_SWIZ_A_B |
-					       R500_ALU_RGB_SEL_B_SRC0 |
-					       R500_ALU_RGB_R_SWIZ_B_1 |
-					       R500_ALU_RGB_B_SWIZ_B_1 |
-					       R500_ALU_RGB_G_SWIZ_B_1));
+					       R500_ALU_RGB_SEL_B_SRC1 |
+					       R500_ALU_RGB_R_SWIZ_B_B |
+					       R500_ALU_RGB_B_SWIZ_B_B |
+					       R500_ALU_RGB_G_SWIZ_B_B));
 	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD |
-					       R500_ALPHA_SWIZ_A_A |
-					       R500_ALPHA_SWIZ_B_1));
+					       R500_ALPHA_ADDRD(1) |
+					       R500_ALPHA_SWIZ_A_0 |
+					       R500_ALPHA_SWIZ_B_0));
 	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD |
-					       R500_ALU_RGBA_R_SWIZ_0 |
-					       R500_ALU_RGBA_G_SWIZ_0 |
-					       R500_ALU_RGBA_B_SWIZ_0 |
+					       R500_ALU_RGBA_ADDRD(1) |
+					       R500_ALU_RGBA_SEL_C_SRC2 |
+					       R500_ALU_RGBA_R_SWIZ_R |
+					       R500_ALU_RGBA_G_SWIZ_G |
+					       R500_ALU_RGBA_B_SWIZ_B |
+					       R500_ALU_RGBA_ALPHA_SEL_C_SRC0 |
 					       R500_ALU_RGBA_A_SWIZ_0));
+
+	/* MAD result.rgb, const2.rgb, temp0.rrr, temp1.rgb */
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_OUT |
+					       R500_INST_TEX_SEM_WAIT |
+					       R500_INST_LAST |
+					       R500_INST_RGB_OMASK_R |
+					       R500_INST_RGB_OMASK_G |
+					       R500_INST_RGB_OMASK_B |
+					       R500_INST_ALPHA_OMASK |
+					       R500_INST_RGB_CLAMP |
+					       R500_INST_ALPHA_CLAMP));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(2) |
+					       R500_RGB_ADDR0_CONST |
+					       R500_RGB_ADDR1(0) |
+					       R500_RGB_ADDR2(1)));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(1) |
+					       R500_ALPHA_ADDR0_CONST |
+					       R500_ALPHA_ADDR1(0) |
+					       R500_ALPHA_ADDR2(1)));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
+					       R500_ALU_RGB_R_SWIZ_A_R |
+					       R500_ALU_RGB_G_SWIZ_A_G |
+					       R500_ALU_RGB_B_SWIZ_A_B |
+					       R500_ALU_RGB_SEL_B_SRC1 |
+					       R500_ALU_RGB_R_SWIZ_B_R |
+					       R500_ALU_RGB_B_SWIZ_B_R |
+					       R500_ALU_RGB_G_SWIZ_B_R));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD |
+					       R500_ALPHA_ADDRD(1) |
+					       R500_ALPHA_SWIZ_A_0 |
+					       R500_ALPHA_SWIZ_B_0));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD |
+					       R500_ALU_RGBA_ADDRD(1) |
+					       R500_ALU_RGBA_SEL_C_SRC2 |
+					       R500_ALU_RGBA_R_SWIZ_R |
+					       R500_ALU_RGBA_G_SWIZ_G |
+					       R500_ALU_RGBA_B_SWIZ_B |
+					       R500_ALU_RGBA_ALPHA_SEL_C_SRC0 |
+					       R500_ALU_RGBA_A_SWIZ_1));
+
+	/* Shader constants. */
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_CONST_INDEX(0));
+
+	/* constant 0: off, yco */
+	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, off[0]);
+	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, off[1]);
+	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, off[2]);
+	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, yco);
+	/* constant 1: uco */
+	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, uco[0]);
+	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, uco[1]);
+	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, uco[2]);
+	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, gamma);
+	/* constant 2: vco */
+	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, vco[0]);
+	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, vco[1]);
+	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, vco[2]);
+	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, 0.0);
+
 	FINISH_ACCEL();
     }
 
commit 832efc7b90f5eb2da99512fcb902ab4838d2dcd1
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Tue Apr 14 17:56:49 2009 -0400

    R3xx/R4xx: Implement shader-based csc for packed formats

diff --git a/src/radeon_textured_videofuncs.c b/src/radeon_textured_videofuncs.c
index 4ce34e3..41ed7fa 100644
--- a/src/radeon_textured_videofuncs.c
+++ b/src/radeon_textured_videofuncs.c
@@ -1063,7 +1063,8 @@ FUNC_NAME(R300DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 	else
 	    txformat1 = R300_TX_FORMAT_VYUY422;
 
-	txformat1 |= R300_TX_FORMAT_YUV_TO_RGB_CLAMP;
+	if (pPriv->bicubic_enabled)
+	    txformat1 |= R300_TX_FORMAT_YUV_TO_RGB_CLAMP;
 
 	/* pitch is in pixels */
 	txpitch = pPriv->src_pitch / 2;
@@ -1697,7 +1698,7 @@ FUNC_NAME(R300DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 					   R300_TEX_INST(R300_TEX_INST_LD)));
 
 	/* ALU inst */
-	/* MAD temp0, const0.a, temp0, const0.rgb */
+	/* MAD temp0.rgb, const0.aaa, temp0.rgb, const0.rgb */
 	OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(0), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(0)) |
 						R300_ALU_RGB_ADDR1(0) |
 						R300_ALU_RGB_ADDR2(0) |
@@ -1722,7 +1723,7 @@ FUNC_NAME(R300DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 						  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
 						  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
 
-	/* MAD const1, temp1, temp0 */
+	/* MAD temp0.rgb, const1.rgb, temp1.rgb, temp0.rgb */
 	OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(1), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(1)) |
 						R300_ALU_RGB_ADDR1(1) |
 						R300_ALU_RGB_ADDR2(0) |
@@ -1744,7 +1745,7 @@ FUNC_NAME(R300DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 						  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
 						  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
 
-	/* MAD result, const2, temp2, temp0 */
+	/* MAD result.rgb, const2.rgb, temp2.rgb, temp0.rgb */
 	OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(2), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(2)) |
 						R300_ALU_RGB_ADDR1(2) |
 						R300_ALU_RGB_ADDR2(0) |
@@ -1901,27 +1902,93 @@ FUNC_NAME(R300DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 	FINISH_ACCEL();
 
     } else {
-	BEGIN_ACCEL(11);
-	/* 2 components: 2 for tex0 */
+	/*
+	 * y' = y - .0625
+	 * u' = u - .5
+	 * v' = v - .5
+	 *
+	 * r = 1.1643 * y' + 0.0     * u' + 1.5958  * v'
+	 * g = 1.1643 * y' - 0.39173 * u' - 0.81290 * v'
+	 * b = 1.1643 * y' + 2.017   * u' + 0.0     * v'
+	 *
+	 * DP3 might look like the straightforward solution
+	 * but we'd need to move the texture yuv values in
+	 * the same reg for this to work. Therefore use MADs.
+	 * Brightness just adds to the off constant.
+	 * Contrast is multiplication of luminance.
+	 * Saturation and hue change the u and v coeffs.
+	 * Default values (before adjustments - depend on colorspace):
+	 * yco = 1.1643
+	 * uco = 0, -0.39173, 2.017
+	 * vco = 1.5958, -0.8129, 0
+	 * off = -0.0625 * yco + -0.5 * uco[r] + -0.5 * vco[r],
+	 *       -0.0625 * yco + -0.5 * uco[g] + -0.5 * vco[g],
+	 *       -0.0625 * yco + -0.5 * uco[b] + -0.5 * vco[b],
+	 *
+	 * temp = MAD(yco, yuv.yyyy, off)
+	 * temp = MAD(uco, yuv.uuuu, temp)
+	 * result = MAD(vco, yuv.vvvv, temp)
+	 */
+	/* TODO: don't recalc consts always */
+	const float Loff = -0.0627;
+	const float Coff = -0.502;
+	float uvcosf, uvsinf;
+	float yco;
+	float uco[3], vco[3], off[3];
+	float bright, cont, gamma;
+	int ref = pPriv->transform_index;
+	Bool needgamma = FALSE;
+
+	cont = RTFContrast(pPriv->contrast);
+	bright = RTFBrightness(pPriv->brightness);
+	gamma = (float)pPriv->gamma / 1000.0;
+	uvcosf = RTFSaturation(pPriv->saturation) * cos(RTFHue(pPriv->hue));
+	uvsinf = RTFSaturation(pPriv->saturation) * sin(RTFHue(pPriv->hue));
+	/* overlay video also does pre-gamma contrast/sat adjust, should we? */
+
+	yco = trans[ref].RefLuma * cont;
+	uco[0] = -trans[ref].RefRCr * uvsinf;
+	uco[1] = trans[ref].RefGCb * uvcosf - trans[ref].RefGCr * uvsinf;
+	uco[2] = trans[ref].RefBCb * uvcosf;
+	vco[0] = trans[ref].RefRCr * uvcosf;
+	vco[1] = trans[ref].RefGCb * uvsinf + trans[ref].RefGCr * uvcosf;
+	vco[2] = trans[ref].RefBCb * uvsinf;
+	off[0] = Loff * yco + Coff * (uco[0] + vco[0]) + bright;
+	off[1] = Loff * yco + Coff * (uco[1] + vco[1]) + bright;
+	off[2] = Loff * yco + Coff * (uco[2] + vco[2]) + bright;
+
+	if (gamma != 1.0) {
+	    needgamma = TRUE;
+	    /* note: gamma correction is out = in ^ gamma;
+	       gpu can only do LG2/EX2 therefore we transform into
+	       in ^ gamma = 2 ^ (log2(in) * gamma).
+	       Lots of scalar ops, unfortunately (better solution?) -
+	       without gamma that's 3 inst, with gamma it's 10...
+	       could use different gamma factors per channel,
+	       if that's of any use. */
+	}
+
+	BEGIN_ACCEL(needgamma ? 28 + 33 : 33);
+	/* 2 components */
 	OUT_ACCEL_REG(R300_RS_COUNT,
 		      ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) |
 		       R300_RS_COUNT_HIRES_EN));
 	/* R300_INST_COUNT_RS - highest RS instruction used */
 	OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0) | R300_TX_OFFSET_RS(6));
 
-	OUT_ACCEL_REG(R300_US_PIXSIZE, 0); /* highest temp used */
+	OUT_ACCEL_REG(R300_US_PIXSIZE, 1); /* highest temp used */
 
 	/* Indirection levels */
 	OUT_ACCEL_REG(R300_US_CONFIG, ((0 << R300_NLEVEL_SHIFT) |
 				       R300_FIRST_TEX));
 
 	OUT_ACCEL_REG(R300_US_CODE_OFFSET, (R300_ALU_CODE_OFFSET(0) |
-					    R300_ALU_CODE_SIZE(1) |
+					    R300_ALU_CODE_SIZE(needgamma ? 7 + 3 : 3) |
 					    R300_TEX_CODE_OFFSET(0) |
 					    R300_TEX_CODE_SIZE(1)));
 
 	OUT_ACCEL_REG(R300_US_CODE_ADDR_3, (R300_ALU_START(0) |
-					    R300_ALU_SIZE(0) |
+					    R300_ALU_SIZE(needgamma ? 7 + 2 : 2) |
 					    R300_TEX_START(0) |
 					    R300_TEX_SIZE(0) |
 					    R300_RGBA_OUT));
@@ -1933,41 +2000,207 @@ FUNC_NAME(R300DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 					   R300_TEX_INST(R300_TEX_INST_LD)));
 
 	/* ALU inst */
-	/* RGB */
-	OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR_0, (R300_ALU_RGB_ADDR0(0) |
-					       R300_ALU_RGB_ADDR1(0) |
-					       R300_ALU_RGB_ADDR2(0) |
-					       R300_ALU_RGB_ADDRD(0) |
-					       R300_ALU_RGB_OMASK((R300_ALU_RGB_MASK_R |
-								   R300_ALU_RGB_MASK_G |
-								   R300_ALU_RGB_MASK_B)) |
-					       R300_ALU_RGB_TARGET_A));
-	OUT_ACCEL_REG(R300_US_ALU_RGB_INST_0, (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
-					       R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
-					       R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
-					       R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
-					       R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0) |
-					       R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
-					       R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
-					       R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE) |
-					       R300_ALU_RGB_CLAMP));
-	/* Alpha */
-	OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR_0, (R300_ALU_ALPHA_ADDR0(0) |
-						 R300_ALU_ALPHA_ADDR1(0) |
-						 R300_ALU_ALPHA_ADDR2(0) |
-						 R300_ALU_ALPHA_ADDRD(0) |
-						 R300_ALU_ALPHA_OMASK(R300_ALU_ALPHA_MASK_A) |
-						 R300_ALU_ALPHA_TARGET_A |
-						 R300_ALU_ALPHA_OMASK_W(R300_ALU_ALPHA_MASK_NONE)));
-	OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST_0, (R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_A) |
-						 R300_ALU_ALPHA_MOD_A(R300_ALU_ALPHA_MOD_NOP) |
-						 R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_1_0) |
-						 R300_ALU_ALPHA_MOD_B(R300_ALU_ALPHA_MOD_NOP) |
-						 R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0) |
-						 R300_ALU_ALPHA_MOD_C(R300_ALU_ALPHA_MOD_NOP) |
-						 R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
-						 R300_ALU_ALPHA_OMOD(R300_ALU_ALPHA_OMOD_NONE) |
-						 R300_ALU_ALPHA_CLAMP));
+	/* MAD temp1.rgb, const0.aaa, temp0.ggg, const0.rgb */
+	OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(0), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(0)) |
+						R300_ALU_RGB_ADDR1(0) |
+						R300_ALU_RGB_ADDR2(0) |
+						R300_ALU_RGB_ADDRD(1) |
+						R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
+	OUT_ACCEL_REG(R300_US_ALU_RGB_INST(0), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_AAA) |
+						R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
+						R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_GGG) |
+						R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
+						R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) |
+						R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
+						R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
+						R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
+	/* alpha nop, but need to set up alpha source for rgb usage */
+	OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(0), (R300_ALU_ALPHA_ADDR0(R300_ALU_ALPHA_CONST(0)) |
+						  R300_ALU_ALPHA_ADDR1(0) |
+						  R300_ALU_ALPHA_ADDR2(0) |
+						  R300_ALU_ALPHA_ADDRD(0) |
+						  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+	OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(0), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
+						  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
+						  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
+						  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+
+	/* MAD temp1.rgb, const1.rgb, temp0.bbb, temp1.rgb */
+	OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(1), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(1)) |
+						R300_ALU_RGB_ADDR1(0) |
+						R300_ALU_RGB_ADDR2(1) |
+						R300_ALU_RGB_ADDRD(1) |
+						R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
+	OUT_ACCEL_REG(R300_US_ALU_RGB_INST(1), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
+						R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
+						R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_BBB) |
+						R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
+						R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB) |
+						R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
+						R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
+						R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
+	/* alpha nop */
+	OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(1), (R300_ALU_ALPHA_ADDRD(0) |
+						  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+	OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(1), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
+						  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
+						  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
+						  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+
+	/* MAD result.rgb, const2.rgb, temp0.rrr, temp1.rgb */
+	OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(2), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(2)) |
+						R300_ALU_RGB_ADDR1(0) |
+						R300_ALU_RGB_ADDR2(1) |
+						R300_ALU_RGB_ADDRD(0) |
+						R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB) |
+						(needgamma ? 0 : R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_RGB))));
+	OUT_ACCEL_REG(R300_US_ALU_RGB_INST(2), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
+						R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
+						R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RRR) |
+						R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
+						R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB) |
+						R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
+						R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
+						R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE) |
+						R300_ALU_RGB_CLAMP));
+	/* write alpha 1 */
+	OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(2), (R300_ALU_ALPHA_ADDRD(0) |
+						  R300_ALU_ALPHA_OMASK(R300_ALU_ALPHA_MASK_A) |
+						  R300_ALU_ALPHA_TARGET_A));
+	OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(2), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
+						  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
+						  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
+						  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_1_0)));
+
+	if (needgamma) {
+	    /* rgb temp0.r = op_sop, set up src0 reg */
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(3), (R300_ALU_RGB_ADDR0(0) |
+						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R)));
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(3),
+			  R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
+			  R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
+	    /* alpha lg2 temp0, temp0.r */
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(3), (R300_ALU_ALPHA_ADDRD(0) |
+						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(3), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_LN2) |
+						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_R) |
+						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
+						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+
+	    /* rgb temp0.g = op_sop, set up src0 reg */
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(4), (R300_ALU_RGB_ADDR0(0) |
+						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_G)));
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(4),
+			  R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
+			  R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
+	    /* alpha lg2 temp0, temp0.g */
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(4), (R300_ALU_ALPHA_ADDRD(0) |
+						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(4), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_LN2) |
+						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_G) |
+						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
+						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+
+	    /* rgb temp0.b = op_sop, set up src0 reg */
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(5), (R300_ALU_RGB_ADDR0(0) |
+						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_B)));
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(5),
+			  R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
+			  R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
+	    /* alpha lg2 temp0, temp0.b */
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(5), (R300_ALU_ALPHA_ADDRD(0) |
+						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(5), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_LN2) |
+						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_B) |
+						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
+						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+
+	    /* MUL temp0.rgb, temp0.rgb, const1.aaa (gamma), done as MAD with C = 0 */
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(6), (R300_ALU_RGB_ADDR0(0) |
+						    R300_ALU_RGB_ADDR1(0) |
+						    R300_ALU_RGB_ADDR2(0) |
+						    R300_ALU_RGB_ADDRD(0) |
+						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(6), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
+						    R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
+						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC0_AAA) |
+						    R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
+						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0) |
+						    R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
+						    R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
+						    R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
+	    /* alpha nop, but set up const1 */
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(6), (R300_ALU_ALPHA_ADDRD(0) |
+						      R300_ALU_ALPHA_ADDR0(R300_ALU_ALPHA_CONST(1)) |
+						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(6), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
+						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
+						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
+						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+
+	    /* rgb out0.r = op_sop, set up src0 reg */
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(7), (R300_ALU_RGB_ADDR0(0) |
+						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R) |
+						    R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_R)));
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(7),
+			  R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
+			  R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
+	    /* alpha ex2 temp0, temp0.r */
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(7), (R300_ALU_ALPHA_ADDRD(0) |
+						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(7), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_EX2) |
+						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_R) |
+						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
+						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+
+	    /* rgb out0.g = op_sop, set up src0 reg */
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(8), (R300_ALU_RGB_ADDR0(0) |
+						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_G) |
+						    R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_G)));
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(8),
+			  R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
+			  R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
+	    /* alpha ex2 temp0, temp0.g */
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(8), (R300_ALU_ALPHA_ADDRD(0) |
+						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(8), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_EX2) |
+						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_G) |
+						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
+						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+
+	    /* rgb out0.b = op_sop, set up src0 reg */
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(9), (R300_ALU_RGB_ADDR0(0) |
+						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_B) |
+						    R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_B)));
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(9),
+			  R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
+			  R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
+	    /* alpha ex2 temp0, temp0.b */
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(9), (R300_ALU_ALPHA_ADDRD(0) |
+						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(9), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_EX2) |
+						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_B) |
+						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
+						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+	}
+
+	/* Shader constants. */
+	/* constant 0: off, yco */
+	OUT_ACCEL_REG(R300_US_ALU_CONST_R(0), F_TO_24(off[0]));
+	OUT_ACCEL_REG(R300_US_ALU_CONST_G(0), F_TO_24(off[1]));
+	OUT_ACCEL_REG(R300_US_ALU_CONST_B(0), F_TO_24(off[2]));
+	OUT_ACCEL_REG(R300_US_ALU_CONST_A(0), F_TO_24(yco));
+	/* constant 1: uco */
+	OUT_ACCEL_REG(R300_US_ALU_CONST_R(1), F_TO_24(uco[0]));
+	OUT_ACCEL_REG(R300_US_ALU_CONST_G(1), F_TO_24(uco[1]));
+	OUT_ACCEL_REG(R300_US_ALU_CONST_B(1), F_TO_24(uco[2]));
+	OUT_ACCEL_REG(R300_US_ALU_CONST_A(1), F_TO_24(gamma));
+	/* constant 2: vco */
+	OUT_ACCEL_REG(R300_US_ALU_CONST_R(2), F_TO_24(vco[0]));
+	OUT_ACCEL_REG(R300_US_ALU_CONST_G(2), F_TO_24(vco[1]));
+	OUT_ACCEL_REG(R300_US_ALU_CONST_B(2), F_TO_24(vco[2]));
+	OUT_ACCEL_REG(R300_US_ALU_CONST_A(2), F_TO_24(0.0));
+
 	FINISH_ACCEL();
     }
 
commit 32625118c27041265d25811c00d25ab7e82fb340
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Tue Apr 14 16:38:40 2009 -0400

    tex vid: fix attribute setup typo for XV_COLORSPACE

diff --git a/src/radeon_textured_video.c b/src/radeon_textured_video.c
index 542fc2a..3626e8d 100644
--- a/src/radeon_textured_video.c
+++ b/src/radeon_textured_video.c
@@ -677,7 +677,7 @@ static XF86AttributeRec Attributes_r200[NUM_ATTRIBUTES_R200+1] =
     {XvSettable | XvGettable, -1000, 1000, "XV_CONTRAST"},
     {XvSettable | XvGettable, -1000, 1000, "XV_SATURATION"},
     {XvSettable | XvGettable, -1000, 1000, "XV_HUE"},
-    {XvSettable | XvGettable, 100, 10000, "XV_COLORSPACE"},
+    {XvSettable | XvGettable, 0, 1, "XV_COLORSPACE"},
     {0, 0, 0, NULL}
 };
 
@@ -715,7 +715,7 @@ static XF86AttributeRec Attributes_r600[NUM_ATTRIBUTES_R600+1] =
     {XvSettable | XvGettable, -1000, 1000, "XV_CONTRAST"},
     {XvSettable | XvGettable, -1000, 1000, "XV_SATURATION"},
     {XvSettable | XvGettable, -1000, 1000, "XV_HUE"},
-    {XvSettable | XvGettable, 100, 10000, "XV_COLORSPACE"},
+    {XvSettable | XvGettable, 0, 1, "XV_COLORSPACE"},
     {0, 0, 0, NULL}
 };
 
commit adf0912006b4f1597784dbfcc563d5c6d1c5667d
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Tue Apr 14 16:16:33 2009 -0400

    R6xx/R7xx: implement Xv attributes
    
    - brightness, contrast, hue, etc.
    - TODO: implement gamma

diff --git a/src/r600_shader.c b/src/r600_shader.c
index addba36..0a820cf 100644
--- a/src/r600_shader.c
+++ b/src/r600_shader.c
@@ -560,41 +560,12 @@ int R600_xv_vs(RADEONChipFamily ChipSet, uint32_t* shader)
     return i;
 }
 
-/*
- * ; xv ps planar
- * 00 TEX: ADDR(20) CNT(3) NO_BARRIER 
- *       0  SAMPLE R1.x__1, R0.xy01, t0, s0
- *       1  SAMPLE R1.__x_, R0.xy01, t1, s1
- *       2  SAMPLE R1._x__, R0.xy01, t2, s2
- * 01 TEX: ADDR(28) CNT(2) NO_BARRIER 
- *       0  SAMPLE R1.x__1, R0.xy01, t0, s0
- *       1  SAMPLE R1._xy_, R0.xy01, t1, s1
- * 02 ALU: ADDR(4) CNT(16) 
- *       3  x: MULADD      R1.x,  R1.x,  C3.x,  C3.y      CLAMP 
- *          y: MULADD      R1.y,  R1.y,  C3.z,  C3.w      
- *          z: MULADD      R1.z,  R1.z,  C3.z,  C3.w      
- *          w: MOV         R1.w,  0.0f 
- *       4  x: DOT4        R2.x,  R1.x,  C0.x      CLAMP VEC_102 
- *          y: DOT4        ____,  R1.y,  C0.y      CLAMP VEC_102 
- *          z: DOT4        ____,  R1.z,  C0.z      CLAMP VEC_102 
- *          w: DOT4        ____,  R1.w,  C0.w      CLAMP VEC_021 
- *       5  x: DOT4        ____,  R1.x,  C1.x      CLAMP VEC_102 
- *          y: DOT4        R2.y,  R1.y,  C1.y      CLAMP VEC_102 
- *          z: DOT4        ____,  R1.z,  C1.z      CLAMP VEC_102 
- *          w: DOT4        ____,  R1.w,  C1.w      CLAMP VEC_021 
- *       6  x: DOT4        ____,  R1.x,  C2.x      CLAMP VEC_102 
- *          y: DOT4        ____,  R1.y,  C2.y      CLAMP VEC_102 
- *          z: DOT4        R2.z,  R1.z,  C2.z      CLAMP VEC_102 
- *          w: DOT4        ____,  R1.w,  C2.w      CLAMP VEC_021 
- * 03 EXP_DONE: PIX0, R2
- * END_OF_PROGRAM
- */
 int R600_xv_ps(RADEONChipFamily ChipSet, uint32_t* shader)
 {
     int i = 0;
 
     /* 0 */
-    shader[i++] = CF_DWORD0(ADDR(20));
+    shader[i++] = CF_DWORD0(ADDR(16));
     shader[i++] = CF_DWORD1(POP_COUNT(0),
                             CF_CONST(0),
                             COND(SQ_CF_COND_BOOL),
@@ -606,7 +577,7 @@ int R600_xv_ps(RADEONChipFamily ChipSet, uint32_t* shader)
                             WHOLE_QUAD_MODE(0),
                             BARRIER(0));
     /* 1 */
-    shader[i++] = CF_DWORD0(ADDR(28));
+    shader[i++] = CF_DWORD0(ADDR(24));
     shader[i++] = CF_DWORD1(POP_COUNT(0),
                             CF_CONST(0),
                             COND(SQ_CF_COND_NOT_BOOL),
@@ -625,7 +596,7 @@ int R600_xv_ps(RADEONChipFamily ChipSet, uint32_t* shader)
     shader[i++] = CF_ALU_DWORD1(KCACHE_MODE1(SQ_CF_KCACHE_NOP),
                                 KCACHE_ADDR0(0),
                                 KCACHE_ADDR1(0),
-                                I_COUNT(16),
+                                I_COUNT(12),
                                 USES_WATERFALL(0),
                                 CF_INST(SQ_CF_INST_ALU),
                                 WHOLE_QUAD_MODE(0),
@@ -648,73 +619,74 @@ int R600_xv_ps(RADEONChipFamily ChipSet, uint32_t* shader)
                                                CF_INST(SQ_CF_INST_EXPORT_DONE),
                                                WHOLE_QUAD_MODE(0),
                                                BARRIER(1));
-    /* 4 */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(1),
+    /* 4,5,6,7 */
+    /* r2.x = MAD(c0.w, r1.x, c0.x) */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(256),
                              SRC0_REL(ABSOLUTE),
-                             SRC0_ELEM(ELEM_X),
+                             SRC0_ELEM(ELEM_W),
                              SRC0_NEG(0),
-                             SRC1_SEL(259),
+                             SRC1_SEL(1),
                              SRC1_REL(ABSOLUTE),
                              SRC1_ELEM(ELEM_X),
                              SRC1_NEG(0),
                              INDEX_MODE(SQ_INDEX_LOOP),
                              PRED_SEL(SQ_PRED_SEL_OFF),
                              LAST(0));
-    shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(259),
+    shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(256),
                                  SRC2_REL(ABSOLUTE),
-                                 SRC2_ELEM(ELEM_Y),
+                                 SRC2_ELEM(ELEM_X),
                                  SRC2_NEG(0),
                                  ALU_INST(SQ_OP3_INST_MULADD),
                                  BANK_SWIZZLE(SQ_ALU_VEC_012),
-                                 DST_GPR(1),
+                                 DST_GPR(2),
                                  DST_REL(ABSOLUTE),
                                  DST_ELEM(ELEM_X),
-                                 CLAMP(1));
-    /* 5 */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(1),
+                                 CLAMP(0));
+    /* r2.y = MAD(c0.w, r1.x, c0.y) */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(256),
                              SRC0_REL(ABSOLUTE),
-                             SRC0_ELEM(ELEM_Y),
+                             SRC0_ELEM(ELEM_W),
                              SRC0_NEG(0),
-                             SRC1_SEL(259),
+                             SRC1_SEL(1),
                              SRC1_REL(ABSOLUTE),
-                             SRC1_ELEM(ELEM_Z),
+                             SRC1_ELEM(ELEM_X),
                              SRC1_NEG(0),
                              INDEX_MODE(SQ_INDEX_LOOP),
                              PRED_SEL(SQ_PRED_SEL_OFF),
                              LAST(0));
-    shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(259),
+    shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(256),
                                  SRC2_REL(ABSOLUTE),
-                                 SRC2_ELEM(ELEM_W),
+                                 SRC2_ELEM(ELEM_Y),
                                  SRC2_NEG(0),
                                  ALU_INST(SQ_OP3_INST_MULADD),
                                  BANK_SWIZZLE(SQ_ALU_VEC_012),
-                                 DST_GPR(1),
+                                 DST_GPR(2),
                                  DST_REL(ABSOLUTE),
                                  DST_ELEM(ELEM_Y),
                                  CLAMP(0));
-    /* 6 */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(1),
+    /* r2.z = MAD(c0.w, r1.x, c0.z) */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(256),
                              SRC0_REL(ABSOLUTE),
-                             SRC0_ELEM(ELEM_Z),
+                             SRC0_ELEM(ELEM_W),
                              SRC0_NEG(0),
-                             SRC1_SEL(259),
+                             SRC1_SEL(1),
                              SRC1_REL(ABSOLUTE),
-                             SRC1_ELEM(ELEM_Z),
+                             SRC1_ELEM(ELEM_X),
                              SRC1_NEG(0),
                              INDEX_MODE(SQ_INDEX_LOOP),
                              PRED_SEL(SQ_PRED_SEL_OFF),
                              LAST(0));
-    shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(259),
+    shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(256),
                                  SRC2_REL(ABSOLUTE),
-                                 SRC2_ELEM(ELEM_W),
+                                 SRC2_ELEM(ELEM_Z),
                                  SRC2_NEG(0),
                                  ALU_INST(SQ_OP3_INST_MULADD),
                                  BANK_SWIZZLE(SQ_ALU_VEC_012),
-                                 DST_GPR(1),
+                                 DST_GPR(2),
                                  DST_REL(ABSOLUTE),
                                  DST_ELEM(ELEM_Z),
                                  CLAMP(0));
-    /* 7 */
+    /* r2.w = MAD(0, 0, 1) */
     shader[i++] = ALU_DWORD0(SRC0_SEL(SQ_ALU_SRC_0),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_X),
@@ -726,334 +698,198 @@ int R600_xv_ps(RADEONChipFamily ChipSet, uint32_t* shader)
                              INDEX_MODE(SQ_INDEX_LOOP),
                              PRED_SEL(SQ_PRED_SEL_OFF),
                              LAST(1));
-    shader[i++] = ALU_DWORD1_OP2(ChipSet,
-                                 SRC0_ABS(0),
-                                 SRC1_ABS(0),
-                                 UPDATE_EXECUTE_MASK(0),
-                                 UPDATE_PRED(0),
-                                 WRITE_MASK(1),
-                                 FOG_MERGE(0),
-                                 OMOD(SQ_ALU_OMOD_OFF),
-                                 ALU_INST(SQ_OP2_INST_MOV),
+    shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(SQ_ALU_SRC_1),
+                                 SRC2_REL(ABSOLUTE),
+                                 SRC2_ELEM(ELEM_X),
+                                 SRC2_NEG(0),
+                                 ALU_INST(SQ_OP3_INST_MULADD),
                                  BANK_SWIZZLE(SQ_ALU_VEC_012),
-                                 DST_GPR(1),
+                                 DST_GPR(2),
                                  DST_REL(ABSOLUTE),
                                  DST_ELEM(ELEM_W),
                                  CLAMP(0));
-    /* 8 */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(1),
+
+    /* 8,9,10,11 */
+    /* r2.x = MAD(c1.x, r1.y, pv.x) */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(257),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_X),
                              SRC0_NEG(0),
-                             SRC1_SEL(256),
+                             SRC1_SEL(1),
                              SRC1_REL(ABSOLUTE),
-                             SRC1_ELEM(ELEM_X),
+                             SRC1_ELEM(ELEM_Y),
                              SRC1_NEG(0),
                              INDEX_MODE(SQ_INDEX_LOOP),
                              PRED_SEL(SQ_PRED_SEL_OFF),
                              LAST(0));
-    shader[i++] = ALU_DWORD1_OP2(ChipSet,
-                                 SRC0_ABS(0),
-                                 SRC1_ABS(0),
-                                 UPDATE_EXECUTE_MASK(0),
-                                 UPDATE_PRED(0),
-                                 WRITE_MASK(1),
-                                 FOG_MERGE(0),
-                                 OMOD(SQ_ALU_OMOD_OFF),
-                                 ALU_INST(SQ_OP2_INST_DOT4),
-                                 BANK_SWIZZLE(SQ_ALU_VEC_102),
+    shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(SQ_ALU_SRC_PV),
+                                 SRC2_REL(ABSOLUTE),
+                                 SRC2_ELEM(ELEM_X),
+                                 SRC2_NEG(0),
+                                 ALU_INST(SQ_OP3_INST_MULADD),
+                                 BANK_SWIZZLE(SQ_ALU_VEC_012),
                                  DST_GPR(2),
                                  DST_REL(ABSOLUTE),
                                  DST_ELEM(ELEM_X),
-                                 CLAMP(1));
-    /* 9 */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(1),
+                                 CLAMP(0));
+    /* r2.y = MAD(c1.y, r1.y, pv.y) */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(257),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_Y),
                              SRC0_NEG(0),
-                             SRC1_SEL(256),
+                             SRC1_SEL(1),
                              SRC1_REL(ABSOLUTE),
                              SRC1_ELEM(ELEM_Y),
                              SRC1_NEG(0),
                              INDEX_MODE(SQ_INDEX_LOOP),
                              PRED_SEL(SQ_PRED_SEL_OFF),
                              LAST(0));
-    shader[i++] = ALU_DWORD1_OP2(ChipSet,
-                                 SRC0_ABS(0),
-                                 SRC1_ABS(0),
-                                 UPDATE_EXECUTE_MASK(0),
-                                 UPDATE_PRED(0),
-                                 WRITE_MASK(0),
-                                 FOG_MERGE(0),
-                                 OMOD(SQ_ALU_OMOD_OFF),
-                                 ALU_INST(SQ_OP2_INST_DOT4),
-                                 BANK_SWIZZLE(SQ_ALU_VEC_102),
-                                 DST_GPR(0),
+    shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(SQ_ALU_SRC_PV),
+                                 SRC2_REL(ABSOLUTE),
+                                 SRC2_ELEM(ELEM_Y),
+                                 SRC2_NEG(0),
+                                 ALU_INST(SQ_OP3_INST_MULADD),
+                                 BANK_SWIZZLE(SQ_ALU_VEC_012),
+                                 DST_GPR(2),
                                  DST_REL(ABSOLUTE),
                                  DST_ELEM(ELEM_Y),
-                                 CLAMP(1));
-    /* 10 */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(1),
+                                 CLAMP(0));
+    /* r2.z = MAD(c1.z, r1.y, pv.z) */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(257),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_Z),
                              SRC0_NEG(0),
-                             SRC1_SEL(256),
-                             SRC1_REL(ABSOLUTE),
-                             SRC1_ELEM(ELEM_Z),
-                             SRC1_NEG(0),
-                             INDEX_MODE(SQ_INDEX_LOOP),
-                             PRED_SEL(SQ_PRED_SEL_OFF),
-                             LAST(0));
-    shader[i++] = ALU_DWORD1_OP2(ChipSet,
-                                 SRC0_ABS(0),
-                                 SRC1_ABS(0),
-                                 UPDATE_EXECUTE_MASK(0),
-                                 UPDATE_PRED(0),
-                                 WRITE_MASK(0),
-                                 FOG_MERGE(0),
-                                 OMOD(SQ_ALU_OMOD_OFF),
-                                 ALU_INST(SQ_OP2_INST_DOT4),
-                                 BANK_SWIZZLE(SQ_ALU_VEC_102),
-                                 DST_GPR(0),
-                                 DST_REL(ABSOLUTE),
-                                 DST_ELEM(ELEM_Z),
-                                 CLAMP(1));
-    /* 11 */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(1),
-                             SRC0_REL(ABSOLUTE),
-                             SRC0_ELEM(ELEM_W),
-                             SRC0_NEG(0),
-                             SRC1_SEL(256),
-                             SRC1_REL(ABSOLUTE),
-                             SRC1_ELEM(ELEM_W),
-                             SRC1_NEG(0),
-                             INDEX_MODE(SQ_INDEX_LOOP),
-                             PRED_SEL(SQ_PRED_SEL_OFF),
-                             LAST(1));
-    shader[i++] = ALU_DWORD1_OP2(ChipSet,
-                                 SRC0_ABS(0),
-                                 SRC1_ABS(0),
-                                 UPDATE_EXECUTE_MASK(0),
-                                 UPDATE_PRED(0),
-                                 WRITE_MASK(0),
-                                 FOG_MERGE(0),
-                                 OMOD(SQ_ALU_OMOD_OFF),
-                                 ALU_INST(SQ_OP2_INST_DOT4),
-                                 BANK_SWIZZLE(SQ_ALU_VEC_021),
-                                 DST_GPR(0),
-                                 DST_REL(ABSOLUTE),
-                                 DST_ELEM(ELEM_W),
-                                 CLAMP(1));
-    /* 12 */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(1),
-                             SRC0_REL(ABSOLUTE),
-                             SRC0_ELEM(ELEM_X),
-                             SRC0_NEG(0),
-                             SRC1_SEL(257),
-                             SRC1_REL(ABSOLUTE),
-                             SRC1_ELEM(ELEM_X),
-                             SRC1_NEG(0),
-                             INDEX_MODE(SQ_INDEX_LOOP),
-                             PRED_SEL(SQ_PRED_SEL_OFF),
-                             LAST(0));
-    shader[i++] = ALU_DWORD1_OP2(ChipSet,
-                                 SRC0_ABS(0),
-                                 SRC1_ABS(0),
-                                 UPDATE_EXECUTE_MASK(0),
-                                 UPDATE_PRED(0),
-                                 WRITE_MASK(0),
-                                 FOG_MERGE(0),
-                                 OMOD(SQ_ALU_OMOD_OFF),
-                                 ALU_INST(SQ_OP2_INST_DOT4),
-                                 BANK_SWIZZLE(SQ_ALU_VEC_102),
-                                 DST_GPR(0),
-                                 DST_REL(ABSOLUTE),
-                                 DST_ELEM(ELEM_X),
-                                 CLAMP(1));
-    /* 13 */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(1),
-                             SRC0_REL(ABSOLUTE),
-                             SRC0_ELEM(ELEM_Y),
-                             SRC0_NEG(0),
-                             SRC1_SEL(257),
+                             SRC1_SEL(1),
                              SRC1_REL(ABSOLUTE),
                              SRC1_ELEM(ELEM_Y),
                              SRC1_NEG(0),
                              INDEX_MODE(SQ_INDEX_LOOP),
                              PRED_SEL(SQ_PRED_SEL_OFF),
                              LAST(0));
-    shader[i++] = ALU_DWORD1_OP2(ChipSet,
-                                 SRC0_ABS(0),
-                                 SRC1_ABS(0),
-                                 UPDATE_EXECUTE_MASK(0),
-                                 UPDATE_PRED(0),
-                                 WRITE_MASK(1),
-                                 FOG_MERGE(0),
-                                 OMOD(SQ_ALU_OMOD_OFF),
-                                 ALU_INST(SQ_OP2_INST_DOT4),
-                                 BANK_SWIZZLE(SQ_ALU_VEC_102),
+    shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(SQ_ALU_SRC_PV),
+                                 SRC2_REL(ABSOLUTE),
+                                 SRC2_ELEM(ELEM_Z),
+                                 SRC2_NEG(0),
+                                 ALU_INST(SQ_OP3_INST_MULADD),
+                                 BANK_SWIZZLE(SQ_ALU_VEC_012),
                                  DST_GPR(2),
                                  DST_REL(ABSOLUTE),
-                                 DST_ELEM(ELEM_Y),
-                                 CLAMP(1));
-    /* 14 */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(1),
-                             SRC0_REL(ABSOLUTE),
-                             SRC0_ELEM(ELEM_Z),
-                             SRC0_NEG(0),
-                             SRC1_SEL(257),
-                             SRC1_REL(ABSOLUTE),
-                             SRC1_ELEM(ELEM_Z),
-                             SRC1_NEG(0),
-                             INDEX_MODE(SQ_INDEX_LOOP),
-                             PRED_SEL(SQ_PRED_SEL_OFF),
-                             LAST(0));
-    shader[i++] = ALU_DWORD1_OP2(ChipSet,
-                                 SRC0_ABS(0),
-                                 SRC1_ABS(0),
-                                 UPDATE_EXECUTE_MASK(0),
-                                 UPDATE_PRED(0),
-                                 WRITE_MASK(0),
-                                 FOG_MERGE(0),
-                                 OMOD(SQ_ALU_OMOD_OFF),
-                                 ALU_INST(SQ_OP2_INST_DOT4),
-                                 BANK_SWIZZLE(SQ_ALU_VEC_102),
-                                 DST_GPR(0),
-                                 DST_REL(ABSOLUTE),
                                  DST_ELEM(ELEM_Z),
-                                 CLAMP(1));
-    /* 15 */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(1),
+                                 CLAMP(0));
+    /* r2.w = MAD(0, 0, 1) */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(SQ_ALU_SRC_0),
                              SRC0_REL(ABSOLUTE),
-                             SRC0_ELEM(ELEM_W),
+                             SRC0_ELEM(ELEM_X),
                              SRC0_NEG(0),
-                             SRC1_SEL(257),
+                             SRC1_SEL(SQ_ALU_SRC_0),
                              SRC1_REL(ABSOLUTE),
-                             SRC1_ELEM(ELEM_W),
+                             SRC1_ELEM(ELEM_X),
                              SRC1_NEG(0),
                              INDEX_MODE(SQ_INDEX_LOOP),
                              PRED_SEL(SQ_PRED_SEL_OFF),
                              LAST(1));
-    shader[i++] = ALU_DWORD1_OP2(ChipSet,
-                                 SRC0_ABS(0),
-                                 SRC1_ABS(0),
-                                 UPDATE_EXECUTE_MASK(0),
-                                 UPDATE_PRED(0),
-                                 WRITE_MASK(0),
-                                 FOG_MERGE(0),
-                                 OMOD(SQ_ALU_OMOD_OFF),
-                                 ALU_INST(SQ_OP2_INST_DOT4),
-                                 BANK_SWIZZLE(SQ_ALU_VEC_021),
-                                 DST_GPR(0),
+    shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(SQ_ALU_SRC_1),
+                                 SRC2_REL(ABSOLUTE),
+                                 SRC2_ELEM(ELEM_W),
+                                 SRC2_NEG(0),
+                                 ALU_INST(SQ_OP3_INST_MULADD),
+                                 BANK_SWIZZLE(SQ_ALU_VEC_012),
+                                 DST_GPR(2),
                                  DST_REL(ABSOLUTE),
                                  DST_ELEM(ELEM_W),
-                                 CLAMP(1));
-    /* 16 */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(1),
+                                 CLAMP(0));
+    /* 12,13,14,15 */
+    /* r2.x = MAD(c2.x, r1.z, pv.x) */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(258),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_X),
                              SRC0_NEG(0),
-                             SRC1_SEL(258),
+                             SRC1_SEL(1),
                              SRC1_REL(ABSOLUTE),
-                             SRC1_ELEM(ELEM_X),
+                             SRC1_ELEM(ELEM_Z),
                              SRC1_NEG(0),
                              INDEX_MODE(SQ_INDEX_LOOP),
                              PRED_SEL(SQ_PRED_SEL_OFF),
                              LAST(0));
-    shader[i++] = ALU_DWORD1_OP2(ChipSet,
-                                 SRC0_ABS(0),
-                                 SRC1_ABS(0),
-                                 UPDATE_EXECUTE_MASK(0),
-                                 UPDATE_PRED(0),
-                                 WRITE_MASK(0),
-                                 FOG_MERGE(0),
-                                 OMOD(SQ_ALU_OMOD_OFF),
-                                 ALU_INST(SQ_OP2_INST_DOT4),
-                                 BANK_SWIZZLE(SQ_ALU_VEC_102),
-                                 DST_GPR(0),
+    shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(SQ_ALU_SRC_PV),
+                                 SRC2_REL(ABSOLUTE),
+                                 SRC2_ELEM(ELEM_X),
+                                 SRC2_NEG(0),
+                                 ALU_INST(SQ_OP3_INST_MULADD),
+                                 BANK_SWIZZLE(SQ_ALU_VEC_012),
+                                 DST_GPR(2),
                                  DST_REL(ABSOLUTE),
                                  DST_ELEM(ELEM_X),
                                  CLAMP(1));
-    /* 17 */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(1),
+    /* r2.y = MAD(c2.y, r1.z, pv.y) */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(258),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_Y),
                              SRC0_NEG(0),
-                             SRC1_SEL(258),
+                             SRC1_SEL(1),
                              SRC1_REL(ABSOLUTE),
-                             SRC1_ELEM(ELEM_Y),
+                             SRC1_ELEM(ELEM_Z),
                              SRC1_NEG(0),
                              INDEX_MODE(SQ_INDEX_LOOP),
                              PRED_SEL(SQ_PRED_SEL_OFF),
                              LAST(0));
-    shader[i++] = ALU_DWORD1_OP2(ChipSet,
-                                 SRC0_ABS(0),
-                                 SRC1_ABS(0),
-                                 UPDATE_EXECUTE_MASK(0),
-                                 UPDATE_PRED(0),
-                                 WRITE_MASK(0),
-                                 FOG_MERGE(0),
-                                 OMOD(SQ_ALU_OMOD_OFF),
-                                 ALU_INST(SQ_OP2_INST_DOT4),
-                                 BANK_SWIZZLE(SQ_ALU_VEC_102),
-                                 DST_GPR(0),
+    shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(SQ_ALU_SRC_PV),
+                                 SRC2_REL(ABSOLUTE),
+                                 SRC2_ELEM(ELEM_Y),
+                                 SRC2_NEG(0),
+                                 ALU_INST(SQ_OP3_INST_MULADD),
+                                 BANK_SWIZZLE(SQ_ALU_VEC_012),
+                                 DST_GPR(2),
                                  DST_REL(ABSOLUTE),
                                  DST_ELEM(ELEM_Y),
                                  CLAMP(1));
-    /* 18 */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(1),
+    /* r2.z = MAD(c2.z, r1.z, pv.z) */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(258),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_Z),
                              SRC0_NEG(0),
-                             SRC1_SEL(258),
+                             SRC1_SEL(1),
                              SRC1_REL(ABSOLUTE),
                              SRC1_ELEM(ELEM_Z),
                              SRC1_NEG(0),
                              INDEX_MODE(SQ_INDEX_LOOP),
                              PRED_SEL(SQ_PRED_SEL_OFF),
                              LAST(0));
-    shader[i++] = ALU_DWORD1_OP2(ChipSet,
-                                 SRC0_ABS(0),
-                                 SRC1_ABS(0),
-                                 UPDATE_EXECUTE_MASK(0),
-                                 UPDATE_PRED(0),
-                                 WRITE_MASK(1),
-                                 FOG_MERGE(0),
-                                 OMOD(SQ_ALU_OMOD_OFF),
-                                 ALU_INST(SQ_OP2_INST_DOT4),
-                                 BANK_SWIZZLE(SQ_ALU_VEC_102),
+    shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(SQ_ALU_SRC_PV),
+                                 SRC2_REL(ABSOLUTE),
+                                 SRC2_ELEM(ELEM_Z),
+                                 SRC2_NEG(0),
+                                 ALU_INST(SQ_OP3_INST_MULADD),
+                                 BANK_SWIZZLE(SQ_ALU_VEC_012),
                                  DST_GPR(2),
                                  DST_REL(ABSOLUTE),
                                  DST_ELEM(ELEM_Z),
                                  CLAMP(1));
-    /* 19 */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(1),
+    /* r2.w = MAD(0, 0, 1) */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(SQ_ALU_SRC_0),
                              SRC0_REL(ABSOLUTE),
-                             SRC0_ELEM(ELEM_W),
+                             SRC0_ELEM(ELEM_X),
                              SRC0_NEG(0),
-                             SRC1_SEL(258),
+                             SRC1_SEL(SQ_ALU_SRC_0),
                              SRC1_REL(ABSOLUTE),
-                             SRC1_ELEM(ELEM_W),
+                             SRC1_ELEM(ELEM_X),
                              SRC1_NEG(0),
                              INDEX_MODE(SQ_INDEX_LOOP),
                              PRED_SEL(SQ_PRED_SEL_OFF),
                              LAST(1));
-    shader[i++] = ALU_DWORD1_OP2(ChipSet,
-                                 SRC0_ABS(0),
-                                 SRC1_ABS(0),
-                                 UPDATE_EXECUTE_MASK(0),
-                                 UPDATE_PRED(0),
-                                 WRITE_MASK(0),
-                                 FOG_MERGE(0),
-                                 OMOD(SQ_ALU_OMOD_OFF),
-                                 ALU_INST(SQ_OP2_INST_DOT4),
-                                 BANK_SWIZZLE(SQ_ALU_VEC_021),
-                                 DST_GPR(0),
+    shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(SQ_ALU_SRC_1),
+                                 SRC2_REL(ABSOLUTE),
+                                 SRC2_ELEM(ELEM_X),
+                                 SRC2_NEG(0),
+                                 ALU_INST(SQ_OP3_INST_MULADD),
+                                 BANK_SWIZZLE(SQ_ALU_VEC_012),
+                                 DST_GPR(2),
                                  DST_REL(ABSOLUTE),
                                  DST_ELEM(ELEM_W),
                                  CLAMP(1));
-    /* 20 */
-    shader[i++] = CF_DWORD0(ADDR(22));
+
+    /* 16 */
+    shader[i++] = CF_DWORD0(ADDR(18));
     shader[i++] = CF_DWORD1(POP_COUNT(0),
                             CF_CONST(0),
                             COND(SQ_CF_COND_ACTIVE),
@@ -1064,7 +900,7 @@ int R600_xv_ps(RADEONChipFamily ChipSet, uint32_t* shader)
                             CF_INST(SQ_CF_INST_TEX),
                             WHOLE_QUAD_MODE(0),
                             BARRIER(1));
-    /* 21 */
+    /* 17 */
     shader[i++] = CF_DWORD0(ADDR(0));
     shader[i++] = CF_DWORD1(POP_COUNT(0),
 			    CF_CONST(0),
@@ -1076,7 +912,7 @@ int R600_xv_ps(RADEONChipFamily ChipSet, uint32_t* shader)
 			    CF_INST(SQ_CF_INST_RETURN),
 			    WHOLE_QUAD_MODE(0),
 			    BARRIER(1));
-    /* 22/23 */
+    /* 18/19 */
     shader[i++] = TEX_DWORD0(TEX_INST(SQ_TEX_INST_SAMPLE),
                              BC_FRAC_MODE(0),
                              FETCH_WHOLE_QUAD(0),
@@ -1104,7 +940,7 @@ int R600_xv_ps(RADEONChipFamily ChipSet, uint32_t* shader)
                              SRC_SEL_Z(SQ_SEL_0),
                              SRC_SEL_W(SQ_SEL_1));
     shader[i++] = TEX_DWORD_PAD;
-    /* 24/25 */
+    /* 20/21 */
     shader[i++] = TEX_DWORD0(TEX_INST(SQ_TEX_INST_SAMPLE),
                              BC_FRAC_MODE(0),
                              FETCH_WHOLE_QUAD(0),
@@ -1132,7 +968,7 @@ int R600_xv_ps(RADEONChipFamily ChipSet, uint32_t* shader)
                              SRC_SEL_Z(SQ_SEL_0),
                              SRC_SEL_W(SQ_SEL_1));
     shader[i++] = TEX_DWORD_PAD;
-    /* 26/27 */
+    /* 22/23 */
     shader[i++] = TEX_DWORD0(TEX_INST(SQ_TEX_INST_SAMPLE),
                              BC_FRAC_MODE(0),
                              FETCH_WHOLE_QUAD(0),
@@ -1160,8 +996,8 @@ int R600_xv_ps(RADEONChipFamily ChipSet, uint32_t* shader)
                              SRC_SEL_Z(SQ_SEL_0),
                              SRC_SEL_W(SQ_SEL_1));
     shader[i++] = TEX_DWORD_PAD;
-    /* 28 */
-    shader[i++] = CF_DWORD0(ADDR(30));
+    /* 24 */
+    shader[i++] = CF_DWORD0(ADDR(26));
     shader[i++] = CF_DWORD1(POP_COUNT(0),
                             CF_CONST(0),
                             COND(SQ_CF_COND_ACTIVE),
@@ -1172,7 +1008,7 @@ int R600_xv_ps(RADEONChipFamily ChipSet, uint32_t* shader)
                             CF_INST(SQ_CF_INST_TEX),
                             WHOLE_QUAD_MODE(0),
                             BARRIER(1));
-    /* 29 */
+    /* 25 */
     shader[i++] = CF_DWORD0(ADDR(0));
     shader[i++] = CF_DWORD1(POP_COUNT(0),
 			    CF_CONST(0),
@@ -1184,7 +1020,7 @@ int R600_xv_ps(RADEONChipFamily ChipSet, uint32_t* shader)
 			    CF_INST(SQ_CF_INST_RETURN),
 			    WHOLE_QUAD_MODE(0),
 			    BARRIER(1));
-    /* 30/31 */
+    /* 26/27 */
     shader[i++] = TEX_DWORD0(TEX_INST(SQ_TEX_INST_SAMPLE),
                              BC_FRAC_MODE(0),
                              FETCH_WHOLE_QUAD(0),
@@ -1212,7 +1048,7 @@ int R600_xv_ps(RADEONChipFamily ChipSet, uint32_t* shader)
                              SRC_SEL_Z(SQ_SEL_0),
                              SRC_SEL_W(SQ_SEL_1));
     shader[i++] = TEX_DWORD_PAD;
-    /* 32/33 */
+    /* 28/29 */
     shader[i++] = TEX_DWORD0(TEX_INST(SQ_TEX_INST_SAMPLE),
                              BC_FRAC_MODE(0),
                              FETCH_WHOLE_QUAD(0),
diff --git a/src/r600_textured_videofuncs.c b/src/r600_textured_videofuncs.c
index 88745d5..600262b 100644
--- a/src/r600_textured_videofuncs.c
+++ b/src/r600_textured_videofuncs.c
@@ -45,6 +45,15 @@
 
 #include "damage.h"
 
+/* Parameters for ITU-R BT.601 and ITU-R BT.709 colour spaces.
+   Note: the differences from the parameters used in overlay are due
+   to 10bit vs. float calcs */
+static REF_TRANSFORM trans[2] =
+{
+    {1.1643, 0.0, 1.5960, -0.3918, -0.8129, 2.0172, 0.0}, /* BT.601 */
+    {1.1643, 0.0, 1.7927, -0.2132, -0.5329, 2.1124, 0.0}  /* BT.709 */
+};
+
 static void
 R600DoneTexturedVideo(ScrnInfoPtr pScrn)
 {
@@ -115,18 +124,91 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
     tex_sampler_t   tex_samp;
     shader_config_t vs_conf, ps_conf;
     int uv_offset;
-    static float ps_alu_consts[] = {
-        1.0,  0.0,      1.4020,   0,  /* r - c[0] */
-        1.0, -0.34414, -0.71414,  0,  /* g - c[1] */
-        1.0,  1.7720,   0.0,      0,  /* b - c[2] */
-	/* Constants for undoing Y'CbCr scaling
-	 *  - Y' is scaled from 16:235
-	 *  - Cb/Cr are scaled from 16:240
-	 * Unscaled value N' = N * N_mul + N_shift (N' in range [-0.5, 0.5])
-	 * Vector is [Y_mul, Y_shfit, C_mul, C_shift]
-	 */
-        256.0/219.0, -16.0/219.0, 256.0/224.0, -128.0/224.0,
-    };
+    /*
+     * y' = y - .0625
+     * u' = u - .5
+     * v' = v - .5
+     *
+     * r = 1.1643 * y' + 0.0     * u' + 1.5958  * v'
+     * g = 1.1643 * y' - 0.39173 * u' - 0.81290 * v'
+     * b = 1.1643 * y' + 2.017   * u' + 0.0     * v'
+     *
+     * DP3 might look like the straightforward solution
+     * but we'd need to move the texture yuv values in
+     * the same reg for this to work. Therefore use MADs.
+     * Brightness just adds to the off constant.
+     * Contrast is multiplication of luminance.
+     * Saturation and hue change the u and v coeffs.
+     * Default values (before adjustments - depend on colorspace):
+     * yco = 1.1643
+     * uco = 0, -0.39173, 2.017
+     * vco = 1.5958, -0.8129, 0
+     * off = -0.0625 * yco + -0.5 * uco[r] + -0.5 * vco[r],
+     *       -0.0625 * yco + -0.5 * uco[g] + -0.5 * vco[g],
+     *       -0.0625 * yco + -0.5 * uco[b] + -0.5 * vco[b],
+     *
+     * temp = MAD(yco, yuv.yyyy, off)
+     * temp = MAD(uco, yuv.uuuu, temp)
+     * result = MAD(vco, yuv.vvvv, temp)
+     */
+    /* TODO: calc consts in the shader */
+    const float Loff = -0.0627;
+    const float Coff = -0.502;
+    float uvcosf, uvsinf;
+    float yco;
+    float uco[3], vco[3], off[3];
+    float bright, cont, gamma;
+    int ref = pPriv->transform_index;
+    Bool needgamma = FALSE;
+    float ps_alu_consts[12];
+
+    cont = RTFContrast(pPriv->contrast);
+    bright = RTFBrightness(pPriv->brightness);
+    gamma = (float)pPriv->gamma / 1000.0;
+    uvcosf = RTFSaturation(pPriv->saturation) * cos(RTFHue(pPriv->hue));
+    uvsinf = RTFSaturation(pPriv->saturation) * sin(RTFHue(pPriv->hue));
+    /* overlay video also does pre-gamma contrast/sat adjust, should we? */
+
+    yco = trans[ref].RefLuma * cont;
+    uco[0] = -trans[ref].RefRCr * uvsinf;
+    uco[1] = trans[ref].RefGCb * uvcosf - trans[ref].RefGCr * uvsinf;
+    uco[2] = trans[ref].RefBCb * uvcosf;
+    vco[0] = trans[ref].RefRCr * uvcosf;
+    vco[1] = trans[ref].RefGCb * uvsinf + trans[ref].RefGCr * uvcosf;
+    vco[2] = trans[ref].RefBCb * uvsinf;
+    off[0] = Loff * yco + Coff * (uco[0] + vco[0]) + bright;
+    off[1] = Loff * yco + Coff * (uco[1] + vco[1]) + bright;
+    off[2] = Loff * yco + Coff * (uco[2] + vco[2]) + bright;
+
+    // XXX: gamma is forced to 1.0 for now, so the needgamma branch below is never taken
+    gamma = 1.0;
+
+    if (gamma != 1.0) {
+	needgamma = TRUE;
+	/* note: gamma correction is out = in ^ gamma;
+	   gpu can only do LG2/EX2 therefore we transform into
+	   in ^ gamma = 2 ^ (log2(in) * gamma).
+	   Lots of scalar ops, unfortunately (better solution?) -
+	   without gamma that's 3 inst, with gamma it's 10...
+	   could use different gamma factors per channel,
+	   if that's of any use. */
+    }
+
+    /* setup the ps consts */
+    ps_alu_consts[0] = off[0];
+    ps_alu_consts[1] = off[1];
+    ps_alu_consts[2] = off[2];
+    ps_alu_consts[3] = yco;
+
+    ps_alu_consts[4] = uco[0];
+    ps_alu_consts[5] = uco[1];
+    ps_alu_consts[6] = uco[2];
+    ps_alu_consts[7] = gamma;
+
+    ps_alu_consts[8] = vco[0];
+    ps_alu_consts[9] = vco[1];
+    ps_alu_consts[10] = vco[2];
+    ps_alu_consts[11] = 0.0;
 
     CLEAR (cb_conf);
     CLEAR (tex_res);
diff --git a/src/radeon_textured_video.c b/src/radeon_textured_video.c
index 82675d9..542fc2a 100644
--- a/src/radeon_textured_video.c
+++ b/src/radeon_textured_video.c
@@ -137,12 +137,6 @@ static REF_TRANSFORM trans[2] =
     {1.1643, 0.0, 1.7927, -0.2132, -0.5329, 2.1124, 0.0}  /* BT.709 */
 };
 
-
-#define RTFSaturation(a)   (1.0 + ((a)*1.0)/1000.0)
-#define RTFBrightness(a)   (((a)*1.0)/2000.0)
-#define RTFContrast(a)   (1.0 + ((a)*1.0)/1000.0)
-#define RTFHue(a)   (((a)*3.1416)/1000.0)
-
 #define ACCEL_MMIO
 #define ACCEL_PREAMBLE()	unsigned char *RADEONMMIO = info->MMIO
 #define BEGIN_ACCEL(n)		RADEONWaitForFifo(pScrn, (n))
@@ -712,11 +706,16 @@ static XF86AttributeRec Attributes_r500[NUM_ATTRIBUTES_R500+1] =
     {0, 0, 0, NULL}
 };
 
-#define NUM_ATTRIBUTES_R600 1
+#define NUM_ATTRIBUTES_R600 6
 
 static XF86AttributeRec Attributes_r600[NUM_ATTRIBUTES_R600+1] =
 {
     {XvSettable | XvGettable, 0, 1, "XV_VSYNC"},
+    {XvSettable | XvGettable, -1000, 1000, "XV_BRIGHTNESS"},
+    {XvSettable | XvGettable, -1000, 1000, "XV_CONTRAST"},
+    {XvSettable | XvGettable, -1000, 1000, "XV_SATURATION"},
+    {XvSettable | XvGettable, -1000, 1000, "XV_HUE"},
+    {XvSettable | XvGettable, 100, 10000, "XV_COLORSPACE"},
     {0, 0, 0, NULL}
 };
 
diff --git a/src/radeon_video.c b/src/radeon_video.c
index 8479160..a14f44c 100644
--- a/src/radeon_video.c
+++ b/src/radeon_video.c
@@ -1704,12 +1704,6 @@ RADEONSetPortAttribute(ScrnInfoPtr  pScrn,
 
     RADEON_SYNC(info, pScrn);
 
-#define RTFSaturation(a)   (1.0 + ((a)*1.0)/1000.0)
-#define RTFBrightness(a)   (((a)*1.0)/2000.0)
-#define RTFIntensity(a)   (((a)*1.0)/2000.0)
-#define RTFContrast(a)   (1.0 + ((a)*1.0)/1000.0)
-#define RTFHue(a)   (((a)*3.1416)/1000.0)
-
     if(attribute == xvAutopaintColorkey)
     {
 	pPriv->autopaint_colorkey = ClipValue (value, 0, 1);
diff --git a/src/radeon_video.h b/src/radeon_video.h
index be33871..0f8342a 100644
--- a/src/radeon_video.h
+++ b/src/radeon_video.h
@@ -135,6 +135,12 @@ typedef struct tagREF_TRANSFORM
     float   RefBCr;
 } REF_TRANSFORM;
 
+#define RTFSaturation(a)   (1.0 + ((a)*1.0)/1000.0)
+#define RTFBrightness(a)   (((a)*1.0)/2000.0)
+#define RTFIntensity(a)   (((a)*1.0)/2000.0)
+#define RTFContrast(a)   (1.0 + ((a)*1.0)/1000.0)
+#define RTFHue(a)   (((a)*3.1416)/1000.0)
+
 xf86CrtcPtr
 radeon_xv_pick_best_crtc(ScrnInfoPtr pScrn,
 			 int x1, int x2, int y1, int y2);
commit 8810fe92b5aed08888584c6914482586b59f71ab
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Tue Apr 14 11:26:35 2009 -0400

    R200: clean code, always use shader based csc
    
    - consolidate common r2xx csc shader code
    - always use shader based csc for both packed and planar
      formats
    - always use native planar csc on r1xx

diff --git a/src/radeon_textured_video.c b/src/radeon_textured_video.c
index f657536..82675d9 100644
--- a/src/radeon_textured_video.c
+++ b/src/radeon_textured_video.c
@@ -362,7 +362,7 @@ RADEONPutImageTextured(ScrnInfoPtr pScrn,
 
     /* Bicubic filter setup */
     pPriv->bicubic_enabled = (pPriv->bicubic_state != BICUBIC_OFF);
-    if (!(IS_R300_3D || IS_R500_3D || IS_R600_3D))
+    if (!(IS_R300_3D || IS_R500_3D))
 	pPriv->bicubic_enabled = FALSE;
     if (pPriv->bicubic_enabled && (pPriv->bicubic_state == BICUBIC_AUTO)) {
 	/*
@@ -377,6 +377,9 @@ RADEONPutImageTextured(ScrnInfoPtr pScrn,
     if (pPriv->bicubic_enabled || (IS_R600_3D || IS_R500_3D))
 	pPriv->planar_hw = 0;
 
+    if (info->ChipFamily < CHIP_FAMILY_R300)
+	pPriv->planar_hw = 1;
+
     switch(id) {
     case FOURCC_YV12:
     case FOURCC_I420:
@@ -663,21 +666,19 @@ static XF86VideoFormatRec Formats[NUM_FORMATS] =
     {15, TrueColor}, {16, TrueColor}, {24, TrueColor}
 };
 
-#define NUM_ATTRIBUTES 2
+#define NUM_ATTRIBUTES 1
 
 static XF86AttributeRec Attributes[NUM_ATTRIBUTES+1] =
 {
     {XvSettable | XvGettable, 0, 1, "XV_VSYNC"},
-    {XvSettable | XvGettable, 0, 1, "XV_HWPLANAR"},
     {0, 0, 0, NULL}
 };
 
-#define NUM_ATTRIBUTES_R200 7
+#define NUM_ATTRIBUTES_R200 6
 
 static XF86AttributeRec Attributes_r200[NUM_ATTRIBUTES_R200+1] =
 {
     {XvSettable | XvGettable, 0, 1, "XV_VSYNC"},
-    {XvSettable | XvGettable, 0, 1, "XV_HWPLANAR"},
     {XvSettable | XvGettable, -1000, 1000, "XV_BRIGHTNESS"},
     {XvSettable | XvGettable, -1000, 1000, "XV_CONTRAST"},
     {XvSettable | XvGettable, -1000, 1000, "XV_SATURATION"},
diff --git a/src/radeon_textured_videofuncs.c b/src/radeon_textured_videofuncs.c
index 76c8456..4ce34e3 100644
--- a/src/radeon_textured_videofuncs.c
+++ b/src/radeon_textured_videofuncs.c
@@ -109,11 +109,11 @@ FUNC_NAME(RADEONDisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv
 	dst_pitch = exaGetPixmapPitch(pPixmap);
     } else
 #endif
-	{
-	    dst_offset = (pPixmap->devPrivate.ptr - info->FB) +
-		info->fbLocation + pScrn->fbOffset;
-	    dst_pitch = pPixmap->devKind;
-	}
+    {
+	dst_offset = (pPixmap->devPrivate.ptr - info->FB) +
+	    info->fbLocation + pScrn->fbOffset;
+	dst_pitch = pPixmap->devKind;
+    }
 
 #ifdef COMPOSITE
     dstxoff = -pPixmap->screen_x + pPixmap->drawable.x;
@@ -158,11 +158,8 @@ FUNC_NAME(RADEONDisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv
 	return;
     }
 
-    if (pPriv->planar_hw && (pPriv->id == FOURCC_I420 || pPriv->id == FOURCC_YV12)) {
+    if (pPriv->id == FOURCC_I420 || pPriv->id == FOURCC_YV12) {
 	isplanar = TRUE;
-    }
-
-    if (isplanar) {
 	txformat = RADEON_TXFORMAT_Y8;
     } else {
 	if (pPriv->id == FOURCC_UYVY)
@@ -444,13 +441,24 @@ FUNC_NAME(R200DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
     RADEONInfoPtr info = RADEONPTR(pScrn);
     PixmapPtr pPixmap = pPriv->pPixmap;
     uint32_t txformat;
-    uint32_t txfilter, txformat0, txpitch;
+    uint32_t txfilter, txsize, txpitch;
     uint32_t dst_offset, dst_pitch, dst_format;
     uint32_t colorpitch;
     Bool isplanar = FALSE;
     int dstxoff, dstyoff, pixel_shift, vtx_count;
     BoxPtr pBox = REGION_RECTS(&pPriv->clip);
     int nBox = REGION_NUM_RECTS(&pPriv->clip);
+
+    /* note: in contrast to r300, use input biasing on uv components */
+    const float Loff = -0.0627;
+    float uvcosf, uvsinf;
+    float yco, yoff;
+    float uco[3], vco[3];
+    float bright, cont, sat;
+    int ref = pPriv->transform_index;
+    float ucscale = 0.25, vcscale = 0.25;
+    Bool needux8 = FALSE, needvx8 = FALSE;
+
     ACCEL_PREAMBLE();
 
     pixel_shift = pPixmap->drawable.bitsPerPixel >> 4;
@@ -495,8 +503,6 @@ FUNC_NAME(R200DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 	    RADEONInit3DEngine(pScrn);
     }
 
-    vtx_count = 4;
-
     /* Same for R100/R200 */
     switch (pPixmap->drawable.bitsPerPixel) {
     case 16:
@@ -512,11 +518,8 @@ FUNC_NAME(R200DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 	return;
     }
 
-    if (pPriv->planar_hw && (pPriv->id == FOURCC_I420 || pPriv->id == FOURCC_YV12)) {
+    if (pPriv->id == FOURCC_I420 || pPriv->id == FOURCC_YV12) {
 	isplanar = TRUE;
-    }
-
-    if (isplanar) {
 	txformat = RADEON_TXFORMAT_I8;
     } else {
 	if (pPriv->id == FOURCC_UYVY)
@@ -546,61 +549,52 @@ FUNC_NAME(R200DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
     info->accel_state->texW[0] = pPriv->w;
     info->accel_state->texH[0] = pPriv->h;
 
-    if (isplanar) {
-	/* note: in contrast to r300, use input biasing on uv components */
-	const float Loff = -0.0627;
-	float uvcosf, uvsinf;
-	float yco, yoff;
-	float uco[3], vco[3];
-	float bright, cont, sat;
-	int ref = pPriv->transform_index;
-	float ucscale = 0.25, vcscale = 0.25;
-	Bool needux8 = FALSE, needvx8 = FALSE;
-
-	/* contrast can cause constant overflow, clamp */
-	cont = RTFContrast(pPriv->contrast);
-	if (cont * trans[ref].RefLuma > 2.0)
-	    cont = 2.0 / trans[ref].RefLuma;
-	/* brightness is only from -0.5 to 0.5 should be safe */
-	bright = RTFBrightness(pPriv->brightness);
-	/* saturation can also cause overflow, clamp */
-	sat = RTFSaturation(pPriv->saturation);
-	if (sat * trans[ref].RefBCb > 4.0)
-	    sat = 4.0 / trans[ref].RefBCb;
-	uvcosf = sat * cos(RTFHue(pPriv->hue));
-	uvsinf = sat * sin(RTFHue(pPriv->hue));
-
-	yco = trans[ref].RefLuma * cont;
-	uco[0] = -trans[ref].RefRCr * uvsinf;
-	uco[1] = trans[ref].RefGCb * uvcosf - trans[ref].RefGCr * uvsinf;
-	uco[2] = trans[ref].RefBCb * uvcosf;
-	vco[0] = trans[ref].RefRCr * uvcosf;
-	vco[1] = trans[ref].RefGCb * uvsinf + trans[ref].RefGCr * uvcosf;
-	vco[2] = trans[ref].RefBCb * uvsinf;
-	yoff = Loff * yco + bright;
-
-	if ((uco[0] > 2.0) || (uco[2] > 2.0)) {
-	    needux8 = TRUE;
-	    ucscale = 0.125;
-	}
-	if ((vco[0] > 2.0) || (vco[2] > 2.0)) {
-	    needvx8 = TRUE;
-	    vcscale = 0.125;
-	}
+    txfilter =  R200_MAG_FILTER_LINEAR |
+	R200_MIN_FILTER_LINEAR |
+	R200_CLAMP_S_CLAMP_LAST |
+	R200_CLAMP_T_CLAMP_LAST;
+
+    /* contrast can cause constant overflow, clamp */
+    cont = RTFContrast(pPriv->contrast);
+    if (cont * trans[ref].RefLuma > 2.0)
+	cont = 2.0 / trans[ref].RefLuma;
+    /* brightness is only from -0.5 to 0.5 should be safe */
+    bright = RTFBrightness(pPriv->brightness);
+    /* saturation can also cause overflow, clamp */
+    sat = RTFSaturation(pPriv->saturation);
+    if (sat * trans[ref].RefBCb > 4.0)
+	sat = 4.0 / trans[ref].RefBCb;
+    uvcosf = sat * cos(RTFHue(pPriv->hue));
+    uvsinf = sat * sin(RTFHue(pPriv->hue));
+
+    yco = trans[ref].RefLuma * cont;
+    uco[0] = -trans[ref].RefRCr * uvsinf;
+    uco[1] = trans[ref].RefGCb * uvcosf - trans[ref].RefGCr * uvsinf;
+    uco[2] = trans[ref].RefBCb * uvcosf;
+    vco[0] = trans[ref].RefRCr * uvcosf;
+    vco[1] = trans[ref].RefGCb * uvsinf + trans[ref].RefGCr * uvcosf;
+    vco[2] = trans[ref].RefBCb * uvsinf;
+    yoff = Loff * yco + bright;
+
+    if ((uco[0] > 2.0) || (uco[2] > 2.0)) {
+	needux8 = TRUE;
+	ucscale = 0.125;
+    }
+    if ((vco[0] > 2.0) || (vco[2] > 2.0)) {
+	needvx8 = TRUE;
+	vcscale = 0.125;
+    }
 
+    if (isplanar) {
 	/* need 2 texcoord sets (even though they are identical) due
 	   to denormalization! hw apparently can't premultiply
 	   same coord set by different texture size */
 	vtx_count = 6;
 
-	txformat0 = (((((pPriv->w + 1 ) >> 1) - 1) & 0x7ff) |
-		     (((((pPriv->h + 1 ) >> 1) - 1) & 0x7ff) << RADEON_TEX_VSIZE_SHIFT));
+	txsize = (((((pPriv->w + 1 ) >> 1) - 1) & 0x7ff) |
+		  (((((pPriv->h + 1 ) >> 1) - 1) & 0x7ff) << RADEON_TEX_VSIZE_SHIFT));
 	txpitch = ((pPriv->src_pitch >> 1) + 63) & ~63;
 	txpitch -= 32;
-	txfilter =  R200_MAG_FILTER_LINEAR |
-	    R200_MIN_FILTER_LINEAR |
-	    R200_CLAMP_S_CLAMP_LAST |
-	    R200_CLAMP_T_CLAMP_LAST;
 
 	BEGIN_ACCEL(36);
 
@@ -627,14 +621,14 @@ FUNC_NAME(R200DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 	OUT_ACCEL_REG(R200_PP_TXFILTER_1, txfilter);
 	OUT_ACCEL_REG(R200_PP_TXFORMAT_1, txformat | R200_TXFORMAT_ST_ROUTE_STQ1);
 	OUT_ACCEL_REG(R200_PP_TXFORMAT_X_1, 0);
-	OUT_ACCEL_REG(R200_PP_TXSIZE_1, txformat0);
+	OUT_ACCEL_REG(R200_PP_TXSIZE_1, txsize);
 	OUT_ACCEL_REG(R200_PP_TXPITCH_1, txpitch);
 	OUT_ACCEL_REG(R200_PP_TXOFFSET_1, pPriv->src_offset + pPriv->planeu_offset);
 
 	OUT_ACCEL_REG(R200_PP_TXFILTER_2, txfilter);
 	OUT_ACCEL_REG(R200_PP_TXFORMAT_2, txformat | R200_TXFORMAT_ST_ROUTE_STQ1);
 	OUT_ACCEL_REG(R200_PP_TXFORMAT_X_2, 0);
-	OUT_ACCEL_REG(R200_PP_TXSIZE_2, txformat0);
+	OUT_ACCEL_REG(R200_PP_TXSIZE_2, txsize);
 	OUT_ACCEL_REG(R200_PP_TXPITCH_2, txpitch);
 	OUT_ACCEL_REG(R200_PP_TXOFFSET_2, pPriv->src_offset + pPriv->planev_offset);
 
@@ -757,58 +751,8 @@ FUNC_NAME(R200DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 						      0.0));
 
 	FINISH_ACCEL();
-    } else if (info->ChipFamily == CHIP_FAMILY_RV250) {
-	/* fix up broken packed yuv - shader same as above except
-	   yuv components are all in same reg */
-	/* note: in contrast to r300, use input biasing on uv components */
-	const float Loff = -0.0627;
-	float uvcosf, uvsinf;
-	float yco, yoff;
-	float uco[3], vco[3];
-	float bright, cont, sat;
-	int ref = pPriv->transform_index;
-	float ucscale = 0.25, vcscale = 0.25;
-	Bool needux8 = FALSE, needvx8 = FALSE;
-
-	/* contrast can cause constant overflow, clamp */
-	cont = RTFContrast(pPriv->contrast);
-	if (cont * trans[ref].RefLuma > 2.0)
-	    cont = 2.0 / trans[ref].RefLuma;
-	/* brightness is only from -0.5 to 0.5 should be safe */
-	bright = RTFBrightness(pPriv->brightness);
-	/* saturation can also cause overflow, clamp */
-	sat = RTFSaturation(pPriv->saturation);
-	if (sat * trans[ref].RefBCb > 4.0)
-	    sat = 4.0 / trans[ref].RefBCb;
-	uvcosf = sat * cos(RTFHue(pPriv->hue));
-	uvsinf = sat * sin(RTFHue(pPriv->hue));
-
-	yco = trans[ref].RefLuma * cont;
-	uco[0] = -trans[ref].RefRCr * uvsinf;
-	uco[1] = trans[ref].RefGCb * uvcosf - trans[ref].RefGCr * uvsinf;
-	uco[2] = trans[ref].RefBCb * uvcosf;
-	vco[0] = trans[ref].RefRCr * uvcosf;
-	vco[1] = trans[ref].RefGCb * uvsinf + trans[ref].RefGCr * uvcosf;
-	vco[2] = trans[ref].RefBCb * uvsinf;
-	yoff = Loff * yco + bright;
-
-	if ((uco[0] > 2.0) || (uco[2] > 2.0)) {
-	    needux8 = TRUE;
-	    ucscale = 0.125;
-	}
-	if ((vco[0] > 2.0) || (vco[2] > 2.0)) {
-	    needvx8 = TRUE;
-	    vcscale = 0.125;
-	}
-
-	txformat0 = (((((pPriv->w + 1 ) >> 1) - 1) & 0x7ff) |
-		     (((((pPriv->h + 1 ) >> 1 ) - 1) & 0x7ff) << RADEON_TEX_VSIZE_SHIFT));
-	txpitch = ((pPriv->src_pitch >> 1) + 63) & ~63;
-	txpitch -= 32;
-	txfilter =  R200_MAG_FILTER_LINEAR |
-	    R200_MIN_FILTER_LINEAR |
-	    R200_CLAMP_S_CLAMP_LAST |
-	    R200_CLAMP_T_CLAMP_LAST;
+    } else {
+	vtx_count = 4;
 
 	BEGIN_ACCEL(24);
 
@@ -911,45 +855,6 @@ FUNC_NAME(R200DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 						      0.0));
 
 	FINISH_ACCEL();
-    } else {
-	BEGIN_ACCEL(13);
-	OUT_ACCEL_REG(RADEON_PP_CNTL,
-		      RADEON_TEX_0_ENABLE | RADEON_TEX_BLEND_0_ENABLE);
-
-	OUT_ACCEL_REG(R200_SE_VTX_FMT_0, R200_VTX_XY);
-	OUT_ACCEL_REG(R200_SE_VTX_FMT_1,
-		      (2 << R200_VTX_TEX0_COMP_CNT_SHIFT));
-
-	OUT_ACCEL_REG(R200_PP_TXFILTER_0,
-		      R200_MAG_FILTER_LINEAR |
-		      R200_MIN_FILTER_LINEAR |
-		      R200_CLAMP_S_CLAMP_LAST |
-		      R200_CLAMP_T_CLAMP_LAST |
-		      R200_YUV_TO_RGB);
-	OUT_ACCEL_REG(R200_PP_TXFORMAT_0, txformat);
-	OUT_ACCEL_REG(R200_PP_TXFORMAT_X_0, 0);
-	OUT_ACCEL_REG(R200_PP_TXSIZE_0,
-		      (pPriv->w - 1) |
-		      ((pPriv->h - 1) << RADEON_TEX_VSIZE_SHIFT));
-	OUT_ACCEL_REG(R200_PP_TXPITCH_0, pPriv->src_pitch - 32);
-
-	OUT_ACCEL_REG(R200_PP_TXOFFSET_0, pPriv->src_offset);
-
-	OUT_ACCEL_REG(R200_PP_TXCBLEND_0,
-		      R200_TXC_ARG_A_ZERO |
-		      R200_TXC_ARG_B_ZERO |
-		      R200_TXC_ARG_C_R0_COLOR |
-		      R200_TXC_OP_MADD);
-	OUT_ACCEL_REG(R200_PP_TXCBLEND2_0,
-		      R200_TXC_CLAMP_0_1 | R200_TXC_OUTPUT_REG_R0);
-	OUT_ACCEL_REG(R200_PP_TXABLEND_0,
-		      R200_TXA_ARG_A_ZERO |
-		      R200_TXA_ARG_B_ZERO |
-		      R200_TXA_ARG_C_R0_ALPHA |
-		      R200_TXA_OP_MADD);
-	OUT_ACCEL_REG(R200_PP_TXABLEND2_0,
-		      R200_TXA_CLAMP_0_1 | R200_TXA_OUTPUT_REG_R0);
-	FINISH_ACCEL();
     }
 
     if (pPriv->vsync) {
@@ -1021,7 +926,6 @@ FUNC_NAME(R200DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 	if (isplanar) {
 	    /*
 	     * Just render a rect (using three coords).
-	     * Filter is a bit a misnomer, it's just texcoords...
 	     */
 	    VTX_OUT_6((float)dstX,                                (float)(dstY + dsth),
 		      (float)srcX / info->accel_state->texW[0],          (float)(srcY + srch) / info->accel_state->texH[0],
commit 17685fefba68d188c7c0fe7a079180ec0722c046
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Mon Apr 13 20:06:11 2009 -0400

    Tex vid: general cleanup
    
    - convert macros to more meaningful VTX_OUT_4 and VTX_OUT_6
      names to reflect what they actually do
    - fix indenting

diff --git a/src/radeon_textured_videofuncs.c b/src/radeon_textured_videofuncs.c
index 83aa101..76c8456 100644
--- a/src/radeon_textured_videofuncs.c
+++ b/src/radeon_textured_videofuncs.c
@@ -47,8 +47,8 @@
 
 #ifdef ACCEL_CP
 
-#define VTX_OUT_FILTER(_dstX, _dstY, _srcX, _srcY, _maskX, _maskY)	\
-do {									\
+#define VTX_OUT_6(_dstX, _dstY, _srcX, _srcY, _maskX, _maskY)	\
+do {								\
     OUT_RING_F(_dstX);						\
     OUT_RING_F(_dstY);						\
     OUT_RING_F(_srcX);						\
@@ -57,7 +57,7 @@ do {									\
     OUT_RING_F(_maskY);						\
 } while (0)
 
-#define VTX_OUT(_dstX, _dstY, _srcX, _srcY)	\
+#define VTX_OUT_4(_dstX, _dstY, _srcX, _srcY)			\
 do {								\
     OUT_RING_F(_dstX);						\
     OUT_RING_F(_dstY);						\
@@ -67,7 +67,7 @@ do {								\
 
 #else /* ACCEL_CP */
 
-#define VTX_OUT_FILTER(_dstX, _dstY, _srcX, _srcY, _maskX, _maskY)	\
+#define VTX_OUT_6(_dstX, _dstY, _srcX, _srcY, _maskX, _maskY)		\
 do {									\
     OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, _dstX);			\
     OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, _dstY);			\
@@ -77,7 +77,7 @@ do {									\
     OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, _maskY);			\
 } while (0)
 
-#define VTX_OUT(_dstX, _dstY, _srcX, _srcY)	\
+#define VTX_OUT_4(_dstX, _dstY, _srcX, _srcY)			\
 do {								\
     OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, _dstX);		\
     OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, _dstY);		\
@@ -404,25 +404,25 @@ FUNC_NAME(RADEONDisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv
 	     * Just render a rect (using three coords).
 	     * Filter is a bit a misnomer, it's just texcoords...
 	     */
-	    VTX_OUT_FILTER((float)dstX,                (float)(dstY + dsth),
-			   (float)srcX,                (float)(srcY + srch),
-			   (float)srcX,                (float)(srcY + (srch / 2)));
-	    VTX_OUT_FILTER((float)(dstX + dstw),       (float)(dstY + dsth),
-			   (float)(srcX + srcw),       (float)(srcY + srch),
-			   (float)(srcX + (srcw / 2)), (float)(srcY + (srch / 2)));
-	    VTX_OUT_FILTER((float)(dstX + dstw),       (float)dstY,
-			   (float)(srcX + srcw),       (float)srcY,
-			   (float)(srcX + (srcw / 2)), (float)srcY);
+	    VTX_OUT_6((float)dstX,                (float)(dstY + dsth),
+		      (float)srcX,                (float)(srcY + srch),
+		      (float)srcX,                (float)(srcY + (srch / 2)));
+	    VTX_OUT_6((float)(dstX + dstw),       (float)(dstY + dsth),
+		      (float)(srcX + srcw),       (float)(srcY + srch),
+		      (float)(srcX + (srcw / 2)), (float)(srcY + (srch / 2)));
+	    VTX_OUT_6((float)(dstX + dstw),       (float)dstY,
+		      (float)(srcX + srcw),       (float)srcY,
+		      (float)(srcX + (srcw / 2)), (float)srcY);
 	} else {
 	    /*
 	     * Just render a rect (using three coords).
 	     */
-	    VTX_OUT((float)dstX,          (float)(dstY + dsth),
-		    (float)srcX,          (float)(srcY + srch));
-	    VTX_OUT((float)(dstX + dstw), (float)(dstY + dsth),
-		    (float)(srcX + srcw), (float)(srcY + srch));
-	    VTX_OUT((float)(dstX + dstw), (float)dstY,
-		    (float)(srcX + srcw), (float)srcY);
+	    VTX_OUT_4((float)dstX,          (float)(dstY + dsth),
+		      (float)srcX,          (float)(srcY + srch));
+	    VTX_OUT_4((float)(dstX + dstw), (float)(dstY + dsth),
+		      (float)(srcX + srcw), (float)(srcY + srch));
+	    VTX_OUT_4((float)(dstX + dstw), (float)dstY,
+		      (float)(srcX + srcw), (float)srcY);
 	}
 
 	pBox++;
@@ -480,20 +480,20 @@ FUNC_NAME(R200DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 	RADEON_SWITCH_TO_3D();
     } else
 #endif
-	{
-	    BEGIN_ACCEL(2);
-	    OUT_ACCEL_REG(RADEON_RB3D_DSTCACHE_CTLSTAT, RADEON_RB3D_DC_FLUSH);
-	    /* We must wait for 3d to idle, in case source was just written as a dest. */
-	    OUT_ACCEL_REG(RADEON_WAIT_UNTIL,
-			  RADEON_WAIT_HOST_IDLECLEAN |
-			  RADEON_WAIT_2D_IDLECLEAN |
-			  RADEON_WAIT_3D_IDLECLEAN |
-			  RADEON_WAIT_DMA_GUI_IDLE);
-	    FINISH_ACCEL();
-
-	    if (!info->accel_state->XInited3D)
-		RADEONInit3DEngine(pScrn);
-	}
+    {
+	BEGIN_ACCEL(2);
+	OUT_ACCEL_REG(RADEON_RB3D_DSTCACHE_CTLSTAT, RADEON_RB3D_DC_FLUSH);
+	/* We must wait for 3d to idle, in case source was just written as a dest. */
+	OUT_ACCEL_REG(RADEON_WAIT_UNTIL,
+		      RADEON_WAIT_HOST_IDLECLEAN |
+		      RADEON_WAIT_2D_IDLECLEAN |
+		      RADEON_WAIT_3D_IDLECLEAN |
+		      RADEON_WAIT_DMA_GUI_IDLE);
+	FINISH_ACCEL();
+
+	if (!info->accel_state->XInited3D)
+	    RADEONInit3DEngine(pScrn);
+    }
 
     vtx_count = 4;
 
@@ -534,10 +534,8 @@ FUNC_NAME(R200DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 
     BEGIN_ACCEL(4);
 
-    OUT_ACCEL_REG(RADEON_RB3D_CNTL,
-		  dst_format /*| RADEON_ALPHA_BLEND_ENABLE*/);
+    OUT_ACCEL_REG(RADEON_RB3D_CNTL, dst_format);
     OUT_ACCEL_REG(RADEON_RB3D_COLOROFFSET, dst_offset);
-
     OUT_ACCEL_REG(RADEON_RB3D_COLORPITCH, colorpitch);
 
     OUT_ACCEL_REG(RADEON_RB3D_BLENDCNTL,
@@ -990,14 +988,14 @@ FUNC_NAME(R200DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
      */
 
 #ifdef ACCEL_CP
-	BEGIN_RING(nBox * 3 * vtx_count + 2);
+	BEGIN_RING(nBox * 3 * vtx_count + 4);
 	OUT_RING(CP_PACKET3(R200_CP_PACKET3_3D_DRAW_IMMD_2,
 			    nBox * 3 * vtx_count));
 	OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_RECT_LIST |
 		 RADEON_CP_VC_CNTL_PRIM_WALK_RING |
 		 ((nBox * 3) << RADEON_CP_VC_CNTL_NUM_SHIFT));
 #else /* ACCEL_CP */
-	BEGIN_ACCEL(nBox * 3 * vtx_count + 1);
+	BEGIN_ACCEL(nBox * 3 * vtx_count + 2);
 	OUT_ACCEL_REG(RADEON_SE_VF_CNTL, (RADEON_VF_PRIM_TYPE_RECTANGLE_LIST |
 					  RADEON_VF_PRIM_WALK_DATA |
 					  ((nBox * 3) << RADEON_VF_NUM_VERTICES_SHIFT)));
@@ -1025,40 +1023,38 @@ FUNC_NAME(R200DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 	     * Just render a rect (using three coords).
 	     * Filter is a bit a misnomer, it's just texcoords...
 	     */
-	    VTX_OUT_FILTER((float)dstX,                                (float)(dstY + dsth),
-			   (float)srcX / info->accel_state->texW[0],          (float)(srcY + srch) / info->accel_state->texH[0],
-			   (float)srcX / info->accel_state->texW[0],          (float)(srcY + srch) / info->accel_state->texH[0]);
-	    VTX_OUT_FILTER((float)(dstX + dstw),                       (float)(dstY + dsth),
-			   (float)(srcX + srcw) / info->accel_state->texW[0], (float)(srcY + srch) / info->accel_state->texH[0],
-			   (float)(srcX + srcw) / info->accel_state->texW[0], (float)(srcY + srch) / info->accel_state->texH[0]);
-	    VTX_OUT_FILTER((float)(dstX + dstw),                       (float)dstY,
-			   (float)(srcX + srcw) / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0],
-			   (float)(srcX + srcw) / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0]);
+	    VTX_OUT_6((float)dstX,                                (float)(dstY + dsth),
+		      (float)srcX / info->accel_state->texW[0],          (float)(srcY + srch) / info->accel_state->texH[0],
+		      (float)srcX / info->accel_state->texW[0],          (float)(srcY + srch) / info->accel_state->texH[0]);
+	    VTX_OUT_6((float)(dstX + dstw),                       (float)(dstY + dsth),
+		      (float)(srcX + srcw) / info->accel_state->texW[0], (float)(srcY + srch) / info->accel_state->texH[0],
+		      (float)(srcX + srcw) / info->accel_state->texW[0], (float)(srcY + srch) / info->accel_state->texH[0]);
+	    VTX_OUT_6((float)(dstX + dstw),                       (float)dstY,
+		      (float)(srcX + srcw) / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0],
+		      (float)(srcX + srcw) / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0]);
 	} else {
 	    /*
 	     * Just render a rect (using three coords).
 	     */
-	    VTX_OUT((float)dstX,                                       (float)(dstY + dsth),
-		    (float)srcX / info->accel_state->texW[0],          (float)(srcY + srch) / info->accel_state->texH[0]);
-	    VTX_OUT((float)(dstX + dstw),                              (float)(dstY + dsth),
-		    (float)(srcX + srcw) / info->accel_state->texW[0], (float)(srcY + srch) / info->accel_state->texH[0]);
-	    VTX_OUT((float)(dstX + dstw),                              (float)dstY,
-		    (float)(srcX + srcw) / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0]);
+	    VTX_OUT_4((float)dstX,                                       (float)(dstY + dsth),
+		      (float)srcX / info->accel_state->texW[0],          (float)(srcY + srch) / info->accel_state->texH[0]);
+	    VTX_OUT_4((float)(dstX + dstw),                              (float)(dstY + dsth),
+		      (float)(srcX + srcw) / info->accel_state->texW[0], (float)(srcY + srch) / info->accel_state->texH[0]);
+	    VTX_OUT_4((float)(dstX + dstw),                              (float)dstY,
+		      (float)(srcX + srcw) / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0]);
 	}
 
 	pBox++;
     }
 
+    OUT_ACCEL_REG(RADEON_WAIT_UNTIL, RADEON_WAIT_3D_IDLECLEAN);
+
 #ifdef ACCEL_CP
 	ADVANCE_RING();
 #else
 	FINISH_ACCEL();
 #endif /* !ACCEL_CP */
 
-    BEGIN_ACCEL(1);
-    OUT_ACCEL_REG(RADEON_WAIT_UNTIL, RADEON_WAIT_3D_IDLECLEAN);
-    FINISH_ACCEL();
-
     DamageDamageRegion(pPriv->pDraw, &pPriv->clip);
 }
 
@@ -1104,20 +1100,20 @@ FUNC_NAME(R300DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 	RADEON_SWITCH_TO_3D();
     } else
 #endif
-	{
-	    BEGIN_ACCEL(2);
-	    OUT_ACCEL_REG(R300_RB3D_DSTCACHE_CTLSTAT, R300_DC_FLUSH_3D);
-	    /* We must wait for 3d to idle, in case source was just written as a dest. */
-	    OUT_ACCEL_REG(RADEON_WAIT_UNTIL,
-			  RADEON_WAIT_HOST_IDLECLEAN |
-			  RADEON_WAIT_2D_IDLECLEAN |
-			  RADEON_WAIT_3D_IDLECLEAN |
-			  RADEON_WAIT_DMA_GUI_IDLE);
-	    FINISH_ACCEL();
-
-	    if (!info->accel_state->XInited3D)
-		RADEONInit3DEngine(pScrn);
-	}
+    {
+	BEGIN_ACCEL(2);
+	OUT_ACCEL_REG(R300_RB3D_DSTCACHE_CTLSTAT, R300_DC_FLUSH_3D);
+	/* We must wait for 3d to idle, in case source was just written as a dest. */
+	OUT_ACCEL_REG(RADEON_WAIT_UNTIL,
+		      RADEON_WAIT_HOST_IDLECLEAN |
+		      RADEON_WAIT_2D_IDLECLEAN |
+		      RADEON_WAIT_3D_IDLECLEAN |
+		      RADEON_WAIT_DMA_GUI_IDLE);
+	FINISH_ACCEL();
+
+	if (!info->accel_state->XInited3D)
+	    RADEONInit3DEngine(pScrn);
+    }
 
     if (pPriv->bicubic_enabled)
 	vtx_count = 6;
@@ -2191,54 +2187,57 @@ FUNC_NAME(R300DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 		 * have to deal with the legacy handling.
 		 */
 	    if (use_quad) {
-		VTX_OUT_FILTER((float)dstX,                                       (float)dstY,
-			       (float)srcX / info->accel_state->texW[0],          (float)srcY / info->accel_state->texH[0],
-			       (float)srcX + 0.5,                                 (float)srcY + 0.5);
-		VTX_OUT_FILTER((float)dstX,                                       (float)(dstY + dsth),
-			       (float)srcX / info->accel_state->texW[0],          (float)(srcY + srch) / info->accel_state->texH[0],
-			       (float)srcX + 0.5,                                 (float)(srcY + srch) + 0.5);
-		VTX_OUT_FILTER((float)(dstX + dstw),                              (float)(dstY + dsth),
-			       (float)(srcX + srcw) / info->accel_state->texW[0], (float)(srcY + srch) / info->accel_state->texH[0],
-			       (float)(srcX + srcw) + 0.5,                        (float)(srcY + srch) + 0.5);
-		VTX_OUT_FILTER((float)(dstX + dstw),                              (float)dstY,
-			       (float)(srcX + srcw) / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0],
-			       (float)(srcX + srcw) + 0.5,                        (float)srcY + 0.5);
+		VTX_OUT_6((float)dstX,                                       (float)dstY,
+			  (float)srcX / info->accel_state->texW[0],          (float)srcY / info->accel_state->texH[0],
+			  (float)srcX + 0.5,                                 (float)srcY + 0.5);
+		VTX_OUT_6((float)dstX,                                       (float)(dstY + dsth),
+			  (float)srcX / info->accel_state->texW[0],          (float)(srcY + srch) / info->accel_state->texH[0],
+			  (float)srcX + 0.5,                                 (float)(srcY + srch) + 0.5);
+		VTX_OUT_6((float)(dstX + dstw),                              (float)(dstY + dsth),
+			  (float)(srcX + srcw) / info->accel_state->texW[0], (float)(srcY + srch) / info->accel_state->texH[0],
+			  (float)(srcX + srcw) + 0.5,                        (float)(srcY + srch) + 0.5);
+		VTX_OUT_6((float)(dstX + dstw),                              (float)dstY,
+			  (float)(srcX + srcw) / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0],
+			  (float)(srcX + srcw) + 0.5,                        (float)srcY + 0.5);
 	    } else {
-		VTX_OUT_FILTER((float)dstX,                                       (float)dstY,
-			       (float)srcX / info->accel_state->texW[0],          (float)srcY / info->accel_state->texH[0],
-			       (float)srcX + 0.5,                                 (float)srcY + 0.5);
-		VTX_OUT_FILTER((float)dstX,                                       (float)(dstY + dstw + dsth),
-			       (float)srcX / info->accel_state->texW[0],          ((float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0)) / info->accel_state->texH[0],
-			       (float)srcX + 0.5,                                 (float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0) + 0.5);
-		VTX_OUT_FILTER((float)(dstX + dstw + dsth),                       (float)dstY,
-			       ((float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0)) / info->accel_state->texW[0],
-			                                                          (float)srcY / info->accel_state->texH[0],
-			       (float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0) + 0.5,
-			                                                          (float)srcY + 0.5);
+		VTX_OUT_6((float)dstX,                                       (float)dstY,
+			  (float)srcX / info->accel_state->texW[0],          (float)srcY / info->accel_state->texH[0],
+			  (float)srcX + 0.5,                                 (float)srcY + 0.5);
+		VTX_OUT_6((float)dstX,                                       (float)(dstY + dstw + dsth),
+			  (float)srcX / info->accel_state->texW[0],
+			  ((float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0)) / info->accel_state->texH[0],
+			  (float)srcX + 0.5,
+			  (float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0) + 0.5);
+		VTX_OUT_6((float)(dstX + dstw + dsth),                       (float)dstY,
+			  ((float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0)) / info->accel_state->texW[0],
+			  (float)srcY / info->accel_state->texH[0],
+			  (float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0) + 0.5,
+			  (float)srcY + 0.5);
 	    }
 	} else {
 	    if (use_quad) {
-		VTX_OUT((float)dstX,                                       (float)dstY,
-			(float)srcX / info->accel_state->texW[0],          (float)srcY / info->accel_state->texH[0]);
-		VTX_OUT((float)dstX,                                       (float)(dstY + dsth),
-			(float)srcX / info->accel_state->texW[0],          (float)(srcY + srch) / info->accel_state->texH[0]);
-		VTX_OUT((float)(dstX + dstw),                              (float)(dstY + dsth),
-			(float)(srcX + srcw) / info->accel_state->texW[0], (float)(srcY + srch) / info->accel_state->texH[0]);
-		VTX_OUT((float)(dstX + dstw),                              (float)dstY,
-			(float)(srcX + srcw) / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0]);
+		VTX_OUT_4((float)dstX,                                       (float)dstY,
+			  (float)srcX / info->accel_state->texW[0],          (float)srcY / info->accel_state->texH[0]);
+		VTX_OUT_4((float)dstX,                                       (float)(dstY + dsth),
+			  (float)srcX / info->accel_state->texW[0],          (float)(srcY + srch) / info->accel_state->texH[0]);
+		VTX_OUT_4((float)(dstX + dstw),                              (float)(dstY + dsth),
+			  (float)(srcX + srcw) / info->accel_state->texW[0], (float)(srcY + srch) / info->accel_state->texH[0]);
+		VTX_OUT_4((float)(dstX + dstw),                              (float)dstY,
+			  (float)(srcX + srcw) / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0]);
 	    } else {
 		/*
 		 * Render a big, scissored triangle. This means
 		 * increasing the triangle size and adjusting
 		 * texture coordinates.
 		 */
-		VTX_OUT((float)dstX,                              (float)dstY,
-			(float)srcX / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0]);
-		VTX_OUT((float)dstX,                              (float)(dstY + dsth + dstw),
-			(float)srcX / info->accel_state->texW[0], ((float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0)) / info->accel_state->texH[0]);
-		VTX_OUT((float)(dstX + dstw + dsth),              (float)dstY,
-			((float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0)) / info->accel_state->texW[0],
-			(float)srcY / info->accel_state->texH[0]);
+		VTX_OUT_4((float)dstX,                              (float)dstY,
+			  (float)srcX / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0]);
+		VTX_OUT_4((float)dstX,                              (float)(dstY + dsth + dstw),
+			  (float)srcX / info->accel_state->texW[0],
+			  ((float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0)) / info->accel_state->texH[0]);
+		VTX_OUT_4((float)(dstX + dstw + dsth),              (float)dstY,
+			  ((float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0)) / info->accel_state->texW[0],
+			  (float)srcY / info->accel_state->texH[0]);
 	    }
 	}
 
@@ -2305,20 +2304,20 @@ FUNC_NAME(R500DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 	RADEON_SWITCH_TO_3D();
     } else
 #endif
-	{
-	    BEGIN_ACCEL(2);
-	    OUT_ACCEL_REG(R300_RB3D_DSTCACHE_CTLSTAT, R300_DC_FLUSH_3D);
-	    /* We must wait for 3d to idle, in case source was just written as a dest. */
-	    OUT_ACCEL_REG(RADEON_WAIT_UNTIL,
-			  RADEON_WAIT_HOST_IDLECLEAN |
-			  RADEON_WAIT_2D_IDLECLEAN |
-			  RADEON_WAIT_3D_IDLECLEAN |
-			  RADEON_WAIT_DMA_GUI_IDLE);
-	    FINISH_ACCEL();
-
-	    if (!info->accel_state->XInited3D)
-		RADEONInit3DEngine(pScrn);
-	}
+    {
+	BEGIN_ACCEL(2);
+	OUT_ACCEL_REG(R300_RB3D_DSTCACHE_CTLSTAT, R300_DC_FLUSH_3D);
+	/* We must wait for 3d to idle, in case source was just written as a dest. */
+	OUT_ACCEL_REG(RADEON_WAIT_UNTIL,
+		      RADEON_WAIT_HOST_IDLECLEAN |
+		      RADEON_WAIT_2D_IDLECLEAN |
+		      RADEON_WAIT_3D_IDLECLEAN |
+		      RADEON_WAIT_DMA_GUI_IDLE);
+	FINISH_ACCEL();
+
+	if (!info->accel_state->XInited3D)
+	    RADEONInit3DEngine(pScrn);
+    }
 
     if (pPriv->bicubic_enabled)
 	vtx_count = 6;
@@ -3207,30 +3206,30 @@ FUNC_NAME(R500DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 					  (3 << RADEON_VF_NUM_VERTICES_SHIFT)));
 #endif
 	if (pPriv->bicubic_enabled) {
-	    VTX_OUT_FILTER((float)dstX,                                       (float)dstY,
-			   (float)srcX / info->accel_state->texW[0],          (float)srcY / info->accel_state->texH[0],
-			   (float)srcX + 0.5,                                 (float)srcY + 0.5);
-	    VTX_OUT_FILTER((float)dstX,                                       (float)(dstY + dstw + dsth),
-			   (float)srcX / info->accel_state->texW[0],          ((float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0)) / info->accel_state->texH[0],
-			   (float)srcX + 0.5,                                 (float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0) + 0.5);
-	    VTX_OUT_FILTER((float)(dstX + dstw + dsth),                       (float)dstY,
-			   ((float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0)) / info->accel_state->texW[0],
-			   (float)srcY / info->accel_state->texH[0],
-			   (float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0) + 0.5,
-			   (float)srcY + 0.5);
+	    VTX_OUT_6((float)dstX,                                       (float)dstY,
+		      (float)srcX / info->accel_state->texW[0],          (float)srcY / info->accel_state->texH[0],
+		      (float)srcX + 0.5,                                 (float)srcY + 0.5);
+	    VTX_OUT_6((float)dstX,                                       (float)(dstY + dstw + dsth),
+		      (float)srcX / info->accel_state->texW[0],          ((float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0)) / info->accel_state->texH[0],
+		      (float)srcX + 0.5,                                 (float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0) + 0.5);
+	    VTX_OUT_6((float)(dstX + dstw + dsth),                       (float)dstY,
+		      ((float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0)) / info->accel_state->texW[0],
+		      (float)srcY / info->accel_state->texH[0],
+		      (float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0) + 0.5,
+		      (float)srcY + 0.5);
 	} else {
 	    /*
 	     * Render a big, scissored triangle. This means
 	     * increasing the triangle size and adjusting
 	     * texture coordinates.
 	     */
-	    VTX_OUT((float)dstX,                              (float)dstY,
-		    (float)srcX / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0]);
-	    VTX_OUT((float)dstX,                              (float)(dstY + dsth + dstw),
-		    (float)srcX / info->accel_state->texW[0], ((float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0)) / info->accel_state->texH[0]);
-	    VTX_OUT((float)(dstX + dstw + dsth),              (float)dstY,
-		    ((float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0)) / info->accel_state->texW[0],
-		    (float)srcY / info->accel_state->texH[0]);
+	    VTX_OUT_4((float)dstX,                              (float)dstY,
+		      (float)srcX / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0]);
+	    VTX_OUT_4((float)dstX,                              (float)(dstY + dsth + dstw),
+		      (float)srcX / info->accel_state->texW[0], ((float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0)) / info->accel_state->texH[0]);
+	    VTX_OUT_4((float)(dstX + dstw + dsth),              (float)dstY,
+		      ((float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0)) / info->accel_state->texW[0],
+		      (float)srcY / info->accel_state->texH[0]);
 	}
 
 	/* flushing is pipelined, free/finish is not */
@@ -3254,6 +3253,6 @@ FUNC_NAME(R500DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
     DamageDamageRegion(pPriv->pDraw, &pPriv->clip);
 }
 
-#undef VTX_OUT
-#undef VTX_OUT_FILTER
+#undef VTX_OUT_4
+#undef VTX_OUT_6
 #undef FUNC_NAME
commit 093ab4c9a33b0b396b78c061c3321dc044bdccdc
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Mon Apr 13 19:48:35 2009 -0400

    R1xx: add support for native planar textured Xv

diff --git a/src/radeon_textured_video.c b/src/radeon_textured_video.c
index f64da02..f657536 100644
--- a/src/radeon_textured_video.c
+++ b/src/radeon_textured_video.c
@@ -374,7 +374,7 @@ RADEONPutImageTextured(ScrnInfoPtr pScrn,
     }
 
     pPriv->planar_hw = pPriv->planar_state;
-    if (pPriv->bicubic_enabled || !( IS_R300_3D || IS_R200_3D ))
+    if (pPriv->bicubic_enabled || (IS_R600_3D || IS_R500_3D))
 	pPriv->planar_hw = 0;
 
     switch(id) {
@@ -663,11 +663,12 @@ static XF86VideoFormatRec Formats[NUM_FORMATS] =
     {15, TrueColor}, {16, TrueColor}, {24, TrueColor}
 };
 
-#define NUM_ATTRIBUTES 1
+#define NUM_ATTRIBUTES 2
 
 static XF86AttributeRec Attributes[NUM_ATTRIBUTES+1] =
 {
     {XvSettable | XvGettable, 0, 1, "XV_VSYNC"},
+    {XvSettable | XvGettable, 0, 1, "XV_HWPLANAR"},
     {0, 0, 0, NULL}
 };
 
@@ -710,6 +711,14 @@ static XF86AttributeRec Attributes_r500[NUM_ATTRIBUTES_R500+1] =
     {0, 0, 0, NULL}
 };
 
+#define NUM_ATTRIBUTES_R600 1
+
+static XF86AttributeRec Attributes_r600[NUM_ATTRIBUTES_R600+1] =
+{
+    {XvSettable | XvGettable, 0, 1, "XV_VSYNC"},
+    {0, 0, 0, NULL}
+};
+
 static Atom xvBicubic;
 static Atom xvVSync;
 static Atom xvHWPlanar;
@@ -841,14 +850,18 @@ RADEONSetupImageTexturedVideo(ScreenPtr pScreen)
     pPortPriv =
 	(RADEONPortPrivPtr)(&adapt->pPortPrivates[num_texture_ports]);
 
-    if (IS_R300_3D) {
-	adapt->pAttributes = Attributes_r300;
-	adapt->nAttributes = NUM_ATTRIBUTES_R300;
+    if (IS_R600_3D) {
+	adapt->pAttributes = Attributes_r600;
+	adapt->nAttributes = NUM_ATTRIBUTES_R600;
     }
     else if (IS_R500_3D) {
 	adapt->pAttributes = Attributes_r500;
 	adapt->nAttributes = NUM_ATTRIBUTES_R500;
     }
+    else if (IS_R300_3D) {
+	adapt->pAttributes = Attributes_r300;
+	adapt->nAttributes = NUM_ATTRIBUTES_R300;
+    }
     else if (IS_R200_3D) {
 	adapt->pAttributes = Attributes_r200;
 	adapt->nAttributes = NUM_ATTRIBUTES_R200;
diff --git a/src/radeon_textured_videofuncs.c b/src/radeon_textured_videofuncs.c
index 360532e..83aa101 100644
--- a/src/radeon_textured_videofuncs.c
+++ b/src/radeon_textured_videofuncs.c
@@ -92,7 +92,7 @@ FUNC_NAME(RADEONDisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv
 {
     RADEONInfoPtr info = RADEONPTR(pScrn);
     PixmapPtr pPixmap = pPriv->pPixmap;
-    uint32_t txformat;
+    uint32_t txformat, txsize, txpitch;
     uint32_t dst_offset, dst_pitch, dst_format;
     uint32_t colorpitch;
     Bool isplanar = FALSE;
@@ -128,22 +128,20 @@ FUNC_NAME(RADEONDisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv
 	RADEON_SWITCH_TO_3D();
     } else
 #endif
-	{
-	    BEGIN_ACCEL(2);
-	    OUT_ACCEL_REG(RADEON_RB3D_DSTCACHE_CTLSTAT, RADEON_RB3D_DC_FLUSH);
-	    /* We must wait for 3d to idle, in case source was just written as a dest. */
-	    OUT_ACCEL_REG(RADEON_WAIT_UNTIL,
-			  RADEON_WAIT_HOST_IDLECLEAN |
-			  RADEON_WAIT_2D_IDLECLEAN |
-			  RADEON_WAIT_3D_IDLECLEAN |
-			  RADEON_WAIT_DMA_GUI_IDLE);
-	    FINISH_ACCEL();
-
-	    if (!info->accel_state->XInited3D)
-		RADEONInit3DEngine(pScrn);
-	}
+    {
+	BEGIN_ACCEL(2);
+	OUT_ACCEL_REG(RADEON_RB3D_DSTCACHE_CTLSTAT, RADEON_RB3D_DC_FLUSH);
+	/* We must wait for 3d to idle, in case source was just written as a dest. */
+	OUT_ACCEL_REG(RADEON_WAIT_UNTIL,
+		      RADEON_WAIT_HOST_IDLECLEAN |
+		      RADEON_WAIT_2D_IDLECLEAN |
+		      RADEON_WAIT_3D_IDLECLEAN |
+		      RADEON_WAIT_DMA_GUI_IDLE);
+	FINISH_ACCEL();
 
-    vtx_count = 4;
+	if (!info->accel_state->XInited3D)
+	    RADEONInit3DEngine(pScrn);
+    }
 
     /* Same for R100/R200 */
     switch (pPixmap->drawable.bitsPerPixel) {
@@ -165,7 +163,7 @@ FUNC_NAME(RADEONDisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv
     }
 
     if (isplanar) {
-	txformat = RADEON_TXFORMAT_I8;
+	txformat = RADEON_TXFORMAT_Y8;
     } else {
 	if (pPriv->id == FOURCC_UYVY)
 	    txformat = RADEON_TXFORMAT_YVYU422;
@@ -182,56 +180,149 @@ FUNC_NAME(RADEONDisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv
 
     BEGIN_ACCEL(4);
 
-    OUT_ACCEL_REG(RADEON_RB3D_CNTL,
-		  dst_format /*| RADEON_ALPHA_BLEND_ENABLE*/);
+    OUT_ACCEL_REG(RADEON_RB3D_CNTL, dst_format);
     OUT_ACCEL_REG(RADEON_RB3D_COLOROFFSET, dst_offset);
-
     OUT_ACCEL_REG(RADEON_RB3D_COLORPITCH, colorpitch);
-
     OUT_ACCEL_REG(RADEON_RB3D_BLENDCNTL,
 		  RADEON_SRC_BLEND_GL_ONE | RADEON_DST_BLEND_GL_ZERO);
 
     FINISH_ACCEL();
 
+    if (isplanar) {
+	/* need 2 texcoord sets (even though they are identical) due
+	   to denormalization! hw apparently can't premultiply
+	   same coord set by different texture size */
+	vtx_count = 6;
 
-    info->accel_state->texW[0] = 1;
-    info->accel_state->texH[0] = 1;
-
-    BEGIN_ACCEL(9);
-
-    OUT_ACCEL_REG(RADEON_PP_CNTL,
-		  RADEON_TEX_0_ENABLE | RADEON_TEX_BLEND_0_ENABLE);
-
-    OUT_ACCEL_REG(RADEON_SE_VTX_FMT, (RADEON_SE_VTX_FMT_XY |
-				      RADEON_SE_VTX_FMT_ST0));
-
-    OUT_ACCEL_REG(RADEON_PP_TXFILTER_0,
-		  RADEON_MAG_FILTER_LINEAR |
-		  RADEON_MIN_FILTER_LINEAR |
-		  RADEON_CLAMP_S_CLAMP_LAST |
-		  RADEON_CLAMP_T_CLAMP_LAST |
-		  RADEON_YUV_TO_RGB);
-    OUT_ACCEL_REG(RADEON_PP_TXFORMAT_0, txformat);
-    OUT_ACCEL_REG(RADEON_PP_TXOFFSET_0, pPriv->src_offset);
-    OUT_ACCEL_REG(RADEON_PP_TXCBLEND_0,
-		  RADEON_COLOR_ARG_A_ZERO |
-		  RADEON_COLOR_ARG_B_ZERO |
-		  RADEON_COLOR_ARG_C_T0_COLOR |
-		  RADEON_BLEND_CTL_ADD |
-		  RADEON_CLAMP_TX);
-    OUT_ACCEL_REG(RADEON_PP_TXABLEND_0,
-		  RADEON_ALPHA_ARG_A_ZERO |
-		  RADEON_ALPHA_ARG_B_ZERO |
-		  RADEON_ALPHA_ARG_C_T0_ALPHA |
-		  RADEON_BLEND_CTL_ADD |
-		  RADEON_CLAMP_TX);
-
-    OUT_ACCEL_REG(RADEON_PP_TEX_SIZE_0,
-		  (pPriv->w - 1) |
-		  ((pPriv->h - 1) << RADEON_TEX_VSIZE_SHIFT));
-    OUT_ACCEL_REG(RADEON_PP_TEX_PITCH_0,
-		  pPriv->src_pitch - 32);
-    FINISH_ACCEL();
+	txsize = (((((pPriv->w + 1 ) >> 1) - 1) & 0x7ff) |
+		  (((((pPriv->h + 1 ) >> 1) - 1) & 0x7ff) << RADEON_TEX_VSIZE_SHIFT));
+	txpitch = ((pPriv->src_pitch >> 1) + 63) & ~63;
+	txpitch -= 32;
+
+	BEGIN_ACCEL(23);
+
+	OUT_ACCEL_REG(RADEON_SE_VTX_FMT, (RADEON_SE_VTX_FMT_XY |
+					  RADEON_SE_VTX_FMT_ST0 |
+					  RADEON_SE_VTX_FMT_ST1));
+
+	OUT_ACCEL_REG(RADEON_PP_CNTL, (RADEON_TEX_0_ENABLE | RADEON_TEX_BLEND_0_ENABLE |
+				       RADEON_TEX_1_ENABLE | RADEON_TEX_BLEND_1_ENABLE |
+				       RADEON_TEX_2_ENABLE | RADEON_TEX_BLEND_2_ENABLE |
+				       RADEON_PLANAR_YUV_ENABLE));
+
+	/* Y */
+	OUT_ACCEL_REG(RADEON_PP_TXFILTER_0,
+		      RADEON_MAG_FILTER_LINEAR |
+		      RADEON_MIN_FILTER_LINEAR |
+		      RADEON_CLAMP_S_CLAMP_LAST |
+		      RADEON_CLAMP_T_CLAMP_LAST |
+		      RADEON_YUV_TO_RGB);
+	OUT_ACCEL_REG(RADEON_PP_TXFORMAT_0, txformat | RADEON_TXFORMAT_ST_ROUTE_STQ0);
+	OUT_ACCEL_REG(RADEON_PP_TXOFFSET_0, pPriv->src_offset);
+	OUT_ACCEL_REG(RADEON_PP_TXCBLEND_0,
+		      RADEON_COLOR_ARG_A_ZERO |
+		      RADEON_COLOR_ARG_B_ZERO |
+		      RADEON_COLOR_ARG_C_T0_COLOR |
+		      RADEON_BLEND_CTL_ADD |
+		      RADEON_CLAMP_TX);
+	OUT_ACCEL_REG(RADEON_PP_TXABLEND_0,
+		      RADEON_ALPHA_ARG_A_ZERO |
+		      RADEON_ALPHA_ARG_B_ZERO |
+		      RADEON_ALPHA_ARG_C_T0_ALPHA |
+		      RADEON_BLEND_CTL_ADD |
+		      RADEON_CLAMP_TX);
+
+	OUT_ACCEL_REG(RADEON_PP_TEX_SIZE_0,
+		      (pPriv->w - 1) |
+		      ((pPriv->h - 1) << RADEON_TEX_VSIZE_SHIFT));
+	OUT_ACCEL_REG(RADEON_PP_TEX_PITCH_0,
+		      pPriv->src_pitch - 32);
+
+	/* U */
+	OUT_ACCEL_REG(RADEON_PP_TXFILTER_1,
+		      RADEON_MAG_FILTER_LINEAR |
+		      RADEON_MIN_FILTER_LINEAR |
+		      RADEON_CLAMP_S_CLAMP_LAST |
+		      RADEON_CLAMP_T_CLAMP_LAST);
+	OUT_ACCEL_REG(RADEON_PP_TXFORMAT_1, txformat | RADEON_TXFORMAT_ST_ROUTE_STQ1);
+	OUT_ACCEL_REG(RADEON_PP_TXOFFSET_1, pPriv->src_offset + pPriv->planeu_offset);
+	OUT_ACCEL_REG(RADEON_PP_TXCBLEND_1,
+		      RADEON_COLOR_ARG_A_ZERO |
+		      RADEON_COLOR_ARG_B_ZERO |
+		      RADEON_COLOR_ARG_C_T0_COLOR |
+		      RADEON_BLEND_CTL_ADD |
+		      RADEON_CLAMP_TX);
+	OUT_ACCEL_REG(RADEON_PP_TXABLEND_1,
+		      RADEON_ALPHA_ARG_A_ZERO |
+		      RADEON_ALPHA_ARG_B_ZERO |
+		      RADEON_ALPHA_ARG_C_T0_ALPHA |
+		      RADEON_BLEND_CTL_ADD |
+		      RADEON_CLAMP_TX);
+
+	OUT_ACCEL_REG(RADEON_PP_TEX_SIZE_1, txsize);
+	OUT_ACCEL_REG(RADEON_PP_TEX_PITCH_1, txpitch);
+
+	/* V */
+	OUT_ACCEL_REG(RADEON_PP_TXFILTER_2,
+		      RADEON_MAG_FILTER_LINEAR |
+		      RADEON_MIN_FILTER_LINEAR |
+		      RADEON_CLAMP_S_CLAMP_LAST |
+		      RADEON_CLAMP_T_CLAMP_LAST);
+	OUT_ACCEL_REG(RADEON_PP_TXFORMAT_2, txformat | RADEON_TXFORMAT_ST_ROUTE_STQ1);
+	OUT_ACCEL_REG(RADEON_PP_TXOFFSET_2, pPriv->src_offset + pPriv->planev_offset);
+	OUT_ACCEL_REG(RADEON_PP_TXCBLEND_2,
+		      RADEON_COLOR_ARG_A_ZERO |
+		      RADEON_COLOR_ARG_B_ZERO |
+		      RADEON_COLOR_ARG_C_T0_COLOR |
+		      RADEON_BLEND_CTL_ADD |
+		      RADEON_CLAMP_TX);
+	OUT_ACCEL_REG(RADEON_PP_TXABLEND_2,
+		      RADEON_ALPHA_ARG_A_ZERO |
+		      RADEON_ALPHA_ARG_B_ZERO |
+		      RADEON_ALPHA_ARG_C_T0_ALPHA |
+		      RADEON_BLEND_CTL_ADD |
+		      RADEON_CLAMP_TX);
+
+	OUT_ACCEL_REG(RADEON_PP_TEX_SIZE_2, txsize);
+	OUT_ACCEL_REG(RADEON_PP_TEX_PITCH_2, txpitch);
+	FINISH_ACCEL();
+    } else {
+	vtx_count = 4;
+	BEGIN_ACCEL(9);
+
+	OUT_ACCEL_REG(RADEON_SE_VTX_FMT, (RADEON_SE_VTX_FMT_XY |
+					  RADEON_SE_VTX_FMT_ST0));
+
+	OUT_ACCEL_REG(RADEON_PP_CNTL, RADEON_TEX_0_ENABLE | RADEON_TEX_BLEND_0_ENABLE);
+
+	OUT_ACCEL_REG(RADEON_PP_TXFILTER_0,
+		      RADEON_MAG_FILTER_LINEAR |
+		      RADEON_MIN_FILTER_LINEAR |
+		      RADEON_CLAMP_S_CLAMP_LAST |
+		      RADEON_CLAMP_T_CLAMP_LAST |
+		      RADEON_YUV_TO_RGB);
+	OUT_ACCEL_REG(RADEON_PP_TXFORMAT_0, txformat | RADEON_TXFORMAT_ST_ROUTE_STQ0);
+	OUT_ACCEL_REG(RADEON_PP_TXOFFSET_0, pPriv->src_offset);
+	OUT_ACCEL_REG(RADEON_PP_TXCBLEND_0,
+		      RADEON_COLOR_ARG_A_ZERO |
+		      RADEON_COLOR_ARG_B_ZERO |
+		      RADEON_COLOR_ARG_C_T0_COLOR |
+		      RADEON_BLEND_CTL_ADD |
+		      RADEON_CLAMP_TX);
+	OUT_ACCEL_REG(RADEON_PP_TXABLEND_0,
+		      RADEON_ALPHA_ARG_A_ZERO |
+		      RADEON_ALPHA_ARG_B_ZERO |
+		      RADEON_ALPHA_ARG_C_T0_ALPHA |
+		      RADEON_BLEND_CTL_ADD |
+		      RADEON_CLAMP_TX);
+
+	OUT_ACCEL_REG(RADEON_PP_TEX_SIZE_0,
+		      (pPriv->w - 1) |
+		      ((pPriv->h - 1) << RADEON_TEX_VSIZE_SHIFT));
+	OUT_ACCEL_REG(RADEON_PP_TEX_PITCH_0,
+		      pPriv->src_pitch - 32);
+	FINISH_ACCEL();
+    }
 
     if (pPriv->vsync) {
 	xf86CrtcPtr crtc = radeon_xv_pick_best_crtc(pScrn,
@@ -269,18 +360,23 @@ FUNC_NAME(RADEONDisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv
      */
 
 #ifdef ACCEL_CP
-	BEGIN_RING(nBox * 3 * vtx_count + 3);
+	BEGIN_RING(nBox * 3 * vtx_count + 5);
 	OUT_RING(CP_PACKET3(RADEON_CP_PACKET3_3D_DRAW_IMMD,
 			    nBox * 3 * vtx_count + 1));
-	OUT_RING(RADEON_CP_VC_FRMT_XY |
-		 RADEON_CP_VC_FRMT_ST0);
+	if (isplanar)
+	    OUT_RING(RADEON_CP_VC_FRMT_XY |
+		     RADEON_CP_VC_FRMT_ST0 |
+		     RADEON_CP_VC_FRMT_ST1);
+	else
+	    OUT_RING(RADEON_CP_VC_FRMT_XY |
+		     RADEON_CP_VC_FRMT_ST0);
 	OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_RECT_LIST |
 		 RADEON_CP_VC_CNTL_PRIM_WALK_RING |
 		 RADEON_CP_VC_CNTL_MAOS_ENABLE |
 		 RADEON_CP_VC_CNTL_VTX_FMT_RADEON_MODE |
 		 ((nBox * 3) << RADEON_CP_VC_CNTL_NUM_SHIFT));
 #else /* ACCEL_CP */
-	BEGIN_ACCEL(nBox * vtx_count * 3 + 1);
+	BEGIN_ACCEL(nBox * vtx_count * 3 + 2);
 	OUT_ACCEL_REG(RADEON_SE_VF_CNTL, (RADEON_VF_PRIM_TYPE_RECTANGLE_LIST |
 					  RADEON_VF_PRIM_WALK_DATA |
 					  RADEON_VF_RADEON_MODE |
@@ -303,29 +399,42 @@ FUNC_NAME(RADEONDisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv
 	srcw = (pPriv->src_w * dstw) / pPriv->dst_w;
 	srch = (pPriv->src_h * dsth) / pPriv->dst_h;
 
-	/*
-	 * Just render a rect (using three coords).
-	 */
-	VTX_OUT((float)dstX,                                       (float)(dstY + dsth),
-		(float)srcX / info->accel_state->texW[0],          (float)(srcY + srch) / info->accel_state->texH[0]);
-	VTX_OUT((float)(dstX + dstw),                              (float)(dstY + dsth),
-		(float)(srcX + srcw) / info->accel_state->texW[0], (float)(srcY + srch) / info->accel_state->texH[0]);
-	VTX_OUT((float)(dstX + dstw),                              (float)dstY,
-		(float)(srcX + srcw) / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0]);
+	if (isplanar) {
+	    /*
+	     * Just render a rect (using three coords).
+	     * Filter is a bit of a misnomer, it's just texcoords...
+	     */
+	    VTX_OUT_FILTER((float)dstX,                (float)(dstY + dsth),
+			   (float)srcX,                (float)(srcY + srch),
+			   (float)srcX,                (float)(srcY + (srch / 2)));
+	    VTX_OUT_FILTER((float)(dstX + dstw),       (float)(dstY + dsth),
+			   (float)(srcX + srcw),       (float)(srcY + srch),
+			   (float)(srcX + (srcw / 2)), (float)(srcY + (srch / 2)));
+	    VTX_OUT_FILTER((float)(dstX + dstw),       (float)dstY,
+			   (float)(srcX + srcw),       (float)srcY,
+			   (float)(srcX + (srcw / 2)), (float)srcY);
+	} else {
+	    /*
+	     * Just render a rect (using three coords).
+	     */
+	    VTX_OUT((float)dstX,          (float)(dstY + dsth),
+		    (float)srcX,          (float)(srcY + srch));
+	    VTX_OUT((float)(dstX + dstw), (float)(dstY + dsth),
+		    (float)(srcX + srcw), (float)(srcY + srch));
+	    VTX_OUT((float)(dstX + dstw), (float)dstY,
+		    (float)(srcX + srcw), (float)srcY);
+	}
 
 	pBox++;
     }
 
+    OUT_ACCEL_REG(RADEON_WAIT_UNTIL, RADEON_WAIT_3D_IDLECLEAN);
 #ifdef ACCEL_CP
 	ADVANCE_RING();
 #else
 	FINISH_ACCEL();
 #endif /* !ACCEL_CP */
 
-    BEGIN_ACCEL(1);
-    OUT_ACCEL_REG(RADEON_WAIT_UNTIL, RADEON_WAIT_3D_IDLECLEAN);
-    FINISH_ACCEL();
-
     DamageDamageRegion(pPriv->pDraw, &pPriv->clip);
 }
 
commit ec0cb51df81c6c9a1de640d227fa9c9c33161083
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Mon Apr 13 17:21:20 2009 -0400

    R2xx tex vid: append verts for clip boxes
    
    rather than sending a new draw packet for each rect

diff --git a/src/radeon_textured_videofuncs.c b/src/radeon_textured_videofuncs.c
index d5cf47c..360532e 100644
--- a/src/radeon_textured_videofuncs.c
+++ b/src/radeon_textured_videofuncs.c
@@ -880,6 +880,21 @@ FUNC_NAME(R200DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
      *     render as a quad.
      */
 
+#ifdef ACCEL_CP
+	BEGIN_RING(nBox * 3 * vtx_count + 2);
+	OUT_RING(CP_PACKET3(R200_CP_PACKET3_3D_DRAW_IMMD_2,
+			    nBox * 3 * vtx_count));
+	OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_RECT_LIST |
+		 RADEON_CP_VC_CNTL_PRIM_WALK_RING |
+		 ((nBox * 3) << RADEON_CP_VC_CNTL_NUM_SHIFT));
+#else /* ACCEL_CP */
+	BEGIN_ACCEL(nBox * 3 * vtx_count + 1);
+	OUT_ACCEL_REG(RADEON_SE_VF_CNTL, (RADEON_VF_PRIM_TYPE_RECTANGLE_LIST |
+					  RADEON_VF_PRIM_WALK_DATA |
+					  ((nBox * 3) << RADEON_VF_NUM_VERTICES_SHIFT)));
+
+#endif
+
     while (nBox--) {
 	int srcX, srcY, srcw, srch;
 	int dstX, dstY, dstw, dsth;
@@ -896,20 +911,6 @@ FUNC_NAME(R200DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 	srcw = (pPriv->src_w * dstw) / pPriv->dst_w;
 	srch = (pPriv->src_h * dsth) / pPriv->dst_h;
 
-#ifdef ACCEL_CP
-	BEGIN_RING(3 * vtx_count + 2);
-	OUT_RING(CP_PACKET3(R200_CP_PACKET3_3D_DRAW_IMMD_2,
-			    3 * vtx_count));
-	OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_RECT_LIST |
-		 RADEON_CP_VC_CNTL_PRIM_WALK_RING |
-		 (3 << RADEON_CP_VC_CNTL_NUM_SHIFT));
-#else /* ACCEL_CP */
-	BEGIN_ACCEL(1 + vtx_count * 3);
-	OUT_ACCEL_REG(RADEON_SE_VF_CNTL, (RADEON_VF_PRIM_TYPE_RECTANGLE_LIST |
-					  RADEON_VF_PRIM_WALK_DATA |
-					  (3 << RADEON_VF_NUM_VERTICES_SHIFT)));
-
-#endif
 	if (isplanar) {
 	    /*
 	     * Just render a rect (using three coords).
@@ -936,15 +937,15 @@ FUNC_NAME(R200DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 		    (float)(srcX + srcw) / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0]);
 	}
 
+	pBox++;
+    }
+
 #ifdef ACCEL_CP
 	ADVANCE_RING();
 #else
 	FINISH_ACCEL();
 #endif /* !ACCEL_CP */
 
-	pBox++;
-    }
-
     BEGIN_ACCEL(1);
     OUT_ACCEL_REG(RADEON_WAIT_UNTIL, RADEON_WAIT_3D_IDLECLEAN);
     FINISH_ACCEL();
commit fde075a30a8ee2c333aa1bbe8fbd177258b085ba
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Mon Apr 13 17:13:51 2009 -0400

    R1xx tex vid: append verts for clip boxes
    
    rather than sending a new draw packet for each rect

diff --git a/src/radeon_textured_videofuncs.c b/src/radeon_textured_videofuncs.c
index 9361f07..d5cf47c 100644
--- a/src/radeon_textured_videofuncs.c
+++ b/src/radeon_textured_videofuncs.c
@@ -268,6 +268,25 @@ FUNC_NAME(RADEONDisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv
      *     render as a quad.
      */
 
+#ifdef ACCEL_CP
+	BEGIN_RING(nBox * 3 * vtx_count + 3);
+	OUT_RING(CP_PACKET3(RADEON_CP_PACKET3_3D_DRAW_IMMD,
+			    nBox * 3 * vtx_count + 1));
+	OUT_RING(RADEON_CP_VC_FRMT_XY |
+		 RADEON_CP_VC_FRMT_ST0);
+	OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_RECT_LIST |
+		 RADEON_CP_VC_CNTL_PRIM_WALK_RING |
+		 RADEON_CP_VC_CNTL_MAOS_ENABLE |
+		 RADEON_CP_VC_CNTL_VTX_FMT_RADEON_MODE |
+		 ((nBox * 3) << RADEON_CP_VC_CNTL_NUM_SHIFT));
+#else /* ACCEL_CP */
+	BEGIN_ACCEL(nBox * vtx_count * 3 + 1);
+	OUT_ACCEL_REG(RADEON_SE_VF_CNTL, (RADEON_VF_PRIM_TYPE_RECTANGLE_LIST |
+					  RADEON_VF_PRIM_WALK_DATA |
+					  RADEON_VF_RADEON_MODE |
+					  ((nBox * 3) << RADEON_VF_NUM_VERTICES_SHIFT)));
+#endif
+
     while (nBox--) {
 	int srcX, srcY, srcw, srch;
 	int dstX, dstY, dstw, dsth;
@@ -284,24 +303,6 @@ FUNC_NAME(RADEONDisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv
 	srcw = (pPriv->src_w * dstw) / pPriv->dst_w;
 	srch = (pPriv->src_h * dsth) / pPriv->dst_h;
 
-#ifdef ACCEL_CP
-	BEGIN_RING(3 * vtx_count + 3);
-	OUT_RING(CP_PACKET3(RADEON_CP_PACKET3_3D_DRAW_IMMD,
-			    3 * vtx_count + 1));
-	OUT_RING(RADEON_CP_VC_FRMT_XY |
-		 RADEON_CP_VC_FRMT_ST0);
-	OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_RECT_LIST |
-		 RADEON_CP_VC_CNTL_PRIM_WALK_RING |
-		 RADEON_CP_VC_CNTL_MAOS_ENABLE |
-		 RADEON_CP_VC_CNTL_VTX_FMT_RADEON_MODE |
-		 (3 << RADEON_CP_VC_CNTL_NUM_SHIFT));
-#else /* ACCEL_CP */
-	BEGIN_ACCEL(1 + vtx_count * 3);
-	OUT_ACCEL_REG(RADEON_SE_VF_CNTL, (RADEON_VF_PRIM_TYPE_RECTANGLE_LIST |
-					  RADEON_VF_PRIM_WALK_DATA |
-					  RADEON_VF_RADEON_MODE |
-					  (3 << RADEON_VF_NUM_VERTICES_SHIFT)));
-#endif
 	/*
 	 * Just render a rect (using three coords).
 	 */
@@ -312,15 +313,15 @@ FUNC_NAME(RADEONDisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv
 	VTX_OUT((float)(dstX + dstw),                              (float)dstY,
 		(float)(srcX + srcw) / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0]);
 
+	pBox++;
+    }
+
 #ifdef ACCEL_CP
 	ADVANCE_RING();
 #else
 	FINISH_ACCEL();
 #endif /* !ACCEL_CP */
 
-	pBox++;
-    }
-
     BEGIN_ACCEL(1);
     OUT_ACCEL_REG(RADEON_WAIT_UNTIL, RADEON_WAIT_3D_IDLECLEAN);
     FINISH_ACCEL();
commit 12839fc17a2cca4ac14b9757bdaa63ba4679f96f
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Mon Apr 13 17:04:31 2009 -0400

    Tex vid: split by family

diff --git a/src/radeon_textured_video.c b/src/radeon_textured_video.c
index bf8a276..f64da02 100644
--- a/src/radeon_textured_video.c
+++ b/src/radeon_textured_video.c
@@ -598,13 +598,29 @@ RADEONPutImageTextured(ScrnInfoPtr pScrn,
     pPriv->h = height;
 
 #ifdef XF86DRI
-    if (IS_R600_3D)
-	R600DisplayTexturedVideo(pScrn, pPriv);
-    else if (info->directRenderingEnabled)
-	RADEONDisplayTexturedVideoCP(pScrn, pPriv);
-    else
+    if (info->directRenderingEnabled) {
+	if (IS_R600_3D)
+	    R600DisplayTexturedVideo(pScrn, pPriv);
+	else if (IS_R500_3D)
+	    R500DisplayTexturedVideoCP(pScrn, pPriv);
+	else if (IS_R300_3D)
+	    R300DisplayTexturedVideoCP(pScrn, pPriv);
+	else if (IS_R200_3D)
+	    R200DisplayTexturedVideoCP(pScrn, pPriv);
+	else
+	    RADEONDisplayTexturedVideoCP(pScrn, pPriv);
+    } else
 #endif
-	RADEONDisplayTexturedVideoMMIO(pScrn, pPriv);
+    {
+	if (IS_R500_3D)
+	    R500DisplayTexturedVideoMMIO(pScrn, pPriv);
+	else if (IS_R300_3D)
+	    R300DisplayTexturedVideoMMIO(pScrn, pPriv);
+	else if (IS_R200_3D)
+	    R200DisplayTexturedVideoMMIO(pScrn, pPriv);
+	else
+	    RADEONDisplayTexturedVideoMMIO(pScrn, pPriv);
+    }
 
     return Success;
 }
diff --git a/src/radeon_textured_videofuncs.c b/src/radeon_textured_videofuncs.c
index 3c4289f..9361f07 100644
--- a/src/radeon_textured_videofuncs.c
+++ b/src/radeon_textured_videofuncs.c
@@ -93,10 +93,8 @@ FUNC_NAME(RADEONDisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv
     RADEONInfoPtr info = RADEONPTR(pScrn);
     PixmapPtr pPixmap = pPriv->pPixmap;
     uint32_t txformat;
-    uint32_t txfilter, txformat0, txformat1, txoffset, txpitch;
     uint32_t dst_offset, dst_pitch, dst_format;
-    uint32_t txenable, colorpitch;
-    uint32_t blendcntl;
+    uint32_t colorpitch;
     Bool isplanar = FALSE;
     int dstxoff, dstyoff, pixel_shift, vtx_count;
     BoxPtr pBox = REGION_RECTS(&pPriv->clip);
@@ -132,10 +130,7 @@ FUNC_NAME(RADEONDisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv
 #endif
 	{
 	    BEGIN_ACCEL(2);
-	    if (IS_R300_3D || IS_R500_3D)
-		OUT_ACCEL_REG(R300_RB3D_DSTCACHE_CTLSTAT, R300_DC_FLUSH_3D);
-	    else
-		OUT_ACCEL_REG(RADEON_RB3D_DSTCACHE_CTLSTAT, RADEON_RB3D_DC_FLUSH);
+	    OUT_ACCEL_REG(RADEON_RB3D_DSTCACHE_CTLSTAT, RADEON_RB3D_DC_FLUSH);
 	    /* We must wait for 3d to idle, in case source was just written as a dest. */
 	    OUT_ACCEL_REG(RADEON_WAIT_UNTIL,
 			  RADEON_WAIT_HOST_IDLECLEAN |
@@ -148,2057 +143,1835 @@ FUNC_NAME(RADEONDisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv
 		RADEONInit3DEngine(pScrn);
 	}
 
-    if (pPriv->bicubic_enabled)
-	vtx_count = 6;
-    else
-	vtx_count = 4;
+    vtx_count = 4;
 
-    if (IS_R300_3D || IS_R500_3D) {
-	uint32_t output_fmt;
-
-	switch (pPixmap->drawable.bitsPerPixel) {
-	case 16:
-	    if (pPixmap->drawable.depth == 15)
-		dst_format = R300_COLORFORMAT_ARGB1555;
-	    else
-		dst_format = R300_COLORFORMAT_RGB565;
-	    break;
-	case 32:
-	    dst_format = R300_COLORFORMAT_ARGB8888;
-	    break;
-	default:
-	    return;
+    /* Same for R100/R200 */
+    switch (pPixmap->drawable.bitsPerPixel) {
+    case 16:
+	if (pPixmap->drawable.depth == 15)
+	    dst_format = RADEON_COLOR_FORMAT_ARGB1555;
+	else
+	    dst_format = RADEON_COLOR_FORMAT_RGB565;
+	break;
+    case 32:
+	dst_format = RADEON_COLOR_FORMAT_ARGB8888;
+	break;
+    default:
+	return;
+    }
+
+    if (pPriv->planar_hw && (pPriv->id == FOURCC_I420 || pPriv->id == FOURCC_YV12)) {
+	isplanar = TRUE;
+    }
+
+    if (isplanar) {
+	txformat = RADEON_TXFORMAT_I8;
+    } else {
+	if (pPriv->id == FOURCC_UYVY)
+	    txformat = RADEON_TXFORMAT_YVYU422;
+	else
+	    txformat = RADEON_TXFORMAT_VYUY422;
+    }
+
+    txformat |= RADEON_TXFORMAT_NON_POWER2;
+
+    colorpitch = dst_pitch >> pixel_shift;
+
+    if (RADEONTilingEnabled(pScrn, pPixmap))
+	colorpitch |= RADEON_COLOR_TILE_ENABLE;
+
+    BEGIN_ACCEL(4);
+
+    OUT_ACCEL_REG(RADEON_RB3D_CNTL,
+		  dst_format /*| RADEON_ALPHA_BLEND_ENABLE*/);
+    OUT_ACCEL_REG(RADEON_RB3D_COLOROFFSET, dst_offset);
+
+    OUT_ACCEL_REG(RADEON_RB3D_COLORPITCH, colorpitch);
+
+    OUT_ACCEL_REG(RADEON_RB3D_BLENDCNTL,
+		  RADEON_SRC_BLEND_GL_ONE | RADEON_DST_BLEND_GL_ZERO);
+
+    FINISH_ACCEL();
+
+
+    info->accel_state->texW[0] = 1;
+    info->accel_state->texH[0] = 1;
+
+    BEGIN_ACCEL(9);
+
+    OUT_ACCEL_REG(RADEON_PP_CNTL,
+		  RADEON_TEX_0_ENABLE | RADEON_TEX_BLEND_0_ENABLE);
+
+    OUT_ACCEL_REG(RADEON_SE_VTX_FMT, (RADEON_SE_VTX_FMT_XY |
+				      RADEON_SE_VTX_FMT_ST0));
+
+    OUT_ACCEL_REG(RADEON_PP_TXFILTER_0,
+		  RADEON_MAG_FILTER_LINEAR |
+		  RADEON_MIN_FILTER_LINEAR |
+		  RADEON_CLAMP_S_CLAMP_LAST |
+		  RADEON_CLAMP_T_CLAMP_LAST |
+		  RADEON_YUV_TO_RGB);
+    OUT_ACCEL_REG(RADEON_PP_TXFORMAT_0, txformat);
+    OUT_ACCEL_REG(RADEON_PP_TXOFFSET_0, pPriv->src_offset);
+    OUT_ACCEL_REG(RADEON_PP_TXCBLEND_0,
+		  RADEON_COLOR_ARG_A_ZERO |
+		  RADEON_COLOR_ARG_B_ZERO |
+		  RADEON_COLOR_ARG_C_T0_COLOR |
+		  RADEON_BLEND_CTL_ADD |
+		  RADEON_CLAMP_TX);
+    OUT_ACCEL_REG(RADEON_PP_TXABLEND_0,
+		  RADEON_ALPHA_ARG_A_ZERO |
+		  RADEON_ALPHA_ARG_B_ZERO |
+		  RADEON_ALPHA_ARG_C_T0_ALPHA |
+		  RADEON_BLEND_CTL_ADD |
+		  RADEON_CLAMP_TX);
+
+    OUT_ACCEL_REG(RADEON_PP_TEX_SIZE_0,
+		  (pPriv->w - 1) |
+		  ((pPriv->h - 1) << RADEON_TEX_VSIZE_SHIFT));
+    OUT_ACCEL_REG(RADEON_PP_TEX_PITCH_0,
+		  pPriv->src_pitch - 32);
+    FINISH_ACCEL();
+
+    if (pPriv->vsync) {
+	xf86CrtcPtr crtc = radeon_xv_pick_best_crtc(pScrn,
+						    pPriv->drw_x,
+						    pPriv->drw_x + pPriv->dst_w,
+						    pPriv->drw_y,
+						    pPriv->drw_y + pPriv->dst_h);
+	if (crtc) {
+	    RADEONCrtcPrivatePtr radeon_crtc = crtc->driver_private;
+
+	    FUNC_NAME(RADEONWaitForVLine)(pScrn, pPixmap,
+					  radeon_crtc->crtc_id,
+					  pPriv->drw_y - crtc->y,
+					  (pPriv->drw_y - crtc->y) + pPriv->dst_h);
 	}
+    }
+    /*
+     * Rendering of the actual polygon is done in two different
+     * ways depending on chip generation:
+     *
+     * < R300:
+     *
+     *     These chips can render a rectangle in one pass, so
+     *     handling is pretty straight-forward.
+     *
+     * >= R300:
+     *
+     *     These chips can accept a quad, but will render it as
+     *     two triangles which results in a diagonal tear. Instead
+     *     We render a single, large triangle and use the scissor
+     *     functionality to restrict it to the desired rectangle.
+     *     Due to guardband limits on r3xx/r4xx, we can only use
+     *     the single triangle up to 2880 pixels; above that we
+     *     render as a quad.
+     */
+
+    while (nBox--) {
+	int srcX, srcY, srcw, srch;
+	int dstX, dstY, dstw, dsth;
+	dstX = pBox->x1 + dstxoff;
+	dstY = pBox->y1 + dstyoff;
+	dstw = pBox->x2 - pBox->x1;
+	dsth = pBox->y2 - pBox->y1;
+
+	srcX = ((pBox->x1 - pPriv->drw_x) *
+		pPriv->src_w) / pPriv->dst_w;
+	srcY = ((pBox->y1 - pPriv->drw_y) *
+		pPriv->src_h) / pPriv->dst_h;
+
+	srcw = (pPriv->src_w * dstw) / pPriv->dst_w;
+	srch = (pPriv->src_h * dsth) / pPriv->dst_h;
+
+#ifdef ACCEL_CP
+	BEGIN_RING(3 * vtx_count + 3);
+	OUT_RING(CP_PACKET3(RADEON_CP_PACKET3_3D_DRAW_IMMD,
+			    3 * vtx_count + 1));
+	OUT_RING(RADEON_CP_VC_FRMT_XY |
+		 RADEON_CP_VC_FRMT_ST0);
+	OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_RECT_LIST |
+		 RADEON_CP_VC_CNTL_PRIM_WALK_RING |
+		 RADEON_CP_VC_CNTL_MAOS_ENABLE |
+		 RADEON_CP_VC_CNTL_VTX_FMT_RADEON_MODE |
+		 (3 << RADEON_CP_VC_CNTL_NUM_SHIFT));
+#else /* ACCEL_CP */
+	BEGIN_ACCEL(1 + vtx_count * 3);
+	OUT_ACCEL_REG(RADEON_SE_VF_CNTL, (RADEON_VF_PRIM_TYPE_RECTANGLE_LIST |
+					  RADEON_VF_PRIM_WALK_DATA |
+					  RADEON_VF_RADEON_MODE |
+					  (3 << RADEON_VF_NUM_VERTICES_SHIFT)));
+#endif
+	/*
+	 * Just render a rect (using three coords).
+	 */
+	VTX_OUT((float)dstX,                                       (float)(dstY + dsth),
+		(float)srcX / info->accel_state->texW[0],          (float)(srcY + srch) / info->accel_state->texH[0]);
+	VTX_OUT((float)(dstX + dstw),                              (float)(dstY + dsth),
+		(float)(srcX + srcw) / info->accel_state->texW[0], (float)(srcY + srch) / info->accel_state->texH[0]);
+	VTX_OUT((float)(dstX + dstw),                              (float)dstY,
+		(float)(srcX + srcw) / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0]);
+
+#ifdef ACCEL_CP
+	ADVANCE_RING();
+#else
+	FINISH_ACCEL();
+#endif /* !ACCEL_CP */
+
+	pBox++;
+    }
 
-	output_fmt = (R300_OUT_FMT_C4_8 |
-		      R300_OUT_FMT_C0_SEL_BLUE |
-		      R300_OUT_FMT_C1_SEL_GREEN |
-		      R300_OUT_FMT_C2_SEL_RED |
-		      R300_OUT_FMT_C3_SEL_ALPHA);
+    BEGIN_ACCEL(1);
+    OUT_ACCEL_REG(RADEON_WAIT_UNTIL, RADEON_WAIT_3D_IDLECLEAN);
+    FINISH_ACCEL();
 
-	colorpitch = dst_pitch >> pixel_shift;
-	colorpitch |= dst_format;
+    DamageDamageRegion(pPriv->pDraw, &pPriv->clip);
+}
 
-	if (RADEONTilingEnabled(pScrn, pPixmap))
-	    colorpitch |= R300_COLORTILE;
+static void
+FUNC_NAME(R200DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
+{
+    RADEONInfoPtr info = RADEONPTR(pScrn);
+    PixmapPtr pPixmap = pPriv->pPixmap;
+    uint32_t txformat;
+    uint32_t txfilter, txformat0, txpitch;
+    uint32_t dst_offset, dst_pitch, dst_format;
+    uint32_t colorpitch;
+    Bool isplanar = FALSE;
+    int dstxoff, dstyoff, pixel_shift, vtx_count;
+    BoxPtr pBox = REGION_RECTS(&pPriv->clip);
+    int nBox = REGION_NUM_RECTS(&pPriv->clip);
+    ACCEL_PREAMBLE();
 
-	if (pPriv->planar_hw && (pPriv->id == FOURCC_I420 || pPriv->id == FOURCC_YV12)) {
-	    isplanar = TRUE;
+    pixel_shift = pPixmap->drawable.bitsPerPixel >> 4;
+
+#ifdef USE_EXA
+    if (info->useEXA) {
+	dst_offset = exaGetPixmapOffset(pPixmap) + info->fbLocation + pScrn->fbOffset;
+	dst_pitch = exaGetPixmapPitch(pPixmap);
+    } else
+#endif
+	{
+	    dst_offset = (pPixmap->devPrivate.ptr - info->FB) +
+		info->fbLocation + pScrn->fbOffset;
+	    dst_pitch = pPixmap->devKind;
 	}
 
-	if (isplanar) {
-	    txformat1 = R300_TX_FORMAT_X8 | R300_TX_FORMAT_CACHE_HALF_REGION_0;
-	    txpitch = pPriv->src_pitch;
-	} else {
-	    if (pPriv->id == FOURCC_UYVY)
-		txformat1 = R300_TX_FORMAT_YVYU422;
-	    else
-		txformat1 = R300_TX_FORMAT_VYUY422;
+#ifdef COMPOSITE
+    dstxoff = -pPixmap->screen_x + pPixmap->drawable.x;
+    dstyoff = -pPixmap->screen_y + pPixmap->drawable.y;
+#else
+    dstxoff = 0;
+    dstyoff = 0;
+#endif
 
-	    txformat1 |= R300_TX_FORMAT_YUV_TO_RGB_CLAMP;
+#ifdef USE_EXA
+    if (info->useEXA) {
+	RADEON_SWITCH_TO_3D();
+    } else
+#endif
+	{
+	    BEGIN_ACCEL(2);
+	    OUT_ACCEL_REG(RADEON_RB3D_DSTCACHE_CTLSTAT, RADEON_RB3D_DC_FLUSH);
+	    /* We must wait for 3d to idle, in case source was just written as a dest. */
+	    OUT_ACCEL_REG(RADEON_WAIT_UNTIL,
+			  RADEON_WAIT_HOST_IDLECLEAN |
+			  RADEON_WAIT_2D_IDLECLEAN |
+			  RADEON_WAIT_3D_IDLECLEAN |
+			  RADEON_WAIT_DMA_GUI_IDLE);
+	    FINISH_ACCEL();
 
-	    /* pitch is in pixels */
-	    txpitch = pPriv->src_pitch / 2;
+	    if (!info->accel_state->XInited3D)
+		RADEONInit3DEngine(pScrn);
 	}
-	txpitch -= 1;
 
-	txformat0 = ((((pPriv->w - 1) & 0x7ff) << R300_TXWIDTH_SHIFT) |
-		    (((pPriv->h - 1) & 0x7ff) << R300_TXHEIGHT_SHIFT) |
-		    R300_TXPITCH_EN);
+    vtx_count = 4;
 
-	info->accel_state->texW[0] = pPriv->w;
-	info->accel_state->texH[0] = pPriv->h;
+    /* Same for R100/R200 */
+    switch (pPixmap->drawable.bitsPerPixel) {
+    case 16:
+	if (pPixmap->drawable.depth == 15)
+	    dst_format = RADEON_COLOR_FORMAT_ARGB1555;
+	else
+	    dst_format = RADEON_COLOR_FORMAT_RGB565;
+	break;
+    case 32:
+	dst_format = RADEON_COLOR_FORMAT_ARGB8888;
+	break;
+    default:
+	return;
+    }
 
-	txfilter = (R300_TX_CLAMP_S(R300_TX_CLAMP_CLAMP_LAST) |
-		    R300_TX_CLAMP_T(R300_TX_CLAMP_CLAMP_LAST) |
-		    R300_TX_MAG_FILTER_LINEAR |
-		    R300_TX_MIN_FILTER_LINEAR |
-		    (0 << R300_TX_ID_SHIFT));
+    if (pPriv->planar_hw && (pPriv->id == FOURCC_I420 || pPriv->id == FOURCC_YV12)) {
+	isplanar = TRUE;
+    }
 
+    if (isplanar) {
+	txformat = RADEON_TXFORMAT_I8;
+    } else {
+	if (pPriv->id == FOURCC_UYVY)
+	    txformat = RADEON_TXFORMAT_YVYU422;
+	else
+	    txformat = RADEON_TXFORMAT_VYUY422;
+    }
 
-	if (IS_R500_3D && ((pPriv->w - 1) & 0x800))
-	    txpitch |= R500_TXWIDTH_11;
+    txformat |= RADEON_TXFORMAT_NON_POWER2;
 
-	if (IS_R500_3D && ((pPriv->h - 1) & 0x800))
-	    txpitch |= R500_TXHEIGHT_11;
+    colorpitch = dst_pitch >> pixel_shift;
 
-	txoffset = pPriv->src_offset;
+    if (RADEONTilingEnabled(pScrn, pPixmap))
+	colorpitch |= RADEON_COLOR_TILE_ENABLE;
 
-	BEGIN_ACCEL(6);
-	OUT_ACCEL_REG(R300_TX_FILTER0_0, txfilter);
-	OUT_ACCEL_REG(R300_TX_FILTER1_0, 0);
-	OUT_ACCEL_REG(R300_TX_FORMAT0_0, txformat0);
-	OUT_ACCEL_REG(R300_TX_FORMAT1_0, txformat1);
-	OUT_ACCEL_REG(R300_TX_FORMAT2_0, txpitch);
-	OUT_ACCEL_REG(R300_TX_OFFSET_0, txoffset);
-	FINISH_ACCEL();
+    BEGIN_ACCEL(4);
 
-	txenable = R300_TEX_0_ENABLE;
+    OUT_ACCEL_REG(RADEON_RB3D_CNTL,
+		  dst_format /*| RADEON_ALPHA_BLEND_ENABLE*/);
+    OUT_ACCEL_REG(RADEON_RB3D_COLOROFFSET, dst_offset);
 
-	if (isplanar) {
-	    txformat0 = ((((((pPriv->w + 1 ) >> 1) - 1) & 0x7ff) << R300_TXWIDTH_SHIFT) |
-			(((((pPriv->h + 1 ) >> 1 ) - 1) & 0x7ff) << R300_TXHEIGHT_SHIFT) |
-			R300_TXPITCH_EN);
-	    txpitch = ((pPriv->src_pitch >> 1) + 63) & ~63;
-	    txpitch -= 1;
-	    txfilter = (R300_TX_CLAMP_S(R300_TX_CLAMP_CLAMP_LAST) |
-		        R300_TX_CLAMP_T(R300_TX_CLAMP_CLAMP_LAST) |
-			R300_TX_MIN_FILTER_LINEAR |
-			R300_TX_MAG_FILTER_LINEAR);
-
-		BEGIN_ACCEL(12);
-		OUT_ACCEL_REG(R300_TX_FILTER0_1, txfilter | (1 << R300_TX_ID_SHIFT));
-		OUT_ACCEL_REG(R300_TX_FILTER1_1, 0);
-		OUT_ACCEL_REG(R300_TX_FORMAT0_1, txformat0);
-		OUT_ACCEL_REG(R300_TX_FORMAT1_1, R300_TX_FORMAT_X8 | R300_TX_FORMAT_CACHE_FOURTH_REGION_2);
-		OUT_ACCEL_REG(R300_TX_FORMAT2_1, txpitch);
-		OUT_ACCEL_REG(R300_TX_OFFSET_1, txoffset + pPriv->planeu_offset);
-		OUT_ACCEL_REG(R300_TX_FILTER0_2, txfilter | (2 << R300_TX_ID_SHIFT));
-		OUT_ACCEL_REG(R300_TX_FILTER1_2, 0);
-		OUT_ACCEL_REG(R300_TX_FORMAT0_2, txformat0);
-		OUT_ACCEL_REG(R300_TX_FORMAT1_2, R300_TX_FORMAT_X8 | R300_TX_FORMAT_CACHE_FOURTH_REGION_3);
-		OUT_ACCEL_REG(R300_TX_FORMAT2_2, txpitch);
-		OUT_ACCEL_REG(R300_TX_OFFSET_2, txoffset + pPriv->planev_offset);
-		FINISH_ACCEL();
-		txenable |= R300_TEX_1_ENABLE | R300_TEX_2_ENABLE;
+    OUT_ACCEL_REG(RADEON_RB3D_COLORPITCH, colorpitch);
+
+    OUT_ACCEL_REG(RADEON_RB3D_BLENDCNTL,
+		  RADEON_SRC_BLEND_GL_ONE | RADEON_DST_BLEND_GL_ZERO);
+
+    FINISH_ACCEL();
+
+    info->accel_state->texW[0] = pPriv->w;
+    info->accel_state->texH[0] = pPriv->h;
+
+    if (isplanar) {
+	/* note: in contrast to r300, use input biasing on uv components */
+	const float Loff = -0.0627;
+	float uvcosf, uvsinf;
+	float yco, yoff;
+	float uco[3], vco[3];
+	float bright, cont, sat;
+	int ref = pPriv->transform_index;
+	float ucscale = 0.25, vcscale = 0.25;
+	Bool needux8 = FALSE, needvx8 = FALSE;
+
+	/* contrast can cause constant overflow, clamp */
+	cont = RTFContrast(pPriv->contrast);
+	if (cont * trans[ref].RefLuma > 2.0)
+	    cont = 2.0 / trans[ref].RefLuma;
+	/* brightness only ranges from -0.5 to 0.5, so it should be safe */
+	bright = RTFBrightness(pPriv->brightness);
+	/* saturation can also cause overflow, clamp */
+	sat = RTFSaturation(pPriv->saturation);
+	if (sat * trans[ref].RefBCb > 4.0)
+	    sat = 4.0 / trans[ref].RefBCb;
+	uvcosf = sat * cos(RTFHue(pPriv->hue));
+	uvsinf = sat * sin(RTFHue(pPriv->hue));
+
+	yco = trans[ref].RefLuma * cont;
+	uco[0] = -trans[ref].RefRCr * uvsinf;
+	uco[1] = trans[ref].RefGCb * uvcosf - trans[ref].RefGCr * uvsinf;
+	uco[2] = trans[ref].RefBCb * uvcosf;
+	vco[0] = trans[ref].RefRCr * uvcosf;
+	vco[1] = trans[ref].RefGCb * uvsinf + trans[ref].RefGCr * uvcosf;
+	vco[2] = trans[ref].RefBCb * uvsinf;
+	yoff = Loff * yco + bright;
+
+	if ((uco[0] > 2.0) || (uco[2] > 2.0)) {
+	    needux8 = TRUE;
+	    ucscale = 0.125;
+	}
+	if ((vco[0] > 2.0) || (vco[2] > 2.0)) {
+	    needvx8 = TRUE;
+	    vcscale = 0.125;
 	}
 
-	if (pPriv->bicubic_enabled) {
-		/* Size is 128x1 */
-		txformat0 = ((0x7f << R300_TXWIDTH_SHIFT) |
-			     (0x0 << R300_TXHEIGHT_SHIFT) |
-			     R300_TXPITCH_EN);
-		/* Format is 32-bit floats, 4bpp */
-		txformat1 = R300_EASY_TX_FORMAT(Z, Y, X, W, FL_R16G16B16A16);
-		/* Pitch is 127 (128-1) */
-		txpitch = 0x7f;
-		/* Tex filter */
-		txfilter = (R300_TX_CLAMP_S(R300_TX_CLAMP_WRAP) |
-			    R300_TX_CLAMP_T(R300_TX_CLAMP_WRAP) |
-			    R300_TX_MIN_FILTER_NEAREST |
-			    R300_TX_MAG_FILTER_NEAREST |
-			    (1 << R300_TX_ID_SHIFT));
-
-		BEGIN_ACCEL(6);
-		OUT_ACCEL_REG(R300_TX_FILTER0_1, txfilter);
-		OUT_ACCEL_REG(R300_TX_FILTER1_1, 0);
-		OUT_ACCEL_REG(R300_TX_FORMAT0_1, txformat0);
-		OUT_ACCEL_REG(R300_TX_FORMAT1_1, txformat1);
-		OUT_ACCEL_REG(R300_TX_FORMAT2_1, txpitch);
-		OUT_ACCEL_REG(R300_TX_OFFSET_1, pPriv->bicubic_src_offset);
-		FINISH_ACCEL();
-
-		/* Enable tex 1 */
-		txenable |= R300_TEX_1_ENABLE;
+	/* need 2 texcoord sets (even though they are identical) due
+	   to denormalization! hw apparently can't premultiply
+	   same coord set by different texture size */
+	vtx_count = 6;
+
+	txformat0 = (((((pPriv->w + 1 ) >> 1) - 1) & 0x7ff) |
+		     (((((pPriv->h + 1 ) >> 1) - 1) & 0x7ff) << RADEON_TEX_VSIZE_SHIFT));
+	txpitch = ((pPriv->src_pitch >> 1) + 63) & ~63;
+	txpitch -= 32;
+	txfilter =  R200_MAG_FILTER_LINEAR |
+	    R200_MIN_FILTER_LINEAR |
+	    R200_CLAMP_S_CLAMP_LAST |
+	    R200_CLAMP_T_CLAMP_LAST;
+
+	BEGIN_ACCEL(36);
+
+	OUT_ACCEL_REG(RADEON_PP_CNTL,
+		      RADEON_TEX_0_ENABLE | RADEON_TEX_1_ENABLE | RADEON_TEX_2_ENABLE |
+		      RADEON_TEX_BLEND_0_ENABLE |
+		      RADEON_TEX_BLEND_1_ENABLE |
+		      RADEON_TEX_BLEND_2_ENABLE);
+
+	OUT_ACCEL_REG(R200_SE_VTX_FMT_0, R200_VTX_XY);
+	OUT_ACCEL_REG(R200_SE_VTX_FMT_1,
+		      (2 << R200_VTX_TEX0_COMP_CNT_SHIFT) |
+		      (2 << R200_VTX_TEX1_COMP_CNT_SHIFT));
+
+	OUT_ACCEL_REG(R200_PP_TXFILTER_0, txfilter);
+	OUT_ACCEL_REG(R200_PP_TXFORMAT_0, txformat);
+	OUT_ACCEL_REG(R200_PP_TXFORMAT_X_0, 0);
+	OUT_ACCEL_REG(R200_PP_TXSIZE_0,
+		      (pPriv->w - 1) |
+		      ((pPriv->h - 1) << RADEON_TEX_VSIZE_SHIFT));
+	OUT_ACCEL_REG(R200_PP_TXPITCH_0, pPriv->src_pitch - 32);
+	OUT_ACCEL_REG(R200_PP_TXOFFSET_0, pPriv->src_offset);
+
+	OUT_ACCEL_REG(R200_PP_TXFILTER_1, txfilter);
+	OUT_ACCEL_REG(R200_PP_TXFORMAT_1, txformat | R200_TXFORMAT_ST_ROUTE_STQ1);
+	OUT_ACCEL_REG(R200_PP_TXFORMAT_X_1, 0);
+	OUT_ACCEL_REG(R200_PP_TXSIZE_1, txformat0);
+	OUT_ACCEL_REG(R200_PP_TXPITCH_1, txpitch);
+	OUT_ACCEL_REG(R200_PP_TXOFFSET_1, pPriv->src_offset + pPriv->planeu_offset);
+
+	OUT_ACCEL_REG(R200_PP_TXFILTER_2, txfilter);
+	OUT_ACCEL_REG(R200_PP_TXFORMAT_2, txformat | R200_TXFORMAT_ST_ROUTE_STQ1);
+	OUT_ACCEL_REG(R200_PP_TXFORMAT_X_2, 0);
+	OUT_ACCEL_REG(R200_PP_TXSIZE_2, txformat0);
+	OUT_ACCEL_REG(R200_PP_TXPITCH_2, txpitch);
+	OUT_ACCEL_REG(R200_PP_TXOFFSET_2, pPriv->src_offset + pPriv->planev_offset);
+
+	/* Similar to the r300 code. Note the big problem is that hardware constants
+	 * are 8 bits only, representing 0.0-1.0. We can extend that (using bias
+	 * + scale) to -1.0-1.0 (but precision will suffer). AFAIK the hw actually
+	 * has 12 bits of fractional precision (plus 1 sign bit, 3 range bits), but
+	 * the constants do not. To get a larger range we can use output scale, but
+	 * for that 2.018 value we need a total scale by 8, which means the constants
+	 * really have no accuracy whatsoever (5 fractional bits only).
+	 * The only direct way to get high-precision "constants" into the fragment
+	 * pipe I know of is to use the texcoord interpolator (not color, that one
+	 * is 8 bit only too), which seems a bit expensive. We're lucky, though: the
+	 * values we need seem to fit better than the worst case (we get about
+	 * 6 fractional bits for this instead of 5, at least when not correcting for
+	 * hue/saturation/contrast/brightness, which is the same as for vco; yco and
+	 * yoff get 8 fractional bits). We try to preserve as much accuracy as possible
+	 * even with non-default saturation/hue/contrast/brightness adjustments, so
+	 * it gets a little crazy, and ultimately precision might still be lacking.
+	 *
+	 * A higher precision (8 fractional bits) version might just put uco into
+	 * a texcoord, and calculate a new vcoconst in the shader, like so:
+	 * cohelper = {1.0, 0.0, 0.0} - shouldn't use 0.5 since not exactly representable
+	 * vco = {1.5958 - 1.0, -0.8129 + 1.0, 1.0}
+	 * vcocalc = ADD temp, bias/scale(cohelper), vco
+	 * would in total use 4 tex units, 4 instructions which seems fairly
+	 * balanced for this architecture (instead of 3 + 3 for the solution here)
+	 *
+	 * temp = MAD(yco, yuv.yyyy, yoff)
+	 * temp = MAD(uco, yuv.uuuu, temp)
+	 * result = MAD(vco, yuv.vvvv, temp)
+	 *
+	 * Note the first mad actually produces a scalar, hence we transform
+	 * it into a dp2a to get 8 bit precision of yco instead of 7.
+	 * That's assuming hw correctly expands consts to internal precision.
+	 * (y * 1 + y * (yco - 1) + yoff)
+	 * temp = DP2A / 2 (yco, yuv.yyyy, yoff)
+	 * temp = MAD (uco / 4, yuv.uuuu * 2, temp)
+	 * result = MAD x2 (vco / 2, yuv.vvvv, temp)
+	 *
+	 * vco, uco need bias (and hence scale too)
+	 *
+	 */
+
+	/* MAD temp0 / 2, const0.a * 2, temp0, -const0.rgb */
+	OUT_ACCEL_REG(R200_PP_TXCBLEND_0,
+		      R200_TXC_ARG_A_TFACTOR_COLOR |
+		      R200_TXC_ARG_B_R0_COLOR |
+		      R200_TXC_ARG_C_TFACTOR_COLOR |
+		      (yoff < 0 ? R200_TXC_NEG_ARG_C : 0) |
+		      R200_TXC_OP_DOT2_ADD);
+	OUT_ACCEL_REG(R200_PP_TXCBLEND2_0,
+		      (0 << R200_TXC_TFACTOR_SEL_SHIFT) |
+		      R200_TXC_SCALE_INV2 |
+		      R200_TXC_CLAMP_8_8 | R200_TXC_OUTPUT_REG_R0);
+	OUT_ACCEL_REG(R200_PP_TXABLEND_0,
+		      R200_TXA_ARG_A_ZERO |
+		      R200_TXA_ARG_B_ZERO |
+		      R200_TXA_ARG_C_ZERO |
+		      R200_TXA_OP_MADD);
+	OUT_ACCEL_REG(R200_PP_TXABLEND2_0,
+		      R200_TXA_OUTPUT_REG_NONE);
+
+	/* MAD temp0, (const1 - 0.5) * 2, (temp1 - 0.5) * 2, temp0 */
+	OUT_ACCEL_REG(R200_PP_TXCBLEND_1,
+		      R200_TXC_ARG_A_TFACTOR_COLOR |
+		      R200_TXC_BIAS_ARG_A |
+		      R200_TXC_SCALE_ARG_A |
+		      R200_TXC_ARG_B_R1_COLOR |
+		      R200_TXC_BIAS_ARG_B |
+		      (needux8 ? R200_TXC_SCALE_ARG_B : 0) |
+		      R200_TXC_ARG_C_R0_COLOR |
+		      R200_TXC_OP_MADD);
+	OUT_ACCEL_REG(R200_PP_TXCBLEND2_1,
+		      (1 << R200_TXC_TFACTOR_SEL_SHIFT) |
+		      R200_TXC_CLAMP_8_8 | R200_TXC_OUTPUT_REG_R0);
+	OUT_ACCEL_REG(R200_PP_TXABLEND_1,
+		      R200_TXA_ARG_A_ZERO |
+		      R200_TXA_ARG_B_ZERO |
+		      R200_TXA_ARG_C_ZERO |
+		      R200_TXA_OP_MADD);
+	OUT_ACCEL_REG(R200_PP_TXABLEND2_1,
+		      R200_TXA_OUTPUT_REG_NONE);
+
+	/* MAD temp0 x 2, (const2 - 0.5) * 2, (temp2 - 0.5), temp0 */
+	OUT_ACCEL_REG(R200_PP_TXCBLEND_2,
+		      R200_TXC_ARG_A_TFACTOR_COLOR |
+		      R200_TXC_BIAS_ARG_A |
+		      R200_TXC_SCALE_ARG_A |
+		      R200_TXC_ARG_B_R2_COLOR |
+		      R200_TXC_BIAS_ARG_B |
+		      (needvx8 ? R200_TXC_SCALE_ARG_B : 0) |
+		      R200_TXC_ARG_C_R0_COLOR |
+		      R200_TXC_OP_MADD);
+	OUT_ACCEL_REG(R200_PP_TXCBLEND2_2,
+		      (2 << R200_TXC_TFACTOR_SEL_SHIFT) |
+		      R200_TXC_SCALE_2X |
+		      R200_TXC_CLAMP_0_1 | R200_TXC_OUTPUT_REG_R0);
+	OUT_ACCEL_REG(R200_PP_TXABLEND_2,
+		      R200_TXA_ARG_A_ZERO |
+		      R200_TXA_ARG_B_ZERO |
+		      R200_TXA_ARG_C_ZERO |
+		      R200_TXA_COMP_ARG_C |
+		      R200_TXA_OP_MADD);
+	OUT_ACCEL_REG(R200_PP_TXABLEND2_2,
+		      R200_TXA_CLAMP_0_1 | R200_TXA_OUTPUT_REG_R0);
+
+	/* shader constants */
+	OUT_ACCEL_REG(R200_PP_TFACTOR_0, float4touint(yco > 1.0 ? 1.0 : 0.0, /* range special [0, 2] */
+						      yco > 1.0 ? yco - 1.0: yco,
+						      yoff < 0 ? -yoff : yoff, /* range special [-1, 1] */
+						      0.0));
+	OUT_ACCEL_REG(R200_PP_TFACTOR_1, float4touint(uco[0] * ucscale + 0.5, /* range [-4, 4] */
+						      uco[1] * ucscale + 0.5, /* or [-2, 2] */
+						      uco[2] * ucscale + 0.5,
+						      0.0));
+	OUT_ACCEL_REG(R200_PP_TFACTOR_2, float4touint(vco[0] * vcscale + 0.5, /* range [-2, 2] */
+						      vco[1] * vcscale + 0.5, /* or [-4, 4] */
+						      vco[2] * vcscale + 0.5,
+						      0.0));
+
+	FINISH_ACCEL();
+    } else if (info->ChipFamily == CHIP_FAMILY_RV250) {
+	/* fix up broken packed yuv - shader same as above except
+	   yuv components are all in same reg */
+	/* note: in contrast to r300, use input biasing on uv components */
+	const float Loff = -0.0627;
+	float uvcosf, uvsinf;
+	float yco, yoff;
+	float uco[3], vco[3];
+	float bright, cont, sat;
+	int ref = pPriv->transform_index;
+	float ucscale = 0.25, vcscale = 0.25;
+	Bool needux8 = FALSE, needvx8 = FALSE;
+
+	/* contrast can cause constant overflow, clamp */
+	cont = RTFContrast(pPriv->contrast);
+	if (cont * trans[ref].RefLuma > 2.0)
+	    cont = 2.0 / trans[ref].RefLuma;
+	/* brightness only ranges from -0.5 to 0.5, so it should be safe */
+	bright = RTFBrightness(pPriv->brightness);
+	/* saturation can also cause overflow, clamp */
+	sat = RTFSaturation(pPriv->saturation);
+	if (sat * trans[ref].RefBCb > 4.0)
+	    sat = 4.0 / trans[ref].RefBCb;
+	uvcosf = sat * cos(RTFHue(pPriv->hue));
+	uvsinf = sat * sin(RTFHue(pPriv->hue));
+
+	yco = trans[ref].RefLuma * cont;
+	uco[0] = -trans[ref].RefRCr * uvsinf;
+	uco[1] = trans[ref].RefGCb * uvcosf - trans[ref].RefGCr * uvsinf;
+	uco[2] = trans[ref].RefBCb * uvcosf;
+	vco[0] = trans[ref].RefRCr * uvcosf;
+	vco[1] = trans[ref].RefGCb * uvsinf + trans[ref].RefGCr * uvcosf;
+	vco[2] = trans[ref].RefBCb * uvsinf;
+	yoff = Loff * yco + bright;
+
+	if ((uco[0] > 2.0) || (uco[2] > 2.0)) {
+	    needux8 = TRUE;
+	    ucscale = 0.125;
+	}
+	if ((vco[0] > 2.0) || (vco[2] > 2.0)) {
+	    needvx8 = TRUE;
+	    vcscale = 0.125;
 	}
 
-	/* setup the VAP */
-	if (info->accel_state->has_tcl) {
-	    if (pPriv->bicubic_enabled)
-		BEGIN_ACCEL(7);
-	    else
-		BEGIN_ACCEL(6);
-	} else {
-	    if (pPriv->bicubic_enabled)
-		BEGIN_ACCEL(5);
-	    else
-		BEGIN_ACCEL(4);
+	txformat0 = (((((pPriv->w + 1 ) >> 1) - 1) & 0x7ff) |
+		     (((((pPriv->h + 1 ) >> 1 ) - 1) & 0x7ff) << RADEON_TEX_VSIZE_SHIFT));
+	txpitch = ((pPriv->src_pitch >> 1) + 63) & ~63;
+	txpitch -= 32;
+	txfilter =  R200_MAG_FILTER_LINEAR |
+	    R200_MIN_FILTER_LINEAR |
+	    R200_CLAMP_S_CLAMP_LAST |
+	    R200_CLAMP_T_CLAMP_LAST;
+
+	BEGIN_ACCEL(24);
+
+	OUT_ACCEL_REG(RADEON_PP_CNTL,
+		      RADEON_TEX_0_ENABLE |
+		      RADEON_TEX_BLEND_0_ENABLE | RADEON_TEX_BLEND_1_ENABLE |
+		      RADEON_TEX_BLEND_2_ENABLE);
+
+	OUT_ACCEL_REG(R200_SE_VTX_FMT_0, R200_VTX_XY);
+	OUT_ACCEL_REG(R200_SE_VTX_FMT_1,
+		      (2 << R200_VTX_TEX0_COMP_CNT_SHIFT));
+
+	OUT_ACCEL_REG(R200_PP_TXFILTER_0, txfilter);
+	OUT_ACCEL_REG(R200_PP_TXFORMAT_0, txformat);
+	OUT_ACCEL_REG(R200_PP_TXFORMAT_X_0, 0);
+	OUT_ACCEL_REG(R200_PP_TXSIZE_0,
+		      (pPriv->w - 1) |
+		      ((pPriv->h - 1) << RADEON_TEX_VSIZE_SHIFT));
+	OUT_ACCEL_REG(R200_PP_TXPITCH_0, pPriv->src_pitch - 32);
+	OUT_ACCEL_REG(R200_PP_TXOFFSET_0, pPriv->src_offset);
+
+	/* MAD temp1 / 2, const0.a * 2, temp0.ggg, -const0.rgb */
+	OUT_ACCEL_REG(R200_PP_TXCBLEND_0,
+		      R200_TXC_ARG_A_TFACTOR_COLOR |
+		      R200_TXC_ARG_B_R0_COLOR |
+		      R200_TXC_ARG_C_TFACTOR_COLOR |
+		      (yoff < 0 ? R200_TXC_NEG_ARG_C : 0) |
+		      R200_TXC_OP_DOT2_ADD);
+	OUT_ACCEL_REG(R200_PP_TXCBLEND2_0,
+		      (0 << R200_TXC_TFACTOR_SEL_SHIFT) |
+		      R200_TXC_SCALE_INV2 |
+		      (R200_TXC_REPL_GREEN << R200_TXC_REPL_ARG_B_SHIFT) |
+		      R200_TXC_CLAMP_8_8 | R200_TXC_OUTPUT_REG_R1);
+	OUT_ACCEL_REG(R200_PP_TXABLEND_0,
+		      R200_TXA_ARG_A_ZERO |
+		      R200_TXA_ARG_B_ZERO |
+		      R200_TXA_ARG_C_ZERO |
+		      R200_TXA_OP_MADD);
+	OUT_ACCEL_REG(R200_PP_TXABLEND2_0,
+		      R200_TXA_OUTPUT_REG_NONE);
+
+	/* MAD temp1, (const1 - 0.5) * 2, (temp0.rrr - 0.5) * 2, temp1 */
+	OUT_ACCEL_REG(R200_PP_TXCBLEND_1,
+		      R200_TXC_ARG_A_TFACTOR_COLOR |
+		      R200_TXC_BIAS_ARG_A |
+		      R200_TXC_SCALE_ARG_A |
+		      R200_TXC_ARG_B_R0_COLOR |
+		      R200_TXC_BIAS_ARG_B |
+		      (needux8 ? R200_TXC_SCALE_ARG_B : 0) |
+		      R200_TXC_ARG_C_R1_COLOR |
+		      R200_TXC_OP_MADD);
+	OUT_ACCEL_REG(R200_PP_TXCBLEND2_1,
+		      (1 << R200_TXC_TFACTOR_SEL_SHIFT) |
+		      (R200_TXC_REPL_BLUE << R200_TXC_REPL_ARG_B_SHIFT) |
+		      R200_TXC_CLAMP_8_8 | R200_TXC_OUTPUT_REG_R1);
+	OUT_ACCEL_REG(R200_PP_TXABLEND_1,
+		      R200_TXA_ARG_A_ZERO |
+		      R200_TXA_ARG_B_ZERO |
+		      R200_TXA_ARG_C_ZERO |
+		      R200_TXA_OP_MADD);
+	OUT_ACCEL_REG(R200_PP_TXABLEND2_1,
+		      R200_TXA_OUTPUT_REG_NONE);
+
+	/* MAD temp0 x 2, (const2 - 0.5) * 2, (temp0.bbb - 0.5), temp1 */
+	OUT_ACCEL_REG(R200_PP_TXCBLEND_2,
+		      R200_TXC_ARG_A_TFACTOR_COLOR |
+		      R200_TXC_BIAS_ARG_A |
+		      R200_TXC_SCALE_ARG_A |
+		      R200_TXC_ARG_B_R0_COLOR |
+		      R200_TXC_BIAS_ARG_B |
+		      (needvx8 ? R200_TXC_SCALE_ARG_B : 0) |
+		      R200_TXC_ARG_C_R1_COLOR |
+		      R200_TXC_OP_MADD);
+	OUT_ACCEL_REG(R200_PP_TXCBLEND2_2,
+		      (2 << R200_TXC_TFACTOR_SEL_SHIFT) |
+		      R200_TXC_SCALE_2X |
+		      (R200_TXC_REPL_RED << R200_TXC_REPL_ARG_B_SHIFT) |
+		      R200_TXC_CLAMP_0_1 | R200_TXC_OUTPUT_REG_R0);
+	OUT_ACCEL_REG(R200_PP_TXABLEND_2,
+		      R200_TXA_ARG_A_ZERO |
+		      R200_TXA_ARG_B_ZERO |
+		      R200_TXA_ARG_C_ZERO |
+		      R200_TXA_COMP_ARG_C |
+		      R200_TXA_OP_MADD);
+	OUT_ACCEL_REG(R200_PP_TXABLEND2_2,
+		      R200_TXA_CLAMP_0_1 | R200_TXA_OUTPUT_REG_R0);
+
+	/* shader constants */
+	OUT_ACCEL_REG(R200_PP_TFACTOR_0, float4touint(yco > 1.0 ? 1.0 : 0.0, /* range special [0, 2] */
+						      yco > 1.0 ? yco - 1.0: yco,
+						      yoff < 0 ? -yoff : yoff, /* range special [-1, 1] */
+						      0.0));
+	OUT_ACCEL_REG(R200_PP_TFACTOR_1, float4touint(uco[0] * ucscale + 0.5, /* range [-4, 4] */
+						      uco[1] * ucscale + 0.5, /* or [-2, 2] */
+						      uco[2] * ucscale + 0.5,
+						      0.0));
+	OUT_ACCEL_REG(R200_PP_TFACTOR_2, float4touint(vco[0] * vcscale + 0.5, /* range [-2, 2] */
+						      vco[1] * vcscale + 0.5, /* or [-4, 4] */
+						      vco[2] * vcscale + 0.5,
+						      0.0));
+
+	FINISH_ACCEL();
+    } else {
+	BEGIN_ACCEL(13);
+	OUT_ACCEL_REG(RADEON_PP_CNTL,
+		      RADEON_TEX_0_ENABLE | RADEON_TEX_BLEND_0_ENABLE);
+
+	OUT_ACCEL_REG(R200_SE_VTX_FMT_0, R200_VTX_XY);
+	OUT_ACCEL_REG(R200_SE_VTX_FMT_1,
+		      (2 << R200_VTX_TEX0_COMP_CNT_SHIFT));
+
+	OUT_ACCEL_REG(R200_PP_TXFILTER_0,
+		      R200_MAG_FILTER_LINEAR |
+		      R200_MIN_FILTER_LINEAR |
+		      R200_CLAMP_S_CLAMP_LAST |
+		      R200_CLAMP_T_CLAMP_LAST |
+		      R200_YUV_TO_RGB);
+	OUT_ACCEL_REG(R200_PP_TXFORMAT_0, txformat);
+	OUT_ACCEL_REG(R200_PP_TXFORMAT_X_0, 0);
+	OUT_ACCEL_REG(R200_PP_TXSIZE_0,
+		      (pPriv->w - 1) |
+		      ((pPriv->h - 1) << RADEON_TEX_VSIZE_SHIFT));
+	OUT_ACCEL_REG(R200_PP_TXPITCH_0, pPriv->src_pitch - 32);
+
+	OUT_ACCEL_REG(R200_PP_TXOFFSET_0, pPriv->src_offset);
+
+	OUT_ACCEL_REG(R200_PP_TXCBLEND_0,
+		      R200_TXC_ARG_A_ZERO |
+		      R200_TXC_ARG_B_ZERO |
+		      R200_TXC_ARG_C_R0_COLOR |
+		      R200_TXC_OP_MADD);
+	OUT_ACCEL_REG(R200_PP_TXCBLEND2_0,
+		      R200_TXC_CLAMP_0_1 | R200_TXC_OUTPUT_REG_R0);
+	OUT_ACCEL_REG(R200_PP_TXABLEND_0,
+		      R200_TXA_ARG_A_ZERO |
+		      R200_TXA_ARG_B_ZERO |
+		      R200_TXA_ARG_C_R0_ALPHA |
+		      R200_TXA_OP_MADD);
+	OUT_ACCEL_REG(R200_PP_TXABLEND2_0,
+		      R200_TXA_CLAMP_0_1 | R200_TXA_OUTPUT_REG_R0);
+	FINISH_ACCEL();
+    }
+
+    if (pPriv->vsync) {
+	xf86CrtcPtr crtc = radeon_xv_pick_best_crtc(pScrn,
+						    pPriv->drw_x,
+						    pPriv->drw_x + pPriv->dst_w,
+						    pPriv->drw_y,
+						    pPriv->drw_y + pPriv->dst_h);
+	if (crtc) {
+	    RADEONCrtcPrivatePtr radeon_crtc = crtc->driver_private;
+
+	    FUNC_NAME(RADEONWaitForVLine)(pScrn, pPixmap,
+					  radeon_crtc->crtc_id,
+					  pPriv->drw_y - crtc->y,
+					  (pPriv->drw_y - crtc->y) + pPriv->dst_h);
 	}
+    }
+    /*
+     * Rendering of the actual polygon is done in two different
+     * ways depending on chip generation:
+     *
+     * < R300:
+     *
+     *     These chips can render a rectangle in one pass, so
+     *     handling is pretty straight-forward.
+     *
+     * >= R300:
+     *
+     *     These chips can accept a quad, but will render it as
+     *     two triangles, which results in a diagonal tear. Instead
+     *     we render a single, large triangle and use the scissor
+     *     functionality to restrict it to the desired rectangle.
+     *     Due to guardband limits on r3xx/r4xx, we can only use
+     *     the single triangle up to 2880 pixels; above that we
+     *     render as a quad.
+     */
 
-	/* These registers define the number, type, and location of data submitted
-	 * to the PVS unit of GA input (when PVS is disabled)
-	 * DST_VEC_LOC is the slot in the PVS input vector memory when PVS/TCL is
-	 * enabled.  This memory provides the imputs to the vertex shader program
-	 * and ordering is not important.  When PVS/TCL is disabled, this field maps
-	 * directly to the GA input memory and the order is signifigant.  In
-	 * PVS_BYPASS mode the order is as follows:
-	 * Position
-	 * Point Size
-	 * Color 0-3
-	 * Textures 0-7
-	 * Fog
-	 */
-	if (pPriv->bicubic_enabled) {
-	    OUT_ACCEL_REG(R300_VAP_PROG_STREAM_CNTL_0,
-			  ((R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_0_SHIFT) |
-			   (0 << R300_SKIP_DWORDS_0_SHIFT) |
-			   (0 << R300_DST_VEC_LOC_0_SHIFT) |
-			   R300_SIGNED_0 |
-			   (R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_1_SHIFT) |
-			   (0 << R300_SKIP_DWORDS_1_SHIFT) |
-			   (6 << R300_DST_VEC_LOC_1_SHIFT) |
-			   R300_SIGNED_1));
-	    OUT_ACCEL_REG(R300_VAP_PROG_STREAM_CNTL_1,
-			  ((R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_2_SHIFT) |
-			   (0 << R300_SKIP_DWORDS_2_SHIFT) |
-			   (7 << R300_DST_VEC_LOC_2_SHIFT) |
-			   R300_LAST_VEC_2 |
-			   R300_SIGNED_2));
+    while (nBox--) {
+	int srcX, srcY, srcw, srch;
+	int dstX, dstY, dstw, dsth;
+	dstX = pBox->x1 + dstxoff;
+	dstY = pBox->y1 + dstyoff;
+	dstw = pBox->x2 - pBox->x1;
+	dsth = pBox->y2 - pBox->y1;
+
+	srcX = ((pBox->x1 - pPriv->drw_x) *
+		pPriv->src_w) / pPriv->dst_w;
+	srcY = ((pBox->y1 - pPriv->drw_y) *
+		pPriv->src_h) / pPriv->dst_h;
+
+	srcw = (pPriv->src_w * dstw) / pPriv->dst_w;
+	srch = (pPriv->src_h * dsth) / pPriv->dst_h;
+
+#ifdef ACCEL_CP
+	BEGIN_RING(3 * vtx_count + 2);
+	OUT_RING(CP_PACKET3(R200_CP_PACKET3_3D_DRAW_IMMD_2,
+			    3 * vtx_count));
+	OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_RECT_LIST |
+		 RADEON_CP_VC_CNTL_PRIM_WALK_RING |
+		 (3 << RADEON_CP_VC_CNTL_NUM_SHIFT));
+#else /* ACCEL_CP */
+	BEGIN_ACCEL(1 + vtx_count * 3);
+	OUT_ACCEL_REG(RADEON_SE_VF_CNTL, (RADEON_VF_PRIM_TYPE_RECTANGLE_LIST |
+					  RADEON_VF_PRIM_WALK_DATA |
+					  (3 << RADEON_VF_NUM_VERTICES_SHIFT)));
+
+#endif
+	if (isplanar) {
+	    /*
+	     * Just render a rect (using three coords).
+	     * Filter is a bit of a misnomer, it's just texcoords...
+	     */
+	    VTX_OUT_FILTER((float)dstX,                                (float)(dstY + dsth),
+			   (float)srcX / info->accel_state->texW[0],          (float)(srcY + srch) / info->accel_state->texH[0],
+			   (float)srcX / info->accel_state->texW[0],          (float)(srcY + srch) / info->accel_state->texH[0]);
+	    VTX_OUT_FILTER((float)(dstX + dstw),                       (float)(dstY + dsth),
+			   (float)(srcX + srcw) / info->accel_state->texW[0], (float)(srcY + srch) / info->accel_state->texH[0],
+			   (float)(srcX + srcw) / info->accel_state->texW[0], (float)(srcY + srch) / info->accel_state->texH[0]);
+	    VTX_OUT_FILTER((float)(dstX + dstw),                       (float)dstY,
+			   (float)(srcX + srcw) / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0],
+			   (float)(srcX + srcw) / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0]);
 	} else {
-	    OUT_ACCEL_REG(R300_VAP_PROG_STREAM_CNTL_0,
-			  ((R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_0_SHIFT) |
-			   (0 << R300_SKIP_DWORDS_0_SHIFT) |
-			   (0 << R300_DST_VEC_LOC_0_SHIFT) |
-			   R300_SIGNED_0 |
-			   (R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_1_SHIFT) |
-			   (0 << R300_SKIP_DWORDS_1_SHIFT) |
-			   (6 << R300_DST_VEC_LOC_1_SHIFT) |
-			   R300_LAST_VEC_1 |
-			   R300_SIGNED_1));
+	    /*
+	     * Just render a rect (using three coords).
+	     */
+	    VTX_OUT((float)dstX,                                       (float)(dstY + dsth),
+		    (float)srcX / info->accel_state->texW[0],          (float)(srcY + srch) / info->accel_state->texH[0]);
+	    VTX_OUT((float)(dstX + dstw),                              (float)(dstY + dsth),
+		    (float)(srcX + srcw) / info->accel_state->texW[0], (float)(srcY + srch) / info->accel_state->texH[0]);
+	    VTX_OUT((float)(dstX + dstw),                              (float)dstY,
+		    (float)(srcX + srcw) / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0]);
 	}
 
-	/* load the vertex shader
-	 * We pre-load vertex programs in RADEONInit3DEngine():
-	 * - exa mask/Xv bicubic
-	 * - exa no mask
-	 * - Xv
-	 * Here we select the offset of the vertex program we want to use
-	 */
-	if (info->accel_state->has_tcl) {
-	    if (pPriv->bicubic_enabled) {
-		OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_0,
-			      ((0 << R300_PVS_FIRST_INST_SHIFT) |
-			       (2 << R300_PVS_XYZW_VALID_INST_SHIFT) |
-			       (2 << R300_PVS_LAST_INST_SHIFT)));
-		OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_1,
-			      (2 << R300_PVS_LAST_VTX_SRC_INST_SHIFT));
-	    } else {
-		OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_0,
-			      ((5 << R300_PVS_FIRST_INST_SHIFT) |
-			       (6 << R300_PVS_XYZW_VALID_INST_SHIFT) |
-			       (6 << R300_PVS_LAST_INST_SHIFT)));
-		OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_1,
-			      (6 << R300_PVS_LAST_VTX_SRC_INST_SHIFT));
-	    }
+#ifdef ACCEL_CP
+	ADVANCE_RING();
+#else
+	FINISH_ACCEL();
+#endif /* !ACCEL_CP */
+
+	pBox++;
+    }
+
+    BEGIN_ACCEL(1);
+    OUT_ACCEL_REG(RADEON_WAIT_UNTIL, RADEON_WAIT_3D_IDLECLEAN);
+    FINISH_ACCEL();
+
+    DamageDamageRegion(pPriv->pDraw, &pPriv->clip);
+}
+
+static void
+FUNC_NAME(R300DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
+{
+    RADEONInfoPtr info = RADEONPTR(pScrn);
+    PixmapPtr pPixmap = pPriv->pPixmap;
+    uint32_t txfilter, txformat0, txformat1, txoffset, txpitch;
+    uint32_t dst_offset, dst_pitch, dst_format;
+    uint32_t txenable, colorpitch;
+    uint32_t output_fmt;
+    Bool isplanar = FALSE;
+    int dstxoff, dstyoff, pixel_shift, vtx_count;
+    BoxPtr pBox = REGION_RECTS(&pPriv->clip);
+    int nBox = REGION_NUM_RECTS(&pPriv->clip);
+    ACCEL_PREAMBLE();
+
+    pixel_shift = pPixmap->drawable.bitsPerPixel >> 4;
+
+#ifdef USE_EXA
+    if (info->useEXA) {
+	dst_offset = exaGetPixmapOffset(pPixmap) + info->fbLocation + pScrn->fbOffset;
+	dst_pitch = exaGetPixmapPitch(pPixmap);
+    } else
+#endif
+	{
+	    dst_offset = (pPixmap->devPrivate.ptr - info->FB) +
+		info->fbLocation + pScrn->fbOffset;
+	    dst_pitch = pPixmap->devKind;
 	}
 
-	/* Position and one set of 2 texture coordinates */
-	OUT_ACCEL_REG(R300_VAP_OUT_VTX_FMT_0, R300_VTX_POS_PRESENT);
-	if (pPriv->bicubic_enabled)
-	    OUT_ACCEL_REG(R300_VAP_OUT_VTX_FMT_1, ((2 << R300_TEX_0_COMP_CNT_SHIFT) |
-						   (2 << R300_TEX_1_COMP_CNT_SHIFT)));
+#ifdef COMPOSITE
+    dstxoff = -pPixmap->screen_x + pPixmap->drawable.x;
+    dstyoff = -pPixmap->screen_y + pPixmap->drawable.y;
+#else
+    dstxoff = 0;
+    dstyoff = 0;
+#endif
+
+#ifdef USE_EXA
+    if (info->useEXA) {
+	RADEON_SWITCH_TO_3D();
+    } else
+#endif
+	{
+	    BEGIN_ACCEL(2);
+	    OUT_ACCEL_REG(R300_RB3D_DSTCACHE_CTLSTAT, R300_DC_FLUSH_3D);
+	    /* We must wait for 3d to idle, in case source was just written as a dest. */
+	    OUT_ACCEL_REG(RADEON_WAIT_UNTIL,
+			  RADEON_WAIT_HOST_IDLECLEAN |
+			  RADEON_WAIT_2D_IDLECLEAN |
+			  RADEON_WAIT_3D_IDLECLEAN |
+			  RADEON_WAIT_DMA_GUI_IDLE);
+	    FINISH_ACCEL();
+
+	    if (!info->accel_state->XInited3D)
+		RADEONInit3DEngine(pScrn);
+	}
+
+    if (pPriv->bicubic_enabled)
+	vtx_count = 6;
+    else
+	vtx_count = 4;
+
+    switch (pPixmap->drawable.bitsPerPixel) {
+    case 16:
+	if (pPixmap->drawable.depth == 15)
+	    dst_format = R300_COLORFORMAT_ARGB1555;
 	else
-	    OUT_ACCEL_REG(R300_VAP_OUT_VTX_FMT_1, (2 << R300_TEX_0_COMP_CNT_SHIFT));
+	    dst_format = R300_COLORFORMAT_RGB565;
+	break;
+    case 32:
+	dst_format = R300_COLORFORMAT_ARGB8888;
+	break;
+    default:
+	return;
+    }
 
-	OUT_ACCEL_REG(R300_US_OUT_FMT_0, output_fmt);
-	FINISH_ACCEL();
+    output_fmt = (R300_OUT_FMT_C4_8 |
+		  R300_OUT_FMT_C0_SEL_BLUE |
+		  R300_OUT_FMT_C1_SEL_GREEN |
+		  R300_OUT_FMT_C2_SEL_RED |
+		  R300_OUT_FMT_C3_SEL_ALPHA);
 
-	/* setup pixel shader */
-	if (IS_R300_3D) {
-	    if (pPriv->bicubic_enabled) {
-		BEGIN_ACCEL(79);
-
-		/* 4 components: 2 for tex0 and 2 for tex1 */
-		OUT_ACCEL_REG(R300_RS_COUNT, ((4 << R300_RS_COUNT_IT_COUNT_SHIFT) |
-						   R300_RS_COUNT_HIRES_EN));
-
-		/* R300_INST_COUNT_RS - highest RS instruction used */
-		OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(1) | R300_TX_OFFSET_RS(6));
-
-		/* Pixel stack frame size. */
-		OUT_ACCEL_REG(R300_US_PIXSIZE, 5);
-
-		/* Indirection levels */
-		OUT_ACCEL_REG(R300_US_CONFIG, ((2 << R300_NLEVEL_SHIFT) |
-							R300_FIRST_TEX));
-
-		/* Set nodes. */
-		OUT_ACCEL_REG(R300_US_CODE_OFFSET, (R300_ALU_CODE_OFFSET(0) |
-							R300_ALU_CODE_SIZE(14) |
-							R300_TEX_CODE_OFFSET(0) |
-							R300_TEX_CODE_SIZE(6)));
-
-		/* Nodes are allocated highest first, but executed lowest first */
-		OUT_ACCEL_REG(R300_US_CODE_ADDR_0, 0);
-		OUT_ACCEL_REG(R300_US_CODE_ADDR_1, (R300_ALU_START(0) |
-							R300_ALU_SIZE(0) |
-							R300_TEX_START(0) |
-							R300_TEX_SIZE(0)));
-		OUT_ACCEL_REG(R300_US_CODE_ADDR_2, (R300_ALU_START(1) |
-							R300_ALU_SIZE(9) |
-							R300_TEX_START(1) |
-							R300_TEX_SIZE(0)));
-		OUT_ACCEL_REG(R300_US_CODE_ADDR_3, (R300_ALU_START(11) |
-							R300_ALU_SIZE(2) |
-							R300_TEX_START(2) |
-							R300_TEX_SIZE(3) |
-							R300_RGBA_OUT));
-
-		/* ** BICUBIC FP ** */
-
-		/* texcoord0 => temp0
-		 * texcoord1 => temp1 */
-
-		// first node
-		/* TEX temp2, temp1.rrr0, tex1, 1D */
-		OUT_ACCEL_REG(R300_US_TEX_INST(0), (R300_TEX_INST(R300_TEX_INST_LD) |
-						   R300_TEX_ID(1) |
-						   R300_TEX_SRC_ADDR(1) |
-						   R300_TEX_DST_ADDR(2)));
-
-		/* MOV temp1.r, temp1.ggg0 */
-		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(0), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
-						   R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_GGG) |
-						   R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
-						   R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0)));
-		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(0), (R300_ALU_RGB_ADDR0(1) |
-						   R300_ALU_RGB_ADDRD(1) |
-						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R)));
-		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(0), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
-						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
-						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
-						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
-		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(0), (R300_ALU_ALPHA_ADDRD(1) |
-						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+    colorpitch = dst_pitch >> pixel_shift;
+    colorpitch |= dst_format;
 
+    if (RADEONTilingEnabled(pScrn, pPixmap))
+	colorpitch |= R300_COLORTILE;
 
-		// second node
-		/* TEX temp1, temp1, tex1, 1D */
-		OUT_ACCEL_REG(R300_US_TEX_INST(1), (R300_TEX_INST(R300_TEX_INST_LD) |
-						   R300_TEX_ID(1) |
-						   R300_TEX_SRC_ADDR(1) |
-						   R300_TEX_DST_ADDR(1)));
-
-		/* MUL temp3.rg, temp2.ggg0, const0.rgb0 */
-		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(1), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
-						   R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_GGG) |
-						   R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
-						   R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0)));
-		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(1), (R300_ALU_RGB_ADDR0(2) |
-						   R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(0)) |
-						   R300_ALU_RGB_ADDRD(3) |
-						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
-		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(1), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
-						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
-						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
-						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
-		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(1), (R300_ALU_ALPHA_ADDRD(3) |
-						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+    if (pPriv->planar_hw && (pPriv->id == FOURCC_I420 || pPriv->id == FOURCC_YV12)) {
+	isplanar = TRUE;
+    }
 
+    if (isplanar) {
+	txformat1 = R300_TX_FORMAT_X8 | R300_TX_FORMAT_CACHE_HALF_REGION_0;
+	txpitch = pPriv->src_pitch;
+    } else {
+	if (pPriv->id == FOURCC_UYVY)
+	    txformat1 = R300_TX_FORMAT_YVYU422;
+	else
+	    txformat1 = R300_TX_FORMAT_VYUY422;
 
-		/* MUL temp2.rg, temp2.rrr0, const0.rgb */
-		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(2), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
-						   R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RRR) |
-						   R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
-						   R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0)));
-		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(2), (R300_ALU_RGB_ADDR0(2) |
-						   R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(0)) |
-						   R300_ALU_RGB_ADDRD(2) |
-						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
-		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(2), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
-						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
-						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
-						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
-		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(2), (R300_ALU_ALPHA_ADDRD(2) |
-						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+	txformat1 |= R300_TX_FORMAT_YUV_TO_RGB_CLAMP;
 
-		/* MAD temp4.rg, temp1.ggg0, const1.rgb, temp3.rgb0 */
-		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(3), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
-						   R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_GGG) |
-						   R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
-						   R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
-		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(3), (R300_ALU_RGB_ADDR0(1) |
-						   R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(1)) |
-						   R300_ALU_RGB_ADDR2(3) |
-						   R300_ALU_RGB_ADDRD(4) |
-						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
-		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(3), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
-						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
-						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
-						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
-		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(3), (R300_ALU_ALPHA_ADDRD(4) |
-						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+	/* pitch is in pixels */
+	txpitch = pPriv->src_pitch / 2;
+    }
+    txpitch -= 1;
+
+    txformat0 = ((((pPriv->w - 1) & 0x7ff) << R300_TXWIDTH_SHIFT) |
+		 (((pPriv->h - 1) & 0x7ff) << R300_TXHEIGHT_SHIFT) |
+		 R300_TXPITCH_EN);
+
+    info->accel_state->texW[0] = pPriv->w;
+    info->accel_state->texH[0] = pPriv->h;
+
+    txfilter = (R300_TX_CLAMP_S(R300_TX_CLAMP_CLAMP_LAST) |
+		R300_TX_CLAMP_T(R300_TX_CLAMP_CLAMP_LAST) |
+		R300_TX_MAG_FILTER_LINEAR |
+		R300_TX_MIN_FILTER_LINEAR |
+		(0 << R300_TX_ID_SHIFT));
+
+    txoffset = pPriv->src_offset;
+
+    BEGIN_ACCEL(6);
+    OUT_ACCEL_REG(R300_TX_FILTER0_0, txfilter);
+    OUT_ACCEL_REG(R300_TX_FILTER1_0, 0);
+    OUT_ACCEL_REG(R300_TX_FORMAT0_0, txformat0);
+    OUT_ACCEL_REG(R300_TX_FORMAT1_0, txformat1);
+    OUT_ACCEL_REG(R300_TX_FORMAT2_0, txpitch);
+    OUT_ACCEL_REG(R300_TX_OFFSET_0, txoffset);
+    FINISH_ACCEL();
 
-		/* MAD temp5.rg, temp1.ggg0, const1.rgb, temp2.rgb0 */
-		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(4), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
-						   R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_GGG) |
-						   R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
-						   R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
-		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(4), (R300_ALU_RGB_ADDR0(1) |
-						   R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(1)) |
-						   R300_ALU_RGB_ADDR2(2) |
-						   R300_ALU_RGB_ADDRD(5) |
-						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
-		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(4), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
-						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
-						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
-						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
-		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(4), (R300_ALU_ALPHA_ADDRD(5) |
-						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+    txenable = R300_TEX_0_ENABLE;
 
-		/* MAD temp3.rg, temp1.rrr0, const1.rgb, temp3.rgb0 */
-		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(5), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
-						   R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RRR) |
-						   R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
-						   R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
-		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(5), (R300_ALU_RGB_ADDR0(1) |
-						   R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(1)) |
-						   R300_ALU_RGB_ADDR2(3) |
-						   R300_ALU_RGB_ADDRD(3) |
-						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
-		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(5), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
-						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
-						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
-						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
-		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(5), (R300_ALU_ALPHA_ADDRD(3) |
-						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+    if (isplanar) {
+	txformat0 = ((((((pPriv->w + 1 ) >> 1) - 1) & 0x7ff) << R300_TXWIDTH_SHIFT) |
+		     (((((pPriv->h + 1 ) >> 1 ) - 1) & 0x7ff) << R300_TXHEIGHT_SHIFT) |
+		     R300_TXPITCH_EN);
+	txpitch = ((pPriv->src_pitch >> 1) + 63) & ~63;
+	txpitch -= 1;
+	txfilter = (R300_TX_CLAMP_S(R300_TX_CLAMP_CLAMP_LAST) |
+		    R300_TX_CLAMP_T(R300_TX_CLAMP_CLAMP_LAST) |
+		    R300_TX_MIN_FILTER_LINEAR |
+		    R300_TX_MAG_FILTER_LINEAR);
+
+	BEGIN_ACCEL(12);
+	OUT_ACCEL_REG(R300_TX_FILTER0_1, txfilter | (1 << R300_TX_ID_SHIFT));
+	OUT_ACCEL_REG(R300_TX_FILTER1_1, 0);
+	OUT_ACCEL_REG(R300_TX_FORMAT0_1, txformat0);
+	OUT_ACCEL_REG(R300_TX_FORMAT1_1, R300_TX_FORMAT_X8 | R300_TX_FORMAT_CACHE_FOURTH_REGION_2);
+	OUT_ACCEL_REG(R300_TX_FORMAT2_1, txpitch);
+	OUT_ACCEL_REG(R300_TX_OFFSET_1, txoffset + pPriv->planeu_offset);
+	OUT_ACCEL_REG(R300_TX_FILTER0_2, txfilter | (2 << R300_TX_ID_SHIFT));
+	OUT_ACCEL_REG(R300_TX_FILTER1_2, 0);
+	OUT_ACCEL_REG(R300_TX_FORMAT0_2, txformat0);
+	OUT_ACCEL_REG(R300_TX_FORMAT1_2, R300_TX_FORMAT_X8 | R300_TX_FORMAT_CACHE_FOURTH_REGION_3);
+	OUT_ACCEL_REG(R300_TX_FORMAT2_2, txpitch);
+	OUT_ACCEL_REG(R300_TX_OFFSET_2, txoffset + pPriv->planev_offset);
+	FINISH_ACCEL();
+	txenable |= R300_TEX_1_ENABLE | R300_TEX_2_ENABLE;
+    }
 
-		/* MAD temp1.rg, temp1.rrr0, const1.rgb, temp2.rgb0 */
-		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(6), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
-						   R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RRR) |
-						   R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
-						   R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
-		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(6), (R300_ALU_RGB_ADDR0(1) |
-						   R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(1)) |
-						   R300_ALU_RGB_ADDR2(2) |
-						   R300_ALU_RGB_ADDRD(1) |
-						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
-		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(6), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
-						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
-						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
-						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
-		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(6), (R300_ALU_ALPHA_ADDRD(1) |
-						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+    if (pPriv->bicubic_enabled) {
+	/* Size is 128x1 */
+	txformat0 = ((0x7f << R300_TXWIDTH_SHIFT) |
+		     (0x0 << R300_TXHEIGHT_SHIFT) |
+		     R300_TXPITCH_EN);
+	/* Format is FL_R16G16B16A16: four 16-bit float channels per texel */
+	txformat1 = R300_EASY_TX_FORMAT(Z, Y, X, W, FL_R16G16B16A16);
+	/* Pitch is 127 (128-1) */
+	txpitch = 0x7f;
+	/* Tex filter */
+	txfilter = (R300_TX_CLAMP_S(R300_TX_CLAMP_WRAP) |
+		    R300_TX_CLAMP_T(R300_TX_CLAMP_WRAP) |
+		    R300_TX_MIN_FILTER_NEAREST |
+		    R300_TX_MAG_FILTER_NEAREST |
+		    (1 << R300_TX_ID_SHIFT));
 
-		/* ADD temp1.rg, temp0.rgb0, temp1.rgb0 */
-		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(7), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
-						   R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
-						   R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
-						   R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
-		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(7), (R300_ALU_RGB_ADDR0(0) |
-						   R300_ALU_RGB_ADDR2(1) |
-						   R300_ALU_RGB_ADDRD(1) |
-						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
-		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(7), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
-						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
-						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
-						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
-		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(7), (R300_ALU_ALPHA_ADDRD(1) |
-						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+	BEGIN_ACCEL(6);
+	OUT_ACCEL_REG(R300_TX_FILTER0_1, txfilter);
+	OUT_ACCEL_REG(R300_TX_FILTER1_1, 0);
+	OUT_ACCEL_REG(R300_TX_FORMAT0_1, txformat0);
+	OUT_ACCEL_REG(R300_TX_FORMAT1_1, txformat1);
+	OUT_ACCEL_REG(R300_TX_FORMAT2_1, txpitch);
+	OUT_ACCEL_REG(R300_TX_OFFSET_1, pPriv->bicubic_src_offset);
+	FINISH_ACCEL();
 
-		/* ADD temp2.rg, temp0.rgb0, temp3.rgb0 */
-		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(8), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
-						   R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
-						   R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
-						   R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
-		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(8), (R300_ALU_RGB_ADDR0(0) |
-						   R300_ALU_RGB_ADDR2(3) |
-						   R300_ALU_RGB_ADDRD(2) |
-						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
-		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(8), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
-						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
-						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
-						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
-		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(8), (R300_ALU_ALPHA_ADDRD(2) |
-						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+	/* Enable tex 1 */
+	txenable |= R300_TEX_1_ENABLE;
+    }
 
-		/* ADD temp3.rg, temp0.rgb0, temp5.rgb0 */
-		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(9), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
-						   R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
-						   R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
-						   R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
-		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(9), (R300_ALU_RGB_ADDR0(0) |
-						   R300_ALU_RGB_ADDR2(5) |
-						   R300_ALU_RGB_ADDRD(3) |
-						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
-		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(9), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
-						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
-						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
-						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
-		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(9), (R300_ALU_ALPHA_ADDRD(3) |
-						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+    /* setup the VAP */
+    if (info->accel_state->has_tcl) {
+	if (pPriv->bicubic_enabled)
+	    BEGIN_ACCEL(7);
+	else
+	    BEGIN_ACCEL(6);
+    } else {
+	if (pPriv->bicubic_enabled)
+	    BEGIN_ACCEL(5);
+	else
+	    BEGIN_ACCEL(4);
+    }
 
-		/* ADD temp0.rg, temp0.rgb0, temp4.rgb0 */
-		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(10), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
-						   R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
-						   R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
-						   R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
-		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(10), (R300_ALU_RGB_ADDR0(0) |
-						   R300_ALU_RGB_ADDR2(4) |
-						   R300_ALU_RGB_ADDRD(0) |
-						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
-		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(10), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
+    /* These registers define the number, type, and location of data submitted
+     * to the PVS unit or GA input (when PVS is disabled).
+     * DST_VEC_LOC is the slot in the PVS input vector memory when PVS/TCL is
+     * enabled.  This memory provides the inputs to the vertex shader program
+     * and ordering is not important.  When PVS/TCL is disabled, this field maps
+     * directly to the GA input memory and the order is significant.  In
+     * PVS_BYPASS mode the order is as follows:
+     * Position
+     * Point Size
+     * Color 0-3
+     * Textures 0-7
+     * Fog
+     */
+    if (pPriv->bicubic_enabled) {
+	OUT_ACCEL_REG(R300_VAP_PROG_STREAM_CNTL_0,
+		      ((R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_0_SHIFT) |
+		       (0 << R300_SKIP_DWORDS_0_SHIFT) |
+		       (0 << R300_DST_VEC_LOC_0_SHIFT) |
+		       R300_SIGNED_0 |
+		       (R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_1_SHIFT) |
+		       (0 << R300_SKIP_DWORDS_1_SHIFT) |
+		       (6 << R300_DST_VEC_LOC_1_SHIFT) |
+		       R300_SIGNED_1));
+	OUT_ACCEL_REG(R300_VAP_PROG_STREAM_CNTL_1,
+		      ((R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_2_SHIFT) |
+		       (0 << R300_SKIP_DWORDS_2_SHIFT) |
+		       (7 << R300_DST_VEC_LOC_2_SHIFT) |
+		       R300_LAST_VEC_2 |
+		       R300_SIGNED_2));
+    } else {
+	OUT_ACCEL_REG(R300_VAP_PROG_STREAM_CNTL_0,
+		      ((R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_0_SHIFT) |
+		       (0 << R300_SKIP_DWORDS_0_SHIFT) |
+		       (0 << R300_DST_VEC_LOC_0_SHIFT) |
+		       R300_SIGNED_0 |
+		       (R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_1_SHIFT) |
+		       (0 << R300_SKIP_DWORDS_1_SHIFT) |
+		       (6 << R300_DST_VEC_LOC_1_SHIFT) |
+		       R300_LAST_VEC_1 |
+		       R300_SIGNED_1));
+    }
+
+    /* load the vertex shader
+     * We pre-load vertex programs in RADEONInit3DEngine():
+     * - exa mask/Xv bicubic
+     * - exa no mask
+     * - Xv
+     * Here we select the offset of the vertex program we want to use
+     */
+    if (info->accel_state->has_tcl) {
+	if (pPriv->bicubic_enabled) {
+	    OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_0,
+			  ((0 << R300_PVS_FIRST_INST_SHIFT) |
+			   (2 << R300_PVS_XYZW_VALID_INST_SHIFT) |
+			   (2 << R300_PVS_LAST_INST_SHIFT)));
+	    OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_1,
+			  (2 << R300_PVS_LAST_VTX_SRC_INST_SHIFT));
+	} else {
+	    OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_0,
+			  ((5 << R300_PVS_FIRST_INST_SHIFT) |
+			   (6 << R300_PVS_XYZW_VALID_INST_SHIFT) |
+			   (6 << R300_PVS_LAST_INST_SHIFT)));
+	    OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_1,
+			  (6 << R300_PVS_LAST_VTX_SRC_INST_SHIFT));
+	}
+    }
+
+    /* Position and one set of 2 texture coordinates */
+    OUT_ACCEL_REG(R300_VAP_OUT_VTX_FMT_0, R300_VTX_POS_PRESENT);
+    if (pPriv->bicubic_enabled)
+	OUT_ACCEL_REG(R300_VAP_OUT_VTX_FMT_1, ((2 << R300_TEX_0_COMP_CNT_SHIFT) |
+					       (2 << R300_TEX_1_COMP_CNT_SHIFT)));
+    else
+	OUT_ACCEL_REG(R300_VAP_OUT_VTX_FMT_1, (2 << R300_TEX_0_COMP_CNT_SHIFT));
+
+    OUT_ACCEL_REG(R300_US_OUT_FMT_0, output_fmt);
+    FINISH_ACCEL();
+
+    /* setup pixel shader */
+    if (pPriv->bicubic_enabled) {
+	BEGIN_ACCEL(79);
+
+	/* 4 components: 2 for tex0 and 2 for tex1 */
+	OUT_ACCEL_REG(R300_RS_COUNT, ((4 << R300_RS_COUNT_IT_COUNT_SHIFT) |
+				      R300_RS_COUNT_HIRES_EN));
+
+	/* R300_INST_COUNT_RS - highest RS instruction used */
+	OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(1) | R300_TX_OFFSET_RS(6));
+
+	/* Pixel stack frame size. */
+	OUT_ACCEL_REG(R300_US_PIXSIZE, 5);
+
+	/* Indirection levels */
+	OUT_ACCEL_REG(R300_US_CONFIG, ((2 << R300_NLEVEL_SHIFT) |
+				       R300_FIRST_TEX));
+
+	/* Set nodes. */
+	OUT_ACCEL_REG(R300_US_CODE_OFFSET, (R300_ALU_CODE_OFFSET(0) |
+					    R300_ALU_CODE_SIZE(14) |
+					    R300_TEX_CODE_OFFSET(0) |
+					    R300_TEX_CODE_SIZE(6)));
+
+	/* Nodes are allocated highest first, but executed lowest first */
+	OUT_ACCEL_REG(R300_US_CODE_ADDR_0, 0);
+	OUT_ACCEL_REG(R300_US_CODE_ADDR_1, (R300_ALU_START(0) |
+					    R300_ALU_SIZE(0) |
+					    R300_TEX_START(0) |
+					    R300_TEX_SIZE(0)));
+	OUT_ACCEL_REG(R300_US_CODE_ADDR_2, (R300_ALU_START(1) |
+					    R300_ALU_SIZE(9) |
+					    R300_TEX_START(1) |
+					    R300_TEX_SIZE(0)));
+	OUT_ACCEL_REG(R300_US_CODE_ADDR_3, (R300_ALU_START(11) |
+					    R300_ALU_SIZE(2) |
+					    R300_TEX_START(2) |
+					    R300_TEX_SIZE(3) |
+					    R300_RGBA_OUT));
+
+	/* ** BICUBIC FP ** */
+
+	/* texcoord0 => temp0
+	 * texcoord1 => temp1 */
+
+	// first node
+	/* TEX temp2, temp1.rrr0, tex1, 1D */
+	OUT_ACCEL_REG(R300_US_TEX_INST(0), (R300_TEX_INST(R300_TEX_INST_LD) |
+					    R300_TEX_ID(1) |
+					    R300_TEX_SRC_ADDR(1) |
+					    R300_TEX_DST_ADDR(2)));
+
+	/* MOV temp1.r, temp1.ggg0 */
+	OUT_ACCEL_REG(R300_US_ALU_RGB_INST(0), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
+						R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_GGG) |
+						R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
+						R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0)));
+	OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(0), (R300_ALU_RGB_ADDR0(1) |
+						R300_ALU_RGB_ADDRD(1) |
+						R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R)));
+	OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(0), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
+						  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
+						  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
+						  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+	OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(0), (R300_ALU_ALPHA_ADDRD(1) |
+						  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+
+
+	// second node
+	/* TEX temp1, temp1, tex1, 1D */
+	OUT_ACCEL_REG(R300_US_TEX_INST(1), (R300_TEX_INST(R300_TEX_INST_LD) |
+					    R300_TEX_ID(1) |
+					    R300_TEX_SRC_ADDR(1) |
+					    R300_TEX_DST_ADDR(1)));
+
+	/* MUL temp3.rg, temp2.ggg0, const0.rgb0 */
+	OUT_ACCEL_REG(R300_US_ALU_RGB_INST(1), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
+						R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_GGG) |
+						R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
+						R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0)));
+	OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(1), (R300_ALU_RGB_ADDR0(2) |
+						R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(0)) |
+						R300_ALU_RGB_ADDRD(3) |
+						R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
+	OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(1), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
+						  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
+						  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
+						  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+	OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(1), (R300_ALU_ALPHA_ADDRD(3) |
+						  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+
+
+	/* MUL temp2.rg, temp2.rrr0, const0.rgb */
+	OUT_ACCEL_REG(R300_US_ALU_RGB_INST(2), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
+						R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RRR) |
+						R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
+						R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0)));
+	OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(2), (R300_ALU_RGB_ADDR0(2) |
+						R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(0)) |
+						R300_ALU_RGB_ADDRD(2) |
+						R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
+	OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(2), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
+						  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
+						  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
+						  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+	OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(2), (R300_ALU_ALPHA_ADDRD(2) |
+						  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+
+	/* MAD temp4.rg, temp1.ggg0, const1.rgb, temp3.rgb0 */
+	OUT_ACCEL_REG(R300_US_ALU_RGB_INST(3), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
+						R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_GGG) |
+						R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
+						R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
+	OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(3), (R300_ALU_RGB_ADDR0(1) |
+						R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(1)) |
+						R300_ALU_RGB_ADDR2(3) |
+						R300_ALU_RGB_ADDRD(4) |
+						R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
+	OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(3), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
+						  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
+						  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
+						  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+	OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(3), (R300_ALU_ALPHA_ADDRD(4) |
+						  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+
+	/* MAD temp5.rg, temp1.ggg0, const1.rgb, temp2.rgb0 */
+	OUT_ACCEL_REG(R300_US_ALU_RGB_INST(4), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
+						R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_GGG) |
+						R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
+						R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
+	OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(4), (R300_ALU_RGB_ADDR0(1) |
+						R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(1)) |
+						R300_ALU_RGB_ADDR2(2) |
+						R300_ALU_RGB_ADDRD(5) |
+						R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
+	OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(4), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
+						  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
+						  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
+						  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+	OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(4), (R300_ALU_ALPHA_ADDRD(5) |
+						  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+
+	/* MAD temp3.rg, temp1.rrr0, const1.rgb, temp3.rgb0 */
+	OUT_ACCEL_REG(R300_US_ALU_RGB_INST(5), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
+						R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RRR) |
+						R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
+						R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
+	OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(5), (R300_ALU_RGB_ADDR0(1) |
+						R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(1)) |
+						R300_ALU_RGB_ADDR2(3) |
+						R300_ALU_RGB_ADDRD(3) |
+						R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
+	OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(5), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
+						  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
+						  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
+						  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+	OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(5), (R300_ALU_ALPHA_ADDRD(3) |
+						  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+
+	/* MAD temp1.rg, temp1.rrr0, const1.rgb, temp2.rgb0 */
+	OUT_ACCEL_REG(R300_US_ALU_RGB_INST(6), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
+						R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RRR) |
+						R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
+						R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
+	OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(6), (R300_ALU_RGB_ADDR0(1) |
+						R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(1)) |
+						R300_ALU_RGB_ADDR2(2) |
+						R300_ALU_RGB_ADDRD(1) |
+						R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
+	OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(6), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
+						  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
+						  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
+						  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+	OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(6), (R300_ALU_ALPHA_ADDRD(1) |
+						  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+
+	/* ADD temp1.rg, temp0.rgb0, temp1.rgb0 */
+	OUT_ACCEL_REG(R300_US_ALU_RGB_INST(7), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
+						R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
+						R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
+						R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
+	OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(7), (R300_ALU_RGB_ADDR0(0) |
+						R300_ALU_RGB_ADDR2(1) |
+						R300_ALU_RGB_ADDRD(1) |
+						R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
+	OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(7), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
+						  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
+						  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
+						  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+	OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(7), (R300_ALU_ALPHA_ADDRD(1) |
+						  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+
+	/* ADD temp2.rg, temp0.rgb0, temp3.rgb0 */
+	OUT_ACCEL_REG(R300_US_ALU_RGB_INST(8), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
+						R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
+						R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
+						R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
+	OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(8), (R300_ALU_RGB_ADDR0(0) |
+						R300_ALU_RGB_ADDR2(3) |
+						R300_ALU_RGB_ADDRD(2) |
+						R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
+	OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(8), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
+						  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
+						  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
+						  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+	OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(8), (R300_ALU_ALPHA_ADDRD(2) |
+						  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+
+	/* ADD temp3.rg, temp0.rgb0, temp5.rgb0 */
+	OUT_ACCEL_REG(R300_US_ALU_RGB_INST(9), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
+						R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
+						R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
+						R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
+	OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(9), (R300_ALU_RGB_ADDR0(0) |
+						R300_ALU_RGB_ADDR2(5) |
+						R300_ALU_RGB_ADDRD(3) |
+						R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
+	OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(9), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
+						  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
+						  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
+						  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+	OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(9), (R300_ALU_ALPHA_ADDRD(3) |
+						  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+
+	/* ADD temp0.rg, temp0.rgb0, temp4.rgb0 */
+	OUT_ACCEL_REG(R300_US_ALU_RGB_INST(10), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
+						 R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
+						 R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
+						 R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
+	OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(10), (R300_ALU_RGB_ADDR0(0) |
+						 R300_ALU_RGB_ADDR2(4) |
+						 R300_ALU_RGB_ADDRD(0) |
+						 R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
+	OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(10), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
 						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
 						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
 						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
-		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(10), (R300_ALU_ALPHA_ADDRD(0) |
+	OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(10), (R300_ALU_ALPHA_ADDRD(0) |
 						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
 
 
-		// third node
-		/* TEX temp4, temp1.rg--, tex0, 1D */
-		OUT_ACCEL_REG(R300_US_TEX_INST(2), (R300_TEX_INST(R300_TEX_INST_LD) |
-						   R300_TEX_ID(0) |
-						   R300_TEX_SRC_ADDR(1) |
-						   R300_TEX_DST_ADDR(4)));
-
-		/* TEX temp3, temp3.rg--, tex0, 1D */
-		OUT_ACCEL_REG(R300_US_TEX_INST(3), (R300_TEX_INST(R300_TEX_INST_LD) |
-						   R300_TEX_ID(0) |
-						   R300_TEX_SRC_ADDR(3) |
-						   R300_TEX_DST_ADDR(3)));
-
-		/* TEX temp5, temp2.rg--, tex0, 1D */
-		OUT_ACCEL_REG(R300_US_TEX_INST(4), (R300_TEX_INST(R300_TEX_INST_LD) |
-						   R300_TEX_ID(0) |
-						   R300_TEX_SRC_ADDR(2) |
-						   R300_TEX_DST_ADDR(5)));
-
-		/* TEX temp0, temp0.rg--, tex0, 1D */
-		OUT_ACCEL_REG(R300_US_TEX_INST(5), (R300_TEX_INST(R300_TEX_INST_LD) |
-						   R300_TEX_ID(0) |
-						   R300_TEX_SRC_ADDR(0) |
-						   R300_TEX_DST_ADDR(0)));
-
-		/* LRP temp3, temp1.bbbb, temp4, temp3 ->
-		 * - PRESUB temps, temp4 - temp3
-		 * - MAD temp3, temp1.bbbb, temps, temp3 */
-		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(11), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
-						   R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC2_BBB) |
-						   R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRCP_RGB) |
-						   R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) |
-						   R300_ALU_RGB_SRCP_OP(R300_ALU_RGB_SRCP_OP_RGB1_MINUS_RGB0)));
-		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(11), (R300_ALU_RGB_ADDR0(3) |
-						   R300_ALU_RGB_ADDR1(4) |
-						   R300_ALU_RGB_ADDR2(1) |
-						   R300_ALU_RGB_ADDRD(3) |
-						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
-		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(11), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
+	// third node
+	/* TEX temp4, temp1.rg--, tex0, 1D */
+	OUT_ACCEL_REG(R300_US_TEX_INST(2), (R300_TEX_INST(R300_TEX_INST_LD) |
+					    R300_TEX_ID(0) |
+					    R300_TEX_SRC_ADDR(1) |
+					    R300_TEX_DST_ADDR(4)));
+
+	/* TEX temp3, temp3.rg--, tex0, 1D */
+	OUT_ACCEL_REG(R300_US_TEX_INST(3), (R300_TEX_INST(R300_TEX_INST_LD) |
+					    R300_TEX_ID(0) |
+					    R300_TEX_SRC_ADDR(3) |
+					    R300_TEX_DST_ADDR(3)));
+
+	/* TEX temp5, temp2.rg--, tex0, 1D */
+	OUT_ACCEL_REG(R300_US_TEX_INST(4), (R300_TEX_INST(R300_TEX_INST_LD) |
+					    R300_TEX_ID(0) |
+					    R300_TEX_SRC_ADDR(2) |
+					    R300_TEX_DST_ADDR(5)));
+
+	/* TEX temp0, temp0.rg--, tex0, 1D */
+	OUT_ACCEL_REG(R300_US_TEX_INST(5), (R300_TEX_INST(R300_TEX_INST_LD) |
+					    R300_TEX_ID(0) |
+					    R300_TEX_SRC_ADDR(0) |
+					    R300_TEX_DST_ADDR(0)));
+
+	/* LRP temp3, temp1.bbbb, temp4, temp3 ->
+	 * - PRESUB temps, temp4 - temp3
+	 * - MAD temp3, temp1.bbbb, temps, temp3 */
+	OUT_ACCEL_REG(R300_US_ALU_RGB_INST(11), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
+						 R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC2_BBB) |
+						 R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRCP_RGB) |
+						 R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) |
+						 R300_ALU_RGB_SRCP_OP(R300_ALU_RGB_SRCP_OP_RGB1_MINUS_RGB0)));
+	OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(11), (R300_ALU_RGB_ADDR0(3) |
+						 R300_ALU_RGB_ADDR1(4) |
+						 R300_ALU_RGB_ADDR2(1) |
+						 R300_ALU_RGB_ADDRD(3) |
+						 R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
+	OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(11), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
 						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC2_B) |
 						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_SRCP_A) |
 						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_SRC0_A)));
-		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(11), (R300_ALU_ALPHA_ADDR0(3) |
+	OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(11), (R300_ALU_ALPHA_ADDR0(3) |
 						   R300_ALU_ALPHA_ADDR1(4) |
 						   R300_ALU_ALPHA_ADDR2(1) |
 						   R300_ALU_ALPHA_ADDRD(3) |
 						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_A)));
 
-		/* LRP temp0, temp1.bbbb, temp5, temp0 ->
-		 * - PRESUB temps, temp5 - temp0
-		 * - MAD temp0, temp1.bbbb, temps, temp0 */
-		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(12), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
-						   R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC2_BBB) |
-						   R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRCP_RGB) |
-						   R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) |
-						   R300_ALU_RGB_SRCP_OP(R300_ALU_RGB_SRCP_OP_RGB1_MINUS_RGB0) |
-						   R300_ALU_RGB_INSERT_NOP));
-		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(12), (R300_ALU_RGB_ADDR0(0) |
-						   R300_ALU_RGB_ADDR1(5) |
-						   R300_ALU_RGB_ADDR2(1) |
-						   R300_ALU_RGB_ADDRD(0) |
-						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
-		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(12), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
+	/* LRP temp0, temp1.bbbb, temp5, temp0 ->
+	 * - PRESUB temps, temp5 - temp0
+	 * - MAD temp0, temp1.bbbb, temps, temp0 */
+	OUT_ACCEL_REG(R300_US_ALU_RGB_INST(12), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
+						 R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC2_BBB) |
+						 R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRCP_RGB) |
+						 R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) |
+						 R300_ALU_RGB_SRCP_OP(R300_ALU_RGB_SRCP_OP_RGB1_MINUS_RGB0) |
+						 R300_ALU_RGB_INSERT_NOP));
+	OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(12), (R300_ALU_RGB_ADDR0(0) |
+						 R300_ALU_RGB_ADDR1(5) |
+						 R300_ALU_RGB_ADDR2(1) |
+						 R300_ALU_RGB_ADDRD(0) |
+						 R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
+	OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(12), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
 						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC2_B) |
 						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_SRCP_A) |
 						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_SRC0_A)));
-		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(12), (R300_ALU_ALPHA_ADDR0(0) |
+	OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(12), (R300_ALU_ALPHA_ADDR0(0) |
 						   R300_ALU_ALPHA_ADDR1(5) |
 						   R300_ALU_ALPHA_ADDR2(1) |
 						   R300_ALU_ALPHA_ADDRD(0) |
 						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_A)));
 
-		/* LRP output, temp2.bbbb, temp3, temp0 ->
-		 * - PRESUB temps, temp3 - temp0
-		 * - MAD output, temp2.bbbb, temps, temp0 */
-		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(13), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
-						   R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC2_BBB) |
-						   R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRCP_RGB) |
-						   R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) |
-						   R300_ALU_RGB_SRCP_OP(R300_ALU_RGB_SRCP_OP_RGB1_MINUS_RGB0)));
-		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(13), (R300_ALU_RGB_ADDR0(0) |
-						   R300_ALU_RGB_ADDR1(3) |
-						   R300_ALU_RGB_ADDR2(2) |
-						   R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_RGB)));
-		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(13), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
+	/* LRP output, temp2.bbbb, temp3, temp0 ->
+	 * - PRESUB temps, temp3 - temp0
+	 * - MAD output, temp2.bbbb, temps, temp0 */
+	OUT_ACCEL_REG(R300_US_ALU_RGB_INST(13), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
+						 R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC2_BBB) |
+						 R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRCP_RGB) |
+						 R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) |
+						 R300_ALU_RGB_SRCP_OP(R300_ALU_RGB_SRCP_OP_RGB1_MINUS_RGB0)));
+	OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(13), (R300_ALU_RGB_ADDR0(0) |
+						 R300_ALU_RGB_ADDR1(3) |
+						 R300_ALU_RGB_ADDR2(2) |
+						 R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_RGB)));
+	OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(13), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
 						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC2_B) |
 						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_SRCP_A) |
 						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_SRC0_A)));
-		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(13), (R300_ALU_ALPHA_ADDR0(0) |
+	OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(13), (R300_ALU_ALPHA_ADDR0(0) |
 						   R300_ALU_ALPHA_ADDR1(3) |
 						   R300_ALU_ALPHA_ADDR2(2) |
 						   R300_ALU_ALPHA_OMASK(R300_ALU_ALPHA_MASK_A)));
 
-		/* Shader constants. */
-		OUT_ACCEL_REG(R300_US_ALU_CONST_R(0), F_TO_24(1.0/(float)pPriv->w));
-		OUT_ACCEL_REG(R300_US_ALU_CONST_G(0), 0);
-		OUT_ACCEL_REG(R300_US_ALU_CONST_B(0), 0);
-		OUT_ACCEL_REG(R300_US_ALU_CONST_A(0), 0);
-
-		OUT_ACCEL_REG(R300_US_ALU_CONST_R(1), 0);
-		OUT_ACCEL_REG(R300_US_ALU_CONST_G(1), F_TO_24(1.0/(float)pPriv->h));
-		OUT_ACCEL_REG(R300_US_ALU_CONST_B(1), 0);
-		OUT_ACCEL_REG(R300_US_ALU_CONST_A(1), 0);
-
-		FINISH_ACCEL();
-	    } else if (isplanar) {
-	    /*
-	     * y' = y - .0625
-	     * u' = u - .5
-	     * v' = v - .5;
-	     *
-	     * r = 1.1643 * y' + 0.0     * u' + 1.5958  * v'
-	     * g = 1.1643 * y' - 0.39173 * u' - 0.81290 * v'
-	     * b = 1.1643 * y' + 2.017   * u' + 0.0     * v'
-	     *
-	     * DP3 might look like the straightforward solution
-	     * but we'd need to move the texture yuv values in
-	     * the same reg for this to work. Therefore use MADs.
-	     * Brightness just adds to the off constant.
-	     * Contrast is multiplication of luminance.
-	     * Saturation and hue change the u and v coeffs.
-	     * Default values (before adjustments - depend on colorspace):
-	     * yco = 1.1643
-	     * uco = 0, -0.39173, 2.017
-	     * vco = 1.5958, -0.8129, 0
-	     * off = -0.0625 * yco + -0.5 * uco[r] + -0.5 * vco[r],
-	     *       -0.0625 * yco + -0.5 * uco[g] + -0.5 * vco[g],
-	     *       -0.0625 * yco + -0.5 * uco[b] + -0.5 * vco[b],
-	     *
-	     * temp = MAD(yco, yuv.yyyy, off)
-	     * temp = MAD(uco, yuv.uuuu, temp)
-	     * result = MAD(vco, yuv.vvvv, temp)
-	     */
-	     /* TODO: don't recalc consts always */
-		const float Loff = -0.0627;
-		const float Coff = -0.502;
-		float uvcosf, uvsinf;
-		float yco;
-		float uco[3], vco[3], off[3];
-		float bright, cont, gamma;
-		int ref = pPriv->transform_index;
-		Bool needgamma = FALSE;
-
-		cont = RTFContrast(pPriv->contrast);
-		bright = RTFBrightness(pPriv->brightness);
-		gamma = (float)pPriv->gamma / 1000.0;
-		uvcosf = RTFSaturation(pPriv->saturation) * cos(RTFHue(pPriv->hue));
-		uvsinf = RTFSaturation(pPriv->saturation) * sin(RTFHue(pPriv->hue));
-		/* overlay video also does pre-gamma contrast/sat adjust, should we? */
-
-		yco = trans[ref].RefLuma * cont;
-		uco[0] = -trans[ref].RefRCr * uvsinf;
-		uco[1] = trans[ref].RefGCb * uvcosf - trans[ref].RefGCr * uvsinf;
-		uco[2] = trans[ref].RefBCb * uvcosf;
-		vco[0] = trans[ref].RefRCr * uvcosf;
-		vco[1] = trans[ref].RefGCb * uvsinf + trans[ref].RefGCr * uvcosf;
-		vco[2] = trans[ref].RefBCb * uvsinf;
-		off[0] = Loff * yco + Coff * (uco[0] + vco[0]) + bright;
-		off[1] = Loff * yco + Coff * (uco[1] + vco[1]) + bright;
-		off[2] = Loff * yco + Coff * (uco[2] + vco[2]) + bright;
-
-		if (gamma != 1.0) {
-			needgamma = TRUE;
-			/* note: gamma correction is out = in ^ gamma;
-			   gpu can only do LG2/EX2 therefore we transform into
-			   in ^ gamma = 2 ^ (log2(in) * gamma).
-			   Lots of scalar ops, unfortunately (better solution?) -
-			   without gamma that's 3 inst, with gamma it's 10...
-			   could use different gamma factors per channel,
-			   if that's of any use. */
-		}
-
-		BEGIN_ACCEL(needgamma ? 28 + 33 : 33);
-		/* 2 components: same 2 for tex0/1/2 */
-		OUT_ACCEL_REG(R300_RS_COUNT,
-			  ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) |
-			   R300_RS_COUNT_HIRES_EN));
-		/* R300_INST_COUNT_RS - highest RS instruction used */
-		OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0) | R300_TX_OFFSET_RS(6));
-
-		OUT_ACCEL_REG(R300_US_PIXSIZE, 2); /* highest temp used */
-
-		/* Indirection levels */
-		OUT_ACCEL_REG(R300_US_CONFIG, ((0 << R300_NLEVEL_SHIFT) |
-							R300_FIRST_TEX));
-
-		OUT_ACCEL_REG(R300_US_CODE_OFFSET, (R300_ALU_CODE_OFFSET(0) |
-						   R300_ALU_CODE_SIZE(needgamma ? 7 + 3 : 3) |
-						   R300_TEX_CODE_OFFSET(0) |
-						   R300_TEX_CODE_SIZE(3)));
-
-		OUT_ACCEL_REG(R300_US_CODE_ADDR_3, (R300_ALU_START(0) |
-						   R300_ALU_SIZE(needgamma ? 7 + 2 : 2) |
-						   R300_TEX_START(0) |
-						   R300_TEX_SIZE(2) |
-						   R300_RGBA_OUT));
-
-		/* tex inst */
-		OUT_ACCEL_REG(R300_US_TEX_INST_0, (R300_TEX_SRC_ADDR(0) |
-						  R300_TEX_DST_ADDR(0) |
-						  R300_TEX_ID(0) |
-						  R300_TEX_INST(R300_TEX_INST_LD)));
-		OUT_ACCEL_REG(R300_US_TEX_INST_1, (R300_TEX_SRC_ADDR(0) |
-						  R300_TEX_DST_ADDR(1) |
-						  R300_TEX_ID(1) |
-						  R300_TEX_INST(R300_TEX_INST_LD)));
-		OUT_ACCEL_REG(R300_US_TEX_INST_2, (R300_TEX_SRC_ADDR(0) |
-						  R300_TEX_DST_ADDR(2) |
-						  R300_TEX_ID(2) |
-						  R300_TEX_INST(R300_TEX_INST_LD)));
-
-		/* ALU inst */
-		/* MAD temp0, const0.a, temp0, const0.rgb */
-		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(0), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(0)) |
-						   R300_ALU_RGB_ADDR1(0) |
-						   R300_ALU_RGB_ADDR2(0) |
-						   R300_ALU_RGB_ADDRD(0) |
-						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
-		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(0), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_AAA) |
-						   R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
-						   R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
-						   R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
-						   R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) |
-						   R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
-						   R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
-						   R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
-		/* alpha nop, but need to set up alpha source for rgb usage */
-		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(0), (R300_ALU_ALPHA_ADDR0(R300_ALU_ALPHA_CONST(0)) |
-						   R300_ALU_ALPHA_ADDR1(0) |
-						   R300_ALU_ALPHA_ADDR2(0) |
-						   R300_ALU_ALPHA_ADDRD(0) |
-						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
-		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(0), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
-						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
-						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
-						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
-
-		/* MAD const1, temp1, temp0 */
-		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(1), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(1)) |
-						   R300_ALU_RGB_ADDR1(1) |
-						   R300_ALU_RGB_ADDR2(0) |
-						   R300_ALU_RGB_ADDRD(0) |
-						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
-		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(1), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
-						   R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
-						   R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
-						   R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
-						   R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB) |
-						   R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
-						   R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
-						   R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
-		/* alpha nop */
-		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(1), (R300_ALU_ALPHA_ADDRD(0) |
-						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
-		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(1), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
-						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
-						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
-						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
-
-		/* MAD result, const2, temp2, temp0 */
-		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(2), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(2)) |
-						   R300_ALU_RGB_ADDR1(2) |
-						   R300_ALU_RGB_ADDR2(0) |
-						   R300_ALU_RGB_ADDRD(0) |
-						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB) |
-						   (needgamma ? 0 : R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_RGB))));
-		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(2), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
-						   R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
-						   R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
-						   R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
-						   R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB) |
-						   R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
-						   R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
-						   R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE) |
-						   R300_ALU_RGB_CLAMP));
-		/* write alpha 1 */
-		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(2), (R300_ALU_ALPHA_ADDRD(0) |
-						   R300_ALU_ALPHA_OMASK(R300_ALU_ALPHA_MASK_A) |
-						   R300_ALU_ALPHA_TARGET_A));
-		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(2), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
-						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
-						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
-						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_1_0)));
-
-		if (needgamma) {
-		    /* rgb temp0.r = op_sop, set up src0 reg */
-		    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(3), (R300_ALU_RGB_ADDR0(0) |
-						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R)));
-		    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(3),
-						   R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
-						   R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
-		    /* alpha lg2 temp0, temp0.r */
-		    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(3), (R300_ALU_ALPHA_ADDRD(0) |
-						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
-		    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(3), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_LN2) |
-						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_R) |
-						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
-						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
-
-		    /* rgb temp0.g = op_sop, set up src0 reg */
-		    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(4), (R300_ALU_RGB_ADDR0(0) |
-						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_G)));
-		    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(4),
-						   R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
-						   R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
-		    /* alpha lg2 temp0, temp0.g */
-		    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(4), (R300_ALU_ALPHA_ADDRD(0) |
-						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
-		    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(4), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_LN2) |
-						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_G) |
-						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
-						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+	/* Shader constants. */
+	OUT_ACCEL_REG(R300_US_ALU_CONST_R(0), F_TO_24(1.0/(float)pPriv->w));
+	OUT_ACCEL_REG(R300_US_ALU_CONST_G(0), 0);
+	OUT_ACCEL_REG(R300_US_ALU_CONST_B(0), 0);
+	OUT_ACCEL_REG(R300_US_ALU_CONST_A(0), 0);
 
-		    /* rgb temp0.b = op_sop, set up src0 reg */
-		    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(5), (R300_ALU_RGB_ADDR0(0) |
-						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_B)));
-		    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(5),
-						   R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
-						   R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
-		    /* alpha lg2 temp0, temp0.b */
-		    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(5), (R300_ALU_ALPHA_ADDRD(0) |
-						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
-		    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(5), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_LN2) |
-						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_B) |
-						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
-						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+	OUT_ACCEL_REG(R300_US_ALU_CONST_R(1), 0);
+	OUT_ACCEL_REG(R300_US_ALU_CONST_G(1), F_TO_24(1.0/(float)pPriv->h));
+	OUT_ACCEL_REG(R300_US_ALU_CONST_B(1), 0);
+	OUT_ACCEL_REG(R300_US_ALU_CONST_A(1), 0);
 
-		    /* MUL const1, temp1, temp0 */
-		    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(6), (R300_ALU_RGB_ADDR0(0) |
-						   R300_ALU_RGB_ADDR1(0) |
-						   R300_ALU_RGB_ADDR2(0) |
-						   R300_ALU_RGB_ADDRD(0) |
-						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
-		    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(6), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
-						   R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
-						   R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC0_AAA) |
-						   R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
-						   R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0) |
-						   R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
-						   R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
-						   R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
-		    /* alpha nop, but set up const1 */
-		    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(6), (R300_ALU_ALPHA_ADDRD(0) |
-						   R300_ALU_ALPHA_ADDR0(R300_ALU_ALPHA_CONST(1)) |
-						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
-		    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(6), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
-						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
-						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
-						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
-
-		    /* rgb out0.r = op_sop, set up src0 reg */
-		    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(7), (R300_ALU_RGB_ADDR0(0) |
-						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R) |
-						   R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_R)));
-		    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(7), 
-						   R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
-						   R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
-		    /* alpha ex2 temp0, temp0.r */
-		    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(7), (R300_ALU_ALPHA_ADDRD(0) |
-						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
-		    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(7), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_EX2) |
-						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_R) |
-						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
-						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
-
-		    /* rgb out0.g = op_sop, set up src0 reg */
-		    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(8), (R300_ALU_RGB_ADDR0(0) |
-						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_G) |
-						   R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_G)));
-		    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(8),
-						   R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
-						   R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
-		    /* alpha ex2 temp0, temp0.g */
-		    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(8), (R300_ALU_ALPHA_ADDRD(0) |
-						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
-		    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(8), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_EX2) |
-						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_G) |
-						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
-						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
-
-		    /* rgb out0.b = op_sop, set up src0 reg */
-		    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(9), (R300_ALU_RGB_ADDR0(0) |
-						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_B) |
-						   R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_B)));
-		    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(9),
-						   R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
-						   R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
-		    /* alpha ex2 temp0, temp0.b */
-		    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(9), (R300_ALU_ALPHA_ADDRD(0) |
-						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
-		    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(9), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_EX2) |
-						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_B) |
-						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
-						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
-		}
-
-		/* Shader constants. */
-		/* constant 0: off, yco */
-		OUT_ACCEL_REG(R300_US_ALU_CONST_R(0), F_TO_24(off[0]));
-		OUT_ACCEL_REG(R300_US_ALU_CONST_G(0), F_TO_24(off[1]));
-		OUT_ACCEL_REG(R300_US_ALU_CONST_B(0), F_TO_24(off[2]));
-		OUT_ACCEL_REG(R300_US_ALU_CONST_A(0), F_TO_24(yco));
-		/* constant 1: uco */
-		OUT_ACCEL_REG(R300_US_ALU_CONST_R(1), F_TO_24(uco[0]));
-		OUT_ACCEL_REG(R300_US_ALU_CONST_G(1), F_TO_24(uco[1]));
-		OUT_ACCEL_REG(R300_US_ALU_CONST_B(1), F_TO_24(uco[2]));
-		OUT_ACCEL_REG(R300_US_ALU_CONST_A(1), F_TO_24(gamma));
-		/* constant 2: vco */
-		OUT_ACCEL_REG(R300_US_ALU_CONST_R(2), F_TO_24(vco[0]));
-		OUT_ACCEL_REG(R300_US_ALU_CONST_G(2), F_TO_24(vco[1]));
-		OUT_ACCEL_REG(R300_US_ALU_CONST_B(2), F_TO_24(vco[2]));
-		OUT_ACCEL_REG(R300_US_ALU_CONST_A(2), F_TO_24(0.0));
-
-		FINISH_ACCEL();
-
-	    } else {
-		BEGIN_ACCEL(11);
-		/* 2 components: 2 for tex0 */
-		OUT_ACCEL_REG(R300_RS_COUNT,
-			  ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) |
-			   R300_RS_COUNT_HIRES_EN));
-		/* R300_INST_COUNT_RS - highest RS instruction used */
-		OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0) | R300_TX_OFFSET_RS(6));
-
-		OUT_ACCEL_REG(R300_US_PIXSIZE, 0); /* highest temp used */
-
-		/* Indirection levels */
-		OUT_ACCEL_REG(R300_US_CONFIG, ((0 << R300_NLEVEL_SHIFT) |
-							R300_FIRST_TEX));
-
-		OUT_ACCEL_REG(R300_US_CODE_OFFSET, (R300_ALU_CODE_OFFSET(0) |
-						   R300_ALU_CODE_SIZE(1) |
-						   R300_TEX_CODE_OFFSET(0) |
-						   R300_TEX_CODE_SIZE(1)));
-
-		OUT_ACCEL_REG(R300_US_CODE_ADDR_3, (R300_ALU_START(0) |
-						   R300_ALU_SIZE(0) |
-						   R300_TEX_START(0) |
-						   R300_TEX_SIZE(0) |
-						   R300_RGBA_OUT));
-
-		/* tex inst */
-		OUT_ACCEL_REG(R300_US_TEX_INST_0, (R300_TEX_SRC_ADDR(0) |
-						  R300_TEX_DST_ADDR(0) |
-						  R300_TEX_ID(0) |
-						  R300_TEX_INST(R300_TEX_INST_LD)));
-
-		/* ALU inst */
-		/* RGB */
-		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR_0, (R300_ALU_RGB_ADDR0(0) |
-						   R300_ALU_RGB_ADDR1(0) |
-						   R300_ALU_RGB_ADDR2(0) |
-						   R300_ALU_RGB_ADDRD(0) |
-						   R300_ALU_RGB_OMASK((R300_ALU_RGB_MASK_R |
-						   R300_ALU_RGB_MASK_G |
-						   R300_ALU_RGB_MASK_B)) |
-						   R300_ALU_RGB_TARGET_A));
-		OUT_ACCEL_REG(R300_US_ALU_RGB_INST_0, (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
-						   R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
-						   R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
-						   R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
-						   R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0) |
-						   R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
-						   R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
-						   R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE) |
-						   R300_ALU_RGB_CLAMP));
-		/* Alpha */
-		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR_0, (R300_ALU_ALPHA_ADDR0(0) |
-						   R300_ALU_ALPHA_ADDR1(0) |
-						   R300_ALU_ALPHA_ADDR2(0) |
-						   R300_ALU_ALPHA_ADDRD(0) |
-						   R300_ALU_ALPHA_OMASK(R300_ALU_ALPHA_MASK_A) |
-						   R300_ALU_ALPHA_TARGET_A |
-						   R300_ALU_ALPHA_OMASK_W(R300_ALU_ALPHA_MASK_NONE)));
-		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST_0, (R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_A) |
-						   R300_ALU_ALPHA_MOD_A(R300_ALU_ALPHA_MOD_NOP) |
-						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_1_0) |
-						   R300_ALU_ALPHA_MOD_B(R300_ALU_ALPHA_MOD_NOP) |
-						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0) |
-						   R300_ALU_ALPHA_MOD_C(R300_ALU_ALPHA_MOD_NOP) |
-						   R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
-						   R300_ALU_ALPHA_OMOD(R300_ALU_ALPHA_OMOD_NONE) |
-						   R300_ALU_ALPHA_CLAMP));
-		FINISH_ACCEL();
-	    }
-	} else {
-	    if (pPriv->bicubic_enabled) {
-		BEGIN_ACCEL(7);
-
-		/* 4 components: 2 for tex0 and 2 for tex1 */
-		OUT_ACCEL_REG(R300_RS_COUNT,
-			      ((4 << R300_RS_COUNT_IT_COUNT_SHIFT) |
-			       R300_RS_COUNT_HIRES_EN));
-
-		/* R300_INST_COUNT_RS - highest RS instruction used */
-		OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(1) | R300_TX_OFFSET_RS(6));
-
-		/* Pixel stack frame size. */
-		OUT_ACCEL_REG(R300_US_PIXSIZE, 5);
-
-		/* FP length. */
-		OUT_ACCEL_REG(R500_US_CODE_ADDR, (R500_US_CODE_START_ADDR(0) |
-						  R500_US_CODE_END_ADDR(13)));
-		OUT_ACCEL_REG(R500_US_CODE_RANGE, (R500_US_CODE_RANGE_ADDR(0) |
-						   R500_US_CODE_RANGE_SIZE(13)));
-
-		/* Prepare for FP emission. */
-		OUT_ACCEL_REG(R500_US_CODE_OFFSET, 0);
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_INST_INDEX(0));
-		FINISH_ACCEL();
-
-		BEGIN_ACCEL(89);
-		/* Pixel shader.
-		 * I've gone ahead and annotated each instruction, since this
-		 * thing is MASSIVE. :3
-		 * Note: In order to avoid buggies with temps and multiple
-		 * inputs, all temps are offset by 2. temp0 -> register2. */
-
-		/* TEX temp2, input1.xxxx, tex1, 1D */
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
-						       R500_INST_RGB_WMASK_R |
-						       R500_INST_RGB_WMASK_G |
-						       R500_INST_RGB_WMASK_B));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(1) |
-						       R500_TEX_INST_LD |
-						       R500_TEX_IGNORE_UNCOVERED));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(1) |
-						       R500_TEX_SRC_S_SWIZ_R |
-						       R500_TEX_SRC_T_SWIZ_R |
-						       R500_TEX_SRC_R_SWIZ_R |
-						       R500_TEX_SRC_Q_SWIZ_R |
-						       R500_TEX_DST_ADDR(2) |
-						       R500_TEX_DST_R_SWIZ_R |
-						       R500_TEX_DST_G_SWIZ_G |
-						       R500_TEX_DST_B_SWIZ_B |
-						       R500_TEX_DST_A_SWIZ_A));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
-
-		/* TEX temp5, input1.yyyy, tex1, 1D */
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
-						       R500_INST_TEX_SEM_WAIT |
-						       R500_INST_RGB_WMASK_R |
-						       R500_INST_RGB_WMASK_G |
-						       R500_INST_RGB_WMASK_B));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(1) |
-						       R500_TEX_INST_LD |
-						       R500_TEX_SEM_ACQUIRE |
-						       R500_TEX_IGNORE_UNCOVERED));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(1) |
-						       R500_TEX_SRC_S_SWIZ_G |
-						       R500_TEX_SRC_T_SWIZ_G |
-						       R500_TEX_SRC_R_SWIZ_G |
-						       R500_TEX_SRC_Q_SWIZ_G |
-						       R500_TEX_DST_ADDR(5) |
-						       R500_TEX_DST_R_SWIZ_R |
-						       R500_TEX_DST_G_SWIZ_G |
-						       R500_TEX_DST_B_SWIZ_B |
-						       R500_TEX_DST_A_SWIZ_A));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
-
-		/* MUL temp4, const0.x0x0, temp2.yyxx */
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
-						       R500_INST_TEX_SEM_WAIT |
-						       R500_INST_RGB_WMASK_R |
-						       R500_INST_RGB_WMASK_G |
-						       R500_INST_RGB_WMASK_B |
-						       R500_INST_ALPHA_WMASK));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
-						       R500_RGB_ADDR0_CONST |
-						       R500_RGB_ADDR1(2)));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
-						       R500_ALPHA_ADDR0_CONST |
-						       R500_ALPHA_ADDR1(2)));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
-						       R500_ALU_RGB_R_SWIZ_A_R |
-						       R500_ALU_RGB_G_SWIZ_A_0 |
-						       R500_ALU_RGB_B_SWIZ_A_R |
-						       R500_ALU_RGB_SEL_B_SRC1 |
-						       R500_ALU_RGB_R_SWIZ_B_G |
-						       R500_ALU_RGB_G_SWIZ_B_G |
-						       R500_ALU_RGB_B_SWIZ_B_R));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(4) |
-						       R500_ALPHA_OP_MAD |
-						       R500_ALPHA_SEL_A_SRC0 |
-						       R500_ALPHA_SWIZ_A_0 |
-						       R500_ALPHA_SEL_B_SRC1 |
-						       R500_ALPHA_SWIZ_B_R));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(4) |
-						       R500_ALU_RGBA_OP_MAD |
-						       R500_ALU_RGBA_R_SWIZ_0 |
-						       R500_ALU_RGBA_G_SWIZ_0 |
-						       R500_ALU_RGBA_B_SWIZ_0 |
-						       R500_ALU_RGBA_A_SWIZ_0));
-
-		/* MAD temp3, const0.0y0y, temp5.xxxx, temp4 */
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
-						       R500_INST_RGB_WMASK_R |
-						       R500_INST_RGB_WMASK_G |
-						       R500_INST_RGB_WMASK_B |
-						       R500_INST_ALPHA_WMASK));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
-						       R500_RGB_ADDR0_CONST |
-						       R500_RGB_ADDR1(5) |
-						       R500_RGB_ADDR2(4)));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
-						       R500_ALPHA_ADDR0_CONST |
-						       R500_ALPHA_ADDR1(5) |
-						       R500_ALPHA_ADDR2(4)));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
-						       R500_ALU_RGB_R_SWIZ_A_0 |
-						       R500_ALU_RGB_G_SWIZ_A_G |
-						       R500_ALU_RGB_B_SWIZ_A_0 |
-						       R500_ALU_RGB_SEL_B_SRC1 |
-						       R500_ALU_RGB_R_SWIZ_B_R |
-						       R500_ALU_RGB_G_SWIZ_B_R |
-						       R500_ALU_RGB_B_SWIZ_B_R));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(3) |
-						       R500_ALPHA_OP_MAD |
-						       R500_ALPHA_SEL_A_SRC0 |
-						       R500_ALPHA_SWIZ_A_G |
-						       R500_ALPHA_SEL_B_SRC1 |
-						       R500_ALPHA_SWIZ_B_R));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(3) |
-						       R500_ALU_RGBA_OP_MAD |
-						       R500_ALU_RGBA_SEL_C_SRC2 |
-						       R500_ALU_RGBA_R_SWIZ_R |
-						       R500_ALU_RGBA_G_SWIZ_G |
-						       R500_ALU_RGBA_B_SWIZ_B |
-						       R500_ALU_RGBA_A_SWIZ_A));
-
-		/* ADD temp3, temp3, input0.xyxy */
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
-						       R500_INST_RGB_WMASK_R |
-						       R500_INST_RGB_WMASK_G |
-						       R500_INST_RGB_WMASK_B |
-						       R500_INST_ALPHA_WMASK));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR1(3) |
-						       R500_RGB_ADDR2(0)));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR1(3) |
-						       R500_ALPHA_ADDR2(0)));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_R_SWIZ_A_1 |
-						       R500_ALU_RGB_G_SWIZ_A_1 |
-						       R500_ALU_RGB_B_SWIZ_A_1 |
-						       R500_ALU_RGB_SEL_B_SRC1 |
-						       R500_ALU_RGB_R_SWIZ_B_R |
-						       R500_ALU_RGB_G_SWIZ_B_G |
-						       R500_ALU_RGB_B_SWIZ_B_B));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(3) |
-						       R500_ALPHA_OP_MAD |
-						       R500_ALPHA_SWIZ_A_1 |
-						       R500_ALPHA_SEL_B_SRC1 |
-						       R500_ALPHA_SWIZ_B_A));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(3) |
-						       R500_ALU_RGBA_OP_MAD |
-						       R500_ALU_RGBA_SEL_C_SRC2 |
-						       R500_ALU_RGBA_R_SWIZ_R |
-						       R500_ALU_RGBA_G_SWIZ_G |
-						       R500_ALU_RGBA_B_SWIZ_R |
-						       R500_ALU_RGBA_A_SWIZ_G));
-
-		/* TEX temp1, temp3.zwxy, tex0, 2D */
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
-						       R500_INST_RGB_WMASK_R |
-						       R500_INST_RGB_WMASK_G |
-						       R500_INST_RGB_WMASK_B |
-						       R500_INST_ALPHA_WMASK));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
-						       R500_TEX_INST_LD |
-						       R500_TEX_IGNORE_UNCOVERED));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(3) |
-						       R500_TEX_SRC_S_SWIZ_B |
-						       R500_TEX_SRC_T_SWIZ_A |
-						       R500_TEX_SRC_R_SWIZ_R |
-						       R500_TEX_SRC_Q_SWIZ_G |
-						       R500_TEX_DST_ADDR(1) |
-						       R500_TEX_DST_R_SWIZ_R |
-						       R500_TEX_DST_G_SWIZ_G |
-						       R500_TEX_DST_B_SWIZ_B |
-						       R500_TEX_DST_A_SWIZ_A));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
-
-		/* TEX temp3, temp3.xyzw, tex0, 2D */
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
-						       R500_INST_TEX_SEM_WAIT |
-						       R500_INST_RGB_WMASK_R |
-						       R500_INST_RGB_WMASK_G |
-						       R500_INST_RGB_WMASK_B |
-						       R500_INST_ALPHA_WMASK));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
-						       R500_TEX_INST_LD |
-						       R500_TEX_SEM_ACQUIRE |
-						       R500_TEX_IGNORE_UNCOVERED));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(3) |
-						       R500_TEX_SRC_S_SWIZ_R |
-						       R500_TEX_SRC_T_SWIZ_G |
-						       R500_TEX_SRC_R_SWIZ_B |
-						       R500_TEX_SRC_Q_SWIZ_A |
-						       R500_TEX_DST_ADDR(3) |
-						       R500_TEX_DST_R_SWIZ_R |
-						       R500_TEX_DST_G_SWIZ_G |
-						       R500_TEX_DST_B_SWIZ_B |
-						       R500_TEX_DST_A_SWIZ_A));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
-
-		/* MAD temp4, const0.0y0y, temp5.yyyy, temp4 */
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
-						       R500_INST_RGB_WMASK_R |
-						       R500_INST_RGB_WMASK_G |
-						       R500_INST_RGB_WMASK_B |
-						       R500_INST_ALPHA_WMASK));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
-						       R500_RGB_ADDR0_CONST |
-						       R500_RGB_ADDR1(5) |
-						       R500_RGB_ADDR2(4)));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
-						       R500_ALPHA_ADDR0_CONST |
-						       R500_ALPHA_ADDR1(5) |
-						       R500_ALPHA_ADDR2(4)));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
-						       R500_ALU_RGB_R_SWIZ_A_0 |
-						       R500_ALU_RGB_G_SWIZ_A_G |
-						       R500_ALU_RGB_B_SWIZ_A_0 |
-						       R500_ALU_RGB_SEL_B_SRC1 |
-						       R500_ALU_RGB_R_SWIZ_B_G |
-						       R500_ALU_RGB_G_SWIZ_B_G |
-						       R500_ALU_RGB_B_SWIZ_B_G));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(4) |
-						       R500_ALPHA_OP_MAD |
-						       R500_ALPHA_SEL_A_SRC0 |
-						       R500_ALPHA_SWIZ_A_G |
-						       R500_ALPHA_SEL_B_SRC1 |
-						       R500_ALPHA_SWIZ_B_G));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(4) |
-						       R500_ALU_RGBA_OP_MAD |
-						       R500_ALU_RGBA_SEL_C_SRC2 |
-						       R500_ALU_RGBA_R_SWIZ_R |
-						       R500_ALU_RGBA_G_SWIZ_G |
-						       R500_ALU_RGBA_B_SWIZ_B |
-						       R500_ALU_RGBA_A_SWIZ_A));
-
-		/* ADD temp0, temp4, input0.xyxy */
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
-						       R500_INST_RGB_WMASK_R |
-						       R500_INST_RGB_WMASK_G |
-						       R500_INST_RGB_WMASK_B |
-						       R500_INST_ALPHA_WMASK));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR1(4) |
-						       R500_RGB_ADDR2(0)));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR1(4) |
-						       R500_ALPHA_ADDR2(0)));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_R_SWIZ_A_1 |
-						       R500_ALU_RGB_G_SWIZ_A_1 |
-						       R500_ALU_RGB_B_SWIZ_A_1 |
-						       R500_ALU_RGB_SEL_B_SRC1 |
-						       R500_ALU_RGB_R_SWIZ_B_R |
-						       R500_ALU_RGB_G_SWIZ_B_G |
-						       R500_ALU_RGB_B_SWIZ_B_B));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(0) |
-						       R500_ALPHA_OP_MAD |
-						       R500_ALPHA_SWIZ_A_1 |
-						       R500_ALPHA_SEL_B_SRC1 |
-						       R500_ALPHA_SWIZ_B_A));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(0) |
-						       R500_ALU_RGBA_OP_MAD |
-						       R500_ALU_RGBA_SEL_C_SRC2 |
-						       R500_ALU_RGBA_R_SWIZ_R |
-						       R500_ALU_RGBA_G_SWIZ_G |
-						       R500_ALU_RGBA_B_SWIZ_R |
-						       R500_ALU_RGBA_A_SWIZ_G));
-
-		/* TEX temp4, temp0.zwzw, tex0, 2D */
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
-						       R500_INST_TEX_SEM_WAIT |
-						       R500_INST_RGB_WMASK_R |
-						       R500_INST_RGB_WMASK_G |
-						       R500_INST_RGB_WMASK_B |
-						       R500_INST_ALPHA_WMASK));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
-						       R500_TEX_INST_LD |
-						       R500_TEX_IGNORE_UNCOVERED));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
-						       R500_TEX_SRC_S_SWIZ_B |
-						       R500_TEX_SRC_T_SWIZ_A |
-						       R500_TEX_SRC_R_SWIZ_B |
-						       R500_TEX_SRC_Q_SWIZ_A |
-						       R500_TEX_DST_ADDR(4) |
-						       R500_TEX_DST_R_SWIZ_R |
-						       R500_TEX_DST_G_SWIZ_G |
-						       R500_TEX_DST_B_SWIZ_B |
-						       R500_TEX_DST_A_SWIZ_A));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
-
-		/* TEX temp0, temp0.xyzw, tex0, 2D */
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
-						       R500_INST_TEX_SEM_WAIT |
-						       R500_INST_RGB_WMASK_R |
-						       R500_INST_RGB_WMASK_G |
-						       R500_INST_RGB_WMASK_B |
-						   R500_INST_ALPHA_WMASK));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
-						       R500_TEX_INST_LD |
-						       R500_TEX_SEM_ACQUIRE |
-						       R500_TEX_IGNORE_UNCOVERED));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
-						       R500_TEX_SRC_S_SWIZ_R |
-						       R500_TEX_SRC_T_SWIZ_G |
-						       R500_TEX_SRC_R_SWIZ_B |
-						       R500_TEX_SRC_Q_SWIZ_A |
-						       R500_TEX_DST_ADDR(0) |
-						       R500_TEX_DST_R_SWIZ_R |
-						       R500_TEX_DST_G_SWIZ_G |
-						       R500_TEX_DST_B_SWIZ_B |
-						       R500_TEX_DST_A_SWIZ_A));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
-
-		/* LRP temp3, temp2.zzzz, temp1, temp3 ->
-		 * - PRESUB temps, temp1 - temp3
-		 * - MAD temp2.zzzz, temps, temp3 */
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
-						       R500_INST_RGB_WMASK_R |
-						       R500_INST_RGB_WMASK_G |
-						       R500_INST_RGB_WMASK_B |
-						       R500_INST_ALPHA_WMASK));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(3) |
-						       R500_RGB_SRCP_OP_RGB1_MINUS_RGB0 |
-						       R500_RGB_ADDR1(1) |
-						       R500_RGB_ADDR2(2)));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(3) |
-						       R500_ALPHA_SRCP_OP_A1_MINUS_A0 |
-						       R500_ALPHA_ADDR1(1) |
-						       R500_ALPHA_ADDR2(2)));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC2 |
-						       R500_ALU_RGB_R_SWIZ_A_B |
-						       R500_ALU_RGB_G_SWIZ_A_B |
-						       R500_ALU_RGB_B_SWIZ_A_B |
-						       R500_ALU_RGB_SEL_B_SRCP |
-						       R500_ALU_RGB_R_SWIZ_B_R |
-						       R500_ALU_RGB_G_SWIZ_B_G |
-						       R500_ALU_RGB_B_SWIZ_B_B));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(3) |
-						       R500_ALPHA_OP_MAD |
-						       R500_ALPHA_SEL_A_SRC2 |
-						       R500_ALPHA_SWIZ_A_B |
-						       R500_ALPHA_SEL_B_SRCP |
-						       R500_ALPHA_SWIZ_B_A));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(3) |
-						       R500_ALU_RGBA_OP_MAD |
-						       R500_ALU_RGBA_SEL_C_SRC0 |
-						       R500_ALU_RGBA_R_SWIZ_R |
-						       R500_ALU_RGBA_G_SWIZ_G |
-						       R500_ALU_RGBA_B_SWIZ_B |
-						       R500_ALU_RGBA_A_SWIZ_A));
-
-		/* LRP temp0, temp2.zzzz, temp4, temp0 ->
-		 * - PRESUB temps, temp4 - temp1
-		 * - MAD temp2.zzzz, temps, temp0 */
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
-						       R500_INST_TEX_SEM_WAIT |
-						       R500_INST_RGB_WMASK_R |
-						       R500_INST_RGB_WMASK_G |
-						       R500_INST_RGB_WMASK_B |
-						       R500_INST_ALPHA_WMASK));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
-						       R500_RGB_SRCP_OP_RGB1_MINUS_RGB0 |
-						       R500_RGB_ADDR1(4) |
-						       R500_RGB_ADDR2(2)));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
-						       R500_ALPHA_SRCP_OP_A1_MINUS_A0 |
-						       R500_ALPHA_ADDR1(4) |
-						       R500_ALPHA_ADDR2(2)));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC2 |
-						       R500_ALU_RGB_R_SWIZ_A_B |
-						       R500_ALU_RGB_G_SWIZ_A_B |
-						       R500_ALU_RGB_B_SWIZ_A_B |
-						       R500_ALU_RGB_SEL_B_SRCP |
-						       R500_ALU_RGB_R_SWIZ_B_R |
-						       R500_ALU_RGB_G_SWIZ_B_G |
-						       R500_ALU_RGB_B_SWIZ_B_B));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(0) |
-						       R500_ALPHA_OP_MAD |
-						       R500_ALPHA_SEL_A_SRC2 |
-						       R500_ALPHA_SWIZ_A_B |
-						       R500_ALPHA_SEL_B_SRCP |
-						       R500_ALPHA_SWIZ_B_A));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(0) |
-						       R500_ALU_RGBA_OP_MAD |
-						       R500_ALU_RGBA_SEL_C_SRC0 |
-						       R500_ALU_RGBA_R_SWIZ_R |
-						       R500_ALU_RGBA_G_SWIZ_G |
-						       R500_ALU_RGBA_B_SWIZ_B |
-						       R500_ALU_RGBA_A_SWIZ_A));
-
-		/* LRP output, temp5.zzzz, temp3, temp0 ->
-		 * - PRESUB temps, temp3 - temp0
-		 * - MAD temp5.zzzz, temps, temp0 */
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_OUT |
-						       R500_INST_LAST |
-						       R500_INST_TEX_SEM_WAIT |
-						       R500_INST_RGB_WMASK_R |
-						       R500_INST_RGB_WMASK_G |
-						       R500_INST_RGB_WMASK_B |
-						       R500_INST_ALPHA_WMASK |
-						       R500_INST_RGB_OMASK_R |
-						       R500_INST_RGB_OMASK_G |
-						       R500_INST_RGB_OMASK_B |
-						       R500_INST_ALPHA_OMASK));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
-						       R500_RGB_SRCP_OP_RGB1_MINUS_RGB0 |
-						       R500_RGB_ADDR1(3) |
-						       R500_RGB_ADDR2(5)));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
-						       R500_ALPHA_SRCP_OP_A1_MINUS_A0 |
-						       R500_ALPHA_ADDR1(3) |
-						       R500_ALPHA_ADDR2(5)));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC2 |
-						       R500_ALU_RGB_R_SWIZ_A_B |
-						       R500_ALU_RGB_G_SWIZ_A_B |
-						       R500_ALU_RGB_B_SWIZ_A_B |
-						       R500_ALU_RGB_SEL_B_SRCP |
-						       R500_ALU_RGB_R_SWIZ_B_R |
-						       R500_ALU_RGB_G_SWIZ_B_G |
-						       R500_ALU_RGB_B_SWIZ_B_B));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(0) |
-						       R500_ALPHA_OP_MAD |
-						       R500_ALPHA_SEL_A_SRC2 |
-						       R500_ALPHA_SWIZ_A_B |
-						       R500_ALPHA_SEL_B_SRCP |
-						       R500_ALPHA_SWIZ_B_A));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(0) |
-						       R500_ALU_RGBA_OP_MAD |
-						       R500_ALU_RGBA_SEL_C_SRC0 |
-						       R500_ALU_RGBA_R_SWIZ_R |
-						       R500_ALU_RGBA_G_SWIZ_G |
-						       R500_ALU_RGBA_B_SWIZ_B |
-						       R500_ALU_RGBA_A_SWIZ_A));
-
-		/* Shader constants. */
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_CONST_INDEX(0));
-
-		/* const0 = {1 / texture[0].width, 1 / texture[0].height, 0, 0} */
-		OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, (1.0/(float)pPriv->w));
-		OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, (1.0/(float)pPriv->h));
-		OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, 0x0);
-		OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, 0x0);
-
-		FINISH_ACCEL();
-
-	    } else {
-		BEGIN_ACCEL(19);
-		/* 2 components: 2 for tex0 */
-		OUT_ACCEL_REG(R300_RS_COUNT,
-			      ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) |
-			       R300_RS_COUNT_HIRES_EN));
-
-		/* R300_INST_COUNT_RS - highest RS instruction used */
-		OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0) | R300_TX_OFFSET_RS(6));
-
-		/* Pixel stack frame size. */
-		OUT_ACCEL_REG(R300_US_PIXSIZE, 0); /* highest temp used */
-
-		/* FP length. */
-		OUT_ACCEL_REG(R500_US_CODE_ADDR, (R500_US_CODE_START_ADDR(0) |
-						  R500_US_CODE_END_ADDR(1)));
-		OUT_ACCEL_REG(R500_US_CODE_RANGE, (R500_US_CODE_RANGE_ADDR(0) |
-						   R500_US_CODE_RANGE_SIZE(1)));
-
-		/* Prepare for FP emission. */
-		OUT_ACCEL_REG(R500_US_CODE_OFFSET, 0);
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_INST_INDEX(0));
-
-		/* tex inst */
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
-						       R500_INST_TEX_SEM_WAIT |
-						       R500_INST_RGB_WMASK_R |
-						       R500_INST_RGB_WMASK_G |
-						       R500_INST_RGB_WMASK_B |
-						       R500_INST_ALPHA_WMASK |
-						       R500_INST_RGB_CLAMP |
-						       R500_INST_ALPHA_CLAMP));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
-						       R500_TEX_INST_LD |
-						       R500_TEX_SEM_ACQUIRE |
-						       R500_TEX_IGNORE_UNCOVERED));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
-						       R500_TEX_SRC_S_SWIZ_R |
-						       R500_TEX_SRC_T_SWIZ_G |
-						       R500_TEX_DST_ADDR(0) |
-						       R500_TEX_DST_R_SWIZ_R |
-						       R500_TEX_DST_G_SWIZ_G |
-						       R500_TEX_DST_B_SWIZ_B |
-						       R500_TEX_DST_A_SWIZ_A));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_DX_ADDR(0) |
-						       R500_DX_S_SWIZ_R |
-						       R500_DX_T_SWIZ_R |
-						       R500_DX_R_SWIZ_R |
-						       R500_DX_Q_SWIZ_R |
-						       R500_DY_ADDR(0) |
-						       R500_DY_S_SWIZ_R |
-						       R500_DY_T_SWIZ_R |
-						       R500_DY_R_SWIZ_R |
-						       R500_DY_Q_SWIZ_R));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
-
-		/* ALU inst */
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_OUT |
-						       R500_INST_TEX_SEM_WAIT |
-						       R500_INST_LAST |
-						       R500_INST_RGB_OMASK_R |
-						       R500_INST_RGB_OMASK_G |
-						       R500_INST_RGB_OMASK_B |
-						       R500_INST_ALPHA_OMASK |
-						       R500_INST_RGB_CLAMP |
-						       R500_INST_ALPHA_CLAMP));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
-						       R500_RGB_ADDR1(0) |
-						       R500_RGB_ADDR1_CONST |
-						       R500_RGB_ADDR2(0) |
-						       R500_RGB_ADDR2_CONST));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
-						       R500_ALPHA_ADDR1(0) |
-						       R500_ALPHA_ADDR1_CONST |
-						       R500_ALPHA_ADDR2(0) |
-						       R500_ALPHA_ADDR2_CONST));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
-						       R500_ALU_RGB_R_SWIZ_A_R |
-						       R500_ALU_RGB_G_SWIZ_A_G |
-						       R500_ALU_RGB_B_SWIZ_A_B |
-						       R500_ALU_RGB_SEL_B_SRC0 |
-						       R500_ALU_RGB_R_SWIZ_B_1 |
-						       R500_ALU_RGB_B_SWIZ_B_1 |
-						       R500_ALU_RGB_G_SWIZ_B_1));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD |
-						       R500_ALPHA_SWIZ_A_A |
-						       R500_ALPHA_SWIZ_B_1));
-		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD |
-						       R500_ALU_RGBA_R_SWIZ_0 |
-						       R500_ALU_RGBA_G_SWIZ_0 |
-						       R500_ALU_RGBA_B_SWIZ_0 |
-						       R500_ALU_RGBA_A_SWIZ_0));
-		FINISH_ACCEL();
-	    }
-	}
-
-	BEGIN_ACCEL(6);
-	OUT_ACCEL_REG(R300_TX_INVALTAGS, 0);
-	OUT_ACCEL_REG(R300_TX_ENABLE, txenable);
-
-	OUT_ACCEL_REG(R300_RB3D_COLOROFFSET0, dst_offset);
-	OUT_ACCEL_REG(R300_RB3D_COLORPITCH0, colorpitch);
-
-	blendcntl = RADEON_SRC_BLEND_GL_ONE | RADEON_DST_BLEND_GL_ZERO;
-	/* no need to enable blending */
-	OUT_ACCEL_REG(R300_RB3D_BLENDCNTL, blendcntl);
-
-	OUT_ACCEL_REG(R300_VAP_VTX_SIZE, vtx_count);
 	FINISH_ACCEL();
-
-    } else {
-
-	/* Same for R100/R200 */
-	switch (pPixmap->drawable.bitsPerPixel) {
-	case 16:
-	    if (pPixmap->drawable.depth == 15)
-		dst_format = RADEON_COLOR_FORMAT_ARGB1555;
-	    else
-		dst_format = RADEON_COLOR_FORMAT_RGB565;
-	    break;
-	case 32:
-	    dst_format = RADEON_COLOR_FORMAT_ARGB8888;
-	    break;
-	default:
-	    return;
-	}
-
-	if (pPriv->planar_hw && (pPriv->id == FOURCC_I420 || pPriv->id == FOURCC_YV12)) {
-	    isplanar = TRUE;
+    } else if (isplanar) {
+	/*
+	 * y' = y - .0625
+	 * u' = u - .5
+	 * v' = v - .5;
+	 *
+	 * r = 1.1643 * y' + 0.0     * u' + 1.5958  * v'
+	 * g = 1.1643 * y' - 0.39173 * u' - 0.81290 * v'
+	 * b = 1.1643 * y' + 2.017   * u' + 0.0     * v'
+	 *
+	 * DP3 might look like the straightforward solution
+	 * but we'd need to move the texture yuv values in
+	 * the same reg for this to work. Therefore use MADs.
+	 * Brightness just adds to the off constant.
+	 * Contrast is multiplication of luminance.
+	 * Saturation and hue change the u and v coeffs.
+	 * Default values (before adjustments - depend on colorspace):
+	 * yco = 1.1643
+	 * uco = 0, -0.39173, 2.017
+	 * vco = 1.5958, -0.8129, 0
+	 * off = -0.0625 * yco + -0.5 * uco[r] + -0.5 * vco[r],
+	 *       -0.0625 * yco + -0.5 * uco[g] + -0.5 * vco[g],
+	 *       -0.0625 * yco + -0.5 * uco[b] + -0.5 * vco[b],
+	 *
+	 * temp = MAD(yco, yuv.yyyy, off)
+	 * temp = MAD(uco, yuv.uuuu, temp)
+	 * result = MAD(vco, yuv.vvvv, temp)
+	 */
+	/* TODO: don't recalc consts always */
+	const float Loff = -0.0627;
+	const float Coff = -0.502;
+	float uvcosf, uvsinf;
+	float yco;
+	float uco[3], vco[3], off[3];
+	float bright, cont, gamma;
+	int ref = pPriv->transform_index;
+	Bool needgamma = FALSE;
+
+	cont = RTFContrast(pPriv->contrast);
+	bright = RTFBrightness(pPriv->brightness);
+	gamma = (float)pPriv->gamma / 1000.0;
+	uvcosf = RTFSaturation(pPriv->saturation) * cos(RTFHue(pPriv->hue));
+	uvsinf = RTFSaturation(pPriv->saturation) * sin(RTFHue(pPriv->hue));
+	/* overlay video also does pre-gamma contrast/sat adjust, should we? */
+
+	yco = trans[ref].RefLuma * cont;
+	uco[0] = -trans[ref].RefRCr * uvsinf;
+	uco[1] = trans[ref].RefGCb * uvcosf - trans[ref].RefGCr * uvsinf;
+	uco[2] = trans[ref].RefBCb * uvcosf;
+	vco[0] = trans[ref].RefRCr * uvcosf;
+	vco[1] = trans[ref].RefGCb * uvsinf + trans[ref].RefGCr * uvcosf;
+	vco[2] = trans[ref].RefBCb * uvsinf;
+	off[0] = Loff * yco + Coff * (uco[0] + vco[0]) + bright;
+	off[1] = Loff * yco + Coff * (uco[1] + vco[1]) + bright;
+	off[2] = Loff * yco + Coff * (uco[2] + vco[2]) + bright;
+
+	if (gamma != 1.0) {
+	    needgamma = TRUE;
+	    /* note: gamma correction is out = in ^ gamma;
+	       gpu can only do LG2/EX2 therefore we transform into
+	       in ^ gamma = 2 ^ (log2(in) * gamma).
+	       Lots of scalar ops, unfortunately (better solution?) -
+	       without gamma that's 3 inst, with gamma it's 10...
+	       could use different gamma factors per channel,
+	       if that's of any use. */
 	}
 
-	if (isplanar) {
-	    txformat = RADEON_TXFORMAT_I8;
-	} else {
-	    if (pPriv->id == FOURCC_UYVY)
-		txformat = RADEON_TXFORMAT_YVYU422;
-	    else
-		txformat = RADEON_TXFORMAT_VYUY422;
+	BEGIN_ACCEL(needgamma ? 28 + 33 : 33);
+	/* 2 components: same 2 for tex0/1/2 */
+	OUT_ACCEL_REG(R300_RS_COUNT,
+		      ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) |
+		       R300_RS_COUNT_HIRES_EN));
+	/* R300_INST_COUNT_RS - highest RS instruction used */
+	OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0) | R300_TX_OFFSET_RS(6));
+
+	OUT_ACCEL_REG(R300_US_PIXSIZE, 2); /* highest temp used */
+
+	/* Indirection levels */
+	OUT_ACCEL_REG(R300_US_CONFIG, ((0 << R300_NLEVEL_SHIFT) |
+				       R300_FIRST_TEX));
+
+	OUT_ACCEL_REG(R300_US_CODE_OFFSET, (R300_ALU_CODE_OFFSET(0) |
+					    R300_ALU_CODE_SIZE(needgamma ? 7 + 3 : 3) |
+					    R300_TEX_CODE_OFFSET(0) |
+					    R300_TEX_CODE_SIZE(3)));
+
+	OUT_ACCEL_REG(R300_US_CODE_ADDR_3, (R300_ALU_START(0) |
+					    R300_ALU_SIZE(needgamma ? 7 + 2 : 2) |
+					    R300_TEX_START(0) |
+					    R300_TEX_SIZE(2) |
+					    R300_RGBA_OUT));
+
+	/* tex inst */
+	OUT_ACCEL_REG(R300_US_TEX_INST_0, (R300_TEX_SRC_ADDR(0) |
+					   R300_TEX_DST_ADDR(0) |
+					   R300_TEX_ID(0) |
+					   R300_TEX_INST(R300_TEX_INST_LD)));
+	OUT_ACCEL_REG(R300_US_TEX_INST_1, (R300_TEX_SRC_ADDR(0) |
+					   R300_TEX_DST_ADDR(1) |
+					   R300_TEX_ID(1) |
+					   R300_TEX_INST(R300_TEX_INST_LD)));
+	OUT_ACCEL_REG(R300_US_TEX_INST_2, (R300_TEX_SRC_ADDR(0) |
+					   R300_TEX_DST_ADDR(2) |
+					   R300_TEX_ID(2) |
+					   R300_TEX_INST(R300_TEX_INST_LD)));
+
+	/* ALU inst */
+	/* MAD temp0, const0.a, temp0, const0.rgb */
+	OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(0), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(0)) |
+						R300_ALU_RGB_ADDR1(0) |
+						R300_ALU_RGB_ADDR2(0) |
+						R300_ALU_RGB_ADDRD(0) |
+						R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
+	OUT_ACCEL_REG(R300_US_ALU_RGB_INST(0), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_AAA) |
+						R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
+						R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
+						R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
+						R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) |
+						R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
+						R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
+						R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
+	/* alpha nop, but need to set up alpha source for rgb usage */
+	OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(0), (R300_ALU_ALPHA_ADDR0(R300_ALU_ALPHA_CONST(0)) |
+						  R300_ALU_ALPHA_ADDR1(0) |
+						  R300_ALU_ALPHA_ADDR2(0) |
+						  R300_ALU_ALPHA_ADDRD(0) |
+						  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+	OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(0), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
+						  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
+						  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
+						  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+
+	/* MAD temp0, const1, temp1, temp0 */
+	OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(1), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(1)) |
+						R300_ALU_RGB_ADDR1(1) |
+						R300_ALU_RGB_ADDR2(0) |
+						R300_ALU_RGB_ADDRD(0) |
+						R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
+	OUT_ACCEL_REG(R300_US_ALU_RGB_INST(1), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
+						R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
+						R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
+						R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
+						R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB) |
+						R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
+						R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
+						R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
+	/* alpha nop */
+	OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(1), (R300_ALU_ALPHA_ADDRD(0) |
+						  R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+	OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(1), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
+						  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
+						  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
+						  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+
+	/* MAD result, const2, temp2, temp0 */
+	OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(2), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(2)) |
+						R300_ALU_RGB_ADDR1(2) |
+						R300_ALU_RGB_ADDR2(0) |
+						R300_ALU_RGB_ADDRD(0) |
+						R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB) |
+						(needgamma ? 0 : R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_RGB))));
+	OUT_ACCEL_REG(R300_US_ALU_RGB_INST(2), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
+						R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
+						R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
+						R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
+						R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB) |
+						R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
+						R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
+						R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE) |
+						R300_ALU_RGB_CLAMP));
+	/* write alpha 1 */
+	OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(2), (R300_ALU_ALPHA_ADDRD(0) |
+						  R300_ALU_ALPHA_OMASK(R300_ALU_ALPHA_MASK_A) |
+						  R300_ALU_ALPHA_TARGET_A));
+	OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(2), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
+						  R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
+						  R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
+						  R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_1_0)));
+
+	if (needgamma) {
+	    /* rgb temp0.r = op_sop, set up src0 reg */
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(3), (R300_ALU_RGB_ADDR0(0) |
+						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R)));
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(3),
+			  R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
+			  R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
+	    /* alpha lg2 temp0, temp0.r */
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(3), (R300_ALU_ALPHA_ADDRD(0) |
+						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(3), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_LN2) |
+						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_R) |
+						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
+						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+
+	    /* rgb temp0.g = op_sop, set up src0 reg */
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(4), (R300_ALU_RGB_ADDR0(0) |
+						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_G)));
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(4),
+			  R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
+			  R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
+	    /* alpha lg2 temp0, temp0.g */
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(4), (R300_ALU_ALPHA_ADDRD(0) |
+						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(4), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_LN2) |
+						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_G) |
+						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
+						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+
+	    /* rgb temp0.b = op_sop, set up src0 reg */
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(5), (R300_ALU_RGB_ADDR0(0) |
+						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_B)));
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(5),
+			  R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
+			  R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
+	    /* alpha lg2 temp0, temp0.b */
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(5), (R300_ALU_ALPHA_ADDRD(0) |
+						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(5), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_LN2) |
+						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_B) |
+						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
+						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+
+	    /* MUL temp0, temp0, const1.a (gamma) */
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(6), (R300_ALU_RGB_ADDR0(0) |
+						    R300_ALU_RGB_ADDR1(0) |
+						    R300_ALU_RGB_ADDR2(0) |
+						    R300_ALU_RGB_ADDRD(0) |
+						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(6), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
+						    R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
+						    R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC0_AAA) |
+						    R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
+						    R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0) |
+						    R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
+						    R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
+						    R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
+	    /* alpha nop, but set up const1 */
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(6), (R300_ALU_ALPHA_ADDRD(0) |
+						      R300_ALU_ALPHA_ADDR0(R300_ALU_ALPHA_CONST(1)) |
+						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(6), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
+						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
+						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
+						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+
+	    /* rgb out0.r = op_sop, set up src0 reg */
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(7), (R300_ALU_RGB_ADDR0(0) |
+						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R) |
+						    R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_R)));
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(7),
+			  R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
+			  R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
+	    /* alpha ex2 temp0, temp0.r */
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(7), (R300_ALU_ALPHA_ADDRD(0) |
+						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(7), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_EX2) |
+						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_R) |
+						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
+						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+
+	    /* rgb out0.g = op_sop, set up src0 reg */
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(8), (R300_ALU_RGB_ADDR0(0) |
+						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_G) |
+						    R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_G)));
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(8),
+			  R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
+			  R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
+	    /* alpha ex2 temp0, temp0.g */
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(8), (R300_ALU_ALPHA_ADDRD(0) |
+						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(8), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_EX2) |
+						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_G) |
+						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
+						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+
+	    /* rgb out0.b = op_sop, set up src0 reg */
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(9), (R300_ALU_RGB_ADDR0(0) |
+						    R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_B) |
+						    R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_B)));
+	    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(9),
+			  R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
+			  R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
+	    /* alpha ex2 temp0, temp0.b */
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(9), (R300_ALU_ALPHA_ADDRD(0) |
+						      R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+	    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(9), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_EX2) |
+						      R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_B) |
+						      R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
+						      R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
 	}
 
-	txformat |= RADEON_TXFORMAT_NON_POWER2;
-
-	colorpitch = dst_pitch >> pixel_shift;
+	/* Shader constants. */
+	/* constant 0: off, yco */
+	OUT_ACCEL_REG(R300_US_ALU_CONST_R(0), F_TO_24(off[0]));
+	OUT_ACCEL_REG(R300_US_ALU_CONST_G(0), F_TO_24(off[1]));
+	OUT_ACCEL_REG(R300_US_ALU_CONST_B(0), F_TO_24(off[2]));
+	OUT_ACCEL_REG(R300_US_ALU_CONST_A(0), F_TO_24(yco));
+	/* constant 1: uco */
+	OUT_ACCEL_REG(R300_US_ALU_CONST_R(1), F_TO_24(uco[0]));
+	OUT_ACCEL_REG(R300_US_ALU_CONST_G(1), F_TO_24(uco[1]));
+	OUT_ACCEL_REG(R300_US_ALU_CONST_B(1), F_TO_24(uco[2]));
+	OUT_ACCEL_REG(R300_US_ALU_CONST_A(1), F_TO_24(gamma));
+	/* constant 2: vco */
+	OUT_ACCEL_REG(R300_US_ALU_CONST_R(2), F_TO_24(vco[0]));
+	OUT_ACCEL_REG(R300_US_ALU_CONST_G(2), F_TO_24(vco[1]));
+	OUT_ACCEL_REG(R300_US_ALU_CONST_B(2), F_TO_24(vco[2]));
+	OUT_ACCEL_REG(R300_US_ALU_CONST_A(2), F_TO_24(0.0));
 
-	if (RADEONTilingEnabled(pScrn, pPixmap))
-	    colorpitch |= RADEON_COLOR_TILE_ENABLE;
-
-	BEGIN_ACCEL(4);
-
-	OUT_ACCEL_REG(RADEON_RB3D_CNTL,
-		      dst_format /*| RADEON_ALPHA_BLEND_ENABLE*/);
-	OUT_ACCEL_REG(RADEON_RB3D_COLOROFFSET, dst_offset);
-
-	OUT_ACCEL_REG(RADEON_RB3D_COLORPITCH, colorpitch);
-
-	OUT_ACCEL_REG(RADEON_RB3D_BLENDCNTL,
-		      RADEON_SRC_BLEND_GL_ONE | RADEON_DST_BLEND_GL_ZERO);
+	FINISH_ACCEL();
 
+    } else {
+	BEGIN_ACCEL(11);
+	/* 2 components: 2 for tex0 */
+	OUT_ACCEL_REG(R300_RS_COUNT,
+		      ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) |
+		       R300_RS_COUNT_HIRES_EN));
+	/* R300_INST_COUNT_RS - highest RS instruction used */
+	OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0) | R300_TX_OFFSET_RS(6));
+
+	OUT_ACCEL_REG(R300_US_PIXSIZE, 0); /* highest temp used */
+
+	/* Indirection levels */
+	OUT_ACCEL_REG(R300_US_CONFIG, ((0 << R300_NLEVEL_SHIFT) |
+				       R300_FIRST_TEX));
+
+	OUT_ACCEL_REG(R300_US_CODE_OFFSET, (R300_ALU_CODE_OFFSET(0) |
+					    R300_ALU_CODE_SIZE(1) |
+					    R300_TEX_CODE_OFFSET(0) |
+					    R300_TEX_CODE_SIZE(1)));
+
+	OUT_ACCEL_REG(R300_US_CODE_ADDR_3, (R300_ALU_START(0) |
+					    R300_ALU_SIZE(0) |
+					    R300_TEX_START(0) |
+					    R300_TEX_SIZE(0) |
+					    R300_RGBA_OUT));
+
+	/* tex inst */
+	OUT_ACCEL_REG(R300_US_TEX_INST_0, (R300_TEX_SRC_ADDR(0) |
+					   R300_TEX_DST_ADDR(0) |
+					   R300_TEX_ID(0) |
+					   R300_TEX_INST(R300_TEX_INST_LD)));
+
+	/* ALU inst */
+	/* RGB */
+	OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR_0, (R300_ALU_RGB_ADDR0(0) |
+					       R300_ALU_RGB_ADDR1(0) |
+					       R300_ALU_RGB_ADDR2(0) |
+					       R300_ALU_RGB_ADDRD(0) |
+					       R300_ALU_RGB_OMASK((R300_ALU_RGB_MASK_R |
+								   R300_ALU_RGB_MASK_G |
+								   R300_ALU_RGB_MASK_B)) |
+					       R300_ALU_RGB_TARGET_A));
+	OUT_ACCEL_REG(R300_US_ALU_RGB_INST_0, (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
+					       R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
+					       R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
+					       R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
+					       R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0) |
+					       R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
+					       R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
+					       R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE) |
+					       R300_ALU_RGB_CLAMP));
+	/* Alpha */
+	OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR_0, (R300_ALU_ALPHA_ADDR0(0) |
+						 R300_ALU_ALPHA_ADDR1(0) |
+						 R300_ALU_ALPHA_ADDR2(0) |
+						 R300_ALU_ALPHA_ADDRD(0) |
+						 R300_ALU_ALPHA_OMASK(R300_ALU_ALPHA_MASK_A) |
+						 R300_ALU_ALPHA_TARGET_A |
+						 R300_ALU_ALPHA_OMASK_W(R300_ALU_ALPHA_MASK_NONE)));
+	OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST_0, (R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_A) |
+						 R300_ALU_ALPHA_MOD_A(R300_ALU_ALPHA_MOD_NOP) |
+						 R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_1_0) |
+						 R300_ALU_ALPHA_MOD_B(R300_ALU_ALPHA_MOD_NOP) |
+						 R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0) |
+						 R300_ALU_ALPHA_MOD_C(R300_ALU_ALPHA_MOD_NOP) |
+						 R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
+						 R300_ALU_ALPHA_OMOD(R300_ALU_ALPHA_OMOD_NONE) |
+						 R300_ALU_ALPHA_CLAMP));
 	FINISH_ACCEL();
+    }
 
+    BEGIN_ACCEL(6);
+    OUT_ACCEL_REG(R300_TX_INVALTAGS, 0);
+    OUT_ACCEL_REG(R300_TX_ENABLE, txenable);
 
-	if (IS_R200_3D) {
-
-	    info->accel_state->texW[0] = pPriv->w;
-	    info->accel_state->texH[0] = pPriv->h;
-
-	    if (isplanar) {
-		/* note: in contrast to r300, use input biasing on uv components */
-		const float Loff = -0.0627;
-		float uvcosf, uvsinf;
-		float yco, yoff;
-		float uco[3], vco[3];
-		float bright, cont, sat;
-		int ref = pPriv->transform_index;
-		float ucscale = 0.25, vcscale = 0.25;
-		Bool needux8 = FALSE, needvx8 = FALSE;
-
-		/* contrast can cause constant overflow, clamp */
-		cont = RTFContrast(pPriv->contrast);
-		if (cont * trans[ref].RefLuma > 2.0)
-		    cont = 2.0 / trans[ref].RefLuma;
-		/* brightness is only from -0.5 to 0.5 should be safe */
-		bright = RTFBrightness(pPriv->brightness);
-		/* saturation can also cause overflow, clamp */
-		sat = RTFSaturation(pPriv->saturation);
-		if (sat * trans[ref].RefBCb > 4.0)
-		    sat = 4.0 / trans[ref].RefBCb;
-		uvcosf = sat * cos(RTFHue(pPriv->hue));
-		uvsinf = sat * sin(RTFHue(pPriv->hue));
-
-		yco = trans[ref].RefLuma * cont;
-		uco[0] = -trans[ref].RefRCr * uvsinf;
-		uco[1] = trans[ref].RefGCb * uvcosf - trans[ref].RefGCr * uvsinf;
-		uco[2] = trans[ref].RefBCb * uvcosf;
-		vco[0] = trans[ref].RefRCr * uvcosf;
-		vco[1] = trans[ref].RefGCb * uvsinf + trans[ref].RefGCr * uvcosf;
-		vco[2] = trans[ref].RefBCb * uvsinf;
-		yoff = Loff * yco + bright;
-
-		if ((uco[0] > 2.0) || (uco[2] > 2.0)) {
-		    needux8 = TRUE;
-		    ucscale = 0.125;
-		}
-		if ((vco[0] > 2.0) || (vco[2] > 2.0)) {
-		    needvx8 = TRUE;
-		    vcscale = 0.125;
-		}
-
-		/* need 2 texcoord sets (even though they are identical) due
-		   to denormalization! hw apparently can't premultiply
-		   same coord set by different texture size */
-		vtx_count = 6;
-
-		txformat0 = (((((pPriv->w + 1 ) >> 1) - 1) & 0x7ff) |
-			    (((((pPriv->h + 1 ) >> 1) - 1) & 0x7ff) << RADEON_TEX_VSIZE_SHIFT));
-		txpitch = ((pPriv->src_pitch >> 1) + 63) & ~63;
-		txpitch -= 32;
-		txfilter =  R200_MAG_FILTER_LINEAR |
-			    R200_MIN_FILTER_LINEAR |
-			    R200_CLAMP_S_CLAMP_LAST |
-			    R200_CLAMP_T_CLAMP_LAST;
-
-		BEGIN_ACCEL(36);
-
-		OUT_ACCEL_REG(RADEON_PP_CNTL,
-			      RADEON_TEX_0_ENABLE | RADEON_TEX_1_ENABLE | RADEON_TEX_2_ENABLE |
-			      RADEON_TEX_BLEND_0_ENABLE | RADEON_TEX_BLEND_1_ENABLE |
-			      RADEON_TEX_BLEND_2_ENABLE);
-
-		OUT_ACCEL_REG(R200_SE_VTX_FMT_0, R200_VTX_XY);
-		OUT_ACCEL_REG(R200_SE_VTX_FMT_1,
-			      (2 << R200_VTX_TEX0_COMP_CNT_SHIFT) |
-			      (2 << R200_VTX_TEX1_COMP_CNT_SHIFT));
-
-		OUT_ACCEL_REG(R200_PP_TXFILTER_0, txfilter);
-		OUT_ACCEL_REG(R200_PP_TXFORMAT_0, txformat);
-		OUT_ACCEL_REG(R200_PP_TXFORMAT_X_0, 0);
-		OUT_ACCEL_REG(R200_PP_TXSIZE_0,
-			      (pPriv->w - 1) |
-			      ((pPriv->h - 1) << RADEON_TEX_VSIZE_SHIFT));
-		OUT_ACCEL_REG(R200_PP_TXPITCH_0, pPriv->src_pitch - 32);
-		OUT_ACCEL_REG(R200_PP_TXOFFSET_0, pPriv->src_offset);
-
-		OUT_ACCEL_REG(R200_PP_TXFILTER_1, txfilter);
-		OUT_ACCEL_REG(R200_PP_TXFORMAT_1, txformat | R200_TXFORMAT_ST_ROUTE_STQ1);
-		OUT_ACCEL_REG(R200_PP_TXFORMAT_X_1, 0);
-		OUT_ACCEL_REG(R200_PP_TXSIZE_1, txformat0);
-		OUT_ACCEL_REG(R200_PP_TXPITCH_1, txpitch);
-		OUT_ACCEL_REG(R200_PP_TXOFFSET_1, pPriv->src_offset + pPriv->planeu_offset);
-
-		OUT_ACCEL_REG(R200_PP_TXFILTER_2, txfilter);
-		OUT_ACCEL_REG(R200_PP_TXFORMAT_2, txformat | R200_TXFORMAT_ST_ROUTE_STQ1);
-		OUT_ACCEL_REG(R200_PP_TXFORMAT_X_2, 0);
-		OUT_ACCEL_REG(R200_PP_TXSIZE_2, txformat0);
-		OUT_ACCEL_REG(R200_PP_TXPITCH_2, txpitch);
-		OUT_ACCEL_REG(R200_PP_TXOFFSET_2, pPriv->src_offset + pPriv->planev_offset);
-
-		/* similar to r300 code. Note the big problem is that hardware constants
-		 * are 8 bits only, representing 0.0-1.0. We can get that up (using bias
-		 * + scale) to -1.0-1.0 (but precision will suffer). AFAIK the hw actually
-		 * has 12 bits fractional precision (plus 1 sign bit, 3 range bits) but
-		 * the constants not. To get larger range can use output scale, but for
-		 * that 2.018 value we need a total scale by 8, which means the constants
-		 * really have no accuracy whatsoever (5 fractional bits only).
-		 * The only direct way to get high  precision "constants" into the fragment
-		 * pipe I know of is to use the texcoord interpolator (not color, this one
-		 * is 8 bit only too), which seems a bit expensive. We're lucky though it
-		 * seems the values we need seem to fit better than worst case (get about
-		 * 6 fractional bits for this instead of 5, at least when not correcting for
-		 * hue/saturation/contrast/brightness, which is the same as for vco - yco and
-		 * yoff get 8 fractional bits). Try to preserve as much accuracy as possible
-		 * even with non-default saturation/hue/contrast/brightness adjustments,
-		 * it gets a little crazy and ultimately precision might still be lacking.
-		 *
-		 * A higher precision (8 fractional bits) version might just put uco into
-		 * a texcoord, and calculate a new vcoconst in the shader, like so:
-		 * cohelper = {1.0, 0.0, 0.0} - shouldn't use 0.5 since not exactly representable
-		 * vco = {1.5958 - 1.0, -0.8129 + 1.0, 1.0}
-		 * vcocalc = ADD temp, bias/scale(cohelper), vco
-		 * would in total use 4 tex units, 4 instructions which seems fairly
-		 * balanced for this architecture (instead of 3 + 3 for the solution here)
-		 *
-		 * temp = MAD(yco, yuv.yyyy, yoff)
-		 * temp = MAD(uco, yuv.uuuu, temp)
-		 * result = MAD(vco, yuv.vvvv, temp)
-		 *
-		 * note first mad produces actually scalar, hence we transform
-		 * it into a dp2a to get 8 bit precision of yco instead of 7 -
-		 * That's assuming hw correctly expands consts to internal precision.
-		 * (y * 1 + y * (yco - 1) + yoff)
-		 * temp = DP2A / 2 (yco, yuv.yyyy, yoff)
-		 * temp = MAD (uco / 4, yuv.uuuu * 2, temp)
-		 * result = MAD x2 (vco / 2, yuv.vvvv, temp)
-		 *
-		 * vco, uco need bias (and hence scale too)
-		 *
-		 */
+    OUT_ACCEL_REG(R300_RB3D_COLOROFFSET0, dst_offset);
+    OUT_ACCEL_REG(R300_RB3D_COLORPITCH0, colorpitch);
 
-		/* MAD temp0 / 2, const0.a * 2, temp0, -const0.rgb */
-		OUT_ACCEL_REG(R200_PP_TXCBLEND_0,
-			      R200_TXC_ARG_A_TFACTOR_COLOR |
-			      R200_TXC_ARG_B_R0_COLOR |
-			      R200_TXC_ARG_C_TFACTOR_COLOR |
-			      (yoff < 0 ? R200_TXC_NEG_ARG_C : 0) |
-			      R200_TXC_OP_DOT2_ADD);
-		OUT_ACCEL_REG(R200_PP_TXCBLEND2_0,
-			      (0 << R200_TXC_TFACTOR_SEL_SHIFT) |
-			      R200_TXC_SCALE_INV2 |
-			      R200_TXC_CLAMP_8_8 | R200_TXC_OUTPUT_REG_R0);
-		OUT_ACCEL_REG(R200_PP_TXABLEND_0,
-			      R200_TXA_ARG_A_ZERO |
-			      R200_TXA_ARG_B_ZERO |
-			      R200_TXA_ARG_C_ZERO |
-			      R200_TXA_OP_MADD);
-		OUT_ACCEL_REG(R200_PP_TXABLEND2_0,
-			      R200_TXA_OUTPUT_REG_NONE);
-
-		/* MAD temp0, (const1 - 0.5) * 2, (temp1 - 0.5) * 2, temp0 */
-		OUT_ACCEL_REG(R200_PP_TXCBLEND_1,
-			      R200_TXC_ARG_A_TFACTOR_COLOR |
-			      R200_TXC_BIAS_ARG_A |
-			      R200_TXC_SCALE_ARG_A |
-			      R200_TXC_ARG_B_R1_COLOR |
-			      R200_TXC_BIAS_ARG_B |
-			      (needux8 ? R200_TXC_SCALE_ARG_B : 0) |
-			      R200_TXC_ARG_C_R0_COLOR |
-			      R200_TXC_OP_MADD);
-		OUT_ACCEL_REG(R200_PP_TXCBLEND2_1,
-			      (1 << R200_TXC_TFACTOR_SEL_SHIFT) |
-			      R200_TXC_CLAMP_8_8 | R200_TXC_OUTPUT_REG_R0);
-		OUT_ACCEL_REG(R200_PP_TXABLEND_1,
-			      R200_TXA_ARG_A_ZERO |
-			      R200_TXA_ARG_B_ZERO |
-			      R200_TXA_ARG_C_ZERO |
-			      R200_TXA_OP_MADD);
-		OUT_ACCEL_REG(R200_PP_TXABLEND2_1,
-			      R200_TXA_OUTPUT_REG_NONE);
-
-		/* MAD temp0 x 2, (const2 - 0.5) * 2, (temp2 - 0.5), temp0 */
-		OUT_ACCEL_REG(R200_PP_TXCBLEND_2,
-			      R200_TXC_ARG_A_TFACTOR_COLOR |
-			      R200_TXC_BIAS_ARG_A |
-			      R200_TXC_SCALE_ARG_A |
-			      R200_TXC_ARG_B_R2_COLOR |
-			      R200_TXC_BIAS_ARG_B |
-			      (needvx8 ? R200_TXC_SCALE_ARG_B : 0) |
-			      R200_TXC_ARG_C_R0_COLOR |
-			      R200_TXC_OP_MADD);
-		OUT_ACCEL_REG(R200_PP_TXCBLEND2_2,
-			      (2 << R200_TXC_TFACTOR_SEL_SHIFT) |
-			      R200_TXC_SCALE_2X |
-			      R200_TXC_CLAMP_0_1 | R200_TXC_OUTPUT_REG_R0);
-		OUT_ACCEL_REG(R200_PP_TXABLEND_2,
-			      R200_TXA_ARG_A_ZERO |
-			      R200_TXA_ARG_B_ZERO |
-			      R200_TXA_ARG_C_ZERO |
-			      R200_TXA_COMP_ARG_C |
-			      R200_TXA_OP_MADD);
-		OUT_ACCEL_REG(R200_PP_TXABLEND2_2,
-			      R200_TXA_CLAMP_0_1 | R200_TXA_OUTPUT_REG_R0);
-
-		/* shader constants */
-		OUT_ACCEL_REG(R200_PP_TFACTOR_0, float4touint(yco > 1.0 ? 1.0 : 0.0, /* range special [0, 2] */
-							      yco > 1.0 ? yco - 1.0: yco,
-							      yoff < 0 ? -yoff : yoff, /* range special [-1, 1] */
-							      0.0));
-		OUT_ACCEL_REG(R200_PP_TFACTOR_1, float4touint(uco[0] * ucscale + 0.5, /* range [-4, 4] */
-							      uco[1] * ucscale + 0.5, /* or [-2, 2] */
-							      uco[2] * ucscale + 0.5,
-							      0.0));
-		OUT_ACCEL_REG(R200_PP_TFACTOR_2, float4touint(vco[0] * vcscale + 0.5, /* range [-2, 2] */
-							      vco[1] * vcscale + 0.5, /* or [-4, 4] */
-							      vco[2] * vcscale + 0.5,
-							      0.0));
-
-		FINISH_ACCEL();
-	    }
-	    else if (info->ChipFamily == CHIP_FAMILY_RV250) {
-		/* fix up broken packed yuv - shader same as above except
-		   yuv components are all in same reg */
-		/* note: in contrast to r300, use input biasing on uv components */
-		const float Loff = -0.0627;
-		float uvcosf, uvsinf;
-		float yco, yoff;
-		float uco[3], vco[3];
-		float bright, cont, sat;
-		int ref = pPriv->transform_index;
-		float ucscale = 0.25, vcscale = 0.25;
-		Bool needux8 = FALSE, needvx8 = FALSE;
-
-		/* contrast can cause constant overflow, clamp */
-		cont = RTFContrast(pPriv->contrast);
-		if (cont * trans[ref].RefLuma > 2.0)
-		    cont = 2.0 / trans[ref].RefLuma;
-		/* brightness is only from -0.5 to 0.5 should be safe */
-		bright = RTFBrightness(pPriv->brightness);
-		/* saturation can also cause overflow, clamp */
-		sat = RTFSaturation(pPriv->saturation);
-		if (sat * trans[ref].RefBCb > 4.0)
-		    sat = 4.0 / trans[ref].RefBCb;
-		uvcosf = sat * cos(RTFHue(pPriv->hue));
-		uvsinf = sat * sin(RTFHue(pPriv->hue));
-
-		yco = trans[ref].RefLuma * cont;
-		uco[0] = -trans[ref].RefRCr * uvsinf;
-		uco[1] = trans[ref].RefGCb * uvcosf - trans[ref].RefGCr * uvsinf;
-		uco[2] = trans[ref].RefBCb * uvcosf;
-		vco[0] = trans[ref].RefRCr * uvcosf;
-		vco[1] = trans[ref].RefGCb * uvsinf + trans[ref].RefGCr * uvcosf;
-		vco[2] = trans[ref].RefBCb * uvsinf;
-		yoff = Loff * yco + bright;
-
-		if ((uco[0] > 2.0) || (uco[2] > 2.0)) {
-		    needux8 = TRUE;
-		    ucscale = 0.125;
-		}
-		if ((vco[0] > 2.0) || (vco[2] > 2.0)) {
-		    needvx8 = TRUE;
-		    vcscale = 0.125;
-		}
-
-		txformat0 = (((((pPriv->w + 1 ) >> 1) - 1) & 0x7ff) |
-			    (((((pPriv->h + 1 ) >> 1 ) - 1) & 0x7ff) << RADEON_TEX_VSIZE_SHIFT));
-		txpitch = ((pPriv->src_pitch >> 1) + 63) & ~63;
-		txpitch -= 32;
-		txfilter =  R200_MAG_FILTER_LINEAR |
-			    R200_MIN_FILTER_LINEAR |
-			    R200_CLAMP_S_CLAMP_LAST |
-			    R200_CLAMP_T_CLAMP_LAST;
-
-		BEGIN_ACCEL(24);
-
-		OUT_ACCEL_REG(RADEON_PP_CNTL,
-			      RADEON_TEX_0_ENABLE |
-			      RADEON_TEX_BLEND_0_ENABLE | RADEON_TEX_BLEND_1_ENABLE |
-			      RADEON_TEX_BLEND_2_ENABLE);
-
-		OUT_ACCEL_REG(R200_SE_VTX_FMT_0, R200_VTX_XY);
-		OUT_ACCEL_REG(R200_SE_VTX_FMT_1,
-			      (2 << R200_VTX_TEX0_COMP_CNT_SHIFT));
-
-		OUT_ACCEL_REG(R200_PP_TXFILTER_0, txfilter);
-		OUT_ACCEL_REG(R200_PP_TXFORMAT_0, txformat);
-		OUT_ACCEL_REG(R200_PP_TXFORMAT_X_0, 0);
-		OUT_ACCEL_REG(R200_PP_TXSIZE_0,
-			      (pPriv->w - 1) |
-			      ((pPriv->h - 1) << RADEON_TEX_VSIZE_SHIFT));
-		OUT_ACCEL_REG(R200_PP_TXPITCH_0, pPriv->src_pitch - 32);
-		OUT_ACCEL_REG(R200_PP_TXOFFSET_0, pPriv->src_offset);
-
-		/* MAD temp1 / 2, const0.a * 2, temp0.ggg, -const0.rgb */
-		OUT_ACCEL_REG(R200_PP_TXCBLEND_0,
-			      R200_TXC_ARG_A_TFACTOR_COLOR |
-			      R200_TXC_ARG_B_R0_COLOR |
-			      R200_TXC_ARG_C_TFACTOR_COLOR |
-			      (yoff < 0 ? R200_TXC_NEG_ARG_C : 0) |
-			      R200_TXC_OP_DOT2_ADD);
-		OUT_ACCEL_REG(R200_PP_TXCBLEND2_0,
-			      (0 << R200_TXC_TFACTOR_SEL_SHIFT) |
-			      R200_TXC_SCALE_INV2 |
-			      (R200_TXC_REPL_GREEN << R200_TXC_REPL_ARG_B_SHIFT) |
-			      R200_TXC_CLAMP_8_8 | R200_TXC_OUTPUT_REG_R1);
-		OUT_ACCEL_REG(R200_PP_TXABLEND_0,
-			      R200_TXA_ARG_A_ZERO |
-			      R200_TXA_ARG_B_ZERO |
-			      R200_TXA_ARG_C_ZERO |
-			      R200_TXA_OP_MADD);
-		OUT_ACCEL_REG(R200_PP_TXABLEND2_0,
-			      R200_TXA_OUTPUT_REG_NONE);
-
-		/* MAD temp1, (const1 - 0.5) * 2, (temp0.rrr - 0.5) * 2, temp1 */
-		OUT_ACCEL_REG(R200_PP_TXCBLEND_1,
-			      R200_TXC_ARG_A_TFACTOR_COLOR |
-			      R200_TXC_BIAS_ARG_A |
-			      R200_TXC_SCALE_ARG_A |
-			      R200_TXC_ARG_B_R0_COLOR |
-			      R200_TXC_BIAS_ARG_B |
-			      (needux8 ? R200_TXC_SCALE_ARG_B : 0) |
-			      R200_TXC_ARG_C_R1_COLOR |
-			      R200_TXC_OP_MADD);
-		OUT_ACCEL_REG(R200_PP_TXCBLEND2_1,
-			      (1 << R200_TXC_TFACTOR_SEL_SHIFT) |
-			      (R200_TXC_REPL_BLUE << R200_TXC_REPL_ARG_B_SHIFT) |
-			      R200_TXC_CLAMP_8_8 | R200_TXC_OUTPUT_REG_R1);
-		OUT_ACCEL_REG(R200_PP_TXABLEND_1,
-			      R200_TXA_ARG_A_ZERO |
-			      R200_TXA_ARG_B_ZERO |
-			      R200_TXA_ARG_C_ZERO |
-			      R200_TXA_OP_MADD);
-		OUT_ACCEL_REG(R200_PP_TXABLEND2_1,
-			      R200_TXA_OUTPUT_REG_NONE);
-
-		/* MAD temp0 x 2, (const2 - 0.5) * 2, (temp0.bbb - 0.5), temp1 */
-		OUT_ACCEL_REG(R200_PP_TXCBLEND_2,
-			      R200_TXC_ARG_A_TFACTOR_COLOR |
-			      R200_TXC_BIAS_ARG_A |
-			      R200_TXC_SCALE_ARG_A |
-			      R200_TXC_ARG_B_R0_COLOR |
-			      R200_TXC_BIAS_ARG_B |
-			      (needvx8 ? R200_TXC_SCALE_ARG_B : 0) |
-			      R200_TXC_ARG_C_R1_COLOR |
-			      R200_TXC_OP_MADD);
-		OUT_ACCEL_REG(R200_PP_TXCBLEND2_2,
-			      (2 << R200_TXC_TFACTOR_SEL_SHIFT) |
-			      R200_TXC_SCALE_2X |
-			      (R200_TXC_REPL_RED << R200_TXC_REPL_ARG_B_SHIFT) |
-			      R200_TXC_CLAMP_0_1 | R200_TXC_OUTPUT_REG_R0);
-		OUT_ACCEL_REG(R200_PP_TXABLEND_2,
-			      R200_TXA_ARG_A_ZERO |
-			      R200_TXA_ARG_B_ZERO |
-			      R200_TXA_ARG_C_ZERO |
-			      R200_TXA_COMP_ARG_C |
-			      R200_TXA_OP_MADD);
-		OUT_ACCEL_REG(R200_PP_TXABLEND2_2,
-			      R200_TXA_CLAMP_0_1 | R200_TXA_OUTPUT_REG_R0);
-
-		/* shader constants */
-		OUT_ACCEL_REG(R200_PP_TFACTOR_0, float4touint(yco > 1.0 ? 1.0 : 0.0, /* range special [0, 2] */
-							      yco > 1.0 ? yco - 1.0: yco,
-							      yoff < 0 ? -yoff : yoff, /* range special [-1, 1] */
-							      0.0));
-		OUT_ACCEL_REG(R200_PP_TFACTOR_1, float4touint(uco[0] * ucscale + 0.5, /* range [-4, 4] */
-							      uco[1] * ucscale + 0.5, /* or [-2, 2] */
-							      uco[2] * ucscale + 0.5,
-							      0.0));
-		OUT_ACCEL_REG(R200_PP_TFACTOR_2, float4touint(vco[0] * vcscale + 0.5, /* range [-2, 2] */
-							      vco[1] * vcscale + 0.5, /* or [-4, 4] */
-							      vco[2] * vcscale + 0.5,
-							      0.0));
-
-		FINISH_ACCEL();
-	    }
-	    else {
-		BEGIN_ACCEL(13);
-		OUT_ACCEL_REG(RADEON_PP_CNTL,
-			      RADEON_TEX_0_ENABLE | RADEON_TEX_BLEND_0_ENABLE);
-
-		OUT_ACCEL_REG(R200_SE_VTX_FMT_0, R200_VTX_XY);
-		OUT_ACCEL_REG(R200_SE_VTX_FMT_1,
-			      (2 << R200_VTX_TEX0_COMP_CNT_SHIFT));
-
-		OUT_ACCEL_REG(R200_PP_TXFILTER_0,
-			      R200_MAG_FILTER_LINEAR |
-			      R200_MIN_FILTER_LINEAR |
-			      R200_CLAMP_S_CLAMP_LAST |
-			      R200_CLAMP_T_CLAMP_LAST |
-			      R200_YUV_TO_RGB);
-		OUT_ACCEL_REG(R200_PP_TXFORMAT_0, txformat);
-		OUT_ACCEL_REG(R200_PP_TXFORMAT_X_0, 0);
-		OUT_ACCEL_REG(R200_PP_TXSIZE_0,
-			      (pPriv->w - 1) |
-			      ((pPriv->h - 1) << RADEON_TEX_VSIZE_SHIFT));
-		OUT_ACCEL_REG(R200_PP_TXPITCH_0, pPriv->src_pitch - 32);
-
-		OUT_ACCEL_REG(R200_PP_TXOFFSET_0, pPriv->src_offset);
-
-		OUT_ACCEL_REG(R200_PP_TXCBLEND_0,
-			      R200_TXC_ARG_A_ZERO |
-			      R200_TXC_ARG_B_ZERO |
-			      R200_TXC_ARG_C_R0_COLOR |
-			      R200_TXC_OP_MADD);
-		OUT_ACCEL_REG(R200_PP_TXCBLEND2_0,
-			      R200_TXC_CLAMP_0_1 | R200_TXC_OUTPUT_REG_R0);
-		OUT_ACCEL_REG(R200_PP_TXABLEND_0,
-			      R200_TXA_ARG_A_ZERO |
-			      R200_TXA_ARG_B_ZERO |
-			      R200_TXA_ARG_C_R0_ALPHA |
-			      R200_TXA_OP_MADD);
-		OUT_ACCEL_REG(R200_PP_TXABLEND2_0,
-			      R200_TXA_CLAMP_0_1 | R200_TXA_OUTPUT_REG_R0);
-		FINISH_ACCEL();
-	    }
-	} else {
+    /* no need to enable blending */
+    OUT_ACCEL_REG(R300_RB3D_BLENDCNTL, RADEON_SRC_BLEND_GL_ONE | RADEON_DST_BLEND_GL_ZERO);
 
-	    info->accel_state->texW[0] = 1;
-	    info->accel_state->texH[0] = 1;
-
-	    BEGIN_ACCEL(9);
-
-	    OUT_ACCEL_REG(RADEON_PP_CNTL,
-			  RADEON_TEX_0_ENABLE | RADEON_TEX_BLEND_0_ENABLE);
-
-	    OUT_ACCEL_REG(RADEON_SE_VTX_FMT, (RADEON_SE_VTX_FMT_XY |
-					      RADEON_SE_VTX_FMT_ST0));
-
-	    OUT_ACCEL_REG(RADEON_PP_TXFILTER_0,
-			  RADEON_MAG_FILTER_LINEAR |
-			  RADEON_MIN_FILTER_LINEAR |
-			  RADEON_CLAMP_S_CLAMP_LAST |
-			  RADEON_CLAMP_T_CLAMP_LAST |
-			  RADEON_YUV_TO_RGB);
-	    OUT_ACCEL_REG(RADEON_PP_TXFORMAT_0, txformat);
-	    OUT_ACCEL_REG(RADEON_PP_TXOFFSET_0, pPriv->src_offset);
-	    OUT_ACCEL_REG(RADEON_PP_TXCBLEND_0,
-			  RADEON_COLOR_ARG_A_ZERO |
-			  RADEON_COLOR_ARG_B_ZERO |
-			  RADEON_COLOR_ARG_C_T0_COLOR |
-			  RADEON_BLEND_CTL_ADD |
-			  RADEON_CLAMP_TX);
-	    OUT_ACCEL_REG(RADEON_PP_TXABLEND_0,
-			  RADEON_ALPHA_ARG_A_ZERO |
-			  RADEON_ALPHA_ARG_B_ZERO |
-			  RADEON_ALPHA_ARG_C_T0_ALPHA |
-			  RADEON_BLEND_CTL_ADD |
-			  RADEON_CLAMP_TX);
-
-	    OUT_ACCEL_REG(RADEON_PP_TEX_SIZE_0,
-			  (pPriv->w - 1) |
-			  ((pPriv->h - 1) << RADEON_TEX_VSIZE_SHIFT));
-	    OUT_ACCEL_REG(RADEON_PP_TEX_PITCH_0,
-			  pPriv->src_pitch - 32);
-	    FINISH_ACCEL();
-	}
-    }
+    OUT_ACCEL_REG(R300_VAP_VTX_SIZE, vtx_count);
+    FINISH_ACCEL();
 
     if (pPriv->vsync) {
 	xf86CrtcPtr crtc = radeon_xv_pick_best_crtc(pScrn,
@@ -2257,92 +2030,49 @@ FUNC_NAME(RADEONDisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv
 	ErrorF("src: %d, %d, %d, %d\n", srcX, srcY, srcw, srch);
 #endif
 
-	if (IS_R300_3D || IS_R500_3D) {
-	    if (IS_R300_3D && ((dstw+dsth) > 2880))
-		use_quad = TRUE;
-	    /*
-	     * Set up the scissor area to that of the output size.
-	     */
-	    BEGIN_ACCEL(2);
-	    if (IS_R300_3D) {
-		/* R300 has an offset */
-		OUT_ACCEL_REG(R300_SC_SCISSOR0, (((dstX + 1088) << R300_SCISSOR_X_SHIFT) |
-						 ((dstY + 1088) << R300_SCISSOR_Y_SHIFT)));
-		OUT_ACCEL_REG(R300_SC_SCISSOR1, (((dstX + dstw + 1088 - 1) << R300_SCISSOR_X_SHIFT) |
-						 ((dstY + dsth + 1088 - 1) << R300_SCISSOR_Y_SHIFT)));
-	    } else {
-		OUT_ACCEL_REG(R300_SC_SCISSOR0, (((dstX) << R300_SCISSOR_X_SHIFT) |
-						 ((dstY) << R300_SCISSOR_Y_SHIFT)));
-		OUT_ACCEL_REG(R300_SC_SCISSOR1, (((dstX + dstw - 1) << R300_SCISSOR_X_SHIFT) |
-						 ((dstY + dsth - 1) << R300_SCISSOR_Y_SHIFT)));
-	    }
-	    FINISH_ACCEL();
-	}
+	if (IS_R300_3D && ((dstw+dsth) > 2880))
+	    use_quad = TRUE;
+	/*
+	 * Set up the scissor area to that of the output size.
+	 */
+	BEGIN_ACCEL(2);
+	/* R300 has an offset */
+	OUT_ACCEL_REG(R300_SC_SCISSOR0, (((dstX + 1088) << R300_SCISSOR_X_SHIFT) |
+					 ((dstY + 1088) << R300_SCISSOR_Y_SHIFT)));
+	OUT_ACCEL_REG(R300_SC_SCISSOR1, (((dstX + dstw + 1088 - 1) << R300_SCISSOR_X_SHIFT) |
+					 ((dstY + dsth + 1088 - 1) << R300_SCISSOR_Y_SHIFT)));
+	FINISH_ACCEL();
 
 #ifdef ACCEL_CP
-	if (info->ChipFamily < CHIP_FAMILY_R200) {
-	    BEGIN_RING(3 * vtx_count + 3);
-	    OUT_RING(CP_PACKET3(RADEON_CP_PACKET3_3D_DRAW_IMMD,
-				3 * vtx_count + 1));
-	    OUT_RING(RADEON_CP_VC_FRMT_XY |
-		     RADEON_CP_VC_FRMT_ST0);
-	    OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_RECT_LIST |
+	if (use_quad) {
+	    BEGIN_RING(4 * vtx_count + 4);
+	    OUT_RING(CP_PACKET3(R200_CP_PACKET3_3D_DRAW_IMMD_2,
+				4 * vtx_count));
+	    OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_QUAD_LIST |
 		     RADEON_CP_VC_CNTL_PRIM_WALK_RING |
-		     RADEON_CP_VC_CNTL_MAOS_ENABLE |
-		     RADEON_CP_VC_CNTL_VTX_FMT_RADEON_MODE |
-		     (3 << RADEON_CP_VC_CNTL_NUM_SHIFT));
-	} else if (IS_R300_3D || IS_R500_3D) {
-	    if (use_quad) {
-		BEGIN_RING(4 * vtx_count + 4);
-		OUT_RING(CP_PACKET3(R200_CP_PACKET3_3D_DRAW_IMMD_2,
-				    4 * vtx_count));
-		OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_QUAD_LIST |
-			 RADEON_CP_VC_CNTL_PRIM_WALK_RING |
-			 (4 << RADEON_CP_VC_CNTL_NUM_SHIFT));
-	    } else {
-		BEGIN_RING(3 * vtx_count + 4);
-		OUT_RING(CP_PACKET3(R200_CP_PACKET3_3D_DRAW_IMMD_2,
-				    3 * vtx_count));
-		OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_TRI_LIST |
-			 RADEON_CP_VC_CNTL_PRIM_WALK_RING |
-			 (3 << RADEON_CP_VC_CNTL_NUM_SHIFT));
-	    }
+		     (4 << RADEON_CP_VC_CNTL_NUM_SHIFT));
 	} else {
-	    BEGIN_RING(3 * vtx_count + 2);
+	    BEGIN_RING(3 * vtx_count + 4);
 	    OUT_RING(CP_PACKET3(R200_CP_PACKET3_3D_DRAW_IMMD_2,
 				3 * vtx_count));
-	    OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_RECT_LIST |
+	    OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_TRI_LIST |
 		     RADEON_CP_VC_CNTL_PRIM_WALK_RING |
 		     (3 << RADEON_CP_VC_CNTL_NUM_SHIFT));
 	}
 #else /* ACCEL_CP */
-	if (IS_R300_3D || IS_R500_3D) {
-	    if (use_quad)
-		BEGIN_ACCEL(2 + vtx_count * 4);
-	    else
-		BEGIN_ACCEL(2 + vtx_count * 3);
-	} else
-	    BEGIN_ACCEL(1 + vtx_count * 3);
-
-	if (info->ChipFamily < CHIP_FAMILY_R200)
-	    OUT_ACCEL_REG(RADEON_SE_VF_CNTL, (RADEON_VF_PRIM_TYPE_RECTANGLE_LIST |
+	if (use_quad)
+	    BEGIN_ACCEL(2 + vtx_count * 4);
+	else
+	    BEGIN_ACCEL(2 + vtx_count * 3);
+
+	if (use_quad)
+	    OUT_ACCEL_REG(RADEON_SE_VF_CNTL, (RADEON_VF_PRIM_TYPE_QUAD_LIST |
 					      RADEON_VF_PRIM_WALK_DATA |
-					      RADEON_VF_RADEON_MODE |
-					      (3 << RADEON_VF_NUM_VERTICES_SHIFT)));
-	else if (IS_R300_3D || IS_R500_3D) {
-	    if (use_quad)
-		OUT_ACCEL_REG(RADEON_SE_VF_CNTL, (RADEON_VF_PRIM_TYPE_QUAD_LIST |
-						  RADEON_VF_PRIM_WALK_DATA |
-						  (4 << RADEON_VF_NUM_VERTICES_SHIFT)));
-	    else
-		OUT_ACCEL_REG(RADEON_SE_VF_CNTL, (RADEON_VF_PRIM_TYPE_TRIANGLE_LIST |
-						  RADEON_VF_PRIM_WALK_DATA |
-						  (3 << RADEON_VF_NUM_VERTICES_SHIFT)));
-	} else
-	    OUT_ACCEL_REG(RADEON_SE_VF_CNTL, (RADEON_VF_PRIM_TYPE_RECTANGLE_LIST |
+					      (4 << RADEON_VF_NUM_VERTICES_SHIFT)));
+	else
+	    OUT_ACCEL_REG(RADEON_SE_VF_CNTL, (RADEON_VF_PRIM_TYPE_TRIANGLE_LIST |
 					      RADEON_VF_PRIM_WALK_DATA |
 					      (3 << RADEON_VF_NUM_VERTICES_SHIFT)));
-
 #endif
 	if (pPriv->bicubic_enabled) {
 		/*
@@ -2376,61 +2106,33 @@ FUNC_NAME(RADEONDisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv
 			                                                          (float)srcY + 0.5);
 	    }
 	} else {
-	    if (IS_R300_3D || IS_R500_3D) {
-		if (use_quad) {
-		    VTX_OUT((float)dstX,                                       (float)dstY,
-			    (float)srcX / info->accel_state->texW[0],          (float)srcY / info->accel_state->texH[0]);
-		    VTX_OUT((float)dstX,                                       (float)(dstY + dsth),
-			    (float)srcX / info->accel_state->texW[0],          (float)(srcY + srch) / info->accel_state->texH[0]);
-		    VTX_OUT((float)(dstX + dstw),                              (float)(dstY + dsth),
-			    (float)(srcX + srcw) / info->accel_state->texW[0], (float)(srcY + srch) / info->accel_state->texH[0]);
-		    VTX_OUT((float)(dstX + dstw),                              (float)dstY,
-			    (float)(srcX + srcw) / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0]);
-		} else {
-		    /*
-		     * Render a big, scissored triangle. This means
-		     * increasing the triangle size and adjusting
-		     * texture coordinates.
-		     */
-		    VTX_OUT((float)dstX,                              (float)dstY,
-			    (float)srcX / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0]);
-		    VTX_OUT((float)dstX,                              (float)(dstY + dsth + dstw),
-			    (float)srcX / info->accel_state->texW[0], ((float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0)) / info->accel_state->texH[0]);
-			    
-		    VTX_OUT((float)(dstX + dstw + dsth),              (float)dstY,
-			    ((float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0)) / info->accel_state->texW[0],
-			                                              (float)srcY / info->accel_state->texH[0]);
-		}
-	    } else if (isplanar) {
-		/*
-		 * Just render a rect (using three coords).
-		 * Filter is a bit a misnomer, it's just texcoords...
-		 */
-		VTX_OUT_FILTER((float)dstX,                                (float)(dstY + dsth),
-			(float)srcX / info->accel_state->texW[0],          (float)(srcY + srch) / info->accel_state->texH[0],
-			(float)srcX / info->accel_state->texW[0],          (float)(srcY + srch) / info->accel_state->texH[0]);
-		VTX_OUT_FILTER((float)(dstX + dstw),                       (float)(dstY + dsth),
-			(float)(srcX + srcw) / info->accel_state->texW[0], (float)(srcY + srch) / info->accel_state->texH[0],
-			(float)(srcX + srcw) / info->accel_state->texW[0], (float)(srcY + srch) / info->accel_state->texH[0]);
-		VTX_OUT_FILTER((float)(dstX + dstw),                       (float)dstY,
-			(float)(srcX + srcw) / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0],
-			(float)(srcX + srcw) / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0]);
-	    } else {
-		/*
-		 * Just render a rect (using three coords).
-		 */
+	    if (use_quad) {
+		VTX_OUT((float)dstX,                                       (float)dstY,
+			(float)srcX / info->accel_state->texW[0],          (float)srcY / info->accel_state->texH[0]);
 		VTX_OUT((float)dstX,                                       (float)(dstY + dsth),
 			(float)srcX / info->accel_state->texW[0],          (float)(srcY + srch) / info->accel_state->texH[0]);
 		VTX_OUT((float)(dstX + dstw),                              (float)(dstY + dsth),
 			(float)(srcX + srcw) / info->accel_state->texW[0], (float)(srcY + srch) / info->accel_state->texH[0]);
 		VTX_OUT((float)(dstX + dstw),                              (float)dstY,
 			(float)(srcX + srcw) / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0]);
+	    } else {
+		/*
+		 * Render a big, scissored triangle. This means
+		 * increasing the triangle size and adjusting
+		 * texture coordinates.
+		 */
+		VTX_OUT((float)dstX,                              (float)dstY,
+			(float)srcX / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0]);
+		VTX_OUT((float)dstX,                              (float)(dstY + dsth + dstw),
+			(float)srcX / info->accel_state->texW[0], ((float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0)) / info->accel_state->texH[0]);
+		VTX_OUT((float)(dstX + dstw + dsth),              (float)dstY,
+			((float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0)) / info->accel_state->texW[0],
+			(float)srcY / info->accel_state->texH[0]);
 	    }
 	}
 
-	if (IS_R300_3D || IS_R500_3D)
-	    /* flushing is pipelined, free/finish is not */
-	    OUT_ACCEL_REG(R300_RB3D_DSTCACHE_CTLSTAT, R300_DC_FLUSH_3D);
+	/* flushing is pipelined, free/finish is not */
+	OUT_ACCEL_REG(R300_RB3D_DSTCACHE_CTLSTAT, R300_DC_FLUSH_3D);
 
 #ifdef ACCEL_CP
 	ADVANCE_RING();
@@ -2441,12 +2143,1000 @@ FUNC_NAME(RADEONDisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv
 	pBox++;
     }
 
-    if (IS_R300_3D || IS_R500_3D) {
-	BEGIN_ACCEL(3);
-	OUT_ACCEL_REG(R300_SC_CLIP_RULE, 0xAAAA);
-	OUT_ACCEL_REG(R300_RB3D_DSTCACHE_CTLSTAT, R300_RB3D_DC_FLUSH_ALL);
+    BEGIN_ACCEL(3);
+    OUT_ACCEL_REG(R300_SC_CLIP_RULE, 0xAAAA);
+    OUT_ACCEL_REG(R300_RB3D_DSTCACHE_CTLSTAT, R300_RB3D_DC_FLUSH_ALL);
+    OUT_ACCEL_REG(RADEON_WAIT_UNTIL, RADEON_WAIT_3D_IDLECLEAN);
+    FINISH_ACCEL();
+
+    DamageDamageRegion(pPriv->pDraw, &pPriv->clip);
+}
+
+static void
+FUNC_NAME(R500DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
+{
+    RADEONInfoPtr info = RADEONPTR(pScrn);
+    PixmapPtr pPixmap = pPriv->pPixmap;
+    uint32_t txfilter, txformat0, txformat1, txoffset, txpitch;
+    uint32_t dst_offset, dst_pitch, dst_format;
+    uint32_t txenable, colorpitch;
+    uint32_t output_fmt;
+    Bool isplanar = FALSE;
+    int dstxoff, dstyoff, pixel_shift, vtx_count;
+    BoxPtr pBox = REGION_RECTS(&pPriv->clip);
+    int nBox = REGION_NUM_RECTS(&pPriv->clip);
+    ACCEL_PREAMBLE();
+
+    pixel_shift = pPixmap->drawable.bitsPerPixel >> 4;
+
+#ifdef USE_EXA
+    if (info->useEXA) {
+	dst_offset = exaGetPixmapOffset(pPixmap) + info->fbLocation + pScrn->fbOffset;
+	dst_pitch = exaGetPixmapPitch(pPixmap);
     } else
-	BEGIN_ACCEL(1);
+#endif
+	{
+	    dst_offset = (pPixmap->devPrivate.ptr - info->FB) +
+		info->fbLocation + pScrn->fbOffset;
+	    dst_pitch = pPixmap->devKind;
+	}
+
+#ifdef COMPOSITE
+    dstxoff = -pPixmap->screen_x + pPixmap->drawable.x;
+    dstyoff = -pPixmap->screen_y + pPixmap->drawable.y;
+#else
+    dstxoff = 0;
+    dstyoff = 0;
+#endif
+
+#ifdef USE_EXA
+    if (info->useEXA) {
+	RADEON_SWITCH_TO_3D();
+    } else
+#endif
+	{
+	    BEGIN_ACCEL(2);
+	    OUT_ACCEL_REG(R300_RB3D_DSTCACHE_CTLSTAT, R300_DC_FLUSH_3D);
+	    /* We must wait for 3d to idle, in case source was just written as a dest. */
+	    OUT_ACCEL_REG(RADEON_WAIT_UNTIL,
+			  RADEON_WAIT_HOST_IDLECLEAN |
+			  RADEON_WAIT_2D_IDLECLEAN |
+			  RADEON_WAIT_3D_IDLECLEAN |
+			  RADEON_WAIT_DMA_GUI_IDLE);
+	    FINISH_ACCEL();
+
+	    if (!info->accel_state->XInited3D)
+		RADEONInit3DEngine(pScrn);
+	}
+
+    if (pPriv->bicubic_enabled)
+	vtx_count = 6;
+    else
+	vtx_count = 4;
+
+    switch (pPixmap->drawable.bitsPerPixel) {
+    case 16:
+	if (pPixmap->drawable.depth == 15)
+	    dst_format = R300_COLORFORMAT_ARGB1555;
+	else
+	    dst_format = R300_COLORFORMAT_RGB565;
+	break;
+    case 32:
+	dst_format = R300_COLORFORMAT_ARGB8888;
+	break;
+    default:
+	return;
+    }
+
+    output_fmt = (R300_OUT_FMT_C4_8 |
+		  R300_OUT_FMT_C0_SEL_BLUE |
+		  R300_OUT_FMT_C1_SEL_GREEN |
+		  R300_OUT_FMT_C2_SEL_RED |
+		  R300_OUT_FMT_C3_SEL_ALPHA);
+
+    colorpitch = dst_pitch >> pixel_shift;
+    colorpitch |= dst_format;
+
+    if (RADEONTilingEnabled(pScrn, pPixmap))
+	colorpitch |= R300_COLORTILE;
+
+    if (pPriv->planar_hw && (pPriv->id == FOURCC_I420 || pPriv->id == FOURCC_YV12)) {
+	isplanar = TRUE;
+    }
+
+    if (isplanar) {
+	txformat1 = R300_TX_FORMAT_X8 | R300_TX_FORMAT_CACHE_HALF_REGION_0;
+	txpitch = pPriv->src_pitch;
+    } else {
+	if (pPriv->id == FOURCC_UYVY)
+	    txformat1 = R300_TX_FORMAT_YVYU422;
+	else
+	    txformat1 = R300_TX_FORMAT_VYUY422;
+
+	txformat1 |= R300_TX_FORMAT_YUV_TO_RGB_CLAMP;
+
+	/* pitch is in pixels */
+	txpitch = pPriv->src_pitch / 2;
+    }
+    txpitch -= 1;
+
+    txformat0 = ((((pPriv->w - 1) & 0x7ff) << R300_TXWIDTH_SHIFT) |
+		 (((pPriv->h - 1) & 0x7ff) << R300_TXHEIGHT_SHIFT) |
+		 R300_TXPITCH_EN);
+
+    info->accel_state->texW[0] = pPriv->w;
+    info->accel_state->texH[0] = pPriv->h;
+
+    txfilter = (R300_TX_CLAMP_S(R300_TX_CLAMP_CLAMP_LAST) |
+		R300_TX_CLAMP_T(R300_TX_CLAMP_CLAMP_LAST) |
+		R300_TX_MAG_FILTER_LINEAR |
+		R300_TX_MIN_FILTER_LINEAR |
+		(0 << R300_TX_ID_SHIFT));
+
+
+    if ((pPriv->w - 1) & 0x800)
+	txpitch |= R500_TXWIDTH_11;
+
+    if ((pPriv->h - 1) & 0x800)
+	txpitch |= R500_TXHEIGHT_11;
+
+    txoffset = pPriv->src_offset;
+
+    BEGIN_ACCEL(6);
+    OUT_ACCEL_REG(R300_TX_FILTER0_0, txfilter);
+    OUT_ACCEL_REG(R300_TX_FILTER1_0, 0);
+    OUT_ACCEL_REG(R300_TX_FORMAT0_0, txformat0);
+    OUT_ACCEL_REG(R300_TX_FORMAT1_0, txformat1);
+    OUT_ACCEL_REG(R300_TX_FORMAT2_0, txpitch);
+    OUT_ACCEL_REG(R300_TX_OFFSET_0, txoffset);
+    FINISH_ACCEL();
+
+    txenable = R300_TEX_0_ENABLE;
+
+    if (isplanar) {
+	txformat0 = ((((((pPriv->w + 1 ) >> 1) - 1) & 0x7ff) << R300_TXWIDTH_SHIFT) |
+		     (((((pPriv->h + 1 ) >> 1 ) - 1) & 0x7ff) << R300_TXHEIGHT_SHIFT) |
+		     R300_TXPITCH_EN);
+	txpitch = ((pPriv->src_pitch >> 1) + 63) & ~63;
+	txpitch -= 1;
+	txfilter = (R300_TX_CLAMP_S(R300_TX_CLAMP_CLAMP_LAST) |
+		    R300_TX_CLAMP_T(R300_TX_CLAMP_CLAMP_LAST) |
+		    R300_TX_MIN_FILTER_LINEAR |
+		    R300_TX_MAG_FILTER_LINEAR);
+
+	BEGIN_ACCEL(12);
+	OUT_ACCEL_REG(R300_TX_FILTER0_1, txfilter | (1 << R300_TX_ID_SHIFT));
+	OUT_ACCEL_REG(R300_TX_FILTER1_1, 0);
+	OUT_ACCEL_REG(R300_TX_FORMAT0_1, txformat0);
+	OUT_ACCEL_REG(R300_TX_FORMAT1_1, R300_TX_FORMAT_X8 | R300_TX_FORMAT_CACHE_FOURTH_REGION_2);
+	OUT_ACCEL_REG(R300_TX_FORMAT2_1, txpitch);
+	OUT_ACCEL_REG(R300_TX_OFFSET_1, txoffset + pPriv->planeu_offset);
+	OUT_ACCEL_REG(R300_TX_FILTER0_2, txfilter | (2 << R300_TX_ID_SHIFT));
+	OUT_ACCEL_REG(R300_TX_FILTER1_2, 0);
+	OUT_ACCEL_REG(R300_TX_FORMAT0_2, txformat0);
+	OUT_ACCEL_REG(R300_TX_FORMAT1_2, R300_TX_FORMAT_X8 | R300_TX_FORMAT_CACHE_FOURTH_REGION_3);
+	OUT_ACCEL_REG(R300_TX_FORMAT2_2, txpitch);
+	OUT_ACCEL_REG(R300_TX_OFFSET_2, txoffset + pPriv->planev_offset);
+	FINISH_ACCEL();
+	txenable |= R300_TEX_1_ENABLE | R300_TEX_2_ENABLE;
+    }
+
+    if (pPriv->bicubic_enabled) {
+	/* Size is 128x1 */
+	txformat0 = ((0x7f << R300_TXWIDTH_SHIFT) |
+		     (0x0 << R300_TXHEIGHT_SHIFT) |
+		     R300_TXPITCH_EN);
+	/* Format is four 16-bit float channels per texel (FL_R16G16B16A16) */
+	txformat1 = R300_EASY_TX_FORMAT(Z, Y, X, W, FL_R16G16B16A16);
+	/* Pitch is 127 (128-1) */
+	txpitch = 0x7f;
+	/* Tex filter */
+	txfilter = (R300_TX_CLAMP_S(R300_TX_CLAMP_WRAP) |
+		    R300_TX_CLAMP_T(R300_TX_CLAMP_WRAP) |
+		    R300_TX_MIN_FILTER_NEAREST |
+		    R300_TX_MAG_FILTER_NEAREST |
+		    (1 << R300_TX_ID_SHIFT));
+
+	BEGIN_ACCEL(6);
+	OUT_ACCEL_REG(R300_TX_FILTER0_1, txfilter);
+	OUT_ACCEL_REG(R300_TX_FILTER1_1, 0);
+	OUT_ACCEL_REG(R300_TX_FORMAT0_1, txformat0);
+	OUT_ACCEL_REG(R300_TX_FORMAT1_1, txformat1);
+	OUT_ACCEL_REG(R300_TX_FORMAT2_1, txpitch);
+	OUT_ACCEL_REG(R300_TX_OFFSET_1, pPriv->bicubic_src_offset);
+	FINISH_ACCEL();
+
+	/* Enable tex 1 */
+	txenable |= R300_TEX_1_ENABLE;
+    }
+
+    /* setup the VAP */
+    if (info->accel_state->has_tcl) {
+	if (pPriv->bicubic_enabled)
+	    BEGIN_ACCEL(7);
+	else
+	    BEGIN_ACCEL(6);
+    } else {
+	if (pPriv->bicubic_enabled)
+	    BEGIN_ACCEL(5);
+	else
+	    BEGIN_ACCEL(4);
+    }
+
+    /* These registers define the number, type, and location of data submitted
+     * to the PVS unit or GA input (when PVS is disabled).
+     * DST_VEC_LOC is the slot in the PVS input vector memory when PVS/TCL is
+     * enabled.  This memory provides the inputs to the vertex shader program
+     * and ordering is not important.  When PVS/TCL is disabled, this field maps
+     * directly to the GA input memory and the order is significant.  In
+     * PVS_BYPASS mode the order is as follows:
+     * Position
+     * Point Size
+     * Color 0-3
+     * Textures 0-7
+     * Fog
+     */
+    if (pPriv->bicubic_enabled) {
+	OUT_ACCEL_REG(R300_VAP_PROG_STREAM_CNTL_0,
+		      ((R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_0_SHIFT) |
+		       (0 << R300_SKIP_DWORDS_0_SHIFT) |
+		       (0 << R300_DST_VEC_LOC_0_SHIFT) |
+		       R300_SIGNED_0 |
+		       (R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_1_SHIFT) |
+		       (0 << R300_SKIP_DWORDS_1_SHIFT) |
+		       (6 << R300_DST_VEC_LOC_1_SHIFT) |
+		       R300_SIGNED_1));
+	OUT_ACCEL_REG(R300_VAP_PROG_STREAM_CNTL_1,
+		      ((R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_2_SHIFT) |
+		       (0 << R300_SKIP_DWORDS_2_SHIFT) |
+		       (7 << R300_DST_VEC_LOC_2_SHIFT) |
+		       R300_LAST_VEC_2 |
+		       R300_SIGNED_2));
+    } else {
+	OUT_ACCEL_REG(R300_VAP_PROG_STREAM_CNTL_0,
+		      ((R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_0_SHIFT) |
+		       (0 << R300_SKIP_DWORDS_0_SHIFT) |
+		       (0 << R300_DST_VEC_LOC_0_SHIFT) |
+		       R300_SIGNED_0 |
+		       (R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_1_SHIFT) |
+		       (0 << R300_SKIP_DWORDS_1_SHIFT) |
+		       (6 << R300_DST_VEC_LOC_1_SHIFT) |
+		       R300_LAST_VEC_1 |
+		       R300_SIGNED_1));
+    }
+
+    /* load the vertex shader
+     * We pre-load vertex programs in RADEONInit3DEngine():
+     * - exa mask/Xv bicubic
+     * - exa no mask
+     * - Xv
+     * Here we select the offset of the vertex program we want to use
+     */
+    if (info->accel_state->has_tcl) {
+	if (pPriv->bicubic_enabled) {
+	    OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_0,
+			  ((0 << R300_PVS_FIRST_INST_SHIFT) |
+			   (2 << R300_PVS_XYZW_VALID_INST_SHIFT) |
+			   (2 << R300_PVS_LAST_INST_SHIFT)));
+	    OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_1,
+			  (2 << R300_PVS_LAST_VTX_SRC_INST_SHIFT));
+	} else {
+	    OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_0,
+			  ((5 << R300_PVS_FIRST_INST_SHIFT) |
+			   (6 << R300_PVS_XYZW_VALID_INST_SHIFT) |
+			   (6 << R300_PVS_LAST_INST_SHIFT)));
+	    OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_1,
+			  (6 << R300_PVS_LAST_VTX_SRC_INST_SHIFT));
+	}
+    }
+
+    /* Position and one set of 2 texture coordinates */
+    OUT_ACCEL_REG(R300_VAP_OUT_VTX_FMT_0, R300_VTX_POS_PRESENT);
+    if (pPriv->bicubic_enabled)
+	OUT_ACCEL_REG(R300_VAP_OUT_VTX_FMT_1, ((2 << R300_TEX_0_COMP_CNT_SHIFT) |
+					       (2 << R300_TEX_1_COMP_CNT_SHIFT)));
+    else
+	OUT_ACCEL_REG(R300_VAP_OUT_VTX_FMT_1, (2 << R300_TEX_0_COMP_CNT_SHIFT));
+
+    OUT_ACCEL_REG(R300_US_OUT_FMT_0, output_fmt);
+    FINISH_ACCEL();
+
+    /* setup pixel shader */
+    if (pPriv->bicubic_enabled) {
+	BEGIN_ACCEL(7);
+
+	/* 4 components: 2 for tex0 and 2 for tex1 */
+	OUT_ACCEL_REG(R300_RS_COUNT,
+		      ((4 << R300_RS_COUNT_IT_COUNT_SHIFT) |
+		       R300_RS_COUNT_HIRES_EN));
+
+	/* R300_INST_COUNT_RS - highest RS instruction used */
+	OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(1) | R300_TX_OFFSET_RS(6));
+
+	/* Pixel stack frame size. */
+	OUT_ACCEL_REG(R300_US_PIXSIZE, 5);
+
+	/* FP length. */
+	OUT_ACCEL_REG(R500_US_CODE_ADDR, (R500_US_CODE_START_ADDR(0) |
+					  R500_US_CODE_END_ADDR(13)));
+	OUT_ACCEL_REG(R500_US_CODE_RANGE, (R500_US_CODE_RANGE_ADDR(0) |
+					   R500_US_CODE_RANGE_SIZE(13)));
+
+	/* Prepare for FP emission. */
+	OUT_ACCEL_REG(R500_US_CODE_OFFSET, 0);
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_INST_INDEX(0));
+	FINISH_ACCEL();
+
+	BEGIN_ACCEL(89);
+	/* Pixel shader.
+	 * I've gone ahead and annotated each instruction, since this
+	 * thing is MASSIVE. :3
+	 * Note: In order to avoid buggies with temps and multiple
+	 * inputs, all temps are offset by 2. temp0 -> register2. */
+
+	/* TEX temp2, input1.xxxx, tex1, 1D */
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
+					       R500_INST_RGB_WMASK_R |
+					       R500_INST_RGB_WMASK_G |
+					       R500_INST_RGB_WMASK_B));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(1) |
+					       R500_TEX_INST_LD |
+					       R500_TEX_IGNORE_UNCOVERED));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(1) |
+					       R500_TEX_SRC_S_SWIZ_R |
+					       R500_TEX_SRC_T_SWIZ_R |
+					       R500_TEX_SRC_R_SWIZ_R |
+					       R500_TEX_SRC_Q_SWIZ_R |
+					       R500_TEX_DST_ADDR(2) |
+					       R500_TEX_DST_R_SWIZ_R |
+					       R500_TEX_DST_G_SWIZ_G |
+					       R500_TEX_DST_B_SWIZ_B |
+					       R500_TEX_DST_A_SWIZ_A));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
+
+	/* TEX temp5, input1.yyyy, tex1, 1D */
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
+					       R500_INST_TEX_SEM_WAIT |
+					       R500_INST_RGB_WMASK_R |
+					       R500_INST_RGB_WMASK_G |
+					       R500_INST_RGB_WMASK_B));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(1) |
+					       R500_TEX_INST_LD |
+					       R500_TEX_SEM_ACQUIRE |
+					       R500_TEX_IGNORE_UNCOVERED));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(1) |
+					       R500_TEX_SRC_S_SWIZ_G |
+					       R500_TEX_SRC_T_SWIZ_G |
+					       R500_TEX_SRC_R_SWIZ_G |
+					       R500_TEX_SRC_Q_SWIZ_G |
+					       R500_TEX_DST_ADDR(5) |
+					       R500_TEX_DST_R_SWIZ_R |
+					       R500_TEX_DST_G_SWIZ_G |
+					       R500_TEX_DST_B_SWIZ_B |
+					       R500_TEX_DST_A_SWIZ_A));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
+
+	/* MUL temp4, const0.x0x0, temp2.yyxx */
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
+					       R500_INST_TEX_SEM_WAIT |
+					       R500_INST_RGB_WMASK_R |
+					       R500_INST_RGB_WMASK_G |
+					       R500_INST_RGB_WMASK_B |
+					       R500_INST_ALPHA_WMASK));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
+					       R500_RGB_ADDR0_CONST |
+					       R500_RGB_ADDR1(2)));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
+					       R500_ALPHA_ADDR0_CONST |
+					       R500_ALPHA_ADDR1(2)));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
+					       R500_ALU_RGB_R_SWIZ_A_R |
+					       R500_ALU_RGB_G_SWIZ_A_0 |
+					       R500_ALU_RGB_B_SWIZ_A_R |
+					       R500_ALU_RGB_SEL_B_SRC1 |
+					       R500_ALU_RGB_R_SWIZ_B_G |
+					       R500_ALU_RGB_G_SWIZ_B_G |
+					       R500_ALU_RGB_B_SWIZ_B_R));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(4) |
+					       R500_ALPHA_OP_MAD |
+					       R500_ALPHA_SEL_A_SRC0 |
+					       R500_ALPHA_SWIZ_A_0 |
+					       R500_ALPHA_SEL_B_SRC1 |
+					       R500_ALPHA_SWIZ_B_R));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(4) |
+					       R500_ALU_RGBA_OP_MAD |
+					       R500_ALU_RGBA_R_SWIZ_0 |
+					       R500_ALU_RGBA_G_SWIZ_0 |
+					       R500_ALU_RGBA_B_SWIZ_0 |
+					       R500_ALU_RGBA_A_SWIZ_0));
+
+	/* MAD temp3, const0.0y0y, temp5.xxxx, temp4 */
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
+					       R500_INST_RGB_WMASK_R |
+					       R500_INST_RGB_WMASK_G |
+					       R500_INST_RGB_WMASK_B |
+					       R500_INST_ALPHA_WMASK));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
+					       R500_RGB_ADDR0_CONST |
+					       R500_RGB_ADDR1(5) |
+					       R500_RGB_ADDR2(4)));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
+					       R500_ALPHA_ADDR0_CONST |
+					       R500_ALPHA_ADDR1(5) |
+					       R500_ALPHA_ADDR2(4)));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
+					       R500_ALU_RGB_R_SWIZ_A_0 |
+					       R500_ALU_RGB_G_SWIZ_A_G |
+					       R500_ALU_RGB_B_SWIZ_A_0 |
+					       R500_ALU_RGB_SEL_B_SRC1 |
+					       R500_ALU_RGB_R_SWIZ_B_R |
+					       R500_ALU_RGB_G_SWIZ_B_R |
+					       R500_ALU_RGB_B_SWIZ_B_R));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(3) |
+					       R500_ALPHA_OP_MAD |
+					       R500_ALPHA_SEL_A_SRC0 |
+					       R500_ALPHA_SWIZ_A_G |
+					       R500_ALPHA_SEL_B_SRC1 |
+					       R500_ALPHA_SWIZ_B_R));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(3) |
+					       R500_ALU_RGBA_OP_MAD |
+					       R500_ALU_RGBA_SEL_C_SRC2 |
+					       R500_ALU_RGBA_R_SWIZ_R |
+					       R500_ALU_RGBA_G_SWIZ_G |
+					       R500_ALU_RGBA_B_SWIZ_B |
+					       R500_ALU_RGBA_A_SWIZ_A));
+
+	/* ADD temp3, temp3, input0.xyxy */
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
+					       R500_INST_RGB_WMASK_R |
+					       R500_INST_RGB_WMASK_G |
+					       R500_INST_RGB_WMASK_B |
+					       R500_INST_ALPHA_WMASK));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR1(3) |
+					       R500_RGB_ADDR2(0)));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR1(3) |
+					       R500_ALPHA_ADDR2(0)));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_R_SWIZ_A_1 |
+					       R500_ALU_RGB_G_SWIZ_A_1 |
+					       R500_ALU_RGB_B_SWIZ_A_1 |
+					       R500_ALU_RGB_SEL_B_SRC1 |
+					       R500_ALU_RGB_R_SWIZ_B_R |
+					       R500_ALU_RGB_G_SWIZ_B_G |
+					       R500_ALU_RGB_B_SWIZ_B_B));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(3) |
+					       R500_ALPHA_OP_MAD |
+					       R500_ALPHA_SWIZ_A_1 |
+					       R500_ALPHA_SEL_B_SRC1 |
+					       R500_ALPHA_SWIZ_B_A));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(3) |
+					       R500_ALU_RGBA_OP_MAD |
+					       R500_ALU_RGBA_SEL_C_SRC2 |
+					       R500_ALU_RGBA_R_SWIZ_R |
+					       R500_ALU_RGBA_G_SWIZ_G |
+					       R500_ALU_RGBA_B_SWIZ_R |
+					       R500_ALU_RGBA_A_SWIZ_G));
+
+	/* TEX temp1, temp3.zwxy, tex0, 2D */
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
+					       R500_INST_RGB_WMASK_R |
+					       R500_INST_RGB_WMASK_G |
+					       R500_INST_RGB_WMASK_B |
+					       R500_INST_ALPHA_WMASK));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
+					       R500_TEX_INST_LD |
+					       R500_TEX_IGNORE_UNCOVERED));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(3) |
+					       R500_TEX_SRC_S_SWIZ_B |
+					       R500_TEX_SRC_T_SWIZ_A |
+					       R500_TEX_SRC_R_SWIZ_R |
+					       R500_TEX_SRC_Q_SWIZ_G |
+					       R500_TEX_DST_ADDR(1) |
+					       R500_TEX_DST_R_SWIZ_R |
+					       R500_TEX_DST_G_SWIZ_G |
+					       R500_TEX_DST_B_SWIZ_B |
+					       R500_TEX_DST_A_SWIZ_A));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
+
+	/* TEX temp3, temp3.xyzw, tex0, 2D */
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
+					       R500_INST_TEX_SEM_WAIT |
+					       R500_INST_RGB_WMASK_R |
+					       R500_INST_RGB_WMASK_G |
+					       R500_INST_RGB_WMASK_B |
+					       R500_INST_ALPHA_WMASK));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
+					       R500_TEX_INST_LD |
+					       R500_TEX_SEM_ACQUIRE |
+					       R500_TEX_IGNORE_UNCOVERED));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(3) |
+					       R500_TEX_SRC_S_SWIZ_R |
+					       R500_TEX_SRC_T_SWIZ_G |
+					       R500_TEX_SRC_R_SWIZ_B |
+					       R500_TEX_SRC_Q_SWIZ_A |
+					       R500_TEX_DST_ADDR(3) |
+					       R500_TEX_DST_R_SWIZ_R |
+					       R500_TEX_DST_G_SWIZ_G |
+					       R500_TEX_DST_B_SWIZ_B |
+					       R500_TEX_DST_A_SWIZ_A));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
+
+	/* MAD temp4, const0.0y0y, temp5.yyyy, temp4 */
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
+					       R500_INST_RGB_WMASK_R |
+					       R500_INST_RGB_WMASK_G |
+					       R500_INST_RGB_WMASK_B |
+					       R500_INST_ALPHA_WMASK));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
+					       R500_RGB_ADDR0_CONST |
+					       R500_RGB_ADDR1(5) |
+					       R500_RGB_ADDR2(4)));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
+					       R500_ALPHA_ADDR0_CONST |
+					       R500_ALPHA_ADDR1(5) |
+					       R500_ALPHA_ADDR2(4)));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
+					       R500_ALU_RGB_R_SWIZ_A_0 |
+					       R500_ALU_RGB_G_SWIZ_A_G |
+					       R500_ALU_RGB_B_SWIZ_A_0 |
+					       R500_ALU_RGB_SEL_B_SRC1 |
+					       R500_ALU_RGB_R_SWIZ_B_G |
+					       R500_ALU_RGB_G_SWIZ_B_G |
+					       R500_ALU_RGB_B_SWIZ_B_G));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(4) |
+					       R500_ALPHA_OP_MAD |
+					       R500_ALPHA_SEL_A_SRC0 |
+					       R500_ALPHA_SWIZ_A_G |
+					       R500_ALPHA_SEL_B_SRC1 |
+					       R500_ALPHA_SWIZ_B_G));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(4) |
+					       R500_ALU_RGBA_OP_MAD |
+					       R500_ALU_RGBA_SEL_C_SRC2 |
+					       R500_ALU_RGBA_R_SWIZ_R |
+					       R500_ALU_RGBA_G_SWIZ_G |
+					       R500_ALU_RGBA_B_SWIZ_B |
+					       R500_ALU_RGBA_A_SWIZ_A));
+
+	/* ADD temp0, temp4, input0.xyxy */
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
+					       R500_INST_RGB_WMASK_R |
+					       R500_INST_RGB_WMASK_G |
+					       R500_INST_RGB_WMASK_B |
+					       R500_INST_ALPHA_WMASK));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR1(4) |
+					       R500_RGB_ADDR2(0)));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR1(4) |
+					       R500_ALPHA_ADDR2(0)));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_R_SWIZ_A_1 |
+					       R500_ALU_RGB_G_SWIZ_A_1 |
+					       R500_ALU_RGB_B_SWIZ_A_1 |
+					       R500_ALU_RGB_SEL_B_SRC1 |
+					       R500_ALU_RGB_R_SWIZ_B_R |
+					       R500_ALU_RGB_G_SWIZ_B_G |
+					       R500_ALU_RGB_B_SWIZ_B_B));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(0) |
+					       R500_ALPHA_OP_MAD |
+					       R500_ALPHA_SWIZ_A_1 |
+					       R500_ALPHA_SEL_B_SRC1 |
+					       R500_ALPHA_SWIZ_B_A));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(0) |
+					       R500_ALU_RGBA_OP_MAD |
+					       R500_ALU_RGBA_SEL_C_SRC2 |
+					       R500_ALU_RGBA_R_SWIZ_R |
+					       R500_ALU_RGBA_G_SWIZ_G |
+					       R500_ALU_RGBA_B_SWIZ_R |
+					       R500_ALU_RGBA_A_SWIZ_G));
+
+	/* TEX temp4, temp0.zwzw, tex0, 2D */
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
+					       R500_INST_TEX_SEM_WAIT |
+					       R500_INST_RGB_WMASK_R |
+					       R500_INST_RGB_WMASK_G |
+					       R500_INST_RGB_WMASK_B |
+					       R500_INST_ALPHA_WMASK));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
+					       R500_TEX_INST_LD |
+					       R500_TEX_IGNORE_UNCOVERED));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
+					       R500_TEX_SRC_S_SWIZ_B |
+					       R500_TEX_SRC_T_SWIZ_A |
+					       R500_TEX_SRC_R_SWIZ_B |
+					       R500_TEX_SRC_Q_SWIZ_A |
+					       R500_TEX_DST_ADDR(4) |
+					       R500_TEX_DST_R_SWIZ_R |
+					       R500_TEX_DST_G_SWIZ_G |
+					       R500_TEX_DST_B_SWIZ_B |
+					       R500_TEX_DST_A_SWIZ_A));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
+
+	/* TEX temp0, temp0.xyzw, tex0, 2D */
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
+					       R500_INST_TEX_SEM_WAIT |
+					       R500_INST_RGB_WMASK_R |
+					       R500_INST_RGB_WMASK_G |
+					       R500_INST_RGB_WMASK_B |
+					       R500_INST_ALPHA_WMASK));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
+					       R500_TEX_INST_LD |
+					       R500_TEX_SEM_ACQUIRE |
+					       R500_TEX_IGNORE_UNCOVERED));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
+					       R500_TEX_SRC_S_SWIZ_R |
+					       R500_TEX_SRC_T_SWIZ_G |
+					       R500_TEX_SRC_R_SWIZ_B |
+					       R500_TEX_SRC_Q_SWIZ_A |
+					       R500_TEX_DST_ADDR(0) |
+					       R500_TEX_DST_R_SWIZ_R |
+					       R500_TEX_DST_G_SWIZ_G |
+					       R500_TEX_DST_B_SWIZ_B |
+					       R500_TEX_DST_A_SWIZ_A));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
+
+	/* LRP temp3, temp2.zzzz, temp1, temp3 ->
+	 * - PRESUB temps, temp1 - temp3
+	 * - MAD temp2.zzzz, temps, temp3 */
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
+					       R500_INST_RGB_WMASK_R |
+					       R500_INST_RGB_WMASK_G |
+					       R500_INST_RGB_WMASK_B |
+					       R500_INST_ALPHA_WMASK));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(3) |
+					       R500_RGB_SRCP_OP_RGB1_MINUS_RGB0 |
+					       R500_RGB_ADDR1(1) |
+					       R500_RGB_ADDR2(2)));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(3) |
+					       R500_ALPHA_SRCP_OP_A1_MINUS_A0 |
+					       R500_ALPHA_ADDR1(1) |
+					       R500_ALPHA_ADDR2(2)));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC2 |
+					       R500_ALU_RGB_R_SWIZ_A_B |
+					       R500_ALU_RGB_G_SWIZ_A_B |
+					       R500_ALU_RGB_B_SWIZ_A_B |
+					       R500_ALU_RGB_SEL_B_SRCP |
+					       R500_ALU_RGB_R_SWIZ_B_R |
+					       R500_ALU_RGB_G_SWIZ_B_G |
+					       R500_ALU_RGB_B_SWIZ_B_B));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(3) |
+					       R500_ALPHA_OP_MAD |
+					       R500_ALPHA_SEL_A_SRC2 |
+					       R500_ALPHA_SWIZ_A_B |
+					       R500_ALPHA_SEL_B_SRCP |
+					       R500_ALPHA_SWIZ_B_A));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(3) |
+					       R500_ALU_RGBA_OP_MAD |
+					       R500_ALU_RGBA_SEL_C_SRC0 |
+					       R500_ALU_RGBA_R_SWIZ_R |
+					       R500_ALU_RGBA_G_SWIZ_G |
+					       R500_ALU_RGBA_B_SWIZ_B |
+					       R500_ALU_RGBA_A_SWIZ_A));
+
+	/* LRP temp0, temp2.zzzz, temp4, temp0 ->
+	 * - PRESUB temps, temp4 - temp0
+	 * - MAD temp2.zzzz, temps, temp0 */
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
+					       R500_INST_TEX_SEM_WAIT |
+					       R500_INST_RGB_WMASK_R |
+					       R500_INST_RGB_WMASK_G |
+					       R500_INST_RGB_WMASK_B |
+					       R500_INST_ALPHA_WMASK));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
+					       R500_RGB_SRCP_OP_RGB1_MINUS_RGB0 |
+					       R500_RGB_ADDR1(4) |
+					       R500_RGB_ADDR2(2)));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
+					       R500_ALPHA_SRCP_OP_A1_MINUS_A0 |
+					       R500_ALPHA_ADDR1(4) |
+					       R500_ALPHA_ADDR2(2)));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC2 |
+					       R500_ALU_RGB_R_SWIZ_A_B |
+					       R500_ALU_RGB_G_SWIZ_A_B |
+					       R500_ALU_RGB_B_SWIZ_A_B |
+					       R500_ALU_RGB_SEL_B_SRCP |
+					       R500_ALU_RGB_R_SWIZ_B_R |
+					       R500_ALU_RGB_G_SWIZ_B_G |
+					       R500_ALU_RGB_B_SWIZ_B_B));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(0) |
+					       R500_ALPHA_OP_MAD |
+					       R500_ALPHA_SEL_A_SRC2 |
+					       R500_ALPHA_SWIZ_A_B |
+					       R500_ALPHA_SEL_B_SRCP |
+					       R500_ALPHA_SWIZ_B_A));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(0) |
+					       R500_ALU_RGBA_OP_MAD |
+					       R500_ALU_RGBA_SEL_C_SRC0 |
+					       R500_ALU_RGBA_R_SWIZ_R |
+					       R500_ALU_RGBA_G_SWIZ_G |
+					       R500_ALU_RGBA_B_SWIZ_B |
+					       R500_ALU_RGBA_A_SWIZ_A));
+
+	/* LRP output, temp5.zzzz, temp3, temp0 ->
+	 * - PRESUB temps, temp3 - temp0
+	 * - MAD temp5.zzzz, temps, temp0 */
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_OUT |
+					       R500_INST_LAST |
+					       R500_INST_TEX_SEM_WAIT |
+					       R500_INST_RGB_WMASK_R |
+					       R500_INST_RGB_WMASK_G |
+					       R500_INST_RGB_WMASK_B |
+					       R500_INST_ALPHA_WMASK |
+					       R500_INST_RGB_OMASK_R |
+					       R500_INST_RGB_OMASK_G |
+					       R500_INST_RGB_OMASK_B |
+					       R500_INST_ALPHA_OMASK));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
+					       R500_RGB_SRCP_OP_RGB1_MINUS_RGB0 |
+					       R500_RGB_ADDR1(3) |
+					       R500_RGB_ADDR2(5)));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
+					       R500_ALPHA_SRCP_OP_A1_MINUS_A0 |
+					       R500_ALPHA_ADDR1(3) |
+					       R500_ALPHA_ADDR2(5)));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC2 |
+					       R500_ALU_RGB_R_SWIZ_A_B |
+					       R500_ALU_RGB_G_SWIZ_A_B |
+					       R500_ALU_RGB_B_SWIZ_A_B |
+					       R500_ALU_RGB_SEL_B_SRCP |
+					       R500_ALU_RGB_R_SWIZ_B_R |
+					       R500_ALU_RGB_G_SWIZ_B_G |
+					       R500_ALU_RGB_B_SWIZ_B_B));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(0) |
+					       R500_ALPHA_OP_MAD |
+					       R500_ALPHA_SEL_A_SRC2 |
+					       R500_ALPHA_SWIZ_A_B |
+					       R500_ALPHA_SEL_B_SRCP |
+					       R500_ALPHA_SWIZ_B_A));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(0) |
+					       R500_ALU_RGBA_OP_MAD |
+					       R500_ALU_RGBA_SEL_C_SRC0 |
+					       R500_ALU_RGBA_R_SWIZ_R |
+					       R500_ALU_RGBA_G_SWIZ_G |
+					       R500_ALU_RGBA_B_SWIZ_B |
+					       R500_ALU_RGBA_A_SWIZ_A));
+
+	/* Shader constants. */
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_CONST_INDEX(0));
+
+	/* const0 = {1 / texture[0].width, 1 / texture[0].height, 0, 0} */
+	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, (1.0/(float)pPriv->w));
+	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, (1.0/(float)pPriv->h));
+	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, 0x0);
+	OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, 0x0);
+
+	FINISH_ACCEL();
+
+    } else {
+	BEGIN_ACCEL(19);
+	/* 2 components: 2 for tex0 */
+	OUT_ACCEL_REG(R300_RS_COUNT,
+		      ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) |
+		       R300_RS_COUNT_HIRES_EN));
+
+	/* R300_INST_COUNT_RS - highest RS instruction used */
+	OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0) | R300_TX_OFFSET_RS(6));
+
+	/* Pixel stack frame size. */
+	OUT_ACCEL_REG(R300_US_PIXSIZE, 0); /* highest temp used */
+
+	/* FP length. */
+	OUT_ACCEL_REG(R500_US_CODE_ADDR, (R500_US_CODE_START_ADDR(0) |
+					  R500_US_CODE_END_ADDR(1)));
+	OUT_ACCEL_REG(R500_US_CODE_RANGE, (R500_US_CODE_RANGE_ADDR(0) |
+					   R500_US_CODE_RANGE_SIZE(1)));
+
+	/* Prepare for FP emission. */
+	OUT_ACCEL_REG(R500_US_CODE_OFFSET, 0);
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_INST_INDEX(0));
+
+	/* tex inst */
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
+					       R500_INST_TEX_SEM_WAIT |
+					       R500_INST_RGB_WMASK_R |
+					       R500_INST_RGB_WMASK_G |
+					       R500_INST_RGB_WMASK_B |
+					       R500_INST_ALPHA_WMASK |
+					       R500_INST_RGB_CLAMP |
+					       R500_INST_ALPHA_CLAMP));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
+					       R500_TEX_INST_LD |
+					       R500_TEX_SEM_ACQUIRE |
+					       R500_TEX_IGNORE_UNCOVERED));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
+					       R500_TEX_SRC_S_SWIZ_R |
+					       R500_TEX_SRC_T_SWIZ_G |
+					       R500_TEX_DST_ADDR(0) |
+					       R500_TEX_DST_R_SWIZ_R |
+					       R500_TEX_DST_G_SWIZ_G |
+					       R500_TEX_DST_B_SWIZ_B |
+					       R500_TEX_DST_A_SWIZ_A));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_DX_ADDR(0) |
+					       R500_DX_S_SWIZ_R |
+					       R500_DX_T_SWIZ_R |
+					       R500_DX_R_SWIZ_R |
+					       R500_DX_Q_SWIZ_R |
+					       R500_DY_ADDR(0) |
+					       R500_DY_S_SWIZ_R |
+					       R500_DY_T_SWIZ_R |
+					       R500_DY_R_SWIZ_R |
+					       R500_DY_Q_SWIZ_R));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
+
+	/* ALU inst */
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_OUT |
+					       R500_INST_TEX_SEM_WAIT |
+					       R500_INST_LAST |
+					       R500_INST_RGB_OMASK_R |
+					       R500_INST_RGB_OMASK_G |
+					       R500_INST_RGB_OMASK_B |
+					       R500_INST_ALPHA_OMASK |
+					       R500_INST_RGB_CLAMP |
+					       R500_INST_ALPHA_CLAMP));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
+					       R500_RGB_ADDR1(0) |
+					       R500_RGB_ADDR1_CONST |
+					       R500_RGB_ADDR2(0) |
+					       R500_RGB_ADDR2_CONST));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
+					       R500_ALPHA_ADDR1(0) |
+					       R500_ALPHA_ADDR1_CONST |
+					       R500_ALPHA_ADDR2(0) |
+					       R500_ALPHA_ADDR2_CONST));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
+					       R500_ALU_RGB_R_SWIZ_A_R |
+					       R500_ALU_RGB_G_SWIZ_A_G |
+					       R500_ALU_RGB_B_SWIZ_A_B |
+					       R500_ALU_RGB_SEL_B_SRC0 |
+					       R500_ALU_RGB_R_SWIZ_B_1 |
+					       R500_ALU_RGB_B_SWIZ_B_1 |
+					       R500_ALU_RGB_G_SWIZ_B_1));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD |
+					       R500_ALPHA_SWIZ_A_A |
+					       R500_ALPHA_SWIZ_B_1));
+	OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD |
+					       R500_ALU_RGBA_R_SWIZ_0 |
+					       R500_ALU_RGBA_G_SWIZ_0 |
+					       R500_ALU_RGBA_B_SWIZ_0 |
+					       R500_ALU_RGBA_A_SWIZ_0));
+	FINISH_ACCEL();
+    }
+
+    BEGIN_ACCEL(6);
+    OUT_ACCEL_REG(R300_TX_INVALTAGS, 0);
+    OUT_ACCEL_REG(R300_TX_ENABLE, txenable);
+
+    OUT_ACCEL_REG(R300_RB3D_COLOROFFSET0, dst_offset);
+    OUT_ACCEL_REG(R300_RB3D_COLORPITCH0, colorpitch);
+
+    /* no need to enable blending */
+    OUT_ACCEL_REG(R300_RB3D_BLENDCNTL, RADEON_SRC_BLEND_GL_ONE | RADEON_DST_BLEND_GL_ZERO);
+
+    OUT_ACCEL_REG(R300_VAP_VTX_SIZE, vtx_count);
+    FINISH_ACCEL();
+
+    if (pPriv->vsync) {
+	xf86CrtcPtr crtc = radeon_xv_pick_best_crtc(pScrn,
+						    pPriv->drw_x,
+						    pPriv->drw_x + pPriv->dst_w,
+						    pPriv->drw_y,
+						    pPriv->drw_y + pPriv->dst_h);
+	if (crtc) {
+	    RADEONCrtcPrivatePtr radeon_crtc = crtc->driver_private;
+
+	    FUNC_NAME(RADEONWaitForVLine)(pScrn, pPixmap,
+					  radeon_crtc->crtc_id,
+					  pPriv->drw_y - crtc->y,
+					  (pPriv->drw_y - crtc->y) + pPriv->dst_h);
+	}
+    }
+    /*
+     * Rendering of the actual polygon is done in two different
+     * ways depending on chip generation:
+     *
+     * < R300:
+     *
+     *     These chips can render a rectangle in one pass, so
+     *     handling is pretty straight-forward.
+     *
+     * >= R300:
+     *
+     *     These chips can accept a quad, but will render it as
+     *     two triangles which results in a diagonal tear. Instead
+     *     we render a single, large triangle and use the scissor
+     *     functionality to restrict it to the desired rectangle.
+     *     Due to guardband limits on r3xx/r4xx, we can only use
+     *     the single triangle up to 2880 pixels; above that we
+     *     render as a quad.
+     */
+
+    while (nBox--) {
+	int srcX, srcY, srcw, srch;
+	int dstX, dstY, dstw, dsth;
+	dstX = pBox->x1 + dstxoff;
+	dstY = pBox->y1 + dstyoff;
+	dstw = pBox->x2 - pBox->x1;
+	dsth = pBox->y2 - pBox->y1;
+
+	srcX = ((pBox->x1 - pPriv->drw_x) *
+		pPriv->src_w) / pPriv->dst_w;
+	srcY = ((pBox->y1 - pPriv->drw_y) *
+		pPriv->src_h) / pPriv->dst_h;
+
+	srcw = (pPriv->src_w * dstw) / pPriv->dst_w;
+	srch = (pPriv->src_h * dsth) / pPriv->dst_h;
+
+	BEGIN_ACCEL(2);
+	OUT_ACCEL_REG(R300_SC_SCISSOR0, (((dstX) << R300_SCISSOR_X_SHIFT) |
+					 ((dstY) << R300_SCISSOR_Y_SHIFT)));
+	OUT_ACCEL_REG(R300_SC_SCISSOR1, (((dstX + dstw - 1) << R300_SCISSOR_X_SHIFT) |
+					 ((dstY + dsth - 1) << R300_SCISSOR_Y_SHIFT)));
+	FINISH_ACCEL();
+
+#ifdef ACCEL_CP
+	BEGIN_RING(3 * vtx_count + 4);
+	OUT_RING(CP_PACKET3(R200_CP_PACKET3_3D_DRAW_IMMD_2,
+			    3 * vtx_count));
+	OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_TRI_LIST |
+		 RADEON_CP_VC_CNTL_PRIM_WALK_RING |
+		 (3 << RADEON_CP_VC_CNTL_NUM_SHIFT));
+#else /* ACCEL_CP */
+	BEGIN_ACCEL(2 + vtx_count * 3);
+	OUT_ACCEL_REG(RADEON_SE_VF_CNTL, (RADEON_VF_PRIM_TYPE_TRIANGLE_LIST |
+					  RADEON_VF_PRIM_WALK_DATA |
+					  (3 << RADEON_VF_NUM_VERTICES_SHIFT)));
+#endif
+	if (pPriv->bicubic_enabled) {
+	    VTX_OUT_FILTER((float)dstX,                                       (float)dstY,
+			   (float)srcX / info->accel_state->texW[0],          (float)srcY / info->accel_state->texH[0],
+			   (float)srcX + 0.5,                                 (float)srcY + 0.5);
+	    VTX_OUT_FILTER((float)dstX,                                       (float)(dstY + dstw + dsth),
+			   (float)srcX / info->accel_state->texW[0],          ((float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0)) / info->accel_state->texH[0],
+			   (float)srcX + 0.5,                                 (float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0) + 0.5);
+	    VTX_OUT_FILTER((float)(dstX + dstw + dsth),                       (float)dstY,
+			   ((float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0)) / info->accel_state->texW[0],
+			   (float)srcY / info->accel_state->texH[0],
+			   (float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0) + 0.5,
+			   (float)srcY + 0.5);
+	} else {
+	    /*
+	     * Render a big, scissored triangle. This means
+	     * increasing the triangle size and adjusting
+	     * texture coordinates.
+	     */
+	    VTX_OUT((float)dstX,                              (float)dstY,
+		    (float)srcX / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0]);
+	    VTX_OUT((float)dstX,                              (float)(dstY + dsth + dstw),
+		    (float)srcX / info->accel_state->texW[0], ((float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0)) / info->accel_state->texH[0]);
+	    VTX_OUT((float)(dstX + dstw + dsth),              (float)dstY,
+		    ((float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0)) / info->accel_state->texW[0],
+		    (float)srcY / info->accel_state->texH[0]);
+	}
+
+	/* flushing is pipelined, free/finish is not */
+	OUT_ACCEL_REG(R300_RB3D_DSTCACHE_CTLSTAT, R300_DC_FLUSH_3D);
+
+#ifdef ACCEL_CP
+	ADVANCE_RING();
+#else
+	FINISH_ACCEL();
+#endif /* !ACCEL_CP */
+
+	pBox++;
+    }
+
+    BEGIN_ACCEL(3);
+    OUT_ACCEL_REG(R300_SC_CLIP_RULE, 0xAAAA);
+    OUT_ACCEL_REG(R300_RB3D_DSTCACHE_CTLSTAT, R300_RB3D_DC_FLUSH_ALL);
     OUT_ACCEL_REG(RADEON_WAIT_UNTIL, RADEON_WAIT_3D_IDLECLEAN);
     FINISH_ACCEL();
 
commit a30737b337edb31528174b483c9094941a5d41bb
Author: Roland Scheidegger <sroland at tungstengraphics.com>
Date:   Mon Apr 13 15:36:07 2009 -0400

    r200/r300: implement brightness/contrast/hue/saturation/gamma controls for textured video
    
    This implements
    contrast/brightness/hue/saturation controls for r200/r300, plus gamma
    control for r300 (the same gamma value is used for all channels, though
    separate values would be trivial).
    Some issues left:
    - only r200/r300
    - still can't be combined with bicubic
    - controls will silently cease to work if the format used is packed and not
    planar (except for rv250)
    - gamma range is from 100 to 10000 corresponding to 0.1 and 10.0 like used in
    overlay. However, usable range is far smaller. Over 2.0 picture gets dark
    pretty quickly, and below 0.6 or so black seems to turn into purple (I've
    verified that even with gamma 1.0 black actually often seems to be RGB 1/0/1 so
    this explains this since that gets amplified by low gamma values - not sure if
    this is a rounding problem somewhere, bogus reference values or is somehow
    expected).
    - gamma adds a bit too many instructions for my taste (7) though the
    alternative (3 texture lookups + some swizzling instructions) doesn't seem any
    better.

diff --git a/src/radeon.h b/src/radeon.h
index 7e84aeb..174352d 100644
--- a/src/radeon.h
+++ b/src/radeon.h
@@ -378,6 +378,11 @@ typedef enum {
 	(info->ChipFamily == CHIP_FAMILY_RS400) ||  \
 	(info->ChipFamily == CHIP_FAMILY_RS480))
 
+#define IS_R200_3D ((info->ChipFamily == CHIP_FAMILY_RV250) || \
+	(info->ChipFamily == CHIP_FAMILY_RV280) || \
+	(info->ChipFamily == CHIP_FAMILY_RS300) || \
+	(info->ChipFamily == CHIP_FAMILY_R200))
+
 /*
  * Errata workarounds
  */
diff --git a/src/radeon_accelfuncs.c b/src/radeon_accelfuncs.c
index 45eb6d5..2d6fe01 100644
--- a/src/radeon_accelfuncs.c
+++ b/src/radeon_accelfuncs.c
@@ -1345,10 +1345,7 @@ FUNC_NAME(RADEONAccelInit)(ScreenPtr pScreen, XAAInfoRecPtr a)
 	    xf86DrvMsg(pScrn->scrnIndex, X_INFO, "XAA Render acceleration "
 		       "unsupported on Radeon 9500/9700 and newer. "
 		       "Please use EXA instead.\n");
-	} else if ((info->ChipFamily == CHIP_FAMILY_RV250) || 
-		   (info->ChipFamily == CHIP_FAMILY_RV280) || 
-		   (info->ChipFamily == CHIP_FAMILY_RS300) || 
-		   (info->ChipFamily == CHIP_FAMILY_R200)) {
+	} else if (IS_R200_3D) {
 	    a->SetupForCPUToScreenAlphaTexture2 =
 		FUNC_NAME(R200SetupForCPUToScreenAlphaTexture);
 	    a->SubsequentCPUToScreenAlphaTexture =
diff --git a/src/radeon_commonfuncs.c b/src/radeon_commonfuncs.c
index a9bc7d2..3dbe617 100644
--- a/src/radeon_commonfuncs.c
+++ b/src/radeon_commonfuncs.c
@@ -565,10 +565,7 @@ static void FUNC_NAME(RADEONInit3DEngine)(ScrnInfoPtr pScrn)
 	OUT_ACCEL_REG(R300_SC_CLIP_RULE, 0xAAAA);
 	OUT_ACCEL_REG(R300_SC_SCREENDOOR, 0xffffff);
 	FINISH_ACCEL();
-    } else if ((info->ChipFamily == CHIP_FAMILY_RV250) ||
-	       (info->ChipFamily == CHIP_FAMILY_RV280) ||
-	       (info->ChipFamily == CHIP_FAMILY_RS300) ||
-	       (info->ChipFamily == CHIP_FAMILY_R200)) {
+    } else if (IS_R200_3D) {
 
 	BEGIN_ACCEL(6);
 	if (info->ChipFamily == CHIP_FAMILY_RS300) {
diff --git a/src/radeon_exa_funcs.c b/src/radeon_exa_funcs.c
index 59cb46f..6a2b25c 100644
--- a/src/radeon_exa_funcs.c
+++ b/src/radeon_exa_funcs.c
@@ -505,10 +505,7 @@ Bool FUNC_NAME(RADEONDrawInit)(ScreenPtr pScreen)
 		info->accel_state->exa->DoneComposite = FUNC_NAME(RadeonDoneComposite);
 	    } else
 		xf86DrvMsg(pScrn->scrnIndex, X_INFO, "EXA Composite requires CP on R5xx/IGP\n");
-	} else if ((info->ChipFamily == CHIP_FAMILY_RV250) ||
-		   (info->ChipFamily == CHIP_FAMILY_RV280) ||
-		   (info->ChipFamily == CHIP_FAMILY_RS300) ||
-		   (info->ChipFamily == CHIP_FAMILY_R200)) {
+	} else if (IS_R200_3D) {
 		xf86DrvMsg(pScrn->scrnIndex, X_INFO, "Render acceleration "
 			       "enabled for R200 type cards.\n");
 		info->accel_state->exa->CheckComposite = R200CheckComposite;
diff --git a/src/radeon_textured_video.c b/src/radeon_textured_video.c
index 79671c0..bf8a276 100644
--- a/src/radeon_textured_video.c
+++ b/src/radeon_textured_video.c
@@ -128,6 +128,21 @@ static __inline__ uint32_t float4touint(float fr, float fg, float fb, float fa)
     return (ua << 24) | (ur << 16) | (ug << 8) | ub;
 }
 
+/* Parameters for the ITU-R BT.601 and ITU-R BT.709 colour spaces.
+   Note: the differences from the parameters used in overlay are due
+   to 10bit vs. float calcs */
+static REF_TRANSFORM trans[2] =
+{
+    {1.1643, 0.0, 1.5960, -0.3918, -0.8129, 2.0172, 0.0}, /* BT.601 */
+    {1.1643, 0.0, 1.7927, -0.2132, -0.5329, 2.1124, 0.0}  /* BT.709 */
+};
+
+
+#define RTFSaturation(a)   (1.0 + ((a)*1.0)/1000.0)
+#define RTFBrightness(a)   (((a)*1.0)/2000.0)
+#define RTFContrast(a)   (1.0 + ((a)*1.0)/1000.0)
+#define RTFHue(a)   (((a)*3.1416)/1000.0)
+
 #define ACCEL_MMIO
 #define ACCEL_PREAMBLE()	unsigned char *RADEONMMIO = info->MMIO
 #define BEGIN_ACCEL(n)		RADEONWaitForFifo(pScrn, (n))
@@ -359,12 +374,8 @@ RADEONPutImageTextured(ScrnInfoPtr pScrn,
     }
 
     pPriv->planar_hw = pPriv->planar_state;
-    if (pPriv->bicubic_enabled || !( IS_R300_3D ||
-	    (info->ChipFamily == CHIP_FAMILY_RV250) ||
-	    (info->ChipFamily == CHIP_FAMILY_RV280) ||
-	    (info->ChipFamily == CHIP_FAMILY_RS300) ||
-	    (info->ChipFamily == CHIP_FAMILY_R200) ))
-        pPriv->planar_hw = 0;
+    if (pPriv->bicubic_enabled || !( IS_R300_3D || IS_R200_3D ))
+	pPriv->planar_hw = 0;
 
     switch(id) {
     case FOURCC_YV12:
@@ -636,28 +647,58 @@ static XF86VideoFormatRec Formats[NUM_FORMATS] =
     {15, TrueColor}, {16, TrueColor}, {24, TrueColor}
 };
 
-#define NUM_ATTRIBUTES 2
+#define NUM_ATTRIBUTES 1
 
 static XF86AttributeRec Attributes[NUM_ATTRIBUTES+1] =
 {
     {XvSettable | XvGettable, 0, 1, "XV_VSYNC"},
+    {0, 0, 0, NULL}
+};
+
+#define NUM_ATTRIBUTES_R200 7
+
+static XF86AttributeRec Attributes_r200[NUM_ATTRIBUTES_R200+1] =
+{
+    {XvSettable | XvGettable, 0, 1, "XV_VSYNC"},
     {XvSettable | XvGettable, 0, 1, "XV_HWPLANAR"},
+    {XvSettable | XvGettable, -1000, 1000, "XV_BRIGHTNESS"},
+    {XvSettable | XvGettable, -1000, 1000, "XV_CONTRAST"},
+    {XvSettable | XvGettable, -1000, 1000, "XV_SATURATION"},
+    {XvSettable | XvGettable, -1000, 1000, "XV_HUE"},
+    {XvSettable | XvGettable, 0, 1, "XV_COLORSPACE"}, /* index into trans[]: 0 = BT.601, 1 = BT.709 */
     {0, 0, 0, NULL}
 };
 
-#define NUM_ATTRIBUTES_R300 3
+#define NUM_ATTRIBUTES_R300 9
 
 static XF86AttributeRec Attributes_r300[NUM_ATTRIBUTES_R300+1] =
 {
     {XvSettable | XvGettable, 0, 2, "XV_BICUBIC"},
     {XvSettable | XvGettable, 0, 1, "XV_VSYNC"},
     {XvSettable | XvGettable, 0, 1, "XV_HWPLANAR"},
+    {XvSettable | XvGettable, -1000, 1000, "XV_BRIGHTNESS"},
+    {XvSettable | XvGettable, -1000, 1000, "XV_CONTRAST"},
+    {XvSettable | XvGettable, -1000, 1000, "XV_SATURATION"},
+    {XvSettable | XvGettable, -1000, 1000, "XV_HUE"},
+    {XvSettable | XvGettable, 100, 10000, "XV_GAMMA"},
+    {XvSettable | XvGettable, 0, 1, "XV_COLORSPACE"},
+    {0, 0, 0, NULL}
+};
+
+#define NUM_ATTRIBUTES_R500 2
+
+static XF86AttributeRec Attributes_r500[NUM_ATTRIBUTES_R500+1] =
+{
+    {XvSettable | XvGettable, 0, 2, "XV_BICUBIC"},
+    {XvSettable | XvGettable, 0, 1, "XV_VSYNC"},
     {0, 0, 0, NULL}
 };
 
 static Atom xvBicubic;
 static Atom xvVSync;
 static Atom xvHWPlanar;
+static Atom xvBrightness, xvContrast, xvSaturation, xvHue;
+static Atom xvGamma, xvColorspace;
 
 #define NUM_IMAGES 4
 
@@ -686,6 +727,18 @@ RADEONGetTexPortAttribute(ScrnInfoPtr  pScrn,
 	*value = pPriv->vsync;
     else if (attribute == xvHWPlanar)
 	*value = pPriv->planar_state;
+    else if (attribute == xvBrightness)
+	*value = pPriv->brightness;
+    else if (attribute == xvContrast)
+	*value = pPriv->contrast;
+    else if (attribute == xvSaturation)
+	*value = pPriv->saturation;
+    else if (attribute == xvHue)
+	*value = pPriv->hue;
+    else if (attribute == xvGamma)
+	*value = pPriv->gamma;
+    else if(attribute == xvColorspace)
+	*value = pPriv->transform_index;
     else
 	return BadMatch;
 
@@ -709,6 +762,20 @@ RADEONSetTexPortAttribute(ScrnInfoPtr  pScrn,
 	pPriv->vsync = ClipValue (value, 0, 1);
     else if (attribute == xvHWPlanar)
 	pPriv->planar_state = ClipValue (value, 0, 1);
+    else if (attribute == xvHWPlanar)
+	pPriv->planar_state = ClipValue (value, 0, 1);
+    else if (attribute == xvBrightness)
+	pPriv->brightness = ClipValue (value, -1000, 1000);
+    else if (attribute == xvContrast)
+	pPriv->contrast = ClipValue (value, -1000, 1000);
+    else if (attribute == xvSaturation)
+	pPriv->saturation = ClipValue (value, -1000, 1000);
+    else if (attribute == xvHue)
+	pPriv->hue = ClipValue (value, -1000, 1000);
+    else if (attribute == xvGamma)
+	pPriv->gamma = ClipValue (value, 100, 10000);
+    else if(attribute == xvColorspace)
+	pPriv->transform_index = ClipValue (value, 0, 1);
     else
 	return BadMatch;
 
@@ -733,6 +800,12 @@ RADEONSetupImageTexturedVideo(ScreenPtr pScreen)
     xvBicubic         = MAKE_ATOM("XV_BICUBIC");
     xvVSync           = MAKE_ATOM("XV_VSYNC");
     xvHWPlanar        = MAKE_ATOM("XV_HWPLANAR");
+    xvBrightness      = MAKE_ATOM("XV_BRIGHTNESS");
+    xvContrast        = MAKE_ATOM("XV_CONTRAST");
+    xvSaturation      = MAKE_ATOM("XV_SATURATION");
+    xvHue             = MAKE_ATOM("XV_HUE");
+    xvGamma           = MAKE_ATOM("XV_GAMMA");
+    xvColorspace      = MAKE_ATOM("XV_COLORSPACE");
 
     adapt->type = XvWindowMask | XvInputMask | XvImageMask;
     adapt->flags = 0;
@@ -752,10 +825,19 @@ RADEONSetupImageTexturedVideo(ScreenPtr pScreen)
     pPortPriv =
 	(RADEONPortPrivPtr)(&adapt->pPortPrivates[num_texture_ports]);
 
-    if (IS_R300_3D || IS_R500_3D) {
+    if (IS_R300_3D) {
 	adapt->pAttributes = Attributes_r300;
 	adapt->nAttributes = NUM_ATTRIBUTES_R300;
-    } else {
+    }
+    else if (IS_R500_3D) {
+	adapt->pAttributes = Attributes_r500;
+	adapt->nAttributes = NUM_ATTRIBUTES_R500;
+    }
+    else if (IS_R200_3D) {
+	adapt->pAttributes = Attributes_r200;
+	adapt->nAttributes = NUM_ATTRIBUTES_R200;
+    }
+    else {
 	adapt->pAttributes = Attributes;
 	adapt->nAttributes = NUM_ATTRIBUTES;
     }
@@ -783,6 +865,12 @@ RADEONSetupImageTexturedVideo(ScreenPtr pScreen)
 	pPriv->bicubic_state = BICUBIC_AUTO;
 	pPriv->vsync = TRUE;
 	pPriv->planar_state = 1;
+	pPriv->brightness = 0;
+	pPriv->contrast = 0;
+	pPriv->saturation = 0;
+	pPriv->hue = 0;
+	pPriv->gamma = 1000;
+	pPriv->transform_index = 0;
 
 	/* gotta uninit this someplace, XXX: shouldn't be necessary for textured */
 	REGION_NULL(pScreen, &pPriv->clip);
diff --git a/src/radeon_textured_videofuncs.c b/src/radeon_textured_videofuncs.c
index 6cb2870..3c4289f 100644
--- a/src/radeon_textured_videofuncs.c
+++ b/src/radeon_textured_videofuncs.c
@@ -743,9 +743,10 @@ FUNC_NAME(RADEONDisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv
 	     * DP3 might look like the straightforward solution
 	     * but we'd need to move the texture yuv values in
 	     * the same reg for this to work. Therefore use MADs.
-	     * Without changing the shader at all (only the constants)
-	     * could also provide hue/saturation/brightness/contrast control.
-	     *
+	     * Brightness just adds to the off constant.
+	     * Contrast is multiplication of luminance.
+	     * Saturation and hue change the u and v coeffs.
+	     * Default values (before adjustments - depend on colorspace):
 	     * yco = 1.1643
 	     * uco = 0, -0.39173, 2.017
 	     * vco = 1.5958, -0.8129, 0
@@ -757,14 +758,46 @@ FUNC_NAME(RADEONDisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv
 	     * temp = MAD(uco, yuv.uuuu, temp)
 	     * result = MAD(vco, yuv.vvvv, temp)
 	     */
-		float yco = 1.1643;
-		float uco[3] = {0.0, -0.39173, 2.018};
-		float vco[3] = {1.5958, -0.8129, 0.0};
-		float off[3] = {-0.0625 * yco + -0.5 * uco[0] + -0.5 * vco[0],
-				-0.0625 * yco + -0.5 * uco[1] + -0.5 * vco[1],
-				-0.0625 * yco + -0.5 * uco[2] + -0.5 * vco[2]};
-
-		BEGIN_ACCEL(33);
+	     /* TODO: don't recalc consts always */
+		const float Loff = -0.0627;
+		const float Coff = -0.502;
+		float uvcosf, uvsinf;
+		float yco;
+		float uco[3], vco[3], off[3];
+		float bright, cont, gamma;
+		int ref = pPriv->transform_index;
+		Bool needgamma = FALSE;
+
+		cont = RTFContrast(pPriv->contrast);
+		bright = RTFBrightness(pPriv->brightness);
+		gamma = (float)pPriv->gamma / 1000.0;
+		uvcosf = RTFSaturation(pPriv->saturation) * cos(RTFHue(pPriv->hue));
+		uvsinf = RTFSaturation(pPriv->saturation) * sin(RTFHue(pPriv->hue));
+		/* overlay video also does pre-gamma contrast/sat adjust, should we? */
+
+		yco = trans[ref].RefLuma * cont;
+		uco[0] = -trans[ref].RefRCr * uvsinf;
+		uco[1] = trans[ref].RefGCb * uvcosf - trans[ref].RefGCr * uvsinf;
+		uco[2] = trans[ref].RefBCb * uvcosf;
+		vco[0] = trans[ref].RefRCr * uvcosf;
+		vco[1] = trans[ref].RefGCb * uvsinf + trans[ref].RefGCr * uvcosf;
+		vco[2] = trans[ref].RefBCb * uvsinf;
+		off[0] = Loff * yco + Coff * (uco[0] + vco[0]) + bright;
+		off[1] = Loff * yco + Coff * (uco[1] + vco[1]) + bright;
+		off[2] = Loff * yco + Coff * (uco[2] + vco[2]) + bright;
+
+		if (gamma != 1.0) {
+			needgamma = TRUE;
+			/* note: gamma correction is out = in ^ gamma;
+			   gpu can only do LG2/EX2 therefore we transform into
+			   in ^ gamma = 2 ^ (log2(in) * gamma).
+			   Lots of scalar ops, unfortunately (better solution?) -
+			   without gamma that's 3 inst, with gamma it's 10...
+			   could use different gamma factors per channel,
+			   if that's of any use. */
+		}
+
+		BEGIN_ACCEL(needgamma ? 28 + 33 : 33);
 		/* 2 components: same 2 for tex0/1/2 */
 		OUT_ACCEL_REG(R300_RS_COUNT,
 			  ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) |
@@ -779,12 +812,12 @@ FUNC_NAME(RADEONDisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv
 							R300_FIRST_TEX));
 
 		OUT_ACCEL_REG(R300_US_CODE_OFFSET, (R300_ALU_CODE_OFFSET(0) |
-						   R300_ALU_CODE_SIZE(3) |
+						   R300_ALU_CODE_SIZE(needgamma ? 7 + 3 : 3) |
 						   R300_TEX_CODE_OFFSET(0) |
 						   R300_TEX_CODE_SIZE(3)));
 
 		OUT_ACCEL_REG(R300_US_CODE_ADDR_3, (R300_ALU_START(0) |
-						   R300_ALU_SIZE(2) |
+						   R300_ALU_SIZE(needgamma ? 7 + 2 : 2) |
 						   R300_TEX_START(0) |
 						   R300_TEX_SIZE(2) |
 						   R300_RGBA_OUT));
@@ -857,7 +890,7 @@ FUNC_NAME(RADEONDisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv
 						   R300_ALU_RGB_ADDR2(0) |
 						   R300_ALU_RGB_ADDRD(0) |
 						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB) |
-						   R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_RGB)));
+						   (needgamma ? 0 : R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_RGB))));
 		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(2), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
 						   R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
 						   R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
@@ -868,14 +901,126 @@ FUNC_NAME(RADEONDisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv
 						   R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE) |
 						   R300_ALU_RGB_CLAMP));
 		/* write alpha 1 */
-		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(4), (R300_ALU_ALPHA_ADDRD(0) |
+		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(2), (R300_ALU_ALPHA_ADDRD(0) |
 						   R300_ALU_ALPHA_OMASK(R300_ALU_ALPHA_MASK_A) |
 						   R300_ALU_ALPHA_TARGET_A));
-		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(4), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
+		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(2), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
 						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
 						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
 						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_1_0)));
 
+		if (needgamma) {
+		    /* rgb temp0.r = op_sop, set up src0 reg */
+		    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(3), (R300_ALU_RGB_ADDR0(0) |
+						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R)));
+		    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(3),
+						   R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
+						   R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
+		    /* alpha lg2 temp0, temp0.r */
+		    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(3), (R300_ALU_ALPHA_ADDRD(0) |
+						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+		    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(3), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_LN2) |
+						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_R) |
+						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
+						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+
+		    /* rgb temp0.g = op_sop, set up src0 reg */
+		    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(4), (R300_ALU_RGB_ADDR0(0) |
+						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_G)));
+		    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(4),
+						   R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
+						   R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
+		    /* alpha lg2 temp0, temp0.g */
+		    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(4), (R300_ALU_ALPHA_ADDRD(0) |
+						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+		    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(4), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_LN2) |
+						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_G) |
+						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
+						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+
+		    /* rgb temp0.b = op_sop, set up src0 reg */
+		    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(5), (R300_ALU_RGB_ADDR0(0) |
+						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_B)));
+		    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(5),
+						   R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
+						   R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
+		    /* alpha lg2 temp0, temp0.b */
+		    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(5), (R300_ALU_ALPHA_ADDRD(0) |
+						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+		    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(5), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_LN2) |
+						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_B) |
+						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
+						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+
+		    /* MUL const1, temp1, temp0 */
+		    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(6), (R300_ALU_RGB_ADDR0(0) |
+						   R300_ALU_RGB_ADDR1(0) |
+						   R300_ALU_RGB_ADDR2(0) |
+						   R300_ALU_RGB_ADDRD(0) |
+						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
+		    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(6), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
+						   R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
+						   R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC0_AAA) |
+						   R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
+						   R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0) |
+						   R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
+						   R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
+						   R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
+		    /* alpha nop, but set up const1 */
+		    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(6), (R300_ALU_ALPHA_ADDRD(0) |
+						   R300_ALU_ALPHA_ADDR0(R300_ALU_ALPHA_CONST(1)) |
+						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+		    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(6), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
+						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
+						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
+						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+
+		    /* rgb out0.r = op_sop, set up src0 reg */
+		    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(7), (R300_ALU_RGB_ADDR0(0) |
+						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R) |
+						   R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_R)));
+		    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(7), 
+						   R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
+						   R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
+		    /* alpha ex2 temp0, temp0.r */
+		    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(7), (R300_ALU_ALPHA_ADDRD(0) |
+						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+		    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(7), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_EX2) |
+						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_R) |
+						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
+						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+
+		    /* rgb out0.g = op_sop, set up src0 reg */
+		    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(8), (R300_ALU_RGB_ADDR0(0) |
+						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_G) |
+						   R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_G)));
+		    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(8),
+						   R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
+						   R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
+		    /* alpha ex2 temp0, temp0.g */
+		    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(8), (R300_ALU_ALPHA_ADDRD(0) |
+						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+		    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(8), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_EX2) |
+						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_G) |
+						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
+						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+
+		    /* rgb out0.b = op_sop, set up src0 reg */
+		    OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(9), (R300_ALU_RGB_ADDR0(0) |
+						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_B) |
+						   R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_B)));
+		    OUT_ACCEL_REG(R300_US_ALU_RGB_INST(9),
+						   R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) |
+						   R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE));
+		    /* alpha ex2 temp0, temp0.b */
+		    OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(9), (R300_ALU_ALPHA_ADDRD(0) |
+						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+		    OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(9), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_EX2) |
+						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_B) |
+						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
+						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+		}
+
 		/* Shader constants. */
 		/* constant 0: off, yco */
 		OUT_ACCEL_REG(R300_US_ALU_CONST_R(0), F_TO_24(off[0]));
@@ -886,7 +1031,7 @@ FUNC_NAME(RADEONDisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv
 		OUT_ACCEL_REG(R300_US_ALU_CONST_R(1), F_TO_24(uco[0]));
 		OUT_ACCEL_REG(R300_US_ALU_CONST_G(1), F_TO_24(uco[1]));
 		OUT_ACCEL_REG(R300_US_ALU_CONST_B(1), F_TO_24(uco[2]));
-		OUT_ACCEL_REG(R300_US_ALU_CONST_A(1), F_TO_24(0.0));
+		OUT_ACCEL_REG(R300_US_ALU_CONST_A(1), F_TO_24(gamma));
 		/* constant 2: vco */
 		OUT_ACCEL_REG(R300_US_ALU_CONST_R(2), F_TO_24(vco[0]));
 		OUT_ACCEL_REG(R300_US_ALU_CONST_G(2), F_TO_24(vco[1]));
@@ -1601,20 +1746,52 @@ FUNC_NAME(RADEONDisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv
 	FINISH_ACCEL();
 
 
-	if ((info->ChipFamily == CHIP_FAMILY_RV250) ||
-	    (info->ChipFamily == CHIP_FAMILY_RV280) ||
-	    (info->ChipFamily == CHIP_FAMILY_RS300) ||
-	    (info->ChipFamily == CHIP_FAMILY_R200)) {
+	if (IS_R200_3D) {
 
 	    info->accel_state->texW[0] = pPriv->w;
 	    info->accel_state->texH[0] = pPriv->h;
 
 	    if (isplanar) {
 		/* note: in contrast to r300, use input biasing on uv components */
-		float yco = 1.1643;
-		float yoff = -0.0625 * yco;
-		float uco[3] = {0.0, -0.39173, 2.018};
-		float vco[3] = {1.5958, -0.8129, 0.0};
+		const float Loff = -0.0627;
+		float uvcosf, uvsinf;
+		float yco, yoff;
+		float uco[3], vco[3];
+		float bright, cont, sat;
+		int ref = pPriv->transform_index;
+		float ucscale = 0.25, vcscale = 0.25;
+		Bool needux8 = FALSE, needvx8 = FALSE;
+
+		/* contrast can cause constant overflow, clamp */
+		cont = RTFContrast(pPriv->contrast);
+		if (cont * trans[ref].RefLuma > 2.0)
+		    cont = 2.0 / trans[ref].RefLuma;
+		/* brightness only ranges from -0.5 to 0.5, so it should be safe */
+		bright = RTFBrightness(pPriv->brightness);
+		/* saturation can also cause overflow, clamp */
+		sat = RTFSaturation(pPriv->saturation);
+		if (sat * trans[ref].RefBCb > 4.0)
+		    sat = 4.0 / trans[ref].RefBCb;
+		uvcosf = sat * cos(RTFHue(pPriv->hue));
+		uvsinf = sat * sin(RTFHue(pPriv->hue));
+
+		yco = trans[ref].RefLuma * cont;
+		uco[0] = -trans[ref].RefRCr * uvsinf;
+		uco[1] = trans[ref].RefGCb * uvcosf - trans[ref].RefGCr * uvsinf;
+		uco[2] = trans[ref].RefBCb * uvcosf;
+		vco[0] = trans[ref].RefRCr * uvcosf;
+		vco[1] = trans[ref].RefGCb * uvsinf + trans[ref].RefGCr * uvcosf;
+		vco[2] = trans[ref].RefBCb * uvsinf;
+		yoff = Loff * yco + bright;
+
+		if ((uco[0] > 2.0) || (uco[2] > 2.0)) {
+		    needux8 = TRUE;
+		    ucscale = 0.125;
+		}
+		if ((vco[0] > 2.0) || (vco[2] > 2.0)) {
+		    needvx8 = TRUE;
+		    vcscale = 0.125;
+		}
 
 		/* need 2 texcoord sets (even though they are identical) due
 		   to denormalization! hw apparently can't premultiply
@@ -1678,7 +1855,9 @@ FUNC_NAME(RADEONDisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv
 		 * seems the values we need seem to fit better than worst case (get about
 		 * 6 fractional bits for this instead of 5, at least when not correcting for
 		 * hue/saturation/contrast/brightness, which is the same as for vco - yco and
-		 * yoff get 8 fractional bits).
+		 * yoff get 8 fractional bits). Try to preserve as much accuracy as possible
+		 * even with non-default saturation/hue/contrast/brightness adjustments,
+		 * it gets a little crazy and ultimately precision might still be lacking.
 		 *
 		 * A higher precision (8 fractional bits) version might just put uco into
 		 * a texcoord, and calculate a new vcoconst in the shader, like so:
@@ -1709,7 +1888,7 @@ FUNC_NAME(RADEONDisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv
 			      R200_TXC_ARG_A_TFACTOR_COLOR |
 			      R200_TXC_ARG_B_R0_COLOR |
 			      R200_TXC_ARG_C_TFACTOR_COLOR |
-			      R200_TXC_NEG_ARG_C |
+			      (yoff < 0 ? R200_TXC_NEG_ARG_C : 0) |
 			      R200_TXC_OP_DOT2_ADD);
 		OUT_ACCEL_REG(R200_PP_TXCBLEND2_0,
 			      (0 << R200_TXC_TFACTOR_SEL_SHIFT) |
@@ -1730,7 +1909,7 @@ FUNC_NAME(RADEONDisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv
 			      R200_TXC_SCALE_ARG_A |
 			      R200_TXC_ARG_B_R1_COLOR |
 			      R200_TXC_BIAS_ARG_B |
-			      R200_TXC_SCALE_ARG_B |
+			      (needux8 ? R200_TXC_SCALE_ARG_B : 0) |
 			      R200_TXC_ARG_C_R0_COLOR |
 			      R200_TXC_OP_MADD);
 		OUT_ACCEL_REG(R200_PP_TXCBLEND2_1,
@@ -1751,6 +1930,7 @@ FUNC_NAME(RADEONDisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv
 			      R200_TXC_SCALE_ARG_A |
 			      R200_TXC_ARG_B_R2_COLOR |
 			      R200_TXC_BIAS_ARG_B |
+			      (needvx8 ? R200_TXC_SCALE_ARG_B : 0) |
 			      R200_TXC_ARG_C_R0_COLOR |
 			      R200_TXC_OP_MADD);
 		OUT_ACCEL_REG(R200_PP_TXCBLEND2_2,
@@ -1767,28 +1947,64 @@ FUNC_NAME(RADEONDisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv
 			      R200_TXA_CLAMP_0_1 | R200_TXA_OUTPUT_REG_R0);
 
 		/* shader constants */
-		OUT_ACCEL_REG(R200_PP_TFACTOR_0, float4touint(1.0, /* src range [1, 2] */
-							      yco - 1.0,
-							      -yoff, /* range [-1, 0] */
+		OUT_ACCEL_REG(R200_PP_TFACTOR_0, float4touint(yco > 1.0 ? 1.0 : 0.0, /* range special [0, 2] */
+							      yco > 1.0 ? yco - 1.0: yco,
+							      yoff < 0 ? -yoff : yoff, /* range special [-1, 1] */
 							      0.0));
-		OUT_ACCEL_REG(R200_PP_TFACTOR_1, float4touint(uco[0] * 0.125 + 0.5, /* range [-4, 4] */
-							      uco[1] * 0.125 + 0.5,
-							      uco[2] * 0.125 + 0.5,
+		OUT_ACCEL_REG(R200_PP_TFACTOR_1, float4touint(uco[0] * ucscale + 0.5, /* range [-4, 4] */
+							      uco[1] * ucscale + 0.5, /* or [-2, 2] */
+							      uco[2] * ucscale + 0.5,
 							      0.0));
-		OUT_ACCEL_REG(R200_PP_TFACTOR_2, float4touint(vco[0] * 0.25 + 0.5, /* range [-2, 2] */
-							      vco[1] * 0.25 + 0.5,
-							      vco[2] * 0.25 + 0.5,
+		OUT_ACCEL_REG(R200_PP_TFACTOR_2, float4touint(vco[0] * vcscale + 0.5, /* range [-2, 2] */
+							      vco[1] * vcscale + 0.5, /* or [-4, 4] */
+							      vco[2] * vcscale + 0.5,
 							      0.0));
 
 		FINISH_ACCEL();
 	    }
 	    else if (info->ChipFamily == CHIP_FAMILY_RV250) {
 		/* fix up broken packed yuv - shader same as above except
-		   yuv compoents are all in same reg */
-		float yco = 1.1643;
-		float yoff = -0.0625 * yco;
-		float uco[3] = {0.0, -0.39173, 2.018};
-		float vco[3] = {1.5958, -0.8129, 0.0};
+		   yuv components are all in same reg */
+		/* note: in contrast to r300, use input biasing on uv components */
+		const float Loff = -0.0627;
+		float uvcosf, uvsinf;
+		float yco, yoff;
+		float uco[3], vco[3];
+		float bright, cont, sat;
+		int ref = pPriv->transform_index;
+		float ucscale = 0.25, vcscale = 0.25;
+		Bool needux8 = FALSE, needvx8 = FALSE;
+
+		/* contrast can cause constant overflow, clamp */
+		cont = RTFContrast(pPriv->contrast);
+		if (cont * trans[ref].RefLuma > 2.0)
+		    cont = 2.0 / trans[ref].RefLuma;
+		/* brightness only ranges from -0.5 to 0.5, so it should be safe */
+		bright = RTFBrightness(pPriv->brightness);
+		/* saturation can also cause overflow, clamp */
+		sat = RTFSaturation(pPriv->saturation);
+		if (sat * trans[ref].RefBCb > 4.0)
+		    sat = 4.0 / trans[ref].RefBCb;
+		uvcosf = sat * cos(RTFHue(pPriv->hue));
+		uvsinf = sat * sin(RTFHue(pPriv->hue));
+
+		yco = trans[ref].RefLuma * cont;
+		uco[0] = -trans[ref].RefRCr * uvsinf;
+		uco[1] = trans[ref].RefGCb * uvcosf - trans[ref].RefGCr * uvsinf;
+		uco[2] = trans[ref].RefBCb * uvcosf;
+		vco[0] = trans[ref].RefRCr * uvcosf;
+		vco[1] = trans[ref].RefGCb * uvsinf + trans[ref].RefGCr * uvcosf;
+		vco[2] = trans[ref].RefBCb * uvsinf;
+		yoff = Loff * yco + bright;
+
+		if ((uco[0] > 2.0) || (uco[2] > 2.0)) {
+		    needux8 = TRUE;
+		    ucscale = 0.125;
+		}
+		if ((vco[0] > 2.0) || (vco[2] > 2.0)) {
+		    needvx8 = TRUE;
+		    vcscale = 0.125;
+		}
 
 		txformat0 = (((((pPriv->w + 1 ) >> 1) - 1) & 0x7ff) |
 			    (((((pPriv->h + 1 ) >> 1 ) - 1) & 0x7ff) << RADEON_TEX_VSIZE_SHIFT));
@@ -1824,7 +2040,7 @@ FUNC_NAME(RADEONDisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv
 			      R200_TXC_ARG_A_TFACTOR_COLOR |
 			      R200_TXC_ARG_B_R0_COLOR |
 			      R200_TXC_ARG_C_TFACTOR_COLOR |
-			      R200_TXC_NEG_ARG_C |
+			      (yoff < 0 ? R200_TXC_NEG_ARG_C : 0) |
 			      R200_TXC_OP_DOT2_ADD);
 		OUT_ACCEL_REG(R200_PP_TXCBLEND2_0,
 			      (0 << R200_TXC_TFACTOR_SEL_SHIFT) |
@@ -1846,7 +2062,7 @@ FUNC_NAME(RADEONDisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv
 			      R200_TXC_SCALE_ARG_A |
 			      R200_TXC_ARG_B_R0_COLOR |
 			      R200_TXC_BIAS_ARG_B |
-			      R200_TXC_SCALE_ARG_B |
+			      (needux8 ? R200_TXC_SCALE_ARG_B : 0) |
 			      R200_TXC_ARG_C_R1_COLOR |
 			      R200_TXC_OP_MADD);
 		OUT_ACCEL_REG(R200_PP_TXCBLEND2_1,
@@ -1868,6 +2084,7 @@ FUNC_NAME(RADEONDisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv
 			      R200_TXC_SCALE_ARG_A |
 			      R200_TXC_ARG_B_R0_COLOR |
 			      R200_TXC_BIAS_ARG_B |
+			      (needvx8 ? R200_TXC_SCALE_ARG_B : 0) |
 			      R200_TXC_ARG_C_R1_COLOR |
 			      R200_TXC_OP_MADD);
 		OUT_ACCEL_REG(R200_PP_TXCBLEND2_2,
@@ -1885,17 +2102,17 @@ FUNC_NAME(RADEONDisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv
 			      R200_TXA_CLAMP_0_1 | R200_TXA_OUTPUT_REG_R0);
 
 		/* shader constants */
-		OUT_ACCEL_REG(R200_PP_TFACTOR_0, float4touint(1.0, /* src range [1, 2] */
-							      yco - 1.0,
-							      -yoff, /* range [-1, 0] */
+		OUT_ACCEL_REG(R200_PP_TFACTOR_0, float4touint(yco > 1.0 ? 1.0 : 0.0, /* range special [0, 2] */
+							      yco > 1.0 ? yco - 1.0: yco,
+							      yoff < 0 ? -yoff : yoff, /* range special [-1, 1] */
 							      0.0));
-		OUT_ACCEL_REG(R200_PP_TFACTOR_1, float4touint(uco[0] * 0.125 + 0.5, /* range [-4, 4] */
-							      uco[1] * 0.125 + 0.5,
-							      uco[2] * 0.125 + 0.5,
+		OUT_ACCEL_REG(R200_PP_TFACTOR_1, float4touint(uco[0] * ucscale + 0.5, /* range [-4, 4] */
+							      uco[1] * ucscale + 0.5, /* or [-2, 2] */
+							      uco[2] * ucscale + 0.5,
 							      0.0));
-		OUT_ACCEL_REG(R200_PP_TFACTOR_2, float4touint(vco[0] * 0.25 + 0.5, /* range [-2, 2] */
-							      vco[1] * 0.25 + 0.5,
-							      vco[2] * 0.25 + 0.5,
+		OUT_ACCEL_REG(R200_PP_TFACTOR_2, float4touint(vco[0] * vcscale + 0.5, /* range [-2, 2] */
+							      vco[1] * vcscale + 0.5, /* or [-4, 4] */
+							      vco[2] * vcscale + 0.5,
 							      0.0));
 
 		FINISH_ACCEL();
diff --git a/src/radeon_video.c b/src/radeon_video.c
index 6314eb1..8479160 100644
--- a/src/radeon_video.c
+++ b/src/radeon_video.c
@@ -541,18 +541,6 @@ static XF86ImageRec Images[NUM_IMAGES] =
 
 #endif
 
-/* Reference color space transform data */
-typedef struct tagREF_TRANSFORM
-{
-    float   RefLuma;
-    float   RefRCb;
-    float   RefRCr;
-    float   RefGCb;
-    float   RefGCr;
-    float   RefBCb;
-    float   RefBCr;
-} REF_TRANSFORM;
-
 /* Parameters for ITU-R BT.601 and ITU-R BT.709 colour spaces */
 static REF_TRANSFORM trans[2] =
 {
@@ -560,7 +548,6 @@ static REF_TRANSFORM trans[2] =
     {1.1678, 0.0, 1.7980, -0.2139, -0.5345, 2.1186, 0.0}  /* BT.709 */
 };
 
-
 /* Gamma curve definition for preset gammas */
 typedef struct tagGAMMA_CURVE_R100
 {
diff --git a/src/radeon_video.h b/src/radeon_video.h
index 4498002..be33871 100644
--- a/src/radeon_video.h
+++ b/src/radeon_video.h
@@ -123,6 +123,18 @@ typedef struct {
     int vsync;
 } RADEONPortPrivRec, *RADEONPortPrivPtr;
 
+/* Reference color space transform data */
+typedef struct tagREF_TRANSFORM
+{
+    float   RefLuma;
+    float   RefRCb;
+    float   RefRCr;
+    float   RefGCb;
+    float   RefGCr;
+    float   RefBCb;
+    float   RefBCr;
+} REF_TRANSFORM;
+
 xf86CrtcPtr
 radeon_xv_pick_best_crtc(ScrnInfoPtr pScrn,
 			 int x1, int x2, int y1, int y2);


More information about the xorg-commit mailing list