xf86-video-ati: Branch 'master' - 3 commits

Alex Deucher agd5f at kemper.freedesktop.org
Wed May 13 13:15:18 PDT 2009


 src/r600_exa.c                   |   93 ++---
 src/r600_shader.c                |  620 ++++++++++++++++++++++++++++++++++++---
 src/r600_textured_videofuncs.c   |   22 +
 src/radeon_commonfuncs.c         |  306 +++++++++++++++----
 src/radeon_exa_render.c          |   99 +++++-
 src/radeon_reg.h                 |   13 
 src/radeon_textured_videofuncs.c |   40 +-
 7 files changed, 1002 insertions(+), 191 deletions(-)

New commits:
commit fa09b058c7a17689989e600ffd465856a058579d
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Wed May 13 16:10:40 2009 -0400

    R6xx/R7xx Xv: normalize texture coordinates in the vertex shader

diff --git a/src/r600_shader.c b/src/r600_shader.c
index fba8dcb..ceabad8 100644
--- a/src/r600_shader.c
+++ b/src/r600_shader.c
@@ -457,7 +457,7 @@ int R600_xv_vs(RADEONChipFamily ChipSet, uint32_t* shader)
     int i = 0;
 
     /* 0 */
-    shader[i++] = CF_DWORD0(ADDR(4));
+    shader[i++] = CF_DWORD0(ADDR(6));
     shader[i++] = CF_DWORD1(POP_COUNT(0),
                             CF_CONST(0),
                             COND(SQ_CF_COND_ACTIVE),
@@ -468,7 +468,22 @@ int R600_xv_vs(RADEONChipFamily ChipSet, uint32_t* shader)
                             CF_INST(SQ_CF_INST_VTX),
                             WHOLE_QUAD_MODE(0),
                             BARRIER(1));
-    /* 1 */
+
+    /* 1 - ALU */
+    shader[i++] = CF_ALU_DWORD0(ADDR(4),
+				KCACHE_BANK0(0),
+				KCACHE_BANK1(0),
+				KCACHE_MODE0(SQ_CF_KCACHE_NOP));
+    shader[i++] = CF_ALU_DWORD1(KCACHE_MODE1(SQ_CF_KCACHE_NOP),
+				KCACHE_ADDR0(0),
+				KCACHE_ADDR1(0),
+				I_COUNT(2),
+				USES_WATERFALL(0),
+				CF_INST(SQ_CF_INST_ALU),
+				WHOLE_QUAD_MODE(0),
+				BARRIER(1));
+
+    /* 2 */
     shader[i++] = CF_ALLOC_IMP_EXP_DWORD0(ARRAY_BASE(CF_POS0),
                                           TYPE(SQ_EXPORT_POS),
                                           RW_GPR(1),
@@ -486,7 +501,7 @@ int R600_xv_vs(RADEONChipFamily ChipSet, uint32_t* shader)
                                                CF_INST(SQ_CF_INST_EXPORT_DONE),
                                                WHOLE_QUAD_MODE(0),
                                                BARRIER(1));
-    /* 2 */
+    /* 3 */
     shader[i++] = CF_ALLOC_IMP_EXP_DWORD0(ARRAY_BASE(0),
                                           TYPE(SQ_EXPORT_PARAM),
                                           RW_GPR(0),
@@ -504,9 +519,63 @@ int R600_xv_vs(RADEONChipFamily ChipSet, uint32_t* shader)
                                                CF_INST(SQ_CF_INST_EXPORT_DONE),
                                                WHOLE_QUAD_MODE(0),
                                                BARRIER(0));
-    shader[i++] = 0x00000000;
-    shader[i++] = 0x00000000;
-    /* 4/5 */
+
+
+    /* 4 texX / w */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(0),
+                             SRC0_REL(ABSOLUTE),
+                             SRC0_ELEM(ELEM_X),
+                             SRC0_NEG(0),
+                             SRC1_SEL(256),
+                             SRC1_REL(ABSOLUTE),
+                             SRC1_ELEM(ELEM_X),
+                             SRC1_NEG(0),
+                             INDEX_MODE(SQ_INDEX_AR_X),
+                             PRED_SEL(SQ_PRED_SEL_OFF),
+                             LAST(0));
+    shader[i++] = ALU_DWORD1_OP2(ChipSet,
+                                 SRC0_ABS(0),
+                                 SRC1_ABS(0),
+                                 UPDATE_EXECUTE_MASK(0),
+                                 UPDATE_PRED(0),
+                                 WRITE_MASK(1),
+                                 FOG_MERGE(0),
+                                 OMOD(SQ_ALU_OMOD_OFF),
+                                 ALU_INST(SQ_OP2_INST_MUL),
+                                 BANK_SWIZZLE(SQ_ALU_VEC_012),
+                                 DST_GPR(0),
+                                 DST_REL(ABSOLUTE),
+                                 DST_ELEM(ELEM_X),
+                                 CLAMP(1));
+
+    /* 5 texY / h */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(0),
+                             SRC0_REL(ABSOLUTE),
+                             SRC0_ELEM(ELEM_Y),
+                             SRC0_NEG(0),
+                             SRC1_SEL(256),
+                             SRC1_REL(ABSOLUTE),
+                             SRC1_ELEM(ELEM_Y),
+                             SRC1_NEG(0),
+                             INDEX_MODE(SQ_INDEX_AR_X),
+                             PRED_SEL(SQ_PRED_SEL_OFF),
+                             LAST(1));
+    shader[i++] = ALU_DWORD1_OP2(ChipSet,
+                                 SRC0_ABS(0),
+                                 SRC1_ABS(0),
+                                 UPDATE_EXECUTE_MASK(0),
+                                 UPDATE_PRED(0),
+                                 WRITE_MASK(1),
+                                 FOG_MERGE(0),
+                                 OMOD(SQ_ALU_OMOD_OFF),
+                                 ALU_INST(SQ_OP2_INST_MUL),
+                                 BANK_SWIZZLE(SQ_ALU_VEC_012),
+                                 DST_GPR(0),
+                                 DST_REL(ABSOLUTE),
+                                 DST_ELEM(ELEM_Y),
+                                 CLAMP(1));
+
+    /* 6/7 */
     shader[i++] = VTX_DWORD0(VTX_INST(SQ_VTX_INST_FETCH),
                              FETCH_TYPE(SQ_VTX_FETCH_VERTEX_DATA),
                              FETCH_WHOLE_QUAD(0),
@@ -531,7 +600,7 @@ int R600_xv_vs(RADEONChipFamily ChipSet, uint32_t* shader)
                              CONST_BUF_NO_STRIDE(0),
                              MEGA_FETCH(1));
     shader[i++] = VTX_DWORD_PAD;
-    /* 6/7 */
+    /* 8/9 */
     shader[i++] = VTX_DWORD0(VTX_INST(SQ_VTX_INST_FETCH),
                              FETCH_TYPE(SQ_VTX_FETCH_VERTEX_DATA),
                              FETCH_WHOLE_QUAD(0),
diff --git a/src/r600_textured_videofuncs.c b/src/r600_textured_videofuncs.c
index 23e7f40..6af0949 100644
--- a/src/r600_textured_videofuncs.c
+++ b/src/r600_textured_videofuncs.c
@@ -160,6 +160,7 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
     int ref = pPriv->transform_index;
     Bool needgamma = FALSE;
     float ps_alu_consts[12];
+    float vs_alu_consts[4];
 
     cont = RTFContrast(pPriv->contrast);
     bright = RTFBrightness(pPriv->brightness);
@@ -521,6 +522,15 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
     EREG(accel_state->ib, SPI_INTERP_CONTROL_0,                0);
 
 
+    vs_alu_consts[0] = 1.0 / pPriv->w;
+    vs_alu_consts[1] = 1.0 / pPriv->h;
+    vs_alu_consts[2] = 0.0;
+    vs_alu_consts[3] = 0.0;
+
+    /* VS alu constants */
+    set_alu_consts(pScrn, accel_state->ib, SQ_ALU_CONSTANT_vs,
+		   sizeof(vs_alu_consts) / SQ_ALU_CONSTANT_offset, vs_alu_consts);
+
     if (pPriv->vsync) {
 	xf86CrtcPtr crtc = radeon_xv_pick_best_crtc(pScrn,
 						    pPriv->drw_x,
@@ -571,18 +581,18 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 
 	vb[0] = (float)dstX;
 	vb[1] = (float)dstY;
-	vb[2] = (float)srcX / pPriv->w;
-	vb[3] = (float)srcY / pPriv->h;
+	vb[2] = (float)srcX;
+	vb[3] = (float)srcY;
 
 	vb[4] = (float)dstX;
 	vb[5] = (float)(dstY + dsth);
-	vb[6] = (float)srcX / pPriv->w;
-	vb[7] = (float)(srcY + srch) / pPriv->h;
+	vb[6] = (float)srcX;
+	vb[7] = (float)(srcY + srch);
 
 	vb[8] = (float)(dstX + dstw);
 	vb[9] = (float)(dstY + dsth);
-	vb[10] = (float)(srcX + srcw) / pPriv->w;
-	vb[11] = (float)(srcY + srch) / pPriv->h;
+	vb[10] = (float)(srcX + srcw);
+	vb[11] = (float)(srcY + srch);
 
 	accel_state->vb_index += 3;
 
commit 026b6f820d6caea17d2a082193e850713d5770a8
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Wed May 13 15:48:32 2009 -0400

    R6xx/R7xx: do EXA transforms in the vertex shader

diff --git a/src/r600_exa.c b/src/r600_exa.c
index 2dc33a8..18831f7 100644
--- a/src/r600_exa.c
+++ b/src/r600_exa.c
@@ -924,17 +924,6 @@ do {					\
 
 #define xFixedToFloat(f) (((float) (f)) / 65536)
 
-static inline void transformPoint(PictTransform *transform, xPointFixed *point)
-{
-    PictVector v;
-    v.vector[0] = point->x;
-    v.vector[1] = point->y;
-    v.vector[2] = xFixed1;
-    PictureTransformPoint(transform, &v);
-    point->x = v.vector[0];
-    point->y = v.vector[1];
-}
-
 struct blendinfo {
     Bool dst_alpha;
     Bool src_alpha;
@@ -1099,6 +1088,7 @@ static Bool R600TextureSetup(PicturePtr pPict, PixmapPtr pPix,
     tex_resource_t  tex_res;
     tex_sampler_t   tex_samp;
     int pix_r, pix_g, pix_b, pix_a;
+    float vs_alu_consts[8];
 
     CLEAR (tex_res);
     CLEAR (tex_samp);
@@ -1118,9 +1108,6 @@ static Bool R600TextureSetup(PicturePtr pPict, PixmapPtr pPix,
 	    break;
     }
 
-    accel_state->texW[unit] = w;
-    accel_state->texH[unit] = h;
-
     /* ErrorF("Tex %d setup %dx%d\n", unit, w, h);  */
 
     /* flush texture cache */
@@ -1294,9 +1281,34 @@ static Bool R600TextureSetup(PicturePtr pPict, PixmapPtr pPix,
     if (pPict->transform != 0) {
 	accel_state->is_transform[unit] = TRUE;
 	accel_state->transform[unit] = pPict->transform;
-    } else
+
+	vs_alu_consts[0] = xFixedToFloat(pPict->transform->matrix[0][0]);
+	vs_alu_consts[1] = xFixedToFloat(pPict->transform->matrix[0][1]);
+	vs_alu_consts[2] = xFixedToFloat(pPict->transform->matrix[0][2]);
+	vs_alu_consts[3] = 1.0 / w;
+
+	vs_alu_consts[4] = xFixedToFloat(pPict->transform->matrix[1][0]);
+	vs_alu_consts[5] = xFixedToFloat(pPict->transform->matrix[1][1]);
+	vs_alu_consts[6] = xFixedToFloat(pPict->transform->matrix[1][2]);
+	vs_alu_consts[7] = 1.0 / h;
+    } else {
 	accel_state->is_transform[unit] = FALSE;
 
+	vs_alu_consts[0] = 1.0;
+	vs_alu_consts[1] = 0.0;
+	vs_alu_consts[2] = 0.0;
+	vs_alu_consts[3] = 1.0 / w;
+
+	vs_alu_consts[4] = 0.0;
+	vs_alu_consts[5] = 1.0;
+	vs_alu_consts[6] = 0.0;
+	vs_alu_consts[7] = 1.0 / h;
+    }
+
+    /* VS alu constants */
+    set_alu_consts(pScrn, accel_state->ib, SQ_ALU_CONSTANT_vs + (unit * 2),
+		   sizeof(vs_alu_consts) / SQ_ALU_CONSTANT_offset, vs_alu_consts);
+
     return TRUE;
 }
 
@@ -1586,14 +1598,6 @@ static void R600Composite(PixmapPtr pDst,
     srcBottomRight.x = IntToxFixed(srcX + w);
     srcBottomRight.y = IntToxFixed(srcY + h);
 
-    /* XXX do transform in vertex shader */
-    if (accel_state->is_transform[0]) {
-	transformPoint(accel_state->transform[0], &srcTopLeft);
-	transformPoint(accel_state->transform[0], &srcTopRight);
-	transformPoint(accel_state->transform[0], &srcBottomLeft);
-	transformPoint(accel_state->transform[0], &srcBottomRight);
-    }
-
     if (accel_state->has_mask) {
 	xPointFixed maskTopLeft, maskTopRight, maskBottomLeft, maskBottomRight;
 
@@ -1616,33 +1620,26 @@ static void R600Composite(PixmapPtr pDst,
 	maskBottomRight.x = IntToxFixed(maskX + w);
 	maskBottomRight.y = IntToxFixed(maskY + h);
 
-	if (accel_state->is_transform[1]) {
-	    transformPoint(accel_state->transform[1], &maskTopLeft);
-	    transformPoint(accel_state->transform[1], &maskTopRight);
-	    transformPoint(accel_state->transform[1], &maskBottomLeft);
-	    transformPoint(accel_state->transform[1], &maskBottomRight);
-	}
-
 	vb[0] = (float)dstX;
 	vb[1] = (float)dstY;
-	vb[2] = xFixedToFloat(srcTopLeft.x) / accel_state->texW[0];
-	vb[3] = xFixedToFloat(srcTopLeft.y) / accel_state->texH[0];
-	vb[4] = xFixedToFloat(maskTopLeft.x) / accel_state->texW[1];
-	vb[5] = xFixedToFloat(maskTopLeft.y) / accel_state->texH[1];
+	vb[2] = xFixedToFloat(srcTopLeft.x);
+	vb[3] = xFixedToFloat(srcTopLeft.y);
+	vb[4] = xFixedToFloat(maskTopLeft.x);
+	vb[5] = xFixedToFloat(maskTopLeft.y);
 
 	vb[6] = (float)dstX;
 	vb[7] = (float)(dstY + h);
-	vb[8] = xFixedToFloat(srcBottomLeft.x) / accel_state->texW[0];
-	vb[9] = xFixedToFloat(srcBottomLeft.y) / accel_state->texH[0];
-	vb[10] = xFixedToFloat(maskBottomLeft.x) / accel_state->texW[1];
-	vb[11] = xFixedToFloat(maskBottomLeft.y) / accel_state->texH[1];
+	vb[8] = xFixedToFloat(srcBottomLeft.x);
+	vb[9] = xFixedToFloat(srcBottomLeft.y);
+	vb[10] = xFixedToFloat(maskBottomLeft.x);
+	vb[11] = xFixedToFloat(maskBottomLeft.y);
 
 	vb[12] = (float)(dstX + w);
 	vb[13] = (float)(dstY + h);
-	vb[14] = xFixedToFloat(srcBottomRight.x) / accel_state->texW[0];
-	vb[15] = xFixedToFloat(srcBottomRight.y) / accel_state->texH[0];
-	vb[16] = xFixedToFloat(maskBottomRight.x) / accel_state->texW[1];
-	vb[17] = xFixedToFloat(maskBottomRight.y) / accel_state->texH[1];
+	vb[14] = xFixedToFloat(srcBottomRight.x);
+	vb[15] = xFixedToFloat(srcBottomRight.y);
+	vb[16] = xFixedToFloat(maskBottomRight.x);
+	vb[17] = xFixedToFloat(maskBottomRight.y);
 
     } else {
 	if (((accel_state->vb_index + 3) * 16) > (accel_state->ib->total / 2)) {
@@ -1657,18 +1654,18 @@ static void R600Composite(PixmapPtr pDst,
 
 	vb[0] = (float)dstX;
 	vb[1] = (float)dstY;
-	vb[2] = xFixedToFloat(srcTopLeft.x) / accel_state->texW[0];
-	vb[3] = xFixedToFloat(srcTopLeft.y) / accel_state->texH[0];
+	vb[2] = xFixedToFloat(srcTopLeft.x);
+	vb[3] = xFixedToFloat(srcTopLeft.y);
 
 	vb[4] = (float)dstX;
 	vb[5] = (float)(dstY + h);
-	vb[6] = xFixedToFloat(srcBottomLeft.x) / accel_state->texW[0];
-	vb[7] = xFixedToFloat(srcBottomLeft.y) / accel_state->texH[0];
+	vb[6] = xFixedToFloat(srcBottomLeft.x);
+	vb[7] = xFixedToFloat(srcBottomLeft.y);
 
 	vb[8] = (float)(dstX + w);
 	vb[9] = (float)(dstY + h);
-	vb[10] = xFixedToFloat(srcBottomRight.x) / accel_state->texW[0];
-	vb[11] = xFixedToFloat(srcBottomRight.y) / accel_state->texH[0];
+	vb[10] = xFixedToFloat(srcBottomRight.x);
+	vb[11] = xFixedToFloat(srcBottomRight.y);
     }
 
     accel_state->vb_index += 3;
diff --git a/src/r600_shader.c b/src/r600_shader.c
index 0a820cf..fba8dcb 100644
--- a/src/r600_shader.c
+++ b/src/r600_shader.c
@@ -1322,7 +1322,7 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
                             WHOLE_QUAD_MODE(0),
                             BARRIER(0));
     /* 1 */
-    shader[i++] = CF_DWORD0(ADDR(14));
+    shader[i++] = CF_DWORD0(ADDR(28));
     shader[i++] = CF_DWORD1(POP_COUNT(0),
                             CF_CONST(0),
                             COND(SQ_CF_COND_NOT_BOOL),
@@ -1346,7 +1346,7 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
                             WHOLE_QUAD_MODE(0),
                             BARRIER(1));
     /* 3 - mask sub */
-    shader[i++] = CF_DWORD0(ADDR(8));
+    shader[i++] = CF_DWORD0(ADDR(22));
     shader[i++] = CF_DWORD1(POP_COUNT(0),
 			    CF_CONST(0),
 			    COND(SQ_CF_COND_ACTIVE),
@@ -1357,7 +1357,22 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
 			    CF_INST(SQ_CF_INST_VTX),
 			    WHOLE_QUAD_MODE(0),
 			    BARRIER(1));
-    /* 4 - dst */
+
+    /* 4 - ALU */
+    shader[i++] = CF_ALU_DWORD0(ADDR(9),
+				KCACHE_BANK0(0),
+				KCACHE_BANK1(0),
+				KCACHE_MODE0(SQ_CF_KCACHE_NOP));
+    shader[i++] = CF_ALU_DWORD1(KCACHE_MODE1(SQ_CF_KCACHE_NOP),
+				KCACHE_ADDR0(0),
+				KCACHE_ADDR1(0),
+				I_COUNT(12),
+				USES_WATERFALL(0),
+				CF_INST(SQ_CF_INST_ALU),
+				WHOLE_QUAD_MODE(0),
+				BARRIER(1));
+
+    /* 5 - dst */
     shader[i++] = CF_ALLOC_IMP_EXP_DWORD0(ARRAY_BASE(CF_POS0),
 					  TYPE(SQ_EXPORT_POS),
 					  RW_GPR(2),
@@ -1366,8 +1381,8 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
 					  ELEM_SIZE(0));
     shader[i++] = CF_ALLOC_IMP_EXP_DWORD1_SWIZ(SRC_SEL_X(SQ_SEL_X),
 					       SRC_SEL_Y(SQ_SEL_Y),
-					       SRC_SEL_Z(SQ_SEL_Z),
-					       SRC_SEL_W(SQ_SEL_W),
+					       SRC_SEL_Z(SQ_SEL_0),
+					       SRC_SEL_W(SQ_SEL_1),
 					       R6xx_ELEM_LOOP(0),
 					       BURST_COUNT(1),
 					       END_OF_PROGRAM(0),
@@ -1375,7 +1390,7 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
 					       CF_INST(SQ_CF_INST_EXPORT_DONE),
 					       WHOLE_QUAD_MODE(0),
 					       BARRIER(1));
-    /* 5 - src */
+    /* 6 - src */
     shader[i++] = CF_ALLOC_IMP_EXP_DWORD0(ARRAY_BASE(0),
 					  TYPE(SQ_EXPORT_PARAM),
 					  RW_GPR(1),
@@ -1384,8 +1399,8 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
 					  ELEM_SIZE(0));
     shader[i++] = CF_ALLOC_IMP_EXP_DWORD1_SWIZ(SRC_SEL_X(SQ_SEL_X),
 					       SRC_SEL_Y(SQ_SEL_Y),
-					       SRC_SEL_Z(SQ_SEL_Z),
-					       SRC_SEL_W(SQ_SEL_W),
+					       SRC_SEL_Z(SQ_SEL_0),
+					       SRC_SEL_W(SQ_SEL_1),
 					       R6xx_ELEM_LOOP(0),
 					       BURST_COUNT(1),
 					       END_OF_PROGRAM(0),
@@ -1393,7 +1408,7 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
 					       CF_INST(SQ_CF_INST_EXPORT),
 					       WHOLE_QUAD_MODE(0),
 					       BARRIER(0));
-    /* 6 - mask */
+    /* 7 - mask */
     shader[i++] = CF_ALLOC_IMP_EXP_DWORD0(ARRAY_BASE(1),
 					  TYPE(SQ_EXPORT_PARAM),
 					  RW_GPR(0),
@@ -1402,8 +1417,8 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
 					  ELEM_SIZE(0));
     shader[i++] = CF_ALLOC_IMP_EXP_DWORD1_SWIZ(SRC_SEL_X(SQ_SEL_X),
 					       SRC_SEL_Y(SQ_SEL_Y),
-					       SRC_SEL_Z(SQ_SEL_Z),
-					       SRC_SEL_W(SQ_SEL_W),
+					       SRC_SEL_Z(SQ_SEL_0),
+					       SRC_SEL_W(SQ_SEL_1),
 					       R6xx_ELEM_LOOP(0),
 					       BURST_COUNT(1),
 					       END_OF_PROGRAM(0),
@@ -1411,7 +1426,7 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
 					       CF_INST(SQ_CF_INST_EXPORT_DONE),
 					       WHOLE_QUAD_MODE(0),
 					       BARRIER(0));
-    /* 7 */
+    /* 8 */
     shader[i++] = CF_DWORD0(ADDR(0));
     shader[i++] = CF_DWORD1(POP_COUNT(0),
 			    CF_CONST(0),
@@ -1423,7 +1438,301 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
 			    CF_INST(SQ_CF_INST_RETURN),
 			    WHOLE_QUAD_MODE(0),
 			    BARRIER(1));
-    /* 8/9 - dst */
+
+
+    /* 9 srcX MAD */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(256),
+                             SRC0_REL(ABSOLUTE),
+                             SRC0_ELEM(ELEM_Y),
+                             SRC0_NEG(0),
+                             SRC1_SEL(1),
+                             SRC1_REL(ABSOLUTE),
+                             SRC1_ELEM(ELEM_Y),
+                             SRC1_NEG(0),
+                             INDEX_MODE(SQ_INDEX_LOOP),
+                             PRED_SEL(SQ_PRED_SEL_OFF),
+                             LAST(1));
+    shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(256),
+                                 SRC2_REL(ABSOLUTE),
+                                 SRC2_ELEM(ELEM_Z),
+                                 SRC2_NEG(0),
+                                 ALU_INST(SQ_OP3_INST_MULADD),
+                                 BANK_SWIZZLE(SQ_ALU_VEC_012),
+                                 DST_GPR(1),
+                                 DST_REL(ABSOLUTE),
+                                 DST_ELEM(ELEM_Z),
+                                 CLAMP(0));
+    /* 10 srcY MAD */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(257),
+                             SRC0_REL(ABSOLUTE),
+                             SRC0_ELEM(ELEM_Y),
+                             SRC0_NEG(0),
+                             SRC1_SEL(1),
+                             SRC1_REL(ABSOLUTE),
+                             SRC1_ELEM(ELEM_Y),
+                             SRC1_NEG(0),
+                             INDEX_MODE(SQ_INDEX_LOOP),
+                             PRED_SEL(SQ_PRED_SEL_OFF),
+                             LAST(1));
+    shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(257),
+                                 SRC2_REL(ABSOLUTE),
+                                 SRC2_ELEM(ELEM_Z),
+                                 SRC2_NEG(0),
+                                 ALU_INST(SQ_OP3_INST_MULADD),
+                                 BANK_SWIZZLE(SQ_ALU_VEC_012),
+                                 DST_GPR(1),
+                                 DST_REL(ABSOLUTE),
+                                 DST_ELEM(ELEM_W),
+                                 CLAMP(0));
+
+    /* 11 srcX MAD */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(256),
+                             SRC0_REL(ABSOLUTE),
+                             SRC0_ELEM(ELEM_X),
+                             SRC0_NEG(0),
+                             SRC1_SEL(1),
+                             SRC1_REL(ABSOLUTE),
+                             SRC1_ELEM(ELEM_X),
+                             SRC1_NEG(0),
+                             INDEX_MODE(SQ_INDEX_LOOP),
+                             PRED_SEL(SQ_PRED_SEL_OFF),
+                             LAST(0));
+    shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(1),
+                                 SRC2_REL(ABSOLUTE),
+                                 SRC2_ELEM(ELEM_Z),
+                                 SRC2_NEG(0),
+                                 ALU_INST(SQ_OP3_INST_MULADD),
+                                 BANK_SWIZZLE(SQ_ALU_VEC_012),
+                                 DST_GPR(1),
+                                 DST_REL(ABSOLUTE),
+                                 DST_ELEM(ELEM_X),
+                                 CLAMP(0));
+    /* 12 srcY MAD */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(257),
+                             SRC0_REL(ABSOLUTE),
+                             SRC0_ELEM(ELEM_X),
+                             SRC0_NEG(0),
+                             SRC1_SEL(1),
+                             SRC1_REL(ABSOLUTE),
+                             SRC1_ELEM(ELEM_X),
+                             SRC1_NEG(0),
+                             INDEX_MODE(SQ_INDEX_LOOP),
+                             PRED_SEL(SQ_PRED_SEL_OFF),
+                             LAST(1));
+    shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(1),
+                                 SRC2_REL(ABSOLUTE),
+                                 SRC2_ELEM(ELEM_W),
+                                 SRC2_NEG(0),
+                                 ALU_INST(SQ_OP3_INST_MULADD),
+                                 BANK_SWIZZLE(SQ_ALU_VEC_012),
+                                 DST_GPR(1),
+                                 DST_REL(ABSOLUTE),
+                                 DST_ELEM(ELEM_Y),
+                                 CLAMP(0));
+
+    /* 13 maskX MAD */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(258),
+                             SRC0_REL(ABSOLUTE),
+                             SRC0_ELEM(ELEM_Y),
+                             SRC0_NEG(0),
+                             SRC1_SEL(0),
+                             SRC1_REL(ABSOLUTE),
+                             SRC1_ELEM(ELEM_Y),
+                             SRC1_NEG(0),
+                             INDEX_MODE(SQ_INDEX_LOOP),
+                             PRED_SEL(SQ_PRED_SEL_OFF),
+                             LAST(1));
+    shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(258),
+                                 SRC2_REL(ABSOLUTE),
+                                 SRC2_ELEM(ELEM_Z),
+                                 SRC2_NEG(0),
+                                 ALU_INST(SQ_OP3_INST_MULADD),
+                                 BANK_SWIZZLE(SQ_ALU_VEC_012),
+                                 DST_GPR(0),
+                                 DST_REL(ABSOLUTE),
+                                 DST_ELEM(ELEM_Z),
+                                 CLAMP(0));
+
+    /* 14 maskY MAD */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(259),
+                             SRC0_REL(ABSOLUTE),
+                             SRC0_ELEM(ELEM_Y),
+                             SRC0_NEG(0),
+                             SRC1_SEL(0),
+                             SRC1_REL(ABSOLUTE),
+                             SRC1_ELEM(ELEM_Y),
+                             SRC1_NEG(0),
+                             INDEX_MODE(SQ_INDEX_LOOP),
+                             PRED_SEL(SQ_PRED_SEL_OFF),
+                             LAST(1));
+    shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(259),
+                                 SRC2_REL(ABSOLUTE),
+                                 SRC2_ELEM(ELEM_Z),
+                                 SRC2_NEG(0),
+                                 ALU_INST(SQ_OP3_INST_MULADD),
+                                 BANK_SWIZZLE(SQ_ALU_VEC_012),
+                                 DST_GPR(0),
+                                 DST_REL(ABSOLUTE),
+                                 DST_ELEM(ELEM_W),
+                                 CLAMP(0));
+
+    /* 15 maskX MAD */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(258),
+                             SRC0_REL(ABSOLUTE),
+                             SRC0_ELEM(ELEM_X),
+                             SRC0_NEG(0),
+                             SRC1_SEL(0),
+                             SRC1_REL(ABSOLUTE),
+                             SRC1_ELEM(ELEM_X),
+                             SRC1_NEG(0),
+                             INDEX_MODE(SQ_INDEX_LOOP),
+                             PRED_SEL(SQ_PRED_SEL_OFF),
+                             LAST(0));
+    shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(0),
+                                 SRC2_REL(ABSOLUTE),
+                                 SRC2_ELEM(ELEM_Z),
+                                 SRC2_NEG(0),
+                                 ALU_INST(SQ_OP3_INST_MULADD),
+                                 BANK_SWIZZLE(SQ_ALU_VEC_012),
+                                 DST_GPR(0),
+                                 DST_REL(ABSOLUTE),
+                                 DST_ELEM(ELEM_X),
+                                 CLAMP(0));
+    /* 16 maskY MAD */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(259),
+                             SRC0_REL(ABSOLUTE),
+                             SRC0_ELEM(ELEM_X),
+                             SRC0_NEG(0),
+                             SRC1_SEL(0),
+                             SRC1_REL(ABSOLUTE),
+                             SRC1_ELEM(ELEM_X),
+                             SRC1_NEG(0),
+                             INDEX_MODE(SQ_INDEX_LOOP),
+                             PRED_SEL(SQ_PRED_SEL_OFF),
+                             LAST(1));
+    shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(0),
+                                 SRC2_REL(ABSOLUTE),
+                                 SRC2_ELEM(ELEM_W),
+                                 SRC2_NEG(0),
+                                 ALU_INST(SQ_OP3_INST_MULADD),
+                                 BANK_SWIZZLE(SQ_ALU_VEC_012),
+                                 DST_GPR(0),
+                                 DST_REL(ABSOLUTE),
+                                 DST_ELEM(ELEM_Y),
+                                 CLAMP(0));
+
+    /* 17 srcX / w */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(1),
+                             SRC0_REL(ABSOLUTE),
+                             SRC0_ELEM(ELEM_X),
+                             SRC0_NEG(0),
+                             SRC1_SEL(256),
+                             SRC1_REL(ABSOLUTE),
+                             SRC1_ELEM(ELEM_W),
+                             SRC1_NEG(0),
+                             INDEX_MODE(SQ_INDEX_AR_X),
+                             PRED_SEL(SQ_PRED_SEL_OFF),
+                             LAST(1));
+    shader[i++] = ALU_DWORD1_OP2(ChipSet,
+                                 SRC0_ABS(0),
+                                 SRC1_ABS(0),
+                                 UPDATE_EXECUTE_MASK(0),
+                                 UPDATE_PRED(0),
+                                 WRITE_MASK(1),
+                                 FOG_MERGE(0),
+                                 OMOD(SQ_ALU_OMOD_OFF),
+                                 ALU_INST(SQ_OP2_INST_MUL),
+                                 BANK_SWIZZLE(SQ_ALU_VEC_012),
+                                 DST_GPR(1),
+                                 DST_REL(ABSOLUTE),
+                                 DST_ELEM(ELEM_X),
+                                 CLAMP(1));
+
+    /* 18 srcY / h */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(1),
+                             SRC0_REL(ABSOLUTE),
+                             SRC0_ELEM(ELEM_Y),
+                             SRC0_NEG(0),
+                             SRC1_SEL(257),
+                             SRC1_REL(ABSOLUTE),
+                             SRC1_ELEM(ELEM_W),
+                             SRC1_NEG(0),
+                             INDEX_MODE(SQ_INDEX_AR_X),
+                             PRED_SEL(SQ_PRED_SEL_OFF),
+                             LAST(1));
+    shader[i++] = ALU_DWORD1_OP2(ChipSet,
+                                 SRC0_ABS(0),
+                                 SRC1_ABS(0),
+                                 UPDATE_EXECUTE_MASK(0),
+                                 UPDATE_PRED(0),
+                                 WRITE_MASK(1),
+                                 FOG_MERGE(0),
+                                 OMOD(SQ_ALU_OMOD_OFF),
+                                 ALU_INST(SQ_OP2_INST_MUL),
+                                 BANK_SWIZZLE(SQ_ALU_VEC_012),
+                                 DST_GPR(1),
+                                 DST_REL(ABSOLUTE),
+                                 DST_ELEM(ELEM_Y),
+                                 CLAMP(1));
+
+    /* 19 maskX / w */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(0),
+                             SRC0_REL(ABSOLUTE),
+                             SRC0_ELEM(ELEM_X),
+                             SRC0_NEG(0),
+                             SRC1_SEL(258),
+                             SRC1_REL(ABSOLUTE),
+                             SRC1_ELEM(ELEM_W),
+                             SRC1_NEG(0),
+                             INDEX_MODE(SQ_INDEX_AR_X),
+                             PRED_SEL(SQ_PRED_SEL_OFF),
+                             LAST(1));
+    shader[i++] = ALU_DWORD1_OP2(ChipSet,
+                                 SRC0_ABS(0),
+                                 SRC1_ABS(0),
+                                 UPDATE_EXECUTE_MASK(0),
+                                 UPDATE_PRED(0),
+                                 WRITE_MASK(1),
+                                 FOG_MERGE(0),
+                                 OMOD(SQ_ALU_OMOD_OFF),
+                                 ALU_INST(SQ_OP2_INST_MUL),
+                                 BANK_SWIZZLE(SQ_ALU_VEC_012),
+                                 DST_GPR(0),
+                                 DST_REL(ABSOLUTE),
+                                 DST_ELEM(ELEM_X),
+                                 CLAMP(1));
+
+    /* 20 maskY / h */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(0),
+                             SRC0_REL(ABSOLUTE),
+                             SRC0_ELEM(ELEM_Y),
+                             SRC0_NEG(0),
+                             SRC1_SEL(259),
+                             SRC1_REL(ABSOLUTE),
+                             SRC1_ELEM(ELEM_W),
+                             SRC1_NEG(0),
+                             INDEX_MODE(SQ_INDEX_AR_X),
+                             PRED_SEL(SQ_PRED_SEL_OFF),
+                             LAST(1));
+    shader[i++] = ALU_DWORD1_OP2(ChipSet,
+                                 SRC0_ABS(0),
+                                 SRC1_ABS(0),
+                                 UPDATE_EXECUTE_MASK(0),
+                                 UPDATE_PRED(0),
+                                 WRITE_MASK(1),
+                                 FOG_MERGE(0),
+                                 OMOD(SQ_ALU_OMOD_OFF),
+                                 ALU_INST(SQ_OP2_INST_MUL),
+                                 BANK_SWIZZLE(SQ_ALU_VEC_012),
+                                 DST_GPR(0),
+                                 DST_REL(ABSOLUTE),
+                                 DST_ELEM(ELEM_Y),
+                                 CLAMP(1));
+    /* 21 */
+    shader[i++] = 0x00000000;
+    shader[i++] = 0x00000000;
+
+    /* 22/23 - dst */
     shader[i++] = VTX_DWORD0(VTX_INST(SQ_VTX_INST_FETCH),
 			     FETCH_TYPE(SQ_VTX_FETCH_VERTEX_DATA),
 			     FETCH_WHOLE_QUAD(0),
@@ -1448,7 +1757,7 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
 			     CONST_BUF_NO_STRIDE(0),
 			     MEGA_FETCH(1));
     shader[i++] = VTX_DWORD_PAD;
-    /* 10/11 - src */
+    /* 24/25 - src */
     shader[i++] = VTX_DWORD0(VTX_INST(SQ_VTX_INST_FETCH),
 			     FETCH_TYPE(SQ_VTX_FETCH_VERTEX_DATA),
 			     FETCH_WHOLE_QUAD(0),
@@ -1461,8 +1770,8 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
 				 DST_REL(0),
 				 DST_SEL_X(SQ_SEL_X),
 				 DST_SEL_Y(SQ_SEL_Y),
-				 DST_SEL_Z(SQ_SEL_0),
-				 DST_SEL_W(SQ_SEL_1),
+				 DST_SEL_Z(SQ_SEL_1),
+				 DST_SEL_W(SQ_SEL_0),
 				 USE_CONST_FIELDS(0),
 				 DATA_FORMAT(FMT_32_32_FLOAT), /* xxx */
 				 NUM_FORMAT_ALL(SQ_NUM_FORMAT_NORM), /* xxx */
@@ -1473,7 +1782,7 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
 			     CONST_BUF_NO_STRIDE(0),
 			     MEGA_FETCH(0));
     shader[i++] = VTX_DWORD_PAD;
-    /* 12/13 - mask */
+    /* 26/27 - mask */
     shader[i++] = VTX_DWORD0(VTX_INST(SQ_VTX_INST_FETCH),
 			     FETCH_TYPE(SQ_VTX_FETCH_VERTEX_DATA),
 			     FETCH_WHOLE_QUAD(0),
@@ -1486,8 +1795,8 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
 				 DST_REL(0),
 				 DST_SEL_X(SQ_SEL_X),
 				 DST_SEL_Y(SQ_SEL_Y),
-				 DST_SEL_Z(SQ_SEL_0),
-				 DST_SEL_W(SQ_SEL_1),
+				 DST_SEL_Z(SQ_SEL_1),
+				 DST_SEL_W(SQ_SEL_0),
 				 USE_CONST_FIELDS(0),
 				 DATA_FORMAT(FMT_32_32_FLOAT), /* xxx */
 				 NUM_FORMAT_ALL(SQ_NUM_FORMAT_NORM), /* xxx */
@@ -1499,8 +1808,8 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
 			     MEGA_FETCH(0));
     shader[i++] = VTX_DWORD_PAD;
 
-    /* 14 - non-mask sub */
-    shader[i++] = CF_DWORD0(ADDR(18));
+    /* 28 - non-mask sub */
+    shader[i++] = CF_DWORD0(ADDR(40));
     shader[i++] = CF_DWORD1(POP_COUNT(0),
 			    CF_CONST(0),
 			    COND(SQ_CF_COND_ACTIVE),
@@ -1511,7 +1820,22 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
 			    CF_INST(SQ_CF_INST_VTX),
 			    WHOLE_QUAD_MODE(0),
 			    BARRIER(1));
-    /* 15 - dst */
+
+    /* 29 - ALU */
+    shader[i++] = CF_ALU_DWORD0(ADDR(33),
+				KCACHE_BANK0(0),
+				KCACHE_BANK1(0),
+				KCACHE_MODE0(SQ_CF_KCACHE_NOP));
+    shader[i++] = CF_ALU_DWORD1(KCACHE_MODE1(SQ_CF_KCACHE_NOP),
+				KCACHE_ADDR0(0),
+				KCACHE_ADDR1(0),
+				I_COUNT(6),
+				USES_WATERFALL(0),
+				CF_INST(SQ_CF_INST_ALU),
+				WHOLE_QUAD_MODE(0),
+				BARRIER(1));
+
+    /* 30 - dst */
     shader[i++] = CF_ALLOC_IMP_EXP_DWORD0(ARRAY_BASE(CF_POS0),
 					  TYPE(SQ_EXPORT_POS),
 					  RW_GPR(1),
@@ -1520,8 +1844,8 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
 					  ELEM_SIZE(0));
     shader[i++] = CF_ALLOC_IMP_EXP_DWORD1_SWIZ(SRC_SEL_X(SQ_SEL_X),
 					       SRC_SEL_Y(SQ_SEL_Y),
-					       SRC_SEL_Z(SQ_SEL_Z),
-					       SRC_SEL_W(SQ_SEL_W),
+					       SRC_SEL_Z(SQ_SEL_0),
+					       SRC_SEL_W(SQ_SEL_1),
 					       R6xx_ELEM_LOOP(0),
 					       BURST_COUNT(0),
 					       END_OF_PROGRAM(0),
@@ -1529,7 +1853,7 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
 					       CF_INST(SQ_CF_INST_EXPORT_DONE),
 					       WHOLE_QUAD_MODE(0),
 					       BARRIER(1));
-    /* 16 - src */
+    /* 31 - src */
     shader[i++] = CF_ALLOC_IMP_EXP_DWORD0(ARRAY_BASE(0),
 					  TYPE(SQ_EXPORT_PARAM),
 					  RW_GPR(0),
@@ -1538,8 +1862,8 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
 					  ELEM_SIZE(0));
     shader[i++] = CF_ALLOC_IMP_EXP_DWORD1_SWIZ(SRC_SEL_X(SQ_SEL_X),
 					       SRC_SEL_Y(SQ_SEL_Y),
-					       SRC_SEL_Z(SQ_SEL_Z),
-					       SRC_SEL_W(SQ_SEL_W),
+					       SRC_SEL_Z(SQ_SEL_0),
+					       SRC_SEL_W(SQ_SEL_1),
 					       R6xx_ELEM_LOOP(0),
 					       BURST_COUNT(0),
 					       END_OF_PROGRAM(0),
@@ -1547,7 +1871,7 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
 					       CF_INST(SQ_CF_INST_EXPORT_DONE),
 					       WHOLE_QUAD_MODE(0),
 					       BARRIER(0));
-    /* 17 */
+    /* 32 */
     shader[i++] = CF_DWORD0(ADDR(0));
     shader[i++] = CF_DWORD1(POP_COUNT(0),
 			    CF_CONST(0),
@@ -1559,7 +1883,156 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
 			    CF_INST(SQ_CF_INST_RETURN),
 			    WHOLE_QUAD_MODE(0),
 			    BARRIER(1));
-    /* 18/19 - dst */
+
+
+    /* 33 srcX MAD */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(256),
+                             SRC0_REL(ABSOLUTE),
+                             SRC0_ELEM(ELEM_Y),
+                             SRC0_NEG(0),
+                             SRC1_SEL(0),
+                             SRC1_REL(ABSOLUTE),
+                             SRC1_ELEM(ELEM_Y),
+                             SRC1_NEG(0),
+                             INDEX_MODE(SQ_INDEX_LOOP),
+                             PRED_SEL(SQ_PRED_SEL_OFF),
+                             LAST(1));
+    shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(256),
+                                 SRC2_REL(ABSOLUTE),
+                                 SRC2_ELEM(ELEM_Z),
+                                 SRC2_NEG(0),
+                                 ALU_INST(SQ_OP3_INST_MULADD),
+                                 BANK_SWIZZLE(SQ_ALU_VEC_012),
+                                 DST_GPR(0),
+                                 DST_REL(ABSOLUTE),
+                                 DST_ELEM(ELEM_Z),
+                                 CLAMP(0));
+    /* 34 srcY MAD */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(257),
+                             SRC0_REL(ABSOLUTE),
+                             SRC0_ELEM(ELEM_Y),
+                             SRC0_NEG(0),
+                             SRC1_SEL(0),
+                             SRC1_REL(ABSOLUTE),
+                             SRC1_ELEM(ELEM_Y),
+                             SRC1_NEG(0),
+                             INDEX_MODE(SQ_INDEX_LOOP),
+                             PRED_SEL(SQ_PRED_SEL_OFF),
+                             LAST(1));
+    shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(257),
+                                 SRC2_REL(ABSOLUTE),
+                                 SRC2_ELEM(ELEM_Z),
+                                 SRC2_NEG(0),
+                                 ALU_INST(SQ_OP3_INST_MULADD),
+                                 BANK_SWIZZLE(SQ_ALU_VEC_012),
+                                 DST_GPR(0),
+                                 DST_REL(ABSOLUTE),
+                                 DST_ELEM(ELEM_W),
+                                 CLAMP(0));
+
+    /* 35 srcX MAD */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(256),
+                             SRC0_REL(ABSOLUTE),
+                             SRC0_ELEM(ELEM_X),
+                             SRC0_NEG(0),
+                             SRC1_SEL(0),
+                             SRC1_REL(ABSOLUTE),
+                             SRC1_ELEM(ELEM_X),
+                             SRC1_NEG(0),
+                             INDEX_MODE(SQ_INDEX_LOOP),
+                             PRED_SEL(SQ_PRED_SEL_OFF),
+                             LAST(0));
+    shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(0),
+                                 SRC2_REL(ABSOLUTE),
+                                 SRC2_ELEM(ELEM_Z),
+                                 SRC2_NEG(0),
+                                 ALU_INST(SQ_OP3_INST_MULADD),
+                                 BANK_SWIZZLE(SQ_ALU_VEC_012),
+                                 DST_GPR(0),
+                                 DST_REL(ABSOLUTE),
+                                 DST_ELEM(ELEM_X),
+                                 CLAMP(0));
+    /* 36 srcY MAD */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(257),
+                             SRC0_REL(ABSOLUTE),
+                             SRC0_ELEM(ELEM_X),
+                             SRC0_NEG(0),
+                             SRC1_SEL(0),
+                             SRC1_REL(ABSOLUTE),
+                             SRC1_ELEM(ELEM_X),
+                             SRC1_NEG(0),
+                             INDEX_MODE(SQ_INDEX_LOOP),
+                             PRED_SEL(SQ_PRED_SEL_OFF),
+                             LAST(1));
+    shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(0),
+                                 SRC2_REL(ABSOLUTE),
+                                 SRC2_ELEM(ELEM_W),
+                                 SRC2_NEG(0),
+                                 ALU_INST(SQ_OP3_INST_MULADD),
+                                 BANK_SWIZZLE(SQ_ALU_VEC_012),
+                                 DST_GPR(0),
+                                 DST_REL(ABSOLUTE),
+                                 DST_ELEM(ELEM_Y),
+                                 CLAMP(0));
+    /* 37 srcX / w */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(0),
+                             SRC0_REL(ABSOLUTE),
+                             SRC0_ELEM(ELEM_X),
+                             SRC0_NEG(0),
+                             SRC1_SEL(256),
+                             SRC1_REL(ABSOLUTE),
+                             SRC1_ELEM(ELEM_W),
+                             SRC1_NEG(0),
+                             INDEX_MODE(SQ_INDEX_AR_X),
+                             PRED_SEL(SQ_PRED_SEL_OFF),
+                             LAST(1));
+    shader[i++] = ALU_DWORD1_OP2(ChipSet,
+                                 SRC0_ABS(0),
+                                 SRC1_ABS(0),
+                                 UPDATE_EXECUTE_MASK(0),
+                                 UPDATE_PRED(0),
+                                 WRITE_MASK(1),
+                                 FOG_MERGE(0),
+                                 OMOD(SQ_ALU_OMOD_OFF),
+                                 ALU_INST(SQ_OP2_INST_MUL),
+                                 BANK_SWIZZLE(SQ_ALU_VEC_012),
+                                 DST_GPR(0),
+                                 DST_REL(ABSOLUTE),
+                                 DST_ELEM(ELEM_X),
+                                 CLAMP(1));
+
+    /* 38 srcY / h */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(0),
+                             SRC0_REL(ABSOLUTE),
+                             SRC0_ELEM(ELEM_Y),
+                             SRC0_NEG(0),
+                             SRC1_SEL(257),
+                             SRC1_REL(ABSOLUTE),
+                             SRC1_ELEM(ELEM_W),
+                             SRC1_NEG(0),
+                             INDEX_MODE(SQ_INDEX_AR_X),
+                             PRED_SEL(SQ_PRED_SEL_OFF),
+                             LAST(1));
+    shader[i++] = ALU_DWORD1_OP2(ChipSet,
+                                 SRC0_ABS(0),
+                                 SRC1_ABS(0),
+                                 UPDATE_EXECUTE_MASK(0),
+                                 UPDATE_PRED(0),
+                                 WRITE_MASK(1),
+                                 FOG_MERGE(0),
+                                 OMOD(SQ_ALU_OMOD_OFF),
+                                 ALU_INST(SQ_OP2_INST_MUL),
+                                 BANK_SWIZZLE(SQ_ALU_VEC_012),
+                                 DST_GPR(0),
+                                 DST_REL(ABSOLUTE),
+                                 DST_ELEM(ELEM_Y),
+                                 CLAMP(1));
+
+    /* 39 */
+    shader[i++] = 0x00000000;
+    shader[i++] = 0x00000000;
+
+    /* 40/41 - dst */
     shader[i++] = VTX_DWORD0(VTX_INST(SQ_VTX_INST_FETCH),
 			     FETCH_TYPE(SQ_VTX_FETCH_VERTEX_DATA),
 			     FETCH_WHOLE_QUAD(0),
@@ -1584,7 +2057,7 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
 			     CONST_BUF_NO_STRIDE(0),
 			     MEGA_FETCH(1));
     shader[i++] = VTX_DWORD_PAD;
-    /* 20/21 - src */
+    /* 42/43 - src */
     shader[i++] = VTX_DWORD0(VTX_INST(SQ_VTX_INST_FETCH),
 			     FETCH_TYPE(SQ_VTX_FETCH_VERTEX_DATA),
 			     FETCH_WHOLE_QUAD(0),
@@ -1597,8 +2070,8 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
 				 DST_REL(0),
 				 DST_SEL_X(SQ_SEL_X),
 				 DST_SEL_Y(SQ_SEL_Y),
-				 DST_SEL_Z(SQ_SEL_0),
-				 DST_SEL_W(SQ_SEL_1),
+				 DST_SEL_Z(SQ_SEL_1),
+				 DST_SEL_W(SQ_SEL_0),
 				 USE_CONST_FIELDS(0),
 				 DATA_FORMAT(FMT_32_32_FLOAT), /* xxx */
 				 NUM_FORMAT_ALL(SQ_NUM_FORMAT_NORM), /* xxx */
commit cd89241396d1931b04cfbdd8d553be16dbf9c360
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Tue May 12 17:30:02 2009 -0400

    R3xx-R5xx: do EXA transforms in the vertex shader

diff --git a/src/radeon_commonfuncs.c b/src/radeon_commonfuncs.c
index ba358ab..28bb6e5 100644
--- a/src/radeon_commonfuncs.c
+++ b/src/radeon_commonfuncs.c
@@ -220,10 +220,10 @@ static void FUNC_NAME(RADEONInit3DEngine)(ScrnInfoPtr pScrn)
 
 	/* pre-load the vertex shaders */
 	if (info->accel_state->has_tcl) {
-	    /* exa mask/Xv bicubic shader program */
-	    BEGIN_ACCEL(13);
-	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_INDX_REG, 0);
-	    /* PVS inst 0 */
+	    BEGIN_ACCEL(37);
+	    /* exa composite shader program */
+	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_INDX_REG, R300_PVS_VECTOR_INST_INDEX(0));
+	    /* PVS inst 0 - dst X,Y */
 	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG,
 			  (R300_PVS_DST_OPCODE(R300_VE_ADD) |
 			   R300_PVS_DST_REG_TYPE(R300_PVS_DST_REG_OUT) |
@@ -235,8 +235,8 @@ static void FUNC_NAME(RADEONInit3DEngine)(ScrnInfoPtr pScrn)
 			   R300_PVS_SRC_OFFSET(0) |
 			   R300_PVS_SRC_SWIZZLE_X(R300_PVS_SRC_SELECT_X) |
 			   R300_PVS_SRC_SWIZZLE_Y(R300_PVS_SRC_SELECT_Y) |
-			   R300_PVS_SRC_SWIZZLE_Z(R300_PVS_SRC_SELECT_Z) |
-			   R300_PVS_SRC_SWIZZLE_W(R300_PVS_SRC_SELECT_W)));
+			   R300_PVS_SRC_SWIZZLE_Z(R300_PVS_SRC_SELECT_FORCE_0) |
+			   R300_PVS_SRC_SWIZZLE_W(R300_PVS_SRC_SELECT_FORCE_1)));
 	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG,
 			  (R300_PVS_SRC_REG_TYPE(R300_PVS_SRC_REG_INPUT) |
 			   R300_PVS_SRC_OFFSET(0) |
@@ -252,20 +252,26 @@ static void FUNC_NAME(RADEONInit3DEngine)(ScrnInfoPtr pScrn)
 			   R300_PVS_SRC_SWIZZLE_Z(R300_PVS_SRC_SELECT_FORCE_0) |
 			   R300_PVS_SRC_SWIZZLE_W(R300_PVS_SRC_SELECT_FORCE_0)));
 
-	    /* PVS inst 1 */
+	    /* PVS inst 1 - src X */
 	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG,
-			  (R300_PVS_DST_OPCODE(R300_VE_ADD) |
-			   R300_PVS_DST_REG_TYPE(R300_PVS_DST_REG_OUT) |
-			   R300_PVS_DST_OFFSET(1) |
-			   R300_PVS_DST_WE_X | R300_PVS_DST_WE_Y |
-			   R300_PVS_DST_WE_Z | R300_PVS_DST_WE_W));
+			  (R300_PVS_DST_OPCODE(R300_VE_DOT_PRODUCT) |
+			   R300_PVS_DST_REG_TYPE(R300_PVS_DST_REG_TEMPORARY) |
+			   R300_PVS_DST_OFFSET(0) |
+			   R300_PVS_DST_WE_X));
 	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG,
 			  (R300_PVS_SRC_REG_TYPE(R300_PVS_SRC_REG_INPUT) |
 			   R300_PVS_SRC_OFFSET(6) |
 			   R300_PVS_SRC_SWIZZLE_X(R300_PVS_SRC_SELECT_X) |
 			   R300_PVS_SRC_SWIZZLE_Y(R300_PVS_SRC_SELECT_Y) |
+			   R300_PVS_SRC_SWIZZLE_Z(R300_PVS_SRC_SELECT_FORCE_1) |
+			   R300_PVS_SRC_SWIZZLE_W(R300_PVS_SRC_SELECT_FORCE_0)));
+	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG,
+			  (R300_PVS_SRC_REG_TYPE(R300_PVS_SRC_REG_CONSTANT) |
+			   R300_PVS_SRC_OFFSET(0) |
+			   R300_PVS_SRC_SWIZZLE_X(R300_PVS_SRC_SELECT_X) |
+			   R300_PVS_SRC_SWIZZLE_Y(R300_PVS_SRC_SELECT_Y) |
 			   R300_PVS_SRC_SWIZZLE_Z(R300_PVS_SRC_SELECT_Z) |
-			   R300_PVS_SRC_SWIZZLE_W(R300_PVS_SRC_SELECT_W)));
+			   R300_PVS_SRC_SWIZZLE_W(R300_PVS_SRC_SELECT_FORCE_0)));
 	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG,
 			  (R300_PVS_SRC_REG_TYPE(R300_PVS_SRC_REG_INPUT) |
 			   R300_PVS_SRC_OFFSET(6) |
@@ -273,6 +279,27 @@ static void FUNC_NAME(RADEONInit3DEngine)(ScrnInfoPtr pScrn)
 			   R300_PVS_SRC_SWIZZLE_Y(R300_PVS_SRC_SELECT_FORCE_0) |
 			   R300_PVS_SRC_SWIZZLE_Z(R300_PVS_SRC_SELECT_FORCE_0) |
 			   R300_PVS_SRC_SWIZZLE_W(R300_PVS_SRC_SELECT_FORCE_0)));
+
+	    /* PVS inst 2 - src Y */
+	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG,
+			  (R300_PVS_DST_OPCODE(R300_VE_DOT_PRODUCT) |
+			   R300_PVS_DST_REG_TYPE(R300_PVS_DST_REG_TEMPORARY) |
+			   R300_PVS_DST_OFFSET(0) |
+			   R300_PVS_DST_WE_Y));
+	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG,
+			  (R300_PVS_SRC_REG_TYPE(R300_PVS_SRC_REG_INPUT) |
+			   R300_PVS_SRC_OFFSET(6) |
+			   R300_PVS_SRC_SWIZZLE_X(R300_PVS_SRC_SELECT_X) |
+			   R300_PVS_SRC_SWIZZLE_Y(R300_PVS_SRC_SELECT_Y) |
+			   R300_PVS_SRC_SWIZZLE_Z(R300_PVS_SRC_SELECT_FORCE_1) |
+			   R300_PVS_SRC_SWIZZLE_W(R300_PVS_SRC_SELECT_FORCE_0)));
+	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG,
+			  (R300_PVS_SRC_REG_TYPE(R300_PVS_SRC_REG_CONSTANT) |
+			   R300_PVS_SRC_OFFSET(1) |
+			   R300_PVS_SRC_SWIZZLE_X(R300_PVS_SRC_SELECT_X) |
+			   R300_PVS_SRC_SWIZZLE_Y(R300_PVS_SRC_SELECT_Y) |
+			   R300_PVS_SRC_SWIZZLE_Z(R300_PVS_SRC_SELECT_Z) |
+			   R300_PVS_SRC_SWIZZLE_W(R300_PVS_SRC_SELECT_FORCE_0)));
 	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG,
 			  (R300_PVS_SRC_REG_TYPE(R300_PVS_SRC_REG_INPUT) |
 			   R300_PVS_SRC_OFFSET(6) |
@@ -281,82 +308,138 @@ static void FUNC_NAME(RADEONInit3DEngine)(ScrnInfoPtr pScrn)
 			   R300_PVS_SRC_SWIZZLE_Z(R300_PVS_SRC_SELECT_FORCE_0) |
 			   R300_PVS_SRC_SWIZZLE_W(R300_PVS_SRC_SELECT_FORCE_0)));
 
-	    /* PVS inst 2 */
+	    /* PVS inst 3 - src X / w */
 	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG,
-			  (R300_PVS_DST_OPCODE(R300_VE_ADD) |
+			  (R300_PVS_DST_OPCODE(R300_VE_MULTIPLY) |
 			   R300_PVS_DST_REG_TYPE(R300_PVS_DST_REG_OUT) |
-			   R300_PVS_DST_OFFSET(2) |
-			   R300_PVS_DST_WE_X | R300_PVS_DST_WE_Y |
-			   R300_PVS_DST_WE_Z | R300_PVS_DST_WE_W));
+			   R300_PVS_DST_OFFSET(1) |
+			   R300_PVS_DST_WE_X));
 	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG,
-			  (R300_PVS_SRC_REG_TYPE(R300_PVS_SRC_REG_INPUT) |
-			   R300_PVS_SRC_OFFSET(7) |
+			  (R300_PVS_SRC_REG_TYPE(R300_PVS_SRC_REG_TEMPORARY) |
+			   R300_PVS_SRC_OFFSET(0) |
 			   R300_PVS_SRC_SWIZZLE_X(R300_PVS_SRC_SELECT_X) |
-			   R300_PVS_SRC_SWIZZLE_Y(R300_PVS_SRC_SELECT_Y) |
-			   R300_PVS_SRC_SWIZZLE_Z(R300_PVS_SRC_SELECT_Z) |
-			   R300_PVS_SRC_SWIZZLE_W(R300_PVS_SRC_SELECT_W)));
+			   R300_PVS_SRC_SWIZZLE_Y(R300_PVS_SRC_SELECT_FORCE_0) |
+			   R300_PVS_SRC_SWIZZLE_Z(R300_PVS_SRC_SELECT_FORCE_0) |
+			   R300_PVS_SRC_SWIZZLE_W(R300_PVS_SRC_SELECT_FORCE_0)));
+	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG,
+			  (R300_PVS_SRC_REG_TYPE(R300_PVS_SRC_REG_CONSTANT) |
+			   R300_PVS_SRC_OFFSET(0) |
+			   R300_PVS_SRC_SWIZZLE_X(R300_PVS_SRC_SELECT_W) |
+			   R300_PVS_SRC_SWIZZLE_Y(R300_PVS_SRC_SELECT_FORCE_0) |
+			   R300_PVS_SRC_SWIZZLE_Z(R300_PVS_SRC_SELECT_FORCE_0) |
+			   R300_PVS_SRC_SWIZZLE_W(R300_PVS_SRC_SELECT_FORCE_0)));
 	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG,
 			  (R300_PVS_SRC_REG_TYPE(R300_PVS_SRC_REG_INPUT) |
-			   R300_PVS_SRC_OFFSET(7) |
+			   R300_PVS_SRC_OFFSET(6) |
 			   R300_PVS_SRC_SWIZZLE_X(R300_PVS_SRC_SELECT_FORCE_0) |
 			   R300_PVS_SRC_SWIZZLE_Y(R300_PVS_SRC_SELECT_FORCE_0) |
 			   R300_PVS_SRC_SWIZZLE_Z(R300_PVS_SRC_SELECT_FORCE_0) |
 			   R300_PVS_SRC_SWIZZLE_W(R300_PVS_SRC_SELECT_FORCE_0)));
+
+	    /* PVS inst 4 - src y / h */
+	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG,
+			  (R300_PVS_DST_OPCODE(R300_VE_MULTIPLY) |
+			   R300_PVS_DST_REG_TYPE(R300_PVS_DST_REG_OUT) |
+			   R300_PVS_DST_OFFSET(1) |
+			   R300_PVS_DST_WE_Y));
+	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG,
+			  (R300_PVS_SRC_REG_TYPE(R300_PVS_SRC_REG_TEMPORARY) |
+			   R300_PVS_SRC_OFFSET(0) |
+			   R300_PVS_SRC_SWIZZLE_X(R300_PVS_SRC_SELECT_FORCE_0) |
+			   R300_PVS_SRC_SWIZZLE_Y(R300_PVS_SRC_SELECT_Y) |
+			   R300_PVS_SRC_SWIZZLE_Z(R300_PVS_SRC_SELECT_FORCE_0) |
+			   R300_PVS_SRC_SWIZZLE_W(R300_PVS_SRC_SELECT_FORCE_0)));
+	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG,
+			  (R300_PVS_SRC_REG_TYPE(R300_PVS_SRC_REG_CONSTANT) |
+			   R300_PVS_SRC_OFFSET(1) |
+			   R300_PVS_SRC_SWIZZLE_X(R300_PVS_SRC_SELECT_FORCE_0) |
+			   R300_PVS_SRC_SWIZZLE_Y(R300_PVS_SRC_SELECT_W) |
+			   R300_PVS_SRC_SWIZZLE_Z(R300_PVS_SRC_SELECT_FORCE_0) |
+			   R300_PVS_SRC_SWIZZLE_W(R300_PVS_SRC_SELECT_FORCE_0)));
 	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG,
 			  (R300_PVS_SRC_REG_TYPE(R300_PVS_SRC_REG_INPUT) |
-			   R300_PVS_SRC_OFFSET(7) |
+			   R300_PVS_SRC_OFFSET(6) |
 			   R300_PVS_SRC_SWIZZLE_X(R300_PVS_SRC_SELECT_FORCE_0) |
 			   R300_PVS_SRC_SWIZZLE_Y(R300_PVS_SRC_SELECT_FORCE_0) |
 			   R300_PVS_SRC_SWIZZLE_Z(R300_PVS_SRC_SELECT_FORCE_0) |
 			   R300_PVS_SRC_SWIZZLE_W(R300_PVS_SRC_SELECT_FORCE_0)));
-	    FINISH_ACCEL();
 
-	    BEGIN_ACCEL(9);
-	    /* exa no mask instruction */
-	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_INDX_REG, 3);
-	    /* PVS inst 0 */
+	    /* PVS inst 5 - mask X */
 	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG,
-			  (R300_PVS_DST_OPCODE(R300_VE_ADD) |
-			   R300_PVS_DST_REG_TYPE(R300_PVS_DST_REG_OUT) |
+			  (R300_PVS_DST_OPCODE(R300_VE_DOT_PRODUCT) |
+			   R300_PVS_DST_REG_TYPE(R300_PVS_DST_REG_TEMPORARY) |
 			   R300_PVS_DST_OFFSET(0) |
-			   R300_PVS_DST_WE_X | R300_PVS_DST_WE_Y |
-			   R300_PVS_DST_WE_Z | R300_PVS_DST_WE_W));
+			   R300_PVS_DST_WE_Z));
 	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG,
 			  (R300_PVS_SRC_REG_TYPE(R300_PVS_SRC_REG_INPUT) |
-			   R300_PVS_SRC_OFFSET(0) |
+			   R300_PVS_SRC_OFFSET(7) |
+			   R300_PVS_SRC_SWIZZLE_X(R300_PVS_SRC_SELECT_X) |
+			   R300_PVS_SRC_SWIZZLE_Y(R300_PVS_SRC_SELECT_Y) |
+			   R300_PVS_SRC_SWIZZLE_Z(R300_PVS_SRC_SELECT_FORCE_1) |
+			   R300_PVS_SRC_SWIZZLE_W(R300_PVS_SRC_SELECT_FORCE_0)));
+	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG,
+			  (R300_PVS_SRC_REG_TYPE(R300_PVS_SRC_REG_CONSTANT) |
+			   R300_PVS_SRC_OFFSET(2) |
 			   R300_PVS_SRC_SWIZZLE_X(R300_PVS_SRC_SELECT_X) |
 			   R300_PVS_SRC_SWIZZLE_Y(R300_PVS_SRC_SELECT_Y) |
 			   R300_PVS_SRC_SWIZZLE_Z(R300_PVS_SRC_SELECT_Z) |
-			   R300_PVS_SRC_SWIZZLE_W(R300_PVS_SRC_SELECT_W)));
+			   R300_PVS_SRC_SWIZZLE_W(R300_PVS_SRC_SELECT_FORCE_0)));
 	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG,
 			  (R300_PVS_SRC_REG_TYPE(R300_PVS_SRC_REG_INPUT) |
-			   R300_PVS_SRC_OFFSET(0) |
+			   R300_PVS_SRC_OFFSET(7) |
 			   R300_PVS_SRC_SWIZZLE_X(R300_PVS_SRC_SELECT_FORCE_0) |
 			   R300_PVS_SRC_SWIZZLE_Y(R300_PVS_SRC_SELECT_FORCE_0) |
 			   R300_PVS_SRC_SWIZZLE_Z(R300_PVS_SRC_SELECT_FORCE_0) |
 			   R300_PVS_SRC_SWIZZLE_W(R300_PVS_SRC_SELECT_FORCE_0)));
+
+	    /* PVS inst 6 - mask Y */
+	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG,
+			  (R300_PVS_DST_OPCODE(R300_VE_DOT_PRODUCT) |
+			   R300_PVS_DST_REG_TYPE(R300_PVS_DST_REG_TEMPORARY) |
+			   R300_PVS_DST_OFFSET(0) |
+			   R300_PVS_DST_WE_W));
 	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG,
 			  (R300_PVS_SRC_REG_TYPE(R300_PVS_SRC_REG_INPUT) |
-			   R300_PVS_SRC_OFFSET(0) |
+			   R300_PVS_SRC_OFFSET(7) |
+			   R300_PVS_SRC_SWIZZLE_X(R300_PVS_SRC_SELECT_X) |
+			   R300_PVS_SRC_SWIZZLE_Y(R300_PVS_SRC_SELECT_Y) |
+			   R300_PVS_SRC_SWIZZLE_Z(R300_PVS_SRC_SELECT_FORCE_1) |
+			   R300_PVS_SRC_SWIZZLE_W(R300_PVS_SRC_SELECT_FORCE_0)));
+	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG,
+			  (R300_PVS_SRC_REG_TYPE(R300_PVS_SRC_REG_CONSTANT) |
+			   R300_PVS_SRC_OFFSET(3) |
+			   R300_PVS_SRC_SWIZZLE_X(R300_PVS_SRC_SELECT_X) |
+			   R300_PVS_SRC_SWIZZLE_Y(R300_PVS_SRC_SELECT_Y) |
+			   R300_PVS_SRC_SWIZZLE_Z(R300_PVS_SRC_SELECT_Z) |
+			   R300_PVS_SRC_SWIZZLE_W(R300_PVS_SRC_SELECT_FORCE_0)));
+	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG,
+			  (R300_PVS_SRC_REG_TYPE(R300_PVS_SRC_REG_INPUT) |
+			   R300_PVS_SRC_OFFSET(7) |
 			   R300_PVS_SRC_SWIZZLE_X(R300_PVS_SRC_SELECT_FORCE_0) |
 			   R300_PVS_SRC_SWIZZLE_Y(R300_PVS_SRC_SELECT_FORCE_0) |
 			   R300_PVS_SRC_SWIZZLE_Z(R300_PVS_SRC_SELECT_FORCE_0) |
 			   R300_PVS_SRC_SWIZZLE_W(R300_PVS_SRC_SELECT_FORCE_0)));
 
-	    /* PVS inst 1 */
+	    /* PVS inst 7 - mask X / w */
 	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG,
-			  (R300_PVS_DST_OPCODE(R300_VE_ADD) |
+			  (R300_PVS_DST_OPCODE(R300_VE_MULTIPLY) |
 			   R300_PVS_DST_REG_TYPE(R300_PVS_DST_REG_OUT) |
-			   R300_PVS_DST_OFFSET(1) |
-			   R300_PVS_DST_WE_X | R300_PVS_DST_WE_Y |
-			   R300_PVS_DST_WE_Z | R300_PVS_DST_WE_W));
+			   R300_PVS_DST_OFFSET(2) |
+			   R300_PVS_DST_WE_X));
 	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG,
-			  (R300_PVS_SRC_REG_TYPE(R300_PVS_SRC_REG_INPUT) |
-			   R300_PVS_SRC_OFFSET(6) |
-			   R300_PVS_SRC_SWIZZLE_X(R300_PVS_SRC_SELECT_X) |
-			   R300_PVS_SRC_SWIZZLE_Y(R300_PVS_SRC_SELECT_Y) |
-			   R300_PVS_SRC_SWIZZLE_Z(R300_PVS_SRC_SELECT_Z) |
-			   R300_PVS_SRC_SWIZZLE_W(R300_PVS_SRC_SELECT_W)));
+			  (R300_PVS_SRC_REG_TYPE(R300_PVS_SRC_REG_TEMPORARY) |
+			   R300_PVS_SRC_OFFSET(0) |
+			   R300_PVS_SRC_SWIZZLE_X(R300_PVS_SRC_SELECT_Z) |
+			   R300_PVS_SRC_SWIZZLE_Y(R300_PVS_SRC_SELECT_FORCE_0) |
+			   R300_PVS_SRC_SWIZZLE_Z(R300_PVS_SRC_SELECT_FORCE_0) |
+			   R300_PVS_SRC_SWIZZLE_W(R300_PVS_SRC_SELECT_FORCE_0)));
+	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG,
+			  (R300_PVS_SRC_REG_TYPE(R300_PVS_SRC_REG_CONSTANT) |
+			   R300_PVS_SRC_OFFSET(2) |
+			   R300_PVS_SRC_SWIZZLE_X(R300_PVS_SRC_SELECT_W) |
+			   R300_PVS_SRC_SWIZZLE_Y(R300_PVS_SRC_SELECT_FORCE_0) |
+			   R300_PVS_SRC_SWIZZLE_Z(R300_PVS_SRC_SELECT_FORCE_0) |
+			   R300_PVS_SRC_SWIZZLE_W(R300_PVS_SRC_SELECT_FORCE_0)));
 	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG,
 			  (R300_PVS_SRC_REG_TYPE(R300_PVS_SRC_REG_INPUT) |
 			   R300_PVS_SRC_OFFSET(6) |
@@ -364,6 +447,27 @@ static void FUNC_NAME(RADEONInit3DEngine)(ScrnInfoPtr pScrn)
 			   R300_PVS_SRC_SWIZZLE_Y(R300_PVS_SRC_SELECT_FORCE_0) |
 			   R300_PVS_SRC_SWIZZLE_Z(R300_PVS_SRC_SELECT_FORCE_0) |
 			   R300_PVS_SRC_SWIZZLE_W(R300_PVS_SRC_SELECT_FORCE_0)));
+
+	    /* PVS inst 8 - mask y / h */
+	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG,
+			  (R300_PVS_DST_OPCODE(R300_VE_MULTIPLY) |
+			   R300_PVS_DST_REG_TYPE(R300_PVS_DST_REG_OUT) |
+			   R300_PVS_DST_OFFSET(2) |
+			   R300_PVS_DST_WE_Y));
+	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG,
+			  (R300_PVS_SRC_REG_TYPE(R300_PVS_SRC_REG_TEMPORARY) |
+			   R300_PVS_SRC_OFFSET(0) |
+			   R300_PVS_SRC_SWIZZLE_X(R300_PVS_SRC_SELECT_FORCE_0) |
+			   R300_PVS_SRC_SWIZZLE_Y(R300_PVS_SRC_SELECT_W) |
+			   R300_PVS_SRC_SWIZZLE_Z(R300_PVS_SRC_SELECT_FORCE_0) |
+			   R300_PVS_SRC_SWIZZLE_W(R300_PVS_SRC_SELECT_FORCE_0)));
+	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG,
+			  (R300_PVS_SRC_REG_TYPE(R300_PVS_SRC_REG_CONSTANT) |
+			   R300_PVS_SRC_OFFSET(3) |
+			   R300_PVS_SRC_SWIZZLE_X(R300_PVS_SRC_SELECT_FORCE_0) |
+			   R300_PVS_SRC_SWIZZLE_Y(R300_PVS_SRC_SELECT_W) |
+			   R300_PVS_SRC_SWIZZLE_Z(R300_PVS_SRC_SELECT_FORCE_0) |
+			   R300_PVS_SRC_SWIZZLE_W(R300_PVS_SRC_SELECT_FORCE_0)));
 	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG,
 			  (R300_PVS_SRC_REG_TYPE(R300_PVS_SRC_REG_INPUT) |
 			   R300_PVS_SRC_OFFSET(6) |
@@ -375,7 +479,7 @@ static void FUNC_NAME(RADEONInit3DEngine)(ScrnInfoPtr pScrn)
 
 	    /* Xv shader program */
 	    BEGIN_ACCEL(9);
-	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_INDX_REG, 5);
+	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_INDX_REG, R300_PVS_VECTOR_INST_INDEX(9));
 
 	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG,
 			  (R300_PVS_DST_OPCODE(R300_VE_ADD) |
@@ -388,8 +492,8 @@ static void FUNC_NAME(RADEONInit3DEngine)(ScrnInfoPtr pScrn)
 			   R300_PVS_SRC_OFFSET(0) |
 			   R300_PVS_SRC_SWIZZLE_X(R300_PVS_SRC_SELECT_X) |
 			   R300_PVS_SRC_SWIZZLE_Y(R300_PVS_SRC_SELECT_Y) |
-			   R300_PVS_SRC_SWIZZLE_Z(R300_PVS_SRC_SELECT_Z) |
-			   R300_PVS_SRC_SWIZZLE_W(R300_PVS_SRC_SELECT_W)));
+			   R300_PVS_SRC_SWIZZLE_Z(R300_PVS_SRC_SELECT_FORCE_0) |
+			   R300_PVS_SRC_SWIZZLE_W(R300_PVS_SRC_SELECT_FORCE_1)));
 	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG,
 			  (R300_PVS_SRC_REG_TYPE(R300_PVS_SRC_REG_INPUT) |
 			   R300_PVS_SRC_OFFSET(0) |
@@ -409,15 +513,14 @@ static void FUNC_NAME(RADEONInit3DEngine)(ScrnInfoPtr pScrn)
 			  (R300_PVS_DST_OPCODE(R300_VE_ADD) |
 			   R300_PVS_DST_REG_TYPE(R300_PVS_DST_REG_OUT) |
 			   R300_PVS_DST_OFFSET(1) |
-			   R300_PVS_DST_WE_X | R300_PVS_DST_WE_Y |
-			   R300_PVS_DST_WE_Z | R300_PVS_DST_WE_W));
+			   R300_PVS_DST_WE_X | R300_PVS_DST_WE_Y));
 	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG,
 			  (R300_PVS_SRC_REG_TYPE(R300_PVS_SRC_REG_INPUT) |
 			   R300_PVS_SRC_OFFSET(6) |
 			   R300_PVS_SRC_SWIZZLE_X(R300_PVS_SRC_SELECT_X) |
 			   R300_PVS_SRC_SWIZZLE_Y(R300_PVS_SRC_SELECT_Y) |
-			   R300_PVS_SRC_SWIZZLE_Z(R300_PVS_SRC_SELECT_Z) |
-			   R300_PVS_SRC_SWIZZLE_W(R300_PVS_SRC_SELECT_W)));
+			   R300_PVS_SRC_SWIZZLE_Z(R300_PVS_SRC_SELECT_FORCE_0) |
+			   R300_PVS_SRC_SWIZZLE_W(R300_PVS_SRC_SELECT_FORCE_1)));
 	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG,
 			  (R300_PVS_SRC_REG_TYPE(R300_PVS_SRC_REG_INPUT) |
 			   R300_PVS_SRC_OFFSET(6) |
@@ -433,6 +536,97 @@ static void FUNC_NAME(RADEONInit3DEngine)(ScrnInfoPtr pScrn)
 			   R300_PVS_SRC_SWIZZLE_Z(R300_PVS_SRC_SELECT_FORCE_0) |
 			   R300_PVS_SRC_SWIZZLE_W(R300_PVS_SRC_SELECT_FORCE_0)));
 	    FINISH_ACCEL();
+
+            /* Xv bicubic shader program */
+	    BEGIN_ACCEL(13);
+            OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_INDX_REG, R300_PVS_VECTOR_INST_INDEX(11));
+            /* PVS inst 0 */
+            OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG,
+                          (R300_PVS_DST_OPCODE(R300_VE_ADD) |
+                           R300_PVS_DST_REG_TYPE(R300_PVS_DST_REG_OUT) |
+			   R300_PVS_DST_OFFSET(0) |
+                           R300_PVS_DST_WE_X | R300_PVS_DST_WE_Y |
+                           R300_PVS_DST_WE_Z | R300_PVS_DST_WE_W));
+	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG,
+			  (R300_PVS_SRC_REG_TYPE(R300_PVS_SRC_REG_INPUT) |
+                           R300_PVS_SRC_OFFSET(0) |
+                           R300_PVS_SRC_SWIZZLE_X(R300_PVS_SRC_SELECT_X) |
+                           R300_PVS_SRC_SWIZZLE_Y(R300_PVS_SRC_SELECT_Y) |
+                           R300_PVS_SRC_SWIZZLE_Z(R300_PVS_SRC_SELECT_FORCE_0) |
+                           R300_PVS_SRC_SWIZZLE_W(R300_PVS_SRC_SELECT_FORCE_1)));
+	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG,
+                          (R300_PVS_SRC_REG_TYPE(R300_PVS_SRC_REG_INPUT) |
+                           R300_PVS_SRC_OFFSET(0) |
+                           R300_PVS_SRC_SWIZZLE_X(R300_PVS_SRC_SELECT_FORCE_0) |
+                           R300_PVS_SRC_SWIZZLE_Y(R300_PVS_SRC_SELECT_FORCE_0) |
+			   R300_PVS_SRC_SWIZZLE_Z(R300_PVS_SRC_SELECT_FORCE_0) |
+                           R300_PVS_SRC_SWIZZLE_W(R300_PVS_SRC_SELECT_FORCE_0)));
+	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG,
+			  (R300_PVS_SRC_REG_TYPE(R300_PVS_SRC_REG_INPUT) |
+                           R300_PVS_SRC_OFFSET(0) |
+                           R300_PVS_SRC_SWIZZLE_X(R300_PVS_SRC_SELECT_FORCE_0) |
+			   R300_PVS_SRC_SWIZZLE_Y(R300_PVS_SRC_SELECT_FORCE_0) |
+                           R300_PVS_SRC_SWIZZLE_Z(R300_PVS_SRC_SELECT_FORCE_0) |
+                           R300_PVS_SRC_SWIZZLE_W(R300_PVS_SRC_SELECT_FORCE_0)));
+
+            /* PVS inst 1 */
+            OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG,
+                          (R300_PVS_DST_OPCODE(R300_VE_ADD) |
+                           R300_PVS_DST_REG_TYPE(R300_PVS_DST_REG_OUT) |
+                           R300_PVS_DST_OFFSET(1) |
+                           R300_PVS_DST_WE_X | R300_PVS_DST_WE_Y |
+                           R300_PVS_DST_WE_Z | R300_PVS_DST_WE_W));
+            OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG,
+                          (R300_PVS_SRC_REG_TYPE(R300_PVS_SRC_REG_INPUT) |
+                           R300_PVS_SRC_OFFSET(6) |
+                           R300_PVS_SRC_SWIZZLE_X(R300_PVS_SRC_SELECT_X) |
+                           R300_PVS_SRC_SWIZZLE_Y(R300_PVS_SRC_SELECT_Y) |
+                           R300_PVS_SRC_SWIZZLE_Z(R300_PVS_SRC_SELECT_FORCE_0) |
+                           R300_PVS_SRC_SWIZZLE_W(R300_PVS_SRC_SELECT_FORCE_1)));
+	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG,
+                          (R300_PVS_SRC_REG_TYPE(R300_PVS_SRC_REG_INPUT) |
+                           R300_PVS_SRC_OFFSET(6) |
+                           R300_PVS_SRC_SWIZZLE_X(R300_PVS_SRC_SELECT_FORCE_0) |
+                           R300_PVS_SRC_SWIZZLE_Y(R300_PVS_SRC_SELECT_FORCE_0) |
+			   R300_PVS_SRC_SWIZZLE_Z(R300_PVS_SRC_SELECT_FORCE_0) |
+                           R300_PVS_SRC_SWIZZLE_W(R300_PVS_SRC_SELECT_FORCE_0)));
+	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG,
+			  (R300_PVS_SRC_REG_TYPE(R300_PVS_SRC_REG_INPUT) |
+                           R300_PVS_SRC_OFFSET(6) |
+                           R300_PVS_SRC_SWIZZLE_X(R300_PVS_SRC_SELECT_FORCE_0) |
+			   R300_PVS_SRC_SWIZZLE_Y(R300_PVS_SRC_SELECT_FORCE_0) |
+                           R300_PVS_SRC_SWIZZLE_Z(R300_PVS_SRC_SELECT_FORCE_0) |
+                           R300_PVS_SRC_SWIZZLE_W(R300_PVS_SRC_SELECT_FORCE_0)));
+
+            /* PVS inst 2 */
+            OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG,
+                          (R300_PVS_DST_OPCODE(R300_VE_ADD) |
+                           R300_PVS_DST_REG_TYPE(R300_PVS_DST_REG_OUT) |
+                           R300_PVS_DST_OFFSET(2) |
+                           R300_PVS_DST_WE_X | R300_PVS_DST_WE_Y |
+                           R300_PVS_DST_WE_Z | R300_PVS_DST_WE_W));
+            OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG,
+                          (R300_PVS_SRC_REG_TYPE(R300_PVS_SRC_REG_INPUT) |
+                           R300_PVS_SRC_OFFSET(7) |
+                           R300_PVS_SRC_SWIZZLE_X(R300_PVS_SRC_SELECT_X) |
+                           R300_PVS_SRC_SWIZZLE_Y(R300_PVS_SRC_SELECT_Y) |
+                           R300_PVS_SRC_SWIZZLE_Z(R300_PVS_SRC_SELECT_FORCE_0) |
+                           R300_PVS_SRC_SWIZZLE_W(R300_PVS_SRC_SELECT_FORCE_1)));
+            OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG,
+                          (R300_PVS_SRC_REG_TYPE(R300_PVS_SRC_REG_INPUT) |
+                           R300_PVS_SRC_OFFSET(7) |
+                           R300_PVS_SRC_SWIZZLE_X(R300_PVS_SRC_SELECT_FORCE_0) |
+                           R300_PVS_SRC_SWIZZLE_Y(R300_PVS_SRC_SELECT_FORCE_0) |
+                           R300_PVS_SRC_SWIZZLE_Z(R300_PVS_SRC_SELECT_FORCE_0) |
+                           R300_PVS_SRC_SWIZZLE_W(R300_PVS_SRC_SELECT_FORCE_0)));
+            OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG,
+                          (R300_PVS_SRC_REG_TYPE(R300_PVS_SRC_REG_INPUT) |
+                           R300_PVS_SRC_OFFSET(7) |
+                           R300_PVS_SRC_SWIZZLE_X(R300_PVS_SRC_SELECT_FORCE_0) |
+                           R300_PVS_SRC_SWIZZLE_Y(R300_PVS_SRC_SELECT_FORCE_0) |
+                           R300_PVS_SRC_SWIZZLE_Z(R300_PVS_SRC_SELECT_FORCE_0) |
+                           R300_PVS_SRC_SWIZZLE_W(R300_PVS_SRC_SELECT_FORCE_0)));
+            FINISH_ACCEL();
 	}
 
 	/* pre-load the RS instructions */
diff --git a/src/radeon_exa_render.c b/src/radeon_exa_render.c
index 8dbbee9..89c803a 100644
--- a/src/radeon_exa_render.c
+++ b/src/radeon_exa_render.c
@@ -1099,9 +1099,6 @@ static Bool FUNC_NAME(R300TextureSetup)(PicturePtr pPict, PixmapPtr pPix,
      */
     txformat0 |= R300_TXPITCH_EN;
 
-    info->accel_state->texW[unit] = w;
-    info->accel_state->texH[unit] = h;
-
     txfilter = (unit << R300_TX_ID_SHIFT);
 
     if (pPict->repeat) {
@@ -1160,8 +1157,61 @@ static Bool FUNC_NAME(R300TextureSetup)(PicturePtr pPict, PixmapPtr pPix,
     if (pPict->transform != 0) {
 	info->accel_state->is_transform[unit] = TRUE;
 	info->accel_state->transform[unit] = pPict->transform;
+
+	/* setup the PVS consts */
+	if (info->accel_state->has_tcl) {
+	    info->accel_state->texW[unit] = 1;
+	    info->accel_state->texH[unit] = 1;
+	    BEGIN_ACCEL(9);
+	    if (IS_R300_3D)
+		OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_INDX_REG, R300_PVS_VECTOR_CONST_INDEX(unit * 2));
+	    else
+		OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_INDX_REG, R500_PVS_VECTOR_CONST_INDEX(unit * 2));
+
+	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(xFixedToFloat(pPict->transform->matrix[0][0])));
+	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(xFixedToFloat(pPict->transform->matrix[0][1])));
+	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(xFixedToFloat(pPict->transform->matrix[0][2])));
+	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0/w));
+
+	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(xFixedToFloat(pPict->transform->matrix[1][0])));
+	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(xFixedToFloat(pPict->transform->matrix[1][1])));
+	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(xFixedToFloat(pPict->transform->matrix[1][2])));
+	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0/h));
+
+	    FINISH_ACCEL();
+	} else {
+	    info->accel_state->texW[unit] = w;
+	    info->accel_state->texH[unit] = h;
+	}
     } else {
 	info->accel_state->is_transform[unit] = FALSE;
+
+	/* setup the PVS consts */
+	if (info->accel_state->has_tcl) {
+	    info->accel_state->texW[unit] = 1;
+	    info->accel_state->texH[unit] = 1;
+
+	    BEGIN_ACCEL(9);
+	    if (IS_R300_3D)
+		OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_INDX_REG, R300_PVS_VECTOR_CONST_INDEX(unit * 2));
+	    else
+		OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_INDX_REG, R500_PVS_VECTOR_CONST_INDEX(unit * 2));
+
+	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0));
+	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(0.0));
+	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(0.0));
+	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0/w));
+
+	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(0.0));
+	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0));
+	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(0.0));
+	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0/h));
+
+	    FINISH_ACCEL();
+	} else {
+	    info->accel_state->texW[unit] = w;
+	    info->accel_state->texH[unit] = h;
+	}
     }
 
     return TRUE;
@@ -1310,9 +1360,10 @@ static Bool FUNC_NAME(R300PrepareComposite)(int op, PicturePtr pSrcPicture,
     /* setup the VAP */
     if (info->accel_state->has_tcl) {
 	if (pMask)
-	    BEGIN_ACCEL(8);
+	    BEGIN_ACCEL(10);
 	else
-	    BEGIN_ACCEL(7);
+	    BEGIN_ACCEL(9);
+	OUT_ACCEL_REG(R300_VAP_PVS_STATE_FLUSH_REG, 0);
     } else {
 	if (pMask)
 	    BEGIN_ACCEL(6);
@@ -1363,22 +1414,28 @@ static Bool FUNC_NAME(R300PrepareComposite)(int op, PicturePtr pSrcPicture,
 
     /* load the vertex shader
      * We pre-load vertex programs in RADEONInit3DEngine():
-     * - exa no mask
-     * - exa mask
+     * - exa
      * - Xv
+     * - Xv bicubic
      * Here we select the offset of the vertex program we want to use
      */
     if (info->accel_state->has_tcl) {
 	if (pMask) {
+	    /* consts used by vertex shaders */
+	    OUT_ACCEL_REG(R300_VAP_PVS_CONST_CNTL, (R300_PVS_CONST_BASE_OFFSET(0) |
+						    R300_PVS_MAX_CONST_ADDR(3)));
 	    OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_0,
 			  ((0 << R300_PVS_FIRST_INST_SHIFT) |
-			   (2 << R300_PVS_XYZW_VALID_INST_SHIFT) |
-			   (2 << R300_PVS_LAST_INST_SHIFT)));
+			   (8 << R300_PVS_XYZW_VALID_INST_SHIFT) |
+			   (8 << R300_PVS_LAST_INST_SHIFT)));
 	    OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_1,
-			  (2 << R300_PVS_LAST_VTX_SRC_INST_SHIFT));
+			  (8 << R300_PVS_LAST_VTX_SRC_INST_SHIFT));
 	} else {
+	    /* consts used by vertex shaders */
+	    OUT_ACCEL_REG(R300_VAP_PVS_CONST_CNTL, (R300_PVS_CONST_BASE_OFFSET(0) |
+						    R300_PVS_MAX_CONST_ADDR(3)));
 	    OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_0,
-			  ((3 << R300_PVS_FIRST_INST_SHIFT) |
+			  ((0 << R300_PVS_FIRST_INST_SHIFT) |
 			   (4 << R300_PVS_XYZW_VALID_INST_SHIFT) |
 			   (4 << R300_PVS_LAST_INST_SHIFT)));
 	    OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_1,
@@ -2054,10 +2111,12 @@ static void FUNC_NAME(RadeonCompositeTile)(ScrnInfoPtr pScrn,
     srcBottomRight.y = IntToxFixed(srcY + h);
 
     if (info->accel_state->is_transform[0]) {
-	transformPoint(info->accel_state->transform[0], &srcTopLeft);
-	transformPoint(info->accel_state->transform[0], &srcTopRight);
-	transformPoint(info->accel_state->transform[0], &srcBottomLeft);
-	transformPoint(info->accel_state->transform[0], &srcBottomRight);
+	if ((info->ChipFamily < CHIP_FAMILY_R300) || !info->accel_state->has_tcl) {
+	    transformPoint(info->accel_state->transform[0], &srcTopLeft);
+	    transformPoint(info->accel_state->transform[0], &srcTopRight);
+	    transformPoint(info->accel_state->transform[0], &srcBottomLeft);
+	    transformPoint(info->accel_state->transform[0], &srcBottomRight);
+	}
     }
 
     if (info->accel_state->has_mask) {
@@ -2071,10 +2130,12 @@ static void FUNC_NAME(RadeonCompositeTile)(ScrnInfoPtr pScrn,
 	maskBottomRight.y = IntToxFixed(maskY + h);
 
 	if (info->accel_state->is_transform[1]) {
-	    transformPoint(info->accel_state->transform[1], &maskTopLeft);
-	    transformPoint(info->accel_state->transform[1], &maskTopRight);
-	    transformPoint(info->accel_state->transform[1], &maskBottomLeft);
-	    transformPoint(info->accel_state->transform[1], &maskBottomRight);
+	    if ((info->ChipFamily < CHIP_FAMILY_R300) || !info->accel_state->has_tcl) {
+		transformPoint(info->accel_state->transform[1], &maskTopLeft);
+		transformPoint(info->accel_state->transform[1], &maskTopRight);
+		transformPoint(info->accel_state->transform[1], &maskBottomLeft);
+		transformPoint(info->accel_state->transform[1], &maskBottomRight);
+	    }
 	}
 
 	vtx_count = 6;
diff --git a/src/radeon_reg.h b/src/radeon_reg.h
index 248cb42..9261b39 100644
--- a/src/radeon_reg.h
+++ b/src/radeon_reg.h
@@ -4284,6 +4284,12 @@
 #define R300_VAP_PVS_CODE_CNTL_1			0x22D8
 #       define R300_PVS_LAST_VTX_SRC_INST_SHIFT         0
 #define R300_VAP_PVS_VECTOR_INDX_REG		        0x2200
+#       define R300_PVS_CODE_START                      0
+#       define R300_PVS_CONST_START                     512
+#       define R500_PVS_CONST_START                     1024
+#       define R300_PVS_VECTOR_INST_INDEX(x)            ((x) + R300_PVS_CODE_START)
+#       define R300_PVS_VECTOR_CONST_INDEX(x)           ((x) + R300_PVS_CONST_START)
+#       define R500_PVS_VECTOR_CONST_INDEX(x)           ((x) + R500_PVS_CONST_START)
 #define R300_VAP_PVS_VECTOR_DATA_REG		        0x2204
 /* PVS instructions */
 /* Opcode and dst instruction */
@@ -4402,6 +4408,10 @@
 #define R300_PVS_SRC_ADDR_SEL(x)                        ((x) << 29)
 #define R300_PVS_SRC_ADDR_MODE_1                        (1 << 31)
 
+#define R300_VAP_PVS_CONST_CNTL                         0x22d4
+#       define R300_PVS_CONST_BASE_OFFSET(x)            ((x) << 0)
+#       define R300_PVS_MAX_CONST_ADDR(x)               ((x) << 16)
+
 #define R300_VAP_PVS_FLOW_CNTL_OPC		        0x22dc
 #define R300_VAP_OUT_VTX_FMT_0			        0x2090
 #       define R300_VTX_POS_PRESENT                     (1 << 0)
@@ -5441,9 +5451,6 @@
 #   define R500_W_SRC_US				(0 << 2)
 #   define R500_W_SRC_RAS				(1 << 2)
 
-#define R500_GA_US_VECTOR_INDEX 0x4250
-#define R500_GA_US_VECTOR_DATA 0x4254
-
 #define R500_RS_INST_0					0x4320
 #define R500_RS_INST_1					0x4324
 #   define R500_RS_INST_TEX_ID_SHIFT			0
diff --git a/src/radeon_textured_videofuncs.c b/src/radeon_textured_videofuncs.c
index bbc5caf..8ead2a4 100644
--- a/src/radeon_textured_videofuncs.c
+++ b/src/radeon_textured_videofuncs.c
@@ -1213,26 +1213,26 @@ FUNC_NAME(R300DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 
     /* load the vertex shader
      * We pre-load vertex programs in RADEONInit3DEngine():
-     * - exa mask/Xv bicubic
-     * - exa no mask
+     * - exa
      * - Xv
+     * - Xv bicubic
      * Here we select the offset of the vertex program we want to use
      */
     if (info->accel_state->has_tcl) {
 	if (pPriv->bicubic_enabled) {
 	    OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_0,
-			  ((0 << R300_PVS_FIRST_INST_SHIFT) |
-			   (2 << R300_PVS_XYZW_VALID_INST_SHIFT) |
-			   (2 << R300_PVS_LAST_INST_SHIFT)));
+			  ((11 << R300_PVS_FIRST_INST_SHIFT) |
+			   (13 << R300_PVS_XYZW_VALID_INST_SHIFT) |
+			   (13 << R300_PVS_LAST_INST_SHIFT)));
 	    OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_1,
-			  (2 << R300_PVS_LAST_VTX_SRC_INST_SHIFT));
+			  (13 << R300_PVS_LAST_VTX_SRC_INST_SHIFT));
 	} else {
 	    OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_0,
-			  ((5 << R300_PVS_FIRST_INST_SHIFT) |
-			   (6 << R300_PVS_XYZW_VALID_INST_SHIFT) |
-			   (6 << R300_PVS_LAST_INST_SHIFT)));
+			  ((9 << R300_PVS_FIRST_INST_SHIFT) |
+			   (10 << R300_PVS_XYZW_VALID_INST_SHIFT) |
+			   (10 << R300_PVS_LAST_INST_SHIFT)));
 	    OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_1,
-			  (6 << R300_PVS_LAST_VTX_SRC_INST_SHIFT));
+			  (10 << R300_PVS_LAST_VTX_SRC_INST_SHIFT));
 	}
     }
 
@@ -2643,26 +2643,26 @@ FUNC_NAME(R500DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 
     /* load the vertex shader
      * We pre-load vertex programs in RADEONInit3DEngine():
-     * - exa mask/Xv bicubic
-     * - exa no mask
+     * - exa
      * - Xv
+     * - Xv bicubic
      * Here we select the offset of the vertex program we want to use
      */
     if (info->accel_state->has_tcl) {
 	if (pPriv->bicubic_enabled) {
 	    OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_0,
-			  ((0 << R300_PVS_FIRST_INST_SHIFT) |
-			   (2 << R300_PVS_XYZW_VALID_INST_SHIFT) |
-			   (2 << R300_PVS_LAST_INST_SHIFT)));
+			  ((11 << R300_PVS_FIRST_INST_SHIFT) |
+			   (13 << R300_PVS_XYZW_VALID_INST_SHIFT) |
+			   (13 << R300_PVS_LAST_INST_SHIFT)));
 	    OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_1,
-			  (2 << R300_PVS_LAST_VTX_SRC_INST_SHIFT));
+			  (13 << R300_PVS_LAST_VTX_SRC_INST_SHIFT));
 	} else {
 	    OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_0,
-			  ((5 << R300_PVS_FIRST_INST_SHIFT) |
-			   (6 << R300_PVS_XYZW_VALID_INST_SHIFT) |
-			   (6 << R300_PVS_LAST_INST_SHIFT)));
+			  ((9 << R300_PVS_FIRST_INST_SHIFT) |
+			   (10 << R300_PVS_XYZW_VALID_INST_SHIFT) |
+			   (10 << R300_PVS_LAST_INST_SHIFT)));
 	    OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_1,
-			  (6 << R300_PVS_LAST_VTX_SRC_INST_SHIFT));
+			  (10 << R300_PVS_LAST_VTX_SRC_INST_SHIFT));
 	}
     }
 


More information about the xorg-commit mailing list