xf86-video-ati: Branch 'master' - 2 commits

Alex Deucher agd5f at kemper.freedesktop.org
Mon Mar 2 01:12:34 PST 2009


 src/r600_exa.c                 |   37 -
 src/r600_shader.c              | 1018 +++++++++++++----------------------------
 src/r600_shader.h              |    3 
 src/r600_state.h               |    2 
 src/r600_textured_videofuncs.c |   16 
 src/r6xx_accel.c               |    7 
 src/radeon.h                   |    4 
 7 files changed, 359 insertions(+), 728 deletions(-)

New commits:
commit b7164ac4ad55e5d0fc474df8ae762b469b91ba30
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Mon Mar 2 04:08:09 2009 -0500

    R6xx/R7xx EXA: combine composite mask/non-mask VS
    
    Also fix set_bool_const(): the CF bool consts are not contiguous
    by shader type. There are 96 boolean constants (32 each for PS,
    VS, and GS), and they are ordered as follows:
    ps, vs, gs, ... ps, vs, gs

diff --git a/src/r600_exa.c b/src/r600_exa.c
index 70c59b2..f406630 100644
--- a/src/r600_exa.c
+++ b/src/r600_exa.c
@@ -1496,12 +1496,14 @@ static Bool R600PrepareComposite(int op, PicturePtr pSrcPicture,
 	accel_state->is_transform[1] = FALSE;
     }
 
-    if (pMask != NULL)
-	accel_state->vs_mc_addr = info->fbLocation + pScrn->fbOffset + accel_state->shaders->offset +
-	    accel_state->comp_mask_vs_offset;
+    /* VS bool constant */
+    if (pMask)
+	set_bool_const(pScrn, accel_state->ib, 1, 1);
     else
-	accel_state->vs_mc_addr = info->fbLocation + pScrn->fbOffset + accel_state->shaders->offset +
-	    accel_state->comp_vs_offset;
+	set_bool_const(pScrn, accel_state->ib, 1, 0);
+
+    accel_state->vs_mc_addr = info->fbLocation + pScrn->fbOffset + accel_state->shaders->offset +
+	accel_state->comp_vs_offset;
 
     memcpy ((char *)accel_state->ib->address + (accel_state->ib->total / 2) - 256, ps, sizeof(ps));
     accel_state->ps_mc_addr = info->gartLocation + info->dri->bufStart +
@@ -1518,7 +1520,7 @@ static Bool R600PrepareComposite(int op, PicturePtr pSrcPicture,
 
     vs_conf.shader_addr         = accel_state->vs_mc_addr;
     vs_conf.num_gprs            = 3;
-    vs_conf.stack_size          = 0;
+    vs_conf.stack_size          = 1;
     vs_setup                    (pScrn, accel_state->ib, &vs_conf);
 
     /* flush SQ cache */
@@ -1980,7 +1982,7 @@ R600LoadShaders(ScrnInfoPtr pScrn, ScreenPtr pScreen)
     RADEONChipFamily ChipSet = info->ChipFamily;
     uint32_t *shader;
     /* 512 bytes per shader for now */
-    int size = 512 * 11;
+    int size = 512 * 9;
 
     accel_state->shaders = NULL;
 
@@ -2016,20 +2018,16 @@ R600LoadShaders(ScrnInfoPtr pScrn, ScreenPtr pScreen)
     accel_state->comp_ps_offset = 2560;
     /*  not yet */
 
-    /*  comp mask vs --------------------------------------- */
-    accel_state->comp_mask_vs_offset = 3072;
-    R600_comp_mask_vs(ChipSet, shader + accel_state->comp_mask_vs_offset / 4);
-
     /*  comp mask ps --------------------------------------- */
-    accel_state->comp_mask_ps_offset = 3584;
+    accel_state->comp_mask_ps_offset = 3072;
     /*  not yet */
 
     /*  xv vs --------------------------------------- */
-    accel_state->xv_vs_offset = 4096;
+    accel_state->xv_vs_offset = 3584;
     R600_xv_vs(ChipSet, shader + accel_state->xv_vs_offset / 4);
 
     /*  xv ps --------------------------------------- */
-    accel_state->xv_ps_offset = 4608;
+    accel_state->xv_ps_offset = 4096;
     R600_xv_ps(ChipSet, shader + accel_state->xv_ps_offset / 4);
 
     return TRUE;
diff --git a/src/r600_reg_r6xx.h b/src/r600_reg_r6xx.h
index 2c9113e..2e7dfa9 100644
--- a/src/r600_reg_r6xx.h
+++ b/src/r600_reg_r6xx.h
@@ -488,15 +488,6 @@ enum {
     SQ_LOOP_CONST_ps                                      = 0,
     SQ_LOOP_CONST_vs                                      = SQ_LOOP_CONST_ps + SQ_LOOP_CONST_ps_num,
     SQ_LOOP_CONST_gs                                      = SQ_LOOP_CONST_vs + SQ_LOOP_CONST_vs_num,
-    SQ_BOOL_CONST                                         = SQ_BOOL_CONST_0,		/* 32 per PS, VS, GS */
-    SQ_BOOL_CONST_ps_num                                  = 32,
-    SQ_BOOL_CONST_vs_num                                  = 32,
-    SQ_BOOL_CONST_gs_num                                  = 32,
-    SQ_BOOL_CONST_all_num                                 = 96,
-    SQ_BOOL_CONST_offset                                  = 4,
-    SQ_BOOL_CONST_ps                                      = 0,
-    SQ_BOOL_CONST_vs                                      = SQ_BOOL_CONST_ps + SQ_BOOL_CONST_ps_num,
-    SQ_BOOL_CONST_gs                                      = SQ_BOOL_CONST_vs + SQ_BOOL_CONST_vs_num,
 } ;
 
 
diff --git a/src/r600_shader.c b/src/r600_shader.c
index c5522f9..21c4c68 100644
--- a/src/r600_shader.c
+++ b/src/r600_shader.c
@@ -1244,156 +1244,6 @@ int R600_xv_ps(RADEONChipFamily ChipSet, uint32_t* shader)
     return i;
 }
 
-/* comp mask vs --------------------------------------- */
-int R600_comp_mask_vs(RADEONChipFamily ChipSet, uint32_t* shader)
-{
-    int i = 0;
-
-    /* 0 */
-    shader[i++] = CF_DWORD0(ADDR(4));
-    shader[i++] = CF_DWORD1(POP_COUNT(0),
-			    CF_CONST(0),
-			    COND(SQ_CF_COND_ACTIVE),
-			    I_COUNT(3),
-			    CALL_COUNT(0),
-			    END_OF_PROGRAM(0),
-			    VALID_PIXEL_MODE(0),
-			    CF_INST(SQ_CF_INST_VTX),
-			    WHOLE_QUAD_MODE(0),
-			    BARRIER(1));
-    /* 1 - dst */
-    shader[i++] = CF_ALLOC_IMP_EXP_DWORD0(ARRAY_BASE(CF_POS0),
-					  TYPE(SQ_EXPORT_POS),
-					  RW_GPR(2),
-					  RW_REL(ABSOLUTE),
-					  INDEX_GPR(0),
-					  ELEM_SIZE(0));
-    shader[i++] = CF_ALLOC_IMP_EXP_DWORD1_SWIZ(SRC_SEL_X(SQ_SEL_X),
-					       SRC_SEL_Y(SQ_SEL_Y),
-					       SRC_SEL_Z(SQ_SEL_Z),
-					       SRC_SEL_W(SQ_SEL_W),
-					       R6xx_ELEM_LOOP(0),
-					       BURST_COUNT(1),
-					       END_OF_PROGRAM(0),
-					       VALID_PIXEL_MODE(0),
-					       CF_INST(SQ_CF_INST_EXPORT_DONE),
-					       WHOLE_QUAD_MODE(0),
-					       BARRIER(1));
-    /* 2 - src */
-    shader[i++] = CF_ALLOC_IMP_EXP_DWORD0(ARRAY_BASE(0),
-					  TYPE(SQ_EXPORT_PARAM),
-					  RW_GPR(1),
-					  RW_REL(ABSOLUTE),
-					  INDEX_GPR(0),
-					  ELEM_SIZE(0));
-    shader[i++] = CF_ALLOC_IMP_EXP_DWORD1_SWIZ(SRC_SEL_X(SQ_SEL_X),
-					       SRC_SEL_Y(SQ_SEL_Y),
-					       SRC_SEL_Z(SQ_SEL_Z),
-					       SRC_SEL_W(SQ_SEL_W),
-					       R6xx_ELEM_LOOP(0),
-					       BURST_COUNT(1),
-					       END_OF_PROGRAM(0),
-					       VALID_PIXEL_MODE(0),
-					       CF_INST(SQ_CF_INST_EXPORT),
-					       WHOLE_QUAD_MODE(0),
-					       BARRIER(0));
-    /* 3 - mask */
-    shader[i++] = CF_ALLOC_IMP_EXP_DWORD0(ARRAY_BASE(1),
-					  TYPE(SQ_EXPORT_PARAM),
-					  RW_GPR(0),
-					  RW_REL(ABSOLUTE),
-					  INDEX_GPR(0),
-					  ELEM_SIZE(0));
-    shader[i++] = CF_ALLOC_IMP_EXP_DWORD1_SWIZ(SRC_SEL_X(SQ_SEL_X),
-					       SRC_SEL_Y(SQ_SEL_Y),
-					       SRC_SEL_Z(SQ_SEL_Z),
-					       SRC_SEL_W(SQ_SEL_W),
-					       R6xx_ELEM_LOOP(0),
-					       BURST_COUNT(1),
-					       END_OF_PROGRAM(1),
-					       VALID_PIXEL_MODE(0),
-					       CF_INST(SQ_CF_INST_EXPORT_DONE),
-					       WHOLE_QUAD_MODE(0),
-					       BARRIER(0));
-    /* 4/5 - dst */
-    shader[i++] = VTX_DWORD0(VTX_INST(SQ_VTX_INST_FETCH),
-			     FETCH_TYPE(SQ_VTX_FETCH_VERTEX_DATA),
-			     FETCH_WHOLE_QUAD(0),
-			     BUFFER_ID(0),
-			     SRC_GPR(0),
-			     SRC_REL(ABSOLUTE),
-			     SRC_SEL_X(SQ_SEL_X),
-			     MEGA_FETCH_COUNT(24));
-    shader[i++] = VTX_DWORD1_GPR(DST_GPR(2),
-				 DST_REL(0),
-				 DST_SEL_X(SQ_SEL_X),
-				 DST_SEL_Y(SQ_SEL_Y),
-				 DST_SEL_Z(SQ_SEL_0),
-				 DST_SEL_W(SQ_SEL_1),
-				 USE_CONST_FIELDS(0),
-				 DATA_FORMAT(FMT_32_32_FLOAT), /* xxx */
-				 NUM_FORMAT_ALL(SQ_NUM_FORMAT_NORM), /* xxx */
-				 FORMAT_COMP_ALL(SQ_FORMAT_COMP_SIGNED), /* xxx */
-				 SRF_MODE_ALL(SRF_MODE_ZERO_CLAMP_MINUS_ONE));
-    shader[i++] = VTX_DWORD2(OFFSET(0),
-			     ENDIAN_SWAP(ENDIAN_NONE),
-			     CONST_BUF_NO_STRIDE(0),
-			     MEGA_FETCH(1));
-    shader[i++] = VTX_DWORD_PAD;
-    /* 6/7 - src */
-    shader[i++] = VTX_DWORD0(VTX_INST(SQ_VTX_INST_FETCH),
-			     FETCH_TYPE(SQ_VTX_FETCH_VERTEX_DATA),
-			     FETCH_WHOLE_QUAD(0),
-			     BUFFER_ID(0),
-			     SRC_GPR(0),
-			     SRC_REL(ABSOLUTE),
-			     SRC_SEL_X(SQ_SEL_X),
-			     MEGA_FETCH_COUNT(8));
-    shader[i++] = VTX_DWORD1_GPR(DST_GPR(1),
-				 DST_REL(0),
-				 DST_SEL_X(SQ_SEL_X),
-				 DST_SEL_Y(SQ_SEL_Y),
-				 DST_SEL_Z(SQ_SEL_0),
-				 DST_SEL_W(SQ_SEL_1),
-				 USE_CONST_FIELDS(0),
-				 DATA_FORMAT(FMT_32_32_FLOAT), /* xxx */
-				 NUM_FORMAT_ALL(SQ_NUM_FORMAT_NORM), /* xxx */
-				 FORMAT_COMP_ALL(SQ_FORMAT_COMP_SIGNED), /* xxx */
-				 SRF_MODE_ALL(SRF_MODE_ZERO_CLAMP_MINUS_ONE));
-    shader[i++] = VTX_DWORD2(OFFSET(8),
-			     ENDIAN_SWAP(ENDIAN_NONE),
-			     CONST_BUF_NO_STRIDE(0),
-			     MEGA_FETCH(0));
-    shader[i++] = VTX_DWORD_PAD;
-    /* 8/9 - mask */
-    shader[i++] = VTX_DWORD0(VTX_INST(SQ_VTX_INST_FETCH),
-			     FETCH_TYPE(SQ_VTX_FETCH_VERTEX_DATA),
-			     FETCH_WHOLE_QUAD(0),
-			     BUFFER_ID(0),
-			     SRC_GPR(0),
-			     SRC_REL(ABSOLUTE),
-			     SRC_SEL_X(SQ_SEL_X),
-			     MEGA_FETCH_COUNT(8));
-    shader[i++] = VTX_DWORD1_GPR(DST_GPR(0),
-				 DST_REL(0),
-				 DST_SEL_X(SQ_SEL_X),
-				 DST_SEL_Y(SQ_SEL_Y),
-				 DST_SEL_Z(SQ_SEL_0),
-				 DST_SEL_W(SQ_SEL_1),
-				 USE_CONST_FIELDS(0),
-				 DATA_FORMAT(FMT_32_32_FLOAT), /* xxx */
-				 NUM_FORMAT_ALL(SQ_NUM_FORMAT_NORM), /* xxx */
-				 FORMAT_COMP_ALL(SQ_FORMAT_COMP_SIGNED), /* xxx */
-				 SRF_MODE_ALL(SRF_MODE_ZERO_CLAMP_MINUS_ONE));
-    shader[i++] = VTX_DWORD2(OFFSET(16),
-			     ENDIAN_SWAP(ENDIAN_NONE),
-			     CONST_BUF_NO_STRIDE(0),
-			     MEGA_FETCH(0));
-    shader[i++] = VTX_DWORD_PAD;
-
-    return i;
-}
-
 /* comp mask ps --------------------------------------- */
 int R600_comp_mask_ps(RADEONChipFamily ChipSet,
 		      uint32_t* shader,
@@ -1627,7 +1477,197 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
     int i = 0;
 
     /* 0 */
-    shader[i++] = CF_DWORD0(ADDR(4));
+    shader[i++] = CF_DWORD0(ADDR(3));
+    shader[i++] = CF_DWORD1(POP_COUNT(0),
+                            CF_CONST(0),
+                            COND(SQ_CF_COND_BOOL),
+                            I_COUNT(0),
+                            CALL_COUNT(0),
+                            END_OF_PROGRAM(0),
+                            VALID_PIXEL_MODE(0),
+                            CF_INST(SQ_CF_INST_CALL),
+                            WHOLE_QUAD_MODE(0),
+                            BARRIER(0));
+    /* 1 */
+    shader[i++] = CF_DWORD0(ADDR(14));
+    shader[i++] = CF_DWORD1(POP_COUNT(0),
+                            CF_CONST(0),
+                            COND(SQ_CF_COND_NOT_BOOL),
+                            I_COUNT(0),
+                            CALL_COUNT(0),
+                            END_OF_PROGRAM(0),
+                            VALID_PIXEL_MODE(0),
+                            CF_INST(SQ_CF_INST_CALL),
+                            WHOLE_QUAD_MODE(0),
+                            BARRIER(0));
+    /* 2 */
+    shader[i++] = CF_DWORD0(0);
+    shader[i++] = CF_DWORD1(POP_COUNT(0),
+                            CF_CONST(0),
+                            COND(SQ_CF_COND_ACTIVE),
+                            I_COUNT(0),
+                            CALL_COUNT(0),
+                            END_OF_PROGRAM(1),
+                            VALID_PIXEL_MODE(0),
+                            CF_INST(SQ_CF_INST_NOP),
+                            WHOLE_QUAD_MODE(0),
+                            BARRIER(1));
+    /* 3 - mask sub */
+    shader[i++] = CF_DWORD0(ADDR(8));
+    shader[i++] = CF_DWORD1(POP_COUNT(0),
+			    CF_CONST(0),
+			    COND(SQ_CF_COND_ACTIVE),
+			    I_COUNT(3),
+			    CALL_COUNT(0),
+			    END_OF_PROGRAM(0),
+			    VALID_PIXEL_MODE(0),
+			    CF_INST(SQ_CF_INST_VTX),
+			    WHOLE_QUAD_MODE(0),
+			    BARRIER(1));
+    /* 4 - dst */
+    shader[i++] = CF_ALLOC_IMP_EXP_DWORD0(ARRAY_BASE(CF_POS0),
+					  TYPE(SQ_EXPORT_POS),
+					  RW_GPR(2),
+					  RW_REL(ABSOLUTE),
+					  INDEX_GPR(0),
+					  ELEM_SIZE(0));
+    shader[i++] = CF_ALLOC_IMP_EXP_DWORD1_SWIZ(SRC_SEL_X(SQ_SEL_X),
+					       SRC_SEL_Y(SQ_SEL_Y),
+					       SRC_SEL_Z(SQ_SEL_Z),
+					       SRC_SEL_W(SQ_SEL_W),
+					       R6xx_ELEM_LOOP(0),
+					       BURST_COUNT(1),
+					       END_OF_PROGRAM(0),
+					       VALID_PIXEL_MODE(0),
+					       CF_INST(SQ_CF_INST_EXPORT_DONE),
+					       WHOLE_QUAD_MODE(0),
+					       BARRIER(1));
+    /* 5 - src */
+    shader[i++] = CF_ALLOC_IMP_EXP_DWORD0(ARRAY_BASE(0),
+					  TYPE(SQ_EXPORT_PARAM),
+					  RW_GPR(1),
+					  RW_REL(ABSOLUTE),
+					  INDEX_GPR(0),
+					  ELEM_SIZE(0));
+    shader[i++] = CF_ALLOC_IMP_EXP_DWORD1_SWIZ(SRC_SEL_X(SQ_SEL_X),
+					       SRC_SEL_Y(SQ_SEL_Y),
+					       SRC_SEL_Z(SQ_SEL_Z),
+					       SRC_SEL_W(SQ_SEL_W),
+					       R6xx_ELEM_LOOP(0),
+					       BURST_COUNT(1),
+					       END_OF_PROGRAM(0),
+					       VALID_PIXEL_MODE(0),
+					       CF_INST(SQ_CF_INST_EXPORT),
+					       WHOLE_QUAD_MODE(0),
+					       BARRIER(0));
+    /* 6 - mask */
+    shader[i++] = CF_ALLOC_IMP_EXP_DWORD0(ARRAY_BASE(1),
+					  TYPE(SQ_EXPORT_PARAM),
+					  RW_GPR(0),
+					  RW_REL(ABSOLUTE),
+					  INDEX_GPR(0),
+					  ELEM_SIZE(0));
+    shader[i++] = CF_ALLOC_IMP_EXP_DWORD1_SWIZ(SRC_SEL_X(SQ_SEL_X),
+					       SRC_SEL_Y(SQ_SEL_Y),
+					       SRC_SEL_Z(SQ_SEL_Z),
+					       SRC_SEL_W(SQ_SEL_W),
+					       R6xx_ELEM_LOOP(0),
+					       BURST_COUNT(1),
+					       END_OF_PROGRAM(0),
+					       VALID_PIXEL_MODE(0),
+					       CF_INST(SQ_CF_INST_EXPORT_DONE),
+					       WHOLE_QUAD_MODE(0),
+					       BARRIER(0));
+    /* 7 */
+    shader[i++] = CF_DWORD0(ADDR(0));
+    shader[i++] = CF_DWORD1(POP_COUNT(0),
+			    CF_CONST(0),
+			    COND(SQ_CF_COND_ACTIVE),
+			    I_COUNT(0),
+			    CALL_COUNT(0),
+			    END_OF_PROGRAM(0),
+			    VALID_PIXEL_MODE(0),
+			    CF_INST(SQ_CF_INST_RETURN),
+			    WHOLE_QUAD_MODE(0),
+			    BARRIER(1));
+    /* 8/9 - dst */
+    shader[i++] = VTX_DWORD0(VTX_INST(SQ_VTX_INST_FETCH),
+			     FETCH_TYPE(SQ_VTX_FETCH_VERTEX_DATA),
+			     FETCH_WHOLE_QUAD(0),
+			     BUFFER_ID(0),
+			     SRC_GPR(0),
+			     SRC_REL(ABSOLUTE),
+			     SRC_SEL_X(SQ_SEL_X),
+			     MEGA_FETCH_COUNT(24));
+    shader[i++] = VTX_DWORD1_GPR(DST_GPR(2),
+				 DST_REL(0),
+				 DST_SEL_X(SQ_SEL_X),
+				 DST_SEL_Y(SQ_SEL_Y),
+				 DST_SEL_Z(SQ_SEL_0),
+				 DST_SEL_W(SQ_SEL_1),
+				 USE_CONST_FIELDS(0),
+				 DATA_FORMAT(FMT_32_32_FLOAT), /* xxx */
+				 NUM_FORMAT_ALL(SQ_NUM_FORMAT_NORM), /* xxx */
+				 FORMAT_COMP_ALL(SQ_FORMAT_COMP_SIGNED), /* xxx */
+				 SRF_MODE_ALL(SRF_MODE_ZERO_CLAMP_MINUS_ONE));
+    shader[i++] = VTX_DWORD2(OFFSET(0),
+			     ENDIAN_SWAP(ENDIAN_NONE),
+			     CONST_BUF_NO_STRIDE(0),
+			     MEGA_FETCH(1));
+    shader[i++] = VTX_DWORD_PAD;
+    /* 10/11 - src */
+    shader[i++] = VTX_DWORD0(VTX_INST(SQ_VTX_INST_FETCH),
+			     FETCH_TYPE(SQ_VTX_FETCH_VERTEX_DATA),
+			     FETCH_WHOLE_QUAD(0),
+			     BUFFER_ID(0),
+			     SRC_GPR(0),
+			     SRC_REL(ABSOLUTE),
+			     SRC_SEL_X(SQ_SEL_X),
+			     MEGA_FETCH_COUNT(8));
+    shader[i++] = VTX_DWORD1_GPR(DST_GPR(1),
+				 DST_REL(0),
+				 DST_SEL_X(SQ_SEL_X),
+				 DST_SEL_Y(SQ_SEL_Y),
+				 DST_SEL_Z(SQ_SEL_0),
+				 DST_SEL_W(SQ_SEL_1),
+				 USE_CONST_FIELDS(0),
+				 DATA_FORMAT(FMT_32_32_FLOAT), /* xxx */
+				 NUM_FORMAT_ALL(SQ_NUM_FORMAT_NORM), /* xxx */
+				 FORMAT_COMP_ALL(SQ_FORMAT_COMP_SIGNED), /* xxx */
+				 SRF_MODE_ALL(SRF_MODE_ZERO_CLAMP_MINUS_ONE));
+    shader[i++] = VTX_DWORD2(OFFSET(8),
+			     ENDIAN_SWAP(ENDIAN_NONE),
+			     CONST_BUF_NO_STRIDE(0),
+			     MEGA_FETCH(0));
+    shader[i++] = VTX_DWORD_PAD;
+    /* 12/13 - mask */
+    shader[i++] = VTX_DWORD0(VTX_INST(SQ_VTX_INST_FETCH),
+			     FETCH_TYPE(SQ_VTX_FETCH_VERTEX_DATA),
+			     FETCH_WHOLE_QUAD(0),
+			     BUFFER_ID(0),
+			     SRC_GPR(0),
+			     SRC_REL(ABSOLUTE),
+			     SRC_SEL_X(SQ_SEL_X),
+			     MEGA_FETCH_COUNT(8));
+    shader[i++] = VTX_DWORD1_GPR(DST_GPR(0),
+				 DST_REL(0),
+				 DST_SEL_X(SQ_SEL_X),
+				 DST_SEL_Y(SQ_SEL_Y),
+				 DST_SEL_Z(SQ_SEL_0),
+				 DST_SEL_W(SQ_SEL_1),
+				 USE_CONST_FIELDS(0),
+				 DATA_FORMAT(FMT_32_32_FLOAT), /* xxx */
+				 NUM_FORMAT_ALL(SQ_NUM_FORMAT_NORM), /* xxx */
+				 FORMAT_COMP_ALL(SQ_FORMAT_COMP_SIGNED), /* xxx */
+				 SRF_MODE_ALL(SRF_MODE_ZERO_CLAMP_MINUS_ONE));
+    shader[i++] = VTX_DWORD2(OFFSET(16),
+			     ENDIAN_SWAP(ENDIAN_NONE),
+			     CONST_BUF_NO_STRIDE(0),
+			     MEGA_FETCH(0));
+    shader[i++] = VTX_DWORD_PAD;
+
+    /* 14 - non-mask sub */
+    shader[i++] = CF_DWORD0(ADDR(18));
     shader[i++] = CF_DWORD1(POP_COUNT(0),
 			    CF_CONST(0),
 			    COND(SQ_CF_COND_ACTIVE),
@@ -1638,7 +1678,7 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
 			    CF_INST(SQ_CF_INST_VTX),
 			    WHOLE_QUAD_MODE(0),
 			    BARRIER(1));
-    /* 1 - dst */
+    /* 15 - dst */
     shader[i++] = CF_ALLOC_IMP_EXP_DWORD0(ARRAY_BASE(CF_POS0),
 					  TYPE(SQ_EXPORT_POS),
 					  RW_GPR(1),
@@ -1656,7 +1696,7 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
 					       CF_INST(SQ_CF_INST_EXPORT_DONE),
 					       WHOLE_QUAD_MODE(0),
 					       BARRIER(1));
-    /* 2 - src */
+    /* 16 - src */
     shader[i++] = CF_ALLOC_IMP_EXP_DWORD0(ARRAY_BASE(0),
 					  TYPE(SQ_EXPORT_PARAM),
 					  RW_GPR(0),
@@ -1669,15 +1709,24 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
 					       SRC_SEL_W(SQ_SEL_W),
 					       R6xx_ELEM_LOOP(0),
 					       BURST_COUNT(0),
-					       END_OF_PROGRAM(1),
+					       END_OF_PROGRAM(0),
 					       VALID_PIXEL_MODE(0),
 					       CF_INST(SQ_CF_INST_EXPORT_DONE),
 					       WHOLE_QUAD_MODE(0),
 					       BARRIER(0));
-    /* 3 */
-    shader[i++] = 0x00000000;
-    shader[i++] = 0x00000000;
-    /* 4/5 - dst */
+    /* 17 */
+    shader[i++] = CF_DWORD0(ADDR(0));
+    shader[i++] = CF_DWORD1(POP_COUNT(0),
+			    CF_CONST(0),
+			    COND(SQ_CF_COND_ACTIVE),
+			    I_COUNT(0),
+			    CALL_COUNT(0),
+			    END_OF_PROGRAM(0),
+			    VALID_PIXEL_MODE(0),
+			    CF_INST(SQ_CF_INST_RETURN),
+			    WHOLE_QUAD_MODE(0),
+			    BARRIER(1));
+    /* 18/19 - dst */
     shader[i++] = VTX_DWORD0(VTX_INST(SQ_VTX_INST_FETCH),
 			     FETCH_TYPE(SQ_VTX_FETCH_VERTEX_DATA),
 			     FETCH_WHOLE_QUAD(0),
@@ -1702,7 +1751,7 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
 			     CONST_BUF_NO_STRIDE(0),
 			     MEGA_FETCH(1));
     shader[i++] = VTX_DWORD_PAD;
-    /* 6/7 - src */
+    /* 20/21 - src */
     shader[i++] = VTX_DWORD0(VTX_INST(SQ_VTX_INST_FETCH),
 			     FETCH_TYPE(SQ_VTX_FETCH_VERTEX_DATA),
 			     FETCH_WHOLE_QUAD(0),
diff --git a/src/r600_state.h b/src/r600_state.h
index 8e7334d..6621420 100644
--- a/src/r600_state.h
+++ b/src/r600_state.h
@@ -255,9 +255,7 @@ ps_setup(ScrnInfoPtr pScrn, drmBufPtr ib, shader_config_t *ps_conf);
 void
 set_alu_consts(ScrnInfoPtr pScrn, drmBufPtr ib, int offset, int count, float *const_buf);
 void
-set_bool_consts(ScrnInfoPtr pScrn, drmBufPtr ib, int offset, int count, uint32_t *const_buf);
-void
-set_loop_consts(ScrnInfoPtr pScrn, drmBufPtr ib, int offset, int count, uint32_t *const_buf);
+set_bool_const(ScrnInfoPtr pScrn, drmBufPtr ib, int offset, uint32_t val);
 void
 set_vtx_resource(ScrnInfoPtr pScrn, drmBufPtr ib, vtx_resource_t *res);
 void
diff --git a/src/r600_textured_videofuncs.c b/src/r600_textured_videofuncs.c
index d208b07..56adc6d 100644
--- a/src/r600_textured_videofuncs.c
+++ b/src/r600_textured_videofuncs.c
@@ -115,7 +115,6 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
     tex_sampler_t   tex_samp;
     shader_config_t vs_conf, ps_conf;
     int uv_offset;
-    uint32_t bool_consts[1] = { 0 };
     static float ps_alu_consts[] = {
         1.0,  0.0,      1.4020,   0,  /* r - c[0] */
         1.0, -0.34414, -0.71414,  0,  /* g - c[1] */
@@ -169,15 +168,16 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
     accel_state->ps_mc_addr = info->fbLocation + pScrn->fbOffset + accel_state->shaders->offset +
 	accel_state->xv_ps_offset;
 
+    /* PS bool constant */
     switch(pPriv->id) {
     case FOURCC_YV12:
     case FOURCC_I420:
-	bool_consts[0] = 1;
+	set_bool_const(pScrn, accel_state->ib, 0, 1);
 	break;
     case FOURCC_UYVY:
     case FOURCC_YUY2:
     default:
-	bool_consts[0] = 0;
+	set_bool_const(pScrn, accel_state->ib, 0, 0);
 	break;
     }
 
@@ -211,10 +211,6 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
     set_alu_consts(pScrn, accel_state->ib, SQ_ALU_CONSTANT_ps,
 		   sizeof(ps_alu_consts) / SQ_ALU_CONSTANT_offset, ps_alu_consts);
 
-    /* CF bool constants */
-    set_bool_consts(pScrn, accel_state->ib, SQ_BOOL_CONST_ps,
-		    sizeof(bool_consts) / SQ_BOOL_CONST_offset, bool_consts);
-
     /* Texture */
     switch(pPriv->id) {
     case FOURCC_YV12:
diff --git a/src/r6xx_accel.c b/src/r6xx_accel.c
index aa2ab86..114ccf5 100644
--- a/src/r6xx_accel.c
+++ b/src/r6xx_accel.c
@@ -433,26 +433,10 @@ set_alu_consts(ScrnInfoPtr pScrn, drmBufPtr ib, int offset, int count, float *co
 }
 
 void
-set_bool_consts(ScrnInfoPtr pScrn, drmBufPtr ib, int offset, int count, uint32_t *const_buf)
+set_bool_const(ScrnInfoPtr pScrn, drmBufPtr ib, int offset, uint32_t val)
 {
-    int i;
-    const int countreg = count * (SQ_BOOL_CONST_offset >> 2);
-
-    PACK0(ib, SQ_BOOL_CONST + offset * SQ_BOOL_CONST_offset, countreg);
-    for (i = 0; i < countreg; i++)
-	E32(ib, const_buf[i]);
-
-}
-
-void
-set_loop_consts(ScrnInfoPtr pScrn, drmBufPtr ib, int offset, int count, uint32_t *const_buf)
-{
-    int i;
-    const int countreg = count * (SQ_LOOP_CONST_offset >> 2);
-
-    PACK0(ib, SQ_LOOP_CONST + offset * SQ_LOOP_CONST_offset, countreg);
-    for (i = 0; i < countreg; i++)
-	E32(ib, const_buf[i]);
+    /* bool order is: ps, vs, gs, ps, vs, gs, ... */
+    EREG(ib, SQ_BOOL_CONST_0 + (offset << 2), val);
 }
 
 void
diff --git a/src/radeon.h b/src/radeon.h
index 6fcc36a..355a949 100644
--- a/src/radeon.h
+++ b/src/radeon.h
@@ -623,7 +623,6 @@ struct radeon_accel_state {
     uint32_t          copy_ps_offset;
     uint32_t          comp_vs_offset;
     uint32_t          comp_ps_offset;
-    uint32_t          comp_mask_vs_offset;
     uint32_t          comp_mask_ps_offset;
     uint32_t          xv_vs_offset;
     uint32_t          xv_ps_offset;
commit fa98f424de739be2c6005b740a74bbf1ee968a8b
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Mon Mar 2 02:28:57 2009 -0500

    R6xx/R7xx Xv: combine packed and planar shaders
    
    Use a PS bool const to select the texture fetch routine
    (planar or packed).

diff --git a/src/r600_exa.c b/src/r600_exa.c
index a44b611..70c59b2 100644
--- a/src/r600_exa.c
+++ b/src/r600_exa.c
@@ -246,7 +246,8 @@ R600PrepareSolid(PixmapPtr pPix, int alu, Pixel pm, Pixel fg)
 	ps_alu_consts[2] = (float)b / 255; /* B */
 	ps_alu_consts[3] = (float)a / 255; /* A */
     }
-    set_alu_consts(pScrn, accel_state->ib, 0, sizeof(ps_alu_consts) / SQ_ALU_CONSTANT_offset, ps_alu_consts);
+    set_alu_consts(pScrn, accel_state->ib, SQ_ALU_CONSTANT_ps,
+		   sizeof(ps_alu_consts) / SQ_ALU_CONSTANT_offset, ps_alu_consts);
 
     accel_state->vb_index = 0;
 
@@ -2027,13 +2028,9 @@ R600LoadShaders(ScrnInfoPtr pScrn, ScreenPtr pScreen)
     accel_state->xv_vs_offset = 4096;
     R600_xv_vs(ChipSet, shader + accel_state->xv_vs_offset / 4);
 
-    /*  xv ps packed --------------------------------------- */
-    accel_state->xv_ps_offset_packed = 4608;
-    R600_xv_ps_packet(ChipSet, shader + accel_state->xv_ps_offset_packed / 4);
-
-    /*  xv ps planar ---------------------------------- */
-    accel_state->xv_ps_offset_planar = 5120;
-    R600_xv_ps_planar(ChipSet, shader + accel_state->xv_ps_offset_planar / 4);
+    /*  xv ps --------------------------------------- */
+    accel_state->xv_ps_offset = 4608;
+    R600_xv_ps(ChipSet, shader + accel_state->xv_ps_offset / 4);
 
     return TRUE;
 }
diff --git a/src/r600_reg_r6xx.h b/src/r600_reg_r6xx.h
index 2e7dfa9..2c9113e 100644
--- a/src/r600_reg_r6xx.h
+++ b/src/r600_reg_r6xx.h
@@ -488,6 +488,15 @@ enum {
     SQ_LOOP_CONST_ps                                      = 0,
     SQ_LOOP_CONST_vs                                      = SQ_LOOP_CONST_ps + SQ_LOOP_CONST_ps_num,
     SQ_LOOP_CONST_gs                                      = SQ_LOOP_CONST_vs + SQ_LOOP_CONST_vs_num,
+    SQ_BOOL_CONST                                         = SQ_BOOL_CONST_0,		/* 32 per PS, VS, GS */
+    SQ_BOOL_CONST_ps_num                                  = 32,
+    SQ_BOOL_CONST_vs_num                                  = 32,
+    SQ_BOOL_CONST_gs_num                                  = 32,
+    SQ_BOOL_CONST_all_num                                 = 96,
+    SQ_BOOL_CONST_offset                                  = 4,
+    SQ_BOOL_CONST_ps                                      = 0,
+    SQ_BOOL_CONST_vs                                      = SQ_BOOL_CONST_ps + SQ_BOOL_CONST_ps_num,
+    SQ_BOOL_CONST_gs                                      = SQ_BOOL_CONST_vs + SQ_BOOL_CONST_vs_num,
 } ;
 
 
diff --git a/src/r600_shader.c b/src/r600_shader.c
index ba716da..c5522f9 100644
--- a/src/r600_shader.c
+++ b/src/r600_shader.c
@@ -561,31 +561,35 @@ int R600_xv_vs(RADEONChipFamily ChipSet, uint32_t* shader)
 }
 
 /*
- * ; xv ps packed
- * 00 TEX: ADDR(20) CNT(2) NO_BARRIER 
+ * ; xv ps planar
+ * 00 TEX: ADDR(20) CNT(3) NO_BARRIER 
+ *       0  SAMPLE R1.x__1, R0.xy01, t0, s0
+ *       1  SAMPLE R1.__x_, R0.xy01, t1, s1
+ *       2  SAMPLE R1._x__, R0.xy01, t2, s2
+ * 01 TEX: ADDR(28) CNT(2) NO_BARRIER 
  *       0  SAMPLE R1.x__1, R0.xy01, t0, s0
  *       1  SAMPLE R1._xy_, R0.xy01, t1, s1
- * 01 ALU: ADDR(3) CNT(16) 
- *       2  x: MULADD      R1.x,  R1.x,  C3.x,  C3.y      CLAMP 
+ * 02 ALU: ADDR(4) CNT(16) 
+ *       3  x: MULADD      R1.x,  R1.x,  C3.x,  C3.y      CLAMP 
  *          y: MULADD      R1.y,  R1.y,  C3.z,  C3.w      
  *          z: MULADD      R1.z,  R1.z,  C3.z,  C3.w      
- *          w: MOV         R1.w,  0.0f
- *       3  x: DOT4        R2.x,  R1.x,  C0.x      CLAMP VEC_102 
+ *          w: MOV         R1.w,  0.0f 
+ *       4  x: DOT4        R2.x,  R1.x,  C0.x      CLAMP VEC_102 
  *          y: DOT4        ____,  R1.y,  C0.y      CLAMP VEC_102 
  *          z: DOT4        ____,  R1.z,  C0.z      CLAMP VEC_102 
  *          w: DOT4        ____,  R1.w,  C0.w      CLAMP VEC_021 
- *       4  x: DOT4        ____,  R1.x,  C1.x      CLAMP VEC_102 
+ *       5  x: DOT4        ____,  R1.x,  C1.x      CLAMP VEC_102 
  *          y: DOT4        R2.y,  R1.y,  C1.y      CLAMP VEC_102 
  *          z: DOT4        ____,  R1.z,  C1.z      CLAMP VEC_102 
  *          w: DOT4        ____,  R1.w,  C1.w      CLAMP VEC_021 
- *       5  x: DOT4        ____,  R1.x,  C2.x      CLAMP VEC_102 
+ *       6  x: DOT4        ____,  R1.x,  C2.x      CLAMP VEC_102 
  *          y: DOT4        ____,  R1.y,  C2.y      CLAMP VEC_102 
  *          z: DOT4        R2.z,  R1.z,  C2.z      CLAMP VEC_102 
  *          w: DOT4        ____,  R1.w,  C2.w      CLAMP VEC_021 
- * 02 EXP_DONE: PIX0, R2
+ * 03 EXP_DONE: PIX0, R2
  * END_OF_PROGRAM
  */
-int R600_xv_ps_packet(RADEONChipFamily ChipSet, uint32_t* shader)
+int R600_xv_ps(RADEONChipFamily ChipSet, uint32_t* shader)
 {
     int i = 0;
 
@@ -593,16 +597,28 @@ int R600_xv_ps_packet(RADEONChipFamily ChipSet, uint32_t* shader)
     shader[i++] = CF_DWORD0(ADDR(20));
     shader[i++] = CF_DWORD1(POP_COUNT(0),
                             CF_CONST(0),
-                            COND(SQ_CF_COND_ACTIVE),
-                            I_COUNT(2),
+                            COND(SQ_CF_COND_BOOL),
+                            I_COUNT(0),
                             CALL_COUNT(0),
                             END_OF_PROGRAM(0),
                             VALID_PIXEL_MODE(0),
-                            CF_INST(SQ_CF_INST_TEX),
+                            CF_INST(SQ_CF_INST_CALL),
                             WHOLE_QUAD_MODE(0),
                             BARRIER(0));
     /* 1 */
-    shader[i++] = CF_ALU_DWORD0(ADDR(3),
+    shader[i++] = CF_DWORD0(ADDR(28));
+    shader[i++] = CF_DWORD1(POP_COUNT(0),
+                            CF_CONST(0),
+                            COND(SQ_CF_COND_NOT_BOOL),
+                            I_COUNT(0),
+                            CALL_COUNT(0),
+                            END_OF_PROGRAM(0),
+                            VALID_PIXEL_MODE(0),
+                            CF_INST(SQ_CF_INST_CALL),
+                            WHOLE_QUAD_MODE(0),
+                            BARRIER(0));
+    /* 2 */
+    shader[i++] = CF_ALU_DWORD0(ADDR(4),
                                 KCACHE_BANK0(0),
                                 KCACHE_BANK1(0),
                                 KCACHE_MODE0(SQ_CF_KCACHE_NOP));
@@ -614,7 +630,7 @@ int R600_xv_ps_packet(RADEONChipFamily ChipSet, uint32_t* shader)
                                 CF_INST(SQ_CF_INST_ALU),
                                 WHOLE_QUAD_MODE(0),
                                 BARRIER(1));
-    /* 2 */
+    /* 3 */
     shader[i++] = CF_ALLOC_IMP_EXP_DWORD0(ARRAY_BASE(CF_PIXEL_MRT0),
                                           TYPE(SQ_EXPORT_PIXEL),
                                           RW_GPR(2),
@@ -632,7 +648,7 @@ int R600_xv_ps_packet(RADEONChipFamily ChipSet, uint32_t* shader)
                                                CF_INST(SQ_CF_INST_EXPORT_DONE),
                                                WHOLE_QUAD_MODE(0),
                                                BARRIER(1));
-    /* 3 */
+    /* 4 */
     shader[i++] = ALU_DWORD0(SRC0_SEL(1),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_X),
@@ -654,7 +670,7 @@ int R600_xv_ps_packet(RADEONChipFamily ChipSet, uint32_t* shader)
                                  DST_REL(ABSOLUTE),
                                  DST_ELEM(ELEM_X),
                                  CLAMP(1));
-    /* 4 */
+    /* 5 */
     shader[i++] = ALU_DWORD0(SRC0_SEL(1),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_Y),
@@ -676,7 +692,7 @@ int R600_xv_ps_packet(RADEONChipFamily ChipSet, uint32_t* shader)
                                  DST_REL(ABSOLUTE),
                                  DST_ELEM(ELEM_Y),
                                  CLAMP(0));
-    /* 5 */
+    /* 6 */
     shader[i++] = ALU_DWORD0(SRC0_SEL(1),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_Z),
@@ -698,7 +714,7 @@ int R600_xv_ps_packet(RADEONChipFamily ChipSet, uint32_t* shader)
                                  DST_REL(ABSOLUTE),
                                  DST_ELEM(ELEM_Z),
                                  CLAMP(0));
-    /* 6 */
+    /* 7 */
     shader[i++] = ALU_DWORD0(SRC0_SEL(SQ_ALU_SRC_0),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_X),
@@ -724,7 +740,7 @@ int R600_xv_ps_packet(RADEONChipFamily ChipSet, uint32_t* shader)
                                  DST_REL(ABSOLUTE),
                                  DST_ELEM(ELEM_W),
                                  CLAMP(0));
-    /* 7 */
+    /* 8 */
     shader[i++] = ALU_DWORD0(SRC0_SEL(1),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_X),
@@ -750,7 +766,7 @@ int R600_xv_ps_packet(RADEONChipFamily ChipSet, uint32_t* shader)
                                  DST_REL(ABSOLUTE),
                                  DST_ELEM(ELEM_X),
                                  CLAMP(1));
-    /* 8 */
+    /* 9 */
     shader[i++] = ALU_DWORD0(SRC0_SEL(1),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_Y),
@@ -776,7 +792,7 @@ int R600_xv_ps_packet(RADEONChipFamily ChipSet, uint32_t* shader)
                                  DST_REL(ABSOLUTE),
                                  DST_ELEM(ELEM_Y),
                                  CLAMP(1));
-    /* 9 */
+    /* 10 */
     shader[i++] = ALU_DWORD0(SRC0_SEL(1),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_Z),
@@ -802,7 +818,7 @@ int R600_xv_ps_packet(RADEONChipFamily ChipSet, uint32_t* shader)
                                  DST_REL(ABSOLUTE),
                                  DST_ELEM(ELEM_Z),
                                  CLAMP(1));
-    /* 10 */
+    /* 11 */
     shader[i++] = ALU_DWORD0(SRC0_SEL(1),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_W),
@@ -828,7 +844,7 @@ int R600_xv_ps_packet(RADEONChipFamily ChipSet, uint32_t* shader)
                                  DST_REL(ABSOLUTE),
                                  DST_ELEM(ELEM_W),
                                  CLAMP(1));
-    /* 11 */
+    /* 12 */
     shader[i++] = ALU_DWORD0(SRC0_SEL(1),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_X),
@@ -854,7 +870,7 @@ int R600_xv_ps_packet(RADEONChipFamily ChipSet, uint32_t* shader)
                                  DST_REL(ABSOLUTE),
                                  DST_ELEM(ELEM_X),
                                  CLAMP(1));
-    /* 12 */
+    /* 13 */
     shader[i++] = ALU_DWORD0(SRC0_SEL(1),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_Y),
@@ -880,7 +896,7 @@ int R600_xv_ps_packet(RADEONChipFamily ChipSet, uint32_t* shader)
                                  DST_REL(ABSOLUTE),
                                  DST_ELEM(ELEM_Y),
                                  CLAMP(1));
-    /* 13 */
+    /* 14 */
     shader[i++] = ALU_DWORD0(SRC0_SEL(1),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_Z),
@@ -906,7 +922,7 @@ int R600_xv_ps_packet(RADEONChipFamily ChipSet, uint32_t* shader)
                                  DST_REL(ABSOLUTE),
                                  DST_ELEM(ELEM_Z),
                                  CLAMP(1));
-    /* 14 */
+    /* 15 */
     shader[i++] = ALU_DWORD0(SRC0_SEL(1),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_W),
@@ -932,7 +948,7 @@ int R600_xv_ps_packet(RADEONChipFamily ChipSet, uint32_t* shader)
                                  DST_REL(ABSOLUTE),
                                  DST_ELEM(ELEM_W),
                                  CLAMP(1));
-    /* 15 */
+    /* 16 */
     shader[i++] = ALU_DWORD0(SRC0_SEL(1),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_X),
@@ -958,7 +974,7 @@ int R600_xv_ps_packet(RADEONChipFamily ChipSet, uint32_t* shader)
                                  DST_REL(ABSOLUTE),
                                  DST_ELEM(ELEM_X),
                                  CLAMP(1));
-    /* 16 */
+    /* 17 */
     shader[i++] = ALU_DWORD0(SRC0_SEL(1),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_Y),
@@ -984,7 +1000,7 @@ int R600_xv_ps_packet(RADEONChipFamily ChipSet, uint32_t* shader)
                                  DST_REL(ABSOLUTE),
                                  DST_ELEM(ELEM_Y),
                                  CLAMP(1));
-    /* 17 */
+    /* 18 */
     shader[i++] = ALU_DWORD0(SRC0_SEL(1),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_Z),
@@ -1010,7 +1026,7 @@ int R600_xv_ps_packet(RADEONChipFamily ChipSet, uint32_t* shader)
                                  DST_REL(ABSOLUTE),
                                  DST_ELEM(ELEM_Z),
                                  CLAMP(1));
-    /* 18 */
+    /* 19 */
     shader[i++] = ALU_DWORD0(SRC0_SEL(1),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_W),
@@ -1036,9 +1052,31 @@ int R600_xv_ps_packet(RADEONChipFamily ChipSet, uint32_t* shader)
                                  DST_REL(ABSOLUTE),
                                  DST_ELEM(ELEM_W),
                                  CLAMP(1));
-    shader[i++] = 0x00000000;
-    shader[i++] = 0x00000000;
-    /* 20/21 */
+    /* 20 */
+    shader[i++] = CF_DWORD0(ADDR(22));
+    shader[i++] = CF_DWORD1(POP_COUNT(0),
+                            CF_CONST(0),
+                            COND(SQ_CF_COND_ACTIVE),
+                            I_COUNT(3),
+                            CALL_COUNT(0),
+                            END_OF_PROGRAM(0),
+                            VALID_PIXEL_MODE(0),
+                            CF_INST(SQ_CF_INST_TEX),
+                            WHOLE_QUAD_MODE(0),
+                            BARRIER(1));
+    /* 21 */
+    shader[i++] = CF_DWORD0(ADDR(0));
+    shader[i++] = CF_DWORD1(POP_COUNT(0),
+			    CF_CONST(0),
+			    COND(SQ_CF_COND_ACTIVE),
+			    I_COUNT(0),
+			    CALL_COUNT(0),
+			    END_OF_PROGRAM(0),
+			    VALID_PIXEL_MODE(0),
+			    CF_INST(SQ_CF_INST_RETURN),
+			    WHOLE_QUAD_MODE(0),
+			    BARRIER(1));
+    /* 22/23 */
     shader[i++] = TEX_DWORD0(TEX_INST(SQ_TEX_INST_SAMPLE),
                              BC_FRAC_MODE(0),
                              FETCH_WHOLE_QUAD(0),
@@ -1066,7 +1104,7 @@ int R600_xv_ps_packet(RADEONChipFamily ChipSet, uint32_t* shader)
                              SRC_SEL_Z(SQ_SEL_0),
                              SRC_SEL_W(SQ_SEL_1));
     shader[i++] = TEX_DWORD_PAD;
-    /* 22/23 */
+    /* 24/25 */
     shader[i++] = TEX_DWORD0(TEX_INST(SQ_TEX_INST_SAMPLE),
                              BC_FRAC_MODE(0),
                              FETCH_WHOLE_QUAD(0),
@@ -1077,8 +1115,8 @@ int R600_xv_ps_packet(RADEONChipFamily ChipSet, uint32_t* shader)
     shader[i++] = TEX_DWORD1(DST_GPR(1),
                              DST_REL(ABSOLUTE),
                              DST_SEL_X(SQ_SEL_MASK),
-                             DST_SEL_Y(SQ_SEL_X),
-                             DST_SEL_Z(SQ_SEL_Y),
+                             DST_SEL_Y(SQ_SEL_MASK),
+                             DST_SEL_Z(SQ_SEL_X),
                              DST_SEL_W(SQ_SEL_MASK),
                              LOD_BIAS(0),
                              COORD_TYPE_X(TEX_NORMALIZED),
@@ -1094,503 +1132,20 @@ int R600_xv_ps_packet(RADEONChipFamily ChipSet, uint32_t* shader)
                              SRC_SEL_Z(SQ_SEL_0),
                              SRC_SEL_W(SQ_SEL_1));
     shader[i++] = TEX_DWORD_PAD;
-
-    return i;
-}
-
-/*
- * ; xv ps planar
- * 00 TEX: ADDR(20) CNT(3) NO_BARRIER 
- *       0  SAMPLE R1.x__1, R0.xy01, t0, s0
- *       1  SAMPLE R1.__x_, R0.xy01, t1, s1
- *       2  SAMPLE R1._x__, R0.xy01, t2, s2
- * 01 ALU: ADDR(3) CNT(16) 
- *       3  x: MULADD      R1.x,  R1.x,  C3.x,  C3.y      CLAMP 
- *          y: MULADD      R1.y,  R1.y,  C3.z,  C3.w      
- *          z: MULADD      R1.z,  R1.z,  C3.z,  C3.w      
- *          w: MOV         R1.w,  0.0f 
- *       4  x: DOT4        R2.x,  R1.x,  C0.x      CLAMP VEC_102 
- *          y: DOT4        ____,  R1.y,  C0.y      CLAMP VEC_102 
- *          z: DOT4        ____,  R1.z,  C0.z      CLAMP VEC_102 
- *          w: DOT4        ____,  R1.w,  C0.w      CLAMP VEC_021 
- *       5  x: DOT4        ____,  R1.x,  C1.x      CLAMP VEC_102 
- *          y: DOT4        R2.y,  R1.y,  C1.y      CLAMP VEC_102 
- *          z: DOT4        ____,  R1.z,  C1.z      CLAMP VEC_102 
- *          w: DOT4        ____,  R1.w,  C1.w      CLAMP VEC_021 
- *       6  x: DOT4        ____,  R1.x,  C2.x      CLAMP VEC_102 
- *          y: DOT4        ____,  R1.y,  C2.y      CLAMP VEC_102 
- *          z: DOT4        R2.z,  R1.z,  C2.z      CLAMP VEC_102 
- *          w: DOT4        ____,  R1.w,  C2.w      CLAMP VEC_021 
- * 02 EXP_DONE: PIX0, R2
- * END_OF_PROGRAM
- */
-int R600_xv_ps_planar(RADEONChipFamily ChipSet, uint32_t* shader)
-{
-    int i=0;
-
-    /* 0 */
-    shader[i++] = CF_DWORD0(ADDR(20));
-    shader[i++] = CF_DWORD1(POP_COUNT(0),
-                            CF_CONST(0),
-                            COND(SQ_CF_COND_ACTIVE),
-                            I_COUNT(3),
-                            CALL_COUNT(0),
-                            END_OF_PROGRAM(0),
-                            VALID_PIXEL_MODE(0),
-                            CF_INST(SQ_CF_INST_TEX),
-                            WHOLE_QUAD_MODE(0),
-                            BARRIER(0));
-    /* 1 */
-    shader[i++] = CF_ALU_DWORD0(ADDR(3),
-                                KCACHE_BANK0(0),
-                                KCACHE_BANK1(0),
-                                KCACHE_MODE0(SQ_CF_KCACHE_NOP));
-    shader[i++] = CF_ALU_DWORD1(KCACHE_MODE1(SQ_CF_KCACHE_NOP),
-                                KCACHE_ADDR0(0),
-                                KCACHE_ADDR1(0),
-                                I_COUNT(16),
-                                USES_WATERFALL(0),
-                                CF_INST(SQ_CF_INST_ALU),
-                                WHOLE_QUAD_MODE(0),
-                                BARRIER(1));
-    /* 2 */
-    shader[i++] = CF_ALLOC_IMP_EXP_DWORD0(ARRAY_BASE(CF_PIXEL_MRT0),
-                                          TYPE(SQ_EXPORT_PIXEL),
-                                          RW_GPR(2),
-                                          RW_REL(ABSOLUTE),
-                                          INDEX_GPR(0),
-                                          ELEM_SIZE(3));
-    shader[i++] = CF_ALLOC_IMP_EXP_DWORD1_SWIZ(SRC_SEL_X(SQ_SEL_X),
-                                               SRC_SEL_Y(SQ_SEL_Y),
-                                               SRC_SEL_Z(SQ_SEL_Z),
-                                               SRC_SEL_W(SQ_SEL_W),
-                                               R6xx_ELEM_LOOP(0),
-                                               BURST_COUNT(1),
-                                               END_OF_PROGRAM(1),
-                                               VALID_PIXEL_MODE(0),
-                                               CF_INST(SQ_CF_INST_EXPORT_DONE),
-                                               WHOLE_QUAD_MODE(0),
-                                               BARRIER(1));
-    /* 3 */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(1),
-                             SRC0_REL(ABSOLUTE),
-                             SRC0_ELEM(ELEM_X),
-                             SRC0_NEG(0),
-                             SRC1_SEL(259),
-                             SRC1_REL(ABSOLUTE),
-                             SRC1_ELEM(ELEM_X),
-                             SRC1_NEG(0),
-                             INDEX_MODE(SQ_INDEX_LOOP),
-                             PRED_SEL(SQ_PRED_SEL_OFF),
-                             LAST(0));
-    shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(259),
-                                 SRC2_REL(ABSOLUTE),
-                                 SRC2_ELEM(ELEM_Y),
-                                 SRC2_NEG(0),
-                                 ALU_INST(SQ_OP3_INST_MULADD),
-                                 BANK_SWIZZLE(SQ_ALU_VEC_012),
-                                 DST_GPR(1),
-                                 DST_REL(ABSOLUTE),
-                                 DST_ELEM(ELEM_X),
-                                 CLAMP(1));
-    /* 4 */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(1),
-                             SRC0_REL(ABSOLUTE),
-                             SRC0_ELEM(ELEM_Y),
-                             SRC0_NEG(0),
-                             SRC1_SEL(259),
-                             SRC1_REL(ABSOLUTE),
-                             SRC1_ELEM(ELEM_Z),
-                             SRC1_NEG(0),
-                             INDEX_MODE(SQ_INDEX_LOOP),
-                             PRED_SEL(SQ_PRED_SEL_OFF),
-                             LAST(0));
-    shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(259),
-                                 SRC2_REL(ABSOLUTE),
-                                 SRC2_ELEM(ELEM_W),
-                                 SRC2_NEG(0),
-                                 ALU_INST(SQ_OP3_INST_MULADD),
-                                 BANK_SWIZZLE(SQ_ALU_VEC_012),
-                                 DST_GPR(1),
-                                 DST_REL(ABSOLUTE),
-                                 DST_ELEM(ELEM_Y),
-                                 CLAMP(0));
-    /* 5 */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(1),
-                             SRC0_REL(ABSOLUTE),
-                             SRC0_ELEM(ELEM_Z),
-                             SRC0_NEG(0),
-                             SRC1_SEL(259),
-                             SRC1_REL(ABSOLUTE),
-                             SRC1_ELEM(ELEM_Z),
-                             SRC1_NEG(0),
-                             INDEX_MODE(SQ_INDEX_LOOP),
-                             PRED_SEL(SQ_PRED_SEL_OFF),
-                             LAST(0));
-    shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(259),
-                                 SRC2_REL(ABSOLUTE),
-                                 SRC2_ELEM(ELEM_W),
-                                 SRC2_NEG(0),
-                                 ALU_INST(SQ_OP3_INST_MULADD),
-                                 BANK_SWIZZLE(SQ_ALU_VEC_012),
-                                 DST_GPR(1),
-                                 DST_REL(ABSOLUTE),
-                                 DST_ELEM(ELEM_Z),
-                                 CLAMP(0));
-    /* 6 */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(SQ_ALU_SRC_0),
-                             SRC0_REL(ABSOLUTE),
-                             SRC0_ELEM(ELEM_X),
-                             SRC0_NEG(0),
-                             SRC1_SEL(SQ_ALU_SRC_0),
-                             SRC1_REL(ABSOLUTE),
-                             SRC1_ELEM(ELEM_X),
-                             SRC1_NEG(0),
-                             INDEX_MODE(SQ_INDEX_LOOP),
-                             PRED_SEL(SQ_PRED_SEL_OFF),
-                             LAST(1));
-    shader[i++] = ALU_DWORD1_OP2(ChipSet,
-                                 SRC0_ABS(0),
-                                 SRC1_ABS(0),
-                                 UPDATE_EXECUTE_MASK(0),
-                                 UPDATE_PRED(0),
-                                 WRITE_MASK(1),
-                                 FOG_MERGE(0),
-                                 OMOD(SQ_ALU_OMOD_OFF),
-                                 ALU_INST(SQ_OP2_INST_MOV),
-                                 BANK_SWIZZLE(SQ_ALU_VEC_012),
-                                 DST_GPR(1),
-                                 DST_REL(ABSOLUTE),
-                                 DST_ELEM(ELEM_W),
-                                 CLAMP(0));
-    /* 7 */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(1),
-                             SRC0_REL(ABSOLUTE),
-                             SRC0_ELEM(ELEM_X),
-                             SRC0_NEG(0),
-                             SRC1_SEL(256),
-                             SRC1_REL(ABSOLUTE),
-                             SRC1_ELEM(ELEM_X),
-                             SRC1_NEG(0),
-                             INDEX_MODE(SQ_INDEX_LOOP),
-                             PRED_SEL(SQ_PRED_SEL_OFF),
-                             LAST(0));
-    shader[i++] = ALU_DWORD1_OP2(ChipSet,
-                                 SRC0_ABS(0),
-                                 SRC1_ABS(0),
-                                 UPDATE_EXECUTE_MASK(0),
-                                 UPDATE_PRED(0),
-                                 WRITE_MASK(1),
-                                 FOG_MERGE(0),
-                                 OMOD(SQ_ALU_OMOD_OFF),
-                                 ALU_INST(SQ_OP2_INST_DOT4),
-                                 BANK_SWIZZLE(SQ_ALU_VEC_102),
-                                 DST_GPR(2),
-                                 DST_REL(ABSOLUTE),
-                                 DST_ELEM(ELEM_X),
-                                 CLAMP(1));
-    /* 8 */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(1),
-                             SRC0_REL(ABSOLUTE),
-                             SRC0_ELEM(ELEM_Y),
-                             SRC0_NEG(0),
-                             SRC1_SEL(256),
-                             SRC1_REL(ABSOLUTE),
-                             SRC1_ELEM(ELEM_Y),
-                             SRC1_NEG(0),
-                             INDEX_MODE(SQ_INDEX_LOOP),
-                             PRED_SEL(SQ_PRED_SEL_OFF),
-                             LAST(0));
-    shader[i++] = ALU_DWORD1_OP2(ChipSet,
-                                 SRC0_ABS(0),
-                                 SRC1_ABS(0),
-                                 UPDATE_EXECUTE_MASK(0),
-                                 UPDATE_PRED(0),
-                                 WRITE_MASK(0),
-                                 FOG_MERGE(0),
-                                 OMOD(SQ_ALU_OMOD_OFF),
-                                 ALU_INST(SQ_OP2_INST_DOT4),
-                                 BANK_SWIZZLE(SQ_ALU_VEC_102),
-                                 DST_GPR(0),
-                                 DST_REL(ABSOLUTE),
-                                 DST_ELEM(ELEM_Y),
-                                 CLAMP(1));
-    /* 9 */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(1),
-                             SRC0_REL(ABSOLUTE),
-                             SRC0_ELEM(ELEM_Z),
-                             SRC0_NEG(0),
-                             SRC1_SEL(256),
-                             SRC1_REL(ABSOLUTE),
-                             SRC1_ELEM(ELEM_Z),
-                             SRC1_NEG(0),
-                             INDEX_MODE(SQ_INDEX_LOOP),
-                             PRED_SEL(SQ_PRED_SEL_OFF),
-                             LAST(0));
-    shader[i++] = ALU_DWORD1_OP2(ChipSet,
-                                 SRC0_ABS(0),
-                                 SRC1_ABS(0),
-                                 UPDATE_EXECUTE_MASK(0),
-                                 UPDATE_PRED(0),
-                                 WRITE_MASK(0),
-                                 FOG_MERGE(0),
-                                 OMOD(SQ_ALU_OMOD_OFF),
-                                 ALU_INST(SQ_OP2_INST_DOT4),
-                                 BANK_SWIZZLE(SQ_ALU_VEC_102),
-                                 DST_GPR(0),
-                                 DST_REL(ABSOLUTE),
-                                 DST_ELEM(ELEM_Z),
-                                 CLAMP(1));
-    /* 10 */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(1),
-                             SRC0_REL(ABSOLUTE),
-                             SRC0_ELEM(ELEM_W),
-                             SRC0_NEG(0),
-                             SRC1_SEL(256),
-                             SRC1_REL(ABSOLUTE),
-                             SRC1_ELEM(ELEM_W),
-                             SRC1_NEG(0),
-                             INDEX_MODE(SQ_INDEX_LOOP),
-                             PRED_SEL(SQ_PRED_SEL_OFF),
-                             LAST(1));
-    shader[i++] = ALU_DWORD1_OP2(ChipSet,
-                                 SRC0_ABS(0),
-                                 SRC1_ABS(0),
-                                 UPDATE_EXECUTE_MASK(0),
-                                 UPDATE_PRED(0),
-                                 WRITE_MASK(0),
-                                 FOG_MERGE(0),
-                                 OMOD(SQ_ALU_OMOD_OFF),
-                                 ALU_INST(SQ_OP2_INST_DOT4),
-                                 BANK_SWIZZLE(SQ_ALU_VEC_021),
-                                 DST_GPR(0),
-                                 DST_REL(ABSOLUTE),
-                                 DST_ELEM(ELEM_W),
-                                 CLAMP(1));
-    /* 11 */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(1),
-                             SRC0_REL(ABSOLUTE),
-                             SRC0_ELEM(ELEM_X),
-                             SRC0_NEG(0),
-                             SRC1_SEL(257),
-                             SRC1_REL(ABSOLUTE),
-                             SRC1_ELEM(ELEM_X),
-                             SRC1_NEG(0),
-                             INDEX_MODE(SQ_INDEX_LOOP),
-                             PRED_SEL(SQ_PRED_SEL_OFF),
-                             LAST(0));
-    shader[i++] = ALU_DWORD1_OP2(ChipSet,
-                                 SRC0_ABS(0),
-                                 SRC1_ABS(0),
-                                 UPDATE_EXECUTE_MASK(0),
-                                 UPDATE_PRED(0),
-                                 WRITE_MASK(0),
-                                 FOG_MERGE(0),
-                                 OMOD(SQ_ALU_OMOD_OFF),
-                                 ALU_INST(SQ_OP2_INST_DOT4),
-                                 BANK_SWIZZLE(SQ_ALU_VEC_102),
-                                 DST_GPR(0),
-                                 DST_REL(ABSOLUTE),
-                                 DST_ELEM(ELEM_X),
-                                 CLAMP(1));
-    /* 12 */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(1),
-                             SRC0_REL(ABSOLUTE),
-                             SRC0_ELEM(ELEM_Y),
-                             SRC0_NEG(0),
-                             SRC1_SEL(257),
-                             SRC1_REL(ABSOLUTE),
-                             SRC1_ELEM(ELEM_Y),
-                             SRC1_NEG(0),
-                             INDEX_MODE(SQ_INDEX_LOOP),
-                             PRED_SEL(SQ_PRED_SEL_OFF),
-                             LAST(0));
-    shader[i++] = ALU_DWORD1_OP2(ChipSet,
-                                 SRC0_ABS(0),
-                                 SRC1_ABS(0),
-                                 UPDATE_EXECUTE_MASK(0),
-                                 UPDATE_PRED(0),
-                                 WRITE_MASK(1),
-                                 FOG_MERGE(0),
-                                 OMOD(SQ_ALU_OMOD_OFF),
-                                 ALU_INST(SQ_OP2_INST_DOT4),
-                                 BANK_SWIZZLE(SQ_ALU_VEC_102),
-                                 DST_GPR(2),
-                                 DST_REL(ABSOLUTE),
-                                 DST_ELEM(ELEM_Y),
-                                 CLAMP(1));
-    /* 13 */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(1),
-                             SRC0_REL(ABSOLUTE),
-                             SRC0_ELEM(ELEM_Z),
-                             SRC0_NEG(0),
-                             SRC1_SEL(257),
-                             SRC1_REL(ABSOLUTE),
-                             SRC1_ELEM(ELEM_Z),
-                             SRC1_NEG(0),
-                             INDEX_MODE(SQ_INDEX_LOOP),
-                             PRED_SEL(SQ_PRED_SEL_OFF),
-                             LAST(0));
-    shader[i++] = ALU_DWORD1_OP2(ChipSet,
-                                 SRC0_ABS(0),
-                                 SRC1_ABS(0),
-                                 UPDATE_EXECUTE_MASK(0),
-                                 UPDATE_PRED(0),
-                                 WRITE_MASK(0),
-                                 FOG_MERGE(0),
-                                 OMOD(SQ_ALU_OMOD_OFF),
-                                 ALU_INST(SQ_OP2_INST_DOT4),
-                                 BANK_SWIZZLE(SQ_ALU_VEC_102),
-                                 DST_GPR(0),
-                                 DST_REL(ABSOLUTE),
-                                 DST_ELEM(ELEM_Z),
-                                 CLAMP(1));
-    /* 14 */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(1),
-                             SRC0_REL(ABSOLUTE),
-                             SRC0_ELEM(ELEM_W),
-                             SRC0_NEG(0),
-                             SRC1_SEL(257),
-                             SRC1_REL(ABSOLUTE),
-                             SRC1_ELEM(ELEM_W),
-                             SRC1_NEG(0),
-                             INDEX_MODE(SQ_INDEX_LOOP),
-                             PRED_SEL(SQ_PRED_SEL_OFF),
-                             LAST(1));
-    shader[i++] = ALU_DWORD1_OP2(ChipSet,
-                                 SRC0_ABS(0),
-                                 SRC1_ABS(0),
-                                 UPDATE_EXECUTE_MASK(0),
-                                 UPDATE_PRED(0),
-                                 WRITE_MASK(0),
-                                 FOG_MERGE(0),
-                                 OMOD(SQ_ALU_OMOD_OFF),
-                                 ALU_INST(SQ_OP2_INST_DOT4),
-                                 BANK_SWIZZLE(SQ_ALU_VEC_021),
-                                 DST_GPR(0),
-                                 DST_REL(ABSOLUTE),
-                                 DST_ELEM(ELEM_W),
-                                 CLAMP(1));
-    /* 15 */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(1),
-                             SRC0_REL(ABSOLUTE),
-                             SRC0_ELEM(ELEM_X),
-                             SRC0_NEG(0),
-                             SRC1_SEL(258),
-                             SRC1_REL(ABSOLUTE),
-                             SRC1_ELEM(ELEM_X),
-                             SRC1_NEG(0),
-                             INDEX_MODE(SQ_INDEX_LOOP),
-                             PRED_SEL(SQ_PRED_SEL_OFF),
-                             LAST(0));
-    shader[i++] = ALU_DWORD1_OP2(ChipSet,
-                                 SRC0_ABS(0),
-                                 SRC1_ABS(0),
-                                 UPDATE_EXECUTE_MASK(0),
-                                 UPDATE_PRED(0),
-                                 WRITE_MASK(0),
-                                 FOG_MERGE(0),
-                                 OMOD(SQ_ALU_OMOD_OFF),
-                                 ALU_INST(SQ_OP2_INST_DOT4),
-                                 BANK_SWIZZLE(SQ_ALU_VEC_102),
-                                 DST_GPR(0),
-                                 DST_REL(ABSOLUTE),
-                                 DST_ELEM(ELEM_X),
-                                 CLAMP(1));
-    /* 16 */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(1),
-                             SRC0_REL(ABSOLUTE),
-                             SRC0_ELEM(ELEM_Y),
-                             SRC0_NEG(0),
-                             SRC1_SEL(258),
-                             SRC1_REL(ABSOLUTE),
-                             SRC1_ELEM(ELEM_Y),
-                             SRC1_NEG(0),
-                             INDEX_MODE(SQ_INDEX_LOOP),
-                             PRED_SEL(SQ_PRED_SEL_OFF),
-                             LAST(0));
-    shader[i++] = ALU_DWORD1_OP2(ChipSet,
-                                 SRC0_ABS(0),
-                                 SRC1_ABS(0),
-                                 UPDATE_EXECUTE_MASK(0),
-                                 UPDATE_PRED(0),
-                                 WRITE_MASK(0),
-                                 FOG_MERGE(0),
-                                 OMOD(SQ_ALU_OMOD_OFF),
-                                 ALU_INST(SQ_OP2_INST_DOT4),
-                                 BANK_SWIZZLE(SQ_ALU_VEC_102),
-                                 DST_GPR(0),
-                                 DST_REL(ABSOLUTE),
-                                 DST_ELEM(ELEM_Y),
-                                 CLAMP(1));
-    /* 17 */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(1),
-                             SRC0_REL(ABSOLUTE),
-                             SRC0_ELEM(ELEM_Z),
-                             SRC0_NEG(0),
-                             SRC1_SEL(258),
-                             SRC1_REL(ABSOLUTE),
-                             SRC1_ELEM(ELEM_Z),
-                             SRC1_NEG(0),
-                             INDEX_MODE(SQ_INDEX_LOOP),
-                             PRED_SEL(SQ_PRED_SEL_OFF),
-                             LAST(0));
-    shader[i++] = ALU_DWORD1_OP2(ChipSet,
-                                 SRC0_ABS(0),
-                                 SRC1_ABS(0),
-                                 UPDATE_EXECUTE_MASK(0),
-                                 UPDATE_PRED(0),
-                                 WRITE_MASK(1),
-                                 FOG_MERGE(0),
-                                 OMOD(SQ_ALU_OMOD_OFF),
-                                 ALU_INST(SQ_OP2_INST_DOT4),
-                                 BANK_SWIZZLE(SQ_ALU_VEC_102),
-                                 DST_GPR(2),
-                                 DST_REL(ABSOLUTE),
-                                 DST_ELEM(ELEM_Z),
-                                 CLAMP(1));
-    /* 18 */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(1),
-                             SRC0_REL(ABSOLUTE),
-                             SRC0_ELEM(ELEM_W),
-                             SRC0_NEG(0),
-                             SRC1_SEL(258),
-                             SRC1_REL(ABSOLUTE),
-                             SRC1_ELEM(ELEM_W),
-                             SRC1_NEG(0),
-                             INDEX_MODE(SQ_INDEX_LOOP),
-                             PRED_SEL(SQ_PRED_SEL_OFF),
-                             LAST(1));
-    shader[i++] = ALU_DWORD1_OP2(ChipSet,
-                                 SRC0_ABS(0),
-                                 SRC1_ABS(0),
-                                 UPDATE_EXECUTE_MASK(0),
-                                 UPDATE_PRED(0),
-                                 WRITE_MASK(0),
-                                 FOG_MERGE(0),
-                                 OMOD(SQ_ALU_OMOD_OFF),
-                                 ALU_INST(SQ_OP2_INST_DOT4),
-                                 BANK_SWIZZLE(SQ_ALU_VEC_021),
-                                 DST_GPR(0),
-                                 DST_REL(ABSOLUTE),
-                                 DST_ELEM(ELEM_W),
-                                 CLAMP(1));
-    shader[i++] = 0x00000000;
-    shader[i++] = 0x00000000;
-    /* 20/21 */
+    /* 26/27 */
     shader[i++] = TEX_DWORD0(TEX_INST(SQ_TEX_INST_SAMPLE),
                              BC_FRAC_MODE(0),
                              FETCH_WHOLE_QUAD(0),
-                             RESOURCE_ID(0),
+                             RESOURCE_ID(2),
                              SRC_GPR(0),
                              SRC_REL(ABSOLUTE),
                              R7xx_ALT_CONST(0));
     shader[i++] = TEX_DWORD1(DST_GPR(1),
                              DST_REL(ABSOLUTE),
-                             DST_SEL_X(SQ_SEL_X),
-                             DST_SEL_Y(SQ_SEL_MASK),
+                             DST_SEL_X(SQ_SEL_MASK),
+                             DST_SEL_Y(SQ_SEL_X),
                              DST_SEL_Z(SQ_SEL_MASK),
-                             DST_SEL_W(SQ_SEL_1),
+                             DST_SEL_W(SQ_SEL_MASK),
                              LOD_BIAS(0),
                              COORD_TYPE_X(TEX_NORMALIZED),
                              COORD_TYPE_Y(TEX_NORMALIZED),
@@ -1599,26 +1154,50 @@ int R600_xv_ps_planar(RADEONChipFamily ChipSet, uint32_t* shader)
     shader[i++] = TEX_DWORD2(OFFSET_X(0),
                              OFFSET_Y(0),
                              OFFSET_Z(0),
-                             SAMPLER_ID(0),
+                             SAMPLER_ID(2),
                              SRC_SEL_X(SQ_SEL_X),
                              SRC_SEL_Y(SQ_SEL_Y),
                              SRC_SEL_Z(SQ_SEL_0),
                              SRC_SEL_W(SQ_SEL_1));
     shader[i++] = TEX_DWORD_PAD;
-    /* 22/23 */
+    /* 28 */
+    shader[i++] = CF_DWORD0(ADDR(30));
+    shader[i++] = CF_DWORD1(POP_COUNT(0),
+                            CF_CONST(0),
+                            COND(SQ_CF_COND_ACTIVE),
+                            I_COUNT(2),
+                            CALL_COUNT(0),
+                            END_OF_PROGRAM(0),
+                            VALID_PIXEL_MODE(0),
+                            CF_INST(SQ_CF_INST_TEX),
+                            WHOLE_QUAD_MODE(0),
+                            BARRIER(1));
+    /* 29 */
+    shader[i++] = CF_DWORD0(ADDR(0));
+    shader[i++] = CF_DWORD1(POP_COUNT(0),
+			    CF_CONST(0),
+			    COND(SQ_CF_COND_ACTIVE),
+			    I_COUNT(0),
+			    CALL_COUNT(0),
+			    END_OF_PROGRAM(0),
+			    VALID_PIXEL_MODE(0),
+			    CF_INST(SQ_CF_INST_RETURN),
+			    WHOLE_QUAD_MODE(0),
+			    BARRIER(1));
+    /* 30/31 */
     shader[i++] = TEX_DWORD0(TEX_INST(SQ_TEX_INST_SAMPLE),
                              BC_FRAC_MODE(0),
                              FETCH_WHOLE_QUAD(0),
-                             RESOURCE_ID(1),
+                             RESOURCE_ID(0),
                              SRC_GPR(0),
                              SRC_REL(ABSOLUTE),
                              R7xx_ALT_CONST(0));
     shader[i++] = TEX_DWORD1(DST_GPR(1),
                              DST_REL(ABSOLUTE),
-                             DST_SEL_X(SQ_SEL_MASK),
+                             DST_SEL_X(SQ_SEL_X),
                              DST_SEL_Y(SQ_SEL_MASK),
-                             DST_SEL_Z(SQ_SEL_X),
-                             DST_SEL_W(SQ_SEL_MASK),
+                             DST_SEL_Z(SQ_SEL_MASK),
+                             DST_SEL_W(SQ_SEL_1),
                              LOD_BIAS(0),
                              COORD_TYPE_X(TEX_NORMALIZED),
                              COORD_TYPE_Y(TEX_NORMALIZED),
@@ -1627,17 +1206,17 @@ int R600_xv_ps_planar(RADEONChipFamily ChipSet, uint32_t* shader)
     shader[i++] = TEX_DWORD2(OFFSET_X(0),
                              OFFSET_Y(0),
                              OFFSET_Z(0),
-                             SAMPLER_ID(1),
+                             SAMPLER_ID(0),
                              SRC_SEL_X(SQ_SEL_X),
                              SRC_SEL_Y(SQ_SEL_Y),
                              SRC_SEL_Z(SQ_SEL_0),
                              SRC_SEL_W(SQ_SEL_1));
     shader[i++] = TEX_DWORD_PAD;
-    /* 24/25 */
+    /* 32/33 */
     shader[i++] = TEX_DWORD0(TEX_INST(SQ_TEX_INST_SAMPLE),
                              BC_FRAC_MODE(0),
                              FETCH_WHOLE_QUAD(0),
-                             RESOURCE_ID(2),
+                             RESOURCE_ID(1),
                              SRC_GPR(0),
                              SRC_REL(ABSOLUTE),
                              R7xx_ALT_CONST(0));
@@ -1645,7 +1224,7 @@ int R600_xv_ps_planar(RADEONChipFamily ChipSet, uint32_t* shader)
                              DST_REL(ABSOLUTE),
                              DST_SEL_X(SQ_SEL_MASK),
                              DST_SEL_Y(SQ_SEL_X),
-                             DST_SEL_Z(SQ_SEL_MASK),
+                             DST_SEL_Z(SQ_SEL_Y),
                              DST_SEL_W(SQ_SEL_MASK),
                              LOD_BIAS(0),
                              COORD_TYPE_X(TEX_NORMALIZED),
@@ -1655,7 +1234,7 @@ int R600_xv_ps_planar(RADEONChipFamily ChipSet, uint32_t* shader)
     shader[i++] = TEX_DWORD2(OFFSET_X(0),
                              OFFSET_Y(0),
                              OFFSET_Z(0),
-                             SAMPLER_ID(2),
+                             SAMPLER_ID(1),
                              SRC_SEL_X(SQ_SEL_X),
                              SRC_SEL_Y(SQ_SEL_Y),
                              SRC_SEL_Z(SQ_SEL_0),
diff --git a/src/r600_shader.h b/src/r600_shader.h
index 7333d0b..ffb50c3 100644
--- a/src/r600_shader.h
+++ b/src/r600_shader.h
@@ -350,8 +350,7 @@ extern int R600_copy_vs(RADEONChipFamily ChipSet, uint32_t* vs);
 extern int R600_copy_ps(RADEONChipFamily ChipSet, uint32_t* ps);
 
 extern int R600_xv_vs(RADEONChipFamily ChipSet, uint32_t* shader);
-extern int R600_xv_ps_packet(RADEONChipFamily ChipSet, uint32_t* shader);
-extern int R600_xv_ps_planar(RADEONChipFamily ChipSet, uint32_t* shader);
+extern int R600_xv_ps(RADEONChipFamily ChipSet, uint32_t* shader);
 
 extern int R600_comp_mask_vs(RADEONChipFamily ChipSet, uint32_t* vs);
 extern int R600_comp_mask_ps(RADEONChipFamily ChipSet,
diff --git a/src/r600_state.h b/src/r600_state.h
index e3f491b..8e7334d 100644
--- a/src/r600_state.h
+++ b/src/r600_state.h
@@ -255,6 +255,10 @@ ps_setup(ScrnInfoPtr pScrn, drmBufPtr ib, shader_config_t *ps_conf);
 void
 set_alu_consts(ScrnInfoPtr pScrn, drmBufPtr ib, int offset, int count, float *const_buf);
 void
+set_bool_consts(ScrnInfoPtr pScrn, drmBufPtr ib, int offset, int count, uint32_t *const_buf);
+void
+set_loop_consts(ScrnInfoPtr pScrn, drmBufPtr ib, int offset, int count, uint32_t *const_buf);
+void
 set_vtx_resource(ScrnInfoPtr pScrn, drmBufPtr ib, vtx_resource_t *res);
 void
 set_tex_resource(ScrnInfoPtr pScrn, drmBufPtr ib, tex_resource_t *tex_res);
diff --git a/src/r600_textured_videofuncs.c b/src/r600_textured_videofuncs.c
index bf98ec7..d208b07 100644
--- a/src/r600_textured_videofuncs.c
+++ b/src/r600_textured_videofuncs.c
@@ -115,7 +115,7 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
     tex_sampler_t   tex_samp;
     shader_config_t vs_conf, ps_conf;
     int uv_offset;
-
+    uint32_t bool_consts[1] = { 0 };
     static float ps_alu_consts[] = {
         1.0,  0.0,      1.4020,   0,  /* r - c[0] */
         1.0, -0.34414, -0.71414,  0,  /* g - c[1] */
@@ -166,17 +166,18 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
     accel_state->vs_mc_addr = info->fbLocation + pScrn->fbOffset + accel_state->shaders->offset +
 	accel_state->xv_vs_offset;
 
+    accel_state->ps_mc_addr = info->fbLocation + pScrn->fbOffset + accel_state->shaders->offset +
+	accel_state->xv_ps_offset;
+
     switch(pPriv->id) {
     case FOURCC_YV12:
     case FOURCC_I420:
-	accel_state->ps_mc_addr = info->fbLocation + pScrn->fbOffset + accel_state->shaders->offset +
-	    accel_state->xv_ps_offset_planar;
+	bool_consts[0] = 1;
 	break;
     case FOURCC_UYVY:
     case FOURCC_YUY2:
     default:
-	accel_state->ps_mc_addr = info->fbLocation + pScrn->fbOffset + accel_state->shaders->offset +
-	    accel_state->xv_ps_offset_packed;
+	bool_consts[0] = 0;
 	break;
     }
 
@@ -200,14 +201,19 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 
     ps_conf.shader_addr         = accel_state->ps_mc_addr;
     ps_conf.num_gprs            = 3;
-    ps_conf.stack_size          = 0;
+    ps_conf.stack_size          = 1;
     ps_conf.uncached_first_inst = 1;
     ps_conf.clamp_consts        = 0;
     ps_conf.export_mode         = 2;
     ps_setup                    (pScrn, accel_state->ib, &ps_conf);
 
     /* PS alu constants */
-    set_alu_consts(pScrn, accel_state->ib, 0, sizeof(ps_alu_consts) / SQ_ALU_CONSTANT_offset, ps_alu_consts);
+    set_alu_consts(pScrn, accel_state->ib, SQ_ALU_CONSTANT_ps,
+		   sizeof(ps_alu_consts) / SQ_ALU_CONSTANT_offset, ps_alu_consts);
+
+    /* CF bool constants */
+    set_bool_consts(pScrn, accel_state->ib, SQ_BOOL_CONST_ps,
+		    sizeof(bool_consts) / SQ_BOOL_CONST_offset, bool_consts);
 
     /* Texture */
     switch(pPriv->id) {
diff --git a/src/r6xx_accel.c b/src/r6xx_accel.c
index 93c3ae3..aa2ab86 100644
--- a/src/r6xx_accel.c
+++ b/src/r6xx_accel.c
@@ -433,6 +433,29 @@ set_alu_consts(ScrnInfoPtr pScrn, drmBufPtr ib, int offset, int count, float *co
 }
 
 void
+set_bool_consts(ScrnInfoPtr pScrn, drmBufPtr ib, int offset, int count, uint32_t *const_buf)
+{   /* Emit `count` boolean constants from const_buf into ib, addressed from SQ_BOOL_CONST at slot `offset`. */
+    int i;
+    const int countreg = count * (SQ_BOOL_CONST_offset >> 2); /* words to emit; presumably SQ_BOOL_CONST_offset is the per-constant byte stride -- confirm */
+
+    PACK0(ib, SQ_BOOL_CONST + offset * SQ_BOOL_CONST_offset, countreg); /* presumably opens a register write of countreg words at that address -- confirm against the PACK0 macro */
+    for (i = 0; i < countreg; i++)
+	E32(ib, const_buf[i]); /* reads const_buf[0..countreg-1], so the caller must supply at least countreg entries */
+
+}
+
+void
+set_loop_consts(ScrnInfoPtr pScrn, drmBufPtr ib, int offset, int count, uint32_t *const_buf)
+{   /* Emit `count` loop constants from const_buf into ib, addressed from SQ_LOOP_CONST at slot `offset`. */
+    int i;
+    const int countreg = count * (SQ_LOOP_CONST_offset >> 2); /* words to emit; presumably SQ_LOOP_CONST_offset is the per-constant byte stride -- confirm */
+
+    PACK0(ib, SQ_LOOP_CONST + offset * SQ_LOOP_CONST_offset, countreg); /* presumably opens a register write of countreg words at that address -- confirm against the PACK0 macro */
+    for (i = 0; i < countreg; i++)
+	E32(ib, const_buf[i]); /* reads const_buf[0..countreg-1], so the caller must supply at least countreg entries */
+}
+
+void
 set_vtx_resource(ScrnInfoPtr pScrn, drmBufPtr ib, vtx_resource_t *res)
 {
     uint32_t sq_vtx_constant_word2;
diff --git a/src/radeon.h b/src/radeon.h
index 0eea7c1..6fcc36a 100644
--- a/src/radeon.h
+++ b/src/radeon.h
@@ -626,8 +626,7 @@ struct radeon_accel_state {
     uint32_t          comp_mask_vs_offset;
     uint32_t          comp_mask_ps_offset;
     uint32_t          xv_vs_offset;
-    uint32_t          xv_ps_offset_packed;
-    uint32_t          xv_ps_offset_planar;
+    uint32_t          xv_ps_offset;
 
     //size/addr stuff
     uint32_t          src_size[2];


More information about the xorg-commit mailing list