xf86-video-ati: Branch 'master' - 3 commits

Alex Deucher agd5f at kemper.freedesktop.org
Mon Nov 29 15:11:16 PST 2010


 src/evergreen_exa.c    |    2 
 src/evergreen_shader.c |  622 ++++++++++++++++++++++++++++++---------
 src/r600_exa.c         |    2 
 src/r600_shader.c      |  766 ++++++++++++++++++++++++++++++++++++-------------
 src/r600_shader.h      |    4 
 5 files changed, 1054 insertions(+), 342 deletions(-)

New commits:
commit 90f831361844f1b80b3f6bb718ff5ac584d73d48
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Mon Nov 29 18:09:05 2010 -0500

    evergreen: use dot4 for transforms

diff --git a/src/evergreen_exa.c b/src/evergreen_exa.c
index 1c02752..89afaff 100644
--- a/src/evergreen_exa.c
+++ b/src/evergreen_exa.c
@@ -1318,7 +1318,7 @@ static Bool EVERGREENPrepareComposite(int op, PicturePtr pSrcPicture,
     /* Shader */
     vs_conf.shader_addr         = accel_state->vs_mc_addr;
     vs_conf.shader_size         = accel_state->vs_size;
-    vs_conf.num_gprs            = 3;
+    vs_conf.num_gprs            = 5;
     vs_conf.stack_size          = 1;
     vs_conf.bo                  = accel_state->shaders_bo;
     evergreen_vs_setup(pScrn, &vs_conf, RADEON_GEM_DOMAIN_VRAM);
diff --git a/src/evergreen_shader.c b/src/evergreen_shader.c
index 42cea7a..ef56d2d 100644
--- a/src/evergreen_shader.c
+++ b/src/evergreen_shader.c
@@ -1410,7 +1410,7 @@ int evergreen_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
                             WHOLE_QUAD_MODE(0),
                             BARRIER(1));
     /* 3 - mask sub */
-    shader[i++] = CF_DWORD0(ADDR(32),
+    shader[i++] = CF_DWORD0(ADDR(44),
 			    JUMPTABLE_SEL(SQ_CF_JUMPTABLE_SEL_CONST_A));
     shader[i++] = CF_DWORD1(POP_COUNT(0),
 			    CF_CONST(0),
@@ -1430,7 +1430,7 @@ int evergreen_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
     shader[i++] = CF_ALU_DWORD1(KCACHE_MODE1(SQ_CF_KCACHE_NOP),
 				KCACHE_ADDR0(0),
 				KCACHE_ADDR1(0),
-				I_COUNT(12),
+				I_COUNT(20),
 				ALT_CONST(0),
 				CF_INST(SQ_CF_INST_ALU),
 				WHOLE_QUAD_MODE(0),
@@ -1500,7 +1500,7 @@ int evergreen_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
 			    WHOLE_QUAD_MODE(0),
 			    BARRIER(1));
     /* 9 - non-mask sub */
-    shader[i++] = CF_DWORD0(ADDR(38),
+    shader[i++] = CF_DWORD0(ADDR(50),
 			    JUMPTABLE_SEL(SQ_CF_JUMPTABLE_SEL_CONST_A));
     shader[i++] = CF_DWORD1(POP_COUNT(0),
 			    CF_CONST(0),
@@ -1513,14 +1513,14 @@ int evergreen_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
 			    BARRIER(1));
 
     /* 10 - ALU */
-    shader[i++] = CF_ALU_DWORD0(ADDR(26),
+    shader[i++] = CF_ALU_DWORD0(ADDR(34),
 				KCACHE_BANK0(0),
 				KCACHE_BANK1(0),
 				KCACHE_MODE0(SQ_CF_KCACHE_LOCK_1));
     shader[i++] = CF_ALU_DWORD1(KCACHE_MODE1(SQ_CF_KCACHE_NOP),
 				KCACHE_ADDR0(0),
 				KCACHE_ADDR1(0),
-				I_COUNT(6),
+				I_COUNT(10),
 				ALT_CONST(0),
 				CF_INST(SQ_CF_INST_ALU),
 				WHOLE_QUAD_MODE(0),
@@ -1573,189 +1573,408 @@ int evergreen_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
 			    WHOLE_QUAD_MODE(0),
 			    BARRIER(1));
 
-    /* mask alu - 14 srcX MAD */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_KCACHE0_BASE + 0),
+    /* 14 srcX.x DOT4 - mask */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 1),
+                             SRC0_REL(ABSOLUTE),
+                             SRC0_ELEM(ELEM_X),
+                             SRC0_NEG(0),
+                             SRC1_SEL(ALU_SRC_KCACHE0_BASE + 0),
+                             SRC1_REL(ABSOLUTE),
+                             SRC1_ELEM(ELEM_X),
+                             SRC1_NEG(0),
+                             INDEX_MODE(SQ_INDEX_LOOP),
+                             PRED_SEL(SQ_PRED_SEL_OFF),
+                             LAST(0));
+    shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0),
+                                 SRC1_ABS(0),
+                                 UPDATE_EXECUTE_MASK(0),
+                                 UPDATE_PRED(0),
+                                 WRITE_MASK(1),
+                                 OMOD(SQ_ALU_OMOD_OFF),
+                                 ALU_INST(SQ_OP2_INST_DOT4),
+                                 BANK_SWIZZLE(SQ_ALU_VEC_012),
+                                 DST_GPR(3),
+                                 DST_REL(ABSOLUTE),
+                                 DST_ELEM(ELEM_X),
+                                 CLAMP(0));
+
+    /* 15 srcX.y DOT4 - mask */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 1),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_Y),
                              SRC0_NEG(0),
-                             SRC1_SEL(ALU_SRC_GPR_BASE + 1),
+                             SRC1_SEL(ALU_SRC_KCACHE0_BASE + 0),
                              SRC1_REL(ABSOLUTE),
                              SRC1_ELEM(ELEM_Y),
                              SRC1_NEG(0),
                              INDEX_MODE(SQ_INDEX_LOOP),
                              PRED_SEL(SQ_PRED_SEL_OFF),
-                             LAST(1));
-    shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(ALU_SRC_KCACHE0_BASE + 0),
-                                 SRC2_REL(ABSOLUTE),
-                                 SRC2_ELEM(ELEM_Z),
-                                 SRC2_NEG(0),
-                                 ALU_INST(SQ_OP3_INST_MULADD),
+                             LAST(0));
+    shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0),
+                                 SRC1_ABS(0),
+                                 UPDATE_EXECUTE_MASK(0),
+                                 UPDATE_PRED(0),
+                                 WRITE_MASK(0),
+                                 OMOD(SQ_ALU_OMOD_OFF),
+                                 ALU_INST(SQ_OP2_INST_DOT4),
                                  BANK_SWIZZLE(SQ_ALU_VEC_012),
-                                 DST_GPR(1),
+                                 DST_GPR(3),
+                                 DST_REL(ABSOLUTE),
+                                 DST_ELEM(ELEM_Y),
+                                 CLAMP(0));
+
+    /* 16 srcX.z DOT4 - mask */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 1),
+                             SRC0_REL(ABSOLUTE),
+                             SRC0_ELEM(ELEM_Z),
+                             SRC0_NEG(0),
+                             SRC1_SEL(ALU_SRC_KCACHE0_BASE + 0),
+                             SRC1_REL(ABSOLUTE),
+                             SRC1_ELEM(ELEM_Z),
+                             SRC1_NEG(0),
+                             INDEX_MODE(SQ_INDEX_LOOP),
+                             PRED_SEL(SQ_PRED_SEL_OFF),
+                             LAST(0));
+    shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0),
+                                 SRC1_ABS(0),
+                                 UPDATE_EXECUTE_MASK(0),
+                                 UPDATE_PRED(0),
+                                 WRITE_MASK(0),
+                                 OMOD(SQ_ALU_OMOD_OFF),
+                                 ALU_INST(SQ_OP2_INST_DOT4),
+                                 BANK_SWIZZLE(SQ_ALU_VEC_012),
+                                 DST_GPR(3),
                                  DST_REL(ABSOLUTE),
                                  DST_ELEM(ELEM_Z),
                                  CLAMP(0));
-    /* 15 srcY MAD */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_KCACHE0_BASE + 1),
+
+    /* 17 srcX.w DOT4 - mask */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 1),
                              SRC0_REL(ABSOLUTE),
-                             SRC0_ELEM(ELEM_Y),
+                             SRC0_ELEM(ELEM_W),
                              SRC0_NEG(0),
-                             SRC1_SEL(ALU_SRC_GPR_BASE + 1),
+                             SRC1_SEL(ALU_SRC_KCACHE0_BASE + 0),
                              SRC1_REL(ABSOLUTE),
-                             SRC1_ELEM(ELEM_Y),
+                             SRC1_ELEM(ELEM_W),
                              SRC1_NEG(0),
                              INDEX_MODE(SQ_INDEX_LOOP),
                              PRED_SEL(SQ_PRED_SEL_OFF),
                              LAST(1));
-    shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(ALU_SRC_KCACHE0_BASE + 1),
-                                 SRC2_REL(ABSOLUTE),
-                                 SRC2_ELEM(ELEM_Z),
-                                 SRC2_NEG(0),
-                                 ALU_INST(SQ_OP3_INST_MULADD),
+    shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0),
+                                 SRC1_ABS(0),
+                                 UPDATE_EXECUTE_MASK(0),
+                                 UPDATE_PRED(0),
+                                 WRITE_MASK(0),
+                                 OMOD(SQ_ALU_OMOD_OFF),
+                                 ALU_INST(SQ_OP2_INST_DOT4),
                                  BANK_SWIZZLE(SQ_ALU_VEC_012),
-                                 DST_GPR(1),
+                                 DST_GPR(3),
                                  DST_REL(ABSOLUTE),
                                  DST_ELEM(ELEM_W),
                                  CLAMP(0));
 
-    /* 16 srcX MAD */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_KCACHE0_BASE + 0),
+    /* 18 srcY.x DOT4 - mask */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 1),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_X),
                              SRC0_NEG(0),
-                             SRC1_SEL(ALU_SRC_GPR_BASE + 1),
+                             SRC1_SEL(ALU_SRC_KCACHE0_BASE + 1),
                              SRC1_REL(ABSOLUTE),
                              SRC1_ELEM(ELEM_X),
                              SRC1_NEG(0),
                              INDEX_MODE(SQ_INDEX_LOOP),
                              PRED_SEL(SQ_PRED_SEL_OFF),
                              LAST(0));
-    shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(ALU_SRC_GPR_BASE + 1),
-                                 SRC2_REL(ABSOLUTE),
-                                 SRC2_ELEM(ELEM_Z),
-                                 SRC2_NEG(0),
-                                 ALU_INST(SQ_OP3_INST_MULADD),
+    shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0),
+                                 SRC1_ABS(0),
+                                 UPDATE_EXECUTE_MASK(0),
+                                 UPDATE_PRED(0),
+                                 WRITE_MASK(0),
+                                 OMOD(SQ_ALU_OMOD_OFF),
+                                 ALU_INST(SQ_OP2_INST_DOT4),
                                  BANK_SWIZZLE(SQ_ALU_VEC_012),
-                                 DST_GPR(1),
+                                 DST_GPR(3),
                                  DST_REL(ABSOLUTE),
                                  DST_ELEM(ELEM_X),
                                  CLAMP(0));
-    /* 17 srcY MAD */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_KCACHE0_BASE + 1),
+
+    /* 19 srcY.y DOT4 - mask */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 1),
+                             SRC0_REL(ABSOLUTE),
+                             SRC0_ELEM(ELEM_Y),
+                             SRC0_NEG(0),
+                             SRC1_SEL(ALU_SRC_KCACHE0_BASE + 1),
+                             SRC1_REL(ABSOLUTE),
+                             SRC1_ELEM(ELEM_Y),
+                             SRC1_NEG(0),
+                             INDEX_MODE(SQ_INDEX_LOOP),
+                             PRED_SEL(SQ_PRED_SEL_OFF),
+                             LAST(0));
+    shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0),
+                                 SRC1_ABS(0),
+                                 UPDATE_EXECUTE_MASK(0),
+                                 UPDATE_PRED(0),
+                                 WRITE_MASK(1),
+                                 OMOD(SQ_ALU_OMOD_OFF),
+                                 ALU_INST(SQ_OP2_INST_DOT4),
+                                 BANK_SWIZZLE(SQ_ALU_VEC_012),
+                                 DST_GPR(3),
+                                 DST_REL(ABSOLUTE),
+                                 DST_ELEM(ELEM_Y),
+                                 CLAMP(0));
+
+    /* 20 srcY.z DOT4 - mask */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 1),
+                             SRC0_REL(ABSOLUTE),
+                             SRC0_ELEM(ELEM_Z),
+                             SRC0_NEG(0),
+                             SRC1_SEL(ALU_SRC_KCACHE0_BASE + 1),
+                             SRC1_REL(ABSOLUTE),
+                             SRC1_ELEM(ELEM_Z),
+                             SRC1_NEG(0),
+                             INDEX_MODE(SQ_INDEX_LOOP),
+                             PRED_SEL(SQ_PRED_SEL_OFF),
+                             LAST(0));
+    shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0),
+                                 SRC1_ABS(0),
+                                 UPDATE_EXECUTE_MASK(0),
+                                 UPDATE_PRED(0),
+                                 WRITE_MASK(0),
+                                 OMOD(SQ_ALU_OMOD_OFF),
+                                 ALU_INST(SQ_OP2_INST_DOT4),
+                                 BANK_SWIZZLE(SQ_ALU_VEC_012),
+                                 DST_GPR(3),
+                                 DST_REL(ABSOLUTE),
+                                 DST_ELEM(ELEM_Z),
+                                 CLAMP(0));
+
+    /* 21 srcY.w DOT4 - mask */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 1),
+                             SRC0_REL(ABSOLUTE),
+                             SRC0_ELEM(ELEM_W),
+                             SRC0_NEG(0),
+                             SRC1_SEL(ALU_SRC_KCACHE0_BASE + 1),
+                             SRC1_REL(ABSOLUTE),
+                             SRC1_ELEM(ELEM_W),
+                             SRC1_NEG(0),
+                             INDEX_MODE(SQ_INDEX_LOOP),
+                             PRED_SEL(SQ_PRED_SEL_OFF),
+                             LAST(1));
+    shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0),
+                                 SRC1_ABS(0),
+                                 UPDATE_EXECUTE_MASK(0),
+                                 UPDATE_PRED(0),
+                                 WRITE_MASK(0),
+                                 OMOD(SQ_ALU_OMOD_OFF),
+                                 ALU_INST(SQ_OP2_INST_DOT4),
+                                 BANK_SWIZZLE(SQ_ALU_VEC_012),
+                                 DST_GPR(3),
+                                 DST_REL(ABSOLUTE),
+                                 DST_ELEM(ELEM_W),
+                                 CLAMP(0));
+
+    /* 22 maskX.x DOT4 - mask */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_X),
                              SRC0_NEG(0),
-                             SRC1_SEL(ALU_SRC_GPR_BASE + 1),
+                             SRC1_SEL(ALU_SRC_KCACHE0_BASE + 2),
                              SRC1_REL(ABSOLUTE),
                              SRC1_ELEM(ELEM_X),
                              SRC1_NEG(0),
                              INDEX_MODE(SQ_INDEX_LOOP),
                              PRED_SEL(SQ_PRED_SEL_OFF),
-                             LAST(1));
-    shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(ALU_SRC_GPR_BASE + 1),
-                                 SRC2_REL(ABSOLUTE),
-                                 SRC2_ELEM(ELEM_W),
-                                 SRC2_NEG(0),
-                                 ALU_INST(SQ_OP3_INST_MULADD),
+                             LAST(0));
+    shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0),
+                                 SRC1_ABS(0),
+                                 UPDATE_EXECUTE_MASK(0),
+                                 UPDATE_PRED(0),
+                                 WRITE_MASK(1),
+                                 OMOD(SQ_ALU_OMOD_OFF),
+                                 ALU_INST(SQ_OP2_INST_DOT4),
                                  BANK_SWIZZLE(SQ_ALU_VEC_012),
-                                 DST_GPR(1),
+                                 DST_GPR(4),
                                  DST_REL(ABSOLUTE),
-                                 DST_ELEM(ELEM_Y),
+                                 DST_ELEM(ELEM_X),
                                  CLAMP(0));
 
-    /* 18 maskX MAD */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_KCACHE0_BASE + 2),
+    /* 23 maskX.y DOT4 - mask */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_Y),
                              SRC0_NEG(0),
-                             SRC1_SEL(ALU_SRC_GPR_BASE + 0),
+                             SRC1_SEL(ALU_SRC_KCACHE0_BASE + 2),
                              SRC1_REL(ABSOLUTE),
                              SRC1_ELEM(ELEM_Y),
                              SRC1_NEG(0),
                              INDEX_MODE(SQ_INDEX_LOOP),
                              PRED_SEL(SQ_PRED_SEL_OFF),
-                             LAST(1));
-    shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(ALU_SRC_KCACHE0_BASE + 2),
-                                 SRC2_REL(ABSOLUTE),
-                                 SRC2_ELEM(ELEM_Z),
-                                 SRC2_NEG(0),
-                                 ALU_INST(SQ_OP3_INST_MULADD),
+                             LAST(0));
+    shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0),
+                                 SRC1_ABS(0),
+                                 UPDATE_EXECUTE_MASK(0),
+                                 UPDATE_PRED(0),
+                                 WRITE_MASK(0),
+                                 OMOD(SQ_ALU_OMOD_OFF),
+                                 ALU_INST(SQ_OP2_INST_DOT4),
                                  BANK_SWIZZLE(SQ_ALU_VEC_012),
-                                 DST_GPR(0),
+                                 DST_GPR(4),
+                                 DST_REL(ABSOLUTE),
+                                 DST_ELEM(ELEM_Y),
+                                 CLAMP(0));
+
+    /* 24 maskX.z DOT4 - mask */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
+                             SRC0_REL(ABSOLUTE),
+                             SRC0_ELEM(ELEM_Z),
+                             SRC0_NEG(0),
+                             SRC1_SEL(ALU_SRC_KCACHE0_BASE + 2),
+                             SRC1_REL(ABSOLUTE),
+                             SRC1_ELEM(ELEM_Z),
+                             SRC1_NEG(0),
+                             INDEX_MODE(SQ_INDEX_LOOP),
+                             PRED_SEL(SQ_PRED_SEL_OFF),
+                             LAST(0));
+    shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0),
+                                 SRC1_ABS(0),
+                                 UPDATE_EXECUTE_MASK(0),
+                                 UPDATE_PRED(0),
+                                 WRITE_MASK(0),
+                                 OMOD(SQ_ALU_OMOD_OFF),
+                                 ALU_INST(SQ_OP2_INST_DOT4),
+                                 BANK_SWIZZLE(SQ_ALU_VEC_012),
+                                 DST_GPR(4),
                                  DST_REL(ABSOLUTE),
                                  DST_ELEM(ELEM_Z),
                                  CLAMP(0));
 
-    /* 19 maskY MAD */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_KCACHE0_BASE + 3),
+    /* 25 maskX.w DOT4 - mask */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
                              SRC0_REL(ABSOLUTE),
-                             SRC0_ELEM(ELEM_Y),
+                             SRC0_ELEM(ELEM_W),
                              SRC0_NEG(0),
-                             SRC1_SEL(ALU_SRC_GPR_BASE + 0),
+                             SRC1_SEL(ALU_SRC_KCACHE0_BASE + 2),
                              SRC1_REL(ABSOLUTE),
-                             SRC1_ELEM(ELEM_Y),
+                             SRC1_ELEM(ELEM_W),
                              SRC1_NEG(0),
                              INDEX_MODE(SQ_INDEX_LOOP),
                              PRED_SEL(SQ_PRED_SEL_OFF),
                              LAST(1));
-    shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(ALU_SRC_KCACHE0_BASE + 3),
-                                 SRC2_REL(ABSOLUTE),
-                                 SRC2_ELEM(ELEM_Z),
-                                 SRC2_NEG(0),
-                                 ALU_INST(SQ_OP3_INST_MULADD),
+    shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0),
+                                 SRC1_ABS(0),
+                                 UPDATE_EXECUTE_MASK(0),
+                                 UPDATE_PRED(0),
+                                 WRITE_MASK(0),
+                                 OMOD(SQ_ALU_OMOD_OFF),
+                                 ALU_INST(SQ_OP2_INST_DOT4),
                                  BANK_SWIZZLE(SQ_ALU_VEC_012),
-                                 DST_GPR(0),
+                                 DST_GPR(4),
                                  DST_REL(ABSOLUTE),
                                  DST_ELEM(ELEM_W),
                                  CLAMP(0));
 
-    /* 20 srcX MAD */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_KCACHE0_BASE + 2),
+    /* 26 maskY.x DOT4 - mask */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_X),
                              SRC0_NEG(0),
-                             SRC1_SEL(ALU_SRC_GPR_BASE + 0),
+                             SRC1_SEL(ALU_SRC_KCACHE0_BASE + 3),
                              SRC1_REL(ABSOLUTE),
                              SRC1_ELEM(ELEM_X),
                              SRC1_NEG(0),
                              INDEX_MODE(SQ_INDEX_LOOP),
                              PRED_SEL(SQ_PRED_SEL_OFF),
                              LAST(0));
-    shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(ALU_SRC_GPR_BASE + 0),
-                                 SRC2_REL(ABSOLUTE),
-                                 SRC2_ELEM(ELEM_Z),
-                                 SRC2_NEG(0),
-                                 ALU_INST(SQ_OP3_INST_MULADD),
+    shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0),
+                                 SRC1_ABS(0),
+                                 UPDATE_EXECUTE_MASK(0),
+                                 UPDATE_PRED(0),
+                                 WRITE_MASK(0),
+                                 OMOD(SQ_ALU_OMOD_OFF),
+                                 ALU_INST(SQ_OP2_INST_DOT4),
                                  BANK_SWIZZLE(SQ_ALU_VEC_012),
-                                 DST_GPR(0),
+                                 DST_GPR(4),
                                  DST_REL(ABSOLUTE),
                                  DST_ELEM(ELEM_X),
                                  CLAMP(0));
-    /* 21 srcY MAD */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_KCACHE0_BASE + 3),
+
+    /* 27 maskY.y DOT4 - mask */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
                              SRC0_REL(ABSOLUTE),
-                             SRC0_ELEM(ELEM_X),
+                             SRC0_ELEM(ELEM_Y),
                              SRC0_NEG(0),
-                             SRC1_SEL(ALU_SRC_GPR_BASE + 0),
+                             SRC1_SEL(ALU_SRC_KCACHE0_BASE + 3),
                              SRC1_REL(ABSOLUTE),
-                             SRC1_ELEM(ELEM_X),
+                             SRC1_ELEM(ELEM_Y),
                              SRC1_NEG(0),
                              INDEX_MODE(SQ_INDEX_LOOP),
                              PRED_SEL(SQ_PRED_SEL_OFF),
-                             LAST(1));
-    shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(ALU_SRC_GPR_BASE + 0),
-                                 SRC2_REL(ABSOLUTE),
-                                 SRC2_ELEM(ELEM_W),
-                                 SRC2_NEG(0),
-                                 ALU_INST(SQ_OP3_INST_MULADD),
+                             LAST(0));
+    shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0),
+                                 SRC1_ABS(0),
+                                 UPDATE_EXECUTE_MASK(0),
+                                 UPDATE_PRED(0),
+                                 WRITE_MASK(1),
+                                 OMOD(SQ_ALU_OMOD_OFF),
+                                 ALU_INST(SQ_OP2_INST_DOT4),
                                  BANK_SWIZZLE(SQ_ALU_VEC_012),
-                                 DST_GPR(0),
+                                 DST_GPR(4),
                                  DST_REL(ABSOLUTE),
                                  DST_ELEM(ELEM_Y),
                                  CLAMP(0));
 
-    /* 22 srcX / w */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 1),
+    /* 28 maskY.z DOT4 - mask */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
+                             SRC0_REL(ABSOLUTE),
+                             SRC0_ELEM(ELEM_Z),
+                             SRC0_NEG(0),
+                             SRC1_SEL(ALU_SRC_KCACHE0_BASE + 3),
+                             SRC1_REL(ABSOLUTE),
+                             SRC1_ELEM(ELEM_Z),
+                             SRC1_NEG(0),
+                             INDEX_MODE(SQ_INDEX_LOOP),
+                             PRED_SEL(SQ_PRED_SEL_OFF),
+                             LAST(0));
+    shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0),
+                                 SRC1_ABS(0),
+                                 UPDATE_EXECUTE_MASK(0),
+                                 UPDATE_PRED(0),
+                                 WRITE_MASK(0),
+                                 OMOD(SQ_ALU_OMOD_OFF),
+                                 ALU_INST(SQ_OP2_INST_DOT4),
+                                 BANK_SWIZZLE(SQ_ALU_VEC_012),
+                                 DST_GPR(4),
+                                 DST_REL(ABSOLUTE),
+                                 DST_ELEM(ELEM_Z),
+                                 CLAMP(0));
+
+    /* 29 maskY.w DOT4 - mask */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
+                             SRC0_REL(ABSOLUTE),
+                             SRC0_ELEM(ELEM_W),
+                             SRC0_NEG(0),
+                             SRC1_SEL(ALU_SRC_KCACHE0_BASE + 3),
+                             SRC1_REL(ABSOLUTE),
+                             SRC1_ELEM(ELEM_W),
+                             SRC1_NEG(0),
+                             INDEX_MODE(SQ_INDEX_LOOP),
+                             PRED_SEL(SQ_PRED_SEL_OFF),
+                             LAST(1));
+    shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0),
+                                 SRC1_ABS(0),
+                                 UPDATE_EXECUTE_MASK(0),
+                                 UPDATE_PRED(0),
+                                 WRITE_MASK(0),
+                                 OMOD(SQ_ALU_OMOD_OFF),
+                                 ALU_INST(SQ_OP2_INST_DOT4),
+                                 BANK_SWIZZLE(SQ_ALU_VEC_012),
+                                 DST_GPR(4),
+                                 DST_REL(ABSOLUTE),
+                                 DST_ELEM(ELEM_W),
+                                 CLAMP(0));
+
+    /* 30 srcX / w */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 3),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_X),
                              SRC0_NEG(0),
@@ -1779,8 +1998,8 @@ int evergreen_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
                                  DST_ELEM(ELEM_X),
                                  CLAMP(0));
 
-    /* 23 srcY / h */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 1),
+    /* 31 srcY / h */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 3),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_Y),
                              SRC0_NEG(0),
@@ -1804,8 +2023,8 @@ int evergreen_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
                                  DST_ELEM(ELEM_Y),
                                  CLAMP(0));
 
-    /* 24 maskX / w */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
+    /* 32 maskX / w */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 4),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_X),
                              SRC0_NEG(0),
@@ -1829,8 +2048,8 @@ int evergreen_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
                                  DST_ELEM(ELEM_X),
                                  CLAMP(0));
 
-    /* 25 maskY / h */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
+    /* 33 maskY / h */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 4),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_Y),
                              SRC0_NEG(0),
@@ -1854,98 +2073,209 @@ int evergreen_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
                                  DST_ELEM(ELEM_Y),
                                  CLAMP(0));
 
-    /* no mask alu - 26 srcX MAD */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_KCACHE0_BASE + 0),
+    /* 34 srcX.x DOT4 - non-mask */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
+                             SRC0_REL(ABSOLUTE),
+                             SRC0_ELEM(ELEM_X),
+                             SRC0_NEG(0),
+                             SRC1_SEL(ALU_SRC_KCACHE0_BASE + 0),
+                             SRC1_REL(ABSOLUTE),
+                             SRC1_ELEM(ELEM_X),
+                             SRC1_NEG(0),
+                             INDEX_MODE(SQ_INDEX_LOOP),
+                             PRED_SEL(SQ_PRED_SEL_OFF),
+                             LAST(0));
+    shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0),
+                                 SRC1_ABS(0),
+                                 UPDATE_EXECUTE_MASK(0),
+                                 UPDATE_PRED(0),
+                                 WRITE_MASK(1),
+                                 OMOD(SQ_ALU_OMOD_OFF),
+                                 ALU_INST(SQ_OP2_INST_DOT4),
+                                 BANK_SWIZZLE(SQ_ALU_VEC_012),
+                                 DST_GPR(2),
+                                 DST_REL(ABSOLUTE),
+                                 DST_ELEM(ELEM_X),
+                                 CLAMP(0));
+
+    /* 35 srcX.y DOT4 - non-mask */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_Y),
                              SRC0_NEG(0),
-                             SRC1_SEL(ALU_SRC_GPR_BASE + 0),
+                             SRC1_SEL(ALU_SRC_KCACHE0_BASE + 0),
                              SRC1_REL(ABSOLUTE),
                              SRC1_ELEM(ELEM_Y),
                              SRC1_NEG(0),
                              INDEX_MODE(SQ_INDEX_LOOP),
                              PRED_SEL(SQ_PRED_SEL_OFF),
-                             LAST(1));
-    shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(ALU_SRC_KCACHE0_BASE + 0),
-                                 SRC2_REL(ABSOLUTE),
-                                 SRC2_ELEM(ELEM_Z),
-                                 SRC2_NEG(0),
-                                 ALU_INST(SQ_OP3_INST_MULADD),
+                             LAST(0));
+    shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0),
+                                 SRC1_ABS(0),
+                                 UPDATE_EXECUTE_MASK(0),
+                                 UPDATE_PRED(0),
+                                 WRITE_MASK(0),
+                                 OMOD(SQ_ALU_OMOD_OFF),
+                                 ALU_INST(SQ_OP2_INST_DOT4),
                                  BANK_SWIZZLE(SQ_ALU_VEC_012),
-                                 DST_GPR(0),
+                                 DST_GPR(2),
+                                 DST_REL(ABSOLUTE),
+                                 DST_ELEM(ELEM_Y),
+                                 CLAMP(0));
+
+    /* 36 srcX.z DOT4 - non-mask */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
+                             SRC0_REL(ABSOLUTE),
+                             SRC0_ELEM(ELEM_Z),
+                             SRC0_NEG(0),
+                             SRC1_SEL(ALU_SRC_KCACHE0_BASE + 0),
+                             SRC1_REL(ABSOLUTE),
+                             SRC1_ELEM(ELEM_Z),
+                             SRC1_NEG(0),
+                             INDEX_MODE(SQ_INDEX_LOOP),
+                             PRED_SEL(SQ_PRED_SEL_OFF),
+                             LAST(0));
+    shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0),
+                                 SRC1_ABS(0),
+                                 UPDATE_EXECUTE_MASK(0),
+                                 UPDATE_PRED(0),
+                                 WRITE_MASK(0),
+                                 OMOD(SQ_ALU_OMOD_OFF),
+                                 ALU_INST(SQ_OP2_INST_DOT4),
+                                 BANK_SWIZZLE(SQ_ALU_VEC_012),
+                                 DST_GPR(2),
                                  DST_REL(ABSOLUTE),
                                  DST_ELEM(ELEM_Z),
                                  CLAMP(0));
-    /* 27 srcY MAD */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_KCACHE0_BASE + 1),
+
+    /* 37 srcX.w DOT4 - non-mask */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
                              SRC0_REL(ABSOLUTE),
-                             SRC0_ELEM(ELEM_Y),
+                             SRC0_ELEM(ELEM_W),
                              SRC0_NEG(0),
-                             SRC1_SEL(ALU_SRC_GPR_BASE + 0),
+                             SRC1_SEL(ALU_SRC_KCACHE0_BASE + 0),
                              SRC1_REL(ABSOLUTE),
-                             SRC1_ELEM(ELEM_Y),
+                             SRC1_ELEM(ELEM_W),
                              SRC1_NEG(0),
                              INDEX_MODE(SQ_INDEX_LOOP),
                              PRED_SEL(SQ_PRED_SEL_OFF),
                              LAST(1));
-    shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(ALU_SRC_KCACHE0_BASE + 1),
-                                 SRC2_REL(ABSOLUTE),
-                                 SRC2_ELEM(ELEM_Z),
-                                 SRC2_NEG(0),
-                                 ALU_INST(SQ_OP3_INST_MULADD),
+    shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0),
+                                 SRC1_ABS(0),
+                                 UPDATE_EXECUTE_MASK(0),
+                                 UPDATE_PRED(0),
+                                 WRITE_MASK(0),
+                                 OMOD(SQ_ALU_OMOD_OFF),
+                                 ALU_INST(SQ_OP2_INST_DOT4),
                                  BANK_SWIZZLE(SQ_ALU_VEC_012),
-                                 DST_GPR(0),
+                                 DST_GPR(2),
                                  DST_REL(ABSOLUTE),
                                  DST_ELEM(ELEM_W),
                                  CLAMP(0));
 
-    /* 28 srcX MAD */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_KCACHE0_BASE + 0),
+    /* 38 srcY.x DOT4 - non-mask */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_X),
                              SRC0_NEG(0),
-                             SRC1_SEL(ALU_SRC_GPR_BASE + 0),
+                             SRC1_SEL(ALU_SRC_KCACHE0_BASE + 1),
                              SRC1_REL(ABSOLUTE),
                              SRC1_ELEM(ELEM_X),
                              SRC1_NEG(0),
                              INDEX_MODE(SQ_INDEX_LOOP),
                              PRED_SEL(SQ_PRED_SEL_OFF),
                              LAST(0));
-    shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(ALU_SRC_GPR_BASE + 0),
-                                 SRC2_REL(ABSOLUTE),
-                                 SRC2_ELEM(ELEM_Z),
-                                 SRC2_NEG(0),
-                                 ALU_INST(SQ_OP3_INST_MULADD),
+    shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0),
+                                 SRC1_ABS(0),
+                                 UPDATE_EXECUTE_MASK(0),
+                                 UPDATE_PRED(0),
+                                 WRITE_MASK(0),
+                                 OMOD(SQ_ALU_OMOD_OFF),
+                                 ALU_INST(SQ_OP2_INST_DOT4),
                                  BANK_SWIZZLE(SQ_ALU_VEC_012),
-                                 DST_GPR(0),
+                                 DST_GPR(2),
                                  DST_REL(ABSOLUTE),
                                  DST_ELEM(ELEM_X),
                                  CLAMP(0));
-    /* 29 srcY MAD */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_KCACHE0_BASE + 1),
+
+    /* 39 srcY.y DOT4 - non-mask */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
                              SRC0_REL(ABSOLUTE),
-                             SRC0_ELEM(ELEM_X),
+                             SRC0_ELEM(ELEM_Y),
                              SRC0_NEG(0),
-                             SRC1_SEL(ALU_SRC_GPR_BASE + 0),
+                             SRC1_SEL(ALU_SRC_KCACHE0_BASE + 1),
                              SRC1_REL(ABSOLUTE),
-                             SRC1_ELEM(ELEM_X),
+                             SRC1_ELEM(ELEM_Y),
                              SRC1_NEG(0),
                              INDEX_MODE(SQ_INDEX_LOOP),
                              PRED_SEL(SQ_PRED_SEL_OFF),
-                             LAST(1));
-    shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(ALU_SRC_GPR_BASE + 0),
-                                 SRC2_REL(ABSOLUTE),
-                                 SRC2_ELEM(ELEM_W),
-                                 SRC2_NEG(0),
-                                 ALU_INST(SQ_OP3_INST_MULADD),
+                             LAST(0));
+    shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0),
+                                 SRC1_ABS(0),
+                                 UPDATE_EXECUTE_MASK(0),
+                                 UPDATE_PRED(0),
+                                 WRITE_MASK(1),
+                                 OMOD(SQ_ALU_OMOD_OFF),
+                                 ALU_INST(SQ_OP2_INST_DOT4),
                                  BANK_SWIZZLE(SQ_ALU_VEC_012),
-                                 DST_GPR(0),
+                                 DST_GPR(2),
                                  DST_REL(ABSOLUTE),
                                  DST_ELEM(ELEM_Y),
                                  CLAMP(0));
-    /* 30 srcX / w */
+
+    /* 40 srcY.z DOT4 - non-mask */
     shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
                              SRC0_REL(ABSOLUTE),
+                             SRC0_ELEM(ELEM_Z),
+                             SRC0_NEG(0),
+                             SRC1_SEL(ALU_SRC_KCACHE0_BASE + 1),
+                             SRC1_REL(ABSOLUTE),
+                             SRC1_ELEM(ELEM_Z),
+                             SRC1_NEG(0),
+                             INDEX_MODE(SQ_INDEX_LOOP),
+                             PRED_SEL(SQ_PRED_SEL_OFF),
+                             LAST(0));
+    shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0),
+                                 SRC1_ABS(0),
+                                 UPDATE_EXECUTE_MASK(0),
+                                 UPDATE_PRED(0),
+                                 WRITE_MASK(0),
+                                 OMOD(SQ_ALU_OMOD_OFF),
+                                 ALU_INST(SQ_OP2_INST_DOT4),
+                                 BANK_SWIZZLE(SQ_ALU_VEC_012),
+                                 DST_GPR(2),
+                                 DST_REL(ABSOLUTE),
+                                 DST_ELEM(ELEM_Z),
+                                 CLAMP(0));
+
+    /* 41 srcY.w DOT4 - non-mask */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
+                             SRC0_REL(ABSOLUTE),
+                             SRC0_ELEM(ELEM_W),
+                             SRC0_NEG(0),
+                             SRC1_SEL(ALU_SRC_KCACHE0_BASE + 1),
+                             SRC1_REL(ABSOLUTE),
+                             SRC1_ELEM(ELEM_W),
+                             SRC1_NEG(0),
+                             INDEX_MODE(SQ_INDEX_LOOP),
+                             PRED_SEL(SQ_PRED_SEL_OFF),
+                             LAST(1));
+    shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0),
+                                 SRC1_ABS(0),
+                                 UPDATE_EXECUTE_MASK(0),
+                                 UPDATE_PRED(0),
+                                 WRITE_MASK(0),
+                                 OMOD(SQ_ALU_OMOD_OFF),
+                                 ALU_INST(SQ_OP2_INST_DOT4),
+                                 BANK_SWIZZLE(SQ_ALU_VEC_012),
+                                 DST_GPR(2),
+                                 DST_REL(ABSOLUTE),
+                                 DST_ELEM(ELEM_W),
+                                 CLAMP(0));
+
+    /* 42 srcX / w */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 2),
+                             SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_X),
                              SRC0_NEG(0),
                              SRC1_SEL(ALU_SRC_KCACHE0_BASE + 0),
@@ -1968,8 +2298,8 @@ int evergreen_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
                                  DST_ELEM(ELEM_X),
                                  CLAMP(0));
 
-    /* 31 srcY / h */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
+    /* 43 srcY / h */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 2),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_Y),
                              SRC0_NEG(0),
@@ -1993,7 +2323,7 @@ int evergreen_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
                                  DST_ELEM(ELEM_Y),
                                  CLAMP(0));
 
-    /* mask vfetch - 32/33 - dst */
+    /* mask vfetch - 44/45 - dst */
     shader[i++] = VTX_DWORD0(VTX_INST(SQ_VTX_INST_FETCH),
 			     FETCH_TYPE(SQ_VTX_FETCH_VERTEX_DATA),
 			     FETCH_WHOLE_QUAD(0),
@@ -2020,7 +2350,7 @@ int evergreen_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
 			     ALT_CONST(0),
 			     BUFFER_INDEX_MODE(SQ_CF_INDEX_NONE));
     shader[i++] = VTX_DWORD_PAD;
-    /* 34/35 - src */
+    /* 46/47 - src */
     shader[i++] = VTX_DWORD0(VTX_INST(SQ_VTX_INST_FETCH),
 			     FETCH_TYPE(SQ_VTX_FETCH_VERTEX_DATA),
 			     FETCH_WHOLE_QUAD(0),
@@ -2047,7 +2377,7 @@ int evergreen_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
 			     ALT_CONST(0),
 			     BUFFER_INDEX_MODE(SQ_CF_INDEX_NONE));
     shader[i++] = VTX_DWORD_PAD;
-    /* 36/37 - mask */
+    /* 48/49 - mask */
     shader[i++] = VTX_DWORD0(VTX_INST(SQ_VTX_INST_FETCH),
 			     FETCH_TYPE(SQ_VTX_FETCH_VERTEX_DATA),
 			     FETCH_WHOLE_QUAD(0),
@@ -2075,7 +2405,7 @@ int evergreen_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
 			     BUFFER_INDEX_MODE(SQ_CF_INDEX_NONE));
     shader[i++] = VTX_DWORD_PAD;
 
-    /* no mask vfetch - 38/39 - dst */
+    /* no mask vfetch - 50/51 - dst */
     shader[i++] = VTX_DWORD0(VTX_INST(SQ_VTX_INST_FETCH),
 			     FETCH_TYPE(SQ_VTX_FETCH_VERTEX_DATA),
 			     FETCH_WHOLE_QUAD(0),
@@ -2102,7 +2432,7 @@ int evergreen_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
 			     ALT_CONST(0),
 			     BUFFER_INDEX_MODE(SQ_CF_INDEX_NONE));
     shader[i++] = VTX_DWORD_PAD;
-    /* 40/41 - src */
+    /* 52/53 - src */
     shader[i++] = VTX_DWORD0(VTX_INST(SQ_VTX_INST_FETCH),
 			     FETCH_TYPE(SQ_VTX_FETCH_VERTEX_DATA),
 			     FETCH_WHOLE_QUAD(0),
commit 3cae361d0448b6e231c80f53d64bdbbdd74dc4cf
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Mon Nov 29 17:44:47 2010 -0500

    6xx/7xx: clean up gpr/const handling in shaders

diff --git a/src/r600_shader.c b/src/r600_shader.c
index b42690c..7dceffe 100644
--- a/src/r600_shader.c
+++ b/src/r600_shader.c
@@ -157,11 +157,11 @@ int R600_solid_ps(RADEONChipFamily ChipSet, uint32_t* shader)
 					       BARRIER(1));
 
     /* 2 */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(256),
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_CFILE_BASE + 0),
 			     SRC0_REL(ABSOLUTE),
 			     SRC0_ELEM(ELEM_X),
 			     SRC0_NEG(0),
-			     SRC1_SEL(0),
+			     SRC1_SEL(ALU_SRC_GPR_BASE + 0),
 			     SRC1_REL(ABSOLUTE),
 			     SRC1_ELEM(ELEM_X),
 			     SRC1_NEG(0),
@@ -183,11 +183,11 @@ int R600_solid_ps(RADEONChipFamily ChipSet, uint32_t* shader)
 				 DST_ELEM(ELEM_X),
 				 CLAMP(1));
     /* 3 */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(256),
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_CFILE_BASE + 0),
 			     SRC0_REL(ABSOLUTE),
 			     SRC0_ELEM(ELEM_Y),
 			     SRC0_NEG(0),
-			     SRC1_SEL(0),
+			     SRC1_SEL(ALU_SRC_GPR_BASE + 0),
 			     SRC1_REL(ABSOLUTE),
 			     SRC1_ELEM(ELEM_Y),
 			     SRC1_NEG(0),
@@ -209,11 +209,11 @@ int R600_solid_ps(RADEONChipFamily ChipSet, uint32_t* shader)
 				 DST_ELEM(ELEM_Y),
 				 CLAMP(1));
     /* 4 */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(256),
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_CFILE_BASE + 0),
 			     SRC0_REL(ABSOLUTE),
 			     SRC0_ELEM(ELEM_Z),
 			     SRC0_NEG(0),
-			     SRC1_SEL(0),
+			     SRC1_SEL(ALU_SRC_GPR_BASE + 0),
 			     SRC1_REL(ABSOLUTE),
 			     SRC1_ELEM(ELEM_Z),
 			     SRC1_NEG(0),
@@ -235,11 +235,11 @@ int R600_solid_ps(RADEONChipFamily ChipSet, uint32_t* shader)
 				 DST_ELEM(ELEM_Z),
 				 CLAMP(1));
     /* 5 */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(256),
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_CFILE_BASE + 0),
 			     SRC0_REL(ABSOLUTE),
 			     SRC0_ELEM(ELEM_W),
 			     SRC0_NEG(0),
-			     SRC1_SEL(0),
+			     SRC1_SEL(ALU_SRC_GPR_BASE + 0),
 			     SRC1_REL(ABSOLUTE),
 			     SRC1_ELEM(ELEM_W),
 			     SRC1_NEG(0),
@@ -522,11 +522,11 @@ int R600_xv_vs(RADEONChipFamily ChipSet, uint32_t* shader)
 
 
     /* 4 texX / w */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(0),
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_X),
                              SRC0_NEG(0),
-                             SRC1_SEL(256),
+                             SRC1_SEL(ALU_SRC_CFILE_BASE + 0),
                              SRC1_REL(ABSOLUTE),
                              SRC1_ELEM(ELEM_X),
                              SRC1_NEG(0),
@@ -549,11 +549,11 @@ int R600_xv_vs(RADEONChipFamily ChipSet, uint32_t* shader)
                                  CLAMP(0));
 
     /* 5 texY / h */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(0),
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_Y),
                              SRC0_NEG(0),
-                             SRC1_SEL(256),
+                             SRC1_SEL(ALU_SRC_CFILE_BASE + 0),
                              SRC1_REL(ABSOLUTE),
                              SRC1_ELEM(ELEM_Y),
                              SRC1_NEG(0),
@@ -690,18 +690,18 @@ int R600_xv_ps(RADEONChipFamily ChipSet, uint32_t* shader)
                                                BARRIER(1));
     /* 4,5,6,7 */
     /* r2.x = MAD(c0.w, r1.x, c0.x) */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(256),
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_CFILE_BASE + 0),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_W),
                              SRC0_NEG(0),
-                             SRC1_SEL(1),
+                             SRC1_SEL(ALU_SRC_GPR_BASE + 1),
                              SRC1_REL(ABSOLUTE),
                              SRC1_ELEM(ELEM_X),
                              SRC1_NEG(0),
                              INDEX_MODE(SQ_INDEX_LOOP),
                              PRED_SEL(SQ_PRED_SEL_OFF),
                              LAST(0));
-    shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(256),
+    shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(ALU_SRC_CFILE_BASE + 0),
                                  SRC2_REL(ABSOLUTE),
                                  SRC2_ELEM(ELEM_X),
                                  SRC2_NEG(0),
@@ -712,18 +712,18 @@ int R600_xv_ps(RADEONChipFamily ChipSet, uint32_t* shader)
                                  DST_ELEM(ELEM_X),
                                  CLAMP(0));
     /* r2.y = MAD(c0.w, r1.x, c0.y) */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(256),
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_CFILE_BASE + 0),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_W),
                              SRC0_NEG(0),
-                             SRC1_SEL(1),
+                             SRC1_SEL(ALU_SRC_GPR_BASE + 1),
                              SRC1_REL(ABSOLUTE),
                              SRC1_ELEM(ELEM_X),
                              SRC1_NEG(0),
                              INDEX_MODE(SQ_INDEX_LOOP),
                              PRED_SEL(SQ_PRED_SEL_OFF),
                              LAST(0));
-    shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(256),
+    shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(ALU_SRC_CFILE_BASE + 0),
                                  SRC2_REL(ABSOLUTE),
                                  SRC2_ELEM(ELEM_Y),
                                  SRC2_NEG(0),
@@ -734,18 +734,18 @@ int R600_xv_ps(RADEONChipFamily ChipSet, uint32_t* shader)
                                  DST_ELEM(ELEM_Y),
                                  CLAMP(0));
     /* r2.z = MAD(c0.w, r1.x, c0.z) */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(256),
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_CFILE_BASE + 0),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_W),
                              SRC0_NEG(0),
-                             SRC1_SEL(1),
+                             SRC1_SEL(ALU_SRC_GPR_BASE + 1),
                              SRC1_REL(ABSOLUTE),
                              SRC1_ELEM(ELEM_X),
                              SRC1_NEG(0),
                              INDEX_MODE(SQ_INDEX_LOOP),
                              PRED_SEL(SQ_PRED_SEL_OFF),
                              LAST(0));
-    shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(256),
+    shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(ALU_SRC_CFILE_BASE + 0),
                                  SRC2_REL(ABSOLUTE),
                                  SRC2_ELEM(ELEM_Z),
                                  SRC2_NEG(0),
@@ -780,11 +780,11 @@ int R600_xv_ps(RADEONChipFamily ChipSet, uint32_t* shader)
 
     /* 8,9,10,11 */
     /* r2.x = MAD(c1.x, r1.y, pv.x) */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(257),
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_CFILE_BASE + 1),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_X),
                              SRC0_NEG(0),
-                             SRC1_SEL(1),
+                             SRC1_SEL(ALU_SRC_GPR_BASE + 1),
                              SRC1_REL(ABSOLUTE),
                              SRC1_ELEM(ELEM_Y),
                              SRC1_NEG(0),
@@ -802,11 +802,11 @@ int R600_xv_ps(RADEONChipFamily ChipSet, uint32_t* shader)
                                  DST_ELEM(ELEM_X),
                                  CLAMP(0));
     /* r2.y = MAD(c1.y, r1.y, pv.y) */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(257),
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_CFILE_BASE + 1),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_Y),
                              SRC0_NEG(0),
-                             SRC1_SEL(1),
+                             SRC1_SEL(ALU_SRC_GPR_BASE + 1),
                              SRC1_REL(ABSOLUTE),
                              SRC1_ELEM(ELEM_Y),
                              SRC1_NEG(0),
@@ -824,11 +824,11 @@ int R600_xv_ps(RADEONChipFamily ChipSet, uint32_t* shader)
                                  DST_ELEM(ELEM_Y),
                                  CLAMP(0));
     /* r2.z = MAD(c1.z, r1.y, pv.z) */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(257),
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_CFILE_BASE + 1),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_Z),
                              SRC0_NEG(0),
-                             SRC1_SEL(1),
+                             SRC1_SEL(ALU_SRC_GPR_BASE + 1),
                              SRC1_REL(ABSOLUTE),
                              SRC1_ELEM(ELEM_Y),
                              SRC1_NEG(0),
@@ -869,11 +869,11 @@ int R600_xv_ps(RADEONChipFamily ChipSet, uint32_t* shader)
                                  CLAMP(0));
     /* 12,13,14,15 */
     /* r2.x = MAD(c2.x, r1.z, pv.x) */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(258),
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_CFILE_BASE + 2),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_X),
                              SRC0_NEG(0),
-                             SRC1_SEL(1),
+                             SRC1_SEL(ALU_SRC_GPR_BASE + 1),
                              SRC1_REL(ABSOLUTE),
                              SRC1_ELEM(ELEM_Z),
                              SRC1_NEG(0),
@@ -891,11 +891,11 @@ int R600_xv_ps(RADEONChipFamily ChipSet, uint32_t* shader)
                                  DST_ELEM(ELEM_X),
                                  CLAMP(1));
     /* r2.y = MAD(c2.y, r1.z, pv.y) */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(258),
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_CFILE_BASE + 2),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_Y),
                              SRC0_NEG(0),
-                             SRC1_SEL(1),
+                             SRC1_SEL(ALU_SRC_GPR_BASE + 1),
                              SRC1_REL(ABSOLUTE),
                              SRC1_ELEM(ELEM_Z),
                              SRC1_NEG(0),
@@ -913,11 +913,11 @@ int R600_xv_ps(RADEONChipFamily ChipSet, uint32_t* shader)
                                  DST_ELEM(ELEM_Y),
                                  CLAMP(1));
     /* r2.z = MAD(c2.z, r1.z, pv.z) */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(258),
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_CFILE_BASE + 2),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_Z),
                              SRC0_NEG(0),
-                             SRC1_SEL(1),
+                             SRC1_SEL(ALU_SRC_GPR_BASE + 1),
                              SRC1_REL(ABSOLUTE),
                              SRC1_ELEM(ELEM_Z),
                              SRC1_NEG(0),
@@ -1361,11 +1361,11 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
 
 
     /* 14 srcX.x DOT4 - mask */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(1),
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 1),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_X),
                              SRC0_NEG(0),
-                             SRC1_SEL(256),
+                             SRC1_SEL(ALU_SRC_CFILE_BASE + 0),
                              SRC1_REL(ABSOLUTE),
                              SRC1_ELEM(ELEM_X),
                              SRC1_NEG(0),
@@ -1388,11 +1388,11 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
                                  CLAMP(0));
 
     /* 15 srcX.y DOT4 - mask */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(1),
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 1),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_Y),
                              SRC0_NEG(0),
-                             SRC1_SEL(256),
+                             SRC1_SEL(ALU_SRC_CFILE_BASE + 0),
                              SRC1_REL(ABSOLUTE),
                              SRC1_ELEM(ELEM_Y),
                              SRC1_NEG(0),
@@ -1415,11 +1415,11 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
                                  CLAMP(0));
 
     /* 16 srcX.z DOT4 - mask */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(1),
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 1),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_Z),
                              SRC0_NEG(0),
-                             SRC1_SEL(256),
+                             SRC1_SEL(ALU_SRC_CFILE_BASE + 0),
                              SRC1_REL(ABSOLUTE),
                              SRC1_ELEM(ELEM_Z),
                              SRC1_NEG(0),
@@ -1442,11 +1442,11 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
                                  CLAMP(0));
 
     /* 17 srcX.w DOT4 - mask */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(1),
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 1),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_W),
                              SRC0_NEG(0),
-                             SRC1_SEL(256),
+                             SRC1_SEL(ALU_SRC_CFILE_BASE + 0),
                              SRC1_REL(ABSOLUTE),
                              SRC1_ELEM(ELEM_W),
                              SRC1_NEG(0),
@@ -1469,11 +1469,11 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
                                  CLAMP(0));
 
     /* 18 srcY.x DOT4 - mask */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(1),
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 1),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_X),
                              SRC0_NEG(0),
-                             SRC1_SEL(257),
+                             SRC1_SEL(ALU_SRC_CFILE_BASE + 1),
                              SRC1_REL(ABSOLUTE),
                              SRC1_ELEM(ELEM_X),
                              SRC1_NEG(0),
@@ -1496,11 +1496,11 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
                                  CLAMP(0));
 
     /* 19 srcY.y DOT4 - mask */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(1),
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 1),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_Y),
                              SRC0_NEG(0),
-                             SRC1_SEL(257),
+                             SRC1_SEL(ALU_SRC_CFILE_BASE + 1),
                              SRC1_REL(ABSOLUTE),
                              SRC1_ELEM(ELEM_Y),
                              SRC1_NEG(0),
@@ -1523,11 +1523,11 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
                                  CLAMP(0));
 
     /* 20 srcY.z DOT4 - mask */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(1),
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 1),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_Z),
                              SRC0_NEG(0),
-                             SRC1_SEL(257),
+                             SRC1_SEL(ALU_SRC_CFILE_BASE + 1),
                              SRC1_REL(ABSOLUTE),
                              SRC1_ELEM(ELEM_Z),
                              SRC1_NEG(0),
@@ -1550,11 +1550,11 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
                                  CLAMP(0));
 
     /* 21 srcY.w DOT4 - mask */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(1),
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 1),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_W),
                              SRC0_NEG(0),
-                             SRC1_SEL(257),
+                             SRC1_SEL(ALU_SRC_CFILE_BASE + 1),
                              SRC1_REL(ABSOLUTE),
                              SRC1_ELEM(ELEM_W),
                              SRC1_NEG(0),
@@ -1577,11 +1577,11 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
                                  CLAMP(0));
 
     /* 22 maskX.x DOT4 - mask */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(0),
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_X),
                              SRC0_NEG(0),
-                             SRC1_SEL(258),
+                             SRC1_SEL(ALU_SRC_CFILE_BASE + 2),
                              SRC1_REL(ABSOLUTE),
                              SRC1_ELEM(ELEM_X),
                              SRC1_NEG(0),
@@ -1604,11 +1604,11 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
                                  CLAMP(0));
 
     /* 23 maskX.y DOT4 - mask */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(0),
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_Y),
                              SRC0_NEG(0),
-                             SRC1_SEL(258),
+                             SRC1_SEL(ALU_SRC_CFILE_BASE + 2),
                              SRC1_REL(ABSOLUTE),
                              SRC1_ELEM(ELEM_Y),
                              SRC1_NEG(0),
@@ -1631,11 +1631,11 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
                                  CLAMP(0));
 
     /* 24 maskX.z DOT4 - mask */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(0),
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_Z),
                              SRC0_NEG(0),
-                             SRC1_SEL(258),
+                             SRC1_SEL(ALU_SRC_CFILE_BASE + 2),
                              SRC1_REL(ABSOLUTE),
                              SRC1_ELEM(ELEM_Z),
                              SRC1_NEG(0),
@@ -1658,11 +1658,11 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
                                  CLAMP(0));
 
     /* 25 maskX.w DOT4 - mask */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(0),
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_W),
                              SRC0_NEG(0),
-                             SRC1_SEL(258),
+                             SRC1_SEL(ALU_SRC_CFILE_BASE + 2),
                              SRC1_REL(ABSOLUTE),
                              SRC1_ELEM(ELEM_W),
                              SRC1_NEG(0),
@@ -1685,11 +1685,11 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
                                  CLAMP(0));
 
     /* 26 maskY.x DOT4 - mask */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(0),
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_X),
                              SRC0_NEG(0),
-                             SRC1_SEL(259),
+                             SRC1_SEL(ALU_SRC_CFILE_BASE + 3),
                              SRC1_REL(ABSOLUTE),
                              SRC1_ELEM(ELEM_X),
                              SRC1_NEG(0),
@@ -1712,11 +1712,11 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
                                  CLAMP(0));
 
     /* 27 maskY.y DOT4 - mask */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(0),
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_Y),
                              SRC0_NEG(0),
-                             SRC1_SEL(259),
+                             SRC1_SEL(ALU_SRC_CFILE_BASE + 3),
                              SRC1_REL(ABSOLUTE),
                              SRC1_ELEM(ELEM_Y),
                              SRC1_NEG(0),
@@ -1739,11 +1739,11 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
                                  CLAMP(0));
 
     /* 28 maskY.z DOT4 - mask */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(0),
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_Z),
                              SRC0_NEG(0),
-                             SRC1_SEL(259),
+                             SRC1_SEL(ALU_SRC_CFILE_BASE + 3),
                              SRC1_REL(ABSOLUTE),
                              SRC1_ELEM(ELEM_Z),
                              SRC1_NEG(0),
@@ -1766,11 +1766,11 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
                                  CLAMP(0));
 
     /* 29 maskY.w DOT4 - mask */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(0),
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_W),
                              SRC0_NEG(0),
-                             SRC1_SEL(259),
+                             SRC1_SEL(ALU_SRC_CFILE_BASE + 3),
                              SRC1_REL(ABSOLUTE),
                              SRC1_ELEM(ELEM_W),
                              SRC1_NEG(0),
@@ -1793,11 +1793,11 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
                                  CLAMP(0));
 
     /* 30 srcX / w */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(3),
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 3),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_X),
                              SRC0_NEG(0),
-                             SRC1_SEL(256),
+                             SRC1_SEL(ALU_SRC_CFILE_BASE + 0),
                              SRC1_REL(ABSOLUTE),
                              SRC1_ELEM(ELEM_W),
                              SRC1_NEG(0),
@@ -1820,11 +1820,11 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
                                  CLAMP(0));
 
     /* 31 srcY / h */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(3),
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 3),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_Y),
                              SRC0_NEG(0),
-                             SRC1_SEL(257),
+                             SRC1_SEL(ALU_SRC_CFILE_BASE + 1),
                              SRC1_REL(ABSOLUTE),
                              SRC1_ELEM(ELEM_W),
                              SRC1_NEG(0),
@@ -1847,11 +1847,11 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
                                  CLAMP(0));
 
     /* 32 maskX / w */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(4),
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 4),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_X),
                              SRC0_NEG(0),
-                             SRC1_SEL(258),
+                             SRC1_SEL(ALU_SRC_CFILE_BASE + 2),
                              SRC1_REL(ABSOLUTE),
                              SRC1_ELEM(ELEM_W),
                              SRC1_NEG(0),
@@ -1874,11 +1874,11 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
                                  CLAMP(0));
 
     /* 33 maskY / h */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(4),
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 4),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_Y),
                              SRC0_NEG(0),
-                             SRC1_SEL(259),
+                             SRC1_SEL(ALU_SRC_CFILE_BASE + 3),
                              SRC1_REL(ABSOLUTE),
                              SRC1_ELEM(ELEM_W),
                              SRC1_NEG(0),
@@ -1901,11 +1901,11 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
                                  CLAMP(0));
 
     /* 34 srcX.x DOT4 - non-mask */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(0),
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_X),
                              SRC0_NEG(0),
-                             SRC1_SEL(256),
+                             SRC1_SEL(ALU_SRC_CFILE_BASE + 0),
                              SRC1_REL(ABSOLUTE),
                              SRC1_ELEM(ELEM_X),
                              SRC1_NEG(0),
@@ -1928,11 +1928,11 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
                                  CLAMP(0));
 
     /* 35 srcX.y DOT4 - non-mask */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(0),
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_Y),
                              SRC0_NEG(0),
-                             SRC1_SEL(256),
+                             SRC1_SEL(ALU_SRC_CFILE_BASE + 0),
                              SRC1_REL(ABSOLUTE),
                              SRC1_ELEM(ELEM_Y),
                              SRC1_NEG(0),
@@ -1955,11 +1955,11 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
                                  CLAMP(0));
 
     /* 36 srcX.z DOT4 - non-mask */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(0),
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_Z),
                              SRC0_NEG(0),
-                             SRC1_SEL(256),
+                             SRC1_SEL(ALU_SRC_CFILE_BASE + 0),
                              SRC1_REL(ABSOLUTE),
                              SRC1_ELEM(ELEM_Z),
                              SRC1_NEG(0),
@@ -1982,11 +1982,11 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
                                  CLAMP(0));
 
     /* 37 srcX.w DOT4 - non-mask */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(0),
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_W),
                              SRC0_NEG(0),
-                             SRC1_SEL(256),
+                             SRC1_SEL(ALU_SRC_CFILE_BASE + 0),
                              SRC1_REL(ABSOLUTE),
                              SRC1_ELEM(ELEM_W),
                              SRC1_NEG(0),
@@ -2009,11 +2009,11 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
                                  CLAMP(0));
 
     /* 38 srcY.x DOT4 - non-mask */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(0),
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_X),
                              SRC0_NEG(0),
-                             SRC1_SEL(257),
+                             SRC1_SEL(ALU_SRC_CFILE_BASE + 1),
                              SRC1_REL(ABSOLUTE),
                              SRC1_ELEM(ELEM_X),
                              SRC1_NEG(0),
@@ -2036,11 +2036,11 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
                                  CLAMP(0));
 
     /* 39 srcY.y DOT4 - non-mask */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(0),
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_Y),
                              SRC0_NEG(0),
-                             SRC1_SEL(257),
+                             SRC1_SEL(ALU_SRC_CFILE_BASE + 1),
                              SRC1_REL(ABSOLUTE),
                              SRC1_ELEM(ELEM_Y),
                              SRC1_NEG(0),
@@ -2063,11 +2063,11 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
                                  CLAMP(0));
 
     /* 40 srcY.z DOT4 - non-mask */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(0),
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_Z),
                              SRC0_NEG(0),
-                             SRC1_SEL(257),
+                             SRC1_SEL(ALU_SRC_CFILE_BASE + 1),
                              SRC1_REL(ABSOLUTE),
                              SRC1_ELEM(ELEM_Z),
                              SRC1_NEG(0),
@@ -2090,11 +2090,11 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
                                  CLAMP(0));
 
     /* 41 srcY.w DOT4 - non-mask */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(0),
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_W),
                              SRC0_NEG(0),
-                             SRC1_SEL(257),
+                             SRC1_SEL(ALU_SRC_CFILE_BASE + 1),
                              SRC1_REL(ABSOLUTE),
                              SRC1_ELEM(ELEM_W),
                              SRC1_NEG(0),
@@ -2117,11 +2117,11 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
                                  CLAMP(0));
 
     /* 42 srcX / w */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(2),
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 2),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_X),
                              SRC0_NEG(0),
-                             SRC1_SEL(256),
+                             SRC1_SEL(ALU_SRC_CFILE_BASE + 0),
                              SRC1_REL(ABSOLUTE),
                              SRC1_ELEM(ELEM_W),
                              SRC1_NEG(0),
@@ -2144,11 +2144,11 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
                                  CLAMP(0));
 
     /* 43 srcY / h */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(2),
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 2),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_Y),
                              SRC0_NEG(0),
-                             SRC1_SEL(257),
+                             SRC1_SEL(ALU_SRC_CFILE_BASE + 1),
                              SRC1_REL(ABSOLUTE),
                              SRC1_ELEM(ELEM_W),
                              SRC1_NEG(0),
@@ -2445,11 +2445,11 @@ int R600_comp_ps(RADEONChipFamily ChipSet, uint32_t* shader)
 
     /* 10 - alu 0 */
     /* MUL gpr[2].x gpr[1].x gpr[0].x */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(1),
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 1),
 			     SRC0_REL(ABSOLUTE),
 			     SRC0_ELEM(ELEM_X),
 			     SRC0_NEG(0),
-			     SRC1_SEL(0),
+			     SRC1_SEL(ALU_SRC_GPR_BASE + 0),
 			     SRC1_REL(ABSOLUTE),
 			     SRC1_ELEM(ELEM_X),
 			     SRC1_NEG(0),
@@ -2472,11 +2472,11 @@ int R600_comp_ps(RADEONChipFamily ChipSet, uint32_t* shader)
 				 CLAMP(1));
     /* 11 - alu 1 */
     /* MUL gpr[2].y gpr[1].y gpr[0].y */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(1),
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 1),
 			     SRC0_REL(ABSOLUTE),
 			     SRC0_ELEM(ELEM_Y),
 			     SRC0_NEG(0),
-			     SRC1_SEL(0),
+			     SRC1_SEL(ALU_SRC_GPR_BASE + 0),
 			     SRC1_REL(ABSOLUTE),
 			     SRC1_ELEM(ELEM_Y),
 			     SRC1_NEG(0),
@@ -2499,11 +2499,11 @@ int R600_comp_ps(RADEONChipFamily ChipSet, uint32_t* shader)
 				 CLAMP(1));
     /* 12 - alu 2 */
     /* MUL gpr[2].z gpr[1].z gpr[0].z */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(1),
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 1),
 			     SRC0_REL(ABSOLUTE),
 			     SRC0_ELEM(ELEM_Z),
 			     SRC0_NEG(0),
-			     SRC1_SEL(0),
+			     SRC1_SEL(ALU_SRC_GPR_BASE + 0),
 			     SRC1_REL(ABSOLUTE),
 			     SRC1_ELEM(ELEM_Z),
 			     SRC1_NEG(0),
@@ -2526,11 +2526,11 @@ int R600_comp_ps(RADEONChipFamily ChipSet, uint32_t* shader)
 				 CLAMP(1));
     /* 13 - alu 3 */
     /* MUL gpr[2].w gpr[1].w gpr[0].w */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(1),
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 1),
 			     SRC0_REL(ABSOLUTE),
 			     SRC0_ELEM(ELEM_W),
 			     SRC0_NEG(0),
-			     SRC1_SEL(0),
+			     SRC1_SEL(ALU_SRC_GPR_BASE + 0),
 			     SRC1_REL(ABSOLUTE),
 			     SRC1_ELEM(ELEM_W),
 			     SRC1_NEG(0),
diff --git a/src/r600_shader.h b/src/r600_shader.h
index a68d6c2..3d5acc7 100644
--- a/src/r600_shader.h
+++ b/src/r600_shader.h
@@ -193,6 +193,10 @@
 // 128-159 kcache constants bank 0
 // 160-191 kcache constants bank 1
 // 248-255 special SQ_ALU_SRC_* (0, 1, etc.)
+#define ALU_SRC_GPR_BASE        0
+#define ALU_SRC_KCACHE0_BASE  128
+#define ALU_SRC_KCACHE1_BASE  160
+#define ALU_SRC_CFILE_BASE    256
 
 #define SRC0_REL(x)        (x)
 #define SRC1_REL(x)        (x)
commit d9bcac516f2a810acb300b29169e56a2df0b47ac
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Mon Nov 29 17:23:30 2010 -0500

    r6xx/r7xx use dot4 for transforms

diff --git a/src/r600_exa.c b/src/r600_exa.c
index a04d66a..f6cde1d 100644
--- a/src/r600_exa.c
+++ b/src/r600_exa.c
@@ -1358,7 +1358,7 @@ static Bool R600PrepareComposite(int op, PicturePtr pSrcPicture,
     /* Shader */
     vs_conf.shader_addr         = accel_state->vs_mc_addr;
     vs_conf.shader_size         = accel_state->vs_size;
-    vs_conf.num_gprs            = 3;
+    vs_conf.num_gprs            = 5;
     vs_conf.stack_size          = 1;
     vs_conf.bo                  = accel_state->shaders_bo;
     r600_vs_setup(pScrn, accel_state->ib, &vs_conf, RADEON_GEM_DOMAIN_VRAM);
diff --git a/src/r600_shader.c b/src/r600_shader.c
index e2a4163..b42690c 100644
--- a/src/r600_shader.c
+++ b/src/r600_shader.c
@@ -1191,7 +1191,7 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
                             WHOLE_QUAD_MODE(0),
                             BARRIER(1));
     /* 3 - mask sub */
-    shader[i++] = CF_DWORD0(ADDR(32));
+    shader[i++] = CF_DWORD0(ADDR(44));
     shader[i++] = CF_DWORD1(POP_COUNT(0),
 			    CF_CONST(0),
 			    COND(SQ_CF_COND_ACTIVE),
@@ -1211,7 +1211,7 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
     shader[i++] = CF_ALU_DWORD1(KCACHE_MODE1(SQ_CF_KCACHE_NOP),
 				KCACHE_ADDR0(0),
 				KCACHE_ADDR1(0),
-				I_COUNT(12),
+				I_COUNT(20),
 				USES_WATERFALL(0),
 				CF_INST(SQ_CF_INST_ALU),
 				WHOLE_QUAD_MODE(0),
@@ -1284,7 +1284,7 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
 			    WHOLE_QUAD_MODE(0),
 			    BARRIER(1));
     /* 9 - non-mask sub */
-    shader[i++] = CF_DWORD0(ADDR(38));
+    shader[i++] = CF_DWORD0(ADDR(50));
     shader[i++] = CF_DWORD1(POP_COUNT(0),
 			    CF_CONST(0),
 			    COND(SQ_CF_COND_ACTIVE),
@@ -1297,14 +1297,14 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
 			    BARRIER(1));
 
     /* 10 - ALU */
-    shader[i++] = CF_ALU_DWORD0(ADDR(26),
+    shader[i++] = CF_ALU_DWORD0(ADDR(34),
 				KCACHE_BANK0(0),
 				KCACHE_BANK1(0),
 				KCACHE_MODE0(SQ_CF_KCACHE_NOP));
     shader[i++] = CF_ALU_DWORD1(KCACHE_MODE1(SQ_CF_KCACHE_NOP),
 				KCACHE_ADDR0(0),
 				KCACHE_ADDR1(0),
-				I_COUNT(6),
+				I_COUNT(10),
 				USES_WATERFALL(0),
 				CF_INST(SQ_CF_INST_ALU),
 				WHOLE_QUAD_MODE(0),
@@ -1360,189 +1360,440 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
 			    BARRIER(1));
 
 
-    /* 14 srcX MAD - mask */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(256),
+    /* 14 srcX.x DOT4 - mask */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(1),
+                             SRC0_REL(ABSOLUTE),
+                             SRC0_ELEM(ELEM_X),
+                             SRC0_NEG(0),
+                             SRC1_SEL(256),
+                             SRC1_REL(ABSOLUTE),
+                             SRC1_ELEM(ELEM_X),
+                             SRC1_NEG(0),
+                             INDEX_MODE(SQ_INDEX_LOOP),
+                             PRED_SEL(SQ_PRED_SEL_OFF),
+                             LAST(0));
+    shader[i++] = ALU_DWORD1_OP2(ChipSet,
+                                 SRC0_ABS(0),
+                                 SRC1_ABS(0),
+                                 UPDATE_EXECUTE_MASK(0),
+                                 UPDATE_PRED(0),
+                                 WRITE_MASK(1),
+                                 FOG_MERGE(0),
+                                 OMOD(SQ_ALU_OMOD_OFF),
+                                 ALU_INST(SQ_OP2_INST_DOT4),
+                                 BANK_SWIZZLE(SQ_ALU_VEC_012),
+                                 DST_GPR(3),
+                                 DST_REL(ABSOLUTE),
+                                 DST_ELEM(ELEM_X),
+                                 CLAMP(0));
+
+    /* 15 srcX.y DOT4 - mask */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(1),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_Y),
                              SRC0_NEG(0),
-                             SRC1_SEL(1),
+                             SRC1_SEL(256),
                              SRC1_REL(ABSOLUTE),
                              SRC1_ELEM(ELEM_Y),
                              SRC1_NEG(0),
                              INDEX_MODE(SQ_INDEX_LOOP),
                              PRED_SEL(SQ_PRED_SEL_OFF),
-                             LAST(1));
-    shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(256),
-                                 SRC2_REL(ABSOLUTE),
-                                 SRC2_ELEM(ELEM_Z),
-                                 SRC2_NEG(0),
-                                 ALU_INST(SQ_OP3_INST_MULADD),
+                             LAST(0));
+    shader[i++] = ALU_DWORD1_OP2(ChipSet,
+                                 SRC0_ABS(0),
+                                 SRC1_ABS(0),
+                                 UPDATE_EXECUTE_MASK(0),
+                                 UPDATE_PRED(0),
+                                 WRITE_MASK(0),
+                                 FOG_MERGE(0),
+                                 OMOD(SQ_ALU_OMOD_OFF),
+                                 ALU_INST(SQ_OP2_INST_DOT4),
                                  BANK_SWIZZLE(SQ_ALU_VEC_012),
-                                 DST_GPR(1),
+                                 DST_GPR(3),
+                                 DST_REL(ABSOLUTE),
+                                 DST_ELEM(ELEM_Y),
+                                 CLAMP(0));
+
+    /* 16 srcX.z DOT4 - mask */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(1),
+                             SRC0_REL(ABSOLUTE),
+                             SRC0_ELEM(ELEM_Z),
+                             SRC0_NEG(0),
+                             SRC1_SEL(256),
+                             SRC1_REL(ABSOLUTE),
+                             SRC1_ELEM(ELEM_Z),
+                             SRC1_NEG(0),
+                             INDEX_MODE(SQ_INDEX_LOOP),
+                             PRED_SEL(SQ_PRED_SEL_OFF),
+                             LAST(0));
+    shader[i++] = ALU_DWORD1_OP2(ChipSet,
+                                 SRC0_ABS(0),
+                                 SRC1_ABS(0),
+                                 UPDATE_EXECUTE_MASK(0),
+                                 UPDATE_PRED(0),
+                                 WRITE_MASK(0),
+                                 FOG_MERGE(0),
+                                 OMOD(SQ_ALU_OMOD_OFF),
+                                 ALU_INST(SQ_OP2_INST_DOT4),
+                                 BANK_SWIZZLE(SQ_ALU_VEC_012),
+                                 DST_GPR(3),
                                  DST_REL(ABSOLUTE),
                                  DST_ELEM(ELEM_Z),
                                  CLAMP(0));
-    /* 15 srcY MAD */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(257),
+
+    /* 17 srcX.w DOT4 - mask */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(1),
                              SRC0_REL(ABSOLUTE),
-                             SRC0_ELEM(ELEM_Y),
+                             SRC0_ELEM(ELEM_W),
                              SRC0_NEG(0),
-                             SRC1_SEL(1),
+                             SRC1_SEL(256),
                              SRC1_REL(ABSOLUTE),
-                             SRC1_ELEM(ELEM_Y),
+                             SRC1_ELEM(ELEM_W),
                              SRC1_NEG(0),
                              INDEX_MODE(SQ_INDEX_LOOP),
                              PRED_SEL(SQ_PRED_SEL_OFF),
                              LAST(1));
-    shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(257),
-                                 SRC2_REL(ABSOLUTE),
-                                 SRC2_ELEM(ELEM_Z),
-                                 SRC2_NEG(0),
-                                 ALU_INST(SQ_OP3_INST_MULADD),
+    shader[i++] = ALU_DWORD1_OP2(ChipSet,
+                                 SRC0_ABS(0),
+                                 SRC1_ABS(0),
+                                 UPDATE_EXECUTE_MASK(0),
+                                 UPDATE_PRED(0),
+                                 WRITE_MASK(0),
+                                 FOG_MERGE(0),
+                                 OMOD(SQ_ALU_OMOD_OFF),
+                                 ALU_INST(SQ_OP2_INST_DOT4),
                                  BANK_SWIZZLE(SQ_ALU_VEC_012),
-                                 DST_GPR(1),
+                                 DST_GPR(3),
                                  DST_REL(ABSOLUTE),
                                  DST_ELEM(ELEM_W),
                                  CLAMP(0));
 
-    /* 16 srcX MAD */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(256),
+    /* 18 srcY.x DOT4 - mask */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(1),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_X),
                              SRC0_NEG(0),
-                             SRC1_SEL(1),
+                             SRC1_SEL(257),
                              SRC1_REL(ABSOLUTE),
                              SRC1_ELEM(ELEM_X),
                              SRC1_NEG(0),
                              INDEX_MODE(SQ_INDEX_LOOP),
                              PRED_SEL(SQ_PRED_SEL_OFF),
                              LAST(0));
-    shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(1),
-                                 SRC2_REL(ABSOLUTE),
-                                 SRC2_ELEM(ELEM_Z),
-                                 SRC2_NEG(0),
-                                 ALU_INST(SQ_OP3_INST_MULADD),
+    shader[i++] = ALU_DWORD1_OP2(ChipSet,
+                                 SRC0_ABS(0),
+                                 SRC1_ABS(0),
+                                 UPDATE_EXECUTE_MASK(0),
+                                 UPDATE_PRED(0),
+                                 WRITE_MASK(0),
+                                 FOG_MERGE(0),
+                                 OMOD(SQ_ALU_OMOD_OFF),
+                                 ALU_INST(SQ_OP2_INST_DOT4),
                                  BANK_SWIZZLE(SQ_ALU_VEC_012),
-                                 DST_GPR(1),
+                                 DST_GPR(3),
                                  DST_REL(ABSOLUTE),
                                  DST_ELEM(ELEM_X),
                                  CLAMP(0));
-    /* 17 srcY MAD */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(257),
+
+    /* 19 srcY.y DOT4 - mask */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(1),
+                             SRC0_REL(ABSOLUTE),
+                             SRC0_ELEM(ELEM_Y),
+                             SRC0_NEG(0),
+                             SRC1_SEL(257),
+                             SRC1_REL(ABSOLUTE),
+                             SRC1_ELEM(ELEM_Y),
+                             SRC1_NEG(0),
+                             INDEX_MODE(SQ_INDEX_LOOP),
+                             PRED_SEL(SQ_PRED_SEL_OFF),
+                             LAST(0));
+    shader[i++] = ALU_DWORD1_OP2(ChipSet,
+                                 SRC0_ABS(0),
+                                 SRC1_ABS(0),
+                                 UPDATE_EXECUTE_MASK(0),
+                                 UPDATE_PRED(0),
+                                 WRITE_MASK(1),
+                                 FOG_MERGE(0),
+                                 OMOD(SQ_ALU_OMOD_OFF),
+                                 ALU_INST(SQ_OP2_INST_DOT4),
+                                 BANK_SWIZZLE(SQ_ALU_VEC_012),
+                                 DST_GPR(3),
+                                 DST_REL(ABSOLUTE),
+                                 DST_ELEM(ELEM_Y),
+                                 CLAMP(0));
+
+    /* 20 srcY.z DOT4 - mask */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(1),
+                             SRC0_REL(ABSOLUTE),
+                             SRC0_ELEM(ELEM_Z),
+                             SRC0_NEG(0),
+                             SRC1_SEL(257),
+                             SRC1_REL(ABSOLUTE),
+                             SRC1_ELEM(ELEM_Z),
+                             SRC1_NEG(0),
+                             INDEX_MODE(SQ_INDEX_LOOP),
+                             PRED_SEL(SQ_PRED_SEL_OFF),
+                             LAST(0));
+    shader[i++] = ALU_DWORD1_OP2(ChipSet,
+                                 SRC0_ABS(0),
+                                 SRC1_ABS(0),
+                                 UPDATE_EXECUTE_MASK(0),
+                                 UPDATE_PRED(0),
+                                 WRITE_MASK(0),
+                                 FOG_MERGE(0),
+                                 OMOD(SQ_ALU_OMOD_OFF),
+                                 ALU_INST(SQ_OP2_INST_DOT4),
+                                 BANK_SWIZZLE(SQ_ALU_VEC_012),
+                                 DST_GPR(3),
+                                 DST_REL(ABSOLUTE),
+                                 DST_ELEM(ELEM_Z),
+                                 CLAMP(0));
+
+    /* 21 srcY.w DOT4 - mask */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(1),
+                             SRC0_REL(ABSOLUTE),
+                             SRC0_ELEM(ELEM_W),
+                             SRC0_NEG(0),
+                             SRC1_SEL(257),
+                             SRC1_REL(ABSOLUTE),
+                             SRC1_ELEM(ELEM_W),
+                             SRC1_NEG(0),
+                             INDEX_MODE(SQ_INDEX_LOOP),
+                             PRED_SEL(SQ_PRED_SEL_OFF),
+                             LAST(1));
+    shader[i++] = ALU_DWORD1_OP2(ChipSet,
+                                 SRC0_ABS(0),
+                                 SRC1_ABS(0),
+                                 UPDATE_EXECUTE_MASK(0),
+                                 UPDATE_PRED(0),
+                                 WRITE_MASK(0),
+                                 FOG_MERGE(0),
+                                 OMOD(SQ_ALU_OMOD_OFF),
+                                 ALU_INST(SQ_OP2_INST_DOT4),
+                                 BANK_SWIZZLE(SQ_ALU_VEC_012),
+                                 DST_GPR(3),
+                                 DST_REL(ABSOLUTE),
+                                 DST_ELEM(ELEM_W),
+                                 CLAMP(0));
+
+    /* 22 maskX.x DOT4 - mask */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(0),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_X),
                              SRC0_NEG(0),
-                             SRC1_SEL(1),
+                             SRC1_SEL(258),
                              SRC1_REL(ABSOLUTE),
                              SRC1_ELEM(ELEM_X),
                              SRC1_NEG(0),
                              INDEX_MODE(SQ_INDEX_LOOP),
                              PRED_SEL(SQ_PRED_SEL_OFF),
-                             LAST(1));
-    shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(1),
-                                 SRC2_REL(ABSOLUTE),
-                                 SRC2_ELEM(ELEM_W),
-                                 SRC2_NEG(0),
-                                 ALU_INST(SQ_OP3_INST_MULADD),
+                             LAST(0));
+    shader[i++] = ALU_DWORD1_OP2(ChipSet,
+                                 SRC0_ABS(0),
+                                 SRC1_ABS(0),
+                                 UPDATE_EXECUTE_MASK(0),
+                                 UPDATE_PRED(0),
+                                 WRITE_MASK(1),
+                                 FOG_MERGE(0),
+                                 OMOD(SQ_ALU_OMOD_OFF),
+                                 ALU_INST(SQ_OP2_INST_DOT4),
                                  BANK_SWIZZLE(SQ_ALU_VEC_012),
-                                 DST_GPR(1),
+                                 DST_GPR(4),
                                  DST_REL(ABSOLUTE),
-                                 DST_ELEM(ELEM_Y),
+                                 DST_ELEM(ELEM_X),
                                  CLAMP(0));
 
-    /* 18 maskX MAD */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(258),
+    /* 23 maskX.y DOT4 - mask */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(0),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_Y),
                              SRC0_NEG(0),
-                             SRC1_SEL(0),
+                             SRC1_SEL(258),
                              SRC1_REL(ABSOLUTE),
                              SRC1_ELEM(ELEM_Y),
                              SRC1_NEG(0),
                              INDEX_MODE(SQ_INDEX_LOOP),
                              PRED_SEL(SQ_PRED_SEL_OFF),
-                             LAST(1));
-    shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(258),
-                                 SRC2_REL(ABSOLUTE),
-                                 SRC2_ELEM(ELEM_Z),
-                                 SRC2_NEG(0),
-                                 ALU_INST(SQ_OP3_INST_MULADD),
+                             LAST(0));
+    shader[i++] = ALU_DWORD1_OP2(ChipSet,
+                                 SRC0_ABS(0),
+                                 SRC1_ABS(0),
+                                 UPDATE_EXECUTE_MASK(0),
+                                 UPDATE_PRED(0),
+                                 WRITE_MASK(0),
+                                 FOG_MERGE(0),
+                                 OMOD(SQ_ALU_OMOD_OFF),
+                                 ALU_INST(SQ_OP2_INST_DOT4),
                                  BANK_SWIZZLE(SQ_ALU_VEC_012),
-                                 DST_GPR(0),
+                                 DST_GPR(4),
+                                 DST_REL(ABSOLUTE),
+                                 DST_ELEM(ELEM_Y),
+                                 CLAMP(0));
+
+    /* 24 maskX.z DOT4 - mask */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(0),
+                             SRC0_REL(ABSOLUTE),
+                             SRC0_ELEM(ELEM_Z),
+                             SRC0_NEG(0),
+                             SRC1_SEL(258),
+                             SRC1_REL(ABSOLUTE),
+                             SRC1_ELEM(ELEM_Z),
+                             SRC1_NEG(0),
+                             INDEX_MODE(SQ_INDEX_LOOP),
+                             PRED_SEL(SQ_PRED_SEL_OFF),
+                             LAST(0));
+    shader[i++] = ALU_DWORD1_OP2(ChipSet,
+                                 SRC0_ABS(0),
+                                 SRC1_ABS(0),
+                                 UPDATE_EXECUTE_MASK(0),
+                                 UPDATE_PRED(0),
+                                 WRITE_MASK(0),
+                                 FOG_MERGE(0),
+                                 OMOD(SQ_ALU_OMOD_OFF),
+                                 ALU_INST(SQ_OP2_INST_DOT4),
+                                 BANK_SWIZZLE(SQ_ALU_VEC_012),
+                                 DST_GPR(4),
                                  DST_REL(ABSOLUTE),
                                  DST_ELEM(ELEM_Z),
                                  CLAMP(0));
 
-    /* 19 maskY MAD */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(259),
+    /* 25 maskX.w DOT4 - mask */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(0),
                              SRC0_REL(ABSOLUTE),
-                             SRC0_ELEM(ELEM_Y),
+                             SRC0_ELEM(ELEM_W),
                              SRC0_NEG(0),
-                             SRC1_SEL(0),
+                             SRC1_SEL(258),
                              SRC1_REL(ABSOLUTE),
-                             SRC1_ELEM(ELEM_Y),
+                             SRC1_ELEM(ELEM_W),
                              SRC1_NEG(0),
                              INDEX_MODE(SQ_INDEX_LOOP),
                              PRED_SEL(SQ_PRED_SEL_OFF),
                              LAST(1));
-    shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(259),
-                                 SRC2_REL(ABSOLUTE),
-                                 SRC2_ELEM(ELEM_Z),
-                                 SRC2_NEG(0),
-                                 ALU_INST(SQ_OP3_INST_MULADD),
+    shader[i++] = ALU_DWORD1_OP2(ChipSet,
+                                 SRC0_ABS(0),
+                                 SRC1_ABS(0),
+                                 UPDATE_EXECUTE_MASK(0),
+                                 UPDATE_PRED(0),
+                                 WRITE_MASK(0),
+                                 FOG_MERGE(0),
+                                 OMOD(SQ_ALU_OMOD_OFF),
+                                 ALU_INST(SQ_OP2_INST_DOT4),
                                  BANK_SWIZZLE(SQ_ALU_VEC_012),
-                                 DST_GPR(0),
+                                 DST_GPR(4),
                                  DST_REL(ABSOLUTE),
                                  DST_ELEM(ELEM_W),
                                  CLAMP(0));
 
-    /* 20 srcX MAD */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(258),
+    /* 26 maskY.x DOT4 - mask */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(0),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_X),
                              SRC0_NEG(0),
-                             SRC1_SEL(0),
+                             SRC1_SEL(259),
                              SRC1_REL(ABSOLUTE),
                              SRC1_ELEM(ELEM_X),
                              SRC1_NEG(0),
                              INDEX_MODE(SQ_INDEX_LOOP),
                              PRED_SEL(SQ_PRED_SEL_OFF),
                              LAST(0));
-    shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(0),
-                                 SRC2_REL(ABSOLUTE),
-                                 SRC2_ELEM(ELEM_Z),
-                                 SRC2_NEG(0),
-                                 ALU_INST(SQ_OP3_INST_MULADD),
+    shader[i++] = ALU_DWORD1_OP2(ChipSet,
+                                 SRC0_ABS(0),
+                                 SRC1_ABS(0),
+                                 UPDATE_EXECUTE_MASK(0),
+                                 UPDATE_PRED(0),
+                                 WRITE_MASK(0),
+                                 FOG_MERGE(0),
+                                 OMOD(SQ_ALU_OMOD_OFF),
+                                 ALU_INST(SQ_OP2_INST_DOT4),
                                  BANK_SWIZZLE(SQ_ALU_VEC_012),
-                                 DST_GPR(0),
+                                 DST_GPR(4),
                                  DST_REL(ABSOLUTE),
                                  DST_ELEM(ELEM_X),
                                  CLAMP(0));
-    /* 21 srcY MAD */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(259),
+
+    /* 27 maskY.y DOT4 - mask */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(0),
                              SRC0_REL(ABSOLUTE),
-                             SRC0_ELEM(ELEM_X),
+                             SRC0_ELEM(ELEM_Y),
                              SRC0_NEG(0),
-                             SRC1_SEL(0),
+                             SRC1_SEL(259),
                              SRC1_REL(ABSOLUTE),
-                             SRC1_ELEM(ELEM_X),
+                             SRC1_ELEM(ELEM_Y),
                              SRC1_NEG(0),
                              INDEX_MODE(SQ_INDEX_LOOP),
                              PRED_SEL(SQ_PRED_SEL_OFF),
-                             LAST(1));
-    shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(0),
-                                 SRC2_REL(ABSOLUTE),
-                                 SRC2_ELEM(ELEM_W),
-                                 SRC2_NEG(0),
-                                 ALU_INST(SQ_OP3_INST_MULADD),
+                             LAST(0));
+    shader[i++] = ALU_DWORD1_OP2(ChipSet,
+                                 SRC0_ABS(0),
+                                 SRC1_ABS(0),
+                                 UPDATE_EXECUTE_MASK(0),
+                                 UPDATE_PRED(0),
+                                 WRITE_MASK(1),
+                                 FOG_MERGE(0),
+                                 OMOD(SQ_ALU_OMOD_OFF),
+                                 ALU_INST(SQ_OP2_INST_DOT4),
                                  BANK_SWIZZLE(SQ_ALU_VEC_012),
-                                 DST_GPR(0),
+                                 DST_GPR(4),
                                  DST_REL(ABSOLUTE),
                                  DST_ELEM(ELEM_Y),
                                  CLAMP(0));
 
-    /* 22 srcX / w */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(1),
+    /* 28 maskY.z DOT4 - mask */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(0),
+                             SRC0_REL(ABSOLUTE),
+                             SRC0_ELEM(ELEM_Z),
+                             SRC0_NEG(0),
+                             SRC1_SEL(259),
+                             SRC1_REL(ABSOLUTE),
+                             SRC1_ELEM(ELEM_Z),
+                             SRC1_NEG(0),
+                             INDEX_MODE(SQ_INDEX_LOOP),
+                             PRED_SEL(SQ_PRED_SEL_OFF),
+                             LAST(0));
+    shader[i++] = ALU_DWORD1_OP2(ChipSet,
+                                 SRC0_ABS(0),
+                                 SRC1_ABS(0),
+                                 UPDATE_EXECUTE_MASK(0),
+                                 UPDATE_PRED(0),
+                                 WRITE_MASK(0),
+                                 FOG_MERGE(0),
+                                 OMOD(SQ_ALU_OMOD_OFF),
+                                 ALU_INST(SQ_OP2_INST_DOT4),
+                                 BANK_SWIZZLE(SQ_ALU_VEC_012),
+                                 DST_GPR(4),
+                                 DST_REL(ABSOLUTE),
+                                 DST_ELEM(ELEM_Z),
+                                 CLAMP(0));
+
+    /* 29 maskY.w DOT4 - mask */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(0),
+                             SRC0_REL(ABSOLUTE),
+                             SRC0_ELEM(ELEM_W),
+                             SRC0_NEG(0),
+                             SRC1_SEL(259),
+                             SRC1_REL(ABSOLUTE),
+                             SRC1_ELEM(ELEM_W),
+                             SRC1_NEG(0),
+                             INDEX_MODE(SQ_INDEX_LOOP),
+                             PRED_SEL(SQ_PRED_SEL_OFF),
+                             LAST(1));
+    shader[i++] = ALU_DWORD1_OP2(ChipSet,
+                                 SRC0_ABS(0),
+                                 SRC1_ABS(0),
+                                 UPDATE_EXECUTE_MASK(0),
+                                 UPDATE_PRED(0),
+                                 WRITE_MASK(0),
+                                 FOG_MERGE(0),
+                                 OMOD(SQ_ALU_OMOD_OFF),
+                                 ALU_INST(SQ_OP2_INST_DOT4),
+                                 BANK_SWIZZLE(SQ_ALU_VEC_012),
+                                 DST_GPR(4),
+                                 DST_REL(ABSOLUTE),
+                                 DST_ELEM(ELEM_W),
+                                 CLAMP(0));
+
+    /* 30 srcX / w */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(3),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_X),
                              SRC0_NEG(0),
@@ -1568,8 +1819,8 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
                                  DST_ELEM(ELEM_X),
                                  CLAMP(0));
 
-    /* 23 srcY / h */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(1),
+    /* 31 srcY / h */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(3),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_Y),
                              SRC0_NEG(0),
@@ -1595,8 +1846,8 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
                                  DST_ELEM(ELEM_Y),
                                  CLAMP(0));
 
-    /* 24 maskX / w */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(0),
+    /* 32 maskX / w */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(4),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_X),
                              SRC0_NEG(0),
@@ -1622,8 +1873,8 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
                                  DST_ELEM(ELEM_X),
                                  CLAMP(0));
 
-    /* 25 maskY / h */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(0),
+    /* 33 maskY / h */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(4),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_Y),
                              SRC0_NEG(0),
@@ -1649,98 +1900,225 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
                                  DST_ELEM(ELEM_Y),
                                  CLAMP(0));
 
-    /* 26 srcX MAD - non-mask */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(256),
+    /* 34 srcX.x DOT4 - non-mask */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(0),
+                             SRC0_REL(ABSOLUTE),
+                             SRC0_ELEM(ELEM_X),
+                             SRC0_NEG(0),
+                             SRC1_SEL(256),
+                             SRC1_REL(ABSOLUTE),
+                             SRC1_ELEM(ELEM_X),
+                             SRC1_NEG(0),
+                             INDEX_MODE(SQ_INDEX_LOOP),
+                             PRED_SEL(SQ_PRED_SEL_OFF),
+                             LAST(0));
+    shader[i++] = ALU_DWORD1_OP2(ChipSet,
+                                 SRC0_ABS(0),
+                                 SRC1_ABS(0),
+                                 UPDATE_EXECUTE_MASK(0),
+                                 UPDATE_PRED(0),
+                                 WRITE_MASK(1),
+                                 FOG_MERGE(0),
+                                 OMOD(SQ_ALU_OMOD_OFF),
+                                 ALU_INST(SQ_OP2_INST_DOT4),
+                                 BANK_SWIZZLE(SQ_ALU_VEC_012),
+                                 DST_GPR(2),
+                                 DST_REL(ABSOLUTE),
+                                 DST_ELEM(ELEM_X),
+                                 CLAMP(0));
+
+    /* 35 srcX.y DOT4 - non-mask */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(0),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_Y),
                              SRC0_NEG(0),
-                             SRC1_SEL(0),
+                             SRC1_SEL(256),
                              SRC1_REL(ABSOLUTE),
                              SRC1_ELEM(ELEM_Y),
                              SRC1_NEG(0),
                              INDEX_MODE(SQ_INDEX_LOOP),
                              PRED_SEL(SQ_PRED_SEL_OFF),
-                             LAST(1));
-    shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(256),
-                                 SRC2_REL(ABSOLUTE),
-                                 SRC2_ELEM(ELEM_Z),
-                                 SRC2_NEG(0),
-                                 ALU_INST(SQ_OP3_INST_MULADD),
+                             LAST(0));
+    shader[i++] = ALU_DWORD1_OP2(ChipSet,
+                                 SRC0_ABS(0),
+                                 SRC1_ABS(0),
+                                 UPDATE_EXECUTE_MASK(0),
+                                 UPDATE_PRED(0),
+                                 WRITE_MASK(0),
+                                 FOG_MERGE(0),
+                                 OMOD(SQ_ALU_OMOD_OFF),
+                                 ALU_INST(SQ_OP2_INST_DOT4),
                                  BANK_SWIZZLE(SQ_ALU_VEC_012),
-                                 DST_GPR(0),
+                                 DST_GPR(2),
+                                 DST_REL(ABSOLUTE),
+                                 DST_ELEM(ELEM_Y),
+                                 CLAMP(0));
+
+    /* 36 srcX.z DOT4 - non-mask */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(0),
+                             SRC0_REL(ABSOLUTE),
+                             SRC0_ELEM(ELEM_Z),
+                             SRC0_NEG(0),
+                             SRC1_SEL(256),
+                             SRC1_REL(ABSOLUTE),
+                             SRC1_ELEM(ELEM_Z),
+                             SRC1_NEG(0),
+                             INDEX_MODE(SQ_INDEX_LOOP),
+                             PRED_SEL(SQ_PRED_SEL_OFF),
+                             LAST(0));
+    shader[i++] = ALU_DWORD1_OP2(ChipSet,
+                                 SRC0_ABS(0),
+                                 SRC1_ABS(0),
+                                 UPDATE_EXECUTE_MASK(0),
+                                 UPDATE_PRED(0),
+                                 WRITE_MASK(0),
+                                 FOG_MERGE(0),
+                                 OMOD(SQ_ALU_OMOD_OFF),
+                                 ALU_INST(SQ_OP2_INST_DOT4),
+                                 BANK_SWIZZLE(SQ_ALU_VEC_012),
+                                 DST_GPR(2),
                                  DST_REL(ABSOLUTE),
                                  DST_ELEM(ELEM_Z),
                                  CLAMP(0));
-    /* 27 srcY MAD */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(257),
+
+    /* 37 srcX.w DOT4 - non-mask */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(0),
                              SRC0_REL(ABSOLUTE),
-                             SRC0_ELEM(ELEM_Y),
+                             SRC0_ELEM(ELEM_W),
                              SRC0_NEG(0),
-                             SRC1_SEL(0),
+                             SRC1_SEL(256),
                              SRC1_REL(ABSOLUTE),
-                             SRC1_ELEM(ELEM_Y),
+                             SRC1_ELEM(ELEM_W),
                              SRC1_NEG(0),
                              INDEX_MODE(SQ_INDEX_LOOP),
                              PRED_SEL(SQ_PRED_SEL_OFF),
                              LAST(1));
-    shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(257),
-                                 SRC2_REL(ABSOLUTE),
-                                 SRC2_ELEM(ELEM_Z),
-                                 SRC2_NEG(0),
-                                 ALU_INST(SQ_OP3_INST_MULADD),
+    shader[i++] = ALU_DWORD1_OP2(ChipSet,
+                                 SRC0_ABS(0),
+                                 SRC1_ABS(0),
+                                 UPDATE_EXECUTE_MASK(0),
+                                 UPDATE_PRED(0),
+                                 WRITE_MASK(0),
+                                 FOG_MERGE(0),
+                                 OMOD(SQ_ALU_OMOD_OFF),
+                                 ALU_INST(SQ_OP2_INST_DOT4),
                                  BANK_SWIZZLE(SQ_ALU_VEC_012),
-                                 DST_GPR(0),
+                                 DST_GPR(2),
                                  DST_REL(ABSOLUTE),
                                  DST_ELEM(ELEM_W),
                                  CLAMP(0));
 
-    /* 28 srcX MAD */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(256),
+    /* 38 srcY.x DOT4 - non-mask */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(0),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_X),
                              SRC0_NEG(0),
-                             SRC1_SEL(0),
+                             SRC1_SEL(257),
                              SRC1_REL(ABSOLUTE),
                              SRC1_ELEM(ELEM_X),
                              SRC1_NEG(0),
                              INDEX_MODE(SQ_INDEX_LOOP),
                              PRED_SEL(SQ_PRED_SEL_OFF),
                              LAST(0));
-    shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(0),
-                                 SRC2_REL(ABSOLUTE),
-                                 SRC2_ELEM(ELEM_Z),
-                                 SRC2_NEG(0),
-                                 ALU_INST(SQ_OP3_INST_MULADD),
+    shader[i++] = ALU_DWORD1_OP2(ChipSet,
+                                 SRC0_ABS(0),
+                                 SRC1_ABS(0),
+                                 UPDATE_EXECUTE_MASK(0),
+                                 UPDATE_PRED(0),
+                                 WRITE_MASK(0),
+                                 FOG_MERGE(0),
+                                 OMOD(SQ_ALU_OMOD_OFF),
+                                 ALU_INST(SQ_OP2_INST_DOT4),
                                  BANK_SWIZZLE(SQ_ALU_VEC_012),
-                                 DST_GPR(0),
+                                 DST_GPR(2),
                                  DST_REL(ABSOLUTE),
                                  DST_ELEM(ELEM_X),
                                  CLAMP(0));
-    /* 29 srcY MAD */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(257),
+
+    /* 39 srcY.y DOT4 - non-mask */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(0),
                              SRC0_REL(ABSOLUTE),
-                             SRC0_ELEM(ELEM_X),
+                             SRC0_ELEM(ELEM_Y),
                              SRC0_NEG(0),
-                             SRC1_SEL(0),
+                             SRC1_SEL(257),
                              SRC1_REL(ABSOLUTE),
-                             SRC1_ELEM(ELEM_X),
+                             SRC1_ELEM(ELEM_Y),
                              SRC1_NEG(0),
                              INDEX_MODE(SQ_INDEX_LOOP),
                              PRED_SEL(SQ_PRED_SEL_OFF),
-                             LAST(1));
-    shader[i++] = ALU_DWORD1_OP3(SRC2_SEL(0),
-                                 SRC2_REL(ABSOLUTE),
-                                 SRC2_ELEM(ELEM_W),
-                                 SRC2_NEG(0),
-                                 ALU_INST(SQ_OP3_INST_MULADD),
+                             LAST(0));
+    shader[i++] = ALU_DWORD1_OP2(ChipSet,
+                                 SRC0_ABS(0),
+                                 SRC1_ABS(0),
+                                 UPDATE_EXECUTE_MASK(0),
+                                 UPDATE_PRED(0),
+                                 WRITE_MASK(1),
+                                 FOG_MERGE(0),
+                                 OMOD(SQ_ALU_OMOD_OFF),
+                                 ALU_INST(SQ_OP2_INST_DOT4),
                                  BANK_SWIZZLE(SQ_ALU_VEC_012),
-                                 DST_GPR(0),
+                                 DST_GPR(2),
                                  DST_REL(ABSOLUTE),
                                  DST_ELEM(ELEM_Y),
                                  CLAMP(0));
-    /* 30 srcX / w */
+
+    /* 40 srcY.z DOT4 - non-mask */
     shader[i++] = ALU_DWORD0(SRC0_SEL(0),
                              SRC0_REL(ABSOLUTE),
+                             SRC0_ELEM(ELEM_Z),
+                             SRC0_NEG(0),
+                             SRC1_SEL(257),
+                             SRC1_REL(ABSOLUTE),
+                             SRC1_ELEM(ELEM_Z),
+                             SRC1_NEG(0),
+                             INDEX_MODE(SQ_INDEX_LOOP),
+                             PRED_SEL(SQ_PRED_SEL_OFF),
+                             LAST(0));
+    shader[i++] = ALU_DWORD1_OP2(ChipSet,
+                                 SRC0_ABS(0),
+                                 SRC1_ABS(0),
+                                 UPDATE_EXECUTE_MASK(0),
+                                 UPDATE_PRED(0),
+                                 WRITE_MASK(0),
+                                 FOG_MERGE(0),
+                                 OMOD(SQ_ALU_OMOD_OFF),
+                                 ALU_INST(SQ_OP2_INST_DOT4),
+                                 BANK_SWIZZLE(SQ_ALU_VEC_012),
+                                 DST_GPR(2),
+                                 DST_REL(ABSOLUTE),
+                                 DST_ELEM(ELEM_Z),
+                                 CLAMP(0));
+
+    /* 41 srcY.w DOT4 - non-mask */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(0),
+                             SRC0_REL(ABSOLUTE),
+                             SRC0_ELEM(ELEM_W),
+                             SRC0_NEG(0),
+                             SRC1_SEL(257),
+                             SRC1_REL(ABSOLUTE),
+                             SRC1_ELEM(ELEM_W),
+                             SRC1_NEG(0),
+                             INDEX_MODE(SQ_INDEX_LOOP),
+                             PRED_SEL(SQ_PRED_SEL_OFF),
+                             LAST(1));
+    shader[i++] = ALU_DWORD1_OP2(ChipSet,
+                                 SRC0_ABS(0),
+                                 SRC1_ABS(0),
+                                 UPDATE_EXECUTE_MASK(0),
+                                 UPDATE_PRED(0),
+                                 WRITE_MASK(0),
+                                 FOG_MERGE(0),
+                                 OMOD(SQ_ALU_OMOD_OFF),
+                                 ALU_INST(SQ_OP2_INST_DOT4),
+                                 BANK_SWIZZLE(SQ_ALU_VEC_012),
+                                 DST_GPR(2),
+                                 DST_REL(ABSOLUTE),
+                                 DST_ELEM(ELEM_W),
+                                 CLAMP(0));
+
+    /* 42 srcX / w */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(2),
+                             SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_X),
                              SRC0_NEG(0),
                              SRC1_SEL(256),
@@ -1765,8 +2143,8 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
                                  DST_ELEM(ELEM_X),
                                  CLAMP(0));
 
-    /* 31 srcY / h */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(0),
+    /* 43 srcY / h */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(2),
                              SRC0_REL(ABSOLUTE),
                              SRC0_ELEM(ELEM_Y),
                              SRC0_NEG(0),
@@ -1792,7 +2170,7 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
                                  DST_ELEM(ELEM_Y),
                                  CLAMP(0));
 
-    /* 32/33 - dst - mask */
+    /* 44/45 - dst - mask */
     shader[i++] = VTX_DWORD0(VTX_INST(SQ_VTX_INST_FETCH),
 			     FETCH_TYPE(SQ_VTX_FETCH_VERTEX_DATA),
 			     FETCH_WHOLE_QUAD(0),
@@ -1817,7 +2195,7 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
 			     CONST_BUF_NO_STRIDE(0),
 			     MEGA_FETCH(1));
     shader[i++] = VTX_DWORD_PAD;
-    /* 34/35 - src */
+    /* 46/47 - src */
     shader[i++] = VTX_DWORD0(VTX_INST(SQ_VTX_INST_FETCH),
 			     FETCH_TYPE(SQ_VTX_FETCH_VERTEX_DATA),
 			     FETCH_WHOLE_QUAD(0),
@@ -1842,7 +2220,7 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
 			     CONST_BUF_NO_STRIDE(0),
 			     MEGA_FETCH(0));
     shader[i++] = VTX_DWORD_PAD;
-    /* 36/37 - mask */
+    /* 48/49 - mask */
     shader[i++] = VTX_DWORD0(VTX_INST(SQ_VTX_INST_FETCH),
 			     FETCH_TYPE(SQ_VTX_FETCH_VERTEX_DATA),
 			     FETCH_WHOLE_QUAD(0),
@@ -1868,7 +2246,7 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
 			     MEGA_FETCH(0));
     shader[i++] = VTX_DWORD_PAD;
 
-    /* 38/39 - dst - non-mask */
+    /* 50/51 - dst - non-mask */
     shader[i++] = VTX_DWORD0(VTX_INST(SQ_VTX_INST_FETCH),
 			     FETCH_TYPE(SQ_VTX_FETCH_VERTEX_DATA),
 			     FETCH_WHOLE_QUAD(0),
@@ -1893,7 +2271,7 @@ int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
 			     CONST_BUF_NO_STRIDE(0),
 			     MEGA_FETCH(1));
     shader[i++] = VTX_DWORD_PAD;
-    /* 40/41 - src */
+    /* 52/53 - src */
     shader[i++] = VTX_DWORD0(VTX_INST(SQ_VTX_INST_FETCH),
 			     FETCH_TYPE(SQ_VTX_FETCH_VERTEX_DATA),
 			     FETCH_WHOLE_QUAD(0),


More information about the xorg-commit mailing list