xf86-video-ati: Branch 'master' - 5 commits

Alex Deucher agd5f at kemper.freedesktop.org
Tue Jul 30 06:29:02 PDT 2013


 src/cayman_shader.c     |  590 +++++++++++++++++++++++++++++++----------------
 src/evergreen_accel.c   |   12 
 src/evergreen_exa.c     |  287 +++++++++++++++++------
 src/evergreen_shader.c  |  596 ++++++++++++++++++++++++++++++------------------
 src/evergreen_state.h   |    2 
 src/r600_exa.c          |   11 
 src/radeon_exa_render.c |   12 
 7 files changed, 1016 insertions(+), 494 deletions(-)

New commits:
commit 6a278369c05a298a4367306d986467a9ceacae8c
Author: Raul Fernandes <rgfernandes at gmail.com>
Date:   Tue Jul 30 09:26:05 2013 -0400

    EXA/6xx/7xx: optimize non-overlapping Copy
    
    In case dst and src rectangles of a Copy operation in the same surface
    don't overlap, it is safe to skip the scratch surface. This is a
    common case.
    
    Based on evergreen/ni patch from Grigori Goronzy.
    
    Signed-off-by: Alex Deucher <alexander.deucher at amd.com>

diff --git a/src/r600_exa.c b/src/r600_exa.c
index b243234..a354ccd 100644
--- a/src/r600_exa.c
+++ b/src/r600_exa.c
@@ -643,7 +643,12 @@ R600Copy(PixmapPtr pDst,
     if (accel_state->vsync)
 	RADEONVlineHelperSet(pScrn, dstX, dstY, dstX + w, dstY + h);
 
-    if (accel_state->same_surface && accel_state->copy_area) {
+    if (accel_state->same_surface &&
+	    (srcX + w <= dstX || dstX + w <= srcX || srcY + h <= dstY || dstY + h <= srcY)) {
+	R600DoPrepareCopy(pScrn);
+	R600AppendCopyVertex(pScrn, srcX, srcY, dstX, dstY, w, h);
+	R600DoCopyVline(pDst);
+    } else if (accel_state->same_surface && accel_state->copy_area) {
 	uint32_t orig_dst_domain = accel_state->dst_obj.domain;
 	uint32_t orig_src_domain = accel_state->src_obj[0].domain;
 	uint32_t orig_src_tiling_flags = accel_state->src_obj[0].tiling_flags;
commit 4375a6e75e5d41139be7031a0dee58c057ecbd07
Author: Grigori Goronzy <greg at chown.ath.cx>
Date:   Mon Jul 22 02:30:28 2013 +0200

    EXA/evergreen/ni: accelerate PictOpOver with component alpha
    
    Subpixel text rendering is typically done with a solid src and a
    pixmap mask. Traditionally, this cannot be accelerated in a single
    pass and requires two passes [1]. However, we can cheat a little
    with a constant blend color.
    
    We can use:
    
    const.A = src.A / src.A
    const.R = src.R / src.A
    const.G = src.G / src.A
    const.B = src.B / src.A
    
    dst.A = const.A * (src.A * mask.A) + (1 - (src.A * mask.A)) * dst.A
    dst.R = const.R * (src.A * mask.R) + (1 - (src.A * mask.R)) * dst.R
    dst.G = const.G * (src.A * mask.G) + (1 - (src.A * mask.G)) * dst.G
    dst.B = const.B * (src.A * mask.B) + (1 - (src.A * mask.B)) * dst.B
    
    This only needs a single source value: src.A cancels out in the
    right places.
    
    [1] http://anholt.livejournal.com/32058.html

diff --git a/src/evergreen_accel.c b/src/evergreen_accel.c
index 10f2e51..e25010b 100644
--- a/src/evergreen_accel.c
+++ b/src/evergreen_accel.c
@@ -335,7 +335,19 @@ evergreen_set_render_target(ScrnInfoPtr pScrn, cb_config_t *cb_conf, uint32_t do
 					       (CB_NORMAL << CB_COLOR_CONTROL__MODE_shift)));
     EREG(CB_BLEND0_CONTROL,                   cb_conf->blendcntl);
     END_BATCH();
+}
 
+void evergreen_set_blend_color(ScrnInfoPtr pScrn, float *color)
+{
+    RADEONInfoPtr info = RADEONPTR(pScrn);
+
+    BEGIN_BATCH(2 + 4);
+    PACK0(CB_BLEND_RED, 4);
+    EFLOAT(color[0]); /* R */
+    EFLOAT(color[1]); /* G */
+    EFLOAT(color[2]); /* B */
+    EFLOAT(color[3]); /* A */
+    END_BATCH();
 }
 
 static void
diff --git a/src/evergreen_exa.c b/src/evergreen_exa.c
index 5b8a631..ee5b06b 100644
--- a/src/evergreen_exa.c
+++ b/src/evergreen_exa.c
@@ -704,6 +704,14 @@ static uint32_t EVERGREENGetBlendCntl(int op, PicturePtr pMask, uint32_t dst_for
 	} else if (dblend == (BLEND_ONE_MINUS_SRC_ALPHA << COLOR_DESTBLEND_shift)) {
 	    dblend = (BLEND_ONE_MINUS_SRC_COLOR << COLOR_DESTBLEND_shift);
 	}
+
+	/* With some tricks, we can still accelerate PictOpOver with solid src.
+	 * This is commonly used for text rendering, so it's worth the extra
+	 * effort.
+	 */
+	if (sblend == (BLEND_ONE << COLOR_SRCBLEND_shift)) {
+	    sblend = (BLEND_CONSTANT_COLOR << COLOR_SRCBLEND_shift);
+	}
     }
 
     return sblend | dblend;
@@ -1095,12 +1103,17 @@ static Bool EVERGREENCheckComposite(int op, PicturePtr pSrcPicture,
 		/* Check if it's component alpha that relies on a source alpha and
 		 * on the source value.  We can only get one of those into the
 		 * single source value that we get to blend with.
+		 *
+		 * We can cheat a bit if the src is solid, though. PictOpOver
+		 * can use the constant blend color to sneak a second blend
+		 * source in.
 		 */
 		if (EVERGREENBlendOp[op].src_alpha &&
 		    (EVERGREENBlendOp[op].blend_cntl & COLOR_SRCBLEND_mask) !=
 		    (BLEND_ZERO << COLOR_SRCBLEND_shift)) {
-		    RADEON_FALLBACK(("Component alpha not supported with source "
-				     "alpha and source value blending.\n"));
+		    if (pSrcPicture->pDrawable || op != 3)
+			RADEON_FALLBACK(("Component alpha not supported with source "
+					 "alpha and source value blending.\n"));
 		}
 	    }
 
@@ -1196,6 +1209,11 @@ static void EVERGREENSetSolidConsts(ScrnInfoPtr pScrn, float *buf, int format, u
 	} else {
 	    if (accel_state->component_alpha) {
 		if (accel_state->src_alpha) {
+		    /* required for PictOpOver */
+		    float cblend[4] = { pix_r / pix_a, pix_g / pix_a,
+					pix_b / pix_a, pix_a / pix_a };
+		    evergreen_set_blend_color(pScrn, cblend);
+
 		    if (PICT_FORMAT_A(format) == 0) {
 			pix_r = 1.0;
 			pix_g = 1.0;
diff --git a/src/evergreen_state.h b/src/evergreen_state.h
index 3ce2bf2..795d447 100644
--- a/src/evergreen_state.h
+++ b/src/evergreen_state.h
@@ -297,6 +297,8 @@ evergreen_start_3d(ScrnInfoPtr pScrn);
 void
 evergreen_set_render_target(ScrnInfoPtr pScrn, cb_config_t *cb_conf, uint32_t domain);
 void
+evergreen_set_blend_color(ScrnInfoPtr pScrn, float *color);
+void
 evergreen_cp_wait_vline_sync(ScrnInfoPtr pScrn, PixmapPtr pPix, xf86CrtcPtr crtc, int start, int stop);
 void
 evergreen_set_spi(ScrnInfoPtr pScrn, int vs_export_count, int num_interp);
commit 94d0d14914a025525a0766669b556eaa6681def7
Author: Grigori Goronzy <greg at chown.ath.cx>
Date:   Thu Jul 18 16:06:23 2013 +0200

    EXA/evergreen/ni: fast solid pixmap support
    
    Solid pixmaps are currently implemented with scratch pixmaps, which
    is slow. This replaces the hack with a proper implementation. The
    Composite shader can now either sample a src/mask or use a constant
    value.

diff --git a/src/cayman_shader.c b/src/cayman_shader.c
index 2a6d6b1..59f4177 100644
--- a/src/cayman_shader.c
+++ b/src/cayman_shader.c
@@ -2495,17 +2495,44 @@ int cayman_comp_ps(RADEONChipFamily ChipSet, uint32_t* shader)
     int i = 0;
 
     /* 0 */
-    shader[i++] = CF_DWORD0(ADDR(3),
+    /* call interp-fetch-mask if boolean1 == true */
+    shader[i++] = CF_DWORD0(ADDR(12),
 			    JUMPTABLE_SEL(SQ_CF_JUMPTABLE_SEL_CONST_A));
     shader[i++] = CF_DWORD1(POP_COUNT(0),
-                            CF_CONST(0),
+                            CF_CONST(1),
                             COND(SQ_CF_COND_BOOL),
                             I_COUNT(0),
                             VALID_PIXEL_MODE(0),
                             CF_INST(SQ_CF_INST_CALL),
                             BARRIER(0));
+
     /* 1 */
-    shader[i++] = CF_DWORD0(ADDR(8),
+    /* call read-constant-mask if boolean1 == false */
+    shader[i++] = CF_DWORD0(ADDR(15),
+			    JUMPTABLE_SEL(SQ_CF_JUMPTABLE_SEL_CONST_A));
+    shader[i++] = CF_DWORD1(POP_COUNT(0),
+                            CF_CONST(1),
+                            COND(SQ_CF_COND_NOT_BOOL),
+                            I_COUNT(0),
+                            VALID_PIXEL_MODE(0),
+                            CF_INST(SQ_CF_INST_CALL),
+                            BARRIER(0));
+
+    /* 2 */
+    /* call interp-fetch-src if boolean0 == true */
+    shader[i++] = CF_DWORD0(ADDR(7),
+			    JUMPTABLE_SEL(SQ_CF_JUMPTABLE_SEL_CONST_A));
+    shader[i++] = CF_DWORD1(POP_COUNT(0),
+                            CF_CONST(0),
+                            COND(SQ_CF_COND_BOOL),
+                            I_COUNT(0),
+                            VALID_PIXEL_MODE(0),
+                            CF_INST(SQ_CF_INST_CALL),
+                            BARRIER(0));
+
+    /* 3 */
+    /* call read-constant-src if boolean0 == false */
+    shader[i++] = CF_DWORD0(ADDR(10),
 			    JUMPTABLE_SEL(SQ_CF_JUMPTABLE_SEL_CONST_A));
     shader[i++] = CF_DWORD1(POP_COUNT(0),
                             CF_CONST(0),
@@ -2514,7 +2541,41 @@ int cayman_comp_ps(RADEONChipFamily ChipSet, uint32_t* shader)
                             VALID_PIXEL_MODE(0),
                             CF_INST(SQ_CF_INST_CALL),
                             BARRIER(0));
-    /* 2 - end */
+    /* 4 */
+    /* src IN mask (GPR0 := GPR0 .* GPR1) */
+    shader[i++] = CF_ALU_DWORD0(ADDR(17),
+				KCACHE_BANK0(0),
+				KCACHE_BANK1(0),
+				KCACHE_MODE0(SQ_CF_KCACHE_NOP));
+    shader[i++] = CF_ALU_DWORD1(KCACHE_MODE1(SQ_CF_KCACHE_NOP),
+				KCACHE_ADDR0(0),
+				KCACHE_ADDR1(0),
+				I_COUNT(4),
+				ALT_CONST(0),
+				CF_INST(SQ_CF_INST_ALU),
+				WHOLE_QUAD_MODE(0),
+				BARRIER(1));
+
+    /* 5 */
+    /* export pixel data */
+    shader[i++] = CF_ALLOC_IMP_EXP_DWORD0(ARRAY_BASE(CF_PIXEL_MRT0),
+					  TYPE(SQ_EXPORT_PIXEL),
+					  RW_GPR(0),
+					  RW_REL(ABSOLUTE),
+					  INDEX_GPR(0),
+					  ELEM_SIZE(1));
+    shader[i++] = CF_ALLOC_IMP_EXP_DWORD1_SWIZ(SRC_SEL_X(SQ_SEL_X),
+					       SRC_SEL_Y(SQ_SEL_Y),
+					       SRC_SEL_Z(SQ_SEL_Z),
+					       SRC_SEL_W(SQ_SEL_W),
+					       BURST_COUNT(1),
+					       VALID_PIXEL_MODE(0),
+					       CF_INST(SQ_CF_INST_EXPORT_DONE),
+					       MARK(0),
+					       BARRIER(1));
+
+    /* 6 */
+    /* end of program */
     shader[i++] = CF_DWORD0(ADDR(0),
 			    JUMPTABLE_SEL(SQ_CF_JUMPTABLE_SEL_CONST_A));
     shader[i++] = CF_DWORD1(POP_COUNT(0),
@@ -2524,33 +2585,53 @@ int cayman_comp_ps(RADEONChipFamily ChipSet, uint32_t* shader)
 			    VALID_PIXEL_MODE(0),
 			    CF_INST(SQ_CF_INST_END),
 			    BARRIER(1));
-    /* 3 - mask sub */
-    shader[i++] = CF_ALU_DWORD0(ADDR(12),
+
+    /* subroutine interp-fetch-src */
+
+    /* 7 */
+    /* interpolate src */
+    shader[i++] = CF_ALU_DWORD0(ADDR(21),
 				KCACHE_BANK0(0),
 				KCACHE_BANK1(0),
 				KCACHE_MODE0(SQ_CF_KCACHE_NOP));
     shader[i++] = CF_ALU_DWORD1(KCACHE_MODE1(SQ_CF_KCACHE_NOP),
 				KCACHE_ADDR0(0),
 				KCACHE_ADDR1(0),
-				I_COUNT(8),
+				I_COUNT(4),
 				ALT_CONST(0),
 				CF_INST(SQ_CF_INST_ALU),
 				WHOLE_QUAD_MODE(0),
 				BARRIER(1));
 
-    /* 4 */
-    shader[i++] = CF_DWORD0(ADDR(28),
+    /* 8 */
+    /* texture fetch src into GPR0 */
+    shader[i++] = CF_DWORD0(ADDR(26),
 			    JUMPTABLE_SEL(SQ_CF_JUMPTABLE_SEL_CONST_A));
     shader[i++] = CF_DWORD1(POP_COUNT(0),
 			    CF_CONST(0),
 			    COND(SQ_CF_COND_ACTIVE),
-			    I_COUNT(2),
+			    I_COUNT(1),
 			    VALID_PIXEL_MODE(0),
 			    CF_INST(SQ_CF_INST_TC),
 			    BARRIER(1));
 
-    /* 5 */
-    shader[i++] = CF_ALU_DWORD0(ADDR(20),
+    /* 9 */
+    /* return */
+    shader[i++] = CF_DWORD0(ADDR(0),
+			    JUMPTABLE_SEL(SQ_CF_JUMPTABLE_SEL_CONST_A));
+    shader[i++] = CF_DWORD1(POP_COUNT(0),
+			    CF_CONST(0),
+			    COND(SQ_CF_COND_ACTIVE),
+			    I_COUNT(0),
+			    VALID_PIXEL_MODE(0),
+			    CF_INST(SQ_CF_INST_RETURN),
+			    BARRIER(0));
+
+    /* subroutine read-constant-src */
+
+    /* 10 */
+    /* read constants into GPR0 */
+    shader[i++] = CF_ALU_DWORD0(ADDR(28),
 				KCACHE_BANK0(0),
 				KCACHE_BANK1(0),
 				KCACHE_MODE0(SQ_CF_KCACHE_LOCK_1));
@@ -2558,29 +2639,13 @@ int cayman_comp_ps(RADEONChipFamily ChipSet, uint32_t* shader)
 				KCACHE_ADDR0(0),
 				KCACHE_ADDR1(0),
 				I_COUNT(4),
-				ALT_CONST(0),
+				ALT_CONST(1),
 				CF_INST(SQ_CF_INST_ALU),
 				WHOLE_QUAD_MODE(0),
 				BARRIER(1));
 
-    /* 6 */
-    shader[i++] = CF_ALLOC_IMP_EXP_DWORD0(ARRAY_BASE(CF_PIXEL_MRT0),
-					  TYPE(SQ_EXPORT_PIXEL),
-					  RW_GPR(2),
-					  RW_REL(ABSOLUTE),
-					  INDEX_GPR(0),
-					  ELEM_SIZE(1));
-
-    shader[i++] = CF_ALLOC_IMP_EXP_DWORD1_SWIZ(SRC_SEL_X(SQ_SEL_X),
-					       SRC_SEL_Y(SQ_SEL_Y),
-					       SRC_SEL_Z(SQ_SEL_Z),
-					       SRC_SEL_W(SQ_SEL_W),
-					       BURST_COUNT(1),
-					       VALID_PIXEL_MODE(0),
-					       CF_INST(SQ_CF_INST_EXPORT_DONE),
-					       MARK(0),
-					       BARRIER(1));
-    /* 7 */
+    /* 11 */
+    /* return */
     shader[i++] = CF_DWORD0(ADDR(0),
 			    JUMPTABLE_SEL(SQ_CF_JUMPTABLE_SEL_CONST_A));
     shader[i++] = CF_DWORD1(POP_COUNT(0),
@@ -2589,10 +2654,13 @@ int cayman_comp_ps(RADEONChipFamily ChipSet, uint32_t* shader)
 			    I_COUNT(0),
 			    VALID_PIXEL_MODE(0),
 			    CF_INST(SQ_CF_INST_RETURN),
-			    BARRIER(1));
+			    BARRIER(0));
 
-    /* 8 - non-mask sub */
-    shader[i++] = CF_ALU_DWORD0(ADDR(24),
+    /* subroutine interp-fetch-mask */
+
+    /* 12 */
+    /* interpolate mask */
+    shader[i++] = CF_ALU_DWORD0(ADDR(32),
 				KCACHE_BANK0(0),
 				KCACHE_BANK1(0),
 				KCACHE_MODE0(SQ_CF_KCACHE_NOP));
@@ -2604,8 +2672,10 @@ int cayman_comp_ps(RADEONChipFamily ChipSet, uint32_t* shader)
 				CF_INST(SQ_CF_INST_ALU),
 				WHOLE_QUAD_MODE(0),
 				BARRIER(1));
-    /* 9 */
-    shader[i++] = CF_DWORD0(ADDR(32),
+
+    /* 13 */
+    /* texture fetch mask into GPR1 */
+    shader[i++] = CF_DWORD0(ADDR(36),
 			    JUMPTABLE_SEL(SQ_CF_JUMPTABLE_SEL_CONST_A));
     shader[i++] = CF_DWORD1(POP_COUNT(0),
 			    CF_CONST(0),
@@ -2615,24 +2685,37 @@ int cayman_comp_ps(RADEONChipFamily ChipSet, uint32_t* shader)
 			    CF_INST(SQ_CF_INST_TC),
 			    BARRIER(1));
 
-    /* 10 */
-    shader[i++] = CF_ALLOC_IMP_EXP_DWORD0(ARRAY_BASE(CF_PIXEL_MRT0),
-					  TYPE(SQ_EXPORT_PIXEL),
-					  RW_GPR(0),
-					  RW_REL(ABSOLUTE),
-					  INDEX_GPR(0),
-					  ELEM_SIZE(1));
-    shader[i++] = CF_ALLOC_IMP_EXP_DWORD1_SWIZ(SRC_SEL_X(SQ_SEL_X),
-					       SRC_SEL_Y(SQ_SEL_Y),
-					       SRC_SEL_Z(SQ_SEL_Z),
-					       SRC_SEL_W(SQ_SEL_W),
-					       BURST_COUNT(1),
-					       VALID_PIXEL_MODE(0),
-					       CF_INST(SQ_CF_INST_EXPORT_DONE),
-					       MARK(0),
-					       BARRIER(1));
+    /* 14 */
+    /* return */
+    shader[i++] = CF_DWORD0(ADDR(0),
+			    JUMPTABLE_SEL(SQ_CF_JUMPTABLE_SEL_CONST_A));
+    shader[i++] = CF_DWORD1(POP_COUNT(0),
+			    CF_CONST(0),
+			    COND(SQ_CF_COND_ACTIVE),
+			    I_COUNT(0),
+			    VALID_PIXEL_MODE(0),
+			    CF_INST(SQ_CF_INST_RETURN),
+			    BARRIER(0));
 
-    /* 11 */
+    /* subroutine read-constant-mask */
+
+    /* 15 */
+    /* read constants into GPR1 */
+    shader[i++] = CF_ALU_DWORD0(ADDR(38),
+				KCACHE_BANK0(0),
+				KCACHE_BANK1(0),
+				KCACHE_MODE0(SQ_CF_KCACHE_LOCK_1));
+    shader[i++] = CF_ALU_DWORD1(KCACHE_MODE1(SQ_CF_KCACHE_NOP),
+				KCACHE_ADDR0(0),
+				KCACHE_ADDR1(0),
+				I_COUNT(4),
+				ALT_CONST(1),
+				CF_INST(SQ_CF_INST_ALU),
+				WHOLE_QUAD_MODE(0),
+				BARRIER(1));
+
+    /* 16 */
+    /* return */
     shader[i++] = CF_DWORD0(ADDR(0),
 			    JUMPTABLE_SEL(SQ_CF_JUMPTABLE_SEL_CONST_A));
     shader[i++] = CF_DWORD1(POP_COUNT(0),
@@ -2641,18 +2724,21 @@ int cayman_comp_ps(RADEONChipFamily ChipSet, uint32_t* shader)
 			    I_COUNT(0),
 			    VALID_PIXEL_MODE(0),
 			    CF_INST(SQ_CF_INST_RETURN),
-			    BARRIER(1));
+			    BARRIER(0));
+
+    /* ALU clauses */
 
-    /* 12 interpolate src tex coords - mask */
+    /* 17 */
+    /* MUL gpr[0].x gpr[0].x gpr[1].x */
     shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
 			     SRC0_REL(ABSOLUTE),
-			     SRC0_ELEM(ELEM_Y),
+			     SRC0_ELEM(ELEM_X),
 			     SRC0_NEG(0),
-			     SRC1_SEL(ALU_SRC_PARAM_BASE + 0),
+			     SRC1_SEL(ALU_SRC_GPR_BASE + 1),
 			     SRC1_REL(ABSOLUTE),
 			     SRC1_ELEM(ELEM_X),
 			     SRC1_NEG(0),
-			     INDEX_MODE(SQ_INDEX_AR_X),
+			     INDEX_MODE(SQ_INDEX_LOOP),
 			     PRED_SEL(SQ_PRED_SEL_OFF),
 			     LAST(0));
     shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0),
@@ -2661,22 +2747,24 @@ int cayman_comp_ps(RADEONChipFamily ChipSet, uint32_t* shader)
 				 UPDATE_PRED(0),
 				 WRITE_MASK(1),
 				 OMOD(SQ_ALU_OMOD_OFF),
-				 ALU_INST(SQ_OP2_INST_INTERP_XY),
-				 BANK_SWIZZLE(SQ_ALU_VEC_210),
-				 DST_GPR(1),
+				 ALU_INST(SQ_OP2_INST_MUL),
+				 BANK_SWIZZLE(SQ_ALU_VEC_012),
+				 DST_GPR(0),
 				 DST_REL(ABSOLUTE),
 				 DST_ELEM(ELEM_X),
-				 CLAMP(0));
-    /* 13 */
+				 CLAMP(1));
+
+    /* 18 */
+    /* MUL gpr[0].y gpr[0].y gpr[1].y */
     shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
 			     SRC0_REL(ABSOLUTE),
-			     SRC0_ELEM(ELEM_X),
+			     SRC0_ELEM(ELEM_Y),
 			     SRC0_NEG(0),
-			     SRC1_SEL(ALU_SRC_PARAM_BASE + 0),
+			     SRC1_SEL(ALU_SRC_GPR_BASE + 1),
 			     SRC1_REL(ABSOLUTE),
-			     SRC1_ELEM(ELEM_X),
+			     SRC1_ELEM(ELEM_Y),
 			     SRC1_NEG(0),
-			     INDEX_MODE(SQ_INDEX_AR_X),
+			     INDEX_MODE(SQ_INDEX_LOOP),
 			     PRED_SEL(SQ_PRED_SEL_OFF),
 			     LAST(0));
     shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0),
@@ -2685,67 +2773,70 @@ int cayman_comp_ps(RADEONChipFamily ChipSet, uint32_t* shader)
 				 UPDATE_PRED(0),
 				 WRITE_MASK(1),
 				 OMOD(SQ_ALU_OMOD_OFF),
-				 ALU_INST(SQ_OP2_INST_INTERP_XY),
-				 BANK_SWIZZLE(SQ_ALU_VEC_210),
-				 DST_GPR(1),
+				 ALU_INST(SQ_OP2_INST_MUL),
+				 BANK_SWIZZLE(SQ_ALU_VEC_012),
+				 DST_GPR(0),
 				 DST_REL(ABSOLUTE),
 				 DST_ELEM(ELEM_Y),
-				 CLAMP(0));
-    /* 14 */
+				 CLAMP(1));
+    /* 19 */
+    /* MUL gpr[0].z gpr[0].z gpr[1].z */
     shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
 			     SRC0_REL(ABSOLUTE),
-			     SRC0_ELEM(ELEM_Y),
+			     SRC0_ELEM(ELEM_Z),
 			     SRC0_NEG(0),
-			     SRC1_SEL(ALU_SRC_PARAM_BASE + 0),
+			     SRC1_SEL(ALU_SRC_GPR_BASE + 1),
 			     SRC1_REL(ABSOLUTE),
-			     SRC1_ELEM(ELEM_X),
+			     SRC1_ELEM(ELEM_Z),
 			     SRC1_NEG(0),
-			     INDEX_MODE(SQ_INDEX_AR_X),
+			     INDEX_MODE(SQ_INDEX_LOOP),
 			     PRED_SEL(SQ_PRED_SEL_OFF),
 			     LAST(0));
     shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0),
 				 SRC1_ABS(0),
 				 UPDATE_EXECUTE_MASK(0),
 				 UPDATE_PRED(0),
-				 WRITE_MASK(0),
+				 WRITE_MASK(1),
 				 OMOD(SQ_ALU_OMOD_OFF),
-				 ALU_INST(SQ_OP2_INST_INTERP_XY),
-				 BANK_SWIZZLE(SQ_ALU_VEC_210),
-				 DST_GPR(1),
+				 ALU_INST(SQ_OP2_INST_MUL),
+				 BANK_SWIZZLE(SQ_ALU_VEC_012),
+				 DST_GPR(0),
 				 DST_REL(ABSOLUTE),
 				 DST_ELEM(ELEM_Z),
-				 CLAMP(0));
-    /* 15 */
+				 CLAMP(1));
+    /* 20 */
+    /* MUL gpr[0].w gpr[0].w gpr[1].w */
     shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
 			     SRC0_REL(ABSOLUTE),
-			     SRC0_ELEM(ELEM_X),
+			     SRC0_ELEM(ELEM_W),
 			     SRC0_NEG(0),
-			     SRC1_SEL(ALU_SRC_PARAM_BASE + 0),
+			     SRC1_SEL(ALU_SRC_GPR_BASE + 1),
 			     SRC1_REL(ABSOLUTE),
-			     SRC1_ELEM(ELEM_X),
+			     SRC1_ELEM(ELEM_W),
 			     SRC1_NEG(0),
-			     INDEX_MODE(SQ_INDEX_AR_X),
+			     INDEX_MODE(SQ_INDEX_LOOP),
 			     PRED_SEL(SQ_PRED_SEL_OFF),
 			     LAST(1));
     shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0),
 				 SRC1_ABS(0),
 				 UPDATE_EXECUTE_MASK(0),
 				 UPDATE_PRED(0),
-				 WRITE_MASK(0),
+				 WRITE_MASK(1),
 				 OMOD(SQ_ALU_OMOD_OFF),
-				 ALU_INST(SQ_OP2_INST_INTERP_XY),
-				 BANK_SWIZZLE(SQ_ALU_VEC_210),
-				 DST_GPR(1),
+				 ALU_INST(SQ_OP2_INST_MUL),
+				 BANK_SWIZZLE(SQ_ALU_VEC_012),
+				 DST_GPR(0),
 				 DST_REL(ABSOLUTE),
 				 DST_ELEM(ELEM_W),
-				 CLAMP(0));
+				 CLAMP(1));
 
-    /* 16 interpolate mask tex coords */
+    /* 21 */
+    /* INTERP_XY GPR0.x, GPR0.y PARAM0.x */
     shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
 			     SRC0_REL(ABSOLUTE),
 			     SRC0_ELEM(ELEM_Y),
 			     SRC0_NEG(0),
-			     SRC1_SEL(ALU_SRC_PARAM_BASE + 1),
+			     SRC1_SEL(ALU_SRC_PARAM_BASE + 0),
 			     SRC1_REL(ABSOLUTE),
 			     SRC1_ELEM(ELEM_X),
 			     SRC1_NEG(0),
@@ -2764,12 +2855,13 @@ int cayman_comp_ps(RADEONChipFamily ChipSet, uint32_t* shader)
 				 DST_REL(ABSOLUTE),
 				 DST_ELEM(ELEM_X),
 				 CLAMP(0));
-    /* 17 */
+    /* 22 */
+    /* INTERP_XY GPR0.y, GPR0.x PARAM0.x */
     shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
 			     SRC0_REL(ABSOLUTE),
 			     SRC0_ELEM(ELEM_X),
 			     SRC0_NEG(0),
-			     SRC1_SEL(ALU_SRC_PARAM_BASE + 1),
+			     SRC1_SEL(ALU_SRC_PARAM_BASE + 0),
 			     SRC1_REL(ABSOLUTE),
 			     SRC1_ELEM(ELEM_X),
 			     SRC1_NEG(0),
@@ -2788,12 +2880,13 @@ int cayman_comp_ps(RADEONChipFamily ChipSet, uint32_t* shader)
 				 DST_REL(ABSOLUTE),
 				 DST_ELEM(ELEM_Y),
 				 CLAMP(0));
-    /* 18 */
+    /* 23 */
+    /* INTERP_XY GPR0.z, GPR0.y PARAM0.x */
     shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
 			     SRC0_REL(ABSOLUTE),
 			     SRC0_ELEM(ELEM_Y),
 			     SRC0_NEG(0),
-			     SRC1_SEL(ALU_SRC_PARAM_BASE + 1),
+			     SRC1_SEL(ALU_SRC_PARAM_BASE + 0),
 			     SRC1_REL(ABSOLUTE),
 			     SRC1_ELEM(ELEM_X),
 			     SRC1_NEG(0),
@@ -2812,12 +2905,14 @@ int cayman_comp_ps(RADEONChipFamily ChipSet, uint32_t* shader)
 				 DST_REL(ABSOLUTE),
 				 DST_ELEM(ELEM_Z),
 				 CLAMP(0));
-    /* 19 */
+
+    /* 24 */
+    /* INTERP_XY GPR0.w, GPR0.x PARAM0.x */
     shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
 			     SRC0_REL(ABSOLUTE),
 			     SRC0_ELEM(ELEM_X),
 			     SRC0_NEG(0),
-			     SRC1_SEL(ALU_SRC_PARAM_BASE + 1),
+			     SRC1_SEL(ALU_SRC_PARAM_BASE + 0),
 			     SRC1_REL(ABSOLUTE),
 			     SRC1_ELEM(ELEM_X),
 			     SRC1_NEG(0),
@@ -2837,17 +2932,53 @@ int cayman_comp_ps(RADEONChipFamily ChipSet, uint32_t* shader)
 				 DST_ELEM(ELEM_W),
 				 CLAMP(0));
 
-    /* 20 - alu 0 */
-    /* MUL gpr[2].x gpr[0].x gpr[1].x */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
+    /* 25 */
+    shader[i++] = 0;
+    shader[i++] = 0;
+
+    /* 26/27 */
+    /* SAMPLE RID=0 GPR0, GPR0 */
+    shader[i++] = TEX_DWORD0(TEX_INST(SQ_TEX_INST_SAMPLE),
+			     INST_MOD(0),
+			     FETCH_WHOLE_QUAD(0),
+			     RESOURCE_ID(0),
+			     SRC_GPR(0),
+			     SRC_REL(ABSOLUTE),
+			     ALT_CONST(0),
+			     RESOURCE_INDEX_MODE(SQ_CF_INDEX_NONE),
+			     SAMPLER_INDEX_MODE(SQ_CF_INDEX_NONE));
+    shader[i++] = TEX_DWORD1(DST_GPR(0),
+			     DST_REL(ABSOLUTE),
+			     DST_SEL_X(SQ_SEL_X),
+			     DST_SEL_Y(SQ_SEL_Y),
+			     DST_SEL_Z(SQ_SEL_Z),
+			     DST_SEL_W(SQ_SEL_W),
+			     LOD_BIAS(0),
+			     COORD_TYPE_X(TEX_NORMALIZED),
+			     COORD_TYPE_Y(TEX_NORMALIZED),
+			     COORD_TYPE_Z(TEX_NORMALIZED),
+			     COORD_TYPE_W(TEX_NORMALIZED));
+    shader[i++] = TEX_DWORD2(OFFSET_X(0),
+			     OFFSET_Y(0),
+			     OFFSET_Z(0),
+			     SAMPLER_ID(0),
+			     SRC_SEL_X(SQ_SEL_X),
+			     SRC_SEL_Y(SQ_SEL_Y),
+			     SRC_SEL_Z(SQ_SEL_0),
+			     SRC_SEL_W(SQ_SEL_1));
+    shader[i++] = TEX_DWORD_PAD;
+
+    /* 28 */
+    /* MOV GPR0.x, KC4.x */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_KCACHE0_BASE + 4),
 			     SRC0_REL(ABSOLUTE),
 			     SRC0_ELEM(ELEM_X),
 			     SRC0_NEG(0),
-			     SRC1_SEL(ALU_SRC_GPR_BASE + 1),
+			     SRC1_SEL(ALU_SRC_GPR_BASE + 0),
 			     SRC1_REL(ABSOLUTE),
 			     SRC1_ELEM(ELEM_X),
 			     SRC1_NEG(0),
-			     INDEX_MODE(SQ_INDEX_LOOP),
+			     INDEX_MODE(SQ_INDEX_AR_X),
 			     PRED_SEL(SQ_PRED_SEL_OFF),
 			     LAST(0));
     shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0),
@@ -2856,23 +2987,24 @@ int cayman_comp_ps(RADEONChipFamily ChipSet, uint32_t* shader)
 				 UPDATE_PRED(0),
 				 WRITE_MASK(1),
 				 OMOD(SQ_ALU_OMOD_OFF),
-				 ALU_INST(SQ_OP2_INST_MUL),
+				 ALU_INST(SQ_OP2_INST_MOV),
 				 BANK_SWIZZLE(SQ_ALU_VEC_012),
-				 DST_GPR(2),
+				 DST_GPR(0),
 				 DST_REL(ABSOLUTE),
 				 DST_ELEM(ELEM_X),
 				 CLAMP(1));
-    /* 21 - alu 1 */
-    /* MUL gpr[2].y gpr[0].y gpr[1].y */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
+
+    /* 29 */
+    /* MOV GPR0.y, KC4.y */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_KCACHE0_BASE + 4),
 			     SRC0_REL(ABSOLUTE),
 			     SRC0_ELEM(ELEM_Y),
 			     SRC0_NEG(0),
-			     SRC1_SEL(ALU_SRC_GPR_BASE + 1),
+			     SRC1_SEL(ALU_SRC_GPR_BASE + 0),
 			     SRC1_REL(ABSOLUTE),
-			     SRC1_ELEM(ELEM_Y),
+			     SRC1_ELEM(ELEM_X),
 			     SRC1_NEG(0),
-			     INDEX_MODE(SQ_INDEX_LOOP),
+			     INDEX_MODE(SQ_INDEX_AR_X),
 			     PRED_SEL(SQ_PRED_SEL_OFF),
 			     LAST(0));
     shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0),
@@ -2881,23 +3013,24 @@ int cayman_comp_ps(RADEONChipFamily ChipSet, uint32_t* shader)
 				 UPDATE_PRED(0),
 				 WRITE_MASK(1),
 				 OMOD(SQ_ALU_OMOD_OFF),
-				 ALU_INST(SQ_OP2_INST_MUL),
+				 ALU_INST(SQ_OP2_INST_MOV),
 				 BANK_SWIZZLE(SQ_ALU_VEC_012),
-				 DST_GPR(2),
+				 DST_GPR(0),
 				 DST_REL(ABSOLUTE),
 				 DST_ELEM(ELEM_Y),
 				 CLAMP(1));
-    /* 22 - alu 2 */
-    /* MUL gpr[2].z gpr[0].z gpr[1].z */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
+
+    /* 30  */
+    /* MOV GPR0.z, KC4.z */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_KCACHE0_BASE + 4),
 			     SRC0_REL(ABSOLUTE),
 			     SRC0_ELEM(ELEM_Z),
 			     SRC0_NEG(0),
-			     SRC1_SEL(ALU_SRC_GPR_BASE + 1),
+			     SRC1_SEL(ALU_SRC_GPR_BASE + 0),
 			     SRC1_REL(ABSOLUTE),
-			     SRC1_ELEM(ELEM_Z),
+			     SRC1_ELEM(ELEM_X),
 			     SRC1_NEG(0),
-			     INDEX_MODE(SQ_INDEX_LOOP),
+			     INDEX_MODE(SQ_INDEX_AR_X),
 			     PRED_SEL(SQ_PRED_SEL_OFF),
 			     LAST(0));
     shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0),
@@ -2906,23 +3039,24 @@ int cayman_comp_ps(RADEONChipFamily ChipSet, uint32_t* shader)
 				 UPDATE_PRED(0),
 				 WRITE_MASK(1),
 				 OMOD(SQ_ALU_OMOD_OFF),
-				 ALU_INST(SQ_OP2_INST_MUL),
+				 ALU_INST(SQ_OP2_INST_MOV),
 				 BANK_SWIZZLE(SQ_ALU_VEC_012),
-				 DST_GPR(2),
+				 DST_GPR(0),
 				 DST_REL(ABSOLUTE),
 				 DST_ELEM(ELEM_Z),
 				 CLAMP(1));
-    /* 23 - alu 3 */
-    /* MUL gpr[2].w gpr[0].w gpr[1].w */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
+
+    /* 31 */
+    /* MOV GPR0.w, KC4.w */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_KCACHE0_BASE + 4),
 			     SRC0_REL(ABSOLUTE),
 			     SRC0_ELEM(ELEM_W),
 			     SRC0_NEG(0),
-			     SRC1_SEL(ALU_SRC_GPR_BASE + 1),
+			     SRC1_SEL(ALU_SRC_GPR_BASE + 0),
 			     SRC1_REL(ABSOLUTE),
-			     SRC1_ELEM(ELEM_W),
+			     SRC1_ELEM(ELEM_X),
 			     SRC1_NEG(0),
-			     INDEX_MODE(SQ_INDEX_LOOP),
+			     INDEX_MODE(SQ_INDEX_AR_X),
 			     PRED_SEL(SQ_PRED_SEL_OFF),
 			     LAST(1));
     shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0),
@@ -2931,19 +3065,20 @@ int cayman_comp_ps(RADEONChipFamily ChipSet, uint32_t* shader)
 				 UPDATE_PRED(0),
 				 WRITE_MASK(1),
 				 OMOD(SQ_ALU_OMOD_OFF),
-				 ALU_INST(SQ_OP2_INST_MUL),
+				 ALU_INST(SQ_OP2_INST_MOV),
 				 BANK_SWIZZLE(SQ_ALU_VEC_012),
-				 DST_GPR(2),
+				 DST_GPR(0),
 				 DST_REL(ABSOLUTE),
 				 DST_ELEM(ELEM_W),
 				 CLAMP(1));
 
-    /* 24 - interpolate tex coords - non-mask */
+    /* 32 */
+    /* INTERP_XY GPR1.x, PARAM1 */
     shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
 			     SRC0_REL(ABSOLUTE),
 			     SRC0_ELEM(ELEM_Y),
 			     SRC0_NEG(0),
-			     SRC1_SEL(ALU_SRC_PARAM_BASE + 0),
+			     SRC1_SEL(ALU_SRC_PARAM_BASE + 1),
 			     SRC1_REL(ABSOLUTE),
 			     SRC1_ELEM(ELEM_X),
 			     SRC1_NEG(0),
@@ -2958,16 +3093,17 @@ int cayman_comp_ps(RADEONChipFamily ChipSet, uint32_t* shader)
 				 OMOD(SQ_ALU_OMOD_OFF),
 				 ALU_INST(SQ_OP2_INST_INTERP_XY),
 				 BANK_SWIZZLE(SQ_ALU_VEC_210),
-				 DST_GPR(0),
+				 DST_GPR(1),
 				 DST_REL(ABSOLUTE),
 				 DST_ELEM(ELEM_X),
 				 CLAMP(0));
-    /* 25 */
+    /* 33 */
+    /* INTERP_XY GPR1.y, PARAM1 */
     shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
 			     SRC0_REL(ABSOLUTE),
 			     SRC0_ELEM(ELEM_X),
 			     SRC0_NEG(0),
-			     SRC1_SEL(ALU_SRC_PARAM_BASE + 0),
+			     SRC1_SEL(ALU_SRC_PARAM_BASE + 1),
 			     SRC1_REL(ABSOLUTE),
 			     SRC1_ELEM(ELEM_X),
 			     SRC1_NEG(0),
@@ -2982,16 +3118,17 @@ int cayman_comp_ps(RADEONChipFamily ChipSet, uint32_t* shader)
 				 OMOD(SQ_ALU_OMOD_OFF),
 				 ALU_INST(SQ_OP2_INST_INTERP_XY),
 				 BANK_SWIZZLE(SQ_ALU_VEC_210),
-				 DST_GPR(0),
+				 DST_GPR(1),
 				 DST_REL(ABSOLUTE),
 				 DST_ELEM(ELEM_Y),
 				 CLAMP(0));
-    /* 26 */
+    /* 34 */
+    /* INTERP_XY GPR1.z, PARAM1 */
     shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
 			     SRC0_REL(ABSOLUTE),
 			     SRC0_ELEM(ELEM_Y),
 			     SRC0_NEG(0),
-			     SRC1_SEL(ALU_SRC_PARAM_BASE + 0),
+			     SRC1_SEL(ALU_SRC_PARAM_BASE + 1),
 			     SRC1_REL(ABSOLUTE),
 			     SRC1_ELEM(ELEM_X),
 			     SRC1_NEG(0),
@@ -3006,16 +3143,17 @@ int cayman_comp_ps(RADEONChipFamily ChipSet, uint32_t* shader)
 				 OMOD(SQ_ALU_OMOD_OFF),
 				 ALU_INST(SQ_OP2_INST_INTERP_XY),
 				 BANK_SWIZZLE(SQ_ALU_VEC_210),
-				 DST_GPR(0),
+				 DST_GPR(1),
 				 DST_REL(ABSOLUTE),
 				 DST_ELEM(ELEM_Z),
 				 CLAMP(0));
-    /* 27 */
+    /* 35 */
+    /* INTERP_XY GPR1.w, PARAM1 */
     shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
 			     SRC0_REL(ABSOLUTE),
 			     SRC0_ELEM(ELEM_X),
 			     SRC0_NEG(0),
-			     SRC1_SEL(ALU_SRC_PARAM_BASE + 0),
+			     SRC1_SEL(ALU_SRC_PARAM_BASE + 1),
 			     SRC1_REL(ABSOLUTE),
 			     SRC1_ELEM(ELEM_X),
 			     SRC1_NEG(0),
@@ -3030,16 +3168,17 @@ int cayman_comp_ps(RADEONChipFamily ChipSet, uint32_t* shader)
 				 OMOD(SQ_ALU_OMOD_OFF),
 				 ALU_INST(SQ_OP2_INST_INTERP_XY),
 				 BANK_SWIZZLE(SQ_ALU_VEC_210),
-				 DST_GPR(0),
+				 DST_GPR(1),
 				 DST_REL(ABSOLUTE),
 				 DST_ELEM(ELEM_W),
 				 CLAMP(0));
 
-    /* 28/29 - src - mask */
+    /* 36/37 */
+    /* SAMPLE RID=1 GPR1, GPR1 */
     shader[i++] = TEX_DWORD0(TEX_INST(SQ_TEX_INST_SAMPLE),
 			     INST_MOD(0),
 			     FETCH_WHOLE_QUAD(0),
-			     RESOURCE_ID(0),
+			     RESOURCE_ID(1),
 			     SRC_GPR(1),
 			     SRC_REL(ABSOLUTE),
 			     ALT_CONST(0),
@@ -3059,36 +3198,6 @@ int cayman_comp_ps(RADEONChipFamily ChipSet, uint32_t* shader)
     shader[i++] = TEX_DWORD2(OFFSET_X(0),
 			     OFFSET_Y(0),
 			     OFFSET_Z(0),
-			     SAMPLER_ID(0),
-			     SRC_SEL_X(SQ_SEL_X),
-			     SRC_SEL_Y(SQ_SEL_Y),
-			     SRC_SEL_Z(SQ_SEL_0),
-			     SRC_SEL_W(SQ_SEL_1));
-    shader[i++] = TEX_DWORD_PAD;
-    /* 30/31 - mask */
-    shader[i++] = TEX_DWORD0(TEX_INST(SQ_TEX_INST_SAMPLE),
-			     INST_MOD(0),
-			     FETCH_WHOLE_QUAD(0),
-			     RESOURCE_ID(1),
-			     SRC_GPR(0),
-			     SRC_REL(ABSOLUTE),
-                             ALT_CONST(0),
-                             RESOURCE_INDEX_MODE(SQ_CF_INDEX_NONE),
-                             SAMPLER_INDEX_MODE(SQ_CF_INDEX_NONE));
-    shader[i++] = TEX_DWORD1(DST_GPR(0),
-			     DST_REL(ABSOLUTE),
-			     DST_SEL_X(SQ_SEL_X),
-			     DST_SEL_Y(SQ_SEL_Y),
-			     DST_SEL_Z(SQ_SEL_Z),
-			     DST_SEL_W(SQ_SEL_W),
-			     LOD_BIAS(0),
-			     COORD_TYPE_X(TEX_NORMALIZED),
-			     COORD_TYPE_Y(TEX_NORMALIZED),
-			     COORD_TYPE_Z(TEX_NORMALIZED),
-			     COORD_TYPE_W(TEX_NORMALIZED));
-    shader[i++] = TEX_DWORD2(OFFSET_X(0),
-			     OFFSET_Y(0),
-			     OFFSET_Z(0),
 			     SAMPLER_ID(1),
 			     SRC_SEL_X(SQ_SEL_X),
 			     SRC_SEL_Y(SQ_SEL_Y),
@@ -3096,36 +3205,109 @@ int cayman_comp_ps(RADEONChipFamily ChipSet, uint32_t* shader)
 			     SRC_SEL_W(SQ_SEL_1));
     shader[i++] = TEX_DWORD_PAD;
 
-    /* 32/33 - src - non-mask */
-    shader[i++] = TEX_DWORD0(TEX_INST(SQ_TEX_INST_SAMPLE),
-			     INST_MOD(0),
-			     FETCH_WHOLE_QUAD(0),
-			     RESOURCE_ID(0),
-			     SRC_GPR(0),
-			     SRC_REL(ABSOLUTE),
-			     ALT_CONST(0),
-			     RESOURCE_INDEX_MODE(SQ_CF_INDEX_NONE),
-			     SAMPLER_INDEX_MODE(SQ_CF_INDEX_NONE));
-    shader[i++] = TEX_DWORD1(DST_GPR(0),
-			     DST_REL(ABSOLUTE),
-			     DST_SEL_X(SQ_SEL_X),
-			     DST_SEL_Y(SQ_SEL_Y),
-			     DST_SEL_Z(SQ_SEL_Z),
-			     DST_SEL_W(SQ_SEL_W),
-			     LOD_BIAS(0),
-			     COORD_TYPE_X(TEX_NORMALIZED),
-			     COORD_TYPE_Y(TEX_NORMALIZED),
-			     COORD_TYPE_Z(TEX_NORMALIZED),
-			     COORD_TYPE_W(TEX_NORMALIZED));
-    shader[i++] = TEX_DWORD2(OFFSET_X(0),
-			     OFFSET_Y(0),
-			     OFFSET_Z(0),
-			     SAMPLER_ID(0),
-			     SRC_SEL_X(SQ_SEL_X),
-			     SRC_SEL_Y(SQ_SEL_Y),
-			     SRC_SEL_Z(SQ_SEL_0),
-			     SRC_SEL_W(SQ_SEL_1));
-    shader[i++] = TEX_DWORD_PAD;
+    /* 38 */
+    /* MOV GPR1.x, KC5.x */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_KCACHE0_BASE + 5),
+			     SRC0_REL(ABSOLUTE),
+			     SRC0_ELEM(ELEM_X),
+			     SRC0_NEG(0),
+			     SRC1_SEL(ALU_SRC_GPR_BASE + 0),
+			     SRC1_REL(ABSOLUTE),
+			     SRC1_ELEM(ELEM_X),
+			     SRC1_NEG(0),
+			     INDEX_MODE(SQ_INDEX_AR_X),
+			     PRED_SEL(SQ_PRED_SEL_OFF),
+			     LAST(0));
+    shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0),
+				 SRC1_ABS(0),
+				 UPDATE_EXECUTE_MASK(0),
+				 UPDATE_PRED(0),
+				 WRITE_MASK(1),
+				 OMOD(SQ_ALU_OMOD_OFF),
+				 ALU_INST(SQ_OP2_INST_MOV),
+				 BANK_SWIZZLE(SQ_ALU_VEC_012),
+				 DST_GPR(1),
+				 DST_REL(ABSOLUTE),
+				 DST_ELEM(ELEM_X),
+				 CLAMP(1));
+
+    /* 39 */
+    /* MOV GPR1.y, KC5.y */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_KCACHE0_BASE + 5),
+			     SRC0_REL(ABSOLUTE),
+			     SRC0_ELEM(ELEM_Y),
+			     SRC0_NEG(0),
+			     SRC1_SEL(ALU_SRC_GPR_BASE + 0),
+			     SRC1_REL(ABSOLUTE),
+			     SRC1_ELEM(ELEM_X),
+			     SRC1_NEG(0),
+			     INDEX_MODE(SQ_INDEX_AR_X),
+			     PRED_SEL(SQ_PRED_SEL_OFF),
+			     LAST(0));
+    shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0),
+				 SRC1_ABS(0),
+				 UPDATE_EXECUTE_MASK(0),
+				 UPDATE_PRED(0),
+				 WRITE_MASK(1),
+				 OMOD(SQ_ALU_OMOD_OFF),
+				 ALU_INST(SQ_OP2_INST_MOV),
+				 BANK_SWIZZLE(SQ_ALU_VEC_012),
+				 DST_GPR(1),
+				 DST_REL(ABSOLUTE),
+				 DST_ELEM(ELEM_Y),
+				 CLAMP(1));
+
+    /* 40 */
+    /* MOV GPR1.z, KC5.z */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_KCACHE0_BASE + 5),
+			     SRC0_REL(ABSOLUTE),
+			     SRC0_ELEM(ELEM_Z),
+			     SRC0_NEG(0),
+			     SRC1_SEL(ALU_SRC_GPR_BASE + 0),
+			     SRC1_REL(ABSOLUTE),
+			     SRC1_ELEM(ELEM_X),
+			     SRC1_NEG(0),
+			     INDEX_MODE(SQ_INDEX_AR_X),
+			     PRED_SEL(SQ_PRED_SEL_OFF),
+			     LAST(0));
+    shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0),
+				 SRC1_ABS(0),
+				 UPDATE_EXECUTE_MASK(0),
+				 UPDATE_PRED(0),
+				 WRITE_MASK(1),
+				 OMOD(SQ_ALU_OMOD_OFF),
+				 ALU_INST(SQ_OP2_INST_MOV),
+				 BANK_SWIZZLE(SQ_ALU_VEC_012),
+				 DST_GPR(1),
+				 DST_REL(ABSOLUTE),
+				 DST_ELEM(ELEM_Z),
+				 CLAMP(1));
+
+    /* 41 */
+    /* MOV GPR1.w, KC5.w */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_KCACHE0_BASE + 5),
+			     SRC0_REL(ABSOLUTE),
+			     SRC0_ELEM(ELEM_W),
+			     SRC0_NEG(0),
+			     SRC1_SEL(ALU_SRC_GPR_BASE + 0),
+			     SRC1_REL(ABSOLUTE),
+			     SRC1_ELEM(ELEM_X),
+			     SRC1_NEG(0),
+			     INDEX_MODE(SQ_INDEX_AR_X),
+			     PRED_SEL(SQ_PRED_SEL_OFF),
+			     LAST(1));
+    shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0),
+				 SRC1_ABS(0),
+				 UPDATE_EXECUTE_MASK(0),
+				 UPDATE_PRED(0),
+				 WRITE_MASK(1),
+				 OMOD(SQ_ALU_OMOD_OFF),
+				 ALU_INST(SQ_OP2_INST_MOV),
+				 BANK_SWIZZLE(SQ_ALU_VEC_012),
+				 DST_GPR(1),
+				 DST_REL(ABSOLUTE),
+				 DST_ELEM(ELEM_W),
+				 CLAMP(1));
 
     return i;
 }
diff --git a/src/evergreen_exa.c b/src/evergreen_exa.c
index 2cdce0f..5b8a631 100644
--- a/src/evergreen_exa.c
+++ b/src/evergreen_exa.c
@@ -777,10 +777,9 @@ static Bool EVERGREENCheckCompositeTexture(PicturePtr pPict,
     return TRUE;
 }
 
-static void EVERGREENXFormSetup(PicturePtr pPict, PixmapPtr pPix,
+static void EVERGREENXFormSetup(PicturePtr pPict, ScrnInfoPtr pScrn,
 				int unit, float *vs_alu_consts)
 {
-    ScrnInfoPtr pScrn = xf86ScreenToScrn(pPix->drawable.pScreen);
     RADEONInfoPtr info = RADEONPTR(pScrn);
     struct radeon_accel_state *accel_state = info->accel_state;
     int const_offset = unit * 8;
@@ -1118,6 +1117,134 @@ static Bool EVERGREENCheckComposite(int op, PicturePtr pSrcPicture,
 
 }
 
+static void EVERGREENSetSolidConsts(ScrnInfoPtr pScrn, float *buf, int format, uint32_t fg, int unit)
+{   /* Writes four float colour constants for a solid picture into buf[0..3]; callers pass unit 0 for src, 1 for mask. */
+    RADEONInfoPtr info = RADEONPTR(pScrn);
+    struct radeon_accel_state *accel_state = info->accel_state;
+    float pix_r = 0, pix_g = 0, pix_b = 0, pix_a = 0;
+    /* Unpack fg into its four bytes (x = bits 0-7 ... w = bits 24-31) and scale each by 1/255. */
+    uint32_t w = (fg >> 24) & 0xff;
+    uint32_t z = (fg >> 16) & 0xff;
+    uint32_t y = (fg >> 8) & 0xff;
+    uint32_t x = (fg >> 0) & 0xff;
+    float xf = (float)x / 255; /* R */
+    float yf = (float)y / 255; /* G */
+    float zf = (float)z / 255; /* B */
+    float wf = (float)w / 255; /* A */
+
+    /* component swizzles: choose which unpacked byte feeds each channel for this picture format */
+    switch (format) {
+	case PICT_a1r5g5b5:
+	case PICT_a8r8g8b8:
+	    pix_r = zf; /* R */
+	    pix_g = yf; /* G */
+	    pix_b = xf; /* B */
+	    pix_a = wf; /* A */
+	    break;
+	case PICT_a8b8g8r8:
+	    pix_r = xf; /* R */
+	    pix_g = yf; /* G */
+	    pix_b = zf; /* B */
+	    pix_a = wf; /* A */
+	    break;
+	case PICT_x8b8g8r8:
+	    pix_r = xf; /* R */
+	    pix_g = yf; /* G */
+	    pix_b = zf; /* B */
+	    pix_a = 1.0; /* A */
+	    break;
+	case PICT_b8g8r8a8:
+	    pix_r = yf; /* R */
+	    pix_g = zf; /* G */
+	    pix_b = wf; /* B */
+	    pix_a = xf; /* A */
+	    break;
+	case PICT_b8g8r8x8:
+	    pix_r = yf; /* R */
+	    pix_g = zf; /* G */
+	    pix_b = wf; /* B */
+	    pix_a = 1.0; /* A */
+	    break;
+	case PICT_x1r5g5b5:
+	case PICT_x8r8g8b8:
+	case PICT_r5g6b5:
+	    pix_r = zf; /* R */
+	    pix_g = yf; /* G */
+	    pix_b = xf; /* B */
+	    pix_a = 1.0; /* A */
+	    break;
+	case PICT_a8:
+	    pix_r = 0.0; /* R */
+	    pix_g = 0.0; /* G */
+	    pix_b = 0.0; /* B */
+	    pix_a = xf; /* A */
+	    break;
+	default: /* unrecognised format: only logged; channels still hold their 0 initialisers here */
+	    ErrorF("Bad format 0x%x\n", format);
+    }
+
+    if (unit == 0) { /* unit 0: callers pass this for the source picture */
+	if (!accel_state->msk_pic) { /* no mask picture is set in the accel state */
+	    if (PICT_FORMAT_RGB(format) == 0) {
+		pix_r = 0.0;
+		pix_g = 0.0;
+		pix_b = 0.0;
+	    }
+
+	    if (PICT_FORMAT_A(format) == 0)
+		pix_a = 1.0;
+	} else {
+	    if (accel_state->component_alpha) {
+		if (accel_state->src_alpha) { /* replicate alpha into RGB (or all ones when the format has no alpha) */
+		    if (PICT_FORMAT_A(format) == 0) {
+			pix_r = 1.0;
+			pix_g = 1.0;
+			pix_b = 1.0;
+			pix_a = 1.0;
+		    } else {
+			pix_r = pix_a;
+			pix_g = pix_a;
+			pix_b = pix_a;
+		    }
+		} else {
+		    if (PICT_FORMAT_A(format) == 0)
+			pix_a = 1.0;
+		}
+	    } else {
+		if (PICT_FORMAT_RGB(format) == 0) {
+		    pix_r = 0;
+		    pix_g = 0;
+		    pix_b = 0;
+		}
+
+		if (PICT_FORMAT_A(format) == 0)
+		    pix_a = 1.0;
+	    }
+	}
+    } else { /* any other unit; callers pass 1 for the mask picture */
+	if (accel_state->component_alpha) {
+	    if (PICT_FORMAT_A(format) == 0)
+		pix_a = 1.0;
+	} else { /* without component alpha: replicate alpha into RGB (or all ones when the format has no alpha) */
+	    if (PICT_FORMAT_A(format) == 0) {
+		pix_r = 1.0;
+		pix_g = 1.0;
+		pix_b = 1.0;
+		pix_a = 1.0;
+	    } else {
+		pix_r = pix_a;
+		pix_g = pix_a;
+		pix_b = pix_a;
+	    }
+	}
+    }
+    /* store the result in R, G, B, A order */
+    buf[0] = pix_r;
+    buf[1] = pix_g;
+    buf[2] = pix_b;
+    buf[3] = pix_a;
+}
+
 static Bool EVERGREENPrepareComposite(int op, PicturePtr pSrcPicture,
 				      PicturePtr pMaskPicture, PicturePtr pDstPicture,
 				      PixmapPtr pSrc, PixmapPtr pMask, PixmapPtr pDst)
@@ -1132,30 +1259,26 @@ static Bool EVERGREENPrepareComposite(int op, PicturePtr pSrcPicture,
     const_config_t vs_const_conf;
     struct r600_accel_object src_obj, mask_obj, dst_obj;
     float *cbuf;
+    uint32_t ps_bool_consts = 0;
 
     if (pDst->drawable.bitsPerPixel < 8 || (pSrc && pSrc->drawable.bitsPerPixel < 8))
 	return FALSE;
 
-    if (!pSrc) {
-	pSrc = RADEONSolidPixmap(pScreen, pSrcPicture->pSourcePict->solidFill.color);
-	if (!pSrc)
-	    RADEON_FALLBACK(("Failed to create solid scratch pixmap\n"));
+    if (pSrc) {
+	src_obj.bo = radeon_get_pixmap_bo(pSrc);
+	src_obj.surface = radeon_get_pixmap_surface(pSrc);
+	src_obj.tiling_flags = radeon_get_pixmap_tiling(pSrc);
+	src_obj.pitch = exaGetPixmapPitch(pSrc) / (pSrc->drawable.bitsPerPixel / 8);
+	src_obj.width = pSrc->drawable.width;
+	src_obj.height = pSrc->drawable.height;
+	src_obj.bpp = pSrc->drawable.bitsPerPixel;
+	src_obj.domain = RADEON_GEM_DOMAIN_VRAM | RADEON_GEM_DOMAIN_GTT;
     }
 
     dst_obj.bo = radeon_get_pixmap_bo(pDst);
-    src_obj.bo = radeon_get_pixmap_bo(pSrc);
     dst_obj.surface = radeon_get_pixmap_surface(pDst);
-    src_obj.surface = radeon_get_pixmap_surface(pSrc);
     dst_obj.tiling_flags = radeon_get_pixmap_tiling(pDst);
-    src_obj.tiling_flags = radeon_get_pixmap_tiling(pSrc);
-    src_obj.pitch = exaGetPixmapPitch(pSrc) / (pSrc->drawable.bitsPerPixel / 8);
     dst_obj.pitch = exaGetPixmapPitch(pDst) / (pDst->drawable.bitsPerPixel / 8);
-
-    src_obj.width = pSrc->drawable.width;
-    src_obj.height = pSrc->drawable.height;
-    src_obj.bpp = pSrc->drawable.bitsPerPixel;
-    src_obj.domain = RADEON_GEM_DOMAIN_VRAM | RADEON_GEM_DOMAIN_GTT;
-
     dst_obj.width = pDst->drawable.width;
     dst_obj.height = pDst->drawable.height;
     dst_obj.bpp = pDst->drawable.bitsPerPixel;
@@ -1165,30 +1288,16 @@ static Bool EVERGREENPrepareComposite(int op, PicturePtr pSrcPicture,
 	dst_obj.domain = RADEON_GEM_DOMAIN_VRAM;
 
     if (pMaskPicture) {
-	if (!pMask) {
-	    pMask = RADEONSolidPixmap(pScreen, pMaskPicture->pSourcePict->solidFill.color);
-	    if (!pMask) {
-		if (!pSrcPicture->pDrawable)
-		    pScreen->DestroyPixmap(pSrc);
-		RADEON_FALLBACK(("Failed to create solid scratch pixmap\n"));
-	    }
+	if (pMask) {
+	    mask_obj.bo = radeon_get_pixmap_bo(pMask);
+	    mask_obj.tiling_flags = radeon_get_pixmap_tiling(pMask);
+	    mask_obj.pitch = exaGetPixmapPitch(pMask) / (pMask->drawable.bitsPerPixel / 8);
+	    mask_obj.surface = radeon_get_pixmap_surface(pMask);
+	    mask_obj.width = pMask->drawable.width;
+	    mask_obj.height = pMask->drawable.height;
+	    mask_obj.bpp = pMask->drawable.bitsPerPixel;
+	    mask_obj.domain = RADEON_GEM_DOMAIN_VRAM | RADEON_GEM_DOMAIN_GTT;
 	}
-	mask_obj.bo = radeon_get_pixmap_bo(pMask);
-	mask_obj.tiling_flags = radeon_get_pixmap_tiling(pMask);
-	mask_obj.pitch = exaGetPixmapPitch(pMask) / (pMask->drawable.bitsPerPixel / 8);
-	mask_obj.surface = radeon_get_pixmap_surface(pMask);
-	mask_obj.width = pMask->drawable.width;
-	mask_obj.height = pMask->drawable.height;
-	mask_obj.bpp = pMask->drawable.bitsPerPixel;
-	mask_obj.domain = RADEON_GEM_DOMAIN_VRAM | RADEON_GEM_DOMAIN_GTT;
-
-	if (!R600SetAccelState(pScrn,
-			       &src_obj,
-			       &mask_obj,
-			       &dst_obj,
-			       accel_state->comp_vs_offset, accel_state->comp_ps_offset,
-			       3, 0xffffffff))
-	    return FALSE;
 
 	accel_state->msk_pic = pMaskPicture;
 	if (pMaskPicture->componentAlpha) {
@@ -1202,19 +1311,19 @@ static Bool EVERGREENPrepareComposite(int op, PicturePtr pSrcPicture,
 	    accel_state->src_alpha = FALSE;
 	}
     } else {
-	if (!R600SetAccelState(pScrn,
-			       &src_obj,
-			       NULL,
-			       &dst_obj,
-			       accel_state->comp_vs_offset, accel_state->comp_ps_offset,
-			       3, 0xffffffff))
-	    return FALSE;
-
 	accel_state->msk_pic = NULL;
 	accel_state->component_alpha = FALSE;
 	accel_state->src_alpha = FALSE;
     }
 
+    if (!R600SetAccelState(pScrn,
+		pSrc ? &src_obj : NULL,
+		(pMaskPicture && pMask) ? &mask_obj : NULL,
+		&dst_obj,
+		accel_state->comp_vs_offset, accel_state->comp_ps_offset,
+		3, 0xffffffff))
+	return FALSE;
+
     if (!EVERGREENGetDestFormat(pDstPicture, &dst_format))
 	return FALSE;
 
@@ -1238,11 +1347,14 @@ static Bool EVERGREENPrepareComposite(int op, PicturePtr pSrcPicture,
     evergreen_set_screen_scissor(pScrn, 0, 0, accel_state->dst_obj.width, accel_state->dst_obj.height);
     evergreen_set_window_scissor(pScrn, 0, 0, accel_state->dst_obj.width, accel_state->dst_obj.height);
 
-    if (!EVERGREENTextureSetup(pSrcPicture, pSrc, 0)) {
-        radeon_ib_discard(pScrn);
-        radeon_cs_flush_indirect(pScrn);
-        return FALSE;
-    }
+    if (pSrc) {
+	if (!EVERGREENTextureSetup(pSrcPicture, pSrc, 0)) {
+	    radeon_ib_discard(pScrn);
+	    radeon_cs_flush_indirect(pScrn);
+	    return FALSE;
+	}
+    } else
+	accel_state->is_transform[0] = FALSE;
 
     if (pMask) {
         if (!EVERGREENTextureSetup(pMaskPicture, pMask, 1)) {
@@ -1253,12 +1365,16 @@ static Bool EVERGREENPrepareComposite(int op, PicturePtr pSrcPicture,
     } else
         accel_state->is_transform[1] = FALSE;
 
+    if (pSrc)
+	ps_bool_consts |= (1 << 0);
+    if (pMask)
+	ps_bool_consts |= (1 << 1);
+    evergreen_set_bool_consts(pScrn, SQ_BOOL_CONST_ps, ps_bool_consts);
+
     if (pMask) {
 	evergreen_set_bool_consts(pScrn, SQ_BOOL_CONST_vs, (1 << 0));
-	evergreen_set_bool_consts(pScrn, SQ_BOOL_CONST_ps, (1 << 0));
     } else {
 	evergreen_set_bool_consts(pScrn, SQ_BOOL_CONST_vs, (0 << 0));
-	evergreen_set_bool_consts(pScrn, SQ_BOOL_CONST_ps, (0 << 0));
     }
 
     /* Shader */
@@ -1271,7 +1387,7 @@ static Bool EVERGREENPrepareComposite(int op, PicturePtr pSrcPicture,
 
     ps_conf.shader_addr         = accel_state->ps_mc_addr;
     ps_conf.shader_size         = accel_state->ps_size;
-    ps_conf.num_gprs            = 3;
+    ps_conf.num_gprs            = 2;
     ps_conf.stack_size          = 1;
     ps_conf.clamp_consts        = 0;
     ps_conf.export_mode         = 2;
@@ -1346,9 +1462,27 @@ static Bool EVERGREENPrepareComposite(int op, PicturePtr pSrcPicture,
     vs_const_conf.const_addr = accel_state->cbuf.vb_offset;
 
     vs_const_conf.cpu_ptr = (uint32_t *)(char *)cbuf;
-    EVERGREENXFormSetup(pSrcPicture, pSrc, 0, cbuf);
+    EVERGREENXFormSetup(pSrcPicture, pScrn, 0, cbuf);
     if (pMask)
-        EVERGREENXFormSetup(pMaskPicture, pMask, 1, cbuf);
+        EVERGREENXFormSetup(pMaskPicture, pScrn, 1, cbuf);
+
+    if (!pSrc) {
+	/* solid src color */
+	EVERGREENSetSolidConsts(pScrn, &cbuf[16], pSrcPicture->format,
+		pSrcPicture->pSourcePict->solidFill.color, 0);
+    }
+
+    if (!pMaskPicture) {
+	/* use identity constant if there is no mask */
+	cbuf[20] = 1.0;
+	cbuf[21] = 1.0;
+	cbuf[22] = 1.0;
+	cbuf[23] = 1.0;
+    } else if (!pMask) {
+	/* solid mask color */
+	EVERGREENSetSolidConsts(pScrn, &cbuf[20], pMaskPicture->format,
+		pMaskPicture->pSourcePict->solidFill.color, 1);
+    }
 
     radeon_vbo_commit(pScrn, &accel_state->cbuf);
     evergreen_set_alu_consts(pScrn, &vs_const_conf, RADEON_GEM_DOMAIN_GTT);
@@ -1377,7 +1511,7 @@ static void EVERGREENFinishComposite(ScrnInfoPtr pScrn, PixmapPtr pDst,
 				    accel_state->vline_y1,
 				    accel_state->vline_y2);
 
-    vtx_size = accel_state->msk_pic ? 24 : 16;
+    vtx_size = accel_state->msk_pix ? 24 : 16;
 
     evergreen_finish_op(pScrn, vtx_size);
 }
@@ -1390,12 +1524,6 @@ static void EVERGREENDoneComposite(PixmapPtr pDst)
     struct radeon_accel_state *accel_state = info->accel_state;
 
     EVERGREENFinishComposite(pScrn, pDst, accel_state);
-
-    if (!accel_state->src_pic->pDrawable)
-	pScreen->DestroyPixmap(accel_state->src_pix);
-
-    if (accel_state->msk_pic && !accel_state->msk_pic->pDrawable)
-	pScreen->DestroyPixmap(accel_state->msk_pix);
 }
 
 static void EVERGREENComposite(PixmapPtr pDst,
@@ -1424,7 +1552,7 @@ static void EVERGREENComposite(PixmapPtr pDst,
     if (accel_state->vsync)
 	RADEONVlineHelperSet(pScrn, dstX, dstY, dstX + w, dstY + h);
 
-    if (accel_state->msk_pic) {
+    if (accel_state->msk_pix) {
 
 	vb = radeon_vbo_space(pScrn, &accel_state->vbo, 24);
 
diff --git a/src/evergreen_shader.c b/src/evergreen_shader.c
index ebc58f2..4852578 100644
--- a/src/evergreen_shader.c
+++ b/src/evergreen_shader.c
@@ -2472,15 +2472,16 @@ int evergreen_comp_vs(RADEONChipFamily ChipSet, uint32_t* shader)
 }
 
 /* comp ps --------------------------------------- */
-int evergreen_comp_ps(RADEONChipFamily ChipSet, uint32_t* shader)
+int evergreen_comp_ps(RADEONChipFamily ChipSet, uint32_t *shader)
 {
     int i = 0;
 
     /* 0 */
-    shader[i++] = CF_DWORD0(ADDR(3),
+    /* call interp-fetch-mask if boolean1 == true */
+    shader[i++] = CF_DWORD0(ADDR(11),
 			    JUMPTABLE_SEL(SQ_CF_JUMPTABLE_SEL_CONST_A));
     shader[i++] = CF_DWORD1(POP_COUNT(0),
-                            CF_CONST(0),
+                            CF_CONST(1),
                             COND(SQ_CF_COND_BOOL),
                             I_COUNT(0),
                             VALID_PIXEL_MODE(0),
@@ -2488,11 +2489,13 @@ int evergreen_comp_ps(RADEONChipFamily ChipSet, uint32_t* shader)
                             CF_INST(SQ_CF_INST_CALL),
                             WHOLE_QUAD_MODE(0),
                             BARRIER(0));
+
     /* 1 */
-    shader[i++] = CF_DWORD0(ADDR(8),
+    /* call read-constant-mask if boolean1 == false */
+    shader[i++] = CF_DWORD0(ADDR(14),
 			    JUMPTABLE_SEL(SQ_CF_JUMPTABLE_SEL_CONST_A));
     shader[i++] = CF_DWORD1(POP_COUNT(0),
-                            CF_CONST(0),
+                            CF_CONST(1),
                             COND(SQ_CF_COND_NOT_BOOL),
                             I_COUNT(0),
                             VALID_PIXEL_MODE(0),
@@ -2500,48 +2503,118 @@ int evergreen_comp_ps(RADEONChipFamily ChipSet, uint32_t* shader)
                             CF_INST(SQ_CF_INST_CALL),
                             WHOLE_QUAD_MODE(0),
                             BARRIER(0));
+
     /* 2 */
-    shader[i++] = CF_DWORD0(ADDR(0),
-                            JUMPTABLE_SEL(SQ_CF_JUMPTABLE_SEL_CONST_A));
+    /* call interp-fetch-src if boolean0 == true */
+    shader[i++] = CF_DWORD0(ADDR(6),
+			    JUMPTABLE_SEL(SQ_CF_JUMPTABLE_SEL_CONST_A));
     shader[i++] = CF_DWORD1(POP_COUNT(0),
                             CF_CONST(0),
-                            COND(SQ_CF_COND_ACTIVE),
+                            COND(SQ_CF_COND_BOOL),
                             I_COUNT(0),
                             VALID_PIXEL_MODE(0),
-                            END_OF_PROGRAM(1),
-                            CF_INST(SQ_CF_INST_NOP),
+                            END_OF_PROGRAM(0),
+                            CF_INST(SQ_CF_INST_CALL),
                             WHOLE_QUAD_MODE(0),
-                            BARRIER(1));
+                            BARRIER(0));
 
-    /* 3 - mask sub */
-    shader[i++] = CF_ALU_DWORD0(ADDR(12),
+    /* 3 */
+    /* call read-constant-src if boolean0 == false */
+    shader[i++] = CF_DWORD0(ADDR(9),
+			    JUMPTABLE_SEL(SQ_CF_JUMPTABLE_SEL_CONST_A));
+    shader[i++] = CF_DWORD1(POP_COUNT(0),
+                            CF_CONST(0),
+                            COND(SQ_CF_COND_NOT_BOOL),
+                            I_COUNT(0),
+                            VALID_PIXEL_MODE(0),
+                            END_OF_PROGRAM(0),
+                            CF_INST(SQ_CF_INST_CALL),
+                            WHOLE_QUAD_MODE(0),
+                            BARRIER(0));
+    /* 4 */
+    /* src IN mask (GPR0 := GPR0 .* GPR1) */
+    shader[i++] = CF_ALU_DWORD0(ADDR(16),
 				KCACHE_BANK0(0),
 				KCACHE_BANK1(0),
 				KCACHE_MODE0(SQ_CF_KCACHE_NOP));
     shader[i++] = CF_ALU_DWORD1(KCACHE_MODE1(SQ_CF_KCACHE_NOP),
 				KCACHE_ADDR0(0),
 				KCACHE_ADDR1(0),
-				I_COUNT(8),
+				I_COUNT(4),
 				ALT_CONST(0),
 				CF_INST(SQ_CF_INST_ALU),
 				WHOLE_QUAD_MODE(0),
 				BARRIER(1));
 
-    /* 4 */
-    shader[i++] = CF_DWORD0(ADDR(28),
+    /* 5 */
+    /* export pixel data */
+    shader[i++] = CF_ALLOC_IMP_EXP_DWORD0(ARRAY_BASE(CF_PIXEL_MRT0),
+					  TYPE(SQ_EXPORT_PIXEL),
+					  RW_GPR(0),
+					  RW_REL(ABSOLUTE),
+					  INDEX_GPR(0),
+					  ELEM_SIZE(1));
+    shader[i++] = CF_ALLOC_IMP_EXP_DWORD1_SWIZ(SRC_SEL_X(SQ_SEL_X),
+					       SRC_SEL_Y(SQ_SEL_Y),
+					       SRC_SEL_Z(SQ_SEL_Z),
+					       SRC_SEL_W(SQ_SEL_W),
+					       BURST_COUNT(1),
+					       VALID_PIXEL_MODE(0),
+					       END_OF_PROGRAM(1),
+					       CF_INST(SQ_CF_INST_EXPORT_DONE),
+					       MARK(0),
+					       BARRIER(1));
+
+    /* subroutine interp-fetch-src */
+
+    /* 6 */
+    /* interpolate src */
+    shader[i++] = CF_ALU_DWORD0(ADDR(20),
+				KCACHE_BANK0(0),
+				KCACHE_BANK1(0),
+				KCACHE_MODE0(SQ_CF_KCACHE_NOP));
+    shader[i++] = CF_ALU_DWORD1(KCACHE_MODE1(SQ_CF_KCACHE_NOP),
+				KCACHE_ADDR0(0),
+				KCACHE_ADDR1(0),
+				I_COUNT(4),
+				ALT_CONST(0),
+				CF_INST(SQ_CF_INST_ALU),
+				WHOLE_QUAD_MODE(0),
+				BARRIER(1));
+
+    /* 7 */
+    /* texture fetch src into GPR0 */
+    shader[i++] = CF_DWORD0(ADDR(24),
 			    JUMPTABLE_SEL(SQ_CF_JUMPTABLE_SEL_CONST_A));
     shader[i++] = CF_DWORD1(POP_COUNT(0),
 			    CF_CONST(0),
 			    COND(SQ_CF_COND_ACTIVE),
-			    I_COUNT(2),
+			    I_COUNT(1),
 			    VALID_PIXEL_MODE(0),
 			    END_OF_PROGRAM(0),
 			    CF_INST(SQ_CF_INST_TC),
 			    WHOLE_QUAD_MODE(0),
 			    BARRIER(1));
 
-    /* 5 */
-    shader[i++] = CF_ALU_DWORD0(ADDR(20),
+    /* 8 */
+    /* return */
+    shader[i++] = CF_DWORD0(ADDR(0),
+			    JUMPTABLE_SEL(SQ_CF_JUMPTABLE_SEL_CONST_A));
+    shader[i++] = CF_DWORD1(POP_COUNT(0),
+			    CF_CONST(0),
+			    COND(SQ_CF_COND_ACTIVE),
+			    I_COUNT(0),
+			    VALID_PIXEL_MODE(0),
+			    END_OF_PROGRAM(0),
+			    CF_INST(SQ_CF_INST_RETURN),
+			    WHOLE_QUAD_MODE(0),
+			    BARRIER(0));
+
+    /* subroutine read-constant-src */
+
+    /* 9 */
+    /* read constants into GPR0 */
+    shader[i++] = CF_ALU_DWORD0(ADDR(26),
 				KCACHE_BANK0(0),
 				KCACHE_BANK1(0),
 				KCACHE_MODE0(SQ_CF_KCACHE_LOCK_1));
@@ -2549,30 +2622,13 @@ int evergreen_comp_ps(RADEONChipFamily ChipSet, uint32_t* shader)
 				KCACHE_ADDR0(0),
 				KCACHE_ADDR1(0),
 				I_COUNT(4),
-				ALT_CONST(0),
+				ALT_CONST(1),
 				CF_INST(SQ_CF_INST_ALU),
 				WHOLE_QUAD_MODE(0),
 				BARRIER(1));
 
-    /* 6 */
-    shader[i++] = CF_ALLOC_IMP_EXP_DWORD0(ARRAY_BASE(CF_PIXEL_MRT0),
-					  TYPE(SQ_EXPORT_PIXEL),
-					  RW_GPR(2),
-					  RW_REL(ABSOLUTE),
-					  INDEX_GPR(0),
-					  ELEM_SIZE(1));
-
-    shader[i++] = CF_ALLOC_IMP_EXP_DWORD1_SWIZ(SRC_SEL_X(SQ_SEL_X),
-					       SRC_SEL_Y(SQ_SEL_Y),
-					       SRC_SEL_Z(SQ_SEL_Z),
-					       SRC_SEL_W(SQ_SEL_W),
-					       BURST_COUNT(1),
-					       VALID_PIXEL_MODE(0),
-					       END_OF_PROGRAM(0),
-					       CF_INST(SQ_CF_INST_EXPORT_DONE),
-					       MARK(0),
-					       BARRIER(1));
-    /* 7 */
+    /* 10 */
+    /* return */
     shader[i++] = CF_DWORD0(ADDR(0),
 			    JUMPTABLE_SEL(SQ_CF_JUMPTABLE_SEL_CONST_A));
     shader[i++] = CF_DWORD1(POP_COUNT(0),
@@ -2583,10 +2639,13 @@ int evergreen_comp_ps(RADEONChipFamily ChipSet, uint32_t* shader)
 			    END_OF_PROGRAM(0),
 			    CF_INST(SQ_CF_INST_RETURN),
 			    WHOLE_QUAD_MODE(0),
-			    BARRIER(1));
+			    BARRIER(0));
+
+    /* subroutine interp-fetch-mask */
 
-    /* 8 - non-mask sub */
-    shader[i++] = CF_ALU_DWORD0(ADDR(24),
+    /* 11 */
+    /* interpolate mask */
+    shader[i++] = CF_ALU_DWORD0(ADDR(30),
 				KCACHE_BANK0(0),
 				KCACHE_BANK1(0),
 				KCACHE_MODE0(SQ_CF_KCACHE_NOP));
@@ -2598,8 +2657,10 @@ int evergreen_comp_ps(RADEONChipFamily ChipSet, uint32_t* shader)
 				CF_INST(SQ_CF_INST_ALU),
 				WHOLE_QUAD_MODE(0),
 				BARRIER(1));
-    /* 9 */
-    shader[i++] = CF_DWORD0(ADDR(32),
+
+    /* 12 */
+    /* texture fetch mask into GPR1 */
+    shader[i++] = CF_DWORD0(ADDR(34),
 			    JUMPTABLE_SEL(SQ_CF_JUMPTABLE_SEL_CONST_A));
     shader[i++] = CF_DWORD1(POP_COUNT(0),
 			    CF_CONST(0),
@@ -2611,25 +2672,39 @@ int evergreen_comp_ps(RADEONChipFamily ChipSet, uint32_t* shader)
 			    WHOLE_QUAD_MODE(0),
 			    BARRIER(1));
 
-    /* 10 */
-    shader[i++] = CF_ALLOC_IMP_EXP_DWORD0(ARRAY_BASE(CF_PIXEL_MRT0),
-					  TYPE(SQ_EXPORT_PIXEL),
-					  RW_GPR(0),
-					  RW_REL(ABSOLUTE),
-					  INDEX_GPR(0),
-					  ELEM_SIZE(1));
-    shader[i++] = CF_ALLOC_IMP_EXP_DWORD1_SWIZ(SRC_SEL_X(SQ_SEL_X),
-					       SRC_SEL_Y(SQ_SEL_Y),
-					       SRC_SEL_Z(SQ_SEL_Z),
-					       SRC_SEL_W(SQ_SEL_W),
-					       BURST_COUNT(1),
-					       VALID_PIXEL_MODE(0),
-					       END_OF_PROGRAM(0),
-					       CF_INST(SQ_CF_INST_EXPORT_DONE),
-					       MARK(0),
-					       BARRIER(1));
+    /* 13 */
+    /* return */
+    shader[i++] = CF_DWORD0(ADDR(0),
+			    JUMPTABLE_SEL(SQ_CF_JUMPTABLE_SEL_CONST_A));
+    shader[i++] = CF_DWORD1(POP_COUNT(0),
+			    CF_CONST(0),
+			    COND(SQ_CF_COND_ACTIVE),
+			    I_COUNT(0),
+			    VALID_PIXEL_MODE(0),
+			    END_OF_PROGRAM(0),
+			    CF_INST(SQ_CF_INST_RETURN),
+			    WHOLE_QUAD_MODE(0),
+			    BARRIER(0));
 
-    /* 11 */
+    /* subroutine read-constant-mask */
+
+    /* 14 */
+    /* read constants into GPR1 */
+    shader[i++] = CF_ALU_DWORD0(ADDR(36),
+				KCACHE_BANK0(0),
+				KCACHE_BANK1(0),
+				KCACHE_MODE0(SQ_CF_KCACHE_LOCK_1));
+    shader[i++] = CF_ALU_DWORD1(KCACHE_MODE1(SQ_CF_KCACHE_NOP),
+				KCACHE_ADDR0(0),
+				KCACHE_ADDR1(0),
+				I_COUNT(4),
+				ALT_CONST(1),
+				CF_INST(SQ_CF_INST_ALU),
+				WHOLE_QUAD_MODE(0),
+				BARRIER(1));
+
+    /* 15 */
+    /* return */
     shader[i++] = CF_DWORD0(ADDR(0),
 			    JUMPTABLE_SEL(SQ_CF_JUMPTABLE_SEL_CONST_A));
     shader[i++] = CF_DWORD1(POP_COUNT(0),
@@ -2640,18 +2715,21 @@ int evergreen_comp_ps(RADEONChipFamily ChipSet, uint32_t* shader)
 			    END_OF_PROGRAM(0),
 			    CF_INST(SQ_CF_INST_RETURN),
 			    WHOLE_QUAD_MODE(0),
-			    BARRIER(1));
+			    BARRIER(0));
 
-    /* 12 interpolate src tex coords - mask */
+    /* ALU clauses */
+
+    /* 16 */
+    /* MUL gpr[0].x gpr[0].x gpr[1].x */
     shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
 			     SRC0_REL(ABSOLUTE),
-			     SRC0_ELEM(ELEM_Y),
+			     SRC0_ELEM(ELEM_X),
 			     SRC0_NEG(0),
-			     SRC1_SEL(ALU_SRC_PARAM_BASE + 0),
+			     SRC1_SEL(ALU_SRC_GPR_BASE + 1),
 			     SRC1_REL(ABSOLUTE),
 			     SRC1_ELEM(ELEM_X),
 			     SRC1_NEG(0),
-			     INDEX_MODE(SQ_INDEX_AR_X),
+			     INDEX_MODE(SQ_INDEX_LOOP),
 			     PRED_SEL(SQ_PRED_SEL_OFF),
 			     LAST(0));
     shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0),
@@ -2660,22 +2738,24 @@ int evergreen_comp_ps(RADEONChipFamily ChipSet, uint32_t* shader)
 				 UPDATE_PRED(0),
 				 WRITE_MASK(1),
 				 OMOD(SQ_ALU_OMOD_OFF),
-				 ALU_INST(SQ_OP2_INST_INTERP_XY),
-				 BANK_SWIZZLE(SQ_ALU_VEC_210),
-				 DST_GPR(1),
+				 ALU_INST(SQ_OP2_INST_MUL),
+				 BANK_SWIZZLE(SQ_ALU_VEC_012),
+				 DST_GPR(0),
 				 DST_REL(ABSOLUTE),
 				 DST_ELEM(ELEM_X),
-				 CLAMP(0));
-    /* 13 */
+				 CLAMP(1));
+
+    /* 17 */
+    /* MUL gpr[0].y gpr[0].y gpr[1].y */
     shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
 			     SRC0_REL(ABSOLUTE),
-			     SRC0_ELEM(ELEM_X),
+			     SRC0_ELEM(ELEM_Y),
 			     SRC0_NEG(0),
-			     SRC1_SEL(ALU_SRC_PARAM_BASE + 0),
+			     SRC1_SEL(ALU_SRC_GPR_BASE + 1),
 			     SRC1_REL(ABSOLUTE),
-			     SRC1_ELEM(ELEM_X),
+			     SRC1_ELEM(ELEM_Y),
 			     SRC1_NEG(0),
-			     INDEX_MODE(SQ_INDEX_AR_X),
+			     INDEX_MODE(SQ_INDEX_LOOP),
 			     PRED_SEL(SQ_PRED_SEL_OFF),
 			     LAST(0));
     shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0),
@@ -2684,67 +2764,70 @@ int evergreen_comp_ps(RADEONChipFamily ChipSet, uint32_t* shader)
 				 UPDATE_PRED(0),
 				 WRITE_MASK(1),
 				 OMOD(SQ_ALU_OMOD_OFF),
-				 ALU_INST(SQ_OP2_INST_INTERP_XY),
-				 BANK_SWIZZLE(SQ_ALU_VEC_210),
-				 DST_GPR(1),
+				 ALU_INST(SQ_OP2_INST_MUL),
+				 BANK_SWIZZLE(SQ_ALU_VEC_012),
+				 DST_GPR(0),
 				 DST_REL(ABSOLUTE),
 				 DST_ELEM(ELEM_Y),
-				 CLAMP(0));
-    /* 14 */
+				 CLAMP(1));
+    /* 18 */
+    /* MUL gpr[0].z gpr[0].z gpr[1].z */
     shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
 			     SRC0_REL(ABSOLUTE),
-			     SRC0_ELEM(ELEM_Y),
+			     SRC0_ELEM(ELEM_Z),
 			     SRC0_NEG(0),
-			     SRC1_SEL(ALU_SRC_PARAM_BASE + 0),
+			     SRC1_SEL(ALU_SRC_GPR_BASE + 1),
 			     SRC1_REL(ABSOLUTE),
-			     SRC1_ELEM(ELEM_X),
+			     SRC1_ELEM(ELEM_Z),
 			     SRC1_NEG(0),
-			     INDEX_MODE(SQ_INDEX_AR_X),
+			     INDEX_MODE(SQ_INDEX_LOOP),
 			     PRED_SEL(SQ_PRED_SEL_OFF),
 			     LAST(0));
     shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0),
 				 SRC1_ABS(0),
 				 UPDATE_EXECUTE_MASK(0),
 				 UPDATE_PRED(0),
-				 WRITE_MASK(0),
+				 WRITE_MASK(1),
 				 OMOD(SQ_ALU_OMOD_OFF),
-				 ALU_INST(SQ_OP2_INST_INTERP_XY),
-				 BANK_SWIZZLE(SQ_ALU_VEC_210),
-				 DST_GPR(1),
+				 ALU_INST(SQ_OP2_INST_MUL),
+				 BANK_SWIZZLE(SQ_ALU_VEC_012),
+				 DST_GPR(0),
 				 DST_REL(ABSOLUTE),
 				 DST_ELEM(ELEM_Z),
-				 CLAMP(0));
-    /* 15 */
+				 CLAMP(1));
+    /* 19 */
+    /* MUL gpr[0].w gpr[0].w gpr[1].w */
     shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
 			     SRC0_REL(ABSOLUTE),
-			     SRC0_ELEM(ELEM_X),
+			     SRC0_ELEM(ELEM_W),
 			     SRC0_NEG(0),
-			     SRC1_SEL(ALU_SRC_PARAM_BASE + 0),
+			     SRC1_SEL(ALU_SRC_GPR_BASE + 1),
 			     SRC1_REL(ABSOLUTE),
-			     SRC1_ELEM(ELEM_X),
+			     SRC1_ELEM(ELEM_W),
 			     SRC1_NEG(0),
-			     INDEX_MODE(SQ_INDEX_AR_X),
+			     INDEX_MODE(SQ_INDEX_LOOP),
 			     PRED_SEL(SQ_PRED_SEL_OFF),
 			     LAST(1));
     shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0),
 				 SRC1_ABS(0),
 				 UPDATE_EXECUTE_MASK(0),
 				 UPDATE_PRED(0),
-				 WRITE_MASK(0),
+				 WRITE_MASK(1),
 				 OMOD(SQ_ALU_OMOD_OFF),
-				 ALU_INST(SQ_OP2_INST_INTERP_XY),
-				 BANK_SWIZZLE(SQ_ALU_VEC_210),
-				 DST_GPR(1),
+				 ALU_INST(SQ_OP2_INST_MUL),
+				 BANK_SWIZZLE(SQ_ALU_VEC_012),
+				 DST_GPR(0),
 				 DST_REL(ABSOLUTE),
 				 DST_ELEM(ELEM_W),
-				 CLAMP(0));
+				 CLAMP(1));
 
-    /* 16 interpolate mask tex coords */
+    /* 20 */
+    /* INTERP_XY GPR0.x, GPR0.y PARAM0.x */
     shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
 			     SRC0_REL(ABSOLUTE),
 			     SRC0_ELEM(ELEM_Y),
 			     SRC0_NEG(0),
-			     SRC1_SEL(ALU_SRC_PARAM_BASE + 1),
+			     SRC1_SEL(ALU_SRC_PARAM_BASE + 0),
 			     SRC1_REL(ABSOLUTE),
 			     SRC1_ELEM(ELEM_X),
 			     SRC1_NEG(0),
@@ -2763,12 +2846,13 @@ int evergreen_comp_ps(RADEONChipFamily ChipSet, uint32_t* shader)
 				 DST_REL(ABSOLUTE),
 				 DST_ELEM(ELEM_X),
 				 CLAMP(0));
-    /* 17 */
+    /* 21 */
+    /* INTERP_XY GPR0.y, GPR0.x PARAM0.x */
     shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
 			     SRC0_REL(ABSOLUTE),
 			     SRC0_ELEM(ELEM_X),
 			     SRC0_NEG(0),
-			     SRC1_SEL(ALU_SRC_PARAM_BASE + 1),
+			     SRC1_SEL(ALU_SRC_PARAM_BASE + 0),
 			     SRC1_REL(ABSOLUTE),
 			     SRC1_ELEM(ELEM_X),
 			     SRC1_NEG(0),
@@ -2787,12 +2871,13 @@ int evergreen_comp_ps(RADEONChipFamily ChipSet, uint32_t* shader)
 				 DST_REL(ABSOLUTE),
 				 DST_ELEM(ELEM_Y),
 				 CLAMP(0));
-    /* 18 */
+    /* 22 */
+    /* INTERP_XY GPR0.z, GPR0.y PARAM0.x */
     shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
 			     SRC0_REL(ABSOLUTE),
 			     SRC0_ELEM(ELEM_Y),
 			     SRC0_NEG(0),
-			     SRC1_SEL(ALU_SRC_PARAM_BASE + 1),
+			     SRC1_SEL(ALU_SRC_PARAM_BASE + 0),
 			     SRC1_REL(ABSOLUTE),
 			     SRC1_ELEM(ELEM_X),
 			     SRC1_NEG(0),
@@ -2811,12 +2896,14 @@ int evergreen_comp_ps(RADEONChipFamily ChipSet, uint32_t* shader)
 				 DST_REL(ABSOLUTE),
 				 DST_ELEM(ELEM_Z),
 				 CLAMP(0));
-    /* 19 */
+
+    /* 23 */
+    /* INTERP_XY GPR0.w, GPR0.x PARAM0.x */
     shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
 			     SRC0_REL(ABSOLUTE),
 			     SRC0_ELEM(ELEM_X),
 			     SRC0_NEG(0),
-			     SRC1_SEL(ALU_SRC_PARAM_BASE + 1),
+			     SRC1_SEL(ALU_SRC_PARAM_BASE + 0),
 			     SRC1_REL(ABSOLUTE),
 			     SRC1_ELEM(ELEM_X),
 			     SRC1_NEG(0),
@@ -2836,17 +2923,49 @@ int evergreen_comp_ps(RADEONChipFamily ChipSet, uint32_t* shader)
 				 DST_ELEM(ELEM_W),
 				 CLAMP(0));
 
-    /* 20 - alu 0 */
-    /* MUL gpr[2].x gpr[0].x gpr[1].x */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
+    /* 24/25 */
+    /* SAMPLE RID=0 GPR0, GPR0 */
+    shader[i++] = TEX_DWORD0(TEX_INST(SQ_TEX_INST_SAMPLE),
+			     INST_MOD(0),
+			     FETCH_WHOLE_QUAD(0),
+			     RESOURCE_ID(0),
+			     SRC_GPR(0),
+			     SRC_REL(ABSOLUTE),
+			     ALT_CONST(0),
+			     RESOURCE_INDEX_MODE(SQ_CF_INDEX_NONE),
+			     SAMPLER_INDEX_MODE(SQ_CF_INDEX_NONE));
+    shader[i++] = TEX_DWORD1(DST_GPR(0),
+			     DST_REL(ABSOLUTE),
+			     DST_SEL_X(SQ_SEL_X),
+			     DST_SEL_Y(SQ_SEL_Y),
+			     DST_SEL_Z(SQ_SEL_Z),
+			     DST_SEL_W(SQ_SEL_W),
+			     LOD_BIAS(0),
+			     COORD_TYPE_X(TEX_NORMALIZED),
+			     COORD_TYPE_Y(TEX_NORMALIZED),
+			     COORD_TYPE_Z(TEX_NORMALIZED),
+			     COORD_TYPE_W(TEX_NORMALIZED));
+    shader[i++] = TEX_DWORD2(OFFSET_X(0),
+			     OFFSET_Y(0),
+			     OFFSET_Z(0),
+			     SAMPLER_ID(0),
+			     SRC_SEL_X(SQ_SEL_X),
+			     SRC_SEL_Y(SQ_SEL_Y),
+			     SRC_SEL_Z(SQ_SEL_0),
+			     SRC_SEL_W(SQ_SEL_1));
+    shader[i++] = TEX_DWORD_PAD;
+
+    /* 26 */
+    /* MOV GPR0.x, KC4.x */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_KCACHE0_BASE + 4),
 			     SRC0_REL(ABSOLUTE),
 			     SRC0_ELEM(ELEM_X),
 			     SRC0_NEG(0),
-			     SRC1_SEL(ALU_SRC_GPR_BASE + 1),
+			     SRC1_SEL(ALU_SRC_GPR_BASE + 0),
 			     SRC1_REL(ABSOLUTE),
 			     SRC1_ELEM(ELEM_X),
 			     SRC1_NEG(0),
-			     INDEX_MODE(SQ_INDEX_LOOP),
+			     INDEX_MODE(SQ_INDEX_AR_X),
 			     PRED_SEL(SQ_PRED_SEL_OFF),
 			     LAST(0));
     shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0),
@@ -2855,23 +2974,24 @@ int evergreen_comp_ps(RADEONChipFamily ChipSet, uint32_t* shader)
 				 UPDATE_PRED(0),
 				 WRITE_MASK(1),
 				 OMOD(SQ_ALU_OMOD_OFF),
-				 ALU_INST(SQ_OP2_INST_MUL),
+				 ALU_INST(SQ_OP2_INST_MOV),
 				 BANK_SWIZZLE(SQ_ALU_VEC_012),
-				 DST_GPR(2),
+				 DST_GPR(0),
 				 DST_REL(ABSOLUTE),
 				 DST_ELEM(ELEM_X),
 				 CLAMP(1));
-    /* 21 - alu 1 */
-    /* MUL gpr[2].y gpr[0].y gpr[1].y */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
+
+    /* 27 */
+    /* MOV GPR0.y, KC4.y */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_KCACHE0_BASE + 4),
 			     SRC0_REL(ABSOLUTE),
 			     SRC0_ELEM(ELEM_Y),
 			     SRC0_NEG(0),
-			     SRC1_SEL(ALU_SRC_GPR_BASE + 1),
+			     SRC1_SEL(ALU_SRC_GPR_BASE + 0),
 			     SRC1_REL(ABSOLUTE),
-			     SRC1_ELEM(ELEM_Y),
+			     SRC1_ELEM(ELEM_X),
 			     SRC1_NEG(0),
-			     INDEX_MODE(SQ_INDEX_LOOP),
+			     INDEX_MODE(SQ_INDEX_AR_X),
 			     PRED_SEL(SQ_PRED_SEL_OFF),
 			     LAST(0));
     shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0),
@@ -2880,23 +3000,24 @@ int evergreen_comp_ps(RADEONChipFamily ChipSet, uint32_t* shader)
 				 UPDATE_PRED(0),
 				 WRITE_MASK(1),
 				 OMOD(SQ_ALU_OMOD_OFF),
-				 ALU_INST(SQ_OP2_INST_MUL),
+				 ALU_INST(SQ_OP2_INST_MOV),
 				 BANK_SWIZZLE(SQ_ALU_VEC_012),
-				 DST_GPR(2),
+				 DST_GPR(0),
 				 DST_REL(ABSOLUTE),
 				 DST_ELEM(ELEM_Y),
 				 CLAMP(1));
-    /* 22 - alu 2 */
-    /* MUL gpr[2].z gpr[0].z gpr[1].z */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
+
+    /* 28  */
+    /* MOV GPR0.z, KC4.z */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_KCACHE0_BASE + 4),
 			     SRC0_REL(ABSOLUTE),
 			     SRC0_ELEM(ELEM_Z),
 			     SRC0_NEG(0),
-			     SRC1_SEL(ALU_SRC_GPR_BASE + 1),
+			     SRC1_SEL(ALU_SRC_GPR_BASE + 0),
 			     SRC1_REL(ABSOLUTE),
-			     SRC1_ELEM(ELEM_Z),
+			     SRC1_ELEM(ELEM_X),
 			     SRC1_NEG(0),
-			     INDEX_MODE(SQ_INDEX_LOOP),
+			     INDEX_MODE(SQ_INDEX_AR_X),
 			     PRED_SEL(SQ_PRED_SEL_OFF),
 			     LAST(0));
     shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0),
@@ -2905,23 +3026,24 @@ int evergreen_comp_ps(RADEONChipFamily ChipSet, uint32_t* shader)
 				 UPDATE_PRED(0),
 				 WRITE_MASK(1),
 				 OMOD(SQ_ALU_OMOD_OFF),
-				 ALU_INST(SQ_OP2_INST_MUL),
+				 ALU_INST(SQ_OP2_INST_MOV),
 				 BANK_SWIZZLE(SQ_ALU_VEC_012),
-				 DST_GPR(2),
+				 DST_GPR(0),
 				 DST_REL(ABSOLUTE),
 				 DST_ELEM(ELEM_Z),
 				 CLAMP(1));
-    /* 23 - alu 3 */
-    /* MUL gpr[2].w gpr[0].w gpr[1].w */
-    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
+
+    /* 29 */
+    /* MOV GPR0.w, KC4.w */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_KCACHE0_BASE + 4),
 			     SRC0_REL(ABSOLUTE),
 			     SRC0_ELEM(ELEM_W),
 			     SRC0_NEG(0),
-			     SRC1_SEL(ALU_SRC_GPR_BASE + 1),
+			     SRC1_SEL(ALU_SRC_GPR_BASE + 0),
 			     SRC1_REL(ABSOLUTE),
-			     SRC1_ELEM(ELEM_W),
+			     SRC1_ELEM(ELEM_X),
 			     SRC1_NEG(0),
-			     INDEX_MODE(SQ_INDEX_LOOP),
+			     INDEX_MODE(SQ_INDEX_AR_X),
 			     PRED_SEL(SQ_PRED_SEL_OFF),
 			     LAST(1));
     shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0),
@@ -2930,19 +3052,20 @@ int evergreen_comp_ps(RADEONChipFamily ChipSet, uint32_t* shader)
 				 UPDATE_PRED(0),
 				 WRITE_MASK(1),
 				 OMOD(SQ_ALU_OMOD_OFF),
-				 ALU_INST(SQ_OP2_INST_MUL),
+				 ALU_INST(SQ_OP2_INST_MOV),
 				 BANK_SWIZZLE(SQ_ALU_VEC_012),
-				 DST_GPR(2),
+				 DST_GPR(0),
 				 DST_REL(ABSOLUTE),
 				 DST_ELEM(ELEM_W),
 				 CLAMP(1));
 
-    /* 24 - interpolate tex coords - non-mask */
+    /* 30 */
+    /* INTERP_XY GPR1.x, PARAM1 */
     shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
 			     SRC0_REL(ABSOLUTE),
 			     SRC0_ELEM(ELEM_Y),
 			     SRC0_NEG(0),
-			     SRC1_SEL(ALU_SRC_PARAM_BASE + 0),
+			     SRC1_SEL(ALU_SRC_PARAM_BASE + 1),
 			     SRC1_REL(ABSOLUTE),
 			     SRC1_ELEM(ELEM_X),
 			     SRC1_NEG(0),
@@ -2957,16 +3080,17 @@ int evergreen_comp_ps(RADEONChipFamily ChipSet, uint32_t* shader)
 				 OMOD(SQ_ALU_OMOD_OFF),
 				 ALU_INST(SQ_OP2_INST_INTERP_XY),
 				 BANK_SWIZZLE(SQ_ALU_VEC_210),
-				 DST_GPR(0),
+				 DST_GPR(1),
 				 DST_REL(ABSOLUTE),
 				 DST_ELEM(ELEM_X),
 				 CLAMP(0));
-    /* 25 */
+    /* 31 */
+    /* INTERP_XY GPR1.y, PARAM1 */
     shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
 			     SRC0_REL(ABSOLUTE),
 			     SRC0_ELEM(ELEM_X),
 			     SRC0_NEG(0),
-			     SRC1_SEL(ALU_SRC_PARAM_BASE + 0),
+			     SRC1_SEL(ALU_SRC_PARAM_BASE + 1),
 			     SRC1_REL(ABSOLUTE),
 			     SRC1_ELEM(ELEM_X),
 			     SRC1_NEG(0),
@@ -2981,16 +3105,17 @@ int evergreen_comp_ps(RADEONChipFamily ChipSet, uint32_t* shader)
 				 OMOD(SQ_ALU_OMOD_OFF),
 				 ALU_INST(SQ_OP2_INST_INTERP_XY),
 				 BANK_SWIZZLE(SQ_ALU_VEC_210),
-				 DST_GPR(0),
+				 DST_GPR(1),
 				 DST_REL(ABSOLUTE),
 				 DST_ELEM(ELEM_Y),
 				 CLAMP(0));
-    /* 26 */
+    /* 32 */
+    /* INTERP_XY GPR1.z, PARAM1 */
     shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
 			     SRC0_REL(ABSOLUTE),
 			     SRC0_ELEM(ELEM_Y),
 			     SRC0_NEG(0),
-			     SRC1_SEL(ALU_SRC_PARAM_BASE + 0),
+			     SRC1_SEL(ALU_SRC_PARAM_BASE + 1),
 			     SRC1_REL(ABSOLUTE),
 			     SRC1_ELEM(ELEM_X),
 			     SRC1_NEG(0),
@@ -3005,16 +3130,17 @@ int evergreen_comp_ps(RADEONChipFamily ChipSet, uint32_t* shader)
 				 OMOD(SQ_ALU_OMOD_OFF),
 				 ALU_INST(SQ_OP2_INST_INTERP_XY),
 				 BANK_SWIZZLE(SQ_ALU_VEC_210),
-				 DST_GPR(0),
+				 DST_GPR(1),
 				 DST_REL(ABSOLUTE),
 				 DST_ELEM(ELEM_Z),
 				 CLAMP(0));
-    /* 27 */
+    /* 33 */
+    /* INTERP_XY GPR1.w, PARAM1 */
     shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_GPR_BASE + 0),
 			     SRC0_REL(ABSOLUTE),
 			     SRC0_ELEM(ELEM_X),
 			     SRC0_NEG(0),
-			     SRC1_SEL(ALU_SRC_PARAM_BASE + 0),
+			     SRC1_SEL(ALU_SRC_PARAM_BASE + 1),
 			     SRC1_REL(ABSOLUTE),
 			     SRC1_ELEM(ELEM_X),
 			     SRC1_NEG(0),
@@ -3029,16 +3155,17 @@ int evergreen_comp_ps(RADEONChipFamily ChipSet, uint32_t* shader)
 				 OMOD(SQ_ALU_OMOD_OFF),
 				 ALU_INST(SQ_OP2_INST_INTERP_XY),
 				 BANK_SWIZZLE(SQ_ALU_VEC_210),
-				 DST_GPR(0),
+				 DST_GPR(1),
 				 DST_REL(ABSOLUTE),
 				 DST_ELEM(ELEM_W),
 				 CLAMP(0));
 
-    /* 28/29 - src - mask */
+    /* 34/35 */
+    /* SAMPLE RID=1 GPR1, GPR1 */
     shader[i++] = TEX_DWORD0(TEX_INST(SQ_TEX_INST_SAMPLE),
 			     INST_MOD(0),
 			     FETCH_WHOLE_QUAD(0),
-			     RESOURCE_ID(0),
+			     RESOURCE_ID(1),
 			     SRC_GPR(1),
 			     SRC_REL(ABSOLUTE),
 			     ALT_CONST(0),
@@ -3058,36 +3185,6 @@ int evergreen_comp_ps(RADEONChipFamily ChipSet, uint32_t* shader)
     shader[i++] = TEX_DWORD2(OFFSET_X(0),
 			     OFFSET_Y(0),
 			     OFFSET_Z(0),
-			     SAMPLER_ID(0),
-			     SRC_SEL_X(SQ_SEL_X),
-			     SRC_SEL_Y(SQ_SEL_Y),
-			     SRC_SEL_Z(SQ_SEL_0),
-			     SRC_SEL_W(SQ_SEL_1));
-    shader[i++] = TEX_DWORD_PAD;
-    /* 30/31 - mask */
-    shader[i++] = TEX_DWORD0(TEX_INST(SQ_TEX_INST_SAMPLE),
-			     INST_MOD(0),
-			     FETCH_WHOLE_QUAD(0),
-			     RESOURCE_ID(1),
-			     SRC_GPR(0),
-			     SRC_REL(ABSOLUTE),
-                             ALT_CONST(0),
-                             RESOURCE_INDEX_MODE(SQ_CF_INDEX_NONE),
-                             SAMPLER_INDEX_MODE(SQ_CF_INDEX_NONE));
-    shader[i++] = TEX_DWORD1(DST_GPR(0),
-			     DST_REL(ABSOLUTE),
-			     DST_SEL_X(SQ_SEL_X),
-			     DST_SEL_Y(SQ_SEL_Y),
-			     DST_SEL_Z(SQ_SEL_Z),
-			     DST_SEL_W(SQ_SEL_W),
-			     LOD_BIAS(0),
-			     COORD_TYPE_X(TEX_NORMALIZED),
-			     COORD_TYPE_Y(TEX_NORMALIZED),
-			     COORD_TYPE_Z(TEX_NORMALIZED),
-			     COORD_TYPE_W(TEX_NORMALIZED));
-    shader[i++] = TEX_DWORD2(OFFSET_X(0),
-			     OFFSET_Y(0),
-			     OFFSET_Z(0),
 			     SAMPLER_ID(1),
 			     SRC_SEL_X(SQ_SEL_X),
 			     SRC_SEL_Y(SQ_SEL_Y),
@@ -3095,36 +3192,109 @@ int evergreen_comp_ps(RADEONChipFamily ChipSet, uint32_t* shader)
 			     SRC_SEL_W(SQ_SEL_1));
     shader[i++] = TEX_DWORD_PAD;
 
-    /* 32/33 - src - non-mask */
-    shader[i++] = TEX_DWORD0(TEX_INST(SQ_TEX_INST_SAMPLE),
-			     INST_MOD(0),
-			     FETCH_WHOLE_QUAD(0),
-			     RESOURCE_ID(0),
-			     SRC_GPR(0),
-			     SRC_REL(ABSOLUTE),
-			     ALT_CONST(0),
-			     RESOURCE_INDEX_MODE(SQ_CF_INDEX_NONE),
-			     SAMPLER_INDEX_MODE(SQ_CF_INDEX_NONE));
-    shader[i++] = TEX_DWORD1(DST_GPR(0),
-			     DST_REL(ABSOLUTE),
-			     DST_SEL_X(SQ_SEL_X),
-			     DST_SEL_Y(SQ_SEL_Y),
-			     DST_SEL_Z(SQ_SEL_Z),
-			     DST_SEL_W(SQ_SEL_W),
-			     LOD_BIAS(0),
-			     COORD_TYPE_X(TEX_NORMALIZED),
-			     COORD_TYPE_Y(TEX_NORMALIZED),
-			     COORD_TYPE_Z(TEX_NORMALIZED),
-			     COORD_TYPE_W(TEX_NORMALIZED));
-    shader[i++] = TEX_DWORD2(OFFSET_X(0),
-			     OFFSET_Y(0),
-			     OFFSET_Z(0),
-			     SAMPLER_ID(0),
-			     SRC_SEL_X(SQ_SEL_X),
-			     SRC_SEL_Y(SQ_SEL_Y),
-			     SRC_SEL_Z(SQ_SEL_0),
-			     SRC_SEL_W(SQ_SEL_1));
-    shader[i++] = TEX_DWORD_PAD;
+    /* 36 */
+    /* MOV GPR1.x, KC5.x */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_KCACHE0_BASE + 5),
+			     SRC0_REL(ABSOLUTE),
+			     SRC0_ELEM(ELEM_X),
+			     SRC0_NEG(0),
+			     SRC1_SEL(ALU_SRC_GPR_BASE + 0),
+			     SRC1_REL(ABSOLUTE),
+			     SRC1_ELEM(ELEM_X),
+			     SRC1_NEG(0),
+			     INDEX_MODE(SQ_INDEX_AR_X),
+			     PRED_SEL(SQ_PRED_SEL_OFF),
+			     LAST(0));
+    shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0),
+				 SRC1_ABS(0),
+				 UPDATE_EXECUTE_MASK(0),
+				 UPDATE_PRED(0),
+				 WRITE_MASK(1),
+				 OMOD(SQ_ALU_OMOD_OFF),
+				 ALU_INST(SQ_OP2_INST_MOV),
+				 BANK_SWIZZLE(SQ_ALU_VEC_012),
+				 DST_GPR(1),
+				 DST_REL(ABSOLUTE),
+				 DST_ELEM(ELEM_X),
+				 CLAMP(1));
+
+    /* 37 */
+    /* MOV GPR1.y, KC5.y */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_KCACHE0_BASE + 5),
+			     SRC0_REL(ABSOLUTE),
+			     SRC0_ELEM(ELEM_Y),
+			     SRC0_NEG(0),
+			     SRC1_SEL(ALU_SRC_GPR_BASE + 0),
+			     SRC1_REL(ABSOLUTE),
+			     SRC1_ELEM(ELEM_X),
+			     SRC1_NEG(0),
+			     INDEX_MODE(SQ_INDEX_AR_X),
+			     PRED_SEL(SQ_PRED_SEL_OFF),
+			     LAST(0));
+    shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0),
+				 SRC1_ABS(0),
+				 UPDATE_EXECUTE_MASK(0),
+				 UPDATE_PRED(0),
+				 WRITE_MASK(1),
+				 OMOD(SQ_ALU_OMOD_OFF),
+				 ALU_INST(SQ_OP2_INST_MOV),
+				 BANK_SWIZZLE(SQ_ALU_VEC_012),
+				 DST_GPR(1),
+				 DST_REL(ABSOLUTE),
+				 DST_ELEM(ELEM_Y),
+				 CLAMP(1));
+
+    /* 38 */
+    /* MOV GPR1.z, KC5.z */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_KCACHE0_BASE + 5),
+			     SRC0_REL(ABSOLUTE),
+			     SRC0_ELEM(ELEM_Z),
+			     SRC0_NEG(0),
+			     SRC1_SEL(ALU_SRC_GPR_BASE + 0),
+			     SRC1_REL(ABSOLUTE),
+			     SRC1_ELEM(ELEM_X),
+			     SRC1_NEG(0),
+			     INDEX_MODE(SQ_INDEX_AR_X),
+			     PRED_SEL(SQ_PRED_SEL_OFF),
+			     LAST(0));
+    shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0),
+				 SRC1_ABS(0),
+				 UPDATE_EXECUTE_MASK(0),
+				 UPDATE_PRED(0),
+				 WRITE_MASK(1),
+				 OMOD(SQ_ALU_OMOD_OFF),
+				 ALU_INST(SQ_OP2_INST_MOV),
+				 BANK_SWIZZLE(SQ_ALU_VEC_012),
+				 DST_GPR(1),
+				 DST_REL(ABSOLUTE),
+				 DST_ELEM(ELEM_Z),
+				 CLAMP(1));
+
+    /* 39 */
+    /* MOV GPR1.w, KC5.w */
+    shader[i++] = ALU_DWORD0(SRC0_SEL(ALU_SRC_KCACHE0_BASE + 5),
+			     SRC0_REL(ABSOLUTE),
+			     SRC0_ELEM(ELEM_W),
+			     SRC0_NEG(0),
+			     SRC1_SEL(ALU_SRC_GPR_BASE + 0),
+			     SRC1_REL(ABSOLUTE),
+			     SRC1_ELEM(ELEM_X),
+			     SRC1_NEG(0),
+			     INDEX_MODE(SQ_INDEX_AR_X),
+			     PRED_SEL(SQ_PRED_SEL_OFF),
+			     LAST(1));
+    shader[i++] = ALU_DWORD1_OP2(SRC0_ABS(0),
+				 SRC1_ABS(0),
+				 UPDATE_EXECUTE_MASK(0),
+				 UPDATE_PRED(0),
+				 WRITE_MASK(1),
+				 OMOD(SQ_ALU_OMOD_OFF),
+				 ALU_INST(SQ_OP2_INST_MOV),
+				 BANK_SWIZZLE(SQ_ALU_VEC_012),
+				 DST_GPR(1),
+				 DST_REL(ABSOLUTE),
+				 DST_ELEM(ELEM_W),
+				 CLAMP(1));
 
     return i;
 }
commit 5bb04351c43a91a1d60348b7293544da05d75e72
Author: Grigori Goronzy <greg at chown.ath.cx>
Date:   Fri Jul 27 17:31:53 2012 +0200

    EXA/evergreen/ni: optimize non-overlapping Copy
    
    In case dst and src rectangles of a Copy operation in the same surface
    don't overlap, it is safe to skip the scratch surface. This is a
    common case.

diff --git a/src/evergreen_exa.c b/src/evergreen_exa.c
index 86f455d..2cdce0f 100644
--- a/src/evergreen_exa.c
+++ b/src/evergreen_exa.c
@@ -575,7 +575,12 @@ EVERGREENCopy(PixmapPtr pDst,
     if (accel_state->vsync)
 	RADEONVlineHelperSet(pScrn, dstX, dstY, dstX + w, dstY + h);
 
-    if (accel_state->same_surface && accel_state->copy_area) {
+    if (accel_state->same_surface &&
+	    (srcX + w <= dstX || dstX + w <= srcX || srcY + h <= dstY || dstY + h <= srcY)) {
+	EVERGREENDoPrepareCopy(pScrn);
+	EVERGREENAppendCopyVertex(pScrn, srcX, srcY, dstX, dstY, w, h);
+	EVERGREENDoCopyVline(pDst);
+    } else if (accel_state->same_surface && accel_state->copy_area) {
 	uint32_t orig_dst_domain = accel_state->dst_obj.domain;
 	uint32_t orig_src_domain = accel_state->src_obj[0].domain;
 	uint32_t orig_src_tiling_flags = accel_state->src_obj[0].tiling_flags;
commit c08e09b7bec441c4bf93b4cae4de1260754bf940
Author: Grigori Goronzy <greg at chown.ath.cx>
Date:   Sat May 18 13:46:03 2013 +0200

    Fix RADEON_FALLBACK logging

diff --git a/src/evergreen_exa.c b/src/evergreen_exa.c
index bd57135..86f455d 100644
--- a/src/evergreen_exa.c
+++ b/src/evergreen_exa.c
@@ -1134,7 +1134,7 @@ static Bool EVERGREENPrepareComposite(int op, PicturePtr pSrcPicture,
     if (!pSrc) {
 	pSrc = RADEONSolidPixmap(pScreen, pSrcPicture->pSourcePict->solidFill.color);
 	if (!pSrc)
-	    RADEON_FALLBACK("Failed to create solid scratch pixmap\n");
+	    RADEON_FALLBACK(("Failed to create solid scratch pixmap\n"));
     }
 
     dst_obj.bo = radeon_get_pixmap_bo(pDst);
@@ -1165,7 +1165,7 @@ static Bool EVERGREENPrepareComposite(int op, PicturePtr pSrcPicture,
 	    if (!pMask) {
 		if (!pSrcPicture->pDrawable)
 		    pScreen->DestroyPixmap(pSrc);
-		RADEON_FALLBACK("Failed to create solid scratch pixmap\n");
+		RADEON_FALLBACK(("Failed to create solid scratch pixmap\n"));
 	    }
 	}
 	mask_obj.bo = radeon_get_pixmap_bo(pMask);
diff --git a/src/r600_exa.c b/src/r600_exa.c
index fbb1383..b243234 100644
--- a/src/r600_exa.c
+++ b/src/r600_exa.c
@@ -1179,7 +1179,7 @@ static Bool R600PrepareComposite(int op, PicturePtr pSrcPicture,
     if (!pSrc) {
 	pSrc = RADEONSolidPixmap(pScreen, pSrcPicture->pSourcePict->solidFill.color);
 	if (!pSrc)
-	    RADEON_FALLBACK("Failed to create solid scratch pixmap\n");
+	    RADEON_FALLBACK(("Failed to create solid scratch pixmap\n"));
     }
 
     dst_obj.bo = radeon_get_pixmap_bo(pDst);
@@ -1211,7 +1211,7 @@ static Bool R600PrepareComposite(int op, PicturePtr pSrcPicture,
 	    if (!pMask) {
 		if (!pSrcPicture->pDrawable)
 		    pScreen->DestroyPixmap(pSrc);
-		RADEON_FALLBACK("Failed to create solid scratch pixmap\n");
+		RADEON_FALLBACK(("Failed to create solid scratch pixmap\n"));
 	    }
 	}
 
diff --git a/src/radeon_exa_render.c b/src/radeon_exa_render.c
index 1f6b86d..9510f7f 100644
--- a/src/radeon_exa_render.c
+++ b/src/radeon_exa_render.c
@@ -602,7 +602,7 @@ static Bool R100PrepareComposite(int op,
     if (!pSrc) {
 	pSrc = RADEONSolidPixmap(pScreen, cpu_to_le32(pSrcPicture->pSourcePict->solidFill.color));
 	if (!pSrc)
-	    RADEON_FALLBACK("Failed to create solid scratch pixmap\n");
+	    RADEON_FALLBACK(("Failed to create solid scratch pixmap\n"));
     }
 
     if (((dst_pitch >> pixel_shift) & 0x7) != 0)
@@ -616,7 +616,7 @@ static Bool R100PrepareComposite(int op,
 	if (!pMask) {
 	    if (!pSrcPicture->pDrawable)
 		pScreen->DestroyPixmap(pSrc);
-	    RADEON_FALLBACK("Failed to create solid scratch pixmap\n");
+	    RADEON_FALLBACK(("Failed to create solid scratch pixmap\n"));
 	}
     }
 
@@ -967,7 +967,7 @@ static Bool R200PrepareComposite(int op, PicturePtr pSrcPicture,
     if (!pSrc) {
 	pSrc = RADEONSolidPixmap(pScreen, cpu_to_le32(pSrcPicture->pSourcePict->solidFill.color));
 	if (!pSrc)
-	    RADEON_FALLBACK("Failed to create solid scratch pixmap\n");
+	    RADEON_FALLBACK(("Failed to create solid scratch pixmap\n"));
     }
 
     if (!RADEONSetupSourceTile(pSrcPicture, pSrc, FALSE, TRUE))
@@ -978,7 +978,7 @@ static Bool R200PrepareComposite(int op, PicturePtr pSrcPicture,
 	if (!pMask) {
 	    if (!pSrcPicture->pDrawable)
 		pScreen->DestroyPixmap(pSrc);
-	    RADEON_FALLBACK("Failed to create solid scratch pixmap\n");
+	    RADEON_FALLBACK(("Failed to create solid scratch pixmap\n"));
 	}
     }
 
@@ -1459,7 +1459,7 @@ static Bool R300PrepareComposite(int op, PicturePtr pSrcPicture,
     if (!pSrc) {
 	pSrc = RADEONSolidPixmap(pScreen, cpu_to_le32(pSrcPicture->pSourcePict->solidFill.color));
 	if (!pSrc)
-	    RADEON_FALLBACK("Failed to create solid scratch pixmap\n");
+	    RADEON_FALLBACK(("Failed to create solid scratch pixmap\n"));
     }
 
     if (!RADEONSetupSourceTile(pSrcPicture, pSrc, TRUE, FALSE))
@@ -1470,7 +1470,7 @@ static Bool R300PrepareComposite(int op, PicturePtr pSrcPicture,
 	if (!pMask) {
 	    if (!pSrcPicture->pDrawable)
 		pScreen->DestroyPixmap(pSrc);
-	    RADEON_FALLBACK("Failed to create solid scratch pixmap\n");
+	    RADEON_FALLBACK(("Failed to create solid scratch pixmap\n"));
 	}
     }
 


More information about the xorg-commit mailing list