xf86-video-intel: Branch 'xf86-video-intel-2.3-branch' - 17 commits - src/exa_sf.g4a src/exa_sf.g4b src/exa_sf_mask.g4a src/exa_sf_mask.g4b src/exa_sf_mask_prog.h src/exa_sf_prog.h src/exa_sf_rotation.g4a src/exa_sf_rotation_prog.h src/exa_wm_affine.g4i src/exa_wm_ca.g4a src/exa_wm_ca.g4b src/exa_wm_ca_srcalpha.g4a src/exa_wm_ca_srcalpha.g4b src/exa_wm.g4i src/exa_wm_mask_affine.g4a src/exa_wm_mask_affine.g4b src/exa_wm_maskca.g4a src/exa_wm_maskca_prog.h src/exa_wm_maskca_srcalpha.g4a src/exa_wm_maskca_srcalpha_prog.h src/exa_wm_masknoca.g4a src/exa_wm_masknoca_prog.h src/exa_wm_mask_projective.g4a src/exa_wm_mask_projective.g4b src/exa_wm_mask_sample_a.g4a src/exa_wm_mask_sample_a.g4b src/exa_wm_mask_sample_argb.g4a src/exa_wm_mask_sample_argb.g4b src/exa_wm_noca.g4a src/exa_wm_noca.g4b src/exa_wm_nomask.g4a src/exa_wm_nomask_prog.h src/exa_wm_projective.g4i src/exa_wm_rotation.g4a src/exa_wm_rotation_prog.h src/exa_wm_src_affine.g4a src/exa_wm_src_affine.g4b src/exa_wm_src_projective.g4a src/exa_wm_src_projective.g4b src/exa_wm_src_sample_a.g4a src/exa_wm_src_sample_a.g4b src/exa_wm_src_sample_argb.g4a src/exa_wm_src_sample_argb.g4b src/exa_wm_write.g4a src/exa_wm_write.g4b src/exa_wm_xy.g4a src/exa_wm_xy.g4b src/i810_reg.h src/i830_quirks.c src/i965_render.c src/i965_video.c src/Makefile.am src/packed_yuv_sf.g4b src/packed_yuv_wm.g4a src/packed_yuv_wm.g4b src/sf_prog.h src/wm_prog.h

Zhenyu Wang zhen at kemper.freedesktop.org
Sun Apr 6 18:27:33 PDT 2008


 src/Makefile.am                   |   93 ++++----
 src/exa_sf.g4a                    |   78 ++++++-
 src/exa_sf.g4b                    |   15 +
 src/exa_sf_mask.g4a               |  105 ++++++----
 src/exa_sf_mask.g4b               |   15 +
 src/exa_sf_mask_prog.h            |   25 --
 src/exa_sf_prog.h                 |   17 -
 src/exa_sf_rotation.g4a           |   55 -----
 src/exa_sf_rotation_prog.h        |   20 -
 src/exa_wm.g4i                    |  122 +++++++++++
 src/exa_wm_affine.g4i             |   44 ++++
 src/exa_wm_ca.g4a                 |   38 +++
 src/exa_wm_ca.g4b                 |    4 
 src/exa_wm_ca_srcalpha.g4a        |   37 +++
 src/exa_wm_ca_srcalpha.g4b        |    4 
 src/exa_wm_mask_affine.g4a        |   41 +++
 src/exa_wm_mask_affine.g4b        |    8 
 src/exa_wm_mask_projective.g4a    |   53 +++++
 src/exa_wm_mask_projective.g4b    |   16 +
 src/exa_wm_mask_sample_a.g4a      |   48 ++++
 src/exa_wm_mask_sample_a.g4b      |    2 
 src/exa_wm_mask_sample_argb.g4a   |   48 ++++
 src/exa_wm_mask_sample_argb.g4b   |    2 
 src/exa_wm_maskca.g4a             |  228 ----------------------
 src/exa_wm_maskca_prog.h          |   95 ---------
 src/exa_wm_maskca_srcalpha.g4a    |  228 ----------------------
 src/exa_wm_maskca_srcalpha_prog.h |   95 ---------
 src/exa_wm_masknoca.g4a           |  228 ----------------------
 src/exa_wm_masknoca_prog.h        |   95 ---------
 src/exa_wm_noca.g4a               |   38 +++
 src/exa_wm_noca.g4b               |    4 
 src/exa_wm_nomask.g4a             |   96 +++------
 src/exa_wm_nomask_prog.h          |   70 ------
 src/exa_wm_projective.g4i         |   51 ++++
 src/exa_wm_rotation.g4a           |  184 -----------------
 src/exa_wm_rotation_prog.h        |   70 ------
 src/exa_wm_src_affine.g4a         |   45 ++++
 src/exa_wm_src_affine.g4b         |    8 
 src/exa_wm_src_projective.g4a     |   49 ++++
 src/exa_wm_src_projective.g4b     |   16 +
 src/exa_wm_src_sample_a.g4a       |   47 ++++
 src/exa_wm_src_sample_a.g4b       |    2 
 src/exa_wm_src_sample_argb.g4a    |   47 ++++
 src/exa_wm_src_sample_argb.g4b    |    2 
 src/exa_wm_write.g4a              |   76 +++++++
 src/exa_wm_write.g4b              |   18 +
 src/exa_wm_xy.g4a                 |   52 +++++
 src/exa_wm_xy.g4b                 |    4 
 src/i810_reg.h                    |    1 
 src/i830_quirks.c                 |   12 +
 src/i965_render.c                 |  393 +++++++++++++++++++++++++-------------
 src/i965_video.c                  |    4 
 src/packed_yuv_sf.g4b             |   17 +
 src/packed_yuv_wm.g4a             |   32 +--
 src/packed_yuv_wm.g4b             |   82 +++++++
 src/sf_prog.h                     |   17 -
 src/wm_prog.h                     |   82 -------
 57 files changed, 1565 insertions(+), 1813 deletions(-)

New commits:
commit 24248097988775d62f6c416f2988e74d31c91cd0
Author: Bryce Harrington <bryce at bryceharrington.org>
Date:   Mon Apr 7 17:26:49 2008 +0800

    Quirks from Ubuntu/Dell
    
    FD bug #15353. Launchpad bug ID is available for reference.

diff --git a/src/i830_quirks.c b/src/i830_quirks.c
index f29083b..24c9658 100644
--- a/src/i830_quirks.c
+++ b/src/i830_quirks.c
@@ -218,8 +218,14 @@ static i830_quirk i830_quirk_list[] = {
 
     /* Dell Latitude X1 */
     { PCI_CHIP_I915_GM, 0x1028, 0x01a3, quirk_ignore_tv },
+    /* Dell Latitude X1 / D630 (LP: #197740) */
+    { PCI_CHIP_I915_GM, 0x1028, 0x01f9, quirk_ignore_tv },
     /* Dell XPS 1330 */
     { PCI_CHIP_I965_GM, 0x1028, 0x0209, quirk_ignore_tv },
+    /* Dell Inspiron 1535 */
+    { PCI_CHIP_I965_GM, 0x1028, 0x0254, quirk_ignore_tv },
+    /* Dell Inspiron 1735 */
+    { PCI_CHIP_I965_GM, 0x1028, 0x0256, quirk_ignore_tv },
 
     /* Lenovo Napa TV (use dmi)*/
     { PCI_CHIP_I945_GM, 0x17aa, SUBSYS_ANY, quirk_lenovo_tv_dmi },
@@ -249,6 +255,9 @@ static i830_quirk i830_quirk_list[] = {
     /* HP Compaq 6730s has no TV output */
     { PCI_CHIP_IGD_GM, 0x103c, 0x30e8, quirk_ignore_tv },
 
+    /* Dell Inspiron 510m needs pipe A force quirk */
+    { PCI_CHIP_I855_GM, 0x1028, 0x0164, quirk_pipea_force },
+
     /* Thinkpad R31 needs pipe A force quirk */
     { PCI_CHIP_I830_M, 0x1014, 0x0505, quirk_pipea_force },
     /* Dell Latitude D500 needs pipe A force quirk */
@@ -266,6 +275,9 @@ static i830_quirk i830_quirk_list[] = {
     /* Sony vaio PCG-r600HFP (fix bug 13722) */
     { PCI_CHIP_I830_M, 0x104d, 0x8100, quirk_ivch_dvob },
 
+    /* Intel 945GM hardware (See LP: #152416) */
+    { PCI_CHIP_I945_GM, 0x1584, 0x9900, quirk_ignore_tv },
+
     { 0, 0, 0, NULL },
 };
 
commit 19320b33f06ca07a89f58d689a101f44ecfcd03b
Author: Keith Packard <keithp at keithp.com>
Date:   Fri Apr 4 19:17:55 2008 -0700

    Forgot to build exa_wm_src_sample_argb.g4b
    (cherry picked from commit 0147c1c84872f7a109721a53d88a539932d9be81)

diff --git a/src/exa_wm_src_sample_argb.g4b b/src/exa_wm_src_sample_argb.g4b
index ddbb1db..c5b9274 100644
--- a/src/exa_wm_src_sample_argb.g4b
+++ b/src/exa_wm_src_sample_argb.g4b
@@ -1 +1,2 @@
+   { 0x00000201, 0x20080061, 0x00000000, 0x00000000 },
    { 0x01800031, 0x21c01d29, 0x008d0000, 0x02580001 },
commit 1b9cd443941ea80507df78520be8798871c11647
Author: Keith Packard <keithp at keithp.com>
Date:   Tue Apr 1 00:54:13 2008 -0700

    Remove extra flushing
    (cherry picked from commit 90886f9a602d58b754e9a8d0f1a9c40803d34fa2)

diff --git a/src/exa_wm_src_sample_argb.g4b b/src/exa_wm_src_sample_argb.g4b
index c5b9274..ddbb1db 100644
--- a/src/exa_wm_src_sample_argb.g4b
+++ b/src/exa_wm_src_sample_argb.g4b
@@ -1,2 +1 @@
-   { 0x00000201, 0x20080061, 0x00000000, 0x00000000 },
    { 0x01800031, 0x21c01d29, 0x008d0000, 0x02580001 },
diff --git a/src/i965_render.c b/src/i965_render.c
index 921ea80..96082bb 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -318,7 +318,7 @@ static const uint32_t sip_kernel_static[][4] = {
  */
 
 #define SF_KERNEL_NUM_GRF  16
-#define SF_MAX_THREADS	   1
+#define SF_MAX_THREADS	   2
 
 static const uint32_t sf_kernel_static[][4] = {
 #include "exa_sf.g4b"
@@ -330,7 +330,7 @@ static const uint32_t sf_kernel_static_mask[][4] = {
 
 /* ps kernels */
 #define PS_KERNEL_NUM_GRF   32
-#define PS_MAX_THREADS	    32
+#define PS_MAX_THREADS	    48
 #define PS_SCRATCH_SPACE    1024
 #define PS_SCRATCH_SPACE_LOG	0   /* log2 (PS_SCRATCH_SPACE) - 10  (1024 is 0, 2048 is 1) */
 
@@ -931,7 +931,7 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
     wm_state->thread0.kernel_start_pointer =
 	(state_base_offset + ps_kernel_offset) >> 6;
     wm_state->thread0.grf_reg_count = BRW_GRF_BLOCKS(PS_KERNEL_NUM_GRF);
-    wm_state->thread1.single_program_flow = 1;
+    wm_state->thread1.single_program_flow = 0;
     if (!pMask)
 	wm_state->thread1.binding_table_entry_count = 2; /* 1 tex and fb */
     else
@@ -1240,14 +1240,6 @@ i965_composite(PixmapPtr pDst, int srcX, int srcY, int maskX, int maskY,
 	}
     }
 
-    {
-	BEGIN_BATCH(2);
-	OUT_BATCH(MI_FLUSH |
-		  MI_STATE_INSTRUCTION_CACHE_FLUSH |
-		  BRW_MI_GLOBAL_SNAPSHOT_RESET);
-	OUT_BATCH(MI_NOOP);
-	ADVANCE_BATCH();
-    }
     /* Wait for any existing composite rectangles to land before we overwrite
      * the VB with the next one.
      */
commit 6809ac3f309364778fcdd46629a16b7b92e0b953
Author: Keith Packard <keithp at keithp.com>
Date:   Tue Apr 1 00:16:05 2008 -0700

    Add projective versions of the PS kernels
    (cherry picked from commit 0836373dc6e2f8612f120074980561f7ac11f6f7)

diff --git a/src/i965_render.c b/src/i965_render.c
index dc5bd5e..921ea80 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -348,7 +348,7 @@ static const uint32_t ps_kernel_static_nomask_projective [][4] = {
 #include "exa_wm_write.g4b"
 };
 
-static const uint32_t ps_kernel_static_maskca [][4] = {
+static const uint32_t ps_kernel_static_maskca_affine [][4] = {
 #include "exa_wm_xy.g4b"
 #include "exa_wm_src_affine.g4b"
 #include "exa_wm_src_sample_argb.g4b"
@@ -358,7 +358,17 @@ static const uint32_t ps_kernel_static_maskca [][4] = {
 #include "exa_wm_write.g4b"
 };
 
-static const uint32_t ps_kernel_static_maskca_srcalpha [][4] = {
+static const uint32_t ps_kernel_static_maskca_projective [][4] = {
+#include "exa_wm_xy.g4b"
+#include "exa_wm_src_projective.g4b"
+#include "exa_wm_src_sample_argb.g4b"
+#include "exa_wm_mask_projective.g4b"
+#include "exa_wm_mask_sample_argb.g4b"
+#include "exa_wm_ca.g4b"
+#include "exa_wm_write.g4b"
+};
+
+static const uint32_t ps_kernel_static_maskca_srcalpha_affine [][4] = {
 #include "exa_wm_xy.g4b"
 #include "exa_wm_src_affine.g4b"
 #include "exa_wm_src_sample_a.g4b"
@@ -368,7 +378,17 @@ static const uint32_t ps_kernel_static_maskca_srcalpha [][4] = {
 #include "exa_wm_write.g4b"
 };
 
-static const uint32_t ps_kernel_static_masknoca [][4] = {
+static const uint32_t ps_kernel_static_maskca_srcalpha_projective [][4] = {
+#include "exa_wm_xy.g4b"
+#include "exa_wm_src_projective.g4b"
+#include "exa_wm_src_sample_a.g4b"
+#include "exa_wm_mask_projective.g4b"
+#include "exa_wm_mask_sample_argb.g4b"
+#include "exa_wm_ca_srcalpha.g4b"
+#include "exa_wm_write.g4b"
+};
+
+static const uint32_t ps_kernel_static_masknoca_affine [][4] = {
 #include "exa_wm_xy.g4b"
 #include "exa_wm_src_affine.g4b"
 #include "exa_wm_src_sample_argb.g4b"
@@ -378,6 +398,16 @@ static const uint32_t ps_kernel_static_masknoca [][4] = {
 #include "exa_wm_write.g4b"
 };
 
+static const uint32_t ps_kernel_static_masknoca_projective [][4] = {
+#include "exa_wm_xy.g4b"
+#include "exa_wm_src_projective.g4b"
+#include "exa_wm_src_sample_argb.g4b"
+#include "exa_wm_mask_projective.g4b"
+#include "exa_wm_mask_sample_a.g4b"
+#include "exa_wm_noca.g4b"
+#include "exa_wm_write.g4b"
+};
+
 static uint32_t 
 i965_get_card_format(PicturePtr pPict)
 {
@@ -484,15 +514,22 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
 	if (pMaskPicture->componentAlpha && 
                 PICT_FORMAT_RGB(pMaskPicture->format)) {
             if (i965_blend_op[op].src_alpha) {
-                next_offset = ps_kernel_offset + 
-                    sizeof(ps_kernel_static_maskca_srcalpha);
+		if (is_affine)
+		    next_offset = ps_kernel_offset + sizeof(ps_kernel_static_maskca_srcalpha_affine);
+		else
+		    next_offset = ps_kernel_offset + sizeof(ps_kernel_static_maskca_srcalpha_projective);
             } else {
-                next_offset = ps_kernel_offset + 
-                    sizeof(ps_kernel_static_maskca);
+		if (is_affine)
+		    next_offset = ps_kernel_offset + sizeof(ps_kernel_static_maskca_affine);
+		else
+		    next_offset = ps_kernel_offset + sizeof(ps_kernel_static_maskca_projective);
             }
-        } else
-	    next_offset = ps_kernel_offset + 
-                          sizeof(ps_kernel_static_masknoca);
+        } else {
+	    if (is_affine)
+		next_offset = ps_kernel_offset + sizeof(ps_kernel_static_masknoca_affine);
+	    else
+		next_offset = ps_kernel_offset + sizeof(ps_kernel_static_masknoca_projective);
+	}
     } else {
 	if (is_affine)
 	    next_offset = ps_kernel_offset + sizeof (ps_kernel_static_nomask_affine);
@@ -865,22 +902,28 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
     if (pMask) {
 	if (pMaskPicture->componentAlpha && 
                 PICT_FORMAT_RGB(pMaskPicture->format)) {
-            if (i965_blend_op[op].src_alpha) 
-                memcpy(ps_kernel, ps_kernel_static_maskca_srcalpha,
-                        sizeof (ps_kernel_static_maskca_srcalpha));
-            else
-                memcpy(ps_kernel, ps_kernel_static_maskca,
-                        sizeof (ps_kernel_static_maskca));
-        } else
-   	    memcpy(ps_kernel, ps_kernel_static_masknoca,
-		   sizeof (ps_kernel_static_masknoca));
+            if (i965_blend_op[op].src_alpha) {
+		if (is_affine)
+		    memcpy(ps_kernel, ps_kernel_static_maskca_srcalpha_affine, sizeof (ps_kernel_static_maskca_srcalpha_affine));
+		else
+                    memcpy(ps_kernel, ps_kernel_static_maskca_srcalpha_projective, sizeof (ps_kernel_static_maskca_srcalpha_projective));
+            } else {
+		if (is_affine)
+		    memcpy(ps_kernel, ps_kernel_static_maskca_affine, sizeof (ps_kernel_static_maskca_affine));
+		else
+		    memcpy(ps_kernel, ps_kernel_static_maskca_projective, sizeof (ps_kernel_static_maskca_projective));
+	    }
+        } else {
+	    if (is_affine)
+		memcpy(ps_kernel, ps_kernel_static_masknoca_affine, sizeof (ps_kernel_static_masknoca_affine));
+	    else
+		memcpy(ps_kernel, ps_kernel_static_masknoca_projective, sizeof (ps_kernel_static_masknoca_projective));
+	}
     } else {
 	if (is_affine)
-	    memcpy(ps_kernel, ps_kernel_static_nomask_affine,
-		   sizeof (ps_kernel_static_nomask_affine));
+	    memcpy(ps_kernel, ps_kernel_static_nomask_affine, sizeof (ps_kernel_static_nomask_affine));
 	else
-	    memcpy(ps_kernel, ps_kernel_static_nomask_projective,
-		   sizeof (ps_kernel_static_nomask_projective));
+	    memcpy(ps_kernel, ps_kernel_static_nomask_projective, sizeof (ps_kernel_static_nomask_projective));
     }
 
     wm_state = &wm_state_local;
commit a5264efc7f36971aa06869feed9980a154cdddbe
Author: Keith Packard <keithp at keithp.com>
Date:   Tue Apr 1 00:06:08 2008 -0700

    Shrink WM thread to 32 registers and 1024 scratch space.
    
    Saving registers means we can run more in parallel.
    (cherry picked from commit bfd803e085e938866efb45c67a79facef78ec399)

diff --git a/src/exa_wm.g4i b/src/exa_wm.g4i
index 724ef2b..10e630e 100644
--- a/src/exa_wm.g4i
+++ b/src/exa_wm.g4i
@@ -57,7 +57,7 @@ define(`mask_dw_dy', `g6.4<0,1,0>F')
 define(`mask_wo',    `g6.12<0,1,0>F')
 
 /*
- * Local variables
+ * Local variables. Pairs must be aligned on even reg boundary
  */
 
 /* this holds the X dest coordinates */
@@ -71,14 +71,14 @@ define(`dst_y_0',   `dst_y')
 define(`dst_y_1',   `g11')
 
 /* When computing x * dn/dx, use this */
-define(`temp_x',    `g34')
+define(`temp_x',    `g30')
 define(`temp_x_0',  `temp_x')
-define(`temp_x_1',  `g35')
+define(`temp_x_1',  `g31')
 
 /* When computing y * dn/dy, use this */
-define(`temp_y',    `g32')
+define(`temp_y',    `g28')
 define(`temp_y_0',  temp_y)
-define(`temp_y_1',  `g33')
+define(`temp_y_1',  `g29')
 
 /* when loading x/y, use these to hold them in UW format */
 define(`temp_x_uw', temp_x)
@@ -90,33 +90,33 @@ define(`src_msg_ind',`1')
 define(`src_u',	    `m2')
 define(`src_v',	    `m4')
 define(`src_w',	    `g12')
-define(`src_w_0',   `g12')
+define(`src_w_0',   `src_w')
 define(`src_w_1',   `g13')
 
 define(`mask_msg',  `m7')
 define(`mask_msg_ind',`7')
 define(`mask_u',    `m8')
 define(`mask_v',    `m10')
-define(`mask_w',    `g14')
-define(`mask_w_0',  `g14')
-define(`mask_w_1',  `g15')
+define(`mask_w',    `src_w')
+define(`mask_w_0',  `src_w_0')
+define(`mask_w_1',  `src_w_1')
 
 /* sample src to these registers */
-define(`src_sample0',	`g16')
-define(`src_sample1',	`g17')
-define(`src_sample2',	`g18')
-define(`src_sample3',	`g19')
-define(`src_sample4',	`g20')
-define(`src_sample5',	`g21')
-define(`src_sample6',	`g22')
-define(`src_sample7',	`g23')
+define(`src_sample0',	`g14')
+define(`src_sample1',	`g15')
+define(`src_sample2',	`g16')
+define(`src_sample3',	`g17')
+define(`src_sample4',	`g18')
+define(`src_sample5',	`g19')
+define(`src_sample6',	`g20')
+define(`src_sample7',	`g21')
 
 /* sample mask to these registers */
-define(`mask_sample0',	`g24')
-define(`mask_sample1',	`g25')
-define(`mask_sample2',	`g26')
-define(`mask_sample3',	`g27')
-define(`mask_sample4',	`g28')
-define(`mask_sample5',	`g29')
-define(`mask_sample6',	`g30')
-define(`mask_sample7',	`g31')
+define(`mask_sample0',	`g22')
+define(`mask_sample1',	`g23')
+define(`mask_sample2',	`g24')
+define(`mask_sample3',	`g25')
+define(`mask_sample4',	`g26')
+define(`mask_sample5',	`g27')
+define(`mask_sample6',	`g28')
+define(`mask_sample7',	`g29')
diff --git a/src/exa_wm_ca.g4b b/src/exa_wm_ca.g4b
index 28bd6c6..372e8b2 100644
--- a/src/exa_wm_ca.g4b
+++ b/src/exa_wm_ca.g4b
@@ -1,4 +1,4 @@
+   { 0x00802041, 0x21c077bd, 0x008d01c0, 0x008d02c0 },
    { 0x00802041, 0x220077bd, 0x008d0200, 0x008d0300 },
    { 0x00802041, 0x224077bd, 0x008d0240, 0x008d0340 },
    { 0x00802041, 0x228077bd, 0x008d0280, 0x008d0380 },
-   { 0x00802041, 0x22c077bd, 0x008d02c0, 0x008d03c0 },
diff --git a/src/exa_wm_ca_srcalpha.g4b b/src/exa_wm_ca_srcalpha.g4b
index 94f1516..963d676 100644
--- a/src/exa_wm_ca_srcalpha.g4b
+++ b/src/exa_wm_ca_srcalpha.g4b
@@ -1,4 +1,4 @@
-   { 0x00802041, 0x220077bd, 0x008d0300, 0x008d02c0 },
-   { 0x00802041, 0x224077bd, 0x008d0340, 0x008d02c0 },
-   { 0x00802041, 0x228077bd, 0x008d0380, 0x008d02c0 },
-   { 0x00802041, 0x22c077bd, 0x008d03c0, 0x008d02c0 },
+   { 0x00802041, 0x21c077bd, 0x008d02c0, 0x008d0280 },
+   { 0x00802041, 0x220077bd, 0x008d0300, 0x008d0280 },
+   { 0x00802041, 0x224077bd, 0x008d0340, 0x008d0280 },
+   { 0x00802041, 0x228077bd, 0x008d0380, 0x008d0280 },
diff --git a/src/exa_wm_mask_affine.g4b b/src/exa_wm_mask_affine.g4b
index 35dec6f..14a5451 100644
--- a/src/exa_wm_mask_affine.g4b
+++ b/src/exa_wm_mask_affine.g4b
@@ -1,8 +1,8 @@
-   { 0x00802041, 0x244077bd, 0x008d0100, 0x000000a0 },
-   { 0x00802041, 0x240077bd, 0x008d0140, 0x000000a4 },
-   { 0x00802040, 0x244077bd, 0x008d0440, 0x008d0400 },
-   { 0x00802040, 0x210077be, 0x008d0440, 0x000000ac },
-   { 0x00802041, 0x244077bd, 0x008d0100, 0x000000b0 },
-   { 0x00802041, 0x240077bd, 0x008d0140, 0x000000b4 },
-   { 0x00802040, 0x244077bd, 0x008d0440, 0x008d0400 },
-   { 0x00802040, 0x214077be, 0x008d0440, 0x000000bc },
+   { 0x00802041, 0x23c077bd, 0x008d0100, 0x000000a0 },
+   { 0x00802041, 0x238077bd, 0x008d0140, 0x000000a4 },
+   { 0x00802040, 0x23c077bd, 0x008d03c0, 0x008d0380 },
+   { 0x00802040, 0x210077be, 0x008d03c0, 0x000000ac },
+   { 0x00802041, 0x23c077bd, 0x008d0100, 0x000000b0 },
+   { 0x00802041, 0x238077bd, 0x008d0140, 0x000000b4 },
+   { 0x00802040, 0x23c077bd, 0x008d03c0, 0x008d0380 },
+   { 0x00802040, 0x214077be, 0x008d03c0, 0x000000bc },
diff --git a/src/exa_wm_mask_projective.g4b b/src/exa_wm_mask_projective.g4b
index 0684882..78cb9ae 100644
--- a/src/exa_wm_mask_projective.g4b
+++ b/src/exa_wm_mask_projective.g4b
@@ -1,16 +1,16 @@
-   { 0x00802041, 0x244077bd, 0x008d0100, 0x000000c0 },
-   { 0x00802041, 0x240077bd, 0x008d0140, 0x000000c4 },
-   { 0x00802040, 0x244077bd, 0x008d0440, 0x008d0400 },
-   { 0x00802040, 0x244077bd, 0x008d0440, 0x000000cc },
-   { 0x00600031, 0x21c01fbd, 0x008d0440, 0x01110001 },
-   { 0x00600031, 0x21e01fbd, 0x008d0460, 0x01110001 },
-   { 0x00802041, 0x244077bd, 0x008d0100, 0x000000a0 },
-   { 0x00802041, 0x240077bd, 0x008d0140, 0x000000a4 },
-   { 0x00802040, 0x244077bd, 0x008d0440, 0x008d0400 },
-   { 0x00802040, 0x244077bd, 0x008d0440, 0x000000ac },
-   { 0x00802041, 0x210077be, 0x008d0440, 0x008d01c0 },
-   { 0x00802041, 0x244077bd, 0x008d0100, 0x000000b0 },
-   { 0x00802041, 0x240077bd, 0x008d0140, 0x000000b4 },
-   { 0x00802040, 0x244077bd, 0x008d0440, 0x008d0400 },
-   { 0x00802040, 0x244077bd, 0x008d0440, 0x000000bc },
-   { 0x00802041, 0x214077be, 0x008d0440, 0x008d01c0 },
+   { 0x00802041, 0x23c077bd, 0x008d0100, 0x000000c0 },
+   { 0x00802041, 0x238077bd, 0x008d0140, 0x000000c4 },
+   { 0x00802040, 0x23c077bd, 0x008d03c0, 0x008d0380 },
+   { 0x00802040, 0x23c077bd, 0x008d03c0, 0x000000cc },
+   { 0x00600031, 0x21801fbd, 0x008d03c0, 0x01110001 },
+   { 0x00600031, 0x21a01fbd, 0x008d03e0, 0x01110001 },
+   { 0x00802041, 0x23c077bd, 0x008d0100, 0x000000a0 },
+   { 0x00802041, 0x238077bd, 0x008d0140, 0x000000a4 },
+   { 0x00802040, 0x23c077bd, 0x008d03c0, 0x008d0380 },
+   { 0x00802040, 0x23c077bd, 0x008d03c0, 0x000000ac },
+   { 0x00802041, 0x210077be, 0x008d03c0, 0x008d0180 },
+   { 0x00802041, 0x23c077bd, 0x008d0100, 0x000000b0 },
+   { 0x00802041, 0x238077bd, 0x008d0140, 0x000000b4 },
+   { 0x00802040, 0x23c077bd, 0x008d03c0, 0x008d0380 },
+   { 0x00802040, 0x23c077bd, 0x008d03c0, 0x000000bc },
+   { 0x00802041, 0x214077be, 0x008d03c0, 0x008d0180 },
diff --git a/src/exa_wm_mask_sample_a.g4b b/src/exa_wm_mask_sample_a.g4b
index 01fc8d5..018bd36 100644
--- a/src/exa_wm_mask_sample_a.g4b
+++ b/src/exa_wm_mask_sample_a.g4b
@@ -1,2 +1,2 @@
    { 0x00000201, 0x20080061, 0x00000000, 0x00007000 },
-   { 0x07800031, 0x23c01d29, 0x008d0000, 0x02520102 },
+   { 0x07800031, 0x23801d29, 0x008d0000, 0x02520102 },
diff --git a/src/exa_wm_mask_sample_argb.g4b b/src/exa_wm_mask_sample_argb.g4b
index 97d3803..b159cba 100644
--- a/src/exa_wm_mask_sample_argb.g4b
+++ b/src/exa_wm_mask_sample_argb.g4b
@@ -1,2 +1,2 @@
    { 0x00000201, 0x20080061, 0x00000000, 0x00000000 },
-   { 0x07800031, 0x23001d29, 0x008d0000, 0x02580102 },
+   { 0x07800031, 0x22c01d29, 0x008d0000, 0x02580102 },
diff --git a/src/exa_wm_noca.g4b b/src/exa_wm_noca.g4b
index 1c9d948..1506334 100644
--- a/src/exa_wm_noca.g4b
+++ b/src/exa_wm_noca.g4b
@@ -1,4 +1,4 @@
-   { 0x00802041, 0x220077bd, 0x008d0200, 0x008d03c0 },
-   { 0x00802041, 0x224077bd, 0x008d0240, 0x008d03c0 },
-   { 0x00802041, 0x228077bd, 0x008d0280, 0x008d03c0 },
-   { 0x00802041, 0x22c077bd, 0x008d02c0, 0x008d03c0 },
+   { 0x00802041, 0x21c077bd, 0x008d01c0, 0x008d0380 },
+   { 0x00802041, 0x220077bd, 0x008d0200, 0x008d0380 },
+   { 0x00802041, 0x224077bd, 0x008d0240, 0x008d0380 },
+   { 0x00802041, 0x228077bd, 0x008d0280, 0x008d0380 },
diff --git a/src/exa_wm_src_affine.g4b b/src/exa_wm_src_affine.g4b
index 9fef62c..d30da87 100644
--- a/src/exa_wm_src_affine.g4b
+++ b/src/exa_wm_src_affine.g4b
@@ -1,8 +1,8 @@
-   { 0x00802041, 0x244077bd, 0x008d0100, 0x00000060 },
-   { 0x00802041, 0x240077bd, 0x008d0140, 0x00000064 },
-   { 0x00802040, 0x244077bd, 0x008d0440, 0x008d0400 },
-   { 0x00802040, 0x204077be, 0x008d0440, 0x0000006c },
-   { 0x00802041, 0x244077bd, 0x008d0100, 0x00000070 },
-   { 0x00802041, 0x240077bd, 0x008d0140, 0x00000074 },
-   { 0x00802040, 0x244077bd, 0x008d0440, 0x008d0400 },
-   { 0x00802040, 0x208077be, 0x008d0440, 0x0000007c },
+   { 0x00802041, 0x23c077bd, 0x008d0100, 0x00000060 },
+   { 0x00802041, 0x238077bd, 0x008d0140, 0x00000064 },
+   { 0x00802040, 0x23c077bd, 0x008d03c0, 0x008d0380 },
+   { 0x00802040, 0x204077be, 0x008d03c0, 0x0000006c },
+   { 0x00802041, 0x23c077bd, 0x008d0100, 0x00000070 },
+   { 0x00802041, 0x238077bd, 0x008d0140, 0x00000074 },
+   { 0x00802040, 0x23c077bd, 0x008d03c0, 0x008d0380 },
+   { 0x00802040, 0x208077be, 0x008d03c0, 0x0000007c },
diff --git a/src/exa_wm_src_projective.g4b b/src/exa_wm_src_projective.g4b
index 2d20395..198bab3 100644
--- a/src/exa_wm_src_projective.g4b
+++ b/src/exa_wm_src_projective.g4b
@@ -1,16 +1,16 @@
-   { 0x00802041, 0x244077bd, 0x008d0100, 0x00000080 },
-   { 0x00802041, 0x240077bd, 0x008d0140, 0x00000084 },
-   { 0x00802040, 0x244077bd, 0x008d0440, 0x008d0400 },
-   { 0x00802040, 0x244077bd, 0x008d0440, 0x0000008c },
-   { 0x00600031, 0x21801fbd, 0x008d0440, 0x01110001 },
-   { 0x00600031, 0x21a01fbd, 0x008d0460, 0x01110001 },
-   { 0x00802041, 0x244077bd, 0x008d0100, 0x00000060 },
-   { 0x00802041, 0x240077bd, 0x008d0140, 0x00000064 },
-   { 0x00802040, 0x244077bd, 0x008d0440, 0x008d0400 },
-   { 0x00802040, 0x244077bd, 0x008d0440, 0x0000006c },
-   { 0x00802041, 0x204077be, 0x008d0440, 0x008d0180 },
-   { 0x00802041, 0x244077bd, 0x008d0100, 0x00000070 },
-   { 0x00802041, 0x240077bd, 0x008d0140, 0x00000074 },
-   { 0x00802040, 0x244077bd, 0x008d0440, 0x008d0400 },
-   { 0x00802040, 0x244077bd, 0x008d0440, 0x0000007c },
-   { 0x00802041, 0x208077be, 0x008d0440, 0x008d0180 },
+   { 0x00802041, 0x23c077bd, 0x008d0100, 0x00000080 },
+   { 0x00802041, 0x238077bd, 0x008d0140, 0x00000084 },
+   { 0x00802040, 0x23c077bd, 0x008d03c0, 0x008d0380 },
+   { 0x00802040, 0x23c077bd, 0x008d03c0, 0x0000008c },
+   { 0x00600031, 0x21801fbd, 0x008d03c0, 0x01110001 },
+   { 0x00600031, 0x21a01fbd, 0x008d03e0, 0x01110001 },
+   { 0x00802041, 0x23c077bd, 0x008d0100, 0x00000060 },
+   { 0x00802041, 0x238077bd, 0x008d0140, 0x00000064 },
+   { 0x00802040, 0x23c077bd, 0x008d03c0, 0x008d0380 },
+   { 0x00802040, 0x23c077bd, 0x008d03c0, 0x0000006c },
+   { 0x00802041, 0x204077be, 0x008d03c0, 0x008d0180 },
+   { 0x00802041, 0x23c077bd, 0x008d0100, 0x00000070 },
+   { 0x00802041, 0x238077bd, 0x008d0140, 0x00000074 },
+   { 0x00802040, 0x23c077bd, 0x008d03c0, 0x008d0380 },
+   { 0x00802040, 0x23c077bd, 0x008d03c0, 0x0000007c },
+   { 0x00802041, 0x208077be, 0x008d03c0, 0x008d0180 },
diff --git a/src/exa_wm_src_sample_a.g4b b/src/exa_wm_src_sample_a.g4b
index 8505757..ce8650a 100644
--- a/src/exa_wm_src_sample_a.g4b
+++ b/src/exa_wm_src_sample_a.g4b
@@ -1,2 +1,2 @@
    { 0x00000201, 0x20080061, 0x00000000, 0x00007000 },
-   { 0x01800031, 0x22c01d29, 0x008d0000, 0x02520001 },
+   { 0x01800031, 0x22801d29, 0x008d0000, 0x02520001 },
diff --git a/src/exa_wm_src_sample_argb.g4b b/src/exa_wm_src_sample_argb.g4b
index 1d4a730..c5b9274 100644
--- a/src/exa_wm_src_sample_argb.g4b
+++ b/src/exa_wm_src_sample_argb.g4b
@@ -1,2 +1,2 @@
    { 0x00000201, 0x20080061, 0x00000000, 0x00000000 },
-   { 0x01800031, 0x22001d29, 0x008d0000, 0x02580001 },
+   { 0x01800031, 0x21c01d29, 0x008d0000, 0x02580001 },
diff --git a/src/exa_wm_write.g4b b/src/exa_wm_write.g4b
index b7421c2..785fe32 100644
--- a/src/exa_wm_write.g4b
+++ b/src/exa_wm_write.g4b
@@ -1,11 +1,11 @@
-   { 0x00600001, 0x204003be, 0x008d0200, 0x00000000 },
-   { 0x00600001, 0x206003be, 0x008d0240, 0x00000000 },
-   { 0x00600001, 0x208003be, 0x008d0280, 0x00000000 },
-   { 0x00600001, 0x20a003be, 0x008d02c0, 0x00000000 },
-   { 0x00600001, 0x20c003be, 0x008d0220, 0x00000000 },
-   { 0x00600001, 0x20e003be, 0x008d0260, 0x00000000 },
-   { 0x00600001, 0x210003be, 0x008d02a0, 0x00000000 },
-   { 0x00600001, 0x212003be, 0x008d02e0, 0x00000000 },
+   { 0x00600001, 0x204003be, 0x008d01c0, 0x00000000 },
+   { 0x00600001, 0x206003be, 0x008d0200, 0x00000000 },
+   { 0x00600001, 0x208003be, 0x008d0240, 0x00000000 },
+   { 0x00600001, 0x20a003be, 0x008d0280, 0x00000000 },
+   { 0x00600001, 0x20c003be, 0x008d01e0, 0x00000000 },
+   { 0x00600001, 0x20e003be, 0x008d0220, 0x00000000 },
+   { 0x00600001, 0x210003be, 0x008d0260, 0x00000000 },
+   { 0x00600001, 0x212003be, 0x008d02a0, 0x00000000 },
    { 0x00600001, 0x20200022, 0x008d0020, 0x00000000 },
    { 0x00800031, 0x24001d28, 0x008d0000, 0x85a04800 },
    { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
diff --git a/src/exa_wm_xy.g4b b/src/exa_wm_xy.g4b
index c5620cd..327fc29 100644
--- a/src/exa_wm_xy.g4b
+++ b/src/exa_wm_xy.g4b
@@ -1,4 +1,4 @@
-   { 0x00800040, 0x24406d29, 0x00480028, 0x10101010 },
-   { 0x00800040, 0x24006d29, 0x0048002a, 0x11001100 },
-   { 0x00802040, 0x2100753d, 0x008d0440, 0x00004020 },
-   { 0x00802040, 0x2140753d, 0x008d0400, 0x00004024 },
+   { 0x00800040, 0x23c06d29, 0x00480028, 0x10101010 },
+   { 0x00800040, 0x23806d29, 0x0048002a, 0x11001100 },
+   { 0x00802040, 0x2100753d, 0x008d03c0, 0x00004020 },
+   { 0x00802040, 0x2140753d, 0x008d0380, 0x00004024 },
diff --git a/src/i965_render.c b/src/i965_render.c
index c2260eb..dc5bd5e 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -329,10 +329,10 @@ static const uint32_t sf_kernel_static_mask[][4] = {
 };
 
 /* ps kernels */
-#define PS_KERNEL_NUM_GRF   48
+#define PS_KERNEL_NUM_GRF   32
 #define PS_MAX_THREADS	    32
-#define PS_SCRATCH_SPACE    2048
-#define PS_SCRATCH_SPACE_LOG	1   /* log2 (PS_SCRATCH_SPACE) - 10  (1024 is 0, 2048 is 1) */
+#define PS_SCRATCH_SPACE    1024
+#define PS_SCRATCH_SPACE_LOG	0   /* log2 (PS_SCRATCH_SPACE) - 10  (1024 is 0, 2048 is 1) */
 
 static const uint32_t ps_kernel_static_nomask_affine [][4] = {
 #include "exa_wm_xy.g4b"
commit 09c65c29335ab5192f7398cb2c46c91d3fee2a60
Author: Keith Packard <keithp at keithp.com>
Date:   Mon Mar 31 23:50:20 2008 -0700

    Fix composite with mask using new compositing thread code
    
    Clean up register allocation to never overlap
    Always write 4 values for each texture vertex.
    (cherry picked from commit a6492661ae07310128eb73c3ef037c42ce7ab184)

diff --git a/src/Makefile.am b/src/Makefile.am
index 81d9596..9b5d653 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -131,18 +131,14 @@ INTEL_G4A =				\
 	packed_yuv_wm.g4a		\
 	exa_sf.g4a 			\
 	exa_sf_mask.g4a 		\
-	exa_sf_rotation.g4a		\
-	exa_wm_maskca.g4a 		\
-	exa_wm_maskca_srcalpha.g4a 	\
-	exa_wm_masknoca.g4a 		\
-	exa_wm_nomask.g4a		\
-	exa_wm_rotation.g4a		\
 	exa_wm_src_affine.g4a 		\
 	exa_wm_src_projective.g4a 	\
-	exa_wm_src_sample.g4a 		\
+	exa_wm_src_sample_argb.g4a 	\
+	exa_wm_src_sample_a.g4a 	\
 	exa_wm_mask_affine.g4a 		\
 	exa_wm_mask_projective.g4a 	\
-	exa_wm_mask_sample.g4a 		\
+	exa_wm_mask_sample_argb.g4a 	\
+	exa_wm_mask_sample_a.g4a 	\
 	exa_wm_noca.g4a			\
 	exa_wm_ca.g4a			\
 	exa_wm_ca_srcalpha.g4a		\
@@ -153,29 +149,21 @@ INTEL_G4I =				\
 	exa_wm.g4i			\
 	exa_wm_affine.g4i		\
 	exa_wm_projective.g4i
+	
 
 INTEL_G4B = 				\
 	packed_yuv_sf.g4b		\
-	packed_yuv_wm.g4b 		\
-	exa_sf_mask.g4b			\
+	packed_yuv_wm.g4b		\
 	exa_sf.g4b 			\
-	exa_sf_rotation.g4b		\
-	exa_wm_maskca.g4b		\
-	exa_wm_maskca_srcalpha.g4b	\
-	exa_wm_masknoca.g4b		\
-	exa_wm_nomask.g4b		\
-	exa_wm_rotation.g4b		\
-	exa_wm_maskca.g4b 		\
-	exa_wm_maskca_srcalpha.g4b 	\
-	exa_wm_masknoca.g4b 		\
-	exa_wm_nomask.g4b		\
-	exa_wm_rotation.g4b		\
+	exa_sf_mask.g4b 		\
 	exa_wm_src_affine.g4b 		\
 	exa_wm_src_projective.g4b 	\
-	exa_wm_src_sample.g4b 		\
+	exa_wm_src_sample_argb.g4b 	\
+	exa_wm_src_sample_a.g4b 	\
 	exa_wm_mask_affine.g4b 		\
 	exa_wm_mask_projective.g4b 	\
-	exa_wm_mask_sample.g4b 		\
+	exa_wm_mask_sample_argb.g4b 	\
+	exa_wm_mask_sample_a.g4b 	\
 	exa_wm_noca.g4b			\
 	exa_wm_ca.g4b			\
 	exa_wm_ca_srcalpha.g4b		\
@@ -194,8 +182,11 @@ if HAVE_GEN4ASM
 
 SUFFIXES = .g4a .g4b
 .g4a.g4b:
-	m4 -s $*.g4a > $*.g4m
-	intel-gen4asm -o $@ $*.g4m && rm $*.g4m
+	m4 -s $*.g4a > $*.g4m && intel-gen4asm -o $@ $*.g4m && rm $*.g4m
+
+$(INTEL_G4B): $(INTEL_G4I)
+
+BUILT_SOURCES= $(INTEL_G4B)
 
 endif
 
diff --git a/src/exa_wm.g4i b/src/exa_wm.g4i
index 1be40e7..724ef2b 100644
--- a/src/exa_wm.g4i
+++ b/src/exa_wm.g4i
@@ -71,47 +71,52 @@ define(`dst_y_0',   `dst_y')
 define(`dst_y_1',   `g11')
 
 /* When computing x * dn/dx, use this */
-define(`temp_x',    `g12')
+define(`temp_x',    `g34')
 define(`temp_x_0',  `temp_x')
-define(`temp_x_1',  `g13')
+define(`temp_x_1',  `g35')
 
 /* When computing y * dn/dy, use this */
-define(`temp_y',    `g14')
+define(`temp_y',    `g32')
 define(`temp_y_0',  temp_y)
-define(`temp_y_1',  `g15')
+define(`temp_y_1',  `g33')
 
 /* when loading x/y, use these to hold them in UW format */
 define(`temp_x_uw', temp_x)
 define(`temp_y_uw', temp_y)
 
 /* compute source and mask u/v to this pair to send to sampler */
-define(`src_u',	    `m1')
-define(`src_v',	    `m3')
-define(`mask_u',    src_u)
-define(`mask_v',    src_v)
-define(`src_w',	    `g16')
-define(`src_w_0',   src_w)
-define(`src_w_1',   `g17')
-define(`mask_w',    src_w)
-define(`mask_w_0',  src_w_0)
-define(`mask_w_1',  src_w_1)
+define(`src_msg',   `m1')
+define(`src_msg_ind',`1')
+define(`src_u',	    `m2')
+define(`src_v',	    `m4')
+define(`src_w',	    `g12')
+define(`src_w_0',   `g12')
+define(`src_w_1',   `g13')
+
+define(`mask_msg',  `m7')
+define(`mask_msg_ind',`7')
+define(`mask_u',    `m8')
+define(`mask_v',    `m10')
+define(`mask_w',    `g14')
+define(`mask_w_0',  `g14')
+define(`mask_w_1',  `g15')
 
 /* sample src to these registers */
-define(`src_sample0',	`g18')
-define(`src_sample1',	`g19')
-define(`src_sample2',	`g20')
-define(`src_sample3',	`g21')
-define(`src_sample4',	`g22')
-define(`src_sample5',	`g23')
-define(`src_sample6',	`g24')
-define(`src_sample7',	`g25')
+define(`src_sample0',	`g16')
+define(`src_sample1',	`g17')
+define(`src_sample2',	`g18')
+define(`src_sample3',	`g19')
+define(`src_sample4',	`g20')
+define(`src_sample5',	`g21')
+define(`src_sample6',	`g22')
+define(`src_sample7',	`g23')
 
 /* sample mask to these registers */
-define(`mask_sample0',	`g26')
-define(`mask_sample1',	`g27')
-define(`mask_sample2',	`g28')
-define(`mask_sample3',	`g29')
-define(`mask_sample4',	`g30')
-define(`mask_sample5',	`g31')
-define(`mask_sample6',	`g32')
-define(`mask_sample7',	`g33')
+define(`mask_sample0',	`g24')
+define(`mask_sample1',	`g25')
+define(`mask_sample2',	`g26')
+define(`mask_sample3',	`g27')
+define(`mask_sample4',	`g28')
+define(`mask_sample5',	`g29')
+define(`mask_sample6',	`g30')
+define(`mask_sample7',	`g31')
diff --git a/src/exa_wm_affine.g4i b/src/exa_wm_affine.g4i
index 8fc6450..e72656b 100644
--- a/src/exa_wm_affine.g4i
+++ b/src/exa_wm_affine.g4i
@@ -42,4 +42,3 @@ mul (16)	temp_x<1>F	dst_x<8,8,1>F	dv_dx		{ compr align1 };
 mul (16)	temp_y<1>F	dst_y<8,8,1>F	dv_dy		{ compr align1 };
 add (16)	temp_x<1>F	temp_x<8,8,1>F	temp_y<8,8,1>F	{ compr align1 };
 add (16)	v<1>F		temp_x<8,8,1>F	vo		{ compr align1 };
-
diff --git a/src/exa_wm_ca.g4b b/src/exa_wm_ca.g4b
index d0f3519..28bd6c6 100644
--- a/src/exa_wm_ca.g4b
+++ b/src/exa_wm_ca.g4b
@@ -1,4 +1,4 @@
+   { 0x00802041, 0x220077bd, 0x008d0200, 0x008d0300 },
    { 0x00802041, 0x224077bd, 0x008d0240, 0x008d0340 },
    { 0x00802041, 0x228077bd, 0x008d0280, 0x008d0380 },
    { 0x00802041, 0x22c077bd, 0x008d02c0, 0x008d03c0 },
-   { 0x00802041, 0x230077bd, 0x008d0300, 0x008d0400 },
diff --git a/src/exa_wm_ca_srcalpha.g4a b/src/exa_wm_ca_srcalpha.g4a
index a1be28e..e252e19 100644
--- a/src/exa_wm_ca_srcalpha.g4a
+++ b/src/exa_wm_ca_srcalpha.g4a
@@ -31,8 +31,7 @@
 
 include(`exa_wm.g4i')
 
-/* mul mask rgba channels to src */
-mul (16)    src_sample0<1>F src_sample0<8,8,1>F	src_sample6<8,8,1>F	{ compr align1 };
-mul (16)    src_sample2<1>F src_sample2<8,8,1>F	src_sample6<8,8,1>F	{ compr align1 };
-mul (16)    src_sample4<1>F src_sample4<8,8,1>F	src_sample6<8,8,1>F	{ compr align1 };
-mul (16)    src_sample6<1>F src_sample6<8,8,1>F	src_sample6<8,8,1>F	{ compr align1 };
+mul (16)    src_sample0<1>F mask_sample0<8,8,1>F    src_sample6<8,8,1>F	{ compr align1 };
+mul (16)    src_sample2<1>F mask_sample2<8,8,1>F    src_sample6<8,8,1>F	{ compr align1 };
+mul (16)    src_sample4<1>F mask_sample4<8,8,1>F    src_sample6<8,8,1>F	{ compr align1 };
+mul (16)    src_sample6<1>F mask_sample6<8,8,1>F    src_sample6<8,8,1>F	{ compr align1 };
diff --git a/src/exa_wm_ca_srcalpha.g4b b/src/exa_wm_ca_srcalpha.g4b
index 780e704..94f1516 100644
--- a/src/exa_wm_ca_srcalpha.g4b
+++ b/src/exa_wm_ca_srcalpha.g4b
@@ -1,4 +1,4 @@
-   { 0x00802041, 0x224077bd, 0x008d0240, 0x008d0300 },
-   { 0x00802041, 0x228077bd, 0x008d0280, 0x008d0300 },
-   { 0x00802041, 0x22c077bd, 0x008d02c0, 0x008d0300 },
-   { 0x00802041, 0x230077bd, 0x008d0300, 0x008d0300 },
+   { 0x00802041, 0x220077bd, 0x008d0300, 0x008d02c0 },
+   { 0x00802041, 0x224077bd, 0x008d0340, 0x008d02c0 },
+   { 0x00802041, 0x228077bd, 0x008d0380, 0x008d02c0 },
+   { 0x00802041, 0x22c077bd, 0x008d03c0, 0x008d02c0 },
diff --git a/src/exa_wm_mask_affine.g4a b/src/exa_wm_mask_affine.g4a
index 4c096cb..9c52d2f 100644
--- a/src/exa_wm_mask_affine.g4a
+++ b/src/exa_wm_mask_affine.g4a
@@ -26,12 +26,16 @@
  */
 
 include(`exa_wm.g4i')
+
 define(`du_dx',	`mask_du_dx')
 define(`du_dy',	`mask_du_dy')
 define(`uo',	`mask_uo')
+
 define(`dv_dx',	`mask_dv_dx')
 define(`dv_dy',	`mask_dv_dy')
 define(`vo',	`mask_vo')
+
 define(`u',	`mask_u')
 define(`v',	`mask_v')
+
 include(`exa_wm_affine.g4i')
diff --git a/src/exa_wm_mask_affine.g4b b/src/exa_wm_mask_affine.g4b
index 62b46e0..35dec6f 100644
--- a/src/exa_wm_mask_affine.g4b
+++ b/src/exa_wm_mask_affine.g4b
@@ -1,8 +1,8 @@
-   { 0x00802041, 0x218077bd, 0x008d0100, 0x00000090 },
-   { 0x00802041, 0x21c077bd, 0x008d0140, 0x00000094 },
-   { 0x00802040, 0x218077bd, 0x008d0180, 0x008d01c0 },
-   { 0x00802040, 0x202077be, 0x008d0180, 0x0000009c },
-   { 0x00802041, 0x218077bd, 0x008d0100, 0x000000a0 },
-   { 0x00802041, 0x21c077bd, 0x008d0140, 0x000000a4 },
-   { 0x00802040, 0x218077bd, 0x008d0180, 0x008d01c0 },
-   { 0x00802040, 0x206077be, 0x008d0180, 0x000000ac },
+   { 0x00802041, 0x244077bd, 0x008d0100, 0x000000a0 },
+   { 0x00802041, 0x240077bd, 0x008d0140, 0x000000a4 },
+   { 0x00802040, 0x244077bd, 0x008d0440, 0x008d0400 },
+   { 0x00802040, 0x210077be, 0x008d0440, 0x000000ac },
+   { 0x00802041, 0x244077bd, 0x008d0100, 0x000000b0 },
+   { 0x00802041, 0x240077bd, 0x008d0140, 0x000000b4 },
+   { 0x00802040, 0x244077bd, 0x008d0440, 0x008d0400 },
+   { 0x00802040, 0x214077be, 0x008d0440, 0x000000bc },
diff --git a/src/exa_wm_mask_projective.g4a b/src/exa_wm_mask_projective.g4a
index 464f6c5..9acaace 100644
--- a/src/exa_wm_mask_projective.g4a
+++ b/src/exa_wm_mask_projective.g4a
@@ -42,6 +42,11 @@ define(`wo',	`mask_wo')
 define(`u',	`mask_u')
 define(`v',	`mask_v')
 define(`w',	`mask_w')
+
+define(`u_0',	`mask_u_0')
+define(`v_0',	`mask_v_0')
+define(`u_1',	`mask_u_1')
+define(`v_1',	`mask_v_1')
 define(`w_0',	`mask_w_0')
 define(`w_1',	`mask_w_1')
 
diff --git a/src/exa_wm_mask_projective.g4b b/src/exa_wm_mask_projective.g4b
index ac4faa3..0684882 100644
--- a/src/exa_wm_mask_projective.g4b
+++ b/src/exa_wm_mask_projective.g4b
@@ -1,16 +1,16 @@
-   { 0x00802041, 0x218077bd, 0x008d0100, 0x000000b0 },
-   { 0x00802041, 0x21c077bd, 0x008d0140, 0x000000b4 },
-   { 0x00802040, 0x218077bd, 0x008d0180, 0x008d01c0 },
-   { 0x00802040, 0x218077bd, 0x008d0180, 0x000000bc },
-   { 0x00600031, 0x22001fbd, 0x008d0180, 0x01110001 },
-   { 0x00600031, 0x22201fbd, 0x008d01a0, 0x01110001 },
-   { 0x00802041, 0x218077bd, 0x008d0100, 0x00000090 },
-   { 0x00802041, 0x21c077bd, 0x008d0140, 0x00000094 },
-   { 0x00802040, 0x218077bd, 0x008d0180, 0x008d01c0 },
-   { 0x00802040, 0x218077bd, 0x008d0180, 0x0000009c },
-   { 0x00802041, 0x202077be, 0x008d0180, 0x008d0200 },
-   { 0x00802041, 0x218077bd, 0x008d0100, 0x000000a0 },
-   { 0x00802041, 0x21c077bd, 0x008d0140, 0x000000a4 },
-   { 0x00802040, 0x218077bd, 0x008d0180, 0x008d01c0 },
-   { 0x00802040, 0x218077bd, 0x008d0180, 0x000000ac },
-   { 0x00802041, 0x206077be, 0x008d0180, 0x008d0200 },
+   { 0x00802041, 0x244077bd, 0x008d0100, 0x000000c0 },
+   { 0x00802041, 0x240077bd, 0x008d0140, 0x000000c4 },
+   { 0x00802040, 0x244077bd, 0x008d0440, 0x008d0400 },
+   { 0x00802040, 0x244077bd, 0x008d0440, 0x000000cc },
+   { 0x00600031, 0x21c01fbd, 0x008d0440, 0x01110001 },
+   { 0x00600031, 0x21e01fbd, 0x008d0460, 0x01110001 },
+   { 0x00802041, 0x244077bd, 0x008d0100, 0x000000a0 },
+   { 0x00802041, 0x240077bd, 0x008d0140, 0x000000a4 },
+   { 0x00802040, 0x244077bd, 0x008d0440, 0x008d0400 },
+   { 0x00802040, 0x244077bd, 0x008d0440, 0x000000ac },
+   { 0x00802041, 0x210077be, 0x008d0440, 0x008d01c0 },
+   { 0x00802041, 0x244077bd, 0x008d0100, 0x000000b0 },
+   { 0x00802041, 0x240077bd, 0x008d0140, 0x000000b4 },
+   { 0x00802040, 0x244077bd, 0x008d0440, 0x008d0400 },
+   { 0x00802040, 0x244077bd, 0x008d0440, 0x000000bc },
+   { 0x00802041, 0x214077be, 0x008d0440, 0x008d01c0 },
diff --git a/src/exa_wm_mask_sample.g4a b/src/exa_wm_mask_sample.g4a
deleted file mode 100644
index 45dc3c4..0000000
--- a/src/exa_wm_mask_sample.g4a
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Copyright © 2006 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * Authors:
- *    Wang Zhenyu <zhenyu.z.wang at intel.com>
- *    Keith Packard <keithp at keithp.com>
- */
-
-/* Sample the mask surface */
-
-include(`exa_wm.g4i')
-
-/* prepare sampler read back gX register, which would be written back to output */
-
-/* use simd16 sampler, param 0 is u, param 1 is v. */
-/* 'payload' loading, assuming tex coord start from g4 */
-
-/* m0 will be copied with g0, as it contains send desc */
-/* emit sampler 'send' cmd */
-send (16) 0			/* msg reg index */
-	mask_sample0<1>UW 	/* readback */
-	g0<8,8,1>UW		/* copy to msg start reg*/
-	sampler (1,0,F)		/* sampler message description, (binding_table,sampler_index,datatype)
-				/* here(src->dst) we should use src_sampler and src_surface */
-	mlen 5 rlen 8 { align1 };   /* required message len 5, readback len 8 */
-
-// mov (8)  mask_sample7<1>UD	mask_sample7<8,8,1>UD	    { align1 };  /* wait sampler return */
-
-/* if we set up read-back reg correctly, emit dataport write 'send' cmd with EOT */
-
diff --git a/src/exa_wm_mask_sample.g4b b/src/exa_wm_mask_sample.g4b
deleted file mode 100644
index 45f7ead..0000000
--- a/src/exa_wm_mask_sample.g4b
+++ /dev/null
@@ -1 +0,0 @@
-   { 0x00800031, 0x23401d29, 0x008d0000, 0x02580001 },
diff --git a/src/exa_wm_mask_sample_a.g4a b/src/exa_wm_mask_sample_a.g4a
new file mode 100644
index 0000000..c06611d
--- /dev/null
+++ b/src/exa_wm_mask_sample_a.g4a
@@ -0,0 +1,48 @@
+/*
+ * Copyright © 2006 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Wang Zhenyu <zhenyu.z.wang at intel.com>
+ *    Keith Packard <keithp at keithp.com>
+ */
+
+/* Sample the mask surface */
+
+include(`exa_wm.g4i')
+
+/* prepare sampler read back gX register, which would be written back to output */
+
+/* use simd16 sampler, param 0 is u, param 1 is v. */
+/* 'payload' loading, assuming tex coord start from g4 */
+
+/* load only alpha */
+mov (1) g0.8<1>UD	0x00007000UD { align1 mask_disable };
+
+/* mask_msg will be copied with g0, as it contains send desc */
+/* emit sampler 'send' cmd */
+send (16) mask_msg_ind		/* msg reg index */
+	mask_sample6<1>UW 	/* readback */
+	g0<8,8,1>UW		/* copy to msg start reg*/
+	sampler (2,1,F)		/* sampler message description, (binding_table,sampler_index,datatype) */
+				/* for the mask we use the mask surface and mask sampler indices */
+	mlen 5 rlen 2 { align1 };   /* required message len 5, readback len 2 */
+
diff --git a/src/exa_wm_mask_sample_a.g4b b/src/exa_wm_mask_sample_a.g4b
new file mode 100644
index 0000000..01fc8d5
--- /dev/null
+++ b/src/exa_wm_mask_sample_a.g4b
@@ -0,0 +1,2 @@
+   { 0x00000201, 0x20080061, 0x00000000, 0x00007000 },
+   { 0x07800031, 0x23c01d29, 0x008d0000, 0x02520102 },
diff --git a/src/exa_wm_mask_sample_argb.g4a b/src/exa_wm_mask_sample_argb.g4a
new file mode 100644
index 0000000..7f0815f
--- /dev/null
+++ b/src/exa_wm_mask_sample_argb.g4a
@@ -0,0 +1,48 @@
+/*
+ * Copyright © 2006 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Wang Zhenyu <zhenyu.z.wang at intel.com>
+ *    Keith Packard <keithp at keithp.com>
+ */
+
+/* Sample the mask surface */
+
+include(`exa_wm.g4i')
+
+/* prepare sampler read back gX register, which would be written back to output */
+
+/* use simd16 sampler, param 0 is u, param 1 is v. */
+/* 'payload' loading, assuming tex coord start from g4 */
+
+/* load argb */
+mov (1) g0.8<1>UD	0x00000000UD { align1 mask_disable };
+
+/* mask_msg will be copied with g0, as it contains send desc */
+/* emit sampler 'send' cmd */
+send (16) mask_msg_ind		/* msg reg index */
+	mask_sample0<1>UW 	/* readback */
+	g0<8,8,1>UW		/* copy to msg start reg*/
+	sampler (2,1,F)		/* sampler message description, (binding_table,sampler_index,datatype) */
+				/* for the mask we use the mask surface and mask sampler indices */
+	mlen 5 rlen 8 { align1 };   /* required message len 5, readback len 8 */
+
diff --git a/src/exa_wm_mask_sample_argb.g4b b/src/exa_wm_mask_sample_argb.g4b
new file mode 100644
index 0000000..97d3803
--- /dev/null
+++ b/src/exa_wm_mask_sample_argb.g4b
@@ -0,0 +1,2 @@
+   { 0x00000201, 0x20080061, 0x00000000, 0x00000000 },
+   { 0x07800031, 0x23001d29, 0x008d0000, 0x02580102 },
diff --git a/src/exa_wm_noca.g4b b/src/exa_wm_noca.g4b
index ba01d1a..1c9d948 100644
--- a/src/exa_wm_noca.g4b
+++ b/src/exa_wm_noca.g4b
@@ -1,4 +1,4 @@
-   { 0x00802041, 0x224077bd, 0x008d0240, 0x008d0400 },
-   { 0x00802041, 0x228077bd, 0x008d0280, 0x008d0400 },
-   { 0x00802041, 0x22c077bd, 0x008d02c0, 0x008d0400 },
-   { 0x00802041, 0x230077bd, 0x008d0300, 0x008d0400 },
+   { 0x00802041, 0x220077bd, 0x008d0200, 0x008d03c0 },
+   { 0x00802041, 0x224077bd, 0x008d0240, 0x008d03c0 },
+   { 0x00802041, 0x228077bd, 0x008d0280, 0x008d03c0 },
+   { 0x00802041, 0x22c077bd, 0x008d02c0, 0x008d03c0 },
diff --git a/src/exa_wm_nomask.g4a b/src/exa_wm_nomask.g4a
index 97426ec..eb535fe 100644
--- a/src/exa_wm_nomask.g4a
+++ b/src/exa_wm_nomask.g4a
@@ -119,7 +119,7 @@ mov (8) m8<1>F g17<8,8,1>F { align1 };
 mov (8) m9<1>F g19<8,8,1>F { align1 };
 
 /* m0, m1 are all direct passed by PS thread payload */
-mov (8) m1<1>UD g1<8,8,1>UD { align1 mask_disable };
+mov (8) m1<1>UD g1<8,8,1>UD { align1 };
 
 /* write */
 send (16) 0 acc0<1>UW g0<8,8,1>UW write (
diff --git a/src/exa_wm_src_affine.g4a b/src/exa_wm_src_affine.g4a
index 3bf8717..3194b5a 100644
--- a/src/exa_wm_src_affine.g4a
+++ b/src/exa_wm_src_affine.g4a
@@ -30,12 +30,16 @@
  */
 
 include(`exa_wm.g4i')
+
 define(`du_dx',	`src_du_dx')
 define(`du_dy',	`src_du_dy')
 define(`uo',	`src_uo')
+
 define(`dv_dx',	`src_dv_dx')
 define(`dv_dy',	`src_dv_dy')
 define(`vo',	`src_vo')
+
 define(`u',	`src_u')
 define(`v',	`src_v')
+
 include(`exa_wm_affine.g4i')
diff --git a/src/exa_wm_src_affine.g4b b/src/exa_wm_src_affine.g4b
index f18ea1e..9fef62c 100644
--- a/src/exa_wm_src_affine.g4b
+++ b/src/exa_wm_src_affine.g4b
@@ -1,8 +1,8 @@
-   { 0x00802041, 0x218077bd, 0x008d0100, 0x00000060 },
-   { 0x00802041, 0x21c077bd, 0x008d0140, 0x00000064 },
-   { 0x00802040, 0x218077bd, 0x008d0180, 0x008d01c0 },
-   { 0x00802040, 0x202077be, 0x008d0180, 0x0000006c },
-   { 0x00802041, 0x218077bd, 0x008d0100, 0x00000070 },
-   { 0x00802041, 0x21c077bd, 0x008d0140, 0x00000074 },
-   { 0x00802040, 0x218077bd, 0x008d0180, 0x008d01c0 },
-   { 0x00802040, 0x206077be, 0x008d0180, 0x0000007c },
+   { 0x00802041, 0x244077bd, 0x008d0100, 0x00000060 },
+   { 0x00802041, 0x240077bd, 0x008d0140, 0x00000064 },
+   { 0x00802040, 0x244077bd, 0x008d0440, 0x008d0400 },
+   { 0x00802040, 0x204077be, 0x008d0440, 0x0000006c },
+   { 0x00802041, 0x244077bd, 0x008d0100, 0x00000070 },
+   { 0x00802041, 0x240077bd, 0x008d0140, 0x00000074 },
+   { 0x00802040, 0x244077bd, 0x008d0440, 0x008d0400 },
+   { 0x00802040, 0x208077be, 0x008d0440, 0x0000007c },
diff --git a/src/exa_wm_src_projective.g4a b/src/exa_wm_src_projective.g4a
index 6bd2d6a..16c9cd5 100644
--- a/src/exa_wm_src_projective.g4a
+++ b/src/exa_wm_src_projective.g4a
@@ -39,6 +39,10 @@ define(`wo',	`src_wo')
 define(`u',	`src_u')
 define(`v',	`src_v')
 define(`w',	`src_w')
+define(`u_0',	`src_u_0')
+define(`v_0',	`src_v_0')
+define(`u_1',	`src_u_1')
+define(`v_1',	`src_v_1')
 define(`w_0',	`src_w_0')
 define(`w_1',	`src_w_1')
 
diff --git a/src/exa_wm_src_projective.g4b b/src/exa_wm_src_projective.g4b
index 68bfc92..2d20395 100644
--- a/src/exa_wm_src_projective.g4b
+++ b/src/exa_wm_src_projective.g4b
@@ -1,16 +1,16 @@
-   { 0x00802041, 0x218077bd, 0x008d0100, 0x00000080 },
-   { 0x00802041, 0x21c077bd, 0x008d0140, 0x00000084 },
-   { 0x00802040, 0x218077bd, 0x008d0180, 0x008d01c0 },
-   { 0x00802040, 0x218077bd, 0x008d0180, 0x0000008c },
-   { 0x00600031, 0x22001fbd, 0x008d0180, 0x01110001 },
-   { 0x00600031, 0x22201fbd, 0x008d01a0, 0x01110001 },
-   { 0x00802041, 0x218077bd, 0x008d0100, 0x00000060 },
-   { 0x00802041, 0x21c077bd, 0x008d0140, 0x00000064 },
-   { 0x00802040, 0x218077bd, 0x008d0180, 0x008d01c0 },
-   { 0x00802040, 0x218077bd, 0x008d0180, 0x0000006c },
-   { 0x00802041, 0x202077be, 0x008d0180, 0x008d0200 },
-   { 0x00802041, 0x218077bd, 0x008d0100, 0x00000070 },
-   { 0x00802041, 0x21c077bd, 0x008d0140, 0x00000074 },
-   { 0x00802040, 0x218077bd, 0x008d0180, 0x008d01c0 },
-   { 0x00802040, 0x218077bd, 0x008d0180, 0x0000007c },
-   { 0x00802041, 0x206077be, 0x008d0180, 0x008d0200 },
+   { 0x00802041, 0x244077bd, 0x008d0100, 0x00000080 },
+   { 0x00802041, 0x240077bd, 0x008d0140, 0x00000084 },
+   { 0x00802040, 0x244077bd, 0x008d0440, 0x008d0400 },
+   { 0x00802040, 0x244077bd, 0x008d0440, 0x0000008c },
+   { 0x00600031, 0x21801fbd, 0x008d0440, 0x01110001 },
+   { 0x00600031, 0x21a01fbd, 0x008d0460, 0x01110001 },
+   { 0x00802041, 0x244077bd, 0x008d0100, 0x00000060 },
+   { 0x00802041, 0x240077bd, 0x008d0140, 0x00000064 },
+   { 0x00802040, 0x244077bd, 0x008d0440, 0x008d0400 },
+   { 0x00802040, 0x244077bd, 0x008d0440, 0x0000006c },
+   { 0x00802041, 0x204077be, 0x008d0440, 0x008d0180 },
+   { 0x00802041, 0x244077bd, 0x008d0100, 0x00000070 },
+   { 0x00802041, 0x240077bd, 0x008d0140, 0x00000074 },
+   { 0x00802040, 0x244077bd, 0x008d0440, 0x008d0400 },
+   { 0x00802040, 0x244077bd, 0x008d0440, 0x0000007c },
+   { 0x00802041, 0x208077be, 0x008d0440, 0x008d0180 },
diff --git a/src/exa_wm_src_sample.g4a b/src/exa_wm_src_sample.g4a
deleted file mode 100644
index 04cd3e3..0000000
--- a/src/exa_wm_src_sample.g4a
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Copyright © 2006 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * Authors:
- *    Wang Zhenyu <zhenyu.z.wang at intel.com>
- *    Keith Packard <keithp at keithp.com>
- */
-
-/* Sample the src surface */
-
-include(`exa_wm.g4i')
-
-/* prepare sampler read back gX register, which would be written back to output */
-
-/* use simd16 sampler, param 0 is u, param 1 is v. */
-/* 'payload' loading, assuming tex coord start from g4 */
-
-/* m0 will be copied with g0, as it contains send desc */
-/* emit sampler 'send' cmd */
-send (16) 0			/* msg reg index */
-	src_sample0<1>UW 	/* readback */
-	g0<8,8,1>UW		/* copy to msg start reg*/
-	sampler (1,0,F)		/* sampler message description, (binding_table,sampler_index,datatype)
-				/* here(src->dst) we should use src_sampler and src_surface */
-	mlen 5 rlen 8 { align1 };   /* required message len 5, readback len 8 */
-
-// mov (8)  src_sample7<1>UD	src_sample7<8,8,1>UD	    { align1 };  /* wait sampler return */
-
-/* if we set up read-back reg correctly, emit dataport write 'send' cmd with EOT */
-
diff --git a/src/exa_wm_src_sample.g4b b/src/exa_wm_src_sample.g4b
deleted file mode 100644
index 5ca33f5..0000000
--- a/src/exa_wm_src_sample.g4b
+++ /dev/null
@@ -1 +0,0 @@
-   { 0x00800031, 0x22401d29, 0x008d0000, 0x02580001 },
diff --git a/src/exa_wm_src_sample_a.g4a b/src/exa_wm_src_sample_a.g4a
new file mode 100644
index 0000000..803c358
--- /dev/null
+++ b/src/exa_wm_src_sample_a.g4a
@@ -0,0 +1,47 @@
+/*
+ * Copyright © 2006 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Wang Zhenyu <zhenyu.z.wang at intel.com>
+ *    Keith Packard <keithp at keithp.com>
+ */
+
+/* Sample the src surface */
+
+include(`exa_wm.g4i')
+
+/* prepare sampler read back gX register, which would be written back to output */
+
+/* use simd16 sampler, param 0 is u, param 1 is v. */
+/* 'payload' loading, assuming tex coord start from g4 */
+
+/* load alpha */
+mov (1) g0.8<1>UD	0x00007000UD { align1 mask_disable };
+
+/* src_msg will be copied with g0, as it contains send desc */
+/* emit sampler 'send' cmd */
+send (16) src_msg_ind		/* msg reg index */
+	src_sample6<1>UW 	/* readback */
+	g0<8,8,1>UW		/* copy to msg start reg*/
+	sampler (1,0,F)		/* sampler message description, (binding_table,sampler_index,datatype)
+				/* here(src->dst) we should use src_sampler and src_surface */
+	mlen 5 rlen 2 { align1 };   /* required message len 5, readback len 2 */
diff --git a/src/exa_wm_src_sample_a.g4b b/src/exa_wm_src_sample_a.g4b
new file mode 100644
index 0000000..8505757
--- /dev/null
+++ b/src/exa_wm_src_sample_a.g4b
@@ -0,0 +1,2 @@
+   { 0x00000201, 0x20080061, 0x00000000, 0x00007000 },
+   { 0x01800031, 0x22c01d29, 0x008d0000, 0x02520001 },
diff --git a/src/exa_wm_src_sample_argb.g4a b/src/exa_wm_src_sample_argb.g4a
new file mode 100644
index 0000000..4fcf276
--- /dev/null
+++ b/src/exa_wm_src_sample_argb.g4a
@@ -0,0 +1,47 @@
+/*
+ * Copyright © 2006 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Wang Zhenyu <zhenyu.z.wang at intel.com>
+ *    Keith Packard <keithp at keithp.com>
+ */
+
+/* Sample the src surface */
+
+include(`exa_wm.g4i')
+
+/* prepare sampler read back gX register, which would be written back to output */
+
+/* use simd16 sampler, param 0 is u, param 1 is v. */
+/* 'payload' loading, assuming tex coord start from g4 */
+
+/* load argb */
+mov (1) g0.8<1>UD	0x00000000UD { align1 mask_disable };
+
+/* src_msg will be copied with g0, as it contains send desc */
+/* emit sampler 'send' cmd */
+send (16) src_msg_ind		/* msg reg index */
+	src_sample0<1>UW 	/* readback */
+	g0<8,8,1>UW		/* copy to msg start reg*/
+	sampler (1,0,F)		/* sampler message description, (binding_table,sampler_index,datatype)
+				/* here(src->dst) we should use src_sampler and src_surface */
+	mlen 5 rlen 8 { align1 };   /* required message len 5, readback len 8 */
diff --git a/src/exa_wm_src_sample_argb.g4b b/src/exa_wm_src_sample_argb.g4b
new file mode 100644
index 0000000..1d4a730
--- /dev/null
+++ b/src/exa_wm_src_sample_argb.g4b
@@ -0,0 +1,2 @@
+   { 0x00000201, 0x20080061, 0x00000000, 0x00000000 },
+   { 0x01800031, 0x22001d29, 0x008d0000, 0x02580001 },
diff --git a/src/exa_wm_write.g4a b/src/exa_wm_write.g4a
index 9a821d7..5d3e6b1 100644
--- a/src/exa_wm_write.g4a
+++ b/src/exa_wm_write.g4a
@@ -31,9 +31,6 @@
 
 include(`exa_wm.g4i')
 
-/* m0, m1 are all direct passed by PS thread payload */
-mov (8) m1<1>F g1<8,8,1>F { align1 };
-
 /* prepare data in m2-m5 for subspan(1,0), m6-m9 for subspan(3,2), then it's ready to write */
 /* src_sample0 -> m2
    src_sample1 -> m6
@@ -55,7 +52,7 @@ mov (8) m8<1>F src_sample5<8,8,1>F { align1 };
 mov (8) m9<1>F src_sample7<8,8,1>F { align1 };
 
 /* m0, m1 are all direct passed by PS thread payload */
-mov (8) m1<1>UD g1<8,8,1>UD { align1 mask_disable };
+mov (8) m1<1>UD g1<8,8,1>UD { align1 };
 
 /* write */
 send (16) 0 acc0<1>UW g0<8,8,1>UW write (
@@ -76,5 +73,4 @@ nop;
 nop;
 nop;
 nop;
-nop;
 
diff --git a/src/exa_wm_write.g4b b/src/exa_wm_write.g4b
index dd266a3..b7421c2 100644
--- a/src/exa_wm_write.g4b
+++ b/src/exa_wm_write.g4b
@@ -1,13 +1,12 @@
-   { 0x00600001, 0x202003be, 0x008d0020, 0x00000000 },
-   { 0x00600001, 0x204003be, 0x008d0240, 0x00000000 },
-   { 0x00600001, 0x206003be, 0x008d0280, 0x00000000 },
-   { 0x00600001, 0x208003be, 0x008d02c0, 0x00000000 },
-   { 0x00600001, 0x20a003be, 0x008d0300, 0x00000000 },
-   { 0x00600001, 0x20c003be, 0x008d0260, 0x00000000 },
-   { 0x00600001, 0x20e003be, 0x008d02a0, 0x00000000 },
-   { 0x00600001, 0x210003be, 0x008d02e0, 0x00000000 },
-   { 0x00600001, 0x212003be, 0x008d0320, 0x00000000 },
-   { 0x00600201, 0x20200022, 0x008d0020, 0x00000000 },
+   { 0x00600001, 0x204003be, 0x008d0200, 0x00000000 },
+   { 0x00600001, 0x206003be, 0x008d0240, 0x00000000 },
+   { 0x00600001, 0x208003be, 0x008d0280, 0x00000000 },
+   { 0x00600001, 0x20a003be, 0x008d02c0, 0x00000000 },
+   { 0x00600001, 0x20c003be, 0x008d0220, 0x00000000 },
+   { 0x00600001, 0x20e003be, 0x008d0260, 0x00000000 },
+   { 0x00600001, 0x210003be, 0x008d02a0, 0x00000000 },
+   { 0x00600001, 0x212003be, 0x008d02e0, 0x00000000 },
+   { 0x00600001, 0x20200022, 0x008d0020, 0x00000000 },
    { 0x00800031, 0x24001d28, 0x008d0000, 0x85a04800 },
    { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
    { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
@@ -17,4 +16,3 @@
    { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
    { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
    { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
diff --git a/src/exa_wm_xy.g4b b/src/exa_wm_xy.g4b
index 7784a3d..c5620cd 100644
--- a/src/exa_wm_xy.g4b
+++ b/src/exa_wm_xy.g4b
@@ -1,4 +1,4 @@
-   { 0x00800040, 0x21806d29, 0x00480028, 0x10101010 },
-   { 0x00800040, 0x21c06d29, 0x0048002a, 0x11001100 },
-   { 0x00802040, 0x2100753d, 0x008d0180, 0x00004020 },
-   { 0x00802040, 0x2140753d, 0x008d01c0, 0x00004024 },
+   { 0x00800040, 0x24406d29, 0x00480028, 0x10101010 },
+   { 0x00800040, 0x24006d29, 0x0048002a, 0x11001100 },
+   { 0x00802040, 0x2100753d, 0x008d0440, 0x00004020 },
+   { 0x00802040, 0x2140753d, 0x008d0400, 0x00004024 },
diff --git a/src/i810_reg.h b/src/i810_reg.h
index d799e77..834b948 100644
--- a/src/i810_reg.h
+++ b/src/i810_reg.h
@@ -2322,6 +2322,7 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define MI_FLUSH			(0x04<<23)
 #define MI_WRITE_DIRTY_STATE		(1<<4)
 #define MI_END_SCENE			(1<<3)
+#define MI_GLOBAL_SNAPSHOT_COUNT_RESET	(1<<3)
 #define MI_INHIBIT_RENDER_CACHE_FLUSH	(1<<2)
 #define MI_STATE_INSTRUCTION_CACHE_FLUSH (1<<1)
 #define MI_INVALIDATE_MAP_CACHE		(1<<0)
diff --git a/src/i965_render.c b/src/i965_render.c
index e348c2b..c2260eb 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -285,7 +285,7 @@ static int next_offset, total_state_size;
 static char *state_base;
 static int state_base_offset;
 static float *vb;
-static int vb_size = (6 * 4) * 4 ; /* 6 DWORDS per vertex - and mask*/
+static int vb_size = (2 + 3 + 3) * 3 * 4;   /* (dst, src, mask) 3 vertices, 4 bytes */
 
 static uint32_t src_blend, dst_blend;
 
@@ -318,7 +318,7 @@ static const uint32_t sip_kernel_static[][4] = {
  */
 
 #define SF_KERNEL_NUM_GRF  16
-#define SF_MAX_THREADS	   2
+#define SF_MAX_THREADS	   1
 
 static const uint32_t sf_kernel_static[][4] = {
 #include "exa_sf.g4b"
@@ -329,29 +329,31 @@ static const uint32_t sf_kernel_static_mask[][4] = {
 };
 
 /* ps kernels */
-#define PS_KERNEL_NUM_GRF   32
-#define PS_MAX_THREADS	   32
+#define PS_KERNEL_NUM_GRF   48
+#define PS_MAX_THREADS	    32
+#define PS_SCRATCH_SPACE    2048
+#define PS_SCRATCH_SPACE_LOG	1   /* log2 (PS_SCRATCH_SPACE) - 10  (1024 is 0, 2048 is 1) */
 
 static const uint32_t ps_kernel_static_nomask_affine [][4] = {
 #include "exa_wm_xy.g4b"
 #include "exa_wm_src_affine.g4b"
-#include "exa_wm_src_sample.g4b"
+#include "exa_wm_src_sample_argb.g4b"
 #include "exa_wm_write.g4b"
 };
 
 static const uint32_t ps_kernel_static_nomask_projective [][4] = {
 #include "exa_wm_xy.g4b"
 #include "exa_wm_src_projective.g4b"
-#include "exa_wm_src_sample.g4b"
+#include "exa_wm_src_sample_argb.g4b"
 #include "exa_wm_write.g4b"
 };
 
 static const uint32_t ps_kernel_static_maskca [][4] = {
 #include "exa_wm_xy.g4b"
 #include "exa_wm_src_affine.g4b"
-#include "exa_wm_src_sample.g4b"
+#include "exa_wm_src_sample_argb.g4b"
 #include "exa_wm_mask_affine.g4b"
-#include "exa_wm_mask_sample.g4b"
+#include "exa_wm_mask_sample_argb.g4b"
 #include "exa_wm_ca.g4b"
 #include "exa_wm_write.g4b"
 };
@@ -359,9 +361,9 @@ static const uint32_t ps_kernel_static_maskca [][4] = {
 static const uint32_t ps_kernel_static_maskca_srcalpha [][4] = {
 #include "exa_wm_xy.g4b"
 #include "exa_wm_src_affine.g4b"
-#include "exa_wm_src_sample.g4b"
+#include "exa_wm_src_sample_a.g4b"
 #include "exa_wm_mask_affine.g4b"
-#include "exa_wm_mask_sample.g4b"
+#include "exa_wm_mask_sample_argb.g4b"
 #include "exa_wm_ca_srcalpha.g4b"
 #include "exa_wm_write.g4b"
 };
@@ -369,9 +371,9 @@ static const uint32_t ps_kernel_static_maskca_srcalpha [][4] = {
 static const uint32_t ps_kernel_static_masknoca [][4] = {
 #include "exa_wm_xy.g4b"
 #include "exa_wm_src_affine.g4b"
-#include "exa_wm_src_sample.g4b"
+#include "exa_wm_src_sample_argb.g4b"
 #include "exa_wm_mask_affine.g4b"
-#include "exa_wm_mask_sample.g4b"
+#include "exa_wm_mask_sample_a.g4b"
 #include "exa_wm_noca.g4b"
 #include "exa_wm_write.g4b"
 };
@@ -432,21 +434,21 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
 
     pI830->transform[0] = pSrcPicture->transform;
     is_affine_src = i830_transform_is_affine (pI830->transform[0]);
-    is_affine_mask = i830_transform_is_affine (pI830->transform[1]);
-    is_affine = is_affine_src && is_affine_mask;
 
     if (!pMask) {
 	pI830->transform[1] = NULL;
 	pI830->scale_units[1][0] = -1;
 	pI830->scale_units[1][1] = -1;
+	is_affine_mask = TRUE;
     } else {
 	pI830->transform[1] = pMaskPicture->transform;
-	if (pI830->transform[1])
-	    I830FALLBACK("i965 mask transform not implemented!\n");
 	pI830->scale_units[1][0] = pMask->drawable.width;
 	pI830->scale_units[1][1] = pMask->drawable.height;
+	is_affine_mask = i830_transform_is_affine (pI830->transform[1]);
     }
 
+    is_affine = is_affine_src && is_affine_mask;
+
     /* setup 3d pipeline state */
 
     binding_table_entries = 2; /* default no mask */
@@ -463,7 +465,7 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
     next_offset = wm_offset + sizeof(*wm_state);
 
     wm_scratch_offset = ALIGN(next_offset, 1024);
-    next_offset = wm_scratch_offset + 1024 * PS_MAX_THREADS;
+    next_offset = wm_scratch_offset + PS_SCRATCH_SPACE * PS_MAX_THREADS;
 
     cc_offset = ALIGN(next_offset, 32);
     next_offset = cc_offset + sizeof(*cc_state);
@@ -782,6 +784,7 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
 	    I830FALLBACK("Bad filter 0x%x\n", pMaskPicture->filter);
    	}
 
+	mask_sampler_state->ss0.default_color_mode = 0; /* GL mode */
    	if (!pMaskPicture->repeat) {
    	    mask_sampler_state->ss1.r_wrap_mode =
 		BRW_TEXCOORDMODE_CLAMP_BORDER;
@@ -885,7 +888,7 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
     wm_state->thread0.kernel_start_pointer =
 	(state_base_offset + ps_kernel_offset) >> 6;
     wm_state->thread0.grf_reg_count = BRW_GRF_BLOCKS(PS_KERNEL_NUM_GRF);
-    wm_state->thread1.single_program_flow = 0;
+    wm_state->thread1.single_program_flow = 1;
     if (!pMask)
 	wm_state->thread1.binding_table_entry_count = 2; /* 1 tex and fb */
     else
@@ -893,7 +896,7 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
 
     wm_state->thread2.scratch_space_base_pointer = (state_base_offset +
 						    wm_scratch_offset)>>10;
-    wm_state->thread2.per_thread_scratch_space = 0;
+    wm_state->thread2.per_thread_scratch_space = PS_SCRATCH_SPACE_LOG; 
     wm_state->thread3.const_urb_entry_read_length = 0;
     wm_state->thread3.const_urb_entry_read_offset = 0;
     /* Each pair of attributes (src/mask coords) is one URB entry */
@@ -1044,12 +1047,12 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
 	if (is_affine)
 	{
 	    src_format = BRW_SURFACEFORMAT_R32G32_FLOAT;
-	    w_component = BRW_VFCOMPONENT_NOSTORE;
+	    w_component = BRW_VFCOMPONENT_STORE_1_FLT;
 	}
 	else
 	{
 	    src_format = BRW_SURFACEFORMAT_R32G32B32_FLOAT;
-	    w_component = BRW_VFCOMPONENT_NOSTORE;
+	    w_component = BRW_VFCOMPONENT_STORE_SRC;
 	}
 	BEGIN_BATCH(pMask?12:10);
 	/* Set up the pointer to our (single) vertex buffer */
@@ -1083,7 +1086,7 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
 	OUT_BATCH((BRW_VFCOMPONENT_STORE_SRC	<< VE1_VFCOMPONENT_0_SHIFT) |
 		  (BRW_VFCOMPONENT_STORE_SRC	<< VE1_VFCOMPONENT_1_SHIFT) |
 		  (w_component			<< VE1_VFCOMPONENT_2_SHIFT) |
-		  (BRW_VFCOMPONENT_NOSTORE	<< VE1_VFCOMPONENT_3_SHIFT) |
+		  (BRW_VFCOMPONENT_STORE_1_FLT	<< VE1_VFCOMPONENT_3_SHIFT) |
 		  ((4 + 4)			<< VE1_DESTINATION_ELEMENT_OFFSET_SHIFT)); /* VUE offset in dwords */
 	/* u1, v1, w1 */
    	if (pMask) {
@@ -1095,15 +1098,15 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
 	    OUT_BATCH((BRW_VFCOMPONENT_STORE_SRC    << VE1_VFCOMPONENT_0_SHIFT) |
 		      (BRW_VFCOMPONENT_STORE_SRC    << VE1_VFCOMPONENT_1_SHIFT) |
 		      (w_component		    << VE1_VFCOMPONENT_2_SHIFT) |
-		      (BRW_VFCOMPONENT_NOSTORE	    << VE1_VFCOMPONENT_3_SHIFT) |
-		      ((4 + 2 + 4)		    << VE1_DESTINATION_ELEMENT_OFFSET_SHIFT)); /* VUE offset in dwords */
+		      (BRW_VFCOMPONENT_STORE_1_FLT  << VE1_VFCOMPONENT_3_SHIFT) |
+		      ((4 + 4 + 4)		    << VE1_DESTINATION_ELEMENT_OFFSET_SHIFT)); /* VUE offset in dwords */
    	}
 
 	ADVANCE_BATCH();
     }
 
 #ifdef I830DEBUG
-    ErrorF("try to sync to show any errors...");
+    ErrorF("try to sync to show any errors...\n");
     I830Sync(pScrn);
 #endif
     return TRUE;
@@ -1119,7 +1122,6 @@ i965_composite(PixmapPtr pDst, int srcX, int srcY, int maskX, int maskY,
     Bool is_affine_src, is_affine_mask, is_affine;
     float src_x[3], src_y[3], src_w[3], mask_x[3], mask_y[3], mask_w[3];
     int i;
-    int per_vertex = 2; /* dst x/y */
 
     is_affine_src = i830_transform_is_affine (pI830->transform[0]);
     is_affine_mask = i830_transform_is_affine (pI830->transform[1]);
@@ -1139,7 +1141,6 @@ i965_composite(PixmapPtr pDst, int srcX, int srcY, int maskX, int maskY,
 					      pI830->transform[0],
 					      &src_x[2], &src_y[2]))
 	    return;
-	per_vertex += 2;    /* src u/v */
     }
     else
     {
@@ -1158,14 +1159,13 @@ i965_composite(PixmapPtr pDst, int srcX, int srcY, int maskX, int maskY,
 						 &src_x[2], &src_y[2],
 						 &src_w[2]))
 	    return;
-	per_vertex += 3;    /* src u/v/w */
     }
 
     if (pI830->scale_units[1][0] == -1 || pI830->scale_units[1][1] == -1) {
 	has_mask = FALSE;
     } else {
 	has_mask = TRUE;
-	if (is_affine_mask) {
+	if (is_affine) {
 	    if (!i830_get_transformed_coordinates(maskX, maskY,
 						  pI830->transform[1],
 						  &mask_x[0], &mask_y[0]))
@@ -1178,7 +1178,6 @@ i965_composite(PixmapPtr pDst, int srcX, int srcY, int maskX, int maskY,
 						  pI830->transform[1],
 						  &mask_x[2], &mask_y[2]))
 		return;
-	    per_vertex += 2;	/* mask u/v */
 	} else {
 	    if (!i830_get_transformed_coordinates_3d(maskX, maskY,
 						     pI830->transform[1],
@@ -1195,10 +1194,17 @@ i965_composite(PixmapPtr pDst, int srcX, int srcY, int maskX, int maskY,
 						     &mask_x[2], &mask_y[2],
 						     &mask_w[2]))
 		return;
-	    per_vertex += 3;	/* mask u/v/w */
 	}
     }
 
+    {
+	BEGIN_BATCH(2);
+	OUT_BATCH(MI_FLUSH |
+		  MI_STATE_INSTRUCTION_CACHE_FLUSH |
+		  BRW_MI_GLOBAL_SNAPSHOT_RESET);
+	OUT_BATCH(MI_NOOP);
+	ADVANCE_BATCH();
+    }
     /* Wait for any existing composite rectangles to land before we overwrite
      * the VB with the next one.
      */
@@ -1246,6 +1252,7 @@ i965_composite(PixmapPtr pDst, int srcX, int srcY, int maskX, int maskY,
 	if (!is_affine)
 	    vb[i++] = mask_w[0];
     }
+    assert (i * 4 <= vb_size);
 
     {
       BEGIN_BATCH(6);
@@ -1262,7 +1269,7 @@ i965_composite(PixmapPtr pDst, int srcX, int srcY, int maskX, int maskY,
       ADVANCE_BATCH();
     }
 #ifdef I830DEBUG
-    ErrorF("sync after 3dprimitive");
+    ErrorF("sync after 3dprimitive\n");
     I830Sync(pScrn);
 #endif
     /* we must be sure that the pipeline is flushed before next exa draw,
diff --git a/src/i965_video.c b/src/i965_video.c
index 41f56a9..1d2c3f5 100644
--- a/src/i965_video.c
+++ b/src/i965_video.c
@@ -78,7 +78,7 @@ static const uint32_t sip_kernel_static[][4] = {
 #define SF_MAX_THREADS	   1
 
 static const uint32_t sf_kernel_static[][4] = {
-#include "sf_prog.h"
+#include "packed_yuv_sf.g4b"
 };
 
 /*
@@ -94,7 +94,7 @@ static const uint32_t sf_kernel_static[][4] = {
 #define BRW_GRF_BLOCKS(nreg)	((nreg + 15) / 16 - 1)
 
 static const uint32_t ps_kernel_static[][4] = {
-#include "wm_prog.h"
+#include "packed_yuv_wm.g4b"
 };
 
 #define ALIGN(i,m)    (((i) + (m) - 1) & ~((m) - 1))
commit ff643d5b490095a202963fa799ba1ee657cf07ed
Author: Keith Packard <keithp at keithp.com>
Date:   Mon Mar 31 12:06:37 2008 -0700

    remove old shader source files
    (cherry picked from commit f8081178eb6fda0e405967cbacad532561619262)

diff --git a/src/exa_sf_mask_prog.h b/src/exa_sf_mask_prog.h
deleted file mode 100644
index be0a77b..0000000
--- a/src/exa_sf_mask_prog.h
+++ /dev/null
@@ -1,15 +0,0 @@
-   { 0x00400031, 0x20c01fbd, 0x0069002c, 0x01110001 },
-   { 0x00600001, 0x206003be, 0x008d0060, 0x00000000 },
-   { 0x00600040, 0x20e077bd, 0x008d0080, 0x008d40a0 },
-   { 0x00600041, 0x202077be, 0x008d00e0, 0x000000c0 },
-   { 0x00600040, 0x20e077bd, 0x008d00a0, 0x008d4060 },
-   { 0x00600041, 0x204077be, 0x008d00e0, 0x000000c8 },
-   { 0x00600031, 0x20001fbc, 0x008d0000, 0x8640c800 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
diff --git a/src/exa_sf_prog.h b/src/exa_sf_prog.h
deleted file mode 100644
index 223c9c9..0000000
--- a/src/exa_sf_prog.h
+++ /dev/null
@@ -1,15 +0,0 @@
-   { 0x00400031, 0x20c01fbd, 0x0069002c, 0x01110001 },
-   { 0x00400001, 0x206003be, 0x00690060, 0x00000000 },
-   { 0x00400040, 0x20e077bd, 0x00690080, 0x006940a0 },
-   { 0x00400041, 0x202077be, 0x006900e0, 0x000000c0 },
-   { 0x00400040, 0x20e077bd, 0x006900a0, 0x00694060 },
-   { 0x00400041, 0x204077be, 0x006900e0, 0x000000c8 },
-   { 0x00600031, 0x20001fbc, 0x008d0000, 0x8640c800 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
diff --git a/src/exa_wm_maskca_prog.h b/src/exa_wm_maskca_prog.h
deleted file mode 100644
index d936412..0000000
--- a/src/exa_wm_maskca_prog.h
+++ /dev/null
@@ -1,95 +0,0 @@
-   { 0x00000001, 0x20c0013d, 0x00000028, 0x00000000 },
-   { 0x00000040, 0x20c40d3d, 0x00000028, 0x00000001 },
-   { 0x00000001, 0x20c8013d, 0x00000028, 0x00000000 },
-   { 0x00000040, 0x20cc0d3d, 0x00000028, 0x00000001 },
-   { 0x00000001, 0x2100013d, 0x0000002a, 0x00000000 },
-   { 0x00000001, 0x2104013d, 0x0000002a, 0x00000000 },
-   { 0x00000040, 0x21080d3d, 0x0000002a, 0x00000001 },
-   { 0x00000040, 0x210c0d3d, 0x0000002a, 0x00000001 },
-   { 0x00000001, 0x20d0013d, 0x0000002c, 0x00000000 },
-   { 0x00000040, 0x20d40d3d, 0x0000002c, 0x00000001 },
-   { 0x00000001, 0x20d8013d, 0x0000002c, 0x00000000 },
-   { 0x00000040, 0x20dc0d3d, 0x0000002c, 0x00000001 },
-   { 0x00000001, 0x2110013d, 0x0000002e, 0x00000000 },
-   { 0x00000001, 0x2114013d, 0x0000002e, 0x00000000 },
-   { 0x00000040, 0x21180d3d, 0x0000002e, 0x00000001 },
-   { 0x00000040, 0x211c0d3d, 0x0000002e, 0x00000001 },
-   { 0x00000001, 0x20e0013d, 0x00000030, 0x00000000 },
-   { 0x00000040, 0x20e40d3d, 0x00000030, 0x00000001 },
-   { 0x00000001, 0x20e8013d, 0x00000030, 0x00000000 },
-   { 0x00000040, 0x20ec0d3d, 0x00000030, 0x00000001 },
-   { 0x00000001, 0x2120013d, 0x00000032, 0x00000000 },
-   { 0x00000001, 0x2124013d, 0x00000032, 0x00000000 },
-   { 0x00000040, 0x21280d3d, 0x00000032, 0x00000001 },
-   { 0x00000040, 0x212c0d3d, 0x00000032, 0x00000001 },
-   { 0x00000001, 0x20f0013d, 0x00000034, 0x00000000 },
-   { 0x00000040, 0x20f40d3d, 0x00000034, 0x00000001 },
-   { 0x00000001, 0x20f8013d, 0x00000034, 0x00000000 },
-   { 0x00000040, 0x20fc0d3d, 0x00000034, 0x00000001 },
-   { 0x00000001, 0x2130013d, 0x00000036, 0x00000000 },
-   { 0x00000001, 0x2134013d, 0x00000036, 0x00000000 },
-   { 0x00000040, 0x21380d3d, 0x00000036, 0x00000001 },
-   { 0x00000040, 0x213c0d3d, 0x00000036, 0x00000001 },
-   { 0x00600040, 0x214077bd, 0x008d00c0, 0x00004020 },
-   { 0x00600040, 0x216077bd, 0x008d00e0, 0x00004020 },
-   { 0x00600041, 0x214077bd, 0x008d0140, 0x00000060 },
-   { 0x00600041, 0x216077bd, 0x008d0160, 0x00000060 },
-   { 0x00600040, 0x214077bd, 0x008d0140, 0x0000006c },
-   { 0x00600040, 0x216077bd, 0x008d0160, 0x0000006c },
-   { 0x00600040, 0x218077bd, 0x008d0100, 0x00004024 },
-   { 0x00600040, 0x21a077bd, 0x008d0120, 0x00004024 },
-   { 0x00600041, 0x218077bd, 0x008d0180, 0x00000064 },
-   { 0x00600041, 0x21a077bd, 0x008d01a0, 0x00000064 },
-   { 0x00600040, 0x218077bd, 0x008d0180, 0x0000007c },
-   { 0x00600040, 0x21a077bd, 0x008d01a0, 0x0000007c },
-   { 0x00600001, 0x202003be, 0x008d0140, 0x00000000 },
-   { 0x00600001, 0x204003be, 0x008d0160, 0x00000000 },
-   { 0x00600001, 0x206003be, 0x008d0180, 0x00000000 },
-   { 0x00600001, 0x208003be, 0x008d01a0, 0x00000000 },
-   { 0x00800031, 0x21c01d29, 0x008d0000, 0x02580001 },
-   { 0x00600001, 0x22a00021, 0x008d02a0, 0x00000000 },
-   { 0x00600040, 0x214077bd, 0x008d00c0, 0x00004020 },
-   { 0x00600040, 0x216077bd, 0x008d00e0, 0x00004020 },
-   { 0x00600041, 0x214077bd, 0x008d0140, 0x00000080 },
-   { 0x00600041, 0x216077bd, 0x008d0160, 0x00000080 },
-   { 0x00600040, 0x214077bd, 0x008d0140, 0x0000008c },
-   { 0x00600040, 0x216077bd, 0x008d0160, 0x0000008c },
-   { 0x00600040, 0x218077bd, 0x008d0100, 0x00004024 },
-   { 0x00600040, 0x21a077bd, 0x008d0120, 0x00004024 },
-   { 0x00600041, 0x218077bd, 0x008d0180, 0x00000084 },
-   { 0x00600041, 0x21a077bd, 0x008d01a0, 0x00000084 },
-   { 0x00600040, 0x218077bd, 0x008d0180, 0x0000009c },
-   { 0x00600040, 0x21a077bd, 0x008d01a0, 0x0000009c },
-   { 0x00600001, 0x202003be, 0x008d0140, 0x00000000 },
-   { 0x00600001, 0x204003be, 0x008d0160, 0x00000000 },
-   { 0x00600001, 0x206003be, 0x008d0180, 0x00000000 },
-   { 0x00600001, 0x208003be, 0x008d01a0, 0x00000000 },
-   { 0x00800031, 0x22c01d29, 0x008d0000, 0x02580102 },
-   { 0x00600001, 0x23a00021, 0x008d03a0, 0x00000000 },
-   { 0x00600041, 0x21c077bd, 0x008d01c0, 0x008d02c0 },
-   { 0x00600041, 0x21e077bd, 0x008d01e0, 0x008d02e0 },
-   { 0x00600041, 0x220077bd, 0x008d0200, 0x008d0300 },
-   { 0x00600041, 0x222077bd, 0x008d0220, 0x008d0320 },
-   { 0x00600041, 0x224077bd, 0x008d0240, 0x008d0340 },
-   { 0x00600041, 0x226077bd, 0x008d0260, 0x008d0360 },
-   { 0x00600041, 0x228077bd, 0x008d0280, 0x008d0380 },
-   { 0x00600041, 0x22a077bd, 0x008d02a0, 0x008d03a0 },
-   { 0x00600001, 0x204003be, 0x008d01c0, 0x00000000 },
-   { 0x00600001, 0x206003be, 0x008d0200, 0x00000000 },
-   { 0x00600001, 0x208003be, 0x008d0240, 0x00000000 },
-   { 0x00600001, 0x20a003be, 0x008d0280, 0x00000000 },
-   { 0x00600001, 0x20c003be, 0x008d01e0, 0x00000000 },
-   { 0x00600001, 0x20e003be, 0x008d0220, 0x00000000 },
-   { 0x00600001, 0x210003be, 0x008d0260, 0x00000000 },
-   { 0x00600001, 0x212003be, 0x008d02a0, 0x00000000 },
-   { 0x00600201, 0x20200022, 0x008d0020, 0x00000000 },
-   { 0x00800031, 0x24001d28, 0x008d0000, 0x85a04800 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
diff --git a/src/exa_wm_maskca_srcalpha_prog.h b/src/exa_wm_maskca_srcalpha_prog.h
deleted file mode 100644
index d83b119..0000000
--- a/src/exa_wm_maskca_srcalpha_prog.h
+++ /dev/null
@@ -1,95 +0,0 @@
-   { 0x00000001, 0x20c0013d, 0x00000028, 0x00000000 },
-   { 0x00000040, 0x20c40d3d, 0x00000028, 0x00000001 },
-   { 0x00000001, 0x20c8013d, 0x00000028, 0x00000000 },
-   { 0x00000040, 0x20cc0d3d, 0x00000028, 0x00000001 },
-   { 0x00000001, 0x2100013d, 0x0000002a, 0x00000000 },
-   { 0x00000001, 0x2104013d, 0x0000002a, 0x00000000 },
-   { 0x00000040, 0x21080d3d, 0x0000002a, 0x00000001 },
-   { 0x00000040, 0x210c0d3d, 0x0000002a, 0x00000001 },
-   { 0x00000001, 0x20d0013d, 0x0000002c, 0x00000000 },
-   { 0x00000040, 0x20d40d3d, 0x0000002c, 0x00000001 },
-   { 0x00000001, 0x20d8013d, 0x0000002c, 0x00000000 },
-   { 0x00000040, 0x20dc0d3d, 0x0000002c, 0x00000001 },
-   { 0x00000001, 0x2110013d, 0x0000002e, 0x00000000 },
-   { 0x00000001, 0x2114013d, 0x0000002e, 0x00000000 },
-   { 0x00000040, 0x21180d3d, 0x0000002e, 0x00000001 },
-   { 0x00000040, 0x211c0d3d, 0x0000002e, 0x00000001 },
-   { 0x00000001, 0x20e0013d, 0x00000030, 0x00000000 },
-   { 0x00000040, 0x20e40d3d, 0x00000030, 0x00000001 },
-   { 0x00000001, 0x20e8013d, 0x00000030, 0x00000000 },
-   { 0x00000040, 0x20ec0d3d, 0x00000030, 0x00000001 },
-   { 0x00000001, 0x2120013d, 0x00000032, 0x00000000 },
-   { 0x00000001, 0x2124013d, 0x00000032, 0x00000000 },
-   { 0x00000040, 0x21280d3d, 0x00000032, 0x00000001 },
-   { 0x00000040, 0x212c0d3d, 0x00000032, 0x00000001 },
-   { 0x00000001, 0x20f0013d, 0x00000034, 0x00000000 },
-   { 0x00000040, 0x20f40d3d, 0x00000034, 0x00000001 },
-   { 0x00000001, 0x20f8013d, 0x00000034, 0x00000000 },
-   { 0x00000040, 0x20fc0d3d, 0x00000034, 0x00000001 },
-   { 0x00000001, 0x2130013d, 0x00000036, 0x00000000 },
-   { 0x00000001, 0x2134013d, 0x00000036, 0x00000000 },
-   { 0x00000040, 0x21380d3d, 0x00000036, 0x00000001 },
-   { 0x00000040, 0x213c0d3d, 0x00000036, 0x00000001 },
-   { 0x00600040, 0x214077bd, 0x008d00c0, 0x00004020 },
-   { 0x00600040, 0x216077bd, 0x008d00e0, 0x00004020 },
-   { 0x00600041, 0x214077bd, 0x008d0140, 0x00000060 },
-   { 0x00600041, 0x216077bd, 0x008d0160, 0x00000060 },
-   { 0x00600040, 0x214077bd, 0x008d0140, 0x0000006c },
-   { 0x00600040, 0x216077bd, 0x008d0160, 0x0000006c },
-   { 0x00600040, 0x218077bd, 0x008d0100, 0x00004024 },
-   { 0x00600040, 0x21a077bd, 0x008d0120, 0x00004024 },
-   { 0x00600041, 0x218077bd, 0x008d0180, 0x00000064 },
-   { 0x00600041, 0x21a077bd, 0x008d01a0, 0x00000064 },
-   { 0x00600040, 0x218077bd, 0x008d0180, 0x0000007c },
-   { 0x00600040, 0x21a077bd, 0x008d01a0, 0x0000007c },
-   { 0x00600001, 0x202003be, 0x008d0140, 0x00000000 },
-   { 0x00600001, 0x204003be, 0x008d0160, 0x00000000 },
-   { 0x00600001, 0x206003be, 0x008d0180, 0x00000000 },
-   { 0x00600001, 0x208003be, 0x008d01a0, 0x00000000 },
-   { 0x00800031, 0x21c01d29, 0x008d0000, 0x02580001 },
-   { 0x00600001, 0x22a00021, 0x008d02a0, 0x00000000 },
-   { 0x00600040, 0x214077bd, 0x008d00c0, 0x00004020 },
-   { 0x00600040, 0x216077bd, 0x008d00e0, 0x00004020 },
-   { 0x00600041, 0x214077bd, 0x008d0140, 0x00000080 },
-   { 0x00600041, 0x216077bd, 0x008d0160, 0x00000080 },
-   { 0x00600040, 0x214077bd, 0x008d0140, 0x0000008c },
-   { 0x00600040, 0x216077bd, 0x008d0160, 0x0000008c },
-   { 0x00600040, 0x218077bd, 0x008d0100, 0x00004024 },
-   { 0x00600040, 0x21a077bd, 0x008d0120, 0x00004024 },
-   { 0x00600041, 0x218077bd, 0x008d0180, 0x00000084 },
-   { 0x00600041, 0x21a077bd, 0x008d01a0, 0x00000084 },
-   { 0x00600040, 0x218077bd, 0x008d0180, 0x0000009c },
-   { 0x00600040, 0x21a077bd, 0x008d01a0, 0x0000009c },
-   { 0x00600001, 0x202003be, 0x008d0140, 0x00000000 },
-   { 0x00600001, 0x204003be, 0x008d0160, 0x00000000 },
-   { 0x00600001, 0x206003be, 0x008d0180, 0x00000000 },
-   { 0x00600001, 0x208003be, 0x008d01a0, 0x00000000 },
-   { 0x00800031, 0x22c01d29, 0x008d0000, 0x02580102 },
-   { 0x00600001, 0x23a00021, 0x008d03a0, 0x00000000 },
-   { 0x00600041, 0x21c077bd, 0x008d02c0, 0x008d0280 },
-   { 0x00600041, 0x21e077bd, 0x008d02e0, 0x008d02a0 },
-   { 0x00600041, 0x220077bd, 0x008d0300, 0x008d0280 },
-   { 0x00600041, 0x222077bd, 0x008d0320, 0x008d02a0 },
-   { 0x00600041, 0x224077bd, 0x008d0340, 0x008d0280 },
-   { 0x00600041, 0x226077bd, 0x008d0360, 0x008d02a0 },
-   { 0x00600041, 0x228077bd, 0x008d0380, 0x008d0280 },
-   { 0x00600041, 0x22a077bd, 0x008d03a0, 0x008d02a0 },
-   { 0x00600001, 0x204003be, 0x008d01c0, 0x00000000 },
-   { 0x00600001, 0x206003be, 0x008d0200, 0x00000000 },
-   { 0x00600001, 0x208003be, 0x008d0240, 0x00000000 },
-   { 0x00600001, 0x20a003be, 0x008d0280, 0x00000000 },
-   { 0x00600001, 0x20c003be, 0x008d01e0, 0x00000000 },
-   { 0x00600001, 0x20e003be, 0x008d0220, 0x00000000 },
-   { 0x00600001, 0x210003be, 0x008d0260, 0x00000000 },
-   { 0x00600001, 0x212003be, 0x008d02a0, 0x00000000 },
-   { 0x00600201, 0x20200022, 0x008d0020, 0x00000000 },
-   { 0x00800031, 0x24001d28, 0x008d0000, 0x85a04800 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
diff --git a/src/exa_wm_masknoca_prog.h b/src/exa_wm_masknoca_prog.h
deleted file mode 100644
index 5fcf3b5..0000000
--- a/src/exa_wm_masknoca_prog.h
+++ /dev/null
@@ -1,95 +0,0 @@
-   { 0x00000001, 0x20c0013d, 0x00000028, 0x00000000 },
-   { 0x00000040, 0x20c40d3d, 0x00000028, 0x00000001 },
-   { 0x00000001, 0x20c8013d, 0x00000028, 0x00000000 },
-   { 0x00000040, 0x20cc0d3d, 0x00000028, 0x00000001 },
-   { 0x00000001, 0x2100013d, 0x0000002a, 0x00000000 },
-   { 0x00000001, 0x2104013d, 0x0000002a, 0x00000000 },
-   { 0x00000040, 0x21080d3d, 0x0000002a, 0x00000001 },
-   { 0x00000040, 0x210c0d3d, 0x0000002a, 0x00000001 },
-   { 0x00000001, 0x20d0013d, 0x0000002c, 0x00000000 },
-   { 0x00000040, 0x20d40d3d, 0x0000002c, 0x00000001 },
-   { 0x00000001, 0x20d8013d, 0x0000002c, 0x00000000 },
-   { 0x00000040, 0x20dc0d3d, 0x0000002c, 0x00000001 },
-   { 0x00000001, 0x2110013d, 0x0000002e, 0x00000000 },
-   { 0x00000001, 0x2114013d, 0x0000002e, 0x00000000 },
-   { 0x00000040, 0x21180d3d, 0x0000002e, 0x00000001 },
-   { 0x00000040, 0x211c0d3d, 0x0000002e, 0x00000001 },
-   { 0x00000001, 0x20e0013d, 0x00000030, 0x00000000 },
-   { 0x00000040, 0x20e40d3d, 0x00000030, 0x00000001 },
-   { 0x00000001, 0x20e8013d, 0x00000030, 0x00000000 },
-   { 0x00000040, 0x20ec0d3d, 0x00000030, 0x00000001 },
-   { 0x00000001, 0x2120013d, 0x00000032, 0x00000000 },
-   { 0x00000001, 0x2124013d, 0x00000032, 0x00000000 },
-   { 0x00000040, 0x21280d3d, 0x00000032, 0x00000001 },
-   { 0x00000040, 0x212c0d3d, 0x00000032, 0x00000001 },
-   { 0x00000001, 0x20f0013d, 0x00000034, 0x00000000 },
-   { 0x00000040, 0x20f40d3d, 0x00000034, 0x00000001 },
-   { 0x00000001, 0x20f8013d, 0x00000034, 0x00000000 },
-   { 0x00000040, 0x20fc0d3d, 0x00000034, 0x00000001 },
-   { 0x00000001, 0x2130013d, 0x00000036, 0x00000000 },
-   { 0x00000001, 0x2134013d, 0x00000036, 0x00000000 },
-   { 0x00000040, 0x21380d3d, 0x00000036, 0x00000001 },
-   { 0x00000040, 0x213c0d3d, 0x00000036, 0x00000001 },
-   { 0x00600040, 0x214077bd, 0x008d00c0, 0x00004020 },
-   { 0x00600040, 0x216077bd, 0x008d00e0, 0x00004020 },
-   { 0x00600041, 0x214077bd, 0x008d0140, 0x00000060 },
-   { 0x00600041, 0x216077bd, 0x008d0160, 0x00000060 },
-   { 0x00600040, 0x214077bd, 0x008d0140, 0x0000006c },
-   { 0x00600040, 0x216077bd, 0x008d0160, 0x0000006c },
-   { 0x00600040, 0x218077bd, 0x008d0100, 0x00004024 },
-   { 0x00600040, 0x21a077bd, 0x008d0120, 0x00004024 },
-   { 0x00600041, 0x218077bd, 0x008d0180, 0x00000064 },
-   { 0x00600041, 0x21a077bd, 0x008d01a0, 0x00000064 },
-   { 0x00600040, 0x218077bd, 0x008d0180, 0x0000007c },
-   { 0x00600040, 0x21a077bd, 0x008d01a0, 0x0000007c },
-   { 0x00600001, 0x202003be, 0x008d0140, 0x00000000 },
-   { 0x00600001, 0x204003be, 0x008d0160, 0x00000000 },
-   { 0x00600001, 0x206003be, 0x008d0180, 0x00000000 },
-   { 0x00600001, 0x208003be, 0x008d01a0, 0x00000000 },
-   { 0x00800031, 0x21c01d29, 0x008d0000, 0x02580001 },
-   { 0x00600001, 0x22a00021, 0x008d02a0, 0x00000000 },
-   { 0x00600040, 0x214077bd, 0x008d00c0, 0x00004020 },
-   { 0x00600040, 0x216077bd, 0x008d00e0, 0x00004020 },
-   { 0x00600041, 0x214077bd, 0x008d0140, 0x00000080 },
-   { 0x00600041, 0x216077bd, 0x008d0160, 0x00000080 },
-   { 0x00600040, 0x214077bd, 0x008d0140, 0x0000008c },
-   { 0x00600040, 0x216077bd, 0x008d0160, 0x0000008c },
-   { 0x00600040, 0x218077bd, 0x008d0100, 0x00004024 },
-   { 0x00600040, 0x21a077bd, 0x008d0120, 0x00004024 },
-   { 0x00600041, 0x218077bd, 0x008d0180, 0x00000084 },
-   { 0x00600041, 0x21a077bd, 0x008d01a0, 0x00000084 },
-   { 0x00600040, 0x218077bd, 0x008d0180, 0x0000009c },
-   { 0x00600040, 0x21a077bd, 0x008d01a0, 0x0000009c },
-   { 0x00600001, 0x202003be, 0x008d0140, 0x00000000 },
-   { 0x00600001, 0x204003be, 0x008d0160, 0x00000000 },
-   { 0x00600001, 0x206003be, 0x008d0180, 0x00000000 },
-   { 0x00600001, 0x208003be, 0x008d01a0, 0x00000000 },
-   { 0x00800031, 0x22c01d29, 0x008d0000, 0x02580102 },
-   { 0x00600001, 0x23a00021, 0x008d03a0, 0x00000000 },
-   { 0x00600041, 0x21c077bd, 0x008d01c0, 0x008d0380 },
-   { 0x00600041, 0x21e077bd, 0x008d01e0, 0x008d03a0 },
-   { 0x00600041, 0x220077bd, 0x008d0200, 0x008d0380 },
-   { 0x00600041, 0x222077bd, 0x008d0220, 0x008d03a0 },
-   { 0x00600041, 0x224077bd, 0x008d0240, 0x008d0380 },
-   { 0x00600041, 0x226077bd, 0x008d0260, 0x008d03a0 },
-   { 0x00600041, 0x228077bd, 0x008d0280, 0x008d0380 },
-   { 0x00600041, 0x22a077bd, 0x008d02a0, 0x008d03a0 },
-   { 0x00600001, 0x204003be, 0x008d01c0, 0x00000000 },
-   { 0x00600001, 0x206003be, 0x008d0200, 0x00000000 },
-   { 0x00600001, 0x208003be, 0x008d0240, 0x00000000 },
-   { 0x00600001, 0x20a003be, 0x008d0280, 0x00000000 },
-   { 0x00600001, 0x20c003be, 0x008d01e0, 0x00000000 },
-   { 0x00600001, 0x20e003be, 0x008d0220, 0x00000000 },
-   { 0x00600001, 0x210003be, 0x008d0260, 0x00000000 },
-   { 0x00600001, 0x212003be, 0x008d02a0, 0x00000000 },
-   { 0x00600201, 0x20200022, 0x008d0020, 0x00000000 },
-   { 0x00800031, 0x24001d28, 0x008d0000, 0x85a04800 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
diff --git a/src/exa_wm_nomask_prog.h b/src/exa_wm_nomask_prog.h
deleted file mode 100644
index c73bdbc..0000000
--- a/src/exa_wm_nomask_prog.h
+++ /dev/null
@@ -1,34 +0,0 @@
-   { 0x00800040, 0x20806d29, 0x00480028, 0x10101010 },
-   { 0x00800040, 0x20c06d29, 0x0048002a, 0x11001100 },
-   { 0x00802040, 0x2180753d, 0x008d0080, 0x00004020 },
-   { 0x00802040, 0x2200753d, 0x008d00c0, 0x00004024 },
-   { 0x00802041, 0x210077bd, 0x008d0180, 0x00000060 },
-   { 0x00802041, 0x214077bd, 0x008d0200, 0x00000064 },
-   { 0x00802040, 0x210077bd, 0x008d0100, 0x008d0140 },
-   { 0x00802040, 0x202077be, 0x008d0100, 0x0000006c },
-   { 0x00802041, 0x210077bd, 0x008d0180, 0x00000070 },
-   { 0x00802041, 0x214077bd, 0x008d0200, 0x00000074 },
-   { 0x00802040, 0x210077bd, 0x008d0100, 0x008d0140 },
-   { 0x00802040, 0x206077be, 0x008d0100, 0x0000007c },
-   { 0x00800031, 0x21801d29, 0x008d0000, 0x02580001 },
-   { 0x00600001, 0x22600021, 0x008d0260, 0x00000000 },
-   { 0x00600001, 0x202003be, 0x008d0020, 0x00000000 },
-   { 0x00600001, 0x204003be, 0x008d0180, 0x00000000 },
-   { 0x00600001, 0x206003be, 0x008d01c0, 0x00000000 },
-   { 0x00600001, 0x208003be, 0x008d0200, 0x00000000 },
-   { 0x00600001, 0x20a003be, 0x008d0240, 0x00000000 },
-   { 0x00600001, 0x20c003be, 0x008d01a0, 0x00000000 },
-   { 0x00600001, 0x20e003be, 0x008d01e0, 0x00000000 },
-   { 0x00600001, 0x210003be, 0x008d0220, 0x00000000 },
-   { 0x00600001, 0x212003be, 0x008d0260, 0x00000000 },
-   { 0x00600201, 0x20200022, 0x008d0020, 0x00000000 },
-   { 0x00800031, 0x24001d28, 0x008d0000, 0x85a04800 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
diff --git a/src/sf_prog.h b/src/sf_prog.h
deleted file mode 100644
index 830d176..0000000
--- a/src/sf_prog.h
+++ /dev/null
@@ -1,17 +0,0 @@
-   { 0x00000031, 0x20c01fbd, 0x0000002c, 0x01110081 },
-   { 0x00000031, 0x20c41fbd, 0x00000034, 0x01110081 },
-   { 0x00600040, 0x20e077bd, 0x008d0080, 0x008d4060 },
-   { 0x00000041, 0x20e077bd, 0x000000e0, 0x000000c0 },
-   { 0x00000041, 0x20e477bd, 0x000000e4, 0x000000c4 },
-   { 0x00600001, 0x202003be, 0x000000e0, 0x00000000 },
-   { 0x00600001, 0x204003be, 0x000000e4, 0x00000000 },
-   { 0x00600001, 0x206003be, 0x008d0060, 0x00000000 },
-   { 0x00600031, 0x20001fbc, 0x008d0000, 0x8640c800 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
diff --git a/src/wm_prog.h b/src/wm_prog.h
deleted file mode 100644
index d72c651..0000000
--- a/src/wm_prog.h
+++ /dev/null
@@ -1,82 +0,0 @@
-   { 0x00000001, 0x2080013d, 0x00000028, 0x00000000 },
-   { 0x00000040, 0x20840d3d, 0x00000028, 0x00000001 },
-   { 0x00000001, 0x2088013d, 0x00000028, 0x00000000 },
-   { 0x00000040, 0x208c0d3d, 0x00000028, 0x00000001 },
-   { 0x00000001, 0x20c0013d, 0x0000002a, 0x00000000 },
-   { 0x00000001, 0x20c4013d, 0x0000002a, 0x00000000 },
-   { 0x00000040, 0x20c80d3d, 0x0000002a, 0x00000001 },
-   { 0x00000040, 0x20cc0d3d, 0x0000002a, 0x00000001 },
-   { 0x00000001, 0x2090013d, 0x0000002c, 0x00000000 },
-   { 0x00000040, 0x20940d3d, 0x0000002c, 0x00000001 },
-   { 0x00000001, 0x2098013d, 0x0000002c, 0x00000000 },
-   { 0x00000040, 0x209c0d3d, 0x0000002c, 0x00000001 },
-   { 0x00000001, 0x20d0013d, 0x0000002e, 0x00000000 },
-   { 0x00000001, 0x20d4013d, 0x0000002e, 0x00000000 },
-   { 0x00000040, 0x20d80d3d, 0x0000002e, 0x00000001 },
-   { 0x00000040, 0x20dc0d3d, 0x0000002e, 0x00000001 },
-   { 0x00000001, 0x20a0013d, 0x00000030, 0x00000000 },
-   { 0x00000040, 0x20a40d3d, 0x00000030, 0x00000001 },
-   { 0x00000001, 0x20a8013d, 0x00000030, 0x00000000 },
-   { 0x00000040, 0x20ac0d3d, 0x00000030, 0x00000001 },
-   { 0x00000001, 0x20e0013d, 0x00000032, 0x00000000 },
-   { 0x00000001, 0x20e4013d, 0x00000032, 0x00000000 },
-   { 0x00000040, 0x20e80d3d, 0x00000032, 0x00000001 },
-   { 0x00000040, 0x20ec0d3d, 0x00000032, 0x00000001 },
-   { 0x00000001, 0x20b0013d, 0x00000034, 0x00000000 },
-   { 0x00000040, 0x20b40d3d, 0x00000034, 0x00000001 },
-   { 0x00000001, 0x20b8013d, 0x00000034, 0x00000000 },
-   { 0x00000040, 0x20bc0d3d, 0x00000034, 0x00000001 },
-   { 0x00000001, 0x20f0013d, 0x00000036, 0x00000000 },
-   { 0x00000001, 0x20f4013d, 0x00000036, 0x00000000 },
-   { 0x00000040, 0x20f80d3d, 0x00000036, 0x00000001 },
-   { 0x00000040, 0x20fc0d3d, 0x00000036, 0x00000001 },
-   { 0x00600040, 0x208077bd, 0x008d0080, 0x00004020 },
-   { 0x00600040, 0x20a077bd, 0x008d00a0, 0x00004020 },
-   { 0x00600041, 0x208077bd, 0x008d0080, 0x00000060 },
-   { 0x00600041, 0x20a077bd, 0x008d00a0, 0x00000060 },
-   { 0x00600040, 0x208077bd, 0x008d0080, 0x0000006c },
-   { 0x00600040, 0x20a077bd, 0x008d00a0, 0x0000006c },
-   { 0x00600040, 0x20c077bd, 0x008d00c0, 0x00004024 },
-   { 0x00600040, 0x20e077bd, 0x008d00e0, 0x00004024 },
-   { 0x00600041, 0x20c077bd, 0x008d00c0, 0x00000074 },
-   { 0x00600041, 0x20e077bd, 0x008d00e0, 0x00000074 },
-   { 0x00600040, 0x20c077bd, 0x008d00c0, 0x0000007c },
-   { 0x00600040, 0x20e077bd, 0x008d00e0, 0x0000007c },
-   { 0x00600001, 0x202003be, 0x008d0080, 0x00000000 },
-   { 0x00600001, 0x204003be, 0x008d00a0, 0x00000000 },
-   { 0x00600001, 0x206003be, 0x008d00c0, 0x00000000 },
-   { 0x00600001, 0x208003be, 0x008d00e0, 0x00000000 },
-   { 0x00800031, 0x21801d29, 0x008d0000, 0x02580001 },
-   { 0x00600001, 0x22600129, 0x008d0260, 0x00000000 },
-   { 0x00600040, 0x21c07fbd, 0x008d01c0, 0xbd808081 },
-   { 0x00600040, 0x21807fbd, 0x008d0180, 0xbf008084 },
-   { 0x00600040, 0x22007fbd, 0x008d0200, 0xbf008084 },
-   { 0x00600041, 0x21c07fbd, 0x008d01c0, 0x3f94fdf4 },
-   { 0x00600041, 0x20007fbc, 0x008d0180, 0x3fcc49ba },
-   { 0x80600048, 0x20407fbe, 0x008d01c0, 0x3f800000 },
-   { 0x00600041, 0x20007fbc, 0x008d0180, 0xbf5020c5 },
-   { 0x00600048, 0x20007fbc, 0x008d0200, 0xbec8b439 },
-   { 0x80600048, 0x20607fbe, 0x008d01c0, 0x3f800000 },
-   { 0x00600041, 0x20007fbc, 0x008d0200, 0x40011687 },
-   { 0x80600048, 0x20807fbe, 0x008d01c0, 0x3f800000 },
-   { 0x00600040, 0x21e07fbd, 0x008d01e0, 0xbd808081 },
-   { 0x00600040, 0x21a07fbd, 0x008d01a0, 0xbf008084 },
-   { 0x00600040, 0x22207fbd, 0x008d0220, 0xbf008084 },
-   { 0x00600041, 0x21e07fbd, 0x008d01e0, 0x3f94fdf4 },
-   { 0x00600041, 0x20007fbc, 0x008d01a0, 0x3fcc49ba },
-   { 0x80600048, 0x20c07fbe, 0x008d01e0, 0x3f800000 },
-   { 0x00600041, 0x20007fbc, 0x008d01a0, 0xbf5020c5 },
-   { 0x00600048, 0x20007fbc, 0x008d0220, 0xbec8b439 },
-   { 0x80600048, 0x20e07fbe, 0x008d01e0, 0x3f800000 },
-   { 0x00600041, 0x20007fbc, 0x008d0220, 0x40011687 },
-   { 0x80600048, 0x21007fbe, 0x008d01e0, 0x3f800000 },
-   { 0x00600201, 0x20200022, 0x008d0020, 0x00000000 },
-   { 0x00800031, 0x24001d28, 0x008d0000, 0x85a04800 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
commit 55110c54c7d40413121484995117b921eb7cab42
Author: Keith Packard <keithp at keithp.com>
Date:   Mon Mar 31 12:05:56 2008 -0700

    remove old monolithic shaders
    (cherry picked from commit 879f8717b09f79156b218ee9cc2107700190d586)

diff --git a/src/exa_wm_maskca.g4b b/src/exa_wm_maskca.g4b
deleted file mode 100644
index d936412..0000000
--- a/src/exa_wm_maskca.g4b
+++ /dev/null
@@ -1,95 +0,0 @@
-   { 0x00000001, 0x20c0013d, 0x00000028, 0x00000000 },
-   { 0x00000040, 0x20c40d3d, 0x00000028, 0x00000001 },
-   { 0x00000001, 0x20c8013d, 0x00000028, 0x00000000 },
-   { 0x00000040, 0x20cc0d3d, 0x00000028, 0x00000001 },
-   { 0x00000001, 0x2100013d, 0x0000002a, 0x00000000 },
-   { 0x00000001, 0x2104013d, 0x0000002a, 0x00000000 },
-   { 0x00000040, 0x21080d3d, 0x0000002a, 0x00000001 },
-   { 0x00000040, 0x210c0d3d, 0x0000002a, 0x00000001 },
-   { 0x00000001, 0x20d0013d, 0x0000002c, 0x00000000 },
-   { 0x00000040, 0x20d40d3d, 0x0000002c, 0x00000001 },
-   { 0x00000001, 0x20d8013d, 0x0000002c, 0x00000000 },
-   { 0x00000040, 0x20dc0d3d, 0x0000002c, 0x00000001 },
-   { 0x00000001, 0x2110013d, 0x0000002e, 0x00000000 },
-   { 0x00000001, 0x2114013d, 0x0000002e, 0x00000000 },
-   { 0x00000040, 0x21180d3d, 0x0000002e, 0x00000001 },
-   { 0x00000040, 0x211c0d3d, 0x0000002e, 0x00000001 },
-   { 0x00000001, 0x20e0013d, 0x00000030, 0x00000000 },
-   { 0x00000040, 0x20e40d3d, 0x00000030, 0x00000001 },
-   { 0x00000001, 0x20e8013d, 0x00000030, 0x00000000 },
-   { 0x00000040, 0x20ec0d3d, 0x00000030, 0x00000001 },
-   { 0x00000001, 0x2120013d, 0x00000032, 0x00000000 },
-   { 0x00000001, 0x2124013d, 0x00000032, 0x00000000 },
-   { 0x00000040, 0x21280d3d, 0x00000032, 0x00000001 },
-   { 0x00000040, 0x212c0d3d, 0x00000032, 0x00000001 },
-   { 0x00000001, 0x20f0013d, 0x00000034, 0x00000000 },
-   { 0x00000040, 0x20f40d3d, 0x00000034, 0x00000001 },
-   { 0x00000001, 0x20f8013d, 0x00000034, 0x00000000 },
-   { 0x00000040, 0x20fc0d3d, 0x00000034, 0x00000001 },
-   { 0x00000001, 0x2130013d, 0x00000036, 0x00000000 },
-   { 0x00000001, 0x2134013d, 0x00000036, 0x00000000 },
-   { 0x00000040, 0x21380d3d, 0x00000036, 0x00000001 },
-   { 0x00000040, 0x213c0d3d, 0x00000036, 0x00000001 },
-   { 0x00600040, 0x214077bd, 0x008d00c0, 0x00004020 },
-   { 0x00600040, 0x216077bd, 0x008d00e0, 0x00004020 },
-   { 0x00600041, 0x214077bd, 0x008d0140, 0x00000060 },
-   { 0x00600041, 0x216077bd, 0x008d0160, 0x00000060 },
-   { 0x00600040, 0x214077bd, 0x008d0140, 0x0000006c },
-   { 0x00600040, 0x216077bd, 0x008d0160, 0x0000006c },
-   { 0x00600040, 0x218077bd, 0x008d0100, 0x00004024 },
-   { 0x00600040, 0x21a077bd, 0x008d0120, 0x00004024 },
-   { 0x00600041, 0x218077bd, 0x008d0180, 0x00000064 },
-   { 0x00600041, 0x21a077bd, 0x008d01a0, 0x00000064 },
-   { 0x00600040, 0x218077bd, 0x008d0180, 0x0000007c },
-   { 0x00600040, 0x21a077bd, 0x008d01a0, 0x0000007c },
-   { 0x00600001, 0x202003be, 0x008d0140, 0x00000000 },
-   { 0x00600001, 0x204003be, 0x008d0160, 0x00000000 },
-   { 0x00600001, 0x206003be, 0x008d0180, 0x00000000 },
-   { 0x00600001, 0x208003be, 0x008d01a0, 0x00000000 },
-   { 0x00800031, 0x21c01d29, 0x008d0000, 0x02580001 },
-   { 0x00600001, 0x22a00021, 0x008d02a0, 0x00000000 },
-   { 0x00600040, 0x214077bd, 0x008d00c0, 0x00004020 },
-   { 0x00600040, 0x216077bd, 0x008d00e0, 0x00004020 },
-   { 0x00600041, 0x214077bd, 0x008d0140, 0x00000080 },
-   { 0x00600041, 0x216077bd, 0x008d0160, 0x00000080 },
-   { 0x00600040, 0x214077bd, 0x008d0140, 0x0000008c },
-   { 0x00600040, 0x216077bd, 0x008d0160, 0x0000008c },
-   { 0x00600040, 0x218077bd, 0x008d0100, 0x00004024 },
-   { 0x00600040, 0x21a077bd, 0x008d0120, 0x00004024 },
-   { 0x00600041, 0x218077bd, 0x008d0180, 0x00000084 },
-   { 0x00600041, 0x21a077bd, 0x008d01a0, 0x00000084 },
-   { 0x00600040, 0x218077bd, 0x008d0180, 0x0000009c },
-   { 0x00600040, 0x21a077bd, 0x008d01a0, 0x0000009c },
-   { 0x00600001, 0x202003be, 0x008d0140, 0x00000000 },
-   { 0x00600001, 0x204003be, 0x008d0160, 0x00000000 },
-   { 0x00600001, 0x206003be, 0x008d0180, 0x00000000 },
-   { 0x00600001, 0x208003be, 0x008d01a0, 0x00000000 },
-   { 0x00800031, 0x22c01d29, 0x008d0000, 0x02580102 },
-   { 0x00600001, 0x23a00021, 0x008d03a0, 0x00000000 },
-   { 0x00600041, 0x21c077bd, 0x008d01c0, 0x008d02c0 },
-   { 0x00600041, 0x21e077bd, 0x008d01e0, 0x008d02e0 },
-   { 0x00600041, 0x220077bd, 0x008d0200, 0x008d0300 },
-   { 0x00600041, 0x222077bd, 0x008d0220, 0x008d0320 },
-   { 0x00600041, 0x224077bd, 0x008d0240, 0x008d0340 },
-   { 0x00600041, 0x226077bd, 0x008d0260, 0x008d0360 },
-   { 0x00600041, 0x228077bd, 0x008d0280, 0x008d0380 },
-   { 0x00600041, 0x22a077bd, 0x008d02a0, 0x008d03a0 },
-   { 0x00600001, 0x204003be, 0x008d01c0, 0x00000000 },
-   { 0x00600001, 0x206003be, 0x008d0200, 0x00000000 },
-   { 0x00600001, 0x208003be, 0x008d0240, 0x00000000 },
-   { 0x00600001, 0x20a003be, 0x008d0280, 0x00000000 },
-   { 0x00600001, 0x20c003be, 0x008d01e0, 0x00000000 },
-   { 0x00600001, 0x20e003be, 0x008d0220, 0x00000000 },
-   { 0x00600001, 0x210003be, 0x008d0260, 0x00000000 },
-   { 0x00600001, 0x212003be, 0x008d02a0, 0x00000000 },
-   { 0x00600201, 0x20200022, 0x008d0020, 0x00000000 },
-   { 0x00800031, 0x24001d28, 0x008d0000, 0x85a04800 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
diff --git a/src/exa_wm_maskca_srcalpha.g4b b/src/exa_wm_maskca_srcalpha.g4b
deleted file mode 100644
index d83b119..0000000
--- a/src/exa_wm_maskca_srcalpha.g4b
+++ /dev/null
@@ -1,95 +0,0 @@
-   { 0x00000001, 0x20c0013d, 0x00000028, 0x00000000 },
-   { 0x00000040, 0x20c40d3d, 0x00000028, 0x00000001 },
-   { 0x00000001, 0x20c8013d, 0x00000028, 0x00000000 },
-   { 0x00000040, 0x20cc0d3d, 0x00000028, 0x00000001 },
-   { 0x00000001, 0x2100013d, 0x0000002a, 0x00000000 },
-   { 0x00000001, 0x2104013d, 0x0000002a, 0x00000000 },
-   { 0x00000040, 0x21080d3d, 0x0000002a, 0x00000001 },
-   { 0x00000040, 0x210c0d3d, 0x0000002a, 0x00000001 },
-   { 0x00000001, 0x20d0013d, 0x0000002c, 0x00000000 },
-   { 0x00000040, 0x20d40d3d, 0x0000002c, 0x00000001 },
-   { 0x00000001, 0x20d8013d, 0x0000002c, 0x00000000 },
-   { 0x00000040, 0x20dc0d3d, 0x0000002c, 0x00000001 },
-   { 0x00000001, 0x2110013d, 0x0000002e, 0x00000000 },
-   { 0x00000001, 0x2114013d, 0x0000002e, 0x00000000 },
-   { 0x00000040, 0x21180d3d, 0x0000002e, 0x00000001 },
-   { 0x00000040, 0x211c0d3d, 0x0000002e, 0x00000001 },
-   { 0x00000001, 0x20e0013d, 0x00000030, 0x00000000 },
-   { 0x00000040, 0x20e40d3d, 0x00000030, 0x00000001 },
-   { 0x00000001, 0x20e8013d, 0x00000030, 0x00000000 },
-   { 0x00000040, 0x20ec0d3d, 0x00000030, 0x00000001 },
-   { 0x00000001, 0x2120013d, 0x00000032, 0x00000000 },
-   { 0x00000001, 0x2124013d, 0x00000032, 0x00000000 },
-   { 0x00000040, 0x21280d3d, 0x00000032, 0x00000001 },
-   { 0x00000040, 0x212c0d3d, 0x00000032, 0x00000001 },
-   { 0x00000001, 0x20f0013d, 0x00000034, 0x00000000 },
-   { 0x00000040, 0x20f40d3d, 0x00000034, 0x00000001 },
-   { 0x00000001, 0x20f8013d, 0x00000034, 0x00000000 },
-   { 0x00000040, 0x20fc0d3d, 0x00000034, 0x00000001 },
-   { 0x00000001, 0x2130013d, 0x00000036, 0x00000000 },
-   { 0x00000001, 0x2134013d, 0x00000036, 0x00000000 },
-   { 0x00000040, 0x21380d3d, 0x00000036, 0x00000001 },
-   { 0x00000040, 0x213c0d3d, 0x00000036, 0x00000001 },
-   { 0x00600040, 0x214077bd, 0x008d00c0, 0x00004020 },
-   { 0x00600040, 0x216077bd, 0x008d00e0, 0x00004020 },
-   { 0x00600041, 0x214077bd, 0x008d0140, 0x00000060 },
-   { 0x00600041, 0x216077bd, 0x008d0160, 0x00000060 },
-   { 0x00600040, 0x214077bd, 0x008d0140, 0x0000006c },
-   { 0x00600040, 0x216077bd, 0x008d0160, 0x0000006c },
-   { 0x00600040, 0x218077bd, 0x008d0100, 0x00004024 },
-   { 0x00600040, 0x21a077bd, 0x008d0120, 0x00004024 },
-   { 0x00600041, 0x218077bd, 0x008d0180, 0x00000064 },
-   { 0x00600041, 0x21a077bd, 0x008d01a0, 0x00000064 },
-   { 0x00600040, 0x218077bd, 0x008d0180, 0x0000007c },
-   { 0x00600040, 0x21a077bd, 0x008d01a0, 0x0000007c },
-   { 0x00600001, 0x202003be, 0x008d0140, 0x00000000 },
-   { 0x00600001, 0x204003be, 0x008d0160, 0x00000000 },
-   { 0x00600001, 0x206003be, 0x008d0180, 0x00000000 },
-   { 0x00600001, 0x208003be, 0x008d01a0, 0x00000000 },
-   { 0x00800031, 0x21c01d29, 0x008d0000, 0x02580001 },
-   { 0x00600001, 0x22a00021, 0x008d02a0, 0x00000000 },
-   { 0x00600040, 0x214077bd, 0x008d00c0, 0x00004020 },
-   { 0x00600040, 0x216077bd, 0x008d00e0, 0x00004020 },
-   { 0x00600041, 0x214077bd, 0x008d0140, 0x00000080 },
-   { 0x00600041, 0x216077bd, 0x008d0160, 0x00000080 },
-   { 0x00600040, 0x214077bd, 0x008d0140, 0x0000008c },
-   { 0x00600040, 0x216077bd, 0x008d0160, 0x0000008c },
-   { 0x00600040, 0x218077bd, 0x008d0100, 0x00004024 },
-   { 0x00600040, 0x21a077bd, 0x008d0120, 0x00004024 },
-   { 0x00600041, 0x218077bd, 0x008d0180, 0x00000084 },
-   { 0x00600041, 0x21a077bd, 0x008d01a0, 0x00000084 },
-   { 0x00600040, 0x218077bd, 0x008d0180, 0x0000009c },
-   { 0x00600040, 0x21a077bd, 0x008d01a0, 0x0000009c },
-   { 0x00600001, 0x202003be, 0x008d0140, 0x00000000 },
-   { 0x00600001, 0x204003be, 0x008d0160, 0x00000000 },
-   { 0x00600001, 0x206003be, 0x008d0180, 0x00000000 },
-   { 0x00600001, 0x208003be, 0x008d01a0, 0x00000000 },
-   { 0x00800031, 0x22c01d29, 0x008d0000, 0x02580102 },
-   { 0x00600001, 0x23a00021, 0x008d03a0, 0x00000000 },
-   { 0x00600041, 0x21c077bd, 0x008d02c0, 0x008d0280 },
-   { 0x00600041, 0x21e077bd, 0x008d02e0, 0x008d02a0 },
-   { 0x00600041, 0x220077bd, 0x008d0300, 0x008d0280 },
-   { 0x00600041, 0x222077bd, 0x008d0320, 0x008d02a0 },
-   { 0x00600041, 0x224077bd, 0x008d0340, 0x008d0280 },
-   { 0x00600041, 0x226077bd, 0x008d0360, 0x008d02a0 },
-   { 0x00600041, 0x228077bd, 0x008d0380, 0x008d0280 },
-   { 0x00600041, 0x22a077bd, 0x008d03a0, 0x008d02a0 },
-   { 0x00600001, 0x204003be, 0x008d01c0, 0x00000000 },
-   { 0x00600001, 0x206003be, 0x008d0200, 0x00000000 },
-   { 0x00600001, 0x208003be, 0x008d0240, 0x00000000 },
-   { 0x00600001, 0x20a003be, 0x008d0280, 0x00000000 },
-   { 0x00600001, 0x20c003be, 0x008d01e0, 0x00000000 },
-   { 0x00600001, 0x20e003be, 0x008d0220, 0x00000000 },
-   { 0x00600001, 0x210003be, 0x008d0260, 0x00000000 },
-   { 0x00600001, 0x212003be, 0x008d02a0, 0x00000000 },
-   { 0x00600201, 0x20200022, 0x008d0020, 0x00000000 },
-   { 0x00800031, 0x24001d28, 0x008d0000, 0x85a04800 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
diff --git a/src/exa_wm_masknoca.g4b b/src/exa_wm_masknoca.g4b
deleted file mode 100644
index 5fcf3b5..0000000
--- a/src/exa_wm_masknoca.g4b
+++ /dev/null
@@ -1,95 +0,0 @@
-   { 0x00000001, 0x20c0013d, 0x00000028, 0x00000000 },
-   { 0x00000040, 0x20c40d3d, 0x00000028, 0x00000001 },
-   { 0x00000001, 0x20c8013d, 0x00000028, 0x00000000 },
-   { 0x00000040, 0x20cc0d3d, 0x00000028, 0x00000001 },
-   { 0x00000001, 0x2100013d, 0x0000002a, 0x00000000 },
-   { 0x00000001, 0x2104013d, 0x0000002a, 0x00000000 },
-   { 0x00000040, 0x21080d3d, 0x0000002a, 0x00000001 },
-   { 0x00000040, 0x210c0d3d, 0x0000002a, 0x00000001 },
-   { 0x00000001, 0x20d0013d, 0x0000002c, 0x00000000 },
-   { 0x00000040, 0x20d40d3d, 0x0000002c, 0x00000001 },
-   { 0x00000001, 0x20d8013d, 0x0000002c, 0x00000000 },
-   { 0x00000040, 0x20dc0d3d, 0x0000002c, 0x00000001 },
-   { 0x00000001, 0x2110013d, 0x0000002e, 0x00000000 },
-   { 0x00000001, 0x2114013d, 0x0000002e, 0x00000000 },
-   { 0x00000040, 0x21180d3d, 0x0000002e, 0x00000001 },
-   { 0x00000040, 0x211c0d3d, 0x0000002e, 0x00000001 },
-   { 0x00000001, 0x20e0013d, 0x00000030, 0x00000000 },
-   { 0x00000040, 0x20e40d3d, 0x00000030, 0x00000001 },
-   { 0x00000001, 0x20e8013d, 0x00000030, 0x00000000 },
-   { 0x00000040, 0x20ec0d3d, 0x00000030, 0x00000001 },
-   { 0x00000001, 0x2120013d, 0x00000032, 0x00000000 },
-   { 0x00000001, 0x2124013d, 0x00000032, 0x00000000 },
-   { 0x00000040, 0x21280d3d, 0x00000032, 0x00000001 },
-   { 0x00000040, 0x212c0d3d, 0x00000032, 0x00000001 },
-   { 0x00000001, 0x20f0013d, 0x00000034, 0x00000000 },
-   { 0x00000040, 0x20f40d3d, 0x00000034, 0x00000001 },
-   { 0x00000001, 0x20f8013d, 0x00000034, 0x00000000 },
-   { 0x00000040, 0x20fc0d3d, 0x00000034, 0x00000001 },
-   { 0x00000001, 0x2130013d, 0x00000036, 0x00000000 },
-   { 0x00000001, 0x2134013d, 0x00000036, 0x00000000 },
-   { 0x00000040, 0x21380d3d, 0x00000036, 0x00000001 },
-   { 0x00000040, 0x213c0d3d, 0x00000036, 0x00000001 },
-   { 0x00600040, 0x214077bd, 0x008d00c0, 0x00004020 },
-   { 0x00600040, 0x216077bd, 0x008d00e0, 0x00004020 },
-   { 0x00600041, 0x214077bd, 0x008d0140, 0x00000060 },
-   { 0x00600041, 0x216077bd, 0x008d0160, 0x00000060 },
-   { 0x00600040, 0x214077bd, 0x008d0140, 0x0000006c },
-   { 0x00600040, 0x216077bd, 0x008d0160, 0x0000006c },
-   { 0x00600040, 0x218077bd, 0x008d0100, 0x00004024 },
-   { 0x00600040, 0x21a077bd, 0x008d0120, 0x00004024 },
-   { 0x00600041, 0x218077bd, 0x008d0180, 0x00000064 },
-   { 0x00600041, 0x21a077bd, 0x008d01a0, 0x00000064 },
-   { 0x00600040, 0x218077bd, 0x008d0180, 0x0000007c },
-   { 0x00600040, 0x21a077bd, 0x008d01a0, 0x0000007c },
-   { 0x00600001, 0x202003be, 0x008d0140, 0x00000000 },
-   { 0x00600001, 0x204003be, 0x008d0160, 0x00000000 },
-   { 0x00600001, 0x206003be, 0x008d0180, 0x00000000 },
-   { 0x00600001, 0x208003be, 0x008d01a0, 0x00000000 },
-   { 0x00800031, 0x21c01d29, 0x008d0000, 0x02580001 },
-   { 0x00600001, 0x22a00021, 0x008d02a0, 0x00000000 },
-   { 0x00600040, 0x214077bd, 0x008d00c0, 0x00004020 },
-   { 0x00600040, 0x216077bd, 0x008d00e0, 0x00004020 },
-   { 0x00600041, 0x214077bd, 0x008d0140, 0x00000080 },
-   { 0x00600041, 0x216077bd, 0x008d0160, 0x00000080 },
-   { 0x00600040, 0x214077bd, 0x008d0140, 0x0000008c },
-   { 0x00600040, 0x216077bd, 0x008d0160, 0x0000008c },
-   { 0x00600040, 0x218077bd, 0x008d0100, 0x00004024 },
-   { 0x00600040, 0x21a077bd, 0x008d0120, 0x00004024 },
-   { 0x00600041, 0x218077bd, 0x008d0180, 0x00000084 },
-   { 0x00600041, 0x21a077bd, 0x008d01a0, 0x00000084 },
-   { 0x00600040, 0x218077bd, 0x008d0180, 0x0000009c },
-   { 0x00600040, 0x21a077bd, 0x008d01a0, 0x0000009c },
-   { 0x00600001, 0x202003be, 0x008d0140, 0x00000000 },
-   { 0x00600001, 0x204003be, 0x008d0160, 0x00000000 },
-   { 0x00600001, 0x206003be, 0x008d0180, 0x00000000 },
-   { 0x00600001, 0x208003be, 0x008d01a0, 0x00000000 },
-   { 0x00800031, 0x22c01d29, 0x008d0000, 0x02580102 },
-   { 0x00600001, 0x23a00021, 0x008d03a0, 0x00000000 },
-   { 0x00600041, 0x21c077bd, 0x008d01c0, 0x008d0380 },
-   { 0x00600041, 0x21e077bd, 0x008d01e0, 0x008d03a0 },
-   { 0x00600041, 0x220077bd, 0x008d0200, 0x008d0380 },
-   { 0x00600041, 0x222077bd, 0x008d0220, 0x008d03a0 },
-   { 0x00600041, 0x224077bd, 0x008d0240, 0x008d0380 },
-   { 0x00600041, 0x226077bd, 0x008d0260, 0x008d03a0 },
-   { 0x00600041, 0x228077bd, 0x008d0280, 0x008d0380 },
-   { 0x00600041, 0x22a077bd, 0x008d02a0, 0x008d03a0 },
-   { 0x00600001, 0x204003be, 0x008d01c0, 0x00000000 },
-   { 0x00600001, 0x206003be, 0x008d0200, 0x00000000 },
-   { 0x00600001, 0x208003be, 0x008d0240, 0x00000000 },
-   { 0x00600001, 0x20a003be, 0x008d0280, 0x00000000 },
-   { 0x00600001, 0x20c003be, 0x008d01e0, 0x00000000 },
-   { 0x00600001, 0x20e003be, 0x008d0220, 0x00000000 },
-   { 0x00600001, 0x210003be, 0x008d0260, 0x00000000 },
-   { 0x00600001, 0x212003be, 0x008d02a0, 0x00000000 },
-   { 0x00600201, 0x20200022, 0x008d0020, 0x00000000 },
-   { 0x00800031, 0x24001d28, 0x008d0000, 0x85a04800 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
commit bfd67e6ff556050684a3093381d383ef22c35ae1
Author: Keith Packard <keithp at keithp.com>
Date:   Mon Mar 31 12:04:00 2008 -0700

    remove old monolithic shaders
    (cherry picked from commit 6db8faeb754897b21af045d00f50db9640b080bb)

diff --git a/src/exa_sf_mask.g4a b/src/exa_sf_mask.g4a
index 8701a10..5078d01 100644
--- a/src/exa_sf_mask.g4a
+++ b/src/exa_sf_mask.g4a
@@ -23,7 +23,7 @@
  * Authors:
  *    Keith Packard <keithp at keithp.com>
  *    Eric Anholt <eric at anholt.net>
- *
+ *    Wang Zhenyu <zhenyu.z.wang at intel.com>
  */
 
 /*
diff --git a/src/exa_wm_maskca.g4a b/src/exa_wm_maskca.g4a
deleted file mode 100644
index d030467..0000000
--- a/src/exa_wm_maskca.g4a
+++ /dev/null
@@ -1,228 +0,0 @@
-/*
- * Copyright © 2007 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * Authors:
- *    Wang Zhenyu <zhenyu.z.wang at intel.com>
- */
-
-/*
- * This's for exa composite operation in no mask picture case.
- * The simplest case is just sending what src picture has to dst picture.
- * XXX: This is still experimental, and should be fixed to support multiple texture
- * map, and conditional mul actions. 
- */
-
-/* I think this should be same as in g4a program for texture video,
-   as we also use 16-pixel dispatch. and SF scale in g3 is useful for us. */
-
-/* The initial payload of the thread is always g0.
- * WM_URB (incoming URB entries) is g3
-   As mask texture coeffient needs extra setup urb starting from g4, we should
-   shift this location. 
-
- * X0_R is g4->g6
- * X1_R is g5->g7
- * Y0_R is g6->g8
- * Y1_R is g7->g9
-
-     * X0: {ss0.x, ss0.x+1, ss0.x,   ss0.x+1, ss1.x, ss1.x+1, ss1.x,   ss1.x+y}
-     * Y0: {ss0.y, ss0.y,   ss0.y+1, ss0.y+1, ss1.y, ss1.y,   ss1.y+1, ss1.y+1}
-     * X1: {ss2.x, ss2.x+1, ss2.x,   ss2.x+1, ss3.x, ss3.x+1, ss3.x,   ss3.x+y}
-     * Y1: {ss2.y, ss2.y,   ss2.y+1, ss2.y+1, ss3.y, ss3.y,   ss3.y+1, ss3.y+1}
- */
-
-/* multitexture program with src and mask texture */
-/* - load src texture */
-/* - load mask texture */
-/* - mul src.X with mask's alpha */
-/* - write out src.X */
-
-    /* Set up ss0.x coordinates*/
-mov (1) g6<1>F g1.8<0,1,0>UW { align1 };
-add (1) g6.4<1>F g1.8<0,1,0>UW 1UD { align1 };
-mov (1) g6.8<1>F g1.8<0,1,0>UW { align1 };
-add (1) g6.12<1>F g1.8<0,1,0>UW 1UD { align1 };
-    /* Set up ss0.y coordinates */
-mov (1) g8<1>F g1.10<0,1,0>UW { align1 };
-mov (1) g8.4<1>F g1.10<0,1,0>UW { align1 };
-add (1) g8.8<1>F g1.10<0,1,0>UW 1UD { align1 };
-add (1) g8.12<1>F g1.10<0,1,0>UW 1UD { align1 };
-    /* set up ss1.x coordinates */
-mov (1) g6.16<1>F g1.12<0,1,0>UW { align1 };
-add (1) g6.20<1>F g1.12<0,1,0>UW 1UD { align1 };
-mov (1) g6.24<1>F g1.12<0,1,0>UW { align1 };
-add (1) g6.28<1>F g1.12<0,1,0>UW 1UD { align1 };
-    /* set up ss1.y coordinates */
-mov (1) g8.16<1>F g1.14<0,1,0>UW { align1 };
-mov (1) g8.20<1>F g1.14<0,1,0>UW { align1 };
-add (1) g8.24<1>F g1.14<0,1,0>UW 1UD { align1 };
-add (1) g8.28<1>F g1.14<0,1,0>UW 1UD { align1 };
-    /* Set up ss2.x coordinates */
-mov (1) g7<1>F g1.16<0,1,0>UW { align1 };
-add (1) g7.4<1>F g1.16<0,1,0>UW 1UD { align1 };
-mov (1) g7.8<1>F g1.16<0,1,0>UW { align1 };
-add (1) g7.12<1>F g1.16<0,1,0>UW 1UD { align1 };
-    /* Set up ss2.y coordinates */
-mov (1) g9<1>F g1.18<0,1,0>UW { align1 };
-mov (1) g9.4<1>F g1.18<0,1,0>UW { align1 };
-add (1) g9.8<1>F g1.18<0,1,0>UW 1UD { align1 };
-add (1) g9.12<1>F g1.18<0,1,0>UW 1UD { align1 };
-    /* Set up ss3.x coordinates */
-mov (1) g7.16<1>F g1.20<0,1,0>UW { align1 };
-add (1) g7.20<1>F g1.20<0,1,0>UW 1UD { align1 };
-mov (1) g7.24<1>F g1.20<0,1,0>UW { align1 };
-add (1) g7.28<1>F g1.20<0,1,0>UW 1UD { align1 };
-    /* Set up ss3.y coordinates */
-mov (1) g9.16<1>F g1.22<0,1,0>UW { align1 };
-mov (1) g9.20<1>F g1.22<0,1,0>UW { align1 };
-add (1) g9.24<1>F g1.22<0,1,0>UW 1UD { align1 };
-add (1) g9.28<1>F g1.22<0,1,0>UW 1UD { align1 };
-
-    /* Now, map these screen space coordinates into texture coordinates. */
-/* This is for src texture */
-/* I don't want to change origin ssX coords, as it will be used later in mask */
-/* so store tex coords in g10, g11, g12, g13 */
-
-    /* subtract screen-space X origin of vertex 0. */
-add (8) g10<1>F g6<8,8,1>F -g1<0,1,0>F { align1 };
-add (8) g11<1>F g7<8,8,1>F -g1<0,1,0>F { align1 };
-    /* scale by texture X increment */
-/* Cx[0] */
-mul (8) g10<1>F g10<8,8,1>F g3<0,1,0>F { align1 };
-mul (8) g11<1>F g11<8,8,1>F g3<0,1,0>F { align1 };
-    /* add in texture X offset */
-/* Co[0] */
-add (8) g10<1>F g10<8,8,1>F g3.12<0,1,0>F { align1 };
-add (8) g11<1>F g11<8,8,1>F g3.12<0,1,0>F { align1 };
-    /* subtract screen-space Y origin of vertex 0. */
-add (8) g12<1>F g8<8,8,1>F -g1.4<0,1,0>F { align1 };
-add (8) g13<1>F g9<8,8,1>F -g1.4<0,1,0>F { align1 };
-    /* scale by texture Y increment */
-/* Cy[0] */
-mul (8) g12<1>F g12<8,8,1>F g3.4<0,1,0>F { align1 };
-mul (8) g13<1>F g13<8,8,1>F g3.4<0,1,0>F { align1 };
-    /* add in texture Y offset */
-/* Co[1] */
-add (8) g12<1>F g12<8,8,1>F g3.28<0,1,0>F { align1 };
-add (8) g13<1>F g13<8,8,1>F g3.28<0,1,0>F { align1 };
-
-/* prepare sampler read back gX register, which would be written back to output */
-
-/* use simd16 sampler, param 0 is u, param 1 is v. */
-/* 'payload' loading, assuming tex coord start from g4 */
-mov (8) m1<1>F g10<8,8,1>F { align1 };
-mov (8) m2<1>F g11<8,8,1>F { align1 }; /* param 0 u in m1, m2 */
-mov (8) m3<1>F g12<8,8,1>F { align1 };
-mov (8) m4<1>F g13<8,8,1>F { align1 }; /* param 1 v in m3, m4 */
-
-/* m0 will be copied with g0, as it contains send desc */
-/* emit sampler 'send' cmd */
-
-/* src texture readback: g14-g21 */
-send (16) 0 		/* msg reg index */
-	g14<1>UW 	/* readback */
-	g0<8,8,1>UW  	/* copy to msg start reg*/
-	sampler (1,0,F)  /* sampler message description, 
-				(binding_table,sampler_index,datatype). 
-			    here(src->dst) we should use src_sampler and 
-			    src_surface */
-	mlen 5 rlen 8 { align1 };   /* required message len 5, readback len 8 */
-
-mov (8) g21<1>UD g21<8,8,1>UD { align1 };  /* wait sampler return */
-
-/* sampler mask texture, use g10, g11, g12, g13 */
-    /* subtract screen-space X origin of vertex 0. */
-add (8) g10<1>F g6<8,8,1>F -g1<0,1,0>F { align1 };
-add (8) g11<1>F g7<8,8,1>F -g1<0,1,0>F { align1 };
-    /* scale by texture X increment */
-/* Cx[2] */
-mul (8) g10<1>F g10<8,8,1>F g4<0,1,0>F { align1 };
-mul (8) g11<1>F g11<8,8,1>F g4<0,1,0>F { align1 };
-    /* add in texture X offset */
-/* Co[2] */
-add (8) g10<1>F g10<8,8,1>F g4.12<0,1,0>F { align1 };
-add (8) g11<1>F g11<8,8,1>F g4.12<0,1,0>F { align1 };
-    /* subtract screen-space Y origin of vertex 0. */
-add (8) g12<1>F g8<8,8,1>F -g1.4<0,1,0>F { align1 };
-add (8) g13<1>F g9<8,8,1>F -g1.4<0,1,0>F { align1 };
-    /* scale by texture Y increment */
-/* Cy[2] */
-mul (8) g12<1>F g12<8,8,1>F g4.4<0,1,0>F { align1 };
-mul (8) g13<1>F g13<8,8,1>F g4.4<0,1,0>F { align1 };
-    /* add in texture Y offset */
-/* Co[3] */
-add (8) g12<1>F g12<8,8,1>F g4.28<0,1,0>F { align1 };
-add (8) g13<1>F g13<8,8,1>F g4.28<0,1,0>F { align1 };
-
-mov (8) m1<1>F g10<8,8,1>F { align1 };
-mov (8) m2<1>F g11<8,8,1>F { align1 }; 
-mov (8) m3<1>F g12<8,8,1>F { align1 };
-mov (8) m4<1>F g13<8,8,1>F { align1 };
-
-/* mask sampler g22-g29 */
-/* binding_table (2), sampler (1) */
-send (16) 0 g22<1>UW g0<8,8,1>UW sampler (2,1,F) mlen 5 rlen 8 { align1 };
-mov (8) g29<1>UD g29<8,8,1>UD { align1 };  /* wait sampler return */
-
-/* mul mask's channel to src, then write out src */
-mul (8) g14<1>F g14<8,8,1>F g22<8,8,1>F { align1 };
-mul (8) g15<1>F g15<8,8,1>F g23<8,8,1>F { align1 };
-mul (8) g16<1>F g16<8,8,1>F g24<8,8,1>F { align1 };
-mul (8) g17<1>F g17<8,8,1>F g25<8,8,1>F { align1 };
-mul (8) g18<1>F g18<8,8,1>F g26<8,8,1>F { align1 };
-mul (8) g19<1>F g19<8,8,1>F g27<8,8,1>F { align1 };
-mul (8) g20<1>F g20<8,8,1>F g28<8,8,1>F { align1 };
-mul (8) g21<1>F g21<8,8,1>F g29<8,8,1>F { align1 };
-
-/* prepare data in m2-m5 for subspan(1,0), m6-m9 for subspan(3,2), then it's ready to write */
-mov (8) m2<1>F g14<8,8,1>F { align1 };
-mov (8) m3<1>F g16<8,8,1>F { align1 };
-mov (8) m4<1>F g18<8,8,1>F { align1 };
-mov (8) m5<1>F g20<8,8,1>F { align1 };
-mov (8) m6<1>F g15<8,8,1>F { align1 };
-mov (8) m7<1>F g17<8,8,1>F { align1 };
-mov (8) m8<1>F g19<8,8,1>F { align1 };
-mov (8) m9<1>F g21<8,8,1>F { align1 };
-
-/* m0, m1 are all direct passed by PS thread payload */
-mov (8) m1<1>UD g1<8,8,1>UD { align1 mask_disable };
-
-/* write */
-send (16) 0 acc0<1>UW g0<8,8,1>UW write (
-	0,  /* binding_table */
-	8,  /* pixel scordboard clear, msg type simd16 single source */
-	4,  /* render target write */
-	0   /* no write commit message */
-	) 
-	mlen 10
-	rlen 0
-	{ align1 EOT };
-
-nop;
-nop;
-nop;
-nop;
-nop;
-nop;
-nop;
-nop;
-nop;
diff --git a/src/exa_wm_maskca_srcalpha.g4a b/src/exa_wm_maskca_srcalpha.g4a
deleted file mode 100644
index 133c9f0..0000000
--- a/src/exa_wm_maskca_srcalpha.g4a
+++ /dev/null
@@ -1,228 +0,0 @@
-/*
- * Copyright © 2007 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * Authors:
- *    Wang Zhenyu <zhenyu.z.wang at intel.com>
- */
-
-/*
- * This is for the exa composite operation with a component-alpha mask picture:
- * src alpha is multiplied by each mask channel and the result is written to dst.
- * XXX: This is still experimental, and should be fixed to support multiple texture
- * map, and conditional mul actions. 
- */
-
-/* I think this should be same as in g4a program for texture video,
-   as we also use 16-pixel dispatch. and SF scale in g3 is useful for us. */
-
-/* The initial payload of the thread is always g0.
- * WM_URB (incoming URB entries) is g3
-   As the mask texture coefficients need extra setup URB data starting from g4, we should
-   shift this location. 
-
- * X0_R is g4->g6
- * X1_R is g5->g7
- * Y0_R is g6->g8
- * Y1_R is g7->g9
-
-     * X0: {ss0.x, ss0.x+1, ss0.x,   ss0.x+1, ss1.x, ss1.x+1, ss1.x,   ss1.x+1}
-     * Y0: {ss0.y, ss0.y,   ss0.y+1, ss0.y+1, ss1.y, ss1.y,   ss1.y+1, ss1.y+1}
-     * X1: {ss2.x, ss2.x+1, ss2.x,   ss2.x+1, ss3.x, ss3.x+1, ss3.x,   ss3.x+1}
-     * Y1: {ss2.y, ss2.y,   ss2.y+1, ss2.y+1, ss3.y, ss3.y,   ss3.y+1, ss3.y+1}
- */
-
-/* multitexture program with src and mask texture */
-/* - load src texture */
-/* - load mask texture */
-/* - mul src.X with mask's alpha */
-/* - write out src.X */
-
-    /* Set up ss0.x coordinates*/
-mov (1) g6<1>F g1.8<0,1,0>UW { align1 };
-add (1) g6.4<1>F g1.8<0,1,0>UW 1UD { align1 };
-mov (1) g6.8<1>F g1.8<0,1,0>UW { align1 };
-add (1) g6.12<1>F g1.8<0,1,0>UW 1UD { align1 };
-    /* Set up ss0.y coordinates */
-mov (1) g8<1>F g1.10<0,1,0>UW { align1 };
-mov (1) g8.4<1>F g1.10<0,1,0>UW { align1 };
-add (1) g8.8<1>F g1.10<0,1,0>UW 1UD { align1 };
-add (1) g8.12<1>F g1.10<0,1,0>UW 1UD { align1 };
-    /* set up ss1.x coordinates */
-mov (1) g6.16<1>F g1.12<0,1,0>UW { align1 };
-add (1) g6.20<1>F g1.12<0,1,0>UW 1UD { align1 };
-mov (1) g6.24<1>F g1.12<0,1,0>UW { align1 };
-add (1) g6.28<1>F g1.12<0,1,0>UW 1UD { align1 };
-    /* set up ss1.y coordinates */
-mov (1) g8.16<1>F g1.14<0,1,0>UW { align1 };
-mov (1) g8.20<1>F g1.14<0,1,0>UW { align1 };
-add (1) g8.24<1>F g1.14<0,1,0>UW 1UD { align1 };
-add (1) g8.28<1>F g1.14<0,1,0>UW 1UD { align1 };
-    /* Set up ss2.x coordinates */
-mov (1) g7<1>F g1.16<0,1,0>UW { align1 };
-add (1) g7.4<1>F g1.16<0,1,0>UW 1UD { align1 };
-mov (1) g7.8<1>F g1.16<0,1,0>UW { align1 };
-add (1) g7.12<1>F g1.16<0,1,0>UW 1UD { align1 };
-    /* Set up ss2.y coordinates */
-mov (1) g9<1>F g1.18<0,1,0>UW { align1 };
-mov (1) g9.4<1>F g1.18<0,1,0>UW { align1 };
-add (1) g9.8<1>F g1.18<0,1,0>UW 1UD { align1 };
-add (1) g9.12<1>F g1.18<0,1,0>UW 1UD { align1 };
-    /* Set up ss3.x coordinates */
-mov (1) g7.16<1>F g1.20<0,1,0>UW { align1 };
-add (1) g7.20<1>F g1.20<0,1,0>UW 1UD { align1 };
-mov (1) g7.24<1>F g1.20<0,1,0>UW { align1 };
-add (1) g7.28<1>F g1.20<0,1,0>UW 1UD { align1 };
-    /* Set up ss3.y coordinates */
-mov (1) g9.16<1>F g1.22<0,1,0>UW { align1 };
-mov (1) g9.20<1>F g1.22<0,1,0>UW { align1 };
-add (1) g9.24<1>F g1.22<0,1,0>UW 1UD { align1 };
-add (1) g9.28<1>F g1.22<0,1,0>UW 1UD { align1 };
-
-    /* Now, map these screen space coordinates into texture coordinates. */
-/* This is for src texture */
-/* I don't want to change origin ssX coords, as it will be used later in mask */
-/* so store tex coords in g10, g11, g12, g13 */
-
-    /* subtract screen-space X origin of vertex 0. */
-add (8) g10<1>F g6<8,8,1>F -g1<0,1,0>F { align1 };
-add (8) g11<1>F g7<8,8,1>F -g1<0,1,0>F { align1 };
-    /* scale by texture X increment */
-/* Cx[0] */
-mul (8) g10<1>F g10<8,8,1>F g3<0,1,0>F { align1 };
-mul (8) g11<1>F g11<8,8,1>F g3<0,1,0>F { align1 };
-    /* add in texture X offset */
-/* Co[0] */
-add (8) g10<1>F g10<8,8,1>F g3.12<0,1,0>F { align1 };
-add (8) g11<1>F g11<8,8,1>F g3.12<0,1,0>F { align1 };
-    /* subtract screen-space Y origin of vertex 0. */
-add (8) g12<1>F g8<8,8,1>F -g1.4<0,1,0>F { align1 };
-add (8) g13<1>F g9<8,8,1>F -g1.4<0,1,0>F { align1 };
-    /* scale by texture Y increment */
-/* Cy[0] */
-mul (8) g12<1>F g12<8,8,1>F g3.4<0,1,0>F { align1 };
-mul (8) g13<1>F g13<8,8,1>F g3.4<0,1,0>F { align1 };
-    /* add in texture Y offset */
-/* Co[1] */
-add (8) g12<1>F g12<8,8,1>F g3.28<0,1,0>F { align1 };
-add (8) g13<1>F g13<8,8,1>F g3.28<0,1,0>F { align1 };
-
-/* prepare sampler read back gX register, which would be written back to output */
-
-/* use simd16 sampler, param 0 is u, param 1 is v. */
-/* 'payload' loading, tex coords were computed into g10-g13 above */
-mov (8) m1<1>F g10<8,8,1>F { align1 };
-mov (8) m2<1>F g11<8,8,1>F { align1 }; /* param 0 u in m1, m2 */
-mov (8) m3<1>F g12<8,8,1>F { align1 };
-mov (8) m4<1>F g13<8,8,1>F { align1 }; /* param 1 v in m3, m4 */
-
-/* m0 will be copied with g0, as it contains send desc */
-/* emit sampler 'send' cmd */
-
-/* src texture readback: g14-g21 */
-send (16) 0 		/* msg reg index */
-	g14<1>UW 	/* readback */
-	g0<8,8,1>UW  	/* copy to msg start reg*/
-	sampler (1,0,F)  /* sampler message description, 
-				(binding_table,sampler_index,datatype). 
-			    here(src->dst) we should use src_sampler and 
-			    src_surface */
-	mlen 5 rlen 8 { align1 };   /* required message len 5, readback len 8 */
-
-mov (8) g21<1>UD g21<8,8,1>UD { align1 };  /* wait sampler return */
-
-/* sampler mask texture, use g10, g11, g12, g13 */
-    /* subtract screen-space X origin of vertex 0. */
-add (8) g10<1>F g6<8,8,1>F -g1<0,1,0>F { align1 };
-add (8) g11<1>F g7<8,8,1>F -g1<0,1,0>F { align1 };
-    /* scale by texture X increment */
-/* Cx[2] */
-mul (8) g10<1>F g10<8,8,1>F g4<0,1,0>F { align1 };
-mul (8) g11<1>F g11<8,8,1>F g4<0,1,0>F { align1 };
-    /* add in texture X offset */
-/* Co[2] */
-add (8) g10<1>F g10<8,8,1>F g4.12<0,1,0>F { align1 };
-add (8) g11<1>F g11<8,8,1>F g4.12<0,1,0>F { align1 };
-    /* subtract screen-space Y origin of vertex 0. */
-add (8) g12<1>F g8<8,8,1>F -g1.4<0,1,0>F { align1 };
-add (8) g13<1>F g9<8,8,1>F -g1.4<0,1,0>F { align1 };
-    /* scale by texture Y increment */
-/* Cy[2] */
-mul (8) g12<1>F g12<8,8,1>F g4.4<0,1,0>F { align1 };
-mul (8) g13<1>F g13<8,8,1>F g4.4<0,1,0>F { align1 };
-    /* add in texture Y offset */
-/* Co[3] */
-add (8) g12<1>F g12<8,8,1>F g4.28<0,1,0>F { align1 };
-add (8) g13<1>F g13<8,8,1>F g4.28<0,1,0>F { align1 };
-
-mov (8) m1<1>F g10<8,8,1>F { align1 };
-mov (8) m2<1>F g11<8,8,1>F { align1 }; 
-mov (8) m3<1>F g12<8,8,1>F { align1 };
-mov (8) m4<1>F g13<8,8,1>F { align1 };
-
-/* mask sampler g22-g29 */
-/* binding_table (2), sampler (1) */
-send (16) 0 g22<1>UW g0<8,8,1>UW sampler (2,1,F) mlen 5 rlen 8 { align1 };
-mov (8) g29<1>UD g29<8,8,1>UD { align1 };  /* wait sampler return */
-
-/* src channel has no more use, src.A * mask.C */
-mul (8) g14<1>F g22<8,8,1>F g20<8,8,1>F { align1 };
-mul (8) g15<1>F g23<8,8,1>F g21<8,8,1>F { align1 };
-mul (8) g16<1>F g24<8,8,1>F g20<8,8,1>F { align1 };
-mul (8) g17<1>F g25<8,8,1>F g21<8,8,1>F { align1 };
-mul (8) g18<1>F g26<8,8,1>F g20<8,8,1>F { align1 };
-mul (8) g19<1>F g27<8,8,1>F g21<8,8,1>F { align1 };
-mul (8) g20<1>F g28<8,8,1>F g20<8,8,1>F { align1 };
-mul (8) g21<1>F g29<8,8,1>F g21<8,8,1>F { align1 };
-
-/* prepare data in m2-m5 for subspan(1,0), m6-m9 for subspan(3,2), then it's ready to write */
-mov (8) m2<1>F g14<8,8,1>F { align1 };
-mov (8) m3<1>F g16<8,8,1>F { align1 };
-mov (8) m4<1>F g18<8,8,1>F { align1 };
-mov (8) m5<1>F g20<8,8,1>F { align1 };
-mov (8) m6<1>F g15<8,8,1>F { align1 };
-mov (8) m7<1>F g17<8,8,1>F { align1 };
-mov (8) m8<1>F g19<8,8,1>F { align1 };
-mov (8) m9<1>F g21<8,8,1>F { align1 };
-
-/* m0, m1 are all direct passed by PS thread payload */
-mov (8) m1<1>UD g1<8,8,1>UD { align1 mask_disable };
-
-/* write */
-send (16) 0 acc0<1>UW g0<8,8,1>UW write (
-	0,  /* binding_table */
-	8,  /* pixel scoreboard clear, msg type simd16 single source */
-	4,  /* render target write */
-	0   /* no write commit message */
-	) 
-	mlen 10
-	rlen 0
-	{ align1 EOT };
-
-nop;
-nop;
-nop;
-nop;
-nop;
-nop;
-nop;
-nop;
-nop;
diff --git a/src/exa_wm_masknoca.g4a b/src/exa_wm_masknoca.g4a
deleted file mode 100644
index 44f6953..0000000
--- a/src/exa_wm_masknoca.g4a
+++ /dev/null
@@ -1,228 +0,0 @@
-/*
- * Copyright © 2006 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * Authors:
- *    Wang Zhenyu <zhenyu.z.wang at intel.com>
- */
-
-/*
- * This is for the exa composite operation with a mask picture (no component alpha):
- * each src channel is multiplied by the mask's alpha and the result is written to dst.
- * XXX: This is still experimental, and should be fixed to support multiple texture
- * map, and conditional mul actions. 
- */
-
-/* I think this should be same as in g4a program for texture video,
-   as we also use 16-pixel dispatch. and SF scale in g3 is useful for us. */
-
-/* The initial payload of the thread is always g0.
- * WM_URB (incoming URB entries) is g3
-   As the mask texture coefficients need extra setup URB data starting from g4, we should
-   shift this location. 
-
- * X0_R is g4->g6
- * X1_R is g5->g7
- * Y0_R is g6->g8
- * Y1_R is g7->g9
-
-     * X0: {ss0.x, ss0.x+1, ss0.x,   ss0.x+1, ss1.x, ss1.x+1, ss1.x,   ss1.x+1}
-     * Y0: {ss0.y, ss0.y,   ss0.y+1, ss0.y+1, ss1.y, ss1.y,   ss1.y+1, ss1.y+1}
-     * X1: {ss2.x, ss2.x+1, ss2.x,   ss2.x+1, ss3.x, ss3.x+1, ss3.x,   ss3.x+1}
-     * Y1: {ss2.y, ss2.y,   ss2.y+1, ss2.y+1, ss3.y, ss3.y,   ss3.y+1, ss3.y+1}
- */
-
-/* multitexture program with src and mask texture */
-/* - load src texture */
-/* - load mask texture */
-/* - mul src.X with mask's alpha */
-/* - write out src.X */
-
-    /* Set up ss0.x coordinates*/
-mov (1) g6<1>F g1.8<0,1,0>UW { align1 };
-add (1) g6.4<1>F g1.8<0,1,0>UW 1UD { align1 };
-mov (1) g6.8<1>F g1.8<0,1,0>UW { align1 };
-add (1) g6.12<1>F g1.8<0,1,0>UW 1UD { align1 };
-    /* Set up ss0.y coordinates */
-mov (1) g8<1>F g1.10<0,1,0>UW { align1 };
-mov (1) g8.4<1>F g1.10<0,1,0>UW { align1 };
-add (1) g8.8<1>F g1.10<0,1,0>UW 1UD { align1 };
-add (1) g8.12<1>F g1.10<0,1,0>UW 1UD { align1 };
-    /* set up ss1.x coordinates */
-mov (1) g6.16<1>F g1.12<0,1,0>UW { align1 };
-add (1) g6.20<1>F g1.12<0,1,0>UW 1UD { align1 };
-mov (1) g6.24<1>F g1.12<0,1,0>UW { align1 };
-add (1) g6.28<1>F g1.12<0,1,0>UW 1UD { align1 };
-    /* set up ss1.y coordinates */
-mov (1) g8.16<1>F g1.14<0,1,0>UW { align1 };
-mov (1) g8.20<1>F g1.14<0,1,0>UW { align1 };
-add (1) g8.24<1>F g1.14<0,1,0>UW 1UD { align1 };
-add (1) g8.28<1>F g1.14<0,1,0>UW 1UD { align1 };
-    /* Set up ss2.x coordinates */
-mov (1) g7<1>F g1.16<0,1,0>UW { align1 };
-add (1) g7.4<1>F g1.16<0,1,0>UW 1UD { align1 };
-mov (1) g7.8<1>F g1.16<0,1,0>UW { align1 };
-add (1) g7.12<1>F g1.16<0,1,0>UW 1UD { align1 };
-    /* Set up ss2.y coordinates */
-mov (1) g9<1>F g1.18<0,1,0>UW { align1 };
-mov (1) g9.4<1>F g1.18<0,1,0>UW { align1 };
-add (1) g9.8<1>F g1.18<0,1,0>UW 1UD { align1 };
-add (1) g9.12<1>F g1.18<0,1,0>UW 1UD { align1 };
-    /* Set up ss3.x coordinates */
-mov (1) g7.16<1>F g1.20<0,1,0>UW { align1 };
-add (1) g7.20<1>F g1.20<0,1,0>UW 1UD { align1 };
-mov (1) g7.24<1>F g1.20<0,1,0>UW { align1 };
-add (1) g7.28<1>F g1.20<0,1,0>UW 1UD { align1 };
-    /* Set up ss3.y coordinates */
-mov (1) g9.16<1>F g1.22<0,1,0>UW { align1 };
-mov (1) g9.20<1>F g1.22<0,1,0>UW { align1 };
-add (1) g9.24<1>F g1.22<0,1,0>UW 1UD { align1 };
-add (1) g9.28<1>F g1.22<0,1,0>UW 1UD { align1 };
-
-    /* Now, map these screen space coordinates into texture coordinates. */
-/* This is for src texture */
-/* I don't want to change origin ssX coords, as it will be used later in mask */
-/* so store tex coords in g10, g11, g12, g13 */
-
-    /* subtract screen-space X origin of vertex 0. */
-add (8) g10<1>F g6<8,8,1>F -g1<0,1,0>F { align1 };
-add (8) g11<1>F g7<8,8,1>F -g1<0,1,0>F { align1 };
-    /* scale by texture X increment */
-/* Cx[0] */
-mul (8) g10<1>F g10<8,8,1>F g3<0,1,0>F { align1 };
-mul (8) g11<1>F g11<8,8,1>F g3<0,1,0>F { align1 };
-    /* add in texture X offset */
-/* Co[0] */
-add (8) g10<1>F g10<8,8,1>F g3.12<0,1,0>F { align1 };
-add (8) g11<1>F g11<8,8,1>F g3.12<0,1,0>F { align1 };
-    /* subtract screen-space Y origin of vertex 0. */
-add (8) g12<1>F g8<8,8,1>F -g1.4<0,1,0>F { align1 };
-add (8) g13<1>F g9<8,8,1>F -g1.4<0,1,0>F { align1 };
-    /* scale by texture Y increment */
-/* Cy[0] */
-mul (8) g12<1>F g12<8,8,1>F g3.4<0,1,0>F { align1 };
-mul (8) g13<1>F g13<8,8,1>F g3.4<0,1,0>F { align1 };
-    /* add in texture Y offset */
-/* Co[1] */
-add (8) g12<1>F g12<8,8,1>F g3.28<0,1,0>F { align1 };
-add (8) g13<1>F g13<8,8,1>F g3.28<0,1,0>F { align1 };
-
-/* prepare sampler read back gX register, which would be written back to output */
-
-/* use simd16 sampler, param 0 is u, param 1 is v. */
-/* 'payload' loading, tex coords were computed into g10-g13 above */
-mov (8) m1<1>F g10<8,8,1>F { align1 };
-mov (8) m2<1>F g11<8,8,1>F { align1 }; /* param 0 u in m1, m2 */
-mov (8) m3<1>F g12<8,8,1>F { align1 };
-mov (8) m4<1>F g13<8,8,1>F { align1 }; /* param 1 v in m3, m4 */
-
-/* m0 will be copied with g0, as it contains send desc */
-/* emit sampler 'send' cmd */
-
-/* src texture readback: g14-g21 */
-send (16) 0 		/* msg reg index */
-	g14<1>UW 	/* readback */
-	g0<8,8,1>UW  	/* copy to msg start reg*/
-	sampler (1,0,F)  /* sampler message description, 
-				(binding_table,sampler_index,datatype). 
-			    here(src->dst) we should use src_sampler and 
-			    src_surface */
-	mlen 5 rlen 8 { align1 };   /* required message len 5, readback len 8 */
-
-mov (8) g21<1>UD g21<8,8,1>UD { align1 };  /* wait sampler return */
-
-/* sampler mask texture, use g10, g11, g12, g13 */
-    /* subtract screen-space X origin of vertex 0. */
-add (8) g10<1>F g6<8,8,1>F -g1<0,1,0>F { align1 };
-add (8) g11<1>F g7<8,8,1>F -g1<0,1,0>F { align1 };
-    /* scale by texture X increment */
-/* Cx[2] */
-mul (8) g10<1>F g10<8,8,1>F g4<0,1,0>F { align1 };
-mul (8) g11<1>F g11<8,8,1>F g4<0,1,0>F { align1 };
-    /* add in texture X offset */
-/* Co[2] */
-add (8) g10<1>F g10<8,8,1>F g4.12<0,1,0>F { align1 };
-add (8) g11<1>F g11<8,8,1>F g4.12<0,1,0>F { align1 };
-    /* subtract screen-space Y origin of vertex 0. */
-add (8) g12<1>F g8<8,8,1>F -g1.4<0,1,0>F { align1 };
-add (8) g13<1>F g9<8,8,1>F -g1.4<0,1,0>F { align1 };
-    /* scale by texture Y increment */
-/* Cy[2] */
-mul (8) g12<1>F g12<8,8,1>F g4.4<0,1,0>F { align1 };
-mul (8) g13<1>F g13<8,8,1>F g4.4<0,1,0>F { align1 };
-    /* add in texture Y offset */
-/* Co[3] */
-add (8) g12<1>F g12<8,8,1>F g4.28<0,1,0>F { align1 };
-add (8) g13<1>F g13<8,8,1>F g4.28<0,1,0>F { align1 };
-
-mov (8) m1<1>F g10<8,8,1>F { align1 };
-mov (8) m2<1>F g11<8,8,1>F { align1 }; 
-mov (8) m3<1>F g12<8,8,1>F { align1 };
-mov (8) m4<1>F g13<8,8,1>F { align1 };
-
-/* mask sampler g22-g29 */
-/* binding_table (2), sampler (1) */
-send (16) 0 g22<1>UW g0<8,8,1>UW sampler (2,1,F) mlen 5 rlen 8 { align1 };
-mov (8) g29<1>UD g29<8,8,1>UD { align1 };  /* wait sampler return */
-
-/* mul mask's alpha channel g28,g29 to src (g14-g21), then write out src */
-mul (8) g14<1>F g14<8,8,1>F g28<8,8,1>F { align1 };
-mul (8) g15<1>F g15<8,8,1>F g29<8,8,1>F { align1 };
-mul (8) g16<1>F g16<8,8,1>F g28<8,8,1>F { align1 };
-mul (8) g17<1>F g17<8,8,1>F g29<8,8,1>F { align1 };
-mul (8) g18<1>F g18<8,8,1>F g28<8,8,1>F { align1 };
-mul (8) g19<1>F g19<8,8,1>F g29<8,8,1>F { align1 };
-mul (8) g20<1>F g20<8,8,1>F g28<8,8,1>F { align1 };
-mul (8) g21<1>F g21<8,8,1>F g29<8,8,1>F { align1 };
-
-/* prepare data in m2-m5 for subspan(1,0), m6-m9 for subspan(3,2), then it's ready to write */
-mov (8) m2<1>F g14<8,8,1>F { align1 };
-mov (8) m3<1>F g16<8,8,1>F { align1 };
-mov (8) m4<1>F g18<8,8,1>F { align1 };
-mov (8) m5<1>F g20<8,8,1>F { align1 };
-mov (8) m6<1>F g15<8,8,1>F { align1 };
-mov (8) m7<1>F g17<8,8,1>F { align1 };
-mov (8) m8<1>F g19<8,8,1>F { align1 };
-mov (8) m9<1>F g21<8,8,1>F { align1 };
-
-/* m0, m1 are all direct passed by PS thread payload */
-mov (8) m1<1>UD g1<8,8,1>UD { align1 mask_disable };
-
-/* write */
-send (16) 0 acc0<1>UW g0<8,8,1>UW write (
-	0,  /* binding_table */
-	8,  /* pixel scoreboard clear, msg type simd16 single source */
-	4,  /* render target write */
-	0   /* no write commit message */
-	) 
-	mlen 10
-	rlen 0
-	{ align1 EOT };
-
-nop;
-nop;
-nop;
-nop;
-nop;
-nop;
-nop;
-nop;
-nop;
commit ae8a5ef1a223edfc7e586a77d0ca2cfc69d116b2
Author: Keith Packard <keithp at keithp.com>
Date:   Mon Mar 31 11:59:14 2008 -0700

    Back to new shaders, fix urb read length
    (cherry picked from commit 6bb92213374f278387c539bbe05b773e87e11b90)

diff --git a/src/exa_sf_mask.g4a b/src/exa_sf_mask.g4a
index a0d6efc..8701a10 100644
--- a/src/exa_sf_mask.g4a
+++ b/src/exa_sf_mask.g4a
@@ -21,52 +21,81 @@
  * IN THE SOFTWARE.
  *
  * Authors:
- *    Wang Zhenyu <zhenyu.z.wang at intel.com>
+ *    Keith Packard <keithp at keithp.com>
+ *    Eric Anholt <eric at anholt.net>
+ *
  */
 
-/* FIXME how to setup second coeffient for mask tex coord */
-
-/* 
-   g3 (v0) { u0, v0, 1.0, 1.0 }  ==> {u0, v0, 1.0, 1.0, mu0, mv0, 1.0, 1.0}  Co[0](u0) Co[1](v0) Co[2](mu0) Co[3](mv0)
-   g4 (v1) { u1, v1, 1.0, 1.0 }  ==> {u1, v1, 1.0, 1.0, mu1, mv1, 1.0, 1.0}
-   g5 (v2) { u2, v2 }  ==> (u2, v2, mu2, mv2}
-   g6      { 1/(x1-x0), 1/(y1-y0) }
-   g7      { u1-u0, v1-v0, 0, 0}  ==>{u1-u0, v1-v0,0, 0, mu1-mu0, mv1-mv0, 0, 0}
-	   -> { (u1-u0)/(x1-x0), (v1-v0)/(y1-y0) }  ==>{(u1-u0)/(x1-x0), (v1-v0)/(y1-y0),(mu1-mu0)/(x1-x0), (mv1-mv0)/(y1-y0)
-		Cx,		 Cy 			Cx[0],		 Cy[0],		 Cx[1], 	    Cy[1]
+/*
+ * Inputs (note all sub-register addresses are bytes, not float indices)
+ *
+ * Note that the vertices will have been reordered:
+ *
+ * V0 is topmost (leftmost among topmost) (upper left)
+ * V1 is next clockwise (lower right)
+ * V2 is remaining (lower left)
+ *
+ *  V0 ...................... XX
+ *  |                          .
+ *  |                          .
+ *  |                          .
+ *  V2------------------------V1
+ *
+ *  G0	    thread state -- just pass along
+ *
+ *  G1 and G2 are fixed by SF spec
+ *
+ *  G1.0    reserved
+ *  G1.4    Provoking vertex
+ *  G1.8    Determinant
+ *  G1.12   X1 - X0
+ *  G1.16   X2 - X0
+ *  G1.20   Y1 - Y0
+ *  G1.24   Y2 - Y0
+ *  G1.28   reserved
+ *
+ *  G2.0    Z0
+ *  G2.4    1/W0
+ *  G2.8    Z1
+ *  G2.12   1/W1
+ *  G2.16   Z2
+ *  G2.20   1/W2
+ *  G2.24   reserved
+ *  G2.28   reserved
+ *
+ *  G3 is V0 Vertex Attribute Data from URB (upper left)
+ *
+ *  G3.0    u0
+ *  G3.4    v0
+ *
+ *  G4 is V1 Vertex Attribute Data from URB (lower right)
+ *
+ *  G4.0    u1
+ *  G4.4    v1
+ *
+ *  G5 is V2 Vertex Attribute Data from URB (lower left)
+ *
  */
 
-/* assign Cx[0], Cx[1] to src, same to Cy, Co 
-          Cx[2], Cx[3] to mask, same to Cy, Co */
+/* Compute inverses of the input deltas */
+send (4) 0 g6<1>F g1.12<4,4,1>F math inv mlen 1 rlen 1 { align1 };
 
-send (1) 0 g6<1>F g1.12<0,1,0>F math inv scalar mlen 1 rlen 1 { align1 };
-send (1) 0 g6.4<1>F g1.20<0,1,0>F math inv scalar mlen 1 rlen 1 { align1 };
-add (8) g7<1>F g4<8,8,1>F -g3<8,8,1>F { align1 };
-/* Cx[0] */
-mul (1) g7<1>F g7<0,1,0>F g6<0,1,0>F { align1 };
-/* Cy[0] */
-mul (1) g7.4<1>F g7.4<0,1,0>F g6.4<0,1,0>F { align1 };
-/* Cx[2] */
-mul (1) g7.8<1>F g7.8<0,1,0>F g6<0,1,0>F { align1 };
-/* Cy[2] */
-mul (1) g7.12<1>F g7.12<0,1,0>F g6.4<0,1,0>F { align1 };
-
-/* src Cx[0], Cx[1] */
-mov (8) m1<1>F g7<0,1,0>F { align1 };
-/* mask Cx[2], Cx[3] */
-mov (1) m1.8<1>F g7.8<0,1,0>F { align1 };
-mov (1) m1.12<1>F g7.8<0,1,0>F { align1 };
-/* src Cy[0], Cy[1] */
-mov (8) m2<1>F g7.4<0,1,0>F { align1 };
-/* mask Cy[2], Cy[3] */
-mov (1) m2.8<1>F g7.12<0,1,0>F { align1 };
-mov (1) m2.12<1>F g7.12<0,1,0>F { align1 };
-/* src Co[0], Co[1] */
+/* texture location at V0 */
 mov (8) m3<1>F g3<8,8,1>F { align1 };
-/* mask Co[2], Co[3] */
-mov (1) m3.8<1>F g3.8<0,1,0>F { align1 };
-mov (1) m3.12<1>F g3.12<0,1,0>F { align1 };
 
+/* compute V1 - V2 (motion in X) for texture coordinates */
+add (8) g7<1>F g4<8,8,1>F -g5<8,8,1>F { align1 };
+
+/* multiply by 1/dx */
+mul (8) m1<1>F g7<8,8,1>F g6.0<0,1,0>F { align1 };
+
+/* Compute V2 - V0 (motion in Y) for texture coordinates */
+add (8) g7<1>F g5<8,8,1>F -g3<8,8,1>F { align1 };
+
+/* multiply by 1/dy */
+mul (8) m2<1>F g7<8,8,1>F g6.8<0,1,0>F {align1 };
+
+/* and we're done */
 send (8) 0 null g0<8,8,1>F urb 0 transpose used complete mlen 4 rlen 0 { align1 EOT };
 nop;
 nop;
diff --git a/src/exa_sf_mask.g4b b/src/exa_sf_mask.g4b
index 4e9114d..be0a77b 100644
--- a/src/exa_sf_mask.g4b
+++ b/src/exa_sf_mask.g4b
@@ -1,19 +1,9 @@
-   { 0x00000031, 0x20c01fbd, 0x0000002c, 0x01110081 },
-   { 0x00000031, 0x20c41fbd, 0x00000034, 0x01110081 },
-   { 0x00600040, 0x20e077bd, 0x008d0080, 0x008d4060 },
-   { 0x00000041, 0x20e077bd, 0x000000e0, 0x000000c0 },
-   { 0x00000041, 0x20e477bd, 0x000000e4, 0x000000c4 },
-   { 0x00000041, 0x20e877bd, 0x000000e8, 0x000000c0 },
-   { 0x00000041, 0x20ec77bd, 0x000000ec, 0x000000c4 },
-   { 0x00600001, 0x202003be, 0x000000e0, 0x00000000 },
-   { 0x00000001, 0x202803be, 0x000000e8, 0x00000000 },
-   { 0x00000001, 0x202c03be, 0x000000e8, 0x00000000 },
-   { 0x00600001, 0x204003be, 0x000000e4, 0x00000000 },
-   { 0x00000001, 0x204803be, 0x000000ec, 0x00000000 },
-   { 0x00000001, 0x204c03be, 0x000000ec, 0x00000000 },
+   { 0x00400031, 0x20c01fbd, 0x0069002c, 0x01110001 },
    { 0x00600001, 0x206003be, 0x008d0060, 0x00000000 },
-   { 0x00000001, 0x206803be, 0x00000068, 0x00000000 },
-   { 0x00000001, 0x206c03be, 0x0000006c, 0x00000000 },
+   { 0x00600040, 0x20e077bd, 0x008d0080, 0x008d40a0 },
+   { 0x00600041, 0x202077be, 0x008d00e0, 0x000000c0 },
+   { 0x00600040, 0x20e077bd, 0x008d00a0, 0x008d4060 },
+   { 0x00600041, 0x204077be, 0x008d00e0, 0x000000c8 },
    { 0x00600031, 0x20001fbc, 0x008d0000, 0x8640c800 },
    { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
    { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
diff --git a/src/exa_wm.g4i b/src/exa_wm.g4i
index c7ecb09..1be40e7 100644
--- a/src/exa_wm.g4i
+++ b/src/exa_wm.g4i
@@ -29,8 +29,6 @@
  * Input parameters
  */
 
-define(`quote', `ifelse(`$#', `0', `', ``$*'')')
-
 /* Destination X/Y */
 define(`dst_x_uw',  `g1.8<2,4,0>UW')
 define(`dst_y_uw',  `g1.10<2,4,0>UW')
@@ -48,15 +46,15 @@ define(`src_dw_dx', `g4.0<0,1,0>F')
 define(`src_dw_dy', `g4.4<0,1,0>F')
 define(`src_wo',    `g4.12<0,1,0>F')
 
-define(`mask_du_dx', `g4.16<0,1,0>F')
-define(`mask_du_dy', `g4.20<0,1,0>F')
-define(`mask_uo',    `g4.28<0,1,0>F')
-define(`mask_dv_dx', `g5.0<0,1,0>F')
-define(`mask_dv_dy', `g5.4<0,1,0>F')
-define(`mask_vo',    `g5.12<0,1,0>F')
-define(`mask_dw_dx', `g5.16<0,1,0>F')
-define(`mask_dw_dy', `g5.20<0,1,0>F')
-define(`mask_wo',    `g5.28<0,1,0>F')
+define(`mask_du_dx', `g5.0<0,1,0>F')
+define(`mask_du_dy', `g5.4<0,1,0>F')
+define(`mask_uo',    `g5.12<0,1,0>F')
+define(`mask_dv_dx', `g5.16<0,1,0>F')
+define(`mask_dv_dy', `g5.20<0,1,0>F')
+define(`mask_vo',    `g5.28<0,1,0>F')
+define(`mask_dw_dx', `g6.0<0,1,0>F')
+define(`mask_dw_dy', `g6.4<0,1,0>F')
+define(`mask_wo',    `g6.12<0,1,0>F')
 
 /*
  * Local variables
diff --git a/src/i965_render.c b/src/i965_render.c
index 7668779..e348c2b 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -347,8 +347,6 @@ static const uint32_t ps_kernel_static_nomask_projective [][4] = {
 };
 
 static const uint32_t ps_kernel_static_maskca [][4] = {
-#include "exa_wm_maskca.g4b"
-#if 0
 #include "exa_wm_xy.g4b"
 #include "exa_wm_src_affine.g4b"
 #include "exa_wm_src_sample.g4b"
@@ -356,12 +354,9 @@ static const uint32_t ps_kernel_static_maskca [][4] = {
 #include "exa_wm_mask_sample.g4b"
 #include "exa_wm_ca.g4b"
 #include "exa_wm_write.g4b"
-#endif
 };
 
 static const uint32_t ps_kernel_static_maskca_srcalpha [][4] = {
-#include "exa_wm_maskca_srcalpha.g4b"
-#if 0
 #include "exa_wm_xy.g4b"
 #include "exa_wm_src_affine.g4b"
 #include "exa_wm_src_sample.g4b"
@@ -369,12 +364,9 @@ static const uint32_t ps_kernel_static_maskca_srcalpha [][4] = {
 #include "exa_wm_mask_sample.g4b"
 #include "exa_wm_ca_srcalpha.g4b"
 #include "exa_wm_write.g4b"
-#endif
 };
 
 static const uint32_t ps_kernel_static_masknoca [][4] = {
-#include "exa_wm_masknoca.g4b"
-#if 0
 #include "exa_wm_xy.g4b"
 #include "exa_wm_src_affine.g4b"
 #include "exa_wm_src_sample.g4b"
@@ -382,7 +374,6 @@ static const uint32_t ps_kernel_static_masknoca [][4] = {
 #include "exa_wm_mask_sample.g4b"
 #include "exa_wm_noca.g4b"
 #include "exa_wm_write.g4b"
-#endif
 };
 
 static uint32_t 
@@ -907,9 +898,9 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
     wm_state->thread3.const_urb_entry_read_offset = 0;
     /* Each pair of attributes (src/mask coords) is one URB entry */
     if (pMask)
-	wm_state->thread3.urb_entry_read_length = 2;
+	wm_state->thread3.urb_entry_read_length = 4;
     else
-	wm_state->thread3.urb_entry_read_length = 1;
+	wm_state->thread3.urb_entry_read_length = 2;
     wm_state->thread3.urb_entry_read_offset = 0;
     /* wm kernel use urb from 3, see wm_program in compiler module */
     wm_state->thread3.dispatch_grf_start_reg = 3; /* must match kernel */
commit 5becbf9cd88c11941a5ae588da6a4a65711dfad3
Author: Keith Packard <keithp at keithp.com>
Date:   Mon Mar 31 11:31:31 2008 -0700

    Dont set the compr bit on 8-unit sends
    (cherry picked from commit 05710145b6fc4ed2c528128b2e6022591a53d050)

diff --git a/src/exa_wm_projective.g4i b/src/exa_wm_projective.g4i
index 13da99c..3c3bbf0 100644
--- a/src/exa_wm_projective.g4i
+++ b/src/exa_wm_projective.g4i
@@ -31,8 +31,8 @@ mul (16)	temp_x<1>F	dst_x<8,8,1>F	dw_dx		{ compr align1 };
 mul (16)	temp_y<1>F	dst_y<8,8,1>F	dw_dy		{ compr align1 };
 add (16)	temp_x<1>F	temp_x<8,8,1>F	temp_y<8,8,1>F	{ compr align1 };
 add (16)	temp_x<1>F	temp_x<8,8,1>F	wo		{ compr align1 };
-send (8) 0	w_0<1>F		temp_x_0<8,8,1>F math inv mlen 1 rlen 1	{ compr align1 };
-send (8) 0	w_1<1>F		temp_x_1<8,8,1>F math inv mlen 1 rlen 1	{ compr align1 };
+send (8) 0	w_0<1>F		temp_x_0<8,8,1>F math inv mlen 1 rlen 1	{ align1 };
+send (8) 0	w_1<1>F		temp_x_1<8,8,1>F math inv mlen 1 rlen 1	{ align1 };
 
 /********** Compute u *************/
 
commit a7b21a94574d403fd3d526db4e69e69c46cf09a0
Author: Keith Packard <keithp at keithp.com>
Date:   Mon Mar 31 02:20:43 2008 -0700

    Use m4 to clean up gen4 asm progs. Start adding projective transform support.
    
    Use macros for register names, modularize functions into separate files.
    (cherry picked from commit 08500507284f13ad7084eb231b43e117e9728129)

diff --git a/src/Makefile.am b/src/Makefile.am
index 7df69b6..81d9596 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -136,58 +136,66 @@ INTEL_G4A =				\
 	exa_wm_maskca_srcalpha.g4a 	\
 	exa_wm_masknoca.g4a 		\
 	exa_wm_nomask.g4a		\
-	exa_wm_rotation.g4a
-
-INTEL_G4H = 				\
-	sf_prog.h			\
-	wm_prog.h 			\
-	exa_sf_mask_prog.h		\
-	exa_sf_prog.h 			\
-	exa_sf_rotation_prog.h		\
-	exa_wm_maskca_prog.h		\
-	exa_wm_maskca_srcalpha_prog.h	\
-	exa_wm_masknoca_prog.h		\
-	exa_wm_nomask_prog.h		\
-	exa_wm_rotation_prog.h
-
+	exa_wm_rotation.g4a		\
+	exa_wm_src_affine.g4a 		\
+	exa_wm_src_projective.g4a 	\
+	exa_wm_src_sample.g4a 		\
+	exa_wm_mask_affine.g4a 		\
+	exa_wm_mask_projective.g4a 	\
+	exa_wm_mask_sample.g4a 		\
+	exa_wm_noca.g4a			\
+	exa_wm_ca.g4a			\
+	exa_wm_ca_srcalpha.g4a		\
+	exa_wm_write.g4a 		\
+	exa_wm_xy.g4a
+
+INTEL_G4I =				\
+	exa_wm.g4i			\
+	exa_wm_affine.g4i		\
+	exa_wm_projective.g4i
+
+INTEL_G4B = 				\
+	packed_yuv_sf.g4b		\
+	packed_yuv_wm.g4b 		\
+	exa_sf_mask.g4b			\
+	exa_sf.g4b 			\
+	exa_sf_rotation.g4b		\
+	exa_wm_maskca.g4b		\
+	exa_wm_maskca_srcalpha.g4b	\
+	exa_wm_masknoca.g4b		\
+	exa_wm_nomask.g4b		\
+	exa_wm_rotation.g4b		\
+	exa_wm_maskca.g4b 		\
+	exa_wm_maskca_srcalpha.g4b 	\
+	exa_wm_masknoca.g4b 		\
+	exa_wm_nomask.g4b		\
+	exa_wm_rotation.g4b		\
+	exa_wm_src_affine.g4b 		\
+	exa_wm_src_projective.g4b 	\
+	exa_wm_src_sample.g4b 		\
+	exa_wm_mask_affine.g4b 		\
+	exa_wm_mask_projective.g4b 	\
+	exa_wm_mask_sample.g4b 		\
+	exa_wm_noca.g4b			\
+	exa_wm_ca.g4b			\
+	exa_wm_ca_srcalpha.g4b		\
+	exa_wm_write.g4b 		\
+	exa_wm_xy.g4b
+	
 EXTRA_DIST = 		\
 	$(XMODE_SRCS)	\
 	$(INTEL_G4A)	\
-	$(INTEL_G4H)	\
+	$(INTEL_G4I)	\
+	$(INTEL_G4B)	\
 	$(INTEL_DRI_SRCS) \
 	$(INTEL_XVMC_SRCS)
 
 if HAVE_GEN4ASM
 
-sf_prog.h: packed_yuv_sf.g4a
-	intel-gen4asm -o sf_prog.h packed_yuv_sf.g4a
-
-wm_prog.h: packed_yuv_wm.g4a
-	intel-gen4asm -o wm_prog.h packed_yuv_wm.g4a
-
-exa_sf_mask_prog.h: exa_sf_mask.g4a
-	intel-gen4asm -o exa_sf_mask_prog.h exa_sf_mask.g4a
-
-exa_sf_prog.h: exa_sf.g4a
-	intel-gen4asm -o exa_sf_prog.h exa_sf.g4a
-
-exa_sf_rotation_prog.h: exa_sf_rotation.g4a
-	intel-gen4asm -o exa_sf_rotation_prog.h exa_sf_rotation.g4a
-
-exa_wm_maskca_prog.h: exa_wm_maskca.g4a
-	intel-gen4asm -o exa_wm_maskca_prog.h exa_wm_maskca.g4a
-
-exa_wm_maskca_srcalpha_prog.h: exa_wm_maskca_srcalpha.g4a
-	intel-gen4asm -o exa_wm_maskca_srcalpha_prog.h exa_wm_maskca_srcalpha.g4a
-
-exa_wm_masknoca_prog.h: exa_wm_masknoca.g4a
-	intel-gen4asm -o exa_wm_masknoca_prog.h exa_wm_masknoca.g4a
-
-exa_wm_nomask_prog.h: exa_wm_nomask.g4a
-	intel-gen4asm -o exa_wm_nomask_prog.h exa_wm_nomask.g4a
-
-exa_wm_rotation_prog.h: exa_wm_rotation.g4a
-	intel-gen4asm -o exa_wm_rotation_prog.h exa_wm_rotation.g4a
+SUFFIXES = .g4a .g4b
+.g4a.g4b:
+	m4 -s $*.g4a > $*.g4m
+	intel-gen4asm -o $@ $*.g4m && rm $*.g4m
 
 endif
 
diff --git a/src/exa_sf.g4b b/src/exa_sf.g4b
new file mode 100644
index 0000000..223c9c9
--- /dev/null
+++ b/src/exa_sf.g4b
@@ -0,0 +1,15 @@
+   { 0x00400031, 0x20c01fbd, 0x0069002c, 0x01110001 },
+   { 0x00400001, 0x206003be, 0x00690060, 0x00000000 },
+   { 0x00400040, 0x20e077bd, 0x00690080, 0x006940a0 },
+   { 0x00400041, 0x202077be, 0x006900e0, 0x000000c0 },
+   { 0x00400040, 0x20e077bd, 0x006900a0, 0x00694060 },
+   { 0x00400041, 0x204077be, 0x006900e0, 0x000000c8 },
+   { 0x00600031, 0x20001fbc, 0x008d0000, 0x8640c800 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
diff --git a/src/exa_sf_mask.g4a b/src/exa_sf_mask.g4a
index c830fd8..a0d6efc 100644
--- a/src/exa_sf_mask.g4a
+++ b/src/exa_sf_mask.g4a
@@ -21,82 +21,52 @@
  * IN THE SOFTWARE.
  *
  * Authors:
- *    Keith Packard <keithp at keithp.com>
- *    Eric Anholt <eric at anholt.net>
  *    Wang Zhenyu <zhenyu.z.wang at intel.com>
  */
 
+/* FIXME: how to set up the second coefficient for the mask tex coord */
 
-/*
- * Inputs (note all sub-register addresses are bytes, not float indices)
- *
- * Note that the vertices will have been reordered:
- *
- * V0 is topmost (leftmost among topmost) (upper left)
- * V1 is next clockwise (lower right)
- * V2 is remaining (lower left)
- *
- *  V0 ...................... XX
- *  |                          .
- *  |                          .
- *  |                          .
- *  V2------------------------V1
- *
- *  G0	    thread state -- just pass along
- *
- *  G1 and G2 are fixed by SF spec
- *
- *  G1.0    reserved
- *  G1.4    Provoking vertex
- *  G1.8    Determinant
- *  G1.12   X1 - X0
- *  G1.16   X2 - X0
- *  G1.20   Y1 - Y0
- *  G1.24   Y2 - Y0
- *  G1.30   reserved
- *
- *  G2.0    Z0
- *  G2.4    1/W0
- *  G2.8    Z1
- *  G2.12   1/W1
- *  G2.16   Z2
- *  G2.20   1/W2
- *  G2.24   reserved
- *  G2.30   reserved
- *
- *  G3 is V0 Vertex Attribute Data from URB (upper left)
- *
- *  G3.0    u0
- *  G3.4    v0
- *
- *  G4 is V1 Vertex Attribute Data from URB (lower right)
- *
- *  G4.0    u1
- *  G4.4    v1
- *
- *  G5 is V2 Vertex Attribute Data from URB (lower left)
- *
+/* 
+   g3 (v0) { u0, v0, 1.0, 1.0 }  ==> {u0, v0, 1.0, 1.0, mu0, mv0, 1.0, 1.0}  Co[0](u0) Co[1](v0) Co[2](mu0) Co[3](mv0)
+   g4 (v1) { u1, v1, 1.0, 1.0 }  ==> {u1, v1, 1.0, 1.0, mu1, mv1, 1.0, 1.0}
+   g5 (v2) { u2, v2 }  ==> {u2, v2, mu2, mv2}
+   g6      { 1/(x1-x0), 1/(y1-y0) }
+   g7      { u1-u0, v1-v0, 0, 0}  ==>{u1-u0, v1-v0,0, 0, mu1-mu0, mv1-mv0, 0, 0}
+	   -> { (u1-u0)/(x1-x0), (v1-v0)/(y1-y0) }  ==>{(u1-u0)/(x1-x0), (v1-v0)/(y1-y0),(mu1-mu0)/(x1-x0), (mv1-mv0)/(y1-y0)}
+		Cx,		 Cy 			Cx[0],		 Cy[0],		 Cx[1], 	    Cy[1]
  */
 
-/* Compute inverses of the input deltas */
-send (4) 0 g6<1>F g1.12<4,4,1>F math inv mlen 1 rlen 1 { align1 };
+/* assign Cx[0], Cx[1] to src, and likewise for Cy, Co;
+          Cx[2], Cx[3] to mask, and likewise for Cy, Co */
 
-/* texture location at V0 */
-mov (8) m3<1>F g3<8,8,1>F { align1 };
+send (1) 0 g6<1>F g1.12<0,1,0>F math inv scalar mlen 1 rlen 1 { align1 };
+send (1) 0 g6.4<1>F g1.20<0,1,0>F math inv scalar mlen 1 rlen 1 { align1 };
+add (8) g7<1>F g4<8,8,1>F -g3<8,8,1>F { align1 };
+/* Cx[0] */
+mul (1) g7<1>F g7<0,1,0>F g6<0,1,0>F { align1 };
+/* Cy[0] */
+mul (1) g7.4<1>F g7.4<0,1,0>F g6.4<0,1,0>F { align1 };
+/* Cx[2] */
+mul (1) g7.8<1>F g7.8<0,1,0>F g6<0,1,0>F { align1 };
+/* Cy[2] */
+mul (1) g7.12<1>F g7.12<0,1,0>F g6.4<0,1,0>F { align1 };
 
-/* compute V1 - V2 (motion in X) for texture coordinates */
-add (8) g7<1>F g4<8,8,1>F -g5<8,8,1>F { align1 };
-
-/* multiply by 1/dx */
-mul (8) m1<1>F g7<8,8,1>F g6.0<0,1,0>F { align1 };
-
-/* Compute V2 - V0 (motion in Y) for texture coordinates */
-add (8) g7<1>F g5<8,8,1>F -g3<8,8,1>F { align1 };
-
-/* multiply by 1/dy */
-mul (8) m2<1>F g7<8,8,1>F g6.8<0,1,0>F {align1 };
+/* src Cx[0], Cx[1] */
+mov (8) m1<1>F g7<0,1,0>F { align1 };
+/* mask Cx[2], Cx[3] */
+mov (1) m1.8<1>F g7.8<0,1,0>F { align1 };
+mov (1) m1.12<1>F g7.8<0,1,0>F { align1 };
+/* src Cy[0], Cy[1] */
+mov (8) m2<1>F g7.4<0,1,0>F { align1 };
+/* mask Cy[2], Cy[3] */
+mov (1) m2.8<1>F g7.12<0,1,0>F { align1 };
+mov (1) m2.12<1>F g7.12<0,1,0>F { align1 };
+/* src Co[0], Co[1] */
+mov (8) m3<1>F g3<8,8,1>F { align1 };
+/* mask Co[2], Co[3] */
+mov (1) m3.8<1>F g3.8<0,1,0>F { align1 };
+mov (1) m3.12<1>F g3.12<0,1,0>F { align1 };
 
-/* and we're done */
 send (8) 0 null g0<8,8,1>F urb 0 transpose used complete mlen 4 rlen 0 { align1 EOT };
 nop;
 nop;
diff --git a/src/exa_sf_mask.g4b b/src/exa_sf_mask.g4b
new file mode 100644
index 0000000..4e9114d
--- /dev/null
+++ b/src/exa_sf_mask.g4b
@@ -0,0 +1,25 @@
+   { 0x00000031, 0x20c01fbd, 0x0000002c, 0x01110081 },
+   { 0x00000031, 0x20c41fbd, 0x00000034, 0x01110081 },
+   { 0x00600040, 0x20e077bd, 0x008d0080, 0x008d4060 },
+   { 0x00000041, 0x20e077bd, 0x000000e0, 0x000000c0 },
+   { 0x00000041, 0x20e477bd, 0x000000e4, 0x000000c4 },
+   { 0x00000041, 0x20e877bd, 0x000000e8, 0x000000c0 },
+   { 0x00000041, 0x20ec77bd, 0x000000ec, 0x000000c4 },
+   { 0x00600001, 0x202003be, 0x000000e0, 0x00000000 },
+   { 0x00000001, 0x202803be, 0x000000e8, 0x00000000 },
+   { 0x00000001, 0x202c03be, 0x000000e8, 0x00000000 },
+   { 0x00600001, 0x204003be, 0x000000e4, 0x00000000 },
+   { 0x00000001, 0x204803be, 0x000000ec, 0x00000000 },
+   { 0x00000001, 0x204c03be, 0x000000ec, 0x00000000 },
+   { 0x00600001, 0x206003be, 0x008d0060, 0x00000000 },
+   { 0x00000001, 0x206803be, 0x00000068, 0x00000000 },
+   { 0x00000001, 0x206c03be, 0x0000006c, 0x00000000 },
+   { 0x00600031, 0x20001fbc, 0x008d0000, 0x8640c800 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
diff --git a/src/exa_wm.g4i b/src/exa_wm.g4i
new file mode 100644
index 0000000..c7ecb09
--- /dev/null
+++ b/src/exa_wm.g4i
@@ -0,0 +1,119 @@
+/*
+ * Copyright © 2006 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Wang Zhenyu <zhenyu.z.wang at intel.com>
+ *    Keith Packard <keithp at keithp.com>
+ */
+ 
+/*
+ * Input parameters
+ */
+
+define(`quote', `ifelse(`$#', `0', `', ``$*'')')
+
+/* Destination X/Y */
+define(`dst_x_uw',  `g1.8<2,4,0>UW')
+define(`dst_y_uw',  `g1.10<2,4,0>UW')
+define(`screen_x0', `g1.0<0,1,0>F')
+define(`screen_y0', `g1.4<0,1,0>F')
+
+/* Source transformation parameters */
+define(`src_du_dx', `g3.0<0,1,0>F')
+define(`src_du_dy', `g3.4<0,1,0>F')
+define(`src_uo',    `g3.12<0,1,0>F')
+define(`src_dv_dx', `g3.16<0,1,0>F')
+define(`src_dv_dy', `g3.20<0,1,0>F')
+define(`src_vo',    `g3.28<0,1,0>F')
+define(`src_dw_dx', `g4.0<0,1,0>F')
+define(`src_dw_dy', `g4.4<0,1,0>F')
+define(`src_wo',    `g4.12<0,1,0>F')
+
+define(`mask_du_dx', `g4.16<0,1,0>F')
+define(`mask_du_dy', `g4.20<0,1,0>F')
+define(`mask_uo',    `g4.28<0,1,0>F')
+define(`mask_dv_dx', `g5.0<0,1,0>F')
+define(`mask_dv_dy', `g5.4<0,1,0>F')
+define(`mask_vo',    `g5.12<0,1,0>F')
+define(`mask_dw_dx', `g5.16<0,1,0>F')
+define(`mask_dw_dy', `g5.20<0,1,0>F')
+define(`mask_wo',    `g5.28<0,1,0>F')
+
+/*
+ * Local variables
+ */
+
+/* this holds the X dest coordinates */
+define(`dst_x',	    `g8')
+define(`dst_x_0',   `dst_x')
+define(`dst_x_1',   `g9')
+
+/* this holds the Y dest coordinates */
+define(`dst_y',	    `g10')
+define(`dst_y_0',   `dst_y')
+define(`dst_y_1',   `g11')
+
+/* When computing x * dn/dx, use this */
+define(`temp_x',    `g12')
+define(`temp_x_0',  `temp_x')
+define(`temp_x_1',  `g13')
+
+/* When computing y * dn/dy, use this */
+define(`temp_y',    `g14')
+define(`temp_y_0',  temp_y)
+define(`temp_y_1',  `g15')
+
+/* when loading x/y, use these to hold them in UW format */
+define(`temp_x_uw', temp_x)
+define(`temp_y_uw', temp_y)
+
+/* compute source and mask u/v to this pair to send to sampler */
+define(`src_u',	    `m1')
+define(`src_v',	    `m3')
+define(`mask_u',    src_u)
+define(`mask_v',    src_v)
+define(`src_w',	    `g16')
+define(`src_w_0',   src_w)
+define(`src_w_1',   `g17')
+define(`mask_w',    src_w)
+define(`mask_w_0',  src_w_0)
+define(`mask_w_1',  src_w_1)
+
+/* sample src to these registers */
+define(`src_sample0',	`g18')
+define(`src_sample1',	`g19')
+define(`src_sample2',	`g20')
+define(`src_sample3',	`g21')
+define(`src_sample4',	`g22')
+define(`src_sample5',	`g23')
+define(`src_sample6',	`g24')
+define(`src_sample7',	`g25')
+
+/* sample mask to these registers */
+define(`mask_sample0',	`g26')
+define(`mask_sample1',	`g27')
+define(`mask_sample2',	`g28')
+define(`mask_sample3',	`g29')
+define(`mask_sample4',	`g30')
+define(`mask_sample5',	`g31')
+define(`mask_sample6',	`g32')
+define(`mask_sample7',	`g33')
diff --git a/src/exa_wm_affine.g4i b/src/exa_wm_affine.g4i
new file mode 100644
index 0000000..8fc6450
--- /dev/null
+++ b/src/exa_wm_affine.g4i
@@ -0,0 +1,45 @@
+/*
+ * Copyright © 2006 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Wang Zhenyu <zhenyu.z.wang at intel.com>
+ *    Keith Packard <keithp at keithp.com>
+ */
+
+/*
+ * Fragment to compute u/v values under an affine transform, for src or mask depending on the macros set by the including file
+ */
+
+/********** Compute u *************/
+
+mul (16)	temp_x<1>F	dst_x<8,8,1>F	du_dx		{ compr align1 };
+mul (16)	temp_y<1>F	dst_y<8,8,1>F	du_dy		{ compr align1 };
+add (16)	temp_x<1>F	temp_x<8,8,1>F	temp_y<8,8,1>F	{ compr align1 };
+add (16)	u<1>F		temp_x<8,8,1>F	uo		{ compr align1 };
+
+/********** Compute v *************/
+
+mul (16)	temp_x<1>F	dst_x<8,8,1>F	dv_dx		{ compr align1 };
+mul (16)	temp_y<1>F	dst_y<8,8,1>F	dv_dy		{ compr align1 };
+add (16)	temp_x<1>F	temp_x<8,8,1>F	temp_y<8,8,1>F	{ compr align1 };
+add (16)	v<1>F		temp_x<8,8,1>F	vo		{ compr align1 };
+
diff --git a/src/exa_wm_ca.g4a b/src/exa_wm_ca.g4a
new file mode 100644
index 0000000..955c68c
--- /dev/null
+++ b/src/exa_wm_ca.g4a
@@ -0,0 +1,38 @@
+/*
+ * Copyright © 2006 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Wang Zhenyu <zhenyu.z.wang at intel.com>
+ *    Keith Packard <keithp at keithp.com>
+ */
+ 
+/*
+ * Composite src and mask together, with component alpha
+ */
+
+include(`exa_wm.g4i')
+
+/* mul mask rgba channels to src */
+mul (16)    src_sample0<1>F src_sample0<8,8,1>F	mask_sample0<8,8,1>F	{ compr align1 };
+mul (16)    src_sample2<1>F src_sample2<8,8,1>F	mask_sample2<8,8,1>F	{ compr align1 };
+mul (16)    src_sample4<1>F src_sample4<8,8,1>F	mask_sample4<8,8,1>F	{ compr align1 };
+mul (16)    src_sample6<1>F src_sample6<8,8,1>F	mask_sample6<8,8,1>F	{ compr align1 };
diff --git a/src/exa_wm_ca.g4b b/src/exa_wm_ca.g4b
new file mode 100644
index 0000000..d0f3519
--- /dev/null
+++ b/src/exa_wm_ca.g4b
@@ -0,0 +1,4 @@
+   { 0x00802041, 0x224077bd, 0x008d0240, 0x008d0340 },
+   { 0x00802041, 0x228077bd, 0x008d0280, 0x008d0380 },
+   { 0x00802041, 0x22c077bd, 0x008d02c0, 0x008d03c0 },
+   { 0x00802041, 0x230077bd, 0x008d0300, 0x008d0400 },
diff --git a/src/exa_wm_ca_srcalpha.g4a b/src/exa_wm_ca_srcalpha.g4a
new file mode 100644
index 0000000..a1be28e
--- /dev/null
+++ b/src/exa_wm_ca_srcalpha.g4a
@@ -0,0 +1,38 @@
+/*
+ * Copyright © 2006 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Wang Zhenyu <zhenyu.z.wang at intel.com>
+ *    Keith Packard <keithp at keithp.com>
+ */
+ 
+/*
+ * Multiply the src samples by the src alpha sample (component alpha, srcalpha case)
+ */
+
+include(`exa_wm.g4i')
+
+/* mul src channels by the last src sample pair (presumably src alpha); mask samples are not read here */
+mul (16)    src_sample0<1>F src_sample0<8,8,1>F	src_sample6<8,8,1>F	{ compr align1 };
+mul (16)    src_sample2<1>F src_sample2<8,8,1>F	src_sample6<8,8,1>F	{ compr align1 };
+mul (16)    src_sample4<1>F src_sample4<8,8,1>F	src_sample6<8,8,1>F	{ compr align1 };
+mul (16)    src_sample6<1>F src_sample6<8,8,1>F	src_sample6<8,8,1>F	{ compr align1 };
diff --git a/src/exa_wm_ca_srcalpha.g4b b/src/exa_wm_ca_srcalpha.g4b
new file mode 100644
index 0000000..780e704
--- /dev/null
+++ b/src/exa_wm_ca_srcalpha.g4b
@@ -0,0 +1,4 @@
+   { 0x00802041, 0x224077bd, 0x008d0240, 0x008d0300 },
+   { 0x00802041, 0x228077bd, 0x008d0280, 0x008d0300 },
+   { 0x00802041, 0x22c077bd, 0x008d02c0, 0x008d0300 },
+   { 0x00802041, 0x230077bd, 0x008d0300, 0x008d0300 },
diff --git a/src/exa_wm_mask_affine.g4a b/src/exa_wm_mask_affine.g4a
new file mode 100644
index 0000000..4c096cb
--- /dev/null
+++ b/src/exa_wm_mask_affine.g4a
@@ -0,0 +1,37 @@
+/*
+ * Copyright © 2006 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Wang Zhenyu <zhenyu.z.wang at intel.com>
+ *    Keith Packard <keithp at keithp.com>
+ */
+
+include(`exa_wm.g4i')
+define(`du_dx',	`mask_du_dx')
+define(`du_dy',	`mask_du_dy')
+define(`uo',	`mask_uo')
+define(`dv_dx',	`mask_dv_dx')
+define(`dv_dy',	`mask_dv_dy')
+define(`vo',	`mask_vo')
+define(`u',	`mask_u')
+define(`v',	`mask_v')
+include(`exa_wm_affine.g4i')
diff --git a/src/exa_wm_mask_affine.g4b b/src/exa_wm_mask_affine.g4b
new file mode 100644
index 0000000..62b46e0
--- /dev/null
+++ b/src/exa_wm_mask_affine.g4b
@@ -0,0 +1,8 @@
+   { 0x00802041, 0x218077bd, 0x008d0100, 0x00000090 },
+   { 0x00802041, 0x21c077bd, 0x008d0140, 0x00000094 },
+   { 0x00802040, 0x218077bd, 0x008d0180, 0x008d01c0 },
+   { 0x00802040, 0x202077be, 0x008d0180, 0x0000009c },
+   { 0x00802041, 0x218077bd, 0x008d0100, 0x000000a0 },
+   { 0x00802041, 0x21c077bd, 0x008d0140, 0x000000a4 },
+   { 0x00802040, 0x218077bd, 0x008d0180, 0x008d01c0 },
+   { 0x00802040, 0x206077be, 0x008d0180, 0x000000ac },
diff --git a/src/exa_wm_mask_projective.g4a b/src/exa_wm_mask_projective.g4a
new file mode 100644
index 0000000..464f6c5
--- /dev/null
+++ b/src/exa_wm_mask_projective.g4a
@@ -0,0 +1,48 @@
+/*
+ * Copyright © 2006 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Wang Zhenyu <zhenyu.z.wang at intel.com>
+ *    Keith Packard <keithp at keithp.com>
+ */
+
+include(`exa_wm.g4i')
+
+define(`du_dx',	`mask_du_dx')
+define(`du_dy',	`mask_du_dy')
+define(`uo',	`mask_uo')
+
+define(`dv_dx',	`mask_dv_dx')
+define(`dv_dy',	`mask_dv_dy')
+define(`vo',	`mask_vo')
+
+define(`dw_dx',	`mask_dw_dx')
+define(`dw_dy',	`mask_dw_dy')
+define(`wo',	`mask_wo')
+
+define(`u',	`mask_u')
+define(`v',	`mask_v')
+define(`w',	`mask_w')
+define(`w_0',	`mask_w_0')
+define(`w_1',	`mask_w_1')
+
+include(`exa_wm_projective.g4i')
diff --git a/src/exa_wm_mask_projective.g4b b/src/exa_wm_mask_projective.g4b
new file mode 100644
index 0000000..ac4faa3
--- /dev/null
+++ b/src/exa_wm_mask_projective.g4b
@@ -0,0 +1,16 @@
+   { 0x00802041, 0x218077bd, 0x008d0100, 0x000000b0 },
+   { 0x00802041, 0x21c077bd, 0x008d0140, 0x000000b4 },
+   { 0x00802040, 0x218077bd, 0x008d0180, 0x008d01c0 },
+   { 0x00802040, 0x218077bd, 0x008d0180, 0x000000bc },
+   { 0x00600031, 0x22001fbd, 0x008d0180, 0x01110001 },
+   { 0x00600031, 0x22201fbd, 0x008d01a0, 0x01110001 },
+   { 0x00802041, 0x218077bd, 0x008d0100, 0x00000090 },
+   { 0x00802041, 0x21c077bd, 0x008d0140, 0x00000094 },
+   { 0x00802040, 0x218077bd, 0x008d0180, 0x008d01c0 },
+   { 0x00802040, 0x218077bd, 0x008d0180, 0x0000009c },
+   { 0x00802041, 0x202077be, 0x008d0180, 0x008d0200 },
+   { 0x00802041, 0x218077bd, 0x008d0100, 0x000000a0 },
+   { 0x00802041, 0x21c077bd, 0x008d0140, 0x000000a4 },
+   { 0x00802040, 0x218077bd, 0x008d0180, 0x008d01c0 },
+   { 0x00802040, 0x218077bd, 0x008d0180, 0x000000ac },
+   { 0x00802041, 0x206077be, 0x008d0180, 0x008d0200 },
diff --git a/src/exa_wm_mask_sample.g4a b/src/exa_wm_mask_sample.g4a
new file mode 100644
index 0000000..45dc3c4
--- /dev/null
+++ b/src/exa_wm_mask_sample.g4a
@@ -0,0 +1,49 @@
+/*
+ * Copyright © 2006 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Wang Zhenyu <zhenyu.z.wang at intel.com>
+ *    Keith Packard <keithp at keithp.com>
+ */
+
+/* Sample the mask surface */
+
+include(`exa_wm.g4i')
+
+/* prepare sampler read back gX register, which would be written back to output */
+
+/* use simd16 sampler, param 0 is u, param 1 is v. */
+/* 'payload' loading, assuming tex coord start from g4 */
+
+/* m0 will be copied with g0, as it contains send desc */
+/* emit sampler 'send' cmd */
+send (16) 0			/* msg reg index */
+	mask_sample0<1>UW 	/* readback */
+	g0<8,8,1>UW		/* copy to msg start reg*/
+	sampler (1,0,F)		/* sampler message description, (binding_table,sampler_index,datatype) */
+				/* here(src->dst) we should use src_sampler and src_surface */
+	mlen 5 rlen 8 { align1 };   /* required message len 5, readback len 8 */
+
+// mov (8)  mask_sample7<1>UD	mask_sample7<8,8,1>UD	    { align1 };  /* wait sampler return */
+
+/* if we set up read-back reg correctly, emit dataport write 'send' cmd with EOT */
+
diff --git a/src/exa_wm_mask_sample.g4b b/src/exa_wm_mask_sample.g4b
new file mode 100644
index 0000000..45f7ead
--- /dev/null
+++ b/src/exa_wm_mask_sample.g4b
@@ -0,0 +1 @@
+   { 0x00800031, 0x23401d29, 0x008d0000, 0x02580001 },
diff --git a/src/exa_wm_maskca.g4a b/src/exa_wm_maskca.g4a
index 0e96aa0..d030467 100644
--- a/src/exa_wm_maskca.g4a
+++ b/src/exa_wm_maskca.g4a
@@ -58,44 +58,44 @@
 
     /* Set up ss0.x coordinates*/
 mov (1) g6<1>F g1.8<0,1,0>UW { align1 };
-add (1) g6.4<1>F g1.8<0,1,0>UW 1UB { align1 };
+add (1) g6.4<1>F g1.8<0,1,0>UW 1UD { align1 };
 mov (1) g6.8<1>F g1.8<0,1,0>UW { align1 };
-add (1) g6.12<1>F g1.8<0,1,0>UW 1UB { align1 };
+add (1) g6.12<1>F g1.8<0,1,0>UW 1UD { align1 };
     /* Set up ss0.y coordinates */
 mov (1) g8<1>F g1.10<0,1,0>UW { align1 };
 mov (1) g8.4<1>F g1.10<0,1,0>UW { align1 };
-add (1) g8.8<1>F g1.10<0,1,0>UW 1UB { align1 };
-add (1) g8.12<1>F g1.10<0,1,0>UW 1UB { align1 };
+add (1) g8.8<1>F g1.10<0,1,0>UW 1UD { align1 };
+add (1) g8.12<1>F g1.10<0,1,0>UW 1UD { align1 };
     /* set up ss1.x coordinates */
 mov (1) g6.16<1>F g1.12<0,1,0>UW { align1 };
-add (1) g6.20<1>F g1.12<0,1,0>UW 1UB { align1 };
+add (1) g6.20<1>F g1.12<0,1,0>UW 1UD { align1 };
 mov (1) g6.24<1>F g1.12<0,1,0>UW { align1 };
-add (1) g6.28<1>F g1.12<0,1,0>UW 1UB { align1 };
+add (1) g6.28<1>F g1.12<0,1,0>UW 1UD { align1 };
     /* set up ss1.y coordinates */
 mov (1) g8.16<1>F g1.14<0,1,0>UW { align1 };
 mov (1) g8.20<1>F g1.14<0,1,0>UW { align1 };
-add (1) g8.24<1>F g1.14<0,1,0>UW 1UB { align1 };
-add (1) g8.28<1>F g1.14<0,1,0>UW 1UB { align1 };
+add (1) g8.24<1>F g1.14<0,1,0>UW 1UD { align1 };
+add (1) g8.28<1>F g1.14<0,1,0>UW 1UD { align1 };
     /* Set up ss2.x coordinates */
 mov (1) g7<1>F g1.16<0,1,0>UW { align1 };
-add (1) g7.4<1>F g1.16<0,1,0>UW 1UB { align1 };
+add (1) g7.4<1>F g1.16<0,1,0>UW 1UD { align1 };
 mov (1) g7.8<1>F g1.16<0,1,0>UW { align1 };
-add (1) g7.12<1>F g1.16<0,1,0>UW 1UB { align1 };
+add (1) g7.12<1>F g1.16<0,1,0>UW 1UD { align1 };
     /* Set up ss2.y coordinates */
 mov (1) g9<1>F g1.18<0,1,0>UW { align1 };
 mov (1) g9.4<1>F g1.18<0,1,0>UW { align1 };
-add (1) g9.8<1>F g1.18<0,1,0>UW 1UB { align1 };
-add (1) g9.12<1>F g1.18<0,1,0>UW 1UB { align1 };
+add (1) g9.8<1>F g1.18<0,1,0>UW 1UD { align1 };
+add (1) g9.12<1>F g1.18<0,1,0>UW 1UD { align1 };
     /* Set up ss3.x coordinates */
 mov (1) g7.16<1>F g1.20<0,1,0>UW { align1 };
-add (1) g7.20<1>F g1.20<0,1,0>UW 1UB { align1 };
+add (1) g7.20<1>F g1.20<0,1,0>UW 1UD { align1 };
 mov (1) g7.24<1>F g1.20<0,1,0>UW { align1 };
-add (1) g7.28<1>F g1.20<0,1,0>UW 1UB { align1 };
+add (1) g7.28<1>F g1.20<0,1,0>UW 1UD { align1 };
     /* Set up ss3.y coordinates */
 mov (1) g9.16<1>F g1.22<0,1,0>UW { align1 };
 mov (1) g9.20<1>F g1.22<0,1,0>UW { align1 };
-add (1) g9.24<1>F g1.22<0,1,0>UW 1UB { align1 };
-add (1) g9.28<1>F g1.22<0,1,0>UW 1UB { align1 };
+add (1) g9.24<1>F g1.22<0,1,0>UW 1UD { align1 };
+add (1) g9.28<1>F g1.22<0,1,0>UW 1UD { align1 };
 
     /* Now, map these screen space coordinates into texture coordinates. */
 /* This is for src texture */
diff --git a/src/exa_wm_maskca.g4b b/src/exa_wm_maskca.g4b
new file mode 100644
index 0000000..d936412
--- /dev/null
+++ b/src/exa_wm_maskca.g4b
@@ -0,0 +1,95 @@
+   { 0x00000001, 0x20c0013d, 0x00000028, 0x00000000 },
+   { 0x00000040, 0x20c40d3d, 0x00000028, 0x00000001 },
+   { 0x00000001, 0x20c8013d, 0x00000028, 0x00000000 },
+   { 0x00000040, 0x20cc0d3d, 0x00000028, 0x00000001 },
+   { 0x00000001, 0x2100013d, 0x0000002a, 0x00000000 },
+   { 0x00000001, 0x2104013d, 0x0000002a, 0x00000000 },
+   { 0x00000040, 0x21080d3d, 0x0000002a, 0x00000001 },
+   { 0x00000040, 0x210c0d3d, 0x0000002a, 0x00000001 },
+   { 0x00000001, 0x20d0013d, 0x0000002c, 0x00000000 },
+   { 0x00000040, 0x20d40d3d, 0x0000002c, 0x00000001 },
+   { 0x00000001, 0x20d8013d, 0x0000002c, 0x00000000 },
+   { 0x00000040, 0x20dc0d3d, 0x0000002c, 0x00000001 },
+   { 0x00000001, 0x2110013d, 0x0000002e, 0x00000000 },
+   { 0x00000001, 0x2114013d, 0x0000002e, 0x00000000 },
+   { 0x00000040, 0x21180d3d, 0x0000002e, 0x00000001 },
+   { 0x00000040, 0x211c0d3d, 0x0000002e, 0x00000001 },
+   { 0x00000001, 0x20e0013d, 0x00000030, 0x00000000 },
+   { 0x00000040, 0x20e40d3d, 0x00000030, 0x00000001 },
+   { 0x00000001, 0x20e8013d, 0x00000030, 0x00000000 },
+   { 0x00000040, 0x20ec0d3d, 0x00000030, 0x00000001 },
+   { 0x00000001, 0x2120013d, 0x00000032, 0x00000000 },
+   { 0x00000001, 0x2124013d, 0x00000032, 0x00000000 },
+   { 0x00000040, 0x21280d3d, 0x00000032, 0x00000001 },
+   { 0x00000040, 0x212c0d3d, 0x00000032, 0x00000001 },
+   { 0x00000001, 0x20f0013d, 0x00000034, 0x00000000 },
+   { 0x00000040, 0x20f40d3d, 0x00000034, 0x00000001 },
+   { 0x00000001, 0x20f8013d, 0x00000034, 0x00000000 },
+   { 0x00000040, 0x20fc0d3d, 0x00000034, 0x00000001 },
+   { 0x00000001, 0x2130013d, 0x00000036, 0x00000000 },
+   { 0x00000001, 0x2134013d, 0x00000036, 0x00000000 },
+   { 0x00000040, 0x21380d3d, 0x00000036, 0x00000001 },
+   { 0x00000040, 0x213c0d3d, 0x00000036, 0x00000001 },
+   { 0x00600040, 0x214077bd, 0x008d00c0, 0x00004020 },
+   { 0x00600040, 0x216077bd, 0x008d00e0, 0x00004020 },
+   { 0x00600041, 0x214077bd, 0x008d0140, 0x00000060 },
+   { 0x00600041, 0x216077bd, 0x008d0160, 0x00000060 },
+   { 0x00600040, 0x214077bd, 0x008d0140, 0x0000006c },
+   { 0x00600040, 0x216077bd, 0x008d0160, 0x0000006c },
+   { 0x00600040, 0x218077bd, 0x008d0100, 0x00004024 },
+   { 0x00600040, 0x21a077bd, 0x008d0120, 0x00004024 },
+   { 0x00600041, 0x218077bd, 0x008d0180, 0x00000064 },
+   { 0x00600041, 0x21a077bd, 0x008d01a0, 0x00000064 },
+   { 0x00600040, 0x218077bd, 0x008d0180, 0x0000007c },
+   { 0x00600040, 0x21a077bd, 0x008d01a0, 0x0000007c },
+   { 0x00600001, 0x202003be, 0x008d0140, 0x00000000 },
+   { 0x00600001, 0x204003be, 0x008d0160, 0x00000000 },
+   { 0x00600001, 0x206003be, 0x008d0180, 0x00000000 },
+   { 0x00600001, 0x208003be, 0x008d01a0, 0x00000000 },
+   { 0x00800031, 0x21c01d29, 0x008d0000, 0x02580001 },
+   { 0x00600001, 0x22a00021, 0x008d02a0, 0x00000000 },
+   { 0x00600040, 0x214077bd, 0x008d00c0, 0x00004020 },
+   { 0x00600040, 0x216077bd, 0x008d00e0, 0x00004020 },
+   { 0x00600041, 0x214077bd, 0x008d0140, 0x00000080 },
+   { 0x00600041, 0x216077bd, 0x008d0160, 0x00000080 },
+   { 0x00600040, 0x214077bd, 0x008d0140, 0x0000008c },
+   { 0x00600040, 0x216077bd, 0x008d0160, 0x0000008c },
+   { 0x00600040, 0x218077bd, 0x008d0100, 0x00004024 },
+   { 0x00600040, 0x21a077bd, 0x008d0120, 0x00004024 },
+   { 0x00600041, 0x218077bd, 0x008d0180, 0x00000084 },
+   { 0x00600041, 0x21a077bd, 0x008d01a0, 0x00000084 },
+   { 0x00600040, 0x218077bd, 0x008d0180, 0x0000009c },
+   { 0x00600040, 0x21a077bd, 0x008d01a0, 0x0000009c },
+   { 0x00600001, 0x202003be, 0x008d0140, 0x00000000 },
+   { 0x00600001, 0x204003be, 0x008d0160, 0x00000000 },
+   { 0x00600001, 0x206003be, 0x008d0180, 0x00000000 },
+   { 0x00600001, 0x208003be, 0x008d01a0, 0x00000000 },
+   { 0x00800031, 0x22c01d29, 0x008d0000, 0x02580102 },
+   { 0x00600001, 0x23a00021, 0x008d03a0, 0x00000000 },
+   { 0x00600041, 0x21c077bd, 0x008d01c0, 0x008d02c0 },
+   { 0x00600041, 0x21e077bd, 0x008d01e0, 0x008d02e0 },
+   { 0x00600041, 0x220077bd, 0x008d0200, 0x008d0300 },
+   { 0x00600041, 0x222077bd, 0x008d0220, 0x008d0320 },
+   { 0x00600041, 0x224077bd, 0x008d0240, 0x008d0340 },
+   { 0x00600041, 0x226077bd, 0x008d0260, 0x008d0360 },
+   { 0x00600041, 0x228077bd, 0x008d0280, 0x008d0380 },
+   { 0x00600041, 0x22a077bd, 0x008d02a0, 0x008d03a0 },
+   { 0x00600001, 0x204003be, 0x008d01c0, 0x00000000 },
+   { 0x00600001, 0x206003be, 0x008d0200, 0x00000000 },
+   { 0x00600001, 0x208003be, 0x008d0240, 0x00000000 },
+   { 0x00600001, 0x20a003be, 0x008d0280, 0x00000000 },
+   { 0x00600001, 0x20c003be, 0x008d01e0, 0x00000000 },
+   { 0x00600001, 0x20e003be, 0x008d0220, 0x00000000 },
+   { 0x00600001, 0x210003be, 0x008d0260, 0x00000000 },
+   { 0x00600001, 0x212003be, 0x008d02a0, 0x00000000 },
+   { 0x00600201, 0x20200022, 0x008d0020, 0x00000000 },
+   { 0x00800031, 0x24001d28, 0x008d0000, 0x85a04800 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
diff --git a/src/exa_wm_maskca_srcalpha.g4a b/src/exa_wm_maskca_srcalpha.g4a
index a92c9e4..133c9f0 100644
--- a/src/exa_wm_maskca_srcalpha.g4a
+++ b/src/exa_wm_maskca_srcalpha.g4a
@@ -58,44 +58,44 @@
 
     /* Set up ss0.x coordinates*/
 mov (1) g6<1>F g1.8<0,1,0>UW { align1 };
-add (1) g6.4<1>F g1.8<0,1,0>UW 1UB { align1 };
+add (1) g6.4<1>F g1.8<0,1,0>UW 1UD { align1 };
 mov (1) g6.8<1>F g1.8<0,1,0>UW { align1 };
-add (1) g6.12<1>F g1.8<0,1,0>UW 1UB { align1 };
+add (1) g6.12<1>F g1.8<0,1,0>UW 1UD { align1 };
     /* Set up ss0.y coordinates */
 mov (1) g8<1>F g1.10<0,1,0>UW { align1 };
 mov (1) g8.4<1>F g1.10<0,1,0>UW { align1 };
-add (1) g8.8<1>F g1.10<0,1,0>UW 1UB { align1 };
-add (1) g8.12<1>F g1.10<0,1,0>UW 1UB { align1 };
+add (1) g8.8<1>F g1.10<0,1,0>UW 1UD { align1 };
+add (1) g8.12<1>F g1.10<0,1,0>UW 1UD { align1 };
     /* set up ss1.x coordinates */
 mov (1) g6.16<1>F g1.12<0,1,0>UW { align1 };
-add (1) g6.20<1>F g1.12<0,1,0>UW 1UB { align1 };
+add (1) g6.20<1>F g1.12<0,1,0>UW 1UD { align1 };
 mov (1) g6.24<1>F g1.12<0,1,0>UW { align1 };
-add (1) g6.28<1>F g1.12<0,1,0>UW 1UB { align1 };
+add (1) g6.28<1>F g1.12<0,1,0>UW 1UD { align1 };
     /* set up ss1.y coordinates */
 mov (1) g8.16<1>F g1.14<0,1,0>UW { align1 };
 mov (1) g8.20<1>F g1.14<0,1,0>UW { align1 };
-add (1) g8.24<1>F g1.14<0,1,0>UW 1UB { align1 };
-add (1) g8.28<1>F g1.14<0,1,0>UW 1UB { align1 };
+add (1) g8.24<1>F g1.14<0,1,0>UW 1UD { align1 };
+add (1) g8.28<1>F g1.14<0,1,0>UW 1UD { align1 };
     /* Set up ss2.x coordinates */
 mov (1) g7<1>F g1.16<0,1,0>UW { align1 };
-add (1) g7.4<1>F g1.16<0,1,0>UW 1UB { align1 };
+add (1) g7.4<1>F g1.16<0,1,0>UW 1UD { align1 };
 mov (1) g7.8<1>F g1.16<0,1,0>UW { align1 };
-add (1) g7.12<1>F g1.16<0,1,0>UW 1UB { align1 };
+add (1) g7.12<1>F g1.16<0,1,0>UW 1UD { align1 };
     /* Set up ss2.y coordinates */
 mov (1) g9<1>F g1.18<0,1,0>UW { align1 };
 mov (1) g9.4<1>F g1.18<0,1,0>UW { align1 };
-add (1) g9.8<1>F g1.18<0,1,0>UW 1UB { align1 };
-add (1) g9.12<1>F g1.18<0,1,0>UW 1UB { align1 };
+add (1) g9.8<1>F g1.18<0,1,0>UW 1UD { align1 };
+add (1) g9.12<1>F g1.18<0,1,0>UW 1UD { align1 };
     /* Set up ss3.x coordinates */
 mov (1) g7.16<1>F g1.20<0,1,0>UW { align1 };
-add (1) g7.20<1>F g1.20<0,1,0>UW 1UB { align1 };
+add (1) g7.20<1>F g1.20<0,1,0>UW 1UD { align1 };
 mov (1) g7.24<1>F g1.20<0,1,0>UW { align1 };
-add (1) g7.28<1>F g1.20<0,1,0>UW 1UB { align1 };
+add (1) g7.28<1>F g1.20<0,1,0>UW 1UD { align1 };
     /* Set up ss3.y coordinates */
 mov (1) g9.16<1>F g1.22<0,1,0>UW { align1 };
 mov (1) g9.20<1>F g1.22<0,1,0>UW { align1 };
-add (1) g9.24<1>F g1.22<0,1,0>UW 1UB { align1 };
-add (1) g9.28<1>F g1.22<0,1,0>UW 1UB { align1 };
+add (1) g9.24<1>F g1.22<0,1,0>UW 1UD { align1 };
+add (1) g9.28<1>F g1.22<0,1,0>UW 1UD { align1 };
 
     /* Now, map these screen space coordinates into texture coordinates. */
 /* This is for src texture */
diff --git a/src/exa_wm_maskca_srcalpha.g4b b/src/exa_wm_maskca_srcalpha.g4b
new file mode 100644
index 0000000..d83b119
--- /dev/null
+++ b/src/exa_wm_maskca_srcalpha.g4b
@@ -0,0 +1,95 @@
+   { 0x00000001, 0x20c0013d, 0x00000028, 0x00000000 },
+   { 0x00000040, 0x20c40d3d, 0x00000028, 0x00000001 },
+   { 0x00000001, 0x20c8013d, 0x00000028, 0x00000000 },
+   { 0x00000040, 0x20cc0d3d, 0x00000028, 0x00000001 },
+   { 0x00000001, 0x2100013d, 0x0000002a, 0x00000000 },
+   { 0x00000001, 0x2104013d, 0x0000002a, 0x00000000 },
+   { 0x00000040, 0x21080d3d, 0x0000002a, 0x00000001 },
+   { 0x00000040, 0x210c0d3d, 0x0000002a, 0x00000001 },
+   { 0x00000001, 0x20d0013d, 0x0000002c, 0x00000000 },
+   { 0x00000040, 0x20d40d3d, 0x0000002c, 0x00000001 },
+   { 0x00000001, 0x20d8013d, 0x0000002c, 0x00000000 },
+   { 0x00000040, 0x20dc0d3d, 0x0000002c, 0x00000001 },
+   { 0x00000001, 0x2110013d, 0x0000002e, 0x00000000 },
+   { 0x00000001, 0x2114013d, 0x0000002e, 0x00000000 },
+   { 0x00000040, 0x21180d3d, 0x0000002e, 0x00000001 },
+   { 0x00000040, 0x211c0d3d, 0x0000002e, 0x00000001 },
+   { 0x00000001, 0x20e0013d, 0x00000030, 0x00000000 },
+   { 0x00000040, 0x20e40d3d, 0x00000030, 0x00000001 },
+   { 0x00000001, 0x20e8013d, 0x00000030, 0x00000000 },
+   { 0x00000040, 0x20ec0d3d, 0x00000030, 0x00000001 },
+   { 0x00000001, 0x2120013d, 0x00000032, 0x00000000 },
+   { 0x00000001, 0x2124013d, 0x00000032, 0x00000000 },
+   { 0x00000040, 0x21280d3d, 0x00000032, 0x00000001 },
+   { 0x00000040, 0x212c0d3d, 0x00000032, 0x00000001 },
+   { 0x00000001, 0x20f0013d, 0x00000034, 0x00000000 },
+   { 0x00000040, 0x20f40d3d, 0x00000034, 0x00000001 },
+   { 0x00000001, 0x20f8013d, 0x00000034, 0x00000000 },
+   { 0x00000040, 0x20fc0d3d, 0x00000034, 0x00000001 },
+   { 0x00000001, 0x2130013d, 0x00000036, 0x00000000 },
+   { 0x00000001, 0x2134013d, 0x00000036, 0x00000000 },
+   { 0x00000040, 0x21380d3d, 0x00000036, 0x00000001 },
+   { 0x00000040, 0x213c0d3d, 0x00000036, 0x00000001 },
+   { 0x00600040, 0x214077bd, 0x008d00c0, 0x00004020 },
+   { 0x00600040, 0x216077bd, 0x008d00e0, 0x00004020 },
+   { 0x00600041, 0x214077bd, 0x008d0140, 0x00000060 },
+   { 0x00600041, 0x216077bd, 0x008d0160, 0x00000060 },
+   { 0x00600040, 0x214077bd, 0x008d0140, 0x0000006c },
+   { 0x00600040, 0x216077bd, 0x008d0160, 0x0000006c },
+   { 0x00600040, 0x218077bd, 0x008d0100, 0x00004024 },
+   { 0x00600040, 0x21a077bd, 0x008d0120, 0x00004024 },
+   { 0x00600041, 0x218077bd, 0x008d0180, 0x00000064 },
+   { 0x00600041, 0x21a077bd, 0x008d01a0, 0x00000064 },
+   { 0x00600040, 0x218077bd, 0x008d0180, 0x0000007c },
+   { 0x00600040, 0x21a077bd, 0x008d01a0, 0x0000007c },
+   { 0x00600001, 0x202003be, 0x008d0140, 0x00000000 },
+   { 0x00600001, 0x204003be, 0x008d0160, 0x00000000 },
+   { 0x00600001, 0x206003be, 0x008d0180, 0x00000000 },
+   { 0x00600001, 0x208003be, 0x008d01a0, 0x00000000 },
+   { 0x00800031, 0x21c01d29, 0x008d0000, 0x02580001 },
+   { 0x00600001, 0x22a00021, 0x008d02a0, 0x00000000 },
+   { 0x00600040, 0x214077bd, 0x008d00c0, 0x00004020 },
+   { 0x00600040, 0x216077bd, 0x008d00e0, 0x00004020 },
+   { 0x00600041, 0x214077bd, 0x008d0140, 0x00000080 },
+   { 0x00600041, 0x216077bd, 0x008d0160, 0x00000080 },
+   { 0x00600040, 0x214077bd, 0x008d0140, 0x0000008c },
+   { 0x00600040, 0x216077bd, 0x008d0160, 0x0000008c },
+   { 0x00600040, 0x218077bd, 0x008d0100, 0x00004024 },
+   { 0x00600040, 0x21a077bd, 0x008d0120, 0x00004024 },
+   { 0x00600041, 0x218077bd, 0x008d0180, 0x00000084 },
+   { 0x00600041, 0x21a077bd, 0x008d01a0, 0x00000084 },
+   { 0x00600040, 0x218077bd, 0x008d0180, 0x0000009c },
+   { 0x00600040, 0x21a077bd, 0x008d01a0, 0x0000009c },
+   { 0x00600001, 0x202003be, 0x008d0140, 0x00000000 },
+   { 0x00600001, 0x204003be, 0x008d0160, 0x00000000 },
+   { 0x00600001, 0x206003be, 0x008d0180, 0x00000000 },
+   { 0x00600001, 0x208003be, 0x008d01a0, 0x00000000 },
+   { 0x00800031, 0x22c01d29, 0x008d0000, 0x02580102 },
+   { 0x00600001, 0x23a00021, 0x008d03a0, 0x00000000 },
+   { 0x00600041, 0x21c077bd, 0x008d02c0, 0x008d0280 },
+   { 0x00600041, 0x21e077bd, 0x008d02e0, 0x008d02a0 },
+   { 0x00600041, 0x220077bd, 0x008d0300, 0x008d0280 },
+   { 0x00600041, 0x222077bd, 0x008d0320, 0x008d02a0 },
+   { 0x00600041, 0x224077bd, 0x008d0340, 0x008d0280 },
+   { 0x00600041, 0x226077bd, 0x008d0360, 0x008d02a0 },
+   { 0x00600041, 0x228077bd, 0x008d0380, 0x008d0280 },
+   { 0x00600041, 0x22a077bd, 0x008d03a0, 0x008d02a0 },
+   { 0x00600001, 0x204003be, 0x008d01c0, 0x00000000 },
+   { 0x00600001, 0x206003be, 0x008d0200, 0x00000000 },
+   { 0x00600001, 0x208003be, 0x008d0240, 0x00000000 },
+   { 0x00600001, 0x20a003be, 0x008d0280, 0x00000000 },
+   { 0x00600001, 0x20c003be, 0x008d01e0, 0x00000000 },
+   { 0x00600001, 0x20e003be, 0x008d0220, 0x00000000 },
+   { 0x00600001, 0x210003be, 0x008d0260, 0x00000000 },
+   { 0x00600001, 0x212003be, 0x008d02a0, 0x00000000 },
+   { 0x00600201, 0x20200022, 0x008d0020, 0x00000000 },
+   { 0x00800031, 0x24001d28, 0x008d0000, 0x85a04800 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
diff --git a/src/exa_wm_masknoca.g4a b/src/exa_wm_masknoca.g4a
index 2e9e3c9..44f6953 100644
--- a/src/exa_wm_masknoca.g4a
+++ b/src/exa_wm_masknoca.g4a
@@ -58,44 +58,44 @@
 
     /* Set up ss0.x coordinates*/
 mov (1) g6<1>F g1.8<0,1,0>UW { align1 };
-add (1) g6.4<1>F g1.8<0,1,0>UW 1UB { align1 };
+add (1) g6.4<1>F g1.8<0,1,0>UW 1UD { align1 };
 mov (1) g6.8<1>F g1.8<0,1,0>UW { align1 };
-add (1) g6.12<1>F g1.8<0,1,0>UW 1UB { align1 };
+add (1) g6.12<1>F g1.8<0,1,0>UW 1UD { align1 };
     /* Set up ss0.y coordinates */
 mov (1) g8<1>F g1.10<0,1,0>UW { align1 };
 mov (1) g8.4<1>F g1.10<0,1,0>UW { align1 };
-add (1) g8.8<1>F g1.10<0,1,0>UW 1UB { align1 };
-add (1) g8.12<1>F g1.10<0,1,0>UW 1UB { align1 };
+add (1) g8.8<1>F g1.10<0,1,0>UW 1UD { align1 };
+add (1) g8.12<1>F g1.10<0,1,0>UW 1UD { align1 };
     /* set up ss1.x coordinates */
 mov (1) g6.16<1>F g1.12<0,1,0>UW { align1 };
-add (1) g6.20<1>F g1.12<0,1,0>UW 1UB { align1 };
+add (1) g6.20<1>F g1.12<0,1,0>UW 1UD { align1 };
 mov (1) g6.24<1>F g1.12<0,1,0>UW { align1 };
-add (1) g6.28<1>F g1.12<0,1,0>UW 1UB { align1 };
+add (1) g6.28<1>F g1.12<0,1,0>UW 1UD { align1 };
     /* set up ss1.y coordinates */
 mov (1) g8.16<1>F g1.14<0,1,0>UW { align1 };
 mov (1) g8.20<1>F g1.14<0,1,0>UW { align1 };
-add (1) g8.24<1>F g1.14<0,1,0>UW 1UB { align1 };
-add (1) g8.28<1>F g1.14<0,1,0>UW 1UB { align1 };
+add (1) g8.24<1>F g1.14<0,1,0>UW 1UD { align1 };
+add (1) g8.28<1>F g1.14<0,1,0>UW 1UD { align1 };
     /* Set up ss2.x coordinates */
 mov (1) g7<1>F g1.16<0,1,0>UW { align1 };
-add (1) g7.4<1>F g1.16<0,1,0>UW 1UB { align1 };
+add (1) g7.4<1>F g1.16<0,1,0>UW 1UD { align1 };
 mov (1) g7.8<1>F g1.16<0,1,0>UW { align1 };
-add (1) g7.12<1>F g1.16<0,1,0>UW 1UB { align1 };
+add (1) g7.12<1>F g1.16<0,1,0>UW 1UD { align1 };
     /* Set up ss2.y coordinates */
 mov (1) g9<1>F g1.18<0,1,0>UW { align1 };
 mov (1) g9.4<1>F g1.18<0,1,0>UW { align1 };
-add (1) g9.8<1>F g1.18<0,1,0>UW 1UB { align1 };
-add (1) g9.12<1>F g1.18<0,1,0>UW 1UB { align1 };
+add (1) g9.8<1>F g1.18<0,1,0>UW 1UD { align1 };
+add (1) g9.12<1>F g1.18<0,1,0>UW 1UD { align1 };
     /* Set up ss3.x coordinates */
 mov (1) g7.16<1>F g1.20<0,1,0>UW { align1 };
-add (1) g7.20<1>F g1.20<0,1,0>UW 1UB { align1 };
+add (1) g7.20<1>F g1.20<0,1,0>UW 1UD { align1 };
 mov (1) g7.24<1>F g1.20<0,1,0>UW { align1 };
-add (1) g7.28<1>F g1.20<0,1,0>UW 1UB { align1 };
+add (1) g7.28<1>F g1.20<0,1,0>UW 1UD { align1 };
     /* Set up ss3.y coordinates */
 mov (1) g9.16<1>F g1.22<0,1,0>UW { align1 };
 mov (1) g9.20<1>F g1.22<0,1,0>UW { align1 };
-add (1) g9.24<1>F g1.22<0,1,0>UW 1UB { align1 };
-add (1) g9.28<1>F g1.22<0,1,0>UW 1UB { align1 };
+add (1) g9.24<1>F g1.22<0,1,0>UW 1UD { align1 };
+add (1) g9.28<1>F g1.22<0,1,0>UW 1UD { align1 };
 
     /* Now, map these screen space coordinates into texture coordinates. */
 /* This is for src texture */
diff --git a/src/exa_wm_masknoca.g4b b/src/exa_wm_masknoca.g4b
new file mode 100644
index 0000000..5fcf3b5
--- /dev/null
+++ b/src/exa_wm_masknoca.g4b
@@ -0,0 +1,95 @@
+   { 0x00000001, 0x20c0013d, 0x00000028, 0x00000000 },
+   { 0x00000040, 0x20c40d3d, 0x00000028, 0x00000001 },
+   { 0x00000001, 0x20c8013d, 0x00000028, 0x00000000 },
+   { 0x00000040, 0x20cc0d3d, 0x00000028, 0x00000001 },
+   { 0x00000001, 0x2100013d, 0x0000002a, 0x00000000 },
+   { 0x00000001, 0x2104013d, 0x0000002a, 0x00000000 },
+   { 0x00000040, 0x21080d3d, 0x0000002a, 0x00000001 },
+   { 0x00000040, 0x210c0d3d, 0x0000002a, 0x00000001 },
+   { 0x00000001, 0x20d0013d, 0x0000002c, 0x00000000 },
+   { 0x00000040, 0x20d40d3d, 0x0000002c, 0x00000001 },
+   { 0x00000001, 0x20d8013d, 0x0000002c, 0x00000000 },
+   { 0x00000040, 0x20dc0d3d, 0x0000002c, 0x00000001 },
+   { 0x00000001, 0x2110013d, 0x0000002e, 0x00000000 },
+   { 0x00000001, 0x2114013d, 0x0000002e, 0x00000000 },
+   { 0x00000040, 0x21180d3d, 0x0000002e, 0x00000001 },
+   { 0x00000040, 0x211c0d3d, 0x0000002e, 0x00000001 },
+   { 0x00000001, 0x20e0013d, 0x00000030, 0x00000000 },
+   { 0x00000040, 0x20e40d3d, 0x00000030, 0x00000001 },
+   { 0x00000001, 0x20e8013d, 0x00000030, 0x00000000 },
+   { 0x00000040, 0x20ec0d3d, 0x00000030, 0x00000001 },
+   { 0x00000001, 0x2120013d, 0x00000032, 0x00000000 },
+   { 0x00000001, 0x2124013d, 0x00000032, 0x00000000 },
+   { 0x00000040, 0x21280d3d, 0x00000032, 0x00000001 },
+   { 0x00000040, 0x212c0d3d, 0x00000032, 0x00000001 },
+   { 0x00000001, 0x20f0013d, 0x00000034, 0x00000000 },
+   { 0x00000040, 0x20f40d3d, 0x00000034, 0x00000001 },
+   { 0x00000001, 0x20f8013d, 0x00000034, 0x00000000 },
+   { 0x00000040, 0x20fc0d3d, 0x00000034, 0x00000001 },
+   { 0x00000001, 0x2130013d, 0x00000036, 0x00000000 },
+   { 0x00000001, 0x2134013d, 0x00000036, 0x00000000 },
+   { 0x00000040, 0x21380d3d, 0x00000036, 0x00000001 },
+   { 0x00000040, 0x213c0d3d, 0x00000036, 0x00000001 },
+   { 0x00600040, 0x214077bd, 0x008d00c0, 0x00004020 },
+   { 0x00600040, 0x216077bd, 0x008d00e0, 0x00004020 },
+   { 0x00600041, 0x214077bd, 0x008d0140, 0x00000060 },
+   { 0x00600041, 0x216077bd, 0x008d0160, 0x00000060 },
+   { 0x00600040, 0x214077bd, 0x008d0140, 0x0000006c },
+   { 0x00600040, 0x216077bd, 0x008d0160, 0x0000006c },
+   { 0x00600040, 0x218077bd, 0x008d0100, 0x00004024 },
+   { 0x00600040, 0x21a077bd, 0x008d0120, 0x00004024 },
+   { 0x00600041, 0x218077bd, 0x008d0180, 0x00000064 },
+   { 0x00600041, 0x21a077bd, 0x008d01a0, 0x00000064 },
+   { 0x00600040, 0x218077bd, 0x008d0180, 0x0000007c },
+   { 0x00600040, 0x21a077bd, 0x008d01a0, 0x0000007c },
+   { 0x00600001, 0x202003be, 0x008d0140, 0x00000000 },
+   { 0x00600001, 0x204003be, 0x008d0160, 0x00000000 },
+   { 0x00600001, 0x206003be, 0x008d0180, 0x00000000 },
+   { 0x00600001, 0x208003be, 0x008d01a0, 0x00000000 },
+   { 0x00800031, 0x21c01d29, 0x008d0000, 0x02580001 },
+   { 0x00600001, 0x22a00021, 0x008d02a0, 0x00000000 },
+   { 0x00600040, 0x214077bd, 0x008d00c0, 0x00004020 },
+   { 0x00600040, 0x216077bd, 0x008d00e0, 0x00004020 },
+   { 0x00600041, 0x214077bd, 0x008d0140, 0x00000080 },
+   { 0x00600041, 0x216077bd, 0x008d0160, 0x00000080 },
+   { 0x00600040, 0x214077bd, 0x008d0140, 0x0000008c },
+   { 0x00600040, 0x216077bd, 0x008d0160, 0x0000008c },
+   { 0x00600040, 0x218077bd, 0x008d0100, 0x00004024 },
+   { 0x00600040, 0x21a077bd, 0x008d0120, 0x00004024 },
+   { 0x00600041, 0x218077bd, 0x008d0180, 0x00000084 },
+   { 0x00600041, 0x21a077bd, 0x008d01a0, 0x00000084 },
+   { 0x00600040, 0x218077bd, 0x008d0180, 0x0000009c },
+   { 0x00600040, 0x21a077bd, 0x008d01a0, 0x0000009c },
+   { 0x00600001, 0x202003be, 0x008d0140, 0x00000000 },
+   { 0x00600001, 0x204003be, 0x008d0160, 0x00000000 },
+   { 0x00600001, 0x206003be, 0x008d0180, 0x00000000 },
+   { 0x00600001, 0x208003be, 0x008d01a0, 0x00000000 },
+   { 0x00800031, 0x22c01d29, 0x008d0000, 0x02580102 },
+   { 0x00600001, 0x23a00021, 0x008d03a0, 0x00000000 },
+   { 0x00600041, 0x21c077bd, 0x008d01c0, 0x008d0380 },
+   { 0x00600041, 0x21e077bd, 0x008d01e0, 0x008d03a0 },
+   { 0x00600041, 0x220077bd, 0x008d0200, 0x008d0380 },
+   { 0x00600041, 0x222077bd, 0x008d0220, 0x008d03a0 },
+   { 0x00600041, 0x224077bd, 0x008d0240, 0x008d0380 },
+   { 0x00600041, 0x226077bd, 0x008d0260, 0x008d03a0 },
+   { 0x00600041, 0x228077bd, 0x008d0280, 0x008d0380 },
+   { 0x00600041, 0x22a077bd, 0x008d02a0, 0x008d03a0 },
+   { 0x00600001, 0x204003be, 0x008d01c0, 0x00000000 },
+   { 0x00600001, 0x206003be, 0x008d0200, 0x00000000 },
+   { 0x00600001, 0x208003be, 0x008d0240, 0x00000000 },
+   { 0x00600001, 0x20a003be, 0x008d0280, 0x00000000 },
+   { 0x00600001, 0x20c003be, 0x008d01e0, 0x00000000 },
+   { 0x00600001, 0x20e003be, 0x008d0220, 0x00000000 },
+   { 0x00600001, 0x210003be, 0x008d0260, 0x00000000 },
+   { 0x00600001, 0x212003be, 0x008d02a0, 0x00000000 },
+   { 0x00600201, 0x20200022, 0x008d0020, 0x00000000 },
+   { 0x00800031, 0x24001d28, 0x008d0000, 0x85a04800 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
diff --git a/src/exa_wm_noca.g4a b/src/exa_wm_noca.g4a
new file mode 100644
index 0000000..7dd1224
--- /dev/null
+++ b/src/exa_wm_noca.g4a
@@ -0,0 +1,38 @@
+/*
+ * Copyright © 2006 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Wang Zhenyu <zhenyu.z.wang at intel.com>
+ *    Keith Packard <keithp at keithp.com>
+ */
+ 
+/*
+ * Composite src and mask together, no component alpha
+ */
+
+include(`exa_wm.g4i')
+/* multiply src by the mask's alpha channel */
+
+mul (16)    src_sample0<1>F src_sample0<8,8,1>F	mask_sample6<8,8,1>F	{ compr align1 };
+mul (16)    src_sample2<1>F src_sample2<8,8,1>F	mask_sample6<8,8,1>F	{ compr align1 };
+mul (16)    src_sample4<1>F src_sample4<8,8,1>F	mask_sample6<8,8,1>F	{ compr align1 };
+mul (16)    src_sample6<1>F src_sample6<8,8,1>F	mask_sample6<8,8,1>F	{ compr align1 };
diff --git a/src/exa_wm_noca.g4b b/src/exa_wm_noca.g4b
new file mode 100644
index 0000000..ba01d1a
--- /dev/null
+++ b/src/exa_wm_noca.g4b
@@ -0,0 +1,4 @@
+   { 0x00802041, 0x224077bd, 0x008d0240, 0x008d0400 },
+   { 0x00802041, 0x228077bd, 0x008d0280, 0x008d0400 },
+   { 0x00802041, 0x22c077bd, 0x008d02c0, 0x008d0400 },
+   { 0x00802041, 0x230077bd, 0x008d0300, 0x008d0400 },
diff --git a/src/exa_wm_projective.g4i b/src/exa_wm_projective.g4i
new file mode 100644
index 0000000..13da99c
--- /dev/null
+++ b/src/exa_wm_projective.g4i
@@ -0,0 +1,51 @@
+/*
+ * Copyright © 2006 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Wang Zhenyu <zhenyu.z.wang at intel.com>
+ *    Keith Packard <keithp at keithp.com>
+ */
+
+/********** Compute w *************/
+
+mul (16)	temp_x<1>F	dst_x<8,8,1>F	dw_dx		{ compr align1 };
+mul (16)	temp_y<1>F	dst_y<8,8,1>F	dw_dy		{ compr align1 };
+add (16)	temp_x<1>F	temp_x<8,8,1>F	temp_y<8,8,1>F	{ compr align1 };
+add (16)	temp_x<1>F	temp_x<8,8,1>F	wo		{ compr align1 };
+send (8) 0	w_0<1>F		temp_x_0<8,8,1>F math inv mlen 1 rlen 1	{ compr align1 };
+send (8) 0	w_1<1>F		temp_x_1<8,8,1>F math inv mlen 1 rlen 1	{ compr align1 };
+
+/********** Compute u *************/
+
+mul (16)	temp_x<1>F	dst_x<8,8,1>F	du_dx		{ compr align1 };
+mul (16)	temp_y<1>F	dst_y<8,8,1>F	du_dy		{ compr align1 };
+add (16)	temp_x<1>F	temp_x<8,8,1>F	temp_y<8,8,1>F	{ compr align1 };
+add (16)	temp_x<1>F	temp_x<8,8,1>F	uo		{ compr align1 };
+mul (16)	u<1>F		temp_x<8,8,1>F	w<8,8,1>F	{ compr align1 };
+
+/********** Compute v *************/
+
+mul (16)	temp_x<1>F	dst_x<8,8,1>F	dv_dx		{ compr align1 };
+mul (16)	temp_y<1>F	dst_y<8,8,1>F	dv_dy		{ compr align1 };
+add (16)	temp_x<1>F	temp_x<8,8,1>F	temp_y<8,8,1>F	{ compr align1 };
+add (16)	temp_x<1>F	temp_x<8,8,1>F	vo		{ compr align1 };
+mul (16)	v<1>F		temp_x<8,8,1>F	w<8,8,1>F	{ compr align1 };
diff --git a/src/exa_wm_src_affine.g4a b/src/exa_wm_src_affine.g4a
new file mode 100644
index 0000000..3bf8717
--- /dev/null
+++ b/src/exa_wm_src_affine.g4a
@@ -0,0 +1,41 @@
+/*
+ * Copyright © 2006 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Wang Zhenyu <zhenyu.z.wang at intel.com>
+ *    Keith Packard <keithp at keithp.com>
+ */
+
+/*
+ * Fragment to compute src u/v values under an affine transform
+ */
+
+include(`exa_wm.g4i')
+define(`du_dx',	`src_du_dx')
+define(`du_dy',	`src_du_dy')
+define(`uo',	`src_uo')
+define(`dv_dx',	`src_dv_dx')
+define(`dv_dy',	`src_dv_dy')
+define(`vo',	`src_vo')
+define(`u',	`src_u')
+define(`v',	`src_v')
+include(`exa_wm_affine.g4i')
diff --git a/src/exa_wm_src_affine.g4b b/src/exa_wm_src_affine.g4b
new file mode 100644
index 0000000..f18ea1e
--- /dev/null
+++ b/src/exa_wm_src_affine.g4b
@@ -0,0 +1,8 @@
+   { 0x00802041, 0x218077bd, 0x008d0100, 0x00000060 },
+   { 0x00802041, 0x21c077bd, 0x008d0140, 0x00000064 },
+   { 0x00802040, 0x218077bd, 0x008d0180, 0x008d01c0 },
+   { 0x00802040, 0x202077be, 0x008d0180, 0x0000006c },
+   { 0x00802041, 0x218077bd, 0x008d0100, 0x00000070 },
+   { 0x00802041, 0x21c077bd, 0x008d0140, 0x00000074 },
+   { 0x00802040, 0x218077bd, 0x008d0180, 0x008d01c0 },
+   { 0x00802040, 0x206077be, 0x008d0180, 0x0000007c },
diff --git a/src/exa_wm_src_projective.g4a b/src/exa_wm_src_projective.g4a
new file mode 100644
index 0000000..6bd2d6a
--- /dev/null
+++ b/src/exa_wm_src_projective.g4a
@@ -0,0 +1,45 @@
+/*
+ * Copyright © 2006 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Wang Zhenyu <zhenyu.z.wang at intel.com>
+ *    Keith Packard <keithp at keithp.com>
+ */
+
+
+include(`exa_wm.g4i')
+define(`du_dx',	`src_du_dx')
+define(`du_dy',	`src_du_dy')
+define(`uo',	`src_uo')
+define(`dv_dx',	`src_dv_dx')
+define(`dv_dy',	`src_dv_dy')
+define(`vo',	`src_vo')
+define(`dw_dx',	`src_dw_dx')
+define(`dw_dy',	`src_dw_dy')
+define(`wo',	`src_wo')
+define(`u',	`src_u')
+define(`v',	`src_v')
+define(`w',	`src_w')
+define(`w_0',	`src_w_0')
+define(`w_1',	`src_w_1')
+
+include(`exa_wm_projective.g4i')
diff --git a/src/exa_wm_src_projective.g4b b/src/exa_wm_src_projective.g4b
new file mode 100644
index 0000000..68bfc92
--- /dev/null
+++ b/src/exa_wm_src_projective.g4b
@@ -0,0 +1,16 @@
+   { 0x00802041, 0x218077bd, 0x008d0100, 0x00000080 },
+   { 0x00802041, 0x21c077bd, 0x008d0140, 0x00000084 },
+   { 0x00802040, 0x218077bd, 0x008d0180, 0x008d01c0 },
+   { 0x00802040, 0x218077bd, 0x008d0180, 0x0000008c },
+   { 0x00600031, 0x22001fbd, 0x008d0180, 0x01110001 },
+   { 0x00600031, 0x22201fbd, 0x008d01a0, 0x01110001 },
+   { 0x00802041, 0x218077bd, 0x008d0100, 0x00000060 },
+   { 0x00802041, 0x21c077bd, 0x008d0140, 0x00000064 },
+   { 0x00802040, 0x218077bd, 0x008d0180, 0x008d01c0 },
+   { 0x00802040, 0x218077bd, 0x008d0180, 0x0000006c },
+   { 0x00802041, 0x202077be, 0x008d0180, 0x008d0200 },
+   { 0x00802041, 0x218077bd, 0x008d0100, 0x00000070 },
+   { 0x00802041, 0x21c077bd, 0x008d0140, 0x00000074 },
+   { 0x00802040, 0x218077bd, 0x008d0180, 0x008d01c0 },
+   { 0x00802040, 0x218077bd, 0x008d0180, 0x0000007c },
+   { 0x00802041, 0x206077be, 0x008d0180, 0x008d0200 },
diff --git a/src/exa_wm_src_sample.g4a b/src/exa_wm_src_sample.g4a
new file mode 100644
index 0000000..04cd3e3
--- /dev/null
+++ b/src/exa_wm_src_sample.g4a
@@ -0,0 +1,49 @@
+/*
+ * Copyright © 2006 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Wang Zhenyu <zhenyu.z.wang at intel.com>
+ *    Keith Packard <keithp at keithp.com>
+ */
+
+/* Sample the src surface */
+
+include(`exa_wm.g4i')
+
+/* prepare sampler read back gX register, which would be written back to output */
+
+/* use simd16 sampler, param 0 is u, param 1 is v. */
+/* 'payload' loading, assuming tex coord start from g4 */
+
+/* m0 will be copied with g0, as it contains send desc */
+/* emit sampler 'send' cmd */
+send (16) 0			/* msg reg index */
+	src_sample0<1>UW 	/* readback */
+	g0<8,8,1>UW		/* copy to msg start reg*/
+	sampler (1,0,F)		/* sampler message description, (binding_table,sampler_index,datatype) */
+				/* here(src->dst) we should use src_sampler and src_surface */
+	mlen 5 rlen 8 { align1 };   /* required message len 5, readback len 8 */
+
+// mov (8)  src_sample7<1>UD	src_sample7<8,8,1>UD	    { align1 };  /* wait sampler return */
+
+/* if we set up read-back reg correctly, emit dataport write 'send' cmd with EOT */
+
diff --git a/src/exa_wm_src_sample.g4b b/src/exa_wm_src_sample.g4b
new file mode 100644
index 0000000..5ca33f5
--- /dev/null
+++ b/src/exa_wm_src_sample.g4b
@@ -0,0 +1 @@
+   { 0x00800031, 0x22401d29, 0x008d0000, 0x02580001 },
diff --git a/src/exa_wm_write.g4a b/src/exa_wm_write.g4a
new file mode 100644
index 0000000..9a821d7
--- /dev/null
+++ b/src/exa_wm_write.g4a
@@ -0,0 +1,80 @@
+/*
+ * Copyright © 2006 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Wang Zhenyu <zhenyu.z.wang at intel.com>
+ *    Keith Packard <keithp at keithp.com>
+ */
+
+/* 
+ * Once the data are ready, write them to the destination
+ */
+
+include(`exa_wm.g4i')
+
+/* m0 and m1 are both passed directly from the PS thread payload */
+mov (8) m1<1>F g1<8,8,1>F { align1 };
+
+/* prepare data in m2-m5 for subspan(1,0), m6-m9 for subspan(3,2), then it's ready to write */
+/* src_sample0 -> m2
+   src_sample1 -> m6
+   src_sample2 -> m3
+   src_sample3 -> m7
+   src_sample4 -> m4
+   src_sample5 -> m8
+   src_sample6 -> m5
+   src_sample7 -> m9
+*/
+
+mov (8) m2<1>F src_sample0<8,8,1>F { align1 };
+mov (8) m3<1>F src_sample2<8,8,1>F { align1 };
+mov (8) m4<1>F src_sample4<8,8,1>F { align1 };
+mov (8) m5<1>F src_sample6<8,8,1>F { align1 };
+mov (8) m6<1>F src_sample1<8,8,1>F { align1 };
+mov (8) m7<1>F src_sample3<8,8,1>F { align1 };
+mov (8) m8<1>F src_sample5<8,8,1>F { align1 };
+mov (8) m9<1>F src_sample7<8,8,1>F { align1 };
+
+/* m0 and m1 are both passed directly from the PS thread payload */
+mov (8) m1<1>UD g1<8,8,1>UD { align1 mask_disable };
+
+/* write */
+send (16) 0 acc0<1>UW g0<8,8,1>UW write (
+	0,  /* binding_table */
+	8,  /* pixel scoreboard clear, msg type simd16 single source */
+	4,  /* render target write */
+	0   /* no write commit message */
+	) 
+	mlen 10
+	rlen 0
+	{ align1 EOT };
+
+nop;
+nop;
+nop;
+nop;
+nop;
+nop;
+nop;
+nop;
+nop;
+
diff --git a/src/exa_wm_write.g4b b/src/exa_wm_write.g4b
new file mode 100644
index 0000000..dd266a3
--- /dev/null
+++ b/src/exa_wm_write.g4b
@@ -0,0 +1,20 @@
+   { 0x00600001, 0x202003be, 0x008d0020, 0x00000000 },
+   { 0x00600001, 0x204003be, 0x008d0240, 0x00000000 },
+   { 0x00600001, 0x206003be, 0x008d0280, 0x00000000 },
+   { 0x00600001, 0x208003be, 0x008d02c0, 0x00000000 },
+   { 0x00600001, 0x20a003be, 0x008d0300, 0x00000000 },
+   { 0x00600001, 0x20c003be, 0x008d0260, 0x00000000 },
+   { 0x00600001, 0x20e003be, 0x008d02a0, 0x00000000 },
+   { 0x00600001, 0x210003be, 0x008d02e0, 0x00000000 },
+   { 0x00600001, 0x212003be, 0x008d0320, 0x00000000 },
+   { 0x00600201, 0x20200022, 0x008d0020, 0x00000000 },
+   { 0x00800031, 0x24001d28, 0x008d0000, 0x85a04800 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
diff --git a/src/exa_wm_xy.g4a b/src/exa_wm_xy.g4a
new file mode 100644
index 0000000..e99f5ac
--- /dev/null
+++ b/src/exa_wm_xy.g4a
@@ -0,0 +1,52 @@
+/*
+ * Copyright © 2006 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Wang Zhenyu <zhenyu.z.wang at intel.com>
+ *    Keith Packard <keithp at keithp.com>
+ */
+ 
+/*
+ * Register assignments:
+ *
+ *  x		    g6/g7
+ *  y		    g8/g9
+ *
+ *  temp x	    g10/g11
+ *  temp y	    g12/g13
+ *
+ *  src w	    g14/g15
+ *  src u	    m1/m2
+ *  src v	    m3/m4
+ */
+ 
+/* Fragment to compute per-pixel XY values */
+
+include(`exa_wm.g4i')
+    
+    /* Load X and Y coordinates and compute per-pixel coordinates */
+add (16)	temp_x_uw<1>UW	dst_x_uw		0x10101010V	{ align1 };
+add (16)	temp_y_uw<1>UW	dst_y_uw		0x11001100V	{ align1 };
+
+    /* subtract screen-space origin of vertex 0 */
+add (16)	dst_x<1>F	temp_x_uw<8,8,1>UW	-screen_x0	{ compr align1 };
+add (16)	dst_y<1>F	temp_y_uw<8,8,1>UW	-screen_y0	{ compr align1 };
diff --git a/src/exa_wm_xy.g4b b/src/exa_wm_xy.g4b
new file mode 100644
index 0000000..7784a3d
--- /dev/null
+++ b/src/exa_wm_xy.g4b
@@ -0,0 +1,4 @@
+   { 0x00800040, 0x21806d29, 0x00480028, 0x10101010 },
+   { 0x00800040, 0x21c06d29, 0x0048002a, 0x11001100 },
+   { 0x00802040, 0x2100753d, 0x008d0180, 0x00004020 },
+   { 0x00802040, 0x2140753d, 0x008d01c0, 0x00004024 },
diff --git a/src/i965_render.c b/src/i965_render.c
index 26c06aa..7668779 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -321,31 +321,68 @@ static const uint32_t sip_kernel_static[][4] = {
 #define SF_MAX_THREADS	   2
 
 static const uint32_t sf_kernel_static[][4] = {
-#include "exa_sf_prog.h"
+#include "exa_sf.g4b"
 };
 
 static const uint32_t sf_kernel_static_mask[][4] = {
-#include "exa_sf_mask_prog.h"
+#include "exa_sf_mask.g4b"
 };
 
 /* ps kernels */
 #define PS_KERNEL_NUM_GRF   32
 #define PS_MAX_THREADS	   32
 
-static const uint32_t ps_kernel_static_nomask [][4] = {
-#include "exa_wm_nomask_prog.h"
+static const uint32_t ps_kernel_static_nomask_affine [][4] = {
+#include "exa_wm_xy.g4b"
+#include "exa_wm_src_affine.g4b"
+#include "exa_wm_src_sample.g4b"
+#include "exa_wm_write.g4b"
+};
+
+static const uint32_t ps_kernel_static_nomask_projective [][4] = {
+#include "exa_wm_xy.g4b"
+#include "exa_wm_src_projective.g4b"
+#include "exa_wm_src_sample.g4b"
+#include "exa_wm_write.g4b"
 };
 
 static const uint32_t ps_kernel_static_maskca [][4] = {
-#include "exa_wm_maskca_prog.h"
+#include "exa_wm_maskca.g4b"
+#if 0
+#include "exa_wm_xy.g4b"
+#include "exa_wm_src_affine.g4b"
+#include "exa_wm_src_sample.g4b"
+#include "exa_wm_mask_affine.g4b"
+#include "exa_wm_mask_sample.g4b"
+#include "exa_wm_ca.g4b"
+#include "exa_wm_write.g4b"
+#endif
 };
 
 static const uint32_t ps_kernel_static_maskca_srcalpha [][4] = {
-#include "exa_wm_maskca_srcalpha_prog.h"
+#include "exa_wm_maskca_srcalpha.g4b"
+#if 0
+#include "exa_wm_xy.g4b"
+#include "exa_wm_src_affine.g4b"
+#include "exa_wm_src_sample.g4b"
+#include "exa_wm_mask_affine.g4b"
+#include "exa_wm_mask_sample.g4b"
+#include "exa_wm_ca_srcalpha.g4b"
+#include "exa_wm_write.g4b"
+#endif
 };
 
 static const uint32_t ps_kernel_static_masknoca [][4] = {
-#include "exa_wm_masknoca_prog.h"
+#include "exa_wm_masknoca.g4b"
+#if 0
+#include "exa_wm_xy.g4b"
+#include "exa_wm_src_affine.g4b"
+#include "exa_wm_src_sample.g4b"
+#include "exa_wm_mask_affine.g4b"
+#include "exa_wm_mask_sample.g4b"
+#include "exa_wm_noca.g4b"
+#include "exa_wm_write.g4b"
+#endif
 };
 
 static uint32_t 
@@ -374,6 +411,7 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
 	mask_tiled = 0;
     uint32_t dst_format, dst_offset, dst_pitch, dst_tile_format = 0,
 	dst_tiled = 0;
+    Bool is_affine_src, is_affine_mask, is_affine;
 
     IntelEmitInvarientState(pScrn);
     *pI830->last_3d = LAST_3D_RENDER;
@@ -402,6 +440,9 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
     pI830->scale_units[0][1] = pSrc->drawable.height;
 
     pI830->transform[0] = pSrcPicture->transform;
+    is_affine_src = i830_transform_is_affine (pI830->transform[0]);
+    is_affine_mask = i830_transform_is_affine (pI830->transform[1]);
+    is_affine = is_affine_src && is_affine_mask;
 
     if (!pMask) {
 	pI830->transform[1] = NULL;
@@ -460,7 +501,10 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
 	    next_offset = ps_kernel_offset + 
                           sizeof(ps_kernel_static_masknoca);
     } else {
-   	next_offset = ps_kernel_offset + sizeof (ps_kernel_static_nomask);
+	if (is_affine)
+	    next_offset = ps_kernel_offset + sizeof (ps_kernel_static_nomask_affine);
+	else
+	    next_offset = ps_kernel_offset + sizeof (ps_kernel_static_nomask_projective);
     }
 
     sip_kernel_offset = ALIGN(next_offset, 64);
@@ -837,8 +881,12 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
    	    memcpy(ps_kernel, ps_kernel_static_masknoca,
 		   sizeof (ps_kernel_static_masknoca));
     } else {
-   	memcpy(ps_kernel, ps_kernel_static_nomask,
-	       sizeof (ps_kernel_static_nomask));
+	if (is_affine)
+	    memcpy(ps_kernel, ps_kernel_static_nomask_affine,
+		   sizeof (ps_kernel_static_nomask_affine));
+	else
+	    memcpy(ps_kernel, ps_kernel_static_nomask_projective,
+		   sizeof (ps_kernel_static_nomask_projective));
     }
 
     wm_state = &wm_state_local;
@@ -989,51 +1037,75 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
 	ADVANCE_BATCH();
     }
     {
-        int nelem = pMask ? 3: 2;
+	/* 
+	 * number of extra parameters per vertex
+	 */
+        int nelem = pMask ? 2: 1;
+	/* 
+	 * size of extra parameters:
+	 *  3 for homogeneous (xyw)
+	 *  2 for cartesian (xy)
+	 */
+	int selem = is_affine ? 2 : 3;
+	uint32_t    w_component;
+	uint32_t    src_format;
+	
+	if (is_affine)
+	{
+	    src_format = BRW_SURFACEFORMAT_R32G32_FLOAT;
+	    w_component = BRW_VFCOMPONENT_NOSTORE;
+	}
+	else
+	{
+	    src_format = BRW_SURFACEFORMAT_R32G32B32_FLOAT;
+	    w_component = BRW_VFCOMPONENT_NOSTORE;
+	}
 	BEGIN_BATCH(pMask?12:10);
-	/* Set up the pointer to our vertex buffer */
+	/* Set up the pointer to our (single) vertex buffer */
 	OUT_BATCH(BRW_3DSTATE_VERTEX_BUFFERS | 3);
 	OUT_BATCH((0 << VB0_BUFFER_INDEX_SHIFT) |
 		  VB0_VERTEXDATA |
-		  ((4 * 2 * nelem) << VB0_BUFFER_PITCH_SHIFT));
+		  ((4 * (2 + nelem * selem)) << VB0_BUFFER_PITCH_SHIFT));
 	OUT_BATCH(state_base_offset + vb_offset);
         OUT_BATCH(3);
 	OUT_BATCH(0); // ignore for VERTEXDATA, but still there
 
 	/* Set up our vertex elements, sourced from the single vertex buffer.
 	 */
-	OUT_BATCH(BRW_3DSTATE_VERTEX_ELEMENTS | ((2 * nelem) - 1));
-	/* vertex coordinates */
-	OUT_BATCH((0 << VE0_VERTEX_BUFFER_INDEX_SHIFT) |
-		  VE0_VALID |
-		  (BRW_SURFACEFORMAT_R32G32_FLOAT << VE0_FORMAT_SHIFT) |
-		  (0 << VE0_OFFSET_SHIFT));
-	OUT_BATCH((BRW_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT) |
-		  (BRW_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT) |
-		  (BRW_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT) |
-		  (BRW_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT) |
-		  (4 << VE1_DESTINATION_ELEMENT_OFFSET_SHIFT));
-	/* u0, v0 */
+	
+	OUT_BATCH(BRW_3DSTATE_VERTEX_ELEMENTS | ((2 * (1 + nelem)) - 1));
+	/* x,y */
 	OUT_BATCH((0 << VE0_VERTEX_BUFFER_INDEX_SHIFT) |
 		  VE0_VALID |
 		  (BRW_SURFACEFORMAT_R32G32_FLOAT << VE0_FORMAT_SHIFT) |
-		  (8 << VE0_OFFSET_SHIFT)); /* offset vb in bytes */
-	OUT_BATCH((BRW_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT) |
-		  (BRW_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT) |
-		  (BRW_VFCOMPONENT_NOSTORE << VE1_VFCOMPONENT_2_SHIFT) |
-		  (BRW_VFCOMPONENT_NOSTORE << VE1_VFCOMPONENT_3_SHIFT) |
-		  (8 << VE1_DESTINATION_ELEMENT_OFFSET_SHIFT)); /* VUE offset in dwords */
-	/* u1, v1 */
+		  (0				<< VE0_OFFSET_SHIFT));
+	OUT_BATCH((BRW_VFCOMPONENT_STORE_SRC	<< VE1_VFCOMPONENT_0_SHIFT) |
+		  (BRW_VFCOMPONENT_STORE_SRC	<< VE1_VFCOMPONENT_1_SHIFT) |
+		  (BRW_VFCOMPONENT_STORE_1_FLT	<< VE1_VFCOMPONENT_2_SHIFT) |
+		  (BRW_VFCOMPONENT_STORE_1_FLT	<< VE1_VFCOMPONENT_3_SHIFT) |
+		  (4				<< VE1_DESTINATION_ELEMENT_OFFSET_SHIFT));
+	/* u0, v0, w0 */
+	OUT_BATCH((0				<< VE0_VERTEX_BUFFER_INDEX_SHIFT) |
+		  VE0_VALID					     |
+		  (src_format			<< VE0_FORMAT_SHIFT) |
+		  ((2 * 4)			<< VE0_OFFSET_SHIFT)); /* offset vb in bytes */
+	OUT_BATCH((BRW_VFCOMPONENT_STORE_SRC	<< VE1_VFCOMPONENT_0_SHIFT) |
+		  (BRW_VFCOMPONENT_STORE_SRC	<< VE1_VFCOMPONENT_1_SHIFT) |
+		  (w_component			<< VE1_VFCOMPONENT_2_SHIFT) |
+		  (BRW_VFCOMPONENT_NOSTORE	<< VE1_VFCOMPONENT_3_SHIFT) |
+		  ((4 + 4)			<< VE1_DESTINATION_ELEMENT_OFFSET_SHIFT)); /* VUE offset in dwords */
+	/* u1, v1, w1 */
    	if (pMask) {
-	    OUT_BATCH((0 << VE0_VERTEX_BUFFER_INDEX_SHIFT) |
-		      VE0_VALID |
-		      (BRW_SURFACEFORMAT_R32G32_FLOAT << VE0_FORMAT_SHIFT) |
-		      (16 << VE0_OFFSET_SHIFT));
-	    OUT_BATCH((BRW_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT) |
-		      (BRW_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT) |
-		      (BRW_VFCOMPONENT_NOSTORE << VE1_VFCOMPONENT_2_SHIFT) |
-		      (BRW_VFCOMPONENT_NOSTORE << VE1_VFCOMPONENT_3_SHIFT) |
-		      (10 << VE1_DESTINATION_ELEMENT_OFFSET_SHIFT));
+	    OUT_BATCH((0			    << VE0_VERTEX_BUFFER_INDEX_SHIFT) |
+		      VE0_VALID							    |
+		      (src_format		    << VE0_FORMAT_SHIFT) |
+		      (((2 + selem) * 4)    	    << VE0_OFFSET_SHIFT));  /* vb offset in bytes */
+	    
+	    OUT_BATCH((BRW_VFCOMPONENT_STORE_SRC    << VE1_VFCOMPONENT_0_SHIFT) |
+		      (BRW_VFCOMPONENT_STORE_SRC    << VE1_VFCOMPONENT_1_SHIFT) |
+		      (w_component		    << VE1_VFCOMPONENT_2_SHIFT) |
+		      (BRW_VFCOMPONENT_NOSTORE	    << VE1_VFCOMPONENT_3_SHIFT) |
+		      ((4 + 2 + 4)		    << VE1_DESTINATION_ELEMENT_OFFSET_SHIFT)); /* VUE offset in dwords */
    	}
 
 	ADVANCE_BATCH();
@@ -1053,38 +1125,87 @@ i965_composite(PixmapPtr pDst, int srcX, int srcY, int maskX, int maskY,
     ScrnInfoPtr pScrn = xf86Screens[pDst->drawable.pScreen->myNum];
     I830Ptr pI830 = I830PTR(pScrn);
     Bool has_mask;
-    float src_x[3], src_y[3], mask_x[3], mask_y[3];
+    Bool is_affine_src, is_affine_mask, is_affine;
+    float src_x[3], src_y[3], src_w[3], mask_x[3], mask_y[3], mask_w[3];
     int i;
+    int per_vertex = 2; /* dst x/y */
 
-    if (!i830_get_transformed_coordinates(srcX, srcY,
-					  pI830->transform[0],
-					  &src_x[0], &src_y[0]))
-	return;
-    if (!i830_get_transformed_coordinates(srcX, srcY + h,
-					  pI830->transform[0],
-					  &src_x[1], &src_y[1]))
-	return;
-    if (!i830_get_transformed_coordinates(srcX + w, srcY + h,
-					  pI830->transform[0],
-					  &src_x[2], &src_y[2]))
-	return;
+    is_affine_src = i830_transform_is_affine (pI830->transform[0]);
+    is_affine_mask = i830_transform_is_affine (pI830->transform[1]);
+    is_affine = is_affine_src && is_affine_mask;
+    
+    if (is_affine)
+    {
+	if (!i830_get_transformed_coordinates(srcX, srcY,
+					      pI830->transform[0],
+					      &src_x[0], &src_y[0]))
+	    return;
+	if (!i830_get_transformed_coordinates(srcX, srcY + h,
+					      pI830->transform[0],
+					      &src_x[1], &src_y[1]))
+	    return;
+	if (!i830_get_transformed_coordinates(srcX + w, srcY + h,
+					      pI830->transform[0],
+					      &src_x[2], &src_y[2]))
+	    return;
+	per_vertex += 2;    /* src u/v */
+    }
+    else
+    {
+	if (!i830_get_transformed_coordinates_3d(srcX, srcY,
+						 pI830->transform[0],
+						 &src_x[0], &src_y[0],
+						 &src_w[0]))
+	    return;
+	if (!i830_get_transformed_coordinates_3d(srcX, srcY + h,
+						 pI830->transform[0],
+						 &src_x[1], &src_y[1],
+						 &src_w[1]))
+	    return;
+	if (!i830_get_transformed_coordinates_3d(srcX + w, srcY + h,
+						 pI830->transform[0],
+						 &src_x[2], &src_y[2],
+						 &src_w[2]))
+	    return;
+	per_vertex += 3;    /* src u/v/w */
+    }
 
     if (pI830->scale_units[1][0] == -1 || pI830->scale_units[1][1] == -1) {
 	has_mask = FALSE;
     } else {
 	has_mask = TRUE;
-	if (!i830_get_transformed_coordinates(maskX, maskY,
-					      pI830->transform[1],
-					      &mask_x[0], &mask_y[0]))
-	    return;
-	if (!i830_get_transformed_coordinates(maskX, maskY + h,
-					      pI830->transform[1],
-					      &mask_x[1], &mask_y[1]))
-	    return;
-	if (!i830_get_transformed_coordinates(maskX + w, maskY + h,
-					      pI830->transform[1],
-					      &mask_x[2], &mask_y[2]))
-	    return;
+	if (is_affine_mask) {
+	    if (!i830_get_transformed_coordinates(maskX, maskY,
+						  pI830->transform[1],
+						  &mask_x[0], &mask_y[0]))
+		return;
+	    if (!i830_get_transformed_coordinates(maskX, maskY + h,
+						  pI830->transform[1],
+						  &mask_x[1], &mask_y[1]))
+		return;
+	    if (!i830_get_transformed_coordinates(maskX + w, maskY + h,
+						  pI830->transform[1],
+						  &mask_x[2], &mask_y[2]))
+		return;
+	    per_vertex += 2;	/* mask u/v */
+	} else {
+	    if (!i830_get_transformed_coordinates_3d(maskX, maskY,
+						     pI830->transform[1],
+						     &mask_x[0], &mask_y[0],
+						     &mask_w[0]))
+		return;
+	    if (!i830_get_transformed_coordinates_3d(maskX, maskY + h,
+						     pI830->transform[1],
+						     &mask_x[1], &mask_y[1],
+						     &mask_w[1]))
+		return;
+	    if (!i830_get_transformed_coordinates_3d(maskX + w, maskY + h,
+						     pI830->transform[1],
+						     &mask_x[2], &mask_y[2],
+						     &mask_w[2]))
+		return;
+	    per_vertex += 3;	/* mask u/v/w */
+	}
     }
 
     /* Wait for any existing composite rectangles to land before we overwrite
@@ -1098,9 +1219,13 @@ i965_composite(PixmapPtr pDst, int srcX, int srcY, int maskX, int maskY,
     vb[i++] = (float)(dstY + h);
     vb[i++] = src_x[2] / pI830->scale_units[0][0];
     vb[i++] = src_y[2] / pI830->scale_units[0][1];
+    if (!is_affine)
+	vb[i++] = src_w[2];
     if (has_mask) {
         vb[i++] = mask_x[2] / pI830->scale_units[1][0];
         vb[i++] = mask_y[2] / pI830->scale_units[1][1];
+	if (!is_affine)
+	    vb[i++] = mask_w[2];
     }
 
     /* rect (x1,y2) */
@@ -1108,9 +1233,13 @@ i965_composite(PixmapPtr pDst, int srcX, int srcY, int maskX, int maskY,
     vb[i++] = (float)(dstY + h);
     vb[i++] = src_x[1] / pI830->scale_units[0][0];
     vb[i++] = src_y[1] / pI830->scale_units[0][1];
+    if (!is_affine)
+	vb[i++] = src_w[1];
     if (has_mask) {
         vb[i++] = mask_x[1] / pI830->scale_units[1][0];
         vb[i++] = mask_y[1] / pI830->scale_units[1][1];
+	if (!is_affine)
+	    vb[i++] = mask_w[1];
     }
 
     /* rect (x1,y1) */
@@ -1118,9 +1247,13 @@ i965_composite(PixmapPtr pDst, int srcX, int srcY, int maskX, int maskY,
     vb[i++] = (float)dstY;
     vb[i++] = src_x[0] / pI830->scale_units[0][0];
     vb[i++] = src_y[0] / pI830->scale_units[0][1];
+    if (!is_affine)
+	vb[i++] = src_w[0];
     if (has_mask) {
         vb[i++] = mask_x[0] / pI830->scale_units[1][0];
         vb[i++] = mask_y[0] / pI830->scale_units[1][1];
+	if (!is_affine)
+	    vb[i++] = mask_w[0];
     }
 
     {
diff --git a/src/packed_yuv_sf.g4b b/src/packed_yuv_sf.g4b
new file mode 100644
index 0000000..830d176
--- /dev/null
+++ b/src/packed_yuv_sf.g4b
@@ -0,0 +1,17 @@
+   { 0x00000031, 0x20c01fbd, 0x0000002c, 0x01110081 },
+   { 0x00000031, 0x20c41fbd, 0x00000034, 0x01110081 },
+   { 0x00600040, 0x20e077bd, 0x008d0080, 0x008d4060 },
+   { 0x00000041, 0x20e077bd, 0x000000e0, 0x000000c0 },
+   { 0x00000041, 0x20e477bd, 0x000000e4, 0x000000c4 },
+   { 0x00600001, 0x202003be, 0x000000e0, 0x00000000 },
+   { 0x00600001, 0x204003be, 0x000000e4, 0x00000000 },
+   { 0x00600001, 0x206003be, 0x008d0060, 0x00000000 },
+   { 0x00600031, 0x20001fbc, 0x008d0000, 0x8640c800 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
diff --git a/src/packed_yuv_wm.g4a b/src/packed_yuv_wm.g4a
index 5e31f10..9e635ba 100644
--- a/src/packed_yuv_wm.g4a
+++ b/src/packed_yuv_wm.g4a
@@ -49,44 +49,44 @@
 
     /* Set up ss0.x coordinates*/
 mov (1) g4<1>F g1.8<0,1,0>UW { align1 };
-add (1) g4.4<1>F g1.8<0,1,0>UW 1UB { align1 };
+add (1) g4.4<1>F g1.8<0,1,0>UW 1UD { align1 };
 mov (1) g4.8<1>F g1.8<0,1,0>UW { align1 };
-add (1) g4.12<1>F g1.8<0,1,0>UW 1UB { align1 };
+add (1) g4.12<1>F g1.8<0,1,0>UW 1UD { align1 };
     /* Set up ss0.y coordinates */
 mov (1) g6<1>F g1.10<0,1,0>UW { align1 };
 mov (1) g6.4<1>F g1.10<0,1,0>UW { align1 };
-add (1) g6.8<1>F g1.10<0,1,0>UW 1UB { align1 };
-add (1) g6.12<1>F g1.10<0,1,0>UW 1UB { align1 };
+add (1) g6.8<1>F g1.10<0,1,0>UW 1UD { align1 };
+add (1) g6.12<1>F g1.10<0,1,0>UW 1UD { align1 };
     /* set up ss1.x coordinates */
 mov (1) g4.16<1>F g1.12<0,1,0>UW { align1 };
-add (1) g4.20<1>F g1.12<0,1,0>UW 1UB { align1 };
+add (1) g4.20<1>F g1.12<0,1,0>UW 1UD { align1 };
 mov (1) g4.24<1>F g1.12<0,1,0>UW { align1 };
-add (1) g4.28<1>F g1.12<0,1,0>UW 1UB { align1 };
+add (1) g4.28<1>F g1.12<0,1,0>UW 1UD { align1 };
     /* set up ss1.y coordinates */
 mov (1) g6.16<1>F g1.14<0,1,0>UW { align1 };
 mov (1) g6.20<1>F g1.14<0,1,0>UW { align1 };
-add (1) g6.24<1>F g1.14<0,1,0>UW 1UB { align1 };
-add (1) g6.28<1>F g1.14<0,1,0>UW 1UB { align1 };
+add (1) g6.24<1>F g1.14<0,1,0>UW 1UD { align1 };
+add (1) g6.28<1>F g1.14<0,1,0>UW 1UD { align1 };
     /* Set up ss2.x coordinates */
 mov (1) g5<1>F g1.16<0,1,0>UW { align1 };
-add (1) g5.4<1>F g1.16<0,1,0>UW 1UB { align1 };
+add (1) g5.4<1>F g1.16<0,1,0>UW 1UD { align1 };
 mov (1) g5.8<1>F g1.16<0,1,0>UW { align1 };
-add (1) g5.12<1>F g1.16<0,1,0>UW 1UB { align1 };
+add (1) g5.12<1>F g1.16<0,1,0>UW 1UD { align1 };
     /* Set up ss2.y coordinates */
 mov (1) g7<1>F g1.18<0,1,0>UW { align1 };
 mov (1) g7.4<1>F g1.18<0,1,0>UW { align1 };
-add (1) g7.8<1>F g1.18<0,1,0>UW 1UB { align1 };
-add (1) g7.12<1>F g1.18<0,1,0>UW 1UB { align1 };
+add (1) g7.8<1>F g1.18<0,1,0>UW 1UD { align1 };
+add (1) g7.12<1>F g1.18<0,1,0>UW 1UD { align1 };
     /* Set up ss3.x coordinates */
 mov (1) g5.16<1>F g1.20<0,1,0>UW { align1 };
-add (1) g5.20<1>F g1.20<0,1,0>UW 1UB { align1 };
+add (1) g5.20<1>F g1.20<0,1,0>UW 1UD { align1 };
 mov (1) g5.24<1>F g1.20<0,1,0>UW { align1 };
-add (1) g5.28<1>F g1.20<0,1,0>UW 1UB { align1 };
+add (1) g5.28<1>F g1.20<0,1,0>UW 1UD { align1 };
     /* Set up ss3.y coordinates */
 mov (1) g7.16<1>F g1.22<0,1,0>UW { align1 };
 mov (1) g7.20<1>F g1.22<0,1,0>UW { align1 };
-add (1) g7.24<1>F g1.22<0,1,0>UW 1UB { align1 };
-add (1) g7.28<1>F g1.22<0,1,0>UW 1UB { align1 };
+add (1) g7.24<1>F g1.22<0,1,0>UW 1UD { align1 };
+add (1) g7.28<1>F g1.22<0,1,0>UW 1UD { align1 };
 
     /* Now, map these screen space coordinates into texture coordinates. */
     /* subtract screen-space X origin of vertex 0. */
diff --git a/src/packed_yuv_wm.g4b b/src/packed_yuv_wm.g4b
new file mode 100644
index 0000000..d72c651
--- /dev/null
+++ b/src/packed_yuv_wm.g4b
@@ -0,0 +1,82 @@
+   { 0x00000001, 0x2080013d, 0x00000028, 0x00000000 },
+   { 0x00000040, 0x20840d3d, 0x00000028, 0x00000001 },
+   { 0x00000001, 0x2088013d, 0x00000028, 0x00000000 },
+   { 0x00000040, 0x208c0d3d, 0x00000028, 0x00000001 },
+   { 0x00000001, 0x20c0013d, 0x0000002a, 0x00000000 },
+   { 0x00000001, 0x20c4013d, 0x0000002a, 0x00000000 },
+   { 0x00000040, 0x20c80d3d, 0x0000002a, 0x00000001 },
+   { 0x00000040, 0x20cc0d3d, 0x0000002a, 0x00000001 },
+   { 0x00000001, 0x2090013d, 0x0000002c, 0x00000000 },
+   { 0x00000040, 0x20940d3d, 0x0000002c, 0x00000001 },
+   { 0x00000001, 0x2098013d, 0x0000002c, 0x00000000 },
+   { 0x00000040, 0x209c0d3d, 0x0000002c, 0x00000001 },
+   { 0x00000001, 0x20d0013d, 0x0000002e, 0x00000000 },
+   { 0x00000001, 0x20d4013d, 0x0000002e, 0x00000000 },
+   { 0x00000040, 0x20d80d3d, 0x0000002e, 0x00000001 },
+   { 0x00000040, 0x20dc0d3d, 0x0000002e, 0x00000001 },
+   { 0x00000001, 0x20a0013d, 0x00000030, 0x00000000 },
+   { 0x00000040, 0x20a40d3d, 0x00000030, 0x00000001 },
+   { 0x00000001, 0x20a8013d, 0x00000030, 0x00000000 },
+   { 0x00000040, 0x20ac0d3d, 0x00000030, 0x00000001 },
+   { 0x00000001, 0x20e0013d, 0x00000032, 0x00000000 },
+   { 0x00000001, 0x20e4013d, 0x00000032, 0x00000000 },
+   { 0x00000040, 0x20e80d3d, 0x00000032, 0x00000001 },
+   { 0x00000040, 0x20ec0d3d, 0x00000032, 0x00000001 },
+   { 0x00000001, 0x20b0013d, 0x00000034, 0x00000000 },
+   { 0x00000040, 0x20b40d3d, 0x00000034, 0x00000001 },
+   { 0x00000001, 0x20b8013d, 0x00000034, 0x00000000 },
+   { 0x00000040, 0x20bc0d3d, 0x00000034, 0x00000001 },
+   { 0x00000001, 0x20f0013d, 0x00000036, 0x00000000 },
+   { 0x00000001, 0x20f4013d, 0x00000036, 0x00000000 },
+   { 0x00000040, 0x20f80d3d, 0x00000036, 0x00000001 },
+   { 0x00000040, 0x20fc0d3d, 0x00000036, 0x00000001 },
+   { 0x00600040, 0x208077bd, 0x008d0080, 0x00004020 },
+   { 0x00600040, 0x20a077bd, 0x008d00a0, 0x00004020 },
+   { 0x00600041, 0x208077bd, 0x008d0080, 0x00000060 },
+   { 0x00600041, 0x20a077bd, 0x008d00a0, 0x00000060 },
+   { 0x00600040, 0x208077bd, 0x008d0080, 0x0000006c },
+   { 0x00600040, 0x20a077bd, 0x008d00a0, 0x0000006c },
+   { 0x00600040, 0x20c077bd, 0x008d00c0, 0x00004024 },
+   { 0x00600040, 0x20e077bd, 0x008d00e0, 0x00004024 },
+   { 0x00600041, 0x20c077bd, 0x008d00c0, 0x00000074 },
+   { 0x00600041, 0x20e077bd, 0x008d00e0, 0x00000074 },
+   { 0x00600040, 0x20c077bd, 0x008d00c0, 0x0000007c },
+   { 0x00600040, 0x20e077bd, 0x008d00e0, 0x0000007c },
+   { 0x00600001, 0x202003be, 0x008d0080, 0x00000000 },
+   { 0x00600001, 0x204003be, 0x008d00a0, 0x00000000 },
+   { 0x00600001, 0x206003be, 0x008d00c0, 0x00000000 },
+   { 0x00600001, 0x208003be, 0x008d00e0, 0x00000000 },
+   { 0x00800031, 0x21801d29, 0x008d0000, 0x02580001 },
+   { 0x00600001, 0x22600129, 0x008d0260, 0x00000000 },
+   { 0x00600040, 0x21c07fbd, 0x008d01c0, 0xbd808081 },
+   { 0x00600040, 0x21807fbd, 0x008d0180, 0xbf008084 },
+   { 0x00600040, 0x22007fbd, 0x008d0200, 0xbf008084 },
+   { 0x00600041, 0x21c07fbd, 0x008d01c0, 0x3f94fdf4 },
+   { 0x00600041, 0x20007fbc, 0x008d0180, 0x3fcc49ba },
+   { 0x80600048, 0x20407fbe, 0x008d01c0, 0x3f800000 },
+   { 0x00600041, 0x20007fbc, 0x008d0180, 0xbf5020c5 },
+   { 0x00600048, 0x20007fbc, 0x008d0200, 0xbec8b439 },
+   { 0x80600048, 0x20607fbe, 0x008d01c0, 0x3f800000 },
+   { 0x00600041, 0x20007fbc, 0x008d0200, 0x40011687 },
+   { 0x80600048, 0x20807fbe, 0x008d01c0, 0x3f800000 },
+   { 0x00600040, 0x21e07fbd, 0x008d01e0, 0xbd808081 },
+   { 0x00600040, 0x21a07fbd, 0x008d01a0, 0xbf008084 },
+   { 0x00600040, 0x22207fbd, 0x008d0220, 0xbf008084 },
+   { 0x00600041, 0x21e07fbd, 0x008d01e0, 0x3f94fdf4 },
+   { 0x00600041, 0x20007fbc, 0x008d01a0, 0x3fcc49ba },
+   { 0x80600048, 0x20c07fbe, 0x008d01e0, 0x3f800000 },
+   { 0x00600041, 0x20007fbc, 0x008d01a0, 0xbf5020c5 },
+   { 0x00600048, 0x20007fbc, 0x008d0220, 0xbec8b439 },
+   { 0x80600048, 0x20e07fbe, 0x008d01e0, 0x3f800000 },
+   { 0x00600041, 0x20007fbc, 0x008d0220, 0x40011687 },
+   { 0x80600048, 0x21007fbe, 0x008d01e0, 0x3f800000 },
+   { 0x00600201, 0x20200022, 0x008d0020, 0x00000000 },
+   { 0x00800031, 0x24001d28, 0x008d0000, 0x85a04800 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
commit a696a0faab13c11e4ed4ff2d66430f41bb107443
Author: Keith Packard <keithp at keithp.com>
Date:   Sun Mar 30 19:19:46 2008 -0700

    Remove rotation sf and wm progs
    (cherry picked from commit 949d73271d7100c1f028fd60f185f4929461304e)

diff --git a/src/exa_sf_rotation.g4a b/src/exa_sf_rotation.g4a
deleted file mode 100644
index 59d40d4..0000000
--- a/src/exa_sf_rotation.g4a
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Copyright © 2007 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * Authors:
- *    Wang Zhenyu <zhenyu.z.wang at intel.com>
- */
-
-/* 1/dx */
-send (1) 0 g6<1>F g1.12<0,1,0>F math inv scalar mlen 1 rlen 1 { align1 };
-/* 1/dy */
-send (1) 0 g6.4<1>F g1.20<0,1,0>F math inv scalar mlen 1 rlen 1 { align1 };
-/* du, dv */
-mul (1) g7<1>F g3<0,1,0>F -1.0F { align1 };
-mul (1) g7.4<1>F g3.4<0,1,0>F -1.0F { align1 };
-add (1) g7<1>F g4<0,1,0>F g7<0,1,0>F { align1 };
-add (1) g7.4<1>F g4.4<0,1,0>F g7.4<0,1,0>F { align1 };
-
-/* du/dy */
-mul (1) g7<1>F g7<0,1,0>F g6.4<0,1,0>F { align1 };
-/* dv/dx */
-mul (1) g7.4<1>F g7.4<0,1,0>F g6<0,1,0>F { align1 };
-/* Cx */
-mov (8) m1<1>F g7<0,1,0>F { align1 };
-/* Cy */
-mov (8) m2<1>F g7.4<0,1,0>F { align1 };
-/* Co */
-mov (8) m3<1>F g3<8,8,1>F { align1 };
-send (8) 0 null g0<8,8,1>F urb 0 transpose used complete mlen 4 rlen 0 { align1 EOT };
-nop;
-nop;
-nop;
-nop;
-nop;
-nop;
-nop;
-nop;
diff --git a/src/exa_sf_rotation_prog.h b/src/exa_sf_rotation_prog.h
deleted file mode 100644
index 9589130..0000000
--- a/src/exa_sf_rotation_prog.h
+++ /dev/null
@@ -1,20 +0,0 @@
-   { 0x00000031, 0x20c01fbd, 0x0000002c, 0x01110081 },
-   { 0x00000031, 0x20c41fbd, 0x00000034, 0x01110081 },
-   { 0x00000041, 0x20e07fbd, 0x00000060, 0xbf800000 },
-   { 0x00000041, 0x20e47fbd, 0x00000064, 0xbf800000 },
-   { 0x00000040, 0x20e077bd, 0x00000080, 0x000000e0 },
-   { 0x00000040, 0x20e477bd, 0x00000084, 0x000000e4 },
-   { 0x00000041, 0x20e077bd, 0x000000e0, 0x000000c4 },
-   { 0x00000041, 0x20e477bd, 0x000000e4, 0x000000c0 },
-   { 0x00600001, 0x202003be, 0x000000e0, 0x00000000 },
-   { 0x00600001, 0x204003be, 0x000000e4, 0x00000000 },
-   { 0x00600001, 0x206003be, 0x008d0060, 0x00000000 },
-   { 0x00600031, 0x20001fbc, 0x008d0000, 0x8640c800 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
diff --git a/src/exa_wm_rotation.g4a b/src/exa_wm_rotation.g4a
deleted file mode 100644
index 613a5cb..0000000
--- a/src/exa_wm_rotation.g4a
+++ /dev/null
@@ -1,184 +0,0 @@
-/*
- * Copyright © 2007 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * Authors:
- *    Wang Zhenyu <zhenyu.z.wang at intel.com>
- */
-
-/*
- * This's for exa composite operation in no mask picture case.
- * The simplest case is just sending what src picture has to dst picture.
- */
-
-/* I think this should be same as in g4a program for texture video,
-   as we also use 16-pixel dispatch. and SF scale in g3 is useful for us. */
-
-/* The initial payload of the thread is always g0.
- * WM_URB (incoming URB entries) is g3
- * X0_R is g4
- * X1_R is g5
- * Y0_R is g6
- * Y1_R is g7
- */
-
-    /* Set up the X/Y screen coordinates of the pixels in our 4 subspans.  Each
-     * subspan is a 2x2 rectangle, and the screen x/y of the upper left of each
-     * subspan are given in GRF register 1.2 through 1.5 (which, with the word
-     * addressing below, are 1.4 through 1.11).
-     *
-     * The result is WM_X*_R and WM_Y*R being:
-     *
-     * X0: {ss0.x, ss0.x+1, ss0.x,   ss0.x+1, ss1.x, ss1.x+1, ss1.x,   ss1.x+y}
-     * Y0: {ss0.y, ss0.y,   ss0.y+1, ss0.y+1, ss1.y, ss1.y,   ss1.y+1, ss1.y+1}
-     * X1: {ss2.x, ss2.x+1, ss2.x,   ss2.x+1, ss3.x, ss3.x+1, ss3.x,   ss3.x+y}
-     * Y1: {ss2.y, ss2.y,   ss2.y+1, ss2.y+1, ss3.y, ss3.y,   ss3.y+1, ss3.y+1}
-     */
-    /* Set up ss0.x coordinates*/
-mov (1) g4<1>F g1.8<0,1,0>UW { align1 };
-add (1) g4.4<1>F g1.8<0,1,0>UW 1UB { align1 };
-mov (1) g4.8<1>F g1.8<0,1,0>UW { align1 };
-add (1) g4.12<1>F g1.8<0,1,0>UW 1UB { align1 };
-    /* Set up ss0.y coordinates */
-mov (1) g6<1>F g1.10<0,1,0>UW { align1 };
-mov (1) g6.4<1>F g1.10<0,1,0>UW { align1 };
-add (1) g6.8<1>F g1.10<0,1,0>UW 1UB { align1 };
-add (1) g6.12<1>F g1.10<0,1,0>UW 1UB { align1 };
-    /* set up ss1.x coordinates */
-mov (1) g4.16<1>F g1.12<0,1,0>UW { align1 };
-add (1) g4.20<1>F g1.12<0,1,0>UW 1UB { align1 };
-mov (1) g4.24<1>F g1.12<0,1,0>UW { align1 };
-add (1) g4.28<1>F g1.12<0,1,0>UW 1UB { align1 };
-    /* set up ss1.y coordinates */
-mov (1) g6.16<1>F g1.14<0,1,0>UW { align1 };
-mov (1) g6.20<1>F g1.14<0,1,0>UW { align1 };
-add (1) g6.24<1>F g1.14<0,1,0>UW 1UB { align1 };
-add (1) g6.28<1>F g1.14<0,1,0>UW 1UB { align1 };
-    /* Set up ss2.x coordinates */
-mov (1) g5<1>F g1.16<0,1,0>UW { align1 };
-add (1) g5.4<1>F g1.16<0,1,0>UW 1UB { align1 };
-mov (1) g5.8<1>F g1.16<0,1,0>UW { align1 };
-add (1) g5.12<1>F g1.16<0,1,0>UW 1UB { align1 };
-    /* Set up ss2.y coordinates */
-mov (1) g7<1>F g1.18<0,1,0>UW { align1 };
-mov (1) g7.4<1>F g1.18<0,1,0>UW { align1 };
-add (1) g7.8<1>F g1.18<0,1,0>UW 1UB { align1 };
-add (1) g7.12<1>F g1.18<0,1,0>UW 1UB { align1 };
-    /* Set up ss3.x coordinates */
-mov (1) g5.16<1>F g1.20<0,1,0>UW { align1 };
-add (1) g5.20<1>F g1.20<0,1,0>UW 1UB { align1 };
-mov (1) g5.24<1>F g1.20<0,1,0>UW { align1 };
-add (1) g5.28<1>F g1.20<0,1,0>UW 1UB { align1 };
-    /* Set up ss3.y coordinates */
-mov (1) g7.16<1>F g1.22<0,1,0>UW { align1 };
-mov (1) g7.20<1>F g1.22<0,1,0>UW { align1 };
-add (1) g7.24<1>F g1.22<0,1,0>UW 1UB { align1 };
-add (1) g7.28<1>F g1.22<0,1,0>UW 1UB { align1 };
-
-    /* Now, map these screen space coordinates into texture coordinates. */
-    /* subtract screen-space X origin of vertex 0. */
-/* for rotation, texture y is from ssX.x, so g4,g5 will be Y */
-add (8) g4<1>F g4<8,8,1>F -g1<0,1,0>F { align1 };
-add (8) g5<1>F g5<8,8,1>F -g1<0,1,0>F { align1 };
-    /* scale by texture X increment */
-mul (8) g4<1>F g4<8,8,1>F g3.20<0,1,0>F { align1 };
-mul (8) g5<1>F g5<8,8,1>F g3.20<0,1,0>F { align1 };
-    /* add in texture X offset */
-add (8) g4<1>F g4<8,8,1>F g3.28<0,1,0>F { align1 };
-add (8) g5<1>F g5<8,8,1>F g3.28<0,1,0>F { align1 };
-
-/* texture Y is from ssX.x */
-    /* subtract screen-space Y origin of vertex 0. */
-add (8) g6<1>F g6<8,8,1>F -g1.4<0,1,0>F { align1 };
-add (8) g7<1>F g7<8,8,1>F -g1.4<0,1,0>F { align1 };
-    /* scale by texture Y increment */
-mul (8) g6<1>F g6<8,8,1>F g3.16<0,1,0>F { align1 };
-mul (8) g7<1>F g7<8,8,1>F g3.16<0,1,0>F { align1 };
-    /* add in texture Y offset */
-add (8) g6<1>F g6<8,8,1>F g3.12<0,1,0>F { align1 };
-add (8) g7<1>F g7<8,8,1>F g3.12<0,1,0>F { align1 };
-
-/* prepare sampler read back gX register, which would be written back to output */
-
-/* use simd16 sampler, param 0 is u, param 1 is v. */
-/* 'payload' loading, assuming tex coord start from g4 */
-mov (8) m1<1>F g6<8,8,1>F { align1 };
-mov (8) m2<1>F g7<8,8,1>F { align1 };  
-mov (8) m3<1>F g4<8,8,1>F { align1 };
-mov (8) m4<1>F g5<8,8,1>F { align1 }; 
-
-/* m0 will be copied with g0, as it contains send desc */
-/* emit sampler 'send' cmd */
-send (16) 0 		/* msg reg index */
-	g12<1>UW 	/* readback */
-	g0<8,8,1>UW  	/* copy to msg start reg*/
-	sampler (1,0,F)  /* sampler message description, (binding_table,sampler_index,datatype)
-			 /* here(src->dst) we should use src_sampler and src_surface */
-	mlen 5 rlen 8 { align1 };   /* required message len 5, readback len 8 */
-
-mov (8) g19<1>UD g19<8,8,1>UD { align1 };  /* wait sampler return */
-/* if we set up read-back reg correctly, emit dataport write 'send' cmd with EOT */
-
-/* m0, m1 are all direct passed by PS thread payload */
-mov (8) m1<1>F g1<8,8,1>F { align1 };
-
-/* prepare data in m2-m5 for subspan(1,0), m6-m9 for subspan(3,2), then it's ready to write */
-/* g12 -> m2
-   g13 -> m6
-   g14 -> m3
-   g15 -> m7
-   g16 -> m4
-   g17 -> m8
-   g18 -> m5
-   g19 -> m9
-*/
-mov (8) m2<1>F g12<8,8,1>F { align1 };
-mov (8) m3<1>F g14<8,8,1>F { align1 };
-mov (8) m4<1>F g16<8,8,1>F { align1 };
-mov (8) m5<1>F g18<8,8,1>F { align1 };
-mov (8) m6<1>F g13<8,8,1>F { align1 };
-mov (8) m7<1>F g15<8,8,1>F { align1 };
-mov (8) m8<1>F g17<8,8,1>F { align1 };
-mov (8) m9<1>F g19<8,8,1>F { align1 };
-
-/* m0, m1 are all direct passed by PS thread payload */
-mov (8) m1<1>UD g1<8,8,1>UD { align1 mask_disable };
-
-/* write */
-send (16) 0 acc0<1>UW g0<8,8,1>UW write (
-	0,  /* binding_table */
-	8,  /* pixel scordboard clear, msg type simd16 single source */
-	4,  /* render target write */
-	0   /* no write commit message */
-	) 
-	mlen 10
-	rlen 0
-	{ align1 EOT };
-
-nop;
-nop;
-nop;
-nop;
-nop;
-nop;
-nop;
-nop;
-nop;
diff --git a/src/exa_wm_rotation_prog.h b/src/exa_wm_rotation_prog.h
deleted file mode 100644
index 890d2cf..0000000
--- a/src/exa_wm_rotation_prog.h
+++ /dev/null
@@ -1,70 +0,0 @@
-   { 0x00000001, 0x2080013d, 0x00000028, 0x00000000 },
-   { 0x00000040, 0x20840d3d, 0x00000028, 0x00000001 },
-   { 0x00000001, 0x2088013d, 0x00000028, 0x00000000 },
-   { 0x00000040, 0x208c0d3d, 0x00000028, 0x00000001 },
-   { 0x00000001, 0x20c0013d, 0x0000002a, 0x00000000 },
-   { 0x00000001, 0x20c4013d, 0x0000002a, 0x00000000 },
-   { 0x00000040, 0x20c80d3d, 0x0000002a, 0x00000001 },
-   { 0x00000040, 0x20cc0d3d, 0x0000002a, 0x00000001 },
-   { 0x00000001, 0x2090013d, 0x0000002c, 0x00000000 },
-   { 0x00000040, 0x20940d3d, 0x0000002c, 0x00000001 },
-   { 0x00000001, 0x2098013d, 0x0000002c, 0x00000000 },
-   { 0x00000040, 0x209c0d3d, 0x0000002c, 0x00000001 },
-   { 0x00000001, 0x20d0013d, 0x0000002e, 0x00000000 },
-   { 0x00000001, 0x20d4013d, 0x0000002e, 0x00000000 },
-   { 0x00000040, 0x20d80d3d, 0x0000002e, 0x00000001 },
-   { 0x00000040, 0x20dc0d3d, 0x0000002e, 0x00000001 },
-   { 0x00000001, 0x20a0013d, 0x00000030, 0x00000000 },
-   { 0x00000040, 0x20a40d3d, 0x00000030, 0x00000001 },
-   { 0x00000001, 0x20a8013d, 0x00000030, 0x00000000 },
-   { 0x00000040, 0x20ac0d3d, 0x00000030, 0x00000001 },
-   { 0x00000001, 0x20e0013d, 0x00000032, 0x00000000 },
-   { 0x00000001, 0x20e4013d, 0x00000032, 0x00000000 },
-   { 0x00000040, 0x20e80d3d, 0x00000032, 0x00000001 },
-   { 0x00000040, 0x20ec0d3d, 0x00000032, 0x00000001 },
-   { 0x00000001, 0x20b0013d, 0x00000034, 0x00000000 },
-   { 0x00000040, 0x20b40d3d, 0x00000034, 0x00000001 },
-   { 0x00000001, 0x20b8013d, 0x00000034, 0x00000000 },
-   { 0x00000040, 0x20bc0d3d, 0x00000034, 0x00000001 },
-   { 0x00000001, 0x20f0013d, 0x00000036, 0x00000000 },
-   { 0x00000001, 0x20f4013d, 0x00000036, 0x00000000 },
-   { 0x00000040, 0x20f80d3d, 0x00000036, 0x00000001 },
-   { 0x00000040, 0x20fc0d3d, 0x00000036, 0x00000001 },
-   { 0x00600040, 0x208077bd, 0x008d0080, 0x00004020 },
-   { 0x00600040, 0x20a077bd, 0x008d00a0, 0x00004020 },
-   { 0x00600041, 0x208077bd, 0x008d0080, 0x00000074 },
-   { 0x00600041, 0x20a077bd, 0x008d00a0, 0x00000074 },
-   { 0x00600040, 0x208077bd, 0x008d0080, 0x0000007c },
-   { 0x00600040, 0x20a077bd, 0x008d00a0, 0x0000007c },
-   { 0x00600040, 0x20c077bd, 0x008d00c0, 0x00004024 },
-   { 0x00600040, 0x20e077bd, 0x008d00e0, 0x00004024 },
-   { 0x00600041, 0x20c077bd, 0x008d00c0, 0x00000070 },
-   { 0x00600041, 0x20e077bd, 0x008d00e0, 0x00000070 },
-   { 0x00600040, 0x20c077bd, 0x008d00c0, 0x0000006c },
-   { 0x00600040, 0x20e077bd, 0x008d00e0, 0x0000006c },
-   { 0x00600001, 0x202003be, 0x008d00c0, 0x00000000 },
-   { 0x00600001, 0x204003be, 0x008d00e0, 0x00000000 },
-   { 0x00600001, 0x206003be, 0x008d0080, 0x00000000 },
-   { 0x00600001, 0x208003be, 0x008d00a0, 0x00000000 },
-   { 0x00800031, 0x21801d29, 0x008d0000, 0x02580001 },
-   { 0x00600001, 0x22600021, 0x008d0260, 0x00000000 },
-   { 0x00600001, 0x202003be, 0x008d0020, 0x00000000 },
-   { 0x00600001, 0x204003be, 0x008d0180, 0x00000000 },
-   { 0x00600001, 0x206003be, 0x008d01c0, 0x00000000 },
-   { 0x00600001, 0x208003be, 0x008d0200, 0x00000000 },
-   { 0x00600001, 0x20a003be, 0x008d0240, 0x00000000 },
-   { 0x00600001, 0x20c003be, 0x008d01a0, 0x00000000 },
-   { 0x00600001, 0x20e003be, 0x008d01e0, 0x00000000 },
-   { 0x00600001, 0x210003be, 0x008d0220, 0x00000000 },
-   { 0x00600001, 0x212003be, 0x008d0260, 0x00000000 },
-   { 0x00600201, 0x20200022, 0x008d0020, 0x00000000 },
-   { 0x00800031, 0x24001d28, 0x008d0000, 0x85a04800 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
commit b19cf5e23f014ec65629dd1b0703bca54b973e0c
Author: Keith Packard <keithp at keithp.com>
Date:   Sun Mar 30 19:14:18 2008 -0700

    Fix the sf_mask program to compute and pass corrected uvw coefficients
    
    sf_mask is the same as sf except that it must compute both src and mask uvw
    coefficients, which are conveniently adjacent in the same registers, and so
    need only an extended execution width
    (cherry picked from commit 492ff1494f782240e6ca68919b2d0b9aa400fc53)

diff --git a/src/exa_sf_mask.g4a b/src/exa_sf_mask.g4a
index a0d6efc..c830fd8 100644
--- a/src/exa_sf_mask.g4a
+++ b/src/exa_sf_mask.g4a
@@ -21,52 +21,82 @@
  * IN THE SOFTWARE.
  *
  * Authors:
+ *    Keith Packard <keithp at keithp.com>
+ *    Eric Anholt <eric at anholt.net>
  *    Wang Zhenyu <zhenyu.z.wang at intel.com>
  */
 
-/* FIXME how to setup second coeffient for mask tex coord */
 
-/* 
-   g3 (v0) { u0, v0, 1.0, 1.0 }  ==> {u0, v0, 1.0, 1.0, mu0, mv0, 1.0, 1.0}  Co[0](u0) Co[1](v0) Co[2](mu0) Co[3](mv0)
-   g4 (v1) { u1, v1, 1.0, 1.0 }  ==> {u1, v1, 1.0, 1.0, mu1, mv1, 1.0, 1.0}
-   g5 (v2) { u2, v2 }  ==> (u2, v2, mu2, mv2}
-   g6      { 1/(x1-x0), 1/(y1-y0) }
-   g7      { u1-u0, v1-v0, 0, 0}  ==>{u1-u0, v1-v0,0, 0, mu1-mu0, mv1-mv0, 0, 0}
-	   -> { (u1-u0)/(x1-x0), (v1-v0)/(y1-y0) }  ==>{(u1-u0)/(x1-x0), (v1-v0)/(y1-y0),(mu1-mu0)/(x1-x0), (mv1-mv0)/(y1-y0)
-		Cx,		 Cy 			Cx[0],		 Cy[0],		 Cx[1], 	    Cy[1]
+/*
+ * Inputs (note all sub-register addresses are bytes, not float indices)
+ *
+ * Note that the vertices will have been reordered:
+ *
+ * V0 is topmost (leftmost among topmost) (upper left)
+ * V1 is next clockwise (lower right)
+ * V2 is remaining (lower left)
+ *
+ *  V0 ...................... XX
+ *  |                          .
+ *  |                          .
+ *  |                          .
+ *  V2------------------------V1
+ *
+ *  G0	    thread state -- just pass along
+ *
+ *  G1 and G2 are fixed by SF spec
+ *
+ *  G1.0    reserved
+ *  G1.4    Provoking vertex
+ *  G1.8    Determinant
+ *  G1.12   X1 - X0
+ *  G1.16   X2 - X0
+ *  G1.20   Y1 - Y0
+ *  G1.24   Y2 - Y0
+ *  G1.30   reserved
+ *
+ *  G2.0    Z0
+ *  G2.4    1/W0
+ *  G2.8    Z1
+ *  G2.12   1/W1
+ *  G2.16   Z2
+ *  G2.20   1/W2
+ *  G2.24   reserved
+ *  G2.30   reserved
+ *
+ *  G3 is V0 Vertex Attribute Data from URB (upper left)
+ *
+ *  G3.0    u0
+ *  G3.4    v0
+ *
+ *  G4 is V1 Vertex Attribute Data from URB (lower right)
+ *
+ *  G4.0    u1
+ *  G4.4    v1
+ *
+ *  G5 is V2 Vertex Attribute Data from URB (lower left)
+ *
  */
 
-/* assign Cx[0], Cx[1] to src, same to Cy, Co 
-          Cx[2], Cx[3] to mask, same to Cy, Co */
-
-send (1) 0 g6<1>F g1.12<0,1,0>F math inv scalar mlen 1 rlen 1 { align1 };
-send (1) 0 g6.4<1>F g1.20<0,1,0>F math inv scalar mlen 1 rlen 1 { align1 };
-add (8) g7<1>F g4<8,8,1>F -g3<8,8,1>F { align1 };
-/* Cx[0] */
-mul (1) g7<1>F g7<0,1,0>F g6<0,1,0>F { align1 };
-/* Cy[0] */
-mul (1) g7.4<1>F g7.4<0,1,0>F g6.4<0,1,0>F { align1 };
-/* Cx[2] */
-mul (1) g7.8<1>F g7.8<0,1,0>F g6<0,1,0>F { align1 };
-/* Cy[2] */
-mul (1) g7.12<1>F g7.12<0,1,0>F g6.4<0,1,0>F { align1 };
+/* Compute inverses of the input deltas */
+send (4) 0 g6<1>F g1.12<4,4,1>F math inv mlen 1 rlen 1 { align1 };
 
-/* src Cx[0], Cx[1] */
-mov (8) m1<1>F g7<0,1,0>F { align1 };
-/* mask Cx[2], Cx[3] */
-mov (1) m1.8<1>F g7.8<0,1,0>F { align1 };
-mov (1) m1.12<1>F g7.8<0,1,0>F { align1 };
-/* src Cy[0], Cy[1] */
-mov (8) m2<1>F g7.4<0,1,0>F { align1 };
-/* mask Cy[2], Cy[3] */
-mov (1) m2.8<1>F g7.12<0,1,0>F { align1 };
-mov (1) m2.12<1>F g7.12<0,1,0>F { align1 };
-/* src Co[0], Co[1] */
+/* texture location at V0 */
 mov (8) m3<1>F g3<8,8,1>F { align1 };
-/* mask Co[2], Co[3] */
-mov (1) m3.8<1>F g3.8<0,1,0>F { align1 };
-mov (1) m3.12<1>F g3.12<0,1,0>F { align1 };
 
+/* compute V1 - V2 (motion in X) for texture coordinates */
+add (8) g7<1>F g4<8,8,1>F -g5<8,8,1>F { align1 };
+
+/* multiply by 1/dx */
+mul (8) m1<1>F g7<8,8,1>F g6.0<0,1,0>F { align1 };
+
+/* Compute V2 - V0 (motion in Y) for texture coordinates */
+add (8) g7<1>F g5<8,8,1>F -g3<8,8,1>F { align1 };
+
+/* multiply by 1/dy */
+mul (8) m2<1>F g7<8,8,1>F g6.8<0,1,0>F {align1 };
+
+/* and we're done */
 send (8) 0 null g0<8,8,1>F urb 0 transpose used complete mlen 4 rlen 0 { align1 EOT };
 nop;
 nop;
diff --git a/src/exa_sf_mask_prog.h b/src/exa_sf_mask_prog.h
index 4e9114d..be0a77b 100644
--- a/src/exa_sf_mask_prog.h
+++ b/src/exa_sf_mask_prog.h
@@ -1,19 +1,9 @@
-   { 0x00000031, 0x20c01fbd, 0x0000002c, 0x01110081 },
-   { 0x00000031, 0x20c41fbd, 0x00000034, 0x01110081 },
-   { 0x00600040, 0x20e077bd, 0x008d0080, 0x008d4060 },
-   { 0x00000041, 0x20e077bd, 0x000000e0, 0x000000c0 },
-   { 0x00000041, 0x20e477bd, 0x000000e4, 0x000000c4 },
-   { 0x00000041, 0x20e877bd, 0x000000e8, 0x000000c0 },
-   { 0x00000041, 0x20ec77bd, 0x000000ec, 0x000000c4 },
-   { 0x00600001, 0x202003be, 0x000000e0, 0x00000000 },
-   { 0x00000001, 0x202803be, 0x000000e8, 0x00000000 },
-   { 0x00000001, 0x202c03be, 0x000000e8, 0x00000000 },
-   { 0x00600001, 0x204003be, 0x000000e4, 0x00000000 },
-   { 0x00000001, 0x204803be, 0x000000ec, 0x00000000 },
-   { 0x00000001, 0x204c03be, 0x000000ec, 0x00000000 },
+   { 0x00400031, 0x20c01fbd, 0x0069002c, 0x01110001 },
    { 0x00600001, 0x206003be, 0x008d0060, 0x00000000 },
-   { 0x00000001, 0x206803be, 0x00000068, 0x00000000 },
-   { 0x00000001, 0x206c03be, 0x0000006c, 0x00000000 },
+   { 0x00600040, 0x20e077bd, 0x008d0080, 0x008d40a0 },
+   { 0x00600041, 0x202077be, 0x008d00e0, 0x000000c0 },
+   { 0x00600040, 0x20e077bd, 0x008d00a0, 0x008d4060 },
+   { 0x00600041, 0x204077be, 0x008d00e0, 0x000000c8 },
    { 0x00600031, 0x20001fbc, 0x008d0000, 0x8640c800 },
    { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
    { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
commit b6d54fffbd44b6cc516b49d4fc41b01483663a33
Author: Keith Packard <keithp at keithp.com>
Date:   Sun Mar 30 00:54:51 2008 -0700

    Reimplement wm program for nomask case to handle affine transforms
    
    This involves correctly computing u/v locations based on x/y vectors and
    line constants computed in new sf program.
    
    Also, use fewer instructions to make this go a bit faster (2X for 500x500
    composite).
    (cherry picked from commit 6304b38423f99190a5e54f1a7dcaa75adfad4f2a)

diff --git a/src/exa_wm_nomask.g4a b/src/exa_wm_nomask.g4a
index f92dc1a..97426ec 100644
--- a/src/exa_wm_nomask.g4a
+++ b/src/exa_wm_nomask.g4a
@@ -40,75 +40,49 @@
  * Y1_R is g7
  */
 
-    /* Set up ss0.x coordinates*/
-mov (1) g4<1>F g1.8<0,1,0>UW { align1 };
-add (1) g4.4<1>F g1.8<0,1,0>UW 1UB { align1 };
-mov (1) g4.8<1>F g1.8<0,1,0>UW { align1 };
-add (1) g4.12<1>F g1.8<0,1,0>UW 1UB { align1 };
-    /* Set up ss0.y coordinates */
-mov (1) g6<1>F g1.10<0,1,0>UW { align1 };
-mov (1) g6.4<1>F g1.10<0,1,0>UW { align1 };
-add (1) g6.8<1>F g1.10<0,1,0>UW 1UB { align1 };
-add (1) g6.12<1>F g1.10<0,1,0>UW 1UB { align1 };
-    /* set up ss1.x coordinates */
-mov (1) g4.16<1>F g1.12<0,1,0>UW { align1 };
-add (1) g4.20<1>F g1.12<0,1,0>UW 1UB { align1 };
-mov (1) g4.24<1>F g1.12<0,1,0>UW { align1 };
-add (1) g4.28<1>F g1.12<0,1,0>UW 1UB { align1 };
-    /* set up ss1.y coordinates */
-mov (1) g6.16<1>F g1.14<0,1,0>UW { align1 };
-mov (1) g6.20<1>F g1.14<0,1,0>UW { align1 };
-add (1) g6.24<1>F g1.14<0,1,0>UW 1UB { align1 };
-add (1) g6.28<1>F g1.14<0,1,0>UW 1UB { align1 };
-    /* Set up ss2.x coordinates */
-mov (1) g5<1>F g1.16<0,1,0>UW { align1 };
-add (1) g5.4<1>F g1.16<0,1,0>UW 1UB { align1 };
-mov (1) g5.8<1>F g1.16<0,1,0>UW { align1 };
-add (1) g5.12<1>F g1.16<0,1,0>UW 1UB { align1 };
-    /* Set up ss2.y coordinates */
-mov (1) g7<1>F g1.18<0,1,0>UW { align1 };
-mov (1) g7.4<1>F g1.18<0,1,0>UW { align1 };
-add (1) g7.8<1>F g1.18<0,1,0>UW 1UB { align1 };
-add (1) g7.12<1>F g1.18<0,1,0>UW 1UB { align1 };
-    /* Set up ss3.x coordinates */
-mov (1) g5.16<1>F g1.20<0,1,0>UW { align1 };
-add (1) g5.20<1>F g1.20<0,1,0>UW 1UB { align1 };
-mov (1) g5.24<1>F g1.20<0,1,0>UW { align1 };
-add (1) g5.28<1>F g1.20<0,1,0>UW 1UB { align1 };
-    /* Set up ss3.y coordinates */
-mov (1) g7.16<1>F g1.22<0,1,0>UW { align1 };
-mov (1) g7.20<1>F g1.22<0,1,0>UW { align1 };
-add (1) g7.24<1>F g1.22<0,1,0>UW 1UB { align1 };
-add (1) g7.28<1>F g1.22<0,1,0>UW 1UB { align1 };
+
+/* Load X and Y coordinates and compute per-pixel coordinates */
+add (16)	g4<1>UW		g1.8<2,4,0>UW	0x10101010V	{ align1 };
+add (16)	g6<1>UW		g1.10<2,4,0>UW	0x11001100V	{ align1 };
 
     /* Now, map these screen space coordinates into texture coordinates. */
+    
     /* subtract screen-space X origin of vertex 0. */
-add (8) g4<1>F g4<8,8,1>F -g1<0,1,0>F { align1 };
-add (8) g5<1>F g5<8,8,1>F -g1<0,1,0>F { align1 };
-    /* scale by texture X increment */
-mul (8) g4<1>F g4<8,8,1>F g3<0,1,0>F { align1 };
-mul (8) g5<1>F g5<8,8,1>F g3<0,1,0>F { align1 };
-    /* add in texture X offset */
-add (8) g4<1>F g4<8,8,1>F g3.12<0,1,0>F { align1 };
-add (8) g5<1>F g5<8,8,1>F g3.12<0,1,0>F { align1 };
+add (16)	g12<1>F		g4<8,8,1>UW	-g1.0<0,1,0>F { compr align1 };
+
     /* subtract screen-space Y origin of vertex 0. */
-add (8) g6<1>F g6<8,8,1>F -g1.4<0,1,0>F { align1 };
-add (8) g7<1>F g7<8,8,1>F -g1.4<0,1,0>F { align1 };
-    /* scale by texture Y increment */
-mul (8) g6<1>F g6<8,8,1>F g3.20<0,1,0>F { align1 };
-mul (8) g7<1>F g7<8,8,1>F g3.20<0,1,0>F { align1 };
-    /* add in texture Y offset */
-add (8) g6<1>F g6<8,8,1>F g3.28<0,1,0>F { align1 };
-add (8) g7<1>F g7<8,8,1>F g3.28<0,1,0>F { align1 };
+add (16)	g16<1>F		g6<8,8,1>UW	-g1.4<0,1,0>F { compr align1 };
+
+	/* g8/g9 = X * du/dx */
+mul (16)	g8<1>F		g12<8,8,1>F	g3.0<0,1,0>F { compr align1 };
+
+	/* g10/g11 = Y * du/dy */
+mul (16)	g10<1>F		g16<8,8,1>F	g3.4<0,1,0>F { compr align1 };
+
+	/* g8/g9 = X du/dx + Y du/dy */
+add (16)	g8<1>F		g8<8,8,1>F	g10<8,8,1>F { compr align1 };
+
+	/* m1/m2 = g8/g9 + uo */
+add (16)	m1<1>F		g8<8,8,1>F	g3.12<0,1,0>F { compr align1 };
+
+
+	/* g8/g9 = X * dv/dx */
+mul (16)	g8<1>F		g12<8,8,1>F	g3.16<0,1,0>F { compr align1 };
+
+	/* g10/g11 = Y * dv/dy */
+mul (16)	g10<1>F		g16<8,8,1>F	g3.20<0,1,0>F { compr align1 };
+
+	/* g8/g9 = X dv/dx + Y dv/dy */
+add (16)	g8<1>F		g8<8,8,1>F	g10<8,8,1>F { compr align1 };
+
+	/* m3/m4 = g8/g9 + vo */
+add (16)	m3<1>F		g8<8,8,1>F	g3.28<0,1,0>F { compr align1 };
+
 
 /* prepare sampler read back gX register, which would be written back to output */
 
 /* use simd16 sampler, param 0 is u, param 1 is v. */
 /* 'payload' loading, assuming tex coord start from g4 */
-mov (8) m1<1>F g4<8,8,1>F { align1 };
-mov (8) m2<1>F g5<8,8,1>F { align1 };  /* param 0 u in m1, m2 */
-mov (8) m3<1>F g6<8,8,1>F { align1 };
-mov (8) m4<1>F g7<8,8,1>F { align1 };  /* param 1 v in m3, m4 */
 
 /* m0 will be copied with g0, as it contains send desc */
 /* emit sampler 'send' cmd */
diff --git a/src/exa_wm_nomask_prog.h b/src/exa_wm_nomask_prog.h
index 7870b3b..c73bdbc 100644
--- a/src/exa_wm_nomask_prog.h
+++ b/src/exa_wm_nomask_prog.h
@@ -1,51 +1,15 @@
-   { 0x00000001, 0x2080013d, 0x00000028, 0x00000000 },
-   { 0x00000040, 0x20840d3d, 0x00000028, 0x00000001 },
-   { 0x00000001, 0x2088013d, 0x00000028, 0x00000000 },
-   { 0x00000040, 0x208c0d3d, 0x00000028, 0x00000001 },
-   { 0x00000001, 0x20c0013d, 0x0000002a, 0x00000000 },
-   { 0x00000001, 0x20c4013d, 0x0000002a, 0x00000000 },
-   { 0x00000040, 0x20c80d3d, 0x0000002a, 0x00000001 },
-   { 0x00000040, 0x20cc0d3d, 0x0000002a, 0x00000001 },
-   { 0x00000001, 0x2090013d, 0x0000002c, 0x00000000 },
-   { 0x00000040, 0x20940d3d, 0x0000002c, 0x00000001 },
-   { 0x00000001, 0x2098013d, 0x0000002c, 0x00000000 },
-   { 0x00000040, 0x209c0d3d, 0x0000002c, 0x00000001 },
-   { 0x00000001, 0x20d0013d, 0x0000002e, 0x00000000 },
-   { 0x00000001, 0x20d4013d, 0x0000002e, 0x00000000 },
-   { 0x00000040, 0x20d80d3d, 0x0000002e, 0x00000001 },
-   { 0x00000040, 0x20dc0d3d, 0x0000002e, 0x00000001 },
-   { 0x00000001, 0x20a0013d, 0x00000030, 0x00000000 },
-   { 0x00000040, 0x20a40d3d, 0x00000030, 0x00000001 },
-   { 0x00000001, 0x20a8013d, 0x00000030, 0x00000000 },
-   { 0x00000040, 0x20ac0d3d, 0x00000030, 0x00000001 },
-   { 0x00000001, 0x20e0013d, 0x00000032, 0x00000000 },
-   { 0x00000001, 0x20e4013d, 0x00000032, 0x00000000 },
-   { 0x00000040, 0x20e80d3d, 0x00000032, 0x00000001 },
-   { 0x00000040, 0x20ec0d3d, 0x00000032, 0x00000001 },
-   { 0x00000001, 0x20b0013d, 0x00000034, 0x00000000 },
-   { 0x00000040, 0x20b40d3d, 0x00000034, 0x00000001 },
-   { 0x00000001, 0x20b8013d, 0x00000034, 0x00000000 },
-   { 0x00000040, 0x20bc0d3d, 0x00000034, 0x00000001 },
-   { 0x00000001, 0x20f0013d, 0x00000036, 0x00000000 },
-   { 0x00000001, 0x20f4013d, 0x00000036, 0x00000000 },
-   { 0x00000040, 0x20f80d3d, 0x00000036, 0x00000001 },
-   { 0x00000040, 0x20fc0d3d, 0x00000036, 0x00000001 },
-   { 0x00600040, 0x208077bd, 0x008d0080, 0x00004020 },
-   { 0x00600040, 0x20a077bd, 0x008d00a0, 0x00004020 },
-   { 0x00600041, 0x208077bd, 0x008d0080, 0x00000060 },
-   { 0x00600041, 0x20a077bd, 0x008d00a0, 0x00000060 },
-   { 0x00600040, 0x208077bd, 0x008d0080, 0x0000006c },
-   { 0x00600040, 0x20a077bd, 0x008d00a0, 0x0000006c },
-   { 0x00600040, 0x20c077bd, 0x008d00c0, 0x00004024 },
-   { 0x00600040, 0x20e077bd, 0x008d00e0, 0x00004024 },
-   { 0x00600041, 0x20c077bd, 0x008d00c0, 0x00000074 },
-   { 0x00600041, 0x20e077bd, 0x008d00e0, 0x00000074 },
-   { 0x00600040, 0x20c077bd, 0x008d00c0, 0x0000007c },
-   { 0x00600040, 0x20e077bd, 0x008d00e0, 0x0000007c },
-   { 0x00600001, 0x202003be, 0x008d0080, 0x00000000 },
-   { 0x00600001, 0x204003be, 0x008d00a0, 0x00000000 },
-   { 0x00600001, 0x206003be, 0x008d00c0, 0x00000000 },
-   { 0x00600001, 0x208003be, 0x008d00e0, 0x00000000 },
+   { 0x00800040, 0x20806d29, 0x00480028, 0x10101010 },
+   { 0x00800040, 0x20c06d29, 0x0048002a, 0x11001100 },
+   { 0x00802040, 0x2180753d, 0x008d0080, 0x00004020 },
+   { 0x00802040, 0x2200753d, 0x008d00c0, 0x00004024 },
+   { 0x00802041, 0x210077bd, 0x008d0180, 0x00000060 },
+   { 0x00802041, 0x214077bd, 0x008d0200, 0x00000064 },
+   { 0x00802040, 0x210077bd, 0x008d0100, 0x008d0140 },
+   { 0x00802040, 0x202077be, 0x008d0100, 0x0000006c },
+   { 0x00802041, 0x210077bd, 0x008d0180, 0x00000070 },
+   { 0x00802041, 0x214077bd, 0x008d0200, 0x00000074 },
+   { 0x00802040, 0x210077bd, 0x008d0100, 0x008d0140 },
+   { 0x00802040, 0x206077be, 0x008d0100, 0x0000007c },
    { 0x00800031, 0x21801d29, 0x008d0000, 0x02580001 },
    { 0x00600001, 0x22600021, 0x008d0260, 0x00000000 },
    { 0x00600001, 0x202003be, 0x008d0020, 0x00000000 },
diff --git a/src/i965_render.c b/src/i965_render.c
index 93583b0..26c06aa 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -318,7 +318,7 @@ static const uint32_t sip_kernel_static[][4] = {
  */
 
 #define SF_KERNEL_NUM_GRF  16
-#define SF_MAX_THREADS	   1
+#define SF_MAX_THREADS	   2
 
 static const uint32_t sf_kernel_static[][4] = {
 #include "exa_sf_prog.h"
@@ -328,10 +328,6 @@ static const uint32_t sf_kernel_static_mask[][4] = {
 #include "exa_sf_mask_prog.h"
 };
 
-static const uint32_t sf_kernel_static_rotation[][4] = {
-#include "exa_sf_rotation_prog.h"
-};
-
 /* ps kernels */
 #define PS_KERNEL_NUM_GRF   32
 #define PS_MAX_THREADS	   32
@@ -352,10 +348,6 @@ static const uint32_t ps_kernel_static_masknoca [][4] = {
 #include "exa_wm_masknoca_prog.h"
 };
 
-static const uint32_t ps_kernel_static_rotation [][4] = {
-#include "exa_wm_rotation_prog.h"
-};
-
 static uint32_t 
 i965_get_card_format(PicturePtr pPict)
 {
@@ -370,21 +362,6 @@ i965_get_card_format(PicturePtr pPict)
     return i965_tex_formats[i].card_fmt;
 }
 
-static Bool
-i965_check_rotation_transform(PictTransformPtr t)
-{
-    /* XXX this is arbitrary */
-    int a, b;
-    a = xFixedToInt(t->matrix[0][1]);
-    b = xFixedToInt(t->matrix[1][0]);
-    if (a == -1 && b == 1)
-	return TRUE;
-    else if (a == 1 && b == -1)
-	return TRUE;
-    else
-	return FALSE;
-}
-
 Bool
 i965_prepare_composite(int op, PicturePtr pSrcPicture,
 		       PicturePtr pMaskPicture, PicturePtr pDstPicture,
@@ -397,7 +374,6 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
 	mask_tiled = 0;
     uint32_t dst_format, dst_offset, dst_pitch, dst_tile_format = 0,
 	dst_tiled = 0;
-    Bool rotation_program = FALSE;
 
     IntelEmitInvarientState(pScrn);
     *pI830->last_3d = LAST_3D_RENDER;
@@ -431,9 +407,6 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
 	pI830->transform[1] = NULL;
 	pI830->scale_units[1][0] = -1;
 	pI830->scale_units[1][1] = -1;
-	if (pI830->transform[0] && 
-		i965_check_rotation_transform(pI830->transform[0]))
-	    rotation_program = TRUE;
     } else {
 	pI830->transform[1] = pMaskPicture->transform;
 	if (pI830->transform[1])
@@ -469,8 +442,6 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
     sf_kernel_offset = ALIGN(next_offset, 64);
     if (pMask)
 	next_offset = sf_kernel_offset + sizeof (sf_kernel_static_mask);
-    else if (rotation_program)
-	next_offset = sf_kernel_offset + sizeof (sf_kernel_static_rotation);
     else 
 	next_offset = sf_kernel_offset + sizeof (sf_kernel_static);
 
@@ -488,8 +459,6 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
         } else
 	    next_offset = ps_kernel_offset + 
                           sizeof(ps_kernel_static_masknoca);
-    } else if (rotation_program) {
-   	next_offset = ps_kernel_offset + sizeof (ps_kernel_static_rotation);
     } else {
    	next_offset = ps_kernel_offset + sizeof (ps_kernel_static_nomask);
     }
@@ -816,9 +785,6 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
     if (pMask)
 	memcpy(sf_kernel, sf_kernel_static_mask,
 		sizeof (sf_kernel_static_mask));
-    else if (rotation_program)
-	memcpy(sf_kernel, sf_kernel_static_rotation, 
-		sizeof (sf_kernel_static_rotation));
     else
 	memcpy(sf_kernel, sf_kernel_static, sizeof (sf_kernel_static));
 
@@ -870,9 +836,6 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
         } else
    	    memcpy(ps_kernel, ps_kernel_static_masknoca,
 		   sizeof (ps_kernel_static_masknoca));
-    } else if (rotation_program) {
-   	memcpy(ps_kernel, ps_kernel_static_rotation,
-	       sizeof (ps_kernel_static_rotation));
     } else {
    	memcpy(ps_kernel, ps_kernel_static_nomask,
 	       sizeof (ps_kernel_static_nomask));
@@ -883,7 +846,7 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
     wm_state->thread0.kernel_start_pointer =
 	(state_base_offset + ps_kernel_offset) >> 6;
     wm_state->thread0.grf_reg_count = BRW_GRF_BLOCKS(PS_KERNEL_NUM_GRF);
-    wm_state->thread1.single_program_flow = 1;
+    wm_state->thread1.single_program_flow = 0;
     if (!pMask)
 	wm_state->thread1.binding_table_entry_count = 2; /* 1 tex and fb */
     else
commit 90dd1058b29a33d234ee56b3f21716a6f4b2302e
Author: Keith Packard <keithp at keithp.com>
Date:   Sun Mar 30 00:54:07 2008 -0700

    Clarify comment in exa_sf program
    (cherry picked from commit 771a56b1ed0df69345c723cb62a73b6842cd8227)

diff --git a/src/exa_sf.g4a b/src/exa_sf.g4a
index 4da5eba..3e660ac 100644
--- a/src/exa_sf.g4a
+++ b/src/exa_sf.g4a
@@ -80,7 +80,7 @@
 /* Compute inverses of the input deltas */
 send (4) 0 g6<1>F g1.12<4,4,1>F math inv mlen 1 rlen 1 { align1 };
 
-/* texture value at V0 */
+/* texture location at V0 */
 mov (4) m3<1>F g3<4,4,1>F { align1 };
 
 /* compute V1 - V2 (motion in X) for texture coordinates */
commit 278b0aeb84b5e69142fa3a2c093db4c4642f32ad
Author: Keith Packard <keithp at keithp.com>
Date:   Sat Mar 29 14:28:05 2008 -0700

    Compute du/dv/dw in no-mask SF prog
    (cherry picked from commit 4f469189fed541549e5d470b2529275a29cc2f20)

diff --git a/src/exa_sf.g4a b/src/exa_sf.g4a
index 5a02399..4da5eba 100644
--- a/src/exa_sf.g4a
+++ b/src/exa_sf.g4a
@@ -26,14 +26,76 @@
  *
  */
 
-send (1) 0 g6<1>F g1.12<0,1,0>F math inv scalar mlen 1 rlen 1 { align1 };
-send (1) 0 g6.4<1>F g1.20<0,1,0>F math inv scalar mlen 1 rlen 1 { align1 };
-add (8) g7<1>F g4<8,8,1>F -g3<8,8,1>F { align1 };
-mul (1) g7<1>F g7<0,1,0>F g6<0,1,0>F { align1 };
-mul (1) g7.4<1>F g7.4<0,1,0>F g6.4<0,1,0>F { align1 };
-mov (8) m1<1>F g7<0,1,0>F { align1 };
-mov (8) m2<1>F g7.4<0,1,0>F { align1 };
-mov (8) m3<1>F g3<8,8,1>F { align1 };
+/*
+ * Inputs (note all sub-register addresses are bytes, not float indices)
+ *
+ * Note that the vertices will have been reordered:
+ *
+ * V0 is topmost (leftmost among topmost) (upper left)
+ * V1 is next clockwise (lower right)
+ * V2 is remaining (lower left)
+ *
+ *  V0 ...................... XX
+ *  |                          .
+ *  |                          .
+ *  |                          .
+ *  V2------------------------V1
+ *
+ *  G0	    thread state -- just pass along
+ *
+ *  G1 and G2 are fixed by SF spec
+ *
+ *  G1.0    reserved
+ *  G1.4    Provoking vertex
+ *  G1.8    Determinant
+ *  G1.12   X1 - X0
+ *  G1.16   X2 - X0
+ *  G1.20   Y1 - Y0
+ *  G1.24   Y2 - Y0
+ *  G1.28   reserved
+ *
+ *  G2.0    Z0
+ *  G2.4    1/W0
+ *  G2.8    Z1
+ *  G2.12   1/W1
+ *  G2.16   Z2
+ *  G2.20   1/W2
+ *  G2.24   reserved
+ *  G2.28   reserved
+ *
+ *  G3 is V0 Vertex Attribute Data from URB (upper left)
+ *
+ *  G3.0    u0
+ *  G3.4    v0
+ *
+ *  G4 is V1 Vertex Attribute Data from URB (lower right)
+ *
+ *  G4.0    u1
+ *  G4.4    v1
+ *
+ *  G5 is V2 Vertex Attribute Data from URB (lower left)
+ *
+ */
+
+/* Compute inverses of the input deltas */
+send (4) 0 g6<1>F g1.12<4,4,1>F math inv mlen 1 rlen 1 { align1 };
+
+/* texture value at V0 */
+mov (4) m3<1>F g3<4,4,1>F { align1 };
+
+/* compute V1 - V2 (motion in X) for texture coordinates */
+add (4) g7<1>F g4<4,4,1>F -g5<4,4,1>F { align1 };
+
+/* multiply by 1/dx */
+mul (4) m1<1>F g7<4,4,1>F g6.0<0,1,0>F { align1 };
+
+/* Compute V2 - V0 (motion in Y) for texture coordinates */
+add (4) g7<1>F g5<4,4,1>F -g3<4,4,1>F { align1 };
+
+/* multiply by 1/dy */
+mul (4) m2<1>F g7<4,4,1>F g6.8<0,1,0>F {align1 };
+
+/* and we're done */
 send (8) 0 null g0<8,8,1>F urb 0 transpose used complete mlen 4 rlen 0 { align1 EOT };
 nop;
 nop;
diff --git a/src/exa_sf_prog.h b/src/exa_sf_prog.h
index 830d176..223c9c9 100644
--- a/src/exa_sf_prog.h
+++ b/src/exa_sf_prog.h
@@ -1,11 +1,9 @@
-   { 0x00000031, 0x20c01fbd, 0x0000002c, 0x01110081 },
-   { 0x00000031, 0x20c41fbd, 0x00000034, 0x01110081 },
-   { 0x00600040, 0x20e077bd, 0x008d0080, 0x008d4060 },
-   { 0x00000041, 0x20e077bd, 0x000000e0, 0x000000c0 },
-   { 0x00000041, 0x20e477bd, 0x000000e4, 0x000000c4 },
-   { 0x00600001, 0x202003be, 0x000000e0, 0x00000000 },
-   { 0x00600001, 0x204003be, 0x000000e4, 0x00000000 },
-   { 0x00600001, 0x206003be, 0x008d0060, 0x00000000 },
+   { 0x00400031, 0x20c01fbd, 0x0069002c, 0x01110001 },
+   { 0x00400001, 0x206003be, 0x00690060, 0x00000000 },
+   { 0x00400040, 0x20e077bd, 0x00690080, 0x006940a0 },
+   { 0x00400041, 0x202077be, 0x006900e0, 0x000000c0 },
+   { 0x00400040, 0x20e077bd, 0x006900a0, 0x00694060 },
+   { 0x00400041, 0x204077be, 0x006900e0, 0x000000c8 },
    { 0x00600031, 0x20001fbc, 0x008d0000, 0x8640c800 },
    { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
    { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },


More information about the xorg-commit mailing list