xf86-video-intel: Branch 'projective-965' - src/exa_sf.g4b src/exa_sf_mask.g4a src/exa_sf_mask.g4b src/exa_wm_affine.g4i src/exa_wm_ca.g4a src/exa_wm_ca.g4b src/exa_wm_ca_srcalpha.g4a src/exa_wm_ca_srcalpha.g4b src/exa_wm.g4i src/exa_wm_mask_affine.g4a src/exa_wm_mask_affine.g4b src/exa_wm_maskca.g4a src/exa_wm_maskca.g4b src/exa_wm_maskca_srcalpha.g4a src/exa_wm_maskca_srcalpha.g4b src/exa_wm_masknoca.g4a src/exa_wm_masknoca.g4b src/exa_wm_mask_projective.g4a src/exa_wm_mask_projective.g4b src/exa_wm_mask_sample.g4a src/exa_wm_mask_sample.g4b src/exa_wm_noca.g4a src/exa_wm_noca.g4b src/exa_wm_projective.g4i src/exa_wm_src_affine.g4a src/exa_wm_src_affine.g4b src/exa_wm_src_projective.g4a src/exa_wm_src_projective.g4b src/exa_wm_src_sample.g4a src/exa_wm_src_sample.g4b src/exa_wm_write.g4a src/exa_wm_write.g4b src/exa_wm_xy.g4a src/exa_wm_xy.g4b src/i965_render.c src/Makefile.am src/packed_yuv_sf.g4b src/packed_yuv_wm.g4a src/packed_yuv_wm.g4b

Keith Packard keithp at kemper.freedesktop.org
Mon Mar 31 02:21:57 PDT 2008


 src/Makefile.am                |   96 ++++++++------
 src/exa_sf.g4b                 |   15 ++
 src/exa_sf_mask.g4a            |  104 +++++-----------
 src/exa_sf_mask.g4b            |   25 +++
 src/exa_wm.g4i                 |  119 ++++++++++++++++++
 src/exa_wm_affine.g4i          |   45 ++++++
 src/exa_wm_ca.g4a              |   38 +++++
 src/exa_wm_ca.g4b              |    4 
 src/exa_wm_ca_srcalpha.g4a     |   38 +++++
 src/exa_wm_ca_srcalpha.g4b     |    4 
 src/exa_wm_mask_affine.g4a     |   37 +++++
 src/exa_wm_mask_affine.g4b     |    8 +
 src/exa_wm_mask_projective.g4a |   48 +++++++
 src/exa_wm_mask_projective.g4b |   16 ++
 src/exa_wm_mask_sample.g4a     |   49 +++++++
 src/exa_wm_mask_sample.g4b     |    1 
 src/exa_wm_maskca.g4a          |   32 ++--
 src/exa_wm_maskca.g4b          |   95 ++++++++++++++
 src/exa_wm_maskca_srcalpha.g4a |   32 ++--
 src/exa_wm_maskca_srcalpha.g4b |   95 ++++++++++++++
 src/exa_wm_masknoca.g4a        |   32 ++--
 src/exa_wm_masknoca.g4b        |   95 ++++++++++++++
 src/exa_wm_noca.g4a            |   38 +++++
 src/exa_wm_noca.g4b            |    4 
 src/exa_wm_projective.g4i      |   51 +++++++
 src/exa_wm_src_affine.g4a      |   41 ++++++
 src/exa_wm_src_affine.g4b      |    8 +
 src/exa_wm_src_projective.g4a  |   45 ++++++
 src/exa_wm_src_projective.g4b  |   16 ++
 src/exa_wm_src_sample.g4a      |   49 +++++++
 src/exa_wm_src_sample.g4b      |    1 
 src/exa_wm_write.g4a           |   80 ++++++++++++
 src/exa_wm_write.g4b           |   20 +++
 src/exa_wm_xy.g4a              |   52 ++++++++
 src/exa_wm_xy.g4b              |    4 
 src/i965_render.c              |  265 ++++++++++++++++++++++++++++++-----------
 src/packed_yuv_sf.g4b          |   17 ++
 src/packed_yuv_wm.g4a          |   32 ++--
 src/packed_yuv_wm.g4b          |   82 ++++++++++++
 39 files changed, 1592 insertions(+), 241 deletions(-)

New commits:
commit 08500507284f13ad7084eb231b43e117e9728129
Author: Keith Packard <keithp at keithp.com>
Date:   Mon Mar 31 02:20:43 2008 -0700

    Use m4 to clean up gen4 asm progs. Start adding projective transform support.
    
    Use macros for register names, modularize functions into separate files.

diff --git a/src/Makefile.am b/src/Makefile.am
index 7df69b6..81d9596 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -136,58 +136,66 @@ INTEL_G4A =				\
 	exa_wm_maskca_srcalpha.g4a 	\
 	exa_wm_masknoca.g4a 		\
 	exa_wm_nomask.g4a		\
-	exa_wm_rotation.g4a
-
-INTEL_G4H = 				\
-	sf_prog.h			\
-	wm_prog.h 			\
-	exa_sf_mask_prog.h		\
-	exa_sf_prog.h 			\
-	exa_sf_rotation_prog.h		\
-	exa_wm_maskca_prog.h		\
-	exa_wm_maskca_srcalpha_prog.h	\
-	exa_wm_masknoca_prog.h		\
-	exa_wm_nomask_prog.h		\
-	exa_wm_rotation_prog.h
-
+	exa_wm_rotation.g4a		\
+	exa_wm_src_affine.g4a 		\
+	exa_wm_src_projective.g4a 	\
+	exa_wm_src_sample.g4a 		\
+	exa_wm_mask_affine.g4a 		\
+	exa_wm_mask_projective.g4a 	\
+	exa_wm_mask_sample.g4a 		\
+	exa_wm_noca.g4a			\
+	exa_wm_ca.g4a			\
+	exa_wm_ca_srcalpha.g4a		\
+	exa_wm_write.g4a 		\
+	exa_wm_xy.g4a
+
+INTEL_G4I =				\
+	exa_wm.g4i			\
+	exa_wm_affine.g4i		\
+	exa_wm_projective.g4i
+
+INTEL_G4B = 				\
+	packed_yuv_sf.g4b		\
+	packed_yuv_wm.g4b 		\
+	exa_sf_mask.g4b			\
+	exa_sf.g4b 			\
+	exa_sf_rotation.g4b		\
+	exa_wm_maskca.g4b		\
+	exa_wm_maskca_srcalpha.g4b	\
+	exa_wm_masknoca.g4b		\
+	exa_wm_nomask.g4b		\
+	exa_wm_rotation.g4b		\
+	exa_wm_maskca.g4b 		\
+	exa_wm_maskca_srcalpha.g4b 	\
+	exa_wm_masknoca.g4b 		\
+	exa_wm_nomask.g4b		\
+	exa_wm_rotation.g4b		\
+	exa_wm_src_affine.g4b 		\
+	exa_wm_src_projective.g4b 	\
+	exa_wm_src_sample.g4b 		\
+	exa_wm_mask_affine.g4b 		\
+	exa_wm_mask_projective.g4b 	\
+	exa_wm_mask_sample.g4b 		\
+	exa_wm_noca.g4b			\
+	exa_wm_ca.g4b			\
+	exa_wm_ca_srcalpha.g4b		\
+	exa_wm_write.g4b 		\
+	exa_wm_xy.g4b
+	
 EXTRA_DIST = 		\
 	$(XMODE_SRCS)	\
 	$(INTEL_G4A)	\
-	$(INTEL_G4H)	\
+	$(INTEL_G4I)	\
+	$(INTEL_G4B)	\
 	$(INTEL_DRI_SRCS) \
 	$(INTEL_XVMC_SRCS)
 
 if HAVE_GEN4ASM
 
-sf_prog.h: packed_yuv_sf.g4a
-	intel-gen4asm -o sf_prog.h packed_yuv_sf.g4a
-
-wm_prog.h: packed_yuv_wm.g4a
-	intel-gen4asm -o wm_prog.h packed_yuv_wm.g4a
-
-exa_sf_mask_prog.h: exa_sf_mask.g4a
-	intel-gen4asm -o exa_sf_mask_prog.h exa_sf_mask.g4a
-
-exa_sf_prog.h: exa_sf.g4a
-	intel-gen4asm -o exa_sf_prog.h exa_sf.g4a
-
-exa_sf_rotation_prog.h: exa_sf_rotation.g4a
-	intel-gen4asm -o exa_sf_rotation_prog.h exa_sf_rotation.g4a
-
-exa_wm_maskca_prog.h: exa_wm_maskca.g4a
-	intel-gen4asm -o exa_wm_maskca_prog.h exa_wm_maskca.g4a
-
-exa_wm_maskca_srcalpha_prog.h: exa_wm_maskca_srcalpha.g4a
-	intel-gen4asm -o exa_wm_maskca_srcalpha_prog.h exa_wm_maskca_srcalpha.g4a
-
-exa_wm_masknoca_prog.h: exa_wm_masknoca.g4a
-	intel-gen4asm -o exa_wm_masknoca_prog.h exa_wm_masknoca.g4a
-
-exa_wm_nomask_prog.h: exa_wm_nomask.g4a
-	intel-gen4asm -o exa_wm_nomask_prog.h exa_wm_nomask.g4a
-
-exa_wm_rotation_prog.h: exa_wm_rotation.g4a
-	intel-gen4asm -o exa_wm_rotation_prog.h exa_wm_rotation.g4a
+# Run each .g4a through m4 (-I$(srcdir) locates the .g4i includes), then assemble.
+SUFFIXES = .g4a .g4b
+.g4a.g4b:
+	m4 -I$(srcdir) -s $< > $*.g4m && intel-gen4asm -o $@ $*.g4m && rm $*.g4m
 
 endif
 
diff --git a/src/exa_sf.g4b b/src/exa_sf.g4b
new file mode 100644
index 0000000..223c9c9
--- /dev/null
+++ b/src/exa_sf.g4b
@@ -0,0 +1,15 @@
+   { 0x00400031, 0x20c01fbd, 0x0069002c, 0x01110001 },
+   { 0x00400001, 0x206003be, 0x00690060, 0x00000000 },
+   { 0x00400040, 0x20e077bd, 0x00690080, 0x006940a0 },
+   { 0x00400041, 0x202077be, 0x006900e0, 0x000000c0 },
+   { 0x00400040, 0x20e077bd, 0x006900a0, 0x00694060 },
+   { 0x00400041, 0x204077be, 0x006900e0, 0x000000c8 },
+   { 0x00600031, 0x20001fbc, 0x008d0000, 0x8640c800 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
diff --git a/src/exa_sf_mask.g4a b/src/exa_sf_mask.g4a
index c830fd8..a0d6efc 100644
--- a/src/exa_sf_mask.g4a
+++ b/src/exa_sf_mask.g4a
@@ -21,82 +21,52 @@
  * IN THE SOFTWARE.
  *
  * Authors:
- *    Keith Packard <keithp at keithp.com>
- *    Eric Anholt <eric at anholt.net>
  *    Wang Zhenyu <zhenyu.z.wang at intel.com>
  */
 
+/* FIXME: how to set up the second coefficient for the mask tex coord */
 
-/*
- * Inputs (note all sub-register addresses are bytes, not float indices)
- *
- * Note that the vertices will have been reordered:
- *
- * V0 is topmost (leftmost among topmost) (upper left)
- * V1 is next clockwise (lower right)
- * V2 is remaining (lower left)
- *
- *  V0 ...................... XX
- *  |                          .
- *  |                          .
- *  |                          .
- *  V2------------------------V1
- *
- *  G0	    thread state -- just pass along
- *
- *  G1 and G2 are fixed by SF spec
- *
- *  G1.0    reserved
- *  G1.4    Provoking vertex
- *  G1.8    Determinant
- *  G1.12   X1 - X0
- *  G1.16   X2 - X0
- *  G1.20   Y1 - Y0
- *  G1.24   Y2 - Y0
- *  G1.30   reserved
- *
- *  G2.0    Z0
- *  G2.4    1/W0
- *  G2.8    Z1
- *  G2.12   1/W1
- *  G2.16   Z2
- *  G2.20   1/W2
- *  G2.24   reserved
- *  G2.30   reserved
- *
- *  G3 is V0 Vertex Attribute Data from URB (upper left)
- *
- *  G3.0    u0
- *  G3.4    v0
- *
- *  G4 is V1 Vertex Attribute Data from URB (lower right)
- *
- *  G4.0    u1
- *  G4.4    v1
- *
- *  G5 is V2 Vertex Attribute Data from URB (lower left)
- *
+/* 
+   g3 (v0) { u0, v0, 1.0, 1.0 }  ==> {u0, v0, 1.0, 1.0, mu0, mv0, 1.0, 1.0}  Co[0](u0) Co[1](v0) Co[2](mu0) Co[3](mv0)
+   g4 (v1) { u1, v1, 1.0, 1.0 }  ==> {u1, v1, 1.0, 1.0, mu1, mv1, 1.0, 1.0}
+   g5 (v2) { u2, v2 }  ==> {u2, v2, mu2, mv2}
+   g6      { 1/(x1-x0), 1/(y1-y0) }
+   g7      { u1-u0, v1-v0, 0, 0}  ==>{u1-u0, v1-v0,0, 0, mu1-mu0, mv1-mv0, 0, 0}
+	   -> { (u1-u0)/(x1-x0), (v1-v0)/(y1-y0) }  ==>{(u1-u0)/(x1-x0), (v1-v0)/(y1-y0),(mu1-mu0)/(x1-x0), (mv1-mv0)/(y1-y0)
+		Cx,		 Cy 			Cx[0],		 Cy[0],		 Cx[1], 	    Cy[1]
  */
 
-/* Compute inverses of the input deltas */
-send (4) 0 g6<1>F g1.12<4,4,1>F math inv mlen 1 rlen 1 { align1 };
+/* assign Cx[0], Cx[1] to src, same to Cy, Co 
+          Cx[2], Cx[3] to mask, same to Cy, Co */
 
-/* texture location at V0 */
-mov (8) m3<1>F g3<8,8,1>F { align1 };
+send (1) 0 g6<1>F g1.12<0,1,0>F math inv scalar mlen 1 rlen 1 { align1 };
+send (1) 0 g6.4<1>F g1.20<0,1,0>F math inv scalar mlen 1 rlen 1 { align1 };
+add (8) g7<1>F g4<8,8,1>F -g3<8,8,1>F { align1 };
+/* Cx[0] */
+mul (1) g7<1>F g7<0,1,0>F g6<0,1,0>F { align1 };
+/* Cy[0] */
+mul (1) g7.4<1>F g7.4<0,1,0>F g6.4<0,1,0>F { align1 };
+/* Cx[2] */
+mul (1) g7.8<1>F g7.8<0,1,0>F g6<0,1,0>F { align1 };
+/* Cy[2] */
+mul (1) g7.12<1>F g7.12<0,1,0>F g6.4<0,1,0>F { align1 };
 
-/* compute V1 - V2 (motion in X) for texture coordinates */
-add (8) g7<1>F g4<8,8,1>F -g5<8,8,1>F { align1 };
-
-/* multiply by 1/dx */
-mul (8) m1<1>F g7<8,8,1>F g6.0<0,1,0>F { align1 };
-
-/* Compute V2 - V0 (motion in Y) for texture coordinates */
-add (8) g7<1>F g5<8,8,1>F -g3<8,8,1>F { align1 };
-
-/* multiply by 1/dy */
-mul (8) m2<1>F g7<8,8,1>F g6.8<0,1,0>F {align1 };
+/* src Cx[0], Cx[1] */
+mov (8) m1<1>F g7<0,1,0>F { align1 };
+/* mask Cx[2], Cx[3] */
+mov (1) m1.8<1>F g7.8<0,1,0>F { align1 };
+mov (1) m1.12<1>F g7.8<0,1,0>F { align1 };
+/* src Cy[0], Cy[1] */
+mov (8) m2<1>F g7.4<0,1,0>F { align1 };
+/* mask Cy[2], Cy[3] */
+mov (1) m2.8<1>F g7.12<0,1,0>F { align1 };
+mov (1) m2.12<1>F g7.12<0,1,0>F { align1 };
+/* src Co[0], Co[1] */
+mov (8) m3<1>F g3<8,8,1>F { align1 };
+/* mask Co[2], Co[3] */
+mov (1) m3.8<1>F g3.8<0,1,0>F { align1 };
+mov (1) m3.12<1>F g3.12<0,1,0>F { align1 };
 
-/* and we're done */
 send (8) 0 null g0<8,8,1>F urb 0 transpose used complete mlen 4 rlen 0 { align1 EOT };
 nop;
 nop;
diff --git a/src/exa_sf_mask.g4b b/src/exa_sf_mask.g4b
new file mode 100644
index 0000000..4e9114d
--- /dev/null
+++ b/src/exa_sf_mask.g4b
@@ -0,0 +1,25 @@
+   { 0x00000031, 0x20c01fbd, 0x0000002c, 0x01110081 },
+   { 0x00000031, 0x20c41fbd, 0x00000034, 0x01110081 },
+   { 0x00600040, 0x20e077bd, 0x008d0080, 0x008d4060 },
+   { 0x00000041, 0x20e077bd, 0x000000e0, 0x000000c0 },
+   { 0x00000041, 0x20e477bd, 0x000000e4, 0x000000c4 },
+   { 0x00000041, 0x20e877bd, 0x000000e8, 0x000000c0 },
+   { 0x00000041, 0x20ec77bd, 0x000000ec, 0x000000c4 },
+   { 0x00600001, 0x202003be, 0x000000e0, 0x00000000 },
+   { 0x00000001, 0x202803be, 0x000000e8, 0x00000000 },
+   { 0x00000001, 0x202c03be, 0x000000e8, 0x00000000 },
+   { 0x00600001, 0x204003be, 0x000000e4, 0x00000000 },
+   { 0x00000001, 0x204803be, 0x000000ec, 0x00000000 },
+   { 0x00000001, 0x204c03be, 0x000000ec, 0x00000000 },
+   { 0x00600001, 0x206003be, 0x008d0060, 0x00000000 },
+   { 0x00000001, 0x206803be, 0x00000068, 0x00000000 },
+   { 0x00000001, 0x206c03be, 0x0000006c, 0x00000000 },
+   { 0x00600031, 0x20001fbc, 0x008d0000, 0x8640c800 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
diff --git a/src/exa_wm.g4i b/src/exa_wm.g4i
new file mode 100644
index 0000000..c7ecb09
--- /dev/null
+++ b/src/exa_wm.g4i
@@ -0,0 +1,119 @@
+/*
+ * Copyright © 2006 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Wang Zhenyu <zhenyu.z.wang at intel.com>
+ *    Keith Packard <keithp at keithp.com>
+ */
+ 
+/*
+ * Input parameters
+ */
+
+define(`quote', `ifelse(`$#', `0', `', ``$*'')')
+
+/* Destination X/Y */
+define(`dst_x_uw',  `g1.8<2,4,0>UW')
+define(`dst_y_uw',  `g1.10<2,4,0>UW')
+define(`screen_x0', `g1.0<0,1,0>F')
+define(`screen_y0', `g1.4<0,1,0>F')
+
+/* Source transformation parameters */
+define(`src_du_dx', `g3.0<0,1,0>F')
+define(`src_du_dy', `g3.4<0,1,0>F')
+define(`src_uo',    `g3.12<0,1,0>F')
+define(`src_dv_dx', `g3.16<0,1,0>F')
+define(`src_dv_dy', `g3.20<0,1,0>F')
+define(`src_vo',    `g3.28<0,1,0>F')
+define(`src_dw_dx', `g4.0<0,1,0>F')
+define(`src_dw_dy', `g4.4<0,1,0>F')
+define(`src_wo',    `g4.12<0,1,0>F')
+
+define(`mask_du_dx', `g4.16<0,1,0>F')
+define(`mask_du_dy', `g4.20<0,1,0>F')
+define(`mask_uo',    `g4.28<0,1,0>F')
+define(`mask_dv_dx', `g5.0<0,1,0>F')
+define(`mask_dv_dy', `g5.4<0,1,0>F')
+define(`mask_vo',    `g5.12<0,1,0>F')
+define(`mask_dw_dx', `g5.16<0,1,0>F')
+define(`mask_dw_dy', `g5.20<0,1,0>F')
+define(`mask_wo',    `g5.28<0,1,0>F')
+
+/*
+ * Local variables
+ */
+
+/* this holds the X dest coordinates */
+define(`dst_x',	    `g8')
+define(`dst_x_0',   `dst_x')
+define(`dst_x_1',   `g9')
+
+/* this holds the Y dest coordinates */
+define(`dst_y',	    `g10')
+define(`dst_y_0',   `dst_y')
+define(`dst_y_1',   `g11')
+
+/* When computing x * dn/dx, use this */
+define(`temp_x',    `g12')
+define(`temp_x_0',  `temp_x')
+define(`temp_x_1',  `g13')
+
+/* When computing y * dn/dy, use this */
+define(`temp_y',    `g14')
+define(`temp_y_0',  temp_y)
+define(`temp_y_1',  `g15')
+
+/* when loading x/y, use these to hold them in UW format */
+define(`temp_x_uw', temp_x)
+define(`temp_y_uw', temp_y)
+
+/* compute source and mask u/v to this pair to send to sampler */
+define(`src_u',	    `m1')
+define(`src_v',	    `m3')
+define(`mask_u',    src_u)
+define(`mask_v',    src_v)
+define(`src_w',	    `g16')
+define(`src_w_0',   src_w)
+define(`src_w_1',   `g17')
+define(`mask_w',    src_w)
+define(`mask_w_0',  src_w_0)
+define(`mask_w_1',  src_w_1)
+
+/* sample src to these registers */
+define(`src_sample0',	`g18')
+define(`src_sample1',	`g19')
+define(`src_sample2',	`g20')
+define(`src_sample3',	`g21')
+define(`src_sample4',	`g22')
+define(`src_sample5',	`g23')
+define(`src_sample6',	`g24')
+define(`src_sample7',	`g25')
+
+/* sample mask to these registers */
+define(`mask_sample0',	`g26')
+define(`mask_sample1',	`g27')
+define(`mask_sample2',	`g28')
+define(`mask_sample3',	`g29')
+define(`mask_sample4',	`g30')
+define(`mask_sample5',	`g31')
+define(`mask_sample6',	`g32')
+define(`mask_sample7',	`g33')
diff --git a/src/exa_wm_affine.g4i b/src/exa_wm_affine.g4i
new file mode 100644
index 0000000..8fc6450
--- /dev/null
+++ b/src/exa_wm_affine.g4i
@@ -0,0 +1,45 @@
+/*
+ * Copyright © 2006 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Wang Zhenyu <zhenyu.z.wang at intel.com>
+ *    Keith Packard <keithp at keithp.com>
+ */
+
+/*
+ * Fragment to compute src u/v values under an affine transform
+ */
+
+/********** Compute u *************/
+
+mul (16)	temp_x<1>F	dst_x<8,8,1>F	du_dx		{ compr align1 };
+mul (16)	temp_y<1>F	dst_y<8,8,1>F	du_dy		{ compr align1 };
+add (16)	temp_x<1>F	temp_x<8,8,1>F	temp_y<8,8,1>F	{ compr align1 };
+add (16)	u<1>F		temp_x<8,8,1>F	uo		{ compr align1 };
+
+/********** Compute v *************/
+
+mul (16)	temp_x<1>F	dst_x<8,8,1>F	dv_dx		{ compr align1 };
+mul (16)	temp_y<1>F	dst_y<8,8,1>F	dv_dy		{ compr align1 };
+add (16)	temp_x<1>F	temp_x<8,8,1>F	temp_y<8,8,1>F	{ compr align1 };
+add (16)	v<1>F		temp_x<8,8,1>F	vo		{ compr align1 };
+
diff --git a/src/exa_wm_ca.g4a b/src/exa_wm_ca.g4a
new file mode 100644
index 0000000..955c68c
--- /dev/null
+++ b/src/exa_wm_ca.g4a
@@ -0,0 +1,38 @@
+/*
+ * Copyright © 2006 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Wang Zhenyu <zhenyu.z.wang at intel.com>
+ *    Keith Packard <keithp at keithp.com>
+ */
+ 
+/*
+ * Composite src and mask together, with component alpha
+ */
+
+include(`exa_wm.g4i')
+
+/* mul mask rgba channels to src */
+mul (16)    src_sample0<1>F src_sample0<8,8,1>F	mask_sample0<8,8,1>F	{ compr align1 };
+mul (16)    src_sample2<1>F src_sample2<8,8,1>F	mask_sample2<8,8,1>F	{ compr align1 };
+mul (16)    src_sample4<1>F src_sample4<8,8,1>F	mask_sample4<8,8,1>F	{ compr align1 };
+mul (16)    src_sample6<1>F src_sample6<8,8,1>F	mask_sample6<8,8,1>F	{ compr align1 };
diff --git a/src/exa_wm_ca.g4b b/src/exa_wm_ca.g4b
new file mode 100644
index 0000000..d0f3519
--- /dev/null
+++ b/src/exa_wm_ca.g4b
@@ -0,0 +1,4 @@
+   { 0x00802041, 0x224077bd, 0x008d0240, 0x008d0340 },
+   { 0x00802041, 0x228077bd, 0x008d0280, 0x008d0380 },
+   { 0x00802041, 0x22c077bd, 0x008d02c0, 0x008d03c0 },
+   { 0x00802041, 0x230077bd, 0x008d0300, 0x008d0400 },
diff --git a/src/exa_wm_ca_srcalpha.g4a b/src/exa_wm_ca_srcalpha.g4a
new file mode 100644
index 0000000..a1be28e
--- /dev/null
+++ b/src/exa_wm_ca_srcalpha.g4a
@@ -0,0 +1,38 @@
+/*
+ * Copyright © 2006 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Wang Zhenyu <zhenyu.z.wang at intel.com>
+ *    Keith Packard <keithp at keithp.com>
+ */
+ 
+/*
+ * Component-alpha, source-alpha case: scale every src channel by src_sample6
+ */
+
+include(`exa_wm.g4i')
+
+/* multiply each src channel pair by src_sample6 */
+mul (16)    src_sample0<1>F src_sample0<8,8,1>F	src_sample6<8,8,1>F	{ compr align1 };
+mul (16)    src_sample2<1>F src_sample2<8,8,1>F	src_sample6<8,8,1>F	{ compr align1 };
+mul (16)    src_sample4<1>F src_sample4<8,8,1>F	src_sample6<8,8,1>F	{ compr align1 };
+mul (16)    src_sample6<1>F src_sample6<8,8,1>F	src_sample6<8,8,1>F	{ compr align1 };
diff --git a/src/exa_wm_ca_srcalpha.g4b b/src/exa_wm_ca_srcalpha.g4b
new file mode 100644
index 0000000..780e704
--- /dev/null
+++ b/src/exa_wm_ca_srcalpha.g4b
@@ -0,0 +1,4 @@
+   { 0x00802041, 0x224077bd, 0x008d0240, 0x008d0300 },
+   { 0x00802041, 0x228077bd, 0x008d0280, 0x008d0300 },
+   { 0x00802041, 0x22c077bd, 0x008d02c0, 0x008d0300 },
+   { 0x00802041, 0x230077bd, 0x008d0300, 0x008d0300 },
diff --git a/src/exa_wm_mask_affine.g4a b/src/exa_wm_mask_affine.g4a
new file mode 100644
index 0000000..4c096cb
--- /dev/null
+++ b/src/exa_wm_mask_affine.g4a
@@ -0,0 +1,37 @@
+/*
+ * Copyright © 2006 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Wang Zhenyu <zhenyu.z.wang at intel.com>
+ *    Keith Packard <keithp at keithp.com>
+ */
+
+include(`exa_wm.g4i')
+define(`du_dx',	`mask_du_dx')
+define(`du_dy',	`mask_du_dy')
+define(`uo',	`mask_uo')
+define(`dv_dx',	`mask_dv_dx')
+define(`dv_dy',	`mask_dv_dy')
+define(`vo',	`mask_vo')
+define(`u',	`mask_u')
+define(`v',	`mask_v')
+include(`exa_wm_affine.g4i')
diff --git a/src/exa_wm_mask_affine.g4b b/src/exa_wm_mask_affine.g4b
new file mode 100644
index 0000000..62b46e0
--- /dev/null
+++ b/src/exa_wm_mask_affine.g4b
@@ -0,0 +1,8 @@
+   { 0x00802041, 0x218077bd, 0x008d0100, 0x00000090 },
+   { 0x00802041, 0x21c077bd, 0x008d0140, 0x00000094 },
+   { 0x00802040, 0x218077bd, 0x008d0180, 0x008d01c0 },
+   { 0x00802040, 0x202077be, 0x008d0180, 0x0000009c },
+   { 0x00802041, 0x218077bd, 0x008d0100, 0x000000a0 },
+   { 0x00802041, 0x21c077bd, 0x008d0140, 0x000000a4 },
+   { 0x00802040, 0x218077bd, 0x008d0180, 0x008d01c0 },
+   { 0x00802040, 0x206077be, 0x008d0180, 0x000000ac },
diff --git a/src/exa_wm_mask_projective.g4a b/src/exa_wm_mask_projective.g4a
new file mode 100644
index 0000000..464f6c5
--- /dev/null
+++ b/src/exa_wm_mask_projective.g4a
@@ -0,0 +1,48 @@
+/*
+ * Copyright © 2006 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Wang Zhenyu <zhenyu.z.wang at intel.com>
+ *    Keith Packard <keithp at keithp.com>
+ */
+
+include(`exa_wm.g4i')
+
+define(`du_dx',	`mask_du_dx')
+define(`du_dy',	`mask_du_dy')
+define(`uo',	`mask_uo')
+
+define(`dv_dx',	`mask_dv_dx')
+define(`dv_dy',	`mask_dv_dy')
+define(`vo',	`mask_vo')
+
+define(`dw_dx',	`mask_dw_dx')
+define(`dw_dy',	`mask_dw_dy')
+define(`wo',	`mask_wo')
+
+define(`u',	`mask_u')
+define(`v',	`mask_v')
+define(`w',	`mask_w')
+define(`w_0',	`mask_w_0')
+define(`w_1',	`mask_w_1')
+
+include(`exa_wm_projective.g4i')
diff --git a/src/exa_wm_mask_projective.g4b b/src/exa_wm_mask_projective.g4b
new file mode 100644
index 0000000..ac4faa3
--- /dev/null
+++ b/src/exa_wm_mask_projective.g4b
@@ -0,0 +1,16 @@
+   { 0x00802041, 0x218077bd, 0x008d0100, 0x000000b0 },
+   { 0x00802041, 0x21c077bd, 0x008d0140, 0x000000b4 },
+   { 0x00802040, 0x218077bd, 0x008d0180, 0x008d01c0 },
+   { 0x00802040, 0x218077bd, 0x008d0180, 0x000000bc },
+   { 0x00600031, 0x22001fbd, 0x008d0180, 0x01110001 },
+   { 0x00600031, 0x22201fbd, 0x008d01a0, 0x01110001 },
+   { 0x00802041, 0x218077bd, 0x008d0100, 0x00000090 },
+   { 0x00802041, 0x21c077bd, 0x008d0140, 0x00000094 },
+   { 0x00802040, 0x218077bd, 0x008d0180, 0x008d01c0 },
+   { 0x00802040, 0x218077bd, 0x008d0180, 0x0000009c },
+   { 0x00802041, 0x202077be, 0x008d0180, 0x008d0200 },
+   { 0x00802041, 0x218077bd, 0x008d0100, 0x000000a0 },
+   { 0x00802041, 0x21c077bd, 0x008d0140, 0x000000a4 },
+   { 0x00802040, 0x218077bd, 0x008d0180, 0x008d01c0 },
+   { 0x00802040, 0x218077bd, 0x008d0180, 0x000000ac },
+   { 0x00802041, 0x206077be, 0x008d0180, 0x008d0200 },
diff --git a/src/exa_wm_mask_sample.g4a b/src/exa_wm_mask_sample.g4a
new file mode 100644
index 0000000..45dc3c4
--- /dev/null
+++ b/src/exa_wm_mask_sample.g4a
@@ -0,0 +1,49 @@
+/*
+ * Copyright © 2006 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Wang Zhenyu <zhenyu.z.wang at intel.com>
+ *    Keith Packard <keithp at keithp.com>
+ */
+
+/* Sample the mask surface */
+
+include(`exa_wm.g4i')
+
+/* prepare sampler read back gX register, which would be written back to output */
+
+/* use simd16 sampler, param 0 is u, param 1 is v. */
+/* 'payload' loading, assuming tex coord start from g4 */
+
+/* m0 will be copied with g0, as it contains send desc */
+/* emit sampler 'send' cmd */
+send (16) 0			/* msg reg index */
+	mask_sample0<1>UW 	/* readback */
+	g0<8,8,1>UW		/* copy to msg start reg*/
+	sampler (1,0,F)		/* sampler message description, (binding_table,sampler_index,datatype)
+				/* here(src->dst) we should use src_sampler and src_surface */
+	mlen 5 rlen 8 { align1 };   /* required message len 5, readback len 8 */
+
+// mov (8)  mask_sample7<1>UD	mask_sample7<8,8,1>UD	    { align1 };  /* wait sampler return */
+
+/* if we set up read-back reg correctly, emit dataport write 'send' cmd with EOT */
+
diff --git a/src/exa_wm_mask_sample.g4b b/src/exa_wm_mask_sample.g4b
new file mode 100644
index 0000000..45f7ead
--- /dev/null
+++ b/src/exa_wm_mask_sample.g4b
@@ -0,0 +1 @@
+   { 0x00800031, 0x23401d29, 0x008d0000, 0x02580001 },
diff --git a/src/exa_wm_maskca.g4a b/src/exa_wm_maskca.g4a
index 0e96aa0..d030467 100644
--- a/src/exa_wm_maskca.g4a
+++ b/src/exa_wm_maskca.g4a
@@ -58,44 +58,44 @@
 
     /* Set up ss0.x coordinates*/
 mov (1) g6<1>F g1.8<0,1,0>UW { align1 };
-add (1) g6.4<1>F g1.8<0,1,0>UW 1UB { align1 };
+add (1) g6.4<1>F g1.8<0,1,0>UW 1UD { align1 };
 mov (1) g6.8<1>F g1.8<0,1,0>UW { align1 };
-add (1) g6.12<1>F g1.8<0,1,0>UW 1UB { align1 };
+add (1) g6.12<1>F g1.8<0,1,0>UW 1UD { align1 };
     /* Set up ss0.y coordinates */
 mov (1) g8<1>F g1.10<0,1,0>UW { align1 };
 mov (1) g8.4<1>F g1.10<0,1,0>UW { align1 };
-add (1) g8.8<1>F g1.10<0,1,0>UW 1UB { align1 };
-add (1) g8.12<1>F g1.10<0,1,0>UW 1UB { align1 };
+add (1) g8.8<1>F g1.10<0,1,0>UW 1UD { align1 };
+add (1) g8.12<1>F g1.10<0,1,0>UW 1UD { align1 };
     /* set up ss1.x coordinates */
 mov (1) g6.16<1>F g1.12<0,1,0>UW { align1 };
-add (1) g6.20<1>F g1.12<0,1,0>UW 1UB { align1 };
+add (1) g6.20<1>F g1.12<0,1,0>UW 1UD { align1 };
 mov (1) g6.24<1>F g1.12<0,1,0>UW { align1 };
-add (1) g6.28<1>F g1.12<0,1,0>UW 1UB { align1 };
+add (1) g6.28<1>F g1.12<0,1,0>UW 1UD { align1 };
     /* set up ss1.y coordinates */
 mov (1) g8.16<1>F g1.14<0,1,0>UW { align1 };
 mov (1) g8.20<1>F g1.14<0,1,0>UW { align1 };
-add (1) g8.24<1>F g1.14<0,1,0>UW 1UB { align1 };
-add (1) g8.28<1>F g1.14<0,1,0>UW 1UB { align1 };
+add (1) g8.24<1>F g1.14<0,1,0>UW 1UD { align1 };
+add (1) g8.28<1>F g1.14<0,1,0>UW 1UD { align1 };
     /* Set up ss2.x coordinates */
 mov (1) g7<1>F g1.16<0,1,0>UW { align1 };
-add (1) g7.4<1>F g1.16<0,1,0>UW 1UB { align1 };
+add (1) g7.4<1>F g1.16<0,1,0>UW 1UD { align1 };
 mov (1) g7.8<1>F g1.16<0,1,0>UW { align1 };
-add (1) g7.12<1>F g1.16<0,1,0>UW 1UB { align1 };
+add (1) g7.12<1>F g1.16<0,1,0>UW 1UD { align1 };
     /* Set up ss2.y coordinates */
 mov (1) g9<1>F g1.18<0,1,0>UW { align1 };
 mov (1) g9.4<1>F g1.18<0,1,0>UW { align1 };
-add (1) g9.8<1>F g1.18<0,1,0>UW 1UB { align1 };
-add (1) g9.12<1>F g1.18<0,1,0>UW 1UB { align1 };
+add (1) g9.8<1>F g1.18<0,1,0>UW 1UD { align1 };
+add (1) g9.12<1>F g1.18<0,1,0>UW 1UD { align1 };
     /* Set up ss3.x coordinates */
 mov (1) g7.16<1>F g1.20<0,1,0>UW { align1 };
-add (1) g7.20<1>F g1.20<0,1,0>UW 1UB { align1 };
+add (1) g7.20<1>F g1.20<0,1,0>UW 1UD { align1 };
 mov (1) g7.24<1>F g1.20<0,1,0>UW { align1 };
-add (1) g7.28<1>F g1.20<0,1,0>UW 1UB { align1 };
+add (1) g7.28<1>F g1.20<0,1,0>UW 1UD { align1 };
     /* Set up ss3.y coordinates */
 mov (1) g9.16<1>F g1.22<0,1,0>UW { align1 };
 mov (1) g9.20<1>F g1.22<0,1,0>UW { align1 };
-add (1) g9.24<1>F g1.22<0,1,0>UW 1UB { align1 };
-add (1) g9.28<1>F g1.22<0,1,0>UW 1UB { align1 };
+add (1) g9.24<1>F g1.22<0,1,0>UW 1UD { align1 };
+add (1) g9.28<1>F g1.22<0,1,0>UW 1UD { align1 };
 
     /* Now, map these screen space coordinates into texture coordinates. */
 /* This is for src texture */
diff --git a/src/exa_wm_maskca.g4b b/src/exa_wm_maskca.g4b
new file mode 100644
index 0000000..d936412
--- /dev/null
+++ b/src/exa_wm_maskca.g4b
@@ -0,0 +1,95 @@
+   { 0x00000001, 0x20c0013d, 0x00000028, 0x00000000 },
+   { 0x00000040, 0x20c40d3d, 0x00000028, 0x00000001 },
+   { 0x00000001, 0x20c8013d, 0x00000028, 0x00000000 },
+   { 0x00000040, 0x20cc0d3d, 0x00000028, 0x00000001 },
+   { 0x00000001, 0x2100013d, 0x0000002a, 0x00000000 },
+   { 0x00000001, 0x2104013d, 0x0000002a, 0x00000000 },
+   { 0x00000040, 0x21080d3d, 0x0000002a, 0x00000001 },
+   { 0x00000040, 0x210c0d3d, 0x0000002a, 0x00000001 },
+   { 0x00000001, 0x20d0013d, 0x0000002c, 0x00000000 },
+   { 0x00000040, 0x20d40d3d, 0x0000002c, 0x00000001 },
+   { 0x00000001, 0x20d8013d, 0x0000002c, 0x00000000 },
+   { 0x00000040, 0x20dc0d3d, 0x0000002c, 0x00000001 },
+   { 0x00000001, 0x2110013d, 0x0000002e, 0x00000000 },
+   { 0x00000001, 0x2114013d, 0x0000002e, 0x00000000 },
+   { 0x00000040, 0x21180d3d, 0x0000002e, 0x00000001 },
+   { 0x00000040, 0x211c0d3d, 0x0000002e, 0x00000001 },
+   { 0x00000001, 0x20e0013d, 0x00000030, 0x00000000 },
+   { 0x00000040, 0x20e40d3d, 0x00000030, 0x00000001 },
+   { 0x00000001, 0x20e8013d, 0x00000030, 0x00000000 },
+   { 0x00000040, 0x20ec0d3d, 0x00000030, 0x00000001 },
+   { 0x00000001, 0x2120013d, 0x00000032, 0x00000000 },
+   { 0x00000001, 0x2124013d, 0x00000032, 0x00000000 },
+   { 0x00000040, 0x21280d3d, 0x00000032, 0x00000001 },
+   { 0x00000040, 0x212c0d3d, 0x00000032, 0x00000001 },
+   { 0x00000001, 0x20f0013d, 0x00000034, 0x00000000 },
+   { 0x00000040, 0x20f40d3d, 0x00000034, 0x00000001 },
+   { 0x00000001, 0x20f8013d, 0x00000034, 0x00000000 },
+   { 0x00000040, 0x20fc0d3d, 0x00000034, 0x00000001 },
+   { 0x00000001, 0x2130013d, 0x00000036, 0x00000000 },
+   { 0x00000001, 0x2134013d, 0x00000036, 0x00000000 },
+   { 0x00000040, 0x21380d3d, 0x00000036, 0x00000001 },
+   { 0x00000040, 0x213c0d3d, 0x00000036, 0x00000001 },
+   { 0x00600040, 0x214077bd, 0x008d00c0, 0x00004020 },
+   { 0x00600040, 0x216077bd, 0x008d00e0, 0x00004020 },
+   { 0x00600041, 0x214077bd, 0x008d0140, 0x00000060 },
+   { 0x00600041, 0x216077bd, 0x008d0160, 0x00000060 },
+   { 0x00600040, 0x214077bd, 0x008d0140, 0x0000006c },
+   { 0x00600040, 0x216077bd, 0x008d0160, 0x0000006c },
+   { 0x00600040, 0x218077bd, 0x008d0100, 0x00004024 },
+   { 0x00600040, 0x21a077bd, 0x008d0120, 0x00004024 },
+   { 0x00600041, 0x218077bd, 0x008d0180, 0x00000064 },
+   { 0x00600041, 0x21a077bd, 0x008d01a0, 0x00000064 },
+   { 0x00600040, 0x218077bd, 0x008d0180, 0x0000007c },
+   { 0x00600040, 0x21a077bd, 0x008d01a0, 0x0000007c },
+   { 0x00600001, 0x202003be, 0x008d0140, 0x00000000 },
+   { 0x00600001, 0x204003be, 0x008d0160, 0x00000000 },
+   { 0x00600001, 0x206003be, 0x008d0180, 0x00000000 },
+   { 0x00600001, 0x208003be, 0x008d01a0, 0x00000000 },
+   { 0x00800031, 0x21c01d29, 0x008d0000, 0x02580001 },
+   { 0x00600001, 0x22a00021, 0x008d02a0, 0x00000000 },
+   { 0x00600040, 0x214077bd, 0x008d00c0, 0x00004020 },
+   { 0x00600040, 0x216077bd, 0x008d00e0, 0x00004020 },
+   { 0x00600041, 0x214077bd, 0x008d0140, 0x00000080 },
+   { 0x00600041, 0x216077bd, 0x008d0160, 0x00000080 },
+   { 0x00600040, 0x214077bd, 0x008d0140, 0x0000008c },
+   { 0x00600040, 0x216077bd, 0x008d0160, 0x0000008c },
+   { 0x00600040, 0x218077bd, 0x008d0100, 0x00004024 },
+   { 0x00600040, 0x21a077bd, 0x008d0120, 0x00004024 },
+   { 0x00600041, 0x218077bd, 0x008d0180, 0x00000084 },
+   { 0x00600041, 0x21a077bd, 0x008d01a0, 0x00000084 },
+   { 0x00600040, 0x218077bd, 0x008d0180, 0x0000009c },
+   { 0x00600040, 0x21a077bd, 0x008d01a0, 0x0000009c },
+   { 0x00600001, 0x202003be, 0x008d0140, 0x00000000 },
+   { 0x00600001, 0x204003be, 0x008d0160, 0x00000000 },
+   { 0x00600001, 0x206003be, 0x008d0180, 0x00000000 },
+   { 0x00600001, 0x208003be, 0x008d01a0, 0x00000000 },
+   { 0x00800031, 0x22c01d29, 0x008d0000, 0x02580102 },
+   { 0x00600001, 0x23a00021, 0x008d03a0, 0x00000000 },
+   { 0x00600041, 0x21c077bd, 0x008d01c0, 0x008d02c0 },
+   { 0x00600041, 0x21e077bd, 0x008d01e0, 0x008d02e0 },
+   { 0x00600041, 0x220077bd, 0x008d0200, 0x008d0300 },
+   { 0x00600041, 0x222077bd, 0x008d0220, 0x008d0320 },
+   { 0x00600041, 0x224077bd, 0x008d0240, 0x008d0340 },
+   { 0x00600041, 0x226077bd, 0x008d0260, 0x008d0360 },
+   { 0x00600041, 0x228077bd, 0x008d0280, 0x008d0380 },
+   { 0x00600041, 0x22a077bd, 0x008d02a0, 0x008d03a0 },
+   { 0x00600001, 0x204003be, 0x008d01c0, 0x00000000 },
+   { 0x00600001, 0x206003be, 0x008d0200, 0x00000000 },
+   { 0x00600001, 0x208003be, 0x008d0240, 0x00000000 },
+   { 0x00600001, 0x20a003be, 0x008d0280, 0x00000000 },
+   { 0x00600001, 0x20c003be, 0x008d01e0, 0x00000000 },
+   { 0x00600001, 0x20e003be, 0x008d0220, 0x00000000 },
+   { 0x00600001, 0x210003be, 0x008d0260, 0x00000000 },
+   { 0x00600001, 0x212003be, 0x008d02a0, 0x00000000 },
+   { 0x00600201, 0x20200022, 0x008d0020, 0x00000000 },
+   { 0x00800031, 0x24001d28, 0x008d0000, 0x85a04800 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
diff --git a/src/exa_wm_maskca_srcalpha.g4a b/src/exa_wm_maskca_srcalpha.g4a
index a92c9e4..133c9f0 100644
--- a/src/exa_wm_maskca_srcalpha.g4a
+++ b/src/exa_wm_maskca_srcalpha.g4a
@@ -58,44 +58,44 @@
 
     /* Set up ss0.x coordinates*/
 mov (1) g6<1>F g1.8<0,1,0>UW { align1 };
-add (1) g6.4<1>F g1.8<0,1,0>UW 1UB { align1 };
+add (1) g6.4<1>F g1.8<0,1,0>UW 1UD { align1 };
 mov (1) g6.8<1>F g1.8<0,1,0>UW { align1 };
-add (1) g6.12<1>F g1.8<0,1,0>UW 1UB { align1 };
+add (1) g6.12<1>F g1.8<0,1,0>UW 1UD { align1 };
     /* Set up ss0.y coordinates */
 mov (1) g8<1>F g1.10<0,1,0>UW { align1 };
 mov (1) g8.4<1>F g1.10<0,1,0>UW { align1 };
-add (1) g8.8<1>F g1.10<0,1,0>UW 1UB { align1 };
-add (1) g8.12<1>F g1.10<0,1,0>UW 1UB { align1 };
+add (1) g8.8<1>F g1.10<0,1,0>UW 1UD { align1 };
+add (1) g8.12<1>F g1.10<0,1,0>UW 1UD { align1 };
     /* set up ss1.x coordinates */
 mov (1) g6.16<1>F g1.12<0,1,0>UW { align1 };
-add (1) g6.20<1>F g1.12<0,1,0>UW 1UB { align1 };
+add (1) g6.20<1>F g1.12<0,1,0>UW 1UD { align1 };
 mov (1) g6.24<1>F g1.12<0,1,0>UW { align1 };
-add (1) g6.28<1>F g1.12<0,1,0>UW 1UB { align1 };
+add (1) g6.28<1>F g1.12<0,1,0>UW 1UD { align1 };
     /* set up ss1.y coordinates */
 mov (1) g8.16<1>F g1.14<0,1,0>UW { align1 };
 mov (1) g8.20<1>F g1.14<0,1,0>UW { align1 };
-add (1) g8.24<1>F g1.14<0,1,0>UW 1UB { align1 };
-add (1) g8.28<1>F g1.14<0,1,0>UW 1UB { align1 };
+add (1) g8.24<1>F g1.14<0,1,0>UW 1UD { align1 };
+add (1) g8.28<1>F g1.14<0,1,0>UW 1UD { align1 };
     /* Set up ss2.x coordinates */
 mov (1) g7<1>F g1.16<0,1,0>UW { align1 };
-add (1) g7.4<1>F g1.16<0,1,0>UW 1UB { align1 };
+add (1) g7.4<1>F g1.16<0,1,0>UW 1UD { align1 };
 mov (1) g7.8<1>F g1.16<0,1,0>UW { align1 };
-add (1) g7.12<1>F g1.16<0,1,0>UW 1UB { align1 };
+add (1) g7.12<1>F g1.16<0,1,0>UW 1UD { align1 };
     /* Set up ss2.y coordinates */
 mov (1) g9<1>F g1.18<0,1,0>UW { align1 };
 mov (1) g9.4<1>F g1.18<0,1,0>UW { align1 };
-add (1) g9.8<1>F g1.18<0,1,0>UW 1UB { align1 };
-add (1) g9.12<1>F g1.18<0,1,0>UW 1UB { align1 };
+add (1) g9.8<1>F g1.18<0,1,0>UW 1UD { align1 };
+add (1) g9.12<1>F g1.18<0,1,0>UW 1UD { align1 };
     /* Set up ss3.x coordinates */
 mov (1) g7.16<1>F g1.20<0,1,0>UW { align1 };
-add (1) g7.20<1>F g1.20<0,1,0>UW 1UB { align1 };
+add (1) g7.20<1>F g1.20<0,1,0>UW 1UD { align1 };
 mov (1) g7.24<1>F g1.20<0,1,0>UW { align1 };
-add (1) g7.28<1>F g1.20<0,1,0>UW 1UB { align1 };
+add (1) g7.28<1>F g1.20<0,1,0>UW 1UD { align1 };
     /* Set up ss3.y coordinates */
 mov (1) g9.16<1>F g1.22<0,1,0>UW { align1 };
 mov (1) g9.20<1>F g1.22<0,1,0>UW { align1 };
-add (1) g9.24<1>F g1.22<0,1,0>UW 1UB { align1 };
-add (1) g9.28<1>F g1.22<0,1,0>UW 1UB { align1 };
+add (1) g9.24<1>F g1.22<0,1,0>UW 1UD { align1 };
+add (1) g9.28<1>F g1.22<0,1,0>UW 1UD { align1 };
 
     /* Now, map these screen space coordinates into texture coordinates. */
 /* This is for src texture */
diff --git a/src/exa_wm_maskca_srcalpha.g4b b/src/exa_wm_maskca_srcalpha.g4b
new file mode 100644
index 0000000..d83b119
--- /dev/null
+++ b/src/exa_wm_maskca_srcalpha.g4b
@@ -0,0 +1,95 @@
+   { 0x00000001, 0x20c0013d, 0x00000028, 0x00000000 },
+   { 0x00000040, 0x20c40d3d, 0x00000028, 0x00000001 },
+   { 0x00000001, 0x20c8013d, 0x00000028, 0x00000000 },
+   { 0x00000040, 0x20cc0d3d, 0x00000028, 0x00000001 },
+   { 0x00000001, 0x2100013d, 0x0000002a, 0x00000000 },
+   { 0x00000001, 0x2104013d, 0x0000002a, 0x00000000 },
+   { 0x00000040, 0x21080d3d, 0x0000002a, 0x00000001 },
+   { 0x00000040, 0x210c0d3d, 0x0000002a, 0x00000001 },
+   { 0x00000001, 0x20d0013d, 0x0000002c, 0x00000000 },
+   { 0x00000040, 0x20d40d3d, 0x0000002c, 0x00000001 },
+   { 0x00000001, 0x20d8013d, 0x0000002c, 0x00000000 },
+   { 0x00000040, 0x20dc0d3d, 0x0000002c, 0x00000001 },
+   { 0x00000001, 0x2110013d, 0x0000002e, 0x00000000 },
+   { 0x00000001, 0x2114013d, 0x0000002e, 0x00000000 },
+   { 0x00000040, 0x21180d3d, 0x0000002e, 0x00000001 },
+   { 0x00000040, 0x211c0d3d, 0x0000002e, 0x00000001 },
+   { 0x00000001, 0x20e0013d, 0x00000030, 0x00000000 },
+   { 0x00000040, 0x20e40d3d, 0x00000030, 0x00000001 },
+   { 0x00000001, 0x20e8013d, 0x00000030, 0x00000000 },
+   { 0x00000040, 0x20ec0d3d, 0x00000030, 0x00000001 },
+   { 0x00000001, 0x2120013d, 0x00000032, 0x00000000 },
+   { 0x00000001, 0x2124013d, 0x00000032, 0x00000000 },
+   { 0x00000040, 0x21280d3d, 0x00000032, 0x00000001 },
+   { 0x00000040, 0x212c0d3d, 0x00000032, 0x00000001 },
+   { 0x00000001, 0x20f0013d, 0x00000034, 0x00000000 },
+   { 0x00000040, 0x20f40d3d, 0x00000034, 0x00000001 },
+   { 0x00000001, 0x20f8013d, 0x00000034, 0x00000000 },
+   { 0x00000040, 0x20fc0d3d, 0x00000034, 0x00000001 },
+   { 0x00000001, 0x2130013d, 0x00000036, 0x00000000 },
+   { 0x00000001, 0x2134013d, 0x00000036, 0x00000000 },
+   { 0x00000040, 0x21380d3d, 0x00000036, 0x00000001 },
+   { 0x00000040, 0x213c0d3d, 0x00000036, 0x00000001 },
+   { 0x00600040, 0x214077bd, 0x008d00c0, 0x00004020 },
+   { 0x00600040, 0x216077bd, 0x008d00e0, 0x00004020 },
+   { 0x00600041, 0x214077bd, 0x008d0140, 0x00000060 },
+   { 0x00600041, 0x216077bd, 0x008d0160, 0x00000060 },
+   { 0x00600040, 0x214077bd, 0x008d0140, 0x0000006c },
+   { 0x00600040, 0x216077bd, 0x008d0160, 0x0000006c },
+   { 0x00600040, 0x218077bd, 0x008d0100, 0x00004024 },
+   { 0x00600040, 0x21a077bd, 0x008d0120, 0x00004024 },
+   { 0x00600041, 0x218077bd, 0x008d0180, 0x00000064 },
+   { 0x00600041, 0x21a077bd, 0x008d01a0, 0x00000064 },
+   { 0x00600040, 0x218077bd, 0x008d0180, 0x0000007c },
+   { 0x00600040, 0x21a077bd, 0x008d01a0, 0x0000007c },
+   { 0x00600001, 0x202003be, 0x008d0140, 0x00000000 },
+   { 0x00600001, 0x204003be, 0x008d0160, 0x00000000 },
+   { 0x00600001, 0x206003be, 0x008d0180, 0x00000000 },
+   { 0x00600001, 0x208003be, 0x008d01a0, 0x00000000 },
+   { 0x00800031, 0x21c01d29, 0x008d0000, 0x02580001 },
+   { 0x00600001, 0x22a00021, 0x008d02a0, 0x00000000 },
+   { 0x00600040, 0x214077bd, 0x008d00c0, 0x00004020 },
+   { 0x00600040, 0x216077bd, 0x008d00e0, 0x00004020 },
+   { 0x00600041, 0x214077bd, 0x008d0140, 0x00000080 },
+   { 0x00600041, 0x216077bd, 0x008d0160, 0x00000080 },
+   { 0x00600040, 0x214077bd, 0x008d0140, 0x0000008c },
+   { 0x00600040, 0x216077bd, 0x008d0160, 0x0000008c },
+   { 0x00600040, 0x218077bd, 0x008d0100, 0x00004024 },
+   { 0x00600040, 0x21a077bd, 0x008d0120, 0x00004024 },
+   { 0x00600041, 0x218077bd, 0x008d0180, 0x00000084 },
+   { 0x00600041, 0x21a077bd, 0x008d01a0, 0x00000084 },
+   { 0x00600040, 0x218077bd, 0x008d0180, 0x0000009c },
+   { 0x00600040, 0x21a077bd, 0x008d01a0, 0x0000009c },
+   { 0x00600001, 0x202003be, 0x008d0140, 0x00000000 },
+   { 0x00600001, 0x204003be, 0x008d0160, 0x00000000 },
+   { 0x00600001, 0x206003be, 0x008d0180, 0x00000000 },
+   { 0x00600001, 0x208003be, 0x008d01a0, 0x00000000 },
+   { 0x00800031, 0x22c01d29, 0x008d0000, 0x02580102 },
+   { 0x00600001, 0x23a00021, 0x008d03a0, 0x00000000 },
+   { 0x00600041, 0x21c077bd, 0x008d02c0, 0x008d0280 },
+   { 0x00600041, 0x21e077bd, 0x008d02e0, 0x008d02a0 },
+   { 0x00600041, 0x220077bd, 0x008d0300, 0x008d0280 },
+   { 0x00600041, 0x222077bd, 0x008d0320, 0x008d02a0 },
+   { 0x00600041, 0x224077bd, 0x008d0340, 0x008d0280 },
+   { 0x00600041, 0x226077bd, 0x008d0360, 0x008d02a0 },
+   { 0x00600041, 0x228077bd, 0x008d0380, 0x008d0280 },
+   { 0x00600041, 0x22a077bd, 0x008d03a0, 0x008d02a0 },
+   { 0x00600001, 0x204003be, 0x008d01c0, 0x00000000 },
+   { 0x00600001, 0x206003be, 0x008d0200, 0x00000000 },
+   { 0x00600001, 0x208003be, 0x008d0240, 0x00000000 },
+   { 0x00600001, 0x20a003be, 0x008d0280, 0x00000000 },
+   { 0x00600001, 0x20c003be, 0x008d01e0, 0x00000000 },
+   { 0x00600001, 0x20e003be, 0x008d0220, 0x00000000 },
+   { 0x00600001, 0x210003be, 0x008d0260, 0x00000000 },
+   { 0x00600001, 0x212003be, 0x008d02a0, 0x00000000 },
+   { 0x00600201, 0x20200022, 0x008d0020, 0x00000000 },
+   { 0x00800031, 0x24001d28, 0x008d0000, 0x85a04800 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
diff --git a/src/exa_wm_masknoca.g4a b/src/exa_wm_masknoca.g4a
index 2e9e3c9..44f6953 100644
--- a/src/exa_wm_masknoca.g4a
+++ b/src/exa_wm_masknoca.g4a
@@ -58,44 +58,44 @@
 
     /* Set up ss0.x coordinates*/
 mov (1) g6<1>F g1.8<0,1,0>UW { align1 };
-add (1) g6.4<1>F g1.8<0,1,0>UW 1UB { align1 };
+add (1) g6.4<1>F g1.8<0,1,0>UW 1UD { align1 };
 mov (1) g6.8<1>F g1.8<0,1,0>UW { align1 };
-add (1) g6.12<1>F g1.8<0,1,0>UW 1UB { align1 };
+add (1) g6.12<1>F g1.8<0,1,0>UW 1UD { align1 };
     /* Set up ss0.y coordinates */
 mov (1) g8<1>F g1.10<0,1,0>UW { align1 };
 mov (1) g8.4<1>F g1.10<0,1,0>UW { align1 };
-add (1) g8.8<1>F g1.10<0,1,0>UW 1UB { align1 };
-add (1) g8.12<1>F g1.10<0,1,0>UW 1UB { align1 };
+add (1) g8.8<1>F g1.10<0,1,0>UW 1UD { align1 };
+add (1) g8.12<1>F g1.10<0,1,0>UW 1UD { align1 };
     /* set up ss1.x coordinates */
 mov (1) g6.16<1>F g1.12<0,1,0>UW { align1 };
-add (1) g6.20<1>F g1.12<0,1,0>UW 1UB { align1 };
+add (1) g6.20<1>F g1.12<0,1,0>UW 1UD { align1 };
 mov (1) g6.24<1>F g1.12<0,1,0>UW { align1 };
-add (1) g6.28<1>F g1.12<0,1,0>UW 1UB { align1 };
+add (1) g6.28<1>F g1.12<0,1,0>UW 1UD { align1 };
     /* set up ss1.y coordinates */
 mov (1) g8.16<1>F g1.14<0,1,0>UW { align1 };
 mov (1) g8.20<1>F g1.14<0,1,0>UW { align1 };
-add (1) g8.24<1>F g1.14<0,1,0>UW 1UB { align1 };
-add (1) g8.28<1>F g1.14<0,1,0>UW 1UB { align1 };
+add (1) g8.24<1>F g1.14<0,1,0>UW 1UD { align1 };
+add (1) g8.28<1>F g1.14<0,1,0>UW 1UD { align1 };
     /* Set up ss2.x coordinates */
 mov (1) g7<1>F g1.16<0,1,0>UW { align1 };
-add (1) g7.4<1>F g1.16<0,1,0>UW 1UB { align1 };
+add (1) g7.4<1>F g1.16<0,1,0>UW 1UD { align1 };
 mov (1) g7.8<1>F g1.16<0,1,0>UW { align1 };
-add (1) g7.12<1>F g1.16<0,1,0>UW 1UB { align1 };
+add (1) g7.12<1>F g1.16<0,1,0>UW 1UD { align1 };
     /* Set up ss2.y coordinates */
 mov (1) g9<1>F g1.18<0,1,0>UW { align1 };
 mov (1) g9.4<1>F g1.18<0,1,0>UW { align1 };
-add (1) g9.8<1>F g1.18<0,1,0>UW 1UB { align1 };
-add (1) g9.12<1>F g1.18<0,1,0>UW 1UB { align1 };
+add (1) g9.8<1>F g1.18<0,1,0>UW 1UD { align1 };
+add (1) g9.12<1>F g1.18<0,1,0>UW 1UD { align1 };
     /* Set up ss3.x coordinates */
 mov (1) g7.16<1>F g1.20<0,1,0>UW { align1 };
-add (1) g7.20<1>F g1.20<0,1,0>UW 1UB { align1 };
+add (1) g7.20<1>F g1.20<0,1,0>UW 1UD { align1 };
 mov (1) g7.24<1>F g1.20<0,1,0>UW { align1 };
-add (1) g7.28<1>F g1.20<0,1,0>UW 1UB { align1 };
+add (1) g7.28<1>F g1.20<0,1,0>UW 1UD { align1 };
     /* Set up ss3.y coordinates */
 mov (1) g9.16<1>F g1.22<0,1,0>UW { align1 };
 mov (1) g9.20<1>F g1.22<0,1,0>UW { align1 };
-add (1) g9.24<1>F g1.22<0,1,0>UW 1UB { align1 };
-add (1) g9.28<1>F g1.22<0,1,0>UW 1UB { align1 };
+add (1) g9.24<1>F g1.22<0,1,0>UW 1UD { align1 };
+add (1) g9.28<1>F g1.22<0,1,0>UW 1UD { align1 };
 
     /* Now, map these screen space coordinates into texture coordinates. */
 /* This is for src texture */
diff --git a/src/exa_wm_masknoca.g4b b/src/exa_wm_masknoca.g4b
new file mode 100644
index 0000000..5fcf3b5
--- /dev/null
+++ b/src/exa_wm_masknoca.g4b
@@ -0,0 +1,95 @@
+   { 0x00000001, 0x20c0013d, 0x00000028, 0x00000000 },
+   { 0x00000040, 0x20c40d3d, 0x00000028, 0x00000001 },
+   { 0x00000001, 0x20c8013d, 0x00000028, 0x00000000 },
+   { 0x00000040, 0x20cc0d3d, 0x00000028, 0x00000001 },
+   { 0x00000001, 0x2100013d, 0x0000002a, 0x00000000 },
+   { 0x00000001, 0x2104013d, 0x0000002a, 0x00000000 },
+   { 0x00000040, 0x21080d3d, 0x0000002a, 0x00000001 },
+   { 0x00000040, 0x210c0d3d, 0x0000002a, 0x00000001 },
+   { 0x00000001, 0x20d0013d, 0x0000002c, 0x00000000 },
+   { 0x00000040, 0x20d40d3d, 0x0000002c, 0x00000001 },
+   { 0x00000001, 0x20d8013d, 0x0000002c, 0x00000000 },
+   { 0x00000040, 0x20dc0d3d, 0x0000002c, 0x00000001 },
+   { 0x00000001, 0x2110013d, 0x0000002e, 0x00000000 },
+   { 0x00000001, 0x2114013d, 0x0000002e, 0x00000000 },
+   { 0x00000040, 0x21180d3d, 0x0000002e, 0x00000001 },
+   { 0x00000040, 0x211c0d3d, 0x0000002e, 0x00000001 },
+   { 0x00000001, 0x20e0013d, 0x00000030, 0x00000000 },
+   { 0x00000040, 0x20e40d3d, 0x00000030, 0x00000001 },
+   { 0x00000001, 0x20e8013d, 0x00000030, 0x00000000 },
+   { 0x00000040, 0x20ec0d3d, 0x00000030, 0x00000001 },
+   { 0x00000001, 0x2120013d, 0x00000032, 0x00000000 },
+   { 0x00000001, 0x2124013d, 0x00000032, 0x00000000 },
+   { 0x00000040, 0x21280d3d, 0x00000032, 0x00000001 },
+   { 0x00000040, 0x212c0d3d, 0x00000032, 0x00000001 },
+   { 0x00000001, 0x20f0013d, 0x00000034, 0x00000000 },
+   { 0x00000040, 0x20f40d3d, 0x00000034, 0x00000001 },
+   { 0x00000001, 0x20f8013d, 0x00000034, 0x00000000 },
+   { 0x00000040, 0x20fc0d3d, 0x00000034, 0x00000001 },
+   { 0x00000001, 0x2130013d, 0x00000036, 0x00000000 },
+   { 0x00000001, 0x2134013d, 0x00000036, 0x00000000 },
+   { 0x00000040, 0x21380d3d, 0x00000036, 0x00000001 },
+   { 0x00000040, 0x213c0d3d, 0x00000036, 0x00000001 },
+   { 0x00600040, 0x214077bd, 0x008d00c0, 0x00004020 },
+   { 0x00600040, 0x216077bd, 0x008d00e0, 0x00004020 },
+   { 0x00600041, 0x214077bd, 0x008d0140, 0x00000060 },
+   { 0x00600041, 0x216077bd, 0x008d0160, 0x00000060 },
+   { 0x00600040, 0x214077bd, 0x008d0140, 0x0000006c },
+   { 0x00600040, 0x216077bd, 0x008d0160, 0x0000006c },
+   { 0x00600040, 0x218077bd, 0x008d0100, 0x00004024 },
+   { 0x00600040, 0x21a077bd, 0x008d0120, 0x00004024 },
+   { 0x00600041, 0x218077bd, 0x008d0180, 0x00000064 },
+   { 0x00600041, 0x21a077bd, 0x008d01a0, 0x00000064 },
+   { 0x00600040, 0x218077bd, 0x008d0180, 0x0000007c },
+   { 0x00600040, 0x21a077bd, 0x008d01a0, 0x0000007c },
+   { 0x00600001, 0x202003be, 0x008d0140, 0x00000000 },
+   { 0x00600001, 0x204003be, 0x008d0160, 0x00000000 },
+   { 0x00600001, 0x206003be, 0x008d0180, 0x00000000 },
+   { 0x00600001, 0x208003be, 0x008d01a0, 0x00000000 },
+   { 0x00800031, 0x21c01d29, 0x008d0000, 0x02580001 },
+   { 0x00600001, 0x22a00021, 0x008d02a0, 0x00000000 },
+   { 0x00600040, 0x214077bd, 0x008d00c0, 0x00004020 },
+   { 0x00600040, 0x216077bd, 0x008d00e0, 0x00004020 },
+   { 0x00600041, 0x214077bd, 0x008d0140, 0x00000080 },
+   { 0x00600041, 0x216077bd, 0x008d0160, 0x00000080 },
+   { 0x00600040, 0x214077bd, 0x008d0140, 0x0000008c },
+   { 0x00600040, 0x216077bd, 0x008d0160, 0x0000008c },
+   { 0x00600040, 0x218077bd, 0x008d0100, 0x00004024 },
+   { 0x00600040, 0x21a077bd, 0x008d0120, 0x00004024 },
+   { 0x00600041, 0x218077bd, 0x008d0180, 0x00000084 },
+   { 0x00600041, 0x21a077bd, 0x008d01a0, 0x00000084 },
+   { 0x00600040, 0x218077bd, 0x008d0180, 0x0000009c },
+   { 0x00600040, 0x21a077bd, 0x008d01a0, 0x0000009c },
+   { 0x00600001, 0x202003be, 0x008d0140, 0x00000000 },
+   { 0x00600001, 0x204003be, 0x008d0160, 0x00000000 },
+   { 0x00600001, 0x206003be, 0x008d0180, 0x00000000 },
+   { 0x00600001, 0x208003be, 0x008d01a0, 0x00000000 },
+   { 0x00800031, 0x22c01d29, 0x008d0000, 0x02580102 },
+   { 0x00600001, 0x23a00021, 0x008d03a0, 0x00000000 },
+   { 0x00600041, 0x21c077bd, 0x008d01c0, 0x008d0380 },
+   { 0x00600041, 0x21e077bd, 0x008d01e0, 0x008d03a0 },
+   { 0x00600041, 0x220077bd, 0x008d0200, 0x008d0380 },
+   { 0x00600041, 0x222077bd, 0x008d0220, 0x008d03a0 },
+   { 0x00600041, 0x224077bd, 0x008d0240, 0x008d0380 },
+   { 0x00600041, 0x226077bd, 0x008d0260, 0x008d03a0 },
+   { 0x00600041, 0x228077bd, 0x008d0280, 0x008d0380 },
+   { 0x00600041, 0x22a077bd, 0x008d02a0, 0x008d03a0 },
+   { 0x00600001, 0x204003be, 0x008d01c0, 0x00000000 },
+   { 0x00600001, 0x206003be, 0x008d0200, 0x00000000 },
+   { 0x00600001, 0x208003be, 0x008d0240, 0x00000000 },
+   { 0x00600001, 0x20a003be, 0x008d0280, 0x00000000 },
+   { 0x00600001, 0x20c003be, 0x008d01e0, 0x00000000 },
+   { 0x00600001, 0x20e003be, 0x008d0220, 0x00000000 },
+   { 0x00600001, 0x210003be, 0x008d0260, 0x00000000 },
+   { 0x00600001, 0x212003be, 0x008d02a0, 0x00000000 },
+   { 0x00600201, 0x20200022, 0x008d0020, 0x00000000 },
+   { 0x00800031, 0x24001d28, 0x008d0000, 0x85a04800 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
diff --git a/src/exa_wm_noca.g4a b/src/exa_wm_noca.g4a
new file mode 100644
index 0000000..7dd1224
--- /dev/null
+++ b/src/exa_wm_noca.g4a
@@ -0,0 +1,38 @@
+/*
+ * Copyright © 2006 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Wang Zhenyu <zhenyu.z.wang at intel.com>
+ *    Keith Packard <keithp at keithp.com>
+ */
+ 
+/*
+ * Composite src and mask together, no component alpha
+ */
+
+include(`exa_wm.g4i')
+/* multiply each src sample register pair by the mask's alpha channel */
+
+mul (16)    src_sample0<1>F src_sample0<8,8,1>F	mask_sample6<8,8,1>F	{ compr align1 };
+mul (16)    src_sample2<1>F src_sample2<8,8,1>F	mask_sample6<8,8,1>F	{ compr align1 };
+mul (16)    src_sample4<1>F src_sample4<8,8,1>F	mask_sample6<8,8,1>F	{ compr align1 };
+mul (16)    src_sample6<1>F src_sample6<8,8,1>F	mask_sample6<8,8,1>F	{ compr align1 };
diff --git a/src/exa_wm_noca.g4b b/src/exa_wm_noca.g4b
new file mode 100644
index 0000000..ba01d1a
--- /dev/null
+++ b/src/exa_wm_noca.g4b
@@ -0,0 +1,4 @@
+   { 0x00802041, 0x224077bd, 0x008d0240, 0x008d0400 },
+   { 0x00802041, 0x228077bd, 0x008d0280, 0x008d0400 },
+   { 0x00802041, 0x22c077bd, 0x008d02c0, 0x008d0400 },
+   { 0x00802041, 0x230077bd, 0x008d0300, 0x008d0400 },
diff --git a/src/exa_wm_projective.g4i b/src/exa_wm_projective.g4i
new file mode 100644
index 0000000..13da99c
--- /dev/null
+++ b/src/exa_wm_projective.g4i
@@ -0,0 +1,51 @@
+/*
+ * Copyright © 2006 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Wang Zhenyu <zhenyu.z.wang at intel.com>
+ *    Keith Packard <keithp at keithp.com>
+ */
+
+/********** Compute w *************/
+
+mul (16)	temp_x<1>F	dst_x<8,8,1>F	dw_dx		{ compr align1 };
+mul (16)	temp_y<1>F	dst_y<8,8,1>F	dw_dy		{ compr align1 };
+add (16)	temp_x<1>F	temp_x<8,8,1>F	temp_y<8,8,1>F	{ compr align1 };
+add (16)	temp_x<1>F	temp_x<8,8,1>F	wo		{ compr align1 };
+send (8) 0	w_0<1>F		temp_x_0<8,8,1>F math inv mlen 1 rlen 1	{ compr align1 };
+send (8) 0	w_1<1>F		temp_x_1<8,8,1>F math inv mlen 1 rlen 1	{ compr align1 };
+
+/********** Compute u *************/
+
+mul (16)	temp_x<1>F	dst_x<8,8,1>F	du_dx		{ compr align1 };
+mul (16)	temp_y<1>F	dst_y<8,8,1>F	du_dy		{ compr align1 };
+add (16)	temp_x<1>F	temp_x<8,8,1>F	temp_y<8,8,1>F	{ compr align1 };
+add (16)	temp_x<1>F	temp_x<8,8,1>F	uo		{ compr align1 };
+mul (16)	u<1>F		temp_x<8,8,1>F	w<8,8,1>F	{ compr align1 };
+
+/********** Compute v *************/
+
+mul (16)	temp_x<1>F	dst_x<8,8,1>F	dv_dx		{ compr align1 };
+mul (16)	temp_y<1>F	dst_y<8,8,1>F	dv_dy		{ compr align1 };
+add (16)	temp_x<1>F	temp_x<8,8,1>F	temp_y<8,8,1>F	{ compr align1 };
+add (16)	temp_x<1>F	temp_x<8,8,1>F	vo		{ compr align1 };
+mul (16)	v<1>F		temp_x<8,8,1>F	w<8,8,1>F	{ compr align1 };
diff --git a/src/exa_wm_src_affine.g4a b/src/exa_wm_src_affine.g4a
new file mode 100644
index 0000000..3bf8717
--- /dev/null
+++ b/src/exa_wm_src_affine.g4a
@@ -0,0 +1,41 @@
+/*
+ * Copyright © 2006 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Wang Zhenyu <zhenyu.z.wang at intel.com>
+ *    Keith Packard <keithp at keithp.com>
+ */
+
+/*
+ * Fragment to compute src u/v values under an affine transform
+ */
+
+include(`exa_wm.g4i')
+define(`du_dx',	`src_du_dx')
+define(`du_dy',	`src_du_dy')
+define(`uo',	`src_uo')
+define(`dv_dx',	`src_dv_dx')
+define(`dv_dy',	`src_dv_dy')
+define(`vo',	`src_vo')
+define(`u',	`src_u')
+define(`v',	`src_v')
+include(`exa_wm_affine.g4i')
diff --git a/src/exa_wm_src_affine.g4b b/src/exa_wm_src_affine.g4b
new file mode 100644
index 0000000..f18ea1e
--- /dev/null
+++ b/src/exa_wm_src_affine.g4b
@@ -0,0 +1,8 @@
+   { 0x00802041, 0x218077bd, 0x008d0100, 0x00000060 },
+   { 0x00802041, 0x21c077bd, 0x008d0140, 0x00000064 },
+   { 0x00802040, 0x218077bd, 0x008d0180, 0x008d01c0 },
+   { 0x00802040, 0x202077be, 0x008d0180, 0x0000006c },
+   { 0x00802041, 0x218077bd, 0x008d0100, 0x00000070 },
+   { 0x00802041, 0x21c077bd, 0x008d0140, 0x00000074 },
+   { 0x00802040, 0x218077bd, 0x008d0180, 0x008d01c0 },
+   { 0x00802040, 0x206077be, 0x008d0180, 0x0000007c },
diff --git a/src/exa_wm_src_projective.g4a b/src/exa_wm_src_projective.g4a
new file mode 100644
index 0000000..6bd2d6a
--- /dev/null
+++ b/src/exa_wm_src_projective.g4a
@@ -0,0 +1,45 @@
+/*
+ * Copyright © 2006 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Wang Zhenyu <zhenyu.z.wang at intel.com>
+ *    Keith Packard <keithp at keithp.com>
+ */
+
+
+include(`exa_wm.g4i')
+define(`du_dx',	`src_du_dx')
+define(`du_dy',	`src_du_dy')
+define(`uo',	`src_uo')
+define(`dv_dx',	`src_dv_dx')
+define(`dv_dy',	`src_dv_dy')
+define(`vo',	`src_vo')
+define(`dw_dx',	`src_dw_dx')
+define(`dw_dy',	`src_dw_dy')
+define(`wo',	`src_wo')
+define(`u',	`src_u')
+define(`v',	`src_v')
+define(`w',	`src_w')
+define(`w_0',	`src_w_0')
+define(`w_1',	`src_w_1')
+
+include(`exa_wm_projective.g4i')
diff --git a/src/exa_wm_src_projective.g4b b/src/exa_wm_src_projective.g4b
new file mode 100644
index 0000000..68bfc92
--- /dev/null
+++ b/src/exa_wm_src_projective.g4b
@@ -0,0 +1,16 @@
+   { 0x00802041, 0x218077bd, 0x008d0100, 0x00000080 },
+   { 0x00802041, 0x21c077bd, 0x008d0140, 0x00000084 },
+   { 0x00802040, 0x218077bd, 0x008d0180, 0x008d01c0 },
+   { 0x00802040, 0x218077bd, 0x008d0180, 0x0000008c },
+   { 0x00600031, 0x22001fbd, 0x008d0180, 0x01110001 },
+   { 0x00600031, 0x22201fbd, 0x008d01a0, 0x01110001 },
+   { 0x00802041, 0x218077bd, 0x008d0100, 0x00000060 },
+   { 0x00802041, 0x21c077bd, 0x008d0140, 0x00000064 },
+   { 0x00802040, 0x218077bd, 0x008d0180, 0x008d01c0 },
+   { 0x00802040, 0x218077bd, 0x008d0180, 0x0000006c },
+   { 0x00802041, 0x202077be, 0x008d0180, 0x008d0200 },
+   { 0x00802041, 0x218077bd, 0x008d0100, 0x00000070 },
+   { 0x00802041, 0x21c077bd, 0x008d0140, 0x00000074 },
+   { 0x00802040, 0x218077bd, 0x008d0180, 0x008d01c0 },
+   { 0x00802040, 0x218077bd, 0x008d0180, 0x0000007c },
+   { 0x00802041, 0x206077be, 0x008d0180, 0x008d0200 },
diff --git a/src/exa_wm_src_sample.g4a b/src/exa_wm_src_sample.g4a
new file mode 100644
index 0000000..04cd3e3
--- /dev/null
+++ b/src/exa_wm_src_sample.g4a
@@ -0,0 +1,49 @@
+/*
+ * Copyright © 2006 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Wang Zhenyu <zhenyu.z.wang at intel.com>
+ *    Keith Packard <keithp at keithp.com>
+ */
+
+/* Sample the src surface */
+
+include(`exa_wm.g4i')
+
+/* prepare sampler read back gX register, which would be written back to output */
+
+/* use simd16 sampler, param 0 is u, param 1 is v. */
+/* 'payload' loading, assuming tex coord start from g4 */
+
+/* m0 will be copied with g0, as it contains send desc */
+/* emit sampler 'send' cmd */
+send (16) 0			/* msg reg index */
+	src_sample0<1>UW 	/* readback */
+	g0<8,8,1>UW		/* copy to msg start reg */
+	sampler (1,0,F)		/* sampler message description, (binding_table,sampler_index,datatype) */
+				/* here(src->dst) we should use src_sampler and src_surface */
+	mlen 5 rlen 8 { align1 };   /* required message len 5, readback len 8 */
+
+// mov (8)  src_sample7<1>UD	src_sample7<8,8,1>UD	    { align1 };  /* wait sampler return */
+
+/* if we set up read-back reg correctly, emit dataport write 'send' cmd with EOT */
+
diff --git a/src/exa_wm_src_sample.g4b b/src/exa_wm_src_sample.g4b
new file mode 100644
index 0000000..5ca33f5
--- /dev/null
+++ b/src/exa_wm_src_sample.g4b
@@ -0,0 +1 @@
+   { 0x00800031, 0x22401d29, 0x008d0000, 0x02580001 },
diff --git a/src/exa_wm_write.g4a b/src/exa_wm_write.g4a
new file mode 100644
index 0000000..9a821d7
--- /dev/null
+++ b/src/exa_wm_write.g4a
@@ -0,0 +1,80 @@
+/*
+ * Copyright © 2006 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Wang Zhenyu <zhenyu.z.wang at intel.com>
+ *    Keith Packard <keithp at keithp.com>
+ */
+
+/* 
+ * Once the data are ready, write them to the destination
+ */
+
+include(`exa_wm.g4i')
+
+/* m0, m1 are all direct passed by PS thread payload */
+mov (8) m1<1>F g1<8,8,1>F { align1 };
+
+/* prepare data in m2-m5 for subspan(1,0), m6-m9 for subspan(3,2), then it's ready to write */
+/* src_sample0 -> m2
+   src_sample1 -> m6
+   src_sample2 -> m3
+   src_sample3 -> m7
+   src_sample4 -> m4
+   src_sample5 -> m8
+   src_sample6 -> m5
+   src_sample7 -> m9
+*/
+
+mov (8) m2<1>F src_sample0<8,8,1>F { align1 };
+mov (8) m3<1>F src_sample2<8,8,1>F { align1 };
+mov (8) m4<1>F src_sample4<8,8,1>F { align1 };
+mov (8) m5<1>F src_sample6<8,8,1>F { align1 };
+mov (8) m6<1>F src_sample1<8,8,1>F { align1 };
+mov (8) m7<1>F src_sample3<8,8,1>F { align1 };
+mov (8) m8<1>F src_sample5<8,8,1>F { align1 };
+mov (8) m9<1>F src_sample7<8,8,1>F { align1 };
+
+/* m0, m1 are all direct passed by PS thread payload */
+mov (8) m1<1>UD g1<8,8,1>UD { align1 mask_disable };
+
+/* write */
+send (16) 0 acc0<1>UW g0<8,8,1>UW write (
+	0,  /* binding_table */
+	8,  /* pixel scoreboard clear, msg type simd16 single source */
+	4,  /* render target write */
+	0   /* no write commit message */
+	) 
+	mlen 10
+	rlen 0
+	{ align1 EOT };
+
+nop;
+nop;
+nop;
+nop;
+nop;
+nop;
+nop;
+nop;
+nop;
+
diff --git a/src/exa_wm_write.g4b b/src/exa_wm_write.g4b
new file mode 100644
index 0000000..dd266a3
--- /dev/null
+++ b/src/exa_wm_write.g4b
@@ -0,0 +1,20 @@
+   { 0x00600001, 0x202003be, 0x008d0020, 0x00000000 },
+   { 0x00600001, 0x204003be, 0x008d0240, 0x00000000 },
+   { 0x00600001, 0x206003be, 0x008d0280, 0x00000000 },
+   { 0x00600001, 0x208003be, 0x008d02c0, 0x00000000 },
+   { 0x00600001, 0x20a003be, 0x008d0300, 0x00000000 },
+   { 0x00600001, 0x20c003be, 0x008d0260, 0x00000000 },
+   { 0x00600001, 0x20e003be, 0x008d02a0, 0x00000000 },
+   { 0x00600001, 0x210003be, 0x008d02e0, 0x00000000 },
+   { 0x00600001, 0x212003be, 0x008d0320, 0x00000000 },
+   { 0x00600201, 0x20200022, 0x008d0020, 0x00000000 },
+   { 0x00800031, 0x24001d28, 0x008d0000, 0x85a04800 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
diff --git a/src/exa_wm_xy.g4a b/src/exa_wm_xy.g4a
new file mode 100644
index 0000000..e99f5ac
--- /dev/null
+++ b/src/exa_wm_xy.g4a
@@ -0,0 +1,52 @@
+/*
+ * Copyright © 2006 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Wang Zhenyu <zhenyu.z.wang at intel.com>
+ *    Keith Packard <keithp at keithp.com>
+ */
+ 
+/*
+ * Register assignments:
+ *
+ *  x		    g6/g7
+ *  y		    g8/g9
+ *
+ *  temp x	    g10/g11
+ *  temp y	    g12/g13
+ *
+ *  src w	    g14/g15
+ *  src u	    m1/m2
+ *  src v	    m3/m4
+ */
+ 
+/* Fragment to compute per-pixel XY values */
+
+include(`exa_wm.g4i')
+    
+    /* Load X and Y coordinates and compute per-pixel coordinates */
+add (16)	temp_x_uw<1>UW	dst_x_uw		0x10101010V	{ align1 };
+add (16)	temp_y_uw<1>UW	dst_y_uw		0x11001100V	{ align1 };
+
+    /* subtract screen-space origin of vertex 0 */
+add (16)	dst_x<1>F	temp_x_uw<8,8,1>UW	-screen_x0	{ compr align1 };
+add (16)	dst_y<1>F	temp_y_uw<8,8,1>UW	-screen_y0	{ compr align1 };
diff --git a/src/exa_wm_xy.g4b b/src/exa_wm_xy.g4b
new file mode 100644
index 0000000..7784a3d
--- /dev/null
+++ b/src/exa_wm_xy.g4b
@@ -0,0 +1,4 @@
+   { 0x00800040, 0x21806d29, 0x00480028, 0x10101010 },
+   { 0x00800040, 0x21c06d29, 0x0048002a, 0x11001100 },
+   { 0x00802040, 0x2100753d, 0x008d0180, 0x00004020 },
+   { 0x00802040, 0x2140753d, 0x008d01c0, 0x00004024 },
diff --git a/src/i965_render.c b/src/i965_render.c
index 26c06aa..7668779 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -321,31 +321,68 @@ static const uint32_t sip_kernel_static[][4] = {
 #define SF_MAX_THREADS	   2
 
 static const uint32_t sf_kernel_static[][4] = {
-#include "exa_sf_prog.h"
+#include "exa_sf.g4b"
 };
 
 static const uint32_t sf_kernel_static_mask[][4] = {
-#include "exa_sf_mask_prog.h"
+#include "exa_sf_mask.g4b"
 };
 
 /* ps kernels */
 #define PS_KERNEL_NUM_GRF   32
 #define PS_MAX_THREADS	   32
 
-static const uint32_t ps_kernel_static_nomask [][4] = {
-#include "exa_wm_nomask_prog.h"
+static const uint32_t ps_kernel_static_nomask_affine [][4] = {
+#include "exa_wm_xy.g4b"
+#include "exa_wm_src_affine.g4b"
+#include "exa_wm_src_sample.g4b"
+#include "exa_wm_write.g4b"
+};
+
+static const uint32_t ps_kernel_static_nomask_projective [][4] = {
+#include "exa_wm_xy.g4b"
+#include "exa_wm_src_projective.g4b"
+#include "exa_wm_src_sample.g4b"
+#include "exa_wm_write.g4b"
 };
 
 static const uint32_t ps_kernel_static_maskca [][4] = {
-#include "exa_wm_maskca_prog.h"
+#include "exa_wm_maskca.g4b"
+#if 0
+#include "exa_wm_xy.g4b"
+#include "exa_wm_src_affine.g4b"
+#include "exa_wm_src_sample.g4b"
+#include "exa_wm_mask_affine.g4b"
+#include "exa_wm_mask_sample.g4b"
+#include "exa_wm_ca.g4b"
+#include "exa_wm_write.g4b"
+#endif
 };
 
 static const uint32_t ps_kernel_static_maskca_srcalpha [][4] = {
-#include "exa_wm_maskca_srcalpha_prog.h"
+#include "exa_wm_maskca_srcalpha.g4b"
+#if 0
+#include "exa_wm_xy.g4b"
+#include "exa_wm_src_affine.g4b"
+#include "exa_wm_src_sample.g4b"
+#include "exa_wm_mask_affine.g4b"
+#include "exa_wm_mask_sample.g4b"
+#include "exa_wm_ca_srcalpha.g4b"
+#include "exa_wm_write.g4b"
+#endif
 };
 
 static const uint32_t ps_kernel_static_masknoca [][4] = {
-#include "exa_wm_masknoca_prog.h"
+#include "exa_wm_masknoca.g4b"
+#if 0
+#include "exa_wm_xy.g4b"
+#include "exa_wm_src_affine.g4b"
+#include "exa_wm_src_sample.g4b"
+#include "exa_wm_mask_affine.g4b"
+#include "exa_wm_mask_sample.g4b"
+#include "exa_wm_noca.g4b"
+#include "exa_wm_write.g4b"
+#endif
 };
 
 static uint32_t 
@@ -374,6 +411,7 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
 	mask_tiled = 0;
     uint32_t dst_format, dst_offset, dst_pitch, dst_tile_format = 0,
 	dst_tiled = 0;
+    Bool is_affine_src, is_affine_mask, is_affine;
 
     IntelEmitInvarientState(pScrn);
     *pI830->last_3d = LAST_3D_RENDER;
@@ -402,6 +440,9 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
     pI830->scale_units[0][1] = pSrc->drawable.height;
 
     pI830->transform[0] = pSrcPicture->transform;
+    is_affine_src = i830_transform_is_affine (pI830->transform[0]);
+    is_affine_mask = !pMask || i830_transform_is_affine (pMaskPicture->transform); /* transform[1] is not assigned until below */
+    is_affine = is_affine_src && is_affine_mask;
 
     if (!pMask) {
 	pI830->transform[1] = NULL;
@@ -460,7 +501,10 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
 	    next_offset = ps_kernel_offset + 
                           sizeof(ps_kernel_static_masknoca);
     } else {
-   	next_offset = ps_kernel_offset + sizeof (ps_kernel_static_nomask);
+	if (is_affine)
+	    next_offset = ps_kernel_offset + sizeof (ps_kernel_static_nomask_affine);
+	else
+	    next_offset = ps_kernel_offset + sizeof (ps_kernel_static_nomask_projective);
     }
 
     sip_kernel_offset = ALIGN(next_offset, 64);
@@ -837,8 +881,12 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
    	    memcpy(ps_kernel, ps_kernel_static_masknoca,
 		   sizeof (ps_kernel_static_masknoca));
     } else {
-   	memcpy(ps_kernel, ps_kernel_static_nomask,
-	       sizeof (ps_kernel_static_nomask));
+	if (is_affine)
+	    memcpy(ps_kernel, ps_kernel_static_nomask_affine,
+		   sizeof (ps_kernel_static_nomask_affine));
+	else
+	    memcpy(ps_kernel, ps_kernel_static_nomask_projective,
+		   sizeof (ps_kernel_static_nomask_projective));
     }
 
     wm_state = &wm_state_local;
@@ -989,51 +1037,75 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
 	ADVANCE_BATCH();
     }
     {
-        int nelem = pMask ? 3: 2;
+	/*
+	 * number of texture coordinate sets per vertex (src, plus mask if any)
+	 */
+	int nelem = pMask ? 2 : 1;
+	/*
+	 * floats per texture coordinate set:
+	 *  3 for homogeneous (u, v, w)
+	 *  2 for cartesian (u, v)
+	 */
+	int selem = is_affine ? 2 : 3;
+	uint32_t    w_component;
+	uint32_t    src_format;
+	/* component 2 of each set is w: store it only when the vertex data carries it */
+	if (is_affine)
+	{
+	    src_format = BRW_SURFACEFORMAT_R32G32_FLOAT;
+	    w_component = BRW_VFCOMPONENT_NOSTORE;
+	}
+	else
+	{
+	    src_format = BRW_SURFACEFORMAT_R32G32B32_FLOAT;
+	    w_component = BRW_VFCOMPONENT_STORE_SRC;
+	}
 	BEGIN_BATCH(pMask?12:10);
-	/* Set up the pointer to our vertex buffer */
+	/* Set up the pointer to our (single) vertex buffer */
 	OUT_BATCH(BRW_3DSTATE_VERTEX_BUFFERS | 3);
 	OUT_BATCH((0 << VB0_BUFFER_INDEX_SHIFT) |
 		  VB0_VERTEXDATA |
-		  ((4 * 2 * nelem) << VB0_BUFFER_PITCH_SHIFT));
+		  ((4 * (2 + nelem * selem)) << VB0_BUFFER_PITCH_SHIFT));
 	OUT_BATCH(state_base_offset + vb_offset);
         OUT_BATCH(3);
 	OUT_BATCH(0); // ignore for VERTEXDATA, but still there
 
 	/* Set up our vertex elements, sourced from the single vertex buffer.
 	 */
-	OUT_BATCH(BRW_3DSTATE_VERTEX_ELEMENTS | ((2 * nelem) - 1));
-	/* vertex coordinates */
-	OUT_BATCH((0 << VE0_VERTEX_BUFFER_INDEX_SHIFT) |
-		  VE0_VALID |
-		  (BRW_SURFACEFORMAT_R32G32_FLOAT << VE0_FORMAT_SHIFT) |
-		  (0 << VE0_OFFSET_SHIFT));
-	OUT_BATCH((BRW_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT) |
-		  (BRW_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT) |
-		  (BRW_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT) |
-		  (BRW_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT) |
-		  (4 << VE1_DESTINATION_ELEMENT_OFFSET_SHIFT));
-	/* u0, v0 */
+	
+	OUT_BATCH(BRW_3DSTATE_VERTEX_ELEMENTS | ((2 * (1 + nelem)) - 1));
+	/* x,y */
 	OUT_BATCH((0 << VE0_VERTEX_BUFFER_INDEX_SHIFT) |
 		  VE0_VALID |
 		  (BRW_SURFACEFORMAT_R32G32_FLOAT << VE0_FORMAT_SHIFT) |
-		  (8 << VE0_OFFSET_SHIFT)); /* offset vb in bytes */
-	OUT_BATCH((BRW_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT) |
-		  (BRW_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT) |
-		  (BRW_VFCOMPONENT_NOSTORE << VE1_VFCOMPONENT_2_SHIFT) |
-		  (BRW_VFCOMPONENT_NOSTORE << VE1_VFCOMPONENT_3_SHIFT) |
-		  (8 << VE1_DESTINATION_ELEMENT_OFFSET_SHIFT)); /* VUE offset in dwords */
-	/* u1, v1 */
+		  (0				<< VE0_OFFSET_SHIFT));
+	OUT_BATCH((BRW_VFCOMPONENT_STORE_SRC	<< VE1_VFCOMPONENT_0_SHIFT) |
+		  (BRW_VFCOMPONENT_STORE_SRC	<< VE1_VFCOMPONENT_1_SHIFT) |
+		  (BRW_VFCOMPONENT_STORE_1_FLT	<< VE1_VFCOMPONENT_2_SHIFT) |
+		  (BRW_VFCOMPONENT_STORE_1_FLT	<< VE1_VFCOMPONENT_3_SHIFT) |
+		  (4				<< VE1_DESTINATION_ELEMENT_OFFSET_SHIFT));
+	/* u0, v0, w0 */
+	OUT_BATCH((0				<< VE0_VERTEX_BUFFER_INDEX_SHIFT) |
+		  VE0_VALID					     |
+		  (src_format			<< VE0_FORMAT_SHIFT) |
+		  ((2 * 4)			<< VE0_OFFSET_SHIFT)); /* offset vb in bytes */
+	OUT_BATCH((BRW_VFCOMPONENT_STORE_SRC	<< VE1_VFCOMPONENT_0_SHIFT) |
+		  (BRW_VFCOMPONENT_STORE_SRC	<< VE1_VFCOMPONENT_1_SHIFT) |
+		  (w_component			<< VE1_VFCOMPONENT_2_SHIFT) |
+		  (BRW_VFCOMPONENT_NOSTORE	<< VE1_VFCOMPONENT_3_SHIFT) |
+		  ((4 + 4)			<< VE1_DESTINATION_ELEMENT_OFFSET_SHIFT)); /* VUE offset in dwords */
+	/* u1, v1, w1 */
    	if (pMask) {
-	    OUT_BATCH((0 << VE0_VERTEX_BUFFER_INDEX_SHIFT) |
-		      VE0_VALID |
-		      (BRW_SURFACEFORMAT_R32G32_FLOAT << VE0_FORMAT_SHIFT) |
-		      (16 << VE0_OFFSET_SHIFT));
-	    OUT_BATCH((BRW_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT) |
-		      (BRW_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT) |
-		      (BRW_VFCOMPONENT_NOSTORE << VE1_VFCOMPONENT_2_SHIFT) |
-		      (BRW_VFCOMPONENT_NOSTORE << VE1_VFCOMPONENT_3_SHIFT) |
-		      (10 << VE1_DESTINATION_ELEMENT_OFFSET_SHIFT));
+	    OUT_BATCH((0			    << VE0_VERTEX_BUFFER_INDEX_SHIFT) |
+		      VE0_VALID							    |
+		      (src_format		    << VE0_FORMAT_SHIFT) |
+		      (((2 + selem) * 4)    	    << VE0_OFFSET_SHIFT));  /* vb offset in bytes */
+	    
+	    OUT_BATCH((BRW_VFCOMPONENT_STORE_SRC    << VE1_VFCOMPONENT_0_SHIFT) |
+		      (BRW_VFCOMPONENT_STORE_SRC    << VE1_VFCOMPONENT_1_SHIFT) |
+		      (w_component		    << VE1_VFCOMPONENT_2_SHIFT) |
+		      (BRW_VFCOMPONENT_NOSTORE	    << VE1_VFCOMPONENT_3_SHIFT) |
+		      ((4 + 2 + 4)		    << VE1_DESTINATION_ELEMENT_OFFSET_SHIFT)); /* VUE offset in dwords */
    	}
 
 	ADVANCE_BATCH();
@@ -1053,38 +1125,87 @@ i965_composite(PixmapPtr pDst, int srcX, int srcY, int maskX, int maskY,
     ScrnInfoPtr pScrn = xf86Screens[pDst->drawable.pScreen->myNum];
     I830Ptr pI830 = I830PTR(pScrn);
     Bool has_mask;
-    float src_x[3], src_y[3], mask_x[3], mask_y[3];
+    Bool is_affine_src, is_affine_mask, is_affine;
+    float src_x[3], src_y[3], src_w[3], mask_x[3], mask_y[3], mask_w[3];
     int i;
+    int per_vertex = 2; /* dst x/y */
 
-    if (!i830_get_transformed_coordinates(srcX, srcY,
-					  pI830->transform[0],
-					  &src_x[0], &src_y[0]))
-	return;
-    if (!i830_get_transformed_coordinates(srcX, srcY + h,
-					  pI830->transform[0],
-					  &src_x[1], &src_y[1]))
-	return;
-    if (!i830_get_transformed_coordinates(srcX + w, srcY + h,
-					  pI830->transform[0],
-					  &src_x[2], &src_y[2]))
-	return;
+    is_affine_src = i830_transform_is_affine (pI830->transform[0]);
+    is_affine_mask = i830_transform_is_affine (pI830->transform[1]);
+    is_affine = is_affine_src && is_affine_mask;
+    
+    if (is_affine)
+    {
+	if (!i830_get_transformed_coordinates(srcX, srcY,
+					      pI830->transform[0],
+					      &src_x[0], &src_y[0]))
+	    return;
+	if (!i830_get_transformed_coordinates(srcX, srcY + h,
+					      pI830->transform[0],
+					      &src_x[1], &src_y[1]))
+	    return;
+	if (!i830_get_transformed_coordinates(srcX + w, srcY + h,
+					      pI830->transform[0],
+					      &src_x[2], &src_y[2]))
+	    return;
+	per_vertex += 2;    /* src u/v */
+    }
+    else
+    {
+	if (!i830_get_transformed_coordinates_3d(srcX, srcY,
+						 pI830->transform[0],
+						 &src_x[0], &src_y[0],
+						 &src_w[0]))
+	    return;
+	if (!i830_get_transformed_coordinates_3d(srcX, srcY + h,
+						 pI830->transform[0],
+						 &src_x[1], &src_y[1],
+						 &src_w[1]))
+	    return;
+	if (!i830_get_transformed_coordinates_3d(srcX + w, srcY + h,
+						 pI830->transform[0],
+						 &src_x[2], &src_y[2],
+						 &src_w[2]))
+	    return;
+	per_vertex += 3;    /* src u/v/w */
+    }
 
     if (pI830->scale_units[1][0] == -1 || pI830->scale_units[1][1] == -1) {
 	has_mask = FALSE;
     } else {
 	has_mask = TRUE;
-	if (!i830_get_transformed_coordinates(maskX, maskY,
-					      pI830->transform[1],
-					      &mask_x[0], &mask_y[0]))
-	    return;
-	if (!i830_get_transformed_coordinates(maskX, maskY + h,
-					      pI830->transform[1],
-					      &mask_x[1], &mask_y[1]))
-	    return;
-	if (!i830_get_transformed_coordinates(maskX + w, maskY + h,
-					      pI830->transform[1],
-					      &mask_x[2], &mask_y[2]))
-	    return;
+	if (is_affine) {	/* same test as the vertex writes below, which emit mask_w whenever !is_affine */
+	    if (!i830_get_transformed_coordinates(maskX, maskY,
+						  pI830->transform[1],
+						  &mask_x[0], &mask_y[0]))
+		return;
+	    if (!i830_get_transformed_coordinates(maskX, maskY + h,
+						  pI830->transform[1],
+						  &mask_x[1], &mask_y[1]))
+		return;
+	    if (!i830_get_transformed_coordinates(maskX + w, maskY + h,
+						  pI830->transform[1],
+						  &mask_x[2], &mask_y[2]))
+		return;
+	    per_vertex += 2;	/* mask u/v */
+	} else {
+	    if (!i830_get_transformed_coordinates_3d(maskX, maskY,
+						     pI830->transform[1],
+						     &mask_x[0], &mask_y[0],
+						     &mask_w[0]))
+		return;
+	    if (!i830_get_transformed_coordinates_3d(maskX, maskY + h,
+						     pI830->transform[1],
+						     &mask_x[1], &mask_y[1],
+						     &mask_w[1]))
+		return;
+	    if (!i830_get_transformed_coordinates_3d(maskX + w, maskY + h,
+						     pI830->transform[1],
+						     &mask_x[2], &mask_y[2],
+						     &mask_w[2]))
+		return;
+	    per_vertex += 3;	/* mask u/v/w */
+	}
     }
 
     /* Wait for any existing composite rectangles to land before we overwrite
@@ -1098,9 +1219,13 @@ i965_composite(PixmapPtr pDst, int srcX, int srcY, int maskX, int maskY,
     vb[i++] = (float)(dstY + h);
     vb[i++] = src_x[2] / pI830->scale_units[0][0];
     vb[i++] = src_y[2] / pI830->scale_units[0][1];
+    if (!is_affine)
+	vb[i++] = src_w[2];
     if (has_mask) {
         vb[i++] = mask_x[2] / pI830->scale_units[1][0];
         vb[i++] = mask_y[2] / pI830->scale_units[1][1];
+	if (!is_affine)
+	    vb[i++] = mask_w[2];
     }
 
     /* rect (x1,y2) */
@@ -1108,9 +1233,13 @@ i965_composite(PixmapPtr pDst, int srcX, int srcY, int maskX, int maskY,
     vb[i++] = (float)(dstY + h);
     vb[i++] = src_x[1] / pI830->scale_units[0][0];
     vb[i++] = src_y[1] / pI830->scale_units[0][1];
+    if (!is_affine)
+	vb[i++] = src_w[1];
     if (has_mask) {
         vb[i++] = mask_x[1] / pI830->scale_units[1][0];
         vb[i++] = mask_y[1] / pI830->scale_units[1][1];
+	if (!is_affine)
+	    vb[i++] = mask_w[1];
     }
 
     /* rect (x1,y1) */
@@ -1118,9 +1247,13 @@ i965_composite(PixmapPtr pDst, int srcX, int srcY, int maskX, int maskY,
     vb[i++] = (float)dstY;
     vb[i++] = src_x[0] / pI830->scale_units[0][0];
     vb[i++] = src_y[0] / pI830->scale_units[0][1];
+    if (!is_affine)
+	vb[i++] = src_w[0];
     if (has_mask) {
         vb[i++] = mask_x[0] / pI830->scale_units[1][0];
         vb[i++] = mask_y[0] / pI830->scale_units[1][1];
+	if (!is_affine)
+	    vb[i++] = mask_w[0];
     }
 
     {
diff --git a/src/packed_yuv_sf.g4b b/src/packed_yuv_sf.g4b
new file mode 100644
index 0000000..830d176
--- /dev/null
+++ b/src/packed_yuv_sf.g4b
@@ -0,0 +1,17 @@
+   { 0x00000031, 0x20c01fbd, 0x0000002c, 0x01110081 },
+   { 0x00000031, 0x20c41fbd, 0x00000034, 0x01110081 },
+   { 0x00600040, 0x20e077bd, 0x008d0080, 0x008d4060 },
+   { 0x00000041, 0x20e077bd, 0x000000e0, 0x000000c0 },
+   { 0x00000041, 0x20e477bd, 0x000000e4, 0x000000c4 },
+   { 0x00600001, 0x202003be, 0x000000e0, 0x00000000 },
+   { 0x00600001, 0x204003be, 0x000000e4, 0x00000000 },
+   { 0x00600001, 0x206003be, 0x008d0060, 0x00000000 },
+   { 0x00600031, 0x20001fbc, 0x008d0000, 0x8640c800 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
diff --git a/src/packed_yuv_wm.g4a b/src/packed_yuv_wm.g4a
index 5e31f10..9e635ba 100644
--- a/src/packed_yuv_wm.g4a
+++ b/src/packed_yuv_wm.g4a
@@ -49,44 +49,44 @@
 
     /* Set up ss0.x coordinates*/
 mov (1) g4<1>F g1.8<0,1,0>UW { align1 };
-add (1) g4.4<1>F g1.8<0,1,0>UW 1UB { align1 };
+add (1) g4.4<1>F g1.8<0,1,0>UW 1UD { align1 };
 mov (1) g4.8<1>F g1.8<0,1,0>UW { align1 };
-add (1) g4.12<1>F g1.8<0,1,0>UW 1UB { align1 };
+add (1) g4.12<1>F g1.8<0,1,0>UW 1UD { align1 };
     /* Set up ss0.y coordinates */
 mov (1) g6<1>F g1.10<0,1,0>UW { align1 };
 mov (1) g6.4<1>F g1.10<0,1,0>UW { align1 };
-add (1) g6.8<1>F g1.10<0,1,0>UW 1UB { align1 };
-add (1) g6.12<1>F g1.10<0,1,0>UW 1UB { align1 };
+add (1) g6.8<1>F g1.10<0,1,0>UW 1UD { align1 };
+add (1) g6.12<1>F g1.10<0,1,0>UW 1UD { align1 };
     /* set up ss1.x coordinates */
 mov (1) g4.16<1>F g1.12<0,1,0>UW { align1 };
-add (1) g4.20<1>F g1.12<0,1,0>UW 1UB { align1 };
+add (1) g4.20<1>F g1.12<0,1,0>UW 1UD { align1 };
 mov (1) g4.24<1>F g1.12<0,1,0>UW { align1 };
-add (1) g4.28<1>F g1.12<0,1,0>UW 1UB { align1 };
+add (1) g4.28<1>F g1.12<0,1,0>UW 1UD { align1 };
     /* set up ss1.y coordinates */
 mov (1) g6.16<1>F g1.14<0,1,0>UW { align1 };
 mov (1) g6.20<1>F g1.14<0,1,0>UW { align1 };
-add (1) g6.24<1>F g1.14<0,1,0>UW 1UB { align1 };
-add (1) g6.28<1>F g1.14<0,1,0>UW 1UB { align1 };
+add (1) g6.24<1>F g1.14<0,1,0>UW 1UD { align1 };
+add (1) g6.28<1>F g1.14<0,1,0>UW 1UD { align1 };
     /* Set up ss2.x coordinates */
 mov (1) g5<1>F g1.16<0,1,0>UW { align1 };
-add (1) g5.4<1>F g1.16<0,1,0>UW 1UB { align1 };
+add (1) g5.4<1>F g1.16<0,1,0>UW 1UD { align1 };
 mov (1) g5.8<1>F g1.16<0,1,0>UW { align1 };
-add (1) g5.12<1>F g1.16<0,1,0>UW 1UB { align1 };
+add (1) g5.12<1>F g1.16<0,1,0>UW 1UD { align1 };
     /* Set up ss2.y coordinates */
 mov (1) g7<1>F g1.18<0,1,0>UW { align1 };
 mov (1) g7.4<1>F g1.18<0,1,0>UW { align1 };
-add (1) g7.8<1>F g1.18<0,1,0>UW 1UB { align1 };
-add (1) g7.12<1>F g1.18<0,1,0>UW 1UB { align1 };
+add (1) g7.8<1>F g1.18<0,1,0>UW 1UD { align1 };
+add (1) g7.12<1>F g1.18<0,1,0>UW 1UD { align1 };
     /* Set up ss3.x coordinates */
 mov (1) g5.16<1>F g1.20<0,1,0>UW { align1 };
-add (1) g5.20<1>F g1.20<0,1,0>UW 1UB { align1 };
+add (1) g5.20<1>F g1.20<0,1,0>UW 1UD { align1 };
 mov (1) g5.24<1>F g1.20<0,1,0>UW { align1 };
-add (1) g5.28<1>F g1.20<0,1,0>UW 1UB { align1 };
+add (1) g5.28<1>F g1.20<0,1,0>UW 1UD { align1 };
     /* Set up ss3.y coordinates */
 mov (1) g7.16<1>F g1.22<0,1,0>UW { align1 };
 mov (1) g7.20<1>F g1.22<0,1,0>UW { align1 };
-add (1) g7.24<1>F g1.22<0,1,0>UW 1UB { align1 };
-add (1) g7.28<1>F g1.22<0,1,0>UW 1UB { align1 };
+add (1) g7.24<1>F g1.22<0,1,0>UW 1UD { align1 };
+add (1) g7.28<1>F g1.22<0,1,0>UW 1UD { align1 };
 
     /* Now, map these screen space coordinates into texture coordinates. */
     /* subtract screen-space X origin of vertex 0. */
diff --git a/src/packed_yuv_wm.g4b b/src/packed_yuv_wm.g4b
new file mode 100644
index 0000000..d72c651
--- /dev/null
+++ b/src/packed_yuv_wm.g4b
@@ -0,0 +1,82 @@
+   { 0x00000001, 0x2080013d, 0x00000028, 0x00000000 },
+   { 0x00000040, 0x20840d3d, 0x00000028, 0x00000001 },
+   { 0x00000001, 0x2088013d, 0x00000028, 0x00000000 },
+   { 0x00000040, 0x208c0d3d, 0x00000028, 0x00000001 },
+   { 0x00000001, 0x20c0013d, 0x0000002a, 0x00000000 },
+   { 0x00000001, 0x20c4013d, 0x0000002a, 0x00000000 },
+   { 0x00000040, 0x20c80d3d, 0x0000002a, 0x00000001 },
+   { 0x00000040, 0x20cc0d3d, 0x0000002a, 0x00000001 },
+   { 0x00000001, 0x2090013d, 0x0000002c, 0x00000000 },
+   { 0x00000040, 0x20940d3d, 0x0000002c, 0x00000001 },
+   { 0x00000001, 0x2098013d, 0x0000002c, 0x00000000 },
+   { 0x00000040, 0x209c0d3d, 0x0000002c, 0x00000001 },
+   { 0x00000001, 0x20d0013d, 0x0000002e, 0x00000000 },
+   { 0x00000001, 0x20d4013d, 0x0000002e, 0x00000000 },
+   { 0x00000040, 0x20d80d3d, 0x0000002e, 0x00000001 },
+   { 0x00000040, 0x20dc0d3d, 0x0000002e, 0x00000001 },
+   { 0x00000001, 0x20a0013d, 0x00000030, 0x00000000 },
+   { 0x00000040, 0x20a40d3d, 0x00000030, 0x00000001 },
+   { 0x00000001, 0x20a8013d, 0x00000030, 0x00000000 },
+   { 0x00000040, 0x20ac0d3d, 0x00000030, 0x00000001 },
+   { 0x00000001, 0x20e0013d, 0x00000032, 0x00000000 },
+   { 0x00000001, 0x20e4013d, 0x00000032, 0x00000000 },
+   { 0x00000040, 0x20e80d3d, 0x00000032, 0x00000001 },
+   { 0x00000040, 0x20ec0d3d, 0x00000032, 0x00000001 },
+   { 0x00000001, 0x20b0013d, 0x00000034, 0x00000000 },
+   { 0x00000040, 0x20b40d3d, 0x00000034, 0x00000001 },
+   { 0x00000001, 0x20b8013d, 0x00000034, 0x00000000 },
+   { 0x00000040, 0x20bc0d3d, 0x00000034, 0x00000001 },
+   { 0x00000001, 0x20f0013d, 0x00000036, 0x00000000 },
+   { 0x00000001, 0x20f4013d, 0x00000036, 0x00000000 },
+   { 0x00000040, 0x20f80d3d, 0x00000036, 0x00000001 },
+   { 0x00000040, 0x20fc0d3d, 0x00000036, 0x00000001 },
+   { 0x00600040, 0x208077bd, 0x008d0080, 0x00004020 },
+   { 0x00600040, 0x20a077bd, 0x008d00a0, 0x00004020 },
+   { 0x00600041, 0x208077bd, 0x008d0080, 0x00000060 },
+   { 0x00600041, 0x20a077bd, 0x008d00a0, 0x00000060 },
+   { 0x00600040, 0x208077bd, 0x008d0080, 0x0000006c },
+   { 0x00600040, 0x20a077bd, 0x008d00a0, 0x0000006c },
+   { 0x00600040, 0x20c077bd, 0x008d00c0, 0x00004024 },
+   { 0x00600040, 0x20e077bd, 0x008d00e0, 0x00004024 },
+   { 0x00600041, 0x20c077bd, 0x008d00c0, 0x00000074 },
+   { 0x00600041, 0x20e077bd, 0x008d00e0, 0x00000074 },
+   { 0x00600040, 0x20c077bd, 0x008d00c0, 0x0000007c },
+   { 0x00600040, 0x20e077bd, 0x008d00e0, 0x0000007c },
+   { 0x00600001, 0x202003be, 0x008d0080, 0x00000000 },
+   { 0x00600001, 0x204003be, 0x008d00a0, 0x00000000 },
+   { 0x00600001, 0x206003be, 0x008d00c0, 0x00000000 },
+   { 0x00600001, 0x208003be, 0x008d00e0, 0x00000000 },
+   { 0x00800031, 0x21801d29, 0x008d0000, 0x02580001 },
+   { 0x00600001, 0x22600129, 0x008d0260, 0x00000000 },
+   { 0x00600040, 0x21c07fbd, 0x008d01c0, 0xbd808081 },
+   { 0x00600040, 0x21807fbd, 0x008d0180, 0xbf008084 },
+   { 0x00600040, 0x22007fbd, 0x008d0200, 0xbf008084 },
+   { 0x00600041, 0x21c07fbd, 0x008d01c0, 0x3f94fdf4 },
+   { 0x00600041, 0x20007fbc, 0x008d0180, 0x3fcc49ba },
+   { 0x80600048, 0x20407fbe, 0x008d01c0, 0x3f800000 },
+   { 0x00600041, 0x20007fbc, 0x008d0180, 0xbf5020c5 },
+   { 0x00600048, 0x20007fbc, 0x008d0200, 0xbec8b439 },
+   { 0x80600048, 0x20607fbe, 0x008d01c0, 0x3f800000 },
+   { 0x00600041, 0x20007fbc, 0x008d0200, 0x40011687 },
+   { 0x80600048, 0x20807fbe, 0x008d01c0, 0x3f800000 },
+   { 0x00600040, 0x21e07fbd, 0x008d01e0, 0xbd808081 },
+   { 0x00600040, 0x21a07fbd, 0x008d01a0, 0xbf008084 },
+   { 0x00600040, 0x22207fbd, 0x008d0220, 0xbf008084 },
+   { 0x00600041, 0x21e07fbd, 0x008d01e0, 0x3f94fdf4 },
+   { 0x00600041, 0x20007fbc, 0x008d01a0, 0x3fcc49ba },
+   { 0x80600048, 0x20c07fbe, 0x008d01e0, 0x3f800000 },
+   { 0x00600041, 0x20007fbc, 0x008d01a0, 0xbf5020c5 },
+   { 0x00600048, 0x20007fbc, 0x008d0220, 0xbec8b439 },
+   { 0x80600048, 0x20e07fbe, 0x008d01e0, 0x3f800000 },
+   { 0x00600041, 0x20007fbc, 0x008d0220, 0x40011687 },
+   { 0x80600048, 0x21007fbe, 0x008d01e0, 0x3f800000 },
+   { 0x00600201, 0x20200022, 0x008d0020, 0x00000000 },
+   { 0x00800031, 0x24001d28, 0x008d0000, 0x85a04800 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },


More information about the xorg-commit mailing list