xf86-video-intel: 11 commits - src/exa_wm_ca.g4a src/exa_wm_ca_srcalpha.g4a src/exa_wm.g4i src/exa_wm_mask_sample_a.g4a src/exa_wm_mask_sample_argb.g4a src/exa_wm_noca.g4a src/exa_wm_src_sample_a.g4a src/exa_wm_src_sample_argb.g4a src/exa_wm_src_sample_planar.g4a src/exa_wm_src_sample_planar.g4b src/exa_wm_write.g4a src/exa_wm_yuv_rgb.g4a src/exa_wm_yuv_rgb.g4b src/i830_video.c src/i965_video.c src/Makefile.am src/packed_yuv_wm.g4a src/packed_yuv_wm.g4b

Thu Apr 10 14:58:40 PDT 2008

src/Makefile.am                  |    6 +
 src/exa_wm.g4i                   |   50 +++++++---
 src/exa_wm_ca.g4a                |    8 -
 src/exa_wm_ca_srcalpha.g4a       |    8 -
 src/exa_wm_mask_sample_a.g4a     |    2 
 src/exa_wm_mask_sample_argb.g4a  |    2 
 src/exa_wm_noca.g4a              |    8 -
 src/exa_wm_src_sample_a.g4a      |    2 
 src/exa_wm_src_sample_argb.g4a   |    2 
 src/exa_wm_src_sample_planar.g4a |   66 +++++++++++++
 src/exa_wm_src_sample_planar.g4b |    4 
 src/exa_wm_write.g4a             |   54 +++++------
 src/exa_wm_yuv_rgb.g4a           |  114 ++++++++++++++++++++++++
 src/exa_wm_yuv_rgb.g4b           |   20 ++++
 src/i830_video.c                 |  110 +++++------------------
 src/i965_video.c                 |  185 ++++++++++++++++++++++++++-------------
 src/packed_yuv_wm.g4a            |  112 +++++++++++++++--------
 src/packed_yuv_wm.g4b            |   23 ++--
 18 files changed, 522 insertions(+), 254 deletions(-)

New commits:
commit d5a80e1e3ab5724d34b20f9ee6f830efd0f5b076
Author: Keith Packard <keithp at keithp.com>
Date:   Wed Apr 9 16:27:40 2008 -0500

    Single memcpy when pitches align on planar image transfer

diff --git a/src/i830_video.c b/src/i830_video.c
index 4e1f725..2437288 100644
--- a/src/i830_video.c
+++ b/src/i830_video.c
@@ -1382,11 +1382,14 @@ I830CopyPlanarData(ScrnInfoPtr pScrn, I830PortPrivPtr pPriv,
 
     switch (pPriv->rotation) {
     case RR_Rotate_0:
-	for (i = 0; i < h; i++) {
-	    memcpy(dst1, src1, w);
-	    src1 += srcPitch;
-	    dst1 += dstPitch2;
-	}
+	if (srcPitch == dstPitch2)
+	    memcpy (dst1, src1, srcPitch * h);
+	else
+	    for (i = 0; i < h; i++) {
+		memcpy(dst1, src1, w);
+		src1 += srcPitch;
+		dst1 += dstPitch2;
+	    }
 	break;
     case RR_Rotate_90:
 	for (i = 0; i < h; i++) {
@@ -1437,11 +1440,14 @@ I830CopyPlanarData(ScrnInfoPtr pScrn, I830PortPrivPtr pPriv,
 
     switch (pPriv->rotation) {
     case RR_Rotate_0:
-	for (i = 0; i < h / 2; i++) {
-	    memcpy(dst2, src2, w / 2);
-	    src2 += srcPitch2;
-	    dst2 += dstPitch;
-	}
+	if (srcPitch2 == dstPitch)
+	    memcpy (dst2, src2, h/2 * srcPitch2);
+	else
+	    for (i = 0; i < h / 2; i++) {
+		memcpy(dst2, src2, w / 2);
+		src2 += srcPitch2;
+		dst2 += dstPitch;
+	    }
 	break;
     case RR_Rotate_90:
 	for (i = 0; i < (h/2); i++) {
@@ -1493,11 +1499,14 @@ I830CopyPlanarData(ScrnInfoPtr pScrn, I830PortPrivPtr pPriv,
 
     switch (pPriv->rotation) {
     case RR_Rotate_0:
-	for (i = 0; i < h / 2; i++) {
-	    memcpy(dst3, src3, w / 2);
-	    src3 += srcPitch2;
-	    dst3 += dstPitch;
-	}
+	if (srcPitch2 == dstPitch)
+	    memcpy (dst3, src3, srcPitch2 * h/2);
+	else
+	    for (i = 0; i < h / 2; i++) {
+		memcpy(dst3, src3, w / 2);
+		src3 += srcPitch2;
+		dst3 += dstPitch;
+	    }
 	break;
     case RR_Rotate_90:
 	for (i = 0; i < (h/2); i++) {
commit ac97f2b1487df5574875350a9cded958dae33afa
Author: Keith Packard <keithp at keithp.com>
Date:   Wed Apr 9 16:27:23 2008 -0500

    Use available symbolic register name

diff --git a/src/exa_wm_src_sample_planar.g4a b/src/exa_wm_src_sample_planar.g4a
index 92b867e..10b15eb 100644
--- a/src/exa_wm_src_sample_planar.g4a
+++ b/src/exa_wm_src_sample_planar.g4a
@@ -42,7 +42,7 @@ mov (1) g0.8<1>UD	0x0000e000UD { align1 mask_disable };
 
 /* sample Y */
 send (16) src_msg_ind		/* msg reg index */
-	src_sample_g<1>UW 	/* readback */
+	src_sample_g_01<1>UW 	/* readback */
 	g0<8,8,1>UW		/* copy to msg start reg*/
 	sampler (1,0,F)		/* sampler message description, (binding_table,sampler_index,datatype)
 				/* here(src->dst) we should use src_sampler and src_surface */
@@ -50,7 +50,7 @@ send (16) src_msg_ind		/* msg reg index */
 	
 /* sample U (Cr) */
 send (16) src_msg_ind		/* msg reg index */
-	src_sample_r<1>UW 	/* readback */
+	src_sample_r_01<1>UW 	/* readback */
 	g0<8,8,1>UW		/* copy to msg start reg*/
 	sampler (2,1,F)		/* sampler message description, (binding_table,sampler_index,datatype)
 				/* here(src->dst) we should use src_sampler and src_surface */
@@ -58,7 +58,7 @@ send (16) src_msg_ind		/* msg reg index */
 	
 /* sample V (Cb) */
 send (16) src_msg_ind		/* msg reg index */
-	src_sample_b<1>UW 	/* readback */
+	src_sample_b_01<1>UW 	/* readback */
 	g0<8,8,1>UW		/* copy to msg start reg*/
 	sampler (3,2,F)		/* sampler message description, (binding_table,sampler_index,datatype)
 				/* here(src->dst) we should use src_sampler and src_surface */
commit f270456e5612cb88933e6aabcd9a816c5c292229
Author: Keith Packard <keithp at keithp.com>
Date:   Wed Apr 9 10:00:08 2008 -0500

    Remove .g4b files on clean

diff --git a/src/Makefile.am b/src/Makefile.am
index 48ea567..91f5995 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -192,6 +192,8 @@ $(INTEL_G4B): $(INTEL_G4I)
 
 BUILT_SOURCES= $(INTEL_G4B)
 
+clean-local:
+	-rm -f $(INTEL_G4B)
 endif
 
 if XMODES
commit fb6ed8b8b59e9dd18801afef5d7c62042ad176d7
Author: Keith Packard <keithp at keithp.com>
Date:   Wed Apr 9 09:58:44 2008 -0500

    Update yuv->rgb conversion programs to write to src_sample regs
    
    The YUV->RGB code was written to write directly to the dataport registers,
    but that didn't work for the compositing functions (cause still unknown).
    This change makes that code write RGB values to the src_sample registers as
    with the other sample computation fragments.

diff --git a/src/exa_wm_yuv_rgb.g4a b/src/exa_wm_yuv_rgb.g4a
index 0c7525d..327a808 100644
--- a/src/exa_wm_yuv_rgb.g4a
+++ b/src/exa_wm_yuv_rgb.g4a
@@ -30,18 +30,30 @@ include(`exa_wm.g4i')
 
 define(`YCbCr_base',	`src_sample_base')
 
-define(`Cr',		`src_sample_r')
+define(`Cr',		`src_sample_r_01')
 define(`Cr_01',		`src_sample_r_01')
 define(`Cr_23',		`src_sample_r_23')
 
-define(`Y',		`src_sample_g')
+define(`Y',		`src_sample_g_01')
 define(`Y_01',		`src_sample_g_01')
 define(`Y_23',		`src_sample_g_23')
 
-define(`Cb',		`src_sample_b')
+define(`Cb',		`src_sample_b_01')
 define(`Cb_01',		`src_sample_b_01')
 define(`Cb_23',		`src_sample_b_23')
 
+define(`Crn',		`mask_sample_r_01')
+define(`Crn_01',	`mask_sample_r_01')
+define(`Crn_23',	`mask_sample_r_23')
+
+define(`Yn',		`mask_sample_g_01')
+define(`Yn_01',		`mask_sample_g_01')
+define(`Yn_23',		`mask_sample_g_23')
+
+define(`Cbn',		`mask_sample_b_01')
+define(`Cbn_01',	`mask_sample_b_01')
+define(`Cbn_23',	`mask_sample_b_23')
+
     /* color space conversion function:
      * R = Clamp ( 1.164(Y-16/255) + 1.596(Cr-128/255), 0, 1)
      * G = Clamp ( 1.164(Y-16/255) - 0.813(Cr-128/255) - 0.392(Cb-128/255), 0, 1)
@@ -58,45 +70,45 @@ define(`Cb_23',		`src_sample_b_23')
 
     /* Normalize Y, Cb and Cr:
      *
-     * Y = (Y - 16/255) * 1.164
-     * Cr = Cr - 128 / 255
-     * Cb = Cb - 128 / 255
+     * Yn = (Y - 16/255) * 1.164
+     * Crn = Cr - 128 / 255
+     * Cbn = Cb - 128 / 255
      */
-add (16)    Y<1>F		Y<8,8,1>F	-0.0627451F { compr align1 };
-mul (16)    Y<1>F		Y<8,8,1>F	1.164F	    { compr align1 };
+add (16)    Yn<1>F		Y<8,8,1>F	-0.0627451F { compr align1 };
+mul (16)    Yn<1>F		Yn<8,8,1>F	1.164F	    { compr align1 };
 
-add (16)    Cr<1>F		Cr<8,8,1>F	-0.501961F  { compr align1 };
+add (16)    Crn<1>F		Cr<8,8,1>F	-0.501961F  { compr align1 };
 
-add (16)    Cb<1>F		Cb<8,8,1>F	-0.501961F  { compr align1 };
+add (16)    Cbn<1>F		Cb<8,8,1>F	-0.501961F  { compr align1 };
 
     /* 
      * R = Y + Cr * 1.596
      */
-mul (8)	    null		Cr_01<8,8,1>F	1.596F	    { align1 };
-mac.sat (8) data_port_r_01<1>F	Y_01<8,8,1>F	1F	    { align1  };
-mul (8)     null		Cr_23<8,8,1>F	1.596F	    { align1 };
-mac.sat (8) data_port_r_23<1>F	Y_23<8,8,1>F	1F	    { align1  };
+mul (8)	    null		Crn_01<8,8,1>F	1.596F	    { align1 };
+mac.sat (8) src_sample_r_01<1>F	Yn_01<8,8,1>F	1F	    { align1  };
+mul (8)     null		Crn_23<8,8,1>F	1.596F	    { align1 };
+mac.sat (8) src_sample_r_23<1>F	Yn_23<8,8,1>F	1F	    { align1  };
      
     /*
-     * G = Cr * -0.813 + Cb * -0.392 + Y
+     * G = Crn * -0.813 + Cbn * -0.392 + Y
      */
-mul (8)	    null		Cr_01<8,8,1>F	-0.813F	    { align1 };
-mac (8)	    null		Cb_01<8,8,1>F	-0.392F	    { align1 };
-mac.sat (8) data_port_g_01<1>F	Y_01<8,8,1>F	1F	    { align1 };
-mul (8)	    null		Cr_23<8,8,1>F	-0.813F	    { align1 };
-mac (8)	    null		Cb_23<8,8,1>F	-0.392F	    { align1 };
-mac.sat (8) data_port_g_23<1>F	Y_23<8,8,1>F	1F	    { align1 };
+mul (8)	    null		Crn_01<8,8,1>F	-0.813F	    { align1 };
+mac (8)	    null		Cbn_01<8,8,1>F	-0.392F	    { align1 };
+mac.sat (8) src_sample_g_01<1>F	Yn_01<8,8,1>F	1F	    { align1 };
+mul (8)	    null		Crn_23<8,8,1>F	-0.813F	    { align1 };
+mac (8)	    null		Cbn_23<8,8,1>F	-0.392F	    { align1 };
+mac.sat (8) src_sample_g_23<1>F	Yn_23<8,8,1>F	1F	    { align1 };
 
     /*
-     * B = Cb * 2.017 + Y
+     * B = Cbn * 2.017 + Y
      */
-mul (8)	    null		Cb_01<8,8,1>F	2.017F	    { align1 };
-mac.sat (8) data_port_b_01<1>F	Y_01<8,8,1>F	1F	    { align1 };
-mul (8)	    null		Cb_23<8,8,1>F	2.017F	    { align1 };
-mac.sat (8) data_port_b_23<1>F	Y_23<8,8,1>F	1F	    { align1 };
+mul (8)	    null		Cbn_01<8,8,1>F	2.017F	    { align1 };
+mac.sat (8) src_sample_b_01<1>F	Yn_01<8,8,1>F	1F	    { align1 };
+mul (8)	    null		Cbn_23<8,8,1>F	2.017F	    { align1 };
+mac.sat (8) src_sample_b_23<1>F	Yn_23<8,8,1>F	1F	    { align1 };
 
     /*
      * A = 1.0
      */
-mov (8)	    data_port_a_01<1>F	1.0F			    { align1 };
-mov (8)	    data_port_a_23<1>F	1.0F			    { align1 };
+mov (8)	    src_sample_a_01<1>F	1.0F			    { align1 };
+mov (8)	    src_sample_a_23<1>F	1.0F			    { align1 };
diff --git a/src/exa_wm_yuv_rgb.g4b b/src/exa_wm_yuv_rgb.g4b
index 017186a..be72e54 100644
--- a/src/exa_wm_yuv_rgb.g4b
+++ b/src/exa_wm_yuv_rgb.g4b
@@ -1,20 +1,20 @@
-   { 0x00802040, 0x22007fbd, 0x008d0200, 0xbd808081 },
-   { 0x00802041, 0x22007fbd, 0x008d0200, 0x3f94fdf4 },
-   { 0x00802040, 0x21c07fbd, 0x008d01c0, 0xbf008084 },
-   { 0x00802040, 0x22407fbd, 0x008d0240, 0xbf008084 },
-   { 0x00600041, 0x20007fbc, 0x008d01c0, 0x3fcc49ba },
-   { 0x80600048, 0x20407fbe, 0x008d0200, 0x3f800000 },
-   { 0x00600041, 0x20007fbc, 0x008d01e0, 0x3fcc49ba },
-   { 0x80600048, 0x20c07fbe, 0x008d0220, 0x3f800000 },
-   { 0x00600041, 0x20007fbc, 0x008d01c0, 0xbf5020c5 },
-   { 0x00600048, 0x20007fbc, 0x008d0240, 0xbec8b439 },
-   { 0x80600048, 0x20607fbe, 0x008d0200, 0x3f800000 },
-   { 0x00600041, 0x20007fbc, 0x008d01e0, 0xbf5020c5 },
-   { 0x00600048, 0x20007fbc, 0x008d0260, 0xbec8b439 },
-   { 0x80600048, 0x20e07fbe, 0x008d0220, 0x3f800000 },
-   { 0x00600041, 0x20007fbc, 0x008d0240, 0x40011687 },
-   { 0x80600048, 0x20807fbe, 0x008d0200, 0x3f800000 },
-   { 0x00600041, 0x20007fbc, 0x008d0260, 0x40011687 },
-   { 0x80600048, 0x21007fbe, 0x008d0220, 0x3f800000 },
-   { 0x00600001, 0x20a003fe, 0x00000000, 0x3f800000 },
-   { 0x00600001, 0x212003fe, 0x00000000, 0x3f800000 },
+   { 0x00802040, 0x23007fbd, 0x008d0200, 0xbd808081 },
+   { 0x00802041, 0x23007fbd, 0x008d0300, 0x3f94fdf4 },
+   { 0x00802040, 0x22c07fbd, 0x008d01c0, 0xbf008084 },
+   { 0x00802040, 0x23407fbd, 0x008d0240, 0xbf008084 },
+   { 0x00600041, 0x20007fbc, 0x008d02c0, 0x3fcc49ba },
+   { 0x80600048, 0x21c07fbd, 0x008d0300, 0x3f800000 },
+   { 0x00600041, 0x20007fbc, 0x008d02e0, 0x3fcc49ba },
+   { 0x80600048, 0x21e07fbd, 0x008d0320, 0x3f800000 },
+   { 0x00600041, 0x20007fbc, 0x008d02c0, 0xbf5020c5 },
+   { 0x00600048, 0x20007fbc, 0x008d0340, 0xbec8b439 },
+   { 0x80600048, 0x22007fbd, 0x008d0300, 0x3f800000 },
+   { 0x00600041, 0x20007fbc, 0x008d02e0, 0xbf5020c5 },
+   { 0x00600048, 0x20007fbc, 0x008d0360, 0xbec8b439 },
+   { 0x80600048, 0x22207fbd, 0x008d0320, 0x3f800000 },
+   { 0x00600041, 0x20007fbc, 0x008d0340, 0x40011687 },
+   { 0x80600048, 0x22407fbd, 0x008d0300, 0x3f800000 },
+   { 0x00600041, 0x20007fbc, 0x008d0360, 0x40011687 },
+   { 0x80600048, 0x22607fbd, 0x008d0320, 0x3f800000 },
+   { 0x00600001, 0x228003fd, 0x00000000, 0x3f800000 },
+   { 0x00600001, 0x22a003fd, 0x00000000, 0x3f800000 },
commit b01d582e23fc99e32bc47a395e9caa366731372a
Author: Keith Packard <keithp at keithp.com>
Date:   Wed Apr 9 09:30:35 2008 -0500

    Revert "Compute pixel values directly into data port"
    
    This reverts commit 346cf57deabb4c336612df4c13650a87b5ef6775.
    
    Mixing randr transforms and video caused screen corruption for Render
    operations. No, I don't understand why.

diff --git a/src/exa_wm.g4i b/src/exa_wm.g4i
index a4b464b..ee8e3ad 100644
--- a/src/exa_wm.g4i
+++ b/src/exa_wm.g4i
@@ -103,20 +103,12 @@ define(`mask_w_1',  `src_w_1')
 
 /* sample src to these registers */
 define(`src_sample_base',	`g14')
-
-define(`src_sample_r',		`g14')
 define(`src_sample_r_01',	`g14')
 define(`src_sample_r_23',	`g15')
-
-define(`src_sample_g',		`g16')
 define(`src_sample_g_01',	`g16')
 define(`src_sample_g_23',	`g17')
-
-define(`src_sample_b',		`g18')
 define(`src_sample_b_01',	`g18')
 define(`src_sample_b_23',	`g19')
-
-define(`src_sample_a',		`g20')
 define(`src_sample_a_01',	`g20')
 define(`src_sample_a_23',	`g21')
 
diff --git a/src/exa_wm_ca.g4a b/src/exa_wm_ca.g4a
index a8cb806..5d982b3 100644
--- a/src/exa_wm_ca.g4a
+++ b/src/exa_wm_ca.g4a
@@ -32,14 +32,7 @@
 include(`exa_wm.g4i')
 
 /* mul mask rgba channels to src */
-mul (8)	    data_port_r_01<1>F	src_sample_r_01<8,8,1>F	mask_sample_r_01<8,8,1>F { align1 };
-mul (8)	    data_port_r_23<1>F	src_sample_r_23<8,8,1>F	mask_sample_r_23<8,8,1>F { align1 };
-
-mul (8)	    data_port_g_01<1>F src_sample_g_01<8,8,1>F	mask_sample_g_01<8,8,1>F { align1 };
-mul (8)	    data_port_g_23<1>F src_sample_g_23<8,8,1>F	mask_sample_g_23<8,8,1>F { align1 };
-
-mul (8)	    data_port_b_01<1>F src_sample_b_01<8,8,1>F	mask_sample_b_01<8,8,1>F { align1 };
-mul (8)	    data_port_b_23<1>F src_sample_b_23<8,8,1>F	mask_sample_b_23<8,8,1>F { align1 };
-
-mul (8)	    data_port_a_01<1>F src_sample_a_01<8,8,1>F	mask_sample_a_01<8,8,1>F { align1 };
-mul (8)	    data_port_a_23<1>F src_sample_a_23<8,8,1>F	mask_sample_a_23<8,8,1>F { align1 };
+mul (16)    src_sample_r_01<1>F	src_sample_r_01<8,8,1>F	mask_sample_r_01<8,8,1>F { compr align1 };
+mul (16)    src_sample_g_01<1>F src_sample_g_01<8,8,1>F	mask_sample_g_01<8,8,1>F { compr align1 };
+mul (16)    src_sample_b_01<1>F src_sample_b_01<8,8,1>F	mask_sample_b_01<8,8,1>F { compr align1 };
+mul (16)    src_sample_a_01<1>F src_sample_a_01<8,8,1>F	mask_sample_a_01<8,8,1>F { compr align1 };
diff --git a/src/exa_wm_ca.g4b b/src/exa_wm_ca.g4b
index ec33611..372e8b2 100644
--- a/src/exa_wm_ca.g4b
+++ b/src/exa_wm_ca.g4b
@@ -1,8 +1,4 @@
-   { 0x00600041, 0x204077be, 0x008d01c0, 0x008d02c0 },
-   { 0x00600041, 0x20c077be, 0x008d01e0, 0x008d02e0 },
-   { 0x00600041, 0x206077be, 0x008d0200, 0x008d0300 },
-   { 0x00600041, 0x20e077be, 0x008d0220, 0x008d0320 },
-   { 0x00600041, 0x208077be, 0x008d0240, 0x008d0340 },
-   { 0x00600041, 0x210077be, 0x008d0260, 0x008d0360 },
-   { 0x00600041, 0x20a077be, 0x008d0280, 0x008d0380 },
-   { 0x00600041, 0x212077be, 0x008d02a0, 0x008d03a0 },
+   { 0x00802041, 0x21c077bd, 0x008d01c0, 0x008d02c0 },
+   { 0x00802041, 0x220077bd, 0x008d0200, 0x008d0300 },
+   { 0x00802041, 0x224077bd, 0x008d0240, 0x008d0340 },
+   { 0x00802041, 0x228077bd, 0x008d0280, 0x008d0380 },
diff --git a/src/exa_wm_ca_srcalpha.g4a b/src/exa_wm_ca_srcalpha.g4a
index a5f029f..d1f847f 100644
--- a/src/exa_wm_ca_srcalpha.g4a
+++ b/src/exa_wm_ca_srcalpha.g4a
@@ -31,14 +31,7 @@
 
 include(`exa_wm.g4i')
 
-mul (8)     data_port_r_01<1>F mask_sample_r_01<8,8,1>F src_sample_a_01<8,8,1>F { align1 };
-mul (8)     data_port_r_23<1>F mask_sample_r_23<8,8,1>F src_sample_a_23<8,8,1>F { align1 };
-
-mul (8)     data_port_g_01<1>F mask_sample_g_01<8,8,1>F src_sample_a_01<8,8,1>F { align1 };
-mul (8)     data_port_g_23<1>F mask_sample_g_23<8,8,1>F src_sample_a_23<8,8,1>F { align1 };
-
-mul (8)     data_port_b_01<1>F mask_sample_b_01<8,8,1>F src_sample_a_01<8,8,1>F { align1 };
-mul (8)     data_port_b_23<1>F mask_sample_b_23<8,8,1>F src_sample_a_23<8,8,1>F { align1 };
-
-mul (8)     data_port_a_01<1>F mask_sample_a_01<8,8,1>F src_sample_a_01<8,8,1>F { align1 };
-mul (8)     data_port_a_23<1>F mask_sample_a_23<8,8,1>F src_sample_a_23<8,8,1>F { align1 };
+mul (16)    src_sample_r_01<1>F mask_sample_r_01<8,8,1>F src_sample_a_01<8,8,1>F { compr align1 };
+mul (16)    src_sample_g_01<1>F mask_sample_g_01<8,8,1>F src_sample_a_01<8,8,1>F { compr align1 };
+mul (16)    src_sample_b_01<1>F mask_sample_b_01<8,8,1>F src_sample_a_01<8,8,1>F { compr align1 };
+mul (16)    src_sample_a_01<1>F mask_sample_a_01<8,8,1>F src_sample_a_01<8,8,1>F { compr align1 };
diff --git a/src/exa_wm_ca_srcalpha.g4b b/src/exa_wm_ca_srcalpha.g4b
index 6ea89b8..963d676 100644
--- a/src/exa_wm_ca_srcalpha.g4b
+++ b/src/exa_wm_ca_srcalpha.g4b
@@ -1,8 +1,4 @@
-   { 0x00600041, 0x204077be, 0x008d02c0, 0x008d0280 },
-   { 0x00600041, 0x20c077be, 0x008d02e0, 0x008d02a0 },
-   { 0x00600041, 0x206077be, 0x008d0300, 0x008d0280 },
-   { 0x00600041, 0x20e077be, 0x008d0320, 0x008d02a0 },
-   { 0x00600041, 0x208077be, 0x008d0340, 0x008d0280 },
-   { 0x00600041, 0x210077be, 0x008d0360, 0x008d02a0 },
-   { 0x00600041, 0x20a077be, 0x008d0380, 0x008d0280 },
-   { 0x00600041, 0x212077be, 0x008d03a0, 0x008d02a0 },
+   { 0x00802041, 0x21c077bd, 0x008d02c0, 0x008d0280 },
+   { 0x00802041, 0x220077bd, 0x008d0300, 0x008d0280 },
+   { 0x00802041, 0x224077bd, 0x008d0340, 0x008d0280 },
+   { 0x00802041, 0x228077bd, 0x008d0380, 0x008d0280 },
diff --git a/src/exa_wm_noca.g4a b/src/exa_wm_noca.g4a
index f43c6f4..d0d60fa 100644
--- a/src/exa_wm_noca.g4a
+++ b/src/exa_wm_noca.g4a
@@ -32,14 +32,7 @@
 include(`exa_wm.g4i')
 /* mul mask's alpha channel to src */
 
-mul (8)    data_port_r_01<1>F	src_sample_r_01<8,8,1>F	mask_sample_a_01<8,8,1>F { align1 };
-mul (8)    data_port_r_23<1>F	src_sample_r_23<8,8,1>F	mask_sample_a_23<8,8,1>F { align1 };
-
-mul (8)    data_port_g_01<1>F	src_sample_g_01<8,8,1>F	mask_sample_a_01<8,8,1>F { align1 };
-mul (8)    data_port_g_23<1>F	src_sample_g_23<8,8,1>F	mask_sample_a_23<8,8,1>F { align1 };
-
-mul (8)    data_port_b_01<1>F	src_sample_b_01<8,8,1>F	mask_sample_a_01<8,8,1>F { align1 };
-mul (8)    data_port_b_23<1>F	src_sample_b_23<8,8,1>F	mask_sample_a_23<8,8,1>F { align1 };
-
-mul (8)    data_port_a_01<1>F	src_sample_a_01<8,8,1>F	mask_sample_a_01<8,8,1>F { align1 };
-mul (8)    data_port_a_23<1>F	src_sample_a_23<8,8,1>F	mask_sample_a_23<8,8,1>F { align1 };
+mul (16)    src_sample_r_01<1>F	src_sample_r_01<8,8,1>F	mask_sample_a_01<8,8,1>F { compr align1 };
+mul (16)    src_sample_g_01<1>F src_sample_g_01<8,8,1>F	mask_sample_a_01<8,8,1>F { compr align1 };
+mul (16)    src_sample_b_01<1>F src_sample_b_01<8,8,1>F	mask_sample_a_01<8,8,1>F { compr align1 };
+mul (16)    src_sample_a_01<1>F src_sample_a_01<8,8,1>F	mask_sample_a_01<8,8,1>F { compr align1 };
diff --git a/src/exa_wm_noca.g4b b/src/exa_wm_noca.g4b
index 2f5940a..1506334 100644
--- a/src/exa_wm_noca.g4b
+++ b/src/exa_wm_noca.g4b
@@ -1,8 +1,4 @@
-   { 0x00600041, 0x204077be, 0x008d01c0, 0x008d0380 },
-   { 0x00600041, 0x20c077be, 0x008d01e0, 0x008d03a0 },
-   { 0x00600041, 0x206077be, 0x008d0200, 0x008d0380 },
-   { 0x00600041, 0x20e077be, 0x008d0220, 0x008d03a0 },
-   { 0x00600041, 0x208077be, 0x008d0240, 0x008d0380 },
-   { 0x00600041, 0x210077be, 0x008d0260, 0x008d03a0 },
-   { 0x00600041, 0x20a077be, 0x008d0280, 0x008d0380 },
-   { 0x00600041, 0x212077be, 0x008d02a0, 0x008d03a0 },
+   { 0x00802041, 0x21c077bd, 0x008d01c0, 0x008d0380 },
+   { 0x00802041, 0x220077bd, 0x008d0200, 0x008d0380 },
+   { 0x00802041, 0x224077bd, 0x008d0240, 0x008d0380 },
+   { 0x00802041, 0x228077bd, 0x008d0280, 0x008d0380 },
diff --git a/src/exa_wm_src_data.g4a b/src/exa_wm_src_data.g4a
deleted file mode 100644
index 9c3daf0..0000000
--- a/src/exa_wm_src_data.g4a
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * Copyright © 2006 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * Authors:
- *    Wang Zhenyu <zhenyu.z.wang at intel.com>
- *    Keith Packard <keithp at keithp.com>
- */
-
-include(`exa_wm.g4i')
-
-/*
- * Prepare data in m2-m5 for subspan(1,0), m6-m9 for subspan(3,2),
- *
- * Note that the SIMD16 write message takes data for the first
- * two sub-spans followed by the data for the second two sub-spans
- * instead of having the two sub-spans interleaved by channel. Weird.
- */
-
-mov (8) data_port_r_01<1>F	src_sample_r_01<8,8,1>F { align1 };
-mov (8) data_port_g_01<1>F	src_sample_g_01<8,8,1>F { align1 };
-mov (8) data_port_b_01<1>F	src_sample_b_01<8,8,1>F { align1 };
-mov (8) data_port_a_01<1>F	src_sample_a_01<8,8,1>F { align1 };
-
-mov (8) data_port_r_23<1>F	src_sample_r_23<8,8,1>F { align1 };
-mov (8) data_port_g_23<1>F	src_sample_g_23<8,8,1>F { align1 };
-mov (8) data_port_b_23<1>F	src_sample_b_23<8,8,1>F { align1 };
-mov (8) data_port_a_23<1>F 	src_sample_a_23<8,8,1>F { align1 };
diff --git a/src/exa_wm_src_data.g4b b/src/exa_wm_src_data.g4b
deleted file mode 100644
index 8b53580..0000000
--- a/src/exa_wm_src_data.g4b
+++ /dev/null
@@ -1,8 +0,0 @@
-   { 0x00600001, 0x204003be, 0x008d01c0, 0x00000000 },
-   { 0x00600001, 0x206003be, 0x008d0200, 0x00000000 },
-   { 0x00600001, 0x208003be, 0x008d0240, 0x00000000 },
-   { 0x00600001, 0x20a003be, 0x008d0280, 0x00000000 },
-   { 0x00600001, 0x20c003be, 0x008d01e0, 0x00000000 },
-   { 0x00600001, 0x20e003be, 0x008d0220, 0x00000000 },
-   { 0x00600001, 0x210003be, 0x008d0260, 0x00000000 },
-   { 0x00600001, 0x212003be, 0x008d02a0, 0x00000000 },
diff --git a/src/exa_wm_write.g4a b/src/exa_wm_write.g4a
index c46023e..b16e649 100644
--- a/src/exa_wm_write.g4a
+++ b/src/exa_wm_write.g4a
@@ -27,6 +27,24 @@
 
 include(`exa_wm.g4i')
 
+/*
+ * Prepare data in m2-m5 for subspan(1,0), m6-m9 for subspan(3,2),
+ *
+ * Note that the SIMD16 write message takes data for the first
+ * two sub-spans followed by the data for the second two sub-spans
+ * instead of having the two sub-spans interleaved by channel. Weird.
+ */
+
+mov (8) data_port_r_01<1>F	src_sample_r_01<8,8,1>F { align1 };
+mov (8) data_port_g_01<1>F	src_sample_g_01<8,8,1>F { align1 };
+mov (8) data_port_b_01<1>F	src_sample_b_01<8,8,1>F { align1 };
+mov (8) data_port_a_01<1>F	src_sample_a_01<8,8,1>F { align1 };
+
+mov (8) data_port_r_23<1>F	src_sample_r_23<8,8,1>F { align1 };
+mov (8) data_port_g_23<1>F	src_sample_g_23<8,8,1>F { align1 };
+mov (8) data_port_b_23<1>F	src_sample_b_23<8,8,1>F { align1 };
+mov (8) data_port_a_23<1>F 	src_sample_a_23<8,8,1>F { align1 };
+
 /* m0, m1 are all direct passed by PS thread payload */
 mov (8) data_port_msg_1<1>UD	g1<8,8,1>UD		{ align1 };
 
diff --git a/src/exa_wm_write.g4b b/src/exa_wm_write.g4b
index 9402d11..785fe32 100644
--- a/src/exa_wm_write.g4b
+++ b/src/exa_wm_write.g4b
@@ -1,3 +1,11 @@
+   { 0x00600001, 0x204003be, 0x008d01c0, 0x00000000 },
+   { 0x00600001, 0x206003be, 0x008d0200, 0x00000000 },
+   { 0x00600001, 0x208003be, 0x008d0240, 0x00000000 },
+   { 0x00600001, 0x20a003be, 0x008d0280, 0x00000000 },
+   { 0x00600001, 0x20c003be, 0x008d01e0, 0x00000000 },
+   { 0x00600001, 0x20e003be, 0x008d0220, 0x00000000 },
+   { 0x00600001, 0x210003be, 0x008d0260, 0x00000000 },
+   { 0x00600001, 0x212003be, 0x008d02a0, 0x00000000 },
    { 0x00600001, 0x20200022, 0x008d0020, 0x00000000 },
    { 0x00800031, 0x24001d28, 0x008d0000, 0x85a04800 },
    { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
diff --git a/src/i965_render.c b/src/i965_render.c
index 79db41c..1b4afcc 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -340,7 +340,6 @@ static const uint32_t ps_kernel_nomask_affine_static [][4] = {
 #include "exa_wm_xy.g4b"
 #include "exa_wm_src_affine.g4b"
 #include "exa_wm_src_sample_argb.g4b"
-#include "exa_wm_src_data.g4b"
 #include "exa_wm_write.g4b"
 };
 
@@ -348,7 +347,6 @@ static const uint32_t ps_kernel_nomask_projective_static [][4] = {
 #include "exa_wm_xy.g4b"
 #include "exa_wm_src_projective.g4b"
 #include "exa_wm_src_sample_argb.g4b"
-#include "exa_wm_src_data.g4b"
 #include "exa_wm_write.g4b"
 };
 
commit 3fc3d1a701bae257b70aa7b7654c722f30e71399
Author: Keith Packard <keithp at keithp.com>
Date:   Wed Apr 9 02:02:56 2008 -0500

    Remove sync after 965 video put.
    
    The hardware has been marked as needing a sync, so the next video put will
    block waiting for the previous one to complete. Adding a sync here just
    stalls the video playback for no good reason.

diff --git a/src/i965_video.c b/src/i965_video.c
index 63f1192..464f2e3 100644
--- a/src/i965_video.c
+++ b/src/i965_video.c
@@ -923,7 +923,6 @@ I965DisplayVideoTextured(ScrnInfoPtr pScrn, I830PortPrivPtr pPriv, int id,
 	i830MarkSync(pScrn);
     }
 
-    i830WaitSync(pScrn);
 #if WATCH_STATS
     i830_dump_error_state(pScrn);
 #endif
commit 825d9e50c59450f07178a54fed2616e551dc0455
Author: Keith Packard <keithp at keithp.com>
Date:   Wed Apr 9 00:09:34 2008 -0500

    Add planar video decode kernel
    
    Support for planar video reduces bus bandwidth by 25% and also reduces CPU
    usage during planar->packed conversion.

diff --git a/src/Makefile.am b/src/Makefile.am
index f50d1d4..48ea567 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -135,6 +135,7 @@ INTEL_G4A =				\
 	exa_wm_src_projective.g4a 	\
 	exa_wm_src_sample_argb.g4a 	\
 	exa_wm_src_sample_a.g4a 	\
+	exa_wm_src_sample_planar.g4a 	\
 	exa_wm_src_data.g4a		\
 	exa_wm_mask_affine.g4a 		\
 	exa_wm_mask_projective.g4a 	\
@@ -161,6 +162,7 @@ INTEL_G4B = 				\
 	exa_wm_src_projective.g4b 	\
 	exa_wm_src_sample_argb.g4b 	\
 	exa_wm_src_sample_a.g4b 	\
+	exa_wm_src_sample_planar.g4b 	\
 	exa_wm_src_data.g4b		\
 	exa_wm_mask_affine.g4b 		\
 	exa_wm_mask_projective.g4b 	\
diff --git a/src/exa_wm_src_sample_planar.g4a b/src/exa_wm_src_sample_planar.g4a
new file mode 100644
index 0000000..92b867e
--- /dev/null
+++ b/src/exa_wm_src_sample_planar.g4a
@@ -0,0 +1,66 @@
+/*
+ * Copyright © 2006 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Wang Zhenyu <zhenyu.z.wang at intel.com>
+ *    Keith Packard <keithp at keithp.com>
+ */
+
+/* Sample the src surface in planar format */
+
+include(`exa_wm.g4i')
+
+/* prepare sampler read back gX register, which would be written back to output */
+
+/* use simd16 sampler, param 0 is u, param 1 is v. */
+/* 'payload' loading, assuming tex coord start from g4 */
+
+/* load r */
+mov (1) g0.8<1>UD	0x0000e000UD { align1 mask_disable };
+
+/* src_msg will be copied with g0, as it contains send desc */
+/* emit sampler 'send' cmd */
+
+/* sample Y */
+send (16) src_msg_ind		/* msg reg index */
+	src_sample_g<1>UW 	/* readback */
+	g0<8,8,1>UW		/* copy to msg start reg*/
+	sampler (1,0,F)		/* sampler message description, (binding_table,sampler_index,datatype)
+				/* here(src->dst) we should use src_sampler and src_surface */
+	mlen 5 rlen 2 { align1 };   /* required message len 5, readback len 8 */
+	
+/* sample U (Cr) */
+send (16) src_msg_ind		/* msg reg index */
+	src_sample_r<1>UW 	/* readback */
+	g0<8,8,1>UW		/* copy to msg start reg*/
+	sampler (2,1,F)		/* sampler message description, (binding_table,sampler_index,datatype)
+				/* here(src->dst) we should use src_sampler and src_surface */
+	mlen 5 rlen 2 { align1 };   /* required message len 5, readback len 8 */
+	
+/* sample V (Cb) */
+send (16) src_msg_ind		/* msg reg index */
+	src_sample_b<1>UW 	/* readback */
+	g0<8,8,1>UW		/* copy to msg start reg*/
+	sampler (3,2,F)		/* sampler message description, (binding_table,sampler_index,datatype)
+				/* here(src->dst) we should use src_sampler and src_surface */
+	mlen 5 rlen 2 { align1 };   /* required message len 5, readback len 8 */
+
diff --git a/src/exa_wm_src_sample_planar.g4b b/src/exa_wm_src_sample_planar.g4b
new file mode 100644
index 0000000..d2b9cfe
--- /dev/null
+++ b/src/exa_wm_src_sample_planar.g4b
@@ -0,0 +1,4 @@
+   { 0x00000201, 0x20080061, 0x00000000, 0x0000e000 },
+   { 0x01800031, 0x22001d29, 0x008d0000, 0x02520001 },
+   { 0x01800031, 0x21c01d29, 0x008d0000, 0x02520102 },
+   { 0x01800031, 0x22401d29, 0x008d0000, 0x02520203 },
diff --git a/src/i830_video.c b/src/i830_video.c
index 14dab8f..4e1f725 100644
--- a/src/i830_video.c
+++ b/src/i830_video.c
@@ -1350,65 +1350,6 @@ I830CopyPackedData(ScrnInfoPtr pScrn, I830PortPrivPtr pPriv,
     }
 }
 
-/* Copies planar data in *buf to UYVY-packed data in the screen atYBufXOffset.
- */
-static void
-I830CopyPlanarToPackedData(ScrnInfoPtr pScrn, I830PortPrivPtr pPriv,
-			   unsigned char *buf, int srcPitch,
-			   int srcPitch2, int dstPitch, int srcH,
-			   int top, int left, int h, int w, int id)
-{
-    I830Ptr pI830 = I830PTR(pScrn);
-    uint8_t *dst1, *srcy, *srcu, *srcv;
-    int y;
-
-    if (pPriv->currentBuf == 0)
-	dst1 = pI830->FbBase + pPriv->YBuf0offset;
-    else
-	dst1 = pI830->FbBase + pPriv->YBuf1offset;
-
-    srcy = buf + (top * srcPitch) + left;
-    if (id == FOURCC_YV12) {
-	srcu = buf + (srcH * srcPitch) + ((top / 2) * srcPitch2) + (left / 2);
-	srcv = buf + (srcH * srcPitch) + ((srcH / 2) * srcPitch2) +
-	((top / 2) * srcPitch2) + (left / 2);
-    } else {
-	srcv = buf + (srcH * srcPitch) + ((top / 2) * srcPitch2) + (left / 2);
-	srcu = buf + (srcH * srcPitch) + ((srcH / 2) * srcPitch2) +
-	((top / 2) * srcPitch2) + (left / 2);
-    }
-
-    for (y = 0; y < h; y++) {
-	uint32_t *dst = (uint32_t *)dst1;
-	uint8_t *sy = srcy;
-	uint8_t *su = srcu;
-	uint8_t *sv = srcv;
-	int i;
-
-	i = w / 2;
-	while(i > 4) {
-	    dst[0] = sy[0] | (sy[1] << 16) | (sv[0] << 8) | (su[0] << 24);
-	    dst[1] = sy[2] | (sy[3] << 16) | (sv[1] << 8) | (su[1] << 24);
-	    dst[2] = sy[4] | (sy[5] << 16) | (sv[2] << 8) | (su[2] << 24);
-	    dst[3] = sy[6] | (sy[7] << 16) | (sv[3] << 8) | (su[3] << 24);
-	    dst += 4; su += 4; sv += 4; sy += 8;
-	    i -= 4;
-	}
-	while(i--) {
-	    dst[0] = sy[0] | (sy[1] << 16) | (sv[0] << 8) | (su[0] << 24);
-	    dst++; su++; sv++;
-	    sy += 2;
-	}
-
-	dst1 += dstPitch;
-	srcy += srcPitch;
-	if (y & 1) {
-	    srcu += srcPitch2;
-	    srcv += srcPitch2;
-	}	
-    }
-}
-
 static void
 I830CopyPlanarData(ScrnInfoPtr pScrn, I830PortPrivPtr pPriv,
 		   unsigned char *buf, int srcPitch,
@@ -2339,8 +2280,6 @@ I830PutImage(ScrnInfoPtr pScrn,
             srcPitch2 = ((width >> 1) + 0x3ff) & ~0x3ff;
         }
 #endif
-	if (pPriv->textured && IS_I965G(pI830))
-	    destId = FOURCC_YUY2;
 	break;
     case FOURCC_UYVY:
     case FOURCC_YUY2:
@@ -2460,14 +2399,8 @@ I830PutImage(ScrnInfoPtr pScrn,
     case FOURCC_I420:
 	top &= ~1;
 	nlines = ((((y2 + 0xffff) >> 16) + 1) & ~1) - top;
-	if (pPriv->textured && IS_I965G(pI830)) {
-	    I830CopyPlanarToPackedData(pScrn, pPriv, buf, srcPitch, srcPitch2,
-				       dstPitch, height, top, left, nlines,
-				       npixels, id);
-	} else {
-	    I830CopyPlanarData(pScrn, pPriv, buf, srcPitch, srcPitch2, dstPitch,
-			       height, top, left, nlines, npixels, id);
-	}
+	I830CopyPlanarData(pScrn, pPriv, buf, srcPitch, srcPitch2, dstPitch,
+	    	       height, top, left, nlines, npixels, id);
 	break;
     case FOURCC_UYVY:
     case FOURCC_YUY2:
diff --git a/src/i965_video.c b/src/i965_video.c
index 68337e7..63f1192 100644
--- a/src/i965_video.c
+++ b/src/i965_video.c
@@ -93,7 +93,7 @@ static const uint32_t sf_kernel_static[][4] = {
 
 #define BRW_GRF_BLOCKS(nreg)	((nreg + 15) / 16 - 1)
 
-static const uint32_t ps_kernel_static[][4] = {
+static const uint32_t ps_kernel_packed_static[][4] = {
 #include "exa_wm_xy.g4b"
 #include "exa_wm_src_affine.g4b"
 #include "exa_wm_src_sample_argb.g4b"
@@ -101,11 +101,17 @@ static const uint32_t ps_kernel_static[][4] = {
 #include "exa_wm_write.g4b"
 };
 
+static const uint32_t ps_kernel_planar_static[][4] = {
+#include "exa_wm_xy.g4b"
+#include "exa_wm_src_affine.g4b"
+#include "exa_wm_src_sample_planar.g4b"
+#include "exa_wm_yuv_rgb.g4b"
+#include "exa_wm_write.g4b"
+};
+
 #define ALIGN(i,m)    (((i) + (m) - 1) & ~((m) - 1))
 #define MIN(a,b) ((a) < (b) ? (a) : (b))
 
-#define WM_BINDING_TABLE_ENTRIES    2
-
 static uint32_t float_to_uint (float f) {
     union {uint32_t i; float f;} x;
     x.f = f;
@@ -165,8 +171,8 @@ I965DisplayVideoTextured(ScrnInfoPtr pScrn, I830PortPrivPtr pPriv, int id,
     int urb_sf_start, urb_sf_size;
     int urb_cs_start, urb_cs_size;
     struct brw_surface_state *dest_surf_state;
-    struct brw_surface_state *src_surf_state;
-    struct brw_sampler_state *src_sampler_state;
+    struct brw_surface_state *src_surf_state[3];
+    struct brw_sampler_state *src_sampler_state[3];
     struct brw_vs_unit_state *vs_state;
     struct brw_sf_unit_state *sf_state;
     struct brw_wm_unit_state *wm_state;
@@ -179,7 +185,7 @@ I965DisplayVideoTextured(ScrnInfoPtr pScrn, I830PortPrivPtr pPriv, int id,
     float src_scale_x, src_scale_y;
     uint32_t *binding_table;
     Bool first_output = TRUE;
-    int dest_surf_offset, src_surf_offset, src_sampler_offset, vs_offset;
+    int dest_surf_offset, src_surf_offset[3], src_sampler_offset[3], vs_offset;
     int sf_offset, wm_offset, cc_offset, vb_offset, cc_viewport_offset;
     int wm_scratch_offset;
     int sf_kernel_offset, ps_kernel_offset, sip_kernel_offset;
@@ -188,6 +194,16 @@ I965DisplayVideoTextured(ScrnInfoPtr pScrn, I830PortPrivPtr pPriv, int id,
     int vb_size = (4 * 4) * 4; /* 4 DWORDS per vertex */
     char *state_base;
     int state_base_offset;
+    int src_surf;
+    int n_src_surf;
+    uint32_t	src_surf_format;
+    uint32_t	src_surf_base[3];
+    int		src_width[3];
+    int		src_height[3];
+    int		src_pitch[3];
+    int wm_binding_table_entries;
+    const uint32_t	*ps_kernel_static;
+    int		ps_kernel_static_size;
 
 #if 0
     ErrorF("BroadwaterDisplayVideoTextured: %dx%d (pitch %d)\n", width, height,
@@ -202,7 +218,50 @@ I965DisplayVideoTextured(ScrnInfoPtr pScrn, I830PortPrivPtr pPriv, int id,
     ErrorF ("INST_PM 0x%08x\n", INREG(INST_PM));
 #endif
 
-    assert((id == FOURCC_UYVY) || (id == FOURCC_YUY2));
+    src_surf_base[0] = pPriv->YBuf0offset;
+    src_surf_base[1] = pPriv->VBuf0offset;
+    src_surf_base[2] = pPriv->UBuf0offset;
+#if 0
+    ErrorF ("base 0 0x%x base 1 0x%x base 2 0x%x\n",
+	    src_surf_base[0], src_surf_base[1], src_surf_base[2]);
+#endif
+    
+    switch (id) {
+    case FOURCC_UYVY:
+	src_surf_format = BRW_SURFACEFORMAT_YCRCB_SWAPY;
+	n_src_surf = 1;
+	ps_kernel_static = &ps_kernel_packed_static[0][0];
+	ps_kernel_static_size = sizeof (ps_kernel_packed_static);
+	src_width[0] = width;
+	src_height[0] = height;
+	src_pitch[0] = video_pitch;
+	break;
+    case FOURCC_YUY2:
+	src_surf_format = BRW_SURFACEFORMAT_YCRCB_NORMAL;
+	ps_kernel_static = &ps_kernel_packed_static[0][0];
+	ps_kernel_static_size = sizeof (ps_kernel_packed_static);
+	src_width[0] = width;
+	src_height[0] = height;
+	src_pitch[0] = video_pitch;
+	n_src_surf = 1;
+	break;
+    case FOURCC_I420:
+    case FOURCC_YV12:
+	src_surf_format = BRW_SURFACEFORMAT_R8_UNORM;
+	ps_kernel_static = &ps_kernel_planar_static[0][0];
+	ps_kernel_static_size = sizeof (ps_kernel_planar_static);
+	src_width[0] = width;
+	src_height[0] = height;
+	src_pitch[0] = video_pitch * 2;
+	src_width[1] = src_width[2] = width / 2;
+	src_height[1] = src_height[2] = height / 2;
+	src_pitch[1] = src_pitch[2] = video_pitch;
+	n_src_surf = 3;
+	break;
+    default:
+	return;
+    }    
+    wm_binding_table_entries = 1 + n_src_surf;
 
     IntelEmitInvarientState(pScrn);
     *pI830->last_3d = LAST_3D_VIDEO;
@@ -224,15 +283,17 @@ I965DisplayVideoTextured(ScrnInfoPtr pScrn, I830PortPrivPtr pPriv, int id,
     sf_kernel_offset = ALIGN(next_offset, 64);
     next_offset = sf_kernel_offset + sizeof (sf_kernel_static);
     ps_kernel_offset = ALIGN(next_offset, 64);
-    next_offset = ps_kernel_offset + sizeof (ps_kernel_static);
+    next_offset = ps_kernel_offset + ps_kernel_static_size;
     sip_kernel_offset = ALIGN(next_offset, 64);
     next_offset = sip_kernel_offset + sizeof (sip_kernel_static);
     cc_viewport_offset = ALIGN(next_offset, 32);
     next_offset = cc_viewport_offset + sizeof(*cc_viewport);
 
-    src_sampler_offset = ALIGN(next_offset, 32);
-    next_offset = src_sampler_offset + sizeof(*src_sampler_state);
-
+    for (src_surf = 0; src_surf < n_src_surf; src_surf++) {    
+	src_sampler_offset[src_surf] = ALIGN(next_offset, 32);
+	next_offset = src_sampler_offset[src_surf] + sizeof(struct brw_sampler_state);
+    }
+    
     /* Align VB to native size of elements, for safety */
     vb_offset = ALIGN(next_offset, 8);
     next_offset = vb_offset + vb_size;
@@ -240,10 +301,14 @@ I965DisplayVideoTextured(ScrnInfoPtr pScrn, I830PortPrivPtr pPriv, int id,
     /* And then the general state: */
     dest_surf_offset = ALIGN(next_offset, 32);
     next_offset = dest_surf_offset + sizeof(*dest_surf_state);
-    src_surf_offset = ALIGN(next_offset, 32);
-    next_offset = src_surf_offset + sizeof(*src_surf_state);
+    
+    for (src_surf = 0; src_surf < n_src_surf; src_surf++) {
+	src_surf_offset[src_surf] = ALIGN(next_offset, 32);
+	next_offset = src_surf_offset[src_surf] + sizeof(struct brw_surface_state);
+    }
+    
     binding_table_offset = ALIGN(next_offset, 32);
-    next_offset = binding_table_offset + (WM_BINDING_TABLE_ENTRIES * 4);
+    next_offset = binding_table_offset + (wm_binding_table_entries * 4);
 
     /* Allocate an area in framebuffer for our state layout we just set up */
     total_state_size = next_offset;
@@ -270,8 +335,12 @@ I965DisplayVideoTextured(ScrnInfoPtr pScrn, I830PortPrivPtr pPriv, int id,
 
     cc_viewport = (void *)(state_base + cc_viewport_offset);
     dest_surf_state = (void *)(state_base + dest_surf_offset);
-    src_surf_state = (void *)(state_base + src_surf_offset);
-    src_sampler_state = (void *)(state_base + src_sampler_offset);
+    
+    for (src_surf = 0; src_surf < n_src_surf; src_surf++) 
+    {
+	src_surf_state[src_surf] = (void *)(state_base + src_surf_offset[src_surf]);
+	src_sampler_state[src_surf] = (void *)(state_base + src_sampler_offset[src_surf]);
+    }
     binding_table = (void *)(state_base + binding_table_offset);
     vb = (void *)(state_base + vb_offset);
 
@@ -384,50 +453,49 @@ I965DisplayVideoTextured(ScrnInfoPtr pScrn, I830PortPrivPtr pPriv, int id,
     dest_surf_state->ss3.tiled_surface = i830_pixmap_tiled(pPixmap);
     dest_surf_state->ss3.tile_walk = 0; /* TileX */
 
-    /* Set up the source surface state buffer */
-    memset(src_surf_state, 0, sizeof(*src_surf_state));
-    src_surf_state->ss0.surface_type = BRW_SURFACE_2D;
-    /* src_surf_state->ss0.data_return_format =
-       BRW_SURFACERETURNFORMAT_FLOAT32; */
-    switch (id) {
-    case FOURCC_YUY2:
-	src_surf_state->ss0.surface_format = BRW_SURFACEFORMAT_YCRCB_NORMAL;
-	break;
-    case FOURCC_UYVY:
-	src_surf_state->ss0.surface_format = BRW_SURFACEFORMAT_YCRCB_SWAPY;
-	break;
+    for (src_surf = 0; src_surf < n_src_surf; src_surf++)
+    {
+	/* Set up the source surface state buffer */
+	memset(src_surf_state[src_surf], 0, sizeof(struct brw_surface_state));
+	src_surf_state[src_surf]->ss0.surface_type = BRW_SURFACE_2D;
+	src_surf_state[src_surf]->ss0.surface_format = src_surf_format;
+	src_surf_state[src_surf]->ss0.writedisable_alpha = 0;
+	src_surf_state[src_surf]->ss0.writedisable_red = 0;
+	src_surf_state[src_surf]->ss0.writedisable_green = 0;
+	src_surf_state[src_surf]->ss0.writedisable_blue = 0;
+	src_surf_state[src_surf]->ss0.color_blend = 1;
+	src_surf_state[src_surf]->ss0.vert_line_stride = 0;
+	src_surf_state[src_surf]->ss0.vert_line_stride_ofs = 0;
+	src_surf_state[src_surf]->ss0.mipmap_layout_mode = 0;
+	src_surf_state[src_surf]->ss0.render_cache_read_mode = 0;
+    
+	src_surf_state[src_surf]->ss1.base_addr = src_surf_base[src_surf];
+	src_surf_state[src_surf]->ss2.width = src_width[src_surf] - 1;
+	src_surf_state[src_surf]->ss2.height = src_height[src_surf] - 1;
+	src_surf_state[src_surf]->ss2.mip_count = 0;
+	src_surf_state[src_surf]->ss2.render_target_rotation = 0;
+	src_surf_state[src_surf]->ss3.pitch = src_pitch[src_surf] - 1;
     }
-    src_surf_state->ss0.writedisable_alpha = 0;
-    src_surf_state->ss0.writedisable_red = 0;
-    src_surf_state->ss0.writedisable_green = 0;
-    src_surf_state->ss0.writedisable_blue = 0;
-    src_surf_state->ss0.color_blend = 1;
-    src_surf_state->ss0.vert_line_stride = 0;
-    src_surf_state->ss0.vert_line_stride_ofs = 0;
-    src_surf_state->ss0.mipmap_layout_mode = 0;
-    src_surf_state->ss0.render_cache_read_mode = 0;
-
-    src_surf_state->ss1.base_addr = pPriv->YBuf0offset;
-    src_surf_state->ss2.width = width - 1;
-    src_surf_state->ss2.height = height - 1;
-    src_surf_state->ss2.mip_count = 0;
-    src_surf_state->ss2.render_target_rotation = 0;
-    src_surf_state->ss3.pitch = video_pitch - 1;
     /* FIXME: account for tiling if we ever do it */
 
     /* Set up a binding table for our two surfaces.  Only the PS will use it */
     /* XXX: are these offset from the right place? */
     binding_table[0] = state_base_offset + dest_surf_offset;
-    binding_table[1] = state_base_offset + src_surf_offset;
+    
+    for (src_surf = 0; src_surf < n_src_surf; src_surf++)
+	binding_table[1 + src_surf] = state_base_offset + src_surf_offset[src_surf];
 
     /* Set up the packed YUV source sampler.  Doesn't do colorspace conversion.
      */
-    memset(src_sampler_state, 0, sizeof(*src_sampler_state));
-    src_sampler_state->ss0.min_filter = BRW_MAPFILTER_LINEAR;
-    src_sampler_state->ss0.mag_filter = BRW_MAPFILTER_LINEAR;
-    src_sampler_state->ss1.r_wrap_mode = BRW_TEXCOORDMODE_CLAMP;
-    src_sampler_state->ss1.s_wrap_mode = BRW_TEXCOORDMODE_CLAMP;
-    src_sampler_state->ss1.t_wrap_mode = BRW_TEXCOORDMODE_CLAMP;
+    for (src_surf = 0; src_surf < n_src_surf; src_surf++)
+    {
+	memset(src_sampler_state[src_surf], 0, sizeof(struct brw_sampler_state));
+	src_sampler_state[src_surf]->ss0.min_filter = BRW_MAPFILTER_LINEAR;
+	src_sampler_state[src_surf]->ss0.mag_filter = BRW_MAPFILTER_LINEAR;
+	src_sampler_state[src_surf]->ss1.r_wrap_mode = BRW_TEXCOORDMODE_CLAMP;
+	src_sampler_state[src_surf]->ss1.s_wrap_mode = BRW_TEXCOORDMODE_CLAMP;
+	src_sampler_state[src_surf]->ss1.t_wrap_mode = BRW_TEXCOORDMODE_CLAMP;
+    }
 
     /* Set up the vertex shader to be disabled (passthrough) */
     memset(vs_state, 0, sizeof(*vs_state));
@@ -472,13 +540,13 @@ I965DisplayVideoTextured(ScrnInfoPtr pScrn, I830PortPrivPtr pPriv, int id,
     sf_state->sf6.dest_org_vbias = 0x8;
     sf_state->sf6.dest_org_hbias = 0x8;
 
-    memcpy (ps_kernel, ps_kernel_static, sizeof (ps_kernel_static));
+    memcpy (ps_kernel, ps_kernel_static, ps_kernel_static_size);
     memset (wm_state, 0, sizeof (*wm_state));
     wm_state->thread0.kernel_start_pointer =
 	(state_base_offset + ps_kernel_offset) >> 6;
     wm_state->thread0.grf_reg_count = BRW_GRF_BLOCKS(PS_KERNEL_NUM_GRF);
     wm_state->thread1.single_program_flow = 1; /* XXX */
-    wm_state->thread1.binding_table_entry_count = 2;
+    wm_state->thread1.binding_table_entry_count = 1 + n_src_surf;
     /* Though we never use the scratch space in our WM kernel, it has to be
      * set, and the minimum allocation is 1024 bytes.
      */
@@ -492,7 +560,7 @@ I965DisplayVideoTextured(ScrnInfoPtr pScrn, I830PortPrivPtr pPriv, int id,
     wm_state->thread3.urb_entry_read_offset = 0; /* XXX */
     wm_state->wm4.stats_enable = 1;
     wm_state->wm4.sampler_state_pointer = (state_base_offset +
-					   src_sampler_offset) >> 5;
+					   src_sampler_offset[0]) >> 5;
     wm_state->wm4.sampler_count = 1; /* 1-4 samplers used */
     wm_state->wm5.max_threads = PS_MAX_THREADS - 1;
     wm_state->wm5.thread_dispatch_enable = 1;
commit a03eaaa67b33c57530e92c53d28917e2563b4427
Author: Keith Packard <keithp at keithp.com>
Date:   Tue Apr 8 16:24:55 2008 -0500

    Use shared exa_wm code for packed yuv decode
    
    Eliminate special video sf and ps programs.

diff --git a/src/Makefile.am b/src/Makefile.am
index 9b5d653..f50d1d4 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -135,6 +135,7 @@ INTEL_G4A =				\
 	exa_wm_src_projective.g4a 	\
 	exa_wm_src_sample_argb.g4a 	\
 	exa_wm_src_sample_a.g4a 	\
+	exa_wm_src_data.g4a		\
 	exa_wm_mask_affine.g4a 		\
 	exa_wm_mask_projective.g4a 	\
 	exa_wm_mask_sample_argb.g4a 	\
@@ -160,6 +161,7 @@ INTEL_G4B = 				\
 	exa_wm_src_projective.g4b 	\
 	exa_wm_src_sample_argb.g4b 	\
 	exa_wm_src_sample_a.g4b 	\
+	exa_wm_src_data.g4b		\
 	exa_wm_mask_affine.g4b 		\
 	exa_wm_mask_projective.g4b 	\
 	exa_wm_mask_sample_argb.g4b 	\
diff --git a/src/exa_wm_yuv_rgb.g4a b/src/exa_wm_yuv_rgb.g4a
new file mode 100644
index 0000000..0c7525d
--- /dev/null
+++ b/src/exa_wm_yuv_rgb.g4a
@@ -0,0 +1,102 @@
+/*
+ * Copyright © 2006 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Keith Packard <keithp at keithp.com>
+ *    Eric Anholt <eric at anholt.net>
+ *
+ */
+
+include(`exa_wm.g4i')
+
+define(`YCbCr_base',	`src_sample_base')
+
+define(`Cr',		`src_sample_r')
+define(`Cr_01',		`src_sample_r_01')
+define(`Cr_23',		`src_sample_r_23')
+
+define(`Y',		`src_sample_g')
+define(`Y_01',		`src_sample_g_01')
+define(`Y_23',		`src_sample_g_23')
+
+define(`Cb',		`src_sample_b')
+define(`Cb_01',		`src_sample_b_01')
+define(`Cb_23',		`src_sample_b_23')
+
+    /* color space conversion function:
+     * R = Clamp ( 1.164(Y-16/255) + 1.596(Cr-128/255), 0, 1)
+     * G = Clamp ( 1.164(Y-16/255) - 0.813(Cr-128/255) - 0.392(Cb-128/255), 0, 1)
+     * B = Clamp ( 1.164(Y-16/255) + 2.017(Cb-128/255), 0, 1)
+     *
+     * Y is g16, g17.
+     * Cr is g14, g15.
+     * Cb is g18, g19.
+     *
+     * R is m2, m6.
+     * G is m3, m7.
+     * B is m4, m8.
+     */
+
+    /* Normalize Y, Cb and Cr:
+     *
+     * Y = (Y - 16/255) * 1.164
+     * Cr = Cr - 128 / 255
+     * Cb = Cb - 128 / 255
+     */
+add (16)    Y<1>F		Y<8,8,1>F	-0.0627451F { compr align1 };
+mul (16)    Y<1>F		Y<8,8,1>F	1.164F	    { compr align1 };
+
+add (16)    Cr<1>F		Cr<8,8,1>F	-0.501961F  { compr align1 };
+
+add (16)    Cb<1>F		Cb<8,8,1>F	-0.501961F  { compr align1 };
+
+    /* 
+     * R = Y + Cr * 1.596
+     */
+mul (8)	    null		Cr_01<8,8,1>F	1.596F	    { align1 };
+mac.sat (8) data_port_r_01<1>F	Y_01<8,8,1>F	1F	    { align1  };
+mul (8)     null		Cr_23<8,8,1>F	1.596F	    { align1 };
+mac.sat (8) data_port_r_23<1>F	Y_23<8,8,1>F	1F	    { align1  };
+     
+    /*
+     * G = Cr * -0.813 + Cb * -0.392 + Y
+     */
+mul (8)	    null		Cr_01<8,8,1>F	-0.813F	    { align1 };
+mac (8)	    null		Cb_01<8,8,1>F	-0.392F	    { align1 };
+mac.sat (8) data_port_g_01<1>F	Y_01<8,8,1>F	1F	    { align1 };
+mul (8)	    null		Cr_23<8,8,1>F	-0.813F	    { align1 };
+mac (8)	    null		Cb_23<8,8,1>F	-0.392F	    { align1 };
+mac.sat (8) data_port_g_23<1>F	Y_23<8,8,1>F	1F	    { align1 };
+
+    /*
+     * B = Cb * 2.017 + Y
+     */
+mul (8)	    null		Cb_01<8,8,1>F	2.017F	    { align1 };
+mac.sat (8) data_port_b_01<1>F	Y_01<8,8,1>F	1F	    { align1 };
+mul (8)	    null		Cb_23<8,8,1>F	2.017F	    { align1 };
+mac.sat (8) data_port_b_23<1>F	Y_23<8,8,1>F	1F	    { align1 };
+
+    /*
+     * A = 1.0
+     */
+mov (8)	    data_port_a_01<1>F	1.0F			    { align1 };
+mov (8)	    data_port_a_23<1>F	1.0F			    { align1 };
diff --git a/src/exa_wm_yuv_rgb.g4b b/src/exa_wm_yuv_rgb.g4b
new file mode 100644
index 0000000..017186a
--- /dev/null
+++ b/src/exa_wm_yuv_rgb.g4b
@@ -0,0 +1,20 @@
+   { 0x00802040, 0x22007fbd, 0x008d0200, 0xbd808081 },
+   { 0x00802041, 0x22007fbd, 0x008d0200, 0x3f94fdf4 },
+   { 0x00802040, 0x21c07fbd, 0x008d01c0, 0xbf008084 },
+   { 0x00802040, 0x22407fbd, 0x008d0240, 0xbf008084 },
+   { 0x00600041, 0x20007fbc, 0x008d01c0, 0x3fcc49ba },
+   { 0x80600048, 0x20407fbe, 0x008d0200, 0x3f800000 },
+   { 0x00600041, 0x20007fbc, 0x008d01e0, 0x3fcc49ba },
+   { 0x80600048, 0x20c07fbe, 0x008d0220, 0x3f800000 },
+   { 0x00600041, 0x20007fbc, 0x008d01c0, 0xbf5020c5 },
+   { 0x00600048, 0x20007fbc, 0x008d0240, 0xbec8b439 },
+   { 0x80600048, 0x20607fbe, 0x008d0200, 0x3f800000 },
+   { 0x00600041, 0x20007fbc, 0x008d01e0, 0xbf5020c5 },
+   { 0x00600048, 0x20007fbc, 0x008d0260, 0xbec8b439 },
+   { 0x80600048, 0x20e07fbe, 0x008d0220, 0x3f800000 },
+   { 0x00600041, 0x20007fbc, 0x008d0240, 0x40011687 },
+   { 0x80600048, 0x20807fbe, 0x008d0200, 0x3f800000 },
+   { 0x00600041, 0x20007fbc, 0x008d0260, 0x40011687 },
+   { 0x80600048, 0x21007fbe, 0x008d0220, 0x3f800000 },
+   { 0x00600001, 0x20a003fe, 0x00000000, 0x3f800000 },
+   { 0x00600001, 0x212003fe, 0x00000000, 0x3f800000 },
diff --git a/src/i965_video.c b/src/i965_video.c
index 1d2c3f5..68337e7 100644
--- a/src/i965_video.c
+++ b/src/i965_video.c
@@ -78,7 +78,7 @@ static const uint32_t sip_kernel_static[][4] = {
 #define SF_MAX_THREADS	   1
 
 static const uint32_t sf_kernel_static[][4] = {
-#include "packed_yuv_sf.g4b"
+#include "exa_sf.g4b"
 };
 
 /*
@@ -94,7 +94,11 @@ static const uint32_t sf_kernel_static[][4] = {
 #define BRW_GRF_BLOCKS(nreg)	((nreg + 15) / 16 - 1)
 
 static const uint32_t ps_kernel_static[][4] = {
-#include "packed_yuv_wm.g4b"
+#include "exa_wm_xy.g4b"
+#include "exa_wm_src_affine.g4b"
+#include "exa_wm_src_sample_argb.g4b"
+#include "exa_wm_yuv_rgb.g4b"
+#include "exa_wm_write.g4b"
 };
 
 #define ALIGN(i,m)    (((i) + (m) - 1) & ~((m) - 1))
commit 32ef98518394d29cb87405005c660278489396bb
Author: Keith Packard <keithp at keithp.com>
Date:   Tue Apr 8 16:21:55 2008 -0500

    Compute pixel values directly into data port
    
    Instead of leaving pixel values in src_sample registers, compute the pixel
    values directly to the data port to save 8 moves. This cannot work when no
    computation is done, both because there is no way to wait for the sampler
    to finish and because the sampler returns data in a different order from
    that required by the data port (sigh).

diff --git a/src/exa_wm.g4i b/src/exa_wm.g4i
index ee8e3ad..a4b464b 100644
--- a/src/exa_wm.g4i
+++ b/src/exa_wm.g4i
@@ -103,12 +103,20 @@ define(`mask_w_1',  `src_w_1')
 
 /* sample src to these registers */
 define(`src_sample_base',	`g14')
+
+define(`src_sample_r',		`g14')
 define(`src_sample_r_01',	`g14')
 define(`src_sample_r_23',	`g15')
+
+define(`src_sample_g',		`g16')
 define(`src_sample_g_01',	`g16')
 define(`src_sample_g_23',	`g17')
+
+define(`src_sample_b',		`g18')
 define(`src_sample_b_01',	`g18')
 define(`src_sample_b_23',	`g19')
+
+define(`src_sample_a',		`g20')
 define(`src_sample_a_01',	`g20')
 define(`src_sample_a_23',	`g21')
 
diff --git a/src/exa_wm_ca.g4a b/src/exa_wm_ca.g4a
index 5d982b3..a8cb806 100644
--- a/src/exa_wm_ca.g4a
+++ b/src/exa_wm_ca.g4a
@@ -32,7 +32,14 @@
 include(`exa_wm.g4i')
 
 /* mul mask rgba channels to src */
-mul (16)    src_sample_r_01<1>F	src_sample_r_01<8,8,1>F	mask_sample_r_01<8,8,1>F { compr align1 };
-mul (16)    src_sample_g_01<1>F src_sample_g_01<8,8,1>F	mask_sample_g_01<8,8,1>F { compr align1 };
-mul (16)    src_sample_b_01<1>F src_sample_b_01<8,8,1>F	mask_sample_b_01<8,8,1>F { compr align1 };
-mul (16)    src_sample_a_01<1>F src_sample_a_01<8,8,1>F	mask_sample_a_01<8,8,1>F { compr align1 };
+mul (8)	    data_port_r_01<1>F	src_sample_r_01<8,8,1>F	mask_sample_r_01<8,8,1>F { align1 };
+mul (8)	    data_port_r_23<1>F	src_sample_r_23<8,8,1>F	mask_sample_r_23<8,8,1>F { align1 };
+
+mul (8)	    data_port_g_01<1>F src_sample_g_01<8,8,1>F	mask_sample_g_01<8,8,1>F { align1 };
+mul (8)	    data_port_g_23<1>F src_sample_g_23<8,8,1>F	mask_sample_g_23<8,8,1>F { align1 };
+
+mul (8)	    data_port_b_01<1>F src_sample_b_01<8,8,1>F	mask_sample_b_01<8,8,1>F { align1 };
+mul (8)	    data_port_b_23<1>F src_sample_b_23<8,8,1>F	mask_sample_b_23<8,8,1>F { align1 };
+
+mul (8)	    data_port_a_01<1>F src_sample_a_01<8,8,1>F	mask_sample_a_01<8,8,1>F { align1 };
+mul (8)	    data_port_a_23<1>F src_sample_a_23<8,8,1>F	mask_sample_a_23<8,8,1>F { align1 };
diff --git a/src/exa_wm_ca.g4b b/src/exa_wm_ca.g4b
index 372e8b2..ec33611 100644
--- a/src/exa_wm_ca.g4b
+++ b/src/exa_wm_ca.g4b
@@ -1,4 +1,8 @@
-   { 0x00802041, 0x21c077bd, 0x008d01c0, 0x008d02c0 },
-   { 0x00802041, 0x220077bd, 0x008d0200, 0x008d0300 },
-   { 0x00802041, 0x224077bd, 0x008d0240, 0x008d0340 },
-   { 0x00802041, 0x228077bd, 0x008d0280, 0x008d0380 },
+   { 0x00600041, 0x204077be, 0x008d01c0, 0x008d02c0 },
+   { 0x00600041, 0x20c077be, 0x008d01e0, 0x008d02e0 },
+   { 0x00600041, 0x206077be, 0x008d0200, 0x008d0300 },
+   { 0x00600041, 0x20e077be, 0x008d0220, 0x008d0320 },
+   { 0x00600041, 0x208077be, 0x008d0240, 0x008d0340 },
+   { 0x00600041, 0x210077be, 0x008d0260, 0x008d0360 },
+   { 0x00600041, 0x20a077be, 0x008d0280, 0x008d0380 },
+   { 0x00600041, 0x212077be, 0x008d02a0, 0x008d03a0 },
diff --git a/src/exa_wm_ca_srcalpha.g4a b/src/exa_wm_ca_srcalpha.g4a
index d1f847f..a5f029f 100644
--- a/src/exa_wm_ca_srcalpha.g4a
+++ b/src/exa_wm_ca_srcalpha.g4a
@@ -31,7 +31,14 @@
 
 include(`exa_wm.g4i')
 
-mul (16)    src_sample_r_01<1>F mask_sample_r_01<8,8,1>F src_sample_a_01<8,8,1>F { compr align1 };
-mul (16)    src_sample_g_01<1>F mask_sample_g_01<8,8,1>F src_sample_a_01<8,8,1>F { compr align1 };
-mul (16)    src_sample_b_01<1>F mask_sample_b_01<8,8,1>F src_sample_a_01<8,8,1>F { compr align1 };
-mul (16)    src_sample_a_01<1>F mask_sample_a_01<8,8,1>F src_sample_a_01<8,8,1>F { compr align1 };
+mul (8)     data_port_r_01<1>F mask_sample_r_01<8,8,1>F src_sample_a_01<8,8,1>F { align1 };
+mul (8)     data_port_r_23<1>F mask_sample_r_23<8,8,1>F src_sample_a_23<8,8,1>F { align1 };
+
+mul (8)     data_port_g_01<1>F mask_sample_g_01<8,8,1>F src_sample_a_01<8,8,1>F { align1 };
+mul (8)     data_port_g_23<1>F mask_sample_g_23<8,8,1>F src_sample_a_23<8,8,1>F { align1 };
+
+mul (8)     data_port_b_01<1>F mask_sample_b_01<8,8,1>F src_sample_a_01<8,8,1>F { align1 };
+mul (8)     data_port_b_23<1>F mask_sample_b_23<8,8,1>F src_sample_a_23<8,8,1>F { align1 };
+
+mul (8)     data_port_a_01<1>F mask_sample_a_01<8,8,1>F src_sample_a_01<8,8,1>F { align1 };
+mul (8)     data_port_a_23<1>F mask_sample_a_23<8,8,1>F src_sample_a_23<8,8,1>F { align1 };
diff --git a/src/exa_wm_ca_srcalpha.g4b b/src/exa_wm_ca_srcalpha.g4b
index 963d676..6ea89b8 100644
--- a/src/exa_wm_ca_srcalpha.g4b
+++ b/src/exa_wm_ca_srcalpha.g4b
@@ -1,4 +1,8 @@
-   { 0x00802041, 0x21c077bd, 0x008d02c0, 0x008d0280 },
-   { 0x00802041, 0x220077bd, 0x008d0300, 0x008d0280 },
-   { 0x00802041, 0x224077bd, 0x008d0340, 0x008d0280 },
-   { 0x00802041, 0x228077bd, 0x008d0380, 0x008d0280 },
+   { 0x00600041, 0x204077be, 0x008d02c0, 0x008d0280 },
+   { 0x00600041, 0x20c077be, 0x008d02e0, 0x008d02a0 },
+   { 0x00600041, 0x206077be, 0x008d0300, 0x008d0280 },
+   { 0x00600041, 0x20e077be, 0x008d0320, 0x008d02a0 },
+   { 0x00600041, 0x208077be, 0x008d0340, 0x008d0280 },
+   { 0x00600041, 0x210077be, 0x008d0360, 0x008d02a0 },
+   { 0x00600041, 0x20a077be, 0x008d0380, 0x008d0280 },
+   { 0x00600041, 0x212077be, 0x008d03a0, 0x008d02a0 },
diff --git a/src/exa_wm_noca.g4a b/src/exa_wm_noca.g4a
index d0d60fa..f43c6f4 100644
--- a/src/exa_wm_noca.g4a
+++ b/src/exa_wm_noca.g4a
@@ -32,7 +32,14 @@
 include(`exa_wm.g4i')
 /* mul mask's alpha channel to src */
 
-mul (16)    src_sample_r_01<1>F	src_sample_r_01<8,8,1>F	mask_sample_a_01<8,8,1>F { compr align1 };
-mul (16)    src_sample_g_01<1>F src_sample_g_01<8,8,1>F	mask_sample_a_01<8,8,1>F { compr align1 };
-mul (16)    src_sample_b_01<1>F src_sample_b_01<8,8,1>F	mask_sample_a_01<8,8,1>F { compr align1 };
-mul (16)    src_sample_a_01<1>F src_sample_a_01<8,8,1>F	mask_sample_a_01<8,8,1>F { compr align1 };
+mul (8)    data_port_r_01<1>F	src_sample_r_01<8,8,1>F	mask_sample_a_01<8,8,1>F { align1 };
+mul (8)    data_port_r_23<1>F	src_sample_r_23<8,8,1>F	mask_sample_a_23<8,8,1>F { align1 };
+
+mul (8)    data_port_g_01<1>F	src_sample_g_01<8,8,1>F	mask_sample_a_01<8,8,1>F { align1 };
+mul (8)    data_port_g_23<1>F	src_sample_g_23<8,8,1>F	mask_sample_a_23<8,8,1>F { align1 };
+
+mul (8)    data_port_b_01<1>F	src_sample_b_01<8,8,1>F	mask_sample_a_01<8,8,1>F { align1 };
+mul (8)    data_port_b_23<1>F	src_sample_b_23<8,8,1>F	mask_sample_a_23<8,8,1>F { align1 };
+
+mul (8)    data_port_a_01<1>F	src_sample_a_01<8,8,1>F	mask_sample_a_01<8,8,1>F { align1 };
+mul (8)    data_port_a_23<1>F	src_sample_a_23<8,8,1>F	mask_sample_a_23<8,8,1>F { align1 };
diff --git a/src/exa_wm_noca.g4b b/src/exa_wm_noca.g4b
index 1506334..2f5940a 100644
--- a/src/exa_wm_noca.g4b
+++ b/src/exa_wm_noca.g4b
@@ -1,4 +1,8 @@
-   { 0x00802041, 0x21c077bd, 0x008d01c0, 0x008d0380 },
-   { 0x00802041, 0x220077bd, 0x008d0200, 0x008d0380 },
-   { 0x00802041, 0x224077bd, 0x008d0240, 0x008d0380 },
-   { 0x00802041, 0x228077bd, 0x008d0280, 0x008d0380 },
+   { 0x00600041, 0x204077be, 0x008d01c0, 0x008d0380 },
+   { 0x00600041, 0x20c077be, 0x008d01e0, 0x008d03a0 },
+   { 0x00600041, 0x206077be, 0x008d0200, 0x008d0380 },
+   { 0x00600041, 0x20e077be, 0x008d0220, 0x008d03a0 },
+   { 0x00600041, 0x208077be, 0x008d0240, 0x008d0380 },
+   { 0x00600041, 0x210077be, 0x008d0260, 0x008d03a0 },
+   { 0x00600041, 0x20a077be, 0x008d0280, 0x008d0380 },
+   { 0x00600041, 0x212077be, 0x008d02a0, 0x008d03a0 },
diff --git a/src/exa_wm_src_data.g4a b/src/exa_wm_src_data.g4a
new file mode 100644
index 0000000..9c3daf0
--- /dev/null
+++ b/src/exa_wm_src_data.g4a
@@ -0,0 +1,46 @@
+/*
+ * Copyright © 2006 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Wang Zhenyu <zhenyu.z.wang at intel.com>
+ *    Keith Packard <keithp at keithp.com>
+ */
+
+include(`exa_wm.g4i')
+
+/*
+ * Prepare data in m2-m5 for subspan(1,0), m6-m9 for subspan(3,2),
+ *
+ * Note that the SIMD16 write message takes data for the first
+ * two sub-spans followed by the data for the second two sub-spans
+ * instead of having the two sub-spans interleaved by channel. Weird.
+ */
+
+mov (8) data_port_r_01<1>F	src_sample_r_01<8,8,1>F { align1 };
+mov (8) data_port_g_01<1>F	src_sample_g_01<8,8,1>F { align1 };
+mov (8) data_port_b_01<1>F	src_sample_b_01<8,8,1>F { align1 };
+mov (8) data_port_a_01<1>F	src_sample_a_01<8,8,1>F { align1 };
+
+mov (8) data_port_r_23<1>F	src_sample_r_23<8,8,1>F { align1 };
+mov (8) data_port_g_23<1>F	src_sample_g_23<8,8,1>F { align1 };
+mov (8) data_port_b_23<1>F	src_sample_b_23<8,8,1>F { align1 };
+mov (8) data_port_a_23<1>F 	src_sample_a_23<8,8,1>F { align1 };
diff --git a/src/exa_wm_src_data.g4b b/src/exa_wm_src_data.g4b
new file mode 100644
index 0000000..8b53580
--- /dev/null
+++ b/src/exa_wm_src_data.g4b
@@ -0,0 +1,8 @@
+   { 0x00600001, 0x204003be, 0x008d01c0, 0x00000000 },
+   { 0x00600001, 0x206003be, 0x008d0200, 0x00000000 },
+   { 0x00600001, 0x208003be, 0x008d0240, 0x00000000 },
+   { 0x00600001, 0x20a003be, 0x008d0280, 0x00000000 },
+   { 0x00600001, 0x20c003be, 0x008d01e0, 0x00000000 },
+   { 0x00600001, 0x20e003be, 0x008d0220, 0x00000000 },
+   { 0x00600001, 0x210003be, 0x008d0260, 0x00000000 },
+   { 0x00600001, 0x212003be, 0x008d02a0, 0x00000000 },
diff --git a/src/exa_wm_write.g4a b/src/exa_wm_write.g4a
index b16e649..c46023e 100644
--- a/src/exa_wm_write.g4a
+++ b/src/exa_wm_write.g4a
@@ -27,24 +27,6 @@
 
 include(`exa_wm.g4i')
 
-/*
- * Prepare data in m2-m5 for subspan(1,0), m6-m9 for subspan(3,2),
- *
- * Note that the SIMD16 write message takes data for the first
- * two sub-spans followed by the data for the second two sub-spans
- * instead of having the two sub-spans interleaved by channel. Weird.
- */
-
-mov (8) data_port_r_01<1>F	src_sample_r_01<8,8,1>F { align1 };
-mov (8) data_port_g_01<1>F	src_sample_g_01<8,8,1>F { align1 };
-mov (8) data_port_b_01<1>F	src_sample_b_01<8,8,1>F { align1 };
-mov (8) data_port_a_01<1>F	src_sample_a_01<8,8,1>F { align1 };
-
-mov (8) data_port_r_23<1>F	src_sample_r_23<8,8,1>F { align1 };
-mov (8) data_port_g_23<1>F	src_sample_g_23<8,8,1>F { align1 };
-mov (8) data_port_b_23<1>F	src_sample_b_23<8,8,1>F { align1 };
-mov (8) data_port_a_23<1>F 	src_sample_a_23<8,8,1>F { align1 };
-
 /* m0, m1 are all direct passed by PS thread payload */
 mov (8) data_port_msg_1<1>UD	g1<8,8,1>UD		{ align1 };
 
diff --git a/src/exa_wm_write.g4b b/src/exa_wm_write.g4b
index 785fe32..9402d11 100644
--- a/src/exa_wm_write.g4b
+++ b/src/exa_wm_write.g4b
@@ -1,11 +1,3 @@
-   { 0x00600001, 0x204003be, 0x008d01c0, 0x00000000 },
-   { 0x00600001, 0x206003be, 0x008d0200, 0x00000000 },
-   { 0x00600001, 0x208003be, 0x008d0240, 0x00000000 },
-   { 0x00600001, 0x20a003be, 0x008d0280, 0x00000000 },
-   { 0x00600001, 0x20c003be, 0x008d01e0, 0x00000000 },
-   { 0x00600001, 0x20e003be, 0x008d0220, 0x00000000 },
-   { 0x00600001, 0x210003be, 0x008d0260, 0x00000000 },
-   { 0x00600001, 0x212003be, 0x008d02a0, 0x00000000 },
    { 0x00600001, 0x20200022, 0x008d0020, 0x00000000 },
    { 0x00800031, 0x24001d28, 0x008d0000, 0x85a04800 },
    { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
diff --git a/src/i965_render.c b/src/i965_render.c
index 1b4afcc..79db41c 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -340,6 +340,7 @@ static const uint32_t ps_kernel_nomask_affine_static [][4] = {
 #include "exa_wm_xy.g4b"
 #include "exa_wm_src_affine.g4b"
 #include "exa_wm_src_sample_argb.g4b"
+#include "exa_wm_src_data.g4b"
 #include "exa_wm_write.g4b"
 };
 
@@ -347,6 +348,7 @@ static const uint32_t ps_kernel_nomask_projective_static [][4] = {
 #include "exa_wm_xy.g4b"
 #include "exa_wm_src_projective.g4b"
 #include "exa_wm_src_sample_argb.g4b"
+#include "exa_wm_src_data.g4b"
 #include "exa_wm_write.g4b"
 };
 
commit b68d9f4245d0ebe3371c179401ff145f1a4d101b
Author: Keith Packard <keithp at keithp.com>
Date:   Tue Apr 8 16:09:00 2008 -0500

    Use symbolic names for channels in YUV code

diff --git a/src/packed_yuv_wm.g4a b/src/packed_yuv_wm.g4a
index 9e635ba..2be52b5 100644
--- a/src/packed_yuv_wm.g4a
+++ b/src/packed_yuv_wm.g4a
@@ -26,6 +26,19 @@
  *
  */
 
+include(`exa_wm.g4i')
+
+define(`YCbCr_base',	`g12')
+define(`Cr',		`g12')
+define(`Cr_01',		`g12')
+define(`Cr_23',		`g13')
+define(`Y',		`g14')
+define(`Y_01',		`g14')
+define(`Y_23',		`g15')
+define(`Cb',		`g16')
+define(`Cb_01',		`g16')
+define(`Cb_23',		`g17')
+
 /* The initial payload of the thread is always g0.
  * WM_URB (incoming URB entries) is g3
  * X0_R is g4
@@ -117,8 +130,12 @@ mov (8) m4<1>F g7<8,8,1>F { align1 };
      * g0 holds the PS thread payload, which (oddly) contains
      * precisely what the sampler wants to see in m0
      */
-send  (16) 0 g12<1>UW g0<8,8,1>UW sampler (1,0,F) mlen 5 rlen 8 { align1 };
-mov (8) g19<1>UW g19<8,8,1>UW { align1 };
+send  (16)
+    0	/* load g0 to m0 */
+    YCbCr_base<1>UW
+    g0<8,8,1>UW 
+    sampler (1,0,F)
+    mlen 5 rlen 8 { align1 };
 
     /* color space conversion function:
      * R = Clamp ( 1.164(Y-16/255) + 1.596(Cr-128/255), 0, 1)
@@ -133,45 +150,60 @@ mov (8) g19<1>UW g19<8,8,1>UW { align1 };
      * G is g3, g7.
      * B is g4, g8.
      */
-	/* Y = Y - 16/255 */
-add (8) g14<1>F g14<8,8,1>F -0.0627451F { align1 };
-	/* Cr = Cr - 128/255 */
-add (8) g12<1>F g12<8,8,1>F -0.501961F { align1 };
-	/* Cb = Cb - 128 / 255 */
-add (8) g16<1>F g16<8,8,1>F -0.501961F { align1 };
-	/* Y = Y * 1.164 */
-mul (8) g14<1>F g14<8,8,1>F 1.164F { align1 };
-	/* acc = 1.596 * Cr */
-mul (8) null g12<8,8,1>F 1.596F { align1 };
-	/* R = acc + Y */
-mac.sat (8) m2<1>F g14<8,8,1>F 1F { align1  };
-	/* acc = Cr * -0.813 */
-mul (8) null g12<8,8,1>F -0.813F { align1 };
-	/* acc += Cb * -0.392 */
-mac (8) null g16<8,8,1>F -0.392F { align1 };
-	/* G = acc + Y */
-mac.sat (8) m3<1>F g14<8,8,1>F 1F { align1  };
-	/* acc = Cb * 2.017 */
-mul (8) null g16<8,8,1>F 2.017F { align1 };
-	/* B = acc + Y */
-mac.sat (8) m4<1>F g14<8,8,1>F 1F { align1  };
- /* and do it again */
-add (8) g15<1>F g15<8,8,1>F -0.0627451F { align1 };
-add (8) g13<1>F g13<8,8,1>F -0.501961F { align1 };
-add (8) g17<1>F g17<8,8,1>F -0.501961F { align1 };
-mul (8) g15<1>F g15<8,8,1>F 1.164F { align1 };
-mul (8) null g13<8,8,1>F 1.596F { align1 };
-mac.sat (8) m6<1>F g15<8,8,1>F 1F { align1  };
-mul (8) null g13<8,8,1>F -0.813F { align1 };
-mac (8) null g17<8,8,1>F -0.392F { align1 };
-mac.sat (8) m7<1>F g15<8,8,1>F 1F { align1  };
-mul (8) null g17<8,8,1>F 2.017F { align1 };
-mac.sat (8) m8<1>F g15<8,8,1>F 1F { align1  };
-
-   /* Pass through control information:
+
+    /* Normalize Y, Cb and Cr:
+     *
+     * Y = (Y - 16/255) * 1.164
+     * Cr = Cr - 128 / 255
+     * Cb = Cb - 128 / 255
+     */
+add (16)    Y<1>F		Y<8,8,1>F	-0.0627451F { compr align1 };
+mul (16)    Y<1>F		Y<8,8,1>F	1.164F	    { compr align1 };
+
+add (16)    Cr<1>F		Cr<8,8,1>F	-0.501961F  { compr align1 };
+
+add (16)    Cb<1>F		Cb<8,8,1>F	-0.501961F  { compr align1 };
+
+    /* 
+     * R = Y + Cr * 1.596
+     */
+mul (8)	    null		Cr_01<8,8,1>F	1.596F	    { align1 };
+mac.sat (8) data_port_r_01<1>F	Y_01<8,8,1>F	1F	    { align1  };
+mul (8)     null		Cr_23<8,8,1>F	1.596F	    { align1 };
+mac.sat (8) data_port_r_23<1>F	Y_23<8,8,1>F	1F	    { align1  };
+     
+    /*
+     * G = Cr * -0.813 + Cb * -0.392 + Y
+     */
+mul (8)	    null		Cr_01<8,8,1>F	-0.813F	    { align1 };
+mac (8)	    null		Cb_01<8,8,1>F	-0.392F	    { align1 };
+mac.sat (8) data_port_g_01<1>F	Y_01<8,8,1>F	1F	    { align1 };
+mul (8)	    null		Cr_23<8,8,1>F	-0.813F	    { align1 };
+mac (8)	    null		Cb_23<8,8,1>F	-0.392F	    { align1 };
+mac.sat (8) data_port_g_23<1>F	Y_23<8,8,1>F	1F	    { align1 };
+
+    /*
+     * B = Cb * 2.017 + Y
+     */
+mul (8)	    null		Cb_01<8,8,1>F	2.017F	    { align1 };
+mac.sat (8) data_port_b_01<1>F	Y_01<8,8,1>F	1F	    { align1 };
+mul (8)	    null		Cb_23<8,8,1>F	2.017F	    { align1 };
+mac.sat (8) data_port_b_23<1>F	Y_23<8,8,1>F	1F	    { align1 };
+
+    /*
+     * A = 1.0
+     */
+mov (8)	    data_port_a_01<1>F	1.0F			    { align1 };
+mov (8)	    data_port_a_23<1>F	1.0F			    { align1 };
+
+   /*
+    * Pass through control information:
+    */
+mov (8)	    m1<1>UD		g1<8,8,1>UD		    { align1 mask_disable };
+
+   /*
+    * Send framebuffer write message: XXX: acc0?
     */
-mov (8) m1<1>UD g1<8,8,1>UD { align1 mask_disable };
-   /* Send framebuffer write message: XXX: acc0? */
 send (16) 0 acc0<1>UW g0<8,8,1>UW write (
 	0, /* binding table index 0 */
 	8, /* pixel scoreboard clear */
diff --git a/src/packed_yuv_wm.g4b b/src/packed_yuv_wm.g4b
index d72c651..f2e650a 100644
--- a/src/packed_yuv_wm.g4b
+++ b/src/packed_yuv_wm.g4b
@@ -47,29 +47,26 @@
    { 0x00600001, 0x206003be, 0x008d00c0, 0x00000000 },
    { 0x00600001, 0x208003be, 0x008d00e0, 0x00000000 },
    { 0x00800031, 0x21801d29, 0x008d0000, 0x02580001 },
-   { 0x00600001, 0x22600129, 0x008d0260, 0x00000000 },
-   { 0x00600040, 0x21c07fbd, 0x008d01c0, 0xbd808081 },
-   { 0x00600040, 0x21807fbd, 0x008d0180, 0xbf008084 },
-   { 0x00600040, 0x22007fbd, 0x008d0200, 0xbf008084 },
-   { 0x00600041, 0x21c07fbd, 0x008d01c0, 0x3f94fdf4 },
+   { 0x00802040, 0x21c07fbd, 0x008d01c0, 0xbd808081 },
+   { 0x00802041, 0x21c07fbd, 0x008d01c0, 0x3f94fdf4 },
+   { 0x00802040, 0x21807fbd, 0x008d0180, 0xbf008084 },
+   { 0x00802040, 0x22007fbd, 0x008d0200, 0xbf008084 },
    { 0x00600041, 0x20007fbc, 0x008d0180, 0x3fcc49ba },
    { 0x80600048, 0x20407fbe, 0x008d01c0, 0x3f800000 },
+   { 0x00600041, 0x20007fbc, 0x008d01a0, 0x3fcc49ba },
+   { 0x80600048, 0x20c07fbe, 0x008d01e0, 0x3f800000 },
    { 0x00600041, 0x20007fbc, 0x008d0180, 0xbf5020c5 },
    { 0x00600048, 0x20007fbc, 0x008d0200, 0xbec8b439 },
    { 0x80600048, 0x20607fbe, 0x008d01c0, 0x3f800000 },
-   { 0x00600041, 0x20007fbc, 0x008d0200, 0x40011687 },
-   { 0x80600048, 0x20807fbe, 0x008d01c0, 0x3f800000 },
-   { 0x00600040, 0x21e07fbd, 0x008d01e0, 0xbd808081 },
-   { 0x00600040, 0x21a07fbd, 0x008d01a0, 0xbf008084 },
-   { 0x00600040, 0x22207fbd, 0x008d0220, 0xbf008084 },
-   { 0x00600041, 0x21e07fbd, 0x008d01e0, 0x3f94fdf4 },
-   { 0x00600041, 0x20007fbc, 0x008d01a0, 0x3fcc49ba },
-   { 0x80600048, 0x20c07fbe, 0x008d01e0, 0x3f800000 },
    { 0x00600041, 0x20007fbc, 0x008d01a0, 0xbf5020c5 },
    { 0x00600048, 0x20007fbc, 0x008d0220, 0xbec8b439 },
    { 0x80600048, 0x20e07fbe, 0x008d01e0, 0x3f800000 },
+   { 0x00600041, 0x20007fbc, 0x008d0200, 0x40011687 },
+   { 0x80600048, 0x20807fbe, 0x008d01c0, 0x3f800000 },
    { 0x00600041, 0x20007fbc, 0x008d0220, 0x40011687 },
    { 0x80600048, 0x21007fbe, 0x008d01e0, 0x3f800000 },
+   { 0x00600001, 0x20a003fe, 0x00000000, 0x3f800000 },
+   { 0x00600001, 0x212003fe, 0x00000000, 0x3f800000 },
    { 0x00600201, 0x20200022, 0x008d0020, 0x00000000 },
    { 0x00800031, 0x24001d28, 0x008d0000, 0x85a04800 },
    { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
commit 781be9d47289713b0a8fcd95c769a9c6241d62e9
Author: Keith Packard <keithp at keithp.com>
Date:   Tue Apr 8 16:08:20 2008 -0500

    Rename src/mask/data registers to indicate channel

diff --git a/src/exa_wm.g4i b/src/exa_wm.g4i
index 10e630e..ee8e3ad 100644
--- a/src/exa_wm.g4i
+++ b/src/exa_wm.g4i
@@ -102,21 +102,39 @@ define(`mask_w_0',  `src_w_0')
 define(`mask_w_1',  `src_w_1')
 
 /* sample src to these registers */
-define(`src_sample0',	`g14')
-define(`src_sample1',	`g15')
-define(`src_sample2',	`g16')
-define(`src_sample3',	`g17')
-define(`src_sample4',	`g18')
-define(`src_sample5',	`g19')
-define(`src_sample6',	`g20')
-define(`src_sample7',	`g21')
+define(`src_sample_base',	`g14')
+define(`src_sample_r_01',	`g14')
+define(`src_sample_r_23',	`g15')
+define(`src_sample_g_01',	`g16')
+define(`src_sample_g_23',	`g17')
+define(`src_sample_b_01',	`g18')
+define(`src_sample_b_23',	`g19')
+define(`src_sample_a_01',	`g20')
+define(`src_sample_a_23',	`g21')
 
 /* sample mask to these registers */
-define(`mask_sample0',	`g22')
-define(`mask_sample1',	`g23')
-define(`mask_sample2',	`g24')
-define(`mask_sample3',	`g25')
-define(`mask_sample4',	`g26')
-define(`mask_sample5',	`g27')
-define(`mask_sample6',	`g28')
-define(`mask_sample7',	`g29')
+define(`mask_sample_base',	`g22')
+define(`mask_sample_r_01',	`g22')
+define(`mask_sample_r_23',	`g23')
+define(`mask_sample_g_01',	`g24')
+define(`mask_sample_g_23',	`g25')
+define(`mask_sample_b_01',	`g26')
+define(`mask_sample_b_23',	`g27')
+define(`mask_sample_a_01',	`g28')
+define(`mask_sample_a_23',	`g29')
+
+/* data port SIMD16 send registers */
+
+define(`data_port_msg_0',	`m0')
+define(`data_port_msg_0_ind',	`0')
+define(`data_port_msg_1',	`m1')
+define(`data_port_r_01',	`m2')
+define(`data_port_g_01',	`m3')
+define(`data_port_b_01',	`m4')
+define(`data_port_a_01',	`m5')
+
+define(`data_port_r_23',	`m6')
+define(`data_port_g_23',	`m7')
+define(`data_port_b_23',	`m8')
+define(`data_port_a_23',	`m9')
+
diff --git a/src/exa_wm_ca.g4a b/src/exa_wm_ca.g4a
index 955c68c..5d982b3 100644
--- a/src/exa_wm_ca.g4a
+++ b/src/exa_wm_ca.g4a
@@ -32,7 +32,7 @@
 include(`exa_wm.g4i')
 
 /* mul mask rgba channels to src */
-mul (16)    src_sample0<1>F src_sample0<8,8,1>F	mask_sample0<8,8,1>F	{ compr align1 };
-mul (16)    src_sample2<1>F src_sample2<8,8,1>F	mask_sample2<8,8,1>F	{ compr align1 };
-mul (16)    src_sample4<1>F src_sample4<8,8,1>F	mask_sample4<8,8,1>F	{ compr align1 };
-mul (16)    src_sample6<1>F src_sample6<8,8,1>F	mask_sample6<8,8,1>F	{ compr align1 };
+mul (16)    src_sample_r_01<1>F	src_sample_r_01<8,8,1>F	mask_sample_r_01<8,8,1>F { compr align1 };
+mul (16)    src_sample_g_01<1>F src_sample_g_01<8,8,1>F	mask_sample_g_01<8,8,1>F { compr align1 };
+mul (16)    src_sample_b_01<1>F src_sample_b_01<8,8,1>F	mask_sample_b_01<8,8,1>F { compr align1 };
+mul (16)    src_sample_a_01<1>F src_sample_a_01<8,8,1>F	mask_sample_a_01<8,8,1>F { compr align1 };
diff --git a/src/exa_wm_ca_srcalpha.g4a b/src/exa_wm_ca_srcalpha.g4a
index e252e19..d1f847f 100644
--- a/src/exa_wm_ca_srcalpha.g4a
+++ b/src/exa_wm_ca_srcalpha.g4a
@@ -31,7 +31,7 @@
 
 include(`exa_wm.g4i')
 
-mul (16)    src_sample0<1>F mask_sample0<8,8,1>F    src_sample6<8,8,1>F	{ compr align1 };
-mul (16)    src_sample2<1>F mask_sample2<8,8,1>F    src_sample6<8,8,1>F	{ compr align1 };
-mul (16)    src_sample4<1>F mask_sample4<8,8,1>F    src_sample6<8,8,1>F	{ compr align1 };
-mul (16)    src_sample6<1>F mask_sample6<8,8,1>F    src_sample6<8,8,1>F	{ compr align1 };
+mul (16)    src_sample_r_01<1>F mask_sample_r_01<8,8,1>F src_sample_a_01<8,8,1>F { compr align1 };
+mul (16)    src_sample_g_01<1>F mask_sample_g_01<8,8,1>F src_sample_a_01<8,8,1>F { compr align1 };
+mul (16)    src_sample_b_01<1>F mask_sample_b_01<8,8,1>F src_sample_a_01<8,8,1>F { compr align1 };
+mul (16)    src_sample_a_01<1>F mask_sample_a_01<8,8,1>F src_sample_a_01<8,8,1>F { compr align1 };
diff --git a/src/exa_wm_mask_sample_a.g4a b/src/exa_wm_mask_sample_a.g4a
index c06611d..bbb19d7 100644
--- a/src/exa_wm_mask_sample_a.g4a
+++ b/src/exa_wm_mask_sample_a.g4a
@@ -40,7 +40,7 @@ mov (1) g0.8<1>UD	0x00007000UD { align1 mask_disable };
 /* mask_msg will be copied with g0, as it contains send desc */
 /* emit sampler 'send' cmd */
 send (16) mask_msg_ind		/* msg reg index */
-	mask_sample6<1>UW 	/* readback */
+	mask_sample_a_01<1>UW 	/* readback */
 	g0<8,8,1>UW		/* copy to msg start reg*/
 	sampler (2,1,F)		/* sampler message description, (binding_table,sampler_index,datatype)
 				/* here(src->dst) we should use src_sampler and src_surface */
diff --git a/src/exa_wm_mask_sample_argb.g4a b/src/exa_wm_mask_sample_argb.g4a
index 7f0815f..def4cfe 100644
--- a/src/exa_wm_mask_sample_argb.g4a
+++ b/src/exa_wm_mask_sample_argb.g4a
@@ -40,7 +40,7 @@ mov (1) g0.8<1>UD	0x00000000UD { align1 mask_disable };
 /* mask_msg will be copied with g0, as it contains send desc */
 /* emit sampler 'send' cmd */
 send (16) mask_msg_ind		/* msg reg index */
-	mask_sample0<1>UW 	/* readback */
+	mask_sample_base<1>UW 	/* readback */
 	g0<8,8,1>UW		/* copy to msg start reg*/
 	sampler (2,1,F)		/* sampler message description, (binding_table,sampler_index,datatype)
 				/* here(src->dst) we should use src_sampler and src_surface */
diff --git a/src/exa_wm_noca.g4a b/src/exa_wm_noca.g4a
index 7dd1224..d0d60fa 100644
--- a/src/exa_wm_noca.g4a
+++ b/src/exa_wm_noca.g4a
@@ -32,7 +32,7 @@
 include(`exa_wm.g4i')
 /* mul mask's alpha channel to src */
 
-mul (16)    src_sample0<1>F src_sample0<8,8,1>F	mask_sample6<8,8,1>F	{ compr align1 };
-mul (16)    src_sample2<1>F src_sample2<8,8,1>F	mask_sample6<8,8,1>F	{ compr align1 };
-mul (16)    src_sample4<1>F src_sample4<8,8,1>F	mask_sample6<8,8,1>F	{ compr align1 };
-mul (16)    src_sample6<1>F src_sample6<8,8,1>F	mask_sample6<8,8,1>F	{ compr align1 };
+mul (16)    src_sample_r_01<1>F	src_sample_r_01<8,8,1>F	mask_sample_a_01<8,8,1>F { compr align1 };
+mul (16)    src_sample_g_01<1>F src_sample_g_01<8,8,1>F	mask_sample_a_01<8,8,1>F { compr align1 };
+mul (16)    src_sample_b_01<1>F src_sample_b_01<8,8,1>F	mask_sample_a_01<8,8,1>F { compr align1 };
+mul (16)    src_sample_a_01<1>F src_sample_a_01<8,8,1>F	mask_sample_a_01<8,8,1>F { compr align1 };
diff --git a/src/exa_wm_src_sample_a.g4a b/src/exa_wm_src_sample_a.g4a
index 803c358..552aaee 100644
--- a/src/exa_wm_src_sample_a.g4a
+++ b/src/exa_wm_src_sample_a.g4a
@@ -40,7 +40,7 @@ mov (1) g0.8<1>UD	0x00007000UD { align1 mask_disable };
 /* src_msg will be copied with g0, as it contains send desc */
 /* emit sampler 'send' cmd */
 send (16) src_msg_ind		/* msg reg index */
-	src_sample6<1>UW 	/* readback */
+	src_sample_a_01<1>UW 	/* readback */
 	g0<8,8,1>UW		/* copy to msg start reg*/
 	sampler (1,0,F)		/* sampler message description, (binding_table,sampler_index,datatype)
 				/* here(src->dst) we should use src_sampler and src_surface */
diff --git a/src/exa_wm_src_sample_argb.g4a b/src/exa_wm_src_sample_argb.g4a
index 4fcf276..c20f53f 100644
--- a/src/exa_wm_src_sample_argb.g4a
+++ b/src/exa_wm_src_sample_argb.g4a
@@ -40,7 +40,7 @@ mov (1) g0.8<1>UD	0x00000000UD { align1 mask_disable };
 /* src_msg will be copied with g0, as it contains send desc */
 /* emit sampler 'send' cmd */
 send (16) src_msg_ind		/* msg reg index */
-	src_sample0<1>UW 	/* readback */
+	src_sample_base<1>UW 	/* readback */
 	g0<8,8,1>UW		/* copy to msg start reg*/
 	sampler (1,0,F)		/* sampler message description, (binding_table,sampler_index,datatype)
 				/* here(src->dst) we should use src_sampler and src_surface */
diff --git a/src/exa_wm_write.g4a b/src/exa_wm_write.g4a
index 5d3e6b1..b16e649 100644
--- a/src/exa_wm_write.g4a
+++ b/src/exa_wm_write.g4a
@@ -25,41 +25,39 @@
  *    Keith Packard <keithp at keithp.com>
  */
 
-/* 
- * Once the data are ready, write them to the destination
- */
-
 include(`exa_wm.g4i')
 
-/* prepare data in m2-m5 for subspan(1,0), m6-m9 for subspan(3,2), then it's ready to write */
-/* src_sample0 -> m2
-   src_sample1 -> m6
-   src_sample2 -> m3
-   src_sample3 -> m7
-   src_sample4 -> m4
-   src_sample5 -> m8
-   src_sample6 -> m5
-   src_sample7 -> m9
-*/
+/*
+ * Prepare data in m2-m5 for subspan(1,0), m6-m9 for subspan(3,2),
+ *
+ * Note that the SIMD16 write message takes data for the first
+ * two sub-spans followed by the data for the second two sub-spans
+ * instead of having the two sub-spans interleaved by channel. Weird.
+ */
+
+mov (8) data_port_r_01<1>F	src_sample_r_01<8,8,1>F { align1 };
+mov (8) data_port_g_01<1>F	src_sample_g_01<8,8,1>F { align1 };
+mov (8) data_port_b_01<1>F	src_sample_b_01<8,8,1>F { align1 };
+mov (8) data_port_a_01<1>F	src_sample_a_01<8,8,1>F { align1 };
 
-mov (8) m2<1>F src_sample0<8,8,1>F { align1 };
-mov (8) m3<1>F src_sample2<8,8,1>F { align1 };
-mov (8) m4<1>F src_sample4<8,8,1>F { align1 };
-mov (8) m5<1>F src_sample6<8,8,1>F { align1 };
-mov (8) m6<1>F src_sample1<8,8,1>F { align1 };
-mov (8) m7<1>F src_sample3<8,8,1>F { align1 };
-mov (8) m8<1>F src_sample5<8,8,1>F { align1 };
-mov (8) m9<1>F src_sample7<8,8,1>F { align1 };
+mov (8) data_port_r_23<1>F	src_sample_r_23<8,8,1>F { align1 };
+mov (8) data_port_g_23<1>F	src_sample_g_23<8,8,1>F { align1 };
+mov (8) data_port_b_23<1>F	src_sample_b_23<8,8,1>F { align1 };
+mov (8) data_port_a_23<1>F 	src_sample_a_23<8,8,1>F { align1 };
 
 /* m0, m1 are all direct passed by PS thread payload */
-mov (8) m1<1>UD g1<8,8,1>UD { align1 };
+mov (8) data_port_msg_1<1>UD	g1<8,8,1>UD		{ align1 };
 
 /* write */
-send (16) 0 acc0<1>UW g0<8,8,1>UW write (
-	0,  /* binding_table */
-	8,  /* pixel scordboard clear, msg type simd16 single source */
-	4,  /* render target write */
-	0   /* no write commit message */
+send (16) 
+	data_port_msg_0_ind 
+	acc0<1>UW 
+	g0<8,8,1>UW 
+	write (
+	       0,  /* binding_table */
+	       8,  /* pixel scoreboard clear, msg type simd16 single source */
+	       4,  /* render target write */
+	       0   /* no write commit message */
 	) 
 	mlen 10
 	rlen 0