xf86-video-intel: Branch 'xf86-video-intel-2.3-branch' - 8 commits - src/exa_wm_ca.g4a src/exa_wm_ca_srcalpha.g4a src/exa_wm.g4i src/exa_wm_mask_sample_a.g4a src/exa_wm_mask_sample_argb.g4a src/exa_wm_noca.g4a src/exa_wm_src_sample_a.g4a src/exa_wm_src_sample_argb.g4a src/exa_wm_write.g4a src/i830_display.c src/i830_video.c src/i965_video.c src/Makefile.am src/packed_yuv_wm.g4a src/packed_yuv_wm.g4b

Fri Apr 11 07:36:43 PDT 2008

 src/Makefile.am                 |    4 +
 src/exa_wm.g4i                  |   50 ++++++++++++-----
 src/exa_wm_ca.g4a               |    8 +-
 src/exa_wm_ca_srcalpha.g4a      |    8 +-
 src/exa_wm_mask_sample_a.g4a    |    2 
 src/exa_wm_mask_sample_argb.g4a |    2 
 src/exa_wm_noca.g4a             |    8 +-
 src/exa_wm_src_sample_a.g4a     |    2 
 src/exa_wm_src_sample_argb.g4a  |    2 
 src/exa_wm_write.g4a            |   54 +++++++++----------
 src/i830_display.c              |   11 ++-
 src/i830_video.c                |   39 ++++++++-----
 src/i965_video.c                |    1 
 src/packed_yuv_wm.g4a           |  112 +++++++++++++++++++++++++---------------
 src/packed_yuv_wm.g4b           |   23 +++-----
 15 files changed, 192 insertions(+), 134 deletions(-)

New commits:
commit 709e1e955c9b011cf547c5d2df7062c27e921d8e
Author: Zhenyu Wang <zhenyu.z.wang at intel.com>
Date:   Fri Apr 11 10:12:40 2008 +0800

    Remove '#line NUM ...' directives from the m4 macro-processing output
    
    intel-gen4asm does not accept lines beginning with '#', so stop passing -s to m4.
    (cherry picked from commit f47486fab3dffcbb03e7ad89f777abba1e887299)

diff --git a/src/Makefile.am b/src/Makefile.am
index 1b11330..c4cfff9 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -182,7 +182,7 @@ if HAVE_GEN4ASM
 
 SUFFIXES = .g4a .g4b
 .g4a.g4b:
-	m4 -s $*.g4a > $*.g4m && intel-gen4asm -o $@ $*.g4m && rm $*.g4m
+	m4 $*.g4a > $*.g4m && intel-gen4asm -o $@ $*.g4m && rm $*.g4m
 
 $(INTEL_G4B): $(INTEL_G4I)
 
commit e17168f29953ac276d5ecc2d8f459f2e513c8d1b
Author: Hong Liu <hong.liu at intel.com>
Date:   Fri Apr 11 09:54:34 2008 +0800

    Bug #14935: Fix i9xx reference clock for spread spectrum.
    (cherry picked from commit 5c9cde37e769287fb7bf4e08c3600a33c2e92dce)

diff --git a/src/i830_display.c b/src/i830_display.c
index 0588947..4910d96 100644
--- a/src/i830_display.c
+++ b/src/i830_display.c
@@ -1713,8 +1713,10 @@ i830_crtc_clock_get(ScrnInfoPtr pScrn, xf86CrtcPtr crtc)
 	    return 0;
 	}
 
-	/* XXX: Handle the 100Mhz refclk */
-	i9xx_clock(96000, &clock);
+	if ((dpll & PLL_REF_INPUT_MASK) == PLLB_REF_INPUT_SPREADSPECTRUMIN)
+	    i9xx_clock(100000, &clock);
+	else
+	    i9xx_clock(96000, &clock);
     } else {
 	Bool is_lvds = (pipe == 1) && (INREG(LVDS) & LVDS_PORT_EN);
 
commit 8c23ab274a502bb1a1face5b17776e577a8f35ef
Author: Eric Anholt <eric at anholt.net>
Date:   Thu Apr 10 16:06:41 2008 -0700

    Fix compiler warning from 24-bit lvds change.
    (cherry picked from commit 152a50703aa5e9ebaa9abbe448518742734a5eb7)

diff --git a/src/i830_display.c b/src/i830_display.c
index 4091e79..0588947 100644
--- a/src/i830_display.c
+++ b/src/i830_display.c
@@ -1080,7 +1080,7 @@ i830_crtc_mode_set(xf86CrtcPtr crtc, DisplayModePtr mode,
     int i;
     int refclk;
     intel_clock_t clock;
-    uint32_t dpll = 0, fp = 0, dspcntr, pipeconf;
+    uint32_t dpll = 0, fp = 0, dspcntr, pipeconf, lvds_bits = 0;
     Bool ok, is_sdvo = FALSE, is_dvo = FALSE;
     Bool is_crt = FALSE, is_lvds = FALSE, is_tv = FALSE;
 
@@ -1097,6 +1097,7 @@ i830_crtc_mode_set(xf86CrtcPtr crtc, DisplayModePtr mode,
 	switch (intel_output->type) {
 	case I830_OUTPUT_LVDS:
 	    is_lvds = TRUE;
+	    lvds_bits = intel_output->lvds_bits;
 	    break;
 	case I830_OUTPUT_SDVO:
 	    is_sdvo = TRUE;
@@ -1314,7 +1315,7 @@ i830_crtc_mode_set(xf86CrtcPtr crtc, DisplayModePtr mode,
 		lvds |= LVDS_DITHER_ENABLE;
 	}
 
-	lvds |= intel_output->lvds_bits;
+	lvds |= lvds_bits;
 
 	OUTREG(LVDS, lvds);
 	POSTING_READ(LVDS);
commit 23dc90f35c6e5da33c09ebdfc53cc7837d93c886
Author: Keith Packard <keithp at keithp.com>
Date:   Wed Apr 9 16:27:40 2008 -0500

    Use a single memcpy when source and destination pitches are equal in planar image transfer
    (cherry picked from commit d5a80e1e3ab5724d34b20f9ee6f830efd0f5b076)

diff --git a/src/i830_video.c b/src/i830_video.c
index 14dab8f..64024d2 100644
--- a/src/i830_video.c
+++ b/src/i830_video.c
@@ -1441,11 +1441,14 @@ I830CopyPlanarData(ScrnInfoPtr pScrn, I830PortPrivPtr pPriv,
 
     switch (pPriv->rotation) {
     case RR_Rotate_0:
-	for (i = 0; i < h; i++) {
-	    memcpy(dst1, src1, w);
-	    src1 += srcPitch;
-	    dst1 += dstPitch2;
-	}
+	if (srcPitch == dstPitch2)
+	    memcpy (dst1, src1, srcPitch * h);
+	else
+	    for (i = 0; i < h; i++) {
+		memcpy(dst1, src1, w);
+		src1 += srcPitch;
+		dst1 += dstPitch2;
+	    }
 	break;
     case RR_Rotate_90:
 	for (i = 0; i < h; i++) {
@@ -1496,11 +1499,14 @@ I830CopyPlanarData(ScrnInfoPtr pScrn, I830PortPrivPtr pPriv,
 
     switch (pPriv->rotation) {
     case RR_Rotate_0:
-	for (i = 0; i < h / 2; i++) {
-	    memcpy(dst2, src2, w / 2);
-	    src2 += srcPitch2;
-	    dst2 += dstPitch;
-	}
+	if (srcPitch2 == dstPitch)
+	    memcpy (dst2, src2, h/2 * srcPitch2);
+	else
+	    for (i = 0; i < h / 2; i++) {
+		memcpy(dst2, src2, w / 2);
+		src2 += srcPitch2;
+		dst2 += dstPitch;
+	    }
 	break;
     case RR_Rotate_90:
 	for (i = 0; i < (h/2); i++) {
@@ -1552,11 +1558,14 @@ I830CopyPlanarData(ScrnInfoPtr pScrn, I830PortPrivPtr pPriv,
 
     switch (pPriv->rotation) {
     case RR_Rotate_0:
-	for (i = 0; i < h / 2; i++) {
-	    memcpy(dst3, src3, w / 2);
-	    src3 += srcPitch2;
-	    dst3 += dstPitch;
-	}
+	if (srcPitch2 == dstPitch)
+	    memcpy (dst3, src3, srcPitch2 * h/2);
+	else
+	    for (i = 0; i < h / 2; i++) {
+		memcpy(dst3, src3, w / 2);
+		src3 += srcPitch2;
+		dst3 += dstPitch;
+	    }
 	break;
     case RR_Rotate_90:
 	for (i = 0; i < (h/2); i++) {
commit 7af82ca69fab431c744a8ca36a282d071ceb83c1
Author: Keith Packard <keithp at keithp.com>
Date:   Wed Apr 9 10:00:08 2008 -0500

    Remove .g4b files on clean
    (cherry picked from commit f270456e5612cb88933e6aabcd9a816c5c292229)

diff --git a/src/Makefile.am b/src/Makefile.am
index 9b5d653..1b11330 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -188,6 +188,8 @@ $(INTEL_G4B): $(INTEL_G4I)
 
 BUILT_SOURCES= $(INTEL_G4B)
 
+clean-local:
+	-rm -f $(INTEL_G4B)
 endif
 
 if XMODES
commit 601f9d3310653552a1d38acc7d5818c24bbaf53b
Author: Keith Packard <keithp at keithp.com>
Date:   Wed Apr 9 02:02:56 2008 -0500

    Remove sync after 965 video put.
    
    The hardware has been marked as needing a sync, so the next video put will
    block waiting for the previous one to complete. Adding a sync here just
    stalls the video playback for no good reason.
    (cherry picked from commit 3fc3d1a701bae257b70aa7b7654c722f30e71399)

diff --git a/src/i965_video.c b/src/i965_video.c
index 1d2c3f5..153a967 100644
--- a/src/i965_video.c
+++ b/src/i965_video.c
@@ -851,7 +851,6 @@ I965DisplayVideoTextured(ScrnInfoPtr pScrn, I830PortPrivPtr pPriv, int id,
 	i830MarkSync(pScrn);
     }
 
-    i830WaitSync(pScrn);
 #if WATCH_STATS
     i830_dump_error_state(pScrn);
 #endif
commit 42559c54ad0f436e749edfe75172522a5baa3b97
Author: Keith Packard <keithp at keithp.com>
Date:   Tue Apr 8 16:09:00 2008 -0500

    Use symbolic names for channels in YUV code
    (cherry picked from commit b68d9f4245d0ebe3371c179401ff145f1a4d101b)

diff --git a/src/packed_yuv_wm.g4a b/src/packed_yuv_wm.g4a
index 9e635ba..2be52b5 100644
--- a/src/packed_yuv_wm.g4a
+++ b/src/packed_yuv_wm.g4a
@@ -26,6 +26,19 @@
  *
  */
 
+include(`exa_wm.g4i')
+
+define(`YCbCr_base',	`g12')
+define(`Cr',		`g12')
+define(`Cr_01',		`g12')
+define(`Cr_23',		`g13')
+define(`Y',		`g14')
+define(`Y_01',		`g14')
+define(`Y_23',		`g15')
+define(`Cb',		`g16')
+define(`Cb_01',		`g16')
+define(`Cb_23',		`g17')
+
 /* The initial payload of the thread is always g0.
  * WM_URB (incoming URB entries) is g3
  * X0_R is g4
@@ -117,8 +130,12 @@ mov (8) m4<1>F g7<8,8,1>F { align1 };
      * g0 holds the PS thread payload, which (oddly) contains
      * precisely what the sampler wants to see in m0
      */
-send  (16) 0 g12<1>UW g0<8,8,1>UW sampler (1,0,F) mlen 5 rlen 8 { align1 };
-mov (8) g19<1>UW g19<8,8,1>UW { align1 };
+send  (16)
+    0	/* load g0 to m0 */
+    YCbCr_base<1>UW
+    g0<8,8,1>UW 
+    sampler (1,0,F)
+    mlen 5 rlen 8 { align1 };
 
     /* color space conversion function:
      * R = Clamp ( 1.164(Y-16/255) + 1.596(Cr-128/255), 0, 1)
@@ -133,45 +150,60 @@ mov (8) g19<1>UW g19<8,8,1>UW { align1 };
      * G is g3, g7.
      * B is g4, g8.
      */
-	/* Y = Y - 16/255 */
-add (8) g14<1>F g14<8,8,1>F -0.0627451F { align1 };
-	/* Cr = Cr - 128/255 */
-add (8) g12<1>F g12<8,8,1>F -0.501961F { align1 };
-	/* Cb = Cb - 128 / 255 */
-add (8) g16<1>F g16<8,8,1>F -0.501961F { align1 };
-	/* Y = Y * 1.164 */
-mul (8) g14<1>F g14<8,8,1>F 1.164F { align1 };
-	/* acc = 1.596 * Cr */
-mul (8) null g12<8,8,1>F 1.596F { align1 };
-	/* R = acc + Y */
-mac.sat (8) m2<1>F g14<8,8,1>F 1F { align1  };
-	/* acc = Cr * -0.813 */
-mul (8) null g12<8,8,1>F -0.813F { align1 };
-	/* acc += Cb * -0.392 */
-mac (8) null g16<8,8,1>F -0.392F { align1 };
-	/* G = acc + Y */
-mac.sat (8) m3<1>F g14<8,8,1>F 1F { align1  };
-	/* acc = Cb * 2.017 */
-mul (8) null g16<8,8,1>F 2.017F { align1 };
-	/* B = acc + Y */
-mac.sat (8) m4<1>F g14<8,8,1>F 1F { align1  };
- /* and do it again */
-add (8) g15<1>F g15<8,8,1>F -0.0627451F { align1 };
-add (8) g13<1>F g13<8,8,1>F -0.501961F { align1 };
-add (8) g17<1>F g17<8,8,1>F -0.501961F { align1 };
-mul (8) g15<1>F g15<8,8,1>F 1.164F { align1 };
-mul (8) null g13<8,8,1>F 1.596F { align1 };
-mac.sat (8) m6<1>F g15<8,8,1>F 1F { align1  };
-mul (8) null g13<8,8,1>F -0.813F { align1 };
-mac (8) null g17<8,8,1>F -0.392F { align1 };
-mac.sat (8) m7<1>F g15<8,8,1>F 1F { align1  };
-mul (8) null g17<8,8,1>F 2.017F { align1 };
-mac.sat (8) m8<1>F g15<8,8,1>F 1F { align1  };
-
-   /* Pass through control information:
+
+    /* Normalize Y, Cb and Cr:
+     *
+     * Y = (Y - 16/255) * 1.164
+     * Cr = Cr - 128 / 255
+     * Cb = Cb - 128 / 255
+     */
+add (16)    Y<1>F		Y<8,8,1>F	-0.0627451F { compr align1 };
+mul (16)    Y<1>F		Y<8,8,1>F	1.164F	    { compr align1 };
+
+add (16)    Cr<1>F		Cr<8,8,1>F	-0.501961F  { compr align1 };
+
+add (16)    Cb<1>F		Cb<8,8,1>F	-0.501961F  { compr align1 };
+
+    /* 
+     * R = Y + Cr * 1.596
+     */
+mul (8)	    null		Cr_01<8,8,1>F	1.596F	    { align1 };
+mac.sat (8) data_port_r_01<1>F	Y_01<8,8,1>F	1F	    { align1  };
+mul (8)     null		Cr_23<8,8,1>F	1.596F	    { align1 };
+mac.sat (8) data_port_r_23<1>F	Y_23<8,8,1>F	1F	    { align1  };
+     
+    /*
+     * G = Cr * -0.813 + Cb * -0.392 + Y
+     */
+mul (8)	    null		Cr_01<8,8,1>F	-0.813F	    { align1 };
+mac (8)	    null		Cb_01<8,8,1>F	-0.392F	    { align1 };
+mac.sat (8) data_port_g_01<1>F	Y_01<8,8,1>F	1F	    { align1 };
+mul (8)	    null		Cr_23<8,8,1>F	-0.813F	    { align1 };
+mac (8)	    null		Cb_23<8,8,1>F	-0.392F	    { align1 };
+mac.sat (8) data_port_g_23<1>F	Y_23<8,8,1>F	1F	    { align1 };
+
+    /*
+     * B = Cb * 2.017 + Y
+     */
+mul (8)	    null		Cb_01<8,8,1>F	2.017F	    { align1 };
+mac.sat (8) data_port_b_01<1>F	Y_01<8,8,1>F	1F	    { align1 };
+mul (8)	    null		Cb_23<8,8,1>F	2.017F	    { align1 };
+mac.sat (8) data_port_b_23<1>F	Y_23<8,8,1>F	1F	    { align1 };
+
+    /*
+     * A = 1.0
+     */
+mov (8)	    data_port_a_01<1>F	1.0F			    { align1 };
+mov (8)	    data_port_a_23<1>F	1.0F			    { align1 };
+
+   /*
+    * Pass through control information:
+    */
+mov (8)	    m1<1>UD		g1<8,8,1>UD		    { align1 mask_disable };
+
+   /*
+    * Send framebuffer write message: XXX: acc0?
     */
-mov (8) m1<1>UD g1<8,8,1>UD { align1 mask_disable };
-   /* Send framebuffer write message: XXX: acc0? */
 send (16) 0 acc0<1>UW g0<8,8,1>UW write (
 	0, /* binding table index 0 */
 	8, /* pixel scoreboard clear */
diff --git a/src/packed_yuv_wm.g4b b/src/packed_yuv_wm.g4b
index d72c651..f2e650a 100644
--- a/src/packed_yuv_wm.g4b
+++ b/src/packed_yuv_wm.g4b
@@ -47,29 +47,26 @@
    { 0x00600001, 0x206003be, 0x008d00c0, 0x00000000 },
    { 0x00600001, 0x208003be, 0x008d00e0, 0x00000000 },
    { 0x00800031, 0x21801d29, 0x008d0000, 0x02580001 },
-   { 0x00600001, 0x22600129, 0x008d0260, 0x00000000 },
-   { 0x00600040, 0x21c07fbd, 0x008d01c0, 0xbd808081 },
-   { 0x00600040, 0x21807fbd, 0x008d0180, 0xbf008084 },
-   { 0x00600040, 0x22007fbd, 0x008d0200, 0xbf008084 },
-   { 0x00600041, 0x21c07fbd, 0x008d01c0, 0x3f94fdf4 },
+   { 0x00802040, 0x21c07fbd, 0x008d01c0, 0xbd808081 },
+   { 0x00802041, 0x21c07fbd, 0x008d01c0, 0x3f94fdf4 },
+   { 0x00802040, 0x21807fbd, 0x008d0180, 0xbf008084 },
+   { 0x00802040, 0x22007fbd, 0x008d0200, 0xbf008084 },
    { 0x00600041, 0x20007fbc, 0x008d0180, 0x3fcc49ba },
    { 0x80600048, 0x20407fbe, 0x008d01c0, 0x3f800000 },
+   { 0x00600041, 0x20007fbc, 0x008d01a0, 0x3fcc49ba },
+   { 0x80600048, 0x20c07fbe, 0x008d01e0, 0x3f800000 },
    { 0x00600041, 0x20007fbc, 0x008d0180, 0xbf5020c5 },
    { 0x00600048, 0x20007fbc, 0x008d0200, 0xbec8b439 },
    { 0x80600048, 0x20607fbe, 0x008d01c0, 0x3f800000 },
-   { 0x00600041, 0x20007fbc, 0x008d0200, 0x40011687 },
-   { 0x80600048, 0x20807fbe, 0x008d01c0, 0x3f800000 },
-   { 0x00600040, 0x21e07fbd, 0x008d01e0, 0xbd808081 },
-   { 0x00600040, 0x21a07fbd, 0x008d01a0, 0xbf008084 },
-   { 0x00600040, 0x22207fbd, 0x008d0220, 0xbf008084 },
-   { 0x00600041, 0x21e07fbd, 0x008d01e0, 0x3f94fdf4 },
-   { 0x00600041, 0x20007fbc, 0x008d01a0, 0x3fcc49ba },
-   { 0x80600048, 0x20c07fbe, 0x008d01e0, 0x3f800000 },
    { 0x00600041, 0x20007fbc, 0x008d01a0, 0xbf5020c5 },
    { 0x00600048, 0x20007fbc, 0x008d0220, 0xbec8b439 },
    { 0x80600048, 0x20e07fbe, 0x008d01e0, 0x3f800000 },
+   { 0x00600041, 0x20007fbc, 0x008d0200, 0x40011687 },
+   { 0x80600048, 0x20807fbe, 0x008d01c0, 0x3f800000 },
    { 0x00600041, 0x20007fbc, 0x008d0220, 0x40011687 },
    { 0x80600048, 0x21007fbe, 0x008d01e0, 0x3f800000 },
+   { 0x00600001, 0x20a003fe, 0x00000000, 0x3f800000 },
+   { 0x00600001, 0x212003fe, 0x00000000, 0x3f800000 },
    { 0x00600201, 0x20200022, 0x008d0020, 0x00000000 },
    { 0x00800031, 0x24001d28, 0x008d0000, 0x85a04800 },
    { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
commit bddd411cf99c4c6b8a914589828ba09cc4cb6f0b
Author: Keith Packard <keithp at keithp.com>
Date:   Tue Apr 8 16:08:20 2008 -0500

    Rename src/mask/data registers to indicate channel
    (cherry picked from commit 781be9d47289713b0a8fcd95c769a9c6241d62e9)

diff --git a/src/exa_wm.g4i b/src/exa_wm.g4i
index 10e630e..ee8e3ad 100644
--- a/src/exa_wm.g4i
+++ b/src/exa_wm.g4i
@@ -102,21 +102,39 @@ define(`mask_w_0',  `src_w_0')
 define(`mask_w_1',  `src_w_1')
 
 /* sample src to these registers */
-define(`src_sample0',	`g14')
-define(`src_sample1',	`g15')
-define(`src_sample2',	`g16')
-define(`src_sample3',	`g17')
-define(`src_sample4',	`g18')
-define(`src_sample5',	`g19')
-define(`src_sample6',	`g20')
-define(`src_sample7',	`g21')
+define(`src_sample_base',	`g14')
+define(`src_sample_r_01',	`g14')
+define(`src_sample_r_23',	`g15')
+define(`src_sample_g_01',	`g16')
+define(`src_sample_g_23',	`g17')
+define(`src_sample_b_01',	`g18')
+define(`src_sample_b_23',	`g19')
+define(`src_sample_a_01',	`g20')
+define(`src_sample_a_23',	`g21')
 
 /* sample mask to these registers */
-define(`mask_sample0',	`g22')
-define(`mask_sample1',	`g23')
-define(`mask_sample2',	`g24')
-define(`mask_sample3',	`g25')
-define(`mask_sample4',	`g26')
-define(`mask_sample5',	`g27')
-define(`mask_sample6',	`g28')
-define(`mask_sample7',	`g29')
+define(`mask_sample_base',	`g22')
+define(`mask_sample_r_01',	`g22')
+define(`mask_sample_r_23',	`g23')
+define(`mask_sample_g_01',	`g24')
+define(`mask_sample_g_23',	`g25')
+define(`mask_sample_b_01',	`g26')
+define(`mask_sample_b_23',	`g27')
+define(`mask_sample_a_01',	`g28')
+define(`mask_sample_a_23',	`g29')
+
+/* data port SIMD16 send registers */
+
+define(`data_port_msg_0',	`m0')
+define(`data_port_msg_0_ind',	`0')
+define(`data_port_msg_1',	`m1')
+define(`data_port_r_01',	`m2')
+define(`data_port_g_01',	`m3')
+define(`data_port_b_01',	`m4')
+define(`data_port_a_01',	`m5')
+
+define(`data_port_r_23',	`m6')
+define(`data_port_g_23',	`m7')
+define(`data_port_b_23',	`m8')
+define(`data_port_a_23',	`m9')
+
diff --git a/src/exa_wm_ca.g4a b/src/exa_wm_ca.g4a
index 955c68c..5d982b3 100644
--- a/src/exa_wm_ca.g4a
+++ b/src/exa_wm_ca.g4a
@@ -32,7 +32,7 @@
 include(`exa_wm.g4i')
 
 /* mul mask rgba channels to src */
-mul (16)    src_sample0<1>F src_sample0<8,8,1>F	mask_sample0<8,8,1>F	{ compr align1 };
-mul (16)    src_sample2<1>F src_sample2<8,8,1>F	mask_sample2<8,8,1>F	{ compr align1 };
-mul (16)    src_sample4<1>F src_sample4<8,8,1>F	mask_sample4<8,8,1>F	{ compr align1 };
-mul (16)    src_sample6<1>F src_sample6<8,8,1>F	mask_sample6<8,8,1>F	{ compr align1 };
+mul (16)    src_sample_r_01<1>F	src_sample_r_01<8,8,1>F	mask_sample_r_01<8,8,1>F { compr align1 };
+mul (16)    src_sample_g_01<1>F src_sample_g_01<8,8,1>F	mask_sample_g_01<8,8,1>F { compr align1 };
+mul (16)    src_sample_b_01<1>F src_sample_b_01<8,8,1>F	mask_sample_b_01<8,8,1>F { compr align1 };
+mul (16)    src_sample_a_01<1>F src_sample_a_01<8,8,1>F	mask_sample_a_01<8,8,1>F { compr align1 };
diff --git a/src/exa_wm_ca_srcalpha.g4a b/src/exa_wm_ca_srcalpha.g4a
index e252e19..d1f847f 100644
--- a/src/exa_wm_ca_srcalpha.g4a
+++ b/src/exa_wm_ca_srcalpha.g4a
@@ -31,7 +31,7 @@
 
 include(`exa_wm.g4i')
 
-mul (16)    src_sample0<1>F mask_sample0<8,8,1>F    src_sample6<8,8,1>F	{ compr align1 };
-mul (16)    src_sample2<1>F mask_sample2<8,8,1>F    src_sample6<8,8,1>F	{ compr align1 };
-mul (16)    src_sample4<1>F mask_sample4<8,8,1>F    src_sample6<8,8,1>F	{ compr align1 };
-mul (16)    src_sample6<1>F mask_sample6<8,8,1>F    src_sample6<8,8,1>F	{ compr align1 };
+mul (16)    src_sample_r_01<1>F mask_sample_r_01<8,8,1>F src_sample_a_01<8,8,1>F { compr align1 };
+mul (16)    src_sample_g_01<1>F mask_sample_g_01<8,8,1>F src_sample_a_01<8,8,1>F { compr align1 };
+mul (16)    src_sample_b_01<1>F mask_sample_b_01<8,8,1>F src_sample_a_01<8,8,1>F { compr align1 };
+mul (16)    src_sample_a_01<1>F mask_sample_a_01<8,8,1>F src_sample_a_01<8,8,1>F { compr align1 };
diff --git a/src/exa_wm_mask_sample_a.g4a b/src/exa_wm_mask_sample_a.g4a
index c06611d..bbb19d7 100644
--- a/src/exa_wm_mask_sample_a.g4a
+++ b/src/exa_wm_mask_sample_a.g4a
@@ -40,7 +40,7 @@ mov (1) g0.8<1>UD	0x00007000UD { align1 mask_disable };
 /* mask_msg will be copied with g0, as it contains send desc */
 /* emit sampler 'send' cmd */
 send (16) mask_msg_ind		/* msg reg index */
-	mask_sample6<1>UW 	/* readback */
+	mask_sample_a_01<1>UW 	/* readback */
 	g0<8,8,1>UW		/* copy to msg start reg*/
 	sampler (2,1,F)		/* sampler message description, (binding_table,sampler_index,datatype)
 				/* here(src->dst) we should use src_sampler and src_surface */
diff --git a/src/exa_wm_mask_sample_argb.g4a b/src/exa_wm_mask_sample_argb.g4a
index 7f0815f..def4cfe 100644
--- a/src/exa_wm_mask_sample_argb.g4a
+++ b/src/exa_wm_mask_sample_argb.g4a
@@ -40,7 +40,7 @@ mov (1) g0.8<1>UD	0x00000000UD { align1 mask_disable };
 /* mask_msg will be copied with g0, as it contains send desc */
 /* emit sampler 'send' cmd */
 send (16) mask_msg_ind		/* msg reg index */
-	mask_sample0<1>UW 	/* readback */
+	mask_sample_base<1>UW 	/* readback */
 	g0<8,8,1>UW		/* copy to msg start reg*/
 	sampler (2,1,F)		/* sampler message description, (binding_table,sampler_index,datatype)
 				/* here(src->dst) we should use src_sampler and src_surface */
diff --git a/src/exa_wm_noca.g4a b/src/exa_wm_noca.g4a
index 7dd1224..d0d60fa 100644
--- a/src/exa_wm_noca.g4a
+++ b/src/exa_wm_noca.g4a
@@ -32,7 +32,7 @@
 include(`exa_wm.g4i')
 /* mul mask's alpha channel to src */
 
-mul (16)    src_sample0<1>F src_sample0<8,8,1>F	mask_sample6<8,8,1>F	{ compr align1 };
-mul (16)    src_sample2<1>F src_sample2<8,8,1>F	mask_sample6<8,8,1>F	{ compr align1 };
-mul (16)    src_sample4<1>F src_sample4<8,8,1>F	mask_sample6<8,8,1>F	{ compr align1 };
-mul (16)    src_sample6<1>F src_sample6<8,8,1>F	mask_sample6<8,8,1>F	{ compr align1 };
+mul (16)    src_sample_r_01<1>F	src_sample_r_01<8,8,1>F	mask_sample_a_01<8,8,1>F { compr align1 };
+mul (16)    src_sample_g_01<1>F src_sample_g_01<8,8,1>F	mask_sample_a_01<8,8,1>F { compr align1 };
+mul (16)    src_sample_b_01<1>F src_sample_b_01<8,8,1>F	mask_sample_a_01<8,8,1>F { compr align1 };
+mul (16)    src_sample_a_01<1>F src_sample_a_01<8,8,1>F	mask_sample_a_01<8,8,1>F { compr align1 };
diff --git a/src/exa_wm_src_sample_a.g4a b/src/exa_wm_src_sample_a.g4a
index 803c358..552aaee 100644
--- a/src/exa_wm_src_sample_a.g4a
+++ b/src/exa_wm_src_sample_a.g4a
@@ -40,7 +40,7 @@ mov (1) g0.8<1>UD	0x00007000UD { align1 mask_disable };
 /* src_msg will be copied with g0, as it contains send desc */
 /* emit sampler 'send' cmd */
 send (16) src_msg_ind		/* msg reg index */
-	src_sample6<1>UW 	/* readback */
+	src_sample_a_01<1>UW 	/* readback */
 	g0<8,8,1>UW		/* copy to msg start reg*/
 	sampler (1,0,F)		/* sampler message description, (binding_table,sampler_index,datatype)
 				/* here(src->dst) we should use src_sampler and src_surface */
diff --git a/src/exa_wm_src_sample_argb.g4a b/src/exa_wm_src_sample_argb.g4a
index 4fcf276..c20f53f 100644
--- a/src/exa_wm_src_sample_argb.g4a
+++ b/src/exa_wm_src_sample_argb.g4a
@@ -40,7 +40,7 @@ mov (1) g0.8<1>UD	0x00000000UD { align1 mask_disable };
 /* src_msg will be copied with g0, as it contains send desc */
 /* emit sampler 'send' cmd */
 send (16) src_msg_ind		/* msg reg index */
-	src_sample0<1>UW 	/* readback */
+	src_sample_base<1>UW 	/* readback */
 	g0<8,8,1>UW		/* copy to msg start reg*/
 	sampler (1,0,F)		/* sampler message description, (binding_table,sampler_index,datatype)
 				/* here(src->dst) we should use src_sampler and src_surface */
diff --git a/src/exa_wm_write.g4a b/src/exa_wm_write.g4a
index 5d3e6b1..b16e649 100644
--- a/src/exa_wm_write.g4a
+++ b/src/exa_wm_write.g4a
@@ -25,41 +25,39 @@
  *    Keith Packard <keithp at keithp.com>
  */
 
-/* 
- * Once the data are ready, write them to the destination
- */
-
 include(`exa_wm.g4i')
 
-/* prepare data in m2-m5 for subspan(1,0), m6-m9 for subspan(3,2), then it's ready to write */
-/* src_sample0 -> m2
-   src_sample1 -> m6
-   src_sample2 -> m3
-   src_sample3 -> m7
-   src_sample4 -> m4
-   src_sample5 -> m8
-   src_sample6 -> m5
-   src_sample7 -> m9
-*/
+/*
+ * Prepare data in m2-m5 for subspan(1,0), m6-m9 for subspan(3,2),
+ *
+ * Note that the SIMD16 write message takes data for the first
+ * two sub-spans followed by the data for the second two sub-spans
+ * instead of having the two sub-spans interleaved by channel. Weird.
+ */
+
+mov (8) data_port_r_01<1>F	src_sample_r_01<8,8,1>F { align1 };
+mov (8) data_port_g_01<1>F	src_sample_g_01<8,8,1>F { align1 };
+mov (8) data_port_b_01<1>F	src_sample_b_01<8,8,1>F { align1 };
+mov (8) data_port_a_01<1>F	src_sample_a_01<8,8,1>F { align1 };
 
-mov (8) m2<1>F src_sample0<8,8,1>F { align1 };
-mov (8) m3<1>F src_sample2<8,8,1>F { align1 };
-mov (8) m4<1>F src_sample4<8,8,1>F { align1 };
-mov (8) m5<1>F src_sample6<8,8,1>F { align1 };
-mov (8) m6<1>F src_sample1<8,8,1>F { align1 };
-mov (8) m7<1>F src_sample3<8,8,1>F { align1 };
-mov (8) m8<1>F src_sample5<8,8,1>F { align1 };
-mov (8) m9<1>F src_sample7<8,8,1>F { align1 };
+mov (8) data_port_r_23<1>F	src_sample_r_23<8,8,1>F { align1 };
+mov (8) data_port_g_23<1>F	src_sample_g_23<8,8,1>F { align1 };
+mov (8) data_port_b_23<1>F	src_sample_b_23<8,8,1>F { align1 };
+mov (8) data_port_a_23<1>F 	src_sample_a_23<8,8,1>F { align1 };
 
 /* m0, m1 are all direct passed by PS thread payload */
-mov (8) m1<1>UD g1<8,8,1>UD { align1 };
+mov (8) data_port_msg_1<1>UD	g1<8,8,1>UD		{ align1 };
 
 /* write */
-send (16) 0 acc0<1>UW g0<8,8,1>UW write (
-	0,  /* binding_table */
-	8,  /* pixel scordboard clear, msg type simd16 single source */
-	4,  /* render target write */
-	0   /* no write commit message */
+send (16) 
+	data_port_msg_0_ind 
+	acc0<1>UW 
+	g0<8,8,1>UW 
+	write (
+	       0,  /* binding_table */
+	       8,  /* pixel scordboard clear, msg type simd16 single source */
+	       4,  /* render target write */
+	       0   /* no write commit message */
 	) 
 	mlen 10
 	rlen 0