Sun Jan 7 01:08:30 EET 2007

src/Makefile.am            |   11 
 src/exa_sf.g4a             |   17 
 src/exa_sf_mask.g4a        |   53 ++
 src/exa_sf_mask_prog.h     |   25 +
 src/exa_sf_prog.h          |   17 
 src/exa_wm_masknoca.g4a    |  202 ++++++++
 src/exa_wm_masknoca_prog.h |   95 +++
 src/exa_wm_nomask.g4a      |  143 +++++
 src/exa_wm_nomask_prog.h   |   70 ++
 src/i830.h                 |    3 
 src/i830_driver.c          |   27 -
 src/i830_exa.c             |   11 
 src/i830_memory.c          |   76 ++-
 src/i830_rotate.c          |  753 ++++++++++++++++++++++++++++++
 src/i830_tv.c              |  177 ++++++-
 src/i965_exa_render.c      | 1108 +++++++++++++++++++++++++++++++++++++++++++++
 src/rotation_sf0.g4a       |   17 
 src/rotation_sf90.g4a      |   17 
 src/rotation_sf_prog0.h    |   17 
 src/rotation_sf_prog90.h   |   17 
 src/rotation_wm0.g4a       |  123 ++++
 src/rotation_wm90.g4a      |  127 +++++
 src/rotation_wm_prog0.h    |   68 ++
 src/rotation_wm_prog90.h   |   68 ++
 24 files changed, 3201 insertions(+), 41 deletions(-)

New commits:
diff-tree 736d82a6b43f174cb95b425faacd4b0b889916fa (from parents)
Merge: 53b42f5bc7a58d02106436486e5bb56e56dbbfa1 4c790f614ecba1f6468e51779cfaf0e36b6b17ad
Author: Wang Zhenyu <zhenyu.z.wang at intel.com>
Date:   Mon Dec 4 15:48:04 2006 +0800

    Merge branch 'modesetting-origin' into modesetting

diff-tree 53b42f5bc7a58d02106436486e5bb56e56dbbfa1 (from 71946bcdc3c68c220996afac944698eea1974a36)
Author: Zou Nan hai <nanhai.zou at intel.com>
Date:   Sat Jan 6 14:59:14 2007 -0800

    support NTSC 480i M-J, PAL 576i for 640x480-1280x1024 sizes
    
    I still have problem with non-interlace mode and Hi Res mode.
    also I don't know how to pickup those mode in xorg.conf

diff --git a/src/i830_tv.c b/src/i830_tv.c
index 5cf36a5..4a79d2e 100644
--- a/src/i830_tv.c
+++ b/src/i830_tv.c
@@ -250,6 +250,53 @@ const tv_mode_t tv_modes[] = {
 	    .ru =-0.0957, .gu =-0.1879, .bu = 0.2836, .au = 1.0000,
 	    .rv = 0.3992, .gv =-0.3343, .bv =-0.0649, .av = 1.0000,
 	},
+    },
+    {
+	/* 625 Lines, 50 Fields, 15.625KHz line, Sub-Carrier 4.434MHz */
+	.name	    = "PAL 576i",
+	.oversample	= TV_OVERSAMPLE_8X,
+
+	.hsync_end	= 64,		    .hblank_end		= 128,
+	.hblank_start	= 844,		    .htotal		= 863,
+	
+	.progressive	= FALSE,
+	
+	.vsync_start_f1	= 6,		    .vsync_start_f2	= 7,
+	.vsync_len	= 6,
+	
+	.veq_ena	= TRUE,		    .veq_start_f1    	= 0,
+	.veq_start_f2	= 1,		    .veq_len		= 18,
+
+	.vi_end_f1	= 24,		    .vi_end_f2		= 25,
+	.nbr_end	= 286,
+
+	.burst_ena	= TRUE,
+	.hburst_start	= 73,		    .hburst_len		= 34,
+	.vburst_start_f1 = 8,		    .vburst_end_f1	= 285,
+	.vburst_start_f2 = 8,		    .vburst_end_f2	= 286,
+	.vburst_start_f3 = 9,		    .vburst_end_f3	= 286, 
+	.vburst_start_f4 = 9,		    .vburst_end_f4	= 285,
+
+	/* desired 4.4336180 actual 4.4336180 clock 107.52 */
+	.dda1_inc	=    168,
+	.dda2_inc	=  18557,	.dda2_size	=  20625,
+	.dda3_inc	=      0,	.dda3_size	=      0,
+	.sc_reset   = TV_SC_RESET_EVERY_8,
+	.pal_burst  = TRUE,
+	
+	.composite_levels = { .blank = 237, .black = 237, .burst = 118 },
+	.composite_color = {
+	    .ry = 0.2990, .gy = 0.5870, .by = 0.1140, .ay = 0.5379,
+	    .ru =-0.0793, .gu =-0.1557, .bu = 0.2350, .au = 1.0000,
+	    .rv = 0.3307, .gv =-0.2769, .bv =-0.0538, .av = 1.0000,
+	},
+
+	.svideo_levels    = { .blank = 280, .black = 280, .burst = 139 },
+	.svideo_color = {
+	    .ry = 0.2990, .gy = 0.5870, .by = 0.1140, .ay = 0.6357,
+	    .ru =-0.0937, .gu =-0.1840, .bu = 0.2777, .au = 1.0000,
+	    .rv = 0.3908, .gv =-0.3273, .bv =-0.0636, .av = 1.0000,
+	},
     }
 #if 0
     {
@@ -802,22 +849,122 @@ i830_tv_mode_set(xf86OutputPtr output, D
 }
 
 static const DisplayModeRec reported_modes[] = {
-    {
-	.name = "NTSC 480i",
-	.Clock = TV_PLL_CLOCK,
-	
-	.HDisplay   = 1024,
-	.HSyncStart = 1048,
-	.HSyncEnd   = 1184,
-	.HTotal     = 1344,
-
-	.VDisplay   = 768,
-	.VSyncStart = 771,
-	.VSyncEnd   = 777,
-	.VTotal     = 806,
+	{
+		.name = "NTSC 480i",
+		.Clock = TV_PLL_CLOCK,
+		.HDisplay   = 1280,
+		.HSyncStart = 1368,
+		.HSyncEnd   = 1496,
+		.HTotal     = 1712,
+
+		.VDisplay   = 1024,
+		.VSyncStart = 1027,
+		.VSyncEnd   = 1034,
+		.VTotal     = 1104,
+		.type       = M_T_DRIVER
+	},
+	{
+		.name = "NTSC 480i",
+		.Clock = TV_PLL_CLOCK,
+		.HDisplay   = 1024,
+		.HSyncStart = 1080,
+		.HSyncEnd   = 1184,
+		.HTotal     = 1344,
+
+		.VDisplay   = 768,
+		.VSyncStart = 771,
+		.VSyncEnd   = 777,
+		.VTotal     = 806,
+		.type       = M_T_DRIVER
+	},
+	{
+		.name = "NTSC 480i",
+		.Clock = TV_PLL_CLOCK,
+		.HDisplay   = 800,
+		.HSyncStart = 832,
+		.HSyncEnd   = 912,
+		.HTotal     = 1024,
+
+		.VDisplay   = 600,
+		.VSyncStart = 603,
+		.VSyncEnd   = 607,
+		.VTotal     = 650,
+		.type       = M_T_DRIVER
+	},
+	{
+		.name = "NTSC 480i",
+		.Clock = TV_PLL_CLOCK,
+		.HDisplay   = 640,
+		.HSyncStart = 664,
+		.HSyncEnd   = 720,
+		.HTotal     = 800,
+
+		.VDisplay   = 480,
+		.VSyncStart = 483,
+		.VSyncEnd   = 487,
+		.VTotal     = 552,
+		.type       = M_T_DRIVER
+	},
+	{
+		.name = "PAL 576i",
+		.Clock = TV_PLL_CLOCK,
+		.HDisplay   = 1280,
+		.HSyncStart = 1352,
+		.HSyncEnd   = 1480,
+		.HTotal     = 1680,
+
+		.VDisplay   = 1024,
+		.VSyncStart = 1027,
+		.VSyncEnd   = 1034,
+		.VTotal     = 1092,
 
-	.type       = M_T_DRIVER
-    }
+		.type       = M_T_DRIVER
+	},
+	{
+		.name = "PAL 576i",
+		.Clock = TV_PLL_CLOCK,
+		.HDisplay   = 1024,
+		.HSyncStart = 1072,
+		.HSyncEnd   = 1168,
+		.HTotal     = 1312,
+		.VDisplay   = 768,
+		.VSyncStart = 771,
+		.VSyncEnd   = 775,
+		.VTotal     = 820,
+		.VRefresh   = 50.0f,
+
+		.type       = M_T_DRIVER
+	},
+	{
+		.name = "PAL 576i",
+		.Clock = TV_PLL_CLOCK,
+		.HDisplay   = 800,
+		.HSyncStart = 832,
+		.HSyncEnd   = 904,
+		.HTotal     = 1008,
+		.VDisplay   = 600,
+		.VSyncStart = 603,
+		.VSyncEnd   = 607,
+		.VTotal     = 642,
+		.VRefresh   = 50.0f,
+
+		.type       = M_T_DRIVER
+	},
+	{
+		.name = "PAL 576i",
+		.Clock = TV_PLL_CLOCK,
+		.HDisplay   = 640,
+		.HSyncStart = 664,
+		.HSyncEnd   = 720,
+		.HTotal     = 800,
+
+		.VDisplay   = 480,
+		.VSyncStart = 483,
+		.VSyncEnd   = 487,
+		.VTotal     = 516,
+		.VRefresh   = 50.0f,
+		.type       = M_T_DRIVER
+	},
 };
 
 /**
diff-tree 71946bcdc3c68c220996afac944698eea1974a36 (from 35cebed70827999812f8343ac97ad0dffda20786)
Author: Wang Zhenyu <zhenyu.z.wang at intel.com>
Date:   Wed Jan 3 22:37:32 2007 -0800

    [PATCH] Add rotation support for 965.

diff --git a/src/i830.h b/src/i830.h
index 3b7301e..f89d022 100644
--- a/src/i830.h
+++ b/src/i830.h
@@ -288,6 +288,7 @@ typedef struct _I830Rec {
    XF86ModReqInfo shadowReq; /* to test for later libshadow */
    I830MemRange RotatedMem;
    I830MemRange RotatedMem2;
+   I830MemRange RotateStateMem; /* for G965 state buffer */
    Rotation rotation;
    int InitialRotation;
    int displayWidth;
diff --git a/src/i830_driver.c b/src/i830_driver.c
index 6b76d12..ffa391f 100644
--- a/src/i830_driver.c
+++ b/src/i830_driver.c
@@ -2534,6 +2534,10 @@ I830ScreenInit(int scrnIndex, ScreenPtr 
       /* Rotated2 Buffer */
       memset(&(pI830->RotatedMem2), 0, sizeof(pI830->RotatedMem2));
       pI830->RotatedMem2.Key = -1;
+      if (IS_I965G(pI830)) {
+          memset(&(pI830->RotateStateMem), 0, sizeof(pI830->RotateStateMem));
+          pI830->RotateStateMem.Key = -1;
+      }
    }
 
 #ifdef HAS_MTRR_SUPPORT
@@ -2902,11 +2906,7 @@ I830ScreenInit(int scrnIndex, ScreenPtr 
       shadowSetup(pScreen);
       /* support all rotations */
       xf86RandR12Init (pScreen);
-      if (IS_I965G(pI830)) {
-	 xf86RandR12SetRotations (pScreen, RR_Rotate_0); /* only 0 degrees for I965G */
-      } else {
-	 xf86RandR12SetRotations (pScreen, RR_Rotate_0 | RR_Rotate_90 | RR_Rotate_180 | RR_Rotate_270);
-      }
+      xf86RandR12SetRotations (pScreen, RR_Rotate_0 | RR_Rotate_90 | RR_Rotate_180 | RR_Rotate_270);
       pI830->PointerMoved = pScrn->PointerMoved;
       pScrn->PointerMoved = I830PointerMoved;
       pI830->CreateScreenResources = pScreen->CreateScreenResources;
@@ -3249,8 +3249,7 @@ I830SwitchMode(int scrnIndex, DisplayMod
     * The extra WindowTable check detects a rotation at startup.
     */
    if ( (!WindowTable[pScrn->scrnIndex] || pspix->devPrivate.ptr == NULL) &&
-         !pI830->DGAactive && (pScrn->PointerMoved == I830PointerMoved) &&
-	 !IS_I965G(pI830)) {
+         !pI830->DGAactive && (pScrn->PointerMoved == I830PointerMoved)) {
       if (!I830Rotate(pScrn, mode))
          ret = FALSE;
    }
diff --git a/src/i830_memory.c b/src/i830_memory.c
index 60257b9..3a3836c 100644
--- a/src/i830_memory.c
+++ b/src/i830_memory.c
@@ -531,6 +531,28 @@ I830AllocateRotatedBuffer(ScrnInfoPtr pS
    xf86DrvMsgVerb(pScrn->scrnIndex, X_INFO, verbosity,
 		  "%sAllocated %ld kB for the rotated buffer at 0x%lx.\n", s,
 		  alloced / 1024, pI830->RotatedMem.Start);
+
+#define BRW_LINEAR_EXTRA (32*1024)
+   if (IS_I965G(pI830)) {
+       memset(&(pI830->RotateStateMem), 0, sizeof(I830MemRange));
+       pI830->RotateStateMem.Key = -1;
+       size = ROUND_TO_PAGE(BRW_LINEAR_EXTRA);
+       align = GTT_PAGE_SIZE;
+       alloced = I830AllocVidMem(pScrn, &(pI830->RotateStateMem),
+				&(pI830->StolenPool), size, align,
+				flags | FROM_ANYWHERE | ALLOCATE_AT_TOP);
+       if (alloced < size) {
+          if (!dryrun) {
+	     xf86DrvMsg(pScrn->scrnIndex, X_ERROR,
+		    "G965: Failed to allocate rotate state buffer space.\n");
+          }
+          return FALSE;
+       }
+       xf86DrvMsgVerb(pScrn->scrnIndex, X_INFO, verbosity,
+		  "%sAllocated %ld kB for the G965 rotate state buffer at 0x%lx - 0x%lx.\n", s, 
+		alloced / 1024, pI830->RotateStateMem.Start, pI830->RotateStateMem.End);
+   }
+  
    return TRUE;
 }
 
@@ -1743,8 +1765,13 @@ I830SetupMemoryTiling(ScrnInfoPtr pScrn)
    int i;
 
    /* Clear out */
-   for (i = 0; i < 8; i++)
-      pI830->ModeReg.Fence[i] = 0;
+   if (IS_I965G(pI830)) {
+      for (i = 0; i < FENCE_NEW_NR*2; i++)
+	 pI830->ModeReg.Fence[i] = 0;
+   } else {
+      for (i = 0; i < 8; i++)
+         pI830->ModeReg.Fence[i] = 0;
+   }
 
    nextTile = 0;
    tileGeneration = -1;
@@ -1814,6 +1841,9 @@ I830SetupMemoryTiling(ScrnInfoPtr pScrn)
       }
    }
 	
+/* XXX tiled rotate mem not ready on G965*/
+ 
+  if(!IS_I965G(pI830)) {
    if (pI830->RotatedMem.Alignment >= KB(512)) {
       if (MakeTiles(pScrn, &(pI830->RotatedMem), FENCE_XMAJOR)) {
 	 xf86DrvMsg(pScrn->scrnIndex, X_INFO,
@@ -1824,7 +1854,7 @@ I830SetupMemoryTiling(ScrnInfoPtr pScrn)
 		    "MakeTiles failed for the rotated buffer.\n");
       }
    }
-
+  }
 #if 0
    if (pI830->RotatedMem2.Alignment >= KB(512)) {
       if (MakeTiles(pScrn, &(pI830->RotatedMem2), FENCE_XMAJOR)) {
diff --git a/src/i830_rotate.c b/src/i830_rotate.c
index 9fa3290..b2587b2 100644
--- a/src/i830_rotate.c
+++ b/src/i830_rotate.c
@@ -60,6 +60,8 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN
 #include "i830.h"
 #include "i915_reg.h"
 #include "i915_3d.h"
+#include "brw_defines.h"
+#include "brw_structs.h"
 
 #ifdef XF86DRI
 #include "dri.h"
@@ -194,6 +196,718 @@ static void draw_poly(CARD32 *vb,
    }
 }
 
+
+/* Our PS kernel uses less than 32 GRF registers (about 20) */
+#define PS_KERNEL_NUM_GRF   32
+#define PS_MAX_THREADS	   32
+
+#define BRW_GRF_BLOCKS(nreg)	((nreg + 15) / 16 - 1)
+
+static const CARD32 ps_kernel_static0[][4] = {
+#include "rotation_wm_prog0.h"
+};
+
+static const CARD32 ps_kernel_static90[][4] = {
+#include "rotation_wm_prog90.h"
+};
+
+#define ALIGN(i,m)    (((i) + (m) - 1) & ~((m) - 1))
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define BRW_LINEAR_EXTRA (32*1024)
+#define WM_BINDING_TABLE_ENTRIES    2
+
+static const CARD32 sip_kernel_static[][4] = {
+/*    wait (1) a0<1>UW a145<0,1,0>UW { align1 +  } */
+    { 0x00000030, 0x20000108, 0x00001220, 0x00000000 },
+/*    nop (4) g0<1>UD { align1 +  } */
+    { 0x0040007e, 0x20000c21, 0x00690000, 0x00000000 },
+/*    nop (4) g0<1>UD { align1 +  } */
+    { 0x0040007e, 0x20000c21, 0x00690000, 0x00000000 },
+/*    nop (4) g0<1>UD { align1 +  } */
+    { 0x0040007e, 0x20000c21, 0x00690000, 0x00000000 },
+/*    nop (4) g0<1>UD { align1 +  } */
+    { 0x0040007e, 0x20000c21, 0x00690000, 0x00000000 },
+/*    nop (4) g0<1>UD { align1 +  } */
+    { 0x0040007e, 0x20000c21, 0x00690000, 0x00000000 },
+/*    nop (4) g0<1>UD { align1 +  } */
+    { 0x0040007e, 0x20000c21, 0x00690000, 0x00000000 },
+/*    nop (4) g0<1>UD { align1 +  } */
+    { 0x0040007e, 0x20000c21, 0x00690000, 0x00000000 },
+/*    nop (4) g0<1>UD { align1 +  } */
+    { 0x0040007e, 0x20000c21, 0x00690000, 0x00000000 },
+/*    nop (4) g0<1>UD { align1 +  } */
+    { 0x0040007e, 0x20000c21, 0x00690000, 0x00000000 },
+};
+  
+#define SF_KERNEL_NUM_GRF  16
+#define SF_MAX_THREADS	   1
+
+static const CARD32 sf_kernel_static0[][4] = {
+#include "rotation_sf_prog0.h"
+};
+
+
+static const CARD32 sf_kernel_static90[][4] = {
+#include "rotation_sf_prog90.h"
+};
+
+static void
+I965UpdateRotate (ScreenPtr      pScreen,
+                 shadowBufPtr   pBuf)
+{
+   ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum];
+   I830Ptr pI830 = I830PTR(pScrn);
+   ScrnInfoPtr pScrn1 = pScrn;
+   I830Ptr pI8301 = NULL;
+   RegionPtr	damage = shadowDamage(pBuf);
+   int		nbox = REGION_NUM_RECTS (damage);
+   BoxPtr	pbox = REGION_RECTS (damage);
+   int		box_x1, box_x2, box_y1, box_y2;
+   float verts[4][2];
+   struct matrix23 rotMatrix;
+   Bool updateInvarient = FALSE;
+#ifdef XF86DRI
+   drmI830Sarea *sarea = NULL;
+   drm_context_t myContext = 0;
+#endif
+   Bool didLock = FALSE;
+
+/* Gen4 states */
+   int urb_vs_start, urb_vs_size;
+   int urb_gs_start, urb_gs_size;
+   int urb_clip_start, urb_clip_size;
+   int urb_sf_start, urb_sf_size;
+   int urb_cs_start, urb_cs_size;
+   struct brw_surface_state *dest_surf_state;
+   struct brw_surface_state *src_surf_state;
+   struct brw_sampler_state *src_sampler_state;
+   struct brw_vs_unit_state *vs_state;
+   struct brw_sf_unit_state *sf_state;
+   struct brw_wm_unit_state *wm_state;
+   struct brw_cc_unit_state *cc_state;
+   struct brw_cc_viewport *cc_viewport;
+   struct brw_instruction *sf_kernel;
+   struct brw_instruction *ps_kernel;
+   struct brw_instruction *sip_kernel;
+   float *vb;  
+   BOOL first_output = TRUE;
+   CARD32 *binding_table;
+   int dest_surf_offset, src_surf_offset, src_sampler_offset, vs_offset;
+   int sf_offset, wm_offset, cc_offset, vb_offset, cc_viewport_offset;
+   int wm_scratch_offset;
+   int sf_kernel_offset, ps_kernel_offset, sip_kernel_offset;
+   int binding_table_offset;
+   int next_offset, total_state_size;
+   int vb_size = (4 * 4) * 4; /* 4 DWORDS per vertex */
+   char *state_base;
+   int state_base_offset;
+
+   DPRINTF(PFX, "I965UpdateRotate: from (%d x %d) -> (%d x %d)\n",	
+		pScrn->virtualX, pScrn->virtualY, pScreen->width, pScreen->height);
+
+   if (I830IsPrimary(pScrn)) {
+      pI8301 = pI830;
+   } else {
+      pI8301 = I830PTR(pI830->entityPrivate->pScrn_1);
+      pScrn1 = pI830->entityPrivate->pScrn_1;
+   }
+
+   switch (pI830->rotation) {
+      case RR_Rotate_90:
+         matrix23Rotate(&rotMatrix,
+                     pScreen->width, pScreen->height,
+                     90);
+	 break;
+      case RR_Rotate_180:
+         matrix23Rotate(&rotMatrix,
+                     pScreen->width, pScreen->height,
+                     180);
+	 break;
+      case RR_Rotate_270:
+         matrix23Rotate(&rotMatrix,
+                     pScreen->width, pScreen->height,
+                     270);
+	 break;
+      default:
+	 break;
+   }
+
+#ifdef XF86DRI
+   if (pI8301->directRenderingEnabled) {
+      sarea = DRIGetSAREAPrivate(pScrn1->pScreen);
+      myContext = DRIGetContext(pScrn1->pScreen);
+      didLock = I830DRILock(pScrn1);
+   }
+#endif
+
+   if (pScrn->scrnIndex != *pI830->used3D) 
+      updateInvarient = TRUE;
+ 
+#ifdef XF86DRI
+   if (sarea && sarea->ctxOwner != myContext)
+      updateInvarient = TRUE;
+#endif
+
+   /*XXX we'll always update state */
+   *pI830->used3D = pScrn->scrnIndex;
+#ifdef XF86DRI
+   if (sarea)
+      sarea->ctxOwner = myContext;
+#endif
+
+   /* this starts initialize 3D engine for rotation mapping*/
+   next_offset = 0;
+
+   /* Set up our layout of state in framebuffer.  First the general state: */
+   vs_offset = ALIGN(next_offset, 64);
+   next_offset = vs_offset + sizeof(*vs_state);
+   sf_offset = ALIGN(next_offset, 32);
+   next_offset = sf_offset + sizeof(*sf_state);
+   wm_offset = ALIGN(next_offset, 32);
+   next_offset = wm_offset + sizeof(*wm_state);
+   wm_scratch_offset = ALIGN(next_offset, 1024);
+   next_offset = wm_scratch_offset + 1024 * PS_MAX_THREADS;
+   cc_offset = ALIGN(next_offset, 32);
+   next_offset = cc_offset + sizeof(*cc_state);
+
+   sf_kernel_offset = ALIGN(next_offset, 64);
+      
+   switch (pI830->rotation) {
+       case RR_Rotate_90:
+       case RR_Rotate_270:
+      	    next_offset = sf_kernel_offset + sizeof (sf_kernel_static90);
+      	    ps_kernel_offset = ALIGN(next_offset, 64);
+      	    next_offset = ps_kernel_offset + sizeof (ps_kernel_static90);
+	    break;
+       case RR_Rotate_180:
+       default:
+      	    next_offset = sf_kernel_offset + sizeof (sf_kernel_static0);
+      	    ps_kernel_offset = ALIGN(next_offset, 64);
+      	    next_offset = ps_kernel_offset + sizeof (ps_kernel_static0);
+	    break;
+   }
+
+   sip_kernel_offset = ALIGN(next_offset, 64);
+   next_offset = sip_kernel_offset + sizeof (sip_kernel_static);
+   cc_viewport_offset = ALIGN(next_offset, 32);
+   next_offset = cc_viewport_offset + sizeof(*cc_viewport);
+
+   src_sampler_offset = ALIGN(next_offset, 32);
+   next_offset = src_sampler_offset + sizeof(*src_sampler_state);
+
+   /* Align VB to native size of elements, for safety */
+   vb_offset = ALIGN(next_offset, 8);
+   next_offset = vb_offset + vb_size;
+
+   dest_surf_offset = ALIGN(next_offset, 32);
+   next_offset = dest_surf_offset + sizeof(*dest_surf_state);
+   src_surf_offset = ALIGN(next_offset, 32);
+   next_offset = src_surf_offset + sizeof(*src_surf_state);
+   binding_table_offset = ALIGN(next_offset, 32);
+   next_offset = binding_table_offset + (WM_BINDING_TABLE_ENTRIES * 4);
+
+   total_state_size = next_offset;
+   assert (total_state_size < BRW_LINEAR_EXTRA);
+
+   state_base_offset = pI830->RotateStateMem.Start;
+   state_base_offset = ALIGN(state_base_offset, 64);
+   state_base = (char *)(pI830->FbBase + state_base_offset);
+   DPRINTF(PFX, "rotate state buffer start 0x%x, addr 0x%x, base 0x%x\n",
+			pI830->RotateStateMem.Start, state_base, pI830->FbBase);
+
+   vs_state = (void *)(state_base + vs_offset);
+   sf_state = (void *)(state_base + sf_offset);
+   wm_state = (void *)(state_base + wm_offset);
+   cc_state = (void *)(state_base + cc_offset);
+   sf_kernel = (void *)(state_base + sf_kernel_offset);
+   ps_kernel = (void *)(state_base + ps_kernel_offset);
+   sip_kernel = (void *)(state_base + sip_kernel_offset);
+   
+   cc_viewport = (void *)(state_base + cc_viewport_offset);
+   dest_surf_state = (void *)(state_base + dest_surf_offset);
+   src_surf_state = (void *)(state_base + src_surf_offset);
+   src_sampler_state = (void *)(state_base + src_sampler_offset);
+   binding_table = (void *)(state_base + binding_table_offset);
+   vb = (void *)(state_base + vb_offset);
+
+   /* For 3D, the VS must have 8, 12, 16, 24, or 32 VUEs allocated to it.
+    * A VUE consists of a 256-bit vertex header followed by the vertex data,
+    * which in our case is 4 floats (128 bits), thus a single 512-bit URB
+    * entry.
+    */
+#define URB_VS_ENTRIES	      8
+#define URB_VS_ENTRY_SIZE     1
+   
+#define URB_GS_ENTRIES	      0
+#define URB_GS_ENTRY_SIZE     0
+   
+#define URB_CLIP_ENTRIES      0
+#define URB_CLIP_ENTRY_SIZE   0
+   
+   /* The SF kernel we use outputs only 4 256-bit registers, leading to an
+    * entry size of 2 512-bit URBs.  We don't need to have many entries to
+    * output as we're generally working on large rectangles and don't care
+    * about having WM threads running on different rectangles simultaneously.
+    */
+#define URB_SF_ENTRIES	      1
+#define URB_SF_ENTRY_SIZE     2
+
+#define URB_CS_ENTRIES	      0
+#define URB_CS_ENTRY_SIZE     0
+   
+   urb_vs_start = 0;
+   urb_vs_size = URB_VS_ENTRIES * URB_VS_ENTRY_SIZE;
+   urb_gs_start = urb_vs_start + urb_vs_size;
+   urb_gs_size = URB_GS_ENTRIES * URB_GS_ENTRY_SIZE;
+   urb_clip_start = urb_gs_start + urb_gs_size;
+   urb_clip_size = URB_CLIP_ENTRIES * URB_CLIP_ENTRY_SIZE;
+   urb_sf_start = urb_clip_start + urb_clip_size;
+   urb_sf_size = URB_SF_ENTRIES * URB_SF_ENTRY_SIZE;
+   urb_cs_start = urb_sf_start + urb_sf_size;
+   urb_cs_size = URB_CS_ENTRIES * URB_CS_ENTRY_SIZE;
+
+   memset (cc_viewport, 0, sizeof (*cc_viewport));
+   cc_viewport->min_depth = -1.e35;
+   cc_viewport->max_depth = 1.e35;
+
+   memset(cc_state, 0, sizeof(*cc_state));
+   cc_state->cc0.stencil_enable = 0;   /* disable stencil */
+   cc_state->cc2.depth_test = 0;       /* disable depth test */
+   cc_state->cc2.logicop_enable = 1;   /* enable logic op */
+   cc_state->cc3.ia_blend_enable = 1;  /* blend alpha just like colors */
+   cc_state->cc3.blend_enable = 0;     /* disable color blend */
+   cc_state->cc3.alpha_test = 0;       /* disable alpha test */
+   cc_state->cc4.cc_viewport_state_offset = (state_base_offset + cc_viewport_offset) >> 5;
+   cc_state->cc5.dither_enable = 0;    /* disable dither */
+   cc_state->cc5.logicop_func = 0xc;   /* COPY S*/
+   cc_state->cc5.statistics_enable = 1;
+   cc_state->cc5.ia_blend_function = BRW_BLENDFUNCTION_ADD;
+   cc_state->cc5.ia_src_blend_factor = BRW_BLENDFACTOR_ONE;
+   cc_state->cc5.ia_dest_blend_factor = BRW_BLENDFACTOR_ZERO;
+
+   /* Upload system kernel */
+   memcpy (sip_kernel, sip_kernel_static, sizeof (sip_kernel_static));
+   
+   memset(dest_surf_state, 0, sizeof(*dest_surf_state));
+   dest_surf_state->ss0.surface_type = BRW_SURFACE_2D;
+   dest_surf_state->ss0.data_return_format = BRW_SURFACERETURNFORMAT_FLOAT32;
+   if (pI8301->cpp == 2)
+      dest_surf_state->ss0.surface_format = BRW_SURFACEFORMAT_B5G6R5_UNORM;
+   else
+      dest_surf_state->ss0.surface_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
+   dest_surf_state->ss0.writedisable_alpha = 0;
+   dest_surf_state->ss0.writedisable_red = 0;
+   dest_surf_state->ss0.writedisable_green = 0;
+   dest_surf_state->ss0.writedisable_blue = 0;
+   dest_surf_state->ss0.color_blend = 0;
+   dest_surf_state->ss0.vert_line_stride = 0;
+   dest_surf_state->ss0.vert_line_stride_ofs = 0;
+   dest_surf_state->ss0.mipmap_layout_mode = 0;
+   dest_surf_state->ss0.render_cache_read_mode = 0;
+   
+   if (I830IsPrimary(pScrn))
+      dest_surf_state->ss1.base_addr = pI830->FrontBuffer.Start;
+   else 
+      dest_surf_state->ss1.base_addr = pI8301->FrontBuffer2.Start;
+   dest_surf_state->ss2.width = pScrn->virtualX - 1;
+   dest_surf_state->ss2.height = pScrn->virtualY - 1; 
+   dest_surf_state->ss2.mip_count = 0;
+   dest_surf_state->ss2.render_target_rotation = 0; /*XXX how to use? */
+   dest_surf_state->ss3.pitch = (pI830->displayWidth * pI830->cpp) - 1;
+   if (pI830->front_tiled) {
+      dest_surf_state->ss3.tiled_surface = 1;
+      dest_surf_state->ss3.tile_walk = 0; /* X major */
+   }
+
+   memset(src_surf_state, 0, sizeof(*src_surf_state));
+   src_surf_state->ss0.surface_type = BRW_SURFACE_2D;
+/* src_surf_state->ss0.data_return_format = BRW_SURFACERETURNFORMAT_FLOAT32;*/
+   if (pI8301->cpp == 2) 
+      src_surf_state->ss0.surface_format = BRW_SURFACEFORMAT_B5G6R5_UNORM;
+   else 
+      src_surf_state->ss0.surface_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
+   src_surf_state->ss0.writedisable_alpha = 0;
+   src_surf_state->ss0.writedisable_red = 0;
+   src_surf_state->ss0.writedisable_green = 0;
+   src_surf_state->ss0.writedisable_blue = 0;
+   src_surf_state->ss0.color_blend = 0;
+   src_surf_state->ss0.vert_line_stride = 0;
+   src_surf_state->ss0.vert_line_stride_ofs = 0;
+   src_surf_state->ss0.mipmap_layout_mode = 0;
+   src_surf_state->ss0.render_cache_read_mode = 0;
+  
+   if (I830IsPrimary(pScrn)) 
+      src_surf_state->ss1.base_addr = pI830->RotatedMem.Start;
+   else 
+      src_surf_state->ss1.base_addr = pI8301->RotatedMem2.Start;
+   src_surf_state->ss2.width = pScreen->width - 1;
+   src_surf_state->ss2.height = pScreen->height - 1;
+   src_surf_state->ss2.mip_count = 0;
+   src_surf_state->ss2.render_target_rotation = 0;
+   src_surf_state->ss3.pitch = (pScrn->displayWidth * pI830->cpp) - 1;
+   if (pI830->rotated_tiled) {
+      src_surf_state->ss3.tiled_surface = 1;
+      src_surf_state->ss3.tile_walk = 0; /* X major */
+   }
+
+   binding_table[0] = state_base_offset + dest_surf_offset;
+   binding_table[1] = state_base_offset + src_surf_offset;
+
+   memset(src_sampler_state, 0, sizeof(*src_sampler_state));
+   src_sampler_state->ss0.min_filter = BRW_MAPFILTER_LINEAR;
+   src_sampler_state->ss0.mag_filter = BRW_MAPFILTER_LINEAR;
+   src_sampler_state->ss1.r_wrap_mode = BRW_TEXCOORDMODE_CLAMP;
+   src_sampler_state->ss1.s_wrap_mode = BRW_TEXCOORDMODE_CLAMP;
+   src_sampler_state->ss1.t_wrap_mode = BRW_TEXCOORDMODE_CLAMP;
+
+   /* Set up the vertex shader to be disabled (passthrough) */
+   memset(vs_state, 0, sizeof(*vs_state));
+   vs_state->thread4.nr_urb_entries = URB_VS_ENTRIES;
+   vs_state->thread4.urb_entry_allocation_size = URB_VS_ENTRY_SIZE - 1;
+   vs_state->vs6.vs_enable = 0;
+   vs_state->vs6.vert_cache_disable = 1;
+
+   /* Set up the SF kernel to do coord interp: for each attribute,
+    * calculate dA/dx and dA/dy.  Hand these interpolation coefficients
+    * back to SF which then hands pixels off to WM.
+    */
+
+   switch (pI830->rotation) {
+      case RR_Rotate_90:
+      case RR_Rotate_270:
+           memcpy (sf_kernel, sf_kernel_static90, sizeof (sf_kernel_static90));
+           memcpy (ps_kernel, ps_kernel_static90, sizeof (ps_kernel_static90));
+	   break;
+      case RR_Rotate_180:
+      default:
+           memcpy (sf_kernel, sf_kernel_static0, sizeof (sf_kernel_static0));
+           memcpy (ps_kernel, ps_kernel_static0, sizeof (ps_kernel_static0));
+	   break;
+   }
+
+   memset(sf_state, 0, sizeof(*sf_state));
+   sf_state->thread0.kernel_start_pointer = 
+	          (state_base_offset + sf_kernel_offset) >> 6;
+   sf_state->thread0.grf_reg_count = BRW_GRF_BLOCKS(SF_KERNEL_NUM_GRF);
+   sf_state->sf1.single_program_flow = 1; /* XXX */
+   sf_state->sf1.binding_table_entry_count = 0;
+   sf_state->sf1.thread_priority = 0;
+   sf_state->sf1.floating_point_mode = 0; 
+   sf_state->sf1.illegal_op_exception_enable = 1;
+   sf_state->sf1.mask_stack_exception_enable = 1;
+   sf_state->sf1.sw_exception_enable = 1;
+   sf_state->thread2.per_thread_scratch_space = 0;
+   sf_state->thread2.scratch_space_base_pointer = 0; /* not used in our kernel */
+   sf_state->thread3.const_urb_entry_read_length = 0; /* no const URBs */
+   sf_state->thread3.const_urb_entry_read_offset = 0; /* no const URBs */
+   sf_state->thread3.urb_entry_read_length = 1; /* 1 URB per vertex */
+   sf_state->thread3.urb_entry_read_offset = 0;
+   sf_state->thread3.dispatch_grf_start_reg = 3;
+   sf_state->thread4.max_threads = SF_MAX_THREADS - 1;
+   sf_state->thread4.urb_entry_allocation_size = URB_SF_ENTRY_SIZE - 1;
+   sf_state->thread4.nr_urb_entries = URB_SF_ENTRIES;
+   sf_state->thread4.stats_enable = 1;
+   sf_state->sf5.viewport_transform = FALSE; /* skip viewport */
+   sf_state->sf6.cull_mode = BRW_CULLMODE_NONE;
+   sf_state->sf6.scissor = 0;
+   sf_state->sf7.trifan_pv = 2;
+   sf_state->sf6.dest_org_vbias = 0x8;
+   sf_state->sf6.dest_org_hbias = 0x8;
+
+   memset (wm_state, 0, sizeof (*wm_state));
+   wm_state->thread0.kernel_start_pointer = 
+	       (state_base_offset + ps_kernel_offset) >> 6;
+   wm_state->thread0.grf_reg_count = BRW_GRF_BLOCKS(PS_KERNEL_NUM_GRF);
+   wm_state->thread1.single_program_flow = 1; /* XXX */
+   wm_state->thread1.binding_table_entry_count = 2;
+   /* Though we never use the scratch space in our WM kernel, it has to be
+    * set, and the minimum allocation is 1024 bytes.
+    */
+   wm_state->thread2.scratch_space_base_pointer = (state_base_offset +
+						   wm_scratch_offset) >> 10;
+   wm_state->thread2.per_thread_scratch_space = 0; /* 1024 bytes */
+   wm_state->thread3.dispatch_grf_start_reg = 3;
+   wm_state->thread3.const_urb_entry_read_length = 0;
+   wm_state->thread3.const_urb_entry_read_offset = 0;
+   wm_state->thread3.urb_entry_read_length = 1;
+   wm_state->thread3.urb_entry_read_offset = 0;
+   wm_state->wm4.stats_enable = 1;
+   wm_state->wm4.sampler_state_pointer = (state_base_offset + src_sampler_offset) >> 5;
+   wm_state->wm4.sampler_count = 1; /* 1-4 samplers used */
+   wm_state->wm5.max_threads = PS_MAX_THREADS - 1;
+   wm_state->wm5.thread_dispatch_enable = 1;
+   wm_state->wm5.enable_16_pix = 1;
+   wm_state->wm5.enable_8_pix = 0;
+   wm_state->wm5.early_depth_test = 1;
+
+   
+   {
+         BEGIN_LP_RING(2);
+         OUT_RING(MI_FLUSH | 
+	          MI_STATE_INSTRUCTION_CACHE_FLUSH |
+	          BRW_MI_GLOBAL_SNAPSHOT_RESET);
+         OUT_RING(MI_NOOP);
+         ADVANCE_LP_RING();
+    }
+
+    {
+         BEGIN_LP_RING(12);
+         OUT_RING(BRW_PIPELINE_SELECT | PIPELINE_SELECT_3D);
+
+   /* Mesa does this. Who knows... */
+         OUT_RING(BRW_CS_URB_STATE | 0);
+         OUT_RING((0 << 4) |	/* URB Entry Allocation Size */
+	          (0 << 0));	/* Number of URB Entries */
+   
+   /* Zero out the two base address registers so all offsets are absolute */
+         OUT_RING(BRW_STATE_BASE_ADDRESS | 4);
+         OUT_RING(0 | BASE_ADDRESS_MODIFY);  /* Generate state base address */
+         OUT_RING(0 | BASE_ADDRESS_MODIFY);  /* Surface state base address */
+         OUT_RING(0 | BASE_ADDRESS_MODIFY);  /* media base addr, don't care */
+         OUT_RING(0x10000000 | BASE_ADDRESS_MODIFY);  /* general state max addr, disabled */
+         OUT_RING(0x10000000 | BASE_ADDRESS_MODIFY);  /* media object state max addr, disabled */
+
+   /* Set system instruction pointer */
+         OUT_RING(BRW_STATE_SIP | 0);
+         OUT_RING(state_base_offset + sip_kernel_offset); /* system instruction pointer */
+      
+         OUT_RING(MI_NOOP);
+         ADVANCE_LP_RING(); 
+    }
+   
+
+    { 
+         BEGIN_LP_RING(36);
+   /* Enable VF statistics */
+         OUT_RING(BRW_3DSTATE_VF_STATISTICS | 1);
+   
+   /* Pipe control */
+         OUT_RING(BRW_PIPE_CONTROL |
+	          BRW_PIPE_CONTROL_NOWRITE |
+	          BRW_PIPE_CONTROL_IS_FLUSH |
+	          2);
+         OUT_RING(0);			       /* Destination address */
+         OUT_RING(0);			       /* Immediate data low DW */
+         OUT_RING(0);			       /* Immediate data high DW */
+
+   /* Binding table pointers */
+         OUT_RING(BRW_3DSTATE_BINDING_TABLE_POINTERS | 4);
+         OUT_RING(0); /* vs */
+         OUT_RING(0); /* gs */
+         OUT_RING(0); /* clip */
+         OUT_RING(0); /* sf */
+   /* Only the PS uses the binding table */
+         OUT_RING(state_base_offset + binding_table_offset); /* ps */
+   
+   /* XXX: Blend constant color (magenta is fun) */
+         //OUT_RING(BRW_3DSTATE_CONSTANT_COLOR | 3);
+         //OUT_RING(float_to_uint (1.0));
+         //OUT_RING(float_to_uint (0.0));
+         //OUT_RING(float_to_uint (1.0));
+         //OUT_RING(float_to_uint (1.0));
+   
+   /* The drawing rectangle clipping is always on.  Set it to values that
+    * shouldn't do any clipping.
+    */
+         OUT_RING(BRW_3DSTATE_DRAWING_RECTANGLE | 2);	/* XXX 3 for BLC or CTG */
+         OUT_RING(0x00000000);	/* ymin, xmin */
+         OUT_RING((pScrn->virtualX - 1) |
+	          (pScrn->virtualY - 1) << 16); /* ymax, xmax */
+         OUT_RING(0x00000000);	/* yorigin, xorigin */
+
+   /* skip the depth buffer */
+   /* skip the polygon stipple */
+   /* skip the polygon stipple offset */
+   /* skip the line stipple */
+   
+   /* Set the pointers to the 3d pipeline state */
+         OUT_RING(BRW_3DSTATE_PIPELINED_POINTERS | 5);
+         OUT_RING(state_base_offset + vs_offset);  /* 32 byte aligned */
+         OUT_RING(BRW_GS_DISABLE);		     /* disable GS, resulting in passthrough */
+         OUT_RING(BRW_CLIP_DISABLE);		     /* disable CLIP, resulting in passthrough */
+         OUT_RING(state_base_offset + sf_offset);  /* 32 byte aligned */
+         OUT_RING(state_base_offset + wm_offset);  /* 32 byte aligned */
+         OUT_RING(state_base_offset + cc_offset);  /* 64 byte aligned */
+
+   /* URB fence */
+         OUT_RING(BRW_URB_FENCE |
+	          UF0_CS_REALLOC |
+	          UF0_SF_REALLOC |
+	          UF0_CLIP_REALLOC |
+	          UF0_GS_REALLOC |
+	          UF0_VS_REALLOC |
+	    	  1);
+         OUT_RING(((urb_clip_start + urb_clip_size) << UF1_CLIP_FENCE_SHIFT) |
+	          ((urb_gs_start + urb_gs_size) << UF1_GS_FENCE_SHIFT) |
+	          ((urb_vs_start + urb_vs_size) << UF1_VS_FENCE_SHIFT));
+         OUT_RING(((urb_cs_start + urb_cs_size) << UF2_CS_FENCE_SHIFT) |
+	          ((urb_sf_start + urb_sf_size) << UF2_SF_FENCE_SHIFT));
+
+   /* Constant buffer state */
+         OUT_RING(BRW_CS_URB_STATE | 0);
+         OUT_RING(((URB_CS_ENTRY_SIZE - 1) << 4) | /* URB Entry Allocation Size */
+	          (URB_CS_ENTRIES << 0));	     /* Number of URB Entries */
+   
+   /* Set up the pointer to our vertex buffer */
+         OUT_RING(BRW_3DSTATE_VERTEX_BUFFERS | 2);
+         OUT_RING((0 << VB0_BUFFER_INDEX_SHIFT) |
+	          VB0_VERTEXDATA |
+	          ((4 * 4) << VB0_BUFFER_PITCH_SHIFT)); /* four 32-bit floats per vertex */
+         OUT_RING(state_base_offset + vb_offset);
+         OUT_RING(3); /* four corners to our rectangle */
+
+   /* Set up our vertex elements, sourced from the single vertex buffer. */
+         OUT_RING(BRW_3DSTATE_VERTEX_ELEMENTS | 3);
+   /* offset 0: X,Y -> {X, Y, 1.0, 1.0} */
+         OUT_RING((0 << VE0_VERTEX_BUFFER_INDEX_SHIFT) |
+	          VE0_VALID |
+	          (BRW_SURFACEFORMAT_R32G32_FLOAT << VE0_FORMAT_SHIFT) |
+	          (0 << VE0_OFFSET_SHIFT));
+         OUT_RING((BRW_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT) |
+	          (BRW_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT) |
+	          (BRW_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT) |
+	          (BRW_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT) |
+	          (0 << VE1_DESTINATION_ELEMENT_OFFSET_SHIFT));
+   /* offset 8: S0, T0 -> {S0, T0, 1.0, 1.0} */
+         OUT_RING((0 << VE0_VERTEX_BUFFER_INDEX_SHIFT) |
+	          VE0_VALID |
+	          (BRW_SURFACEFORMAT_R32G32_FLOAT << VE0_FORMAT_SHIFT) |
+	          (8 << VE0_OFFSET_SHIFT));
+         OUT_RING((BRW_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT) |
+	          (BRW_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT) |
+	          (BRW_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT) |
+	          (BRW_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT) |
+	          (4 << VE1_DESTINATION_ELEMENT_OFFSET_SHIFT));
+
+         //OUT_RING(MI_NOOP);			/* pad to quadword */
+         ADVANCE_LP_RING(); 
+   }
+
+   {
+      BEGIN_LP_RING(2);
+      OUT_RING(MI_FLUSH | 
+	       MI_STATE_INSTRUCTION_CACHE_FLUSH |
+	       BRW_MI_GLOBAL_SNAPSHOT_RESET);
+      OUT_RING(MI_NOOP);
+      ADVANCE_LP_RING();
+   }
+
+   while (nbox--)
+   {
+      float src_scale_x, src_scale_y;
+      int i;
+      box_x1 = pbox->x1;
+      box_y1 = pbox->y1;
+      box_x2 = pbox->x2;
+      box_y2 = pbox->y2;
+
+      if (!first_output) {
+	 /* Since we use the same little vertex buffer over and over, sync for
+	  * subsequent rectangles.
+	  */
+	 if (pI830->AccelInfoRec && pI830->AccelInfoRec->NeedToSync) {
+	    (*pI830->AccelInfoRec->Sync)(pScrn);
+	    pI830->AccelInfoRec->NeedToSync = FALSE;
+	 }
+      }
+
+      pbox++;
+
+      verts[0][0] = box_x1; verts[0][1] = box_y1;
+      verts[1][0] = box_x2; verts[1][1] = box_y1;
+      verts[2][0] = box_x2; verts[2][1] = box_y2;
+      verts[3][0] = box_x1; verts[3][1] = box_y2;
+
+      /* transform coordinates to rotated versions, but leave texcoords unchanged */
+      for (i = 0; i < 4; i++)
+         matrix23TransformCoordf(&rotMatrix, &verts[i][0], &verts[i][1]);
+
+      src_scale_x = (float)1.0 / (float)pScreen->width;
+      src_scale_y = (float)1.0 / (float)pScreen->height;
+      i = 0;
+
+      DPRINTF(PFX, "box size (%d, %d) -> (%d, %d)\n", 
+			box_x1, box_y1, box_x2, box_y2);
+
+      switch (pI830->rotation) {
+         case RR_Rotate_90:
+      	    vb[i++] = (float)box_x1 * src_scale_x;
+      	    vb[i++] = (float)box_y2 * src_scale_y;
+      	    vb[i++] = verts[3][0];
+      	    vb[i++] = verts[3][1];
+
+      	    vb[i++] = (float)box_x1 * src_scale_x;
+      	    vb[i++] = (float)box_y1 * src_scale_y;
+      	    vb[i++] = verts[0][0];
+      	    vb[i++] = verts[0][1];
+
+      	    vb[i++] = (float)box_x2 * src_scale_x;
+      	    vb[i++] = (float)box_y1 * src_scale_y;
+      	    vb[i++] = verts[1][0];
+      	    vb[i++] = verts[1][1];
+	    break;
+         case RR_Rotate_270:
+      	    vb[i++] = (float)box_x2 * src_scale_x;
+      	    vb[i++] = (float)box_y1 * src_scale_y;
+      	    vb[i++] = verts[1][0];
+      	    vb[i++] = verts[1][1];
+
+      	    vb[i++] = (float)box_x2 * src_scale_x;
+      	    vb[i++] = (float)box_y2 * src_scale_y;
+      	    vb[i++] = verts[2][0];
+      	    vb[i++] = verts[2][1];
+
+      	    vb[i++] = (float)box_x1 * src_scale_x;
+      	    vb[i++] = (float)box_y2 * src_scale_y;
+      	    vb[i++] = verts[3][0];
+      	    vb[i++] = verts[3][1];
+	    break;
+	 case RR_Rotate_180:
+       	 default:
+      	    vb[i++] = (float)box_x1 * src_scale_x;
+      	    vb[i++] = (float)box_y1 * src_scale_y;
+      	    vb[i++] = verts[0][0];
+      	    vb[i++] = verts[0][1];
+
+            vb[i++] = (float)box_x2 * src_scale_x;
+      	    vb[i++] = (float)box_y1 * src_scale_y;
+      	    vb[i++] = verts[1][0];
+      	    vb[i++] = verts[1][1];
+
+      	    vb[i++] = (float)box_x2 * src_scale_x;
+      	    vb[i++] = (float)box_y2 * src_scale_y;
+      	    vb[i++] = verts[2][0];
+      	    vb[i++] = verts[2][1];
+	    break;
+      }
+
+      BEGIN_LP_RING(6);
+      OUT_RING(BRW_3DPRIMITIVE | 
+	       BRW_3DPRIMITIVE_VERTEX_SEQUENTIAL |
+	       (_3DPRIM_RECTLIST << BRW_3DPRIMITIVE_TOPOLOGY_SHIFT) | 
+	       (0 << 9) |  /* CTG - indirect vertex count */
+	       4);
+      OUT_RING(3); /* vertex count per instance */
+      OUT_RING(0); /* start vertex offset */
+      OUT_RING(1); /* single instance */
+      OUT_RING(0); /* start instance location */
+      OUT_RING(0); /* index buffer offset, ignored */
+      ADVANCE_LP_RING();
+
+      first_output = FALSE;
+      if (pI830->AccelInfoRec)
+	 pI830->AccelInfoRec->NeedToSync = TRUE;
+   }
+
+   if (pI830->AccelInfoRec)
+      (*pI830->AccelInfoRec->Sync)(pScrn);
+#ifdef XF86DRI
+   if (didLock)
+      I830DRIUnlock(pScrn1);
+#endif
+}
+
+
 static void
 I915UpdateRotate (ScreenPtr      pScreen,
                  shadowBufPtr   pBuf)
@@ -657,11 +1371,15 @@ I830Rotate(ScrnInfoPtr pScrn, DisplayMod
 
    if (pI830->noAccel)
       func = LoaderSymbol("shadowUpdateRotatePacked");
-   else
-      if (IS_I9XX(pI830))
-	 func = I915UpdateRotate;
-      else
+   else {
+      if (IS_I9XX(pI830)) {
+	 if (IS_I965G(pI830))
+	     func = I965UpdateRotate;
+	 else 
+	     func = I915UpdateRotate;
+      } else
 	 func = I830UpdateRotate;
+   }
 
    if (I830IsPrimary(pScrn)) {
       pI8301 = pI830;
@@ -738,6 +1456,15 @@ I830Rotate(ScrnInfoPtr pScrn, DisplayMod
       memset(&(pI8301->RotatedMem), 0, sizeof(pI8301->RotatedMem));
       pI8301->RotatedMem.Key = -1;
 
+      if (IS_I965G(pI8301)) {
+         if (pI8301->RotateStateMem.Key != -1)
+            xf86UnbindGARTMemory(pScrn1->scrnIndex, pI8301->RotateStateMem.Key);
+ 
+         I830FreeVidMem(pScrn1, &(pI8301->RotateStateMem));
+         memset(&(pI8301->RotateStateMem), 0, sizeof(pI8301->RotateStateMem));
+      	 pI8301->RotateStateMem.Key = -1;
+      }
+
       if (pI830->entityPrivate) {
          if (pI8301->RotatedMem2.Key != -1)
             xf86UnbindGARTMemory(pScrn1->scrnIndex, pI8301->RotatedMem2.Key);
@@ -820,6 +1547,12 @@ I830Rotate(ScrnInfoPtr pScrn, DisplayMod
          I830FixOffset(pScrn1, &(pI8301->RotatedMem));
          if (pI8301->RotatedMem.Key != -1)
             xf86BindGARTMemory(pScrn1->scrnIndex, pI8301->RotatedMem.Key, pI8301->RotatedMem.Offset);
+	 if (IS_I965G(pI8301)) {
+            I830FixOffset(pScrn1, &(pI8301->RotateStateMem));
+            if (pI8301->RotateStateMem.Key != -1)
+            	xf86BindGARTMemory(pScrn1->scrnIndex, pI8301->RotateStateMem.Key, 
+				   pI8301->RotateStateMem.Offset);
+	 }
       }
    }
    
@@ -887,8 +1620,16 @@ I830Rotate(ScrnInfoPtr pScrn, DisplayMod
       }
       I830SetupMemoryTiling(pScrn1);
       /* update fence registers */
-      for (i = 0; i < 8; i++) 
-         OUTREG(FENCE + i * 4, pI8301->ModeReg.Fence[i]);
+      if (IS_I965G(pI830)) {
+         for (i = 0; i < FENCE_NEW_NR; i++) {
+            OUTREG(FENCE_NEW + i * 8, pI830->ModeReg.Fence[i]);
+            OUTREG(FENCE_NEW + 4 + i * 8, pI830->ModeReg.Fence[i+FENCE_NEW_NR]);
+         }
+      } else {
+         for (i = 0; i < 8; i++) 
+            OUTREG(FENCE + i * 4, pI8301->ModeReg.Fence[i]);
+      }
+
       {
          drmI830Sarea *sarea = DRIGetSAREAPrivate(pScrn1->pScreen);
          I830UpdateDRIBuffers(pScrn1, sarea );
diff --git a/src/rotation_sf0.g4a b/src/rotation_sf0.g4a
new file mode 100644
index 0000000..8c1398f
--- /dev/null
+++ b/src/rotation_sf0.g4a
@@ -0,0 +1,17 @@
+send (1) 0 g6<1>F g1.12<0,1,0>F math inv scalar mlen 1 rlen 1 { align1 };
+send (1) 0 g6.4<1>F g1.20<0,1,0>F math inv scalar mlen 1 rlen 1 { align1 };
+add (8) g7<1>F g4<8,8,1>F -g3<8,8,1>F { align1 };
+mul (1) g7<1>F g7<0,1,0>F g6<0,1,0>F { align1 };
+mul (1) g7.4<1>F g7.4<0,1,0>F g6.4<0,1,0>F { align1 };
+mov (8) m1<1>F g7<0,1,0>F { align1 };
+mov (8) m2<1>F g7.4<0,1,0>F { align1 };
+mov (8) m3<1>F g3<8,8,1>F { align1 };
+send (8) 0 null g0<8,8,1>F urb 0 transpose used complete mlen 4 rlen 0 { align1 EOT };
+nop;
+nop;
+nop;
+nop;
+nop;
+nop;
+nop;
+nop;
diff --git a/src/rotation_sf90.g4a b/src/rotation_sf90.g4a
new file mode 100644
index 0000000..2648dff
--- /dev/null
+++ b/src/rotation_sf90.g4a
@@ -0,0 +1,17 @@
+send (1) 0 g6<1>F g1.20<0,1,0>F math inv scalar mlen 1 rlen 1 { align1 };
+send (1) 0 g6.4<1>F g1.12<0,1,0>F math inv scalar mlen 1 rlen 1 { align1 };
+add (8) g7<1>F g4<8,8,1>F -g3<8,8,1>F { align1 };
+mul (1) g7<1>F g7<0,1,0>F g6<0,1,0>F { align1 };
+mul (1) g7.4<1>F g7.4<0,1,0>F g6.4<0,1,0>F { align1 };
+mov (8) m1<1>F g7<0,1,0>F { align1 };
+mov (8) m2<1>F g7.4<0,1,0>F { align1 };
+mov (8) m3<1>F g3<8,8,1>F { align1 };
+send (8) 0 null g0<8,8,1>F urb 0 transpose used complete mlen 4 rlen 0 { align1 EOT };
+nop;
+nop;
+nop;
+nop;
+nop;
+nop;
+nop;
+nop;
diff --git a/src/rotation_sf_prog0.h b/src/rotation_sf_prog0.h
new file mode 100644
index 0000000..830d176
--- /dev/null
+++ b/src/rotation_sf_prog0.h
@@ -0,0 +1,17 @@
+   { 0x00000031, 0x20c01fbd, 0x0000002c, 0x01110081 },
+   { 0x00000031, 0x20c41fbd, 0x00000034, 0x01110081 },
+   { 0x00600040, 0x20e077bd, 0x008d0080, 0x008d4060 },
+   { 0x00000041, 0x20e077bd, 0x000000e0, 0x000000c0 },
+   { 0x00000041, 0x20e477bd, 0x000000e4, 0x000000c4 },
+   { 0x00600001, 0x202003be, 0x000000e0, 0x00000000 },
+   { 0x00600001, 0x204003be, 0x000000e4, 0x00000000 },
+   { 0x00600001, 0x206003be, 0x008d0060, 0x00000000 },
+   { 0x00600031, 0x20001fbc, 0x008d0000, 0x8640c800 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
diff --git a/src/rotation_sf_prog90.h b/src/rotation_sf_prog90.h
new file mode 100644
index 0000000..2e94b8f
--- /dev/null
+++ b/src/rotation_sf_prog90.h
@@ -0,0 +1,17 @@
+   { 0x00000031, 0x20c01fbd, 0x00000034, 0x01110081 },
+   { 0x00000031, 0x20c41fbd, 0x0000002c, 0x01110081 },
+   { 0x00600040, 0x20e077bd, 0x008d0080, 0x008d4060 },
+   { 0x00000041, 0x20e077bd, 0x000000e0, 0x000000c0 },
+   { 0x00000041, 0x20e477bd, 0x000000e4, 0x000000c4 },
+   { 0x00600001, 0x202003be, 0x000000e0, 0x00000000 },
+   { 0x00600001, 0x204003be, 0x000000e4, 0x00000000 },
+   { 0x00600001, 0x206003be, 0x008d0060, 0x00000000 },
+   { 0x00600031, 0x20001fbc, 0x008d0000, 0x8640c800 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
diff --git a/src/rotation_wm0.g4a b/src/rotation_wm0.g4a
new file mode 100644
index 0000000..fe09734
--- /dev/null
+++ b/src/rotation_wm0.g4a
@@ -0,0 +1,123 @@
+/* The initial payload of the thread is always g0.
+ * WM_URB (incoming URB entries) is g3
+ * X0_R is g4
+ * X1_R is g5
+ * Y0_R is g6
+ * Y1_R is g7
+ */
+
+    /* Set up the X/Y screen coordinates of the pixels in our 4 subspans.  Each
+     * subspan is a 2x2 rectangle, and the screen x/y of the upper left of each
+     * subspan are given in GRF register 1.2 through 1.5 (which, with the word
+     * addressing below, are 1.4 through 1.11).
+     *
+     * The result is WM_X*_R and WM_Y*R being:
+     *
+     * X0: {ss0.x, ss0.x+1, ss0.x,   ss0.x+1, ss1.x, ss1.x+1, ss1.x,   ss1.x+y}
+     * Y0: {ss0.y, ss0.y,   ss0.y+1, ss0.y+1, ss1.y, ss1.y,   ss1.y+1, ss1.y+1}
+     * X1: {ss2.x, ss2.x+1, ss2.x,   ss2.x+1, ss3.x, ss3.x+1, ss3.x,   ss3.x+y}
+     * Y1: {ss2.y, ss2.y,   ss2.y+1, ss2.y+1, ss3.y, ss3.y,   ss3.y+1, ss3.y+1}
+     */
+
+    /* Set up ss0.x coordinates*/
+mov (1) g4<1>F g1.8<0,1,0>UW { align1 };
+add (1) g4.4<1>F g1.8<0,1,0>UW 1UB { align1 };
+mov (1) g4.8<1>F g1.8<0,1,0>UW { align1 };
+add (1) g4.12<1>F g1.8<0,1,0>UW 1UB { align1 };
+    /* Set up ss0.y coordinates */
+mov (1) g6<1>F g1.10<0,1,0>UW { align1 };
+mov (1) g6.4<1>F g1.10<0,1,0>UW { align1 };
+add (1) g6.8<1>F g1.10<0,1,0>UW 1UB { align1 };
+add (1) g6.12<1>F g1.10<0,1,0>UW 1UB { align1 };
+    /* set up ss1.x coordinates */
+mov (1) g4.16<1>F g1.12<0,1,0>UW { align1 };
+add (1) g4.20<1>F g1.12<0,1,0>UW 1UB { align1 };
+mov (1) g4.24<1>F g1.12<0,1,0>UW { align1 };
+add (1) g4.28<1>F g1.12<0,1,0>UW 1UB { align1 };
+    /* set up ss1.y coordinates */
+mov (1) g6.16<1>F g1.14<0,1,0>UW { align1 };
+mov (1) g6.20<1>F g1.14<0,1,0>UW { align1 };
+add (1) g6.24<1>F g1.14<0,1,0>UW 1UB { align1 };
+add (1) g6.28<1>F g1.14<0,1,0>UW 1UB { align1 };
+    /* Set up ss2.x coordinates */
+mov (1) g5<1>F g1.16<0,1,0>UW { align1 };
+add (1) g5.4<1>F g1.16<0,1,0>UW 1UB { align1 };
+mov (1) g5.8<1>F g1.16<0,1,0>UW { align1 };
+add (1) g5.12<1>F g1.16<0,1,0>UW 1UB { align1 };
+    /* Set up ss2.y coordinates */
+mov (1) g7<1>F g1.18<0,1,0>UW { align1 };
+mov (1) g7.4<1>F g1.18<0,1,0>UW { align1 };
+add (1) g7.8<1>F g1.18<0,1,0>UW 1UB { align1 };
+add (1) g7.12<1>F g1.18<0,1,0>UW 1UB { align1 };
+    /* Set up ss3.x coordinates */
+mov (1) g5.16<1>F g1.20<0,1,0>UW { align1 };
+add (1) g5.20<1>F g1.20<0,1,0>UW 1UB { align1 };
+mov (1) g5.24<1>F g1.20<0,1,0>UW { align1 };
+add (1) g5.28<1>F g1.20<0,1,0>UW 1UB { align1 };
+    /* Set up ss3.y coordinates */
+mov (1) g7.16<1>F g1.22<0,1,0>UW { align1 };
+mov (1) g7.20<1>F g1.22<0,1,0>UW { align1 };
+add (1) g7.24<1>F g1.22<0,1,0>UW 1UB { align1 };
+add (1) g7.28<1>F g1.22<0,1,0>UW 1UB { align1 };
+
+    /* Now, map these screen space coordinates into texture coordinates. */
+    /* subtract screen-space X origin of vertex 0. */
+add (8) g4<1>F g4<8,8,1>F -g1<0,1,0>F { align1 };
+add (8) g5<1>F g5<8,8,1>F -g1<0,1,0>F { align1 };
+    /* scale by texture X increment */
+mul (8) g4<1>F g4<8,8,1>F g3<0,1,0>F { align1 };
+mul (8) g5<1>F g5<8,8,1>F g3<0,1,0>F { align1 };
+    /* add in texture X offset */
+add (8) g4<1>F g4<8,8,1>F g3.12<0,1,0>F { align1 };
+add (8) g5<1>F g5<8,8,1>F g3.12<0,1,0>F { align1 };
+    /* subtract screen-space Y origin of vertex 0. */
+add (8) g6<1>F g6<8,8,1>F -g1.4<0,1,0>F { align1 };
+add (8) g7<1>F g7<8,8,1>F -g1.4<0,1,0>F { align1 };
+    /* scale by texture Y increment */
+    /* XXX: double check the fields in Cx,Cy,Co and attributes*/
+mul (8) g6<1>F g6<8,8,1>F g3.20<0,1,0>F { align1 };
+mul (8) g7<1>F g7<8,8,1>F g3.20<0,1,0>F { align1 };
+    /* add in texture Y offset */
+add (8) g6<1>F g6<8,8,1>F g3.28<0,1,0>F { align1 };
+add (8) g7<1>F g7<8,8,1>F g3.28<0,1,0>F { align1 };
+    /* sampler  */
+mov (8) m1<1>F g4<8,8,1>F { align1 };
+mov (8) m2<1>F g5<8,8,1>F { align1 };
+mov (8) m3<1>F g6<8,8,1>F { align1 };
+mov (8) m4<1>F g7<8,8,1>F { align1 };
+
+    /*
+     * g0 holds the PS thread payload, which (oddly) contains
+     * precisely what the sampler wants to see in m0
+     */
+send  (16) 0 g12<1>UW g0<8,8,1>UW sampler (1,0,F) mlen 5 rlen 8 { align1 };
+mov (8) g19<1>UD g19<8,8,1>UD { align1 };
+
+mov (8) m2<1>F g12<8,8,1>F { align1 };
+mov (8) m3<1>F g14<8,8,1>F { align1 };
+mov (8) m4<1>F g16<8,8,1>F { align1 };
+mov (8) m5<1>F g18<8,8,1>F { align1 };
+mov (8) m6<1>F g13<8,8,1>F { align1 };
+mov (8) m7<1>F g15<8,8,1>F { align1 };
+mov (8) m8<1>F g17<8,8,1>F { align1 };
+mov (8) m9<1>F g19<8,8,1>F { align1 };
+
+   /* Pass through control information:
+    */
+mov (8) m1<1>UD g1<8,8,1>UD { align1 mask_disable };
+   /* Send framebuffer write message: XXX: acc0? */
+send (16) 0 acc0<1>UW g0<8,8,1>UW write (
+	0, /* binding table index 0 */
+	8, /* pixel scoreboard clear */
+	4, /* render target write */
+	0 /* no write commit message */
+	) mlen 10 rlen 0 { align1 EOT };
+   /* padding */
+nop;
+nop;
+nop;
+nop;
+nop;
+nop;
+nop;
+nop;
diff --git a/src/rotation_wm90.g4a b/src/rotation_wm90.g4a
new file mode 100644
index 0000000..fd600bf
--- /dev/null
+++ b/src/rotation_wm90.g4a
@@ -0,0 +1,127 @@
+/* The initial payload of the thread is always g0.
+ * WM_URB (incoming URB entries) is g3
+ * X0_R is g4
+ * X1_R is g5
+ * Y0_R is g6
+ * Y1_R is g7
+ */
+
+    /* Set up the X/Y screen coordinates of the pixels in our 4 subspans.  Each
+     * subspan is a 2x2 rectangle, and the screen x/y of the upper left of each
+     * subspan are given in GRF register 1.2 through 1.5 (which, with the word
+     * addressing below, are 1.4 through 1.11).
+     *
+     * The result is WM_X*_R and WM_Y*R being:
+     *
+     * X0: {ss0.x, ss0.x+1, ss0.x,   ss0.x+1, ss1.x, ss1.x+1, ss1.x,   ss1.x+y}
+     * Y0: {ss0.y, ss0.y,   ss0.y+1, ss0.y+1, ss1.y, ss1.y,   ss1.y+1, ss1.y+1}
+     * X1: {ss2.x, ss2.x+1, ss2.x,   ss2.x+1, ss3.x, ss3.x+1, ss3.x,   ss3.x+y}
+     * Y1: {ss2.y, ss2.y,   ss2.y+1, ss2.y+1, ss3.y, ss3.y,   ss3.y+1, ss3.y+1}
+     */
+
+    /* Set up ss0.x coordinates*/
+mov (1) g4<1>F g1.8<0,1,0>UW { align1 };
+add (1) g4.4<1>F g1.8<0,1,0>UW 1UB { align1 };
+mov (1) g4.8<1>F g1.8<0,1,0>UW { align1 };
+add (1) g4.12<1>F g1.8<0,1,0>UW 1UB { align1 };
+    /* Set up ss0.y coordinates */
+mov (1) g6<1>F g1.10<0,1,0>UW { align1 };
+mov (1) g6.4<1>F g1.10<0,1,0>UW { align1 };
+add (1) g6.8<1>F g1.10<0,1,0>UW 1UB { align1 };
+add (1) g6.12<1>F g1.10<0,1,0>UW 1UB { align1 };
+    /* set up ss1.x coordinates */
+mov (1) g4.16<1>F g1.12<0,1,0>UW { align1 };
+add (1) g4.20<1>F g1.12<0,1,0>UW 1UB { align1 };
+mov (1) g4.24<1>F g1.12<0,1,0>UW { align1 };
+add (1) g4.28<1>F g1.12<0,1,0>UW 1UB { align1 };
+    /* set up ss1.y coordinates */
+mov (1) g6.16<1>F g1.14<0,1,0>UW { align1 };
+mov (1) g6.20<1>F g1.14<0,1,0>UW { align1 };
+add (1) g6.24<1>F g1.14<0,1,0>UW 1UB { align1 };
+add (1) g6.28<1>F g1.14<0,1,0>UW 1UB { align1 };
+    /* Set up ss2.x coordinates */
+mov (1) g5<1>F g1.16<0,1,0>UW { align1 };
+add (1) g5.4<1>F g1.16<0,1,0>UW 1UB { align1 };
+mov (1) g5.8<1>F g1.16<0,1,0>UW { align1 };
+add (1) g5.12<1>F g1.16<0,1,0>UW 1UB { align1 };
+    /* Set up ss2.y coordinates */
+mov (1) g7<1>F g1.18<0,1,0>UW { align1 };
+mov (1) g7.4<1>F g1.18<0,1,0>UW { align1 };
+add (1) g7.8<1>F g1.18<0,1,0>UW 1UB { align1 };
+add (1) g7.12<1>F g1.18<0,1,0>UW 1UB { align1 };
+    /* Set up ss3.x coordinates */
+mov (1) g5.16<1>F g1.20<0,1,0>UW { align1 };
+add (1) g5.20<1>F g1.20<0,1,0>UW 1UB { align1 };
+mov (1) g5.24<1>F g1.20<0,1,0>UW { align1 };
+add (1) g5.28<1>F g1.20<0,1,0>UW 1UB { align1 };
+    /* Set up ss3.y coordinates */
+mov (1) g7.16<1>F g1.22<0,1,0>UW { align1 };
+mov (1) g7.20<1>F g1.22<0,1,0>UW { align1 };
+add (1) g7.24<1>F g1.22<0,1,0>UW 1UB { align1 };
+add (1) g7.28<1>F g1.22<0,1,0>UW 1UB { align1 };
+
+    /* Now, map these screen space coordinates into texture coordinates. */
+/* XXX: convert it to calculate (u,v) in 90 and 270 case */
+    /* subtract screen-space Y origin of vertex 0. */
+add (8) g6<1>F g6<8,8,1>F -g1.4<0,1,0>F { align1 };
+add (8) g7<1>F g7<8,8,1>F -g1.4<0,1,0>F { align1 };
+
+/* (Yp - Ystart) * Cx */
+mul (8) g6<1>F g6<8,8,1>F g3<0,1,0>F { align1 };
+mul (8) g7<1>F g7<8,8,1>F g3<0,1,0>F { align1 };
+
+    /* scale by texture Y increment */
+add (8) g6<1>F g6<8,8,1>F g3.12<0,1,0>F { align1 };
+add (8) g7<1>F g7<8,8,1>F g3.12<0,1,0>F { align1 };
+
+    /* subtract screen-space X origin of vertex 0. */
+add (8) g4<1>F g4<8,8,1>F -g1<0,1,0>F { align1 };
+add (8) g5<1>F g5<8,8,1>F -g1<0,1,0>F { align1 };
+    /* scale by texture X increment */
+mul (8) g4<1>F g4<8,8,1>F g3.20<0,1,0>F { align1 };
+mul (8) g5<1>F g5<8,8,1>F g3.20<0,1,0>F { align1 };
+    /* add in texture X offset */
+add (8) g4<1>F g4<8,8,1>F g3.28<0,1,0>F { align1 };
+add (8) g5<1>F g5<8,8,1>F g3.28<0,1,0>F { align1 };
+
+    /* sampler  */
+mov (8) m1<1>F g6<8,8,1>F { align1 };
+mov (8) m2<1>F g7<8,8,1>F { align1 };
+mov (8) m3<1>F g4<8,8,1>F { align1 };
+mov (8) m4<1>F g5<8,8,1>F { align1 };
+
+    /*
+     * g0 holds the PS thread payload, which (oddly) contains
+     * precisely what the sampler wants to see in m0
+     */
+send  (16) 0 g12<1>UW g0<8,8,1>UW sampler (1,0,F) mlen 5 rlen 8 { align1 };
+mov (8) g19<1>UD g19<8,8,1>UD { align1 };
+
+mov (8) m2<1>F g12<8,8,1>F { align1 };
+mov (8) m3<1>F g14<8,8,1>F { align1 };
+mov (8) m4<1>F g16<8,8,1>F { align1 };
+mov (8) m5<1>F g18<8,8,1>F { align1 };
+mov (8) m6<1>F g13<8,8,1>F { align1 };
+mov (8) m7<1>F g15<8,8,1>F { align1 };
+mov (8) m8<1>F g17<8,8,1>F { align1 };
+mov (8) m9<1>F g19<8,8,1>F { align1 };
+
+   /* Pass through control information:
+    */
+mov (8) m1<1>UD g1<8,8,1>UD { align1 mask_disable };
+   /* Send framebuffer write message: XXX: acc0? */
+send (16) 0 acc0<1>UW g0<8,8,1>UW write (
+	0, /* binding table index 0 */
+	8, /* pixel scoreboard clear */
+	4, /* render target write */
+	0 /* no write commit message */
+	) mlen 10 rlen 0 { align1 EOT };
+   /* padding */
+nop;
+nop;
+nop;
+nop;
+nop;
+nop;
+nop;
+nop;
diff --git a/src/rotation_wm_prog0.h b/src/rotation_wm_prog0.h
new file mode 100644
index 0000000..08269b7
--- /dev/null
+++ b/src/rotation_wm_prog0.h
@@ -0,0 +1,68 @@
+   { 0x00000001, 0x2080013d, 0x00000028, 0x00000000 },
+   { 0x00000040, 0x20840d3d, 0x00000028, 0x00000001 },
+   { 0x00000001, 0x2088013d, 0x00000028, 0x00000000 },
+   { 0x00000040, 0x208c0d3d, 0x00000028, 0x00000001 },
+   { 0x00000001, 0x20c0013d, 0x0000002a, 0x00000000 },
+   { 0x00000001, 0x20c4013d, 0x0000002a, 0x00000000 },
+   { 0x00000040, 0x20c80d3d, 0x0000002a, 0x00000001 },
+   { 0x00000040, 0x20cc0d3d, 0x0000002a, 0x00000001 },
+   { 0x00000001, 0x2090013d, 0x0000002c, 0x00000000 },
+   { 0x00000040, 0x20940d3d, 0x0000002c, 0x00000001 },
+   { 0x00000001, 0x2098013d, 0x0000002c, 0x00000000 },
+   { 0x00000040, 0x209c0d3d, 0x0000002c, 0x00000001 },
+   { 0x00000001, 0x20d0013d, 0x0000002e, 0x00000000 },
+   { 0x00000001, 0x20d4013d, 0x0000002e, 0x00000000 },
+   { 0x00000040, 0x20d80d3d, 0x0000002e, 0x00000001 },
+   { 0x00000040, 0x20dc0d3d, 0x0000002e, 0x00000001 },
+   { 0x00000001, 0x20a0013d, 0x00000030, 0x00000000 },
+   { 0x00000040, 0x20a40d3d, 0x00000030, 0x00000001 },
+   { 0x00000001, 0x20a8013d, 0x00000030, 0x00000000 },
+   { 0x00000040, 0x20ac0d3d, 0x00000030, 0x00000001 },
+   { 0x00000001, 0x20e0013d, 0x00000032, 0x00000000 },
+   { 0x00000001, 0x20e4013d, 0x00000032, 0x00000000 },
+   { 0x00000040, 0x20e80d3d, 0x00000032, 0x00000001 },
+   { 0x00000040, 0x20ec0d3d, 0x00000032, 0x00000001 },
+   { 0x00000001, 0x20b0013d, 0x00000034, 0x00000000 },
+   { 0x00000040, 0x20b40d3d, 0x00000034, 0x00000001 },
+   { 0x00000001, 0x20b8013d, 0x00000034, 0x00000000 },
+   { 0x00000040, 0x20bc0d3d, 0x00000034, 0x00000001 },
+   { 0x00000001, 0x20f0013d, 0x00000036, 0x00000000 },
+   { 0x00000001, 0x20f4013d, 0x00000036, 0x00000000 },
+   { 0x00000040, 0x20f80d3d, 0x00000036, 0x00000001 },
+   { 0x00000040, 0x20fc0d3d, 0x00000036, 0x00000001 },
+   { 0x00600040, 0x208077bd, 0x008d0080, 0x00004020 },
+   { 0x00600040, 0x20a077bd, 0x008d00a0, 0x00004020 },
+   { 0x00600041, 0x208077bd, 0x008d0080, 0x00000060 },
+   { 0x00600041, 0x20a077bd, 0x008d00a0, 0x00000060 },
+   { 0x00600040, 0x208077bd, 0x008d0080, 0x0000006c },
+   { 0x00600040, 0x20a077bd, 0x008d00a0, 0x0000006c },
+   { 0x00600040, 0x20c077bd, 0x008d00c0, 0x00004024 },
+   { 0x00600040, 0x20e077bd, 0x008d00e0, 0x00004024 },
+   { 0x00600041, 0x20c077bd, 0x008d00c0, 0x00000074 },
+   { 0x00600041, 0x20e077bd, 0x008d00e0, 0x00000074 },
+   { 0x00600040, 0x20c077bd, 0x008d00c0, 0x0000007c },
+   { 0x00600040, 0x20e077bd, 0x008d00e0, 0x0000007c },
+   { 0x00600001, 0x202003be, 0x008d0080, 0x00000000 },
+   { 0x00600001, 0x204003be, 0x008d00a0, 0x00000000 },
+   { 0x00600001, 0x206003be, 0x008d00c0, 0x00000000 },
+   { 0x00600001, 0x208003be, 0x008d00e0, 0x00000000 },
+   { 0x00800031, 0x21801d29, 0x008d0000, 0x02580001 },
+   { 0x00600001, 0x22600021, 0x008d0260, 0x00000000 },
+   { 0x00600001, 0x204003be, 0x008d0180, 0x00000000 },
+   { 0x00600001, 0x206003be, 0x008d01c0, 0x00000000 },
+   { 0x00600001, 0x208003be, 0x008d0200, 0x00000000 },
+   { 0x00600001, 0x20a003be, 0x008d0240, 0x00000000 },
+   { 0x00600001, 0x20c003be, 0x008d01a0, 0x00000000 },
+   { 0x00600001, 0x20e003be, 0x008d01e0, 0x00000000 },
+   { 0x00600001, 0x210003be, 0x008d0220, 0x00000000 },
+   { 0x00600001, 0x212003be, 0x008d0260, 0x00000000 },
+   { 0x00600201, 0x20200022, 0x008d0020, 0x00000000 },
+   { 0x00800031, 0x24001d28, 0x008d0000, 0x85a04800 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
diff --git a/src/rotation_wm_prog90.h b/src/rotation_wm_prog90.h
new file mode 100644
index 0000000..9b87750
--- /dev/null
+++ b/src/rotation_wm_prog90.h
@@ -0,0 +1,68 @@
+   { 0x00000001, 0x2080013d, 0x00000028, 0x00000000 },
+   { 0x00000040, 0x20840d3d, 0x00000028, 0x00000001 },
+   { 0x00000001, 0x2088013d, 0x00000028, 0x00000000 },
+   { 0x00000040, 0x208c0d3d, 0x00000028, 0x00000001 },
+   { 0x00000001, 0x20c0013d, 0x0000002a, 0x00000000 },
+   { 0x00000001, 0x20c4013d, 0x0000002a, 0x00000000 },
+   { 0x00000040, 0x20c80d3d, 0x0000002a, 0x00000001 },
+   { 0x00000040, 0x20cc0d3d, 0x0000002a, 0x00000001 },
+   { 0x00000001, 0x2090013d, 0x0000002c, 0x00000000 },
+   { 0x00000040, 0x20940d3d, 0x0000002c, 0x00000001 },
+   { 0x00000001, 0x2098013d, 0x0000002c, 0x00000000 },
+   { 0x00000040, 0x209c0d3d, 0x0000002c, 0x00000001 },
+   { 0x00000001, 0x20d0013d, 0x0000002e, 0x00000000 },
+   { 0x00000001, 0x20d4013d, 0x0000002e, 0x00000000 },
+   { 0x00000040, 0x20d80d3d, 0x0000002e, 0x00000001 },
+   { 0x00000040, 0x20dc0d3d, 0x0000002e, 0x00000001 },
+   { 0x00000001, 0x20a0013d, 0x00000030, 0x00000000 },
+   { 0x00000040, 0x20a40d3d, 0x00000030, 0x00000001 },
+   { 0x00000001, 0x20a8013d, 0x00000030, 0x00000000 },
+   { 0x00000040, 0x20ac0d3d, 0x00000030, 0x00000001 },
+   { 0x00000001, 0x20e0013d, 0x00000032, 0x00000000 },
+   { 0x00000001, 0x20e4013d, 0x00000032, 0x00000000 },
+   { 0x00000040, 0x20e80d3d, 0x00000032, 0x00000001 },
+   { 0x00000040, 0x20ec0d3d, 0x00000032, 0x00000001 },
+   { 0x00000001, 0x20b0013d, 0x00000034, 0x00000000 },
+   { 0x00000040, 0x20b40d3d, 0x00000034, 0x00000001 },
+   { 0x00000001, 0x20b8013d, 0x00000034, 0x00000000 },
+   { 0x00000040, 0x20bc0d3d, 0x00000034, 0x00000001 },
+   { 0x00000001, 0x20f0013d, 0x00000036, 0x00000000 },
+   { 0x00000001, 0x20f4013d, 0x00000036, 0x00000000 },
+   { 0x00000040, 0x20f80d3d, 0x00000036, 0x00000001 },
+   { 0x00000040, 0x20fc0d3d, 0x00000036, 0x00000001 },
+   { 0x00600040, 0x20c077bd, 0x008d00c0, 0x00004024 },
+   { 0x00600040, 0x20e077bd, 0x008d00e0, 0x00004024 },
+   { 0x00600041, 0x20c077bd, 0x008d00c0, 0x00000060 },
+   { 0x00600041, 0x20e077bd, 0x008d00e0, 0x00000060 },
+   { 0x00600040, 0x20c077bd, 0x008d00c0, 0x0000006c },
+   { 0x00600040, 0x20e077bd, 0x008d00e0, 0x0000006c },
+   { 0x00600040, 0x208077bd, 0x008d0080, 0x00004020 },
+   { 0x00600040, 0x20a077bd, 0x008d00a0, 0x00004020 },
+   { 0x00600041, 0x208077bd, 0x008d0080, 0x00000074 },
+   { 0x00600041, 0x20a077bd, 0x008d00a0, 0x00000074 },
+   { 0x00600040, 0x208077bd, 0x008d0080, 0x0000007c },
+   { 0x00600040, 0x20a077bd, 0x008d00a0, 0x0000007c },
+   { 0x00600001, 0x202003be, 0x008d00c0, 0x00000000 },
+   { 0x00600001, 0x204003be, 0x008d00e0, 0x00000000 },
+   { 0x00600001, 0x206003be, 0x008d0080, 0x00000000 },
+   { 0x00600001, 0x208003be, 0x008d00a0, 0x00000000 },
+   { 0x00800031, 0x21801d29, 0x008d0000, 0x02580001 },
+   { 0x00600001, 0x22600021, 0x008d0260, 0x00000000 },
+   { 0x00600001, 0x204003be, 0x008d0180, 0x00000000 },
+   { 0x00600001, 0x206003be, 0x008d01c0, 0x00000000 },
+   { 0x00600001, 0x208003be, 0x008d0200, 0x00000000 },
+   { 0x00600001, 0x20a003be, 0x008d0240, 0x00000000 },
+   { 0x00600001, 0x20c003be, 0x008d01a0, 0x00000000 },
+   { 0x00600001, 0x20e003be, 0x008d01e0, 0x00000000 },
+   { 0x00600001, 0x210003be, 0x008d0220, 0x00000000 },
+   { 0x00600001, 0x212003be, 0x008d0260, 0x00000000 },
+   { 0x00600201, 0x20200022, 0x008d0020, 0x00000000 },
+   { 0x00800031, 0x24001d28, 0x008d0000, 0x85a04800 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
diff-tree 35cebed70827999812f8343ac97ad0dffda20786 (from 33e912aca08fa11ef588eb386e16ba5f9ea13727)
Author: Eric Anholt <eric at anholt.net>
Date:   Thu Nov 16 15:12:43 2006 -0800

    [PATCH] Replace broken PCI resource size detection with pciGetBaseSize() call.
    
    Signed-off-by: Keith Packard <keithp at neko.keithp.com>

diff --git a/src/i830_driver.c b/src/i830_driver.c
index 94cba05..6b76d12 100644
--- a/src/i830_driver.c
+++ b/src/i830_driver.c
@@ -1189,16 +1189,12 @@ I830PreInit(ScrnInfoPtr pScrn, int flags
       }
    } else {
       if (IS_I9XX(pI830)) {
-	 if (pI830->PciInfo->memBase[2] & 0x08000000)
-	    pI830->FbMapSize = 0x8000000;	/* 128MB aperture */
-	 else
-	    pI830->FbMapSize = 0x10000000;	/* 256MB aperture */
-
-   	 if (pI830->PciInfo->chipType == PCI_CHIP_E7221_G)
-	    pI830->FbMapSize = 0x8000000;	/* 128MB aperture */
-      } else
-	 /* 128MB aperture for later chips */
+	 pI830->FbMapSize = 1UL << pciGetBaseSize(pI830->PciTag, 2, TRUE,
+						  NULL);
+      } else {
+	 /* 128MB aperture for later i8xx series. */
 	 pI830->FbMapSize = 0x8000000;
+      }
    }
 
    if (pI830->PciInfo->chipType == PCI_CHIP_E7221_G)
diff-tree 33e912aca08fa11ef588eb386e16ba5f9ea13727 (from fa54a3c08301e59558ab0493b3d22324f4162496)
Author: Wang Zhenyu <zhenyu.z.wang at intel.com>
Date:   Thu Jan 4 11:25:31 2007 +0800

    [PATCH] Fix EXA mem binding
    
    We should check if EXA is really enabled.
    
    Signed-off-by: Keith Packard <keithp at neko.keithp.com>

diff --git a/src/i830_memory.c b/src/i830_memory.c
index 20e3afb..60257b9 100644
--- a/src/i830_memory.c
+++ b/src/i830_memory.c
@@ -841,7 +841,7 @@ I830Allocate2DMemory(ScrnInfoPtr pScrn, 
 		       pI830->Offscreen.Start, pI830->Offscreen.Size/1024);
 	 }
       }
-      if (IS_I965G(pI830)) {
+      if (pI830->useEXA && IS_I965G(pI830)) {
           memset(&(pI830->EXAStateMem), 0, sizeof(I830MemRange));
           pI830->EXAStateMem.Key = -1;
           size = ROUND_TO_PAGE(EXA_LINEAR_EXTRA);
@@ -1513,9 +1513,11 @@ I830FixupOffsets(ScrnInfoPtr pScrn)
    }
 #endif
 #ifdef I830_USE_EXA
-   I830FixOffset(pScrn, &(pI830->Offscreen));
-   if (IS_I965G(pI830))
-       I830FixOffset(pScrn, &(pI830->EXAStateMem));
+   if (pI830->useEXA) {
+       I830FixOffset(pScrn, &(pI830->Offscreen));
+       if (IS_I965G(pI830))
+           I830FixOffset(pScrn, &(pI830->EXAStateMem));
+    }
 #endif
    return TRUE;
 }
@@ -1919,10 +1921,12 @@ I830BindAGPMemory(ScrnInfoPtr pScrn)
       }
 #endif
 #ifdef I830_USE_EXA
-     if (!BindMemRange(pScrn, &(pI830->Offscreen)))
-	return FALSE;
-     if (IS_I965G(pI830) && !BindMemRange(pScrn, &(pI830->EXAStateMem)))
-	return FALSE;
+     if (pI830->useEXA) {
+         if (!BindMemRange(pScrn, &(pI830->Offscreen)))
+	    return FALSE;
+         if (IS_I965G(pI830) && !BindMemRange(pScrn, &(pI830->EXAStateMem)))
+	    return FALSE;
+     }
 #endif
       pI830->GttBound = 1;
    }
diff-tree fa54a3c08301e59558ab0493b3d22324f4162496 (from 2f2c443e971faa54ffcf751b6acb444e8e7875ce)
Author: Wang Zhenyu <zhenyu.z.wang at intel.com>
Date:   Wed Dec 6 13:24:44 2006 +0800

    [PATCH] fix Makefile.am
    
    Signed-off-by: Keith Packard <keithp at neko.keithp.com>

diff --git a/src/Makefile.am b/src/Makefile.am
index a9f427d..80cea10 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -94,7 +94,6 @@ i810_drv_la_SOURCES = \
 	 i830_xaa.c \
 	 i830_exa_render.c \
 	 i915_exa_render.c \
-	 i965_composite_wm_nomask.h \
 	 i965_exa_render.c
 
 if HAVE_GEN4ASM
@@ -102,8 +101,14 @@ sf_prog.h: packed_yuv_sf.g4a
 	intel-gen4asm -o sf_prog.h packed_yuv_sf.g4a
 wm_prog.h: packed_yuv_wm.g4a
 	intel-gen4asm -o wm_prog.h packed_yuv_wm.g4a
+exa_sf_prog.h: exa_sf.g4a
+	intel-gen4asm -o exa_sf_prog.h exa_sf.g4a
+exa_sf_mask_prog.h: exa_sf_mask.g4a
+	intel-gen4asm -o exa_sf_mask_prog.h exa_sf_mask.g4a
 exa_wm_nomask_prog.h: exa_wm_nomask.g4a
 	intel-gen4asm -o exa_wm_nomask_prog.h exa_wm_nomask.g4a
+exa_wm_masknoca_prog.h: exa_wm_masknoca.g4a
+	intel-gen4asm -o exa_wm_masknoca_prog.h exa_wm_masknoca.g4a
 endif
 
 if DRI
diff-tree 2f2c443e971faa54ffcf751b6acb444e8e7875ce (from 0bf04fe78a8a915310ef8a90f5c7872be7476e2e)
Author: Wang Zhenyu <zhenyu.z.wang at intel.com>
Date:   Wed Dec 6 10:43:29 2006 +0800

    [PATCH] Formats fixes
    
    We should use card_fmt for src/mask picture, and use dest color
    buffer format helper. Also fix wrong name for G965 texture formats,
    and pict_x1r5g5b5 isn't supported by sampler engine.
    
    Signed-off-by: Keith Packard <keithp at neko.keithp.com>

diff --git a/src/i965_exa_render.c b/src/i965_exa_render.c
index 7e9c1e3..583bc26 100644
--- a/src/i965_exa_render.c
+++ b/src/i965_exa_render.c
@@ -121,13 +121,12 @@ static struct blendinfo I965BlendOp[] = 
 
 /* FIXME: surface format defined in brw_defines.h, shared Sampling engine 1.7.2*/
 static struct formatinfo I965TexFormats[] = {
-        {PICT_a8r8g8b8, BRW_SURFACEFORMAT_R8G8B8A8_UNORM },
-        {PICT_x8r8g8b8, BRW_SURFACEFORMAT_R8G8B8X8_UNORM },
-        {PICT_a8b8g8r8, BRW_SURFACEFORMAT_B8G8R8A8_UNORM },
-        {PICT_x8b8g8r8, BRW_SURFACEFORMAT_B8G8R8X8_UNORM },
+        {PICT_a8r8g8b8, BRW_SURFACEFORMAT_B8G8R8A8_UNORM },
+        {PICT_x8r8g8b8, BRW_SURFACEFORMAT_B8G8R8X8_UNORM },
+        {PICT_a8b8g8r8, BRW_SURFACEFORMAT_R8G8B8A8_UNORM },
+        {PICT_x8b8g8r8, BRW_SURFACEFORMAT_R8G8B8X8_UNORM },
         {PICT_r5g6b5,   BRW_SURFACEFORMAT_B5G6R5_UNORM   },
         {PICT_a1r5g5b5, BRW_SURFACEFORMAT_B5G5R5A1_UNORM },
-        {PICT_x1r5g5b5, BRW_SURFACEFORMAT_B5G5R5X1_UNORM },
         {PICT_a8,       BRW_SURFACEFORMAT_A8_UNORM	 },
 };
 
@@ -366,6 +365,16 @@ static const CARD32 ps_kernel_static_mas
 #include "exa_wm_masknoca_prog.h"
 };
 
+static CARD32 i965_get_card_format(PicturePtr pPict) 
+{
+	int i;
+        for (i = 0; i < sizeof(I965TexFormats) / sizeof(I965TexFormats[0]); i++) {
+            if (I965TexFormats[i].fmt == pPict->format)
+                break;
+        }
+	return I965TexFormats[i].card_fmt;
+}
+
 Bool
 I965EXAPrepareComposite(int op, PicturePtr pSrcPicture,
 			PicturePtr pMaskPicture, PicturePtr pDstPicture,
@@ -376,10 +385,7 @@ I965EXAPrepareComposite(int op, PictureP
     CARD32 src_offset, src_pitch;
     CARD32 mask_offset = 0, mask_pitch = 0;
     CARD32 dst_format, dst_offset, dst_pitch;
- 
-ErrorF("i965 prepareComposite\n");
 
-    I965GetDestFormat(pDstPicture, &dst_format);
     src_offset = exaGetPixmapOffset(pSrc);
     src_pitch = exaGetPixmapPitch(pSrc);
     dst_offset = exaGetPixmapOffset(pDst);
@@ -590,11 +596,9 @@ ErrorF("i965 prepareComposite\n");
    memset(dest_surf_state, 0, sizeof(*dest_surf_state));
    dest_surf_state->ss0.surface_type = BRW_SURFACE_2D;
    dest_surf_state->ss0.data_return_format = BRW_SURFACERETURNFORMAT_FLOAT32;
-   if (pDst->drawable.bitsPerPixel == 16) {
-      dest_surf_state->ss0.surface_format = BRW_SURFACEFORMAT_B5G6R5_UNORM;
-   } else {
-      dest_surf_state->ss0.surface_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
-   }
+   I965GetDestFormat(pDstPicture, &dst_format);
+   dest_surf_state->ss0.surface_format = dst_format;
+
    dest_surf_state->ss0.writedisable_alpha = 0;
    dest_surf_state->ss0.writedisable_red = 0;
    dest_surf_state->ss0.writedisable_green = 0;
@@ -615,12 +619,7 @@ ErrorF("i965 prepareComposite\n");
    /* Set up the source surface state buffer */
    memset(src_surf_state, 0, sizeof(*src_surf_state));
    src_surf_state->ss0.surface_type = BRW_SURFACE_2D;
-   if (pSrc->drawable.bitsPerPixel == 8)
-      src_surf_state->ss0.surface_format = BRW_SURFACEFORMAT_A8_UNORM; //XXX? 
-   else if (pSrc->drawable.bitsPerPixel == 16)
-      src_surf_state->ss0.surface_format = BRW_SURFACEFORMAT_B5G6R5_UNORM;
-   else 
-      src_surf_state->ss0.surface_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
+   src_surf_state->ss0.surface_format = i965_get_card_format(pSrcPicture);
 
    src_surf_state->ss0.writedisable_alpha = 0;
    src_surf_state->ss0.writedisable_red = 0;
@@ -643,12 +642,7 @@ ErrorF("i965 prepareComposite\n");
    if (pMask) {
    	memset(mask_surf_state, 0, sizeof(*mask_surf_state));
 	mask_surf_state->ss0.surface_type = BRW_SURFACE_2D;
-   	if (pMask->drawable.bitsPerPixel == 8)
-      	    mask_surf_state->ss0.surface_format = BRW_SURFACEFORMAT_A8_UNORM; //XXX? 
-   	else if (pMask->drawable.bitsPerPixel == 16)
-      	    mask_surf_state->ss0.surface_format = BRW_SURFACEFORMAT_B5G6R5_UNORM;
-   	else 
-      	    mask_surf_state->ss0.surface_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
+   	mask_surf_state->ss0.surface_format = i965_get_card_format(pMaskPicture);
 
    	mask_surf_state->ss0.writedisable_alpha = 0;
    	mask_surf_state->ss0.writedisable_red = 0;
diff-tree 0bf04fe78a8a915310ef8a90f5c7872be7476e2e (from 5c461063cde68092e778c44ac6abd9129cd8019e)
Author: Wang Zhenyu <zhenyu.z.wang at intel.com>
Date:   Mon Dec 4 15:48:04 2006 +0800

    [PATCH] set correct default border color
    
    Signed-off-by: Keith Packard <keithp at neko.keithp.com>

diff --git a/src/i965_exa_render.c b/src/i965_exa_render.c
index 2d1ce5f..7e9c1e3 100644
--- a/src/i965_exa_render.c
+++ b/src/i965_exa_render.c
@@ -691,10 +691,10 @@ ErrorF("i965 prepareComposite\n");
    }
 
    memset(default_color_state, 0, sizeof(*default_color_state));
-   default_color_state->color[0] = 1.0; /* RGBA format */
-   default_color_state->color[1] = 0.0; 
-   default_color_state->color[2] = 0.0; 
-   default_color_state->color[3] = 0.0; 
+   default_color_state->color[0] = 0.0; /* R */
+   default_color_state->color[1] = 0.0; /* G */
+   default_color_state->color[2] = 0.0; /* B */
+   default_color_state->color[3] = 1.0; /* A */
 
    src_sampler_state->ss0.default_color_mode = 0; /* GL mode */
 
diff-tree 5c461063cde68092e778c44ac6abd9129cd8019e (from 89a42d489bd370b89e5ff4e01f026b4d64723cd8)
Author: Wang Zhenyu <zhenyu.z.wang at intel.com>
Date:   Mon Dec 4 15:47:31 2006 +0800

    [PATCH] fix typo in ps kernel
    
    fix corrupt in some subspans
    
    Signed-off-by: Keith Packard <keithp at neko.keithp.com>

diff --git a/src/exa_wm_masknoca.g4a b/src/exa_wm_masknoca.g4a
index 195203c..c2049fd 100644
--- a/src/exa_wm_masknoca.g4a
+++ b/src/exa_wm_masknoca.g4a
@@ -51,10 +51,10 @@ mov (1) g8.20<1>F g1.14<0,1,0>UW { align
 add (1) g8.24<1>F g1.14<0,1,0>UW 1UB { align1 };
 add (1) g8.28<1>F g1.14<0,1,0>UW 1UB { align1 };
     /* Set up ss2.x coordinates */
-mov (1) g9<1>F g1.16<0,1,0>UW { align1 };
-add (1) g9.4<1>F g1.16<0,1,0>UW 1UB { align1 };
-mov (1) g9.8<1>F g1.16<0,1,0>UW { align1 };
-add (1) g9.12<1>F g1.16<0,1,0>UW 1UB { align1 };
+mov (1) g7<1>F g1.16<0,1,0>UW { align1 };
+add (1) g7.4<1>F g1.16<0,1,0>UW 1UB { align1 };
+mov (1) g7.8<1>F g1.16<0,1,0>UW { align1 };
+add (1) g7.12<1>F g1.16<0,1,0>UW 1UB { align1 };
     /* Set up ss2.y coordinates */
 mov (1) g9<1>F g1.18<0,1,0>UW { align1 };
 mov (1) g9.4<1>F g1.18<0,1,0>UW { align1 };
diff --git a/src/exa_wm_masknoca_prog.h b/src/exa_wm_masknoca_prog.h
index 66eb960..5fcf3b5 100644
--- a/src/exa_wm_masknoca_prog.h
+++ b/src/exa_wm_masknoca_prog.h
@@ -14,10 +14,10 @@
    { 0x00000001, 0x2114013d, 0x0000002e, 0x00000000 },
    { 0x00000040, 0x21180d3d, 0x0000002e, 0x00000001 },
    { 0x00000040, 0x211c0d3d, 0x0000002e, 0x00000001 },
-   { 0x00000001, 0x2120013d, 0x00000030, 0x00000000 },
-   { 0x00000040, 0x21240d3d, 0x00000030, 0x00000001 },
-   { 0x00000001, 0x2128013d, 0x00000030, 0x00000000 },
-   { 0x00000040, 0x212c0d3d, 0x00000030, 0x00000001 },
+   { 0x00000001, 0x20e0013d, 0x00000030, 0x00000000 },
+   { 0x00000040, 0x20e40d3d, 0x00000030, 0x00000001 },
+   { 0x00000001, 0x20e8013d, 0x00000030, 0x00000000 },
+   { 0x00000040, 0x20ec0d3d, 0x00000030, 0x00000001 },
    { 0x00000001, 0x2120013d, 0x00000032, 0x00000000 },
    { 0x00000001, 0x2124013d, 0x00000032, 0x00000000 },
    { 0x00000040, 0x21280d3d, 0x00000032, 0x00000001 },
diff-tree 89a42d489bd370b89e5ff4e01f026b4d64723cd8 (from 01bfa4fa6fc0ceec8581676e5d72c68dd71efa96)
Author: Wang Zhenyu <zhenyu.z.wang at intel.com>
Date:   Wed Nov 29 17:16:46 2006 +0800

    [PATCH] shut up warning
    
    Signed-off-by: Keith Packard <keithp at neko.keithp.com>

diff --git a/src/i965_exa_render.c b/src/i965_exa_render.c
index 6f2bc84..2d1ce5f 100644
--- a/src/i965_exa_render.c
+++ b/src/i965_exa_render.c
@@ -1011,10 +1011,8 @@ I965EXAComposite(PixmapPtr pDst, int src
 
     srcXend = srcX + w;
     srcYend = srcY + h;
-    if (pMask) {
-        maskXend = maskX + w;
-        maskYend = maskY + h;
-    }
+    maskXend = maskX + w;
+    maskYend = maskY + h;
     if (is_transform[0]) {
         v.vector[0] = IntToxFixed(srcX);
         v.vector[1] = IntToxFixed(srcY);
diff-tree 01bfa4fa6fc0ceec8581676e5d72c68dd71efa96 (from 79018bb47c43510d59c592592f06204189bd12dc)
Author: Wang Zhenyu <zhenyu.z.wang at intel.com>
Date:   Wed Nov 29 17:14:55 2006 +0800

    [PATCH] fix alpha blending state
    
    Signed-off-by: Keith Packard <keithp at neko.keithp.com>

diff --git a/src/i965_exa_render.c b/src/i965_exa_render.c
index c4a3f97..6f2bc84 100644
--- a/src/i965_exa_render.c
+++ b/src/i965_exa_render.c
@@ -562,21 +562,26 @@ ErrorF("i965 prepareComposite\n");
    cc_state->cc0.stencil_enable = 0;   /* disable stencil */
    cc_state->cc2.depth_test = 0;       /* disable depth test */
    cc_state->cc2.logicop_enable = 0;   /* disable logic op */
-   cc_state->cc3.ia_blend_enable = 0;  /* blend alpha just like colors */
+   cc_state->cc3.ia_blend_enable = 1;  /* blend alpha just like colors */
    cc_state->cc3.blend_enable = 1;     /* enable color blend */
    cc_state->cc3.alpha_test = 0;       /* disable alpha test */
    cc_state->cc4.cc_viewport_state_offset = (state_base_offset + cc_viewport_offset) >> 5;
    cc_state->cc5.dither_enable = 0;    /* disable dither */
-//   cc_state->cc5.logicop_func = 0xc;   /* COPY */
-//   cc_state->cc5.statistics_enable = 1;
-//   cc_state->cc5.ia_blend_function = BRW_BLENDFUNCTION_ADD;
-//   cc_state->cc5.ia_src_blend_factor = BRW_BLENDFACTOR_ONE;
-//   cc_state->cc5.ia_dest_blend_factor = BRW_BLENDFACTOR_ONE;
-   cc_state->cc6.blend_function = BRW_BLENDFUNCTION_ADD;
+   cc_state->cc5.logicop_func = 0xc;   /* COPY */
+   cc_state->cc5.statistics_enable = 1;
+   cc_state->cc5.ia_blend_function = BRW_BLENDFUNCTION_ADD;
    I965GetBlendCntl(op, pMaskPicture, pDstPicture->format, 
 		    &src_blend, &dst_blend);
+   /* XXX: alpha blend factor should be same as color, but check
+	   for CA case in future */
+   cc_state->cc5.ia_src_blend_factor = src_blend;
+   cc_state->cc5.ia_dest_blend_factor = dst_blend;
+   cc_state->cc6.blend_function = BRW_BLENDFUNCTION_ADD;
    cc_state->cc6.src_blend_factor = src_blend;
    cc_state->cc6.dest_blend_factor = dst_blend;
+   cc_state->cc6.clamp_post_alpha_blend = 1; 
+   cc_state->cc6.clamp_pre_alpha_blend = 1; 
+   cc_state->cc6.clamp_range = 0;  /* clamp range [0,1] */
 
    /* Upload system kernel */
    memcpy (sip_kernel, sip_kernel_static, sizeof (sip_kernel_static));
diff-tree 79018bb47c43510d59c592592f06204189bd12dc (from a5b9b438469f171b002fa0b99d8cab83e51ec968)
Author: Wang Zhenyu <zhenyu.z.wang at intel.com>
Date:   Wed Nov 29 17:05:32 2006 +0800

    [PATCH] Add in sf/wm program for mask picture without CA
    
    Signed-off-by: Keith Packard <keithp at neko.keithp.com>

diff --git a/src/exa_sf_mask.g4a b/src/exa_sf_mask.g4a
new file mode 100644
index 0000000..ab519ce
--- /dev/null
+++ b/src/exa_sf_mask.g4a
@@ -0,0 +1,53 @@
+
+/* FIXME how to setup second coeffient for mask tex coord */
+
+/* 
+   g3 (v0) { u0, v0, 1.0, 1.0 }  ==> {u0, v0, 1.0, 1.0, mu0, mv0, 1.0, 1.0}  Co[0](u0) Co[1](v0) Co[2](mu0) Co[3](mv0)
+   g4 (v1) { u1, v1, 1.0, 1.0 }  ==> {u1, v1, 1.0, 1.0, mu1, mv1, 1.0, 1.0}
+   g5 (v2) { u2, v2 }  ==> (u2, v2, mu2, mv2}
+   g6      { 1/(x1-x0), 1/(y1-y0) }
+   g7      { u1-u0, v1-v0, 0, 0}  ==>{u1-u0, v1-v0,0, 0, mu1-mu0, mv1-mv0, 0, 0}
+	   -> { (u1-u0)/(x1-x0), (v1-v0)/(y1-y0) }  ==>{(u1-u0)/(x1-x0), (v1-v0)/(y1-y0),(mu1-mu0)/(x1-x0), (mv1-mv0)/(y1-y0)
+		Cx,		 Cy 			Cx[0],		 Cy[0],		 Cx[1], 	    Cy[1]
+ */
+
+/* assign Cx[0], Cx[1] to src, same to Cy, Co 
+          Cx[2], Cx[3] to mask, same to Cy, Co */
+
+send (1) 0 g6<1>F g1.12<0,1,0>F math inv scalar mlen 1 rlen 1 { align1 };
+send (1) 0 g6.4<1>F g1.20<0,1,0>F math inv scalar mlen 1 rlen 1 { align1 };
+add (8) g7<1>F g4<8,8,1>F -g3<8,8,1>F { align1 };
+/* Cx[0] */
+mul (1) g7<1>F g7<0,1,0>F g6<0,1,0>F { align1 };
+/* Cy[0] */
+mul (1) g7.4<1>F g7.4<0,1,0>F g6.4<0,1,0>F { align1 };
+/* Cx[2] */
+mul (1) g7.16<1>F g7.16<0,1,0>F g6<0,1,0>F { align1 };
+/* Cy[2] */
+mul (1) g7.20<1>F g7.20<0,1,0>F g6.4<0,1,0>F { align1 };
+
+/* src Cx[0], Cx[1] */
+mov (8) m1<1>F g7<0,1,0>F { align1 };
+/* mask Cx[2], Cx[3] */
+mov (1) m1.8<1>F g7.16<0,1,0>F { align1 };
+mov (1) m1.12<1>F g7.16<0,1,0>F { align1 };
+/* src Cy[0], Cy[1] */
+mov (8) m2<1>F g7.4<0,1,0>F { align1 };
+/* mask Cy[2], Cy[3] */
+mov (1) m2.8<1>F g7.20<0,1,0>F { align1 };
+mov (1) m2.12<1>F g7.20<0,1,0>F { align1 };
+/* src Co[0], Co[1] */
+mov (8) m3<1>F g3<8,8,1>F { align1 };
+/* mask Co[2], Co[3] */
+mov (1) m3.8<1>F g3.16<0,1,0>F { align1 };
+mov (1) m3.12<1>F g3.20<0,1,0>F { align1 };
+
+send (8) 0 null g0<8,8,1>F urb 0 transpose used complete mlen 4 rlen 0 { align1 EOT };
+nop;
+nop;
+nop;
+nop;
+nop;
+nop;
+nop;
+nop;
diff --git a/src/exa_sf_mask_prog.h b/src/exa_sf_mask_prog.h
new file mode 100644
index 0000000..cd7f460
--- /dev/null
+++ b/src/exa_sf_mask_prog.h
@@ -0,0 +1,25 @@
+   { 0x00000031, 0x20c01fbd, 0x0000002c, 0x01110081 },
+   { 0x00000031, 0x20c41fbd, 0x00000034, 0x01110081 },
+   { 0x00600040, 0x20e077bd, 0x008d0080, 0x008d4060 },
+   { 0x00000041, 0x20e077bd, 0x000000e0, 0x000000c0 },
+   { 0x00000041, 0x20e477bd, 0x000000e4, 0x000000c4 },
+   { 0x00000041, 0x20f077bd, 0x000000f0, 0x000000c0 },
+   { 0x00000041, 0x20f477bd, 0x000000f4, 0x000000c4 },
+   { 0x00600001, 0x202003be, 0x000000e0, 0x00000000 },
+   { 0x00000001, 0x202803be, 0x000000f0, 0x00000000 },
+   { 0x00000001, 0x202c03be, 0x000000f0, 0x00000000 },
+   { 0x00600001, 0x204003be, 0x000000e4, 0x00000000 },
+   { 0x00000001, 0x204803be, 0x000000f4, 0x00000000 },
+   { 0x00000001, 0x204c03be, 0x000000f4, 0x00000000 },
+   { 0x00600001, 0x206003be, 0x008d0060, 0x00000000 },
+   { 0x00000001, 0x206803be, 0x00000070, 0x00000000 },
+   { 0x00000001, 0x206c03be, 0x00000074, 0x00000000 },
+   { 0x00600031, 0x20001fbc, 0x008d0000, 0x8640c800 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
diff --git a/src/exa_wm_masknoca.g4a b/src/exa_wm_masknoca.g4a
new file mode 100644
index 0000000..195203c
--- /dev/null
+++ b/src/exa_wm_masknoca.g4a
@@ -0,0 +1,202 @@
+/*
+ * This's for exa composite operation in no mask picture case.
+ * The simplest case is just sending what src picture has to dst picture.
+ * XXX: This is still experimental, and should be fixed to support multiple texture
+ * map, and conditional mul actions. 
+ */
+
+/* I think this should be same as in g4a program for texture video,
+   as we also use 16-pixel dispatch. and SF scale in g3 is useful for us. */
+
+/* The initial payload of the thread is always g0.
+ * WM_URB (incoming URB entries) is g3
+   As mask texture coeffient needs extra setup urb starting from g4, we should
+   shift this location. 
+
+ * X0_R is g4->g6
+ * X1_R is g5->g7
+ * Y0_R is g6->g8
+ * Y1_R is g7->g9
+
+     * X0: {ss0.x, ss0.x+1, ss0.x,   ss0.x+1, ss1.x, ss1.x+1, ss1.x,   ss1.x+y}
+     * Y0: {ss0.y, ss0.y,   ss0.y+1, ss0.y+1, ss1.y, ss1.y,   ss1.y+1, ss1.y+1}
+     * X1: {ss2.x, ss2.x+1, ss2.x,   ss2.x+1, ss3.x, ss3.x+1, ss3.x,   ss3.x+y}
+     * Y1: {ss2.y, ss2.y,   ss2.y+1, ss2.y+1, ss3.y, ss3.y,   ss3.y+1, ss3.y+1}
+ */
+
+/* multitexture program with src and mask texture */
+/* - load src texture */
+/* - load mask texture */
+/* - mul src.X with mask's alpha */
+/* - write out src.X */
+
+    /* Set up ss0.x coordinates*/
+mov (1) g6<1>F g1.8<0,1,0>UW { align1 };
+add (1) g6.4<1>F g1.8<0,1,0>UW 1UB { align1 };
+mov (1) g6.8<1>F g1.8<0,1,0>UW { align1 };
+add (1) g6.12<1>F g1.8<0,1,0>UW 1UB { align1 };
+    /* Set up ss0.y coordinates */
+mov (1) g8<1>F g1.10<0,1,0>UW { align1 };
+mov (1) g8.4<1>F g1.10<0,1,0>UW { align1 };
+add (1) g8.8<1>F g1.10<0,1,0>UW 1UB { align1 };
+add (1) g8.12<1>F g1.10<0,1,0>UW 1UB { align1 };
+    /* set up ss1.x coordinates */
+mov (1) g6.16<1>F g1.12<0,1,0>UW { align1 };
+add (1) g6.20<1>F g1.12<0,1,0>UW 1UB { align1 };
+mov (1) g6.24<1>F g1.12<0,1,0>UW { align1 };
+add (1) g6.28<1>F g1.12<0,1,0>UW 1UB { align1 };
+    /* set up ss1.y coordinates */
+mov (1) g8.16<1>F g1.14<0,1,0>UW { align1 };
+mov (1) g8.20<1>F g1.14<0,1,0>UW { align1 };
+add (1) g8.24<1>F g1.14<0,1,0>UW 1UB { align1 };
+add (1) g8.28<1>F g1.14<0,1,0>UW 1UB { align1 };
+    /* Set up ss2.x coordinates */
+mov (1) g9<1>F g1.16<0,1,0>UW { align1 };
+add (1) g9.4<1>F g1.16<0,1,0>UW 1UB { align1 };
+mov (1) g9.8<1>F g1.16<0,1,0>UW { align1 };
+add (1) g9.12<1>F g1.16<0,1,0>UW 1UB { align1 };
+    /* Set up ss2.y coordinates */
+mov (1) g9<1>F g1.18<0,1,0>UW { align1 };
+mov (1) g9.4<1>F g1.18<0,1,0>UW { align1 };
+add (1) g9.8<1>F g1.18<0,1,0>UW 1UB { align1 };
+add (1) g9.12<1>F g1.18<0,1,0>UW 1UB { align1 };
+    /* Set up ss3.x coordinates */
+mov (1) g7.16<1>F g1.20<0,1,0>UW { align1 };
+add (1) g7.20<1>F g1.20<0,1,0>UW 1UB { align1 };
+mov (1) g7.24<1>F g1.20<0,1,0>UW { align1 };
+add (1) g7.28<1>F g1.20<0,1,0>UW 1UB { align1 };
+    /* Set up ss3.y coordinates */
+mov (1) g9.16<1>F g1.22<0,1,0>UW { align1 };
+mov (1) g9.20<1>F g1.22<0,1,0>UW { align1 };
+add (1) g9.24<1>F g1.22<0,1,0>UW 1UB { align1 };
+add (1) g9.28<1>F g1.22<0,1,0>UW 1UB { align1 };
+
+    /* Now, map these screen space coordinates into texture coordinates. */
+/* This is for src texture */
+/* I don't want to change origin ssX coords, as it will be used later in mask */
+/* so store tex coords in g10, g11, g12, g13 */
+
+    /* subtract screen-space X origin of vertex 0. */
+add (8) g10<1>F g6<8,8,1>F -g1<0,1,0>F { align1 };
+add (8) g11<1>F g7<8,8,1>F -g1<0,1,0>F { align1 };
+    /* scale by texture X increment */
+/* Cx[0] */
+mul (8) g10<1>F g10<8,8,1>F g3<0,1,0>F { align1 };
+mul (8) g11<1>F g11<8,8,1>F g3<0,1,0>F { align1 };
+    /* add in texture X offset */
+/* Co[0] */
+add (8) g10<1>F g10<8,8,1>F g3.12<0,1,0>F { align1 };
+add (8) g11<1>F g11<8,8,1>F g3.12<0,1,0>F { align1 };
+    /* subtract screen-space Y origin of vertex 0. */
+add (8) g12<1>F g8<8,8,1>F -g1.4<0,1,0>F { align1 };
+add (8) g13<1>F g9<8,8,1>F -g1.4<0,1,0>F { align1 };
+    /* scale by texture Y increment */
+/* Cy[0] */
+mul (8) g12<1>F g12<8,8,1>F g3.4<0,1,0>F { align1 };
+mul (8) g13<1>F g13<8,8,1>F g3.4<0,1,0>F { align1 };
+    /* add in texture Y offset */
+/* Co[1] */
+add (8) g12<1>F g12<8,8,1>F g3.28<0,1,0>F { align1 };
+add (8) g13<1>F g13<8,8,1>F g3.28<0,1,0>F { align1 };
+
+/* prepare sampler read back gX register, which would be written back to output */
+
+/* use simd16 sampler, param 0 is u, param 1 is v. */
+/* 'payload' loading, assuming tex coord start from g4 */
+mov (8) m1<1>F g10<8,8,1>F { align1 };
+mov (8) m2<1>F g11<8,8,1>F { align1 }; /* param 0 u in m1, m2 */
+mov (8) m3<1>F g12<8,8,1>F { align1 };
+mov (8) m4<1>F g13<8,8,1>F { align1 }; /* param 1 v in m3, m4 */
+
+/* m0 will be copied with g0, as it contains send desc */
+/* emit sampler 'send' cmd */
+
+/* src texture readback: g14-g21 */
+send (16) 0 		/* msg reg index */
+	g14<1>UW 	/* readback */
+	g0<8,8,1>UW  	/* copy to msg start reg*/
+	sampler (1,0,F)  /* sampler message description, 
+				(binding_table,sampler_index,datatype). 
+			    here(src->dst) we should use src_sampler and 
+			    src_surface */
+	mlen 5 rlen 8 { align1 };   /* required message len 5, readback len 8 */
+
+mov (8) g21<1>UD g21<8,8,1>UD { align1 };  /* wait sampler return */
+
+/* sampler mask texture, use g10, g11, g12, g13 */
+    /* subtract screen-space X origin of vertex 0. */
+add (8) g10<1>F g6<8,8,1>F -g1<0,1,0>F { align1 };
+add (8) g11<1>F g7<8,8,1>F -g1<0,1,0>F { align1 };
+    /* scale by texture X increment */
+/* Cx[2] */
+mul (8) g10<1>F g10<8,8,1>F g4<0,1,0>F { align1 };
+mul (8) g11<1>F g11<8,8,1>F g4<0,1,0>F { align1 };
+    /* add in texture X offset */
+/* Co[2] */
+add (8) g10<1>F g10<8,8,1>F g4.12<0,1,0>F { align1 };
+add (8) g11<1>F g11<8,8,1>F g4.12<0,1,0>F { align1 };
+    /* subtract screen-space Y origin of vertex 0. */
+add (8) g12<1>F g8<8,8,1>F -g1.4<0,1,0>F { align1 };
+add (8) g13<1>F g9<8,8,1>F -g1.4<0,1,0>F { align1 };
+    /* scale by texture Y increment */
+/* Cy[2] */
+mul (8) g12<1>F g12<8,8,1>F g4.4<0,1,0>F { align1 };
+mul (8) g13<1>F g13<8,8,1>F g4.4<0,1,0>F { align1 };
+    /* add in texture Y offset */
+/* Co[3] */
+add (8) g12<1>F g12<8,8,1>F g4.28<0,1,0>F { align1 };
+add (8) g13<1>F g13<8,8,1>F g4.28<0,1,0>F { align1 };
+
+mov (8) m1<1>F g10<8,8,1>F { align1 };
+mov (8) m2<1>F g11<8,8,1>F { align1 }; 
+mov (8) m3<1>F g12<8,8,1>F { align1 };
+mov (8) m4<1>F g13<8,8,1>F { align1 };
+
+/* mask sampler g22-g29 */
+/* binding_table (2), sampler (1) */
+send (16) 0 g22<1>UW g0<8,8,1>UW sampler (2,1,F) mlen 5 rlen 8 { align1 };
+mov (8) g29<1>UD g29<8,8,1>UD { align1 };  /* wait sampler return */
+
+/* mul mask's alpha channel g28,g29 to src (g14-g21), then write out src */
+mul (8) g14<1>F g14<8,8,1>F g28<8,8,1>F { align1 };
+mul (8) g15<1>F g15<8,8,1>F g29<8,8,1>F { align1 };
+mul (8) g16<1>F g16<8,8,1>F g28<8,8,1>F { align1 };
+mul (8) g17<1>F g17<8,8,1>F g29<8,8,1>F { align1 };
+mul (8) g18<1>F g18<8,8,1>F g28<8,8,1>F { align1 };
+mul (8) g19<1>F g19<8,8,1>F g29<8,8,1>F { align1 };
+mul (8) g20<1>F g20<8,8,1>F g28<8,8,1>F { align1 };
+mul (8) g21<1>F g21<8,8,1>F g29<8,8,1>F { align1 };
+
+/* prepare data in m2-m5 for subspan(1,0), m6-m9 for subspan(3,2), then it's ready to write */
+mov (8) m2<1>F g14<8,8,1>F { align1 };
+mov (8) m3<1>F g16<8,8,1>F { align1 };
+mov (8) m4<1>F g18<8,8,1>F { align1 };
+mov (8) m5<1>F g20<8,8,1>F { align1 };
+mov (8) m6<1>F g15<8,8,1>F { align1 };
+mov (8) m7<1>F g17<8,8,1>F { align1 };
+mov (8) m8<1>F g19<8,8,1>F { align1 };
+mov (8) m9<1>F g21<8,8,1>F { align1 };
+
+/* m0, m1 are all direct passed by PS thread payload */
+mov (8) m1<1>UD g1<8,8,1>UD { align1 mask_disable };
+
+/* write */
+send (16) 0 acc0<1>UW g0<8,8,1>UW write (
+	0,  /* binding_table */
+	8,  /* pixel scordboard clear, msg type simd16 single source */
+	4,  /* render target write */
+	0   /* no write commit message */
+	) 
+	mlen 10
+	rlen 0
+	{ align1 EOT };
+
+nop;
+nop;
+nop;
+nop;
+nop;
+nop;
+nop;
+nop;
+nop;
diff --git a/src/exa_wm_masknoca_prog.h b/src/exa_wm_masknoca_prog.h
new file mode 100644
index 0000000..66eb960
--- /dev/null
+++ b/src/exa_wm_masknoca_prog.h
@@ -0,0 +1,95 @@
+   { 0x00000001, 0x20c0013d, 0x00000028, 0x00000000 },
+   { 0x00000040, 0x20c40d3d, 0x00000028, 0x00000001 },
+   { 0x00000001, 0x20c8013d, 0x00000028, 0x00000000 },
+   { 0x00000040, 0x20cc0d3d, 0x00000028, 0x00000001 },
+   { 0x00000001, 0x2100013d, 0x0000002a, 0x00000000 },
+   { 0x00000001, 0x2104013d, 0x0000002a, 0x00000000 },
+   { 0x00000040, 0x21080d3d, 0x0000002a, 0x00000001 },
+   { 0x00000040, 0x210c0d3d, 0x0000002a, 0x00000001 },
+   { 0x00000001, 0x20d0013d, 0x0000002c, 0x00000000 },
+   { 0x00000040, 0x20d40d3d, 0x0000002c, 0x00000001 },
+   { 0x00000001, 0x20d8013d, 0x0000002c, 0x00000000 },
+   { 0x00000040, 0x20dc0d3d, 0x0000002c, 0x00000001 },
+   { 0x00000001, 0x2110013d, 0x0000002e, 0x00000000 },
+   { 0x00000001, 0x2114013d, 0x0000002e, 0x00000000 },
+   { 0x00000040, 0x21180d3d, 0x0000002e, 0x00000001 },
+   { 0x00000040, 0x211c0d3d, 0x0000002e, 0x00000001 },
+   { 0x00000001, 0x2120013d, 0x00000030, 0x00000000 },
+   { 0x00000040, 0x21240d3d, 0x00000030, 0x00000001 },
+   { 0x00000001, 0x2128013d, 0x00000030, 0x00000000 },
+   { 0x00000040, 0x212c0d3d, 0x00000030, 0x00000001 },
+   { 0x00000001, 0x2120013d, 0x00000032, 0x00000000 },
+   { 0x00000001, 0x2124013d, 0x00000032, 0x00000000 },
+   { 0x00000040, 0x21280d3d, 0x00000032, 0x00000001 },
+   { 0x00000040, 0x212c0d3d, 0x00000032, 0x00000001 },
+   { 0x00000001, 0x20f0013d, 0x00000034, 0x00000000 },
+   { 0x00000040, 0x20f40d3d, 0x00000034, 0x00000001 },
+   { 0x00000001, 0x20f8013d, 0x00000034, 0x00000000 },
+   { 0x00000040, 0x20fc0d3d, 0x00000034, 0x00000001 },
+   { 0x00000001, 0x2130013d, 0x00000036, 0x00000000 },
+   { 0x00000001, 0x2134013d, 0x00000036, 0x00000000 },
+   { 0x00000040, 0x21380d3d, 0x00000036, 0x00000001 },
+   { 0x00000040, 0x213c0d3d, 0x00000036, 0x00000001 },
+   { 0x00600040, 0x214077bd, 0x008d00c0, 0x00004020 },
+   { 0x00600040, 0x216077bd, 0x008d00e0, 0x00004020 },
+   { 0x00600041, 0x214077bd, 0x008d0140, 0x00000060 },
+   { 0x00600041, 0x216077bd, 0x008d0160, 0x00000060 },
+   { 0x00600040, 0x214077bd, 0x008d0140, 0x0000006c },
+   { 0x00600040, 0x216077bd, 0x008d0160, 0x0000006c },
+   { 0x00600040, 0x218077bd, 0x008d0100, 0x00004024 },
+   { 0x00600040, 0x21a077bd, 0x008d0120, 0x00004024 },
+   { 0x00600041, 0x218077bd, 0x008d0180, 0x00000064 },
+   { 0x00600041, 0x21a077bd, 0x008d01a0, 0x00000064 },
+   { 0x00600040, 0x218077bd, 0x008d0180, 0x0000007c },
+   { 0x00600040, 0x21a077bd, 0x008d01a0, 0x0000007c },
+   { 0x00600001, 0x202003be, 0x008d0140, 0x00000000 },
+   { 0x00600001, 0x204003be, 0x008d0160, 0x00000000 },
+   { 0x00600001, 0x206003be, 0x008d0180, 0x00000000 },
+   { 0x00600001, 0x208003be, 0x008d01a0, 0x00000000 },
+   { 0x00800031, 0x21c01d29, 0x008d0000, 0x02580001 },
+   { 0x00600001, 0x22a00021, 0x008d02a0, 0x00000000 },
+   { 0x00600040, 0x214077bd, 0x008d00c0, 0x00004020 },
+   { 0x00600040, 0x216077bd, 0x008d00e0, 0x00004020 },
+   { 0x00600041, 0x214077bd, 0x008d0140, 0x00000080 },
+   { 0x00600041, 0x216077bd, 0x008d0160, 0x00000080 },
+   { 0x00600040, 0x214077bd, 0x008d0140, 0x0000008c },
+   { 0x00600040, 0x216077bd, 0x008d0160, 0x0000008c },
+   { 0x00600040, 0x218077bd, 0x008d0100, 0x00004024 },
+   { 0x00600040, 0x21a077bd, 0x008d0120, 0x00004024 },
+   { 0x00600041, 0x218077bd, 0x008d0180, 0x00000084 },
+   { 0x00600041, 0x21a077bd, 0x008d01a0, 0x00000084 },
+   { 0x00600040, 0x218077bd, 0x008d0180, 0x0000009c },
+   { 0x00600040, 0x21a077bd, 0x008d01a0, 0x0000009c },
+   { 0x00600001, 0x202003be, 0x008d0140, 0x00000000 },
+   { 0x00600001, 0x204003be, 0x008d0160, 0x00000000 },
+   { 0x00600001, 0x206003be, 0x008d0180, 0x00000000 },
+   { 0x00600001, 0x208003be, 0x008d01a0, 0x00000000 },
+   { 0x00800031, 0x22c01d29, 0x008d0000, 0x02580102 },
+   { 0x00600001, 0x23a00021, 0x008d03a0, 0x00000000 },
+   { 0x00600041, 0x21c077bd, 0x008d01c0, 0x008d0380 },
+   { 0x00600041, 0x21e077bd, 0x008d01e0, 0x008d03a0 },
+   { 0x00600041, 0x220077bd, 0x008d0200, 0x008d0380 },
+   { 0x00600041, 0x222077bd, 0x008d0220, 0x008d03a0 },
+   { 0x00600041, 0x224077bd, 0x008d0240, 0x008d0380 },
+   { 0x00600041, 0x226077bd, 0x008d0260, 0x008d03a0 },
+   { 0x00600041, 0x228077bd, 0x008d0280, 0x008d0380 },
+   { 0x00600041, 0x22a077bd, 0x008d02a0, 0x008d03a0 },
+   { 0x00600001, 0x204003be, 0x008d01c0, 0x00000000 },
+   { 0x00600001, 0x206003be, 0x008d0200, 0x00000000 },
+   { 0x00600001, 0x208003be, 0x008d0240, 0x00000000 },
+   { 0x00600001, 0x20a003be, 0x008d0280, 0x00000000 },
+   { 0x00600001, 0x20c003be, 0x008d01e0, 0x00000000 },
+   { 0x00600001, 0x20e003be, 0x008d0220, 0x00000000 },
+   { 0x00600001, 0x210003be, 0x008d0260, 0x00000000 },
+   { 0x00600001, 0x212003be, 0x008d02a0, 0x00000000 },
+   { 0x00600201, 0x20200022, 0x008d0020, 0x00000000 },
+   { 0x00800031, 0x24001d28, 0x008d0000, 0x85a04800 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
diff --git a/src/i965_exa_render.c b/src/i965_exa_render.c
index 4bc90c1..c4a3f97 100644
--- a/src/i965_exa_render.c
+++ b/src/i965_exa_render.c
@@ -344,12 +344,16 @@ static const CARD32 sf_kernel_static[][4
 #include "exa_sf_prog.h"
 };
 
+static const CARD32 sf_kernel_static_mask[][4] = {
+#include "exa_sf_mask_prog.h"
+};
+
 /* ps kernels */
 #define PS_KERNEL_NUM_GRF   32
 #define PS_MAX_THREADS	   32
 /* 1: no mask */
 static const CARD32 ps_kernel_static_nomask [][4] = {
-	#include "exa_wm_nomask_prog.h"
+#include "exa_wm_nomask_prog.h"
 };
 
 /* 2: mask with componentAlpha, src * mask color, XXX: later */
@@ -359,7 +363,7 @@ static const CARD32 ps_kernel_static_mas
 
 /* 3: mask without componentAlpha, src * mask alpha */
 static const CARD32 ps_kernel_static_masknoca [][4] = {
-/*#include "i965_composite_wm_masknoca.h" */
+#include "exa_wm_masknoca_prog.h"
 };
 
 Bool
@@ -375,11 +379,6 @@ I965EXAPrepareComposite(int op, PictureP
  
 ErrorF("i965 prepareComposite\n");
 
-    /* FIXME: fallback in pMask for now, would be enable after finish
-	wm kernel program */
-    if (pMask)
-	I830FALLBACK("No mask support yet.\n");
-
     I965GetDestFormat(pDstPicture, &dst_format);
     src_offset = exaGetPixmapOffset(pSrc);
     src_pitch = exaGetPixmapPitch(pSrc);
@@ -436,7 +435,10 @@ ErrorF("i965 prepareComposite\n");
    /* keep current sf_kernel, which will send one setup urb entry to
 	PS kernel */
    sf_kernel_offset = ALIGN(next_offset, 64);
-   next_offset = sf_kernel_offset + sizeof (sf_kernel_static);
+   if (pMask) 
+       next_offset = sf_kernel_offset + sizeof (sf_kernel_static_mask);
+   else
+       next_offset = sf_kernel_offset + sizeof (sf_kernel_static);
 
    //XXX: ps_kernel may be seperated, fix with offset
    ps_kernel_offset = ALIGN(next_offset, 64);
@@ -746,7 +748,10 @@ ErrorF("i965 prepareComposite\n");
     * calculate dA/dx and dA/dy.  Hand these interpolation coefficients
     * back to SF which then hands pixels off to WM.
     */
-   memcpy (sf_kernel, sf_kernel_static, sizeof (sf_kernel_static));
+   if (pMask) 
+       memcpy (sf_kernel, sf_kernel_static_mask, sizeof (sf_kernel_static));
+   else
+       memcpy (sf_kernel, sf_kernel_static, sizeof (sf_kernel_static));
 
    memset(sf_state, 0, sizeof(*sf_state));
    sf_state->thread0.kernel_start_pointer = 
@@ -780,7 +785,6 @@ ErrorF("i965 prepareComposite\n");
    /* Set up the PS kernel (dispatched by WM) 
     */
     
-    // XXX: replace to texture blend shader, and different cases 
    if (pMask) {
 	if (pMaskPicture->componentAlpha)
    	    memcpy (ps_kernel, ps_kernel_static_maskca, sizeof (ps_kernel_static_maskca));
diff-tree a5b9b438469f171b002fa0b99d8cab83e51ec968 (from b7c1e1656f45e43ea2f9a47f1a487050c0884c22)
Author: Wang Zhenyu <zhenyu.z.wang at intel.com>
Date:   Wed Nov 29 16:52:44 2006 +0800

    [PATCH] misc cleanup for G965 vs/sf/wm states
    
    Signed-off-by: Keith Packard <keithp at neko.keithp.com>

diff --git a/src/i965_exa_render.c b/src/i965_exa_render.c
index 51b2c60..4bc90c1 100644
--- a/src/i965_exa_render.c
+++ b/src/i965_exa_render.c
@@ -266,6 +266,7 @@ I965EXACheckComposite(int op, PicturePtr
 
 #define ALIGN(i,m)    (((i) + (m) - 1) & ~((m) - 1))
 #define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define BRW_GRF_BLOCKS(nreg)    ((nreg + 15) / 16 - 1)
 
 int urb_vs_start, urb_vs_size;
 int urb_gs_start, urb_gs_size;
@@ -336,9 +337,8 @@ static const CARD32 sip_kernel_static[][
  * with the base texture coordinate. It was extracted from the Mesa driver
  */
 
-#define SF_KERNEL_NUM_GRF  10
-#define SF_KERNEL_NUM_URB  8
-#define SF_MAX_THREADS	   4
+#define SF_KERNEL_NUM_GRF  16
+#define SF_MAX_THREADS	   1
 
 static const CARD32 sf_kernel_static[][4] = {
 #include "exa_sf_prog.h"
@@ -468,7 +468,6 @@ ErrorF("i965 prepareComposite\n");
    next_offset = vb_offset + vb_size;
 
    /* And then the general state: */
-   //XXX: fix for texture map and target surface
    dest_surf_offset = ALIGN(next_offset, 32);
    next_offset = dest_surf_offset + sizeof(*dest_surf_state);
 
@@ -534,8 +533,8 @@ ErrorF("i965 prepareComposite\n");
 #define URB_CLIP_ENTRY_SIZE   0
 #define URB_CLIP_ENTRIES      0
    
-#define URB_SF_ENTRY_SIZE     4
-#define URB_SF_ENTRIES	      8
+#define URB_SF_ENTRY_SIZE     2
+#define URB_SF_ENTRIES	      1
 
    urb_vs_start = 0;
    urb_vs_size = URB_VS_ENTRIES * URB_VS_ENTRY_SIZE;
@@ -564,7 +563,6 @@ ErrorF("i965 prepareComposite\n");
    cc_state->cc3.ia_blend_enable = 0;  /* blend alpha just like colors */
    cc_state->cc3.blend_enable = 1;     /* enable color blend */
    cc_state->cc3.alpha_test = 0;       /* disable alpha test */
-   // XXX:cc_viewport needed? 
    cc_state->cc4.cc_viewport_state_offset = (state_base_offset + cc_viewport_offset) >> 5;
    cc_state->cc5.dither_enable = 0;    /* disable dither */
 //   cc_state->cc5.logicop_func = 0xc;   /* COPY */
@@ -585,7 +583,6 @@ ErrorF("i965 prepareComposite\n");
    memset(dest_surf_state, 0, sizeof(*dest_surf_state));
    dest_surf_state->ss0.surface_type = BRW_SURFACE_2D;
    dest_surf_state->ss0.data_return_format = BRW_SURFACERETURNFORMAT_FLOAT32;
-   // XXX: should compare with picture's cpp?...8 bit surf?
    if (pDst->drawable.bitsPerPixel == 16) {
       dest_surf_state->ss0.surface_format = BRW_SURFACEFORMAT_B5G6R5_UNORM;
    } else {
@@ -601,14 +598,12 @@ ErrorF("i965 prepareComposite\n");
    dest_surf_state->ss0.mipmap_layout_mode = 0;
    dest_surf_state->ss0.render_cache_read_mode = 0;
    
-   // XXX: fix to picture address & size
    dest_surf_state->ss1.base_addr = dst_offset;
    dest_surf_state->ss2.height = pDst->drawable.height - 1;
    dest_surf_state->ss2.width = pDst->drawable.width - 1;
    dest_surf_state->ss2.mip_count = 0;
    dest_surf_state->ss2.render_target_rotation = 0;
    dest_surf_state->ss3.pitch = dst_pitch - 1; 
-   // tiled surface?
 
    /* Set up the source surface state buffer */
    memset(src_surf_state, 0, sizeof(*src_surf_state));
@@ -741,8 +736,10 @@ ErrorF("i965 prepareComposite\n");
 
    /* Set up the vertex shader to be disabled (passthrough) */
    memset(vs_state, 0, sizeof(*vs_state));
-   // XXX: vs URB should be defined for VF vertex URB store. done already?
+   vs_state->thread4.nr_urb_entries = URB_VS_ENTRIES;
+   vs_state->thread4.urb_entry_allocation_size = URB_VS_ENTRY_SIZE - 1;
    vs_state->vs6.vs_enable = 0;
+   vs_state->vs6.vert_cache_disable = 1;
 
    // XXX: sf_kernel? keep it as now
    /* Set up the SF kernel to do coord interp: for each attribute,
@@ -754,7 +751,7 @@ ErrorF("i965 prepareComposite\n");
    memset(sf_state, 0, sizeof(*sf_state));
    sf_state->thread0.kernel_start_pointer = 
 	       (state_base_offset + sf_kernel_offset) >> 6;
-   sf_state->thread0.grf_reg_count = ((SF_KERNEL_NUM_GRF & ~15) / 16);
+   sf_state->thread0.grf_reg_count = BRW_GRF_BLOCKS(SF_KERNEL_NUM_GRF);
    sf_state->sf1.single_program_flow = 1;
    sf_state->sf1.binding_table_entry_count = 0;
    sf_state->sf1.thread_priority = 0;
@@ -795,7 +792,7 @@ ErrorF("i965 prepareComposite\n");
    memset (wm_state, 0, sizeof (*wm_state));
    wm_state->thread0.kernel_start_pointer = 
 	    (state_base_offset + ps_kernel_offset) >> 6;
-   wm_state->thread0.grf_reg_count = ((PS_KERNEL_NUM_GRF & ~15) / 16);
+   wm_state->thread0.grf_reg_count = BRW_GRF_BLOCKS(PS_KERNEL_NUM_GRF);
    wm_state->thread1.single_program_flow = 1;
    if (!pMask)
        wm_state->thread1.binding_table_entry_count = 2; /* 1 tex and fb */
@@ -808,7 +805,10 @@ ErrorF("i965 prepareComposite\n");
    // XXX: urb allocation
    wm_state->thread3.const_urb_entry_read_length = 0;
    wm_state->thread3.const_urb_entry_read_offset = 0;
-   wm_state->thread3.urb_entry_read_length = 1;  /* one per pair of attrib */
+   if (pMask)
+       wm_state->thread3.urb_entry_read_length = 2;  /* two per pair of attrib */
+   else 
+       wm_state->thread3.urb_entry_read_length = 1;  /* one per pair of attrib */
    wm_state->thread3.urb_entry_read_offset = 0;
    // wm kernel use urb from 3, see wm_program in compiler module
    wm_state->thread3.dispatch_grf_start_reg = 3; /* must match kernel */
diff-tree b7c1e1656f45e43ea2f9a47f1a487050c0884c22 (from db9cfaa35adaf79ea57bc06b27c7e37935d3e1a7)
Author: Wang Zhenyu <zhenyu.z.wang at intel.com>
Date:   Wed Nov 29 16:40:15 2006 +0800

    [PATCH] WM kernel needs scratch space
    
    Signed-off-by: Keith Packard <keithp at neko.keithp.com>

diff --git a/src/i965_exa_render.c b/src/i965_exa_render.c
index 51c2006..51b2c60 100644
--- a/src/i965_exa_render.c
+++ b/src/i965_exa_render.c
@@ -297,6 +297,7 @@ int dest_surf_offset, src_surf_offset, m
 int src_sampler_offset, mask_sampler_offset,vs_offset;
 int sf_offset, wm_offset, cc_offset, vb_offset, cc_viewport_offset;
 int sf_kernel_offset, ps_kernel_offset, sip_kernel_offset;
+int wm_scratch_offset;
 int binding_table_offset;
 int default_color_offset; 
 int next_offset, total_state_size;
@@ -426,6 +427,9 @@ ErrorF("i965 prepareComposite\n");
    wm_offset = ALIGN(next_offset, 32);
    next_offset = wm_offset + sizeof(*wm_state);
     
+   wm_scratch_offset = ALIGN(next_offset, 1024);
+   next_offset = wm_scratch_offset + 1024 * PS_MAX_THREADS;
+
    cc_offset = ALIGN(next_offset, 32);
    next_offset = cc_offset + sizeof(*cc_state);
 
@@ -798,7 +802,8 @@ ErrorF("i965 prepareComposite\n");
    else
        wm_state->thread1.binding_table_entry_count = 3; /* 2 tex and fb */
 
-   wm_state->thread2.scratch_space_base_pointer = 0;
+   wm_state->thread2.scratch_space_base_pointer = (state_base_offset + 
+						   wm_scratch_offset)>>10;
    wm_state->thread2.per_thread_scratch_space = 0;
    // XXX: urb allocation
    wm_state->thread3.const_urb_entry_read_length = 0;
diff-tree db9cfaa35adaf79ea57bc06b27c7e37935d3e1a7 (from 70276e4e9a8a5026ec436d2be5bf5eab868aa178)
Author: Wang Zhenyu <zhenyu.z.wang at intel.com>
Date:   Wed Nov 29 16:37:06 2006 +0800

    [PATCH] Setup default border color for our samplers
    
    Signed-off-by: Keith Packard <keithp at neko.keithp.com>

diff --git a/src/i965_exa_render.c b/src/i965_exa_render.c
index 94eabfb..51c2006 100644
--- a/src/i965_exa_render.c
+++ b/src/i965_exa_render.c
@@ -278,6 +278,7 @@ struct brw_surface_state *src_surf_state
 struct brw_surface_state *mask_surf_state;
 struct brw_sampler_state *src_sampler_state;
 struct brw_sampler_state *mask_sampler_state;  
+struct brw_sampler_default_color *default_color_state;
 
 struct brw_vs_unit_state *vs_state;
 struct brw_sf_unit_state *sf_state;
@@ -297,6 +298,7 @@ int src_sampler_offset, mask_sampler_off
 int sf_offset, wm_offset, cc_offset, vb_offset, cc_viewport_offset;
 int sf_kernel_offset, ps_kernel_offset, sip_kernel_offset;
 int binding_table_offset;
+int default_color_offset; 
 int next_offset, total_state_size;
 char *state_base;
 int state_base_offset;
@@ -478,6 +480,9 @@ ErrorF("i965 prepareComposite\n");
    binding_table_offset = ALIGN(next_offset, 32);
    next_offset = binding_table_offset + (binding_table_entries * 4);
 
+   default_color_offset = ALIGN(next_offset, 32);
+   next_offset = default_color_offset + sizeof(*default_color_state);
+
    total_state_size = next_offset;
    assert(total_state_size < EXA_LINEAR_EXTRA);
 
@@ -508,6 +513,8 @@ ErrorF("i965 prepareComposite\n");
 
    vb = (void *)(state_base + vb_offset);
 
+   default_color_state = (void*)(state_base + default_color_offset);
+
    /* Set up a default static partitioning of the URB, which is supposed to
     * allow anything we would want to do, at potentially lower performance.
     */
@@ -541,7 +548,6 @@ ErrorF("i965 prepareComposite\n");
     * here, but we should have synced the 3D engine already in I830PutImage.
     */
 
-// needed?
    memset (cc_viewport, 0, sizeof (*cc_viewport));
    cc_viewport->min_depth = -1.e35;
    cc_viewport->max_depth = 1.e35;
@@ -678,18 +684,25 @@ ErrorF("i965 prepareComposite\n");
 	I830FALLBACK("Bad filter 0x%x\n", pSrcPicture->filter);
    }
 
+   memset(default_color_state, 0, sizeof(*default_color_state));
+   default_color_state->color[0] = 1.0; /* RGBA format */
+   default_color_state->color[1] = 0.0; 
+   default_color_state->color[2] = 0.0; 
+   default_color_state->color[3] = 0.0; 
+
+   src_sampler_state->ss0.default_color_mode = 0; /* GL mode */
+
    if (!pSrcPicture->repeat) {
-	/* XXX: clamp_border and set border to 0 */
-   	src_sampler_state->ss1.r_wrap_mode = BRW_TEXCOORDMODE_CLAMP; 
-   	src_sampler_state->ss1.s_wrap_mode = BRW_TEXCOORDMODE_CLAMP;
-   	src_sampler_state->ss1.t_wrap_mode = BRW_TEXCOORDMODE_CLAMP;
+   	src_sampler_state->ss1.r_wrap_mode = BRW_TEXCOORDMODE_CLAMP_BORDER; 
+   	src_sampler_state->ss1.s_wrap_mode = BRW_TEXCOORDMODE_CLAMP_BORDER;
+   	src_sampler_state->ss1.t_wrap_mode = BRW_TEXCOORDMODE_CLAMP_BORDER;
+	src_sampler_state->ss2.default_color_pointer = 
+			(state_base_offset + default_color_offset) >> 5;
    } else {
    	src_sampler_state->ss1.r_wrap_mode = BRW_TEXCOORDMODE_WRAP; 
    	src_sampler_state->ss1.s_wrap_mode = BRW_TEXCOORDMODE_WRAP;
    	src_sampler_state->ss1.t_wrap_mode = BRW_TEXCOORDMODE_WRAP;
    }
-   /* XXX: ss2 has border color pointer, which should be in general state address,
-    	   and just a single texel tex map, with R32G32B32A32_FLOAT */
    src_sampler_state->ss3.chroma_key_enable = 0; /* disable chromakey */
 
    if (pMask) {
@@ -709,17 +722,16 @@ ErrorF("i965 prepareComposite\n");
    	}
 
    	if (!pMaskPicture->repeat) {
-	/* XXX: clamp_border and set border to 0 */
-   	    mask_sampler_state->ss1.r_wrap_mode = BRW_TEXCOORDMODE_CLAMP; 
-   	    mask_sampler_state->ss1.s_wrap_mode = BRW_TEXCOORDMODE_CLAMP;
-   	    mask_sampler_state->ss1.t_wrap_mode = BRW_TEXCOORDMODE_CLAMP;
+   	    mask_sampler_state->ss1.r_wrap_mode = BRW_TEXCOORDMODE_CLAMP_BORDER; 
+   	    mask_sampler_state->ss1.s_wrap_mode = BRW_TEXCOORDMODE_CLAMP_BORDER;
+   	    mask_sampler_state->ss1.t_wrap_mode = BRW_TEXCOORDMODE_CLAMP_BORDER;
+            mask_sampler_state->ss2.default_color_pointer = 
+				(state_base_offset + default_color_offset)>>5;
    	} else {
    	    mask_sampler_state->ss1.r_wrap_mode = BRW_TEXCOORDMODE_WRAP; 
    	    mask_sampler_state->ss1.s_wrap_mode = BRW_TEXCOORDMODE_WRAP;
    	    mask_sampler_state->ss1.t_wrap_mode = BRW_TEXCOORDMODE_WRAP;
     	}
-   /* XXX: ss2 has border color pointer, which should be in general state address,
-    	   and just a single texel tex map, with R32G32B32A32_FLOAT */
    	mask_sampler_state->ss3.chroma_key_enable = 0; /* disable chromakey */
    }
 
diff-tree 70276e4e9a8a5026ec436d2be5bf5eab868aa178 (from e8a4cbdeff4125e28d807d0a563efc0606d21a75)
Author: Wang Zhenyu <zhenyu.z.wang at intel.com>
Date:   Wed Nov 29 16:30:53 2006 +0800

    [PATCH] fix vertex buffer size
    
    Signed-off-by: Keith Packard <keithp at neko.keithp.com>

diff --git a/src/i965_exa_render.c b/src/i965_exa_render.c
index 9127d65..94eabfb 100644
--- a/src/i965_exa_render.c
+++ b/src/i965_exa_render.c
@@ -233,16 +233,12 @@ Bool
 I965EXACheckComposite(int op, PicturePtr pSrcPicture, PicturePtr pMaskPicture,
 		      PicturePtr pDstPicture)
 {
-	/* check op*/
-	/* check op with mask's componentAlpha*/
-	/* check textures */
-	/* check dst buffer format */
     CARD32 tmp1;
     
     /* Check for unsupported compositing operations. */
     if (op >= sizeof(I965BlendOp) / sizeof(I965BlendOp[0]))
         I830FALLBACK("Unsupported Composite op 0x%x\n", op);
-                                                                                                                                                            
+
     if (pMaskPicture != NULL && pMaskPicture->componentAlpha) {
         /* Check if it's component alpha that relies on a source alpha and on
          * the source value.  We can only get one of those into the single
@@ -305,7 +301,7 @@ int next_offset, total_state_size;
 char *state_base;
 int state_base_offset;
 float *vb;
-int vb_size = 4 * 4 ; /* 4 DWORDS per vertex, 4 vertices for TRIFAN*/ 
+int vb_size = (4 * 4) * 4 ; /* 4 DWORDS per vertex*/ 
 
 CARD32 src_blend, dst_blend;
 
diff-tree e8a4cbdeff4125e28d807d0a563efc0606d21a75 (from 42534474fd2556e5987205626cca8f30e25855a8)
Author: Wang Zhenyu <zhenyu.z.wang at intel.com>
Date:   Wed Nov 29 16:24:24 2006 +0800

    [PATCH] clean up issue cmd to ring buffer
    
    Make it easy to track different part of ring state, and
    use rectlist primitive instead.
    
    Signed-off-by: Keith Packard <keithp at neko.keithp.com>

diff --git a/src/i965_exa_render.c b/src/i965_exa_render.c
index 2c3e43b..9127d65 100644
--- a/src/i965_exa_render.c
+++ b/src/i965_exa_render.c
@@ -376,11 +376,6 @@ I965EXAPrepareComposite(int op, PictureP
  
 ErrorF("i965 prepareComposite\n");
 
-//    i965_3d_pipeline_setup(pScrn);
-//    i965_surf_setup(pScrn, pSrcPicture, pMaskPicture, pDstPicture,
-//   			pSrc, pMask, pDst);
-    // then setup blend, and shader program 
-    
     /* FIXME: fallback in pMask for now, would be enable after finish
 	wm kernel program */
     if (pMask)
@@ -819,62 +814,65 @@ ErrorF("i965 prepareComposite\n");
     * rendering pipe
     */
    {
-   
-   BEGIN_LP_RING((pMask?48:46));
-   // MI_FLUSH prior to PIPELINE_SELECT
-   OUT_RING(MI_FLUSH | 
+	BEGIN_LP_RING(2);
+   	OUT_RING(MI_FLUSH | 
 	    MI_STATE_INSTRUCTION_CACHE_FLUSH |
 	    BRW_MI_GLOBAL_SNAPSHOT_RESET);
+	OUT_RING(MI_NOOP);
+	ADVANCE_LP_RING();
+   }
+   {
+        BEGIN_LP_RING(12);
    
-   /* Match Mesa driver setup */
-   OUT_RING(BRW_PIPELINE_SELECT | PIPELINE_SELECT_3D);
+        /* Match Mesa driver setup */
+        OUT_RING(BRW_PIPELINE_SELECT | PIPELINE_SELECT_3D);
    
+   	OUT_RING(BRW_CS_URB_STATE | 0);
+   	OUT_RING((0 << 4) |  /* URB Entry Allocation Size */
+            (0 << 0));  /* Number of URB Entries */
+
    /* Zero out the two base address registers so all offsets are absolute */
-   // XXX: zero out...
-   OUT_RING(BRW_STATE_BASE_ADDRESS | 4);
-   // why this's not state_base_offset? -> because later we'll always add on
-   // state_base_offset to offset params. see SIP
-   OUT_RING(0 | BASE_ADDRESS_MODIFY);  /* Generate state base address */
-   OUT_RING(0 | BASE_ADDRESS_MODIFY);  /* Surface state base address */
-   OUT_RING(0 | BASE_ADDRESS_MODIFY);  /* media base addr, don't care */
-   OUT_RING(0x10000000 | BASE_ADDRESS_MODIFY);  /* general state max addr, disabled */
-   OUT_RING(0x10000000 | BASE_ADDRESS_MODIFY);  /* media object state max addr, disabled */
+   	OUT_RING(BRW_STATE_BASE_ADDRESS | 4);
+   	OUT_RING(0 | BASE_ADDRESS_MODIFY);  /* Generate state base address */
+   	OUT_RING(0 | BASE_ADDRESS_MODIFY);  /* Surface state base address */
+   	OUT_RING(0 | BASE_ADDRESS_MODIFY);  /* media base addr, don't care */
+   	OUT_RING(0x10000000 | BASE_ADDRESS_MODIFY);  /* general state max addr, disabled */
+   	OUT_RING(0x10000000 | BASE_ADDRESS_MODIFY);  /* media object state max addr, disabled */
 
    /* Set system instruction pointer */
-   OUT_RING(BRW_STATE_SIP | 0);
-   OUT_RING(state_base_offset + sip_kernel_offset); /* system instruction pointer */
-      
+   	OUT_RING(BRW_STATE_SIP | 0);
+   	OUT_RING(state_base_offset + sip_kernel_offset); /* system instruction pointer */
+	OUT_RING(MI_NOOP);
+	ADVANCE_LP_RING();
+   }
+   {
+	BEGIN_LP_RING(26);
    /* Pipe control */
-   // XXX: pipe control write cache before enabling color blending
-   // vol2, geometry pipeline 1.8.4
-   OUT_RING(BRW_PIPE_CONTROL |
+   	OUT_RING(BRW_PIPE_CONTROL |
 	    BRW_PIPE_CONTROL_NOWRITE |
 	    BRW_PIPE_CONTROL_IS_FLUSH |
 	    2);
-   OUT_RING(0);			       /* Destination address */
-   OUT_RING(0);			       /* Immediate data low DW */
-   OUT_RING(0);			       /* Immediate data high DW */
+   	OUT_RING(0);			       /* Destination address */
+   	OUT_RING(0);			       /* Immediate data low DW */
+   	OUT_RING(0);			       /* Immediate data high DW */
 
    /* Binding table pointers */
-   OUT_RING(BRW_3DSTATE_BINDING_TABLE_POINTERS | 4);
-   OUT_RING(0); /* vs */
-   OUT_RING(0); /* gs */
-   OUT_RING(0); /* clip */
-   OUT_RING(0); /* sf */
+   	OUT_RING(BRW_3DSTATE_BINDING_TABLE_POINTERS | 4);
+   	OUT_RING(0); /* vs */
+   	OUT_RING(0); /* gs */
+   	OUT_RING(0); /* clip */
+   	OUT_RING(0); /* sf */
    /* Only the PS uses the binding table */
-   OUT_RING(state_base_offset + binding_table_offset); /* ps */
-
-   //ring 20
+   	OUT_RING(state_base_offset + binding_table_offset); /* ps */
 
    /* The drawing rectangle clipping is always on.  Set it to values that
     * shouldn't do any clipping.
     */
-    //XXX: fix for picture size
-   OUT_RING(BRW_3DSTATE_DRAWING_RECTANGLE | 2);	/* XXX 3 for BLC or CTG */
-   OUT_RING(0x00000000);	/* ymin, xmin */
-   OUT_RING((pScrn->virtualX - 1) |
-	    (pScrn->virtualY - 1) << 16); /* ymax, xmax */
-   OUT_RING(0x00000000);	/* yorigin, xorigin */
+   	OUT_RING(BRW_3DSTATE_DRAWING_RECTANGLE | 2);	/* XXX 3 for BLC or CTG */
+   	OUT_RING(0x00000000);	/* ymin, xmin */
+   	OUT_RING((pScrn->virtualX - 1) |
+ 	         (pScrn->virtualY - 1) << 16); /* ymax, xmax */
+   	OUT_RING(0x00000000);	/* yorigin, xorigin */
 
    /* skip the depth buffer */
    /* skip the polygon stipple */
@@ -882,90 +880,82 @@ ErrorF("i965 prepareComposite\n");
    /* skip the line stipple */
    
    /* Set the pointers to the 3d pipeline state */
-   OUT_RING(BRW_3DSTATE_PIPELINED_POINTERS | 5);
-   OUT_RING(state_base_offset + vs_offset);  /* 32 byte aligned */
-   OUT_RING(BRW_GS_DISABLE);		     /* disable GS, resulting in passthrough */
-   OUT_RING(BRW_CLIP_DISABLE);		     /* disable CLIP, resulting in passthrough */
-   OUT_RING(state_base_offset + sf_offset);  /* 32 byte aligned */
-   OUT_RING(state_base_offset + wm_offset);  /* 32 byte aligned */
-   OUT_RING(state_base_offset + cc_offset);  /* 64 byte aligned */
+   	OUT_RING(BRW_3DSTATE_PIPELINED_POINTERS | 5);
+   	OUT_RING(state_base_offset + vs_offset);  /* 32 byte aligned */
+   	OUT_RING(BRW_GS_DISABLE);		     /* disable GS, resulting in passthrough */
+   	OUT_RING(BRW_CLIP_DISABLE);		     /* disable CLIP, resulting in passthrough */
+   	OUT_RING(state_base_offset + sf_offset);  /* 32 byte aligned */
+   	OUT_RING(state_base_offset + wm_offset);  /* 32 byte aligned */
+   	OUT_RING(state_base_offset + cc_offset);  /* 64 byte aligned */
 
    /* URB fence */
-   // XXX: CS for const URB needed? if not, cs_fence should be equal to sf_fence
-   OUT_RING(BRW_URB_FENCE |
-	    UF0_CS_REALLOC |
-	    UF0_SF_REALLOC |
-	    UF0_CLIP_REALLOC |
-	    UF0_GS_REALLOC |
-	    UF0_VS_REALLOC |
-	    1);
-   OUT_RING(((urb_clip_start + urb_clip_size) << UF1_CLIP_FENCE_SHIFT) |
-	    ((urb_gs_start + urb_gs_size) << UF1_GS_FENCE_SHIFT) |
-	    ((urb_vs_start + urb_vs_size) << UF1_VS_FENCE_SHIFT));
-   OUT_RING(((urb_cs_start + urb_cs_size) << UF2_CS_FENCE_SHIFT) |
-	    ((urb_sf_start + urb_sf_size) << UF2_SF_FENCE_SHIFT));
+   	OUT_RING(BRW_URB_FENCE |
+        	 UF0_CS_REALLOC |
+	    	 UF0_SF_REALLOC |
+	    	 UF0_CLIP_REALLOC |
+	         UF0_GS_REALLOC |
+	         UF0_VS_REALLOC |
+	    	 1);
+   	OUT_RING(((urb_clip_start + urb_clip_size) << UF1_CLIP_FENCE_SHIFT) |
+	    	 ((urb_gs_start + urb_gs_size) << UF1_GS_FENCE_SHIFT) |
+	    	 ((urb_vs_start + urb_vs_size) << UF1_VS_FENCE_SHIFT));
+   	OUT_RING(((urb_cs_start + urb_cs_size) << UF2_CS_FENCE_SHIFT) |
+	     	 ((urb_sf_start + urb_sf_size) << UF2_SF_FENCE_SHIFT));
 
    /* Constant buffer state */
-   // XXX: needed? seems no usage, as we don't have CONSTANT_BUFFER definition
-   OUT_RING(BRW_CS_URB_STATE | 0);
-   OUT_RING(((URB_CS_ENTRY_SIZE - 1) << 4) | /* URB Entry Allocation Size */
-	    (URB_CS_ENTRIES << 0));	     /* Number of URB Entries */
-   
+   	OUT_RING(BRW_CS_URB_STATE | 0);
+   	OUT_RING(((URB_CS_ENTRY_SIZE - 1) << 4) | /* URB Entry Allocation Size */
+	    	 (URB_CS_ENTRIES << 0));	     /* Number of URB Entries */
+	ADVANCE_LP_RING();
+   }
+   {
+        int nelem = pMask ? 3: 2;
+   	BEGIN_LP_RING(pMask?12:10);
    /* Set up the pointer to our vertex buffer */
-   // XXX: double check
-  // int vb_pitch = 4 * 4;  // XXX: pitch should include mask's coords? possible
-  // all three coords on one row?
-   int nelem = pMask ? 3: 2;
-   OUT_RING(BRW_3DSTATE_VERTEX_BUFFERS | 3); //XXX: should be 4n-1 -> 3
-   OUT_RING((0 << VB0_BUFFER_INDEX_SHIFT) |
-	    VB0_VERTEXDATA |
-	    ((4 * 2 * nelem) << VB0_BUFFER_PITCH_SHIFT)); 
-   		// pitch includes all vertex data, 4bytes for 1 dword, each
-		// element has 2 coords (x,y)(s0,t0), nelem to reflect possible
-		// mask
-   OUT_RING(state_base_offset + vb_offset);
-   OUT_RING(4 * nelem); // max index, prim has 4 coords
-   OUT_RING(0); // ignore for VERTEXDATA, but still there
+   	OUT_RING(BRW_3DSTATE_VERTEX_BUFFERS | 3); 
+   	OUT_RING((0 << VB0_BUFFER_INDEX_SHIFT) |
+	    	 VB0_VERTEXDATA |
+	    	 ((4 * 2 * nelem) << VB0_BUFFER_PITCH_SHIFT)); 
+   	OUT_RING(state_base_offset + vb_offset);
+   	OUT_RING(2); // max index, prim has 4 coords
+   	OUT_RING(0); // ignore for VERTEXDATA, but still there
 
    /* Set up our vertex elements, sourced from the single vertex buffer. */
-   OUT_RING(BRW_3DSTATE_VERTEX_ELEMENTS | ((2 * nelem) - 1));  // XXX: 2n-1, (x,y) + (s0,t0) +
-						//   possible (s1, t1)
+   	OUT_RING(BRW_3DSTATE_VERTEX_ELEMENTS | ((2 * nelem) - 1));  
    /* offset 0: X,Y -> {X, Y, 1.0, 1.0} */
-   OUT_RING((0 << VE0_VERTEX_BUFFER_INDEX_SHIFT) |
-	    VE0_VALID |
-	    (BRW_SURFACEFORMAT_R32G32_FLOAT << VE0_FORMAT_SHIFT) |
-	    (0 << VE0_OFFSET_SHIFT));
-   OUT_RING((BRW_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT) |
-	    (BRW_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT) |
-	    (BRW_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT) |
-	    (BRW_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT) |
-	    (0 << VE1_DESTINATION_ELEMENT_OFFSET_SHIFT));
+   	OUT_RING((0 << VE0_VERTEX_BUFFER_INDEX_SHIFT) |
+	    	 VE0_VALID |
+	    	 (BRW_SURFACEFORMAT_R32G32_FLOAT << VE0_FORMAT_SHIFT) |
+	    	 (0 << VE0_OFFSET_SHIFT));
+   	OUT_RING((BRW_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT) |
+	    	 (BRW_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT) |
+	     	 (BRW_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT) |
+	    	 (BRW_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT) |
+	    	 (0 << VE1_DESTINATION_ELEMENT_OFFSET_SHIFT));
    /* offset 8: S0, T0 -> {S0, T0, 1.0, 1.0} */
-   OUT_RING((0 << VE0_VERTEX_BUFFER_INDEX_SHIFT) |
-	    VE0_VALID |
-	    (BRW_SURFACEFORMAT_R32G32_FLOAT << VE0_FORMAT_SHIFT) |
-	    (8 << VE0_OFFSET_SHIFT));
-   OUT_RING((BRW_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT) |
-	    (BRW_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT) |
-	    (BRW_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT) |
-	    (BRW_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT) |
-	    (4 << VE1_DESTINATION_ELEMENT_OFFSET_SHIFT));
-
-   if (pMask) {
    	OUT_RING((0 << VE0_VERTEX_BUFFER_INDEX_SHIFT) |
-	    VE0_VALID |
-	    (BRW_SURFACEFORMAT_R32G32_FLOAT << VE0_FORMAT_SHIFT) |
-	    (16 << VE0_OFFSET_SHIFT));
-	OUT_RING((BRW_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT) |
-	    (BRW_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT) |
-	    (BRW_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT) |
-	    (BRW_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT) |
-	    (8 << VE1_DESTINATION_ELEMENT_OFFSET_SHIFT)); 
-		//XXX: is this has alignment issue? and thread access problem?
-   }
+	    	 VE0_VALID |
+	    	 (BRW_SURFACEFORMAT_R32G32_FLOAT << VE0_FORMAT_SHIFT) |
+	    	 (8 << VE0_OFFSET_SHIFT));
+   	OUT_RING((BRW_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT) |
+	    	 (BRW_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT) |
+	    	 (BRW_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT) |
+	     	 (BRW_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT) |
+	    	 (4 << VE1_DESTINATION_ELEMENT_OFFSET_SHIFT));
+
+   	if (pMask) {
+   		OUT_RING((0 << VE0_VERTEX_BUFFER_INDEX_SHIFT) |
+	    		 VE0_VALID |
+	    		 (BRW_SURFACEFORMAT_R32G32_FLOAT << VE0_FORMAT_SHIFT) |
+	    		 (16 << VE0_OFFSET_SHIFT));
+		OUT_RING((BRW_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT) |
+	    		 (BRW_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT) |
+	    		 (BRW_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT) |
+	    		 (BRW_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT) |
+	    		 (8 << VE1_DESTINATION_ELEMENT_OFFSET_SHIFT)); 
+   	}
    
-   ADVANCE_LP_RING();
-    
+   	ADVANCE_LP_RING();
    }
 
 #ifdef I830DEBUG
@@ -983,7 +973,7 @@ I965EXAComposite(PixmapPtr pDst, int src
     I830Ptr pI830 = I830PTR(pScrn);
     int srcXend, srcYend, maskXend, maskYend;
     PictVector v;
-    int pMask = 1, i = 0;
+    int pMask = 1, i;
 
     DPRINTF(PFX, "Composite: srcX %d, srcY %d\n\t maskX %d, maskY %d\n\t"
 	    "dstX %d, dstY %d\n\twidth %d, height %d\n\t"
@@ -999,8 +989,10 @@ I965EXAComposite(PixmapPtr pDst, int src
 
     srcXend = srcX + w;
     srcYend = srcY + h;
-    maskXend = maskX + w;
-    maskYend = maskY + h;
+    if (pMask) {
+        maskXend = maskX + w;
+        maskYend = maskY + h;
+    }
     if (is_transform[0]) {
         v.vector[0] = IntToxFixed(srcX);
         v.vector[1] = IntToxFixed(srcY);
@@ -1035,51 +1027,45 @@ I965EXAComposite(PixmapPtr pDst, int src
 		"dstX %d, dstY %d\n", srcX, srcY, srcXend, srcYend,
 		maskX, maskY, maskXend, maskYend, dstX, dstY);
 
- 
-    vb[i++] = (float)dstX;
-    vb[i++] = (float)dstY;
-    vb[i++] = (float)srcX / scale_units[0][0];
-    vb[i++] = (float)srcY / scale_units[0][1];
-    if (pMask) {
-        vb[i++] = (float)maskX / scale_units[1][0];
-        vb[i++] = (float)maskY / scale_units[1][1];
-    }
-
-    vb[i++] = (float)dstX;
-    vb[i++] = (float)(dstY + h);
-    vb[i++] = (float)srcX / scale_units[0][0];
-    vb[i++] = (float)srcYend / scale_units[0][1];
+    i = 0;
+    /* rect (x2,y2) */
+    vb[i++] = (float)(srcXend) / scale_units[0][0];
+    vb[i++] = (float)(srcYend) / scale_units[0][1];
     if (pMask) {
-        vb[i++] = (float)maskX / scale_units[1][0];
+        vb[i++] = (float)maskXend / scale_units[1][0];
         vb[i++] = (float)maskYend / scale_units[1][1];
     }
-
     vb[i++] = (float)(dstX + w);
     vb[i++] = (float)(dstY + h);
-    vb[i++] = (float)srcXend / scale_units[0][0];
-    vb[i++] = (float)srcYend / scale_units[0][1];
+
+    /* rect (x1,y2) */
+    vb[i++] = (float)(srcX)/ scale_units[0][0];
+    vb[i++] = (float)(srcYend)/ scale_units[0][1];
     if (pMask) {
-        vb[i++] = (float)maskXend / scale_units[1][0];
+        vb[i++] = (float)maskX / scale_units[1][0];
         vb[i++] = (float)maskYend / scale_units[1][1];
     }
+    vb[i++] = (float)dstX;
+    vb[i++] = (float)(dstY + h);
 
-    vb[i++] = (float)(dstX + w);
-    vb[i++] = (float)dstY;
-    vb[i++] = (float)srcXend / scale_units[0][0];
-    vb[i++] = (float)srcY / scale_units[0][1];
+    /* rect (x1,y1) */
+    vb[i++] = (float)(srcX) / scale_units[0][0];
+    vb[i++] = (float)(srcY) / scale_units[0][1];
     if (pMask) {
-        vb[i++] = (float)maskXend / scale_units[1][0];
+        vb[i++] = (float)maskX / scale_units[1][0];
         vb[i++] = (float)maskY / scale_units[1][1];
     }
-
+    vb[i++] = (float)dstX;
+    vb[i++] = (float)dstY;
+   
     {
       BEGIN_LP_RING(6);
       OUT_RING(BRW_3DPRIMITIVE | 
 	       BRW_3DPRIMITIVE_VERTEX_SEQUENTIAL |
-	       (_3DPRIM_TRIFAN << BRW_3DPRIMITIVE_TOPOLOGY_SHIFT) | 
+	       (_3DPRIM_RECTLIST << BRW_3DPRIMITIVE_TOPOLOGY_SHIFT) | 
 	       (0 << 9) |  /* CTG - indirect vertex count */
 	       4);
-      OUT_RING(4);  /* vertex count per instance */
+      OUT_RING(3);  /* vertex count per instance */
       OUT_RING(0); /* start vertex offset */
       OUT_RING(1); /* single instance */
       OUT_RING(0); /* start instance location */
@@ -1090,4 +1076,19 @@ I965EXAComposite(PixmapPtr pDst, int src
     ErrorF("sync after 3dprimitive");
     I830Sync(pScrn);
 #endif
+    /* we must be sure that the pipeline is flushed before next exa draw,
+       because that will be new state, binding state and instructions*/
+    {
+	BEGIN_LP_RING(4);
+   	OUT_RING(BRW_PIPE_CONTROL |
+	    BRW_PIPE_CONTROL_NOWRITE |
+	    BRW_PIPE_CONTROL_WC_FLUSH |
+	    BRW_PIPE_CONTROL_IS_FLUSH |
+	    (1 << 10) |  /* XXX texture cache flush for BLC/CTG */
+	    2);
+   	OUT_RING(0); /* Destination address */
+   	OUT_RING(0); /* Immediate data low DW */
+   	OUT_RING(0); /* Immediate data high DW */
+	ADVANCE_LP_RING();
+    }
 }
diff-tree 42534474fd2556e5987205626cca8f30e25855a8 (from 05e8a7989db1b8b9dfa7b688629af65d9910cc53)
Author: Wang Zhenyu <zhenyu.z.wang at intel.com>
Date:   Wed Nov 29 16:04:16 2006 +0800

    [PATCH] remove wrong scale_units
    
    Signed-off-by: Keith Packard <keithp at neko.keithp.com>

diff --git a/src/i965_exa_render.c b/src/i965_exa_render.c
index 2751778..2c3e43b 100644
--- a/src/i965_exa_render.c
+++ b/src/i965_exa_render.c
@@ -397,8 +397,6 @@ ErrorF("i965 prepareComposite\n");
     }
     scale_units[0][0] = pSrc->drawable.width;
     scale_units[0][1] = pSrc->drawable.height;
-    scale_units[2][0] = pDst->drawable.width;
-    scale_units[2][1] = pDst->drawable.height;
 
     if (pSrcPicture->transform) {
 	is_transform[0] = TRUE;
diff-tree 05e8a7989db1b8b9dfa7b688629af65d9910cc53 (from 1329c86f2a4b3664920ded970a984a745a530da4)
Author: Wang Zhenyu <zhenyu.z.wang at intel.com>
Date:   Wed Nov 29 16:02:16 2006 +0800

    [PATCH] Rename exa sf/wm program files
    
    Also fix some minors in wm program.
    
    Signed-off-by: Keith Packard <keithp at neko.keithp.com>

diff --git a/src/Makefile.am b/src/Makefile.am
index 54e5657..a9f427d 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -102,8 +102,8 @@ sf_prog.h: packed_yuv_sf.g4a
 	intel-gen4asm -o sf_prog.h packed_yuv_sf.g4a
 wm_prog.h: packed_yuv_wm.g4a
 	intel-gen4asm -o wm_prog.h packed_yuv_wm.g4a
-i965_composite_wm_nomask.h: i965_composite_wm_nomask.g4a
-	intel-gen4asm -o i965_composite_wm_nomask.h i965_composite_wm_nomask.g4a
+exa_wm_nomask_prog.h: exa_wm_nomask.g4a
+	intel-gen4asm -o exa_wm_nomask_prog.h exa_wm_nomask.g4a
 endif
 
 if DRI
diff --git a/src/exa_sf.g4a b/src/exa_sf.g4a
new file mode 100644
index 0000000..8c1398f
--- /dev/null
+++ b/src/exa_sf.g4a
@@ -0,0 +1,17 @@
+send (1) 0 g6<1>F g1.12<0,1,0>F math inv scalar mlen 1 rlen 1 { align1 };
+send (1) 0 g6.4<1>F g1.20<0,1,0>F math inv scalar mlen 1 rlen 1 { align1 };
+add (8) g7<1>F g4<8,8,1>F -g3<8,8,1>F { align1 };
+mul (1) g7<1>F g7<0,1,0>F g6<0,1,0>F { align1 };
+mul (1) g7.4<1>F g7.4<0,1,0>F g6.4<0,1,0>F { align1 };
+mov (8) m1<1>F g7<0,1,0>F { align1 };
+mov (8) m2<1>F g7.4<0,1,0>F { align1 };
+mov (8) m3<1>F g3<8,8,1>F { align1 };
+send (8) 0 null g0<8,8,1>F urb 0 transpose used complete mlen 4 rlen 0 { align1 EOT };
+nop;
+nop;
+nop;
+nop;
+nop;
+nop;
+nop;
+nop;
diff --git a/src/exa_sf_prog.h b/src/exa_sf_prog.h
new file mode 100644
index 0000000..830d176
--- /dev/null
+++ b/src/exa_sf_prog.h
@@ -0,0 +1,17 @@
+   { 0x00000031, 0x20c01fbd, 0x0000002c, 0x01110081 },
+   { 0x00000031, 0x20c41fbd, 0x00000034, 0x01110081 },
+   { 0x00600040, 0x20e077bd, 0x008d0080, 0x008d4060 },
+   { 0x00000041, 0x20e077bd, 0x000000e0, 0x000000c0 },
+   { 0x00000041, 0x20e477bd, 0x000000e4, 0x000000c4 },
+   { 0x00600001, 0x202003be, 0x000000e0, 0x00000000 },
+   { 0x00600001, 0x204003be, 0x000000e4, 0x00000000 },
+   { 0x00600001, 0x206003be, 0x008d0060, 0x00000000 },
+   { 0x00600031, 0x20001fbc, 0x008d0000, 0x8640c800 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
diff --git a/src/exa_wm_nomask.g4a b/src/exa_wm_nomask.g4a
new file mode 100644
index 0000000..8e851a3
--- /dev/null
+++ b/src/exa_wm_nomask.g4a
@@ -0,0 +1,143 @@
+/*
+ * This's for exa composite operation in no mask picture case.
+ * The simplest case is just sending what src picture has to dst picture.
+ */
+
+/* I think this should be same as in g4a program for texture video,
+   as we also use 16-pixel dispatch. and SF scale in g3 is useful for us. */
+
+/* The initial payload of the thread is always g0.
+ * WM_URB (incoming URB entries) is g3
+ * X0_R is g4
+ * X1_R is g5
+ * Y0_R is g6
+ * Y1_R is g7
+ */
+
+    /* Set up ss0.x coordinates*/
+mov (1) g4<1>F g1.8<0,1,0>UW { align1 };
+add (1) g4.4<1>F g1.8<0,1,0>UW 1UB { align1 };
+mov (1) g4.8<1>F g1.8<0,1,0>UW { align1 };
+add (1) g4.12<1>F g1.8<0,1,0>UW 1UB { align1 };
+    /* Set up ss0.y coordinates */
+mov (1) g6<1>F g1.10<0,1,0>UW { align1 };
+mov (1) g6.4<1>F g1.10<0,1,0>UW { align1 };
+add (1) g6.8<1>F g1.10<0,1,0>UW 1UB { align1 };
+add (1) g6.12<1>F g1.10<0,1,0>UW 1UB { align1 };
+    /* set up ss1.x coordinates */
+mov (1) g4.16<1>F g1.12<0,1,0>UW { align1 };
+add (1) g4.20<1>F g1.12<0,1,0>UW 1UB { align1 };
+mov (1) g4.24<1>F g1.12<0,1,0>UW { align1 };
+add (1) g4.28<1>F g1.12<0,1,0>UW 1UB { align1 };
+    /* set up ss1.y coordinates */
+mov (1) g6.16<1>F g1.14<0,1,0>UW { align1 };
+mov (1) g6.20<1>F g1.14<0,1,0>UW { align1 };
+add (1) g6.24<1>F g1.14<0,1,0>UW 1UB { align1 };
+add (1) g6.28<1>F g1.14<0,1,0>UW 1UB { align1 };
+    /* Set up ss2.x coordinates */
+mov (1) g5<1>F g1.16<0,1,0>UW { align1 };
+add (1) g5.4<1>F g1.16<0,1,0>UW 1UB { align1 };
+mov (1) g5.8<1>F g1.16<0,1,0>UW { align1 };
+add (1) g5.12<1>F g1.16<0,1,0>UW 1UB { align1 };
+    /* Set up ss2.y coordinates */
+mov (1) g7<1>F g1.18<0,1,0>UW { align1 };
+mov (1) g7.4<1>F g1.18<0,1,0>UW { align1 };
+add (1) g7.8<1>F g1.18<0,1,0>UW 1UB { align1 };
+add (1) g7.12<1>F g1.18<0,1,0>UW 1UB { align1 };
+    /* Set up ss3.x coordinates */
+mov (1) g5.16<1>F g1.20<0,1,0>UW { align1 };
+add (1) g5.20<1>F g1.20<0,1,0>UW 1UB { align1 };
+mov (1) g5.24<1>F g1.20<0,1,0>UW { align1 };
+add (1) g5.28<1>F g1.20<0,1,0>UW 1UB { align1 };
+    /* Set up ss3.y coordinates */
+mov (1) g7.16<1>F g1.22<0,1,0>UW { align1 };
+mov (1) g7.20<1>F g1.22<0,1,0>UW { align1 };
+add (1) g7.24<1>F g1.22<0,1,0>UW 1UB { align1 };
+add (1) g7.28<1>F g1.22<0,1,0>UW 1UB { align1 };
+
+    /* Now, map these screen space coordinates into texture coordinates. */
+    /* subtract screen-space X origin of vertex 0. */
+add (8) g4<1>F g4<8,8,1>F -g1<0,1,0>F { align1 };
+add (8) g5<1>F g5<8,8,1>F -g1<0,1,0>F { align1 };
+    /* scale by texture X increment */
+mul (8) g4<1>F g4<8,8,1>F g3<0,1,0>F { align1 };
+mul (8) g5<1>F g5<8,8,1>F g3<0,1,0>F { align1 };
+    /* add in texture X offset */
+add (8) g4<1>F g4<8,8,1>F g3.12<0,1,0>F { align1 };
+add (8) g5<1>F g5<8,8,1>F g3.12<0,1,0>F { align1 };
+    /* subtract screen-space Y origin of vertex 0. */
+add (8) g6<1>F g6<8,8,1>F -g1.4<0,1,0>F { align1 };
+add (8) g7<1>F g7<8,8,1>F -g1.4<0,1,0>F { align1 };
+    /* scale by texture Y increment */
+mul (8) g6<1>F g6<8,8,1>F g3.20<0,1,0>F { align1 };
+mul (8) g7<1>F g7<8,8,1>F g3.20<0,1,0>F { align1 };
+    /* add in texture Y offset */
+add (8) g6<1>F g6<8,8,1>F g3.28<0,1,0>F { align1 };
+add (8) g7<1>F g7<8,8,1>F g3.28<0,1,0>F { align1 };
+
+/* prepare sampler read back gX register, which would be written back to output */
+
+/* use simd16 sampler, param 0 is u, param 1 is v. */
+/* 'payload' loading, assuming tex coord start from g4 */
+mov (8) m1<1>F g4<8,8,1>F { align1 };
+mov (8) m2<1>F g5<8,8,1>F { align1 };  /* param 0 u in m1, m2 */
+mov (8) m3<1>F g6<8,8,1>F { align1 };
+mov (8) m4<1>F g7<8,8,1>F { align1 };  /* param 1 v in m3, m4 */
+
+/* m0 will be copied with g0, as it contains send desc */
+/* emit sampler 'send' cmd */
+send (16) 0 		/* msg reg index */
+	g12<1>UW 	/* readback */
+	g0<8,8,1>UW  	/* copy to msg start reg*/
+	sampler (1,0,F)  /* sampler message description, (binding_table,sampler_index,datatype)
+			 /* here(src->dst) we should use src_sampler and src_surface */
+	mlen 5 rlen 8 { align1 };   /* required message len 5, readback len 8 */
+
+mov (8) g19<1>UD g19<8,8,1>UD { align1 };  /* wait sampler return */
+/* if we set up read-back reg correctly, emit dataport write 'send' cmd with EOT */
+
+/* m0, m1 are all direct passed by PS thread payload */
+mov (8) m1<1>F g1<8,8,1>F { align1 };
+
+/* prepare data in m2-m5 for subspan(1,0), m6-m9 for subspan(3,2), then it's ready to write */
+/* g12 -> m2
+   g13 -> m6
+   g14 -> m3
+   g15 -> m7
+   g16 -> m4
+   g17 -> m8
+   g18 -> m5
+   g19 -> m9
+*/
+mov (8) m2<1>F g12<8,8,1>F { align1 };
+mov (8) m3<1>F g14<8,8,1>F { align1 };
+mov (8) m4<1>F g16<8,8,1>F { align1 };
+mov (8) m5<1>F g18<8,8,1>F { align1 };
+mov (8) m6<1>F g13<8,8,1>F { align1 };
+mov (8) m7<1>F g15<8,8,1>F { align1 };
+mov (8) m8<1>F g17<8,8,1>F { align1 };
+mov (8) m9<1>F g19<8,8,1>F { align1 };
+
+/* m0, m1 are all direct passed by PS thread payload */
+mov (8) m1<1>UD g1<8,8,1>UD { align1 mask_disable };
+
+/* write */
+send (16) 0 acc0<1>UW g0<8,8,1>UW write (
+	0,  /* binding_table */
+	8,  /* pixel scordboard clear, msg type simd16 single source */
+	4,  /* render target write */
+	0   /* no write commit message */
+	) 
+	mlen 10
+	rlen 0
+	{ align1 EOT };
+
+nop;
+nop;
+nop;
+nop;
+nop;
+nop;
+nop;
+nop;
+nop;
diff --git a/src/exa_wm_nomask_prog.h b/src/exa_wm_nomask_prog.h
new file mode 100644
index 0000000..7870b3b
--- /dev/null
+++ b/src/exa_wm_nomask_prog.h
@@ -0,0 +1,70 @@
+   { 0x00000001, 0x2080013d, 0x00000028, 0x00000000 },
+   { 0x00000040, 0x20840d3d, 0x00000028, 0x00000001 },
+   { 0x00000001, 0x2088013d, 0x00000028, 0x00000000 },
+   { 0x00000040, 0x208c0d3d, 0x00000028, 0x00000001 },
+   { 0x00000001, 0x20c0013d, 0x0000002a, 0x00000000 },
+   { 0x00000001, 0x20c4013d, 0x0000002a, 0x00000000 },
+   { 0x00000040, 0x20c80d3d, 0x0000002a, 0x00000001 },
+   { 0x00000040, 0x20cc0d3d, 0x0000002a, 0x00000001 },
+   { 0x00000001, 0x2090013d, 0x0000002c, 0x00000000 },
+   { 0x00000040, 0x20940d3d, 0x0000002c, 0x00000001 },
+   { 0x00000001, 0x2098013d, 0x0000002c, 0x00000000 },
+   { 0x00000040, 0x209c0d3d, 0x0000002c, 0x00000001 },
+   { 0x00000001, 0x20d0013d, 0x0000002e, 0x00000000 },
+   { 0x00000001, 0x20d4013d, 0x0000002e, 0x00000000 },
+   { 0x00000040, 0x20d80d3d, 0x0000002e, 0x00000001 },
+   { 0x00000040, 0x20dc0d3d, 0x0000002e, 0x00000001 },
+   { 0x00000001, 0x20a0013d, 0x00000030, 0x00000000 },
+   { 0x00000040, 0x20a40d3d, 0x00000030, 0x00000001 },
+   { 0x00000001, 0x20a8013d, 0x00000030, 0x00000000 },
+   { 0x00000040, 0x20ac0d3d, 0x00000030, 0x00000001 },
+   { 0x00000001, 0x20e0013d, 0x00000032, 0x00000000 },
+   { 0x00000001, 0x20e4013d, 0x00000032, 0x00000000 },
+   { 0x00000040, 0x20e80d3d, 0x00000032, 0x00000001 },
+   { 0x00000040, 0x20ec0d3d, 0x00000032, 0x00000001 },
+   { 0x00000001, 0x20b0013d, 0x00000034, 0x00000000 },
+   { 0x00000040, 0x20b40d3d, 0x00000034, 0x00000001 },
+   { 0x00000001, 0x20b8013d, 0x00000034, 0x00000000 },
+   { 0x00000040, 0x20bc0d3d, 0x00000034, 0x00000001 },
+   { 0x00000001, 0x20f0013d, 0x00000036, 0x00000000 },
+   { 0x00000001, 0x20f4013d, 0x00000036, 0x00000000 },
+   { 0x00000040, 0x20f80d3d, 0x00000036, 0x00000001 },
+   { 0x00000040, 0x20fc0d3d, 0x00000036, 0x00000001 },
+   { 0x00600040, 0x208077bd, 0x008d0080, 0x00004020 },
+   { 0x00600040, 0x20a077bd, 0x008d00a0, 0x00004020 },
+   { 0x00600041, 0x208077bd, 0x008d0080, 0x00000060 },
+   { 0x00600041, 0x20a077bd, 0x008d00a0, 0x00000060 },
+   { 0x00600040, 0x208077bd, 0x008d0080, 0x0000006c },
+   { 0x00600040, 0x20a077bd, 0x008d00a0, 0x0000006c },
+   { 0x00600040, 0x20c077bd, 0x008d00c0, 0x00004024 },
+   { 0x00600040, 0x20e077bd, 0x008d00e0, 0x00004024 },
+   { 0x00600041, 0x20c077bd, 0x008d00c0, 0x00000074 },
+   { 0x00600041, 0x20e077bd, 0x008d00e0, 0x00000074 },
+   { 0x00600040, 0x20c077bd, 0x008d00c0, 0x0000007c },
+   { 0x00600040, 0x20e077bd, 0x008d00e0, 0x0000007c },
+   { 0x00600001, 0x202003be, 0x008d0080, 0x00000000 },
+   { 0x00600001, 0x204003be, 0x008d00a0, 0x00000000 },
+   { 0x00600001, 0x206003be, 0x008d00c0, 0x00000000 },
+   { 0x00600001, 0x208003be, 0x008d00e0, 0x00000000 },
+   { 0x00800031, 0x21801d29, 0x008d0000, 0x02580001 },
+   { 0x00600001, 0x22600021, 0x008d0260, 0x00000000 },
+   { 0x00600001, 0x202003be, 0x008d0020, 0x00000000 },
+   { 0x00600001, 0x204003be, 0x008d0180, 0x00000000 },
+   { 0x00600001, 0x206003be, 0x008d01c0, 0x00000000 },
+   { 0x00600001, 0x208003be, 0x008d0200, 0x00000000 },
+   { 0x00600001, 0x20a003be, 0x008d0240, 0x00000000 },
+   { 0x00600001, 0x20c003be, 0x008d01a0, 0x00000000 },
+   { 0x00600001, 0x20e003be, 0x008d01e0, 0x00000000 },
+   { 0x00600001, 0x210003be, 0x008d0220, 0x00000000 },
+   { 0x00600001, 0x212003be, 0x008d0260, 0x00000000 },
+   { 0x00600201, 0x20200022, 0x008d0020, 0x00000000 },
+   { 0x00800031, 0x24001d28, 0x008d0000, 0x85a04800 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
diff --git a/src/i965_composite_wm_nomask.g4a b/src/i965_composite_wm_nomask.g4a
deleted file mode 100644
index 927d86a..0000000
--- a/src/i965_composite_wm_nomask.g4a
+++ /dev/null
@@ -1,141 +0,0 @@
-/*
- * This's for exa composite operation in no mask picture case.
- * The simplest case is just sending what src picture has to dst picture.
- * XXX: This is still experimental, and should be fixed to support multiple texture
- * map, and conditional mul actions. 
- */
-
-/* I think this should be same as in g4a program for texture video,
-   as we also use 16-pixel dispatch. and SF scale in g3 is useful for us. */
-
-/* The initial payload of the thread is always g0.
- * WM_URB (incoming URB entries) is g3
- * X0_R is g4
- * X1_R is g5
- * Y0_R is g6
- * Y1_R is g7
- */
-
-    /* Set up ss0.x coordinates*/
-mov (1) g4<1>F g1.8<0,1,0>UW { align1 };
-add (1) g4.4<1>F g1.8<0,1,0>UW 1UB { align1 };
-mov (1) g4.8<1>F g1.8<0,1,0>UW { align1 };
-add (1) g4.12<1>F g1.8<0,1,0>UW 1UB { align1 };
-    /* Set up ss0.y coordinates */
-mov (1) g6<1>F g1.10<0,1,0>UW { align1 };
-mov (1) g6.4<1>F g1.10<0,1,0>UW { align1 };
-add (1) g6.8<1>F g1.10<0,1,0>UW 1UB { align1 };
-add (1) g6.12<1>F g1.10<0,1,0>UW 1UB { align1 };
-    /* set up ss1.x coordinates */
-mov (1) g4.16<1>F g1.12<0,1,0>UW { align1 };
-add (1) g4.20<1>F g1.12<0,1,0>UW 1UB { align1 };
-mov (1) g4.24<1>F g1.12<0,1,0>UW { align1 };
-add (1) g4.28<1>F g1.12<0,1,0>UW 1UB { align1 };
-    /* set up ss1.y coordinates */
-mov (1) g6.16<1>F g1.14<0,1,0>UW { align1 };
-mov (1) g6.20<1>F g1.14<0,1,0>UW { align1 };
-add (1) g6.24<1>F g1.14<0,1,0>UW 1UB { align1 };
-add (1) g6.28<1>F g1.14<0,1,0>UW 1UB { align1 };
-    /* Set up ss2.x coordinates */
-mov (1) g5<1>F g1.16<0,1,0>UW { align1 };
-add (1) g5.4<1>F g1.16<0,1,0>UW 1UB { align1 };
-mov (1) g5.8<1>F g1.16<0,1,0>UW { align1 };
-add (1) g5.12<1>F g1.16<0,1,0>UW 1UB { align1 };
-    /* Set up ss2.y coordinates */
-mov (1) g7<1>F g1.18<0,1,0>UW { align1 };
-mov (1) g7.4<1>F g1.18<0,1,0>UW { align1 };
-add (1) g7.8<1>F g1.18<0,1,0>UW 1UB { align1 };
-add (1) g7.12<1>F g1.18<0,1,0>UW 1UB { align1 };
-    /* Set up ss3.x coordinates */
-mov (1) g5.16<1>F g1.20<0,1,0>UW { align1 };
-add (1) g5.20<1>F g1.20<0,1,0>UW 1UB { align1 };
-mov (1) g5.24<1>F g1.20<0,1,0>UW { align1 };
-add (1) g5.28<1>F g1.20<0,1,0>UW 1UB { align1 };
-    /* Set up ss3.y coordinates */
-mov (1) g7.16<1>F g1.22<0,1,0>UW { align1 };
-mov (1) g7.20<1>F g1.22<0,1,0>UW { align1 };
-add (1) g7.24<1>F g1.22<0,1,0>UW 1UB { align1 };
-add (1) g7.28<1>F g1.22<0,1,0>UW 1UB { align1 };
-
-    /* Now, map these screen space coordinates into texture coordinates. */
-    /* subtract screen-space X origin of vertex 0. */
-add (8) g4<1>F g4<8,8,1>F -g1<0,1,0>F { align1 };
-add (8) g5<1>F g5<8,8,1>F -g1<0,1,0>F { align1 };
-    /* scale by texture X increment */
-mul (8) g4<1>F g4<8,8,1>F g3<0,1,0>F { align1 };
-mul (8) g5<1>F g5<8,8,1>F g3<0,1,0>F { align1 };
-    /* add in texture X offset */
-add (8) g4<1>F g4<8,8,1>F g3.12<0,1,0>F { align1 };
-add (8) g5<1>F g5<8,8,1>F g3.12<0,1,0>F { align1 };
-    /* subtract screen-space Y origin of vertex 0. */
-add (8) g6<1>F g6<8,8,1>F -g1.4<0,1,0>F { align1 };
-add (8) g7<1>F g7<8,8,1>F -g1.4<0,1,0>F { align1 };
-    /* scale by texture Y increment */
-mul (8) g6<1>F g6<8,8,1>F g3.20<0,1,0>F { align1 };
-mul (8) g7<1>F g7<8,8,1>F g3.20<0,1,0>F { align1 };
-    /* add in texture Y offset */
-add (8) g6<1>F g6<8,8,1>F g3.28<0,1,0>F { align1 };
-add (8) g7<1>F g7<8,8,1>F g3.28<0,1,0>F { align1 };
-
-/* prepare sampler read back gX register, which would be written back to output */
-
-/* use simd16 sampler, param 0 is u, param 1 is v. */
-/* 'payload' loading, assuming tex coord start from g4 */
-mov (8) m1<1>F g4<8,8,1>F { align1 };
-mov (8) m2<1>F g5<8,8,1>F { align1 };  /* param 0 u in m1, m2 */
-mov (8) m3<1>F g6<8,8,1>F { align1 };
-mov (8) m4<1>F g7<8,8,1>F { align1 };  /* param 1 v in m3, m4 */
-
-/* m0 will be copied with g0, as it contains send desc */
-/* emit sampler 'send' cmd */
-send (16) 0 		/* msg reg index */
-	g12<1>UW 	/* readback */
-	g0<8,8,1>UW  	/* copy to msg start reg*/
-	sampler (1,0,F)  /* sampler message description, (binding_table,sampler_index,datatype)
-			 /* here(src->dst) we should use src_sampler and src_surface */
-	mlen 5 rlen 8 { align1 };   /* required message len 5, readback len 8 */
-
-/* if we set up read-back reg correctly, emit dataport write 'send' cmd with EOT */
-
-/* m0, m1 are all direct passed by PS thread payload */
-mov (8) m1<1>F g1<8,8,1>F { align1 };
-
-/* prepare data in m2-m5 for subspan(1,0), m6-m9 for subspan(3,2), then it's ready to write */
-/* g12 -> m2
-   g13 -> m6
-   g14 -> m3
-   g15 -> m7
-   g16 -> m4
-   g17 -> m8
-   g18 -> m5
-   g19 -> m9
-*/
-mov (8) m2<1>F g12<8,8,1>F { align1 };
-mov (8) m3<1>F g14<8,8,1>F { align1 };
-mov (8) m4<1>F g16<8,8,1>F { align1 };
-mov (8) m5<1>F g18<8,8,1>F { align1 };
-mov (8) m6<1>F g13<8,8,1>F { align1 };
-mov (8) m7<1>F g15<8,8,1>F { align1 };
-mov (8) m8<1>F g17<8,8,1>F { align1 };
-mov (8) m9<1>F g19<8,8,1>F { align1 };
-
-/* write */
-send (16) 0 null g0<8,8,1>UW write (
-	0,  /* binding_table */
-	8,  /* pixel scordboard clear, msg type simd16 single source */
-	4,  /* render target write */
-	0   /* no write commit message */
-	) 
-	mlen 10
-	rlen 0
-	{ align1 EOT };
-
-nop;
-nop;
-nop;
-nop;
-nop;
-nop;
-nop;
-nop;
-nop;
diff --git a/src/i965_composite_wm_nomask.h b/src/i965_composite_wm_nomask.h
deleted file mode 100644
index bd99dd9..0000000
--- a/src/i965_composite_wm_nomask.h
+++ /dev/null
@@ -1,68 +0,0 @@
-   { 0x00000001, 0x2080013d, 0x00000028, 0x00000000 },
-   { 0x00000040, 0x20840d3d, 0x00000028, 0x00000001 },
-   { 0x00000001, 0x2088013d, 0x00000028, 0x00000000 },
-   { 0x00000040, 0x208c0d3d, 0x00000028, 0x00000001 },
-   { 0x00000001, 0x20c0013d, 0x0000002a, 0x00000000 },
-   { 0x00000001, 0x20c4013d, 0x0000002a, 0x00000000 },
-   { 0x00000040, 0x20c80d3d, 0x0000002a, 0x00000001 },
-   { 0x00000040, 0x20cc0d3d, 0x0000002a, 0x00000001 },
-   { 0x00000001, 0x2090013d, 0x0000002c, 0x00000000 },
-   { 0x00000040, 0x20940d3d, 0x0000002c, 0x00000001 },
-   { 0x00000001, 0x2098013d, 0x0000002c, 0x00000000 },
-   { 0x00000040, 0x209c0d3d, 0x0000002c, 0x00000001 },
-   { 0x00000001, 0x20d0013d, 0x0000002e, 0x00000000 },
-   { 0x00000001, 0x20d4013d, 0x0000002e, 0x00000000 },
-   { 0x00000040, 0x20d80d3d, 0x0000002e, 0x00000001 },
-   { 0x00000040, 0x20dc0d3d, 0x0000002e, 0x00000001 },
-   { 0x00000001, 0x20a0013d, 0x00000030, 0x00000000 },
-   { 0x00000040, 0x20a40d3d, 0x00000030, 0x00000001 },
-   { 0x00000001, 0x20a8013d, 0x00000030, 0x00000000 },
-   { 0x00000040, 0x20ac0d3d, 0x00000030, 0x00000001 },
-   { 0x00000001, 0x20e0013d, 0x00000032, 0x00000000 },
-   { 0x00000001, 0x20e4013d, 0x00000032, 0x00000000 },
-   { 0x00000040, 0x20e80d3d, 0x00000032, 0x00000001 },
-   { 0x00000040, 0x20ec0d3d, 0x00000032, 0x00000001 },
-   { 0x00000001, 0x20b0013d, 0x00000034, 0x00000000 },
-   { 0x00000040, 0x20b40d3d, 0x00000034, 0x00000001 },
-   { 0x00000001, 0x20b8013d, 0x00000034, 0x00000000 },
-   { 0x00000040, 0x20bc0d3d, 0x00000034, 0x00000001 },
-   { 0x00000001, 0x20f0013d, 0x00000036, 0x00000000 },
-   { 0x00000001, 0x20f4013d, 0x00000036, 0x00000000 },
-   { 0x00000040, 0x20f80d3d, 0x00000036, 0x00000001 },
-   { 0x00000040, 0x20fc0d3d, 0x00000036, 0x00000001 },
-   { 0x00600040, 0x208077bd, 0x008d0080, 0x00004020 },
-   { 0x00600040, 0x20a077bd, 0x008d00a0, 0x00004020 },
-   { 0x00600041, 0x208077bd, 0x008d0080, 0x00000060 },
-   { 0x00600041, 0x20a077bd, 0x008d00a0, 0x00000060 },
-   { 0x00600040, 0x208077bd, 0x008d0080, 0x0000006c },
-   { 0x00600040, 0x20a077bd, 0x008d00a0, 0x0000006c },
-   { 0x00600040, 0x20c077bd, 0x008d00c0, 0x00004024 },
-   { 0x00600040, 0x20e077bd, 0x008d00e0, 0x00004024 },
-   { 0x00600041, 0x20c077bd, 0x008d00c0, 0x00000074 },
-   { 0x00600041, 0x20e077bd, 0x008d00e0, 0x00000074 },
-   { 0x00600040, 0x20c077bd, 0x008d00c0, 0x0000007c },
-   { 0x00600040, 0x20e077bd, 0x008d00e0, 0x0000007c },
-   { 0x00600001, 0x202003be, 0x008d0080, 0x00000000 },
-   { 0x00600001, 0x204003be, 0x008d00a0, 0x00000000 },
-   { 0x00600001, 0x206003be, 0x008d00c0, 0x00000000 },
-   { 0x00600001, 0x208003be, 0x008d00e0, 0x00000000 },
-   { 0x00800031, 0x21801d29, 0x008d0000, 0x02580001 },
-   { 0x00600001, 0x202003be, 0x008d0020, 0x00000000 },
-   { 0x00600001, 0x204003be, 0x008d0180, 0x00000000 },
-   { 0x00600001, 0x206003be, 0x008d01c0, 0x00000000 },
-   { 0x00600001, 0x208003be, 0x008d0200, 0x00000000 },
-   { 0x00600001, 0x20a003be, 0x008d0240, 0x00000000 },
-   { 0x00600001, 0x20c003be, 0x008d01a0, 0x00000000 },
-   { 0x00600001, 0x20e003be, 0x008d01e0, 0x00000000 },
-   { 0x00600001, 0x210003be, 0x008d0220, 0x00000000 },
-   { 0x00600001, 0x212003be, 0x008d0260, 0x00000000 },
-   { 0x00800031, 0x20001d3c, 0x008d0000, 0x85a04800 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
diff --git a/src/i965_exa_render.c b/src/i965_exa_render.c
index fe3007b..2751778 100644
--- a/src/i965_exa_render.c
+++ b/src/i965_exa_render.c
@@ -342,7 +342,7 @@ static const CARD32 sip_kernel_static[][
 #define SF_MAX_THREADS	   4
 
 static const CARD32 sf_kernel_static[][4] = {
-#include "sf_prog.h"
+#include "exa_sf_prog.h"
 };
 
 /* ps kernels */
@@ -350,7 +350,7 @@ static const CARD32 sf_kernel_static[][4
 #define PS_MAX_THREADS	   32
 /* 1: no mask */
 static const CARD32 ps_kernel_static_nomask [][4] = {
-	#include "i965_composite_wm_nomask.h"
+	#include "exa_wm_nomask_prog.h"
 };
 
 /* 2: mask with componentAlpha, src * mask color, XXX: later */
diff-tree 1329c86f2a4b3664920ded970a984a745a530da4 (from f9c3ef40100acb85fca6e49c5c0e98f5c9ac97ca)
Author: Wang Zhenyu <zhenyu.z.wang at intel.com>
Date:   Wed Nov 29 15:47:19 2006 +0800

    [PATCH] EXA state mem for G965
    
    Not split offscreen mem for exa, but alloc a dedicated one
    for G965 states.
    
    Signed-off-by: Keith Packard <keithp at neko.keithp.com>

diff --git a/src/i830.h b/src/i830.h
index d5ca5d4..3b7301e 100644
--- a/src/i830.h
+++ b/src/i830.h
@@ -73,6 +73,7 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN
 #ifdef I830_USE_EXA
 #include "exa.h"
 Bool I830EXAInit(ScreenPtr pScreen);
+#define EXA_LINEAR_EXTRA	(64*1024)
 #endif
 
 #ifdef I830_USE_XAA
@@ -267,6 +268,7 @@ typedef struct _I830Rec {
    I830MemRange Scratch2;
 #ifdef I830_USE_EXA
    I830MemRange Offscreen;
+   I830MemRange EXAStateMem;  /* specific exa state for G965 */
 #endif
    /* Regions allocated either from the above pools, or from agpgart. */
    I830MemRange	*CursorMem;
diff --git a/src/i830_exa.c b/src/i830_exa.c
index c5b91b0..3e874c9 100644
--- a/src/i830_exa.c
+++ b/src/i830_exa.c
@@ -425,7 +425,6 @@ IntelEXADoneComposite(PixmapPtr pDst)
 #endif
 }
 
-#define BRW_LINEAR_EXTRA (32*1024)
 /*
  * TODO:
  *   - Dual head?
@@ -448,11 +447,7 @@ I830EXAInit(ScreenPtr pScreen)
     pI830->EXADriverPtr->exa_minor = 0;
     pI830->EXADriverPtr->memoryBase = pI830->FbBase;
     pI830->EXADriverPtr->offScreenBase = pI830->Offscreen.Start;
-    if (IS_I965G(pI830))
-    	pI830->EXADriverPtr->memorySize = pI830->Offscreen.End -
-					BRW_LINEAR_EXTRA; /* BRW needs state buffer*/
-    else
-    	pI830->EXADriverPtr->memorySize = pI830->Offscreen.End;
+    pI830->EXADriverPtr->memorySize = pI830->Offscreen.End;
 	   
     DPRINTF(PFX, "EXA Mem: memoryBase 0x%x, end 0x%x, offscreen base 0x%x, memorySize 0x%x\n",
 		pI830->EXADriverPtr->memoryBase,
diff --git a/src/i830_memory.c b/src/i830_memory.c
index 5bbf3e3..20e3afb 100644
--- a/src/i830_memory.c
+++ b/src/i830_memory.c
@@ -841,6 +841,25 @@ I830Allocate2DMemory(ScrnInfoPtr pScrn, 
 		       pI830->Offscreen.Start, pI830->Offscreen.Size/1024);
 	 }
       }
+      if (IS_I965G(pI830)) {
+          memset(&(pI830->EXAStateMem), 0, sizeof(I830MemRange));
+          pI830->EXAStateMem.Key = -1;
+          size = ROUND_TO_PAGE(EXA_LINEAR_EXTRA);
+          align = GTT_PAGE_SIZE;
+          alloced = I830AllocVidMem(pScrn, &(pI830->EXAStateMem),
+				&(pI830->StolenPool), size, align,
+				flags | FROM_ANYWHERE | ALLOCATE_AT_TOP);
+          if (alloced < size) {
+             if (!dryrun) {
+         	 xf86DrvMsg(pScrn->scrnIndex, X_ERROR,
+		    "G965: Failed to allocate exa state buffer space.\n");
+             }
+             return FALSE;
+          }
+          xf86DrvMsgVerb(pScrn->scrnIndex, X_INFO, verbosity,
+ 		  "%sAllocated %ld kB for the G965 exa state buffer at 0x%lx - 0x%lx.\n", s, 
+ 		alloced / 1024, pI830->EXAStateMem.Start, pI830->EXAStateMem.End);
+      }
 #endif
    } else {
       long lineSize;
@@ -1493,6 +1512,11 @@ I830FixupOffsets(ScrnInfoPtr pScrn)
       }
    }
 #endif
+#ifdef I830_USE_EXA
+   I830FixOffset(pScrn, &(pI830->Offscreen));
+   if (IS_I965G(pI830))
+       I830FixOffset(pScrn, &(pI830->EXAStateMem));
+#endif
    return TRUE;
 }
 
@@ -1894,6 +1918,12 @@ I830BindAGPMemory(ScrnInfoPtr pScrn)
 	    return FALSE;
       }
 #endif
+#ifdef I830_USE_EXA
+     if (!BindMemRange(pScrn, &(pI830->Offscreen)))
+	return FALSE;
+     if (IS_I965G(pI830) && !BindMemRange(pScrn, &(pI830->EXAStateMem)))
+	return FALSE;
+#endif
       pI830->GttBound = 1;
    }
 
@@ -1979,6 +2009,12 @@ I830UnbindAGPMemory(ScrnInfoPtr pScrn)
 	    return FALSE;
       }
 #endif
+#ifdef I830_USE_EXA
+     if (!UnbindMemRange(pScrn, &(pI830->Offscreen)))
+	return FALSE;
+     if (IS_I965G(pI830) && !UnbindMemRange(pScrn, &(pI830->EXAStateMem)))
+	return FALSE;
+#endif
       if (!xf86ReleaseGART(pScrn->scrnIndex))
 	 return FALSE;
 
diff --git a/src/i965_exa_render.c b/src/i965_exa_render.c
index 7fbf99c..fe3007b 100644
--- a/src/i965_exa_render.c
+++ b/src/i965_exa_render.c
@@ -490,21 +490,12 @@ ErrorF("i965 prepareComposite\n");
    next_offset = binding_table_offset + (binding_table_entries * 4);
 
    total_state_size = next_offset;
+   assert(total_state_size < EXA_LINEAR_EXTRA);
 
-   /*
-    * XXX: Use the extra space allocated at the end of the exa offscreen buffer?
-    */
-#define BRW_LINEAR_EXTRA	(32*1024)
-
-   state_base_offset = (pI830->Offscreen.End -
-			BRW_LINEAR_EXTRA);
-   
+   state_base_offset = pI830->EXAStateMem.Start;
    state_base_offset = ALIGN(state_base_offset, 64);
    state_base = (char *)(pI830->FbBase + state_base_offset);
-   /* Set up our pointers to state structures in framebuffer.  It would probably
-    * be a good idea to fill these structures out in system memory and then dump
-    * them there, instead.
-    */
+
    vs_state = (void *)(state_base + vs_offset);
    sf_state = (void *)(state_base + sf_offset);
    wm_state = (void *)(state_base + wm_offset);
diff-tree f9c3ef40100acb85fca6e49c5c0e98f5c9ac97ca (from ef2f25e5c6ce04c202698c5122bc8627130073c0)
Author: Wang Zhenyu <zhenyu.z.wang at intel.com>
Date:   Tue Oct 10 15:50:10 2006 +0800

    [PATCH] Mark current ps kernel is experimential with little test.
    
    
    Signed-off-by: Keith Packard <keithp at neko.keithp.com>

diff --git a/src/i965_composite_wm_nomask.g4a b/src/i965_composite_wm_nomask.g4a
index 8791631..927d86a 100644
--- a/src/i965_composite_wm_nomask.g4a
+++ b/src/i965_composite_wm_nomask.g4a
@@ -1,6 +1,8 @@
 /*
  * This's for exa composite operation in no mask picture case.
- * The simplest case is just sending what src picture has to dst picture
+ * The simplest case is just sending what src picture has to dst picture.
+ * XXX: This is still experimental, and should be fixed to support multiple texture
+ * map, and conditional mul actions. 
  */
 
 /* I think this should be same as in g4a program for texture video,
diff-tree ef2f25e5c6ce04c202698c5122bc8627130073c0 (from ca608028c5301700444d39a1c631cc0d5648e1a2)
Author: Wang Zhenyu <zhenyu.z.wang at intel.com>
Date:   Tue Oct 10 14:11:35 2006 +0800

    [PATCH] Use sf_prog.h instead
    
    
    Signed-off-by: Keith Packard <keithp at neko.keithp.com>

diff --git a/src/i965_exa_render.c b/src/i965_exa_render.c
index 5528388..7fbf99c 100644
--- a/src/i965_exa_render.c
+++ b/src/i965_exa_render.c
@@ -89,7 +89,6 @@ struct formatinfo {
 
 // refer vol2, 3d rasterization 3.8.1
 
-/* XXX: bad!bad! broadwater has different blend factor definition */
 /* defined in brw_defines.h */
 static struct blendinfo I965BlendOp[] = { 
     /* Clear */
@@ -163,8 +162,6 @@ static void I965GetBlendCntl(int op, Pic
 
 }
 
-
-/* FIXME */
 static Bool I965GetDestFormat(PicturePtr pDstPicture, CARD32 *dst_format)
 {
     switch (pDstPicture->format) {
@@ -221,7 +218,6 @@ static Bool I965CheckCompositeTexture(Pi
         I830FALLBACK("Unsupported picture format 0x%x\n",
                          (int)pPict->format);
 
-    /* XXX: fallback when repeat? */
     if (pPict->repeat && pPict->repeatType != RepeatNormal)
 	I830FALLBACK("extended repeat (%d) not supported\n",
 		     pPict->repeatType);
@@ -346,40 +342,7 @@ static const CARD32 sip_kernel_static[][
 #define SF_MAX_THREADS	   4
 
 static const CARD32 sf_kernel_static[][4] = {
-/*    send   0 (1) g6<1>F g1.12<0,1,0>F math mlen 1 rlen 1 { align1 +  } */
-   { 0x00000031, 0x20c01fbd, 0x0000002c, 0x01110081 },
-/*    send   0 (1) g6.4<1>F g1.20<0,1,0>F math mlen 1 rlen 1 { align1 +  } */
-   { 0x00000031, 0x20c41fbd, 0x00000034, 0x01110081 },
-/*    add (8) g7<1>F g4<8,8,1>F g3<8,8,1>F { align1 +  } */
-   { 0x00600040, 0x20e077bd, 0x008d0080, 0x008d4060 },
-/*    mul (1) g7<1>F g7<0,1,0>F g6<0,1,0>F { align1 +  } */
-   { 0x00000041, 0x20e077bd, 0x000000e0, 0x000000c0 },
-/*    mul (1) g7.4<1>F g7.4<0,1,0>F g6.4<0,1,0>F { align1 +  } */
-   { 0x00000041, 0x20e477bd, 0x000000e4, 0x000000c4 },
-/*    mov (8) m1<1>F g7<0,1,0>F { align1 +  } */
-   { 0x00600001, 0x202003be, 0x000000e0, 0x00000000 },
-/*    mov (8) m2<1>F g7.4<0,1,0>F { align1 +  } */
-   { 0x00600001, 0x204003be, 0x000000e4, 0x00000000 },
-/*    mov (8) m3<1>F g3<8,8,1>F { align1 +  } */
-   { 0x00600001, 0x206003be, 0x008d0060, 0x00000000 },
-/*    send   0 (8) a0<1>F g0<8,8,1>F urb mlen 4 rlen 0 write +0 transpose used complete EOT{ align1 +  } */
-   { 0x00600031, 0x20001fbc, 0x008d0000, 0x8640c800 },
-/*    nop (4) g0<1>UD { align1 +  } */
-   { 0x0040007e, 0x20000c21, 0x00690000, 0x00000000 },
-/*    nop (4) g0<1>UD { align1 +  } */
-   { 0x0040007e, 0x20000c21, 0x00690000, 0x00000000 },
-/*    nop (4) g0<1>UD { align1 +  } */
-   { 0x0040007e, 0x20000c21, 0x00690000, 0x00000000 },
-/*    nop (4) g0<1>UD { align1 +  } */
-   { 0x0040007e, 0x20000c21, 0x00690000, 0x00000000 },
-/*    nop (4) g0<1>UD { align1 +  } */
-   { 0x0040007e, 0x20000c21, 0x00690000, 0x00000000 },
-/*    nop (4) g0<1>UD { align1 +  } */
-   { 0x0040007e, 0x20000c21, 0x00690000, 0x00000000 },
-/*    nop (4) g0<1>UD { align1 +  } */
-   { 0x0040007e, 0x20000c21, 0x00690000, 0x00000000 },
-/*    nop (4) g0<1>UD { align1 +  } */
-   { 0x0040007e, 0x20000c21, 0x00690000, 0x00000000 },
+#include "sf_prog.h"
 };
 
 /* ps kernels */
@@ -475,9 +438,8 @@ ErrorF("i965 prepareComposite\n");
    cc_offset = ALIGN(next_offset, 32);
    next_offset = cc_offset + sizeof(*cc_state);
 
-// fixup sf_kernel_static, is sf_kernel needed? or not? why? 
-//	-> just keep current sf_kernel, which will send one setup urb entry to
-//	PS kernel
+   /* keep current sf_kernel, which will send one setup urb entry to
+	PS kernel */
    sf_kernel_offset = ALIGN(next_offset, 64);
    next_offset = sf_kernel_offset + sizeof (sf_kernel_static);
 
@@ -965,7 +927,7 @@ ErrorF("i965 prepareComposite\n");
   // int vb_pitch = 4 * 4;  // XXX: pitch should include mask's coords? possible
   // all three coords on one row?
    int nelem = pMask ? 3: 2;
-   OUT_RING(BRW_3DSTATE_VERTEX_BUFFERS | 3); //should be 4n-1 -> 3
+   OUT_RING(BRW_3DSTATE_VERTEX_BUFFERS | 3); //XXX: should be 4n-1 -> 3
    OUT_RING((0 << VB0_BUFFER_INDEX_SHIFT) |
 	    VB0_VERTEXDATA |
 	    ((4 * 2 * nelem) << VB0_BUFFER_PITCH_SHIFT)); 
diff-tree ca608028c5301700444d39a1c631cc0d5648e1a2 (from 848368d5d0b90e03d3ec447cb5bd39fc87aea8df)
Author: Wang Zhenyu <zhenyu.z.wang at intel.com>
Date:   Thu Sep 28 13:55:52 2006 +0800

    [PATCH] Fix picture's transform checking
    
    
    Signed-off-by: Keith Packard <keithp at neko.keithp.com>

diff --git a/src/i965_exa_render.c b/src/i965_exa_render.c
index f7093f2..5528388 100644
--- a/src/i965_exa_render.c
+++ b/src/i965_exa_render.c
@@ -437,11 +437,22 @@ ErrorF("i965 prepareComposite\n");
     scale_units[2][0] = pDst->drawable.width;
     scale_units[2][1] = pDst->drawable.height;
 
+    if (pSrcPicture->transform) {
+	is_transform[0] = TRUE;
+	transform[0] = pSrcPicture->transform;
+    } else 
+	is_transform[0] = FALSE;
+
     if (!pMask) {
 	is_transform[1] = FALSE;
 	scale_units[1][0] = -1;
 	scale_units[1][1] = -1;
     } else {
+	if (pMaskPicture->transform) {
+	    is_transform[1] = TRUE;
+	    transform[1] = pMaskPicture->transform;
+	} else
+	    is_transform[1] = FALSE;
 	scale_units[1][0] = pMask->drawable.width;
 	scale_units[1][1] = pMask->drawable.height;
     }
diff-tree 848368d5d0b90e03d3ec447cb5bd39fc87aea8df (from df23624eebe938fa444c80cbedcd61919ec1aeda)
Author: Wang Zhenyu <zhenyu.z.wang at intel.com>
Date:   Thu Sep 28 11:15:33 2006 +0800

    [PATCH] Fallback in mask picture for now
    
    Do it later after finish wm kernel program.
    
    Signed-off-by: Keith Packard <keithp at neko.keithp.com>

diff --git a/src/i965_exa_render.c b/src/i965_exa_render.c
index b56bf7f..f7093f2 100644
--- a/src/i965_exa_render.c
+++ b/src/i965_exa_render.c
@@ -417,6 +417,11 @@ ErrorF("i965 prepareComposite\n");
 //    i965_surf_setup(pScrn, pSrcPicture, pMaskPicture, pDstPicture,
 //   			pSrc, pMask, pDst);
     // then setup blend, and shader program 
+    
+    /* FIXME: fallback in pMask for now, would be enable after finish
+	wm kernel program */
+    if (pMask)
+	I830FALLBACK("No mask support yet.\n");
 
     I965GetDestFormat(pDstPicture, &dst_format);
     src_offset = exaGetPixmapOffset(pSrc);
@@ -995,68 +1000,12 @@ ErrorF("i965 prepareComposite\n");
 	    (BRW_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT) |
 	    (8 << VE1_DESTINATION_ELEMENT_OFFSET_SHIFT)); 
 		//XXX: is this has alignment issue? and thread access problem?
-	    
    }
    
    ADVANCE_LP_RING();
     
    }
 
-    {
-	/* cc states */
-	/* dest buffer */
-	/* urbs */
-	/* binding tables */
-	/* clipping */
-	/* color blend (color calculator, dataport shared function)
-		COLOR_CALC_STATE/SURFACE_STATE(rendertarget's color blend enable
-		bit)
-		Errata!!!: brw-a/b, rendertarget 'local' color blending always
-		enabled! only control by global enable bit.
-	   surface format for blend, "Surface format table in Sampling Engine"
-	   XXX: if surface format not support, we should fallback.
-	*/
-	/* 
-	    render target should be defined in SURFACE_STATE
-	    	o render target SURFTYPE_BUFFER? 2D? Keith has 2D set.
-		o depth buffer SURFTYPE_NULL?
-	    color blend:
-	        o Errata!!: mush issue PIPE_CONTROL with Write Cache Flush
-		enable set, before transite to read-write color buffer. 
-	    	o disable pre/post-blending clamping
-		o enable color buffer blending enable in COLOR_CALC_STATE,(vol2, 3d rasterization 3.8) 
-		  enable color blending enable in SURFACE_STATE.(shared,
-		  sampling engine 1.7) 
-		  disable depth test
-		o (we don't use BLENDFACT_SRC_ALPHA_SATURATE, so don't care
-		the Errata for independent alpha blending, just use color
-		blending factor for all) disable independent alpha blending
-		in COLOR_CALC_STATE
-		o set src/dst blend factor in COLOR_CALC_STATE
-
-	*/
-    }
-
-	/* shader program 
-		o use sampler shared function for texture data
-		o submit result to dataport for later color blending */
-    {
-	 /* PS program:
-	 	o declare sampler and variables??
-		o 'send' cmd to Sampling Engine to load 'src' picture
-		o if (!pMask) then 'send' 'src' texture value to DataPort
-		target render cache
-		o else 
-		    - 'send' cmd to SE to load 'mask' picture
-		    - if no alpha, force to 1 (move 1 to W element of mask)
-		    - if (mask->componentAlpha) then mul 'src' & 'mask', 'send'
-		    	output to DataPort render cache
-		    - else mul 'src' & 'mask''s W element(alpha), 'send' output
-		    	to Dataport render cache
-	 */
-
-    }
-
 #ifdef I830DEBUG
     ErrorF("try to sync to show any errors...");
     I830Sync(pScrn);
diff-tree df23624eebe938fa444c80cbedcd61919ec1aeda (from fc944859b1b9605c748162bad1c93a6303c84aae)
Author: Wang Zhenyu <zhenyu.z.wang at intel.com>
Date:   Thu Sep 28 11:09:52 2006 +0800

    [PATCH] Fix compile, add wm header file.
    
    
    Signed-off-by: Keith Packard <keithp at neko.keithp.com>

diff --git a/src/Makefile.am b/src/Makefile.am
index b0c6c92..54e5657 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -94,6 +94,7 @@ i810_drv_la_SOURCES = \
 	 i830_xaa.c \
 	 i830_exa_render.c \
 	 i915_exa_render.c \
+	 i965_composite_wm_nomask.h \
 	 i965_exa_render.c
 
 if HAVE_GEN4ASM
diff --git a/src/i965_composite_wm_nomask.h b/src/i965_composite_wm_nomask.h
new file mode 100644
index 0000000..bd99dd9
--- /dev/null
+++ b/src/i965_composite_wm_nomask.h
@@ -0,0 +1,68 @@
+   { 0x00000001, 0x2080013d, 0x00000028, 0x00000000 },
+   { 0x00000040, 0x20840d3d, 0x00000028, 0x00000001 },
+   { 0x00000001, 0x2088013d, 0x00000028, 0x00000000 },
+   { 0x00000040, 0x208c0d3d, 0x00000028, 0x00000001 },
+   { 0x00000001, 0x20c0013d, 0x0000002a, 0x00000000 },
+   { 0x00000001, 0x20c4013d, 0x0000002a, 0x00000000 },
+   { 0x00000040, 0x20c80d3d, 0x0000002a, 0x00000001 },
+   { 0x00000040, 0x20cc0d3d, 0x0000002a, 0x00000001 },
+   { 0x00000001, 0x2090013d, 0x0000002c, 0x00000000 },
+   { 0x00000040, 0x20940d3d, 0x0000002c, 0x00000001 },
+   { 0x00000001, 0x2098013d, 0x0000002c, 0x00000000 },
+   { 0x00000040, 0x209c0d3d, 0x0000002c, 0x00000001 },
+   { 0x00000001, 0x20d0013d, 0x0000002e, 0x00000000 },
+   { 0x00000001, 0x20d4013d, 0x0000002e, 0x00000000 },
+   { 0x00000040, 0x20d80d3d, 0x0000002e, 0x00000001 },
+   { 0x00000040, 0x20dc0d3d, 0x0000002e, 0x00000001 },
+   { 0x00000001, 0x20a0013d, 0x00000030, 0x00000000 },
+   { 0x00000040, 0x20a40d3d, 0x00000030, 0x00000001 },
+   { 0x00000001, 0x20a8013d, 0x00000030, 0x00000000 },
+   { 0x00000040, 0x20ac0d3d, 0x00000030, 0x00000001 },
+   { 0x00000001, 0x20e0013d, 0x00000032, 0x00000000 },
+   { 0x00000001, 0x20e4013d, 0x00000032, 0x00000000 },
+   { 0x00000040, 0x20e80d3d, 0x00000032, 0x00000001 },
+   { 0x00000040, 0x20ec0d3d, 0x00000032, 0x00000001 },
+   { 0x00000001, 0x20b0013d, 0x00000034, 0x00000000 },
+   { 0x00000040, 0x20b40d3d, 0x00000034, 0x00000001 },
+   { 0x00000001, 0x20b8013d, 0x00000034, 0x00000000 },
+   { 0x00000040, 0x20bc0d3d, 0x00000034, 0x00000001 },
+   { 0x00000001, 0x20f0013d, 0x00000036, 0x00000000 },
+   { 0x00000001, 0x20f4013d, 0x00000036, 0x00000000 },
+   { 0x00000040, 0x20f80d3d, 0x00000036, 0x00000001 },
+   { 0x00000040, 0x20fc0d3d, 0x00000036, 0x00000001 },
+   { 0x00600040, 0x208077bd, 0x008d0080, 0x00004020 },
+   { 0x00600040, 0x20a077bd, 0x008d00a0, 0x00004020 },
+   { 0x00600041, 0x208077bd, 0x008d0080, 0x00000060 },
+   { 0x00600041, 0x20a077bd, 0x008d00a0, 0x00000060 },
+   { 0x00600040, 0x208077bd, 0x008d0080, 0x0000006c },
+   { 0x00600040, 0x20a077bd, 0x008d00a0, 0x0000006c },
+   { 0x00600040, 0x20c077bd, 0x008d00c0, 0x00004024 },
+   { 0x00600040, 0x20e077bd, 0x008d00e0, 0x00004024 },
+   { 0x00600041, 0x20c077bd, 0x008d00c0, 0x00000074 },
+   { 0x00600041, 0x20e077bd, 0x008d00e0, 0x00000074 },
+   { 0x00600040, 0x20c077bd, 0x008d00c0, 0x0000007c },
+   { 0x00600040, 0x20e077bd, 0x008d00e0, 0x0000007c },
+   { 0x00600001, 0x202003be, 0x008d0080, 0x00000000 },
+   { 0x00600001, 0x204003be, 0x008d00a0, 0x00000000 },
+   { 0x00600001, 0x206003be, 0x008d00c0, 0x00000000 },
+   { 0x00600001, 0x208003be, 0x008d00e0, 0x00000000 },
+   { 0x00800031, 0x21801d29, 0x008d0000, 0x02580001 },
+   { 0x00600001, 0x202003be, 0x008d0020, 0x00000000 },
+   { 0x00600001, 0x204003be, 0x008d0180, 0x00000000 },
+   { 0x00600001, 0x206003be, 0x008d01c0, 0x00000000 },
+   { 0x00600001, 0x208003be, 0x008d0200, 0x00000000 },
+   { 0x00600001, 0x20a003be, 0x008d0240, 0x00000000 },
+   { 0x00600001, 0x20c003be, 0x008d01a0, 0x00000000 },
+   { 0x00600001, 0x20e003be, 0x008d01e0, 0x00000000 },
+   { 0x00600001, 0x210003be, 0x008d0220, 0x00000000 },
+   { 0x00600001, 0x212003be, 0x008d0260, 0x00000000 },
+   { 0x00800031, 0x20001d3c, 0x008d0000, 0x85a04800 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
diff --git a/src/i965_exa_render.c b/src/i965_exa_render.c
index dfa9a04..b56bf7f 100644
--- a/src/i965_exa_render.c
+++ b/src/i965_exa_render.c
@@ -68,6 +68,9 @@ extern void
 I965EXAComposite(PixmapPtr pDst, int srcX, int srcY, int maskX, int maskY,
 		int dstX, int dstY, int width, int height);
 
+static void I965GetBlendCntl(int op, PicturePtr pMask, CARD32 dst_format, 
+			     CARD32 *sblend, CARD32 *dblend);
+
 extern float scale_units[2][2];
 extern Bool is_transform[2];
 extern PictTransform *transform[2];
@@ -90,31 +93,31 @@ struct formatinfo {
 /* defined in brw_defines.h */
 static struct blendinfo I965BlendOp[] = { 
     /* Clear */
-    {0, 0, BRW_BLENDFACT_ZERO,          BRW_BLENDFACT_ZERO},
+    {0, 0, BRW_BLENDFACTOR_ZERO,          BRW_BLENDFACTOR_ZERO},
     /* Src */
-    {0, 0, BRW_BLENDFACT_ONE,           BRW_BLENDFACT_ZERO},
+    {0, 0, BRW_BLENDFACTOR_ONE,           BRW_BLENDFACTOR_ZERO},
     /* Dst */
-    {0, 0, BRW_BLENDFACT_ZERO,          BRW_BLENDFACT_ONE},
+    {0, 0, BRW_BLENDFACTOR_ZERO,          BRW_BLENDFACTOR_ONE},
     /* Over */
-    {0, 1, BRW_BLENDFACT_ONE,           BRW_BLENDFACT_INV_SRC_ALPHA},
+    {0, 1, BRW_BLENDFACTOR_ONE,           BRW_BLENDFACTOR_INV_SRC_ALPHA},
     /* OverReverse */
-    {1, 0, BRW_BLENDFACT_INV_DST_ALPHA, BRW_BLENDFACT_ONE},
+    {1, 0, BRW_BLENDFACTOR_INV_DST_ALPHA, BRW_BLENDFACTOR_ONE},
     /* In */
-    {1, 0, BRW_BLENDFACT_DST_ALPHA,     BRW_BLENDFACT_ZERO},
+    {1, 0, BRW_BLENDFACTOR_DST_ALPHA,     BRW_BLENDFACTOR_ZERO},
     /* InReverse */
-    {0, 1, BRW_BLENDFACT_ZERO,          BRW_BLENDFACT_SRC_ALPHA},
+    {0, 1, BRW_BLENDFACTOR_ZERO,          BRW_BLENDFACTOR_SRC_ALPHA},
     /* Out */
-    {1, 0, BRW_BLENDFACT_INV_DST_ALPHA, BRW_BLENDFACT_ZERO},
+    {1, 0, BRW_BLENDFACTOR_INV_DST_ALPHA, BRW_BLENDFACTOR_ZERO},
     /* OutReverse */
-    {0, 1, BRW_BLENDFACT_ZERO,          BRW_BLENDFACT_INV_SRC_ALPHA},
+    {0, 1, BRW_BLENDFACTOR_ZERO,          BRW_BLENDFACTOR_INV_SRC_ALPHA},
     /* Atop */
-    {1, 1, BRW_BLENDFACT_DST_ALPHA,     BRW_BLENDFACT_INV_SRC_ALPHA},
+    {1, 1, BRW_BLENDFACTOR_DST_ALPHA,     BRW_BLENDFACTOR_INV_SRC_ALPHA},
     /* AtopReverse */
-    {1, 1, BRW_BLENDFACT_INV_DST_ALPHA, BRW_BLENDFACT_SRC_ALPHA},
+    {1, 1, BRW_BLENDFACTOR_INV_DST_ALPHA, BRW_BLENDFACTOR_SRC_ALPHA},
     /* Xor */
-    {1, 1, BRW_BLENDFACT_INV_DST_ALPHA, BRW_BLENDFACT_INV_SRC_ALPHA},
+    {1, 1, BRW_BLENDFACTOR_INV_DST_ALPHA, BRW_BLENDFACTOR_INV_SRC_ALPHA},
     /* Add */
-    {0, 0, BRW_BLENDFACT_ONE,           BRW_BLENDFACT_ONE},
+    {0, 0, BRW_BLENDFACTOR_ONE,           BRW_BLENDFACTOR_ONE},
 };
 
 /* FIXME: surface format defined in brw_defines.h, shared Sampling engine 1.7.2*/
@@ -124,8 +127,8 @@ static struct formatinfo I965TexFormats[
         {PICT_a8b8g8r8, BRW_SURFACEFORMAT_B8G8R8A8_UNORM },
         {PICT_x8b8g8r8, BRW_SURFACEFORMAT_B8G8R8X8_UNORM },
         {PICT_r5g6b5,   BRW_SURFACEFORMAT_B5G6R5_UNORM   },
-        {PICT_a1r5g5b5, BRW_SURFACEFORMAT_B5G6R5A1_UNORM },
-        {PICT_x1r5g5b5, BRW_SURFACEFORMAT_B5G6R5X1_UNORM },
+        {PICT_a1r5g5b5, BRW_SURFACEFORMAT_B5G5R5A1_UNORM },
+        {PICT_x1r5g5b5, BRW_SURFACEFORMAT_B5G5R5X1_UNORM },
         {PICT_a8,       BRW_SURFACEFORMAT_A8_UNORM	 },
 };
 
@@ -140,10 +143,10 @@ static void I965GetBlendCntl(int op, Pic
      * it as always 1.
      */
     if (PICT_FORMAT_A(dst_format) == 0 && I965BlendOp[op].dst_alpha) {
-        if (*sblend == BRW_BLENDFACT_DST_ALPHA)
-            *sblend = BRW_BLENDFACT_ONE;
-        else if (*sblend == BRW_BLENDFACT_INV_DST_ALPHA)
-            *sblend = BRW_BLENDFACT_ZERO;
+        if (*sblend == BRW_BLENDFACTOR_DST_ALPHA)
+            *sblend = BRW_BLENDFACTOR_ONE;
+        else if (*sblend == BRW_BLENDFACTOR_INV_DST_ALPHA)
+            *sblend = BRW_BLENDFACTOR_ZERO;
     }
 
     /* If the source alpha is being used, then we should only be in a case where
@@ -151,10 +154,10 @@ static void I965GetBlendCntl(int op, Pic
      * channels multiplied by the source picture's alpha.
      */
     if (pMask && pMask->componentAlpha && I965BlendOp[op].src_alpha) {
-        if (*dblend == BRW_BLENDFACT_SRC_ALPHA) {
-	    *dblend = BRW_BLENDFACT_SRC_COLR;
-        } else if (*dblend == BRW_BLENDFACT_INV_SRC_ALPHA) {
-	    *dblend = BRW_BLENDFACT_INV_SRC_COLR;
+        if (*dblend == BRW_BLENDFACTOR_SRC_ALPHA) {
+	    *dblend = BRW_BLENDFACTOR_SRC_COLOR;
+        } else if (*dblend == BRW_BLENDFACTOR_INV_SRC_ALPHA) {
+	    *dblend = BRW_BLENDFACTOR_INV_SRC_COLOR;
         }
     }
 
@@ -173,10 +176,10 @@ static Bool I965GetDestFormat(PicturePtr
         *dst_format = BRW_SURFACEFORMAT_B5G6R5_UNORM;
         break;
     case PICT_a1r5g5b5:
-    	*dst_format = BRW_SURFACEFORMAT_B5G6R5A1_UNORM;
+    	*dst_format = BRW_SURFACEFORMAT_B5G5R5A1_UNORM;
 	break;
     case PICT_x1r5g5b5:
-        *dst_format = BRW_SURFACEFORMAT_B5G6R5X1_UNORM;
+        *dst_format = BRW_SURFACEFORMAT_B5G5R5X1_UNORM;
         break;
     /* COLR_BUF_8BIT is special for YUV surfaces.  While we may end up being
      * able to use it depending on how the hardware implements it, disable it
@@ -250,7 +253,7 @@ I965EXACheckComposite(int op, PicturePtr
          * source value that we get to blend with.
          */
         if (I965BlendOp[op].src_alpha &&
-            (I965BlendOp[op].src_blend != BRW_BLENDFACT_ZERO))
+            (I965BlendOp[op].src_blend != BRW_BLENDFACTOR_ZERO))
             	I830FALLBACK("Component alpha not supported with source "
                             "alpha and source value blending.\n");
 	/* XXX: fallback now for mask with componentAlpha */
@@ -297,7 +300,7 @@ struct brw_instruction *sip_kernel;
 CARD32 *binding_table;
 int binding_table_entries; 
 
-int dest_surf_offset, src_surf_offset;
+int dest_surf_offset, src_surf_offset, mask_surf_offset;
 int src_sampler_offset, mask_sampler_offset,vs_offset;
 int sf_offset, wm_offset, cc_offset, vb_offset, cc_viewport_offset;
 int sf_kernel_offset, ps_kernel_offset, sip_kernel_offset;
@@ -308,7 +311,7 @@ int state_base_offset;
 float *vb;
 int vb_size = 4 * 4 ; /* 4 DWORDS per vertex, 4 vertices for TRIFAN*/ 
 
-int src_blend, dst_blend;
+CARD32 src_blend, dst_blend;
 
 static const CARD32 sip_kernel_static[][4] = {
 /*    wait (1) a0<1>UW a145<0,1,0>UW { align1 +  } */
@@ -380,6 +383,8 @@ static const CARD32 sf_kernel_static[][4
 };
 
 /* ps kernels */
+#define PS_KERNEL_NUM_GRF   32
+#define PS_MAX_THREADS	   32
 /* 1: no mask */
 static const CARD32 ps_kernel_static_nomask [][4] = {
 	#include "i965_composite_wm_nomask.h"
@@ -387,12 +392,12 @@ static const CARD32 ps_kernel_static_nom
 
 /* 2: mask with componentAlpha, src * mask color, XXX: later */
 static const CARD32 ps_kernel_static_maskca [][4] = {
-	#include "i965_composite_wm_maskca.h"
+/*#include "i965_composite_wm_maskca.h" */
 };
 
 /* 3: mask without componentAlpha, src * mask alpha */
 static const CARD32 ps_kernel_static_masknoca [][4] = {
-	#include "i965_composite_wm_masknoca.h"
+/*#include "i965_composite_wm_masknoca.h" */
 };
 
 Bool
@@ -403,9 +408,8 @@ I965EXAPrepareComposite(int op, PictureP
     ScrnInfoPtr pScrn = xf86Screens[pSrcPicture->pDrawable->pScreen->myNum];
     I830Ptr pI830 = I830PTR(pScrn);
     CARD32 src_offset, src_pitch;
-    CARD32 mask_offset, mask_pitch;
+    CARD32 mask_offset = 0, mask_pitch = 0;
     CARD32 dst_format, dst_offset, dst_pitch;
-    CARD32 blendctl;
  
 ErrorF("i965 prepareComposite\n");
 
@@ -437,7 +441,6 @@ ErrorF("i965 prepareComposite\n");
 	scale_units[1][1] = pMask->drawable.height;
     }
 
-/* FIXME */
 	/* setup 3d pipeline state */
 
    binding_table_entries = 2; /* default no mask */
@@ -602,7 +605,7 @@ ErrorF("i965 prepareComposite\n");
 //   cc_state->cc5.ia_src_blend_factor = BRW_BLENDFACTOR_ONE;
 //   cc_state->cc5.ia_dest_blend_factor = BRW_BLENDFACTOR_ONE;
    cc_state->cc6.blend_function = BRW_BLENDFUNCTION_ADD;
-   I965GetBlendCntl(op, pMask, pDstPicture->format, 
+   I965GetBlendCntl(op, pMaskPicture, pDstPicture->format, 
 		    &src_blend, &dst_blend);
    cc_state->cc6.src_blend_factor = src_blend;
    cc_state->cc6.dest_blend_factor = dst_blend;
@@ -703,7 +706,7 @@ ErrorF("i965 prepareComposite\n");
 
    /* PS kernel use this sampler */
    memset(src_sampler_state, 0, sizeof(*src_sampler_state));
-   src_sampler_state->ss0.lod_peclamp = 1; /* GL mode */
+   src_sampler_state->ss0.lod_preclamp = 1; /* GL mode */
    switch(pSrcPicture->filter) {
    case PictFilterNearest:
    	src_sampler_state->ss0.min_filter = BRW_MAPFILTER_NEAREST; 
@@ -733,7 +736,7 @@ ErrorF("i965 prepareComposite\n");
 
    if (pMask) {
    	memset(mask_sampler_state, 0, sizeof(*mask_sampler_state));
-   	mask_sampler_state->ss0.lod_peclamp = 1; /* GL mode */
+   	mask_sampler_state->ss0.lod_preclamp = 1; /* GL mode */
    	switch(pMaskPicture->filter) {
    	case PictFilterNearest:
    	    mask_sampler_state->ss0.min_filter = BRW_MAPFILTER_NEAREST; 
@@ -1065,6 +1068,8 @@ void
 I965EXAComposite(PixmapPtr pDst, int srcX, int srcY, int maskX, int maskY,
 		int dstX, int dstY, int w, int h)
 {
+    ScrnInfoPtr pScrn = xf86Screens[pDst->drawable.pScreen->myNum];
+    I830Ptr pI830 = I830PTR(pScrn);
     int srcXend, srcYend, maskXend, maskYend;
     PictVector v;
     int pMask = 1, i = 0;
diff-tree fc944859b1b9605c748162bad1c93a6303c84aae (from acdc2da77b445e9347a4c6e53e35c81763cbb0b8)
Author: Wang Zhenyu <zhenyu.z.wang at intel.com>
Date:   Thu Sep 28 10:36:00 2006 +0800

    [PATCH] Add simplest wm kernel program for no mask picture composite
    
    This is a try to use new gen4asm language, and will finish
    composite program for mask picture with or without CA case later.
    
    Signed-off-by: Keith Packard <keithp at neko.keithp.com>

diff --git a/src/Makefile.am b/src/Makefile.am
index 8285406..b0c6c92 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -101,6 +101,8 @@ sf_prog.h: packed_yuv_sf.g4a
 	intel-gen4asm -o sf_prog.h packed_yuv_sf.g4a
 wm_prog.h: packed_yuv_wm.g4a
 	intel-gen4asm -o wm_prog.h packed_yuv_wm.g4a
+i965_composite_wm_nomask.h: i965_composite_wm_nomask.g4a
+	intel-gen4asm -o i965_composite_wm_nomask.h i965_composite_wm_nomask.g4a
 endif
 
 if DRI
diff --git a/src/i965_composite_wm_nomask.g4a b/src/i965_composite_wm_nomask.g4a
new file mode 100644
index 0000000..8791631
--- /dev/null
+++ b/src/i965_composite_wm_nomask.g4a
@@ -0,0 +1,139 @@
+/*
+ * This's for exa composite operation in no mask picture case.
+ * The simplest case is just sending what src picture has to dst picture
+ */
+
+/* I think this should be same as in g4a program for texture video,
+   as we also use 16-pixel dispatch. and SF scale in g3 is useful for us. */
+
+/* The initial payload of the thread is always g0.
+ * WM_URB (incoming URB entries) is g3
+ * X0_R is g4
+ * X1_R is g5
+ * Y0_R is g6
+ * Y1_R is g7
+ */
+
+    /* Set up ss0.x coordinates*/
+mov (1) g4<1>F g1.8<0,1,0>UW { align1 };
+add (1) g4.4<1>F g1.8<0,1,0>UW 1UB { align1 };
+mov (1) g4.8<1>F g1.8<0,1,0>UW { align1 };
+add (1) g4.12<1>F g1.8<0,1,0>UW 1UB { align1 };
+    /* Set up ss0.y coordinates */
+mov (1) g6<1>F g1.10<0,1,0>UW { align1 };
+mov (1) g6.4<1>F g1.10<0,1,0>UW { align1 };
+add (1) g6.8<1>F g1.10<0,1,0>UW 1UB { align1 };
+add (1) g6.12<1>F g1.10<0,1,0>UW 1UB { align1 };
+    /* set up ss1.x coordinates */
+mov (1) g4.16<1>F g1.12<0,1,0>UW { align1 };
+add (1) g4.20<1>F g1.12<0,1,0>UW 1UB { align1 };
+mov (1) g4.24<1>F g1.12<0,1,0>UW { align1 };
+add (1) g4.28<1>F g1.12<0,1,0>UW 1UB { align1 };
+    /* set up ss1.y coordinates */
+mov (1) g6.16<1>F g1.14<0,1,0>UW { align1 };
+mov (1) g6.20<1>F g1.14<0,1,0>UW { align1 };
+add (1) g6.24<1>F g1.14<0,1,0>UW 1UB { align1 };
+add (1) g6.28<1>F g1.14<0,1,0>UW 1UB { align1 };
+    /* Set up ss2.x coordinates */
+mov (1) g5<1>F g1.16<0,1,0>UW { align1 };
+add (1) g5.4<1>F g1.16<0,1,0>UW 1UB { align1 };
+mov (1) g5.8<1>F g1.16<0,1,0>UW { align1 };
+add (1) g5.12<1>F g1.16<0,1,0>UW 1UB { align1 };
+    /* Set up ss2.y coordinates */
+mov (1) g7<1>F g1.18<0,1,0>UW { align1 };
+mov (1) g7.4<1>F g1.18<0,1,0>UW { align1 };
+add (1) g7.8<1>F g1.18<0,1,0>UW 1UB { align1 };
+add (1) g7.12<1>F g1.18<0,1,0>UW 1UB { align1 };
+    /* Set up ss3.x coordinates */
+mov (1) g5.16<1>F g1.20<0,1,0>UW { align1 };
+add (1) g5.20<1>F g1.20<0,1,0>UW 1UB { align1 };
+mov (1) g5.24<1>F g1.20<0,1,0>UW { align1 };
+add (1) g5.28<1>F g1.20<0,1,0>UW 1UB { align1 };
+    /* Set up ss3.y coordinates */
+mov (1) g7.16<1>F g1.22<0,1,0>UW { align1 };
+mov (1) g7.20<1>F g1.22<0,1,0>UW { align1 };
+add (1) g7.24<1>F g1.22<0,1,0>UW 1UB { align1 };
+add (1) g7.28<1>F g1.22<0,1,0>UW 1UB { align1 };
+
+    /* Now, map these screen space coordinates into texture coordinates. */
+    /* subtract screen-space X origin of vertex 0. */
+add (8) g4<1>F g4<8,8,1>F -g1<0,1,0>F { align1 };
+add (8) g5<1>F g5<8,8,1>F -g1<0,1,0>F { align1 };
+    /* scale by texture X increment */
+mul (8) g4<1>F g4<8,8,1>F g3<0,1,0>F { align1 };
+mul (8) g5<1>F g5<8,8,1>F g3<0,1,0>F { align1 };
+    /* add in texture X offset */
+add (8) g4<1>F g4<8,8,1>F g3.12<0,1,0>F { align1 };
+add (8) g5<1>F g5<8,8,1>F g3.12<0,1,0>F { align1 };
+    /* subtract screen-space Y origin of vertex 0. */
+add (8) g6<1>F g6<8,8,1>F -g1.4<0,1,0>F { align1 };
+add (8) g7<1>F g7<8,8,1>F -g1.4<0,1,0>F { align1 };
+    /* scale by texture Y increment */
+mul (8) g6<1>F g6<8,8,1>F g3.20<0,1,0>F { align1 };
+mul (8) g7<1>F g7<8,8,1>F g3.20<0,1,0>F { align1 };
+    /* add in texture Y offset */
+add (8) g6<1>F g6<8,8,1>F g3.28<0,1,0>F { align1 };
+add (8) g7<1>F g7<8,8,1>F g3.28<0,1,0>F { align1 };
+
+/* prepare sampler read back gX register, which would be written back to output */
+
+/* use simd16 sampler, param 0 is u, param 1 is v. */
+/* 'payload' loading, assuming tex coord start from g4 */
+mov (8) m1<1>F g4<8,8,1>F { align1 };
+mov (8) m2<1>F g5<8,8,1>F { align1 };  /* param 0 u in m1, m2 */
+mov (8) m3<1>F g6<8,8,1>F { align1 };
+mov (8) m4<1>F g7<8,8,1>F { align1 };  /* param 1 v in m3, m4 */
+
+/* m0 will be copied with g0, as it contains send desc */
+/* emit sampler 'send' cmd */
+send (16) 0 		/* msg reg index */
+	g12<1>UW 	/* readback */
+	g0<8,8,1>UW  	/* copy to msg start reg*/
+	sampler (1,0,F)  /* sampler message description, (binding_table,sampler_index,datatype)
+			 /* here(src->dst) we should use src_sampler and src_surface */
+	mlen 5 rlen 8 { align1 };   /* required message len 5, readback len 8 */
+
+/* if we set up read-back reg correctly, emit dataport write 'send' cmd with EOT */
+
+/* m0, m1 are all direct passed by PS thread payload */
+mov (8) m1<1>F g1<8,8,1>F { align1 };
+
+/* prepare data in m2-m5 for subspan(1,0), m6-m9 for subspan(3,2), then it's ready to write */
+/* g12 -> m2
+   g13 -> m6
+   g14 -> m3
+   g15 -> m7
+   g16 -> m4
+   g17 -> m8
+   g18 -> m5
+   g19 -> m9
+*/
+mov (8) m2<1>F g12<8,8,1>F { align1 };
+mov (8) m3<1>F g14<8,8,1>F { align1 };
+mov (8) m4<1>F g16<8,8,1>F { align1 };
+mov (8) m5<1>F g18<8,8,1>F { align1 };
+mov (8) m6<1>F g13<8,8,1>F { align1 };
+mov (8) m7<1>F g15<8,8,1>F { align1 };
+mov (8) m8<1>F g17<8,8,1>F { align1 };
+mov (8) m9<1>F g19<8,8,1>F { align1 };
+
+/* write */
+send (16) 0 null g0<8,8,1>UW write (
+	0,  /* binding_table */
+	8,  /* pixel scordboard clear, msg type simd16 single source */
+	4,  /* render target write */
+	0   /* no write commit message */
+	) 
+	mlen 10
+	rlen 0
+	{ align1 EOT };
+
+nop;
+nop;
+nop;
+nop;
+nop;
+nop;
+nop;
+nop;
+nop;
diff-tree acdc2da77b445e9347a4c6e53e35c81763cbb0b8 (from 926d7fb09aaaabf050949ce7c6127c68441c8801)
Author: Wang Zhenyu <zhenyu.z.wang at intel.com>
Date:   Wed Sep 27 16:48:43 2006 +0800

    [PATCH] Add mask sampler state
    
    
    Signed-off-by: Keith Packard <keithp at neko.keithp.com>

diff --git a/src/i965_exa_render.c b/src/i965_exa_render.c
index 942f0eb..dfa9a04 100644
--- a/src/i965_exa_render.c
+++ b/src/i965_exa_render.c
@@ -282,7 +282,7 @@ struct brw_surface_state *dest_surf_stat
 struct brw_surface_state *src_surf_state;
 struct brw_surface_state *mask_surf_state;
 struct brw_sampler_state *src_sampler_state;
-struct brw_sampler_state *mask_sampler_state;  // could just use one sampler?
+struct brw_sampler_state *mask_sampler_state;  
 
 struct brw_vs_unit_state *vs_state;
 struct brw_sf_unit_state *sf_state;
@@ -297,7 +297,8 @@ struct brw_instruction *sip_kernel;
 CARD32 *binding_table;
 int binding_table_entries; 
 
-int dest_surf_offset, src_surf_offset, src_sampler_offset, vs_offset;
+int dest_surf_offset, src_surf_offset;
+int src_sampler_offset, mask_sampler_offset,vs_offset;
 int sf_offset, wm_offset, cc_offset, vb_offset, cc_viewport_offset;
 int sf_kernel_offset, ps_kernel_offset, sip_kernel_offset;
 int binding_table_offset;
@@ -381,17 +382,17 @@ static const CARD32 sf_kernel_static[][4
 /* ps kernels */
 /* 1: no mask */
 static const CARD32 ps_kernel_static_nomask [][4] = {
-	#include "i965_composite_ps_nomask.h"
+	#include "i965_composite_wm_nomask.h"
 };
 
 /* 2: mask with componentAlpha, src * mask color, XXX: later */
 static const CARD32 ps_kernel_static_maskca [][4] = {
-	#include "i965_composite_ps_maskca.h"
+	#include "i965_composite_wm_maskca.h"
 };
 
 /* 3: mask without componentAlpha, src * mask alpha */
 static const CARD32 ps_kernel_static_masknoca [][4] = {
-	#include "i965_composite_ps_masknoca.h"
+	#include "i965_composite_wm_masknoca.h"
 };
 
 Bool
@@ -478,11 +479,14 @@ ErrorF("i965 prepareComposite\n");
    cc_viewport_offset = ALIGN(next_offset, 32);
    next_offset = cc_viewport_offset + sizeof(*cc_viewport);
 
-   // : fix for texture sampler
-   // XXX: -> use only one sampler
+   // for texture sampler
    src_sampler_offset = ALIGN(next_offset, 32);
    next_offset = src_sampler_offset + sizeof(*src_sampler_state);
 
+   if (pMask) {
+   	mask_sampler_offset = ALIGN(next_offset, 32);
+   	next_offset = mask_sampler_offset + sizeof(*mask_sampler_state);
+   }
    /* Align VB to native size of elements, for safety */
    vb_offset = ALIGN(next_offset, 8);
    next_offset = vb_offset + vb_size;
@@ -536,6 +540,9 @@ ErrorF("i965 prepareComposite\n");
 	mask_surf_state = (void *)(state_base + mask_surf_offset);
 
    src_sampler_state = (void *)(state_base + src_sampler_offset);
+   if (pMask)
+	mask_sampler_state = (void *)(state_base + mask_sampler_offset);
+
    binding_table = (void *)(state_base + binding_table_offset);
 
    vb = (void *)(state_base + vb_offset);
@@ -724,6 +731,37 @@ ErrorF("i965 prepareComposite\n");
     	   and just a single texel tex map, with R32G32B32A32_FLOAT */
    src_sampler_state->ss3.chroma_key_enable = 0; /* disable chromakey */
 
+   if (pMask) {
+   	memset(mask_sampler_state, 0, sizeof(*mask_sampler_state));
+   	mask_sampler_state->ss0.lod_peclamp = 1; /* GL mode */
+   	switch(pMaskPicture->filter) {
+   	case PictFilterNearest:
+   	    mask_sampler_state->ss0.min_filter = BRW_MAPFILTER_NEAREST; 
+   	    mask_sampler_state->ss0.mag_filter = BRW_MAPFILTER_NEAREST;
+	    break;
+   	case PictFilterBilinear:
+   	    mask_sampler_state->ss0.min_filter = BRW_MAPFILTER_LINEAR; 
+   	    mask_sampler_state->ss0.mag_filter = BRW_MAPFILTER_LINEAR;
+	    break;
+   	default:
+	    I830FALLBACK("Bad filter 0x%x\n", pMaskPicture->filter);
+   	}
+
+   	if (!pMaskPicture->repeat) {
+	/* XXX: clamp_border and set border to 0 */
+   	    mask_sampler_state->ss1.r_wrap_mode = BRW_TEXCOORDMODE_CLAMP; 
+   	    mask_sampler_state->ss1.s_wrap_mode = BRW_TEXCOORDMODE_CLAMP;
+   	    mask_sampler_state->ss1.t_wrap_mode = BRW_TEXCOORDMODE_CLAMP;
+   	} else {
+   	    mask_sampler_state->ss1.r_wrap_mode = BRW_TEXCOORDMODE_WRAP; 
+   	    mask_sampler_state->ss1.s_wrap_mode = BRW_TEXCOORDMODE_WRAP;
+   	    mask_sampler_state->ss1.t_wrap_mode = BRW_TEXCOORDMODE_WRAP;
+    	}
+   /* XXX: ss2 has border color pointer, which should be in general state address,
+    	   and just a single texel tex map, with R32G32B32A32_FLOAT */
+   	mask_sampler_state->ss3.chroma_key_enable = 0; /* disable chromakey */
+   }
+
    /* Set up the vertex shader to be disabled (passthrough) */
    memset(vs_state, 0, sizeof(*vs_state));
    // XXX: vs URB should be defined for VF vertex URB store. done already?
@@ -783,26 +821,26 @@ ErrorF("i965 prepareComposite\n");
    wm_state->thread0.grf_reg_count = ((PS_KERNEL_NUM_GRF & ~15) / 16);
    wm_state->thread1.single_program_flow = 1;
    if (!pMask)
-       wm_state->thread1.binding_table_entry_count = 2; /* tex and fb */
+       wm_state->thread1.binding_table_entry_count = 2; /* 1 tex and fb */
    else
-       wm_state->thread1.binding_table_entry_count = 3; /* tex and fb */
+       wm_state->thread1.binding_table_entry_count = 3; /* 2 tex and fb */
 
    wm_state->thread2.scratch_space_base_pointer = 0;
    wm_state->thread2.per_thread_scratch_space = 0;
    // XXX: urb allocation
-   wm_state->thread3.dispatch_grf_start_reg = 3; /* must match kernel */
-   // wm kernel use urb from 3, see wm_program in compiler module
-   wm_state->thread3.urb_entry_read_length = 1;  /* one per pair of attrib */
    wm_state->thread3.const_urb_entry_read_length = 0;
    wm_state->thread3.const_urb_entry_read_offset = 0;
+   wm_state->thread3.urb_entry_read_length = 1;  /* one per pair of attrib */
    wm_state->thread3.urb_entry_read_offset = 0;
+   // wm kernel use urb from 3, see wm_program in compiler module
+   wm_state->thread3.dispatch_grf_start_reg = 3; /* must match kernel */
 
-   wm_state->wm4.stats_enable = 1;
-   wm_state->wm4.sampler_state_pointer = (state_base_offset + src_sampler_offset) >> 5;
+   wm_state->wm4.stats_enable = 1;  /* statistic */
+   wm_state->wm4.sampler_state_pointer = (state_base_offset + src_sampler_offset) >> 5; 
    wm_state->wm4.sampler_count = 1; /* 1-4 samplers used */
    wm_state->wm5.max_threads = PS_MAX_THREADS - 1;
    wm_state->wm5.thread_dispatch_enable = 1;
-   //just use 16-pixel dispatch, don't need to change kernel start point
+   //just use 16-pixel dispatch (4 subspans), don't need to change kernel start point
    wm_state->wm5.enable_16_pix = 1;
    wm_state->wm5.enable_8_pix = 0;
    wm_state->wm5.early_depth_test = 1;
diff-tree 926d7fb09aaaabf050949ce7c6127c68441c8801 (from ed73bbaf5c2e9d555c884037a249cf03e7f60fa0)
Author: Wang Zhenyu <zhenyu.z.wang at intel.com>
Date:   Wed Sep 27 13:54:14 2006 +0800

    [PATCH] change some src sampler states
    
    sampler for mask should also be set up, and fix
    default border texel.
    
    Signed-off-by: Keith Packard <keithp at neko.keithp.com>

diff --git a/src/i965_exa_render.c b/src/i965_exa_render.c
index 24e0ba2..942f0eb 100644
--- a/src/i965_exa_render.c
+++ b/src/i965_exa_render.c
@@ -546,8 +546,8 @@ ErrorF("i965 prepareComposite\n");
 #define URB_CS_ENTRY_SIZE     0
 #define URB_CS_ENTRIES	      0
    
-#define URB_VS_ENTRY_SIZE     1	  // XXX: VUE row num? double check, 1 row is enough
-#define URB_VS_ENTRIES	      8
+#define URB_VS_ENTRY_SIZE     1	  // each 512-bit row
+#define URB_VS_ENTRIES	      8	  // we needs at least 8 entries
    
 #define URB_GS_ENTRY_SIZE     0
 #define URB_GS_ENTRIES	      0
@@ -630,6 +630,7 @@ ErrorF("i965 prepareComposite\n");
    dest_surf_state->ss2.mip_count = 0;
    dest_surf_state->ss2.render_target_rotation = 0;
    dest_surf_state->ss3.pitch = dst_pitch - 1; 
+   // tiled surface?
 
    /* Set up the source surface state buffer */
    memset(src_surf_state, 0, sizeof(*src_surf_state));
@@ -695,19 +696,33 @@ ErrorF("i965 prepareComposite\n");
 
    /* PS kernel use this sampler */
    memset(src_sampler_state, 0, sizeof(*src_sampler_state));
-   src_sampler_state->ss0.min_filter = BRW_MAPFILTER_LINEAR;
-   src_sampler_state->ss0.mag_filter = BRW_MAPFILTER_LINEAR;
+   src_sampler_state->ss0.lod_peclamp = 1; /* GL mode */
+   switch(pSrcPicture->filter) {
+   case PictFilterNearest:
+   	src_sampler_state->ss0.min_filter = BRW_MAPFILTER_NEAREST; 
+   	src_sampler_state->ss0.mag_filter = BRW_MAPFILTER_NEAREST;
+	break;
+   case PictFilterBilinear:
+   	src_sampler_state->ss0.min_filter = BRW_MAPFILTER_LINEAR; 
+   	src_sampler_state->ss0.mag_filter = BRW_MAPFILTER_LINEAR;
+	break;
+   default:
+	I830FALLBACK("Bad filter 0x%x\n", pSrcPicture->filter);
+   }
 
-   /* XXX: fix for repeat */
    if (!pSrcPicture->repeat) {
-   	src_sampler_state->ss1.r_wrap_mode = BRW_TEXCOORDMODE_CLAMP; // XXX: clamp_border and set border to 0?
+	/* XXX: clamp_border and set border to 0 */
+   	src_sampler_state->ss1.r_wrap_mode = BRW_TEXCOORDMODE_CLAMP; 
    	src_sampler_state->ss1.s_wrap_mode = BRW_TEXCOORDMODE_CLAMP;
    	src_sampler_state->ss1.t_wrap_mode = BRW_TEXCOORDMODE_CLAMP;
    } else {
-   	src_sampler_state->ss1.r_wrap_mode = BRW_TEXCOORDMODE_WRAP; // XXX: clamp_border and set border to 0?
+   	src_sampler_state->ss1.r_wrap_mode = BRW_TEXCOORDMODE_WRAP; 
    	src_sampler_state->ss1.s_wrap_mode = BRW_TEXCOORDMODE_WRAP;
    	src_sampler_state->ss1.t_wrap_mode = BRW_TEXCOORDMODE_WRAP;
    }
+   /* XXX: ss2 has border color pointer, which should be in general state address,
+    	   and just a single texel tex map, with R32G32B32A32_FLOAT */
+   src_sampler_state->ss3.chroma_key_enable = 0; /* disable chromakey */
 
    /* Set up the vertex shader to be disabled (passthrough) */
    memset(vs_state, 0, sizeof(*vs_state));
diff-tree ed73bbaf5c2e9d555c884037a249cf03e7f60fa0 (from 5a793b0dcf2d5de408b55073858fcfba6d99f994)
Author: Wang Zhenyu <zhenyu.z.wang at intel.com>
Date:   Mon Sep 25 14:35:51 2006 +0800

    [PATCH] Add file for i965 exa composite
    
    This does not include ps program, which will be added
    in g4a form.
    
    Signed-off-by: Keith Packard <keithp at neko.keithp.com>

diff --git a/src/Makefile.am b/src/Makefile.am
index 5309eea..8285406 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -93,7 +93,8 @@ i810_drv_la_SOURCES = \
 	 i830_exa.c \
 	 i830_xaa.c \
 	 i830_exa_render.c \
-	 i915_exa_render.c
+	 i915_exa_render.c \
+	 i965_exa_render.c
 
 if HAVE_GEN4ASM
 sf_prog.h: packed_yuv_sf.g4a
diff --git a/src/i830_exa.c b/src/i830_exa.c
index 9356c79..c5b91b0 100644
--- a/src/i830_exa.c
+++ b/src/i830_exa.c
@@ -121,6 +121,11 @@ extern Bool I915EXACheckComposite(int, P
 extern Bool I915EXAPrepareComposite(int, PicturePtr, PicturePtr, PicturePtr, 
 				PixmapPtr, PixmapPtr, PixmapPtr);
 
+extern Bool I965EXACheckComposite(int, PicturePtr, PicturePtr, PicturePtr);
+extern Bool I965EXAPrepareComposite(int, PicturePtr, PicturePtr, PicturePtr, 
+				PixmapPtr, PixmapPtr, PixmapPtr);
+extern void I965EXAComposite(PixmapPtr pDst, int srcX, int srcY, int maskX, 
+			int maskY, int dstX, int dstY, int width, int height);
 /**
  * I830EXASync - wait for a command to finish
  * @pScreen: current screen
@@ -419,6 +424,8 @@ IntelEXADoneComposite(PixmapPtr pDst)
     I830Sync(pScrn);
 #endif
 }
+
+#define BRW_LINEAR_EXTRA (32*1024)
 /*
  * TODO:
  *   - Dual head?
@@ -441,7 +448,11 @@ I830EXAInit(ScreenPtr pScreen)
     pI830->EXADriverPtr->exa_minor = 0;
     pI830->EXADriverPtr->memoryBase = pI830->FbBase;
     pI830->EXADriverPtr->offScreenBase = pI830->Offscreen.Start;
-    pI830->EXADriverPtr->memorySize = pI830->Offscreen.End;
+    if (IS_I965G(pI830))
+    	pI830->EXADriverPtr->memorySize = pI830->Offscreen.End -
+					BRW_LINEAR_EXTRA; /* BRW needs state buffer*/
+    else
+    	pI830->EXADriverPtr->memorySize = pI830->Offscreen.End;
 	   
     DPRINTF(PFX, "EXA Mem: memoryBase 0x%x, end 0x%x, offscreen base 0x%x, memorySize 0x%x\n",
 		pI830->EXADriverPtr->memoryBase,
@@ -492,6 +503,11 @@ I830EXAInit(ScreenPtr pScreen)
     	pI830->EXADriverPtr->PrepareComposite = I830EXAPrepareComposite;
     	pI830->EXADriverPtr->Composite = IntelEXAComposite;
     	pI830->EXADriverPtr->DoneComposite = IntelEXADoneComposite;
+    } else if (IS_I965G(pI830)) {
+ 	pI830->EXADriverPtr->CheckComposite = I965EXACheckComposite;
+ 	pI830->EXADriverPtr->PrepareComposite = I965EXAPrepareComposite;
+ 	pI830->EXADriverPtr->Composite = I965EXAComposite;
+ 	pI830->EXADriverPtr->DoneComposite = IntelEXADoneComposite;
     }
 
     if(!exaDriverInit(pScreen, pI830->EXADriverPtr)) {
diff --git a/src/i965_exa_render.c b/src/i965_exa_render.c
new file mode 100644
index 0000000..24e0ba2
--- /dev/null
+++ b/src/i965_exa_render.c
@@ -0,0 +1,1124 @@
+/*
+ * Copyright Â© 2006 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Wang Zhenyu <zhenyu.z.wang at intel.com>
+ *    Eric Anholt <eric at anholt.net>
+ *
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "xf86.h"
+#include "i830.h"
+#include "i915_reg.h"
+
+/* bring in brw structs */
+#include "brw_defines.h"
+#include "brw_structs.h"
+
+#ifdef I830DEBUG
+#define DEBUG_I830FALLBACK 1
+#endif
+
+#ifdef DEBUG_I830FALLBACK
+#define I830FALLBACK(s, arg...)				\
+do {							\
+	DPRINTF(PFX, "EXA fallback: " s "\n", ##arg); 	\
+	return FALSE;					\
+} while(0)
+#else
+#define I830FALLBACK(s, arg...) 			\
+do { 							\
+	return FALSE;					\
+} while(0) 
+#endif
+
+extern Bool
+I965EXACheckComposite(int op, PicturePtr pSrcPicture, PicturePtr pMaskPicture,
+		      PicturePtr pDstPicture);
+
+extern Bool
+I965EXAPrepareComposite(int op, PicturePtr pSrcPicture,
+			PicturePtr pMaskPicture, PicturePtr pDstPicture,
+			PixmapPtr pSrc, PixmapPtr pMask, PixmapPtr pDst);
+
+extern void
+I965EXAComposite(PixmapPtr pDst, int srcX, int srcY, int maskX, int maskY,
+		int dstX, int dstY, int width, int height);
+
+extern float scale_units[2][2];
+extern Bool is_transform[2];
+extern PictTransform *transform[2];
+
+struct blendinfo {
+    Bool dst_alpha;
+    Bool src_alpha;
+    CARD32 src_blend;
+    CARD32 dst_blend;
+};
+
+struct formatinfo {
+    int fmt;
+    CARD32 card_fmt;
+};
+
+// refer vol2, 3d rasterization 3.8.1
+
+/* XXX: bad!bad! broadwater has different blend factor definition */
+/* defined in brw_defines.h */
+static struct blendinfo I965BlendOp[] = { 
+    /* Clear */
+    {0, 0, BRW_BLENDFACT_ZERO,          BRW_BLENDFACT_ZERO},
+    /* Src */
+    {0, 0, BRW_BLENDFACT_ONE,           BRW_BLENDFACT_ZERO},
+    /* Dst */
+    {0, 0, BRW_BLENDFACT_ZERO,          BRW_BLENDFACT_ONE},
+    /* Over */
+    {0, 1, BRW_BLENDFACT_ONE,           BRW_BLENDFACT_INV_SRC_ALPHA},
+    /* OverReverse */
+    {1, 0, BRW_BLENDFACT_INV_DST_ALPHA, BRW_BLENDFACT_ONE},
+    /* In */
+    {1, 0, BRW_BLENDFACT_DST_ALPHA,     BRW_BLENDFACT_ZERO},
+    /* InReverse */
+    {0, 1, BRW_BLENDFACT_ZERO,          BRW_BLENDFACT_SRC_ALPHA},
+    /* Out */
+    {1, 0, BRW_BLENDFACT_INV_DST_ALPHA, BRW_BLENDFACT_ZERO},
+    /* OutReverse */
+    {0, 1, BRW_BLENDFACT_ZERO,          BRW_BLENDFACT_INV_SRC_ALPHA},
+    /* Atop */
+    {1, 1, BRW_BLENDFACT_DST_ALPHA,     BRW_BLENDFACT_INV_SRC_ALPHA},
+    /* AtopReverse */
+    {1, 1, BRW_BLENDFACT_INV_DST_ALPHA, BRW_BLENDFACT_SRC_ALPHA},
+    /* Xor */
+    {1, 1, BRW_BLENDFACT_INV_DST_ALPHA, BRW_BLENDFACT_INV_SRC_ALPHA},
+    /* Add */
+    {0, 0, BRW_BLENDFACT_ONE,           BRW_BLENDFACT_ONE},
+};
+
+/* FIXME: surface format defined in brw_defines.h, shared Sampling engine 1.7.2*/
+static struct formatinfo I965TexFormats[] = {
+        {PICT_a8r8g8b8, BRW_SURFACEFORMAT_R8G8B8A8_UNORM },
+        {PICT_x8r8g8b8, BRW_SURFACEFORMAT_R8G8B8X8_UNORM },
+        {PICT_a8b8g8r8, BRW_SURFACEFORMAT_B8G8R8A8_UNORM },
+        {PICT_x8b8g8r8, BRW_SURFACEFORMAT_B8G8R8X8_UNORM },
+        {PICT_r5g6b5,   BRW_SURFACEFORMAT_B5G6R5_UNORM   },
+        {PICT_a1r5g5b5, BRW_SURFACEFORMAT_B5G6R5A1_UNORM },
+        {PICT_x1r5g5b5, BRW_SURFACEFORMAT_B5G6R5X1_UNORM },
+        {PICT_a8,       BRW_SURFACEFORMAT_A8_UNORM	 },
+};
+
+static void I965GetBlendCntl(int op, PicturePtr pMask, CARD32 dst_format, 
+			     CARD32 *sblend, CARD32 *dblend)
+{
+
+    *sblend = I965BlendOp[op].src_blend;
+    *dblend = I965BlendOp[op].dst_blend;
+
+    /* If there's no dst alpha channel, adjust the blend op so that we'll treat
+     * it as always 1.
+     */
+    if (PICT_FORMAT_A(dst_format) == 0 && I965BlendOp[op].dst_alpha) {
+        if (*sblend == BRW_BLENDFACT_DST_ALPHA)
+            *sblend = BRW_BLENDFACT_ONE;
+        else if (*sblend == BRW_BLENDFACT_INV_DST_ALPHA)
+            *sblend = BRW_BLENDFACT_ZERO;
+    }
+
+    /* If the source alpha is being used, then we should only be in a case where
+     * the source blend factor is 0, and the source blend value is the mask
+     * channels multiplied by the source picture's alpha.
+     */
+    if (pMask && pMask->componentAlpha && I965BlendOp[op].src_alpha) {
+        if (*dblend == BRW_BLENDFACT_SRC_ALPHA) {
+	    *dblend = BRW_BLENDFACT_SRC_COLR;
+        } else if (*dblend == BRW_BLENDFACT_INV_SRC_ALPHA) {
+	    *dblend = BRW_BLENDFACT_INV_SRC_COLR;
+        }
+    }
+
+}
+
+
+/* FIXME */
+static Bool I965GetDestFormat(PicturePtr pDstPicture, CARD32 *dst_format)
+{
+    switch (pDstPicture->format) {
+    case PICT_a8r8g8b8:
+    case PICT_x8r8g8b8:
+        *dst_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
+        break;
+    case PICT_r5g6b5:
+        *dst_format = BRW_SURFACEFORMAT_B5G6R5_UNORM;
+        break;
+    case PICT_a1r5g5b5:
+    	*dst_format = BRW_SURFACEFORMAT_B5G6R5A1_UNORM;
+	break;
+    case PICT_x1r5g5b5:
+        *dst_format = BRW_SURFACEFORMAT_B5G6R5X1_UNORM;
+        break;
+    /* COLR_BUF_8BIT is special for YUV surfaces.  While we may end up being
+     * able to use it depending on how the hardware implements it, disable it
+     * for now while we don't know what exactly it does (what channel does it
+     * read from?
+     */
+    /*
+    case PICT_a8:
+        *dst_format = COLR_BUF_8BIT;
+        break;
+    */
+    case PICT_a4r4g4b4:
+    case PICT_x4r4g4b4:
+	*dst_format = BRW_SURFACEFORMAT_B4G4R4A4_UNORM; 
+	break;
+    default:
+        I830FALLBACK("Unsupported dest format 0x%x\n",
+                        (int)pDstPicture->format);
+    }
+
+    return TRUE;
+}
+
+static Bool I965CheckCompositeTexture(PicturePtr pPict, int unit)
+{
+    int w = pPict->pDrawable->width;
+    int h = pPict->pDrawable->height;
+    int i;
+                                                                                                                                                            
+    if ((w > 0x7ff) || (h > 0x7ff))
+        I830FALLBACK("Picture w/h too large (%dx%d)\n", w, h);
+
+    for (i = 0; i < sizeof(I965TexFormats) / sizeof(I965TexFormats[0]); i++)
+    {
+        if (I965TexFormats[i].fmt == pPict->format)
+            break;
+    }
+    if (i == sizeof(I965TexFormats) / sizeof(I965TexFormats[0]))
+        I830FALLBACK("Unsupported picture format 0x%x\n",
+                         (int)pPict->format);
+
+    /* XXX: fallback when repeat? */
+    if (pPict->repeat && pPict->repeatType != RepeatNormal)
+	I830FALLBACK("extended repeat (%d) not supported\n",
+		     pPict->repeatType);
+
+    if (pPict->filter != PictFilterNearest &&
+        pPict->filter != PictFilterBilinear)
+        I830FALLBACK("Unsupported filter 0x%x\n", pPict->filter);
+
+    return TRUE;
+}
+
+Bool
+I965EXACheckComposite(int op, PicturePtr pSrcPicture, PicturePtr pMaskPicture,
+		      PicturePtr pDstPicture)
+{
+	/* check op*/
+	/* check op with mask's componentAlpha*/
+	/* check textures */
+	/* check dst buffer format */
+    CARD32 tmp1;
+    
+    /* Check for unsupported compositing operations. */
+    if (op >= sizeof(I965BlendOp) / sizeof(I965BlendOp[0]))
+        I830FALLBACK("Unsupported Composite op 0x%x\n", op);
+                                                                                                                                                            
+    if (pMaskPicture != NULL && pMaskPicture->componentAlpha) {
+        /* Check if it's component alpha that relies on a source alpha and on
+         * the source value.  We can only get one of those into the single
+         * source value that we get to blend with.
+         */
+        if (I965BlendOp[op].src_alpha &&
+            (I965BlendOp[op].src_blend != BRW_BLENDFACT_ZERO))
+            	I830FALLBACK("Component alpha not supported with source "
+                            "alpha and source value blending.\n");
+	/* XXX: fallback now for mask with componentAlpha */
+	I830FALLBACK("mask componentAlpha not ready.\n");
+    }
+
+    if (!I965CheckCompositeTexture(pSrcPicture, 0))
+        I830FALLBACK("Check Src picture texture\n");
+    if (pMaskPicture != NULL && !I965CheckCompositeTexture(pMaskPicture, 1))
+        I830FALLBACK("Check Mask picture texture\n");
+
+    if (!I965GetDestFormat(pDstPicture, &tmp1)) 
+	I830FALLBACK("Get Color buffer format\n");
+
+    return TRUE;
+
+}
+
+#define ALIGN(i,m)    (((i) + (m) - 1) & ~((m) - 1))
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+
+int urb_vs_start, urb_vs_size;
+int urb_gs_start, urb_gs_size;
+int urb_clip_start, urb_clip_size;
+int urb_sf_start, urb_sf_size;
+int urb_cs_start, urb_cs_size;
+
+struct brw_surface_state *dest_surf_state;
+struct brw_surface_state *src_surf_state;
+struct brw_surface_state *mask_surf_state;
+struct brw_sampler_state *src_sampler_state;
+struct brw_sampler_state *mask_sampler_state;  // could just use one sampler?
+
+struct brw_vs_unit_state *vs_state;
+struct brw_sf_unit_state *sf_state;
+struct brw_wm_unit_state *wm_state;
+struct brw_cc_unit_state *cc_state;
+struct brw_cc_viewport *cc_viewport;
+
+struct brw_instruction *sf_kernel;
+struct brw_instruction *ps_kernel;
+struct brw_instruction *sip_kernel;
+
+CARD32 *binding_table;
+int binding_table_entries; 
+
+int dest_surf_offset, src_surf_offset, src_sampler_offset, vs_offset;
+int sf_offset, wm_offset, cc_offset, vb_offset, cc_viewport_offset;
+int sf_kernel_offset, ps_kernel_offset, sip_kernel_offset;
+int binding_table_offset;
+int next_offset, total_state_size;
+char *state_base;
+int state_base_offset;
+float *vb;
+int vb_size = 4 * 4 ; /* 4 DWORDS per vertex, 4 vertices for TRIFAN*/ 
+
+int src_blend, dst_blend;
+
+static const CARD32 sip_kernel_static[][4] = {
+/*    wait (1) a0<1>UW a145<0,1,0>UW { align1 +  } */
+    { 0x00000030, 0x20000108, 0x00001220, 0x00000000 },
+/*    nop (4) g0<1>UD { align1 +  } */
+    { 0x0040007e, 0x20000c21, 0x00690000, 0x00000000 },
+/*    nop (4) g0<1>UD { align1 +  } */
+    { 0x0040007e, 0x20000c21, 0x00690000, 0x00000000 },
+/*    nop (4) g0<1>UD { align1 +  } */
+    { 0x0040007e, 0x20000c21, 0x00690000, 0x00000000 },
+/*    nop (4) g0<1>UD { align1 +  } */
+    { 0x0040007e, 0x20000c21, 0x00690000, 0x00000000 },
+/*    nop (4) g0<1>UD { align1 +  } */
+    { 0x0040007e, 0x20000c21, 0x00690000, 0x00000000 },
+/*    nop (4) g0<1>UD { align1 +  } */
+    { 0x0040007e, 0x20000c21, 0x00690000, 0x00000000 },
+/*    nop (4) g0<1>UD { align1 +  } */
+    { 0x0040007e, 0x20000c21, 0x00690000, 0x00000000 },
+/*    nop (4) g0<1>UD { align1 +  } */
+    { 0x0040007e, 0x20000c21, 0x00690000, 0x00000000 },
+/*    nop (4) g0<1>UD { align1 +  } */
+    { 0x0040007e, 0x20000c21, 0x00690000, 0x00000000 },
+};
+
+/*
+ * this program computes dA/dx and dA/dy for the texture coordinates along
+ * with the base texture coordinate. It was extracted from the Mesa driver
+ */
+
+#define SF_KERNEL_NUM_GRF  10
+#define SF_KERNEL_NUM_URB  8
+#define SF_MAX_THREADS	   4
+
+static const CARD32 sf_kernel_static[][4] = {
+/*    send   0 (1) g6<1>F g1.12<0,1,0>F math mlen 1 rlen 1 { align1 +  } */
+   { 0x00000031, 0x20c01fbd, 0x0000002c, 0x01110081 },
+/*    send   0 (1) g6.4<1>F g1.20<0,1,0>F math mlen 1 rlen 1 { align1 +  } */
+   { 0x00000031, 0x20c41fbd, 0x00000034, 0x01110081 },
+/*    add (8) g7<1>F g4<8,8,1>F g3<8,8,1>F { align1 +  } */
+   { 0x00600040, 0x20e077bd, 0x008d0080, 0x008d4060 },
+/*    mul (1) g7<1>F g7<0,1,0>F g6<0,1,0>F { align1 +  } */
+   { 0x00000041, 0x20e077bd, 0x000000e0, 0x000000c0 },
+/*    mul (1) g7.4<1>F g7.4<0,1,0>F g6.4<0,1,0>F { align1 +  } */
+   { 0x00000041, 0x20e477bd, 0x000000e4, 0x000000c4 },
+/*    mov (8) m1<1>F g7<0,1,0>F { align1 +  } */
+   { 0x00600001, 0x202003be, 0x000000e0, 0x00000000 },
+/*    mov (8) m2<1>F g7.4<0,1,0>F { align1 +  } */
+   { 0x00600001, 0x204003be, 0x000000e4, 0x00000000 },
+/*    mov (8) m3<1>F g3<8,8,1>F { align1 +  } */
+   { 0x00600001, 0x206003be, 0x008d0060, 0x00000000 },
+/*    send   0 (8) a0<1>F g0<8,8,1>F urb mlen 4 rlen 0 write +0 transpose used complete EOT{ align1 +  } */
+   { 0x00600031, 0x20001fbc, 0x008d0000, 0x8640c800 },
+/*    nop (4) g0<1>UD { align1 +  } */
+   { 0x0040007e, 0x20000c21, 0x00690000, 0x00000000 },
+/*    nop (4) g0<1>UD { align1 +  } */
+   { 0x0040007e, 0x20000c21, 0x00690000, 0x00000000 },
+/*    nop (4) g0<1>UD { align1 +  } */
+   { 0x0040007e, 0x20000c21, 0x00690000, 0x00000000 },
+/*    nop (4) g0<1>UD { align1 +  } */
+   { 0x0040007e, 0x20000c21, 0x00690000, 0x00000000 },
+/*    nop (4) g0<1>UD { align1 +  } */
+   { 0x0040007e, 0x20000c21, 0x00690000, 0x00000000 },
+/*    nop (4) g0<1>UD { align1 +  } */
+   { 0x0040007e, 0x20000c21, 0x00690000, 0x00000000 },
+/*    nop (4) g0<1>UD { align1 +  } */
+   { 0x0040007e, 0x20000c21, 0x00690000, 0x00000000 },
+/*    nop (4) g0<1>UD { align1 +  } */
+   { 0x0040007e, 0x20000c21, 0x00690000, 0x00000000 },
+};
+
+/* ps kernels */
+/* 1: no mask */
+static const CARD32 ps_kernel_static_nomask [][4] = {
+	#include "i965_composite_ps_nomask.h"
+};
+
+/* 2: mask with componentAlpha, src * mask color, XXX: later */
+static const CARD32 ps_kernel_static_maskca [][4] = {
+	#include "i965_composite_ps_maskca.h"
+};
+
+/* 3: mask without componentAlpha, src * mask alpha */
+static const CARD32 ps_kernel_static_masknoca [][4] = {
+	#include "i965_composite_ps_masknoca.h"
+};
+
+Bool
+I965EXAPrepareComposite(int op, PicturePtr pSrcPicture,
+			PicturePtr pMaskPicture, PicturePtr pDstPicture,
+			PixmapPtr pSrc, PixmapPtr pMask, PixmapPtr pDst)
+{
+    ScrnInfoPtr pScrn = xf86Screens[pSrcPicture->pDrawable->pScreen->myNum];
+    I830Ptr pI830 = I830PTR(pScrn);
+    CARD32 src_offset, src_pitch;
+    CARD32 mask_offset, mask_pitch;
+    CARD32 dst_format, dst_offset, dst_pitch;
+    CARD32 blendctl;
+ 
+ErrorF("i965 prepareComposite\n");
+
+//    i965_3d_pipeline_setup(pScrn);
+//    i965_surf_setup(pScrn, pSrcPicture, pMaskPicture, pDstPicture,
+//   			pSrc, pMask, pDst);
+    // then setup blend, and shader program 
+
+    I965GetDestFormat(pDstPicture, &dst_format);
+    src_offset = exaGetPixmapOffset(pSrc);
+    src_pitch = exaGetPixmapPitch(pSrc);
+    dst_offset = exaGetPixmapOffset(pDst);
+    dst_pitch = exaGetPixmapPitch(pDst);
+    if (pMask) {
+	mask_offset = exaGetPixmapOffset(pMask);
+	mask_pitch = exaGetPixmapPitch(pMask);
+    }
+    scale_units[0][0] = pSrc->drawable.width;
+    scale_units[0][1] = pSrc->drawable.height;
+    scale_units[2][0] = pDst->drawable.width;
+    scale_units[2][1] = pDst->drawable.height;
+
+    if (!pMask) {
+	is_transform[1] = FALSE;
+	scale_units[1][0] = -1;
+	scale_units[1][1] = -1;
+    } else {
+	scale_units[1][0] = pMask->drawable.width;
+	scale_units[1][1] = pMask->drawable.height;
+    }
+
+/* FIXME */
+	/* setup 3d pipeline state */
+
+   binding_table_entries = 2; /* default no mask */
+
+   /* Set up our layout of state in framebuffer.  First the general state: */
+   next_offset = 0;
+   vs_offset = ALIGN(next_offset, 64);
+   next_offset = vs_offset + sizeof(*vs_state);
+    
+   sf_offset = ALIGN(next_offset, 32);
+   next_offset = sf_offset + sizeof(*sf_state);
+    
+   wm_offset = ALIGN(next_offset, 32);
+   next_offset = wm_offset + sizeof(*wm_state);
+    
+   cc_offset = ALIGN(next_offset, 32);
+   next_offset = cc_offset + sizeof(*cc_state);
+
+// fixup sf_kernel_static, is sf_kernel needed? or not? why? 
+//	-> just keep current sf_kernel, which will send one setup urb entry to
+//	PS kernel
+   sf_kernel_offset = ALIGN(next_offset, 64);
+   next_offset = sf_kernel_offset + sizeof (sf_kernel_static);
+
+   //XXX: ps_kernel may be seperated, fix with offset
+   ps_kernel_offset = ALIGN(next_offset, 64);
+   if (pMask) {
+	if (pMaskPicture->componentAlpha)
+	    next_offset = ps_kernel_offset + sizeof(ps_kernel_static_maskca);
+	else 
+	    next_offset = ps_kernel_offset + sizeof(ps_kernel_static_masknoca);
+   } else 
+   	next_offset = ps_kernel_offset + sizeof (ps_kernel_static_nomask);
+    
+   sip_kernel_offset = ALIGN(next_offset, 64);
+   next_offset = sip_kernel_offset + sizeof (sip_kernel_static);
+   
+   // needed?
+   cc_viewport_offset = ALIGN(next_offset, 32);
+   next_offset = cc_viewport_offset + sizeof(*cc_viewport);
+
+   // : fix for texture sampler
+   // XXX: -> use only one sampler
+   src_sampler_offset = ALIGN(next_offset, 32);
+   next_offset = src_sampler_offset + sizeof(*src_sampler_state);
+
+   /* Align VB to native size of elements, for safety */
+   vb_offset = ALIGN(next_offset, 8);
+   next_offset = vb_offset + vb_size;
+
+   /* And then the general state: */
+   //XXX: fix for texture map and target surface
+   dest_surf_offset = ALIGN(next_offset, 32);
+   next_offset = dest_surf_offset + sizeof(*dest_surf_state);
+
+   src_surf_offset = ALIGN(next_offset, 32);
+   next_offset = src_surf_offset + sizeof(*src_surf_state);
+
+   if (pMask) {
+   	mask_surf_offset = ALIGN(next_offset, 32);
+   	next_offset = mask_surf_offset + sizeof(*mask_surf_state);
+	binding_table_entries = 3;
+   }
+
+   binding_table_offset = ALIGN(next_offset, 32);
+   next_offset = binding_table_offset + (binding_table_entries * 4);
+
+   total_state_size = next_offset;
+
+   /*
+    * XXX: Use the extra space allocated at the end of the exa offscreen buffer?
+    */
+#define BRW_LINEAR_EXTRA	(32*1024)
+
+   state_base_offset = (pI830->Offscreen.End -
+			BRW_LINEAR_EXTRA);
+   
+   state_base_offset = ALIGN(state_base_offset, 64);
+   state_base = (char *)(pI830->FbBase + state_base_offset);
+   /* Set up our pointers to state structures in framebuffer.  It would probably
+    * be a good idea to fill these structures out in system memory and then dump
+    * them there, instead.
+    */
+   vs_state = (void *)(state_base + vs_offset);
+   sf_state = (void *)(state_base + sf_offset);
+   wm_state = (void *)(state_base + wm_offset);
+   cc_state = (void *)(state_base + cc_offset);
+   sf_kernel = (void *)(state_base + sf_kernel_offset);
+   ps_kernel = (void *)(state_base + ps_kernel_offset);
+   sip_kernel = (void *)(state_base + sip_kernel_offset);
+   
+   cc_viewport = (void *)(state_base + cc_viewport_offset);
+   
+   dest_surf_state = (void *)(state_base + dest_surf_offset);
+   src_surf_state = (void *)(state_base + src_surf_offset);
+   if (pMask)
+	mask_surf_state = (void *)(state_base + mask_surf_offset);
+
+   src_sampler_state = (void *)(state_base + src_sampler_offset);
+   binding_table = (void *)(state_base + binding_table_offset);
+
+   vb = (void *)(state_base + vb_offset);
+
+   /* Set up a default static partitioning of the URB, which is supposed to
+    * allow anything we would want to do, at potentially lower performance.
+    */
+#define URB_CS_ENTRY_SIZE     0
+#define URB_CS_ENTRIES	      0
+   
+#define URB_VS_ENTRY_SIZE     1	  // XXX: VUE row num? double check, 1 row is enough
+#define URB_VS_ENTRIES	      8
+   
+#define URB_GS_ENTRY_SIZE     0
+#define URB_GS_ENTRIES	      0
+   
+#define URB_CLIP_ENTRY_SIZE   0
+#define URB_CLIP_ENTRIES      0
+   
+#define URB_SF_ENTRY_SIZE     4
+#define URB_SF_ENTRIES	      8
+
+   urb_vs_start = 0;
+   urb_vs_size = URB_VS_ENTRIES * URB_VS_ENTRY_SIZE;
+   urb_gs_start = urb_vs_start + urb_vs_size;
+   urb_gs_size = URB_GS_ENTRIES * URB_GS_ENTRY_SIZE;
+   urb_clip_start = urb_gs_start + urb_gs_size;
+   urb_clip_size = URB_CLIP_ENTRIES * URB_CLIP_ENTRY_SIZE;
+   urb_sf_start = urb_clip_start + urb_clip_size;
+   urb_sf_size = URB_SF_ENTRIES * URB_SF_ENTRY_SIZE;
+   urb_cs_start = urb_sf_start + urb_sf_size;
+   urb_cs_size = URB_CS_ENTRIES * URB_CS_ENTRY_SIZE;
+
+   /* We'll be poking the state buffers that could be in use by the 3d hardware
+    * here, but we should have synced the 3D engine already in I830PutImage.
+    */
+
+// needed?
+   memset (cc_viewport, 0, sizeof (*cc_viewport));
+   cc_viewport->min_depth = -1.e35;
+   cc_viewport->max_depth = 1.e35;
+
+   /* Color calculator state */
+   memset(cc_state, 0, sizeof(*cc_state));
+   cc_state->cc0.stencil_enable = 0;   /* disable stencil */
+   cc_state->cc2.depth_test = 0;       /* disable depth test */
+   cc_state->cc2.logicop_enable = 0;   /* disable logic op */
+   cc_state->cc3.ia_blend_enable = 0;  /* blend alpha just like colors */
+   cc_state->cc3.blend_enable = 1;     /* enable color blend */
+   cc_state->cc3.alpha_test = 0;       /* disable alpha test */
+   // XXX:cc_viewport needed? 
+   cc_state->cc4.cc_viewport_state_offset = (state_base_offset + cc_viewport_offset) >> 5;
+   cc_state->cc5.dither_enable = 0;    /* disable dither */
+//   cc_state->cc5.logicop_func = 0xc;   /* COPY */
+//   cc_state->cc5.statistics_enable = 1;
+//   cc_state->cc5.ia_blend_function = BRW_BLENDFUNCTION_ADD;
+//   cc_state->cc5.ia_src_blend_factor = BRW_BLENDFACTOR_ONE;
+//   cc_state->cc5.ia_dest_blend_factor = BRW_BLENDFACTOR_ONE;
+   cc_state->cc6.blend_function = BRW_BLENDFUNCTION_ADD;
+   I965GetBlendCntl(op, pMask, pDstPicture->format, 
+		    &src_blend, &dst_blend);
+   cc_state->cc6.src_blend_factor = src_blend;
+   cc_state->cc6.dest_blend_factor = dst_blend;
+
+   /* Upload system kernel */
+   memcpy (sip_kernel, sip_kernel_static, sizeof (sip_kernel_static));
+   
+   /* Set up the state buffer for the destination surface */
+   memset(dest_surf_state, 0, sizeof(*dest_surf_state));
+   dest_surf_state->ss0.surface_type = BRW_SURFACE_2D;
+   dest_surf_state->ss0.data_return_format = BRW_SURFACERETURNFORMAT_FLOAT32;
+   // XXX: should compare with picture's cpp?...8 bit surf?
+   if (pDst->drawable.bitsPerPixel == 16) {
+      dest_surf_state->ss0.surface_format = BRW_SURFACEFORMAT_B5G6R5_UNORM;
+   } else {
+      dest_surf_state->ss0.surface_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
+   }
+   dest_surf_state->ss0.writedisable_alpha = 0;
+   dest_surf_state->ss0.writedisable_red = 0;
+   dest_surf_state->ss0.writedisable_green = 0;
+   dest_surf_state->ss0.writedisable_blue = 0;
+   dest_surf_state->ss0.color_blend = 1;
+   dest_surf_state->ss0.vert_line_stride = 0;
+   dest_surf_state->ss0.vert_line_stride_ofs = 0;
+   dest_surf_state->ss0.mipmap_layout_mode = 0;
+   dest_surf_state->ss0.render_cache_read_mode = 0;
+   
+   // XXX: fix to picture address & size
+   dest_surf_state->ss1.base_addr = dst_offset;
+   dest_surf_state->ss2.height = pDst->drawable.height - 1;
+   dest_surf_state->ss2.width = pDst->drawable.width - 1;
+   dest_surf_state->ss2.mip_count = 0;
+   dest_surf_state->ss2.render_target_rotation = 0;
+   dest_surf_state->ss3.pitch = dst_pitch - 1; 
+
+   /* Set up the source surface state buffer */
+   memset(src_surf_state, 0, sizeof(*src_surf_state));
+   src_surf_state->ss0.surface_type = BRW_SURFACE_2D;
+   if (pSrc->drawable.bitsPerPixel == 8)
+      src_surf_state->ss0.surface_format = BRW_SURFACEFORMAT_A8_UNORM; //XXX? 
+   else if (pSrc->drawable.bitsPerPixel == 16)
+      src_surf_state->ss0.surface_format = BRW_SURFACEFORMAT_B5G6R5_UNORM;
+   else 
+      src_surf_state->ss0.surface_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
+
+   src_surf_state->ss0.writedisable_alpha = 0;
+   src_surf_state->ss0.writedisable_red = 0;
+   src_surf_state->ss0.writedisable_green = 0;
+   src_surf_state->ss0.writedisable_blue = 0;
+   src_surf_state->ss0.color_blend = 1;
+   src_surf_state->ss0.vert_line_stride = 0;
+   src_surf_state->ss0.vert_line_stride_ofs = 0;
+   src_surf_state->ss0.mipmap_layout_mode = 0;
+   src_surf_state->ss0.render_cache_read_mode = 0;
+   
+   src_surf_state->ss1.base_addr = src_offset;
+   src_surf_state->ss2.width = pSrc->drawable.width - 1;
+   src_surf_state->ss2.height = pSrc->drawable.height - 1;
+   src_surf_state->ss2.mip_count = 0;
+   src_surf_state->ss2.render_target_rotation = 0;
+   src_surf_state->ss3.pitch = src_pitch - 1; 
+
+   /* setup mask surface */
+   if (pMask) {
+   	memset(mask_surf_state, 0, sizeof(*mask_surf_state));
+	mask_surf_state->ss0.surface_type = BRW_SURFACE_2D;
+   	if (pMask->drawable.bitsPerPixel == 8)
+      	    mask_surf_state->ss0.surface_format = BRW_SURFACEFORMAT_A8_UNORM; //XXX? 
+   	else if (pMask->drawable.bitsPerPixel == 16)
+      	    mask_surf_state->ss0.surface_format = BRW_SURFACEFORMAT_B5G6R5_UNORM;
+   	else 
+      	    mask_surf_state->ss0.surface_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
+
+   	mask_surf_state->ss0.writedisable_alpha = 0;
+   	mask_surf_state->ss0.writedisable_red = 0;
+   	mask_surf_state->ss0.writedisable_green = 0;
+   	mask_surf_state->ss0.writedisable_blue = 0;
+   	mask_surf_state->ss0.color_blend = 1;
+   	mask_surf_state->ss0.vert_line_stride = 0;
+   	mask_surf_state->ss0.vert_line_stride_ofs = 0;
+   	mask_surf_state->ss0.mipmap_layout_mode = 0;
+   	mask_surf_state->ss0.render_cache_read_mode = 0;
+   
+   	mask_surf_state->ss1.base_addr = mask_offset;
+   	mask_surf_state->ss2.width = pMask->drawable.width - 1;
+   	mask_surf_state->ss2.height = pMask->drawable.height - 1;
+   	mask_surf_state->ss2.mip_count = 0;
+   	mask_surf_state->ss2.render_target_rotation = 0;
+   	mask_surf_state->ss3.pitch = mask_pitch - 1; 
+   }
+
+   /* Set up a binding table for our surfaces.  Only the PS will use it */
+   binding_table[0] = state_base_offset + dest_surf_offset;
+   binding_table[1] = state_base_offset + src_surf_offset;
+   if (pMask)
+   	binding_table[2] = state_base_offset + mask_surf_offset;
+
+   /* PS kernel use this sampler */
+   memset(src_sampler_state, 0, sizeof(*src_sampler_state));
+   src_sampler_state->ss0.min_filter = BRW_MAPFILTER_LINEAR;
+   src_sampler_state->ss0.mag_filter = BRW_MAPFILTER_LINEAR;
+
+   /* XXX: fix for repeat */
+   if (!pSrcPicture->repeat) {
+   	src_sampler_state->ss1.r_wrap_mode = BRW_TEXCOORDMODE_CLAMP; // XXX: clamp_border and set border to 0?
+   	src_sampler_state->ss1.s_wrap_mode = BRW_TEXCOORDMODE_CLAMP;
+   	src_sampler_state->ss1.t_wrap_mode = BRW_TEXCOORDMODE_CLAMP;
+   } else {
+   	src_sampler_state->ss1.r_wrap_mode = BRW_TEXCOORDMODE_WRAP; // XXX: clamp_border and set border to 0?
+   	src_sampler_state->ss1.s_wrap_mode = BRW_TEXCOORDMODE_WRAP;
+   	src_sampler_state->ss1.t_wrap_mode = BRW_TEXCOORDMODE_WRAP;
+   }
+
+   /* Set up the vertex shader to be disabled (passthrough) */
+   memset(vs_state, 0, sizeof(*vs_state));
+   // XXX: vs URB should be defined for VF vertex URB store. done already?
+   vs_state->vs6.vs_enable = 0;
+
+   // XXX: sf_kernel? keep it as now
+   /* Set up the SF kernel to do coord interp: for each attribute,
+    * calculate dA/dx and dA/dy.  Hand these interpolation coefficients
+    * back to SF which then hands pixels off to WM.
+    */
+   memcpy (sf_kernel, sf_kernel_static, sizeof (sf_kernel_static));
+
+   memset(sf_state, 0, sizeof(*sf_state));
+   sf_state->thread0.kernel_start_pointer = 
+	       (state_base_offset + sf_kernel_offset) >> 6;
+   sf_state->thread0.grf_reg_count = ((SF_KERNEL_NUM_GRF & ~15) / 16);
+   sf_state->sf1.single_program_flow = 1;
+   sf_state->sf1.binding_table_entry_count = 0;
+   sf_state->sf1.thread_priority = 0;
+   sf_state->sf1.floating_point_mode = 0; /* Mesa does this */
+   sf_state->sf1.illegal_op_exception_enable = 1;
+   sf_state->sf1.mask_stack_exception_enable = 1;
+   sf_state->sf1.sw_exception_enable = 1;
+   sf_state->thread2.per_thread_scratch_space = 0;
+   sf_state->thread2.scratch_space_base_pointer = 0; /* not used in our kernel */
+   sf_state->thread3.const_urb_entry_read_length = 0; /* no const URBs */
+   sf_state->thread3.const_urb_entry_read_offset = 0; /* no const URBs */
+   sf_state->thread3.urb_entry_read_length = 1; /* 1 URB per vertex */
+   sf_state->thread3.urb_entry_read_offset = 0;
+   sf_state->thread3.dispatch_grf_start_reg = 3;
+   sf_state->thread4.max_threads = SF_MAX_THREADS - 1;
+   sf_state->thread4.urb_entry_allocation_size = URB_SF_ENTRY_SIZE - 1;
+   sf_state->thread4.nr_urb_entries = URB_SF_ENTRIES;
+   sf_state->thread4.stats_enable = 1;
+   sf_state->sf5.viewport_transform = FALSE; /* skip viewport */
+   sf_state->sf6.cull_mode = BRW_CULLMODE_NONE;
+   sf_state->sf6.scissor = 0;
+   sf_state->sf7.trifan_pv = 2;
+   sf_state->sf6.dest_org_vbias = 0x8;
+   sf_state->sf6.dest_org_hbias = 0x8;
+
+   /* Set up the PS kernel (dispatched by WM) 
+    */
+    
+    // XXX: replace to texture blend shader, and different cases 
+   if (pMask) {
+	if (pMaskPicture->componentAlpha)
+   	    memcpy (ps_kernel, ps_kernel_static_maskca, sizeof (ps_kernel_static_maskca));
+	else
+   	    memcpy (ps_kernel, ps_kernel_static_masknoca, sizeof (ps_kernel_static_masknoca));
+   } else 
+   	memcpy (ps_kernel, ps_kernel_static_nomask, sizeof (ps_kernel_static_nomask));
+
+   memset (wm_state, 0, sizeof (*wm_state));
+   wm_state->thread0.kernel_start_pointer = 
+	    (state_base_offset + ps_kernel_offset) >> 6;
+   wm_state->thread0.grf_reg_count = ((PS_KERNEL_NUM_GRF & ~15) / 16);
+   wm_state->thread1.single_program_flow = 1;
+   if (!pMask)
+       wm_state->thread1.binding_table_entry_count = 2; /* tex and fb */
+   else
+       wm_state->thread1.binding_table_entry_count = 3; /* tex and fb */
+
+   wm_state->thread2.scratch_space_base_pointer = 0;
+   wm_state->thread2.per_thread_scratch_space = 0;
+   // XXX: urb allocation
+   wm_state->thread3.dispatch_grf_start_reg = 3; /* must match kernel */
+   // wm kernel use urb from 3, see wm_program in compiler module
+   wm_state->thread3.urb_entry_read_length = 1;  /* one per pair of attrib */
+   wm_state->thread3.const_urb_entry_read_length = 0;
+   wm_state->thread3.const_urb_entry_read_offset = 0;
+   wm_state->thread3.urb_entry_read_offset = 0;
+
+   wm_state->wm4.stats_enable = 1;
+   wm_state->wm4.sampler_state_pointer = (state_base_offset + src_sampler_offset) >> 5;
+   wm_state->wm4.sampler_count = 1; /* 1-4 samplers used */
+   wm_state->wm5.max_threads = PS_MAX_THREADS - 1;
+   wm_state->wm5.thread_dispatch_enable = 1;
+   //just use 16-pixel dispatch, don't need to change kernel start point
+   wm_state->wm5.enable_16_pix = 1;
+   wm_state->wm5.enable_8_pix = 0;
+   wm_state->wm5.early_depth_test = 1;
+
+   /* Begin the long sequence of commands needed to set up the 3D 
+    * rendering pipe
+    */
+   {
+   
+   BEGIN_LP_RING((pMask?48:46));
+   // MI_FLUSH prior to PIPELINE_SELECT
+   OUT_RING(MI_FLUSH | 
+	    MI_STATE_INSTRUCTION_CACHE_FLUSH |
+	    BRW_MI_GLOBAL_SNAPSHOT_RESET);
+   
+   /* Match Mesa driver setup */
+   OUT_RING(BRW_PIPELINE_SELECT | PIPELINE_SELECT_3D);
+   
+   /* Zero out the two base address registers so all offsets are absolute */
+   // XXX: zero out...
+   OUT_RING(BRW_STATE_BASE_ADDRESS | 4);
+   // why this's not state_base_offset? -> because later we'll always add on
+   // state_base_offset to offset params. see SIP
+   OUT_RING(0 | BASE_ADDRESS_MODIFY);  /* Generate state base address */
+   OUT_RING(0 | BASE_ADDRESS_MODIFY);  /* Surface state base address */
+   OUT_RING(0 | BASE_ADDRESS_MODIFY);  /* media base addr, don't care */
+   OUT_RING(0x10000000 | BASE_ADDRESS_MODIFY);  /* general state max addr, disabled */
+   OUT_RING(0x10000000 | BASE_ADDRESS_MODIFY);  /* media object state max addr, disabled */
+
+   /* Set system instruction pointer */
+   OUT_RING(BRW_STATE_SIP | 0);
+   OUT_RING(state_base_offset + sip_kernel_offset); /* system instruction pointer */
+      
+   /* Pipe control */
+   // XXX: pipe control write cache before enabling color blending
+   // vol2, geometry pipeline 1.8.4
+   OUT_RING(BRW_PIPE_CONTROL |
+	    BRW_PIPE_CONTROL_NOWRITE |
+	    BRW_PIPE_CONTROL_IS_FLUSH |
+	    2);
+   OUT_RING(0);			       /* Destination address */
+   OUT_RING(0);			       /* Immediate data low DW */
+   OUT_RING(0);			       /* Immediate data high DW */
+
+   /* Binding table pointers */
+   OUT_RING(BRW_3DSTATE_BINDING_TABLE_POINTERS | 4);
+   OUT_RING(0); /* vs */
+   OUT_RING(0); /* gs */
+   OUT_RING(0); /* clip */
+   OUT_RING(0); /* sf */
+   /* Only the PS uses the binding table */
+   OUT_RING(state_base_offset + binding_table_offset); /* ps */
+
+   //ring 20
+
+   /* The drawing rectangle clipping is always on.  Set it to values that
+    * shouldn't do any clipping.
+    */
+    //XXX: fix for picture size
+   OUT_RING(BRW_3DSTATE_DRAWING_RECTANGLE | 2);	/* XXX 3 for BLC or CTG */
+   OUT_RING(0x00000000);	/* ymin, xmin */
+   OUT_RING((pScrn->virtualX - 1) |
+	    (pScrn->virtualY - 1) << 16); /* ymax, xmax */
+   OUT_RING(0x00000000);	/* yorigin, xorigin */
+
+   /* skip the depth buffer */
+   /* skip the polygon stipple */
+   /* skip the polygon stipple offset */
+   /* skip the line stipple */
+   
+   /* Set the pointers to the 3d pipeline state */
+   OUT_RING(BRW_3DSTATE_PIPELINED_POINTERS | 5);
+   OUT_RING(state_base_offset + vs_offset);  /* 32 byte aligned */
+   OUT_RING(BRW_GS_DISABLE);		     /* disable GS, resulting in passthrough */
+   OUT_RING(BRW_CLIP_DISABLE);		     /* disable CLIP, resulting in passthrough */
+   OUT_RING(state_base_offset + sf_offset);  /* 32 byte aligned */
+   OUT_RING(state_base_offset + wm_offset);  /* 32 byte aligned */
+   OUT_RING(state_base_offset + cc_offset);  /* 64 byte aligned */
+
+   /* URB fence */
+   // XXX: CS for const URB needed? if not, cs_fence should be equal to sf_fence
+   OUT_RING(BRW_URB_FENCE |
+	    UF0_CS_REALLOC |
+	    UF0_SF_REALLOC |
+	    UF0_CLIP_REALLOC |
+	    UF0_GS_REALLOC |
+	    UF0_VS_REALLOC |
+	    1);
+   OUT_RING(((urb_clip_start + urb_clip_size) << UF1_CLIP_FENCE_SHIFT) |
+	    ((urb_gs_start + urb_gs_size) << UF1_GS_FENCE_SHIFT) |
+	    ((urb_vs_start + urb_vs_size) << UF1_VS_FENCE_SHIFT));
+   OUT_RING(((urb_cs_start + urb_cs_size) << UF2_CS_FENCE_SHIFT) |
+	    ((urb_sf_start + urb_sf_size) << UF2_SF_FENCE_SHIFT));
+
+   /* Constant buffer state */
+   // XXX: needed? seems no usage, as we don't have CONSTANT_BUFFER definition
+   OUT_RING(BRW_CS_URB_STATE | 0);
+   OUT_RING(((URB_CS_ENTRY_SIZE - 1) << 4) | /* URB Entry Allocation Size */
+	    (URB_CS_ENTRIES << 0));	     /* Number of URB Entries */
+   
+   /* Set up the pointer to our vertex buffer */
+   // XXX: double check
+  // int vb_pitch = 4 * 4;  // XXX: pitch should include mask's coords? possible
+  // all three coords on one row?
+   int nelem = pMask ? 3: 2;
+   OUT_RING(BRW_3DSTATE_VERTEX_BUFFERS | 3); //should be 4n-1 -> 3
+   OUT_RING((0 << VB0_BUFFER_INDEX_SHIFT) |
+	    VB0_VERTEXDATA |
+	    ((4 * 2 * nelem) << VB0_BUFFER_PITCH_SHIFT)); 
+   		// pitch includes all vertex data, 4bytes for 1 dword, each
+		// element has 2 coords (x,y)(s0,t0), nelem to reflect possible
+		// mask
+   OUT_RING(state_base_offset + vb_offset);
+   OUT_RING(4 * nelem); // max index, prim has 4 coords
+   OUT_RING(0); // ignore for VERTEXDATA, but still there
+
+   /* Set up our vertex elements, sourced from the single vertex buffer. */
+   OUT_RING(BRW_3DSTATE_VERTEX_ELEMENTS | ((2 * nelem) - 1));  // XXX: 2n-1, (x,y) + (s0,t0) +
+						//   possible (s1, t1)
+   /* offset 0: X,Y -> {X, Y, 1.0, 1.0} */
+   OUT_RING((0 << VE0_VERTEX_BUFFER_INDEX_SHIFT) |
+	    VE0_VALID |
+	    (BRW_SURFACEFORMAT_R32G32_FLOAT << VE0_FORMAT_SHIFT) |
+	    (0 << VE0_OFFSET_SHIFT));
+   OUT_RING((BRW_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT) |
+	    (BRW_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT) |
+	    (BRW_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT) |
+	    (BRW_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT) |
+	    (0 << VE1_DESTINATION_ELEMENT_OFFSET_SHIFT));
+   /* offset 8: S0, T0 -> {S0, T0, 1.0, 1.0} */
+   OUT_RING((0 << VE0_VERTEX_BUFFER_INDEX_SHIFT) |
+	    VE0_VALID |
+	    (BRW_SURFACEFORMAT_R32G32_FLOAT << VE0_FORMAT_SHIFT) |
+	    (8 << VE0_OFFSET_SHIFT));
+   OUT_RING((BRW_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT) |
+	    (BRW_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT) |
+	    (BRW_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT) |
+	    (BRW_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT) |
+	    (4 << VE1_DESTINATION_ELEMENT_OFFSET_SHIFT));
+
+   if (pMask) {
+   	OUT_RING((0 << VE0_VERTEX_BUFFER_INDEX_SHIFT) |
+	    VE0_VALID |
+	    (BRW_SURFACEFORMAT_R32G32_FLOAT << VE0_FORMAT_SHIFT) |
+	    (16 << VE0_OFFSET_SHIFT));
+	OUT_RING((BRW_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT) |
+	    (BRW_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT) |
+	    (BRW_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT) |
+	    (BRW_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT) |
+	    (8 << VE1_DESTINATION_ELEMENT_OFFSET_SHIFT)); 
+		//XXX: is this has alignment issue? and thread access problem?
+	    
+   }
+   
+   ADVANCE_LP_RING();
+    
+   }
+
+    {
+	/* cc states */
+	/* dest buffer */
+	/* urbs */
+	/* binding tables */
+	/* clipping */
+	/* color blend (color calculator, dataport shared function)
+		COLOR_CALC_STATE/SURFACE_STATE(rendertarget's color blend enable
+		bit)
+		Errata!!!: brw-a/b, rendertarget 'local' color blending always
+		enabled! only control by global enable bit.
+	   surface format for blend, "Surface format table in Sampling Engine"
+	   XXX: if surface format not support, we should fallback.
+	*/
+	/* 
+	    render target should be defined in SURFACE_STATE
+	    	o render target SURFTYPE_BUFFER? 2D? Keith has 2D set.
+		o depth buffer SURFTYPE_NULL?
+	    color blend:
+	        o Errata!!: mush issue PIPE_CONTROL with Write Cache Flush
+		enable set, before transite to read-write color buffer. 
+	    	o disable pre/post-blending clamping
+		o enable color buffer blending enable in COLOR_CALC_STATE,(vol2, 3d rasterization 3.8) 
+		  enable color blending enable in SURFACE_STATE.(shared,
+		  sampling engine 1.7) 
+		  disable depth test
+		o (we don't use BLENDFACT_SRC_ALPHA_SATURATE, so don't care
+		the Errata for independent alpha blending, just use color
+		blending factor for all) disable independent alpha blending
+		in COLOR_CALC_STATE
+		o set src/dst blend factor in COLOR_CALC_STATE
+
+	*/
+    }
+
+	/* shader program 
+		o use sampler shared function for texture data
+		o submit result to dataport for later color blending */
+    {
+	 /* PS program:
+	 	o declare sampler and variables??
+		o 'send' cmd to Sampling Engine to load 'src' picture
+		o if (!pMask) then 'send' 'src' texture value to DataPort
+		target render cache
+		o else 
+		    - 'send' cmd to SE to load 'mask' picture
+		    - if no alpha, force to 1 (move 1 to W element of mask)
+		    - if (mask->componentAlpha) then mul 'src' & 'mask', 'send'
+		    	output to DataPort render cache
+		    - else mul 'src' & 'mask''s W element(alpha), 'send' output
+		    	to Dataport render cache
+	 */
+
+    }
+
+#ifdef I830DEBUG
+    ErrorF("try to sync to show any errors...");
+    I830Sync(pScrn);
+#endif
+    return TRUE;
+}	
+
+void
+I965EXAComposite(PixmapPtr pDst, int srcX, int srcY, int maskX, int maskY,
+		int dstX, int dstY, int w, int h)
+{
+    int srcXend, srcYend, maskXend, maskYend;
+    PictVector v;
+    int pMask = 1, i = 0;
+
+    DPRINTF(PFX, "Composite: srcX %d, srcY %d\n\t maskX %d, maskY %d\n\t"
+	    "dstX %d, dstY %d\n\twidth %d, height %d\n\t"
+	    "src_scale_x %f, src_scale_y %f, "
+	    "mask_scale_x %f, mask_scale_y %f\n",
+	    srcX, srcY, maskX, maskY, dstX, dstY, w, h,
+	    scale_units[0][0], scale_units[0][1],
+	    scale_units[1][0], scale_units[1][1]);
+
+    if (scale_units[1][0] == -1 || scale_units[1][1] == -1) {
+	pMask = 0;
+    }
+
+    srcXend = srcX + w;
+    srcYend = srcY + h;
+    maskXend = maskX + w;
+    maskYend = maskY + h;
+    if (is_transform[0]) {
+        v.vector[0] = IntToxFixed(srcX);
+        v.vector[1] = IntToxFixed(srcY);
+        v.vector[2] = xFixed1;
+        PictureTransformPoint(transform[0], &v);
+        srcX = xFixedToInt(v.vector[0]);
+        srcY = xFixedToInt(v.vector[1]);
+        v.vector[0] = IntToxFixed(srcXend);
+        v.vector[1] = IntToxFixed(srcYend);
+        v.vector[2] = xFixed1;
+        PictureTransformPoint(transform[0], &v);
+        srcXend = xFixedToInt(v.vector[0]);
+        srcYend = xFixedToInt(v.vector[1]);
+    }
+    if (is_transform[1]) {
+        v.vector[0] = IntToxFixed(maskX);
+        v.vector[1] = IntToxFixed(maskY);
+        v.vector[2] = xFixed1;
+        PictureTransformPoint(transform[1], &v);
+        maskX = xFixedToInt(v.vector[0]);
+        maskY = xFixedToInt(v.vector[1]);
+        v.vector[0] = IntToxFixed(maskXend);
+        v.vector[1] = IntToxFixed(maskYend);
+        v.vector[2] = xFixed1;
+        PictureTransformPoint(transform[1], &v);
+        maskXend = xFixedToInt(v.vector[0]);
+        maskYend = xFixedToInt(v.vector[1]);
+    }
+
+    DPRINTF(PFX, "After transform: srcX %d, srcY %d,srcXend %d, srcYend %d\n\t"
+		"maskX %d, maskY %d, maskXend %d, maskYend %d\n\t"
+		"dstX %d, dstY %d\n", srcX, srcY, srcXend, srcYend,
+		maskX, maskY, maskXend, maskYend, dstX, dstY);
+
+ 
+    vb[i++] = (float)dstX;
+    vb[i++] = (float)dstY;
+    vb[i++] = (float)srcX / scale_units[0][0];
+    vb[i++] = (float)srcY / scale_units[0][1];
+    if (pMask) {
+        vb[i++] = (float)maskX / scale_units[1][0];
+        vb[i++] = (float)maskY / scale_units[1][1];
+    }
+
+    vb[i++] = (float)dstX;
+    vb[i++] = (float)(dstY + h);
+    vb[i++] = (float)srcX / scale_units[0][0];
+    vb[i++] = (float)srcYend / scale_units[0][1];
+    if (pMask) {
+        vb[i++] = (float)maskX / scale_units[1][0];
+        vb[i++] = (float)maskYend / scale_units[1][1];
+    }
+
+    vb[i++] = (float)(dstX + w);
+    vb[i++] = (float)(dstY + h);
+    vb[i++] = (float)srcXend / scale_units[0][0];
+    vb[i++] = (float)srcYend / scale_units[0][1];
+    if (pMask) {
+        vb[i++] = (float)maskXend / scale_units[1][0];
+        vb[i++] = (float)maskYend / scale_units[1][1];
+    }
+
+    vb[i++] = (float)(dstX + w);
+    vb[i++] = (float)dstY;
+    vb[i++] = (float)srcXend / scale_units[0][0];
+    vb[i++] = (float)srcY / scale_units[0][1];
+    if (pMask) {
+        vb[i++] = (float)maskXend / scale_units[1][0];
+        vb[i++] = (float)maskY / scale_units[1][1];
+    }
+
+    {
+      BEGIN_LP_RING(6);
+      OUT_RING(BRW_3DPRIMITIVE | 
+	       BRW_3DPRIMITIVE_VERTEX_SEQUENTIAL |
+	       (_3DPRIM_TRIFAN << BRW_3DPRIMITIVE_TOPOLOGY_SHIFT) | 
+	       (0 << 9) |  /* CTG - indirect vertex count */
+	       4);
+      OUT_RING(4);  /* vertex count per instance */
+      OUT_RING(0); /* start vertex offset */
+      OUT_RING(1); /* single instance */
+      OUT_RING(0); /* start instance location */
+      OUT_RING(0); /* index buffer offset, ignored */
+      ADVANCE_LP_RING();
+    }
+#ifdef I830DEBUG
+    ErrorF("sync after 3dprimitive");
+    I830Sync(pScrn);
+#endif
+}