xf86-video-intel: Branch 'projective-965' - 2 commits - src/exa_sf_mask.g4a src/exa_sf_mask_prog.h src/exa_sf_rotation.g4a src/exa_sf_rotation_prog.h src/exa_wm_rotation.g4a src/exa_wm_rotation_prog.h

Sun Mar 30 19:19:54 PDT 2008

src/exa_sf_mask.g4a        |  104 ++++++++++++++++---------
 src/exa_sf_mask_prog.h     |   20 +---
 src/exa_sf_rotation.g4a    |   55 -------------
 src/exa_sf_rotation_prog.h |   20 ----
 src/exa_wm_rotation.g4a    |  184 ---------------------------------------------
 src/exa_wm_rotation_prog.h |   70 -----------------
 6 files changed, 72 insertions(+), 381 deletions(-)

New commits:
commit 949d73271d7100c1f028fd60f185f4929461304e
Author: Keith Packard <keithp at keithp.com>
Date:   Sun Mar 30 19:19:46 2008 -0700

    Remove rotation sf and wm progs

diff --git a/src/exa_sf_rotation.g4a b/src/exa_sf_rotation.g4a
deleted file mode 100644
index 59d40d4..0000000
--- a/src/exa_sf_rotation.g4a
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Copyright © 2007 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * Authors:
- *    Wang Zhenyu <zhenyu.z.wang at intel.com>
- */
-
-/* 1/dx */
-send (1) 0 g6<1>F g1.12<0,1,0>F math inv scalar mlen 1 rlen 1 { align1 };
-/* 1/dy */
-send (1) 0 g6.4<1>F g1.20<0,1,0>F math inv scalar mlen 1 rlen 1 { align1 };
-/* du, dv */
-mul (1) g7<1>F g3<0,1,0>F -1.0F { align1 };
-mul (1) g7.4<1>F g3.4<0,1,0>F -1.0F { align1 };
-add (1) g7<1>F g4<0,1,0>F g7<0,1,0>F { align1 };
-add (1) g7.4<1>F g4.4<0,1,0>F g7.4<0,1,0>F { align1 };
-
-/* du/dy */
-mul (1) g7<1>F g7<0,1,0>F g6.4<0,1,0>F { align1 };
-/* dv/dx */
-mul (1) g7.4<1>F g7.4<0,1,0>F g6<0,1,0>F { align1 };
-/* Cx */
-mov (8) m1<1>F g7<0,1,0>F { align1 };
-/* Cy */
-mov (8) m2<1>F g7.4<0,1,0>F { align1 };
-/* Co */
-mov (8) m3<1>F g3<8,8,1>F { align1 };
-send (8) 0 null g0<8,8,1>F urb 0 transpose used complete mlen 4 rlen 0 { align1 EOT };
-nop;
-nop;
-nop;
-nop;
-nop;
-nop;
-nop;
-nop;
diff --git a/src/exa_sf_rotation_prog.h b/src/exa_sf_rotation_prog.h
deleted file mode 100644
index 9589130..0000000
--- a/src/exa_sf_rotation_prog.h
+++ /dev/null
@@ -1,20 +0,0 @@
-   { 0x00000031, 0x20c01fbd, 0x0000002c, 0x01110081 },
-   { 0x00000031, 0x20c41fbd, 0x00000034, 0x01110081 },
-   { 0x00000041, 0x20e07fbd, 0x00000060, 0xbf800000 },
-   { 0x00000041, 0x20e47fbd, 0x00000064, 0xbf800000 },
-   { 0x00000040, 0x20e077bd, 0x00000080, 0x000000e0 },
-   { 0x00000040, 0x20e477bd, 0x00000084, 0x000000e4 },
-   { 0x00000041, 0x20e077bd, 0x000000e0, 0x000000c4 },
-   { 0x00000041, 0x20e477bd, 0x000000e4, 0x000000c0 },
-   { 0x00600001, 0x202003be, 0x000000e0, 0x00000000 },
-   { 0x00600001, 0x204003be, 0x000000e4, 0x00000000 },
-   { 0x00600001, 0x206003be, 0x008d0060, 0x00000000 },
-   { 0x00600031, 0x20001fbc, 0x008d0000, 0x8640c800 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
diff --git a/src/exa_wm_rotation.g4a b/src/exa_wm_rotation.g4a
deleted file mode 100644
index 613a5cb..0000000
--- a/src/exa_wm_rotation.g4a
+++ /dev/null
@@ -1,184 +0,0 @@
-/*
- * Copyright © 2007 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * Authors:
- *    Wang Zhenyu <zhenyu.z.wang at intel.com>
- */
-
-/*
- * This's for exa composite operation in no mask picture case.
- * The simplest case is just sending what src picture has to dst picture.
- */
-
-/* I think this should be same as in g4a program for texture video,
-   as we also use 16-pixel dispatch. and SF scale in g3 is useful for us. */
-
-/* The initial payload of the thread is always g0.
- * WM_URB (incoming URB entries) is g3
- * X0_R is g4
- * X1_R is g5
- * Y0_R is g6
- * Y1_R is g7
- */
-
-    /* Set up the X/Y screen coordinates of the pixels in our 4 subspans.  Each
-     * subspan is a 2x2 rectangle, and the screen x/y of the upper left of each
-     * subspan are given in GRF register 1.2 through 1.5 (which, with the word
-     * addressing below, are 1.4 through 1.11).
-     *
-     * The result is WM_X*_R and WM_Y*R being:
-     *
-     * X0: {ss0.x, ss0.x+1, ss0.x,   ss0.x+1, ss1.x, ss1.x+1, ss1.x,   ss1.x+y}
-     * Y0: {ss0.y, ss0.y,   ss0.y+1, ss0.y+1, ss1.y, ss1.y,   ss1.y+1, ss1.y+1}
-     * X1: {ss2.x, ss2.x+1, ss2.x,   ss2.x+1, ss3.x, ss3.x+1, ss3.x,   ss3.x+y}
-     * Y1: {ss2.y, ss2.y,   ss2.y+1, ss2.y+1, ss3.y, ss3.y,   ss3.y+1, ss3.y+1}
-     */
-    /* Set up ss0.x coordinates*/
-mov (1) g4<1>F g1.8<0,1,0>UW { align1 };
-add (1) g4.4<1>F g1.8<0,1,0>UW 1UB { align1 };
-mov (1) g4.8<1>F g1.8<0,1,0>UW { align1 };
-add (1) g4.12<1>F g1.8<0,1,0>UW 1UB { align1 };
-    /* Set up ss0.y coordinates */
-mov (1) g6<1>F g1.10<0,1,0>UW { align1 };
-mov (1) g6.4<1>F g1.10<0,1,0>UW { align1 };
-add (1) g6.8<1>F g1.10<0,1,0>UW 1UB { align1 };
-add (1) g6.12<1>F g1.10<0,1,0>UW 1UB { align1 };
-    /* set up ss1.x coordinates */
-mov (1) g4.16<1>F g1.12<0,1,0>UW { align1 };
-add (1) g4.20<1>F g1.12<0,1,0>UW 1UB { align1 };
-mov (1) g4.24<1>F g1.12<0,1,0>UW { align1 };
-add (1) g4.28<1>F g1.12<0,1,0>UW 1UB { align1 };
-    /* set up ss1.y coordinates */
-mov (1) g6.16<1>F g1.14<0,1,0>UW { align1 };
-mov (1) g6.20<1>F g1.14<0,1,0>UW { align1 };
-add (1) g6.24<1>F g1.14<0,1,0>UW 1UB { align1 };
-add (1) g6.28<1>F g1.14<0,1,0>UW 1UB { align1 };
-    /* Set up ss2.x coordinates */
-mov (1) g5<1>F g1.16<0,1,0>UW { align1 };
-add (1) g5.4<1>F g1.16<0,1,0>UW 1UB { align1 };
-mov (1) g5.8<1>F g1.16<0,1,0>UW { align1 };
-add (1) g5.12<1>F g1.16<0,1,0>UW 1UB { align1 };
-    /* Set up ss2.y coordinates */
-mov (1) g7<1>F g1.18<0,1,0>UW { align1 };
-mov (1) g7.4<1>F g1.18<0,1,0>UW { align1 };
-add (1) g7.8<1>F g1.18<0,1,0>UW 1UB { align1 };
-add (1) g7.12<1>F g1.18<0,1,0>UW 1UB { align1 };
-    /* Set up ss3.x coordinates */
-mov (1) g5.16<1>F g1.20<0,1,0>UW { align1 };
-add (1) g5.20<1>F g1.20<0,1,0>UW 1UB { align1 };
-mov (1) g5.24<1>F g1.20<0,1,0>UW { align1 };
-add (1) g5.28<1>F g1.20<0,1,0>UW 1UB { align1 };
-    /* Set up ss3.y coordinates */
-mov (1) g7.16<1>F g1.22<0,1,0>UW { align1 };
-mov (1) g7.20<1>F g1.22<0,1,0>UW { align1 };
-add (1) g7.24<1>F g1.22<0,1,0>UW 1UB { align1 };
-add (1) g7.28<1>F g1.22<0,1,0>UW 1UB { align1 };
-
-    /* Now, map these screen space coordinates into texture coordinates. */
-    /* subtract screen-space X origin of vertex 0. */
-/* for rotation, texture y is from ssX.x, so g4,g5 will be Y */
-add (8) g4<1>F g4<8,8,1>F -g1<0,1,0>F { align1 };
-add (8) g5<1>F g5<8,8,1>F -g1<0,1,0>F { align1 };
-    /* scale by texture X increment */
-mul (8) g4<1>F g4<8,8,1>F g3.20<0,1,0>F { align1 };
-mul (8) g5<1>F g5<8,8,1>F g3.20<0,1,0>F { align1 };
-    /* add in texture X offset */
-add (8) g4<1>F g4<8,8,1>F g3.28<0,1,0>F { align1 };
-add (8) g5<1>F g5<8,8,1>F g3.28<0,1,0>F { align1 };
-
-/* texture Y is from ssX.x */
-    /* subtract screen-space Y origin of vertex 0. */
-add (8) g6<1>F g6<8,8,1>F -g1.4<0,1,0>F { align1 };
-add (8) g7<1>F g7<8,8,1>F -g1.4<0,1,0>F { align1 };
-    /* scale by texture Y increment */
-mul (8) g6<1>F g6<8,8,1>F g3.16<0,1,0>F { align1 };
-mul (8) g7<1>F g7<8,8,1>F g3.16<0,1,0>F { align1 };
-    /* add in texture Y offset */
-add (8) g6<1>F g6<8,8,1>F g3.12<0,1,0>F { align1 };
-add (8) g7<1>F g7<8,8,1>F g3.12<0,1,0>F { align1 };
-
-/* prepare sampler read back gX register, which would be written back to output */
-
-/* use simd16 sampler, param 0 is u, param 1 is v. */
-/* 'payload' loading, assuming tex coord start from g4 */
-mov (8) m1<1>F g6<8,8,1>F { align1 };
-mov (8) m2<1>F g7<8,8,1>F { align1 };  
-mov (8) m3<1>F g4<8,8,1>F { align1 };
-mov (8) m4<1>F g5<8,8,1>F { align1 }; 
-
-/* m0 will be copied with g0, as it contains send desc */
-/* emit sampler 'send' cmd */
-send (16) 0 		/* msg reg index */
-	g12<1>UW 	/* readback */
-	g0<8,8,1>UW  	/* copy to msg start reg*/
-	sampler (1,0,F)  /* sampler message description, (binding_table,sampler_index,datatype)
-			 /* here(src->dst) we should use src_sampler and src_surface */
-	mlen 5 rlen 8 { align1 };   /* required message len 5, readback len 8 */
-
-mov (8) g19<1>UD g19<8,8,1>UD { align1 };  /* wait sampler return */
-/* if we set up read-back reg correctly, emit dataport write 'send' cmd with EOT */
-
-/* m0, m1 are all direct passed by PS thread payload */
-mov (8) m1<1>F g1<8,8,1>F { align1 };
-
-/* prepare data in m2-m5 for subspan(1,0), m6-m9 for subspan(3,2), then it's ready to write */
-/* g12 -> m2
-   g13 -> m6
-   g14 -> m3
-   g15 -> m7
-   g16 -> m4
-   g17 -> m8
-   g18 -> m5
-   g19 -> m9
-*/
-mov (8) m2<1>F g12<8,8,1>F { align1 };
-mov (8) m3<1>F g14<8,8,1>F { align1 };
-mov (8) m4<1>F g16<8,8,1>F { align1 };
-mov (8) m5<1>F g18<8,8,1>F { align1 };
-mov (8) m6<1>F g13<8,8,1>F { align1 };
-mov (8) m7<1>F g15<8,8,1>F { align1 };
-mov (8) m8<1>F g17<8,8,1>F { align1 };
-mov (8) m9<1>F g19<8,8,1>F { align1 };
-
-/* m0, m1 are all direct passed by PS thread payload */
-mov (8) m1<1>UD g1<8,8,1>UD { align1 mask_disable };
-
-/* write */
-send (16) 0 acc0<1>UW g0<8,8,1>UW write (
-	0,  /* binding_table */
-	8,  /* pixel scordboard clear, msg type simd16 single source */
-	4,  /* render target write */
-	0   /* no write commit message */
-	) 
-	mlen 10
-	rlen 0
-	{ align1 EOT };
-
-nop;
-nop;
-nop;
-nop;
-nop;
-nop;
-nop;
-nop;
-nop;
diff --git a/src/exa_wm_rotation_prog.h b/src/exa_wm_rotation_prog.h
deleted file mode 100644
index 890d2cf..0000000
--- a/src/exa_wm_rotation_prog.h
+++ /dev/null
@@ -1,70 +0,0 @@
-   { 0x00000001, 0x2080013d, 0x00000028, 0x00000000 },
-   { 0x00000040, 0x20840d3d, 0x00000028, 0x00000001 },
-   { 0x00000001, 0x2088013d, 0x00000028, 0x00000000 },
-   { 0x00000040, 0x208c0d3d, 0x00000028, 0x00000001 },
-   { 0x00000001, 0x20c0013d, 0x0000002a, 0x00000000 },
-   { 0x00000001, 0x20c4013d, 0x0000002a, 0x00000000 },
-   { 0x00000040, 0x20c80d3d, 0x0000002a, 0x00000001 },
-   { 0x00000040, 0x20cc0d3d, 0x0000002a, 0x00000001 },
-   { 0x00000001, 0x2090013d, 0x0000002c, 0x00000000 },
-   { 0x00000040, 0x20940d3d, 0x0000002c, 0x00000001 },
-   { 0x00000001, 0x2098013d, 0x0000002c, 0x00000000 },
-   { 0x00000040, 0x209c0d3d, 0x0000002c, 0x00000001 },
-   { 0x00000001, 0x20d0013d, 0x0000002e, 0x00000000 },
-   { 0x00000001, 0x20d4013d, 0x0000002e, 0x00000000 },
-   { 0x00000040, 0x20d80d3d, 0x0000002e, 0x00000001 },
-   { 0x00000040, 0x20dc0d3d, 0x0000002e, 0x00000001 },
-   { 0x00000001, 0x20a0013d, 0x00000030, 0x00000000 },
-   { 0x00000040, 0x20a40d3d, 0x00000030, 0x00000001 },
-   { 0x00000001, 0x20a8013d, 0x00000030, 0x00000000 },
-   { 0x00000040, 0x20ac0d3d, 0x00000030, 0x00000001 },
-   { 0x00000001, 0x20e0013d, 0x00000032, 0x00000000 },
-   { 0x00000001, 0x20e4013d, 0x00000032, 0x00000000 },
-   { 0x00000040, 0x20e80d3d, 0x00000032, 0x00000001 },
-   { 0x00000040, 0x20ec0d3d, 0x00000032, 0x00000001 },
-   { 0x00000001, 0x20b0013d, 0x00000034, 0x00000000 },
-   { 0x00000040, 0x20b40d3d, 0x00000034, 0x00000001 },
-   { 0x00000001, 0x20b8013d, 0x00000034, 0x00000000 },
-   { 0x00000040, 0x20bc0d3d, 0x00000034, 0x00000001 },
-   { 0x00000001, 0x20f0013d, 0x00000036, 0x00000000 },
-   { 0x00000001, 0x20f4013d, 0x00000036, 0x00000000 },
-   { 0x00000040, 0x20f80d3d, 0x00000036, 0x00000001 },
-   { 0x00000040, 0x20fc0d3d, 0x00000036, 0x00000001 },
-   { 0x00600040, 0x208077bd, 0x008d0080, 0x00004020 },
-   { 0x00600040, 0x20a077bd, 0x008d00a0, 0x00004020 },
-   { 0x00600041, 0x208077bd, 0x008d0080, 0x00000074 },
-   { 0x00600041, 0x20a077bd, 0x008d00a0, 0x00000074 },
-   { 0x00600040, 0x208077bd, 0x008d0080, 0x0000007c },
-   { 0x00600040, 0x20a077bd, 0x008d00a0, 0x0000007c },
-   { 0x00600040, 0x20c077bd, 0x008d00c0, 0x00004024 },
-   { 0x00600040, 0x20e077bd, 0x008d00e0, 0x00004024 },
-   { 0x00600041, 0x20c077bd, 0x008d00c0, 0x00000070 },
-   { 0x00600041, 0x20e077bd, 0x008d00e0, 0x00000070 },
-   { 0x00600040, 0x20c077bd, 0x008d00c0, 0x0000006c },
-   { 0x00600040, 0x20e077bd, 0x008d00e0, 0x0000006c },
-   { 0x00600001, 0x202003be, 0x008d00c0, 0x00000000 },
-   { 0x00600001, 0x204003be, 0x008d00e0, 0x00000000 },
-   { 0x00600001, 0x206003be, 0x008d0080, 0x00000000 },
-   { 0x00600001, 0x208003be, 0x008d00a0, 0x00000000 },
-   { 0x00800031, 0x21801d29, 0x008d0000, 0x02580001 },
-   { 0x00600001, 0x22600021, 0x008d0260, 0x00000000 },
-   { 0x00600001, 0x202003be, 0x008d0020, 0x00000000 },
-   { 0x00600001, 0x204003be, 0x008d0180, 0x00000000 },
-   { 0x00600001, 0x206003be, 0x008d01c0, 0x00000000 },
-   { 0x00600001, 0x208003be, 0x008d0200, 0x00000000 },
-   { 0x00600001, 0x20a003be, 0x008d0240, 0x00000000 },
-   { 0x00600001, 0x20c003be, 0x008d01a0, 0x00000000 },
-   { 0x00600001, 0x20e003be, 0x008d01e0, 0x00000000 },
-   { 0x00600001, 0x210003be, 0x008d0220, 0x00000000 },
-   { 0x00600001, 0x212003be, 0x008d0260, 0x00000000 },
-   { 0x00600201, 0x20200022, 0x008d0020, 0x00000000 },
-   { 0x00800031, 0x24001d28, 0x008d0000, 0x85a04800 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
-   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
commit 492ff1494f782240e6ca68919b2d0b9aa400fc53
Author: Keith Packard <keithp at keithp.com>
Date:   Sun Mar 30 19:14:18 2008 -0700

    Fix the sf_mask program to compute and pass corrected uvw coefficients
    
    sf_mask is the same as sf except that it must compute both src and mask uvw
    coefficients, which are conveniently adjacent in the same registers, and so
    need only an extended execution width.

diff --git a/src/exa_sf_mask.g4a b/src/exa_sf_mask.g4a
index a0d6efc..c830fd8 100644
--- a/src/exa_sf_mask.g4a
+++ b/src/exa_sf_mask.g4a
@@ -21,52 +21,82 @@
  * IN THE SOFTWARE.
  *
  * Authors:
+ *    Keith Packard <keithp at keithp.com>
+ *    Eric Anholt <eric at anholt.net>
  *    Wang Zhenyu <zhenyu.z.wang at intel.com>
  */
 
-/* FIXME how to setup second coeffient for mask tex coord */
 
-/* 
-   g3 (v0) { u0, v0, 1.0, 1.0 }  ==> {u0, v0, 1.0, 1.0, mu0, mv0, 1.0, 1.0}  Co[0](u0) Co[1](v0) Co[2](mu0) Co[3](mv0)
-   g4 (v1) { u1, v1, 1.0, 1.0 }  ==> {u1, v1, 1.0, 1.0, mu1, mv1, 1.0, 1.0}
-   g5 (v2) { u2, v2 }  ==> (u2, v2, mu2, mv2}
-   g6      { 1/(x1-x0), 1/(y1-y0) }
-   g7      { u1-u0, v1-v0, 0, 0}  ==>{u1-u0, v1-v0,0, 0, mu1-mu0, mv1-mv0, 0, 0}
-	   -> { (u1-u0)/(x1-x0), (v1-v0)/(y1-y0) }  ==>{(u1-u0)/(x1-x0), (v1-v0)/(y1-y0),(mu1-mu0)/(x1-x0), (mv1-mv0)/(y1-y0)
-		Cx,		 Cy 			Cx[0],		 Cy[0],		 Cx[1], 	    Cy[1]
+/*
+ * Inputs (note all sub-register addresses are bytes, not float indices)
+ *
+ * Note that the vertices will have been reordered:
+ *
+ * V0 is topmost (leftmost among topmost) (upper left)
+ * V1 is next clockwise (lower right)
+ * V2 is remaining (lower left)
+ *
+ *  V0 ...................... XX
+ *  |                          .
+ *  |                          .
+ *  |                          .
+ *  V2------------------------V1
+ *
+ *  G0	    thread state -- just pass along
+ *
+ *  G1 and G2 are fixed by SF spec
+ *
+ *  G1.0    reserved
+ *  G1.4    Provoking vertex
+ *  G1.8    Determinant
+ *  G1.12   X1 - X0
+ *  G1.16   X2 - X0
+ *  G1.20   Y1 - Y0
+ *  G1.24   Y2 - Y0
+ *  G1.30   reserved
+ *
+ *  G2.0    Z0
+ *  G2.4    1/W0
+ *  G2.8    Z1
+ *  G2.12   1/W1
+ *  G2.16   Z2
+ *  G2.20   1/W2
+ *  G2.24   reserved
+ *  G2.30   reserved
+ *
+ *  G3 is V0 Vertex Attribute Data from URB (upper left)
+ *
+ *  G3.0    u0
+ *  G3.4    v0
+ *
+ *  G4 is V1 Vertex Attribute Data from URB (lower right)
+ *
+ *  G4.0    u1
+ *  G4.4    v1
+ *
+ *  G5 is V2 Vertex Attribute Data from URB (lower left)
+ *
  */
 
-/* assign Cx[0], Cx[1] to src, same to Cy, Co 
-          Cx[2], Cx[3] to mask, same to Cy, Co */
-
-send (1) 0 g6<1>F g1.12<0,1,0>F math inv scalar mlen 1 rlen 1 { align1 };
-send (1) 0 g6.4<1>F g1.20<0,1,0>F math inv scalar mlen 1 rlen 1 { align1 };
-add (8) g7<1>F g4<8,8,1>F -g3<8,8,1>F { align1 };
-/* Cx[0] */
-mul (1) g7<1>F g7<0,1,0>F g6<0,1,0>F { align1 };
-/* Cy[0] */
-mul (1) g7.4<1>F g7.4<0,1,0>F g6.4<0,1,0>F { align1 };
-/* Cx[2] */
-mul (1) g7.8<1>F g7.8<0,1,0>F g6<0,1,0>F { align1 };
-/* Cy[2] */
-mul (1) g7.12<1>F g7.12<0,1,0>F g6.4<0,1,0>F { align1 };
+/* Compute inverses of the input deltas */
+send (4) 0 g6<1>F g1.12<4,4,1>F math inv mlen 1 rlen 1 { align1 };
 
-/* src Cx[0], Cx[1] */
-mov (8) m1<1>F g7<0,1,0>F { align1 };
-/* mask Cx[2], Cx[3] */
-mov (1) m1.8<1>F g7.8<0,1,0>F { align1 };
-mov (1) m1.12<1>F g7.8<0,1,0>F { align1 };
-/* src Cy[0], Cy[1] */
-mov (8) m2<1>F g7.4<0,1,0>F { align1 };
-/* mask Cy[2], Cy[3] */
-mov (1) m2.8<1>F g7.12<0,1,0>F { align1 };
-mov (1) m2.12<1>F g7.12<0,1,0>F { align1 };
-/* src Co[0], Co[1] */
+/* texture location at V0 */
 mov (8) m3<1>F g3<8,8,1>F { align1 };
-/* mask Co[2], Co[3] */
-mov (1) m3.8<1>F g3.8<0,1,0>F { align1 };
-mov (1) m3.12<1>F g3.12<0,1,0>F { align1 };
 
+/* compute V1 - V2 (motion in X) for texture coordinates */
+add (8) g7<1>F g4<8,8,1>F -g5<8,8,1>F { align1 };
+
+/* multiply by 1/dx */
+mul (8) m1<1>F g7<8,8,1>F g6.0<0,1,0>F { align1 };
+
+/* Compute V2 - V0 (motion in Y) for texture coordinates */
+add (8) g7<1>F g5<8,8,1>F -g3<8,8,1>F { align1 };
+
+/* multiply by 1/dy */
+mul (8) m2<1>F g7<8,8,1>F g6.8<0,1,0>F {align1 };
+
+/* and we're done */
 send (8) 0 null g0<8,8,1>F urb 0 transpose used complete mlen 4 rlen 0 { align1 EOT };
 nop;
 nop;
diff --git a/src/exa_sf_mask_prog.h b/src/exa_sf_mask_prog.h
index 4e9114d..be0a77b 100644
--- a/src/exa_sf_mask_prog.h
+++ b/src/exa_sf_mask_prog.h
@@ -1,19 +1,9 @@
-   { 0x00000031, 0x20c01fbd, 0x0000002c, 0x01110081 },
-   { 0x00000031, 0x20c41fbd, 0x00000034, 0x01110081 },
-   { 0x00600040, 0x20e077bd, 0x008d0080, 0x008d4060 },
-   { 0x00000041, 0x20e077bd, 0x000000e0, 0x000000c0 },
-   { 0x00000041, 0x20e477bd, 0x000000e4, 0x000000c4 },
-   { 0x00000041, 0x20e877bd, 0x000000e8, 0x000000c0 },
-   { 0x00000041, 0x20ec77bd, 0x000000ec, 0x000000c4 },
-   { 0x00600001, 0x202003be, 0x000000e0, 0x00000000 },
-   { 0x00000001, 0x202803be, 0x000000e8, 0x00000000 },
-   { 0x00000001, 0x202c03be, 0x000000e8, 0x00000000 },
-   { 0x00600001, 0x204003be, 0x000000e4, 0x00000000 },
-   { 0x00000001, 0x204803be, 0x000000ec, 0x00000000 },
-   { 0x00000001, 0x204c03be, 0x000000ec, 0x00000000 },
+   { 0x00400031, 0x20c01fbd, 0x0069002c, 0x01110001 },
    { 0x00600001, 0x206003be, 0x008d0060, 0x00000000 },
-   { 0x00000001, 0x206803be, 0x00000068, 0x00000000 },
-   { 0x00000001, 0x206c03be, 0x0000006c, 0x00000000 },
+   { 0x00600040, 0x20e077bd, 0x008d0080, 0x008d40a0 },
+   { 0x00600041, 0x202077be, 0x008d00e0, 0x000000c0 },
+   { 0x00600040, 0x20e077bd, 0x008d00a0, 0x008d4060 },
+   { 0x00600041, 0x204077be, 0x008d00e0, 0x000000c8 },
    { 0x00600031, 0x20001fbc, 0x008d0000, 0x8640c800 },
    { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
    { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },