pixman: Branch 'master' - 2 commits

Søren Sandmann Pedersen sandmann at kemper.freedesktop.org
Sat May 30 19:01:06 PDT 2009


 pixman/pixman-cpu.c     |   15 -
 pixman/pixman-general.c |   15 -
 pixman/pixman-private.h |    4 
 pixman/pixman-vmx.c     |  670 +++++++++++++++++++++++++++++++++++++++++++-----
 pixman/pixman-vmx.h     |  310 ----------------------
 5 files changed, 622 insertions(+), 392 deletions(-)

New commits:
commit e3dba0f61af9583728f94210392cc12b602acc2e
Author: Søren Sandmann Pedersen <sandmann at redhat.com>
Date:   Sat May 30 21:44:08 2009 -0400

    Create a vmx pixman_implementation_t
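
For context: pixman dispatches compositing through a chain of
pixman_implementation_t objects, each of which fills in the combiner
slots it can accelerate and delegates the rest to a slower fallback.
The sketch below shows only the general shape of that pattern; the
struct and names are illustrative stand-ins, not pixman's actual
internals.

#include <stdint.h>
#include <stddef.h>

#define N_OPS 64                        /* stand-in for PIXMAN_OP_LAST */

typedef struct impl impl_t;
typedef void (*combine_fn_t) (impl_t *imp, int op,
                              uint32_t *dest, const uint32_t *src,
                              const uint32_t *mask, int width);

struct impl
{
    impl_t      *delegate;              /* slower fallback implementation */
    combine_fn_t combine_32[N_OPS];     /* unified-alpha combiners */
};

/* Walk the chain until some implementation provides the operator. */
static combine_fn_t
lookup_combine (impl_t *imp, int op)
{
    while (imp != NULL && imp->combine_32[op] == NULL)
        imp = imp->delegate;

    return imp != NULL ? imp->combine_32[op] : NULL;
}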

diff --git a/pixman/pixman-cpu.c b/pixman/pixman-cpu.c
index 96b4d9b..057c134 100644
--- a/pixman/pixman-cpu.c
+++ b/pixman/pixman-cpu.c
@@ -44,7 +44,7 @@ static volatile pixman_bool_t have_vmx = TRUE;
 #ifdef __APPLE__
 #include <sys/sysctl.h>
 
-pixman_bool_t
+static pixman_bool_t
 pixman_have_vmx (void)
 {
     if(!initialized) {
@@ -66,7 +66,7 @@ pixman_have_vmx (void)
 #include <linux/auxvec.h>
 #include <asm/cputable.h>
 
-pixman_bool_t
+static pixman_bool_t
 pixman_have_vmx (void)
 {
     if (!initialized) {
@@ -120,7 +120,8 @@ static void vmx_test(int sig, siginfo_t *si, void *unused) {
     longjmp (jump_env, 1);
 }
 
-pixman_bool_t pixman_have_vmx (void) {
+static pixman_bool_t
+pixman_have_vmx (void) {
     struct sigaction sa, osa;
     int jmp_result;
     if (!initialized) {
@@ -524,8 +525,10 @@ _pixman_choose_implementation (void)
     if (pixman_have_arm_simd())
 	return _pixman_implementation_create_arm_simd (NULL);
 #endif
-
+#ifdef USE_VMX
+    if (pixman_have_vmx())
+	return _pixman_implementation_create_vmx (NULL);
+#endif
+    
     return _pixman_implementation_create_fast_path (NULL);
 }
-
-
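
The detectors touched above (Darwin sysctl, Linux AT_HWCAP, and a
generic signal probe) can become static because their only remaining
caller is _pixman_choose_implementation in the same file. As a rough
sketch of the generic probe, simplified from the code in this diff and
untested here: install a SIGILL handler, attempt one AltiVec
instruction, and take the longjmp if it traps.

#include <setjmp.h>
#include <signal.h>

static jmp_buf jump_env;

static void
on_sigill (int sig)
{
    (void) sig;
    longjmp (jump_env, 1);
}

static int
probe_vmx (void)
{
    struct sigaction sa, osa;
    int have_it;

    sa.sa_handler = on_sigill;
    sigemptyset (&sa.sa_mask);
    sa.sa_flags = 0;
    sigaction (SIGILL, &sa, &osa);

    if (setjmp (jump_env) == 0)
    {
        /* "vor v0,v0,v0" -- an AltiVec no-op; raises SIGILL without VMX */
        __asm__ volatile (".long 0x10000484");
        have_it = 1;
    }
    else
    {
        have_it = 0;
    }

    sigaction (SIGILL, &osa, NULL);     /* restore the old handler */
    return have_it;
}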
diff --git a/pixman/pixman-general.c b/pixman/pixman-general.c
index 4d4759a..ed858fe 100644
--- a/pixman/pixman-general.c
+++ b/pixman/pixman-general.c
@@ -35,7 +35,6 @@
 #include <stdlib.h>
 #include <string.h>
 #include "pixman-private.h"
-#include "pixman-vmx.h"
 #include "pixman-combine32.h"
 #include "pixman-private.h"
 
@@ -265,10 +264,6 @@ general_composite (pixman_implementation_t *	imp,
     pixman_bool_t srcTransform = src->common.transform != NULL;
     pixman_bool_t maskTransform = FALSE;
     
-#ifdef USE_VMX
-    fbComposeSetupVMX();
-#endif
-    
     if (srcRepeat && srcTransform &&
 	src->bits.width == 1 &&
 	src->bits.height == 1)
@@ -292,16 +287,6 @@ general_composite (pixman_implementation_t *	imp,
 	}
     }
     
-#ifdef USE_VMX
-    if (_pixman_run_fast_path (vmx_fast_paths, imp,
-			       op, src, mask, dest,
-			       src_x, src_y,
-			       mask_x, mask_y,
-			       dest_x, dest_y,
-			       width, height))
-	return;
-#endif
-    
     /* CompositeGeneral optimizes 1x1 repeating images itself */
     if (src->type == BITS &&
 	src->bits.width == 1 && src->bits.height == 1)
diff --git a/pixman/pixman-private.h b/pixman/pixman-private.h
index 5facfb4..9e770f6 100644
--- a/pixman/pixman-private.h
+++ b/pixman/pixman-private.h
@@ -1054,6 +1054,10 @@ _pixman_implementation_create_arm_simd (pixman_implementation_t *toplevel);
 pixman_implementation_t *
 _pixman_implementation_create_arm_neon (pixman_implementation_t *toplevel);
 #endif
+#ifdef USE_VMX
+pixman_implementation_t *
+_pixman_implementation_create_vmx (pixman_implementation_t *toplevel);
+#endif
 
 pixman_bool_t
 _pixman_run_fast_path (const FastPathInfo *paths,
diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index 73b1df1..e371f7f 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -26,16 +26,11 @@
  */
 
 #include <config.h>
-#include "pixman-vmx.h"
+#include "pixman-private.h"
 #include "pixman-combine32.h"
 #include <altivec.h>
 
-typedef struct _FbComposeFunctions32 {
-    CombineFunc32 combineU[PIXMAN_OP_LAST];
-    CombineFunc32 combineC[PIXMAN_OP_LAST];
-} FbComposeFunctions32;
-
-static FbComposeFunctions32 pixman_composeFunctions;
+#define AVV(x...) {x}
 
 static force_inline vector unsigned int
 splat_alpha (vector unsigned int pix) {
@@ -297,7 +292,8 @@ vmxCombineOverUmask (uint32_t *dest,
 }
 
 static FASTCALL void
-vmxCombineOverU(uint32_t *dest, const uint32_t *src, const uint32_t *mask,
+vmxCombineOverU(pixman_implementation_t *imp, pixman_op_t op,
+		uint32_t *dest, const uint32_t *src, const uint32_t *mask,
                 int width)
 {
     if (mask)
@@ -380,7 +376,8 @@ vmxCombineOverReverseUmask (uint32_t *dest,
 }
 
 static FASTCALL void
-vmxCombineOverReverseU (uint32_t *dest, const uint32_t *src,
+vmxCombineOverReverseU (pixman_implementation_t *imp, pixman_op_t op,
+			uint32_t *dest, const uint32_t *src,
                         const uint32_t *mask, int width)
 {
     if (mask)
@@ -461,7 +458,8 @@ vmxCombineInUmask (uint32_t *dest,
 }
 
 static FASTCALL void
-vmxCombineInU (uint32_t *dest, const uint32_t *src, const uint32_t *mask,
+vmxCombineInU (pixman_implementation_t *imp, pixman_op_t op,
+	       uint32_t *dest, const uint32_t *src, const uint32_t *mask,
                int width)
 {
     if (mask)
@@ -542,7 +540,8 @@ vmxCombineInReverseUmask (uint32_t *dest,
 }
 
 static FASTCALL void
-vmxCombineInReverseU (uint32_t *dest, const uint32_t *src,
+vmxCombineInReverseU (pixman_implementation_t *imp, pixman_op_t op,
+		      uint32_t *dest, const uint32_t *src,
                       const uint32_t *mask, int width)
 {
     if (mask)
@@ -622,7 +621,8 @@ vmxCombineOutUmask (uint32_t *dest,
 }
 
 static FASTCALL void
-vmxCombineOutU (uint32_t *dest, const uint32_t *src, const uint32_t *mask,
+vmxCombineOutU (pixman_implementation_t *imp, pixman_op_t op,
+		uint32_t *dest, const uint32_t *src, const uint32_t *mask,
                 int width)
 {
     if (mask)
@@ -703,7 +703,8 @@ vmxCombineOutReverseUmask (uint32_t *dest,
 }
 
 static FASTCALL void
-vmxCombineOutReverseU (uint32_t *dest,
+vmxCombineOutReverseU (pixman_implementation_t *imp, pixman_op_t op,
+		       uint32_t *dest,
                        const uint32_t *src,
                        const uint32_t *mask,
                        int width)
@@ -794,7 +795,8 @@ vmxCombineAtopUmask (uint32_t *dest,
 }
 
 static FASTCALL void
-vmxCombineAtopU (uint32_t *dest,
+vmxCombineAtopU (pixman_implementation_t *imp, pixman_op_t op,
+		 uint32_t *dest,
                  const uint32_t *src,
                  const uint32_t *mask,
                  int width)
@@ -885,7 +887,8 @@ vmxCombineAtopReverseUmask (uint32_t *dest,
 }
 
 static FASTCALL void
-vmxCombineAtopReverseU (uint32_t *dest,
+vmxCombineAtopReverseU (pixman_implementation_t *imp, pixman_op_t op,
+			uint32_t *dest,
                         const uint32_t *src,
                         const uint32_t *mask,
                         int width)
@@ -976,7 +979,8 @@ vmxCombineXorUmask (uint32_t *dest,
 }
 
 static FASTCALL void
-vmxCombineXorU (uint32_t *dest,
+vmxCombineXorU (pixman_implementation_t *imp, pixman_op_t op,
+		uint32_t *dest,
                 const uint32_t *src,
                 const uint32_t *mask,
                 int width)
@@ -1057,7 +1061,8 @@ vmxCombineAddUmask (uint32_t *dest,
 }
 
 static FASTCALL void
-vmxCombineAddU (uint32_t *dest,
+vmxCombineAddU (pixman_implementation_t *imp, pixman_op_t op,
+		uint32_t *dest,
                 const uint32_t *src,
                 const uint32_t *mask,
                 int width)
@@ -1069,7 +1074,8 @@ vmxCombineAddU (uint32_t *dest,
 }
 
 static FASTCALL void
-vmxCombineSrcC (uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width)
+vmxCombineSrcC (pixman_implementation_t *imp, pixman_op_t op,
+		uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width)
 {
     int i;
     vector unsigned int  vdest, vsrc, vmask;
@@ -1100,7 +1106,8 @@ vmxCombineSrcC (uint32_t *dest, const uint32_t *src, const uint32_t *mask, int w
 }
 
 static FASTCALL void
-vmxCombineOverC (uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width)
+vmxCombineOverC (pixman_implementation_t *imp, pixman_op_t op,
+		 uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width)
 {
     int i;
     vector unsigned int  vdest, vsrc, vmask;
@@ -1133,7 +1140,8 @@ vmxCombineOverC (uint32_t *dest, const uint32_t *src, const uint32_t *mask, int
 }
 
 static FASTCALL void
-vmxCombineOverReverseC (uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width)
+vmxCombineOverReverseC (pixman_implementation_t *imp, pixman_op_t op,
+			uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width)
 {
     int i;
     vector unsigned int  vdest, vsrc, vmask;
@@ -1167,7 +1175,8 @@ vmxCombineOverReverseC (uint32_t *dest, const uint32_t *src, const uint32_t *mas
 }
 
 static FASTCALL void
-vmxCombineInC (uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width)
+vmxCombineInC (pixman_implementation_t *imp, pixman_op_t op,
+	       uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width)
 {
     int i;
     vector unsigned int  vdest, vsrc, vmask;
@@ -1201,7 +1210,8 @@ vmxCombineInC (uint32_t *dest, const uint32_t *src, const uint32_t *mask, int wi
 }
 
 static FASTCALL void
-vmxCombineInReverseC (uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width)
+vmxCombineInReverseC (pixman_implementation_t *imp, pixman_op_t op,
+		      uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width)
 {
     int i;
     vector unsigned int  vdest, vsrc, vmask;
@@ -1235,7 +1245,8 @@ vmxCombineInReverseC (uint32_t *dest, const uint32_t *src, const uint32_t *mask,
 }
 
 static FASTCALL void
-vmxCombineOutC (uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width)
+vmxCombineOutC (pixman_implementation_t *imp, pixman_op_t op,
+		uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width)
 {
     int i;
     vector unsigned int  vdest, vsrc, vmask;
@@ -1270,7 +1281,8 @@ vmxCombineOutC (uint32_t *dest, const uint32_t *src, const uint32_t *mask, int w
 }
 
 static FASTCALL void
-vmxCombineOutReverseC (uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width)
+vmxCombineOutReverseC (pixman_implementation_t *imp, pixman_op_t op,
+		       uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width)
 {
     int i;
     vector unsigned int  vdest, vsrc, vmask;
@@ -1306,7 +1318,8 @@ vmxCombineOutReverseC (uint32_t *dest, const uint32_t *src, const uint32_t *mask
 }
 
 static FASTCALL void
-vmxCombineAtopC (uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width)
+vmxCombineAtopC (pixman_implementation_t *imp, pixman_op_t op,
+		 uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width)
 {
     int i;
     vector unsigned int  vdest, vsrc, vmask;
@@ -1347,7 +1360,8 @@ vmxCombineAtopC (uint32_t *dest, const uint32_t *src, const uint32_t *mask, int
 }
 
 static FASTCALL void
-vmxCombineAtopReverseC (uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width)
+vmxCombineAtopReverseC (pixman_implementation_t *imp, pixman_op_t op,
+			uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width)
 {
     int i;
     vector unsigned int  vdest, vsrc, vmask;
@@ -1388,7 +1402,8 @@ vmxCombineAtopReverseC (uint32_t *dest, const uint32_t *src, const uint32_t *mas
 }
 
 static FASTCALL void
-vmxCombineXorC (uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width)
+vmxCombineXorC (pixman_implementation_t *imp, pixman_op_t op,
+		uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width)
 {
     int i;
     vector unsigned int  vdest, vsrc, vmask;
@@ -1429,7 +1444,8 @@ vmxCombineXorC (uint32_t *dest, const uint32_t *src, const uint32_t *mask, int w
 }
 
 static FASTCALL void
-vmxCombineAddC (uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width)
+vmxCombineAddC (pixman_implementation_t *imp, pixman_op_t op,
+		uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width)
 {
     int i;
     vector unsigned int  vdest, vsrc, vmask;
@@ -1532,41 +1548,48 @@ fbCompositeSolid_nx0565vmx (pixman_operator_t	op,
     }
 }
 
-#endif
-
-void fbComposeSetupVMX (void)
-{
-    /* check if we have VMX support and initialize accordingly */
-    if (pixman_have_vmx ()) {
-        pixman_composeFunctions.combineU[PIXMAN_OP_OVER] = vmxCombineOverU;
-        pixman_composeFunctions.combineU[PIXMAN_OP_OVER_REVERSE] = vmxCombineOverReverseU;
-        pixman_composeFunctions.combineU[PIXMAN_OP_IN] = vmxCombineInU;
-        pixman_composeFunctions.combineU[PIXMAN_OP_IN_REVERSE] = vmxCombineInReverseU;
-        pixman_composeFunctions.combineU[PIXMAN_OP_OUT] = vmxCombineOutU;
-        pixman_composeFunctions.combineU[PIXMAN_OP_OUT_REVERSE] = vmxCombineOutReverseU;
-        pixman_composeFunctions.combineU[PIXMAN_OP_ATOP] = vmxCombineAtopU;
-        pixman_composeFunctions.combineU[PIXMAN_OP_ATOP_REVERSE] = vmxCombineAtopReverseU;
-        pixman_composeFunctions.combineU[PIXMAN_OP_XOR] = vmxCombineXorU;
-
-        pixman_composeFunctions.combineU[PIXMAN_OP_ADD] = vmxCombineAddU;
-
-        pixman_composeFunctions.combineC[PIXMAN_OP_SRC] = vmxCombineSrcC;
-        pixman_composeFunctions.combineC[PIXMAN_OP_OVER] = vmxCombineOverC;
-        pixman_composeFunctions.combineC[PIXMAN_OP_OVER_REVERSE] = vmxCombineOverReverseC;
-        pixman_composeFunctions.combineC[PIXMAN_OP_IN] = vmxCombineInC;
-        pixman_composeFunctions.combineC[PIXMAN_OP_IN_REVERSE] = vmxCombineInReverseC;
-        pixman_composeFunctions.combineC[PIXMAN_OP_OUT] = vmxCombineOutC;
-        pixman_composeFunctions.combineC[PIXMAN_OP_OUT_REVERSE] = vmxCombineOutReverseC;
-        pixman_composeFunctions.combineC[PIXMAN_OP_ATOP] = vmxCombineAtopC;
-        pixman_composeFunctions.combineC[PIXMAN_OP_ATOP_REVERSE] = vmxCombineAtopReverseC;
-        pixman_composeFunctions.combineC[PIXMAN_OP_XOR] = vmxCombineXorC;
-        pixman_composeFunctions.combineC[PIXMAN_OP_ADD] = vmxCombineAddC;
-    }
-}
-
 static const FastPathInfo vmx_fast_path_array[] =
 {
     { PIXMAN_OP_NONE },
 };
 
 const FastPathInfo *const vmx_fast_paths = vmx_fast_path_array;
+
+#endif
+
+pixman_implementation_t *
+_pixman_implementation_create_vmx (pixman_implementation_t *toplevel)
+{
+    pixman_implementation_t *fast = _pixman_implementation_create_fast_path (NULL);
+    pixman_implementation_t *imp = _pixman_implementation_create (toplevel, fast);
+
+    /* Set up function pointers */
+    
+    /* Register the VMX combiners (replaces fbComposeSetupVMX) */
+    imp->combine_32[PIXMAN_OP_OVER] = vmxCombineOverU;
+    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = vmxCombineOverReverseU;
+    imp->combine_32[PIXMAN_OP_IN] = vmxCombineInU;
+    imp->combine_32[PIXMAN_OP_IN_REVERSE] = vmxCombineInReverseU;
+    imp->combine_32[PIXMAN_OP_OUT] = vmxCombineOutU;
+    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = vmxCombineOutReverseU;
+    imp->combine_32[PIXMAN_OP_ATOP] = vmxCombineAtopU;
+    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = vmxCombineAtopReverseU;
+    imp->combine_32[PIXMAN_OP_XOR] = vmxCombineXorU;
+
+    imp->combine_32[PIXMAN_OP_ADD] = vmxCombineAddU;
+
+    imp->combine_32_ca[PIXMAN_OP_SRC] = vmxCombineSrcC;
+    imp->combine_32_ca[PIXMAN_OP_OVER] = vmxCombineOverC;
+    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = vmxCombineOverReverseC;
+    imp->combine_32_ca[PIXMAN_OP_IN] = vmxCombineInC;
+    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = vmxCombineInReverseC;
+    imp->combine_32_ca[PIXMAN_OP_OUT] = vmxCombineOutC;
+    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = vmxCombineOutReverseC;
+    imp->combine_32_ca[PIXMAN_OP_ATOP] = vmxCombineAtopC;
+    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = vmxCombineAtopReverseC;
+    imp->combine_32_ca[PIXMAN_OP_XOR] = vmxCombineXorC;
+    imp->combine_32_ca[PIXMAN_OP_ADD] = vmxCombineAddC;
+    
+    return imp;
+}
+
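
To illustrate how these tables are consumed (a hypothetical caller,
not an actual pixman call site): component-alpha compositing goes
through combine_32_ca, unified alpha through combine_32, and every
combiner now receives the implementation and operator up front.

/* Hypothetical dispatch helper; the slot names match the diff above. */
static void
combine_span (pixman_implementation_t *imp, pixman_op_t op,
              int component_alpha,
              uint32_t *dest, const uint32_t *src,
              const uint32_t *mask, int width)
{
    if (component_alpha)
        imp->combine_32_ca[op] (imp, op, dest, src, mask, width);
    else
        imp->combine_32[op] (imp, op, dest, src, mask, width);
}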
diff --git a/pixman/pixman-vmx.h b/pixman/pixman-vmx.h
deleted file mode 100644
index 47885f4..0000000
--- a/pixman/pixman-vmx.h
+++ /dev/null
@@ -1,310 +0,0 @@
-/*
- * Copyright © 2007 Luca Barbato
- *
- * Permission to use, copy, modify, distribute, and sell this software and its
- * documentation for any purpose is hereby granted without fee, provided that
- * the above copyright notice appear in all copies and that both that
- * copyright notice and this permission notice appear in supporting
- * documentation, and that the name of Luca Barbato not be used in advertising or
- * publicity pertaining to distribution of the software without specific,
- * written prior permission.  Luca Barbato makes no representations about the
- * suitability of this software for any purpose.  It is provided "as is"
- * without express or implied warranty.
- *
- * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
- * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
- * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
- * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
- * SOFTWARE.
- *
- * Author:  Luca Barbato (lu_zero at gentoo.org)
- *
- * Based on work by Owen Taylor, Søren Sandmann and Lars Knoll
- */
-
-#include "pixman-private.h"
-
-#ifdef USE_VMX
-
-pixman_bool_t pixman_have_vmx(void);
-
-#else
-#define pixman_have_vmx() FALSE
-#endif
-
-#ifdef USE_VMX
-
-extern const FastPathInfo *const vmx_fast_paths;
-
-#define AVV(x...) {x}
-
-void fbComposeSetupVMX (void);
-
-#if 0
-void fbCompositeIn_nx8x8vmx (pixman_operator_t	op,
-			     pixman_image_t * pSrc,
-			     pixman_image_t * pMask,
-			     pixman_image_t * pDst,
-			     INT16      xSrc,
-			     INT16      ySrc,
-			     INT16      xMask,
-			     INT16      yMask,
-			     INT16      xDst,
-			     INT16      yDst,
-			     CARD16     width,
-			     CARD16     height);
-
-void fbCompositeSolidMask_nx8888x0565Cvmx (pixman_operator_t      op,
-					   pixman_image_t * pSrc,
-					   pixman_image_t * pMask,
-					   pixman_image_t * pDst,
-					   INT16      xSrc,
-					   INT16      ySrc,
-					   INT16      xMask,
-					   INT16      yMask,
-					   INT16      xDst,
-					   INT16      yDst,
-					   CARD16     width,
-					   CARD16     height);
-
-void fbCompositeSrcAdd_8888x8888vmx (pixman_operator_t	op,
-				     pixman_image_t *	pSrc,
-				     pixman_image_t *	pMask,
-				     pixman_image_t *	pDst,
-				     INT16	xSrc,
-				     INT16      ySrc,
-				     INT16      xMask,
-				     INT16      yMask,
-				     INT16      xDst,
-				     INT16      yDst,
-				     CARD16     width,
-				     CARD16     height);
-
-void fbCompositeSolidMask_nx8888x8888Cvmx (pixman_operator_t	op,
-					   pixman_image_t *	pSrc,
-					   pixman_image_t *	pMask,
-					   pixman_image_t *	pDst,
-					   INT16	xSrc,
-					   INT16	ySrc,
-					   INT16	xMask,
-					   INT16	yMask,
-					   INT16	xDst,
-					   INT16	yDst,
-					   CARD16	width,
-					   CARD16	height);
-
-void fbCompositeSolidMask_nx8x8888vmx (pixman_operator_t      op,
-				       pixman_image_t * pSrc,
-				       pixman_image_t * pMask,
-				       pixman_image_t * pDst,
-				       INT16      xSrc,
-				       INT16      ySrc,
-				       INT16      xMask,
-				       INT16      yMask,
-				       INT16      xDst,
-				       INT16      yDst,
-				       CARD16     width,
-				       CARD16     height);
-
-void fbCompositeSolidMaskSrc_nx8x8888vmx (pixman_operator_t      op,
-					  pixman_image_t * pSrc,
-					  pixman_image_t * pMask,
-					  pixman_image_t * pDst,
-					  INT16      xSrc,
-					  INT16      ySrc,
-					  INT16      xMask,
-					  INT16      yMask,
-					  INT16      xDst,
-					  INT16      yDst,
-					  CARD16     width,
-					  CARD16     height);
-
-void fbCompositeSrcAdd_8888x8x8vmx (pixman_operator_t   op,
-				    pixman_image_t * pSrc,
-				    pixman_image_t * pMask,
-				    pixman_image_t * pDst,
-				    INT16      xSrc,
-				    INT16      ySrc,
-				    INT16      xMask,
-				    INT16      yMask,
-				    INT16      xDst,
-				    INT16      yDst,
-				    CARD16     width,
-				    CARD16     height);
-
-void fbCompositeIn_8x8vmx (pixman_operator_t	op,
-			   pixman_image_t * pSrc,
-			   pixman_image_t * pMask,
-			   pixman_image_t * pDst,
-			   INT16      xSrc,
-			   INT16      ySrc,
-			   INT16      xMask,
-			   INT16      yMask,
-			   INT16      xDst,
-			   INT16      yDst,
-			   CARD16     width,
-			   CARD16     height);
-
-void fbCompositeSrcAdd_8000x8000vmx (pixman_operator_t	op,
-				     pixman_image_t * pSrc,
-				     pixman_image_t * pMask,
-				     pixman_image_t * pDst,
-				     INT16      xSrc,
-				     INT16      ySrc,
-				     INT16      xMask,
-				     INT16      yMask,
-				     INT16      xDst,
-				     INT16      yDst,
-				     CARD16     width,
-				     CARD16     height);
-
-void fbCompositeSrc_8888RevNPx8888vmx (pixman_operator_t      op,
-				       pixman_image_t * pSrc,
-				       pixman_image_t * pMask,
-				       pixman_image_t * pDst,
-				       INT16      xSrc,
-				       INT16      ySrc,
-				       INT16      xMask,
-				       INT16      yMask,
-				       INT16      xDst,
-				       INT16      yDst,
-				       CARD16     width,
-				       CARD16     height);
-
-void fbCompositeSrc_8888x0565vmx (pixman_operator_t      op,
-				  pixman_image_t * pSrc,
-				  pixman_image_t * pMask,
-				  pixman_image_t * pDst,
-				  INT16      xSrc,
-				  INT16      ySrc,
-				  INT16      xMask,
-				  INT16      yMask,
-				  INT16      xDst,
-				  INT16      yDst,
-				  CARD16     width,
-				  CARD16     height);
-
-void fbCompositeSrc_8888RevNPx0565vmx (pixman_operator_t      op,
-				       pixman_image_t * pSrc,
-				       pixman_image_t * pMask,
-				       pixman_image_t * pDst,
-				       INT16      xSrc,
-				       INT16      ySrc,
-				       INT16      xMask,
-				       INT16      yMask,
-				       INT16      xDst,
-				       INT16      yDst,
-				       CARD16     width,
-				       CARD16     height);
-
-void fbCompositeSolid_nx8888vmx (pixman_operator_t		op,
-				 pixman_image_t *	pSrc,
-				 pixman_image_t *	pMask,
-				 pixman_image_t *	pDst,
-				 INT16		xSrc,
-				 INT16		ySrc,
-				 INT16		xMask,
-				 INT16		yMask,
-				 INT16		xDst,
-				 INT16		yDst,
-				 CARD16		width,
-				 CARD16		height);
-
-void fbCompositeSolid_nx0565vmx (pixman_operator_t		op,
-				 pixman_image_t *	pSrc,
-				 pixman_image_t *	pMask,
-				 pixman_image_t *	pDst,
-				 INT16		xSrc,
-				 INT16		ySrc,
-				 INT16		xMask,
-				 INT16		yMask,
-				 INT16		xDst,
-				 INT16		yDst,
-				 CARD16		width,
-				 CARD16		height);
-
-void fbCompositeSolidMask_nx8x0565vmx (pixman_operator_t      op,
-				       pixman_image_t * pSrc,
-				       pixman_image_t * pMask,
-				       pixman_image_t * pDst,
-				       INT16      xSrc,
-				       INT16      ySrc,
-				       INT16      xMask,
-				       INT16      yMask,
-				       INT16      xDst,
-				       INT16      yDst,
-				       CARD16     width,
-				       CARD16     height);
-
-void fbCompositeSrc_x888x8x8888vmx (pixman_operator_t	op,
-				    pixman_image_t *  pSrc,
-				    pixman_image_t *  pMask,
-				    pixman_image_t *  pDst,
-				    INT16	xSrc,
-				    INT16	ySrc,
-				    INT16       xMask,
-				    INT16       yMask,
-				    INT16       xDst,
-				    INT16       yDst,
-				    CARD16      width,
-				    CARD16      height);
-
-void fbCompositeSrc_8888x8x8888vmx (pixman_operator_t	op,
-				    pixman_image_t *  pSrc,
-				    pixman_image_t *  pMask,
-				    pixman_image_t *  pDst,
-				    INT16	xSrc,
-				    INT16	ySrc,
-				    INT16       xMask,
-				    INT16       yMask,
-				    INT16       xDst,
-				    INT16       yDst,
-				    CARD16      width,
-				    CARD16      height);
-
-void fbCompositeSrc_8888x8888vmx (pixman_operator_t      op,
-				  pixman_image_t * pSrc,
-				  pixman_image_t * pMask,
-				  pixman_image_t * pDst,
-				  INT16      xSrc,
-				  INT16      ySrc,
-				  INT16      xMask,
-				  INT16      yMask,
-				  INT16      xDst,
-				  INT16      yDst,
-				  CARD16     width,
-				  CARD16     height);
-
-pixman_bool_t fbCopyAreavmx (FbPixels	*pSrc,
-		    FbPixels	*pDst,
-		    int		src_x,
-		    int		src_y,
-		    int		dst_x,
-		    int		dst_y,
-		    int		width,
-		    int		height);
-
-void fbCompositeCopyAreavmx (pixman_operator_t	op,
-			     pixman_image_t *	pSrc,
-			     pixman_image_t *	pMask,
-			     pixman_image_t *	pDst,
-			     INT16	xSrc,
-			     INT16      ySrc,
-			     INT16      xMask,
-			     INT16      yMask,
-			     INT16      xDst,
-			     INT16      yDst,
-			     CARD16     width,
-			     CARD16     height);
-
-pixman_bool_t fbSolidFillvmx (FbPixels	*pDraw,
-		     int		x,
-		     int		y,
-		     int		width,
-		     int		height,
-		     FbBits		xor);
-#endif
-#endif /* USE_VMX */
commit 0c92309a8ab887efd73737b627baca36d800c6f8
Author: Luca Barbato <lu_zero at gentoo.org>
Date:   Sun May 24 18:41:06 2009 +0200

    Update vmxCombine*U to the new interface
    
    Make the functions use different code paths depending on whether the
    mask is non-NULL. They could be sped up a bit more, but should perform
    as before. Conformance was checked with the cairo tests.
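
The new interface takes an optional per-pixel mask, so each
vmxCombine*U entry point now dispatches to a masked or unmasked body.
In scalar terms, the masked OVER path computes, per premultiplied
a8r8g8b8 pixel, dest = in(src, alpha(mask)) over dest. A plain-C
reference of that math follows (illustrative only; pixman's
FbByteMul/FbByteMulAdd macros perform the same rounded byte-wise
multiplies):

#include <stdint.h>

/* rounded (a * b) / 255, via the usual t + (t >> 8) trick */
static uint8_t
mul_un8 (uint8_t a, uint8_t b)
{
    uint16_t t = (uint16_t) a * b + 0x80;
    return (uint8_t) ((t + (t >> 8)) >> 8);
}

static uint32_t
over_u_masked (uint32_t dest, uint32_t src, uint32_t mask)
{
    uint8_t  m = mask >> 24;            /* Alpha (mask[i]) */
    uint32_t s = 0, result = 0;
    uint8_t  ia;
    int      shift;

    /* in(src, m): scale every channel of src by the mask's alpha */
    for (shift = 0; shift < 32; shift += 8)
        s |= (uint32_t) mul_un8 ((src >> shift) & 0xff, m) << shift;

    ia = 255 - (s >> 24);               /* Alpha (~s) */

    /* over: s + dest * (1 - alpha(s)), channel by channel */
    for (shift = 0; shift < 32; shift += 8)
    {
        uint32_t c = ((s >> shift) & 0xff)
                   + mul_un8 ((dest >> shift) & 0xff, ia);
        result |= (c > 0xff ? 0xff : c) << shift;
    }

    return result;
}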

diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index b681b7a..73b1df1 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -208,6 +208,12 @@ over (vector unsigned int src, vector unsigned int srca,
         tmp2 = (typeof(tmp2))vec_ld(15, mask); \
         v ## mask = (typeof(v ## mask)) \
                      vec_perm(tmp1, tmp2, mask ## _mask);
+
+#define LOAD_VECTORSM(dest, source, mask) \
+        LOAD_VECTORSC(dest, source, mask) \
+        v ## source = pix_multiply(v ## source, \
+                                   splat_alpha (v ## mask));
+
 #define STORE_VECTOR(dest) \
         edges = vec_perm (tmp4, tmp3, dest ## _mask); \
         tmp3 = vec_perm ((vector unsigned char)v ## dest, edges, store_mask); \
@@ -216,7 +222,7 @@ over (vector unsigned int src, vector unsigned int srca,
         vec_st ((vector unsigned int) tmp1, 0, dest );
 
 static FASTCALL void
-vmxCombineOverU (uint32_t *dest, const uint32_t *src, int width)
+vmxCombineOverUnomask (uint32_t *dest, const uint32_t *src, int width)
 {
     int i;
     vector unsigned int  vdest, vsrc;
@@ -248,9 +254,60 @@ vmxCombineOverU (uint32_t *dest, const uint32_t *src, int width)
     }
 }
 
+static FASTCALL void
+vmxCombineOverUmask (uint32_t *dest,
+                     const uint32_t *src,
+                     const uint32_t *mask,
+                     int width)
+{
+    int i;
+    vector unsigned int  vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+                         dest_mask, src_mask, mask_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKC(dest, src, mask)
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width/4; i > 0; i--) {
+
+        LOAD_VECTORSM(dest, src, mask);
+
+        vdest = over (vsrc, splat_alpha (vsrc), vdest);
+
+        STORE_VECTOR(dest)
+
+        src+=4;
+        dest+=4;
+        mask+=4;
+    }
+
+    for (i = width%4; --i >=0;) {
+        uint32_t m = Alpha (mask[i]);
+        uint32_t s = src[i];
+        uint32_t d = dest[i];
+        uint32_t ia;
+
+        FbByteMul (s, m);
+
+        ia = Alpha (~s);
+
+        FbByteMulAdd (d, ia, s);
+        dest[i] = d;
+    }
+}
+
+static FASTCALL void
+vmxCombineOverU(uint32_t *dest, const uint32_t *src, const uint32_t *mask,
+                int width)
+{
+    if (mask)
+        vmxCombineOverUmask(dest, src, mask, width);
+    else
+        vmxCombineOverUnomask(dest, src, width);
+}
 
 static FASTCALL void
-vmxCombineOverReverseU (uint32_t *dest, const uint32_t *src, int width)
+vmxCombineOverReverseUnomask (uint32_t *dest, const uint32_t *src, int width)
 {
     int i;
     vector unsigned int  vdest, vsrc;
@@ -283,7 +340,57 @@ vmxCombineOverReverseU (uint32_t *dest, const uint32_t *src, int width)
 }
 
 static FASTCALL void
-vmxCombineInU (uint32_t *dest, const uint32_t *src, int width)
+vmxCombineOverReverseUmask (uint32_t *dest,
+                            const uint32_t *src,
+                            const uint32_t *mask,
+                            int width)
+{
+    int i;
+    vector unsigned int  vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+                         dest_mask, src_mask, mask_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKC(dest, src, mask)
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width/4; i > 0; i--) {
+
+        LOAD_VECTORSM(dest, src, mask)
+
+        vdest = over (vdest, splat_alpha (vdest) , vsrc);
+
+        STORE_VECTOR(dest)
+
+        src+=4;
+        dest+=4;
+        mask+=4;
+    }
+
+    for (i = width%4; --i >=0;) {
+        uint32_t m = Alpha (mask[i]);
+        uint32_t s = src[i];
+        uint32_t d = dest[i];
+        uint32_t ia = Alpha (~dest[i]);
+
+        FbByteMul (s, m);
+
+        FbByteMulAdd (s, ia, d);
+        dest[i] = s;
+    }
+}
+
+static FASTCALL void
+vmxCombineOverReverseU (uint32_t *dest, const uint32_t *src,
+                        const uint32_t *mask, int width)
+{
+    if (mask)
+        vmxCombineOverReverseUmask(dest, src, mask, width);
+    else
+        vmxCombineOverReverseUnomask(dest, src, width);
+}
+
+static FASTCALL void
+vmxCombineInUnomask (uint32_t *dest, const uint32_t *src, int width)
 {
     int i;
     vector unsigned int  vdest, vsrc;
@@ -315,7 +422,56 @@ vmxCombineInU (uint32_t *dest, const uint32_t *src, int width)
 }
 
 static FASTCALL void
-vmxCombineInReverseU (uint32_t *dest, const uint32_t *src, int width)
+vmxCombineInUmask (uint32_t *dest,
+                   const uint32_t *src,
+                   const uint32_t *mask,
+                   int width)
+{
+    int i;
+    vector unsigned int  vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+                         dest_mask, src_mask, mask_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKC(dest, src, mask)
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width/4; i > 0; i--) {
+
+        LOAD_VECTORSM(dest, src, mask)
+
+        vdest = pix_multiply (vsrc, splat_alpha (vdest));
+
+        STORE_VECTOR(dest)
+
+        src+=4;
+        dest+=4;
+        mask+=4;
+    }
+
+    for (i = width%4; --i >=0;) {
+        uint32_t m = Alpha (mask[i]);
+        uint32_t s = src[i];
+        uint32_t a = Alpha (dest[i]);
+
+        FbByteMul (s, m);
+
+        FbByteMul (s, a);
+        dest[i] = s;
+    }
+}
+
+static FASTCALL void
+vmxCombineInU (uint32_t *dest, const uint32_t *src, const uint32_t *mask,
+               int width)
+{
+    if (mask)
+        vmxCombineInUmask(dest, src, mask, width);
+    else
+        vmxCombineInUnomask(dest, src, width);
+}
+
+static FASTCALL void
+vmxCombineInReverseUnomask (uint32_t *dest, const uint32_t *src, int width)
 {
     int i;
     vector unsigned int  vdest, vsrc;
@@ -346,7 +502,57 @@ vmxCombineInReverseU (uint32_t *dest, const uint32_t *src, int width)
 }
 
 static FASTCALL void
-vmxCombineOutU (uint32_t *dest, const uint32_t *src, int width)
+vmxCombineInReverseUmask (uint32_t *dest,
+                          const uint32_t *src,
+                          const uint32_t *mask,
+                          int width)
+{
+    int i;
+    vector unsigned int  vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+                         dest_mask, src_mask, mask_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKC(dest, src, mask)
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width/4; i > 0; i--) {
+
+        LOAD_VECTORSM(dest, src, mask)
+
+        vdest = pix_multiply (vdest, splat_alpha (vsrc));
+
+        STORE_VECTOR(dest)
+
+        src+=4;
+        dest+=4;
+        mask+=4;
+    }
+
+    for (i = width%4; --i >=0;) {
+        uint32_t m = Alpha (mask[i]);
+        uint32_t d = dest[i];
+        uint32_t a = src[i];
+
+        FbByteMul (a, m);
+
+        a = Alpha (a);
+        FbByteMul (d, a);
+        dest[i] = d;
+    }
+}
+
+static FASTCALL void
+vmxCombineInReverseU (uint32_t *dest, const uint32_t *src,
+                      const uint32_t *mask, int width)
+{
+    if (mask)
+        vmxCombineInReverseUmask(dest, src, mask, width);
+    else
+        vmxCombineInReverseUnomask(dest, src, width);
+}
+
+static FASTCALL void
+vmxCombineOutUnomask (uint32_t *dest, const uint32_t *src, int width)
 {
     int i;
     vector unsigned int  vdest, vsrc;
@@ -377,7 +583,56 @@ vmxCombineOutU (uint32_t *dest, const uint32_t *src, int width)
 }
 
 static FASTCALL void
-vmxCombineOutReverseU (uint32_t *dest, const uint32_t *src, int width)
+vmxCombineOutUmask (uint32_t *dest,
+                    const uint32_t *src,
+                    const uint32_t *mask,
+                    int width)
+{
+    int i;
+    vector unsigned int  vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+                         dest_mask, src_mask, mask_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKC(dest, src, mask)
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width/4; i > 0; i--) {
+
+        LOAD_VECTORSM(dest, src, mask)
+
+        vdest = pix_multiply (vsrc, splat_alpha (negate (vdest)));
+
+        STORE_VECTOR(dest)
+
+        src+=4;
+        dest+=4;
+        mask+=4;
+    }
+
+    for (i = width%4; --i >=0;) {
+        uint32_t m = Alpha (mask[i]);
+        uint32_t s = src[i];
+        uint32_t a = Alpha (~dest[i]);
+
+        FbByteMul (s, m);
+
+        FbByteMul (s, a);
+        dest[i] = s;
+    }
+}
+
+static FASTCALL void
+vmxCombineOutU (uint32_t *dest, const uint32_t *src, const uint32_t *mask,
+                int width)
+{
+    if (mask)
+        vmxCombineOutUmask(dest, src, mask, width);
+    else
+        vmxCombineOutUnomask(dest, src, width);
+}
+
+static FASTCALL void
+vmxCombineOutReverseUnomask (uint32_t *dest, const uint32_t *src, int width)
 {
     int i;
     vector unsigned int  vdest, vsrc;
@@ -408,7 +663,59 @@ vmxCombineOutReverseU (uint32_t *dest, const uint32_t *src, int width)
 }
 
 static FASTCALL void
-vmxCombineAtopU (uint32_t *dest, const uint32_t *src, int width)
+vmxCombineOutReverseUmask (uint32_t *dest,
+                           const uint32_t *src,
+                           const uint32_t *mask,
+                           int width)
+{
+    int i;
+    vector unsigned int  vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+                         dest_mask, src_mask, mask_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKC(dest, src, mask)
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width/4; i > 0; i--) {
+
+        LOAD_VECTORSM(dest, src, mask)
+
+        vdest = pix_multiply (vdest, splat_alpha (negate (vsrc)));
+
+        STORE_VECTOR(dest)
+
+        src+=4;
+        dest+=4;
+        mask+=4;
+    }
+
+    for (i = width%4; --i >=0;) {
+        uint32_t m = Alpha (mask[i]);
+        uint32_t d = dest[i];
+        uint32_t a = src[i];
+
+        FbByteMul (a, m);
+
+        a = Alpha (~a);
+        FbByteMul (d, a);
+        dest[i] = d;
+    }
+}
+
+static FASTCALL void
+vmxCombineOutReverseU (uint32_t *dest,
+                       const uint32_t *src,
+                       const uint32_t *mask,
+                       int width)
+{
+    if (mask)
+        vmxCombineOutReverseUmask(dest, src, mask, width);
+    else
+        vmxCombineOutReverseUnomask(dest, src, width);
+}
+
+static FASTCALL void
+vmxCombineAtopUnomask (uint32_t *dest, const uint32_t *src, int width)
 {
     int i;
     vector unsigned int  vdest, vsrc;
@@ -443,7 +750,63 @@ vmxCombineAtopU (uint32_t *dest, const uint32_t *src, int width)
 }
 
 static FASTCALL void
-vmxCombineAtopReverseU (uint32_t *dest, const uint32_t *src, int width)
+vmxCombineAtopUmask (uint32_t *dest,
+                     const uint32_t *src,
+                     const uint32_t *mask,
+                     int width)
+{
+    int i;
+    vector unsigned int  vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+                         dest_mask, src_mask, mask_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKC(dest, src, mask)
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width/4; i > 0; i--) {
+
+        LOAD_VECTORSM(dest, src, mask)
+
+        vdest = pix_add_mul (vsrc, splat_alpha (vdest),
+                            vdest, splat_alpha (negate (vsrc)));
+
+        STORE_VECTOR(dest)
+
+        src+=4;
+        dest+=4;
+        mask+=4;
+    }
+
+    for (i = width%4; --i >=0;) {
+        uint32_t m = Alpha (mask[i]);
+        uint32_t s = src[i];
+        uint32_t d = dest[i];
+        uint32_t dest_a = Alpha (d);
+        uint32_t src_ia;
+
+        FbByteMul (s, m);
+
+        src_ia = Alpha (~s);
+
+        FbByteAddMul (s, dest_a, d, src_ia);
+        dest[i] = s;
+    }
+}
+
+static FASTCALL void
+vmxCombineAtopU (uint32_t *dest,
+                 const uint32_t *src,
+                 const uint32_t *mask,
+                 int width)
+{
+    if (mask)
+        vmxCombineAtopUmask(dest, src, mask, width);
+    else
+        vmxCombineAtopUnomask(dest, src, width);
+}
+
+static FASTCALL void
+vmxCombineAtopReverseUnomask (uint32_t *dest, const uint32_t *src, int width)
 {
     int i;
     vector unsigned int  vdest, vsrc;
@@ -478,7 +841,63 @@ vmxCombineAtopReverseU (uint32_t *dest, const uint32_t *src, int width)
 }
 
 static FASTCALL void
-vmxCombineXorU (uint32_t *dest, const uint32_t *src, int width)
+vmxCombineAtopReverseUmask (uint32_t *dest,
+                            const uint32_t *src,
+                            const uint32_t *mask,
+                            int width)
+{
+    int i;
+    vector unsigned int  vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+                         dest_mask, src_mask, mask_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKC(dest, src, mask)
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width/4; i > 0; i--) {
+
+        LOAD_VECTORSM(dest, src, mask)
+
+        vdest = pix_add_mul (vdest, splat_alpha (vsrc),
+                            vsrc, splat_alpha (negate (vdest)));
+
+        STORE_VECTOR(dest)
+
+        src+=4;
+        dest+=4;
+        mask+=4;
+    }
+
+    for (i = width%4; --i >=0;) {
+        uint32_t m = Alpha (mask[i]);
+        uint32_t s = src[i];
+        uint32_t d = dest[i];
+        uint32_t src_a;
+        uint32_t dest_ia = Alpha (~d);
+
+        FbByteMul (s, m);
+
+        src_a = Alpha (s);
+
+        FbByteAddMul (s, dest_ia, d, src_a);
+        dest[i] = s;
+    }
+}
+
+static FASTCALL void
+vmxCombineAtopReverseU (uint32_t *dest,
+                        const uint32_t *src,
+                        const uint32_t *mask,
+                        int width)
+{
+    if (mask)
+        vmxCombineAtopReverseUmask(dest, src, mask, width);
+    else
+        vmxCombineAtopReverseUnomask(dest, src, width);
+}
+
+static FASTCALL void
+vmxCombineXorUnomask (uint32_t *dest, const uint32_t *src, int width)
 {
     int i;
     vector unsigned int  vdest, vsrc;
@@ -513,7 +932,63 @@ vmxCombineXorU (uint32_t *dest, const uint32_t *src, int width)
 }
 
 static FASTCALL void
-vmxCombineAddU (uint32_t *dest, const uint32_t *src, int width)
+vmxCombineXorUmask (uint32_t *dest,
+                    const uint32_t *src,
+                    const uint32_t *mask,
+                    int width)
+{
+    int i;
+    vector unsigned int  vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+                         dest_mask, src_mask, mask_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKC(dest, src, mask)
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width/4; i > 0; i--) {
+
+        LOAD_VECTORSM(dest, src, mask)
+
+        vdest = pix_add_mul (vsrc, splat_alpha (negate (vdest)),
+                            vdest, splat_alpha (negate (vsrc)));
+
+        STORE_VECTOR(dest)
+
+        src+=4;
+        dest+=4;
+        mask+=4;
+    }
+
+    for (i = width%4; --i >=0;) {
+        uint32_t m = Alpha (mask[i]);
+        uint32_t s = src[i];
+        uint32_t d = dest[i];
+        uint32_t src_ia;
+        uint32_t dest_ia = Alpha (~d);
+
+        FbByteMul (s, m);
+
+        src_ia = Alpha (~s);
+
+        FbByteAddMul (s, dest_ia, d, src_ia);
+        dest[i] = s;
+    }
+}
+
+static FASTCALL void
+vmxCombineXorU (uint32_t *dest,
+                const uint32_t *src,
+                const uint32_t *mask,
+                int width)
+{
+    if (mask)
+        vmxCombineXorUmask(dest, src, mask, width);
+    else
+        vmxCombineXorUnomask(dest, src, width);
+}
+
+static FASTCALL void
+vmxCombineAddUnomask (uint32_t *dest, const uint32_t *src, int width)
 {
     int i;
     vector unsigned int  vdest, vsrc;
@@ -543,6 +1018,57 @@ vmxCombineAddU (uint32_t *dest, const uint32_t *src, int width)
 }
 
 static FASTCALL void
+vmxCombineAddUmask (uint32_t *dest,
+                    const uint32_t *src,
+                    const uint32_t *mask,
+                    int width)
+{
+    int i;
+    vector unsigned int  vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+                         dest_mask, src_mask, mask_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKC(dest, src, mask)
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width/4; i > 0; i--) {
+
+        LOAD_VECTORSM(dest, src, mask)
+
+        vdest = pix_add (vsrc, vdest);
+
+        STORE_VECTOR(dest)
+
+        src+=4;
+        dest+=4;
+        mask+=4;
+    }
+
+    for (i = width%4; --i >=0;) {
+        uint32_t m = Alpha (mask[i]);
+        uint32_t s = src[i];
+        uint32_t d = dest[i];
+
+        FbByteMul (s, m);
+
+        FbByteAdd (d, s);
+        dest[i] = d;
+    }
+}
+
+static FASTCALL void
+vmxCombineAddU (uint32_t *dest,
+                const uint32_t *src,
+                const uint32_t *mask,
+                int width)
+{
+    if (mask)
+        vmxCombineAddUmask(dest, src, mask, width);
+    else
+        vmxCombineAddUnomask(dest, src, width);
+}
+
+static FASTCALL void
 vmxCombineSrcC (uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width)
 {
     int i;
@@ -1012,7 +1538,6 @@ void fbComposeSetupVMX (void)
 {
     /* check if we have VMX support and initialize accordingly */
     if (pixman_have_vmx ()) {
-#if 0
         pixman_composeFunctions.combineU[PIXMAN_OP_OVER] = vmxCombineOverU;
         pixman_composeFunctions.combineU[PIXMAN_OP_OVER_REVERSE] = vmxCombineOverReverseU;
         pixman_composeFunctions.combineU[PIXMAN_OP_IN] = vmxCombineInU;
@@ -1022,8 +1547,8 @@ void fbComposeSetupVMX (void)
         pixman_composeFunctions.combineU[PIXMAN_OP_ATOP] = vmxCombineAtopU;
         pixman_composeFunctions.combineU[PIXMAN_OP_ATOP_REVERSE] = vmxCombineAtopReverseU;
         pixman_composeFunctions.combineU[PIXMAN_OP_XOR] = vmxCombineXorU;
+
         pixman_composeFunctions.combineU[PIXMAN_OP_ADD] = vmxCombineAddU;
-#endif
 
         pixman_composeFunctions.combineC[PIXMAN_OP_SRC] = vmxCombineSrcC;
         pixman_composeFunctions.combineC[PIXMAN_OP_OVER] = vmxCombineOverC;
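
All of the combiners added here share one structure: a main loop over
width/4 vectors (using vec_perm-based shift masks to tolerate unaligned
pointers) followed by a scalar loop over the remaining width%4 pixels.
As one more plain-C reference, the masked ADD path reduces to a
saturating per-byte add of the mask-scaled source (illustrative only,
mirroring the FbByteMul + FbByteAdd tail loop above):

#include <stdint.h>

/* rounded (a * b) / 255, same helper as in the OVER reference */
static uint8_t
mul_un8 (uint8_t a, uint8_t b)
{
    uint16_t t = (uint16_t) a * b + 0x80;
    return (uint8_t) ((t + (t >> 8)) >> 8);
}

static void
combine_add_u_masked_ref (uint32_t *dest, const uint32_t *src,
                          const uint32_t *mask, int width)
{
    int i, shift;

    for (i = 0; i < width; i++)
    {
        uint8_t  m = mask[i] >> 24;     /* Alpha (mask[i]) */
        uint32_t d = 0;

        for (shift = 0; shift < 32; shift += 8)
        {
            /* dest + src * m, clamped per byte */
            uint16_t c = ((dest[i] >> shift) & 0xff)
                       + mul_un8 ((src[i] >> shift) & 0xff, m);

            d |= (uint32_t) (c > 0xff ? 0xff : c) << shift;
        }

        dest[i] = d;
    }
}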

