pixman: Branch 'master' - 8 commits

Søren Sandmann Pedersen sandmann at kemper.freedesktop.org
Tue Feb 22 02:30:44 PST 2011


 configure.ac                      |    2 
 pixman/pixman-sse2.c              | 1601 +++++++++++++-------------------------
 pixman/pixman-x64-mmx-emulation.h |  263 ------
 3 files changed, 572 insertions(+), 1294 deletions(-)

New commits:
commit 34a7ac047411d6c1f1708cb8dd4469cd1aa40b31
Author: Søren Sandmann Pedersen <ssp at redhat.com>
Date:   Fri Feb 18 07:38:49 2011 -0500

    sse2: Minor coding style cleanups.
    
    Also make pixman_fill_sse2() static.

diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index 0509613..88287b4 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -2587,7 +2587,8 @@ sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
 		mmx_dest = unpack_32_1x128 (d);
 
 		*pd = pack_1x128_32 (
-		    _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src), mmx_dest));
+		    _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
+				   mmx_dest));
 	    }
 
 	    pd++;
@@ -2635,7 +2636,8 @@ sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
 		mmx_dest = unpack_32_1x128 (d);
 
 		*pd = pack_1x128_32 (
-		    _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src), mmx_dest));
+		    _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
+				   mmx_dest));
 	    }
 
 	    pd++;
@@ -3333,7 +3335,7 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
 
 }
 
-pixman_bool_t
+static pixman_bool_t
 pixman_fill_sse2 (uint32_t *bits,
                   int       stride,
                   int       bpp,
@@ -4886,7 +4888,8 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
         while (w >= 4)
         {
             m = *(uint32_t*) mask;
-            xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);
+            xmm_src = _mm_or_si128 (
+		load_128_unaligned ((__m128i*)src), mask_ff000000);
 
             if (m == 0xffffffff)
             {
@@ -4902,9 +4905,12 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
 
-                expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+                expand_alpha_rev_2x128 (
+		    xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
 
-                in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
+                in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
+			       &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi,
+			       &xmm_dst_lo, &xmm_dst_hi);
 
                 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
             }
commit 10f69e5ec844e2630f8e5b21fd5392719d34d060
Author: Søren Sandmann Pedersen <ssp at redhat.com>
Date:   Fri Feb 18 07:40:02 2011 -0500

    sse2: Remove pixman-x64-mmx-emulation.h
    
    Also stop including mmintrin.h

diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index 286dea8..0509613 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -30,22 +30,12 @@
 #include <config.h>
 #endif
 
-#include <mmintrin.h>
 #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
 #include <emmintrin.h> /* for SSE2 intrinsics */
 #include "pixman-private.h"
 #include "pixman-combine32.h"
 #include "pixman-fast-path.h"
 
-#if defined(_MSC_VER) && defined(_M_AMD64)
-/* Windows 64 doesn't allow MMX to be used, so
- * the pixman-x64-mmx-emulation.h file contains
- * implementations of those MMX intrinsics that
- * are used in the SSE2 implementation.
- */
-#   include "pixman-x64-mmx-emulation.h"
-#endif
-
 static __m128i mask_0080;
 static __m128i mask_00ff;
 static __m128i mask_0101;
diff --git a/pixman/pixman-x64-mmx-emulation.h b/pixman/pixman-x64-mmx-emulation.h
deleted file mode 100644
index 378019c..0000000
--- a/pixman/pixman-x64-mmx-emulation.h
+++ /dev/null
@@ -1,263 +0,0 @@
-#ifndef MMX_X64_H_INCLUDED
-#define MMX_X64_H_INCLUDED
-
-/* Implementation of x64 MMX substitition functions, before
- * pixman is reimplemented not to use __m64 type on Visual C++
- *
- * Copyright (C)2009 by George Yohng
- * Released in public domain.
- */
-
-#include <intrin.h>
-
-#define M64C(a) (*(const __m64 *)(&a))
-#define M64U(a) (*(const unsigned long long *)(&a))
-
-__inline __m64
-_m_from_int (int a)
-{
-    long long i64 = a;
-
-    return M64C (i64);
-}
-
-__inline __m64
-_mm_setzero_si64 ()
-{
-    long long i64 = 0;
-
-    return M64C (i64);
-}
-
-__inline __m64
-_mm_set_pi32 (int i1,   int i0)
-{
-    unsigned long long i64 = ((unsigned)i0) + (((unsigned long long)(unsigned)i1) << 32);
-
-    return M64C (i64);
-}
-
-__inline void
-_m_empty ()
-{
-}
-
-__inline __m64
-_mm_set1_pi16 (short w)
-{
-    unsigned long long i64 = ((unsigned long long)(unsigned short)(w)) * 0x0001000100010001ULL;
-
-    return M64C (i64);
-}
-
-__inline int
-_m_to_int (__m64 m)
-{
-    return m.m64_i32[0];
-}
-
-__inline __m64
-_mm_movepi64_pi64 (__m128i a)
-{
-    return M64C (a.m128i_i64[0]);
-}
-
-__inline __m64
-_m_pand (__m64 a, __m64 b)
-{
-    unsigned long long i64 = M64U (a) & M64U (b);
-
-    return M64C (i64);
-}
-
-__inline __m64
-_m_por (__m64 a, __m64 b)
-{
-    unsigned long long i64 = M64U (a) | M64U (b);
-
-    return M64C (i64);
-}
-
-__inline __m64
-_m_pxor (__m64 a, __m64 b)
-{
-    unsigned long long i64 = M64U (a) ^ M64U (b);
-
-    return M64C (i64);
-}
-
-__inline __m64
-_m_pmulhuw (__m64 a, __m64 b)        /* unoptimized */
-{
-    unsigned short d[4] =
-    {
-	(unsigned short)((((unsigned)a.m64_u16[0]) * b.m64_u16[0]) >> 16),
-	(unsigned short)((((unsigned)a.m64_u16[1]) * b.m64_u16[1]) >> 16),
-	(unsigned short)((((unsigned)a.m64_u16[2]) * b.m64_u16[2]) >> 16),
-	(unsigned short)((((unsigned)a.m64_u16[3]) * b.m64_u16[3]) >> 16)
-    };
-
-    return M64C (d[0]);
-}
-
-__inline __m64
-_m_pmullw2 (__m64 a, __m64 b)        /* unoptimized */
-{
-    unsigned short d[4] =
-    {
-	(unsigned short)((((unsigned)a.m64_u16[0]) * b.m64_u16[0])),
-	(unsigned short)((((unsigned)a.m64_u16[1]) * b.m64_u16[1])),
-	(unsigned short)((((unsigned)a.m64_u16[2]) * b.m64_u16[2])),
-	(unsigned short)((((unsigned)a.m64_u16[3]) * b.m64_u16[3]))
-    };
-
-    return M64C (d[0]);
-}
-
-__inline __m64
-_m_pmullw (__m64 a, __m64 b)        /* unoptimized */
-{
-    unsigned long long x =
-	((unsigned long long)(unsigned short)((((unsigned)a.m64_u16[0]) * b.m64_u16[0])))  +
-	(((unsigned long long)(unsigned short)((((unsigned)a.m64_u16[1]) * b.m64_u16[1]))) << 16)  +
-	(((unsigned long long)(unsigned short)((((unsigned)a.m64_u16[2]) * b.m64_u16[2]))) << 32)  +
-	(((unsigned long long)(unsigned short)((((unsigned)a.m64_u16[3]) * b.m64_u16[3]))) << 48);
-
-    return M64C (x);
-}
-
-__inline __m64
-_m_paddusb (__m64 a, __m64 b)        /* unoptimized */
-{
-    unsigned long long x = (M64U (a) & 0x00FF00FF00FF00FFULL) +
-                           (M64U (b) & 0x00FF00FF00FF00FFULL);
-
-    unsigned long long y = ((M64U (a) >> 8) & 0x00FF00FF00FF00FFULL) +
-                           ((M64U (b) >> 8) & 0x00FF00FF00FF00FFULL);
-
-    x |= ((x & 0xFF00FF00FF00FF00ULL) >> 8) * 0xFF;
-    y |= ((y & 0xFF00FF00FF00FF00ULL) >> 8) * 0xFF;
-
-    x = (x & 0x00FF00FF00FF00FFULL) | ((y & 0x00FF00FF00FF00FFULL) << 8);
-
-    return M64C (x);
-}
-
-__inline __m64
-_m_paddusw (__m64 a, __m64 b)        /* unoptimized */
-{
-    unsigned long long x = (M64U (a) & 0x0000FFFF0000FFFFULL) +
-                           (M64U (b) & 0x0000FFFF0000FFFFULL);
-
-    unsigned long long y = ((M64U (a) >> 16) & 0x0000FFFF0000FFFFULL) +
-                           ((M64U (b) >> 16) & 0x0000FFFF0000FFFFULL);
-
-    x |= ((x & 0xFFFF0000FFFF0000) >> 16) * 0xFFFF;
-    y |= ((y & 0xFFFF0000FFFF0000) >> 16) * 0xFFFF;
-
-    x = (x & 0x0000FFFF0000FFFFULL) | ((y & 0x0000FFFF0000FFFFULL) << 16);
-
-    return M64C (x);
-}
-
-__inline __m64
-_m_pshufw (__m64 a, int n)         /* unoptimized */
-{
-    unsigned short d[4] =
-    {
-	a.m64_u16[n & 3],
-	a.m64_u16[(n >> 2) & 3],
-	a.m64_u16[(n >> 4) & 3],
-	a.m64_u16[(n >> 6) & 3]
-    };
-
-    return M64C (d[0]);
-}
-
-__inline unsigned char
-sat16 (unsigned short d)
-{
-    if (d > 0xFF) return 0xFF;
-    else return d & 0xFF;
-}
-
-__inline __m64
-_m_packuswb (__m64 m1, __m64 m2)          /* unoptimized */
-{
-    unsigned char d[8] =
-    {
-	sat16 (m1.m64_u16[0]),
-	sat16 (m1.m64_u16[1]),
-	sat16 (m1.m64_u16[2]),
-	sat16 (m1.m64_u16[3]),
-	sat16 (m2.m64_u16[0]),
-	sat16 (m2.m64_u16[1]),
-	sat16 (m2.m64_u16[2]),
-	sat16 (m2.m64_u16[3])
-    };
-
-    return M64C (d[0]);
-}
-
-__inline __m64 _m_punpcklbw (__m64 m1, __m64 m2)          /* unoptimized */
-{
-    unsigned char d[8] =
-    {
-	m1.m64_u8[0],
-	m2.m64_u8[0],
-	m1.m64_u8[1],
-	m2.m64_u8[1],
-	m1.m64_u8[2],
-	m2.m64_u8[2],
-	m1.m64_u8[3],
-	m2.m64_u8[3],
-    };
-
-    return M64C (d[0]);
-}
-
-__inline __m64 _m_punpckhbw (__m64 m1, __m64 m2)          /* unoptimized */
-{
-    unsigned char d[8] =
-    {
-	m1.m64_u8[4],
-	m2.m64_u8[4],
-	m1.m64_u8[5],
-	m2.m64_u8[5],
-	m1.m64_u8[6],
-	m2.m64_u8[6],
-	m1.m64_u8[7],
-	m2.m64_u8[7],
-    };
-
-    return M64C (d[0]);
-}
-
-__inline __m64 _m_psrlwi (__m64 a, int n)       /* unoptimized */
-{
-    unsigned short d[4] =
-    {
-	a.m64_u16[0] >> n,
-	a.m64_u16[1] >> n,
-	a.m64_u16[2] >> n,
-	a.m64_u16[3] >> n
-    };
-
-    return M64C (d[0]);
-}
-
-__inline __m64 _m_psrlqi (__m64 m, int n)
-{
-    unsigned long long x = M64U (m) >> n;
-
-    return M64C (x);
-}
-
-__inline __m64 _m_psllqi (__m64 m, int n)
-{
-    unsigned long long x = M64U (m) << n;
-
-    return M64C (x);
-}
-
-#endif /* MMX_X64_H_INCLUDED */
commit 984be4def2e62a05e9a91e77ac8c703fed30718b
Author: Søren Sandmann Pedersen <ssp at redhat.com>
Date:   Fri Feb 18 07:38:03 2011 -0500

    sse2: Delete obsolete or redundant comments

diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index 0753b6d..286dea8 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -46,12 +46,6 @@
 #   include "pixman-x64-mmx-emulation.h"
 #endif
 
-#ifdef USE_SSE2
-
-/* --------------------------------------------------------------------
- * Locals
- */
-
 static __m128i mask_0080;
 static __m128i mask_00ff;
 static __m128i mask_0101;
@@ -69,9 +63,6 @@ static __m128i mask_blue;
 static __m128i mask_565_fix_rb;
 static __m128i mask_565_fix_g;
 
-/* ----------------------------------------------------------------------
- * SSE2 Inlines
- */
 static force_inline __m128i
 unpack_32_1x128 (uint32_t data)
 {
@@ -389,10 +380,6 @@ save_128_unaligned (__m128i* dst,
     _mm_storeu_si128 (dst, data);
 }
 
-/* ------------------------------------------------------------------
- * MMX inlines
- */
-
 static force_inline __m128i
 load_32_1x128 (uint32_t data)
 {
@@ -486,9 +473,6 @@ expand565_16_1x128 (uint16_t pixel)
     return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ());
 }
 
-/* ----------------------------------------------------------------------------
- * Compose Core transformations
- */
 static force_inline uint32_t
 core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
 {
@@ -2365,9 +2349,6 @@ sse2_combine_add_ca (pixman_implementation_t *imp,
     }
 }
 
-/* ---------------------------------------------------
- * fb_compose_setup_sSE2
- */
 static force_inline __m128i
 create_mask_16_128 (uint16_t mask)
 {
@@ -2387,10 +2368,6 @@ create_mask_2x32_128 (uint32_t mask0,
 }
 #endif
 
-/* -------------------------------------------------------------------
- * composite_over_n_8888
- */
-
 static void
 sse2_composite_over_n_8888 (pixman_implementation_t *imp,
                             pixman_op_t              op,
@@ -2470,9 +2447,6 @@ sse2_composite_over_n_8888 (pixman_implementation_t *imp,
     }
 }
 
-/* ---------------------------------------------------------------------
- * composite_over_n_0565
- */
 static void
 sse2_composite_over_n_0565 (pixman_implementation_t *imp,
                             pixman_op_t              op,
@@ -2558,9 +2532,6 @@ sse2_composite_over_n_0565 (pixman_implementation_t *imp,
 
 }
 
-/* ------------------------------
- * composite_add_n_8888_8888_ca
- */
 static void
 sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
 				   pixman_op_t              op,
@@ -2684,10 +2655,6 @@ sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
 
 }
 
-/* ---------------------------------------------------------------------------
- * composite_over_n_8888_8888_ca
- */
-
 static void
 sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
                                     pixman_op_t              op,
@@ -2811,10 +2778,6 @@ sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
 
 }
 
-/*---------------------------------------------------------------------
- * composite_over_8888_n_8888
- */
-
 static void
 sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
                                  pixman_op_t              op,
@@ -2929,10 +2892,6 @@ sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
 
 }
 
-/*---------------------------------------------------------------------
- * composite_over_8888_n_8888
- */
-
 static void
 sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
 			      pixman_op_t              op,
@@ -3001,9 +2960,6 @@ sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
 
 }
 
-/* ---------------------------------------------------------------------
- * composite_over_x888_n_8888
- */
 static void
 sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
                                  pixman_op_t              op,
@@ -3105,9 +3061,6 @@ sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
 
 }
 
-/* --------------------------------------------------------------------
- * composite_over_8888_8888
- */
 static void
 sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
                                pixman_op_t              op,
@@ -3144,9 +3097,6 @@ sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
     }
 }
 
-/* ------------------------------------------------------------------
- * composite_over_8888_0565
- */
 static force_inline uint16_t
 composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
 {
@@ -3188,15 +3138,6 @@ sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
     PIXMAN_IMAGE_GET_LINE (
 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
 
-#if 0
-    /* FIXME
-     *
-     * I copy the code from MMX one and keep the fixme.
-     * If it's a problem there, probably is a problem here.
-     */
-    assert (src_image->drawable == mask_image->drawable);
-#endif
-
     while (height--)
     {
 	dst = dst_line;
@@ -3271,10 +3212,6 @@ sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
 
 }
 
-/* -----------------------------------------------------------------
- * composite_over_n_8_8888
- */
-
 static void
 sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
                               pixman_op_t              op,
@@ -3406,10 +3343,6 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
 
 }
 
-/* ----------------------------------------------------------------
- * composite_over_n_8_8888
- */
-
 pixman_bool_t
 pixman_fill_sse2 (uint32_t *bits,
                   int       stride,
@@ -3688,10 +3621,6 @@ sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
 
 }
 
-/*-----------------------------------------------------------------------
- * composite_over_n_8_0565
- */
-
 static void
 sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
                               pixman_op_t              op,
@@ -3839,10 +3768,6 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
 
 }
 
-/* -----------------------------------------------------------------------
- * composite_over_pixbuf_0565
- */
-
 static void
 sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
                                  pixman_op_t              op,
@@ -3873,15 +3798,6 @@ sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
     PIXMAN_IMAGE_GET_LINE (
 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
 
-#if 0
-    /* FIXME
-     *
-     * I copy the code from MMX one and keep the fixme.
-     * If it's a problem there, probably is a problem here.
-     */
-    assert (src_image->drawable == mask_image->drawable);
-#endif
-
     while (height--)
     {
 	dst = dst_line;
@@ -3972,10 +3888,6 @@ sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
 
 }
 
-/* -------------------------------------------------------------------------
- * composite_over_pixbuf_8888
- */
-
 static void
 sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
                                  pixman_op_t              op,
@@ -4005,15 +3917,6 @@ sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
     PIXMAN_IMAGE_GET_LINE (
 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
 
-#if 0
-    /* FIXME
-     *
-     * I copy the code from MMX one and keep the fixme.
-     * If it's a problem there, probably is a problem here.
-     */
-    assert (src_image->drawable == mask_image->drawable);
-#endif
-
     while (height--)
     {
 	dst = dst_line;
@@ -4084,10 +3987,6 @@ sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
 
 }
 
-/* -------------------------------------------------------------------------------------------------
- * composite_over_n_8888_0565_ca
- */
-
 static void
 sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
                                     pixman_op_t              op,
@@ -4232,10 +4131,6 @@ sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
 
 }
 
-/* -----------------------------------------------------------------------
- * composite_in_n_8_8
- */
-
 static void
 sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
                          pixman_op_t              op,
@@ -4335,10 +4230,6 @@ sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
 
 }
 
-/* -----------------------------------------------------------------------
- * composite_in_n_8
- */
-
 static void
 sse2_composite_in_n_8 (pixman_implementation_t *imp,
 		       pixman_op_t              op,
@@ -4431,10 +4322,6 @@ sse2_composite_in_n_8 (pixman_implementation_t *imp,
 
 }
 
-/* ---------------------------------------------------------------------------
- * composite_in_8_8
- */
-
 static void
 sse2_composite_in_8_8 (pixman_implementation_t *imp,
                        pixman_op_t              op,
@@ -4516,10 +4403,6 @@ sse2_composite_in_8_8 (pixman_implementation_t *imp,
 
 }
 
-/* -------------------------------------------------------------------------
- * composite_add_n_8_8
- */
-
 static void
 sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
 			  pixman_op_t              op,
@@ -4619,10 +4502,6 @@ sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
 
 }
 
-/* -------------------------------------------------------------------------
- * composite_add_n_8_8
- */
-
 static void
 sse2_composite_add_n_8 (pixman_implementation_t *imp,
 			pixman_op_t              op,
@@ -4706,10 +4585,6 @@ sse2_composite_add_n_8 (pixman_implementation_t *imp,
 
 }
 
-/* ----------------------------------------------------------------------
- * composite_add_8_8
- */
-
 static void
 sse2_composite_add_8_8 (pixman_implementation_t *imp,
 			pixman_op_t              op,
@@ -4772,9 +4647,6 @@ sse2_composite_add_8_8 (pixman_implementation_t *imp,
 
 }
 
-/* ---------------------------------------------------------------------
- * composite_add_8888_8888
- */
 static void
 sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
                               pixman_op_t              op,
@@ -4811,10 +4683,6 @@ sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
 
 }
 
-/* -------------------------------------------------------------------------------------------------
- * sse2_composite_copy_area
- */
-
 static pixman_bool_t
 pixman_blt_sse2 (uint32_t *src_bits,
                  uint32_t *dst_bits,
@@ -6066,10 +5934,7 @@ _pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
     mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
     mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
 
-
     /* Set up function pointers */
-
-    /* SSE code patch for fbcompose.c */
     imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
     imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
     imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
@@ -6102,5 +5967,3 @@ _pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
 
     return imp;
 }
-
-#endif /* USE_SSE2 */
commit 33d98902261ad73c1b6b1366968e49a1cb2bf68b
Author: Søren Sandmann Pedersen <ssp at redhat.com>
Date:   Fri Feb 18 07:07:45 2011 -0500

    sse2: Remove all the core_combine_* functions
    
    Now that _mm_empty() is not used anymore, they are no longer different
    from the sse2_combine_* functions, so they can be consolidated.
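
For readers skimming the patch, a condensed before/after sketch of the consolidation may help. This is an editorial illustration only, distilled from the hunks below; force_inline, pixman_implementation_t, pixman_op_t and the core_combine_over_u_sse2_mask/no_mask helpers all come from pixman-sse2.c and pixman-private.h.

    /* Before (once the previous commit had dropped _mm_empty ()), each
     * combiner was a force_inline core routine plus a wrapper whose only
     * remaining job was to forward its arguments: */
    static force_inline void
    core_combine_over_u_sse2 (uint32_t *pd, const uint32_t *ps,
                              const uint32_t *pm, int w)
    {
        if (pm)
            core_combine_over_u_sse2_mask (pd, ps, pm, w);
        else
            core_combine_over_u_sse2_no_mask (pd, ps, w);
    }

    static void
    sse2_combine_over_u (pixman_implementation_t *imp, pixman_op_t op,
                         uint32_t *dst, const uint32_t *src,
                         const uint32_t *mask, int width)
    {
        core_combine_over_u_sse2 (dst, src, mask, width);
        /* _mm_empty () used to follow here */
    }

    /* After: the core routine takes the combine-function signature
     * directly and the wrapper disappears: */
    static force_inline void
    sse2_combine_over_u (pixman_implementation_t *imp, pixman_op_t op,
                         uint32_t *pd, const uint32_t *ps,
                         const uint32_t *pm, int w)
    {
        if (pm)
            core_combine_over_u_sse2_mask (pd, ps, pm, w);
        else
            core_combine_over_u_sse2_no_mask (pd, ps, w);
    }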

diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index f581727..0753b6d 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -724,10 +724,12 @@ core_combine_over_u_sse2_no_mask (uint32_t *	  pd,
 }
 
 static force_inline void
-core_combine_over_u_sse2 (uint32_t*       pd,
-                          const uint32_t* ps,
-                          const uint32_t* pm,
-                          int             w)
+sse2_combine_over_u (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               pd,
+                     const uint32_t *         ps,
+                     const uint32_t *         pm,
+                     int                      w)
 {
     if (pm)
 	core_combine_over_u_sse2_mask (pd, ps, pm, w);
@@ -735,11 +737,13 @@ core_combine_over_u_sse2 (uint32_t*       pd,
 	core_combine_over_u_sse2_no_mask (pd, ps, w);
 }
 
-static force_inline void
-core_combine_over_reverse_u_sse2 (uint32_t*       pd,
-                                  const uint32_t* ps,
-                                  const uint32_t* pm,
-                                  int             w)
+static void
+sse2_combine_over_reverse_u (pixman_implementation_t *imp,
+                             pixman_op_t              op,
+                             uint32_t *               pd,
+                             const uint32_t *         ps,
+                             const uint32_t *         pm,
+                             int                      w)
 {
     uint32_t s, d;
 
@@ -823,11 +827,13 @@ core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst)
     return dst;
 }
 
-static force_inline void
-core_combine_in_u_sse2 (uint32_t*       pd,
-                        const uint32_t* ps,
-                        const uint32_t* pm,
-                        int             w)
+static void
+sse2_combine_in_u (pixman_implementation_t *imp,
+                   pixman_op_t              op,
+                   uint32_t *               pd,
+                   const uint32_t *         ps,
+                   const uint32_t *         pm,
+                   int                      w)
 {
     uint32_t s, d;
 
@@ -882,11 +888,13 @@ core_combine_in_u_sse2 (uint32_t*       pd,
     }
 }
 
-static force_inline void
-core_combine_reverse_in_u_sse2 (uint32_t*       pd,
-                                const uint32_t* ps,
-                                const uint32_t *pm,
-                                int             w)
+static void
+sse2_combine_in_reverse_u (pixman_implementation_t *imp,
+                           pixman_op_t              op,
+                           uint32_t *               pd,
+                           const uint32_t *         ps,
+                           const uint32_t *         pm,
+                           int                      w)
 {
     uint32_t s, d;
 
@@ -941,11 +949,13 @@ core_combine_reverse_in_u_sse2 (uint32_t*       pd,
     }
 }
 
-static force_inline void
-core_combine_reverse_out_u_sse2 (uint32_t*       pd,
-                                 const uint32_t* ps,
-                                 const uint32_t* pm,
-                                 int             w)
+static void
+sse2_combine_out_reverse_u (pixman_implementation_t *imp,
+                            pixman_op_t              op,
+                            uint32_t *               pd,
+                            const uint32_t *         ps,
+                            const uint32_t *         pm,
+                            int                      w)
 {
     while (w && ((unsigned long) pd & 15))
     {
@@ -1008,11 +1018,13 @@ core_combine_reverse_out_u_sse2 (uint32_t*       pd,
     }
 }
 
-static force_inline void
-core_combine_out_u_sse2 (uint32_t*       pd,
-                         const uint32_t* ps,
-                         const uint32_t* pm,
-                         int             w)
+static void
+sse2_combine_out_u (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               pd,
+                    const uint32_t *         ps,
+                    const uint32_t *         pm,
+                    int                      w)
 {
     while (w && ((unsigned long) pd & 15))
     {
@@ -1086,11 +1098,13 @@ core_combine_atop_u_pixel_sse2 (uint32_t src,
     return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
 }
 
-static force_inline void
-core_combine_atop_u_sse2 (uint32_t*       pd,
-                          const uint32_t* ps,
-                          const uint32_t* pm,
-                          int             w)
+static void
+sse2_combine_atop_u (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               pd,
+                     const uint32_t *         ps,
+                     const uint32_t *         pm,
+                     int                      w)
 {
     uint32_t s, d;
 
@@ -1168,11 +1182,13 @@ core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
     return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
 }
 
-static force_inline void
-core_combine_reverse_atop_u_sse2 (uint32_t*       pd,
-                                  const uint32_t* ps,
-                                  const uint32_t* pm,
-                                  int             w)
+static void
+sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
+                             pixman_op_t              op,
+                             uint32_t *               pd,
+                             const uint32_t *         ps,
+                             const uint32_t *         pm,
+                             int                      w)
 {
     uint32_t s, d;
 
@@ -1250,11 +1266,13 @@ core_combine_xor_u_pixel_sse2 (uint32_t src,
     return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s));
 }
 
-static force_inline void
-core_combine_xor_u_sse2 (uint32_t*       dst,
-                         const uint32_t* src,
-                         const uint32_t *mask,
-                         int             width)
+static void
+sse2_combine_xor_u (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dst,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
 {
     int w = width;
     uint32_t s, d;
@@ -1326,10 +1344,12 @@ core_combine_xor_u_sse2 (uint32_t*       dst,
 }
 
 static force_inline void
-core_combine_add_u_sse2 (uint32_t*       dst,
-                         const uint32_t* src,
-                         const uint32_t* mask,
-                         int             width)
+sse2_combine_add_u (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dst,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
 {
     int w = width;
     uint32_t s, d;
@@ -1397,11 +1417,13 @@ core_combine_saturate_u_pixel_sse2 (uint32_t src,
     return pack_1x128_32 (_mm_adds_epu16 (md, ms));
 }
 
-static force_inline void
-core_combine_saturate_u_sse2 (uint32_t *      pd,
-                              const uint32_t *ps,
-                              const uint32_t *pm,
-                              int             w)
+static void
+sse2_combine_saturate_u (pixman_implementation_t *imp,
+                         pixman_op_t              op,
+                         uint32_t *               pd,
+                         const uint32_t *         ps,
+                         const uint32_t *         pm,
+                         int                      w)
 {
     uint32_t s, d;
 
@@ -1482,11 +1504,13 @@ core_combine_saturate_u_sse2 (uint32_t *      pd,
     }
 }
 
-static force_inline void
-core_combine_src_ca_sse2 (uint32_t*       pd,
-                          const uint32_t* ps,
-                          const uint32_t *pm,
-                          int             w)
+static void
+sse2_combine_src_ca (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               pd,
+                     const uint32_t *         ps,
+                     const uint32_t *         pm,
+                     int                      w)
 {
     uint32_t s, m;
 
@@ -1547,11 +1571,13 @@ core_combine_over_ca_pixel_sse2 (uint32_t src,
     return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst));
 }
 
-static force_inline void
-core_combine_over_ca_sse2 (uint32_t*       pd,
-                           const uint32_t* ps,
-                           const uint32_t *pm,
-                           int             w)
+static void
+sse2_combine_over_ca (pixman_implementation_t *imp,
+                      pixman_op_t              op,
+                      uint32_t *               pd,
+                      const uint32_t *         ps,
+                      const uint32_t *         pm,
+                      int                      w)
 {
     uint32_t s, m, d;
 
@@ -1621,11 +1647,13 @@ core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
 					unpack_32_1x128 (mask))));
 }
 
-static force_inline void
-core_combine_over_reverse_ca_sse2 (uint32_t*       pd,
-                                   const uint32_t* ps,
-                                   const uint32_t *pm,
-                                   int             w)
+static void
+sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
+                              pixman_op_t              op,
+                              uint32_t *               pd,
+                              const uint32_t *         ps,
+                              const uint32_t *         pm,
+                              int                      w)
 {
     uint32_t s, m, d;
 
@@ -1684,11 +1712,13 @@ core_combine_over_reverse_ca_sse2 (uint32_t*       pd,
     }
 }
 
-static force_inline void
-core_combine_in_ca_sse2 (uint32_t *      pd,
-                         const uint32_t *ps,
-                         const uint32_t *pm,
-                         int             w)
+static void
+sse2_combine_in_ca (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               pd,
+                    const uint32_t *         ps,
+                    const uint32_t *         pm,
+                    int                      w)
 {
     uint32_t s, m, d;
 
@@ -1757,11 +1787,13 @@ core_combine_in_ca_sse2 (uint32_t *      pd,
     }
 }
 
-static force_inline void
-core_combine_in_reverse_ca_sse2 (uint32_t *      pd,
-                                 const uint32_t *ps,
-                                 const uint32_t *pm,
-                                 int             w)
+static void
+sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
+                            pixman_op_t              op,
+                            uint32_t *               pd,
+                            const uint32_t *         ps,
+                            const uint32_t *         pm,
+                            int                      w)
 {
     uint32_t s, m, d;
 
@@ -1828,11 +1860,13 @@ core_combine_in_reverse_ca_sse2 (uint32_t *      pd,
     }
 }
 
-static force_inline void
-core_combine_out_ca_sse2 (uint32_t *      pd,
-                          const uint32_t *ps,
-                          const uint32_t *pm,
-                          int             w)
+static void
+sse2_combine_out_ca (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               pd,
+                     const uint32_t *         ps,
+                     const uint32_t *         pm,
+                     int                      w)
 {
     uint32_t s, m, d;
 
@@ -1902,11 +1936,13 @@ core_combine_out_ca_sse2 (uint32_t *      pd,
     }
 }
 
-static force_inline void
-core_combine_out_reverse_ca_sse2 (uint32_t *      pd,
-                                  const uint32_t *ps,
-                                  const uint32_t *pm,
-                                  int             w)
+static void
+sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
+                             pixman_op_t              op,
+                             uint32_t *               pd,
+                             const uint32_t *         ps,
+                             const uint32_t *         pm,
+                             int                      w)
 {
     uint32_t s, m, d;
 
@@ -1996,11 +2032,13 @@ core_combine_atop_ca_pixel_sse2 (uint32_t src,
     return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
 }
 
-static force_inline void
-core_combine_atop_ca_sse2 (uint32_t *      pd,
-                           const uint32_t *ps,
-                           const uint32_t *pm,
-                           int             w)
+static void
+sse2_combine_atop_ca (pixman_implementation_t *imp,
+                      pixman_op_t              op,
+                      uint32_t *               pd,
+                      const uint32_t *         ps,
+                      const uint32_t *         pm,
+                      int                      w)
 {
     uint32_t s, m, d;
 
@@ -2087,11 +2125,13 @@ core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
     return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
 }
 
-static force_inline void
-core_combine_reverse_atop_ca_sse2 (uint32_t *      pd,
-                                   const uint32_t *ps,
-                                   const uint32_t *pm,
-                                   int             w)
+static void
+sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
+                              pixman_op_t              op,
+                              uint32_t *               pd,
+                              const uint32_t *         ps,
+                              const uint32_t *         pm,
+                              int                      w)
 {
     uint32_t s, m, d;
 
@@ -2181,11 +2221,13 @@ core_combine_xor_ca_pixel_sse2 (uint32_t src,
                                                 &alpha_src));
 }
 
-static force_inline void
-core_combine_xor_ca_sse2 (uint32_t *      pd,
-                          const uint32_t *ps,
-                          const uint32_t *pm,
-                          int             w)
+static void
+sse2_combine_xor_ca (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               pd,
+                     const uint32_t *         ps,
+                     const uint32_t *         pm,
+                     int                      w)
 {
     uint32_t s, m, d;
 
@@ -2257,11 +2299,13 @@ core_combine_xor_ca_sse2 (uint32_t *      pd,
     }
 }
 
-static force_inline void
-core_combine_add_ca_sse2 (uint32_t *      pd,
-                          const uint32_t *ps,
-                          const uint32_t *pm,
-                          int             w)
+static void
+sse2_combine_add_ca (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               pd,
+                     const uint32_t *         ps,
+                     const uint32_t *         pm,
+                     int                      w)
 {
     uint32_t s, m, d;
 
@@ -2343,250 +2387,6 @@ create_mask_2x32_128 (uint32_t mask0,
 }
 #endif
 
-/* SSE2 code patch for fbcompose.c */
-
-static void
-sse2_combine_over_u (pixman_implementation_t *imp,
-                     pixman_op_t              op,
-                     uint32_t *               dst,
-                     const uint32_t *         src,
-                     const uint32_t *         mask,
-                     int                      width)
-{
-    core_combine_over_u_sse2 (dst, src, mask, width);
-}
-
-static void
-sse2_combine_over_reverse_u (pixman_implementation_t *imp,
-                             pixman_op_t              op,
-                             uint32_t *               dst,
-                             const uint32_t *         src,
-                             const uint32_t *         mask,
-                             int                      width)
-{
-    core_combine_over_reverse_u_sse2 (dst, src, mask, width);
-}
-
-static void
-sse2_combine_in_u (pixman_implementation_t *imp,
-                   pixman_op_t              op,
-                   uint32_t *               dst,
-                   const uint32_t *         src,
-                   const uint32_t *         mask,
-                   int                      width)
-{
-    core_combine_in_u_sse2 (dst, src, mask, width);
-}
-
-static void
-sse2_combine_in_reverse_u (pixman_implementation_t *imp,
-                           pixman_op_t              op,
-                           uint32_t *               dst,
-                           const uint32_t *         src,
-                           const uint32_t *         mask,
-                           int                      width)
-{
-    core_combine_reverse_in_u_sse2 (dst, src, mask, width);
-}
-
-static void
-sse2_combine_out_u (pixman_implementation_t *imp,
-                    pixman_op_t              op,
-                    uint32_t *               dst,
-                    const uint32_t *         src,
-                    const uint32_t *         mask,
-                    int                      width)
-{
-    core_combine_out_u_sse2 (dst, src, mask, width);
-}
-
-static void
-sse2_combine_out_reverse_u (pixman_implementation_t *imp,
-                            pixman_op_t              op,
-                            uint32_t *               dst,
-                            const uint32_t *         src,
-                            const uint32_t *         mask,
-                            int                      width)
-{
-    core_combine_reverse_out_u_sse2 (dst, src, mask, width);
-}
-
-static void
-sse2_combine_atop_u (pixman_implementation_t *imp,
-                     pixman_op_t              op,
-                     uint32_t *               dst,
-                     const uint32_t *         src,
-                     const uint32_t *         mask,
-                     int                      width)
-{
-    core_combine_atop_u_sse2 (dst, src, mask, width);
-}
-
-static void
-sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
-                             pixman_op_t              op,
-                             uint32_t *               dst,
-                             const uint32_t *         src,
-                             const uint32_t *         mask,
-                             int                      width)
-{
-    core_combine_reverse_atop_u_sse2 (dst, src, mask, width);
-}
-
-static void
-sse2_combine_xor_u (pixman_implementation_t *imp,
-                    pixman_op_t              op,
-                    uint32_t *               dst,
-                    const uint32_t *         src,
-                    const uint32_t *         mask,
-                    int                      width)
-{
-    core_combine_xor_u_sse2 (dst, src, mask, width);
-}
-
-static void
-sse2_combine_add_u (pixman_implementation_t *imp,
-                    pixman_op_t              op,
-                    uint32_t *               dst,
-                    const uint32_t *         src,
-                    const uint32_t *         mask,
-                    int                      width)
-{
-    core_combine_add_u_sse2 (dst, src, mask, width);
-}
-
-static void
-sse2_combine_saturate_u (pixman_implementation_t *imp,
-                         pixman_op_t              op,
-                         uint32_t *               dst,
-                         const uint32_t *         src,
-                         const uint32_t *         mask,
-                         int                      width)
-{
-    core_combine_saturate_u_sse2 (dst, src, mask, width);
-}
-
-static void
-sse2_combine_src_ca (pixman_implementation_t *imp,
-                     pixman_op_t              op,
-                     uint32_t *               dst,
-                     const uint32_t *         src,
-                     const uint32_t *         mask,
-                     int                      width)
-{
-    core_combine_src_ca_sse2 (dst, src, mask, width);
-}
-
-static void
-sse2_combine_over_ca (pixman_implementation_t *imp,
-                      pixman_op_t              op,
-                      uint32_t *               dst,
-                      const uint32_t *         src,
-                      const uint32_t *         mask,
-                      int                      width)
-{
-    core_combine_over_ca_sse2 (dst, src, mask, width);
-}
-
-static void
-sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
-                              pixman_op_t              op,
-                              uint32_t *               dst,
-                              const uint32_t *         src,
-                              const uint32_t *         mask,
-                              int                      width)
-{
-    core_combine_over_reverse_ca_sse2 (dst, src, mask, width);
-}
-
-static void
-sse2_combine_in_ca (pixman_implementation_t *imp,
-                    pixman_op_t              op,
-                    uint32_t *               dst,
-                    const uint32_t *         src,
-                    const uint32_t *         mask,
-                    int                      width)
-{
-    core_combine_in_ca_sse2 (dst, src, mask, width);
-}
-
-static void
-sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
-                            pixman_op_t              op,
-                            uint32_t *               dst,
-                            const uint32_t *         src,
-                            const uint32_t *         mask,
-                            int                      width)
-{
-    core_combine_in_reverse_ca_sse2 (dst, src, mask, width);
-}
-
-static void
-sse2_combine_out_ca (pixman_implementation_t *imp,
-                     pixman_op_t              op,
-                     uint32_t *               dst,
-                     const uint32_t *         src,
-                     const uint32_t *         mask,
-                     int                      width)
-{
-    core_combine_out_ca_sse2 (dst, src, mask, width);
-}
-
-static void
-sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
-                             pixman_op_t              op,
-                             uint32_t *               dst,
-                             const uint32_t *         src,
-                             const uint32_t *         mask,
-                             int                      width)
-{
-    core_combine_out_reverse_ca_sse2 (dst, src, mask, width);
-}
-
-static void
-sse2_combine_atop_ca (pixman_implementation_t *imp,
-                      pixman_op_t              op,
-                      uint32_t *               dst,
-                      const uint32_t *         src,
-                      const uint32_t *         mask,
-                      int                      width)
-{
-    core_combine_atop_ca_sse2 (dst, src, mask, width);
-}
-
-static void
-sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
-                              pixman_op_t              op,
-                              uint32_t *               dst,
-                              const uint32_t *         src,
-                              const uint32_t *         mask,
-                              int                      width)
-{
-    core_combine_reverse_atop_ca_sse2 (dst, src, mask, width);
-}
-
-static void
-sse2_combine_xor_ca (pixman_implementation_t *imp,
-                     pixman_op_t              op,
-                     uint32_t *               dst,
-                     const uint32_t *         src,
-                     const uint32_t *         mask,
-                     int                      width)
-{
-    core_combine_xor_ca_sse2 (dst, src, mask, width);
-}
-
-static void
-sse2_combine_add_ca (pixman_implementation_t *imp,
-                     pixman_op_t              op,
-                     uint32_t *               dst,
-                     const uint32_t *         src,
-                     const uint32_t *         mask,
-                     int                      width)
-{
-    core_combine_add_ca_sse2 (dst, src, mask, width);
-}
-
 /* -------------------------------------------------------------------
  * composite_over_n_8888
  */
@@ -3337,7 +3137,7 @@ sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
 
     while (height--)
     {
-	core_combine_over_u_sse2 (dst, src, NULL, width);
+	sse2_combine_over_u (imp, op, dst, src, NULL, width);
 
 	dst += dst_stride;
 	src += src_stride;
@@ -4953,7 +4753,8 @@ sse2_composite_add_8_8 (pixman_implementation_t *imp,
 	    w--;
 	}
 
-	core_combine_add_u_sse2 ((uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
+	sse2_combine_add_u (imp, op,
+			    (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
 
 	/* Small tail */
 	dst += w & 0xfffc;
@@ -5005,7 +4806,7 @@ sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
 	src = src_line;
 	src_line += src_stride;
 
-	core_combine_add_u_sse2 (dst, src, NULL, width);
+	sse2_combine_add_u (imp, op, dst, src, NULL, width);
     }
 
 }
@@ -5686,7 +5487,7 @@ sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
 
 }
 
-/* A variant of 'core_combine_over_u_sse2' with minor tweaks */
+/* A variant of 'sse2_combine_over_u' with minor tweaks */
 static force_inline void
 scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t*       pd,
                                              const uint32_t* ps,
commit 87cd6b8056bbacb835eeb991f03b9135dcd58334
Author: Søren Sandmann Pedersen <ssp at redhat.com>
Date:   Fri Feb 18 05:15:50 2011 -0500

    sse2: Don't compile pixman-sse2.c with -mmmx anymore
    
    It's not necessary now that the file doesn't use MMX instructions.

diff --git a/configure.ac b/configure.ac
index 5242799..8d96647 100644
--- a/configure.ac
+++ b/configure.ac
@@ -326,7 +326,7 @@ if test "x$SSE2_CFLAGS" = "x" ; then
          SSE2_CFLAGS="-xarch=sse2"
       fi
    else
-      SSE2_CFLAGS="-mmmx -msse2 -Winline"
+      SSE2_CFLAGS="-msse2 -Winline"
    fi
 fi
 
commit e7fe5e35e9640c6d6bb08c24b96ce882434a7f9f
Author: Søren Sandmann Pedersen <ssp at redhat.com>
Date:   Fri Feb 18 05:07:08 2011 -0500

    sse2: Delete unused MMX functions and constants and all _mm_empty()s
    
    These are not needed because the SSE2 implementation doesn't use MMX
    anymore.
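
As background for why the _mm_empty () calls existed at all, here is a minimal sketch of one removed __m64 helper next to its surviving __m128i counterpart. It is an editorial illustration, not part of the patch: the masks are passed as parameters here to keep the snippet self-contained, whereas in pixman-sse2.c they are file-level statics. MMX __m64 registers alias the x87 FPU stack, so any code that touched them had to finish with _mm_empty () (EMMS) before floating-point use; SSE2 __m128i (XMM) registers are independent of that state, so the 1x128 helpers need no such cleanup.

    #include <mmintrin.h>   /* __m64 intrinsics -- the ones being removed */
    #include <emmintrin.h>  /* __m128i intrinsics -- the ones that remain */

    /* Removed: operates on the MMX registers, so callers eventually had
     * to issue _mm_empty (). */
    static __m64
    negate_1x64 (__m64 data, __m64 mask_x00ff)
    {
        return _mm_xor_si64 (data, mask_x00ff);
    }

    /* Kept: the XMM equivalent; no _mm_empty () needed afterwards. */
    static __m128i
    negate_1x128 (__m128i data, __m128i mask_00ff)
    {
        return _mm_xor_si128 (data, mask_00ff);
    }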

diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index 283e4c4..f581727 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -52,14 +52,6 @@
  * Locals
  */
 
-static __m64 mask_x0080;
-static __m64 mask_x00ff;
-static __m64 mask_x0101;
-static __m64 mask_x_alpha;
-
-static __m64 mask_x565_rgb;
-static __m64 mask_x565_unpack;
-
 static __m128i mask_0080;
 static __m128i mask_00ff;
 static __m128i mask_0101;
@@ -401,49 +393,18 @@ save_128_unaligned (__m128i* dst,
  * MMX inlines
  */
 
-static force_inline __m64
-load_32_1x64 (uint32_t data)
-{
-    return _mm_cvtsi32_si64 (data);
-}
-
 static force_inline __m128i
 load_32_1x128 (uint32_t data)
 {
     return _mm_cvtsi32_si128 (data);
 }
 
-static force_inline __m64
-unpack_32_1x64 (uint32_t data)
-{
-    return _mm_unpacklo_pi8 (load_32_1x64 (data), _mm_setzero_si64 ());
-}
-
-static force_inline __m64
-expand_alpha_1x64 (__m64 data)
-{
-    return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 3, 3, 3));
-}
-
-static force_inline __m64
-expand_alpha_rev_1x64 (__m64 data)
-{
-    return _mm_shuffle_pi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
-}
-
 static force_inline __m128i
 expand_alpha_rev_1x128 (__m128i data)
 {
     return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
 }
 
-static force_inline __m64
-expand_pixel_8_1x64 (uint8_t data)
-{
-    return _mm_shuffle_pi16 (
-	unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
-}
-
 static force_inline __m128i
 expand_pixel_8_1x128 (uint8_t data)
 {
@@ -451,15 +412,6 @@ expand_pixel_8_1x128 (uint8_t data)
 	unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
 }
 
-static force_inline __m64
-pix_multiply_1x64 (__m64 data,
-                   __m64 alpha)
-{
-    return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (data, alpha),
-                                          mask_x0080),
-                           mask_x0101);
-}
-
 static force_inline __m128i
 pix_multiply_1x128 (__m128i data,
 		    __m128i alpha)
@@ -469,18 +421,6 @@ pix_multiply_1x128 (__m128i data,
 			    mask_0101);
 }
 
-static force_inline __m64
-pix_add_multiply_1x64 (__m64* src,
-                       __m64* alpha_dst,
-                       __m64* dst,
-                       __m64* alpha_src)
-{
-    __m64 t1 = pix_multiply_1x64 (*src, *alpha_dst);
-    __m64 t2 = pix_multiply_1x64 (*dst, *alpha_src);
-
-    return _mm_adds_pu8 (t1, t2);
-}
-
 static force_inline __m128i
 pix_add_multiply_1x128 (__m128i* src,
 			__m128i* alpha_dst,
@@ -493,50 +433,24 @@ pix_add_multiply_1x128 (__m128i* src,
     return _mm_adds_epu8 (t1, t2);
 }
 
-static force_inline __m64
-negate_1x64 (__m64 data)
-{
-    return _mm_xor_si64 (data, mask_x00ff);
-}
-
 static force_inline __m128i
 negate_1x128 (__m128i data)
 {
     return _mm_xor_si128 (data, mask_00ff);
 }
 
-static force_inline __m64
-invert_colors_1x64 (__m64 data)
-{
-    return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
-}
-
 static force_inline __m128i
 invert_colors_1x128 (__m128i data)
 {
     return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
 }
 
-static force_inline __m64
-over_1x64 (__m64 src, __m64 alpha, __m64 dst)
-{
-    return _mm_adds_pu8 (src, pix_multiply_1x64 (dst, negate_1x64 (alpha)));
-}
-
 static force_inline __m128i
 over_1x128 (__m128i src, __m128i alpha, __m128i dst)
 {
     return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha)));
 }
 
-static force_inline __m64
-in_over_1x64 (__m64* src, __m64* alpha, __m64* mask, __m64* dst)
-{
-    return over_1x64 (pix_multiply_1x64 (*src, *mask),
-                      pix_multiply_1x64 (*alpha, *mask),
-                      *dst);
-}
-
 static force_inline __m128i
 in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst)
 {
@@ -545,17 +459,6 @@ in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst)
 		       *dst);
 }
 
-static force_inline __m64
-over_rev_non_pre_1x64 (__m64 src, __m64 dst)
-{
-    __m64 alpha = expand_alpha_1x64 (src);
-
-    return over_1x64 (pix_multiply_1x64 (invert_colors_1x64 (src),
-                                         _mm_or_si64 (alpha, mask_x_alpha)),
-                      alpha,
-                      dst);
-}
-
 static force_inline __m128i
 over_rev_non_pre_1x128 (__m128i src, __m128i dst)
 {
@@ -568,50 +471,11 @@ over_rev_non_pre_1x128 (__m128i src, __m128i dst)
 }
 
 static force_inline uint32_t
-pack_1x64_32 (__m64 data)
-{
-    return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64 ()));
-}
-
-static force_inline uint32_t
 pack_1x128_32 (__m128i data)
 {
     return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ()));
 }
 
-/* Expand 16 bits positioned at @pos (0-3) of a mmx register into
- *
- *    00RR00GG00BB
- *
- * --- Expanding 565 in the low word ---
- *
- * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
- * m = m & (01f0003f001f);
- * m = m * (008404100840);
- * m = m >> 8;
- *
- * Note the trick here - the top word is shifted by another nibble to
- * avoid it bumping into the middle word
- */
-static force_inline __m64
-expand565_16_1x64 (uint16_t pixel)
-{
-    __m64 p;
-    __m64 t1, t2;
-
-    p = _mm_cvtsi32_si64 ((uint32_t) pixel);
-
-    t1 = _mm_slli_si64 (p, 36 - 11);
-    t2 = _mm_slli_si64 (p, 16 - 5);
-
-    p = _mm_or_si64 (t1, p);
-    p = _mm_or_si64 (t2, p);
-    p = _mm_and_si64 (p, mask_x565_rgb);
-    p = _mm_mullo_pi16 (p, mask_x565_unpack);
-
-    return _mm_srli_pi16 (p, 8);
-}
-
 static force_inline __m128i
 expand565_16_1x128 (uint16_t pixel)
 {
@@ -2460,25 +2324,12 @@ core_combine_add_ca_sse2 (uint32_t *      pd,
 /* ---------------------------------------------------
  * fb_compose_setup_sSE2
  */
-static force_inline __m64
-create_mask_16_64 (uint16_t mask)
-{
-    return _mm_set1_pi16 (mask);
-}
-
 static force_inline __m128i
 create_mask_16_128 (uint16_t mask)
 {
     return _mm_set1_epi16 (mask);
 }
 
-static force_inline __m64
-create_mask_2x32_64 (uint32_t mask0,
-                     uint32_t mask1)
-{
-    return _mm_set_pi32 (mask0, mask1);
-}
-
 /* Work around a code generation bug in Sun Studio 12. */
 #if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
 # define create_mask_2x32_128(mask0, mask1)				\
@@ -2503,7 +2354,6 @@ sse2_combine_over_u (pixman_implementation_t *imp,
                      int                      width)
 {
     core_combine_over_u_sse2 (dst, src, mask, width);
-    _mm_empty ();
 }
 
 static void
@@ -2515,7 +2365,6 @@ sse2_combine_over_reverse_u (pixman_implementation_t *imp,
                              int                      width)
 {
     core_combine_over_reverse_u_sse2 (dst, src, mask, width);
-    _mm_empty ();
 }
 
 static void
@@ -2527,7 +2376,6 @@ sse2_combine_in_u (pixman_implementation_t *imp,
                    int                      width)
 {
     core_combine_in_u_sse2 (dst, src, mask, width);
-    _mm_empty ();
 }
 
 static void
@@ -2539,7 +2387,6 @@ sse2_combine_in_reverse_u (pixman_implementation_t *imp,
                            int                      width)
 {
     core_combine_reverse_in_u_sse2 (dst, src, mask, width);
-    _mm_empty ();
 }
 
 static void
@@ -2551,7 +2398,6 @@ sse2_combine_out_u (pixman_implementation_t *imp,
                     int                      width)
 {
     core_combine_out_u_sse2 (dst, src, mask, width);
-    _mm_empty ();
 }
 
 static void
@@ -2563,7 +2409,6 @@ sse2_combine_out_reverse_u (pixman_implementation_t *imp,
                             int                      width)
 {
     core_combine_reverse_out_u_sse2 (dst, src, mask, width);
-    _mm_empty ();
 }
 
 static void
@@ -2575,7 +2420,6 @@ sse2_combine_atop_u (pixman_implementation_t *imp,
                      int                      width)
 {
     core_combine_atop_u_sse2 (dst, src, mask, width);
-    _mm_empty ();
 }
 
 static void
@@ -2587,7 +2431,6 @@ sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
                              int                      width)
 {
     core_combine_reverse_atop_u_sse2 (dst, src, mask, width);
-    _mm_empty ();
 }
 
 static void
@@ -2599,7 +2442,6 @@ sse2_combine_xor_u (pixman_implementation_t *imp,
                     int                      width)
 {
     core_combine_xor_u_sse2 (dst, src, mask, width);
-    _mm_empty ();
 }
 
 static void
@@ -2611,7 +2453,6 @@ sse2_combine_add_u (pixman_implementation_t *imp,
                     int                      width)
 {
     core_combine_add_u_sse2 (dst, src, mask, width);
-    _mm_empty ();
 }
 
 static void
@@ -2623,7 +2464,6 @@ sse2_combine_saturate_u (pixman_implementation_t *imp,
                          int                      width)
 {
     core_combine_saturate_u_sse2 (dst, src, mask, width);
-    _mm_empty ();
 }
 
 static void
@@ -2635,7 +2475,6 @@ sse2_combine_src_ca (pixman_implementation_t *imp,
                      int                      width)
 {
     core_combine_src_ca_sse2 (dst, src, mask, width);
-    _mm_empty ();
 }
 
 static void
@@ -2647,7 +2486,6 @@ sse2_combine_over_ca (pixman_implementation_t *imp,
                       int                      width)
 {
     core_combine_over_ca_sse2 (dst, src, mask, width);
-    _mm_empty ();
 }
 
 static void
@@ -2659,7 +2497,6 @@ sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
                               int                      width)
 {
     core_combine_over_reverse_ca_sse2 (dst, src, mask, width);
-    _mm_empty ();
 }
 
 static void
@@ -2671,7 +2508,6 @@ sse2_combine_in_ca (pixman_implementation_t *imp,
                     int                      width)
 {
     core_combine_in_ca_sse2 (dst, src, mask, width);
-    _mm_empty ();
 }
 
 static void
@@ -2683,7 +2519,6 @@ sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
                             int                      width)
 {
     core_combine_in_reverse_ca_sse2 (dst, src, mask, width);
-    _mm_empty ();
 }
 
 static void
@@ -2695,7 +2530,6 @@ sse2_combine_out_ca (pixman_implementation_t *imp,
                      int                      width)
 {
     core_combine_out_ca_sse2 (dst, src, mask, width);
-    _mm_empty ();
 }
 
 static void
@@ -2707,7 +2541,6 @@ sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
                              int                      width)
 {
     core_combine_out_reverse_ca_sse2 (dst, src, mask, width);
-    _mm_empty ();
 }
 
 static void
@@ -2719,7 +2552,6 @@ sse2_combine_atop_ca (pixman_implementation_t *imp,
                       int                      width)
 {
     core_combine_atop_ca_sse2 (dst, src, mask, width);
-    _mm_empty ();
 }
 
 static void
@@ -2731,7 +2563,6 @@ sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
                               int                      width)
 {
     core_combine_reverse_atop_ca_sse2 (dst, src, mask, width);
-    _mm_empty ();
 }
 
 static void
@@ -2743,7 +2574,6 @@ sse2_combine_xor_ca (pixman_implementation_t *imp,
                      int                      width)
 {
     core_combine_xor_ca_sse2 (dst, src, mask, width);
-    _mm_empty ();
 }
 
 static void
@@ -2755,7 +2585,6 @@ sse2_combine_add_ca (pixman_implementation_t *imp,
                      int                      width)
 {
     core_combine_add_ca_sse2 (dst, src, mask, width);
-    _mm_empty ();
 }
 
 /* -------------------------------------------------------------------
@@ -2839,7 +2668,6 @@ sse2_composite_over_n_8888 (pixman_implementation_t *imp,
 	}
 
     }
-    _mm_empty ();
 }
 
 /* ---------------------------------------------------------------------
@@ -2928,7 +2756,6 @@ sse2_composite_over_n_0565 (pixman_implementation_t *imp,
 	}
     }
 
-    _mm_empty ();
 }
 
 /* ------------------------------
@@ -3055,7 +2882,6 @@ sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
 	}
     }
 
-    _mm_empty ();
 }
 
 /* ---------------------------------------------------------------------------
@@ -3183,7 +3009,6 @@ sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
 	}
     }
 
-    _mm_empty ();
 }
 
 /*---------------------------------------------------------------------
@@ -3302,7 +3127,6 @@ sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
 	}
     }
 
-    _mm_empty ();
 }
 
 /*---------------------------------------------------------------------
@@ -3375,7 +3199,6 @@ sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
 	}
     }
 
-    _mm_empty ();
 }
 
 /* ---------------------------------------------------------------------
@@ -3480,7 +3303,6 @@ sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
 	}
     }
 
-    _mm_empty ();
 }
 
 /* --------------------------------------------------------------------
@@ -3520,7 +3342,6 @@ sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
 	dst += dst_stride;
 	src += src_stride;
     }
-    _mm_empty ();
 }
 
 /* ------------------------------------------------------------------
@@ -3648,7 +3469,6 @@ sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
 	}
     }
 
-    _mm_empty ();
 }
 
 /* -----------------------------------------------------------------
@@ -3784,7 +3604,6 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
 	}
     }
 
-    _mm_empty ();
 }
 
 /* ----------------------------------------------------------------
@@ -3938,7 +3757,6 @@ pixman_fill_sse2 (uint32_t *bits,
 	}
     }
 
-    _mm_empty ();
     return TRUE;
 }
 
@@ -4068,7 +3886,6 @@ sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
 	}
     }
 
-    _mm_empty ();
 }
 
 /*-----------------------------------------------------------------------
@@ -4220,7 +4037,6 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
 	}
     }
 
-    _mm_empty ();
 }
 
 /* -----------------------------------------------------------------------
@@ -4354,7 +4170,6 @@ sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
 	}
     }
 
-    _mm_empty ();
 }
 
 /* -------------------------------------------------------------------------
@@ -4467,7 +4282,6 @@ sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
 	}
     }
 
-    _mm_empty ();
 }
 
 /* -------------------------------------------------------------------------------------------------
@@ -4616,7 +4430,6 @@ sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
 	}
     }
 
-    _mm_empty ();
 }
 
 /* -----------------------------------------------------------------------
@@ -4720,7 +4533,6 @@ sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
 	}
     }
 
-    _mm_empty ();
 }
 
 /* -----------------------------------------------------------------------
@@ -4817,7 +4629,6 @@ sse2_composite_in_n_8 (pixman_implementation_t *imp,
 	}
     }
 
-    _mm_empty ();
 }
 
 /* ---------------------------------------------------------------------------
@@ -4903,7 +4714,6 @@ sse2_composite_in_8_8 (pixman_implementation_t *imp,
 	}
     }
 
-    _mm_empty ();
 }
 
 /* -------------------------------------------------------------------------
@@ -5007,7 +4817,6 @@ sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
 	}
     }
 
-    _mm_empty ();
 }
 
 /* -------------------------------------------------------------------------
@@ -5095,7 +4904,6 @@ sse2_composite_add_n_8 (pixman_implementation_t *imp,
 	}
     }
 
-    _mm_empty ();
 }
 
 /* ----------------------------------------------------------------------
@@ -5161,7 +4969,6 @@ sse2_composite_add_8_8 (pixman_implementation_t *imp,
 	}
     }
 
-    _mm_empty ();
 }
 
 /* ---------------------------------------------------------------------
@@ -5201,7 +5008,6 @@ sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
 	core_combine_add_u_sse2 (dst, src, NULL, width);
     }
 
-    _mm_empty ();
 }
 
 /* -------------------------------------------------------------------------------------------------
@@ -5326,7 +5132,6 @@ pixman_blt_sse2 (uint32_t *src_bits,
 	}
     }
 
-    _mm_empty ();
 
     return TRUE;
 }
@@ -5484,7 +5289,6 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
         }
     }
 
-    _mm_empty ();
 }
 
 static void
@@ -5638,7 +5442,6 @@ sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
         }
     }
 
-    _mm_empty ();
 }
 
 static void
@@ -5730,7 +5533,6 @@ sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
 
     }
 
-    _mm_empty ();
 }
 
 static void
@@ -5882,7 +5684,6 @@ sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
         }
     }
 
-    _mm_empty ();
 }
 
 /* A variant of 'core_combine_over_u_sse2' with minor tweaks */
@@ -5977,7 +5778,6 @@ scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t*       pd,
 
 	w--;
     }
-    _mm_empty ();
 }
 
 FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
@@ -6090,7 +5890,6 @@ scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
 	w--;
     }
 
-    _mm_empty ();
 }
 
 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
@@ -6466,16 +6265,6 @@ _pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
     mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
     mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
 
-    /* MMX constants */
-    mask_x565_rgb = create_mask_2x32_64 (0x000001f0, 0x003f001f);
-    mask_x565_unpack = create_mask_2x32_64 (0x00000084, 0x04100840);
-
-    mask_x0080 = create_mask_16_64 (0x0080);
-    mask_x00ff = create_mask_16_64 (0x00ff);
-    mask_x0101 = create_mask_16_64 (0x0101);
-    mask_x_alpha = create_mask_2x32_64 (0x00ff0000, 0x00000000);
-
-    _mm_empty ();
 
     /* Set up function pointers */
 
commit f88ae14c15040345a12ff0488c7b23d25639e49b
Author: Søren Sandmann Pedersen <ssp at redhat.com>
Date:   Fri Feb 18 03:56:20 2011 -0500

    sse2: Convert all uses of MMX registers to use SSE2 registers instead.
    
    By avoiding use of MMX registers we won't need to call emms all over
    the place, which avoids various miscompilation issues.
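
A rough, self-contained sketch of what this register-width change means in
practice (not code from the patch; assumes GCC or Clang on x86 with
-mmmx -msse2): the per-pixel ADD combiner moves from __m64/MMX intrinsics,
which need a trailing EMMS, to __m128i/SSE2 intrinsics, which do not --
mirroring the core_combine_add_u_sse2 hunk further down.

    #include <stdint.h>
    #include <stdio.h>
    #include <mmintrin.h>   /* MMX:  __m64, _mm_adds_pu8, _mm_empty */
    #include <emmintrin.h>  /* SSE2: __m128i, _mm_adds_epu8 */

    /* Old style: MMX registers alias the x87 stack, so _mm_empty ()
     * (EMMS) must run before any later floating point code. */
    static uint32_t
    add_pixel_mmx (uint32_t s, uint32_t d)
    {
        uint32_t r = _mm_cvtsi64_si32 (
            _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
        _mm_empty ();
        return r;
    }

    /* New style: XMM registers are independent of the x87 stack, so the
     * scattered _mm_empty () calls (and the miscompilation hazards of
     * forgetting one) simply go away. */
    static uint32_t
    add_pixel_sse2 (uint32_t s, uint32_t d)
    {
        return _mm_cvtsi128_si32 (
            _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
    }

    int
    main (void)
    {
        /* Both variants do the same saturating per-channel add. */
        printf ("%08x %08x\n",
                add_pixel_mmx  (0x80ff4020, 0x90103080),
                add_pixel_sse2 (0x80ff4020, 0x90103080));
        return 0;
    }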

diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index c4ff3c1..283e4c4 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -407,6 +407,12 @@ load_32_1x64 (uint32_t data)
     return _mm_cvtsi32_si64 (data);
 }
 
+static force_inline __m128i
+load_32_1x128 (uint32_t data)
+{
+    return _mm_cvtsi32_si128 (data);
+}
+
 static force_inline __m64
 unpack_32_1x64 (uint32_t data)
 {
@@ -425,6 +431,12 @@ expand_alpha_rev_1x64 (__m64 data)
     return _mm_shuffle_pi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
 }
 
+static force_inline __m128i
+expand_alpha_rev_1x128 (__m128i data)
+{
+    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
+}
+
 static force_inline __m64
 expand_pixel_8_1x64 (uint8_t data)
 {
@@ -432,6 +444,13 @@ expand_pixel_8_1x64 (uint8_t data)
 	unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
 }
 
+static force_inline __m128i
+expand_pixel_8_1x128 (uint8_t data)
+{
+    return _mm_shufflelo_epi16 (
+	unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
+}
+
 static force_inline __m64
 pix_multiply_1x64 (__m64 data,
                    __m64 alpha)
@@ -441,6 +460,15 @@ pix_multiply_1x64 (__m64 data,
                            mask_x0101);
 }
 
+static force_inline __m128i
+pix_multiply_1x128 (__m128i data,
+		    __m128i alpha)
+{
+    return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha),
+					    mask_0080),
+			    mask_0101);
+}
+
 static force_inline __m64
 pix_add_multiply_1x64 (__m64* src,
                        __m64* alpha_dst,
@@ -453,24 +481,54 @@ pix_add_multiply_1x64 (__m64* src,
     return _mm_adds_pu8 (t1, t2);
 }
 
+static force_inline __m128i
+pix_add_multiply_1x128 (__m128i* src,
+			__m128i* alpha_dst,
+			__m128i* dst,
+			__m128i* alpha_src)
+{
+    __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst);
+    __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src);
+
+    return _mm_adds_epu8 (t1, t2);
+}
+
 static force_inline __m64
 negate_1x64 (__m64 data)
 {
     return _mm_xor_si64 (data, mask_x00ff);
 }
 
+static force_inline __m128i
+negate_1x128 (__m128i data)
+{
+    return _mm_xor_si128 (data, mask_00ff);
+}
+
 static force_inline __m64
 invert_colors_1x64 (__m64 data)
 {
     return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
 }
 
+static force_inline __m128i
+invert_colors_1x128 (__m128i data)
+{
+    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
+}
+
 static force_inline __m64
 over_1x64 (__m64 src, __m64 alpha, __m64 dst)
 {
     return _mm_adds_pu8 (src, pix_multiply_1x64 (dst, negate_1x64 (alpha)));
 }
 
+static force_inline __m128i
+over_1x128 (__m128i src, __m128i alpha, __m128i dst)
+{
+    return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha)));
+}
+
 static force_inline __m64
 in_over_1x64 (__m64* src, __m64* alpha, __m64* mask, __m64* dst)
 {
@@ -479,6 +537,14 @@ in_over_1x64 (__m64* src, __m64* alpha, __m64* mask, __m64* dst)
                       *dst);
 }
 
+static force_inline __m128i
+in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst)
+{
+    return over_1x128 (pix_multiply_1x128 (*src, *mask),
+		       pix_multiply_1x128 (*alpha, *mask),
+		       *dst);
+}
+
 static force_inline __m64
 over_rev_non_pre_1x64 (__m64 src, __m64 dst)
 {
@@ -490,12 +556,29 @@ over_rev_non_pre_1x64 (__m64 src, __m64 dst)
                       dst);
 }
 
+static force_inline __m128i
+over_rev_non_pre_1x128 (__m128i src, __m128i dst)
+{
+    __m128i alpha = expand_alpha_1x128 (src);
+
+    return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src),
+					   _mm_or_si128 (alpha, mask_alpha)),
+		       alpha,
+		       dst);
+}
+
 static force_inline uint32_t
 pack_1x64_32 (__m64 data)
 {
     return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64 ()));
 }
 
+static force_inline uint32_t
+pack_1x128_32 (__m128i data)
+{
+    return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ()));
+}
+
 /* Expand 16 bits positioned at @pos (0-3) of a mmx register into
  *
  *    00RR00GG00BB
@@ -529,6 +612,16 @@ expand565_16_1x64 (uint16_t pixel)
     return _mm_srli_pi16 (p, 8);
 }
 
+static force_inline __m128i
+expand565_16_1x128 (uint16_t pixel)
+{
+    __m128i m = _mm_cvtsi32_si128 (pixel);
+
+    m = unpack_565_to_8888 (m);
+
+    return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ());
+}
+
 /* ----------------------------------------------------------------------------
  * Compose Core transformations
  */
@@ -536,7 +629,7 @@ static force_inline uint32_t
 core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
 {
     uint8_t a;
-    __m64 ms;
+    __m128i xmms;
 
     a = src >> 24;
 
@@ -546,9 +639,10 @@ core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
     }
     else if (src)
     {
-	ms = unpack_32_1x64 (src);
-	return pack_1x64_32 (
-	    over_1x64 (ms, expand_alpha_1x64 (ms), unpack_32_1x64 (dst)));
+	xmms = unpack_32_1x128 (src);
+	return pack_1x128_32 (
+	    over_1x128 (xmms, expand_alpha_1x128 (xmms),
+			unpack_32_1x128 (dst)));
     }
 
     return dst;
@@ -561,15 +655,15 @@ combine1 (const uint32_t *ps, const uint32_t *pm)
 
     if (pm)
     {
-	__m64 ms, mm;
+	__m128i ms, mm;
 
-	mm = unpack_32_1x64 (*pm);
-	mm = expand_alpha_1x64 (mm);
+	mm = unpack_32_1x128 (*pm);
+	mm = expand_alpha_1x128 (mm);
 
-	ms = unpack_32_1x64 (s);
-	ms = pix_multiply_1x64 (ms, mm);
+	ms = unpack_32_1x128 (s);
+	ms = pix_multiply_1x128 (ms, mm);
 
-	s = pack_1x64_32 (ms);
+	s = pack_1x128_32 (ms);
     }
 
     return s;
@@ -857,9 +951,9 @@ core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst)
     }
     else if (maska != 0xff)
     {
-	return pack_1x64_32 (
-	    pix_multiply_1x64 (unpack_32_1x64 (dst),
-			       expand_alpha_1x64 (unpack_32_1x64 (src))));
+	return pack_1x128_32 (
+	    pix_multiply_1x128 (unpack_32_1x128 (dst),
+				expand_alpha_1x128 (unpack_32_1x128 (src))));
     }
 
     return dst;
@@ -994,10 +1088,10 @@ core_combine_reverse_out_u_sse2 (uint32_t*       pd,
 	uint32_t s = combine1 (ps, pm);
 	uint32_t d = *pd;
 
-	*pd++ = pack_1x64_32 (
-	    pix_multiply_1x64 (
-		unpack_32_1x64 (d), negate_1x64 (
-		    expand_alpha_1x64 (unpack_32_1x64 (s)))));
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (
+		unpack_32_1x128 (d), negate_1x128 (
+		    expand_alpha_1x128 (unpack_32_1x128 (s)))));
 
 	if (pm)
 	    pm++;
@@ -1039,10 +1133,10 @@ core_combine_reverse_out_u_sse2 (uint32_t*       pd,
 	uint32_t s = combine1 (ps, pm);
 	uint32_t d = *pd;
 
-	*pd++ = pack_1x64_32 (
-	    pix_multiply_1x64 (
-		unpack_32_1x64 (d), negate_1x64 (
-		    expand_alpha_1x64 (unpack_32_1x64 (s)))));
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (
+		unpack_32_1x128 (d), negate_1x128 (
+		    expand_alpha_1x128 (unpack_32_1x128 (s)))));
 	ps++;
 	if (pm)
 	    pm++;
@@ -1061,10 +1155,10 @@ core_combine_out_u_sse2 (uint32_t*       pd,
 	uint32_t s = combine1 (ps, pm);
 	uint32_t d = *pd;
 
-	*pd++ = pack_1x64_32 (
-	    pix_multiply_1x64 (
-		unpack_32_1x64 (s), negate_1x64 (
-		    expand_alpha_1x64 (unpack_32_1x64 (d)))));
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (
+		unpack_32_1x128 (s), negate_1x128 (
+		    expand_alpha_1x128 (unpack_32_1x128 (d)))));
 	w--;
 	ps++;
 	if (pm)
@@ -1104,10 +1198,10 @@ core_combine_out_u_sse2 (uint32_t*       pd,
 	uint32_t s = combine1 (ps, pm);
 	uint32_t d = *pd;
 
-	*pd++ = pack_1x64_32 (
-	    pix_multiply_1x64 (
-		unpack_32_1x64 (s), negate_1x64 (
-		    expand_alpha_1x64 (unpack_32_1x64 (d)))));
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (
+		unpack_32_1x128 (s), negate_1x128 (
+		    expand_alpha_1x128 (unpack_32_1x128 (d)))));
 	w--;
 	ps++;
 	if (pm)
@@ -1119,13 +1213,13 @@ static force_inline uint32_t
 core_combine_atop_u_pixel_sse2 (uint32_t src,
                                 uint32_t dst)
 {
-    __m64 s = unpack_32_1x64 (src);
-    __m64 d = unpack_32_1x64 (dst);
+    __m128i s = unpack_32_1x128 (src);
+    __m128i d = unpack_32_1x128 (dst);
 
-    __m64 sa = negate_1x64 (expand_alpha_1x64 (s));
-    __m64 da = expand_alpha_1x64 (d);
+    __m128i sa = negate_1x128 (expand_alpha_1x128 (s));
+    __m128i da = expand_alpha_1x128 (d);
 
-    return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
+    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
 }
 
 static force_inline void
@@ -1201,13 +1295,13 @@ static force_inline uint32_t
 core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
                                         uint32_t dst)
 {
-    __m64 s = unpack_32_1x64 (src);
-    __m64 d = unpack_32_1x64 (dst);
+    __m128i s = unpack_32_1x128 (src);
+    __m128i d = unpack_32_1x128 (dst);
 
-    __m64 sa = expand_alpha_1x64 (s);
-    __m64 da = negate_1x64 (expand_alpha_1x64 (d));
+    __m128i sa = expand_alpha_1x128 (s);
+    __m128i da = negate_1x128 (expand_alpha_1x128 (d));
 
-    return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
+    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
 }
 
 static force_inline void
@@ -1283,13 +1377,13 @@ static force_inline uint32_t
 core_combine_xor_u_pixel_sse2 (uint32_t src,
                                uint32_t dst)
 {
-    __m64 s = unpack_32_1x64 (src);
-    __m64 d = unpack_32_1x64 (dst);
+    __m128i s = unpack_32_1x128 (src);
+    __m128i d = unpack_32_1x128 (dst);
 
-    __m64 neg_d = negate_1x64 (expand_alpha_1x64 (d));
-    __m64 neg_s = negate_1x64 (expand_alpha_1x64 (s));
+    __m128i neg_d = negate_1x128 (expand_alpha_1x128 (d));
+    __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s));
 
-    return pack_1x64_32 (pix_add_multiply_1x64 (&s, &neg_d, &d, &neg_s));
+    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s));
 }
 
 static force_inline void
@@ -1387,8 +1481,8 @@ core_combine_add_u_sse2 (uint32_t*       dst,
 	ps++;
 	if (pm)
 	    pm++;
-	*pd++ = _mm_cvtsi64_si32 (
-	    _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
+	*pd++ = _mm_cvtsi128_si32 (
+	    _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
 	w--;
     }
 
@@ -1414,8 +1508,8 @@ core_combine_add_u_sse2 (uint32_t*       dst,
 	d = *pd;
 
 	ps++;
-	*pd++ = _mm_cvtsi64_si32 (
-	    _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
+	*pd++ = _mm_cvtsi128_si32 (
+	    _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
 	if (pm)
 	    pm++;
     }
@@ -1425,18 +1519,18 @@ static force_inline uint32_t
 core_combine_saturate_u_pixel_sse2 (uint32_t src,
                                     uint32_t dst)
 {
-    __m64 ms = unpack_32_1x64 (src);
-    __m64 md = unpack_32_1x64 (dst);
+    __m128i ms = unpack_32_1x128 (src);
+    __m128i md = unpack_32_1x128 (dst);
     uint32_t sa = src >> 24;
     uint32_t da = ~dst >> 24;
 
     if (sa > da)
     {
-	ms = pix_multiply_1x64 (
-	    ms, expand_alpha_1x64 (unpack_32_1x64 (DIV_UN8 (da, sa) << 24)));
+	ms = pix_multiply_1x128 (
+	    ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24)));
     }
 
-    return pack_1x64_32 (_mm_adds_pu16 (md, ms));
+    return pack_1x128_32 (_mm_adds_epu16 (md, ms));
 }
 
 static force_inline void
@@ -1540,8 +1634,8 @@ core_combine_src_ca_sse2 (uint32_t*       pd,
     {
 	s = *ps++;
 	m = *pm++;
-	*pd++ = pack_1x64_32 (
-	    pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
 	w--;
     }
 
@@ -1570,8 +1664,8 @@ core_combine_src_ca_sse2 (uint32_t*       pd,
     {
 	s = *ps++;
 	m = *pm++;
-	*pd++ = pack_1x64_32 (
-	    pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
 	w--;
     }
 }
@@ -1581,12 +1675,12 @@ core_combine_over_ca_pixel_sse2 (uint32_t src,
                                  uint32_t mask,
                                  uint32_t dst)
 {
-    __m64 s = unpack_32_1x64 (src);
-    __m64 expAlpha = expand_alpha_1x64 (s);
-    __m64 unpk_mask = unpack_32_1x64 (mask);
-    __m64 unpk_dst  = unpack_32_1x64 (dst);
+    __m128i s = unpack_32_1x128 (src);
+    __m128i expAlpha = expand_alpha_1x128 (s);
+    __m128i unpk_mask = unpack_32_1x128 (mask);
+    __m128i unpk_dst  = unpack_32_1x128 (dst);
 
-    return pack_1x64_32 (in_over_1x64 (&s, &expAlpha, &unpk_mask, &unpk_dst));
+    return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst));
 }
 
 static force_inline void
@@ -1655,12 +1749,12 @@ core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
                                          uint32_t mask,
                                          uint32_t dst)
 {
-    __m64 d = unpack_32_1x64 (dst);
+    __m128i d = unpack_32_1x128 (dst);
 
-    return pack_1x64_32 (
-	over_1x64 (d, expand_alpha_1x64 (d),
-		   pix_multiply_1x64 (unpack_32_1x64 (src),
-				      unpack_32_1x64 (mask))));
+    return pack_1x128_32 (
+	over_1x128 (d, expand_alpha_1x128 (d),
+		    pix_multiply_1x128 (unpack_32_1x128 (src),
+					unpack_32_1x128 (mask))));
 }
 
 static force_inline void
@@ -1745,10 +1839,10 @@ core_combine_in_ca_sse2 (uint32_t *      pd,
 	m = *pm++;
 	d = *pd;
 
-	*pd++ = pack_1x64_32 (
-	    pix_multiply_1x64 (
-		pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
-		expand_alpha_1x64 (unpack_32_1x64 (d))));
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (
+		pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)),
+		expand_alpha_1x128 (unpack_32_1x128 (d))));
 
 	w--;
     }
@@ -1789,11 +1883,11 @@ core_combine_in_ca_sse2 (uint32_t *      pd,
 	m = *pm++;
 	d = *pd;
 
-	*pd++ = pack_1x64_32 (
-	    pix_multiply_1x64 (
-		pix_multiply_1x64 (
-		    unpack_32_1x64 (s), unpack_32_1x64 (m)),
-		expand_alpha_1x64 (unpack_32_1x64 (d))));
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (
+		pix_multiply_1x128 (
+		    unpack_32_1x128 (s), unpack_32_1x128 (m)),
+		expand_alpha_1x128 (unpack_32_1x128 (d))));
 
 	w--;
     }
@@ -1818,11 +1912,11 @@ core_combine_in_reverse_ca_sse2 (uint32_t *      pd,
 	m = *pm++;
 	d = *pd;
 
-	*pd++ = pack_1x64_32 (
-	    pix_multiply_1x64 (
-		unpack_32_1x64 (d),
-		pix_multiply_1x64 (unpack_32_1x64 (m),
-				   expand_alpha_1x64 (unpack_32_1x64 (s)))));
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (
+		unpack_32_1x128 (d),
+		pix_multiply_1x128 (unpack_32_1x128 (m),
+				   expand_alpha_1x128 (unpack_32_1x128 (s)))));
 	w--;
     }
 
@@ -1861,11 +1955,11 @@ core_combine_in_reverse_ca_sse2 (uint32_t *      pd,
 	m = *pm++;
 	d = *pd;
 
-	*pd++ = pack_1x64_32 (
-	    pix_multiply_1x64 (
-		unpack_32_1x64 (d),
-		pix_multiply_1x64 (unpack_32_1x64 (m),
-				   expand_alpha_1x64 (unpack_32_1x64 (s)))));
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (
+		unpack_32_1x128 (d),
+		pix_multiply_1x128 (unpack_32_1x128 (m),
+				   expand_alpha_1x128 (unpack_32_1x128 (s)))));
 	w--;
     }
 }
@@ -1889,11 +1983,11 @@ core_combine_out_ca_sse2 (uint32_t *      pd,
 	m = *pm++;
 	d = *pd;
 
-	*pd++ = pack_1x64_32 (
-	    pix_multiply_1x64 (
-		pix_multiply_1x64 (
-		    unpack_32_1x64 (s), unpack_32_1x64 (m)),
-		negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (
+		pix_multiply_1x128 (
+		    unpack_32_1x128 (s), unpack_32_1x128 (m)),
+		negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
 	w--;
     }
 
@@ -1934,11 +2028,11 @@ core_combine_out_ca_sse2 (uint32_t *      pd,
 	m = *pm++;
 	d = *pd;
 
-	*pd++ = pack_1x64_32 (
-	    pix_multiply_1x64 (
-		pix_multiply_1x64 (
-		    unpack_32_1x64 (s), unpack_32_1x64 (m)),
-		negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (
+		pix_multiply_1x128 (
+		    unpack_32_1x128 (s), unpack_32_1x128 (m)),
+		negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
 
 	w--;
     }
@@ -1963,12 +2057,12 @@ core_combine_out_reverse_ca_sse2 (uint32_t *      pd,
 	m = *pm++;
 	d = *pd;
 
-	*pd++ = pack_1x64_32 (
-	    pix_multiply_1x64 (
-		unpack_32_1x64 (d),
-		negate_1x64 (pix_multiply_1x64 (
-				 unpack_32_1x64 (m),
-				 expand_alpha_1x64 (unpack_32_1x64 (s))))));
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (
+		unpack_32_1x128 (d),
+		negate_1x128 (pix_multiply_1x128 (
+				 unpack_32_1x128 (m),
+				 expand_alpha_1x128 (unpack_32_1x128 (s))))));
 	w--;
     }
 
@@ -2011,12 +2105,12 @@ core_combine_out_reverse_ca_sse2 (uint32_t *      pd,
 	m = *pm++;
 	d = *pd;
 
-	*pd++ = pack_1x64_32 (
-	    pix_multiply_1x64 (
-		unpack_32_1x64 (d),
-		negate_1x64 (pix_multiply_1x64 (
-				 unpack_32_1x64 (m),
-				 expand_alpha_1x64 (unpack_32_1x64 (s))))));
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (
+		unpack_32_1x128 (d),
+		negate_1x128 (pix_multiply_1x128 (
+				 unpack_32_1x128 (m),
+				 expand_alpha_1x128 (unpack_32_1x128 (s))))));
 	w--;
     }
 }
@@ -2026,16 +2120,16 @@ core_combine_atop_ca_pixel_sse2 (uint32_t src,
                                  uint32_t mask,
                                  uint32_t dst)
 {
-    __m64 m = unpack_32_1x64 (mask);
-    __m64 s = unpack_32_1x64 (src);
-    __m64 d = unpack_32_1x64 (dst);
-    __m64 sa = expand_alpha_1x64 (s);
-    __m64 da = expand_alpha_1x64 (d);
+    __m128i m = unpack_32_1x128 (mask);
+    __m128i s = unpack_32_1x128 (src);
+    __m128i d = unpack_32_1x128 (dst);
+    __m128i sa = expand_alpha_1x128 (s);
+    __m128i da = expand_alpha_1x128 (d);
 
-    s = pix_multiply_1x64 (s, m);
-    m = negate_1x64 (pix_multiply_1x64 (m, sa));
+    s = pix_multiply_1x128 (s, m);
+    m = negate_1x128 (pix_multiply_1x128 (m, sa));
 
-    return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
+    return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
 }
 
 static force_inline void
@@ -2116,17 +2210,17 @@ core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
                                          uint32_t mask,
                                          uint32_t dst)
 {
-    __m64 m = unpack_32_1x64 (mask);
-    __m64 s = unpack_32_1x64 (src);
-    __m64 d = unpack_32_1x64 (dst);
+    __m128i m = unpack_32_1x128 (mask);
+    __m128i s = unpack_32_1x128 (src);
+    __m128i d = unpack_32_1x128 (dst);
 
-    __m64 da = negate_1x64 (expand_alpha_1x64 (d));
-    __m64 sa = expand_alpha_1x64 (s);
+    __m128i da = negate_1x128 (expand_alpha_1x128 (d));
+    __m128i sa = expand_alpha_1x128 (s);
 
-    s = pix_multiply_1x64 (s, m);
-    m = pix_multiply_1x64 (m, sa);
+    s = pix_multiply_1x128 (s, m);
+    m = pix_multiply_1x128 (m, sa);
 
-    return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
+    return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
 }
 
 static force_inline void
@@ -2208,16 +2302,16 @@ core_combine_xor_ca_pixel_sse2 (uint32_t src,
                                 uint32_t mask,
                                 uint32_t dst)
 {
-    __m64 a = unpack_32_1x64 (mask);
-    __m64 s = unpack_32_1x64 (src);
-    __m64 d = unpack_32_1x64 (dst);
+    __m128i a = unpack_32_1x128 (mask);
+    __m128i s = unpack_32_1x128 (src);
+    __m128i d = unpack_32_1x128 (dst);
 
-    __m64 alpha_dst = negate_1x64 (pix_multiply_1x64 (
-				       a, expand_alpha_1x64 (s)));
-    __m64 dest      = pix_multiply_1x64 (s, a);
-    __m64 alpha_src = negate_1x64 (expand_alpha_1x64 (d));
+    __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 (
+				       a, expand_alpha_1x128 (s)));
+    __m128i dest      = pix_multiply_1x128 (s, a);
+    __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d));
 
-    return pack_1x64_32 (pix_add_multiply_1x64 (&d,
+    return pack_1x128_32 (pix_add_multiply_1x128 (&d,
                                                 &alpha_dst,
                                                 &dest,
                                                 &alpha_src));
@@ -2317,10 +2411,10 @@ core_combine_add_ca_sse2 (uint32_t *      pd,
 	m = *pm++;
 	d = *pd;
 
-	*pd++ = pack_1x64_32 (
-	    _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
-					     unpack_32_1x64 (m)),
-			  unpack_32_1x64 (d)));
+	*pd++ = pack_1x128_32 (
+	    _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
+					       unpack_32_1x128 (m)),
+			   unpack_32_1x128 (d)));
 	w--;
     }
 
@@ -2355,10 +2449,10 @@ core_combine_add_ca_sse2 (uint32_t *      pd,
 	m = *pm++;
 	d = *pd;
 
-	*pd++ = pack_1x64_32 (
-	    _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
-					     unpack_32_1x64 (m)),
-			  unpack_32_1x64 (d)));
+	*pd++ = pack_1x128_32 (
+	    _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
+					       unpack_32_1x128 (m)),
+			   unpack_32_1x128 (d)));
 	w--;
     }
 }
@@ -2711,9 +2805,9 @@ sse2_composite_over_n_8888 (pixman_implementation_t *imp,
 	while (w && (unsigned long)dst & 15)
 	{
 	    d = *dst;
-	    *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
-	                                      _mm_movepi64_pi64 (xmm_alpha),
-	                                      unpack_32_1x64 (d)));
+	    *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
+						xmm_alpha,
+						unpack_32_1x128 (d)));
 	    w--;
 	}
 
@@ -2738,9 +2832,9 @@ sse2_composite_over_n_8888 (pixman_implementation_t *imp,
 	while (w)
 	{
 	    d = *dst;
-	    *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
-	                                      _mm_movepi64_pi64 (xmm_alpha),
-	                                      unpack_32_1x64 (d)));
+	    *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
+						xmm_alpha,
+						unpack_32_1x128 (d)));
 	    w--;
 	}
 
@@ -2796,9 +2890,9 @@ sse2_composite_over_n_0565 (pixman_implementation_t *imp,
 	    d = *dst;
 
 	    *dst++ = pack_565_32_16 (
-		pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
-					 _mm_movepi64_pi64 (xmm_alpha),
-					 expand565_16_1x64 (d))));
+		pack_1x128_32 (over_1x128 (xmm_src,
+					   xmm_alpha,
+					   expand565_16_1x128 (d))));
 	    w--;
 	}
 
@@ -2829,9 +2923,8 @@ sse2_composite_over_n_0565 (pixman_implementation_t *imp,
 	{
 	    d = *dst;
 	    *dst++ = pack_565_32_16 (
-		pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
-					 _mm_movepi64_pi64 (xmm_alpha),
-					 expand565_16_1x64 (d))));
+		pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha,
+					   expand565_16_1x128 (d))));
 	}
     }
 
@@ -2866,7 +2959,7 @@ sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
     __m128i xmm_dst;
     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
 
-    __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
+    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
 
     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
     srca = src >> 24;
@@ -2882,8 +2975,8 @@ sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
     xmm_src = _mm_unpacklo_epi8 (
 	create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
     xmm_alpha = expand_alpha_1x128 (xmm_src);
-    mmx_src   = _mm_movepi64_pi64 (xmm_src);
-    mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
+    mmx_src   = xmm_src;
+    mmx_alpha = xmm_alpha;
 
     while (height--)
     {
@@ -2902,11 +2995,11 @@ sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
 	    {
 		d = *pd;
 
-		mmx_mask = unpack_32_1x64 (m);
-		mmx_dest = unpack_32_1x64 (d);
+		mmx_mask = unpack_32_1x128 (m);
+		mmx_dest = unpack_32_1x128 (d);
 
-		*pd = pack_1x64_32 (
-		    _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
+		*pd = pack_1x128_32 (
+		    _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src), mmx_dest));
 	    }
 
 	    pd++;
@@ -2950,11 +3043,11 @@ sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
 	    {
 		d = *pd;
 
-		mmx_mask = unpack_32_1x64 (m);
-		mmx_dest = unpack_32_1x64 (d);
+		mmx_mask = unpack_32_1x128 (m);
+		mmx_dest = unpack_32_1x128 (d);
 
-		*pd = pack_1x64_32 (
-		    _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
+		*pd = pack_1x128_32 (
+		    _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src), mmx_dest));
 	    }
 
 	    pd++;
@@ -2994,7 +3087,7 @@ sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
 
-    __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
+    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
 
     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
 
@@ -3009,8 +3102,8 @@ sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
     xmm_src = _mm_unpacklo_epi8 (
 	create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
     xmm_alpha = expand_alpha_1x128 (xmm_src);
-    mmx_src   = _mm_movepi64_pi64 (xmm_src);
-    mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
+    mmx_src   = xmm_src;
+    mmx_alpha = xmm_alpha;
 
     while (height--)
     {
@@ -3028,10 +3121,10 @@ sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
 	    if (m)
 	    {
 		d = *pd;
-		mmx_mask = unpack_32_1x64 (m);
-		mmx_dest = unpack_32_1x64 (d);
+		mmx_mask = unpack_32_1x128 (m);
+		mmx_dest = unpack_32_1x128 (d);
 
-		*pd = pack_1x64_32 (in_over_1x64 (&mmx_src,
+		*pd = pack_1x128_32 (in_over_1x128 (&mmx_src,
 		                                  &mmx_alpha,
 		                                  &mmx_mask,
 		                                  &mmx_dest));
@@ -3078,11 +3171,11 @@ sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
 	    if (m)
 	    {
 		d = *pd;
-		mmx_mask = unpack_32_1x64 (m);
-		mmx_dest = unpack_32_1x64 (d);
+		mmx_mask = unpack_32_1x128 (m);
+		mmx_dest = unpack_32_1x128 (d);
 
-		*pd = pack_1x64_32 (
-		    in_over_1x64 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
+		*pd = pack_1x128_32 (
+		    in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
 	    }
 
 	    pd++;
@@ -3148,13 +3241,13 @@ sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
 	    {
 		uint32_t d = *dst;
 		
-		__m64 ms = unpack_32_1x64 (s);
-		__m64 alpha    = expand_alpha_1x64 (ms);
-		__m64 dest     = _mm_movepi64_pi64 (xmm_mask);
-		__m64 alpha_dst = unpack_32_1x64 (d);
+		__m128i ms = unpack_32_1x128 (s);
+		__m128i alpha    = expand_alpha_1x128 (ms);
+		__m128i dest     = xmm_mask;
+		__m128i alpha_dst = unpack_32_1x128 (d);
 		
-		*dst = pack_1x64_32 (
-		    in_over_1x64 (&ms, &alpha, &dest, &alpha_dst));
+		*dst = pack_1x128_32 (
+		    in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
 	    }
 	    dst++;
 	    w--;
@@ -3195,13 +3288,13 @@ sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
 	    {
 		uint32_t d = *dst;
 		
-		__m64 ms = unpack_32_1x64 (s);
-		__m64 alpha = expand_alpha_1x64 (ms);
-		__m64 mask  = _mm_movepi64_pi64 (xmm_mask);
-		__m64 dest  = unpack_32_1x64 (d);
+		__m128i ms = unpack_32_1x128 (s);
+		__m128i alpha = expand_alpha_1x128 (ms);
+		__m128i mask  = xmm_mask;
+		__m128i dest  = unpack_32_1x128 (d);
 		
-		*dst = pack_1x64_32 (
-		    in_over_1x64 (&ms, &alpha, &mask, &dest));
+		*dst = pack_1x128_32 (
+		    in_over_1x128 (&ms, &alpha, &mask, &dest));
 	    }
 
 	    dst++;
@@ -3336,13 +3429,13 @@ sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
 	    uint32_t s = (*src++) | 0xff000000;
 	    uint32_t d = *dst;
 
-	    __m64 src   = unpack_32_1x64 (s);
-	    __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
-	    __m64 mask  = _mm_movepi64_pi64 (xmm_mask);
-	    __m64 dest  = unpack_32_1x64 (d);
+	    __m128i src   = unpack_32_1x128 (s);
+	    __m128i alpha = xmm_alpha;
+	    __m128i mask  = xmm_mask;
+	    __m128i dest  = unpack_32_1x128 (d);
 
-	    *dst++ = pack_1x64_32 (
-		in_over_1x64 (&src, &alpha, &mask, &dest));
+	    *dst++ = pack_1x128_32 (
+		in_over_1x128 (&src, &alpha, &mask, &dest));
 
 	    w--;
 	}
@@ -3375,13 +3468,13 @@ sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
 	    uint32_t s = (*src++) | 0xff000000;
 	    uint32_t d = *dst;
 
-	    __m64 src  = unpack_32_1x64 (s);
-	    __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
-	    __m64 mask  = _mm_movepi64_pi64 (xmm_mask);
-	    __m64 dest  = unpack_32_1x64 (d);
+	    __m128i src  = unpack_32_1x128 (s);
+	    __m128i alpha = xmm_alpha;
+	    __m128i mask  = xmm_mask;
+	    __m128i dest  = unpack_32_1x128 (d);
 
-	    *dst++ = pack_1x64_32 (
-		in_over_1x64 (&src, &alpha, &mask, &dest));
+	    *dst++ = pack_1x128_32 (
+		in_over_1x128 (&src, &alpha, &mask, &dest));
 
 	    w--;
 	}
@@ -3436,13 +3529,13 @@ sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
 static force_inline uint16_t
 composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
 {
-    __m64 ms;
+    __m128i ms;
 
-    ms = unpack_32_1x64 (src);
+    ms = unpack_32_1x128 (src);
     return pack_565_32_16 (
-	pack_1x64_32 (
-	    over_1x64 (
-		ms, expand_alpha_1x64 (ms), expand565_16_1x64 (dst))));
+	pack_1x128_32 (
+	    over_1x128 (
+		ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst))));
 }
 
 static void
@@ -3588,7 +3681,7 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
 
-    __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
+    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
 
     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
 
@@ -3604,8 +3697,8 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
     xmm_def = create_mask_2x32_128 (src, src);
     xmm_src = expand_pixel_32_1x128 (src);
     xmm_alpha = expand_alpha_1x128 (xmm_src);
-    mmx_src   = _mm_movepi64_pi64 (xmm_src);
-    mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
+    mmx_src   = xmm_src;
+    mmx_alpha = xmm_alpha;
 
     while (height--)
     {
@@ -3622,10 +3715,10 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
 	    if (m)
 	    {
 		d = *dst;
-		mmx_mask = expand_pixel_8_1x64 (m);
-		mmx_dest = unpack_32_1x64 (d);
+		mmx_mask = expand_pixel_8_1x128 (m);
+		mmx_dest = unpack_32_1x128 (d);
 
-		*dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
+		*dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
 		                                   &mmx_alpha,
 		                                   &mmx_mask,
 		                                   &mmx_dest));
@@ -3677,10 +3770,10 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
 	    if (m)
 	    {
 		d = *dst;
-		mmx_mask = expand_pixel_8_1x64 (m);
-		mmx_dest = unpack_32_1x64 (d);
+		mmx_mask = expand_pixel_8_1x128 (m);
+		mmx_dest = unpack_32_1x128 (d);
 
-		*dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
+		*dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
 		                                   &mmx_alpha,
 		                                   &mmx_mask,
 		                                   &mmx_dest));
@@ -3907,9 +4000,8 @@ sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
 
 	    if (m)
 	    {
-		*dst = pack_1x64_32 (
-		    pix_multiply_1x64 (
-			_mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
+		*dst = pack_1x128_32 (
+		    pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)));
 	    }
 	    else
 	    {
@@ -3962,9 +4054,9 @@ sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
 
 	    if (m)
 	    {
-		*dst = pack_1x64_32 (
-		    pix_multiply_1x64 (
-			_mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
+		*dst = pack_1x128_32 (
+		    pix_multiply_1x128 (
+			xmm_src, expand_pixel_8_1x128 (m)));
 	    }
 	    else
 	    {
@@ -4004,7 +4096,7 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
     int dst_stride, mask_stride;
     int32_t w;
     uint32_t m;
-    __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
+    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
 
     __m128i xmm_src, xmm_alpha;
     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
@@ -4023,8 +4115,8 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
 
     xmm_src = expand_pixel_32_1x128 (src);
     xmm_alpha = expand_alpha_1x128 (xmm_src);
-    mmx_src = _mm_movepi64_pi64 (xmm_src);
-    mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
+    mmx_src = xmm_src;
+    mmx_alpha = xmm_alpha;
 
     while (height--)
     {
@@ -4041,12 +4133,12 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
 	    if (m)
 	    {
 		d = *dst;
-		mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
-		mmx_dest = expand565_16_1x64 (d);
+		mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
+		mmx_dest = expand565_16_1x128 (d);
 
 		*dst = pack_565_32_16 (
-		    pack_1x64_32 (
-			in_over_1x64 (
+		    pack_1x128_32 (
+			in_over_1x128 (
 			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
 	    }
 
@@ -4114,12 +4206,12 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
 	    if (m)
 	    {
 		d = *dst;
-		mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
-		mmx_dest = expand565_16_1x64 (d);
+		mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
+		mmx_dest = expand565_16_1x128 (d);
 
 		*dst = pack_565_32_16 (
-		    pack_1x64_32 (
-			in_over_1x64 (
+		    pack_1x128_32 (
+			in_over_1x128 (
 			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
 	    }
 
@@ -4156,7 +4248,7 @@ sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
     int32_t w;
     uint32_t opaque, zero;
 
-    __m64 ms;
+    __m128i ms;
     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
 
@@ -4187,11 +4279,11 @@ sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
 	    s = *src++;
 	    d = *dst;
 
-	    ms = unpack_32_1x64 (s);
+	    ms = unpack_32_1x128 (s);
 
 	    *dst++ = pack_565_32_16 (
-		pack_1x64_32 (
-		    over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
+		pack_1x128_32 (
+		    over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
 	    w--;
 	}
 
@@ -4253,11 +4345,11 @@ sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
 	    s = *src++;
 	    d = *dst;
 
-	    ms = unpack_32_1x64 (s);
+	    ms = unpack_32_1x128 (s);
 
 	    *dst++ = pack_565_32_16 (
-		pack_1x64_32 (
-		    over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
+		pack_1x128_32 (
+		    over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
 	    w--;
 	}
     }
@@ -4320,9 +4412,9 @@ sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
 	    s = *src++;
 	    d = *dst;
 
-	    *dst++ = pack_1x64_32 (
-		over_rev_non_pre_1x64 (
-		    unpack_32_1x64 (s), unpack_32_1x64 (d)));
+	    *dst++ = pack_1x128_32 (
+		over_rev_non_pre_1x128 (
+		    unpack_32_1x128 (s), unpack_32_1x128 (d)));
 
 	    w--;
 	}
@@ -4367,9 +4459,9 @@ sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
 	    s = *src++;
 	    d = *dst;
 
-	    *dst++ = pack_1x64_32 (
-		over_rev_non_pre_1x64 (
-		    unpack_32_1x64 (s), unpack_32_1x64 (d)));
+	    *dst++ = pack_1x128_32 (
+		over_rev_non_pre_1x128 (
+		    unpack_32_1x128 (s), unpack_32_1x128 (d)));
 
 	    w--;
 	}
@@ -4408,7 +4500,7 @@ sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
 
-    __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
+    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
 
     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
 
@@ -4422,8 +4514,8 @@ sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
 
     xmm_src = expand_pixel_32_1x128 (src);
     xmm_alpha = expand_alpha_1x128 (xmm_src);
-    mmx_src = _mm_movepi64_pi64 (xmm_src);
-    mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
+    mmx_src = xmm_src;
+    mmx_alpha = xmm_alpha;
 
     while (height--)
     {
@@ -4440,12 +4532,12 @@ sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
 	    if (m)
 	    {
 		d = *dst;
-		mmx_mask = unpack_32_1x64 (m);
-		mmx_dest = expand565_16_1x64 (d);
+		mmx_mask = unpack_32_1x128 (m);
+		mmx_dest = expand565_16_1x128 (d);
 
 		*dst = pack_565_32_16 (
-		    pack_1x64_32 (
-			in_over_1x64 (
+		    pack_1x128_32 (
+			in_over_1x128 (
 			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
 	    }
 
@@ -4509,12 +4601,12 @@ sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
 	    if (m)
 	    {
 		d = *dst;
-		mmx_mask = unpack_32_1x64 (m);
-		mmx_dest = expand565_16_1x64 (d);
+		mmx_mask = unpack_32_1x128 (m);
+		mmx_dest = expand565_16_1x128 (d);
 
 		*dst = pack_565_32_16 (
-		    pack_1x64_32 (
-			in_over_1x64 (
+		    pack_1x128_32 (
+			in_over_1x128 (
 			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
 	    }
 
@@ -4582,11 +4674,11 @@ sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
 	    m = (uint32_t) *mask++;
 	    d = (uint32_t) *dst;
 
-	    *dst++ = (uint8_t) pack_1x64_32 (
-		pix_multiply_1x64 (
-		    pix_multiply_1x64 (_mm_movepi64_pi64 (xmm_alpha),
-				       unpack_32_1x64 (m)),
-		    unpack_32_1x64 (d)));
+	    *dst++ = (uint8_t) pack_1x128_32 (
+		pix_multiply_1x128 (
+		    pix_multiply_1x128 (xmm_alpha,
+				       unpack_32_1x128 (m)),
+		    unpack_32_1x128 (d)));
 	    w--;
 	}
 
@@ -4619,11 +4711,11 @@ sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
 	    m = (uint32_t) *mask++;
 	    d = (uint32_t) *dst;
 
-	    *dst++ = (uint8_t) pack_1x64_32 (
-		pix_multiply_1x64 (
-		    pix_multiply_1x64 (
-			_mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
-		    unpack_32_1x64 (d)));
+	    *dst++ = (uint8_t) pack_1x128_32 (
+		pix_multiply_1x128 (
+		    pix_multiply_1x128 (
+			xmm_alpha, unpack_32_1x128 (m)),
+		    unpack_32_1x128 (d)));
 	    w--;
 	}
     }
@@ -4689,10 +4781,10 @@ sse2_composite_in_n_8 (pixman_implementation_t *imp,
 	{
 	    d = (uint32_t) *dst;
 
-	    *dst++ = (uint8_t) pack_1x64_32 (
-		pix_multiply_1x64 (
-		    _mm_movepi64_pi64 (xmm_alpha),
-		    unpack_32_1x64 (d)));
+	    *dst++ = (uint8_t) pack_1x128_32 (
+		pix_multiply_1x128 (
+		    xmm_alpha,
+		    unpack_32_1x128 (d)));
 	    w--;
 	}
 
@@ -4717,10 +4809,10 @@ sse2_composite_in_n_8 (pixman_implementation_t *imp,
 	{
 	    d = (uint32_t) *dst;
 
-	    *dst++ = (uint8_t) pack_1x64_32 (
-		pix_multiply_1x64 (
-		    _mm_movepi64_pi64 (xmm_alpha),
-		    unpack_32_1x64 (d)));
+	    *dst++ = (uint8_t) pack_1x128_32 (
+		pix_multiply_1x128 (
+		    xmm_alpha,
+		    unpack_32_1x128 (d)));
 	    w--;
 	}
     }
@@ -4774,9 +4866,9 @@ sse2_composite_in_8_8 (pixman_implementation_t *imp,
 	    s = (uint32_t) *src++;
 	    d = (uint32_t) *dst;
 
-	    *dst++ = (uint8_t) pack_1x64_32 (
-		pix_multiply_1x64 (
-		    unpack_32_1x64 (s), unpack_32_1x64 (d)));
+	    *dst++ = (uint8_t) pack_1x128_32 (
+		pix_multiply_1x128 (
+		    unpack_32_1x128 (s), unpack_32_1x128 (d)));
 	    w--;
 	}
 
@@ -4805,8 +4897,8 @@ sse2_composite_in_8_8 (pixman_implementation_t *imp,
 	    s = (uint32_t) *src++;
 	    d = (uint32_t) *dst;
 
-	    *dst++ = (uint8_t) pack_1x64_32 (
-		pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));
+	    *dst++ = (uint8_t) pack_1x128_32 (
+		pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (d)));
 	    w--;
 	}
     }
@@ -4869,11 +4961,11 @@ sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
 	    m = (uint32_t) *mask++;
 	    d = (uint32_t) *dst;
 
-	    *dst++ = (uint8_t) pack_1x64_32 (
-		_mm_adds_pu16 (
-		    pix_multiply_1x64 (
-			_mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
-		    unpack_32_1x64 (d)));
+	    *dst++ = (uint8_t) pack_1x128_32 (
+		_mm_adds_epu16 (
+		    pix_multiply_1x128 (
+			xmm_alpha, unpack_32_1x128 (m)),
+		    unpack_32_1x128 (d)));
 	    w--;
 	}
 
@@ -4905,11 +4997,11 @@ sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
 	    m = (uint32_t) *mask++;
 	    d = (uint32_t) *dst;
 
-	    *dst++ = (uint8_t) pack_1x64_32 (
-		_mm_adds_pu16 (
-		    pix_multiply_1x64 (
-			_mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
-		    unpack_32_1x64 (d)));
+	    *dst++ = (uint8_t) pack_1x128_32 (
+		_mm_adds_epu16 (
+		    pix_multiply_1x128 (
+			xmm_alpha, unpack_32_1x128 (m)),
+		    unpack_32_1x128 (d)));
 
 	    w--;
 	}
@@ -4973,10 +5065,10 @@ sse2_composite_add_n_8 (pixman_implementation_t *imp,
 
 	while (w && ((unsigned long)dst & 15))
 	{
-	    *dst = (uint8_t)_mm_cvtsi64_si32 (
-		_mm_adds_pu8 (
-		    _mm_movepi64_pi64 (xmm_src),
-		    _mm_cvtsi32_si64 (*dst)));
+	    *dst = (uint8_t)_mm_cvtsi128_si32 (
+		_mm_adds_epu8 (
+		    xmm_src,
+		    _mm_cvtsi32_si128 (*dst)));
 
 	    w--;
 	    dst++;
@@ -4993,10 +5085,10 @@ sse2_composite_add_n_8 (pixman_implementation_t *imp,
 
 	while (w)
 	{
-	    *dst = (uint8_t)_mm_cvtsi64_si32 (
-		_mm_adds_pu8 (
-		    _mm_movepi64_pi64 (xmm_src),
-		    _mm_cvtsi32_si64 (*dst)));
+	    *dst = (uint8_t)_mm_cvtsi128_si32 (
+		_mm_adds_epu8 (
+		    xmm_src,
+		    _mm_cvtsi32_si128 (*dst)));
 
 	    w--;
 	    dst++;
@@ -5284,7 +5376,7 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
     uint32_t m;
     int src_stride, mask_stride, dst_stride;
     int32_t w;
-    __m64 ms;
+    __m128i ms;
 
     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
@@ -5313,17 +5405,17 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
             s = 0xff000000 | *src++;
             m = (uint32_t) *mask++;
             d = *dst;
-            ms = unpack_32_1x64 (s);
+            ms = unpack_32_1x128 (s);
 
             if (m != 0xff)
             {
-		__m64 ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
-		__m64 md = unpack_32_1x64 (d);
+		__m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
+		__m128i md = unpack_32_1x128 (d);
 
-                ms = in_over_1x64 (&ms, &mask_x00ff, &ma, &md);
+                ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md);
             }
 
-            *dst++ = pack_1x64_32 (ms);
+            *dst++ = pack_1x128_32 (ms);
             w--;
         }
 
@@ -5373,15 +5465,15 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
                 }
                 else
                 {
-		    __m64 ma, md, ms;
+		    __m128i ma, md, ms;
 
                     d = *dst;
 
-		    ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
-		    md = unpack_32_1x64 (d);
-		    ms = unpack_32_1x64 (s);
+		    ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
+		    md = unpack_32_1x128 (d);
+		    ms = unpack_32_1x128 (s);
 
-                    *dst = pack_1x64_32 (in_over_1x64 (&ms, &mask_x00ff, &ma, &md));
+                    *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md));
                 }
 
             }
@@ -5457,15 +5549,15 @@ sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
 		}
 		else
 		{
-		    __m64 ms, md, ma, msa;
+		    __m128i ms, md, ma, msa;
 
-		    ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
-		    ms = unpack_32_1x64 (s);
-		    md = unpack_32_1x64 (d);
+		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+		    ms = unpack_32_1x128 (s);
+		    md = unpack_32_1x128 (d);
 
-		    msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
+		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
 
-		    *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
+		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
 		}
 	    }
 
@@ -5529,15 +5621,15 @@ sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
 		}
 		else
 		{
-		    __m64 ms, md, ma, msa;
+		    __m128i ms, md, ma, msa;
 
-		    ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
-		    ms = unpack_32_1x64 (s);
-		    md = unpack_32_1x64 (d);
+		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+		    ms = unpack_32_1x128 (s);
+		    md = unpack_32_1x128 (d);
 
-		    msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
+		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
 
-		    *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
+		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
 		}
 	    }
 
@@ -5591,12 +5683,12 @@ sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
 
 	while (w && (unsigned long)dst & 15)
 	{
-	    __m64 vd;
+	    __m128i vd;
 
-	    vd = unpack_32_1x64 (*dst);
+	    vd = unpack_32_1x128 (*dst);
 
-	    *dst = pack_1x64_32 (over_1x64 (vd, expand_alpha_1x64 (vd),
-					    _mm_movepi64_pi64 (xmm_src)));
+	    *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
+					      xmm_src));
 	    w--;
 	    dst++;
 	}
@@ -5626,12 +5718,12 @@ sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
 
 	while (w)
 	{
-	    __m64 vd;
+	    __m128i vd;
 
-	    vd = unpack_32_1x64 (*dst);
+	    vd = unpack_32_1x128 (*dst);
 
-	    *dst = pack_1x64_32 (over_1x64 (vd, expand_alpha_1x64 (vd),
-					    _mm_movepi64_pi64 (xmm_src)));
+	    *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
+					      xmm_src));
 	    w--;
 	    dst++;
 	}
@@ -5703,15 +5795,15 @@ sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
 		}
 		else
 		{
-		    __m64 ms, md, ma, msa;
+		    __m128i ms, md, ma, msa;
 
-		    ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
-		    ms = unpack_32_1x64 (s);
-		    md = unpack_32_1x64 (d);
+		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+		    ms = unpack_32_1x128 (s);
+		    md = unpack_32_1x128 (d);
 
-		    msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
+		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
 
-		    *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
+		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
 		}
 	    }
 
@@ -5773,15 +5865,15 @@ sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
 		}
 		else
 		{
-		    __m64 ms, md, ma, msa;
+		    __m128i ms, md, ma, msa;
 
-		    ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
-		    ms = unpack_32_1x64 (s);
-		    md = unpack_32_1x64 (d);
+		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+		    ms = unpack_32_1x128 (s);
+		    md = unpack_32_1x128 (d);
 
-		    msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
+		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
 
-		    *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
+		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
 		}
 	    }
 
@@ -5927,13 +6019,13 @@ scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
 	{
 	    uint32_t d = *dst;
 
-	    __m64 ms = unpack_32_1x64 (s);
-	    __m64 alpha     = expand_alpha_1x64 (ms);
-	    __m64 dest      = _mm_movepi64_pi64 (xmm_mask);
-	    __m64 alpha_dst = unpack_32_1x64 (d);
+	    __m128i ms = unpack_32_1x128 (s);
+	    __m128i alpha     = expand_alpha_1x128 (ms);
+	    __m128i dest      = xmm_mask;
+	    __m128i alpha_dst = unpack_32_1x128 (d);
 
-	    *dst = pack_1x64_32 (
-		in_over_1x64 (&ms, &alpha, &dest, &alpha_dst));
+	    *dst = pack_1x128_32 (
+		in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
 	}
 	dst++;
 	w--;
@@ -5985,13 +6077,13 @@ scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
 	{
 	    uint32_t d = *dst;
 
-	    __m64 ms = unpack_32_1x64 (s);
-	    __m64 alpha = expand_alpha_1x64 (ms);
-	    __m64 mask  = _mm_movepi64_pi64 (xmm_mask);
-	    __m64 dest  = unpack_32_1x64 (d);
+	    __m128i ms = unpack_32_1x128 (s);
+	    __m128i alpha = expand_alpha_1x128 (ms);
+	    __m128i mask  = xmm_mask;
+	    __m128i dest  = unpack_32_1x128 (d);
 
-	    *dst = pack_1x64_32 (
-		in_over_1x64 (&ms, &alpha, &mask, &dest));
+	    *dst = pack_1x128_32 (
+		in_over_1x128 (&ms, &alpha, &mask, &dest));
 	}
 
 	dst++;
commit 7fb75bb3e6c3e004374d186ea2d6f02d1caccba4
Author: Søren Sandmann Pedersen <ssp at redhat.com>
Date:   Fri Feb 18 03:57:55 2011 -0500

    Coding style:  core_combine_in_u_pixelsse2 -> core_combine_in_u_pixel_sse2

diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index 2e135e2..c4ff3c1 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -847,7 +847,7 @@ core_combine_over_reverse_u_sse2 (uint32_t*       pd,
 }
 
 static force_inline uint32_t
-core_combine_in_u_pixelsse2 (uint32_t src, uint32_t dst)
+core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst)
 {
     uint32_t maska = src >> 24;
 
@@ -881,7 +881,7 @@ core_combine_in_u_sse2 (uint32_t*       pd,
 	s = combine1 (ps, pm);
 	d = *pd;
 
-	*pd++ = core_combine_in_u_pixelsse2 (d, s);
+	*pd++ = core_combine_in_u_pixel_sse2 (d, s);
 	w--;
 	ps++;
 	if (pm)
@@ -916,7 +916,7 @@ core_combine_in_u_sse2 (uint32_t*       pd,
 	s = combine1 (ps, pm);
 	d = *pd;
 
-	*pd++ = core_combine_in_u_pixelsse2 (d, s);
+	*pd++ = core_combine_in_u_pixel_sse2 (d, s);
 	w--;
 	ps++;
 	if (pm)
@@ -940,7 +940,7 @@ core_combine_reverse_in_u_sse2 (uint32_t*       pd,
 	s = combine1 (ps, pm);
 	d = *pd;
 
-	*pd++ = core_combine_in_u_pixelsse2 (s, d);
+	*pd++ = core_combine_in_u_pixel_sse2 (s, d);
 	ps++;
 	w--;
 	if (pm)
@@ -975,7 +975,7 @@ core_combine_reverse_in_u_sse2 (uint32_t*       pd,
 	s = combine1 (ps, pm);
 	d = *pd;
 
-	*pd++ = core_combine_in_u_pixelsse2 (s, d);
+	*pd++ = core_combine_in_u_pixel_sse2 (s, d);
 	w--;
 	ps++;
 	if (pm)

