pixman: Branch 'master' - 15 commits

Matt Turner mattst88 at kemper.freedesktop.org
Fri Apr 27 13:47:47 PDT 2012


 configure.ac               |   54 ++++
 pixman/Makefile.am         |   12 
 pixman/loongson-mmintrin.h |  273 ++++++++++++++++++++++
 pixman/pixman-cpu.c        |   37 ++-
 pixman/pixman-mmx.c        |  546 +++++++++++++++++++++++++++++++++++++--------
 pixman/pixman-private.h    |    2 
 6 files changed, 823 insertions(+), 101 deletions(-)

New commits:
commit 2d431b53d3cdbf1997e2d3b8e17408c12220c3a1
Author: Matt Turner <mattst88 at gmail.com>
Date:   Fri Apr 27 14:12:56 2012 -0400

    mmx: Use wpackhus in src_x888_0565 on iwMMXt
    
    iwMMXt has an unsigned saturation pack instruction, while MMX/EXT
    and Loongson don't.
    
    ARM/iwMMXt:
    src_8888_0565 =  L1: 110.38  L2:  82.33  M: 40.92 ( 73.22%)  HT: 35.63  VT: 32.22  R: 30.07  RT: 18.40 ( 132Kops/s)
    src_8888_0565 =  L1: 117.91  L2:  83.05  M: 41.52 ( 75.58%)  HT: 37.63  VT: 35.40  R: 29.37  RT: 19.39 ( 134Kops/s)

diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index 7fe19d5..b14201a 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -589,9 +589,13 @@ pack_4xpacked565 (__m64 a, __m64 b)
     t1 = _mm_or_si64 (t1, g1);
 
     t0 = shift(t0, -5);
+#ifdef USE_ARM_IWMMXT
+    t1 = shift(t1, -5);
+    return _mm_packs_pu32 (t0, t1);
+#else
     t1 = shift(t1, -5 + 16);
-
     return _mm_shuffle_pi16 (_mm_or_si64 (t0, t1), _MM_SHUFFLE (3, 1, 2, 0));
+#endif
 }
 
 #ifndef _MSC_VER
commit 2ddd1c498b723e8e48a38eef01d5befba30b5259
Author: Matt Turner <mattst88 at gmail.com>
Date:   Thu Apr 19 17:33:27 2012 -0400

    mmx: add src_8888_0565
    
    Uses the pmadd technique described in
    http://software.intel.com/sites/landingpage/legacy/mmx/MMX_App_24-16_Bit_Conversion.pdf
    
    The technique uses the packssdw instruction which uses signed
    saturation. This works in their example because they pack 888 to 555
    leaving the high bit as zero. For packing to 565, it is unsuitable, so
    we replace it with an or+shuffle.
    
    Loongson:
    src_8888_0565 =  L1: 106.13  L2:  83.57  M: 33.46 ( 68.90%)  HT: 30.29  VT: 27.67  R: 26.11  RT: 15.06 ( 135Kops/s)
    src_8888_0565 =  L1: 122.10  L2: 117.53  M: 37.97 ( 78.58%)  HT: 33.14  VT: 30.09  R: 29.01  RT: 15.76 ( 139Kops/s)
    
    ARM/iwMMXt:
    src_8888_0565 =  L1:  67.88  L2:  56.61  M: 31.20 ( 56.74%)  HT: 29.22  VT: 27.01  R: 25.39  RT: 19.29 ( 130Kops/s)
    src_8888_0565 =  L1: 110.38  L2:  82.33  M: 40.92 ( 73.22%)  HT: 35.63  VT: 32.22  R: 30.07  RT: 18.40 ( 132Kops/s)

diff --git a/pixman/loongson-mmintrin.h b/pixman/loongson-mmintrin.h
index 76ae892..8295ba0 100644
--- a/pixman/loongson-mmintrin.h
+++ b/pixman/loongson-mmintrin.h
@@ -84,6 +84,17 @@ _mm_empty (void)
 }
 
 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_madd_pi16 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("pmaddhw %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_mulhi_pu16 (__m64 __m1, __m64 __m2)
 {
 	__m64 ret;
diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index 320e20a..7fe19d5 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -179,9 +179,12 @@ typedef struct
     mmxdatafield mmx_4x0080;
     mmxdatafield mmx_565_rgb;
     mmxdatafield mmx_565_unpack_multiplier;
+    mmxdatafield mmx_565_pack_multiplier;
     mmxdatafield mmx_565_r;
     mmxdatafield mmx_565_g;
     mmxdatafield mmx_565_b;
+    mmxdatafield mmx_packed_565_rb;
+    mmxdatafield mmx_packed_565_g;
 #ifndef USE_LOONGSON_MMI
     mmxdatafield mmx_mask_0;
     mmxdatafield mmx_mask_1;
@@ -207,9 +210,12 @@ static const mmx_data_t c =
     MMXDATA_INIT (.mmx_4x0080,                   0x0080008000800080),
     MMXDATA_INIT (.mmx_565_rgb,                  0x000001f0003f001f),
     MMXDATA_INIT (.mmx_565_unpack_multiplier,    0x0000008404100840),
+    MMXDATA_INIT (.mmx_565_pack_multiplier,      0x2000000420000004),
     MMXDATA_INIT (.mmx_565_r,                    0x000000f800000000),
     MMXDATA_INIT (.mmx_565_g,                    0x0000000000fc0000),
     MMXDATA_INIT (.mmx_565_b,                    0x00000000000000f8),
+    MMXDATA_INIT (.mmx_packed_565_rb,            0x00f800f800f800f8),
+    MMXDATA_INIT (.mmx_packed_565_g,             0x0000fc000000fc00),
 #ifndef USE_LOONGSON_MMI
     MMXDATA_INIT (.mmx_mask_0,                   0xffffffffffff0000),
     MMXDATA_INIT (.mmx_mask_1,                   0xffffffff0000ffff),
@@ -567,6 +573,27 @@ pack_565 (__m64 pixel, __m64 target, int pos)
 #endif
 }
 
+static force_inline __m64
+pack_4xpacked565 (__m64 a, __m64 b)
+{
+    __m64 rb0 = _mm_and_si64 (a, MC (packed_565_rb));
+    __m64 rb1 = _mm_and_si64 (b, MC (packed_565_rb));
+
+    __m64 t0 = _mm_madd_pi16 (rb0, MC (565_pack_multiplier));
+    __m64 t1 = _mm_madd_pi16 (rb1, MC (565_pack_multiplier));
+
+    __m64 g0 = _mm_and_si64 (a, MC (packed_565_g));
+    __m64 g1 = _mm_and_si64 (b, MC (packed_565_g));
+
+    t0 = _mm_or_si64 (t0, g0);
+    t1 = _mm_or_si64 (t1, g1);
+
+    t0 = shift(t0, -5);
+    t1 = shift(t1, -5 + 16);
+
+    return _mm_shuffle_pi16 (_mm_or_si64 (t0, t1), _MM_SHUFFLE (3, 1, 2, 0));
+}
+
 #ifndef _MSC_VER
 
 static force_inline __m64
@@ -2091,6 +2118,60 @@ pixman_fill_mmx (uint32_t *bits,
 }
 
 static void
+mmx_composite_src_x888_0565 (pixman_implementation_t *imp,
+                             pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint16_t    *dst_line, *dst;
+    uint32_t    *src_line, *src, s;
+    int dst_stride, src_stride;
+    int32_t w;
+
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	while (w && (unsigned long)dst & 7)
+	{
+	    s = *src++;
+	    *dst = CONVERT_8888_TO_0565 (s);
+	    dst++;
+	    w--;
+	}
+
+	while (w >= 4)
+	{
+	    __m64 vdest;
+	    __m64 vsrc0 = ldq_u ((__m64 *)(src + 0));
+	    __m64 vsrc1 = ldq_u ((__m64 *)(src + 2));
+
+	    vdest = pack_4xpacked565 (vsrc0, vsrc1);
+
+	    *(__m64 *)dst = vdest;
+
+	    w -= 4;
+	    src += 4;
+	    dst += 4;
+	}
+
+	while (w)
+	{
+	    s = *src++;
+	    *dst = CONVERT_8888_TO_0565 (s);
+	    dst++;
+	    w--;
+	}
+    }
+}
+
+static void
 mmx_composite_src_n_8_8888 (pixman_implementation_t *imp,
                             pixman_composite_info_t *info)
 {
@@ -3433,6 +3514,10 @@ static const pixman_fast_path_t mmx_fast_paths[] =
     PIXMAN_STD_FAST_PATH    (ADD,  a8,       null,     a8,       mmx_composite_add_8_8		   ),
     PIXMAN_STD_FAST_PATH    (ADD,  solid,    a8,       a8,       mmx_composite_add_n_8_8           ),
 
+    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     r5g6b5,   mmx_composite_src_x888_0565       ),
+    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     b5g6r5,   mmx_composite_src_x888_0565       ),
+    PIXMAN_STD_FAST_PATH    (SRC,  x8r8g8b8, null,     r5g6b5,   mmx_composite_src_x888_0565       ),
+    PIXMAN_STD_FAST_PATH    (SRC,  x8b8g8r8, null,     b5g6r5,   mmx_composite_src_x888_0565       ),
     PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8r8g8b8, mmx_composite_src_n_8_8888        ),
     PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8r8g8b8, mmx_composite_src_n_8_8888        ),
     PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8b8g8r8, mmx_composite_src_n_8_8888        ),
commit 3e8fe65a0893fcd82bdea205de49f53be32bb074
Author: Matt Turner <mattst88 at gmail.com>
Date:   Wed Apr 18 16:24:28 2012 -0400

    mmx: add x8r8g8b8 fetcher
    
    Loongson:
       add_x888_x888 =  L1:  29.36  L2:  27.81  M: 14.05 ( 38.74%)  HT: 12.45  VT: 11.78  R: 11.52  RT:  7.23 (  75Kops/s)
       add_x888_x888 =  L1:  36.06  L2:  34.55  M: 14.81 ( 41.03%)  HT: 14.01  VT: 13.41  R: 13.06  RT:  9.06 (  90Kops/s)
    
     src_x888_8_x888 =  L1:  21.92  L2:  20.15  M: 13.35 ( 41.42%)  HT: 11.70  VT: 10.95  R: 10.53  RT:  6.18 (  65Kops/s)
     src_x888_8_x888 =  L1:  25.43  L2:  23.51  M: 14.12 ( 44.00%)  HT: 13.14  VT: 12.50  R: 11.86  RT:  7.49 (  76Kops/s)
    
    over_x888_8_0565 =  L1:  10.64  L2:  10.17  M:  7.74 ( 21.35%)  HT:  6.83  VT:  6.55  R:  6.34  RT:  4.03 (  46Kops/s)
    over_x888_8_0565 =  L1:  11.41  L2:  10.97  M:  8.07 ( 22.36%)  HT:  7.42  VT:  7.18  R:  6.92  RT:  4.62 (  52Kops/s)
    
    ARM/iwMMXt:
       add_x888_x888 =  L1:  22.10  L2:  18.93  M: 13.48 ( 32.29%)  HT: 11.32  VT: 10.64  R: 10.36  RT:  6.51 (  61Kops/s)
       add_x888_x888 =  L1:  24.26  L2:  20.83  M: 14.52 ( 35.64%)  HT: 12.66  VT: 12.98  R: 11.34  RT:  7.69 (  72Kops/s)
    
     src_x888_8_x888 =  L1:  19.33  L2:  17.66  M: 14.26 ( 38.43%)  HT: 11.53  VT: 10.83  R: 10.57  RT:  6.12 (  58Kops/s)
     src_x888_8_x888 =  L1:  21.23  L2:  19.60  M: 15.41 ( 42.55%)  HT: 12.66  VT: 13.30  R: 11.55  RT:  7.32 (  67Kops/s)
    
    over_x888_8_0565 =  L1:   8.15  L2:   7.56  M:  6.50 ( 15.58%)  HT:  5.73  VT:  5.49  R:  5.50  RT:  3.53 (  38Kops/s)
    over_x888_8_0565 =  L1:   8.35  L2:   7.85  M:  6.68 ( 16.40%)  HT:  6.12  VT:  5.97  R:  5.78  RT:  4.03 (  43Kops/s)

diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index 98fb84e..320e20a 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -3199,6 +3199,47 @@ mmx_composite_over_x888_8_8888 (pixman_implementation_t *imp,
 }
 
 static uint32_t *
+mmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
+{
+    int w = iter->width;
+    uint32_t *dst = iter->buffer;
+    uint32_t *src = (uint32_t *)iter->bits;
+
+    iter->bits += iter->stride;
+
+    while (w && ((unsigned long)dst) & 7)
+    {
+	*dst++ = (*src++) | 0xff000000;
+	w--;
+    }
+
+    while (w >= 8)
+    {
+	__m64 vsrc1 = ldq_u ((__m64 *)(src + 0));
+	__m64 vsrc2 = ldq_u ((__m64 *)(src + 2));
+	__m64 vsrc3 = ldq_u ((__m64 *)(src + 4));
+	__m64 vsrc4 = ldq_u ((__m64 *)(src + 6));
+
+	*(__m64 *)(dst + 0) = _mm_or_si64 (vsrc1, MC (ff000000));
+	*(__m64 *)(dst + 2) = _mm_or_si64 (vsrc2, MC (ff000000));
+	*(__m64 *)(dst + 4) = _mm_or_si64 (vsrc3, MC (ff000000));
+	*(__m64 *)(dst + 6) = _mm_or_si64 (vsrc4, MC (ff000000));
+
+	dst += 8;
+	src += 8;
+	w -= 8;
+    }
+
+    while (w)
+    {
+	*dst++ = (*src++) | 0xff000000;
+	w--;
+    }
+
+    return iter->buffer;
+}
+
+static uint32_t *
 mmx_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
 {
     int w = iter->width;
@@ -3296,6 +3337,7 @@ typedef struct
 
 static const fetcher_info_t fetchers[] =
 {
+    { PIXMAN_x8r8g8b8,		mmx_fetch_x8r8g8b8 },
     { PIXMAN_r5g6b5,		mmx_fetch_r5g6b5 },
     { PIXMAN_a8,		mmx_fetch_a8 },
     { PIXMAN_null }
commit c2b1630d9603f80c2636e8a8bfebca87707d4235
Author: Matt Turner <mattst88 at gmail.com>
Date:   Wed Apr 18 16:14:08 2012 -0400

    mmx: add a8 fetcher
    
    oprofile of xfce4-terminal-a1
    210535    9.0407  libpixman-1.so.0.25.3    fetch_scanline_a8
    144802    6.0054  libpixman-1.so.0.25.3    mmx_fetch_a8
    
    Loongson:
           add_8_8_8 =  L1:  17.98  L2:  17.28  M: 14.28 ( 19.79%)  HT: 11.11  VT: 10.38  R:  9.97  RT:  5.14 (  55Kops/s)
           add_8_8_8 =  L1:  20.44  L2:  19.65  M: 15.62 ( 21.53%)  HT: 12.86  VT: 11.98  R: 11.32  RT:  6.13 (  64Kops/s)
    
     src_8888_8_0565 =  L1:  19.97  L2:  18.59  M: 13.42 ( 32.55%)  HT: 11.46  VT: 10.78  R: 10.33  RT:  5.87 (  61Kops/s)
     src_8888_8_0565 =  L1:  21.16  L2:  19.68  M: 13.94 ( 33.64%)  HT: 12.31  VT: 11.52  R: 11.02  RT:  6.54 (  68Kops/s)
    
     src_x888_8_x888 =  L1:  20.54  L2:  18.88  M: 13.07 ( 40.74%)  HT: 11.05  VT: 10.36  R: 10.02  RT:  5.68 (  60Kops/s)
     src_x888_8_x888 =  L1:  21.92  L2:  20.15  M: 13.35 ( 41.42%)  HT: 11.70  VT: 10.95  R: 10.53  RT:  6.18 (  65Kops/s)
    
    over_x888_8_0565 =  L1:  10.32  L2:   9.85  M:  7.63 ( 21.13%)  HT:  6.56  VT:  6.30  R:  6.12  RT:  3.80 (  43Kops/s)
    over_x888_8_0565 =  L1:  10.64  L2:  10.17  M:  7.74 ( 21.35%)  HT:  6.83  VT:  6.55  R:  6.34  RT:  4.03 (  46Kops/s)
    
    ARM/iwMMXt:
           add_8_8_8 =  L1:  13.10  L2:  11.67  M: 10.74 ( 13.46%)  HT:  8.62  VT:  8.15  R:  7.94  RT:  4.39 (  44Kops/s)
           add_8_8_8 =  L1:  13.81  L2:  12.79  M: 11.63 ( 13.93%)  HT:  9.33  VT:  9.20  R:  9.04  RT:  5.43 (  52Kops/s)
    
     src_8888_8_0565 =  L1:  16.62  L2:  15.07  M: 12.52 ( 27.46%)  HT: 10.07  VT: 10.17  R:  9.95  RT:  5.64 (  54Kops/s)
     src_8888_8_0565 =  L1:  16.84  L2:  16.11  M: 13.22 ( 27.71%)  HT: 11.74  VT: 10.90  R: 10.80  RT:  6.66 (  62Kops/s)
    
     src_x888_8_x888 =  L1:  17.49  L2:  16.22  M: 13.73 ( 38.73%)  HT: 10.10  VT: 10.33  R:  9.55  RT:  5.21 (  52Kops/s)
     src_x888_8_x888 =  L1:  19.33  L2:  17.66  M: 14.26 ( 38.43%)  HT: 11.53  VT: 10.83  R: 10.57  RT:  6.12 (  58Kops/s)
    
    over_x888_8_0565 =  L1:   7.57  L2:   7.29  M:  6.37 ( 15.97%)  HT:  5.53  VT:  5.33  R:  5.21  RT:  3.22 (  35Kops/s)
    over_x888_8_0565 =  L1:   8.15  L2:   7.56  M:  6.50 ( 15.58%)  HT:  5.73  VT:  5.49  R:  5.50  RT:  3.53 (  38Kops/s)

diff --git a/pixman/loongson-mmintrin.h b/pixman/loongson-mmintrin.h
index 508366c..76ae892 100644
--- a/pixman/loongson-mmintrin.h
+++ b/pixman/loongson-mmintrin.h
@@ -183,6 +183,17 @@ _mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
 }
 
 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("punpckhhw %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
 {
 	__m64 ret;
@@ -207,6 +218,17 @@ _mm_unpacklo_pi8_f (__m32 __m1, __m64 __m2)
 }
 
 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("punpcklhw %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_xor_si64 (__m64 __m1, __m64 __m2)
 {
 	__m64 ret;
diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index 80aa59d..98fb84e 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -3243,6 +3243,51 @@ mmx_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
     return iter->buffer;
 }
 
+static uint32_t *
+mmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
+{
+    int w = iter->width;
+    uint32_t *dst = iter->buffer;
+    uint8_t *src = iter->bits;
+
+    iter->bits += iter->stride;
+
+    while (w && (((unsigned long)dst) & 15))
+    {
+        *dst++ = *(src++) << 24;
+        w--;
+    }
+
+    while (w >= 8)
+    {
+	__m64 mm0 = ldq_u ((__m64 *)src);
+
+	__m64 mm1 = _mm_unpacklo_pi8  (_mm_setzero_si64(), mm0);
+	__m64 mm2 = _mm_unpackhi_pi8  (_mm_setzero_si64(), mm0);
+	__m64 mm3 = _mm_unpacklo_pi16 (_mm_setzero_si64(), mm1);
+	__m64 mm4 = _mm_unpackhi_pi16 (_mm_setzero_si64(), mm1);
+	__m64 mm5 = _mm_unpacklo_pi16 (_mm_setzero_si64(), mm2);
+	__m64 mm6 = _mm_unpackhi_pi16 (_mm_setzero_si64(), mm2);
+
+	*(__m64 *)(dst + 0) = mm3;
+	*(__m64 *)(dst + 2) = mm4;
+	*(__m64 *)(dst + 4) = mm5;
+	*(__m64 *)(dst + 6) = mm6;
+
+	dst += 8;
+	src += 8;
+	w -= 8;
+    }
+
+    while (w)
+    {
+	*dst++ = *(src++) << 24;
+	w--;
+    }
+
+    return iter->buffer;
+}
+
 typedef struct
 {
     pixman_format_code_t	format;
@@ -3252,6 +3297,7 @@ typedef struct
 static const fetcher_info_t fetchers[] =
 {
     { PIXMAN_r5g6b5,		mmx_fetch_r5g6b5 },
+    { PIXMAN_a8,		mmx_fetch_a8 },
     { PIXMAN_null }
 };
 
commit 20bad64d9a7ff5c2662f12a87f66fcf77c1f3f2c
Author: Matt Turner <mattst88 at gmail.com>
Date:   Wed Apr 18 16:08:57 2012 -0400

    mmx: add r5g6b5 fetcher
    
    Loongson:
    add_0565_0565 =  L1:  12.73  L2:  12.26  M: 10.05 ( 13.87%)  HT:  8.77  VT:  8.50  R:  8.25  RT:  5.28 (  58Kops/s)
    add_0565_0565 =  L1:  14.04  L2:  13.63  M: 10.96 ( 15.19%)  HT:  9.73  VT:  9.43  R:  9.11  RT:  5.93 (  64Kops/s)
    
    ARM/iwMMXt:
    add_0565_0565 =  L1:  10.36  L2:  10.03  M:  9.04 ( 10.88%)  HT:  3.11  VT:  7.16  R:  7.72  RT:  5.12 (  51Kops/s)
    add_0565_0565 =  L1:  10.84  L2:  10.20  M:  9.15 ( 11.46%)  HT:  7.60  VT:  7.82  R:  7.70  RT:  5.41 (  53Kops/s)

diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index d5d34ac..80aa59d 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -190,6 +190,7 @@ typedef struct
 #endif
     mmxdatafield mmx_full_alpha;
     mmxdatafield mmx_4x0101;
+    mmxdatafield mmx_ff000000;
 } mmx_data_t;
 
 #if defined(_MSC_VER)
@@ -217,6 +218,7 @@ static const mmx_data_t c =
 #endif
     MMXDATA_INIT (.mmx_full_alpha,               0x00ff000000000000),
     MMXDATA_INIT (.mmx_4x0101,                   0x0101010101010101),
+    MMXDATA_INIT (.mmx_ff000000,                 0xff000000ff000000),
 };
 
 #ifdef USE_CVT_INTRINSICS
@@ -3196,6 +3198,102 @@ mmx_composite_over_x888_8_8888 (pixman_implementation_t *imp,
     _mm_empty ();
 }
 
+static uint32_t *
+mmx_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
+{
+    int w = iter->width;
+    uint32_t *dst = iter->buffer;
+    uint16_t *src = (uint16_t *)iter->bits;
+
+    iter->bits += iter->stride;
+
+    while (w && ((unsigned long)dst) & 0x0f)
+    {
+	uint16_t s = *src++;
+
+	*dst++ = CONVERT_0565_TO_8888 (s);
+	w--;
+    }
+
+    while (w >= 4)
+    {
+	__m64 vsrc = ldq_u ((__m64 *)src);
+
+	__m64 mm0 = expand565 (vsrc, 0);
+	__m64 mm1 = expand565 (vsrc, 1);
+	__m64 mm2 = expand565 (vsrc, 2);
+	__m64 mm3 = expand565 (vsrc, 3);
+
+	*(__m64 *)(dst + 0) = _mm_or_si64 (pack8888 (mm0, mm1), MC (ff000000));
+	*(__m64 *)(dst + 2) = _mm_or_si64 (pack8888 (mm2, mm3), MC (ff000000));
+
+	dst += 4;
+	src += 4;
+	w -= 4;
+    }
+
+    while (w)
+    {
+	uint16_t s = *src++;
+
+	*dst++ = CONVERT_0565_TO_8888 (s);
+	w--;
+    }
+
+    return iter->buffer;
+}
+
+typedef struct
+{
+    pixman_format_code_t	format;
+    pixman_iter_get_scanline_t	get_scanline;
+} fetcher_info_t;
+
+static const fetcher_info_t fetchers[] =
+{
+    { PIXMAN_r5g6b5,		mmx_fetch_r5g6b5 },
+    { PIXMAN_null }
+};
+
+static void
+mmx_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
+{
+    pixman_image_t *image = iter->image;
+    int x = iter->x;
+    int y = iter->y;
+    int width = iter->width;
+    int height = iter->height;
+
+#define FLAGS								\
+    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | FAST_PATH_BITS_IMAGE)
+
+    if ((iter->flags & ITER_NARROW)				&&
+	(image->common.flags & FLAGS) == FLAGS			&&
+	x >= 0 && y >= 0					&&
+	x + width <= image->bits.width				&&
+	y + height <= image->bits.height)
+    {
+	const fetcher_info_t *f;
+
+	for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
+	{
+	    if (image->common.extended_format_code == f->format)
+	    {
+		uint8_t *b = (uint8_t *)image->bits.bits;
+		int s = image->bits.rowstride * 4;
+
+		iter->bits = b + s * iter->y + x * PIXMAN_FORMAT_BPP (f->format) / 8;
+		iter->stride = s;
+
+		iter->get_scanline = f->get_scanline;
+		return;
+	    }
+	}
+    }
+
+    imp->delegate->src_iter_init (imp->delegate, iter);
+}
+
 static const pixman_fast_path_t mmx_fast_paths[] =
 {
     PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       r5g6b5,   mmx_composite_over_n_8_0565       ),
@@ -3347,6 +3445,8 @@ _pixman_implementation_create_mmx (pixman_implementation_t *fallback)
     imp->blt = mmx_blt;
     imp->fill = mmx_fill;
 
+    imp->src_iter_init = mmx_src_iter_init;
+
     return imp;
 }
 
commit c136e535adf33069cbf229b8773934d78099af85
Author: Matt Turner <mattst88 at gmail.com>
Date:   Tue Apr 17 12:16:55 2012 -0400

    mmx: Use Loongson pextrh instruction in expand565
    
    Same story as pinsrh in the previous commit.
    
     text	data	bss	dec	hex filename
    25336	1952	  0   27288    6a98 .libs/libpixman_loongson_mmi_la-pixman-mmx.o
    25072	1952	  0   27024    6990 .libs/libpixman_loongson_mmi_la-pixman-mmx.o
    
    -dsll: 95
    +dsll: 70
    -dsrl: 135
    +dsrl: 105
    -ldc1: 462
    +ldc1: 445
    -lw: 721
    +lw: 700
    +pextrh: 30

diff --git a/pixman/loongson-mmintrin.h b/pixman/loongson-mmintrin.h
index 1c74ed8..508366c 100644
--- a/pixman/loongson-mmintrin.h
+++ b/pixman/loongson-mmintrin.h
@@ -218,6 +218,17 @@ _mm_xor_si64 (__m64 __m1, __m64 __m2)
 }
 
 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+loongson_extract_pi16 (__m64 __m, int64_t __pos)
+{
+	__m64 ret;
+	asm("pextrh %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m), "f" (*(__m64 *)&__pos)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 loongson_insert_pi16 (__m64 __m1, __m64 __m2, int64_t __pos)
 {
 	__m64 ret;
diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index 7aa4019..d5d34ac 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -493,7 +493,11 @@ expand565 (__m64 pixel, int pos)
     __m64 t1, t2;
 
     /* move pixel to low 16 bit and zero the rest */
+#ifdef USE_LOONGSON_MMI
+    p = loongson_extract_pi16 (p, pos);
+#else
     p = shift (shift (p, (3 - pos) * 16), -48);
+#endif
 
     t1 = shift (p, 36 - 11);
     t2 = shift (p, 16 - 5);
commit facceb4a1fbba476ad98e76d15868bf7eecd3a30
Author: Matt Turner <mattst88 at gmail.com>
Date:   Tue Apr 17 11:28:33 2012 -0400

    mmx: Use Loongson pinsrh instruction in pack_565
    
    The pinsrh instruction is analogous to MMX EXT's pinsrw, except like
    other Loongson vector instructions it cannot access the general purpose
    registers. In the cases of other Loongson vector instructions, this is a
    headache, but it is actually a good thing here. Since the instruction is
    different from MMX, I've named the intrinsic loongson_insert_pi16.
    
     text	data	bss	dec	 hex filename
    25976	1952	  0   27928	6d18 .libs/libpixman_loongson_mmi_la-pixman-mmx.o
    25336	1952	  0   27288	6a98 .libs/libpixman_loongson_mmi_la-pixman-mmx.o
    
    -and: 181
    +and: 147
    -dsll: 143
    +dsll: 95
    -dsrl: 87
    +dsrl: 135
    -ldc1: 523
    +ldc1: 462
    -lw: 767
    +lw: 721
    +pinsrh: 35

diff --git a/pixman/loongson-mmintrin.h b/pixman/loongson-mmintrin.h
index 44d30f5..1c74ed8 100644
--- a/pixman/loongson-mmintrin.h
+++ b/pixman/loongson-mmintrin.h
@@ -216,3 +216,14 @@ _mm_xor_si64 (__m64 __m1, __m64 __m2)
 	);
 	return ret;
 }
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+loongson_insert_pi16 (__m64 __m1, __m64 __m2, int64_t __pos)
+{
+	__m64 ret;
+	asm("pinsrh_%3 %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2), "i" (__pos)
+	);
+	return ret;
+}
diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index f40fa71..7aa4019 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -182,10 +182,12 @@ typedef struct
     mmxdatafield mmx_565_r;
     mmxdatafield mmx_565_g;
     mmxdatafield mmx_565_b;
+#ifndef USE_LOONGSON_MMI
     mmxdatafield mmx_mask_0;
     mmxdatafield mmx_mask_1;
     mmxdatafield mmx_mask_2;
     mmxdatafield mmx_mask_3;
+#endif
     mmxdatafield mmx_full_alpha;
     mmxdatafield mmx_4x0101;
 } mmx_data_t;
@@ -207,10 +209,12 @@ static const mmx_data_t c =
     MMXDATA_INIT (.mmx_565_r,                    0x000000f800000000),
     MMXDATA_INIT (.mmx_565_g,                    0x0000000000fc0000),
     MMXDATA_INIT (.mmx_565_b,                    0x00000000000000f8),
+#ifndef USE_LOONGSON_MMI
     MMXDATA_INIT (.mmx_mask_0,                   0xffffffffffff0000),
     MMXDATA_INIT (.mmx_mask_1,                   0xffffffff0000ffff),
     MMXDATA_INIT (.mmx_mask_2,                   0xffff0000ffffffff),
     MMXDATA_INIT (.mmx_mask_3,                   0x0000ffffffffffff),
+#endif
     MMXDATA_INIT (.mmx_full_alpha,               0x00ff000000000000),
     MMXDATA_INIT (.mmx_4x0101,                   0x0101010101010101),
 };
@@ -528,6 +532,15 @@ pack_565 (__m64 pixel, __m64 target, int pos)
     g = _mm_and_si64 (p, MC (565_g));
     b = _mm_and_si64 (p, MC (565_b));
 
+#ifdef USE_LOONGSON_MMI
+    r = shift (r, -(32 - 8));
+    g = shift (g, -(16 - 3));
+    b = shift (b, -(0  + 3));
+
+    p = _mm_or_si64 (r, g);
+    p = _mm_or_si64 (p, b);
+    return loongson_insert_pi16 (t, p, pos);
+#else
     r = shift (r, -(32 - 8) + pos * 16);
     g = shift (g, -(16 - 3) + pos * 16);
     b = shift (b, -(0  + 3) + pos * 16);
@@ -545,6 +558,7 @@ pack_565 (__m64 pixel, __m64 target, int pos)
     p = _mm_or_si64 (g, p);
 
     return _mm_or_si64 (b, p);
+#endif
 }
 
 #ifndef _MSC_VER
commit 6d29b7d7557ccb657054e867f4e27f4aa89cb25e
Author: Matt Turner <mattst88 at gmail.com>
Date:   Fri Feb 24 15:23:09 2012 -0500

    mmx: don't pack and unpack src unnecessarily
    
    The combine function was store8888'ing the result, and all consumers
    were immediately load8888'ing it, causing lots of unnecessary pack and
    unpack instructions.
    
    It's a very straight forward conversion, except for mmx_combine_over_u
    and mmx_combine_saturate_u. mmx_combine_over_u was testing the integer
    result to skip pixels, so we use the is_* functions to test the __m64
    data directly without loading it into an integer register.
    
    For mmx_combine_saturate_u there's not a lot we can do, since it uses
    DIV_UN8.

diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index 1290fc4..f40fa71 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -569,23 +569,20 @@ pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b)
 
 /* --------------- MMX code patch for fbcompose.c --------------------- */
 
-static force_inline uint32_t
+static force_inline __m64
 combine (const uint32_t *src, const uint32_t *mask)
 {
-    uint32_t ssrc = *src;
+    __m64 vsrc = load8888 (src);
 
     if (mask)
     {
 	__m64 m = load8888 (mask);
-	__m64 s = load8888 (&ssrc);
 
 	m = expand_alpha (m);
-	s = pix_multiply (s, m);
-
-	store8888 (&ssrc, s);
+	vsrc = pix_multiply (vsrc, m);
     }
 
-    return ssrc;
+    return vsrc;
 }
 
 static void
@@ -600,19 +597,16 @@ mmx_combine_over_u (pixman_implementation_t *imp,
 
     while (dest < end)
     {
-	uint32_t ssrc = combine (src, mask);
-	uint32_t a = ssrc >> 24;
+	__m64 vsrc = combine (src, mask);
 
-	if (a == 0xff)
+	if (is_opaque (vsrc))
 	{
-	    *dest = ssrc;
+	    store8888 (dest, vsrc);
 	}
-	else if (ssrc)
+	else if (!is_zero (vsrc))
 	{
-	    __m64 s, sa;
-	    s = load8888 (&ssrc);
-	    sa = expand_alpha (s);
-	    store8888 (dest, over (s, sa, load8888 (dest)));
+	    __m64 sa = expand_alpha (vsrc);
+	    store8888 (dest, over (vsrc, sa, load8888 (dest)));
 	}
 
 	++dest;
@@ -636,11 +630,11 @@ mmx_combine_over_reverse_u (pixman_implementation_t *imp,
     while (dest < end)
     {
 	__m64 d, da;
-	uint32_t s = combine (src, mask);
+	__m64 s = combine (src, mask);
 
 	d = load8888 (dest);
 	da = expand_alpha (d);
-	store8888 (dest, over (d, da, load8888 (&s)));
+	store8888 (dest, over (d, da, s));
 
 	++dest;
 	++src;
@@ -662,10 +656,9 @@ mmx_combine_in_u (pixman_implementation_t *imp,
 
     while (dest < end)
     {
-	__m64 x, a;
-	uint32_t ssrc = combine (src, mask);
+	__m64 a;
+	__m64 x = combine (src, mask);
 
-	x = load8888 (&ssrc);
 	a = load8888 (dest);
 	a = expand_alpha (a);
 	x = pix_multiply (x, a);
@@ -692,11 +685,10 @@ mmx_combine_in_reverse_u (pixman_implementation_t *imp,
 
     while (dest < end)
     {
-	__m64 x, a;
-	uint32_t ssrc = combine (src, mask);
+	__m64 a = combine (src, mask);
+	__m64 x;
 
 	x = load8888 (dest);
-	a = load8888 (&ssrc);
 	a = expand_alpha (a);
 	x = pix_multiply (x, a);
 	store8888 (dest, x);
@@ -721,10 +713,9 @@ mmx_combine_out_u (pixman_implementation_t *imp,
 
     while (dest < end)
     {
-	__m64 x, a;
-	uint32_t ssrc = combine (src, mask);
+	__m64 a;
+	__m64 x = combine (src, mask);
 
-	x = load8888 (&ssrc);
 	a = load8888 (dest);
 	a = expand_alpha (a);
 	a = negate (a);
@@ -751,11 +742,10 @@ mmx_combine_out_reverse_u (pixman_implementation_t *imp,
 
     while (dest < end)
     {
-	__m64 x, a;
-	uint32_t ssrc = combine (src, mask);
+	__m64 a = combine (src, mask);
+	__m64 x;
 
 	x = load8888 (dest);
-	a = load8888 (&ssrc);
 	a = expand_alpha (a);
 	a = negate (a);
 	x = pix_multiply (x, a);
@@ -782,10 +772,9 @@ mmx_combine_atop_u (pixman_implementation_t *imp,
 
     while (dest < end)
     {
-	__m64 s, da, d, sia;
-	uint32_t ssrc = combine (src, mask);
+	__m64 da, d, sia;
+	__m64 s = combine (src, mask);
 
-	s = load8888 (&ssrc);
 	d = load8888 (dest);
 	sia = expand_alpha (s);
 	sia = negate (sia);
@@ -815,10 +804,9 @@ mmx_combine_atop_reverse_u (pixman_implementation_t *imp,
 
     while (dest < end)
     {
-	__m64 s, dia, d, sa;
-	uint32_t ssrc = combine (src, mask);
+	__m64 dia, d, sa;
+	__m64 s = combine (src, mask);
 
-	s = load8888 (&ssrc);
 	d = load8888 (dest);
 	sa = expand_alpha (s);
 	dia = expand_alpha (d);
@@ -846,10 +834,9 @@ mmx_combine_xor_u (pixman_implementation_t *imp,
 
     while (dest < end)
     {
-	__m64 s, dia, d, sia;
-	uint32_t ssrc = combine (src, mask);
+	__m64 dia, d, sia;
+	__m64 s = combine (src, mask);
 
-	s = load8888 (&ssrc);
 	d = load8888 (dest);
 	sia = expand_alpha (s);
 	dia = expand_alpha (d);
@@ -878,10 +865,9 @@ mmx_combine_add_u (pixman_implementation_t *imp,
 
     while (dest < end)
     {
-	__m64 s, d;
-	uint32_t ssrc = combine (src, mask);
+	__m64 d;
+	__m64 s = combine (src, mask);
 
-	s = load8888 (&ssrc);
 	d = load8888 (dest);
 	s = pix_add (s, d);
 	store8888 (dest, s);
@@ -906,12 +892,14 @@ mmx_combine_saturate_u (pixman_implementation_t *imp,
 
     while (dest < end)
     {
-	uint32_t s = combine (src, mask);
+	uint32_t s, sa, da;
 	uint32_t d = *dest;
-	__m64 ms = load8888 (&s);
-	__m64 md = load8888 (&d);
-	uint32_t sa = s >> 24;
-	uint32_t da = ~d >> 24;
+	__m64 ms = combine (src, mask);
+	__m64 md = load8888 (dest);
+
+	store8888(&s, ms);
+	da = ~d >> 24;
+	sa = s >> 24;
 
 	if (sa > da)
 	{
commit ee750034252fb8f44c871e84a5057bc114699ae7
Author: Matt Turner <mattst88 at gmail.com>
Date:   Fri Feb 24 17:39:39 2012 -0500

    mmx: introduce is_equal, is_opaque, and is_zero functions
    
    To be used by the next commit.

diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index eedef8e..1290fc4 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -67,6 +67,19 @@ _mm_empty (void)
 /* We have to compile with -msse to use xmmintrin.h, but that causes SSE
  * instructions to be generated that we don't want. Just duplicate the
  * functions we want to use.  */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movemask_pi8 (__m64 __A)
+{
+    int ret;
+
+    asm ("pmovmskb %1, %0\n\t"
+	: "=r" (ret)
+	: "y" (__A)
+    );
+
+    return ret;
+}
+
 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_mulhi_pu16 (__m64 __A, __m64 __B)
 {
@@ -427,6 +440,34 @@ store8888 (uint32_t *dest, __m64 v)
     store (dest, v);
 }
 
+static force_inline pixman_bool_t
+is_equal (__m64 a, __m64 b)
+{
+#ifdef USE_LOONGSON_MMI
+    /* __m64 is double, we can compare directly. */
+    return a == b;
+#else
+    return _mm_movemask_pi8 (_mm_cmpeq_pi8 (a, b)) == 0xff;
+#endif
+}
+
+static force_inline pixman_bool_t
+is_opaque (__m64 v)
+{
+#ifdef USE_LOONGSON_MMI
+    return is_equal (_mm_and_si64 (v, MC (full_alpha)), MC (full_alpha));
+#else
+    __m64 ffs = _mm_cmpeq_pi8 (v, v);
+    return (_mm_movemask_pi8 (_mm_cmpeq_pi8 (v, ffs)) & 0x40);
+#endif
+}
+
+static force_inline pixman_bool_t
+is_zero (__m64 v)
+{
+    return is_equal (v, _mm_setzero_si64 ());
+}
+
 /* Expand 16 bits positioned at @pos (0-3) of a mmx register into
  *
  *    00RR00GG00BB
commit 10c77b339f40fc027b682ef16edec234508d327b
Author: Matt Turner <mattst88 at gmail.com>
Date:   Thu Feb 23 16:25:11 2012 -0500

    mmx: simplify srcsrcsrcsrc calculation in over_n_8_0565

diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index 70a035c..eedef8e 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -2169,7 +2169,7 @@ mmx_composite_over_n_8_0565 (pixman_implementation_t *imp,
     int dst_stride, mask_stride;
     int32_t w;
     __m64 vsrc, vsrca, tmp;
-    uint64_t srcsrcsrcsrc, src16;
+    __m64 srcsrcsrcsrc;
 
     CHECKPOINT ();
 
@@ -2186,11 +2186,7 @@ mmx_composite_over_n_8_0565 (pixman_implementation_t *imp,
     vsrca = expand_alpha (vsrc);
 
     tmp = pack_565 (vsrc, _mm_setzero_si64 (), 0);
-    src16 = to_uint64 (tmp);
-
-    srcsrcsrcsrc =
-	(uint64_t)src16 << 48 | (uint64_t)src16 << 32 |
-	(uint64_t)src16 << 16 | (uint64_t)src16;
+    srcsrcsrcsrc = expand_alpha_rev (tmp);
 
     while (height--)
     {
@@ -2234,7 +2230,7 @@ mmx_composite_over_n_8_0565 (pixman_implementation_t *imp,
 
 	    if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff)
 	    {
-		*(uint64_t *)dst = srcsrcsrcsrc;
+		*(__m64 *)dst = srcsrcsrcsrc;
 	    }
 	    else if (m0 | m1 | m2 | m3)
 	    {
commit e06947d1010ffec4903493df4979119b0ac080d3
Author: Matt Turner <mattst88 at gmail.com>
Date:   Thu Feb 23 16:15:56 2012 -0500

    mmx: remove unnecessary uint64_t<->__m64 conversions
    
    Loongson:
    add_8888_8888 =  L1:  68.73  L2:  55.09  M: 25.39 ( 68.18%)  HT: 25.28 VT: 22.42  R: 20.74  RT: 13.26 ( 131Kops/s)
    add_8888_8888 =  L1: 159.19  L2: 114.10  M: 30.74 ( 77.91%)  HT: 27.63 VT: 24.99  R: 24.61  RT: 14.49 ( 141Kops/s)

diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index 86307b0..70a035c 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -2883,7 +2883,6 @@ mmx_composite_add_8888_8888 (pixman_implementation_t *imp,
                              pixman_composite_info_t *info)
 {
     PIXMAN_COMPOSITE_ARGS (info);
-    __m64 dst64;
     uint32_t    *dst_line, *dst;
     uint32_t    *src_line, *src;
     int dst_stride, src_stride;
@@ -2913,8 +2912,7 @@ mmx_composite_add_8888_8888 (pixman_implementation_t *imp,
 
 	while (w >= 2)
 	{
-	    dst64 = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst);
-	    *(uint64_t*)dst = to_uint64 (dst64);
+	    *(__m64 *)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst);
 	    dst += 2;
 	    src += 2;
 	    w -= 2;
commit c78e986085b3993f1b4355151820228c53d54cad
Author: Matt Turner <mattst88 at gmail.com>
Date:   Fri Feb 24 12:43:43 2012 -0500

    mmx: compile on MIPS for Loongson MMI optimizations
    
                                 image               image16
               evolution   32.985 ->  29.667    27.314 ->  23.870
    firefox-planet-gnome  197.982 -> 180.437   220.986 -> 205.057
    gnome-system-monitor   48.482 ->  49.752    52.820 ->  49.528
      gnome-terminal-vim   60.799 ->  50.528    51.655 ->  44.131
          grads-heat-map    3.167 ->   3.181     3.328 ->   3.321
                    gvim   38.646 ->  32.552    38.126 ->  34.453
           midori-zoomed   44.371 ->  43.338    28.860 ->  28.865
               ocitysmap   23.065 ->  18.057    23.046 ->  18.055
                 poppler   43.676 ->  36.077    43.065 ->  36.090
      swfdec-giant-steps   20.166 ->  20.365    22.354 ->  16.578
          swfdec-youtube   31.502 ->  28.118    44.052 ->  41.771
       xfce4-terminal-a1   69.517 ->  51.288    62.225 ->  53.309

diff --git a/configure.ac b/configure.ac
index b8d59f6..5478734 100644
--- a/configure.ac
+++ b/configure.ac
@@ -271,6 +271,59 @@ PIXMAN_CHECK_CFLAG([-xldscope=hidden], [dnl
 ])
 
 dnl ===========================================================================
+dnl Check for Loongson Multimedia Instructions
+
+if test "x$LS_CFLAGS" = "x" ; then
+    LS_CFLAGS="-march=loongson2f"
+fi
+
+have_loongson_mmi=no
+AC_MSG_CHECKING(whether to use Loongson MMI)
+
+xserver_save_CFLAGS=$CFLAGS
+CFLAGS=" $CFLAGS $LS_CFLAGS"
+AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
+#ifndef __mips_loongson_vector_rev
+#error "Loongson Multimedia Instructions are only available on Loongson"
+#endif
+#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 4))
+#error "Need GCC >= 4.4 for Loongson MMI compilation"
+#endif
+#include "pixman/loongson-mmintrin.h"
+int main () {
+    union {
+        __m64 v;
+        char c[8];
+    } a = { .c = {1, 2, 3, 4, 5, 6, 7, 8} };
+    int b = 4;
+    __m64 c = _mm_srli_pi16 (a.v, b);
+    return 0;
+}]])], have_loongson_mmi=yes)
+CFLAGS=$xserver_save_CFLAGS
+
+AC_ARG_ENABLE(loongson,
+   [AC_HELP_STRING([--disable-loongson],
+                   [disable Loongson fast paths])],
+   [enable_loongson=$enableval], [enable_loongson=auto])
+
+if test $enable_loongson = no ; then
+   have_loongson_mmi=disabled
+fi
+
+if test $have_loongson_mmi = yes ; then
+   AC_DEFINE(USE_LOONGSON_MMI, 1, [use Loongson Multimedia Instructions])
+else
+   LS_CFLAGS=
+fi
+
+AC_MSG_RESULT($have_loongson_mmi)
+if test $enable_loongson = yes && test $have_loongson_mmi = no ; then
+   AC_MSG_ERROR([Loongson MMI not detected])
+fi
+
+AM_CONDITIONAL(USE_LOONGSON_MMI, test $have_loongson_mmi = yes)
+
+dnl ===========================================================================
 dnl Check for MMX
 
 if test "x$MMX_CFLAGS" = "x" ; then
@@ -416,6 +469,7 @@ case $host_os in
       ;;
 esac
 
+AC_SUBST(LS_CFLAGS)
 AC_SUBST(IWMMXT_CFLAGS)
 AC_SUBST(MMX_CFLAGS)
 AC_SUBST(MMX_LDFLAGS)
diff --git a/pixman/Makefile.am b/pixman/Makefile.am
index fb7e047..b320a58 100644
--- a/pixman/Makefile.am
+++ b/pixman/Makefile.am
@@ -118,5 +118,17 @@ libpixman_1_la_LIBADD += libpixman-mips-dspr2.la
 ASM_CFLAGS_mips_dspr2=
 endif
 
+# loongson code
+if USE_LOONGSON_MMI
+noinst_LTLIBRARIES += libpixman-loongson-mmi.la
+libpixman_loongson_mmi_la_SOURCES = pixman-mmx.c loongson-mmintrin.h
+libpixman_loongson_mmi_la_CFLAGS = $(DEP_CFLAGS) $(LS_CFLAGS)
+libpixman_loongson_mmi_la_LIBADD = $(DEP_LIBS)
+libpixman_1_la_LDFLAGS += $(LS_LDFLAGS)
+libpixman_1_la_LIBADD += libpixman-loongson-mmi.la
+
+ASM_CFLAGS_ls=$(LS_CFLAGS)
+endif
+
 .c.s : $(libpixmaninclude_HEADERS) $(BUILT_SOURCES)
 	$(CC) $(CFLAGS) $(ASM_CFLAGS_$(@:pixman-%.s=%)) $(ASM_CFLAGS_$(@:pixman-arm-%.s=arm_%)) -DHAVE_CONFIG_H -I$(srcdir) -I$(builddir) -I$(top_builddir) -S -o $@ $<
diff --git a/pixman/loongson-mmintrin.h b/pixman/loongson-mmintrin.h
new file mode 100644
index 0000000..44d30f5
--- /dev/null
+++ b/pixman/loongson-mmintrin.h
@@ -0,0 +1,218 @@
+/* The gcc-provided loongson intrinsic functions are way too fucking broken
+ * to be of any use, otherwise I'd use them.
+ *
+ * - The hardware instructions are very similar to MMX or iwMMXt. Certainly
+ *   close enough that they could have implemented the _mm_*-style intrinsic
+ *   interface and had a ton of optimized code available to them. Instead they
+ *   implemented something much, much worse.
+ *
+ * - pshuf takes a dead first argument, causing extra instructions to be
+ *   generated.
+ *
+ * - There are no 64-bit shift or logical intrinsics, which means you have
+ *   to implement them with inline assembly, but this is a nightmare because
+ *   gcc doesn't understand that the integer vector datatypes are actually in
+ *   floating-point registers, so you end up with braindead code like
+ *
+ *	punpcklwd	$f9,$f9,$f5
+ *	    dmtc1	v0,$f8
+ *	punpcklwd	$f19,$f19,$f5
+ *	    dmfc1	t9,$f9
+ *	    dmtc1	v0,$f9
+ *	    dmtc1	t9,$f20
+ *	    dmfc1	s0,$f19
+ *	punpcklbh	$f20,$f20,$f2
+ *
+ *   where crap just gets copied back and forth between integer and floating-
+ *   point registers ad nauseum.
+ *
+ * Instead of trying to workaround the problems from these crap intrinsics, I
+ * just implement the _mm_* intrinsics needed for pixman-mmx.c using inline
+ * assembly.
+ */
+
+#include <stdint.h>
+
+/* vectors are stored in 64-bit floating-point registers */
+typedef double __m64;
+/* having a 32-bit datatype allows us to use 32-bit loads in places like load8888 */
+typedef float  __m32;
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setzero_si64 (void)
+{
+	return 0.0;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_adds_pu16 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("paddush %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_adds_pu8 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("paddusb %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_and_si64 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("and %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_empty (void)
+{
+
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mulhi_pu16 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("pmulhuh %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mullo_pi16 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("pmullh %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_or_si64 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("or %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_packs_pu16 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("packushb %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shuffle_pi16 (__m64 __m, int64_t __n)
+{
+	__m64 ret;
+	asm("pshufh %0, %1, %2\n\t"
+	    : "=f" (ret)
+	    : "f" (__m), "f" (*(__m64 *)&__n)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_slli_si64 (__m64 __m, int64_t __count)
+{
+	__m64 ret;
+	asm("dsll  %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m), "f" (*(__m64 *)&__count)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srli_pi16 (__m64 __m, int64_t __count)
+{
+	__m64 ret;
+	asm("psrlh %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m), "f" (*(__m64 *)&__count)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srli_si64 (__m64 __m, int64_t __count)
+{
+	__m64 ret;
+	asm("dsrl  %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m), "f" (*(__m64 *)&__count)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("punpckhbh %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("punpcklbh %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+/* Since punpcklbh doesn't care about the high 32-bits, we use the __m32 datatype which
+ * allows load8888 to use 32-bit loads */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpacklo_pi8_f (__m32 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("punpcklbh %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_xor_si64 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("xor %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
diff --git a/pixman/pixman-cpu.c b/pixman/pixman-cpu.c
index 1060f47..aa9036f 100644
--- a/pixman/pixman-cpu.c
+++ b/pixman/pixman-cpu.c
@@ -427,22 +427,19 @@ pixman_have_arm_iwmmxt (void)
 
 #endif /* USE_ARM_SIMD || USE_ARM_NEON || USE_ARM_IWMMXT */
 
-#if defined(USE_MIPS_DSPR2)
+#if defined(USE_MIPS_DSPR2) || defined(USE_LOONGSON_MMI)
 
 #if defined (__linux__) /* linux ELF */
 
-pixman_bool_t
-pixman_have_mips_dspr2 (void)
+static pixman_bool_t
+pixman_have_mips_feature (const char *search_string)
 {
-    const char *search_string = "MIPS 74K";
     const char *file_name = "/proc/cpuinfo";
-    /* Simple detection of MIPS DSP ASE (revision 2) at runtime for Linux.
+    /* Simple detection of MIPS features at runtime for Linux.
      * It is based on /proc/cpuinfo, which reveals hardware configuration
      * to user-space applications.  According to MIPS (early 2010), no similar
      * facility is universally available on the MIPS architectures, so it's up
      * to individual OSes to provide such.
-     *
-     * Only currently available MIPS core that supports DSPr2 is 74K.
      */
 
     char cpuinfo_line[256];
@@ -467,13 +464,32 @@ pixman_have_mips_dspr2 (void)
     return FALSE;
 }
 
+#if defined(USE_MIPS_DSPR2)
+pixman_bool_t
+pixman_have_mips_dspr2 (void)
+{
+     /* Only currently available MIPS core that supports DSPr2 is 74K. */
+    return pixman_have_mips_feature ("MIPS 74K");
+}
+#endif
+
+#if defined(USE_LOONGSON_MMI)
+pixman_bool_t
+pixman_have_loongson_mmi (void)
+{
+    /* I really don't know if some Loongson CPUs don't have MMI. */
+    return pixman_have_mips_feature ("Loongson");
+}
+#endif
+
 #else /* linux ELF */
 
 #define pixman_have_mips_dspr2() FALSE
+#define pixman_have_loongson_mmi() FALSE
 
 #endif /* linux ELF */
 
-#endif /* USE_MIPS_DSPR2 */
+#endif /* USE_MIPS_DSPR2 || USE_LOONGSON_MMI */
 
 #if defined(USE_X86_MMX) || defined(USE_SSE2)
 /* The CPU detection code needs to be in a file not compiled with
@@ -773,7 +789,10 @@ _pixman_choose_implementation (void)
     if (!disabled ("arm-iwmmxt") && pixman_have_arm_iwmmxt ())
 	imp = _pixman_implementation_create_mmx (imp);
 #endif
-
+#ifdef USE_LOONGSON_MMI
+    if (!disabled ("loongson-mmi") && pixman_have_loongson_mmi ())
+	imp = _pixman_implementation_create_mmx (imp);
+#endif
 #ifdef USE_ARM_NEON
     if (!disabled ("arm-neon") && pixman_have_arm_neon ())
 	imp = _pixman_implementation_create_arm_neon (imp);
diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index fe3d42d..86307b0 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -33,9 +33,13 @@
 #include <config.h>
 #endif
 
-#if defined USE_X86_MMX || defined USE_ARM_IWMMXT
+#if defined USE_X86_MMX || defined USE_ARM_IWMMXT || defined USE_LOONGSON_MMI
 
+#ifdef USE_LOONGSON_MMI
+#include <loongson-mmintrin.h>
+#else
 #include <mmintrin.h>
+#endif
 #include "pixman-private.h"
 #include "pixman-combine32.h"
 
@@ -125,11 +129,14 @@ _mm_shuffle_pi16 (__m64 __A, int8_t const __N)
  * uint64_t and __m64 values, then define USE_CVT_INTRINSICS.
  * If __m64 and uint64_t values can just be cast to each other directly,
  * then define USE_M64_CASTS.
+ * If __m64 is a double datatype, then define USE_M64_DOUBLE.
  */
 #ifdef _MSC_VER
 # define M64_MEMBER m64_u64
 #elif defined(__ICC)
 # define USE_CVT_INTRINSICS
+#elif defined(USE_LOONGSON_MMI)
+# define USE_M64_DOUBLE
 #elif defined(__GNUC__)
 # define USE_M64_CASTS
 #elif defined(__SUNPRO_C)
@@ -147,7 +154,7 @@ _mm_shuffle_pi16 (__m64 __A, int8_t const __N)
 # endif
 #endif
 
-#if defined(USE_M64_CASTS) || defined(USE_CVT_INTRINSICS)
+#if defined(USE_M64_CASTS) || defined(USE_CVT_INTRINSICS) || defined(USE_M64_DOUBLE)
 typedef uint64_t mmxdatafield;
 #else
 typedef __m64 mmxdatafield;
@@ -199,6 +206,8 @@ static const mmx_data_t c =
 #    define MC(x) to_m64 (c.mmx_ ## x)
 #elif defined(USE_M64_CASTS)
 #    define MC(x) ((__m64)c.mmx_ ## x)
+#elif defined(USE_M64_DOUBLE)
+#    define MC(x) (*(__m64 *)&c.mmx_ ## x)
 #else
 #    define MC(x) c.mmx_ ## x
 #endif
@@ -213,6 +222,8 @@ to_m64 (uint64_t x)
 
     res.M64_MEMBER = x;
     return res;
+#elif defined USE_M64_DOUBLE
+    return *(__m64 *)&x;
 #else /* USE_M64_CASTS */
     return (__m64)x;
 #endif
@@ -226,6 +237,8 @@ to_uint64 (__m64 x)
 #elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
     uint64_t res = x.M64_MEMBER;
     return res;
+#elif defined USE_M64_DOUBLE
+    return *(uint64_t *)&x;
 #else /* USE_M64_CASTS */
     return (uint64_t)x;
 #endif
@@ -358,13 +371,26 @@ static force_inline uint32_t ldl_u(const uint32_t *p)
 static force_inline __m64
 load (const uint32_t *v)
 {
+#ifdef USE_LOONGSON_MMI
+    __m64 ret;
+    asm ("lwc1 %0, %1\n\t"
+	: "=f" (ret)
+	: "m" (*v)
+    );
+    return ret;
+#else
     return _mm_cvtsi32_si64 (*v);
+#endif
 }
 
 static force_inline __m64
 load8888 (const uint32_t *v)
 {
+#ifdef USE_LOONGSON_MMI
+    return _mm_unpacklo_pi8_f (*(__m32 *)v, _mm_setzero_si64 ());
+#else
     return _mm_unpacklo_pi8 (load (v), _mm_setzero_si64 ());
+#endif
 }
 
 static force_inline __m64
@@ -383,7 +409,15 @@ pack8888 (__m64 lo, __m64 hi)
 static force_inline void
 store (uint32_t *dest, __m64 v)
 {
+#ifdef USE_LOONGSON_MMI
+    asm ("swc1 %1, %0\n\t"
+	: "=m" (*dest)
+	: "f" (v)
+	: "memory"
+    );
+#else
     *dest = _mm_cvtsi64_si32 (v);
+#endif
 }
 
 static force_inline void
@@ -3275,4 +3309,4 @@ _pixman_implementation_create_mmx (pixman_implementation_t *fallback)
     return imp;
 }
 
-#endif /* USE_X86_MMX || USE_ARM_IWMMXT */
+#endif /* USE_X86_MMX || USE_ARM_IWMMXT || USE_LOONGSON_MMI */
diff --git a/pixman/pixman-private.h b/pixman/pixman-private.h
index 0cba2e9..f456bbd 100644
--- a/pixman/pixman-private.h
+++ b/pixman/pixman-private.h
@@ -539,7 +539,7 @@ _pixman_implementation_create_fast_path (pixman_implementation_t *fallback);
 pixman_implementation_t *
 _pixman_implementation_create_noop (pixman_implementation_t *fallback);
 
-#if defined USE_X86_MMX || defined USE_ARM_IWMMXT
+#if defined USE_X86_MMX || defined USE_ARM_IWMMXT || defined USE_LOONGSON_MMI
 pixman_implementation_t *
 _pixman_implementation_create_mmx (pixman_implementation_t *fallback);
 #endif
commit 4e0c7902b2c8e517d102a8fccb9cf7da9725f59f
Author: Matt Turner <mattst88 at gmail.com>
Date:   Wed Feb 15 01:19:07 2012 -0500

    mmx: make ldq_u take __m64* directly
    
    Before, if __m64 is allocated in vector or floating-point registers,
    
    	__m64 vs = ldq_u((uint64_t *)src);
    
    would cause src to be loaded into an integer register and then
    transferred to an __m64 register. By switching ldq_u's argument type to
    __m64 we give the compile enough information to recognize that it can
    load to the vector register directly.
    
    This patch is necessary for the Loongson optimizations when __m64 is
    typedef'd as double.

diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index f8dfca7..fe3d42d 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -324,7 +324,7 @@ in_over (__m64 src, __m64 srca, __m64 mask, __m64 dest)
 
 /* Elemental unaligned loads */
 
-static force_inline __m64 ldq_u(uint64_t *p)
+static force_inline __m64 ldq_u(__m64 *p)
 {
 #ifdef USE_X86_MMX
     /* x86's alignment restrictions are very relaxed. */
@@ -337,7 +337,7 @@ static force_inline __m64 ldq_u(uint64_t *p)
     aligned_p = (__m64 *)((uintptr_t)p & ~7);
     return (__m64) _mm_align_si64 (aligned_p[0], aligned_p[1], align);
 #else
-    struct __una_u64 { uint64_t x __attribute__((packed)); };
+    struct __una_u64 { __m64 x __attribute__((packed)); };
     const struct __una_u64 *ptr = (const struct __una_u64 *) p;
     return (__m64) ptr->x;
 #endif
@@ -370,8 +370,8 @@ load8888 (const uint32_t *v)
 static force_inline __m64
 load8888u (const uint32_t *v)
 {
-    uint32_t l = ldl_u(v);
-    return load8888(&l);
+    uint32_t l = ldl_u (v);
+    return load8888 (&l);
 }
 
 static force_inline __m64
@@ -389,7 +389,7 @@ store (uint32_t *dest, __m64 v)
 static force_inline void
 store8888 (uint32_t *dest, __m64 v)
 {
-    v = pack8888 (v, _mm_setzero_si64());
+    v = pack8888 (v, _mm_setzero_si64 ());
     store (dest, v);
 }
 
@@ -1452,7 +1452,7 @@ mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp,
 
 	while (w >= 2)
 	{
-	    __m64 vs = ldq_u((uint64_t *)src);
+	    __m64 vs = ldq_u ((__m64 *)src);
 	    __m64 vd = *(__m64 *)dst;
 	    __m64 vsrc0 = expand8888 (vs, 0);
 	    __m64 vsrc1 = expand8888 (vs, 1);
@@ -1534,14 +1534,14 @@ mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp,
 	    __m64 vd6 = *(__m64 *)(dst + 12);
 	    __m64 vd7 = *(__m64 *)(dst + 14);
 
-	    __m64 vs0 = ldq_u((uint64_t *)(src + 0));
-	    __m64 vs1 = ldq_u((uint64_t *)(src + 2));
-	    __m64 vs2 = ldq_u((uint64_t *)(src + 4));
-	    __m64 vs3 = ldq_u((uint64_t *)(src + 6));
-	    __m64 vs4 = ldq_u((uint64_t *)(src + 8));
-	    __m64 vs5 = ldq_u((uint64_t *)(src + 10));
-	    __m64 vs6 = ldq_u((uint64_t *)(src + 12));
-	    __m64 vs7 = ldq_u((uint64_t *)(src + 14));
+	    __m64 vs0 = ldq_u ((__m64 *)(src + 0));
+	    __m64 vs1 = ldq_u ((__m64 *)(src + 2));
+	    __m64 vs2 = ldq_u ((__m64 *)(src + 4));
+	    __m64 vs3 = ldq_u ((__m64 *)(src + 6));
+	    __m64 vs4 = ldq_u ((__m64 *)(src + 8));
+	    __m64 vs5 = ldq_u ((__m64 *)(src + 10));
+	    __m64 vs6 = ldq_u ((__m64 *)(src + 12));
+	    __m64 vs7 = ldq_u ((__m64 *)(src + 14));
 
 	    vd0 = pack8888 (
 	        in_over (expandx888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
@@ -2821,7 +2821,7 @@ mmx_composite_add_8_8 (pixman_implementation_t *imp,
 
 	while (w >= 8)
 	{
-	    *(__m64*)dst = _mm_adds_pu8 (ldq_u((uint64_t *)src), *(__m64*)dst);
+	    *(__m64*)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst);
 	    dst += 8;
 	    src += 8;
 	    w -= 8;
@@ -2879,7 +2879,7 @@ mmx_composite_add_8888_8888 (pixman_implementation_t *imp,
 
 	while (w >= 2)
 	{
-	    dst64 = _mm_adds_pu8 (ldq_u((uint64_t *)src), *(__m64*)dst);
+	    dst64 = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst);
 	    *(uint64_t*)dst = to_uint64 (dst64);
 	    dst += 2;
 	    src += 2;
@@ -2970,7 +2970,7 @@ pixman_blt_mmx (uint32_t *src_bits,
 
 	while (w >= 4 && ((unsigned long)d & 7))
 	{
-	    *(uint32_t *)d = ldl_u((uint32_t *)s);
+	    *(uint32_t *)d = ldl_u ((uint32_t *)s);
 
 	    w -= 4;
 	    s += 4;
@@ -3004,14 +3004,14 @@ pixman_blt_mmx (uint32_t *src_bits,
 		  "%mm0", "%mm1", "%mm2", "%mm3",
 		  "%mm4", "%mm5", "%mm6", "%mm7");
 #else
-	    __m64 v0 = ldq_u((uint64_t *)(s + 0));
-	    __m64 v1 = ldq_u((uint64_t *)(s + 8));
-	    __m64 v2 = ldq_u((uint64_t *)(s + 16));
-	    __m64 v3 = ldq_u((uint64_t *)(s + 24));
-	    __m64 v4 = ldq_u((uint64_t *)(s + 32));
-	    __m64 v5 = ldq_u((uint64_t *)(s + 40));
-	    __m64 v6 = ldq_u((uint64_t *)(s + 48));
-	    __m64 v7 = ldq_u((uint64_t *)(s + 56));
+	    __m64 v0 = ldq_u ((__m64 *)(s + 0));
+	    __m64 v1 = ldq_u ((__m64 *)(s + 8));
+	    __m64 v2 = ldq_u ((__m64 *)(s + 16));
+	    __m64 v3 = ldq_u ((__m64 *)(s + 24));
+	    __m64 v4 = ldq_u ((__m64 *)(s + 32));
+	    __m64 v5 = ldq_u ((__m64 *)(s + 40));
+	    __m64 v6 = ldq_u ((__m64 *)(s + 48));
+	    __m64 v7 = ldq_u ((__m64 *)(s + 56));
 	    *(__m64 *)(d + 0)  = v0;
 	    *(__m64 *)(d + 8)  = v1;
 	    *(__m64 *)(d + 16) = v2;
@@ -3028,7 +3028,7 @@ pixman_blt_mmx (uint32_t *src_bits,
 	}
 	while (w >= 4)
 	{
-	    *(uint32_t *)d = ldl_u((uint32_t *)s);
+	    *(uint32_t *)d = ldl_u ((uint32_t *)s);
 
 	    w -= 4;
 	    s += 4;
commit 2e54b76a2d2203b6a70190f488d76d6d409e879a
Author: Matt Turner <mattst88 at gmail.com>
Date:   Fri Feb 24 12:34:41 2012 -0500

    mmx: add load function and use it in add_8888_8888

diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index 36cf2cd..f8dfca7 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -356,9 +356,15 @@ static force_inline uint32_t ldl_u(const uint32_t *p)
 }
 
 static force_inline __m64
+load (const uint32_t *v)
+{
+    return _mm_cvtsi32_si64 (*v);
+}
+
+static force_inline __m64
 load8888 (const uint32_t *v)
 {
-    return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (*v), _mm_setzero_si64 ());
+    return _mm_unpacklo_pi8 (load (v), _mm_setzero_si64 ());
 }
 
 static force_inline __m64
@@ -2864,8 +2870,8 @@ mmx_composite_add_8888_8888 (pixman_implementation_t *imp,
 
 	while (w && (unsigned long)dst & 7)
 	{
-	    store (dst, _mm_adds_pu8 (_mm_cvtsi32_si64 (*src),
-	                              _mm_cvtsi32_si64 (*dst)));
+	    store (dst, _mm_adds_pu8 (load ((const uint32_t *)src),
+	                              load ((const uint32_t *)dst)));
 	    dst++;
 	    src++;
 	    w--;
@@ -2882,8 +2888,8 @@ mmx_composite_add_8888_8888 (pixman_implementation_t *imp,
 
 	if (w)
 	{
-	    store (dst, _mm_adds_pu8 (_mm_cvtsi32_si64 (*src),
-	                              _mm_cvtsi32_si64 (*dst)));
+	    store (dst, _mm_adds_pu8 (load ((const uint32_t *)src),
+	                              load ((const uint32_t *)dst)));
 
 	}
     }
commit 084e3f2f4be900041cc35830359606addc1fc3be
Author: Matt Turner <mattst88 at gmail.com>
Date:   Fri Feb 24 12:32:03 2012 -0500

    mmx: add store function and use it in add_8888_8888

diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index 9fd1a76..36cf2cd 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -375,10 +375,16 @@ pack8888 (__m64 lo, __m64 hi)
 }
 
 static force_inline void
+store (uint32_t *dest, __m64 v)
+{
+    *dest = _mm_cvtsi64_si32 (v);
+}
+
+static force_inline void
 store8888 (uint32_t *dest, __m64 v)
 {
     v = pack8888 (v, _mm_setzero_si64());
-    *dest = _mm_cvtsi64_si32 (v);
+    store (dest, v);
 }
 
 /* Expand 16 bits positioned at @pos (0-3) of a mmx register into
@@ -2858,8 +2864,8 @@ mmx_composite_add_8888_8888 (pixman_implementation_t *imp,
 
 	while (w && (unsigned long)dst & 7)
 	{
-	    *dst = _mm_cvtsi64_si32 (_mm_adds_pu8 (_mm_cvtsi32_si64 (*src),
-	                                           _mm_cvtsi32_si64 (*dst)));
+	    store (dst, _mm_adds_pu8 (_mm_cvtsi32_si64 (*src),
+	                              _mm_cvtsi32_si64 (*dst)));
 	    dst++;
 	    src++;
 	    w--;
@@ -2876,8 +2882,8 @@ mmx_composite_add_8888_8888 (pixman_implementation_t *imp,
 
 	if (w)
 	{
-	    *dst = _mm_cvtsi64_si32 (_mm_adds_pu8 (_mm_cvtsi32_si64 (*src),
-	                                           _mm_cvtsi32_si64 (*dst)));
+	    store (dst, _mm_adds_pu8 (_mm_cvtsi32_si64 (*src),
+	                              _mm_cvtsi32_si64 (*dst)));
 
 	}
     }


More information about the xorg-commit mailing list