pixman: Branch 'master' - 7 commits

Sun Jul 1 13:47:35 PDT 2012

 pixman/loongson-mmintrin.h |  116 ++++++++++++++++++
 pixman/pixman-mmx.c        |  280 ++++++++++++++++++++++++++++++++++++++++++++-
 pixman/pixman-sse2.c       |    3 
 3 files changed, 393 insertions(+), 6 deletions(-)

New commits:
commit 4cdf8e9f3aca1925aeca25debb9268877ba3cd3d
Author: Matt Turner <mattst88 at gmail.com>
Date:   Sun Jul 1 16:35:46 2012 -0400

    sse2: add missing ABGR entries for bilinear src_8888_8888

diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index f665b37..665eead 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -5861,6 +5861,9 @@ static const pixman_fast_path_t sse2_fast_paths[] =
     SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
     SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
     SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, sse2_8888_8888),
 
     SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
     SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
commit ef99f9e97260cc55678385a6d691c195f57bd6b1
Author: Matt Turner <mattst88 at gmail.com>
Date:   Mon May 21 05:56:58 2012 -0400

    loongson: optimize _mm_set_pi* functions with shuffle instructions

diff --git a/pixman/loongson-mmintrin.h b/pixman/loongson-mmintrin.h
index f0931ac..086c6e0 100644
--- a/pixman/loongson-mmintrin.h
+++ b/pixman/loongson-mmintrin.h
@@ -182,9 +182,34 @@ _mm_packs_pi32 (__m64 __m1, __m64 __m2)
 	return ret;
 }
 
+#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
+ (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_set_pi16 (uint16_t __w3, uint16_t __w2, uint16_t __w1, uint16_t __w0)
 {
+	if (__builtin_constant_p (__w3) &&
+	    __builtin_constant_p (__w2) &&
+	    __builtin_constant_p (__w1) &&
+	    __builtin_constant_p (__w0))
+	{
+		uint64_t val = ((uint64_t)__w3 << 48)
+			     | ((uint64_t)__w2 << 32)
+			     | ((uint64_t)__w1 << 16)
+			     | ((uint64_t)__w0 <<  0);
+		return *(__m64 *)&val;
+	}
+	else if (__w3 == __w2 && __w2 == __w1 && __w1 == __w0)
+	{
+		/* TODO: handle other cases */
+		uint64_t val = __w3;
+		uint64_t imm = _MM_SHUFFLE (0, 0, 0, 0);
+		__m64 ret;
+		asm("pshufh %0, %1, %2\n\t"
+		    : "=f" (ret)
+		    : "f" (*(__m64 *)&val), "f" (*(__m64 *)&imm)
+		);
+		return ret;
+	}
 	uint64_t val = ((uint64_t)__w3 << 48)
 		     | ((uint64_t)__w2 << 32)
 		     | ((uint64_t)__w1 << 16)
@@ -195,10 +220,28 @@ _mm_set_pi16 (uint16_t __w3, uint16_t __w2, uint16_t __w1, uint16_t __w0)
 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_set_pi32 (unsigned __i1, unsigned __i0)
 {
+	if (__builtin_constant_p (__i1) &&
+	    __builtin_constant_p (__i0))
+	{
+		uint64_t val = ((uint64_t)__i1 << 32)
+			     | ((uint64_t)__i0 <<  0);
+		return *(__m64 *)&val;
+	}
+	else if (__i1 == __i0)
+	{
+		uint64_t imm = _MM_SHUFFLE (1, 0, 1, 0);
+		__m64 ret;
+		asm("pshufh %0, %1, %2\n\t"
+		    : "=f" (ret)
+		    : "f" (*(__m32 *)&__i1), "f" (*(__m64 *)&imm)
+		);
+		return ret;
+	}
 	uint64_t val = ((uint64_t)__i1 << 32)
 		     | ((uint64_t)__i0 <<  0);
 	return *(__m64 *)&val;
 }
+#undef _MM_SHUFFLE
 
 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_shuffle_pi16 (__m64 __m, int64_t __n)
commit 9aa8e3a26071739d160496ef9f6126f296c500eb
Author: Matt Turner <mattst88 at gmail.com>
Date:   Wed Jun 27 13:00:36 2012 -0400

    mmx: optimize bilinear function when using 7-bit precision
    
    Loongson:
    image             firefox-fishtank 1037.738 1040.218   0.19%    3/3
    image             firefox-fishtank 1056.611 1057.581   0.20%    3/3
    
    ARM/iwMMXt:
    image             firefox-fishtank 1487.282 1492.640   0.17%    3/3
    image             firefox-fishtank 1363.913 1364.366   0.11%    3/3

diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index 0c79f3a..5441d6b 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -3522,11 +3522,14 @@ mmx_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
 }
 
 #define BSHIFT ((1 << BILINEAR_INTERPOLATION_BITS))
+#define BMSK (BSHIFT - 1)
 
 #define BILINEAR_DECLARE_VARIABLES						\
     const __m64 mm_wt = _mm_set_pi16 (wt, wt, wt, wt);				\
     const __m64 mm_wb = _mm_set_pi16 (wb, wb, wb, wb);				\
     const __m64 mm_BSHIFT = _mm_set_pi16 (BSHIFT, BSHIFT, BSHIFT, BSHIFT);	\
+    const __m64 mm_addc7 = _mm_set_pi16 (0, 1, 0, 1);				\
+    const __m64 mm_xorc7 = _mm_set_pi16 (0, BMSK, 0, BMSK);			\
     const __m64 mm_ux = _mm_set_pi16 (unit_x, unit_x, unit_x, unit_x);		\
     const __m64 mm_zero = _mm_setzero_si64 ();					\
     __m64 mm_x = _mm_set_pi16 (vx, vx, vx, vx)
@@ -3544,21 +3547,37 @@ do {										\
     __m64 b_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (b, mm_zero), mm_wb);		\
     __m64 hi = _mm_add_pi16 (t_hi, b_hi);					\
     __m64 lo = _mm_add_pi16 (t_lo, b_lo);					\
-    /* calculate horizontal weights */						\
-    __m64 mm_wh_lo = _mm_sub_pi16 (mm_BSHIFT, _mm_srli_pi16 (mm_x,		\
+    if (BILINEAR_INTERPOLATION_BITS < 8)					\
+    {										\
+	/* calculate horizontal weights */					\
+	__m64 mm_wh = _mm_add_pi16 (mm_addc7, _mm_xor_si64 (mm_xorc7,		\
+			  _mm_srli_pi16 (mm_x,					\
+					 16 - BILINEAR_INTERPOLATION_BITS)));	\
+	mm_x = _mm_add_pi16 (mm_x, mm_ux);					\
+	/* horizontal interpolation */						\
+	__m64 p = _mm_unpacklo_pi16 (lo, hi);					\
+	__m64 q = _mm_unpackhi_pi16 (lo, hi);					\
+	lo = _mm_madd_pi16 (p, mm_wh);						\
+	hi = _mm_madd_pi16 (q, mm_wh);						\
+    }										\
+    else									\
+    {										\
+	/* calculate horizontal weights */					\
+	__m64 mm_wh_lo = _mm_sub_pi16 (mm_BSHIFT, _mm_srli_pi16 (mm_x,		\
 					16 - BILINEAR_INTERPOLATION_BITS));	\
-    __m64 mm_wh_hi = _mm_srli_pi16 (mm_x,					\
+	__m64 mm_wh_hi = _mm_srli_pi16 (mm_x,					\
 					16 - BILINEAR_INTERPOLATION_BITS);	\
-    mm_x = _mm_add_pi16 (mm_x, mm_ux);						\
-    /* horizontal interpolation */						\
-    __m64 mm_lo_lo = _mm_mullo_pi16 (lo, mm_wh_lo);				\
-    __m64 mm_lo_hi = _mm_mullo_pi16 (hi, mm_wh_hi);				\
-    __m64 mm_hi_lo = _mm_mulhi_pu16 (lo, mm_wh_lo);				\
-    __m64 mm_hi_hi = _mm_mulhi_pu16 (hi, mm_wh_hi);				\
-    lo = _mm_add_pi32 (_mm_unpacklo_pi16 (mm_lo_lo, mm_hi_lo),			\
-		       _mm_unpacklo_pi16 (mm_lo_hi, mm_hi_hi));			\
-    hi = _mm_add_pi32 (_mm_unpackhi_pi16 (mm_lo_lo, mm_hi_lo),			\
-		       _mm_unpackhi_pi16 (mm_lo_hi, mm_hi_hi));			\
+	mm_x = _mm_add_pi16 (mm_x, mm_ux);					\
+	/* horizontal interpolation */						\
+	__m64 mm_lo_lo = _mm_mullo_pi16 (lo, mm_wh_lo);				\
+	__m64 mm_lo_hi = _mm_mullo_pi16 (hi, mm_wh_hi);				\
+	__m64 mm_hi_lo = _mm_mulhi_pu16 (lo, mm_wh_lo);				\
+	__m64 mm_hi_hi = _mm_mulhi_pu16 (hi, mm_wh_hi);				\
+	lo = _mm_add_pi32 (_mm_unpacklo_pi16 (mm_lo_lo, mm_hi_lo),		\
+			   _mm_unpacklo_pi16 (mm_lo_hi, mm_hi_hi));		\
+	hi = _mm_add_pi32 (_mm_unpackhi_pi16 (mm_lo_lo, mm_hi_lo),		\
+			   _mm_unpackhi_pi16 (mm_lo_hi, mm_hi_hi));		\
+    }										\
     /* shift and pack the result */						\
     hi = _mm_srli_pi32 (hi, BILINEAR_INTERPOLATION_BITS * 2);			\
     lo = _mm_srli_pi32 (lo, BILINEAR_INTERPOLATION_BITS * 2);			\
commit 1ad6ae6ee8a350f6fe4f30ba928aacf44d04f86e
Author: Matt Turner <mattst88 at gmail.com>
Date:   Sun May 20 20:51:08 2012 -0400

    mmx: add scaled bilinear over_8888_8_8888
    
    Loongson:
    image             firefox-fishtank 1665.163 1670.370   0.17%    3/3
    image             firefox-fishtank 1037.738 1040.218   0.19%    3/3
    
    ARM/iwMMXt:
    image             firefox-fishtank 2042.723 2045.308   0.10%    3/3
    image             firefox-fishtank 1487.282 1492.640   0.17%    3/3

diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index bf66a63..0c79f3a 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -3567,6 +3567,12 @@ do {										\
     pix = lo;									\
 } while (0)
 
+#define BILINEAR_SKIP_ONE_PIXEL()						\
+do {										\
+    vx += unit_x;								\
+    mm_x = _mm_add_pi16 (mm_x, mm_ux);						\
+} while(0)
+
 static force_inline void
 scaled_bilinear_scanline_mmx_8888_8888_SRC (uint32_t *       dst,
 					    const uint32_t * mask,
@@ -3659,6 +3665,79 @@ FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_OVER,
 			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
 			       uint32_t, uint32_t, uint32_t,
 			       NORMAL, FLAG_NONE)
+
+static force_inline void
+scaled_bilinear_scanline_mmx_8888_8_8888_OVER (uint32_t *       dst,
+					       const uint8_t  * mask,
+					       const uint32_t * src_top,
+					       const uint32_t * src_bottom,
+					       int32_t          w,
+					       int              wt,
+					       int              wb,
+					       pixman_fixed_t   vx,
+					       pixman_fixed_t   unit_x,
+					       pixman_fixed_t   max_vx,
+					       pixman_bool_t    zero_src)
+{
+    BILINEAR_DECLARE_VARIABLES;
+    __m64 pix1, pix2;
+    uint32_t m;
+
+    while (w)
+    {
+	m = (uint32_t) *mask++;
+
+	if (m)
+	{
+	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+
+	    if (m == 0xff && is_opaque (pix1))
+	    {
+		store (dst, pix1);
+	    }
+	    else
+	    {
+		__m64 ms, md, ma, msa;
+
+		pix2 = load (dst);
+		ma = expand_alpha_rev (to_m64 (m));
+		ms = _mm_unpacklo_pi8 (pix1, _mm_setzero_si64 ());
+		md = _mm_unpacklo_pi8 (pix2, _mm_setzero_si64 ());
+
+		msa = expand_alpha (ms);
+
+		store8888 (dst, (in_over (ms, msa, ma, md)));
+	    }
+	}
+	else
+	{
+	    BILINEAR_SKIP_ONE_PIXEL ();
+	}
+
+	w--;
+	dst++;
+    }
+
+    _mm_empty ();
+}
+
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_cover_OVER,
+			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
+			       uint32_t, uint8_t, uint32_t,
+			       COVER, FLAG_HAVE_NON_SOLID_MASK)
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_pad_OVER,
+			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
+			       uint32_t, uint8_t, uint32_t,
+			       PAD, FLAG_HAVE_NON_SOLID_MASK)
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_none_OVER,
+			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
+			       uint32_t, uint8_t, uint32_t,
+			       NONE, FLAG_HAVE_NON_SOLID_MASK)
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_normal_OVER,
+			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
+			       uint32_t, uint8_t, uint32_t,
+			       NORMAL, FLAG_HAVE_NON_SOLID_MASK)
+
 static uint32_t *
 mmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
 {
@@ -3926,6 +4005,11 @@ static const pixman_fast_path_t mmx_fast_paths[] =
     SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8,         a8r8g8b8, mmx_8888_8888                     ),
     SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8,         a8b8g8r8, mmx_8888_8888                     ),
 
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8_8888                   ),
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8_8888                   ),
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8_8888                   ),
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8_8888                   ),
+
     { PIXMAN_OP_NONE },
 };
 
commit c43de364cbcd195f7d1d6881a6109cbb3d6b73b8
Author: Matt Turner <mattst88 at gmail.com>
Date:   Wed Jun 27 12:57:45 2012 -0400

    mmx: add scaled bilinear over_8888_8888
    
    Loongson:
    image         firefox-planet-gnome  157.012  158.087   0.30%    6/6
    image         firefox-planet-gnome  156.617  157.109   0.15%    5/6
    
    ARM/iwMMXt:
    image         firefox-planet-gnome  148.086  149.339   0.76%    6/6
    image         firefox-planet-gnome  144.939  146.123   0.61%    6/6

diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index 018a2ba..bf66a63 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -695,6 +695,24 @@ combine (const uint32_t *src, const uint32_t *mask)
     return vsrc;
 }
 
+static force_inline __m64
+core_combine_over_u_pixel_mmx (__m64 vsrc, __m64 vdst)
+{
+    vsrc = _mm_unpacklo_pi8 (vsrc, _mm_setzero_si64 ());
+
+    if (is_opaque (vsrc))
+    {
+	return vsrc;
+    }
+    else if (!is_zero (vsrc))
+    {
+	return over (vsrc, expand_alpha (vsrc),
+		     _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ()));
+    }
+
+    return _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ());
+}
+
 static void
 mmx_combine_over_u (pixman_implementation_t *imp,
                     pixman_op_t              op,
@@ -3546,7 +3564,7 @@ do {										\
     lo = _mm_srli_pi32 (lo, BILINEAR_INTERPOLATION_BITS * 2);			\
     lo = _mm_packs_pi32 (lo, hi);						\
     lo = _mm_packs_pu16 (lo, lo);						\
-    store (&pix, lo);								\
+    pix = lo;									\
 } while (0)
 
 static force_inline void
@@ -3563,12 +3581,13 @@ scaled_bilinear_scanline_mmx_8888_8888_SRC (uint32_t *       dst,
 					    pixman_bool_t    zero_src)
 {
     BILINEAR_DECLARE_VARIABLES;
-    uint32_t pix;
+    __m64 pix;
 
     while (w--)
     {
 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix);
-	*dst++ = pix;
+	store (dst, pix);
+	dst++;
     }
 
     _mm_empty ();
@@ -3591,6 +3610,55 @@ FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_SRC,
 			       uint32_t, uint32_t, uint32_t,
 			       NORMAL, FLAG_NONE)
 
+static force_inline void
+scaled_bilinear_scanline_mmx_8888_8888_OVER (uint32_t *       dst,
+					     const uint32_t * mask,
+					     const uint32_t * src_top,
+					     const uint32_t * src_bottom,
+					     int32_t          w,
+					     int              wt,
+					     int              wb,
+					     pixman_fixed_t   vx,
+					     pixman_fixed_t   unit_x,
+					     pixman_fixed_t   max_vx,
+					     pixman_bool_t    zero_src)
+{
+    BILINEAR_DECLARE_VARIABLES;
+    __m64 pix1, pix2;
+
+    while (w)
+    {
+	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+
+	if (!is_zero (pix1))
+	{
+	    pix2 = load (dst);
+	    store8888 (dst, core_combine_over_u_pixel_mmx (pix1, pix2));
+	}
+
+	w--;
+	dst++;
+    }
+
+    _mm_empty ();
+}
+
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_OVER,
+			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
+			       uint32_t, uint32_t, uint32_t,
+			       COVER, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_OVER,
+			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
+			       uint32_t, uint32_t, uint32_t,
+			       PAD, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_OVER,
+			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
+			       uint32_t, uint32_t, uint32_t,
+			       NONE, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_OVER,
+			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
+			       uint32_t, uint32_t, uint32_t,
+			       NORMAL, FLAG_NONE)
 static uint32_t *
 mmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
 {
@@ -3853,6 +3921,11 @@ static const pixman_fast_path_t mmx_fast_paths[] =
     SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8,          x8b8g8r8, mmx_8888_8888                     ),
     SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8,          x8b8g8r8, mmx_8888_8888                     ),
 
+    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8,         x8r8g8b8, mmx_8888_8888                     ),
+    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8,         x8b8g8r8, mmx_8888_8888                     ),
+    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8,         a8r8g8b8, mmx_8888_8888                     ),
+    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8,         a8b8g8r8, mmx_8888_8888                     ),
+
     { PIXMAN_OP_NONE },
 };
 
commit 9209cd746b7a81d0536df6dadd6a0b0b983291cb
Author: Matt Turner <mattst88 at gmail.com>
Date:   Tue Jun 19 00:30:51 2012 -0400

    mmx: add scaled bilinear src_8888_8888
    
    Loongson:
    image         firefox-planet-gnome  170.025  170.229   0.09%    3/4
    image         firefox-planet-gnome  157.012  158.087   0.30%    6/6
    
    ARM/iwMMXt:
    image         firefox-planet-gnome  164.192  164.875   0.34%    3/4
    image         firefox-planet-gnome  148.086  149.339   0.76%    6/6

diff --git a/pixman/loongson-mmintrin.h b/pixman/loongson-mmintrin.h
index 1a114fe..f0931ac 100644
--- a/pixman/loongson-mmintrin.h
+++ b/pixman/loongson-mmintrin.h
@@ -45,6 +45,28 @@ _mm_setzero_si64 (void)
 }
 
 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_pi16 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("paddh %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_pi32 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("paddw %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_adds_pu16 (__m64 __m1, __m64 __m2)
 {
 	__m64 ret;
@@ -150,6 +172,35 @@ _mm_packs_pu16 (__m64 __m1, __m64 __m2)
 }
 
 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_packs_pi32 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("packsswh %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_pi16 (uint16_t __w3, uint16_t __w2, uint16_t __w1, uint16_t __w0)
+{
+	uint64_t val = ((uint64_t)__w3 << 48)
+		     | ((uint64_t)__w2 << 32)
+		     | ((uint64_t)__w1 << 16)
+		     | ((uint64_t)__w0 <<  0);
+	return *(__m64 *)&val;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_pi32 (unsigned __i1, unsigned __i0)
+{
+	uint64_t val = ((uint64_t)__i1 << 32)
+		     | ((uint64_t)__i0 <<  0);
+	return *(__m64 *)&val;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_shuffle_pi16 (__m64 __m, int64_t __n)
 {
 	__m64 ret;
@@ -193,6 +244,17 @@ _mm_srli_pi16 (__m64 __m, int64_t __count)
 }
 
 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srli_pi32 (__m64 __m, int64_t __count)
+{
+	__m64 ret;
+	asm("psrlw %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m), "f" (*(__m64 *)&__count)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_srli_si64 (__m64 __m, int64_t __count)
 {
 	__m64 ret;
@@ -204,6 +266,17 @@ _mm_srli_si64 (__m64 __m, int64_t __count)
 }
 
 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_pi16 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("psubh %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
 {
 	__m64 ret;
diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index 0ebe119..018a2ba 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -42,6 +42,7 @@
 #endif
 #include "pixman-private.h"
 #include "pixman-combine32.h"
+#include "pixman-inlines.h"
 
 #define no_vERBOSE
 
@@ -3502,6 +3503,94 @@ mmx_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
     _mm_empty ();
 }
 
+#define BSHIFT ((1 << BILINEAR_INTERPOLATION_BITS))
+
+#define BILINEAR_DECLARE_VARIABLES						\
+    const __m64 mm_wt = _mm_set_pi16 (wt, wt, wt, wt);				\
+    const __m64 mm_wb = _mm_set_pi16 (wb, wb, wb, wb);				\
+    const __m64 mm_BSHIFT = _mm_set_pi16 (BSHIFT, BSHIFT, BSHIFT, BSHIFT);	\
+    const __m64 mm_ux = _mm_set_pi16 (unit_x, unit_x, unit_x, unit_x);		\
+    const __m64 mm_zero = _mm_setzero_si64 ();					\
+    __m64 mm_x = _mm_set_pi16 (vx, vx, vx, vx)
+
+#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix)					\
+do {										\
+    /* fetch 2x2 pixel block into 2 mmx registers */				\
+    __m64 t = ldq_u ((__m64 *)&src_top [pixman_fixed_to_int (vx)]);		\
+    __m64 b = ldq_u ((__m64 *)&src_bottom [pixman_fixed_to_int (vx)]);		\
+    vx += unit_x;								\
+    /* vertical interpolation */						\
+    __m64 t_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (t, mm_zero), mm_wt);		\
+    __m64 t_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (t, mm_zero), mm_wt);		\
+    __m64 b_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (b, mm_zero), mm_wb);		\
+    __m64 b_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (b, mm_zero), mm_wb);		\
+    __m64 hi = _mm_add_pi16 (t_hi, b_hi);					\
+    __m64 lo = _mm_add_pi16 (t_lo, b_lo);					\
+    /* calculate horizontal weights */						\
+    __m64 mm_wh_lo = _mm_sub_pi16 (mm_BSHIFT, _mm_srli_pi16 (mm_x,		\
+					16 - BILINEAR_INTERPOLATION_BITS));	\
+    __m64 mm_wh_hi = _mm_srli_pi16 (mm_x,					\
+					16 - BILINEAR_INTERPOLATION_BITS);	\
+    mm_x = _mm_add_pi16 (mm_x, mm_ux);						\
+    /* horizontal interpolation */						\
+    __m64 mm_lo_lo = _mm_mullo_pi16 (lo, mm_wh_lo);				\
+    __m64 mm_lo_hi = _mm_mullo_pi16 (hi, mm_wh_hi);				\
+    __m64 mm_hi_lo = _mm_mulhi_pu16 (lo, mm_wh_lo);				\
+    __m64 mm_hi_hi = _mm_mulhi_pu16 (hi, mm_wh_hi);				\
+    lo = _mm_add_pi32 (_mm_unpacklo_pi16 (mm_lo_lo, mm_hi_lo),			\
+		       _mm_unpacklo_pi16 (mm_lo_hi, mm_hi_hi));			\
+    hi = _mm_add_pi32 (_mm_unpackhi_pi16 (mm_lo_lo, mm_hi_lo),			\
+		       _mm_unpackhi_pi16 (mm_lo_hi, mm_hi_hi));			\
+    /* shift and pack the result */						\
+    hi = _mm_srli_pi32 (hi, BILINEAR_INTERPOLATION_BITS * 2);			\
+    lo = _mm_srli_pi32 (lo, BILINEAR_INTERPOLATION_BITS * 2);			\
+    lo = _mm_packs_pi32 (lo, hi);						\
+    lo = _mm_packs_pu16 (lo, lo);						\
+    store (&pix, lo);								\
+} while (0)
+
+static force_inline void
+scaled_bilinear_scanline_mmx_8888_8888_SRC (uint32_t *       dst,
+					    const uint32_t * mask,
+					    const uint32_t * src_top,
+					    const uint32_t * src_bottom,
+					    int32_t          w,
+					    int              wt,
+					    int              wb,
+					    pixman_fixed_t   vx,
+					    pixman_fixed_t   unit_x,
+					    pixman_fixed_t   max_vx,
+					    pixman_bool_t    zero_src)
+{
+    BILINEAR_DECLARE_VARIABLES;
+    uint32_t pix;
+
+    while (w--)
+    {
+	BILINEAR_INTERPOLATE_ONE_PIXEL (pix);
+	*dst++ = pix;
+    }
+
+    _mm_empty ();
+}
+
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_SRC,
+			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
+			       uint32_t, uint32_t, uint32_t,
+			       COVER, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_SRC,
+			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
+			       uint32_t, uint32_t, uint32_t,
+			       PAD, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_SRC,
+			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
+			       uint32_t, uint32_t, uint32_t,
+			       NONE, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_SRC,
+			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
+			       uint32_t, uint32_t, uint32_t,
+			       NORMAL, FLAG_NONE)
+
 static uint32_t *
 mmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
 {
@@ -3757,6 +3846,13 @@ static const pixman_fast_path_t mmx_fast_paths[] =
     PIXMAN_STD_FAST_PATH    (IN,   a8,       null,     a8,       mmx_composite_in_8_8              ),
     PIXMAN_STD_FAST_PATH    (IN,   solid,    a8,       a8,       mmx_composite_in_n_8_8            ),
 
+    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8,          a8r8g8b8, mmx_8888_8888                     ),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8,          x8r8g8b8, mmx_8888_8888                     ),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8,          x8r8g8b8, mmx_8888_8888                     ),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8,          a8b8g8r8, mmx_8888_8888                     ),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8,          x8b8g8r8, mmx_8888_8888                     ),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8,          x8b8g8r8, mmx_8888_8888                     ),
+
     { PIXMAN_OP_NONE },
 };
 
commit 51f27d7364d66e47d882ee531b6655368159231a
Author: Matt Turner <mattst88 at gmail.com>
Date:   Thu Jun 28 12:17:16 2012 -0400

    mmx: Use expand_alpha instead of mask/shift

diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index d869c04..0ebe119 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -1599,9 +1599,7 @@ mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp,
     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
 
     mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
-    mask &= 0xff000000;
-    mask = mask | mask >> 8 | mask >> 16 | mask >> 24;
-    vmask = load8888 (&mask);
+    vmask = expand_alpha (load8888 (&mask));
 
     while (height--)
     {
@@ -1670,9 +1668,7 @@ mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp,
     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
     mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
 
-    mask &= 0xff000000;
-    mask = mask | mask >> 8 | mask >> 16 | mask >> 24;
-    vmask = load8888 (&mask);
+    vmask = expand_alpha (load8888 (&mask));
     srca = MC (4x00ff);
 
     while (height--)