pixman: Branch 'master'

Sun Apr 28 13:31:21 PDT 2013

pixman/pixman-sse2.c |   34 +++++++++++++++++++++-------------
 1 file changed, 21 insertions(+), 13 deletions(-)

New commits:
commit d768558ce195caa208262866f9262b29efff22dc
Author: Siarhei Siamashka <siarhei.siamashka at gmail.com>
Date:   Mon Jan 28 07:00:12 2013 +0200

    sse2: faster bilinear interpolation (get rid of XOR instruction)
    
    The old code was calculating horizontal weights for right pixels
    in the following way (for simplicity assume 8-bit interpolation
    precision):
    
      Start with "x = vx" and increment "x += ux" after each pixel.
      In this case the right pixel weight for interpolation can be
      calculated as "((x >> 8) ^ 0xFF) + 1", which is the same as
      "256 - (x >> 8)".
    
    The new code instead:
    
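      Starts with "x = -(vx + 1)", performs the increment "x += -ux"
      after each pixel and calculates the right weights as just
      "(x >> 8) + 1", eliminating the need for an XOR operation in the
      inner loop. This works because "-(vx + 1)" is "~vx" in two's
      complement, so the negated counter always holds the bitwise
      complement of the value the old code kept in "x", and a logical
      right shift of it yields "255 - (old_x >> 8)" directly.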
    
    So we have one fewer instruction on the critical path. Benchmarks
    with "lowlevel-blt-bench -b src_8888_8888" using GCC 4.7.2 on an
    x86-64 system with default optimizations:
    
    Intel Core i7 860 (2.8GHz):
        before: src_8888_8888 =  L1: 291.37  L2: 288.58  M:285.38
        after:  src_8888_8888 =  L1: 319.66  L2: 316.47  M:312.06
    
    Intel Core2 T7300 (2GHz):
        before: src_8888_8888 =  L1: 121.95  L2: 118.38  M:118.52
        after:  src_8888_8888 =  L1: 128.82  L2: 125.12  M:124.88
    
    Intel Atom N450 (1.67GHz):
        before: src_8888_8888 =  L1:  64.25  L2:  62.37  M: 61.80
        after:  src_8888_8888 =  L1:  64.23  L2:  62.37  M: 61.82
    
    Inspired by the "sse2_bilinear_interpolation" function (single
    pixel interpolation) from:
        http://lists.freedesktop.org/archives/pixman/2013-January/002575.html

diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index c7e9a4b..863bc18 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -5554,19 +5554,27 @@ FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
 			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
 			      uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE)
 
-#define BMSK ((1 << BILINEAR_INTERPOLATION_BITS) - 1)
-
-#define BILINEAR_DECLARE_VARIABLES						\
+#if BILINEAR_INTERPOLATION_BITS < 8
+# define BILINEAR_DECLARE_VARIABLES						\
+    const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);	\
+    const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);	\
+    const __m128i xmm_addc = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1);		\
+    const __m128i xmm_ux = _mm_set_epi16 (unit_x, -unit_x, unit_x, -unit_x,	\
+					  unit_x, -unit_x, unit_x, -unit_x);	\
+    const __m128i xmm_zero = _mm_setzero_si128 ();				\
+    __m128i xmm_x = _mm_set_epi16 (vx, -(vx + 1), vx, -(vx + 1),		\
+				   vx, -(vx + 1), vx, -(vx + 1))
+#else
+# define BILINEAR_DECLARE_VARIABLES						\
     const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);	\
     const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);	\
-    const __m128i xmm_xorc8 = _mm_set_epi16 (0, 0, 0, 0, BMSK, BMSK, BMSK, BMSK);\
-    const __m128i xmm_addc8 = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1);		\
-    const __m128i xmm_xorc7 = _mm_set_epi16 (0, BMSK, 0, BMSK, 0, BMSK, 0, BMSK);\
-    const __m128i xmm_addc7 = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1);		\
+    const __m128i xmm_addc = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1);		\
     const __m128i xmm_ux = _mm_set_epi16 (unit_x, unit_x, unit_x, unit_x,	\
-					  unit_x, unit_x, unit_x, unit_x);	\
+					  -unit_x, -unit_x, -unit_x, -unit_x);	\
     const __m128i xmm_zero = _mm_setzero_si128 ();				\
-    __m128i xmm_x = _mm_set_epi16 (vx, vx, vx, vx, vx, vx, vx, vx)
+    __m128i xmm_x = _mm_set_epi16 (vx, vx, vx, vx,				\
+				   -(vx + 1), -(vx + 1), -(vx + 1), -(vx + 1))
+#endif
 
 #define BILINEAR_INTERPOLATE_ONE_PIXEL(pix)					\
 do {										\
@@ -5585,8 +5593,8 @@ do {										\
     if (BILINEAR_INTERPOLATION_BITS < 8)					\
     {										\
 	/* calculate horizontal weights */					\
-	xmm_wh = _mm_add_epi16 (xmm_addc7, _mm_xor_si128 (xmm_xorc7,		\
-		   _mm_srli_epi16 (xmm_x, 16 - BILINEAR_INTERPOLATION_BITS)));	\
+	xmm_wh = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x,		\
+					16 - BILINEAR_INTERPOLATION_BITS));	\
 	xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);					\
 	/* horizontal interpolation */						\
 	a = _mm_madd_epi16 (_mm_unpackhi_epi16 (_mm_shuffle_epi32 (		\
@@ -5595,8 +5603,8 @@ do {										\
     else									\
     {										\
 	/* calculate horizontal weights */					\
-	xmm_wh = _mm_add_epi16 (xmm_addc8, _mm_xor_si128 (xmm_xorc8,		\
-		_mm_srli_epi16 (xmm_x, 16 - BILINEAR_INTERPOLATION_BITS)));	\
+	xmm_wh = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x,		\
+					16 - BILINEAR_INTERPOLATION_BITS));	\
 	xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);					\
 	/* horizontal interpolation */						\
 	xmm_lo = _mm_mullo_epi16 (a, xmm_wh);					\