Siarhei Siamashka siamashka at kemper.freedesktop.org
Sun Jul 1 12:49:48 PDT 2012

commit c430b1dba7bfea0031227dd4b976da3dd7c4ac02
Author: Siarhei Siamashka <siarhei.siamashka at gmail.com>
Date:   Tue Jun 26 01:47:18 2012 +0300

    sse2: _mm_madd_epi16 for faster bilinear scaling with 7-bit precision
    Reducing interpolation precision allows the use of the PMADDWD instruction.
    This makes bilinear scaling much faster (on Intel Core i7):
    8-bit: image             firefox-fishtank   57.584   58.349   0.74%    3/3
    7-bit: image             firefox-fishtank   51.139   51.229   0.30%    3/3
    8-bit: src_8888_8888 =  L1: 228.71  L2: 226.52  M:224.82 ( 14.95%)  HT:183.22  VT:154.02  R:171.72  RT:109.36
    7-bit: src_8888_8888 =  L1: 320.45  L2: 317.43  M:314.38 ( 20.77%)  HT:215.13  VT:177.35  R:204.46  RT:121.93

diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index 301bce5..f665b37 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -5369,8 +5369,10 @@ FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
     const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);	\
     const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);	\
-    const __m128i xmm_xorc = _mm_set_epi16 (0, 0, 0, 0, BMSK, BMSK, BMSK, BMSK);\
-    const __m128i xmm_addc = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1);		\
+    const __m128i xmm_xorc8 = _mm_set_epi16 (0, 0, 0, 0, BMSK, BMSK, BMSK, BMSK);\
+    const __m128i xmm_addc8 = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1);		\
+    const __m128i xmm_xorc7 = _mm_set_epi16 (0, BMSK, 0, BMSK, 0, BMSK, 0, BMSK);\
+    const __m128i xmm_addc7 = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1);		\
     const __m128i xmm_ux = _mm_set_epi16 (unit_x, unit_x, unit_x, unit_x,	\
 					  unit_x, unit_x, unit_x, unit_x);	\
     const __m128i xmm_zero = _mm_setzero_si128 ();				\
@@ -5390,15 +5392,28 @@ do {										\
 					xmm_wt),				\
 		       _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero),	\
 					xmm_wb));				\
-    /* calculate horizontal weights */						\
-    xmm_wh = _mm_add_epi16 (xmm_addc, _mm_xor_si128 (xmm_xorc,			\
+    if (BILINEAR_INTERPOLATION_BITS < 8)					\
+    {										\
+	/* calculate horizontal weights */					\
+	xmm_wh = _mm_add_epi16 (xmm_addc7, _mm_xor_si128 (xmm_xorc7,		\
 		   _mm_srli_epi16 (xmm_x, 16 - BILINEAR_INTERPOLATION_BITS)));	\
-    xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);					\
-    /* horizontal interpolation */						\
-    xmm_lo = _mm_mullo_epi16 (a, xmm_wh);					\
-    xmm_hi = _mm_mulhi_epu16 (a, xmm_wh);					\
-    a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi),			\
-		       _mm_unpackhi_epi16 (xmm_lo, xmm_hi));			\
+	xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);					\
+	/* horizontal interpolation */						\
+	a = _mm_madd_epi16 (_mm_unpackhi_epi16 (_mm_shuffle_epi32 (		\
+		a, _MM_SHUFFLE (1, 0, 3, 2)), a), xmm_wh);			\
+    }										\
+    else									\
+    {										\
+	/* calculate horizontal weights */					\
+	xmm_wh = _mm_add_epi16 (xmm_addc8, _mm_xor_si128 (xmm_xorc8,		\
+		_mm_srli_epi16 (xmm_x, 16 - BILINEAR_INTERPOLATION_BITS)));	\
+	xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);					\
+	/* horizontal interpolation */						\
+	xmm_lo = _mm_mullo_epi16 (a, xmm_wh);					\
+	xmm_hi = _mm_mulhi_epu16 (a, xmm_wh);					\
+	a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi),			\
+			   _mm_unpackhi_epi16 (xmm_lo, xmm_hi));		\
+    }										\
     /* shift and pack the result */						\
     a = _mm_srli_epi32 (a, BILINEAR_INTERPOLATION_BITS * 2);			\
     a = _mm_packs_epi32 (a, a);							\

