pixman: Branch 'master' - 5 commits

Søren Sandmann Pedersen sandmann at kemper.freedesktop.org
Mon Sep 16 14:51:44 PDT 2013


 configure.ac            |   45 +++++
 pixman/Makefile.am      |   12 +
 pixman/pixman-general.c |   22 +-
 pixman/pixman-private.h |    8 +
 pixman/pixman-sse2.c    |  208 +++++++++++++++++++--------
 pixman/pixman-ssse3.c   |  362 ++++++++++++++++++++++++++++++++++++++++++++++++
 pixman/pixman-utils.c   |    9 +
 pixman/pixman-x86.c     |   15 +
 test/scaling-bench.c    |   29 ++-
 9 files changed, 628 insertions(+), 82 deletions(-)

New commits:
commit 58a79dfe6d1fd62c2b66c69fdb64f6b8ecf61da5
Author: Søren Sandmann Pedersen <ssp at redhat.com>
Date:   Thu Jun 6 16:15:39 2013 -0400

    ssse3: Add iterator for separable bilinear scaling
    
    This new iterator uses the SSSE3 instructions pmaddubsw and pabsw to
    implement fast separable bilinear scaling.
    
    There is a graph here recording the per-pixel time for various
    bilinear scaling algorithms as reported by scaling-bench:
    
        http://people.freedesktop.org/~sandmann/ssse3.v2/ssse3.v2.png
    
    As the graph shows, this new iterator is clearly faster than the
    existing C iterator, and when used with an SSE2 combiner, it is also
    faster than the existing SSE2 fast paths for upscaling, though not for
    downscaling.
    
    Another graph:
    
        http://people.freedesktop.org/~sandmann/ssse3.v2/movdqu.png
    
    shows the difference between writing to iter->buffer with movdqa,
    movdqu on an aligned buffer, and movdqu on a deliberately unaligned
    buffer. Since the differences are very small, the patch here avoids
    using movdqa because imposing alignment restrictions on iter->buffer
    may interfere with other optimizations, such as writing directly to
    the destination image.
    
    The data was measured with scaling-bench on a Sandy Bridge Core
    i3-2350M @ 2.3GHz and is available in this directory:
    
        http://people.freedesktop.org/~sandmann/ssse3.v2/
    
    where there is also a Gnumeric spreadsheet ssse3.v2.gnumeric
    containing the per-pixel values and the graph.
    
    V2:
    - Use uintptr_t instead of unsigned long in the ALIGN macro
    - Use _mm_storel_epi64 instead of _mm_cvtsi128_si64 as the latter form
      is not available on x86-32.
    - Use _mm_storeu_si128() instead of _mm_store_si128() to avoid
      imposing alignment requirements on iter->buffer

diff --git a/pixman/pixman-ssse3.c b/pixman/pixman-ssse3.c
index 19d71e7..34763e2 100644
--- a/pixman/pixman-ssse3.c
+++ b/pixman/pixman-ssse3.c
@@ -35,6 +35,316 @@
 #include "pixman-private.h"
 #include "pixman-inlines.h"
 
+typedef struct
+{
+    int		y;
+    uint64_t *	buffer;
+} line_t;
+
+typedef struct
+{
+    line_t		line0;
+    line_t		line1;
+    pixman_fixed_t	y;
+    pixman_fixed_t	x;
+    uint64_t		data[1];
+} bilinear_info_t;
+
+static void
+ssse3_fetch_horizontal (bits_image_t *image, line_t *line,
+			int y, pixman_fixed_t x, pixman_fixed_t ux, int n)
+{
+    uint32_t *bits = image->bits + y * image->rowstride;
+    __m128i vx = _mm_set_epi16 (
+	- (x + 1), x, - (x + 1), x,
+	- (x + ux + 1), x + ux,  - (x + ux + 1), x + ux);
+    __m128i vux = _mm_set_epi16 (
+	- 2 * ux, 2 * ux, - 2 * ux, 2 * ux,
+	- 2 * ux, 2 * ux, - 2 * ux, 2 * ux);
+    __m128i vaddc = _mm_set_epi16 (1, 0, 1, 0, 1, 0, 1, 0);
+    __m128i *b = (__m128i *)line->buffer;
+    __m128i vrl0, vrl1;
+
+    while ((n -= 2) >= 0)
+    {
+	__m128i vw, vr, s;
+
+	vrl1 = _mm_loadl_epi64 (
+	    (__m128i *)(bits + pixman_fixed_to_int (x + ux)));
+	/* vrl1: R1, L1 */
+
+    final_pixel:
+	vrl0 = _mm_loadl_epi64 (
+	    (__m128i *)(bits + pixman_fixed_to_int (x)));
+	/* vrl0: R0, L0 */
+
+	/* The weights are based on vx which is a vector of 
+	 *
+	 *    - (x + 1), x, - (x + 1), x,
+	 *          - (x + ux + 1), x + ux, - (x + ux + 1), x + ux
+	 *
+	 * so the 16 bit weights end up like this:
+	 *
+	 *    iw0, w0, iw0, w0, iw1, w1, iw1, w1
+	 *
+	 * and after shifting and packing, we get these bytes:
+	 *
+	 *    iw0, w0, iw0, w0, iw1, w1, iw1, w1,
+	 *        iw0, w0, iw0, w0, iw1, w1, iw1, w1,
+	 *
+	 * which means the first and the second input pixel 
+	 * have to be interleaved like this:
+	 *
+	 *    la0, ra0, lr0, rr0, la1, ra1, lr1, rr1,
+	 *        lg0, rg0, lb0, rb0, lg1, rg1, lb1, rb1
+	 *
+	 * before maddubsw can be used.
+	 */
+
+	vw = _mm_add_epi16 (
+	    vaddc, _mm_srli_epi16 (vx, 16 - BILINEAR_INTERPOLATION_BITS));
+	/* vw: iw0, w0, iw0, w0, iw1, w1, iw1, w1
+	 */
+
+	vw = _mm_packus_epi16 (vw, vw);
+	/* vw: iw0, w0, iw0, w0, iw1, w1, iw1, w1,
+	 *         iw0, w0, iw0, w0, iw1, w1, iw1, w1
+	 */
+	vx = _mm_add_epi16 (vx, vux);
+
+	x += 2 * ux;
+
+	vr = _mm_unpacklo_epi16 (vrl1, vrl0);
+	/* vr: rar0, rar1, rgb0, rgb1, lar0, lar1, lgb0, lgb1 */
+
+	s = _mm_shuffle_epi32 (vr, _MM_SHUFFLE (1, 0, 3, 2));
+	/* s:  lar0, lar1, lgb0, lgb1, rar0, rar1, rgb0, rgb1 */
+
+	vr = _mm_unpackhi_epi8 (vr, s);
+	/* vr: la0, ra0, lr0, rr0, la1, ra1, lr1, rr1,
+	 *         lg0, rg0, lb0, rb0, lg1, rg1, lb1, rb1
+	 */
+
+	vr = _mm_maddubs_epi16 (vr, vw);
+
+	/* When the weight is 0, the inverse weight is
+	 * 128 which can't be represented in a signed byte.
+	 * As a result maddubsw computes the following:
+	 *
+	 *     r = l * -128 + r * 0
+	 *
+	 * rather than the desired
+	 *
+	 *     r = l * 128 + r * 0
+	 *
+	 * We fix this by taking the absolute value of the
+	 * result.
+	 */
+	vr = _mm_abs_epi16 (vr);
+
+	/* vr: A0, R0, A1, R1, G0, B0, G1, B1 */
+	_mm_store_si128 (b++, vr);
+    }
+
+    if (n == -1)
+    {
+	vrl1 = _mm_setzero_si128();
+	goto final_pixel;
+    }
+
+    line->y = y;
+}
+
+static uint32_t *
+ssse3_fetch_bilinear_cover (pixman_iter_t *iter, const uint32_t *mask)
+{
+    pixman_fixed_t fx, ux;
+    bilinear_info_t *info = iter->data;
+    line_t *line0, *line1;
+    int y0, y1;
+    int32_t dist_y;
+    __m128i vw;
+    int i;
+
+    fx = info->x;
+    ux = iter->image->common.transform->matrix[0][0];
+
+    y0 = pixman_fixed_to_int (info->y);
+    y1 = y0 + 1;
+
+    line0 = &info->line0;
+    line1 = &info->line1;
+
+    if (line0->y != y0 || line1->y != y1)
+    {
+	if (line0->y == y1 || line1->y == y0)
+	{
+	    line_t tmp = *line0;
+	    *line0 = *line1;
+	    *line1 = tmp;
+	}
+
+	if (line0->y != y0)
+	{
+	    ssse3_fetch_horizontal (
+		&iter->image->bits, line0, y0, fx, ux, iter->width);
+	}
+
+	if (line1->y != y1)
+	{
+	    ssse3_fetch_horizontal (
+		&iter->image->bits, line1, y1, fx, ux, iter->width);
+	}
+    }
+
+    dist_y = pixman_fixed_to_bilinear_weight (info->y);
+    dist_y <<= (16 - BILINEAR_INTERPOLATION_BITS);
+
+    vw = _mm_set_epi16 (
+	dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y);
+
+    for (i = 0; i + 3 < iter->width; i += 4)
+    {
+	__m128i top0 = _mm_load_si128 ((__m128i *)(line0->buffer + i));
+	__m128i bot0 = _mm_load_si128 ((__m128i *)(line1->buffer + i));
+	__m128i top1 = _mm_load_si128 ((__m128i *)(line0->buffer + i + 2));
+	__m128i bot1 = _mm_load_si128 ((__m128i *)(line1->buffer + i + 2));
+	__m128i r0, r1, tmp, p;
+
+	r0 = _mm_mulhi_epu16 (
+	    _mm_sub_epi16 (bot0, top0), vw);
+	tmp = _mm_cmplt_epi16 (bot0, top0);
+	tmp = _mm_and_si128 (tmp, vw);
+	r0 = _mm_sub_epi16 (r0, tmp);
+	r0 = _mm_add_epi16 (r0, top0);
+	r0 = _mm_srli_epi16 (r0, BILINEAR_INTERPOLATION_BITS);
+	/* r0:  A0 R0 A1 R1 G0 B0 G1 B1 */
+	r0 = _mm_shuffle_epi32 (r0, _MM_SHUFFLE (2, 0, 3, 1));
+	/* r0:  A1 R1 G1 B1 A0 R0 G0 B0 */
+
+	r1 = _mm_mulhi_epu16 (
+	    _mm_sub_epi16 (bot1, top1), vw);
+	tmp = _mm_cmplt_epi16 (bot1, top1);
+	tmp = _mm_and_si128 (tmp, vw);
+	r1 = _mm_sub_epi16 (r1, tmp);
+	r1 = _mm_add_epi16 (r1, top1);
+	r1 = _mm_srli_epi16 (r1, BILINEAR_INTERPOLATION_BITS);
+	r1 = _mm_shuffle_epi32 (r1, _MM_SHUFFLE (2, 0, 3, 1));
+	/* r1: A3 R3 G3 B3 A2 R2 G2 B2 */
+
+	p = _mm_packus_epi16 (r0, r1);
+
+	_mm_storeu_si128 ((__m128i *)(iter->buffer + i), p);
+    }
+
+    while (i < iter->width)
+    {
+	__m128i top0 = _mm_load_si128 ((__m128i *)(line0->buffer + i));
+	__m128i bot0 = _mm_load_si128 ((__m128i *)(line1->buffer + i));
+	__m128i r0, tmp, p;
+
+	r0 = _mm_mulhi_epu16 (
+	    _mm_sub_epi16 (bot0, top0), vw);
+	tmp = _mm_cmplt_epi16 (bot0, top0);
+	tmp = _mm_and_si128 (tmp, vw);
+	r0 = _mm_sub_epi16 (r0, tmp);
+	r0 = _mm_add_epi16 (r0, top0);
+	r0 = _mm_srli_epi16 (r0, BILINEAR_INTERPOLATION_BITS);
+	/* r0:  A0 R0 A1 R1 G0 B0 G1 B1 */
+	r0 = _mm_shuffle_epi32 (r0, _MM_SHUFFLE (2, 0, 3, 1));
+	/* r0:  A1 R1 G1 B1 A0 R0 G0 B0 */
+
+	p = _mm_packus_epi16 (r0, r0);
+
+	if (iter->width - i == 1)
+	{
+	    *(uint32_t *)(iter->buffer + i) = _mm_cvtsi128_si32 (p);
+	    i++;
+	}
+	else
+	{
+	    _mm_storel_epi64 ((__m128i *)(iter->buffer + i), p);
+	    i += 2;
+	}
+    }
+    
+    info->y += iter->image->common.transform->matrix[1][1];
+
+    return iter->buffer;
+}
+
+static void
+ssse3_bilinear_cover_iter_fini (pixman_iter_t *iter)
+{
+    free (iter->data);
+}
+
+static void
+ssse3_bilinear_cover_iter_init (pixman_iter_t *iter, const pixman_iter_info_t *iter_info)
+{
+    int width = iter->width;
+    bilinear_info_t *info;
+    pixman_vector_t v;
+
+    /* Reference point is the center of the pixel */
+    v.vector[0] = pixman_int_to_fixed (iter->x) + pixman_fixed_1 / 2;
+    v.vector[1] = pixman_int_to_fixed (iter->y) + pixman_fixed_1 / 2;
+    v.vector[2] = pixman_fixed_1;
+
+    if (!pixman_transform_point_3d (iter->image->common.transform, &v))
+	goto fail;
+
+    info = malloc (sizeof (*info) + (2 * width - 1) * sizeof (uint64_t) + 64);
+    if (!info)
+	goto fail;
+
+    info->x = v.vector[0] - pixman_fixed_1 / 2;
+    info->y = v.vector[1] - pixman_fixed_1 / 2;
+
+#define ALIGN(addr)							\
+    ((void *)((((uintptr_t)(addr)) + 15) & (~15)))
+
+    /* It is safe to set the y coordinates to -1 initially
+     * because COVER_CLIP_BILINEAR ensures that we will only
+     * be asked to fetch lines in the [0, height) interval
+     */
+    info->line0.y = -1;
+    info->line0.buffer = ALIGN (&(info->data[0]));
+    info->line1.y = -1;
+    info->line1.buffer = ALIGN (info->line0.buffer + width);
+
+    iter->get_scanline = ssse3_fetch_bilinear_cover;
+    iter->fini = ssse3_bilinear_cover_iter_fini;
+
+    iter->data = info;
+    return;
+
+fail:
+    /* Something went wrong, either a bad matrix or OOM; in such cases,
+     * we don't guarantee any particular rendering.
+     */
+    _pixman_log_error (
+	FUNC, "Allocation failure or bad matrix, skipping rendering\n");
+    
+    iter->get_scanline = _pixman_iter_get_scanline_noop;
+    iter->fini = NULL;
+}
+
+static const pixman_iter_info_t ssse3_iters[] = 
+{
+    { PIXMAN_a8r8g8b8,
+      (FAST_PATH_STANDARD_FLAGS			|
+       FAST_PATH_SCALE_TRANSFORM		|
+       FAST_PATH_BILINEAR_FILTER		|
+       FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR),
+      ITER_NARROW | ITER_SRC,
+      ssse3_bilinear_cover_iter_init,
+      NULL, NULL
+    },
+
+    { PIXMAN_null },
+};
+
 static const pixman_fast_path_t ssse3_fast_paths[] =
 {
     { PIXMAN_OP_NONE },
@@ -46,5 +356,7 @@ _pixman_implementation_create_ssse3 (pixman_implementation_t *fallback)
     pixman_implementation_t *imp =
 	_pixman_implementation_create (fallback, ssse3_fast_paths);
 
+    imp->iter_info = ssse3_iters;
+
     return imp;
 }
commit f1792b32215d3b62084ee99fca5c448f1c7f8e1d
Author: Søren Sandmann Pedersen <ssp at redhat.com>
Date:   Thu Jun 6 16:32:59 2013 -0400

    Add empty SSSE3 implementation
    
    This commit adds a new, empty SSSE3 implementation and the associated
    build system support.
    
    configure.ac:   Detect whether the compiler understands SSSE3
                    intrinsics and set up the required CFLAGS
    
    Makefile.am:    Add libpixman-ssse3.la
    
    pixman-x86.c:   Add X86_SSSE3 feature flag and detect it in
                    detect_cpu_features().
    
    pixman-ssse3.c: New file with an empty SSSE3 implementation
    
    V2: Remove SSSE3_LDFLAGS since it isn't necessary unless Solaris
    support is added.

diff --git a/configure.ac b/configure.ac
index daf4062..263c63e 100644
--- a/configure.ac
+++ b/configure.ac
@@ -437,6 +437,50 @@ fi
 AM_CONDITIONAL(USE_SSE2, test $have_sse2_intrinsics = yes)
 
 dnl ===========================================================================
+dnl Check for SSSE3
+
+if test "x$SSSE3_CFLAGS" = "x" ; then
+    SSSE3_CFLAGS="-mssse3 -Winline"
+fi
+
+have_ssse3_intrinsics=no
+AC_MSG_CHECKING(whether to use SSSE3 intrinsics)
+xserver_save_CFLAGS=$CFLAGS
+CFLAGS="$SSSE3_CFLAGS $CFLAGS"
+
+AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
+#include <mmintrin.h>
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#include <tmmintrin.h>
+int main () {
+    __m128i a = _mm_set1_epi32 (0), b = _mm_set1_epi32 (0), c;
+    c = _mm_maddubs_epi16 (a, b);
+    return 0;
+}]])], have_ssse3_intrinsics=yes)
+CFLAGS=$xserver_save_CFLAGS
+
+AC_ARG_ENABLE(ssse3,
+   [AC_HELP_STRING([--disable-ssse3],
+                   [disable SSSE3 fast paths])],
+   [enable_ssse3=$enableval], [enable_ssse3=auto])
+
+if test $enable_ssse3 = no ; then
+   have_ssse3_intrinsics=disabled
+fi
+
+if test $have_ssse3_intrinsics = yes ; then
+   AC_DEFINE(USE_SSSE3, 1, [use SSSE3 compiler intrinsics])
+fi
+
+AC_MSG_RESULT($have_ssse3_intrinsics)
+if test $enable_ssse3 = yes && test $have_ssse3_intrinsics = no ; then
+   AC_MSG_ERROR([SSSE3 intrinsics not detected])
+fi
+
+AM_CONDITIONAL(USE_SSSE3, test $have_ssse3_intrinsics = yes)
+
+dnl ===========================================================================
 dnl Other special flags needed when building code using MMX or SSE instructions
 case $host_os in
    solaris*)
@@ -471,6 +515,7 @@ AC_SUBST(MMX_CFLAGS)
 AC_SUBST(MMX_LDFLAGS)
 AC_SUBST(SSE2_CFLAGS)
 AC_SUBST(SSE2_LDFLAGS)
+AC_SUBST(SSSE3_CFLAGS)
 
 dnl ===========================================================================
 dnl Check for VMX/Altivec
diff --git a/pixman/Makefile.am b/pixman/Makefile.am
index b9ea754..b376d9a 100644
--- a/pixman/Makefile.am
+++ b/pixman/Makefile.am
@@ -52,6 +52,18 @@ libpixman_1_la_LIBADD += libpixman-sse2.la
 ASM_CFLAGS_sse2=$(SSE2_CFLAGS)
 endif
 
+# ssse3 code
+if USE_SSSE3
+noinst_LTLIBRARIES += libpixman-ssse3.la
+libpixman_ssse3_la_SOURCES = \
+	pixman-ssse3.c
+libpixman_ssse3_la_CFLAGS = $(SSSE3_CFLAGS)
+libpixman_1_la_LDFLAGS += $(SSSE3_LDFLAGS)
+libpixman_1_la_LIBADD += libpixman-ssse3.la
+
+ASM_CFLAGS_ssse3=$(SSSE3_CFLAGS)
+endif
+
 # arm simd code
 if USE_ARM_SIMD
 noinst_LTLIBRARIES += libpixman-arm-simd.la
diff --git a/pixman/pixman-private.h b/pixman/pixman-private.h
index 535117d..6ca13b2 100644
--- a/pixman/pixman-private.h
+++ b/pixman/pixman-private.h
@@ -593,6 +593,11 @@ pixman_implementation_t *
 _pixman_implementation_create_sse2 (pixman_implementation_t *fallback);
 #endif
 
+#ifdef USE_SSSE3
+pixman_implementation_t *
+_pixman_implementation_create_ssse3 (pixman_implementation_t *fallback);
+#endif
+
 #ifdef USE_ARM_SIMD
 pixman_implementation_t *
 _pixman_implementation_create_arm_simd (pixman_implementation_t *fallback);
diff --git a/pixman/pixman-ssse3.c b/pixman/pixman-ssse3.c
new file mode 100644
index 0000000..19d71e7
--- /dev/null
+++ b/pixman/pixman-ssse3.c
@@ -0,0 +1,50 @@
+/*
+ * Copyright © 2013 Soren Sandmann Pedersen
+ * Copyright © 2013 Red Hat, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Author: Soren Sandmann (soren.sandmann at gmail.com)
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdlib.h>
+#include <mmintrin.h>
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#include <tmmintrin.h>
+#include "pixman-private.h"
+#include "pixman-inlines.h"
+
+static const pixman_fast_path_t ssse3_fast_paths[] =
+{
+    { PIXMAN_OP_NONE },
+};
+
+pixman_implementation_t *
+_pixman_implementation_create_ssse3 (pixman_implementation_t *fallback)
+{
+    pixman_implementation_t *imp =
+	_pixman_implementation_create (fallback, ssse3_fast_paths);
+
+    return imp;
+}
diff --git a/pixman/pixman-x86.c b/pixman/pixman-x86.c
index 57e4d1f..6527760 100644
--- a/pixman/pixman-x86.c
+++ b/pixman/pixman-x86.c
@@ -25,7 +25,7 @@
 
 #include "pixman-private.h"
 
-#if defined(USE_X86_MMX) || defined (USE_SSE2)
+#if defined(USE_X86_MMX) || defined (USE_SSE2) || defined (USE_SSSE3)
 
 /* The CPU detection code needs to be in a file not compiled with
  * "-mmmx -msse", as gcc would generate CMOV instructions otherwise
@@ -39,7 +39,8 @@ typedef enum
     X86_MMX_EXTENSIONS		= (1 << 1),
     X86_SSE			= (1 << 2) | X86_MMX_EXTENSIONS,
     X86_SSE2			= (1 << 3),
-    X86_CMOV			= (1 << 4)
+    X86_CMOV			= (1 << 4),
+    X86_SSSE3			= (1 << 5)
 } cpu_features_t;
 
 #ifdef HAVE_GETISAX
@@ -64,6 +65,8 @@ detect_cpu_features (void)
 	    features |= X86_SSE;
 	if (result & AV_386_SSE2)
 	    features |= X86_SSE2;
+	if (result & AV_386_SSSE3)
+	    features |= X86_SSSE3;
     }
 
     return features;
@@ -167,6 +170,8 @@ detect_cpu_features (void)
 	features |= X86_SSE;
     if (d & (1 << 26))
 	features |= X86_SSE2;
+    if (d & (1 << 9))
+	features |= X86_SSSE3;
 
     /* Check for AMD specific features */
     if ((features & X86_MMX) && !(features & X86_SSE))
@@ -222,6 +227,7 @@ _pixman_x86_get_implementations (pixman_implementation_t *imp)
 {
 #define MMX_BITS  (X86_MMX | X86_MMX_EXTENSIONS)
 #define SSE2_BITS (X86_MMX | X86_MMX_EXTENSIONS | X86_SSE | X86_SSE2)
+#define SSSE3_BITS (X86_SSE | X86_SSE2 | X86_SSSE3)
 
 #ifdef USE_X86_MMX
     if (!_pixman_disabled ("mmx") && have_feature (MMX_BITS))
@@ -233,5 +239,10 @@ _pixman_x86_get_implementations (pixman_implementation_t *imp)
 	imp = _pixman_implementation_create_sse2 (imp);
 #endif
 
+#ifdef USE_SSSE3
+    if (!_pixman_disabled ("ssse3") && have_feature (SSSE3_BITS))
+	imp = _pixman_implementation_create_ssse3 (imp);
+#endif
+
     return imp;
 }
commit f10b5449a8b22a26839c58a716b74d6b7a8bcb80
Author: Søren Sandmann Pedersen <ssp at redhat.com>
Date:   Wed Aug 28 15:36:13 2013 -0400

    general: Ensure that iter buffers are aligned to 16 bytes
    
    At the moment iter buffers are only guaranteed to be aligned to a 4
    byte boundary. SIMD implementations benefit from the buffers being
    aligned to 16 bytes, so ensure this is the case.
    
    V2:
    - Use uintptr_t instead of unsigned long
    - allocate 3 * SCANLINE_BUFFER_LENGTH bytes on the stack rather than
      just SCANLINE_BUFFER_LENGTH
    - use sizeof (stack_scanline_buffer) instead of SCANLINE_BUFFER_LENGTH
      to determine overflow
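
    The rounding idiom behind the ALIGN macro used in the patch below can
    be shown stand-alone; the example addresses in the comment are mine,
    not from the commit:

        #include <stdint.h>

        /* Round a pointer up to the next 16-byte boundary. */
        static uint8_t *
        align16 (void *addr)
        {
            return (uint8_t *) (((uintptr_t) addr + 15) & ~(uintptr_t) 15);
        }

        /* align16 ((void *) 0x1001) -> 0x1010
         * align16 ((void *) 0x1010) -> 0x1010 (already aligned) */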

diff --git a/pixman/pixman-general.c b/pixman/pixman-general.c
index 6310bff..a653fa7 100644
--- a/pixman/pixman-general.c
+++ b/pixman/pixman-general.c
@@ -114,7 +114,7 @@ general_composite_rect  (pixman_implementation_t *imp,
                          pixman_composite_info_t *info)
 {
     PIXMAN_COMPOSITE_ARGS (info);
-    uint64_t stack_scanline_buffer[(SCANLINE_BUFFER_LENGTH * 3 + 7) / 8];
+    uint8_t stack_scanline_buffer[3 * SCANLINE_BUFFER_LENGTH];
     uint8_t *scanline_buffer = (uint8_t *) stack_scanline_buffer;
     uint8_t *src_buffer, *mask_buffer, *dest_buffer;
     pixman_iter_t src_iter, mask_iter, dest_iter;
@@ -137,17 +137,25 @@ general_composite_rect  (pixman_implementation_t *imp,
 	Bpp = 16;
     }
 
-    if (width * Bpp > SCANLINE_BUFFER_LENGTH)
+#define ALIGN(addr)							\
+    ((uint8_t *)((((uintptr_t)(addr)) + 15) & (~15)))
+
+    src_buffer = ALIGN (scanline_buffer);
+    mask_buffer = ALIGN (src_buffer + width * Bpp);
+    dest_buffer = ALIGN (mask_buffer + width * Bpp);
+
+    if (ALIGN (dest_buffer + width * Bpp) >
+	    scanline_buffer + sizeof (stack_scanline_buffer))
     {
-	scanline_buffer = pixman_malloc_abc (width, 3, Bpp);
+	scanline_buffer = pixman_malloc_ab_plus_c (width, Bpp * 3, 32 * 3);
 
 	if (!scanline_buffer)
 	    return;
-    }
 
-    src_buffer = scanline_buffer;
-    mask_buffer = src_buffer + width * Bpp;
-    dest_buffer = mask_buffer + width * Bpp;
+	src_buffer = ALIGN (scanline_buffer);
+	mask_buffer = ALIGN (src_buffer + width * Bpp);
+	dest_buffer = ALIGN (mask_buffer + width * Bpp);
+    }
 
     if (width_flag == ITER_WIDE)
     {
diff --git a/pixman/pixman-private.h b/pixman/pixman-private.h
index 120196d..535117d 100644
--- a/pixman/pixman-private.h
+++ b/pixman/pixman-private.h
@@ -787,6 +787,9 @@ pixman_malloc_ab (unsigned int n, unsigned int b);
 void *
 pixman_malloc_abc (unsigned int a, unsigned int b, unsigned int c);
 
+void *
+pixman_malloc_ab_plus_c (unsigned int a, unsigned int b, unsigned int c);
+
 pixman_bool_t
 _pixman_multiply_overflows_size (size_t a, size_t b);
 
diff --git a/pixman/pixman-utils.c b/pixman/pixman-utils.c
index 98723a8..4a3a835 100644
--- a/pixman/pixman-utils.c
+++ b/pixman/pixman-utils.c
@@ -49,6 +49,15 @@ _pixman_addition_overflows_int (unsigned int a, unsigned int b)
 }
 
 void *
+pixman_malloc_ab_plus_c (unsigned int a, unsigned int b, unsigned int c)
+{
+    if (!b || a >= INT32_MAX / b || (a * b) > INT32_MAX - c)
+	return NULL;
+
+    return malloc (a * b + c);
+}
+
+void *
 pixman_malloc_ab (unsigned int a,
                   unsigned int b)
 {
commit 700db9d872bdc49399a95565ffe0d345db11717a
Author: Siarhei Siamashka <siarhei.siamashka at gmail.com>
Date:   Tue Sep 3 04:39:54 2013 +0300

    sse2: faster bilinear scaling (pack 4 pixels to write with MOVDQA)
    
    The loops are already unrolled, so it was just a matter of packing
    4 pixels into a single XMM register and doing aligned 128-bit
    writes to memory via MOVDQA instructions for the SRC compositing
    operator fast path. For the other fast paths, this XMM register
    is also directly routed to further processing instead of doing
    extra reshuffling. This replaces "8 PACKSSDW/PACKUSWB + 4 MOVD"
    instructions with "3 PACKSSDW/PACKUSWB + 1 MOVDQA" per 4 pixels,
    which results in a clear performance improvement.
    
    There are also some other (less important) tweaks:
    
    1. Convert 'pixman_fixed_t' to 'intptr_t' before using it as an
       index for addressing memory. The problem is that 'pixman_fixed_t'
       is a 32-bit data type and it has to be extended to 64-bit
       offsets, which needs extra instructions on 64-bit systems.
    
    2. Recalculate the horizontal interpolation weights only
       once per 4 pixels by treating the XMM register as four pairs
       of 16-bit values. Each of these 16-bit/16-bit pairs can be
       replicated to fill the whole 128-bit register by using PSHUFD
       instructions. So we get "3 PADDW/PSRLW + 4 PSHUFD" instructions
       per 4 pixels instead of "12 PADDW/PSRLW" per 4 pixels
       (or "3 PADDW/PSRLW" per each pixel).
    
       Now a good question is whether replacing "9 PADDW/PSRLW" with
       "4 PSHUFD" is a favourable exchange. As it turns out, PSHUFD
       instructions are very fast on new Intel processors (including
       Atoms), but are rather slow on the first generation of Core2
       (Merom) and on the other processors from that time or older.
       A good instruction latency/throughput table, covering all the
       relevant processors, can be found at:
            http://www.agner.org/optimize/instruction_tables.pdf
    
       Enabling this optimization is controlled by the PSHUFD_IS_FAST
       define in "pixman-sse2.c".
    
    3. One use of the PSHUFD instruction (_mm_shuffle_epi32 intrinsic) in
       the older code has also been replaced by its PUNPCKLQDQ equivalent
       (_mm_unpacklo_epi64 intrinsic) in the PSHUFD_IS_FAST=0 configuration.
       The PUNPCKLQDQ instruction is usually faster on older processors,
       but has some side effects (instead of fully overwriting the
       destination register like PSHUFD does, it retains half of the
       original value, which may inhibit some compiler optimizations).
    
    Benchmarks with "lowlevel-blt-bench -b src_8888_8888" using GCC 4.8.1 on
    x86-64 system and default optimizations. The results are in MPix/s:
    
    ====== Intel Core2 T7300 (2GHz) ======
    
    old:                     src_8888_8888 =  L1: 128.69  L2: 125.07  M:124.86
                            over_8888_8888 =  L1:  83.19  L2:  81.73  M: 80.63
                          over_8888_n_8888 =  L1:  79.56  L2:  78.61  M: 77.85
                          over_8888_8_8888 =  L1:  77.15  L2:  75.79  M: 74.63
    
    new (PSHUFD_IS_FAST=0):  src_8888_8888 =  L1: 168.67  L2: 163.26  M:162.44
                            over_8888_8888 =  L1: 102.91  L2: 100.43  M: 99.01
                          over_8888_n_8888 =  L1:  97.40  L2:  95.64  M: 94.24
                          over_8888_8_8888 =  L1:  98.04  L2:  95.83  M: 94.33
    
    new (PSHUFD_IS_FAST=1):  src_8888_8888 =  L1: 154.67  L2: 149.16  M:148.48
                            over_8888_8888 =  L1:  95.97  L2:  93.90  M: 91.85
                          over_8888_n_8888 =  L1:  93.18  L2:  91.47  M: 90.15
                          over_8888_8_8888 =  L1:  95.33  L2:  93.32  M: 91.42
    
    ====== Intel Core i7 860 (2.8GHz) ======
    
    old:                     src_8888_8888 =  L1: 323.48  L2: 318.86  M:314.81
                            over_8888_8888 =  L1: 187.38  L2: 186.74  M:182.46
    
    new (PSHUFD_IS_FAST=0):  src_8888_8888 =  L1: 373.06  L2: 370.94  M:368.32
                            over_8888_8888 =  L1: 217.28  L2: 215.57  M:211.32
    
    new (PSHUFD_IS_FAST=1):  src_8888_8888 =  L1: 401.98  L2: 397.65  M:395.61
                            over_8888_8888 =  L1: 218.89  L2: 217.56  M:213.48
    
    The most interesting benchmark is "src_8888_8888" (because this code can
    be reused for a generic non-separable SSE2 bilinear fetch iterator).
    
    The results show that PSHUFD instructions are bad for Intel Core2 T7300
    (Merom core) and good for Intel Core i7 860 (Nehalem core). Both of these
    processors support SSSE3 instructions though, so they are not the primary
    targets for SSE2 code. But without any other more relevant hardware to
    test on, PSHUFD_IS_FAST=0 seems to be a reasonable default for SSE2 code
    and old processors (until the runtime CPU feature detection becomes
    clever enough to recognize different microarchitectures).
    
    (Rebased on top of patch that removes support for 8-bit bilinear
     filtering -ssp)
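
    The intptr_t conversion from point 1 can be illustrated with a
    stand-alone sketch (my example, not code from the patch; the typedef
    just restates pixman's 16.16 fixed-point type so the snippet is
    self-contained):

        #include <stdint.h>

        typedef int32_t pixman_fixed_t;   /* 16.16 fixed point */

        /* 32-bit index: as noted above, the index has to be re-extended
         * to a 64-bit offset on x86-64. */
        static uint32_t
        sum_fixed_index (const uint32_t *src, pixman_fixed_t vx,
                         pixman_fixed_t unit_x, int n)
        {
            uint32_t s = 0;
            while (n--)
            {
                s += src[vx >> 16];
                vx += unit_x;
            }
            return s;
        }

        /* Pointer-sized index: widened once on entry, then used as-is. */
        static uint32_t
        sum_intptr_index (const uint32_t *src, pixman_fixed_t vx_,
                          pixman_fixed_t unit_x_, int n)
        {
            intptr_t vx = vx_, unit_x = unit_x_;
            uint32_t s = 0;
            while (n--)
            {
                s += src[vx >> 16];
                vx += unit_x;
            }
            return s;
        }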

diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index a629565..42c7209 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -30,6 +30,9 @@
 #include <config.h>
 #endif
 
+/* PSHUFD is slow on a lot of old processors, and new processors have SSSE3 */
+#define PSHUFD_IS_FAST 0
+
 #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
 #include <emmintrin.h> /* for SSE2 intrinsics */
 #include "pixman-private.h"
@@ -5554,50 +5557,134 @@ FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
 			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
 			      uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE)
 
+#if PSHUFD_IS_FAST
+
+/***********************************************************************************/
+
 # define BILINEAR_DECLARE_VARIABLES						\
     const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);	\
     const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);	\
     const __m128i xmm_addc = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1);		\
-    const __m128i xmm_ux = _mm_set_epi16 (unit_x, -unit_x, unit_x, -unit_x,	\
+    const __m128i xmm_ux1 = _mm_set_epi16 (unit_x, -unit_x, unit_x, -unit_x,	\
+					   unit_x, -unit_x, unit_x, -unit_x);	\
+    const __m128i xmm_ux4 = _mm_set_epi16 (unit_x * 4, -unit_x * 4,		\
+					   unit_x * 4, -unit_x * 4,		\
+					   unit_x * 4, -unit_x * 4,		\
+					   unit_x * 4, -unit_x * 4);		\
+    const __m128i xmm_zero = _mm_setzero_si128 ();				\
+    __m128i xmm_x = _mm_set_epi16 (vx + unit_x * 3, -(vx + 1) - unit_x * 3,	\
+				   vx + unit_x * 2, -(vx + 1) - unit_x * 2,	\
+				   vx + unit_x * 1, -(vx + 1) - unit_x * 1,	\
+				   vx + unit_x * 0, -(vx + 1) - unit_x * 0);	\
+    __m128i xmm_wh_state;
+
+#define BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER(pix, phase_)			\
+do {										\
+    int phase = phase_;								\
+    __m128i xmm_wh, xmm_a, xmm_b;						\
+    /* fetch 2x2 pixel block into sse2 registers */				\
+    __m128i tltr = _mm_loadl_epi64 ((__m128i *)&src_top[vx >> 16]);		\
+    __m128i blbr = _mm_loadl_epi64 ((__m128i *)&src_bottom[vx >> 16]);		\
+    vx += unit_x;								\
+    /* vertical interpolation */						\
+    xmm_a = _mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero), xmm_wt);	\
+    xmm_b = _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero), xmm_wb);	\
+    xmm_a = _mm_add_epi16 (xmm_a, xmm_b);						\
+    /* calculate horizontal weights */						\
+    if (phase <= 0)								\
+    {										\
+	xmm_wh_state = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x,		\
+					16 - BILINEAR_INTERPOLATION_BITS));	\
+	xmm_x = _mm_add_epi16 (xmm_x, (phase < 0) ? xmm_ux1 : xmm_ux4);		\
+	phase = 0;								\
+    }										\
+    xmm_wh = _mm_shuffle_epi32 (xmm_wh_state, _MM_SHUFFLE (phase, phase,	\
+							   phase, phase));	\
+    /* horizontal interpolation */						\
+    xmm_a = _mm_madd_epi16 (_mm_unpackhi_epi16 (_mm_shuffle_epi32 (		\
+		xmm_a, _MM_SHUFFLE (1, 0, 3, 2)), xmm_a), xmm_wh);		\
+    /* shift the result */							\
+    pix = _mm_srli_epi32 (xmm_a, BILINEAR_INTERPOLATION_BITS * 2);		\
+} while (0)
+
+#else /************************************************************************/
+
+# define BILINEAR_DECLARE_VARIABLES						\
+    const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);	\
+    const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);	\
+    const __m128i xmm_addc = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1);		\
+    const __m128i xmm_ux1 = _mm_set_epi16 (unit_x, -unit_x, unit_x, -unit_x,	\
 					  unit_x, -unit_x, unit_x, -unit_x);	\
+    const __m128i xmm_ux4 = _mm_set_epi16 (unit_x * 4, -unit_x * 4,		\
+					   unit_x * 4, -unit_x * 4,		\
+					   unit_x * 4, -unit_x * 4,		\
+					   unit_x * 4, -unit_x * 4);		\
     const __m128i xmm_zero = _mm_setzero_si128 ();				\
     __m128i xmm_x = _mm_set_epi16 (vx, -(vx + 1), vx, -(vx + 1),		\
 				   vx, -(vx + 1), vx, -(vx + 1))
 
-#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix)					\
+#define BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER(pix, phase)			\
 do {										\
-    __m128i xmm_wh, a;								\
+    __m128i xmm_wh, xmm_a, xmm_b;						\
+    (void)xmm_ux4; /* suppress warning: unused variable 'xmm_ux4' */		\
     /* fetch 2x2 pixel block into sse2 registers */				\
-    __m128i tltr = _mm_loadl_epi64 (						\
-			    (__m128i *)&src_top[pixman_fixed_to_int (vx)]);	\
-    __m128i blbr = _mm_loadl_epi64 (						\
-			    (__m128i *)&src_bottom[pixman_fixed_to_int (vx)]);	\
+    __m128i tltr = _mm_loadl_epi64 ((__m128i *)&src_top[vx >> 16]);		\
+    __m128i blbr = _mm_loadl_epi64 ((__m128i *)&src_bottom[vx >> 16]);		\
     vx += unit_x;								\
     /* vertical interpolation */						\
-    a = _mm_add_epi16 (_mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero),	\
-					xmm_wt),				\
-		       _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero),	\
-					xmm_wb));				\
+    xmm_a = _mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero), xmm_wt);	\
+    xmm_b = _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero), xmm_wb);	\
+    xmm_a = _mm_add_epi16 (xmm_a, xmm_b);					\
     /* calculate horizontal weights */						\
     xmm_wh = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x,			\
-			      16 - BILINEAR_INTERPOLATION_BITS));		\
-    xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);					\
+					16 - BILINEAR_INTERPOLATION_BITS));	\
+    xmm_x = _mm_add_epi16 (xmm_x, xmm_ux1);					\
     /* horizontal interpolation */						\
-    a = _mm_madd_epi16 (_mm_unpackhi_epi16 (_mm_shuffle_epi32 (			\
-			       a, _MM_SHUFFLE (1, 0, 3, 2)), a), xmm_wh);	\
-    /* shift and pack the result */						\
-    a = _mm_srli_epi32 (a, BILINEAR_INTERPOLATION_BITS * 2);			\
-    a = _mm_packs_epi32 (a, a);							\
-    a = _mm_packus_epi16 (a, a);						\
-    pix = _mm_cvtsi128_si32 (a);						\
+    xmm_b = _mm_unpacklo_epi64 (/* any value is fine here */ xmm_b, xmm_a);	\
+    xmm_a = _mm_madd_epi16 (_mm_unpackhi_epi16 (xmm_b, xmm_a), xmm_wh);		\
+    /* shift the result */							\
+    pix = _mm_srli_epi32 (xmm_a, BILINEAR_INTERPOLATION_BITS * 2);		\
 } while (0)
 
+/***********************************************************************************/
+
+#endif
+
+#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix);					\
+do {										\
+	__m128i xmm_pix;							\
+	BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix, -1);			\
+	xmm_pix = _mm_packs_epi32 (xmm_pix, xmm_pix);				\
+	xmm_pix = _mm_packus_epi16 (xmm_pix, xmm_pix);				\
+	pix = _mm_cvtsi128_si32 (xmm_pix);					\
+} while(0)
+
+#define BILINEAR_INTERPOLATE_FOUR_PIXELS(pix);					\
+do {										\
+	__m128i xmm_pix1, xmm_pix2, xmm_pix3, xmm_pix4;				\
+	BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix1, 0);			\
+	BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix2, 1);			\
+	BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix3, 2);			\
+	BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix4, 3);			\
+	xmm_pix1 = _mm_packs_epi32 (xmm_pix1, xmm_pix2);			\
+	xmm_pix3 = _mm_packs_epi32 (xmm_pix3, xmm_pix4);			\
+	pix = _mm_packus_epi16 (xmm_pix1, xmm_pix3);				\
+} while(0)
+
 #define BILINEAR_SKIP_ONE_PIXEL()						\
 do {										\
     vx += unit_x;								\
-    xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);					\
+    xmm_x = _mm_add_epi16 (xmm_x, xmm_ux1);					\
 } while(0)
 
+#define BILINEAR_SKIP_FOUR_PIXELS()						\
+do {										\
+    vx += unit_x * 4;								\
+    xmm_x = _mm_add_epi16 (xmm_x, xmm_ux4);					\
+} while(0)
+
+/***********************************************************************************/
+
 static force_inline void
 scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t *       dst,
 					     const uint32_t * mask,
@@ -5606,24 +5693,28 @@ scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t *       dst,
 					     int32_t          w,
 					     int              wt,
 					     int              wb,
-					     pixman_fixed_t   vx,
-					     pixman_fixed_t   unit_x,
+					     pixman_fixed_t   vx_,
+					     pixman_fixed_t   unit_x_,
 					     pixman_fixed_t   max_vx,
 					     pixman_bool_t    zero_src)
 {
+    intptr_t vx = vx_;
+    intptr_t unit_x = unit_x_;
     BILINEAR_DECLARE_VARIABLES;
-    uint32_t pix1, pix2, pix3, pix4;
+    uint32_t pix1, pix2;
 
-    while ((w -= 4) >= 0)
+    while (w && ((uintptr_t)dst & 15))
     {
 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
-	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
-	BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
-	BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
 	*dst++ = pix1;
-	*dst++ = pix2;
-	*dst++ = pix3;
-	*dst++ = pix4;
+	w--;
+    }
+
+    while ((w -= 4) >= 0) {
+	__m128i xmm_src;
+	BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
+	_mm_store_si128 ((__m128i *)dst, xmm_src);
+	dst += 4;
     }
 
     if (w & 2)
@@ -5667,13 +5758,15 @@ scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t *       dst,
 					      int32_t          w,
 					      int              wt,
 					      int              wb,
-					      pixman_fixed_t   vx,
-					      pixman_fixed_t   unit_x,
+					      pixman_fixed_t   vx_,
+					      pixman_fixed_t   unit_x_,
 					      pixman_fixed_t   max_vx,
 					      pixman_bool_t    zero_src)
 {
+    intptr_t vx = vx_;
+    intptr_t unit_x = unit_x_;
     BILINEAR_DECLARE_VARIABLES;
-    uint32_t pix1, pix2, pix3, pix4;
+    uint32_t pix1, pix2;
 
     while (w && ((uintptr_t)dst & 15))
     {
@@ -5695,12 +5788,7 @@ scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t *       dst,
 	__m128i xmm_src_hi, xmm_src_lo, xmm_dst_hi, xmm_dst_lo;
 	__m128i xmm_alpha_hi, xmm_alpha_lo;
 
-	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
-	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
-	BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
-	BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
-
-	xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
+	BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
 
 	if (!is_zero (xmm_src))
 	{
@@ -5767,13 +5855,15 @@ scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t *       dst,
 						int32_t          w,
 						int              wt,
 						int              wb,
-						pixman_fixed_t   vx,
-						pixman_fixed_t   unit_x,
+						pixman_fixed_t   vx_,
+						pixman_fixed_t   unit_x_,
 						pixman_fixed_t   max_vx,
 						pixman_bool_t    zero_src)
 {
+    intptr_t vx = vx_;
+    intptr_t unit_x = unit_x_;
     BILINEAR_DECLARE_VARIABLES;
-    uint32_t pix1, pix2, pix3, pix4;
+    uint32_t pix1, pix2;
     uint32_t m;
 
     while (w && ((uintptr_t)dst & 15))
@@ -5824,12 +5914,7 @@ scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t *       dst,
 
 	if (m)
 	{
-	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
-	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
-	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
-	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
-
-	    xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
+	    BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
 
 	    if (m == 0xffffffff && is_opaque (xmm_src))
 	    {
@@ -5856,10 +5941,7 @@ scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t *       dst,
 	}
 	else
 	{
-	    BILINEAR_SKIP_ONE_PIXEL ();
-	    BILINEAR_SKIP_ONE_PIXEL ();
-	    BILINEAR_SKIP_ONE_PIXEL ();
-	    BILINEAR_SKIP_ONE_PIXEL ();
+	    BILINEAR_SKIP_FOUR_PIXELS ();
 	}
 
 	w -= 4;
@@ -5931,13 +6013,15 @@ scaled_bilinear_scanline_sse2_8888_n_8888_OVER (uint32_t *       dst,
 						int32_t          w,
 						int              wt,
 						int              wb,
-						pixman_fixed_t   vx,
-						pixman_fixed_t   unit_x,
+						pixman_fixed_t   vx_,
+						pixman_fixed_t   unit_x_,
 						pixman_fixed_t   max_vx,
 						pixman_bool_t    zero_src)
 {
+    intptr_t vx = vx_;
+    intptr_t unit_x = unit_x_;
     BILINEAR_DECLARE_VARIABLES;
-    uint32_t pix1, pix2, pix3, pix4;
+    uint32_t pix1;
     __m128i xmm_mask;
 
     if (zero_src || (*mask >> 24) == 0)
@@ -5967,19 +6051,15 @@ scaled_bilinear_scanline_sse2_8888_n_8888_OVER (uint32_t *       dst,
 
     while (w >= 4)
     {
-	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
-	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
-	BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
-	BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
+	__m128i xmm_src;
+	BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
 
-	if (pix1 | pix2 | pix3 | pix4)
+	if (!is_zero (xmm_src))
 	{
-	    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+	    __m128i xmm_src_lo, xmm_src_hi;
 	    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
 	    __m128i xmm_alpha_lo, xmm_alpha_hi;
 
-	    xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
-
 	    xmm_dst = load_128_aligned ((__m128i*)dst);
 
 	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
commit e43cc9c9024957dcc7f160f6abe7be218667dfa2
Author: Siarhei Siamashka <siarhei.siamashka at gmail.com>
Date:   Thu Sep 5 08:07:52 2013 +0300

    test: safeguard the scaling-bench test against COW
    
    The calloc call from pixman_image_create_bits may still rely on
    copy-on-write (http://en.wikipedia.org/wiki/Copy-on-write).
    Explicitly initializing the destination image results in more
    predictable behaviour.
    
    V2:
     - allocate a 16-byte aligned buffer with an aligned stride instead
       of delegating this to pixman_image_create_bits
     - use memset for the allocated buffer instead of pixman solid fill
     - repeat each test 3 times and select the best result in order to
       filter out even more measurement noise
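
    A minimal sketch of the underlying idea (my illustration; the patch
    below simply memsets an explicitly allocated, 16-byte aligned
    destination buffer):

        #include <stdlib.h>
        #include <string.h>

        /* calloc (and the mmap it may use for large allocations) can hand
         * back lazily mapped zero pages, so the first write to each page
         * takes a fault inside the timed region.  Touching every byte up
         * front makes the timings more predictable. */
        static void *
        make_committed_buffer (size_t size)
        {
            void *buf = malloc (size);
            if (buf)
                memset (buf, 0, size);
            return buf;
        }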

diff --git a/test/scaling-bench.c b/test/scaling-bench.c
index b39adef..365e798 100644
--- a/test/scaling-bench.c
+++ b/test/scaling-bench.c
@@ -3,6 +3,7 @@
 
 #define SOURCE_WIDTH 320
 #define SOURCE_HEIGHT 240
+#define TEST_REPEATS 3
 
 static pixman_image_t *
 make_source (void)
@@ -39,30 +40,40 @@ main ()
 	    "time per pixel / ns");
     for (scale = 0.1; scale < 10.005; scale += 0.01)
     {
+	int i;
 	int dest_width = SOURCE_WIDTH * scale + 0.5;
 	int dest_height = SOURCE_HEIGHT * scale + 0.5;
+	int dest_byte_stride = (dest_width * 4 + 15) & ~15;
 	pixman_fixed_t s = (1 / scale) * 65536.0 + 0.5;
 	pixman_transform_t transform;
 	pixman_image_t *dest;
-	double t1, t2;
+	double t1, t2, t = -1;
+	uint32_t *dest_buf = aligned_malloc (16, dest_byte_stride * dest_height);
+	memset (dest_buf, 0, dest_byte_stride * dest_height);
 
 	pixman_transform_init_scale (&transform, s, s);
 	pixman_image_set_transform (src, &transform);
 	
 	dest = pixman_image_create_bits (
-	    PIXMAN_a8r8g8b8, dest_width, dest_height, NULL, -1);
+	    PIXMAN_a8r8g8b8, dest_width, dest_height, dest_buf, dest_byte_stride);
+
+	for (i = 0; i < TEST_REPEATS; i++)
+	{
+	    t1 = gettime();
+	    pixman_image_composite (
+		PIXMAN_OP_OVER, src, NULL, dest,
+		scale, scale, 0, 0, 0, 0, dest_width, dest_height);
+	    t2 = gettime();
+	    if (t < 0 || t2 - t1 < t)
+		t = t2 - t1;
+	}
 
-	t1 = gettime();
-	pixman_image_composite (
-	    PIXMAN_OP_OVER, src, NULL, dest,
-	    scale, scale, 0, 0, 0, 0, dest_width, dest_height);
-	t2 = gettime();
-	
 	printf ("%6.2f : %4dx%-4d => %4dx%-4d : %12.4f : %12.4f\n",
 		scale, SOURCE_WIDTH, SOURCE_HEIGHT, dest_width, dest_height,
-		(t2 - t1) * 1000, ((t2 - t1) / (dest_width * dest_height)) * 1000000000);
+		t * 1000, (t / (dest_width * dest_height)) * 1000000000);
 
 	pixman_image_unref (dest);
+	free (dest_buf);
     }
 
     return 0;

