pixman: Branch 'master' - 5 commits
Søren Sandmann Pedersen
sandmann at kemper.freedesktop.org
Mon Sep 16 14:51:44 PDT 2013
configure.ac | 45 +++++
pixman/Makefile.am | 12 +
pixman/pixman-general.c | 22 +-
pixman/pixman-private.h | 8 +
pixman/pixman-sse2.c | 208 +++++++++++++++++++--------
pixman/pixman-ssse3.c | 362 ++++++++++++++++++++++++++++++++++++++++++++++++
pixman/pixman-utils.c | 9 +
pixman/pixman-x86.c | 15 +
test/scaling-bench.c | 29 ++-
9 files changed, 628 insertions(+), 82 deletions(-)
New commits:
commit 58a79dfe6d1fd62c2b66c69fdb64f6b8ecf61da5
Author: Søren Sandmann Pedersen <ssp at redhat.com>
Date: Thu Jun 6 16:15:39 2013 -0400
ssse3: Add iterator for separable bilinear scaling
This new iterator uses the SSSE3 instructions pmaddubsw and pabsw to
implement a fast iterator for bilinear scaling.
There is a graph here recording the per-pixel time for various
bilinear scaling algorithms as reported by scaling-bench:
http://people.freedesktop.org/~sandmann/ssse3.v2/ssse3.v2.png
As the graph shows, this new iterator is clearly faster than the
existing C iterator, and when used with an SSE2 combiner, it is also
faster than the existing SSE2 fast paths for upscaling, though not for
downscaling.
Another graph:
http://people.freedesktop.org/~sandmann/ssse3.v2/movdqu.png
shows the difference between writing to iter->buffer with movdqa,
movdqu on an aligned buffer, and movdqu on a deliberately unaligned
buffer. Since the differences are very small, the patch here avoids
using movdqa because imposing alignment restrictions on iter->buffer
may interfere with other optimizations, such as writing directly to
the destination image.
The data was measured with scaling-bench on a Sandy Bridge Core
i3-2350M @ 2.3GHz and is available in this directory:
http://people.freedesktop.org/~sandmann/ssse3.v2/
where there is also a Gnumeric spreadsheet ssse3.v2.gnumeric
containing the per-pixel values and the graph.
V2:
- Use uintptr_t instead of unsigned long in the ALIGN macro
- Use _mm_storel_epi64 instead of _mm_cvtsi128_si64 as the latter form
is not available on x86-32.
- Use _mm_storeu_si128() instead of _mm_store_si128() to avoid
imposing alignment requirements on iter->buffer
diff --git a/pixman/pixman-ssse3.c b/pixman/pixman-ssse3.c
index 19d71e7..34763e2 100644
--- a/pixman/pixman-ssse3.c
+++ b/pixman/pixman-ssse3.c
@@ -35,6 +35,316 @@
#include "pixman-private.h"
#include "pixman-inlines.h"
+typedef struct
+{
+ int y;
+ uint64_t * buffer;
+} line_t;
+
+typedef struct
+{
+ line_t line0;
+ line_t line1;
+ pixman_fixed_t y;
+ pixman_fixed_t x;
+ uint64_t data[1];
+} bilinear_info_t;
+
+static void
+ssse3_fetch_horizontal (bits_image_t *image, line_t *line,
+ int y, pixman_fixed_t x, pixman_fixed_t ux, int n)
+{
+ uint32_t *bits = image->bits + y * image->rowstride;
+ __m128i vx = _mm_set_epi16 (
+ - (x + 1), x, - (x + 1), x,
+ - (x + ux + 1), x + ux, - (x + ux + 1), x + ux);
+ __m128i vux = _mm_set_epi16 (
+ - 2 * ux, 2 * ux, - 2 * ux, 2 * ux,
+ - 2 * ux, 2 * ux, - 2 * ux, 2 * ux);
+ __m128i vaddc = _mm_set_epi16 (1, 0, 1, 0, 1, 0, 1, 0);
+ __m128i *b = (__m128i *)line->buffer;
+ __m128i vrl0, vrl1;
+
+ while ((n -= 2) >= 0)
+ {
+ __m128i vw, vr, s;
+
+ vrl1 = _mm_loadl_epi64 (
+ (__m128i *)(bits + pixman_fixed_to_int (x + ux)));
+ /* vrl1: R1, L1 */
+
+ final_pixel:
+ vrl0 = _mm_loadl_epi64 (
+ (__m128i *)(bits + pixman_fixed_to_int (x)));
+ /* vrl0: R0, L0 */
+
+ /* The weights are based on vx which is a vector of
+ *
+ * - (x + 1), x, - (x + 1), x,
+ * - (x + ux + 1), x + ux, - (x + ux + 1), x + ux
+ *
+ * so the 16 bit weights end up like this:
+ *
+ * iw0, w0, iw0, w0, iw1, w1, iw1, w1
+ *
+ * and after shifting and packing, we get these bytes:
+ *
+ * iw0, w0, iw0, w0, iw1, w1, iw1, w1,
+ * iw0, w0, iw0, w0, iw1, w1, iw1, w1,
+ *
+ * which means the first and the second input pixel
+ * have to be interleaved like this:
+ *
+ * la0, ra0, lr0, rr0, la1, ra1, lr1, rr1,
+ * lg0, rg0, lb0, rb0, lg1, rg1, lb1, rb1
+ *
+ * before maddubsw can be used.
+ */
+
+ vw = _mm_add_epi16 (
+ vaddc, _mm_srli_epi16 (vx, 16 - BILINEAR_INTERPOLATION_BITS));
+ /* vw: iw0, w0, iw0, w0, iw1, w1, iw1, w1
+ */
+
+ vw = _mm_packus_epi16 (vw, vw);
+ /* vw: iw0, w0, iw0, w0, iw1, w1, iw1, w1,
+ * iw0, w0, iw0, w0, iw1, w1, iw1, w1
+ */
+ vx = _mm_add_epi16 (vx, vux);
+
+ x += 2 * ux;
+
+ vr = _mm_unpacklo_epi16 (vrl1, vrl0);
+ /* vr: rar0, rar1, rgb0, rgb1, lar0, lar1, lgb0, lgb1 */
+
+ s = _mm_shuffle_epi32 (vr, _MM_SHUFFLE (1, 0, 3, 2));
+ /* s: lar0, lar1, lgb0, lgb1, rar0, rar1, rgb0, rgb1 */
+
+ vr = _mm_unpackhi_epi8 (vr, s);
+ /* vr: la0, ra0, lr0, rr0, la1, ra1, lr1, rr1,
+ * lg0, rg0, lb0, rb0, lg1, rg1, lb1, rb1
+ */
+
+ vr = _mm_maddubs_epi16 (vr, vw);
+
+ /* When the weight is 0, the inverse weight is
+ * 128 which can't be represented in a signed byte.
+ * As a result maddubsw computes the following:
+ *
+ * r = l * -128 + r * 0
+ *
+ * rather than the desired
+ *
+ * r = l * 128 + r * 0
+ *
+ * We fix this by taking the absolute value of the
+ * result.
+ */
+ vr = _mm_abs_epi16 (vr);
+
+ /* vr: A0, R0, A1, R1, G0, B0, G1, B1 */
+ _mm_store_si128 (b++, vr);
+ }
+
+ if (n == -1)
+ {
+ vrl1 = _mm_setzero_si128();
+ goto final_pixel;
+ }
+
+ line->y = y;
+}
+
+static uint32_t *
+ssse3_fetch_bilinear_cover (pixman_iter_t *iter, const uint32_t *mask)
+{
+ pixman_fixed_t fx, ux;
+ bilinear_info_t *info = iter->data;
+ line_t *line0, *line1;
+ int y0, y1;
+ int32_t dist_y;
+ __m128i vw;
+ int i;
+
+ fx = info->x;
+ ux = iter->image->common.transform->matrix[0][0];
+
+ y0 = pixman_fixed_to_int (info->y);
+ y1 = y0 + 1;
+
+ line0 = &info->line0;
+ line1 = &info->line1;
+
+ if (line0->y != y0 || line1->y != y1)
+ {
+ if (line0->y == y1 || line1->y == y0)
+ {
+ line_t tmp = *line0;
+ *line0 = *line1;
+ *line1 = tmp;
+ }
+
+ if (line0->y != y0)
+ {
+ ssse3_fetch_horizontal (
+ &iter->image->bits, line0, y0, fx, ux, iter->width);
+ }
+
+ if (line1->y != y1)
+ {
+ ssse3_fetch_horizontal (
+ &iter->image->bits, line1, y1, fx, ux, iter->width);
+ }
+ }
+
+ dist_y = pixman_fixed_to_bilinear_weight (info->y);
+ dist_y <<= (16 - BILINEAR_INTERPOLATION_BITS);
+
+ vw = _mm_set_epi16 (
+ dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y);
+
+ for (i = 0; i + 3 < iter->width; i += 4)
+ {
+ __m128i top0 = _mm_load_si128 ((__m128i *)(line0->buffer + i));
+ __m128i bot0 = _mm_load_si128 ((__m128i *)(line1->buffer + i));
+ __m128i top1 = _mm_load_si128 ((__m128i *)(line0->buffer + i + 2));
+ __m128i bot1 = _mm_load_si128 ((__m128i *)(line1->buffer + i + 2));
+ __m128i r0, r1, tmp, p;
+
+ r0 = _mm_mulhi_epu16 (
+ _mm_sub_epi16 (bot0, top0), vw);
+ tmp = _mm_cmplt_epi16 (bot0, top0);
+ tmp = _mm_and_si128 (tmp, vw);
+ r0 = _mm_sub_epi16 (r0, tmp);
+ r0 = _mm_add_epi16 (r0, top0);
+ r0 = _mm_srli_epi16 (r0, BILINEAR_INTERPOLATION_BITS);
+ /* r0: A0 R0 A1 R1 G0 B0 G1 B1 */
+ r0 = _mm_shuffle_epi32 (r0, _MM_SHUFFLE (2, 0, 3, 1));
+ /* r0: A1 R1 G1 B1 A0 R0 G0 B0 */
+
+ r1 = _mm_mulhi_epu16 (
+ _mm_sub_epi16 (bot1, top1), vw);
+ tmp = _mm_cmplt_epi16 (bot1, top1);
+ tmp = _mm_and_si128 (tmp, vw);
+ r1 = _mm_sub_epi16 (r1, tmp);
+ r1 = _mm_add_epi16 (r1, top1);
+ r1 = _mm_srli_epi16 (r1, BILINEAR_INTERPOLATION_BITS);
+ r1 = _mm_shuffle_epi32 (r1, _MM_SHUFFLE (2, 0, 3, 1));
+ /* r1: A3 R3 G3 B3 A2 R2 G2 B2 */
+
+ p = _mm_packus_epi16 (r0, r1);
+
+ _mm_storeu_si128 ((__m128i *)(iter->buffer + i), p);
+ }
+
+ while (i < iter->width)
+ {
+ __m128i top0 = _mm_load_si128 ((__m128i *)(line0->buffer + i));
+ __m128i bot0 = _mm_load_si128 ((__m128i *)(line1->buffer + i));
+ __m128i r0, tmp, p;
+
+ r0 = _mm_mulhi_epu16 (
+ _mm_sub_epi16 (bot0, top0), vw);
+ tmp = _mm_cmplt_epi16 (bot0, top0);
+ tmp = _mm_and_si128 (tmp, vw);
+ r0 = _mm_sub_epi16 (r0, tmp);
+ r0 = _mm_add_epi16 (r0, top0);
+ r0 = _mm_srli_epi16 (r0, BILINEAR_INTERPOLATION_BITS);
+ /* r0: A0 R0 A1 R1 G0 B0 G1 B1 */
+ r0 = _mm_shuffle_epi32 (r0, _MM_SHUFFLE (2, 0, 3, 1));
+ /* r0: A1 R1 G1 B1 A0 R0 G0 B0 */
+
+ p = _mm_packus_epi16 (r0, r0);
+
+ if (iter->width - i == 1)
+ {
+ *(uint32_t *)(iter->buffer + i) = _mm_cvtsi128_si32 (p);
+ i++;
+ }
+ else
+ {
+ _mm_storel_epi64 ((__m128i *)(iter->buffer + i), p);
+ i += 2;
+ }
+ }
+
+ info->y += iter->image->common.transform->matrix[1][1];
+
+ return iter->buffer;
+}
+
+static void
+ssse3_bilinear_cover_iter_fini (pixman_iter_t *iter)
+{
+ free (iter->data);
+}
+
+static void
+ssse3_bilinear_cover_iter_init (pixman_iter_t *iter, const pixman_iter_info_t *iter_info)
+{
+ int width = iter->width;
+ bilinear_info_t *info;
+ pixman_vector_t v;
+
+ /* Reference point is the center of the pixel */
+ v.vector[0] = pixman_int_to_fixed (iter->x) + pixman_fixed_1 / 2;
+ v.vector[1] = pixman_int_to_fixed (iter->y) + pixman_fixed_1 / 2;
+ v.vector[2] = pixman_fixed_1;
+
+ if (!pixman_transform_point_3d (iter->image->common.transform, &v))
+ goto fail;
+
+ info = malloc (sizeof (*info) + (2 * width - 1) * sizeof (uint64_t) + 64);
+ if (!info)
+ goto fail;
+
+ info->x = v.vector[0] - pixman_fixed_1 / 2;
+ info->y = v.vector[1] - pixman_fixed_1 / 2;
+
+#define ALIGN(addr) \
+ ((void *)((((uintptr_t)(addr)) + 15) & (~15)))
+
+ /* It is safe to set the y coordinates to -1 initially
+ * because COVER_CLIP_BILINEAR ensures that we will only
+ * be asked to fetch lines in the [0, height) interval
+ */
+ info->line0.y = -1;
+ info->line0.buffer = ALIGN (&(info->data[0]));
+ info->line1.y = -1;
+ info->line1.buffer = ALIGN (info->line0.buffer + width);
+
+ iter->get_scanline = ssse3_fetch_bilinear_cover;
+ iter->fini = ssse3_bilinear_cover_iter_fini;
+
+ iter->data = info;
+ return;
+
+fail:
+ /* Something went wrong, either a bad matrix or OOM; in such cases,
+ * we don't guarantee any particular rendering.
+ */
+ _pixman_log_error (
+ FUNC, "Allocation failure or bad matrix, skipping rendering\n");
+
+ iter->get_scanline = _pixman_iter_get_scanline_noop;
+ iter->fini = NULL;
+}
+
+static const pixman_iter_info_t ssse3_iters[] =
+{
+ { PIXMAN_a8r8g8b8,
+ (FAST_PATH_STANDARD_FLAGS |
+ FAST_PATH_SCALE_TRANSFORM |
+ FAST_PATH_BILINEAR_FILTER |
+ FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR),
+ ITER_NARROW | ITER_SRC,
+ ssse3_bilinear_cover_iter_init,
+ NULL, NULL
+ },
+
+ { PIXMAN_null },
+};
+
static const pixman_fast_path_t ssse3_fast_paths[] =
{
{ PIXMAN_OP_NONE },
@@ -46,5 +356,7 @@ _pixman_implementation_create_ssse3 (pixman_implementation_t *fallback)
pixman_implementation_t *imp =
_pixman_implementation_create (fallback, ssse3_fast_paths);
+ imp->iter_info = ssse3_iters;
+
return imp;
}
commit f1792b32215d3b62084ee99fca5c448f1c7f8e1d
Author: Søren Sandmann Pedersen <ssp at redhat.com>
Date: Thu Jun 6 16:32:59 2013 -0400
Add empty SSSE3 implementation
This commit adds a new, empty SSSE3 implementation and the associated
build system support.
configure.ac: detect whether the compiler understands SSSE3
intrinsics and set up the required CFLAGS
Makefile.am: Add libpixman-ssse3.la
pixman-x86.c: Add X86_SSSE3 feature flag and detect it in
detect_cpu_features().
pixman-ssse3.c: New file with an empty SSSE3 implementation
V2: Remove SSSE3_LDFLAGS since it isn't necessary unless Solaris
support is added.
diff --git a/configure.ac b/configure.ac
index daf4062..263c63e 100644
--- a/configure.ac
+++ b/configure.ac
@@ -437,6 +437,50 @@ fi
AM_CONDITIONAL(USE_SSE2, test $have_sse2_intrinsics = yes)
dnl ===========================================================================
+dnl Check for SSSE3
+
+if test "x$SSSE3_CFLAGS" = "x" ; then
+ SSSE3_CFLAGS="-mssse3 -Winline"
+fi
+
+have_ssse3_intrinsics=no
+AC_MSG_CHECKING(whether to use SSSE3 intrinsics)
+xserver_save_CFLAGS=$CFLAGS
+CFLAGS="$SSSE3_CFLAGS $CFLAGS"
+
+AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
+#include <mmintrin.h>
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#include <tmmintrin.h>
+int main () {
+ __m128i a = _mm_set1_epi32 (0), b = _mm_set1_epi32 (0), c;
+ c = _mm_maddubs_epi16 (a, b);
+ return 0;
+}]])], have_ssse3_intrinsics=yes)
+CFLAGS=$xserver_save_CFLAGS
+
+AC_ARG_ENABLE(ssse3,
+ [AC_HELP_STRING([--disable-ssse3],
+ [disable SSSE3 fast paths])],
+ [enable_ssse3=$enableval], [enable_ssse3=auto])
+
+if test $enable_ssse3 = no ; then
+ have_ssse3_intrinsics=disabled
+fi
+
+if test $have_ssse3_intrinsics = yes ; then
+ AC_DEFINE(USE_SSSE3, 1, [use SSSE3 compiler intrinsics])
+fi
+
+AC_MSG_RESULT($have_ssse3_intrinsics)
+if test $enable_ssse3 = yes && test $have_ssse3_intrinsics = no ; then
+ AC_MSG_ERROR([SSSE3 intrinsics not detected])
+fi
+
+AM_CONDITIONAL(USE_SSSE3, test $have_ssse3_intrinsics = yes)
+
+dnl ===========================================================================
dnl Other special flags needed when building code using MMX or SSE instructions
case $host_os in
solaris*)
@@ -471,6 +515,7 @@ AC_SUBST(MMX_CFLAGS)
AC_SUBST(MMX_LDFLAGS)
AC_SUBST(SSE2_CFLAGS)
AC_SUBST(SSE2_LDFLAGS)
+AC_SUBST(SSSE3_CFLAGS)
dnl ===========================================================================
dnl Check for VMX/Altivec
diff --git a/pixman/Makefile.am b/pixman/Makefile.am
index b9ea754..b376d9a 100644
--- a/pixman/Makefile.am
+++ b/pixman/Makefile.am
@@ -52,6 +52,18 @@ libpixman_1_la_LIBADD += libpixman-sse2.la
ASM_CFLAGS_sse2=$(SSE2_CFLAGS)
endif
+# ssse3 code
+if USE_SSSE3
+noinst_LTLIBRARIES += libpixman-ssse3.la
+libpixman_ssse3_la_SOURCES = \
+ pixman-ssse3.c
+libpixman_ssse3_la_CFLAGS = $(SSSE3_CFLAGS)
+libpixman_1_la_LDFLAGS += $(SSSE3_LDFLAGS)
+libpixman_1_la_LIBADD += libpixman-ssse3.la
+
+ASM_CFLAGS_ssse3=$(SSSE3_CFLAGS)
+endif
+
# arm simd code
if USE_ARM_SIMD
noinst_LTLIBRARIES += libpixman-arm-simd.la
diff --git a/pixman/pixman-private.h b/pixman/pixman-private.h
index 535117d..6ca13b2 100644
--- a/pixman/pixman-private.h
+++ b/pixman/pixman-private.h
@@ -593,6 +593,11 @@ pixman_implementation_t *
_pixman_implementation_create_sse2 (pixman_implementation_t *fallback);
#endif
+#ifdef USE_SSSE3
+pixman_implementation_t *
+_pixman_implementation_create_ssse3 (pixman_implementation_t *fallback);
+#endif
+
#ifdef USE_ARM_SIMD
pixman_implementation_t *
_pixman_implementation_create_arm_simd (pixman_implementation_t *fallback);
diff --git a/pixman/pixman-ssse3.c b/pixman/pixman-ssse3.c
new file mode 100644
index 0000000..19d71e7
--- /dev/null
+++ b/pixman/pixman-ssse3.c
@@ -0,0 +1,50 @@
+/*
+ * Copyright © 2013 Soren Sandmann Pedersen
+ * Copyright © 2013 Red Hat, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Author: Soren Sandmann (soren.sandmann at gmail.com)
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdlib.h>
+#include <mmintrin.h>
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#include <tmmintrin.h>
+#include "pixman-private.h"
+#include "pixman-inlines.h"
+
+static const pixman_fast_path_t ssse3_fast_paths[] =
+{
+ { PIXMAN_OP_NONE },
+};
+
+pixman_implementation_t *
+_pixman_implementation_create_ssse3 (pixman_implementation_t *fallback)
+{
+ pixman_implementation_t *imp =
+ _pixman_implementation_create (fallback, ssse3_fast_paths);
+
+ return imp;
+}
diff --git a/pixman/pixman-x86.c b/pixman/pixman-x86.c
index 57e4d1f..6527760 100644
--- a/pixman/pixman-x86.c
+++ b/pixman/pixman-x86.c
@@ -25,7 +25,7 @@
#include "pixman-private.h"
-#if defined(USE_X86_MMX) || defined (USE_SSE2)
+#if defined(USE_X86_MMX) || defined (USE_SSE2) || defined (USE_SSSE3)
/* The CPU detection code needs to be in a file not compiled with
* "-mmmx -msse", as gcc would generate CMOV instructions otherwise
@@ -39,7 +39,8 @@ typedef enum
X86_MMX_EXTENSIONS = (1 << 1),
X86_SSE = (1 << 2) | X86_MMX_EXTENSIONS,
X86_SSE2 = (1 << 3),
- X86_CMOV = (1 << 4)
+ X86_CMOV = (1 << 4),
+ X86_SSSE3 = (1 << 5)
} cpu_features_t;
#ifdef HAVE_GETISAX
@@ -64,6 +65,8 @@ detect_cpu_features (void)
features |= X86_SSE;
if (result & AV_386_SSE2)
features |= X86_SSE2;
+ if (result & AV_386_SSSE3)
+ features |= X86_SSSE3;
}
return features;
@@ -167,6 +170,8 @@ detect_cpu_features (void)
features |= X86_SSE;
if (d & (1 << 26))
features |= X86_SSE2;
+ if (d & (1 << 9))
+ features |= X86_SSSE3;
/* Check for AMD specific features */
if ((features & X86_MMX) && !(features & X86_SSE))
@@ -222,6 +227,7 @@ _pixman_x86_get_implementations (pixman_implementation_t *imp)
{
#define MMX_BITS (X86_MMX | X86_MMX_EXTENSIONS)
#define SSE2_BITS (X86_MMX | X86_MMX_EXTENSIONS | X86_SSE | X86_SSE2)
+#define SSSE3_BITS (X86_SSE | X86_SSE2 | X86_SSSE3)
#ifdef USE_X86_MMX
if (!_pixman_disabled ("mmx") && have_feature (MMX_BITS))
@@ -233,5 +239,10 @@ _pixman_x86_get_implementations (pixman_implementation_t *imp)
imp = _pixman_implementation_create_sse2 (imp);
#endif
+#ifdef USE_SSSE3
+ if (!_pixman_disabled ("ssse3") && have_feature (SSSE3_BITS))
+ imp = _pixman_implementation_create_ssse3 (imp);
+#endif
+
return imp;
}
commit f10b5449a8b22a26839c58a716b74d6b7a8bcb80
Author: Søren Sandmann Pedersen <ssp at redhat.com>
Date: Wed Aug 28 15:36:13 2013 -0400
general: Ensure that iter buffers are aligned to 16 bytes
At the moment iter buffers are only guaranteed to be aligned to a 4
byte boundary. SIMD implementations benefit from the buffers being
aligned to 16 bytes, so ensure this is the case.
V2:
- Use uintptr_t instead of unsigned long
- allocate 3 * SCANLINE_BUFFER_LENGTH byte on stack rather than just
SCANLINE_BUFFER_LENGTH
- use sizeof (stack_scanline_buffer) instead of SCANLINE_BUFFER_LENGTH
to determine overflow
diff --git a/pixman/pixman-general.c b/pixman/pixman-general.c
index 6310bff..a653fa7 100644
--- a/pixman/pixman-general.c
+++ b/pixman/pixman-general.c
@@ -114,7 +114,7 @@ general_composite_rect (pixman_implementation_t *imp,
pixman_composite_info_t *info)
{
PIXMAN_COMPOSITE_ARGS (info);
- uint64_t stack_scanline_buffer[(SCANLINE_BUFFER_LENGTH * 3 + 7) / 8];
+ uint8_t stack_scanline_buffer[3 * SCANLINE_BUFFER_LENGTH];
uint8_t *scanline_buffer = (uint8_t *) stack_scanline_buffer;
uint8_t *src_buffer, *mask_buffer, *dest_buffer;
pixman_iter_t src_iter, mask_iter, dest_iter;
@@ -137,17 +137,25 @@ general_composite_rect (pixman_implementation_t *imp,
Bpp = 16;
}
- if (width * Bpp > SCANLINE_BUFFER_LENGTH)
+#define ALIGN(addr) \
+ ((uint8_t *)((((uintptr_t)(addr)) + 15) & (~15)))
+
+ src_buffer = ALIGN (scanline_buffer);
+ mask_buffer = ALIGN (src_buffer + width * Bpp);
+ dest_buffer = ALIGN (mask_buffer + width * Bpp);
+
+ if (ALIGN (dest_buffer + width * Bpp) >
+ scanline_buffer + sizeof (stack_scanline_buffer))
{
- scanline_buffer = pixman_malloc_abc (width, 3, Bpp);
+ scanline_buffer = pixman_malloc_ab_plus_c (width, Bpp * 3, 32 * 3);
if (!scanline_buffer)
return;
- }
- src_buffer = scanline_buffer;
- mask_buffer = src_buffer + width * Bpp;
- dest_buffer = mask_buffer + width * Bpp;
+ src_buffer = ALIGN (scanline_buffer);
+ mask_buffer = ALIGN (src_buffer + width * Bpp);
+ dest_buffer = ALIGN (mask_buffer + width * Bpp);
+ }
if (width_flag == ITER_WIDE)
{
diff --git a/pixman/pixman-private.h b/pixman/pixman-private.h
index 120196d..535117d 100644
--- a/pixman/pixman-private.h
+++ b/pixman/pixman-private.h
@@ -787,6 +787,9 @@ pixman_malloc_ab (unsigned int n, unsigned int b);
void *
pixman_malloc_abc (unsigned int a, unsigned int b, unsigned int c);
+void *
+pixman_malloc_ab_plus_c (unsigned int a, unsigned int b, unsigned int c);
+
pixman_bool_t
_pixman_multiply_overflows_size (size_t a, size_t b);
diff --git a/pixman/pixman-utils.c b/pixman/pixman-utils.c
index 98723a8..4a3a835 100644
--- a/pixman/pixman-utils.c
+++ b/pixman/pixman-utils.c
@@ -49,6 +49,15 @@ _pixman_addition_overflows_int (unsigned int a, unsigned int b)
}
void *
+pixman_malloc_ab_plus_c (unsigned int a, unsigned int b, unsigned int c)
+{
+ if (!b || a >= INT32_MAX / b || (a * b) > INT32_MAX - c)
+ return NULL;
+
+ return malloc (a * b + c);
+}
+
+void *
pixman_malloc_ab (unsigned int a,
unsigned int b)
{
commit 700db9d872bdc49399a95565ffe0d345db11717a
Author: Siarhei Siamashka <siarhei.siamashka at gmail.com>
Date: Tue Sep 3 04:39:54 2013 +0300
sse2: faster bilinear scaling (pack 4 pixels to write with MOVDQA)
The loops are already unrolled, so it was just a matter of packing
4 pixels into a single XMM register and doing aligned 128-bit
writes to memory via MOVDQA instructions for the SRC compositing
operator fast path. For the other fast paths, this XMM register
is also directly routed to further processing instead of doing
extra reshuffling. This replaces "8 PACKSSDW/PACKUSWB + 4 MOVD"
instructions with "3 PACKSSDW/PACKUSWB + 1 MOVDQA" per 4 pixels,
which results in a clear performance improvement.
There are also some other (less important) tweaks:
1. Convert 'pixman_fixed_t' to 'intptr_t' before using it as an
index for addressing memory. The problem is that 'pixman_fixed_t'
is a 32-bit data type and it has to be extended to 64-bit
offsets, which needs extra instructions on 64-bit systems.
2. Allow to recalculate the horizontal interpolation weights only
once per 4 pixels by treating the XMM register as four pairs
of 16-bit values. Each of these 16-bit/16-bit pairs can be
replicated to fill the whole 128-bit register by using PSHUFD
instructions. So we get "3 PADDW/PSRLW + 4 PSHUFD" instructions
per 4 pixels instead of "12 PADDW/PSRLW" per 4 pixels
(or "3 PADDW/PSRLW" per each pixel).
Now a good question is whether replacing "9 PADDW/PSRLW" with
"4 PSHUFD" is a favourable exchange. As it turns out, PSHUFD
instructions are very fast on new Intel processors (including
Atoms), but are rather slow on the first generation of Core2
(Merom) and on the other processors from that time or older.
A good instructions latency/throughput table, covering all the
relevant processors, can be found at:
http://www.agner.org/optimize/instruction_tables.pdf
Enabling this optimization is controlled by the PSHUFD_IS_FAST
define in "pixman-sse2.c".
3. One use of PSHUFD instruction (_mm_shuffle_epi32 intrinsic) in
the older code has been also replaced by PUNPCKLQDQ equivalent
(_mm_unpacklo_epi64 intrinsic) in PSHUFD_IS_FAST=0 configuration.
The PUNPCKLQDQ instruction is usually faster on older processors,
but has some side effects (instead of fully overwriting the
destination register like PSHUFD does, it retains half of the
original value, which may inhibit some compiler optimizations).
Benchmarks with "lowlevel-blt-bench -b src_8888_8888" using GCC 4.8.1 on
x86-64 system and default optimizations. The results are in MPix/s:
====== Intel Core2 T7300 (2GHz) ======
old: src_8888_8888 = L1: 128.69 L2: 125.07 M:124.86
over_8888_8888 = L1: 83.19 L2: 81.73 M: 80.63
over_8888_n_8888 = L1: 79.56 L2: 78.61 M: 77.85
over_8888_8_8888 = L1: 77.15 L2: 75.79 M: 74.63
new (PSHUFD_IS_FAST=0): src_8888_8888 = L1: 168.67 L2: 163.26 M:162.44
over_8888_8888 = L1: 102.91 L2: 100.43 M: 99.01
over_8888_n_8888 = L1: 97.40 L2: 95.64 M: 94.24
over_8888_8_8888 = L1: 98.04 L2: 95.83 M: 94.33
new (PSHUFD_IS_FAST=1): src_8888_8888 = L1: 154.67 L2: 149.16 M:148.48
over_8888_8888 = L1: 95.97 L2: 93.90 M: 91.85
over_8888_n_8888 = L1: 93.18 L2: 91.47 M: 90.15
over_8888_8_8888 = L1: 95.33 L2: 93.32 M: 91.42
====== Intel Core i7 860 (2.8GHz) ======
old: src_8888_8888 = L1: 323.48 L2: 318.86 M:314.81
over_8888_8888 = L1: 187.38 L2: 186.74 M:182.46
new (PSHUFD_IS_FAST=0): src_8888_8888 = L1: 373.06 L2: 370.94 M:368.32
over_8888_8888 = L1: 217.28 L2: 215.57 M:211.32
new (PSHUFD_IS_FAST=1): src_8888_8888 = L1: 401.98 L2: 397.65 M:395.61
over_8888_8888 = L1: 218.89 L2: 217.56 M:213.48
The most interesting benchmark is "src_8888_8888" (because this code can
be reused for a generic non-separable SSE2 bilinear fetch iterator).
The results shows that PSHUFD instructions are bad for Intel Core2 T7300
(Merom core) and good for Intel Core i7 860 (Nehalem core). Both of these
processors support SSSE3 instructions though, so they are not the primary
targets for SSE2 code. But without having any other more relevant hardware
to test, PSHUFD_IS_FAST=0 seems to be a reasonable default for SSE2 code
and old processors (until the runtime CPU features detection becomes
clever enough to recognize different microarchitectures).
(Rebased on top of patch that removes support for 8-bit bilinear
filtering -ssp)
diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index a629565..42c7209 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -30,6 +30,9 @@
#include <config.h>
#endif
+/* PSHUFD is slow on a lot of old processors, and new processors have SSSE3 */
+#define PSHUFD_IS_FAST 0
+
#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
#include <emmintrin.h> /* for SSE2 intrinsics */
#include "pixman-private.h"
@@ -5554,50 +5557,134 @@ FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
scaled_nearest_scanline_sse2_8888_n_8888_OVER,
uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE)
+#if PSHUFD_IS_FAST
+
+/***********************************************************************************/
+
# define BILINEAR_DECLARE_VARIABLES \
const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt); \
const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb); \
const __m128i xmm_addc = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1); \
- const __m128i xmm_ux = _mm_set_epi16 (unit_x, -unit_x, unit_x, -unit_x, \
+ const __m128i xmm_ux1 = _mm_set_epi16 (unit_x, -unit_x, unit_x, -unit_x, \
+ unit_x, -unit_x, unit_x, -unit_x); \
+ const __m128i xmm_ux4 = _mm_set_epi16 (unit_x * 4, -unit_x * 4, \
+ unit_x * 4, -unit_x * 4, \
+ unit_x * 4, -unit_x * 4, \
+ unit_x * 4, -unit_x * 4); \
+ const __m128i xmm_zero = _mm_setzero_si128 (); \
+ __m128i xmm_x = _mm_set_epi16 (vx + unit_x * 3, -(vx + 1) - unit_x * 3, \
+ vx + unit_x * 2, -(vx + 1) - unit_x * 2, \
+ vx + unit_x * 1, -(vx + 1) - unit_x * 1, \
+ vx + unit_x * 0, -(vx + 1) - unit_x * 0); \
+ __m128i xmm_wh_state;
+
+#define BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER(pix, phase_) \
+do { \
+ int phase = phase_; \
+ __m128i xmm_wh, xmm_a, xmm_b; \
+ /* fetch 2x2 pixel block into sse2 registers */ \
+ __m128i tltr = _mm_loadl_epi64 ((__m128i *)&src_top[vx >> 16]); \
+ __m128i blbr = _mm_loadl_epi64 ((__m128i *)&src_bottom[vx >> 16]); \
+ vx += unit_x; \
+ /* vertical interpolation */ \
+ xmm_a = _mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero), xmm_wt); \
+ xmm_b = _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero), xmm_wb); \
+ xmm_a = _mm_add_epi16 (xmm_a, xmm_b); \
+ /* calculate horizontal weights */ \
+ if (phase <= 0) \
+ { \
+ xmm_wh_state = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x, \
+ 16 - BILINEAR_INTERPOLATION_BITS)); \
+ xmm_x = _mm_add_epi16 (xmm_x, (phase < 0) ? xmm_ux1 : xmm_ux4); \
+ phase = 0; \
+ } \
+ xmm_wh = _mm_shuffle_epi32 (xmm_wh_state, _MM_SHUFFLE (phase, phase, \
+ phase, phase)); \
+ /* horizontal interpolation */ \
+ xmm_a = _mm_madd_epi16 (_mm_unpackhi_epi16 (_mm_shuffle_epi32 ( \
+ xmm_a, _MM_SHUFFLE (1, 0, 3, 2)), xmm_a), xmm_wh); \
+ /* shift the result */ \
+ pix = _mm_srli_epi32 (xmm_a, BILINEAR_INTERPOLATION_BITS * 2); \
+} while (0)
+
+#else /************************************************************************/
+
+# define BILINEAR_DECLARE_VARIABLES \
+ const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt); \
+ const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb); \
+ const __m128i xmm_addc = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1); \
+ const __m128i xmm_ux1 = _mm_set_epi16 (unit_x, -unit_x, unit_x, -unit_x, \
unit_x, -unit_x, unit_x, -unit_x); \
+ const __m128i xmm_ux4 = _mm_set_epi16 (unit_x * 4, -unit_x * 4, \
+ unit_x * 4, -unit_x * 4, \
+ unit_x * 4, -unit_x * 4, \
+ unit_x * 4, -unit_x * 4); \
const __m128i xmm_zero = _mm_setzero_si128 (); \
__m128i xmm_x = _mm_set_epi16 (vx, -(vx + 1), vx, -(vx + 1), \
vx, -(vx + 1), vx, -(vx + 1))
-#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix) \
+#define BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER(pix, phase) \
do { \
- __m128i xmm_wh, a; \
+ __m128i xmm_wh, xmm_a, xmm_b; \
+ (void)xmm_ux4; /* suppress warning: unused variable 'xmm_ux4' */ \
/* fetch 2x2 pixel block into sse2 registers */ \
- __m128i tltr = _mm_loadl_epi64 ( \
- (__m128i *)&src_top[pixman_fixed_to_int (vx)]); \
- __m128i blbr = _mm_loadl_epi64 ( \
- (__m128i *)&src_bottom[pixman_fixed_to_int (vx)]); \
+ __m128i tltr = _mm_loadl_epi64 ((__m128i *)&src_top[vx >> 16]); \
+ __m128i blbr = _mm_loadl_epi64 ((__m128i *)&src_bottom[vx >> 16]); \
vx += unit_x; \
/* vertical interpolation */ \
- a = _mm_add_epi16 (_mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero), \
- xmm_wt), \
- _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero), \
- xmm_wb)); \
+ xmm_a = _mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero), xmm_wt); \
+ xmm_b = _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero), xmm_wb); \
+ xmm_a = _mm_add_epi16 (xmm_a, xmm_b); \
/* calculate horizontal weights */ \
xmm_wh = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x, \
- 16 - BILINEAR_INTERPOLATION_BITS)); \
- xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \
+ 16 - BILINEAR_INTERPOLATION_BITS)); \
+ xmm_x = _mm_add_epi16 (xmm_x, xmm_ux1); \
/* horizontal interpolation */ \
- a = _mm_madd_epi16 (_mm_unpackhi_epi16 (_mm_shuffle_epi32 ( \
- a, _MM_SHUFFLE (1, 0, 3, 2)), a), xmm_wh); \
- /* shift and pack the result */ \
- a = _mm_srli_epi32 (a, BILINEAR_INTERPOLATION_BITS * 2); \
- a = _mm_packs_epi32 (a, a); \
- a = _mm_packus_epi16 (a, a); \
- pix = _mm_cvtsi128_si32 (a); \
+ xmm_b = _mm_unpacklo_epi64 (/* any value is fine here */ xmm_b, xmm_a); \
+ xmm_a = _mm_madd_epi16 (_mm_unpackhi_epi16 (xmm_b, xmm_a), xmm_wh); \
+ /* shift the result */ \
+ pix = _mm_srli_epi32 (xmm_a, BILINEAR_INTERPOLATION_BITS * 2); \
} while (0)
+/***********************************************************************************/
+
+#endif
+
+#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix); \
+do { \
+ __m128i xmm_pix; \
+ BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix, -1); \
+ xmm_pix = _mm_packs_epi32 (xmm_pix, xmm_pix); \
+ xmm_pix = _mm_packus_epi16 (xmm_pix, xmm_pix); \
+ pix = _mm_cvtsi128_si32 (xmm_pix); \
+} while(0)
+
+#define BILINEAR_INTERPOLATE_FOUR_PIXELS(pix); \
+do { \
+ __m128i xmm_pix1, xmm_pix2, xmm_pix3, xmm_pix4; \
+ BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix1, 0); \
+ BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix2, 1); \
+ BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix3, 2); \
+ BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix4, 3); \
+ xmm_pix1 = _mm_packs_epi32 (xmm_pix1, xmm_pix2); \
+ xmm_pix3 = _mm_packs_epi32 (xmm_pix3, xmm_pix4); \
+ pix = _mm_packus_epi16 (xmm_pix1, xmm_pix3); \
+} while(0)
+
#define BILINEAR_SKIP_ONE_PIXEL() \
do { \
vx += unit_x; \
- xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \
+ xmm_x = _mm_add_epi16 (xmm_x, xmm_ux1); \
} while(0)
+#define BILINEAR_SKIP_FOUR_PIXELS() \
+do { \
+ vx += unit_x * 4; \
+ xmm_x = _mm_add_epi16 (xmm_x, xmm_ux4); \
+} while(0)
+
+/***********************************************************************************/
+
static force_inline void
scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t * dst,
const uint32_t * mask,
@@ -5606,24 +5693,28 @@ scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t * dst,
int32_t w,
int wt,
int wb,
- pixman_fixed_t vx,
- pixman_fixed_t unit_x,
+ pixman_fixed_t vx_,
+ pixman_fixed_t unit_x_,
pixman_fixed_t max_vx,
pixman_bool_t zero_src)
{
+ intptr_t vx = vx_;
+ intptr_t unit_x = unit_x_;
BILINEAR_DECLARE_VARIABLES;
- uint32_t pix1, pix2, pix3, pix4;
+ uint32_t pix1, pix2;
- while ((w -= 4) >= 0)
+ while (w && ((uintptr_t)dst & 15))
{
BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
- BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
- BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
- BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
*dst++ = pix1;
- *dst++ = pix2;
- *dst++ = pix3;
- *dst++ = pix4;
+ w--;
+ }
+
+ while ((w -= 4) >= 0) {
+ __m128i xmm_src;
+ BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
+ _mm_store_si128 ((__m128i *)dst, xmm_src);
+ dst += 4;
}
if (w & 2)
@@ -5667,13 +5758,15 @@ scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t * dst,
int32_t w,
int wt,
int wb,
- pixman_fixed_t vx,
- pixman_fixed_t unit_x,
+ pixman_fixed_t vx_,
+ pixman_fixed_t unit_x_,
pixman_fixed_t max_vx,
pixman_bool_t zero_src)
{
+ intptr_t vx = vx_;
+ intptr_t unit_x = unit_x_;
BILINEAR_DECLARE_VARIABLES;
- uint32_t pix1, pix2, pix3, pix4;
+ uint32_t pix1, pix2;
while (w && ((uintptr_t)dst & 15))
{
@@ -5695,12 +5788,7 @@ scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t * dst,
__m128i xmm_src_hi, xmm_src_lo, xmm_dst_hi, xmm_dst_lo;
__m128i xmm_alpha_hi, xmm_alpha_lo;
- BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
- BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
- BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
- BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
-
- xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
+ BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
if (!is_zero (xmm_src))
{
@@ -5767,13 +5855,15 @@ scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t * dst,
int32_t w,
int wt,
int wb,
- pixman_fixed_t vx,
- pixman_fixed_t unit_x,
+ pixman_fixed_t vx_,
+ pixman_fixed_t unit_x_,
pixman_fixed_t max_vx,
pixman_bool_t zero_src)
{
+ intptr_t vx = vx_;
+ intptr_t unit_x = unit_x_;
BILINEAR_DECLARE_VARIABLES;
- uint32_t pix1, pix2, pix3, pix4;
+ uint32_t pix1, pix2;
uint32_t m;
while (w && ((uintptr_t)dst & 15))
@@ -5824,12 +5914,7 @@ scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t * dst,
if (m)
{
- BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
- BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
- BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
- BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
-
- xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
+ BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
if (m == 0xffffffff && is_opaque (xmm_src))
{
@@ -5856,10 +5941,7 @@ scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t * dst,
}
else
{
- BILINEAR_SKIP_ONE_PIXEL ();
- BILINEAR_SKIP_ONE_PIXEL ();
- BILINEAR_SKIP_ONE_PIXEL ();
- BILINEAR_SKIP_ONE_PIXEL ();
+ BILINEAR_SKIP_FOUR_PIXELS ();
}
w -= 4;
@@ -5931,13 +6013,15 @@ scaled_bilinear_scanline_sse2_8888_n_8888_OVER (uint32_t * dst,
int32_t w,
int wt,
int wb,
- pixman_fixed_t vx,
- pixman_fixed_t unit_x,
+ pixman_fixed_t vx_,
+ pixman_fixed_t unit_x_,
pixman_fixed_t max_vx,
pixman_bool_t zero_src)
{
+ intptr_t vx = vx_;
+ intptr_t unit_x = unit_x_;
BILINEAR_DECLARE_VARIABLES;
- uint32_t pix1, pix2, pix3, pix4;
+ uint32_t pix1;
__m128i xmm_mask;
if (zero_src || (*mask >> 24) == 0)
@@ -5967,19 +6051,15 @@ scaled_bilinear_scanline_sse2_8888_n_8888_OVER (uint32_t * dst,
while (w >= 4)
{
- BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
- BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
- BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
- BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
+ __m128i xmm_src;
+ BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
- if (pix1 | pix2 | pix3 | pix4)
+ if (!is_zero (xmm_src))
{
- __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+ __m128i xmm_src_lo, xmm_src_hi;
__m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
__m128i xmm_alpha_lo, xmm_alpha_hi;
- xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
-
xmm_dst = load_128_aligned ((__m128i*)dst);
unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
commit e43cc9c9024957dcc7f160f6abe7be218667dfa2
Author: Siarhei Siamashka <siarhei.siamashka at gmail.com>
Date: Thu Sep 5 08:07:52 2013 +0300
test: safeguard the scaling-bench test against COW
The calloc call from pixman_image_create_bits may still
rely on http://en.wikipedia.org/wiki/Copy-on-write .
Explicitly initializing the destination image results in
a more predictable behaviour.
V2:
- allocate 16 bytes aligned buffer with aligned stride instead
of delegating this to pixman_image_create_bits
- use memset for the allocated buffer instead of pixman solid fill
- repeat tests 3 times and select best results in order to filter
out even more measurement noise
diff --git a/test/scaling-bench.c b/test/scaling-bench.c
index b39adef..365e798 100644
--- a/test/scaling-bench.c
+++ b/test/scaling-bench.c
@@ -3,6 +3,7 @@
#define SOURCE_WIDTH 320
#define SOURCE_HEIGHT 240
+#define TEST_REPEATS 3
static pixman_image_t *
make_source (void)
@@ -39,30 +40,40 @@ main ()
"time per pixel / ns");
for (scale = 0.1; scale < 10.005; scale += 0.01)
{
+ int i;
int dest_width = SOURCE_WIDTH * scale + 0.5;
int dest_height = SOURCE_HEIGHT * scale + 0.5;
+ int dest_byte_stride = (dest_width * 4 + 15) & ~15;
pixman_fixed_t s = (1 / scale) * 65536.0 + 0.5;
pixman_transform_t transform;
pixman_image_t *dest;
- double t1, t2;
+ double t1, t2, t = -1;
+ uint32_t *dest_buf = aligned_malloc (16, dest_byte_stride * dest_height);
+ memset (dest_buf, 0, dest_byte_stride * dest_height);
pixman_transform_init_scale (&transform, s, s);
pixman_image_set_transform (src, &transform);
dest = pixman_image_create_bits (
- PIXMAN_a8r8g8b8, dest_width, dest_height, NULL, -1);
+ PIXMAN_a8r8g8b8, dest_width, dest_height, dest_buf, dest_byte_stride);
+
+ for (i = 0; i < TEST_REPEATS; i++)
+ {
+ t1 = gettime();
+ pixman_image_composite (
+ PIXMAN_OP_OVER, src, NULL, dest,
+ scale, scale, 0, 0, 0, 0, dest_width, dest_height);
+ t2 = gettime();
+ if (t < 0 || t2 - t1 < t)
+ t = t2 - t1;
+ }
- t1 = gettime();
- pixman_image_composite (
- PIXMAN_OP_OVER, src, NULL, dest,
- scale, scale, 0, 0, 0, 0, dest_width, dest_height);
- t2 = gettime();
-
printf ("%6.2f : %4dx%-4d => %4dx%-4d : %12.4f : %12.4f\n",
scale, SOURCE_WIDTH, SOURCE_HEIGHT, dest_width, dest_height,
- (t2 - t1) * 1000, ((t2 - t1) / (dest_width * dest_height)) * 1000000000);
+ t * 1000, (t / (dest_width * dest_height)) * 1000000000);
pixman_image_unref (dest);
+ free (dest_buf);
}
return 0;
More information about the xorg-commit
mailing list