pixman: Branch 'master' - 4 commits

Thu Apr 3 21:43:43 PDT 2014

configure.ac                          |   18 +++++++
 pixman/Makefile.am                    |    2 
 pixman/pixman-arm-asm.h               |   37 ++++++++++++++++
 pixman/pixman-arm-neon-asm-bilinear.S |   12 -----
 pixman/pixman-arm-neon-asm.S          |   12 -----
 pixman/pixman-arm-simd-asm-scaled.S   |   11 ----
 pixman/pixman-arm-simd-asm.S          |   78 ++++++++++++++++++++++++++++++++++
 pixman/pixman-arm-simd.c              |    6 ++
 test/tolerance-test.c                 |    4 -
 test/utils-prng.c                     |   10 ++--
 test/utils-prng.h                     |    9 +--
 11 files changed, 155 insertions(+), 44 deletions(-)

New commits:
commit 4b76bbfda670f9ede67d0449f3640605e1fc4df0
Author: Pekka Paalanen <pekka.paalanen at collabora.co.uk>
Date:   Mon Mar 31 15:03:43 2014 +0300

    ARM: share pixman_asm_function definition
    
    Several files contain identical definitions of the asm macro
    pixman_asm_function. Merge all these definitions into a new asm header.
    
    The original definition is taken from pixman-arm-simd-asm-scaled.S with
    the copyright/licence/author blurb verbatim.

diff --git a/pixman/Makefile.am b/pixman/Makefile.am
index b376d9a..581b6f6 100644
--- a/pixman/Makefile.am
+++ b/pixman/Makefile.am
@@ -72,6 +72,7 @@ libpixman_arm_simd_la_SOURCES = \
 	pixman-arm-common.h	\
 	pixman-arm-simd-asm.S   \
 	pixman-arm-simd-asm-scaled.S \
+	pixman-arm-asm.h	\
 	pixman-arm-simd-asm.h
 libpixman_1_la_LIBADD += libpixman-arm-simd.la
 
@@ -86,6 +87,7 @@ libpixman_arm_neon_la_SOURCES = \
         pixman-arm-common.h	\
         pixman-arm-neon-asm.S	\
 		pixman-arm-neon-asm-bilinear.S \
+        pixman-arm-asm.h	\
         pixman-arm-neon-asm.h
 libpixman_1_la_LIBADD += libpixman-arm-neon.la
 
diff --git a/pixman/pixman-arm-asm.h b/pixman/pixman-arm-asm.h
new file mode 100644
index 0000000..ee78541
--- /dev/null
+++ b/pixman/pixman-arm-asm.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright © 2008 Mozilla Corporation
+ * Copyright © 2010 Nokia Corporation
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Mozilla Corporation not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Mozilla Corporation makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author:  Jeff Muizelaar (jeff at infidigm.net)
+ *
+ */
+
+/* Supplementary macro for setting function attributes */
+.macro pixman_asm_function fname
+	.func fname
+	.global fname
+#ifdef __ELF__
+	.hidden fname
+	.type fname, %function
+#endif
+fname:
+.endm
diff --git a/pixman/pixman-arm-neon-asm-bilinear.S b/pixman/pixman-arm-neon-asm-bilinear.S
index e37b5c2..0fd92d6 100644
--- a/pixman/pixman-arm-neon-asm-bilinear.S
+++ b/pixman/pixman-arm-neon-asm-bilinear.S
@@ -65,23 +65,13 @@
 .p2align 2
 
 #include "pixman-private.h"
+#include "pixman-arm-asm.h"
 #include "pixman-arm-neon-asm.h"
 
 /*
  * Bilinear macros from pixman-arm-neon-asm.S
  */
 
-/* Supplementary macro for setting function attributes */
-.macro pixman_asm_function fname
-    .func fname
-    .global fname
-#ifdef __ELF__
-    .hidden fname
-    .type fname, %function
-#endif
-fname:
-.endm
-
 /*
  * Bilinear scaling support code which tries to provide pixel fetching, color
  * format conversion, and interpolation as separate macros which can be used
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 187197d..7e949a3 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -50,6 +50,7 @@
     .p2align 2
 
 #include "pixman-private.h"
+#include "pixman-arm-asm.h"
 #include "pixman-arm-neon-asm.h"
 
 /* Global configuration options and preferences */
@@ -2830,17 +2831,6 @@ generate_composite_function_nearest_scanline \
 
 /******************************************************************************/
 
-/* Supplementary macro for setting function attributes */
-.macro pixman_asm_function fname
-    .func fname
-    .global fname
-#ifdef __ELF__
-    .hidden fname
-    .type fname, %function
-#endif
-fname:
-.endm
-
 /*
  * Bilinear scaling support code which tries to provide pixel fetching, color
  * format conversion, and interpolation as separate macros which can be used
diff --git a/pixman/pixman-arm-simd-asm-scaled.S b/pixman/pixman-arm-simd-asm-scaled.S
index 7110995..e050292 100644
--- a/pixman/pixman-arm-simd-asm-scaled.S
+++ b/pixman/pixman-arm-simd-asm-scaled.S
@@ -37,16 +37,7 @@
 	.altmacro
 	.p2align 2
 
-/* Supplementary macro for setting function attributes */
-.macro pixman_asm_function fname
-	.func fname
-	.global fname
-#ifdef __ELF__
-	.hidden fname
-	.type fname, %function
-#endif
-fname:
-.endm
+#include "pixman-arm-asm.h"
 
 /*
  * Note: This code is only using armv5te instructions (not even armv6),
commit 4ee85b008333a95d4dfc27e7b04c444dcbb3a7e7
Author: Ben Avison <bavison at riscosopen.org>
Date:   Fri Mar 28 11:13:21 2014 +0200

    ARMv6: Add fast path for over_reverse_n_8888
    
    Benchmark results, "before" is upstream commit
    c343846 lowlevel-blt-bench: add in_reverse_8888_8888 test
    and "after" is with this patch only added on top.
    
    lowlevel-blt-bench, over_reverse_n_8888, 100 iterations:
    
           Before          After
          Mean StdDev     Mean StdDev   Confidence   Change
    L1    15.1    0.1    274.5    2.3    100.00%   +1718.9%
    L2    12.8    0.3    181.8    0.7    100.00%   +1315.5%
    M     10.8    0.0     77.9    0.0    100.00%    +621.2%
    HT     9.7    0.0     29.4    0.2    100.00%    +204.9%
    VT     9.5    0.0     26.7    0.1    100.00%    +179.3%
    R      9.3    0.0     25.3    0.1    100.00%    +173.6%
    RT     6.0    0.1     11.0    0.2    100.00%     +82.9%
    
    At most 16 outliers rejected per case per set.
    
    cairo-perf-trace with trimmed traces, 30 iterations:
    
                                        Before          After
                                       Mean StdDev     Mean StdDev   Confidence   Change
    t-poppler.trace                    12.9    0.1      9.7    0.0    100.00%     +32.6%
    t-firefox-talos-gfx.trace          33.2    0.7     32.9    0.4     95.23%      +0.9%  (insignificant)
    t-firefox-particles.trace          27.4    0.1     27.3    0.2     99.65%      +0.4%
    t-firefox-canvas-alpha.trace       20.5    0.3     20.5    0.3     57.51%      +0.3%  (insignificant)
    t-poppler-reseau.trace             22.4    0.1     22.4    0.1     95.69%      +0.3%  (insignificant)
    t-firefox-fishtank.trace           13.2    0.0     13.2    0.0     99.84%      +0.1%
    t-swfdec-giant-steps.trace         14.9    0.0     14.9    0.0     87.68%      +0.1%  (insignificant)
    t-swfdec-youtube.trace              7.8    0.0      7.8    0.0     35.22%      +0.1%  (insignificant)
    t-firefox-planet-gnome.trace       11.5    0.0     11.5    0.0     29.37%      +0.0%  (insignificant)
    t-firefox-fishbowl.trace           21.2    0.0     21.2    0.0     18.09%      +0.0%  (insignificant)
    t-grads-heat-map.trace              4.4    0.0      4.4    0.0      1.84%      +0.0%  (insignificant)
    t-firefox-paintball.trace          18.0    0.0     18.0    0.0     33.43%      -0.0%  (insignificant)
    t-firefox-talos-svg.trace          20.5    0.0     20.5    0.1     68.56%      -0.1%  (insignificant)
    t-midori-zoomed.trace               8.0    0.0      8.0    0.0     99.98%      -0.1%
    t-firefox-canvas-swscroll.trace    32.1    0.1     32.1    0.1     85.27%      -0.1%  (insignificant)
    t-gnome-system-monitor.trace       17.2    0.0     17.2    0.0     99.97%      -0.2%
    t-firefox-chalkboard.trace         36.5    0.0     36.6    0.0    100.00%      -0.2%
    t-firefox-asteroids.trace          11.1    0.0     11.1    0.0    100.00%      -0.2%
    t-firefox-canvas.trace             17.9    0.0     18.0    0.0    100.00%      -0.3%
    t-chromium-tabs.trace               4.9    0.0      4.9    0.0     97.95%      -0.3%  (insignificant)
    t-xfce4-terminal-a1.trace           4.8    0.0      4.8    0.0    100.00%      -0.4%
    t-firefox-scrolling.trace          31.1    0.1     31.2    0.1    100.00%      -0.5%
    t-evolution.trace                  13.7    0.1     13.8    0.1     99.99%      -0.6%
    t-gnome-terminal-vim.trace         22.0    0.2     22.2    0.1     99.99%      -0.7%
    t-gvim.trace                       33.2    0.2     33.5    0.2    100.00%      -0.8%
    
    At most 6 outliers rejected per case per set.
    
    Cairo perf reports the running time, but the change is computed for
    operations per second instead (inverse of running time).
    
    Changes on the order of +/- 1% can be attributed to measurement error,
    even if they are deemed to be statistically significant. This claim is
    based on comparing two identical 30-iteration "before" runs using the
    exact same binaries, and observing changes from -0.4% to +0.5% with
    >=99% confidence.
    
    Confidence is based on Welch's t-test.
    
    v4, Pekka Paalanen <pekka.paalanen at collabora.co.uk> :
    	Rebased, re-benchmarked on Raspberry Pi, commit message.

diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index c209688..dd6f788 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -611,3 +611,81 @@ generate_composite_function \
 
 /******************************************************************************/
 
+.macro over_reverse_n_8888_init
+        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
+        ldr     MASK, =0x00800080
+        /* Split source pixel into RB/AG parts */
+        uxtb16  STRIDE_S, SRC
+        uxtb16  STRIDE_M, SRC, ror #8
+        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
+        uadd8   SCRATCH, MASK, MASK
+        line_saved_regs  STRIDE_D, ORIG_W
+.endm
+
+.macro over_reverse_n_8888_newline
+        mov     STRIDE_D, #0xFF
+.endm
+
+.macro over_reverse_n_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+        pixld   , numbytes, firstreg, DST, 0
+.endm
+
+.macro over_reverse_n_8888_1pixel  d, is_only
+        teq     WK&d, #0
+        beq     8f       /* replace with source */
+        bics    ORIG_W, STRIDE_D, WK&d, lsr #24
+ .if is_only == 1
+        beq     49f      /* skip store */
+ .else
+        beq     9f       /* write same value back */
+ .endif
+        mla     SCRATCH, STRIDE_S, ORIG_W, MASK /* red/blue */
+        mla     ORIG_W, STRIDE_M, ORIG_W, MASK  /* alpha/green */
+        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
+        uxtab16 ORIG_W, ORIG_W, ORIG_W, ror #8
+        mov     SCRATCH, SCRATCH, ror #8
+        sel     ORIG_W, SCRATCH, ORIG_W
+        uqadd8  WK&d, WK&d, ORIG_W
+        b       9f
+8:      mov     WK&d, SRC
+9:
+.endm
+
+.macro over_reverse_n_8888_tail  numbytes, reg1, reg2, reg3, reg4
+ .if numbytes == 4
+        over_reverse_n_8888_1pixel  reg1, 1
+ .else
+        and     SCRATCH, WK&reg1, WK&reg2
+  .if numbytes == 16
+        and     SCRATCH, SCRATCH, WK&reg3
+        and     SCRATCH, SCRATCH, WK&reg4
+  .endif
+        mvns    SCRATCH, SCRATCH, asr #24
+        beq     49f /* skip store if all opaque */
+        over_reverse_n_8888_1pixel  reg1, 0
+        over_reverse_n_8888_1pixel  reg2, 0
+  .if numbytes == 16
+        over_reverse_n_8888_1pixel  reg3, 0
+        over_reverse_n_8888_1pixel  reg4, 0
+  .endif
+ .endif
+        pixst   , numbytes, reg1, DST
+49:
+.endm
+
+.macro over_reverse_n_8888_process_tail  cond, numbytes, firstreg
+        over_reverse_n_8888_tail  numbytes, firstreg, %(firstreg+1), %(firstreg+2), %(firstreg+3)
+.endm
+
+generate_composite_function \
+    pixman_composite_over_reverse_n_8888_asm_armv6, 0, 0, 32 \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH, \
+    3, /* prefetch distance */ \
+    over_reverse_n_8888_init, \
+    over_reverse_n_8888_newline, \
+    nop_macro, /* cleanup */ \
+    over_reverse_n_8888_process_head, \
+    over_reverse_n_8888_process_tail
+
+/******************************************************************************/
+
diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
index af062e1..8fbc439 100644
--- a/pixman/pixman-arm-simd.c
+++ b/pixman/pixman-arm-simd.c
@@ -47,6 +47,9 @@ PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, add_8_8,
 PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, over_8888_8888,
                                    uint32_t, 1, uint32_t, 1)
 
+PIXMAN_ARM_BIND_FAST_PATH_N_DST (0, armv6, over_reverse_n_8888,
+                                 uint32_t, 1)
+
 PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, armv6, over_8888_n_8888,
                                      uint32_t, 1, uint32_t, 1)
 
@@ -225,6 +228,9 @@ static const pixman_fast_path_t arm_simd_fast_paths[] =
     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, armv6_composite_over_8888_n_8888),
     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, armv6_composite_over_8888_n_8888),
 
+    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, armv6_composite_over_reverse_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, armv6_composite_over_reverse_n_8888),
+
     PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, armv6_composite_add_8_8),
 
     PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, armv6_composite_over_n_8_8888),
commit 56622140e3a8175c8ccc82c9717adf8372043364
Author: Siarhei Siamashka <siarhei.siamashka at gmail.com>
Date:   Fri Mar 7 08:23:10 2014 +0200

    test: Fix OpenMP clauses for the tolerance-test
    
    Compiling with the Intel Compiler reveals a problem:
    
    tolerance-test.c(350): error: index variable "i" of for statement following an OpenMP for pragma must be private
      #       pragma omp parallel for default(none) shared(i) private (result)
      ^
    
    In addition to this, the 'result' variable also should not be private
    (otherwise its value does not survive after the end of the loop). It
    needs to be either shared or use the reduction clause to describe how
    the results from multiple threads are combined together. Reduction
    seems to be more appropriate here.

diff --git a/test/tolerance-test.c b/test/tolerance-test.c
index 5625630..320bb7f 100644
--- a/test/tolerance-test.c
+++ b/test/tolerance-test.c
@@ -347,12 +347,12 @@ main (int argc, const char *argv[])
     else
     {
 #ifdef USE_OPENMP
-#       pragma omp parallel for default(none) shared(i) private (result)
+#       pragma omp parallel for default(none) reduction(|:result)
 #endif
         for (i = 0; i < N_TESTS; ++i)
 	{
 	    if (!do_check (i))
-		result = 1;
+		result |= 1;
 	}
     }
     
commit 840912b31159aa8ac7be4ea0cee8bdef95a539a4
Author: Siarhei Siamashka <siarhei.siamashka at gmail.com>
Date:   Fri Mar 7 06:39:42 2014 +0200

    configure.ac: Check if the compiler supports GCC vector extensions
    
    The Intel Compiler 14.0.0 claims version GCC 4.7.3 compatibility
    via __GNUC__/__GNUC_MINOR__ macros, but does not provide the same
    level of GCC vector extensions support as the original GCC compiler:
        http://gcc.gnu.org/onlinedocs/gcc/Vector-Extensions.html
    
    Which results in the following compilation failure:
    
    In file included from ../test/utils.h(7),
                     from ../test/utils.c(3):
    ../test/utils-prng.h(138): error: expression must have integral type
          uint32x4 e = x->a - ((x->b << 27) + (x->b >> (32 - 27)));
                                ^
    
    The problem is fixed by doing a special check in configure for
    this feature.

diff --git a/configure.ac b/configure.ac
index 6327972..0339494 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1061,6 +1061,24 @@ fi
 
 AC_MSG_RESULT($support_for_builtin_clz)
 
+dnl =====================================
+dnl GCC vector extensions
+
+support_for_gcc_vector_extensions=no
+
+AC_MSG_CHECKING(for GCC vector extensions)
+AC_LINK_IFELSE([AC_LANG_SOURCE([[
+unsigned int __attribute__ ((vector_size(16))) e, a, b;
+int main (void) { e = a - ((b << 27) + (b >> (32 - 27))) + 1; return e[0]; }
+]])], support_for_gcc_vector_extensions=yes)
+
+if test x$support_for_gcc_vector_extensions = xyes; then
+   AC_DEFINE([HAVE_GCC_VECTOR_EXTENSIONS], [],
+             [Whether the compiler supports GCC vector extensions])
+fi
+
+AC_MSG_RESULT($support_for_gcc_vector_extensions)
+
 dnl ==================
 dnl libpng
 
diff --git a/test/utils-prng.c b/test/utils-prng.c
index 7b32e35..c27b5be 100644
--- a/test/utils-prng.c
+++ b/test/utils-prng.c
@@ -27,7 +27,7 @@
 #include "utils.h"
 #include "utils-prng.h"
 
-#if defined(GCC_VECTOR_EXTENSIONS_SUPPORTED) && defined(__SSE2__)
+#if defined(HAVE_GCC_VECTOR_EXTENSIONS) && defined(__SSE2__)
 #include <xmmintrin.h>
 #endif
 
@@ -52,7 +52,7 @@ void smallprng_srand_r (smallprng_t *x, uint32_t seed)
  */
 void prng_srand_r (prng_t *x, uint32_t seed)
 {
-#ifdef GCC_VECTOR_EXTENSIONS_SUPPORTED
+#ifdef HAVE_GCC_VECTOR_EXTENSIONS
     int i;
     prng_rand_128_data_t dummy;
     smallprng_srand_r (&x->p0, seed);
@@ -75,7 +75,7 @@ void prng_srand_r (prng_t *x, uint32_t seed)
 static force_inline void
 store_rand_128_data (void *addr, prng_rand_128_data_t *d, int aligned)
 {
-#ifdef GCC_VECTOR_EXTENSIONS_SUPPORTED
+#ifdef HAVE_GCC_VECTOR_EXTENSIONS
     if (aligned)
     {
         *(uint8x16 *)addr = d->vb;
@@ -120,7 +120,7 @@ randmemset_internal (prng_t                  *prng,
         {
             prng_rand_128_r (&local_prng, &t);
             prng_rand_128_r (&local_prng, &randdata);
-#ifdef GCC_VECTOR_EXTENSIONS_SUPPORTED
+#ifdef HAVE_GCC_VECTOR_EXTENSIONS
             if (flags & RANDMEMSET_MORE_FF)
             {
                 const uint8x16 const_C0 =
@@ -199,7 +199,7 @@ randmemset_internal (prng_t                  *prng,
         }
         else
         {
-#ifdef GCC_VECTOR_EXTENSIONS_SUPPORTED
+#ifdef HAVE_GCC_VECTOR_EXTENSIONS
             const uint8x16 bswap_shufflemask =
             {
                 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
diff --git a/test/utils-prng.h b/test/utils-prng.h
index 564ffce..f9ae8dd 100644
--- a/test/utils-prng.h
+++ b/test/utils-prng.h
@@ -79,8 +79,7 @@
 
 /*****************************************************************************/
 
-#if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7))
-#define GCC_VECTOR_EXTENSIONS_SUPPORTED
+#ifdef HAVE_GCC_VECTOR_EXTENSIONS
 typedef uint32_t uint32x4 __attribute__ ((vector_size(16)));
 typedef uint8_t  uint8x16 __attribute__ ((vector_size(16)));
 #endif
@@ -92,7 +91,7 @@ typedef struct
 
 typedef struct
 {
-#ifdef GCC_VECTOR_EXTENSIONS_SUPPORTED
+#ifdef HAVE_GCC_VECTOR_EXTENSIONS
     uint32x4 a, b, c, d;
 #else
     smallprng_t p1, p2, p3, p4;
@@ -104,7 +103,7 @@ typedef union
 {
     uint8_t  b[16];
     uint32_t w[4];
-#ifdef GCC_VECTOR_EXTENSIONS_SUPPORTED
+#ifdef HAVE_GCC_VECTOR_EXTENSIONS
     uint8x16 vb;
     uint32x4 vw;
 #endif
@@ -134,7 +133,7 @@ prng_rand_r (prng_t *x)
 static force_inline void
 prng_rand_128_r (prng_t *x, prng_rand_128_data_t *data)
 {
-#ifdef GCC_VECTOR_EXTENSIONS_SUPPORTED
+#ifdef HAVE_GCC_VECTOR_EXTENSIONS
     uint32x4 e = x->a - ((x->b << 27) + (x->b >> (32 - 27)));
     x->a = x->b ^ ((x->c << 17) ^ (x->c >> (32 - 17)));
     x->b = x->c + x->d;