pixman: Branch 'master' - 11 commits

Oded Gabbay gabbayo at kemper.freedesktop.org
Wed Jul 29 00:50:45 PDT 2015


 pixman/pixman-vmx.c | 1185 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 1159 insertions(+), 26 deletions(-)

New commits:
commit 8d9be3619a906855a3e3a1e052317833cb24cabe
Author: Oded Gabbay <oded.gabbay at gmail.com>
Date:   Wed Jul 1 14:34:07 2015 +0300

    vmx: implement fast path iterator vmx_fetch_a8
    
    No changes were observed when running the cairo trimmed benchmarks.
    
    Running "lowlevel-blt-bench src_8_8888" on POWER8, 8 cores,
    3.4GHz, RHEL 7.1 ppc64le gave the following results:
    
    reference memcpy speed = 25197.2MB/s (6299.3MP/s for 32bpp fills)
    
                    Before          After           Change
                  --------------------------------------------
    L1              965.34          3936           +307.73%
    L2              942.99          3436.29        +264.40%
    M               902.24          2757.77        +205.66%
    HT              448.46          784.99         +75.04%
    VT              430.05          819.78         +90.62%
    R               412.9           717.04         +73.66%
    RT              168.93          220.63         +30.60%
    Kops/s          1025            1303           +27.12%
    
    It was benchmarked against commit id e2d211a from pixman/master
    
    Siarhei Siamashka reported that on the PlayStation 3 it shows the
    following results:
    
    == before ==
    
                  src_8_8888 =  L1: 194.37  L2: 198.46  M:155.90 (148.35%)
                  HT: 59.18  VT: 36.71  R: 38.93  RT: 12.79 ( 106Kops/s)
    
    == after ==
    
                  src_8_8888 =  L1: 373.96  L2: 391.10  M:245.81 (233.88%)
                  HT: 80.81  VT: 44.33  R: 48.10  RT: 14.79 ( 122Kops/s)
    
    Signed-off-by: Oded Gabbay <oded.gabbay at gmail.com>
    Acked-by: Siarhei Siamashka <siarhei.siamashka at gmail.com>

diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index 773ad76..a9bd024 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -3139,6 +3139,49 @@ vmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
     return iter->buffer;
 }
 
+static uint32_t *
+vmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
+{
+    int w = iter->width;
+    uint32_t *dst = iter->buffer;
+    uint8_t *src = iter->bits;
+    vector unsigned int vmx0, vmx1, vmx2, vmx3, vmx4, vmx5, vmx6;
+
+    iter->bits += iter->stride;
+
+    while (w && (((uintptr_t)dst) & 15))
+    {
+        *dst++ = *(src++) << 24;
+        w--;
+    }
+
+    while (w >= 16)
+    {
+	vmx0 = load_128_unaligned((uint32_t *) src);
+
+	unpack_128_2x128((vector unsigned int) AVV(0), vmx0, &vmx1, &vmx2);
+	unpack_128_2x128_16((vector unsigned int) AVV(0), vmx1, &vmx3, &vmx4);
+	unpack_128_2x128_16((vector unsigned int) AVV(0), vmx2, &vmx5, &vmx6);
+
+	save_128_aligned(dst, vmx6);
+	save_128_aligned((dst +  4), vmx5);
+	save_128_aligned((dst +  8), vmx4);
+	save_128_aligned((dst + 12), vmx3);
+
+	dst += 16;
+	src += 16;
+	w -= 16;
+    }
+
+    while (w)
+    {
+	*dst++ = *(src++) << 24;
+	w--;
+    }
+
+    return iter->buffer;
+}
+
 #define IMAGE_FLAGS							\
     (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |		\
      FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
@@ -3148,6 +3191,9 @@ static const pixman_iter_info_t vmx_iters[] =
     { PIXMAN_x8r8g8b8, IMAGE_FLAGS, ITER_NARROW,
       _pixman_iter_init_bits_stride, vmx_fetch_x8r8g8b8, NULL
     },
+    { PIXMAN_a8, IMAGE_FLAGS, ITER_NARROW,
+      _pixman_iter_init_bits_stride, vmx_fetch_a8, NULL
+    },
     { PIXMAN_null },
 };
 
commit 47f74ca94637d79ee66c37a81eea0200e453fcc1
Author: Oded Gabbay <oded.gabbay at gmail.com>
Date:   Mon Jun 29 15:31:02 2015 +0300

    vmx: implement fast path iterator vmx_fetch_x8r8g8b8
    
    It was benchmarked against commit id 2be523b from pixman/master
    
    POWER8, 8 cores, 3.4GHz, RHEL 7.1 ppc64le.
    
    cairo trimmed benchmarks:
    
    Speedups
    ========
    t-firefox-asteroids  533.92  -> 489.94 :  1.09x
    
    Signed-off-by: Oded Gabbay <oded.gabbay at gmail.com>
    Acked-by: Siarhei Siamashka <siarhei.siamashka at gmail.com>

diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index 0950850..773ad76 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -3105,6 +3105,52 @@ static const pixman_fast_path_t vmx_fast_paths[] =
     {   PIXMAN_OP_NONE	},
 };
 
+static uint32_t *
+vmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
+{
+    int w = iter->width;
+    vector unsigned int ff000000 = mask_ff000000;
+    uint32_t *dst = iter->buffer;
+    uint32_t *src = (uint32_t *)iter->bits;
+
+    iter->bits += iter->stride;
+
+    while (w && ((uintptr_t)dst) & 0x0f)
+    {
+	*dst++ = (*src++) | 0xff000000;
+	w--;
+    }
+
+    while (w >= 4)
+    {
+	save_128_aligned(dst, vec_or(load_128_unaligned(src), ff000000));
+
+	dst += 4;
+	src += 4;
+	w -= 4;
+    }
+
+    while (w)
+    {
+	*dst++ = (*src++) | 0xff000000;
+	w--;
+    }
+
+    return iter->buffer;
+}
+
+#define IMAGE_FLAGS							\
+    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |		\
+     FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
+
+static const pixman_iter_info_t vmx_iters[] =
+{
+    { PIXMAN_x8r8g8b8, IMAGE_FLAGS, ITER_NARROW,
+      _pixman_iter_init_bits_stride, vmx_fetch_x8r8g8b8, NULL
+    },
+    { PIXMAN_null },
+};
+
 pixman_implementation_t *
 _pixman_implementation_create_vmx (pixman_implementation_t *fallback)
 {
@@ -3147,5 +3193,7 @@ _pixman_implementation_create_vmx (pixman_implementation_t *fallback)
 
     imp->fill = vmx_fill;
 
+    imp->iter_info = vmx_iters;
+
     return imp;
 }
commit fcbb97d4458d717b9c15858aedcbee2d33c8ac5a
Author: Oded Gabbay <oded.gabbay at gmail.com>
Date:   Sun Jun 28 23:25:24 2015 +0300

    vmx: implement fast path scaled nearest vmx_8888_8888_OVER
    
    It was benchmarked against commit id 2be523b from pixman/master
    
    POWER8, 8 cores, 3.4GHz, RHEL 7.1 ppc64le.
    reference memcpy speed = 24764.8MB/s (6191.2MP/s for 32bpp fills)
    
                    Before           After           Change
                  ---------------------------------------------
    L1              134.36          181.68          +35.22%
    L2              135.07          180.67          +33.76%
    M               134.6           180.51          +34.11%
    HT              121.77          128.79          +5.76%
    VT              120.49          145.07          +20.40%
    R               93.83           102.3           +9.03%
    RT              50.82           46.93           -7.65%
    Kops/s          448             422             -5.80%
    
    cairo trimmed benchmarks:
    
    Speedups
    ========
    t-firefox-asteroids  533.92 -> 497.92 :  1.07x
        t-midori-zoomed  692.98 -> 651.24 :  1.06x
    
    Signed-off-by: Oded Gabbay <oded.gabbay at gmail.com>
    Acked-by: Siarhei Siamashka <siarhei.siamashka at gmail.com>

diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index 64e9125..0950850 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -2954,6 +2954,129 @@ vmx_composite_add_8888_8888 (pixman_implementation_t *imp,
     }
 }
 
+static force_inline void
+scaled_nearest_scanline_vmx_8888_8888_OVER (uint32_t*       pd,
+                                            const uint32_t* ps,
+                                            int32_t         w,
+                                            pixman_fixed_t  vx,
+                                            pixman_fixed_t  unit_x,
+                                            pixman_fixed_t  src_width_fixed,
+                                            pixman_bool_t   fully_transparent_src)
+{
+    uint32_t s, d;
+    const uint32_t* pm = NULL;
+
+    vector unsigned int vmx_dst_lo, vmx_dst_hi;
+    vector unsigned int vmx_src_lo, vmx_src_hi;
+    vector unsigned int vmx_alpha_lo, vmx_alpha_hi;
+
+    if (fully_transparent_src)
+	return;
+
+    /* Align dst on a 16-byte boundary */
+    while (w && ((uintptr_t)pd & 15))
+    {
+	d = *pd;
+	s = combine1 (ps + pixman_fixed_to_int (vx), pm);
+	vx += unit_x;
+	while (vx >= 0)
+	    vx -= src_width_fixed;
+
+	*pd++ = core_combine_over_u_pixel_vmx (s, d);
+	if (pm)
+	    pm++;
+	w--;
+    }
+
+    while (w >= 4)
+    {
+	vector unsigned int tmp;
+	uint32_t tmp1, tmp2, tmp3, tmp4;
+
+	tmp1 = *(ps + pixman_fixed_to_int (vx));
+	vx += unit_x;
+	while (vx >= 0)
+	    vx -= src_width_fixed;
+	tmp2 = *(ps + pixman_fixed_to_int (vx));
+	vx += unit_x;
+	while (vx >= 0)
+	    vx -= src_width_fixed;
+	tmp3 = *(ps + pixman_fixed_to_int (vx));
+	vx += unit_x;
+	while (vx >= 0)
+	    vx -= src_width_fixed;
+	tmp4 = *(ps + pixman_fixed_to_int (vx));
+	vx += unit_x;
+	while (vx >= 0)
+	    vx -= src_width_fixed;
+
+	tmp[0] = tmp1;
+	tmp[1] = tmp2;
+	tmp[2] = tmp3;
+	tmp[3] = tmp4;
+
+	vmx_src_hi = combine4 ((const uint32_t *) &tmp, pm);
+
+	if (is_opaque (vmx_src_hi))
+	{
+	    save_128_aligned (pd, vmx_src_hi);
+	}
+	else if (!is_zero (vmx_src_hi))
+	{
+	    vmx_dst_hi = load_128_aligned (pd);
+
+	    unpack_128_2x128 (vmx_src_hi, (vector unsigned int) AVV(0),
+				&vmx_src_lo, &vmx_src_hi);
+
+	    unpack_128_2x128 (vmx_dst_hi, (vector unsigned int) AVV(0),
+				&vmx_dst_lo, &vmx_dst_hi);
+
+	    expand_alpha_2x128 (
+		vmx_src_lo, vmx_src_hi, &vmx_alpha_lo, &vmx_alpha_hi);
+
+	    over_2x128 (&vmx_src_lo, &vmx_src_hi,
+			&vmx_alpha_lo, &vmx_alpha_hi,
+			&vmx_dst_lo, &vmx_dst_hi);
+
+	    /* rebuild the 4 pixel data and save */
+	    save_128_aligned (pd, pack_2x128_128 (vmx_dst_lo, vmx_dst_hi));
+	}
+
+	w -= 4;
+	pd += 4;
+	if (pm)
+	    pm += 4;
+    }
+
+    while (w)
+    {
+	d = *pd;
+	s = combine1 (ps + pixman_fixed_to_int (vx), pm);
+	vx += unit_x;
+	while (vx >= 0)
+	    vx -= src_width_fixed;
+
+	*pd++ = core_combine_over_u_pixel_vmx (s, d);
+	if (pm)
+	    pm++;
+
+	w--;
+    }
+}
+
+FAST_NEAREST_MAINLOOP (vmx_8888_8888_cover_OVER,
+		       scaled_nearest_scanline_vmx_8888_8888_OVER,
+		       uint32_t, uint32_t, COVER)
+FAST_NEAREST_MAINLOOP (vmx_8888_8888_none_OVER,
+		       scaled_nearest_scanline_vmx_8888_8888_OVER,
+		       uint32_t, uint32_t, NONE)
+FAST_NEAREST_MAINLOOP (vmx_8888_8888_pad_OVER,
+		       scaled_nearest_scanline_vmx_8888_8888_OVER,
+		       uint32_t, uint32_t, PAD)
+FAST_NEAREST_MAINLOOP (vmx_8888_8888_normal_OVER,
+		       scaled_nearest_scanline_vmx_8888_8888_OVER,
+		       uint32_t, uint32_t, NORMAL)
+
 static const pixman_fast_path_t vmx_fast_paths[] =
 {
     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, vmx_composite_over_8888_8888),
@@ -2974,6 +3097,11 @@ static const pixman_fast_path_t vmx_fast_paths[] =
     PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, vmx_composite_src_x888_8888),
     PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, vmx_composite_src_x888_8888),
 
+    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, vmx_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, vmx_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, vmx_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, vmx_8888_8888),
+
     {   PIXMAN_OP_NONE	},
 };
 
commit ad612c4205f0ae46fc72a50e0c90ccd05487fcba
Author: Oded Gabbay <oded.gabbay at gmail.com>
Date:   Sun Jun 28 22:23:44 2015 +0300

    vmx: implement fast path vmx_composite_src_x888_8888
    
    It was benchmarked against commit id 2be523b from pixman/master
    
    POWER8, 8 cores, 3.4GHz, RHEL 7.1 ppc64le.
    reference memcpy speed = 24764.8MB/s (6191.2MP/s for 32bpp fills)
    
                    Before           After           Change
                  ---------------------------------------------
    L1              1115.4          5006.49         +348.85%
    L2              1112.26         4338.01         +290.02%
    M               1110.54         2524.15         +127.29%
    HT              745.41          1140.03         +52.94%
    VT              749.03          1287.13         +71.84%
    R               423.91          547.6           +29.18%
    RT              205.79          194.98          -5.25%
    Kops/s          1414            1361            -3.75%
    
    cairo trimmed benchmarks:
    
    Speedups
    ========
    t-gnome-system-monitor  1402.62  -> 1212.75 :  1.16x
       t-firefox-asteroids   533.92  ->  474.50 :  1.13x
    
    Signed-off-by: Oded Gabbay <oded.gabbay at gmail.com>
    Acked-by: Siarhei Siamashka <siarhei.siamashka at gmail.com>

diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index 47393dc..64e9125 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -2689,6 +2689,62 @@ vmx_fill (pixman_implementation_t *imp,
 }
 
 static void
+vmx_composite_src_x888_8888 (pixman_implementation_t *imp,
+			      pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t    *dst_line, *dst;
+    uint32_t    *src_line, *src;
+    int32_t w;
+    int dst_stride, src_stride;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	while (w && (uintptr_t)dst & 15)
+	{
+	    *dst++ = *src++ | 0xff000000;
+	    w--;
+	}
+
+	while (w >= 16)
+	{
+	    vector unsigned int vmx_src1, vmx_src2, vmx_src3, vmx_src4;
+
+	    vmx_src1 = load_128_unaligned (src);
+	    vmx_src2 = load_128_unaligned (src + 4);
+	    vmx_src3 = load_128_unaligned (src + 8);
+	    vmx_src4 = load_128_unaligned (src + 12);
+
+	    save_128_aligned (dst, vec_or (vmx_src1, mask_ff000000));
+	    save_128_aligned (dst + 4, vec_or (vmx_src2, mask_ff000000));
+	    save_128_aligned (dst + 8, vec_or (vmx_src3, mask_ff000000));
+	    save_128_aligned (dst + 12, vec_or (vmx_src4, mask_ff000000));
+
+	    dst += 16;
+	    src += 16;
+	    w -= 16;
+	}
+
+	while (w)
+	{
+	    *dst++ = *src++ | 0xff000000;
+	    w--;
+	}
+    }
+}
+
+static void
 vmx_composite_over_8888_8888 (pixman_implementation_t *imp,
                                pixman_composite_info_t *info)
 {
@@ -2914,6 +2970,10 @@ static const pixman_fast_path_t vmx_fast_paths[] =
     PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, vmx_composite_add_8888_8888),
     PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, vmx_composite_add_8888_8888),
 
+    /* PIXMAN_OP_SRC */
+    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, vmx_composite_src_x888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, vmx_composite_src_x888_8888),
+
     {   PIXMAN_OP_NONE	},
 };
 
commit fafc1d403b8405727d3918bcb605cb98044af90a
Author: Oded Gabbay <oded.gabbay at gmail.com>
Date:   Sun Jun 28 10:14:20 2015 +0300

    vmx: implement fast path vmx_composite_over_n_8888_8888_ca
    
    It was benchmarked against commit id 2be523b from pixman/master
    
    POWER8, 8 cores, 3.4GHz, RHEL 7.1 ppc64le.
    
    reference memcpy speed = 24764.8MB/s (6191.2MP/s for 32bpp fills)
    
                    Before           After           Change
                  ---------------------------------------------
    L1              61.92            244.91          +295.53%
    L2              62.74            243.3           +287.79%
    M               63.03            241.94          +283.85%
    HT              59.91            144.22          +140.73%
    VT              59.4             174.39          +193.59%
    R               53.6             111.37          +107.78%
    RT              37.99            46.38           +22.08%
    Kops/s          436              506             +16.06%
    
    cairo trimmed benchmarks:
    
    Speedups
    ========
    t-xfce4-terminal-a1  1540.37 -> 1226.14 :  1.26x
    t-firefox-talos-gfx  1488.59 -> 1209.19 :  1.23x
    
    Slowdowns
    =========
            t-evolution  553.88  -> 581.63  :  1.05x
              t-poppler  364.99  -> 383.79  :  1.05x
    t-firefox-scrolling  1223.65 -> 1304.34 :  1.07x
    
    The slowdowns can be explained by cases where the images are small and
    unaligned to a 16-byte boundary. In that case, the function first
    works on the unaligned area, one pixel at a time. For small images,
    the overhead of these scalar operations can exceed the savings we get
    from using the vmx instructions on the aligned part of the image.

    In the C fast-path implementation, there is no special treatment for
    the unaligned part, as it works in 4-byte quantities on the entire
    image.

    Because lowlevel-blt-bench (llbb) is a synthetic test, I would assume
    it has far fewer alignment issues than "real-world" scenarios such as
    the cairo benchmarks, which are basically recorded traces of real
    application activity.
    
    Signed-off-by: Oded Gabbay <oded.gabbay at gmail.com>
    Acked-by: Siarhei Siamashka <siarhei.siamashka at gmail.com>

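As a reference for the alignment discussion in the commit message above, here is a minimal sketch of the head/body/tail structure these VMX fast paths share. composite_one_pixel() and composite_four_pixels_vmx() are hypothetical placeholders, not functions from this series; the real loops are in the diff below.

    /* Hypothetical sketch of the alignment handling described above. */
    static void
    composite_scanline_sketch (uint32_t *dst, const uint32_t *src, int w)
    {
        /* scalar head: one pixel at a time until dst is 16-byte aligned */
        while (w && ((uintptr_t) dst & 15))
        {
            *dst = composite_one_pixel (*src++, *dst);
            dst++;
            w--;
        }

        /* aligned body: 4 pixels (one 128-bit vector) per iteration */
        while (w >= 4)
        {
            composite_four_pixels_vmx (dst, src);
            dst += 4;
            src += 4;
            w -= 4;
        }

        /* scalar tail: whatever is left over */
        while (w)
        {
            *dst = composite_one_pixel (*src++, *dst);
            dst++;
            w--;
        }
    }
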
diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index 641c487..47393dc 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -2715,6 +2715,114 @@ vmx_composite_over_8888_8888 (pixman_implementation_t *imp,
 }
 
 static void
+vmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
+                                    pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src;
+    uint32_t    *dst_line, d;
+    uint32_t    *mask_line, m;
+    uint32_t pack_cmp;
+    int dst_stride, mask_stride;
+
+    vector unsigned int vsrc, valpha, vmask, vdest;
+
+    vector unsigned int vmx_dst, vmx_dst_lo, vmx_dst_hi;
+    vector unsigned int vmx_mask, vmx_mask_lo, vmx_mask_hi;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    if (src == 0)
+	return;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+
+    vsrc = unpacklo_128_16x8(create_mask_1x32_128 (&src),
+			    (vector unsigned int) AVV(0));
+
+    valpha = expand_alpha_1x128(vsrc);
+
+    while (height--)
+    {
+	int w = width;
+	const uint32_t *pm = (uint32_t *)mask_line;
+	uint32_t *pd = (uint32_t *)dst_line;
+
+	dst_line += dst_stride;
+	mask_line += mask_stride;
+
+	while (w && (uintptr_t)pd & 15)
+	{
+	    m = *pm++;
+
+	    if (m)
+	    {
+		d = *pd;
+		vmask = unpack_32_1x128(m);
+		vdest = unpack_32_1x128(d);
+
+		*pd = pack_1x128_32(in_over (vsrc, valpha, vmask, vdest));
+	    }
+
+	    pd++;
+	    w--;
+	}
+
+	while (w >= 4)
+	{
+	    /* pm is NOT necessarily 16-byte aligned */
+	    vmx_mask = load_128_unaligned (pm);
+
+	    pack_cmp = vec_all_eq(vmx_mask, (vector unsigned int) AVV(0));
+
+	    /* if all bits in mask are zero, pack_cmp is not 0 */
+	    if (pack_cmp == 0)
+	    {
+		/* pd is 16-byte aligned */
+		vmx_dst = load_128_aligned (pd);
+
+		unpack_128_2x128 (vmx_mask, (vector unsigned int) AVV(0),
+				    &vmx_mask_lo, &vmx_mask_hi);
+
+		unpack_128_2x128 (vmx_dst, (vector unsigned int) AVV(0),
+				    &vmx_dst_lo, &vmx_dst_hi);
+
+		in_over_2x128 (&vsrc, &vsrc,
+			       &valpha, &valpha,
+			       &vmx_mask_lo, &vmx_mask_hi,
+			       &vmx_dst_lo, &vmx_dst_hi);
+
+		save_128_aligned(pd, pack_2x128_128(vmx_dst_lo, vmx_dst_hi));
+	    }
+
+	    pd += 4;
+	    pm += 4;
+	    w -= 4;
+	}
+
+	while (w)
+	{
+	    m = *pm++;
+
+	    if (m)
+	    {
+		d = *pd;
+		vmask = unpack_32_1x128(m);
+		vdest = unpack_32_1x128(d);
+
+		*pd = pack_1x128_32(in_over (vsrc, valpha, vmask, vdest));
+	    }
+
+	    pd++;
+	    w--;
+	}
+    }
+}
+
+static void
 vmx_composite_add_8_8 (pixman_implementation_t *imp,
             pixman_composite_info_t *info)
 {
@@ -2796,6 +2904,10 @@ static const pixman_fast_path_t vmx_fast_paths[] =
     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, vmx_composite_over_8888_8888),
     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, vmx_composite_over_8888_8888),
     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, vmx_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, vmx_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, vmx_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, vmx_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, vmx_composite_over_n_8888_8888_ca),
 
     /* PIXMAN_OP_ADD */
     PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, vmx_composite_add_8_8),
commit a3e914407e354df70b9200e263608f1fc2e686cf
Author: Oded Gabbay <oded.gabbay at gmail.com>
Date:   Thu Jun 18 15:05:49 2015 +0300

    vmx: implement fast path composite_add_8888_8888
    
    Copied the implementation from the sse2 file and edited it to use vmx
    functions.

    It was benchmarked against commit id 2be523b from pixman/master

    POWER8, 16 cores, 3.4GHz, ppc64le:
    
    reference memcpy speed = 27036.4MB/s (6759.1MP/s for 32bpp fills)
    
                    Before           After           Change
                  ---------------------------------------------
    L1              248.76          3284.48         +1220.34%
    L2              264.09          2826.47         +970.27%
    M               261.24          2405.06         +820.63%
    HT              217.27          857.3           +294.58%
    VT              213.78          980.09          +358.46%
    R               176.61          442.95          +150.81%
    RT              107.54          150.08          +39.56%
    Kops/s          917             1125            +22.68%
    
    Signed-off-by: Oded Gabbay <oded.gabbay at gmail.com>
    Acked-by: Siarhei Siamashka <siarhei.siamashka at gmail.com>

diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index e49e8aa..641c487 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -2765,6 +2765,31 @@ vmx_composite_add_8_8 (pixman_implementation_t *imp,
     }
 }
 
+static void
+vmx_composite_add_8888_8888 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t    *dst_line, *dst;
+    uint32_t    *src_line, *src;
+    int dst_stride, src_stride;
+
+    PIXMAN_IMAGE_GET_LINE (
+	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+
+	vmx_combine_add_u (imp, op, dst, src, NULL, width);
+    }
+}
+
 static const pixman_fast_path_t vmx_fast_paths[] =
 {
     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, vmx_composite_over_8888_8888),
@@ -2774,6 +2799,8 @@ static const pixman_fast_path_t vmx_fast_paths[] =
 
     /* PIXMAN_OP_ADD */
     PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, vmx_composite_add_8_8),
+    PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, vmx_composite_add_8888_8888),
+    PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, vmx_composite_add_8888_8888),
 
     {   PIXMAN_OP_NONE	},
 };
commit d5b5343c7df99082597e0c37aec937dcf5b6602d
Author: Oded Gabbay <oded.gabbay at gmail.com>
Date:   Thu Jun 18 14:56:47 2015 +0300

    vmx: implement fast path composite_add_8_8
    
    Copied the implementation from the sse2 file and edited it to use vmx
    functions.

    It was benchmarked against commit id 2be523b from pixman/master

    POWER8, 16 cores, 3.4GHz, ppc64le:
    
    reference memcpy speed = 27036.4MB/s (6759.1MP/s for 32bpp fills)
    
                    Before           After           Change
                  ---------------------------------------------
    L1              687.63          9140.84         +1229.33%
    L2              715             7495.78         +948.36%
    M               717.39          8460.14         +1079.29%
    HT              569.56          1020.12         +79.11%
    VT              520.3           1215.56         +133.63%
    R               514.81          874.35          +69.84%
    RT              341.28          305.42          -10.51%
    Kops/s          1621            1579            -2.59%
    
    Signed-off-by: Oded Gabbay <oded.gabbay at gmail.com>
    Acked-by: Siarhei Siamashka <siarhei.siamashka at gmail.com>

diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index 9eae31c..e49e8aa 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -2714,12 +2714,67 @@ vmx_composite_over_8888_8888 (pixman_implementation_t *imp,
     }
 }
 
+static void
+vmx_composite_add_8_8 (pixman_implementation_t *imp,
+            pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t     *dst_line, *dst;
+    uint8_t     *src_line, *src;
+    int dst_stride, src_stride;
+    int32_t w;
+    uint16_t t;
+
+    PIXMAN_IMAGE_GET_LINE (
+    src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+    dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	src = src_line;
+
+	dst_line += dst_stride;
+	src_line += src_stride;
+	w = width;
+
+	/* Small head */
+	while (w && (uintptr_t)dst & 3)
+	{
+	    t = (*dst) + (*src++);
+	    *dst++ = t | (0 - (t >> 8));
+	    w--;
+	}
+
+	vmx_combine_add_u (imp, op,
+		    (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
+
+	/* Small tail */
+	dst += w & 0xfffc;
+	src += w & 0xfffc;
+
+	w &= 3;
+
+	while (w)
+	{
+	    t = (*dst) + (*src++);
+	    *dst++ = t | (0 - (t >> 8));
+	    w--;
+	}
+    }
+}
+
 static const pixman_fast_path_t vmx_fast_paths[] =
 {
     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, vmx_composite_over_8888_8888),
     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, vmx_composite_over_8888_8888),
     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, vmx_composite_over_8888_8888),
     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, vmx_composite_over_8888_8888),
+
+    /* PIXMAN_OP_ADD */
+    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, vmx_composite_add_8_8),
+
     {   PIXMAN_OP_NONE	},
 };
 
commit 339eeaf095f949694d7f79a45171ac03a3b06f90
Author: Oded Gabbay <oded.gabbay at gmail.com>
Date:   Thu Jun 18 14:12:05 2015 +0300

    vmx: implement fast path composite_over_8888_8888
    
    Copied the implementation from the sse2 file and edited it to use vmx
    functions.

    It was benchmarked against commit id 2be523b from pixman/master

    POWER8, 16 cores, 3.4GHz, ppc64le:
    
    reference memcpy speed = 27036.4MB/s (6759.1MP/s for 32bpp fills)
    
                    Before           After           Change
                  ---------------------------------------------
    L1              129.47          1054.62         +714.57%
    L2              138.31          1011.02         +630.98%
    M               139.99          1008.65         +620.52%
    HT              122.11          468.45          +283.63%
    VT              121.06          532.21          +339.62%
    R               108.48          240.5           +121.70%
    RT              77.87           116.7           +49.87%
    Kops/s          758             981             +29.42%
    
    Signed-off-by: Oded Gabbay <oded.gabbay at gmail.com>
    Acked-by: Siarhei Siamashka <siarhei.siamashka at gmail.com>

diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index 61fdb80..9eae31c 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -2688,8 +2688,38 @@ vmx_fill (pixman_implementation_t *imp,
     return TRUE;
 }
 
+static void
+vmx_composite_over_8888_8888 (pixman_implementation_t *imp,
+                               pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    int dst_stride, src_stride;
+    uint32_t    *dst_line, *dst;
+    uint32_t    *src_line, *src;
+
+    PIXMAN_IMAGE_GET_LINE (
+    dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+    src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    dst = dst_line;
+    src = src_line;
+
+    while (height--)
+    {
+        vmx_combine_over_u (imp, op, dst, src, NULL, width);
+
+        dst += dst_stride;
+        src += src_stride;
+    }
+}
+
 static const pixman_fast_path_t vmx_fast_paths[] =
 {
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, vmx_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, vmx_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, vmx_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, vmx_composite_over_8888_8888),
     {   PIXMAN_OP_NONE	},
 };
 
commit 0cc8a2e9714efcb7cdd7e2a94c9cba49c3e29e00
Author: Oded Gabbay <oded.gabbay at gmail.com>
Date:   Sun Jun 28 09:42:19 2015 +0300

    vmx: implement fast path vmx_fill
    
    Based on the sse2 implementation.

    It was benchmarked against commit id e2d211a from pixman/master

    Tested the cairo trimmed benchmarks on POWER8, 8 cores, 3.4GHz,
    RHEL 7.1 ppc64le:
    
    Speedups
    ========
         t-swfdec-giant-steps  1383.09 ->  718.63  :  1.92x speedup
       t-gnome-system-monitor  1403.53 ->  918.77  :  1.53x speedup
                  t-evolution  552.34  ->  415.24  :  1.33x speedup
          t-xfce4-terminal-a1  1573.97 ->  1351.46 :  1.16x speedup
          t-firefox-paintball  847.87  ->  734.50  :  1.15x speedup
          t-firefox-asteroids  565.99  ->  492.77  :  1.15x speedup
    t-firefox-canvas-swscroll  1656.87 ->  1447.48 :  1.14x speedup
              t-midori-zoomed  724.73  ->  642.16  :  1.13x speedup
       t-firefox-planet-gnome  975.78  ->  911.92  :  1.07x speedup
              t-chromium-tabs  292.12  ->  274.74  :  1.06x speedup
         t-firefox-chalkboard  690.78  ->  653.93  :  1.06x speedup
          t-firefox-talos-gfx  1375.30 ->  1303.74 :  1.05x speedup
       t-firefox-canvas-alpha  1016.79 ->  967.24  :  1.05x speedup
    
    Signed-off-by: Oded Gabbay <oded.gabbay at gmail.com>
    Acked-by: Siarhei Siamashka <siarhei.siamashka at gmail.com>

diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index 39d1a06..61fdb80 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -2537,6 +2537,157 @@ vmx_combine_add_ca (pixman_implementation_t *imp,
     }
 }
 
+static pixman_bool_t
+vmx_fill (pixman_implementation_t *imp,
+           uint32_t *               bits,
+           int                      stride,
+           int                      bpp,
+           int                      x,
+           int                      y,
+           int                      width,
+           int                      height,
+           uint32_t		    filler)
+{
+    uint32_t byte_width;
+    uint8_t *byte_line;
+
+    vector unsigned int vfiller;
+
+    if (bpp == 8)
+    {
+	uint8_t b;
+	uint16_t w;
+
+	stride = stride * (int) sizeof (uint32_t) / 1;
+	byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
+	byte_width = width;
+	stride *= 1;
+
+	b = filler & 0xff;
+	w = (b << 8) | b;
+	filler = (w << 16) | w;
+    }
+    else if (bpp == 16)
+    {
+	stride = stride * (int) sizeof (uint32_t) / 2;
+	byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
+	byte_width = 2 * width;
+	stride *= 2;
+
+        filler = (filler & 0xffff) * 0x00010001;
+    }
+    else if (bpp == 32)
+    {
+	stride = stride * (int) sizeof (uint32_t) / 4;
+	byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
+	byte_width = 4 * width;
+	stride *= 4;
+    }
+    else
+    {
+	return FALSE;
+    }
+
+    vfiller = create_mask_1x32_128(&filler);
+
+    while (height--)
+    {
+	int w;
+	uint8_t *d = byte_line;
+	byte_line += stride;
+	w = byte_width;
+
+	if (w >= 1 && ((uintptr_t)d & 1))
+	{
+	    *(uint8_t *)d = filler;
+	    w -= 1;
+	    d += 1;
+	}
+
+	while (w >= 2 && ((uintptr_t)d & 3))
+	{
+	    *(uint16_t *)d = filler;
+	    w -= 2;
+	    d += 2;
+	}
+
+	while (w >= 4 && ((uintptr_t)d & 15))
+	{
+	    *(uint32_t *)d = filler;
+
+	    w -= 4;
+	    d += 4;
+	}
+
+	while (w >= 128)
+	{
+	    vec_st(vfiller, 0, (uint32_t *) d);
+	    vec_st(vfiller, 0, (uint32_t *) d + 4);
+	    vec_st(vfiller, 0, (uint32_t *) d + 8);
+	    vec_st(vfiller, 0, (uint32_t *) d + 12);
+	    vec_st(vfiller, 0, (uint32_t *) d + 16);
+	    vec_st(vfiller, 0, (uint32_t *) d + 20);
+	    vec_st(vfiller, 0, (uint32_t *) d + 24);
+	    vec_st(vfiller, 0, (uint32_t *) d + 28);
+
+	    d += 128;
+	    w -= 128;
+	}
+
+	if (w >= 64)
+	{
+	    vec_st(vfiller, 0, (uint32_t *) d);
+	    vec_st(vfiller, 0, (uint32_t *) d + 4);
+	    vec_st(vfiller, 0, (uint32_t *) d + 8);
+	    vec_st(vfiller, 0, (uint32_t *) d + 12);
+
+	    d += 64;
+	    w -= 64;
+	}
+
+	if (w >= 32)
+	{
+	    vec_st(vfiller, 0, (uint32_t *) d);
+	    vec_st(vfiller, 0, (uint32_t *) d + 4);
+
+	    d += 32;
+	    w -= 32;
+	}
+
+	if (w >= 16)
+	{
+	    vec_st(vfiller, 0, (uint32_t *) d);
+
+	    d += 16;
+	    w -= 16;
+	}
+
+	while (w >= 4)
+	{
+	    *(uint32_t *)d = filler;
+
+	    w -= 4;
+	    d += 4;
+	}
+
+	if (w >= 2)
+	{
+	    *(uint16_t *)d = filler;
+	    w -= 2;
+	    d += 2;
+	}
+
+	if (w >= 1)
+	{
+	    *(uint8_t *)d = filler;
+	    w -= 1;
+	    d += 1;
+	}
+    }
+
+    return TRUE;
+}
+
 static const pixman_fast_path_t vmx_fast_paths[] =
 {
     {   PIXMAN_OP_NONE	},
@@ -2582,5 +2733,7 @@ _pixman_implementation_create_vmx (pixman_implementation_t *fallback)
     imp->combine_32_ca[PIXMAN_OP_XOR] = vmx_combine_xor_ca;
     imp->combine_32_ca[PIXMAN_OP_ADD] = vmx_combine_add_ca;
 
+    imp->fill = vmx_fill;
+
     return imp;
 }
commit c12ee95089e7d281a29a24bf56b81f5c16dec6ee
Author: Oded Gabbay <oded.gabbay at gmail.com>
Date:   Sun Jun 28 09:42:08 2015 +0300

    vmx: add helper functions
    
    This patch adds the following helper functions for code reuse, for
    hiding BE/LE differences, and for maintainability.

    All of the functions are defined as static force_inline.

    Names were copied from pixman-sse2.c so that converting fast paths
    between sse2 and vmx will be easier from now on. Therefore, I tried to
    keep the inputs and outputs of the functions as close as possible to
    the sse2 definitions.
    
    The functions are:
    
    - load_128_aligned       : load 128-bit from a 16-byte aligned memory
                               address into a vector
    
    - load_128_unaligned     : load 128-bit from memory into a vector,
                               without guarantee of alignment for the
                               source pointer
    
    - save_128_aligned       : save 128-bit vector into a 16-byte aligned
                               memory address
    
    - create_mask_16_128     : take a 16-bit value and fill a new vector
                               with it

    - create_mask_1x32_128   : take a 32-bit pointer and fill a new
                               vector with the 32-bit value from that pointer

    - create_mask_32_128     : take a 32-bit value and fill a new vector
                               with it
    
    - unpack_32_1x128        : unpack 32-bit value into a vector
    
    - unpacklo_128_16x8      : unpack the eight low 8-bit values of a vector
    
    - unpackhi_128_16x8      : unpack the eight high 8-bit values of a vector
    
    - unpacklo_128_8x16      : unpack the four low 16-bit values of a vector
    
    - unpackhi_128_8x16      : unpack the four high 16-bit values of a vector
    
    - unpack_128_2x128       : unpack the eight low 8-bit values of a vector
                               into one vector and the eight high 8-bit
                               values into another vector
    
    - unpack_128_2x128_16    : unpack the four low 16-bit values of a vector
                               into one vector and the four high 16-bit
                               values into another vector
    
    - unpack_565_to_8888     : unpack an RGB_565 vector to 8888 vector
    
    - pack_1x128_32          : pack a vector and return the LSB 32-bit of it
    
    - pack_2x128_128         : pack two vectors into one and return it
    
    - negate_2x128           : xor two vectors with mask_00ff (separately)
    
    - is_opaque              : returns whether all the pixels contained in
                               the vector are opaque
    
    - is_zero                : returns whether the vector equals 0
    
    - is_transparent         : returns whether all the pixels
                               contained in the vector are transparent
    
    - expand_pixel_8_1x128   : expand an 8-bit pixel into the lower 8 bytes
                               of a vector
    
    - expand_alpha_1x128     : expand alpha from vector and return the new
                               vector
    
    - expand_alpha_2x128     : expand alpha from one vector and another alpha
                               from a second vector
    
    - expand_alpha_rev_2x128 : expand a reversed alpha from one vector and
                               another reversed alpha from a second vector
    
    - pix_multiply_2x128     : do pix_multiply for two vectors (separately)
    
    - over_2x128             : perform over op. on two vectors
    
    - in_over_2x128          : perform in-over op. on two vectors
    
    v2: removed expand_pixel_32_1x128 as it was not used by any function and
    its implementation was erroneous
    
    Signed-off-by: Oded Gabbay <oded.gabbay at gmail.com>
    Acked-by: Siarhei Siamashka <siarhei.siamashka at gmail.com>

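To show how these helpers compose, here is a minimal sketch of an OVER operation on four pixels, assuming only the signatures listed above. It mirrors the inner loop of the scaled-nearest OVER fast path earlier in this mail; pd is assumed to be 16-byte aligned, ps may be unaligned, and the function itself is hypothetical.

    /* Hypothetical sketch; not part of this commit. */
    static void
    over_4_pixels_sketch (uint32_t *pd, const uint32_t *ps)
    {
        vector unsigned int src, src_lo, src_hi;
        vector unsigned int dst, dst_lo, dst_hi;
        vector unsigned int alpha_lo, alpha_hi;

        src = load_128_unaligned (ps);

        if (is_opaque (src))
        {
            /* a fully opaque source simply replaces the destination */
            save_128_aligned (pd, src);
        }
        else if (!is_zero (src))
        {
            dst = load_128_aligned (pd);

            /* widen the 8-bit channels to 16 bits, low and high halves */
            unpack_128_2x128 (src, (vector unsigned int) AVV(0),
                              &src_lo, &src_hi);
            unpack_128_2x128 (dst, (vector unsigned int) AVV(0),
                              &dst_lo, &dst_hi);

            /* broadcast each pixel's alpha across its channels */
            expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);

            /* dst = src + (1 - alpha) * dst, on both halves */
            over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
                        &dst_lo, &dst_hi);

            /* narrow back to 8-bit channels and store */
            save_128_aligned (pd, pack_2x128_128 (dst_lo, dst_hi));
        }
    }
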
diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index 880a19a..39d1a06 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -30,10 +30,19 @@
 #endif
 #include "pixman-private.h"
 #include "pixman-combine32.h"
+#include "pixman-inlines.h"
 #include <altivec.h>
 
 #define AVV(x...) {x}
 
+static vector unsigned int mask_00ff;
+static vector unsigned int mask_ff000000;
+static vector unsigned int mask_red;
+static vector unsigned int mask_green;
+static vector unsigned int mask_blue;
+static vector unsigned int mask_565_fix_rb;
+static vector unsigned int mask_565_fix_g;
+
 static force_inline vector unsigned int
 splat_alpha (vector unsigned int pix)
 {
@@ -233,6 +242,464 @@ do							  \
 #define STORE_VECTOR(dest)						\
     vec_st ((vector unsigned int) v ## dest, 0, dest);
 
+/* load 4 pixels from a 16-byte boundary aligned address */
+static force_inline vector unsigned int
+load_128_aligned (const uint32_t* src)
+{
+    return *((vector unsigned int *) src);
+}
+
+/* load 4 pixels from an unaligned address */
+static force_inline vector unsigned int
+load_128_unaligned (const uint32_t* src)
+{
+    vector unsigned int vsrc;
+    DECLARE_SRC_MASK_VAR;
+
+    COMPUTE_SHIFT_MASK (src);
+    LOAD_VECTOR (src);
+
+    return vsrc;
+}
+
+/* save 4 pixels on a 16-byte boundary aligned address */
+static force_inline void
+save_128_aligned (uint32_t* data,
+		  vector unsigned int vdata)
+{
+    STORE_VECTOR(data)
+}
+
+static force_inline vector unsigned int
+create_mask_16_128 (uint16_t mask)
+{
+    uint16_t* src;
+    vector unsigned short vsrc;
+    DECLARE_SRC_MASK_VAR;
+
+    src = &mask;
+
+    COMPUTE_SHIFT_MASK (src);
+    LOAD_VECTOR (src);
+    return (vector unsigned int) vec_splat(vsrc, 0);
+}
+
+static force_inline vector unsigned int
+create_mask_1x32_128 (const uint32_t *src)
+{
+    vector unsigned int vsrc;
+    DECLARE_SRC_MASK_VAR;
+
+    COMPUTE_SHIFT_MASK (src);
+    LOAD_VECTOR (src);
+    return vec_splat(vsrc, 0);
+}
+
+static force_inline vector unsigned int
+create_mask_32_128 (uint32_t mask)
+{
+    return create_mask_1x32_128(&mask);
+}
+
+static force_inline vector unsigned int
+unpack_32_1x128 (uint32_t data)
+{
+    vector unsigned int vdata = {0, 0, 0, data};
+    vector unsigned short lo;
+
+    lo = (vector unsigned short)
+#ifdef WORDS_BIGENDIAN
+	vec_mergel ((vector unsigned char) AVV(0),
+		    (vector unsigned char) vdata);
+#else
+	vec_mergel ((vector unsigned char) vdata,
+		    (vector unsigned char) AVV(0));
+#endif
+
+    return (vector unsigned int) lo;
+}
+
+static force_inline vector unsigned int
+unpacklo_128_16x8 (vector unsigned int data1, vector unsigned int data2)
+{
+    vector unsigned char lo;
+
+    /* unpack to short */
+    lo = (vector unsigned char)
+#ifdef WORDS_BIGENDIAN
+	vec_mergel ((vector unsigned char) data2,
+		    (vector unsigned char) data1);
+#else
+	vec_mergel ((vector unsigned char) data1,
+		    (vector unsigned char) data2);
+#endif
+
+    return (vector unsigned int) lo;
+}
+
+static force_inline vector unsigned int
+unpackhi_128_16x8 (vector unsigned int data1, vector unsigned int data2)
+{
+    vector unsigned char hi;
+
+    /* unpack to short */
+    hi = (vector unsigned char)
+#ifdef WORDS_BIGENDIAN
+	vec_mergeh ((vector unsigned char) data2,
+		    (vector unsigned char) data1);
+#else
+	vec_mergeh ((vector unsigned char) data1,
+		    (vector unsigned char) data2);
+#endif
+
+    return (vector unsigned int) hi;
+}
+
+static force_inline vector unsigned int
+unpacklo_128_8x16 (vector unsigned int data1, vector unsigned int data2)
+{
+    vector unsigned short lo;
+
+    /* unpack to char */
+    lo = (vector unsigned short)
+#ifdef WORDS_BIGENDIAN
+	vec_mergel ((vector unsigned short) data2,
+		    (vector unsigned short) data1);
+#else
+	vec_mergel ((vector unsigned short) data1,
+		    (vector unsigned short) data2);
+#endif
+
+    return (vector unsigned int) lo;
+}
+
+static force_inline vector unsigned int
+unpackhi_128_8x16 (vector unsigned int data1, vector unsigned int data2)
+{
+    vector unsigned short hi;
+
+    /* unpack to char */
+    hi = (vector unsigned short)
+#ifdef WORDS_BIGENDIAN
+	vec_mergeh ((vector unsigned short) data2,
+		    (vector unsigned short) data1);
+#else
+	vec_mergeh ((vector unsigned short) data1,
+		    (vector unsigned short) data2);
+#endif
+
+    return (vector unsigned int) hi;
+}
+
+static force_inline void
+unpack_128_2x128 (vector unsigned int data1, vector unsigned int data2,
+		    vector unsigned int* data_lo, vector unsigned int* data_hi)
+{
+    *data_lo = unpacklo_128_16x8(data1, data2);
+    *data_hi = unpackhi_128_16x8(data1, data2);
+}
+
+static force_inline void
+unpack_128_2x128_16 (vector unsigned int data1, vector unsigned int data2,
+		    vector unsigned int* data_lo, vector unsigned int* data_hi)
+{
+    *data_lo = unpacklo_128_8x16(data1, data2);
+    *data_hi = unpackhi_128_8x16(data1, data2);
+}
+
+static force_inline vector unsigned int
+unpack_565_to_8888 (vector unsigned int lo)
+{
+    vector unsigned int r, g, b, rb, t;
+
+    r = vec_and (vec_sl(lo, create_mask_32_128(8)), mask_red);
+    g = vec_and (vec_sl(lo, create_mask_32_128(5)), mask_green);
+    b = vec_and (vec_sl(lo, create_mask_32_128(3)), mask_blue);
+
+    rb = vec_or (r, b);
+    t  = vec_and (rb, mask_565_fix_rb);
+    t  = vec_sr (t, create_mask_32_128(5));
+    rb = vec_or (rb, t);
+
+    t  = vec_and (g, mask_565_fix_g);
+    t  = vec_sr (t, create_mask_32_128(6));
+    g  = vec_or (g, t);
+
+    return vec_or (rb, g);
+}
+
+static force_inline uint32_t
+pack_1x128_32 (vector unsigned int data)
+{
+    vector unsigned char vpack;
+
+    vpack = vec_packsu((vector unsigned short) data,
+			(vector unsigned short) AVV(0));
+
+    return vec_extract((vector unsigned int) vpack, 1);
+}
+
+static force_inline vector unsigned int
+pack_2x128_128 (vector unsigned int lo, vector unsigned int hi)
+{
+    vector unsigned char vpack;
+
+    vpack = vec_packsu((vector unsigned short) hi,
+			(vector unsigned short) lo);
+
+    return (vector unsigned int) vpack;
+}
+
+static force_inline void
+negate_2x128 (vector unsigned int  data_lo,
+	      vector unsigned int  data_hi,
+	      vector unsigned int* neg_lo,
+	      vector unsigned int* neg_hi)
+{
+    *neg_lo = vec_xor (data_lo, mask_00ff);
+    *neg_hi = vec_xor (data_hi, mask_00ff);
+}
+
+static force_inline int
+is_opaque (vector unsigned int x)
+{
+    uint32_t cmp_result;
+    vector bool int ffs = vec_cmpeq(x, x);
+
+    cmp_result = vec_all_eq(x, ffs);
+
+    return (cmp_result & 0x8888) == 0x8888;
+}
+
+static force_inline int
+is_zero (vector unsigned int x)
+{
+    uint32_t cmp_result;
+
+    cmp_result = vec_all_eq(x, (vector unsigned int) AVV(0));
+
+    return cmp_result == 0xffff;
+}
+
+static force_inline int
+is_transparent (vector unsigned int x)
+{
+    uint32_t cmp_result;
+
+    cmp_result = vec_all_eq(x, (vector unsigned int) AVV(0));
+    return (cmp_result & 0x8888) == 0x8888;
+}
+
+static force_inline vector unsigned int
+expand_pixel_8_1x128 (uint8_t data)
+{
+    vector unsigned int vdata;
+
+    vdata = unpack_32_1x128 ((uint32_t) data);
+
+#ifdef WORDS_BIGENDIAN
+    return vec_perm (vdata, vdata,
+		     (vector unsigned char)AVV (
+			 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+			 0x0E, 0x0F, 0x0E, 0x0F, 0x0E, 0x0F, 0x0E, 0x0F));
+#else
+    return vec_perm (vdata, vdata,
+		     (vector unsigned char)AVV (
+			 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+			 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, 0x08, 0x09));
+#endif
+}
+
+static force_inline vector unsigned int
+expand_alpha_1x128 (vector unsigned int data)
+{
+#ifdef WORDS_BIGENDIAN
+    return vec_perm (data, data,
+		     (vector unsigned char)AVV (
+			 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01,
+			 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, 0x08, 0x09));
+#else
+    return vec_perm (data, data,
+		     (vector unsigned char)AVV (
+			 0x06, 0x07, 0x06, 0x07, 0x06, 0x07, 0x06, 0x07,
+			 0x0E, 0x0F, 0x0E, 0x0F, 0x0E, 0x0F, 0x0E, 0x0F));
+#endif
+}
+
+static force_inline void
+expand_alpha_2x128 (vector unsigned int  data_lo,
+		    vector unsigned int  data_hi,
+		    vector unsigned int* alpha_lo,
+		    vector unsigned int* alpha_hi)
+{
+
+    *alpha_lo = expand_alpha_1x128(data_lo);
+    *alpha_hi = expand_alpha_1x128(data_hi);
+}
+
+static force_inline void
+expand_alpha_rev_2x128 (vector unsigned int  data_lo,
+			vector unsigned int  data_hi,
+			vector unsigned int* alpha_lo,
+			vector unsigned int* alpha_hi)
+{
+#ifdef WORDS_BIGENDIAN
+    *alpha_lo = vec_perm (data_lo, data_lo,
+		     (vector unsigned char)AVV (
+			 0x06, 0x07, 0x06, 0x07, 0x06, 0x07, 0x06, 0x07,
+			 0x0E, 0x0F, 0x0E, 0x0F, 0x0E, 0x0F, 0x0E, 0x0F));
+
+    *alpha_hi = vec_perm (data_hi, data_hi,
+		     (vector unsigned char)AVV (
+			 0x06, 0x07, 0x06, 0x07, 0x06, 0x07, 0x06, 0x07,
+			 0x0E, 0x0F, 0x0E, 0x0F, 0x0E, 0x0F, 0x0E, 0x0F));
+#else
+    *alpha_lo = vec_perm (data_lo, data_lo,
+		     (vector unsigned char)AVV (
+			 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01,
+			 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, 0x08, 0x09));
+
+    *alpha_hi = vec_perm (data_hi, data_hi,
+		     (vector unsigned char)AVV (
+			 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01,
+			 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, 0x08, 0x09));
+#endif
+}
+
+static force_inline void
+pix_multiply_2x128 (vector unsigned int* data_lo,
+		    vector unsigned int* data_hi,
+		    vector unsigned int* alpha_lo,
+		    vector unsigned int* alpha_hi,
+		    vector unsigned int* ret_lo,
+		    vector unsigned int* ret_hi)
+{
+    *ret_lo = pix_multiply(*data_lo, *alpha_lo);
+    *ret_hi = pix_multiply(*data_hi, *alpha_hi);
+}
+
+static force_inline void
+over_2x128 (vector unsigned int* src_lo,
+	    vector unsigned int* src_hi,
+	    vector unsigned int* alpha_lo,
+	    vector unsigned int* alpha_hi,
+	    vector unsigned int* dst_lo,
+	    vector unsigned int* dst_hi)
+{
+    vector unsigned int t1, t2;
+
+    negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);
+
+    pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);
+
+    *dst_lo = (vector unsigned int)
+		    vec_adds ((vector unsigned char) *src_lo,
+			      (vector unsigned char) *dst_lo);
+
+    *dst_hi = (vector unsigned int)
+		    vec_adds ((vector unsigned char) *src_hi,
+			      (vector unsigned char) *dst_hi);
+}
+
+static force_inline void
+in_over_2x128 (vector unsigned int* src_lo,
+	       vector unsigned int* src_hi,
+	       vector unsigned int* alpha_lo,
+	       vector unsigned int* alpha_hi,
+	       vector unsigned int* mask_lo,
+	       vector unsigned int* mask_hi,
+	       vector unsigned int* dst_lo,
+	       vector unsigned int* dst_hi)
+{
+    vector unsigned int s_lo, s_hi;
+    vector unsigned int a_lo, a_hi;
+
+    pix_multiply_2x128 (src_lo,   src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
+    pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);
+
+    over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
+}
+
+static force_inline uint32_t
+core_combine_over_u_pixel_vmx (uint32_t src, uint32_t dst)
+{
+    uint8_t a;
+    vector unsigned int vmxs;
+
+    a = src >> 24;
+
+    if (a == 0xff)
+    {
+	return src;
+    }
+    else if (src)
+    {
+	vmxs = unpack_32_1x128 (src);
+	return pack_1x128_32(
+		over(vmxs, expand_alpha_1x128 (vmxs), unpack_32_1x128 (dst)));
+    }
+
+    return dst;
+}
+
+static force_inline uint32_t
+combine1 (const uint32_t *ps, const uint32_t *pm)
+{
+    uint32_t s = *ps;
+
+    if (pm)
+    {
+	vector unsigned int ms, mm;
+
+	mm = unpack_32_1x128 (*pm);
+	mm = expand_alpha_1x128 (mm);
+
+	ms = unpack_32_1x128 (s);
+	ms = pix_multiply (ms, mm);
+
+	s = pack_1x128_32 (ms);
+    }
+
+    return s;
+}
+
+static force_inline vector unsigned int
+combine4 (const uint32_t* ps, const uint32_t* pm)
+{
+    vector unsigned int vmx_src_lo, vmx_src_hi;
+    vector unsigned int vmx_msk_lo, vmx_msk_hi;
+    vector unsigned int s;
+
+    if (pm)
+    {
+	vmx_msk_lo = load_128_unaligned(pm);
+
+	if (is_transparent(vmx_msk_lo))
+	    return (vector unsigned int) AVV(0);
+    }
+
+    s = load_128_unaligned(ps);
+
+    if (pm)
+    {
+	unpack_128_2x128(s, (vector unsigned int) AVV(0),
+			    &vmx_src_lo, &vmx_src_hi);
+
+	unpack_128_2x128(vmx_msk_lo, (vector unsigned int) AVV(0),
+			    &vmx_msk_lo, &vmx_msk_hi);
+
+	expand_alpha_2x128(vmx_msk_lo, vmx_msk_hi, &vmx_msk_lo, &vmx_msk_hi);
+
+	pix_multiply_2x128(&vmx_src_lo, &vmx_src_hi,
+			   &vmx_msk_lo, &vmx_msk_hi,
+			   &vmx_src_lo, &vmx_src_hi);
+
+	s = pack_2x128_128(vmx_src_lo, vmx_src_hi);
+    }
+
+    return s;
+}
+
 static void
 vmx_combine_over_u_no_mask (uint32_t *      dest,
                             const uint32_t *src,
@@ -2080,6 +2547,15 @@ _pixman_implementation_create_vmx (pixman_implementation_t *fallback)
 {
     pixman_implementation_t *imp = _pixman_implementation_create (fallback, vmx_fast_paths);
 
+    /* VMX constants */
+    mask_00ff = create_mask_16_128 (0x00ff);
+    mask_ff000000 = create_mask_32_128 (0xff000000);
+    mask_red   = create_mask_32_128 (0x00f80000);
+    mask_green = create_mask_32_128 (0x0000fc00);
+    mask_blue  = create_mask_32_128 (0x000000f8);
+    mask_565_fix_rb = create_mask_32_128 (0x00e000e0);
+    mask_565_fix_g = create_mask_32_128  (0x0000c000);
+
     /* Set up function pointers */
 
     imp->combine_32[PIXMAN_OP_OVER] = vmx_combine_over_u;
commit 034149537be94862b43fb09699b8c2149bfe948d
Author: Oded Gabbay <oded.gabbay at gmail.com>
Date:   Thu Jul 2 11:04:20 2015 +0300

    vmx: add LOAD_VECTOR macro
    
    This patch adds a macro for loading a single vector.
    It also makes the other LOAD_VECTORx macros use this macro as a base
    so code is reused.
    
    In addition, I fixed minor coding style issues.
    
    Signed-off-by: Oded Gabbay <oded.gabbay at gmail.com>
    Acked-by: Siarhei Siamashka <siarhei.siamashka at gmail.com>

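For context, the usage pattern the new macro enables looks like this; it is essentially the body of load_128_unaligned from the helper-functions commit above, shown here as a hypothetical standalone sketch.

    static force_inline vector unsigned int
    load_unaligned_sketch (const uint32_t *src)
    {
        vector unsigned int vsrc;
        DECLARE_SRC_MASK_VAR;

        COMPUTE_SHIFT_MASK (src); /* builds the vec_lvsl permute mask; no-op on LE */
        LOAD_VECTOR (src);        /* fills vsrc: vec_ld + vec_perm on BE, direct load on LE */

        return vsrc;
    }
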
diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index cef921f..880a19a 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -169,33 +169,29 @@ over (vector unsigned int src,
     mask ## _mask = vec_lvsl (0, mask);					\
     source ## _mask = vec_lvsl (0, source);
 
-/* notice you have to declare temp vars...
- * Note: tmp3 and tmp4 must remain untouched!
- */
-
-#define LOAD_VECTORS(dest, source)			  \
-do {							  \
+#define LOAD_VECTOR(source)				  \
+do							  \
+{							  \
     vector unsigned char tmp1, tmp2;			  \
     tmp1 = (typeof(tmp1))vec_ld (0, source);		  \
     tmp2 = (typeof(tmp2))vec_ld (15, source);		  \
-    v ## source = (typeof(v ## source))			  \
+    v ## source = (typeof(v ## source)) 		  \
 	vec_perm (tmp1, tmp2, source ## _mask);		  \
+} while (0)
+
+#define LOAD_VECTORS(dest, source)			  \
+do							  \
+{							  \
+    LOAD_VECTOR(source);				  \
     v ## dest = (typeof(v ## dest))vec_ld (0, dest);	  \
-} while (0);
+} while (0)
 
 #define LOAD_VECTORSC(dest, source, mask)		  \
-do {							  \
-    vector unsigned char tmp1, tmp2;			  \
-    tmp1 = (typeof(tmp1))vec_ld (0, source);		  \
-    tmp2 = (typeof(tmp2))vec_ld (15, source);		  \
-    v ## source = (typeof(v ## source))			  \
-	vec_perm (tmp1, tmp2, source ## _mask);		  \
-    tmp1 = (typeof(tmp1))vec_ld (0, mask);		  \
-    v ## dest = (typeof(v ## dest))vec_ld (0, dest);	  \
-    tmp2 = (typeof(tmp2))vec_ld (15, mask);		  \
-    v ## mask = (typeof(v ## mask))			  \
-    vec_perm (tmp1, tmp2, mask ## _mask);		  \
-} while (0);
+do							  \
+{							  \
+    LOAD_VECTORS(dest, source); 			  \
+    LOAD_VECTOR(mask);					  \
+} while (0)
 
 #define DECLARE_SRC_MASK_VAR vector unsigned char src_mask
 #define DECLARE_MASK_MASK_VAR vector unsigned char mask_mask
@@ -213,14 +209,16 @@ do {							  \
 
 #define COMPUTE_SHIFT_MASKC(dest, source, mask)
 
+# define LOAD_VECTOR(source)				\
+    v ## source = *((typeof(v ## source)*)source);
+
 # define LOAD_VECTORS(dest, source)			\
-    v ## source = *((typeof(v ## source)*)source);	\
-    v ## dest = *((typeof(v ## dest)*)dest);
+    LOAD_VECTOR(source);				\
+    LOAD_VECTOR(dest);					\
 
 # define LOAD_VECTORSC(dest, source, mask)		\
-    v ## source = *((typeof(v ## source)*)source);	\
-    v ## dest = *((typeof(v ## dest)*)dest);		\
-    v ## mask = *((typeof(v ## mask)*)mask);
+    LOAD_VECTORS(dest, source); 			\
+    LOAD_VECTOR(mask);					\
 
 #define DECLARE_SRC_MASK_VAR
 #define DECLARE_MASK_MASK_VAR
@@ -228,7 +226,7 @@ do {							  \
 #endif /* WORDS_BIGENDIAN */
 
 #define LOAD_VECTORSM(dest, source, mask)				\
-    LOAD_VECTORSC (dest, source, mask)					\
+    LOAD_VECTORSC (dest, source, mask); 				\
     v ## source = pix_multiply (v ## source,				\
                                 splat_alpha (v ## mask));
 

