pixman: Branch 'master' - 4 commits

Mon Feb 28 06:03:26 PST 2011

pixman/pixman-arm-neon-asm.S |  197 +++++++++++++++++++
 pixman/pixman-arm-neon.c     |   45 ++++
 pixman/pixman-fast-path.h    |  432 +++++++++++++++++++++++++++++++++++++++++++
 pixman/pixman-sse2.c         |  112 +++++++++++
 test/Makefile.am             |    2 
 test/scaling-helpers-test.c  |   93 +++++++++
 6 files changed, 881 insertions(+)

New commits:
commit 17feaa9c50bb8521b0366345efe181bd99754957
Author: Siarhei Siamashka <siarhei.siamashka at nokia.com>
Date:   Tue Feb 22 18:45:03 2011 +0200

    ARM: NEON optimization for bilinear scaled 'src_8888_8888'
    
    Initial NEON optimization for bilinear scaling. Can be probably
    improved more.
    
    Benchmark on ARM Cortex-A8:
     Microbenchmark (scaling 2000x2000 image with scale factor close to 1x):
      before: op=1, src=20028888, dst=20028888, speed=6.70 MPix/s
      after:  op=1, src=20028888, dst=20028888, speed=44.27 MPix/s

diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 47daf45..c168e10 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -2391,3 +2391,200 @@ generate_composite_function_nearest_scanline \
     10,  /* dst_r_basereg */ \
     8,  /* src_basereg   */ \
     15  /* mask_basereg  */
+
+/******************************************************************************/
+
+/* Supplementary macro for setting function attributes */
+.macro pixman_asm_function fname
+    .func fname
+    .global fname
+#ifdef __ELF__
+    .hidden fname
+    .type fname, %function
+#endif
+fname:
+.endm
+
+.macro bilinear_interpolate_last_pixel
+    mov       TMP1, X, asr #16
+    mov       TMP2, X, asr #16
+    add       TMP1, TOP, TMP1, asl #2
+    add       TMP2, BOTTOM, TMP2, asl #2
+    vld1.32   {d0}, [TMP1]
+    vshr.u16  d30, d24, #8
+    vld1.32   {d1}, [TMP2]
+    vmull.u8  q1, d0, d28
+    vmlal.u8  q1, d1, d29
+    /* 5 cycles bubble */
+    vshll.u16 q0, d2, #8
+    vmlsl.u16 q0, d2, d30
+    vmlal.u16 q0, d3, d30
+    /* 5 cycles bubble */
+    vshrn.u32 d0, q0, #16
+    /* 3 cycles bubble */
+    vmovn.u16 d0, q0
+    /* 1 cycle bubble */
+    vst1.32   {d0[0]}, [OUT, :32]!
+.endm
+
+.macro bilinear_interpolate_two_pixels
+    mov       TMP1, X, asr #16
+    mov       TMP2, X, asr #16
+    add       X, X, UX
+    add       TMP1, TOP, TMP1, asl #2
+    add       TMP2, BOTTOM, TMP2, asl #2
+    vld1.32   {d0}, [TMP1]
+    vld1.32   {d1}, [TMP2]
+    vmull.u8  q1, d0, d28
+    vmlal.u8  q1, d1, d29
+    mov       TMP1, X, asr #16
+    mov       TMP2, X, asr #16
+    add       X, X, UX
+    add       TMP1, TOP, TMP1, asl #2
+    add       TMP2, BOTTOM, TMP2, asl #2
+    vld1.32   {d20}, [TMP1]
+    vld1.32   {d21}, [TMP2]
+    vmull.u8  q11, d20, d28
+    vmlal.u8  q11, d21, d29
+    vshr.u16  q15, q12, #8
+    vadd.u16  q12, q12, q13
+    vshll.u16 q0, d2, #8
+    vmlsl.u16 q0, d2, d30
+    vmlal.u16 q0, d3, d30
+    vshll.u16 q10, d22, #8
+    vmlsl.u16 q10, d22, d31
+    vmlal.u16 q10, d23, d31
+    vshrn.u32 d30, q0, #16
+    vshrn.u32 d31, q10, #16
+    vmovn.u16 d0, q15
+    vst1.32   {d0}, [OUT]!
+.endm
+
+.macro bilinear_interpolate_four_pixels
+    mov       TMP1, X, asr #16
+    mov       TMP2, X, asr #16
+    add       X, X, UX
+    add       TMP1, TOP, TMP1, asl #2
+    add       TMP2, BOTTOM, TMP2, asl #2
+    vld1.32   {d0}, [TMP1]
+    vld1.32   {d1}, [TMP2]
+    vmull.u8  q1, d0, d28
+    vmlal.u8  q1, d1, d29
+    mov       TMP1, X, asr #16
+    mov       TMP2, X, asr #16
+    add       X, X, UX
+    add       TMP1, TOP, TMP1, asl #2
+    add       TMP2, BOTTOM, TMP2, asl #2
+    vld1.32   {d20}, [TMP1]
+    vld1.32   {d21}, [TMP2]
+    vmull.u8  q11, d20, d28
+    vmlal.u8  q11, d21, d29
+    vshr.u16  q15, q12, #8
+    vadd.u16  q12, q12, q13
+    vshll.u16 q0, d2, #8
+    vmlsl.u16 q0, d2, d30
+    vmlal.u16 q0, d3, d30
+    vshll.u16 q10, d22, #8
+    vmlsl.u16 q10, d22, d31
+    vmlal.u16 q10, d23, d31
+    mov       TMP1, X, asr #16
+    mov       TMP2, X, asr #16
+    add       X, X, UX
+    add       TMP1, TOP, TMP1, asl #2
+    add       TMP2, BOTTOM, TMP2, asl #2
+    vld1.32   {d4}, [TMP1]
+    vld1.32   {d5}, [TMP2]
+    vmull.u8  q3, d4, d28
+    vmlal.u8  q3, d5, d29
+    mov       TMP1, X, asr #16
+    mov       TMP2, X, asr #16
+    add       X, X, UX
+    add       TMP1, TOP, TMP1, asl #2
+    add       TMP2, BOTTOM, TMP2, asl #2
+    vld1.32   {d16}, [TMP1]
+    vld1.32   {d17}, [TMP2]
+    vmull.u8  q9, d16, d28
+    vmlal.u8  q9, d17, d29
+    vshr.u16  q15, q12, #8
+    vadd.u16  q12, q12, q13
+    vshll.u16 q2, d6, #8
+    vmlsl.u16 q2, d6, d30
+    vmlal.u16 q2, d7, d30
+    vshll.u16 q8, d18, #8
+    vmlsl.u16 q8, d18, d31
+    vmlal.u16 q8, d19, d31
+    vshrn.u32 d0, q0, #16
+    vshrn.u32 d1, q10, #16
+    vshrn.u32 d4, q2, #16
+    vshrn.u32 d5, q8, #16
+    vmovn.u16 d0, q0
+    vmovn.u16 d1, q2
+    vst1.32   {d0, d1}, [OUT]!
+.endm
+
+
+/*
+ * pixman_scaled_bilinear_scanline_8888_8888_SRC (uint32_t *       out,
+ *                                                const uint32_t * top,
+ *                                                const uint32_t * bottom,
+ *                                                int              wt,
+ *                                                int              wb,
+ *                                                pixman_fixed_t   x,
+ *                                                pixman_fixed_t   ux,
+ *                                                int              width)
+ */
+
+pixman_asm_function pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon
+    OUT       .req      r0
+    TOP       .req      r1
+    BOTTOM    .req      r2
+    WT        .req      r3
+    WB        .req      r4
+    X         .req      r5
+    UX        .req      r6
+    WIDTH     .req      ip
+    TMP1      .req      r3
+    TMP2      .req      r4
+
+    mov       ip, sp
+    push      {r4, r5, r6, r7}
+    ldmia     ip, {WB, X, UX, WIDTH}
+
+    cmp       WIDTH, #0
+    ble       3f
+    vdup.u16  q12, X
+    vdup.u16  q13, UX
+    vdup.u8   d28, WT
+    vdup.u8   d29, WB
+    vadd.u16  d25, d25, d26
+    vadd.u16  q13, q13, q13
+
+    subs      WIDTH, WIDTH, #4
+    blt       1f
+0:
+    bilinear_interpolate_four_pixels
+    subs      WIDTH, WIDTH, #4
+    bge       0b
+1:
+    tst       WIDTH, #2
+    beq       2f
+    bilinear_interpolate_two_pixels
+2:
+    tst       WIDTH, #1
+    beq       3f
+    bilinear_interpolate_last_pixel
+3:
+    pop       {r4, r5, r6, r7}
+    bx        lr
+
+    .unreq    OUT
+    .unreq    TOP
+    .unreq    BOTTOM
+    .unreq    WT
+    .unreq    WB
+    .unreq    X
+    .unreq    UX
+    .unreq    WIDTH
+    .unreq    TMP1
+    .unreq    TMP2
+.endfunc
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index 3e0c0d1..c7c0254 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -232,6 +232,47 @@ pixman_blt_neon (uint32_t *src_bits,
     }
 }
 
+void
+pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon (uint32_t *       out,
+                                                        const uint32_t * top,
+                                                        const uint32_t * bottom,
+                                                        int              wt,
+                                                        int              wb,
+                                                        pixman_fixed_t   x,
+                                                        pixman_fixed_t   ux,
+                                                        int              width);
+
+static force_inline void
+scaled_bilinear_scanline_neon_8888_8888_SRC (uint32_t *       dst,
+					     const uint32_t * mask,
+					     const uint32_t * src_top,
+					     const uint32_t * src_bottom,
+					     int32_t          w,
+					     int              wt,
+					     int              wb,
+					     pixman_fixed_t   vx,
+					     pixman_fixed_t   unit_x,
+					     pixman_fixed_t   max_vx,
+					     pixman_bool_t    zero_src)
+{
+    pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon (dst, src_top,
+                                                            src_bottom, wt, wb,
+                                                            vx, unit_x, w);
+}
+
+FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_8888_cover_SRC,
+			       scaled_bilinear_scanline_neon_8888_8888_SRC,
+			       uint32_t, uint32_t, uint32_t,
+			       COVER, FALSE, FALSE)
+FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_8888_pad_SRC,
+			       scaled_bilinear_scanline_neon_8888_8888_SRC,
+			       uint32_t, uint32_t, uint32_t,
+			       PAD, FALSE, FALSE)
+FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_8888_none_SRC,
+			       scaled_bilinear_scanline_neon_8888_8888_SRC,
+			       uint32_t, uint32_t, uint32_t,
+			       NONE, FALSE, FALSE)
+
 static const pixman_fast_path_t arm_neon_fast_paths[] =
 {
     PIXMAN_STD_FAST_PATH (SRC,  r5g6b5,   null,     r5g6b5,   neon_composite_src_0565_0565),
@@ -343,6 +384,10 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
     PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, r5g6b5, r5g6b5, neon_0565_8_0565),
     PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, b5g6r5, b5g6r5, neon_0565_8_0565),
 
+    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, neon_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, neon_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, neon_8888_8888),
+
     { PIXMAN_OP_NONE },
 };
 
commit 350029396d911941591149cc82b5e68a78ad6747
Author: Siarhei Siamashka <siarhei.siamashka at nokia.com>
Date:   Mon Feb 21 20:18:02 2011 +0200

    SSE2 optimization for bilinear scaled 'src_8888_8888'
    
    A primitive naive implementation of bilinear scaling using SSE2 intrinsics,
    which only handles one pixel at a time. It is approximately 2x faster than
    pixman general compositing path. Single pass processing without intermediate
    temporary buffer contributes to ~15% and loop unrolling contributes to ~20%
    of this speedup.
    
    Benchmark on Intel Core i7 (x86-64):
     Using cairo-perf-trace:
      before: image        firefox-planet-gnome   12.566   12.610   0.23%    6/6
      after:  image        firefox-planet-gnome   10.961   11.013   0.19%    5/6
    
     Microbenchmark (scaling 2000x2000 image with scale factor close to 1x):
      before: op=1, src=20028888, dst=20028888, speed=70.48 MPix/s
      after:  op=1, src=20028888, dst=20028888, speed=165.38 MPix/s

diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index 88287b4..696005f 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -5567,6 +5567,114 @@ FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
 			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
 			      uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
 
+static void
+bilinear_interpolate_line_sse2 (uint32_t *       out,
+                                const uint32_t * top,
+                                const uint32_t * bottom,
+                                int              wt,
+                                int              wb,
+                                pixman_fixed_t   x,
+                                pixman_fixed_t   ux,
+                                int              width)
+{
+    const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);
+    const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);
+    const __m128i xmm_xorc = _mm_set_epi16 (0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff);
+    const __m128i xmm_addc = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1);
+    const __m128i xmm_ux = _mm_set_epi16 (ux, ux, ux, ux, ux, ux, ux, ux);
+    const __m128i xmm_zero = _mm_setzero_si128 ();
+    __m128i xmm_x = _mm_set_epi16 (x, x, x, x, x, x, x, x);
+    uint32_t pix1, pix2, pix3, pix4;
+
+    #define INTERPOLATE_ONE_PIXEL(pix)						\
+    do {									\
+	__m128i xmm_wh, xmm_lo, xmm_hi, a;					\
+	/* fetch 2x2 pixel block into sse2 register */				\
+	uint32_t tl = top [pixman_fixed_to_int (x)];				\
+	uint32_t tr = top [pixman_fixed_to_int (x) + 1];			\
+	uint32_t bl = bottom [pixman_fixed_to_int (x)];				\
+	uint32_t br = bottom [pixman_fixed_to_int (x) + 1];			\
+	a = _mm_set_epi32 (tr, tl, br, bl);					\
+        x += ux;								\
+	/* vertical interpolation */						\
+	a = _mm_add_epi16 (_mm_mullo_epi16 (_mm_unpackhi_epi8 (a, xmm_zero),	\
+					    xmm_wt),				\
+			   _mm_mullo_epi16 (_mm_unpacklo_epi8 (a, xmm_zero),	\
+					    xmm_wb));				\
+	/* calculate horizontal weights */					\
+	xmm_wh = _mm_add_epi16 (xmm_addc,					\
+				_mm_xor_si128 (xmm_xorc,			\
+					       _mm_srli_epi16 (xmm_x, 8)));	\
+	xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);					\
+	/* horizontal interpolation */						\
+	xmm_lo = _mm_mullo_epi16 (a, xmm_wh);					\
+	xmm_hi = _mm_mulhi_epu16 (a, xmm_wh);					\
+	a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi),			\
+			   _mm_unpackhi_epi16 (xmm_lo, xmm_hi));		\
+	/* shift and pack the result */						\
+	a = _mm_srli_epi32 (a, 16);						\
+	a = _mm_packs_epi32 (a, a);						\
+	a = _mm_packus_epi16 (a, a);						\
+	pix = _mm_cvtsi128_si32 (a);						\
+    } while (0)
+
+    while ((width -= 4) >= 0)
+    {
+	INTERPOLATE_ONE_PIXEL (pix1);
+	INTERPOLATE_ONE_PIXEL (pix2);
+	INTERPOLATE_ONE_PIXEL (pix3);
+	INTERPOLATE_ONE_PIXEL (pix4);
+	*out++ = pix1;
+	*out++ = pix2;
+	*out++ = pix3;
+	*out++ = pix4;
+    }
+    if (width & 2)
+    {
+	INTERPOLATE_ONE_PIXEL (pix1);
+	INTERPOLATE_ONE_PIXEL (pix2);
+	*out++ = pix1;
+	*out++ = pix2;
+    }
+    if (width & 1)
+    {
+	INTERPOLATE_ONE_PIXEL (pix1);
+	*out = pix1;
+    }
+
+    #undef INTERPOLATE_ONE_PIXEL
+}
+
+static force_inline void
+scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t *       dst,
+					     const uint32_t * mask,
+					     const uint32_t * src_top,
+					     const uint32_t * src_bottom,
+					     int32_t          w,
+					     int              wt,
+					     int              wb,
+					     pixman_fixed_t   vx,
+					     pixman_fixed_t   unit_x,
+					     pixman_fixed_t   max_vx,
+					     pixman_bool_t    zero_src)
+{
+    bilinear_interpolate_line_sse2 (dst, src_top, src_bottom,
+				    wt, wb, vx, unit_x, w);
+}
+
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC,
+			       scaled_bilinear_scanline_sse2_8888_8888_SRC,
+			       uint32_t, uint32_t, uint32_t,
+			       COVER, FALSE, FALSE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC,
+			       scaled_bilinear_scanline_sse2_8888_8888_SRC,
+			       uint32_t, uint32_t, uint32_t,
+			       PAD, FALSE, FALSE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC,
+			       scaled_bilinear_scanline_sse2_8888_8888_SRC,
+			       uint32_t, uint32_t, uint32_t,
+			       NONE, FALSE, FALSE)
+
 static const pixman_fast_path_t sse2_fast_paths[] =
 {
     /* PIXMAN_OP_OVER */
@@ -5668,6 +5776,10 @@ static const pixman_fast_path_t sse2_fast_paths[] =
     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
 
+    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888),
+
     { PIXMAN_OP_NONE },
 };
 
commit 0df43b8ae5031dd83775d00b57b6bed809db0e89
Author: Siarhei Siamashka <siarhei.siamashka at nokia.com>
Date:   Mon Feb 21 02:07:09 2011 +0200

    test: check correctness of 'bilinear_pad_repeat_get_scanline_bounds'
    
    Individual correctness check for the new bilinear scaling related
    supplementary function. This test program uses a bit wider range
    of input arguments, not covered by other tests.

diff --git a/test/Makefile.am b/test/Makefile.am
index 057e9ce..9dc7219 100644
--- a/test/Makefile.am
+++ b/test/Makefile.am
@@ -13,6 +13,7 @@ TESTPROGRAMS =			\
 	trap-crasher		\
 	alpha-loop		\
 	scaling-crash-test	\
+	scaling-helpers-test	\
 	gradient-crash-test	\
 	alphamap		\
 	stress-test		\
@@ -33,6 +34,7 @@ alpha_loop_SOURCES = alpha-loop.c utils.c utils.h
 composite_SOURCES = composite.c utils.c utils.h
 gradient_crash_test_SOURCES = gradient-crash-test.c utils.c utils.h
 stress_test_SOURCES = stress-test.c utils.c utils.h
+scaling_helpers_test_SOURCES = scaling-helpers-test.c utils.c utils.h
 
 # Benchmarks
 
diff --git a/test/scaling-helpers-test.c b/test/scaling-helpers-test.c
new file mode 100644
index 0000000..c186138
--- /dev/null
+++ b/test/scaling-helpers-test.c
@@ -0,0 +1,93 @@
+#include <config.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+#include "utils.h"
+#include "pixman-fast-path.h"
+
+/* A trivial reference implementation for
+ * 'bilinear_pad_repeat_get_scanline_bounds'
+ */
+static void
+bilinear_pad_repeat_get_scanline_bounds_ref (int32_t        source_image_width,
+					     pixman_fixed_t vx_,
+					     pixman_fixed_t unit_x,
+					     int32_t *      left_pad,
+					     int32_t *      left_tz,
+					     int32_t *      width,
+					     int32_t *      right_tz,
+					     int32_t *      right_pad)
+{
+    int w = *width;
+    *left_pad = 0;
+    *left_tz = 0;
+    *width = 0;
+    *right_tz = 0;
+    *right_pad = 0;
+    int64_t vx = vx_;
+    while (--w >= 0)
+    {
+	if (vx < 0)
+	{
+	    if (vx + pixman_fixed_1 < 0)
+		*left_pad += 1;
+	    else
+		*left_tz += 1;
+	}
+	else if (vx + pixman_fixed_1 >= pixman_int_to_fixed (source_image_width))
+	{
+	    if (vx >= pixman_int_to_fixed (source_image_width))
+		*right_pad += 1;
+	    else
+		*right_tz += 1;
+	}
+	else
+	{
+	    *width += 1;
+	}
+	vx += unit_x;
+    }
+}
+
+int
+main (void)
+{
+    int i;
+    for (i = 0; i < 10000; i++)
+    {
+	int32_t left_pad1, left_tz1, width1, right_tz1, right_pad1;
+	int32_t left_pad2, left_tz2, width2, right_tz2, right_pad2;
+	pixman_fixed_t vx = lcg_rand_N(10000 << 16) - (3000 << 16);
+	int32_t width = lcg_rand_N(10000);
+	int32_t source_image_width = lcg_rand_N(10000) + 1;
+	pixman_fixed_t unit_x = lcg_rand_N(10 << 16) + 1;
+	width1 = width2 = width;
+
+	bilinear_pad_repeat_get_scanline_bounds_ref (source_image_width,
+						     vx,
+						     unit_x,
+						     &left_pad1,
+						     &left_tz1,
+						     &width1,
+						     &right_tz1,
+						     &right_pad1);
+
+	bilinear_pad_repeat_get_scanline_bounds (source_image_width,
+						 vx,
+						 unit_x,
+						 &left_pad2,
+						 &left_tz2,
+						 &width2,
+						 &right_tz2,
+						 &right_pad2);
+
+	assert (left_pad1 == left_pad2);
+	assert (left_tz1 == left_tz2);
+	assert (width1 == width2);
+	assert (right_tz1 == right_tz2);
+	assert (right_pad1 == right_pad2);
+    }
+
+    return 0;
+}
commit d506bf68fd0e9a1c5dd484daee70631699918387
Author: Siarhei Siamashka <siarhei.siamashka at nokia.com>
Date:   Mon Feb 21 01:29:02 2011 +0200

    Main loop template for fast single pass bilinear scaling
    
    Can be used for implementing SIMD optimized fast path
    functions which work with bilinear scaled source images.
    
    Similar to the template for nearest scaling main loop, the
    following types of mask are supported:
    1. no mask
    2. non-scaled a8 mask with SAMPLES_COVER_CLIP flag
    3. solid mask
    
    PAD repeat is fully supported. NONE repeat is partially
    supported (right now only works if source image has alpha
    channel or when alpha channel of the source image does not
    have any effect on the compositing operation).

diff --git a/pixman/pixman-fast-path.h b/pixman/pixman-fast-path.h
index d081222..1885d47 100644
--- a/pixman/pixman-fast-path.h
+++ b/pixman/pixman-fast-path.h
@@ -587,4 +587,436 @@ fast_composite_scaled_nearest  ## scale_func_name (pixman_implementation_t *imp,
     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NONE (op,s,d,func),		\
     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_PAD (op,s,d,func)
 
+/*****************************************************************************/
+
+/*
+ * Identify 5 zones in each scanline for bilinear scaling. Depending on
+ * whether 2 pixels to be interpolated are fetched from the image itself,
+ * from the padding area around it or from both image and padding area.
+ */
+static force_inline void
+bilinear_pad_repeat_get_scanline_bounds (int32_t         source_image_width,
+					 pixman_fixed_t  vx,
+					 pixman_fixed_t  unit_x,
+					 int32_t *       left_pad,
+					 int32_t *       left_tz,
+					 int32_t *       width,
+					 int32_t *       right_tz,
+					 int32_t *       right_pad)
+{
+	int width1 = *width, left_pad1, right_pad1;
+	int width2 = *width, left_pad2, right_pad2;
+
+	pad_repeat_get_scanline_bounds (source_image_width, vx, unit_x,
+					&width1, &left_pad1, &right_pad1);
+	pad_repeat_get_scanline_bounds (source_image_width, vx + pixman_fixed_1,
+					unit_x, &width2, &left_pad2, &right_pad2);
+
+	*left_pad = left_pad2;
+	*left_tz = left_pad1 - left_pad2;
+	*right_tz = right_pad2 - right_pad1;
+	*right_pad = right_pad1;
+	*width -= *left_pad + *left_tz + *right_tz + *right_pad;
+}
+
+/*
+ * Main loop template for single pass bilinear scaling. It needs to be
+ * provided with 'scanline_func' which should do the compositing operation.
+ * The needed function has the following prototype:
+ *
+ *	scanline_func (dst_type_t *       dst,
+ *		       const mask_type_ * mask,
+ *		       const src_type_t * src_top,
+ *		       const src_type_t * src_bottom,
+ *		       int32_t            width,
+ *		       int                weight_top,
+ *		       int                weight_bottom,
+ *		       pixman_fixed_t     vx,
+ *		       pixman_fixed_t     unit_x,
+ *		       pixman_fixed_t     max_vx,
+ *		       pixman_bool_t      zero_src)
+ *
+ * Where:
+ *  dst                 - destination scanline buffer for storing results
+ *  mask                - mask buffer (or single value for solid mask)
+ *  src_top, src_bottom - two source scanlines
+ *  width               - number of pixels to process
+ *  weight_top          - weight of the top row for interpolation
+ *  weight_bottom       - weight of the bottom row for interpolation
+ *  vx                  - initial position for fetching the first pair of
+ *                        pixels from the source buffer
+ *  unit_x              - position increment needed to move to the next pair
+ *                        of pixels
+ *  max_vx              - image size as a fixed point value, can be used for
+ *                        implementing NORMAL repeat (when it is supported)
+ *  zero_src            - boolean hint variable, which is set to TRUE when
+ *                        all source pixels are fetched from zero padding
+ *                        zone for NONE repeat
+ *
+ * Note: normally the sum of 'weight_top' and 'weight_bottom' is equal to 256,
+ *       but sometimes it may be less than that for NONE repeat when handling
+ *       fuzzy antialiased top or bottom image edges. Also both top and
+ *       bottom weight variables are guaranteed to have value in 0-255
+ *       range and can fit into unsigned byte or be used with 8-bit SIMD
+ *       multiplication instructions.
+ */
+#define FAST_BILINEAR_MAINLOOP_INT(scale_func_name, scanline_func, src_type_t, mask_type_t,	\
+				  dst_type_t, repeat_mode, have_mask, mask_is_solid)		\
+static void											\
+fast_composite_scaled_bilinear ## scale_func_name (pixman_implementation_t *imp,		\
+						   pixman_op_t              op,			\
+						   pixman_image_t *         src_image,		\
+						   pixman_image_t *         mask_image,		\
+						   pixman_image_t *         dst_image,		\
+						   int32_t                  src_x,		\
+						   int32_t                  src_y,		\
+						   int32_t                  mask_x,		\
+						   int32_t                  mask_y,		\
+						   int32_t                  dst_x,		\
+						   int32_t                  dst_y,		\
+						   int32_t                  width,		\
+						   int32_t                  height)		\
+{												\
+    dst_type_t *dst_line;									\
+    mask_type_t *mask_line;									\
+    src_type_t *src_first_line;									\
+    int       y1, y2;										\
+    pixman_fixed_t max_vx = INT32_MAX; /* suppress uninitialized variable warning */		\
+    pixman_vector_t v;										\
+    pixman_fixed_t vx, vy;									\
+    pixman_fixed_t unit_x, unit_y;								\
+    int32_t left_pad, left_tz, right_tz, right_pad;						\
+												\
+    dst_type_t *dst;										\
+    mask_type_t solid_mask;									\
+    const mask_type_t *mask = &solid_mask;							\
+    int src_stride, mask_stride, dst_stride;							\
+												\
+    PIXMAN_IMAGE_GET_LINE (dst_image, dst_x, dst_y, dst_type_t, dst_stride, dst_line, 1);	\
+    if (have_mask)										\
+    {												\
+	if (mask_is_solid)									\
+	{											\
+	    solid_mask = _pixman_image_get_solid (imp, mask_image, dst_image->bits.format);	\
+	    mask_stride = 0;									\
+	}											\
+	else											\
+	{											\
+	    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, mask_type_t,			\
+				   mask_stride, mask_line, 1);					\
+	}											\
+    }												\
+    /* pass in 0 instead of src_x and src_y because src_x and src_y need to be			\
+     * transformed from destination space to source space */					\
+    PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, src_type_t, src_stride, src_first_line, 1);		\
+												\
+    /* reference point is the center of the pixel */						\
+    v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2;				\
+    v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2;				\
+    v.vector[2] = pixman_fixed_1;								\
+												\
+    if (!pixman_transform_point_3d (src_image->common.transform, &v))				\
+	return;											\
+												\
+    unit_x = src_image->common.transform->matrix[0][0];						\
+    unit_y = src_image->common.transform->matrix[1][1];						\
+												\
+    v.vector[0] -= pixman_fixed_1 / 2;								\
+    v.vector[1] -= pixman_fixed_1 / 2;								\
+												\
+    vy = v.vector[1];										\
+												\
+    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD ||					\
+	PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE)					\
+    {												\
+	bilinear_pad_repeat_get_scanline_bounds (src_image->bits.width, v.vector[0], unit_x,	\
+					&left_pad, &left_tz, &width, &right_tz, &right_pad);	\
+	if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD)					\
+	{											\
+	    /* PAD repeat does not need special handling for 'transition zones' and */		\
+	    /* they can be combined with 'padding zones' safely */				\
+	    left_pad += left_tz;								\
+	    right_pad += right_tz;								\
+	    left_tz = right_tz = 0;								\
+	}											\
+	v.vector[0] += left_pad * unit_x;							\
+    }												\
+												\
+    while (--height >= 0)									\
+    {												\
+	int weight1, weight2;									\
+	dst = dst_line;										\
+	dst_line += dst_stride;									\
+	vx = v.vector[0];									\
+	if (have_mask && !mask_is_solid)							\
+	{											\
+	    mask = mask_line;									\
+	    mask_line += mask_stride;								\
+	}											\
+												\
+	y1 = pixman_fixed_to_int (vy);								\
+	weight2 = (vy >> 8) & 0xff;								\
+	if (weight2)										\
+	{											\
+	    /* normal case, both row weights are in 0-255 range and fit unsigned byte */	\
+	    y2 = y1 + 1;									\
+	    weight1 = 256 - weight2;								\
+	}											\
+	else											\
+	{											\
+	    /* set both top and bottom row to the same scanline, and weights to 128+128 */	\
+	    y2 = y1;										\
+	    weight1 = weight2 = 128;								\
+	}											\
+	vy += unit_y;										\
+	if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD)					\
+	{											\
+	    src_type_t *src1, *src2;								\
+	    src_type_t buf1[2];									\
+	    src_type_t buf2[2];									\
+	    repeat (PIXMAN_REPEAT_PAD, &y1, src_image->bits.height);				\
+	    repeat (PIXMAN_REPEAT_PAD, &y2, src_image->bits.height);				\
+	    src1 = src_first_line + src_stride * y1;						\
+	    src2 = src_first_line + src_stride * y2;						\
+												\
+	    if (left_pad > 0)									\
+	    {											\
+		buf1[0] = buf1[1] = src1[0];							\
+		buf2[0] = buf2[1] = src2[0];							\
+		scanline_func (dst, mask,							\
+			       buf1, buf2, left_pad, weight1, weight2, 0, 0, 0, FALSE);		\
+		dst += left_pad;								\
+		if (have_mask && !mask_is_solid)						\
+		    mask += left_pad;								\
+	    }											\
+	    if (width > 0)									\
+	    {											\
+		scanline_func (dst, mask,							\
+			       src1, src2, width, weight1, weight2, vx, unit_x, 0, FALSE);	\
+		dst += width;									\
+		if (have_mask && !mask_is_solid)						\
+		    mask += width;								\
+	    }											\
+	    if (right_pad > 0)									\
+	    {											\
+		buf1[0] = buf1[1] = src1[src_image->bits.width - 1];				\
+		buf2[0] = buf2[1] = src2[src_image->bits.width - 1];				\
+		scanline_func (dst, mask,							\
+			       buf1, buf2, right_pad, weight1, weight2, 0, 0, 0, FALSE);	\
+	    }											\
+	}											\
+	else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE)				\
+	{											\
+	    src_type_t *src1, *src2;								\
+	    src_type_t buf1[2];									\
+	    src_type_t buf2[2];									\
+	    /* handle top/bottom zero padding by just setting weights to 0 if needed */		\
+	    if (y1 < 0)										\
+	    {											\
+		weight1 = 0;									\
+		y1 = 0;										\
+	    }											\
+	    if (y1 >= src_image->bits.height)							\
+	    {											\
+		weight1 = 0;									\
+		y1 = src_image->bits.height - 1;						\
+	    }											\
+	    if (y2 < 0)										\
+	    {											\
+		weight2 = 0;									\
+		y2 = 0;										\
+	    }											\
+	    if (y2 >= src_image->bits.height)							\
+	    {											\
+		weight2 = 0;									\
+		y2 = src_image->bits.height - 1;						\
+	    }											\
+	    src1 = src_first_line + src_stride * y1;						\
+	    src2 = src_first_line + src_stride * y2;						\
+												\
+	    if (left_pad > 0)									\
+	    {											\
+		buf1[0] = buf1[1] = 0;								\
+		buf2[0] = buf2[1] = 0;								\
+		scanline_func (dst, mask,							\
+			       buf1, buf2, left_pad, weight1, weight2, 0, 0, 0, TRUE);		\
+		dst += left_pad;								\
+		if (have_mask && !mask_is_solid)						\
+		    mask += left_pad;								\
+	    }											\
+	    if (left_tz > 0)									\
+	    {											\
+		buf1[0] = 0;									\
+		buf1[1] = src1[0];								\
+		buf2[0] = 0;									\
+		buf2[1] = src2[0];								\
+		scanline_func (dst, mask,							\
+			       buf1, buf2, left_tz, weight1, weight2,				\
+			       pixman_fixed_frac (vx), unit_x, 0, FALSE);			\
+		dst += left_tz;									\
+		if (have_mask && !mask_is_solid)						\
+		    mask += left_tz;								\
+		vx += left_tz * unit_x;								\
+	    }											\
+	    if (width > 0)									\
+	    {											\
+		scanline_func (dst, mask,							\
+			       src1, src2, width, weight1, weight2, vx, unit_x, 0, FALSE);	\
+		dst += width;									\
+		if (have_mask && !mask_is_solid)						\
+		    mask += width;								\
+		vx += width * unit_x;								\
+	    }											\
+	    if (right_tz > 0)									\
+	    {											\
+		buf1[0] = src1[src_image->bits.width - 1];					\
+		buf1[1] = 0;									\
+		buf2[0] = src2[src_image->bits.width - 1];					\
+		buf2[1] = 0;									\
+		scanline_func (dst, mask,							\
+			       buf1, buf2, right_tz, weight1, weight2,				\
+			       pixman_fixed_frac (vx), unit_x, 0, FALSE);			\
+		dst += right_tz;								\
+		if (have_mask && !mask_is_solid)						\
+		    mask += right_tz;								\
+	    }											\
+	    if (right_pad > 0)									\
+	    {											\
+		buf1[0] = buf1[1] = 0;								\
+		buf2[0] = buf2[1] = 0;								\
+		scanline_func (dst, mask,							\
+			       buf1, buf2, right_pad, weight1, weight2, 0, 0, 0, TRUE);		\
+	    }											\
+	}											\
+	else											\
+	{											\
+	    scanline_func (dst, mask, src_first_line + src_stride * y1,				\
+			   src_first_line + src_stride * y2, width,				\
+			   weight1, weight2, vx, unit_x, max_vx, FALSE);			\
+	}											\
+    }												\
+}
+
+/* A workaround for old sun studio, see: https://bugs.freedesktop.org/show_bug.cgi?id=32764 */
+#define FAST_BILINEAR_MAINLOOP_COMMON(scale_func_name, scanline_func, src_type_t, mask_type_t,	\
+				  dst_type_t, repeat_mode, have_mask, mask_is_solid)		\
+	FAST_BILINEAR_MAINLOOP_INT(_ ## scale_func_name, scanline_func, src_type_t, mask_type_t,\
+				  dst_type_t, repeat_mode, have_mask, mask_is_solid)
+
+#define SCALED_BILINEAR_FLAGS						\
+    (FAST_PATH_SCALE_TRANSFORM	|					\
+     FAST_PATH_NO_ALPHA_MAP	|					\
+     FAST_PATH_BILINEAR_FILTER	|					\
+     FAST_PATH_NO_ACCESSORS	|					\
+     FAST_PATH_NARROW_FORMAT)
+
+#define SIMPLE_BILINEAR_FAST_PATH_PAD(op,s,d,func)			\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_BILINEAR_FLAGS		|				\
+	 FAST_PATH_PAD_REPEAT		|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_null, 0,							\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_bilinear_ ## func ## _pad ## _ ## op,	\
+    }
+
+#define SIMPLE_BILINEAR_FAST_PATH_NONE(op,s,d,func)			\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_BILINEAR_FLAGS		|				\
+	 FAST_PATH_NONE_REPEAT		|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_null, 0,							\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_bilinear_ ## func ## _none ## _ ## op,	\
+    }
+
+#define SIMPLE_BILINEAR_FAST_PATH_COVER(op,s,d,func)			\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	SCALED_BILINEAR_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP,		\
+	PIXMAN_null, 0,							\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_bilinear_ ## func ## _cover ## _ ## op,	\
+    }
+
+#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH_PAD(op,s,d,func)		\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_BILINEAR_FLAGS		|				\
+	 FAST_PATH_PAD_REPEAT		|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_bilinear_ ## func ## _pad ## _ ## op,	\
+    }
+
+#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH_NONE(op,s,d,func)		\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_BILINEAR_FLAGS		|				\
+	 FAST_PATH_NONE_REPEAT		|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_bilinear_ ## func ## _none ## _ ## op,	\
+    }
+
+#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH_COVER(op,s,d,func)		\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	SCALED_BILINEAR_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP,		\
+	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_bilinear_ ## func ## _cover ## _ ## op,	\
+    }
+
+#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_PAD(op,s,d,func)		\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_BILINEAR_FLAGS		|				\
+	 FAST_PATH_PAD_REPEAT		|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_bilinear_ ## func ## _pad ## _ ## op,	\
+    }
+
+#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_NONE(op,s,d,func)		\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_BILINEAR_FLAGS		|				\
+	 FAST_PATH_NONE_REPEAT		|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_bilinear_ ## func ## _none ## _ ## op,	\
+    }
+
+#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_COVER(op,s,d,func)		\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	SCALED_BILINEAR_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP,		\
+	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_bilinear_ ## func ## _cover ## _ ## op,	\
+    }
+
+/* Prefer the use of 'cover' variant, because it is faster */
+#define SIMPLE_BILINEAR_FAST_PATH(op,s,d,func)				\
+    SIMPLE_BILINEAR_FAST_PATH_COVER (op,s,d,func),			\
+    SIMPLE_BILINEAR_FAST_PATH_NONE (op,s,d,func),			\
+    SIMPLE_BILINEAR_FAST_PATH_PAD (op,s,d,func)
+
+#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH(op,s,d,func)			\
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH_COVER (op,s,d,func),		\
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH_NONE (op,s,d,func),		\
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH_PAD (op,s,d,func)
+
+#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH(op,s,d,func)		\
+    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_COVER (op,s,d,func),		\
+    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_NONE (op,s,d,func),		\
+    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_PAD (op,s,d,func)
+
 #endif