pixman: Branch 'master'
Søren Sandmann Pedersen
sandmann at kemper.freedesktop.org
Tue Sep 25 21:25:45 PDT 2012
pixman/pixman-arm-common.h | 24 +++++++++++----
pixman/pixman-arm-neon-asm.h | 45 ++++++++++++++++++++---------
pixman/pixman-arm-simd-asm.S | 47 ++++++++++++++++++------------
pixman/pixman-fast-path.c | 14 ++++-----
pixman/pixman-inlines.h | 59 +++++++++++++++++++++-----------------
pixman/pixman-sse2.c | 66 +++++++++++++++++++++++++++++++++----------
6 files changed, 170 insertions(+), 85 deletions(-)
New commits:
commit aff796d6cee4cb81f0352c2f7d0c994229bd5ca1
Author: Siarhei Siamashka <siarhei.siamashka at gmail.com>
Date: Mon Jun 25 22:36:52 2012 -0400
Add scaled nearest repeat fast paths
Before this patch it was often faster to scale and repeat
in two passes because each pass used a fast path vs.
the slow path that the single pass approach takes. This
makes it so that the single pass approach has competitive
performance.
diff --git a/pixman/pixman-arm-common.h b/pixman/pixman-arm-common.h
index f56264e..3a7cb2b 100644
--- a/pixman/pixman-arm-common.h
+++ b/pixman/pixman-arm-common.h
@@ -236,7 +236,8 @@ pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype ( \
dst_type * dst, \
const src_type * src, \
pixman_fixed_t vx, \
- pixman_fixed_t unit_x); \
+ pixman_fixed_t unit_x, \
+ pixman_fixed_t max_vx); \
\
static force_inline void \
scaled_nearest_scanline_##cputype##_##name##_##op (dst_type * pd, \
@@ -248,7 +249,8 @@ scaled_nearest_scanline_##cputype##_##name##_##op (dst_type * pd, \
pixman_bool_t zero_src) \
{ \
pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (w, pd, ps, \
- vx, unit_x);\
+ vx, unit_x, \
+ max_vx); \
} \
\
FAST_NEAREST_MAINLOOP (cputype##_##name##_cover_##op, \
@@ -259,13 +261,17 @@ FAST_NEAREST_MAINLOOP (cputype##_##name##_none_##op, \
src_type, dst_type, NONE) \
FAST_NEAREST_MAINLOOP (cputype##_##name##_pad_##op, \
scaled_nearest_scanline_##cputype##_##name##_##op, \
- src_type, dst_type, PAD)
+ src_type, dst_type, PAD) \
+FAST_NEAREST_MAINLOOP (cputype##_##name##_normal_##op, \
+ scaled_nearest_scanline_##cputype##_##name##_##op, \
+ src_type, dst_type, NORMAL)
/* Provide entries for the fast path table */
#define PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH(op,s,d,func) \
SIMPLE_NEAREST_FAST_PATH_COVER (op,s,d,func), \
SIMPLE_NEAREST_FAST_PATH_NONE (op,s,d,func), \
- SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func)
+ SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func), \
+ SIMPLE_NEAREST_FAST_PATH_NORMAL (op,s,d,func)
#define PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_A8_DST(flags, cputype, name, op, \
src_type, dst_type) \
@@ -276,6 +282,7 @@ pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype ( \
const src_type * src, \
pixman_fixed_t vx, \
pixman_fixed_t unit_x, \
+ pixman_fixed_t max_vx, \
const uint8_t * mask); \
\
static force_inline void \
@@ -292,6 +299,7 @@ scaled_nearest_scanline_##cputype##_##name##_##op (const uint8_t * mask, \
return; \
pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (w, pd, ps, \
vx, unit_x, \
+ max_vx, \
mask); \
} \
\
@@ -303,13 +311,17 @@ FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_none_##op, \
src_type, uint8_t, dst_type, NONE, TRUE, FALSE) \
FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_pad_##op, \
scaled_nearest_scanline_##cputype##_##name##_##op,\
- src_type, uint8_t, dst_type, PAD, TRUE, FALSE)
+ src_type, uint8_t, dst_type, PAD, TRUE, FALSE) \
+FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_normal_##op, \
+ scaled_nearest_scanline_##cputype##_##name##_##op,\
+ src_type, uint8_t, dst_type, NORMAL, TRUE, FALSE)
/* Provide entries for the fast path table */
#define PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH(op,s,d,func) \
SIMPLE_NEAREST_A8_MASK_FAST_PATH_COVER (op,s,d,func), \
SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE (op,s,d,func), \
- SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD (op,s,d,func)
+ SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD (op,s,d,func), \
+ SIMPLE_NEAREST_A8_MASK_FAST_PATH_NORMAL (op,s,d,func)
/*****************************************************************************/
diff --git a/pixman/pixman-arm-neon-asm.h b/pixman/pixman-arm-neon-asm.h
index 97adc6a..1673b08 100644
--- a/pixman/pixman-arm-neon-asm.h
+++ b/pixman/pixman-arm-neon-asm.h
@@ -212,27 +212,39 @@
.macro pixld1_s elem_size, reg1, mem_operand
.if elem_size == 16
mov TMP1, VX, asr #16
- add VX, VX, UNIT_X
+ adds VX, VX, UNIT_X
+5: subpls VX, VX, SRC_WIDTH_FIXED
+ bpl 5b
add TMP1, mem_operand, TMP1, asl #1
mov TMP2, VX, asr #16
- add VX, VX, UNIT_X
+ adds VX, VX, UNIT_X
+5: subpls VX, VX, SRC_WIDTH_FIXED
+ bpl 5b
add TMP2, mem_operand, TMP2, asl #1
vld1.16 {d&reg1&[0]}, [TMP1, :16]
mov TMP1, VX, asr #16
- add VX, VX, UNIT_X
+ adds VX, VX, UNIT_X
+5: subpls VX, VX, SRC_WIDTH_FIXED
+ bpl 5b
add TMP1, mem_operand, TMP1, asl #1
vld1.16 {d&reg1&[1]}, [TMP2, :16]
mov TMP2, VX, asr #16
- add VX, VX, UNIT_X
+ adds VX, VX, UNIT_X
+5: subpls VX, VX, SRC_WIDTH_FIXED
+ bpl 5b
add TMP2, mem_operand, TMP2, asl #1
vld1.16 {d&reg1&[2]}, [TMP1, :16]
vld1.16 {d&reg1&[3]}, [TMP2, :16]
.elseif elem_size == 32
mov TMP1, VX, asr #16
- add VX, VX, UNIT_X
+ adds VX, VX, UNIT_X
+5: subpls VX, VX, SRC_WIDTH_FIXED
+ bpl 5b
add TMP1, mem_operand, TMP1, asl #2
mov TMP2, VX, asr #16
- add VX, VX, UNIT_X
+ adds VX, VX, UNIT_X
+5: subpls VX, VX, SRC_WIDTH_FIXED
+ bpl 5b
add TMP2, mem_operand, TMP2, asl #2
vld1.32 {d&reg1&[0]}, [TMP1, :32]
vld1.32 {d&reg1&[1]}, [TMP2, :32]
@@ -242,7 +254,7 @@
.endm
.macro pixld2_s elem_size, reg1, reg2, mem_operand
-.if elem_size == 32
+.if 0 /* elem_size == 32 */
mov TMP1, VX, asr #16
add VX, VX, UNIT_X, asl #1
add TMP1, mem_operand, TMP1, asl #2
@@ -268,12 +280,16 @@
.macro pixld0_s elem_size, reg1, idx, mem_operand
.if elem_size == 16
mov TMP1, VX, asr #16
- add VX, VX, UNIT_X
+ adds VX, VX, UNIT_X
+5: subpls VX, VX, SRC_WIDTH_FIXED
+ bpl 5b
add TMP1, mem_operand, TMP1, asl #1
vld1.16 {d&reg1&[idx]}, [TMP1, :16]
.elseif elem_size == 32
mov TMP1, VX, asr #16
- add VX, VX, UNIT_X
+ adds VX, VX, UNIT_X
+5: subpls VX, VX, SRC_WIDTH_FIXED
+ bpl 5b
add TMP1, mem_operand, TMP1, asl #2
vld1.32 {d&reg1&[idx]}, [TMP1, :32]
.endif
@@ -964,15 +980,17 @@ fname:
TMP1 .req r4
TMP2 .req r5
DST_R .req r6
+ SRC_WIDTH_FIXED .req r7
.macro pixld_src x:vararg
pixld_s x
.endm
ldr UNIT_X, [sp]
- push {r4-r6, lr}
+ push {r4-r8, lr}
+ ldr SRC_WIDTH_FIXED, [sp, #(24 + 4)]
.if mask_bpp != 0
- ldr MASK, [sp, #(16 + 4)]
+ ldr MASK, [sp, #(24 + 8)]
.endif
.else
/*
@@ -1044,7 +1062,7 @@ fname:
cleanup
.if use_nearest_scaling != 0
- pop {r4-r6, pc} /* exit */
+ pop {r4-r8, pc} /* exit */
.else
bx lr /* exit */
.endif
@@ -1058,7 +1076,7 @@ fname:
cleanup
.if use_nearest_scaling != 0
- pop {r4-r6, pc} /* exit */
+ pop {r4-r8, pc} /* exit */
.unreq DST_R
.unreq SRC
@@ -1069,6 +1087,7 @@ fname:
.unreq TMP2
.unreq DST_W
.unreq MASK
+ .unreq SRC_WIDTH_FIXED
.else
bx lr /* exit */
diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index 8fe1b50..b438001 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -355,49 +355,57 @@ pixman_asm_function pixman_composite_over_n_8_8888_asm_armv6
prefetch_braking_distance
pixman_asm_function fname
- W .req r0
- DST .req r1
- SRC .req r2
- VX .req r3
- UNIT_X .req ip
- TMP1 .req r4
- TMP2 .req r5
- VXMASK .req r6
- PF_OFFS .req r7
+ W .req r0
+ DST .req r1
+ SRC .req r2
+ VX .req r3
+ UNIT_X .req ip
+ TMP1 .req r4
+ TMP2 .req r5
+ VXMASK .req r6
+ PF_OFFS .req r7
+ SRC_WIDTH_FIXED .req r8
ldr UNIT_X, [sp]
- push {r4, r5, r6, r7}
+ push {r4, r5, r6, r7, r8, r10}
mvn VXMASK, #((1 << bpp_shift) - 1)
+ ldr SRC_WIDTH_FIXED, [sp, #28]
/* define helper macro */
.macro scale_2_pixels
ldr&t TMP1, [SRC, TMP1]
- and TMP2, VXMASK, VX, lsr #(16 - bpp_shift)
- add VX, VX, UNIT_X
+ and TMP2, VXMASK, VX, asr #(16 - bpp_shift)
+ adds VX, VX, UNIT_X
str&t TMP1, [DST], #(1 << bpp_shift)
+9: subpls VX, VX, SRC_WIDTH_FIXED
+ bpl 9b
ldr&t TMP2, [SRC, TMP2]
- and TMP1, VXMASK, VX, lsr #(16 - bpp_shift)
- add VX, VX, UNIT_X
+ and TMP1, VXMASK, VX, asr #(16 - bpp_shift)
+ adds VX, VX, UNIT_X
str&t TMP2, [DST], #(1 << bpp_shift)
+9: subpls VX, VX, SRC_WIDTH_FIXED
+ bpl 9b
.endm
/* now do the scaling */
- and TMP1, VXMASK, VX, lsr #(16 - bpp_shift)
- add VX, VX, UNIT_X
+ and TMP1, VXMASK, VX, asr #(16 - bpp_shift)
+ adds VX, VX, UNIT_X
+9: subpls VX, VX, SRC_WIDTH_FIXED
+ bpl 9b
subs W, W, #(8 + prefetch_braking_distance)
blt 2f
/* calculate prefetch offset */
mov PF_OFFS, #prefetch_distance
mla PF_OFFS, UNIT_X, PF_OFFS, VX
1: /* main loop, process 8 pixels per iteration with prefetch */
- subs W, W, #8
+ pld [SRC, PF_OFFS, asr #(16 - bpp_shift)]
add PF_OFFS, UNIT_X, lsl #3
scale_2_pixels
scale_2_pixels
scale_2_pixels
scale_2_pixels
- pld [SRC, PF_OFFS, lsr #(16 - bpp_shift)]
+ subs W, W, #8
bge 1b
2:
subs W, W, #(4 - 8 - prefetch_braking_distance)
@@ -426,8 +434,9 @@ pixman_asm_function fname
.unreq TMP2
.unreq VXMASK
.unreq PF_OFFS
+ .unreq SRC_WIDTH_FIXED
/* return */
- pop {r4, r5, r6, r7}
+ pop {r4, r5, r6, r7, r8, r10}
bx lr
.endfunc
.endm
diff --git a/pixman/pixman-fast-path.c b/pixman/pixman-fast-path.c
index 86ed821..22bfd30 100644
--- a/pixman/pixman-fast-path.c
+++ b/pixman/pixman-fast-path.c
@@ -1415,13 +1415,13 @@ scaled_nearest_scanline_565_565_SRC (uint16_t * dst,
uint16_t tmp1, tmp2, tmp3, tmp4;
while ((w -= 4) >= 0)
{
- tmp1 = src[pixman_fixed_to_int (vx)];
+ tmp1 = *(src + pixman_fixed_to_int (vx));
vx += unit_x;
- tmp2 = src[pixman_fixed_to_int (vx)];
+ tmp2 = *(src + pixman_fixed_to_int (vx));
vx += unit_x;
- tmp3 = src[pixman_fixed_to_int (vx)];
+ tmp3 = *(src + pixman_fixed_to_int (vx));
vx += unit_x;
- tmp4 = src[pixman_fixed_to_int (vx)];
+ tmp4 = *(src + pixman_fixed_to_int (vx));
vx += unit_x;
*dst++ = tmp1;
*dst++ = tmp2;
@@ -1430,15 +1430,15 @@ scaled_nearest_scanline_565_565_SRC (uint16_t * dst,
}
if (w & 2)
{
- tmp1 = src[pixman_fixed_to_int (vx)];
+ tmp1 = *(src + pixman_fixed_to_int (vx));
vx += unit_x;
- tmp2 = src[pixman_fixed_to_int (vx)];
+ tmp2 = *(src + pixman_fixed_to_int (vx));
vx += unit_x;
*dst++ = tmp1;
*dst++ = tmp2;
}
if (w & 1)
- *dst++ = src[pixman_fixed_to_int (vx)];
+ *dst = *(src + pixman_fixed_to_int (vx));
}
FAST_NEAREST_MAINLOOP (565_565_cover_SRC,
diff --git a/pixman/pixman-inlines.h b/pixman/pixman-inlines.h
index 3a3c658..7f2e404 100644
--- a/pixman/pixman-inlines.h
+++ b/pixman/pixman-inlines.h
@@ -271,7 +271,7 @@ scanline_func_name (dst_type_t *dst, \
int32_t w, \
pixman_fixed_t vx, \
pixman_fixed_t unit_x, \
- pixman_fixed_t max_vx, \
+ pixman_fixed_t src_width_fixed, \
pixman_bool_t fully_transparent_src) \
{ \
uint32_t d; \
@@ -287,25 +287,25 @@ scanline_func_name (dst_type_t *dst, \
\
while ((w -= 2) >= 0) \
{ \
- x1 = vx >> 16; \
+ x1 = pixman_fixed_to_int (vx); \
vx += unit_x; \
if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \
{ \
/* This works because we know that unit_x is positive */ \
- while (vx >= max_vx) \
- vx -= max_vx; \
+ while (vx >= 0) \
+ vx -= src_width_fixed; \
} \
- s1 = src[x1]; \
+ s1 = *(src + x1); \
\
- x2 = vx >> 16; \
+ x2 = pixman_fixed_to_int (vx); \
vx += unit_x; \
if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \
{ \
/* This works because we know that unit_x is positive */ \
- while (vx >= max_vx) \
- vx -= max_vx; \
+ while (vx >= 0) \
+ vx -= src_width_fixed; \
} \
- s2 = src[x2]; \
+ s2 = *(src + x2); \
\
if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER) \
{ \
@@ -349,8 +349,8 @@ scanline_func_name (dst_type_t *dst, \
\
if (w & 1) \
{ \
- x1 = vx >> 16; \
- s1 = src[x1]; \
+ x1 = pixman_fixed_to_int (vx); \
+ s1 = *(src + x1); \
\
if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER) \
{ \
@@ -388,7 +388,7 @@ fast_composite_scaled_nearest ## scale_func_name (pixman_implementation_t *imp,
mask_type_t *mask_line; \
src_type_t *src_first_line; \
int y; \
- pixman_fixed_t max_vx = INT32_MAX; /* suppress uninitialized variable warning */ \
+ pixman_fixed_t src_width_fixed = pixman_int_to_fixed (src_image->bits.width); \
pixman_fixed_t max_vy; \
pixman_vector_t v; \
pixman_fixed_t vx, vy; \
@@ -434,11 +434,10 @@ fast_composite_scaled_nearest ## scale_func_name (pixman_implementation_t *imp,
\
if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \
{ \
- /* Clamp repeating positions inside the actual samples */ \
- max_vx = src_image->bits.width << 16; \
- max_vy = src_image->bits.height << 16; \
+ max_vy = pixman_int_to_fixed (src_image->bits.height); \
\
- repeat (PIXMAN_REPEAT_NORMAL, &vx, max_vx); \
+ /* Clamp repeating positions inside the actual samples */ \
+ repeat (PIXMAN_REPEAT_NORMAL, &vx, src_width_fixed); \
repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy); \
} \
\
@@ -460,7 +459,7 @@ fast_composite_scaled_nearest ## scale_func_name (pixman_implementation_t *imp,
mask_line += mask_stride; \
} \
\
- y = vy >> 16; \
+ y = pixman_fixed_to_int (vy); \
vy += unit_y; \
if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \
repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy); \
@@ -470,18 +469,21 @@ fast_composite_scaled_nearest ## scale_func_name (pixman_implementation_t *imp,
src = src_first_line + src_stride * y; \
if (left_pad > 0) \
{ \
- scanline_func (mask, dst, src, left_pad, 0, 0, 0, FALSE); \
+ scanline_func (mask, dst, \
+ src + src_image->bits.width - src_image->bits.width + 1, \
+ left_pad, -pixman_fixed_e, 0, src_width_fixed, FALSE); \
} \
if (width > 0) \
{ \
scanline_func (mask + (mask_is_solid ? 0 : left_pad), \
- dst + left_pad, src, width, vx, unit_x, 0, FALSE); \
+ dst + left_pad, src + src_image->bits.width, width, \
+ vx - src_width_fixed, unit_x, src_width_fixed, FALSE); \
} \
if (right_pad > 0) \
{ \
scanline_func (mask + (mask_is_solid ? 0 : left_pad + width), \
- dst + left_pad + width, src + src_image->bits.width - 1, \
- right_pad, 0, 0, 0, FALSE); \
+ dst + left_pad + width, src + src_image->bits.width, \
+ right_pad, -pixman_fixed_e, 0, src_width_fixed, FALSE); \
} \
} \
else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE) \
@@ -489,29 +491,34 @@ fast_composite_scaled_nearest ## scale_func_name (pixman_implementation_t *imp,
static const src_type_t zero[1] = { 0 }; \
if (y < 0 || y >= src_image->bits.height) \
{ \
- scanline_func (mask, dst, zero, left_pad + width + right_pad, 0, 0, 0, TRUE); \
+ scanline_func (mask, dst, zero + 1, left_pad + width + right_pad, \
+ -pixman_fixed_e, 0, src_width_fixed, TRUE); \
continue; \
} \
src = src_first_line + src_stride * y; \
if (left_pad > 0) \
{ \
- scanline_func (mask, dst, zero, left_pad, 0, 0, 0, TRUE); \
+ scanline_func (mask, dst, zero + 1, left_pad, \
+ -pixman_fixed_e, 0, src_width_fixed, TRUE); \
} \
if (width > 0) \
{ \
scanline_func (mask + (mask_is_solid ? 0 : left_pad), \
- dst + left_pad, src, width, vx, unit_x, 0, FALSE); \
+ dst + left_pad, src + src_image->bits.width, width, \
+ vx - src_width_fixed, unit_x, src_width_fixed, FALSE); \
} \
if (right_pad > 0) \
{ \
scanline_func (mask + (mask_is_solid ? 0 : left_pad + width), \
- dst + left_pad + width, zero, right_pad, 0, 0, 0, TRUE); \
+ dst + left_pad + width, zero + 1, right_pad, \
+ -pixman_fixed_e, 0, src_width_fixed, TRUE); \
} \
} \
else \
{ \
src = src_first_line + src_stride * y; \
- scanline_func (mask, dst, src, width, vx, unit_x, max_vx, FALSE); \
+ scanline_func (mask, dst, src + src_image->bits.width, width, vx - src_width_fixed, \
+ unit_x, src_width_fixed, FALSE); \
} \
} \
}
diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index cf21ef8..efed310 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -5159,7 +5159,7 @@ scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd,
int32_t w,
pixman_fixed_t vx,
pixman_fixed_t unit_x,
- pixman_fixed_t max_vx,
+ pixman_fixed_t src_width_fixed,
pixman_bool_t fully_transparent_src)
{
uint32_t s, d;
@@ -5176,8 +5176,10 @@ scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd,
while (w && ((unsigned long)pd & 15))
{
d = *pd;
- s = combine1 (ps + (vx >> 16), pm);
+ s = combine1 (ps + pixman_fixed_to_int (vx), pm);
vx += unit_x;
+ while (vx >= 0)
+ vx -= src_width_fixed;
*pd++ = core_combine_over_u_pixel_sse2 (s, d);
if (pm)
@@ -5190,14 +5192,22 @@ scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd,
__m128i tmp;
uint32_t tmp1, tmp2, tmp3, tmp4;
- tmp1 = ps[vx >> 16];
+ tmp1 = *(ps + pixman_fixed_to_int (vx));
vx += unit_x;
- tmp2 = ps[vx >> 16];
+ while (vx >= 0)
+ vx -= src_width_fixed;
+ tmp2 = *(ps + pixman_fixed_to_int (vx));
vx += unit_x;
- tmp3 = ps[vx >> 16];
+ while (vx >= 0)
+ vx -= src_width_fixed;
+ tmp3 = *(ps + pixman_fixed_to_int (vx));
vx += unit_x;
- tmp4 = ps[vx >> 16];
+ while (vx >= 0)
+ vx -= src_width_fixed;
+ tmp4 = *(ps + pixman_fixed_to_int (vx));
vx += unit_x;
+ while (vx >= 0)
+ vx -= src_width_fixed;
tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
@@ -5235,8 +5245,10 @@ scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd,
while (w)
{
d = *pd;
- s = combine1 (ps + (vx >> 16), pm);
+ s = combine1 (ps + pixman_fixed_to_int (vx), pm);
vx += unit_x;
+ while (vx >= 0)
+ vx -= src_width_fixed;
*pd++ = core_combine_over_u_pixel_sse2 (s, d);
if (pm)
@@ -5255,6 +5267,9 @@ FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
scaled_nearest_scanline_sse2_8888_8888_OVER,
uint32_t, uint32_t, PAD)
+FAST_NEAREST_MAINLOOP (sse2_8888_8888_normal_OVER,
+ scaled_nearest_scanline_sse2_8888_8888_OVER,
+ uint32_t, uint32_t, NORMAL)
static force_inline void
scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
@@ -5263,7 +5278,7 @@ scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
int32_t w,
pixman_fixed_t vx,
pixman_fixed_t unit_x,
- pixman_fixed_t max_vx,
+ pixman_fixed_t src_width_fixed,
pixman_bool_t zero_src)
{
__m128i xmm_mask;
@@ -5278,8 +5293,10 @@ scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
while (w && (unsigned long)dst & 15)
{
- uint32_t s = src[pixman_fixed_to_int (vx)];
+ uint32_t s = *(src + pixman_fixed_to_int (vx));
vx += unit_x;
+ while (vx >= 0)
+ vx -= src_width_fixed;
if (s)
{
@@ -5301,14 +5318,22 @@ scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
{
uint32_t tmp1, tmp2, tmp3, tmp4;
- tmp1 = src[pixman_fixed_to_int (vx)];
+ tmp1 = *(src + pixman_fixed_to_int (vx));
vx += unit_x;
- tmp2 = src[pixman_fixed_to_int (vx)];
+ while (vx >= 0)
+ vx -= src_width_fixed;
+ tmp2 = *(src + pixman_fixed_to_int (vx));
vx += unit_x;
- tmp3 = src[pixman_fixed_to_int (vx)];
+ while (vx >= 0)
+ vx -= src_width_fixed;
+ tmp3 = *(src + pixman_fixed_to_int (vx));
vx += unit_x;
- tmp4 = src[pixman_fixed_to_int (vx)];
+ while (vx >= 0)
+ vx -= src_width_fixed;
+ tmp4 = *(src + pixman_fixed_to_int (vx));
vx += unit_x;
+ while (vx >= 0)
+ vx -= src_width_fixed;
xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
@@ -5336,8 +5361,10 @@ scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
while (w)
{
- uint32_t s = src[pixman_fixed_to_int (vx)];
+ uint32_t s = *(src + pixman_fixed_to_int (vx));
vx += unit_x;
+ while (vx >= 0)
+ vx -= src_width_fixed;
if (s)
{
@@ -5367,6 +5394,9 @@ FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
scaled_nearest_scanline_sse2_8888_n_8888_OVER,
uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
+FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
+ scaled_nearest_scanline_sse2_8888_n_8888_OVER,
+ uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE)
#define BMSK ((1 << BILINEAR_INTERPOLATION_BITS) - 1)
@@ -5856,11 +5886,19 @@ static const pixman_fast_path_t sse2_fast_paths[] =
SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
+ SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
+ SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
+ SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
+ SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
More information about the xorg-commit
mailing list