pixman: Branch 'master' - 6 commits
Chris Wilson
ickle at kemper.freedesktop.org
Sun Jan 27 06:12:33 PST 2013
pixman/pixman-fast-path.c | 224 +++++++++++++++-----------------
pixman/pixman-general.c | 3
pixman/pixman-glyph.c | 8 -
pixman/pixman-implementation.c | 28 +++-
pixman/pixman-private.h | 4
pixman/pixman-sse2.c | 284 +++++++++++++++++++++++++++++++++++++++++
pixman/pixman.c | 87 +++++-------
7 files changed, 461 insertions(+), 177 deletions(-)
New commits:
commit 794033ed43ed74ad66075a4d0c83fd36565da876
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date: Wed Jan 23 10:27:22 2013 +0000
Eliminate duplicate copies of channel flags for pixman_image_composite32()
Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
diff --git a/pixman/pixman.c b/pixman/pixman.c
index 97a4590..184f0c4 100644
--- a/pixman/pixman.c
+++ b/pixman/pixman.c
@@ -581,7 +581,6 @@ pixman_image_composite32 (pixman_op_t op,
int32_t height)
{
pixman_format_code_t src_format, mask_format, dest_format;
- uint32_t src_flags, mask_flags, dest_flags;
pixman_region32_t region;
pixman_box32_t extents;
pixman_implementation_t *imp;
@@ -596,27 +595,27 @@ pixman_image_composite32 (pixman_op_t op,
_pixman_image_validate (dest);
src_format = src->common.extended_format_code;
- src_flags = src->common.flags;
+ info.src_flags = src->common.flags;
if (mask && !(mask->common.flags & FAST_PATH_IS_OPAQUE))
{
mask_format = mask->common.extended_format_code;
- mask_flags = mask->common.flags;
+ info.mask_flags = mask->common.flags;
}
else
{
mask_format = PIXMAN_null;
- mask_flags = FAST_PATH_IS_OPAQUE;
+ info.mask_flags = FAST_PATH_IS_OPAQUE;
}
dest_format = dest->common.extended_format_code;
- dest_flags = dest->common.flags;
+ info.dest_flags = dest->common.flags;
/* Check for pixbufs */
if ((mask_format == PIXMAN_a8r8g8b8 || mask_format == PIXMAN_a8b8g8r8) &&
(src->type == BITS && src->bits.bits == mask->bits.bits) &&
(src->common.repeat == mask->common.repeat) &&
- (src_flags & mask_flags & FAST_PATH_ID_TRANSFORM) &&
+ (info.src_flags & info.mask_flags & FAST_PATH_ID_TRANSFORM) &&
(src_x == mask_x && src_y == mask_y))
{
if (src_format == PIXMAN_x8b8g8r8)
@@ -641,7 +640,7 @@ pixman_image_composite32 (pixman_op_t op,
extents.x2 -= dest_x - src_x;
extents.y2 -= dest_y - src_y;
- if (!analyze_extent (src, &extents, &src_flags))
+ if (!analyze_extent (src, &extents, &info.src_flags))
goto out;
extents.x1 -= src_x - mask_x;
@@ -649,7 +648,7 @@ pixman_image_composite32 (pixman_op_t op,
extents.x2 -= src_x - mask_x;
extents.y2 -= src_y - mask_y;
- if (!analyze_extent (mask, &extents, &mask_flags))
+ if (!analyze_extent (mask, &extents, &info.mask_flags))
goto out;
/* If the clip is within the source samples, and the samples are
@@ -662,16 +661,16 @@ pixman_image_composite32 (pixman_op_t op,
FAST_PATH_BILINEAR_FILTER | \
FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR)
- if ((src_flags & NEAREST_OPAQUE) == NEAREST_OPAQUE ||
- (src_flags & BILINEAR_OPAQUE) == BILINEAR_OPAQUE)
+ if ((info.src_flags & NEAREST_OPAQUE) == NEAREST_OPAQUE ||
+ (info.src_flags & BILINEAR_OPAQUE) == BILINEAR_OPAQUE)
{
- src_flags |= FAST_PATH_IS_OPAQUE;
+ info.src_flags |= FAST_PATH_IS_OPAQUE;
}
- if ((mask_flags & NEAREST_OPAQUE) == NEAREST_OPAQUE ||
- (mask_flags & BILINEAR_OPAQUE) == BILINEAR_OPAQUE)
+ if ((info.mask_flags & NEAREST_OPAQUE) == NEAREST_OPAQUE ||
+ (info.mask_flags & BILINEAR_OPAQUE) == BILINEAR_OPAQUE)
{
- mask_flags |= FAST_PATH_IS_OPAQUE;
+ info.mask_flags |= FAST_PATH_IS_OPAQUE;
}
/*
@@ -679,20 +678,18 @@ pixman_image_composite32 (pixman_op_t op,
* if the src or dest are opaque. The output operator should be
* mathematically equivalent to the source.
*/
- op = optimize_operator (op, src_flags, mask_flags, dest_flags);
+ info.op = optimize_operator (op, info.src_flags, info.mask_flags, info.dest_flags);
_pixman_implementation_lookup_composite (
- get_implementation (), op,
- src_format, src_flags, mask_format, mask_flags, dest_format, dest_flags,
+ get_implementation (), info.op,
+ src_format, info.src_flags,
+ mask_format, info.mask_flags,
+ dest_format, info.dest_flags,
&imp, &func);
- info.op = op;
info.src_image = src;
info.mask_image = mask;
info.dest_image = dest;
- info.src_flags = src_flags;
- info.mask_flags = mask_flags;
- info.dest_flags = dest_flags;
pbox = pixman_region32_rectangles (&region, &n);
commit a59f081df45ec5c15b295bb31b22dbe787e2f2b1
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date: Sat Jan 12 16:52:47 2013 +0000
Always return a valid function from lookup_combiner()
We should always have at least a C combiner available, so we never
expect the search to fail. If it does, emit an error and return a
dummy function.
Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
diff --git a/pixman/pixman-general.c b/pixman/pixman-general.c
index f175d77..93a1b9a 100644
--- a/pixman/pixman-general.c
+++ b/pixman/pixman-general.c
@@ -188,9 +188,6 @@ general_composite_rect (pixman_implementation_t *imp,
compose = _pixman_implementation_lookup_combiner (
imp->toplevel, op, component_alpha, narrow);
- if (!compose)
- return;
-
for (i = 0; i < height; ++i)
{
uint32_t *s, *m, *d;
diff --git a/pixman/pixman-implementation.c b/pixman/pixman-implementation.c
index 05cb5ea..c0a6436 100644
--- a/pixman/pixman-implementation.c
+++ b/pixman/pixman-implementation.c
@@ -172,6 +172,16 @@ update_cache:
}
}
+static void
+dummy_combine (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
+{
+}
+
pixman_combine_32_func_t
_pixman_implementation_lookup_combiner (pixman_implementation_t *imp,
pixman_op_t op,
@@ -207,7 +217,9 @@ _pixman_implementation_lookup_combiner (pixman_implementation_t *imp,
imp = imp->fallback;
}
- return NULL;
+ /* We should never reach this point */
+ _pixman_log_error (FUNC, "No known combine function\n");
+ return dummy_combine;
}
pixman_bool_t
commit 520230914bbb56473b872f2ef7dc59092f426415
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date: Sat Jan 12 08:28:32 2013 +0000
Always return a valid function from lookup_composite()
We never expect to fail to find the appropriate function as the
general_composite_rect should always match. So if somehow we fall through
the search, emit a _pixman_log_error() and return a dummy function.
Note that we remove some conditionals and a level of indentation hence a
large amount of code movement. This also reveals that in a few places we
are duplicating stack variables that can be eliminated later.
Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
diff --git a/pixman/pixman-fast-path.c b/pixman/pixman-fast-path.c
index c625e0c..1ac2d11 100644
--- a/pixman/pixman-fast-path.c
+++ b/pixman/pixman-fast-path.c
@@ -1243,6 +1243,18 @@ fast_composite_tiled_repeat (pixman_implementation_t *imp,
pixman_composite_func_t func;
pixman_format_code_t mask_format;
uint32_t src_flags, mask_flags;
+ int32_t sx, sy;
+ int32_t width_remain;
+ int32_t num_pixels;
+ int32_t src_width;
+ int32_t i, j;
+ pixman_image_t extended_src_image;
+ uint32_t extended_src[REPEAT_MIN_WIDTH * 2];
+ pixman_bool_t need_src_extension;
+ uint32_t *src_line;
+ int32_t src_stride;
+ int32_t src_bpp;
+ pixman_composite_info_t info2 = *info;
src_flags = (info->src_flags & ~FAST_PATH_NORMAL_REPEAT) |
FAST_PATH_SAMPLES_COVER_CLIP_NEAREST;
@@ -1258,149 +1270,131 @@ fast_composite_tiled_repeat (pixman_implementation_t *imp,
mask_flags = FAST_PATH_IS_OPAQUE;
}
- if (_pixman_implementation_lookup_composite (
- imp->toplevel, info->op,
- src_image->common.extended_format_code, src_flags,
- mask_format, mask_flags,
- dest_image->common.extended_format_code, info->dest_flags,
- &imp, &func))
+ _pixman_implementation_lookup_composite (
+ imp->toplevel, info->op,
+ src_image->common.extended_format_code, src_flags,
+ mask_format, mask_flags,
+ dest_image->common.extended_format_code, info->dest_flags,
+ &imp, &func);
+
+ src_bpp = PIXMAN_FORMAT_BPP (src_image->bits.format);
+
+ if (src_image->bits.width < REPEAT_MIN_WIDTH &&
+ (src_bpp == 32 || src_bpp == 16 || src_bpp == 8) &&
+ !src_image->bits.indexed)
{
- int32_t sx, sy;
- int32_t width_remain;
- int32_t num_pixels;
- int32_t src_width;
- int32_t i, j;
- pixman_image_t extended_src_image;
- uint32_t extended_src[REPEAT_MIN_WIDTH * 2];
- pixman_bool_t need_src_extension;
- uint32_t *src_line;
- int32_t src_stride;
- int32_t src_bpp;
- pixman_composite_info_t info2 = *info;
-
- src_bpp = PIXMAN_FORMAT_BPP (src_image->bits.format);
-
- if (src_image->bits.width < REPEAT_MIN_WIDTH &&
- (src_bpp == 32 || src_bpp == 16 || src_bpp == 8) &&
- !src_image->bits.indexed)
- {
- sx = src_x;
- sx = MOD (sx, src_image->bits.width);
- sx += width;
- src_width = 0;
+ sx = src_x;
+ sx = MOD (sx, src_image->bits.width);
+ sx += width;
+ src_width = 0;
- while (src_width < REPEAT_MIN_WIDTH && src_width <= sx)
- src_width += src_image->bits.width;
+ while (src_width < REPEAT_MIN_WIDTH && src_width <= sx)
+ src_width += src_image->bits.width;
- src_stride = (src_width * (src_bpp >> 3) + 3) / (int) sizeof (uint32_t);
+ src_stride = (src_width * (src_bpp >> 3) + 3) / (int) sizeof (uint32_t);
- /* Initialize/validate stack-allocated temporary image */
- _pixman_bits_image_init (&extended_src_image, src_image->bits.format,
- src_width, 1, &extended_src[0], src_stride,
- FALSE);
- _pixman_image_validate (&extended_src_image);
+ /* Initialize/validate stack-allocated temporary image */
+ _pixman_bits_image_init (&extended_src_image, src_image->bits.format,
+ src_width, 1, &extended_src[0], src_stride,
+ FALSE);
+ _pixman_image_validate (&extended_src_image);
- info2.src_image = &extended_src_image;
- need_src_extension = TRUE;
- }
- else
- {
- src_width = src_image->bits.width;
- need_src_extension = FALSE;
- }
+ info2.src_image = &extended_src_image;
+ need_src_extension = TRUE;
+ }
+ else
+ {
+ src_width = src_image->bits.width;
+ need_src_extension = FALSE;
+ }
- sx = src_x;
- sy = src_y;
+ sx = src_x;
+ sy = src_y;
- while (--height >= 0)
- {
- sx = MOD (sx, src_width);
- sy = MOD (sy, src_image->bits.height);
+ while (--height >= 0)
+ {
+ sx = MOD (sx, src_width);
+ sy = MOD (sy, src_image->bits.height);
- if (need_src_extension)
+ if (need_src_extension)
+ {
+ if (src_bpp == 32)
{
- if (src_bpp == 32)
- {
- PIXMAN_IMAGE_GET_LINE (src_image, 0, sy, uint32_t, src_stride, src_line, 1);
+ PIXMAN_IMAGE_GET_LINE (src_image, 0, sy, uint32_t, src_stride, src_line, 1);
- for (i = 0; i < src_width; )
- {
- for (j = 0; j < src_image->bits.width; j++, i++)
- extended_src[i] = src_line[j];
- }
- }
- else if (src_bpp == 16)
+ for (i = 0; i < src_width; )
{
- uint16_t *src_line_16;
-
- PIXMAN_IMAGE_GET_LINE (src_image, 0, sy, uint16_t, src_stride,
- src_line_16, 1);
- src_line = (uint32_t*)src_line_16;
-
- for (i = 0; i < src_width; )
- {
- for (j = 0; j < src_image->bits.width; j++, i++)
- ((uint16_t*)extended_src)[i] = ((uint16_t*)src_line)[j];
- }
+ for (j = 0; j < src_image->bits.width; j++, i++)
+ extended_src[i] = src_line[j];
}
- else if (src_bpp == 8)
- {
- uint8_t *src_line_8;
+ }
+ else if (src_bpp == 16)
+ {
+ uint16_t *src_line_16;
- PIXMAN_IMAGE_GET_LINE (src_image, 0, sy, uint8_t, src_stride,
- src_line_8, 1);
- src_line = (uint32_t*)src_line_8;
+ PIXMAN_IMAGE_GET_LINE (src_image, 0, sy, uint16_t, src_stride,
+ src_line_16, 1);
+ src_line = (uint32_t*)src_line_16;
- for (i = 0; i < src_width; )
- {
- for (j = 0; j < src_image->bits.width; j++, i++)
- ((uint8_t*)extended_src)[i] = ((uint8_t*)src_line)[j];
- }
+ for (i = 0; i < src_width; )
+ {
+ for (j = 0; j < src_image->bits.width; j++, i++)
+ ((uint16_t*)extended_src)[i] = ((uint16_t*)src_line)[j];
}
-
- info2.src_y = 0;
}
- else
+ else if (src_bpp == 8)
{
- info2.src_y = sy;
+ uint8_t *src_line_8;
+
+ PIXMAN_IMAGE_GET_LINE (src_image, 0, sy, uint8_t, src_stride,
+ src_line_8, 1);
+ src_line = (uint32_t*)src_line_8;
+
+ for (i = 0; i < src_width; )
+ {
+ for (j = 0; j < src_image->bits.width; j++, i++)
+ ((uint8_t*)extended_src)[i] = ((uint8_t*)src_line)[j];
+ }
}
- width_remain = width;
+ info2.src_y = 0;
+ }
+ else
+ {
+ info2.src_y = sy;
+ }
- while (width_remain > 0)
- {
- num_pixels = src_width - sx;
+ width_remain = width;
- if (num_pixels > width_remain)
- num_pixels = width_remain;
+ while (width_remain > 0)
+ {
+ num_pixels = src_width - sx;
- info2.src_x = sx;
- info2.width = num_pixels;
- info2.height = 1;
+ if (num_pixels > width_remain)
+ num_pixels = width_remain;
- func (imp, &info2);
+ info2.src_x = sx;
+ info2.width = num_pixels;
+ info2.height = 1;
- width_remain -= num_pixels;
- info2.mask_x += num_pixels;
- info2.dest_x += num_pixels;
- sx = 0;
- }
+ func (imp, &info2);
- sx = src_x;
- sy++;
- info2.mask_x = info->mask_x;
- info2.mask_y++;
- info2.dest_x = info->dest_x;
- info2.dest_y++;
+ width_remain -= num_pixels;
+ info2.mask_x += num_pixels;
+ info2.dest_x += num_pixels;
+ sx = 0;
}
- if (need_src_extension)
- _pixman_image_fini (&extended_src_image);
- }
- else
- {
- _pixman_log_error (FUNC, "Didn't find a suitable function ");
+ sx = src_x;
+ sy++;
+ info2.mask_x = info->mask_x;
+ info2.mask_y++;
+ info2.dest_x = info->dest_x;
+ info2.dest_y++;
}
+
+ if (need_src_extension)
+ _pixman_image_fini (&extended_src_image);
}
/* Use more unrolling for src_0565_0565 because it is typically CPU bound */
diff --git a/pixman/pixman-glyph.c b/pixman/pixman-glyph.c
index 6d2c8bb..5a271b6 100644
--- a/pixman/pixman-glyph.c
+++ b/pixman/pixman-glyph.c
@@ -463,16 +463,13 @@ pixman_composite_glyphs_no_mask (pixman_op_t op,
{
glyph_format = glyph_img->common.extended_format_code;
glyph_flags = glyph_img->common.flags;
-
+
_pixman_implementation_lookup_composite (
get_implementation(), op,
src->common.extended_format_code, src->common.flags,
glyph_format, glyph_flags | extra,
dest_format, dest_flags,
&implementation, &func);
-
- if (!func)
- goto out;
}
info.src_x = src_x + composite_box.x1 - dest_x;
@@ -582,9 +579,6 @@ add_glyphs (pixman_glyph_cache_t *cache,
mask_format, info.mask_flags,
dest_format, dest_flags,
&implementation, &func);
-
- if (!func)
- goto out;
}
glyph_box.x1 = glyphs[i].x - glyph->origin_x + off_x;
diff --git a/pixman/pixman-implementation.c b/pixman/pixman-implementation.c
index ec467a6..05cb5ea 100644
--- a/pixman/pixman-implementation.c
+++ b/pixman/pixman-implementation.c
@@ -65,7 +65,13 @@ typedef struct
PIXMAN_DEFINE_THREAD_LOCAL (cache_t, fast_path_cache);
-pixman_bool_t
+static void
+dummy_composite_rect (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
+{
+}
+
+void
_pixman_implementation_lookup_composite (pixman_implementation_t *toplevel,
pixman_op_t op,
pixman_format_code_t src_format,
@@ -142,7 +148,11 @@ _pixman_implementation_lookup_composite (pixman_implementation_t *toplevel,
++info;
}
}
- return FALSE;
+
+ /* We should never reach this point */
+ _pixman_log_error (FUNC, "No known composite function\n");
+ *out_imp = NULL;
+ *out_func = dummy_composite_rect;
update_cache:
if (i)
@@ -160,8 +170,6 @@ update_cache:
cache->cache[0].fast_path.dest_flags = dest_flags;
cache->cache[0].fast_path.func = *out_func;
}
-
- return TRUE;
}
pixman_combine_32_func_t
diff --git a/pixman/pixman-private.h b/pixman/pixman-private.h
index e5ab873..3981873 100644
--- a/pixman/pixman-private.h
+++ b/pixman/pixman-private.h
@@ -497,7 +497,7 @@ pixman_implementation_t *
_pixman_implementation_create (pixman_implementation_t *fallback,
const pixman_fast_path_t *fast_paths);
-pixman_bool_t
+void
_pixman_implementation_lookup_composite (pixman_implementation_t *toplevel,
pixman_op_t op,
pixman_format_code_t src_format,
@@ -1052,7 +1052,7 @@ _pixman_log_error (const char *function, const char *message);
#else
-#define _pixman_log_error(f,m) do { } while (0) \
+#define _pixman_log_error(f,m) do { } while (0)
#define return_if_fail(expr) \
do \
diff --git a/pixman/pixman.c b/pixman/pixman.c
index 3fabed1..97a4590 100644
--- a/pixman/pixman.c
+++ b/pixman/pixman.c
@@ -586,6 +586,9 @@ pixman_image_composite32 (pixman_op_t op,
pixman_box32_t extents;
pixman_implementation_t *imp;
pixman_composite_func_t func;
+ pixman_composite_info_t info;
+ const pixman_box32_t *pbox;
+ int n;
_pixman_image_validate (src);
if (mask)
@@ -678,40 +681,35 @@ pixman_image_composite32 (pixman_op_t op,
*/
op = optimize_operator (op, src_flags, mask_flags, dest_flags);
- if (_pixman_implementation_lookup_composite (
- get_implementation (), op,
- src_format, src_flags, mask_format, mask_flags, dest_format, dest_flags,
- &imp, &func))
- {
- pixman_composite_info_t info;
- const pixman_box32_t *pbox;
- int n;
+ _pixman_implementation_lookup_composite (
+ get_implementation (), op,
+ src_format, src_flags, mask_format, mask_flags, dest_format, dest_flags,
+ &imp, &func);
- info.op = op;
- info.src_image = src;
- info.mask_image = mask;
- info.dest_image = dest;
- info.src_flags = src_flags;
- info.mask_flags = mask_flags;
- info.dest_flags = dest_flags;
+ info.op = op;
+ info.src_image = src;
+ info.mask_image = mask;
+ info.dest_image = dest;
+ info.src_flags = src_flags;
+ info.mask_flags = mask_flags;
+ info.dest_flags = dest_flags;
- pbox = pixman_region32_rectangles (&region, &n);
+ pbox = pixman_region32_rectangles (&region, &n);
- while (n--)
- {
- info.src_x = pbox->x1 + src_x - dest_x;
- info.src_y = pbox->y1 + src_y - dest_y;
- info.mask_x = pbox->x1 + mask_x - dest_x;
- info.mask_y = pbox->y1 + mask_y - dest_y;
- info.dest_x = pbox->x1;
- info.dest_y = pbox->y1;
- info.width = pbox->x2 - pbox->x1;
- info.height = pbox->y2 - pbox->y1;
-
- func (imp, &info);
-
- pbox++;
- }
+ while (n--)
+ {
+ info.src_x = pbox->x1 + src_x - dest_x;
+ info.src_y = pbox->y1 + src_y - dest_y;
+ info.mask_x = pbox->x1 + mask_x - dest_x;
+ info.mask_y = pbox->y1 + mask_y - dest_y;
+ info.dest_x = pbox->x1;
+ info.dest_y = pbox->y1;
+ info.width = pbox->x2 - pbox->x1;
+ info.height = pbox->y2 - pbox->y1;
+
+ func (imp, &info);
+
+ pbox++;
}
out:
commit b283c864a3de039f9213adaf402c6597db12d0c4
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date: Tue Jan 8 18:39:03 2013 +0000
sse2: Add fast paths for bilinear source with a solid mask
Based on the existing sse2_8888_n_8888 nearest scaling routines.
fishbowl on an i5-2500: 60.9s -> 56.9s
Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index ff8c946..fc873cc 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -5942,6 +5942,121 @@ FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_normal_OVER,
uint32_t, uint8_t, uint32_t,
NORMAL, FLAG_HAVE_NON_SOLID_MASK)
+static force_inline void
+scaled_bilinear_scanline_sse2_8888_n_8888_OVER (uint32_t * dst,
+ const uint32_t * mask,
+ const uint32_t * src_top,
+ const uint32_t * src_bottom,
+ int32_t w,
+ int wt,
+ int wb,
+ pixman_fixed_t vx,
+ pixman_fixed_t unit_x,
+ pixman_fixed_t max_vx,
+ pixman_bool_t zero_src)
+{
+ BILINEAR_DECLARE_VARIABLES;
+ uint32_t pix1, pix2, pix3, pix4;
+ __m128i xmm_mask;
+
+ if (zero_src || (*mask >> 24) == 0)
+ return;
+
+ xmm_mask = create_mask_16_128 (*mask >> 24);
+
+ while (w && ((uintptr_t)dst & 15))
+ {
+ BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+ if (pix1)
+ {
+ uint32_t d = *dst;
+
+ __m128i ms = unpack_32_1x128 (pix1);
+ __m128i alpha = expand_alpha_1x128 (ms);
+ __m128i dest = xmm_mask;
+ __m128i alpha_dst = unpack_32_1x128 (d);
+
+ *dst = pack_1x128_32
+ (in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
+ }
+
+ dst++;
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+ BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
+ BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
+ BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
+
+ if (pix1 | pix2 | pix3 | pix4)
+ {
+ __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_alpha_lo, xmm_alpha_hi;
+
+ xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
+
+ xmm_dst = load_128_aligned ((__m128i*)dst);
+
+ unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi);
+
+ in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi,
+ &xmm_mask, &xmm_mask,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned
+ ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+ }
+
+ dst += 4;
+ w -= 4;
+ }
+
+ while (w)
+ {
+ BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+ if (pix1)
+ {
+ uint32_t d = *dst;
+
+ __m128i ms = unpack_32_1x128 (pix1);
+ __m128i alpha = expand_alpha_1x128 (ms);
+ __m128i dest = xmm_mask;
+ __m128i alpha_dst = unpack_32_1x128 (d);
+
+ *dst = pack_1x128_32
+ (in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
+ }
+
+ dst++;
+ w--;
+ }
+}
+
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
+ scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
+ uint32_t, uint32_t, uint32_t,
+ COVER, FLAG_HAVE_SOLID_MASK)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
+ scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
+ uint32_t, uint32_t, uint32_t,
+ PAD, FLAG_HAVE_SOLID_MASK)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
+ scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
+ uint32_t, uint32_t, uint32_t,
+ NONE, FLAG_HAVE_SOLID_MASK)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
+ scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
+ uint32_t, uint32_t, uint32_t,
+ NORMAL, FLAG_HAVE_SOLID_MASK)
+
static const pixman_fast_path_t sse2_fast_paths[] =
{
/* PIXMAN_OP_OVER */
@@ -6076,6 +6191,11 @@ static const pixman_fast_path_t sse2_fast_paths[] =
SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
+ SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
+ SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
+ SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
+ SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
+
SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8_8888),
SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8_8888),
SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8_8888),
commit d00ce4091215e8a648c6f1912829b35c02b06add
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date: Tue Jan 1 19:41:54 2013 +0000
sse2: Add a fast path for add_n_8_8888
This path is being exercised by compositing of trapezoids for clipmasks, for
instance as used in the firefox-asteroids cairo-trace.
IVB i7-3720qm ./tests/lowlevel-blt-bench add_n_8_8888:
reference memcpy speed = 14846.7MB/s (3711.7MP/s for 32bpp fills)
before: L1: 681.10 L2: 735.14 M:701.44 ( 28.35%) HT:283.32 VT:213.23 R:208.93 RT: 77.89 ( 793Kops/s)
after: L1: 992.91 L2:1017.33 M:982.58 ( 39.88%) HT:458.93 VT:332.32 R:326.13 RT:136.66 (1287Kops/s)
Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index f4a7d51..ff8c946 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -4586,6 +4586,101 @@ sse2_composite_add_n_8888 (pixman_implementation_t *imp,
}
}
+static void
+sse2_composite_add_n_8_8888 (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
+{
+ PIXMAN_COMPOSITE_ARGS (info);
+ uint32_t *dst_line, *dst;
+ uint8_t *mask_line, *mask;
+ int dst_stride, mask_stride;
+ int32_t w;
+ uint32_t src;
+
+ __m128i xmm_src;
+
+ src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+ if (src == 0)
+ return;
+ xmm_src = expand_pixel_32_1x128 (src);
+
+ PIXMAN_IMAGE_GET_LINE (
+ dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+ w = width;
+
+ while (w && ((unsigned long)dst & 15))
+ {
+ uint8_t m = *mask++;
+ if (m)
+ {
+ *dst = pack_1x128_32
+ (_mm_adds_epu16
+ (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)),
+ unpack_32_1x128 (*dst)));
+ }
+ dst++;
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ uint32_t m = *(uint32_t*)mask;
+ if (m)
+ {
+ __m128i xmm_mask_lo, xmm_mask_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+
+ __m128i xmm_dst = load_128_aligned ((__m128i*)dst);
+ __m128i xmm_mask =
+ _mm_unpacklo_epi8 (unpack_32_1x128(m),
+ _mm_setzero_si128 ());
+
+ unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+ expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+
+ pix_multiply_2x128 (&xmm_src, &xmm_src,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+
+ xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
+ xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+ }
+
+ w -= 4;
+ dst += 4;
+ mask += 4;
+ }
+
+ while (w)
+ {
+ uint8_t m = *mask++;
+ if (m)
+ {
+ *dst = pack_1x128_32
+ (_mm_adds_epu16
+ (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)),
+ unpack_32_1x128 (*dst)));
+ }
+ dst++;
+ w--;
+ }
+ }
+}
static pixman_bool_t
sse2_blt (pixman_implementation_t *imp,
@@ -5913,6 +6008,10 @@ static const pixman_fast_path_t sse2_fast_paths[] =
PIXMAN_STD_FAST_PATH (ADD, solid, null, a8r8g8b8, sse2_composite_add_n_8888),
PIXMAN_STD_FAST_PATH (ADD, solid, null, x8b8g8r8, sse2_composite_add_n_8888),
PIXMAN_STD_FAST_PATH (ADD, solid, null, a8b8g8r8, sse2_composite_add_n_8888),
+ PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8r8g8b8, sse2_composite_add_n_8_8888),
+ PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8r8g8b8, sse2_composite_add_n_8_8888),
+ PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8b8g8r8, sse2_composite_add_n_8_8888),
+ PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8b8g8r8, sse2_composite_add_n_8_8888),
/* PIXMAN_OP_SRC */
PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
commit 7ced3beec99e9965717f76cc822d0702383a1fce
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date: Tue Jan 1 19:41:54 2013 +0000
sse2: Add a fast path for add_n_8888
This path is being exercised by inplace compositing of trapezoids, for
instance as used in the firefox-asteroids cairo-trace.
IVB i7-3720qm ./tests/lowlevel-blt-bench add_n_8888:
reference memcpy speed = 14918.3MB/s (3729.6MP/s for 32bpp fills)
before: L1:1752.44 L2:2259.48 M:2215.73 ( 58.80%) HT:589.49 VT:404.04 R:424.69 RT:134.68 (1182Kops/s)
after: L1:3931.21 L2:6132.78 M:3440.17 ( 92.24%) HT:1337.70 VT:1357.64 R:1270.27 RT:359.78 (2161Kops/s)
Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index 5a0e062..f4a7d51 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -4523,9 +4523,70 @@ sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
sse2_combine_add_u (imp, op, dst, src, NULL, width);
}
+}
+
+static void
+sse2_composite_add_n_8888 (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
+{
+ PIXMAN_COMPOSITE_ARGS (info);
+ uint32_t *dst_line, *dst, src;
+ int dst_stride;
+
+ __m128i xmm_src;
+
+ PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+
+ src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+ if (src == 0)
+ return;
+
+ if (src == ~0)
+ {
+ pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride, 32,
+ dest_x, dest_y, width, height, ~0);
+
+ return;
+ }
+
+ xmm_src = _mm_set_epi32 (src, src, src, src);
+ while (height--)
+ {
+ int w = width;
+ uint32_t d;
+
+ dst = dst_line;
+ dst_line += dst_stride;
+
+ while (w && (unsigned long)dst & 15)
+ {
+ d = *dst;
+ *dst++ =
+ _mm_cvtsi128_si32 ( _mm_adds_epu8 (xmm_src, _mm_cvtsi32_si128 (d)));
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ save_128_aligned
+ ((__m128i*)dst,
+ _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));
+
+ dst += 4;
+ w -= 4;
+ }
+ while (w--)
+ {
+ d = *dst;
+ *dst++ =
+ _mm_cvtsi128_si32 (_mm_adds_epu8 (xmm_src,
+ _mm_cvtsi32_si128 (d)));
+ }
+ }
}
+
static pixman_bool_t
sse2_blt (pixman_implementation_t *imp,
uint32_t * src_bits,
@@ -5848,6 +5909,10 @@ static const pixman_fast_path_t sse2_fast_paths[] =
PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),
+ PIXMAN_STD_FAST_PATH (ADD, solid, null, x8r8g8b8, sse2_composite_add_n_8888),
+ PIXMAN_STD_FAST_PATH (ADD, solid, null, a8r8g8b8, sse2_composite_add_n_8888),
+ PIXMAN_STD_FAST_PATH (ADD, solid, null, x8b8g8r8, sse2_composite_add_n_8888),
+ PIXMAN_STD_FAST_PATH (ADD, solid, null, a8b8g8r8, sse2_composite_add_n_8888),
/* PIXMAN_OP_SRC */
PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
More information about the xorg-commit
mailing list