xf86-video-intel: 5 commits - src/sna/compiler.h src/sna/gen2_render.c src/sna/gen3_render.c src/sna/sna_glyphs.c src/sna/sna_trapezoids.c
Chris Wilson
ickle at kemper.freedesktop.org
Tue Feb 26 11:09:15 PST 2013
src/sna/compiler.h | 6
src/sna/gen2_render.c | 326 +++++++++++++++++++++++++-
src/sna/gen3_render.c | 586 +++++++++++++++++++++++++++++++++++++++++++++--
src/sna/sna_glyphs.c | 17 -
src/sna/sna_trapezoids.c | 4
5 files changed, 891 insertions(+), 48 deletions(-)
New commits:
commit 11b72628cb54ab0b78a0969fa8fabb591f6cf93f
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date: Tue Feb 26 19:08:58 2013 +0000
sna/gen2: Add SSE2 fast paths for vertex emission
Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
diff --git a/src/sna/gen2_render.c b/src/sna/gen2_render.c
index 4d92adc..58f2578 100644
--- a/src/sna/gen2_render.c
+++ b/src/sna/gen2_render.c
@@ -959,6 +959,124 @@ gen2_emit_composite_primitive_constant_identity_mask(struct sna *sna,
v[7] = v[3] = v[11] + h * op->mask.scale[1];
}
+#if defined(sse2) && !defined(__x86_64__)
+sse2 fastcall static void
+gen2_emit_composite_primitive_constant__sse2(struct sna *sna,
+ const struct sna_composite_op *op,
+ const struct sna_composite_rectangles *r)
+{
+ int16_t dst_x = r->dst.x + op->dst.x;
+ int16_t dst_y = r->dst.y + op->dst.y;
+
+ gen2_emit_composite_dstcoord(sna, dst_x + r->width, dst_y + r->height);
+ gen2_emit_composite_dstcoord(sna, dst_x, dst_y + r->height);
+ gen2_emit_composite_dstcoord(sna, dst_x, dst_y);
+}
+
+sse2 fastcall static void
+gen2_emit_composite_primitive_linear__sse2(struct sna *sna,
+ const struct sna_composite_op *op,
+ const struct sna_composite_rectangles *r)
+{
+ int16_t dst_x = r->dst.x + op->dst.x;
+ int16_t dst_y = r->dst.y + op->dst.y;
+
+ gen2_emit_composite_dstcoord(sna, dst_x + r->width, dst_y + r->height);
+ gen2_emit_composite_linear(sna, &op->src,
+ r->src.x + r->width, r->src.y + r->height);
+
+ gen2_emit_composite_dstcoord(sna, dst_x, dst_y + r->height);
+ gen2_emit_composite_linear(sna, &op->src,
+ r->src.x, r->src.y + r->height);
+
+ gen2_emit_composite_dstcoord(sna, dst_x, dst_y);
+ gen2_emit_composite_linear(sna, &op->src,
+ r->src.x, r->src.y);
+}
+
+sse2 fastcall static void
+gen2_emit_composite_primitive_identity__sse2(struct sna *sna,
+ const struct sna_composite_op *op,
+ const struct sna_composite_rectangles *r)
+{
+ float w = r->width;
+ float h = r->height;
+ float *v;
+
+ v = (float *)sna->kgem.batch + sna->kgem.nbatch;
+ sna->kgem.nbatch += 12;
+
+ v[8] = v[4] = r->dst.x + op->dst.x;
+ v[0] = v[4] + w;
+
+ v[9] = r->dst.y + op->dst.y;
+ v[5] = v[1] = v[9] + h;
+
+ v[10] = v[6] = (r->src.x + op->src.offset[0]) * op->src.scale[0];
+ v[2] = v[6] + w * op->src.scale[0];
+
+ v[11] = (r->src.y + op->src.offset[1]) * op->src.scale[1];
+ v[7] = v[3] = v[11] + h * op->src.scale[1];
+}
+
+sse2 fastcall static void
+gen2_emit_composite_primitive_affine__sse2(struct sna *sna,
+ const struct sna_composite_op *op,
+ const struct sna_composite_rectangles *r)
+{
+ PictTransform *transform = op->src.transform;
+ int src_x = r->src.x + (int)op->src.offset[0];
+ int src_y = r->src.y + (int)op->src.offset[1];
+ float *v;
+
+ v = (float *)sna->kgem.batch + sna->kgem.nbatch;
+ sna->kgem.nbatch += 12;
+
+ v[8] = v[4] = r->dst.x + op->dst.x;
+ v[0] = v[4] + r->width;
+
+ v[9] = r->dst.y + op->dst.y;
+ v[5] = v[1] = v[9] + r->height;
+
+ _sna_get_transformed_scaled(src_x + r->width, src_y + r->height,
+ transform, op->src.scale,
+ &v[2], &v[3]);
+
+ _sna_get_transformed_scaled(src_x, src_y + r->height,
+ transform, op->src.scale,
+ &v[6], &v[7]);
+
+ _sna_get_transformed_scaled(src_x, src_y,
+ transform, op->src.scale,
+ &v[10], &v[11]);
+}
+
+sse2 fastcall static void
+gen2_emit_composite_primitive_constant_identity_mask__sse2(struct sna *sna,
+ const struct sna_composite_op *op,
+ const struct sna_composite_rectangles *r)
+{
+ float w = r->width;
+ float h = r->height;
+ float *v;
+
+ v = (float *)sna->kgem.batch + sna->kgem.nbatch;
+ sna->kgem.nbatch += 12;
+
+ v[8] = v[4] = r->dst.x + op->dst.x;
+ v[0] = v[4] + w;
+
+ v[9] = r->dst.y + op->dst.y;
+ v[5] = v[1] = v[9] + h;
+
+ v[10] = v[6] = (r->mask.x + op->mask.offset[0]) * op->mask.scale[0];
+ v[2] = v[6] + w * op->mask.scale[0];
+
+ v[11] = (r->mask.y + op->mask.offset[1]) * op->mask.scale[1];
+ v[7] = v[3] = v[11] + h * op->mask.scale[1];
+}
+#endif
+
static void gen2_magic_ca_pass(struct sna *sna,
const struct sna_composite_op *op)
{
@@ -1855,24 +1973,59 @@ gen2_render_composite(struct sna *sna,
if (tmp->mask.transform == NULL) {
if (tmp->src.is_solid) {
assert(tmp->floats_per_rect == 12);
- tmp->prim_emit = gen2_emit_composite_primitive_constant_identity_mask;
+#if defined(sse2) && !defined(__x86_64__)
+ if (sna->cpu_features & SSE2) {
+ tmp->prim_emit = gen2_emit_composite_primitive_constant_identity_mask__sse2;
+ } else
+#endif
+ {
+ tmp->prim_emit = gen2_emit_composite_primitive_constant_identity_mask;
+ }
}
}
} else {
if (tmp->src.is_solid) {
assert(tmp->floats_per_rect == 6);
- tmp->prim_emit = gen2_emit_composite_primitive_constant;
+#if defined(sse2) && !defined(__x86_64__)
+ if (sna->cpu_features & SSE2) {
+ tmp->prim_emit = gen2_emit_composite_primitive_constant__sse2;
+ } else
+#endif
+ {
+ tmp->prim_emit = gen2_emit_composite_primitive_constant;
+ }
} else if (tmp->src.is_linear) {
assert(tmp->floats_per_rect == 12);
- tmp->prim_emit = gen2_emit_composite_primitive_linear;
+#if defined(sse2) && !defined(__x86_64__)
+ if (sna->cpu_features & SSE2) {
+ tmp->prim_emit = gen2_emit_composite_primitive_linear__sse2;
+ } else
+#endif
+ {
+ tmp->prim_emit = gen2_emit_composite_primitive_linear;
+ }
} else if (tmp->src.transform == NULL) {
assert(tmp->floats_per_rect == 12);
- tmp->prim_emit = gen2_emit_composite_primitive_identity;
+#if defined(sse2) && !defined(__x86_64__)
+ if (sna->cpu_features & SSE2) {
+ tmp->prim_emit = gen2_emit_composite_primitive_identity__sse2;
+ } else
+#endif
+ {
+ tmp->prim_emit = gen2_emit_composite_primitive_identity;
+ }
} else if (tmp->src.is_affine) {
assert(tmp->floats_per_rect == 12);
tmp->src.scale[0] /= tmp->src.transform->matrix[2][2];
tmp->src.scale[1] /= tmp->src.transform->matrix[2][2];
- tmp->prim_emit = gen2_emit_composite_primitive_affine;
+#if defined(sse2) && !defined(__x86_64__)
+ if (sna->cpu_features & SSE2) {
+ tmp->prim_emit = gen2_emit_composite_primitive_affine__sse2;
+ } else
+#endif
+ {
+ tmp->prim_emit = gen2_emit_composite_primitive_affine;
+ }
}
}
@@ -2030,6 +2183,129 @@ gen2_emit_composite_spans_primitive_affine_source(struct sna *sna,
&v[13], &v[14]);
}
+#if defined(sse2) && !defined(__x86_64__)
+sse2 fastcall static void
+gen2_emit_composite_spans_primitive_constant__sse2(struct sna *sna,
+ const struct sna_composite_spans_op *op,
+ const BoxRec *box,
+ float opacity)
+{
+ float *v = (float *)sna->kgem.batch + sna->kgem.nbatch;
+ uint32_t alpha = (uint8_t)(255 * opacity) << 24;
+ sna->kgem.nbatch += 9;
+
+ v[0] = op->base.dst.x + box->x2;
+ v[1] = op->base.dst.y + box->y2;
+ *((uint32_t *)v + 2) = alpha;
+
+ v[3] = op->base.dst.x + box->x1;
+ v[4] = v[1];
+ *((uint32_t *)v + 5) = alpha;
+
+ v[6] = v[3];
+ v[7] = op->base.dst.y + box->y1;
+ *((uint32_t *)v + 8) = alpha;
+}
+
+sse2 fastcall static void
+gen2_emit_composite_spans_primitive_linear__sse2(struct sna *sna,
+ const struct sna_composite_spans_op *op,
+ const BoxRec *box,
+ float opacity)
+{
+ union {
+ float f;
+ uint32_t u;
+ } alpha;
+
+ alpha.u = (uint8_t)(255 * opacity) << 24;
+
+ gen2_emit_composite_dstcoord(sna,
+ op->base.dst.x + box->x2,
+ op->base.dst.y + box->y2);
+ VERTEX(alpha.f);
+ gen2_emit_composite_linear(sna, &op->base.src, box->x2, box->y2);
+
+ gen2_emit_composite_dstcoord(sna,
+ op->base.dst.x + box->x1,
+ op->base.dst.y + box->y2);
+ VERTEX(alpha.f);
+ gen2_emit_composite_linear(sna, &op->base.src, box->x1, box->y2);
+
+ gen2_emit_composite_dstcoord(sna,
+ op->base.dst.x + box->x1,
+ op->base.dst.y + box->y1);
+ VERTEX(alpha.f);
+ gen2_emit_composite_linear(sna, &op->base.src, box->x1, box->y1);
+}
+
+sse2 fastcall static void
+gen2_emit_composite_spans_primitive_identity_source__sse2(struct sna *sna,
+ const struct sna_composite_spans_op *op,
+ const BoxRec *box,
+ float opacity)
+{
+ float *v = (float *)sna->kgem.batch + sna->kgem.nbatch;
+ uint32_t alpha = (uint8_t)(255 * opacity) << 24;
+ sna->kgem.nbatch += 15;
+
+ v[0] = op->base.dst.x + box->x2;
+ v[1] = op->base.dst.y + box->y2;
+ *((uint32_t *)v + 2) = alpha;
+ v[3] = (op->base.src.offset[0] + box->x2) * op->base.src.scale[0];
+ v[4] = (op->base.src.offset[1] + box->y2) * op->base.src.scale[1];
+
+ v[5] = op->base.dst.x + box->x1;
+ v[6] = v[1];
+ *((uint32_t *)v + 7) = alpha;
+ v[8] = (op->base.src.offset[0] + box->x1) * op->base.src.scale[0];
+ v[9] = v[4];
+
+ v[10] = v[5];
+ v[11] = op->base.dst.y + box->y1;
+ *((uint32_t *)v + 12) = alpha;
+ v[13] = v[8];
+ v[14] = (op->base.src.offset[1] + box->y1) * op->base.src.scale[1];
+}
+
+sse2 fastcall static void
+gen2_emit_composite_spans_primitive_affine_source__sse2(struct sna *sna,
+ const struct sna_composite_spans_op *op,
+ const BoxRec *box,
+ float opacity)
+{
+ PictTransform *transform = op->base.src.transform;
+ uint32_t alpha = (uint8_t)(255 * opacity) << 24;
+ float *v;
+
+ v = (float *)sna->kgem.batch + sna->kgem.nbatch;
+ sna->kgem.nbatch += 15;
+
+ v[0] = op->base.dst.x + box->x2;
+ v[6] = v[1] = op->base.dst.y + box->y2;
+ v[10] = v[5] = op->base.dst.x + box->x1;
+ v[11] = op->base.dst.y + box->y1;
+ *((uint32_t *)v + 2) = alpha;
+ *((uint32_t *)v + 7) = alpha;
+ *((uint32_t *)v + 12) = alpha;
+
+ _sna_get_transformed_scaled((int)op->base.src.offset[0] + box->x2,
+ (int)op->base.src.offset[1] + box->y2,
+ transform, op->base.src.scale,
+ &v[3], &v[4]);
+
+ _sna_get_transformed_scaled((int)op->base.src.offset[0] + box->x1,
+ (int)op->base.src.offset[1] + box->y2,
+ transform, op->base.src.scale,
+ &v[8], &v[9]);
+
+ _sna_get_transformed_scaled((int)op->base.src.offset[0] + box->x1,
+ (int)op->base.src.offset[1] + box->y1,
+ transform, op->base.src.scale,
+ &v[13], &v[14]);
+}
+#endif
+
static void
gen2_emit_composite_spans_vertex(struct sna *sna,
const struct sna_composite_spans_op *op,
@@ -2293,19 +2569,47 @@ gen2_render_composite_spans(struct sna *sna,
tmp->prim_emit = gen2_emit_composite_spans_primitive;
tmp->base.floats_per_vertex = 3;
if (tmp->base.src.is_solid) {
- tmp->prim_emit = gen2_emit_composite_spans_primitive_constant;
+#if defined(sse2) && !defined(__x86_64__)
+ if (sna->cpu_features & SSE2) {
+ tmp->prim_emit = gen2_emit_composite_spans_primitive_constant__sse2;
+ } else
+#endif
+ {
+ tmp->prim_emit = gen2_emit_composite_spans_primitive_constant;
+ }
} else if (tmp->base.src.is_linear) {
tmp->base.floats_per_vertex += 2;
- tmp->prim_emit = gen2_emit_composite_spans_primitive_linear;
+#if defined(sse2) && !defined(__x86_64__)
+ if (sna->cpu_features & SSE2) {
+ tmp->prim_emit = gen2_emit_composite_spans_primitive_linear__sse2;
+ } else
+#endif
+ {
+ tmp->prim_emit = gen2_emit_composite_spans_primitive_linear;
+ }
} else {
assert(tmp->base.src.bo);
tmp->base.floats_per_vertex += tmp->base.src.is_affine ? 2 : 3;
- if (tmp->base.src.transform == NULL)
- tmp->prim_emit = gen2_emit_composite_spans_primitive_identity_source;
- else if (tmp->base.src.is_affine) {
+ if (tmp->base.src.transform == NULL) {
+#if defined(sse2) && !defined(__x86_64__)
+ if (sna->cpu_features & SSE2) {
+ tmp->prim_emit = gen2_emit_composite_spans_primitive_identity_source__sse2;
+ } else
+#endif
+ {
+ tmp->prim_emit = gen2_emit_composite_spans_primitive_identity_source;
+ }
+ } else if (tmp->base.src.is_affine) {
tmp->base.src.scale[0] /= tmp->base.src.transform->matrix[2][2];
tmp->base.src.scale[1] /= tmp->base.src.transform->matrix[2][2];
- tmp->prim_emit = gen2_emit_composite_spans_primitive_affine_source;
+#if defined(sse2) && !defined(__x86_64__)
+ if (sna->cpu_features & SSE2) {
+ tmp->prim_emit = gen2_emit_composite_spans_primitive_affine_source__sse2;
+ } else
+#endif
+ {
+ tmp->prim_emit = gen2_emit_composite_spans_primitive_affine_source;
+ }
}
}
tmp->base.mask.bo = NULL;
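The dispatch pattern repeated throughout the hunks above reduces to the
following minimal sketch: the __sse2 variants are compiled only on 32-bit
x86 when the compiler supports the target attribute, and one of the two
emitters is chosen at runtime from the detected CPU features. The names
emit_generic, emit_sse2 and choose_emitter are hypothetical stand-ins;
only the sse2 macro comes from src/sna/compiler.h.

/* Minimal sketch of the SSE2 fast-path dispatch used above. */
#define SSE2 0x1                        /* assumed feature bit */

static void emit_generic(float *v) { v[0] = v[1] = 0.f; }

#if defined(sse2) && !defined(__x86_64__)
sse2 static void emit_sse2(float *v) { v[0] = v[1] = 0.f; }
#endif

typedef void (*emit_fn)(float *v);

static emit_fn choose_emitter(unsigned cpu_features)
{
#if defined(sse2) && !defined(__x86_64__)
	if (cpu_features & SSE2)
		return emit_sse2;       /* SSE2 body, checked at runtime */
#endif
	return emit_generic;            /* safe fallback for older CPUs */
}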
commit f2597c89d023beceddd65e99fa595741f2400218
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date: Tue Feb 26 18:44:45 2013 +0000
sna: Force GCC to use the SSE unit for SSE2 routines
Merely hinting that the SSE unit was preferred by using fpmath=sse+387 was
not enough for GCC to emit the faster SSE2 code.
Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
diff --git a/src/sna/compiler.h b/src/sna/compiler.h
index 321e697..d6af442 100644
--- a/src/sna/compiler.h
+++ b/src/sna/compiler.h
@@ -53,12 +53,12 @@
#endif
#if defined(__GNUC__) && (__GNUC__ >= 4) && (__GNUC_MINOR__ >= 5)
-#define sse2 __attribute__((target("sse2,fpmath=sse+387")))
-#define sse4_2 __attribute__((target("sse4.2,sse2,fpmath=sse+387")))
+#define sse2 __attribute__((target("sse2,fpmath=sse")))
+#define sse4_2 __attribute__((target("sse4.2,sse2,fpmath=sse")))
#endif
#if defined(__GNUC__) && (__GNUC__ >= 4) && (__GNUC_MINOR__ >= 7)
-#define avx2 __attribute__((target("avx2,sse4.2,sse2,fpmath=sse+387")))
+#define avx2 __attribute__((target("avx2,sse4.2,sse2,fpmath=sse")))
#endif
#ifdef HAVE_VALGRIND
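As an illustration of what the revised macro does when applied: GCC 4.5+
compiles just the annotated function as if it were built with
-msse2 -mfpmath=sse, independent of the flags used for the rest of the
file. scale4 below is a hypothetical example, not a driver function.

#define sse2 __attribute__((target("sse2,fpmath=sse")))

sse2 static void scale4(const float *in, float s, float *out)
{
	int i;

	/* With fpmath=sse the float math stays in the SSE unit
	 * instead of going through x87, and GCC may vectorise it. */
	for (i = 0; i < 4; i++)
		out[i] = in[i] * s;
}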
commit a18ce0f642fa347b61e4ca501bd2f747338a2975
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date: Tue Feb 26 18:44:35 2013 +0000
sna: Flatten the glyph emitters
Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
diff --git a/src/sna/sna_glyphs.c b/src/sna/sna_glyphs.c
index 5fed8b4..3b1cf37 100644
--- a/src/sna/sna_glyphs.c
+++ b/src/sna/sna_glyphs.c
@@ -502,7 +502,7 @@ static void apply_damage_clipped_to_dst(struct sna_composite_op *op,
sna_damage_add_box(op->damage, &box);
}
-static bool
+flatten static bool
glyphs_to_dst(struct sna *sna,
CARD8 op,
PicturePtr src,
@@ -830,7 +830,7 @@ sna_glyph_get_image(GlyphPtr g, ScreenPtr s)
return image;
}
-static bool
+flatten static bool
glyphs_via_mask(struct sna *sna,
CARD8 op,
PicturePtr src,
@@ -1694,17 +1694,10 @@ sna_glyphs(CARD8 op,
goto fallback;
}
- if (mask == NULL) {
- if (glyphs_to_dst(sna, op,
- src, dst,
- src_x, src_y,
- nlist, list, glyphs))
- return;
- }
-
/* Try to discard the mask for non-overlapping glyphs */
- if (mask && dst->pCompositeClip->data == NULL &&
- can_discard_mask(op, src, mask, nlist, list, glyphs)) {
+ if (mask == NULL ||
+ (dst->pCompositeClip->data == NULL &&
+ can_discard_mask(op, src, mask, nlist, list, glyphs))) {
DBG(("%s: discarding mask\n", __FUNCTION__));
if (glyphs_to_dst(sna, op,
src, dst,
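The flatten qualifier added above is presumably GCC's flatten attribute
(its definition lives in src/sna/compiler.h and is not part of this
diff). A minimal sketch of the effect, with add and sum3 as hypothetical
examples:

#define flatten __attribute__((flatten))

static int add(int a, int b) { return a + b; }

flatten static int sum3(int a, int b, int c)
{
	/* flatten asks GCC to inline every call made inside sum3(),
	 * here both add() calls, regardless of its usual inlining
	 * heuristics -- useful for hot leaf paths like glyph emission. */
	return add(add(a, b), c);
}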
commit 7bb06b02e67435354778fe87a3e0429fe3750c23
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date: Tue Feb 26 18:12:06 2013 +0000
sna/gen3: Expand the number of SSE2 routines for basic composite ops
Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
diff --git a/src/sna/gen3_render.c b/src/sna/gen3_render.c
index 79233ca..0bac8d3 100644
--- a/src/sna/gen3_render.c
+++ b/src/sna/gen3_render.c
@@ -962,6 +962,473 @@ gen3_emit_composite_primitive(struct sna *sna,
op->dst.y + r->dst.y);
}
+#if defined(sse2) && !defined(__x86_64__)
+sse2 fastcall static void
+gen3_emit_composite_primitive_constant__sse2(struct sna *sna,
+ const struct sna_composite_op *op,
+ const struct sna_composite_rectangles *r)
+{
+ int16_t dst_x = r->dst.x + op->dst.x;
+ int16_t dst_y = r->dst.y + op->dst.y;
+
+ gen3_emit_composite_dstcoord(sna, dst_x + r->width, dst_y + r->height);
+ gen3_emit_composite_dstcoord(sna, dst_x, dst_y + r->height);
+ gen3_emit_composite_dstcoord(sna, dst_x, dst_y);
+}
+
+sse2 fastcall static void
+gen3_emit_composite_boxes_constant__sse2(const struct sna_composite_op *op,
+ const BoxRec *box, int nbox,
+ float *v)
+{
+ do {
+ v[0] = box->x2;
+ v[1] = box->y2;
+
+ v[2] = box->x1;
+ v[3] = box->y2;
+
+ v[4] = box->x1;
+ v[5] = box->y1;
+
+ box++;
+ v += 6;
+ } while (--nbox);
+}
+
+sse2 fastcall static void
+gen3_emit_composite_primitive_identity_gradient__sse2(struct sna *sna,
+ const struct sna_composite_op *op,
+ const struct sna_composite_rectangles *r)
+{
+ int16_t dst_x, dst_y;
+ int16_t src_x, src_y;
+
+ dst_x = r->dst.x + op->dst.x;
+ dst_y = r->dst.y + op->dst.y;
+ src_x = r->src.x + op->src.offset[0];
+ src_y = r->src.y + op->src.offset[1];
+
+ gen3_emit_composite_dstcoord(sna, dst_x + r->width, dst_y + r->height);
+ OUT_VERTEX(src_x + r->width);
+ OUT_VERTEX(src_y + r->height);
+
+ gen3_emit_composite_dstcoord(sna, dst_x, dst_y + r->height);
+ OUT_VERTEX(src_x);
+ OUT_VERTEX(src_y + r->height);
+
+ gen3_emit_composite_dstcoord(sna, dst_x, dst_y);
+ OUT_VERTEX(src_x);
+ OUT_VERTEX(src_y);
+}
+
+sse2 fastcall static void
+gen3_emit_composite_boxes_identity_gradient__sse2(const struct sna_composite_op *op,
+ const BoxRec *box, int nbox,
+ float *v)
+{
+ do {
+ v[0] = box->x2;
+ v[1] = box->y2;
+ v[2] = box->x2 + op->src.offset[0];
+ v[3] = box->y2 + op->src.offset[1];
+
+ v[4] = box->x1;
+ v[5] = box->y2;
+ v[6] = box->x1 + op->src.offset[0];
+ v[7] = box->y2 + op->src.offset[1];
+
+ v[8] = box->x1;
+ v[9] = box->y1;
+ v[10] = box->x1 + op->src.offset[0];
+ v[11] = box->y1 + op->src.offset[1];
+
+ v += 12;
+ box++;
+ } while (--nbox);
+}
+
+sse2 fastcall static void
+gen3_emit_composite_primitive_affine_gradient__sse2(struct sna *sna,
+ const struct sna_composite_op *op,
+ const struct sna_composite_rectangles *r)
+{
+ PictTransform *transform = op->src.transform;
+ int16_t dst_x, dst_y;
+ int16_t src_x, src_y;
+ float *v;
+
+ dst_x = r->dst.x + op->dst.x;
+ dst_y = r->dst.y + op->dst.y;
+ src_x = r->src.x + op->src.offset[0];
+ src_y = r->src.y + op->src.offset[1];
+
+ v = sna->render.vertices + sna->render.vertex_used;
+ sna->render.vertex_used += 12;
+
+ v[0] = dst_x + r->width;
+ v[1] = dst_y + r->height;
+ _sna_get_transformed_scaled(src_x + r->width, src_y + r->height,
+ transform, op->src.scale,
+ &v[2], &v[3]);
+
+ v[4] = dst_x;
+ v[5] = dst_y + r->height;
+ _sna_get_transformed_scaled(src_x, src_y + r->height,
+ transform, op->src.scale,
+ &v[6], &v[7]);
+
+ v[8] = dst_x;
+ v[9] = dst_y;
+ _sna_get_transformed_scaled(src_x, src_y,
+ transform, op->src.scale,
+ &v[10], &v[11]);
+}
+
+sse2 fastcall static void
+gen3_emit_composite_boxes_affine_gradient__sse2(const struct sna_composite_op *op,
+ const BoxRec *box, int nbox,
+ float *v)
+{
+ const PictTransform *transform = op->src.transform;
+
+ do {
+ v[0] = box->x2;
+ v[1] = box->y2;
+ _sna_get_transformed_scaled(box->x2 + op->src.offset[0],
+ box->y2 + op->src.offset[1],
+ transform, op->src.scale,
+ &v[2], &v[3]);
+
+ v[4] = box->x1;
+ v[5] = box->y2;
+ _sna_get_transformed_scaled(box->x1 + op->src.offset[0],
+ box->y2 + op->src.offset[1],
+ transform, op->src.scale,
+ &v[6], &v[7]);
+
+ v[8] = box->x1;
+ v[9] = box->y1;
+ _sna_get_transformed_scaled(box->x1 + op->src.offset[0],
+ box->y1 + op->src.offset[1],
+ transform, op->src.scale,
+ &v[10], &v[11]);
+
+ box++;
+ v += 12;
+ } while (--nbox);
+}
+
+sse2 fastcall static void
+gen3_emit_composite_primitive_identity_source__sse2(struct sna *sna,
+ const struct sna_composite_op *op,
+ const struct sna_composite_rectangles *r)
+{
+ float w = r->width;
+ float h = r->height;
+ float *v;
+
+ v = sna->render.vertices + sna->render.vertex_used;
+ sna->render.vertex_used += 12;
+
+ v[8] = v[4] = r->dst.x + op->dst.x;
+ v[0] = v[4] + w;
+
+ v[9] = r->dst.y + op->dst.y;
+ v[5] = v[1] = v[9] + h;
+
+ v[10] = v[6] = (r->src.x + op->src.offset[0]) * op->src.scale[0];
+ v[2] = v[6] + w * op->src.scale[0];
+
+ v[11] = (r->src.y + op->src.offset[1]) * op->src.scale[1];
+ v[7] = v[3] = v[11] + h * op->src.scale[1];
+}
+
+sse2 fastcall static void
+gen3_emit_composite_boxes_identity_source__sse2(const struct sna_composite_op *op,
+ const BoxRec *box, int nbox,
+ float *v)
+{
+ do {
+ v[0] = box->x2 + op->dst.x;
+ v[8] = v[4] = box->x1 + op->dst.x;
+ v[5] = v[1] = box->y2 + op->dst.y;
+ v[9] = box->y1 + op->dst.y;
+
+ v[10] = v[6] = (box->x1 + op->src.offset[0]) * op->src.scale[0];
+ v[2] = (box->x2 + op->src.offset[0]) * op->src.scale[0];
+
+ v[11] = (box->y1 + op->src.offset[1]) * op->src.scale[1];
+ v[7] = v[3] = (box->y2 + op->src.offset[1]) * op->src.scale[1];
+
+ v += 12;
+ box++;
+ } while (--nbox);
+}
+
+sse2 fastcall static void
+gen3_emit_composite_primitive_identity_source_no_offset__sse2(struct sna *sna,
+ const struct sna_composite_op *op,
+ const struct sna_composite_rectangles *r)
+{
+ float w = r->width;
+ float h = r->height;
+ float *v;
+
+ v = sna->render.vertices + sna->render.vertex_used;
+ sna->render.vertex_used += 12;
+
+ v[8] = v[4] = r->dst.x;
+ v[9] = r->dst.y;
+
+ v[0] = v[4] + w;
+ v[5] = v[1] = v[9] + h;
+
+ v[10] = v[6] = r->src.x * op->src.scale[0];
+ v[11] = r->src.y * op->src.scale[1];
+
+ v[2] = v[6] + w * op->src.scale[0];
+ v[7] = v[3] = v[11] + h * op->src.scale[1];
+}
+
+sse2 fastcall static void
+gen3_emit_composite_boxes_identity_source_no_offset__sse2(const struct sna_composite_op *op,
+ const BoxRec *box, int nbox,
+ float *v)
+{
+ do {
+ v[0] = box->x2;
+ v[8] = v[4] = box->x1;
+ v[5] = v[1] = box->y2;
+ v[9] = box->y1;
+
+ v[10] = v[6] = box->x1 * op->src.scale[0];
+ v[2] = box->x2 * op->src.scale[0];
+
+ v[11] = box->y1 * op->src.scale[1];
+ v[7] = v[3] = box->y2 * op->src.scale[1];
+
+ v += 12;
+ box++;
+ } while (--nbox);
+}
+
+sse2 fastcall static void
+gen3_emit_composite_primitive_affine_source__sse2(struct sna *sna,
+ const struct sna_composite_op *op,
+ const struct sna_composite_rectangles *r)
+{
+ PictTransform *transform = op->src.transform;
+ int16_t dst_x = r->dst.x + op->dst.x;
+ int16_t dst_y = r->dst.y + op->dst.y;
+ int src_x = r->src.x + (int)op->src.offset[0];
+ int src_y = r->src.y + (int)op->src.offset[1];
+ float *v;
+
+ v = sna->render.vertices + sna->render.vertex_used;
+ sna->render.vertex_used += 12;
+
+ v[0] = dst_x + r->width;
+ v[5] = v[1] = dst_y + r->height;
+ v[8] = v[4] = dst_x;
+ v[9] = dst_y;
+
+ _sna_get_transformed_scaled(src_x + r->width, src_y + r->height,
+ transform, op->src.scale,
+ &v[2], &v[3]);
+
+ _sna_get_transformed_scaled(src_x, src_y + r->height,
+ transform, op->src.scale,
+ &v[6], &v[7]);
+
+ _sna_get_transformed_scaled(src_x, src_y,
+ transform, op->src.scale,
+ &v[10], &v[11]);
+}
+
+sse2 fastcall static void
+gen3_emit_composite_boxes_affine_source__sse2(const struct sna_composite_op *op,
+ const BoxRec *box, int nbox,
+ float *v)
+{
+ const PictTransform *transform = op->src.transform;
+
+ do {
+ v[0] = box->x2;
+ v[5] = v[1] = box->y2;
+ v[8] = v[4] = box->x1;
+ v[9] = box->y1;
+
+ _sna_get_transformed_scaled(box->x2 + op->src.offset[0],
+ box->y2 + op->src.offset[1],
+ transform, op->src.scale,
+ &v[2], &v[3]);
+
+ _sna_get_transformed_scaled(box->x1 + op->src.offset[0],
+ box->y2 + op->src.offset[1],
+ transform, op->src.scale,
+ &v[6], &v[7]);
+
+ _sna_get_transformed_scaled(box->x1 + op->src.offset[0],
+ box->y1 + op->src.offset[1],
+ transform, op->src.scale,
+ &v[10], &v[11]);
+
+ v += 12;
+ box++;
+ } while (--nbox);
+}
+
+sse2 fastcall static void
+gen3_emit_composite_primitive_constant_identity_mask__sse2(struct sna *sna,
+ const struct sna_composite_op *op,
+ const struct sna_composite_rectangles *r)
+{
+ float w = r->width;
+ float h = r->height;
+ float *v;
+
+ v = sna->render.vertices + sna->render.vertex_used;
+ sna->render.vertex_used += 12;
+
+ v[8] = v[4] = r->dst.x + op->dst.x;
+ v[0] = v[4] + w;
+
+ v[9] = r->dst.y + op->dst.y;
+ v[5] = v[1] = v[9] + h;
+
+ v[10] = v[6] = (r->mask.x + op->mask.offset[0]) * op->mask.scale[0];
+ v[2] = v[6] + w * op->mask.scale[0];
+
+ v[11] = (r->mask.y + op->mask.offset[1]) * op->mask.scale[1];
+ v[7] = v[3] = v[11] + h * op->mask.scale[1];
+}
+
+sse2 fastcall static void
+gen3_emit_composite_primitive_constant_identity_mask_no_offset__sse2(struct sna *sna,
+ const struct sna_composite_op *op,
+ const struct sna_composite_rectangles *r)
+{
+ float w = r->width;
+ float h = r->height;
+ float *v;
+
+ v = sna->render.vertices + sna->render.vertex_used;
+ sna->render.vertex_used += 12;
+
+ v[8] = v[4] = r->dst.x;
+ v[9] = r->dst.y;
+
+ v[0] = v[4] + w;
+ v[5] = v[1] = v[9] + h;
+
+ v[10] = v[6] = r->mask.x * op->mask.scale[0];
+ v[11] = r->mask.y * op->mask.scale[1];
+
+ v[2] = v[6] + w * op->mask.scale[0];
+ v[7] = v[3] = v[11] + h * op->mask.scale[1];
+}
+
+sse2 fastcall static void
+gen3_emit_composite_primitive_identity_source_mask__sse2(struct sna *sna,
+ const struct sna_composite_op *op,
+ const struct sna_composite_rectangles *r)
+{
+ float dst_x, dst_y;
+ float src_x, src_y;
+ float msk_x, msk_y;
+ float w, h;
+ float *v;
+
+ dst_x = r->dst.x + op->dst.x;
+ dst_y = r->dst.y + op->dst.y;
+ src_x = r->src.x + op->src.offset[0];
+ src_y = r->src.y + op->src.offset[1];
+ msk_x = r->mask.x + op->mask.offset[0];
+ msk_y = r->mask.y + op->mask.offset[1];
+ w = r->width;
+ h = r->height;
+
+ v = sna->render.vertices + sna->render.vertex_used;
+ sna->render.vertex_used += 18;
+
+ v[0] = dst_x + w;
+ v[1] = dst_y + h;
+ v[2] = (src_x + w) * op->src.scale[0];
+ v[3] = (src_y + h) * op->src.scale[1];
+ v[4] = (msk_x + w) * op->mask.scale[0];
+ v[5] = (msk_y + h) * op->mask.scale[1];
+
+ v[6] = dst_x;
+ v[7] = v[1];
+ v[8] = src_x * op->src.scale[0];
+ v[9] = v[3];
+ v[10] = msk_x * op->mask.scale[0];
+ v[11] = v[5];
+
+ v[12] = v[6];
+ v[13] = dst_y;
+ v[14] = v[8];
+ v[15] = src_y * op->src.scale[1];
+ v[16] = v[10];
+ v[17] = msk_y * op->mask.scale[1];
+}
+
+sse2 fastcall static void
+gen3_emit_composite_primitive_affine_source_mask__sse2(struct sna *sna,
+ const struct sna_composite_op *op,
+ const struct sna_composite_rectangles *r)
+{
+ int16_t src_x, src_y;
+ float dst_x, dst_y;
+ float msk_x, msk_y;
+ float w, h;
+ float *v;
+
+ dst_x = r->dst.x + op->dst.x;
+ dst_y = r->dst.y + op->dst.y;
+ src_x = r->src.x + op->src.offset[0];
+ src_y = r->src.y + op->src.offset[1];
+ msk_x = r->mask.x + op->mask.offset[0];
+ msk_y = r->mask.y + op->mask.offset[1];
+ w = r->width;
+ h = r->height;
+
+ v = sna->render.vertices + sna->render.vertex_used;
+ sna->render.vertex_used += 18;
+
+ v[0] = dst_x + w;
+ v[1] = dst_y + h;
+ sna_get_transformed_coordinates(src_x + r->width, src_y + r->height,
+ op->src.transform,
+ &v[2], &v[3]);
+ v[2] *= op->src.scale[0];
+ v[3] *= op->src.scale[1];
+ v[4] = (msk_x + w) * op->mask.scale[0];
+ v[5] = (msk_y + h) * op->mask.scale[1];
+
+ v[6] = dst_x;
+ v[7] = v[1];
+ sna_get_transformed_coordinates(src_x, src_y + r->height,
+ op->src.transform,
+ &v[8], &v[9]);
+ v[8] *= op->src.scale[0];
+ v[9] *= op->src.scale[1];
+ v[10] = msk_x * op->mask.scale[0];
+ v[11] = v[5];
+
+ v[12] = v[6];
+ v[13] = dst_y;
+ sna_get_transformed_coordinates(src_x, src_y,
+ op->src.transform,
+ &v[14], &v[15]);
+ v[14] *= op->src.scale[0];
+ v[15] *= op->src.scale[1];
+ v[16] = v[10];
+ v[17] = msk_y * op->mask.scale[1];
+}
+#endif
+
static inline void
gen3_2d_perspective(struct sna *sna, int in, int out)
{
@@ -3201,48 +3668,127 @@ gen3_render_composite(struct sna *sna,
case SHADER_BLACK:
case SHADER_WHITE:
case SHADER_CONSTANT:
- tmp->prim_emit = gen3_emit_composite_primitive_constant;
- tmp->emit_boxes = gen3_emit_composite_boxes_constant;
+#if defined(sse2) && !defined(__x86_64__)
+ if (sna->cpu_features & SSE2) {
+ tmp->prim_emit = gen3_emit_composite_primitive_constant__sse2;
+ tmp->emit_boxes = gen3_emit_composite_boxes_constant__sse2;
+ } else
+#endif
+ {
+ tmp->prim_emit = gen3_emit_composite_primitive_constant;
+ tmp->emit_boxes = gen3_emit_composite_boxes_constant;
+ }
+
break;
case SHADER_LINEAR:
case SHADER_RADIAL:
if (tmp->src.transform == NULL) {
- tmp->prim_emit = gen3_emit_composite_primitive_identity_gradient;
- tmp->emit_boxes = gen3_emit_composite_boxes_identity_gradient;
+#if defined(sse2) && !defined(__x86_64__)
+ if (sna->cpu_features & SSE2) {
+ tmp->prim_emit = gen3_emit_composite_primitive_identity_gradient__sse2;
+ tmp->emit_boxes = gen3_emit_composite_boxes_identity_gradient__sse2;
+ } else
+#endif
+ {
+ tmp->prim_emit = gen3_emit_composite_primitive_identity_gradient;
+ tmp->emit_boxes = gen3_emit_composite_boxes_identity_gradient;
+ }
} else if (tmp->src.is_affine) {
tmp->src.scale[1] = tmp->src.scale[0] = 1. / tmp->src.transform->matrix[2][2];
- tmp->prim_emit = gen3_emit_composite_primitive_affine_gradient;
- tmp->emit_boxes = gen3_emit_composite_boxes_affine_gradient;
+#if defined(sse2) && !defined(__x86_64__)
+ if (sna->cpu_features & SSE2) {
+ tmp->prim_emit = gen3_emit_composite_primitive_affine_gradient__sse2;
+ tmp->emit_boxes = gen3_emit_composite_boxes_affine_gradient__sse2;
+ } else
+#endif
+ {
+ tmp->prim_emit = gen3_emit_composite_primitive_affine_gradient;
+ tmp->emit_boxes = gen3_emit_composite_boxes_affine_gradient;
+ }
}
break;
case SHADER_TEXTURE:
if (tmp->src.transform == NULL) {
if ((tmp->src.offset[0]|tmp->src.offset[1]|tmp->dst.x|tmp->dst.y) == 0) {
- tmp->prim_emit = gen3_emit_composite_primitive_identity_source_no_offset;
- tmp->emit_boxes = gen3_emit_composite_boxes_identity_source_no_offset;
+#if defined(sse2) && !defined(__x86_64__)
+ if (sna->cpu_features & SSE2) {
+ tmp->prim_emit = gen3_emit_composite_primitive_identity_source_no_offset__sse2;
+ tmp->emit_boxes = gen3_emit_composite_boxes_identity_source_no_offset__sse2;
+ } else
+#endif
+ {
+ tmp->prim_emit = gen3_emit_composite_primitive_identity_source_no_offset;
+ tmp->emit_boxes = gen3_emit_composite_boxes_identity_source_no_offset;
+ }
} else {
- tmp->prim_emit = gen3_emit_composite_primitive_identity_source;
- tmp->emit_boxes = gen3_emit_composite_boxes_identity_source;
+#if defined(sse2) && !defined(__x86_64__)
+ if (sna->cpu_features & SSE2) {
+ tmp->prim_emit = gen3_emit_composite_primitive_identity_source__sse2;
+ tmp->emit_boxes = gen3_emit_composite_boxes_identity_source__sse2;
+ } else
+#endif
+ {
+ tmp->prim_emit = gen3_emit_composite_primitive_identity_source;
+ tmp->emit_boxes = gen3_emit_composite_boxes_identity_source;
+ }
}
} else if (tmp->src.is_affine) {
tmp->src.scale[0] /= tmp->src.transform->matrix[2][2];
tmp->src.scale[1] /= tmp->src.transform->matrix[2][2];
- tmp->prim_emit = gen3_emit_composite_primitive_affine_source;
- tmp->emit_boxes = gen3_emit_composite_boxes_affine_source;
+#if defined(sse2) && !defined(__x86_64__)
+ if (sna->cpu_features & SSE2) {
+ tmp->prim_emit = gen3_emit_composite_primitive_affine_source__sse2;
+ tmp->emit_boxes = gen3_emit_composite_boxes_affine_source__sse2;
+ } else
+#endif
+ {
+ tmp->prim_emit = gen3_emit_composite_primitive_affine_source;
+ tmp->emit_boxes = gen3_emit_composite_boxes_affine_source;
+ }
}
break;
}
} else if (tmp->mask.u.gen3.type == SHADER_TEXTURE) {
if (tmp->mask.transform == NULL) {
if (is_constant_ps(tmp->src.u.gen3.type)) {
- if ((tmp->mask.offset[0]|tmp->mask.offset[1]|tmp->dst.x|tmp->dst.y) == 0)
- tmp->prim_emit = gen3_emit_composite_primitive_constant_identity_mask_no_offset;
- else
- tmp->prim_emit = gen3_emit_composite_primitive_constant_identity_mask;
- } else if (tmp->src.transform == NULL)
- tmp->prim_emit = gen3_emit_composite_primitive_identity_source_mask;
- else if (tmp->src.is_affine)
- tmp->prim_emit = gen3_emit_composite_primitive_affine_source_mask;
+ if ((tmp->mask.offset[0]|tmp->mask.offset[1]|tmp->dst.x|tmp->dst.y) == 0) {
+#if defined(sse2) && !defined(__x86_64__)
+ if (sna->cpu_features & SSE2) {
+ tmp->prim_emit = gen3_emit_composite_primitive_constant_identity_mask_no_offset__sse2;
+ } else
+#endif
+ {
+ tmp->prim_emit = gen3_emit_composite_primitive_constant_identity_mask_no_offset;
+ }
+ } else {
+#if defined(sse2) && !defined(__x86_64__)
+ if (sna->cpu_features & SSE2) {
+ tmp->prim_emit = gen3_emit_composite_primitive_constant_identity_mask__sse2;
+ } else
+#endif
+ {
+ tmp->prim_emit = gen3_emit_composite_primitive_constant_identity_mask;
+ }
+ }
+ } else if (tmp->src.transform == NULL) {
+#if defined(sse2) && !defined(__x86_64__)
+ if (sna->cpu_features & SSE2) {
+ tmp->prim_emit = gen3_emit_composite_primitive_identity_source_mask__sse2;
+ } else
+#endif
+ {
+ tmp->prim_emit = gen3_emit_composite_primitive_identity_source_mask;
+ }
+ } else if (tmp->src.is_affine) {
+#if defined(sse2) && !defined(__x86_64__)
+ if (sna->cpu_features & SSE2) {
+ tmp->prim_emit = gen3_emit_composite_primitive_affine_source_mask__sse2;
+ } else
+#endif
+ {
+ tmp->prim_emit = gen3_emit_composite_primitive_affine_source_mask;
+ }
+ }
}
}
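All of the guards above test !defined(__x86_64__) because SSE2 is part of
the x86-64 baseline ABI: there the plain builds already use SSE2, so the
separate __sse2 variants would be redundant. The reasoning in
preprocessor form, illustrative only:

#if defined(__x86_64__)
	/* SSE2 guaranteed by the ABI: use the plain emitters directly */
#elif defined(sse2)
	/* 32-bit x86 with a capable compiler: build both generic and
	 * __sse2 emitters and pick one at runtime */
#else
	/* neither: only the generic emitters exist */
#endif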
commit 417c3f9b8c6b9a50dc1af440c53e94d2c6401251
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date: Tue Feb 26 17:41:57 2013 +0000
sna/trapezoids: Add a pair of unlikely hints for forced box emission
Overflowing the buffer is unlikely, so pass the hint on to the compiler.
Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
diff --git a/src/sna/sna_trapezoids.c b/src/sna/sna_trapezoids.c
index bed9168..baba827 100644
--- a/src/sna/sna_trapezoids.c
+++ b/src/sna/sna_trapezoids.c
@@ -2061,7 +2061,7 @@ thread_mono_span_add_boxes(struct mono *c, const BoxRec *box, int count)
struct mono_span_thread_boxes *b = c->op.priv;
assert(count > 0 && count <= MONO_SPAN_MAX_BOXES);
- if (b->num_boxes + count > MONO_SPAN_MAX_BOXES) {
+ if (unlikely(b->num_boxes + count > MONO_SPAN_MAX_BOXES)) {
b->op->thread_boxes(c->sna, b->op, b->boxes, b->num_boxes);
b->num_boxes = 0;
}
@@ -4405,7 +4405,7 @@ static void span_thread_add_boxes(struct sna *sna, void *data,
__FUNCTION__, count, alpha));
assert(count > 0 && count <= SPAN_THREAD_MAX_BOXES);
- if (b->num_boxes + count > SPAN_THREAD_MAX_BOXES) {
+ if (unlikely(b->num_boxes + count > SPAN_THREAD_MAX_BOXES)) {
DBG(("%s: flushing %d boxes, adding %d\n", __FUNCTION__, b->num_boxes, count));
assert(b->num_boxes <= SPAN_THREAD_MAX_BOXES);
b->op->thread_boxes(sna, b->op, b->boxes, b->num_boxes);
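unlikely() is the conventional wrapper around GCC's __builtin_expect;
the driver's own definition (in its compat headers, not in this diff) is
assumed to follow the usual shape:

#ifdef __GNUC__
#define likely(x)   __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)
#else
#define likely(x)   (x)
#define unlikely(x) (x)
#endif

The !!(x) normalises any truthy value to 1 so the expectation matches,
and the hint lets GCC lay out the flush path out of line, keeping the
common no-overflow path fall-through.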