xf86-video-intel: 13 commits - src/i830_3d.c src/i830_batchbuffer.c src/i830_batchbuffer.h src/i830_driver.c src/i830.h src/i830_render.c src/i830_uxa.c src/i915_3d.c src/i915_3d.h src/i915_reg.h src/i915_render.c src/i915_video.c src/i965_render.c src/i965_video.c uxa/uxa-accel.c uxa/uxa-glyphs.c uxa/uxa.h uxa/uxa-priv.h uxa/uxa-render.c

Chris Wilson ickle at kemper.freedesktop.org
Mon May 24 11:16:33 PDT 2010


 src/i830.h             |   36 +
 src/i830_3d.c          |    4 
 src/i830_batchbuffer.c |   60 +-
 src/i830_batchbuffer.h |   51 +-
 src/i830_driver.c      |    7 
 src/i830_render.c      |   36 +
 src/i830_uxa.c         |   31 +
 src/i915_3d.c          |   11 
 src/i915_3d.h          |   30 -
 src/i915_reg.h         |   27 -
 src/i915_render.c      |  991 ++++++++++++++++++++++++++++++++-----------------
 src/i915_video.c       |   33 -
 src/i965_render.c      |   22 -
 src/i965_video.c       |   18 
 uxa/uxa-accel.c        |  207 ++++++++--
 uxa/uxa-glyphs.c       |  529 ++++++++++++++++++++++----
 uxa/uxa-priv.h         |   25 +
 uxa/uxa-render.c       |  400 ++++++++++---------
 uxa/uxa.h              |   20 
 19 files changed, 1708 insertions(+), 830 deletions(-)

New commits:
commit 9d8ac271404ff219c3d40ae8f829e8b76ac7c359
Merge: 509df27... ea07535...
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri May 21 11:01:30 2010 +0100

    Merge branch 'glyphs'
    
    Tweak glyphs to improve x11perf on i915 by about 33%.
    PineView, aa10text:  460 -> 617 kglyphs/s.
    PineView, rgb10text: 434 -> 610 kglyphs/s.
    
    Speedups
    ========
      xcb                    poppler    18.636 -> 13.958:  1.34x speedup
     xlib          firefox-talos-gfx    71.905 -> 56.232:  1.28x speedup
      xcb          firefox-talos-gfx    72.882 -> 57.969:  1.26x speedup
     xlib         gnome-terminal-vim    38.126 -> 34.472:  1.11x speedup
      xcb         gnome-terminal-vim    35.164 -> 32.573:  1.08x speedup
     xlib                    poppler    19.634 -> 18.246:  1.08x speedup
    
    Note the lack of significant improvement for firefox-planet-gnome.

commit ea07535240dafc4c6ef55b4b7a2eeaa595febe86
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri May 21 14:33:18 2010 +0100

    i915: Emit CA over using OutReverse + Add passes
    
    On PineView:
      578/621 -> 610/617 kglyphs/sec [rgb/aa]

diff --git a/src/i830.h b/src/i830.h
index 229a4e6..a69f60d 100644
--- a/src/i830.h
+++ b/src/i830.h
@@ -344,6 +344,7 @@ typedef struct intel_screen_private {
 	Bool render_mask_is_solid;
 	Bool needs_render_state_emit;
 	Bool needs_render_vertex_emit;
+	Bool needs_render_ca_pass;
 
 	/* i830 render accel state */
 	uint32_t render_dest_format;
diff --git a/src/i830_uxa.c b/src/i830_uxa.c
index 0a6b6f8..a2da530 100644
--- a/src/i830_uxa.c
+++ b/src/i830_uxa.c
@@ -1055,7 +1055,6 @@ Bool i830_uxa_init(ScreenPtr screen)
 	intel->uxa_driver->uxa_major = 1;
 	intel->uxa_driver->uxa_minor = 0;
 
-	intel->needs_render_vertex_emit = TRUE;
 	intel->prim_offset = 0;
 	intel->vertex_count = 0;
 	intel->floats_per_vertex = 0;
diff --git a/src/i915_3d.c b/src/i915_3d.c
index 7f07b4b..906043b 100644
--- a/src/i915_3d.c
+++ b/src/i915_3d.c
@@ -85,8 +85,13 @@ void I915EmitInvarientState(ScrnInfoPtr scrn)
 		  ENABLE_STENCIL_WRITE_MASK | STENCIL_WRITE_MASK(0xff) |
 		  ENABLE_STENCIL_TEST_MASK | STENCIL_TEST_MASK(0xff));
 
-	OUT_BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 | I1_LOAD_S(3) | 0);
+	OUT_BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 | I1_LOAD_S(3) | I1_LOAD_S(4) | I1_LOAD_S(5) | 2);
 	OUT_BATCH(0x00000000);	/* Disable texture coordinate wrap-shortest */
+	OUT_BATCH((1 << S4_POINT_WIDTH_SHIFT) |
+		  S4_LINE_WIDTH_ONE |
+		  S4_CULLMODE_NONE |
+		  S4_VFMT_XY);
+	OUT_BATCH(0x00000000);	/* Stencil. */
 
 	OUT_BATCH(_3DSTATE_SCISSOR_ENABLE_CMD | DISABLE_SCISSOR_RECT);
 	OUT_BATCH(_3DSTATE_SCISSOR_RECT_0_CMD);
diff --git a/src/i915_render.c b/src/i915_render.c
index 4eb4046..3d38397 100644
--- a/src/i915_render.c
+++ b/src/i915_render.c
@@ -133,8 +133,10 @@ static uint32_t i915_get_blend_cntl(int op, PicturePtr mask,
 		}
 	}
 
-	return (sblend << S6_CBUF_SRC_BLEND_FACT_SHIFT) |
-	    (dblend << S6_CBUF_DST_BLEND_FACT_SHIFT);
+	return S6_CBUF_BLEND_ENABLE | S6_COLOR_WRITE_ENABLE |
+		(BLENDFUNC_ADD << S6_CBUF_BLEND_FUNC_SHIFT) |
+		(sblend << S6_CBUF_SRC_BLEND_FACT_SHIFT) |
+		(dblend << S6_CBUF_DST_BLEND_FACT_SHIFT);
 }
 
 #define DSTORG_HORT_BIAS(x)             ((x)<<20)
@@ -204,11 +206,13 @@ i915_check_composite(int op,
 		 */
 		if (i915_blend_op[op].src_alpha &&
 		    (i915_blend_op[op].src_blend != BLENDFACT_ZERO)) {
-			intel_debug_fallback(scrn,
-					     "Component alpha not supported "
-					     "with source alpha and source "
-					     "value blending.\n");
-			return FALSE;
+			if (op != PictOpOver) {
+				intel_debug_fallback(scrn,
+						     "Component alpha not supported "
+						     "with source alpha and source "
+						     "value blending.\n");
+				return FALSE;
+			}
 		}
 	}
 
@@ -814,6 +818,23 @@ i915_prepare_composite(int op, PicturePtr source_picture,
 
 	if (!i830_get_aperture_space(scrn, bo_table, ARRAY_SIZE(bo_table)))
 		return FALSE;
+
+	intel->needs_render_ca_pass = FALSE;
+	if (mask_picture != NULL && mask_picture->componentAlpha &&
+	    PICT_FORMAT_RGB(mask_picture->format)) {
+		/* Check if it's component alpha that relies on a source alpha
+		 * and on the source value.  We can only get one of those
+		 * into the single source value that we get to blend with.
+		 */
+		if (i915_blend_op[op].src_alpha &&
+		    (i915_blend_op[op].src_blend != BLENDFACT_ZERO)) {
+			if (op != PictOpOver)
+				return FALSE;
+
+			intel->needs_render_ca_pass = TRUE;
+		}
+	}
+
 	intel->dst_coord_adjust = 0;
 	intel->src_coord_adjust = 0;
 	intel->mask_coord_adjust = 0;
@@ -902,6 +923,120 @@ i915_prepare_composite(int op, PicturePtr source_picture,
 	return TRUE;
 }
 
+static void
+i915_composite_emit_shader(intel_screen_private *intel, CARD8 op)
+{
+	PicturePtr mask_picture = intel->render_mask_picture;
+	PixmapPtr mask = intel->render_mask;
+	int src_reg, mask_reg;
+	Bool is_solid_src, is_solid_mask;
+	uint32_t dst_format = intel->i915_render_state.dst_format;
+	int tex_unit, t;
+	FS_LOCALS();
+
+	is_solid_src = intel->render_source_is_solid;
+	is_solid_mask = intel->render_mask_is_solid;
+
+	FS_BEGIN();
+
+	/* Declare the registers necessary for our program.  */
+	t = 0;
+	if (is_solid_src) {
+		i915_fs_dcl(FS_T8);
+		src_reg = FS_T8;
+	} else {
+		i915_fs_dcl(FS_T0);
+		i915_fs_dcl(FS_S0);
+		t++;
+	}
+	if (!mask) {
+		/* No mask, so load directly to output color */
+		if (! is_solid_src) {
+			if (dst_format == COLR_BUF_8BIT)
+				src_reg = FS_R0;
+			else
+				src_reg = FS_OC;
+
+			if (i830_transform_is_affine(intel->transform[0]))
+				i915_fs_texld(src_reg, FS_S0, FS_T0);
+			else
+				i915_fs_texldp(src_reg, FS_S0, FS_T0);
+		}
+
+		if (src_reg != FS_OC) {
+			if (dst_format == COLR_BUF_8BIT)
+				i915_fs_mov(FS_OC, i915_fs_operand(src_reg, W, W, W, W));
+			else
+				i915_fs_mov(FS_OC, i915_fs_operand_reg(src_reg));
+		}
+	} else {
+		if (is_solid_mask) {
+			i915_fs_dcl(FS_T9);
+			mask_reg = FS_T9;
+		} else {
+			i915_fs_dcl(FS_T0 + t);
+			i915_fs_dcl(FS_S0 + t);
+		}
+
+		tex_unit = 0;
+		if (! is_solid_src) {
+			/* Load the source_picture texel */
+			if (i830_transform_is_affine(intel->transform[tex_unit]))
+				i915_fs_texld(FS_R0, FS_S0, FS_T0);
+			else
+				i915_fs_texldp(FS_R0, FS_S0, FS_T0);
+
+			src_reg = FS_R0;
+			tex_unit++;
+		}
+
+		if (! is_solid_mask) {
+			/* Load the mask_picture texel */
+			if (i830_transform_is_affine(intel->transform[tex_unit]))
+				i915_fs_texld(FS_R1, FS_S0 + t, FS_T0 + t);
+			else
+				i915_fs_texldp(FS_R1, FS_S0 + t, FS_T0 + t);
+
+			mask_reg = FS_R1;
+		}
+
+		if (dst_format == COLR_BUF_8BIT) {
+			i915_fs_mul(FS_OC,
+				    i915_fs_operand(src_reg, W, W, W, W),
+				    i915_fs_operand(mask_reg, W, W, W, W));
+		} else {
+			/* If component alpha is active in the mask and the blend
+			 * operation uses the source alpha, then we know we don't
+			 * need the source value (otherwise we would have hit a
+			 * fallback earlier), so we provide the source alpha (src.A *
+			 * mask.X) as output color.
+			 * Conversely, if CA is set and we don't need the source alpha,
+			 * then we produce the source value (src.X * mask.X) and the
+			 * source alpha is unused.  Otherwise, we provide the non-CA
+			 * source value (src.X * mask.A).
+			 */
+			if (mask_picture->componentAlpha &&
+			    PICT_FORMAT_RGB(mask_picture->format)) {
+				if (i915_blend_op[op].src_alpha) {
+					i915_fs_mul(FS_OC,
+						    i915_fs_operand(src_reg, W, W, W, W),
+						    i915_fs_operand_reg(mask_reg));
+				} else {
+					i915_fs_mul(FS_OC,
+						    i915_fs_operand_reg(src_reg),
+						    i915_fs_operand_reg(mask_reg));
+				}
+			} else {
+				i915_fs_mul(FS_OC,
+					    i915_fs_operand_reg(src_reg),
+					    i915_fs_operand(mask_reg, W, W, W, W));
+			}
+		}
+	}
+
+	FS_END();
+}
+
 static void i915_emit_composite_setup(ScrnInfoPtr scrn)
 {
 	intel_screen_private *intel = intel_get_screen_private(scrn);
@@ -911,8 +1046,7 @@ static void i915_emit_composite_setup(ScrnInfoPtr scrn)
 	PixmapPtr mask = intel->render_mask;
 	PixmapPtr dest = intel->render_dest;
 	uint32_t dst_format = intel->i915_render_state.dst_format, dst_pitch;
-	uint32_t blendctl, tiling_bits;
-	Bool is_affine_src, is_affine_mask;
+	uint32_t tiling_bits;
 	Bool is_solid_src, is_solid_mask;
 	int tex_count, t;
 
@@ -923,9 +1057,6 @@ static void i915_emit_composite_setup(ScrnInfoPtr scrn)
 
 	dst_pitch = intel_get_pixmap_pitch(dest);
 
-	is_affine_src = i830_transform_is_affine(intel->transform[0]);
-	is_affine_mask = i830_transform_is_affine(intel->transform[1]);
-
 	is_solid_src = intel->render_source_is_solid;
 	is_solid_mask = intel->render_mask_is_solid;
 
@@ -982,33 +1113,31 @@ static void i915_emit_composite_setup(ScrnInfoPtr scrn)
 	{
 		uint32_t ss2;
 
-		OUT_BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 | I1_LOAD_S(2) |
-			  I1_LOAD_S(4) | I1_LOAD_S(5) | I1_LOAD_S(6) | 3);
 		ss2 = ~0;
 		t = 0;
 		if (! is_solid_src) {
 		    ss2 &= ~S2_TEXCOORD_FMT(t, TEXCOORDFMT_NOT_PRESENT);
 		    ss2 |= S2_TEXCOORD_FMT(t,
-					   is_affine_src ? TEXCOORDFMT_2D :
-					   TEXCOORDFMT_4D);
+					   i830_transform_is_affine(intel->transform[t]) ?
+					   TEXCOORDFMT_2D : TEXCOORDFMT_4D);
 		    t++;
 		}
 		if (mask && ! is_solid_mask) {
 		    ss2 &= ~S2_TEXCOORD_FMT(t, TEXCOORDFMT_NOT_PRESENT);
 		    ss2 |= S2_TEXCOORD_FMT(t,
-			    is_affine_mask ? TEXCOORDFMT_2D :
-			    TEXCOORDFMT_4D);
+					   i830_transform_is_affine(intel->transform[t]) ?
+					   TEXCOORDFMT_2D : TEXCOORDFMT_4D);
 		    t++;
 		}
-		OUT_BATCH(ss2);
-		OUT_BATCH((1 << S4_POINT_WIDTH_SHIFT) | S4_LINE_WIDTH_ONE |
-			  S4_CULLMODE_NONE | S4_VFMT_XY);
-		blendctl =
-		    i915_get_blend_cntl(op, mask_picture, dest_picture->format);
-		OUT_BATCH(0x00000000);	/* Disable stencil buffer */
-		OUT_BATCH(S6_CBUF_BLEND_ENABLE | S6_COLOR_WRITE_ENABLE |
-			  (BLENDFUNC_ADD << S6_CBUF_BLEND_FUNC_SHIFT) |
-			  blendctl);
+
+		if (intel->needs_render_ca_pass) {
+			OUT_BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 | I1_LOAD_S(2) | 0);
+			OUT_BATCH(ss2);
+		} else {
+			OUT_BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 | I1_LOAD_S(2) | I1_LOAD_S(6) | 1);
+			OUT_BATCH(ss2);
+			OUT_BATCH(i915_get_blend_cntl(op, mask_picture, dest_picture->format));
+		}
 
 		/* draw rect is unconditional */
 		OUT_BATCH(_3DSTATE_DRAW_RECT_CMD);
@@ -1020,109 +1149,8 @@ static void i915_emit_composite_setup(ScrnInfoPtr scrn)
 		OUT_BATCH(0x00000000);
 	}
 
-	{
-	    FS_LOCALS();
-	    int src_reg, mask_reg;
-
-	    FS_BEGIN();
-
-	    /* Declare the registers necessary for our program.  */
-	    t = 0;
-	    if (is_solid_src) {
-		i915_fs_dcl(FS_T8);
-		src_reg = FS_T8;
-	    } else {
-		i915_fs_dcl(FS_T0);
-		i915_fs_dcl(FS_S0);
-		t++;
-	    }
-	    if (!mask) {
-		    /* No mask, so load directly to output color */
-		    if (! is_solid_src) {
-			    if (dst_format == COLR_BUF_8BIT)
-				    src_reg = FS_R0;
-			    else
-				    src_reg = FS_OC;
-
-			    if (is_affine_src)
-				    i915_fs_texld(src_reg, FS_S0, FS_T0);
-			    else
-				    i915_fs_texldp(src_reg, FS_S0, FS_T0);
-		    }
-
-		    if (src_reg != FS_OC) {
-			    if (dst_format == COLR_BUF_8BIT)
-				    i915_fs_mov(FS_OC, i915_fs_operand(src_reg, W, W, W, W));
-			    else
-				    i915_fs_mov(FS_OC, i915_fs_operand_reg(src_reg));
-		    }
-	    } else {
-		    if (is_solid_mask) {
-			    i915_fs_dcl(FS_T9);
-			    mask_reg = FS_T9;
-		    } else {
-			    i915_fs_dcl(FS_T0 + t);
-			    i915_fs_dcl(FS_S0 + t);
-		    }
-
-		    if (! is_solid_src) {
-			    /* Load the source_picture texel */
-			    if (is_affine_src) {
-				    i915_fs_texld(FS_R0, FS_S0, FS_T0);
-			    } else {
-				    i915_fs_texldp(FS_R0, FS_S0, FS_T0);
-			    }
-
-			    src_reg = FS_R0;
-		    }
-
-		    if (! is_solid_mask) {
-			    /* Load the mask_picture texel */
-			    if (is_affine_mask) {
-				    i915_fs_texld(FS_R1, FS_S0 + t, FS_T0 + t);
-			    } else {
-				    i915_fs_texldp(FS_R1, FS_S0 + t, FS_T0 + t);
-			    }
-
-			    mask_reg = FS_R1;
-		    }
-
-		    if (dst_format == COLR_BUF_8BIT) {
-			    i915_fs_mul(FS_OC,
-					i915_fs_operand(src_reg, W, W, W, W),
-					i915_fs_operand(mask_reg, W, W, W, W));
-		    } else {
-			    /* If component alpha is active in the mask and the blend
-			     * operation uses the source alpha, then we know we don't
-			     * need the source value (otherwise we would have hit a
-			     * fallback earlier), so we provide the source alpha (src.A *
-			     * mask.X) as output color.
-			     * Conversely, if CA is set and we don't need the source alpha,
-			     * then we produce the source value (src.X * mask.X) and the
-			     * source alpha is unused.  Otherwise, we provide the non-CA
-			     * source value (src.X * mask.A).
-			     */
-			    if (mask_picture->componentAlpha &&
-				PICT_FORMAT_RGB(mask_picture->format)) {
-				    if (i915_blend_op[op].src_alpha) {
-					    i915_fs_mul(FS_OC,
-							i915_fs_operand(src_reg, W, W, W, W),
-							i915_fs_operand_reg(mask_reg));
-				    } else {
-					    i915_fs_mul(FS_OC,
-							i915_fs_operand_reg(src_reg),
-							i915_fs_operand_reg(mask_reg));
-				    }
-			    } else {
-				    i915_fs_mul(FS_OC,
-						i915_fs_operand_reg(src_reg),
-						i915_fs_operand(mask_reg, W, W, W, W));
-			    }
-		    }
-	    }
-
-	    FS_END();
-	}
+	if (! intel->needs_render_ca_pass)
+		i915_composite_emit_shader(intel, op);
 }
 
 void
@@ -1168,6 +1196,14 @@ i915_composite(PixmapPtr dest, int srcX, int srcY, int maskX, int maskY,
 	}
 
 	if (intel->prim_offset == 0) {
+		if (intel->needs_render_ca_pass) {
+			OUT_BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 | I1_LOAD_S(6) | 0);
+			OUT_BATCH(i915_get_blend_cntl(PictOpOutReverse,
+						      intel->render_mask_picture,
+						      intel->render_dest_picture->format));
+			i915_composite_emit_shader(intel, PictOpOutReverse);
+		}
+
 		intel->prim_offset = intel->batch_used;
 		OUT_BATCH(PRIM3D_RECTLIST | PRIM3D_INDIRECT_SEQUENTIAL);
 		OUT_BATCH(intel->vertex_index);
@@ -1192,6 +1228,16 @@ i915_vertex_flush(intel_screen_private *intel)
 	intel->batch_ptr[intel->prim_offset] |= intel->vertex_count;
 	intel->prim_offset = 0;
 
+	if (intel->needs_render_ca_pass) {
+		OUT_BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 | I1_LOAD_S(6) | 0);
+		OUT_BATCH(i915_get_blend_cntl(PictOpAdd,
+					      intel->render_mask_picture,
+					      intel->render_dest_picture->format));
+		i915_composite_emit_shader(intel, PictOpAdd);
+		OUT_BATCH(PRIM3D_RECTLIST | PRIM3D_INDIRECT_SEQUENTIAL | intel->vertex_count);
+		OUT_BATCH(intel->vertex_index);
+	}
+
 	intel->vertex_index += intel->vertex_count;
 	intel->vertex_count = 0;
 }
commit 80a9e64f50aeda6004e3aba1fbfdda50bb1f1c82
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri May 21 09:55:55 2010 +0100

    uxa: Use temporary dest when target is too large for compositor
    
    If the destination cannot fit into the 3D pipeline when we need to
    composite, we fall back to doing the operation on the CPU. This is very
    slow, and quite easy to trigger on i915 by plugging in an external
    display.
    
    An alternative is to extract the extents of the operation from the
    destination using the blitter which can usually handle much larger
    operations. This gives us a temporary target that can fit into the 3D
    pipeline and thus be accelerated, before copying back into the larger
    real destination.
    
    For x11perf this boosts glyph rendering on PineView from 38 kglyphs/s to
    480 kglyphs/s, just a little shy of the native performance of 601 kglyphs/s.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/i830.h b/src/i830.h
index f8aa824..229a4e6 100644
--- a/src/i830.h
+++ b/src/i830.h
@@ -469,8 +469,10 @@ void i830_set_gem_max_sizes(ScrnInfoPtr scrn);
 drm_intel_bo *i830_allocate_framebuffer(ScrnInfoPtr scrn);
 
 /* i830_render.c */
-Bool i830_check_composite(int op, PicturePtr source, PicturePtr mask,
-			  PicturePtr dest);
+Bool i830_check_composite(int op,
+			  PicturePtr sourcec, PicturePtr mask, PicturePtr dest,
+			  int width, int height);
+Bool i830_check_composite_target(PixmapPtr pixmap);
 Bool i830_check_composite_texture(ScreenPtr screen, PicturePtr picture);
 Bool i830_prepare_composite(int op, PicturePtr sourcec, PicturePtr mask,
 			    PicturePtr dest, PixmapPtr sourcecPixmap,
@@ -481,8 +483,10 @@ void i830_composite(PixmapPtr dest, int srcX, int srcY,
 		    int maskX, int maskY, int dstX, int dstY, int w, int h);
 void i830_done_composite(PixmapPtr dest);
 /* i915_render.c */
-Bool i915_check_composite(int op, PicturePtr sourcec, PicturePtr mask,
-			  PicturePtr dest);
+Bool i915_check_composite(int op,
+			  PicturePtr sourcec, PicturePtr mask, PicturePtr dest,
+			  int width, int height);
+Bool i915_check_composite_target(PixmapPtr pixmap);
 Bool i915_check_composite_texture(ScreenPtr screen, PicturePtr picture);
 Bool i915_prepare_composite(int op, PicturePtr sourcec, PicturePtr mask,
 			    PicturePtr dest, PixmapPtr sourcecPixmap,
@@ -496,8 +500,9 @@ void i830_batch_flush_notify(ScrnInfoPtr scrn);
 unsigned int gen4_render_state_size(ScrnInfoPtr scrn);
 void gen4_render_state_init(ScrnInfoPtr scrn);
 void gen4_render_state_cleanup(ScrnInfoPtr scrn);
-Bool i965_check_composite(int op, PicturePtr sourcec, PicturePtr mask,
-			  PicturePtr dest);
+Bool i965_check_composite(int op,
+			  PicturePtr sourcec, PicturePtr mask, PicturePtr dest,
+			  int width, int height);
 Bool i965_check_composite_texture(ScreenPtr screen, PicturePtr picture);
 Bool i965_prepare_composite(int op, PicturePtr sourcec, PicturePtr mask,
 			    PicturePtr dest, PixmapPtr sourcecPixmap,
diff --git a/src/i830_render.c b/src/i830_render.c
index b0413c5..cba65eb 100644
--- a/src/i830_render.c
+++ b/src/i830_render.c
@@ -340,8 +340,11 @@ static void i830_texture_setup(PicturePtr picture, PixmapPtr pixmap, int unit)
 }
 
 Bool
-i830_check_composite(int op, PicturePtr source_picture, PicturePtr mask_picture,
-		     PicturePtr dest_picture)
+i830_check_composite(int op,
+		     PicturePtr source_picture,
+		     PicturePtr mask_picture,
+		     PicturePtr dest_picture,
+		     int width, int height)
 {
 	ScrnInfoPtr scrn = xf86Screens[dest_picture->pDrawable->pScreen->myNum];
 	uint32_t tmp1;
@@ -373,6 +376,23 @@ i830_check_composite(int op, PicturePtr source_picture, PicturePtr mask_picture,
 		return FALSE;
 	}
 
+	if (width > 2048 || height > 2048) {
+		intel_debug_fallback(scrn, "Operation is too large (%d, %d)\n", width, height);
+		return FALSE;
+	}
+
+	return TRUE;
+}
+
+Bool
+i830_check_composite_target(PixmapPtr pixmap)
+{
+	if (pixmap->drawable.width > 2048 || pixmap->drawable.height > 2048)
+		return FALSE;
+
+	if(!intel_check_pitch_3d(pixmap))
+		return FALSE;
+
 	return TRUE;
 }
 
diff --git a/src/i830_uxa.c b/src/i830_uxa.c
index a79dde1..0a6b6f8 100644
--- a/src/i830_uxa.c
+++ b/src/i830_uxa.c
@@ -323,10 +323,10 @@ static void i830_uxa_done_solid(PixmapPtr pixmap)
  *   - support planemask using FULL_BLT_CMD?
  */
 static Bool
-i830_uxa_check_copy(DrawablePtr source, DrawablePtr dest,
+i830_uxa_check_copy(PixmapPtr source, PixmapPtr dest,
 		    int alu, Pixel planemask)
 {
-	ScrnInfoPtr scrn = xf86Screens[dest->pScreen->myNum];
+	ScrnInfoPtr scrn = xf86Screens[dest->drawable.pScreen->myNum];
 	intel_screen_private *intel = intel_get_screen_private(scrn);
 
 	if (IS_GEN6(intel)) {
@@ -335,16 +335,16 @@ i830_uxa_check_copy(DrawablePtr source, DrawablePtr dest,
 		return FALSE;
 	}
 
-	if (!UXA_PM_IS_SOLID(source, planemask)) {
+	if (!UXA_PM_IS_SOLID(&source->drawable, planemask)) {
 		intel_debug_fallback(scrn, "planemask is not solid");
 		return FALSE;
 	}
 
-	if (source->bitsPerPixel != dest->bitsPerPixel) {
+	if (source->drawable.bitsPerPixel != dest->drawable.bitsPerPixel) {
 		intel_debug_fallback(scrn, "mixed bpp copies unsupported\n");
 		return FALSE;
 	}
-	switch (source->bitsPerPixel) {
+	switch (source->drawable.bitsPerPixel) {
 	case 8:
 	case 16:
 	case 32:
@@ -353,6 +353,11 @@ i830_uxa_check_copy(DrawablePtr source, DrawablePtr dest,
 		return FALSE;
 	}
 
+	if (!intel_check_pitch_2d(source))
+		return FALSE;
+	if (!intel_check_pitch_2d(dest))
+		return FALSE;
+
 	return TRUE;
 }
 
@@ -368,11 +373,6 @@ i830_uxa_prepare_copy(PixmapPtr source, PixmapPtr dest, int xdir,
 		i830_get_pixmap_bo(dest),
 	};
 
-	if (!intel_check_pitch_2d(source))
-		return FALSE;
-	if (!intel_check_pitch_2d(dest))
-		return FALSE;
-
 	if (!i830_get_aperture_space(scrn, bo_table, ARRAY_SIZE(bo_table)))
 		return FALSE;
 
@@ -1076,6 +1076,7 @@ Bool i830_uxa_init(ScreenPtr screen)
 	/* Composite */
 	if (!IS_I9XX(intel)) {
 		intel->uxa_driver->check_composite = i830_check_composite;
+		intel->uxa_driver->check_composite_target = i830_check_composite_target;
 		intel->uxa_driver->check_composite_texture = i830_check_composite_texture;
 		intel->uxa_driver->prepare_composite = i830_prepare_composite;
 		intel->uxa_driver->composite = i830_composite;
@@ -1083,6 +1084,7 @@ Bool i830_uxa_init(ScreenPtr screen)
 	} else if (IS_I915G(intel) || IS_I915GM(intel) ||
 		   IS_I945G(intel) || IS_I945GM(intel) || IS_G33CLASS(intel)) {
 		intel->uxa_driver->check_composite = i915_check_composite;
+		intel->uxa_driver->check_composite_target = i915_check_composite_target;
 		intel->uxa_driver->check_composite_texture = i915_check_composite_texture;
 		intel->uxa_driver->prepare_composite = i915_prepare_composite;
 		intel->uxa_driver->composite = i915_composite;
diff --git a/src/i915_render.c b/src/i915_render.c
index f9988b9..4eb4046 100644
--- a/src/i915_render.c
+++ b/src/i915_render.c
@@ -184,7 +184,8 @@ Bool
 i915_check_composite(int op,
 		     PicturePtr source_picture,
 		     PicturePtr mask_picture,
-		     PicturePtr dest_picture)
+		     PicturePtr dest_picture,
+		     int width, int height)
 {
 	ScrnInfoPtr scrn = xf86Screens[dest_picture->pDrawable->pScreen->myNum];
 	uint32_t tmp1;
@@ -216,6 +217,21 @@ i915_check_composite(int op,
 		return FALSE;
 	}
 
+	if (width > 2048 || height > 2048)
+		return FALSE;
+
+	return TRUE;
+}
+
+Bool
+i915_check_composite_target(PixmapPtr pixmap)
+{
+	if (pixmap->drawable.width > 2048 || pixmap->drawable.height > 2048)
+		return FALSE;
+
+	if(!intel_check_pitch_3d(pixmap))
+		return FALSE;
+
 	return TRUE;
 }
 
diff --git a/src/i965_render.c b/src/i965_render.c
index ccfc008..ed4b4d9 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -196,8 +196,11 @@ static Bool i965_get_dest_format(PicturePtr dest_picture, uint32_t * dst_format)
 }
 
 Bool
-i965_check_composite(int op, PicturePtr source_picture, PicturePtr mask_picture,
-		     PicturePtr dest_picture)
+i965_check_composite(int op,
+		     PicturePtr source_picture,
+		     PicturePtr mask_picture,
+		     PicturePtr dest_picture,
+		     int width, int height)
 {
 	ScrnInfoPtr scrn = xf86Screens[dest_picture->pDrawable->pScreen->myNum];
 	intel_screen_private *intel = intel_get_screen_private(scrn);
diff --git a/uxa/uxa-accel.c b/uxa/uxa-accel.c
index 0528d79..77963f3 100644
--- a/uxa/uxa-accel.c
+++ b/uxa/uxa-accel.c
@@ -103,7 +103,7 @@ uxa_fill_spans(DrawablePtr pDrawable, GCPtr pGC, int n,
 		goto solid;
 	}
 
-	if (!uxa_screen->info->check_composite(PictOpSrc, src, NULL, dst)) {
+	if (!uxa_screen->info->check_composite(PictOpSrc, src, NULL, dst, 0, 0)) {
 		FreePicture(src, 0);
 		FreePicture(dst, 0);
 		goto solid;
@@ -586,17 +586,17 @@ uxa_copy_n_to_n(DrawablePtr pSrcDrawable,
 	int dst_off_x, dst_off_y;
 	PixmapPtr pSrcPixmap, pDstPixmap;
 
-	if (uxa_screen->info->check_copy &&
-	    !uxa_screen->info->check_copy(pSrcDrawable, pDstDrawable,
-					  pGC ? pGC->alu : GXcopy,
-					  pGC ? pGC->planemask : FB_ALLONES))
-		goto fallback;
-
 	pSrcPixmap = uxa_get_drawable_pixmap(pSrcDrawable);
 	pDstPixmap = uxa_get_drawable_pixmap(pDstDrawable);
 	if (!pSrcPixmap || !pDstPixmap)
 		goto fallback;
 
+	if (uxa_screen->info->check_copy &&
+	    !uxa_screen->info->check_copy(pSrcPixmap, pDstPixmap,
+					  pGC ? pGC->alu : GXcopy,
+					  pGC ? pGC->planemask : FB_ALLONES))
+		goto fallback;
+
 	uxa_get_drawable_deltas(pSrcDrawable, pSrcPixmap, &src_off_x,
 				&src_off_y);
 	uxa_get_drawable_deltas(pDstDrawable, pDstPixmap, &dst_off_x,
@@ -1121,20 +1121,16 @@ uxa_fill_region_tiled(DrawablePtr pDrawable,
 					     uxa_get_pixmap_first_pixel(pTile),
 					     planemask, alu);
 
+	pPixmap = uxa_get_offscreen_pixmap(pDrawable, &xoff, &yoff);
+	if (!pPixmap || !uxa_pixmap_is_offscreen(pTile))
+		goto out;
+
 	if (uxa_screen->info->check_copy &&
-	    !uxa_screen->info->check_copy(&pTile->drawable, pDrawable, alu, planemask))
+	    !uxa_screen->info->check_copy(pTile, pPixmap, alu, planemask))
 		return FALSE;
 
-
-	pPixmap = uxa_get_drawable_pixmap(pDrawable);
-	uxa_get_drawable_deltas(pDrawable, pPixmap, &xoff, &yoff);
 	REGION_TRANSLATE(pScreen, pRegion, xoff, yoff);
 
-	pPixmap = uxa_get_offscreen_pixmap(pDrawable, &xoff, &yoff);
-
-	if (!pPixmap || !uxa_pixmap_is_offscreen(pTile))
-		goto out;
-
 	if ((*uxa_screen->info->prepare_copy) (pTile, pPixmap, 1, 1, alu,
 					       planemask)) {
 		while (nbox--) {
diff --git a/uxa/uxa-glyphs.c b/uxa/uxa-glyphs.c
index 921297a..93a738e 100644
--- a/uxa/uxa-glyphs.c
+++ b/uxa/uxa-glyphs.c
@@ -656,7 +656,7 @@ uxa_glyphs_try_driver_composite(CARD8 op,
 	int nrect;
 
 	if (uxa_screen->info->check_composite &&
-	    !(*uxa_screen->info->check_composite) (op, pSrc, buffer->source, pDst)) {
+	    !(*uxa_screen->info->check_composite) (op, pSrc, buffer->source, pDst, 0, 0)) {
 		return -1;
 	}
 
@@ -787,7 +787,7 @@ uxa_glyphs_try_driver_add_to_mask(PicturePtr pDst,
 	int nrect;
 
 	if (uxa_screen->info->check_composite &&
-	    !(*uxa_screen->info->check_composite) (PictOpAdd, buffer->source, NULL, pDst)) {
+	    !(*uxa_screen->info->check_composite) (PictOpAdd, buffer->source, NULL, pDst, 0, 0)) {
 		return -1;
 	}
 
@@ -1109,13 +1109,12 @@ uxa_glyphs(CARD8 op,
 	   PicturePtr pSrc,
 	   PicturePtr pDst,
 	   PictFormatPtr maskFormat,
-	   INT16 xSrc,
-	   INT16 ySrc, int nlist, GlyphListPtr list, GlyphPtr * glyphs)
+	   INT16 xSrc, INT16 ySrc,
+	   int nlist, GlyphListPtr list, GlyphPtr * glyphs)
 {
-	ScreenPtr pScreen = pDst->pDrawable->pScreen;
-	uxa_screen_t *uxa_screen = uxa_get_screen(pScreen);
-	PixmapPtr pMaskPixmap = 0;
-	PicturePtr pMask;
+	ScreenPtr screen = pDst->pDrawable->pScreen;
+	uxa_screen_t *uxa_screen = uxa_get_screen(screen);
+	PicturePtr pMask = NULL;
 	int width = 0, height = 0;
 	int x, y;
 	int xDst = list->xOff, yDst = list->yOff;
@@ -1123,13 +1122,16 @@ uxa_glyphs(CARD8 op,
 	GlyphPtr glyph;
 	int error;
 	BoxRec extents = { 0, 0, 0, 0 };
+	Bool have_extents = FALSE;
 	CARD32 component_alpha;
 	uxa_glyph_buffer_t buffer;
+	PicturePtr localDst = pDst;
 
 	if (!uxa_screen->info->prepare_composite ||
 	    uxa_screen->swappedOut ||
 	    !uxa_drawable_is_offscreen(pDst->pDrawable) ||
 	    pDst->alphaMap || pSrc->alphaMap) {
+fallback:
 	    uxa_check_glyphs(op, pSrc, pDst, maskFormat, xSrc, ySrc, nlist, list, glyphs);
 	    return;
 	}
@@ -1161,55 +1163,129 @@ uxa_glyphs(CARD8 op,
 		}
 	}
 
-	if (maskFormat) {
-		GCPtr pGC;
-		xRectangle rect;
+	x = y = 0;
+	if (!maskFormat &&
+	    uxa_screen->info->check_composite_target &&
+	    !uxa_screen->info->check_composite_target(uxa_get_drawable_pixmap(pDst->pDrawable))) {
+		int depth = pDst->pDrawable->depth;
+		PixmapPtr pixmap;
+		int error;
+		GCPtr gc;
+
+		pixmap = uxa_get_drawable_pixmap(pDst->pDrawable);
+		if (uxa_screen->info->check_copy &&
+		    !uxa_screen->info->check_copy(pixmap, pixmap, GXcopy, FB_ALLONES))
+			goto fallback;
 
 		uxa_glyph_extents(nlist, list, glyphs, &extents);
 
+		/* clip against dst bounds */
+		if (extents.x1 < 0)
+			extents.x1 = 0;
+		if (extents.y1 < 0)
+			extents.y1 = 0;
+		if (extents.x2 > pDst->pDrawable->width)
+			extents.x2 = pDst->pDrawable->width;
+		if (extents.y2 > pDst->pDrawable->height)
+			extents.y2 = pDst->pDrawable->height;
+
 		if (extents.x2 <= extents.x1 || extents.y2 <= extents.y1)
 			return;
-		width = extents.x2 - extents.x1;
+		width  = extents.x2 - extents.x1;
 		height = extents.y2 - extents.y1;
+		x = -extents.x1;
+		y = -extents.y1;
+		have_extents = TRUE;
+
+		xDst += x;
+		yDst += y;
+
+		pixmap = screen->CreatePixmap(screen,
+					      width, height, depth,
+					      CREATE_PIXMAP_USAGE_SCRATCH);
+		if (!pixmap)
+			return;
+
+		gc = GetScratchGC(depth, screen);
+		if (!gc) {
+			screen->DestroyPixmap(pixmap);
+			return;
+		}
+
+		ValidateGC(&pixmap->drawable, gc);
+		gc->ops->CopyArea(pDst->pDrawable, &pixmap->drawable, gc,
+				  extents.x1, extents.y1,
+				  width, height,
+				  0, 0);
+		FreeScratchGC(gc);
+
+		localDst = CreatePicture(0, &pixmap->drawable,
+					 PictureMatchFormat(screen, depth, pDst->format),
+					 0, 0, serverClient, &error);
+		screen->DestroyPixmap(pixmap);
+
+		if (!localDst)
+			return;
+
+		ValidatePicture(localDst);
+	}
+
+	if (maskFormat) {
+		PixmapPtr pixmap;
+		GCPtr gc;
+		xRectangle rect;
+
+		if (!have_extents) {
+			uxa_glyph_extents(nlist, list, glyphs, &extents);
+
+			if (extents.x2 <= extents.x1 || extents.y2 <= extents.y1)
+				return;
+			width  = extents.x2 - extents.x1;
+			height = extents.y2 - extents.y1;
+			x = -extents.x1;
+			y = -extents.y1;
+			have_extents = TRUE;
+		}
 
 		if (maskFormat->depth == 1) {
 			PictFormatPtr a8Format =
-			    PictureMatchFormat(pScreen, 8, PICT_a8);
+			    PictureMatchFormat(screen, 8, PICT_a8);
 
 			if (a8Format)
 				maskFormat = a8Format;
 		}
 
-		pMaskPixmap = (*pScreen->CreatePixmap) (pScreen, width, height,
-							maskFormat->depth,
-							CREATE_PIXMAP_USAGE_SCRATCH);
-		if (!pMaskPixmap)
+		pixmap = screen->CreatePixmap(screen, width, height,
+					      maskFormat->depth,
+					      CREATE_PIXMAP_USAGE_SCRATCH);
+		if (!pixmap) {
+			if (localDst != pDst)
+				FreePicture(localDst, 0);
 			return;
+		}
+
+		gc = GetScratchGC(pixmap->drawable.depth, screen);
+		ValidateGC(&pixmap->drawable, gc);
+		rect.x = 0;
+		rect.y = 0;
+		rect.width = width;
+		rect.height = height;
+		gc->ops->PolyFillRect(&pixmap->drawable, gc, 1, &rect);
+		FreeScratchGC(gc);
+
 		component_alpha = NeedsComponent(maskFormat->format);
-		pMask = CreatePicture(0, &pMaskPixmap->drawable,
+		pMask = CreatePicture(0, &pixmap->drawable,
 				      maskFormat, CPComponentAlpha,
 				      &component_alpha, serverClient, &error);
+		screen->DestroyPixmap(pixmap);
+
 		if (!pMask) {
-			(*pScreen->DestroyPixmap) (pMaskPixmap);
+			if (localDst != pDst)
+				FreePicture(localDst, 0);
 			return;
 		}
-		pGC = GetScratchGC(pMaskPixmap->drawable.depth, pScreen);
-		ValidateGC(&pMaskPixmap->drawable, pGC);
-		rect.x = 0;
-		rect.y = 0;
-		rect.width = width;
-		rect.height = height;
-		(*pGC->ops->PolyFillRect) (&pMaskPixmap->drawable, pGC, 1,
-					   &rect);
-		FreeScratchGC(pGC);
-		x = -extents.x1;
-		y = -extents.y1;
 
 		ValidatePicture(pMask);
-	} else {
-		pMask = pDst;
-		x = 0;
-		y = 0;
 	}
 
 	buffer.count = 0;
@@ -1222,19 +1298,19 @@ uxa_glyphs(CARD8 op,
 			glyph = *glyphs++;
 
 			if (glyph->info.width > 0 && glyph->info.height > 0 &&
-			    uxa_buffer_glyph(pScreen, &buffer, glyph, x,
+			    uxa_buffer_glyph(screen, &buffer, glyph, x,
 					     y) == UXA_GLYPH_NEED_FLUSH) {
 				if (maskFormat)
 					uxa_glyphs_to_mask(pMask, &buffer);
 				else
-					uxa_glyphs_to_dst(op, pSrc, pDst,
+					uxa_glyphs_to_dst(op, pSrc, localDst,
 							  &buffer, xSrc, ySrc,
 							  xDst, yDst);
 
 				buffer.count = 0;
 				buffer.source = NULL;
 
-				uxa_buffer_glyph(pScreen, &buffer, glyph, x, y);
+				uxa_buffer_glyph(screen, &buffer, glyph, x, y);
 			}
 
 			x += glyph->info.xOff;
@@ -1247,20 +1323,41 @@ uxa_glyphs(CARD8 op,
 		if (maskFormat)
 			uxa_glyphs_to_mask(pMask, &buffer);
 		else
-			uxa_glyphs_to_dst(op, pSrc, pDst, &buffer,
+			uxa_glyphs_to_dst(op, pSrc, localDst, &buffer,
 					  xSrc, ySrc, xDst, yDst);
 	}
 
 	if (maskFormat) {
-		x = extents.x1;
-		y = extents.y1;
+		if (localDst == pDst) {
+			x = extents.x1;
+			y = extents.y1;
+		} else
+			x = y = 0;
 		CompositePicture(op,
 				 pSrc,
 				 pMask,
-				 pDst,
+				 localDst,
 				 xSrc + x - xDst,
-				 ySrc + y - yDst, 0, 0, x, y, width, height);
-		FreePicture((pointer) pMask, (XID) 0);
-		(*pScreen->DestroyPixmap) (pMaskPixmap);
+				 ySrc + y - yDst,
+				 0, 0,
+				 x, y,
+				 width, height);
+		FreePicture(pMask, 0);
+	}
+
+	if (localDst != pDst) {
+		GCPtr gc;
+
+		gc = GetScratchGC(pDst->pDrawable->depth, screen);
+		if (gc) {
+			ValidateGC(pDst->pDrawable, gc);
+			gc->ops->CopyArea(localDst->pDrawable, pDst->pDrawable, gc,
+					  0, 0,
+					  width, height,
+					  extents.x1, extents.y1);
+			FreeScratchGC(gc);
+		}
+
+		FreePicture(localDst, 0);
 	}
 }
diff --git a/uxa/uxa-render.c b/uxa/uxa-render.c
index c866b85..ec59871 100644
--- a/uxa/uxa-render.c
+++ b/uxa/uxa-render.c
@@ -788,7 +788,9 @@ uxa_acquire_drawable(ScreenPtr pScreen,
 		}
 	}
 
-	pPixmap = pScreen->CreatePixmap(pScreen, width, height, depth, 0);
+	pPixmap = pScreen->CreatePixmap(pScreen,
+					width, height, depth,
+					CREATE_PIXMAP_USAGE_SCRATCH);
 	if (!pPixmap)
 		return 0;
 
@@ -935,7 +937,7 @@ uxa_solid_rects (CARD8		op,
 	uxa_screen_t *uxa_screen = uxa_get_screen(screen);
 	PixmapPtr dst_pixmap, src_pixmap = NULL;
 	pixman_region16_t region;
-	pixman_box16_t *boxes;
+	pixman_box16_t *boxes, *extents;
 	PicturePtr src;
 	int dst_x, dst_y;
 	int num_boxes;
@@ -960,6 +962,7 @@ uxa_solid_rects (CARD8		op,
 
 	pixman_region_translate(&region, dst_x, dst_y);
 	boxes = pixman_region_rectangles(&region, &num_boxes);
+	extents = pixman_region_extents (&region);
 
 	if (op == PictOpClear)
 		color->red = color->green = color->blue = color->alpha = 0;
@@ -970,6 +973,7 @@ uxa_solid_rects (CARD8		op,
 	if (num_boxes == 1 && (op == PictOpSrc || op == PictOpClear)) {
 		CARD32 pixel;
 
+try_solid:
 		if (uxa_screen->info->check_solid &&
 		    !uxa_screen->info->check_solid(&dst_pixmap->drawable, GXcopy, FB_ALLONES))
 			goto err_region;
@@ -985,9 +989,12 @@ uxa_solid_rects (CARD8		op,
 		if (!uxa_screen->info->prepare_solid(dst_pixmap, GXcopy, FB_ALLONES, pixel))
 			goto err_region;
 
-		uxa_screen->info->solid(dst_pixmap,
-					boxes->x1, boxes->y1,
-					boxes->x2, boxes->y2);
+		while (num_boxes--) {
+			uxa_screen->info->solid(dst_pixmap,
+						boxes->x1, boxes->y1,
+						boxes->x2, boxes->y2);
+			boxes++;
+		}
 
 		uxa_screen->info->done_solid(dst_pixmap);
 	} else {
@@ -997,8 +1004,16 @@ uxa_solid_rects (CARD8		op,
 		if (!src)
 			goto err_region;
 
-		if (!uxa_screen->info->check_composite(op, src, NULL, dst))
+		if (!uxa_screen->info->check_composite(op, src, NULL, dst,
+						       extents->x2 - extents->x1,
+						       extents->y2 - extents->y1)) {
+			if (op == PictOpSrc || op == PictOpClear) {
+				FreePicture(src, 0);
+				goto try_solid;
+			}
+
 			goto err_src;
+		}
 
 		if (!uxa_screen->info->check_composite_texture ||
 		    !uxa_screen->info->check_composite_texture(screen, src)) {
@@ -1053,53 +1068,109 @@ uxa_try_driver_composite(CARD8 op,
 			 INT16 xDst, INT16 yDst,
 			 CARD16 width, CARD16 height)
 {
-	uxa_screen_t *uxa_screen = uxa_get_screen(pDst->pDrawable->pScreen);
+	ScreenPtr screen = pDst->pDrawable->pScreen;
+	uxa_screen_t *uxa_screen = uxa_get_screen(screen);
 	RegionRec region;
 	BoxPtr pbox;
 	int nbox;
+	int xDst_copy, yDst_copy;
 	int src_off_x, src_off_y, mask_off_x, mask_off_y, dst_off_x, dst_off_y;
 	PixmapPtr pSrcPix, pMaskPix = NULL, pDstPix;
 	PicturePtr localSrc, localMask = NULL;
+	PicturePtr localDst = pDst;
 
 	if (uxa_screen->info->check_composite &&
-	    !(*uxa_screen->info->check_composite) (op, pSrc, pMask, pDst))
+	    !(*uxa_screen->info->check_composite) (op, pSrc, pMask, pDst, width, height))
 		return -1;
 
+	if (uxa_screen->info->check_composite_target &&
+	    !uxa_screen->info->check_composite_target(uxa_get_drawable_pixmap(pDst->pDrawable))) {
+		int depth = pDst->pDrawable->depth;
+		PixmapPtr pixmap;
+		int error;
+		GCPtr gc;
+
+		pixmap = uxa_get_drawable_pixmap(pDst->pDrawable);
+		if (uxa_screen->info->check_copy &&
+		    !uxa_screen->info->check_copy(pixmap, pixmap, GXcopy, FB_ALLONES))
+			return -1;
+
+		pixmap = screen->CreatePixmap(screen,
+					      width, height, depth,
+					      CREATE_PIXMAP_USAGE_SCRATCH);
+		if (!pixmap)
+			return 0;
+
+		gc = GetScratchGC(depth, screen);
+		if (!gc) {
+			screen->DestroyPixmap(pixmap);
+			return 0;
+		}
+
+		ValidateGC(&pixmap->drawable, gc);
+		gc->ops->CopyArea(pDst->pDrawable, &pixmap->drawable, gc,
+				  xDst, yDst, width, height, 0, 0);
+		FreeScratchGC(gc);
+
+		xDst_copy = xDst; xDst = 0;
+		yDst_copy = yDst; yDst = 0;
+
+		localDst = CreatePicture(0, &pixmap->drawable,
+					 PictureMatchFormat(screen, depth, pDst->format),
+					 0, 0, serverClient, &error);
+		screen->DestroyPixmap(pixmap);
+
+		if (!localDst)
+			return 0;
+
+		ValidatePicture(localDst);
+	}
+
 	pDstPix =
-	    uxa_get_offscreen_pixmap(pDst->pDrawable, &dst_off_x, &dst_off_y);
-	if (!pDstPix)
+	    uxa_get_offscreen_pixmap(localDst->pDrawable, &dst_off_x, &dst_off_y);
+	if (!pDstPix) {
+		if (localDst != pDst)
+			FreePicture(localDst, 0);
 		return -1;
+	}
 
-	xDst += pDst->pDrawable->x;
-	yDst += pDst->pDrawable->y;
+	xDst += localDst->pDrawable->x;
+	yDst += localDst->pDrawable->y;
 
-	localSrc = uxa_acquire_source(pDst->pDrawable->pScreen, pSrc,
+	localSrc = uxa_acquire_source(screen, pSrc,
 				      xSrc, ySrc,
 				      width, height,
 				      &xSrc, &ySrc);
-	if (!localSrc)
+	if (!localSrc) {
+		if (localDst != pDst)
+			FreePicture(localDst, 0);
 		return 0;
+	}
 
 	if (pMask) {
-		localMask = uxa_acquire_mask(pDst->pDrawable->pScreen, pMask,
+		localMask = uxa_acquire_mask(screen, pMask,
 					     xMask, yMask,
 					     width, height,
 					     &xMask, &yMask);
 		if (!localMask) {
 			if (localSrc != pSrc)
 				FreePicture(localSrc, 0);
+			if (localDst != pDst)
+				FreePicture(localDst, 0);
 
 			return 0;
 		}
 	}
 
-	if (!miComputeCompositeRegion(&region, localSrc, localMask, pDst,
+	if (!miComputeCompositeRegion(&region, localSrc, localMask, localDst,
 				      xSrc, ySrc, xMask, yMask, xDst, yDst,
 				      width, height)) {
 		if (localSrc != pSrc)
 			FreePicture(localSrc, 0);
 		if (localMask && localMask != pMask)
 			FreePicture(localMask, 0);
+		if (localDst != pDst)
+			FreePicture(localDst, 0);
 
 		return 1;
 	}
@@ -1108,12 +1179,14 @@ uxa_try_driver_composite(CARD8 op,
 		pSrcPix = uxa_get_offscreen_pixmap(localSrc->pDrawable,
 						   &src_off_x, &src_off_y);
 		if (!pSrcPix) {
-			REGION_UNINIT(pDst->pDrawable->pScreen, &region);
+			REGION_UNINIT(screen, &region);
 
 			if (localSrc != pSrc)
 				FreePicture(localSrc, 0);
 			if (localMask && localMask != pMask)
 				FreePicture(localMask, 0);
+			if (localDst != pDst)
+				FreePicture(localDst, 0);
 
 			return 0;
 		}
@@ -1126,12 +1199,14 @@ uxa_try_driver_composite(CARD8 op,
 			pMaskPix = uxa_get_offscreen_pixmap(localMask->pDrawable,
 							    &mask_off_x, &mask_off_y);
 			if (!pMaskPix) {
-				REGION_UNINIT(pDst->pDrawable->pScreen, &region);
+				REGION_UNINIT(screen, &region);
 
 				if (localSrc != pSrc)
 					FreePicture(localSrc, 0);
 				if (localMask && localMask != pMask)
 					FreePicture(localMask, 0);
+				if (localDst != pDst)
+					FreePicture(localDst, 0);
 
 				return 0;
 			}
@@ -1141,13 +1216,15 @@ uxa_try_driver_composite(CARD8 op,
 	}
 
 	if (!(*uxa_screen->info->prepare_composite)
-	    (op, localSrc, localMask, pDst, pSrcPix, pMaskPix, pDstPix)) {
-		REGION_UNINIT(pDst->pDrawable->pScreen, &region);
+	    (op, localSrc, localMask, localDst, pSrcPix, pMaskPix, pDstPix)) {
+		REGION_UNINIT(screen, &region);
 
 		if (localSrc != pSrc)
 			FreePicture(localSrc, 0);
 		if (localMask && localMask != pMask)
 			FreePicture(localMask, 0);
+		if (localDst != pDst)
+			FreePicture(localDst, 0);
 
 		return -1;
 	}
@@ -1176,13 +1253,27 @@ uxa_try_driver_composite(CARD8 op,
 	}
 	(*uxa_screen->info->done_composite) (pDstPix);
 
-	REGION_UNINIT(pDst->pDrawable->pScreen, &region);
+	REGION_UNINIT(screen, &region);
 
 	if (localSrc != pSrc)
 		FreePicture(localSrc, 0);
 	if (localMask && localMask != pMask)
 		FreePicture(localMask, 0);
 
+	if (localDst != pDst) {
+		GCPtr gc;
+
+		gc = GetScratchGC(pDst->pDrawable->depth, screen);
+		if (gc) {
+			ValidateGC(pDst->pDrawable, gc);
+			gc->ops->CopyArea(localDst->pDrawable, pDst->pDrawable, gc,
+					  0, 0, width, height, xDst_copy, yDst_copy);
+			FreeScratchGC(gc);
+		}
+
+		FreePicture(localDst, 0);
+	}
+
 	return 1;
 }
 
@@ -1240,36 +1331,100 @@ uxa_try_magic_two_pass_composite_helper(CARD8 op,
 					PicturePtr pSrc,
 					PicturePtr pMask,
 					PicturePtr pDst,
-					INT16 xSrc,
-					INT16 ySrc,
-					INT16 xMask,
-					INT16 yMask,
-					INT16 xDst,
-					INT16 yDst, CARD16 width, CARD16 height)
+					INT16 xSrc, INT16 ySrc,
+					INT16 xMask, INT16 yMask,
+					INT16 xDst, INT16 yDst,
+					CARD16 width, CARD16 height)
 {
-	uxa_screen_t *uxa_screen = uxa_get_screen(pDst->pDrawable->pScreen);
+	ScreenPtr screen = pDst->pDrawable->pScreen;
+	uxa_screen_t *uxa_screen = uxa_get_screen(screen);
+	PicturePtr localDst = pDst;
+	int xDst_copy, yDst_copy;
 
 	assert(op == PictOpOver);
 
 	if (uxa_screen->info->check_composite &&
 	    (!(*uxa_screen->info->check_composite) (PictOpOutReverse, pSrc,
-						    pMask, pDst)
+						    pMask, pDst, width, height)
 	     || !(*uxa_screen->info->check_composite) (PictOpAdd, pSrc, pMask,
-						       pDst))) {
+						       pDst, width, height))) {
 		return -1;
 	}
 
+	if (uxa_screen->info->check_composite_target &&
+	    !uxa_screen->info->check_composite_target(uxa_get_drawable_pixmap(pDst->pDrawable))) {
+		int depth = pDst->pDrawable->depth;
+		PixmapPtr pixmap;
+		int error;
+		GCPtr gc;
+
+		pixmap = uxa_get_drawable_pixmap(pDst->pDrawable);
+		if (uxa_screen->info->check_copy &&
+		    !uxa_screen->info->check_copy(pixmap, pixmap, GXcopy, FB_ALLONES))
+			return -1;
+
+		pixmap = screen->CreatePixmap(screen,
+					      width, height, depth,
+					      CREATE_PIXMAP_USAGE_SCRATCH);
+		if (!pixmap)
+			return 0;
+
+		gc = GetScratchGC(depth, screen);
+		if (!gc) {
+			screen->DestroyPixmap(pixmap);
+			return 0;
+		}
+
+		ValidateGC(&pixmap->drawable, gc);
+		gc->ops->CopyArea(pDst->pDrawable, &pixmap->drawable, gc,
+				  xDst, yDst, width, height, 0, 0);
+		FreeScratchGC(gc);
+
+		xDst_copy = xDst; xDst = 0;
+		yDst_copy = yDst; yDst = 0;
+
+		localDst = CreatePicture(0, &pixmap->drawable,
+					 PictureMatchFormat(screen, depth, pDst->format),
+					 0, 0, serverClient, &error);
+		screen->DestroyPixmap(pixmap);
+
+		if (!localDst)
+			return 0;
+
+		ValidatePicture(localDst);
+	}
+
 	/* Now, we think we should be able to accelerate this operation. First,
 	 * composite the destination to be the destination times the source alpha
 	 * factors.
 	 */
-	uxa_composite(PictOpOutReverse, pSrc, pMask, pDst, xSrc, ySrc, xMask,
-		      yMask, xDst, yDst, width, height);
+	uxa_composite(PictOpOutReverse, pSrc, pMask, localDst,
+		      xSrc, ySrc,
+		      xMask, yMask,
+		      xDst, yDst,
+		      width, height);
 
 	/* Then, add in the source value times the destination alpha factors (1.0).
 	 */
-	uxa_composite(PictOpAdd, pSrc, pMask, pDst, xSrc, ySrc, xMask, yMask,
-		      xDst, yDst, width, height);
+	uxa_composite(PictOpAdd, pSrc, pMask, localDst,
+		      xSrc, ySrc,
+		      xMask, yMask,
+		      xDst, yDst,
+		      width, height);
+
+	if (localDst != pDst) {
+		GCPtr gc;
+
+		gc = GetScratchGC(pDst->pDrawable->depth, screen);
+		if (gc) {
+			ValidateGC(pDst->pDrawable, gc);
+			gc->ops->CopyArea(localDst->pDrawable, pDst->pDrawable, gc,
+					0, 0, width, height, xDst_copy, yDst_copy);
+			FreeScratchGC(gc);
+		}
+
+		FreePicture(localDst, 0);
+	}
 
 	return 1;
 }
@@ -1325,10 +1480,10 @@ uxa_composite(CARD8 op,
 	      PicturePtr pSrc,
 	      PicturePtr pMask,
 	      PicturePtr pDst,
-	      INT16 xSrc,
-	      INT16 ySrc,
-	      INT16 xMask,
-	      INT16 yMask, INT16 xDst, INT16 yDst, CARD16 width, CARD16 height)
+	      INT16 xSrc, INT16 ySrc,
+	      INT16 xMask, INT16 yMask,
+	      INT16 xDst, INT16 yDst,
+	      CARD16 width, CARD16 height)
 {
 	uxa_screen_t *uxa_screen = uxa_get_screen(pDst->pDrawable->pScreen);
 	int ret = -1;
@@ -1343,9 +1498,11 @@ uxa_composite(CARD8 op,
 	if (!uxa_drawable_is_offscreen(pDst->pDrawable))
 		goto fallback;
 
+
 	if (pDst->alphaMap || pSrc->alphaMap || (pMask && pMask->alphaMap))
 		goto fallback;
 
+
 	/* Remove repeat in source if useless */
 	if (pSrc->pDrawable && pSrc->repeat && pSrc->filter != PictFilterConvolution &&
 	    transform_is_integer_translation(pSrc->transform, &tx, &ty) &&
diff --git a/uxa/uxa.h b/uxa/uxa.h
index bf7ec0b..23f9465 100644
--- a/uxa/uxa.h
+++ b/uxa/uxa.h
@@ -151,9 +151,7 @@ typedef struct _UxaDriver {
 	/**
 	 * check_copy() checks whether the driver can blit between the two Pictures
 	 */
-	Bool(*check_copy) (DrawablePtr pSrcDrawable,
-			   DrawablePtr pDstDrawable,
-			   int alu, Pixel planemask);
+	Bool(*check_copy) (PixmapPtr pSrc, PixmapPtr pDst, int alu, Pixel planemask);
 	/**
 	 * prepare_copy() sets up the driver for doing a copy within video
 	 * memory.
@@ -249,6 +247,8 @@ typedef struct _UxaDriver {
 	 * @param pSrcPicture source Picture
 	 * @param pMaskPicture mask picture
 	 * @param pDstPicture destination Picture
+	 * @param width The width of the composite operation
+	 * @param height The height of the composite operation
 	 *
 	 * The check_composite() call checks if the driver could handle
 	 * acceleration of op with the given source, mask, and destination
@@ -266,7 +266,19 @@ typedef struct _UxaDriver {
 	Bool(*check_composite) (int op,
 				PicturePtr pSrcPicture,
 				PicturePtr pMaskPicture,
-				PicturePtr pDstPicture);
+				PicturePtr pDstPicture,
+				int width, int height);
+
+	/**
+	 * check_composite_target() checks to see if the destination of the composite
+	 * operation can be used without modification.
+	 *
+	 * @param pixmap Destination Pixmap
+	 *
+	 * The check_composite_target() call is recommended if prepare_composite() is
+	 * implemented, but is not required.
+	 */
+	Bool(*check_composite_target) (PixmapPtr pixmap);
 
 	/**
 	 * check_composite_texture() checks to see if a source to the composite
commit 91f560034fc2695680d1208a78fc56d814b0da79
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu May 20 10:26:59 2010 +0100

    uxa: Composite glyphs directly onto dst when possible.
    
    Compositing directly onto the destination, without using a mask,
    takes us from 580 kglyphs/s to 850 kglyphs/s on i945 [x11perf -aa10text].
    
    However, the extra intersection check almost entirely cancels out the
    speed up and we discover that the glyphs in x11perf are always
    overlapping. Nothing is ever easy.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/uxa/uxa-glyphs.c b/uxa/uxa-glyphs.c
index f8bb7f5..921297a 100644
--- a/uxa/uxa-glyphs.c
+++ b/uxa/uxa-glyphs.c
@@ -205,6 +205,8 @@ static Bool uxa_realize_glyph_caches(ScreenPtr pScreen, unsigned int format)
 	if (!pPicture)
 		return FALSE;
 
+	ValidatePicture(pPicture);
+
 	/* And store the picture in all the caches for the format */
 
 	for (i = 0; i < UXA_NUM_GLYPH_CACHES; i++) {
@@ -573,41 +575,294 @@ uxa_buffer_glyph(ScreenPtr pScreen,
 	return UXA_GLYPH_SUCCESS;
 }
 
-static void uxa_glyphs_to_mask(PicturePtr pMask, uxa_glyph_buffer_t * buffer)
+static PicturePtr
+uxa_glyphs_acquire_source(ScreenPtr screen,
+			  PicturePtr src,
+			  INT16 x, INT16 y,
+			  const uxa_glyph_buffer_t * buffer,
+			  INT16 * out_x, INT16 * out_y)
+{
+	uxa_screen_t *uxa_screen = uxa_get_screen(screen);
+	int x1, y1, x2, y2;
+	int width, height;
+	int i;
+
+	if (uxa_screen->info->check_composite_texture &&
+	    uxa_screen->info->check_composite_texture(screen, src)) {
+		if (src->pDrawable) {
+			*out_x = x + src->pDrawable->x;
+			*out_y = y + src->pDrawable->y;
+		} else {
+			*out_x = x;
+			*out_y = y;
+		}
+		return src;
+	}
+
+	for (i = 0; i < buffer->count; i++) {
+	    const uxa_composite_rect_t *r = &buffer->rects[i];
+
+	    if (r->xDst < x1)
+		x1 = r->xDst;
+	    if (r->xDst + r->width > x2)
+		x2 = r->xDst + r->width;
+
+	    if (r->yDst < y1)
+		y1 = r->yDst;
+	    if (r->yDst + r->height > y2)
+		y2 = r->yDst + r->height;
+	}
+
+	width  = x2 - x1;
+	height = y2 - y1;
+
+	if (src->pDrawable) {
+		PicturePtr dst;
+
+		dst = uxa_acquire_drawable(screen, src,
+					   x, y,
+					   width, height,
+					   out_x, out_y);
+		if (uxa_screen->info->check_composite_texture &&
+		    !uxa_screen->info->check_composite_texture(screen, dst)) {
+			if (dst != src)
+				FreePicture(dst, 0);
+			return 0;
+		}
+
+		return dst;
+	}
+
+	*out_x = 0;
+	*out_y = 0;
+	return uxa_acquire_pattern(screen, src,
+				   PICT_a8r8g8b8, x, y, width, height);
+}
+
+static int
+uxa_glyphs_try_driver_composite(CARD8 op,
+				PicturePtr pSrc,
+				PicturePtr pDst,
+				const uxa_glyph_buffer_t * buffer,
+				INT16 xSrc, INT16 ySrc,
+				INT16 xDst, INT16 yDst)
 {
-	uxa_composite_rects(PictOpAdd, buffer->source, pMask,
-			    buffer->count, buffer->rects);
+	ScreenPtr screen = pDst->pDrawable->pScreen;
+	uxa_screen_t *uxa_screen = uxa_get_screen(screen);
+	PicturePtr localSrc;
+	int src_off_x = 0, src_off_y = 0, mask_off_x, mask_off_y, dst_off_x, dst_off_y;
+	PixmapPtr pSrcPix = NULL, pMaskPix, pDstPix;
+	const uxa_composite_rect_t *rects;
+	int nrect;
+
+	if (uxa_screen->info->check_composite &&
+	    !(*uxa_screen->info->check_composite) (op, pSrc, buffer->source, pDst)) {
+		return -1;
+	}
 
-	buffer->count = 0;
-	buffer->source = NULL;
+	pDstPix =
+	    uxa_get_offscreen_pixmap(pDst->pDrawable, &dst_off_x, &dst_off_y);
+
+	pMaskPix =
+	    uxa_get_offscreen_pixmap(buffer->source->pDrawable, &mask_off_x, &mask_off_y);
+	if(!pMaskPix)
+		return -1;
+
+	localSrc = uxa_glyphs_acquire_source(screen, pSrc,
+					     xSrc, ySrc,
+					     buffer,
+					     &xSrc, &ySrc);
+	if (!localSrc)
+		return 0;
+
+	if (localSrc->pDrawable) {
+		pSrcPix =
+			uxa_get_offscreen_pixmap(localSrc->pDrawable, &src_off_x, &src_off_y);
+		if (!pSrcPix) {
+			if (localSrc != pSrc)
+				FreePicture(localSrc, 0);
+			return 0;
+		}
+
+		xSrc += localSrc->pDrawable->x;
+		ySrc += localSrc->pDrawable->y;
+	}
+
+	if (!(*uxa_screen->info->prepare_composite)
+	    (op, localSrc, buffer->source, pDst, pSrcPix, pMaskPix, pDstPix)) {
+		if (localSrc != pSrc)
+			FreePicture(localSrc, 0);
+		return -1;
+	}
+
+	nrect = buffer->count;
+	rects = buffer->rects;
+	do {
+		INT16 _xDst = rects->xDst + pDst->pDrawable->x;
+		INT16 _yDst = rects->yDst + pDst->pDrawable->y;
+		INT16 _xMask = rects->xSrc + buffer->source->pDrawable->x;
+		INT16 _yMask = rects->ySrc + buffer->source->pDrawable->y;
+		INT16 _xSrc = xSrc, _ySrc = ySrc;
+
+		RegionRec region;
+		BoxPtr pbox;
+		int nbox;
+
+		if (!miComputeCompositeRegion(&region,
+					      localSrc, buffer->source, pDst,
+					      _xSrc, _ySrc,
+					      _xMask, _yMask,
+					      _xDst, _yDst,
+					      rects->width, rects->height))
+			goto next_rect;
+
+		_xSrc += src_off_x - _xDst;
+		_ySrc += src_off_y - _yDst;
+		_xMask += mask_off_x - _xDst;
+		_yMask += mask_off_y - _yDst;
+
+		nbox = REGION_NUM_RECTS(&region);
+		pbox = REGION_RECTS(&region);
+		while (nbox--) {
+			(*uxa_screen->info->composite) (pDstPix,
+							pbox->x1 + _xSrc,
+							pbox->y1 + _ySrc,
+							pbox->x1 + _xMask,
+							pbox->y1 + _yMask,
+							pbox->x1 + dst_off_x,
+							pbox->y1 + dst_off_y,
+							pbox->x2 - pbox->x1,
+							pbox->y2 - pbox->y1);
+			pbox++;
+		}
+
+next_rect:
+		REGION_UNINIT(screen, &region);
+
+		rects++;
+	} while (--nrect);
+	(*uxa_screen->info->done_composite) (pDstPix);
+
+	if (localSrc != pSrc)
+		FreePicture(localSrc, 0);
+
+	return 1;
 }
 
 static void
 uxa_glyphs_to_dst(CARD8 op,
 		  PicturePtr pSrc,
 		  PicturePtr pDst,
-		  uxa_glyph_buffer_t * buffer,
-		  INT16 xSrc, INT16 ySrc, INT16 xDst, INT16 yDst)
+		  const uxa_glyph_buffer_t * buffer,
+		  INT16 xSrc, INT16 ySrc,
+		  INT16 xDst, INT16 yDst)
 {
-	int i;
+	if (uxa_glyphs_try_driver_composite(op, pSrc, pDst, buffer,
+					    xSrc, ySrc,
+					    xDst, yDst) != 1) {
+		int i;
 
-	for (i = 0; i < buffer->count; i++) {
-		uxa_composite_rect_t *rect = &buffer->rects[i];
+		for (i = 0; i < buffer->count; i++) {
+			const uxa_composite_rect_t *rect = &buffer->rects[i];
 
-		CompositePicture(op,
-				 pSrc,
-				 buffer->source,
-				 pDst,
-				 xSrc + rect->xDst - xDst,
-				 ySrc + rect->yDst - yDst,
-				 rect->xSrc,
-				 rect->ySrc,
-				 rect->xDst,
-				 rect->yDst, rect->width, rect->height);
+			CompositePicture(op,
+					 pSrc, buffer->source, pDst,
+					 xSrc + rect->xDst - xDst,
+					 ySrc + rect->yDst - yDst,
+					 rect->xSrc, rect->ySrc,
+					 rect->xDst, rect->yDst,
+					 rect->width, rect->height);
+		}
 	}
+}
+
+static int
+uxa_glyphs_try_driver_add_to_mask(PicturePtr pDst,
+				  const uxa_glyph_buffer_t *buffer)
+{
+	uxa_screen_t *uxa_screen = uxa_get_screen(pDst->pDrawable->pScreen);
+	int src_off_x, src_off_y, dst_off_x, dst_off_y;
+	PixmapPtr pSrcPix, pDstPix;
+	const uxa_composite_rect_t *rects;
+	int nrect;
+
+	if (uxa_screen->info->check_composite &&
+	    !(*uxa_screen->info->check_composite) (PictOpAdd, buffer->source, NULL, pDst)) {
+		return -1;
+	}
+
+	pDstPix =
+	    uxa_get_offscreen_pixmap(pDst->pDrawable, &dst_off_x, &dst_off_y);
+
+	pSrcPix =
+	    uxa_get_offscreen_pixmap(buffer->source->pDrawable, &src_off_x, &src_off_y);
+	if(!pSrcPix)
+		return -1;
+
+	if (!(*uxa_screen->info->prepare_composite)
+	    (PictOpAdd, buffer->source, NULL, pDst, pSrcPix, NULL, pDstPix))
+		return -1;
+
+	rects = buffer->rects;
+	nrect = buffer->count;
+	do {
+		INT16 xDst = rects->xDst + pDst->pDrawable->x;
+		INT16 yDst = rects->yDst + pDst->pDrawable->y;
+		INT16 xSrc = rects->xSrc + buffer->source->pDrawable->x;
+		INT16 ySrc = rects->ySrc + buffer->source->pDrawable->y;
+
+		RegionRec region;
+		BoxPtr pbox;
+		int nbox;
+
+		if (!miComputeCompositeRegion(&region, buffer->source, NULL, pDst,
+					      xSrc, ySrc, 0, 0, xDst, yDst,
+					      rects->width, rects->height))
+			goto next_rect;
+
+		xSrc += src_off_x - xDst;
+		ySrc += src_off_y - yDst;
+
+		nbox = REGION_NUM_RECTS(&region);
+		pbox = REGION_RECTS(&region);
+
+		while (nbox--) {
+			(*uxa_screen->info->composite) (pDstPix,
+							pbox->x1 + xSrc,
+							pbox->y1 + ySrc,
+							0, 0,
+							pbox->x1 + dst_off_x,
+							pbox->y1 + dst_off_y,
+							pbox->x2 - pbox->x1,
+							pbox->y2 - pbox->y1);
+			pbox++;
+		}
+
+next_rect:
+		REGION_UNINIT(pDst->pDrawable->pScreen, &region);
+
+		rects++;
+	} while (--nrect);
+	(*uxa_screen->info->done_composite) (pDstPix);
+
+	return 1;
+}
+
+static void uxa_glyphs_to_mask(PicturePtr pDst, const uxa_glyph_buffer_t *buffer)
+{
+	if (uxa_glyphs_try_driver_add_to_mask(pDst, buffer) != 1) {
+		int i;
+
+		for (i = 0; i < buffer->count; i++) {
+			const uxa_composite_rect_t *r = &buffer->rects[i];
 
-	buffer->count = 0;
-	buffer->source = NULL;
+			uxa_check_composite(PictOpAdd, buffer->source, NULL, pDst,
+					    r->xSrc, r->ySrc,
+					    0, 0,
+					    r->xDst, r->yDst,
+					    r->width, r->height);
+		}
+	}
 }
 
 /* Cut and paste from render/glyph.c - probably should export it instead */
@@ -857,9 +1112,10 @@ uxa_glyphs(CARD8 op,
 	   INT16 xSrc,
 	   INT16 ySrc, int nlist, GlyphListPtr list, GlyphPtr * glyphs)
 {
+	ScreenPtr pScreen = pDst->pDrawable->pScreen;
+	uxa_screen_t *uxa_screen = uxa_get_screen(pScreen);
 	PixmapPtr pMaskPixmap = 0;
 	PicturePtr pMask;
-	ScreenPtr pScreen = pDst->pDrawable->pScreen;
 	int width = 0, height = 0;
 	int x, y;
 	int xDst = list->xOff, yDst = list->yOff;
@@ -870,32 +1126,38 @@ uxa_glyphs(CARD8 op,
 	CARD32 component_alpha;
 	uxa_glyph_buffer_t buffer;
 
-	if (!uxa_drawable_is_offscreen(pDst->pDrawable)) {
+	if (!uxa_screen->info->prepare_composite ||
+	    uxa_screen->swappedOut ||
+	    !uxa_drawable_is_offscreen(pDst->pDrawable) ||
+	    pDst->alphaMap || pSrc->alphaMap) {
 	    uxa_check_glyphs(op, pSrc, pDst, maskFormat, xSrc, ySrc, nlist, list, glyphs);
 	    return;
 	}
 
-	/* If we don't have a mask format but all the glyphs have the same format
-	 * and don't intersect, use the glyph format as mask format for the full
-	 * benefits of the glyph cache.
-	 */
+	ValidatePicture(pSrc);
+	ValidatePicture(pDst);
+
 	if (!maskFormat) {
-		Bool sameFormat = TRUE;
-		int i;
+		/* If we don't have a mask format but all the glyphs have the same format,
+		 * require ComponentAlpha and don't intersect, use the glyph format as mask
+		 * format for the full benefits of the glyph cache.
+		 */
+		if (NeedsComponent(list[0].format->format)) {
+			Bool sameFormat = TRUE;
+			int i;
 
-		maskFormat = list[0].format;
+			maskFormat = list[0].format;
 
-		for (i = 0; i < nlist; i++) {
-			if (maskFormat->format != list[i].format->format) {
-				sameFormat = FALSE;
-				break;
+			for (i = 0; i < nlist; i++) {
+				if (maskFormat->format != list[i].format->format) {
+					sameFormat = FALSE;
+					break;
+				}
 			}
-		}
 
-		if (!sameFormat || (maskFormat->depth != 1 &&
-				    uxa_glyphs_intersect(nlist, list,
-							 glyphs))) {
-			maskFormat = NULL;
+			if (!sameFormat ||
+			    uxa_glyphs_intersect(nlist, list, glyphs))
+				maskFormat = NULL;
 		}
 	}
 
@@ -942,11 +1204,14 @@ uxa_glyphs(CARD8 op,
 		FreeScratchGC(pGC);
 		x = -extents.x1;
 		y = -extents.y1;
+
+		ValidatePicture(pMask);
 	} else {
 		pMask = pDst;
 		x = 0;
 		y = 0;
 	}
+
 	buffer.count = 0;
 	buffer.source = NULL;
 	while (nlist--) {
@@ -966,6 +1231,9 @@ uxa_glyphs(CARD8 op,
 							  &buffer, xSrc, ySrc,
 							  xDst, yDst);
 
+				buffer.count = 0;
+				buffer.source = NULL;
+
 				uxa_buffer_glyph(pScreen, &buffer, glyph, x, y);
 			}
 
diff --git a/uxa/uxa-priv.h b/uxa/uxa-priv.h
index 8ff2c9c..bace679 100644
--- a/uxa/uxa-priv.h
+++ b/uxa/uxa-priv.h
@@ -437,6 +437,20 @@ uxa_triangles(CARD8 op, PicturePtr pSrc, PicturePtr pDst,
 PicturePtr
 uxa_acquire_solid(ScreenPtr screen, SourcePict *source);
 
+PicturePtr
+uxa_acquire_drawable(ScreenPtr pScreen,
+		     PicturePtr pSrc,
+		     INT16 x, INT16 y,
+		     CARD16 width, CARD16 height,
+		     INT16 * out_x, INT16 * out_y);
+
+PicturePtr
+uxa_acquire_pattern(ScreenPtr pScreen,
+		    PicturePtr pSrc,
+		    pixman_format_code_t format,
+		    INT16 x, INT16 y,
+		    CARD16 width, CARD16 height);
+
 Bool
 uxa_get_rgba_from_pixel(CARD32 pixel,
 			CARD16 * red,
diff --git a/uxa/uxa-render.c b/uxa/uxa-render.c
index 139d42e..c866b85 100644
--- a/uxa/uxa-render.c
+++ b/uxa/uxa-render.c
@@ -664,7 +664,7 @@ DONE:
 	return picture;
 }
 
-static PicturePtr
+PicturePtr
 uxa_acquire_pattern(ScreenPtr pScreen,
 		    PicturePtr pSrc,
 		    pixman_format_code_t format,
@@ -757,7 +757,7 @@ uxa_render_picture(ScreenPtr screen,
 	return picture;
 }
 
-static PicturePtr
+PicturePtr
 uxa_acquire_drawable(ScreenPtr pScreen,
 		     PicturePtr pSrc,
 		     INT16 x, INT16 y,
@@ -1044,122 +1044,6 @@ fallback:
 }
 
 static int
-uxa_try_driver_composite_rects(CARD8 op,
-			       PicturePtr pSrc,
-			       PicturePtr pDst,
-			       int nrect, uxa_composite_rect_t * rects)
-{
-	uxa_screen_t *uxa_screen = uxa_get_screen(pDst->pDrawable->pScreen);
-	int src_off_x, src_off_y, dst_off_x, dst_off_y;
-	PixmapPtr pSrcPix, pDstPix;
-
-	if (!uxa_screen->info->prepare_composite || uxa_screen->swappedOut)
-		return -1;
-
-	if (uxa_screen->info->check_composite &&
-	    !(*uxa_screen->info->check_composite) (op, pSrc, NULL, pDst)) {
-		return -1;
-	}
-
-	pDstPix =
-	    uxa_get_offscreen_pixmap(pDst->pDrawable, &dst_off_x, &dst_off_y);
-	if (!pDstPix)
-		return 0;
-
-	pSrcPix =
-	    uxa_get_offscreen_pixmap(pSrc->pDrawable, &src_off_x, &src_off_y);
-	if (!pSrcPix)
-		return 0;
-
-	if (!(*uxa_screen->info->prepare_composite)
-	    (op, pSrc, NULL, pDst, pSrcPix, NULL, pDstPix))
-		return -1;
-
-	while (nrect--) {
-		INT16 xDst = rects->xDst + pDst->pDrawable->x;
-		INT16 yDst = rects->yDst + pDst->pDrawable->y;
-		INT16 xSrc = rects->xSrc + pSrc->pDrawable->x;
-		INT16 ySrc = rects->ySrc + pSrc->pDrawable->y;
-
-		RegionRec region;
-		BoxPtr pbox;
-		int nbox;
-
-		if (!miComputeCompositeRegion(&region, pSrc, NULL, pDst,
-					      xSrc, ySrc, 0, 0, xDst, yDst,
-					      rects->width, rects->height))
-			goto next_rect;
-
-		xSrc = xSrc + src_off_x - xDst;
-		ySrc = ySrc + src_off_y - yDst;
-
-		nbox = REGION_NUM_RECTS(&region);
-		pbox = REGION_RECTS(&region);
-
-		while (nbox--) {
-			(*uxa_screen->info->composite) (pDstPix,
-							pbox->x1 + xSrc,
-							pbox->y1 + ySrc,
-							0, 0,
-							pbox->x1 + dst_off_x,
-							pbox->y1 + dst_off_y,
-							pbox->x2 - pbox->x1,
-							pbox->y2 - pbox->y1);
-			pbox++;
-		}
-
-next_rect:
-		REGION_UNINIT(pDst->pDrawable->pScreen, &region);
-
-		rects++;
-	}
-	(*uxa_screen->info->done_composite) (pDstPix);
-
-	return 1;
-}
-
-/**
- * Copy a number of rectangles from source to destination in a single
- * operation. This is specialized for building a glyph mask: we don'y
- * have a mask argument because we don't need it for that, and we
- * don't have he special-case fallbacks found in uxa_composite() - if the
- * driver can support it, we use the driver functionality, otherwise we
- * fallback straight to software.
- */
-void
-uxa_composite_rects(CARD8 op,
-		    PicturePtr pSrc,
-		    PicturePtr pDst, int nrect, uxa_composite_rect_t * rects)
-{
-	int n;
-	uxa_composite_rect_t *r;
-
-    /************************************************************/
-
-	ValidatePicture(pSrc);
-	ValidatePicture(pDst);
-
-	if (uxa_try_driver_composite_rects(op, pSrc, pDst, nrect, rects) != 1) {
-		uxa_print_composite_fallback("uxa_composite_rects",
-					     op, pSrc, NULL, pDst);
-
-		n = nrect;
-		r = rects;
-		while (n--) {
-			uxa_check_composite(op, pSrc, NULL, pDst,
-					    r->xSrc, r->ySrc,
-					    0, 0,
-					    r->xDst, r->yDst,
-					    r->width, r->height);
-			r++;
-		}
-	}
-
-    /************************************************************/
-
-}
-
-static int
 uxa_try_driver_composite(CARD8 op,
 			 PicturePtr pSrc,
 			 PicturePtr pMask,
commit e3ece83f577d3664962edeec6ab5bdc41c5d77cf
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed May 19 16:52:11 2010 +0100

    i915: compute normalized texcoords using a scale factor.
    
    500 -> 580kglyphs/s on i945.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/i915_render.c b/src/i915_render.c
index fb5efb4..f9988b9 100644
--- a/src/i915_render.c
+++ b/src/i915_render.c
@@ -288,8 +288,8 @@ static Bool i915_texture_setup(PicturePtr picture, PixmapPtr pixmap, int unit)
 	pitch = intel_get_pixmap_pitch(pixmap);
 	w = picture->pDrawable->width;
 	h = picture->pDrawable->height;
-	intel->scale_units[unit][0] = pixmap->drawable.width;
-	intel->scale_units[unit][1] = pixmap->drawable.height;
+	intel->scale_units[unit][0] = 1. / pixmap->drawable.width;
+	intel->scale_units[unit][1] = 1. / pixmap->drawable.height;
 
 	for (i = 0; i < sizeof(i915_tex_formats) / sizeof(i915_tex_formats[0]);
 	     i++) {
@@ -410,18 +410,18 @@ i915_emit_composite_primitive_identity_source(PixmapPtr dest,
 
 	OUT_VERTEX(dst_x + w);
 	OUT_VERTEX(dst_y + h);
-	OUT_VERTEX((src_x + w) / intel->scale_units[0][0]);
-	OUT_VERTEX((src_y + h) / intel->scale_units[0][1]);
+	OUT_VERTEX((src_x + w) * intel->scale_units[0][0]);
+	OUT_VERTEX((src_y + h) * intel->scale_units[0][1]);
 
 	OUT_VERTEX(dst_x);
 	OUT_VERTEX(dst_y + h);
-	OUT_VERTEX(src_x / intel->scale_units[0][0]);
-	OUT_VERTEX((src_y + h) / intel->scale_units[0][1]);
+	OUT_VERTEX(src_x * intel->scale_units[0][0]);
+	OUT_VERTEX((src_y + h) * intel->scale_units[0][1]);
 
 	OUT_VERTEX(dst_x);
 	OUT_VERTEX(dst_y);
-	OUT_VERTEX(src_x / intel->scale_units[0][0]);
-	OUT_VERTEX(src_y / intel->scale_units[0][1]);
+	OUT_VERTEX(src_x * intel->scale_units[0][0]);
+	OUT_VERTEX(src_y * intel->scale_units[0][1]);
 }
 
 static void
@@ -461,18 +461,18 @@ i915_emit_composite_primitive_affine_source(PixmapPtr dest,
 
 	OUT_VERTEX(x + w);
 	OUT_VERTEX(y + h);
-	OUT_VERTEX(src_x[2] / intel->scale_units[0][0]);
-	OUT_VERTEX(src_y[2] / intel->scale_units[0][1]);
+	OUT_VERTEX(src_x[2] * intel->scale_units[0][0]);
+	OUT_VERTEX(src_y[2] * intel->scale_units[0][1]);
 
 	OUT_VERTEX(x);
 	OUT_VERTEX(y + h);
-	OUT_VERTEX(src_x[1] / intel->scale_units[0][0]);
-	OUT_VERTEX(src_y[1] / intel->scale_units[0][1]);
+	OUT_VERTEX(src_x[1] * intel->scale_units[0][0]);
+	OUT_VERTEX(src_y[1] * intel->scale_units[0][1]);
 
 	OUT_VERTEX(x);
 	OUT_VERTEX(y);
-	OUT_VERTEX(src_x[0] / intel->scale_units[0][0]);
-	OUT_VERTEX(src_y[0] / intel->scale_units[0][1]);
+	OUT_VERTEX(src_x[0] * intel->scale_units[0][0]);
+	OUT_VERTEX(src_y[0] * intel->scale_units[0][1]);
 }
 
 static void
@@ -493,18 +493,18 @@ i915_emit_composite_primitive_constant_identity_mask(PixmapPtr dest,
 
 	OUT_VERTEX(x + w);
 	OUT_VERTEX(y + h);
-	OUT_VERTEX((mx + w) / intel->scale_units[0][0]);
-	OUT_VERTEX((my + h) / intel->scale_units[0][1]);
+	OUT_VERTEX((mx + w) * intel->scale_units[0][0]);
+	OUT_VERTEX((my + h) * intel->scale_units[0][1]);
 
 	OUT_VERTEX(x);
 	OUT_VERTEX(y + h);
-	OUT_VERTEX(mx / intel->scale_units[0][0]);
-	OUT_VERTEX((my + h) / intel->scale_units[0][1]);
+	OUT_VERTEX(mx * intel->scale_units[0][0]);
+	OUT_VERTEX((my + h) * intel->scale_units[0][1]);
 
 	OUT_VERTEX(x);
 	OUT_VERTEX(y);
-	OUT_VERTEX(mx / intel->scale_units[0][0]);
-	OUT_VERTEX(my / intel->scale_units[0][1]);
+	OUT_VERTEX(mx * intel->scale_units[0][0]);
+	OUT_VERTEX(my * intel->scale_units[0][1]);
 }
 
 static void
@@ -527,24 +527,24 @@ i915_emit_composite_primitive_identity_source_mask(PixmapPtr dest,
 
 	OUT_VERTEX(x + w);
 	OUT_VERTEX(y + h);
-	OUT_VERTEX((sx + w) / intel->scale_units[0][0]);
-	OUT_VERTEX((sy + h) / intel->scale_units[0][1]);
-	OUT_VERTEX((mx + w) / intel->scale_units[1][0]);
-	OUT_VERTEX((my + h) / intel->scale_units[1][1]);
+	OUT_VERTEX((sx + w) * intel->scale_units[0][0]);
+	OUT_VERTEX((sy + h) * intel->scale_units[0][1]);
+	OUT_VERTEX((mx + w) * intel->scale_units[1][0]);
+	OUT_VERTEX((my + h) * intel->scale_units[1][1]);
 
 	OUT_VERTEX(x);
 	OUT_VERTEX(y + h);
-	OUT_VERTEX(sx / intel->scale_units[0][0]);
-	OUT_VERTEX((sy + h) / intel->scale_units[0][1]);
-	OUT_VERTEX(mx / intel->scale_units[1][0]);
-	OUT_VERTEX((my + h) / intel->scale_units[1][1]);
+	OUT_VERTEX(sx * intel->scale_units[0][0]);
+	OUT_VERTEX((sy + h) * intel->scale_units[0][1]);
+	OUT_VERTEX(mx * intel->scale_units[1][0]);
+	OUT_VERTEX((my + h) * intel->scale_units[1][1]);
 
 	OUT_VERTEX(x);
 	OUT_VERTEX(y);
-	OUT_VERTEX(sx / intel->scale_units[0][0]);
-	OUT_VERTEX(sy / intel->scale_units[0][1]);
-	OUT_VERTEX(mx / intel->scale_units[1][0]);
-	OUT_VERTEX(my / intel->scale_units[1][1]);
+	OUT_VERTEX(sx * intel->scale_units[0][0]);
+	OUT_VERTEX(sy * intel->scale_units[0][1]);
+	OUT_VERTEX(mx * intel->scale_units[1][0]);
+	OUT_VERTEX(my * intel->scale_units[1][1]);
 }
 
 static void
@@ -687,16 +687,16 @@ i915_emit_composite_primitive(PixmapPtr dest,
 	OUT_VERTEX(intel->dst_coord_adjust + dstX + w);
 	OUT_VERTEX(intel->dst_coord_adjust + dstY + h);
 	if (! intel->render_source_is_solid) {
-	    OUT_VERTEX(src_x[2] / intel->scale_units[src_unit][0]);
-	    OUT_VERTEX(src_y[2] / intel->scale_units[src_unit][1]);
+	    OUT_VERTEX(src_x[2] * intel->scale_units[src_unit][0]);
+	    OUT_VERTEX(src_y[2] * intel->scale_units[src_unit][1]);
 	    if (!is_affine_src) {
 		OUT_VERTEX(0.0);
 		OUT_VERTEX(src_w[2]);
 	    }
 	}
 	if (intel->render_mask && ! intel->render_mask_is_solid) {
-		OUT_VERTEX(mask_x[2] / intel->scale_units[mask_unit][0]);
-		OUT_VERTEX(mask_y[2] / intel->scale_units[mask_unit][1]);
+		OUT_VERTEX(mask_x[2] * intel->scale_units[mask_unit][0]);
+		OUT_VERTEX(mask_y[2] * intel->scale_units[mask_unit][1]);
 		if (!is_affine_mask) {
 			OUT_VERTEX(0.0);
 			OUT_VERTEX(mask_w[2]);
@@ -706,16 +706,16 @@ i915_emit_composite_primitive(PixmapPtr dest,
 	OUT_VERTEX(intel->dst_coord_adjust + dstX);
 	OUT_VERTEX(intel->dst_coord_adjust + dstY + h);
 	if (! intel->render_source_is_solid) {
-	    OUT_VERTEX(src_x[1] / intel->scale_units[src_unit][0]);
-	    OUT_VERTEX(src_y[1] / intel->scale_units[src_unit][1]);
+	    OUT_VERTEX(src_x[1] * intel->scale_units[src_unit][0]);
+	    OUT_VERTEX(src_y[1] * intel->scale_units[src_unit][1]);
 	    if (!is_affine_src) {
 		OUT_VERTEX(0.0);
 		OUT_VERTEX(src_w[1]);
 	    }
 	}
 	if (intel->render_mask && ! intel->render_mask_is_solid) {
-		OUT_VERTEX(mask_x[1] / intel->scale_units[mask_unit][0]);
-		OUT_VERTEX(mask_y[1] / intel->scale_units[mask_unit][1]);
+		OUT_VERTEX(mask_x[1] * intel->scale_units[mask_unit][0]);
+		OUT_VERTEX(mask_y[1] * intel->scale_units[mask_unit][1]);
 		if (!is_affine_mask) {
 			OUT_VERTEX(0.0);
 			OUT_VERTEX(mask_w[1]);
@@ -725,16 +725,16 @@ i915_emit_composite_primitive(PixmapPtr dest,
 	OUT_VERTEX(intel->dst_coord_adjust + dstX);
 	OUT_VERTEX(intel->dst_coord_adjust + dstY);
 	if (! intel->render_source_is_solid) {
-	    OUT_VERTEX(src_x[0] / intel->scale_units[src_unit][0]);
-	    OUT_VERTEX(src_y[0] / intel->scale_units[src_unit][1]);
+	    OUT_VERTEX(src_x[0] * intel->scale_units[src_unit][0]);
+	    OUT_VERTEX(src_y[0] * intel->scale_units[src_unit][1]);
 	    if (!is_affine_src) {
 		OUT_VERTEX(0.0);
 		OUT_VERTEX(src_w[0]);
 	    }
 	}
 	if (intel->render_mask && ! intel->render_mask_is_solid) {
-		OUT_VERTEX(mask_x[0] / intel->scale_units[mask_unit][0]);
-		OUT_VERTEX(mask_y[0] / intel->scale_units[mask_unit][1]);
+		OUT_VERTEX(mask_x[0] * intel->scale_units[mask_unit][0]);
+		OUT_VERTEX(mask_y[0] * intel->scale_units[mask_unit][1]);
 		if (!is_affine_mask) {
 			OUT_VERTEX(0.0);
 			OUT_VERTEX(mask_w[0]);
commit c2abf8d659b8b161a4f9df100b614ee3c8f8e458
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed May 19 16:18:40 2010 +0100

    uxa: translate the region in line for composites
    
    When compositing, each box has to be converted into a rect anyway, so the
    advantage of using REGION_TRANSLATE is lost; add the destination offset
    inline in the composite loops instead.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/uxa/uxa-render.c b/uxa/uxa-render.c
index 68e3651..139d42e 100644
--- a/uxa/uxa-render.c
+++ b/uxa/uxa-render.c
@@ -391,8 +391,6 @@ uxa_try_driver_solid_fill(PicturePtr pSrc,
 				      width, height))
 		return 1;
 
-	REGION_TRANSLATE(pScreen, &region, dst_off_x, dst_off_y);
-
 	if (pSrcPix) {
 		if (! uxa_get_color_for_pixmap (pSrcPix, pSrc->format, pDst->format, &pixel)) {
 			REGION_UNINIT(pDst->pDrawable->pScreen, &region);
@@ -432,6 +430,8 @@ uxa_try_driver_solid_fill(PicturePtr pSrc,
 		return -1;
 	}
 
+	REGION_TRANSLATE(pScreen, &region, dst_off_x, dst_off_y);
+
 	nbox = REGION_NUM_RECTS(&region);
 	pbox = REGION_RECTS(&region);
 
@@ -1090,21 +1090,19 @@ uxa_try_driver_composite_rects(CARD8 op,
 					      rects->width, rects->height))
 			goto next_rect;
 
-		REGION_TRANSLATE(pScreen, &region, dst_off_x, dst_off_y);
+		xSrc = xSrc + src_off_x - xDst;
+		ySrc = ySrc + src_off_y - yDst;
 
 		nbox = REGION_NUM_RECTS(&region);
 		pbox = REGION_RECTS(&region);
 
-		xSrc = xSrc + src_off_x - xDst - dst_off_x;
-		ySrc = ySrc + src_off_y - yDst - dst_off_y;
-
 		while (nbox--) {
 			(*uxa_screen->info->composite) (pDstPix,
 							pbox->x1 + xSrc,
 							pbox->y1 + ySrc,
 							0, 0,
-							pbox->x1,
-							pbox->y1,
+							pbox->x1 + dst_off_x,
+							pbox->y1 + dst_off_y,
 							pbox->x2 - pbox->x1,
 							pbox->y2 - pbox->y1);
 			pbox++;
@@ -1258,8 +1256,6 @@ uxa_try_driver_composite(CARD8 op,
 		}
 	}
 
-	REGION_TRANSLATE(pScreen, &region, dst_off_x, dst_off_y);
-
 	if (!(*uxa_screen->info->prepare_composite)
 	    (op, localSrc, localMask, pDst, pSrcPix, pMaskPix, pDstPix)) {
 		REGION_UNINIT(pDst->pDrawable->pScreen, &region);
@@ -1272,25 +1268,24 @@ uxa_try_driver_composite(CARD8 op,
 		return -1;
 	}
 
-	nbox = REGION_NUM_RECTS(&region);
-	pbox = REGION_RECTS(&region);
-
 	if (pMask) {
-		xMask = xMask + mask_off_x - xDst - dst_off_x;
-		yMask = yMask + mask_off_y - yDst - dst_off_y;
+		xMask = xMask + mask_off_x - xDst;
+		yMask = yMask + mask_off_y - yDst;
 	}
 
-	xSrc = xSrc + src_off_x - xDst - dst_off_x;
-	ySrc = ySrc + src_off_y - yDst - dst_off_y;
+	xSrc = xSrc + src_off_x - xDst;
+	ySrc = ySrc + src_off_y - yDst;
 
+	nbox = REGION_NUM_RECTS(&region);
+	pbox = REGION_RECTS(&region);
 	while (nbox--) {
 		(*uxa_screen->info->composite) (pDstPix,
 						pbox->x1 + xSrc,
 						pbox->y1 + ySrc,
 						pbox->x1 + xMask,
 						pbox->y1 + yMask,
-						pbox->x1,
-						pbox->y1,
+						pbox->x1 + dst_off_x,
+						pbox->y1 + dst_off_y,
 						pbox->x2 - pbox->x1,
 						pbox->y2 - pbox->y1);
 		pbox++;
commit 2adf823b80b7b1f6df1bf3422a1219e93321a8fb
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed May 19 15:02:58 2010 +0100

    i915: Add special case primitive emitters for glyphs.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/i915_render.c b/src/i915_render.c
index e74ca13..fb5efb4 100644
--- a/src/i915_render.c
+++ b/src/i915_render.c
@@ -476,10 +476,83 @@ i915_emit_composite_primitive_affine_source(PixmapPtr dest,
 }
 
 static void
+i915_emit_composite_primitive_constant_identity_mask(PixmapPtr dest,
+						     int srcX, int srcY,
+						     int maskX, int maskY,
+						     int dstX, int dstY,
+						     int w, int h)
+{
+	ScrnInfoPtr scrn = xf86Screens[dest->drawable.pScreen->myNum];
+	intel_screen_private *intel = intel_get_screen_private(scrn);
+	float x, y, mx, my;
+
+	x = dstX + intel->dst_coord_adjust;
+	y = dstY + intel->dst_coord_adjust;
+	mx = maskX + intel->mask_coord_adjust;
+	my = maskY + intel->mask_coord_adjust;
+
+	OUT_VERTEX(x + w);
+	OUT_VERTEX(y + h);
+	OUT_VERTEX((mx + w) / intel->scale_units[0][0]);
+	OUT_VERTEX((my + h) / intel->scale_units[0][1]);
+
+	OUT_VERTEX(x);
+	OUT_VERTEX(y + h);
+	OUT_VERTEX(mx / intel->scale_units[0][0]);
+	OUT_VERTEX((my + h) / intel->scale_units[0][1]);
+
+	OUT_VERTEX(x);
+	OUT_VERTEX(y);
+	OUT_VERTEX(mx / intel->scale_units[0][0]);
+	OUT_VERTEX(my / intel->scale_units[0][1]);
+}
+
+static void
+i915_emit_composite_primitive_identity_source_mask(PixmapPtr dest,
+						   int srcX, int srcY,
+						   int maskX, int maskY,
+						   int dstX, int dstY,
+						   int w, int h)
+{
+	ScrnInfoPtr scrn = xf86Screens[dest->drawable.pScreen->myNum];
+	intel_screen_private *intel = intel_get_screen_private(scrn);
+	float x, y, sx, sy, mx, my;
+
+	x = dstX + intel->dst_coord_adjust;
+	y = dstY + intel->dst_coord_adjust;
+	sx = srcX + intel->src_coord_adjust;
+	sy = srcY + intel->src_coord_adjust;
+	mx = maskX + intel->mask_coord_adjust;
+	my = maskY + intel->mask_coord_adjust;
+
+	OUT_VERTEX(x + w);
+	OUT_VERTEX(y + h);
+	OUT_VERTEX((sx + w) / intel->scale_units[0][0]);
+	OUT_VERTEX((sy + h) / intel->scale_units[0][1]);
+	OUT_VERTEX((mx + w) / intel->scale_units[1][0]);
+	OUT_VERTEX((my + h) / intel->scale_units[1][1]);
+
+	OUT_VERTEX(x);
+	OUT_VERTEX(y + h);
+	OUT_VERTEX(sx / intel->scale_units[0][0]);
+	OUT_VERTEX((sy + h) / intel->scale_units[0][1]);
+	OUT_VERTEX(mx / intel->scale_units[1][0]);
+	OUT_VERTEX((my + h) / intel->scale_units[1][1]);
+
+	OUT_VERTEX(x);
+	OUT_VERTEX(y);
+	OUT_VERTEX(sx / intel->scale_units[0][0]);
+	OUT_VERTEX(sy / intel->scale_units[0][1]);
+	OUT_VERTEX(mx / intel->scale_units[1][0]);
+	OUT_VERTEX(my / intel->scale_units[1][1]);
+}
+
+static void
 i915_emit_composite_primitive(PixmapPtr dest,
 			      int srcX, int srcY,
 			      int maskX, int maskY,
-			      int dstX, int dstY, int w, int h)
+			      int dstX, int dstY,
+			      int w, int h)
 {
 	ScrnInfoPtr scrn = xf86Screens[dest->drawable.pScreen->myNum];
 	intel_screen_private *intel = intel_get_screen_private(scrn);
@@ -788,6 +861,7 @@ i915_prepare_composite(int op, PicturePtr source_picture,
 
 	intel->needs_render_state_emit = TRUE;
 
+	intel->prim_emit = i915_emit_composite_primitive;
 	if (!mask) {
 		if (intel->render_source_is_solid)
 			intel->prim_emit = i915_emit_composite_primitive_constant;
@@ -795,10 +869,14 @@ i915_prepare_composite(int op, PicturePtr source_picture,
 			intel->prim_emit = i915_emit_composite_primitive_identity_source;
 		else if (i830_transform_is_affine(intel->transform[0]))
 			intel->prim_emit = i915_emit_composite_primitive_affine_source;
-		else
-			intel->prim_emit = i915_emit_composite_primitive;
-	} else
-		intel->prim_emit = i915_emit_composite_primitive;
+	} else {
+		if (intel->transform[0] == NULL) {
+			if (intel->render_source_is_solid)
+				intel->prim_emit = i915_emit_composite_primitive_constant_identity_mask;
+			else if (intel->transform[1] == NULL)
+				intel->prim_emit = i915_emit_composite_primitive_identity_source_mask;
+		}
+	}
 
 	if (floats_per_vertex != intel->floats_per_vertex) {
 		intel->floats_per_vertex = floats_per_vertex;
commit f64ab9e0d97dd9c654b4ae1924e62ef6813d9bb0
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed May 19 14:50:12 2010 +0100

    i915: Move vertices into a vertex buffer object.
    
    In theory this should allow us to pack far more operations into a single
    batch buffer, and reduce our overheads.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/i830.h b/src/i830.h
index 3597878..f8aa824 100644
--- a/src/i830.h
+++ b/src/i830.h
@@ -343,6 +343,7 @@ typedef struct intel_screen_private {
 	Bool render_source_is_solid;
 	Bool render_mask_is_solid;
 	Bool needs_render_state_emit;
+	Bool needs_render_vertex_emit;
 
 	/* i830 render accel state */
 	uint32_t render_dest_format;
@@ -359,12 +360,17 @@ typedef struct intel_screen_private {
 	} i915_render_state;
 
 	uint32_t prim_offset;
-	uint32_t prim_count;
 	void (*prim_emit)(PixmapPtr dest,
 			  int srcX, int srcY,
 			  int maskX, int maskY,
 			  int dstX, int dstY,
 			  int w, int h);
+	int floats_per_vertex;
+	uint32_t vertex_count;
+	uint32_t vertex_index;
+	uint32_t vertex_used;
+	float vertex_ptr[4*1024];
+	dri_bo *vertex_bo;
 
 	/* 965 render acceleration state */
 	struct gen4_render_state *gen4_render_state;
diff --git a/src/i830_batchbuffer.c b/src/i830_batchbuffer.c
index 80539b3..69961c6 100644
--- a/src/i830_batchbuffer.c
+++ b/src/i830_batchbuffer.c
@@ -39,6 +39,26 @@
 
 #define DUMP_BATCHBUFFERS NULL /* "/tmp/i915-batchbuffers.dump" */
 
+static void intel_end_vertex(intel_screen_private *intel)
+{
+	if (intel->vertex_bo) {
+		if (intel->vertex_used)
+			dri_bo_subdata(intel->vertex_bo, 0, intel->vertex_used*4, intel->vertex_ptr);
+
+		dri_bo_unreference(intel->vertex_bo);
+		intel->vertex_bo = NULL;
+	}
+}
+
+void intel_next_vertex(intel_screen_private *intel)
+{
+	intel_end_vertex(intel);
+
+	intel->vertex_bo =
+		dri_bo_alloc(intel->bufmgr, "vertex", sizeof (intel->vertex_ptr), 4096);
+	intel->vertex_used = 0;
+}
+
 static void intel_next_batch(ScrnInfoPtr scrn)
 {
 	intel_screen_private *intel = intel_get_screen_private(scrn);
@@ -158,6 +178,7 @@ void intel_batch_submit(ScrnInfoPtr scrn)
 
 	if (intel->vertex_flush)
 		intel->vertex_flush(intel);
+	intel_end_vertex(intel);
 
 	/* Mark the end of the batchbuffer. */
 	OUT_BATCH(MI_BATCH_BUFFER_END);
diff --git a/src/i830_batchbuffer.h b/src/i830_batchbuffer.h
index 874916d..b5c729a 100644
--- a/src/i830_batchbuffer.h
+++ b/src/i830_batchbuffer.h
@@ -44,6 +44,11 @@ static inline int intel_batch_space(intel_screen_private *intel)
 	return (intel->batch_bo->size - BATCH_RESERVED) - (4*intel->batch_used);
 }
 
+static inline int intel_vertex_space(intel_screen_private *intel)
+{
+	return intel->vertex_bo ? intel->vertex_bo->size - (4*intel->vertex_used) : 0;
+}
+
 static inline void
 intel_batch_require_space(ScrnInfoPtr scrn, intel_screen_private *intel, GLuint sz)
 {
@@ -203,4 +208,11 @@ do {									\
 	intel->batch_emitting = 0;					\
 } while (0)
 
+void intel_next_vertex(intel_screen_private *intel);
+static inline void intel_vertex_emit(intel_screen_private *intel, float v)
+{
+	intel->vertex_ptr[intel->vertex_used++] = v;
+}
+#define OUT_VERTEX(v) intel_vertex_emit(intel, v)
+
 #endif /* _INTEL_BATCHBUFFER_H */
diff --git a/src/i830_uxa.c b/src/i830_uxa.c
index f68ec70..a79dde1 100644
--- a/src/i830_uxa.c
+++ b/src/i830_uxa.c
@@ -1055,8 +1055,11 @@ Bool i830_uxa_init(ScreenPtr screen)
 	intel->uxa_driver->uxa_major = 1;
 	intel->uxa_driver->uxa_minor = 0;
 
+	intel->needs_render_vertex_emit = TRUE;
 	intel->prim_offset = 0;
-	intel->prim_count = 0;
+	intel->vertex_count = 0;
+	intel->floats_per_vertex = 0;
+	intel->vertex_bo = NULL;
 
 	/* Solid fill */
 	intel->uxa_driver->check_solid = i830_uxa_check_solid;
diff --git a/src/i915_reg.h b/src/i915_reg.h
index a61bc40..746a413 100644
--- a/src/i915_reg.h
+++ b/src/i915_reg.h
@@ -32,19 +32,20 @@
 
 #define CMD_3D (0x3<<29)
 
-#define PRIM3D_INLINE		(CMD_3D | (0x1f<<24))
-#define PRIM3D_TRILIST		(0x0<<18)
-#define PRIM3D_TRISTRIP 	(0x1<<18)
-#define PRIM3D_TRISTRIP_RVRSE	(0x2<<18)
-#define PRIM3D_TRIFAN		(0x3<<18)
-#define PRIM3D_POLY		(0x4<<18)
-#define PRIM3D_LINELIST 	(0x5<<18)
-#define PRIM3D_LINESTRIP	(0x6<<18)
-#define PRIM3D_RECTLIST 	(0x7<<18)
-#define PRIM3D_POINTLIST	(0x8<<18)
-#define PRIM3D_DIB		(0x9<<18)
-#define PRIM3D_CLEAR_RECT	(0xa<<18)
-#define PRIM3D_ZONE_INIT	(0xd<<18)
+#define PRIM3D			(CMD_3D | (0x1f<<24))
+#define PRIM3D_INDIRECT_SEQUENTIAL      ((1<<23) | (0<<17))
+#define PRIM3D_TRILIST		(PRIM3D | (0x0<<18))
+#define PRIM3D_TRISTRIP 	(PRIM3D | (0x1<<18))
+#define PRIM3D_TRISTRIP_RVRSE	(PRIM3D | (0x2<<18))
+#define PRIM3D_TRIFAN		(PRIM3D | (0x3<<18))
+#define PRIM3D_POLY		(PRIM3D | (0x4<<18))
+#define PRIM3D_LINELIST 	(PRIM3D | (0x5<<18))
+#define PRIM3D_LINESTRIP	(PRIM3D | (0x6<<18))
+#define PRIM3D_RECTLIST 	(PRIM3D | (0x7<<18))
+#define PRIM3D_POINTLIST	(PRIM3D | (0x8<<18))
+#define PRIM3D_DIB		(PRIM3D | (0x9<<18))
+#define PRIM3D_CLEAR_RECT	(PRIM3D | (0xa<<18))
+#define PRIM3D_ZONE_INIT	(PRIM3D | (0xd<<18))
 #define PRIM3D_MASK		(0x1f<<18)
 
 /* p137 */
diff --git a/src/i915_render.c b/src/i915_render.c
index de68c5c..e74ca13 100644
--- a/src/i915_render.c
+++ b/src/i915_render.c
@@ -368,116 +368,6 @@ static Bool i915_texture_setup(PicturePtr picture, PixmapPtr pixmap, int unit)
 	return TRUE;
 }
 
-Bool
-i915_prepare_composite(int op, PicturePtr source_picture,
-		       PicturePtr mask_picture, PicturePtr dest_picture,
-		       PixmapPtr source, PixmapPtr mask, PixmapPtr dest)
-{
-	ScrnInfoPtr scrn = xf86Screens[dest_picture->pDrawable->pScreen->myNum];
-	intel_screen_private *intel = intel_get_screen_private(scrn);
-	drm_intel_bo *bo_table[] = {
-		NULL,		/* batch_bo */
-		i830_get_pixmap_bo(dest),
-		source ? i830_get_pixmap_bo(source) : NULL,
-		mask ? i830_get_pixmap_bo(mask) : NULL,
-	};
-	int tex_unit = 0;
-
-	intel->render_source_picture = source_picture;
-	intel->render_source = source;
-	intel->render_mask_picture = mask_picture;
-	intel->render_mask = mask;
-	intel->render_dest_picture = dest_picture;
-	intel->render_dest = dest;
-
-	intel->render_source_is_solid = FALSE;
-	if (source_picture->pSourcePict) {
-		SourcePict *source = source_picture->pSourcePict;
-		if (source->type == SourcePictTypeSolidFill) {
-			intel->render_source_is_solid = TRUE;
-			intel->render_source_solid = source->solidFill.color;
-		}
-	}
-	if (!intel->render_source_is_solid && !intel_check_pitch_3d(source))
-		return FALSE;
-
-	intel->render_mask_is_solid = FALSE;
-	if (mask) {
-		if (mask_picture->pSourcePict) {
-			SourcePict *source = mask_picture->pSourcePict;
-			if (source->type == SourcePictTypeSolidFill) {
-				intel->render_mask_is_solid = TRUE;
-				intel->render_mask_solid = source->solidFill.color;
-			}
-		}
-		if (!intel->render_mask_is_solid && !intel_check_pitch_3d(mask))
-			return FALSE;
-	}
-
-	if (!intel_check_pitch_3d(dest))
-		return FALSE;
-
-	if (!i915_get_dest_format(dest_picture,
-				  &intel->i915_render_state.dst_format))
-		return FALSE;
-
-	if (!i830_get_aperture_space(scrn, bo_table, ARRAY_SIZE(bo_table)))
-		return FALSE;
-	intel->dst_coord_adjust = 0;
-	intel->src_coord_adjust = 0;
-	intel->mask_coord_adjust = 0;
-
-	intel->transform[0] = NULL;
-	intel->scale_units[0][0] = -1;
-	intel->scale_units[0][1] = -1;
-	intel->transform[1] = NULL;
-	intel->scale_units[1][0] = -1;
-	intel->scale_units[1][1] = -1;
-
-	if (! intel->render_source_is_solid) {
-		if (!i915_texture_setup(source_picture, source, tex_unit++)) {
-			intel_debug_fallback(scrn, "fail to setup src texture\n");
-			return FALSE;
-		}
-
-		if (source_picture->filter == PictFilterNearest) {
-#if PIXEL_CENTRE_SAMPLE
-			intel->src_coord_adjust = 0.375;
-#else
-			intel->dst_coord_adjust = -0.125;
-#endif
-		}
-	}
-
-	if (mask != NULL) {
-		if (! intel->render_mask_is_solid) {
-			if (!i915_texture_setup(mask_picture, mask, tex_unit++)) {
-				intel_debug_fallback(scrn,
-						"fail to setup mask texture\n");
-				return FALSE;
-			}
-
-			if (mask_picture->filter == PictFilterNearest) {
-#if PIXEL_CENTRE_SAMPLE
-			    intel->mask_coord_adjust = 0.375;
-#else
-			    intel->dst_coord_adjust = -0.125;
-#endif
-			}
-		}
-	}
-
-	intel->i915_render_state.op = op;
-
-	if((source && i830_uxa_pixmap_is_dirty(source)) ||
-	   (mask && i830_uxa_pixmap_is_dirty(mask)))
-		intel_batch_emit_flush(scrn);
-
-	intel->needs_render_state_emit = TRUE;
-
-	return TRUE;
-}
-
 static void
 i915_emit_composite_primitive_constant(PixmapPtr dest,
 				       int srcX, int srcY,
@@ -489,23 +379,17 @@ i915_emit_composite_primitive_constant(PixmapPtr dest,
 	intel_screen_private *intel = intel_get_screen_private(scrn);
 	float x, y;
 
-	if (intel->prim_offset == 0) {
-		intel->prim_offset = intel->batch_used;
-		OUT_BATCH(PRIM3D_INLINE | PRIM3D_RECTLIST);
-	}
-	intel->prim_count += 6;
-
 	x = dstX + intel->dst_coord_adjust;
 	y = dstY + intel->dst_coord_adjust;
 
-	OUT_BATCH_F(x + w);
-	OUT_BATCH_F(y + h);
+	OUT_VERTEX(x + w);
+	OUT_VERTEX(y + h);
 
-	OUT_BATCH_F(x);
-	OUT_BATCH_F(y + h);
+	OUT_VERTEX(x);
+	OUT_VERTEX(y + h);
 
-	OUT_BATCH_F(x);
-	OUT_BATCH_F(y);
+	OUT_VERTEX(x);
+	OUT_VERTEX(y);
 }
 
 static void
@@ -519,31 +403,25 @@ i915_emit_composite_primitive_identity_source(PixmapPtr dest,
 	intel_screen_private *intel = intel_get_screen_private(scrn);
 	float dst_x, dst_y, src_x, src_y;
 
-	if (intel->prim_offset == 0) {
-		intel->prim_offset = intel->batch_used;
-		OUT_BATCH(PRIM3D_INLINE | PRIM3D_RECTLIST);
-	}
-	intel->prim_count += 12;
-
 	dst_x = dstX + intel->dst_coord_adjust;
 	dst_y = dstY + intel->dst_coord_adjust;
 	src_x = srcX + intel->src_coord_adjust;
 	src_y = srcY + intel->src_coord_adjust;
 
-	OUT_BATCH_F(dst_x + w);
-	OUT_BATCH_F(dst_y + h);
-	OUT_BATCH_F((src_x + w) / intel->scale_units[0][0]);
-	OUT_BATCH_F((src_y + h) / intel->scale_units[0][1]);
+	OUT_VERTEX(dst_x + w);
+	OUT_VERTEX(dst_y + h);
+	OUT_VERTEX((src_x + w) / intel->scale_units[0][0]);
+	OUT_VERTEX((src_y + h) / intel->scale_units[0][1]);
 
-	OUT_BATCH_F(dst_x);
-	OUT_BATCH_F(dst_y + h);
-	OUT_BATCH_F(src_x / intel->scale_units[0][0]);
-	OUT_BATCH_F((src_y + h) / intel->scale_units[0][1]);
+	OUT_VERTEX(dst_x);
+	OUT_VERTEX(dst_y + h);
+	OUT_VERTEX(src_x / intel->scale_units[0][0]);
+	OUT_VERTEX((src_y + h) / intel->scale_units[0][1]);
 
-	OUT_BATCH_F(dst_x);
-	OUT_BATCH_F(dst_y);
-	OUT_BATCH_F(src_x / intel->scale_units[0][0]);
-	OUT_BATCH_F(src_y / intel->scale_units[0][1]);
+	OUT_VERTEX(dst_x);
+	OUT_VERTEX(dst_y);
+	OUT_VERTEX(src_x / intel->scale_units[0][0]);
+	OUT_VERTEX(src_y / intel->scale_units[0][1]);
 }
 
 static void
@@ -578,29 +456,23 @@ i915_emit_composite_primitive_affine_source(PixmapPtr dest,
 					      &src_y[2]))
 		return;
 
-	if (intel->prim_offset == 0) {
-		intel->prim_offset = intel->batch_used;
-		OUT_BATCH(PRIM3D_INLINE | PRIM3D_RECTLIST);
-	}
-	intel->prim_count += 12;
-
 	x = dstX + intel->dst_coord_adjust;
 	y = dstY + intel->dst_coord_adjust;
 
-	OUT_BATCH_F(x + w);
-	OUT_BATCH_F(y + h);
-	OUT_BATCH_F(src_x[2] / intel->scale_units[0][0]);
-	OUT_BATCH_F(src_y[2] / intel->scale_units[0][1]);
+	OUT_VERTEX(x + w);
+	OUT_VERTEX(y + h);
+	OUT_VERTEX(src_x[2] / intel->scale_units[0][0]);
+	OUT_VERTEX(src_y[2] / intel->scale_units[0][1]);
 
-	OUT_BATCH_F(x);
-	OUT_BATCH_F(y + h);
-	OUT_BATCH_F(src_x[1] / intel->scale_units[0][0]);
-	OUT_BATCH_F(src_y[1] / intel->scale_units[0][1]);
+	OUT_VERTEX(x);
+	OUT_VERTEX(y + h);
+	OUT_VERTEX(src_x[1] / intel->scale_units[0][0]);
+	OUT_VERTEX(src_y[1] / intel->scale_units[0][1]);
 
-	OUT_BATCH_F(x);
-	OUT_BATCH_F(y);
-	OUT_BATCH_F(src_x[0] / intel->scale_units[0][0]);
-	OUT_BATCH_F(src_y[0] / intel->scale_units[0][1]);
+	OUT_VERTEX(x);
+	OUT_VERTEX(y);
+	OUT_VERTEX(src_x[0] / intel->scale_units[0][0]);
+	OUT_VERTEX(src_y[0] / intel->scale_units[0][1]);
 }
 
 static void
@@ -739,64 +611,201 @@ i915_emit_composite_primitive(PixmapPtr dest,
 
 	num_floats = 3 * per_vertex;
 
-	intel->prim_count += num_floats;
-
-	OUT_BATCH_F(intel->dst_coord_adjust + dstX + w);
-	OUT_BATCH_F(intel->dst_coord_adjust + dstY + h);
+	OUT_VERTEX(intel->dst_coord_adjust + dstX + w);
+	OUT_VERTEX(intel->dst_coord_adjust + dstY + h);
 	if (! intel->render_source_is_solid) {
-	    OUT_BATCH_F(src_x[2] / intel->scale_units[src_unit][0]);
-	    OUT_BATCH_F(src_y[2] / intel->scale_units[src_unit][1]);
+	    OUT_VERTEX(src_x[2] / intel->scale_units[src_unit][0]);
+	    OUT_VERTEX(src_y[2] / intel->scale_units[src_unit][1]);
 	    if (!is_affine_src) {
-		OUT_BATCH_F(0.0);
-		OUT_BATCH_F(src_w[2]);
+		OUT_VERTEX(0.0);
+		OUT_VERTEX(src_w[2]);
 	    }
 	}
 	if (intel->render_mask && ! intel->render_mask_is_solid) {
-		OUT_BATCH_F(mask_x[2] / intel->scale_units[mask_unit][0]);
-		OUT_BATCH_F(mask_y[2] / intel->scale_units[mask_unit][1]);
+		OUT_VERTEX(mask_x[2] / intel->scale_units[mask_unit][0]);
+		OUT_VERTEX(mask_y[2] / intel->scale_units[mask_unit][1]);
 		if (!is_affine_mask) {
-			OUT_BATCH_F(0.0);
-			OUT_BATCH_F(mask_w[2]);
+			OUT_VERTEX(0.0);
+			OUT_VERTEX(mask_w[2]);
 		}
 	}
 
-	OUT_BATCH_F(intel->dst_coord_adjust + dstX);
-	OUT_BATCH_F(intel->dst_coord_adjust + dstY + h);
+	OUT_VERTEX(intel->dst_coord_adjust + dstX);
+	OUT_VERTEX(intel->dst_coord_adjust + dstY + h);
 	if (! intel->render_source_is_solid) {
-	    OUT_BATCH_F(src_x[1] / intel->scale_units[src_unit][0]);
-	    OUT_BATCH_F(src_y[1] / intel->scale_units[src_unit][1]);
+	    OUT_VERTEX(src_x[1] / intel->scale_units[src_unit][0]);
+	    OUT_VERTEX(src_y[1] / intel->scale_units[src_unit][1]);
 	    if (!is_affine_src) {
-		OUT_BATCH_F(0.0);
-		OUT_BATCH_F(src_w[1]);
+		OUT_VERTEX(0.0);
+		OUT_VERTEX(src_w[1]);
 	    }
 	}
 	if (intel->render_mask && ! intel->render_mask_is_solid) {
-		OUT_BATCH_F(mask_x[1] / intel->scale_units[mask_unit][0]);
-		OUT_BATCH_F(mask_y[1] / intel->scale_units[mask_unit][1]);
+		OUT_VERTEX(mask_x[1] / intel->scale_units[mask_unit][0]);
+		OUT_VERTEX(mask_y[1] / intel->scale_units[mask_unit][1]);
 		if (!is_affine_mask) {
-			OUT_BATCH_F(0.0);
-			OUT_BATCH_F(mask_w[1]);
+			OUT_VERTEX(0.0);
+			OUT_VERTEX(mask_w[1]);
 		}
 	}
 
-	OUT_BATCH_F(intel->dst_coord_adjust + dstX);
-	OUT_BATCH_F(intel->dst_coord_adjust + dstY);
+	OUT_VERTEX(intel->dst_coord_adjust + dstX);
+	OUT_VERTEX(intel->dst_coord_adjust + dstY);
 	if (! intel->render_source_is_solid) {
-	    OUT_BATCH_F(src_x[0] / intel->scale_units[src_unit][0]);
-	    OUT_BATCH_F(src_y[0] / intel->scale_units[src_unit][1]);
+	    OUT_VERTEX(src_x[0] / intel->scale_units[src_unit][0]);
+	    OUT_VERTEX(src_y[0] / intel->scale_units[src_unit][1]);
 	    if (!is_affine_src) {
-		OUT_BATCH_F(0.0);
-		OUT_BATCH_F(src_w[0]);
+		OUT_VERTEX(0.0);
+		OUT_VERTEX(src_w[0]);
 	    }
 	}
 	if (intel->render_mask && ! intel->render_mask_is_solid) {
-		OUT_BATCH_F(mask_x[0] / intel->scale_units[mask_unit][0]);
-		OUT_BATCH_F(mask_y[0] / intel->scale_units[mask_unit][1]);
+		OUT_VERTEX(mask_x[0] / intel->scale_units[mask_unit][0]);
+		OUT_VERTEX(mask_y[0] / intel->scale_units[mask_unit][1]);
 		if (!is_affine_mask) {
-			OUT_BATCH_F(0.0);
-			OUT_BATCH_F(mask_w[0]);
+			OUT_VERTEX(0.0);
+			OUT_VERTEX(mask_w[0]);
+		}
+	}
+}
+
+Bool
+i915_prepare_composite(int op, PicturePtr source_picture,
+		       PicturePtr mask_picture, PicturePtr dest_picture,
+		       PixmapPtr source, PixmapPtr mask, PixmapPtr dest)
+{
+	ScrnInfoPtr scrn = xf86Screens[dest_picture->pDrawable->pScreen->myNum];
+	intel_screen_private *intel = intel_get_screen_private(scrn);
+	drm_intel_bo *bo_table[] = {
+		NULL,		/* batch_bo */
+		i830_get_pixmap_bo(dest),
+		source ? i830_get_pixmap_bo(source) : NULL,
+		mask ? i830_get_pixmap_bo(mask) : NULL,
+	};
+	int tex_unit = 0;
+	int floats_per_vertex;
+
+	intel->render_source_picture = source_picture;
+	intel->render_source = source;
+	intel->render_mask_picture = mask_picture;
+	intel->render_mask = mask;
+	intel->render_dest_picture = dest_picture;
+	intel->render_dest = dest;
+
+	intel->render_source_is_solid = FALSE;
+	if (source_picture->pSourcePict) {
+		SourcePict *source = source_picture->pSourcePict;
+		if (source->type == SourcePictTypeSolidFill) {
+			intel->render_source_is_solid = TRUE;
+			intel->render_source_solid = source->solidFill.color;
 		}
 	}
+	if (!intel->render_source_is_solid && !intel_check_pitch_3d(source))
+		return FALSE;
+
+	intel->render_mask_is_solid = FALSE;
+	if (mask) {
+		if (mask_picture->pSourcePict) {
+			SourcePict *source = mask_picture->pSourcePict;
+			if (source->type == SourcePictTypeSolidFill) {
+				intel->render_mask_is_solid = TRUE;
+				intel->render_mask_solid = source->solidFill.color;
+			}
+		}
+		if (!intel->render_mask_is_solid && !intel_check_pitch_3d(mask))
+			return FALSE;
+	}
+
+	if (!intel_check_pitch_3d(dest))
+		return FALSE;
+
+	if (!i915_get_dest_format(dest_picture,
+				  &intel->i915_render_state.dst_format))
+		return FALSE;
+
+	if (!i830_get_aperture_space(scrn, bo_table, ARRAY_SIZE(bo_table)))
+		return FALSE;
+	intel->dst_coord_adjust = 0;
+	intel->src_coord_adjust = 0;
+	intel->mask_coord_adjust = 0;
+
+	intel->transform[0] = NULL;
+	intel->scale_units[0][0] = -1;
+	intel->scale_units[0][1] = -1;
+	intel->transform[1] = NULL;
+	intel->scale_units[1][0] = -1;
+	intel->scale_units[1][1] = -1;
+
+	floats_per_vertex = 2;		/* dest x/y */
+	if (! intel->render_source_is_solid) {
+		if (!i915_texture_setup(source_picture, source, tex_unit++)) {
+			intel_debug_fallback(scrn, "fail to setup src texture\n");
+			return FALSE;
+		}
+
+		if (i830_transform_is_affine(source_picture->transform))
+			floats_per_vertex += 2;	/* src x/y */
+		else
+			floats_per_vertex += 4;	/* src x/y/z/w */
+
+		if (source_picture->filter == PictFilterNearest) {
+#if PIXEL_CENTRE_SAMPLE
+			intel->src_coord_adjust = 0.375;
+#else
+			intel->dst_coord_adjust = -0.125;
+#endif
+		}
+	}
+
+	if (mask != NULL) {
+		if (! intel->render_mask_is_solid) {
+			if (!i915_texture_setup(mask_picture, mask, tex_unit++)) {
+				intel_debug_fallback(scrn,
+						"fail to setup mask texture\n");
+				return FALSE;
+			}
+
+			if (i830_transform_is_affine(mask_picture->transform))
+				floats_per_vertex += 2;	/* mask x/y */
+			else
+				floats_per_vertex += 4;	/* mask x/y/z/w */
+
+			if (mask_picture->filter == PictFilterNearest) {
+#if PIXEL_CENTRE_SAMPLE
+			    intel->mask_coord_adjust = 0.375;
+#else
+			    intel->dst_coord_adjust = -0.125;
+#endif
+			}
+		}
+	}
+
+	intel->i915_render_state.op = op;
+
+	if((source && i830_uxa_pixmap_is_dirty(source)) ||
+	   (mask && i830_uxa_pixmap_is_dirty(mask)))
+		intel_batch_emit_flush(scrn);
+
+	intel->needs_render_state_emit = TRUE;
+
+	if (!mask) {
+		if (intel->render_source_is_solid)
+			intel->prim_emit = i915_emit_composite_primitive_constant;
+		else if (intel->transform[0] == NULL)
+			intel->prim_emit = i915_emit_composite_primitive_identity_source;
+		else if (i830_transform_is_affine(intel->transform[0]))
+			intel->prim_emit = i915_emit_composite_primitive_affine_source;
+		else
+			intel->prim_emit = i915_emit_composite_primitive;
+	} else
+		intel->prim_emit = i915_emit_composite_primitive;
+
+	if (floats_per_vertex != intel->floats_per_vertex) {
+		intel->floats_per_vertex = floats_per_vertex;
+		intel->needs_render_vertex_emit = TRUE;
+	}
+
+	return TRUE;
 }
 
 static void i915_emit_composite_setup(ScrnInfoPtr scrn)
@@ -1020,20 +1029,6 @@ static void i915_emit_composite_setup(ScrnInfoPtr scrn)
 
 	    FS_END();
 	}
-
-	intel->prim_offset = 0;
-	intel->prim_count = 0;
-	if (!mask) {
-		if (is_solid_src)
-			intel->prim_emit = i915_emit_composite_primitive_constant;
-		else if (intel->transform[0] == NULL)
-			intel->prim_emit = i915_emit_composite_primitive_identity_source;
-		else if (i830_transform_is_affine(intel->transform[0]))
-			intel->prim_emit = i915_emit_composite_primitive_affine_source;
-		else
-			intel->prim_emit = i915_emit_composite_primitive;
-	} else
-		intel->prim_emit = i915_emit_composite_primitive;
 }
 
 void
@@ -1049,10 +1044,41 @@ i915_composite(PixmapPtr dest, int srcX, int srcY, int maskX, int maskY,
 	if (intel->needs_render_state_emit)
 		i915_emit_composite_setup(scrn);
 
+	if (intel_vertex_space(intel) < 3*4*intel->floats_per_vertex)
+		intel->needs_render_vertex_emit = TRUE;
+
+	if (intel->needs_render_vertex_emit) {
+		i915_vertex_flush(intel);
+
+		if (intel_vertex_space(intel) < 256) {
+			intel_next_vertex(intel);
+
+			OUT_BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 |
+				  I1_LOAD_S(0) | I1_LOAD_S(1) | 1);
+			OUT_RELOC(intel->vertex_bo, I915_GEM_DOMAIN_VERTEX, 0, 0);
+			OUT_BATCH((intel->floats_per_vertex << S1_VERTEX_WIDTH_SHIFT) |
+				  (intel->floats_per_vertex << S1_VERTEX_PITCH_SHIFT));
+			intel->vertex_index = 0;
+		} else {
+			OUT_BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 |
+				  I1_LOAD_S(1) | 0);
+			OUT_BATCH((intel->floats_per_vertex << S1_VERTEX_WIDTH_SHIFT) |
+				  (intel->floats_per_vertex << S1_VERTEX_PITCH_SHIFT));
+
+			intel->vertex_index =
+				(intel->vertex_used + intel->floats_per_vertex - 1) /  intel->floats_per_vertex;
+			intel->vertex_used = intel->vertex_index * intel->floats_per_vertex;
+		}
+
+		intel->needs_render_vertex_emit = FALSE;
+	}
+
 	if (intel->prim_offset == 0) {
 		intel->prim_offset = intel->batch_used;
-		OUT_BATCH(PRIM3D_INLINE | PRIM3D_RECTLIST);
+		OUT_BATCH(PRIM3D_RECTLIST | PRIM3D_INDIRECT_SEQUENTIAL);
+		OUT_BATCH(intel->vertex_index);
 	}
+	intel->vertex_count += 3;
 
 	intel->prim_emit(dest,
 			 srcX, srcY,
@@ -1066,10 +1092,14 @@ i915_composite(PixmapPtr dest, int srcX, int srcY, int maskX, int maskY,
 void
 i915_vertex_flush(intel_screen_private *intel)
 {
-	if (intel->prim_offset) {
-		intel->batch_ptr[intel->prim_offset] |= intel->prim_count - 1;
-		intel->prim_offset = 0;
-	}
+	if (intel->prim_offset == 0)
+		return;
+
+	intel->batch_ptr[intel->prim_offset] |= intel->vertex_count;
+	intel->prim_offset = 0;
+
+	intel->vertex_index += intel->vertex_count;
+	intel->vertex_count = 0;
 }
 
 void
diff --git a/src/i915_video.c b/src/i915_video.c
index 2ccd502..bbac610 100644
--- a/src/i915_video.c
+++ b/src/i915_video.c
@@ -397,7 +397,7 @@ I915DisplayVideoTextured(ScrnInfoPtr scrn,
 		dxo = dstRegion->extents.x1;
 		dyo = dstRegion->extents.y1;
 
-		OUT_BATCH(PRIM3D_INLINE | PRIM3D_RECTLIST | (12 * nbox_this_time - 1));
+		OUT_BATCH(PRIM3D_RECTLIST | (12 * nbox_this_time - 1));
 		while (nbox_this_time--) {
 			int box_x1 = pbox->x1;
 			int box_y1 = pbox->y1;
commit 2b050f330f78d02e7f476e55be29d760271ac61c
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed May 19 10:57:46 2010 +0100

    Use pwrite to upload the batch buffer
    
    By using pwrite() instead of dri_bo_map() we can write to the batch buffer
    through the GTT and not be forced to map it back into the CPU domain and
    out again, eliminating a double clflush.
    
    Measuring x11perf text performance on PineView:
    
    Before:
    16000000 trep @   0.0020 msec (511000.0/sec): Char in 80-char aa line (Charter 10)
    16000000 trep @   0.0021 msec (480000.0/sec): Char in 80-char rgb line (Charter 10)
    After:
    16000000 trep @   0.0019 msec (532000.0/sec): Char in 80-char aa line (Charter 10)
    16000000 trep @   0.0020 msec (496000.0/sec): Char in 80-char rgb line (Charter 10)
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/i830.h b/src/i830.h
index 0b638f2..3597878 100644
--- a/src/i830.h
+++ b/src/i830.h
@@ -252,7 +252,7 @@ typedef struct intel_screen_private {
 
 	dri_bufmgr *bufmgr;
 
-	uint8_t *batch_ptr;
+	uint32_t batch_ptr[4096];
 	/** Byte offset in batch_ptr for the next dword to be emitted. */
 	unsigned int batch_used;
 	/** Position in batch_ptr at the start of the current BEGIN_BATCH */
diff --git a/src/i830_batchbuffer.c b/src/i830_batchbuffer.c
index 0fe81d0..80539b3 100644
--- a/src/i830_batchbuffer.c
+++ b/src/i830_batchbuffer.c
@@ -42,7 +42,6 @@
 static void intel_next_batch(ScrnInfoPtr scrn)
 {
 	intel_screen_private *intel = intel_get_screen_private(scrn);
-	int ret;
 
 	/* The 865 has issues with larger-than-page-sized batch buffers. */
 	if (IS_I865G(intel))
@@ -52,12 +51,7 @@ static void intel_next_batch(ScrnInfoPtr scrn)
 		intel->batch_bo =
 		    dri_bo_alloc(intel->bufmgr, "batch", 4096 * 4, 4096);
 
-	ret = dri_bo_map(intel->batch_bo, 1);
-	if (ret != 0)
-		FatalError("Failed to map batchbuffer: %s\n", strerror(-ret));
-
 	intel->batch_used = 0;
-	intel->batch_ptr = intel->batch_bo->virtual;
 
 	/* We don't know when another client has executed, so we have
 	 * to reinitialize our 3D state per batch.
@@ -80,9 +74,6 @@ void intel_batch_teardown(ScrnInfoPtr scrn)
 	intel_screen_private *intel = intel_get_screen_private(scrn);
 
 	if (intel->batch_ptr != NULL) {
-		dri_bo_unmap(intel->batch_bo);
-		intel->batch_ptr = NULL;
-
 		dri_bo_unreference(intel->batch_bo);
 		intel->batch_bo = NULL;
 
@@ -168,31 +159,24 @@ void intel_batch_submit(ScrnInfoPtr scrn)
 	if (intel->vertex_flush)
 		intel->vertex_flush(intel);
 
-	/* Emit a padding dword if we aren't going to be quad-word aligned. */
-	if ((intel->batch_used & 4) == 0) {
-		*(uint32_t *) (intel->batch_ptr + intel->batch_used) = MI_NOOP;
-		intel->batch_used += 4;
-	}
-
 	/* Mark the end of the batchbuffer. */
-	*(uint32_t *) (intel->batch_ptr + intel->batch_used) =
-	    MI_BATCH_BUFFER_END;
-	intel->batch_used += 4;
+	OUT_BATCH(MI_BATCH_BUFFER_END);
+	/* Emit a padding dword if we aren't going to be quad-word aligned. */
+	if (intel->batch_used & 1)
+		OUT_BATCH(MI_NOOP);
 
 	if (DUMP_BATCHBUFFERS) {
 	    FILE *file = fopen(DUMP_BATCHBUFFERS, "a");
 	    if (file) {
-		fwrite (intel->batch_ptr, intel->batch_used, 1, file);
+		fwrite (intel->batch_ptr, intel->batch_used*4, 1, file);
 		fclose(file);
 	    }
 	}
 
-	dri_bo_unmap(intel->batch_bo);
-	intel->batch_ptr = NULL;
-
-	ret =
-	    dri_bo_exec(intel->batch_bo, intel->batch_used, NULL, 0,
-			0xffffffff);
+	ret = dri_bo_subdata(intel->batch_bo, 0, intel->batch_used*4, intel->batch_ptr);
+	if (ret == 0)
+		ret = dri_bo_exec(intel->batch_bo, intel->batch_used*4,
+				  NULL, 0, 0xffffffff);
 	if (ret != 0) {
 		static int once;
 
@@ -269,6 +253,6 @@ void intel_batch_wait_last(ScrnInfoPtr scrn)
 	/* Map it CPU write, which guarantees it's done.  This is a completely
 	 * non performance path, so we don't need anything better.
 	 */
-	drm_intel_bo_map(intel->last_batch_bo, TRUE);
-	drm_intel_bo_unmap(intel->last_batch_bo);
+	drm_intel_gem_bo_map_gtt(intel->last_batch_bo);
+	drm_intel_gem_bo_unmap_gtt(intel->last_batch_bo);
 }
diff --git a/src/i830_batchbuffer.h b/src/i830_batchbuffer.h
index c912a1d..874916d 100644
--- a/src/i830_batchbuffer.h
+++ b/src/i830_batchbuffer.h
@@ -41,7 +41,7 @@ void intel_batch_wait_last(ScrnInfoPtr scrn);
 
 static inline int intel_batch_space(intel_screen_private *intel)
 {
-	return (intel->batch_bo->size - BATCH_RESERVED) - (intel->batch_used);
+	return (intel->batch_bo->size - BATCH_RESERVED) - (4*intel->batch_used);
 }
 
 static inline void
@@ -60,7 +60,7 @@ static inline void intel_batch_start_atomic(ScrnInfoPtr scrn, unsigned int sz)
 	intel_batch_require_space(scrn, intel, sz * 4);
 
 	intel->in_batch_atomic = TRUE;
-	intel->batch_atomic_limit = intel->batch_used + sz * 4;
+	intel->batch_atomic_limit = intel->batch_used + sz;
 }
 
 static inline void intel_batch_end_atomic(ScrnInfoPtr scrn)
@@ -74,19 +74,19 @@ static inline void intel_batch_end_atomic(ScrnInfoPtr scrn)
 
 static inline void intel_batch_emit_dword(intel_screen_private *intel, uint32_t dword)
 {
-	*(uint32_t *) (intel->batch_ptr + intel->batch_used) = dword;
-	intel->batch_used += 4;
+	intel->batch_ptr[intel->batch_used++] = dword;
 }
 
 static inline void intel_batch_align(intel_screen_private *intel, uint32_t align)
 {
 	uint32_t delta;
 
+	align /= 4;
 	assert(align);
 
 	if ((delta = intel->batch_used & (align - 1))) {
 		delta = align - delta;
-		memset (intel->batch_ptr + intel->batch_used, 0, delta);
+		memset (intel->batch_ptr + intel->batch_used, 0, 4*delta);
 		intel->batch_used += delta;
 	}
 }
@@ -99,11 +99,11 @@ intel_batch_emit_reloc(intel_screen_private *intel,
 {
 	if (needs_fence)
 		drm_intel_bo_emit_reloc_fence(intel->batch_bo,
-					      intel->batch_used,
+					      intel->batch_used * 4,
 					      bo, delta,
 					      read_domains, write_domains);
 	else
-		drm_intel_bo_emit_reloc(intel->batch_bo, intel->batch_used,
+		drm_intel_bo_emit_reloc(intel->batch_bo, intel->batch_used * 4,
 					bo, delta,
 					read_domains, write_domains);
 
@@ -175,7 +175,7 @@ do {									\
 			   "ADVANCE_BATCH\n", __FUNCTION__);		\
 	assert(!intel->in_batch_atomic);				\
 	intel_batch_require_space(scrn, intel, (n) * 4);		\
-	intel->batch_emitting = (n) * 4;				\
+	intel->batch_emitting = (n);					\
 	intel->batch_emit_start = intel->batch_used;			\
 } while (0)
 
diff --git a/src/i915_3d.h b/src/i915_3d.h
index ab4fbb5..043a6d5 100644
--- a/src/i915_3d.h
+++ b/src/i915_3d.h
@@ -423,8 +423,7 @@ do {									\
 
 #define FS_BEGIN()							\
 do {									\
-    _shader_offset = intel->batch_used;					\
-   intel->batch_used += 4;						\
+    _shader_offset = intel->batch_used++;				\
 } while (0)
 
 #define FS_OUT(_shaderop)						\
@@ -436,7 +435,7 @@ do {									\
 
 #define FS_END()							\
 do {									\
-    *(uint32_t *)(intel->batch_ptr + _shader_offset) =			\
-	(_3DSTATE_PIXEL_SHADER_PROGRAM |				\
-	 ((intel->batch_used - _shader_offset) / 4 - 2));		\
+    intel->batch_ptr[_shader_offset] =					\
+	_3DSTATE_PIXEL_SHADER_PROGRAM |					\
+	(intel->batch_used - _shader_offset - 2);			\
 } while (0);
diff --git a/src/i915_render.c b/src/i915_render.c
index 87f1ca4..de68c5c 100644
--- a/src/i915_render.c
+++ b/src/i915_render.c
@@ -1067,7 +1067,7 @@ void
 i915_vertex_flush(intel_screen_private *intel)
 {
 	if (intel->prim_offset) {
-		*(uint32_t *) (intel->batch_ptr + intel->prim_offset) |= intel->prim_count - 1;
+		intel->batch_ptr[intel->prim_offset] |= intel->prim_count - 1;
 		intel->prim_offset = 0;
 	}
 }
commit dcef703a7cdcf360f12312a338361697acffc3e9
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed May 19 09:30:02 2010 +0100

    Kill paranoid assertions on every write into the batchbuffer.
    
    On my PineView box these represent ~5% overhead on x11perf text:
    
    Before:
    16000000 trep @   0.0020 msec (495000.0/sec): Char in 80-char aa line (Charter 10)
    12000000 trep @   0.0022 msec (461000.0/sec): Char in 80-char rgb line (Charter 10)
    
    After:
    16000000 trep @   0.0020 msec (511000.0/sec): Char in 80-char aa line (Charter 10)
    16000000 trep @   0.0021 msec (480000.0/sec): Char in 80-char rgb line (Charter 10)
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/i830_3d.c b/src/i830_3d.c
index e83cb3f..a92da05 100644
--- a/src/i830_3d.c
+++ b/src/i830_3d.c
@@ -38,7 +38,7 @@ void I830EmitInvarientState(ScrnInfoPtr scrn)
 {
 	intel_screen_private *intel = intel_get_screen_private(scrn);
 
-	ATOMIC_BATCH(58);
+	assert(intel->in_batch_atomic);
 
 	OUT_BATCH(_3DSTATE_MAP_CUBE | MAP_UNIT(0));
 	OUT_BATCH(_3DSTATE_MAP_CUBE | MAP_UNIT(1));
@@ -222,6 +222,4 @@ void I830EmitInvarientState(ScrnInfoPtr scrn)
 		  AA_LINE_ECAAR_WIDTH_1_0 |
 		  AA_LINE_REGION_WIDTH_ENABLE |
 		  AA_LINE_REGION_WIDTH_1_0 | AA_LINE_DISABLE);
-
-	ADVANCE_BATCH();
 }
diff --git a/src/i830_batchbuffer.h b/src/i830_batchbuffer.h
index 2793bc0..c912a1d 100644
--- a/src/i830_batchbuffer.h
+++ b/src/i830_batchbuffer.h
@@ -74,8 +74,6 @@ static inline void intel_batch_end_atomic(ScrnInfoPtr scrn)
 
 static inline void intel_batch_emit_dword(intel_screen_private *intel, uint32_t dword)
 {
-	assert(intel->batch_ptr != NULL);
-	assert(intel->batch_emitting);
 	*(uint32_t *) (intel->batch_ptr + intel->batch_used) = dword;
 	intel->batch_used += 4;
 }
@@ -84,7 +82,6 @@ static inline void intel_batch_align(intel_screen_private *intel, uint32_t align
 {
 	uint32_t delta;
 
-	assert(intel->batch_ptr != NULL);
 	assert(align);
 
 	if ((delta = intel->batch_used & (align - 1))) {
@@ -100,9 +97,6 @@ intel_batch_emit_reloc(intel_screen_private *intel,
 		       uint32_t read_domains,
 		       uint32_t write_domains, uint32_t delta, int needs_fence)
 {
-	assert(intel_batch_space(intel) >= 4);
-	*(uint32_t *) (intel->batch_ptr + intel->batch_used) =
-	    bo->offset + delta;
 	if (needs_fence)
 		drm_intel_bo_emit_reloc_fence(intel->batch_bo,
 					      intel->batch_used,
@@ -113,7 +107,7 @@ intel_batch_emit_reloc(intel_screen_private *intel,
 					bo, delta,
 					read_domains, write_domains);
 
-	intel->batch_used += 4;
+	intel_batch_emit_dword(intel, bo->offset + delta);
 }
 
 static inline void
@@ -144,9 +138,6 @@ intel_batch_emit_reloc_pixmap(intel_screen_private *intel, PixmapPtr pixmap,
 {
 	struct intel_pixmap *priv = i830_get_pixmap_intel(pixmap);
 
-	assert(intel->batch_ptr != NULL);
-	assert(intel_batch_space(intel) >= 4);
-
 	intel_batch_mark_pixmap_domains(intel, priv, read_domains, write_domain);
 
 	intel_batch_emit_reloc(intel, priv->bo,
@@ -188,18 +179,6 @@ do {									\
 	intel->batch_emit_start = intel->batch_used;			\
 } while (0)
 
-/* special-case variant for when we have preallocated space */
-#define ATOMIC_BATCH(n)							\
-do {									\
-	if (intel->batch_emitting != 0)					\
-		FatalError("%s: ATOMIC_BATCH called without closing "	\
-			   "ADVANCE_BATCH\n", __FUNCTION__);		\
-	assert(intel->in_batch_atomic);					\
-	assert(intel->batch_used + (n) * 4 <= intel->batch_atomic_limit); \
-	intel->batch_emitting = (n) * 4;				\
-	intel->batch_emit_start = intel->batch_used;			\
-} while (0)
-
 #define ADVANCE_BATCH() do {						\
 	if (intel->batch_emitting == 0)					\
 		FatalError("%s: ADVANCE_BATCH called with no matching "	\
diff --git a/src/i830_render.c b/src/i830_render.c
index d3bc18e..b0413c5 100644
--- a/src/i830_render.c
+++ b/src/i830_render.c
@@ -302,7 +302,8 @@ static void i830_texture_setup(PicturePtr picture, PixmapPtr pixmap, int unit)
 
 	format = i8xx_get_card_format(intel, picture);
 
-	ATOMIC_BATCH(10);
+	assert(intel->in_batch_atomic);
+
 	OUT_BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_2 |
 		  LOAD_TEXTURE_MAP(unit) | 4);
 	OUT_RELOC_PIXMAP(pixmap, I915_GEM_DOMAIN_SAMPLER, 0, 0);
@@ -336,7 +337,6 @@ static void i830_texture_setup(PicturePtr picture, PixmapPtr pixmap, int unit)
 		  ENABLE_TEX_STREAM_COORD_SET |
 		  TEX_STREAM_COORD_SET(unit) |
 		  ENABLE_TEX_STREAM_MAP_IDX | TEX_STREAM_MAP_IDX(unit));
-	ADVANCE_BATCH();
 }
 
 Bool
@@ -549,7 +549,7 @@ static void i830_emit_composite_state(ScrnInfoPtr scrn)
 	IntelEmitInvarientState(scrn);
 	intel->last_3d = LAST_3D_RENDER;
 
-	ATOMIC_BATCH(21);
+	assert(intel->in_batch_atomic);
 
 	if (i830_pixmap_tiled(intel->render_dest)) {
 		tiling_bits = BUF_3D_TILED_SURFACE;
@@ -615,8 +615,6 @@ static void i830_emit_composite_state(ScrnInfoPtr scrn)
 	}
 	OUT_BATCH(_3DSTATE_VERTEX_FORMAT_2_CMD | texcoordfmt);
 
-	ADVANCE_BATCH();
-
 	i830_texture_setup(intel->render_source_picture, intel->render_source, 0);
 	if (intel->render_mask) {
 		i830_texture_setup(intel->render_mask_picture,
@@ -756,8 +754,6 @@ i830_emit_composite_primitive(PixmapPtr dest,
 
 	num_floats = 3 * per_vertex;
 
-	ATOMIC_BATCH(1 + num_floats);
-
 	OUT_BATCH(PRIM3D_INLINE | PRIM3D_RECTLIST | (num_floats - 1));
 	OUT_BATCH_F(dstX + w);
 	OUT_BATCH_F(dstY + h);
@@ -803,8 +799,6 @@ i830_emit_composite_primitive(PixmapPtr dest,
 			OUT_BATCH_F(mask_w[0]);
 		}
 	}
-
-	ADVANCE_BATCH();
 }
 
 /**
diff --git a/src/i915_3d.c b/src/i915_3d.c
index 517c685..7f07b4b 100644
--- a/src/i915_3d.c
+++ b/src/i915_3d.c
@@ -38,7 +38,7 @@ void I915EmitInvarientState(ScrnInfoPtr scrn)
 {
 	intel_screen_private *intel = intel_get_screen_private(scrn);
 
-	ATOMIC_BATCH(24);
+	assert(intel->in_batch_atomic);
 
 	OUT_BATCH(_3DSTATE_AA_CMD |
 		  AA_LINE_ECAAR_WIDTH_ENABLE |
@@ -104,6 +104,4 @@ void I915EmitInvarientState(ScrnInfoPtr scrn)
 	OUT_BATCH(_3DSTATE_BACKFACE_STENCIL_OPS | BFO_ENABLE_STENCIL_TWO_SIDE |
 		  0);
 	OUT_BATCH(MI_NOOP);
-
-	ADVANCE_BATCH();
 }
diff --git a/src/i915_3d.h b/src/i915_3d.h
index f85780a..ab4fbb5 100644
--- a/src/i915_3d.h
+++ b/src/i915_3d.h
@@ -418,36 +418,25 @@ do {									\
  * \param x maximum number of shader commands that may be used between
  *        a FS_START and FS_END
  */
-#define FS_LOCALS(x)							\
-    uint32_t _shader_buf[(x) * 3];					\
-    unsigned int _max_shader_commands = x;				\
-    unsigned int _cur_shader_commands
+#define FS_LOCALS()							\
+    uint32_t _shader_offset
 
 #define FS_BEGIN()							\
 do {									\
-    _cur_shader_commands = 0;						\
+    _shader_offset = intel->batch_used;					\
+   intel->batch_used += 4;						\
 } while (0)
 
 #define FS_OUT(_shaderop)						\
 do {									\
-    if (_cur_shader_commands >= _max_shader_commands)			\
-	 FatalError("fragment shader command buffer exceeded (%d)\n",	\
-		    _cur_shader_commands);				\
-    _shader_buf[_cur_shader_commands * 3 + 0] = _shaderop.ui[0];	\
-    _shader_buf[_cur_shader_commands * 3 + 1] = _shaderop.ui[1];	\
-    _shader_buf[_cur_shader_commands * 3 + 2] = _shaderop.ui[2];	\
-    ++_cur_shader_commands;						\
+    OUT_BATCH(_shaderop.ui[0]);	\
+    OUT_BATCH(_shaderop.ui[1]);	\
+    OUT_BATCH(_shaderop.ui[2]);	\
 } while (0)
 
 #define FS_END()							\
 do {									\
-    int _i, _pad = (_cur_shader_commands & 0x1) ? 0 : 1;		\
-    ATOMIC_BATCH(_cur_shader_commands * 3 + 1 + _pad);			\
-    OUT_BATCH(_3DSTATE_PIXEL_SHADER_PROGRAM |				\
-	     (_cur_shader_commands * 3 - 1));				\
-    for (_i = 0; _i < _cur_shader_commands * 3; _i++)			\
-	OUT_BATCH(_shader_buf[_i]);					\
-    if (_pad != 0)							\
-	OUT_BATCH(MI_NOOP);						\
-    ADVANCE_BATCH();							\
+    *(uint32_t *)(intel->batch_ptr + _shader_offset) =			\
+	(_3DSTATE_PIXEL_SHADER_PROGRAM |				\
+	 ((intel->batch_used - _shader_offset) / 4 - 2));		\
 } while (0);
diff --git a/src/i915_render.c b/src/i915_render.c
index ed7bec5..87f1ca4 100644
--- a/src/i915_render.c
+++ b/src/i915_render.c
@@ -489,8 +489,6 @@ i915_emit_composite_primitive_constant(PixmapPtr dest,
 	intel_screen_private *intel = intel_get_screen_private(scrn);
 	float x, y;
 
-	ATOMIC_BATCH((intel->prim_offset == 0) + 6);
-
 	if (intel->prim_offset == 0) {
 		intel->prim_offset = intel->batch_used;
 		OUT_BATCH(PRIM3D_INLINE | PRIM3D_RECTLIST);
@@ -508,8 +506,6 @@ i915_emit_composite_primitive_constant(PixmapPtr dest,
 
 	OUT_BATCH_F(x);
 	OUT_BATCH_F(y);
-
-	ADVANCE_BATCH();
 }
 
 static void
@@ -523,8 +519,6 @@ i915_emit_composite_primitive_identity_source(PixmapPtr dest,
 	intel_screen_private *intel = intel_get_screen_private(scrn);
 	float dst_x, dst_y, src_x, src_y;
 
-	ATOMIC_BATCH((intel->prim_offset == 0) + 12);
-
 	if (intel->prim_offset == 0) {
 		intel->prim_offset = intel->batch_used;
 		OUT_BATCH(PRIM3D_INLINE | PRIM3D_RECTLIST);
@@ -550,8 +544,6 @@ i915_emit_composite_primitive_identity_source(PixmapPtr dest,
 	OUT_BATCH_F(dst_y);
 	OUT_BATCH_F(src_x / intel->scale_units[0][0]);
 	OUT_BATCH_F(src_y / intel->scale_units[0][1]);
-
-	ADVANCE_BATCH();
 }
 
 static void
@@ -586,8 +578,6 @@ i915_emit_composite_primitive_affine_source(PixmapPtr dest,
 					      &src_y[2]))
 		return;
 
-	ATOMIC_BATCH((intel->prim_offset == 0) + 12);
-
 	if (intel->prim_offset == 0) {
 		intel->prim_offset = intel->batch_used;
 		OUT_BATCH(PRIM3D_INLINE | PRIM3D_RECTLIST);
@@ -611,8 +601,6 @@ i915_emit_composite_primitive_affine_source(PixmapPtr dest,
 	OUT_BATCH_F(y);
 	OUT_BATCH_F(src_x[0] / intel->scale_units[0][0]);
 	OUT_BATCH_F(src_y[0] / intel->scale_units[0][1]);
-
-	ADVANCE_BATCH();
 }
 
 static void
@@ -751,8 +739,6 @@ i915_emit_composite_primitive(PixmapPtr dest,
 
 	num_floats = 3 * per_vertex;
 
-	ATOMIC_BATCH(num_floats);
-
 	intel->prim_count += num_floats;
 
 	OUT_BATCH_F(intel->dst_coord_adjust + dstX + w);
@@ -811,8 +797,6 @@ i915_emit_composite_primitive(PixmapPtr dest,
 			OUT_BATCH_F(mask_w[0]);
 		}
 	}
-
-	ADVANCE_BATCH();
 }
 
 static void i915_emit_composite_setup(ScrnInfoPtr scrn)
@@ -846,14 +830,7 @@ static void i915_emit_composite_setup(ScrnInfoPtr scrn)
 	tex_count += ! is_solid_src;
 	tex_count += mask && ! is_solid_mask;
 
-	t = 15;
-	if (tex_count)
-	    t += 6 * tex_count + 4;
-	if (is_solid_src)
-	    t += 2;
-	if (mask && is_solid_mask)
-	    t += 2;
-	ATOMIC_BATCH (t);
+	assert(intel->in_batch_atomic);
 
 	if (tex_count != 0) {
 	    OUT_BATCH(_3DSTATE_MAP_STATE | (3 * tex_count));
@@ -940,10 +917,8 @@ static void i915_emit_composite_setup(ScrnInfoPtr scrn)
 		OUT_BATCH(0x00000000);
 	}
 
-	ADVANCE_BATCH();
-
 	{
-	    FS_LOCALS(20);
+	    FS_LOCALS();
 	    int src_reg, mask_reg;
 
 	    FS_BEGIN();
@@ -1076,9 +1051,7 @@ i915_composite(PixmapPtr dest, int srcX, int srcY, int maskX, int maskY,
 
 	if (intel->prim_offset == 0) {
 		intel->prim_offset = intel->batch_used;
-		ATOMIC_BATCH(1);
 		OUT_BATCH(PRIM3D_INLINE | PRIM3D_RECTLIST);
-		ADVANCE_BATCH();
 	}
 
 	intel->prim_emit(dest,
diff --git a/src/i915_video.c b/src/i915_video.c
index 927047b..2ccd502 100644
--- a/src/i915_video.c
+++ b/src/i915_video.c
@@ -75,8 +75,6 @@ I915DisplayVideoTextured(ScrnInfoPtr scrn,
 		IntelEmitInvarientState(scrn);
 		intel->last_3d = LAST_3D_VIDEO;
 
-		ATOMIC_BATCH(20);
-
 		/* flush map & render cache */
 		OUT_BATCH(MI_FLUSH | MI_WRITE_DIRTY_STATE |
 			  MI_INVALIDATE_MAP_CACHE);
@@ -134,12 +132,10 @@ I915DisplayVideoTextured(ScrnInfoPtr scrn,
 			  BUF_3D_PITCH(intel_get_pixmap_pitch(pixmap)));
 		OUT_RELOC_PIXMAP(pixmap, I915_GEM_DOMAIN_RENDER,
 				 I915_GEM_DOMAIN_RENDER, 0);
-		ADVANCE_BATCH();
 
 		if (!is_planar_fourcc(id)) {
-			FS_LOCALS(10);
+			FS_LOCALS();
 
-			ATOMIC_BATCH(16);
 			OUT_BATCH(_3DSTATE_PIXEL_SHADER_CONSTANTS | 4);
 			OUT_BATCH(0x0000001);	/* constant 0 */
 			/* constant 0: brightness/contrast */
@@ -184,8 +180,6 @@ I915DisplayVideoTextured(ScrnInfoPtr scrn,
 			OUT_BATCH(ms3);
 			OUT_BATCH(((video_pitch / 4) - 1) << MS4_PITCH_SHIFT);
 
-			ADVANCE_BATCH();
-
 			FS_BEGIN();
 			i915_fs_dcl(FS_S0);
 			i915_fs_dcl(FS_T0);
@@ -198,9 +192,8 @@ I915DisplayVideoTextured(ScrnInfoPtr scrn,
 			}
 			FS_END();
 		} else {
-			FS_LOCALS(16);
+			FS_LOCALS();
 
-			ATOMIC_BATCH(22 + 11 + 11);
 			/* For the planar formats, we set up three samplers --
 			 * one for each plane, in a Y8 format.  Because I
 			 * couldn't get the special PLANAR_TO_PACKED
@@ -332,7 +325,6 @@ I915DisplayVideoTextured(ScrnInfoPtr scrn,
 			ms3 |= (width / 2 - 1) << MS3_WIDTH_SHIFT;
 			OUT_BATCH(ms3);
 			OUT_BATCH(((video_pitch / 4) - 1) << MS4_PITCH_SHIFT);
-			ADVANCE_BATCH();
 
 			FS_BEGIN();
 			/* Declare samplers */
@@ -389,13 +381,7 @@ I915DisplayVideoTextured(ScrnInfoPtr scrn,
 			FS_END();
 		}
 
-		{
-			ATOMIC_BATCH(2);
-			OUT_BATCH(MI_FLUSH | MI_WRITE_DIRTY_STATE |
-				  MI_INVALIDATE_MAP_CACHE);
-			OUT_BATCH(0x00000000);
-			ADVANCE_BATCH();
-		}
+		OUT_BATCH(MI_FLUSH | MI_WRITE_DIRTY_STATE | MI_INVALIDATE_MAP_CACHE);
 
 		/* Set up the offset for translating from the given region
 		 * (in screen coordinates) to the backing pixmap.
@@ -411,6 +397,7 @@ I915DisplayVideoTextured(ScrnInfoPtr scrn,
 		dxo = dstRegion->extents.x1;
 		dyo = dstRegion->extents.y1;
 
+		OUT_BATCH(PRIM3D_INLINE | PRIM3D_RECTLIST | (12 * nbox_this_time - 1));
 		while (nbox_this_time--) {
 			int box_x1 = pbox->x1;
 			int box_y1 = pbox->y1;
@@ -423,19 +410,9 @@ I915DisplayVideoTextured(ScrnInfoPtr scrn,
 			src_scale_x = ((float)src_w / width) / drw_w;
 			src_scale_y = ((float)src_h / height) / drw_h;
 
-			ATOMIC_BATCH(8 + 12);
-			OUT_BATCH(MI_NOOP);
-			OUT_BATCH(MI_NOOP);
-			OUT_BATCH(MI_NOOP);
-			OUT_BATCH(MI_NOOP);
-			OUT_BATCH(MI_NOOP);
-			OUT_BATCH(MI_NOOP);
-			OUT_BATCH(MI_NOOP);
-
 			/* vertex data - rect list consists of bottom right,
 			 * bottom left, and top left vertices.
 			 */
-			OUT_BATCH(PRIM3D_INLINE | PRIM3D_RECTLIST | (12 - 1));
 
 			/* bottom right */
 			OUT_BATCH_F(box_x2 + pix_xoff);
@@ -454,8 +431,6 @@ I915DisplayVideoTextured(ScrnInfoPtr scrn,
 			OUT_BATCH_F(box_y1 + pix_yoff);
 			OUT_BATCH_F((box_x1 - dxo) * src_scale_x);
 			OUT_BATCH_F((box_y1 - dyo) * src_scale_y);
-
-			ADVANCE_BATCH();
 		}
 
 		intel_batch_end_atomic(scrn);
diff --git a/src/i965_render.c b/src/i965_render.c
index 7355ed1..ccfc008 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -1181,12 +1181,8 @@ static void i965_emit_composite_state(ScrnInfoPtr scrn)
 	 */
 	ALIGN_BATCH(64);
 
+	assert(intel->in_batch_atomic);
 	{
-		if (IS_IGDNG(intel))
-			ATOMIC_BATCH(14);
-		else
-			ATOMIC_BATCH(12);
-
 		/* Match Mesa driver setup */
 		OUT_BATCH(MI_FLUSH |
 			  MI_STATE_INSTRUCTION_CACHE_FLUSH |
@@ -1229,12 +1225,10 @@ static void i965_emit_composite_state(ScrnInfoPtr scrn)
 		OUT_BATCH(BRW_STATE_SIP | 0);
 		OUT_RELOC(render_state->sip_kernel_bo,
 			  I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
-		ADVANCE_BATCH();
 	}
 
 	{
 		int pipe_ctrl;
-		ATOMIC_BATCH(26);
 		/* Pipe control */
 
 		if (IS_IGDNG(intel))
@@ -1329,7 +1323,6 @@ static void i965_emit_composite_state(ScrnInfoPtr scrn)
 		OUT_BATCH(BRW_CS_URB_STATE | 0);
 		OUT_BATCH(((URB_CS_ENTRY_SIZE - 1) << 4) |
 			  (URB_CS_ENTRIES << 0));
-		ADVANCE_BATCH();
 	}
 	{
 		/*
@@ -1356,7 +1349,6 @@ static void i965_emit_composite_state(ScrnInfoPtr scrn)
 		}
 
 		if (IS_IGDNG(intel)) {
-			ATOMIC_BATCH(mask ? 9 : 7);
 			/*
 			 * The reason to add this extra vertex element in the header is that
 			 * IGDNG has different vertex header definition and origin method to
@@ -1386,7 +1378,6 @@ static void i965_emit_composite_state(ScrnInfoPtr scrn)
 				  (BRW_VFCOMPONENT_STORE_0 <<
 				   VE1_VFCOMPONENT_3_SHIFT));
 		} else {
-			ATOMIC_BATCH(mask ? 7 : 5);
 			/* Set up our vertex elements, sourced from the single vertex buffer.
 			 * that will be set up later.
 			 */
@@ -1448,8 +1439,6 @@ static void i965_emit_composite_state(ScrnInfoPtr scrn)
 			else
 				OUT_BATCH((BRW_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT) | (BRW_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT) | (w_component << VE1_VFCOMPONENT_2_SHIFT) | (BRW_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT) | ((4 + 4 + 4) << VE1_DESTINATION_ELEMENT_OFFSET_SHIFT));	/* VUE offset in dwords */
 		}
-
-		ADVANCE_BATCH();
 	}
 }
 
@@ -1835,7 +1824,6 @@ i965_composite(PixmapPtr dest, int srcX, int srcY, int maskX, int maskY,
 	if (intel->needs_render_state_emit)
 		i965_emit_composite_state(scrn);
 
-	ATOMIC_BATCH(12);
 	OUT_BATCH(MI_FLUSH);
 	/* Set up the pointer to our (single) vertex buffer */
 	OUT_BATCH(BRW_3DSTATE_VERTEX_BUFFERS | 3);
@@ -1860,7 +1848,6 @@ i965_composite(PixmapPtr dest, int srcX, int srcY, int maskX, int maskY,
 	OUT_BATCH(1);		/* single instance */
 	OUT_BATCH(0);		/* start instance location */
 	OUT_BATCH(0);		/* index buffer offset, ignored */
-	ADVANCE_BATCH();
 
 	render_state->vb_offset += i;
 	drm_intel_bo_unreference(vb_bo);
diff --git a/src/i965_video.c b/src/i965_video.c
index 2675556..855f0b5 100644
--- a/src/i965_video.c
+++ b/src/i965_video.c
@@ -778,18 +778,12 @@ i965_emit_video_setup(ScrnInfoPtr scrn, drm_intel_bo * bind_bo, int n_src_surf)
 	urb_cs_start = urb_sf_start + urb_sf_size;
 	urb_cs_size = URB_CS_ENTRIES * URB_CS_ENTRY_SIZE;
 
-	ATOMIC_BATCH(2);
 	OUT_BATCH(MI_FLUSH |
 		  MI_STATE_INSTRUCTION_CACHE_FLUSH |
 		  BRW_MI_GLOBAL_SNAPSHOT_RESET);
 	OUT_BATCH(MI_NOOP);
-	ADVANCE_BATCH();
 
 	/* brw_debug (scrn, "before base address modify"); */
-	if (IS_IGDNG(intel))
-		ATOMIC_BATCH(14);
-	else
-		ATOMIC_BATCH(12);
 	/* Match Mesa driver setup */
 	if (IS_G4X(intel) || IS_IGDNG(intel))
 		OUT_BATCH(NEW_PIPELINE_SELECT | PIPELINE_SELECT_3D);
@@ -833,9 +827,6 @@ i965_emit_video_setup(ScrnInfoPtr scrn, drm_intel_bo * bind_bo, int n_src_surf)
 	OUT_RELOC(intel->video.gen4_sip_kernel_bo,
 		  I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
 
-	OUT_BATCH(MI_NOOP);
-	ADVANCE_BATCH();
-
 	/* brw_debug (scrn, "after base address modify"); */
 
 	if (IS_IGDNG(intel))
@@ -843,10 +834,6 @@ i965_emit_video_setup(ScrnInfoPtr scrn, drm_intel_bo * bind_bo, int n_src_surf)
 	else
 		pipe_ctl = BRW_PIPE_CONTROL_NOWRITE | BRW_PIPE_CONTROL_IS_FLUSH;
 
-	ATOMIC_BATCH(38);
-
-	OUT_BATCH(MI_NOOP);
-
 	/* Pipe control */
 	OUT_BATCH(BRW_PIPE_CONTROL | pipe_ctl | 2);
 	OUT_BATCH(0);		/* Destination address */
@@ -970,9 +957,6 @@ i965_emit_video_setup(ScrnInfoPtr scrn, drm_intel_bo * bind_bo, int n_src_surf)
 			   VE1_VFCOMPONENT_3_SHIFT) | (4 <<
 						       VE1_DESTINATION_ELEMENT_OFFSET_SHIFT));
 	}
-
-	OUT_BATCH(MI_NOOP);	/* pad to quadword */
-	ADVANCE_BATCH();
 }
 
 void
@@ -1219,7 +1203,6 @@ I965DisplayVideoTextured(ScrnInfoPtr scrn,
 
 		i965_emit_video_setup(scrn, bind_bo, n_src_surf);
 
-		ATOMIC_BATCH(12);
 		/* Set up the pointer to our vertex buffer */
 		OUT_BATCH(BRW_3DSTATE_VERTEX_BUFFERS | 3);
 		/* four 32-bit floats per vertex */
@@ -1241,7 +1224,6 @@ I965DisplayVideoTextured(ScrnInfoPtr scrn,
 		OUT_BATCH(0);	/* start instance location */
 		OUT_BATCH(0);	/* index buffer offset, ignored */
 		OUT_BATCH(MI_NOOP);
-		ADVANCE_BATCH();
 
 		intel_batch_end_atomic(scrn);
 
commit bc41f84e01f18548b05c670e1fd0d641adc28d0f
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue May 18 23:54:13 2010 +0100

    i915: Emit composite primitive with specialised functions.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/i830.h b/src/i830.h
index bf0fc49..0b638f2 100644
--- a/src/i830.h
+++ b/src/i830.h
@@ -360,6 +360,11 @@ typedef struct intel_screen_private {
 
 	uint32_t prim_offset;
 	uint32_t prim_count;
+	void (*prim_emit)(PixmapPtr dest,
+			  int srcX, int srcY,
+			  int maskX, int maskY,
+			  int dstX, int dstY,
+			  int w, int h);
 
 	/* 965 render acceleration state */
 	struct gen4_render_state *gen4_render_state;
diff --git a/src/i915_render.c b/src/i915_render.c
index 7c204b6..ed7bec5 100644
--- a/src/i915_render.c
+++ b/src/i915_render.c
@@ -478,6 +478,343 @@ i915_prepare_composite(int op, PicturePtr source_picture,
 	return TRUE;
 }
 
+static void
+i915_emit_composite_primitive_constant(PixmapPtr dest,
+				       int srcX, int srcY,
+				       int maskX, int maskY,
+				       int dstX, int dstY,
+				       int w, int h)
+{
+	ScrnInfoPtr scrn = xf86Screens[dest->drawable.pScreen->myNum];
+	intel_screen_private *intel = intel_get_screen_private(scrn);
+	float x, y;
+
+	ATOMIC_BATCH((intel->prim_offset == 0) + 6);
+
+	if (intel->prim_offset == 0) {
+		intel->prim_offset = intel->batch_used;
+		OUT_BATCH(PRIM3D_INLINE | PRIM3D_RECTLIST);
+	}
+	intel->prim_count += 6;
+
+	x = dstX + intel->dst_coord_adjust;
+	y = dstY + intel->dst_coord_adjust;
+
+	OUT_BATCH_F(x + w);
+	OUT_BATCH_F(y + h);
+
+	OUT_BATCH_F(x);
+	OUT_BATCH_F(y + h);
+
+	OUT_BATCH_F(x);
+	OUT_BATCH_F(y);
+
+	ADVANCE_BATCH();
+}
+
+static void
+i915_emit_composite_primitive_identity_source(PixmapPtr dest,
+					      int srcX, int srcY,
+					      int maskX, int maskY,
+					      int dstX, int dstY,
+					      int w, int h)
+{
+	ScrnInfoPtr scrn = xf86Screens[dest->drawable.pScreen->myNum];
+	intel_screen_private *intel = intel_get_screen_private(scrn);
+	float dst_x, dst_y, src_x, src_y;
+
+	ATOMIC_BATCH((intel->prim_offset == 0) + 12);
+
+	if (intel->prim_offset == 0) {
+		intel->prim_offset = intel->batch_used;
+		OUT_BATCH(PRIM3D_INLINE | PRIM3D_RECTLIST);
+	}
+	intel->prim_count += 12;
+
+	dst_x = dstX + intel->dst_coord_adjust;
+	dst_y = dstY + intel->dst_coord_adjust;
+	src_x = srcX + intel->src_coord_adjust;
+	src_y = srcY + intel->src_coord_adjust;
+
+	OUT_BATCH_F(dst_x + w);
+	OUT_BATCH_F(dst_y + h);
+	OUT_BATCH_F((src_x + w) / intel->scale_units[0][0]);
+	OUT_BATCH_F((src_y + h) / intel->scale_units[0][1]);
+
+	OUT_BATCH_F(dst_x);
+	OUT_BATCH_F(dst_y + h);
+	OUT_BATCH_F(src_x / intel->scale_units[0][0]);
+	OUT_BATCH_F((src_y + h) / intel->scale_units[0][1]);
+
+	OUT_BATCH_F(dst_x);
+	OUT_BATCH_F(dst_y);
+	OUT_BATCH_F(src_x / intel->scale_units[0][0]);
+	OUT_BATCH_F(src_y / intel->scale_units[0][1]);
+
+	ADVANCE_BATCH();
+}
+
+static void
+i915_emit_composite_primitive_affine_source(PixmapPtr dest,
+					    int srcX, int srcY,
+					    int maskX, int maskY,
+					    int dstX, int dstY,
+					    int w, int h)
+{
+	ScrnInfoPtr scrn = xf86Screens[dest->drawable.pScreen->myNum];
+	intel_screen_private *intel = intel_get_screen_private(scrn);
+	float x, y, src_x[3], src_y[3];
+
+	x = srcX + intel->src_coord_adjust;
+	y = srcY + intel->src_coord_adjust;
+
+	if (!i830_get_transformed_coordinates(x, y,
+					      intel->transform[0],
+					      &src_x[0],
+					      &src_y[0]))
+		return;
+
+	if (!i830_get_transformed_coordinates(x, y + h,
+					      intel->transform[0],
+					      &src_x[1],
+					      &src_y[1]))
+		return;
+
+	if (!i830_get_transformed_coordinates(x + w, y + h,
+					      intel->transform[0],
+					      &src_x[2],
+					      &src_y[2]))
+		return;
+
+	ATOMIC_BATCH((intel->prim_offset == 0) + 12);
+
+	if (intel->prim_offset == 0) {
+		intel->prim_offset = intel->batch_used;
+		OUT_BATCH(PRIM3D_INLINE | PRIM3D_RECTLIST);
+	}
+	intel->prim_count += 12;
+
+	x = dstX + intel->dst_coord_adjust;
+	y = dstY + intel->dst_coord_adjust;
+
+	OUT_BATCH_F(x + w);
+	OUT_BATCH_F(y + h);
+	OUT_BATCH_F(src_x[2] / intel->scale_units[0][0]);
+	OUT_BATCH_F(src_y[2] / intel->scale_units[0][1]);
+
+	OUT_BATCH_F(x);
+	OUT_BATCH_F(y + h);
+	OUT_BATCH_F(src_x[1] / intel->scale_units[0][0]);
+	OUT_BATCH_F(src_y[1] / intel->scale_units[0][1]);
+
+	OUT_BATCH_F(x);
+	OUT_BATCH_F(y);
+	OUT_BATCH_F(src_x[0] / intel->scale_units[0][0]);
+	OUT_BATCH_F(src_y[0] / intel->scale_units[0][1]);
+
+	ADVANCE_BATCH();
+}
+
+static void
+i915_emit_composite_primitive(PixmapPtr dest,
+			      int srcX, int srcY,
+			      int maskX, int maskY,
+			      int dstX, int dstY, int w, int h)
+{
+	ScrnInfoPtr scrn = xf86Screens[dest->drawable.pScreen->myNum];
+	intel_screen_private *intel = intel_get_screen_private(scrn);
+	Bool is_affine_src, is_affine_mask = TRUE;
+	int per_vertex, num_floats;
+	int tex_unit = 0;
+	int src_unit = -1, mask_unit = -1;
+	float src_x[3], src_y[3], src_w[3], mask_x[3], mask_y[3], mask_w[3];
+
+	per_vertex = 2;		/* dest x/y */
+
+	if (! intel->render_source_is_solid) {
+		float x = srcX + intel->src_coord_adjust;
+		float y = srcY + intel->src_coord_adjust;
+
+		src_unit = tex_unit++;
+
+		is_affine_src = i830_transform_is_affine(intel->transform[src_unit]);
+		if (is_affine_src) {
+			if (!i830_get_transformed_coordinates(x, y,
+							      intel->
+							      transform[src_unit],
+							      &src_x[0],
+							      &src_y[0]))
+				return;
+
+			if (!i830_get_transformed_coordinates(x, y + h,
+							      intel->
+							      transform[src_unit],
+							      &src_x[1],
+							      &src_y[1]))
+				return;
+
+			if (!i830_get_transformed_coordinates(x + w, y + h,
+							      intel->
+							      transform[src_unit],
+							      &src_x[2],
+							      &src_y[2]))
+				return;
+
+			per_vertex += 2;	/* src x/y */
+		} else {
+			if (!i830_get_transformed_coordinates_3d(x, y,
+								 intel->
+								 transform[src_unit],
+								 &src_x[0],
+								 &src_y[0],
+								 &src_w[0]))
+				return;
+
+			if (!i830_get_transformed_coordinates_3d(x, y + h,
+								 intel->
+								 transform[src_unit],
+								 &src_x[1],
+								 &src_y[1],
+								 &src_w[1]))
+				return;
+
+			if (!i830_get_transformed_coordinates_3d(x + w, y + h,
+								 intel->
+								 transform[src_unit],
+								 &src_x[2],
+								 &src_y[2],
+								 &src_w[2]))
+				return;
+
+			per_vertex += 4;	/* src x/y/z/w */
+		}
+	}
+
+	if (intel->render_mask && ! intel->render_mask_is_solid) {
+		float x = maskX + intel->mask_coord_adjust;
+		float y = maskY + intel->mask_coord_adjust;
+
+		mask_unit = tex_unit++;
+
+		is_affine_mask = i830_transform_is_affine(intel->transform[mask_unit]);
+		if (is_affine_mask) {
+			if (!i830_get_transformed_coordinates(x, y,
+							      intel->
+							      transform[mask_unit],
+							      &mask_x[0],
+							      &mask_y[0]))
+				return;
+
+			if (!i830_get_transformed_coordinates(x, y + h,
+							      intel->
+							      transform[mask_unit],
+							      &mask_x[1],
+							      &mask_y[1]))
+				return;
+
+			if (!i830_get_transformed_coordinates(x + w, y + h,
+							      intel->
+							      transform[mask_unit],
+							      &mask_x[2],
+							      &mask_y[2]))
+				return;
+
+			per_vertex += 2;	/* mask x/y */
+		} else {
+			if (!i830_get_transformed_coordinates_3d(x, y,
+								 intel->
+								 transform[mask_unit],
+								 &mask_x[0],
+								 &mask_y[0],
+								 &mask_w[0]))
+				return;
+
+			if (!i830_get_transformed_coordinates_3d(x, y + h,
+								 intel->
+								 transform[mask_unit],
+								 &mask_x[1],
+								 &mask_y[1],
+								 &mask_w[1]))
+				return;
+
+			if (!i830_get_transformed_coordinates_3d(x + w, y + h,
+								 intel->
+								 transform[mask_unit],
+								 &mask_x[2],
+								 &mask_y[2],
+								 &mask_w[2]))
+				return;
+
+			per_vertex += 4;	/* mask x/y/z/w */
+		}
+	}
+
+	num_floats = 3 * per_vertex;
+
+	ATOMIC_BATCH(num_floats);
+
+	intel->prim_count += num_floats;
+
+	OUT_BATCH_F(intel->dst_coord_adjust + dstX + w);
+	OUT_BATCH_F(intel->dst_coord_adjust + dstY + h);
+	if (! intel->render_source_is_solid) {
+	    OUT_BATCH_F(src_x[2] / intel->scale_units[src_unit][0]);
+	    OUT_BATCH_F(src_y[2] / intel->scale_units[src_unit][1]);
+	    if (!is_affine_src) {
+		OUT_BATCH_F(0.0);
+		OUT_BATCH_F(src_w[2]);
+	    }
+	}
+	if (intel->render_mask && ! intel->render_mask_is_solid) {
+		OUT_BATCH_F(mask_x[2] / intel->scale_units[mask_unit][0]);
+		OUT_BATCH_F(mask_y[2] / intel->scale_units[mask_unit][1]);
+		if (!is_affine_mask) {
+			OUT_BATCH_F(0.0);
+			OUT_BATCH_F(mask_w[2]);
+		}
+	}
+
+	OUT_BATCH_F(intel->dst_coord_adjust + dstX);
+	OUT_BATCH_F(intel->dst_coord_adjust + dstY + h);
+	if (! intel->render_source_is_solid) {
+	    OUT_BATCH_F(src_x[1] / intel->scale_units[src_unit][0]);
+	    OUT_BATCH_F(src_y[1] / intel->scale_units[src_unit][1]);
+	    if (!is_affine_src) {
+		OUT_BATCH_F(0.0);
+		OUT_BATCH_F(src_w[1]);
+	    }
+	}
+	if (intel->render_mask && ! intel->render_mask_is_solid) {
+		OUT_BATCH_F(mask_x[1] / intel->scale_units[mask_unit][0]);
+		OUT_BATCH_F(mask_y[1] / intel->scale_units[mask_unit][1]);
+		if (!is_affine_mask) {
+			OUT_BATCH_F(0.0);
+			OUT_BATCH_F(mask_w[1]);
+		}
+	}
+
+	OUT_BATCH_F(intel->dst_coord_adjust + dstX);
+	OUT_BATCH_F(intel->dst_coord_adjust + dstY);
+	if (! intel->render_source_is_solid) {
+	    OUT_BATCH_F(src_x[0] / intel->scale_units[src_unit][0]);
+	    OUT_BATCH_F(src_y[0] / intel->scale_units[src_unit][1]);
+	    if (!is_affine_src) {
+		OUT_BATCH_F(0.0);
+		OUT_BATCH_F(src_w[0]);
+	    }
+	}
+	if (intel->render_mask && ! intel->render_mask_is_solid) {
+		OUT_BATCH_F(mask_x[0] / intel->scale_units[mask_unit][0]);
+		OUT_BATCH_F(mask_y[0] / intel->scale_units[mask_unit][1]);
+		if (!is_affine_mask) {
+			OUT_BATCH_F(0.0);
+			OUT_BATCH_F(mask_w[0]);
+		}
+	}
+
+	ADVANCE_BATCH();
+}
+
 static void i915_emit_composite_setup(ScrnInfoPtr scrn)
 {
 	intel_screen_private *intel = intel_get_screen_private(scrn);
@@ -711,210 +1048,17 @@ static void i915_emit_composite_setup(ScrnInfoPtr scrn)
 
 	intel->prim_offset = 0;
 	intel->prim_count = 0;
-}
-
-/* Emit the vertices for a single composite rectangle.
- *
- * This function is no longer shared between i830 and i915 generation code.
- */
-static void
-i915_emit_composite_primitive(PixmapPtr dest,
-			      int srcX, int srcY,
-			      int maskX, int maskY,
-			      int dstX, int dstY, int w, int h)
-{
-	ScrnInfoPtr scrn = xf86Screens[dest->drawable.pScreen->myNum];
-	intel_screen_private *intel = intel_get_screen_private(scrn);
-	Bool is_affine_src, is_affine_mask = TRUE;
-	int per_vertex, num_floats;
-	int tex_unit = 0;
-	int src_unit = -1, mask_unit = -1;
-	float src_x[3], src_y[3], src_w[3], mask_x[3], mask_y[3], mask_w[3];
-
-	per_vertex = 2;		/* dest x/y */
-
-	if (! intel->render_source_is_solid) {
-		float x = srcX + intel->src_coord_adjust;
-		float y = srcY + intel->src_coord_adjust;
-
-		src_unit = tex_unit++;
-
-		is_affine_src = i830_transform_is_affine(intel->transform[src_unit]);
-		if (is_affine_src) {
-			if (!i830_get_transformed_coordinates(x, y,
-							      intel->
-							      transform[src_unit],
-							      &src_x[0],
-							      &src_y[0]))
-				return;
-
-			if (!i830_get_transformed_coordinates(x, y + h,
-							      intel->
-							      transform[src_unit],
-							      &src_x[1],
-							      &src_y[1]))
-				return;
-
-			if (!i830_get_transformed_coordinates(x + w, y + h,
-							      intel->
-							      transform[src_unit],
-							      &src_x[2],
-							      &src_y[2]))
-				return;
-
-			per_vertex += 2;	/* src x/y */
-		} else {
-			if (!i830_get_transformed_coordinates_3d(x, y,
-								 intel->
-								 transform[src_unit],
-								 &src_x[0],
-								 &src_y[0],
-								 &src_w[0]))
-				return;
-
-			if (!i830_get_transformed_coordinates_3d(x, y + h,
-								 intel->
-								 transform[src_unit],
-								 &src_x[1],
-								 &src_y[1],
-								 &src_w[1]))
-				return;
-
-			if (!i830_get_transformed_coordinates_3d(x + w, y + h,
-								 intel->
-								 transform[src_unit],
-								 &src_x[2],
-								 &src_y[2],
-								 &src_w[2]))
-				return;
-
-			per_vertex += 4;	/* src x/y/z/w */
-		}
-	}
-
-	if (intel->render_mask && ! intel->render_mask_is_solid) {
-		float x = maskX + intel->mask_coord_adjust;
-		float y = maskY + intel->mask_coord_adjust;
-
-		mask_unit = tex_unit++;
-
-		is_affine_mask = i830_transform_is_affine(intel->transform[mask_unit]);
-		if (is_affine_mask) {
-			if (!i830_get_transformed_coordinates(x, y,
-							      intel->
-							      transform[mask_unit],
-							      &mask_x[0],
-							      &mask_y[0]))
-				return;
-
-			if (!i830_get_transformed_coordinates(x, y + h,
-							      intel->
-							      transform[mask_unit],
-							      &mask_x[1],
-							      &mask_y[1]))
-				return;
-
-			if (!i830_get_transformed_coordinates(x + w, y + h,
-							      intel->
-							      transform[mask_unit],
-							      &mask_x[2],
-							      &mask_y[2]))
-				return;
-
-			per_vertex += 2;	/* mask x/y */
-		} else {
-			if (!i830_get_transformed_coordinates_3d(x, y,
-								 intel->
-								 transform[mask_unit],
-								 &mask_x[0],
-								 &mask_y[0],
-								 &mask_w[0]))
-				return;
-
-			if (!i830_get_transformed_coordinates_3d(x, y + h,
-								 intel->
-								 transform[mask_unit],
-								 &mask_x[1],
-								 &mask_y[1],
-								 &mask_w[1]))
-				return;
-
-			if (!i830_get_transformed_coordinates_3d(x + w, y + h,
-								 intel->
-								 transform[mask_unit],
-								 &mask_x[2],
-								 &mask_y[2],
-								 &mask_w[2]))
-				return;
-
-			per_vertex += 4;	/* mask x/y/z/w */
-		}
-	}
-
-	num_floats = 3 * per_vertex;
-
-	ATOMIC_BATCH(num_floats);
-
-	intel->prim_count += num_floats;
-
-	OUT_BATCH_F(intel->dst_coord_adjust + dstX + w);
-	OUT_BATCH_F(intel->dst_coord_adjust + dstY + h);
-	if (! intel->render_source_is_solid) {
-	    OUT_BATCH_F(src_x[2] / intel->scale_units[src_unit][0]);
-	    OUT_BATCH_F(src_y[2] / intel->scale_units[src_unit][1]);
-	    if (!is_affine_src) {
-		OUT_BATCH_F(0.0);
-		OUT_BATCH_F(src_w[2]);
-	    }
-	}
-	if (intel->render_mask && ! intel->render_mask_is_solid) {
-		OUT_BATCH_F(mask_x[2] / intel->scale_units[mask_unit][0]);
-		OUT_BATCH_F(mask_y[2] / intel->scale_units[mask_unit][1]);
-		if (!is_affine_mask) {
-			OUT_BATCH_F(0.0);
-			OUT_BATCH_F(mask_w[2]);
-		}
-	}
-
-	OUT_BATCH_F(intel->dst_coord_adjust + dstX);
-	OUT_BATCH_F(intel->dst_coord_adjust + dstY + h);
-	if (! intel->render_source_is_solid) {
-	    OUT_BATCH_F(src_x[1] / intel->scale_units[src_unit][0]);
-	    OUT_BATCH_F(src_y[1] / intel->scale_units[src_unit][1]);
-	    if (!is_affine_src) {
-		OUT_BATCH_F(0.0);
-		OUT_BATCH_F(src_w[1]);
-	    }
-	}
-	if (intel->render_mask && ! intel->render_mask_is_solid) {
-		OUT_BATCH_F(mask_x[1] / intel->scale_units[mask_unit][0]);
-		OUT_BATCH_F(mask_y[1] / intel->scale_units[mask_unit][1]);
-		if (!is_affine_mask) {
-			OUT_BATCH_F(0.0);
-			OUT_BATCH_F(mask_w[1]);
-		}
-	}
-
-	OUT_BATCH_F(intel->dst_coord_adjust + dstX);
-	OUT_BATCH_F(intel->dst_coord_adjust + dstY);
-	if (! intel->render_source_is_solid) {
-	    OUT_BATCH_F(src_x[0] / intel->scale_units[src_unit][0]);
-	    OUT_BATCH_F(src_y[0] / intel->scale_units[src_unit][1]);
-	    if (!is_affine_src) {
-		OUT_BATCH_F(0.0);
-		OUT_BATCH_F(src_w[0]);
-	    }
-	}
-	if (intel->render_mask && ! intel->render_mask_is_solid) {
-		OUT_BATCH_F(mask_x[0] / intel->scale_units[mask_unit][0]);
-		OUT_BATCH_F(mask_y[0] / intel->scale_units[mask_unit][1]);
-		if (!is_affine_mask) {
-			OUT_BATCH_F(0.0);
-			OUT_BATCH_F(mask_w[0]);
-		}
-	}
-
-	ADVANCE_BATCH();
+	if (!mask) {
+		if (is_solid_src)
+			intel->prim_emit = i915_emit_composite_primitive_constant;
+		else if (intel->transform[0] == NULL)
+			intel->prim_emit = i915_emit_composite_primitive_identity_source;
+		else if (i830_transform_is_affine(intel->transform[0]))
+			intel->prim_emit = i915_emit_composite_primitive_affine_source;
+		else
+			intel->prim_emit = i915_emit_composite_primitive;
+	} else
+		intel->prim_emit = i915_emit_composite_primitive;
 }
 
 void
@@ -937,8 +1081,11 @@ i915_composite(PixmapPtr dest, int srcX, int srcY, int maskX, int maskY,
 		ADVANCE_BATCH();
 	}
 
-	i915_emit_composite_primitive(dest, srcX, srcY, maskX, maskY, dstX,
-				      dstY, w, h);
+	intel->prim_emit(dest,
+			 srcX, srcY,
+			 maskX, maskY,
+			 dstX, dstY,
+			 w, h);
 
 	intel_batch_end_atomic(scrn);
 }
commit 4a3476ea094e84887fefb558e0bba023fee34151
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue May 18 23:25:59 2010 +0100

    i915: amalgamate composite into a single primitive list
    
    Combine all the calls to composite between prepare_composite and
    done_composite into a single primitive list, rather than a primitive
    call per composite().
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/i830.h b/src/i830.h
index 2c875f3..bf0fc49 100644
--- a/src/i830.h
+++ b/src/i830.h
@@ -295,6 +295,7 @@ typedef struct intel_screen_private {
 
 	CloseScreenProcPtr CloseScreen;
 
+	void (*vertex_flush) (struct intel_screen_private *intel);
 	void (*batch_flush_notify) (ScrnInfoPtr scrn);
 
 	uxa_driver_t *uxa_driver;
@@ -357,6 +358,9 @@ typedef struct intel_screen_private {
 		uint32_t dst_format;
 	} i915_render_state;
 
+	uint32_t prim_offset;
+	uint32_t prim_count;
+
 	/* 965 render acceleration state */
 	struct gen4_render_state *gen4_render_state;
 
@@ -474,6 +478,7 @@ Bool i915_prepare_composite(int op, PicturePtr sourcec, PicturePtr mask,
 			    PixmapPtr maskPixmap, PixmapPtr destPixmap);
 void i915_composite(PixmapPtr dest, int srcX, int srcY,
 		    int maskX, int maskY, int dstX, int dstY, int w, int h);
+void i915_vertex_flush(intel_screen_private *intel);
 void i915_batch_flush_notify(ScrnInfoPtr scrn);
 void i830_batch_flush_notify(ScrnInfoPtr scrn);
 /* i965_render.c */
diff --git a/src/i830_batchbuffer.c b/src/i830_batchbuffer.c
index c23b0b8..0fe81d0 100644
--- a/src/i830_batchbuffer.c
+++ b/src/i830_batchbuffer.c
@@ -165,6 +165,9 @@ void intel_batch_submit(ScrnInfoPtr scrn)
 	if (intel->batch_used == 0)
 		return;
 
+	if (intel->vertex_flush)
+		intel->vertex_flush(intel);
+
 	/* Emit a padding dword if we aren't going to be quad-word aligned. */
 	if ((intel->batch_used & 4) == 0) {
 		*(uint32_t *) (intel->batch_ptr + intel->batch_used) = MI_NOOP;
diff --git a/src/i830_driver.c b/src/i830_driver.c
index db8af06..6ec6f51 100644
--- a/src/i830_driver.c
+++ b/src/i830_driver.c
@@ -1227,11 +1227,12 @@ I830ScreenInit(int scrnIndex, ScreenPtr screen, int argc, char **argv)
 		return FALSE;
 	}
 
-	if (IS_I965G(intel))
+	if (IS_I965G(intel)) {
 		intel->batch_flush_notify = i965_batch_flush_notify;
-	else if (IS_I9XX(intel))
+	} else if (IS_I9XX(intel)) {
+		intel->vertex_flush = i915_vertex_flush;
 		intel->batch_flush_notify = i915_batch_flush_notify;
-	else
+	} else
 		intel->batch_flush_notify = i830_batch_flush_notify;
 
 	miInitializeBackingStore(screen);
diff --git a/src/i830_uxa.c b/src/i830_uxa.c
index d402895..f68ec70 100644
--- a/src/i830_uxa.c
+++ b/src/i830_uxa.c
@@ -467,6 +467,10 @@ static void i830_uxa_done_copy(PixmapPtr dest)
 void i830_done_composite(PixmapPtr dest)
 {
 	ScrnInfoPtr scrn = xf86Screens[dest->drawable.pScreen->myNum];
+	intel_screen_private *intel = intel_get_screen_private(scrn);
+
+	if (intel->vertex_flush)
+		intel->vertex_flush(intel);
 
 	i830_debug_flush(scrn);
 }
@@ -1051,6 +1055,9 @@ Bool i830_uxa_init(ScreenPtr screen)
 	intel->uxa_driver->uxa_major = 1;
 	intel->uxa_driver->uxa_minor = 0;
 
+	intel->prim_offset = 0;
+	intel->prim_count = 0;
+
 	/* Solid fill */
 	intel->uxa_driver->check_solid = i830_uxa_check_solid;
 	intel->uxa_driver->prepare_solid = i830_uxa_prepare_solid;
diff --git a/src/i915_render.c b/src/i915_render.c
index 59d9248..7c204b6 100644
--- a/src/i915_render.c
+++ b/src/i915_render.c
@@ -708,6 +708,9 @@ static void i915_emit_composite_setup(ScrnInfoPtr scrn)
 
 	    FS_END();
 	}
+
+	intel->prim_offset = 0;
+	intel->prim_count = 0;
 }
 
 /* Emit the vertices for a single composite rectangle.
@@ -850,9 +853,10 @@ i915_emit_composite_primitive(PixmapPtr dest,
 
 	num_floats = 3 * per_vertex;
 
-	ATOMIC_BATCH(1 + num_floats);
+	ATOMIC_BATCH(num_floats);
+
+	intel->prim_count += num_floats;
 
-	OUT_BATCH(PRIM3D_INLINE | PRIM3D_RECTLIST | (num_floats - 1));
 	OUT_BATCH_F(intel->dst_coord_adjust + dstX + w);
 	OUT_BATCH_F(intel->dst_coord_adjust + dstY + h);
 	if (! intel->render_source_is_solid) {
@@ -926,13 +930,30 @@ i915_composite(PixmapPtr dest, int srcX, int srcY, int maskX, int maskY,
 	if (intel->needs_render_state_emit)
 		i915_emit_composite_setup(scrn);
 
+	if (intel->prim_offset == 0) {
+		intel->prim_offset = intel->batch_used;
+		ATOMIC_BATCH(1);
+		OUT_BATCH(PRIM3D_INLINE | PRIM3D_RECTLIST);
+		ADVANCE_BATCH();
+	}
+
 	i915_emit_composite_primitive(dest, srcX, srcY, maskX, maskY, dstX,
 				      dstY, w, h);
 
 	intel_batch_end_atomic(scrn);
 }
 
-void i915_batch_flush_notify(ScrnInfoPtr scrn)
+void
+i915_vertex_flush(intel_screen_private *intel)
+{
+	if (intel->prim_offset) {
+		*(uint32_t *) (intel->batch_ptr + intel->prim_offset) |= intel->prim_count - 1;
+		intel->prim_offset = 0;
+	}
+}
+
+void
+i915_batch_flush_notify(ScrnInfoPtr scrn)
 {
 	intel_screen_private *intel = intel_get_screen_private(scrn);
 
commit e5c971e7639095d38da3518a5dc404b708d45cfb
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue May 18 22:16:17 2010 +0100

    uxa: Spans! OMG!
    
    Use composite rather than solid blits in order to bring performance on
    a par with the CPU when using GEM and relocations.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/uxa/uxa-accel.c b/uxa/uxa-accel.c
index cfc2d38..0528d79 100644
--- a/uxa/uxa-accel.c
+++ b/uxa/uxa-accel.c
@@ -34,33 +34,181 @@
 #include <X11/fonts/fontstruct.h>
 #include "dixfontstr.h"
 #include "uxa.h"
+#include "mipict.h"
+
+static CARD32
+format_for_depth(int depth)
+{
+	switch (depth) {
+	case 1: return PICT_a1;
+	case 8: return PICT_a8;
+	case 15: return PICT_x1r5g5b5;
+	case 16: return PICT_r5g6b5;
+	default:
+	case 24: return PICT_x8r8g8b8;
+	case 32: return PICT_a8r8g8b8;
+	}
+}
 
 static void
 uxa_fill_spans(DrawablePtr pDrawable, GCPtr pGC, int n,
 	       DDXPointPtr ppt, int *pwidth, int fSorted)
 {
-	ScreenPtr pScreen = pDrawable->pScreen;
-	uxa_screen_t *uxa_screen = uxa_get_screen(pScreen);
+	ScreenPtr screen = pDrawable->pScreen;
+	uxa_screen_t *uxa_screen = uxa_get_screen(screen);
 	RegionPtr pClip = fbGetCompositeClip(pGC);
-	PixmapPtr pPixmap = uxa_get_drawable_pixmap(pDrawable);
+	PixmapPtr dst_pixmap, src_pixmap = NULL;
 	BoxPtr pextent, pbox;
 	int nbox;
 	int extentX1, extentX2, extentY1, extentY2;
 	int fullX1, fullX2, fullY1;
 	int partX1, partX2;
 	int off_x, off_y;
+	xRenderColor color;
+	PictFormatPtr format;
+	PicturePtr dst, src;
+	int error;
 
-	if (uxa_screen->swappedOut || pGC->fillStyle != FillSolid ||
-	    !(pPixmap = uxa_get_offscreen_pixmap(pDrawable, &off_x, &off_y)) ||
-	    !(*uxa_screen->info->prepare_solid) (pPixmap,
+	if (uxa_screen->swappedOut)
+		goto fallback;
+
+	if (pGC->fillStyle != FillSolid)
+		goto fallback;
+
+	dst_pixmap = uxa_get_offscreen_pixmap(pDrawable, &off_x, &off_y);
+	if (!dst_pixmap)
+		goto fallback;
+
+	if (pGC->alu != GXcopy || pGC->planemask != FB_ALLONES)
+		goto solid;
+
+	format = PictureMatchFormat(screen,
+				    dst_pixmap->drawable.depth,
+				    format_for_depth(dst_pixmap->drawable.depth));
+	dst = CreatePicture(0, &dst_pixmap->drawable, format, 0, 0, serverClient, &error);
+	if (!dst)
+		goto solid;
+
+	ValidatePicture(dst);
+
+	uxa_get_rgba_from_pixel(pGC->fgPixel,
+				&color.red,
+				&color.green,
+				&color.blue,
+				&color.alpha,
+				format_for_depth(dst_pixmap->drawable.depth));
+	src = CreateSolidPicture(0, &color, &error);
+	if (!src) {
+		FreePicture(dst, 0);
+		goto solid;
+	}
+
+	if (!uxa_screen->info->check_composite(PictOpSrc, src, NULL, dst)) {
+		FreePicture(src, 0);
+		FreePicture(dst, 0);
+		goto solid;
+	}
+
+	if (!uxa_screen->info->check_composite_texture ||
+	    !uxa_screen->info->check_composite_texture(screen, src)) {
+		PicturePtr solid;
+		int src_off_x, src_off_y;
+
+		solid = uxa_acquire_solid(screen, src->pSourcePict);
+		FreePicture(src, 0);
+
+		src = solid;
+		src_pixmap = uxa_get_offscreen_pixmap(src->pDrawable,
+						      &src_off_x, &src_off_y);
+		if (!src_pixmap) {
+			FreePicture(src, 0);
+			FreePicture(dst, 0);
+			goto solid;
+		}
+	}
+
+	if (!uxa_screen->info->prepare_composite(PictOpSrc, src, NULL, dst, src_pixmap, NULL, dst_pixmap)) {
+		FreePicture(src, 0);
+		FreePicture(dst, 0);
+		goto solid;
+	}
+
+	pextent = REGION_EXTENTS(pGC->screen, pClip);
+	extentX1 = pextent->x1;
+	extentY1 = pextent->y1;
+	extentX2 = pextent->x2;
+	extentY2 = pextent->y2;
+	while (n--) {
+		fullX1 = ppt->x;
+		fullY1 = ppt->y;
+		fullX2 = fullX1 + (int)*pwidth;
+		ppt++;
+		pwidth++;
+
+		if (fullY1 < extentY1 || extentY2 <= fullY1)
+			continue;
+
+		if (fullX1 < extentX1)
+			fullX1 = extentX1;
+
+		if (fullX2 > extentX2)
+			fullX2 = extentX2;
+
+		if (fullX1 >= fullX2)
+			continue;
+
+		nbox = REGION_NUM_RECTS(pClip);
+		if (nbox == 1) {
+			uxa_screen->info->composite(dst_pixmap,
+						    0, 0, 0, 0,
+						    fullX1 + off_x,
+						    fullY1 + off_y,
+						    fullX2 - fullX1, 1);
+		} else {
+			pbox = REGION_RECTS(pClip);
+			while (nbox--) {
+				if (pbox->y2 >= fullY1)
+					break;
+
+				if (pbox->y1 <= fullY1) {
+					partX1 = pbox->x1;
+					if (partX1 < fullX1)
+						partX1 = fullX1;
+
+					partX2 = pbox->x2;
+					if (partX2 > fullX2)
+						partX2 = fullX2;
+
+					if (partX2 > partX1) {
+						uxa_screen->info->composite(dst_pixmap,
+									    0, 0, 0, 0,
+									    partX1 + off_x,
+									    fullY1 + off_y,
+									    partX2 - partX1, 1);
+					}
+				}
+				pbox++;
+			}
+		}
+	}
+
+	uxa_screen->info->done_composite(dst_pixmap);
+	FreePicture(src, 0);
+	FreePicture(dst, 0);
+	return;
+
+solid:
+	if (uxa_screen->info->check_solid &&
+	    !uxa_screen->info->check_solid(pDrawable, pGC->alu, pGC->planemask))
+		goto fallback;
+
+	if (!(*uxa_screen->info->prepare_solid) (dst_pixmap,
 						 pGC->alu,
 						 pGC->planemask,
-						 pGC->fgPixel)) {
-		uxa_check_fill_spans(pDrawable, pGC, n, ppt, pwidth, fSorted);
-		return;
-	}
+						 pGC->fgPixel))
+		goto fallback;
 
-	pextent = REGION_EXTENTS(pGC->pScreen, pClip);
+	pextent = REGION_EXTENTS(pGC->screen, pClip);
 	extentX1 = pextent->x1;
 	extentY1 = pextent->y1;
 	extentX2 = pextent->x2;
@@ -86,7 +234,7 @@ uxa_fill_spans(DrawablePtr pDrawable, GCPtr pGC, int n,
 
 		nbox = REGION_NUM_RECTS(pClip);
 		if (nbox == 1) {
-			(*uxa_screen->info->solid) (pPixmap,
+			(*uxa_screen->info->solid) (dst_pixmap,
 						    fullX1 + off_x,
 						    fullY1 + off_y,
 						    fullX2 + off_x,
@@ -103,7 +251,7 @@ uxa_fill_spans(DrawablePtr pDrawable, GCPtr pGC, int n,
 						partX2 = fullX2;
 					if (partX2 > partX1) {
 						(*uxa_screen->info->
-						 solid) (pPixmap,
+						 solid) (dst_pixmap,
 							 partX1 + off_x,
 							 fullY1 + off_y,
 							 partX2 + off_x,
@@ -114,7 +262,12 @@ uxa_fill_spans(DrawablePtr pDrawable, GCPtr pGC, int n,
 			}
 		}
 	}
-	(*uxa_screen->info->done_solid) (pPixmap);
+	(*uxa_screen->info->done_solid) (dst_pixmap);
+
+	return;
+
+fallback:
+	uxa_check_fill_spans(pDrawable, pGC, n, ppt, pwidth, fSorted);
 }
 
 static Bool
diff --git a/uxa/uxa-priv.h b/uxa/uxa-priv.h
index a4763b4..8ff2c9c 100644
--- a/uxa/uxa-priv.h
+++ b/uxa/uxa-priv.h
@@ -434,6 +434,17 @@ uxa_triangles(CARD8 op, PicturePtr pSrc, PicturePtr pDst,
 	      PictFormatPtr maskFormat, INT16 xSrc, INT16 ySrc,
 	      int ntri, xTriangle * tris);
 
+PicturePtr
+uxa_acquire_solid(ScreenPtr screen, SourcePict *source);
+
+Bool
+uxa_get_rgba_from_pixel(CARD32 pixel,
+			CARD16 * red,
+			CARD16 * green,
+			CARD16 * blue,
+			CARD16 * alpha,
+			CARD32 format);
+
 /* uxa_glyph.c */
 void uxa_glyphs_init(ScreenPtr pScreen);
 
diff --git a/uxa/uxa-render.c b/uxa/uxa-render.c
index 726079a..68e3651 100644
--- a/uxa/uxa-render.c
+++ b/uxa/uxa-render.c
@@ -251,7 +251,7 @@ uxa_get_pixel_from_rgba(CARD32 * pixel,
 	return TRUE;
 }
 
-static Bool
+Bool
 uxa_get_rgba_from_pixel(CARD32 pixel,
 			CARD16 * red,
 			CARD16 * green,
@@ -607,7 +607,7 @@ uxa_solid_clear(ScreenPtr screen)
 	return picture;
 }
 
-static PicturePtr
+PicturePtr
 uxa_acquire_solid(ScreenPtr screen, SourcePict *source)
 {
 	uxa_screen_t *uxa_screen = uxa_get_screen(screen);
@@ -974,17 +974,13 @@ uxa_solid_rects (CARD8		op,
 		    !uxa_screen->info->check_solid(&dst_pixmap->drawable, GXcopy, FB_ALLONES))
 			goto err_region;
 
-		if (op == PictOpClear) {
-			pixel = 0;
-		} else {
-			if (!uxa_get_pixel_from_rgba(&pixel,
-						     color->red,
-						     color->green,
-						     color->blue,
-						     color->alpha,
-						     dst->format))
-				goto err_region;
-		}
+		if (!uxa_get_pixel_from_rgba(&pixel,
+					     color->red,
+					     color->green,
+					     color->blue,
+					     color->alpha,
+					     dst->format))
+			goto err_region;
 
 		if (!uxa_screen->info->prepare_solid(dst_pixmap, GXcopy, FB_ALLONES, pixel))
 			goto err_region;


More information about the xorg-commit mailing list