xf86-video-intel: 2 commits - src/sna/gen6_render.c src/sna/sna_accel.c src/sna/sna_dri.c

Chris Wilson ickle at kemper.freedesktop.org
Wed Aug 31 16:02:35 PDT 2011


 src/sna/gen6_render.c |   10 ++++++----
 src/sna/sna_accel.c   |    5 +++++
 src/sna/sna_dri.c     |   16 ++++++++++++++++
 3 files changed, 27 insertions(+), 4 deletions(-)

New commits:
commit 9a563ea03b6ad87d41bc091c5819e6c462100450
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Aug 31 23:29:00 2011 +0100

    sna: Use the shadow buffer for PutImage
    
    This is optimising for the x11perf putimage benchmark but,
    nevertheless, uploading the PutImage data directly into the uncached
    scanout is between 2x and 20x slower than making a temporary copy in
    the shadow buffer and doing a deferred update. Most of the overhead
    is in the kernel, and should be addressed there (rather than worked
    around); a portion is due to the overdraw in the benchmark (which is
    not likely to be realistic, but then again neither is the use of
    PutImage!).
    
    The argument for uploading in place when possible is that a buffer
    which already exists on the GPU is likely to be used again by the
    GPU, and so we will have to upload the pixels at some point anyway.
    Deferring that upload incurs an extra copy. The putimage benchmark,
    however, never actually uses the pixel data, and so that extra cost
    is never measured.
    
    Reported-by: Michael Larabel <Michael at phoronix.com>
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
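
A minimal sketch of the idea described above, for illustration only: copy
the client's pixels into a cached CPU shadow and record the dirty box for
a single deferred upload. The names and types here (shadow_pixmap,
damage_add, put_image_via_shadow) are hypothetical stand-ins, not sna's
actual API:

    #include <stdint.h>
    #include <string.h>

    /* Illustrative types only; not sna's real structures. */
    struct damage { int x1, y1, x2, y2; };

    struct shadow_pixmap {
        uint8_t *ptr;         /* cached CPU copy of the framebuffer */
        int stride;           /* bytes per row */
        int cpp;              /* bytes per pixel */
        struct damage dirty;  /* region pending the deferred upload */
    };

    static void damage_add(struct damage *d, int x, int y, int w, int h)
    {
        if (x < d->x1) d->x1 = x;
        if (y < d->y1) d->y1 = y;
        if (x + w > d->x2) d->x2 = x + w;
        if (y + h > d->y2) d->y2 = y + h;
    }

    static void
    put_image_via_shadow(struct shadow_pixmap *shadow,
                         const uint8_t *bits, int src_stride,
                         int x, int y, int w, int h)
    {
        uint8_t *dst = shadow->ptr + y * shadow->stride + x * shadow->cpp;
        int row;

        /* Writes land in cached system memory, avoiding the uncached
         * GTT page faults in the kernel that make the inplace upload
         * so slow. */
        for (row = 0; row < h; row++) {
            memcpy(dst, bits, (size_t)w * shadow->cpp);
            dst += shadow->stride;
            bits += src_stride;
        }

        /* Defer the real upload: repeated PutImages over the same area
         * (the benchmark's overdraw) coalesce into a single copy. */
        damage_add(&shadow->dirty, x, y, w, h);
    }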

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 7edff77..41da573 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -994,6 +994,10 @@ sna_put_image_blt(DrawablePtr drawable, GCPtr gc, RegionPtr region,
 	if (gc->alu != GXcopy)
 		return false;
 
+	/* XXX performing the upload inplace is currently about 20x slower
+	 * for putimage10 on gen6 -- mostly due to slow page faulting in kernel.
+	 */
+#if 0
 	if (priv->gpu_bo->rq == NULL &&
 	    sna_put_image_upload_blt(drawable, gc, region,
 				     x, y, w, h, bits, stride)) {
@@ -1010,6 +1014,7 @@ sna_put_image_blt(DrawablePtr drawable, GCPtr gc, RegionPtr region,
 
 		return true;
 	}
+#endif
 
 	if (priv->cpu_bo)
 		kgem_bo_sync(&sna->kgem, priv->cpu_bo, true);
commit 32fc0c896e0dfd06617c12beda1ccacedf69fb4a
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Wed Aug 31 23:21:54 2011 +0100

    sna/gen6: Prefer the BLT ring, except for copies on behalf of DRI
    
    As demonstrated by the all-important trap300, using the BLT ring is
    2x faster than the RENDER ring for the simple case of solid fills.
    (Though note that performing the relocations costs 3x as much CPU for
    that 2x GPU performance.) One case that may regress from this change
    is copywinpix, which should benefit from the batching of RENDER
    commands and might warrant revisiting in the future (with realistic
    and synthetic benchmarks in hand!).
    
    However, due to the forced stall when switching rings, we still want to
    perform RENDER copies on behalf of DRI clients and before page-flips.
    
    Checking against cairo-perf-trace indicated no major impact -- I had
    worried that setting the BLT flag for some clears might have a
    knock-on effect, causing too many operations that could have been
    pipelined on the RENDER ring to be sent to the BLT ring instead.
    
    Reported-by: Michael Larabel <Michael at phoronix.com>
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
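
To make the new predicate concrete, here is a hedged sketch of the
heuristic the hunks below switch to; the enum mirrors the kgem modes as
they appear in the diff, and prefer_blt() is an invented name:

    /* Sketch only; kgem's real mode enum may differ in detail. */
    enum kgem_mode { KGEM_NONE = 0, KGEM_RENDER, KGEM_BLT };

    /* Before this commit the BLT fast path was taken only when the
     * batch was already on the BLT ring (mode == KGEM_BLT). Now it
     * is taken whenever we are not committed to the RENDER ring, so
     * an idle batch (KGEM_NONE) starts on the 2x-faster BLT, while
     * work already queued on RENDER stays there to avoid the
     * ring-switch stall. */
    static int prefer_blt(enum kgem_mode mode)
    {
        return mode != KGEM_RENDER;
    }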

diff --git a/src/sna/gen6_render.c b/src/sna/gen6_render.c
index 5641b3c..eb67fc6 100644
--- a/src/sna/gen6_render.c
+++ b/src/sna/gen6_render.c
@@ -2307,7 +2307,8 @@ gen6_render_copy_boxes(struct sna *sna, uint8_t alu,
 	     __FUNCTION__, src_dx, src_dy, dst_dx, dst_dy, n, alu,
 	     src_bo == dst_bo));
 
-	if (sna->kgem.mode == KGEM_BLT &&
+	/* XXX benchmark me! */
+	if (sna->kgem.mode != KGEM_RENDER &&
 	    sna_blt_compare_depth(&src->drawable, &dst->drawable) &&
 	    sna_blt_copy_boxes(sna, alu,
 			       src_bo, src_dx, src_dy,
@@ -2464,7 +2465,8 @@ gen6_render_copy(struct sna *sna, uint8_t alu,
 	     src->drawable.width, src->drawable.height,
 	     dst->drawable.width, dst->drawable.height));
 
-	if (sna->kgem.mode == KGEM_BLT &&
+	/* XXX benchmark me! */
+	if (sna->kgem.mode != KGEM_RENDER &&
 	    sna_blt_compare_depth(&src->drawable, &dst->drawable) &&
 	    sna_blt_copy(sna, alu,
 			 src_bo, dst_bo,
@@ -2577,7 +2579,7 @@ gen6_render_fill_boxes(struct sna *sna,
 		return FALSE;
 	}
 
-	if (sna->kgem.mode == KGEM_BLT ||
+	if (sna->kgem.mode != KGEM_RENDER ||
 	    dst->drawable.width > 8192 ||
 	    dst->drawable.height > 8192 ||
 	    !gen6_check_dst_format(format)) {
@@ -2734,7 +2736,7 @@ gen6_render_fill(struct sna *sna, uint8_t alu,
 			    op);
 #endif
 
-	if (sna->kgem.mode == KGEM_BLT &&
+	if (sna->kgem.mode != KGEM_RENDER &&
 	    sna_blt_fill(sna, alu,
 			 dst_bo, dst->drawable.bitsPerPixel,
 			 color,
diff --git a/src/sna/sna_dri.c b/src/sna/sna_dri.c
index 0a01f8a..f4049f1 100644
--- a/src/sna/sna_dri.c
+++ b/src/sna/sna_dri.c
@@ -461,6 +461,22 @@ sna_dri_copy(struct sna *sna, DrawablePtr draw, RegionPtr region,
 		get_drawable_deltas(draw, dst, &dx, &dy);
 	}
 
+	if (sna->kgem.gen >= 60) {
+		/* Sandybridge introduced a separate ring which it uses to
+		 * perform blits. Switching rendering between rings incurs
+		 * a stall as we wait upon the old ring to finish and
+		 * flush its render cache before we can proceed on with
+		 * the operation on the new ring.
+		 *
+		 * As this buffer, we presume, has just been written to by
+		 * the DRI client using the RENDER ring, we want to perform
+		 * our operation on the same ring, and ideally on the same
+		 * ring as we will flip from (which should be the RENDER ring
+		 * as well).
+		 */
+		kgem_set_mode(&sna->kgem, KGEM_RENDER);
+	}
+
 	if (region) {
 		boxes = REGION_RECTS(region);
 		n = REGION_NUM_RECTS(region);
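
As a rough model (not kgem's actual implementation) of why the hunk above
requests KGEM_RENDER before copying: switching rings forces the pending
batch to be submitted and stalled upon, whereas staying on the ring the
DRI client rendered with, and that the page-flip will read from, costs
nothing:

    /* Hypothetical model of ring selection. */
    enum ring { RING_NONE, RING_RENDER, RING_BLT };

    struct kgem_model { enum ring mode; };

    static void flush_and_stall(struct kgem_model *kgem)
    {
        /* Submit the batch on the old ring and wait for its render
         * cache to be flushed before the new ring may proceed. */
        kgem->mode = RING_NONE;
    }

    static void set_mode_model(struct kgem_model *kgem, enum ring ring)
    {
        if (kgem->mode != RING_NONE && kgem->mode != ring)
            flush_and_stall(kgem);  /* the stall we want to avoid */
        kgem->mode = ring;
    }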

