xf86-video-intel: 4 commits - src/intel_dri.c src/sna/blt.c src/sna/compiler.h src/sna/gen4_render.c src/sna/kgem.c src/sna/kgem.h src/sna/sna_dri.c src/sna/sna_io.c src/sna/sna_video_textured.c

Tue Apr 2 03:00:51 PDT 2013

src/intel_dri.c              |   14 ++++++++-
 src/sna/blt.c                |    4 +-
 src/sna/compiler.h           |    6 ++++
 src/sna/gen4_render.c        |   62 ++++++++++++++++++++++++++-----------------
 src/sna/kgem.c               |   29 +++++++++++++-------
 src/sna/kgem.h               |    2 +
 src/sna/sna_dri.c            |   21 ++++++++++++--
 src/sna/sna_io.c             |   25 ++++-------------
 src/sna/sna_video_textured.c |    5 +++
 9 files changed, 108 insertions(+), 60 deletions(-)

New commits:
commit 3d7e16addb2fb5f35936aafe8e16685a91d30f59
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Apr 2 10:58:52 2013 +0100

    sna/gen4: Break the Video rendering loop into 16 rectangle chunks
    
    If we feed more than 16 rectangles into the video rendering pipeline,
    the GPU goes crazy and starts emitting corruption. Lalalala.
    
    Bugzilla: https://bugs.launchpad.net/ubuntu/+source/xserver-xorg-video-intel/+bug/1162046
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/gen4_render.c b/src/sna/gen4_render.c
index 1bf5ad2..c05b37b 100644
--- a/src/sna/gen4_render.c
+++ b/src/sna/gen4_render.c
@@ -1387,37 +1387,51 @@ gen4_render_video(struct sna *sna,
 
 	box = REGION_RECTS(dstRegion);
 	nbox = REGION_NUM_RECTS(dstRegion);
-	while (nbox--) {
-		BoxRec r;
+	do {
+		int n;
 
-		r.x1 = box->x1 + pix_xoff;
-		r.x2 = box->x2 + pix_xoff;
-		r.y1 = box->y1 + pix_yoff;
-		r.y2 = box->y2 + pix_yoff;
+		n = gen4_get_rectangles(sna, &tmp, min(nbox, 16),
+					gen4_video_bind_surfaces);
+		ErrorF("n=%d/%d\n", n, nbox);
+		assert(n);
+		nbox -= n;
 
-		gen4_get_rectangles(sna, &tmp, 1, gen4_video_bind_surfaces);
+		do {
+			BoxRec r;
 
-		OUT_VERTEX(r.x2, r.y2);
-		OUT_VERTEX_F(box->x2 * src_scale_x + src_offset_x);
-		OUT_VERTEX_F(box->y2 * src_scale_y + src_offset_y);
+			r.x1 = box->x1 + pix_xoff;
+			r.x2 = box->x2 + pix_xoff;
+			r.y1 = box->y1 + pix_yoff;
+			r.y2 = box->y2 + pix_yoff;
 
-		OUT_VERTEX(r.x1, r.y2);
-		OUT_VERTEX_F(box->x1 * src_scale_x + src_offset_x);
-		OUT_VERTEX_F(box->y2 * src_scale_y + src_offset_y);
+			OUT_VERTEX(r.x2, r.y2);
+			OUT_VERTEX_F(box->x2 * src_scale_x + src_offset_x);
+			OUT_VERTEX_F(box->y2 * src_scale_y + src_offset_y);
 
-		OUT_VERTEX(r.x1, r.y1);
-		OUT_VERTEX_F(box->x1 * src_scale_x + src_offset_x);
-		OUT_VERTEX_F(box->y1 * src_scale_y + src_offset_y);
+			OUT_VERTEX(r.x1, r.y2);
+			OUT_VERTEX_F(box->x1 * src_scale_x + src_offset_x);
+			OUT_VERTEX_F(box->y2 * src_scale_y + src_offset_y);
 
-		if (!DAMAGE_IS_ALL(priv->gpu_damage)) {
-			sna_damage_add_box(&priv->gpu_damage, &r);
-			sna_damage_subtract_box(&priv->cpu_damage, &r);
-		}
-		box++;
-	}
-	priv->clear = false;
+			OUT_VERTEX(r.x1, r.y1);
+			OUT_VERTEX_F(box->x1 * src_scale_x + src_offset_x);
+			OUT_VERTEX_F(box->y1 * src_scale_y + src_offset_y);
 
-	gen4_vertex_flush(sna);
+			if (!DAMAGE_IS_ALL(priv->gpu_damage)) {
+				sna_damage_add_box(&priv->gpu_damage, &r);
+				sna_damage_subtract_box(&priv->cpu_damage, &r);
+			}
+			box++;
+		} while (--n);
+
+		gen4_vertex_flush(sna);
+		if (!nbox)
+			break;
+
+		/* VUE corruption strikes again */
+		OUT_BATCH(MI_FLUSH | MI_INHIBIT_RENDER_CACHE_FLUSH);
+	} while (1);
+
+	priv->clear = false;
 	return true;
 }
 
diff --git a/src/sna/sna_video_textured.c b/src/sna/sna_video_textured.c
index bd20325..d94dbd8 100644
--- a/src/sna/sna_video_textured.c
+++ b/src/sna/sna_video_textured.c
@@ -230,6 +230,11 @@ sna_video_textured_put_image(ScrnInfoPtr scrn,
 	     drw_x, drw_y, drw_w, drw_h,
 	     id, width, height, sync));
 
+	DBG(("%s: region %d:(%d, %d), (%d, %d)\n", __FUNCTION__,
+	     RegionNumRects(clip),
+	     clip->extents.x1, clip->extents.y1,
+	     clip->extents.x2, clip->extents.y2));
+
 	if (buf == 0) {
 		DBG(("%s: garbage video buffer\n", __FUNCTION__));
 		return BadAlloc;
commit f09aa788d79d36688bcfdd3b49b92367590c5f16
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Apr 2 10:01:21 2013 +0100

    DRI2GetMSC: Do not send a bogus UST when the drawable is not displayed
    
    According to the OpenGL GLX_OML_sync_control spec, the Unadjusted System
    Time (or UST) is a 64-bit monotonically increasing counter that is
    available throughout the system:
    http://www.opengl.org/registry/specs/OML/glx_sync_control.txt
    
    Therefore, sending 0, even in this corner case, is out of spec. However,
    we cannot just return FALSE here, as that triggers a BadDrawable error
    being sent to the client, which, as is often the case, mishandles it.
    This results in a certain compositor terminating, for example.
    
    As an alternative we can use the monotonic system timestamp which in
    theory should also be monotonic with the previous and subsequent vblank
    times.
    
    Based on a patch by Daniel Kurtz.
    
    Reported-by: Daniel Kurtz <djkurtz at chromium.org>
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/intel_dri.c b/src/intel_dri.c
index f351203..8f27921 100644
--- a/src/intel_dri.c
+++ b/src/intel_dri.c
@@ -1326,6 +1326,16 @@ blit_fallback:
 	return TRUE;
 }
 
+static uint64_t gettime_us(void)
+{
+	struct timespec tv;
+
+	if (clock_gettime(CLOCK_MONOTONIC, &tv))
+		return 0;
+
+	return (uint64_t)tv.tv_sec * 1000000 + tv.tv_nsec / 1000;
+}
+
 /*
  * Get current frame count and frame count timestamp, based on drawable's
  * crtc.
@@ -1339,9 +1349,9 @@ I830DRI2GetMSC(DrawablePtr draw, CARD64 *ust, CARD64 *msc)
 	drmVBlank vbl;
 	int ret, pipe = I830DRI2DrawablePipe(draw);
 
-	/* Drawable not displayed, make up a value */
+	/* Drawable not displayed, make up a *monotonic* value */
 	if (pipe == -1) {
-		*ust = 0;
+		*ust = gettime_us();
 		*msc = 0;
 		return TRUE;
 	}
diff --git a/src/sna/sna_dri.c b/src/sna/sna_dri.c
index 0962e25..5fb1662 100644
--- a/src/sna/sna_dri.c
+++ b/src/sna/sna_dri.c
@@ -37,6 +37,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #endif
 
 #include <errno.h>
+#include <time.h>
 #include <string.h>
 
 #include "sna.h"
@@ -2216,6 +2217,16 @@ sna_dri_async_swap(ClientPtr client, DrawablePtr draw,
 }
 #endif
 
+static uint64_t gettime_us(void)
+{
+	struct timespec tv;
+
+	if (clock_gettime(CLOCK_MONOTONIC, &tv))
+		return 0;
+
+	return (uint64_t)tv.tv_sec * 1000000 + tv.tv_nsec / 1000;
+}
+
 /*
  * Get current frame count and frame count timestamp, based on drawable's
  * crtc.
@@ -2227,13 +2238,16 @@ sna_dri_get_msc(DrawablePtr draw, CARD64 *ust, CARD64 *msc)
 	drmVBlank vbl;
 	int pipe;
 
-	/* Drawable not displayed, make up a value */
-	*ust = *msc = 0;
 
 	pipe = sna_dri_get_pipe(draw);
 	DBG(("%s(pipe=%d)\n", __FUNCTION__, pipe));
-	if (pipe == -1)
+	if (pipe == -1) {
+fail:
+		/* Drawable not displayed, make up a *monotonic* value */
+		*ust = gettime_us();
+		*msc = 0;
 		return TRUE;
+	}
 
 	VG_CLEAR(vbl);
 	vbl.request.type = DRM_VBLANK_RELATIVE | pipe_select(pipe);
@@ -2246,6 +2260,7 @@ sna_dri_get_msc(DrawablePtr draw, CARD64 *ust, CARD64 *msc)
 	} else {
 		DBG(("%s: query failed on pipe %d, ret=%d\n",
 		     __FUNCTION__, pipe, errno));
+		goto fail;
 	}
 
 	return TRUE;
commit 4af622edfc18af523e1fa9063379f68374e19b04
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon Apr 1 22:44:13 2013 +0100

    sna: Try to eliminate pending operations to the bo being replaced
    
    When we are replacing a bo with fresh data, we can drop pending
    operations to it and thereby reduce the complexity of the replacement.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 231dc8e..9013e68 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -1732,6 +1732,23 @@ search_snoop_cache(struct kgem *kgem, unsigned int num_pages, unsigned flags)
 	return NULL;
 }
 
+void kgem_bo_undo(struct kgem *kgem, struct kgem_bo *bo)
+{
+	if (kgem->nexec != 1 || bo->exec == NULL)
+		return;
+
+	DBG(("%s: only handle in batch, discarding last operations\n",
+	     __FUNCTION__));
+
+	assert(bo->exec == &kgem->exec[0]);
+	assert(kgem->exec[0].handle == bo->handle);
+	assert(RQ(bo->rq) == kgem->next_request);
+
+	bo->refcnt++;
+	kgem_reset(kgem);
+	bo->refcnt--;
+}
+
 static void __kgem_bo_destroy(struct kgem *kgem, struct kgem_bo *bo)
 {
 	DBG(("%s: handle=%d\n", __FUNCTION__, bo->handle));
@@ -1782,16 +1799,8 @@ static void __kgem_bo_destroy(struct kgem *kgem, struct kgem_bo *bo)
 	assert(bo->io == false);
 	assert(bo->scanout == false);
 
-	if (bo->exec && kgem->nexec == 1) {
-		DBG(("%s: only handle in batch, discarding last operations\n",
-		     __FUNCTION__));
-		assert(bo->exec == &kgem->exec[0]);
-		assert(kgem->exec[0].handle == bo->handle);
-		assert(RQ(bo->rq) == kgem->next_request);
-		bo->refcnt = 1;
-		kgem_reset(kgem);
-		bo->refcnt = 0;
-	}
+	kgem_bo_undo(kgem, bo);
+	assert(bo->refcnt == 0);
 
 	if (bo->rq && bo->exec == NULL && !__kgem_busy(kgem, bo->handle))
 		__kgem_bo_clear_busy(bo);
diff --git a/src/sna/kgem.h b/src/sna/kgem.h
index 82f9b52..f2b1c98 100644
--- a/src/sna/kgem.h
+++ b/src/sna/kgem.h
@@ -573,6 +573,8 @@ static inline bool kgem_bo_is_snoop(struct kgem_bo *bo)
 	return bo->snoop;
 }
 
+void kgem_bo_undo(struct kgem *kgem, struct kgem_bo *bo);
+
 bool __kgem_busy(struct kgem *kgem, int handle);
 
 static inline void kgem_bo_mark_busy(struct kgem_bo *bo, int ring)
diff --git a/src/sna/sna_io.c b/src/sna/sna_io.c
index 540f3a6..14c0d8c 100644
--- a/src/sna/sna_io.c
+++ b/src/sna/sna_io.c
@@ -1362,38 +1362,25 @@ bool sna_replace(struct sna *sna,
 {
 	struct kgem_bo *bo = *_bo;
 	struct kgem *kgem = &sna->kgem;
-	bool busy;
 	void *dst;
 
-	busy = __kgem_bo_is_busy(kgem, bo);
 	DBG(("%s(handle=%d, %dx%d, bpp=%d, tiling=%d) busy?=%d\n",
 	     __FUNCTION__, bo->handle,
 	     pixmap->drawable.width,
 	     pixmap->drawable.height,
 	     pixmap->drawable.bitsPerPixel,
-	     bo->tiling, busy));
+	     bo->tiling,
+	     __kgem_bo_is_busy(kgem, bo)));
 
 	assert(!sna_pixmap(pixmap)->pinned);
 
-	if (!busy && upload_inplace__tiled(kgem, bo)) {
-		BoxRec box;
+	kgem_bo_undo(kgem, bo);
 
-		box.x1 = box.y1 = 0;
-		box.x2 = pixmap->drawable.width;
-		box.y2 = pixmap->drawable.height;
+	if (__kgem_bo_is_busy(kgem, bo)) {
+		struct kgem_bo *new_bo;
 
-		if (write_boxes_inplace__tiled(kgem, src,
-					       stride, pixmap->drawable.bitsPerPixel, 0, 0,
-					       bo, 0, 0, &box, 1))
+		if (indirect_replace(sna, pixmap, bo, src, stride))
 			return true;
-	}
-
-	if ((busy || !kgem_bo_can_map(kgem, bo)) &&
-	    indirect_replace(sna, pixmap, bo, src, stride))
-		return true;
-
-	if (busy) {
-		struct kgem_bo *new_bo;
 
 		new_bo = kgem_create_2d(kgem,
 					pixmap->drawable.width,
commit ef0038d358e613381e03c077e06a87fc49108d87
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Mon Apr 1 22:43:48 2013 +0100

    sna: Allow the compiler to inline memcpy for the bitblt routines
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/blt.c b/src/sna/blt.c
index 4735d14..af87667 100644
--- a/src/sna/blt.c
+++ b/src/sna/blt.c
@@ -138,7 +138,7 @@ xmm_save_128(__m128i *dst, __m128i data)
 }
 #endif
 
-void
+fast_memcpy void
 memcpy_blt(const void *src, void *dst, int bpp,
 	   int32_t src_stride, int32_t dst_stride,
 	   int16_t src_x, int16_t src_y,
@@ -213,7 +213,7 @@ memcpy_blt(const void *src, void *dst, int bpp,
 	}
 }
 
-void
+fast_memcpy void
 memcpy_to_tiled_x(const void *src, void *dst, int bpp, int swizzling,
 		  int32_t src_stride, int32_t dst_stride,
 		  int16_t src_x, int16_t src_y,
diff --git a/src/sna/compiler.h b/src/sna/compiler.h
index b5c9ac2..62f51f0 100644
--- a/src/sna/compiler.h
+++ b/src/sna/compiler.h
@@ -63,6 +63,12 @@
 #define avx2 __attribute__((target("avx2,sse4.2,sse2,fpmath=sse")))
 #endif
 
+#if HAS_GCC(4, 5) && defined(__OPTIMIZE__)
+#define fast_memcpy __attribute__((target("inline-all-stringops")))
+#else
+#define fast_memcpy
+#endif
+
 #ifdef HAVE_VALGRIND
 #define VG(x) x
 #else