xf86-video-intel: 6 commits - src/intel_batchbuffer.c src/intel.h src/sna/kgem.h src/sna/sna_accel.c src/sna/sna_dri.c src/sna/sna_io.c

Chris Wilson ickle at kemper.freedesktop.org
Fri Jan 11 13:52:34 PST 2013


 src/intel.h             |    2 +-
 src/intel_batchbuffer.c |   41 +++++++++++++++++++++++++++++++----------
 src/sna/kgem.h          |   12 ++++++------
 src/sna/sna_accel.c     |    5 ++++-
 src/sna/sna_dri.c       |    8 ++++----
 src/sna/sna_io.c        |    9 +++++++++
 6 files changed, 55 insertions(+), 22 deletions(-)

New commits:
commit ec77a07b41f1062b941774f3782b51d21e7824dd
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Jan 11 11:40:57 2013 +0000

    sna/dri: Prefer to preserve the ring of the destination bo
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_dri.c b/src/sna/sna_dri.c
index f4058cc..35457ac 100644
--- a/src/sna/sna_dri.c
+++ b/src/sna/sna_dri.c
@@ -526,16 +526,16 @@ static void sna_dri_select_mode(struct sna *sna, struct kgem_bo *dst, struct kge
 	}
 
 	VG_CLEAR(busy);
-	busy.handle = src->handle;
+	busy.handle = dst->handle;
 	if (drmIoctl(sna->kgem.fd, DRM_IOCTL_I915_GEM_BUSY, &busy))
 		return;
 
-	DBG(("%s: src busy?=%x\n", __FUNCTION__, busy.busy));
+	DBG(("%s: dst busy?=%x\n", __FUNCTION__, busy.busy));
 	if (busy.busy == 0) {
-		busy.handle = dst->handle;
+		busy.handle = src->handle;
 		if (drmIoctl(sna->kgem.fd, DRM_IOCTL_I915_GEM_BUSY, &busy))
 			return;
-		DBG(("%s: dst busy?=%x\n", __FUNCTION__, busy.busy));
+		DBG(("%s: src busy?=%x\n", __FUNCTION__, busy.busy));
 		if (busy.busy == 0) {
 			DBG(("%s: src/dst is idle, using defaults\n", __FUNCTION__));
 			return;
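
For reference, a minimal sketch of the busy query this hunk reorders:
the destination is now probed first, so when both buffers are active the
destination's ring is preserved. The ring decode from the upper bits of
busy.busy is an assumption about the kernel ABI of the era, not
something this diff shows.

	#include <stdint.h>
	#include <string.h>
	#include <xf86drm.h>
	#include <i915_drm.h>

	/* Returns 0 if idle, a ring mask if busy, -1 on ioctl error. */
	static int bo_busy_ring(int fd, uint32_t handle)
	{
		struct drm_i915_gem_busy busy;

		memset(&busy, 0, sizeof(busy));
		busy.handle = handle;
		if (drmIoctl(fd, DRM_IOCTL_I915_GEM_BUSY, &busy))
			return -1;
		if (busy.busy == 0)
			return 0;
		/* Assumed: ring flags live in the high 16 bits. */
		return busy.busy >> 16;
	}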
commit 42f1026e11527cb62b4522b44e71a4e72582a876
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Jan 11 11:40:16 2013 +0000

    sna: Reorder struct kgem_bo to move related data into the same cacheline
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/kgem.h b/src/sna/kgem.h
index ac63488..92fbaec 100644
--- a/src/sna/kgem.h
+++ b/src/sna/kgem.h
@@ -43,6 +43,12 @@
 #endif
 
 struct kgem_bo {
+	struct kgem_request *rq;
+#define RQ(rq) ((struct kgem_request *)((uintptr_t)(rq) & ~3))
+#define RQ_RING(rq) ((uintptr_t)(rq) & 3)
+#define RQ_IS_BLT(rq) (RQ_RING(rq) == KGEM_BLT)
+	struct drm_i915_gem_exec_object2 *exec;
+
 	struct kgem_bo *proxy;
 
 	struct list list;
@@ -52,12 +58,6 @@ struct kgem_bo {
 	void *map;
 #define IS_CPU_MAP(ptr) ((uintptr_t)(ptr) & 1)
 #define IS_GTT_MAP(ptr) (ptr && ((uintptr_t)(ptr) & 1) == 0)
-	struct kgem_request *rq;
-#define RQ(rq) ((struct kgem_request *)((uintptr_t)(rq) & ~3))
-#define RQ_RING(rq) ((uintptr_t)(rq) & 3)
-#define RQ_IS_BLT(rq) (RQ_RING(rq) == KGEM_BLT)
-
-	struct drm_i915_gem_exec_object2 *exec;
 
 	struct kgem_bo_binding {
 		struct kgem_bo_binding *next;
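
The RQ*() macros rely on pointer tagging: a struct kgem_request is at
least 4-byte aligned, so the two low bits of bo->rq are free to record
the ring the request was submitted on. A small illustration, with
make_request() as a hypothetical helper for this sketch (kgem.c keeps
its own equivalent):

	#include <stdint.h>

	struct kgem_request;

	/* Tag an aligned request pointer with its ring (0-3). */
	static inline struct kgem_request *
	make_request(struct kgem_request *rq, unsigned ring)
	{
		return (struct kgem_request *)((uintptr_t)rq | (ring & 3));
	}

	/* RQ() strips the tag, RQ_RING() extracts it, and
	 * RQ_IS_BLT() tests for the blitter ring. */

The point of the move is presumably that rq and exec, which are
consulted on every batch submission, now sit together at the start of
the structure and thus in the same cacheline.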
commit aead71051ed757e7565d395c858bf8ab8f0b0ff6
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Jan 11 01:30:43 2013 +0000

    sna: Disable memcpy_to_tiled_x() uploads on 32-bit systems
    
    It's far too slow: the register-starved instruction set produces
    atrocious code, and there is extra overhead in the kernel for
    managing memory mappings.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_io.c b/src/sna/sna_io.c
index ec254fc..8b4c25e 100644
--- a/src/sna/sna_io.c
+++ b/src/sna/sna_io.c
@@ -483,6 +483,15 @@ fallback:
 
 static bool upload_inplace__tiled(struct kgem *kgem, struct kgem_bo *bo)
 {
+#ifndef __x86_64__
+	/* Between a register-starved compiler emitting atrocious code
+	 * and the extra overhead in the kernel for managing the tight
+	 * 32-bit address space, unless we have a 64-bit system,
+	 * using memcpy_to_tiled_x() is extremely slow.
+	 */
+	return false;
+#endif
+
 	if (kgem->gen < 050) /* bit17 swizzling :( */
 		return false;
 
commit 220970b1a484e283e2bbb44f79df613ce1ee1146
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Jan 10 19:43:05 2013 +0000

    sna: Also prefer to use the GPU for uploads into a tiled bo
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index 2725143..b828cbf 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -1408,6 +1408,9 @@ static inline bool use_cpu_bo_for_upload(struct sna *sna,
 	if (flags & (MOVE_WRITE | MOVE_ASYNC_HINT))
 		return true;
 
+	if (priv->gpu_bo->tiling)
+		return true;
+
 	return kgem_bo_is_busy(priv->gpu_bo) || kgem_bo_is_busy(priv->cpu_bo);
 }
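
After this hunk the predicate reads as below (earlier early-outs above
the flags test are elided; kgem_bo_is_busy() is taken to mean the bo
still has an outstanding request). The likely rationale: writing into a
tiled bo through a CPU mapping needs a fence and (de)tiling, so a GPU
blit from the CPU bo is preferred whenever the target is tiled, not
only when one of the buffers is busy.

	static inline bool use_cpu_bo_for_upload(struct sna *sna,
						 struct sna_pixmap *priv,
						 unsigned flags)
	{
		/* ... earlier early-outs elided ... */

		if (flags & (MOVE_WRITE | MOVE_ASYNC_HINT))
			return true;

		/* New: uploads into a tiled bo go via the GPU. */
		if (priv->gpu_bo->tiling)
			return true;

		return kgem_bo_is_busy(priv->gpu_bo) ||
		       kgem_bo_is_busy(priv->cpu_bo);
	}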
 
commit 672e59851c427c63f43cde7dfd1688a72100e3b3
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Jan 10 19:35:29 2013 +0000

    sna: Prefer userptr if copying to a tiled bo
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index ca1fd27..2725143 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -4615,7 +4615,7 @@ sna_copy_boxes(DrawablePtr src, DrawablePtr dst, GCPtr gc,
 
 		if (src_priv == NULL &&
 		    sna->kgem.has_userptr &&
-		    __kgem_bo_is_busy(&sna->kgem, bo) &&
+		    ((bo->tiling && !bo->scanout) || __kgem_bo_is_busy(&sna->kgem, bo)) &&
 		    box_inplace(src_pixmap, &region->extents)) {
 			struct kgem_bo *src_bo;
 			bool ok = false;
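
The userptr path that follows in this function wraps the source
pixmap's system-memory pages as a GPU bo so the blitter can read them
in place, avoiding a staging copy. A rough sketch, assuming
kgem_create_map() takes (kgem, pointer, size, read_only) and returns
NULL when userptr is unavailable; pixmap_size() stands in for the
pixmap's byte size:

	struct kgem_bo *src_bo;

	/* Wrap the shadow pixmap's pages for GPU reads (userptr). */
	src_bo = kgem_create_map(&sna->kgem,
				 src_pixmap->devPrivate.ptr,
				 pixmap_size(src_pixmap),
				 true /* read-only */);
	if (src_bo != NULL) {
		/* ... blit from src_bo into the tiled destination,
		 * setting ok on success ... */
		kgem_bo_destroy(&sna->kgem, src_bo);
	}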
commit 441ef916ae6569c88b3d6abaf7fea4d69be49d76
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Thu Jan 10 19:14:21 2013 +0000

    intel: Throttle harder
    
    Filling the rings makes for a very unpleasant user experience, so cap
    the number of batches we allow to be in flight at any one time.
    
    Interestingly, as also found with SNA, throttling can improve
    performance by reducing RSS. More typically, though, throughput is
    improved (at the expense of latency) by oversubscribing work to the
    GPU, so a 10-20% slowdown is commonplace for cairo-traces. Notably,
    x11perf is less affected, and application-level benchmarks in
    particular show no change.
    
    Note that this exposes another bug in libdrm-intel 2.4.40 on gen2/3.
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/intel.h b/src/intel.h
index 53ce33c..d4c9aff 100644
--- a/src/intel.h
+++ b/src/intel.h
@@ -182,7 +182,7 @@ typedef struct intel_screen_private {
 	unsigned int batch_emit_start;
 	/** Number of bytes to be emitted in the current BEGIN_BATCH. */
 	uint32_t batch_emitting;
-	dri_bo *batch_bo;
+	dri_bo *batch_bo, *last_batch_bo[2];
 	/** Whether we're in a section of code that can't tolerate flushing */
 	Bool in_batch_atomic;
 	/** Ending batch_used that was verified by intel_start_batch_atomic() */
diff --git a/src/intel_batchbuffer.c b/src/intel_batchbuffer.c
index 334deb7..4e74a0f 100644
--- a/src/intel_batchbuffer.c
+++ b/src/intel_batchbuffer.c
@@ -67,17 +67,26 @@ void intel_next_vertex(intel_screen_private *intel)
 		dri_bo_alloc(intel->bufmgr, "vertex", sizeof (intel->vertex_ptr), 4096);
 }
 
-static void intel_next_batch(ScrnInfoPtr scrn)
+static dri_bo *bo_alloc(ScrnInfoPtr scrn)
 {
 	intel_screen_private *intel = intel_get_screen_private(scrn);
-
+	int size = 4 * 4096;
 	/* The 865 has issues with larger-than-page-sized batch buffers. */
 	if (IS_I865G(intel))
-		intel->batch_bo =
-		    dri_bo_alloc(intel->bufmgr, "batch", 4096, 4096);
-	else
-		intel->batch_bo =
-		    dri_bo_alloc(intel->bufmgr, "batch", 4096 * 4, 4096);
+		size = 4096;
+	return dri_bo_alloc(intel->bufmgr, "batch", size, 4096);
+}
+
+static void intel_next_batch(ScrnInfoPtr scrn, int mode)
+{
+	intel_screen_private *intel = intel_get_screen_private(scrn);
+	dri_bo *tmp;
+
+	drm_intel_gem_bo_clear_relocs(intel->batch_bo, 0);
+
+	tmp = intel->last_batch_bo[mode];
+	intel->last_batch_bo[mode] = intel->batch_bo;
+	intel->batch_bo = tmp;
 
 	intel->batch_used = 0;
 
@@ -95,12 +104,25 @@ void intel_batch_init(ScrnInfoPtr scrn)
 	intel->batch_emitting = 0;
 	intel->vertex_id = 0;
 
-	intel_next_batch(scrn);
+	intel->last_batch_bo[0] = bo_alloc(scrn);
+	intel->last_batch_bo[1] = bo_alloc(scrn);
+
+	intel->batch_bo = bo_alloc(scrn);
+	intel->batch_used = 0;
+	intel->last_3d = LAST_3D_OTHER;
 }
 
 void intel_batch_teardown(ScrnInfoPtr scrn)
 {
 	intel_screen_private *intel = intel_get_screen_private(scrn);
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(intel->last_batch_bo); i++) {
+		if (intel->last_batch_bo[i] != NULL) {
+			dri_bo_unreference(intel->last_batch_bo[i]);
+			intel->last_batch_bo[i] = NULL;
+		}
+	}
 
 	if (intel->batch_bo != NULL) {
 		dri_bo_unreference(intel->batch_bo);
@@ -273,8 +295,7 @@ void intel_batch_submit(ScrnInfoPtr scrn)
 	if (intel->debug_flush & DEBUG_FLUSH_WAIT)
 		drm_intel_bo_wait_rendering(intel->batch_bo);
 
-	dri_bo_unreference(intel->batch_bo);
-	intel_next_batch(scrn);
+	intel_next_batch(scrn, intel->current_batch == I915_EXEC_BLT);
 
 	if (intel->batch_commit_notify)
 		intel->batch_commit_notify(intel);
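
The throttle falls out of the recycling: batch buffers now rotate
through a fixed pool of two per ring instead of being freshly
allocated, so when the recycled bo is still being executed, the next
CPU write into it (e.g. via dri_bo_subdata) blocks in libdrm until the
kernel retires it, capping the number of batches in flight. A
simplified sketch of the rotation; the struct is hypothetical, the
libdrm calls are real:

	#define BATCH_POOL 2	/* matches last_batch_bo[2] above */

	struct batch_ring {
		dri_bo *bo[BATCH_POOL];
		int next;
	};

	static dri_bo *next_batch(struct batch_ring *ring)
	{
		dri_bo *bo = ring->bo[ring->next];

		ring->next = (ring->next + 1) % BATCH_POOL;

		/* Relocation trees hold references; drop them
		 * before the buffer is reused. */
		drm_intel_gem_bo_clear_relocs(bo, 0);

		/* The first write into bo stalls while the GPU is
		 * still executing it: that stall is the throttle. */
		return bo;
	}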

