xf86-video-intel: 2 commits - src/sna/blt.c src/sna/kgem.c src/sna/kgem.h src/sna/sna_display.c src/sna/sna.h src/sna/sna_io.c

Chris Wilson ickle at kemper.freedesktop.org
Fri Sep 21 04:43:06 PDT 2012


 src/sna/blt.c         |  117 ++++++++++++++++++++++++++++++++++++++++++++++++++
 src/sna/kgem.c        |   63 ++++++++++++++++++++++++++
 src/sna/kgem.h        |   10 +++-
 src/sna/sna.h         |    6 ++
 src/sna/sna_display.c |    5 +-
 src/sna/sna_io.c      |   95 ++++++++++++++++++++++++++++++++++++++--
 6 files changed, 288 insertions(+), 8 deletions(-)

New commits:
commit b8967aff382c1b6bef2335dea51c979a3f0800c7
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Fri Sep 21 12:16:27 2012 +0100

    sna: Do not query for the NULL edid property
    
    If the EDID blob id is set to 0, that means it does not exist and
    so we can safely skip it.
    
    References: https://bugs.freedesktop.org/show_bug.cgi?id=55193
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/sna_display.c b/src/sna/sna_display.c
index a0129e4..d7b131f 100644
--- a/src/sna/sna_display.c
+++ b/src/sna/sna_display.c
@@ -1630,9 +1630,12 @@ sna_output_attach_edid(xf86OutputPtr output)
 		if (strcmp(prop.name, "EDID"))
 			continue;
 
+		if (koutput->prop_values[i] == 0)
+			continue;
+
 		VG_CLEAR(blob);
 		blob.length = 0;
-		blob.data =0;
+		blob.data = 0;
 		blob.blob_id = koutput->prop_values[i];
 
 		if (drmIoctl(sna->kgem.fd, DRM_IOCTL_MODE_GETPROPBLOB, &blob))
commit 0be1d964713ca407f029278a8256d02d925dc9da
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date:   Tue Sep 11 21:48:24 2012 +0100

    sna: Use inplace X tiling for LLC uploads
    
    Based on a suggestion by Chad Versace (taken from a patch for mesa).
    
    This allows for a faster upload of pixel data through a ShmImage, or for
    complete replacement of a GPU bo.
    
    Using a modified version of x11perf to upload to a pixmap rather than
    scanout on an IVB i7-3720qm:
    
    Before:
    40000000 trep @   0.0007 msec (1410000.0/sec): ShmPutImage 10x10 square
     4000000 trep @   0.0110 msec (  90700.0/sec): ShmPutImage 100x100 square
      160000 trep @   0.1689 msec (   5920.0/sec): ShmPutImage 500x500 square
    
    After:
    40000000 trep @   0.0007 msec (1450000.0/sec): ShmPutImage 10x10 square
     6000000 trep @   0.0061 msec ( 164000.0/sec): ShmPutImage 100x100 square
      400000 trep @   0.1126 msec (   8880.0/sec): ShmPutImage 500x500 square
    
    However, the real takeaway from this is that the overheads for
    ShmPutImage are substantial, only hitting around 70% of the expected
    efficiency, and it is overshadowed by PutImage, which for reference is
    
    60000000 trep @   0.0006 msec (1800000.0/sec): PutImage 10x10 square
    
    Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

diff --git a/src/sna/blt.c b/src/sna/blt.c
index 853eb20..4735d14 100644
--- a/src/sna/blt.c
+++ b/src/sna/blt.c
@@ -214,6 +214,123 @@ memcpy_blt(const void *src, void *dst, int bpp,
 }
 
 void
+memcpy_to_tiled_x(const void *src, void *dst, int bpp, int swizzling,
+		  int32_t src_stride, int32_t dst_stride,
+		  int16_t src_x, int16_t src_y,
+		  int16_t dst_x, int16_t dst_y,
+		  uint16_t width, uint16_t height)
+{
+	const unsigned tile_width = 512;
+	const unsigned tile_height = 8;
+	const unsigned tile_size = 4096;
+
+	const unsigned cpp = bpp / 8;
+	const unsigned stride_tiles = dst_stride / tile_width;
+	const unsigned swizzle_pixels = (swizzling ? 64 : tile_width) / cpp;
+	const unsigned tile_pixels = ffs(tile_width / cpp) - 1;
+	const unsigned tile_mask = (1 << tile_pixels) - 1;
+
+	unsigned x, y;
+
+	DBG(("%s(bpp=%d, swizzling=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
+	     __FUNCTION__, bpp, swizzling, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
+
+	src = (const uint8_t *)src + src_y * src_stride + src_x * cpp;
+
+	for (y = 0; y < height; ++y) {
+		const uint32_t dy = y + dst_y;
+		const uint32_t tile_row =
+			(dy / tile_height * stride_tiles * tile_size +
+			 (dy & (tile_height-1)) * tile_width);
+		const uint8_t *src_row = (const uint8_t *)src + src_stride * y;
+		uint32_t dx = dst_x, offset;
+
+		x = width * cpp;
+		if (dx & (swizzle_pixels - 1)) {
+			const uint32_t swizzle_bound_pixels = ALIGN(dx + 1, swizzle_pixels);
+			const uint32_t length = min(dst_x + width, swizzle_bound_pixels) - dx;
+			offset = tile_row +
+				(dx >> tile_pixels) * tile_size +
+				(dx & tile_mask) * cpp;
+			switch (swizzling) {
+			case I915_BIT_6_SWIZZLE_NONE:
+				break;
+			case I915_BIT_6_SWIZZLE_9:
+				offset ^= (offset >> 3) & 64;
+				break;
+			case I915_BIT_6_SWIZZLE_9_10:
+				offset ^= ((offset ^ (offset >> 1)) >> 3) & 64;
+				break;
+			case I915_BIT_6_SWIZZLE_9_11:
+				offset ^= ((offset ^ (offset >> 2)) >> 3) & 64;
+				break;
+			}
+
+			memcpy((char *)dst + offset, src_row, length * cpp);
+
+			src_row += length * cpp;
+			x -= length * cpp;
+			dx += length;
+		}
+		if (swizzling) {
+			while (x >= 64) {
+				offset = tile_row +
+					(dx >> tile_pixels) * tile_size +
+					(dx & tile_mask) * cpp;
+				switch (swizzling) {
+				case I915_BIT_6_SWIZZLE_9:
+					offset ^= (offset >> 3) & 64;
+					break;
+				case I915_BIT_6_SWIZZLE_9_10:
+					offset ^= ((offset ^ (offset >> 1)) >> 3) & 64;
+					break;
+				case I915_BIT_6_SWIZZLE_9_11:
+					offset ^= ((offset ^ (offset >> 2)) >> 3) & 64;
+					break;
+				}
+
+				memcpy((char *)dst + offset, src_row, 64);
+
+				src_row += 64;
+				x -= 64;
+				dx += swizzle_pixels;
+			}
+		} else {
+			while (x >= 512) {
+				assert((dx & tile_mask) == 0);
+				offset = tile_row + (dx >> tile_pixels) * tile_size;
+
+				memcpy((char *)dst + offset, src_row, 512);
+
+				src_row += 512;
+				x -= 512;
+				dx += swizzle_pixels;
+			}
+		}
+		if (x) {
+			offset = tile_row +
+				(dx >> tile_pixels) * tile_size +
+				(dx & tile_mask) * cpp;
+			switch (swizzling) {
+			case I915_BIT_6_SWIZZLE_NONE:
+				break;
+			case I915_BIT_6_SWIZZLE_9:
+				offset ^= (offset >> 3) & 64;
+				break;
+			case I915_BIT_6_SWIZZLE_9_10:
+				offset ^= ((offset ^ (offset >> 1)) >> 3) & 64;
+				break;
+			case I915_BIT_6_SWIZZLE_9_11:
+				offset ^= ((offset ^ (offset >> 2)) >> 3) & 64;
+				break;
+			}
+
+			memcpy((char *)dst + offset, src_row, x);
+		}
+	}
+}
+
+void
 memmove_box(const void *src, void *dst,
 	    int bpp, int32_t stride,
 	    const BoxRec *box,
diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index fc7c881..0ea14f0 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -4082,6 +4082,56 @@ retry:
 	return (void *)(uintptr_t)mmap_arg.addr_ptr;
 }
 
+void *__kgem_bo_map__cpu(struct kgem *kgem, struct kgem_bo *bo)
+{
+	struct drm_i915_gem_mmap mmap_arg;
+
+	DBG(("%s(handle=%d, size=%d, mapped? %d)\n",
+	     __FUNCTION__, bo->handle, bytes(bo), (int)__MAP_TYPE(bo->map)));
+        assert(bo->refcnt);
+	assert(!bo->purged);
+	assert(list_is_empty(&bo->list));
+	assert(bo->proxy == NULL);
+
+	if (IS_CPU_MAP(bo->map))
+		return MAP(bo->map);
+
+retry:
+	VG_CLEAR(mmap_arg);
+	mmap_arg.handle = bo->handle;
+	mmap_arg.offset = 0;
+	mmap_arg.size = bytes(bo);
+	if (drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_MMAP, &mmap_arg)) {
+		ErrorF("%s: failed to mmap %d, %d bytes, into CPU domain: %d\n",
+		       __FUNCTION__, bo->handle, bytes(bo), errno);
+		if (__kgem_throttle_retire(kgem, 0))
+			goto retry;
+
+		return NULL;
+	}
+
+	VG(VALGRIND_MAKE_MEM_DEFINED(mmap_arg.addr_ptr, bytes(bo)));
+	if (bo->map == NULL) {
+		DBG(("%s: caching CPU vma for %d\n", __FUNCTION__, bo->handle));
+		bo->map = MAKE_CPU_MAP(mmap_arg.addr_ptr);
+	}
+	return (void *)(uintptr_t)mmap_arg.addr_ptr;
+}
+
+void __kgem_bo_unmap__cpu(struct kgem *kgem, struct kgem_bo *bo, void *ptr)
+{
+	DBG(("%s(handle=%d, size=%d)\n",
+	     __FUNCTION__, bo->handle, bytes(bo)));
+        assert(bo->refcnt);
+
+	if (IS_CPU_MAP(bo->map)) {
+                assert(ptr == MAP(bo->map));
+                return;
+        }
+
+	munmap(ptr, bytes(bo));
+}
+
 uint32_t kgem_bo_flink(struct kgem *kgem, struct kgem_bo *bo)
 {
 	struct drm_gem_flink flink;
@@ -4961,6 +5011,19 @@ void kgem_bo_set_binding(struct kgem_bo *bo, uint32_t format, uint16_t offset)
 	}
 }
 
+int kgem_bo_get_swizzling(struct kgem *kgem, struct kgem_bo *bo)
+{
+	struct drm_i915_gem_get_tiling tiling;
+
+	VG_CLEAR(tiling);
+	tiling.handle = bo->handle;
+	if (drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_GET_TILING, &tiling))
+		return 0;
+
+	assert(bo->tiling == tiling.tiling_mode);
+	return tiling.swizzle_mode;
+}
+
 struct kgem_bo *
 kgem_replace_bo(struct kgem *kgem,
 		struct kgem_bo *src,
diff --git a/src/sna/kgem.h b/src/sna/kgem.h
index 832b3f0..cdbb7cb 100644
--- a/src/sna/kgem.h
+++ b/src/sna/kgem.h
@@ -262,6 +262,7 @@ struct kgem_bo *kgem_create_cpu_2d(struct kgem *kgem,
 
 uint32_t kgem_bo_get_binding(struct kgem_bo *bo, uint32_t format);
 void kgem_bo_set_binding(struct kgem_bo *bo, uint32_t format, uint16_t offset);
+int kgem_bo_get_swizzling(struct kgem *kgem, struct kgem_bo *bo);
 
 void kgem_bo_retire(struct kgem *kgem, struct kgem_bo *bo);
 bool kgem_retire(struct kgem *kgem);
@@ -419,6 +420,8 @@ void kgem_bo_sync__gtt(struct kgem *kgem, struct kgem_bo *bo);
 void *kgem_bo_map__debug(struct kgem *kgem, struct kgem_bo *bo);
 void *kgem_bo_map__cpu(struct kgem *kgem, struct kgem_bo *bo);
 void kgem_bo_sync__cpu(struct kgem *kgem, struct kgem_bo *bo);
+void *__kgem_bo_map__cpu(struct kgem *kgem, struct kgem_bo *bo);
+void __kgem_bo_unmap__cpu(struct kgem *kgem, struct kgem_bo *bo, void *ptr);
 uint32_t kgem_bo_flink(struct kgem *kgem, struct kgem_bo *bo);
 
 bool kgem_bo_write(struct kgem *kgem, struct kgem_bo *bo,
@@ -494,7 +497,7 @@ static inline bool kgem_bo_is_mappable(struct kgem *kgem,
 	return bo->presumed_offset + kgem_bo_size(bo) <= kgem->aperture_mappable;
 }
 
-static inline bool kgem_bo_mapped(struct kgem_bo *bo)
+static inline bool kgem_bo_mapped(struct kgem *kgem, struct kgem_bo *bo)
 {
 	DBG(("%s: map=%p, tiling=%d, domain=%d\n",
 	     __FUNCTION__, bo->map, bo->tiling, bo->domain));
@@ -502,12 +505,15 @@ static inline bool kgem_bo_mapped(struct kgem_bo *bo)
 	if (bo->map == NULL)
 		return bo->tiling == I915_TILING_NONE && bo->domain == DOMAIN_CPU;
 
+	if (bo->tiling == I915_TILING_X && !bo->scanout && kgem->has_llc)
+		return IS_CPU_MAP(bo->map);
+
 	return IS_CPU_MAP(bo->map) == !bo->tiling;
 }
 
 static inline bool kgem_bo_can_map(struct kgem *kgem, struct kgem_bo *bo)
 {
-	if (kgem_bo_mapped(bo))
+	if (kgem_bo_mapped(kgem, bo))
 		return true;
 
 	if (!bo->tiling && kgem->has_llc)
diff --git a/src/sna/sna.h b/src/sna/sna.h
index 382c0a5..28dff6d 100644
--- a/src/sna/sna.h
+++ b/src/sna/sna.h
@@ -764,6 +764,12 @@ memcpy_blt(const void *src, void *dst, int bpp,
 	   int16_t dst_x, int16_t dst_y,
 	   uint16_t width, uint16_t height);
 void
+memcpy_to_tiled_x(const void *src, void *dst, int bpp, int swizzling,
+		  int32_t src_stride, int32_t dst_stride,
+		  int16_t src_x, int16_t src_y,
+		  int16_t dst_x, int16_t dst_y,
+		  uint16_t width, uint16_t height);
+void
 memmove_box(const void *src, void *dst,
 	    int bpp, int32_t stride,
 	    const BoxRec *box,
diff --git a/src/sna/sna_io.c b/src/sna/sna_io.c
index a466f55..cdaadc0 100644
--- a/src/sna/sna_io.c
+++ b/src/sna/sna_io.c
@@ -482,6 +482,49 @@ fallback:
 	sna->blt_state.fill_bo = 0;
 }
 
+static bool upload_inplace__tiled(struct kgem *kgem, struct kgem_bo *bo)
+{
+	if (kgem->gen < 50) /* bit17 swizzling :( */
+		return false;
+
+	if (bo->tiling != I915_TILING_X)
+		return false;
+
+	if (bo->scanout)
+		return false;
+
+	return bo->domain == DOMAIN_CPU || kgem->has_llc;
+}
+
+static bool
+write_boxes_inplace__tiled(struct kgem *kgem,
+                           const uint8_t *src, int stride, int bpp, int16_t src_dx, int16_t src_dy,
+                           struct kgem_bo *bo, int16_t dst_dx, int16_t dst_dy,
+                           const BoxRec *box, int n)
+{
+	uint8_t *dst;
+	int swizzle;
+
+	assert(bo->tiling == I915_TILING_X);
+
+	dst = __kgem_bo_map__cpu(kgem, bo);
+	if (dst == NULL)
+		return false;
+
+	kgem_bo_sync__cpu(kgem, bo);
+	swizzle = kgem_bo_get_swizzling(kgem, bo);
+	do {
+		memcpy_to_tiled_x(src, dst, bpp, swizzle, stride, bo->pitch,
+				  box->x1 + src_dx, box->y1 + src_dy,
+				  box->x1 + dst_dx, box->y1 + dst_dy,
+				  box->x2 - box->x1, box->y2 - box->y1);
+		box++;
+	} while (--n);
+	__kgem_bo_unmap__cpu(kgem, bo, dst);
+
+	return true;
+}
+
 static bool write_boxes_inplace(struct kgem *kgem,
 				const void *src, int stride, int bpp, int16_t src_dx, int16_t src_dy,
 				struct kgem_bo *bo, int16_t dst_dx, int16_t dst_dy,
@@ -492,6 +535,11 @@ static bool write_boxes_inplace(struct kgem *kgem,
 	DBG(("%s x %d, handle=%d, tiling=%d\n",
 	     __FUNCTION__, n, bo->handle, bo->tiling));
 
+	if (upload_inplace__tiled(kgem, bo) &&
+	    write_boxes_inplace__tiled(kgem, src, stride, bpp, src_dx, src_dy,
+				       bo, dst_dx, dst_dy, box, n))
+		return true;
+
 	if (!kgem_bo_can_map(kgem, bo))
 		return false;
 
@@ -539,7 +587,7 @@ static bool upload_inplace(struct kgem *kgem,
 {
 	unsigned int bytes;
 
-	if (!kgem_bo_can_map(kgem, bo))
+	if (!kgem_bo_can_map(kgem, bo) && !upload_inplace__tiled(kgem, bo))
 		return false;
 
 	if (FORCE_INPLACE)
@@ -871,8 +919,6 @@ write_boxes_inplace__xor(struct kgem *kgem,
 			 const BoxRec *box, int n,
 			 uint32_t and, uint32_t or)
 {
-	int dst_pitch = bo->pitch;
-	int src_pitch = stride;
 	void *dst;
 
 	DBG(("%s x %d, tiling=%d\n", __FUNCTION__, n, bo->tiling));
@@ -888,10 +934,22 @@ write_boxes_inplace__xor(struct kgem *kgem,
 		     box->x1 + src_dx, box->y1 + src_dy,
 		     box->x1 + dst_dx, box->y1 + dst_dy,
 		     box->x2 - box->x1, box->y2 - box->y1,
-		     bpp, src_pitch, dst_pitch));
+		     bpp, stride, bo->pitch));
+
+		assert(box->x2 > box->x1);
+		assert(box->y2 > box->y1);
+
+		assert(box->x1 + dst_dx >= 0);
+		assert((box->x2 + dst_dx)*bpp <= 8*bo->pitch);
+		assert(box->y1 + dst_dy >= 0);
+		assert((box->y2 + dst_dy)*bo->pitch <= kgem_bo_size(bo));
+
+		assert(box->x1 + src_dx >= 0);
+		assert((box->x2 + src_dx)*bpp <= 8*stride);
+		assert(box->y1 + src_dy >= 0);
 
 		memcpy_xor(src, dst, bpp,
-			   src_pitch, dst_pitch,
+			   stride, bo->pitch,
 			   box->x1 + src_dx, box->y1 + src_dy,
 			   box->x1 + dst_dx, box->y1 + dst_dy,
 			   box->x2 - box->x1, box->y2 - box->y1,
@@ -1282,6 +1340,19 @@ bool sna_replace(struct sna *sna,
 	     pixmap->drawable.bitsPerPixel,
 	     bo->tiling, busy));
 
+	if (!busy && upload_inplace__tiled(kgem, bo)) {
+		BoxRec box;
+
+		box.x1 = box.y1 = 0;
+		box.x2 = pixmap->drawable.width;
+		box.y2 = pixmap->drawable.height;
+
+		if (write_boxes_inplace__tiled(kgem, src,
+					       stride, pixmap->drawable.bitsPerPixel, 0, 0,
+					       bo, 0, 0, &box, 1))
+			return true;
+	}
+
 	if ((busy || !kgem_bo_can_map(kgem, bo)) &&
 	    indirect_replace(sna, pixmap, bo, src, stride))
 		return true;
@@ -1304,6 +1375,19 @@ bool sna_replace(struct sna *sna,
 				   (pixmap->drawable.height-1)*stride + pixmap->drawable.width*pixmap->drawable.bitsPerPixel/8))
 			goto err;
 	} else {
+		if (upload_inplace__tiled(kgem, bo)) {
+			BoxRec box;
+
+			box.x1 = box.y1 = 0;
+			box.x2 = pixmap->drawable.width;
+			box.y2 = pixmap->drawable.height;
+
+			if (write_boxes_inplace__tiled(kgem, src,
+						       stride, pixmap->drawable.bitsPerPixel, 0, 0,
+						       bo, 0, 0, &box, 1))
+				goto done;
+		}
+
 		if (kgem_bo_is_mappable(kgem, bo)) {
 			dst = kgem_bo_map(kgem, bo);
 			if (!dst)
@@ -1330,6 +1414,7 @@ bool sna_replace(struct sna *sna,
 		}
 	}
 
+done:
 	if (bo != *_bo)
 		kgem_bo_destroy(kgem, *_bo);
 	*_bo = bo;


More information about the xorg-commit mailing list