EXA patches

Thu Aug 6 01:44:56 PDT 2009

On Wed, 2009-08-05 at 14:21 -0400, Alex Deucher wrote: 
> 2009/8/5 Michel Dänzer <michel at daenzer.net>:
> > On Wed, 2009-08-05 at 01:39 -0400, Alex Deucher wrote:
> >> 2009/8/4 Michel Dänzer <michel at daenzer.net>:
> >> > On Tue, 2009-08-04 at 11:54 +1000, Dave Airlie wrote:
> >> >> On Tue, 2009-08-04 at 09:49 +0800, Joel Feiner wrote:
> >> >> > 2009/8/4 Michel Dänzer <michel at daenzer.net>
> >> >> >         <snip>
> >> >> >
> >> >> >         I wonder if maybe the slowdown I'm seeing is because the
> >> >> >         radeon driver
> >> >> >         is temporarily lacking UploadToScreen and DownloadFromScreen
> >> >> >         hooks with
> >> >> >         KMS, though I'm not sure what those would be used for with
> >> >> >         text
> >> >> >         rendering.
> >> >> >
> >> >> > Off-topic, but if I may enquire out of intellectual curiosity: why
> >> >> > doesn't the KMS version of Radeon have UTS and DFS acceleration?
> >> >>
> >> >> The code is in the branch, but it's really ugly. I was hoping to actually
> >> >> do it more cleanly, which might involve a new kernel interface to ask for the
> >> >> current placement of a buffer object so we can decide whether to just
> >> >> memcpy or we need to blit for DFS. I'm not sure I saw a good reason for
> >> >> UTS; doing Host data blits isn't really useful with BOs, at least with
> >> >> the current code.
> >> >
> >> > One potential benefit of UTS is pipelining. But I agree that DFS is
> >> > probably more important.
> >>
> >> FWIW... untested.
> >> http://www.botchco.com/alex/xorg/radeon_kms_uts.diff
> >
> > You want to avoid any explicit flushes or even waits for destination BO
> > idle, those would defeat any potential pipelining benefits. Otherwise
> > looks like it could work though. :)
> 
> Updated patch at the same link.

The patch below works, but it's a slight loss for (Shm)PutImage performance
here. I guess it might only be a win for that case if we can make the GPU use
the data directly, if at all.

However, one benefit of this is that with Maarten's pending changes to
core EXA, we no longer need to allocate BOs for pixmaps of glyphs
handled by the glyph cache, thereby wasting less BO memory.

diff --git a/src/radeon_exa_funcs.c b/src/radeon_exa_funcs.c
index 7fc6463..8542b45 100644
--- a/src/radeon_exa_funcs.c
+++ b/src/radeon_exa_funcs.c
@@ -451,7 +451,67 @@ RADEONBlitChunk(ScrnInfoPtr pScrn, struct radeon_bo *src_bo,
     FINISH_ACCEL();
 }
 
-#if defined(ACCEL_CP) && defined(XF86DRM_MODE)
+#if defined(XF86DRM_MODE)
+static Bool
+RADEONUploadToScreenCS(PixmapPtr pDst, int x, int y, int w, int h,
+		       char *src, int src_pitch)
+{ /* Stages w x h source rows in a temporary GTT BO, then passes it to RADEONBlitChunk with pDst's BO as target; returns TRUE on success, FALSE on any failure. */
+    RINFO_FROM_SCREEN(pDst->drawable.pScreen);
+    struct radeon_exa_pixmap_priv *driver_priv;
+    struct radeon_bo *scratch;
+    unsigned size;
+    uint32_t datatype = 0;
+    uint32_t dst_pitch_offset;
+    unsigned bpp = pDst->drawable.bitsPerPixel;
+    uint32_t scratch_pitch = (w * bpp / 8 + 63) & ~63; /* bytes per row, rounded up to a multiple of 64 */
+    Bool r;
+    int i;
+
+    if (bpp < 8) /* depths below 8 bits per pixel are rejected */
+	return FALSE;
+
+    driver_priv = exaGetPixmapDriverPrivate(pDst);
+
+    size = scratch_pitch * h; /* total byte size of the staging buffer */
+    scratch = radeon_bo_open(info->bufmgr, 0, size, 0, RADEON_GEM_DOMAIN_GTT, 0);
+    if (scratch == NULL) {
+	return FALSE;
+    }
+    radeon_cs_space_reset_bos(info->cs);
+    radeon_add_pixmap(info->cs, pDst, 0, RADEON_GEM_DOMAIN_VRAM);
+    radeon_cs_space_add_persistent_bo(info->cs, scratch, RADEON_GEM_DOMAIN_GTT, 0);
+    r = radeon_cs_space_check(info->cs); /* a non-zero result is treated as failure below */
+    if (r) {
+        r = FALSE;
+        goto out;
+    }
+
+    r = radeon_bo_map(scratch, 0); /* a non-zero result is treated as failure below */
+    if (r) {
+        r = FALSE;
+        goto out;
+    }
+    r = TRUE; /* no further failure checks are made past this point */
+    size = w * bpp / 8; /* 'size' is reused: bytes actually copied per row (no padding) */
+    for (i = 0; i < h; i++) {
+        memcpy(scratch->ptr + i * scratch_pitch, src, size); /* destination rows are scratch_pitch bytes apart */
+        src += src_pitch; /* source rows are src_pitch bytes apart */
+    }
+    radeon_bo_unmap(scratch);
+
+    RADEONGetDatatypeBpp(pDst->drawable.bitsPerPixel, &datatype);
+    RADEONGetPixmapOffsetPitch(pDst, &dst_pitch_offset);
+    ACCEL_PREAMBLE();
+    RADEON_SWITCH_TO_2D();
+    RADEONBlitChunk(pScrn, scratch, driver_priv->bo, datatype, scratch_pitch << 16, /* NOTE(review): presumably encodes (pitch / 64) << 22 with a zero offset - confirm */
+                    dst_pitch_offset, 0, 0, x, y, w, h,
+                    RADEON_GEM_DOMAIN_GTT, RADEON_GEM_DOMAIN_VRAM);
+
+out: /* reached by fall-through on success and by goto on failure */
+    radeon_bo_unref(scratch); /* drop this function's reference to the staging BO */
+    return r;
+}
+
 static Bool
 RADEONDownloadFromScreenCS(PixmapPtr pSrc, int x, int y, int w,
                            int h, char *dst, int dst_pitch)
@@ -464,7 +524,7 @@ RADEONDownloadFromScreenCS(PixmapPtr pSrc, int x, int y, int w,
     uint32_t src_pitch_offset;
     unsigned bpp = pSrc->drawable.bitsPerPixel;
     uint32_t scratch_pitch = (w * bpp / 8 + 63) & ~63;
-    int r;
+    Bool r;
 
     driver_priv = exaGetPixmapDriverPrivate(pSrc);
     /* if we have more refs than just the BO then flush */
@@ -656,6 +716,7 @@ Bool FUNC_NAME(RADEONDrawInit)(ScreenPtr pScreen)
     }
 # if defined(XF86DRM_MODE)
     else {
+	info->accel_state->exa->UploadToScreen = &RADEONUploadToScreenCS;
         info->accel_state->exa->DownloadFromScreen = &RADEONDownloadFromScreenCS;
     }
 # endif


-- 
Earthling Michel Dänzer           |                http://www.vmware.com
Libre software enthusiast         |          Debian, X and DRI developer