[PATCH] micro-optimize RADEONCopySwap in radeon_accel.c for powerpc

Jochen Rollwagen joro-2013 at t-online.de
Fri Oct 28 08:28:20 UTC 2016


Hi there,

gcc seems to generate sub-optimal code for the following loop in
radeon_accel.c:

  for (; nwords > 0; --nwords, ++d, ++s)
             *d = ((*s & 0xffff) << 16) | ((*s >> 16) & 0xffff);

The body of the loop compiles to:

     lwz 9,40(31)
     lwz 9,0(9)
     rotlwi 10,9,16
     lwz 9,36(31)
     stw 10,0(9)
     lwz 9,44(31)
     addi 9,9,-1
     stw 9,44(31)
     lwz 9,36(31)
     addi 9,9,4
     stw 9,36(31)
     lwz 9,40(31)
     addi 9,9,4
     stw 9,40(31)

This patch replaces the C expression with (hopefully optimal) inline
assembler code on PowerPC, bringing it in line with the other cases in
the switch:

diff --git a/src/radeon_accel.c b/src/radeon_accel.c
index 1def2a3..580fa33 100644
--- a/src/radeon_accel.c
+++ b/src/radeon_accel.c
@@ -138,7 +138,16 @@ void RADEONCopySwap(uint8_t *dst, uint8_t *src, unsigned int size, int swap)
             unsigned int nwords = size >> 2;

             for (; nwords > 0; --nwords, ++d, ++s)
-               *d = ((*s & 0xffff) << 16) | ((*s >> 16) & 0xffff);
+#ifdef __powerpc__
+                       __asm__ volatile ("rlwinm %0,%1,%2,%3,%4\n\t"
+                                                "rlwimi %0,%1,%5,%6,%7\n\t"
+                                                 : "=&r" (*d)
+                                                 : "r" (*s),"i" (16), "i" (16),"i" (31) ,"i" (16), "i" (0),"i" (15)
+                                                 :);
+
+#else
+                       *d = ((*s & 0xffff) << 16) | ((*s >> 16) & 0xffff);
+#endif
             return;
          }
      case RADEON_HOST_DATA_SWAP_32BIT:



More information about the xorg-driver-ati mailing list