[PATCH] micro-optimize RADEONCopySwap in radeon_accel.c for powerpc
Jochen Rollwagen
joro-2013 at t-online.de
Fri Oct 28 08:28:20 UTC 2016
Hi there,
GCC seems to generate sub-optimal code for the following loop in
radeon_accel.c:
for (; nwords > 0; --nwords, ++d, ++s)
*d = ((*s & 0xffff) << 16) | ((*s >> 16) & 0xffff);
The body of the loop compiles to:
lwz 9,40(31)
lwz 9,0(9)
rotlwi 10,9,16
lwz 9,36(31)
stw 10,0(9)
lwz 9,44(31)
addi 9,9,-1
stw 9,44(31)
lwz 9,36(31)
addi 9,9,4
stw 9,36(31)
lwz 9,40(31)
addi 9,9,4
stw 9,40(31)
This patch adds some (hopefully optimal) assembler code, bringing it in
line with the other cases in the switch:
diff --git a/src/radeon_accel.c b/src/radeon_accel.c
index 1def2a3..580fa33 100644
--- a/src/radeon_accel.c
+++ b/src/radeon_accel.c
@@ -138,7 +138,16 @@ void RADEONCopySwap(uint8_t *dst, uint8_t *src,
unsigned int size, int swap)
unsigned int nwords = size >> 2;
for (; nwords > 0; --nwords, ++d, ++s)
- *d = ((*s & 0xffff) << 16) | ((*s >> 16) & 0xffff);
+#ifdef __powerpc__
+ __asm__ volatile ("rlwinm %0,%1,%2,%3,%4\n\t"
+ "rlwimi %0,%1,%5,%6,%7\n\t"
+ : "=&r" (*d)
+ : "r" (*s),"i" (16), "i" (16),"i" (31) ,"i" (16), "i" (0),"i" (15)
+ :);
+
+#else
+ *d = ((*s & 0xffff) << 16) | ((*s >> 16) & 0xffff);
+#endif
return;
}
case RADEON_HOST_DATA_SWAP_32BIT:
More information about the xorg-driver-ati
mailing list