xf86-video-intel: 4 commits - src/sna/blt.c
Chris Wilson
ickle at kemper.freedesktop.org
Fri Apr 8 09:09:27 UTC 2016
src/sna/blt.c | 334 ++++++++++++++++++++++++++++++++++++++++++----------------
1 file changed, 245 insertions(+), 89 deletions(-)
New commits:
commit ab041b9b91b9bd65861b8a4c30ea8d776041e56d
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date: Fri Apr 8 08:34:20 2016 +0100
sna: Specialise alignment paths for storing
Switch between aligned/unaligned stores for the bulk copy inner loops.
Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
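The dispatch pattern in play, as a minimal standalone sketch (copy_row and its constraints are illustrative, not the driver's code): test the destination pointer once per row, then run an inner loop that uses only aligned or only unaligned stores, rather than deciding per store.

#include <stdint.h>
#include <emmintrin.h>

/* Illustrative sketch: pick the store flavour once per row.
 * Assumes src is 16-byte aligned (as a tile row is) and bytes is a
 * multiple of 16. */
static void copy_row(uint8_t *dst, const uint8_t *src, unsigned bytes)
{
    unsigned i;

    if ((uintptr_t)dst & 15) {
        /* Unaligned destination: movdqu stores throughout. */
        for (i = 0; i < bytes / 16; i++)
            _mm_storeu_si128((__m128i *)dst + i,
                             _mm_load_si128((const __m128i *)src + i));
    } else {
        /* Aligned destination: movdqa stores throughout. */
        for (i = 0; i < bytes / 16; i++)
            _mm_store_si128((__m128i *)dst + i,
                            _mm_load_si128((const __m128i *)src + i));
    }
}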
diff --git a/src/sna/blt.c b/src/sna/blt.c
index c246140..a4738f5 100644
--- a/src/sna/blt.c
+++ b/src/sna/blt.c
@@ -444,7 +444,6 @@ memcpy_to_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp,
width *= cpp;
assert(src_stride >= width);
src_stride -= width;
- src_stride += width & 15;
while (height--) {
unsigned w = width;
@@ -452,6 +451,8 @@ memcpy_to_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp,
tile_row += dst_y / tile_height * dst_stride * tile_height;
tile_row += (dst_y & (tile_height-1)) * tile_width;
+ dst_y++;
+
if (dst_x) {
tile_row += (dst_x >> tile_shift) * tile_size;
if (dst_x & tile_mask) {
@@ -488,33 +489,18 @@ memcpy_to_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp,
tile_row += 16;
src = (const uint8_t *)src + 16;
}
- memcpy(tile_row, src, w & 15);
- src = (const uint8_t *)src + src_stride;
- dst_y++;
+ memcpy(assume_aligned(tile_row, 16), src, w & 15);
+ src = (const uint8_t *)src + src_stride + (w & 15);
}
}
sse2 static force_inline void
-from_sse64(uint8_t *dst, const uint8_t *src)
-{
- __m128i xmm1, xmm2, xmm3, xmm4;
-
- xmm1 = xmm_load_128((const __m128i*)src + 0);
- xmm2 = xmm_load_128((const __m128i*)src + 1);
- xmm3 = xmm_load_128((const __m128i*)src + 2);
- xmm4 = xmm_load_128((const __m128i*)src + 3);
-
- xmm_save_128u((__m128i*)dst + 0, xmm1);
- xmm_save_128u((__m128i*)dst + 1, xmm2);
- xmm_save_128u((__m128i*)dst + 2, xmm3);
- xmm_save_128u((__m128i*)dst + 3, xmm4);
-}
-
-sse2 static force_inline void
-from_sse128xN(uint8_t *dst, const uint8_t *src, int bytes)
+from_sse128xNu(uint8_t *dst, const uint8_t *src, int bytes)
{
int i;
+ assert(((uintptr_t)src & 15) == 0);
+
for (i = 0; i < bytes / 128; i++) {
__m128i xmm0, xmm1, xmm2, xmm3;
__m128i xmm4, xmm5, xmm6, xmm7;
@@ -543,7 +529,79 @@ from_sse128xN(uint8_t *dst, const uint8_t *src, int bytes)
}
sse2 static force_inline void
-from_sse32(uint8_t *dst, const uint8_t *src)
+from_sse128xNa(uint8_t *dst, const uint8_t *src, int bytes)
+{
+ int i;
+
+ assert(((uintptr_t)dst & 15) == 0);
+ assert(((uintptr_t)src & 15) == 0);
+
+ for (i = 0; i < bytes / 128; i++) {
+ __m128i xmm0, xmm1, xmm2, xmm3;
+ __m128i xmm4, xmm5, xmm6, xmm7;
+
+ xmm0 = xmm_load_128((const __m128i*)src + 0);
+ xmm1 = xmm_load_128((const __m128i*)src + 1);
+ xmm2 = xmm_load_128((const __m128i*)src + 2);
+ xmm3 = xmm_load_128((const __m128i*)src + 3);
+ xmm4 = xmm_load_128((const __m128i*)src + 4);
+ xmm5 = xmm_load_128((const __m128i*)src + 5);
+ xmm6 = xmm_load_128((const __m128i*)src + 6);
+ xmm7 = xmm_load_128((const __m128i*)src + 7);
+
+ xmm_save_128((__m128i*)dst + 0, xmm0);
+ xmm_save_128((__m128i*)dst + 1, xmm1);
+ xmm_save_128((__m128i*)dst + 2, xmm2);
+ xmm_save_128((__m128i*)dst + 3, xmm3);
+ xmm_save_128((__m128i*)dst + 4, xmm4);
+ xmm_save_128((__m128i*)dst + 5, xmm5);
+ xmm_save_128((__m128i*)dst + 6, xmm6);
+ xmm_save_128((__m128i*)dst + 7, xmm7);
+
+ dst += 128;
+ src += 128;
+ }
+}
+
+sse2 static force_inline void
+from_sse64u(uint8_t *dst, const uint8_t *src)
+{
+ __m128i xmm1, xmm2, xmm3, xmm4;
+
+ assert(((uintptr_t)src & 15) == 0);
+
+ xmm1 = xmm_load_128((const __m128i*)src + 0);
+ xmm2 = xmm_load_128((const __m128i*)src + 1);
+ xmm3 = xmm_load_128((const __m128i*)src + 2);
+ xmm4 = xmm_load_128((const __m128i*)src + 3);
+
+ xmm_save_128u((__m128i*)dst + 0, xmm1);
+ xmm_save_128u((__m128i*)dst + 1, xmm2);
+ xmm_save_128u((__m128i*)dst + 2, xmm3);
+ xmm_save_128u((__m128i*)dst + 3, xmm4);
+}
+
+sse2 static force_inline void
+from_sse64a(uint8_t *dst, const uint8_t *src)
+{
+ __m128i xmm1, xmm2, xmm3, xmm4;
+
+ assert(((uintptr_t)dst & 15) == 0);
+ assert(((uintptr_t)src & 15) == 0);
+
+ xmm1 = xmm_load_128((const __m128i*)src + 0);
+ xmm2 = xmm_load_128((const __m128i*)src + 1);
+ xmm3 = xmm_load_128((const __m128i*)src + 2);
+ xmm4 = xmm_load_128((const __m128i*)src + 3);
+
+ xmm_save_128((__m128i*)dst + 0, xmm1);
+ xmm_save_128((__m128i*)dst + 1, xmm2);
+ xmm_save_128((__m128i*)dst + 2, xmm3);
+ xmm_save_128((__m128i*)dst + 3, xmm4);
+}
+
+sse2 static force_inline void
+from_sse32u(uint8_t *dst, const uint8_t *src)
{
__m128i xmm1, xmm2;
@@ -555,11 +613,37 @@ from_sse32(uint8_t *dst, const uint8_t *src)
}
sse2 static force_inline void
-from_sse16(uint8_t *dst, const uint8_t *src)
+from_sse32a(uint8_t *dst, const uint8_t *src)
+{
+ __m128i xmm1, xmm2;
+
+ assert(((uintptr_t)dst & 15) == 0);
+ assert(((uintptr_t)src & 15) == 0);
+
+ xmm1 = xmm_load_128((const __m128i*)src + 0);
+ xmm2 = xmm_load_128((const __m128i*)src + 1);
+
+ xmm_save_128((__m128i*)dst + 0, xmm1);
+ xmm_save_128((__m128i*)dst + 1, xmm2);
+}
+
+sse2 static force_inline void
+from_sse16u(uint8_t *dst, const uint8_t *src)
{
+ assert(((uintptr_t)src & 15) == 0);
+
xmm_save_128u((__m128i*)dst, xmm_load_128((const __m128i*)src));
}
+sse2 static force_inline void
+from_sse16a(uint8_t *dst, const uint8_t *src)
+{
+ assert(((uintptr_t)dst & 15) == 0);
+ assert(((uintptr_t)src & 15) == 0);
+
+ xmm_save_128((__m128i*)dst, xmm_load_128((const __m128i*)src));
+}
+
sse2 static fast_memcpy void
memcpy_from_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp,
int32_t src_stride, int32_t dst_stride,
@@ -576,6 +660,8 @@ memcpy_from_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp,
const unsigned tile_shift = ffs(tile_pixels) - 1;
const unsigned tile_mask = tile_pixels - 1;
+ unsigned offset_x;
+
DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
__FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
assert(src != dst);
@@ -584,8 +670,14 @@ memcpy_from_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp,
dst = (uint8_t *)dst + dst_y * dst_stride + dst_x * cpp;
width *= cpp;
assert(dst_stride >= width);
- dst_stride -= width;
- dst_stride += width & 15;
+ if (src_x & tile_mask) {
+ const unsigned x = (src_x & tile_mask) * cpp;
+ dst_stride -= width;
+ offset_x = min(tile_width - x, width);
+ dst_stride += (width - offset_x) & 15;
+ } else
+ dst_stride -= width & ~15;
+ assert(dst_stride >= 0);
while (height--) {
unsigned w = width;
@@ -593,47 +685,73 @@ memcpy_from_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp,
tile_row += src_y / tile_height * src_stride * tile_height;
tile_row += (src_y & (tile_height-1)) * tile_width;
+ src_y++;
+
if (src_x) {
tile_row += (src_x >> tile_shift) * tile_size;
if (src_x & tile_mask) {
- const unsigned x = (src_x & tile_mask) * cpp;
- const unsigned len = min(tile_width - x, w);
- memcpy(dst,
- assume_misaligned(tile_row, tile_width, x),
- len);
-
+ memcpy(dst, tile_row, offset_x);
tile_row += tile_size;
- dst = (uint8_t *)dst + len;
- w -= len;
+ dst = (uint8_t *)dst + offset_x;
+ w -= offset_x;
}
}
- while (w >= tile_width) {
- from_sse128xN(dst,
- assume_aligned(tile_row, tile_width),
- tile_width);
- tile_row += tile_size;
- dst = (uint8_t *)dst + tile_width;
- w -= tile_width;
- }
- while (w >= 64) {
- from_sse64(dst, tile_row);
- tile_row += 64;
- dst = (uint8_t *)dst + 64;
- w -= 64;
- }
- if (w & 32) {
- from_sse32(dst, tile_row);
- tile_row += 32;
- dst = (uint8_t *)dst + 32;
- }
- if (w & 16) {
- from_sse16(dst, tile_row);
- tile_row += 16;
- dst = (uint8_t *)dst + 16;
+ if ((uintptr_t)dst & 15) {
+ while (w >= tile_width) {
+ from_sse128xNu(dst,
+ assume_aligned(tile_row, tile_width),
+ tile_width);
+ tile_row += tile_size;
+ dst = (uint8_t *)dst + tile_width;
+ w -= tile_width;
+ }
+ while (w >= 64) {
+ from_sse64u(dst, tile_row);
+ tile_row += 64;
+ dst = (uint8_t *)dst + 64;
+ w -= 64;
+ }
+ if (w & 32) {
+ from_sse32u(dst, tile_row);
+ tile_row += 32;
+ dst = (uint8_t *)dst + 32;
+ }
+ if (w & 16) {
+ from_sse16u(dst, tile_row);
+ tile_row += 16;
+ dst = (uint8_t *)dst + 16;
+ }
+ memcpy(dst, assume_aligned(tile_row, 16), w & 15);
+ } else {
+ while (w >= tile_width) {
+ from_sse128xNa(assume_aligned(dst, 16),
+ assume_aligned(tile_row, tile_width),
+ tile_width);
+ tile_row += tile_size;
+ dst = (uint8_t *)dst + tile_width;
+ w -= tile_width;
+ }
+ while (w >= 64) {
+ from_sse64a(dst, tile_row);
+ tile_row += 64;
+ dst = (uint8_t *)dst + 64;
+ w -= 64;
+ }
+ if (w & 32) {
+ from_sse32a(dst, tile_row);
+ tile_row += 32;
+ dst = (uint8_t *)dst + 32;
+ }
+ if (w & 16) {
+ from_sse16a(dst, tile_row);
+ tile_row += 16;
+ dst = (uint8_t *)dst + 16;
+ }
+ memcpy(assume_aligned(dst, 16),
+ assume_aligned(tile_row, 16),
+ w & 15);
}
- memcpy(dst, tile_row, w & 15);
dst = (uint8_t *)dst + dst_stride;
- src_y++;
}
}
commit e62010374c0ffd1b0103285a4bf5572ce0359f51
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date: Fri Apr 8 08:24:44 2016 +0100
sna: Unroll the innermost SSE2 loop one more time
Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
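The effect of the unroll, in a hedged standalone sketch (copy_128xN is an illustrative name): each iteration issues eight 16-byte loads before any store, moving 128 bytes per loop trip instead of 64 and halving the loop-control overhead.

#include <stdint.h>
#include <emmintrin.h>

/* Illustrative: 128 bytes per iteration through eight xmm values.
 * The fixed-trip inner loops unroll fully at compile time. Assumes
 * bytes is a multiple of 128 and src is 16-byte aligned. */
static void copy_128xN(uint8_t *dst, const uint8_t *src, int bytes)
{
    int i, j;

    for (i = 0; i < bytes / 128; i++) {
        __m128i r[8];

        for (j = 0; j < 8; j++)
            r[j] = _mm_load_si128((const __m128i *)src + j);
        for (j = 0; j < 8; j++)
            _mm_storeu_si128((__m128i *)dst + j, r[j]);

        dst += 128;
        src += 128;
    }
}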
diff --git a/src/sna/blt.c b/src/sna/blt.c
index 3aff6b5..c246140 100644
--- a/src/sna/blt.c
+++ b/src/sna/blt.c
@@ -354,6 +354,38 @@ memcpy_from_tiled_x__swizzle_0(const void *src, void *dst, int bpp,
#if defined(sse2) && defined(__x86_64__)
sse2 static force_inline void
+to_sse128xN(uint8_t *dst, const uint8_t *src, int bytes)
+{
+ int i;
+
+ for (i = 0; i < bytes / 128; i++) {
+ __m128i xmm0, xmm1, xmm2, xmm3;
+ __m128i xmm4, xmm5, xmm6, xmm7;
+
+ xmm0 = xmm_load_128u((const __m128i*)src + 0);
+ xmm1 = xmm_load_128u((const __m128i*)src + 1);
+ xmm2 = xmm_load_128u((const __m128i*)src + 2);
+ xmm3 = xmm_load_128u((const __m128i*)src + 3);
+ xmm4 = xmm_load_128u((const __m128i*)src + 4);
+ xmm5 = xmm_load_128u((const __m128i*)src + 5);
+ xmm6 = xmm_load_128u((const __m128i*)src + 6);
+ xmm7 = xmm_load_128u((const __m128i*)src + 7);
+
+ xmm_save_128((__m128i*)dst + 0, xmm0);
+ xmm_save_128((__m128i*)dst + 1, xmm1);
+ xmm_save_128((__m128i*)dst + 2, xmm2);
+ xmm_save_128((__m128i*)dst + 3, xmm3);
+ xmm_save_128((__m128i*)dst + 4, xmm4);
+ xmm_save_128((__m128i*)dst + 5, xmm5);
+ xmm_save_128((__m128i*)dst + 6, xmm6);
+ xmm_save_128((__m128i*)dst + 7, xmm7);
+
+ dst += 128;
+ src += 128;
+ }
+}
+
+sse2 static force_inline void
to_sse64(uint8_t *dst, const uint8_t *src)
{
__m128i xmm1, xmm2, xmm3, xmm4;
@@ -370,18 +402,6 @@ to_sse64(uint8_t *dst, const uint8_t *src)
}
sse2 static force_inline void
-to_sse64xN(uint8_t *dst, const uint8_t *src, int bytes)
-{
- int i;
-
- for (i = 0; i < bytes / 64; i++) {
- to_sse64(dst, src);
- dst += 64;
- src += 64;
- }
-}
-
-sse2 static force_inline void
to_sse32(uint8_t *dst, const uint8_t *src)
{
__m128i xmm1, xmm2;
@@ -421,11 +441,13 @@ memcpy_to_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp,
if (src_x | src_y)
src = (const uint8_t *)src + src_y * src_stride + src_x * cpp;
- assert(src_stride >= width * cpp);
- src_stride -= width * cpp;
+ width *= cpp;
+ assert(src_stride >= width);
+ src_stride -= width;
+ src_stride += width & 15;
while (height--) {
- unsigned w = width * cpp;
+ unsigned w = width;
uint8_t *tile_row = dst;
tile_row += dst_y / tile_height * dst_stride * tile_height;
@@ -444,8 +466,8 @@ memcpy_to_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp,
}
}
while (w >= tile_width) {
- to_sse64xN(assume_aligned(tile_row, tile_width),
- src, tile_width);
+ to_sse128xN(assume_aligned(tile_row, tile_width),
+ src, tile_width);
tile_row += tile_size;
src = (const uint8_t *)src + tile_width;
w -= tile_width;
@@ -460,16 +482,14 @@ memcpy_to_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp,
to_sse32(tile_row, src);
tile_row += 32;
src = (const uint8_t *)src + 32;
- w -= 32;
}
if (w & 16) {
to_sse16(tile_row, src);
tile_row += 16;
src = (const uint8_t *)src + 16;
- w -= 16;
}
- memcpy(tile_row, src, w);
- src = (const uint8_t *)src + src_stride + w;
+ memcpy(tile_row, src, w & 15);
+ src = (const uint8_t *)src + src_stride;
dst_y++;
}
}
@@ -491,14 +511,34 @@ from_sse64(uint8_t *dst, const uint8_t *src)
}
sse2 static force_inline void
-from_sse64xN(uint8_t *dst, const uint8_t *src, int bytes)
+from_sse128xN(uint8_t *dst, const uint8_t *src, int bytes)
{
int i;
- for (i = 0; i < bytes / 64; i++) {
- from_sse64(dst, src);
- dst += 64;
- src += 64;
+ for (i = 0; i < bytes / 128; i++) {
+ __m128i xmm0, xmm1, xmm2, xmm3;
+ __m128i xmm4, xmm5, xmm6, xmm7;
+
+ xmm0 = xmm_load_128((const __m128i*)src + 0);
+ xmm1 = xmm_load_128((const __m128i*)src + 1);
+ xmm2 = xmm_load_128((const __m128i*)src + 2);
+ xmm3 = xmm_load_128((const __m128i*)src + 3);
+ xmm4 = xmm_load_128((const __m128i*)src + 4);
+ xmm5 = xmm_load_128((const __m128i*)src + 5);
+ xmm6 = xmm_load_128((const __m128i*)src + 6);
+ xmm7 = xmm_load_128((const __m128i*)src + 7);
+
+ xmm_save_128u((__m128i*)dst + 0, xmm0);
+ xmm_save_128u((__m128i*)dst + 1, xmm1);
+ xmm_save_128u((__m128i*)dst + 2, xmm2);
+ xmm_save_128u((__m128i*)dst + 3, xmm3);
+ xmm_save_128u((__m128i*)dst + 4, xmm4);
+ xmm_save_128u((__m128i*)dst + 5, xmm5);
+ xmm_save_128u((__m128i*)dst + 6, xmm6);
+ xmm_save_128u((__m128i*)dst + 7, xmm7);
+
+ dst += 128;
+ src += 128;
}
}
@@ -542,11 +582,13 @@ memcpy_from_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp,
if (dst_x | dst_y)
dst = (uint8_t *)dst + dst_y * dst_stride + dst_x * cpp;
- assert(dst_stride >= width * cpp);
- dst_stride -= width * cpp;
+ width *= cpp;
+ assert(dst_stride >= width);
+ dst_stride -= width;
+ dst_stride += width & 15;
while (height--) {
- unsigned w = width * cpp;
+ unsigned w = width;
const uint8_t *tile_row = src;
tile_row += src_y / tile_height * src_stride * tile_height;
@@ -566,9 +608,9 @@ memcpy_from_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp,
}
}
while (w >= tile_width) {
- from_sse64xN(dst,
- assume_aligned(tile_row, tile_width),
- tile_width);
+ from_sse128xN(dst,
+ assume_aligned(tile_row, tile_width),
+ tile_width);
tile_row += tile_size;
dst = (uint8_t *)dst + tile_width;
w -= tile_width;
@@ -583,16 +625,14 @@ memcpy_from_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp,
from_sse32(dst, tile_row);
tile_row += 32;
dst = (uint8_t *)dst + 32;
- w -= 32;
}
if (w & 16) {
from_sse16(dst, tile_row);
tile_row += 16;
dst = (uint8_t *)dst + 16;
- w -= 16;
}
- memcpy(dst, assume_aligned(tile_row, tile_width), w);
- dst = (uint8_t *)dst + dst_stride + w;
+ memcpy(dst, tile_row, w & 15);
+ dst = (uint8_t *)dst + dst_stride;
src_y++;
}
}
commit 27ec7e49daca956733b2756dc6ae3c3eda4dd56b
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date: Fri Apr 8 08:03:11 2016 +0100
sna: Force inlining of SSE2 builtins
Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
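force_inline here is the driver's always-inline attribute macro; a sketch of the usual GCC/Clang form follows (the real definition lives elsewhere in the tree and may differ). Plain "static inline" is only a hint, so a one-instruction wrapper such as xmm_load_128 could still be emitted out of line inside the hot copy loops.

#include <emmintrin.h>

/* Sketch of a typical always-inline macro; the driver's actual
 * definition is an assumption here. */
#if defined(__GNUC__)
#define force_inline inline __attribute__((always_inline))
#else
#define force_inline inline
#endif

/* With always_inline the wrapper reliably melts into a single
 * movdqa at every call site. */
static force_inline __m128i xmm_load_128(const __m128i *src)
{
    return _mm_load_si128(src);
}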
diff --git a/src/sna/blt.c b/src/sna/blt.c
index 6f797f4..3aff6b5 100644
--- a/src/sna/blt.c
+++ b/src/sna/blt.c
@@ -120,31 +120,31 @@ static bool have_sse2(void)
}
#endif
-static inline __m128i
+static force_inline __m128i
xmm_create_mask_32(uint32_t mask)
{
return _mm_set_epi32(mask, mask, mask, mask);
}
-static inline __m128i
+static force_inline __m128i
xmm_load_128(const __m128i *src)
{
return _mm_load_si128(src);
}
-static inline __m128i
+static force_inline __m128i
xmm_load_128u(const __m128i *src)
{
return _mm_loadu_si128(src);
}
-static inline void
+static force_inline void
xmm_save_128(__m128i *dst, __m128i data)
{
_mm_store_si128(dst, data);
}
-static inline void
+static force_inline void
xmm_save_128u(__m128i *dst, __m128i data)
{
_mm_storeu_si128(dst, data);
commit 65c72d9871fed7db47b5d5a2abdd43566844dcd5
Author: Chris Wilson <chris at chris-wilson.co.uk>
Date: Fri Apr 8 07:56:07 2016 +0100
sna: Invert the function wrapping for sse64xN/sse64
We should be consistent in making the code simpler for the compiler, and
so not rely on it to eliminate the dead loop overhead when sse64xN is
invoked for a single 64-byte block!
Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
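The shape of the inversion, as a minimal sketch with illustrative names: the straight-line 64-byte copy becomes the primitive, and the xN bulk variant wraps it, so a single 64-byte copy never carries a redundant loop for the compiler to prove away.

#include <stdint.h>
#include <emmintrin.h>

/* The primitive: straight-line, no loop. */
static inline void copy64(uint8_t *dst, const uint8_t *src)
{
    __m128i a = _mm_loadu_si128((const __m128i *)src + 0);
    __m128i b = _mm_loadu_si128((const __m128i *)src + 1);
    __m128i c = _mm_loadu_si128((const __m128i *)src + 2);
    __m128i d = _mm_loadu_si128((const __m128i *)src + 3);

    _mm_storeu_si128((__m128i *)dst + 0, a);
    _mm_storeu_si128((__m128i *)dst + 1, b);
    _mm_storeu_si128((__m128i *)dst + 2, c);
    _mm_storeu_si128((__m128i *)dst + 3, d);
}

/* The bulk variant wraps the primitive, not the other way around. */
static inline void copy64xN(uint8_t *dst, const uint8_t *src, int bytes)
{
    int i;

    for (i = 0; i < bytes / 64; i++) {
        copy64(dst, src);
        dst += 64;
        src += 64;
    }
}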
diff --git a/src/sna/blt.c b/src/sna/blt.c
index 60d2549..6f797f4 100644
--- a/src/sna/blt.c
+++ b/src/sna/blt.c
@@ -354,35 +354,34 @@ memcpy_from_tiled_x__swizzle_0(const void *src, void *dst, int bpp,
#if defined(sse2) && defined(__x86_64__)
sse2 static force_inline void
-to_sse64xN(uint8_t *dst, const uint8_t *src, int bytes)
+to_sse64(uint8_t *dst, const uint8_t *src)
{
- int i;
+ __m128i xmm1, xmm2, xmm3, xmm4;
- for (i = 0; i < bytes / 64; i++) {
- __m128i xmm1, xmm2, xmm3, xmm4;
+ xmm1 = xmm_load_128u((const __m128i*)src + 0);
+ xmm2 = xmm_load_128u((const __m128i*)src + 1);
+ xmm3 = xmm_load_128u((const __m128i*)src + 2);
+ xmm4 = xmm_load_128u((const __m128i*)src + 3);
- xmm1 = xmm_load_128u((const __m128i*)src + 0);
- xmm2 = xmm_load_128u((const __m128i*)src + 1);
- xmm3 = xmm_load_128u((const __m128i*)src + 2);
- xmm4 = xmm_load_128u((const __m128i*)src + 3);
+ xmm_save_128((__m128i*)dst + 0, xmm1);
+ xmm_save_128((__m128i*)dst + 1, xmm2);
+ xmm_save_128((__m128i*)dst + 2, xmm3);
+ xmm_save_128((__m128i*)dst + 3, xmm4);
+}
- xmm_save_128((__m128i*)dst + 0, xmm1);
- xmm_save_128((__m128i*)dst + 1, xmm2);
- xmm_save_128((__m128i*)dst + 2, xmm3);
- xmm_save_128((__m128i*)dst + 3, xmm4);
+sse2 static force_inline void
+to_sse64xN(uint8_t *dst, const uint8_t *src, int bytes)
+{
+ int i;
+ for (i = 0; i < bytes / 64; i++) {
+ to_sse64(dst, src);
dst += 64;
src += 64;
}
}
sse2 static force_inline void
-to_sse64(uint8_t *dst, const uint8_t *src)
-{
- to_sse64xN(dst, src, 64);
-}
-
-sse2 static force_inline void
to_sse32(uint8_t *dst, const uint8_t *src)
{
__m128i xmm1, xmm2;
@@ -476,35 +475,34 @@ memcpy_to_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp,
}
sse2 static force_inline void
-from_sse64xN(uint8_t *dst, const uint8_t *src, int bytes)
+from_sse64(uint8_t *dst, const uint8_t *src)
{
- int i;
+ __m128i xmm1, xmm2, xmm3, xmm4;
- for (i = 0; i < bytes / 64; i++) {
- __m128i xmm1, xmm2, xmm3, xmm4;
+ xmm1 = xmm_load_128((const __m128i*)src + 0);
+ xmm2 = xmm_load_128((const __m128i*)src + 1);
+ xmm3 = xmm_load_128((const __m128i*)src + 2);
+ xmm4 = xmm_load_128((const __m128i*)src + 3);
- xmm1 = xmm_load_128((const __m128i*)src + 0);
- xmm2 = xmm_load_128((const __m128i*)src + 1);
- xmm3 = xmm_load_128((const __m128i*)src + 2);
- xmm4 = xmm_load_128((const __m128i*)src + 3);
+ xmm_save_128u((__m128i*)dst + 0, xmm1);
+ xmm_save_128u((__m128i*)dst + 1, xmm2);
+ xmm_save_128u((__m128i*)dst + 2, xmm3);
+ xmm_save_128u((__m128i*)dst + 3, xmm4);
+}
- xmm_save_128u((__m128i*)dst + 0, xmm1);
- xmm_save_128u((__m128i*)dst + 1, xmm2);
- xmm_save_128u((__m128i*)dst + 2, xmm3);
- xmm_save_128u((__m128i*)dst + 3, xmm4);
+sse2 static force_inline void
+from_sse64xN(uint8_t *dst, const uint8_t *src, int bytes)
+{
+ int i;
+ for (i = 0; i < bytes / 64; i++) {
+ from_sse64(dst, src);
dst += 64;
src += 64;
}
}
sse2 static force_inline void
-from_sse64(uint8_t *dst, const uint8_t *src)
-{
- from_sse64xN(dst, src, 64);
-}
-
-sse2 static force_inline void
from_sse32(uint8_t *dst, const uint8_t *src)
{
__m128i xmm1, xmm2;