[Mesa-dev] [PATCH 2/5] i965/tiled_memcpy: inline movntdqa loads in tiled_to_linear
Chris Wilson
chris at chris-wilson.co.uk
Thu Apr 5 19:54:54 UTC 2018
Quoting Scott D Phillips (2018-04-03 21:05:42)
> The reference for MOVNTDQA says:
>
> For WC memory type, the nontemporal hint may be implemented by
> loading a temporary internal buffer with the equivalent of an
> aligned cache line without filling this data to the cache.
> [...] Subsequent MOVNTDQA reads to unread portions of the WC
> cache line will receive data from the temporary internal
> buffer if data is available.
>
> This hidden cache line sized temporary buffer can improve the
> read performance from wc maps.
> ---
> src/mesa/drivers/dri/i965/Makefile.am | 7 ++++
> src/mesa/drivers/dri/i965/Makefile.sources | 6 ++-
> src/mesa/drivers/dri/i965/intel_tiled_memcpy.c | 52 ++++++++++++++++++++++++++
> src/mesa/drivers/dri/i965/meson.build | 18 +++++++--
> 4 files changed, 78 insertions(+), 5 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/Makefile.am b/src/mesa/drivers/dri/i965/Makefile.am
> index 889d4c68a2b..ff47add93f4 100644
> --- a/src/mesa/drivers/dri/i965/Makefile.am
> +++ b/src/mesa/drivers/dri/i965/Makefile.am
> @@ -92,8 +92,14 @@ libi965_gen11_la_CFLAGS = $(AM_CFLAGS) -DGEN_VERSIONx10=110
>
> noinst_LTLIBRARIES = \
> libi965_dri.la \
> + libintel_tiled_memcpy.la \
> $(I965_PERGEN_LIBS)
>
> +libintel_tiled_memcpy_la_SOURCES = \
> + $(intel_tiled_memcpy_FILES)
> +libintel_tiled_memcpy_la_CFLAGS = \
> + $(AM_CFLAGS) $(SSE41_CFLAGS)
> +
> libi965_dri_la_SOURCES = \
> $(i965_FILES) \
> $(i965_oa_GENERATED_FILES)
> @@ -104,6 +110,7 @@ libi965_dri_la_LIBADD = \
> $(top_builddir)/src/intel/compiler/libintel_compiler.la \
> $(top_builddir)/src/intel/blorp/libblorp.la \
> $(I965_PERGEN_LIBS) \
> + libintel_tiled_memcpy.la
> $(LIBDRM_LIBS)
Makes sense.
> diff --git a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
> index 7c6bde990d6..d076351b322 100644
> --- a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
> +++ b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
> @@ -36,6 +36,10 @@
> #include "brw_context.h"
> #include "intel_tiled_memcpy.h"
>
> +#if defined(USE_SSE41)
> +#include "main/streaming-load-memcpy.h"
> +#include <smmintrin.h>
> +#endif
> #if defined(__SSSE3__)
> #include <tmmintrin.h>
> #elif defined(__SSE2__)
> @@ -213,6 +217,30 @@ rgba8_copy_aligned_src(void *dst, const void *src, size_t bytes)
> return dst;
> }
>
> +#if defined(USE_SSE41)
> +static ALWAYS_INLINE void*
Space in that void*, i.e. "void *"? (But don't quote me on the preferred mesa/i965 style!)
> +_memcpy_streaming_load(void *dest, const void *src, size_t count)
> +{
> + if (count == 16) {
> + __m128i val = _mm_stream_load_si128((__m128i *)src);
> + _mm_store_si128((__m128i *)dest, val);
> + return dest;
> + } else if (count == 64) {
> + __m128i val0 = _mm_stream_load_si128(((__m128i *)src) + 0);
> + __m128i val1 = _mm_stream_load_si128(((__m128i *)src) + 1);
> + __m128i val2 = _mm_stream_load_si128(((__m128i *)src) + 2);
> + __m128i val3 = _mm_stream_load_si128(((__m128i *)src) + 3);
> + _mm_store_si128(((__m128i *)dest) + 0, val0);
> + _mm_store_si128(((__m128i *)dest) + 1, val1);
> + _mm_store_si128(((__m128i *)dest) + 2, val2);
> + _mm_store_si128(((__m128i *)dest) + 3, val3);
> + return dest;
> + } else {
assert(count < 16); or assert(count < 64) ?
Might as well remind the reader (and caller?!) that this is only for
copying the residuals.
> + return memcpy(dest, src, count);
> + }
> +}
> +#endif
> +
> /**
> * Each row from y0 to y1 is copied in three parts: [x0,x1), [x1,x2), [x2,x3).
> * These ranges are in bytes, i.e. pixels * bytes-per-pixel.
> @@ -677,6 +705,12 @@ xtiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
> return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
> dst, src, dst_pitch, swizzle_bit,
> rgba8_copy, rgba8_copy_aligned_src);
> +#if defined(USE_SSE41)
> + else if (mem_copy == (mem_copy_fn)_mesa_streaming_load_memcpy)
> + return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
> + dst, src, dst_pitch, swizzle_bit, memcpy,
> + _memcpy_streaming_load);
Please group memcpy and _memcpy_streaming_load (put the line break before
them to keep them on the same line).
> +#endif
> else
> unreachable("not reached");
> } else {
> @@ -687,6 +721,12 @@ xtiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
> return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
> dst, src, dst_pitch, swizzle_bit,
> rgba8_copy, rgba8_copy_aligned_src);
> +#if defined(USE_SSE41)
> + else if (mem_copy == (mem_copy_fn)_mesa_streaming_load_memcpy)
> + return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
> + dst, src, dst_pitch, swizzle_bit, memcpy,
> + _memcpy_streaming_load);
> +#endif
> else
> unreachable("not reached");
> }
> @@ -719,6 +759,12 @@ ytiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
> return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
> dst, src, dst_pitch, swizzle_bit,
> rgba8_copy, rgba8_copy_aligned_src);
> +#if defined(USE_SSE41)
> + else if (mem_copy == (mem_copy_fn)_mesa_streaming_load_memcpy)
> + return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
> + dst, src, dst_pitch, swizzle_bit,
> + memcpy, _memcpy_streaming_load);
> +#endif
> else
> unreachable("not reached");
> } else {
> @@ -729,6 +775,12 @@ ytiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
> return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
> dst, src, dst_pitch, swizzle_bit,
> rgba8_copy, rgba8_copy_aligned_src);
> +#if defined(USE_SSE41)
> + else if (mem_copy == (mem_copy_fn)_mesa_streaming_load_memcpy)
> + return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
> + dst, src, dst_pitch, swizzle_bit,
> + memcpy, _memcpy_streaming_load);
> +#endif
> else
> unreachable("not reached");
> }
Ok, was hoping to see how you choose to use the streaming load, but I
guess that's the next patch.
Reviewed-by: Chris Wilson <chris at chris-wilson.co.uk>
-Chris
More information about the mesa-dev
mailing list