[Mesa-dev] [PATCH 2/5] i965/tiled_memcpy: inline movntdqa loads in tiled_to_linear

Thu Apr 5 19:54:54 UTC 2018

Quoting Scott D Phillips (2018-04-03 21:05:42)
> The reference for MOVNTDQA says:
> 
>     For WC memory type, the nontemporal hint may be implemented by
>     loading a temporary internal buffer with the equivalent of an
>     aligned cache line without filling this data to the cache.
>     [...] Subsequent MOVNTDQA reads to unread portions of the WC
>     cache line will receive data from the temporary internal
>     buffer if data is available.
> 
> This hidden cache line sized temporary buffer can improve the
> read performance from wc maps.
> ---
>  src/mesa/drivers/dri/i965/Makefile.am          |  7 ++++
>  src/mesa/drivers/dri/i965/Makefile.sources     |  6 ++-
>  src/mesa/drivers/dri/i965/intel_tiled_memcpy.c | 52 ++++++++++++++++++++++++++
>  src/mesa/drivers/dri/i965/meson.build          | 18 +++++++--
>  4 files changed, 78 insertions(+), 5 deletions(-)
> 
> diff --git a/src/mesa/drivers/dri/i965/Makefile.am b/src/mesa/drivers/dri/i965/Makefile.am
> index 889d4c68a2b..ff47add93f4 100644
> --- a/src/mesa/drivers/dri/i965/Makefile.am
> +++ b/src/mesa/drivers/dri/i965/Makefile.am
> @@ -92,8 +92,14 @@ libi965_gen11_la_CFLAGS = $(AM_CFLAGS) -DGEN_VERSIONx10=110
>  
>  noinst_LTLIBRARIES = \
>         libi965_dri.la \
> +       libintel_tiled_memcpy.la \
>         $(I965_PERGEN_LIBS)
>  
> +libintel_tiled_memcpy_la_SOURCES = \
> +       $(intel_tiled_memcpy_FILES)
> +libintel_tiled_memcpy_la_CFLAGS = \
> +       $(AM_CFLAGS) $(SSE41_CFLAGS)
> +
>  libi965_dri_la_SOURCES = \
>         $(i965_FILES) \
>         $(i965_oa_GENERATED_FILES)
> @@ -104,6 +110,7 @@ libi965_dri_la_LIBADD = \
>         $(top_builddir)/src/intel/compiler/libintel_compiler.la \
>         $(top_builddir)/src/intel/blorp/libblorp.la \
>         $(I965_PERGEN_LIBS) \
> +       libintel_tiled_memcpy.la
>         $(LIBDRM_LIBS)

Makes sense.

> diff --git a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
> index 7c6bde990d6..d076351b322 100644
> --- a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
> +++ b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
> @@ -36,6 +36,10 @@
>  #include "brw_context.h"
>  #include "intel_tiled_memcpy.h"
>  
> +#if defined(USE_SSE41)
> +#include "main/streaming-load-memcpy.h"
> +#include <smmintrin.h>
> +#endif
>  #if defined(__SSSE3__)
>  #include <tmmintrin.h>
>  #elif defined(__SSE2__)
> @@ -213,6 +217,30 @@ rgba8_copy_aligned_src(void *dst, const void *src, size_t bytes)
>     return dst;
>  }
>  
> +#if defined(USE_SSE41)
> +static ALWAYS_INLINE void*

Space in that void*? (but don't quote me on mesa/i965 preferred style!)

> +_memcpy_streaming_load(void *dest, const void *src, size_t count)
> +{
> +   if (count == 16) {
> +      __m128i val = _mm_stream_load_si128((__m128i *)src);
> +      _mm_store_si128((__m128i *)dest, val);
> +      return dest;
> +   } else if (count == 64) {
> +      __m128i val0 = _mm_stream_load_si128(((__m128i *)src) + 0);
> +      __m128i val1 = _mm_stream_load_si128(((__m128i *)src) + 1);
> +      __m128i val2 = _mm_stream_load_si128(((__m128i *)src) + 2);
> +      __m128i val3 = _mm_stream_load_si128(((__m128i *)src) + 3);
> +      _mm_store_si128(((__m128i *)dest) + 0, val0);
> +      _mm_store_si128(((__m128i *)dest) + 1, val1);
> +      _mm_store_si128(((__m128i *)dest) + 2, val2);
> +      _mm_store_si128(((__m128i *)dest) + 3, val3);
> +      return dest;
> +   } else {

assert(count < 16); or assert(count < 64) ?

Might as well remind the reader (and caller?!) that this is only for
copying the residuals.

> +      return memcpy(dest, src, count);
> +   }
> +}
> +#endif
> +
>  /**
>   * Each row from y0 to y1 is copied in three parts: [x0,x1), [x1,x2), [x2,x3).
>   * These ranges are in bytes, i.e. pixels * bytes-per-pixel.
> @@ -677,6 +705,12 @@ xtiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
>           return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
>                                   dst, src, dst_pitch, swizzle_bit,
>                                   rgba8_copy, rgba8_copy_aligned_src);
> +#if defined(USE_SSE41)
> +      else if (mem_copy == (mem_copy_fn)_mesa_streaming_load_memcpy)
> +         return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
> +                                 dst, src, dst_pitch, swizzle_bit, memcpy,
> +                                 _memcpy_streaming_load);

Please group memcpy and _memcpy_streaming_load (put the line break before
memcpy to keep them on the same line).

> +#endif
>        else
>           unreachable("not reached");
>     } else {
> @@ -687,6 +721,12 @@ xtiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
>           return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
>                                   dst, src, dst_pitch, swizzle_bit,
>                                   rgba8_copy, rgba8_copy_aligned_src);
> +#if defined(USE_SSE41)
> +      else if (mem_copy == (mem_copy_fn)_mesa_streaming_load_memcpy)
> +         return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
> +                                 dst, src, dst_pitch, swizzle_bit, memcpy,
> +                                 _memcpy_streaming_load);
> +#endif
>        else
>           unreachable("not reached");
>     }
> @@ -719,6 +759,12 @@ ytiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
>           return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
>                                   dst, src, dst_pitch, swizzle_bit,
>                                   rgba8_copy, rgba8_copy_aligned_src);
> +#if defined(USE_SSE41)
> +      else if (mem_copy == (mem_copy_fn)_mesa_streaming_load_memcpy)
> +         return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
> +                                 dst, src, dst_pitch, swizzle_bit,
> +                                 memcpy, _memcpy_streaming_load);
> +#endif
>        else
>           unreachable("not reached");
>     } else {
> @@ -729,6 +775,12 @@ ytiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
>           return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
>                                   dst, src, dst_pitch, swizzle_bit,
>                                   rgba8_copy, rgba8_copy_aligned_src);
> +#if defined(USE_SSE41)
> +      else if (mem_copy == (mem_copy_fn)_mesa_streaming_load_memcpy)
> +         return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
> +                                 dst, src, dst_pitch, swizzle_bit,
> +                                 memcpy, _memcpy_streaming_load);
> +#endif
>        else
>           unreachable("not reached");
>     }

Ok, was hoping to see how you choose to use the streaming load, but I
guess that's the next patch.

Reviewed-by: Chris Wilson <chris at chris-wilson.co.uk>
-Chris