[Mesa-dev] [PATCH 2/5] i965/tiled_memcpy: inline movntdqa loads in tiled_to_linear
Scott D Phillips
scott.d.phillips at intel.com
Tue Apr 3 20:05:42 UTC 2018
The reference for MOVNTDQA says:
For WC memory type, the nontemporal hint may be implemented by
loading a temporary internal buffer with the equivalent of an
aligned cache line without filling this data to the cache.
[...] Subsequent MOVNTDQA reads to unread portions of the WC
cache line will receive data from the temporary internal
buffer if data is available.
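This hidden cache-line-sized temporary buffer can improve read
performance from WC (write-combining) maps, so use inlined MOVNTDQA
loads for the aligned copies in tiled_to_linear when the caller asks
for _mesa_streaming_load_memcpy.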
---
src/mesa/drivers/dri/i965/Makefile.am | 7 ++++
src/mesa/drivers/dri/i965/Makefile.sources | 6 ++-
src/mesa/drivers/dri/i965/intel_tiled_memcpy.c | 52 ++++++++++++++++++++++++++
src/mesa/drivers/dri/i965/meson.build | 18 +++++++--
4 files changed, 78 insertions(+), 5 deletions(-)
diff --git a/src/mesa/drivers/dri/i965/Makefile.am b/src/mesa/drivers/dri/i965/Makefile.am
index 889d4c68a2b..ff47add93f4 100644
--- a/src/mesa/drivers/dri/i965/Makefile.am
+++ b/src/mesa/drivers/dri/i965/Makefile.am
@@ -92,8 +92,14 @@ libi965_gen11_la_CFLAGS = $(AM_CFLAGS) -DGEN_VERSIONx10=110
noinst_LTLIBRARIES = \
libi965_dri.la \
+ libintel_tiled_memcpy.la \
$(I965_PERGEN_LIBS)
+libintel_tiled_memcpy_la_SOURCES = \
+ $(intel_tiled_memcpy_FILES)
+libintel_tiled_memcpy_la_CFLAGS = \
+ $(AM_CFLAGS) $(SSE41_CFLAGS)
+
libi965_dri_la_SOURCES = \
$(i965_FILES) \
$(i965_oa_GENERATED_FILES)
@@ -104,6 +110,7 @@ libi965_dri_la_LIBADD = \
$(top_builddir)/src/intel/compiler/libintel_compiler.la \
$(top_builddir)/src/intel/blorp/libblorp.la \
$(I965_PERGEN_LIBS) \
+ libintel_tiled_memcpy.la \
$(LIBDRM_LIBS)
BUILT_SOURCES = $(i965_oa_GENERATED_FILES)
diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources
index 3479ceb9d16..ab7db3be7ed 100644
--- a/src/mesa/drivers/dri/i965/Makefile.sources
+++ b/src/mesa/drivers/dri/i965/Makefile.sources
@@ -110,11 +110,13 @@ i965_FILES = \
intel_tex_image.c \
intel_tex_obj.h \
intel_tex_validate.c \
- intel_tiled_memcpy.c \
- intel_tiled_memcpy.h \
intel_upload.c \
libdrm_macros.h
+intel_tiled_memcpy_FILES = \
+ intel_tiled_memcpy.c \
+ intel_tiled_memcpy.h
+
i965_gen4_FILES = \
genX_blorp_exec.c \
genX_state_upload.c
diff --git a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
index 7c6bde990d6..d076351b322 100644
--- a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
+++ b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
@@ -36,6 +36,10 @@
#include "brw_context.h"
#include "intel_tiled_memcpy.h"
+#if defined(USE_SSE41)
+#include "main/streaming-load-memcpy.h"
+#include <smmintrin.h>
+#endif
#if defined(__SSSE3__)
#include <tmmintrin.h>
#elif defined(__SSE2__)
@@ -213,6 +217,44 @@ rgba8_copy_aligned_src(void *dst, const void *src, size_t bytes)
return dst;
}
+#if defined(USE_SSE41)
+/**
+ * Copy count bytes from src to dest, reading with MOVNTDQA streaming loads
+ * when count is exactly 16 or 64; any other size is handed to memcpy().
+ *
+ * src must be 16-byte aligned in the 16- and 64-byte cases, because
+ * _mm_stream_load_si128() requires an aligned address.  dest is written
+ * with unaligned stores: this function is passed to xtiled_to_linear() and
+ * ytiled_to_linear() in the same argument position as
+ * rgba8_copy_aligned_src(), whose name only promises an aligned source, so
+ * the linear destination is not assumed to be 16-byte aligned.
+ *
+ * Returns dest, like memcpy().
+ */
+static ALWAYS_INLINE void *
+_memcpy_streaming_load(void *dest, const void *src, size_t count)
+{
+   if (count == 16) {
+      __m128i val = _mm_stream_load_si128((__m128i *)src);
+      _mm_storeu_si128((__m128i *)dest, val);
+      return dest;
+   } else if (count == 64) {
+      /* Read the whole 64-byte chunk first, then write it out. */
+      __m128i val0 = _mm_stream_load_si128(((__m128i *)src) + 0);
+      __m128i val1 = _mm_stream_load_si128(((__m128i *)src) + 1);
+      __m128i val2 = _mm_stream_load_si128(((__m128i *)src) + 2);
+      __m128i val3 = _mm_stream_load_si128(((__m128i *)src) + 3);
+      _mm_storeu_si128(((__m128i *)dest) + 0, val0);
+      _mm_storeu_si128(((__m128i *)dest) + 1, val1);
+      _mm_storeu_si128(((__m128i *)dest) + 2, val2);
+      _mm_storeu_si128(((__m128i *)dest) + 3, val3);
+      return dest;
+   } else {
+      return memcpy(dest, src, count);
+   }
+}
+#endif
+
/**
* Each row from y0 to y1 is copied in three parts: [x0,x1), [x1,x2), [x2,x3).
* These ranges are in bytes, i.e. pixels * bytes-per-pixel.
@@ -677,6 +705,12 @@ xtiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
dst, src, dst_pitch, swizzle_bit,
rgba8_copy, rgba8_copy_aligned_src);
+#if defined(USE_SSE41)
+ else if (mem_copy == (mem_copy_fn)_mesa_streaming_load_memcpy)
+ return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
+ dst, src, dst_pitch, swizzle_bit, memcpy,
+ _memcpy_streaming_load);
+#endif
else
unreachable("not reached");
} else {
@@ -687,6 +721,12 @@ xtiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
dst, src, dst_pitch, swizzle_bit,
rgba8_copy, rgba8_copy_aligned_src);
+#if defined(USE_SSE41)
+ else if (mem_copy == (mem_copy_fn)_mesa_streaming_load_memcpy)
+ return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
+ dst, src, dst_pitch, swizzle_bit, memcpy,
+ _memcpy_streaming_load);
+#endif
else
unreachable("not reached");
}
@@ -719,6 +759,12 @@ ytiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
dst, src, dst_pitch, swizzle_bit,
rgba8_copy, rgba8_copy_aligned_src);
+#if defined(USE_SSE41)
+ else if (mem_copy == (mem_copy_fn)_mesa_streaming_load_memcpy)
+ return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
+ dst, src, dst_pitch, swizzle_bit,
+ memcpy, _memcpy_streaming_load);
+#endif
else
unreachable("not reached");
} else {
@@ -729,6 +775,12 @@ ytiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
dst, src, dst_pitch, swizzle_bit,
rgba8_copy, rgba8_copy_aligned_src);
+#if defined(USE_SSE41)
+ else if (mem_copy == (mem_copy_fn)_mesa_streaming_load_memcpy)
+ return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
+ dst, src, dst_pitch, swizzle_bit,
+ memcpy, _memcpy_streaming_load);
+#endif
else
unreachable("not reached");
}
diff --git a/src/mesa/drivers/dri/i965/meson.build b/src/mesa/drivers/dri/i965/meson.build
index 4d8280df573..f2207f68b9d 100644
--- a/src/mesa/drivers/dri/i965/meson.build
+++ b/src/mesa/drivers/dri/i965/meson.build
@@ -129,12 +129,15 @@ files_i965 = files(
'intel_tex_image.c',
'intel_tex_obj.h',
'intel_tex_validate.c',
- 'intel_tiled_memcpy.c',
- 'intel_tiled_memcpy.h',
'intel_upload.c',
'libdrm_macros.h',
)
+files_intel_tiled_memcpy = files(
+ 'intel_tiled_memcpy.c',
+ 'intel_tiled_memcpy.h',
+)
+
i965_gen_libs = []
foreach v : ['40', '45', '50', '60', '70', '75', '80', '90', '100', '110']
i965_gen_libs += static_library(
@@ -176,6 +179,15 @@ i965_oa_sources = custom_target(
],
)
+intel_tiled_memcpy = static_library(
+ 'intel_tiled_memcpy',
+ [files_intel_tiled_memcpy],
+ include_directories : [
+ inc_common, inc_intel, inc_dri_common, inc_drm_uapi,
+ ],
+ c_args : [c_vis_args, no_override_init_args, '-msse2', sse41_args],
+)
+
libi965 = static_library(
'i965',
[files_i965, i965_oa_sources, ir_expression_operation_h,
@@ -187,7 +199,7 @@ libi965 = static_library(
cpp_args : [cpp_vis_args, '-msse2'],
link_with : [
i965_gen_libs, libintel_common, libintel_dev, libisl, libintel_compiler,
- libblorp,
+ libblorp, intel_tiled_memcpy,
],
dependencies : [dep_libdrm, dep_valgrind, idep_nir_headers],
)
--
2.14.3
More information about the mesa-dev
mailing list