i965/tiled_memcpy: inline movntdqa loads in tiled_to_linear

author Scott D Phillips <scott.d.phillips@intel.com>

Mon, 30 Apr 2018 17:25:48 +0000 (10:25 -0700)

committer Kenneth Graunke <kenneth@whitecape.org>

Fri, 25 May 2018 18:05:46 +0000 (11:05 -0700)
author Scott D Phillips <scott.d.phillips@intel.com>
Mon, 30 Apr 2018 17:25:48 +0000 (10:25 -0700)
committer Kenneth Graunke <kenneth@whitecape.org>
Fri, 25 May 2018 18:05:46 +0000 (11:05 -0700)
diff --git a/src/mesa/drivers/dri/i965/Makefile.am b/src/mesa/drivers/dri/i965/Makefile.am

index 889d4c6..ff47add 100644 (file)
--- a/src/mesa/drivers/dri/i965/Makefile.am
+++ b/src/mesa/drivers/dri/i965/Makefile.am
@@ -92,8 +92,14 @@ libi965_gen11_la_CFLAGS = $(AM_CFLAGS) -DGEN_VERSIONx10=110
  
  noinst_LTLIBRARIES = \
         libi965_dri.la \
+       libintel_tiled_memcpy.la \
         $(I965_PERGEN_LIBS)
  
+libintel_tiled_memcpy_la_SOURCES = \
+       $(intel_tiled_memcpy_FILES)
+libintel_tiled_memcpy_la_CFLAGS = \
+       $(AM_CFLAGS) $(SSE41_CFLAGS)
+
  libi965_dri_la_SOURCES = \
         $(i965_FILES) \
         $(i965_oa_GENERATED_FILES)
@@ -104,6 +110,7 @@ libi965_dri_la_LIBADD = \
         $(top_builddir)/src/intel/compiler/libintel_compiler.la \
         $(top_builddir)/src/intel/blorp/libblorp.la \
         $(I965_PERGEN_LIBS) \
+       libintel_tiled_memcpy.la \
         $(LIBDRM_LIBS)
  
  BUILT_SOURCES = $(i965_oa_GENERATED_FILES)
diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources

index db6591a..ce7633c 100644 (file)
--- a/src/mesa/drivers/dri/i965/Makefile.sources
+++ b/src/mesa/drivers/dri/i965/Makefile.sources
@@ -110,11 +110,13 @@ i965_FILES = \
         intel_tex_image.c \
         intel_tex_obj.h \
         intel_tex_validate.c \
-       intel_tiled_memcpy.c \
-       intel_tiled_memcpy.h \
         intel_upload.c \
         libdrm_macros.h
  
+intel_tiled_memcpy_FILES = \
+       intel_tiled_memcpy.c \
+       intel_tiled_memcpy.h
+
  i965_gen4_FILES = \
         genX_blorp_exec.c \
         genX_state_upload.c
diff --git a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c

index 7c6bde9..fac5427 100644 (file)
--- a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
+++ b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
@@ -36,6 +36,10 @@
  #include "brw_context.h"
  #include "intel_tiled_memcpy.h"
  
+#if defined(USE_SSE41)
+#include "main/streaming-load-memcpy.h"
+#include <smmintrin.h>
+#endif
  #if defined(__SSSE3__)
  #include <tmmintrin.h>
  #elif defined(__SSE2__)
@@ -213,6 +217,31 @@ rgba8_copy_aligned_src(void *dst, const void *src, size_t bytes)
     return dst;
  }
  
+#if defined(USE_SSE41)
+static ALWAYS_INLINE void * /* every branch returns dest */
+_memcpy_streaming_load(void *dest, const void *src, size_t count)
+{ /* Copy count bytes from src to dest; the 16- and 64-byte sizes use streaming loads. */
+   if (count == 16) { /* exactly one 16-byte vector */
+      __m128i val = _mm_stream_load_si128((__m128i *)src); /* the cast also drops const from src */
+      _mm_store_si128((__m128i *)dest, val); /* NOTE(review): Intel documents both intrinsics as needing 16-byte-aligned addresses; presumably callers guarantee that -- confirm */
+      return dest;
+   } else if (count == 64) { /* four consecutive 16-byte vectors */
+      __m128i val0 = _mm_stream_load_si128(((__m128i *)src) + 0); /* all four loads are issued before any store */
+      __m128i val1 = _mm_stream_load_si128(((__m128i *)src) + 1);
+      __m128i val2 = _mm_stream_load_si128(((__m128i *)src) + 2);
+      __m128i val3 = _mm_stream_load_si128(((__m128i *)src) + 3);
+      _mm_store_si128(((__m128i *)dest) + 0, val0); /* stores go out in the same order as the loads */
+      _mm_store_si128(((__m128i *)dest) + 1, val1);
+      _mm_store_si128(((__m128i *)dest) + 2, val2);
+      _mm_store_si128(((__m128i *)dest) + 3, val3);
+      return dest;
+   } else { /* any other size falls back to plain memcpy */
+      assert(count < 64); /* and (count < 16) for ytiled */
+      return memcpy(dest, src, count); /* memcpy's return value is dest */
+   }
+}
+#endif
+
  /**
   * Each row from y0 to y1 is copied in three parts: [x0,x1), [x1,x2), [x2,x3).
   * These ranges are in bytes, i.e. pixels * bytes-per-pixel.
@@ -677,6 +706,12 @@ xtiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
           return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
                                   dst, src, dst_pitch, swizzle_bit,
                                   rgba8_copy, rgba8_copy_aligned_src);
+#if defined(USE_SSE41)
+      else if (mem_copy == (mem_copy_fn)_mesa_streaming_load_memcpy)
+         return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
+                                 dst, src, dst_pitch, swizzle_bit,
+                                 memcpy, _memcpy_streaming_load);
+#endif
        else
           unreachable("not reached");
     } else {
@@ -687,6 +722,12 @@ xtiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
           return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
                                   dst, src, dst_pitch, swizzle_bit,
                                   rgba8_copy, rgba8_copy_aligned_src);
+#if defined(USE_SSE41)
+      else if (mem_copy == (mem_copy_fn)_mesa_streaming_load_memcpy)
+         return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
+                                 dst, src, dst_pitch, swizzle_bit,
+                                 memcpy, _memcpy_streaming_load);
+#endif
        else
           unreachable("not reached");
     }
@@ -719,6 +760,12 @@ ytiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
           return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
                                   dst, src, dst_pitch, swizzle_bit,
                                   rgba8_copy, rgba8_copy_aligned_src);
+#if defined(USE_SSE41)
+      else if (mem_copy == (mem_copy_fn)_mesa_streaming_load_memcpy)
+         return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
+                                 dst, src, dst_pitch, swizzle_bit,
+                                 memcpy, _memcpy_streaming_load);
+#endif
        else
           unreachable("not reached");
     } else {
@@ -729,6 +776,12 @@ ytiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
           return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
                                   dst, src, dst_pitch, swizzle_bit,
                                   rgba8_copy, rgba8_copy_aligned_src);
+#if defined(USE_SSE41)
+      else if (mem_copy == (mem_copy_fn)_mesa_streaming_load_memcpy)
+         return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
+                                 dst, src, dst_pitch, swizzle_bit,
+                                 memcpy, _memcpy_streaming_load);
+#endif
        else
           unreachable("not reached");
     }
@@ -868,6 +921,15 @@ tiled_to_linear(uint32_t xt1, uint32_t xt2,
        unreachable("unsupported tiling");
     }
  
+#if defined(USE_SSE41)
+   if (mem_copy == (mem_copy_fn)_mesa_streaming_load_memcpy) {
+      /* The hidden cacheline sized register used by movntdqa can apparently
+       * give you stale data, so do an mfence to invalidate it.
+       */
+      _mm_mfence();
+   }
+#endif
+
     /* Round out to tile boundaries. */
     xt0 = ALIGN_DOWN(xt1, tw);
     xt3 = ALIGN_UP  (xt2, tw);
diff --git a/src/mesa/drivers/dri/i965/meson.build b/src/mesa/drivers/dri/i965/meson.build

index 20404d5..1eac329 100644 (file)
--- a/src/mesa/drivers/dri/i965/meson.build
+++ b/src/mesa/drivers/dri/i965/meson.build
@@ -129,12 +129,15 @@ files_i965 = files(
    'intel_tex_image.c',
    'intel_tex_obj.h',
    'intel_tex_validate.c',
-  'intel_tiled_memcpy.c',
-  'intel_tiled_memcpy.h',
    'intel_upload.c',
    'libdrm_macros.h',
  )
  
+files_intel_tiled_memcpy = files(
+  'intel_tiled_memcpy.c',
+  'intel_tiled_memcpy.h',
+)
+
  i965_gen_libs = []
  foreach v : ['40', '45', '50', '60', '70', '75', '80', '90', '100', '110']
    i965_gen_libs += static_library(
@@ -176,6 +179,15 @@ i965_oa_sources = custom_target(
    ],
  )
  
+intel_tiled_memcpy = static_library(
+  'intel_tiled_memcpy',
+  [files_intel_tiled_memcpy],
+  include_directories : [
+    inc_common, inc_intel, inc_dri_common, inc_drm_uapi,
+  ],
+  c_args : [c_vis_args, no_override_init_args, '-msse2', sse41_args],
+)
+
  libi965 = static_library(
    'i965',
    [files_i965, i965_oa_sources, ir_expression_operation_h,
@@ -187,7 +199,7 @@ libi965 = static_library(
    cpp_args : [cpp_vis_args, '-msse2'],
    link_with : [
      i965_gen_libs, libintel_common, libintel_dev, libisl, libintel_compiler,
-    libblorp,
+    libblorp, intel_tiled_memcpy,
    ],
    dependencies : [dep_libdrm, dep_valgrind, idep_nir_headers],
  )
author	Scott D Phillips <scott.d.phillips@intel.com>
	Mon, 30 Apr 2018 17:25:48 +0000 (10:25 -0700)
committer	Kenneth Graunke <kenneth@whitecape.org>
	Fri, 25 May 2018 18:05:46 +0000 (11:05 -0700)
src/mesa/drivers/dri/i965/Makefile.am		patch \| blob \| history
src/mesa/drivers/dri/i965/Makefile.sources		patch \| blob \| history
src/mesa/drivers/dri/i965/intel_tiled_memcpy.c		patch \| blob \| history
src/mesa/drivers/dri/i965/meson.build		patch \| blob \| history