From 93b8b987d096d53c5451178bdab07af654a95bd1 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Thu, 2 Aug 2018 18:15:48 -0400
Subject: [PATCH] radeonsi: add a thorough clear/copy_buffer benchmark

---
 src/gallium/drivers/radeonsi/Makefile.sources      |   2 +-
 src/gallium/drivers/radeonsi/meson.build           |   2 +-
 src/gallium/drivers/radeonsi/si_blit.c             |   2 +-
 src/gallium/drivers/radeonsi/si_cp_dma.c           |   8 +-
 src/gallium/drivers/radeonsi/si_pipe.c             |   8 +-
 src/gallium/drivers/radeonsi/si_pipe.h             |  11 +-
 src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c   | 105 +++++
 src/gallium/drivers/radeonsi/si_test_clearbuffer.c | 139 ------
 src/gallium/drivers/radeonsi/si_test_dma_perf.c    | 475 +++++++++++++++++++++
 9 files changed, 599 insertions(+), 153 deletions(-)
 delete mode 100644 src/gallium/drivers/radeonsi/si_test_clearbuffer.c
 create mode 100644 src/gallium/drivers/radeonsi/si_test_dma_perf.c

diff --git a/src/gallium/drivers/radeonsi/Makefile.sources b/src/gallium/drivers/radeonsi/Makefile.sources
index b52db3a..abdc4e0 100644
--- a/src/gallium/drivers/radeonsi/Makefile.sources
+++ b/src/gallium/drivers/radeonsi/Makefile.sources
@@ -44,8 +44,8 @@ C_SOURCES := \
 	si_state_streamout.c \
 	si_state_viewport.c \
 	si_state.h \
-	si_test_clearbuffer.c \
 	si_test_dma.c \
+	si_test_dma_perf.c \
 	si_texture.c \
 	si_uvd.c \
 	../radeon/r600_perfcounter.c \
diff --git a/src/gallium/drivers/radeonsi/meson.build b/src/gallium/drivers/radeonsi/meson.build
index 5722904..4d6044f 100644
--- a/src/gallium/drivers/radeonsi/meson.build
+++ b/src/gallium/drivers/radeonsi/meson.build
@@ -60,8 +60,8 @@ files_libradeonsi = files(
   'si_state_shaders.c',
   'si_state_streamout.c',
   'si_state_viewport.c',
-  'si_test_clearbuffer.c',
   'si_test_dma.c',
+  'si_test_dma_perf.c',
   'si_texture.c',
   'si_uvd.c',
   '../radeon/r600_perfcounter.c',
diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c
index cf64952..fcaff80 100644
--- a/src/gallium/drivers/radeonsi/si_blit.c
+++ b/src/gallium/drivers/radeonsi/si_blit.c
@@ -910,7 +910,7 @@ void si_resource_copy_region(struct pipe_context *ctx,
 
 	/* Handle buffers first. */
 	if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) {
-		si_copy_buffer(sctx, dst, src, dstx, src_box->x, src_box->width, 0);
+		si_copy_buffer(sctx, dst, src, dstx, src_box->x, src_box->width, 0, -1);
 		return;
 	}
 
diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c
index 61be22f..486ae75 100644
--- a/src/gallium/drivers/radeonsi/si_cp_dma.c
+++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
@@ -436,18 +436,20 @@ static void si_cp_dma_realign_engine(struct si_context *sctx, unsigned size,
 void si_copy_buffer(struct si_context *sctx,
 		    struct pipe_resource *dst, struct pipe_resource *src,
 		    uint64_t dst_offset, uint64_t src_offset, unsigned size,
-		    unsigned user_flags)
+		    unsigned user_flags, enum si_cache_policy cache_policy)
 {
 	uint64_t main_dst_offset, main_src_offset;
 	unsigned skipped_size = 0;
 	unsigned realign_size = 0;
 	enum si_coherency coher = SI_COHERENCY_SHADER;
-	enum si_cache_policy cache_policy = get_cache_policy(sctx, coher);
 	bool is_first = true;
 
 	if (!size)
 		return;
 
+	if (cache_policy == -1)
+		cache_policy = get_cache_policy(sctx, coher);
+
 	if (dst != src || dst_offset != src_offset) {
 		/* Mark the buffer range of destination as valid (initialized),
 		 * so that transfer_map knows it should wait for the GPU when mapping
@@ -539,7 +541,7 @@ void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource *buf
 {
 	assert(sctx->chip_class >= CIK);
 
-	si_copy_buffer(sctx, buf, buf, offset, offset, size, SI_CPDMA_SKIP_ALL);
+	si_copy_buffer(sctx, buf, buf, offset, offset, size, SI_CPDMA_SKIP_ALL, L2_LRU);
 }
 
 static void cik_prefetch_shader_async(struct si_context *sctx,
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 13fcf1f..c259c26 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -101,7 +101,7 @@ static const struct debug_named_value debug_options[] = {
 	{ "testvmfaultcp", DBG(TEST_VMFAULT_CP), "Invoke a CP VM fault test and exit." },
 	{ "testvmfaultsdma", DBG(TEST_VMFAULT_SDMA), "Invoke a SDMA VM fault test and exit." },
 	{ "testvmfaultshader", DBG(TEST_VMFAULT_SHADER), "Invoke a shader VM fault test and exit." },
-	{ "testclearbufperf", DBG(TEST_CLEARBUF_PERF), "Test Clearbuffer Performance" },
+	{ "testdmaperf", DBG(TEST_DMA_PERF), "Test DMA performance" },
 
 	DEBUG_NAMED_VALUE_END /* must be last */
 };
@@ -730,7 +730,7 @@ static void si_test_vmfault(struct si_screen *sscreen)
 	r600_resource(buf)->gpu_address = 0; /* cause a VM fault */
 
 	if (sscreen->debug_flags & DBG(TEST_VMFAULT_CP)) {
-		si_copy_buffer(sctx, buf, buf, 0, 4, 4, 0);
+		si_copy_buffer(sctx, buf, buf, 0, 4, 4, 0, -1);
 		ctx->flush(ctx, NULL, 0);
 		puts("VM fault test: CP - done.");
 	}
@@ -1070,8 +1070,8 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws,
 	if (sscreen->debug_flags & DBG(TEST_DMA))
 		si_test_dma(sscreen);
 
-	if (sscreen->debug_flags & DBG(TEST_CLEARBUF_PERF)) {
-		si_test_clearbuffer_perf(sscreen);
+	if (sscreen->debug_flags & DBG(TEST_DMA_PERF)) {
+		si_test_dma_perf(sscreen);
 	}
 
 	if (sscreen->debug_flags & (DBG(TEST_VMFAULT_CP) |
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index fe06064..ef4f06f 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -165,7 +165,7 @@ enum {
 	DBG_TEST_VMFAULT_CP,
 	DBG_TEST_VMFAULT_SDMA,
 	DBG_TEST_VMFAULT_SHADER,
-	DBG_TEST_CLEARBUF_PERF,
+	DBG_TEST_DMA_PERF,
 };
 
 #define DBG_ALL_SHADERS		(((1 << (DBG_CS + 1)) - 1))
@@ -1112,8 +1112,8 @@ void si_init_clear_functions(struct si_context *sctx);
 
 enum si_cache_policy {
 	L2_BYPASS,
-	L2_LRU,    /* same as SLC=0 */
 	L2_STREAM, /* same as SLC=1 */
+	L2_LRU,    /* same as SLC=0 */
 };
 
 enum si_coherency {
@@ -1133,7 +1133,7 @@ void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
 void si_copy_buffer(struct si_context *sctx,
 		    struct pipe_resource *dst, struct pipe_resource *src,
 		    uint64_t dst_offset, uint64_t src_offset, unsigned size,
-		    unsigned user_flags);
+		    unsigned user_flags, enum si_cache_policy cache_policy);
 void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource *buf,
 			      uint64_t offset, unsigned size);
 void cik_emit_prefetch_L2(struct si_context *sctx, bool vertex_stage_only);
@@ -1217,13 +1217,16 @@ void si_resume_queries(struct si_context *sctx);
 void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type,
 			unsigned num_layers);
 void *si_create_fixed_func_tcs(struct si_context *sctx);
+void *si_create_dma_compute_shader(struct pipe_context *ctx,
+				   unsigned num_dwords_per_thread,
+				   bool dst_stream_cache_policy, bool is_copy);
 void *si_create_query_result_cs(struct si_context *sctx);
 
 /* si_test_dma.c */
 void si_test_dma(struct si_screen *sscreen);
 
-/* si_test_clearbuffer.c */
+/* si_test_dma_perf.c */
-void si_test_clearbuffer_perf(struct si_screen *sscreen);
+void si_test_dma_perf(struct si_screen *sscreen);
 
 /* si_uvd.c */
 struct pipe_video_codec *si_uvd_create_decoder(struct pipe_context *context,
diff --git a/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c b/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c
index 45bc93e..da55c81 100644
--- a/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c
+++ b/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c
@@ -119,6 +119,135 @@ void *si_create_fixed_func_tcs(struct si_context *sctx)
 	return ureg_create_shader_and_destroy(ureg, &sctx->b);
 }
 
+/* Create a compute shader implementing clear_buffer or copy_buffer.
+ *
+ * Shader buffer 0 is the destination. A copy loads from shader buffer 1;
+ * a clear stores the value taken from the CS user data.
+ *
+ * \param ctx                      context used to create the compute state
+ * \param num_dwords_per_thread    dwords handled by one thread; must be
+ *                                 a non-zero power of two
+ * \param dst_stream_cache_policy  add the stream cache policy to the stores
+ * \param is_copy                  true for copy_buffer, false for clear_buffer
+ * \return the result of create_compute_state, or NULL if ureg_create or
+ *         the allocation of the loaded-value list fails
+ */
+void *si_create_dma_compute_shader(struct pipe_context *ctx,
+				   unsigned num_dwords_per_thread,
+				   bool dst_stream_cache_policy, bool is_copy)
+{
+	assert(util_is_power_of_two_nonzero(num_dwords_per_thread));
+
+	unsigned store_qualifier = TGSI_MEMORY_COHERENT | TGSI_MEMORY_RESTRICT;
+	if (dst_stream_cache_policy)
+		store_qualifier |= TGSI_MEMORY_STREAM_CACHE_POLICY;
+
+	/* Don't cache loads, because there is no reuse. */
+	unsigned load_qualifier = store_qualifier | TGSI_MEMORY_STREAM_CACHE_POLICY;
+
+	/* One memory instruction moves at most 4 dwords. */
+	unsigned num_mem_ops = MAX2(1, num_dwords_per_thread / 4);
+	unsigned *inst_dwords = alloca(num_mem_ops * sizeof(unsigned));
+
+	/* i*4 is always below num_dwords_per_thread here, so the subtraction cannot wrap. */
+	for (unsigned i = 0; i < num_mem_ops; i++)
+		inst_dwords[i] = MIN2(4, num_dwords_per_thread - i*4);
+
+	struct ureg_program *ureg = ureg_create(PIPE_SHADER_COMPUTE);
+	if (!ureg)
+		return NULL;
+
+	ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH, 64);
+	ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT, 1);
+	ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH, 1);
+
+	/* Only assigned and only read for clears. */
+	struct ureg_src value;
+	if (!is_copy) {
+		ureg_property(ureg, TGSI_PROPERTY_CS_USER_DATA_DWORDS, inst_dwords[0]);
+		value = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_CS_USER_DATA, 0);
+	}
+
+	struct ureg_src tid = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_THREAD_ID, 0);
+	struct ureg_src blk = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_BLOCK_ID, 0);
+	struct ureg_dst store_addr = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X);
+	struct ureg_dst load_addr = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X);
+	struct ureg_dst dstbuf = ureg_dst(ureg_DECL_buffer(ureg, 0, false));
+	struct ureg_src srcbuf;
+	struct ureg_src *values = NULL;
+
+	if (is_copy) {
+		srcbuf = ureg_DECL_buffer(ureg, 1, false);
+		values = malloc(num_mem_ops * sizeof(struct ureg_src));
+		if (!values) {
+			ureg_destroy(ureg);
+			return NULL;
+		}
+	}
+
+	/* If there are multiple stores, the first store writes into 0+tid,
+	 * the 2nd store writes into 64+tid, the 3rd store writes into 128+tid, etc.
+	 */
+	ureg_UMAD(ureg, store_addr, blk, ureg_imm1u(ureg, 64 * num_mem_ops), tid);
+	/* Convert from a "store size unit" into bytes. */
+	ureg_UMUL(ureg, store_addr, ureg_src(store_addr),
+		  ureg_imm1u(ureg, 4 * inst_dwords[0]));
+	ureg_MOV(ureg, load_addr, ureg_src(store_addr));
+
+	/* Distance between a load and a store for latency hiding. */
+	unsigned load_store_distance = is_copy ? 8 : 0;
+
+	for (unsigned i = 0; i < num_mem_ops + load_store_distance; i++) {
+		/* Index of the store issued in this iteration; negative until
+		 * load_store_distance iterations have passed. */
+		int d = i - load_store_distance;
+
+		if (is_copy && i < num_mem_ops) {
+			if (i) {
+				ureg_UADD(ureg, load_addr, ureg_src(load_addr),
+					  ureg_imm1u(ureg, 4 * inst_dwords[i] * 64));
+			}
+
+			values[i] = ureg_src(ureg_DECL_temporary(ureg));
+			struct ureg_dst dst =
+				ureg_writemask(ureg_dst(values[i]),
+					       u_bit_consecutive(0, inst_dwords[i]));
+			struct ureg_src srcs[] = {srcbuf, ureg_src(load_addr)};
+			ureg_memory_insn(ureg, TGSI_OPCODE_LOAD, &dst, 1, srcs, 2,
+					 load_qualifier, TGSI_TEXTURE_BUFFER, 0);
+		}
+
+		if (d >= 0) {
+			if (d) {
+				ureg_UADD(ureg, store_addr, ureg_src(store_addr),
+					  ureg_imm1u(ureg, 4 * inst_dwords[d] * 64));
+			}
+
+			struct ureg_dst dst =
+				ureg_writemask(dstbuf, u_bit_consecutive(0, inst_dwords[d]));
+			struct ureg_src srcs[] =
+				{ureg_src(store_addr), is_copy ? values[d] : value};
+			ureg_memory_insn(ureg, TGSI_OPCODE_STORE, &dst, 1, srcs, 2,
+					 store_qualifier, TGSI_TEXTURE_BUFFER, 0);
+		}
+	}
+	ureg_END(ureg);
+
+	struct pipe_compute_state state = {};
+	state.ir_type = PIPE_SHADER_IR_TGSI;
+	state.prog = ureg_get_tokens(ureg, NULL);
+
+	void *cs = ctx->create_compute_state(ctx, &state);
+	ureg_destroy(ureg);
+	/* The token array from ureg_get_tokens belongs to the caller and must
+	 * be released separately. NOTE(review): assumes create_compute_state
+	 * keeps its own copy of the tokens - confirm.
+	 */
+	ureg_free_tokens(state.prog);
+	free(values);
+	return cs;
+}
+
 /* Create the compute shader that is used to collect the results.
  *
  * One compute grid with a single thread is launched for every query result
diff --git a/src/gallium/drivers/radeonsi/si_test_clearbuffer.c b/src/gallium/drivers/radeonsi/si_test_clearbuffer.c
deleted file mode 100644
index e863381..0000000
--- a/src/gallium/drivers/radeonsi/si_test_clearbuffer.c
+++ /dev/null
@@ -1,139 +0,0 @@
-/*
- * Copyright 2018 Advanced Micro Devices, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-
-/* This file implements tests on the si_clearbuffer function. */
-
-#include "si_pipe.h"
-
-#define CLEARBUF_MIN 32
-#define CLEARBUF_COUNT 16
-#define CLEARBUF_MEMSZ 1024
-
-static uint64_t
-measure_clearbuf_time(struct pipe_context *ctx,
-		      uint64_t memory_size)
-{
-	struct pipe_query *query_te;
-	union pipe_query_result qresult;
-	struct pipe_resource *buf;
-
-	struct si_context *sctx = (struct si_context*)ctx;
-	struct pipe_screen *screen = ctx->screen;
-
-	buf = pipe_buffer_create(screen, 0, PIPE_USAGE_DEFAULT, memory_size);
-
-	query_te = ctx->create_query(ctx, PIPE_QUERY_TIME_ELAPSED, 0);
-
-	ctx->begin_query(ctx, query_te);
-	/* operation  */
-	si_cp_dma_clear_buffer(sctx, buf, 0, memory_size, 0x00,
-			       SI_COHERENCY_SHADER, L2_LRU);
-	ctx->end_query(ctx, query_te);
-	ctx->get_query_result(ctx, query_te, true, &qresult);
-
-	/* Cleanup. */
-	ctx->destroy_query(ctx, query_te);
-	pipe_resource_reference(&buf, NULL);
-
-	/* Report Results */
-	return qresult.u64;
-}
-
-/**
- * @brief Analyze rate of clearing a 1K Buffer averaged over 16 iterations
- * @param ctx Context of pipe to perform analysis on
- */
-static void
-analyze_clearbuf_perf_avg(struct pipe_context *ctx)
-{
-	uint index = 0;
-	uint64_t result[CLEARBUF_COUNT];
-	uint64_t sum = 0;
-	long long int rate_kBps;
-
-	/* Run Tests. */
-	for (index = 0 ; index < CLEARBUF_COUNT ; index++) {
-		result[index] = measure_clearbuf_time(ctx, CLEARBUF_MEMSZ);
-		sum += result[index];
-	}
-
-	/* Calculate Results. */
-	/*  kBps = (size(bytes))/(1000) / (time(ns)/(1000*1000*1000)) */
-	rate_kBps = CLEARBUF_COUNT*CLEARBUF_MEMSZ;
-	rate_kBps *= 1000UL*1000UL;
-	rate_kBps /= sum;
-
-	/* Display Results. */
-	printf("CP DMA clear_buffer performance (buffer %lu ,repeat %u ):",
-	       (uint64_t)CLEARBUF_MEMSZ,
-	       CLEARBUF_COUNT );
-	printf(" %llu kB/s\n", rate_kBps );
-}
-
-/**
- * @brief Analyze rate of clearing a range of Buffer sizes
- * @param ctx Context of pipe to perform analysis on
- */
-static void
-analyze_clearbuf_perf_rng(struct pipe_context *ctx)
-{
-	uint index = 0;
-	uint64_t result[CLEARBUF_COUNT];
-	uint64_t mem_size;
-	long long int rate_kBps;
-
-	/* Run Tests. */
-	mem_size = CLEARBUF_MIN;
-	for (index = 0 ; index < CLEARBUF_COUNT ; index++ ) {
-		result[index] = measure_clearbuf_time(ctx, mem_size);
-		mem_size <<= 1;
-	}
-
-	/* Calculate & Display Results. */
-	/*  kBps = (size(bytes))/(1000) / (time(ns)/(1000*1000*1000)) */
-	mem_size = CLEARBUF_MIN;
-	for (index = 0 ; index < CLEARBUF_COUNT ; index++ ) {
-		rate_kBps = mem_size;
-		rate_kBps *= 1000UL*1000UL;
-		rate_kBps /= result[index];
-
-		printf("CP DMA clear_buffer performance (buffer %lu):",
-		       mem_size );
-		printf(" %llu kB/s\n", rate_kBps );
-
-		mem_size <<= 1;
-	}
-}
-
-void si_test_clearbuffer_perf(struct si_screen *sscreen)
-{
-	struct pipe_screen *screen = &sscreen->b;
-	struct pipe_context *ctx = screen->context_create(screen, NULL, 0);
-
-	analyze_clearbuf_perf_avg(ctx);
-	analyze_clearbuf_perf_rng(ctx);
-
-	exit(0);
-}
diff --git a/src/gallium/drivers/radeonsi/si_test_dma_perf.c b/src/gallium/drivers/radeonsi/si_test_dma_perf.c
new file mode 100644
index 0000000..be2ad07
--- /dev/null
+++ b/src/gallium/drivers/radeonsi/si_test_dma_perf.c
@@ -0,0 +1,475 @@
+/*
+ * Copyright 2018 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+/* This file benchmarks clear_buffer and copy_buffer (CP DMA, SDMA, compute). */
+
+#include "si_pipe.h"
+#include "si_query.h"
+
+#define MIN_SIZE	512
+#define MAX_SIZE	(128 * 1024 * 1024)
+#define SIZE_SHIFT	1
+#define NUM_RUNS	128
+/* Return the rate in MiB/s for num_bytes moved in ns nanoseconds, or 0 if ns is 0. */
+static double get_MBps_rate(unsigned num_bytes, unsigned ns)
+{
+	return ns ? (num_bytes / (1024.0 * 1024.0)) / (ns / 1000000000.0) : 0.0;
+}
+
+/* Benchmark clear_buffer and copy_buffer through CP DMA, SDMA and compute
+ * shaders for every heap placement and size, print the rates as a table,
+ * then print C functions returning the best method per size, and exit.
+ */
+void si_test_dma_perf(struct si_screen *sscreen)
+{
+	struct pipe_screen *screen = &sscreen->b;
+	struct pipe_context *ctx = screen->context_create(screen, NULL, 0);
+	struct si_context *sctx = (struct si_context*)ctx;
+	const uint32_t clear_value = 0x12345678;
+	static const unsigned cs_dwords_per_thread_list[] = {64, 32, 16, 8, 4, 2, 1};
+	static const unsigned cs_waves_per_sh_list[] = {1, 2, 4, 8, 16, 0};
+
+#define NUM_SHADERS ARRAY_SIZE(cs_dwords_per_thread_list)
+#define NUM_METHODS (4 + 2*NUM_SHADERS * ARRAY_SIZE(cs_waves_per_sh_list))
+
+	static const char *method_str[] = {
+		"CP MC   ",
+		"CP L2   ",
+		"CP L2   ",
+		"SDMA    ",
+	};
+	static const char *placement_str[] = {
+		/* Clear */
+		"fill->VRAM",
+		"fill->GTT ",
+		/* Copy */
+		"VRAM->VRAM",
+		"VRAM->GTT ",
+		"GTT ->VRAM",
+	};
+
+	printf("DMA rate is in MB/s for each size. Slow cases are skipped and print 0.\n");
+	printf("Heap       ,Method  ,L2p,Wa,");
+	for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
+		if (size >= 1024)
+			printf("%6uKB,", size / 1024);
+		else
+			printf(" %6uB,", size);
+	}
+	printf("\n");
+
+	/* results[log2(size)][placement][method] */
+	struct si_result {
+		bool is_valid;
+		bool is_cp;
+		bool is_sdma;
+		bool is_cs;
+		unsigned cache_policy;
+		unsigned dwords_per_thread;
+		unsigned waves_per_sh;
+		unsigned score;
+		unsigned index; /* index in results[x][y][index] */
+	} results[32][ARRAY_SIZE(placement_str)][NUM_METHODS] = {};
+
+	/* Run benchmarks. */
+	for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) {
+		bool is_copy = placement >= 2;
+
+		printf("-----------,--------,---,--,");
+		for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT)
+			printf("--------,");
+		printf("\n");
+
+		for (unsigned method = 0; method < NUM_METHODS; method++) {
+			bool test_cp = method <= 2;
+			bool test_sdma = method == 3;
+			bool test_cs = method >= 4;
+			unsigned cs_method = method - 4;
+			STATIC_ASSERT(L2_STREAM + 1 == L2_LRU);
+			unsigned cs_waves_per_sh =
+				test_cs ? cs_waves_per_sh_list[cs_method / (2*NUM_SHADERS)] : 0;
+			cs_method %= 2*NUM_SHADERS;
+			unsigned cache_policy = test_cp ? method % 3 :
+						test_cs ? L2_STREAM + (cs_method / NUM_SHADERS) : 0;
+			unsigned cs_dwords_per_thread =
+				test_cs ? cs_dwords_per_thread_list[cs_method % NUM_SHADERS] : 0;
+
+			if (sctx->chip_class == SI) {
+				/* SI doesn't support CP DMA operations through L2. */
+				if (test_cp && cache_policy != L2_BYPASS)
+					continue;
+				/* WAVES_PER_SH is in multiples of 16 on SI. */
+				if (test_cs && cs_waves_per_sh % 16 != 0)
+					continue;
+			}
+
+			printf("%s ,", placement_str[placement]);
+			if (test_cs) {
+				printf("CS x%-4u,%3s,", cs_dwords_per_thread,
+				       cache_policy == L2_LRU ? "LRU" :
+				       cache_policy == L2_STREAM ? "Str" : "");
+			} else {
+				printf("%s,%3s,", method_str[method],
+				       cache_policy == L2_LRU ? "LRU" :
+				       cache_policy == L2_STREAM ? "Str" : "");
+			}
+			if (test_cs && cs_waves_per_sh)
+				printf("%2u,", cs_waves_per_sh);
+			else
+				printf("  ,");
+
+			double score = 0;
+			for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
+				/* Don't test bigger sizes if it's too slow. Print 0. */
+				if (size >= 512*1024 &&
+				    score < 400 * (size / (4*1024*1024))) {
+					printf("%7.0f ,", 0.0);
+					continue;
+				}
+
+				enum pipe_resource_usage dst_usage, src_usage;
+				struct pipe_resource *dst, *src;
+				struct pipe_query *q[NUM_RUNS];
+				unsigned query_type = PIPE_QUERY_TIME_ELAPSED;
+
+				if (test_sdma) {
+					if (sctx->chip_class == SI)
+						query_type = SI_QUERY_TIME_ELAPSED_SDMA_SI;
+					else
+						query_type = SI_QUERY_TIME_ELAPSED_SDMA;
+				}
+
+				if (placement == 0 || placement == 2 || placement == 4)
+					dst_usage = PIPE_USAGE_DEFAULT;
+				else
+					dst_usage = PIPE_USAGE_STREAM;
+
+				if (placement == 2 || placement == 3)
+					src_usage = PIPE_USAGE_DEFAULT;
+				else
+					src_usage = PIPE_USAGE_STREAM;
+
+				dst = pipe_buffer_create(screen, 0, dst_usage, size);
+				src = is_copy ? pipe_buffer_create(screen, 0, src_usage, size) : NULL;
+
+				/* Run tests. */
+				for (unsigned iter = 0; iter < NUM_RUNS; iter++) {
+					q[iter] = ctx->create_query(ctx, query_type, 0);
+					ctx->begin_query(ctx, q[iter]);
+
+					if (test_cp) {
+						/* CP DMA */
+						if (is_copy) {
+							si_copy_buffer(sctx, dst, src, 0, 0, size, 0,
+								       cache_policy);
+						} else {
+							si_cp_dma_clear_buffer(sctx, dst, 0, size, clear_value,
+									       SI_COHERENCY_NONE, cache_policy);
+						}
+					} else if (test_sdma) {
+						/* SDMA */
+						if (is_copy) {
+							struct pipe_box box;
+							u_box_1d(0, size, &box);
+							sctx->dma_copy(ctx, dst, 0, 0, 0, 0, src, 0, &box);
+						} else {
+							sctx->dma_clear_buffer(sctx, dst, 0, size, clear_value);
+						}
+					} else {
+						/* Compute */
+						/* The memory accesses are coalesced, meaning that the 1st instruction writes
+						 * the 1st contiguous block of data for the whole wave, the 2nd instruction
+						 * writes the 2nd contiguous block of data, etc.
+						 */
+						unsigned instructions_per_thread = MAX2(1, cs_dwords_per_thread / 4);
+						unsigned dwords_per_instruction = cs_dwords_per_thread / instructions_per_thread;
+						unsigned dwords_per_wave = cs_dwords_per_thread * 64;
+
+						unsigned num_dwords = size / 4;
+						unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);
+
+						void *cs = si_create_dma_compute_shader(ctx, cs_dwords_per_thread,
+											cache_policy == L2_STREAM, is_copy);
+
+						struct pipe_grid_info info = {};
+						info.block[0] = MIN2(64, num_instructions);
+						info.block[1] = 1;
+						info.block[2] = 1;
+						info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave);
+						info.grid[1] = 1;
+						info.grid[2] = 1;
+
+						struct pipe_shader_buffer sb[2] = {};
+						sb[0].buffer = dst;
+						sb[0].buffer_size = size;
+
+						if (is_copy) {
+							sb[1].buffer = src;
+							sb[1].buffer_size = size;
+						} else {
+							for (unsigned i = 0; i < 4; i++)
+								sctx->cs_user_data[i] = clear_value;
+						}
+
+						sctx->flags |= SI_CONTEXT_INV_VMEM_L1 |
+							       SI_CONTEXT_INV_SMEM_L1;
+
+						ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, is_copy ? 2 : 1, sb);
+						ctx->bind_compute_state(ctx, cs);
+						sctx->cs_max_waves_per_sh = cs_waves_per_sh;
+
+						ctx->launch_grid(ctx, &info);
+
+						ctx->bind_compute_state(ctx, NULL);
+						ctx->delete_compute_state(ctx, cs);
+						sctx->cs_max_waves_per_sh = 0; /* disable the limit */
+
+						sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
+					}
+
+					/* Flush L2, so that we don't just test L2 cache performance. */
+					if (!test_sdma) {
+						sctx->flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
+						si_emit_cache_flush(sctx);
+					}
+
+					ctx->end_query(ctx, q[iter]);
+					ctx->flush(ctx, NULL, PIPE_FLUSH_ASYNC);
+				}
+				pipe_resource_reference(&dst, NULL);
+				pipe_resource_reference(&src, NULL);
+
+				/* Get results. */
+				uint64_t total = 0;
+				for (unsigned iter = 0; iter < NUM_RUNS; iter++) {
+					union pipe_query_result result;
+
+					ctx->get_query_result(ctx, q[iter], true, &result);
+					ctx->destroy_query(ctx, q[iter]);
+					total += result.u64;
+				}
+
+				score = get_MBps_rate(size, total / (double)NUM_RUNS);
+				printf("%7.0f ,", score);
+				fflush(stdout);
+
+				struct si_result *r = &results[util_logbase2(size)][placement][method];
+				r->is_valid = true;
+				r->is_cp = test_cp;
+				r->is_sdma = test_sdma;
+				r->is_cs = test_cs;
+				r->cache_policy = cache_policy;
+				r->dwords_per_thread = cs_dwords_per_thread;
+				r->waves_per_sh = cs_waves_per_sh;
+				r->score = score;
+				r->index = method;
+			}
+			puts("");
+		}
+	}
+
+	puts("");
+	puts("static struct si_method");
+	printf("get_best_clear_for_%s(enum radeon_bo_domain dst, uint64_t size64, bool async, bool cached)\n",
+	       sctx->screen->info.name);
+	puts("{");
+	puts("   unsigned size = MIN2(size64, UINT_MAX);\n");
+
+	/* Analyze results and find the best methods. */
+	for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) {
+		if (placement == 0)
+			puts("   if (dst == RADEON_DOMAIN_VRAM) {");
+		else if (placement == 1)
+			puts("   } else { /* GTT */");
+		else if (placement == 2) {
+			puts("   }\n}"); /* close the placement if/else and the clear function */
+			puts("");
+			puts("static struct si_method");
+			printf("get_best_copy_for_%s(enum radeon_bo_domain dst, enum radeon_bo_domain src,\n",
+			       sctx->screen->info.name);
+			printf("                     uint64_t size64, bool async, bool cached)\n");
+			puts("{");
+			puts("   unsigned size = MIN2(size64, UINT_MAX);\n");
+			puts("   if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_VRAM) {");
+		} else if (placement == 3)
+			puts("   } else if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_GTT) {");
+		else
+			puts("   } else { /* GTT -> VRAM */");
+
+		for (unsigned mode = 0; mode < 3; mode++) {
+			bool async = mode == 0;
+			bool cached = mode == 1;
+
+			if (async)
+				puts("      if (async) { /* SDMA or async compute */");
+			else if (cached)
+				puts("      } else if (cached) { /* gfx ring */");
+			else
+				puts("      } else { /* gfx ring - uncached */");
+
+			/* The list of best chosen methods. */
+			struct si_result *methods[32];
+			unsigned method_max_size[32];
+			unsigned num_methods = 0;
+
+			for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
+				/* Find the best method. */
+				struct si_result *best = NULL;
+
+				for (unsigned i = 0; i < NUM_METHODS; i++) {
+					struct si_result *r = &results[util_logbase2(size)][placement][i];
+
+					if (!r->is_valid)
+						continue;
+
+					/* Ban CP DMA clears via MC on <= VI. They are super slow
+					 * on GTT, which we can get due to BO evictions.
+					 */
+					if (sctx->chip_class <= VI && placement == 1 &&
+					    r->is_cp && r->cache_policy == L2_BYPASS)
+						continue;
+
+					if (async) {
+						/* The following constraints for compute IBs try to limit
+						 * resource usage so as not to decrease the performance
+						 * of gfx IBs too much.
+						 */
+
+						/* Don't use CP DMA on asynchronous rings, because
+						 * the engine is shared with gfx IBs.
+						 */
+						if (r->is_cp)
+							continue;
+
+						/* Don't use L2 caching on asynchronous rings to minimize
+						 * L2 usage.
+						 */
+						if (r->cache_policy == L2_LRU)
+							continue;
+
+						/* Asynchronous compute recommends waves_per_sh != 0
+						 * to limit CU usage. */
+						if (r->is_cs && r->waves_per_sh == 0)
+							continue;
+					} else {
+						/* SDMA is always asynchronous */
+						if (r->is_sdma)
+							continue;
+
+						if (cached && r->cache_policy == L2_BYPASS)
+							continue;
+						if (!cached && r->cache_policy == L2_LRU)
+							continue;
+					}
+
+					if (!best) {
+						best = r;
+						continue;
+					}
+
+					/* Assume some measurement error. Earlier methods occupy fewer
+					 * resources, so the next method is always more greedy, and we
+					 * don't want to select it due to a measurement error.
+					 */
+					double min_improvement = 1.03;
+
+					if (best->score * min_improvement < r->score)
+						best = r;
+				}
+
+				if (num_methods > 0) {
+					unsigned prev_index = num_methods - 1;
+					struct si_result *prev = methods[prev_index];
+					struct si_result *prev_this_size = &results[util_logbase2(size)][placement][prev->index];
+
+					/* If the best one is also the best for the previous size,
+					 * just bump the size for the previous one.
+					 *
+					 * If there is no best, it means all methods were too slow
+					 * for this size and were not tested. Use the best one for
+					 * the previous size.
+					 */
+					if (!best ||
+					    /* If it's the same method as for the previous size: */
+					    (prev->is_cp == best->is_cp &&
+					     prev->is_sdma == best->is_sdma &&
+					     prev->is_cs == best->is_cs &&
+					     prev->cache_policy == best->cache_policy &&
+					     prev->dwords_per_thread == best->dwords_per_thread &&
+					     prev->waves_per_sh == best->waves_per_sh) ||
+					    /* If the method for the previous size is also the best
+					     * for this size: */
+					    (prev_this_size->is_valid &&
+					     prev_this_size->score * 1.03 > best->score)) {
+						method_max_size[prev_index] = size;
+						continue;
+					}
+				}
+
+				/* Add it to the list. */
+				assert(num_methods < ARRAY_SIZE(methods));
+				methods[num_methods] = best;
+				method_max_size[num_methods] = size;
+				num_methods++;
+			}
+
+			for (unsigned i = 0; i < num_methods; i++) {
+				struct si_result *best = methods[i];
+				unsigned size = method_max_size[i];
+
+				/* The size threshold is between the current benchmarked
+				 * size and the next benchmarked size. */
+				if (i < num_methods - 1)
+					printf("         if (size <= %9u) ", (size + (size << SIZE_SHIFT)) / 2);
+				else if (i > 0)
+					printf("         else                   ");
+				else
+					printf("         ");
+				printf("return ");
+
+				assert(best);
+				if (best->is_cp) {
+					printf("CP_DMA(%s);\n",
+					       best->cache_policy == L2_BYPASS ? "L2_BYPASS" :
+					       best->cache_policy == L2_LRU ?    "L2_LRU   " : "L2_STREAM");
+				}
+				if (best->is_sdma)
+					printf("SDMA;\n");
+				if (best->is_cs) {
+					printf("COMPUTE(%s, %u, %u);\n",
+					       best->cache_policy == L2_LRU ? "L2_LRU   " : "L2_STREAM",
+					       best->dwords_per_thread,
+					       best->waves_per_sh);
+				}
+			}
+		}
+		puts("      }");
+	}
+	puts("   }");
+	puts("}");
+
+	ctx->destroy(ctx);
+	exit(0);
+}
-- 
2.7.4