From 726a48c94f49010d9b4d48bfc5efeabae77825b2 Mon Sep 17 00:00:00 2001 From: Darren Powell Date: Wed, 13 Jun 2018 18:54:24 -0400 Subject: [PATCH] radeonsi: add new R600_DEBUG test "testclearbufperf" MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Signed-off-by: Darren Powell Signed-off-by: Marek Olšák --- src/gallium/drivers/radeonsi/Makefile.sources | 1 + src/gallium/drivers/radeonsi/meson.build | 1 + src/gallium/drivers/radeonsi/si_clear.c | 6 +- src/gallium/drivers/radeonsi/si_cp_dma.c | 12 +- src/gallium/drivers/radeonsi/si_pipe.c | 7 +- src/gallium/drivers/radeonsi/si_pipe.h | 11 +- src/gallium/drivers/radeonsi/si_test_clearbuffer.c | 140 +++++++++++++++++++++ src/gallium/drivers/radeonsi/si_test_dma.c | 3 +- 8 files changed, 170 insertions(+), 11 deletions(-) create mode 100644 src/gallium/drivers/radeonsi/si_test_clearbuffer.c diff --git a/src/gallium/drivers/radeonsi/Makefile.sources b/src/gallium/drivers/radeonsi/Makefile.sources index f760b5b..c052e8d 100644 --- a/src/gallium/drivers/radeonsi/Makefile.sources +++ b/src/gallium/drivers/radeonsi/Makefile.sources @@ -43,6 +43,7 @@ C_SOURCES := \ si_state_streamout.c \ si_state_viewport.c \ si_state.h \ + si_test_clearbuffer.c \ si_test_dma.c \ si_texture.c \ si_uvd.c \ diff --git a/src/gallium/drivers/radeonsi/meson.build b/src/gallium/drivers/radeonsi/meson.build index 9049839..9e249ad 100644 --- a/src/gallium/drivers/radeonsi/meson.build +++ b/src/gallium/drivers/radeonsi/meson.build @@ -59,6 +59,7 @@ files_libradeonsi = files( 'si_state_shaders.c', 'si_state_streamout.c', 'si_state_viewport.c', + 'si_test_clearbuffer.c', 'si_test_dma.c', 'si_texture.c', 'si_uvd.c', diff --git a/src/gallium/drivers/radeonsi/si_clear.c b/src/gallium/drivers/radeonsi/si_clear.c index 4e07de8..654ff0a 100644 --- a/src/gallium/drivers/radeonsi/si_clear.c +++ b/src/gallium/drivers/radeonsi/si_clear.c @@ -256,7 +256,7 @@ void vi_dcc_clear_level(struct si_context *sctx, } 
si_clear_buffer(sctx, dcc_buffer, dcc_offset, clear_size, - clear_value, SI_COHERENCY_CB_META); + clear_value, SI_COHERENCY_CB_META, SI_METHOD_BEST); } /* Set the same micro tile mode as the destination of the last MSAA resolve. @@ -489,7 +489,7 @@ static void si_do_fast_color_clear(struct si_context *sctx, si_clear_buffer(sctx, &tex->cmask_buffer->b.b, tex->cmask_offset, tex->surface.cmask_size, - 0xCCCCCCCC, SI_COHERENCY_CB_META); + 0xCCCCCCCC, SI_COHERENCY_CB_META, SI_METHOD_BEST); need_decompress_pass = true; } @@ -520,7 +520,7 @@ static void si_do_fast_color_clear(struct si_context *sctx, /* Do the fast clear. */ si_clear_buffer(sctx, &tex->cmask_buffer->b.b, tex->cmask_offset, tex->surface.cmask_size, 0, - SI_COHERENCY_CB_META); + SI_COHERENCY_CB_META, SI_METHOD_BEST); need_decompress_pass = true; } diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c index f98fad4..b013332 100644 --- a/src/gallium/drivers/radeonsi/si_cp_dma.c +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c @@ -215,7 +215,7 @@ static void si_cp_dma_prepare(struct si_context *sctx, struct pipe_resource *dst void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, uint64_t offset, uint64_t size, unsigned value, - enum si_coherency coher) + enum si_coherency coher, enum si_method xfer ) { struct radeon_winsys *ws = sctx->ws; struct r600_resource *rdst = r600_resource(dst); @@ -227,7 +227,7 @@ void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, if (!size) return; - dma_clear_size = size & ~3ull; + dma_clear_size = size & ~3ull; /* Mark the buffer range of destination as valid (initialized), * so that transfer_map knows it should wait for the GPU when mapping @@ -250,7 +250,9 @@ void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, * For example, DeusEx:MD has 21 buffer clears per frame and all * of them are moved to SDMA thanks to this. 
*/ !ws->cs_is_buffer_referenced(sctx->gfx_cs, rdst->buf, - RADEON_USAGE_READWRITE))) { + RADEON_USAGE_READWRITE)) && + /* bypass sdma transfer with param xfer */ + (xfer != SI_METHOD_CP_DMA)) { sctx->dma_clear_buffer(sctx, dst, offset, dma_clear_size, value); offset += dma_clear_size; @@ -263,7 +265,7 @@ void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, /* Flush the caches. */ sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | - SI_CONTEXT_CS_PARTIAL_FLUSH | flush_flags; + SI_CONTEXT_CS_PARTIAL_FLUSH | flush_flags; while (dma_clear_size) { unsigned byte_count = MIN2(dma_clear_size, cp_dma_max_byte_count(sctx)); @@ -356,7 +358,7 @@ static void si_pipe_clear_buffer(struct pipe_context *ctx, } si_clear_buffer(sctx, dst, offset, size, dword_value, - SI_COHERENCY_SHADER); + SI_COHERENCY_SHADER, SI_METHOD_BEST); } /** diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index cc05d2f..e9cf1c3 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -101,6 +101,7 @@ static const struct debug_named_value debug_options[] = { { "testvmfaultcp", DBG(TEST_VMFAULT_CP), "Invoke a CP VM fault test and exit." }, { "testvmfaultsdma", DBG(TEST_VMFAULT_SDMA), "Invoke a SDMA VM fault test and exit." }, { "testvmfaultshader", DBG(TEST_VMFAULT_SHADER), "Invoke a shader VM fault test and exit." }, + { "testclearbufperf", DBG(TEST_CLEARBUF_PERF), "Test Clearbuffer Performance" }, DEBUG_NAMED_VALUE_END /* must be last */ }; @@ -545,7 +546,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, /* Clear the NULL constant buffer, because loads should return zeros. 
*/ si_clear_buffer(sctx, sctx->null_const_buf.buffer, 0, sctx->null_const_buf.buffer->width0, 0, - SI_COHERENCY_SHADER); + SI_COHERENCY_SHADER, SI_METHOD_BEST); } uint64_t max_threads_per_block; @@ -1069,6 +1070,10 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws, if (sscreen->debug_flags & DBG(TEST_DMA)) si_test_dma(sscreen); + if (sscreen->debug_flags & DBG(TEST_CLEARBUF_PERF)) { + si_test_clearbuffer(sscreen); + } + if (sscreen->debug_flags & (DBG(TEST_VMFAULT_CP) | DBG(TEST_VMFAULT_SDMA) | DBG(TEST_VMFAULT_SHADER))) diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 9ab79bc..7bfc9f5 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -165,6 +165,7 @@ enum { DBG_TEST_VMFAULT_CP, DBG_TEST_VMFAULT_SDMA, DBG_TEST_VMFAULT_SHADER, + DBG_TEST_CLEARBUF_PERF, }; #define DBG_ALL_SHADERS (((1 << (DBG_CS + 1)) - 1)) @@ -1110,10 +1111,15 @@ enum si_coherency { SI_COHERENCY_CB_META, }; +enum si_method { /* transfer-path selector passed as the xfer argument of si_clear_buffer */ + SI_METHOD_CP_DMA, /* si_clear_buffer skips its SDMA branch for this value */ + SI_METHOD_BEST, /* si_clear_buffer's SDMA branch stays eligible; its other conditions decide */ +}; + void si_cp_dma_wait_for_idle(struct si_context *sctx); void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, uint64_t offset, uint64_t size, unsigned value, - enum si_coherency coher); + enum si_coherency coher, enum si_method xfer); void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct pipe_resource *src, uint64_t dst_offset, uint64_t src_offset, unsigned size, @@ -1199,6 +1205,9 @@ void si_resume_queries(struct si_context *sctx); /* si_test_dma.c */ void si_test_dma(struct si_screen *sscreen); +/* si_test_clearbuffer.c */ +void si_test_clearbuffer(struct si_screen *sscreen); + /* si_uvd.c */ struct pipe_video_codec *si_uvd_create_decoder(struct pipe_context *context, const struct pipe_video_codec *templ); diff --git a/src/gallium/drivers/radeonsi/si_test_clearbuffer.c b/src/gallium/drivers/radeonsi/si_test_clearbuffer.c new file mode 100644 index 0000000..00fbd2d --- 
/dev/null +++ b/src/gallium/drivers/radeonsi/si_test_clearbuffer.c @@ -0,0 +1,140 @@ +/* + * Copyright 2018 Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +/* This file implements performance tests on the si_clear_buffer function. 
*/ + +#include "si_pipe.h" + +#define CLEARBUF_MIN 32 +#define CLEARBUF_COUNT 16 +#define CLEARBUF_MEMSZ 1024 + +static uint64_t +measure_clearbuf_time(struct pipe_context *ctx, + uint64_t memory_size) +{ /* Creates a memory_size-byte buffer, clears it inside a TIME_ELAPSED query and returns the query's u64 result. */ + struct pipe_query *query_te; + union pipe_query_result qresult; + struct pipe_resource *buf; + + struct si_context *sctx = (struct si_context*)ctx; + struct pipe_screen *screen = ctx->screen; + + buf = pipe_buffer_create(screen, 0, PIPE_USAGE_DEFAULT, memory_size); + + query_te = ctx->create_query(ctx, PIPE_QUERY_TIME_ELAPSED, 0); + + ctx->begin_query(ctx, query_te); + /* operation: SI_METHOD_CP_DMA keeps si_clear_buffer off its SDMA branch */ + si_clear_buffer(sctx, buf, 0, memory_size, 0x00, + SI_COHERENCY_SHADER, SI_METHOD_CP_DMA); + ctx->end_query(ctx, query_te); + ctx->get_query_result(ctx, query_te, true, &qresult); /* true = wait for the result */ + + /* Cleanup. */ + ctx->destroy_query(ctx, query_te); + pipe_resource_reference(&buf, NULL); + + /* Report Results: elapsed time of the clear, treated as ns by the callers */ + return qresult.u64; +} + +/** + * @brief Analyze rate of clearing a 1K Buffer averaged over 16 iterations + * @param ctx Context of pipe to perform analysis on + */ +static void +analyze_clearbuf_perf_avg(struct pipe_context *ctx) +{ + uint index = 0; + uint64_t result[CLEARBUF_COUNT]; + uint64_t sum = 0; + unsigned long long rate_kBps; /* unsigned so it matches the %llu it is printed with */ + + /* Run Tests. */ + for (index = 0 ; index < CLEARBUF_COUNT ; index++) { + result[index] = measure_clearbuf_time(ctx, CLEARBUF_MEMSZ); + sum += result[index]; + } + + /* Calculate Results. */ + /* kBps = (size(bytes))/(1000) / (time(ns)/(1000*1000*1000)) */ + rate_kBps = CLEARBUF_COUNT*CLEARBUF_MEMSZ; + rate_kBps *= 1000UL*1000UL; + rate_kBps /= sum ? sum : 1; /* a total of 0 ns must not divide by zero */ + + /* Display Results. 
*/ + printf("CP DMA clear_buffer performance (buffer %lu ,repeat %u ):", + (unsigned long)CLEARBUF_MEMSZ, /* cast to the type %lu expects; uint64_t is not unsigned long everywhere */ + CLEARBUF_COUNT ); + printf(" %llu kB/s\n", (unsigned long long)rate_kBps ); /* cast to the type %llu expects */ +} + +/** + * @brief Analyze rate of clearing a range of Buffer sizes + * @param ctx Context of pipe to perform analysis on + */ +static void +analyze_clearbuf_perf_rng(struct pipe_context *ctx) +{ + uint index = 0; + uint64_t result[CLEARBUF_COUNT]; + uint64_t mem_size; + long long int rate_kBps; + + /* Run Tests. */ + mem_size = CLEARBUF_MIN; + for (index = 0 ; index < CLEARBUF_COUNT ; index++ ) { + result[index] = measure_clearbuf_time(ctx, mem_size); + mem_size <<= 1; /* double the buffer size for the next sample */ + } + + /* Calculate & Display Results. */ + /* kBps = (size(bytes))/(1000) / (time(ns)/(1000*1000*1000)) */ + mem_size = CLEARBUF_MIN; + for (index = 0 ; index < CLEARBUF_COUNT ; index++ ) { + rate_kBps = mem_size; + rate_kBps *= 1000UL*1000UL; + rate_kBps /= result[index] ? result[index] : 1; /* a 0 ns sample must not divide by zero */ + + printf("CP DMA clear_buffer performance (buffer %llu):", + (unsigned long long)mem_size ); /* uint64_t printed portably via %llu */ + printf(" %llu kB/s\n", (unsigned long long)rate_kBps ); /* cast to the type %llu expects */ + + mem_size <<= 1; /* replay the same size sequence as the measurement loop */ + } +} + +void si_test_clearbuffer(struct si_screen *sscreen) +{ /* Runs both clear-buffer benchmarks on a newly created context, then exits the process. */ + struct pipe_screen *screen = &sscreen->b; + struct pipe_context *ctx = screen->context_create(screen, NULL, 0); + + analyze_clearbuf_perf_avg(ctx); + analyze_clearbuf_perf_rng(ctx); + + exit(0); +} + diff --git a/src/gallium/drivers/radeonsi/si_test_dma.c b/src/gallium/drivers/radeonsi/si_test_dma.c index ee6ab7c..baab580 100644 --- a/src/gallium/drivers/radeonsi/si_test_dma.c +++ b/src/gallium/drivers/radeonsi/si_test_dma.c @@ -307,7 +307,8 @@ void si_test_dma(struct si_screen *sscreen) set_random_pixels(ctx, src, &src_cpu); /* clear dst pixels */ - si_clear_buffer(sctx, dst, 0, sdst->surface.surf_size, 0, true); + si_clear_buffer(sctx, dst, 0, sdst->surface.surf_size, 0, + true, SI_METHOD_BEST); memset(dst_cpu.ptr, 0, dst_cpu.layer_stride * tdst.array_size); /* preparation */ -- 2.7.4