From 7a0162905b157fd0ea8f2489eb52ebbd6224a717 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Mon, 5 Oct 2020 00:55:55 -0400 Subject: [PATCH] radeonsi: update the DMA perf test - don't test 2 waves/SA - create the compute shader only once per subtest - use only 1 TIME_ELAPSED query per subtest - don't invalidate sL0 (it's not used) - don't invalidate L2 for L2_LRU to test L2 throughput - don't flush the CS after every run - remove unused min/max computation Acked-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/si_test_dma_perf.c | 62 +++++++++++++------------ 1 file changed, 32 insertions(+), 30 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_test_dma_perf.c b/src/gallium/drivers/radeonsi/si_test_dma_perf.c index 1820899..0ca8dce 100644 --- a/src/gallium/drivers/radeonsi/si_test_dma_perf.c +++ b/src/gallium/drivers/radeonsi/si_test_dma_perf.c @@ -45,7 +45,7 @@ void si_test_dma_perf(struct si_screen *sscreen) struct si_context *sctx = (struct si_context *)ctx; const uint32_t clear_value = 0x12345678; static const unsigned cs_dwords_per_thread_list[] = {64, 32, 16, 8, 4, 2, 1}; - static const unsigned cs_waves_per_sh_list[] = {0, 2, 4, 8, 16}; + static const unsigned cs_waves_per_sh_list[] = {0, 4, 8, 16}; #define NUM_SHADERS ARRAY_SIZE(cs_dwords_per_thread_list) #define NUM_METHODS (4 + 3 * NUM_SHADERS * ARRAY_SIZE(cs_waves_per_sh_list)) @@ -142,6 +142,12 @@ void si_test_dma_perf(struct si_screen *sscreen) else printf(" ,"); + void *compute_shader = NULL; + if (test_cs) { + compute_shader = si_create_dma_compute_shader(ctx, cs_dwords_per_thread, + cache_policy == L2_STREAM, is_copy); + } + double score = 0; for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) { /* Don't test bigger sizes if it's too slow. Print 0. */ @@ -152,7 +158,6 @@ void si_test_dma_perf(struct si_screen *sscreen) enum pipe_resource_usage dst_usage, src_usage; struct pipe_resource *dst, *src; - struct pipe_query *q[NUM_RUNS]; unsigned query_type = PIPE_QUERY_TIME_ELAPSED; unsigned flags = cache_policy == L2_BYPASS ? SI_RESOURCE_FLAG_UNCACHED : 0; @@ -176,11 +181,17 @@ void si_test_dma_perf(struct si_screen *sscreen) dst = pipe_aligned_buffer_create(screen, flags, dst_usage, size, 256); src = is_copy ? pipe_aligned_buffer_create(screen, flags, src_usage, size, 256) : NULL; + /* Wait for idle before testing, so that other processes don't mess up the results. */ + sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | + SI_CONTEXT_FLUSH_AND_INV_CB | + SI_CONTEXT_FLUSH_AND_INV_DB; + sctx->emit_cache_flush(sctx); + + struct pipe_query *q = ctx->create_query(ctx, query_type, 0); + ctx->begin_query(ctx, q); + /* Run tests. */ for (unsigned iter = 0; iter < NUM_RUNS; iter++) { - q[iter] = ctx->create_query(ctx, query_type, 0); - ctx->begin_query(ctx, q[iter]); - if (test_cp) { /* CP DMA */ if (is_copy) { @@ -210,9 +221,6 @@ void si_test_dma_perf(struct si_screen *sscreen) unsigned num_dwords = size / 4; unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction); - void *cs = si_create_dma_compute_shader(ctx, cs_dwords_per_thread, - cache_policy == L2_STREAM, is_copy); - struct pipe_grid_info info = {}; info.block[0] = MIN2(64, num_instructions); info.block[1] = 1; @@ -233,48 +241,39 @@ void si_test_dma_perf(struct si_screen *sscreen) sctx->cs_user_data[i] = clear_value; } - sctx->flags |= SI_CONTEXT_INV_VCACHE | SI_CONTEXT_INV_SCACHE; - ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, is_copy ? 2 : 1, sb, 0x1); - ctx->bind_compute_state(ctx, cs); + ctx->bind_compute_state(ctx, compute_shader); sctx->cs_max_waves_per_sh = cs_waves_per_sh; ctx->launch_grid(ctx, &info); ctx->bind_compute_state(ctx, NULL); - ctx->delete_compute_state(ctx, cs); sctx->cs_max_waves_per_sh = 0; /* disable the limit */ - - sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH; } - /* Flush L2, so that we don't just test L2 cache performance. */ + /* Flush L2, so that we don't just test L2 cache performance except for L2_LRU. */ if (!test_sdma) { - sctx->flags |= SI_CONTEXT_WB_L2; + sctx->flags |= SI_CONTEXT_INV_VCACHE | + (cache_policy == L2_LRU ? 0 : SI_CONTEXT_INV_L2) | + SI_CONTEXT_CS_PARTIAL_FLUSH; sctx->emit_cache_flush(sctx); } - - ctx->end_query(ctx, q[iter]); - ctx->flush(ctx, NULL, PIPE_FLUSH_ASYNC); } + + ctx->end_query(ctx, q); + ctx->flush(ctx, NULL, PIPE_FLUSH_ASYNC); + pipe_resource_reference(&dst, NULL); pipe_resource_reference(&src, NULL); /* Get results. */ - uint64_t min = ~0ull, max = 0, total = 0; - for (unsigned iter = 0; iter < NUM_RUNS; iter++) { - union pipe_query_result result; - - ctx->get_query_result(ctx, q[iter], true, &result); - ctx->destroy_query(ctx, q[iter]); + union pipe_query_result result; - min = MIN2(min, result.u64); - max = MAX2(max, result.u64); - total += result.u64; - } + ctx->get_query_result(ctx, q, true, &result); + ctx->destroy_query(ctx, q); - score = get_MBps_rate(size, total / (double)NUM_RUNS); + score = get_MBps_rate(size, result.u64 / (double)NUM_RUNS); printf("%7.0f ,", score); fflush(stdout); @@ -290,6 +289,9 @@ void si_test_dma_perf(struct si_screen *sscreen) r->index = method; } puts(""); + + if (compute_shader) + ctx->delete_compute_state(ctx, compute_shader); } } -- 2.7.4