radeonsi: update the DMA perf test

author Marek Olšák <marek.olsak@amd.com>

Mon, 5 Oct 2020 04:55:55 +0000 (00:55 -0400)

committer Marge Bot <eric+marge@anholt.net>

Fri, 9 Oct 2020 23:13:40 +0000 (23:13 +0000)
author Marek Olšák <marek.olsak@amd.com>
Mon, 5 Oct 2020 04:55:55 +0000 (00:55 -0400)
committer Marge Bot <eric+marge@anholt.net>
Fri, 9 Oct 2020 23:13:40 +0000 (23:13 +0000)
diff --git a/src/gallium/drivers/radeonsi/si_test_dma_perf.c b/src/gallium/drivers/radeonsi/si_test_dma_perf.c

index 1820899..0ca8dce 100644 (file)
--- a/src/gallium/drivers/radeonsi/si_test_dma_perf.c
+++ b/src/gallium/drivers/radeonsi/si_test_dma_perf.c
@@ -45,7 +45,7 @@ void si_test_dma_perf(struct si_screen *sscreen)
     struct si_context *sctx = (struct si_context *)ctx;
     const uint32_t clear_value = 0x12345678;
     static const unsigned cs_dwords_per_thread_list[] = {64, 32, 16, 8, 4, 2, 1};
-   static const unsigned cs_waves_per_sh_list[] = {0, 2, 4, 8, 16};
+   static const unsigned cs_waves_per_sh_list[] = {0, 4, 8, 16};
  
  #define NUM_SHADERS ARRAY_SIZE(cs_dwords_per_thread_list)
  #define NUM_METHODS (4 + 3 * NUM_SHADERS * ARRAY_SIZE(cs_waves_per_sh_list))
@@ -142,6 +142,12 @@ void si_test_dma_perf(struct si_screen *sscreen)
           else
              printf("  ,");
  
+         void *compute_shader = NULL;
+         if (test_cs) {
+            compute_shader = si_create_dma_compute_shader(ctx, cs_dwords_per_thread,
+                                              cache_policy == L2_STREAM, is_copy);
+         }
+
           double score = 0;
           for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
              /* Don't test bigger sizes if it's too slow. Print 0. */
@@ -152,7 +158,6 @@ void si_test_dma_perf(struct si_screen *sscreen)
  
              enum pipe_resource_usage dst_usage, src_usage;
              struct pipe_resource *dst, *src;
-            struct pipe_query *q[NUM_RUNS];
              unsigned query_type = PIPE_QUERY_TIME_ELAPSED;
              unsigned flags = cache_policy == L2_BYPASS ? SI_RESOURCE_FLAG_UNCACHED : 0;
  
@@ -176,11 +181,17 @@ void si_test_dma_perf(struct si_screen *sscreen)
              dst = pipe_aligned_buffer_create(screen, flags, dst_usage, size, 256);
              src = is_copy ? pipe_aligned_buffer_create(screen, flags, src_usage, size, 256) : NULL;
  
+            /* Wait for idle before testing, so that other processes don't mess up the results. */
+            sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
+                           SI_CONTEXT_FLUSH_AND_INV_CB |
+                           SI_CONTEXT_FLUSH_AND_INV_DB;
+            sctx->emit_cache_flush(sctx);
+
+            struct pipe_query *q = ctx->create_query(ctx, query_type, 0);
+            ctx->begin_query(ctx, q);
+
              /* Run tests. */
              for (unsigned iter = 0; iter < NUM_RUNS; iter++) {
-               q[iter] = ctx->create_query(ctx, query_type, 0);
-               ctx->begin_query(ctx, q[iter]);
-
                 if (test_cp) {
                    /* CP DMA */
                    if (is_copy) {
@@ -210,9 +221,6 @@ void si_test_dma_perf(struct si_screen *sscreen)
                    unsigned num_dwords = size / 4;
                    unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);
  
-                  void *cs = si_create_dma_compute_shader(ctx, cs_dwords_per_thread,
-                                                          cache_policy == L2_STREAM, is_copy);
-
                    struct pipe_grid_info info = {};
                    info.block[0] = MIN2(64, num_instructions);
                    info.block[1] = 1;
@@ -233,48 +241,39 @@ void si_test_dma_perf(struct si_screen *sscreen)
                          sctx->cs_user_data[i] = clear_value;
                    }
  
-                  sctx->flags |= SI_CONTEXT_INV_VCACHE | SI_CONTEXT_INV_SCACHE;
-
                    ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, is_copy ? 2 : 1, sb, 0x1);
-                  ctx->bind_compute_state(ctx, cs);
+                  ctx->bind_compute_state(ctx, compute_shader);
                    sctx->cs_max_waves_per_sh = cs_waves_per_sh;
  
                    ctx->launch_grid(ctx, &info);
  
                    ctx->bind_compute_state(ctx, NULL);
-                  ctx->delete_compute_state(ctx, cs);
                    sctx->cs_max_waves_per_sh = 0; /* disable the limit */
-
-                  sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
                 }
  
-               /* Flush L2, so that we don't just test L2 cache performance. */
+               /* Flush L2, so that we don't just test L2 cache performance except for L2_LRU. */
                 if (!test_sdma) {
-                  sctx->flags |= SI_CONTEXT_WB_L2;
+                  sctx->flags |= SI_CONTEXT_INV_VCACHE |
+                                 (cache_policy == L2_LRU ? 0 : SI_CONTEXT_INV_L2) |
+                                 SI_CONTEXT_CS_PARTIAL_FLUSH;
                    sctx->emit_cache_flush(sctx);
                 }
-
-               ctx->end_query(ctx, q[iter]);
-               ctx->flush(ctx, NULL, PIPE_FLUSH_ASYNC);
              }
+
+            ctx->end_query(ctx, q);
+            ctx->flush(ctx, NULL, PIPE_FLUSH_ASYNC);
+
              pipe_resource_reference(&dst, NULL);
              pipe_resource_reference(&src, NULL);
  
              /* Get results. */
-            uint64_t min = ~0ull, max = 0, total = 0;
  
-            for (unsigned iter = 0; iter < NUM_RUNS; iter++) {
-               union pipe_query_result result;
-
-               ctx->get_query_result(ctx, q[iter], true, &result);
-               ctx->destroy_query(ctx, q[iter]);
+            union pipe_query_result result;
  
-               min = MIN2(min, result.u64);
-               max = MAX2(max, result.u64);
-               total += result.u64;
-            }
+            ctx->get_query_result(ctx, q, true, &result);
+            ctx->destroy_query(ctx, q);
  
-            score = get_MBps_rate(size, total / (double)NUM_RUNS);
+            score = get_MBps_rate(size, result.u64 / (double)NUM_RUNS);
              printf("%7.0f ,", score);
              fflush(stdout);
  
@@ -290,6 +289,9 @@ void si_test_dma_perf(struct si_screen *sscreen)
              r->index = method;
           }
           puts("");
+
+         if (compute_shader)
+            ctx->delete_compute_state(ctx, compute_shader);
        }
     }
author	Marek Olšák <marek.olsak@amd.com>
	Mon, 5 Oct 2020 04:55:55 +0000 (00:55 -0400)
committer	Marge Bot <eric+marge@anholt.net>
	Fri, 9 Oct 2020 23:13:40 +0000 (23:13 +0000)