From 5b96689fa050c08a8422716d8c54da1611c1745a Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Tue, 9 Mar 2021 15:50:26 -0800 Subject: [PATCH] freedreno: Autotune bypass vs GMEM rendering decision In some cases, like gl_driver2, we have all the characteristics that make our current simplistic bypass vs GMEM decision pick GMEM (ie. batch starts with a clear, has blend enabled, has a high draw count, etc), but each draw touches very few pixels and the per-tile state-change overhead leaves us CP limited. We would be better in this case picking the bypass path. So use feedback from # of samples-passed in previous render passes to the same FBO to give us a bit more information to make better choices. Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/2798 Signed-off-by: Rob Clark Part-of: --- src/gallium/drivers/freedreno/Makefile.sources | 2 + src/gallium/drivers/freedreno/a6xx/fd6_gmem.c | 52 ++++ src/gallium/drivers/freedreno/freedreno_autotune.c | 261 +++++++++++++++++++++ src/gallium/drivers/freedreno/freedreno_autotune.h | 177 ++++++++++++++ src/gallium/drivers/freedreno/freedreno_batch.h | 8 + src/gallium/drivers/freedreno/freedreno_context.c | 4 + src/gallium/drivers/freedreno/freedreno_context.h | 3 + src/gallium/drivers/freedreno/freedreno_gmem.c | 6 +- src/gallium/drivers/freedreno/meson.build | 2 + 9 files changed, 511 insertions(+), 4 deletions(-) create mode 100644 src/gallium/drivers/freedreno/freedreno_autotune.c create mode 100644 src/gallium/drivers/freedreno/freedreno_autotune.h diff --git a/src/gallium/drivers/freedreno/Makefile.sources b/src/gallium/drivers/freedreno/Makefile.sources index e56ba3a..dd36a0a 100644 --- a/src/gallium/drivers/freedreno/Makefile.sources +++ b/src/gallium/drivers/freedreno/Makefile.sources @@ -1,5 +1,7 @@ C_SOURCES := \ disasm.h \ + freedreno_autotune.c \ + freedreno_autotune.h \ freedreno_batch.c \ freedreno_batch.h \ freedreno_batch_cache.c \ diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_gmem.c 
b/src/gallium/drivers/freedreno/a6xx/fd6_gmem.c index 3a9ea60..7fdc0eb 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_gmem.c +++ b/src/gallium/drivers/freedreno/a6xx/fd6_gmem.c @@ -471,6 +471,50 @@ check_vsc_overflow(struct fd_context *ctx) } } +static void +emit_common_init(struct fd_batch *batch) +{ + struct fd_ringbuffer *ring = batch->gmem; + struct fd_autotune *at = &batch->ctx->autotune; + struct fd_batch_result *result = batch->autotune_result; + + if (!result) + return; + + OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNT_CONTROL, 1); + OUT_RING(ring, A6XX_RB_SAMPLE_COUNT_CONTROL_COPY); + + OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNT_ADDR, 2); + OUT_RELOC(ring, results_ptr(at, result[result->idx].samples_start)); + + fd6_event_write(batch, ring, ZPASS_DONE, false); +} + +static void +emit_common_fini(struct fd_batch *batch) +{ + struct fd_ringbuffer *ring = batch->gmem; + struct fd_autotune *at = &batch->ctx->autotune; + struct fd_batch_result *result = batch->autotune_result; + + if (!result) + return; + + OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNT_CONTROL, 1); + OUT_RING(ring, A6XX_RB_SAMPLE_COUNT_CONTROL_COPY); + + OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNT_ADDR, 2); + OUT_RELOC(ring, results_ptr(at, result[result->idx].samples_end)); + + fd6_event_write(batch, ring, ZPASS_DONE, false); + + // TODO is there a better event to use.. a single ZPASS_DONE_TS would be nice + OUT_PKT7(ring, CP_EVENT_WRITE, 4); + OUT_RING(ring, CP_EVENT_WRITE_0_EVENT(CACHE_FLUSH_TS)); + OUT_RELOC(ring, results_ptr(at, fence)); + OUT_RING(ring, result->fence); +} + /* * Emit conditional CP_INDIRECT_BRANCH based on VSC_STATE[p], ie. the IB * is skipped for tiles that have no visible geometry. 
@@ -731,6 +775,8 @@ fd6_emit_tile_init(struct fd_batch *batch) } update_render_cntl(batch, pfb, false); + + emit_common_init(batch); } static void @@ -1316,6 +1362,8 @@ fd6_emit_tile_fini(struct fd_batch *batch) { struct fd_ringbuffer *ring = batch->gmem; + emit_common_fini(batch); + OUT_PKT4(ring, REG_A6XX_GRAS_LRZ_CNTL, 1); OUT_RING(ring, A6XX_GRAS_LRZ_CNTL_ENABLE); @@ -1479,6 +1527,8 @@ fd6_emit_sysmem_prep(struct fd_batch *batch) emit_msaa(ring, pfb->samples); update_render_cntl(batch, pfb, false); + + emit_common_init(batch); } static void @@ -1486,6 +1536,8 @@ fd6_emit_sysmem_fini(struct fd_batch *batch) { struct fd_ringbuffer *ring = batch->gmem; + emit_common_fini(batch); + if (batch->epilogue) fd6_emit_ib(batch->gmem, batch->epilogue); diff --git a/src/gallium/drivers/freedreno/freedreno_autotune.c b/src/gallium/drivers/freedreno/freedreno_autotune.c new file mode 100644 index 0000000..1aff434 --- /dev/null +++ b/src/gallium/drivers/freedreno/freedreno_autotune.c @@ -0,0 +1,261 @@ +/* + * Copyright © 2021 Google, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "freedreno_autotune.h" +#include "freedreno_batch.h" +#include "freedreno_util.h" + + +/** + * Tracks the recent results for a given batch key (which maps to a FBO/framebuffer state). + * + * ralloc parent is fd_autotune::ht + */ +struct fd_batch_history { + struct fd_batch_key *key; + + /* Entry in fd_autotune::lru: */ + struct list_head node; + + unsigned num_results; + + /** + * List of recent fd_batch_result's + */ + struct list_head results; +#define MAX_RESULTS 5 +}; + + +static struct fd_batch_history * +get_history(struct fd_autotune *at, struct fd_batch *batch) +{ + struct fd_batch_history *history; + + if (!batch->key) + return NULL; + + struct hash_entry *entry = + _mesa_hash_table_search_pre_hashed(at->ht, batch->hash, batch->key); + + if (entry) { + history = entry->data; + goto found; + } + + history = rzalloc_size(at->ht, sizeof(*history)); + + history->key = fd_batch_key_clone(history, batch->key); + list_inithead(&history->node); + list_inithead(&history->results); + + /* Note: We cap # of cached GMEM states at 20.. 
so assuming double- + * buffering, 40 should be a good place to cap cached autotune state + */ + if (at->ht->entries >= 40) { + struct fd_batch_history *last = + list_last_entry(&at->lru, struct fd_batch_history, node); + _mesa_hash_table_remove_key(at->ht, last->key); + list_del(&last->node); + ralloc_free(last); + } + + _mesa_hash_table_insert_pre_hashed(at->ht, batch->hash, history->key, history); + +found: + /* Move to the head of the LRU: */ + list_delinit(&history->node); + list_add(&history->node, &at->lru); + + return history; +} + +static void +result_destructor(void *r) +{ + struct fd_batch_result *result = r; + + /* Just in case we manage to somehow still be on the pending_results list: */ + list_del(&result->node); +} + +static struct fd_batch_result * +get_result(struct fd_autotune *at, struct fd_batch_history *history) +{ + struct fd_batch_result *result = rzalloc_size(history, sizeof(*result)); + + result->fence = ++at->fence_counter; /* pre-increment so zero isn't valid fence */ + result->idx = at->idx_counter++; + + if (at->idx_counter >= ARRAY_SIZE(at->results->result)) + at->idx_counter = 0; + + result->history = history; + list_addtail(&result->node, &at->pending_results); + + ralloc_set_destructor(result, result_destructor); + + return result; +} + +static void +process_results(struct fd_autotune *at) +{ + uint32_t current_fence = at->results->fence; + + list_for_each_entry_safe (struct fd_batch_result, result, &at->pending_results, node) { + if (result->fence > current_fence) + break; + + struct fd_batch_history *history = result->history; + + result->samples_passed = at->results->result[result->idx].samples_end - + at->results->result[result->idx].samples_start; + + list_delinit(&result->node); + list_add(&result->node, &history->results); + + if (history->num_results < MAX_RESULTS) { + history->num_results++; + } else { + /* Once above a limit, start popping old results off the + * tail of the list: + */ + struct fd_batch_result *old_result 
= + list_last_entry(&history->results, struct fd_batch_result, node); + list_delinit(&old_result->node); + ralloc_free(old_result); + } + } +} + +static bool +fallback_use_bypass(struct fd_batch *batch) +{ + struct pipe_framebuffer_state *pfb = &batch->framebuffer; + + /* Fallback logic if we have no historical data about the rendertarget: */ + if (batch->cleared || batch->gmem_reason || + ((batch->num_draws > 5) && !batch->blit) || + (pfb->samples > 1)) { + return false; + } + + return true; +} + +/** + * A magic 8-ball that tells the gmem code whether we should do bypass mode + * for moar fps. + */ +bool +fd_autotune_use_bypass(struct fd_autotune *at, struct fd_batch *batch) +{ + struct pipe_framebuffer_state *pfb = &batch->framebuffer; + + process_results(at); + + /* Only enable on gens that opt-in (and actually have sample-passed + * collection wired up): + */ + if (!batch->ctx->screen->gmem_reason_mask) + return fallback_use_bypass(batch); + + if (batch->gmem_reason & ~batch->ctx->screen->gmem_reason_mask) + return fallback_use_bypass(batch); + + for (unsigned i = 0; i < pfb->nr_cbufs; i++) { + /* If ms-rtt is involved, force GMEM, as we don't currently + * implement a temporary render target that we can MSAA resolve + * from + */ + if (pfb->cbufs[i] && pfb->cbufs[i]->nr_samples) + return fallback_use_bypass(batch); + } + + struct fd_batch_history *history = get_history(at, batch); + if (!history) + return fallback_use_bypass(batch); + + batch->autotune_result = get_result(at, history); + batch->autotune_result->cost = batch->cost; + + bool use_bypass = fallback_use_bypass(batch); + + if (use_bypass) + return true; + + if (history->num_results > 0) { + uint32_t total_samples = 0; + + // TODO we should account for clears somehow + // TODO should we try to notice if there is a drastic change from + // frame to frame? 
+ list_for_each_entry (struct fd_batch_result, result, &history->results, node) { + total_samples += result->samples_passed; + } + + float avg_samples = (float)total_samples / (float)history->num_results; + + /* Low sample count could mean there was only a clear.. or there was + * a clear plus draws that touch no or few samples + */ + if (avg_samples < 500.0) + return true; + + /* Cost-per-sample is an estimate for the average number of reads+ + * writes for a given passed sample. + */ + float sample_cost = batch->cost; + sample_cost /= batch->num_draws; + + float total_draw_cost = (avg_samples * sample_cost) / batch->num_draws; + DBG("%08x:%u\ttotal_samples=%u, avg_samples=%f, sample_cost=%f, total_draw_cost=%f\n", + batch->hash, batch->num_draws, total_samples, avg_samples, sample_cost, total_draw_cost); + + if (total_draw_cost < 3000.0) + return true; + } + + return use_bypass; +} + +void +fd_autotune_init(struct fd_autotune *at, struct fd_device *dev) +{ + at->ht = _mesa_hash_table_create(NULL, fd_batch_key_hash, fd_batch_key_equals); + list_inithead(&at->lru); + + at->results_mem = fd_bo_new(dev, sizeof(struct fd_autotune_results), + DRM_FREEDRENO_GEM_TYPE_KMEM, "autotune"); + at->results = fd_bo_map(at->results_mem); + + list_inithead(&at->pending_results); +} + +void +fd_autotune_fini(struct fd_autotune *at) +{ + _mesa_hash_table_destroy(at->ht, NULL); + fd_bo_del(at->results_mem); +} diff --git a/src/gallium/drivers/freedreno/freedreno_autotune.h b/src/gallium/drivers/freedreno/freedreno_autotune.h new file mode 100644 index 0000000..373af58 --- /dev/null +++ b/src/gallium/drivers/freedreno/freedreno_autotune.h @@ -0,0 +1,177 @@ +/* + * Copyright © 2021 Google, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef FREEDRENO_AUTOTUNE_H +#define FREEDRENO_AUTOTUNE_H + +#include "util/hash_table.h" +#include "util/list.h" + +#include "freedreno_util.h" + +struct fd_autotune_results; + +/** + * "autotune" our decisions about bypass vs GMEM rendering, based on historical + * data about a given render target. + * + * In deciding which path to take there are tradeoffs, including some that + * are not reasonably estimateable without having some additional information: + * + * (1) If you know you are touching every pixel (ie. there is a glClear()), + * then the GMEM path will at least not cost more memory bandwidth than + * sysmem[1] + * + * (2) If there is no clear, GMEM could potentially cost *more* bandwidth + * due to sysmem->GMEM restore pass. 
+ * + * (3) If you see a high draw count, that is an indication that there will be + * enough pixels accessed multiple times to benefit from the reduced + * memory bandwidth that GMEM brings + * + * (4) But high draw count where there is not much overdraw can actually be + * faster in bypass mode if it is pushing a lot of state change, due to + * not having to go thru the state changes per-tile[2] + * + * The approach taken is to measure the samples-passed for the batch to estimate + * the amount of overdraw to detect cases where the number of pixels touched is + * low. + * + * Note however, that (at least since a5xx) we have PERF_RB_{Z,C}_{READ,WRITE} + * performance countables, which give a more direct measurement of what we want + * to know (ie. is framebuffer memory access high enough to prefer GMEM), but + * with the downside of consuming half of the available RB counters. With the + * additional complication that external perfcntr collection (fdperf, perfetto) + * and the driver could be stomping on each other's feet. (Also reading the + * perfcntrs accurately requires a WFI.) + * + * [1] ignoring UBWC + * [2] ignoring early-tile-exit optimizations, but any draw that touches all/ + * most of the tiles late in the tile-pass can defeat that + */ +struct fd_autotune { + + /** + * Cache to map batch->key (also used for batch-cache) to historical + * information about rendering to that particular render target. + */ + struct hash_table *ht; + + /** + * List of recently used historical results (to age out old results) + */ + struct list_head lru; + + /** + * GPU buffer used to communicate back results to the CPU + */ + struct fd_bo *results_mem; + struct fd_autotune_results *results; + + /** + * List of per-batch results that we are waiting for the GPU to finish + * with before reading back the results. 
+ */ + struct list_head pending_results; + + uint32_t fence_counter; + uint32_t idx_counter; +}; + +/** + * The layout of the memory used to read back per-batch results from the + * GPU + * + * Note this struct is intentionally aligned to 4k. And hw requires the + * sample start/stop locations to be 128b aligned. + */ +struct fd_autotune_results { + + /** + * The GPU writes back a "fence" seqno value from the cmdstream after + * it finishes writing its result slot, so that the CPU knows when + * results are valid + */ + uint32_t fence; + + uint32_t __pad0; + uint64_t __pad1; + + /** + * From the cmdstream, the captured samples-passed values are recorded + * at the start and end of the batch. + * + * Note that we do the math on the CPU to avoid a WFI. But pre-emption + * may force us to revisit that. + */ + struct { + uint64_t samples_start; + uint64_t __pad0; + uint64_t samples_end; + uint64_t __pad1; + } result[127]; +}; + +#define offset(base, ptr) ((uint8_t *)(ptr) - (uint8_t *)(base)) +#define results_ptr(at, member) \ + (at)->results_mem, offset((at)->results, &(at)->results->member), 0, 0 + +struct fd_batch_history; + +/** + * Tracks the results from an individual batch. Initially created per batch, + * and appended to the tail of at->pending_results. 
At a later time, when + * the GPU has finished writing the results, it is moved to fd_batch_history::results. + * + * ralloc parent is the associated fd_batch_history + */ +struct fd_batch_result { + + /** + * The index/slot in fd_autotune_results::result[] to write start/end + * counter to + */ + unsigned idx; + + /** + * Fence value to write back to fd_autotune_results::fence after both + * start/end values written + */ + uint32_t fence; + + /* + * Below here, only used internally within autotune + */ + struct fd_batch_history *history; + struct list_head node; + uint32_t cost; + uint64_t samples_passed; +}; + +void fd_autotune_init(struct fd_autotune *at, struct fd_device *dev); +void fd_autotune_fini(struct fd_autotune *at); + +struct fd_batch; +bool fd_autotune_use_bypass(struct fd_autotune *at, struct fd_batch *batch) assert_dt; + +#endif /* FREEDRENO_AUTOTUNE_H */ diff --git a/src/gallium/drivers/freedreno/freedreno_batch.h b/src/gallium/drivers/freedreno/freedreno_batch.h index af4bc91..785f0c3 100644 --- a/src/gallium/drivers/freedreno/freedreno_batch.h +++ b/src/gallium/drivers/freedreno/freedreno_batch.h @@ -44,6 +44,7 @@ struct fd_resource; struct fd_batch_key; +struct fd_batch_result; /* A batch tracks everything about a cmdstream batch/submit, including the * ringbuffers used for binning, draw, and gmem cmds, list of associated @@ -144,6 +145,13 @@ struct fd_batch { */ unsigned cost; + /* Tells the gen specific backend where to write stats used for + * the autotune module. + * + * Pointer only valid during gmem emit code. 
+ */ + struct fd_batch_result *autotune_result; + unsigned num_draws; /* number of draws in current batch */ unsigned num_vertices; /* number of vertices in current batch */ diff --git a/src/gallium/drivers/freedreno/freedreno_context.c b/src/gallium/drivers/freedreno/freedreno_context.c index 6e9fe58..f7de932 100644 --- a/src/gallium/drivers/freedreno/freedreno_context.c +++ b/src/gallium/drivers/freedreno/freedreno_context.c @@ -371,6 +371,8 @@ fd_context_destroy(struct pipe_context *pctx) u_trace_context_fini(&ctx->trace_context); + fd_autotune_fini(&ctx->autotune); + if (FD_DBG(BSTAT) || FD_DBG(MSGS)) { mesa_logi("batch_total=%u, batch_sysmem=%u, batch_gmem=%u, batch_nondraw=%u, batch_restore=%u\n", (uint32_t)ctx->stats.batch_total, (uint32_t)ctx->stats.batch_sysmem, @@ -644,6 +646,8 @@ fd_context_init(struct fd_context *ctx, struct pipe_screen *pscreen, u_trace_context_init(&ctx->trace_context, pctx, fd_trace_record_ts, fd_trace_read_ts); + fd_autotune_init(&ctx->autotune, screen->dev); + return pctx; fail: diff --git a/src/gallium/drivers/freedreno/freedreno_context.h b/src/gallium/drivers/freedreno/freedreno_context.h index 75c25f8..36ac624 100644 --- a/src/gallium/drivers/freedreno/freedreno_context.h +++ b/src/gallium/drivers/freedreno/freedreno_context.h @@ -37,6 +37,7 @@ #include "util/u_threaded_context.h" #include "util/u_trace.h" +#include "freedreno_autotune.h" #include "freedreno_screen.h" #include "freedreno_gmem.h" #include "freedreno_util.h" @@ -209,6 +210,8 @@ struct fd_context { struct slab_child_pool transfer_pool dt; struct slab_child_pool transfer_pool_unsync; /* for threaded_context */ + struct fd_autotune autotune dt; + /** * query related state: */ diff --git a/src/gallium/drivers/freedreno/freedreno_gmem.c b/src/gallium/drivers/freedreno/freedreno_gmem.c index bd06660..65f52cf 100644 --- a/src/gallium/drivers/freedreno/freedreno_gmem.c +++ b/src/gallium/drivers/freedreno/freedreno_gmem.c @@ -680,10 +680,8 @@ fd_gmem_render_tiles(struct 
fd_batch *batch) } if (ctx->emit_sysmem_prep && !batch->nondraw) { - if (batch->cleared || batch->gmem_reason || - ((batch->num_draws > 5) && !batch->blit) || - (pfb->samples > 1)) { - } else if (!FD_DBG(NOBYPASS)) { + if (fd_autotune_use_bypass(&ctx->autotune, batch) && + !FD_DBG(NOBYPASS)) { sysmem = true; } diff --git a/src/gallium/drivers/freedreno/meson.build b/src/gallium/drivers/freedreno/meson.build index 8143309..5b49393 100644 --- a/src/gallium/drivers/freedreno/meson.build +++ b/src/gallium/drivers/freedreno/meson.build @@ -19,6 +19,8 @@ # SOFTWARE. files_libfreedreno = files( + 'freedreno_autotune.c', + 'freedreno_autotune.h', 'freedreno_batch.c', 'freedreno_batch.h', 'freedreno_batch_cache.c', -- 2.7.4