From ea339137b0cef22385b9076921f7325e82776674 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Mon, 14 Mar 2022 17:14:59 -0700 Subject: [PATCH] freedreno/drm: Extract out "softpin" submit/ringbuffer base class We are going to want basically the identical thing, other than flush_submit_list, for virtio backend. Now that we've moved various other dependencies into the base classes, extract out an abstract base class for submit/ringbuffer. Signed-off-by: Rob Clark Part-of: --- src/freedreno/drm/freedreno_ringbuffer_sp.c | 651 ++++++++++++++++++ src/freedreno/drm/freedreno_ringbuffer_sp.h | 125 ++++ ...buffer_sp.h => freedreno_ringbuffer_sp_reloc.h} | 56 +- src/freedreno/drm/meson.build | 4 +- src/freedreno/drm/msm/msm_pipe.c | 9 +- src/freedreno/drm/msm/msm_priv.h | 6 - src/freedreno/drm/msm/msm_ringbuffer_sp.c | 731 +-------------------- 7 files changed, 839 insertions(+), 743 deletions(-) create mode 100644 src/freedreno/drm/freedreno_ringbuffer_sp.c create mode 100644 src/freedreno/drm/freedreno_ringbuffer_sp.h rename src/freedreno/drm/{msm/msm_ringbuffer_sp.h => freedreno_ringbuffer_sp_reloc.h} (62%) diff --git a/src/freedreno/drm/freedreno_ringbuffer_sp.c b/src/freedreno/drm/freedreno_ringbuffer_sp.c new file mode 100644 index 0000000..37d8eeb --- /dev/null +++ b/src/freedreno/drm/freedreno_ringbuffer_sp.c @@ -0,0 +1,651 @@ +/* + * Copyright (C) 2018 Rob Clark + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Rob Clark + */ + +#include +#include +#include + +#include "util/hash_table.h" +#include "util/os_file.h" +#include "util/slab.h" + +#include "freedreno_ringbuffer_sp.h" + +/* A "softpin" implementation of submit/ringbuffer, which lowers CPU overhead + * by avoiding the additional tracking necessary to build cmds/relocs tables + * (but still builds a bos table) + */ + +#define INIT_SIZE 0x1000 + +#define SUBALLOC_SIZE (32 * 1024) + +/* In the pipe->flush() path, we don't have a util_queue_fence we can wait on, + * instead use a condition-variable. Note that pipe->flush() is not expected + * to be a common/hot path. 
+ */ +static pthread_cond_t flush_cnd = PTHREAD_COND_INITIALIZER; +static pthread_mutex_t flush_mtx = PTHREAD_MUTEX_INITIALIZER; + +static void finalize_current_cmd(struct fd_ringbuffer *ring); +static struct fd_ringbuffer * +fd_ringbuffer_sp_init(struct fd_ringbuffer_sp *fd_ring, uint32_t size, + enum fd_ringbuffer_flags flags); + +/* add (if needed) bo to submit and return index: */ +uint32_t +fd_submit_append_bo(struct fd_submit_sp *submit, struct fd_bo *bo) +{ + uint32_t idx; + + /* NOTE: it is legal to use the same bo on different threads for + * different submits. But it is not legal to use the same submit + * from different threads. + */ + idx = READ_ONCE(bo->idx); + + if (unlikely((idx >= submit->nr_bos) || (submit->bos[idx] != bo))) { + uint32_t hash = _mesa_hash_pointer(bo); + struct hash_entry *entry; + + entry = _mesa_hash_table_search_pre_hashed(submit->bo_table, hash, bo); + if (entry) { + /* found */ + idx = (uint32_t)(uintptr_t)entry->data; + } else { + idx = APPEND(submit, bos, fd_bo_ref(bo)); + + _mesa_hash_table_insert_pre_hashed(submit->bo_table, hash, bo, + (void *)(uintptr_t)idx); + } + bo->idx = idx; + } + + return idx; +} + +static void +fd_submit_suballoc_ring_bo(struct fd_submit *submit, + struct fd_ringbuffer_sp *fd_ring, uint32_t size) +{ + struct fd_submit_sp *fd_submit = to_fd_submit_sp(submit); + unsigned suballoc_offset = 0; + struct fd_bo *suballoc_bo = NULL; + + if (fd_submit->suballoc_ring) { + struct fd_ringbuffer_sp *suballoc_ring = + to_fd_ringbuffer_sp(fd_submit->suballoc_ring); + + suballoc_bo = suballoc_ring->ring_bo; + suballoc_offset = + fd_ringbuffer_size(fd_submit->suballoc_ring) + suballoc_ring->offset; + + suballoc_offset = align(suballoc_offset, 0x10); + + if ((size + suballoc_offset) > suballoc_bo->size) { + suballoc_bo = NULL; + } + } + + if (!suballoc_bo) { + // TODO possibly larger size for streaming bo? + fd_ring->ring_bo = fd_bo_new_ring(submit->pipe->dev, SUBALLOC_SIZE); + fd_ring->offset = 0; + } else { + fd_ring->ring_bo = fd_bo_ref(suballoc_bo); + fd_ring->offset = suballoc_offset; + } + + struct fd_ringbuffer *old_suballoc_ring = fd_submit->suballoc_ring; + + fd_submit->suballoc_ring = fd_ringbuffer_ref(&fd_ring->base); + + if (old_suballoc_ring) + fd_ringbuffer_del(old_suballoc_ring); +} + +static struct fd_ringbuffer * +fd_submit_sp_new_ringbuffer(struct fd_submit *submit, uint32_t size, + enum fd_ringbuffer_flags flags) +{ + struct fd_submit_sp *fd_submit = to_fd_submit_sp(submit); + struct fd_ringbuffer_sp *fd_ring; + + fd_ring = slab_alloc(&fd_submit->ring_pool); + + fd_ring->u.submit = submit; + + /* NOTE: needs to be before _suballoc_ring_bo() since it could + * increment the refcnt of the current ring + */ + fd_ring->base.refcnt = 1; + + if (flags & FD_RINGBUFFER_STREAMING) { + fd_submit_suballoc_ring_bo(submit, fd_ring, size); + } else { + if (flags & FD_RINGBUFFER_GROWABLE) + size = INIT_SIZE; + + fd_ring->offset = 0; + fd_ring->ring_bo = fd_bo_new_ring(submit->pipe->dev, size); + } + + if (!fd_ringbuffer_sp_init(fd_ring, size, flags)) + return NULL; + + return &fd_ring->base; +} + +/** + * Prepare submit for flush, always done synchronously. 
+ * + * 1) Finalize primary ringbuffer, at this point no more cmdstream may + * be written into it, since from the PoV of the upper level driver + * the submit is flushed, even if deferred + * 2) Add cmdstream bos to bos table + * 3) Update bo fences + */ +static bool +fd_submit_sp_flush_prep(struct fd_submit *submit, int in_fence_fd, + struct fd_submit_fence *out_fence) +{ + struct fd_submit_sp *fd_submit = to_fd_submit_sp(submit); + bool has_shared = false; + + finalize_current_cmd(submit->primary); + + struct fd_ringbuffer_sp *primary = + to_fd_ringbuffer_sp(submit->primary); + + for (unsigned i = 0; i < primary->u.nr_cmds; i++) + fd_submit_append_bo(fd_submit, primary->u.cmds[i].ring_bo); + + simple_mtx_lock(&table_lock); + for (unsigned i = 0; i < fd_submit->nr_bos; i++) { + fd_bo_add_fence(fd_submit->bos[i], submit->pipe, submit->fence); + has_shared |= fd_submit->bos[i]->shared; + } + simple_mtx_unlock(&table_lock); + + fd_submit->out_fence = out_fence; + fd_submit->in_fence_fd = (in_fence_fd == -1) ? + -1 : os_dupfd_cloexec(in_fence_fd); + + return has_shared; +} + +static void +fd_submit_sp_flush_execute(void *job, void *gdata, int thread_index) +{ + struct fd_submit *submit = job; + struct fd_submit_sp *fd_submit = to_fd_submit_sp(submit); + struct fd_pipe *pipe = submit->pipe; + + fd_submit->flush_submit_list(&fd_submit->submit_list); + + pthread_mutex_lock(&flush_mtx); + assert(fd_fence_before(pipe->last_submit_fence, fd_submit->base.fence)); + pipe->last_submit_fence = fd_submit->base.fence; + pthread_cond_broadcast(&flush_cnd); + pthread_mutex_unlock(&flush_mtx); + + DEBUG_MSG("finish: %u", submit->fence); +} + +static void +fd_submit_sp_flush_cleanup(void *job, void *gdata, int thread_index) +{ + struct fd_submit *submit = job; + fd_submit_del(submit); +} + +static int +enqueue_submit_list(struct list_head *submit_list) +{ + struct fd_submit *submit = last_submit(submit_list); + struct fd_submit_sp *fd_submit = to_fd_submit_sp(submit); + + list_replace(submit_list, &fd_submit->submit_list); + list_inithead(submit_list); + + struct util_queue_fence *fence; + if (fd_submit->out_fence) { + fence = &fd_submit->out_fence->ready; + } else { + util_queue_fence_init(&fd_submit->fence); + fence = &fd_submit->fence; + } + + DEBUG_MSG("enqueue: %u", submit->fence); + + util_queue_add_job(&submit->pipe->dev->submit_queue, + submit, fence, + fd_submit_sp_flush_execute, + fd_submit_sp_flush_cleanup, + 0); + + return 0; +} + +static bool +should_defer(struct fd_submit *submit) +{ + struct fd_submit_sp *fd_submit = to_fd_submit_sp(submit); + + /* if too many bo's, it may not be worth the CPU cost of submit merging: */ + if (fd_submit->nr_bos > 30) + return false; + + /* On the kernel side, with 32K ringbuffer, we have an upper limit of 2k + * cmds before we exceed the size of the ringbuffer, which results in + * deadlock writing into the RB (ie. 
kernel doesn't finish writing into + * the RB so it doesn't kick the GPU to start consuming from the RB) + */ + if (submit->pipe->dev->deferred_cmds > 128) + return false; + + return true; +} + +static int +fd_submit_sp_flush(struct fd_submit *submit, int in_fence_fd, + struct fd_submit_fence *out_fence) +{ + struct fd_device *dev = submit->pipe->dev; + struct fd_pipe *pipe = submit->pipe; + + /* Acquire lock before flush_prep() because it is possible to race between + * this and pipe->flush(): + */ + simple_mtx_lock(&dev->submit_lock); + + /* If there are deferred submits from another fd_pipe, flush them now, + * since we can't merge submits from different submitqueue's (ie. they + * could have different priority, etc) + */ + if (!list_is_empty(&dev->deferred_submits) && + (last_submit(&dev->deferred_submits)->pipe != submit->pipe)) { + struct list_head submit_list; + + list_replace(&dev->deferred_submits, &submit_list); + list_inithead(&dev->deferred_submits); + dev->deferred_cmds = 0; + + enqueue_submit_list(&submit_list); + } + + list_addtail(&fd_submit_ref(submit)->node, &dev->deferred_submits); + + bool has_shared = fd_submit_sp_flush_prep(submit, in_fence_fd, out_fence); + + assert(fd_fence_before(pipe->last_enqueue_fence, submit->fence)); + pipe->last_enqueue_fence = submit->fence; + + /* If we don't need an out-fence, we can defer the submit. + * + * TODO we could defer submits with in-fence as well.. if we took our own + * reference to the fd, and merged all the in-fence-fd's when we flush the + * deferred submits + */ + if ((in_fence_fd == -1) && !out_fence && !has_shared && should_defer(submit)) { + DEBUG_MSG("defer: %u", submit->fence); + dev->deferred_cmds += fd_ringbuffer_cmd_count(submit->primary); + assert(dev->deferred_cmds == fd_dev_count_deferred_cmds(dev)); + simple_mtx_unlock(&dev->submit_lock); + + return 0; + } + + struct list_head submit_list; + + list_replace(&dev->deferred_submits, &submit_list); + list_inithead(&dev->deferred_submits); + dev->deferred_cmds = 0; + + simple_mtx_unlock(&dev->submit_lock); + + return enqueue_submit_list(&submit_list); +} + +void +fd_pipe_sp_flush(struct fd_pipe *pipe, uint32_t fence) +{ + struct fd_device *dev = pipe->dev; + struct list_head submit_list; + + DEBUG_MSG("flush: %u", fence); + + list_inithead(&submit_list); + + simple_mtx_lock(&dev->submit_lock); + + assert(!fd_fence_after(fence, pipe->last_enqueue_fence)); + + foreach_submit_safe (deferred_submit, &dev->deferred_submits) { + /* We should never have submits from multiple pipes in the deferred + * list. If we did, we couldn't compare their fence to our fence, + * since each fd_pipe is an independent timeline. 
+ */ + if (deferred_submit->pipe != pipe) + break; + + if (fd_fence_after(deferred_submit->fence, fence)) + break; + + list_del(&deferred_submit->node); + list_addtail(&deferred_submit->node, &submit_list); + dev->deferred_cmds -= fd_ringbuffer_cmd_count(deferred_submit->primary); + } + + assert(dev->deferred_cmds == fd_dev_count_deferred_cmds(dev)); + + simple_mtx_unlock(&dev->submit_lock); + + if (list_is_empty(&submit_list)) + goto flush_sync; + + enqueue_submit_list(&submit_list); + +flush_sync: + /* Once we are sure that we've enqueued at least up to the requested + * submit, we need to be sure that submitq has caught up and flushed + * them to the kernel + */ + pthread_mutex_lock(&flush_mtx); + while (fd_fence_before(pipe->last_submit_fence, fence)) { + pthread_cond_wait(&flush_cnd, &flush_mtx); + } + pthread_mutex_unlock(&flush_mtx); +} + +static void +fd_submit_sp_destroy(struct fd_submit *submit) +{ + struct fd_submit_sp *fd_submit = to_fd_submit_sp(submit); + + if (fd_submit->suballoc_ring) + fd_ringbuffer_del(fd_submit->suballoc_ring); + + _mesa_hash_table_destroy(fd_submit->bo_table, NULL); + + // TODO it would be nice to have a way to debug_assert() if all + // rb's haven't been free'd back to the slab, because that is + // an indication that we are leaking bo's + slab_destroy_child(&fd_submit->ring_pool); + + for (unsigned i = 0; i < fd_submit->nr_bos; i++) + fd_bo_del(fd_submit->bos[i]); + + free(fd_submit->bos); + free(fd_submit); +} + +static const struct fd_submit_funcs submit_funcs = { + .new_ringbuffer = fd_submit_sp_new_ringbuffer, + .flush = fd_submit_sp_flush, + .destroy = fd_submit_sp_destroy, +}; + +struct fd_submit * +fd_submit_sp_new(struct fd_pipe *pipe, flush_submit_list_fn flush_submit_list) +{ + struct fd_submit_sp *fd_submit = calloc(1, sizeof(*fd_submit)); + struct fd_submit *submit; + + fd_submit->bo_table = _mesa_hash_table_create(NULL, _mesa_hash_pointer, + _mesa_key_pointer_equal); + + slab_create_child(&fd_submit->ring_pool, &pipe->ring_pool); + + fd_submit->flush_submit_list = flush_submit_list; + + submit = &fd_submit->base; + submit->funcs = &submit_funcs; + + return submit; +} + +void +fd_pipe_sp_ringpool_init(struct fd_pipe *pipe) +{ + // TODO tune size: + slab_create_parent(&pipe->ring_pool, sizeof(struct fd_ringbuffer_sp), 16); +} + +void +fd_pipe_sp_ringpool_fini(struct fd_pipe *pipe) +{ + if (pipe->ring_pool.num_elements) + slab_destroy_parent(&pipe->ring_pool); +} + +static void +finalize_current_cmd(struct fd_ringbuffer *ring) +{ + debug_assert(!(ring->flags & _FD_RINGBUFFER_OBJECT)); + + struct fd_ringbuffer_sp *fd_ring = to_fd_ringbuffer_sp(ring); + APPEND(&fd_ring->u, cmds, + (struct fd_cmd_sp){ + .ring_bo = fd_bo_ref(fd_ring->ring_bo), + .size = offset_bytes(ring->cur, ring->start), + }); +} + +static void +fd_ringbuffer_sp_grow(struct fd_ringbuffer *ring, uint32_t size) +{ + struct fd_ringbuffer_sp *fd_ring = to_fd_ringbuffer_sp(ring); + struct fd_pipe *pipe = fd_ring->u.submit->pipe; + + debug_assert(ring->flags & FD_RINGBUFFER_GROWABLE); + + finalize_current_cmd(ring); + + fd_bo_del(fd_ring->ring_bo); + fd_ring->ring_bo = fd_bo_new_ring(pipe->dev, size); + + ring->start = fd_bo_map(fd_ring->ring_bo); + ring->end = &(ring->start[size / 4]); + ring->cur = ring->start; + ring->size = size; +} + +static inline bool +fd_ringbuffer_references_bo(struct fd_ringbuffer *ring, struct fd_bo *bo) +{ + struct fd_ringbuffer_sp *fd_ring = to_fd_ringbuffer_sp(ring); + + for (int i = 0; i < fd_ring->u.nr_reloc_bos; i++) { + if (fd_ring->u.reloc_bos[i] 
== bo) + return true; + } + return false; +} + +#define PTRSZ 64 +#include "freedreno_ringbuffer_sp_reloc.h" +#undef PTRSZ +#define PTRSZ 32 +#include "freedreno_ringbuffer_sp_reloc.h" +#undef PTRSZ + +static uint32_t +fd_ringbuffer_sp_cmd_count(struct fd_ringbuffer *ring) +{ + if (ring->flags & FD_RINGBUFFER_GROWABLE) + return to_fd_ringbuffer_sp(ring)->u.nr_cmds + 1; + return 1; +} + +static bool +fd_ringbuffer_sp_check_size(struct fd_ringbuffer *ring) +{ + assert(!(ring->flags & _FD_RINGBUFFER_OBJECT)); + struct fd_ringbuffer_sp *fd_ring = to_fd_ringbuffer_sp(ring); + struct fd_submit *submit = fd_ring->u.submit; + + if (to_fd_submit_sp(submit)->nr_bos > MAX_ARRAY_SIZE/2) { + return false; + } + + return true; +} + +static void +fd_ringbuffer_sp_destroy(struct fd_ringbuffer *ring) +{ + struct fd_ringbuffer_sp *fd_ring = to_fd_ringbuffer_sp(ring); + + fd_bo_del(fd_ring->ring_bo); + + if (ring->flags & _FD_RINGBUFFER_OBJECT) { + for (unsigned i = 0; i < fd_ring->u.nr_reloc_bos; i++) { + fd_bo_del(fd_ring->u.reloc_bos[i]); + } + free(fd_ring->u.reloc_bos); + + free(fd_ring); + } else { + struct fd_submit *submit = fd_ring->u.submit; + + for (unsigned i = 0; i < fd_ring->u.nr_cmds; i++) { + fd_bo_del(fd_ring->u.cmds[i].ring_bo); + } + free(fd_ring->u.cmds); + + slab_free(&to_fd_submit_sp(submit)->ring_pool, fd_ring); + } +} + +static const struct fd_ringbuffer_funcs ring_funcs_nonobj_32 = { + .grow = fd_ringbuffer_sp_grow, + .emit_reloc = fd_ringbuffer_sp_emit_reloc_nonobj_32, + .emit_reloc_ring = fd_ringbuffer_sp_emit_reloc_ring_32, + .cmd_count = fd_ringbuffer_sp_cmd_count, + .check_size = fd_ringbuffer_sp_check_size, + .destroy = fd_ringbuffer_sp_destroy, +}; + +static const struct fd_ringbuffer_funcs ring_funcs_obj_32 = { + .grow = fd_ringbuffer_sp_grow, + .emit_reloc = fd_ringbuffer_sp_emit_reloc_obj_32, + .emit_reloc_ring = fd_ringbuffer_sp_emit_reloc_ring_32, + .cmd_count = fd_ringbuffer_sp_cmd_count, + .destroy = fd_ringbuffer_sp_destroy, +}; + +static const struct fd_ringbuffer_funcs ring_funcs_nonobj_64 = { + .grow = fd_ringbuffer_sp_grow, + .emit_reloc = fd_ringbuffer_sp_emit_reloc_nonobj_64, + .emit_reloc_ring = fd_ringbuffer_sp_emit_reloc_ring_64, + .cmd_count = fd_ringbuffer_sp_cmd_count, + .check_size = fd_ringbuffer_sp_check_size, + .destroy = fd_ringbuffer_sp_destroy, +}; + +static const struct fd_ringbuffer_funcs ring_funcs_obj_64 = { + .grow = fd_ringbuffer_sp_grow, + .emit_reloc = fd_ringbuffer_sp_emit_reloc_obj_64, + .emit_reloc_ring = fd_ringbuffer_sp_emit_reloc_ring_64, + .cmd_count = fd_ringbuffer_sp_cmd_count, + .destroy = fd_ringbuffer_sp_destroy, +}; + +static inline struct fd_ringbuffer * +fd_ringbuffer_sp_init(struct fd_ringbuffer_sp *fd_ring, uint32_t size, + enum fd_ringbuffer_flags flags) +{ + struct fd_ringbuffer *ring = &fd_ring->base; + + debug_assert(fd_ring->ring_bo); + + uint8_t *base = fd_bo_map(fd_ring->ring_bo); + ring->start = (void *)(base + fd_ring->offset); + ring->end = &(ring->start[size / 4]); + ring->cur = ring->start; + + ring->size = size; + ring->flags = flags; + + if (flags & _FD_RINGBUFFER_OBJECT) { + if (fd_dev_64b(&fd_ring->u.pipe->dev_id)) { + ring->funcs = &ring_funcs_obj_64; + } else { + ring->funcs = &ring_funcs_obj_32; + } + } else { + if (fd_dev_64b(&fd_ring->u.submit->pipe->dev_id)) { + ring->funcs = &ring_funcs_nonobj_64; + } else { + ring->funcs = &ring_funcs_nonobj_32; + } + } + + // TODO initializing these could probably be conditional on flags + // since unneed for FD_RINGBUFFER_STAGING case.. 
+ fd_ring->u.cmds = NULL; + fd_ring->u.nr_cmds = fd_ring->u.max_cmds = 0; + + fd_ring->u.reloc_bos = NULL; + fd_ring->u.nr_reloc_bos = fd_ring->u.max_reloc_bos = 0; + + return ring; +} + +struct fd_ringbuffer * +fd_ringbuffer_sp_new_object(struct fd_pipe *pipe, uint32_t size) +{ + struct fd_device *dev = pipe->dev; + struct fd_ringbuffer_sp *fd_ring = malloc(sizeof(*fd_ring)); + + /* Lock access to the fd_pipe->suballoc_* since ringbuffer object allocation + * can happen both on the frontend (most CSOs) and the driver thread (a6xx + * cached tex state, for example) + */ + simple_mtx_lock(&dev->suballoc_lock); + + /* Maximum known alignment requirement is a6xx's TEX_CONST at 16 dwords */ + fd_ring->offset = align(dev->suballoc_offset, 64); + if (!dev->suballoc_bo || + fd_ring->offset + size > fd_bo_size(dev->suballoc_bo)) { + if (dev->suballoc_bo) + fd_bo_del(dev->suballoc_bo); + dev->suballoc_bo = + fd_bo_new_ring(dev, MAX2(SUBALLOC_SIZE, align(size, 4096))); + fd_ring->offset = 0; + } + + fd_ring->u.pipe = pipe; + fd_ring->ring_bo = fd_bo_ref(dev->suballoc_bo); + fd_ring->base.refcnt = 1; + + dev->suballoc_offset = fd_ring->offset + size; + + simple_mtx_unlock(&dev->suballoc_lock); + + return fd_ringbuffer_sp_init(fd_ring, size, _FD_RINGBUFFER_OBJECT); +} diff --git a/src/freedreno/drm/freedreno_ringbuffer_sp.h b/src/freedreno/drm/freedreno_ringbuffer_sp.h new file mode 100644 index 0000000..b24eb01 --- /dev/null +++ b/src/freedreno/drm/freedreno_ringbuffer_sp.h @@ -0,0 +1,125 @@ +/* + * Copyright (C) 2018 Rob Clark + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Rob Clark + */ + +#ifndef FREEDRENO_RINGBUFFER_SP_H_ +#define FREEDRENO_RINGBUFFER_SP_H_ + +#include +#include +#include + +#include "util/hash_table.h" +#include "util/os_file.h" +#include "util/slab.h" + +#include "freedreno_priv.h" +#include "freedreno_ringbuffer.h" + +/* A "softpin" implementation of submit/ringbuffer, which lowers CPU overhead + * by avoiding the additional tracking necessary to build cmds/relocs tables + * (but still builds a bos table) + */ + +typedef int (*flush_submit_list_fn)(struct list_head *submit_list); + +struct fd_submit_sp { + struct fd_submit base; + + DECLARE_ARRAY(struct fd_bo *, bos); + + /* maps fd_bo to idx in bos table: */ + struct hash_table *bo_table; + + struct slab_child_pool ring_pool; + + /* Allow for sub-allocation of stateobj ring buffers (ie. sharing + * the same underlying bo).. 
+ * + * We also rely on previous stateobj having been fully constructed + * so we can reclaim extra space at it's end. + */ + struct fd_ringbuffer *suballoc_ring; + + /* Flush args, potentially attached to the last submit in the list + * of submits to merge: + */ + int in_fence_fd; + struct fd_submit_fence *out_fence; + + /* State for enqueued submits: + */ + struct list_head submit_list; /* includes this submit as last element */ + + /* Used in case out_fence==NULL: */ + struct util_queue_fence fence; + + flush_submit_list_fn flush_submit_list; +}; +FD_DEFINE_CAST(fd_submit, fd_submit_sp); + +/* for FD_RINGBUFFER_GROWABLE rb's, tracks the 'finalized' cmdstream buffers + * and sizes. Ie. a finalized buffer can have no more commands appended to + * it. + */ +struct fd_cmd_sp { + struct fd_bo *ring_bo; + unsigned size; +}; + +struct fd_ringbuffer_sp { + struct fd_ringbuffer base; + + /* for FD_RINGBUFFER_STREAMING rb's which are sub-allocated */ + unsigned offset; + + union { + /* for _FD_RINGBUFFER_OBJECT case, the array of BOs referenced from + * this one + */ + struct { + struct fd_pipe *pipe; + DECLARE_ARRAY(struct fd_bo *, reloc_bos); + }; + /* for other cases: */ + struct { + struct fd_submit *submit; + DECLARE_ARRAY(struct fd_cmd_sp, cmds); + }; + } u; + + struct fd_bo *ring_bo; +}; +FD_DEFINE_CAST(fd_ringbuffer, fd_ringbuffer_sp); + +void fd_pipe_sp_flush(struct fd_pipe *pipe, uint32_t fence); +uint32_t fd_submit_append_bo(struct fd_submit_sp *submit, struct fd_bo *bo); +struct fd_submit *fd_submit_sp_new(struct fd_pipe *pipe, + flush_submit_list_fn flush_submit_list); +void fd_pipe_sp_ringpool_init(struct fd_pipe *pipe); +void fd_pipe_sp_ringpool_fini(struct fd_pipe *pipe); +struct fd_ringbuffer *fd_ringbuffer_sp_new_object(struct fd_pipe *pipe, uint32_t size); + +#endif /* FREEDRENO_RINGBUFFER_SP_H_ */ diff --git a/src/freedreno/drm/msm/msm_ringbuffer_sp.h b/src/freedreno/drm/freedreno_ringbuffer_sp_reloc.h similarity index 62% rename from src/freedreno/drm/msm/msm_ringbuffer_sp.h rename to src/freedreno/drm/freedreno_ringbuffer_sp_reloc.h index 8b8f61b..9374681 100644 --- a/src/freedreno/drm/msm/msm_ringbuffer_sp.h +++ b/src/freedreno/drm/freedreno_ringbuffer_sp_reloc.h @@ -40,28 +40,28 @@ static void X(emit_reloc_common)(struct fd_ringbuffer *ring, #endif } -static void X(msm_ringbuffer_sp_emit_reloc_nonobj)(struct fd_ringbuffer *ring, +static void X(fd_ringbuffer_sp_emit_reloc_nonobj)(struct fd_ringbuffer *ring, const struct fd_reloc *reloc) { X(emit_reloc_common)(ring, reloc); assert(!(ring->flags & _FD_RINGBUFFER_OBJECT)); - struct msm_ringbuffer_sp *msm_ring = to_msm_ringbuffer_sp(ring); + struct fd_ringbuffer_sp *fd_ring = to_fd_ringbuffer_sp(ring); - struct msm_submit_sp *msm_submit = to_msm_submit_sp(msm_ring->u.submit); + struct fd_submit_sp *fd_submit = to_fd_submit_sp(fd_ring->u.submit); - msm_submit_append_bo(msm_submit, reloc->bo); + fd_submit_append_bo(fd_submit, reloc->bo); } -static void X(msm_ringbuffer_sp_emit_reloc_obj)(struct fd_ringbuffer *ring, +static void X(fd_ringbuffer_sp_emit_reloc_obj)(struct fd_ringbuffer *ring, const struct fd_reloc *reloc) { X(emit_reloc_common)(ring, reloc); assert(ring->flags & _FD_RINGBUFFER_OBJECT); - struct msm_ringbuffer_sp *msm_ring = to_msm_ringbuffer_sp(ring); + struct fd_ringbuffer_sp *fd_ring = to_fd_ringbuffer_sp(ring); /* Avoid emitting duplicate BO references into the list. 
Ringbuffer * objects are long-lived, so this saves ongoing work at draw time in @@ -69,60 +69,60 @@ static void X(msm_ringbuffer_sp_emit_reloc_obj)(struct fd_ringbuffer *ring, * relocs per ringbuffer object is fairly small, so the O(n^2) doesn't * hurt much. */ - if (!msm_ringbuffer_references_bo(ring, reloc->bo)) { - APPEND(&msm_ring->u, reloc_bos, fd_bo_ref(reloc->bo)); + if (!fd_ringbuffer_references_bo(ring, reloc->bo)) { + APPEND(&fd_ring->u, reloc_bos, fd_bo_ref(reloc->bo)); } } -static uint32_t X(msm_ringbuffer_sp_emit_reloc_ring)( +static uint32_t X(fd_ringbuffer_sp_emit_reloc_ring)( struct fd_ringbuffer *ring, struct fd_ringbuffer *target, uint32_t cmd_idx) { - struct msm_ringbuffer_sp *msm_target = to_msm_ringbuffer_sp(target); + struct fd_ringbuffer_sp *fd_target = to_fd_ringbuffer_sp(target); struct fd_bo *bo; uint32_t size; if ((target->flags & FD_RINGBUFFER_GROWABLE) && - (cmd_idx < msm_target->u.nr_cmds)) { - bo = msm_target->u.cmds[cmd_idx].ring_bo; - size = msm_target->u.cmds[cmd_idx].size; + (cmd_idx < fd_target->u.nr_cmds)) { + bo = fd_target->u.cmds[cmd_idx].ring_bo; + size = fd_target->u.cmds[cmd_idx].size; } else { - bo = msm_target->ring_bo; + bo = fd_target->ring_bo; size = offset_bytes(target->cur, target->start); } if (ring->flags & _FD_RINGBUFFER_OBJECT) { - X(msm_ringbuffer_sp_emit_reloc_obj)(ring, &(struct fd_reloc){ + X(fd_ringbuffer_sp_emit_reloc_obj)(ring, &(struct fd_reloc){ .bo = bo, - .iova = bo->iova + msm_target->offset, - .offset = msm_target->offset, + .iova = bo->iova + fd_target->offset, + .offset = fd_target->offset, }); } else { - X(msm_ringbuffer_sp_emit_reloc_nonobj)(ring, &(struct fd_reloc){ + X(fd_ringbuffer_sp_emit_reloc_nonobj)(ring, &(struct fd_reloc){ .bo = bo, - .iova = bo->iova + msm_target->offset, - .offset = msm_target->offset, + .iova = bo->iova + fd_target->offset, + .offset = fd_target->offset, }); } if (!(target->flags & _FD_RINGBUFFER_OBJECT)) return size; - struct msm_ringbuffer_sp *msm_ring = to_msm_ringbuffer_sp(ring); + struct fd_ringbuffer_sp *fd_ring = to_fd_ringbuffer_sp(ring); if (ring->flags & _FD_RINGBUFFER_OBJECT) { - for (unsigned i = 0; i < msm_target->u.nr_reloc_bos; i++) { - struct fd_bo *target_bo = msm_target->u.reloc_bos[i]; - if (!msm_ringbuffer_references_bo(ring, target_bo)) - APPEND(&msm_ring->u, reloc_bos, fd_bo_ref(target_bo)); + for (unsigned i = 0; i < fd_target->u.nr_reloc_bos; i++) { + struct fd_bo *target_bo = fd_target->u.reloc_bos[i]; + if (!fd_ringbuffer_references_bo(ring, target_bo)) + APPEND(&fd_ring->u, reloc_bos, fd_bo_ref(target_bo)); } } else { // TODO it would be nice to know whether we have already // seen this target before. 
But hopefully we hit the // append_bo() fast path enough for this to not matter: - struct msm_submit_sp *msm_submit = to_msm_submit_sp(msm_ring->u.submit); + struct fd_submit_sp *fd_submit = to_fd_submit_sp(fd_ring->u.submit); - for (unsigned i = 0; i < msm_target->u.nr_reloc_bos; i++) { - msm_submit_append_bo(msm_submit, msm_target->u.reloc_bos[i]); + for (unsigned i = 0; i < fd_target->u.nr_reloc_bos; i++) { + fd_submit_append_bo(fd_submit, fd_target->u.reloc_bos[i]); } } diff --git a/src/freedreno/drm/meson.build b/src/freedreno/drm/meson.build index 1284f5f..7712422 100644 --- a/src/freedreno/drm/meson.build +++ b/src/freedreno/drm/meson.build @@ -27,6 +27,9 @@ libfreedreno_drm_files = files( 'freedreno_priv.h', 'freedreno_ringbuffer.c', 'freedreno_ringbuffer.h', + 'freedreno_ringbuffer_sp.c', + 'freedreno_ringbuffer_sp.h', + 'freedreno_ringbuffer_sp_reloc.h', ) libfreedreno_drm_flags = [] libfreedreno_drm_includes = [ @@ -46,7 +49,6 @@ libfreedreno_drm_msm_files = files( 'msm/msm_priv.h', 'msm/msm_ringbuffer.c', 'msm/msm_ringbuffer_sp.c', - 'msm/msm_ringbuffer_sp.h', ) libfreedreno_drm_files += libfreedreno_drm_msm_files diff --git a/src/freedreno/drm/msm/msm_pipe.c b/src/freedreno/drm/msm/msm_pipe.c index babb89b..775fd59 100644 --- a/src/freedreno/drm/msm/msm_pipe.c +++ b/src/freedreno/drm/msm/msm_pipe.c @@ -26,6 +26,7 @@ #include "util/slab.h" +#include "freedreno_ringbuffer_sp.h" #include "msm_priv.h" static int @@ -199,14 +200,14 @@ msm_pipe_destroy(struct fd_pipe *pipe) struct msm_pipe *msm_pipe = to_msm_pipe(pipe); close_submitqueue(pipe, msm_pipe->queue_id); - msm_pipe_sp_ringpool_fini(pipe); + fd_pipe_sp_ringpool_fini(pipe); free(msm_pipe); } static const struct fd_pipe_funcs sp_funcs = { - .ringbuffer_new_object = msm_ringbuffer_sp_new_object, + .ringbuffer_new_object = fd_ringbuffer_sp_new_object, .submit_new = msm_submit_sp_new, - .flush = msm_pipe_sp_flush, + .flush = fd_pipe_sp_flush, .get_param = msm_pipe_get_param, .set_param = msm_pipe_set_param, .wait = msm_pipe_wait, @@ -281,7 +282,7 @@ msm_pipe_new(struct fd_device *dev, enum fd_pipe_id id, uint32_t prio) if (open_submitqueue(pipe, prio)) goto fail; - msm_pipe_sp_ringpool_init(pipe); + fd_pipe_sp_ringpool_init(pipe); return pipe; fail: diff --git a/src/freedreno/drm/msm/msm_priv.h b/src/freedreno/drm/msm/msm_priv.h index ca2bca8..b41d1ff 100644 --- a/src/freedreno/drm/msm/msm_priv.h +++ b/src/freedreno/drm/msm/msm_priv.h @@ -68,15 +68,9 @@ struct fd_pipe *msm_pipe_new(struct fd_device *dev, enum fd_pipe_id id, struct fd_ringbuffer *msm_ringbuffer_new_object(struct fd_pipe *pipe, uint32_t size); -struct fd_ringbuffer *msm_ringbuffer_sp_new_object(struct fd_pipe *pipe, - uint32_t size); struct fd_submit *msm_submit_new(struct fd_pipe *pipe); struct fd_submit *msm_submit_sp_new(struct fd_pipe *pipe); -void msm_pipe_sp_flush(struct fd_pipe *pipe, uint32_t fence); - -void msm_pipe_sp_ringpool_init(struct fd_pipe *pipe); -void msm_pipe_sp_ringpool_fini(struct fd_pipe *pipe); struct msm_bo { struct fd_bo base; diff --git a/src/freedreno/drm/msm/msm_ringbuffer_sp.c b/src/freedreno/drm/msm/msm_ringbuffer_sp.c index a607f0e..5fd0dfe 100644 --- a/src/freedreno/drm/msm/msm_ringbuffer_sp.c +++ b/src/freedreno/drm/msm/msm_ringbuffer_sp.c @@ -28,249 +28,16 @@ #include #include -#include "util/hash_table.h" #include "util/os_file.h" -#include "util/slab.h" -#include "drm/freedreno_ringbuffer.h" +#include "drm/freedreno_ringbuffer_sp.h" #include "msm_priv.h" -/* A "softpin" implementation of submit/ringbuffer, which lowers CPU 
overhead - * by avoiding the additional tracking necessary to build cmds/relocs tables - * (but still builds a bos table) - */ - -#define INIT_SIZE 0x1000 - -#define SUBALLOC_SIZE (32 * 1024) - -/* In the pipe->flush() path, we don't have a util_queue_fence we can wait on, - * instead use a condition-variable. Note that pipe->flush() is not expected - * to be a common/hot path. - */ -static pthread_cond_t flush_cnd = PTHREAD_COND_INITIALIZER; -static pthread_mutex_t flush_mtx = PTHREAD_MUTEX_INITIALIZER; - - -struct msm_submit_sp { - struct fd_submit base; - - DECLARE_ARRAY(struct fd_bo *, bos); - - /* maps fd_bo to idx in bos table: */ - struct hash_table *bo_table; - - struct slab_child_pool ring_pool; - - /* Allow for sub-allocation of stateobj ring buffers (ie. sharing - * the same underlying bo).. - * - * We also rely on previous stateobj having been fully constructed - * so we can reclaim extra space at it's end. - */ - struct fd_ringbuffer *suballoc_ring; - - /* Flush args, potentially attached to the last submit in the list - * of submits to merge: - */ - int in_fence_fd; - struct fd_submit_fence *out_fence; - - /* State for enqueued submits: - */ - struct list_head submit_list; /* includes this submit as last element */ - - /* Used in case out_fence==NULL: */ - struct util_queue_fence fence; -}; -FD_DEFINE_CAST(fd_submit, msm_submit_sp); - -/* for FD_RINGBUFFER_GROWABLE rb's, tracks the 'finalized' cmdstream buffers - * and sizes. Ie. a finalized buffer can have no more commands appended to - * it. - */ -struct msm_cmd_sp { - struct fd_bo *ring_bo; - unsigned size; -}; - -struct msm_ringbuffer_sp { - struct fd_ringbuffer base; - - /* for FD_RINGBUFFER_STREAMING rb's which are sub-allocated */ - unsigned offset; - - union { - /* for _FD_RINGBUFFER_OBJECT case, the array of BOs referenced from - * this one - */ - struct { - struct fd_pipe *pipe; - DECLARE_ARRAY(struct fd_bo *, reloc_bos); - }; - /* for other cases: */ - struct { - struct fd_submit *submit; - DECLARE_ARRAY(struct msm_cmd_sp, cmds); - }; - } u; - - struct fd_bo *ring_bo; -}; -FD_DEFINE_CAST(fd_ringbuffer, msm_ringbuffer_sp); - -static void finalize_current_cmd(struct fd_ringbuffer *ring); -static struct fd_ringbuffer * -msm_ringbuffer_sp_init(struct msm_ringbuffer_sp *msm_ring, uint32_t size, - enum fd_ringbuffer_flags flags); - -/* add (if needed) bo to submit and return index: */ -static uint32_t -msm_submit_append_bo(struct msm_submit_sp *submit, struct fd_bo *bo) -{ - uint32_t idx; - - /* NOTE: it is legal to use the same bo on different threads for - * different submits. But it is not legal to use the same submit - * from different threads. 
- */ - idx = READ_ONCE(bo->idx); - - if (unlikely((idx >= submit->nr_bos) || (submit->bos[idx] != bo))) { - uint32_t hash = _mesa_hash_pointer(bo); - struct hash_entry *entry; - - entry = _mesa_hash_table_search_pre_hashed(submit->bo_table, hash, bo); - if (entry) { - /* found */ - idx = (uint32_t)(uintptr_t)entry->data; - } else { - idx = APPEND(submit, bos, fd_bo_ref(bo)); - - _mesa_hash_table_insert_pre_hashed(submit->bo_table, hash, bo, - (void *)(uintptr_t)idx); - } - bo->idx = idx; - } - - return idx; -} - -static void -msm_submit_suballoc_ring_bo(struct fd_submit *submit, - struct msm_ringbuffer_sp *msm_ring, uint32_t size) -{ - struct msm_submit_sp *msm_submit = to_msm_submit_sp(submit); - unsigned suballoc_offset = 0; - struct fd_bo *suballoc_bo = NULL; - - if (msm_submit->suballoc_ring) { - struct msm_ringbuffer_sp *suballoc_ring = - to_msm_ringbuffer_sp(msm_submit->suballoc_ring); - - suballoc_bo = suballoc_ring->ring_bo; - suballoc_offset = - fd_ringbuffer_size(msm_submit->suballoc_ring) + suballoc_ring->offset; - - suballoc_offset = align(suballoc_offset, 0x10); - - if ((size + suballoc_offset) > suballoc_bo->size) { - suballoc_bo = NULL; - } - } - - if (!suballoc_bo) { - // TODO possibly larger size for streaming bo? - msm_ring->ring_bo = fd_bo_new_ring(submit->pipe->dev, SUBALLOC_SIZE); - msm_ring->offset = 0; - } else { - msm_ring->ring_bo = fd_bo_ref(suballoc_bo); - msm_ring->offset = suballoc_offset; - } - - struct fd_ringbuffer *old_suballoc_ring = msm_submit->suballoc_ring; - - msm_submit->suballoc_ring = fd_ringbuffer_ref(&msm_ring->base); - - if (old_suballoc_ring) - fd_ringbuffer_del(old_suballoc_ring); -} - -static struct fd_ringbuffer * -msm_submit_sp_new_ringbuffer(struct fd_submit *submit, uint32_t size, - enum fd_ringbuffer_flags flags) -{ - struct msm_submit_sp *msm_submit = to_msm_submit_sp(submit); - struct msm_ringbuffer_sp *msm_ring; - - msm_ring = slab_alloc(&msm_submit->ring_pool); - - msm_ring->u.submit = submit; - - /* NOTE: needs to be before _suballoc_ring_bo() since it could - * increment the refcnt of the current ring - */ - msm_ring->base.refcnt = 1; - - if (flags & FD_RINGBUFFER_STREAMING) { - msm_submit_suballoc_ring_bo(submit, msm_ring, size); - } else { - if (flags & FD_RINGBUFFER_GROWABLE) - size = INIT_SIZE; - - msm_ring->offset = 0; - msm_ring->ring_bo = fd_bo_new_ring(submit->pipe->dev, size); - } - - if (!msm_ringbuffer_sp_init(msm_ring, size, flags)) - return NULL; - - return &msm_ring->base; -} - -/** - * Prepare submit for flush, always done synchronously. 
- * - * 1) Finalize primary ringbuffer, at this point no more cmdstream may - * be written into it, since from the PoV of the upper level driver - * the submit is flushed, even if deferred - * 2) Add cmdstream bos to bos table - * 3) Update bo fences - */ -static bool -msm_submit_sp_flush_prep(struct fd_submit *submit, int in_fence_fd, - struct fd_submit_fence *out_fence) -{ - struct msm_submit_sp *msm_submit = to_msm_submit_sp(submit); - bool has_shared = false; - - finalize_current_cmd(submit->primary); - - struct msm_ringbuffer_sp *primary = - to_msm_ringbuffer_sp(submit->primary); - - for (unsigned i = 0; i < primary->u.nr_cmds; i++) - msm_submit_append_bo(msm_submit, primary->u.cmds[i].ring_bo); - - simple_mtx_lock(&table_lock); - for (unsigned i = 0; i < msm_submit->nr_bos; i++) { - fd_bo_add_fence(msm_submit->bos[i], submit->pipe, submit->fence); - has_shared |= msm_submit->bos[i]->shared; - } - simple_mtx_unlock(&table_lock); - - msm_submit->out_fence = out_fence; - msm_submit->in_fence_fd = (in_fence_fd == -1) ? - -1 : os_dupfd_cloexec(in_fence_fd); - - return has_shared; -} - static int flush_submit_list(struct list_head *submit_list) { - struct msm_submit_sp *msm_submit = to_msm_submit_sp(last_submit(submit_list)); - struct msm_pipe *msm_pipe = to_msm_pipe(msm_submit->base.pipe); + struct fd_submit_sp *fd_submit = to_fd_submit_sp(last_submit(submit_list)); + struct msm_pipe *msm_pipe = to_msm_pipe(fd_submit->base.pipe); struct drm_msm_gem_submit req = { .flags = msm_pipe->pipe, .queueid = msm_pipe->queue_id, @@ -284,7 +51,7 @@ flush_submit_list(struct list_head *submit_list) */ foreach_submit (submit, submit_list) { assert(submit->pipe == &msm_pipe->base); - nr_cmds += to_msm_ringbuffer_sp(submit->primary)->u.nr_cmds; + nr_cmds += to_fd_ringbuffer_sp(submit->primary)->u.nr_cmds; } struct drm_msm_gem_submit_cmd cmds[nr_cmds]; @@ -295,13 +62,13 @@ flush_submit_list(struct list_head *submit_list) * list, merge their bo tables into the last submit. 
*/ foreach_submit_safe (submit, submit_list) { - struct msm_ringbuffer_sp *deferred_primary = - to_msm_ringbuffer_sp(submit->primary); + struct fd_ringbuffer_sp *deferred_primary = + to_fd_ringbuffer_sp(submit->primary); for (unsigned i = 0; i < deferred_primary->u.nr_cmds; i++) { cmds[cmd_idx].type = MSM_SUBMIT_CMD_BUF; cmds[cmd_idx].submit_idx = - msm_submit_append_bo(msm_submit, deferred_primary->u.cmds[i].ring_bo); + fd_submit_append_bo(fd_submit, deferred_primary->u.cmds[i].ring_bo); cmds[cmd_idx].submit_offset = deferred_primary->offset; cmds[cmd_idx].size = deferred_primary->u.cmds[i].size; cmds[cmd_idx].pad = 0; @@ -318,13 +85,13 @@ flush_submit_list(struct list_head *submit_list) break; } - struct msm_submit_sp *msm_deferred_submit = to_msm_submit_sp(submit); - for (unsigned i = 0; i < msm_deferred_submit->nr_bos; i++) { + struct fd_submit_sp *fd_deferred_submit = to_fd_submit_sp(submit); + for (unsigned i = 0; i < fd_deferred_submit->nr_bos; i++) { /* Note: if bo is used in both the current submit and the deferred * submit being merged, we expect to hit the fast-path as we add it * to the current submit: */ - msm_submit_append_bo(msm_submit, msm_deferred_submit->bos[i]); + fd_submit_append_bo(fd_submit, fd_deferred_submit->bos[i]); } /* Now that the cmds/bos have been transfered over to the current submit, @@ -334,9 +101,9 @@ flush_submit_list(struct list_head *submit_list) fd_submit_del(submit); } - if (msm_submit->in_fence_fd != -1) { + if (fd_submit->in_fence_fd != -1) { req.flags |= MSM_SUBMIT_FENCE_FD_IN; - req.fence_fd = msm_submit->in_fence_fd; + req.fence_fd = fd_submit->in_fence_fd; msm_pipe->no_implicit_sync = true; } @@ -344,7 +111,7 @@ flush_submit_list(struct list_head *submit_list) req.flags |= MSM_SUBMIT_NO_IMPLICIT; } - if (msm_submit->out_fence && msm_submit->out_fence->use_fence_fd) { + if (fd_submit->out_fence && fd_submit->out_fence->use_fence_fd) { req.flags |= MSM_SUBMIT_FENCE_FD_OUT; } @@ -354,24 +121,24 @@ flush_submit_list(struct list_head *submit_list) * bound to limit on-stack allocation to 4k: */ const unsigned bo_limit = sizeof(struct drm_msm_gem_submit_bo) / 4096; - bool bos_on_stack = msm_submit->nr_bos < bo_limit; + bool bos_on_stack = fd_submit->nr_bos < bo_limit; struct drm_msm_gem_submit_bo - _submit_bos[bos_on_stack ? msm_submit->nr_bos : 0]; + _submit_bos[bos_on_stack ? 
fd_submit->nr_bos : 0]; struct drm_msm_gem_submit_bo *submit_bos; if (bos_on_stack) { submit_bos = _submit_bos; } else { - submit_bos = malloc(msm_submit->nr_bos * sizeof(submit_bos[0])); + submit_bos = malloc(fd_submit->nr_bos * sizeof(submit_bos[0])); } - for (unsigned i = 0; i < msm_submit->nr_bos; i++) { - submit_bos[i].flags = msm_submit->bos[i]->reloc_flags; - submit_bos[i].handle = msm_submit->bos[i]->handle; + for (unsigned i = 0; i < fd_submit->nr_bos; i++) { + submit_bos[i].flags = fd_submit->bos[i]->reloc_flags; + submit_bos[i].handle = fd_submit->bos[i]->handle; submit_bos[i].presumed = 0; } req.bos = VOID2U64(submit_bos); - req.nr_bos = msm_submit->nr_bos; + req.nr_bos = fd_submit->nr_bos; req.cmds = VOID2U64(cmds); req.nr_cmds = nr_cmds; @@ -382,472 +149,28 @@ flush_submit_list(struct list_head *submit_list) if (ret) { ERROR_MSG("submit failed: %d (%s)", ret, strerror(errno)); msm_dump_submit(&req); - } else if (!ret && msm_submit->out_fence) { - msm_submit->out_fence->fence.kfence = req.fence; - msm_submit->out_fence->fence.ufence = msm_submit->base.fence; - msm_submit->out_fence->fence_fd = req.fence_fd; + } else if (!ret && fd_submit->out_fence) { + fd_submit->out_fence->fence.kfence = req.fence; + fd_submit->out_fence->fence.ufence = fd_submit->base.fence; + fd_submit->out_fence->fence_fd = req.fence_fd; } if (!bos_on_stack) free(submit_bos); - pthread_mutex_lock(&flush_mtx); - assert(fd_fence_before(msm_pipe->base.last_submit_fence, msm_submit->base.fence)); - msm_pipe->base.last_submit_fence = msm_submit->base.fence; - pthread_cond_broadcast(&flush_cnd); - pthread_mutex_unlock(&flush_mtx); - - if (msm_submit->in_fence_fd != -1) - close(msm_submit->in_fence_fd); + if (fd_submit->in_fence_fd != -1) + close(fd_submit->in_fence_fd); return ret; } -static void -msm_submit_sp_flush_execute(void *job, void *gdata, int thread_index) -{ - struct fd_submit *submit = job; - struct msm_submit_sp *msm_submit = to_msm_submit_sp(submit); - - flush_submit_list(&msm_submit->submit_list); - - DEBUG_MSG("finish: %u", submit->fence); -} - -static void -msm_submit_sp_flush_cleanup(void *job, void *gdata, int thread_index) -{ - struct fd_submit *submit = job; - fd_submit_del(submit); -} - -static int -enqueue_submit_list(struct list_head *submit_list) -{ - struct fd_submit *submit = last_submit(submit_list); - struct msm_submit_sp *msm_submit = to_msm_submit_sp(submit); - - list_replace(submit_list, &msm_submit->submit_list); - list_inithead(submit_list); - - struct util_queue_fence *fence; - if (msm_submit->out_fence) { - fence = &msm_submit->out_fence->ready; - } else { - util_queue_fence_init(&msm_submit->fence); - fence = &msm_submit->fence; - } - - DEBUG_MSG("enqueue: %u", submit->fence); - - util_queue_add_job(&submit->pipe->dev->submit_queue, - submit, fence, - msm_submit_sp_flush_execute, - msm_submit_sp_flush_cleanup, - 0); - - return 0; -} - -static bool -should_defer(struct fd_submit *submit) -{ - struct msm_submit_sp *msm_submit = to_msm_submit_sp(submit); - - /* if too many bo's, it may not be worth the CPU cost of submit merging: */ - if (msm_submit->nr_bos > 30) - return false; - - /* On the kernel side, with 32K ringbuffer, we have an upper limit of 2k - * cmds before we exceed the size of the ringbuffer, which results in - * deadlock writing into the RB (ie. 
kernel doesn't finish writing into - * the RB so it doesn't kick the GPU to start consuming from the RB) - */ - if (submit->pipe->dev->deferred_cmds > 128) - return false; - - return true; -} - -static int -msm_submit_sp_flush(struct fd_submit *submit, int in_fence_fd, - struct fd_submit_fence *out_fence) -{ - struct fd_device *dev = submit->pipe->dev; - struct fd_pipe *pipe = submit->pipe; - - /* Acquire lock before flush_prep() because it is possible to race between - * this and pipe->flush(): - */ - simple_mtx_lock(&dev->submit_lock); - - /* If there are deferred submits from another fd_pipe, flush them now, - * since we can't merge submits from different submitqueue's (ie. they - * could have different priority, etc) - */ - if (!list_is_empty(&dev->deferred_submits) && - (last_submit(&dev->deferred_submits)->pipe != submit->pipe)) { - struct list_head submit_list; - - list_replace(&dev->deferred_submits, &submit_list); - list_inithead(&dev->deferred_submits); - dev->deferred_cmds = 0; - - enqueue_submit_list(&submit_list); - } - - list_addtail(&fd_submit_ref(submit)->node, &dev->deferred_submits); - - bool has_shared = msm_submit_sp_flush_prep(submit, in_fence_fd, out_fence); - - assert(fd_fence_before(pipe->last_enqueue_fence, submit->fence)); - pipe->last_enqueue_fence = submit->fence; - - /* If we don't need an out-fence, we can defer the submit. - * - * TODO we could defer submits with in-fence as well.. if we took our own - * reference to the fd, and merged all the in-fence-fd's when we flush the - * deferred submits - */ - if ((in_fence_fd == -1) && !out_fence && !has_shared && should_defer(submit)) { - DEBUG_MSG("defer: %u", submit->fence); - dev->deferred_cmds += fd_ringbuffer_cmd_count(submit->primary); - assert(dev->deferred_cmds == fd_dev_count_deferred_cmds(dev)); - simple_mtx_unlock(&dev->submit_lock); - - return 0; - } - - struct list_head submit_list; - - list_replace(&dev->deferred_submits, &submit_list); - list_inithead(&dev->deferred_submits); - dev->deferred_cmds = 0; - - simple_mtx_unlock(&dev->submit_lock); - - return enqueue_submit_list(&submit_list); -} - -void -msm_pipe_sp_flush(struct fd_pipe *pipe, uint32_t fence) -{ - struct fd_device *dev = pipe->dev; - struct list_head submit_list; - - DEBUG_MSG("flush: %u", fence); - - list_inithead(&submit_list); - - simple_mtx_lock(&dev->submit_lock); - - assert(!fd_fence_after(fence, pipe->last_enqueue_fence)); - - foreach_submit_safe (deferred_submit, &dev->deferred_submits) { - /* We should never have submits from multiple pipes in the deferred - * list. If we did, we couldn't compare their fence to our fence, - * since each fd_pipe is an independent timeline. 
- */ - if (deferred_submit->pipe != pipe) - break; - - if (fd_fence_after(deferred_submit->fence, fence)) - break; - - list_del(&deferred_submit->node); - list_addtail(&deferred_submit->node, &submit_list); - dev->deferred_cmds -= fd_ringbuffer_cmd_count(deferred_submit->primary); - } - - assert(dev->deferred_cmds == fd_dev_count_deferred_cmds(dev)); - - simple_mtx_unlock(&dev->submit_lock); - - if (list_is_empty(&submit_list)) - goto flush_sync; - - enqueue_submit_list(&submit_list); - -flush_sync: - /* Once we are sure that we've enqueued at least up to the requested - * submit, we need to be sure that submitq has caught up and flushed - * them to the kernel - */ - pthread_mutex_lock(&flush_mtx); - while (fd_fence_before(pipe->last_submit_fence, fence)) { - pthread_cond_wait(&flush_cnd, &flush_mtx); - } - pthread_mutex_unlock(&flush_mtx); -} - -static void -msm_submit_sp_destroy(struct fd_submit *submit) -{ - struct msm_submit_sp *msm_submit = to_msm_submit_sp(submit); - - if (msm_submit->suballoc_ring) - fd_ringbuffer_del(msm_submit->suballoc_ring); - - _mesa_hash_table_destroy(msm_submit->bo_table, NULL); - - // TODO it would be nice to have a way to debug_assert() if all - // rb's haven't been free'd back to the slab, because that is - // an indication that we are leaking bo's - slab_destroy_child(&msm_submit->ring_pool); - - for (unsigned i = 0; i < msm_submit->nr_bos; i++) - fd_bo_del(msm_submit->bos[i]); - - free(msm_submit->bos); - free(msm_submit); -} - -static const struct fd_submit_funcs submit_funcs = { - .new_ringbuffer = msm_submit_sp_new_ringbuffer, - .flush = msm_submit_sp_flush, - .destroy = msm_submit_sp_destroy, -}; - struct fd_submit * msm_submit_sp_new(struct fd_pipe *pipe) { - struct msm_submit_sp *msm_submit = calloc(1, sizeof(*msm_submit)); - struct fd_submit *submit; - - msm_submit->bo_table = _mesa_hash_table_create(NULL, _mesa_hash_pointer, - _mesa_key_pointer_equal); - - slab_create_child(&msm_submit->ring_pool, &pipe->ring_pool); - - submit = &msm_submit->base; - submit->funcs = &submit_funcs; - - return submit; -} - -void -msm_pipe_sp_ringpool_init(struct fd_pipe *pipe) -{ - // TODO tune size: - slab_create_parent(&pipe->ring_pool, sizeof(struct msm_ringbuffer_sp), 16); -} - -void -msm_pipe_sp_ringpool_fini(struct fd_pipe *pipe) -{ - if (pipe->ring_pool.num_elements) - slab_destroy_parent(&pipe->ring_pool); -} - -static void -finalize_current_cmd(struct fd_ringbuffer *ring) -{ - debug_assert(!(ring->flags & _FD_RINGBUFFER_OBJECT)); - - struct msm_ringbuffer_sp *msm_ring = to_msm_ringbuffer_sp(ring); - APPEND(&msm_ring->u, cmds, - (struct msm_cmd_sp){ - .ring_bo = fd_bo_ref(msm_ring->ring_bo), - .size = offset_bytes(ring->cur, ring->start), - }); -} - -static void -msm_ringbuffer_sp_grow(struct fd_ringbuffer *ring, uint32_t size) -{ - struct msm_ringbuffer_sp *msm_ring = to_msm_ringbuffer_sp(ring); - struct fd_pipe *pipe = msm_ring->u.submit->pipe; - - debug_assert(ring->flags & FD_RINGBUFFER_GROWABLE); - - finalize_current_cmd(ring); - - fd_bo_del(msm_ring->ring_bo); - msm_ring->ring_bo = fd_bo_new_ring(pipe->dev, size); - - ring->start = fd_bo_map(msm_ring->ring_bo); - ring->end = &(ring->start[size / 4]); - ring->cur = ring->start; - ring->size = size; -} - -static inline bool -msm_ringbuffer_references_bo(struct fd_ringbuffer *ring, struct fd_bo *bo) -{ - struct msm_ringbuffer_sp *msm_ring = to_msm_ringbuffer_sp(ring); - - for (int i = 0; i < msm_ring->u.nr_reloc_bos; i++) { - if (msm_ring->u.reloc_bos[i] == bo) - return true; - } - return false; -} - 
-#define PTRSZ 64 -#include "msm_ringbuffer_sp.h" -#undef PTRSZ -#define PTRSZ 32 -#include "msm_ringbuffer_sp.h" -#undef PTRSZ - -static uint32_t -msm_ringbuffer_sp_cmd_count(struct fd_ringbuffer *ring) -{ - if (ring->flags & FD_RINGBUFFER_GROWABLE) - return to_msm_ringbuffer_sp(ring)->u.nr_cmds + 1; - return 1; -} - -static bool -msm_ringbuffer_sp_check_size(struct fd_ringbuffer *ring) -{ - assert(!(ring->flags & _FD_RINGBUFFER_OBJECT)); - struct msm_ringbuffer_sp *msm_ring = to_msm_ringbuffer_sp(ring); - struct fd_submit *submit = msm_ring->u.submit; - - if (to_msm_submit_sp(submit)->nr_bos > MAX_ARRAY_SIZE/2) { - return false; - } - - return true; -} - -static void -msm_ringbuffer_sp_destroy(struct fd_ringbuffer *ring) -{ - struct msm_ringbuffer_sp *msm_ring = to_msm_ringbuffer_sp(ring); - - fd_bo_del(msm_ring->ring_bo); - - if (ring->flags & _FD_RINGBUFFER_OBJECT) { - for (unsigned i = 0; i < msm_ring->u.nr_reloc_bos; i++) { - fd_bo_del(msm_ring->u.reloc_bos[i]); - } - free(msm_ring->u.reloc_bos); - - free(msm_ring); - } else { - struct fd_submit *submit = msm_ring->u.submit; - - for (unsigned i = 0; i < msm_ring->u.nr_cmds; i++) { - fd_bo_del(msm_ring->u.cmds[i].ring_bo); - } - free(msm_ring->u.cmds); - - slab_free(&to_msm_submit_sp(submit)->ring_pool, msm_ring); - } -} - -static const struct fd_ringbuffer_funcs ring_funcs_nonobj_32 = { - .grow = msm_ringbuffer_sp_grow, - .emit_reloc = msm_ringbuffer_sp_emit_reloc_nonobj_32, - .emit_reloc_ring = msm_ringbuffer_sp_emit_reloc_ring_32, - .cmd_count = msm_ringbuffer_sp_cmd_count, - .check_size = msm_ringbuffer_sp_check_size, - .destroy = msm_ringbuffer_sp_destroy, -}; - -static const struct fd_ringbuffer_funcs ring_funcs_obj_32 = { - .grow = msm_ringbuffer_sp_grow, - .emit_reloc = msm_ringbuffer_sp_emit_reloc_obj_32, - .emit_reloc_ring = msm_ringbuffer_sp_emit_reloc_ring_32, - .cmd_count = msm_ringbuffer_sp_cmd_count, - .destroy = msm_ringbuffer_sp_destroy, -}; - -static const struct fd_ringbuffer_funcs ring_funcs_nonobj_64 = { - .grow = msm_ringbuffer_sp_grow, - .emit_reloc = msm_ringbuffer_sp_emit_reloc_nonobj_64, - .emit_reloc_ring = msm_ringbuffer_sp_emit_reloc_ring_64, - .cmd_count = msm_ringbuffer_sp_cmd_count, - .check_size = msm_ringbuffer_sp_check_size, - .destroy = msm_ringbuffer_sp_destroy, -}; - -static const struct fd_ringbuffer_funcs ring_funcs_obj_64 = { - .grow = msm_ringbuffer_sp_grow, - .emit_reloc = msm_ringbuffer_sp_emit_reloc_obj_64, - .emit_reloc_ring = msm_ringbuffer_sp_emit_reloc_ring_64, - .cmd_count = msm_ringbuffer_sp_cmd_count, - .destroy = msm_ringbuffer_sp_destroy, -}; - -static inline struct fd_ringbuffer * -msm_ringbuffer_sp_init(struct msm_ringbuffer_sp *msm_ring, uint32_t size, - enum fd_ringbuffer_flags flags) -{ - struct fd_ringbuffer *ring = &msm_ring->base; - /* We don't do any translation from internal FD_RELOC flags to MSM flags. 
*/ STATIC_ASSERT(FD_RELOC_READ == MSM_SUBMIT_BO_READ); STATIC_ASSERT(FD_RELOC_WRITE == MSM_SUBMIT_BO_WRITE); STATIC_ASSERT(FD_RELOC_DUMP == MSM_SUBMIT_BO_DUMP); - debug_assert(msm_ring->ring_bo); - - uint8_t *base = fd_bo_map(msm_ring->ring_bo); - ring->start = (void *)(base + msm_ring->offset); - ring->end = &(ring->start[size / 4]); - ring->cur = ring->start; - - ring->size = size; - ring->flags = flags; - - if (flags & _FD_RINGBUFFER_OBJECT) { - if (fd_dev_64b(&msm_ring->u.pipe->dev_id)) { - ring->funcs = &ring_funcs_obj_64; - } else { - ring->funcs = &ring_funcs_obj_32; - } - } else { - if (fd_dev_64b(&msm_ring->u.submit->pipe->dev_id)) { - ring->funcs = &ring_funcs_nonobj_64; - } else { - ring->funcs = &ring_funcs_nonobj_32; - } - } - - // TODO initializing these could probably be conditional on flags - // since unneed for FD_RINGBUFFER_STAGING case.. - msm_ring->u.cmds = NULL; - msm_ring->u.nr_cmds = msm_ring->u.max_cmds = 0; - - msm_ring->u.reloc_bos = NULL; - msm_ring->u.nr_reloc_bos = msm_ring->u.max_reloc_bos = 0; - - return ring; -} - -struct fd_ringbuffer * -msm_ringbuffer_sp_new_object(struct fd_pipe *pipe, uint32_t size) -{ - struct fd_device *dev = pipe->dev; - struct msm_ringbuffer_sp *msm_ring = malloc(sizeof(*msm_ring)); - - /* Lock access to the msm_pipe->suballoc_* since ringbuffer object allocation - * can happen both on the frontend (most CSOs) and the driver thread (a6xx - * cached tex state, for example) - */ - simple_mtx_lock(&dev->suballoc_lock); - - /* Maximum known alignment requirement is a6xx's TEX_CONST at 16 dwords */ - msm_ring->offset = align(dev->suballoc_offset, 64); - if (!dev->suballoc_bo || - msm_ring->offset + size > fd_bo_size(dev->suballoc_bo)) { - if (dev->suballoc_bo) - fd_bo_del(dev->suballoc_bo); - dev->suballoc_bo = - fd_bo_new_ring(dev, MAX2(SUBALLOC_SIZE, align(size, 4096))); - msm_ring->offset = 0; - } - - msm_ring->u.pipe = pipe; - msm_ring->ring_bo = fd_bo_ref(dev->suballoc_bo); - msm_ring->base.refcnt = 1; - - dev->suballoc_offset = msm_ring->offset + size; - - simple_mtx_unlock(&dev->suballoc_lock); - - return msm_ringbuffer_sp_init(msm_ring, size, _FD_RINGBUFFER_OBJECT); + return fd_submit_sp_new(pipe, flush_submit_list); } -- 2.7.4
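
For context on how the extracted base class is intended to be reused: a backend supplies only its flush_submit_list callback and wires the shared fd_submit_sp_*/fd_pipe_sp_*/fd_ringbuffer_sp_* entry points into its fd_pipe_funcs, exactly as the msm hunks above now do. The sketch below only illustrates that pattern; the example_* names are hypothetical placeholders (the actual virtio wiring is not part of this patch), and backend-specific details are elided.

   /* Hypothetical backend glue, mirroring msm_pipe.c / msm_ringbuffer_sp.c
    * after this change.  Only flush_submit_list is backend-specific; bo/cmd
    * tracking, submit merging and deferred flush handling all come from the
    * extracted freedreno_ringbuffer_sp base class.
    */
   #include "freedreno_ringbuffer_sp.h"

   static int
   example_flush_submit_list(struct list_head *submit_list)
   {
      /* Backend-specific: take the merged list of deferred submits and hand
       * it to the kernel (for msm this is the GEM submit ioctl; a virtio
       * backend would marshal the equivalent to the host instead).
       */
      return 0;
   }

   static struct fd_submit *
   example_submit_sp_new(struct fd_pipe *pipe)
   {
      /* The base class constructs the submit; the backend only provides the
       * final flush step, as msm_submit_sp_new() does above.
       */
      return fd_submit_sp_new(pipe, example_flush_submit_list);
   }

   static const struct fd_pipe_funcs example_sp_funcs = {
      .ringbuffer_new_object = fd_ringbuffer_sp_new_object,
      .submit_new = example_submit_sp_new,
      .flush = fd_pipe_sp_flush,
      /* plus the backend's own get_param/set_param/wait/destroy hooks */
   };

A backend following this pattern would also call fd_pipe_sp_ringpool_init()/fd_pipe_sp_ringpool_fini() from its pipe constructor and destructor, as msm_pipe_new() and msm_pipe_destroy() do after this change.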