#include <inttypes.h>
#include "util/hash_table.h"
+#include "util/os_file.h"
#include "util/slab.h"
#include "drm/freedreno_ringbuffer.h"
* so we can reclaim extra space at its end.
*/
struct fd_ringbuffer *suballoc_ring;
+
+ /* Flush args, potentially attached to the last submit in the list
+ * of submits to merge:
+ */
+ int in_fence_fd;
+ struct fd_submit_fence *out_fence;
};
FD_DEFINE_CAST(fd_submit, msm_submit_sp);
/* NOTE: it is legal to use the same bo on different threads for
* different submits. But it is not legal to use the same submit
- * from given threads.
+ * from different threads.
*/
idx = READ_ONCE(msm_bo->idx);
* 2) Add cmdstream bos to bos table
* 3) Update bo fences
*/
-static void
-msm_submit_sp_flush_prep(struct fd_submit *submit)
+/* Prepare a submit for flushing: finalize the current cmdstream, attach
+ * this submit's fence to every bo it references, and stash the flush args
+ * (in-fence fd and out-fence) on the submit so that a later, possibly
+ * deferred, flush can use them.
+ *
+ * Returns true if any bo in the submit's table is shared (exported), in
+ * which case the caller must flush immediately rather than defer, so that
+ * implicit sync with other processes is not delayed.
+ */
+static bool
+msm_submit_sp_flush_prep(struct fd_submit *submit, int in_fence_fd,
+ struct fd_submit_fence *out_fence)
{
struct msm_submit_sp *msm_submit = to_msm_submit_sp(submit);
+ bool has_shared = false;
finalize_current_cmd(submit->primary);
msm_submit_append_bo(msm_submit, primary->u.cmds[i].ring_bo);
simple_mtx_lock(&table_lock);
- for (unsigned i = 0; i < msm_submit->nr_bos; i++)
+ for (unsigned i = 0; i < msm_submit->nr_bos; i++) {
fd_bo_add_fence(msm_submit->bos[i], submit->pipe, submit->fence);
+ has_shared |= msm_submit->bos[i]->shared;
+ }
simple_mtx_unlock(&table_lock);
+
+ msm_submit->out_fence = out_fence;
+ /* Dup the caller's in-fence fd so it stays valid until the (possibly
+ * deferred) flush actually happens; flush_submit_list() closes it.
+ */
+ msm_submit->in_fence_fd = (in_fence_fd == -1) ?
+ -1 : os_dupfd_cloexec(in_fence_fd);
+
+ return has_shared;
}
+/* Flush a list of deferred submits as a single DRM_MSM_GEM_SUBMIT ioctl.
+ *
+ * All submits in the list must be from the same fd_pipe.  The cmds and bo
+ * tables of every earlier submit are merged into the *last* submit in the
+ * list, which carries the flush args (in_fence_fd/out_fence) set by
+ * flush_prep.  This function consumes the list: merged submits are
+ * unlinked and unref'd as we go, and the last submit's reference (taken
+ * when it was added to the deferred list) is dropped at the end.  It also
+ * closes the dup'd in-fence fd, if any.
+ */
static int
-msm_submit_sp_flush_finish(struct fd_submit *submit, int in_fence_fd,
- struct fd_submit_fence *out_fence)
+flush_submit_list(struct list_head *submit_list)
{
- struct msm_submit_sp *msm_submit = to_msm_submit_sp(submit);
- struct msm_pipe *msm_pipe = to_msm_pipe(submit->pipe);
+ struct msm_submit_sp *msm_submit = to_msm_submit_sp(last_submit(submit_list));
+ struct msm_pipe *msm_pipe = to_msm_pipe(msm_submit->base.pipe);
struct drm_msm_gem_submit req = {
.flags = msm_pipe->pipe,
.queueid = msm_pipe->queue_id,
};
int ret;
- struct msm_ringbuffer_sp *primary =
- to_msm_ringbuffer_sp(submit->primary);
- struct drm_msm_gem_submit_cmd cmds[primary->u.nr_cmds];
-
- for (unsigned i = 0; i < primary->u.nr_cmds; i++) {
- cmds[i].type = MSM_SUBMIT_CMD_BUF;
- cmds[i].submit_idx =
- msm_submit_append_bo(msm_submit, primary->u.cmds[i].ring_bo);
- cmds[i].submit_offset = primary->offset;
- cmds[i].size = primary->u.cmds[i].size;
- cmds[i].pad = 0;
- cmds[i].nr_relocs = 0;
+ unsigned nr_cmds = 0;
+
+ /* Determine the number of extra cmds from deferred submits that
+ * we will be merging in:
+ */
+ foreach_submit (submit, submit_list) {
+ assert(submit->pipe == &msm_pipe->base);
+ nr_cmds += to_msm_ringbuffer_sp(submit->primary)->u.nr_cmds;
+ }
+
+ /* NOTE(review): VLA sized by the merged cmd count; deferral is bounded
+ * by should_defer() so this should stay small — confirm worst case.
+ */
+ struct drm_msm_gem_submit_cmd cmds[nr_cmds];
+
+ unsigned cmd_idx = 0;
+
+ /* Build up the table of cmds, and for all but the last submit in the
+ * list, merge their bo tables into the last submit.
+ */
+ foreach_submit_safe (submit, submit_list) {
+ struct msm_ringbuffer_sp *deferred_primary =
+ to_msm_ringbuffer_sp(submit->primary);
+
+ for (unsigned i = 0; i < deferred_primary->u.nr_cmds; i++) {
+ cmds[cmd_idx].type = MSM_SUBMIT_CMD_BUF;
+ cmds[cmd_idx].submit_idx =
+ msm_submit_append_bo(msm_submit, deferred_primary->u.cmds[i].ring_bo);
+ cmds[cmd_idx].submit_offset = deferred_primary->offset;
+ cmds[cmd_idx].size = deferred_primary->u.cmds[i].size;
+ cmds[cmd_idx].pad = 0;
+ cmds[cmd_idx].nr_relocs = 0;
+
+ cmd_idx++;
+ }
+
+ /* We are merging all the submits in the list into the last submit,
+ * so the remainder of the loop body doesn't apply to the last submit
+ */
+ if (submit == last_submit(submit_list)) {
+ DEBUG_MSG("merged %u submits", cmd_idx);
+ break;
+ }
+
+ struct msm_submit_sp *msm_deferred_submit = to_msm_submit_sp(submit);
+ for (unsigned i = 0; i < msm_deferred_submit->nr_bos; i++) {
+ /* Note: if bo is used in both the current submit and the deferred
+ * submit being merged, we expect to hit the fast-path as we add it
+ * to the current submit:
+ */
+ msm_submit_append_bo(msm_submit, msm_deferred_submit->bos[i]);
+ }
+
+ /* Now that the cmds/bos have been transferred over to the current submit,
+ * we can remove the deferred submit from the list and drop its reference
+ */
+ list_del(&submit->node);
+ fd_submit_del(submit);
}
- if (in_fence_fd != -1) {
+ if (msm_submit->in_fence_fd != -1) {
req.flags |= MSM_SUBMIT_FENCE_FD_IN | MSM_SUBMIT_NO_IMPLICIT;
- req.fence_fd = in_fence_fd;
+ req.fence_fd = msm_submit->in_fence_fd;
}
- if (out_fence && out_fence->use_fence_fd) {
+ if (msm_submit->out_fence && msm_submit->out_fence->use_fence_fd) {
req.flags |= MSM_SUBMIT_FENCE_FD_OUT;
}
submit_bos[i].presumed = 0;
}
- req.bos = VOID2U64(submit_bos), req.nr_bos = msm_submit->nr_bos;
- req.cmds = VOID2U64(cmds), req.nr_cmds = primary->u.nr_cmds;
+ req.bos = VOID2U64(submit_bos);
+ req.nr_bos = msm_submit->nr_bos;
+ req.cmds = VOID2U64(cmds);
+ req.nr_cmds = nr_cmds;
DEBUG_MSG("nr_cmds=%u, nr_bos=%u", req.nr_cmds, req.nr_bos);
- ret = drmCommandWriteRead(submit->pipe->dev->fd, DRM_MSM_GEM_SUBMIT, &req,
+ ret = drmCommandWriteRead(msm_pipe->base.dev->fd, DRM_MSM_GEM_SUBMIT, &req,
sizeof(req));
if (ret) {
ERROR_MSG("submit failed: %d (%s)", ret, strerror(errno));
msm_dump_submit(&req);
- } else if (!ret && out_fence) {
- out_fence->fence.kfence = req.fence;
- out_fence->fence.ufence = submit->fence;
- out_fence->fence_fd = req.fence_fd;
+ } else if (!ret && msm_submit->out_fence) {
+ msm_submit->out_fence->fence.kfence = req.fence;
+ msm_submit->out_fence->fence.ufence = msm_submit->base.fence;
+ msm_submit->out_fence->fence_fd = req.fence_fd;
}
if (!bos_on_stack)
free(submit_bos);
+ /* We took our own reference to the in-fence fd in flush_prep: */
+ if (msm_submit->in_fence_fd != -1)
+ close(msm_submit->in_fence_fd);
+
+ /* Drop the reference taken when the submit entered the deferred list: */
+ fd_submit_del(&msm_submit->base);
+
return ret;
}
+/* Decide whether a submit may be queued on the device's deferred-submit
+ * list rather than flushed immediately.  Called with dev->submit_lock
+ * held (see msm_submit_sp_flush).
+ */
+static bool
+should_defer(struct fd_submit *submit)
+{
+ struct msm_submit_sp *msm_submit = to_msm_submit_sp(submit);
+
+ /* if too many bo's, it may not be worth the CPU cost of submit merging: */
+ if (msm_submit->nr_bos > 30)
+ return false;
+
+ /* On the kernel side, with 32K ringbuffer, we have an upper limit of 2k
+ * cmds before we exceed the size of the ringbuffer, which results in
+ * deadlock writing into the RB (ie. kernel doesn't finish writing into
+ * the RB so it doesn't kick the GPU to start consuming from the RB)
+ *
+ * NOTE(review): 128 is deliberately far below that 2k limit — it also
+ * bounds the size of the merged cmd table in flush_submit_list().
+ */
+ if (submit->pipe->dev->deferred_cmds > 128)
+ return false;
+
+ return true;
+}
+
+/* Flush entry point for a submit.  Instead of always submitting to the
+ * kernel immediately, the submit may be appended to the device's list of
+ * deferred submits so that multiple submits can later be merged into one
+ * DRM_MSM_GEM_SUBMIT ioctl (see flush_submit_list / should_defer).
+ */
static int
msm_submit_sp_flush(struct fd_submit *submit, int in_fence_fd,
struct fd_submit_fence *out_fence)
{
- msm_submit_sp_flush_prep(submit);
+ struct fd_device *dev = submit->pipe->dev;
+
+ /* Acquire lock before flush_prep() because it is possible to race between
+ * this and pipe->flush():
+ */
+ simple_mtx_lock(&dev->submit_lock);
+
+ /* If there are deferred submits from another fd_pipe, flush them now,
+ * since we can't merge submits from different submitqueue's (ie. they
+ * could have different priority, etc)
+ */
+ if (!list_is_empty(&dev->deferred_submits) &&
+ (last_submit(&dev->deferred_submits)->pipe != submit->pipe)) {
+ struct list_head submit_list;
+
+ /* Steal the whole deferred list under the lock, then flush it with
+ * the lock dropped (the ioctl can be slow):
+ */
+ list_replace(&dev->deferred_submits, &submit_list);
+ list_inithead(&dev->deferred_submits);
+ dev->deferred_cmds = 0;
+
+ simple_mtx_unlock(&dev->submit_lock);
+ flush_submit_list(&submit_list);
+ simple_mtx_lock(&dev->submit_lock);
+ }
+
+ /* The reference taken here is dropped by flush_submit_list(): */
+ list_addtail(&fd_submit_ref(submit)->node, &dev->deferred_submits);
+
+ bool has_shared = msm_submit_sp_flush_prep(submit, in_fence_fd, out_fence);
+
+ /* If we don't need an out-fence, we can defer the submit.
+ *
+ * TODO we could defer submits with in-fence as well.. if we took our own
+ * reference to the fd, and merged all the in-fence-fd's when we flush the
+ * deferred submits
+ */
+ if ((in_fence_fd == -1) && !out_fence && !has_shared && should_defer(submit)) {
+ dev->deferred_cmds += fd_ringbuffer_cmd_count(submit->primary);
+ assert(dev->deferred_cmds == fd_dev_count_deferred_cmds(dev));
+ simple_mtx_unlock(&dev->submit_lock);
+
+ return 0;
+ }
+
+ /* Otherwise flush the whole deferred list (including this submit) now: */
+ struct list_head submit_list;
+
+ list_replace(&dev->deferred_submits, &submit_list);
+ list_inithead(&dev->deferred_submits);
+ dev->deferred_cmds = 0;
+
+ simple_mtx_unlock(&dev->submit_lock);
+
+ return flush_submit_list(&submit_list);
+}
+
+/* Flush any deferred submits on this pipe whose (userspace) fence is at or
+ * before the requested fence value.  Submits past the requested fence, or
+ * belonging to a different pipe, are left deferred.
+ */
+void
+msm_pipe_sp_flush(struct fd_pipe *pipe, uint32_t fence)
+{
+ struct fd_device *dev = pipe->dev;
+ struct list_head submit_list;
+
+ list_inithead(&submit_list);
+
+ simple_mtx_lock(&dev->submit_lock);
+
+ foreach_submit_safe (deferred_submit, &dev->deferred_submits) {
+ /* We should never have submits from multiple pipes in the deferred
+ * list. If we did, we couldn't compare their fence to our fence,
+ * since each fd_pipe is an independent timeline.
+ */
+ if (deferred_submit->pipe != pipe)
+ break;
+
+ if (fd_fence_after(deferred_submit->fence, fence))
+ break;
+
+ /* Move the submit onto our local list; its reference transfers with
+ * it and is dropped by flush_submit_list():
+ */
+ list_del(&deferred_submit->node);
+ list_addtail(&deferred_submit->node, &submit_list);
+ dev->deferred_cmds -= fd_ringbuffer_cmd_count(deferred_submit->primary);
+ }
+
+ assert(dev->deferred_cmds == fd_dev_count_deferred_cmds(dev));
+
+ simple_mtx_unlock(&dev->submit_lock);
+
+ if (list_is_empty(&submit_list))
+ return;
- return msm_submit_sp_flush_finish(submit, in_fence_fd, out_fence);
+ flush_submit_list(&submit_list);
}
static void