From c0bb487dc19fc45dbeede7dcf8f513df51a3cd33 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 23 Sep 2019 12:00:55 +0100 Subject: [PATCH] drm/i915: Only enqueue already completed requests If we are asked to submit a completed request, just move it onto the active-list without modifying it's payload. If we try to emit the modified payload of a completed request, we risk racing with the ring->head update during retirement which may advance the head past our breadcrumb and so we generate a warning for the emission being behind the RING_HEAD. v2: Commentary for the sneaky, shared responsibility between functions. v3: Spelling mistakes and bonus assertion Signed-off-by: Chris Wilson Cc: Tvrtko Ursulin Reviewed-by: Tvrtko Ursulin Link: https://patchwork.freedesktop.org/patch/msgid/20190923110056.15176-3-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/gt/intel_lrc.c | 66 +++++++++++++++++++++++-------------- drivers/gpu/drm/i915/i915_request.c | 44 ++++++++++++++++++------- drivers/gpu/drm/i915/i915_request.h | 2 +- 3 files changed, 74 insertions(+), 38 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c index 0a4812e..bc9badd 100644 --- a/drivers/gpu/drm/i915/gt/intel_lrc.c +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c @@ -799,6 +799,17 @@ static bool can_merge_rq(const struct i915_request *prev, GEM_BUG_ON(prev == next); GEM_BUG_ON(!assert_priority_queue(prev, next)); + /* + * We do not submit known completed requests. Therefore if the next + * request is already completed, we can pretend to merge it in + * with the previous context (and we will skip updating the ELSP + * and tracking). Thus hopefully keeping the ELSP full with active + * contexts, despite the best efforts of preempt-to-busy to confuse + * us. + */ + if (i915_request_completed(next)) + return true; + if (!can_merge_ctx(prev->hw_context, next->hw_context)) return false; @@ -1181,21 +1192,6 @@ static void execlists_dequeue(struct intel_engine_cs *engine) continue; } - if (i915_request_completed(rq)) { - ve->request = NULL; - ve->base.execlists.queue_priority_hint = INT_MIN; - rb_erase_cached(rb, &execlists->virtual); - RB_CLEAR_NODE(rb); - - rq->engine = engine; - __i915_request_submit(rq); - - spin_unlock(&ve->base.active.lock); - - rb = rb_first_cached(&execlists->virtual); - continue; - } - if (last && !can_merge_rq(last, rq)) { spin_unlock(&ve->base.active.lock); return; /* leave this for another */ @@ -1249,11 +1245,23 @@ static void execlists_dequeue(struct intel_engine_cs *engine) GEM_BUG_ON(ve->siblings[0] != engine); } - __i915_request_submit(rq); - if (!i915_request_completed(rq)) { + if (__i915_request_submit(rq)) { submit = true; last = rq; } + + /* + * Hmm, we have a bunch of virtual engine requests, + * but the first one was already completed (thanks + * preempt-to-busy!). Keep looking at the veng queue + * until we have no more relevant requests (i.e. + * the normal submit queue has higher priority). + */ + if (!submit) { + spin_unlock(&ve->base.active.lock); + rb = rb_first_cached(&execlists->virtual); + continue; + } } spin_unlock(&ve->base.active.lock); @@ -1266,8 +1274,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) int i; priolist_for_each_request_consume(rq, rn, p, i) { - if (i915_request_completed(rq)) - goto skip; + bool merge = true; /* * Can we combine this request with the current port? @@ -1308,14 +1315,23 @@ static void execlists_dequeue(struct intel_engine_cs *engine) ctx_single_port_submission(rq->hw_context)) goto done; - *port = execlists_schedule_in(last, port - execlists->pending); - port++; + merge = false; } - last = rq; - submit = true; -skip: - __i915_request_submit(rq); + if (__i915_request_submit(rq)) { + if (!merge) { + *port = execlists_schedule_in(last, port - execlists->pending); + port++; + last = NULL; + } + + GEM_BUG_ON(last && + !can_merge_ctx(last->hw_context, + rq->hw_context)); + + submit = true; + last = rq; + } } rb_erase_cached(&p->node, &execlists->queue); diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c index 9bd8538..a891641 100644 --- a/drivers/gpu/drm/i915/i915_request.c +++ b/drivers/gpu/drm/i915/i915_request.c @@ -377,9 +377,10 @@ __i915_request_await_execution(struct i915_request *rq, return 0; } -void __i915_request_submit(struct i915_request *request) +bool __i915_request_submit(struct i915_request *request) { struct intel_engine_cs *engine = request->engine; + bool result = false; GEM_TRACE("%s fence %llx:%lld, current %d\n", engine->name, @@ -389,6 +390,25 @@ void __i915_request_submit(struct i915_request *request) GEM_BUG_ON(!irqs_disabled()); lockdep_assert_held(&engine->active.lock); + /* + * With the advent of preempt-to-busy, we frequently encounter + * requests that we have unsubmitted from HW, but left running + * until the next ack and so have completed in the meantime. On + * resubmission of that completed request, we can skip + * updating the payload, and execlists can even skip submitting + * the request. + * + * We must remove the request from the caller's priority queue, + * and the caller must only call us when the request is in their + * priority queue, under the active.lock. This ensures that the + * request has *not* yet been retired and we can safely move + * the request into the engine->active.list where it will be + * dropped upon retiring. (Otherwise if resubmit a *retired* + * request, this would be a horrible use-after-free.) + */ + if (i915_request_completed(request)) + goto xfer; + if (i915_gem_context_is_banned(request->gem_context)) i915_request_skip(request, -EIO); @@ -412,13 +432,18 @@ void __i915_request_submit(struct i915_request *request) i915_sw_fence_signaled(&request->semaphore)) engine->saturated |= request->sched.semaphores; - /* We may be recursing from the signal callback of another i915 fence */ - spin_lock_nested(&request->lock, SINGLE_DEPTH_NESTING); + engine->emit_fini_breadcrumb(request, + request->ring->vaddr + request->postfix); + + trace_i915_request_execute(request); + engine->serial++; + result = true; - list_move_tail(&request->sched.link, &engine->active.requests); +xfer: /* We may be recursing from the signal callback of another i915 fence */ + spin_lock_nested(&request->lock, SINGLE_DEPTH_NESTING); - GEM_BUG_ON(test_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags)); - set_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags); + if (!test_and_set_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags)) + list_move_tail(&request->sched.link, &engine->active.requests); if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags) && !test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &request->fence.flags) && @@ -429,12 +454,7 @@ void __i915_request_submit(struct i915_request *request) spin_unlock(&request->lock); - engine->emit_fini_breadcrumb(request, - request->ring->vaddr + request->postfix); - - engine->serial++; - - trace_i915_request_execute(request); + return result; } void i915_request_submit(struct i915_request *request) diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h index b18f495..ec5bb4c 100644 --- a/drivers/gpu/drm/i915/i915_request.h +++ b/drivers/gpu/drm/i915/i915_request.h @@ -292,7 +292,7 @@ int i915_request_await_execution(struct i915_request *rq, void i915_request_add(struct i915_request *rq); -void __i915_request_submit(struct i915_request *request); +bool __i915_request_submit(struct i915_request *request); void i915_request_submit(struct i915_request *request); void i915_request_skip(struct i915_request *request, int error); -- 2.7.4