drm/i915: use a separate context for gpu relocs

author Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>

Tue, 27 Aug 2019 18:58:05 +0000 (11:58 -0700)

committer Chris Wilson <chris@chris-wilson.co.uk>

Tue, 27 Aug 2019 20:14:43 +0000 (21:14 +0100)
author Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Tue, 27 Aug 2019 18:58:05 +0000 (11:58 -0700)
committer Chris Wilson <chris@chris-wilson.co.uk>
Tue, 27 Aug 2019 20:14:43 +0000 (21:14 +0100)
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c

index 5a2238d..8fbb454 100644 (file)
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -252,6 +252,7 @@ struct i915_execbuffer {
                 bool has_fence : 1;
                 bool needs_unfenced : 1;
  
+               struct intel_context *ce;
                 struct i915_request *rq;
                 u32 *rq_cmd;
                 unsigned int rq_size;
@@ -880,6 +881,9 @@ static void eb_destroy(const struct i915_execbuffer *eb)
  {
         GEM_BUG_ON(eb->reloc_cache.rq);
  
+       if (eb->reloc_cache.ce)
+               intel_context_put(eb->reloc_cache.ce);
+
         if (eb->lut_size > 0)
                 kfree(eb->buckets);
  }
@@ -903,6 +907,7 @@ static void reloc_cache_init(struct reloc_cache *cache,
         cache->has_fence = cache->gen < 4;
         cache->needs_unfenced = INTEL_INFO(i915)->unfenced_needs_alignment;
         cache->node.allocated = false;
+       cache->ce = NULL;
         cache->rq = NULL;
         cache->rq_size = 0;
  }
@@ -1168,7 +1173,7 @@ static int __reloc_gpu_alloc(struct i915_execbuffer *eb,
         if (err)
                 goto err_unmap;
  
-       rq = i915_request_create(eb->context);
+       rq = intel_context_create_request(cache->ce);
         if (IS_ERR(rq)) {
                 err = PTR_ERR(rq);
                 goto err_unpin;
@@ -1239,6 +1244,29 @@ static u32 *reloc_gpu(struct i915_execbuffer *eb,
                 if (!intel_engine_can_store_dword(eb->engine))
                         return ERR_PTR(-ENODEV);
  
+               if (!cache->ce) {
+                       struct intel_context *ce;
+
+                       /*
+                        * The CS pre-parser can pre-fetch commands across
+                        * memory sync points and, starting with gen12, it is
+                        * able to pre-fetch across BB_START and BB_END
+                        * boundaries (within the same context). We therefore
+                        * use a separate context on gen12+ to guarantee that
+                        * the reloc writes land before the parser gets to the
+                        * target memory location.
+                        */
+                       if (cache->gen >= 12)
+                               ce = intel_context_create(eb->context->gem_context,
+                                                         eb->engine);
+                       else
+                               ce = intel_context_get(eb->context);
+                       if (IS_ERR(ce))
+                               return ERR_CAST(ce);
+
+                       cache->ce = ce;
+               }
+
                 err = __reloc_gpu_alloc(eb, vma, len);
                 if (unlikely(err))
                         return ERR_PTR(err);
diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c

index cfbdcca..a141e9e 100644 (file)
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -2931,6 +2931,24 @@ static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
         return gen8_emit_fini_breadcrumb_footer(request, cs);
  }
  
+/*
+ * Note that the CS instruction pre-parser will not stall on the breadcrumb
+ * flush and will continue pre-fetching the instructions after it before the
+ * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at
+ * BB_START/END instructions, so, even though we might pre-fetch the preamble
+ * of the next request before the memory has been flushed, we're guaranteed that
+ * we won't access the batch itself too early.
+ * However, on gen12+ the parser can pre-fetch across the BB_START/END commands,
+ * so, if the current request is modifying an instruction in the next request on
+ * the same intel_context, we might pre-fetch and then execute the pre-update
+ * instruction. To avoid this, the users of self-modifying code should either
+ * disable the parser around the code emitting the memory writes, via a new flag
+ * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For
+ * the in-kernel use-cases we've opted to use a separate context, see
+ * reloc_gpu() as an example.
+ * All the above applies only to the instructions themselves. Non-inline data
+ * used by the instructions is not pre-fetched.
+ */
  static u32 *gen11_emit_fini_breadcrumb_rcs(struct i915_request *request,
                                            u32 *cs)
  {
author	Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
	Tue, 27 Aug 2019 18:58:05 +0000 (11:58 -0700)
committer	Chris Wilson <chris@chris-wilson.co.uk>
	Tue, 27 Aug 2019 20:14:43 +0000 (21:14 +0100)
drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c		patch \| blob \| history
drivers/gpu/drm/i915/gt/intel_lrc.c		patch \| blob \| history