From: Weinan Li
Date: Wed, 13 Dec 2017 02:47:00 +0000 (+0800)
Subject: drm/i915/gvt: optimize for vGPU mmio switch
X-Git-Tag: v4.19~298^2~46^2~859^2~6
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=e47107ad37c3774be9d5bf6fb4625c59e59f632c;p=platform%2Fkernel%2Flinux-rpi.git

drm/i915/gvt: optimize for vGPU mmio switch

Currently an mmio switch between vGPUs has to switch to the host first and
then to the target vGPU, which costs one extra round of mmio save/restore.
mmio reads/writes are usually time-consuming, and there are many MOCS
registers that need to be saved/restored during a vGPU switch. Combining
switch_mmio_to_host() and switch_mmio_to_vgpu() removes that extra round of
mmio save/restore, which lowers CPU utilization and improves performance
when multiple VMs run heavy workloads.

Signed-off-by: Weinan Li
Signed-off-by: Zhenyu Wang
---

diff --git a/drivers/gpu/drm/i915/gvt/mmio_context.c b/drivers/gpu/drm/i915/gvt/mmio_context.c
index 77d3a0d..5ad72fc 100644
--- a/drivers/gpu/drm/i915/gvt/mmio_context.c
+++ b/drivers/gpu/drm/i915/gvt/mmio_context.c
@@ -198,9 +198,10 @@ static void handle_tlb_pending_event(struct intel_vgpu *vgpu, int ring_id)
 	gvt_dbg_core("invalidate TLB for ring %d\n", ring_id);
 }
 
-static void load_mocs(struct intel_vgpu *vgpu, int ring_id)
+static void switch_mocs(struct intel_vgpu *pre, struct intel_vgpu *next,
+			int ring_id)
 {
-	struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv;
+	struct drm_i915_private *dev_priv;
 	i915_reg_t offset, l3_offset;
 	u32 regs[] = {
 		[RCS] = 0xc800,
@@ -211,54 +212,44 @@ static void load_mocs(struct intel_vgpu *vgpu, int ring_id)
 	};
 	int i;
 
+	dev_priv = pre ? pre->gvt->dev_priv : next->gvt->dev_priv;
 	if (WARN_ON(ring_id >= ARRAY_SIZE(regs)))
 		return;
 
 	offset.reg = regs[ring_id];
-	for (i = 0; i < 64; i++) {
-		gen9_render_mocs[ring_id][i] = I915_READ_FW(offset);
-		I915_WRITE_FW(offset, vgpu_vreg(vgpu, offset));
-		offset.reg += 4;
-	}
-
-	if (ring_id == RCS) {
-		l3_offset.reg = 0xb020;
-		for (i = 0; i < 32; i++) {
-			gen9_render_mocs_L3[i] = I915_READ_FW(l3_offset);
-			I915_WRITE_FW(l3_offset, vgpu_vreg(vgpu, l3_offset));
-			l3_offset.reg += 4;
-		}
-	}
-}
 
-static void restore_mocs(struct intel_vgpu *vgpu, int ring_id)
-{
-	struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv;
-	i915_reg_t offset, l3_offset;
-	u32 regs[] = {
-		[RCS] = 0xc800,
-		[VCS] = 0xc900,
-		[VCS2] = 0xca00,
-		[BCS] = 0xcc00,
-		[VECS] = 0xcb00,
-	};
-	int i;
+	for (i = 0; i < 64; i++) {
+		if (pre)
+			vgpu_vreg(pre, offset) =
+				I915_READ_FW(offset);
+		else
+			gen9_render_mocs[ring_id][i] =
+				I915_READ_FW(offset);
 
-	if (WARN_ON(ring_id >= ARRAY_SIZE(regs)))
-		return;
+		if (next)
+			I915_WRITE_FW(offset, vgpu_vreg(next, offset));
+		else
+			I915_WRITE_FW(offset, gen9_render_mocs[ring_id][i]);
 
-	offset.reg = regs[ring_id];
-	for (i = 0; i < 64; i++) {
-		vgpu_vreg(vgpu, offset) = I915_READ_FW(offset);
-		I915_WRITE_FW(offset, gen9_render_mocs[ring_id][i]);
 		offset.reg += 4;
 	}
 
 	if (ring_id == RCS) {
 		l3_offset.reg = 0xb020;
 		for (i = 0; i < 32; i++) {
-			vgpu_vreg(vgpu, l3_offset) = I915_READ_FW(l3_offset);
-			I915_WRITE_FW(l3_offset, gen9_render_mocs_L3[i]);
+			if (pre)
+				vgpu_vreg(pre, l3_offset) =
+					I915_READ_FW(l3_offset);
+			else
+				gen9_render_mocs_L3[i] =
+					I915_READ_FW(l3_offset);
+			if (next)
+				I915_WRITE_FW(l3_offset,
+					      vgpu_vreg(next, l3_offset));
+			else
+				I915_WRITE_FW(l3_offset,
+					      gen9_render_mocs_L3[i]);
+
 			l3_offset.reg += 4;
 		}
 	}
@@ -266,84 +257,77 @@ static void restore_mocs(struct intel_vgpu *vgpu, int ring_id)
 
 #define CTX_CONTEXT_CONTROL_VAL	0x03
 
-/* Switch ring mmio values (context) from host to a vgpu. */
-static void switch_mmio_to_vgpu(struct intel_vgpu *vgpu, int ring_id)
+/* Switch ring mmio values (context). */
+static void switch_mmio(struct intel_vgpu *pre,
+			struct intel_vgpu *next,
+			int ring_id)
 {
-	struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv;
-	struct intel_vgpu_submission *s = &vgpu->submission;
-	u32 *reg_state = s->shadow_ctx->engine[ring_id].lrc_reg_state;
-	u32 ctx_ctrl = reg_state[CTX_CONTEXT_CONTROL_VAL];
+	struct drm_i915_private *dev_priv;
+	struct intel_vgpu_submission *s;
+	u32 *reg_state, ctx_ctrl;
 	u32 inhibit_mask =
 		_MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
 	struct engine_mmio *mmio;
-	u32 v;
-
-	if (IS_SKYLAKE(dev_priv) || IS_KABYLAKE(dev_priv))
-		load_mocs(vgpu, ring_id);
-
-	mmio = vgpu->gvt->engine_mmio_list;
-	while (i915_mmio_reg_offset((mmio++)->reg)) {
-		if (mmio->ring_id != ring_id)
-			continue;
-
-		mmio->value = I915_READ_FW(mmio->reg);
-
-		/*
-		 * if it is an inhibit context, load in_context mmio
-		 * into HW by mmio write. If it is not, skip this mmio
-		 * write.
-		 */
-		if (mmio->in_context &&
-		    (ctx_ctrl & inhibit_mask) != inhibit_mask)
-			continue;
-
-		if (mmio->mask)
-			v = vgpu_vreg(vgpu, mmio->reg) | (mmio->mask << 16);
-		else
-			v = vgpu_vreg(vgpu, mmio->reg);
-
-		I915_WRITE_FW(mmio->reg, v);
-
-		trace_render_mmio(0, vgpu->id, "switch",
-				  i915_mmio_reg_offset(mmio->reg),
-				  mmio->value, v);
-	}
-
-	handle_tlb_pending_event(vgpu, ring_id);
-}
-
-/* Switch ring mmio values (context) from vgpu to host. */
-static void switch_mmio_to_host(struct intel_vgpu *vgpu, int ring_id)
-{
-	struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv;
-	struct engine_mmio *mmio;
-	u32 v;
+	u32 old_v, new_v;
 
+	dev_priv = pre ? pre->gvt->dev_priv : next->gvt->dev_priv;
 	if (IS_SKYLAKE(dev_priv) || IS_KABYLAKE(dev_priv))
-		restore_mocs(vgpu, ring_id);
+		switch_mocs(pre, next, ring_id);
 
-	mmio = vgpu->gvt->engine_mmio_list;
+	mmio = dev_priv->gvt->engine_mmio_list;
 	while (i915_mmio_reg_offset((mmio++)->reg)) {
 		if (mmio->ring_id != ring_id)
 			continue;
-
-		vgpu_vreg(vgpu, mmio->reg) = I915_READ_FW(mmio->reg);
-
-		if (mmio->mask) {
-			vgpu_vreg(vgpu, mmio->reg) &= ~(mmio->mask << 16);
-			v = mmio->value | (mmio->mask << 16);
+		// save
+		if (pre) {
+			vgpu_vreg(pre, mmio->reg) = I915_READ_FW(mmio->reg);
+			if (mmio->mask)
+				vgpu_vreg(pre, mmio->reg) &=
+					~(mmio->mask << 16);
+			old_v = vgpu_vreg(pre, mmio->reg);
 		} else
-			v = mmio->value;
-
-		if (mmio->in_context)
-			continue;
+			old_v = mmio->value = I915_READ_FW(mmio->reg);
+
+		// restore
+		if (next) {
+			s = &next->submission;
+			reg_state =
+				s->shadow_ctx->engine[ring_id].lrc_reg_state;
+			ctx_ctrl = reg_state[CTX_CONTEXT_CONTROL_VAL];
+			/*
+			 * if it is an inhibit context, load in_context mmio
+			 * into HW by mmio write. If it is not, skip this mmio
+			 * write.
+			 */
+			if (mmio->in_context &&
+			    (ctx_ctrl & inhibit_mask) != inhibit_mask)
+				continue;
+
+			if (mmio->mask)
+				new_v = vgpu_vreg(next, mmio->reg) |
+					(mmio->mask << 16);
+			else
+				new_v = vgpu_vreg(next, mmio->reg);
+		} else {
+			if (mmio->in_context)
+				continue;
+			if (mmio->mask)
+				new_v = mmio->value | (mmio->mask << 16);
+			else
+				new_v = mmio->value;
+		}
 
-		I915_WRITE_FW(mmio->reg, v);
+		I915_WRITE_FW(mmio->reg, new_v);
 
-		trace_render_mmio(vgpu->id, 0, "switch",
+		trace_render_mmio(pre ? pre->id : 0,
+				  next ? next->id : 0,
+				  "switch",
 				  i915_mmio_reg_offset(mmio->reg),
-				  mmio->value, v);
+				  old_v, new_v);
 	}
+
+	if (next)
+		handle_tlb_pending_event(next, ring_id);
 }
 
 /**
@@ -374,17 +358,7 @@ void intel_gvt_switch_mmio(struct intel_vgpu *pre,
 	 * handle forcewake mannually.
 	 */
 	intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
-
-	/**
-	 * TODO: Optimize for vGPU to vGPU switch by merging
-	 * switch_mmio_to_host() and switch_mmio_to_vgpu().
-	 */
-	if (pre)
-		switch_mmio_to_host(pre, ring_id);
-
-	if (next)
-		switch_mmio_to_vgpu(next, ring_id);
-
+	switch_mmio(pre, next, ring_id);
 	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
 }
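
Editor's note (not part of the patch): the commit message claims that merging the
host/vGPU switch paths saves one round of mmio save/restore per engine switch.
Below is a minimal userspace sketch, in plain C, that models only the register
traffic of the old two-pass flow versus the new single-pass flow. All names here
(mmio_read, mmio_write, switch_via_host, switch_direct, struct vgpu) are
hypothetical stand-ins for illustration; they are not the kernel APIs used in the
diff above.

/*
 * Model of the mmio traffic saved by the combined switch_mmio(pre, next) path.
 * mmio_read()/mmio_write() stand in for I915_READ_FW()/I915_WRITE_FW() and only
 * count accesses; NUM_REGS loosely models one engine's MOCS register block.
 */
#include <stdio.h>

#define NUM_REGS 64

static unsigned int reads, writes;

static unsigned int mmio_read(int reg)
{
	reads++;
	return (unsigned int)reg;	/* dummy value */
}

static void mmio_write(int reg, unsigned int val)
{
	(void)reg;
	(void)val;
	writes++;
}

struct vgpu {
	unsigned int vreg[NUM_REGS];	/* software copy of the vGPU's registers */
};

/* Old flow: vGPU -> host, then host -> vGPU: two full passes over the registers. */
static void switch_via_host(struct vgpu *pre, struct vgpu *next,
			    unsigned int *host_save)
{
	int i;

	for (i = 0; i < NUM_REGS; i++) {	/* like switch_mmio_to_host() */
		pre->vreg[i] = mmio_read(i);
		mmio_write(i, host_save[i]);
	}
	for (i = 0; i < NUM_REGS; i++) {	/* like switch_mmio_to_vgpu() */
		host_save[i] = mmio_read(i);
		mmio_write(i, next->vreg[i]);
	}
}

/* New flow: save pre's value and load next's value in a single pass. */
static void switch_direct(struct vgpu *pre, struct vgpu *next)
{
	int i;

	for (i = 0; i < NUM_REGS; i++) {
		pre->vreg[i] = mmio_read(i);	/* save */
		mmio_write(i, next->vreg[i]);	/* restore */
	}
}

int main(void)
{
	struct vgpu a = { {0} }, b = { {0} };
	unsigned int host_save[NUM_REGS] = {0};

	switch_via_host(&a, &b, host_save);
	printf("via host: %u reads, %u writes\n", reads, writes);

	reads = writes = 0;
	switch_direct(&a, &b);
	printf("direct:   %u reads, %u writes\n", reads, writes);
	return 0;
}

Running this prints twice as many reads and writes for the via-host flow
(128/128) as for the direct flow (64/64), which is the "reduce one round of
mmio save/restore" effect the commit message describes.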