drivers/gpu/drm/i915/gt/intel_workarounds.c
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2014-2018 Intel Corporation
4  */
5
6 #include "i915_drv.h"
7 #include "i915_reg.h"
8 #include "intel_context.h"
9 #include "intel_engine_pm.h"
10 #include "intel_engine_regs.h"
11 #include "intel_gpu_commands.h"
12 #include "intel_gt.h"
13 #include "intel_gt_mcr.h"
14 #include "intel_gt_regs.h"
15 #include "intel_ring.h"
16 #include "intel_workarounds.h"
17
18 /**
19  * DOC: Hardware workarounds
20  *
21  * This file is intended as a central place to implement most [1]_ of the
22  * required workarounds for hardware to work as originally intended. They fall
23  * into five basic categories depending on how/when they are applied:
24  *
25  * - Workarounds that touch registers that are saved/restored to/from the HW
26  *   context image. The list is emitted (via Load Register Immediate commands)
27  *   every time a new context is created.
28  * - GT workarounds. The list of these WAs is applied whenever these registers
29  *   revert to default values (on GPU reset, suspend/resume [2]_, etc.).
30  * - Display workarounds. The list is applied during display clock-gating
31  *   initialization.
32  * - Workarounds that whitelist a privileged register, so that UMDs can manage
33  *   them directly. This is just a special case of an MMIO workaround (as we
34  *   write the list of these to-be-whitelisted registers to some special HW
35  *   registers).
36  * - Workaround batchbuffers, that get executed automatically by the hardware
37  *   on every HW context restore.
38  *
39  * .. [1] Please notice that there are other WAs that, due to their nature,
40  *    cannot be applied from a central place. Those are peppered around the rest
41  *    of the code, as needed.
42  *
43  * .. [2] Technically, some registers are power-context saved & restored, so they
44  *    survive a suspend/resume. In practice, writing them again is not too
45  *    costly and simplifies things. We can revisit this in the future.
46  *
47  * Layout
48  * ~~~~~~
49  *
50  * Keep things in this file ordered by WA type, as per the above (context, GT,
51  * display, register whitelist, batchbuffer). Then, inside each type, keep the
52  * following order:
53  *
54  * - Infrastructure functions and macros
55  * - WAs per platform in standard gen/chrono order
56  * - Public functions to init or apply the given workaround type.
57  */
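/*
 * Illustrative sketch only (not part of the driver): an entry in one of the
 * per-platform init functions below typically pairs a platform or stepping
 * check with one of the wa_*() helpers, for example (hypothetical macro and
 * register names):
 *
 *	if (IS_SOME_PLATFORM(i915))
 *		wa_masked_en(wal, SOME_CHICKEN_REG, SOME_CHICKEN_BIT);
 *
 * Context workarounds collected this way are emitted as a Load Register
 * Immediate stream by intel_engine_emit_ctx_wa(), while GT workarounds are
 * written directly to the MMIO registers when their list is applied.
 */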
58
59 static void wa_init_start(struct i915_wa_list *wal, const char *name, const char *engine_name)
60 {
61         wal->name = name;
62         wal->engine_name = engine_name;
63 }
64
65 #define WA_LIST_CHUNK (1 << 4)
66
67 static void wa_init_finish(struct i915_wa_list *wal)
68 {
69         /* Trim unused entries. */
70         if (!IS_ALIGNED(wal->count, WA_LIST_CHUNK)) {
71                 struct i915_wa *list = kmemdup(wal->list,
72                                                wal->count * sizeof(*list),
73                                                GFP_KERNEL);
74
75                 if (list) {
76                         kfree(wal->list);
77                         wal->list = list;
78                 }
79         }
80
81         if (!wal->count)
82                 return;
83
84         DRM_DEBUG_DRIVER("Initialized %u %s workarounds on %s\n",
85                          wal->wa_count, wal->name, wal->engine_name);
86 }
87
88 static void _wa_add(struct i915_wa_list *wal, const struct i915_wa *wa)
89 {
90         unsigned int addr = i915_mmio_reg_offset(wa->reg);
91         unsigned int start = 0, end = wal->count;
92         const unsigned int grow = WA_LIST_CHUNK;
93         struct i915_wa *wa_;
94
95         GEM_BUG_ON(!is_power_of_2(grow));
96
97         if (IS_ALIGNED(wal->count, grow)) { /* Either uninitialized or full. */
98                 struct i915_wa *list;
99
100                 list = kmalloc_array(ALIGN(wal->count + 1, grow), sizeof(*wa),
101                                      GFP_KERNEL);
102                 if (!list) {
103                         DRM_ERROR("No space for workaround init!\n");
104                         return;
105                 }
106
107                 if (wal->list) {
108                         memcpy(list, wal->list, sizeof(*wa) * wal->count);
109                         kfree(wal->list);
110                 }
111
112                 wal->list = list;
113         }
114
115         while (start < end) {
116                 unsigned int mid = start + (end - start) / 2;
117
118                 if (i915_mmio_reg_offset(wal->list[mid].reg) < addr) {
119                         start = mid + 1;
120                 } else if (i915_mmio_reg_offset(wal->list[mid].reg) > addr) {
121                         end = mid;
122                 } else {
123                         wa_ = &wal->list[mid];
124
125                         if ((wa->clr | wa_->clr) && !(wa->clr & ~wa_->clr)) {
126                                 DRM_ERROR("Discarding overwritten w/a for reg %04x (clear: %08x, set: %08x)\n",
127                                           i915_mmio_reg_offset(wa_->reg),
128                                           wa_->clr, wa_->set);
129
130                                 wa_->set &= ~wa->clr;
131                         }
132
133                         wal->wa_count++;
134                         wa_->set |= wa->set;
135                         wa_->clr |= wa->clr;
136                         wa_->read |= wa->read;
137                         return;
138                 }
139         }
140
141         wal->wa_count++;
142         wa_ = &wal->list[wal->count++];
143         *wa_ = *wa;
144
145         while (wa_-- > wal->list) {
146                 GEM_BUG_ON(i915_mmio_reg_offset(wa_[0].reg) ==
147                            i915_mmio_reg_offset(wa_[1].reg));
148                 if (i915_mmio_reg_offset(wa_[1].reg) >
149                     i915_mmio_reg_offset(wa_[0].reg))
150                         break;
151
152                 swap(wa_[1], wa_[0]);
153         }
154 }
155
156 static void wa_add(struct i915_wa_list *wal, i915_reg_t reg,
157                    u32 clear, u32 set, u32 read_mask, bool masked_reg)
158 {
159         struct i915_wa wa = {
160                 .reg  = reg,
161                 .clr  = clear,
162                 .set  = set,
163                 .read = read_mask,
164                 .masked_reg = masked_reg,
165         };
166
167         _wa_add(wal, &wa);
168 }
169
170 static void wa_mcr_add(struct i915_wa_list *wal, i915_mcr_reg_t reg,
171                        u32 clear, u32 set, u32 read_mask, bool masked_reg)
172 {
173         struct i915_wa wa = {
174                 .mcr_reg = reg,
175                 .clr  = clear,
176                 .set  = set,
177                 .read = read_mask,
178                 .masked_reg = masked_reg,
179                 .is_mcr = 1,
180         };
181
182         _wa_add(wal, &wa);
183 }
184
185 static void
186 wa_write_clr_set(struct i915_wa_list *wal, i915_reg_t reg, u32 clear, u32 set)
187 {
188         wa_add(wal, reg, clear, set, clear, false);
189 }
190
191 static void
192 wa_mcr_write_clr_set(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 clear, u32 set)
193 {
194         wa_mcr_add(wal, reg, clear, set, clear, false);
195 }
196
197 static void
198 wa_write(struct i915_wa_list *wal, i915_reg_t reg, u32 set)
199 {
200         wa_write_clr_set(wal, reg, ~0, set);
201 }
202
203 static void
204 wa_write_or(struct i915_wa_list *wal, i915_reg_t reg, u32 set)
205 {
206         wa_write_clr_set(wal, reg, set, set);
207 }
208
209 static void
210 wa_mcr_write_or(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 set)
211 {
212         wa_mcr_write_clr_set(wal, reg, set, set);
213 }
214
215 static void
216 wa_write_clr(struct i915_wa_list *wal, i915_reg_t reg, u32 clr)
217 {
218         wa_write_clr_set(wal, reg, clr, 0);
219 }
220
221 static void
222 wa_mcr_write_clr(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 clr)
223 {
224         wa_mcr_write_clr_set(wal, reg, clr, 0);
225 }
226
227 /*
228  * WA operations on "masked register". A masked register has the upper 16 bits
229  * documented as "masked" in b-spec. Its purpose is to allow writing to just a
230  * portion of the register without a rmw: you simply write in the upper 16 bits
231  * the mask of bits you are going to modify.
232  *
233  * The wa_masked_* family of functions already does the necessary operations to
234  * calculate the mask based on the parameters passed, so the user only has to
235  * provide the lower 16 bits of that register.
236  */
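/*
 * A minimal sketch of the encoding, using bit 0 purely as an example:
 * wa_masked_en(wal, reg, BIT(0)) records a write of
 * _MASKED_BIT_ENABLE(BIT(0)) == (BIT(0) << 16) | BIT(0) == 0x00010001.
 * The upper halfword selects which of the low 16 bits the hardware updates
 * and the lower halfword carries their new values, so no read-modify-write
 * cycle is needed.
 */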
237
238 static void
239 wa_masked_en(struct i915_wa_list *wal, i915_reg_t reg, u32 val)
240 {
241         wa_add(wal, reg, 0, _MASKED_BIT_ENABLE(val), val, true);
242 }
243
244 static void
245 wa_mcr_masked_en(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 val)
246 {
247         wa_mcr_add(wal, reg, 0, _MASKED_BIT_ENABLE(val), val, true);
248 }
249
250 static void
251 wa_masked_dis(struct i915_wa_list *wal, i915_reg_t reg, u32 val)
252 {
253         wa_add(wal, reg, 0, _MASKED_BIT_DISABLE(val), val, true);
254 }
255
256 static void
257 wa_mcr_masked_dis(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 val)
258 {
259         wa_mcr_add(wal, reg, 0, _MASKED_BIT_DISABLE(val), val, true);
260 }
261
262 static void
263 wa_masked_field_set(struct i915_wa_list *wal, i915_reg_t reg,
264                     u32 mask, u32 val)
265 {
266         wa_add(wal, reg, 0, _MASKED_FIELD(mask, val), mask, true);
267 }
268
269 static void
270 wa_mcr_masked_field_set(struct i915_wa_list *wal, i915_mcr_reg_t reg,
271                         u32 mask, u32 val)
272 {
273         wa_mcr_add(wal, reg, 0, _MASKED_FIELD(mask, val), mask, true);
274 }
275
276 static void gen6_ctx_workarounds_init(struct intel_engine_cs *engine,
277                                       struct i915_wa_list *wal)
278 {
279         wa_masked_en(wal, INSTPM, INSTPM_FORCE_ORDERING);
280 }
281
282 static void gen7_ctx_workarounds_init(struct intel_engine_cs *engine,
283                                       struct i915_wa_list *wal)
284 {
285         wa_masked_en(wal, INSTPM, INSTPM_FORCE_ORDERING);
286 }
287
288 static void gen8_ctx_workarounds_init(struct intel_engine_cs *engine,
289                                       struct i915_wa_list *wal)
290 {
291         wa_masked_en(wal, INSTPM, INSTPM_FORCE_ORDERING);
292
293         /* WaDisableAsyncFlipPerfMode:bdw,chv */
294         wa_masked_en(wal, RING_MI_MODE(RENDER_RING_BASE), ASYNC_FLIP_PERF_DISABLE);
295
296         /* WaDisablePartialInstShootdown:bdw,chv */
297         wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN,
298                          PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE);
299
300         /* Use Force Non-Coherent whenever executing a 3D context. This is a
301          * workaround for a possible hang in the unlikely event a TLB
302          * invalidation occurs during a PSD flush.
303          */
304         /* WaForceEnableNonCoherent:bdw,chv */
305         /* WaHdcDisableFetchWhenMasked:bdw,chv */
306         wa_masked_en(wal, HDC_CHICKEN0,
307                      HDC_DONOT_FETCH_MEM_WHEN_MASKED |
308                      HDC_FORCE_NON_COHERENT);
309
310         /* From the Haswell PRM, Command Reference: Registers, CACHE_MODE_0:
311          * "The Hierarchical Z RAW Stall Optimization allows non-overlapping
312          *  polygons in the same 8x4 pixel/sample area to be processed without
313          *  stalling waiting for the earlier ones to write to Hierarchical Z
314          *  buffer."
315          *
316          * This optimization is off by default for BDW and CHV; turn it on.
317          */
318         wa_masked_dis(wal, CACHE_MODE_0_GEN7, HIZ_RAW_STALL_OPT_DISABLE);
319
320         /* Wa4x4STCOptimizationDisable:bdw,chv */
321         wa_masked_en(wal, CACHE_MODE_1, GEN8_4x4_STC_OPTIMIZATION_DISABLE);
322
323         /*
324          * BSpec recommends 8x4 when MSAA is used,
325          * however in practice 16x4 seems fastest.
326          *
327          * Note that PS/WM thread counts depend on the WIZ hashing
328          * disable bit, which we don't touch here, but it's good
329          * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
330          */
331         wa_masked_field_set(wal, GEN7_GT_MODE,
332                             GEN6_WIZ_HASHING_MASK,
333                             GEN6_WIZ_HASHING_16x4);
334 }
335
336 static void bdw_ctx_workarounds_init(struct intel_engine_cs *engine,
337                                      struct i915_wa_list *wal)
338 {
339         struct drm_i915_private *i915 = engine->i915;
340
341         gen8_ctx_workarounds_init(engine, wal);
342
343         /* WaDisableThreadStallDopClockGating:bdw (pre-production) */
344         wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN, STALL_DOP_GATING_DISABLE);
345
346         /* WaDisableDopClockGating:bdw
347          *
348          * Also see the related UCGTCL1 write in bdw_init_clock_gating()
349          * to disable EUTC clock gating.
350          */
351         wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN2,
352                          DOP_CLOCK_GATING_DISABLE);
353
354         wa_mcr_masked_en(wal, GEN8_HALF_SLICE_CHICKEN3,
355                          GEN8_SAMPLER_POWER_BYPASS_DIS);
356
357         wa_masked_en(wal, HDC_CHICKEN0,
358                      /* WaForceContextSaveRestoreNonCoherent:bdw */
359                      HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT |
360                      /* WaDisableFenceDestinationToSLM:bdw (pre-prod) */
361                      (IS_BDW_GT3(i915) ? HDC_FENCE_DEST_SLM_DISABLE : 0));
362 }
363
364 static void chv_ctx_workarounds_init(struct intel_engine_cs *engine,
365                                      struct i915_wa_list *wal)
366 {
367         gen8_ctx_workarounds_init(engine, wal);
368
369         /* WaDisableThreadStallDopClockGating:chv */
370         wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN, STALL_DOP_GATING_DISABLE);
371
372         /* Improve HiZ throughput on CHV. */
373         wa_masked_en(wal, HIZ_CHICKEN, CHV_HZ_8X8_MODE_IN_1X);
374 }
375
376 static void gen9_ctx_workarounds_init(struct intel_engine_cs *engine,
377                                       struct i915_wa_list *wal)
378 {
379         struct drm_i915_private *i915 = engine->i915;
380
381         if (HAS_LLC(i915)) {
382                 /* WaCompressedResourceSamplerPbeMediaNewHashMode:skl,kbl
383                  *
384                  * Must match Display Engine. See
385                  * WaCompressedResourceDisplayNewHashMode.
386                  */
387                 wa_masked_en(wal, COMMON_SLICE_CHICKEN2,
388                              GEN9_PBE_COMPRESSED_HASH_SELECTION);
389                 wa_mcr_masked_en(wal, GEN9_HALF_SLICE_CHICKEN7,
390                                  GEN9_SAMPLER_HASH_COMPRESSED_READ_ADDR);
391         }
392
393         /* WaClearFlowControlGpgpuContextSave:skl,bxt,kbl,glk,cfl */
394         /* WaDisablePartialInstShootdown:skl,bxt,kbl,glk,cfl */
395         wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN,
396                          FLOW_CONTROL_ENABLE |
397                          PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE);
398
399         /* WaEnableYV12BugFixInHalfSliceChicken7:skl,bxt,kbl,glk,cfl */
400         /* WaEnableSamplerGPGPUPreemptionSupport:skl,bxt,kbl,cfl */
401         wa_mcr_masked_en(wal, GEN9_HALF_SLICE_CHICKEN7,
402                          GEN9_ENABLE_YV12_BUGFIX |
403                          GEN9_ENABLE_GPGPU_PREEMPTION);
404
405         /* Wa4x4STCOptimizationDisable:skl,bxt,kbl,glk,cfl */
406         /* WaDisablePartialResolveInVc:skl,bxt,kbl,cfl */
407         wa_masked_en(wal, CACHE_MODE_1,
408                      GEN8_4x4_STC_OPTIMIZATION_DISABLE |
409                      GEN9_PARTIAL_RESOLVE_IN_VC_DISABLE);
410
411         /* WaCcsTlbPrefetchDisable:skl,bxt,kbl,glk,cfl */
412         wa_mcr_masked_dis(wal, GEN9_HALF_SLICE_CHICKEN5,
413                           GEN9_CCS_TLB_PREFETCH_ENABLE);
414
415         /* WaForceContextSaveRestoreNonCoherent:skl,bxt,kbl,cfl */
416         wa_masked_en(wal, HDC_CHICKEN0,
417                      HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT |
418                      HDC_FORCE_CSR_NON_COHERENT_OVR_DISABLE);
419
420         /* WaForceEnableNonCoherent and WaDisableHDCInvalidation are
421          * both tied to WaForceContextSaveRestoreNonCoherent
422          * in some hsds for skl. We keep the tie for all gen9. The
423          * documentation is a bit hazy and so we want to get common behaviour,
424          * even though there is no clear evidence we would need both on kbl/bxt.
425          * This area has been a source of system hangs so we play it safe
426          * and mimic the skl regardless of what bspec says.
427          *
428          * Use Force Non-Coherent whenever executing a 3D context. This
429          * is a workaround for a possible hang in the unlikely event
430          * a TLB invalidation occurs during a PSD flush.
431          */
432
433         /* WaForceEnableNonCoherent:skl,bxt,kbl,cfl */
434         wa_masked_en(wal, HDC_CHICKEN0,
435                      HDC_FORCE_NON_COHERENT);
436
437         /* WaDisableSamplerPowerBypassForSOPingPong:skl,bxt,kbl,cfl */
438         if (IS_SKYLAKE(i915) ||
439             IS_KABYLAKE(i915) ||
440             IS_COFFEELAKE(i915) ||
441             IS_COMETLAKE(i915))
442                 wa_mcr_masked_en(wal, GEN8_HALF_SLICE_CHICKEN3,
443                                  GEN8_SAMPLER_POWER_BYPASS_DIS);
444
445         /* WaDisableSTUnitPowerOptimization:skl,bxt,kbl,glk,cfl */
446         wa_mcr_masked_en(wal, HALF_SLICE_CHICKEN2, GEN8_ST_PO_DISABLE);
447
448         /*
449          * Supporting preemption with fine-granularity requires changes in the
450          * batch buffer programming. Since we can't break old userspace, we
451          * need to set our default preemption level to a safe value. Userspace is
452          * still able to use more fine-grained preemption levels, since in
453          * WaEnablePreemptionGranularityControlByUMD we're whitelisting the
454          * per-ctx register. As such, WaDisable{3D,GPGPU}MidCmdPreemption are
455          * not real HW workarounds, but merely a way to start using preemption
456          * while maintaining old contract with userspace.
457          * while maintaining the old contract with userspace.
458
459         /* WaDisable3DMidCmdPreemption:skl,bxt,glk,cfl,[cnl] */
460         wa_masked_dis(wal, GEN8_CS_CHICKEN1, GEN9_PREEMPT_3D_OBJECT_LEVEL);
461
462         /* WaDisableGPGPUMidCmdPreemption:skl,bxt,glk,cfl,[cnl] */
463         wa_masked_field_set(wal, GEN8_CS_CHICKEN1,
464                             GEN9_PREEMPT_GPGPU_LEVEL_MASK,
465                             GEN9_PREEMPT_GPGPU_COMMAND_LEVEL);
466
467         /* WaClearHIZ_WM_CHICKEN3:bxt,glk */
468         if (IS_GEN9_LP(i915))
469                 wa_masked_en(wal, GEN9_WM_CHICKEN3, GEN9_FACTOR_IN_CLR_VAL_HIZ);
470 }
471
472 static void skl_tune_iz_hashing(struct intel_engine_cs *engine,
473                                 struct i915_wa_list *wal)
474 {
475         struct intel_gt *gt = engine->gt;
476         u8 vals[3] = { 0, 0, 0 };
477         unsigned int i;
478
479         for (i = 0; i < 3; i++) {
480                 u8 ss;
481
482                 /*
483                  * Only consider slices where one, and only one, subslice has 7
484                  * EUs
485                  */
486                 if (!is_power_of_2(gt->info.sseu.subslice_7eu[i]))
487                         continue;
488
489                 /*
490                  * subslice_7eu[i] != 0 (because of the check above) and
491                  * ss_max == 4 (maximum number of subslices possible per slice)
492                  *
493                  * ->    0 <= ss <= 3;
494                  */
495                 ss = ffs(gt->info.sseu.subslice_7eu[i]) - 1;
496                 vals[i] = 3 - ss;
497         }
498
499         if (vals[0] == 0 && vals[1] == 0 && vals[2] == 0)
500                 return;
501
502         /* Tune IZ hashing. See intel_device_info_runtime_init() */
503         wa_masked_field_set(wal, GEN7_GT_MODE,
504                             GEN9_IZ_HASHING_MASK(2) |
505                             GEN9_IZ_HASHING_MASK(1) |
506                             GEN9_IZ_HASHING_MASK(0),
507                             GEN9_IZ_HASHING(2, vals[2]) |
508                             GEN9_IZ_HASHING(1, vals[1]) |
509                             GEN9_IZ_HASHING(0, vals[0]));
510 }
511
512 static void skl_ctx_workarounds_init(struct intel_engine_cs *engine,
513                                      struct i915_wa_list *wal)
514 {
515         gen9_ctx_workarounds_init(engine, wal);
516         skl_tune_iz_hashing(engine, wal);
517 }
518
519 static void bxt_ctx_workarounds_init(struct intel_engine_cs *engine,
520                                      struct i915_wa_list *wal)
521 {
522         gen9_ctx_workarounds_init(engine, wal);
523
524         /* WaDisableThreadStallDopClockGating:bxt */
525         wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN,
526                          STALL_DOP_GATING_DISABLE);
527
528         /* WaToEnableHwFixForPushConstHWBug:bxt */
529         wa_masked_en(wal, COMMON_SLICE_CHICKEN2,
530                      GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
531 }
532
533 static void kbl_ctx_workarounds_init(struct intel_engine_cs *engine,
534                                      struct i915_wa_list *wal)
535 {
536         struct drm_i915_private *i915 = engine->i915;
537
538         gen9_ctx_workarounds_init(engine, wal);
539
540         /* WaToEnableHwFixForPushConstHWBug:kbl */
541         if (IS_KBL_GRAPHICS_STEP(i915, STEP_C0, STEP_FOREVER))
542                 wa_masked_en(wal, COMMON_SLICE_CHICKEN2,
543                              GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
544
545         /* WaDisableSbeCacheDispatchPortSharing:kbl */
546         wa_mcr_masked_en(wal, GEN8_HALF_SLICE_CHICKEN1,
547                          GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE);
548 }
549
550 static void glk_ctx_workarounds_init(struct intel_engine_cs *engine,
551                                      struct i915_wa_list *wal)
552 {
553         gen9_ctx_workarounds_init(engine, wal);
554
555         /* WaToEnableHwFixForPushConstHWBug:glk */
556         wa_masked_en(wal, COMMON_SLICE_CHICKEN2,
557                      GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
558 }
559
560 static void cfl_ctx_workarounds_init(struct intel_engine_cs *engine,
561                                      struct i915_wa_list *wal)
562 {
563         gen9_ctx_workarounds_init(engine, wal);
564
565         /* WaToEnableHwFixForPushConstHWBug:cfl */
566         wa_masked_en(wal, COMMON_SLICE_CHICKEN2,
567                      GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
568
569         /* WaDisableSbeCacheDispatchPortSharing:cfl */
570         wa_mcr_masked_en(wal, GEN8_HALF_SLICE_CHICKEN1,
571                          GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE);
572 }
573
574 static void icl_ctx_workarounds_init(struct intel_engine_cs *engine,
575                                      struct i915_wa_list *wal)
576 {
577         /* Wa_1406697149 (WaDisableBankHangMode:icl) */
578         wa_write(wal,
579                  GEN8_L3CNTLREG,
580                  intel_uncore_read(engine->uncore, GEN8_L3CNTLREG) |
581                  GEN8_ERRDETBCTRL);
582
583         /* WaForceEnableNonCoherent:icl
584          * This is not the same workaround as in early Gen9 platforms, where
585          * lacking this could cause system hangs, but coherency performance
586          * overhead is high and only a few compute workloads really need it
587          * (the register is whitelisted in hardware now, so UMDs can opt in
588          * for coherency if they have a good reason).
589          */
590         wa_mcr_masked_en(wal, ICL_HDC_MODE, HDC_FORCE_NON_COHERENT);
591
592         /* WaEnableFloatBlendOptimization:icl */
593         wa_mcr_add(wal, GEN10_CACHE_MODE_SS, 0,
594                    _MASKED_BIT_ENABLE(FLOAT_BLEND_OPTIMIZATION_ENABLE),
595                    0 /* write-only, so skip validation */,
596                    true);
597
598         /* WaDisableGPGPUMidThreadPreemption:icl */
599         wa_masked_field_set(wal, GEN8_CS_CHICKEN1,
600                             GEN9_PREEMPT_GPGPU_LEVEL_MASK,
601                             GEN9_PREEMPT_GPGPU_THREAD_GROUP_LEVEL);
602
603         /* allow headerless messages for preemptible GPGPU context */
604         wa_mcr_masked_en(wal, GEN10_SAMPLER_MODE,
605                          GEN11_SAMPLER_ENABLE_HEADLESS_MSG);
606
607         /* Wa_1604278689:icl,ehl */
608         wa_write(wal, IVB_FBC_RT_BASE, 0xFFFFFFFF & ~ILK_FBC_RT_VALID);
609         wa_write_clr_set(wal, IVB_FBC_RT_BASE_UPPER,
610                          0, /* write-only register; skip validation */
611                          0xFFFFFFFF);
612
613         /* Wa_1406306137:icl,ehl */
614         wa_mcr_masked_en(wal, GEN9_ROW_CHICKEN4, GEN11_DIS_PICK_2ND_EU);
615 }
616
617 /*
618  * These settings aren't actually workarounds, but general tuning settings that
619  * need to be programmed on the DG2 platform.
620  */
621 static void dg2_ctx_gt_tuning_init(struct intel_engine_cs *engine,
622                                    struct i915_wa_list *wal)
623 {
624         wa_masked_en(wal, CHICKEN_RASTER_2, TBIMR_FAST_CLIP);
625         wa_mcr_write_clr_set(wal, XEHP_L3SQCREG5, L3_PWM_TIMER_INIT_VAL_MASK,
626                              REG_FIELD_PREP(L3_PWM_TIMER_INIT_VAL_MASK, 0x7f));
627         wa_mcr_add(wal,
628                    XEHP_FF_MODE2,
629                    FF_MODE2_TDS_TIMER_MASK,
630                    FF_MODE2_TDS_TIMER_128,
631                    0, false);
632 }
633
634 /*
635  * These settings aren't actually workarounds, but general tuning settings that
636  * need to be programmed on several platforms.
637  */
638 static void gen12_ctx_gt_tuning_init(struct intel_engine_cs *engine,
639                                      struct i915_wa_list *wal)
640 {
641         /*
642          * Although some platforms refer to it as Wa_1604555607, we need to
643          * program it even on those that don't explicitly list that
644          * workaround.
645          *
646          * Note that the programming of this register is further modified
647          * according to the FF_MODE2 guidance given by Wa_1608008084:gen12.
648          * Wa_1608008084 tells us the FF_MODE2 register will return the wrong
649          * value when read. The default value for this register is zero for all
650          * fields and there are no bit masks. So instead of doing a RMW we
651          * should just write the TDS timer value. For the same reason read
652          * verification is ignored.
653          */
654         wa_add(wal,
655                GEN12_FF_MODE2,
656                FF_MODE2_TDS_TIMER_MASK,
657                FF_MODE2_TDS_TIMER_128,
658                0, false);
659 }
660
661 static void gen12_ctx_workarounds_init(struct intel_engine_cs *engine,
662                                        struct i915_wa_list *wal)
663 {
664         struct drm_i915_private *i915 = engine->i915;
665
666         gen12_ctx_gt_tuning_init(engine, wal);
667
668         /*
669          * Wa_1409142259:tgl,dg1,adl-p
670          * Wa_1409347922:tgl,dg1,adl-p
671          * Wa_1409252684:tgl,dg1,adl-p
672          * Wa_1409217633:tgl,dg1,adl-p
673          * Wa_1409207793:tgl,dg1,adl-p
674          * Wa_1409178076:tgl,dg1,adl-p
675          * Wa_1408979724:tgl,dg1,adl-p
676          * Wa_14010443199:tgl,rkl,dg1,adl-p
677          * Wa_14010698770:tgl,rkl,dg1,adl-s,adl-p
678          * Wa_1409342910:tgl,rkl,dg1,adl-s,adl-p
679          */
680         wa_masked_en(wal, GEN11_COMMON_SLICE_CHICKEN3,
681                      GEN12_DISABLE_CPS_AWARE_COLOR_PIPE);
682
683         /* WaDisableGPGPUMidThreadPreemption:gen12 */
684         wa_masked_field_set(wal, GEN8_CS_CHICKEN1,
685                             GEN9_PREEMPT_GPGPU_LEVEL_MASK,
686                             GEN9_PREEMPT_GPGPU_THREAD_GROUP_LEVEL);
687
688         /*
689          * Wa_16011163337
690          *
691          * Like in gen12_ctx_gt_tuning_init(), read verification is ignored due
692          * to Wa_1608008084.
693          */
694         wa_add(wal,
695                GEN12_FF_MODE2,
696                FF_MODE2_GS_TIMER_MASK,
697                FF_MODE2_GS_TIMER_224,
698                0, false);
699
700         if (!IS_DG1(i915))
701                 /* Wa_1806527549 */
702                 wa_masked_en(wal, HIZ_CHICKEN, HZ_DEPTH_TEST_LE_GE_OPT_DISABLE);
703 }
704
705 static void dg1_ctx_workarounds_init(struct intel_engine_cs *engine,
706                                      struct i915_wa_list *wal)
707 {
708         gen12_ctx_workarounds_init(engine, wal);
709
710         /* Wa_1409044764 */
711         wa_masked_dis(wal, GEN11_COMMON_SLICE_CHICKEN3,
712                       DG1_FLOAT_POINT_BLEND_OPT_STRICT_MODE_EN);
713
714         /* Wa_22010493298 */
715         wa_masked_en(wal, HIZ_CHICKEN,
716                      DG1_HZ_READ_SUPPRESSION_OPTIMIZATION_DISABLE);
717 }
718
719 static void dg2_ctx_workarounds_init(struct intel_engine_cs *engine,
720                                      struct i915_wa_list *wal)
721 {
722         dg2_ctx_gt_tuning_init(engine, wal);
723
724         /* Wa_16011186671:dg2_g11 */
725         if (IS_DG2_GRAPHICS_STEP(engine->i915, G11, STEP_A0, STEP_B0)) {
726                 wa_mcr_masked_dis(wal, VFLSKPD, DIS_MULT_MISS_RD_SQUASH);
727                 wa_mcr_masked_en(wal, VFLSKPD, DIS_OVER_FETCH_CACHE);
728         }
729
730         if (IS_DG2_GRAPHICS_STEP(engine->i915, G10, STEP_A0, STEP_B0)) {
731                 /* Wa_14010469329:dg2_g10 */
732                 wa_mcr_masked_en(wal, XEHP_COMMON_SLICE_CHICKEN3,
733                                  XEHP_DUAL_SIMD8_SEQ_MERGE_DISABLE);
734
735                 /*
736                  * Wa_22010465075:dg2_g10
737                  * Wa_22010613112:dg2_g10
738                  * Wa_14010698770:dg2_g10
739                  */
740                 wa_mcr_masked_en(wal, XEHP_COMMON_SLICE_CHICKEN3,
741                                  GEN12_DISABLE_CPS_AWARE_COLOR_PIPE);
742         }
743
744         /* Wa_16013271637:dg2 */
745         wa_mcr_masked_en(wal, XEHP_SLICE_COMMON_ECO_CHICKEN1,
746                          MSC_MSAA_REODER_BUF_BYPASS_DISABLE);
747
748         /* Wa_14014947963:dg2 */
749         if (IS_DG2_GRAPHICS_STEP(engine->i915, G10, STEP_B0, STEP_FOREVER) ||
750                 IS_DG2_G11(engine->i915) || IS_DG2_G12(engine->i915))
751                 wa_masked_field_set(wal, VF_PREEMPTION, PREEMPTION_VERTEX_COUNT, 0x4000);
752
753         /* Wa_15010599737:dg2 */
754         wa_masked_en(wal, CHICKEN_RASTER_1, DIS_SF_ROUND_NEAREST_EVEN);
755 }
756
757 static void fakewa_disable_nestedbb_mode(struct intel_engine_cs *engine,
758                                          struct i915_wa_list *wal)
759 {
760         /*
761          * This is a "fake" workaround defined by software to ensure we
762          * maintain reliable, backward-compatible behavior for userspace with
763          * regards to how nested MI_BATCH_BUFFER_START commands are handled.
764          *
765          * The per-context setting of MI_MODE[12] determines whether the bits
766          * of a nested MI_BATCH_BUFFER_START instruction should be interpreted
767          * in the traditional manner or whether they should instead use a new
768          * tgl+ meaning that breaks backward compatibility, but allows nesting
769          * into 3rd-level batchbuffers.  When this new capability was first
770          * added in TGL, it remained off by default unless a context
771          * intentionally opted in to the new behavior.  However Xe_HPG now
772          * flips this on by default and requires that we explicitly opt out if
773          * we don't want the new behavior.
774          *
775          * From a SW perspective, we want to maintain the backward-compatible
776          * behavior for userspace, so we'll apply a fake workaround to set it
777          * back to the legacy behavior on platforms where the hardware default
778          * is to break compatibility.  At the moment there is no Linux
779          * userspace that utilizes third-level batchbuffers, so this will avoid
780          * userspace needing to make any changes; using the legacy
781          * meaning is the correct thing to do.  If/when we have userspace
782          * consumers that want to utilize third-level batch nesting, we can
783          * provide a context parameter to allow them to opt-in.
784          */
785         wa_masked_dis(wal, RING_MI_MODE(engine->mmio_base), TGL_NESTED_BB_EN);
786 }
787
788 static void gen12_ctx_gt_mocs_init(struct intel_engine_cs *engine,
789                                    struct i915_wa_list *wal)
790 {
791         u8 mocs;
792
793         /*
794          * Some blitter commands do not have a field for MOCS; those
795          * commands will use the MOCS index pointed to by BLIT_CCTL.
796          * The BLIT_CCTL register therefore needs to be programmed to un-cached.
797          */
798         if (engine->class == COPY_ENGINE_CLASS) {
799                 mocs = engine->gt->mocs.uc_index;
800                 wa_write_clr_set(wal,
801                                  BLIT_CCTL(engine->mmio_base),
802                                  BLIT_CCTL_MASK,
803                                  BLIT_CCTL_MOCS(mocs, mocs));
804         }
805 }
806
807 /*
808  * gen12_ctx_gt_fake_wa_init() doesn't program official workarounds
809  * defined by the hardware team, but rather general context registers.
810  * Adding this context register programming to the context workaround list
811  * allows us to use the wa framework for proper application and validation.
812  */
813 static void
814 gen12_ctx_gt_fake_wa_init(struct intel_engine_cs *engine,
815                           struct i915_wa_list *wal)
816 {
817         if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
818                 fakewa_disable_nestedbb_mode(engine, wal);
819
820         gen12_ctx_gt_mocs_init(engine, wal);
821 }
822
823 static void
824 __intel_engine_init_ctx_wa(struct intel_engine_cs *engine,
825                            struct i915_wa_list *wal,
826                            const char *name)
827 {
828         struct drm_i915_private *i915 = engine->i915;
829
830         wa_init_start(wal, name, engine->name);
831
832         /* Applies to all engines */
833         /*
834          * Fake workarounds are not actual hardware workarounds, but
835          * programming of context registers using the workaround framework.
836          */
837         if (GRAPHICS_VER(i915) >= 12)
838                 gen12_ctx_gt_fake_wa_init(engine, wal);
839
840         if (engine->class != RENDER_CLASS)
841                 goto done;
842
843         if (IS_PONTEVECCHIO(i915))
844                 ; /* noop; none at this time */
845         else if (IS_DG2(i915))
846                 dg2_ctx_workarounds_init(engine, wal);
847         else if (IS_XEHPSDV(i915))
848                 ; /* noop; none at this time */
849         else if (IS_DG1(i915))
850                 dg1_ctx_workarounds_init(engine, wal);
851         else if (GRAPHICS_VER(i915) == 12)
852                 gen12_ctx_workarounds_init(engine, wal);
853         else if (GRAPHICS_VER(i915) == 11)
854                 icl_ctx_workarounds_init(engine, wal);
855         else if (IS_COFFEELAKE(i915) || IS_COMETLAKE(i915))
856                 cfl_ctx_workarounds_init(engine, wal);
857         else if (IS_GEMINILAKE(i915))
858                 glk_ctx_workarounds_init(engine, wal);
859         else if (IS_KABYLAKE(i915))
860                 kbl_ctx_workarounds_init(engine, wal);
861         else if (IS_BROXTON(i915))
862                 bxt_ctx_workarounds_init(engine, wal);
863         else if (IS_SKYLAKE(i915))
864                 skl_ctx_workarounds_init(engine, wal);
865         else if (IS_CHERRYVIEW(i915))
866                 chv_ctx_workarounds_init(engine, wal);
867         else if (IS_BROADWELL(i915))
868                 bdw_ctx_workarounds_init(engine, wal);
869         else if (GRAPHICS_VER(i915) == 7)
870                 gen7_ctx_workarounds_init(engine, wal);
871         else if (GRAPHICS_VER(i915) == 6)
872                 gen6_ctx_workarounds_init(engine, wal);
873         else if (GRAPHICS_VER(i915) < 8)
874                 ;
875         else
876                 MISSING_CASE(GRAPHICS_VER(i915));
877
878 done:
879         wa_init_finish(wal);
880 }
881
882 void intel_engine_init_ctx_wa(struct intel_engine_cs *engine)
883 {
884         __intel_engine_init_ctx_wa(engine, &engine->ctx_wa_list, "context");
885 }
886
887 int intel_engine_emit_ctx_wa(struct i915_request *rq)
888 {
889         struct i915_wa_list *wal = &rq->engine->ctx_wa_list;
890         struct i915_wa *wa;
891         unsigned int i;
892         u32 *cs;
893         int ret;
894
895         if (wal->count == 0)
896                 return 0;
897
898         ret = rq->engine->emit_flush(rq, EMIT_BARRIER);
899         if (ret)
900                 return ret;
901
902         cs = intel_ring_begin(rq, (wal->count * 2 + 2));
903         if (IS_ERR(cs))
904                 return PTR_ERR(cs);
905
906         *cs++ = MI_LOAD_REGISTER_IMM(wal->count);
907         for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
908                 *cs++ = i915_mmio_reg_offset(wa->reg);
909                 *cs++ = wa->set;
910         }
911         *cs++ = MI_NOOP;
912
913         intel_ring_advance(rq, cs);
914
915         ret = rq->engine->emit_flush(rq, EMIT_BARRIER);
916         if (ret)
917                 return ret;
918
919         return 0;
920 }
921
922 static void
923 gen4_gt_workarounds_init(struct intel_gt *gt,
924                          struct i915_wa_list *wal)
925 {
926         /* WaDisable_RenderCache_OperationalFlush:gen4,ilk */
927         wa_masked_dis(wal, CACHE_MODE_0, RC_OP_FLUSH_ENABLE);
928 }
929
930 static void
931 g4x_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
932 {
933         gen4_gt_workarounds_init(gt, wal);
934
935         /* WaDisableRenderCachePipelinedFlush:g4x,ilk */
936         wa_masked_en(wal, CACHE_MODE_0, CM0_PIPELINED_RENDER_FLUSH_DISABLE);
937 }
938
939 static void
940 ilk_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
941 {
942         g4x_gt_workarounds_init(gt, wal);
943
944         wa_masked_en(wal, _3D_CHICKEN2, _3D_CHICKEN2_WM_READ_PIPELINED);
945 }
946
947 static void
948 snb_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
949 {
950 }
951
952 static void
953 ivb_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
954 {
955         /* Apply the WaDisableRHWOOptimizationForRenderHang:ivb workaround. */
956         wa_masked_dis(wal,
957                       GEN7_COMMON_SLICE_CHICKEN1,
958                       GEN7_CSC1_RHWO_OPT_DISABLE_IN_RCC);
959
960         /* WaApplyL3ControlAndL3ChickenMode:ivb */
961         wa_write(wal, GEN7_L3CNTLREG1, GEN7_WA_FOR_GEN7_L3_CONTROL);
962         wa_write(wal, GEN7_L3_CHICKEN_MODE_REGISTER, GEN7_WA_L3_CHICKEN_MODE);
963
964         /* WaForceL3Serialization:ivb */
965         wa_write_clr(wal, GEN7_L3SQCREG4, L3SQ_URB_READ_CAM_MATCH_DISABLE);
966 }
967
968 static void
969 vlv_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
970 {
971         /* WaForceL3Serialization:vlv */
972         wa_write_clr(wal, GEN7_L3SQCREG4, L3SQ_URB_READ_CAM_MATCH_DISABLE);
973
974         /*
975          * WaIncreaseL3CreditsForVLVB0:vlv
976          * This is the hardware default actually.
977          */
978         wa_write(wal, GEN7_L3SQCREG1, VLV_B0_WA_L3SQCREG1_VALUE);
979 }
980
981 static void
982 hsw_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
983 {
984         /* L3 caching of data atomics doesn't work -- disable it. */
985         wa_write(wal, HSW_SCRATCH1, HSW_SCRATCH1_L3_DATA_ATOMICS_DISABLE);
986
987         wa_add(wal,
988                HSW_ROW_CHICKEN3, 0,
989                _MASKED_BIT_ENABLE(HSW_ROW_CHICKEN3_L3_GLOBAL_ATOMICS_DISABLE),
990                0 /* XXX does this reg exist? */, true);
991
992         /* WaVSRefCountFullforceMissDisable:hsw */
993         wa_write_clr(wal, GEN7_FF_THREAD_MODE, GEN7_FF_VS_REF_CNT_FFME);
994 }
995
996 static void
997 gen9_wa_init_mcr(struct drm_i915_private *i915, struct i915_wa_list *wal)
998 {
999         const struct sseu_dev_info *sseu = &to_gt(i915)->info.sseu;
1000         unsigned int slice, subslice;
1001         u32 mcr, mcr_mask;
1002
1003         GEM_BUG_ON(GRAPHICS_VER(i915) != 9);
1004
1005         /*
1006          * WaProgramMgsrForCorrectSliceSpecificMmioReads:gen9,glk,kbl,cml
1007          * Before any MMIO read into slice/subslice specific registers, MCR
1008          * packet control register needs to be programmed to point to any
1009          * enabled s/ss pair. Otherwise, incorrect values will be returned.
1010          * This means each subsequent MMIO read will be forwarded to a
1011          * specific s/ss combination, but this is OK since these registers
1012          * are consistent across s/ss in almost all cases. On the rare
1013          * occasions, such as INSTDONE, where this value is dependent
1014          * on s/ss combo, the read should be done with read_subslice_reg.
1015          */
1016         slice = ffs(sseu->slice_mask) - 1;
1017         GEM_BUG_ON(slice >= ARRAY_SIZE(sseu->subslice_mask.hsw));
1018         subslice = ffs(intel_sseu_get_hsw_subslices(sseu, slice));
1019         GEM_BUG_ON(!subslice);
1020         subslice--;
1021
1022         /*
1023          * We use GEN8_MCR..() macros to calculate the |mcr| value for
1024          * Gen9 to address WaProgramMgsrForCorrectSliceSpecificMmioReads
1025          */
1026         mcr = GEN8_MCR_SLICE(slice) | GEN8_MCR_SUBSLICE(subslice);
1027         mcr_mask = GEN8_MCR_SLICE_MASK | GEN8_MCR_SUBSLICE_MASK;
1028
1029         drm_dbg(&i915->drm, "MCR slice:%d/subslice:%d = %x\n", slice, subslice, mcr);
1030
1031         wa_write_clr_set(wal, GEN8_MCR_SELECTOR, mcr_mask, mcr);
1032 }
1033
1034 static void
1035 gen9_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1036 {
1037         struct drm_i915_private *i915 = gt->i915;
1038
1039         /* WaProgramMgsrForCorrectSliceSpecificMmioReads:glk,kbl,cml,gen9 */
1040         gen9_wa_init_mcr(i915, wal);
1041
1042         /* WaDisableKillLogic:bxt,skl,kbl */
1043         if (!IS_COFFEELAKE(i915) && !IS_COMETLAKE(i915))
1044                 wa_write_or(wal,
1045                             GAM_ECOCHK,
1046                             ECOCHK_DIS_TLB);
1047
1048         if (HAS_LLC(i915)) {
1049                 /* WaCompressedResourceSamplerPbeMediaNewHashMode:skl,kbl
1050                  *
1051                  * Must match Display Engine. See
1052                  * WaCompressedResourceDisplayNewHashMode.
1053                  */
1054                 wa_write_or(wal,
1055                             MMCD_MISC_CTRL,
1056                             MMCD_PCLA | MMCD_HOTSPOT_EN);
1057         }
1058
1059         /* WaDisableHDCInvalidation:skl,bxt,kbl,cfl */
1060         wa_write_or(wal,
1061                     GAM_ECOCHK,
1062                     BDW_DISABLE_HDC_INVALIDATION);
1063 }
1064
1065 static void
1066 skl_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1067 {
1068         gen9_gt_workarounds_init(gt, wal);
1069
1070         /* WaDisableGafsUnitClkGating:skl */
1071         wa_write_or(wal,
1072                     GEN7_UCGCTL4,
1073                     GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE);
1074
1075         /* WaInPlaceDecompressionHang:skl */
1076         if (IS_SKL_GRAPHICS_STEP(gt->i915, STEP_A0, STEP_H0))
1077                 wa_write_or(wal,
1078                             GEN9_GAMT_ECO_REG_RW_IA,
1079                             GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS);
1080 }
1081
1082 static void
1083 kbl_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1084 {
1085         gen9_gt_workarounds_init(gt, wal);
1086
1087         /* WaDisableDynamicCreditSharing:kbl */
1088         if (IS_KBL_GRAPHICS_STEP(gt->i915, 0, STEP_C0))
1089                 wa_write_or(wal,
1090                             GAMT_CHKN_BIT_REG,
1091                             GAMT_CHKN_DISABLE_DYNAMIC_CREDIT_SHARING);
1092
1093         /* WaDisableGafsUnitClkGating:kbl */
1094         wa_write_or(wal,
1095                     GEN7_UCGCTL4,
1096                     GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE);
1097
1098         /* WaInPlaceDecompressionHang:kbl */
1099         wa_write_or(wal,
1100                     GEN9_GAMT_ECO_REG_RW_IA,
1101                     GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS);
1102 }
1103
1104 static void
1105 glk_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1106 {
1107         gen9_gt_workarounds_init(gt, wal);
1108 }
1109
1110 static void
1111 cfl_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1112 {
1113         gen9_gt_workarounds_init(gt, wal);
1114
1115         /* WaDisableGafsUnitClkGating:cfl */
1116         wa_write_or(wal,
1117                     GEN7_UCGCTL4,
1118                     GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE);
1119
1120         /* WaInPlaceDecompressionHang:cfl */
1121         wa_write_or(wal,
1122                     GEN9_GAMT_ECO_REG_RW_IA,
1123                     GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS);
1124 }
1125
1126 static void __set_mcr_steering(struct i915_wa_list *wal,
1127                                i915_reg_t steering_reg,
1128                                unsigned int slice, unsigned int subslice)
1129 {
1130         u32 mcr, mcr_mask;
1131
1132         mcr = GEN11_MCR_SLICE(slice) | GEN11_MCR_SUBSLICE(subslice);
1133         mcr_mask = GEN11_MCR_SLICE_MASK | GEN11_MCR_SUBSLICE_MASK;
1134
1135         wa_write_clr_set(wal, steering_reg, mcr_mask, mcr);
1136 }
1137
1138 static void debug_dump_steering(struct intel_gt *gt)
1139 {
1140         struct drm_printer p = drm_debug_printer("MCR Steering:");
1141
1142         if (drm_debug_enabled(DRM_UT_DRIVER))
1143                 intel_gt_mcr_report_steering(&p, gt, false);
1144 }
1145
1146 static void __add_mcr_wa(struct intel_gt *gt, struct i915_wa_list *wal,
1147                          unsigned int slice, unsigned int subslice)
1148 {
1149         __set_mcr_steering(wal, GEN8_MCR_SELECTOR, slice, subslice);
1150
1151         gt->default_steering.groupid = slice;
1152         gt->default_steering.instanceid = subslice;
1153
1154         debug_dump_steering(gt);
1155 }
1156
1157 static void
1158 icl_wa_init_mcr(struct intel_gt *gt, struct i915_wa_list *wal)
1159 {
1160         const struct sseu_dev_info *sseu = &gt->info.sseu;
1161         unsigned int subslice;
1162
1163         GEM_BUG_ON(GRAPHICS_VER(gt->i915) < 11);
1164         GEM_BUG_ON(hweight8(sseu->slice_mask) > 1);
1165
1166         /*
1167          * Although a platform may have subslices, we need to always steer
1168          * reads to the lowest instance that isn't fused off.  When Render
1169          * Power Gating is enabled, grabbing forcewake will only power up a
1170          * single subslice (the "minconfig") if there isn't a real workload
1171          * that needs to be run; this means that if we steer register reads to
1172          * one of the higher subslices, we run the risk of reading back 0's or
1173          * random garbage.
1174          */
1175         subslice = __ffs(intel_sseu_get_hsw_subslices(sseu, 0));
1176
1177         /*
1178          * If the subslice we picked above also steers us to a valid L3 bank,
1179          * then we can just rely on the default steering and won't need to
1180          * worry about explicitly re-steering L3BANK reads later.
1181          */
1182         if (gt->info.l3bank_mask & BIT(subslice))
1183                 gt->steering_table[L3BANK] = NULL;
1184
1185         __add_mcr_wa(gt, wal, 0, subslice);
1186 }
1187
1188 static void
1189 xehp_init_mcr(struct intel_gt *gt, struct i915_wa_list *wal)
1190 {
1191         const struct sseu_dev_info *sseu = &gt->info.sseu;
1192         unsigned long slice, subslice = 0, slice_mask = 0;
1193         u32 lncf_mask = 0;
1194         int i;
1195
1196         /*
1197          * On Xe_HP the steering increases in complexity. There are now several
1198          * more units that require steering and we're not guaranteed to be able
1199          * to find a common setting for all of them. These are:
1200          * - GSLICE (fusable)
1201          * - DSS (sub-unit within gslice; fusable)
1202          * - L3 Bank (fusable)
1203          * - MSLICE (fusable)
1204          * - LNCF (sub-unit within mslice; always present if mslice is present)
1205          *
1206          * We'll do our default/implicit steering based on GSLICE (in the
1207          * sliceid field) and DSS (in the subsliceid field).  If we can
1208          * find overlap between the valid MSLICE and/or LNCF values with
1209          * a suitable GSLICE, then we can just re-use the default value and
1210          * skip any explicit steering at runtime.
1211          *
1212          * We only need to look for overlap between GSLICE/MSLICE/LNCF to find
1213          * a valid sliceid value.  DSS steering is the only type of steering
1214          * that utilizes the 'subsliceid' bits.
1215          *
1216          * Also note that, even though the steering domain is called "GSlice"
1217          * and it is encoded in the register using the gslice format, the spec
1218          * says that the combined (geometry | compute) fuse should be used to
1219          * select the steering.
1220          */
1221
1222         /* Find the potential gslice candidates */
1223         slice_mask = intel_slicemask_from_xehp_dssmask(sseu->subslice_mask,
1224                                                        GEN_DSS_PER_GSLICE);
1225
1226         /*
1227          * Find the potential LNCF candidates.  Either LNCF within a valid
1228          * mslice is fine.
1229          */
1230         for_each_set_bit(i, &gt->info.mslice_mask, GEN12_MAX_MSLICES)
1231                 lncf_mask |= (0x3 << (i * 2));
1232
1233         /*
1234          * Are there any sliceid values that work for both GSLICE and LNCF
1235          * steering?
1236          */
1237         if (slice_mask & lncf_mask) {
1238                 slice_mask &= lncf_mask;
1239                 gt->steering_table[LNCF] = NULL;
1240         }
1241
1242         /* How about sliceid values that also work for MSLICE steering? */
1243         if (slice_mask & gt->info.mslice_mask) {
1244                 slice_mask &= gt->info.mslice_mask;
1245                 gt->steering_table[MSLICE] = NULL;
1246         }
1247
1248         if (IS_XEHPSDV(gt->i915) && slice_mask & BIT(0))
1249                 gt->steering_table[GAM] = NULL;
1250
1251         slice = __ffs(slice_mask);
1252         subslice = intel_sseu_find_first_xehp_dss(sseu, GEN_DSS_PER_GSLICE, slice) %
1253                 GEN_DSS_PER_GSLICE;
1254
1255         __add_mcr_wa(gt, wal, slice, subslice);
1256
1257         /*
1258          * SQIDI ranges are special because they use different steering
1259          * registers than everything else we work with.  On XeHP SDV and
1260          * DG2-G10, any value in the steering registers will work fine since
1261          * all instances are present, but DG2-G11 only has SQIDI instances at
1262          * IDs 2 and 3, so we need to steer to one of those.  For simplicity
1263          * we'll just steer to a hardcoded "2" since that value will work
1264          * everywhere.
1265          */
1266         __set_mcr_steering(wal, MCFG_MCR_SELECTOR, 0, 2);
1267         __set_mcr_steering(wal, SF_MCR_SELECTOR, 0, 2);
1268
1269         /*
1270          * On DG2, GAM registers have a dedicated steering control register
1271          * and must always be programmed to a hardcoded groupid of "1."
1272          */
1273         if (IS_DG2(gt->i915))
1274                 __set_mcr_steering(wal, GAM_MCR_SELECTOR, 1, 0);
1275 }
1276
1277 static void
1278 pvc_init_mcr(struct intel_gt *gt, struct i915_wa_list *wal)
1279 {
1280         unsigned int dss;
1281
1282         /*
1283          * Setup implicit steering for COMPUTE and DSS ranges to the first
1284          * non-fused-off DSS.  All other types of MCR registers will be
1285          * explicitly steered.
1286          */
1287         dss = intel_sseu_find_first_xehp_dss(&gt->info.sseu, 0, 0);
1288         __add_mcr_wa(gt, wal, dss / GEN_DSS_PER_CSLICE, dss % GEN_DSS_PER_CSLICE);
1289 }
1290
1291 static void
1292 icl_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1293 {
1294         struct drm_i915_private *i915 = gt->i915;
1295
1296         icl_wa_init_mcr(gt, wal);
1297
1298         /* WaModifyGamTlbPartitioning:icl */
1299         wa_write_clr_set(wal,
1300                          GEN11_GACB_PERF_CTRL,
1301                          GEN11_HASH_CTRL_MASK,
1302                          GEN11_HASH_CTRL_BIT0 | GEN11_HASH_CTRL_BIT4);
1303
1304         /* Wa_1405766107:icl
1305          * Formerly known as WaCL2SFHalfMaxAlloc
1306          */
1307         wa_write_or(wal,
1308                     GEN11_LSN_UNSLCVC,
1309                     GEN11_LSN_UNSLCVC_GAFS_HALF_SF_MAXALLOC |
1310                     GEN11_LSN_UNSLCVC_GAFS_HALF_CL2_MAXALLOC);
1311
1312         /* Wa_220166154:icl
1313          * Formerly known as WaDisCtxReload
1314          */
1315         wa_write_or(wal,
1316                     GEN8_GAMW_ECO_DEV_RW_IA,
1317                     GAMW_ECO_DEV_CTX_RELOAD_DISABLE);
1318
1319         /* Wa_1406463099:icl
1320          * Formerly known as WaGamTlbPendError
1321          */
1322         wa_write_or(wal,
1323                     GAMT_CHKN_BIT_REG,
1324                     GAMT_CHKN_DISABLE_L3_COH_PIPE);
1325
1326         /* Wa_1407352427:icl,ehl */
1327         wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE2,
1328                     PSDUNIT_CLKGATE_DIS);
1329
1330         /* Wa_1406680159:icl,ehl */
1331         wa_mcr_write_or(wal,
1332                         GEN11_SUBSLICE_UNIT_LEVEL_CLKGATE,
1333                         GWUNIT_CLKGATE_DIS);
1334
1335         /* Wa_1607087056:icl,ehl,jsl */
1336         if (IS_ICELAKE(i915) ||
1337             IS_JSL_EHL_GRAPHICS_STEP(i915, STEP_A0, STEP_B0))
1338                 wa_write_or(wal,
1339                             GEN11_SLICE_UNIT_LEVEL_CLKGATE,
1340                             L3_CLKGATE_DIS | L3_CR2X_CLKGATE_DIS);
1341
1342         /*
1343          * This is not a documented workaround, but rather an optimization
1344          * to reduce sampler power.
1345          */
1346         wa_mcr_write_clr(wal, GEN10_DFR_RATIO_EN_AND_CHICKEN, DFR_DISABLE);
1347 }
1348
1349 /*
1350  * Though there are per-engine instances of these registers,
1351  * they retain their value through engine resets and should
1352  * only be provided on the GT workaround list rather than
1353  * the engine-specific workaround list.
1354  */
1355 static void
1356 wa_14011060649(struct intel_gt *gt, struct i915_wa_list *wal)
1357 {
1358         struct intel_engine_cs *engine;
1359         int id;
1360
1361         for_each_engine(engine, gt, id) {
1362                 if (engine->class != VIDEO_DECODE_CLASS ||
1363                     (engine->instance % 2))
1364                         continue;
1365
1366                 wa_write_or(wal, VDBOX_CGCTL3F10(engine->mmio_base),
1367                             IECPUNIT_CLKGATE_DIS);
1368         }
1369 }
1370
1371 static void
1372 gen12_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1373 {
1374         icl_wa_init_mcr(gt, wal);
1375
1376         /* Wa_14011060649:tgl,rkl,dg1,adl-s,adl-p */
1377         wa_14011060649(gt, wal);
1378
1379         /* Wa_14011059788:tgl,rkl,adl-s,dg1,adl-p */
1380         wa_mcr_write_or(wal, GEN10_DFR_RATIO_EN_AND_CHICKEN, DFR_DISABLE);
1381 }
1382
1383 static void
1384 tgl_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1385 {
1386         struct drm_i915_private *i915 = gt->i915;
1387
1388         gen12_gt_workarounds_init(gt, wal);
1389
1390         /* Wa_1409420604:tgl */
1391         if (IS_TGL_UY_GRAPHICS_STEP(i915, STEP_A0, STEP_B0))
1392                 wa_mcr_write_or(wal,
1393                                 SUBSLICE_UNIT_LEVEL_CLKGATE2,
1394                                 CPSSUNIT_CLKGATE_DIS);
1395
1396         /* Wa_1607087056:tgl also known as BUG:1409180338 */
1397         if (IS_TGL_UY_GRAPHICS_STEP(i915, STEP_A0, STEP_B0))
1398                 wa_write_or(wal,
1399                             GEN11_SLICE_UNIT_LEVEL_CLKGATE,
1400                             L3_CLKGATE_DIS | L3_CR2X_CLKGATE_DIS);
1401
1402         /* Wa_1408615072:tgl[a0] */
1403         if (IS_TGL_UY_GRAPHICS_STEP(i915, STEP_A0, STEP_B0))
1404                 wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE2,
1405                             VSUNIT_CLKGATE_DIS_TGL);
1406 }
1407
1408 static void
1409 dg1_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1410 {
1411         struct drm_i915_private *i915 = gt->i915;
1412
1413         gen12_gt_workarounds_init(gt, wal);
1414
1415         /* Wa_1607087056:dg1 */
1416         if (IS_DG1_GRAPHICS_STEP(i915, STEP_A0, STEP_B0))
1417                 wa_write_or(wal,
1418                             GEN11_SLICE_UNIT_LEVEL_CLKGATE,
1419                             L3_CLKGATE_DIS | L3_CR2X_CLKGATE_DIS);
1420
1421         /* Wa_1409420604:dg1 */
1422         if (IS_DG1(i915))
1423                 wa_mcr_write_or(wal,
1424                                 SUBSLICE_UNIT_LEVEL_CLKGATE2,
1425                                 CPSSUNIT_CLKGATE_DIS);
1426
1427         /* Wa_1408615072:dg1 */
1428         /* Empirical testing shows this register is unaffected by engine reset. */
1429         if (IS_DG1(i915))
1430                 wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE2,
1431                             VSUNIT_CLKGATE_DIS_TGL);
1432 }
1433
1434 static void
1435 xehpsdv_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1436 {
1437         struct drm_i915_private *i915 = gt->i915;
1438
1439         xehp_init_mcr(gt, wal);
1440
1441         /* Wa_1409757795:xehpsdv */
1442         wa_mcr_write_or(wal, SCCGCTL94DC, CG3DDISURB);
1443
1444         /* Wa_16011155590:xehpsdv */
1445         if (IS_XEHPSDV_GRAPHICS_STEP(i915, STEP_A0, STEP_B0))
1446                 wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE,
1447                             TSGUNIT_CLKGATE_DIS);
1448
1449         /* Wa_14011780169:xehpsdv */
1450         if (IS_XEHPSDV_GRAPHICS_STEP(i915, STEP_B0, STEP_FOREVER)) {
1451                 wa_write_or(wal, UNSLCGCTL9440, GAMTLBOACS_CLKGATE_DIS |
1452                             GAMTLBVDBOX7_CLKGATE_DIS |
1453                             GAMTLBVDBOX6_CLKGATE_DIS |
1454                             GAMTLBVDBOX5_CLKGATE_DIS |
1455                             GAMTLBVDBOX4_CLKGATE_DIS |
1456                             GAMTLBVDBOX3_CLKGATE_DIS |
1457                             GAMTLBVDBOX2_CLKGATE_DIS |
1458                             GAMTLBVDBOX1_CLKGATE_DIS |
1459                             GAMTLBVDBOX0_CLKGATE_DIS |
1460                             GAMTLBKCR_CLKGATE_DIS |
1461                             GAMTLBGUC_CLKGATE_DIS |
1462                             GAMTLBBLT_CLKGATE_DIS);
1463                 wa_write_or(wal, UNSLCGCTL9444, GAMTLBGFXA0_CLKGATE_DIS |
1464                             GAMTLBGFXA1_CLKGATE_DIS |
1465                             GAMTLBCOMPA0_CLKGATE_DIS |
1466                             GAMTLBCOMPA1_CLKGATE_DIS |
1467                             GAMTLBCOMPB0_CLKGATE_DIS |
1468                             GAMTLBCOMPB1_CLKGATE_DIS |
1469                             GAMTLBCOMPC0_CLKGATE_DIS |
1470                             GAMTLBCOMPC1_CLKGATE_DIS |
1471                             GAMTLBCOMPD0_CLKGATE_DIS |
1472                             GAMTLBCOMPD1_CLKGATE_DIS |
1473                             GAMTLBMERT_CLKGATE_DIS   |
1474                             GAMTLBVEBOX3_CLKGATE_DIS |
1475                             GAMTLBVEBOX2_CLKGATE_DIS |
1476                             GAMTLBVEBOX1_CLKGATE_DIS |
1477                             GAMTLBVEBOX0_CLKGATE_DIS);
1478         }
1479
1480         /* Wa_16012725990:xehpsdv */
1481         if (IS_XEHPSDV_GRAPHICS_STEP(i915, STEP_A1, STEP_FOREVER))
1482                 wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE, VFUNIT_CLKGATE_DIS);
1483
1484         /* Wa_14011060649:xehpsdv */
1485         wa_14011060649(gt, wal);
1486 }
1487
1488 static void
1489 dg2_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1490 {
1491         struct intel_engine_cs *engine;
1492         int id;
1493
1494         xehp_init_mcr(gt, wal);
1495
1496         /* Wa_14011060649:dg2 */
1497         wa_14011060649(gt, wal);
1498
1499         /*
1500          * Although there are per-engine instances of these registers,
1501          * they technically exist outside the engine itself and are not
1502          * impacted by engine resets.  Furthermore, they're part of the
1503          * GuC blacklist so trying to treat them as engine workarounds
1504          * will result in GuC initialization failure and a wedged GPU.
1505          */
1506         for_each_engine(engine, gt, id) {
1507                 if (engine->class != VIDEO_DECODE_CLASS)
1508                         continue;
1509
1510                 /* Wa_16010515920:dg2_g10 */
1511                 if (IS_DG2_GRAPHICS_STEP(gt->i915, G10, STEP_A0, STEP_B0))
1512                         wa_write_or(wal, VDBOX_CGCTL3F18(engine->mmio_base),
1513                                     ALNUNIT_CLKGATE_DIS);
1514         }
1515
1516         if (IS_DG2_G10(gt->i915)) {
1517                 /* Wa_22010523718:dg2 */
1518                 wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE,
1519                             CG3DDISCFEG_CLKGATE_DIS);
1520
1521                 /* Wa_14011006942:dg2 */
1522                 wa_mcr_write_or(wal, GEN11_SUBSLICE_UNIT_LEVEL_CLKGATE,
1523                                 DSS_ROUTER_CLKGATE_DIS);
1524         }
1525
1526         if (IS_DG2_GRAPHICS_STEP(gt->i915, G10, STEP_A0, STEP_B0)) {
1527                 /* Wa_14010948348:dg2_g10 */
1528                 wa_write_or(wal, UNSLCGCTL9430, MSQDUNIT_CLKGATE_DIS);
1529
1530                 /* Wa_14011037102:dg2_g10 */
1531                 wa_write_or(wal, UNSLCGCTL9444, LTCDD_CLKGATE_DIS);
1532
1533                 /* Wa_14011371254:dg2_g10 */
1534                 wa_mcr_write_or(wal, XEHP_SLICE_UNIT_LEVEL_CLKGATE, NODEDSS_CLKGATE_DIS);
1535
1536                 /* Wa_14011431319:dg2_g10 */
1537                 wa_write_or(wal, UNSLCGCTL9440, GAMTLBOACS_CLKGATE_DIS |
1538                             GAMTLBVDBOX7_CLKGATE_DIS |
1539                             GAMTLBVDBOX6_CLKGATE_DIS |
1540                             GAMTLBVDBOX5_CLKGATE_DIS |
1541                             GAMTLBVDBOX4_CLKGATE_DIS |
1542                             GAMTLBVDBOX3_CLKGATE_DIS |
1543                             GAMTLBVDBOX2_CLKGATE_DIS |
1544                             GAMTLBVDBOX1_CLKGATE_DIS |
1545                             GAMTLBVDBOX0_CLKGATE_DIS |
1546                             GAMTLBKCR_CLKGATE_DIS |
1547                             GAMTLBGUC_CLKGATE_DIS |
1548                             GAMTLBBLT_CLKGATE_DIS);
1549                 wa_write_or(wal, UNSLCGCTL9444, GAMTLBGFXA0_CLKGATE_DIS |
1550                             GAMTLBGFXA1_CLKGATE_DIS |
1551                             GAMTLBCOMPA0_CLKGATE_DIS |
1552                             GAMTLBCOMPA1_CLKGATE_DIS |
1553                             GAMTLBCOMPB0_CLKGATE_DIS |
1554                             GAMTLBCOMPB1_CLKGATE_DIS |
1555                             GAMTLBCOMPC0_CLKGATE_DIS |
1556                             GAMTLBCOMPC1_CLKGATE_DIS |
1557                             GAMTLBCOMPD0_CLKGATE_DIS |
1558                             GAMTLBCOMPD1_CLKGATE_DIS |
1559                             GAMTLBMERT_CLKGATE_DIS   |
1560                             GAMTLBVEBOX3_CLKGATE_DIS |
1561                             GAMTLBVEBOX2_CLKGATE_DIS |
1562                             GAMTLBVEBOX1_CLKGATE_DIS |
1563                             GAMTLBVEBOX0_CLKGATE_DIS);
1564
1565                 /* Wa_14010569222:dg2_g10 */
1566                 wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE,
1567                             GAMEDIA_CLKGATE_DIS);
1568
1569                 /* Wa_14011028019:dg2_g10 */
1570                 wa_mcr_write_or(wal, SSMCGCTL9530, RTFUNIT_CLKGATE_DIS);
1571         }
1572
1573         /* Wa_14014830051:dg2 */
1574         wa_mcr_write_clr(wal, SARB_CHICKEN1, COMP_CKN_IN);
1575
1576         /*
1577          * The following are not actually "workarounds" but rather
1578          * recommended tuning settings documented in the bspec's
1579          * performance guide section.
1580          */
1581         wa_mcr_write_or(wal, XEHP_SQCM, EN_32B_ACCESS);
1582
1583         /* Wa_14015795083 */
1584         wa_mcr_write_clr(wal, GEN8_MISCCPCTL, GEN12_DOP_CLOCK_GATE_RENDER_ENABLE);
1585 }
1586
1587 static void
1588 pvc_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1589 {
1590         pvc_init_mcr(gt, wal);
1591
1592         /* Wa_14015795083 */
1593         wa_mcr_write_clr(wal, GEN8_MISCCPCTL, GEN12_DOP_CLOCK_GATE_RENDER_ENABLE);
1594 }
1595
1596 static void
1597 xelpg_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1598 {
1599         /* FIXME: Actual workarounds will be added in future patch(es) */
1600
1601         /*
1602          * Unlike older platforms, we no longer set up implicit steering here;
1603          * all MCR accesses are explicitly steered.
1604          */
1605         debug_dump_steering(gt);
1606 }
1607
1608 static void
1609 xelpmp_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1610 {
1611         /* FIXME: Actual workarounds will be added in future patch(es) */
1612
1613         debug_dump_steering(gt);
1614 }
1615
1616 static void
1617 gt_init_workarounds(struct intel_gt *gt, struct i915_wa_list *wal)
1618 {
1619         struct drm_i915_private *i915 = gt->i915;
1620
1621         if (gt->type == GT_MEDIA) {
1622                 if (MEDIA_VER(i915) >= 13)
1623                         xelpmp_gt_workarounds_init(gt, wal);
1624                 else
1625                         MISSING_CASE(MEDIA_VER(i915));
1626
1627                 return;
1628         }
1629
1630         if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 70))
1631                 xelpg_gt_workarounds_init(gt, wal);
1632         else if (IS_PONTEVECCHIO(i915))
1633                 pvc_gt_workarounds_init(gt, wal);
1634         else if (IS_DG2(i915))
1635                 dg2_gt_workarounds_init(gt, wal);
1636         else if (IS_XEHPSDV(i915))
1637                 xehpsdv_gt_workarounds_init(gt, wal);
1638         else if (IS_DG1(i915))
1639                 dg1_gt_workarounds_init(gt, wal);
1640         else if (IS_TIGERLAKE(i915))
1641                 tgl_gt_workarounds_init(gt, wal);
1642         else if (GRAPHICS_VER(i915) == 12)
1643                 gen12_gt_workarounds_init(gt, wal);
1644         else if (GRAPHICS_VER(i915) == 11)
1645                 icl_gt_workarounds_init(gt, wal);
1646         else if (IS_COFFEELAKE(i915) || IS_COMETLAKE(i915))
1647                 cfl_gt_workarounds_init(gt, wal);
1648         else if (IS_GEMINILAKE(i915))
1649                 glk_gt_workarounds_init(gt, wal);
1650         else if (IS_KABYLAKE(i915))
1651                 kbl_gt_workarounds_init(gt, wal);
1652         else if (IS_BROXTON(i915))
1653                 gen9_gt_workarounds_init(gt, wal);
1654         else if (IS_SKYLAKE(i915))
1655                 skl_gt_workarounds_init(gt, wal);
1656         else if (IS_HASWELL(i915))
1657                 hsw_gt_workarounds_init(gt, wal);
1658         else if (IS_VALLEYVIEW(i915))
1659                 vlv_gt_workarounds_init(gt, wal);
1660         else if (IS_IVYBRIDGE(i915))
1661                 ivb_gt_workarounds_init(gt, wal);
1662         else if (GRAPHICS_VER(i915) == 6)
1663                 snb_gt_workarounds_init(gt, wal);
1664         else if (GRAPHICS_VER(i915) == 5)
1665                 ilk_gt_workarounds_init(gt, wal);
1666         else if (IS_G4X(i915))
1667                 g4x_gt_workarounds_init(gt, wal);
1668         else if (GRAPHICS_VER(i915) == 4)
1669                 gen4_gt_workarounds_init(gt, wal);
1670         else if (GRAPHICS_VER(i915) <= 8)
1671                 ;
1672         else
1673                 MISSING_CASE(GRAPHICS_VER(i915));
1674 }
1675
1676 void intel_gt_init_workarounds(struct intel_gt *gt)
1677 {
1678         struct i915_wa_list *wal = &gt->wa_list;
1679
1680         wa_init_start(wal, "GT", "global");
1681         gt_init_workarounds(gt, wal);
1682         wa_init_finish(wal);
1683 }
1684
1685 static enum forcewake_domains
1686 wal_get_fw_for_rmw(struct intel_uncore *uncore, const struct i915_wa_list *wal)
1687 {
1688         enum forcewake_domains fw = 0;
1689         struct i915_wa *wa;
1690         unsigned int i;
1691
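	/*
	 * Accumulate the forcewake domains needed to read and write every
	 * register on the list, so callers can take a single forcewake
	 * reference that covers the whole read-modify-write pass.
	 */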
1692         for (i = 0, wa = wal->list; i < wal->count; i++, wa++)
1693                 fw |= intel_uncore_forcewake_for_reg(uncore,
1694                                                      wa->reg,
1695                                                      FW_REG_READ |
1696                                                      FW_REG_WRITE);
1697
1698         return fw;
1699 }
1700
1701 static bool
1702 wa_verify(const struct i915_wa *wa, u32 cur, const char *name, const char *from)
1703 {
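	/*
	 * A workaround is reported as lost when any bit covered by the
	 * wa->read mask differs between the current register value and the
	 * value that was programmed (wa->set).
	 */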
1704         if ((cur ^ wa->set) & wa->read) {
1705                 DRM_ERROR("%s workaround lost on %s! (reg[%x]=0x%x, relevant bits were 0x%x vs expected 0x%x)\n",
1706                           name, from, i915_mmio_reg_offset(wa->reg),
1707                           cur, cur & wa->read, wa->set & wa->read);
1708
1709                 return false;
1710         }
1711
1712         return true;
1713 }
1714
1715 static void
1716 wa_list_apply(struct intel_gt *gt, const struct i915_wa_list *wal)
1717 {
1718         struct intel_uncore *uncore = gt->uncore;
1719         enum forcewake_domains fw;
1720         unsigned long flags;
1721         struct i915_wa *wa;
1722         unsigned int i;
1723
1724         if (!wal->count)
1725                 return;
1726
1727         fw = wal_get_fw_for_rmw(uncore, wal);
1728
1729         spin_lock_irqsave(&uncore->lock, flags);
1730         intel_uncore_forcewake_get__locked(uncore, fw);
1731
1732         for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
1733                 u32 val, old = 0;
1734
1735                 /* open-coded rmw due to steering */
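		/*
		 * The current value is only read back when there are bits to
		 * clear; entries without a clear mask (e.g. masked-register
		 * writes) are written out as-is with wa->set.
		 */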
1736                 if (wa->clr)
1737                         old = wa->is_mcr ?
1738                                 intel_gt_mcr_read_any_fw(gt, wa->mcr_reg) :
1739                                 intel_uncore_read_fw(uncore, wa->reg);
1740                 val = (old & ~wa->clr) | wa->set;
1741                 if (val != old || !wa->clr) {
1742                         if (wa->is_mcr)
1743                                 intel_gt_mcr_multicast_write_fw(gt, wa->mcr_reg, val);
1744                         else
1745                                 intel_uncore_write_fw(uncore, wa->reg, val);
1746                 }
1747
1748                 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) {
1749                         u32 val = wa->is_mcr ?
1750                                 intel_gt_mcr_read_any_fw(gt, wa->mcr_reg) :
1751                                 intel_uncore_read_fw(uncore, wa->reg);
1752
1753                         wa_verify(wa, val, wal->name, "application");
1754                 }
1755         }
1756
1757         intel_uncore_forcewake_put__locked(uncore, fw);
1758         spin_unlock_irqrestore(&uncore->lock, flags);
1759 }
1760
1761 void intel_gt_apply_workarounds(struct intel_gt *gt)
1762 {
1763         wa_list_apply(gt, &gt->wa_list);
1764 }
1765
1766 static bool wa_list_verify(struct intel_gt *gt,
1767                            const struct i915_wa_list *wal,
1768                            const char *from)
1769 {
1770         struct intel_uncore *uncore = gt->uncore;
1771         struct i915_wa *wa;
1772         enum forcewake_domains fw;
1773         unsigned long flags;
1774         unsigned int i;
1775         bool ok = true;
1776
1777         fw = wal_get_fw_for_rmw(uncore, wal);
1778
1779         spin_lock_irqsave(&uncore->lock, flags);
1780         intel_uncore_forcewake_get__locked(uncore, fw);
1781
1782         for (i = 0, wa = wal->list; i < wal->count; i++, wa++)
1783                 ok &= wa_verify(wa, wa->is_mcr ?
1784                                 intel_gt_mcr_read_any_fw(gt, wa->mcr_reg) :
1785                                 intel_uncore_read_fw(uncore, wa->reg),
1786                                 wal->name, from);
1787
1788         intel_uncore_forcewake_put__locked(uncore, fw);
1789         spin_unlock_irqrestore(&uncore->lock, flags);
1790
1791         return ok;
1792 }
1793
1794 bool intel_gt_verify_workarounds(struct intel_gt *gt, const char *from)
1795 {
1796         return wa_list_verify(gt, &gt->wa_list, from);
1797 }
1798
1799 __maybe_unused
1800 static bool is_nonpriv_flags_valid(u32 flags)
1801 {
1802         /* Check only valid flag bits are set */
1803         if (flags & ~RING_FORCE_TO_NONPRIV_MASK_VALID)
1804                 return false;
1805
1806         /* NB: Only 3 out of 4 enum values are valid for access field */
1807         if ((flags & RING_FORCE_TO_NONPRIV_ACCESS_MASK) ==
1808             RING_FORCE_TO_NONPRIV_ACCESS_INVALID)
1809                 return false;
1810
1811         return true;
1812 }
1813
1814 static void
1815 whitelist_reg_ext(struct i915_wa_list *wal, i915_reg_t reg, u32 flags)
1816 {
1817         struct i915_wa wa = {
1818                 .reg = reg
1819         };
1820
1821         if (GEM_DEBUG_WARN_ON(wal->count >= RING_MAX_NONPRIV_SLOTS))
1822                 return;
1823
1824         if (GEM_DEBUG_WARN_ON(!is_nonpriv_flags_valid(flags)))
1825                 return;
1826
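	/*
	 * The access/range flags live in bits of the RING_FORCE_TO_NONPRIV
	 * entry that are not used by the register offset itself, so they can
	 * simply be folded into the stored register value here.
	 */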
1827         wa.reg.reg |= flags;
1828         _wa_add(wal, &wa);
1829 }
1830
1831 static void
1832 whitelist_mcr_reg_ext(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 flags)
1833 {
1834         struct i915_wa wa = {
1835                 .mcr_reg = reg,
1836                 .is_mcr = 1,
1837         };
1838
1839         if (GEM_DEBUG_WARN_ON(wal->count >= RING_MAX_NONPRIV_SLOTS))
1840                 return;
1841
1842         if (GEM_DEBUG_WARN_ON(!is_nonpriv_flags_valid(flags)))
1843                 return;
1844
1845         wa.mcr_reg.reg |= flags;
1846         _wa_add(wal, &wa);
1847 }
1848
1849 static void
1850 whitelist_reg(struct i915_wa_list *wal, i915_reg_t reg)
1851 {
1852         whitelist_reg_ext(wal, reg, RING_FORCE_TO_NONPRIV_ACCESS_RW);
1853 }
1854
1855 static void
1856 whitelist_mcr_reg(struct i915_wa_list *wal, i915_mcr_reg_t reg)
1857 {
1858         whitelist_mcr_reg_ext(wal, reg, RING_FORCE_TO_NONPRIV_ACCESS_RW);
1859 }
1860
1861 static void gen9_whitelist_build(struct i915_wa_list *w)
1862 {
1863         /* WaVFEStateAfterPipeControlwithMediaStateClear:skl,bxt,glk,cfl */
1864         whitelist_reg(w, GEN9_CTX_PREEMPT_REG);
1865
1866         /* WaEnablePreemptionGranularityControlByUMD:skl,bxt,kbl,cfl,[cnl] */
1867         whitelist_reg(w, GEN8_CS_CHICKEN1);
1868
1869         /* WaAllowUMDToModifyHDCChicken1:skl,bxt,kbl,glk,cfl */
1870         whitelist_reg(w, GEN8_HDC_CHICKEN1);
1871
1872         /* WaSendPushConstantsFromMMIO:skl,bxt */
1873         whitelist_reg(w, COMMON_SLICE_CHICKEN2);
1874 }
1875
1876 static void skl_whitelist_build(struct intel_engine_cs *engine)
1877 {
1878         struct i915_wa_list *w = &engine->whitelist;
1879
1880         if (engine->class != RENDER_CLASS)
1881                 return;
1882
1883         gen9_whitelist_build(w);
1884
1885         /* WaDisableLSQCROPERFforOCL:skl */
1886         whitelist_mcr_reg(w, GEN8_L3SQCREG4);
1887 }
1888
1889 static void bxt_whitelist_build(struct intel_engine_cs *engine)
1890 {
1891         if (engine->class != RENDER_CLASS)
1892                 return;
1893
1894         gen9_whitelist_build(&engine->whitelist);
1895 }
1896
1897 static void kbl_whitelist_build(struct intel_engine_cs *engine)
1898 {
1899         struct i915_wa_list *w = &engine->whitelist;
1900
1901         if (engine->class != RENDER_CLASS)
1902                 return;
1903
1904         gen9_whitelist_build(w);
1905
1906         /* WaDisableLSQCROPERFforOCL:kbl */
1907         whitelist_mcr_reg(w, GEN8_L3SQCREG4);
1908 }
1909
1910 static void glk_whitelist_build(struct intel_engine_cs *engine)
1911 {
1912         struct i915_wa_list *w = &engine->whitelist;
1913
1914         if (engine->class != RENDER_CLASS)
1915                 return;
1916
1917         gen9_whitelist_build(w);
1918
1919         /* WA #0862: Userspace has to set "Barrier Mode" to avoid hangs. */
1920         whitelist_reg(w, GEN9_SLICE_COMMON_ECO_CHICKEN1);
1921 }
1922
1923 static void cfl_whitelist_build(struct intel_engine_cs *engine)
1924 {
1925         struct i915_wa_list *w = &engine->whitelist;
1926
1927         if (engine->class != RENDER_CLASS)
1928                 return;
1929
1930         gen9_whitelist_build(w);
1931
1932         /*
1933          * WaAllowPMDepthAndInvocationCountAccessFromUMD:cfl,whl,cml,aml
1934          *
1935          * This covers 4 registers which are next to one another:
1936          *   - PS_INVOCATION_COUNT
1937          *   - PS_INVOCATION_COUNT_UDW
1938          *   - PS_DEPTH_COUNT
1939          *   - PS_DEPTH_COUNT_UDW
1940          */
1941         whitelist_reg_ext(w, PS_INVOCATION_COUNT,
1942                           RING_FORCE_TO_NONPRIV_ACCESS_RD |
1943                           RING_FORCE_TO_NONPRIV_RANGE_4);
1944 }
1945
1946 static void allow_read_ctx_timestamp(struct intel_engine_cs *engine)
1947 {
1948         struct i915_wa_list *w = &engine->whitelist;
1949
1950         if (engine->class != RENDER_CLASS)
1951                 whitelist_reg_ext(w,
1952                                   RING_CTX_TIMESTAMP(engine->mmio_base),
1953                                   RING_FORCE_TO_NONPRIV_ACCESS_RD);
1954 }
1955
1956 static void cml_whitelist_build(struct intel_engine_cs *engine)
1957 {
1958         allow_read_ctx_timestamp(engine);
1959
1960         cfl_whitelist_build(engine);
1961 }
1962
1963 static void icl_whitelist_build(struct intel_engine_cs *engine)
1964 {
1965         struct i915_wa_list *w = &engine->whitelist;
1966
1967         allow_read_ctx_timestamp(engine);
1968
1969         switch (engine->class) {
1970         case RENDER_CLASS:
1971                 /* WaAllowUMDToModifyHalfSliceChicken7:icl */
1972                 whitelist_mcr_reg(w, GEN9_HALF_SLICE_CHICKEN7);
1973
1974                 /* WaAllowUMDToModifySamplerMode:icl */
1975                 whitelist_mcr_reg(w, GEN10_SAMPLER_MODE);
1976
1977                 /* WaEnableStateCacheRedirectToCS:icl */
1978                 whitelist_reg(w, GEN9_SLICE_COMMON_ECO_CHICKEN1);
1979
1980                 /*
1981                  * WaAllowPMDepthAndInvocationCountAccessFromUMD:icl
1982                  *
1983          * This covers 4 registers which are next to one another:
1984                  *   - PS_INVOCATION_COUNT
1985                  *   - PS_INVOCATION_COUNT_UDW
1986                  *   - PS_DEPTH_COUNT
1987                  *   - PS_DEPTH_COUNT_UDW
1988                  */
1989                 whitelist_reg_ext(w, PS_INVOCATION_COUNT,
1990                                   RING_FORCE_TO_NONPRIV_ACCESS_RD |
1991                                   RING_FORCE_TO_NONPRIV_RANGE_4);
1992                 break;
1993
1994         case VIDEO_DECODE_CLASS:
1995                 /* hucStatusRegOffset */
1996                 whitelist_reg_ext(w, _MMIO(0x2000 + engine->mmio_base),
1997                                   RING_FORCE_TO_NONPRIV_ACCESS_RD);
1998                 /* hucUKernelHdrInfoRegOffset */
1999                 whitelist_reg_ext(w, _MMIO(0x2014 + engine->mmio_base),
2000                                   RING_FORCE_TO_NONPRIV_ACCESS_RD);
2001                 /* hucStatus2RegOffset */
2002                 whitelist_reg_ext(w, _MMIO(0x23B0 + engine->mmio_base),
2003                                   RING_FORCE_TO_NONPRIV_ACCESS_RD);
2004                 break;
2005
2006         default:
2007                 break;
2008         }
2009 }
2010
2011 static void tgl_whitelist_build(struct intel_engine_cs *engine)
2012 {
2013         struct i915_wa_list *w = &engine->whitelist;
2014
2015         allow_read_ctx_timestamp(engine);
2016
2017         switch (engine->class) {
2018         case RENDER_CLASS:
2019                 /*
2020                  * WaAllowPMDepthAndInvocationCountAccessFromUMD:tgl
2021                  * Wa_1408556865:tgl
2022                  *
2023          * This covers 4 registers which are next to one another:
2024                  *   - PS_INVOCATION_COUNT
2025                  *   - PS_INVOCATION_COUNT_UDW
2026                  *   - PS_DEPTH_COUNT
2027                  *   - PS_DEPTH_COUNT_UDW
2028                  */
2029                 whitelist_reg_ext(w, PS_INVOCATION_COUNT,
2030                                   RING_FORCE_TO_NONPRIV_ACCESS_RD |
2031                                   RING_FORCE_TO_NONPRIV_RANGE_4);
2032
2033                 /*
2034                  * Wa_1808121037:tgl
2035                  * Wa_14012131227:dg1
2036                  * Wa_1508744258:tgl,rkl,dg1,adl-s,adl-p
2037                  */
2038                 whitelist_reg(w, GEN7_COMMON_SLICE_CHICKEN1);
2039
2040                 /* Wa_1806527549:tgl */
2041                 whitelist_reg(w, HIZ_CHICKEN);
2042                 break;
2043         default:
2044                 break;
2045         }
2046 }
2047
2048 static void dg1_whitelist_build(struct intel_engine_cs *engine)
2049 {
2050         struct i915_wa_list *w = &engine->whitelist;
2051
2052         tgl_whitelist_build(engine);
2053
2054         /* GEN:BUG:1409280441:dg1 */
2055         if (IS_DG1_GRAPHICS_STEP(engine->i915, STEP_A0, STEP_B0) &&
2056             (engine->class == RENDER_CLASS ||
2057              engine->class == COPY_ENGINE_CLASS))
2058                 whitelist_reg_ext(w, RING_ID(engine->mmio_base),
2059                                   RING_FORCE_TO_NONPRIV_ACCESS_RD);
2060 }
2061
2062 static void xehpsdv_whitelist_build(struct intel_engine_cs *engine)
2063 {
2064         allow_read_ctx_timestamp(engine);
2065 }
2066
2067 static void dg2_whitelist_build(struct intel_engine_cs *engine)
2068 {
2069         struct i915_wa_list *w = &engine->whitelist;
2070
2071         allow_read_ctx_timestamp(engine);
2072
2073         switch (engine->class) {
2074         case RENDER_CLASS:
2075                 /*
2076                  * Wa_1507100340:dg2_g10
2077                  *
2078          * This covers 4 registers which are next to one another:
2079                  *   - PS_INVOCATION_COUNT
2080                  *   - PS_INVOCATION_COUNT_UDW
2081                  *   - PS_DEPTH_COUNT
2082                  *   - PS_DEPTH_COUNT_UDW
2083                  */
2084                 if (IS_DG2_GRAPHICS_STEP(engine->i915, G10, STEP_A0, STEP_B0))
2085                         whitelist_reg_ext(w, PS_INVOCATION_COUNT,
2086                                           RING_FORCE_TO_NONPRIV_ACCESS_RD |
2087                                           RING_FORCE_TO_NONPRIV_RANGE_4);
2088
2089                 break;
2090         case COMPUTE_CLASS:
2091                 /* Wa_16011157294:dg2_g10 */
2092                 if (IS_DG2_GRAPHICS_STEP(engine->i915, G10, STEP_A0, STEP_B0))
2093                         whitelist_reg(w, GEN9_CTX_PREEMPT_REG);
2094                 break;
2095         default:
2096                 break;
2097         }
2098 }
2099
2100 static void blacklist_trtt(struct intel_engine_cs *engine)
2101 {
2102         struct i915_wa_list *w = &engine->whitelist;
2103
2104         /*
2105          * Prevent read/write access to [0x4400, 0x4600) which covers
2106          * the TRTT range across all engines. Note that normally userspace
2107          * cannot access the other engines' trtt control, but for simplicity
2108          * we cover the entire range on each engine.
2109          */
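	/*
	 * Each RANGE_64 entry below spans 64 dword registers (0x100 bytes),
	 * so the two entries starting at 0x4400 and 0x4500 together deny the
	 * full 0x200 bytes of the TRTT range.
	 */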
2110         whitelist_reg_ext(w, _MMIO(0x4400),
2111                           RING_FORCE_TO_NONPRIV_DENY |
2112                           RING_FORCE_TO_NONPRIV_RANGE_64);
2113         whitelist_reg_ext(w, _MMIO(0x4500),
2114                           RING_FORCE_TO_NONPRIV_DENY |
2115                           RING_FORCE_TO_NONPRIV_RANGE_64);
2116 }
2117
2118 static void pvc_whitelist_build(struct intel_engine_cs *engine)
2119 {
2120         allow_read_ctx_timestamp(engine);
2121
2122         /* Wa_16014440446:pvc */
2123         blacklist_trtt(engine);
2124 }
2125
2126 void intel_engine_init_whitelist(struct intel_engine_cs *engine)
2127 {
2128         struct drm_i915_private *i915 = engine->i915;
2129         struct i915_wa_list *w = &engine->whitelist;
2130
2131         wa_init_start(w, "whitelist", engine->name);
2132
2133         if (IS_PONTEVECCHIO(i915))
2134                 pvc_whitelist_build(engine);
2135         else if (IS_DG2(i915))
2136                 dg2_whitelist_build(engine);
2137         else if (IS_XEHPSDV(i915))
2138                 xehpsdv_whitelist_build(engine);
2139         else if (IS_DG1(i915))
2140                 dg1_whitelist_build(engine);
2141         else if (GRAPHICS_VER(i915) == 12)
2142                 tgl_whitelist_build(engine);
2143         else if (GRAPHICS_VER(i915) == 11)
2144                 icl_whitelist_build(engine);
2145         else if (IS_COMETLAKE(i915))
2146                 cml_whitelist_build(engine);
2147         else if (IS_COFFEELAKE(i915))
2148                 cfl_whitelist_build(engine);
2149         else if (IS_GEMINILAKE(i915))
2150                 glk_whitelist_build(engine);
2151         else if (IS_KABYLAKE(i915))
2152                 kbl_whitelist_build(engine);
2153         else if (IS_BROXTON(i915))
2154                 bxt_whitelist_build(engine);
2155         else if (IS_SKYLAKE(i915))
2156                 skl_whitelist_build(engine);
2157         else if (GRAPHICS_VER(i915) <= 8)
2158                 ;
2159         else
2160                 MISSING_CASE(GRAPHICS_VER(i915));
2161
2162         wa_init_finish(w);
2163 }
2164
2165 void intel_engine_apply_whitelist(struct intel_engine_cs *engine)
2166 {
2167         const struct i915_wa_list *wal = &engine->whitelist;
2168         struct intel_uncore *uncore = engine->uncore;
2169         const u32 base = engine->mmio_base;
2170         struct i915_wa *wa;
2171         unsigned int i;
2172
2173         if (!wal->count)
2174                 return;
2175
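	/*
	 * Program one RING_FORCE_TO_NONPRIV slot per whitelisted register;
	 * the stored offset already carries the access/range flags encoded
	 * by whitelist_reg_ext().
	 */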
2176         for (i = 0, wa = wal->list; i < wal->count; i++, wa++)
2177                 intel_uncore_write(uncore,
2178                                    RING_FORCE_TO_NONPRIV(base, i),
2179                                    i915_mmio_reg_offset(wa->reg));
2180
2181         /* And clear the rest just in case of garbage */
2182         for (; i < RING_MAX_NONPRIV_SLOTS; i++)
2183                 intel_uncore_write(uncore,
2184                                    RING_FORCE_TO_NONPRIV(base, i),
2185                                    i915_mmio_reg_offset(RING_NOPID(base)));
2186 }
2187
2188 /*
2189  * engine_fake_wa_init() is a placeholder for programming registers
2190  * that are not part of an official workaround defined by the
2191  * hardware team.
2192  * Adding the programming of those registers to a workaround list lets
2193  * us reuse the wa framework for proper application and verification.
2194  */
2195 static void
2196 engine_fake_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2197 {
2198         u8 mocs_w, mocs_r;
2199
2200         /*
2201          * RING_CMD_CCTL specifies the default MOCS entry that will be used
2202          * by the command streamer when executing commands that don't have
2203          * a way to explicitly specify a MOCS setting.  The default should
2204          * usually reference whichever MOCS entry corresponds to uncached
2205          * behavior, although use of a WB cached entry is recommended by the
2206          * spec in certain circumstances on specific platforms.
2207          */
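	/*
	 * As a rough illustration (the index value here is just an example,
	 * not from bspec): if the uncached MOCS entry were index 3, the
	 * programming below would amount to
	 *
	 *   wa_masked_field_set(wal, RING_CMD_CCTL(engine->mmio_base),
	 *                       CMD_CCTL_MOCS_MASK,
	 *                       CMD_CCTL_MOCS_OVERRIDE(3, 3));
	 *
	 * i.e. both the default read and write MOCS fields point at the
	 * uncached entry, unless the L3-CCS-read special case applies.
	 */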
2208         if (GRAPHICS_VER(engine->i915) >= 12) {
2209                 mocs_r = engine->gt->mocs.uc_index;
2210                 mocs_w = engine->gt->mocs.uc_index;
2211
2212                 if (HAS_L3_CCS_READ(engine->i915) &&
2213                     engine->class == COMPUTE_CLASS) {
2214                         mocs_r = engine->gt->mocs.wb_index;
2215
2216                         /*
2217                          * Even on the few platforms where MOCS 0 is a
2218                          * legitimate table entry, it's never the correct
2219                          * setting to use here; we can assume the MOCS init
2220                          * just forgot to initialize wb_index.
2221                          */
2222                         drm_WARN_ON(&engine->i915->drm, mocs_r == 0);
2223                 }
2224
2225                 wa_masked_field_set(wal,
2226                                     RING_CMD_CCTL(engine->mmio_base),
2227                                     CMD_CCTL_MOCS_MASK,
2228                                     CMD_CCTL_MOCS_OVERRIDE(mocs_w, mocs_r));
2229         }
2230 }
2231
2232 static bool needs_wa_1308578152(struct intel_engine_cs *engine)
2233 {
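	/*
	 * Wa_1308578152 only applies when the first gslice is fused off,
	 * i.e. when the first available DSS index falls in the second (or a
	 * later) gslice.
	 */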
2234         return intel_sseu_find_first_xehp_dss(&engine->gt->info.sseu, 0, 0) >=
2235                 GEN_DSS_PER_GSLICE;
2236 }
2237
2238 static void
2239 rcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2240 {
2241         struct drm_i915_private *i915 = engine->i915;
2242
2243         if (IS_DG2(i915)) {
2244                 /* Wa_1509235366:dg2 */
2245                 wa_write_or(wal, GEN12_GAMCNTRL_CTRL, INVALIDATION_BROADCAST_MODE_DIS |
2246                             GLOBAL_INVALIDATION_MODE);
2247         }
2248
2249         if (IS_DG2_GRAPHICS_STEP(i915, G11, STEP_A0, STEP_B0)) {
2250                 /* Wa_14013392000:dg2_g11 */
2251                 wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN2, GEN12_ENABLE_LARGE_GRF_MODE);
2252         }
2253
2254         if (IS_DG2_GRAPHICS_STEP(i915, G10, STEP_B0, STEP_FOREVER) ||
2255             IS_DG2_G11(i915) || IS_DG2_G12(i915)) {
2256                 /* Wa_1509727124:dg2 */
2257                 wa_mcr_masked_en(wal, GEN10_SAMPLER_MODE,
2258                                  SC_DISABLE_POWER_OPTIMIZATION_EBB);
2259         }
2260
2261         if (IS_DG2_GRAPHICS_STEP(i915, G10, STEP_A0, STEP_B0) ||
2262             IS_DG2_GRAPHICS_STEP(i915, G11, STEP_A0, STEP_B0)) {
2263                 /* Wa_14012419201:dg2 */
2264                 wa_mcr_masked_en(wal, GEN9_ROW_CHICKEN4,
2265                                  GEN12_DISABLE_HDR_PAST_PAYLOAD_HOLD_FIX);
2266         }
2267
2268         if (IS_DG2_GRAPHICS_STEP(i915, G10, STEP_B0, STEP_C0) ||
2269             IS_DG2_G11(i915)) {
2270                 /*
2271                  * Wa_22012826095:dg2
2272                  * Wa_22013059131:dg2
2273                  */
2274                 wa_mcr_write_clr_set(wal, LSC_CHICKEN_BIT_0_UDW,
2275                                      MAXREQS_PER_BANK,
2276                                      REG_FIELD_PREP(MAXREQS_PER_BANK, 2));
2277
2278                 /* Wa_22013059131:dg2 */
2279                 wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0,
2280                                 FORCE_1_SUB_MESSAGE_PER_FRAGMENT);
2281         }
2282
2283         /* Wa_1308578152:dg2_g10 when first gslice is fused off */
2284         if (IS_DG2_GRAPHICS_STEP(i915, G10, STEP_B0, STEP_C0) &&
2285             needs_wa_1308578152(engine)) {
2286                 wa_masked_dis(wal, GEN12_CS_DEBUG_MODE1_CCCSUNIT_BE_COMMON,
2287                               GEN12_REPLAY_MODE_GRANULARITY);
2288         }
2289
2290         if (IS_DG2_GRAPHICS_STEP(i915, G10, STEP_B0, STEP_FOREVER) ||
2291             IS_DG2_G11(i915) || IS_DG2_G12(i915)) {
2292                 /* Wa_22013037850:dg2 */
2293                 wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0_UDW,
2294                                 DISABLE_128B_EVICTION_COMMAND_UDW);
2295
2296                 /* Wa_22012856258:dg2 */
2297                 wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN2,
2298                                  GEN12_DISABLE_READ_SUPPRESSION);
2299
2300                 /*
2301                  * Wa_22010960976:dg2
2302                  * Wa_14013347512:dg2
2303                  */
2304                 wa_mcr_masked_dis(wal, XEHP_HDC_CHICKEN0,
2305                                   LSC_L1_FLUSH_CTL_3D_DATAPORT_FLUSH_EVENTS_MASK);
2306         }
2307
2308         if (IS_DG2_GRAPHICS_STEP(i915, G10, STEP_A0, STEP_B0)) {
2309                 /*
2310                  * Wa_1608949956:dg2_g10
2311                  * Wa_14010198302:dg2_g10
2312                  */
2313                 wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN,
2314                                  MDQ_ARBITRATION_MODE | UGM_BACKUP_MODE);
2315
2316                 /*
2317                  * Wa_14010918519:dg2_g10
2318                  *
2319                  * LSC_CHICKEN_BIT_0 always reads back as 0 in this stepping,
2320                  * so ignoring verification.
2321                  */
2322                 wa_mcr_add(wal, LSC_CHICKEN_BIT_0_UDW, 0,
2323                            FORCE_SLM_FENCE_SCOPE_TO_TILE | FORCE_UGM_FENCE_SCOPE_TO_TILE,
2324                            0, false);
2325         }
2326
2327         if (IS_DG2_GRAPHICS_STEP(i915, G10, STEP_A0, STEP_B0)) {
2328                 /* Wa_22010430635:dg2 */
2329                 wa_mcr_masked_en(wal,
2330                                  GEN9_ROW_CHICKEN4,
2331                                  GEN12_DISABLE_GRF_CLEAR);
2332
2333                 /* Wa_14010648519:dg2 */
2334                 wa_mcr_write_or(wal, XEHP_L3NODEARBCFG, XEHP_LNESPARE);
2335         }
2336
2337         /* Wa_14013202645:dg2 */
2338         if (IS_DG2_GRAPHICS_STEP(i915, G10, STEP_B0, STEP_C0) ||
2339             IS_DG2_GRAPHICS_STEP(i915, G11, STEP_A0, STEP_B0))
2340                 wa_mcr_write_or(wal, RT_CTRL, DIS_NULL_QUERY);
2341
2342         /* Wa_22012532006:dg2 */
2343         if (IS_DG2_GRAPHICS_STEP(engine->i915, G10, STEP_A0, STEP_C0) ||
2344             IS_DG2_GRAPHICS_STEP(engine->i915, G11, STEP_A0, STEP_B0))
2345                 wa_mcr_masked_en(wal, GEN9_HALF_SLICE_CHICKEN7,
2346                                  DG2_DISABLE_ROUND_ENABLE_ALLOW_FOR_SSLA);
2347
2348         if (IS_DG2_GRAPHICS_STEP(engine->i915, G10, STEP_A0, STEP_B0)) {
2349                 /* Wa_14010680813:dg2_g10 */
2350                 wa_write_or(wal, GEN12_GAMSTLB_CTRL, CONTROL_BLOCK_CLKGATE_DIS |
2351                             EGRESS_BLOCK_CLKGATE_DIS | TAG_BLOCK_CLKGATE_DIS);
2352         }
2353
2354         if (IS_DG2_GRAPHICS_STEP(engine->i915, G10, STEP_A0, STEP_B0) ||
2355             IS_DG2_GRAPHICS_STEP(engine->i915, G11, STEP_A0, STEP_B0)) {
2356                 /* Wa_14012362059:dg2 */
2357                 wa_mcr_write_or(wal, XEHP_MERT_MOD_CTRL, FORCE_MISS_FTLB);
2358         }
2359
2360         if (IS_DG2_GRAPHICS_STEP(i915, G11, STEP_B0, STEP_FOREVER) ||
2361             IS_DG2_G10(i915)) {
2362                 /* Wa_22014600077:dg2 */
2363                 wa_mcr_add(wal, GEN10_CACHE_MODE_SS, 0,
2364                            _MASKED_BIT_ENABLE(ENABLE_EU_COUNT_FOR_TDL_FLUSH),
2365                            0 /* Wa_14012342262 write-only reg, so skip verification */,
2366                            true);
2367         }
2368
2369         if (IS_DG1_GRAPHICS_STEP(i915, STEP_A0, STEP_B0) ||
2370             IS_TGL_UY_GRAPHICS_STEP(i915, STEP_A0, STEP_B0)) {
2371                 /*
2372                  * Wa_1607138336:tgl[a0],dg1[a0]
2373                  * Wa_1607063988:tgl[a0],dg1[a0]
2374                  */
2375                 wa_write_or(wal,
2376                             GEN9_CTX_PREEMPT_REG,
2377                             GEN12_DISABLE_POSH_BUSY_FF_DOP_CG);
2378         }
2379
2380         if (IS_TGL_UY_GRAPHICS_STEP(i915, STEP_A0, STEP_B0)) {
2381                 /*
2382                  * Wa_1606679103:tgl
2383                  * (see also Wa_1606682166:icl)
2384                  */
2385                 wa_write_or(wal,
2386                             GEN7_SARCHKMD,
2387                             GEN7_DISABLE_SAMPLER_PREFETCH);
2388         }
2389
2390         if (IS_ALDERLAKE_P(i915) || IS_ALDERLAKE_S(i915) || IS_DG1(i915) ||
2391             IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915)) {
2392                 /* Wa_1606931601:tgl,rkl,dg1,adl-s,adl-p */
2393                 wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN2, GEN12_DISABLE_EARLY_READ);
2394
2395                 /*
2396                  * Wa_1407928979:tgl A*
2397                  * Wa_18011464164:tgl[B0+],dg1[B0+]
2398                  * Wa_22010931296:tgl[B0+],dg1[B0+]
2399                  * Wa_14010919138:rkl,dg1,adl-s,adl-p
2400                  */
2401                 wa_write_or(wal, GEN7_FF_THREAD_MODE,
2402                             GEN12_FF_TESSELATION_DOP_GATE_DISABLE);
2403         }
2404
2405         if (IS_ALDERLAKE_P(i915) || IS_DG2(i915) || IS_ALDERLAKE_S(i915) ||
2406             IS_DG1(i915) || IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915)) {
2407                 /*
2408                  * Wa_1606700617:tgl,dg1,adl-p
2409                  * Wa_22010271021:tgl,rkl,dg1,adl-s,adl-p
2410                  * Wa_14010826681:tgl,dg1,rkl,adl-p
2411                  * Wa_18019627453:dg2
2412                  */
2413                 wa_masked_en(wal,
2414                              GEN9_CS_DEBUG_MODE1,
2415                              FF_DOP_CLOCK_GATE_DISABLE);
2416         }
2417
2418         if (IS_ALDERLAKE_P(i915) || IS_ALDERLAKE_S(i915) ||
2419             IS_DG1_GRAPHICS_STEP(i915, STEP_A0, STEP_B0) ||
2420             IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915)) {
2421                 /* Wa_1409804808:tgl,rkl,dg1[a0],adl-s,adl-p */
2422                 wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN2,
2423                                  GEN12_PUSH_CONST_DEREF_HOLD_DIS);
2424
2425                 /*
2426                  * Wa_1409085225:tgl
2427                  * Wa_14010229206:tgl,rkl,dg1[a0],adl-s,adl-p
2428                  */
2429                 wa_mcr_masked_en(wal, GEN9_ROW_CHICKEN4, GEN12_DISABLE_TDL_PUSH);
2430         }
2431
2432         if (IS_DG1_GRAPHICS_STEP(i915, STEP_A0, STEP_B0) ||
2433             IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915) || IS_ALDERLAKE_P(i915)) {
2434                 /*
2435                  * Wa_1607030317:tgl
2436                  * Wa_1607186500:tgl
2437                  * Wa_1607297627:tgl,rkl,dg1[a0],adlp
2438                  *
2439                  * On TGL and RKL there are multiple entries for this WA in the
2440                  * BSpec; some indicate this is an A0-only WA, others indicate
2441                  * it applies to all steppings so we trust the "all steppings."
2442                  * For DG1 this only applies to A0.
2443                  */
2444                 wa_masked_en(wal,
2445                              RING_PSMI_CTL(RENDER_RING_BASE),
2446                              GEN12_WAIT_FOR_EVENT_POWER_DOWN_DISABLE |
2447                              GEN8_RC_SEMA_IDLE_MSG_DISABLE);
2448         }
2449
2450         if (IS_DG1(i915) || IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915) ||
2451             IS_ALDERLAKE_S(i915) || IS_ALDERLAKE_P(i915)) {
2452                 /* Wa_1406941453:tgl,rkl,dg1,adl-s,adl-p */
2453                 wa_mcr_masked_en(wal,
2454                                  GEN10_SAMPLER_MODE,
2455                                  ENABLE_SMALLPL);
2456         }
2457
2458         if (GRAPHICS_VER(i915) == 11) {
2459                 /* This is not a Wa. Enable for better image quality */
2460                 wa_masked_en(wal,
2461                              _3D_CHICKEN3,
2462                              _3D_CHICKEN3_AA_LINE_QUALITY_FIX_ENABLE);
2463
2464                 /*
2465                  * Wa_1405543622:icl
2466                  * Formerly known as WaGAPZPriorityScheme
2467                  */
2468                 wa_write_or(wal,
2469                             GEN8_GARBCNTL,
2470                             GEN11_ARBITRATION_PRIO_ORDER_MASK);
2471
2472                 /*
2473                  * Wa_1604223664:icl
2474                  * Formerly known as WaL3BankAddressHashing
2475                  */
2476                 wa_write_clr_set(wal,
2477                                  GEN8_GARBCNTL,
2478                                  GEN11_HASH_CTRL_EXCL_MASK,
2479                                  GEN11_HASH_CTRL_EXCL_BIT0);
2480                 wa_write_clr_set(wal,
2481                                  GEN11_GLBLINVL,
2482                                  GEN11_BANK_HASH_ADDR_EXCL_MASK,
2483                                  GEN11_BANK_HASH_ADDR_EXCL_BIT0);
2484
2485                 /*
2486                  * Wa_1405733216:icl
2487                  * Formerly known as WaDisableCleanEvicts
2488                  */
2489                 wa_mcr_write_or(wal,
2490                                 GEN8_L3SQCREG4,
2491                                 GEN11_LQSC_CLEAN_EVICT_DISABLE);
2492
2493                 /* Wa_1606682166:icl */
2494                 wa_write_or(wal,
2495                             GEN7_SARCHKMD,
2496                             GEN7_DISABLE_SAMPLER_PREFETCH);
2497
2498                 /* Wa_1409178092:icl */
2499                 wa_mcr_write_clr_set(wal,
2500                                      GEN11_SCRATCH2,
2501                                      GEN11_COHERENT_PARTIAL_WRITE_MERGE_ENABLE,
2502                                      0);
2503
2504                 /* WaEnable32PlaneMode:icl */
2505                 wa_masked_en(wal, GEN9_CSFE_CHICKEN1_RCS,
2506                              GEN11_ENABLE_32_PLANE_MODE);
2507
2508                 /*
2509                  * Wa_1408615072:icl,ehl  (vsunit)
2510                  * Wa_1407596294:icl,ehl  (hsunit)
2511                  */
2512                 wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE,
2513                             VSUNIT_CLKGATE_DIS | HSUNIT_CLKGATE_DIS);
2514
2515                 /*
2516                  * Wa_1408767742:icl[a2..forever],ehl[all]
2517                  * Wa_1605460711:icl[a0..c0]
2518                  */
2519                 wa_write_or(wal,
2520                             GEN7_FF_THREAD_MODE,
2521                             GEN12_FF_TESSELATION_DOP_GATE_DISABLE);
2522
2523                 /* Wa_22010271021 */
2524                 wa_masked_en(wal,
2525                              GEN9_CS_DEBUG_MODE1,
2526                              FF_DOP_CLOCK_GATE_DISABLE);
2527         }
2528
2529         /*
2530          * Intel platforms that support fine-grained preemption (i.e., gen9 and
2531          * beyond) allow the kernel-mode driver to choose between two different
2532          * options for controlling preemption granularity and behavior.
2533          *
2534          * Option 1 (hardware default):
2535          *   Preemption settings are controlled in a global manner via
2536          *   kernel-only register CS_DEBUG_MODE1 (0x20EC).  Any granularity
2537          *   and settings chosen by the kernel-mode driver will apply to all
2538          *   userspace clients.
2539          *
2540          * Option 2:
2541          *   Preemption settings are controlled on a per-context basis via
2542          *   register CS_CHICKEN1 (0x2580).  CS_CHICKEN1 is saved/restored on
2543          *   context switch and is writable by userspace (e.g., via
2544          *   MI_LOAD_REGISTER_IMMEDIATE instructions placed in a batch buffer)
2545          *   which allows different userspace drivers/clients to select
2546          *   different settings, or to change those settings on the fly in
2547          *   response to runtime needs.  This option was known by name
2548          *   "FtrPerCtxtPreemptionGranularityControl" at one time, although
2549          *   that name is somewhat misleading as other non-granularity
2550          *   preemption settings are also impacted by this decision.
2551          *
2552          * On Linux, our policy has always been to let userspace drivers
2553          * control preemption granularity/settings (Option 2).  This was
2554          * originally mandatory on gen9 to prevent ABI breakage (old gen9
2555          * userspace developed before object-level preemption was enabled would
2556          * not behave well if i915 were to go with Option 1 and enable that
2557          * preemption in a global manner).  On gen9 each context would have
2558          * object-level preemption disabled by default (see
2559          * WaDisable3DMidCmdPreemption in gen9_ctx_workarounds_init), but
2560          * userspace drivers could opt-in to object-level preemption as they
2561          * saw fit.  For post-gen9 platforms, we continue to utilize Option 2;
2562          * even though it is no longer necessary for ABI compatibility when
2563          * enabling a new platform, it does ensure that userspace will be able
2564          * to implement any workarounds that show up requiring temporary
2565          * adjustments to preemption behavior at runtime.
2566          *
2567          * Notes/Workarounds:
2568          *  - Wa_14015141709:  On DG2 and early steppings of MTL,
2569          *      CS_CHICKEN1[0] does not disable object-level preemption as
2570          *      it is supposed to (nor does CS_DEBUG_MODE1[0] if we had been
2571          *      using Option 1).  Effectively this means userspace is unable
2572          *      to disable object-level preemption on these platforms/steppings
2573          *      despite the setting here.
2574          *
2575          *  - Wa_16013994831:  May require that userspace program
2576          *      CS_CHICKEN1[10] when certain runtime conditions are true.
2577          *      Userspace requires Option 2 to be in effect for their update of
2578          *      CS_CHICKEN1[10] to be effective.
2579          *
2580          * Other workarounds may appear in the future that will also require
2581          * Option 2 behavior to allow proper userspace implementation.
2582          */
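	/*
	 * Purely illustrative sketch (not an i915 interface): with Option 2
	 * in effect, a userspace driver could adjust its per-context
	 * preemption settings from a batch buffer with something like
	 *
	 *   MI_LOAD_REGISTER_IMM(1)
	 *   0x2580                        (CS_CHICKEN1)
	 *   <masked-write value for the desired chicken bit>
	 *
	 * and the new value would then be saved/restored as part of the
	 * context image.
	 */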
2583         if (GRAPHICS_VER(i915) >= 9)
2584                 wa_masked_en(wal,
2585                              GEN7_FF_SLICE_CS_CHICKEN1,
2586                              GEN9_FFSC_PERCTX_PREEMPT_CTRL);
2587
2588         if (IS_SKYLAKE(i915) ||
2589             IS_KABYLAKE(i915) ||
2590             IS_COFFEELAKE(i915) ||
2591             IS_COMETLAKE(i915)) {
2592                 /* WaEnableGapsTsvCreditFix:skl,kbl,cfl */
2593                 wa_write_or(wal,
2594                             GEN8_GARBCNTL,
2595                             GEN9_GAPS_TSV_CREDIT_DISABLE);
2596         }
2597
2598         if (IS_BROXTON(i915)) {
2599                 /* WaDisablePooledEuLoadBalancingFix:bxt */
2600                 wa_masked_en(wal,
2601                              FF_SLICE_CS_CHICKEN2,
2602                              GEN9_POOLED_EU_LOAD_BALANCING_FIX_DISABLE);
2603         }
2604
2605         if (GRAPHICS_VER(i915) == 9) {
2606                 /* WaContextSwitchWithConcurrentTLBInvalidate:skl,bxt,kbl,glk,cfl */
2607                 wa_masked_en(wal,
2608                              GEN9_CSFE_CHICKEN1_RCS,
2609                              GEN9_PREEMPT_GPGPU_SYNC_SWITCH_DISABLE);
2610
2611                 /* WaEnableLbsSlaRetryTimerDecrement:skl,bxt,kbl,glk,cfl */
2612                 wa_mcr_write_or(wal,
2613                                 BDW_SCRATCH1,
2614                                 GEN9_LBS_SLA_RETRY_TIMER_DECREMENT_ENABLE);
2615
2616                 /* WaProgramL3SqcReg1DefaultForPerf:bxt,glk */
2617                 if (IS_GEN9_LP(i915))
2618                         wa_mcr_write_clr_set(wal,
2619                                              GEN8_L3SQCREG1,
2620                                              L3_PRIO_CREDITS_MASK,
2621                                              L3_GENERAL_PRIO_CREDITS(62) |
2622                                              L3_HIGH_PRIO_CREDITS(2));
2623
2624                 /* WaOCLCoherentLineFlush:skl,bxt,kbl,cfl */
2625                 wa_mcr_write_or(wal,
2626                                 GEN8_L3SQCREG4,
2627                                 GEN8_LQSC_FLUSH_COHERENT_LINES);
2628
2629                 /* Disable atomics in L3 to prevent unrecoverable hangs */
2630                 wa_write_clr_set(wal, GEN9_SCRATCH_LNCF1,
2631                                  GEN9_LNCF_NONIA_COHERENT_ATOMICS_ENABLE, 0);
2632                 wa_mcr_write_clr_set(wal, GEN8_L3SQCREG4,
2633                                      GEN8_LQSQ_NONIA_COHERENT_ATOMICS_ENABLE, 0);
2634                 wa_mcr_write_clr_set(wal, GEN9_SCRATCH1,
2635                                      EVICTION_PERF_FIX_ENABLE, 0);
2636         }
2637
2638         if (IS_HASWELL(i915)) {
2639                 /* WaSampleCChickenBitEnable:hsw */
2640                 wa_masked_en(wal,
2641                              HSW_HALF_SLICE_CHICKEN3, HSW_SAMPLE_C_PERFORMANCE);
2642
2643                 wa_masked_dis(wal,
2644                               CACHE_MODE_0_GEN7,
2645                               /* enable HiZ Raw Stall Optimization */
2646                               HIZ_RAW_STALL_OPT_DISABLE);
2647         }
2648
2649         if (IS_VALLEYVIEW(i915)) {
2650                 /* WaDisableEarlyCull:vlv */
2651                 wa_masked_en(wal,
2652                              _3D_CHICKEN3,
2653                              _3D_CHICKEN_SF_DISABLE_OBJEND_CULL);
2654
2655                 /*
2656                  * WaVSThreadDispatchOverride:ivb,vlv
2657                  *
2658                  * This actually overrides the dispatch
2659                  * mode for all thread types.
2660                  */
2661                 wa_write_clr_set(wal,
2662                                  GEN7_FF_THREAD_MODE,
2663                                  GEN7_FF_SCHED_MASK,
2664                                  GEN7_FF_TS_SCHED_HW |
2665                                  GEN7_FF_VS_SCHED_HW |
2666                                  GEN7_FF_DS_SCHED_HW);
2667
2668                 /* WaPsdDispatchEnable:vlv */
2669                 /* WaDisablePSDDualDispatchEnable:vlv */
2670                 wa_masked_en(wal,
2671                              GEN7_HALF_SLICE_CHICKEN1,
2672                              GEN7_MAX_PS_THREAD_DEP |
2673                              GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE);
2674         }
2675
2676         if (IS_IVYBRIDGE(i915)) {
2677                 /* WaDisableEarlyCull:ivb */
2678                 wa_masked_en(wal,
2679                              _3D_CHICKEN3,
2680                              _3D_CHICKEN_SF_DISABLE_OBJEND_CULL);
2681
2682                 if (0) { /* causes HiZ corruption on ivb:gt1 */
2683                         /* enable HiZ Raw Stall Optimization */
2684                         wa_masked_dis(wal,
2685                                       CACHE_MODE_0_GEN7,
2686                                       HIZ_RAW_STALL_OPT_DISABLE);
2687                 }
2688
2689                 /*
2690                  * WaVSThreadDispatchOverride:ivb,vlv
2691                  *
2692                  * This actually overrides the dispatch
2693                  * mode for all thread types.
2694                  */
2695                 wa_write_clr_set(wal,
2696                                  GEN7_FF_THREAD_MODE,
2697                                  GEN7_FF_SCHED_MASK,
2698                                  GEN7_FF_TS_SCHED_HW |
2699                                  GEN7_FF_VS_SCHED_HW |
2700                                  GEN7_FF_DS_SCHED_HW);
2701
2702                 /* WaDisablePSDDualDispatchEnable:ivb */
2703                 if (IS_IVB_GT1(i915))
2704                         wa_masked_en(wal,
2705                                      GEN7_HALF_SLICE_CHICKEN1,
2706                                      GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE);
2707         }
2708
2709         if (GRAPHICS_VER(i915) == 7) {
2710                 /* WaBCSVCSTlbInvalidationMode:ivb,vlv,hsw */
2711                 wa_masked_en(wal,
2712                              RING_MODE_GEN7(RENDER_RING_BASE),
2713                              GFX_TLB_INVALIDATE_EXPLICIT | GFX_REPLAY_MODE);
2714
2715                 /* WaDisable_RenderCache_OperationalFlush:ivb,vlv,hsw */
2716                 wa_masked_dis(wal, CACHE_MODE_0_GEN7, RC_OP_FLUSH_ENABLE);
2717
2718                 /*
2719                  * BSpec says this must be set, even though
2720                  * WaDisable4x2SubspanOptimization:ivb,hsw
2721                  * isn't listed for VLV.
2722                  */
2723                 wa_masked_en(wal,
2724                              CACHE_MODE_1,
2725                              PIXEL_SUBSPAN_COLLECT_OPT_DISABLE);
2726
2727                 /*
2728                  * BSpec recommends 8x4 when MSAA is used,
2729                  * however in practice 16x4 seems fastest.
2730                  *
2731                  * Note that PS/WM thread counts depend on the WIZ hashing
2732                  * disable bit, which we don't touch here, but it's good
2733                  * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
2734                  */
2735                 wa_masked_field_set(wal,
2736                                     GEN7_GT_MODE,
2737                                     GEN6_WIZ_HASHING_MASK,
2738                                     GEN6_WIZ_HASHING_16x4);
2739         }
2740
2741         if (IS_GRAPHICS_VER(i915, 6, 7))
2742                 /*
2743                  * We need to disable the AsyncFlip performance optimisations in
2744                  * order to use MI_WAIT_FOR_EVENT within the CS. It should
2745                  * already be programmed to '1' on all products.
2746                  *
2747                  * WaDisableAsyncFlipPerfMode:snb,ivb,hsw,vlv
2748                  */
2749                 wa_masked_en(wal,
2750                              RING_MI_MODE(RENDER_RING_BASE),
2751                              ASYNC_FLIP_PERF_DISABLE);
2752
2753         if (GRAPHICS_VER(i915) == 6) {
2754                 /*
2755                  * Required for the hardware to program scanline values for
2756                  * waiting.
2757                  * WaEnableFlushTlbInvalidationMode:snb
2758                  */
2759                 wa_masked_en(wal,
2760                              GFX_MODE,
2761                              GFX_TLB_INVALIDATE_EXPLICIT);
2762
2763                 /* WaDisableHiZPlanesWhenMSAAEnabled:snb */
2764                 wa_masked_en(wal,
2765                              _3D_CHICKEN,
2766                              _3D_CHICKEN_HIZ_PLANE_DISABLE_MSAA_4X_SNB);
2767
2768                 wa_masked_en(wal,
2769                              _3D_CHICKEN3,
2770                              /* WaStripsFansDisableFastClipPerformanceFix:snb */
2771                              _3D_CHICKEN3_SF_DISABLE_FASTCLIP_CULL |
2772                              /*
2773                               * Bspec says:
2774                               * "This bit must be set if 3DSTATE_CLIP clip mode is set
2775                               * to normal and 3DSTATE_SF number of SF output attributes
2776                               * is more than 16."
2777                               */
2778                              _3D_CHICKEN3_SF_DISABLE_PIPELINED_ATTR_FETCH);
2779
2780                 /*
2781                  * BSpec recommends 8x4 when MSAA is used,
2782                  * however in practice 16x4 seems fastest.
2783                  *
2784                  * Note that PS/WM thread counts depend on the WIZ hashing
2785                  * disable bit, which we don't touch here, but it's good
2786                  * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
2787                  */
2788                 wa_masked_field_set(wal,
2789                                     GEN6_GT_MODE,
2790                                     GEN6_WIZ_HASHING_MASK,
2791                                     GEN6_WIZ_HASHING_16x4);
2792
2793                 /* WaDisable_RenderCache_OperationalFlush:snb */
2794                 wa_masked_dis(wal, CACHE_MODE_0, RC_OP_FLUSH_ENABLE);
2795
2796                 /*
2797                  * From the Sandybridge PRM, volume 1 part 3, page 24:
2798                  * "If this bit is set, STCunit will have LRA as replacement
2799                  *  policy. [...] This bit must be reset. LRA replacement
2800                  *  policy is not supported."
2801                  */
2802                 wa_masked_dis(wal,
2803                               CACHE_MODE_0,
2804                               CM0_STC_EVICT_DISABLE_LRA_SNB);
2805         }
2806
2807         if (IS_GRAPHICS_VER(i915, 4, 6))
2808                 /* WaTimedSingleVertexDispatch:cl,bw,ctg,elk,ilk,snb */
2809                 wa_add(wal, RING_MI_MODE(RENDER_RING_BASE),
2810                        0, _MASKED_BIT_ENABLE(VS_TIMER_DISPATCH),
2811                        /* XXX bit doesn't stick on Broadwater */
2812                        IS_I965G(i915) ? 0 : VS_TIMER_DISPATCH, true);
2813
2814         if (GRAPHICS_VER(i915) == 4)
2815                 /*
2816                  * Disable CONSTANT_BUFFER before it is loaded from the context
2817                  * image. As soon as it is loaded, it is executed and the stored
2818                  * address may no longer be valid, leading to a GPU hang.
2819                  *
2820                  * This imposes the requirement that userspace reload its
2821                  * CONSTANT_BUFFER on every batch; fortunately, that is a
2822                  * requirement userspace was already accustomed to from before
2823                  * contexts were enabled.
2824                  */
2825                 wa_add(wal, ECOSKPD(RENDER_RING_BASE),
2826                        0, _MASKED_BIT_ENABLE(ECO_CONSTANT_BUFFER_SR_DISABLE),
2827                        0 /* XXX bit doesn't stick on Broadwater */,
2828                        true);
2829 }
2830
2831 static void
2832 xcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2833 {
2834         struct drm_i915_private *i915 = engine->i915;
2835
2836         /* WaKBLVECSSemaphoreWaitPoll:kbl */
2837         if (IS_KBL_GRAPHICS_STEP(i915, STEP_A0, STEP_F0)) {
2838                 wa_write(wal,
2839                          RING_SEMA_WAIT_POLL(engine->mmio_base),
2840                          1);
2841         }
2842 }
2843
2844 static void
2845 ccs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2846 {
2847         if (IS_PVC_CT_STEP(engine->i915, STEP_A0, STEP_C0)) {
2848                 /* Wa_14014999345:pvc */
2849                 wa_mcr_masked_en(wal, GEN10_CACHE_MODE_SS, DISABLE_ECC);
2850         }
2851 }
2852
2853 /*
2854  * The bspec performance guide has recommended MMIO tuning settings.  These
2855  * aren't truly "workarounds" but we want to program them with the same
2856  * workaround infrastructure to ensure that they're automatically added to
2857  * the GuC save/restore lists, re-applied at the right times, and checked for
2858  * any conflicting programming requested by real workarounds.
2859  *
2860  * Programming settings should be added here only if their registers are not
2861  * part of an engine's register state context.  If a register is part of a
2862  * context, then any tuning settings should be programmed in an appropriate
2863  * function invoked by __intel_engine_init_ctx_wa().
2864  */
2865 static void
2866 add_render_compute_tuning_settings(struct drm_i915_private *i915,
2867                                    struct i915_wa_list *wal)
2868 {
2869         if (IS_PONTEVECCHIO(i915)) {
2870                 wa_write(wal, XEHPC_L3SCRUB,
2871                          SCRUB_CL_DWNGRADE_SHARED | SCRUB_RATE_4B_PER_CLK);
2872         }
2873
2874         if (IS_DG2(i915)) {
2875                 wa_mcr_write_or(wal, XEHP_L3SCQREG7, BLEND_FILL_CACHING_OPT_DIS);
2876                 wa_mcr_write_clr_set(wal, RT_CTRL, STACKID_CTRL, STACKID_CTRL_512);
2877
2878                 /*
2879                  * This is also listed as Wa_22012654132 for certain DG2
2880                  * steppings, but the tuning setting programming is a superset
2881                  * since it applies to all DG2 variants and steppings.
2882                  *
2883                  * Note that register 0xE420 is write-only and cannot be read
2884                  * back for verification on DG2 (due to Wa_14012342262), so
2885                  * we need to explicitly skip the readback.
2886                  */
2887                 wa_mcr_add(wal, GEN10_CACHE_MODE_SS, 0,
2888                            _MASKED_BIT_ENABLE(ENABLE_PREFETCH_INTO_IC),
2889                            0 /* write-only, so skip validation */,
2890                            true);
2891         }
2892
2893         /*
2894          * This tuning setting proves beneficial only on ATS-M designs; the
2895          * default "age based" setting is optimal on regular DG2 and other
2896          * platforms.
2897          */
2898         if (INTEL_INFO(i915)->tuning_thread_rr_after_dep)
2899                 wa_mcr_masked_field_set(wal, GEN9_ROW_CHICKEN4, THREAD_EX_ARB_MODE,
2900                                         THREAD_EX_ARB_MODE_RR_AFTER_DEP);
2901 }
2902
2903 /*
2904  * The workarounds in this function apply to shared registers in
2905  * the general render reset domain that aren't tied to a
2906  * specific engine.  Since all render+compute engines get reset
2907  * together, and the contents of these registers are lost during
2908  * the shared render domain reset, we'll define such workarounds
2909  * here and then add them to just a single RCS or CCS engine's
2910  * workaround list (whichever engine has I915_ENGINE_FIRST_RENDER_COMPUTE).
2911  */
2912 static void
2913 general_render_compute_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2914 {
2915         struct drm_i915_private *i915 = engine->i915;
2916
2917         add_render_compute_tuning_settings(i915, wal);
2918
2919         if (IS_PONTEVECCHIO(i915)) {
2920                 /* Wa_16016694945 */
2921                 wa_masked_en(wal, XEHPC_LNCFMISCCFGREG0, XEHPC_OVRLSCCC);
2922         }
2923
2924         if (IS_XEHPSDV(i915)) {
2925                 /* Wa_1409954639 */
2926                 wa_mcr_masked_en(wal,
2927                                  GEN8_ROW_CHICKEN,
2928                                  SYSTOLIC_DOP_CLOCK_GATING_DIS);
2929
2930                 /* Wa_1607196519 */
2931                 wa_mcr_masked_en(wal,
2932                                  GEN9_ROW_CHICKEN4,
2933                                  GEN12_DISABLE_GRF_CLEAR);
2934
2935                 /* Wa_14010670810:xehpsdv */
2936                 wa_mcr_write_or(wal, XEHP_L3NODEARBCFG, XEHP_LNESPARE);
2937
2938                 /* Wa_14010449647:xehpsdv */
2939                 wa_mcr_masked_en(wal, GEN8_HALF_SLICE_CHICKEN1,
2940                                  GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE);
2941
2942                 /* Wa_18011725039:xehpsdv */
2943                 if (IS_XEHPSDV_GRAPHICS_STEP(i915, STEP_A1, STEP_B0)) {
2944                         wa_mcr_masked_dis(wal, MLTICTXCTL, TDONRENDER);
2945                         wa_mcr_write_or(wal, L3SQCREG1_CCS0, FLUSHALLNONCOH);
2946                 }
2947
2948                 /* Wa_14012362059:xehpsdv */
2949                 wa_mcr_write_or(wal, XEHP_MERT_MOD_CTRL, FORCE_MISS_FTLB);
2950
2951                 /* Wa_14014368820:xehpsdv */
2952                 wa_write_or(wal, GEN12_GAMCNTRL_CTRL, INVALIDATION_BROADCAST_MODE_DIS |
2953                                 GLOBAL_INVALIDATION_MODE);
2954         }
2955
2956         if (IS_DG2(i915) || IS_PONTEVECCHIO(i915)) {
2957                 /* Wa_14015227452:dg2,pvc */
2958                 wa_mcr_masked_en(wal, GEN9_ROW_CHICKEN4, XEHP_DIS_BBL_SYSPIPE);
2959
2960                 /* Wa_22014226127:dg2,pvc */
2961                 wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0, DISABLE_D8_D16_COASLESCE);
2962
2963                 /* Wa_16015675438:dg2,pvc */
2964                 wa_masked_en(wal, FF_SLICE_CS_CHICKEN2, GEN12_PERF_FIX_BALANCING_CFE_DISABLE);
2965
2966                 /* Wa_18018781329:dg2,pvc */
2967                 wa_mcr_write_or(wal, RENDER_MOD_CTRL, FORCE_MISS_FTLB);
2968                 wa_mcr_write_or(wal, COMP_MOD_CTRL, FORCE_MISS_FTLB);
2969                 wa_mcr_write_or(wal, VDBX_MOD_CTRL, FORCE_MISS_FTLB);
2970                 wa_mcr_write_or(wal, VEBX_MOD_CTRL, FORCE_MISS_FTLB);
2971         }
2972
2973         if (IS_DG2(i915)) {
2974                 /*
2975                  * Wa_16011620976:dg2_g11
2976                  * Wa_22015475538:dg2
2977                  */
2978                 wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0_UDW, DIS_CHAIN_2XSIMD8);
2979
2980                 /* Wa_18017747507:dg2 */
2981                 wa_masked_en(wal, VFG_PREEMPTION_CHICKEN, POLYGON_TRIFAN_LINELOOP_DISABLE);
2982         }
2983 }
2984
2985 static void
2986 engine_init_workarounds(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2987 {
2988         if (I915_SELFTEST_ONLY(GRAPHICS_VER(engine->i915) < 4))
2989                 return;
2990
2991         engine_fake_wa_init(engine, wal);
2992
2993         /*
2994          * These are common workarounds that just need to be applied
2995          * to a single RCS/CCS engine's workaround list since
2996          * they're reset as part of the general render domain reset.
2997          */
2998         if (engine->flags & I915_ENGINE_FIRST_RENDER_COMPUTE)
2999                 general_render_compute_wa_init(engine, wal);
3000
3001         if (engine->class == COMPUTE_CLASS)
3002                 ccs_engine_wa_init(engine, wal);
3003         else if (engine->class == RENDER_CLASS)
3004                 rcs_engine_wa_init(engine, wal);
3005         else
3006                 xcs_engine_wa_init(engine, wal);
3007 }
3008
3009 void intel_engine_init_workarounds(struct intel_engine_cs *engine)
3010 {
3011         struct i915_wa_list *wal = &engine->wa_list;
3012
3013         if (GRAPHICS_VER(engine->i915) < 4)
3014                 return;
3015
3016         wa_init_start(wal, "engine", engine->name);
3017         engine_init_workarounds(engine, wal);
3018         wa_init_finish(wal);
3019 }
3020
3021 void intel_engine_apply_workarounds(struct intel_engine_cs *engine)
3022 {
3023         wa_list_apply(engine->gt, &engine->wa_list);
3024 }
3025
3026 static const struct i915_range mcr_ranges_gen8[] = {
3027         { .start = 0x5500, .end = 0x55ff },
3028         { .start = 0x7000, .end = 0x7fff },
3029         { .start = 0x9400, .end = 0x97ff },
3030         { .start = 0xb000, .end = 0xb3ff },
3031         { .start = 0xe000, .end = 0xe7ff },
3032         {},
3033 };
3034
3035 static const struct i915_range mcr_ranges_gen12[] = {
3036         { .start =  0x8150, .end =  0x815f },
3037         { .start =  0x9520, .end =  0x955f },
3038         { .start =  0xb100, .end =  0xb3ff },
3039         { .start =  0xde80, .end =  0xe8ff },
3040         { .start = 0x24a00, .end = 0x24a7f },
3041         {},
3042 };
3043
3044 static const struct i915_range mcr_ranges_xehp[] = {
3045         { .start =  0x4000, .end =  0x4aff },
3046         { .start =  0x5200, .end =  0x52ff },
3047         { .start =  0x5400, .end =  0x7fff },
3048         { .start =  0x8140, .end =  0x815f },
3049         { .start =  0x8c80, .end =  0x8dff },
3050         { .start =  0x94d0, .end =  0x955f },
3051         { .start =  0x9680, .end =  0x96ff },
3052         { .start =  0xb000, .end =  0xb3ff },
3053         { .start =  0xc800, .end =  0xcfff },
3054         { .start =  0xd800, .end =  0xd8ff },
3055         { .start =  0xdc00, .end =  0xffff },
3056         { .start = 0x17000, .end = 0x17fff },
3057         { .start = 0x24a00, .end = 0x24a7f },
3058         {},
3059 };
3060
3061 static bool mcr_range(struct drm_i915_private *i915, u32 offset)
3062 {
3063         const struct i915_range *mcr_ranges;
3064         int i;
3065
3066         if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50))
3067                 mcr_ranges = mcr_ranges_xehp;
3068         else if (GRAPHICS_VER(i915) >= 12)
3069                 mcr_ranges = mcr_ranges_gen12;
3070         else if (GRAPHICS_VER(i915) >= 8)
3071                 mcr_ranges = mcr_ranges_gen8;
3072         else
3073                 return false;
3074
3075         /*
3076          * Registers in these ranges are affected by the MCR selector
3077          * which only controls CPU initiated MMIO. Routing does not
3078          * work for CS access so we cannot verify them on this path.
3079          */
3080         for (i = 0; mcr_ranges[i].start; i++)
3081                 if (offset >= mcr_ranges[i].start &&
3082                     offset <= mcr_ranges[i].end)
3083                         return true;
3084
3085         return false;
3086 }
3087
3088 static int
3089 wa_list_srm(struct i915_request *rq,
3090             const struct i915_wa_list *wal,
3091             struct i915_vma *vma)
3092 {
3093         struct drm_i915_private *i915 = rq->engine->i915;
3094         unsigned int i, count = 0;
3095         const struct i915_wa *wa;
3096         u32 srm, *cs;
3097
3098         srm = MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT;
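        /*
         * On gen8+, MI_STORE_REGISTER_MEM takes a 64-bit GGTT address, so
         * the command is one dword longer; bump its length field to match.
         */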
3099         if (GRAPHICS_VER(i915) >= 8)
3100                 srm++;
3101
3102         for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
3103                 if (!mcr_range(i915, i915_mmio_reg_offset(wa->reg)))
3104                         count++;
3105         }
3106
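        /*
         * Each SRM is emitted as 4 dwords below, so reserve ring space only
         * for the registers we can actually sample (outside MCR ranges).
         */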
3107         cs = intel_ring_begin(rq, 4 * count);
3108         if (IS_ERR(cs))
3109                 return PTR_ERR(cs);
3110
3111         for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
3112                 u32 offset = i915_mmio_reg_offset(wa->reg);
3113
3114                 if (mcr_range(i915, offset))
3115                         continue;
3116
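                /*
                 * Store each result in the slot matching its index in the
                 * wa list; slots skipped here are never read back either.
                 * The trailing zero dword is the upper address half on
                 * gen8+ and a harmless MI_NOOP on older platforms.
                 */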
3117                 *cs++ = srm;
3118                 *cs++ = offset;
3119                 *cs++ = i915_ggtt_offset(vma) + sizeof(u32) * i;
3120                 *cs++ = 0;
3121         }
3122         intel_ring_advance(rq, cs);
3123
3124         return 0;
3125 }
3126
3127 static int engine_wa_list_verify(struct intel_context *ce,
3128                                  const struct i915_wa_list * const wal,
3129                                  const char *from)
3130 {
3131         const struct i915_wa *wa;
3132         struct i915_request *rq;
3133         struct i915_vma *vma;
3134         struct i915_gem_ww_ctx ww;
3135         unsigned int i;
3136         u32 *results;
3137         int err;
3138
3139         if (!wal->count)
3140                 return 0;
3141
3142         vma = __vm_create_scratch_for_read(&ce->engine->gt->ggtt->vm,
3143                                            wal->count * sizeof(u32));
3144         if (IS_ERR(vma))
3145                 return PTR_ERR(vma);
3146
3147         intel_engine_pm_get(ce->engine);
3148         i915_gem_ww_ctx_init(&ww, false);
3149 retry:
3150         err = i915_gem_object_lock(vma->obj, &ww);
3151         if (err == 0)
3152                 err = intel_context_pin_ww(ce, &ww);
3153         if (err)
3154                 goto err_pm;
3155
3156         err = i915_vma_pin_ww(vma, &ww, 0, 0,
3157                            i915_vma_is_ggtt(vma) ? PIN_GLOBAL : PIN_USER);
3158         if (err)
3159                 goto err_unpin;
3160
3161         rq = i915_request_create(ce);
3162         if (IS_ERR(rq)) {
3163                 err = PTR_ERR(rq);
3164                 goto err_vma;
3165         }
3166
3167         err = i915_request_await_object(rq, vma->obj, true);
3168         if (err == 0)
3169                 err = i915_vma_move_to_active(vma, rq, EXEC_OBJECT_WRITE);
3170         if (err == 0)
3171                 err = wa_list_srm(rq, wal, vma);
3172
3173         i915_request_get(rq);
3174         if (err)
3175                 i915_request_set_error_once(rq, err);
3176         i915_request_add(rq);
3177
3178         if (err)
3179                 goto err_rq;
3180
3181         if (i915_request_wait(rq, 0, HZ / 5) < 0) {
3182                 err = -ETIME;
3183                 goto err_rq;
3184         }
3185
3186         results = i915_gem_object_pin_map(vma->obj, I915_MAP_WB);
3187         if (IS_ERR(results)) {
3188                 err = PTR_ERR(results);
3189                 goto err_rq;
3190         }
3191
3192         err = 0;
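        /*
         * Skip the same MCR-range registers that wa_list_srm() could not
         * sample via the command streamer.
         */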
3193         for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
3194                 if (mcr_range(rq->engine->i915, i915_mmio_reg_offset(wa->reg)))
3195                         continue;
3196
3197                 if (!wa_verify(wa, results[i], wal->name, from))
3198                         err = -ENXIO;
3199         }
3200
3201         i915_gem_object_unpin_map(vma->obj);
3202
3203 err_rq:
3204         i915_request_put(rq);
3205 err_vma:
3206         i915_vma_unpin(vma);
3207 err_unpin:
3208         intel_context_unpin(ce);
3209 err_pm:
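        /*
         * On ww-mutex contention (-EDEADLK), back off and retry the whole
         * lock/pin sequence from the top.
         */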
3210         if (err == -EDEADLK) {
3211                 err = i915_gem_ww_ctx_backoff(&ww);
3212                 if (!err)
3213                         goto retry;
3214         }
3215         i915_gem_ww_ctx_fini(&ww);
3216         intel_engine_pm_put(ce->engine);
3217         i915_vma_put(vma);
3218         return err;
3219 }
3220
3221 int intel_engine_verify_workarounds(struct intel_engine_cs *engine,
3222                                     const char *from)
3223 {
3224         return engine_wa_list_verify(engine->kernel_context,
3225                                      &engine->wa_list,
3226                                      from);
3227 }
3228
3229 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
3230 #include "selftest_workarounds.c"
3231 #endif