Merge tag 'powerpc-6.6-6' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc...
[platform/kernel/linux-starfive.git] / drivers / gpu / drm / i915 / gt / intel_lrc.c
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2014 Intel Corporation
4  */
5
6 #include "gem/i915_gem_lmem.h"
7
8 #include "gen8_engine_cs.h"
9 #include "i915_drv.h"
10 #include "i915_perf.h"
11 #include "i915_reg.h"
12 #include "intel_context.h"
13 #include "intel_engine.h"
14 #include "intel_engine_regs.h"
15 #include "intel_gpu_commands.h"
16 #include "intel_gt.h"
17 #include "intel_gt_regs.h"
18 #include "intel_lrc.h"
19 #include "intel_lrc_reg.h"
20 #include "intel_ring.h"
21 #include "shmem_utils.h"
22
23 /*
24  * The per-platform tables are u8-encoded in @data. Decode @data and set the
25  * addresses' offset and commands in @regs. The following encoding is used
26  * for each byte. There are 2 steps: decoding commands and decoding addresses.
27  *
28  * Commands:
29  * [7]: create NOPs - number of NOPs are set in lower bits
30  * [6]: When creating MI_LOAD_REGISTER_IMM command, allow to set
31  *      MI_LRI_FORCE_POSTED
32  * [5:0]: Number of NOPs or registers to set values to in case of
33  *        MI_LOAD_REGISTER_IMM
34  *
35  * Addresses: these are decoded after a MI_LOAD_REGISTER_IMM command by "count"
36  * number of registers. They are set by using the REG/REG16 macros: the former
37  * is used for offsets smaller than 0x200 while the latter is for values bigger
38  * than that. Those macros already set all the bits documented below correctly:
39  *
40  * [7]: When a register offset needs more than 6 bits, use additional bytes, to
41  *      follow, for the lower bits
42  * [6:0]: Register offset, without considering the engine base.
43  *
44  * This function only tweaks the commands and register offsets. Values are not
45  * filled out.
46  */
47 static void set_offsets(u32 *regs,
48                         const u8 *data,
49                         const struct intel_engine_cs *engine,
50                         bool close)
51 #define NOP(x) (BIT(7) | (x))
52 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
53 #define POSTED BIT(0)
54 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
55 #define REG16(x) \
56         (((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
57         (((x) >> 2) & 0x7f)
58 #define END 0
59 {
60         const u32 base = engine->mmio_base;
61
62         while (*data) {
63                 u8 count, flags;
64
65                 if (*data & BIT(7)) { /* skip */
66                         count = *data++ & ~BIT(7);
67                         regs += count;
68                         continue;
69                 }
70
71                 count = *data & 0x3f;
72                 flags = *data >> 6;
73                 data++;
74
75                 *regs = MI_LOAD_REGISTER_IMM(count);
76                 if (flags & POSTED)
77                         *regs |= MI_LRI_FORCE_POSTED;
78                 if (GRAPHICS_VER(engine->i915) >= 11)
79                         *regs |= MI_LRI_LRM_CS_MMIO;
80                 regs++;
81
82                 GEM_BUG_ON(!count);
83                 do {
84                         u32 offset = 0;
85                         u8 v;
86
87                         do {
88                                 v = *data++;
89                                 offset <<= 7;
90                                 offset |= v & ~BIT(7);
91                         } while (v & BIT(7));
92
93                         regs[0] = base + (offset << 2);
94                         regs += 2;
95                 } while (--count);
96         }
97
98         if (close) {
99                 /* Close the batch; used mainly by live_lrc_layout() */
100                 *regs = MI_BATCH_BUFFER_END;
101                 if (GRAPHICS_VER(engine->i915) >= 11)
102                         *regs |= BIT(0);
103         }
104 }
105
106 static const u8 gen8_xcs_offsets[] = {
107         NOP(1),
108         LRI(11, 0),
109         REG16(0x244),
110         REG(0x034),
111         REG(0x030),
112         REG(0x038),
113         REG(0x03c),
114         REG(0x168),
115         REG(0x140),
116         REG(0x110),
117         REG(0x11c),
118         REG(0x114),
119         REG(0x118),
120
121         NOP(9),
122         LRI(9, 0),
123         REG16(0x3a8),
124         REG16(0x28c),
125         REG16(0x288),
126         REG16(0x284),
127         REG16(0x280),
128         REG16(0x27c),
129         REG16(0x278),
130         REG16(0x274),
131         REG16(0x270),
132
133         NOP(13),
134         LRI(2, 0),
135         REG16(0x200),
136         REG(0x028),
137
138         END
139 };
140
141 static const u8 gen9_xcs_offsets[] = {
142         NOP(1),
143         LRI(14, POSTED),
144         REG16(0x244),
145         REG(0x034),
146         REG(0x030),
147         REG(0x038),
148         REG(0x03c),
149         REG(0x168),
150         REG(0x140),
151         REG(0x110),
152         REG(0x11c),
153         REG(0x114),
154         REG(0x118),
155         REG(0x1c0),
156         REG(0x1c4),
157         REG(0x1c8),
158
159         NOP(3),
160         LRI(9, POSTED),
161         REG16(0x3a8),
162         REG16(0x28c),
163         REG16(0x288),
164         REG16(0x284),
165         REG16(0x280),
166         REG16(0x27c),
167         REG16(0x278),
168         REG16(0x274),
169         REG16(0x270),
170
171         NOP(13),
172         LRI(1, POSTED),
173         REG16(0x200),
174
175         NOP(13),
176         LRI(44, POSTED),
177         REG(0x028),
178         REG(0x09c),
179         REG(0x0c0),
180         REG(0x178),
181         REG(0x17c),
182         REG16(0x358),
183         REG(0x170),
184         REG(0x150),
185         REG(0x154),
186         REG(0x158),
187         REG16(0x41c),
188         REG16(0x600),
189         REG16(0x604),
190         REG16(0x608),
191         REG16(0x60c),
192         REG16(0x610),
193         REG16(0x614),
194         REG16(0x618),
195         REG16(0x61c),
196         REG16(0x620),
197         REG16(0x624),
198         REG16(0x628),
199         REG16(0x62c),
200         REG16(0x630),
201         REG16(0x634),
202         REG16(0x638),
203         REG16(0x63c),
204         REG16(0x640),
205         REG16(0x644),
206         REG16(0x648),
207         REG16(0x64c),
208         REG16(0x650),
209         REG16(0x654),
210         REG16(0x658),
211         REG16(0x65c),
212         REG16(0x660),
213         REG16(0x664),
214         REG16(0x668),
215         REG16(0x66c),
216         REG16(0x670),
217         REG16(0x674),
218         REG16(0x678),
219         REG16(0x67c),
220         REG(0x068),
221
222         END
223 };
224
225 static const u8 gen12_xcs_offsets[] = {
226         NOP(1),
227         LRI(13, POSTED),
228         REG16(0x244),
229         REG(0x034),
230         REG(0x030),
231         REG(0x038),
232         REG(0x03c),
233         REG(0x168),
234         REG(0x140),
235         REG(0x110),
236         REG(0x1c0),
237         REG(0x1c4),
238         REG(0x1c8),
239         REG(0x180),
240         REG16(0x2b4),
241
242         NOP(5),
243         LRI(9, POSTED),
244         REG16(0x3a8),
245         REG16(0x28c),
246         REG16(0x288),
247         REG16(0x284),
248         REG16(0x280),
249         REG16(0x27c),
250         REG16(0x278),
251         REG16(0x274),
252         REG16(0x270),
253
254         END
255 };
256
257 static const u8 dg2_xcs_offsets[] = {
258         NOP(1),
259         LRI(15, POSTED),
260         REG16(0x244),
261         REG(0x034),
262         REG(0x030),
263         REG(0x038),
264         REG(0x03c),
265         REG(0x168),
266         REG(0x140),
267         REG(0x110),
268         REG(0x1c0),
269         REG(0x1c4),
270         REG(0x1c8),
271         REG(0x180),
272         REG16(0x2b4),
273         REG(0x120),
274         REG(0x124),
275
276         NOP(1),
277         LRI(9, POSTED),
278         REG16(0x3a8),
279         REG16(0x28c),
280         REG16(0x288),
281         REG16(0x284),
282         REG16(0x280),
283         REG16(0x27c),
284         REG16(0x278),
285         REG16(0x274),
286         REG16(0x270),
287
288         END
289 };
290
291 static const u8 gen8_rcs_offsets[] = {
292         NOP(1),
293         LRI(14, POSTED),
294         REG16(0x244),
295         REG(0x034),
296         REG(0x030),
297         REG(0x038),
298         REG(0x03c),
299         REG(0x168),
300         REG(0x140),
301         REG(0x110),
302         REG(0x11c),
303         REG(0x114),
304         REG(0x118),
305         REG(0x1c0),
306         REG(0x1c4),
307         REG(0x1c8),
308
309         NOP(3),
310         LRI(9, POSTED),
311         REG16(0x3a8),
312         REG16(0x28c),
313         REG16(0x288),
314         REG16(0x284),
315         REG16(0x280),
316         REG16(0x27c),
317         REG16(0x278),
318         REG16(0x274),
319         REG16(0x270),
320
321         NOP(13),
322         LRI(1, 0),
323         REG(0x0c8),
324
325         END
326 };
327
328 static const u8 gen9_rcs_offsets[] = {
329         NOP(1),
330         LRI(14, POSTED),
331         REG16(0x244),
332         REG(0x34),
333         REG(0x30),
334         REG(0x38),
335         REG(0x3c),
336         REG(0x168),
337         REG(0x140),
338         REG(0x110),
339         REG(0x11c),
340         REG(0x114),
341         REG(0x118),
342         REG(0x1c0),
343         REG(0x1c4),
344         REG(0x1c8),
345
346         NOP(3),
347         LRI(9, POSTED),
348         REG16(0x3a8),
349         REG16(0x28c),
350         REG16(0x288),
351         REG16(0x284),
352         REG16(0x280),
353         REG16(0x27c),
354         REG16(0x278),
355         REG16(0x274),
356         REG16(0x270),
357
358         NOP(13),
359         LRI(1, 0),
360         REG(0xc8),
361
362         NOP(13),
363         LRI(44, POSTED),
364         REG(0x28),
365         REG(0x9c),
366         REG(0xc0),
367         REG(0x178),
368         REG(0x17c),
369         REG16(0x358),
370         REG(0x170),
371         REG(0x150),
372         REG(0x154),
373         REG(0x158),
374         REG16(0x41c),
375         REG16(0x600),
376         REG16(0x604),
377         REG16(0x608),
378         REG16(0x60c),
379         REG16(0x610),
380         REG16(0x614),
381         REG16(0x618),
382         REG16(0x61c),
383         REG16(0x620),
384         REG16(0x624),
385         REG16(0x628),
386         REG16(0x62c),
387         REG16(0x630),
388         REG16(0x634),
389         REG16(0x638),
390         REG16(0x63c),
391         REG16(0x640),
392         REG16(0x644),
393         REG16(0x648),
394         REG16(0x64c),
395         REG16(0x650),
396         REG16(0x654),
397         REG16(0x658),
398         REG16(0x65c),
399         REG16(0x660),
400         REG16(0x664),
401         REG16(0x668),
402         REG16(0x66c),
403         REG16(0x670),
404         REG16(0x674),
405         REG16(0x678),
406         REG16(0x67c),
407         REG(0x68),
408
409         END
410 };
411
412 static const u8 gen11_rcs_offsets[] = {
413         NOP(1),
414         LRI(15, POSTED),
415         REG16(0x244),
416         REG(0x034),
417         REG(0x030),
418         REG(0x038),
419         REG(0x03c),
420         REG(0x168),
421         REG(0x140),
422         REG(0x110),
423         REG(0x11c),
424         REG(0x114),
425         REG(0x118),
426         REG(0x1c0),
427         REG(0x1c4),
428         REG(0x1c8),
429         REG(0x180),
430
431         NOP(1),
432         LRI(9, POSTED),
433         REG16(0x3a8),
434         REG16(0x28c),
435         REG16(0x288),
436         REG16(0x284),
437         REG16(0x280),
438         REG16(0x27c),
439         REG16(0x278),
440         REG16(0x274),
441         REG16(0x270),
442
443         LRI(1, POSTED),
444         REG(0x1b0),
445
446         NOP(10),
447         LRI(1, 0),
448         REG(0x0c8),
449
450         END
451 };
452
453 static const u8 gen12_rcs_offsets[] = {
454         NOP(1),
455         LRI(13, POSTED),
456         REG16(0x244),
457         REG(0x034),
458         REG(0x030),
459         REG(0x038),
460         REG(0x03c),
461         REG(0x168),
462         REG(0x140),
463         REG(0x110),
464         REG(0x1c0),
465         REG(0x1c4),
466         REG(0x1c8),
467         REG(0x180),
468         REG16(0x2b4),
469
470         NOP(5),
471         LRI(9, POSTED),
472         REG16(0x3a8),
473         REG16(0x28c),
474         REG16(0x288),
475         REG16(0x284),
476         REG16(0x280),
477         REG16(0x27c),
478         REG16(0x278),
479         REG16(0x274),
480         REG16(0x270),
481
482         LRI(3, POSTED),
483         REG(0x1b0),
484         REG16(0x5a8),
485         REG16(0x5ac),
486
487         NOP(6),
488         LRI(1, 0),
489         REG(0x0c8),
490         NOP(3 + 9 + 1),
491
492         LRI(51, POSTED),
493         REG16(0x588),
494         REG16(0x588),
495         REG16(0x588),
496         REG16(0x588),
497         REG16(0x588),
498         REG16(0x588),
499         REG(0x028),
500         REG(0x09c),
501         REG(0x0c0),
502         REG(0x178),
503         REG(0x17c),
504         REG16(0x358),
505         REG(0x170),
506         REG(0x150),
507         REG(0x154),
508         REG(0x158),
509         REG16(0x41c),
510         REG16(0x600),
511         REG16(0x604),
512         REG16(0x608),
513         REG16(0x60c),
514         REG16(0x610),
515         REG16(0x614),
516         REG16(0x618),
517         REG16(0x61c),
518         REG16(0x620),
519         REG16(0x624),
520         REG16(0x628),
521         REG16(0x62c),
522         REG16(0x630),
523         REG16(0x634),
524         REG16(0x638),
525         REG16(0x63c),
526         REG16(0x640),
527         REG16(0x644),
528         REG16(0x648),
529         REG16(0x64c),
530         REG16(0x650),
531         REG16(0x654),
532         REG16(0x658),
533         REG16(0x65c),
534         REG16(0x660),
535         REG16(0x664),
536         REG16(0x668),
537         REG16(0x66c),
538         REG16(0x670),
539         REG16(0x674),
540         REG16(0x678),
541         REG16(0x67c),
542         REG(0x068),
543         REG(0x084),
544         NOP(1),
545
546         END
547 };
548
549 static const u8 xehp_rcs_offsets[] = {
550         NOP(1),
551         LRI(13, POSTED),
552         REG16(0x244),
553         REG(0x034),
554         REG(0x030),
555         REG(0x038),
556         REG(0x03c),
557         REG(0x168),
558         REG(0x140),
559         REG(0x110),
560         REG(0x1c0),
561         REG(0x1c4),
562         REG(0x1c8),
563         REG(0x180),
564         REG16(0x2b4),
565
566         NOP(5),
567         LRI(9, POSTED),
568         REG16(0x3a8),
569         REG16(0x28c),
570         REG16(0x288),
571         REG16(0x284),
572         REG16(0x280),
573         REG16(0x27c),
574         REG16(0x278),
575         REG16(0x274),
576         REG16(0x270),
577
578         LRI(3, POSTED),
579         REG(0x1b0),
580         REG16(0x5a8),
581         REG16(0x5ac),
582
583         NOP(6),
584         LRI(1, 0),
585         REG(0x0c8),
586
587         END
588 };
589
590 static const u8 dg2_rcs_offsets[] = {
591         NOP(1),
592         LRI(15, POSTED),
593         REG16(0x244),
594         REG(0x034),
595         REG(0x030),
596         REG(0x038),
597         REG(0x03c),
598         REG(0x168),
599         REG(0x140),
600         REG(0x110),
601         REG(0x1c0),
602         REG(0x1c4),
603         REG(0x1c8),
604         REG(0x180),
605         REG16(0x2b4),
606         REG(0x120),
607         REG(0x124),
608
609         NOP(1),
610         LRI(9, POSTED),
611         REG16(0x3a8),
612         REG16(0x28c),
613         REG16(0x288),
614         REG16(0x284),
615         REG16(0x280),
616         REG16(0x27c),
617         REG16(0x278),
618         REG16(0x274),
619         REG16(0x270),
620
621         LRI(3, POSTED),
622         REG(0x1b0),
623         REG16(0x5a8),
624         REG16(0x5ac),
625
626         NOP(6),
627         LRI(1, 0),
628         REG(0x0c8),
629
630         END
631 };
632
633 static const u8 mtl_rcs_offsets[] = {
634         NOP(1),
635         LRI(15, POSTED),
636         REG16(0x244),
637         REG(0x034),
638         REG(0x030),
639         REG(0x038),
640         REG(0x03c),
641         REG(0x168),
642         REG(0x140),
643         REG(0x110),
644         REG(0x1c0),
645         REG(0x1c4),
646         REG(0x1c8),
647         REG(0x180),
648         REG16(0x2b4),
649         REG(0x120),
650         REG(0x124),
651
652         NOP(1),
653         LRI(9, POSTED),
654         REG16(0x3a8),
655         REG16(0x28c),
656         REG16(0x288),
657         REG16(0x284),
658         REG16(0x280),
659         REG16(0x27c),
660         REG16(0x278),
661         REG16(0x274),
662         REG16(0x270),
663
664         NOP(2),
665         LRI(2, POSTED),
666         REG16(0x5a8),
667         REG16(0x5ac),
668
669         NOP(6),
670         LRI(1, 0),
671         REG(0x0c8),
672
673         END
674 };
675
676 #undef END
677 #undef REG16
678 #undef REG
679 #undef LRI
680 #undef NOP
681
682 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
683 {
684         /*
685          * The gen12+ lists only have the registers we program in the basic
686          * default state. We rely on the context image using relative
687          * addressing to automatic fixup the register state between the
688          * physical engines for virtual engine.
689          */
690         GEM_BUG_ON(GRAPHICS_VER(engine->i915) >= 12 &&
691                    !intel_engine_has_relative_mmio(engine));
692
693         if (engine->flags & I915_ENGINE_HAS_RCS_REG_STATE) {
694                 if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 70))
695                         return mtl_rcs_offsets;
696                 else if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
697                         return dg2_rcs_offsets;
698                 else if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
699                         return xehp_rcs_offsets;
700                 else if (GRAPHICS_VER(engine->i915) >= 12)
701                         return gen12_rcs_offsets;
702                 else if (GRAPHICS_VER(engine->i915) >= 11)
703                         return gen11_rcs_offsets;
704                 else if (GRAPHICS_VER(engine->i915) >= 9)
705                         return gen9_rcs_offsets;
706                 else
707                         return gen8_rcs_offsets;
708         } else {
709                 if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
710                         return dg2_xcs_offsets;
711                 else if (GRAPHICS_VER(engine->i915) >= 12)
712                         return gen12_xcs_offsets;
713                 else if (GRAPHICS_VER(engine->i915) >= 9)
714                         return gen9_xcs_offsets;
715                 else
716                         return gen8_xcs_offsets;
717         }
718 }
719
720 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
721 {
722         if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
723                 return 0x70;
724         else if (GRAPHICS_VER(engine->i915) >= 12)
725                 return 0x60;
726         else if (GRAPHICS_VER(engine->i915) >= 9)
727                 return 0x54;
728         else if (engine->class == RENDER_CLASS)
729                 return 0x58;
730         else
731                 return -1;
732 }
733
734 static int lrc_ring_bb_offset(const struct intel_engine_cs *engine)
735 {
736         if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
737                 return 0x80;
738         else if (GRAPHICS_VER(engine->i915) >= 12)
739                 return 0x70;
740         else if (GRAPHICS_VER(engine->i915) >= 9)
741                 return 0x64;
742         else if (GRAPHICS_VER(engine->i915) >= 8 &&
743                  engine->class == RENDER_CLASS)
744                 return 0xc4;
745         else
746                 return -1;
747 }
748
749 static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
750 {
751         if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
752                 return 0x84;
753         else if (GRAPHICS_VER(engine->i915) >= 12)
754                 return 0x74;
755         else if (GRAPHICS_VER(engine->i915) >= 9)
756                 return 0x68;
757         else if (engine->class == RENDER_CLASS)
758                 return 0xd8;
759         else
760                 return -1;
761 }
762
763 static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
764 {
765         if (GRAPHICS_VER(engine->i915) >= 12)
766                 return 0x12;
767         else if (GRAPHICS_VER(engine->i915) >= 9 || engine->class == RENDER_CLASS)
768                 return 0x18;
769         else
770                 return -1;
771 }
772
/*
 * Index of the indirect-context pointer entry: two dwords past the
 * per-context workaround batch entry. A negative result from
 * lrc_ring_wa_bb_per_ctx() is passed through unchanged.
 */
static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
{
        const int wa_bb = lrc_ring_wa_bb_per_ctx(engine);

        return wa_bb < 0 ? wa_bb : wa_bb + 2;
}
783
/*
 * Index of the indirect-context offset entry: two dwords past the
 * indirect-context pointer entry. A negative result from
 * lrc_ring_indirect_ptr() is passed through unchanged.
 */
static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
{
        const int ptr = lrc_ring_indirect_ptr(engine);

        return ptr < 0 ? ptr : ptr + 2;
}
794
795 static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
796 {
797
798         if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
799                 /*
800                  * Note that the CSFE context has a dummy slot for CMD_BUF_CCTL
801                  * simply to match the RCS context image layout.
802                  */
803                 return 0xc6;
804         else if (engine->class != RENDER_CLASS)
805                 return -1;
806         else if (GRAPHICS_VER(engine->i915) >= 12)
807                 return 0xb6;
808         else if (GRAPHICS_VER(engine->i915) >= 11)
809                 return 0xaa;
810         else
811                 return -1;
812 }
813
814 static u32
815 lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
816 {
817         if (GRAPHICS_VER(engine->i915) >= 12)
818                 return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
819         else if (GRAPHICS_VER(engine->i915) >= 11)
820                 return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
821         else if (GRAPHICS_VER(engine->i915) >= 9)
822                 return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
823         else if (GRAPHICS_VER(engine->i915) >= 8)
824                 return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
825
826         GEM_BUG_ON(GRAPHICS_VER(engine->i915) < 8);
827
828         return 0;
829 }
830
831 static void
832 lrc_setup_indirect_ctx(u32 *regs,
833                        const struct intel_engine_cs *engine,
834                        u32 ctx_bb_ggtt_addr,
835                        u32 size)
836 {
837         GEM_BUG_ON(!size);
838         GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
839         GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
840         regs[lrc_ring_indirect_ptr(engine) + 1] =
841                 ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);
842
843         GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
844         regs[lrc_ring_indirect_offset(engine) + 1] =
845                 lrc_ring_indirect_offset_default(engine) << 6;
846 }
847
848 static void init_common_regs(u32 * const regs,
849                              const struct intel_context *ce,
850                              const struct intel_engine_cs *engine,
851                              bool inhibit)
852 {
853         u32 ctl;
854         int loc;
855
856         ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
857         ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
858         if (inhibit)
859                 ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
860         if (GRAPHICS_VER(engine->i915) < 11)
861                 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
862                                            CTX_CTRL_RS_CTX_ENABLE);
863         regs[CTX_CONTEXT_CONTROL] = ctl;
864
865         regs[CTX_TIMESTAMP] = ce->stats.runtime.last;
866
867         loc = lrc_ring_bb_offset(engine);
868         if (loc != -1)
869                 regs[loc + 1] = 0;
870 }
871
872 static void init_wa_bb_regs(u32 * const regs,
873                             const struct intel_engine_cs *engine)
874 {
875         const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
876
877         if (wa_ctx->per_ctx.size) {
878                 const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
879
880                 GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
881                 regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
882                         (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
883         }
884
885         if (wa_ctx->indirect_ctx.size) {
886                 lrc_setup_indirect_ctx(regs, engine,
887                                        i915_ggtt_offset(wa_ctx->vma) +
888                                        wa_ctx->indirect_ctx.offset,
889                                        wa_ctx->indirect_ctx.size);
890         }
891 }
892
893 static void init_ppgtt_regs(u32 *regs, const struct i915_ppgtt *ppgtt)
894 {
895         if (i915_vm_is_4lvl(&ppgtt->vm)) {
896                 /* 64b PPGTT (48bit canonical)
897                  * PDP0_DESCRIPTOR contains the base address to PML4 and
898                  * other PDP Descriptors are ignored.
899                  */
900                 ASSIGN_CTX_PML4(ppgtt, regs);
901         } else {
902                 ASSIGN_CTX_PDP(ppgtt, regs, 3);
903                 ASSIGN_CTX_PDP(ppgtt, regs, 2);
904                 ASSIGN_CTX_PDP(ppgtt, regs, 1);
905                 ASSIGN_CTX_PDP(ppgtt, regs, 0);
906         }
907 }
908
909 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
910 {
911         if (i915_is_ggtt(vm))
912                 return i915_vm_to_ggtt(vm)->alias;
913         else
914                 return i915_vm_to_ppgtt(vm);
915 }
916
917 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
918 {
919         int x;
920
921         x = lrc_ring_mi_mode(engine);
922         if (x != -1) {
923                 regs[x + 1] &= ~STOP_RING;
924                 regs[x + 1] |= STOP_RING << 16;
925         }
926 }
927
928 static void __lrc_init_regs(u32 *regs,
929                             const struct intel_context *ce,
930                             const struct intel_engine_cs *engine,
931                             bool inhibit)
932 {
933         /*
934          * A context is actually a big batch buffer with several
935          * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
936          * values we are setting here are only for the first context restore:
937          * on a subsequent save, the GPU will recreate this batchbuffer with new
938          * values (including all the missing MI_LOAD_REGISTER_IMM commands that
939          * we are not initializing here).
940          *
941          * Must keep consistent with virtual_update_register_offsets().
942          */
943
944         if (inhibit)
945                 memset(regs, 0, PAGE_SIZE);
946
947         set_offsets(regs, reg_offsets(engine), engine, inhibit);
948
949         init_common_regs(regs, ce, engine, inhibit);
950         init_ppgtt_regs(regs, vm_alias(ce->vm));
951
952         init_wa_bb_regs(regs, engine);
953
954         __reset_stop_ring(regs, engine);
955 }
956
957 void lrc_init_regs(const struct intel_context *ce,
958                    const struct intel_engine_cs *engine,
959                    bool inhibit)
960 {
961         __lrc_init_regs(ce->lrc_reg_state, ce, engine, inhibit);
962 }
963
964 void lrc_reset_regs(const struct intel_context *ce,
965                     const struct intel_engine_cs *engine)
966 {
967         __reset_stop_ring(ce->lrc_reg_state, engine);
968 }
969
970 static void
971 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
972 {
973         if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
974                 return;
975
976         vaddr += engine->context_size;
977
978         memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
979 }
980
981 static void
982 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
983 {
984         if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
985                 return;
986
987         vaddr += engine->context_size;
988
989         if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
990                 drm_err_once(&engine->i915->drm,
991                              "%s context redzone overwritten!\n",
992                              engine->name);
993 }
994
995 static u32 context_wa_bb_offset(const struct intel_context *ce)
996 {
997         return PAGE_SIZE * ce->wa_bb_page;
998 }
999
1000 static u32 *context_indirect_bb(const struct intel_context *ce)
1001 {
1002         void *ptr;
1003
1004         GEM_BUG_ON(!ce->wa_bb_page);
1005
1006         ptr = ce->lrc_reg_state;
1007         ptr -= LRC_STATE_OFFSET; /* back to start of context image */
1008         ptr += context_wa_bb_offset(ce);
1009
1010         return ptr;
1011 }
1012
1013 void lrc_init_state(struct intel_context *ce,
1014                     struct intel_engine_cs *engine,
1015                     void *state)
1016 {
1017         bool inhibit = true;
1018
1019         set_redzone(state, engine);
1020
1021         if (engine->default_state) {
1022                 shmem_read(engine->default_state, 0,
1023                            state, engine->context_size);
1024                 __set_bit(CONTEXT_VALID_BIT, &ce->flags);
1025                 inhibit = false;
1026         }
1027
1028         /* Clear the ppHWSP (inc. per-context counters) */
1029         memset(state, 0, PAGE_SIZE);
1030
1031         /* Clear the indirect wa and storage */
1032         if (ce->wa_bb_page)
1033                 memset(state + context_wa_bb_offset(ce), 0, PAGE_SIZE);
1034
1035         /*
1036          * The second page of the context object contains some registers which
1037          * must be set up prior to the first execution.
1038          */
1039         __lrc_init_regs(state + LRC_STATE_OFFSET, ce, engine, inhibit);
1040 }
1041
1042 u32 lrc_indirect_bb(const struct intel_context *ce)
1043 {
1044         return i915_ggtt_offset(ce->state) + context_wa_bb_offset(ce);
1045 }
1046
1047 static u32 *setup_predicate_disable_wa(const struct intel_context *ce, u32 *cs)
1048 {
1049         /* If predication is active, this will be noop'ed */
1050         *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT | (4 - 2);
1051         *cs++ = lrc_indirect_bb(ce) + DG2_PREDICATE_RESULT_WA;
1052         *cs++ = 0;
1053         *cs++ = 0; /* No predication */
1054
1055         /* predicated end, only terminates if SET_PREDICATE_RESULT:0 is clear */
1056         *cs++ = MI_BATCH_BUFFER_END | BIT(15);
1057         *cs++ = MI_SET_PREDICATE | MI_SET_PREDICATE_DISABLE;
1058
1059         /* Instructions are no longer predicated (disabled), we can proceed */
1060         *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT | (4 - 2);
1061         *cs++ = lrc_indirect_bb(ce) + DG2_PREDICATE_RESULT_WA;
1062         *cs++ = 0;
1063         *cs++ = 1; /* enable predication before the next BB */
1064
1065         *cs++ = MI_BATCH_BUFFER_END;
1066         GEM_BUG_ON(offset_in_page(cs) > DG2_PREDICATE_RESULT_WA);
1067
1068         return cs;
1069 }
1070
1071 static struct i915_vma *
1072 __lrc_alloc_state(struct intel_context *ce, struct intel_engine_cs *engine)
1073 {
1074         struct drm_i915_gem_object *obj;
1075         struct i915_vma *vma;
1076         u32 context_size;
1077
1078         context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
1079
1080         if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1081                 context_size += I915_GTT_PAGE_SIZE; /* for redzone */
1082
1083         if (GRAPHICS_VER(engine->i915) >= 12) {
1084                 ce->wa_bb_page = context_size / PAGE_SIZE;
1085                 context_size += PAGE_SIZE;
1086         }
1087
1088         if (intel_context_is_parent(ce) && intel_engine_uses_guc(engine)) {
1089                 ce->parallel.guc.parent_page = context_size / PAGE_SIZE;
1090                 context_size += PARENT_SCRATCH_SIZE;
1091         }
1092
1093         obj = i915_gem_object_create_lmem(engine->i915, context_size,
1094                                           I915_BO_ALLOC_PM_VOLATILE);
1095         if (IS_ERR(obj)) {
1096                 obj = i915_gem_object_create_shmem(engine->i915, context_size);
1097                 if (IS_ERR(obj))
1098                         return ERR_CAST(obj);
1099
1100                 /*
1101                  * Wa_22016122933: For Media version 13.0, all Media GT shared
1102                  * memory needs to be mapped as WC on CPU side and UC (PAT
1103                  * index 2) on GPU side.
1104                  */
1105                 if (intel_gt_needs_wa_22016122933(engine->gt))
1106                         i915_gem_object_set_cache_coherency(obj, I915_CACHE_NONE);
1107         }
1108
1109         vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
1110         if (IS_ERR(vma)) {
1111                 i915_gem_object_put(obj);
1112                 return vma;
1113         }
1114
1115         return vma;
1116 }
1117
1118 static struct intel_timeline *
1119 pinned_timeline(struct intel_context *ce, struct intel_engine_cs *engine)
1120 {
1121         struct intel_timeline *tl = fetch_and_zero(&ce->timeline);
1122
1123         return intel_timeline_create_from_engine(engine, page_unmask_bits(tl));
1124 }
1125
1126 int lrc_alloc(struct intel_context *ce, struct intel_engine_cs *engine)
1127 {
1128         struct intel_ring *ring;
1129         struct i915_vma *vma;
1130         int err;
1131
1132         GEM_BUG_ON(ce->state);
1133
1134         vma = __lrc_alloc_state(ce, engine);
1135         if (IS_ERR(vma))
1136                 return PTR_ERR(vma);
1137
1138         ring = intel_engine_create_ring(engine, ce->ring_size);
1139         if (IS_ERR(ring)) {
1140                 err = PTR_ERR(ring);
1141                 goto err_vma;
1142         }
1143
1144         if (!page_mask_bits(ce->timeline)) {
1145                 struct intel_timeline *tl;
1146
1147                 /*
1148                  * Use the static global HWSP for the kernel context, and
1149                  * a dynamically allocated cacheline for everyone else.
1150                  */
1151                 if (unlikely(ce->timeline))
1152                         tl = pinned_timeline(ce, engine);
1153                 else
1154                         tl = intel_timeline_create(engine->gt);
1155                 if (IS_ERR(tl)) {
1156                         err = PTR_ERR(tl);
1157                         goto err_ring;
1158                 }
1159
1160                 ce->timeline = tl;
1161         }
1162
1163         ce->ring = ring;
1164         ce->state = vma;
1165
1166         return 0;
1167
1168 err_ring:
1169         intel_ring_put(ring);
1170 err_vma:
1171         i915_vma_put(vma);
1172         return err;
1173 }
1174
1175 void lrc_reset(struct intel_context *ce)
1176 {
1177         GEM_BUG_ON(!intel_context_is_pinned(ce));
1178
1179         intel_ring_reset(ce->ring, ce->ring->emit);
1180
1181         /* Scrub away the garbage */
1182         lrc_init_regs(ce, ce->engine, true);
1183         ce->lrc.lrca = lrc_update_regs(ce, ce->engine, ce->ring->tail);
1184 }
1185
1186 int
1187 lrc_pre_pin(struct intel_context *ce,
1188             struct intel_engine_cs *engine,
1189             struct i915_gem_ww_ctx *ww,
1190             void **vaddr)
1191 {
1192         GEM_BUG_ON(!ce->state);
1193         GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
1194
1195         *vaddr = i915_gem_object_pin_map(ce->state->obj,
1196                                          intel_gt_coherent_map_type(ce->engine->gt,
1197                                                                     ce->state->obj,
1198                                                                     false) |
1199                                          I915_MAP_OVERRIDE);
1200
1201         return PTR_ERR_OR_ZERO(*vaddr);
1202 }
1203
1204 int
1205 lrc_pin(struct intel_context *ce,
1206         struct intel_engine_cs *engine,
1207         void *vaddr)
1208 {
1209         ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;
1210
1211         if (!__test_and_set_bit(CONTEXT_INIT_BIT, &ce->flags))
1212                 lrc_init_state(ce, engine, vaddr);
1213
1214         ce->lrc.lrca = lrc_update_regs(ce, engine, ce->ring->tail);
1215         return 0;
1216 }
1217
1218 void lrc_unpin(struct intel_context *ce)
1219 {
1220         if (unlikely(ce->parallel.last_rq)) {
1221                 i915_request_put(ce->parallel.last_rq);
1222                 ce->parallel.last_rq = NULL;
1223         }
1224         check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
1225                       ce->engine);
1226 }
1227
1228 void lrc_post_unpin(struct intel_context *ce)
1229 {
1230         i915_gem_object_unpin_map(ce->state->obj);
1231 }
1232
1233 void lrc_fini(struct intel_context *ce)
1234 {
1235         if (!ce->state)
1236                 return;
1237
1238         intel_ring_put(fetch_and_zero(&ce->ring));
1239         i915_vma_put(fetch_and_zero(&ce->state));
1240 }
1241
1242 void lrc_destroy(struct kref *kref)
1243 {
1244         struct intel_context *ce = container_of(kref, typeof(*ce), ref);
1245
1246         GEM_BUG_ON(!i915_active_is_idle(&ce->active));
1247         GEM_BUG_ON(intel_context_is_pinned(ce));
1248
1249         lrc_fini(ce);
1250
1251         intel_context_fini(ce);
1252         intel_context_free(ce);
1253 }
1254
1255 static u32 *
1256 gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
1257 {
1258         *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1259                 MI_SRM_LRM_GLOBAL_GTT |
1260                 MI_LRI_LRM_CS_MMIO;
1261         *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1262         *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1263                 CTX_TIMESTAMP * sizeof(u32);
1264         *cs++ = 0;
1265
1266         *cs++ = MI_LOAD_REGISTER_REG |
1267                 MI_LRR_SOURCE_CS_MMIO |
1268                 MI_LRI_LRM_CS_MMIO;
1269         *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1270         *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
1271
1272         *cs++ = MI_LOAD_REGISTER_REG |
1273                 MI_LRR_SOURCE_CS_MMIO |
1274                 MI_LRI_LRM_CS_MMIO;
1275         *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1276         *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
1277
1278         return cs;
1279 }
1280
1281 static u32 *
1282 gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
1283 {
1284         GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);
1285
1286         *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1287                 MI_SRM_LRM_GLOBAL_GTT |
1288                 MI_LRI_LRM_CS_MMIO;
1289         *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1290         *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1291                 (lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
1292         *cs++ = 0;
1293
1294         return cs;
1295 }
1296
1297 static u32 *
1298 gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
1299 {
1300         GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);
1301
1302         *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1303                 MI_SRM_LRM_GLOBAL_GTT |
1304                 MI_LRI_LRM_CS_MMIO;
1305         *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1306         *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1307                 (lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
1308         *cs++ = 0;
1309
1310         *cs++ = MI_LOAD_REGISTER_REG |
1311                 MI_LRR_SOURCE_CS_MMIO |
1312                 MI_LRI_LRM_CS_MMIO;
1313         *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1314         *cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));
1315
1316         return cs;
1317 }
1318
1319 /*
1320  * On DG2 during context restore of a preempted context in GPGPU mode,
1321  * RCS restore hang is detected. This is extremely timing dependent.
1322  * To address this below sw wabb is implemented for DG2 A steppings.
1323  */
1324 static u32 *
1325 dg2_emit_rcs_hang_wabb(const struct intel_context *ce, u32 *cs)
1326 {
1327         *cs++ = MI_LOAD_REGISTER_IMM(1);
1328         *cs++ = i915_mmio_reg_offset(GEN12_STATE_ACK_DEBUG(ce->engine->mmio_base));
1329         *cs++ = 0x21;
1330
1331         *cs++ = MI_LOAD_REGISTER_REG;
1332         *cs++ = i915_mmio_reg_offset(RING_NOPID(ce->engine->mmio_base));
1333         *cs++ = i915_mmio_reg_offset(XEHP_CULLBIT1);
1334
1335         *cs++ = MI_LOAD_REGISTER_REG;
1336         *cs++ = i915_mmio_reg_offset(RING_NOPID(ce->engine->mmio_base));
1337         *cs++ = i915_mmio_reg_offset(XEHP_CULLBIT2);
1338
1339         return cs;
1340 }
1341
1342 /*
1343  * The bspec's tuning guide asks us to program a vertical watermark value of
1344  * 0x3FF.  However this register is not saved/restored properly by the
1345  * hardware, so we're required to apply the desired value via INDIRECT_CTX
1346  * batch buffer to ensure the value takes effect properly.  All other bits
1347  * in this register should remain at 0 (the hardware default).
1348  */
1349 static u32 *
1350 dg2_emit_draw_watermark_setting(u32 *cs)
1351 {
1352         *cs++ = MI_LOAD_REGISTER_IMM(1);
1353         *cs++ = i915_mmio_reg_offset(DRAW_WATERMARK);
1354         *cs++ = REG_FIELD_PREP(VERT_WM_VAL, 0x3FF);
1355
1356         return cs;
1357 }
1358
1359 static u32 *
1360 gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
1361 {
1362         cs = gen12_emit_timestamp_wa(ce, cs);
1363         cs = gen12_emit_cmd_buf_wa(ce, cs);
1364         cs = gen12_emit_restore_scratch(ce, cs);
1365
1366         /* Wa_22011450934:dg2 */
1367         if (IS_DG2_GRAPHICS_STEP(ce->engine->i915, G10, STEP_A0, STEP_B0) ||
1368             IS_DG2_GRAPHICS_STEP(ce->engine->i915, G11, STEP_A0, STEP_B0))
1369                 cs = dg2_emit_rcs_hang_wabb(ce, cs);
1370
1371         /* Wa_16013000631:dg2 */
1372         if (IS_DG2_GRAPHICS_STEP(ce->engine->i915, G10, STEP_B0, STEP_C0) ||
1373             IS_DG2_G11(ce->engine->i915))
1374                 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE, 0);
1375
1376         cs = gen12_emit_aux_table_inv(ce->engine, cs);
1377
1378         /* Wa_16014892111 */
1379         if (IS_MTL_GRAPHICS_STEP(ce->engine->i915, M, STEP_A0, STEP_B0) ||
1380             IS_MTL_GRAPHICS_STEP(ce->engine->i915, P, STEP_A0, STEP_B0) ||
1381             IS_DG2(ce->engine->i915))
1382                 cs = dg2_emit_draw_watermark_setting(cs);
1383
1384         return cs;
1385 }
1386
1387 static u32 *
1388 gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
1389 {
1390         cs = gen12_emit_timestamp_wa(ce, cs);
1391         cs = gen12_emit_restore_scratch(ce, cs);
1392
1393         /* Wa_16013000631:dg2 */
1394         if (IS_DG2_GRAPHICS_STEP(ce->engine->i915, G10, STEP_B0, STEP_C0) ||
1395             IS_DG2_G11(ce->engine->i915))
1396                 if (ce->engine->class == COMPUTE_CLASS)
1397                         cs = gen8_emit_pipe_control(cs,
1398                                                     PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE,
1399                                                     0);
1400
1401         return gen12_emit_aux_table_inv(ce->engine, cs);
1402 }
1403
1404 static void
1405 setup_indirect_ctx_bb(const struct intel_context *ce,
1406                       const struct intel_engine_cs *engine,
1407                       u32 *(*emit)(const struct intel_context *, u32 *))
1408 {
1409         u32 * const start = context_indirect_bb(ce);
1410         u32 *cs;
1411
1412         cs = emit(ce, start);
1413         GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
1414         while ((unsigned long)cs % CACHELINE_BYTES)
1415                 *cs++ = MI_NOOP;
1416
1417         GEM_BUG_ON(cs - start > DG2_PREDICATE_RESULT_BB / sizeof(*start));
1418         setup_predicate_disable_wa(ce, start + DG2_PREDICATE_RESULT_BB / sizeof(*start));
1419
1420         lrc_setup_indirect_ctx(ce->lrc_reg_state, engine,
1421                                lrc_indirect_bb(ce),
1422                                (cs - start) * sizeof(*cs));
1423 }
1424
1425 /*
1426  * The context descriptor encodes various attributes of a context,
1427  * including its GTT address and some flags. Because it's fairly
1428  * expensive to calculate, we'll just do it once and cache the result,
1429  * which remains valid until the context is unpinned.
1430  *
1431  * This is what a descriptor looks like, from LSB to MSB::
1432  *
1433  *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
1434  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
1435  *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
1436  *      bits 53-54:    mbz, reserved for use by hardware
1437  *      bits 55-63:    group ID, currently unused and set to 0
1438  *
1439  * Starting from Gen11, the upper dword of the descriptor has a new format:
1440  *
1441  *      bits 32-36:    reserved
1442  *      bits 37-47:    SW context ID
1443  *      bits 48:53:    engine instance
1444  *      bit 54:        mbz, reserved for use by hardware
1445  *      bits 55-60:    SW counter
1446  *      bits 61-63:    engine class
1447  *
1448  * On Xe_HP, the upper dword of the descriptor has a new format:
1449  *
1450  *      bits 32-37:    virtual function number
1451  *      bit 38:        mbz, reserved for use by hardware
1452  *      bits 39-54:    SW context ID
1453  *      bits 55-57:    reserved
1454  *      bits 58-63:    SW counter
1455  *
1456  * engine info, SW context ID and SW counter need to form a unique number
1457  * (Context ID) per lrc.
1458  */
1459 static u32 lrc_descriptor(const struct intel_context *ce)
1460 {
1461         u32 desc;
1462
1463         desc = INTEL_LEGACY_32B_CONTEXT;
1464         if (i915_vm_is_4lvl(ce->vm))
1465                 desc = INTEL_LEGACY_64B_CONTEXT;
1466         desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
1467
1468         desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
1469         if (GRAPHICS_VER(ce->vm->i915) == 8)
1470                 desc |= GEN8_CTX_L3LLC_COHERENT;
1471
1472         return i915_ggtt_offset(ce->state) | desc;
1473 }
1474
1475 u32 lrc_update_regs(const struct intel_context *ce,
1476                     const struct intel_engine_cs *engine,
1477                     u32 head)
1478 {
1479         struct intel_ring *ring = ce->ring;
1480         u32 *regs = ce->lrc_reg_state;
1481
1482         GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
1483         GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
1484
1485         regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1486         regs[CTX_RING_HEAD] = head;
1487         regs[CTX_RING_TAIL] = ring->tail;
1488         regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1489
1490         /* RPCS */
1491         if (engine->class == RENDER_CLASS) {
1492                 regs[CTX_R_PWR_CLK_STATE] =
1493                         intel_sseu_make_rpcs(engine->gt, &ce->sseu);
1494
1495                 i915_oa_init_reg_state(ce, engine);
1496         }
1497
1498         if (ce->wa_bb_page) {
1499                 u32 *(*fn)(const struct intel_context *ce, u32 *cs);
1500
1501                 fn = gen12_emit_indirect_ctx_xcs;
1502                 if (ce->engine->class == RENDER_CLASS)
1503                         fn = gen12_emit_indirect_ctx_rcs;
1504
1505                 /* Mutually exclusive wrt to global indirect bb */
1506                 GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
1507                 setup_indirect_ctx_bb(ce, engine, fn);
1508         }
1509
1510         return lrc_descriptor(ce) | CTX_DESC_FORCE_RESTORE;
1511 }
1512
1513 void lrc_update_offsets(struct intel_context *ce,
1514                         struct intel_engine_cs *engine)
1515 {
1516         set_offsets(ce->lrc_reg_state, reg_offsets(engine), engine, false);
1517 }
1518
1519 void lrc_check_regs(const struct intel_context *ce,
1520                     const struct intel_engine_cs *engine,
1521                     const char *when)
1522 {
1523         const struct intel_ring *ring = ce->ring;
1524         u32 *regs = ce->lrc_reg_state;
1525         bool valid = true;
1526         int x;
1527
1528         if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1529                 pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1530                        engine->name,
1531                        regs[CTX_RING_START],
1532                        i915_ggtt_offset(ring->vma));
1533                 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1534                 valid = false;
1535         }
1536
1537         if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1538             (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1539                 pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1540                        engine->name,
1541                        regs[CTX_RING_CTL],
1542                        (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1543                 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1544                 valid = false;
1545         }
1546
1547         x = lrc_ring_mi_mode(engine);
1548         if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1549                 pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1550                        engine->name, regs[x + 1]);
1551                 regs[x + 1] &= ~STOP_RING;
1552                 regs[x + 1] |= STOP_RING << 16;
1553                 valid = false;
1554         }
1555
1556         WARN_ONCE(!valid, "Invalid lrc state found %s submission\n", when);
1557 }
1558
1559 /*
1560  * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
1561  * PIPE_CONTROL instruction. This is required for the flush to happen correctly
1562  * but there is a slight complication as this is applied in WA batch where the
1563  * values are only initialized once so we cannot take register value at the
1564  * beginning and reuse it further; hence we save its value to memory, upload a
1565  * constant value with bit21 set and then we restore it back with the saved value.
1566  * To simplify the WA, a constant value is formed by using the default value
1567  * of this register. This shouldn't be a problem because we are only modifying
1568  * it for a short period and this batch in non-premptible. We can ofcourse
1569  * use additional instructions that read the actual value of the register
1570  * at that time and set our bit of interest but it makes the WA complicated.
1571  *
1572  * This WA is also required for Gen9 so extracting as a function avoids
1573  * code duplication.
1574  */
1575 static u32 *
1576 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
1577 {
1578         /* NB no one else is allowed to scribble over scratch + 256! */
1579         *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1580         *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1581         *batch++ = intel_gt_scratch_offset(engine->gt,
1582                                            INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
1583         *batch++ = 0;
1584
1585         *batch++ = MI_LOAD_REGISTER_IMM(1);
1586         *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1587         *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
1588
1589         batch = gen8_emit_pipe_control(batch,
1590                                        PIPE_CONTROL_CS_STALL |
1591                                        PIPE_CONTROL_DC_FLUSH_ENABLE,
1592                                        0);
1593
1594         *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1595         *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1596         *batch++ = intel_gt_scratch_offset(engine->gt,
1597                                            INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
1598         *batch++ = 0;
1599
1600         return batch;
1601 }
1602
1603 /*
1604  * Typically we only have one indirect_ctx and per_ctx batch buffer which are
1605  * initialized at the beginning and shared across all contexts but this field
1606  * helps us to have multiple batches at different offsets and select them based
1607  * on a criteria. At the moment this batch always start at the beginning of the page
1608  * and at this point we don't have multiple wa_ctx batch buffers.
1609  *
1610  * The number of WA applied are not known at the beginning; we use this field
1611  * to return the no of DWORDS written.
1612  *
1613  * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
1614  * so it adds NOOPs as padding to make it cacheline aligned.
1615  * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together
1616  * makes a complete batch buffer.
1617  */
1618 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1619 {
1620         /* WaDisableCtxRestoreArbitration:bdw,chv */
1621         *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1622
1623         /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
1624         if (IS_BROADWELL(engine->i915))
1625                 batch = gen8_emit_flush_coherentl3_wa(engine, batch);
1626
1627         /* WaClearSlmSpaceAtContextSwitch:bdw,chv */
1628         /* Actual scratch location is at 128 bytes offset */
1629         batch = gen8_emit_pipe_control(batch,
1630                                        PIPE_CONTROL_FLUSH_L3 |
1631                                        PIPE_CONTROL_STORE_DATA_INDEX |
1632                                        PIPE_CONTROL_CS_STALL |
1633                                        PIPE_CONTROL_QW_WRITE,
1634                                        LRC_PPHWSP_SCRATCH_ADDR);
1635
1636         *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1637
1638         /* Pad to end of cacheline */
1639         while ((unsigned long)batch % CACHELINE_BYTES)
1640                 *batch++ = MI_NOOP;
1641
1642         /*
1643          * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
1644          * execution depends on the length specified in terms of cache lines
1645          * in the register CTX_RCS_INDIRECT_CTX
1646          */
1647
1648         return batch;
1649 }
1650
1651 struct lri {
1652         i915_reg_t reg;
1653         u32 value;
1654 };
1655
1656 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
1657 {
1658         GEM_BUG_ON(!count || count > 63);
1659
1660         *batch++ = MI_LOAD_REGISTER_IMM(count);
1661         do {
1662                 *batch++ = i915_mmio_reg_offset(lri->reg);
1663                 *batch++ = lri->value;
1664         } while (lri++, --count);
1665         *batch++ = MI_NOOP;
1666
1667         return batch;
1668 }
1669
1670 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1671 {
1672         static const struct lri lri[] = {
1673                 /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
1674                 {
1675                         COMMON_SLICE_CHICKEN2,
1676                         __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
1677                                        0),
1678                 },
1679
1680                 /* BSpec: 11391 */
1681                 {
1682                         FF_SLICE_CHICKEN,
1683                         __MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
1684                                        FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
1685                 },
1686
1687                 /* BSpec: 11299 */
1688                 {
1689                         _3D_CHICKEN3,
1690                         __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
1691                                        _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
1692                 }
1693         };
1694
1695         *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1696
1697         /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
1698         batch = gen8_emit_flush_coherentl3_wa(engine, batch);
1699
1700         /* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
1701         batch = gen8_emit_pipe_control(batch,
1702                                        PIPE_CONTROL_FLUSH_L3 |
1703                                        PIPE_CONTROL_STORE_DATA_INDEX |
1704                                        PIPE_CONTROL_CS_STALL |
1705                                        PIPE_CONTROL_QW_WRITE,
1706                                        LRC_PPHWSP_SCRATCH_ADDR);
1707
1708         batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
1709
1710         /* WaMediaPoolStateCmdInWABB:bxt,glk */
1711         if (HAS_POOLED_EU(engine->i915)) {
1712                 /*
1713                  * EU pool configuration is setup along with golden context
1714                  * during context initialization. This value depends on
1715                  * device type (2x6 or 3x6) and needs to be updated based
1716                  * on which subslice is disabled especially for 2x6
1717                  * devices, however it is safe to load default
1718                  * configuration of 3x6 device instead of masking off
1719                  * corresponding bits because HW ignores bits of a disabled
1720                  * subslice and drops down to appropriate config. Please
1721                  * see render_state_setup() in i915_gem_render_state.c for
1722                  * possible configurations, to avoid duplication they are
1723                  * not shown here again.
1724                  */
1725                 *batch++ = GEN9_MEDIA_POOL_STATE;
1726                 *batch++ = GEN9_MEDIA_POOL_ENABLE;
1727                 *batch++ = 0x00777000;
1728                 *batch++ = 0;
1729                 *batch++ = 0;
1730                 *batch++ = 0;
1731         }
1732
1733         *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1734
1735         /* Pad to end of cacheline */
1736         while ((unsigned long)batch % CACHELINE_BYTES)
1737                 *batch++ = MI_NOOP;
1738
1739         return batch;
1740 }
1741
1742 #define CTX_WA_BB_SIZE (PAGE_SIZE)
1743
1744 static int lrc_create_wa_ctx(struct intel_engine_cs *engine)
1745 {
1746         struct drm_i915_gem_object *obj;
1747         struct i915_vma *vma;
1748         int err;
1749
1750         obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_SIZE);
1751         if (IS_ERR(obj))
1752                 return PTR_ERR(obj);
1753
1754         vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
1755         if (IS_ERR(vma)) {
1756                 err = PTR_ERR(vma);
1757                 goto err;
1758         }
1759
1760         engine->wa_ctx.vma = vma;
1761         return 0;
1762
1763 err:
1764         i915_gem_object_put(obj);
1765         return err;
1766 }
1767
1768 void lrc_fini_wa_ctx(struct intel_engine_cs *engine)
1769 {
1770         i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
1771 }
1772
1773 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
1774
1775 void lrc_init_wa_ctx(struct intel_engine_cs *engine)
1776 {
1777         struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
1778         struct i915_wa_ctx_bb *wa_bb[] = {
1779                 &wa_ctx->indirect_ctx, &wa_ctx->per_ctx
1780         };
1781         wa_bb_func_t wa_bb_fn[ARRAY_SIZE(wa_bb)];
1782         struct i915_gem_ww_ctx ww;
1783         void *batch, *batch_ptr;
1784         unsigned int i;
1785         int err;
1786
1787         if (GRAPHICS_VER(engine->i915) >= 11 ||
1788             !(engine->flags & I915_ENGINE_HAS_RCS_REG_STATE))
1789                 return;
1790
1791         if (GRAPHICS_VER(engine->i915) == 9) {
1792                 wa_bb_fn[0] = gen9_init_indirectctx_bb;
1793                 wa_bb_fn[1] = NULL;
1794         } else if (GRAPHICS_VER(engine->i915) == 8) {
1795                 wa_bb_fn[0] = gen8_init_indirectctx_bb;
1796                 wa_bb_fn[1] = NULL;
1797         }
1798
1799         err = lrc_create_wa_ctx(engine);
1800         if (err) {
1801                 /*
1802                  * We continue even if we fail to initialize WA batch
1803                  * because we only expect rare glitches but nothing
1804                  * critical to prevent us from using GPU
1805                  */
1806                 drm_err(&engine->i915->drm,
1807                         "Ignoring context switch w/a allocation error:%d\n",
1808                         err);
1809                 return;
1810         }
1811
1812         if (!engine->wa_ctx.vma)
1813                 return;
1814
1815         i915_gem_ww_ctx_init(&ww, true);
1816 retry:
1817         err = i915_gem_object_lock(wa_ctx->vma->obj, &ww);
1818         if (!err)
1819                 err = i915_ggtt_pin(wa_ctx->vma, &ww, 0, PIN_HIGH);
1820         if (err)
1821                 goto err;
1822
1823         batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
1824         if (IS_ERR(batch)) {
1825                 err = PTR_ERR(batch);
1826                 goto err_unpin;
1827         }
1828
1829         /*
1830          * Emit the two workaround batch buffers, recording the offset from the
1831          * start of the workaround batch buffer object for each and their
1832          * respective sizes.
1833          */
1834         batch_ptr = batch;
1835         for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
1836                 wa_bb[i]->offset = batch_ptr - batch;
1837                 if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
1838                                                   CACHELINE_BYTES))) {
1839                         err = -EINVAL;
1840                         break;
1841                 }
1842                 if (wa_bb_fn[i])
1843                         batch_ptr = wa_bb_fn[i](engine, batch_ptr);
1844                 wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
1845         }
1846         GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_SIZE);
1847
1848         __i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
1849         __i915_gem_object_release_map(wa_ctx->vma->obj);
1850
1851         /* Verify that we can handle failure to setup the wa_ctx */
1852         if (!err)
1853                 err = i915_inject_probe_error(engine->i915, -ENODEV);
1854
1855 err_unpin:
1856         if (err)
1857                 i915_vma_unpin(wa_ctx->vma);
1858 err:
1859         if (err == -EDEADLK) {
1860                 err = i915_gem_ww_ctx_backoff(&ww);
1861                 if (!err)
1862                         goto retry;
1863         }
1864         i915_gem_ww_ctx_fini(&ww);
1865
1866         if (err) {
1867                 i915_vma_put(engine->wa_ctx.vma);
1868
1869                 /* Clear all flags to prevent further use */
1870                 memset(wa_ctx, 0, sizeof(*wa_ctx));
1871         }
1872 }
1873
1874 static void st_runtime_underflow(struct intel_context_stats *stats, s32 dt)
1875 {
1876 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1877         stats->runtime.num_underflow++;
1878         stats->runtime.max_underflow =
1879                 max_t(u32, stats->runtime.max_underflow, -dt);
1880 #endif
1881 }
1882
1883 static u32 lrc_get_runtime(const struct intel_context *ce)
1884 {
1885         /*
1886          * We can use either ppHWSP[16] which is recorded before the context
1887          * switch (and so excludes the cost of context switches) or use the
1888          * value from the context image itself, which is saved/restored earlier
1889          * and so includes the cost of the save.
1890          */
1891         return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
1892 }
1893
1894 void lrc_update_runtime(struct intel_context *ce)
1895 {
1896         struct intel_context_stats *stats = &ce->stats;
1897         u32 old;
1898         s32 dt;
1899
1900         old = stats->runtime.last;
1901         stats->runtime.last = lrc_get_runtime(ce);
1902         dt = stats->runtime.last - old;
1903         if (!dt)
1904                 return;
1905
1906         if (unlikely(dt < 0)) {
1907                 CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
1908                          old, stats->runtime.last, dt);
1909                 st_runtime_underflow(stats, dt);
1910                 return;
1911         }
1912
1913         ewma_runtime_add(&stats->runtime.avg, dt);
1914         stats->runtime.total += dt;
1915 }
1916
1917 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1918 #include "selftest_lrc.c"
1919 #endif