// SPDX-License-Identifier: MIT
/*
 * Copyright © 2008-2015 Intel Corporation
 */

#include "i915_drv.h"
#include "i915_scatterlist.h"
#include "i915_pvinfo.h"
#include "i915_vgpu.h"

/**
 * DOC: fence register handling
 *
 * Important to avoid confusion: "fences" in the i915 driver are not execution
 * fences used to track command completion but hardware detiler objects which
 * wrap a given range of the global GTT. Each platform has only a fairly limited
 * set of these objects.
 *
 * Fences are used to detile GTT memory mappings. They're also connected to the
 * hardware frontbuffer render tracking and hence interact with frontbuffer
 * compression. Furthermore on older platforms fences are required for tiled
 * objects used by the display engine. They can also be used by the render
 * engine - they're required for blitter commands and are optional for render
 * commands. But on gen4+ both display (with the exception of fbc) and rendering
 * have their own tiling state bits and don't need fences.
 *
 * Also note that fences only support X and Y tiling and hence can't be used for
 * the fancier new tiling formats like W, Ys and Yf.
 *
 * Finally note that because fences are such a restricted resource they're
 * dynamically associated with objects. Furthermore fence state is committed to
 * the hardware lazily to avoid unnecessary stalls on gen2/3. Therefore code must
 * explicitly call i915_vma_pin_fence() to synchronize fencing status for cpu
 * access. Also note that some code wants an unfenced view, for those cases the
 * fence can be removed forcefully with i915_vma_revoke_fence().
 *
 * Internally these functions will synchronize with userspace access by removing
 * CPU ptes into GTT mmaps (not the GTT ptes themselves) as needed.
 */
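
/*
 * Usage sketch (illustrative only, error handling trimmed): a caller needing
 * detiled access through the mappable aperture pins the fence around its
 * access, assuming the vma is already pinned in the GGTT and a runtime pm
 * wakeref is held:
 *
 *	err = i915_vma_pin_fence(vma);
 *	if (err)
 *		return err;
 *
 *	if (vma->fence)
 *		... access the object through the fenced GTT range ...
 *
 *	i915_vma_unpin_fence(vma);
 *
 * Dropping the pin does not clear the register; it merely makes the fence
 * eligible for reuse by fence_find().
 */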

#define pipelined 0

static struct drm_i915_private *fence_to_i915(struct i915_fence_reg *fence)
{
	return fence->ggtt->vm.i915;
}

static struct intel_uncore *fence_to_uncore(struct i915_fence_reg *fence)
{
	return fence->ggtt->vm.gt->uncore;
}

static void i965_write_fence_reg(struct i915_fence_reg *fence)
{
	i915_reg_t fence_reg_lo, fence_reg_hi;
	int fence_pitch_shift;
	u64 val;

	if (GRAPHICS_VER(fence_to_i915(fence)) >= 6) {
		fence_reg_lo = FENCE_REG_GEN6_LO(fence->id);
		fence_reg_hi = FENCE_REG_GEN6_HI(fence->id);
		fence_pitch_shift = GEN6_FENCE_PITCH_SHIFT;

	} else {
		fence_reg_lo = FENCE_REG_965_LO(fence->id);
		fence_reg_hi = FENCE_REG_965_HI(fence->id);
		fence_pitch_shift = I965_FENCE_PITCH_SHIFT;
	}

	val = 0;
	if (fence->tiling) {
		unsigned int stride = fence->stride;

		GEM_BUG_ON(!IS_ALIGNED(stride, 128));

		val = fence->start + fence->size - I965_FENCE_PAGE;
		val <<= 32;
		val |= fence->start;
		val |= (u64)((stride / 128) - 1) << fence_pitch_shift;
		if (fence->tiling == I915_TILING_Y)
			val |= BIT(I965_FENCE_TILING_Y_SHIFT);
		val |= I965_FENCE_REG_VALID;
	}

	if (!pipelined) {
		struct intel_uncore *uncore = fence_to_uncore(fence);

		/*
		 * To w/a incoherency with non-atomic 64-bit register updates,
		 * we split the 64-bit update into two 32-bit writes. In order
		 * for a partial fence not to be evaluated between writes, we
		 * precede the update with a write to turn off the fence
		 * register, and only enable the fence as the last step.
		 *
		 * For extra levels of paranoia, we make sure each step lands
		 * before applying the next step.
		 */
		intel_uncore_write_fw(uncore, fence_reg_lo, 0);
		intel_uncore_posting_read_fw(uncore, fence_reg_lo);

		intel_uncore_write_fw(uncore, fence_reg_hi, upper_32_bits(val));
		intel_uncore_write_fw(uncore, fence_reg_lo, lower_32_bits(val));
		intel_uncore_posting_read_fw(uncore, fence_reg_lo);
	}
}
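
/*
 * Worked example (illustrative only, values assumed): for an X-tiled vma at
 * GGTT offset 0x00100000 with a fence_size of 0x00080000 and a 4096 byte
 * stride on a pre-gen6 part (fence_pitch_shift == I965_FENCE_PITCH_SHIFT == 2),
 * the value assembled above works out as
 *
 *	end page:	0x00100000 + 0x00080000 - I965_FENCE_PAGE == 0x0017f000
 *	pitch field:	(4096 / 128 - 1) << 2                     == 0x0000007c
 *	val:		0x0017f000_0010007d (valid bit set, no Y bit)
 *
 * and is committed by first clearing the LO dword, then writing HI, then LO,
 * so the hardware never samples a half-updated fence.
 */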

static void i915_write_fence_reg(struct i915_fence_reg *fence)
{
	u32 val;

	val = 0;
	if (fence->tiling) {
		unsigned int stride = fence->stride;
		unsigned int tiling = fence->tiling;
		bool is_y_tiled = tiling == I915_TILING_Y;

		if (is_y_tiled && HAS_128_BYTE_Y_TILING(fence_to_i915(fence)))
			stride /= 128;
		else
			stride /= 512;
		GEM_BUG_ON(!is_power_of_2(stride));

		val = fence->start;
		if (is_y_tiled)
			val |= BIT(I830_FENCE_TILING_Y_SHIFT);
		val |= I915_FENCE_SIZE_BITS(fence->size);
		val |= ilog2(stride) << I830_FENCE_PITCH_SHIFT;

		val |= I830_FENCE_REG_VALID;
	}

	if (!pipelined) {
		struct intel_uncore *uncore = fence_to_uncore(fence);
		i915_reg_t reg = FENCE_REG(fence->id);

		intel_uncore_write_fw(uncore, reg, val);
		intel_uncore_posting_read_fw(uncore, reg);
	}
}

static void i830_write_fence_reg(struct i915_fence_reg *fence)
{
	u32 val;

	val = 0;
	if (fence->tiling) {
		unsigned int stride = fence->stride;

		val = fence->start;
		if (fence->tiling == I915_TILING_Y)
			val |= BIT(I830_FENCE_TILING_Y_SHIFT);
		val |= I830_FENCE_SIZE_BITS(fence->size);
		val |= ilog2(stride / 128) << I830_FENCE_PITCH_SHIFT;
		val |= I830_FENCE_REG_VALID;
	}

	if (!pipelined) {
		struct intel_uncore *uncore = fence_to_uncore(fence);
		i915_reg_t reg = FENCE_REG(fence->id);

		intel_uncore_write_fw(uncore, reg, val);
		intel_uncore_posting_read_fw(uncore, reg);
	}
}

static void fence_write(struct i915_fence_reg *fence)
{
	struct drm_i915_private *i915 = fence_to_i915(fence);

	/*
	 * Previous access through the fence register is marshalled by
	 * the mb() inside the fault handlers (i915_gem_release_mmaps)
	 * and explicitly managed for internal users.
	 */

	if (GRAPHICS_VER(i915) == 2)
		i830_write_fence_reg(fence);
	else if (GRAPHICS_VER(i915) == 3)
		i915_write_fence_reg(fence);
	else
		i965_write_fence_reg(fence);

	/*
	 * Access through the fenced region afterwards is
	 * ordered by the posting reads whilst writing the registers.
	 */
}

static bool gpu_uses_fence_registers(struct i915_fence_reg *fence)
{
	return GRAPHICS_VER(fence_to_i915(fence)) < 4;
}

static int fence_update(struct i915_fence_reg *fence,
			struct i915_vma *vma)
{
	struct i915_ggtt *ggtt = fence->ggtt;
	struct intel_uncore *uncore = fence_to_uncore(fence);
	intel_wakeref_t wakeref;
	struct i915_vma *old;
	int ret;

	fence->tiling = 0;
	if (vma) {
		GEM_BUG_ON(!i915_gem_object_get_stride(vma->obj) ||
			   !i915_gem_object_get_tiling(vma->obj));

		if (!i915_vma_is_map_and_fenceable(vma))
			return -EINVAL;

		if (gpu_uses_fence_registers(fence)) {
			/* implicit 'unfenced' GPU blits */
			ret = i915_vma_sync(vma);
			if (ret)
				return ret;
		}

		fence->start = vma->node.start;
		fence->size = vma->fence_size;
		fence->stride = i915_gem_object_get_stride(vma->obj);
		fence->tiling = i915_gem_object_get_tiling(vma->obj);
	}
	WRITE_ONCE(fence->dirty, false);

	old = xchg(&fence->vma, NULL);
	if (old) {
		/* XXX Ideally we would move the waiting to outside the mutex */
		ret = i915_active_wait(&fence->active);
		if (ret) {
			fence->vma = old;
			return ret;
		}

		i915_vma_flush_writes(old);

		/*
		 * Ensure that all userspace CPU access is completed before
		 * stealing the fence.
		 */
		if (old != vma) {
			GEM_BUG_ON(old->fence != fence);
			i915_vma_revoke_mmap(old);
			old->fence = NULL;
		}

		list_move(&fence->link, &ggtt->fence_list);
	}

	/*
	 * We only need to update the register itself if the device is awake.
	 * If the device is currently powered down, we will defer the write
	 * to the runtime resume, see intel_ggtt_restore_fences().
	 *
	 * This only works for removing the fence register, on acquisition
	 * the caller must hold the rpm wakeref. The fence register must
	 * be cleared before we can use any other fences to ensure that
	 * the new fences do not overlap the elided clears, confusing HW.
	 */
	wakeref = intel_runtime_pm_get_if_in_use(uncore->rpm);
	if (!wakeref) {
		GEM_BUG_ON(vma);
		return 0;
	}

	WRITE_ONCE(fence->vma, vma);
	fence_write(fence);

	if (vma) {
		vma->fence = fence;
		list_move_tail(&fence->link, &ggtt->fence_list);
	}

	intel_runtime_pm_put(uncore->rpm, wakeref);
	return 0;
}

/**
 * i915_vma_revoke_fence - force-remove fence for a VMA
 * @vma: vma to map linearly (not through a fence reg)
 *
 * This function force-removes any fence from the given object, which is useful
 * if the kernel wants to do untiled GTT access.
 */
void i915_vma_revoke_fence(struct i915_vma *vma)
{
	struct i915_fence_reg *fence = vma->fence;
	intel_wakeref_t wakeref;

	lockdep_assert_held(&vma->vm->mutex);
	if (!fence)
		return;

	GEM_BUG_ON(fence->vma != vma);
	GEM_BUG_ON(!i915_active_is_idle(&fence->active));
	GEM_BUG_ON(atomic_read(&fence->pin_count));

	fence->tiling = 0;
	WRITE_ONCE(fence->vma, NULL);
	vma->fence = NULL;

	/*
	 * Skip the write to HW if and only if the device is currently
	 * suspended.
	 *
	 * If the driver does not currently hold a wakeref (if_in_use == 0),
	 * the device may currently be runtime suspended, or it may be woken
	 * up before the suspend takes place. If the device is not suspended
	 * (powered down) and we skip clearing the fence register, the HW is
	 * left in an undefined state where we may end up with multiple
	 * registers overlapping.
	 */
	with_intel_runtime_pm_if_active(fence_to_uncore(fence)->rpm, wakeref)
		fence_write(fence);
}

static bool fence_is_active(const struct i915_fence_reg *fence)
{
	return fence->vma && i915_vma_is_active(fence->vma);
}

static struct i915_fence_reg *fence_find(struct i915_ggtt *ggtt)
{
	struct i915_fence_reg *active = NULL;
	struct i915_fence_reg *fence, *fn;

	list_for_each_entry_safe(fence, fn, &ggtt->fence_list, link) {
		GEM_BUG_ON(fence->vma && fence->vma->fence != fence);

		if (fence == active) /* now seen this fence twice */
			active = ERR_PTR(-EAGAIN);

		/* Prefer idle fences so we do not have to wait on the GPU */
		if (active != ERR_PTR(-EAGAIN) && fence_is_active(fence)) {
			if (!active)
				active = fence;

			list_move_tail(&fence->link, &ggtt->fence_list);
			continue;
		}

		if (atomic_read(&fence->pin_count))
			continue;

		return fence;
	}

	/* Wait for completion of pending flips which consume fences */
	if (intel_has_pending_fb_unpin(ggtt->vm.i915))
		return ERR_PTR(-EAGAIN);

	return ERR_PTR(-ENOBUFS);
}

int __i915_vma_pin_fence(struct i915_vma *vma)
{
	struct i915_ggtt *ggtt = i915_vm_to_ggtt(vma->vm);
	struct i915_fence_reg *fence;
	struct i915_vma *set = i915_gem_object_is_tiled(vma->obj) ? vma : NULL;
	int err;

	lockdep_assert_held(&vma->vm->mutex);

	/* Just update our place in the LRU if our fence is getting reused. */
	if (vma->fence) {
		fence = vma->fence;
		GEM_BUG_ON(fence->vma != vma);
		atomic_inc(&fence->pin_count);
		if (!fence->dirty) {
			list_move_tail(&fence->link, &ggtt->fence_list);
			return 0;
		}
	} else if (set) {
		fence = fence_find(ggtt);
		if (IS_ERR(fence))
			return PTR_ERR(fence);

		GEM_BUG_ON(atomic_read(&fence->pin_count));
		atomic_inc(&fence->pin_count);
	} else {
		return 0;
	}

	err = fence_update(fence, set);
	if (err)
		goto out_unpin;

	GEM_BUG_ON(fence->vma != set);
	GEM_BUG_ON(vma->fence != (set ? fence : NULL));

	if (set)
		return 0;

out_unpin:
	atomic_dec(&fence->pin_count);
	return err;
}

/**
 * i915_vma_pin_fence - set up fencing for a vma
 * @vma: vma to map through a fence reg
 *
 * When mapping objects through the GTT, userspace wants to be able to write
 * to them without having to worry about swizzling if the object is tiled.
 * This function walks the fence regs looking for a free one for @vma,
 * stealing one if it can't find any.
 *
 * It then sets up the reg based on the object's properties: address, pitch
 * and tiling format.
 *
 * For an untiled surface, this removes any existing fence.
 *
 * Returns:
 *
 * 0 on success, negative error code on failure.
 */
int i915_vma_pin_fence(struct i915_vma *vma)
{
	int err;

	if (!vma->fence && !i915_gem_object_is_tiled(vma->obj))
		return 0;

	/*
	 * Note that we revoke fences on runtime suspend. Therefore the user
	 * must keep the device awake whilst using the fence.
	 */
	assert_rpm_wakelock_held(vma->vm->gt->uncore->rpm);
	GEM_BUG_ON(!i915_vma_is_pinned(vma));
	GEM_BUG_ON(!i915_vma_is_ggtt(vma));

	err = mutex_lock_interruptible(&vma->vm->mutex);
	if (err)
		return err;

	err = __i915_vma_pin_fence(vma);
	mutex_unlock(&vma->vm->mutex);

	return err;
}

/**
 * i915_reserve_fence - Reserve a fence for vGPU
 * @ggtt: Global GTT
 *
 * This function walks the fence regs looking for a free one and removes
 * it from the fence_list. It is used to reserve a fence for the vGPU to use.
 */
struct i915_fence_reg *i915_reserve_fence(struct i915_ggtt *ggtt)
{
	struct i915_fence_reg *fence;
	int count;
	int ret;

	lockdep_assert_held(&ggtt->vm.mutex);

	/* Keep at least one fence available for the display engine. */
	count = 0;
	list_for_each_entry(fence, &ggtt->fence_list, link)
		count += !atomic_read(&fence->pin_count);
	if (count <= 1)
		return ERR_PTR(-ENOSPC);

	fence = fence_find(ggtt);
	if (IS_ERR(fence))
		return fence;

	if (fence->vma) {
		/* Force-remove fence from VMA */
		ret = fence_update(fence, NULL);
		if (ret)
			return ERR_PTR(ret);
	}

	list_del(&fence->link);

	return fence;
}

/**
 * i915_unreserve_fence - Reclaim a reserved fence
 * @fence: the fence reg
 *
 * This function adds a reserved fence register from the vGPU back to the
 * fence_list.
 */
void i915_unreserve_fence(struct i915_fence_reg *fence)
{
	struct i915_ggtt *ggtt = fence->ggtt;

	lockdep_assert_held(&ggtt->vm.mutex);

	list_add(&fence->link, &ggtt->fence_list);
}
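
/*
 * Usage sketch (illustrative only, based on the lockdep assertions above):
 * both helpers expect the caller to hold the GGTT vm mutex, so a vGPU backend
 * would typically bracket them like this:
 *
 *	mutex_lock(&ggtt->vm.mutex);
 *	fence = i915_reserve_fence(ggtt);
 *	mutex_unlock(&ggtt->vm.mutex);
 *	if (IS_ERR(fence))
 *		return PTR_ERR(fence);
 *
 *	... hand fence->id to the guest ...
 *
 *	mutex_lock(&ggtt->vm.mutex);
 *	i915_unreserve_fence(fence);
 *	mutex_unlock(&ggtt->vm.mutex);
 */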

/**
 * intel_ggtt_restore_fences - restore fence state
 * @ggtt: Global GTT
 *
 * Restore the hw fence state to match the software tracking again, to be called
 * after a gpu reset and on resume. Note that on runtime suspend we only cancel
 * the fences, to be reacquired by the user later.
 */
void intel_ggtt_restore_fences(struct i915_ggtt *ggtt)
{
	int i;

	for (i = 0; i < ggtt->num_fences; i++)
		fence_write(&ggtt->fence_regs[i]);
}

/**
 * DOC: tiling swizzling details
 *
 * The idea behind tiling is to increase cache hit rates by rearranging
 * pixel data so that a group of pixel accesses are in the same cacheline.
 * The performance improvement from doing this on the back/depth buffer is on
 * the order of 30%.
 *
 * Intel architectures make this somewhat more complicated, though, by
 * adjustments made to addressing of data when the memory is in interleaved
 * mode (matched pairs of DIMMS) to improve memory bandwidth.
 * For interleaved memory, the CPU sends every sequential 64 bytes
 * to an alternate memory channel so it can get the bandwidth from both.
 *
 * The GPU also rearranges its accesses for increased bandwidth to interleaved
 * memory, and it matches what the CPU does for non-tiled.  However, when tiled
 * it does it a little differently, since one walks addresses not just in the
 * X direction but also Y.  So, along with alternating channels when bit
 * 6 of the address flips, it also alternates when other bits flip --  Bits 9
 * (every 512 bytes, an X tile scanline) and 10 (every two X tile scanlines)
 * are common to both the 915 and 965-class hardware.
 *
 * The CPU also sometimes XORs in higher bits as well, to improve
 * bandwidth doing strided access like we do so frequently in graphics.  This
 * is called "Channel XOR Randomization" in the MCH documentation.  The result
 * is that the CPU is XORing in either bit 11 or bit 17 to bit 6 of its address
 * decode.
 *
 * All of this bit 6 XORing has an effect on our memory management,
 * as we need to make sure that the 3d driver can correctly address object
 * contents.
 *
 * If we don't have interleaved memory, all tiling is safe and no swizzling is
 * required.
 *
 * When bit 17 is XORed in, we simply refuse to tile at all.  Bit
 * 17 is not just a page offset, so as we page an object out and back in,
 * individual pages in it will have different bit 17 addresses, resulting in
 * each 64 bytes being swapped with its neighbor!
 *
 * Otherwise, if interleaved, we have to tell the 3d driver what address
 * swizzling it needs to do, since it's writing with the CPU to the pages
 * (bit 6 and potentially bit 11 XORed in), and the GPU is reading from the
 * pages (bit 6, 9, and 10 XORed in), resulting in a cumulative bit swizzling
 * required by the CPU of XORing in bit 6, 9, 10, and potentially 11, in order
 * to match what the GPU expects.
 */
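
/*
 * Sketch of the resulting CPU-side fixup (illustrative only; the helper below
 * is hypothetical and not part of the driver). For the common
 * I915_BIT_6_SWIZZLE_9_10 mode, software touching tiled pages with the CPU
 * flips bit 6 of each byte offset whenever bits 9 and 10 differ:
 *
 *	static u32 swizzle_offset_9_10(u32 offset)
 *	{
 *		u32 bit9  = (offset >> 9) & 1;
 *		u32 bit10 = (offset >> 10) & 1;
 *
 *		return offset ^ ((bit9 ^ bit10) << 6);
 *	}
 *
 * The other reported modes XOR the corresponding subset of bits (9, 10, 11
 * and/or 17) into bit 6 in the same way.
 */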

/**
 * detect_bit_6_swizzle - detect bit 6 swizzling pattern
 * @ggtt: Global GGTT
 *
 * Detects bit 6 swizzling of address lookup between IGD access and CPU
 * access through main memory.
 */
static void detect_bit_6_swizzle(struct i915_ggtt *ggtt)
{
	struct intel_uncore *uncore = ggtt->vm.gt->uncore;
	struct drm_i915_private *i915 = ggtt->vm.i915;
	u32 swizzle_x = I915_BIT_6_SWIZZLE_UNKNOWN;
	u32 swizzle_y = I915_BIT_6_SWIZZLE_UNKNOWN;

	if (GRAPHICS_VER(i915) >= 8 || IS_VALLEYVIEW(i915)) {
		/*
		 * On BDW+, swizzling is not used. We leave the CPU memory
		 * controller in charge of optimizing memory accesses without
		 * the extra address manipulation GPU side.
		 *
		 * VLV and CHV don't have GPU swizzling.
		 */
		swizzle_x = I915_BIT_6_SWIZZLE_NONE;
		swizzle_y = I915_BIT_6_SWIZZLE_NONE;
	} else if (GRAPHICS_VER(i915) >= 6) {
		if (i915->preserve_bios_swizzle) {
			if (intel_uncore_read(uncore, DISP_ARB_CTL) &
			    DISP_TILE_SURFACE_SWIZZLING) {
				swizzle_x = I915_BIT_6_SWIZZLE_9_10;
				swizzle_y = I915_BIT_6_SWIZZLE_9;
			} else {
				swizzle_x = I915_BIT_6_SWIZZLE_NONE;
				swizzle_y = I915_BIT_6_SWIZZLE_NONE;
			}
		} else {
			u32 dimm_c0, dimm_c1;

			dimm_c0 = intel_uncore_read(uncore, MAD_DIMM_C0);
			dimm_c1 = intel_uncore_read(uncore, MAD_DIMM_C1);
			dimm_c0 &= MAD_DIMM_A_SIZE_MASK | MAD_DIMM_B_SIZE_MASK;
			dimm_c1 &= MAD_DIMM_A_SIZE_MASK | MAD_DIMM_B_SIZE_MASK;
			/*
			 * Enable swizzling when the channels are populated
			 * with identically sized dimms. We don't need to check
			 * the 3rd channel because no cpu with gpu attached
			 * ships in that configuration. Also, swizzling only
			 * makes sense for 2 channels anyway.
			 */
			if (dimm_c0 == dimm_c1) {
				swizzle_x = I915_BIT_6_SWIZZLE_9_10;
				swizzle_y = I915_BIT_6_SWIZZLE_9;
			} else {
				swizzle_x = I915_BIT_6_SWIZZLE_NONE;
				swizzle_y = I915_BIT_6_SWIZZLE_NONE;
			}
		}
	} else if (GRAPHICS_VER(i915) == 5) {
		/*
		 * On Ironlake, whatever the DRAM config, the GPU always
		 * uses the same swizzling setup.
		 */
		swizzle_x = I915_BIT_6_SWIZZLE_9_10;
		swizzle_y = I915_BIT_6_SWIZZLE_9;
	} else if (GRAPHICS_VER(i915) == 2) {
		/*
		 * As far as we know, the 865 doesn't have these bit 6
		 * swizzling issues.
		 */
		swizzle_x = I915_BIT_6_SWIZZLE_NONE;
		swizzle_y = I915_BIT_6_SWIZZLE_NONE;
	} else if (IS_G45(i915) || IS_I965G(i915) || IS_G33(i915)) {
		/*
		 * The 965, G33, and newer have a very flexible memory
		 * configuration.  It will enable dual-channel mode
		 * (interleaving) on as much memory as it can, and the GPU
		 * will additionally sometimes enable different bit 6
		 * swizzling for tiled objects from the CPU.
		 *
		 * Here's what I found on the G965:
		 *    slot fill         memory size  swizzling
		 * 0A   0B   1A   1B    1-ch   2-ch
		 * 512  0    0    0     512    0     O
		 * 512  0    512  0     16     1008  X
		 * 512  0    0    512   16     1008  X
		 * 0    512  0    512   16     1008  X
		 * 1024 1024 1024 0     2048   1024  O
		 *
		 * We could probably detect this based on either the DRB
		 * matching, which was the case for the swizzling required in
		 * the table above, or from the 1-ch value being less than
		 * the minimum size of a rank.
		 *
		 * Reports indicate that the swizzling actually
		 * varies depending upon page placement inside the
		 * channels, i.e. we see swizzled pages where the
		 * banks of memory are paired and unswizzled on the
		 * uneven portion, so leave that as unknown.
		 */
		if (intel_uncore_read16(uncore, C0DRB3_BW) ==
		    intel_uncore_read16(uncore, C1DRB3_BW)) {
			swizzle_x = I915_BIT_6_SWIZZLE_9_10;
			swizzle_y = I915_BIT_6_SWIZZLE_9;
		}
	} else {
		u32 dcc = intel_uncore_read(uncore, DCC);

		/*
		 * On 9xx chipsets, channel interleave by the CPU is
		 * determined by DCC.  For single-channel, neither the CPU
		 * nor the GPU do swizzling.  For dual channel interleaved,
		 * the GPU's interleave is bit 9 and 10 for X tiled, and bit
		 * 9 for Y tiled.  The CPU's interleave is independent, and
		 * can be based on either bit 11 (haven't seen this yet) or
		 * bit 17 (common).
		 */
		switch (dcc & DCC_ADDRESSING_MODE_MASK) {
		case DCC_ADDRESSING_MODE_SINGLE_CHANNEL:
		case DCC_ADDRESSING_MODE_DUAL_CHANNEL_ASYMMETRIC:
			swizzle_x = I915_BIT_6_SWIZZLE_NONE;
			swizzle_y = I915_BIT_6_SWIZZLE_NONE;
			break;
		case DCC_ADDRESSING_MODE_DUAL_CHANNEL_INTERLEAVED:
			if (dcc & DCC_CHANNEL_XOR_DISABLE) {
				/*
				 * This is the base swizzling by the GPU for
				 * tiled buffers.
				 */
				swizzle_x = I915_BIT_6_SWIZZLE_9_10;
				swizzle_y = I915_BIT_6_SWIZZLE_9;
			} else if ((dcc & DCC_CHANNEL_XOR_BIT_17) == 0) {
				/* Bit 11 swizzling by the CPU in addition. */
				swizzle_x = I915_BIT_6_SWIZZLE_9_10_11;
				swizzle_y = I915_BIT_6_SWIZZLE_9_11;
			} else {
				/* Bit 17 swizzling by the CPU in addition. */
				swizzle_x = I915_BIT_6_SWIZZLE_9_10_17;
				swizzle_y = I915_BIT_6_SWIZZLE_9_17;
			}
			break;
		}

		/* check for L-shaped memory aka modified enhanced addressing */
		if (GRAPHICS_VER(i915) == 4 &&
		    !(intel_uncore_read(uncore, DCC2) & DCC2_MODIFIED_ENHANCED_DISABLE)) {
			swizzle_x = I915_BIT_6_SWIZZLE_UNKNOWN;
			swizzle_y = I915_BIT_6_SWIZZLE_UNKNOWN;
		}

		if (dcc == 0xffffffff) {
			drm_err(&i915->drm, "Couldn't read from MCHBAR.  "
				  "Disabling tiling.\n");
			swizzle_x = I915_BIT_6_SWIZZLE_UNKNOWN;
			swizzle_y = I915_BIT_6_SWIZZLE_UNKNOWN;
		}
	}

	if (swizzle_x == I915_BIT_6_SWIZZLE_UNKNOWN ||
	    swizzle_y == I915_BIT_6_SWIZZLE_UNKNOWN) {
		/*
		 * Userspace likes to explode if it sees unknown swizzling,
		 * so lie. We will finish the lie when reporting through
		 * the get-tiling-ioctl by reporting the physical swizzle
		 * mode as unknown instead.
		 *
		 * As we don't strictly know what the swizzling is, it may be
		 * bit17 dependent, and so we need to also prevent the pages
		 * from being moved.
		 */
		i915->quirks |= QUIRK_PIN_SWIZZLED_PAGES;
		swizzle_x = I915_BIT_6_SWIZZLE_NONE;
		swizzle_y = I915_BIT_6_SWIZZLE_NONE;
	}

	i915->ggtt.bit_6_swizzle_x = swizzle_x;
	i915->ggtt.bit_6_swizzle_y = swizzle_y;
}

/*
 * Swap every 64 bytes of this page around, to account for it having a new
 * bit 17 of its physical address and therefore being interpreted differently
 * by the GPU.
 */
static void swizzle_page(struct page *page)
{
	char temp[64];
	char *vaddr;
	int i;

	vaddr = kmap(page);

	for (i = 0; i < PAGE_SIZE; i += 128) {
		memcpy(temp, &vaddr[i], 64);
		memcpy(&vaddr[i], &vaddr[i + 64], 64);
		memcpy(&vaddr[i + 64], temp, 64);
	}

	kunmap(page);
}

/**
 * i915_gem_object_do_bit_17_swizzle - fixup bit 17 swizzling
 * @obj: i915 GEM buffer object
 * @pages: the scattergather list of physical pages
 *
 * This function fixes up the swizzling in case any page frame number for this
 * object has changed in bit 17 since that state has been saved with
 * i915_gem_object_save_bit_17_swizzle().
 *
 * This is called when pinning backing storage again, since the kernel is free
 * to move unpinned backing storage around (either by directly moving pages or
 * by swapping them out and back in again).
 */
void
i915_gem_object_do_bit_17_swizzle(struct drm_i915_gem_object *obj,
				  struct sg_table *pages)
{
	struct sgt_iter sgt_iter;
	struct page *page;
	int i;

	if (obj->bit_17 == NULL)
		return;

	i = 0;
	for_each_sgt_page(page, sgt_iter, pages) {
		char new_bit_17 = page_to_phys(page) >> 17;

		if ((new_bit_17 & 0x1) != (test_bit(i, obj->bit_17) != 0)) {
			swizzle_page(page);
			set_page_dirty(page);
		}

		i++;
	}
}

/**
 * i915_gem_object_save_bit_17_swizzle - save bit 17 swizzling
 * @obj: i915 GEM buffer object
 * @pages: the scattergather list of physical pages
 *
 * This function saves the bit 17 of each page frame number so that swizzling
 * can be fixed up later on with i915_gem_object_do_bit_17_swizzle(). This must
 * be called before the backing storage can be unpinned.
 */
void
i915_gem_object_save_bit_17_swizzle(struct drm_i915_gem_object *obj,
				    struct sg_table *pages)
{
	const unsigned int page_count = obj->base.size >> PAGE_SHIFT;
	struct sgt_iter sgt_iter;
	struct page *page;
	int i;

	if (obj->bit_17 == NULL) {
		obj->bit_17 = bitmap_zalloc(page_count, GFP_KERNEL);
		if (obj->bit_17 == NULL) {
			DRM_ERROR("Failed to allocate memory for bit 17 "
				  "record\n");
			return;
		}
	}

	i = 0;

	for_each_sgt_page(page, sgt_iter, pages) {
		if (page_to_phys(page) & (1 << 17))
			__set_bit(i, obj->bit_17);
		else
			__clear_bit(i, obj->bit_17);
		i++;
	}
}
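
/*
 * Ordering sketch (illustrative only): for a tiled object on a machine with
 * bit 17 swizzling in effect, the backing-store code records the state before
 * its pages may move and repairs them once they are re-pinned, roughly:
 *
 *	i915_gem_object_save_bit_17_swizzle(obj, pages);
 *	... pages are unpinned, possibly swapped out and back in ...
 *	i915_gem_object_do_bit_17_swizzle(obj, pages);
 *
 * Pages whose physical bit 17 did not change are left untouched.
 */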

void intel_ggtt_init_fences(struct i915_ggtt *ggtt)
{
	struct drm_i915_private *i915 = ggtt->vm.i915;
	struct intel_uncore *uncore = ggtt->vm.gt->uncore;
	int num_fences;
	int i;

	INIT_LIST_HEAD(&ggtt->fence_list);
	INIT_LIST_HEAD(&ggtt->userfault_list);
	intel_wakeref_auto_init(&ggtt->userfault_wakeref, uncore->rpm);

	detect_bit_6_swizzle(ggtt);

	if (!i915_ggtt_has_aperture(ggtt))
		num_fences = 0;
	else if (GRAPHICS_VER(i915) >= 7 &&
		 !(IS_VALLEYVIEW(i915) || IS_CHERRYVIEW(i915)))
		num_fences = 32;
	else if (GRAPHICS_VER(i915) >= 4 ||
		 IS_I945G(i915) || IS_I945GM(i915) ||
		 IS_G33(i915) || IS_PINEVIEW(i915))
		num_fences = 16;
	else
		num_fences = 8;

	if (intel_vgpu_active(i915))
		num_fences = intel_uncore_read(uncore,
					       vgtif_reg(avail_rs.fence_num));
	ggtt->fence_regs = kcalloc(num_fences,
				   sizeof(*ggtt->fence_regs),
				   GFP_KERNEL);
	if (!ggtt->fence_regs)
		num_fences = 0;

	/* Initialize fence registers to zero */
	for (i = 0; i < num_fences; i++) {
		struct i915_fence_reg *fence = &ggtt->fence_regs[i];

		i915_active_init(&fence->active, NULL, NULL, 0);
		fence->ggtt = ggtt;
		fence->id = i;
		list_add_tail(&fence->link, &ggtt->fence_list);
	}
	ggtt->num_fences = num_fences;

	intel_ggtt_restore_fences(ggtt);
}

void intel_ggtt_fini_fences(struct i915_ggtt *ggtt)
{
	int i;

	for (i = 0; i < ggtt->num_fences; i++) {
		struct i915_fence_reg *fence = &ggtt->fence_regs[i];

		i915_active_fini(&fence->active);
	}

	kfree(ggtt->fence_regs);
}

void intel_gt_init_swizzling(struct intel_gt *gt)
{
	struct drm_i915_private *i915 = gt->i915;
	struct intel_uncore *uncore = gt->uncore;

	if (GRAPHICS_VER(i915) < 5 ||
	    i915->ggtt.bit_6_swizzle_x == I915_BIT_6_SWIZZLE_NONE)
		return;

	intel_uncore_rmw(uncore, DISP_ARB_CTL, 0, DISP_TILE_SURFACE_SWIZZLING);

	if (GRAPHICS_VER(i915) == 5)
		return;

	intel_uncore_rmw(uncore, TILECTL, 0, TILECTL_SWZCTL);

	if (GRAPHICS_VER(i915) == 6)
		intel_uncore_write(uncore,
				   ARB_MODE,
				   _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_SNB));
	else if (GRAPHICS_VER(i915) == 7)
		intel_uncore_write(uncore,
				   ARB_MODE,
				   _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_IVB));
	else if (GRAPHICS_VER(i915) == 8)
		intel_uncore_write(uncore,
				   GAMTARBMODE,
				   _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_BDW));
	else
		MISSING_CASE(GRAPHICS_VER(i915));
}