drm/amdgpu/bu: Add use_mtype_cc_wa module param
author Graham Sider <Graham.Sider@amd.com>
Mon, 6 Feb 2023 19:04:42 +0000 (14:04 -0500)
committer Alex Deucher <alexander.deucher@amd.com>
Fri, 9 Jun 2023 13:59:03 +0000 (09:59 -0400)
By default, use_mtype_cc_wa is set to 1, which selects the PTE coherence flag
MTYPE_CC instead of MTYPE_RW. This is required for the time being to
mitigate a bug causing XCCs to hit stale data due to TCC marking fully
dirty lines as exclusive.

Signed-off-by: Graham Sider <Graham.Sider@amd.com>
Reviewed-by: Joseph Greathouse <Joseph.Greathouse@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu.h
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
drivers/gpu/drm/amd/amdkfd/kfd_svm.c

index cb9373f..cd2a29a 100644 (file)
@@ -212,6 +212,7 @@ extern int amdgpu_noretry;
 extern int amdgpu_force_asic_type;
 extern int amdgpu_smartshift_bias;
 extern int amdgpu_use_xgmi_p2p;
+extern bool amdgpu_use_mtype_cc_wa;
 #ifdef CONFIG_HSA_AMD
 extern int sched_policy;
 extern bool debug_evictions;
index da4e50a..8bc3782 100644 (file)
@@ -823,6 +823,13 @@ module_param_named(no_queue_eviction_on_vm_fault, amdgpu_no_queue_eviction_on_vm
 #endif
 
 /**
+ * DOC: use_mtype_cc_wa (bool)
+ */
+bool amdgpu_use_mtype_cc_wa = true;
+MODULE_PARM_DESC(use_mtype_cc_wa, "Use MTYPE_CC workaround (0 = use MTYPE_RW where applicable, 1 = use MTYPE_CC where applicable (default))");
+module_param_named(use_mtype_cc_wa, amdgpu_use_mtype_cc_wa, bool, 0444);
+
+/**
  * DOC: pcie_p2p (bool)
  * Enable PCIe P2P (requires large-BAR). Default value: true (on)
  */
index 2eb67b5..8623b93 100644 (file)
@@ -1187,6 +1187,7 @@ static void gmc_v9_0_get_coherence_flags(struct amdgpu_device *adev,
        bool coherent = bo->flags & AMDGPU_GEM_CREATE_COHERENT;
        bool uncached = bo->flags & AMDGPU_GEM_CREATE_UNCACHED;
        unsigned int mtype;
+       unsigned int mtype_default;
        bool snoop = false;
 
        switch (adev->ip_versions[GC_HWIP][0]) {
@@ -1230,7 +1231,10 @@ static void gmc_v9_0_get_coherence_flags(struct amdgpu_device *adev,
                /* FIXME: Needs more work for handling multiple memory
                 * partitions (> NPS1 mode) e.g. NPS4 for both APU and dGPU
                 * modes.
+                * FIXME: Temporarily using MTYPE_CC instead of MTYPE_RW where applicable.
+                * To force use of MTYPE_RW, set use_mtype_cc_wa=0
                 */
+               mtype_default = amdgpu_use_mtype_cc_wa ? MTYPE_CC : MTYPE_RW;
                snoop = true;
                if (uncached) {
                        mtype = MTYPE_UC;
@@ -1245,14 +1249,14 @@ static void gmc_v9_0_get_coherence_flags(struct amdgpu_device *adev,
                         * socket should be treated as remote access so MTYPE_RW
                         * cannot be used always.
                         */
-                       mtype = MTYPE_RW;
+                       mtype = mtype_default;
                } else if (adev->flags & AMD_IS_APU) {
                        /* APU on carve out mode */
-                       mtype = MTYPE_RW;
+                       mtype = mtype_default;
                } else {
                        /* dGPU */
                        if (is_vram && bo_adev == adev)
-                               mtype = MTYPE_RW;
+                               mtype = mtype_default;
                        else if (is_vram)
                                mtype = MTYPE_NC;
                        else
index 2b2129d..477ef92 100644 (file)
@@ -1198,9 +1198,12 @@ svm_range_get_pte_flags(struct kfd_node *node,
                if (uncached) {
                        mapping_flags |= AMDGPU_VM_MTYPE_UC;
                } else if (domain == SVM_RANGE_VRAM_DOMAIN) {
-                       /* local HBM region close to partition */
+                       /* local HBM region close to partition
+                        * FIXME: Temporarily using MTYPE_CC instead of MTYPE_RW where applicable.
+                        * To force use of MTYPE_RW, set use_mtype_cc_wa=0
+                        */
                        if (bo_node == node)
-                               mapping_flags |= AMDGPU_VM_MTYPE_RW;
+                               mapping_flags |= amdgpu_use_mtype_cc_wa ? AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW;
                        /* local HBM region far from partition or remote XGMI GPU */
                        else if (svm_nodes_in_same_hive(bo_node, node))
                                mapping_flags |= AMDGPU_VM_MTYPE_NC;