[PATCH 2/6] drm/amdgpu/bu: Add use_mtype_cc_wa module param

Alex Deucher alexander.deucher at amd.com
Wed May 10 17:56:26 UTC 2023


From: Graham Sider <Graham.Sider at amd.com>

Add the use_mtype_cc_wa module parameter. It defaults to 1, which sets the
PTE coherence flag to MTYPE_CC instead of MTYPE_RW where applicable. This is
required for the time being to mitigate a bug causing XCCs to hit stale data
due to TCC marking fully dirty lines as exclusive.

Signed-off-by: Graham Sider <Graham.Sider at amd.com>
Reviewed-by: Joseph Greathouse <Joseph.Greathouse at amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling at amd.com>
Signed-off-by: Alex Deucher <alexander.deucher at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h     |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c |  7 +++++++
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   | 10 +++++++---
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c    |  7 +++++--
 4 files changed, 20 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 9904ce78b8fc..a3a0dbeb251f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -213,6 +213,7 @@ extern int amdgpu_noretry;
 extern int amdgpu_force_asic_type;
 extern int amdgpu_smartshift_bias;
 extern int amdgpu_use_xgmi_p2p;
+extern bool amdgpu_use_mtype_cc_wa;
 #ifdef CONFIG_HSA_AMD
 extern int sched_policy;
 extern bool debug_evictions;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index e4d09bf0887d..2f38c49aa597 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -831,6 +831,13 @@ MODULE_PARM_DESC(no_queue_eviction_on_vm_fault, "No queue eviction on VM fault (
 module_param_named(no_queue_eviction_on_vm_fault, amdgpu_no_queue_eviction_on_vm_fault, int, 0444);
 #endif
 
+/**
+ * DOC: use_mtype_cc_wa (bool)
+ */
+bool amdgpu_use_mtype_cc_wa = true;
+MODULE_PARM_DESC(use_mtype_cc_wa, "Use MTYPE_CC workaround (0 = use MTYPE_RW where applicable, 1 = use MTYPE_CC where applicable (default))");
+module_param_named(use_mtype_cc_wa, amdgpu_use_mtype_cc_wa, bool, 0444);
+
 /**
  * DOC: pcie_p2p (bool)
  * Enable PCIe P2P (requires large-BAR). Default value: true (on)
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index d28ffdb07ae6..59ce741dfa73 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -1192,6 +1192,7 @@ static void gmc_v9_0_get_coherence_flags(struct amdgpu_device *adev,
 	bool coherent = bo->flags & AMDGPU_GEM_CREATE_COHERENT;
 	bool uncached = bo->flags & AMDGPU_GEM_CREATE_UNCACHED;
 	unsigned int mtype;
+	unsigned int mtype_default;
 	bool snoop = false;
 
 	switch (adev->ip_versions[GC_HWIP][0]) {
@@ -1235,7 +1236,10 @@ static void gmc_v9_0_get_coherence_flags(struct amdgpu_device *adev,
 		/* FIXME: Needs more work for handling multiple memory
 		 * partitions (> NPS1 mode) e.g. NPS4 for both APU and dGPU
 		 * modes.
+		 * FIXME: Temporarily using MTYPE_CC instead of MTYPE_RW where applicable.
+		 * To force use of MTYPE_RW, set use_mtype_cc_wa=0
 		 */
+		mtype_default = amdgpu_use_mtype_cc_wa ? MTYPE_CC : MTYPE_RW;
 		snoop = true;
 		if (uncached) {
 			mtype = MTYPE_UC;
@@ -1250,14 +1254,14 @@ static void gmc_v9_0_get_coherence_flags(struct amdgpu_device *adev,
 			 * socket should be treated as remote access so MTYPE_RW
 			 * cannot be used always.
 			 */
-			mtype = MTYPE_RW;
+			mtype = mtype_default;
 		} else if (adev->flags & AMD_IS_APU) {
 			/* APU on carve out mode */
-			mtype = MTYPE_RW;
+			mtype = mtype_default;
 		} else {
 			/* dGPU */
 			if (is_vram && bo_adev == adev)
-				mtype = MTYPE_RW;
+				mtype = mtype_default;
 			else if (is_vram)
 				mtype = MTYPE_NC;
 			else
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 83f8e4e50315..c55b9754c506 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -1197,9 +1197,12 @@ svm_range_get_pte_flags(struct kfd_node *node,
 		if (uncached) {
 			mapping_flags |= AMDGPU_VM_MTYPE_UC;
 		} else if (domain == SVM_RANGE_VRAM_DOMAIN) {
-			/* local HBM region close to partition */
+			/* local HBM region close to partition
+			 * FIXME: Temporarily using MTYPE_CC instead of MTYPE_RW where applicable.
+			 * To force use of MTYPE_RW, set use_mtype_cc_wa=0
+			 */
 			if (bo_node == node)
-				mapping_flags |= AMDGPU_VM_MTYPE_RW;
+				mapping_flags |= amdgpu_use_mtype_cc_wa ? AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW;
 			/* local HBM region far from partition or remote XGMI GPU */
 			else if (svm_nodes_in_same_hive(bo_node, node))
 				mapping_flags |= AMDGPU_VM_MTYPE_NC;
-- 
2.40.1



More information about the amd-gfx mailing list