[PATCH] drm/amdkfd: change to return status when flush tlb

Dennis Li Dennis.Li at amd.com
Mon Jul 6 09:39:22 UTC 2020


If GPU hang, driver will fail to flush tlb, return the hang error
to callers, make callers have a chance to handle the error.

Signed-off-by: Dennis Li <dennis.li at amd.com>
Change-Id: Ie305ad0a77675f6eab7d5b8f68e279b7f4e7a8b9

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index e9b96ad3d9a5..18e243183b5e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1488,7 +1488,7 @@ static int kfd_ioctl_map_memory_to_gpu(struct file *filep,
 		peer_pdd = kfd_get_process_device_data(peer, p);
 		if (WARN_ON_ONCE(!peer_pdd))
 			continue;
-		kfd_flush_tlb(peer_pdd);
+		err = kfd_flush_tlb(peer_pdd);
 	}
 
 	kfree(devices_arr);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 21eb0998c4ae..d636cbf7d32f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -263,6 +263,7 @@ static int allocate_vmid(struct device_queue_manager *dqm,
 			struct queue *q)
 {
 	int allocated_vmid = -1, i;
+	int ret = 0;
 
 	for (i = dqm->dev->vm_info.first_vmid_kfd;
 			i <= dqm->dev->vm_info.last_vmid_kfd; i++) {
@@ -295,13 +296,26 @@ static int allocate_vmid(struct device_queue_manager *dqm,
 			qpd->vmid,
 			qpd->page_table_base);
 	/* invalidate the VM context after pasid and vmid mapping is set up */
-	kfd_flush_tlb(qpd_to_pdd(qpd));
+	ret = kfd_flush_tlb(qpd_to_pdd(qpd));
+	if (ret) {
+		pr_err("Failed to flush tlb\n");
+		goto pro_failed;
+	}
 
 	if (dqm->dev->kfd2kgd->set_scratch_backing_va)
 		dqm->dev->kfd2kgd->set_scratch_backing_va(dqm->dev->kgd,
 				qpd->sh_hidden_private_base, qpd->vmid);
 
 	return 0;
+
+pro_failed:
+	/* Release the vmid mapping */
+	set_pasid_vmid_mapping(dqm, 0, qpd->vmid);
+	dqm->vmid_pasid[qpd->vmid] = 0;
+
+	qpd->vmid = 0;
+	q->properties.vmid = 0;
+	return ret;
 }
 
 static int flush_texture_cache_nocpsch(struct kfd_dev *kdev,
@@ -326,12 +340,17 @@ static void deallocate_vmid(struct device_queue_manager *dqm,
 				struct qcm_process_device *qpd,
 				struct queue *q)
 {
+	int ret = 0;
+
 	/* On GFX v7, CP doesn't flush TC at dequeue */
 	if (q->device->device_info->asic_family == CHIP_HAWAII)
 		if (flush_texture_cache_nocpsch(q->device, qpd))
 			pr_err("Failed to flush TC\n");
 
-	kfd_flush_tlb(qpd_to_pdd(qpd));
+	ret = kfd_flush_tlb(qpd_to_pdd(qpd));
+	if (ret) {
+		pr_err("Failed to flush tlb\n");
+	}
 
 	/* Release the vmid mapping */
 	set_pasid_vmid_mapping(dqm, 0, qpd->vmid);
@@ -795,7 +814,9 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
 				dqm->dev->kgd,
 				qpd->vmid,
 				qpd->page_table_base);
-		kfd_flush_tlb(pdd);
+		ret = kfd_flush_tlb(pdd);
+		if (ret)
+			goto out;
 	}
 
 	/* Take a safe reference to the mm_struct, which may otherwise
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 51ba2020732e..31ea72946d06 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -1081,7 +1081,7 @@ void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid,
 
 void kfd_signal_reset_event(struct kfd_dev *dev);
 
-void kfd_flush_tlb(struct kfd_process_device *pdd);
+int kfd_flush_tlb(struct kfd_process_device *pdd);
 
 int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p);
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 8616a204e4c3..3919cc88813c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -1444,21 +1444,24 @@ int kfd_reserved_mem_mmap(struct kfd_dev *dev, struct kfd_process *process,
 			       KFD_CWSR_TBA_TMA_SIZE, vma->vm_page_prot);
 }
 
-void kfd_flush_tlb(struct kfd_process_device *pdd)
+int kfd_flush_tlb(struct kfd_process_device *pdd)
 {
 	struct kfd_dev *dev = pdd->dev;
+	int ret = 0;
 
 	if (dev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) {
 		/* Nothing to flush until a VMID is assigned, which
 		 * only happens when the first queue is created.
 		 */
 		if (pdd->qpd.vmid)
-			amdgpu_amdkfd_flush_gpu_tlb_vmid(dev->kgd,
+			ret = amdgpu_amdkfd_flush_gpu_tlb_vmid(dev->kgd,
 							pdd->qpd.vmid);
 	} else {
-		amdgpu_amdkfd_flush_gpu_tlb_pasid(dev->kgd,
+		ret = amdgpu_amdkfd_flush_gpu_tlb_pasid(dev->kgd,
 						pdd->process->pasid);
 	}
+
+	return ret;
 }
 
 #if defined(CONFIG_DEBUG_FS)
-- 
2.17.1



More information about the amd-gfx mailing list