[PATCH] drm/amdkfd: fix vm-pasid lookup for multiple partitions

Christian König ckoenig.leichtzumerken at gmail.com
Thu Sep 5 14:23:57 UTC 2024


Am 19.08.24 um 19:59 schrieb Jonathan Kim:
> Currently multiple partitions will incorrectly overwrite the VM lookup
> table since the table is indexed by PASID and multiple partitions can
> register different VM objects on the same PASID.

That's a rather bad idea. Why do we have the same PASID for different VM 
objects in the first place?

Regards,
Christian.

>
> This results in loading the wrong VM object on PASID query.
>
> To correct this, set up the lookup table to be per-partition-per-PASID
> instead.
>
> Signed-off-by: Jonathan Kim <jonathan.kim at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c    | 12 ++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |  1 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c       |  4 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c       |  7 ++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c        | 55 +++++++++++--------
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h        | 11 +++-
>   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c        |  5 +-
>   drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c        |  5 +-
>   drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c        |  5 +-
>   drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c         |  3 +-
>   drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c         |  5 +-
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c         | 16 ++----
>   drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c        |  2 +-
>   drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c      |  4 +-
>   drivers/gpu/drm/amd/amdkfd/kfd_events.c       |  3 +-
>   .../gpu/drm/amd/amdkfd/kfd_int_process_v10.c  |  8 +--
>   .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c   |  8 +--
>   drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c   |  3 +-
>   18 files changed, 92 insertions(+), 65 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> index c272461d70a9..28db789610e1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> @@ -887,3 +887,15 @@ int amdgpu_amdkfd_unmap_hiq(struct amdgpu_device *adev, u32 doorbell_off,
>   
>   	return r;
>   }
> +
> +int amdgpu_amdkfd_node_id_to_xcc_id(struct amdgpu_device *adev, uint32_t node_id)
> +{
> +	if (adev->gfx.funcs->ih_node_to_logical_xcc) {
> +		int xcc_id = adev->gfx.funcs->ih_node_to_logical_xcc(adev, node_id);
> +
> +		if (xcc_id >= 0)
> +			return xcc_id;
> +	}
> +
> +	return 0;
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index 4ed49265c764..bf8bb45d8ab6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -356,6 +356,7 @@ void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev,
>   		uint64_t size, u32 alloc_flag, int8_t xcp_id);
>   
>   u64 amdgpu_amdkfd_xcp_memory_size(struct amdgpu_device *adev, int xcp_id);
> +int amdgpu_amdkfd_node_id_to_xcc_id(struct amdgpu_device *adev, uint32_t node_id);
>   
>   #define KFD_XCP_MEM_ID(adev, xcp_id) \
>   		((adev)->xcp_mgr && (xcp_id) >= 0 ?\
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index c6a1783fc9ef..bf9f8802e18d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -37,7 +37,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
>   	struct amdgpu_job *job = to_amdgpu_job(s_job);
>   	struct amdgpu_task_info *ti;
>   	struct amdgpu_device *adev = ring->adev;
> -	int idx;
> +	int idx, xcp_id = !job->vm ? 0 : job->vm->xcp_id;
>   	int r;
>   
>   	if (!drm_dev_enter(adev_to_drm(adev), &idx)) {
> @@ -62,7 +62,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
>   		job->base.sched->name, atomic_read(&ring->fence_drv.last_seq),
>   		ring->fence_drv.sync_seq);
>   
> -	ti = amdgpu_vm_get_task_info_pasid(ring->adev, job->pasid);
> +	ti = amdgpu_vm_get_task_info_pasid(ring->adev, job->pasid, xcp_id);
>   	if (ti) {
>   		dev_err(adev->dev,
>   			"Process information: process %s pid %d thread %s pid %d\n",
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> index d9fde38f6ee2..e413bf4a3e84 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> @@ -1275,17 +1275,20 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
>   		struct amdgpu_vm *vm = &fpriv->vm;
>   		struct drm_amdgpu_info_gpuvm_fault gpuvm_fault;
>   		unsigned long flags;
> +		int i;
>   
>   		if (!vm)
>   			return -EINVAL;
>   
>   		memset(&gpuvm_fault, 0, sizeof(gpuvm_fault));
>   
> -		xa_lock_irqsave(&adev->vm_manager.pasids, flags);
> +		for (i = 0; i < adev->xcp_mgr->num_xcps; i++)
> +			xa_lock_irqsave(&adev->vm_manager.pasids[i], flags);
>   		gpuvm_fault.addr = vm->fault_info.addr;
>   		gpuvm_fault.status = vm->fault_info.status;
>   		gpuvm_fault.vmhub = vm->fault_info.vmhub;
> -		xa_unlock_irqrestore(&adev->vm_manager.pasids, flags);
> +		for (i = 0; i < adev->xcp_mgr->num_xcps; i++)
> +			xa_unlock_irqrestore(&adev->vm_manager.pasids[i], flags);
>   
>   		return copy_to_user(out, &gpuvm_fault,
>   				    min((size_t)size, sizeof(gpuvm_fault))) ? -EFAULT : 0;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index bcb729094521..f43e1c15f423 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -146,7 +146,7 @@ int amdgpu_vm_set_pasid(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>   		return 0;
>   
>   	if (vm->pasid) {
> -		r = xa_err(xa_erase_irq(&adev->vm_manager.pasids, vm->pasid));
> +		r = xa_err(xa_erase_irq(&adev->vm_manager.pasids[vm->xcp_id], vm->pasid));
>   		if (r < 0)
>   			return r;
>   
> @@ -154,7 +154,7 @@ int amdgpu_vm_set_pasid(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>   	}
>   
>   	if (pasid) {
> -		r = xa_err(xa_store_irq(&adev->vm_manager.pasids, pasid, vm,
> +		r = xa_err(xa_store_irq(&adev->vm_manager.pasids[vm->xcp_id], pasid, vm,
>   					GFP_KERNEL));
>   		if (r < 0)
>   			return r;
> @@ -2288,14 +2288,14 @@ static void amdgpu_vm_destroy_task_info(struct kref *kref)
>   }
>   
>   static inline struct amdgpu_vm *
> -amdgpu_vm_get_vm_from_pasid(struct amdgpu_device *adev, u32 pasid)
> +amdgpu_vm_get_vm_from_pasid(struct amdgpu_device *adev, u32 pasid, u32 xcp_id)
>   {
>   	struct amdgpu_vm *vm;
>   	unsigned long flags;
>   
> -	xa_lock_irqsave(&adev->vm_manager.pasids, flags);
> -	vm = xa_load(&adev->vm_manager.pasids, pasid);
> -	xa_unlock_irqrestore(&adev->vm_manager.pasids, flags);
> +	xa_lock_irqsave(&adev->vm_manager.pasids[xcp_id], flags);
> +	vm = xa_load(&adev->vm_manager.pasids[xcp_id], pasid);
> +	xa_unlock_irqrestore(&adev->vm_manager.pasids[xcp_id], flags);
>   
>   	return vm;
>   }
> @@ -2343,10 +2343,10 @@ amdgpu_vm_get_task_info_vm(struct amdgpu_vm *vm)
>    * referenced down with amdgpu_vm_put_task_info.
>    */
>   struct amdgpu_task_info *
> -amdgpu_vm_get_task_info_pasid(struct amdgpu_device *adev, u32 pasid)
> +amdgpu_vm_get_task_info_pasid(struct amdgpu_device *adev, u32 pasid, u32 xcp_id)
>   {
>   	return amdgpu_vm_get_task_info_vm(
> -			amdgpu_vm_get_vm_from_pasid(adev, pasid));
> +			amdgpu_vm_get_vm_from_pasid(adev, pasid, xcp_id));
>   }
>   
>   static int amdgpu_vm_create_task_info(struct amdgpu_vm *vm)
> @@ -2481,6 +2481,8 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>   	amdgpu_bo_unreserve(vm->root.bo);
>   	amdgpu_bo_unref(&root_bo);
>   
> +	vm->xcp_id = xcp_id < 0 ? 0 : xcp_id;
> +
>   	return 0;
>   
>   error_free_root:
> @@ -2695,8 +2697,8 @@ void amdgpu_vm_manager_init(struct amdgpu_device *adev)
>   #else
>   	adev->vm_manager.vm_update_mode = 0;
>   #endif
> -
> -	xa_init_flags(&adev->vm_manager.pasids, XA_FLAGS_LOCK_IRQ);
> +	for (i = 0; i < MAX_XCP; i++)
> +		xa_init_flags(&(adev->vm_manager.pasids[i]), XA_FLAGS_LOCK_IRQ);
>   }
>   
>   /**
> @@ -2708,10 +2710,15 @@ void amdgpu_vm_manager_init(struct amdgpu_device *adev)
>    */
>   void amdgpu_vm_manager_fini(struct amdgpu_device *adev)
>   {
> -	WARN_ON(!xa_empty(&adev->vm_manager.pasids));
> -	xa_destroy(&adev->vm_manager.pasids);
> +	int i;
> +
> +	for (i = 0; i < MAX_XCP; i++) {
> +		WARN_ON(!xa_empty(&adev->vm_manager.pasids[i]));
> +		xa_destroy(&adev->vm_manager.pasids[i]);
> +	}
>   
>   	amdgpu_vmid_mgr_fini(adev);
> +
>   }
>   
>   /**
> @@ -2778,17 +2785,18 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
>   	unsigned long irqflags;
>   	uint64_t value, flags;
>   	struct amdgpu_vm *vm;
> -	int r;
> +	int r, xcp_id;
>   
> -	xa_lock_irqsave(&adev->vm_manager.pasids, irqflags);
> -	vm = xa_load(&adev->vm_manager.pasids, pasid);
> +	xcp_id = amdgpu_amdkfd_node_id_to_xcc_id(adev, node_id)/adev->gfx.num_xcc_per_xcp;
> +	xa_lock_irqsave(&adev->vm_manager.pasids[xcp_id], irqflags);
> +	vm = xa_load(&adev->vm_manager.pasids[xcp_id], pasid);
>   	if (vm) {
>   		root = amdgpu_bo_ref(vm->root.bo);
>   		is_compute_context = vm->is_compute_context;
>   	} else {
>   		root = NULL;
>   	}
> -	xa_unlock_irqrestore(&adev->vm_manager.pasids, irqflags);
> +	xa_unlock_irqrestore(&adev->vm_manager.pasids[xcp_id], irqflags);
>   
>   	if (!root)
>   		return false;
> @@ -2806,11 +2814,11 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
>   		goto error_unref;
>   
>   	/* Double check that the VM still exists */
> -	xa_lock_irqsave(&adev->vm_manager.pasids, irqflags);
> -	vm = xa_load(&adev->vm_manager.pasids, pasid);
> +	xa_lock_irqsave(&adev->vm_manager.pasids[xcp_id], irqflags);
> +	vm = xa_load(&adev->vm_manager.pasids[xcp_id], pasid);
>   	if (vm && vm->root.bo != root)
>   		vm = NULL;
> -	xa_unlock_irqrestore(&adev->vm_manager.pasids, irqflags);
> +	xa_unlock_irqrestore(&adev->vm_manager.pasids[xcp_id], irqflags);
>   	if (!vm)
>   		goto error_unlock;
>   
> @@ -2968,14 +2976,15 @@ void amdgpu_vm_update_fault_cache(struct amdgpu_device *adev,
>   				  unsigned int pasid,
>   				  uint64_t addr,
>   				  uint32_t status,
> -				  unsigned int vmhub)
> +				  unsigned int vmhub,
> +				  uint32_t xcp_id)
>   {
>   	struct amdgpu_vm *vm;
>   	unsigned long flags;
>   
> -	xa_lock_irqsave(&adev->vm_manager.pasids, flags);
> +	xa_lock_irqsave(&adev->vm_manager.pasids[xcp_id], flags);
>   
> -	vm = xa_load(&adev->vm_manager.pasids, pasid);
> +	vm = xa_load(&adev->vm_manager.pasids[xcp_id], pasid);
>   	/* Don't update the fault cache if status is 0.  In the multiple
>   	 * fault case, subsequent faults will return a 0 status which is
>   	 * useless for userspace and replaces the useful fault status, so
> @@ -3008,7 +3017,7 @@ void amdgpu_vm_update_fault_cache(struct amdgpu_device *adev,
>   			WARN_ONCE(1, "Invalid vmhub %u\n", vmhub);
>   		}
>   	}
> -	xa_unlock_irqrestore(&adev->vm_manager.pasids, flags);
> +	xa_unlock_irqrestore(&adev->vm_manager.pasids[xcp_id], flags);
>   }
>   
>   /**
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> index 046949c4b695..1499f5f731e9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> @@ -35,6 +35,7 @@
>   #include "amdgpu_sync.h"
>   #include "amdgpu_ring.h"
>   #include "amdgpu_ids.h"
> +#include "amdgpu_xcp.h"
>   
>   struct drm_exec;
>   
> @@ -418,6 +419,9 @@ struct amdgpu_vm {
>   
>   	/* cached fault info */
>   	struct amdgpu_vm_fault_info fault_info;
> +
> +	/* XCP ID */
> +	int xcp_id;
>   };
>   
>   struct amdgpu_vm_manager {
> @@ -456,7 +460,7 @@ struct amdgpu_vm_manager {
>   	/* PASID to VM mapping, will be used in interrupt context to
>   	 * look up VM of a page fault
>   	 */
> -	struct xarray				pasids;
> +	struct xarray				pasids[MAX_XCP];
>   	/* Global registration of recent page fault information */
>   	struct amdgpu_vm_fault_info	fault_info;
>   };
> @@ -550,7 +554,7 @@ bool amdgpu_vm_need_pipeline_sync(struct amdgpu_ring *ring,
>   void amdgpu_vm_check_compute_bug(struct amdgpu_device *adev);
>   
>   struct amdgpu_task_info *
> -amdgpu_vm_get_task_info_pasid(struct amdgpu_device *adev, u32 pasid);
> +amdgpu_vm_get_task_info_pasid(struct amdgpu_device *adev, u32 pasid, u32 xcp_id);
>   
>   struct amdgpu_task_info *
>   amdgpu_vm_get_task_info_vm(struct amdgpu_vm *vm);
> @@ -649,7 +653,8 @@ void amdgpu_vm_update_fault_cache(struct amdgpu_device *adev,
>   				  unsigned int pasid,
>   				  uint64_t addr,
>   				  uint32_t status,
> -				  unsigned int vmhub);
> +				  unsigned int vmhub,
> +				  uint32_t xcp_id);
>   void amdgpu_vm_tlb_fence_create(struct amdgpu_device *adev,
>   				 struct amdgpu_vm *vm,
>   				 struct dma_fence **fence);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index f0ceab3ce5bf..24b042febf5c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -151,7 +151,8 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device *adev,
>   		WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1);
>   
>   		amdgpu_vm_update_fault_cache(adev, entry->pasid, addr, status,
> -					     entry->vmid_src ? AMDGPU_MMHUB0(0) : AMDGPU_GFXHUB(0));
> +					     entry->vmid_src ? AMDGPU_MMHUB0(0) : AMDGPU_GFXHUB(0),
> +					     0);
>   	}
>   
>   	if (!printk_ratelimit())
> @@ -161,7 +162,7 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device *adev,
>   		"[%s] page fault (src_id:%u ring:%u vmid:%u pasid:%u)\n",
>   		entry->vmid_src ? "mmhub" : "gfxhub",
>   		entry->src_id, entry->ring_id, entry->vmid, entry->pasid);
> -	task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid);
> +	task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid, 0);
>   	if (task_info) {
>   		dev_err(adev->dev,
>   			" in process %s pid %d thread %s pid %d\n",
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
> index 2797fd84432b..3507046d33e6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
> @@ -122,7 +122,8 @@ static int gmc_v11_0_process_interrupt(struct amdgpu_device *adev,
>   		WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1);
>   
>   		amdgpu_vm_update_fault_cache(adev, entry->pasid, addr, status,
> -					     entry->vmid_src ? AMDGPU_MMHUB0(0) : AMDGPU_GFXHUB(0));
> +					     entry->vmid_src ? AMDGPU_MMHUB0(0) : AMDGPU_GFXHUB(0),
> +					     0);
>   	}
>   
>   	if (printk_ratelimit()) {
> @@ -132,7 +133,7 @@ static int gmc_v11_0_process_interrupt(struct amdgpu_device *adev,
>   			"[%s] page fault (src_id:%u ring:%u vmid:%u pasid:%u)\n",
>   			entry->vmid_src ? "mmhub" : "gfxhub",
>   			entry->src_id, entry->ring_id, entry->vmid, entry->pasid);
> -		task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid);
> +		task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid, 0);
>   		if (task_info) {
>   			dev_err(adev->dev,
>   				" in process %s pid %d thread %s pid %d)\n",
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c
> index 60acf676000b..9844564c6c74 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c
> @@ -115,7 +115,8 @@ static int gmc_v12_0_process_interrupt(struct amdgpu_device *adev,
>   		WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1);
>   
>   		amdgpu_vm_update_fault_cache(adev, entry->pasid, addr, status,
> -					     entry->vmid_src ? AMDGPU_MMHUB0(0) : AMDGPU_GFXHUB(0));
> +					     entry->vmid_src ? AMDGPU_MMHUB0(0) : AMDGPU_GFXHUB(0),
> +					     0);
>   	}
>   
>   	if (printk_ratelimit()) {
> @@ -125,7 +126,7 @@ static int gmc_v12_0_process_interrupt(struct amdgpu_device *adev,
>   			"[%s] page fault (src_id:%u ring:%u vmid:%u pasid:%u)\n",
>   			entry->vmid_src ? "mmhub" : "gfxhub",
>   			entry->src_id, entry->ring_id, entry->vmid, entry->pasid);
> -		task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid);
> +		task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid, 0);
>   		if (task_info) {
>   			dev_err(adev->dev,
>   				" in process %s pid %d thread %s pid %d)\n",
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
> index 994432fb57ea..2cdb0cbb7c4d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
> @@ -1268,7 +1268,8 @@ static int gmc_v7_0_process_interrupt(struct amdgpu_device *adev,
>   		return 0;
>   
>   	amdgpu_vm_update_fault_cache(adev, entry->pasid,
> -				     ((u64)addr) << AMDGPU_GPU_PAGE_SHIFT, status, AMDGPU_GFXHUB(0));
> +				     ((u64)addr) << AMDGPU_GPU_PAGE_SHIFT, status,
> +				     AMDGPU_GFXHUB(0), 0);
>   
>   	if (amdgpu_vm_fault_stop == AMDGPU_VM_FAULT_STOP_FIRST)
>   		gmc_v7_0_set_fault_enable_default(adev, false);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
> index 86488c052f82..6855caeb7f74 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
> @@ -1437,7 +1437,8 @@ static int gmc_v8_0_process_interrupt(struct amdgpu_device *adev,
>   		return 0;
>   
>   	amdgpu_vm_update_fault_cache(adev, entry->pasid,
> -				     ((u64)addr) << AMDGPU_GPU_PAGE_SHIFT, status, AMDGPU_GFXHUB(0));
> +				     ((u64)addr) << AMDGPU_GPU_PAGE_SHIFT, status,
> +				     AMDGPU_GFXHUB(0), 0);
>   
>   	if (amdgpu_vm_fault_stop == AMDGPU_VM_FAULT_STOP_FIRST)
>   		gmc_v8_0_set_fault_enable_default(adev, false);
> @@ -1448,7 +1449,7 @@ static int gmc_v8_0_process_interrupt(struct amdgpu_device *adev,
>   		dev_err(adev->dev, "GPU fault detected: %d 0x%08x\n",
>   			entry->src_id, entry->src_data[0]);
>   
> -		task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid);
> +		task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid, 0);
>   		if (task_info) {
>   			dev_err(adev->dev, " for process %s pid %d thread %s pid %d\n",
>   				task_info->process_name, task_info->tgid,
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index b73136d390cc..e183e08b2c02 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -556,10 +556,12 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
>   	unsigned int vmhub;
>   	u64 addr;
>   	uint32_t cam_index = 0;
> -	int ret, xcc_id = 0;
> -	uint32_t node_id;
> +	int ret;
> +	uint32_t node_id, xcc_id, xcp_id;
>   
>   	node_id = entry->node_id;
> +	xcc_id = amdgpu_amdkfd_node_id_to_xcc_id(adev, node_id);
> +	xcp_id = xcc_id/adev->gfx.num_xcc_per_xcp;
>   
>   	addr = (u64)entry->src_data[0] << 12;
>   	addr |= ((u64)entry->src_data[1] & 0xf) << 44;
> @@ -572,12 +574,6 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
>   		vmhub = AMDGPU_MMHUB1(0);
>   	} else {
>   		hub_name = "gfxhub0";
> -		if (adev->gfx.funcs->ih_node_to_logical_xcc) {
> -			xcc_id = adev->gfx.funcs->ih_node_to_logical_xcc(adev,
> -				node_id);
> -			if (xcc_id < 0)
> -				xcc_id = 0;
> -		}
>   		vmhub = xcc_id;
>   	}
>   	hub = &adev->vmhub[vmhub];
> @@ -631,7 +627,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
>   		retry_fault ? "retry" : "no-retry",
>   		entry->src_id, entry->ring_id, entry->vmid, entry->pasid);
>   
> -	task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid);
> +	task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid, xcp_id);
>   	if (task_info) {
>   		dev_err(adev->dev,
>   			" for process %s pid %d thread %s pid %d)\n",
> @@ -675,7 +671,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
>   	if (!amdgpu_sriov_vf(adev))
>   		WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1);
>   
> -	amdgpu_vm_update_fault_cache(adev, entry->pasid, addr, status, vmhub);
> +	amdgpu_vm_update_fault_cache(adev, entry->pasid, addr, status, vmhub, xcp_id);
>   
>   	dev_err(adev->dev,
>   		"VM_L2_PROTECTION_FAULT_STATUS:0x%08X\n",
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
> index 23ef4eb36b40..1ac4224bbe5b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
> @@ -2182,7 +2182,7 @@ static int sdma_v4_0_print_iv_entry(struct amdgpu_device *adev,
>   			   instance, addr, entry->src_id, entry->ring_id, entry->vmid,
>   			   entry->pasid);
>   
> -	task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid);
> +	task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid, 0);
>   	if (task_info) {
>   		dev_dbg_ratelimited(adev->dev,
>   				    " for process %s pid %d thread %s pid %d\n",
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
> index 57f16c09abfc..c8b5c0302ca7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
> @@ -1683,6 +1683,8 @@ static int sdma_v4_4_2_print_iv_entry(struct amdgpu_device *adev,
>   	int instance;
>   	struct amdgpu_task_info *task_info;
>   	u64 addr;
> +	uint32_t xcc_id = amdgpu_amdkfd_node_id_to_xcc_id(adev, entry->node_id);
> +	uint32_t xcp_id = xcc_id/adev->gfx.num_xcc_per_xcp;
>   
>   	instance = sdma_v4_4_2_irq_id_to_seq(adev, entry->client_id);
>   	if (instance < 0 || instance >= adev->sdma.num_instances) {
> @@ -1698,7 +1700,7 @@ static int sdma_v4_4_2_print_iv_entry(struct amdgpu_device *adev,
>   			    instance, addr, entry->src_id, entry->ring_id, entry->vmid,
>   			    entry->pasid);
>   
> -	task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid);
> +	task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid, xcp_id);
>   	if (task_info) {
>   		dev_dbg_ratelimited(adev->dev, " for process %s pid %d thread %s pid %d\n",
>   				    task_info->process_name, task_info->tgid,
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
> index ea3792249209..c098fbaf0e1c 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
> @@ -1262,8 +1262,9 @@ void kfd_signal_reset_event(struct kfd_node *dev)
>   
>   		if (dev->dqm->detect_hang_count) {
>   			struct amdgpu_task_info *ti;
> +			uint32_t xcp_id = dev->xcp ? dev->xcp->id : 0;
>   
> -			ti = amdgpu_vm_get_task_info_pasid(dev->adev, p->pasid);
> +			ti = amdgpu_vm_get_task_info_pasid(dev->adev, p->pasid, xcp_id);
>   			if (ti) {
>   				dev_err(dev->adev->dev,
>   					"Queues reset on process %s tid %d thread %s pid %d\n",
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
> index 8e0d0356e810..d7cbf9525698 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
> @@ -377,12 +377,8 @@ static void event_interrupt_wq_v10(struct kfd_node *dev,
>   		struct kfd_hsa_memory_exception_data exception_data;
>   
>   		/* gfxhub */
> -		if (!vmid_type && dev->adev->gfx.funcs->ih_node_to_logical_xcc) {
> -			hub_inst = dev->adev->gfx.funcs->ih_node_to_logical_xcc(dev->adev,
> -				node_id);
> -			if (hub_inst < 0)
> -				hub_inst = 0;
> -		}
> +		if (!vmid_type)
> +			hub_inst = amdgpu_amdkfd_node_id_to_xcc_id(dev->adev, node_id);
>   
>   		/* mmhub */
>   		if (vmid_type && client_id == SOC15_IH_CLIENTID_VMC)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> index a9c3580be8c9..4708b8c811a5 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> @@ -437,12 +437,8 @@ static void event_interrupt_wq_v9(struct kfd_node *dev,
>   		struct kfd_hsa_memory_exception_data exception_data;
>   
>   		/* gfxhub */
> -		if (!vmid_type && dev->adev->gfx.funcs->ih_node_to_logical_xcc) {
> -			hub_inst = dev->adev->gfx.funcs->ih_node_to_logical_xcc(dev->adev,
> -				node_id);
> -			if (hub_inst < 0)
> -				hub_inst = 0;
> -		}
> +		if (!vmid_type)
> +			hub_inst = amdgpu_amdkfd_node_id_to_xcc_id(dev->adev, node_id);
>   
>   		/* mmhub */
>   		if (vmid_type && client_id == SOC15_IH_CLIENTID_VMC)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> index ea6a8e43bd5b..b5f2f5b1069c 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> @@ -251,8 +251,9 @@ void kfd_smi_event_update_thermal_throttling(struct kfd_node *dev,
>   void kfd_smi_event_update_vmfault(struct kfd_node *dev, uint16_t pasid)
>   {
>   	struct amdgpu_task_info *task_info;
> +	uint32_t xcp_id = dev->xcp ? dev->xcp->id : 0;
>   
> -	task_info = amdgpu_vm_get_task_info_pasid(dev->adev, pasid);
> +	task_info = amdgpu_vm_get_task_info_pasid(dev->adev, pasid, xcp_id);
>   	if (task_info) {
>   		/* Report VM faults from user applications, not retry from kernel */
>   		if (task_info->pid)



More information about the amd-gfx mailing list