[PATCH] drm/amdkfd: Fix some kfd related recover issues

Fri Mar 21 11:06:06 UTC 2025

On 3/21/2025 4:22 PM, Emily Deng wrote:
> kq_acquire_packet_buffer needs to check whether kq has been initialized correctly.
> Otherwise it will hit memory corruption during recovery, because recovery
> uninitializes kq first.
> 
> The TLB needs to be flushed after a successful recovery, as BOs may have been
> created and mapped during recovery.

Is this related to any specific type of 'reset'? For mode-2/mode-1 type
resets, the expectation is that the GC as a whole is reset, which includes the GPU VM block.

Thanks,
Lijo

> 
> Signed-off-by: Emily Deng <Emily.Deng at amd.com>
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_device.c       |  1 +
>  drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c |  4 ++++
>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  2 +-
>  drivers/gpu/drm/amd/amdkfd/kfd_process.c      | 22 +++++++++++++++++++
>  4 files changed, 28 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index b9c82be6ce13..eb2df5842618 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -1000,6 +1000,7 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
>  		return 0;
>  
>  	for (i = 0; i < kfd->num_nodes; i++) {
> +		kfd_flush_all_processes(kfd->nodes[i]);
>  		ret = kfd_resume(kfd->nodes[i]);
>  		if (ret)
>  			return ret;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> index 2b0a830f5b29..5e4ae969818e 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> @@ -238,6 +238,10 @@ int kq_acquire_packet_buffer(struct kernel_queue *kq,
>  	uint64_t wptr64;
>  	unsigned int *queue_address;
>  
> +	if (!kq) {
> +		pr_debug("kq has not been initialized\n");
> +		goto err_no_space;
> +	}
>  	/* When rptr == wptr, the buffer is empty.
>  	 * When rptr == wptr + 1, the buffer is full.
>  	 * It is always rptr that advances to the position of wptr, rather than
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index f6aedf69c644..6c073ead2b06 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -1059,7 +1059,7 @@ int kfd_process_evict_queues(struct kfd_process *p, uint32_t trigger);
>  int kfd_process_restore_queues(struct kfd_process *p);
>  void kfd_suspend_all_processes(void);
>  int kfd_resume_all_processes(void);
> -
> +void kfd_flush_all_processes(struct kfd_node *node);
>  struct kfd_process_device *kfd_process_device_data_by_id(struct kfd_process *process,
>  							 uint32_t gpu_id);
>  
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index 7c0c24732481..4ed03359020b 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -2110,6 +2110,28 @@ int kfd_resume_all_processes(void)
>  	return ret;
>  }
>  
> +void kfd_flush_all_processes(struct kfd_node *node)
> +{
> +	struct kfd_process *p;
> +	struct kfd_process_device *pdd;
> +	unsigned int temp;
> +	int idx = srcu_read_lock(&kfd_processes_srcu);
> +	struct amdgpu_vm *vm;
> +
> +	hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
> +		pdd = kfd_get_process_device_data(node, p);
> +		if (!pdd)
> +			continue;
> +		vm = drm_priv_to_vm(pdd->drm_priv);
> +		if (!vm)
> +			continue;
> +		atomic64_inc(&vm->tlb_seq);
> +		kfd_flush_tlb(pdd, TLB_FLUSH_LEGACY);
> +	}
> +	srcu_read_unlock(&kfd_processes_srcu, idx);
> +
> +}
> +
>  int kfd_reserved_mem_mmap(struct kfd_node *dev, struct kfd_process *process,
>  			  struct vm_area_struct *vma)
>  {