[PATCH] drm/amdkfd: Fix some kfd related recover issues
Lazar, Lijo
lijo.lazar at amd.com
Fri Mar 21 11:06:06 UTC 2025
On 3/21/2025 4:22 PM, Emily Deng wrote:
> kq_acquire_packet_buffer needs to check whether the kq has been initialized
> correctly; otherwise it will hit memory corruption during recovery, because
> recovery uninitializes the kq first.
>
> The TLB also needs to be flushed after a successful recovery, as BOs may have
> been created and mapped during recovery.
Is this related to any specific type of 'reset'? For mode-1/mode-2 type
resets, the expectation is that GC as a whole is reset, which includes the
GPU VM block.
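
If the flush really is only needed when the VM block survives the reset, a
purely illustrative sketch of how it could be scoped in kgd2kfd_post_reset()
is below (not part of the patch; it assumes amdgpu_asic_reset_method() and
the AMD_RESET_METHOD_* values from the amdgpu driver, plus the
kfd_flush_all_processes() helper added here):

	enum amd_reset_method method = amdgpu_asic_reset_method(kfd->adev);

	for (i = 0; i < kfd->num_nodes; i++) {
		/* Illustrative only: skip the per-process flush when GC
		 * (including the VM block) was fully reset by a mode-1 or
		 * mode-2 reset.
		 */
		if (method != AMD_RESET_METHOD_MODE1 &&
		    method != AMD_RESET_METHOD_MODE2)
			kfd_flush_all_processes(kfd->nodes[i]);
		ret = kfd_resume(kfd->nodes[i]);
		if (ret)
			return ret;
	}
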
Thanks,
Lijo
>
> Signed-off-by: Emily Deng <Emily.Deng at amd.com>
> ---
> drivers/gpu/drm/amd/amdkfd/kfd_device.c | 1 +
> drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 4 ++++
> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 +-
> drivers/gpu/drm/amd/amdkfd/kfd_process.c | 22 +++++++++++++++++++
> 4 files changed, 28 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index b9c82be6ce13..eb2df5842618 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -1000,6 +1000,7 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
> return 0;
>
> for (i = 0; i < kfd->num_nodes; i++) {
> + kfd_flush_all_processes(kfd->nodes[i]);
> ret = kfd_resume(kfd->nodes[i]);
> if (ret)
> return ret;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> index 2b0a830f5b29..5e4ae969818e 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> @@ -238,6 +238,10 @@ int kq_acquire_packet_buffer(struct kernel_queue *kq,
> uint64_t wptr64;
> unsigned int *queue_address;
>
> + if (!kq) {
> + pr_debug("kq has not been initialized\n");
> + goto err_no_space;
> + }
> /* When rptr == wptr, the buffer is empty.
> * When rptr == wptr + 1, the buffer is full.
> * It is always rptr that advances to the position of wptr, rather than
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index f6aedf69c644..6c073ead2b06 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -1059,7 +1059,7 @@ int kfd_process_evict_queues(struct kfd_process *p, uint32_t trigger);
> int kfd_process_restore_queues(struct kfd_process *p);
> void kfd_suspend_all_processes(void);
> int kfd_resume_all_processes(void);
> -
> +void kfd_flush_all_processes(struct kfd_node *node);
> struct kfd_process_device *kfd_process_device_data_by_id(struct kfd_process *process,
> uint32_t gpu_id);
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index 7c0c24732481..4ed03359020b 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -2110,6 +2110,28 @@ int kfd_resume_all_processes(void)
> return ret;
> }
>
> +void kfd_flush_all_processes(struct kfd_node *node)
> +{
> + struct kfd_process *p;
> + struct kfd_process_device *pdd;
> + unsigned int temp;
> + int idx = srcu_read_lock(&kfd_processes_srcu);
> + struct amdgpu_vm *vm;
> +
> + hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
> + pdd = kfd_get_process_device_data(node, p);
> + if (!pdd)
> + continue;
> + vm = drm_priv_to_vm(pdd->drm_priv);
> + if (!vm)
> + continue;
> + atomic64_inc(&vm->tlb_seq);
> + kfd_flush_tlb(pdd, TLB_FLUSH_LEGACY);
> + }
> + srcu_read_unlock(&kfd_processes_srcu, idx);
> +
> +}
> +
> int kfd_reserved_mem_mmap(struct kfd_node *dev, struct kfd_process *process,
> struct vm_area_struct *vma)
> {