[PATCH] drm/amdkfd: change to return status when flush tlb
Felix Kuehling
felix.kuehling at amd.com
Mon Jul 6 21:54:51 UTC 2020
Am 2020-07-06 um 5:39 a.m. schrieb Dennis Li:
> If the GPU hangs, the driver will fail to flush the TLB; return the hang
> error to callers, so that callers have a chance to handle the error.
Usually, when a function returns an error, you expect that the function
didn't do what you asked it to do. But in your case, the function
actually did the work, and only failed on the final step, flushing TLBs.
How is the caller expected to detect that case and handle it?
If the cause of the problem is a GPU reset, the caller really has a
bigger problem. The application will get informed of the GPU hang
through the events interface, and most likely terminate shortly after.
Failure to flush TLBs could be an issue if the result is that the GPU
continues accessing memory pointed to by a TLB. If the corresponding
memory is released, that would lead to memory corruption. Note that this
is a problem that user mode should not need to deal with. This is memory
protection that the kernel is expected to enforce. However, if the GPU
is hanging, that's not really a concern. If, OTOH, the state of the GPU
is unknown, it may be best to wait until a pending reset is complete,
after which you can consider the TLB flushed.
Regards,
Felix
>
> Signed-off-by: Dennis Li <dennis.li at amd.com>
> Change-Id: Ie305ad0a77675f6eab7d5b8f68e279b7f4e7a8b9
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index e9b96ad3d9a5..18e243183b5e 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -1488,7 +1488,7 @@ static int kfd_ioctl_map_memory_to_gpu(struct file *filep,
> peer_pdd = kfd_get_process_device_data(peer, p);
> if (WARN_ON_ONCE(!peer_pdd))
> continue;
> - kfd_flush_tlb(peer_pdd);
> + err = kfd_flush_tlb(peer_pdd);
> }
>
> kfree(devices_arr);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 21eb0998c4ae..d636cbf7d32f 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -263,6 +263,7 @@ static int allocate_vmid(struct device_queue_manager *dqm,
> struct queue *q)
> {
> int allocated_vmid = -1, i;
> + int ret = 0;
>
> for (i = dqm->dev->vm_info.first_vmid_kfd;
> i <= dqm->dev->vm_info.last_vmid_kfd; i++) {
> @@ -295,13 +296,26 @@ static int allocate_vmid(struct device_queue_manager *dqm,
> qpd->vmid,
> qpd->page_table_base);
> /* invalidate the VM context after pasid and vmid mapping is set up */
> - kfd_flush_tlb(qpd_to_pdd(qpd));
> + ret = kfd_flush_tlb(qpd_to_pdd(qpd));
> + if (ret) {
> + pr_err("Failed to flush tlb\n");
> + goto pro_failed;
> + }
>
> if (dqm->dev->kfd2kgd->set_scratch_backing_va)
> dqm->dev->kfd2kgd->set_scratch_backing_va(dqm->dev->kgd,
> qpd->sh_hidden_private_base, qpd->vmid);
>
> return 0;
> +
> +pro_failed:
> + /* Release the vmid mapping */
> + set_pasid_vmid_mapping(dqm, 0, qpd->vmid);
> + dqm->vmid_pasid[qpd->vmid] = 0;
> +
> + qpd->vmid = 0;
> + q->properties.vmid = 0;
> + return ret;
> }
>
> static int flush_texture_cache_nocpsch(struct kfd_dev *kdev,
> @@ -326,12 +340,17 @@ static void deallocate_vmid(struct device_queue_manager *dqm,
> struct qcm_process_device *qpd,
> struct queue *q)
> {
> + int ret = 0;
> +
> /* On GFX v7, CP doesn't flush TC at dequeue */
> if (q->device->device_info->asic_family == CHIP_HAWAII)
> if (flush_texture_cache_nocpsch(q->device, qpd))
> pr_err("Failed to flush TC\n");
>
> - kfd_flush_tlb(qpd_to_pdd(qpd));
> + ret = kfd_flush_tlb(qpd_to_pdd(qpd));
> + if (ret) {
> + pr_err("Failed to flush tlb\n");
> + }
>
> /* Release the vmid mapping */
> set_pasid_vmid_mapping(dqm, 0, qpd->vmid);
> @@ -795,7 +814,9 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
> dqm->dev->kgd,
> qpd->vmid,
> qpd->page_table_base);
> - kfd_flush_tlb(pdd);
> + ret = kfd_flush_tlb(pdd);
> + if (ret)
> + goto out;
> }
>
> /* Take a safe reference to the mm_struct, which may otherwise
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 51ba2020732e..31ea72946d06 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -1081,7 +1081,7 @@ void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid,
>
> void kfd_signal_reset_event(struct kfd_dev *dev);
>
> -void kfd_flush_tlb(struct kfd_process_device *pdd);
> +int kfd_flush_tlb(struct kfd_process_device *pdd);
>
> int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p);
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index 8616a204e4c3..3919cc88813c 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -1444,21 +1444,24 @@ int kfd_reserved_mem_mmap(struct kfd_dev *dev, struct kfd_process *process,
> KFD_CWSR_TBA_TMA_SIZE, vma->vm_page_prot);
> }
>
> -void kfd_flush_tlb(struct kfd_process_device *pdd)
> +int kfd_flush_tlb(struct kfd_process_device *pdd)
> {
> struct kfd_dev *dev = pdd->dev;
> + int ret = 0;
>
> if (dev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) {
> /* Nothing to flush until a VMID is assigned, which
> * only happens when the first queue is created.
> */
> if (pdd->qpd.vmid)
> - amdgpu_amdkfd_flush_gpu_tlb_vmid(dev->kgd,
> + ret = amdgpu_amdkfd_flush_gpu_tlb_vmid(dev->kgd,
> pdd->qpd.vmid);
> } else {
> - amdgpu_amdkfd_flush_gpu_tlb_pasid(dev->kgd,
> + ret = amdgpu_amdkfd_flush_gpu_tlb_pasid(dev->kgd,
> pdd->process->pasid);
> }
> +
> + return ret;
> }
>
> #if defined(CONFIG_DEBUG_FS)
More information about the amd-gfx
mailing list