[PATCH 3/3] drm/amdkfd: reset queue which consumes RAS poison
Felix Kuehling
felix.kuehling at amd.com
Fri Dec 10 23:30:42 UTC 2021
On 2021-12-10 6:15 a.m., Tao Zhou wrote:
> CP supports unmapping queues with a reset mode that destroys only the specified queue without affecting others.
> Replacing the whole GPU reset with this queue reset mode for RAS poison consumption
> saves a lot of time, and we can still fall back to the GPU reset solution if the
> queue reset fails.
>
> Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 6 ++---
> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 3 ++-
> .../drm/amd/amdkfd/kfd_device_queue_manager.c | 14 ++++++++++
> .../drm/amd/amdkfd/kfd_device_queue_manager.h | 1 +
> .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 27 ++++++++++++++++---
> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 ++
> 6 files changed, 45 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> index 46cf48b3904a..0bf09a94d944 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> @@ -721,13 +721,13 @@ bool amdgpu_amdkfd_have_atomics_support(struct amdgpu_device *adev)
> return adev->have_atomics_support;
> }
>
> -void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev)
> +void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, bool reset)
> {
> struct ras_err_data err_data = {0, 0, 0, NULL};
>
> /* CPU MCA will handle page retirement if connected_to_cpu is 1 */
> if (!adev->gmc.xgmi.connected_to_cpu)
> - amdgpu_umc_process_ras_data_cb(adev, &err_data, NULL);
> - else
> + amdgpu_umc_do_page_retirement(adev, &err_data, NULL, reset);
> + else if (reset)
> amdgpu_amdkfd_gpu_reset(adev);
> }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index fcbc8a9c9e06..61f899e54fd5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -296,7 +296,8 @@ int amdgpu_amdkfd_gpuvm_import_dmabuf(struct amdgpu_device *adev,
> uint64_t *mmap_offset);
> int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev,
> struct tile_config *config);
> -void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev);
> +void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
> + bool reset);
> #if IS_ENABLED(CONFIG_HSA_AMD)
> void amdgpu_amdkfd_gpuvm_init_mem_limits(void);
> void amdgpu_amdkfd_gpuvm_destroy_cb(struct amdgpu_device *adev,
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 01a2cc3928ac..095b2e0822aa 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -1476,6 +1476,20 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
> return retval;
> }
>
> +int unmap_queues_cpsch_poison(struct device_queue_manager *dqm, uint32_t pasid)
> +{
> + int ret;
> +
> + dqm_lock(dqm);
> +
> + ret = unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_BY_PASID,
> + pasid, true);
> +
> + dqm_unlock(dqm);
> +
> + return ret;
> +}
> +
> /* dqm->lock mutex has to be locked before calling this function */
> static int execute_queues_cpsch(struct device_queue_manager *dqm,
> enum kfd_unmap_queues_filter filter,
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> index 499fc0ea387f..c52869133159 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> @@ -219,6 +219,7 @@ unsigned int get_queues_per_pipe(struct device_queue_manager *dqm);
> unsigned int get_pipes_per_mec(struct device_queue_manager *dqm);
> unsigned int get_num_sdma_queues(struct device_queue_manager *dqm);
> unsigned int get_num_xgmi_sdma_queues(struct device_queue_manager *dqm);
> +int unmap_queues_cpsch_poison(struct device_queue_manager *dqm, uint32_t pasid);
>
> static inline unsigned int get_sh_mem_bases_32(struct kfd_process_device *pdd)
> {
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> index deb64168c9e8..2863bb9e5bca 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> @@ -89,6 +89,27 @@ enum SQ_INTERRUPT_ERROR_TYPE {
> #define KFD_SQ_INT_DATA__ERR_TYPE_MASK 0xF00000
> #define KFD_SQ_INT_DATA__ERR_TYPE__SHIFT 20
>
> +static void event_interrupt_poison_consumption(struct kfd_dev *dev,
> + uint16_t pasid)
> +{
> + int ret;
> + struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
> +
> + /* all queues of a process will be unmapped in one time */
> + if (p && atomic_read(&p->poison))
> + return;
> +
> + atomic_set(&p->poison, 1);
You're not checking p != NULL here, so a failed PASID lookup ends up dereferencing a NULL pointer in atomic_set().
You also need to release the process refcount before this function
returns. Otherwise the process resources will be leaked. You can see
leaked processes in /sys/class/kfd/kfd/proc. That directory should be
empty after all KFD processes have terminated.
Other than that, the series is
Acked-by: Felix Kuehling <Felix.Kuehling at amd.com>
> + ret = unmap_queues_cpsch_poison(dev->dqm, pasid);
> + kfd_signal_poison_consumed_event(dev, pasid);
> + /* resetting queue passes, do page retirement without gpu reset
> + resetting queue fails, fallback to gpu reset solution */
> + if (!ret)
> + amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, false);
> + else
> + amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, true);
> +}
> +
> static bool event_interrupt_isr_v9(struct kfd_dev *dev,
> const uint32_t *ih_ring_entry,
> uint32_t *patched_ihre,
> @@ -230,8 +251,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
> sq_intr_err);
> if (sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST &&
> sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_MEMVIOL) {
> - kfd_signal_poison_consumed_event(dev, pasid);
> - amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev);
> + event_interrupt_poison_consumption(dev, pasid);
> return;
> }
> break;
> @@ -252,8 +272,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
> if (source_id == SOC15_INTSRC_SDMA_TRAP) {
> kfd_signal_event_interrupt(pasid, context_id0 & 0xfffffff, 28);
> } else if (source_id == SOC15_INTSRC_SDMA_ECC) {
> - kfd_signal_poison_consumed_event(dev, pasid);
> - amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev);
> + event_interrupt_poison_consumption(dev, pasid);
> return;
> }
> } else if (client_id == SOC15_IH_CLIENTID_VMC ||
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 0c3f911e3bf4..ea68f3b3a4e9 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -856,6 +856,8 @@ struct kfd_process {
> struct svm_range_list svms;
>
> bool xnack_enabled;
> +
> + atomic_t poison;
> };
>
> #define KFD_PROCESS_TABLE_SIZE 5 /* bits: 32 entries */
More information about the amd-gfx
mailing list