[PATCH 1/2] drm/amdkfd: support per-queue reset on gfx9

Wed Jul 24 21:15:56 UTC 2024

On Wed, Jul 24, 2024 at 4:34 PM Felix Kuehling <felix.kuehling at amd.com> wrote:
>
>
> On 2024-07-18 13:56, Jonathan Kim wrote:
> > Support per-queue reset for GFX9.  The recommendation is for the driver
> > to target reset the HW queue via a SPI MMIO register write.
> >
> > Since this requires pipe and HW queue info and MEC FW is limited to
> > doorbell reports of hung queues after an unmap failure, scan the HW
> > queue slots defined by SET_RESOURCES first to identify the user queue
> > candidates to reset.
> >
> > Only signal reset events to processes that have had a queue reset.
> >
> > If queue reset fails, fall back to GPU reset.
> >
> > Signed-off-by: Jonathan Kim <jonathan.kim at amd.com>
> > ---
> >   .../drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c  |   1 +
> >   .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c   |   3 +-
> >   .../drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c   |   3 +-
> >   .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c    |   9 +
> >   .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h    |   6 +
> >   .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c  |   3 +-
> >   .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c    |  11 +-
> >   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c |  56 +++++
> >   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h |   6 +
> >   .../drm/amd/amdkfd/kfd_device_queue_manager.c | 195 ++++++++++++++++--
> >   .../drm/amd/amdkfd/kfd_device_queue_manager.h |  12 ++
> >   drivers/gpu/drm/amd/amdkfd/kfd_events.c       |  15 ++
> >   .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c   |   6 +-
> >   drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |   4 +
> >   drivers/gpu/drm/amd/amdkfd/kfd_process.c      |   2 +
> >   .../gpu/drm/amd/include/kgd_kfd_interface.h   |   4 +
> >   16 files changed, 310 insertions(+), 26 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
> > index aff08321e976..1dc601e4518a 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
> > @@ -191,4 +191,5 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = {
> >       .get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
> >       .build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
> >       .program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
> > +     .hqd_detect_and_reset = kgd_gfx_v9_hqd_detect_and_reset
> >   };
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> > index 3a3f3ce09f00..534975c722df 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> > @@ -418,5 +418,6 @@ const struct kfd2kgd_calls arcturus_kfd2kgd = {
> >       .get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
> >       .build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
> >       .get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
> > -     .program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings
> > +     .program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
> > +     .hqd_detect_and_reset = kgd_gfx_v9_hqd_detect_and_reset
> >   };
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c
> > index a5c7259cf2a3..b53c1cfa34de 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c
> > @@ -541,5 +541,6 @@ const struct kfd2kgd_calls gc_9_4_3_kfd2kgd = {
> >                       kgd_gfx_v9_4_3_set_wave_launch_trap_override,
> >       .set_wave_launch_mode = kgd_aldebaran_set_wave_launch_mode,
> >       .set_address_watch = kgd_gfx_v9_4_3_set_address_watch,
> > -     .clear_address_watch = kgd_gfx_v9_4_3_clear_address_watch
> > +     .clear_address_watch = kgd_gfx_v9_4_3_clear_address_watch,
> > +     .hqd_detect_and_reset = kgd_gfx_v9_hqd_detect_and_reset
> >   };
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> > index 3ab6c3aa0ad1..dd449a0caba8 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> > @@ -1070,6 +1070,14 @@ static void program_trap_handler_settings(struct amdgpu_device *adev,
> >       unlock_srbm(adev);
> >   }
> >
> > +uint64_t kgd_gfx_v10_hqd_detect_and_reset(struct amdgpu_device *adev,
> > +                                       uint32_t pipe_id, uint32_t queue_id,
> > +                                       uint32_t inst, unsigned int utimeout,
> > +                                       bool detect_only)
> > +{
> > +     return 0;
> > +}
> > +
> >   const struct kfd2kgd_calls gfx_v10_kfd2kgd = {
> >       .program_sh_mem_settings = kgd_program_sh_mem_settings,
> >       .set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping,
> > @@ -1097,4 +1105,5 @@ const struct kfd2kgd_calls gfx_v10_kfd2kgd = {
> >       .get_iq_wait_times = kgd_gfx_v10_get_iq_wait_times,
> >       .build_grace_period_packet_info = kgd_gfx_v10_build_grace_period_packet_info,
> >       .program_trap_handler_settings = program_trap_handler_settings,
> > +     .hqd_detect_and_reset = kgd_gfx_v10_hqd_detect_and_reset
> >   };
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
> > index 67bcaa3d4226..9eeda8808a3c 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
> > @@ -56,3 +56,9 @@ void kgd_gfx_v10_build_grace_period_packet_info(struct amdgpu_device *adev,
> >                                              uint32_t grace_period,
> >                                              uint32_t *reg_offset,
> >                                              uint32_t *reg_data);
> > +uint64_t kgd_gfx_v10_hqd_detect_and_reset(struct amdgpu_device *adev,
> > +                                       uint32_t pipe_id,
> > +                                       uint32_t queue_id,
> > +                                       uint32_t inst,
> > +                                       unsigned int utimeout,
> > +                                       bool detect_only);
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
> > index 8c8437a4383f..9872bca4156b 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
> > @@ -680,5 +680,6 @@ const struct kfd2kgd_calls gfx_v10_3_kfd2kgd = {
> >       .set_wave_launch_trap_override = kgd_gfx_v10_set_wave_launch_trap_override,
> >       .set_wave_launch_mode = kgd_gfx_v10_set_wave_launch_mode,
> >       .set_address_watch = kgd_gfx_v10_set_address_watch,
> > -     .clear_address_watch = kgd_gfx_v10_clear_address_watch
> > +     .clear_address_watch = kgd_gfx_v10_clear_address_watch,
> > +     .hqd_detect_and_reset = kgd_gfx_v10_hqd_detect_and_reset
> >   };
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c
> > index b61a32d6af4b..483937c571ab 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c
> > @@ -786,6 +786,14 @@ static uint32_t kgd_gfx_v11_clear_address_watch(struct amdgpu_device *adev,
> >       return 0;
> >   }
> >
> > +static uint64_t kgd_gfx_v11_hqd_detect_and_reset(struct amdgpu_device *adev,
> > +                                              uint32_t pipe_id, uint32_t queue_id,
> > +                                              uint32_t inst, unsigned int utimeout,
> > +                                              bool detect_only)
> > +{
> > +     return 0;
> > +}
> > +
> >   const struct kfd2kgd_calls gfx_v11_kfd2kgd = {
> >       .program_sh_mem_settings = program_sh_mem_settings_v11,
> >       .set_pasid_vmid_mapping = set_pasid_vmid_mapping_v11,
> > @@ -808,5 +816,6 @@ const struct kfd2kgd_calls gfx_v11_kfd2kgd = {
> >       .set_wave_launch_trap_override = kgd_gfx_v11_set_wave_launch_trap_override,
> >       .set_wave_launch_mode = kgd_gfx_v11_set_wave_launch_mode,
> >       .set_address_watch = kgd_gfx_v11_set_address_watch,
> > -     .clear_address_watch = kgd_gfx_v11_clear_address_watch
> > +     .clear_address_watch = kgd_gfx_v11_clear_address_watch,
> > +     .hqd_detect_and_reset = kgd_gfx_v11_hqd_detect_and_reset
> >   };
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> > index 5a35a8ca8922..9428e98ac12b 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> > @@ -1144,6 +1144,61 @@ void kgd_gfx_v9_program_trap_handler_settings(struct amdgpu_device *adev,
> >       kgd_gfx_v9_unlock_srbm(adev, inst);
> >   }
> >
> > +uint64_t kgd_gfx_v9_hqd_detect_and_reset(struct amdgpu_device *adev,
> > +                                      uint32_t pipe_id, uint32_t queue_id,
> > +                                      uint32_t inst, unsigned int utimeout,
> > +                                      bool detect_only)
>
> This is a bit misleading. This function doesn't detect anything. It just
> returns a queue address and optionally resets the queue. And the way
> it's called is like "detect and reset, but don't reset", except that
> last part is not obvious if you don't know what the last parameter
> means. It would make the code easier to read if you split this into two
> functions: hqd_get_pq_addr and hqd_reset.
>
>
> > +{
> > +     uint32_t low, high, temp;
> > +     unsigned long end_jiffies;
> > +     uint64_t queue_addr = 0;
> > +
> > +     kgd_gfx_v9_acquire_queue(adev, pipe_id, queue_id, inst);
> > +     if (!RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_ACTIVE))
> > +             goto unlock_out;
> > +
> > +     low = RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_PQ_BASE);
> > +     high = RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_PQ_BASE_HI);
>
> You may need to disable GFXOFF before accessing these registers. Maybe
> you're assuming that the GPU is not idle when you're getting here, and
> maybe that's a good assumption, so I'm not sure.

I think you should wrap any reset MMIO in
amdgpu_gfx_rlc_enter_safe_mode() and amdgpu_gfx_rlc_exit_safe_mode().

Alex

>
>
> > +
> > +     /* only concerned with user queues. */
> > +     if (!high)
> > +             goto unlock_out;
> > +
> > +     queue_addr = (((queue_addr | high) << 32) | low) << 8;
> > +
> > +     if (detect_only)
> > +             goto unlock_out;
> > +
> > +     pr_debug("Attempting queue reset on XCC %i pipe id %i queue id %i\n",
> > +              inst, pipe_id, queue_id);
> > +
> > +     /* assume previous dequeue request issued will take affect after reset */
> > +     WREG32_SOC15(GC, GET_INST(GC, inst), mmSPI_COMPUTE_QUEUE_RESET, 0x1);
> > +
> > +     end_jiffies = (utimeout * HZ / 1000) + jiffies;
> > +     while (true) {
> > +             temp = RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_ACTIVE);
> > +
> > +             if (!(temp & CP_HQD_ACTIVE__ACTIVE_MASK))
> > +                     break;
> > +
> > +             if (time_after(jiffies, end_jiffies)) {
> > +                     queue_addr = 0;
> > +                     break;
> > +             }
> > +
> > +             usleep_range(500, 1000);
> > +     }
> > +
> > +unlock_out:
> > +     if (!detect_only)
> > +             pr_debug("queue reset on XCC %i pipe id %i queue id %i %s\n",
> > +                      inst, pipe_id, queue_id, !!queue_addr ? "succeeded!" : "failed!");
> > +     kgd_gfx_v9_release_queue(adev, inst);
> > +
> > +     return queue_addr;
> > +}
> > +
> >   const struct kfd2kgd_calls gfx_v9_kfd2kgd = {
> >       .program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
> >       .set_pasid_vmid_mapping = kgd_gfx_v9_set_pasid_vmid_mapping,
> > @@ -1172,4 +1227,5 @@ const struct kfd2kgd_calls gfx_v9_kfd2kgd = {
> >       .build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
> >       .get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
> >       .program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
> > +     .hqd_detect_and_reset = kgd_gfx_v9_hqd_detect_and_reset
> >   };
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
> > index ce424615f59b..cf70d8fc3d10 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
> > @@ -101,3 +101,9 @@ void kgd_gfx_v9_build_grace_period_packet_info(struct amdgpu_device *adev,
> >                                              uint32_t grace_period,
> >                                              uint32_t *reg_offset,
> >                                              uint32_t *reg_data);
> > +uint64_t kgd_gfx_v9_hqd_detect_and_reset(struct amdgpu_device *adev,
> > +                                      uint32_t pipe_id,
> > +                                      uint32_t queue_id,
> > +                                      uint32_t inst,
> > +                                      unsigned int utimeout,
> > +                                      bool detect_only);
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> > index 4f48507418d2..037b75a64e66 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> > @@ -153,6 +153,19 @@ void program_sh_mem_settings(struct device_queue_manager *dqm,
> >
> >   static void kfd_hws_hang(struct device_queue_manager *dqm)
> >   {
> > +     struct device_process_node *cur;
> > +     struct qcm_process_device *qpd;
> > +     struct queue *q;
> > +
> > +     /* Mark all device queues as reset. */
> > +     list_for_each_entry(cur, &dqm->queues, list) {
> > +             qpd = cur->qpd;
> > +             list_for_each_entry(q, &qpd->queues_list, list) {
> > +                     q->properties.is_reset = true;
> > +                     q->process->has_reset_queue = true;
>
> Both seem to track the same information, but one does it per queue, the
> other does it per process. Why do you need to track this per queue at
> all? The only place where I see q->properties->is_reset used is in
> update_queue. I think you can use the per-process tracking there.
>
> That said, I think it would be better to track has_reset_queue in the
> pdd or qpd because GPU resets are per GPU, and these fields are accessed
> under the per-GPU DQM lock and reset events sent to user mode are per
> GPU as well.
>
>
> > +             }
> > +     }
> > +
> >       /*
> >        * Issue a GPU reset if HWS is unresponsive
> >        */
> > @@ -880,6 +893,12 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q,
> >               else if (prev_active)
> >                       retval = remove_queue_mes(dqm, q, &pdd->qpd);
> >
> > +             /* queue is reset so inaccessable  */
> > +             if (q->properties.is_reset) {
>
> You could use pdd->process->has_reset_queue (or pdd->has_reset_queue)
> here and get rid of is_reset.
>
>
> > +                     retval = -EACCES;
> > +                     goto out_unlock;
> > +             }
> > +
> >               if (retval) {
> >                       dev_err(dev, "unmap queue failed\n");
> >                       goto out_unlock;
> > @@ -1629,7 +1648,7 @@ static int initialize_cpsch(struct device_queue_manager *dqm)
> >   static int start_cpsch(struct device_queue_manager *dqm)
> >   {
> >       struct device *dev = dqm->dev->adev->dev;
> > -     int retval;
> > +     int retval, num_hw_queue_slots;
> >
> >       retval = 0;
> >
> > @@ -1682,6 +1701,14 @@ static int start_cpsch(struct device_queue_manager *dqm)
> >                                       &dqm->wait_times);
> >       }
> >
> > +     /* setup per-queue reset detection buffer  */
> > +     num_hw_queue_slots =  dqm->dev->kfd->shared_resources.num_queue_per_pipe *
> > +                           dqm->dev->kfd->shared_resources.num_pipe_per_mec *
> > +                           NUM_XCC(dqm->dev->xcc_mask);
> > +
> > +     dqm->detect_hang_info_size = num_hw_queue_slots * sizeof(struct dqm_detect_hang_info);
> > +     dqm->detect_hang_info = kzalloc(dqm->detect_hang_info_size, GFP_KERNEL);
> > +
> >       dqm_unlock(dqm);
> >
> >       return 0;
> > @@ -1715,6 +1742,7 @@ static int stop_cpsch(struct device_queue_manager *dqm)
> >       kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
> >       if (!dqm->dev->kfd->shared_resources.enable_mes)
> >               pm_uninit(&dqm->packet_mgr);
> > +     kfree(dqm->detect_hang_info);
> >       dqm_unlock(dqm);
> >
> >       return 0;
> > @@ -1931,6 +1959,143 @@ static int map_queues_cpsch(struct device_queue_manager *dqm)
> >       return retval;
> >   }
> >
> > +static void set_queue_as_reset(struct device_queue_manager *dqm, struct queue *q,
> > +                            struct qcm_process_device *qpd)
> > +{
> > +     pr_err("queue id 0x%0x at pasid 0x%0x is reset\n",
> > +            q->properties.queue_id, q->process->pasid);
> > +
> > +     q->properties.is_reset = true;
> > +     q->process->has_reset_queue = true;
> > +     if (q->properties.is_active) {
> > +             q->properties.is_active = false;
> > +             decrement_queue_count(dqm, qpd, q);
> > +     }
> > +}
> > +
> > +static int detect_queue_hang(struct device_queue_manager *dqm)
> > +{
> > +     int i;
> > +
> > +     memset(dqm->detect_hang_info, 0, dqm->detect_hang_info_size);
> > +
> > +     for (i = 0; i < AMDGPU_MAX_QUEUES; ++i) {
> > +             uint32_t mec, pipe, queue;
> > +             int xcc_id;
> > +
> > +             mec = (i / dqm->dev->kfd->shared_resources.num_queue_per_pipe)
> > +                     / dqm->dev->kfd->shared_resources.num_pipe_per_mec;
> > +
> > +             if (mec || !test_bit(i, dqm->dev->kfd->shared_resources.cp_queue_bitmap))
> > +                     continue;
> > +
> > +             amdgpu_queue_mask_bit_to_mec_queue(dqm->dev->adev, i, &mec, &pipe, &queue);
> > +
> > +             for_each_inst(xcc_id, dqm->dev->xcc_mask) {
> > +                     uint64_t queue_addr = dqm->dev->kfd2kgd->hqd_detect_and_reset(
> > +                                             dqm->dev->adev, pipe, queue, xcc_id,
> > +                                             0, true);
> > +                     struct dqm_detect_hang_info hang_info;
> > +
> > +                     if (!queue_addr)
> > +                             continue;
> > +
> > +                     hang_info.pipe_id = pipe;
> > +                     hang_info.queue_id = queue;
> > +                     hang_info.xcc_id = xcc_id;
> > +                     hang_info.queue_address = queue_addr;
> > +
> > +                     dqm->detect_hang_info[dqm->detect_hang_count] = hang_info;
> > +                     dqm->detect_hang_count++;
> > +             }
> > +     }
> > +
> > +     return dqm->detect_hang_count;
> > +}
> > +
> > +static struct queue *find_queue_by_address(struct device_queue_manager *dqm, uint64_t queue_address)
> > +{
> > +     struct device_process_node *cur;
> > +     struct qcm_process_device *qpd;
> > +     struct queue *q;
> > +
> > +     list_for_each_entry(cur, &dqm->queues, list) {
> > +             qpd = cur->qpd;
> > +             list_for_each_entry(q, &qpd->queues_list, list) {
> > +                     if (queue_address == q->properties.queue_address)
> > +                             return q;
> > +             }
> > +     }
> > +
> > +     return NULL;
> > +}
> > +
> > +/* only for compute queue */
> > +static int reset_queues_cpsch(struct device_queue_manager *dqm, uint16_t pasid)
> > +{
> > +     int retval = 0, reset_count = 0, i;
> > +
> > +     if (pasid) {
> > +             dqm_lock(dqm);
> > +
> > +             retval = unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_BY_PASID,
> > +                                         pasid, USE_DEFAULT_GRACE_PERIOD, true);
> > +
> > +             dqm_unlock(dqm);
> > +             return retval;
> > +     }
> > +
> > +     if (!dqm->detect_hang_info || dqm->is_hws_hang)
> > +             return -EIO;
> > +
> > +     /* assume dqm locked. */
>
> This is weird, the same function does different things and has different
> assumptions about its locking context depending where it's called from.
> Maybe keep the original function and and make this a new function with a
> distinct name.
>
>
> > +     if (!detect_queue_hang(dqm))
> > +             return -ENOTRECOVERABLE;
> > +
> > +     for (i = 0; i < dqm->detect_hang_count; i++) {
> > +             struct dqm_detect_hang_info hang_info = dqm->detect_hang_info[i];
> > +             struct queue *q = find_queue_by_address(dqm, hang_info.queue_address);
> > +             struct kfd_process_device *pdd;
> > +             uint64_t queue_addr = 0;
> > +             int pipe = hang_info.pipe_id;
> > +             int queue = hang_info.queue_id;
> > +             int xcc_id = hang_info.xcc_id;
>
> Why do you need these local variables. hang_info is already a local
> variable. Now you have two copies of this on your stack.
>
> Regards,
>    Felix
>
>
> > +
> > +             if (!q) {
> > +                     retval = -ENOTRECOVERABLE;
> > +                     goto reset_fail;
> > +             }
> > +
> > +             pdd = kfd_get_process_device_data(dqm->dev, q->process);
> > +             if (!pdd) {
> > +                     retval = -ENOTRECOVERABLE;
> > +                     goto reset_fail;
> > +             }
> > +
> > +             queue_addr = dqm->dev->kfd2kgd->hqd_detect_and_reset(
> > +                             dqm->dev->adev, pipe, queue, xcc_id,
> > +                             KFD_UNMAP_LATENCY_MS, false);
> > +
> > +             if (queue_addr != q->properties.queue_address) {
> > +                     retval = -ENOTRECOVERABLE;
> > +                     goto reset_fail;
> > +             }
> > +
> > +             set_queue_as_reset(dqm, q, &pdd->qpd);
> > +             reset_count++;
> > +     }
> > +
> > +     if (reset_count == dqm->detect_hang_count)
> > +             kfd_signal_reset_event(dqm->dev);
> > +     else
> > +             retval = -ENOTRECOVERABLE;
> > +
> > +reset_fail:
> > +     dqm->detect_hang_count = 0;
> > +
> > +     return retval;
> > +}
> > +
> >   /* dqm->lock mutex has to be locked before calling this function */
> >   static int unmap_queues_cpsch(struct device_queue_manager *dqm,
> >                               enum kfd_unmap_queues_filter filter,
> > @@ -1981,11 +2146,14 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
> >        */
> >       mqd_mgr = dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ];
> >       if (mqd_mgr->check_preemption_failed(mqd_mgr, dqm->packet_mgr.priv_queue->queue->mqd)) {
> > -             while (halt_if_hws_hang)
> > -                     schedule();
> > -             kfd_hws_hang(dqm);
> > -             retval = -ETIME;
> > -             goto out;
> > +             if (reset_queues_cpsch(dqm, 0)) {
> > +                     while (halt_if_hws_hang)
> > +                             schedule();
> > +                     dqm->is_hws_hang = true;
> > +                     kfd_hws_hang(dqm);
> > +                     retval = -ETIME;
> > +                     goto out;
> > +             }
> >       }
> >
> >       /* We need to reset the grace period value for this device */
> > @@ -2003,21 +2171,6 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
> >       return retval;
> >   }
> >
> > -/* only for compute queue */
> > -static int reset_queues_cpsch(struct device_queue_manager *dqm,
> > -                     uint16_t pasid)
> > -{
> > -     int retval;
> > -
> > -     dqm_lock(dqm);
> > -
> > -     retval = unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_BY_PASID,
> > -                     pasid, USE_DEFAULT_GRACE_PERIOD, true);
> > -
> > -     dqm_unlock(dqm);
> > -     return retval;
> > -}
> > -
> >   /* dqm->lock mutex has to be locked before calling this function */
> >   static int execute_queues_cpsch(struct device_queue_manager *dqm,
> >                               enum kfd_unmap_queues_filter filter,
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> > index 3b9b8eabaacc..dfb36a246637 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> > @@ -210,6 +210,13 @@ struct device_queue_manager_asic_ops {
> >                                struct kfd_node *dev);
> >   };
> >
> > +struct dqm_detect_hang_info {
> > +     int pipe_id;
> > +     int queue_id;
> > +     int xcc_id;
> > +     uint64_t queue_address;
> > +};
> > +
> >   /**
> >    * struct device_queue_manager
> >    *
> > @@ -264,6 +271,11 @@ struct device_queue_manager {
> >       uint32_t                wait_times;
> >
> >       wait_queue_head_t       destroy_wait;
> > +
> > +     /* for per-queue reset support */
> > +     struct dqm_detect_hang_info *detect_hang_info;
> > +     size_t detect_hang_info_size;
> > +     int detect_hang_count;
> >   };
> >
> >   void device_queue_manager_init_cik(
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
> > index 9b33d9d2c9ad..30a55d3733e8 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
> > @@ -31,6 +31,7 @@
> >   #include <linux/memory.h>
> >   #include "kfd_priv.h"
> >   #include "kfd_events.h"
> > +#include "kfd_device_queue_manager.h"
> >   #include <linux/device.h>
> >
> >   /*
> > @@ -1250,6 +1251,20 @@ void kfd_signal_reset_event(struct kfd_node *dev)
> >                       continue;
> >               }
> >
> > +             if (dev->dqm->detect_hang_count && !p->has_reset_queue)
> > +                     continue;
> > +
> > +             if (dev->dqm->detect_hang_count) {
> > +                     struct amdgpu_task_info *ti;
> > +
> > +                     ti = amdgpu_vm_get_task_info_pasid(dev->adev, p->pasid);
> > +                     if (ti) {
> > +                             DRM_ERROR("Process info: process %s tid %d thread %s pid %d\n",
> > +                             ti->process_name, ti->tgid, ti->task_name, ti->pid);
> > +                             amdgpu_vm_put_task_info(ti);
> > +                     }
> > +             }
> > +
> >               rcu_read_lock();
> >
> >               id = KFD_FIRST_NONSIGNAL_EVENT_ID;
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
> > index 66c73825c0a0..84e8ea3a8a0c 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
> > @@ -321,8 +321,11 @@ static void update_mqd(struct mqd_manager *mm, void *mqd,
> >   static bool check_preemption_failed(struct mqd_manager *mm, void *mqd)
> >   {
> >       struct v9_mqd *m = (struct v9_mqd *)mqd;
> > +     uint32_t doorbell_id = m->queue_doorbell_id0;
> >
> > -     return kfd_check_hiq_mqd_doorbell_id(mm->dev, m->queue_doorbell_id0, 0);
> > +     m->queue_doorbell_id0 = 0;
> > +
> > +     return kfd_check_hiq_mqd_doorbell_id(mm->dev, doorbell_id, 0);
> >   }
> >
> >   static int get_wave_state(struct mqd_manager *mm, void *mqd,
> > @@ -624,6 +627,7 @@ static bool check_preemption_failed_v9_4_3(struct mqd_manager *mm, void *mqd)
> >               m = get_mqd(mqd + hiq_mqd_size * inst);
> >               ret |= kfd_check_hiq_mqd_doorbell_id(mm->dev,
> >                                       m->queue_doorbell_id0, inst);
> > +             m->queue_doorbell_id0 = 0;
> >               ++inst;
> >       }
> >
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > index 2b3ec92981e8..e244242fd1b4 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > @@ -504,6 +504,7 @@ struct queue_properties {
> >       bool is_being_destroyed;
> >       bool is_active;
> >       bool is_gws;
> > +     bool is_reset;
> >       uint32_t pm4_target_xcc;
> >       bool is_dbg_wa;
> >       bool is_user_cu_masked;
> > @@ -982,6 +983,9 @@ struct kfd_process {
> >       struct semaphore runtime_enable_sema;
> >       bool is_runtime_retry;
> >       struct kfd_runtime_info runtime_info;
> > +
> > +     /* Tracks queue reset status */
> > +     bool has_reset_queue;
> >   };
> >
> >   #define KFD_PROCESS_TABLE_SIZE 5 /* bits: 32 entries */
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> > index 17e42161b015..9a1d69f83a08 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> > @@ -1851,6 +1851,8 @@ int kfd_process_evict_queues(struct kfd_process *p, uint32_t trigger)
> >                       goto fail;
> >               }
> >               n_evicted++;
> > +
> > +             pdd->dev->dqm->is_hws_hang = false;
> >       }
> >
> >       return r;
> > diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
> > index 6d094cf3587d..20bb4a0c893e 100644
> > --- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
> > +++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
> > @@ -318,6 +318,10 @@ struct kfd2kgd_calls {
> >       void (*program_trap_handler_settings)(struct amdgpu_device *adev,
> >                       uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr,
> >                       uint32_t inst);
> > +     uint64_t (*hqd_detect_and_reset)(struct amdgpu_device *adev,
> > +                                      uint32_t pipe_id, uint32_t queue_id,
> > +                                      uint32_t inst, unsigned int utimeout,
> > +                                      bool detect_only);
> >   };
> >
> >   #endif      /* KGD_KFD_INTERFACE_H_INCLUDED */