[PATCH 1/2] drm/amdkfd: support per-queue reset on gfx9

Kim, Jonathan Jonathan.Kim at amd.com
Thu Jul 25 13:54:52 UTC 2024


> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of Felix Kuehling
> Sent: Wednesday, July 24, 2024 4:35 PM
> To: Kim, Jonathan <Jonathan.Kim at amd.com>; amd-gfx at lists.freedesktop.org
> Subject: Re: [PATCH 1/2] drm/amdkfd: support per-queue reset on gfx9
>
> On 2024-07-18 13:56, Jonathan Kim wrote:
> > Support per-queue reset for GFX9.  The recommendation is for the driver
> > to do a targeted reset of the HW queue via a SPI MMIO register write.
> >
> > Since this requires pipe and HW queue info and MEC FW is limited to
> > doorbell reports of hung queues after an unmap failure, scan the HW
> > queue slots defined by SET_RESOURCES first to identify the user queue
> > candidates to reset.
> >
> > Only signal reset events to processes that have had a queue reset.
> >
> > If queue reset fails, fall back to GPU reset.
> >
> > Signed-off-by: Jonathan Kim <jonathan.kim at amd.com>
> > ---
> >   .../drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c  |   1 +
> >   .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c   |   3 +-
> >   .../drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c   |   3 +-
> >   .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c    |   9 +
> >   .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h    |   6 +
> >   .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c  |   3 +-
> >   .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c    |  11 +-
> >   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c |  56 +++++
> >   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h |   6 +
> >   .../drm/amd/amdkfd/kfd_device_queue_manager.c | 195 ++++++++++++++++--
> >   .../drm/amd/amdkfd/kfd_device_queue_manager.h |  12 ++
> >   drivers/gpu/drm/amd/amdkfd/kfd_events.c       |  15 ++
> >   .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c   |   6 +-
> >   drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |   4 +
> >   drivers/gpu/drm/amd/amdkfd/kfd_process.c      |   2 +
> >   .../gpu/drm/amd/include/kgd_kfd_interface.h   |   4 +
> >   16 files changed, 310 insertions(+), 26 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
> > index aff08321e976..1dc601e4518a 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
> > @@ -191,4 +191,5 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = {
> >       .get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
> >       .build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
> >       .program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
> > +     .hqd_detect_and_reset = kgd_gfx_v9_hqd_detect_and_reset
> >   };
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> > index 3a3f3ce09f00..534975c722df 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> > @@ -418,5 +418,6 @@ const struct kfd2kgd_calls arcturus_kfd2kgd = {
> >       .get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
> >       .build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
> >       .get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
> > -     .program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings
> > +     .program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
> > +     .hqd_detect_and_reset = kgd_gfx_v9_hqd_detect_and_reset
> >   };
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c
> > index a5c7259cf2a3..b53c1cfa34de 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c
> > @@ -541,5 +541,6 @@ const struct kfd2kgd_calls gc_9_4_3_kfd2kgd = {
> >                       kgd_gfx_v9_4_3_set_wave_launch_trap_override,
> >       .set_wave_launch_mode = kgd_aldebaran_set_wave_launch_mode,
> >       .set_address_watch = kgd_gfx_v9_4_3_set_address_watch,
> > -     .clear_address_watch = kgd_gfx_v9_4_3_clear_address_watch
> > +     .clear_address_watch = kgd_gfx_v9_4_3_clear_address_watch,
> > +     .hqd_detect_and_reset = kgd_gfx_v9_hqd_detect_and_reset
> >   };
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> > index 3ab6c3aa0ad1..dd449a0caba8 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> > @@ -1070,6 +1070,14 @@ static void program_trap_handler_settings(struct amdgpu_device *adev,
> >       unlock_srbm(adev);
> >   }
> >
> > +uint64_t kgd_gfx_v10_hqd_detect_and_reset(struct amdgpu_device *adev,
> > +                                       uint32_t pipe_id, uint32_t queue_id,
> > +                                       uint32_t inst, unsigned int utimeout,
> > +                                       bool detect_only)
> > +{
> > +     return 0;
> > +}
> > +
> >   const struct kfd2kgd_calls gfx_v10_kfd2kgd = {
> >       .program_sh_mem_settings = kgd_program_sh_mem_settings,
> >       .set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping,
> > @@ -1097,4 +1105,5 @@ const struct kfd2kgd_calls gfx_v10_kfd2kgd = {
> >       .get_iq_wait_times = kgd_gfx_v10_get_iq_wait_times,
> >       .build_grace_period_packet_info = kgd_gfx_v10_build_grace_period_packet_info,
> >       .program_trap_handler_settings = program_trap_handler_settings,
> > +     .hqd_detect_and_reset = kgd_gfx_v10_hqd_detect_and_reset
> >   };
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
> > index 67bcaa3d4226..9eeda8808a3c 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
> > @@ -56,3 +56,9 @@ void kgd_gfx_v10_build_grace_period_packet_info(struct amdgpu_device *adev,
> >                                              uint32_t grace_period,
> >                                              uint32_t *reg_offset,
> >                                              uint32_t *reg_data);
> > +uint64_t kgd_gfx_v10_hqd_detect_and_reset(struct amdgpu_device *adev,
> > +                                       uint32_t pipe_id,
> > +                                       uint32_t queue_id,
> > +                                       uint32_t inst,
> > +                                       unsigned int utimeout,
> > +                                       bool detect_only);
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
> > index 8c8437a4383f..9872bca4156b 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
> > @@ -680,5 +680,6 @@ const struct kfd2kgd_calls gfx_v10_3_kfd2kgd = {
> >       .set_wave_launch_trap_override = kgd_gfx_v10_set_wave_launch_trap_override,
> >       .set_wave_launch_mode = kgd_gfx_v10_set_wave_launch_mode,
> >       .set_address_watch = kgd_gfx_v10_set_address_watch,
> > -     .clear_address_watch = kgd_gfx_v10_clear_address_watch
> > +     .clear_address_watch = kgd_gfx_v10_clear_address_watch,
> > +     .hqd_detect_and_reset = kgd_gfx_v10_hqd_detect_and_reset
> >   };
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c
> > index b61a32d6af4b..483937c571ab 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c
> > @@ -786,6 +786,14 @@ static uint32_t kgd_gfx_v11_clear_address_watch(struct amdgpu_device *adev,
> >       return 0;
> >   }
> >
> > +static uint64_t kgd_gfx_v11_hqd_detect_and_reset(struct amdgpu_device *adev,
> > +                                              uint32_t pipe_id, uint32_t queue_id,
> > +                                              uint32_t inst, unsigned int utimeout,
> > +                                              bool detect_only)
> > +{
> > +     return 0;
> > +}
> > +
> >   const struct kfd2kgd_calls gfx_v11_kfd2kgd = {
> >       .program_sh_mem_settings = program_sh_mem_settings_v11,
> >       .set_pasid_vmid_mapping = set_pasid_vmid_mapping_v11,
> > @@ -808,5 +816,6 @@ const struct kfd2kgd_calls gfx_v11_kfd2kgd = {
> >       .set_wave_launch_trap_override = kgd_gfx_v11_set_wave_launch_trap_override,
> >       .set_wave_launch_mode = kgd_gfx_v11_set_wave_launch_mode,
> >       .set_address_watch = kgd_gfx_v11_set_address_watch,
> > -     .clear_address_watch = kgd_gfx_v11_clear_address_watch
> > +     .clear_address_watch = kgd_gfx_v11_clear_address_watch,
> > +     .hqd_detect_and_reset = kgd_gfx_v11_hqd_detect_and_reset
> >   };
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> > index 5a35a8ca8922..9428e98ac12b 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> > @@ -1144,6 +1144,61 @@ void kgd_gfx_v9_program_trap_handler_settings(struct amdgpu_device *adev,
> >       kgd_gfx_v9_unlock_srbm(adev, inst);
> >   }
> >
> > +uint64_t kgd_gfx_v9_hqd_detect_and_reset(struct amdgpu_device *adev,
> > +                                      uint32_t pipe_id, uint32_t queue_id,
> > +                                      uint32_t inst, unsigned int utimeout,
> > +                                      bool detect_only)
>
> This is a bit misleading. This function doesn't detect anything. It just
> returns a queue address and optionally resets the queue. And the way
> it's called is like "detect and reset, but don't reset", except that
> last part is not obvious if you don't know what the last parameter
> means. It would make the code easier to read if you split this into two
> functions: hqd_get_pq_addr and hqd_reset.

OK, agreed. That seems cleaner, and hqd_get_pq_addr could be useful outside the reset scope in the future.
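
Something like the below is what I have in mind on the kfd2kgd side; names and exact signatures are placeholders until v2 is posted:

    /* Rough sketch of the split (placeholder names/signatures):
     * hqd_get_pq_addr only reports the ring buffer address of an active
     * HQD slot, while hqd_reset performs the SPI-based reset and waits up
     * to utimeout ms for the queue to go inactive.
     */
    uint64_t (*hqd_get_pq_addr)(struct amdgpu_device *adev,
                                uint32_t pipe_id, uint32_t queue_id,
                                uint32_t inst);
    uint64_t (*hqd_reset)(struct amdgpu_device *adev,
                          uint32_t pipe_id, uint32_t queue_id,
                          uint32_t inst, unsigned int utimeout);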

>
>
> > +{
> > +     uint32_t low, high, temp;
> > +     unsigned long end_jiffies;
> > +     uint64_t queue_addr = 0;
> > +
> > +     kgd_gfx_v9_acquire_queue(adev, pipe_id, queue_id, inst);
> > +     if (!RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_ACTIVE))
> > +             goto unlock_out;
> > +
> > +     low = RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_PQ_BASE);
> > +     high = RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_PQ_BASE_HI);
>
> You may need to disable GFXOFF before accessing these registers. Maybe
> you're assuming that the GPU is not idle when you're getting here, and
> maybe that's a good assumption, so I'm not sure.

These registers were assumed to be accessed only after an HWS hang, so the GPU shouldn't be idle at that point.
But it's probably not harmful to access them under RLC safe mode, per Alex's suggestion, in case anyone wants to use this function outside the KFD reset scope in the future.
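
Roughly what I have in mind for the read path, as a minimal sketch inside the existing function (assuming the generic amdgpu_gfx_rlc_enter_safe_mode()/amdgpu_gfx_rlc_exit_safe_mode() helpers that take the XCC instance; exact placement may change in v2):

    amdgpu_gfx_rlc_enter_safe_mode(adev, inst); /* keep GFX out of GFXOFF/CGPG */
    kgd_gfx_v9_acquire_queue(adev, pipe_id, queue_id, inst);

    if (RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_ACTIVE)) {
            low = RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_PQ_BASE);
            high = RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_PQ_BASE_HI);

            /* only user queues have a non-zero upper half of the PQ base */
            if (high)
                    queue_addr = ((((uint64_t)high << 32) | low) << 8);
    }

    kgd_gfx_v9_release_queue(adev, inst);
    amdgpu_gfx_rlc_exit_safe_mode(adev, inst);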

>
>
> > +
> > +     /* only concerned with user queues. */
> > +     if (!high)
> > +             goto unlock_out;
> > +
> > +     queue_addr = (((queue_addr | high) << 32) | low) << 8;
> > +
> > +     if (detect_only)
> > +             goto unlock_out;
> > +
> > +     pr_debug("Attempting queue reset on XCC %i pipe id %i queue id %i\n",
> > +              inst, pipe_id, queue_id);
> > +
> > +     /* assume previous dequeue request issued will take effect after reset */
> > +     WREG32_SOC15(GC, GET_INST(GC, inst), mmSPI_COMPUTE_QUEUE_RESET, 0x1);
> > +
> > +     end_jiffies = (utimeout * HZ / 1000) + jiffies;
> > +     while (true) {
> > +             temp = RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_ACTIVE);
> > +
> > +             if (!(temp & CP_HQD_ACTIVE__ACTIVE_MASK))
> > +                     break;
> > +
> > +             if (time_after(jiffies, end_jiffies)) {
> > +                     queue_addr = 0;
> > +                     break;
> > +             }
> > +
> > +             usleep_range(500, 1000);
> > +     }
> > +
> > +unlock_out:
> > +     if (!detect_only)
> > +             pr_debug("queue reset on XCC %i pipe id %i queue id %i %s\n",
> > +                      inst, pipe_id, queue_id, !!queue_addr ? "succeeded!" : "failed!");
> > +     kgd_gfx_v9_release_queue(adev, inst);
> > +
> > +     return queue_addr;
> > +}
> > +
> >   const struct kfd2kgd_calls gfx_v9_kfd2kgd = {
> >       .program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
> >       .set_pasid_vmid_mapping = kgd_gfx_v9_set_pasid_vmid_mapping,
> > @@ -1172,4 +1227,5 @@ const struct kfd2kgd_calls gfx_v9_kfd2kgd = {
> >       .build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
> >       .get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
> >       .program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
> > +     .hqd_detect_and_reset = kgd_gfx_v9_hqd_detect_and_reset
> >   };
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
> > index ce424615f59b..cf70d8fc3d10 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
> > @@ -101,3 +101,9 @@ void kgd_gfx_v9_build_grace_period_packet_info(struct amdgpu_device *adev,
> >                                              uint32_t grace_period,
> >                                              uint32_t *reg_offset,
> >                                              uint32_t *reg_data);
> > +uint64_t kgd_gfx_v9_hqd_detect_and_reset(struct amdgpu_device *adev,
> > +                                      uint32_t pipe_id,
> > +                                      uint32_t queue_id,
> > +                                      uint32_t inst,
> > +                                      unsigned int utimeout,
> > +                                      bool detect_only);
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> > index 4f48507418d2..037b75a64e66 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> > @@ -153,6 +153,19 @@ void program_sh_mem_settings(struct device_queue_manager *dqm,
> >
> >   static void kfd_hws_hang(struct device_queue_manager *dqm)
> >   {
> > +     struct device_process_node *cur;
> > +     struct qcm_process_device *qpd;
> > +     struct queue *q;
> > +
> > +     /* Mark all device queues as reset. */
> > +     list_for_each_entry(cur, &dqm->queues, list) {
> > +             qpd = cur->qpd;
> > +             list_for_each_entry(q, &qpd->queues_list, list) {
> > +                     q->properties.is_reset = true;
> > +                     q->process->has_reset_queue = true;
>
> Both seem to track the same information, but one does it per queue, the
> other does it per process. Why do you need to track this per queue at
> all? The only place where I see q->properties->is_reset used is in
> update_queue. I think you can use the per-process tracking there.

I'll move the per-queue reset bit to the 2nd patch for debugger support since it makes more sense there.

>
> That said, I think it would be better to track has_reset_queue in the
> pdd or qpd because GPU resets are per GPU, and these fields are accessed
> under the per-GPU DQM lock and reset events sent to user mode are per
> GPU as well.

Ack'd.
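
For example, set_queue_as_reset() could then do something like this (assuming the flag moves into the pdd; qpd_to_pdd() already exists for that lookup, and whether it ends up in the pdd or the qpd is still open):

    /* Hypothetical: flag the per-GPU process device under the DQM lock
     * instead of the whole kfd_process, matching the per-GPU reset event. */
    struct kfd_process_device *pdd = qpd_to_pdd(qpd);

    pdd->has_reset_queue = true;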
>
>
> > +             }
> > +     }
> > +
> >       /*
> >        * Issue a GPU reset if HWS is unresponsive
> >        */
> > @@ -880,6 +893,12 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q,
> >               else if (prev_active)
> >                       retval = remove_queue_mes(dqm, q, &pdd->qpd);
> >
> > +             /* queue is reset so inaccessible */
> > +             if (q->properties.is_reset) {
>
> You could use pdd->process->has_reset_queue (or pdd->has_reset_queue)
> here and get rid of is_reset.

Ack'd.

>
>
> > +                     retval = -EACCES;
> > +                     goto out_unlock;
> > +             }
> > +
> >               if (retval) {
> >                       dev_err(dev, "unmap queue failed\n");
> >                       goto out_unlock;
> > @@ -1629,7 +1648,7 @@ static int initialize_cpsch(struct device_queue_manager *dqm)
> >   static int start_cpsch(struct device_queue_manager *dqm)
> >   {
> >       struct device *dev = dqm->dev->adev->dev;
> > -     int retval;
> > +     int retval, num_hw_queue_slots;
> >
> >       retval = 0;
> >
> > @@ -1682,6 +1701,14 @@ static int start_cpsch(struct device_queue_manager *dqm)
> >                                       &dqm->wait_times);
> >       }
> >
> > +     /* setup per-queue reset detection buffer  */
> > +     num_hw_queue_slots =  dqm->dev->kfd->shared_resources.num_queue_per_pipe *
> > +                           dqm->dev->kfd->shared_resources.num_pipe_per_mec *
> > +                           NUM_XCC(dqm->dev->xcc_mask);
> > +
> > +     dqm->detect_hang_info_size = num_hw_queue_slots * sizeof(struct dqm_detect_hang_info);
> > +     dqm->detect_hang_info = kzalloc(dqm->detect_hang_info_size, GFP_KERNEL);
> > +
> >       dqm_unlock(dqm);
> >
> >       return 0;
> > @@ -1715,6 +1742,7 @@ static int stop_cpsch(struct device_queue_manager *dqm)
> >       kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
> >       if (!dqm->dev->kfd->shared_resources.enable_mes)
> >               pm_uninit(&dqm->packet_mgr);
> > +     kfree(dqm->detect_hang_info);
> >       dqm_unlock(dqm);
> >
> >       return 0;
> > @@ -1931,6 +1959,143 @@ static int map_queues_cpsch(struct device_queue_manager *dqm)
> >       return retval;
> >   }
> >
> > +static void set_queue_as_reset(struct device_queue_manager *dqm, struct queue *q,
> > +                            struct qcm_process_device *qpd)
> > +{
> > +     pr_err("queue id 0x%0x at pasid 0x%0x is reset\n",
> > +            q->properties.queue_id, q->process->pasid);
> > +
> > +     q->properties.is_reset = true;
> > +     q->process->has_reset_queue = true;
> > +     if (q->properties.is_active) {
> > +             q->properties.is_active = false;
> > +             decrement_queue_count(dqm, qpd, q);
> > +     }
> > +}
> > +
> > +static int detect_queue_hang(struct device_queue_manager *dqm)
> > +{
> > +     int i;
> > +
> > +     memset(dqm->detect_hang_info, 0, dqm->detect_hang_info_size);
> > +
> > +     for (i = 0; i < AMDGPU_MAX_QUEUES; ++i) {
> > +             uint32_t mec, pipe, queue;
> > +             int xcc_id;
> > +
> > +             mec = (i / dqm->dev->kfd->shared_resources.num_queue_per_pipe)
> > +                     / dqm->dev->kfd->shared_resources.num_pipe_per_mec;
> > +
> > +             if (mec || !test_bit(i, dqm->dev->kfd->shared_resources.cp_queue_bitmap))
> > +                     continue;
> > +
> > +             amdgpu_queue_mask_bit_to_mec_queue(dqm->dev->adev, i, &mec, &pipe, &queue);
> > +
> > +             for_each_inst(xcc_id, dqm->dev->xcc_mask) {
> > +                     uint64_t queue_addr = dqm->dev->kfd2kgd->hqd_detect_and_reset(
> > +                                             dqm->dev->adev, pipe, queue, xcc_id,
> > +                                             0, true);
> > +                     struct dqm_detect_hang_info hang_info;
> > +
> > +                     if (!queue_addr)
> > +                             continue;
> > +
> > +                     hang_info.pipe_id = pipe;
> > +                     hang_info.queue_id = queue;
> > +                     hang_info.xcc_id = xcc_id;
> > +                     hang_info.queue_address = queue_addr;
> > +
> > +                     dqm->detect_hang_info[dqm->detect_hang_count] = hang_info;
> > +                     dqm->detect_hang_count++;
> > +             }
> > +     }
> > +
> > +     return dqm->detect_hang_count;
> > +}
> > +
> > +static struct queue *find_queue_by_address(struct device_queue_manager *dqm, uint64_t queue_address)
> > +{
> > +     struct device_process_node *cur;
> > +     struct qcm_process_device *qpd;
> > +     struct queue *q;
> > +
> > +     list_for_each_entry(cur, &dqm->queues, list) {
> > +             qpd = cur->qpd;
> > +             list_for_each_entry(q, &qpd->queues_list, list) {
> > +                     if (queue_address == q->properties.queue_address)
> > +                             return q;
> > +             }
> > +     }
> > +
> > +     return NULL;
> > +}
> > +
> > +/* only for compute queue */
> > +static int reset_queues_cpsch(struct device_queue_manager *dqm, uint16_t pasid)
> > +{
> > +     int retval = 0, reset_count = 0, i;
> > +
> > +     if (pasid) {
> > +             dqm_lock(dqm);
> > +
> > +             retval = unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_BY_PASID,
> > +                                         pasid, USE_DEFAULT_GRACE_PERIOD, true);
> > +
> > +             dqm_unlock(dqm);
> > +             return retval;
> > +     }
> > +
> > +     if (!dqm->detect_hang_info || dqm->is_hws_hang)
> > +             return -EIO;
> > +
> > +     /* assume dqm locked. */
>
> This is weird, the same function does different things and has different
> assumptions about its locking context depending on where it's called from.
> Maybe keep the original function and make this a new function with a
> distinct name.

Ack'd.
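
So roughly something like this (the new name is just a placeholder; the hung-queue path assumes the caller in unmap_queues_cpsch() already holds the DQM lock):

    /* Hypothetical split: called from unmap_queues_cpsch() with the DQM
     * lock held; the per-queue reset loop from this patch moves in here. */
    static int reset_hung_queues(struct device_queue_manager *dqm)
    {
            if (!dqm->detect_hang_info || dqm->is_hws_hang)
                    return -EIO;

            if (!detect_queue_hang(dqm))
                    return -ENOTRECOVERABLE;

            /* ... walk dqm->detect_hang_info[] and reset each queue ... */
            return 0;
    }

    /* reset_queues_cpsch() keeps its original per-pasid behaviour and locking. */
    static int reset_queues_cpsch(struct device_queue_manager *dqm, uint16_t pasid)
    {
            int retval;

            dqm_lock(dqm);
            retval = unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_BY_PASID,
                                        pasid, USE_DEFAULT_GRACE_PERIOD, true);
            dqm_unlock(dqm);

            return retval;
    }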

>
>
> > +     if (!detect_queue_hang(dqm))
> > +             return -ENOTRECOVERABLE;
> > +
> > +     for (i = 0; i < dqm->detect_hang_count; i++) {
> > +             struct dqm_detect_hang_info hang_info = dqm->detect_hang_info[i];
> > +             struct queue *q = find_queue_by_address(dqm, hang_info.queue_address);
> > +             struct kfd_process_device *pdd;
> > +             uint64_t queue_addr = 0;
> > +             int pipe = hang_info.pipe_id;
> > +             int queue = hang_info.queue_id;
> > +             int xcc_id = hang_info.xcc_id;
>
> Why do you need these local variables? hang_info is already a local
> variable. Now you have two copies of this on your stack.

Ack'd. That was leftover fluff I forgot to remove.
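
With that, the loop body can just dereference the array entry directly, e.g. (using the hqd_reset placeholder name from the split above):

    struct dqm_detect_hang_info *hang_info = &dqm->detect_hang_info[i];
    struct queue *q = find_queue_by_address(dqm, hang_info->queue_address);
    ...
    queue_addr = dqm->dev->kfd2kgd->hqd_reset(dqm->dev->adev,
                    hang_info->pipe_id, hang_info->queue_id,
                    hang_info->xcc_id, KFD_UNMAP_LATENCY_MS);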

Thanks,

Jon

>
> Regards,
>    Felix
>
>
> > +
> > +             if (!q) {
> > +                     retval = -ENOTRECOVERABLE;
> > +                     goto reset_fail;
> > +             }
> > +
> > +             pdd = kfd_get_process_device_data(dqm->dev, q->process);
> > +             if (!pdd) {
> > +                     retval = -ENOTRECOVERABLE;
> > +                     goto reset_fail;
> > +             }
> > +
> > +             queue_addr = dqm->dev->kfd2kgd->hqd_detect_and_reset(
> > +                             dqm->dev->adev, pipe, queue, xcc_id,
> > +                             KFD_UNMAP_LATENCY_MS, false);
> > +
> > +             if (queue_addr != q->properties.queue_address) {
> > +                     retval = -ENOTRECOVERABLE;
> > +                     goto reset_fail;
> > +             }
> > +
> > +             set_queue_as_reset(dqm, q, &pdd->qpd);
> > +             reset_count++;
> > +     }
> > +
> > +     if (reset_count == dqm->detect_hang_count)
> > +             kfd_signal_reset_event(dqm->dev);
> > +     else
> > +             retval = -ENOTRECOVERABLE;
> > +
> > +reset_fail:
> > +     dqm->detect_hang_count = 0;
> > +
> > +     return retval;
> > +}
> > +
> >   /* dqm->lock mutex has to be locked before calling this function */
> >   static int unmap_queues_cpsch(struct device_queue_manager *dqm,
> >                               enum kfd_unmap_queues_filter filter,
> > @@ -1981,11 +2146,14 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
> >        */
> >       mqd_mgr = dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ];
> >       if (mqd_mgr->check_preemption_failed(mqd_mgr, dqm->packet_mgr.priv_queue->queue->mqd)) {
> > -             while (halt_if_hws_hang)
> > -                     schedule();
> > -             kfd_hws_hang(dqm);
> > -             retval = -ETIME;
> > -             goto out;
> > +             if (reset_queues_cpsch(dqm, 0)) {
> > +                     while (halt_if_hws_hang)
> > +                             schedule();
> > +                     dqm->is_hws_hang = true;
> > +                     kfd_hws_hang(dqm);
> > +                     retval = -ETIME;
> > +                     goto out;
> > +             }
> >       }
> >
> >       /* We need to reset the grace period value for this device */
> > @@ -2003,21 +2171,6 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
> >       return retval;
> >   }
> >
> > -/* only for compute queue */
> > -static int reset_queues_cpsch(struct device_queue_manager *dqm,
> > -                     uint16_t pasid)
> > -{
> > -     int retval;
> > -
> > -     dqm_lock(dqm);
> > -
> > -     retval = unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_BY_PASID,
> > -                     pasid, USE_DEFAULT_GRACE_PERIOD, true);
> > -
> > -     dqm_unlock(dqm);
> > -     return retval;
> > -}
> > -
> >   /* dqm->lock mutex has to be locked before calling this function */
> >   static int execute_queues_cpsch(struct device_queue_manager *dqm,
> >                               enum kfd_unmap_queues_filter filter,
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> > index 3b9b8eabaacc..dfb36a246637 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> > @@ -210,6 +210,13 @@ struct device_queue_manager_asic_ops {
> >                                struct kfd_node *dev);
> >   };
> >
> > +struct dqm_detect_hang_info {
> > +     int pipe_id;
> > +     int queue_id;
> > +     int xcc_id;
> > +     uint64_t queue_address;
> > +};
> > +
> >   /**
> >    * struct device_queue_manager
> >    *
> > @@ -264,6 +271,11 @@ struct device_queue_manager {
> >       uint32_t                wait_times;
> >
> >       wait_queue_head_t       destroy_wait;
> > +
> > +     /* for per-queue reset support */
> > +     struct dqm_detect_hang_info *detect_hang_info;
> > +     size_t detect_hang_info_size;
> > +     int detect_hang_count;
> >   };
> >
> >   void device_queue_manager_init_cik(
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
> > index 9b33d9d2c9ad..30a55d3733e8 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
> > @@ -31,6 +31,7 @@
> >   #include <linux/memory.h>
> >   #include "kfd_priv.h"
> >   #include "kfd_events.h"
> > +#include "kfd_device_queue_manager.h"
> >   #include <linux/device.h>
> >
> >   /*
> > @@ -1250,6 +1251,20 @@ void kfd_signal_reset_event(struct kfd_node *dev)
> >                       continue;
> >               }
> >
> > +             if (dev->dqm->detect_hang_count && !p->has_reset_queue)
> > +                     continue;
> > +
> > +             if (dev->dqm->detect_hang_count) {
> > +                     struct amdgpu_task_info *ti;
> > +
> > +                     ti = amdgpu_vm_get_task_info_pasid(dev->adev, p->pasid);
> > +                     if (ti) {
> > +                             DRM_ERROR("Process info: process %s tid %d thread %s pid %d\n",
> > +                             ti->process_name, ti->tgid, ti->task_name, ti->pid);
> > +                             amdgpu_vm_put_task_info(ti);
> > +                     }
> > +             }
> > +
> >               rcu_read_lock();
> >
> >               id = KFD_FIRST_NONSIGNAL_EVENT_ID;
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
> > index 66c73825c0a0..84e8ea3a8a0c 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
> > @@ -321,8 +321,11 @@ static void update_mqd(struct mqd_manager *mm, void *mqd,
> >   static bool check_preemption_failed(struct mqd_manager *mm, void *mqd)
> >   {
> >       struct v9_mqd *m = (struct v9_mqd *)mqd;
> > +     uint32_t doorbell_id = m->queue_doorbell_id0;
> >
> > -     return kfd_check_hiq_mqd_doorbell_id(mm->dev, m->queue_doorbell_id0, 0);
> > +     m->queue_doorbell_id0 = 0;
> > +
> > +     return kfd_check_hiq_mqd_doorbell_id(mm->dev, doorbell_id, 0);
> >   }
> >
> >   static int get_wave_state(struct mqd_manager *mm, void *mqd,
> > @@ -624,6 +627,7 @@ static bool check_preemption_failed_v9_4_3(struct mqd_manager *mm, void *mqd)
> >               m = get_mqd(mqd + hiq_mqd_size * inst);
> >               ret |= kfd_check_hiq_mqd_doorbell_id(mm->dev,
> >                                       m->queue_doorbell_id0, inst);
> > +             m->queue_doorbell_id0 = 0;
> >               ++inst;
> >       }
> >
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > index 2b3ec92981e8..e244242fd1b4 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > @@ -504,6 +504,7 @@ struct queue_properties {
> >       bool is_being_destroyed;
> >       bool is_active;
> >       bool is_gws;
> > +     bool is_reset;
> >       uint32_t pm4_target_xcc;
> >       bool is_dbg_wa;
> >       bool is_user_cu_masked;
> > @@ -982,6 +983,9 @@ struct kfd_process {
> >       struct semaphore runtime_enable_sema;
> >       bool is_runtime_retry;
> >       struct kfd_runtime_info runtime_info;
> > +
> > +     /* Tracks queue reset status */
> > +     bool has_reset_queue;
> >   };
> >
> >   #define KFD_PROCESS_TABLE_SIZE 5 /* bits: 32 entries */
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> > index 17e42161b015..9a1d69f83a08 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> > @@ -1851,6 +1851,8 @@ int kfd_process_evict_queues(struct kfd_process *p, uint32_t trigger)
> >                       goto fail;
> >               }
> >               n_evicted++;
> > +
> > +             pdd->dev->dqm->is_hws_hang = false;
> >       }
> >
> >       return r;
> > diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
> > index 6d094cf3587d..20bb4a0c893e 100644
> > --- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
> > +++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
> > @@ -318,6 +318,10 @@ struct kfd2kgd_calls {
> >       void (*program_trap_handler_settings)(struct amdgpu_device *adev,
> >                       uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr,
> >                       uint32_t inst);
> > +     uint64_t (*hqd_detect_and_reset)(struct amdgpu_device *adev,
> > +                                      uint32_t pipe_id, uint32_t queue_id,
> > +                                      uint32_t inst, unsigned int utimeout,
> > +                                      bool detect_only);
> >   };
> >
> >   #endif      /* KGD_KFD_INTERFACE_H_INCLUDED */

