[PATCH 07/29] drm/amdgpu: add gfx9.4.1 hw debug mode enable and disable calls
Felix Kuehling
felix.kuehling at amd.com
Thu Nov 24 16:25:06 UTC 2022
Am 2022-11-24 um 09:58 schrieb Kim, Jonathan:
> [AMD Official Use Only - General]
>
>> -----Original Message-----
>> From: Kuehling, Felix <Felix.Kuehling at amd.com>
>> Sent: November 22, 2022 6:59 PM
>> To: Kim, Jonathan <Jonathan.Kim at amd.com>; amd-gfx at lists.freedesktop.org
>> Subject: Re: [PATCH 07/29] drm/amdgpu: add gfx9.4.1 hw debug mode enable and disable calls
>>
>>
>> On 2022-10-31 12:23, Jonathan Kim wrote:
>>> On GFX9.4.1, the implicit wait count instruction on s_barrier is
>>> disabled by default in the driver during normal operation for
>>> performance requirements.
>>>
>>> There is a hardware bug in GFX9.4.1 where, if the implicit wait count
>>> instruction after an s_barrier instruction is disabled, any wave that
>>> hits an exception may step over the s_barrier when returning from the
>>> trap handler. The barrier logic has no way to detect this, so the
>>> remaining waves wait at the barrier indefinitely, resulting in a
>>> shader hang. This bug has been corrected for GFX9.4.2 and onward.
>>>
>>> Since the debugger subscribes to hardware exceptions, in order to avoid
>>> this bug, the debugger must enable implicit wait count on s_barrier
>>> for a debug session and disable it on detach.
>>>
>>> To change this setting in the device global SQ_CONFIG register, the
>>> GFX pipeline must be idle. GFX9.4.1 as a compute device will dispatch
>>> work either through the compute ring buffers used for image post
>>> processing or through the hardware scheduler via the KFD.
>>>
>>> Have the KGD suspend and drain the compute ring buffers, then suspend the
>>> hardware scheduler and block any future KFD process job requests before
>>> changing the implicit wait count setting. Once set, resume all work.
>>>
>>> Signed-off-by: Jonathan Kim <jonathan.kim at amd.com>
>>> ---
>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 3 +
>>> .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c | 105 +++++++++++++++++-
>>> drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 4 +-
>>> drivers/gpu/drm/amd/amdkfd/kfd_process.c | 2 +-
>>> 4 files changed, 110 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> index 0e6ddf05c23c..9f2499f52d2c 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> @@ -1034,6 +1034,9 @@ struct amdgpu_device {
>>> struct pci_saved_state *pci_state;
>>> pci_channel_state_t pci_channel_state;
>>>
>>> + /* Track auto wait count on s_barrier settings */
>>> + bool barrier_has_auto_waitcnt;
>>> +
>>> struct amdgpu_reset_control *reset_cntl;
>>> 	uint32_t			ip_versions[MAX_HWIP][HWIP_MAX_INSTANCE];
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
>>> index 4191af5a3f13..13f02a0aa828 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
>>> @@ -26,6 +26,7 @@
>>> #include "amdgpu.h"
>>> #include "amdgpu_amdkfd.h"
>>> #include "amdgpu_amdkfd_arcturus.h"
>>> +#include "amdgpu_reset.h"
>>> #include "sdma0/sdma0_4_2_2_offset.h"
>>> #include "sdma0/sdma0_4_2_2_sh_mask.h"
>>> #include "sdma1/sdma1_4_2_2_offset.h"
>>> @@ -48,6 +49,8 @@
>>> #include "amdgpu_amdkfd_gfx_v9.h"
>>> #include "gfxhub_v1_0.h"
>>> #include "mmhub_v9_4.h"
>>> +#include "gc/gc_9_0_offset.h"
>>> +#include "gc/gc_9_0_sh_mask.h"
>>>
>>> #define HQD_N_REGS 56
>>> #define DUMP_REG(addr) do { \
>>> @@ -276,6 +279,104 @@ int kgd_arcturus_hqd_sdma_destroy(struct amdgpu_device *adev, void *mqd,
>>> return 0;
>>> }
>>>
>>> +/*
>>> + * Helper used to suspend/resume gfx pipe for image post process work to set
>>> + * barrier behaviour.
>>> + */
>>> +static int suspend_resume_compute_scheduler(struct amdgpu_device *adev, bool suspend)
>>> +{
>>> + int i, r = 0;
>>> +
>>> + for (i = 0; i < adev->gfx.num_compute_rings; i++) {
>>> + struct amdgpu_ring *ring = &adev->gfx.compute_ring[i];
>>> +
>>> + if (!(ring && ring->sched.thread))
>>> + continue;
>>> +
>>> +		/* stop scheduler and drain ring. */
>>> + if (suspend) {
>>> + drm_sched_stop(&ring->sched, NULL);
>>> + r = amdgpu_fence_wait_empty(ring);
>>> + if (r)
>>> + goto out;
>>> + } else {
>>> + drm_sched_start(&ring->sched, false);
>>> + }
>>> + }
>>> +
>>> +out:
>>> + /* return on resume or failure to drain rings. */
>>> + if (!suspend || r)
>>> + return r;
>>> +
>>> + return amdgpu_device_ip_wait_for_idle(adev, GC_HWIP);
>>> +}
>>> +
>>> +static void set_barrier_auto_waitcnt(struct amdgpu_device *adev, bool enable_waitcnt)
>>> +{
>>> + uint32_t data;
>>> +
>>> + WRITE_ONCE(adev->barrier_has_auto_waitcnt, enable_waitcnt);
>>> +
>>> + if (!down_read_trylock(&adev->reset_domain->sem))
>>> + return;
>>> +
>>> + amdgpu_amdkfd_suspend(adev, false);
>>> +
>>> + if (suspend_resume_compute_scheduler(adev, true))
>>> + goto out;
>>> +
>>> + data = RREG32(SOC15_REG_OFFSET(GC, 0, mmSQ_CONFIG));
>>> +	data = REG_SET_FIELD(data, SQ_CONFIG, DISABLE_BARRIER_WAITCNT,
>>> +			enable_waitcnt ? 0 : 1);
>>> + WREG32(SOC15_REG_OFFSET(GC, 0, mmSQ_CONFIG), data);
>>> +
>>> +out:
>>> + suspend_resume_compute_scheduler(adev, false);
>>> +
>>> + amdgpu_amdkfd_resume(adev, false);
>>> +
>>> + up_read(&adev->reset_domain->sem);
>>> +}
>>> +
>>> +static uint32_t kgd_arcturus_enable_debug_trap(struct amdgpu_device *adev,
>>> + bool restore_dbg_registers,
>>> + uint32_t vmid)
>>> +{
>>> + mutex_lock(&adev->grbm_idx_mutex);
>>> +
>>> + kgd_gfx_v9_set_wave_launch_stall(adev, vmid, true);
>>> +
>>> + set_barrier_auto_waitcnt(adev, true);
>>> +
>>> +	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
>>> +
>>> + kgd_gfx_v9_set_wave_launch_stall(adev, vmid, false);
>>> +
>>> + mutex_unlock(&adev->grbm_idx_mutex);
>>> +
>>> + return 0;
>>> +}
>>> +
>>> +static uint32_t kgd_arcturus_disable_debug_trap(struct amdgpu_device *adev,
>>> + bool keep_trap_enabled,
>>> + uint32_t vmid)
>>> +{
>>> +
>>> + mutex_lock(&adev->grbm_idx_mutex);
>>> +
>>> + kgd_gfx_v9_set_wave_launch_stall(adev, vmid, true);
>>> +
>>> + set_barrier_auto_waitcnt(adev, false);
>>> +
>>> +	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
>>> +
>>> + kgd_gfx_v9_set_wave_launch_stall(adev, vmid, false);
>>> +
>>> + mutex_unlock(&adev->grbm_idx_mutex);
>>> +
>>> + return 0;
>>> +}
>>> const struct kfd2kgd_calls arcturus_kfd2kgd = {
>>> 	.program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
>>> .set_pasid_vmid_mapping = kgd_gfx_v9_set_pasid_vmid_mapping,
>>> @@ -294,6 +395,8 @@ const struct kfd2kgd_calls arcturus_kfd2kgd = {
>>> 				kgd_gfx_v9_get_atc_vmid_pasid_mapping_info,
>>> .set_vm_context_page_table_base =
>>> 				kgd_gfx_v9_set_vm_context_page_table_base,
>>> + .enable_debug_trap = kgd_arcturus_enable_debug_trap,
>>> + .disable_debug_trap = kgd_arcturus_disable_debug_trap,
>>> .get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
>>> -	.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings
>>> +	.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
>>> };
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> index a0e5ad342f13..8ed1b5d255f7 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> @@ -2424,8 +2424,8 @@ static void gfx_v9_0_init_sq_config(struct amdgpu_device *adev)
>>> switch (adev->ip_versions[GC_HWIP][0]) {
>>> case IP_VERSION(9, 4, 1):
>>> tmp = RREG32_SOC15(GC, 0, mmSQ_CONFIG);
>>> - tmp = REG_SET_FIELD(tmp, SQ_CONFIG,
>>> - DISABLE_BARRIER_WAITCNT, 1);
>>> +		tmp = REG_SET_FIELD(tmp, SQ_CONFIG, DISABLE_BARRIER_WAITCNT,
>>> +				READ_ONCE(adev->barrier_has_auto_waitcnt) ? 0 : 1);
>>> WREG32_SOC15(GC, 0, mmSQ_CONFIG, tmp);
>>> break;
>>> default:
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>> index 56ad38fcd26e..efb81ccef8f5 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>> @@ -1946,7 +1946,7 @@ void kfd_suspend_all_processes(void)
>>> WARN(debug_evictions, "Evicting all processes");
>>> hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
>>> cancel_delayed_work_sync(&p->eviction_work);
>>> - cancel_delayed_work_sync(&p->restore_work);
>>> + flush_delayed_work(&p->restore_work);
>> This looks like a sneak bug fix. Should this be a separate patch
>> independent of this patch series?
> Ok. That should probably be fixed in general.
> Back-to-back KFD suspend/resume calls can result in asymmetrical evictions and restores if scheduled restores are cancelled on suspend.
> The bug just happens to surface for mGPU GFX9.4.1 debugging, because debug attach forces that scenario.
> I can send this out as a separate fix that's not related to this series.
I agree.
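
To spell out the asymmetry: cancel_delayed_work_sync() silently drops a
restore that is already queued, while flush_delayed_work() lets it run to
completion before the suspend proceeds. A minimal sketch of the
distinction (kfd_sample_process and kfd_sample_suspend() below are
hypothetical stand-ins, not the real KFD structures; only the two
workqueue calls are the actual kernel API):

#include <linux/workqueue.h>

/* Hypothetical, stripped-down stand-in for struct kfd_process. */
struct kfd_sample_process {
	struct delayed_work eviction_work;
	struct delayed_work restore_work;
};

static void kfd_sample_suspend(struct kfd_sample_process *p)
{
	/* A pending eviction can simply be cancelled. */
	cancel_delayed_work_sync(&p->eviction_work);

	/*
	 * A scheduled restore must be allowed to run: cancelling it leaves a
	 * prior eviction without its matching restore, so back-to-back
	 * suspend/resume cycles can leave the process queues evicted.
	 */
	flush_delayed_work(&p->restore_work);
}
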
Thanks,
Felix
>
> Thanks,
>
> Jon
>
>
>> Regards,
>> Felix
>>
>>
>>> 		if (kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_TRIGGER_SUSPEND))
>>> 			pr_err("Failed to suspend process 0x%x\n", p->pasid);