[PATCH 1/5 V4 1/5] drm/amdgpu: Add sysfs interface for gc reset mask

Lazar, Lijo lijo.lazar at amd.com
Tue Oct 29 08:47:21 UTC 2024



On 10/29/2024 1:55 PM, Zhang, Jesse(Jie) wrote:
> [AMD Official Use Only - AMD Internal Distribution Only]
> 
> Hi Lijo,
> 
> -----Original Message-----
> From: Lazar, Lijo <Lijo.Lazar at amd.com>
> Sent: Tuesday, October 29, 2024 3:58 PM
> To: Zhang, Jesse(Jie) <Jesse.Zhang at amd.com>; amd-gfx at lists.freedesktop.org
> Cc: Deucher, Alexander <Alexander.Deucher at amd.com>; Koenig, Christian <Christian.Koenig at amd.com>; Huang, Tim <Tim.Huang at amd.com>
> Subject: Re: [PATCH 1/5 V4 1/5] drm/amdgpu: Add sysfs interface for gc reset mask
> 
> 
> 
> On 10/29/2024 12:44 PM, Jesse.zhang at amd.com wrote:
>> From: "Jesse.zhang at amd.com" <Jesse.zhang at amd.com>
>>
>> Add two sysfs interfaces for gfx and compute:
>> gfx_reset_mask
>> compute_reset_mask
>>
>> These interfaces are read-only and show the resets supported by the IP.
>> For example, full adapter reset (mode1/mode2/BACO/etc), soft reset,
>> queue reset, and pipe reset.
>>
>> V2: the sysfs node returns a text string instead of some flags
>> (Christian)
>> v3: add a generic helper which takes the ring as parameter
>>     and print the strings in the order they are applied (Christian)
>>
>>     check amdgpu_gpu_recovery  before creating sysfs file itself,
>>     and initialize supported_reset_types in IP version files (Lijo)
>> v4: Fixing uninitialized variables (Tim)
>>
>> Signed-off-by: Jesse Zhang <Jesse.Zhang at amd.com>
>> Suggested-by: Alex Deucher <alexander.deucher at amd.com>
>> ---
>>  drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  8 +++
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 37 ++++++++++++
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c    | 66 ++++++++++++++++++++++
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h    |  4 ++
>>  drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c     |  9 +++
>>  drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c     | 23 ++++++++
>>  drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c     | 10 ++++
>>  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c      | 10 ++++
>>  drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c    | 17 ++++++
>>  9 files changed, 184 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index 48c9b9b06905..aea1031d7b84 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -300,6 +300,12 @@ extern int amdgpu_wbrf;
>>  #define AMDGPU_RESET_VCE                     (1 << 13)
>>  #define AMDGPU_RESET_VCE1                    (1 << 14)
>>
>> +/* reset mask */
>> +#define AMDGPU_RESET_TYPE_FULL (1 << 0) /* full adapter reset,
>> +mode1/mode2/BACO/etc. */ #define AMDGPU_RESET_TYPE_SOFT_RESET (1 <<
>> +1) /* IP level soft reset */ #define AMDGPU_RESET_TYPE_PER_QUEUE (1
>> +<< 2) /* per queue */ #define AMDGPU_RESET_TYPE_PER_PIPE (1 << 3) /*
>> +per pipe */
>> +
>>  /* max cursor sizes (in pixels) */
>>  #define CIK_CURSOR_WIDTH 128
>>  #define CIK_CURSOR_HEIGHT 128
>> @@ -1466,6 +1472,8 @@ struct dma_fence *amdgpu_device_get_gang(struct
>> amdgpu_device *adev);  struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
>>                                           struct dma_fence *gang);
>>  bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev);
>> +ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring);
>> +ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset);
>>
>>  /* atpx handler */
>>  #if defined(CONFIG_VGA_SWITCHEROO)
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index ef715b2bbcdb..cd1e3f018893 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -6684,3 +6684,40 @@ uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
>>       }
>>       return ret;
>>  }
>> +
>> +ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring) {
>> +     ssize_t size = 0;
>> +
>> +     if (!ring)
>> +             return size;
>> +
>> +     if (amdgpu_device_should_recover_gpu(ring->adev))
>> +             size |= AMDGPU_RESET_TYPE_FULL;
>> +
>> +     if (unlikely(!ring->adev->debug_disable_soft_recovery) &&
>> +         !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery)
>> +             size |= AMDGPU_RESET_TYPE_SOFT_RESET;
>> +
>> +     return size;
>> +}
>> +
>> +ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset) {
>> +     ssize_t size = 0;
>> +
>> +     if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET)
>> +             size += sysfs_emit_at(buf, size, "soft ");
>> +
>> +     if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE)
>> +             size += sysfs_emit_at(buf, size, "queue ");
>> +
>> +     if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE)
>> +             size += sysfs_emit_at(buf, size, "pipe ");
>> +
>> +     if (supported_reset & AMDGPU_RESET_TYPE_FULL)
>> +             size += sysfs_emit_at(buf, size, "full ");
>> +
>> +     size += sysfs_emit_at(buf, size, "\n");
> 
> Is there an expectation of having "Unsupported" when no reset is supported (supported_reset == 0)?
>   Yes, will add it.
> 

I asked that for clarification. Now I see that the sysfs file is not created
when recovery is not enabled. In that case, maybe you could also avoid creating
the sysfs file if supported_reset == 0. Or create it anyway and show
"unsupported" if gpu_recovery == 0 or supported_reset == 0.

Thanks,
Lijo

> Thanks
> Jesse
> 
> Thanks,
> Lijo
> 
>> +     return size;
>> +}
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> index e96984c53e72..6de1f3bf6863 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> @@ -1588,6 +1588,32 @@ static ssize_t amdgpu_gfx_set_enforce_isolation(struct device *dev,
>>       return count;
>>  }
>>
>> +static ssize_t amdgpu_gfx_get_gfx_reset_mask(struct device *dev,
>> +                                             struct device_attribute *attr,
>> +                                             char *buf)
>> +{
>> +     struct drm_device *ddev = dev_get_drvdata(dev);
>> +     struct amdgpu_device *adev = drm_to_adev(ddev);
>> +
>> +     if (!adev)
>> +             return -ENODEV;
>> +
>> +     return amdgpu_show_reset_mask(buf, adev->gfx.gfx_supported_reset); }
>> +
>> +static ssize_t amdgpu_gfx_get_compute_reset_mask(struct device *dev,
>> +                                             struct device_attribute *attr,
>> +                                             char *buf)
>> +{
>> +     struct drm_device *ddev = dev_get_drvdata(dev);
>> +     struct amdgpu_device *adev = drm_to_adev(ddev);
>> +
>> +     if (!adev)
>> +             return -ENODEV;
>> +
>> +     return amdgpu_show_reset_mask(buf,
>> +adev->gfx.compute_supported_reset);
>> +}
>> +
>>  static DEVICE_ATTR(run_cleaner_shader, 0200,
>>                  NULL, amdgpu_gfx_set_run_cleaner_shader);
>>
>> @@ -1602,6 +1628,12 @@ static DEVICE_ATTR(current_compute_partition,
>> 0644,  static DEVICE_ATTR(available_compute_partition, 0444,
>>                  amdgpu_gfx_get_available_compute_partition, NULL);
>>
>> +static DEVICE_ATTR(gfx_reset_mask, 0444,
>> +                amdgpu_gfx_get_gfx_reset_mask, NULL);
>> +
>> +static DEVICE_ATTR(compute_reset_mask, 0444,
>> +                amdgpu_gfx_get_compute_reset_mask, NULL);
>> +
>>  int amdgpu_gfx_sysfs_init(struct amdgpu_device *adev)  {
>>       struct amdgpu_xcp_mgr *xcp_mgr = adev->xcp_mgr; @@ -1702,6 +1734,40
>> @@ void amdgpu_gfx_cleaner_shader_init(struct amdgpu_device *adev,
>>                           cleaner_shader_size);
>>  }
>>
>> +int amdgpu_gfx_sysfs_reset_mask_init(struct amdgpu_device *adev) {
>> +     int r = 0;
>> +
>> +     if (!amdgpu_gpu_recovery)
>> +             return r;
>> +
>> +     if (adev->gfx.num_gfx_rings) {
>> +             r = device_create_file(adev->dev, &dev_attr_gfx_reset_mask);
>> +             if (r)
>> +                     return r;
>> +     }
>> +
>> +     if (adev->gfx.num_compute_rings) {
>> +             r = device_create_file(adev->dev, &dev_attr_compute_reset_mask);
>> +             if (r)
>> +                     return r;
>> +     }
>> +
>> +     return r;
>> +}
>> +
>> +void amdgpu_gfx_sysfs_reset_mask_fini(struct amdgpu_device *adev) {
>> +     if (!amdgpu_gpu_recovery)
>> +             return;
>> +
>> +     if (adev->gfx.num_gfx_rings)
>> +             device_remove_file(adev->dev, &dev_attr_gfx_reset_mask);
>> +
>> +     if (adev->gfx.num_compute_rings)
>> +             device_remove_file(adev->dev, &dev_attr_compute_reset_mask); }
>> +
>>  /**
>>   * amdgpu_gfx_kfd_sch_ctrl - Control the KFD scheduler from the KGD (Graphics Driver)
>>   * @adev: amdgpu_device pointer
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>> index f710178a21bc..fb0e1adf6766 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>> @@ -424,6 +424,8 @@ struct amdgpu_gfx {
>>       /* reset mask */
>>       uint32_t                        grbm_soft_reset;
>>       uint32_t                        srbm_soft_reset;
>> +     uint32_t                        gfx_supported_reset;
>> +     uint32_t                        compute_supported_reset;
>>
>>       /* gfx off */
>>       bool                            gfx_off_state;      /* true: enabled, false: disabled */
>> @@ -582,6 +584,8 @@ void amdgpu_gfx_sysfs_isolation_shader_fini(struct
>> amdgpu_device *adev);  void
>> amdgpu_gfx_enforce_isolation_handler(struct work_struct *work);  void
>> amdgpu_gfx_enforce_isolation_ring_begin_use(struct amdgpu_ring *ring);
>> void amdgpu_gfx_enforce_isolation_ring_end_use(struct amdgpu_ring
>> *ring);
>> +int amdgpu_gfx_sysfs_reset_mask_init(struct amdgpu_device *adev);
>> +void amdgpu_gfx_sysfs_reset_mask_fini(struct amdgpu_device *adev);
>>
>>  static inline const char *amdgpu_gfx_compute_mode_desc(int mode)  {
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> index 9da95b25e158..e2b2cdab423b 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> @@ -4825,6 +4825,11 @@ static int gfx_v10_0_sw_init(struct amdgpu_ip_block *ip_block)
>>                       }
>>               }
>>       }
>> +     /* TODO: Add queue reset mask when FW fully supports it */
>> +     adev->gfx.gfx_supported_reset =
>> +             amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]);
>> +     adev->gfx.compute_supported_reset =
>> +             amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]);
>>
>>       r = amdgpu_gfx_kiq_init(adev, GFX10_MEC_HPD_SIZE, 0);
>>       if (r) {
>> @@ -4854,6 +4859,9 @@ static int gfx_v10_0_sw_init(struct amdgpu_ip_block *ip_block)
>>       gfx_v10_0_alloc_ip_dump(adev);
>>
>>       r = amdgpu_gfx_sysfs_isolation_shader_init(adev);
>> +     if (r)
>> +             return r;
>> +     r = amdgpu_gfx_sysfs_reset_mask_init(adev);
>>       if (r)
>>               return r;
>>       return 0;
>> @@ -4896,6 +4904,7 @@ static int gfx_v10_0_sw_fini(struct amdgpu_ip_block *ip_block)
>>       amdgpu_gfx_kiq_fini(adev, 0);
>>
>>       amdgpu_gfx_cleaner_shader_sw_fini(adev);
>> +     amdgpu_gfx_sysfs_reset_mask_fini(adev);
>>
>>       gfx_v10_0_pfp_fini(adev);
>>       gfx_v10_0_ce_fini(adev);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
>> index 5aff8f72de9c..ec24e8d019b3 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
>> @@ -1683,6 +1683,24 @@ static int gfx_v11_0_sw_init(struct amdgpu_ip_block *ip_block)
>>               }
>>       }
>>
>> +     adev->gfx.gfx_supported_reset =
>> +             amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]);
>> +     adev->gfx.compute_supported_reset =
>> +             amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]);
>> +     switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
>> +     case IP_VERSION(11, 0, 0):
>> +     case IP_VERSION(11, 0, 2):
>> +     case IP_VERSION(11, 0, 3):
>> +             if ((adev->gfx.me_fw_version >= 2280) &&
>> +                         (adev->gfx.mec_fw_version >= 2410)) {
>> +                             adev->gfx.compute_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE;
>> +                             adev->gfx.gfx_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE;
>> +             }
>> +             break;
>> +     default:
>> +             break;
>> +     }
>> +
>>       if (!adev->enable_mes_kiq) {
>>               r = amdgpu_gfx_kiq_init(adev, GFX11_MEC_HPD_SIZE, 0);
>>               if (r) {
>> @@ -1721,6 +1739,10 @@ static int gfx_v11_0_sw_init(struct amdgpu_ip_block *ip_block)
>>       if (r)
>>               return r;
>>
>> +     r = amdgpu_gfx_sysfs_reset_mask_init (adev);
>> +     if (r)
>> +             return r;
>> +
>>       return 0;
>>  }
>>
>> @@ -1783,6 +1805,7 @@ static int gfx_v11_0_sw_fini(struct amdgpu_ip_block *ip_block)
>>       gfx_v11_0_free_microcode(adev);
>>
>>       amdgpu_gfx_sysfs_isolation_shader_fini(adev);
>> +     amdgpu_gfx_sysfs_reset_mask_fini(adev);
>>
>>       kfree(adev->gfx.ip_dump_core);
>>       kfree(adev->gfx.ip_dump_compute_queues);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
>> index 9fec28d8a5fc..f5ffa2d8b22a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
>> @@ -1437,6 +1437,12 @@ static int gfx_v12_0_sw_init(struct amdgpu_ip_block *ip_block)
>>               }
>>       }
>>
>> +     /* TODO: Add queue reset mask when FW fully supports it */
>> +     adev->gfx.gfx_supported_reset =
>> +             amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]);
>> +     adev->gfx.compute_supported_reset =
>> +             amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]);
>> +
>>       if (!adev->enable_mes_kiq) {
>>               r = amdgpu_gfx_kiq_init(adev, GFX12_MEC_HPD_SIZE, 0);
>>               if (r) {
>> @@ -1467,6 +1473,9 @@ static int gfx_v12_0_sw_init(struct amdgpu_ip_block *ip_block)
>>       gfx_v12_0_alloc_ip_dump(adev);
>>
>>       r = amdgpu_gfx_sysfs_isolation_shader_init(adev);
>> +     if (r)
>> +             return r;
>> +     r = amdgpu_gfx_sysfs_reset_mask_init(adev);
>>       if (r)
>>               return r;
>>
>> @@ -1530,6 +1539,7 @@ static int gfx_v12_0_sw_fini(struct amdgpu_ip_block *ip_block)
>>       gfx_v12_0_free_microcode(adev);
>>
>>       amdgpu_gfx_sysfs_isolation_shader_fini(adev);
>> +     amdgpu_gfx_sysfs_reset_mask_fini(adev);
>>
>>       kfree(adev->gfx.ip_dump_core);
>>       kfree(adev->gfx.ip_dump_compute_queues);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> index b4c4b9916289..94007a9ed54b 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> @@ -2362,6 +2362,12 @@ static int gfx_v9_0_sw_init(struct amdgpu_ip_block *ip_block)
>>               }
>>       }
>>
>> +     /* TODO: Add queue reset mask when FW fully supports it */
>> +     adev->gfx.gfx_supported_reset =
>> +             amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]);
>> +     adev->gfx.compute_supported_reset =
>> +             amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]);
>> +
>>       r = amdgpu_gfx_kiq_init(adev, GFX9_MEC_HPD_SIZE, 0);
>>       if (r) {
>>               DRM_ERROR("Failed to init KIQ BOs!\n"); @@ -2391,6 +2397,9 @@
>> static int gfx_v9_0_sw_init(struct amdgpu_ip_block *ip_block)
>>       gfx_v9_0_alloc_ip_dump(adev);
>>
>>       r = amdgpu_gfx_sysfs_isolation_shader_init(adev);
>> +     if (r)
>> +             return r;
>> +     r = amdgpu_gfx_sysfs_reset_mask_init(adev);
>>       if (r)
>>               return r;
>>
>> @@ -2419,6 +2428,7 @@ static int gfx_v9_0_sw_fini(struct amdgpu_ip_block *ip_block)
>>       amdgpu_gfx_kiq_fini(adev, 0);
>>
>>       amdgpu_gfx_cleaner_shader_sw_fini(adev);
>> +     amdgpu_gfx_sysfs_reset_mask_fini(adev);
>>
>>       gfx_v9_0_mec_fini(adev);
>>       amdgpu_bo_free_kernel(&adev->gfx.rlc.clear_state_obj,
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
>> index 016290f00592..028fda13ac50 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
>> @@ -1157,6 +1157,19 @@ static int gfx_v9_4_3_sw_init(struct amdgpu_ip_block *ip_block)
>>                       return r;
>>       }
>>
>> +     adev->gfx.compute_supported_reset =
>> +             amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]);
>> +     switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
>> +     case IP_VERSION(9, 4, 3):
>> +     case IP_VERSION(9, 4, 4):
>> +             if (adev->gfx.mec_fw_version >= 155) {
>> +                     adev->gfx.compute_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE;
>> +                     adev->gfx.compute_supported_reset |= AMDGPU_RESET_TYPE_PER_PIPE;
>> +             }
>> +             break;
>> +     default:
>> +             break;
>> +     }
>>       r = gfx_v9_4_3_gpu_early_init(adev);
>>       if (r)
>>               return r;
>> @@ -1175,6 +1188,9 @@ static int gfx_v9_4_3_sw_init(struct amdgpu_ip_block *ip_block)
>>       if (r)
>>               return r;
>>
>> +     r = amdgpu_gfx_sysfs_reset_mask_init(adev);
>> +     if (r)
>> +             return r;
>>       return 0;
>>  }
>>
>> @@ -1200,6 +1216,7 @@ static int gfx_v9_4_3_sw_fini(struct amdgpu_ip_block *ip_block)
>>       gfx_v9_4_3_free_microcode(adev);
>>       amdgpu_gfx_sysfs_fini(adev);
>>       amdgpu_gfx_sysfs_isolation_shader_fini(adev);
>> +     amdgpu_gfx_sysfs_reset_mask_fini(adev);
>>
>>       kfree(adev->gfx.ip_dump_core);
>>       kfree(adev->gfx.ip_dump_compute_queues);


More information about the amd-gfx mailing list