[PATCH 1/5 V2] drm/amdgpu: Add sysfs interface for gc reset mask
Lazar, Lijo
lijo.lazar at amd.com
Wed Oct 23 07:29:57 UTC 2024
On 10/23/2024 8:13 AM, Jesse.zhang at amd.com wrote:
> Add two sysfs interfaces for gfx and compute:
> gfx_reset_mask
> compute_reset_mask
>
> These interfaces are read-only and show the resets supported by the IP.
> For example, full adapter reset (mode1/mode2/BACO/etc),
> soft reset, queue reset, and pipe reset.
>
> V2: the sysfs node returns a text string instead of some flags (Christian)
>
> Signed-off-by: Jesse Zhang <Jesse.Zhang at amd.com>
> Suggested-by:Alex Deucher <alexander.deucher at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 122 ++++++++++++++++++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 2 +
> drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 6 ++
> drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 5 +
> drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c | 5 +
> drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 5 +
> drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 5 +
> 7 files changed, 150 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index e96984c53e72..10d55755ee88 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -1588,6 +1588,94 @@ static ssize_t amdgpu_gfx_set_enforce_isolation(struct device *dev,
> return count;
> }
>
> +static ssize_t amdgpu_gfx_get_gfx_reset_mask(struct device *dev,
> + struct device_attribute *attr,
> + char *buf)
> +{
> + struct drm_device *ddev = dev_get_drvdata(dev);
> + struct amdgpu_device *adev = drm_to_adev(ddev);
> + ssize_t size = 0;
> + struct amdgpu_ring *ring = &adev->gfx.gfx_ring[0];
> +
> + if (!adev || !ring)
> + return -ENODEV;
> +
> + if (amdgpu_device_should_recover_gpu(adev))
> + size += sysfs_emit_at(buf, size, "full ");
> +
> + if (amdgpu_gpu_recovery && unlikely(!adev->debug_disable_soft_recovery)
> + && !amdgpu_sriov_vf(adev) && ring->funcs->soft_recovery)
> + size += sysfs_emit_at(buf, size, "soft ");
> +
If amdgpu_gpu_recovery is disabled, that check can be made once, before
creating the sysfs file itself. It doesn't have to be repeated here.
> + if (amdgpu_gpu_recovery && ring->funcs->reset) {
> + switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
> + case IP_VERSION(9, 2, 2): //reven2
> + case IP_VERSION(9, 3, 0): //renior
> + case IP_VERSION(9, 4, 0): //vega20
> + case IP_VERSION(10, 1, 0): //navi10
> + case IP_VERSION(10, 1, 1): //navi12
> + case IP_VERSION(10, 1, 2): //navi13
> + /* Skip flag setting because some cases
> + * are not supported by current firmware.
> + */
> + break;
> +
> + default:
> + size += sysfs_emit_at(buf, size, "queue ");
> + break;
> + }
> + }
This kind of version check is not good. Instead, initialize
supported_reset_types in the IP version files. As the compute example
below shows, support sometimes also depends on FW version or other checks,
not just on the existence of a callback implementation.
This function can then simply iterate over the type mask and print the text version.
Thanks,
Lijo
> +
> + size += sysfs_emit_at(buf, size, "\n");
> + return size;
> +}
> +
> +static ssize_t amdgpu_gfx_get_compute_reset_mask(struct device *dev,
> + struct device_attribute *attr,
> + char *buf)
> +{
> + struct drm_device *ddev = dev_get_drvdata(dev);
> + struct amdgpu_device *adev = drm_to_adev(ddev);
> + ssize_t size = 0;
> + struct amdgpu_ring *ring = &adev->gfx.compute_ring[0];
> +
> + if (!adev || !ring)
> + return -ENODEV;
> +
> + if (amdgpu_device_should_recover_gpu(adev))
> + size += sysfs_emit_at(buf, size, "full ");
> +
> + if (amdgpu_gpu_recovery && unlikely(!adev->debug_disable_soft_recovery)
> + && !amdgpu_sriov_vf(adev) && ring->funcs->soft_recovery)
> + size += sysfs_emit_at(buf, size, "soft ");
> +
> + if (amdgpu_gpu_recovery && ring->funcs->reset) {
> + switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
> + case IP_VERSION(9, 2, 2): //reven2
> + case IP_VERSION(9, 3, 0): //renior
> + case IP_VERSION(9, 4, 0): //vega20
> + case IP_VERSION(10, 1, 0): //navi10
> + case IP_VERSION(10, 1, 1): //navi12
> + case IP_VERSION(10, 1, 2): //navi13
> + /* Skip flag setting because some test cases
> + * are not supported by current firmware.
> + */
> + break;
> +
> + default:
> + size += sysfs_emit_at(buf, size, "queue ");
> + break;
> + }
> + }
> +
> + if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) &&
> + adev->gfx.mec_fw_version >= 0x0000009b)
> + size += sysfs_emit_at(buf, size, "pipe ");
> +
> + size += sysfs_emit_at(buf, size, "\n");
> + return size;
> +}
> +
> static DEVICE_ATTR(run_cleaner_shader, 0200,
> NULL, amdgpu_gfx_set_run_cleaner_shader);
>
> @@ -1602,6 +1690,12 @@ static DEVICE_ATTR(current_compute_partition, 0644,
> static DEVICE_ATTR(available_compute_partition, 0444,
> amdgpu_gfx_get_available_compute_partition, NULL);
>
> +static DEVICE_ATTR(gfx_reset_mask, 0444,
> + amdgpu_gfx_get_gfx_reset_mask, NULL);
> +
> +static DEVICE_ATTR(compute_reset_mask, 0444,
> + amdgpu_gfx_get_compute_reset_mask, NULL);
> +
> int amdgpu_gfx_sysfs_init(struct amdgpu_device *adev)
> {
> struct amdgpu_xcp_mgr *xcp_mgr = adev->xcp_mgr;
> @@ -1702,6 +1796,34 @@ void amdgpu_gfx_cleaner_shader_init(struct amdgpu_device *adev,
> cleaner_shader_size);
> }
>
> +int amdgpu_gfx_sysfs_reset_mask_init(struct amdgpu_device *adev)
> +{
> + int r = 0;
> +
> + if (adev->gfx.num_gfx_rings) {
> + r = device_create_file(adev->dev, &dev_attr_gfx_reset_mask);
> + if (r)
> + return r;
> + }
> +
> + if (adev->gfx.num_compute_rings) {
> + r = device_create_file(adev->dev, &dev_attr_compute_reset_mask);
> + if (r)
> + return r;
> + }
> +
> + return r;
> +}
> +
> +void amdgpu_gfx_sysfs_reset_mask_fini(struct amdgpu_device *adev)
> +{
> + if (adev->gfx.num_gfx_rings)
> + device_remove_file(adev->dev, &dev_attr_gfx_reset_mask);
> +
> + if (adev->gfx.num_compute_rings)
> + device_remove_file(adev->dev, &dev_attr_compute_reset_mask);
> +}
> +
> /**
> * amdgpu_gfx_kfd_sch_ctrl - Control the KFD scheduler from the KGD (Graphics Driver)
> * @adev: amdgpu_device pointer
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> index f710178a21bc..0cf2151b3cf4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> @@ -582,6 +582,8 @@ void amdgpu_gfx_sysfs_isolation_shader_fini(struct amdgpu_device *adev);
> void amdgpu_gfx_enforce_isolation_handler(struct work_struct *work);
> void amdgpu_gfx_enforce_isolation_ring_begin_use(struct amdgpu_ring *ring);
> void amdgpu_gfx_enforce_isolation_ring_end_use(struct amdgpu_ring *ring);
> +int amdgpu_gfx_sysfs_reset_mask_init(struct amdgpu_device *adev);
> +void amdgpu_gfx_sysfs_reset_mask_fini(struct amdgpu_device *adev);
>
> static inline const char *amdgpu_gfx_compute_mode_desc(int mode)
> {
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index 9da95b25e158..2baa76095232 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -4856,6 +4856,11 @@ static int gfx_v10_0_sw_init(struct amdgpu_ip_block *ip_block)
> r = amdgpu_gfx_sysfs_isolation_shader_init(adev);
> if (r)
> return r;
> +
> + r = amdgpu_gfx_sysfs_reset_mask_init(adev);
> + if (r)
> + return r;
> +
> return 0;
> }
>
> @@ -4908,6 +4913,7 @@ static int gfx_v10_0_sw_fini(struct amdgpu_ip_block *ip_block)
>
> gfx_v10_0_free_microcode(adev);
> amdgpu_gfx_sysfs_isolation_shader_fini(adev);
> + amdgpu_gfx_sysfs_reset_mask_fini(adev);
>
> kfree(adev->gfx.ip_dump_core);
> kfree(adev->gfx.ip_dump_compute_queues);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> index 5aff8f72de9c..32d14b9cc6e4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> @@ -1721,6 +1721,10 @@ static int gfx_v11_0_sw_init(struct amdgpu_ip_block *ip_block)
> if (r)
> return r;
>
> + r = amdgpu_gfx_sysfs_reset_mask_init(adev);
> + if (r)
> + return r;
> +
> return 0;
> }
>
> @@ -1783,6 +1787,7 @@ static int gfx_v11_0_sw_fini(struct amdgpu_ip_block *ip_block)
> gfx_v11_0_free_microcode(adev);
>
> amdgpu_gfx_sysfs_isolation_shader_fini(adev);
> + amdgpu_gfx_sysfs_reset_mask_fini(adev);
>
> kfree(adev->gfx.ip_dump_core);
> kfree(adev->gfx.ip_dump_compute_queues);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> index 9fec28d8a5fc..925b7ca49b2b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> @@ -1470,6 +1470,10 @@ static int gfx_v12_0_sw_init(struct amdgpu_ip_block *ip_block)
> if (r)
> return r;
>
> + r = amdgpu_gfx_sysfs_reset_mask_init(adev);
> + if (r)
> + return r;
> +
> return 0;
> }
>
> @@ -1530,6 +1534,7 @@ static int gfx_v12_0_sw_fini(struct amdgpu_ip_block *ip_block)
> gfx_v12_0_free_microcode(adev);
>
> amdgpu_gfx_sysfs_isolation_shader_fini(adev);
> + amdgpu_gfx_sysfs_reset_mask_fini(adev);
>
> kfree(adev->gfx.ip_dump_core);
> kfree(adev->gfx.ip_dump_compute_queues);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index b4c4b9916289..0de199c1cfdd 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -2394,6 +2394,10 @@ static int gfx_v9_0_sw_init(struct amdgpu_ip_block *ip_block)
> if (r)
> return r;
>
> + r = amdgpu_gfx_sysfs_reset_mask_init(adev);
> + if (r)
> + return r;
> +
> return 0;
> }
>
> @@ -2432,6 +2436,7 @@ static int gfx_v9_0_sw_fini(struct amdgpu_ip_block *ip_block)
> gfx_v9_0_free_microcode(adev);
>
> amdgpu_gfx_sysfs_isolation_shader_fini(adev);
> + amdgpu_gfx_sysfs_reset_mask_fini(adev);
>
> kfree(adev->gfx.ip_dump_core);
> kfree(adev->gfx.ip_dump_compute_queues);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> index 016290f00592..87cfd77e2fb4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> @@ -1175,6 +1175,10 @@ static int gfx_v9_4_3_sw_init(struct amdgpu_ip_block *ip_block)
> if (r)
> return r;
>
> + r = amdgpu_gfx_sysfs_reset_mask_init(adev);
> + if (r)
> + return r;
> +
> return 0;
> }
>
> @@ -1200,6 +1204,7 @@ static int gfx_v9_4_3_sw_fini(struct amdgpu_ip_block *ip_block)
> gfx_v9_4_3_free_microcode(adev);
> amdgpu_gfx_sysfs_fini(adev);
> amdgpu_gfx_sysfs_isolation_shader_fini(adev);
> + amdgpu_gfx_sysfs_reset_mask_fini(adev);
>
> kfree(adev->gfx.ip_dump_core);
> kfree(adev->gfx.ip_dump_compute_queues);
More information about the amd-gfx
mailing list