[PATCH 1/5 V2] drm/amdgpu: Add sysfs interface for gc reset mask

Lazar, Lijo lijo.lazar at amd.com
Wed Oct 23 07:29:57 UTC 2024



On 10/23/2024 8:13 AM, Jesse.zhang at amd.com wrote:
> Add two sysfs interfaces for gfx and compute:
> gfx_reset_mask
> compute_reset_mask
> 
> These interfaces are read-only and show the resets supported by the IP.
> For example, full adapter reset (mode1/mode2/BACO/etc),
> soft reset, queue reset, and pipe reset.
> 
> V2: the sysfs node returns a text string instead of some flags (Christian)
> 
> Signed-off-by: Jesse Zhang <Jesse.Zhang at amd.com>
> Suggested-by:Alex Deucher <alexander.deucher at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 122 ++++++++++++++++++++++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h |   2 +
>  drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c  |   6 ++
>  drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c  |   5 +
>  drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c  |   5 +
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c   |   5 +
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c |   5 +
>  7 files changed, 150 insertions(+)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index e96984c53e72..10d55755ee88 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -1588,6 +1588,94 @@ static ssize_t amdgpu_gfx_set_enforce_isolation(struct device *dev,
>  	return count;
>  }
>  
> +static ssize_t amdgpu_gfx_get_gfx_reset_mask(struct device *dev,
> +						struct device_attribute *attr,
> +						char *buf)
> +{
> +	struct drm_device *ddev = dev_get_drvdata(dev);
> +	struct amdgpu_device *adev = drm_to_adev(ddev);
> +	ssize_t size = 0;
> +	struct amdgpu_ring *ring = &adev->gfx.gfx_ring[0];
> +
> +	if (!adev || !ring)
> +		return -ENODEV;
> +
> +	if (amdgpu_device_should_recover_gpu(adev))
> +		size += sysfs_emit_at(buf, size, "full ");
> +
> +	if (amdgpu_gpu_recovery && unlikely(!adev->debug_disable_soft_recovery)
> +			&& !amdgpu_sriov_vf(adev) && ring->funcs->soft_recovery)
> +		size += sysfs_emit_at(buf, size, "soft ");
> +

If amdgpu_gpu_recovery is disabled, that can be checked once, before
creating the sysfs file itself.  The check doesn't have to be repeated here.

> +	if (amdgpu_gpu_recovery && ring->funcs->reset) {
> +                switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
> +                case IP_VERSION(9, 2, 2): //reven2
> +                case IP_VERSION(9, 3, 0): //renior
> +                case IP_VERSION(9, 4, 0): //vega20
> +                case IP_VERSION(10, 1, 0): //navi10
> +                case IP_VERSION(10, 1, 1): //navi12
> +                case IP_VERSION(10, 1, 2): //navi13
> +                        /* Skip flag setting because some cases
> +                         * are not supported by current firmware.
> +                         */
> +                        break;
> +
> +                default:
> +			size += sysfs_emit_at(buf, size, "queue ");
> +                        break;
> +		}
> +        }

This kind of version check is not good. Instead, initialize
supported_reset_types in the IP version files. As in the compute example
below, support sometimes also depends on FW support or other checks, not
just on the existence of a callback implementation.

This function can then simply iterate over the type mask and print the text name of each supported reset type.

Thanks,
Lijo

> +
> +	size += sysfs_emit_at(buf, size, "\n");
> +	return size;
> +}
> +
> +static ssize_t amdgpu_gfx_get_compute_reset_mask(struct device *dev,
> +						struct device_attribute *attr,
> +						char *buf)
> +{
> +	struct drm_device *ddev = dev_get_drvdata(dev);
> +	struct amdgpu_device *adev = drm_to_adev(ddev);
> +	ssize_t size = 0;
> +	struct amdgpu_ring *ring = &adev->gfx.compute_ring[0];
> +
> +	if (!adev || !ring)
> +		return -ENODEV;
> +
> +	if (amdgpu_device_should_recover_gpu(adev))
> +		size += sysfs_emit_at(buf, size, "full ");
> +
> +	if (amdgpu_gpu_recovery && unlikely(!adev->debug_disable_soft_recovery)
> +			&& !amdgpu_sriov_vf(adev) && ring->funcs->soft_recovery)
> +		size += sysfs_emit_at(buf, size, "soft ");
> +
> +	if (amdgpu_gpu_recovery && ring->funcs->reset) {
> +                switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
> +                case IP_VERSION(9, 2, 2): //reven2
> +                case IP_VERSION(9, 3, 0): //renior
> +                case IP_VERSION(9, 4, 0): //vega20
> +                case IP_VERSION(10, 1, 0): //navi10
> +                case IP_VERSION(10, 1, 1): //navi12
> +                case IP_VERSION(10, 1, 2): //navi13
> +                        /* Skip flag setting because some test cases
> +                         * are not supported by current firmware.
> +                         */
> +                        break;
> +
> +                default:
> +			size += sysfs_emit_at(buf, size, "queue ");
> +                        break;
> +		}
> +        }
> +
> +	if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) &&
> +			adev->gfx.mec_fw_version >= 0x0000009b)
> +			size += sysfs_emit_at(buf, size, "pipe ");
> +
> +	size += sysfs_emit_at(buf, size, "\n");
> +	return size;
> +}
> +
>  static DEVICE_ATTR(run_cleaner_shader, 0200,
>  		   NULL, amdgpu_gfx_set_run_cleaner_shader);
>  
> @@ -1602,6 +1690,12 @@ static DEVICE_ATTR(current_compute_partition, 0644,
>  static DEVICE_ATTR(available_compute_partition, 0444,
>  		   amdgpu_gfx_get_available_compute_partition, NULL);
>  
> +static DEVICE_ATTR(gfx_reset_mask, 0444,
> +		   amdgpu_gfx_get_gfx_reset_mask, NULL);
> +
> +static DEVICE_ATTR(compute_reset_mask, 0444,
> +		   amdgpu_gfx_get_compute_reset_mask, NULL);
> +
>  int amdgpu_gfx_sysfs_init(struct amdgpu_device *adev)
>  {
>  	struct amdgpu_xcp_mgr *xcp_mgr = adev->xcp_mgr;
> @@ -1702,6 +1796,34 @@ void amdgpu_gfx_cleaner_shader_init(struct amdgpu_device *adev,
>  			    cleaner_shader_size);
>  }
>  
> +int amdgpu_gfx_sysfs_reset_mask_init(struct amdgpu_device *adev)
> +{
> +	int r = 0;
> +
> +	if (adev->gfx.num_gfx_rings) {
> +		r = device_create_file(adev->dev, &dev_attr_gfx_reset_mask);
> +		if (r)
> +			return r;
> +	}
> +
> +	if (adev->gfx.num_compute_rings) {
> +		r = device_create_file(adev->dev, &dev_attr_compute_reset_mask);
> +		if (r)
> +			return r;
> +	}
> +
> +	return r;
> +}
> +
> +void amdgpu_gfx_sysfs_reset_mask_fini(struct amdgpu_device *adev)
> +{
> +	if (adev->gfx.num_gfx_rings)
> +		device_remove_file(adev->dev, &dev_attr_gfx_reset_mask);
> +
> +	if (adev->gfx.num_compute_rings)
> +		device_remove_file(adev->dev, &dev_attr_compute_reset_mask);
> +}
> +
>  /**
>   * amdgpu_gfx_kfd_sch_ctrl - Control the KFD scheduler from the KGD (Graphics Driver)
>   * @adev: amdgpu_device pointer
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> index f710178a21bc..0cf2151b3cf4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> @@ -582,6 +582,8 @@ void amdgpu_gfx_sysfs_isolation_shader_fini(struct amdgpu_device *adev);
>  void amdgpu_gfx_enforce_isolation_handler(struct work_struct *work);
>  void amdgpu_gfx_enforce_isolation_ring_begin_use(struct amdgpu_ring *ring);
>  void amdgpu_gfx_enforce_isolation_ring_end_use(struct amdgpu_ring *ring);
> +int amdgpu_gfx_sysfs_reset_mask_init(struct amdgpu_device *adev);
> +void amdgpu_gfx_sysfs_reset_mask_fini(struct amdgpu_device *adev);
>  
>  static inline const char *amdgpu_gfx_compute_mode_desc(int mode)
>  {
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index 9da95b25e158..2baa76095232 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -4856,6 +4856,11 @@ static int gfx_v10_0_sw_init(struct amdgpu_ip_block *ip_block)
>  	r = amdgpu_gfx_sysfs_isolation_shader_init(adev);
>  	if (r)
>  		return r;
> +
> +	r = amdgpu_gfx_sysfs_reset_mask_init(adev);
> +	if (r)
> +		return r;
> +
>  	return 0;
>  }
>  
> @@ -4908,6 +4913,7 @@ static int gfx_v10_0_sw_fini(struct amdgpu_ip_block *ip_block)
>  
>  	gfx_v10_0_free_microcode(adev);
>  	amdgpu_gfx_sysfs_isolation_shader_fini(adev);
> +	amdgpu_gfx_sysfs_reset_mask_fini(adev);
>  
>  	kfree(adev->gfx.ip_dump_core);
>  	kfree(adev->gfx.ip_dump_compute_queues);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> index 5aff8f72de9c..32d14b9cc6e4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> @@ -1721,6 +1721,10 @@ static int gfx_v11_0_sw_init(struct amdgpu_ip_block *ip_block)
>  	if (r)
>  		return r;
>  
> +	r = amdgpu_gfx_sysfs_reset_mask_init(adev);
> +	if (r)
> +		return r;
> +
>  	return 0;
>  }
>  
> @@ -1783,6 +1787,7 @@ static int gfx_v11_0_sw_fini(struct amdgpu_ip_block *ip_block)
>  	gfx_v11_0_free_microcode(adev);
>  
>  	amdgpu_gfx_sysfs_isolation_shader_fini(adev);
> +	amdgpu_gfx_sysfs_reset_mask_fini(adev);
>  
>  	kfree(adev->gfx.ip_dump_core);
>  	kfree(adev->gfx.ip_dump_compute_queues);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> index 9fec28d8a5fc..925b7ca49b2b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> @@ -1470,6 +1470,10 @@ static int gfx_v12_0_sw_init(struct amdgpu_ip_block *ip_block)
>  	if (r)
>  		return r;
>  
> +	r = amdgpu_gfx_sysfs_reset_mask_init(adev);
> +	if (r)
> +		return r;
> +
>  	return 0;
>  }
>  
> @@ -1530,6 +1534,7 @@ static int gfx_v12_0_sw_fini(struct amdgpu_ip_block *ip_block)
>  	gfx_v12_0_free_microcode(adev);
>  
>  	amdgpu_gfx_sysfs_isolation_shader_fini(adev);
> +	amdgpu_gfx_sysfs_reset_mask_fini(adev);
>  
>  	kfree(adev->gfx.ip_dump_core);
>  	kfree(adev->gfx.ip_dump_compute_queues);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index b4c4b9916289..0de199c1cfdd 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -2394,6 +2394,10 @@ static int gfx_v9_0_sw_init(struct amdgpu_ip_block *ip_block)
>  	if (r)
>  		return r;
>  
> +	r = amdgpu_gfx_sysfs_reset_mask_init(adev);
> +	if (r)
> +		return r;
> +
>  	return 0;
>  }
>  
> @@ -2432,6 +2436,7 @@ static int gfx_v9_0_sw_fini(struct amdgpu_ip_block *ip_block)
>  	gfx_v9_0_free_microcode(adev);
>  
>  	amdgpu_gfx_sysfs_isolation_shader_fini(adev);
> +	amdgpu_gfx_sysfs_reset_mask_fini(adev);
>  
>  	kfree(adev->gfx.ip_dump_core);
>  	kfree(adev->gfx.ip_dump_compute_queues);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> index 016290f00592..87cfd77e2fb4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> @@ -1175,6 +1175,10 @@ static int gfx_v9_4_3_sw_init(struct amdgpu_ip_block *ip_block)
>  	if (r)
>  		return r;
>  
> +	r = amdgpu_gfx_sysfs_reset_mask_init(adev);
> +	if (r)
> +		return r;
> +
>  	return 0;
>  }
>  
> @@ -1200,6 +1204,7 @@ static int gfx_v9_4_3_sw_fini(struct amdgpu_ip_block *ip_block)
>  	gfx_v9_4_3_free_microcode(adev);
>  	amdgpu_gfx_sysfs_fini(adev);
>  	amdgpu_gfx_sysfs_isolation_shader_fini(adev);
> +	amdgpu_gfx_sysfs_reset_mask_fini(adev);
>  
>  	kfree(adev->gfx.ip_dump_core);
>  	kfree(adev->gfx.ip_dump_compute_queues);


More information about the amd-gfx mailing list