[PATCH 1/5] drm/amdgpu: Add sysfs interface for gc reset mask

Christian König christian.koenig at amd.com
Tue Oct 22 09:52:28 UTC 2024


Am 22.10.24 um 09:59 schrieb Jesse.zhang at amd.com:
> Add two sysfs interfaces for gfx and compute:
> gfx_reset_mask
> compute_reset_mask
>
> These interfaces are read-only and show the resets supported by the IP.
> For example, full adapter reset (mode1/mode2/BACO/etc),
> soft reset, queue reset, and pipe reset.
>
> Signed-off-by: Jesse Zhang <Jesse.Zhang at amd.com>
> Suggested-by:Alex Deucher <alexander.deucher at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h     |   6 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 122 ++++++++++++++++++++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h |   2 +
>   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c  |   6 ++
>   drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c  |   5 +
>   drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c  |   5 +
>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c   |   5 +
>   drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c |   5 +
>   8 files changed, 156 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 48c9b9b06905..0dd475c30267 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -300,6 +300,12 @@ extern int amdgpu_wbrf;
>   #define AMDGPU_RESET_VCE			(1 << 13)
>   #define AMDGPU_RESET_VCE1			(1 << 14)
>   
> +/* reset mask */
> +#define AMDGPU_RESET_TYPE_FULL (1 << 0) /* full adapter reset, mode1/mode2/BACO/etc. */
> +#define AMDGPU_RESET_TYPE_SOFT_RESET (1 << 1) /* IP level soft reset */
> +#define AMDGPU_RESET_TYPE_PER_QUEUE (1 << 2) /* per queue */
> +#define AMDGPU_RESET_TYPE_PER_PIPE (1 << 3) /* per pipe */

The general approach looks good to me, but I would really prefer that
the sysfs node return a text string instead of a bitmask of flags,
e.g. "soft,queue,pipe,full".

That's easier to extend should we at some point get something more
complex, and nobody needs any specialized knowledge to decode it.

Regards,
Christian.

> +
>   /* max cursor sizes (in pixels) */
>   #define CIK_CURSOR_WIDTH 128
>   #define CIK_CURSOR_HEIGHT 128
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index e96984c53e72..b4706355ece8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -1588,6 +1588,94 @@ static ssize_t amdgpu_gfx_set_enforce_isolation(struct device *dev,
>   	return count;
>   }
>   
> +static ssize_t amdgpu_gfx_get_gfx_reset_mask(struct device *dev,
> +						struct device_attribute *attr,
> +						char *buf)
> +{
> +	struct drm_device *ddev = dev_get_drvdata(dev);
> +	struct amdgpu_device *adev = drm_to_adev(ddev);
> +	ssize_t size = 0;
> +	struct amdgpu_ring *ring = &adev->gfx.gfx_ring[0];
> +
> +	if (!adev || !ring)
> +		return -ENODEV;
> +
> +	if (amdgpu_device_should_recover_gpu(adev))
> +		size |= AMDGPU_RESET_TYPE_FULL;
> +
> +	if (amdgpu_gpu_recovery && unlikely(!adev->debug_disable_soft_recovery)
> +			&& !amdgpu_sriov_vf(adev) && ring->funcs->soft_recovery)
> +		size |= AMDGPU_RESET_TYPE_SOFT_RESET;
> +
> +	if (amdgpu_gpu_recovery && ring->funcs->reset) {
> +                switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
> +                case IP_VERSION(9, 2, 2): //reven2
> +                case IP_VERSION(9, 3, 0): //renior
> +                case IP_VERSION(9, 4, 0): //vega20
> +                case IP_VERSION(10, 1, 0): //navi10
> +                case IP_VERSION(10, 1, 1): //navi12
> +                case IP_VERSION(10, 1, 2): //navi13
> +                        /* Skip flag setting because some cases
> +                         * are not supported by current firmware.
> +                         */
> +                        break;
> +
> +                default:
> +                        size |= AMDGPU_RESET_TYPE_PER_QUEUE;
> +                        break;
> +		}
> +        }
> +
> +	size = sysfs_emit_at(buf, 0, "%lu\n", size);
> +	return size;
> +}
> +
> +static ssize_t amdgpu_gfx_get_compute_reset_mask(struct device *dev,
> +						struct device_attribute *attr,
> +						char *buf)
> +{
> +	struct drm_device *ddev = dev_get_drvdata(dev);
> +	struct amdgpu_device *adev = drm_to_adev(ddev);
> +	ssize_t size = 0;
> +	struct amdgpu_ring *ring = &adev->gfx.compute_ring[0];
> +
> +	if (!adev || !ring)
> +		return -ENODEV;
> +
> +	if (amdgpu_device_should_recover_gpu(adev))
> +		size |= AMDGPU_RESET_TYPE_FULL;
> +
> +	if (amdgpu_gpu_recovery && unlikely(!adev->debug_disable_soft_recovery)
> +			&& !amdgpu_sriov_vf(adev) && ring->funcs->soft_recovery)
> +		size |= AMDGPU_RESET_TYPE_SOFT_RESET;
> +
> +	if (amdgpu_gpu_recovery && ring->funcs->reset) {
> +                switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
> +                case IP_VERSION(9, 2, 2): //reven2
> +                case IP_VERSION(9, 3, 0): //renior
> +                case IP_VERSION(9, 4, 0): //vega20
> +                case IP_VERSION(10, 1, 0): //navi10
> +                case IP_VERSION(10, 1, 1): //navi12
> +                case IP_VERSION(10, 1, 2): //navi13
> +                        /* Skip flag setting because some test cases
> +                         * are not supported by current firmware.
> +                         */
> +                        break;
> +
> +                default:
> +                        size |= AMDGPU_RESET_TYPE_PER_QUEUE;
> +                        break;
> +		}
> +        }
> +
> +	if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) &&
> +			adev->gfx.mec_fw_version >= 0x0000009b)
> +		size |= AMDGPU_RESET_TYPE_PER_PIPE;
> +
> +	size = sysfs_emit_at(buf, 0, "%lu\n", size);
> +	return size;
> +}
> +
>   static DEVICE_ATTR(run_cleaner_shader, 0200,
>   		   NULL, amdgpu_gfx_set_run_cleaner_shader);
>   
> @@ -1602,6 +1690,12 @@ static DEVICE_ATTR(current_compute_partition, 0644,
>   static DEVICE_ATTR(available_compute_partition, 0444,
>   		   amdgpu_gfx_get_available_compute_partition, NULL);
>   
> +static DEVICE_ATTR(gfx_reset_mask, 0444,
> +		   amdgpu_gfx_get_gfx_reset_mask, NULL);
> +
> +static DEVICE_ATTR(compute_reset_mask, 0444,
> +		   amdgpu_gfx_get_compute_reset_mask, NULL);
> +
>   int amdgpu_gfx_sysfs_init(struct amdgpu_device *adev)
>   {
>   	struct amdgpu_xcp_mgr *xcp_mgr = adev->xcp_mgr;
> @@ -1702,6 +1796,34 @@ void amdgpu_gfx_cleaner_shader_init(struct amdgpu_device *adev,
>   			    cleaner_shader_size);
>   }
>   
> +int amdgpu_gfx_sysfs_reset_mask_init(struct amdgpu_device *adev)
> +{
> +	int r = 0;
> +
> +	if (adev->gfx.num_gfx_rings) {
> +		r = device_create_file(adev->dev, &dev_attr_gfx_reset_mask);
> +		if (r)
> +			return r;
> +	}
> +
> +	if (adev->gfx.num_compute_rings) {
> +		r = device_create_file(adev->dev, &dev_attr_compute_reset_mask);
> +		if (r)
> +			return r;
> +	}
> +
> +	return r;
> +}
> +
> +void amdgpu_gfx_sysfs_reset_mask_fini(struct amdgpu_device *adev)
> +{
> +	if (adev->gfx.num_gfx_rings)
> +		device_remove_file(adev->dev, &dev_attr_gfx_reset_mask);
> +
> +	if (adev->gfx.num_compute_rings)
> +		device_remove_file(adev->dev, &dev_attr_compute_reset_mask);
> +}
> +
>   /**
>    * amdgpu_gfx_kfd_sch_ctrl - Control the KFD scheduler from the KGD (Graphics Driver)
>    * @adev: amdgpu_device pointer
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> index f710178a21bc..0cf2151b3cf4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> @@ -582,6 +582,8 @@ void amdgpu_gfx_sysfs_isolation_shader_fini(struct amdgpu_device *adev);
>   void amdgpu_gfx_enforce_isolation_handler(struct work_struct *work);
>   void amdgpu_gfx_enforce_isolation_ring_begin_use(struct amdgpu_ring *ring);
>   void amdgpu_gfx_enforce_isolation_ring_end_use(struct amdgpu_ring *ring);
> +int amdgpu_gfx_sysfs_reset_mask_init(struct amdgpu_device *adev);
> +void amdgpu_gfx_sysfs_reset_mask_fini(struct amdgpu_device *adev);
>   
>   static inline const char *amdgpu_gfx_compute_mode_desc(int mode)
>   {
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index 9da95b25e158..2baa76095232 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -4856,6 +4856,11 @@ static int gfx_v10_0_sw_init(struct amdgpu_ip_block *ip_block)
>   	r = amdgpu_gfx_sysfs_isolation_shader_init(adev);
>   	if (r)
>   		return r;
> +
> +	r = amdgpu_gfx_sysfs_reset_mask_init(adev);
> +	if (r)
> +		return r;
> +
>   	return 0;
>   }
>   
> @@ -4908,6 +4913,7 @@ static int gfx_v10_0_sw_fini(struct amdgpu_ip_block *ip_block)
>   
>   	gfx_v10_0_free_microcode(adev);
>   	amdgpu_gfx_sysfs_isolation_shader_fini(adev);
> +	amdgpu_gfx_sysfs_reset_mask_fini(adev);
>   
>   	kfree(adev->gfx.ip_dump_core);
>   	kfree(adev->gfx.ip_dump_compute_queues);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> index 5aff8f72de9c..32d14b9cc6e4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> @@ -1721,6 +1721,10 @@ static int gfx_v11_0_sw_init(struct amdgpu_ip_block *ip_block)
>   	if (r)
>   		return r;
>   
> +	r = amdgpu_gfx_sysfs_reset_mask_init(adev);
> +	if (r)
> +		return r;
> +
>   	return 0;
>   }
>   
> @@ -1783,6 +1787,7 @@ static int gfx_v11_0_sw_fini(struct amdgpu_ip_block *ip_block)
>   	gfx_v11_0_free_microcode(adev);
>   
>   	amdgpu_gfx_sysfs_isolation_shader_fini(adev);
> +	amdgpu_gfx_sysfs_reset_mask_fini(adev);
>   
>   	kfree(adev->gfx.ip_dump_core);
>   	kfree(adev->gfx.ip_dump_compute_queues);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> index 9fec28d8a5fc..925b7ca49b2b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> @@ -1470,6 +1470,10 @@ static int gfx_v12_0_sw_init(struct amdgpu_ip_block *ip_block)
>   	if (r)
>   		return r;
>   
> +	r = amdgpu_gfx_sysfs_reset_mask_init(adev);
> +	if (r)
> +		return r;
> +
>   	return 0;
>   }
>   
> @@ -1530,6 +1534,7 @@ static int gfx_v12_0_sw_fini(struct amdgpu_ip_block *ip_block)
>   	gfx_v12_0_free_microcode(adev);
>   
>   	amdgpu_gfx_sysfs_isolation_shader_fini(adev);
> +	amdgpu_gfx_sysfs_reset_mask_fini(adev);
>   
>   	kfree(adev->gfx.ip_dump_core);
>   	kfree(adev->gfx.ip_dump_compute_queues);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index b4c4b9916289..0de199c1cfdd 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -2394,6 +2394,10 @@ static int gfx_v9_0_sw_init(struct amdgpu_ip_block *ip_block)
>   	if (r)
>   		return r;
>   
> +	r = amdgpu_gfx_sysfs_reset_mask_init(adev);
> +	if (r)
> +		return r;
> +
>   	return 0;
>   }
>   
> @@ -2432,6 +2436,7 @@ static int gfx_v9_0_sw_fini(struct amdgpu_ip_block *ip_block)
>   	gfx_v9_0_free_microcode(adev);
>   
>   	amdgpu_gfx_sysfs_isolation_shader_fini(adev);
> +	amdgpu_gfx_sysfs_reset_mask_fini(adev);
>   
>   	kfree(adev->gfx.ip_dump_core);
>   	kfree(adev->gfx.ip_dump_compute_queues);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> index 016290f00592..87cfd77e2fb4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> @@ -1175,6 +1175,10 @@ static int gfx_v9_4_3_sw_init(struct amdgpu_ip_block *ip_block)
>   	if (r)
>   		return r;
>   
> +	r = amdgpu_gfx_sysfs_reset_mask_init(adev);
> +	if (r)
> +		return r;
> +
>   	return 0;
>   }
>   
> @@ -1200,6 +1204,7 @@ static int gfx_v9_4_3_sw_fini(struct amdgpu_ip_block *ip_block)
>   	gfx_v9_4_3_free_microcode(adev);
>   	amdgpu_gfx_sysfs_fini(adev);
>   	amdgpu_gfx_sysfs_isolation_shader_fini(adev);
> +	amdgpu_gfx_sysfs_reset_mask_fini(adev);
>   
>   	kfree(adev->gfx.ip_dump_core);
>   	kfree(adev->gfx.ip_dump_compute_queues);



More information about the amd-gfx mailing list