[RFC v2 01/15] drm/amdgpu: add helper functions to track status for ras manager
Lazar, Lijo
lijo.lazar at amd.com
Fri Jan 17 05:03:31 UTC 2025
On 1/13/2025 7:12 AM, Jiang Liu wrote:
> Add helper functions to track status for ras manager and ip blocks.
>
> Signed-off-by: Jiang Liu <gerry at linux.alibaba.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 38 +++++++++++++++++++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 37 ++++++++++++++++++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 10 +++++++
> 3 files changed, 85 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 5e55a44f9eef..f0f773659faf 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -377,12 +377,28 @@ int amdgpu_ip_block_resume(struct amdgpu_ip_block *ip_block);
>
> #define AMDGPU_MAX_IP_NUM 16
>
> +enum amdgpu_marker {
> + // Markers for IRQs, used for both ip blocks and ras blocks.
> + AMDGPU_MARKER_IRQ0 = 32,
> + AMDGPU_MARKER_IRQ1,
> + AMDGPU_MARKER_IRQ2,
> + AMDGPU_MARKER_IRQ3,
> + AMDGPU_MARKER_IRQ4,
> + AMDGPU_MARKER_IRQ5,
> + AMDGPU_MARKER_IRQ6,
> + AMDGPU_MARKER_IRQ7,
> + AMDGPU_MARKER_IRQ_MAX = 63,
> +};
> +
> +#define AMDGPU_MARKER_IRQ(idx) (AMDGPU_MARKER_IRQ0 + (idx))
> +
> struct amdgpu_ip_block_status {
> bool valid;
> bool sw;
> bool hw;
> bool late_initialized;
> bool hang;
> + uint64_t markers;
> };
>
Maintaining these fine-grained levels at the IP layer doesn't look like a
proper solution. An IP or RAS block either has its required IRQs enabled
or it has them disabled. Unwinding them needs to be tracked at the IRQ
object layer, not here.
Thanks,
Lijo
> struct amdgpu_ip_block_version {
> @@ -410,6 +426,28 @@ amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
> int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
> const struct amdgpu_ip_block_version *ip_block_version);
>
> +static inline void amdgpu_ip_block_set_marker(struct amdgpu_ip_block *ip_block,
> + enum amdgpu_marker marker)
> +{
> + WARN_ON(marker > 63);
> + WARN_ON(ip_block->status.markers & (0x1ull << marker));
> + ip_block->status.markers |= 0x1ull << (int)marker;
> +}
> +
> +static inline bool amdgpu_ip_block_test_and_clear_marker(struct amdgpu_ip_block *ip_block,
> + enum amdgpu_marker marker)
> +{
> + bool set = false;
> + uint64_t value = 0x1ull << (int)marker;
> +
> + if ((ip_block->status.markers & value) != 0) {
> + ip_block->status.markers &= ~value;
> + set = true;
> + }
> +
> + return set;
> +}
> +
> /*
> * BIOS.
> */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index f0924aa3f4e4..5e19d820ab34 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -5207,3 +5207,40 @@ bool amdgpu_ras_is_rma(struct amdgpu_device *adev)
>
> return con->is_rma;
> }
> +
> +bool amdgpu_ras_test_marker(struct amdgpu_device *adev,
> + struct ras_common_if *head, int marker)
> +{
> + struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
> +
> + if (obj && obj->markers & (0x1ull << marker))
> + return true;
> +
> + return false;
> +}
> +
> +void amdgpu_ras_set_marker(struct amdgpu_device *adev,
> + struct ras_common_if *head, int marker)
> +{
> + struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
> +
> + WARN_ON(marker > 63);
> + WARN_ON(obj->markers & (0x1ull << marker));
> + if (obj)
> + obj->markers |= 0x1ull << marker;
> +}
> +
> +bool amdgpu_ras_test_and_clear_marker(struct amdgpu_device *adev,
> + struct ras_common_if *head, int marker)
> +{
> + bool set = false;
> + uint64_t value = 0x1ull << marker;
> + struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
> +
> + if (obj && (obj->markers & value) != 0) {
> + obj->markers &= ~value;
> + set = true;
> + }
> +
> + return set;
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 82db986c36a0..35881087b17b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -634,6 +634,8 @@ struct ras_manager {
> struct ras_common_if head;
> /* reference count */
> int use;
> + /* Flags for status tracking */
> + uint64_t markers;
> /* ras block link */
> struct list_head node;
> /* the device */
> @@ -977,4 +979,12 @@ void amdgpu_ras_event_log_print(struct amdgpu_device *adev, u64 event_id,
> const char *fmt, ...);
>
> bool amdgpu_ras_is_rma(struct amdgpu_device *adev);
> +
> +bool amdgpu_ras_test_marker(struct amdgpu_device *adev,
> + struct ras_common_if *head, int marker);
> +void amdgpu_ras_set_marker(struct amdgpu_device *adev,
> + struct ras_common_if *head, int marker);
> +bool amdgpu_ras_test_and_clear_marker(struct amdgpu_device *adev,
> + struct ras_common_if *head,
> + int marker);
> #endif
More information about the amd-gfx
mailing list