[PATCH v2 1/2] drm/amdgpu: separate ras irq from vcn instance irq for UVD_POISON

Zhang, Horatio Hongkun.Zhang at amd.com
Mon May 15 07:04:40 UTC 2023


[AMD Official Use Only - General]

Thank you for your suggestion, I will split these two patches in the next version.

Regards,
Horatio

-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1 at amd.com> 
Sent: Monday, May 15, 2023 2:38 PM
To: Zhang, Horatio <Hongkun.Zhang at amd.com>; amd-gfx at lists.freedesktop.org
Cc: Zhang, Hawking <Hawking.Zhang at amd.com>; Xu, Feifei <Feifei.Xu at amd.com>; Liu, Leo <Leo.Liu at amd.com>; Jiang, Sonny <Sonny.Jiang at amd.com>; Limonciello, Mario <Mario.Limonciello at amd.com>; Liu, HaoPing (Alan) <HaoPing.Liu at amd.com>; Zhou, Bob <Bob.Zhou at amd.com>; Zhang, Horatio <Hongkun.Zhang at amd.com>; Zhang, Hawking <Hawking.Zhang at amd.com>
Subject: RE: [PATCH v2 1/2] drm/amdgpu: separate ras irq from vcn instance irq for UVD_POISON

[AMD Official Use Only - General]

The code is fine with me, but it's better to split the patch into three parts, one is for common vcn code, one is for vcn 2.6 and the third one is for vcn 4.0.

Regards,
Tao

> -----Original Message-----
> From: Horatio Zhang <Hongkun.Zhang at amd.com>
> Sent: Monday, May 15, 2023 10:28 AM
> To: amd-gfx at lists.freedesktop.org
> Cc: Zhang, Hawking <Hawking.Zhang at amd.com>; Zhou1, Tao 
> <Tao.Zhou1 at amd.com>; Xu, Feifei <Feifei.Xu at amd.com>; Liu, Leo 
> <Leo.Liu at amd.com>; Jiang, Sonny <Sonny.Jiang at amd.com>; Limonciello, 
> Mario <Mario.Limonciello at amd.com>; Liu, HaoPing (Alan) 
> <HaoPing.Liu at amd.com>; Zhou, Bob <Bob.Zhou at amd.com>; Zhang, Horatio 
> <Hongkun.Zhang at amd.com>; Zhang, Hawking <Hawking.Zhang at amd.com>
> Subject: [PATCH v2 1/2] drm/amdgpu: separate ras irq from vcn instance 
> irq for UVD_POISON
> 
> Separate RAS poison consumption handling from the instance irq, and 
> register dedicated ras_poison_irq src and funcs for UVD_POISON. Fix 
> the amdgpu_irq_put call trace in vcn_v4_0_hw_fini.
> 
> [   44.563572] RIP: 0010:amdgpu_irq_put+0xa4/0xc0 [amdgpu]
> [   44.563629] RSP: 0018:ffffb36740edfc90 EFLAGS: 00010246
> [   44.563630] RAX: 0000000000000000 RBX: 0000000000000001 RCX:
> 0000000000000000
> [   44.563630] RDX: 0000000000000000 RSI: 0000000000000000 RDI:
> 0000000000000000
> [   44.563631] RBP: ffffb36740edfcb0 R08: 0000000000000000 R09:
> 0000000000000000
> [   44.563631] R10: 0000000000000000 R11: 0000000000000000 R12:
> ffff954c568e2ea8
> [   44.563631] R13: 0000000000000000 R14: ffff954c568c0000 R15:
> ffff954c568e2ea8
> [   44.563632] FS:  0000000000000000(0000) GS:ffff954f584c0000(0000)
> knlGS:0000000000000000
> [   44.563632] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [   44.563633] CR2: 00007f028741ba70 CR3: 000000026ca10000 CR4:
> 0000000000750ee0
> [   44.563633] PKRU: 55555554
> [   44.563633] Call Trace:
> [   44.563634]  <TASK>
> [   44.563634]  vcn_v4_0_hw_fini+0x62/0x160 [amdgpu]
> [   44.563700]  vcn_v4_0_suspend+0x13/0x30 [amdgpu]
> [   44.563755]  amdgpu_device_ip_suspend_phase2+0x240/0x470 [amdgpu]
> [   44.563806]  amdgpu_device_ip_suspend+0x41/0x80 [amdgpu]
> [   44.563858]  amdgpu_device_pre_asic_reset+0xd9/0x4a0 [amdgpu]
> [   44.563909]  amdgpu_device_gpu_recover.cold+0x548/0xcf1 [amdgpu]
> [   44.564006]  amdgpu_debugfs_reset_work+0x4c/0x80 [amdgpu]
> [   44.564061]  process_one_work+0x21f/0x400
> [   44.564062]  worker_thread+0x200/0x3f0
> [   44.564063]  ? process_one_work+0x400/0x400
> [   44.564064]  kthread+0xee/0x120
> [   44.564065]  ? kthread_complete_and_exit+0x20/0x20
> [   44.564066]  ret_from_fork+0x22/0x30
> 
> Suggested-by: Hawking Zhang <Hawking.Zhang at amd.com>
> Signed-off-by: Horatio Zhang <Hongkun.Zhang at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 27 ++++++++++++++++++- 
> drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h |  3 +++
>  drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c   | 24 ++++++++++++++---
>  drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c   | 35 ++++++++++++++++++++-----
>  4 files changed, 78 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> index e63fcc58e8e0..f53c22db8d25 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> @@ -1181,6 +1181,31 @@ int amdgpu_vcn_process_poison_irq(struct
> amdgpu_device *adev,
>  	return 0;
>  }
> 
> +int amdgpu_vcn_ras_late_init(struct amdgpu_device *adev, struct 
> +ras_common_if *ras_block) {
> +	int r, i;
> +
> +	r = amdgpu_ras_block_late_init(adev, ras_block);
> +	if (r)
> +		return r;
> +
> +	if (amdgpu_ras_is_supported(adev, ras_block->block)) {
> +		for (i = 0; i < adev->vcn.num_vcn_inst; i++) {
> +			if (adev->vcn.harvest_config & (1 << i))
> +				continue;
> +
> +			r = amdgpu_irq_get(adev, &adev-
> >vcn.inst[i].ras_poison_irq, 0);
> +			if (r)
> +				goto late_fini;
> +		}
> +	}
> +	return 0;
> +
> +late_fini:
> +	amdgpu_ras_block_late_fini(adev, ras_block);
> +	return r;
> +}
> +
>  int amdgpu_vcn_ras_sw_init(struct amdgpu_device *adev)  {
>  	int err;
> @@ -1202,7 +1227,7 @@ int amdgpu_vcn_ras_sw_init(struct amdgpu_device
> *adev)
>  	adev->vcn.ras_if = &ras->ras_block.ras_comm;
> 
>  	if (!ras->ras_block.ras_late_init)
> -		ras->ras_block.ras_late_init = amdgpu_ras_block_late_init;
> +		ras->ras_block.ras_late_init = amdgpu_vcn_ras_late_init;
> 
>  	return 0;
>  }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> index c730949ece7d..802d4c2edb41 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> @@ -234,6 +234,7 @@ struct amdgpu_vcn_inst {
>  	struct amdgpu_ring	ring_enc[AMDGPU_VCN_MAX_ENC_RINGS];
>  	atomic_t		sched_score;
>  	struct amdgpu_irq_src	irq;
> +	struct amdgpu_irq_src	ras_poison_irq;
>  	struct amdgpu_vcn_reg	external;
>  	struct amdgpu_bo	*dpg_sram_bo;
>  	struct dpg_pause_state	pause_state;
> @@ -400,6 +401,8 @@ void amdgpu_debugfs_vcn_fwlog_init(struct
> amdgpu_device *adev,  int amdgpu_vcn_process_poison_irq(struct
> amdgpu_device *adev,
>  			struct amdgpu_irq_src *source,
>  			struct amdgpu_iv_entry *entry);
> +int amdgpu_vcn_ras_late_init(struct amdgpu_device *adev,
> +			struct ras_common_if *ras_block);
>  int amdgpu_vcn_ras_sw_init(struct amdgpu_device *adev);
> 
>  #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
> b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
> index ab0b45d0ead1..44262d67b3ff 100644
> --- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
> +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
> @@ -143,7 +143,7 @@ static int vcn_v2_5_sw_init(void *handle)
> 
>  		/* VCN POISON TRAP */
>  		r = amdgpu_irq_add_id(adev, amdgpu_ih_clientid_vcns[j],
> -			VCN_2_6__SRCID_UVD_POISON, &adev-
> >vcn.inst[j].irq);
> +			VCN_2_6__SRCID_UVD_POISON, &adev-
> >vcn.inst[j].ras_poison_irq);
>  		if (r)
>  			return r;
>  	}
> @@ -354,6 +354,9 @@ static int vcn_v2_5_hw_fini(void *handle)
>  		    (adev->vcn.cur_state != AMD_PG_STATE_GATE &&
>  		     RREG32_SOC15(VCN, i, mmUVD_STATUS)))
>  			vcn_v2_5_set_powergating_state(adev,
> AMD_PG_STATE_GATE);
> +
> +		if (amdgpu_ras_is_supported(adev,
> AMDGPU_RAS_BLOCK__VCN))
> +			amdgpu_irq_put(adev, &adev-
> >vcn.inst[i].ras_poison_irq, 0);
>  	}
> 
>  	return 0;
> @@ -1807,6 +1810,14 @@ static int vcn_v2_5_set_interrupt_state(struct
> amdgpu_device *adev,
>  	return 0;
>  }
> 
> +static int vcn_v2_6_set_ras_interrupt_state(struct amdgpu_device *adev,
> +					struct amdgpu_irq_src *source,
> +					unsigned int type,
> +					enum amdgpu_interrupt_state state) {
> +	return 0;
> +}
> +
>  static int vcn_v2_5_process_interrupt(struct amdgpu_device *adev,
>  				      struct amdgpu_irq_src *source,
>  				      struct amdgpu_iv_entry *entry) @@ -1837,9
> +1848,6 @@ static int vcn_v2_5_process_interrupt(struct amdgpu_device 
> +*adev,
>  	case VCN_2_0__SRCID__UVD_ENC_LOW_LATENCY:
>  		amdgpu_fence_process(&adev-
> >vcn.inst[ip_instance].ring_enc[1]);
>  		break;
> -	case VCN_2_6__SRCID_UVD_POISON:
> -		amdgpu_vcn_process_poison_irq(adev, source, entry);
> -		break;
>  	default:
>  		DRM_ERROR("Unhandled interrupt: %d %d\n",
>  			  entry->src_id, entry->src_data[0]); @@ -1854,6
> +1862,11 @@ static const struct amdgpu_irq_src_funcs 
> +vcn_v2_5_irq_funcs = {
>  	.process = vcn_v2_5_process_interrupt,  };
> 
> +static const struct amdgpu_irq_src_funcs vcn_v2_6_ras_irq_funcs = {
> +	.set = vcn_v2_6_set_ras_interrupt_state,
> +	.process = amdgpu_vcn_process_poison_irq, };
> +
>  static void vcn_v2_5_set_irq_funcs(struct amdgpu_device *adev)  {
>  	int i;
> @@ -1863,6 +1876,9 @@ static void vcn_v2_5_set_irq_funcs(struct 
> amdgpu_device *adev)
>  			continue;
>  		adev->vcn.inst[i].irq.num_types = adev->vcn.num_enc_rings + 1;
>  		adev->vcn.inst[i].irq.funcs = &vcn_v2_5_irq_funcs;
> +
> +		adev->vcn.inst[i].ras_poison_irq.num_types = adev-
> >vcn.num_enc_rings + 1;
> +		adev->vcn.inst[i].ras_poison_irq.funcs =
> &vcn_v2_6_ras_irq_funcs;
>  	}
>  }
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
> b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
> index bf0674039598..1dfc7cee6d59 100644
> --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
> @@ -139,7 +139,7 @@ static int vcn_v4_0_sw_init(void *handle)
> 
>  		/* VCN POISON TRAP */
>  		r = amdgpu_irq_add_id(adev, amdgpu_ih_clientid_vcns[i],
> -				VCN_4_0__SRCID_UVD_POISON, &adev-
> >vcn.inst[i].irq);
> +				VCN_4_0__SRCID_UVD_POISON, &adev-
> >vcn.inst[i].ras_poison_irq);
>  		if (r)
>  			return r;
> 
> @@ -305,8 +305,8 @@ static int vcn_v4_0_hw_fini(void *handle)
>                          vcn_v4_0_set_powergating_state(adev, AMD_PG_STATE_GATE);
>  			}
>  		}
> -
> -		amdgpu_irq_put(adev, &adev->vcn.inst[i].irq, 0);
> +		if (amdgpu_ras_is_supported(adev,
> AMDGPU_RAS_BLOCK__VCN))
> +			amdgpu_irq_put(adev, &adev-
> >vcn.inst[i].ras_poison_irq, 0);
>  	}
> 
>  	return 0;
> @@ -1975,6 +1975,24 @@ static int vcn_v4_0_set_interrupt_state(struct
> amdgpu_device *adev, struct amdgp
>  	return 0;
>  }
> 
> +/**
> + * vcn_v4_0_set_ras_interrupt_state - set VCN block RAS interrupt 
> +state
> + *
> + * @adev: amdgpu_device pointer
> + * @source: interrupt sources
> + * @type: interrupt types
> + * @state: interrupt states
> + *
> + * Set VCN block RAS interrupt state
> + */
> +static int vcn_v4_0_set_ras_interrupt_state(struct amdgpu_device *adev,
> +	struct amdgpu_irq_src *source,
> +	unsigned int type,
> +	enum amdgpu_interrupt_state state)
> +{
> +	return 0;
> +}
> +
>  /**
>   * vcn_v4_0_process_interrupt - process VCN block interrupt
>   *
> @@ -2007,9 +2025,6 @@ static int vcn_v4_0_process_interrupt(struct 
> amdgpu_device *adev, struct amdgpu_
>  	case VCN_4_0__SRCID__UVD_ENC_GENERAL_PURPOSE:
>  		amdgpu_fence_process(&adev-
> >vcn.inst[ip_instance].ring_enc[0]);
>  		break;
> -	case VCN_4_0__SRCID_UVD_POISON:
> -		amdgpu_vcn_process_poison_irq(adev, source, entry);
> -		break;
>  	default:
>  		DRM_ERROR("Unhandled interrupt: %d %d\n",
>  			  entry->src_id, entry->src_data[0]); @@ -2024,6
> +2039,11 @@ static const struct amdgpu_irq_src_funcs 
> +vcn_v4_0_irq_funcs = {
>  	.process = vcn_v4_0_process_interrupt,  };
> 
> +static const struct amdgpu_irq_src_funcs vcn_v4_0_ras_irq_funcs = {
> +	.set = vcn_v4_0_set_ras_interrupt_state,
> +	.process = amdgpu_vcn_process_poison_irq, };
> +
>  /**
>   * vcn_v4_0_set_irq_funcs - set VCN block interrupt irq functions
>   *
> @@ -2041,6 +2061,9 @@ static void vcn_v4_0_set_irq_funcs(struct 
> amdgpu_device *adev)
> 
>  		adev->vcn.inst[i].irq.num_types = adev->vcn.num_enc_rings + 1;
>  		adev->vcn.inst[i].irq.funcs = &vcn_v4_0_irq_funcs;
> +
> +		adev->vcn.inst[i].ras_poison_irq.num_types = adev-
> >vcn.num_enc_rings + 1;
> +		adev->vcn.inst[i].ras_poison_irq.funcs =
> &vcn_v4_0_ras_irq_funcs;
>  	}
>  }
> 
> --
> 2.34.1


More information about the amd-gfx mailing list