[PATCH 2/7] drm/amdgpu: Avoid accessing HW when suspending SW state

Luben Tuikov luben.tuikov at amd.com
Sat Aug 29 02:03:24 UTC 2020


On 2020-08-26 10:46, Andrey Grodzovsky wrote:
> At this point the ASIC is already post reset by the HW/PSP
> so the HW not in proper state to be configured for suspension,
> some bloks might be even gated and so best is to avoid touching it.

"blocks"

> 
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  1 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 37 ++++++++++++++++++++++++++----
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c    |  6 +++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c    |  6 +++++
>  drivers/gpu/drm/amd/amdgpu/atom.c          |  1 +
>  drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c     | 18 +++++++++------
>  drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c  |  3 +++
>  drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c     |  3 +++
>  8 files changed, 64 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 3399242..3489622 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -992,6 +992,7 @@ struct amdgpu_device {
>  	atomic_t			throttling_logging_enabled;
>  	struct ratelimit_state		throttling_logging_rs;
>  	uint32_t			ras_features;
> +	bool                            in_dpc;
>  };

This is very cryptic and if we do that, in 10 years,
the code will be full of cryptic variables and comments.

Better to add a comment

	bool				in_dpc; /* PCI Downstream Port Containment */

>  
>  static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 84f8d14..e1bbefd 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -319,6 +319,9 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
>  {
>  	uint32_t ret;
>  
> +	if (adev->in_dpc)
> +		return 0;
> +
>  	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
>  		return amdgpu_kiq_rreg(adev, reg);
>  
> @@ -350,8 +353,10 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
>   *
>   * Returns the 8 bit value from the offset specified.
>   */
> -uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
> -{
> +uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) {
> +
> +	if (adev->in_dpc)
> +		return 0;
>  
>  	if (offset < adev->rmmio_size)
>  		return (readb(adev->rmmio + offset));
> @@ -373,8 +378,10 @@ uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
>   *
>   * Writes the value specified to the offset specified.
>   */
> -void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
> -{
> +void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) {
> +
> +	if (adev->in_dpc)
> +		return;
>  
>  	if (offset < adev->rmmio_size)
>  		writeb(value, adev->rmmio + offset);
> @@ -384,6 +391,8 @@ void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
>  
>  void static inline amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg, uint32_t v, uint32_t acc_flags)
>  {
> +	if (adev->in_dpc)
> +		return;
>  
>  	trace_amdgpu_mm_wreg(adev->pdev->device, reg, v);
>  
> @@ -412,6 +421,8 @@ void static inline amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg,
>  void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
>  		    uint32_t acc_flags)
>  {
> +	if (adev->in_dpc)
> +		return;
>  
>  	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
>  		return amdgpu_kiq_wreg(adev, reg, v);
> @@ -427,6 +438,9 @@ void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
>  void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
>  		    uint32_t acc_flags)
>  {
> +	if (adev->in_dpc)
> +		return;
> +
>  	if (amdgpu_sriov_fullaccess(adev) &&
>  		adev->gfx.rlc.funcs &&
>  		adev->gfx.rlc.funcs->is_rlcg_access_range) {
> @@ -448,6 +462,9 @@ void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, uint32_t reg, uint32_t
>   */
>  u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
>  {
> +	if (adev->in_dpc)
> +		return 0;
> +
>  	if ((reg * 4) < adev->rio_mem_size)
>  		return ioread32(adev->rio_mem + (reg * 4));
>  	else {
> @@ -467,6 +484,8 @@ u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
>   */
>  void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
>  {
> +	if (adev->in_dpc)
> +		return;
>  
>  	if ((reg * 4) < adev->rio_mem_size)
>  		iowrite32(v, adev->rio_mem + (reg * 4));
> @@ -487,6 +506,8 @@ void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
>   */
>  u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
>  {
> +	if (adev->in_dpc)
> +		return 0;
>  
>  	if (index < adev->doorbell.num_doorbells) {
>  		return readl(adev->doorbell.ptr + index);
> @@ -508,6 +529,8 @@ u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
>   */
>  void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
>  {
> +	if (adev->in_dpc)
> +		return;
>  
>  	if (index < adev->doorbell.num_doorbells) {
>  		writel(v, adev->doorbell.ptr + index);
> @@ -527,6 +550,8 @@ void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
>   */
>  u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
>  {
> +	if (adev->in_dpc)
> +		return 0;
>  
>  	if (index < adev->doorbell.num_doorbells) {
>  		return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
> @@ -548,6 +573,8 @@ u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
>   */
>  void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
>  {
> +	if (adev->in_dpc)
> +			return;
>  
>  	if (index < adev->doorbell.num_doorbells) {
>  		atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
> @@ -4790,7 +4817,9 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
>  
>  	pci_restore_state(pdev);
>  
> +	adev->in_dpc = true;
>  	r = amdgpu_device_ip_suspend(adev);
> +	adev->in_dpc = false;
>  	if (r)
>  		goto out;
>  
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index d698142..50c42c9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -693,6 +693,9 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>  	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>  	struct amdgpu_ring *ring = &kiq->ring;
>  
> +	if (adev->in_dpc)
> +		return 0;
> +
>  	BUG_ON(!ring->funcs->emit_rreg);
>  
>  	spin_lock_irqsave(&kiq->ring_lock, flags);
> @@ -757,6 +760,9 @@ void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
>  
>  	BUG_ON(!ring->funcs->emit_wreg);
>  
> +	if (adev->in_dpc)
> +		return;
> +

Don't know how I feel about such "architecture",
where we introduce a flag-type variable and check it
in a slew of places. If we did this a few times over,
we'd have a few flags which we check almost everywhere,
and it would make the code difficult to analyze.
For instance, which flag to check first? Or, how
are flags related to each other.

There has to be a better way, where we check
that flag only once before proceeding. A more
unified approach.

>  	spin_lock_irqsave(&kiq->ring_lock, flags);
>  	amdgpu_ring_alloc(ring, 32);
>  	amdgpu_ring_emit_wreg(ring, reg, v);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> index d6c38e2..555ef59 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> @@ -219,6 +219,9 @@ int psp_wait_for(struct psp_context *psp, uint32_t reg_index,
>  	int i;
>  	struct amdgpu_device *adev = psp->adev;
>  
> +	if (psp->adev->in_dpc)
> +		return 0;
> +
>  	for (i = 0; i < adev->usec_timeout; i++) {
>  		val = RREG32(reg_index);
>  		if (check_changed) {
> @@ -245,6 +248,9 @@ psp_cmd_submit_buf(struct psp_context *psp,
>  	bool ras_intr = false;
>  	bool skip_unsupport = false;
>  
> +	if (psp->adev->in_dpc)
> +		return 0;
> +
>  	mutex_lock(&psp->mutex);
>  
>  	memset(psp->cmd_buf_mem, 0, PSP_CMD_BUFFER_SIZE);
> diff --git a/drivers/gpu/drm/amd/amdgpu/atom.c b/drivers/gpu/drm/amd/amdgpu/atom.c
> index 4cfc786..613dac1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/atom.c
> +++ b/drivers/gpu/drm/amd/amdgpu/atom.c
> @@ -750,6 +750,7 @@ static void atom_op_jump(atom_exec_context *ctx, int *ptr, int arg)
>  					DRM_ERROR("atombios stuck in loop for more than %dsecs aborting\n",
>  						  ATOM_CMD_TIMEOUT_SEC);
>  					ctx->abort = true;
> +					dump_stack();
>  				}
>  			} else {
>  				/* jiffies wrap around we will just wait a little longer */
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index 2db195e..86f268e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -6980,15 +6980,19 @@ static int gfx_v10_0_hw_fini(void *handle)
>  
>  	amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0);
>  	amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0);
> +
> +	if (!adev->in_dpc) {
>  #ifndef BRING_UP_DEBUG
> -	if (amdgpu_async_gfx_ring) {
> -		r = gfx_v10_0_kiq_disable_kgq(adev);
> -		if (r)
> -			DRM_ERROR("KGQ disable failed\n");
> -	}
> +		if (amdgpu_async_gfx_ring) {
> +			r = gfx_v10_0_kiq_disable_kgq(adev);
> +			if (r)
> +				DRM_ERROR("KGQ disable failed\n");
> +		}
>  #endif
> -	if (amdgpu_gfx_disable_kcq(adev))
> -		DRM_ERROR("KCQ disable failed\n");
> +		if (amdgpu_gfx_disable_kcq(adev))
> +			DRM_ERROR("KCQ disable failed\n");
> +	}
> +
>  	if (amdgpu_sriov_vf(adev)) {
>  		gfx_v10_0_cp_gfx_enable(adev, false);
>  		/* Program KIQ position of RLC_CP_SCHEDULERS during destroy */
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> index 8462b30..306461d 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> @@ -224,9 +224,12 @@ int smu_dpm_set_power_gate(struct smu_context *smu, uint32_t block_type,
>  {
>  	int ret = 0;
>  
> +
>  	if (!smu->pm_enabled || !smu->adev->pm.dpm_enabled)
>  		return -EOPNOTSUPP;
>  
> +
> +

Unnecessary white space changes.

Regards,
Luben

>  	switch (block_type) {
>  	/*
>  	 * Some legacy code of amdgpu_vcn.c and vcn_v2*.c still uses
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> index a58ea08..02cf55e 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> @@ -112,6 +112,9 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context *smu,
>  	struct amdgpu_device *adev = smu->adev;
>  	int ret = 0, index = 0;
>  
> +	if (smu->adev->in_dpc)
> +		return 0;
> +
>  	index = smu_cmn_to_asic_specific_index(smu,
>  					       CMN2ASIC_MAPPING_MSG,
>  					       msg);
> 



More information about the amd-gfx mailing list