[PATCH 2/7] drm/amdgpu: Avoid accessing HW when suspending SW state
Andrey Grodzovsky
Andrey.Grodzovsky at amd.com
Mon Aug 31 14:58:36 UTC 2020
On 8/28/20 10:03 PM, Luben Tuikov wrote:
> On 2020-08-26 10:46, Andrey Grodzovsky wrote:
>> At this point the ASIC is already post reset by the HW/PSP
>> so the HW not in proper state to be configured for suspension,
>> some bloks might be even gated and so best is to avoid touching it.
> "blocks"
>
>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky at amd.com>
>> ---
>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 +
>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 37 ++++++++++++++++++++++++++----
>> drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 6 +++++
>> drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 6 +++++
>> drivers/gpu/drm/amd/amdgpu/atom.c | 1 +
>> drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 18 +++++++++------
>> drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 3 +++
>> drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c | 3 +++
>> 8 files changed, 64 insertions(+), 11 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index 3399242..3489622 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -992,6 +992,7 @@ struct amdgpu_device {
>> atomic_t throttling_logging_enabled;
>> struct ratelimit_state throttling_logging_rs;
>> uint32_t ras_features;
>> + bool in_dpc;
>> };
> This is very cryptic and if we do that, in 10 years,
> the code will be full of cryptic variables and comments.
>
> Better to add a comment
>
> bool in_dpc; /* PCI Downstream Port Containment */
Per Nirmoy's comment, I already modified the naming to a descriptive one in the V2
patchset.
>
>>
>> static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 84f8d14..e1bbefd 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -319,6 +319,9 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
>> {
>> uint32_t ret;
>>
>> + if (adev->in_dpc)
>> + return 0;
>> +
>> if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
>> return amdgpu_kiq_rreg(adev, reg);
>>
>> @@ -350,8 +353,10 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
>> *
>> * Returns the 8 bit value from the offset specified.
>> */
>> -uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
>> -{
>> +uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) {
>> +
>> + if (adev->in_dpc)
>> + return 0;
>>
>> if (offset < adev->rmmio_size)
>> return (readb(adev->rmmio + offset));
>> @@ -373,8 +378,10 @@ uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
>> *
>> * Writes the value specified to the offset specified.
>> */
>> -void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
>> -{
>> +void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) {
>> +
>> + if (adev->in_dpc)
>> + return;
>>
>> if (offset < adev->rmmio_size)
>> writeb(value, adev->rmmio + offset);
>> @@ -384,6 +391,8 @@ void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
>>
>> void static inline amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg, uint32_t v, uint32_t acc_flags)
>> {
>> + if (adev->in_dpc)
>> + return;
>>
>> trace_amdgpu_mm_wreg(adev->pdev->device, reg, v);
>>
>> @@ -412,6 +421,8 @@ void static inline amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg,
>> void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
>> uint32_t acc_flags)
>> {
>> + if (adev->in_dpc)
>> + return;
>>
>> if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
>> return amdgpu_kiq_wreg(adev, reg, v);
>> @@ -427,6 +438,9 @@ void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
>> void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
>> uint32_t acc_flags)
>> {
>> + if (adev->in_dpc)
>> + return;
>> +
>> if (amdgpu_sriov_fullaccess(adev) &&
>> adev->gfx.rlc.funcs &&
>> adev->gfx.rlc.funcs->is_rlcg_access_range) {
>> @@ -448,6 +462,9 @@ void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, uint32_t reg, uint32_t
>> */
>> u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
>> {
>> + if (adev->in_dpc)
>> + return 0;
>> +
>> if ((reg * 4) < adev->rio_mem_size)
>> return ioread32(adev->rio_mem + (reg * 4));
>> else {
>> @@ -467,6 +484,8 @@ u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
>> */
>> void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
>> {
>> + if (adev->in_dpc)
>> + return;
>>
>> if ((reg * 4) < adev->rio_mem_size)
>> iowrite32(v, adev->rio_mem + (reg * 4));
>> @@ -487,6 +506,8 @@ void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
>> */
>> u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
>> {
>> + if (adev->in_dpc)
>> + return 0;
>>
>> if (index < adev->doorbell.num_doorbells) {
>> return readl(adev->doorbell.ptr + index);
>> @@ -508,6 +529,8 @@ u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
>> */
>> void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
>> {
>> + if (adev->in_dpc)
>> + return;
>>
>> if (index < adev->doorbell.num_doorbells) {
>> writel(v, adev->doorbell.ptr + index);
>> @@ -527,6 +550,8 @@ void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
>> */
>> u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
>> {
>> + if (adev->in_dpc)
>> + return 0;
>>
>> if (index < adev->doorbell.num_doorbells) {
>> return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
>> @@ -548,6 +573,8 @@ u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
>> */
>> void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
>> {
>> + if (adev->in_dpc)
>> + return;
>>
>> if (index < adev->doorbell.num_doorbells) {
>> atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
>> @@ -4790,7 +4817,9 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
>>
>> pci_restore_state(pdev);
>>
>> + adev->in_dpc = true;
>> r = amdgpu_device_ip_suspend(adev);
>> + adev->in_dpc = false;
>> if (r)
>> goto out;
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> index d698142..50c42c9 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> @@ -693,6 +693,9 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>> struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>> struct amdgpu_ring *ring = &kiq->ring;
>>
>> + if (adev->in_dpc)
>> + return 0;
>> +
>> BUG_ON(!ring->funcs->emit_rreg);
>>
>> spin_lock_irqsave(&kiq->ring_lock, flags);
>> @@ -757,6 +760,9 @@ void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
>>
>> BUG_ON(!ring->funcs->emit_wreg);
>>
>> + if (adev->in_dpc)
>> + return;
>> +
> Don't know how I feel about such "architecture",
> where we introduce a flag-type variable and check it
> in a slew of places. If we did this a few times over,
> we'd have a few flags which we check almost everywhere,
> and it would make the code difficult to analyze.
> For instance, which flag to check first? Or, how
> are flags related to each other.
>
> There has to be a better way, where we check
> that flag only once before proceeding. A more
> unified approach.
I fail to see any other approach besides this one currently. There are a few
different functions for r/w register access, and unless there were a single
generic register-access function where I could place this check (which I don't
think is a good idea), I don't see how to do it in one place. On top of that,
this variable is also checked in wait-for-HW-response use cases such as
psp_wait_for, where you expect a specific value to be returned from the
driver/PSP interfacing registers — an expectation you should simply avoid in
this case. All in all, I don't see how you can make this a generic check in one
place that covers all the use cases.
Andrey
>
>> spin_lock_irqsave(&kiq->ring_lock, flags);
>> amdgpu_ring_alloc(ring, 32);
>> amdgpu_ring_emit_wreg(ring, reg, v);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
>> index d6c38e2..555ef59 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
>> @@ -219,6 +219,9 @@ int psp_wait_for(struct psp_context *psp, uint32_t reg_index,
>> int i;
>> struct amdgpu_device *adev = psp->adev;
>>
>> + if (psp->adev->in_dpc)
>> + return 0;
>> +
>> for (i = 0; i < adev->usec_timeout; i++) {
>> val = RREG32(reg_index);
>> if (check_changed) {
>> @@ -245,6 +248,9 @@ psp_cmd_submit_buf(struct psp_context *psp,
>> bool ras_intr = false;
>> bool skip_unsupport = false;
>>
>> + if (psp->adev->in_dpc)
>> + return 0;
>> +
>> mutex_lock(&psp->mutex);
>>
>> memset(psp->cmd_buf_mem, 0, PSP_CMD_BUFFER_SIZE);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/atom.c b/drivers/gpu/drm/amd/amdgpu/atom.c
>> index 4cfc786..613dac1 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/atom.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/atom.c
>> @@ -750,6 +750,7 @@ static void atom_op_jump(atom_exec_context *ctx, int *ptr, int arg)
>> DRM_ERROR("atombios stuck in loop for more than %dsecs aborting\n",
>> ATOM_CMD_TIMEOUT_SEC);
>> ctx->abort = true;
>> + dump_stack();
>> }
>> } else {
>> /* jiffies wrap around we will just wait a little longer */
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> index 2db195e..86f268e 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> @@ -6980,15 +6980,19 @@ static int gfx_v10_0_hw_fini(void *handle)
>>
>> amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0);
>> amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0);
>> +
>> + if (!adev->in_dpc) {
>> #ifndef BRING_UP_DEBUG
>> - if (amdgpu_async_gfx_ring) {
>> - r = gfx_v10_0_kiq_disable_kgq(adev);
>> - if (r)
>> - DRM_ERROR("KGQ disable failed\n");
>> - }
>> + if (amdgpu_async_gfx_ring) {
>> + r = gfx_v10_0_kiq_disable_kgq(adev);
>> + if (r)
>> + DRM_ERROR("KGQ disable failed\n");
>> + }
>> #endif
>> - if (amdgpu_gfx_disable_kcq(adev))
>> - DRM_ERROR("KCQ disable failed\n");
>> + if (amdgpu_gfx_disable_kcq(adev))
>> + DRM_ERROR("KCQ disable failed\n");
>> + }
>> +
>> if (amdgpu_sriov_vf(adev)) {
>> gfx_v10_0_cp_gfx_enable(adev, false);
>> /* Program KIQ position of RLC_CP_SCHEDULERS during destroy */
>> diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
>> index 8462b30..306461d 100644
>> --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
>> +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
>> @@ -224,9 +224,12 @@ int smu_dpm_set_power_gate(struct smu_context *smu, uint32_t block_type,
>> {
>> int ret = 0;
>>
>> +
>> if (!smu->pm_enabled || !smu->adev->pm.dpm_enabled)
>> return -EOPNOTSUPP;
>>
>> +
>> +
> Unnecessary white space changes.
>
> Regards,
> Luben
>
>> switch (block_type) {
>> /*
>> * Some legacy code of amdgpu_vcn.c and vcn_v2*.c still uses
>> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>> index a58ea08..02cf55e 100644
>> --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>> @@ -112,6 +112,9 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context *smu,
>> struct amdgpu_device *adev = smu->adev;
>> int ret = 0, index = 0;
>>
>> + if (smu->adev->in_dpc)
>> + return 0;
>> +
>> index = smu_cmn_to_asic_specific_index(smu,
>> CMN2ASIC_MAPPING_MSG,
>> msg);
>>
More information about the amd-gfx
mailing list