[PATCH 2/7] drm/amdgpu: Avoid accessing HW when suspending SW state
Nirmoy
nirmodas at amd.com
Wed Aug 26 16:53:13 UTC 2020
On 8/26/20 4:46 PM, Andrey Grodzovsky wrote:
> At this point the ASIC is already post reset by the HW/PSP,
> so the HW is not in a proper state to be configured for suspension;
> some blocks might even be gated, so it is best to avoid touching it.
>
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 +
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 37 ++++++++++++++++++++++++++----
> drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 6 +++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 6 +++++
> drivers/gpu/drm/amd/amdgpu/atom.c | 1 +
> drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 18 +++++++++------
> drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 3 +++
> drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c | 3 +++
> 8 files changed, 64 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 3399242..3489622 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -992,6 +992,7 @@ struct amdgpu_device {
> atomic_t throttling_logging_enabled;
> struct ratelimit_state throttling_logging_rs;
> uint32_t ras_features;
> + bool in_dpc;
We should have a more generic name instead of "in_dpc" — maybe
"in_pci_error"?
Nirmoy
> };
>
> static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 84f8d14..e1bbefd 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -319,6 +319,9 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
> {
> uint32_t ret;
>
> + if (adev->in_dpc)
> + return 0;
> +
> if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
> return amdgpu_kiq_rreg(adev, reg);
>
> @@ -350,8 +353,10 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
> *
> * Returns the 8 bit value from the offset specified.
> */
> -uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
> -{
> +uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) {
> +
> + if (adev->in_dpc)
> + return 0;
>
> if (offset < adev->rmmio_size)
> return (readb(adev->rmmio + offset));
> @@ -373,8 +378,10 @@ uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
> *
> * Writes the value specified to the offset specified.
> */
> -void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
> -{
> +void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) {
> +
> + if (adev->in_dpc)
> + return;
>
> if (offset < adev->rmmio_size)
> writeb(value, adev->rmmio + offset);
> @@ -384,6 +391,8 @@ void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
>
> void static inline amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg, uint32_t v, uint32_t acc_flags)
> {
> + if (adev->in_dpc)
> + return;
>
> trace_amdgpu_mm_wreg(adev->pdev->device, reg, v);
>
> @@ -412,6 +421,8 @@ void static inline amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg,
> void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
> uint32_t acc_flags)
> {
> + if (adev->in_dpc)
> + return;
>
> if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
> return amdgpu_kiq_wreg(adev, reg, v);
> @@ -427,6 +438,9 @@ void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
> void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
> uint32_t acc_flags)
> {
> + if (adev->in_dpc)
> + return;
> +
> if (amdgpu_sriov_fullaccess(adev) &&
> adev->gfx.rlc.funcs &&
> adev->gfx.rlc.funcs->is_rlcg_access_range) {
> @@ -448,6 +462,9 @@ void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, uint32_t reg, uint32_t
> */
> u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
> {
> + if (adev->in_dpc)
> + return 0;
> +
> if ((reg * 4) < adev->rio_mem_size)
> return ioread32(adev->rio_mem + (reg * 4));
> else {
> @@ -467,6 +484,8 @@ u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
> */
> void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
> {
> + if (adev->in_dpc)
> + return;
>
> if ((reg * 4) < adev->rio_mem_size)
> iowrite32(v, adev->rio_mem + (reg * 4));
> @@ -487,6 +506,8 @@ void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
> */
> u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
> {
> + if (adev->in_dpc)
> + return 0;
>
> if (index < adev->doorbell.num_doorbells) {
> return readl(adev->doorbell.ptr + index);
> @@ -508,6 +529,8 @@ u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
> */
> void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
> {
> + if (adev->in_dpc)
> + return;
>
> if (index < adev->doorbell.num_doorbells) {
> writel(v, adev->doorbell.ptr + index);
> @@ -527,6 +550,8 @@ void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
> */
> u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
> {
> + if (adev->in_dpc)
> + return 0;
>
> if (index < adev->doorbell.num_doorbells) {
> return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
> @@ -548,6 +573,8 @@ u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
> */
> void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
> {
> + if (adev->in_dpc)
> + return;
>
> if (index < adev->doorbell.num_doorbells) {
> atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
> @@ -4790,7 +4817,9 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
>
> pci_restore_state(pdev);
>
> + adev->in_dpc = true;
> r = amdgpu_device_ip_suspend(adev);
> + adev->in_dpc = false;
> if (r)
> goto out;
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index d698142..50c42c9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -693,6 +693,9 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
> struct amdgpu_kiq *kiq = &adev->gfx.kiq;
> struct amdgpu_ring *ring = &kiq->ring;
>
> + if (adev->in_dpc)
> + return 0;
> +
> BUG_ON(!ring->funcs->emit_rreg);
>
> spin_lock_irqsave(&kiq->ring_lock, flags);
> @@ -757,6 +760,9 @@ void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
>
> BUG_ON(!ring->funcs->emit_wreg);
>
> + if (adev->in_dpc)
> + return;
> +
> spin_lock_irqsave(&kiq->ring_lock, flags);
> amdgpu_ring_alloc(ring, 32);
> amdgpu_ring_emit_wreg(ring, reg, v);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> index d6c38e2..555ef59 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> @@ -219,6 +219,9 @@ int psp_wait_for(struct psp_context *psp, uint32_t reg_index,
> int i;
> struct amdgpu_device *adev = psp->adev;
>
> + if (psp->adev->in_dpc)
> + return 0;
> +
> for (i = 0; i < adev->usec_timeout; i++) {
> val = RREG32(reg_index);
> if (check_changed) {
> @@ -245,6 +248,9 @@ psp_cmd_submit_buf(struct psp_context *psp,
> bool ras_intr = false;
> bool skip_unsupport = false;
>
> + if (psp->adev->in_dpc)
> + return 0;
> +
> mutex_lock(&psp->mutex);
>
> memset(psp->cmd_buf_mem, 0, PSP_CMD_BUFFER_SIZE);
> diff --git a/drivers/gpu/drm/amd/amdgpu/atom.c b/drivers/gpu/drm/amd/amdgpu/atom.c
> index 4cfc786..613dac1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/atom.c
> +++ b/drivers/gpu/drm/amd/amdgpu/atom.c
> @@ -750,6 +750,7 @@ static void atom_op_jump(atom_exec_context *ctx, int *ptr, int arg)
> DRM_ERROR("atombios stuck in loop for more than %dsecs aborting\n",
> ATOM_CMD_TIMEOUT_SEC);
> ctx->abort = true;
> + dump_stack();
> }
> } else {
> /* jiffies wrap around we will just wait a little longer */
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index 2db195e..86f268e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -6980,15 +6980,19 @@ static int gfx_v10_0_hw_fini(void *handle)
>
> amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0);
> amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0);
> +
> + if (!adev->in_dpc) {
> #ifndef BRING_UP_DEBUG
> - if (amdgpu_async_gfx_ring) {
> - r = gfx_v10_0_kiq_disable_kgq(adev);
> - if (r)
> - DRM_ERROR("KGQ disable failed\n");
> - }
> + if (amdgpu_async_gfx_ring) {
> + r = gfx_v10_0_kiq_disable_kgq(adev);
> + if (r)
> + DRM_ERROR("KGQ disable failed\n");
> + }
> #endif
> - if (amdgpu_gfx_disable_kcq(adev))
> - DRM_ERROR("KCQ disable failed\n");
> + if (amdgpu_gfx_disable_kcq(adev))
> + DRM_ERROR("KCQ disable failed\n");
> + }
> +
> if (amdgpu_sriov_vf(adev)) {
> gfx_v10_0_cp_gfx_enable(adev, false);
> /* Program KIQ position of RLC_CP_SCHEDULERS during destroy */
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> index 8462b30..306461d 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> @@ -224,9 +224,12 @@ int smu_dpm_set_power_gate(struct smu_context *smu, uint32_t block_type,
> {
> int ret = 0;
>
> +
> if (!smu->pm_enabled || !smu->adev->pm.dpm_enabled)
> return -EOPNOTSUPP;
>
> +
> +
> switch (block_type) {
> /*
> * Some legacy code of amdgpu_vcn.c and vcn_v2*.c still uses
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> index a58ea08..02cf55e 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> @@ -112,6 +112,9 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context *smu,
> struct amdgpu_device *adev = smu->adev;
> int ret = 0, index = 0;
>
> + if (smu->adev->in_dpc)
> + return 0;
> +
> index = smu_cmn_to_asic_specific_index(smu,
> CMN2ASIC_MAPPING_MSG,
> msg);
More information about the amd-gfx
mailing list