[PATCH v2 2/2] drm/amdgpu: Create an option to disable soft recovery
Christian König
christian.koenig at amd.com
Thu Aug 31 06:31:17 UTC 2023
Am 31.08.23 um 00:08 schrieb André Almeida:
> Create a module option to disable soft recoveries on amdgpu, making
> every recovery go through the device reset path. This option makes it
> easier to force device resets for testing and debugging purposes.
>
> Signed-off-by: André Almeida <andrealmeid at igalia.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 +
> drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 6 ++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 6 +++++-
> drivers/gpu/drm/amd/include/amd_shared.h | 1 +
> 4 files changed, 13 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 82eaccfce347..5f49e2c0ae7a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1105,6 +1105,7 @@ struct amdgpu_device {
> /* Debug */
> bool debug_vm;
> bool debug_largebar;
> + bool debug_disable_soft_recovery;
> };
>
> static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index 0cd48c025433..59e9fe594b51 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -927,6 +927,7 @@ MODULE_PARM_DESC(enforce_isolation, "enforce process isolation between graphics
> * - 0x2: Enable simulating large-bar capability on non-large bar system. This
> * limits the VRAM size reported to ROCm applications to the visible
> * size, usually 256MB.
> + * - 0x4: Disable GPU soft recovery
"Disable GPU soft recovery, always do a full reset."
Apart from that, Reviewed-by: Christian König <christian.koenig at amd.com>.
Regards,
Christian.
> */
> MODULE_PARM_DESC(debug_mask, "debug options for amdgpu, disabled by default");
> module_param_named(debug_mask, amdgpu_debug_mask, uint, 0444);
> @@ -2046,6 +2047,11 @@ static void amdgpu_init_debug_options(struct amdgpu_device *adev)
> pr_info("debug: enabled simulating large-bar capability on non-large bar system\n");
> adev->debug_largebar = true;
> }
> +
> + if (amdgpu_debug_mask & AMDGPU_DEBUG_DISABLE_GPU_SOFT_RECOVERY) {
> + pr_info("debug: soft reset for GPU recovery disabled\n");
> + adev->debug_disable_soft_recovery = true;
> + }
> }
>
> static int amdgpu_pci_probe(struct pci_dev *pdev,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> index 80d6e132e409..6a80d3ec887e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> @@ -434,8 +434,12 @@ bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, unsigned int vmid,
> struct dma_fence *fence)
> {
> unsigned long flags;
> + ktime_t deadline;
>
> - ktime_t deadline = ktime_add_us(ktime_get(), 10000);
> + if (unlikely(ring->adev->debug_disable_soft_recovery))
> + return false;
> +
> + deadline = ktime_add_us(ktime_get(), 10000);
>
> if (amdgpu_sriov_vf(ring->adev) || !ring->funcs->soft_recovery || !fence)
> return false;
> diff --git a/drivers/gpu/drm/amd/include/amd_shared.h b/drivers/gpu/drm/amd/include/amd_shared.h
> index 2fd6af2183cc..32ee982be99e 100644
> --- a/drivers/gpu/drm/amd/include/amd_shared.h
> +++ b/drivers/gpu/drm/amd/include/amd_shared.h
> @@ -263,6 +263,7 @@ enum amd_dpm_forced_level;
> enum AMDGPU_DEBUG_MASK {
> AMDGPU_DEBUG_VM = BIT(0),
> AMDGPU_DEBUG_LARGEBAR = BIT(1),
> + AMDGPU_DEBUG_DISABLE_GPU_SOFT_RECOVERY = BIT(2),
> };
>
> /**
More information about the amd-gfx
mailing list