[PATCH v2 2/2] drm/amdgpu: Create an option to disable soft recovery

Christian König christian.koenig at amd.com
Thu Aug 31 06:31:17 UTC 2023


Am 31.08.23 um 00:08 schrieb André Almeida:
> Create a module option to disable soft recoveries on amdgpu, making
> every recovery go through the device reset path. This option makes
> it easier to force device resets for testing and debugging purposes.
>
> Signed-off-by: André Almeida <andrealmeid at igalia.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h      | 1 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c  | 6 ++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 6 +++++-
>   drivers/gpu/drm/amd/include/amd_shared.h | 1 +
>   4 files changed, 13 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 82eaccfce347..5f49e2c0ae7a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1105,6 +1105,7 @@ struct amdgpu_device {
>   	/* Debug */
>   	bool                            debug_vm;
>   	bool                            debug_largebar;
> +	bool                            debug_disable_soft_recovery;
>   };
>   
>   static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index 0cd48c025433..59e9fe594b51 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -927,6 +927,7 @@ MODULE_PARM_DESC(enforce_isolation, "enforce process isolation between graphics
>    * - 0x2: Enable simulating large-bar capability on non-large bar system. This
>    *   limits the VRAM size reported to ROCm applications to the visible
>    *   size, usually 256MB.
> + * - 0x4: Disable GPU soft recovery

Suggested wording for this line: "Disable GPU soft recovery, always do a full reset."

Apart from that Reviewed-by: Christian König <christian.koenig at amd.com>.

Regards,
Christian.

>    */
>   MODULE_PARM_DESC(debug_mask, "debug options for amdgpu, disabled by default");
>   module_param_named(debug_mask, amdgpu_debug_mask, uint, 0444);
> @@ -2046,6 +2047,11 @@ static void amdgpu_init_debug_options(struct amdgpu_device *adev)
>   		pr_info("debug: enabled simulating large-bar capability on non-large bar system\n");
>   		adev->debug_largebar = true;
>   	}
> +
> +	if (amdgpu_debug_mask & AMDGPU_DEBUG_DISABLE_GPU_SOFT_RECOVERY) {
> +		pr_info("debug: soft reset for GPU recovery disabled\n");
> +		adev->debug_disable_soft_recovery = true;
> +	}
>   }
>   
>   static int amdgpu_pci_probe(struct pci_dev *pdev,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> index 80d6e132e409..6a80d3ec887e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> @@ -434,8 +434,12 @@ bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, unsigned int vmid,
>   			       struct dma_fence *fence)
>   {
>   	unsigned long flags;
> +	ktime_t deadline;
>   
> -	ktime_t deadline = ktime_add_us(ktime_get(), 10000);
> +	if (unlikely(ring->adev->debug_disable_soft_recovery))
> +		return false;
> +
> +	deadline = ktime_add_us(ktime_get(), 10000);
>   
>   	if (amdgpu_sriov_vf(ring->adev) || !ring->funcs->soft_recovery || !fence)
>   		return false;
> diff --git a/drivers/gpu/drm/amd/include/amd_shared.h b/drivers/gpu/drm/amd/include/amd_shared.h
> index 2fd6af2183cc..32ee982be99e 100644
> --- a/drivers/gpu/drm/amd/include/amd_shared.h
> +++ b/drivers/gpu/drm/amd/include/amd_shared.h
> @@ -263,6 +263,7 @@ enum amd_dpm_forced_level;
>   enum AMDGPU_DEBUG_MASK {
>   	AMDGPU_DEBUG_VM = BIT(0),
>   	AMDGPU_DEBUG_LARGEBAR = BIT(1),
> +	AMDGPU_DEBUG_DISABLE_GPU_SOFT_RECOVERY = BIT(2),
>   };
>   
>   /**



More information about the amd-gfx mailing list