[PATCH] drm/amdgpu: fix a race in GPU reset with IB test (v2)

Christian König ckoenig.leichtzumerken at gmail.com
Fri May 31 13:55:47 UTC 2019


Am 29.05.19 um 20:44 schrieb Alex Deucher:
> Split late_init into two functions, one (do_late_init) which
> just does the hw init, and late_init which calls do_late_init
> and schedules the IB test work.  Call do_late_init in
> the GPU reset code to run the init code, but not schedule
> the IB test code.  The IB test code is called directly
> in the gpu reset code so no need to run the IB tests
> in a separate work thread.  If we do, we end up racing.
>
> v2: Rework late_init.  Pull out the mgpu fan boost and xgmi
> pstate code into late_init so they get called in all cases.
> rename the late_init worker thread to delayed work since it's
> just the IB tests now which can happen later.  Schedule the
> work at init and resume time.  It's not needed at reset time
> because the IB tests are called directly.
>
> Cc: Xinhui Pan <xinhui.pan at amd.com>
> Signed-off-by: Alex Deucher <alexander.deucher at amd.com>

Reviewed-by: Christian König <christian.koenig at amd.com>

> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |   2 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 116 +++++++++++----------
>   drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c    |   2 +-
>   3 files changed, 61 insertions(+), 59 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index d355e9a09ad1..19a00282e34c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -925,7 +925,7 @@ struct amdgpu_device {
>   	const struct amdgpu_df_funcs	*df_funcs;
>   
>   	/* delayed work_func for deferring clockgating during resume */
> -	struct delayed_work     late_init_work;
> +	struct delayed_work     delayed_init_work;
>   
>   	struct amdgpu_virt	virt;
>   	/* firmware VRAM reservation */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 7a8c2201cd04..d00fd5dd307a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -1869,6 +1869,43 @@ static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_power
>   	return 0;
>   }
>   
> +static int amdgpu_device_enable_mgpu_fan_boost(void)
> +{
> +	struct amdgpu_gpu_instance *gpu_ins;
> +	struct amdgpu_device *adev;
> +	int i, ret = 0;
> +
> +	mutex_lock(&mgpu_info.mutex);
> +
> +	/*
> +	 * MGPU fan boost feature should be enabled
> +	 * only when there are two or more dGPUs in
> +	 * the system
> +	 */
> +	if (mgpu_info.num_dgpu < 2)
> +		goto out;
> +
> +	for (i = 0; i < mgpu_info.num_dgpu; i++) {
> +		gpu_ins = &(mgpu_info.gpu_ins[i]);
> +		adev = gpu_ins->adev;
> +		if (!(adev->flags & AMD_IS_APU) &&
> +		    !gpu_ins->mgpu_fan_enabled &&
> +		    adev->powerplay.pp_funcs &&
> +		    adev->powerplay.pp_funcs->enable_mgpu_fan_boost) {
> +			ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
> +			if (ret)
> +				break;
> +
> +			gpu_ins->mgpu_fan_enabled = 1;
> +		}
> +	}
> +
> +out:
> +	mutex_unlock(&mgpu_info.mutex);
> +
> +	return ret;
> +}
> +
>   /**
>    * amdgpu_device_ip_late_init - run late init for hardware IPs
>    *
> @@ -1902,11 +1939,15 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
>   	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
>   	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
>   
> -	queue_delayed_work(system_wq, &adev->late_init_work,
> -			   msecs_to_jiffies(AMDGPU_RESUME_MS));
> -
>   	amdgpu_device_fill_reset_magic(adev);
>   
> +	r = amdgpu_device_enable_mgpu_fan_boost();
> +	if (r)
> +		DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
> +
> +	/* set to low pstate by default */
> +	amdgpu_xgmi_set_pstate(adev, 0);
> +
>   	return 0;
>   }
>   
> @@ -2005,65 +2046,20 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
>   	return 0;
>   }
>   
> -static int amdgpu_device_enable_mgpu_fan_boost(void)
> -{
> -	struct amdgpu_gpu_instance *gpu_ins;
> -	struct amdgpu_device *adev;
> -	int i, ret = 0;
> -
> -	mutex_lock(&mgpu_info.mutex);
> -
> -	/*
> -	 * MGPU fan boost feature should be enabled
> -	 * only when there are two or more dGPUs in
> -	 * the system
> -	 */
> -	if (mgpu_info.num_dgpu < 2)
> -		goto out;
> -
> -	for (i = 0; i < mgpu_info.num_dgpu; i++) {
> -		gpu_ins = &(mgpu_info.gpu_ins[i]);
> -		adev = gpu_ins->adev;
> -		if (!(adev->flags & AMD_IS_APU) &&
> -		    !gpu_ins->mgpu_fan_enabled &&
> -		    adev->powerplay.pp_funcs &&
> -		    adev->powerplay.pp_funcs->enable_mgpu_fan_boost) {
> -			ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
> -			if (ret)
> -				break;
> -
> -			gpu_ins->mgpu_fan_enabled = 1;
> -		}
> -	}
> -
> -out:
> -	mutex_unlock(&mgpu_info.mutex);
> -
> -	return ret;
> -}
> -
>   /**
> - * amdgpu_device_ip_late_init_func_handler - work handler for ib test
> + * amdgpu_device_delayed_init_work_handler - work handler for IB tests
>    *
>    * @work: work_struct.
>    */
> -static void amdgpu_device_ip_late_init_func_handler(struct work_struct *work)
> +static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
>   {
>   	struct amdgpu_device *adev =
> -		container_of(work, struct amdgpu_device, late_init_work.work);
> +		container_of(work, struct amdgpu_device, delayed_init_work.work);
>   	int r;
>   
>   	r = amdgpu_ib_ring_tests(adev);
>   	if (r)
>   		DRM_ERROR("ib ring test failed (%d).\n", r);
> -
> -	r = amdgpu_device_enable_mgpu_fan_boost();
> -	if (r)
> -		DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
> -
> -	/*set to low pstate by default */
> -	amdgpu_xgmi_set_pstate(adev, 0);
> -
>   }
>   
>   static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
> @@ -2535,8 +2531,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>   	INIT_LIST_HEAD(&adev->ring_lru_list);
>   	spin_lock_init(&adev->ring_lru_list_lock);
>   
> -	INIT_DELAYED_WORK(&adev->late_init_work,
> -			  amdgpu_device_ip_late_init_func_handler);
> +	INIT_DELAYED_WORK(&adev->delayed_init_work,
> +			  amdgpu_device_delayed_init_work_handler);
>   	INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
>   			  amdgpu_device_delay_enable_gfx_off);
>   
> @@ -2749,6 +2745,9 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>   	/* must succeed. */
>   	amdgpu_ras_resume(adev);
>   
> +	queue_delayed_work(system_wq, &adev->delayed_init_work,
> +			   msecs_to_jiffies(AMDGPU_RESUME_MS));
> +
>   	r = device_create_file(adev->dev, &dev_attr_pcie_replay_count);
>   	if (r) {
>   		dev_err(adev->dev, "Could not create pcie_replay_count");
> @@ -2796,7 +2795,7 @@ void amdgpu_device_fini(struct amdgpu_device *adev)
>   		adev->firmware.gpu_info_fw = NULL;
>   	}
>   	adev->accel_working = false;
> -	cancel_delayed_work_sync(&adev->late_init_work);
> +	cancel_delayed_work_sync(&adev->delayed_init_work);
>   	/* free i2c buses */
>   	if (!amdgpu_device_has_dc_support(adev))
>   		amdgpu_i2c_fini(adev);
> @@ -2859,7 +2858,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool suspend, bool fbcon)
>   	if (fbcon)
>   		amdgpu_fbdev_set_suspend(adev, 1);
>   
> -	cancel_delayed_work_sync(&adev->late_init_work);
> +	cancel_delayed_work_sync(&adev->delayed_init_work);
>   
>   	if (!amdgpu_device_has_dc_support(adev)) {
>   		/* turn off display hw */
> @@ -2979,6 +2978,9 @@ int amdgpu_device_resume(struct drm_device *dev, bool resume, bool fbcon)
>   	if (r)
>   		return r;
>   
> +	queue_delayed_work(system_wq, &adev->delayed_init_work,
> +			   msecs_to_jiffies(AMDGPU_RESUME_MS));
> +
>   	if (!amdgpu_device_has_dc_support(adev)) {
>   		/* pin cursors */
>   		list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
> @@ -3002,7 +3004,7 @@ int amdgpu_device_resume(struct drm_device *dev, bool resume, bool fbcon)
>   		return r;
>   
>   	/* Make sure IB tests flushed */
> -	flush_delayed_work(&adev->late_init_work);
> +	flush_delayed_work(&adev->delayed_init_work);
>   
>   	/* blat the mode back in */
>   	if (fbcon) {
> @@ -3593,7 +3595,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   
>   	dev_info(adev->dev, "GPU reset begin!\n");
>   
> -	cancel_delayed_work_sync(&adev->late_init_work);
> +	cancel_delayed_work_sync(&adev->delayed_init_work);
>   
>   	hive = amdgpu_get_xgmi_hive(adev, false);
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> index 87a93874d71e..be97771525e1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> @@ -974,7 +974,7 @@ int amdgpu_driver_open_kms(struct drm_device *dev, struct drm_file *file_priv)
>   	int r, pasid;
>   
>   	/* Ensure IB tests are run on ring */
> -	flush_delayed_work(&adev->late_init_work);
> +	flush_delayed_work(&adev->delayed_init_work);
>   
>   	file_priv->driver_priv = NULL;
>   



More information about the amd-gfx mailing list