[PATCH 4/9] drm/amdgpu:replace deprecated gpu reset

Christian König ckoenig.leichtzumerken at gmail.com
Thu Oct 26 07:13:19 UTC 2017


Am 25.10.2017 um 11:22 schrieb Monk Liu:
> now use new gpu recover

It might be better to squash this together with the previous patch.

This patch doesn't introduce any new functionality; it only removes the old
reset code and switches the callers over to the new amdgpu_gpu_recover().



>
> Change-Id: Ieccd25772c47c0e710ad81537a3dd0c1767585a1
> Signed-off-by: Monk Liu <Monk.Liu at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |   2 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 298 -----------------------------
>   drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  |  10 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c    |   2 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    |   5 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h   |   1 -
>   drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c      |   2 +-
>   drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c      |   2 +-
>   8 files changed, 10 insertions(+), 312 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 003668f..335df11 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1844,7 +1844,7 @@ amdgpu_get_sdma_instance(struct amdgpu_ring *ring)
>   #define amdgpu_psp_check_fw_loading_status(adev, i) (adev)->firmware.funcs->check_fw_loading_status((adev), (i))
>   
>   /* Common functions */
> -int amdgpu_gpu_reset(struct amdgpu_device *adev);
> +int amdgpu_gpu_recover(struct amdgpu_device *adev, struct amdgpu_job* job);
>   bool amdgpu_need_backup(struct amdgpu_device *adev);
>   void amdgpu_pci_config_reset(struct amdgpu_device *adev);
>   bool amdgpu_need_post(struct amdgpu_device *adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 0db3b3c..a2f9a7f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2818,304 +2818,6 @@ static int amdgpu_recover_vram_from_shadow(struct amdgpu_device *adev,
>   	return r;
>   }
>   
> -/**
> - * amdgpu_sriov_gpu_reset - reset the asic
> - *
> - * @adev: amdgpu device pointer
> - * @job: which job trigger hang
> - *
> - * Attempt the reset the GPU if it has hung (all asics).
> - * for SRIOV case.
> - * Returns 0 for success or an error on failure.
> - */
> -int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, struct amdgpu_job *job)
> -{
> -	int i, j, r = 0;
> -	int resched;
> -	struct amdgpu_bo *bo, *tmp;
> -	struct amdgpu_ring *ring;
> -	struct dma_fence *fence = NULL, *next = NULL;
> -
> -	mutex_lock(&adev->virt.lock_reset);
> -	atomic_inc(&adev->gpu_reset_counter);
> -	adev->in_sriov_reset = true;
> -
> -	/* block TTM */
> -	resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
> -
> -	/* we start from the ring trigger GPU hang */
> -	j = job ? job->ring->idx : 0;
> -
> -	/* block scheduler */
> -	for (i = j; i < j + AMDGPU_MAX_RINGS; ++i) {
> -		ring = adev->rings[i % AMDGPU_MAX_RINGS];
> -		if (!ring || !ring->sched.thread)
> -			continue;
> -
> -		kthread_park(ring->sched.thread);
> -
> -		if (job && j != i)
> -			continue;
> -
> -		/* here give the last chance to check if job removed from mirror-list
> -		 * since we already pay some time on kthread_park */
> -		if (job && list_empty(&job->base.node)) {
> -			kthread_unpark(ring->sched.thread);
> -			goto give_up_reset;
> -		}
> -
> -		if (amd_sched_invalidate_job(&job->base, amdgpu_job_hang_limit))
> -			amd_sched_job_kickout(&job->base);
> -
> -		/* only do job_reset on the hang ring if @job not NULL */
> -		amd_sched_hw_job_reset(&ring->sched, NULL);
> -
> -		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
> -		amdgpu_fence_driver_force_completion(ring);
> -	}
> -
> -	/* request to take full control of GPU before re-initialization  */
> -	if (job)
> -		amdgpu_virt_reset_gpu(adev);
> -	else
> -		amdgpu_virt_request_full_gpu(adev, true);
> -
> -
> -	/* Resume IP prior to SMC */
> -	amdgpu_sriov_reinit_early(adev);
> -
> -	/* we need recover gart prior to run SMC/CP/SDMA resume */
> -	amdgpu_ttm_recover_gart(adev);
> -
> -	/* now we are okay to resume SMC/CP/SDMA */
> -	amdgpu_sriov_reinit_late(adev);
> -
> -	amdgpu_irq_gpu_reset_resume_helper(adev);
> -
> -	if (amdgpu_ib_ring_tests(adev))
> -		dev_err(adev->dev, "[GPU_RESET] ib ring test failed (%d).\n", r);
> -
> -	/* release full control of GPU after ib test */
> -	amdgpu_virt_release_full_gpu(adev, true);
> -
> -	DRM_INFO("recover vram bo from shadow\n");
> -
> -	ring = adev->mman.buffer_funcs_ring;
> -	mutex_lock(&adev->shadow_list_lock);
> -	list_for_each_entry_safe(bo, tmp, &adev->shadow_list, shadow_list) {
> -		next = NULL;
> -		amdgpu_recover_vram_from_shadow(adev, ring, bo, &next);
> -		if (fence) {
> -			r = dma_fence_wait(fence, false);
> -			if (r) {
> -				WARN(r, "recovery from shadow isn't completed\n");
> -				break;
> -			}
> -		}
> -
> -		dma_fence_put(fence);
> -		fence = next;
> -	}
> -	mutex_unlock(&adev->shadow_list_lock);
> -
> -	if (fence) {
> -		r = dma_fence_wait(fence, false);
> -		if (r)
> -			WARN(r, "recovery from shadow isn't completed\n");
> -	}
> -	dma_fence_put(fence);
> -
> -	for (i = j; i < j + AMDGPU_MAX_RINGS; ++i) {
> -		ring = adev->rings[i % AMDGPU_MAX_RINGS];
> -		if (!ring || !ring->sched.thread)
> -			continue;
> -
> -		if (job && j != i) {
> -			kthread_unpark(ring->sched.thread);
> -			continue;
> -		}
> -
> -		amd_sched_job_recovery(&ring->sched);
> -		kthread_unpark(ring->sched.thread);
> -	}
> -
> -	drm_helper_resume_force_mode(adev->ddev);
> -give_up_reset:
> -	ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched);
> -	if (r) {
> -		/* bad news, how to tell it to userspace ? */
> -		dev_info(adev->dev, "GPU reset failed\n");
> -	} else {
> -		dev_info(adev->dev, "GPU reset successed!\n");
> -	}
> -
> -	adev->in_sriov_reset = false;
> -	mutex_unlock(&adev->virt.lock_reset);
> -	return r;
> -}
> -
> -/**
> - * amdgpu_gpu_reset - reset the asic
> - *
> - * @adev: amdgpu device pointer
> - *
> - * Attempt the reset the GPU if it has hung (all asics).
> - * Returns 0 for success or an error on failure.
> - */
> -int amdgpu_gpu_reset(struct amdgpu_device *adev)
> -{
> -	struct drm_atomic_state *state = NULL;
> -	int i, r;
> -	int resched;
> -	bool need_full_reset, vram_lost = false;
> -
> -	if (!amdgpu_check_soft_reset(adev)) {
> -		DRM_INFO("No hardware hang detected. Did some blocks stall?\n");
> -		return 0;
> -	}
> -
> -	atomic_inc(&adev->gpu_reset_counter);
> -
> -	/* block TTM */
> -	resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
> -	/* store modesetting */
> -	if (amdgpu_device_has_dc_support(adev))
> -		state = drm_atomic_helper_suspend(adev->ddev);
> -
> -	/* block scheduler */
> -	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> -		struct amdgpu_ring *ring = adev->rings[i];
> -
> -		if (!ring || !ring->sched.thread)
> -			continue;
> -		kthread_park(ring->sched.thread);
> -		amd_sched_hw_job_reset(&ring->sched, NULL);
> -		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
> -		amdgpu_fence_driver_force_completion(ring);
> -	}
> -
> -	need_full_reset = amdgpu_need_full_reset(adev);
> -
> -	if (!need_full_reset) {
> -		amdgpu_pre_soft_reset(adev);
> -		r = amdgpu_soft_reset(adev);
> -		amdgpu_post_soft_reset(adev);
> -		if (r || amdgpu_check_soft_reset(adev)) {
> -			DRM_INFO("soft reset failed, will fallback to full reset!\n");
> -			need_full_reset = true;
> -		}
> -	}
> -
> -	if (need_full_reset) {
> -		r = amdgpu_suspend(adev);
> -
> -retry:
> -		amdgpu_atombios_scratch_regs_save(adev);
> -		r = amdgpu_asic_reset(adev);
> -		amdgpu_atombios_scratch_regs_restore(adev);
> -		/* post card */
> -		amdgpu_atom_asic_init(adev->mode_info.atom_context);
> -
> -		if (!r) {
> -			dev_info(adev->dev, "GPU reset succeeded, trying to resume\n");
> -			r = amdgpu_resume_phase1(adev);
> -			if (r)
> -				goto out;
> -			vram_lost = amdgpu_check_vram_lost(adev);
> -			if (vram_lost) {
> -				DRM_ERROR("VRAM is lost!\n");
> -				atomic_inc(&adev->vram_lost_counter);
> -			}
> -			r = amdgpu_ttm_recover_gart(adev);
> -			if (r)
> -				goto out;
> -			r = amdgpu_resume_phase2(adev);
> -			if (r)
> -				goto out;
> -			if (vram_lost)
> -				amdgpu_fill_reset_magic(adev);
> -		}
> -	}
> -out:
> -	if (!r) {
> -		amdgpu_irq_gpu_reset_resume_helper(adev);
> -		r = amdgpu_ib_ring_tests(adev);
> -		if (r) {
> -			dev_err(adev->dev, "ib ring test failed (%d).\n", r);
> -			r = amdgpu_suspend(adev);
> -			need_full_reset = true;
> -			goto retry;
> -		}
> -		/**
> -		 * recovery vm page tables, since we cannot depend on VRAM is
> -		 * consistent after gpu full reset.
> -		 */
> -		if (need_full_reset && amdgpu_need_backup(adev)) {
> -			struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring;
> -			struct amdgpu_bo *bo, *tmp;
> -			struct dma_fence *fence = NULL, *next = NULL;
> -
> -			DRM_INFO("recover vram bo from shadow\n");
> -			mutex_lock(&adev->shadow_list_lock);
> -			list_for_each_entry_safe(bo, tmp, &adev->shadow_list, shadow_list) {
> -				next = NULL;
> -				amdgpu_recover_vram_from_shadow(adev, ring, bo, &next);
> -				if (fence) {
> -					r = dma_fence_wait(fence, false);
> -					if (r) {
> -						WARN(r, "recovery from shadow isn't completed\n");
> -						break;
> -					}
> -				}
> -
> -				dma_fence_put(fence);
> -				fence = next;
> -			}
> -			mutex_unlock(&adev->shadow_list_lock);
> -			if (fence) {
> -				r = dma_fence_wait(fence, false);
> -				if (r)
> -					WARN(r, "recovery from shadow isn't completed\n");
> -			}
> -			dma_fence_put(fence);
> -		}
> -		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> -			struct amdgpu_ring *ring = adev->rings[i];
> -
> -			if (!ring || !ring->sched.thread)
> -				continue;
> -
> -			amd_sched_job_recovery(&ring->sched);
> -			kthread_unpark(ring->sched.thread);
> -		}
> -	} else {
> -		dev_err(adev->dev, "asic resume failed (%d).\n", r);
> -		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> -			if (adev->rings[i] && adev->rings[i]->sched.thread) {
> -				kthread_unpark(adev->rings[i]->sched.thread);
> -			}
> -		}
> -	}
> -
> -	if (amdgpu_device_has_dc_support(adev)) {
> -		r = drm_atomic_helper_resume(adev->ddev, state);
> -		amdgpu_dm_display_resume(adev);
> -	} else
> -		drm_helper_resume_force_mode(adev->ddev);
> -
> -	ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched);
> -	if (r) {
> -		/* bad news, how to tell it to userspace ? */
> -		dev_info(adev->dev, "GPU reset failed\n");
> -	}
> -	else {
> -		dev_info(adev->dev, "GPU reset successed!\n");
> -	}
> -
> -	amdgpu_vf_error_trans_all(adev);
> -	return r;
> -}
> -
>   static int amdgpu_reset(struct amdgpu_device *adev, uint64_t* reset_flags)
>   {
>   	int r;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> index d7374cf..9d67bcb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> @@ -694,25 +694,25 @@ static int amdgpu_debugfs_fence_info(struct seq_file *m, void *data)
>   }
>   
>   /**
> - * amdgpu_debugfs_gpu_reset - manually trigger a gpu reset
> + * amdgpu_debugfs_gpu_recover - manually trigger a gpu reset & recover
>    *
>    * Manually trigger a gpu reset at the next fence wait.
>    */
> -static int amdgpu_debugfs_gpu_reset(struct seq_file *m, void *data)
> +static int amdgpu_debugfs_gpu_recover(struct seq_file *m, void *data)
>   {
>   	struct drm_info_node *node = (struct drm_info_node *) m->private;
>   	struct drm_device *dev = node->minor->dev;
>   	struct amdgpu_device *adev = dev->dev_private;
>   
> -	seq_printf(m, "gpu reset\n");
> -	amdgpu_gpu_reset(adev);
> +	seq_printf(m, "gpu recover\n");
> +	amdgpu_gpu_recover(adev, NULL);
>   
>   	return 0;
>   }
>   
>   static const struct drm_info_list amdgpu_debugfs_fence_list[] = {
>   	{"amdgpu_fence_info", &amdgpu_debugfs_fence_info, 0, NULL},
> -	{"amdgpu_gpu_reset", &amdgpu_debugfs_gpu_reset, 0, NULL}
> +	{"amdgpu_gpu_recover", &amdgpu_debugfs_gpu_recover, 0, NULL}
>   };
>   
>   static const struct drm_info_list amdgpu_debugfs_fence_list_sriov[] = {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
> index 47c5ce9..9c2f87c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
> @@ -88,7 +88,7 @@ static void amdgpu_irq_reset_work_func(struct work_struct *work)
>   						  reset_work);
>   
>   	if (!amdgpu_sriov_vf(adev))
> -		amdgpu_gpu_reset(adev);
> +		amdgpu_gpu_recover(adev, NULL);
>   }
>   
>   /* Disable *all* interrupts */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index f08fde9..ac6a4f3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -37,10 +37,7 @@ static void amdgpu_job_timedout(struct amd_sched_job *s_job)
>   		  atomic_read(&job->ring->fence_drv.last_seq),
>   		  job->ring->fence_drv.sync_seq);
>   
> -	if (amdgpu_sriov_vf(job->adev))
> -		amdgpu_sriov_gpu_reset(job->adev, job);
> -	else
> -		amdgpu_gpu_reset(job->adev);
> +	amdgpu_gpu_recover(job->adev, job);
>   }
>   
>   int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> index b89d37f..3a661aa 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> @@ -285,7 +285,6 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v);
>   int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool init);
>   int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev, bool init);
>   int amdgpu_virt_reset_gpu(struct amdgpu_device *adev);
> -int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, struct amdgpu_job *job);
>   int amdgpu_virt_alloc_mm_table(struct amdgpu_device *adev);
>   void amdgpu_virt_free_mm_table(struct amdgpu_device *adev);
>   int amdgpu_virt_fw_reserve_get_checksum(void *obj, unsigned long obj_size,
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> index b4906d2..f8522a0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> @@ -254,7 +254,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
>   	}
>   
>   	/* Trigger recovery due to world switch failure */
> -	amdgpu_sriov_gpu_reset(adev, NULL);
> +	amdgpu_gpu_recover(adev, NULL);
>   }
>   
>   static int xgpu_ai_set_mailbox_rcv_irq(struct amdgpu_device *adev,
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
> index c25a831..dae6d3a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
> @@ -514,7 +514,7 @@ static void xgpu_vi_mailbox_flr_work(struct work_struct *work)
>   	}
>   
>   	/* Trigger recovery due to world switch failure */
> -	amdgpu_sriov_gpu_reset(adev, NULL);
> +	amdgpu_gpu_recover(adev, NULL);
>   }
>   
>   static int xgpu_vi_set_mailbox_rcv_irq(struct amdgpu_device *adev,




More information about the amd-gfx mailing list