[PATCH 1/4] drm/amdgpu: add ring soft recovery v3

Christian König ckoenig.leichtzumerken at gmail.com
Thu Aug 23 18:25:04 UTC 2018


Am 23.08.2018 um 17:20 schrieb Zhu, Rex:
>
>> -----Original Message-----
>> From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of
>> Christian König
>> Sent: Thursday, August 23, 2018 7:24 PM
>> To: amd-gfx at lists.freedesktop.org
>> Subject: [PATCH 1/4] drm/amdgpu: add ring soft recovery v3
>>
>> Instead of hammering hard on the GPU try a soft recovery first.
>>
>> v2: reorder code a bit
>> v3: increase timeout to 10ms, increment GPU reset counter
>>
>> Signed-off-by: Christian König <christian.koenig at amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c  |  6 ++++++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 25 +++++++++++++++++++++++++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  4 ++++
>>   3 files changed, 35 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> index 265ff90f4e01..d93e31a5c4e7 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> @@ -33,6 +33,12 @@ static void amdgpu_job_timedout(struct drm_sched_job *s_job)
>>   	struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
>>   	struct amdgpu_job *job = to_amdgpu_job(s_job);
>>
>> +	if (amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) {
>> +		DRM_ERROR("ring %s timeout, but soft recovered\n",
>> +			  s_job->sched->name);
>> +		return;
>> +	}
>> +
>>   	DRM_ERROR("ring %s timeout, signaled seq=%u, emitted seq=%u\n",
>>   		  job->base.sched->name, atomic_read(&ring->fence_drv.last_seq),
>>   		  ring->fence_drv.sync_seq);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>> index 5dfd26be1eec..d445acb3d956 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>> @@ -383,6 +383,31 @@ void amdgpu_ring_emit_reg_write_reg_wait_helper(struct amdgpu_ring *ring,
>>   	amdgpu_ring_emit_reg_wait(ring, reg1, mask, mask);
>>   }
>>
>> +/**
>> + * amdgpu_ring_soft_recovery - try to soft recover a ring lockup
>> + *
>> + * @ring: ring to try the recovery on
>> + * @vmid: VMID we try to get going again
>> + * @fence: timedout fence
>> + *
>> + * Tries to get a ring proceeding again when it is stuck.
>> + */
>> +bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, unsigned int vmid,
>> +			       struct dma_fence *fence)
>> +{
>> +	ktime_t deadline = ktime_add_us(ktime_get(), 10000);
>> +
>> +	if (!ring->funcs->soft_recovery)
>> +		return false;
>> +
>> +	atomic_inc(&adev->gpu_reset_counter);
>> +	while (!dma_fence_is_signaled(fence) &&
>> +	       ktime_to_ns(ktime_sub(deadline, ktime_get())) > 0)
>> +		ring->funcs->soft_recovery(ring, vmid);
> Hi Christian,
>
> Is it necessary to add a udelay() here?

No, I don't think so.

Christian.

>
> Regards
> Rex
>> +	return dma_fence_is_signaled(fence);
>> +}
>> +
>>   /*
>>    * Debugfs info
>>    */
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> index 409fdd9b9710..9cc239968e40 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> @@ -168,6 +168,8 @@ struct amdgpu_ring_funcs {
>>   	/* priority functions */
>>   	void (*set_priority) (struct amdgpu_ring *ring,
>>   			      enum drm_sched_priority priority);
>> +	/* Try to soft recover the ring to make the fence signal */
>> +	void (*soft_recovery)(struct amdgpu_ring *ring, unsigned vmid);
>>   };
>>
>>   struct amdgpu_ring {
>> @@ -260,6 +262,8 @@ void amdgpu_ring_fini(struct amdgpu_ring *ring);
>>   void amdgpu_ring_emit_reg_write_reg_wait_helper(struct amdgpu_ring *ring,
>>   						uint32_t reg0, uint32_t val0,
>>   						uint32_t reg1, uint32_t val1);
>> +bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, unsigned int vmid,
>> +			       struct dma_fence *fence);
>>
>>   static inline void amdgpu_ring_clear_ring(struct amdgpu_ring *ring)
>>   {
>> --
>> 2.14.1
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx at lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx



More information about the amd-gfx mailing list