[PATCH 4/4] drm/amdgpu: busywait KIQ register accessing

Fri Oct 13 10:21:45 UTC 2017

Am 13.10.2017 um 11:35 schrieb Ding, Pixel:
> On 13/10/2017, 5:16 PM, "Christian König" <ckoenig.leichtzumerken at gmail.com> wrote:
>
>
>
>> Am 13.10.2017 um 10:26 schrieb Pixel Ding:
>>> From: pding <Pixel.Ding at amd.com>
>>>
>>> Register accessing is performed when IRQ is disabled. Never sleep in
>>> this function.
>>>
>>> Known issue: dead sleep in many use cases of index/data registers.
>>>
>>> Signed-off-by: pding <Pixel.Ding at amd.com>
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  1 +
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  8 ++---
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  |  2 +-
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c    |  1 +
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h   |  1 +
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c   | 55 ++++++++++++++++++------------
>>>    6 files changed, 40 insertions(+), 28 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> index ca212ef..f9de756 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> @@ -886,6 +886,7 @@ struct amdgpu_kiq {
>>>    	u64			eop_gpu_addr;
>>>    	struct amdgpu_bo	*eop_obj;
>>>    	struct mutex		ring_mutex;
>> You can remove the ring_mutex if you don't use it any more.
> [Pixel] KFD still use the mutex, I didn’t change it also to spin lock, should I?

Yes, when they use the mutex as well we need to change that to.

Otherwise we run into bad trouble when both try to access the same ring.

BTW: Do they also use the fence? That would be rather problematic.

>>> +	spinlock_t              ring_lock;
>>>    	struct amdgpu_ring	ring;
>>>    	struct amdgpu_irq_src	irq;
>>>    };
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index ab8f0d6..1197274 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -109,10 +109,8 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
>>>    {
>>>    	uint32_t ret;
>>>    
>>> -	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev)) {
>>> -		BUG_ON(in_interrupt());
>>> +	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
>>>    		return amdgpu_virt_kiq_rreg(adev, reg);
>>> -	}
>>>    
>>>    	if ((reg * 4) < adev->rmmio_size && !(acc_flags & AMDGPU_REGS_IDX))
>>>    		ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
>>> @@ -137,10 +135,8 @@ void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
>>>    		adev->last_mm_index = v;
>>>    	}
>>>    
>>> -	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev)) {
>>> -		BUG_ON(in_interrupt());
>>> +	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
>>>    		return amdgpu_virt_kiq_wreg(adev, reg, v);
>>> -	}
>>>    
>>>    	if ((reg * 4) < adev->rmmio_size && !(acc_flags & AMDGPU_REGS_IDX))
>>>    		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>> index 2044758..c6c7bf3 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>> @@ -109,7 +109,7 @@ static void amdgpu_fence_write(struct amdgpu_ring *ring, u32 seq)
>>>     * Reads a fence value from memory (all asics).
>>>     * Returns the value of the fence read from memory.
>>>     */
>>> -static u32 amdgpu_fence_read(struct amdgpu_ring *ring)
>>> +u32 amdgpu_fence_read(struct amdgpu_ring *ring)
>>>    {
>>>    	struct amdgpu_fence_driver *drv = &ring->fence_drv;
>>>    	u32 seq = 0;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>> index 4f6c68f..46fa88c 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>> @@ -186,6 +186,7 @@ int amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev,
>>>    	int r = 0;
>>>    
>>>    	mutex_init(&kiq->ring_mutex);
>>> +	spin_lock_init(&kiq->ring_lock);
>>>    
>>>    	r = amdgpu_wb_get(adev, &adev->virt.reg_val_offs);
>>>    	if (r)
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> index af8e544..a4fa923 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> @@ -89,6 +89,7 @@ int amdgpu_fence_driver_start_ring(struct amdgpu_ring *ring,
>>>    void amdgpu_fence_driver_suspend(struct amdgpu_device *adev);
>>>    void amdgpu_fence_driver_resume(struct amdgpu_device *adev);
>>>    int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **fence);
>>> +u32 amdgpu_fence_read(struct amdgpu_ring *ring);
>>>    void amdgpu_fence_process(struct amdgpu_ring *ring);
>>>    int amdgpu_fence_wait_empty(struct amdgpu_ring *ring);
>>>    unsigned amdgpu_fence_count_emitted(struct amdgpu_ring *ring);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>> index ab05121..9757df1 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>> @@ -22,7 +22,7 @@
>>>     */
>>>    
>>>    #include "amdgpu.h"
>>> -#define MAX_KIQ_REG_WAIT	100000
>>> +#define MAX_KIQ_REG_WAIT	100000000 /* in usecs */
>> You want to busy wait 100 seconds for an result?
>>
>> Forget it that is way to long you will certainly run into problems with
>> the NMI much earlier.
>> [Pixel] thanks, would change it to 10s.

Still way to long. I think the maximum we can have here is 1s, maybe 
even only 100ms.

>>
>>>    
>>>    int amdgpu_allocate_static_csa(struct amdgpu_device *adev)
>>>    {
>>> @@ -113,28 +113,32 @@ void amdgpu_virt_init_setting(struct amdgpu_device *adev)
>>>    
>>>    uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>>>    {
>>> -	signed long r;
>>> -	uint32_t val;
>>> -	struct dma_fence *f;
>>> +	signed long timeout = usecs_to_jiffies(MAX_KIQ_REG_WAIT);
>>> +	unsigned long flags;
>>> +	uint32_t val, seq, wait_seq;
>>>    	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>    	struct amdgpu_ring *ring = &kiq->ring;
>>>    
>>>    	BUG_ON(!ring->funcs->emit_rreg);
>>>    
>>> -	mutex_lock(&kiq->ring_mutex);
>>> +	spin_lock_irqsave(&kiq->ring_lock, flags);
>>>    	amdgpu_ring_alloc(ring, 32);
>>>    	amdgpu_ring_emit_rreg(ring, reg);
>>> -	amdgpu_fence_emit(ring, &f);
>>> +	wait_seq = ++ring->fence_drv.sync_seq;
>>> +	amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr,
>>> +			       wait_seq, AMDGPU_FENCE_FLAG_INT);
>> Do you still need to send an interrupt when you busy wait? I don't think so.
> [Pixel]I want to let the IRQ handler to update the last_seq in case of that KIQ is used by other functionality with fence. Is it necessary?

Good point. In this case just keep it for now.

Might as well be that the KFD still needs the fence code.

Please double check that, cause that would make things much more 
complicated.

>>>    	amdgpu_ring_commit(ring);
>>> -	mutex_unlock(&kiq->ring_mutex);
>>> -
>>> -	r = dma_fence_wait_timeout(f, false, msecs_to_jiffies(MAX_KIQ_REG_WAIT));
>>> -	dma_fence_put(f);
>>> -	if (r < 1) {
>>> -		DRM_ERROR("wait for kiq fence error: %ld.\n", r);
>>> +	spin_unlock_irqrestore(&kiq->ring_lock, flags);
>>> +	do {
>>> +		seq = amdgpu_fence_read(ring);
>>> +		udelay(5);
>>> +		timeout -= 5;
>>> +	} while (seq < wait_seq && timeout > 0);
>> You need to correctly handle sequence wrap around here.
> [Pixel] yes thanks, but I didn’t see similar case handling in normal fence handling codepath, Is there? So I assumed 32bit fence seq could not overflow for years.

The normal fence handling code uses a circular buffer, so overflow is 
actually welcome there and doesn't need special handling.

But 2^32 register writes sounds like something we might actually hit 
sooner or later.

Regards,
Christian.

>>> +
>>> +	if (timeout < 1) {
>>> +		DRM_ERROR("wait for kiq fence error: %ld\n", timeout);
>>>    		return ~0;
>>>    	}
>>> -
>>>    	val = adev->wb.wb[adev->virt.reg_val_offs];
>>>    
>>>    	return val;
>>> @@ -142,24 +146,33 @@ uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>>>    
>>>    void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
>>>    {
>>> -	signed long r;
>>> +	signed long timeout = usecs_to_jiffies(MAX_KIQ_REG_WAIT);
>>> +	unsigned long flags;
>>> +	uint32_t seq, wait_seq;
>>>    	struct dma_fence *f;
>>>    	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>    	struct amdgpu_ring *ring = &kiq->ring;
>>>    
>>>    	BUG_ON(!ring->funcs->emit_wreg);
>>>    
>>> -	mutex_lock(&kiq->ring_mutex);
>>> +	spin_lock_irqsave(&kiq->ring_lock, flags);
>>>    	amdgpu_ring_alloc(ring, 32);
>>>    	amdgpu_ring_emit_wreg(ring, reg, v);
>>> -	amdgpu_fence_emit(ring, &f);
>>> +	/* amdgpu_fence_emit(ring, &f); */
>>> +	wait_seq = ++ring->fence_drv.sync_seq;
>>> +	amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr,
>>> +			       wait_seq, AMDGPU_FENCE_FLAG_INT);
>>>    	amdgpu_ring_commit(ring);
>>> -	mutex_unlock(&kiq->ring_mutex);
>>> +	spin_unlock_irqrestore(&kiq->ring_lock, flags);
>>> +
>>> +	do {
>>> +		seq = amdgpu_fence_read(ring);
>>> +		udelay(5);
>>> +		timeout -= 5;
>>> +	} while (seq < wait_seq && timeout > 0);
>> Same here. It would probably be better to have a function for this in
>> amdgpu_fence.c
> [Pixel] get it, will add a wrapper.
>> Regards,
>> Christian.
>>
>>>    
>>> -	r = dma_fence_wait_timeout(f, false, msecs_to_jiffies(MAX_KIQ_REG_WAIT));
>>> -	if (r < 1)
>>> -		DRM_ERROR("wait for kiq fence error: %ld.\n", r);
>>> -	dma_fence_put(f);
>>> +	if (timeout < 1)
>>> +		DRM_ERROR("wait for kiq fence error: %ld\n", timeout);
>>>    }
>>>    
>>>    /**
>>
>> —
>> Sincerely Yours,
>> Pixel
>>
>>
>>
>>