[PATCH 1/2] drm/amdgpu: make duplicated EOP packet for GFX7/8 have real content

Mon Jun 17 13:09:08 UTC 2024

Am 17.06.24 um 15:03 schrieb Icenowy Zheng:
> 在 2024-06-17星期一的 14:35 +0200，Christian König写道：
>> Am 17.06.24 um 12:58 schrieb Icenowy Zheng:
>>> The duplication of EOP packets for GFX7/8, with the former one have
>>> seq-1 written and the latter one have seq written, seems to confuse
>>> some
>>> hardware platform (e.g. Loongson 7A series PCIe controllers).
>>>
>>> Make the content of the duplicated EOP packet the same with the
>>> real
>>> one, only masking any possible interrupts.
>> Well completely NAK to that, exactly that disables the workaround.
>>
>> The CPU needs to see two different values written here.
> Why do the CPU need to see two different values here? Only the second
> packet will raise an interrupt before and after applying this patch,
> and the first packet's result should just be overriden on ordinary
> platforms. The CPU won't see the first one, until it's polling for the
> address for a very short interval, so short that the GPU CP couldn't
> execute 2 commands.

Yes exactly that. We need to make two writes, one with the old value 
(seq - 1) and a second with the real value (seq).

Otherwise it is possible that a polling CPU would see the sequence 
before the second EOP is issued with results in incoherent view of memory.

> Or do you mean the GPU needs to see two different values being written,
> or they will be merged into only one write request?
>
> Please give out more information about this workaround, otherwise the
> GPU hang problem on Loongson platforms will persist.

Well if Loongson can't handle two consecutive write operations to the 
same address with different values then you have a massive platform bug.

That is something which can happen all the time throughout the operation.

Regards,
Christian.

>
>> Regards,
>> Christian.
>>
>>> Fixes: bf26da927a1c ("drm/amdgpu: add cache flush workaround to
>>> gfx8 emit_fence")
>>> Fixes: a2e73f56fa62 ("drm/amdgpu: Add support for CIK parts")
>>> Signed-off-by: Icenowy Zheng <uwu at icenowy.me>
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c | 12 +++++-------
>>>    drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 12 ++++--------
>>>    2 files changed, 9 insertions(+), 15 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>>> index 541dbd70d8c75..778f27f1a34fe 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>>> @@ -2117,9 +2117,8 @@ static void
>>> gfx_v7_0_ring_emit_fence_gfx(struct amdgpu_ring *ring, u64 addr,
>>>    {
>>>          bool write64bit = flags & AMDGPU_FENCE_FLAG_64BIT;
>>>          bool int_sel = flags & AMDGPU_FENCE_FLAG_INT;
>>> -       /* Workaround for cache flush problems. First send a dummy
>>> EOP
>>> -        * event down the pipe with seq one below.
>>> -        */
>>> +
>>> +       /* Workaround for cache flush problems, send EOP twice. */
>>>          amdgpu_ring_write(ring, PACKET3(PACKET3_EVENT_WRITE_EOP,
>>> 4));
>>>          amdgpu_ring_write(ring, (EOP_TCL1_ACTION_EN |
>>>                                   EOP_TC_ACTION_EN |
>>> @@ -2127,11 +2126,10 @@ static void
>>> gfx_v7_0_ring_emit_fence_gfx(struct amdgpu_ring *ring, u64 addr,
>>>                                   EVENT_INDEX(5)));
>>>          amdgpu_ring_write(ring, addr & 0xfffffffc);
>>>          amdgpu_ring_write(ring, (upper_32_bits(addr) & 0xffff) |
>>> -                               DATA_SEL(1) | INT_SEL(0));
>>> -       amdgpu_ring_write(ring, lower_32_bits(seq - 1));
>>> -       amdgpu_ring_write(ring, upper_32_bits(seq - 1));
>>> +                               DATA_SEL(write64bit ? 2 : 1) |
>>> INT_SEL(0));
>>> +       amdgpu_ring_write(ring, lower_32_bits(seq));
>>> +       amdgpu_ring_write(ring, upper_32_bits(seq));
>>>    
>>> -       /* Then send the real EOP event down the pipe. */
>>>          amdgpu_ring_write(ring, PACKET3(PACKET3_EVENT_WRITE_EOP,
>>> 4));
>>>          amdgpu_ring_write(ring, (EOP_TCL1_ACTION_EN |
>>>                                   EOP_TC_ACTION_EN |
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> index 2f0e72caee1af..39a7d60f1fd69 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> @@ -6153,9 +6153,7 @@ static void
>>> gfx_v8_0_ring_emit_fence_gfx(struct amdgpu_ring *ring, u64 addr,
>>>          bool write64bit = flags & AMDGPU_FENCE_FLAG_64BIT;
>>>          bool int_sel = flags & AMDGPU_FENCE_FLAG_INT;
>>>    
>>> -       /* Workaround for cache flush problems. First send a dummy
>>> EOP
>>> -        * event down the pipe with seq one below.
>>> -        */
>>> +       /* Workaround for cache flush problems, send EOP twice. */
>>>          amdgpu_ring_write(ring, PACKET3(PACKET3_EVENT_WRITE_EOP,
>>> 4));
>>>          amdgpu_ring_write(ring, (EOP_TCL1_ACTION_EN |
>>>                                   EOP_TC_ACTION_EN |
>>> @@ -6164,12 +6162,10 @@ static void
>>> gfx_v8_0_ring_emit_fence_gfx(struct amdgpu_ring *ring, u64 addr,
>>>                                   EVENT_INDEX(5)));
>>>          amdgpu_ring_write(ring, addr & 0xfffffffc);
>>>          amdgpu_ring_write(ring, (upper_32_bits(addr) & 0xffff) |
>>> -                               DATA_SEL(1) | INT_SEL(0));
>>> -       amdgpu_ring_write(ring, lower_32_bits(seq - 1));
>>> -       amdgpu_ring_write(ring, upper_32_bits(seq - 1));
>>> +                         DATA_SEL(write64bit ? 2 : 1) |
>>> INT_SEL(0));
>>> +       amdgpu_ring_write(ring, lower_32_bits(seq));
>>> +       amdgpu_ring_write(ring, upper_32_bits(seq));
>>>    
>>> -       /* Then send the real EOP event down the pipe:
>>> -        * EVENT_WRITE_EOP - flush caches, send int */
>>>          amdgpu_ring_write(ring, PACKET3(PACKET3_EVENT_WRITE_EOP,
>>> 4));
>>>          amdgpu_ring_write(ring, (EOP_TCL1_ACTION_EN |
>>>                                   EOP_TC_ACTION_EN |