[PATCH] drm/amdgpu: invalidate mmhub semphore workaround in gmc9/gmc10
Felix Kuehling
felix.kuehling at amd.com
Sat Jan 18 00:28:29 UTC 2020
I'm working on the TLB flushing code and noticed a problem with this
commit. See inline ...
On 2019-11-21 10:47 a.m., Changfeng.Zhu wrote:
> From: changzhu <Changfeng.Zhu at amd.com>
>
> It may lose gpuvm invalidate acknowldege state across power-gating off
> cycle. To avoid this issue in gmc9/gmc10 invalidation, add semaphore acquire
> before invalidation and semaphore release after invalidation.
>
> After adding semaphore acquire before invalidation, the semaphore
> register become read-only if another process try to acquire semaphore.
> Then it will not be able to release this semaphore. Then it may cause
> deadlock problem. If this deadlock problem happens, it needs a semaphore
> firmware fix.
>
> Change-Id: I9942a2f451265c1f1038ccfe2f70042c7c8118af
> Signed-off-by: changzhu <Changfeng.Zhu at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 52 ++++++++++++++++++++++++++
> drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 52 ++++++++++++++++++++++++++
> drivers/gpu/drm/amd/amdgpu/soc15.h | 4 +-
> 3 files changed, 106 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index af2615ba52aa..e0104b985c42 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -234,6 +234,27 @@ static void gmc_v10_0_flush_vm_hub(struct amdgpu_device *adev, uint32_t vmid,
> const unsigned eng = 17;
> unsigned int i;
>
> + spin_lock(&adev->gmc.invalidate_lock);
> + /*
> + * It may lose gpuvm invalidate acknowldege state across power-gating
> + * off cycle, add semaphore acquire before invalidation and semaphore
> + * release after invalidation to avoid entering power gated state
> + * to WA the Issue
> + */
> + if (vmhub == AMDGPU_MMHUB_0 ||
> + vmhub == AMDGPU_MMHUB_1) {
> + for (i = 0; i < adev->usec_timeout; i++) {
> + /* a read return value of 1 means semaphore acuqire */
> + tmp = RREG32_NO_KIQ(hub->vm_inv_eng0_sem + eng);
> + if (tmp & 0x1)
> + break;
> + udelay(1);
> + }
> +
> + if (i >= adev->usec_timeout)
> + DRM_ERROR("Timeout waiting for sem acquire in VM flush!\n");
> + }
> +
> WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp);
>
> /*
> @@ -253,6 +274,16 @@ static void gmc_v10_0_flush_vm_hub(struct amdgpu_device *adev, uint32_t vmid,
> udelay(1);
> }
>
> + /*
> + * add semaphore release after invalidation,
> + * write with 0 means semaphore release
> + */
> + if (vmhub == AMDGPU_MMHUB_0 ||
> + vmhub == AMDGPU_MMHUB_1)
> + WREG32_NO_KIQ(hub->vm_inv_eng0_sem + eng, 0);
> +
> + spin_unlock(&adev->gmc.invalidate_lock);
> +
> if (i < adev->usec_timeout)
> return;
>
> @@ -338,6 +369,19 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
> uint32_t req = gmc_v10_0_get_invalidate_req(vmid, 0);
> unsigned eng = ring->vm_inv_eng;
>
> + /*
> + * It may lose gpuvm invalidate acknowldege state across power-gating
> + * off cycle, add semaphore acquire before invalidation and semaphore
> + * release after invalidation to avoid entering power gated state
> + * to WA the Issue
> + */
> +
> + /* a read return value of 1 means semaphore acuqire */
> + if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> + ring->funcs->vmhub == AMDGPU_MMHUB_1)
> + amdgpu_ring_emit_reg_wait(ring,
> + hub->vm_inv_eng0_sem + eng, 0x1, 0x1);
> +
> amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_lo32 + (2 * vmid),
> lower_32_bits(pd_addr));
>
> @@ -348,6 +392,14 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
> hub->vm_inv_eng0_ack + eng,
> req, 1 << vmid);
>
> + /*
> + * add semaphore release after invalidation,
> + * write with 0 means semaphore release
> + */
> + if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> + ring->funcs->vmhub == AMDGPU_MMHUB_1)
> + amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_sem + eng, 0);
> +
> return pd_addr;
> }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index b7f2b184e9b8..816fdd602c85 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
...
tmp = gmc_v9_0_get_invalidate_req(vmid, flush_type);
...
> @@ -455,6 +455,27 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
> }
>
> spin_lock(&adev->gmc.invalidate_lock);
> +
> + /*
> + * It may lose gpuvm invalidate acknowldege state across power-gating
> + * off cycle, add semaphore acquire before invalidation and semaphore
> + * release after invalidation to avoid entering power gated state
> + * to WA the Issue
> + */
> + if (vmhub == AMDGPU_MMHUB_0 ||
> + vmhub == AMDGPU_MMHUB_1) {
> + for (j = 0; j < adev->usec_timeout; j++) {
> + /* a read return value of 1 means semaphore acuqire */
> + tmp = RREG32_NO_KIQ(hub->vm_inv_eng0_sem + eng);
The tmp variable is also used for the request written to the
vm_inv_eng0_req register below. This overwrites that value.
> + if (tmp & 0x1)
> + break;
> + udelay(1);
> + }
> +
> + if (j >= adev->usec_timeout)
> + DRM_ERROR("Timeout waiting for sem acquire in VM flush!\n");
> + }
> +
> WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp);
I think this is broken if you acquired the semaphore above. You should
use a different tmp variable for acquiring the semaphore.
Regards,
Felix
>
> /*
> @@ -470,7 +491,17 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
> break;
> udelay(1);
> }
> +
> + /*
> + * add semaphore release after invalidation,
> + * write with 0 means semaphore release
> + */
> + if (vmhub == AMDGPU_MMHUB_0 ||
> + vmhub == AMDGPU_MMHUB_1)
> + WREG32_NO_KIQ(hub->vm_inv_eng0_sem + eng, 0);
> +
> spin_unlock(&adev->gmc.invalidate_lock);
> +
> if (j < adev->usec_timeout)
> return;
>
> @@ -485,6 +516,19 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
> uint32_t req = gmc_v9_0_get_invalidate_req(vmid, 0);
> unsigned eng = ring->vm_inv_eng;
>
> + /*
> + * It may lose gpuvm invalidate acknowldege state across power-gating
> + * off cycle, add semaphore acquire before invalidation and semaphore
> + * release after invalidation to avoid entering power gated state
> + * to WA the Issue
> + */
> +
> + /* a read return value of 1 means semaphore acuqire */
> + if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> + ring->funcs->vmhub == AMDGPU_MMHUB_1)
> + amdgpu_ring_emit_reg_wait(ring,
> + hub->vm_inv_eng0_sem + eng, 0x1, 0x1);
> +
> amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_lo32 + (2 * vmid),
> lower_32_bits(pd_addr));
>
> @@ -495,6 +539,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
> hub->vm_inv_eng0_ack + eng,
> req, 1 << vmid);
>
> + /*
> + * add semaphore release after invalidation,
> + * write with 0 means semaphore release
> + */
> + if (ring->funcs->vmhub == AMDGPU_MMHUB_0 ||
> + ring->funcs->vmhub == AMDGPU_MMHUB_1)
> + amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_sem + eng, 0);
> +
> return pd_addr;
> }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.h b/drivers/gpu/drm/amd/amdgpu/soc15.h
> index 344280b869c4..d0fb7a67c1a3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/soc15.h
> +++ b/drivers/gpu/drm/amd/amdgpu/soc15.h
> @@ -28,8 +28,8 @@
> #include "nbio_v7_0.h"
> #include "nbio_v7_4.h"
>
> -#define SOC15_FLUSH_GPU_TLB_NUM_WREG 4
> -#define SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT 1
> +#define SOC15_FLUSH_GPU_TLB_NUM_WREG 6
> +#define SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT 3
>
> extern const struct amd_ip_funcs soc15_common_ip_funcs;
>
More information about the amd-gfx
mailing list