[PATCH 13/18] drm/amdgpu:fix driver unloading bug
Monk Liu
Monk.Liu at amd.com
Mon Sep 18 06:11:57 UTC 2017
[SWDEV-126631] - fix hypervisor save_vf fail that occured
after driver removed:
1. Because the KIQ and KCQ were not ummapped, save_vf will fail if driver freed mqd of KIQ and KCQ.
2. KIQ can't be unmapped since RLCV always need it, the bo_free on KIQ should be skipped
3. KCQ can be unmapped, and should be unmapped during hw_fini,
4. RLCV still need to access other mc address from some hw even after driver unloaded,
So we should not unbind gart for VF.
Change-Id: I320487a9a848f41484c5f8cc11be34aca807b424
Signed-off-by: Horace Chen <horace.chen at amd.com>
Signed-off-by: Monk Liu <Monk.Liu at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c | 3 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 5 +++
drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 60 +++++++++++++++++++++++++++++++-
3 files changed, 66 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
index f437008..2fee071 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
@@ -394,7 +394,8 @@ int amdgpu_gart_init(struct amdgpu_device *adev)
*/
void amdgpu_gart_fini(struct amdgpu_device *adev)
{
- if (adev->gart.ready) {
+ /* gart is still used by other hw under SRIOV, don't unbind it */
+ if (adev->gart.ready && !amdgpu_sriov_vf(adev)) {
/* unbind pages */
amdgpu_gart_unbind(adev, 0, adev->gart.num_cpu_pages);
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 4f6c68f..bf6656f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -309,6 +309,11 @@ void amdgpu_gfx_compute_mqd_sw_fini(struct amdgpu_device *adev)
&ring->mqd_ptr);
}
+ /* don't deallocate KIQ mqd because the bo is still used by RLCV even
+ the guest VM is shutdown */
+ if (amdgpu_sriov_vf(adev))
+ return;
+
ring = &adev->gfx.kiq.ring;
kfree(adev->gfx.mec.mqd_backup[AMDGPU_MAX_COMPUTE_RINGS]);
amdgpu_bo_free_kernel(&ring->mqd_obj,
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 44960b3..a577bbc 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -2892,14 +2892,72 @@ static int gfx_v9_0_hw_init(void *handle)
return r;
}
+static int gfx_v9_0_kcq_disable(struct amdgpu_ring *kiq_ring,struct amdgpu_ring *ring)
+{
+ struct amdgpu_device *adev = kiq_ring->adev;
+ uint32_t scratch, tmp = 0;
+ int r, i;
+
+ r = amdgpu_gfx_scratch_get(adev, &scratch);
+ if (r) {
+ DRM_ERROR("Failed to get scratch reg (%d).\n", r);
+ return r;
+ }
+ WREG32(scratch, 0xCAFEDEAD);
+
+ r = amdgpu_ring_alloc(kiq_ring, 10);
+ if (r) {
+ DRM_ERROR("Failed to lock KIQ (%d).\n", r);
+ amdgpu_gfx_scratch_free(adev, scratch);
+ return r;
+ }
+
+ /* unmap queues */
+ amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_UNMAP_QUEUES, 4));
+ amdgpu_ring_write(kiq_ring, /* Q_sel: 0, vmid: 0, engine: 0, num_Q: 1 */
+ PACKET3_UNMAP_QUEUES_ACTION(1) | /* RESET_QUEUES */
+ PACKET3_UNMAP_QUEUES_QUEUE_SEL(0) |
+ PACKET3_UNMAP_QUEUES_ENGINE_SEL(0) |
+ PACKET3_UNMAP_QUEUES_NUM_QUEUES(1));
+ amdgpu_ring_write(kiq_ring, PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET0(ring->doorbell_index));
+ amdgpu_ring_write(kiq_ring, 0);
+ amdgpu_ring_write(kiq_ring, 0);
+ amdgpu_ring_write(kiq_ring, 0);
+ /* write to scratch for completion */
+ amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_SET_UCONFIG_REG, 1));
+ amdgpu_ring_write(kiq_ring, (scratch - PACKET3_SET_UCONFIG_REG_START));
+ amdgpu_ring_write(kiq_ring, 0xDEADBEEF);
+ amdgpu_ring_commit(kiq_ring);
+
+ for (i = 0; i < adev->usec_timeout; i++) {
+ tmp = RREG32(scratch);
+ if (tmp == 0xDEADBEEF)
+ break;
+ DRM_UDELAY(1);
+ }
+ if (i >= adev->usec_timeout) {
+ DRM_ERROR("KCQ disabled failed (scratch(0x%04X)=0x%08X)\n", scratch, tmp);
+ r = -EINVAL;
+ }
+ amdgpu_gfx_scratch_free(adev, scratch);
+ return r;
+}
+
+
static int gfx_v9_0_hw_fini(void *handle)
{
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+ int i, r;
amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0);
amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0);
if (amdgpu_sriov_vf(adev)) {
- pr_debug("For SRIOV client, shouldn't do anything.\n");
+ /* disable KCQ to avoid CPC touch memory not valid anymore */
+ for (i = 0; i < adev->gfx.num_compute_rings; i++) {
+ r = gfx_v9_0_kcq_disable(&adev->gfx.kiq.ring, &adev->gfx.compute_ring[i]);
+ if (r)
+ return r;
+ }
return 0;
}
gfx_v9_0_cp_enable(adev, false);
--
2.7.4
More information about the amd-gfx
mailing list