[PATCH] drm/amdgpu: deprecate guilty handling
Christian König
ckoenig.leichtzumerken at gmail.com
Fri Sep 20 13:58:55 UTC 2024
The guilty handling tried to establish a second way of signaling problems with
the GPU back to userspace. This caused quite a bunch of issue we had to work
around, especially lifetime issues with the drm_sched_entity.
Just drop the handling altogether and use the dma_fence based approach instead.
Signed-off-by: Christian König <christian.koenig at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 5 -----
drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 25 ++++++++++++++++++++--
drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h | 1 -
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 9 +-------
4 files changed, 24 insertions(+), 16 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 1e475eb01417..5c8a6396d924 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -59,11 +59,6 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p,
if (!p->ctx)
return -EINVAL;
- if (atomic_read(&p->ctx->guilty)) {
- amdgpu_ctx_put(p->ctx);
- return -ECANCELED;
- }
-
amdgpu_sync_create(&p->sync);
drm_exec_init(&p->exec, DRM_EXEC_INTERRUPTIBLE_WAIT |
DRM_EXEC_IGNORE_DUPLICATES, 0);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
index c43d1b6e5d66..39fe7cc6e34e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
@@ -250,7 +250,7 @@ static int amdgpu_ctx_init_entity(struct amdgpu_ctx *ctx, u32 hw_ip,
}
r = drm_sched_entity_init(&entity->entity, drm_prio, scheds, num_scheds,
- &ctx->guilty);
+ NULL);
if (r)
goto error_free_entity;
@@ -572,6 +572,27 @@ static int amdgpu_ctx_query(struct amdgpu_device *adev,
#define AMDGPU_RAS_COUNTE_DELAY_MS 3000
+static bool amdgpu_ctx_guilty(struct amdgpu_ctx *ctx)
+{
+ int i, j;
+
+ for (i = 0; i < AMDGPU_HW_IP_NUM; ++i) {
+ for (j = 0; j < amdgpu_ctx_num_entities[i]; ++j) {
+ struct amdgpu_ctx_entity *ctx_entity;
+
+ ctx_entity = ctx->entities[i][j];
+ if (ctx_entity)
+ continue;
+
+ if (drm_sched_entity_error(&ctx_entity->entity) ==
+ -ETIME)
+ return true;
+ }
+ }
+
+ return false;
+}
+
static int amdgpu_ctx_query2(struct amdgpu_device *adev,
struct amdgpu_fpriv *fpriv, uint32_t id,
union drm_amdgpu_ctx_out *out)
@@ -600,7 +621,7 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev,
if (ctx->generation != amdgpu_vm_generation(adev, &fpriv->vm))
out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_VRAMLOST;
- if (atomic_read(&ctx->guilty))
+ if (amdgpu_ctx_guilty(ctx))
out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_GUILTY;
if (amdgpu_in_reset(adev))
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
index 85376baaa92f..45569cce484e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
@@ -53,7 +53,6 @@ struct amdgpu_ctx {
bool preamble_presented;
int32_t init_priority;
int32_t override_priority;
- atomic_t guilty;
unsigned long ras_counter_ce;
unsigned long ras_counter_ue;
uint32_t stable_pstate;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index df19271130c6..26f72b8afde9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5330,14 +5330,10 @@ int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
struct amdgpu_reset_context *reset_context)
{
- int i, r = 0;
- struct amdgpu_job *job = NULL;
struct amdgpu_device *tmp_adev = reset_context->reset_req_dev;
bool need_full_reset =
test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
-
- if (reset_context->reset_req_dev == adev)
- job = reset_context->job;
+ int i, r;
if (amdgpu_sriov_vf(adev))
amdgpu_virt_pre_reset(adev);
@@ -5362,9 +5358,6 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
amdgpu_fence_driver_isr_toggle(adev, false);
- if (job && job->vm)
- drm_sched_increase_karma(&job->base);
-
r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
/* If reset handler not implemented, continue; otherwise return */
if (r == -EOPNOTSUPP)
--
2.34.1
More information about the amd-gfx
mailing list