[RFC 3/5] drm/scheduler: Add a simple TDR test

Tvrtko Ursulin tvrtko.ursulin at igalia.com
Wed Feb 5 10:01:24 UTC 2025


On 04/02/2025 16:21, Christian König wrote:
> Am 03.02.25 um 16:30 schrieb Tvrtko Ursulin:
>> Add a very simple TDR test which submits a single job and verifies that
>> the TDR handling will run if the backend failed to complete the job in
>> time.
> 
> I think I said it before, but I strongly suggest not using TDR as a name 
> in the scheduler at all.
> 
> What the scheduler provides is a simple timeout while waiting for the HW 
> fence to signal.
> 
> That is fundamentally different from the TDR functionality Windows provides, 
> and we have already had people confusing the two.

I did a s/tdr/timeout/ locally.

> Apart from that "yes, please". Those tests are desperately needed.

Cool. Let's see what other people say and whether someone can actually 
review.

Regards,

Tvrtko

>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at igalia.com>
>> Cc: Christian König <christian.koenig at amd.com>
>> Cc: Danilo Krummrich <dakr at kernel.org>
>> Cc: Matthew Brost <matthew.brost at intel.com>
>> Cc: Philipp Stanner <phasta at kernel.org>
>> ---
>>   .../drm/scheduler/tests/drm_mock_scheduler.c  | 12 +++-
>>   .../gpu/drm/scheduler/tests/drm_sched_tests.h |  6 +-
>>   .../scheduler/tests/drm_sched_tests_basic.c   | 64 ++++++++++++++++++-
>>   3 files changed, 76 insertions(+), 6 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/scheduler/tests/drm_mock_scheduler.c 
>> b/drivers/gpu/drm/scheduler/tests/drm_mock_scheduler.c
>> index f1985900a6ba..79b6193ce920 100644
>> --- a/drivers/gpu/drm/scheduler/tests/drm_mock_scheduler.c
>> +++ b/drivers/gpu/drm/scheduler/tests/drm_mock_scheduler.c
>> @@ -160,7 +160,11 @@ static struct dma_fence 
>> *mock_sched_run_job(struct drm_sched_job *sched_job)
>>   static enum drm_gpu_sched_stat
>>   mock_sched_timedout_job(struct drm_sched_job *sched_job)
>>   {
>> -    return DRM_GPU_SCHED_STAT_ENODEV;
>> +    struct drm_mock_sched_job *job = 
>> drm_sched_job_to_mock_job(sched_job);
>> +
>> +    job->flags |= DRM_MOCK_SCHED_JOB_TIMEDOUT;
>> +
>> +    return DRM_GPU_SCHED_STAT_NOMINAL;
>>   }
>>   static void mock_sched_free_job(struct drm_sched_job *sched_job)
>> @@ -174,7 +178,9 @@ static const struct drm_sched_backend_ops 
>> drm_mock_scheduler_ops = {
>>       .free_job = mock_sched_free_job
>>   };
>> -struct drm_mock_scheduler *drm_mock_new_scheduler(struct kunit *test)
>> +struct drm_mock_scheduler *
>> +drm_mock_new_scheduler(struct kunit *test,
>> +               long timeout)
>>   {
>>       struct drm_mock_scheduler *sched;
>>       int ret;
>> @@ -188,7 +194,7 @@ struct drm_mock_scheduler 
>> *drm_mock_new_scheduler(struct kunit *test)
>>                    DRM_SCHED_PRIORITY_COUNT,
>>                    U32_MAX, /* max credits */
>>                    UINT_MAX, /* hang limit */
>> -                 MAX_SCHEDULE_TIMEOUT, /* timeout */
>> +                 timeout,
>>                    NULL, /* timeout wq */
>>                    NULL, /* score */
>>                    "drm-mock-scheduler",
>> diff --git a/drivers/gpu/drm/scheduler/tests/drm_sched_tests.h 
>> b/drivers/gpu/drm/scheduler/tests/drm_sched_tests.h
>> index 421ee2712985..20695f55e453 100644
>> --- a/drivers/gpu/drm/scheduler/tests/drm_sched_tests.h
>> +++ b/drivers/gpu/drm/scheduler/tests/drm_sched_tests.h
>> @@ -35,6 +35,9 @@ struct drm_mock_sched_entity {
>>   struct drm_mock_sched_job {
>>       struct drm_sched_job    base;
>> +#define DRM_MOCK_SCHED_JOB_TIMEDOUT 0x1
>> +    unsigned long        flags;
>> +
>>       struct list_head    link;
>>       struct hrtimer        timer;
>> @@ -65,7 +68,8 @@ drm_sched_job_to_mock_job(struct drm_sched_job 
>> *sched_job)
>>       return container_of(sched_job, struct drm_mock_sched_job, base);
>>   };
>> -struct drm_mock_scheduler *drm_mock_new_scheduler(struct kunit *test);
>> +struct drm_mock_scheduler *drm_mock_new_scheduler(struct kunit *test,
>> +                          long timeout);
>>   void drm_mock_scheduler_fini(struct drm_mock_scheduler *sched);
>>   unsigned int drm_mock_sched_advance(struct drm_mock_scheduler *sched,
>>                       unsigned int num);
>> diff --git a/drivers/gpu/drm/scheduler/tests/drm_sched_tests_basic.c 
>> b/drivers/gpu/drm/scheduler/tests/drm_sched_tests_basic.c
>> index 6fd39bea95b1..eb0d54d00f21 100644
>> --- a/drivers/gpu/drm/scheduler/tests/drm_sched_tests_basic.c
>> +++ b/drivers/gpu/drm/scheduler/tests/drm_sched_tests_basic.c
>> @@ -3,7 +3,7 @@
>>   static int drm_sched_basic_init(struct kunit *test)
>>   {
>> -    test->priv = drm_mock_new_scheduler(test);
>> +    test->priv = drm_mock_new_scheduler(test, MAX_SCHEDULE_TIMEOUT);
>>       return 0;
>>   }
>> @@ -15,6 +15,13 @@ static void drm_sched_basic_exit(struct kunit *test)
>>       drm_mock_scheduler_fini(sched);
>>   }
>> +static int drm_sched_tdr_init(struct kunit *test)
>> +{
>> +    test->priv = drm_mock_new_scheduler(test, HZ);
>> +
>> +    return 0;
>> +}
>> +
>>   static void drm_sched_basic_submit(struct kunit *test)
>>   {
>>       struct drm_mock_scheduler *sched = test->priv;
>> @@ -244,4 +251,57 @@ static struct kunit_suite drm_sched_basic = {
>>       .test_cases = drm_sched_basic_tests,
>>   };
>> -kunit_test_suite(drm_sched_basic);
>> +static void drm_sched_basic_tdr(struct kunit *test)
>> +{
>> +    struct drm_mock_scheduler *sched = test->priv;
>> +    struct drm_mock_sched_entity *entity;
>> +    struct drm_mock_sched_job *job;
>> +    bool done;
>> +
>> +    /*
>> +     * Submit a single job against a scheduler with the timeout 
>> configured
>> +     * and verify that the timeout handling will run if the backend 
>> fails
>> +     * to complete it in time.
>> +     */
>> +
>> +    entity = drm_mock_new_sched_entity(test,
>> +                       DRM_SCHED_PRIORITY_NORMAL,
>> +                       sched);
>> +    job = drm_mock_new_sched_job(test, entity);
>> +
>> +    drm_mock_sched_job_submit(job);
>> +
>> +    done = drm_mock_sched_job_wait_scheduled(job, HZ);
>> +    KUNIT_ASSERT_EQ(test, done, true);
>> +
>> +    done = drm_mock_sched_job_wait_finished(job, HZ / 2);
>> +    KUNIT_ASSERT_EQ(test, done, false);
>> +
>> +    KUNIT_ASSERT_EQ(test,
>> +            job->flags & DRM_MOCK_SCHED_JOB_TIMEDOUT,
>> +            0);
>> +
>> +    done = drm_mock_sched_job_wait_finished(job, HZ);
>> +    KUNIT_ASSERT_EQ(test, done, false);
>> +
>> +    KUNIT_ASSERT_EQ(test,
>> +            job->flags & DRM_MOCK_SCHED_JOB_TIMEDOUT,
>> +            DRM_MOCK_SCHED_JOB_TIMEDOUT);
>> +
>> +    drm_mock_sched_entity_free(entity);
>> +}
>> +
>> +static struct kunit_case drm_sched_tdr_tests[] = {
>> +    KUNIT_CASE(drm_sched_basic_tdr),
>> +    {}
>> +};
>> +
>> +static struct kunit_suite drm_sched_tdr = {
>> +    .name = "drm_sched_basic_tdr_tests",
>> +    .init = drm_sched_tdr_init,
>> +    .exit = drm_sched_basic_exit,
>> +    .test_cases = drm_sched_tdr_tests,
>> +};
>> +
>> +kunit_test_suites(&drm_sched_basic,
>> +          &drm_sched_tdr);
> 


More information about the dri-devel mailing list