[Intel-gfx] [PATCH 2/2] drm/i915/guc: Add a selftest for FAST_REQUEST errors

Fri Nov 10 02:01:02 UTC 2023

On 11/9/2023 12:33, Daniele Ceraolo Spurio wrote:
> On 11/6/2023 3:59 PM, John.C.Harrison at Intel.com wrote:
>> From: John Harrison <John.C.Harrison at Intel.com>
>>
>> There is a mechanism for reporting errors from fire and forget H2G
>> messages. This is the only way to find out about almost any error in
>> the GuC backend submission path. So it would be useful to know that it
>> is working.
>>
>> Signed-off-by: John Harrison <John.C.Harrison at Intel.com>
>> ---
>>   drivers/gpu/drm/i915/gt/uc/intel_guc.h    |   4 +
>>   drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c |   9 ++
>>   drivers/gpu/drm/i915/gt/uc/selftest_guc.c | 122 ++++++++++++++++++++++
>>   3 files changed, 135 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h 
>> b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
>> index 2b6dfe62c8f2a..e22c12ce245ad 100644
>> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
>> @@ -297,6 +297,10 @@ struct intel_guc {
>>        * @number_guc_id_stolen: The number of guc_ids that have been 
>> stolen
>>        */
>>       int number_guc_id_stolen;
>> +    /**
>> +     * @fast_response_selftest: Backdoor to CT handler for fast 
>> response selftest
>> +     */
>> +    u32 fast_response_selftest;
>>   #endif
>>   };
>>   diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c 
>> b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
>> index 89e314b3756bb..9d958afb78b7f 100644
>> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
>> @@ -1076,6 +1076,15 @@ static int ct_handle_response(struct 
>> intel_guc_ct *ct, struct ct_incoming_msg *r
>>           found = true;
>>           break;
>>       }
>> +
>> +#ifdef CONFIG_DRM_I915_SELFTEST
>> +    if (!found && ct_to_guc(ct)->fast_response_selftest) {
>> +        CT_DEBUG(ct, "Assuming unsolicited response due to 
>> FAST_REQUEST selftest\n");
>> +        ct_to_guc(ct)->fast_response_selftest++;
>> +        found = 1;
>
> found = true ? it's the same thing, but it's cleaner to assign boolean 
> values to bool variables
Doh.

>
>> +    }
>> +#endif
>> +
>>       if (!found) {
>>           CT_ERROR(ct, "Unsolicited response message: len %u, data 
>> %#x (fence %u, last %u)\n",
>>                len, hxg[0], fence, ct->requests.last_fence);
>> diff --git a/drivers/gpu/drm/i915/gt/uc/selftest_guc.c 
>> b/drivers/gpu/drm/i915/gt/uc/selftest_guc.c
>> index bfb72143566f6..97fbbb396336c 100644
>> --- a/drivers/gpu/drm/i915/gt/uc/selftest_guc.c
>> +++ b/drivers/gpu/drm/i915/gt/uc/selftest_guc.c
>> @@ -286,11 +286,133 @@ static int intel_guc_steal_guc_ids(void *arg)
>>       return ret;
>>   }
>>   +/*
>> + * Send a context schedule H2G message with an invalid context id.
>> + * This should generate a GUC_RESULT_INVALID_CONTEXT response.
>> + */
>> +static int bad_h2g(struct intel_guc *guc)
>> +{
>> +    u32 action[3], len = 0;
>
> AFAICS This is a 2 DW command, so you can use action[2].
Yup. Copy and paste bug.

>
>> +
>> +    action[len++] = INTEL_GUC_ACTION_SCHED_CONTEXT;
>> +    action[len++] = 0x12345678;
>> +
>> +    return intel_guc_send_nb(guc, action, len, 0);
>> +}
>> +
>> +/*
>> + * Set a spinner running to make sure the system is alive and active,
>> + * then send a bad but asynchronous H2G command and wait to see if an
>> + * error response is returned. If no response is received or if the
>> + * spinner dies then the test will fail.
>> + */
>> +#define FAST_RESPONSE_TIMEOUT_MS    1000
>> +static int intel_guc_fast_request(void *arg)
>> +{
>> +    struct intel_gt *gt = arg;
>> +    struct intel_context *ce;
>> +    struct igt_spinner spin;
>> +    struct i915_request *rq;
>> +    intel_wakeref_t wakeref;
>> +    struct intel_engine_cs *engine = 
>> intel_selftest_find_any_engine(gt);
>> +    ktime_t before, now, delta;
>> +    bool spinning = false;
>> +    u64 delta_ms;
>> +    int ret = 0;
>> +
>> +    if (!engine)
>> +        return 0;
>> +
>> +    wakeref = intel_runtime_pm_get(gt->uncore->rpm);
>> +
>> +    ce = intel_context_create(engine);
>> +    if (IS_ERR(ce)) {
>> +        ret = PTR_ERR(ce);
>> +        gt_err(gt, "Failed to create spinner request: %pe\n", ce);
>> +        goto err_pm;
>> +    }
>> +
>> +    ret = igt_spinner_init(&spin, engine->gt);
>> +    if (ret) {
>> +        gt_err(gt, "Failed to create spinner: %pe\n", ERR_PTR(ret));
>> +        goto err_pm;
>> +    }
>> +    spinning = true;
>> +
>> +    rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
>> +    intel_context_put(ce);
>> +    if (IS_ERR(rq)) {
>> +        ret = PTR_ERR(rq);
>> +        gt_err(gt, "Failed to create spinner request: %pe\n", rq);
>> +        goto err_spin;
>> +    }
>> +
>> +    ret = request_add_spin(rq, &spin);
>> +    if (ret) {
>> +        gt_err(gt, "Failed to add Spinner request: %pe\n", 
>> ERR_PTR(ret));
>> +        goto err_rq;
>> +    }
>> +
>> +    gt->uc.guc.fast_response_selftest = 1;
>> +
>> +    ret = bad_h2g(&gt->uc.guc);
>> +    if (ret) {
>> +        gt_err(gt, "Failed to send H2G: %pe\n", ERR_PTR(ret));
>> +        goto err_rq;
>> +    }
>> +
>> +    before = ktime_get();
>> +    while (gt->uc.guc.fast_response_selftest == 1) {
>> +        ret = i915_request_wait(rq, 0, 1);
>> +        if (ret != -ETIME) {
>> +            gt_err(gt, "Request wait failed: %pe\n", ERR_PTR(ret));
>> +            goto err_rq;
>> +        }
>> +        now = ktime_get();
>> +        delta = ktime_sub(now, before);
>> +        delta_ms = ktime_to_ms(delta);
>> +
>> +        if (delta_ms > FAST_RESPONSE_TIMEOUT_MS) {
>> +            gt_err(gt, "Timed out waiting for fast request error!\n");
>> +            ret = -ETIME;
>> +            goto err_rq;
>> +        }
>> +    }
>
>
> This seems a bit convoluted. Why not just wait for either the request 
> completion or the fast_response_selftest value to change?
> something like:
>
> ret = wait_for(fast_response_selftest != 1 || i915_request_completed(rq),
>                FAST_RESPONSE_TIMEOUT_MS);
>
> if (ret || i915_request_completed(rq))
>     // error
Um. Because my brain wasn't working when I knocked this up?

John.

>
>
> Daniele
>
>> +
>> +    if (gt->uc.guc.fast_response_selftest != 2) {
>> +        gt_err(gt, "Unexpected fast response count: %d\n",
>> +               gt->uc.guc.fast_response_selftest);
>> +        goto err_rq;
>> +    }
>> +
>> +    igt_spinner_end(&spin);
>> +    spinning = false;
>> +
>> +    ret = intel_selftest_wait_for_rq(rq);
>> +    if (ret) {
>> +        gt_err(gt, "Request failed to complete: %pe\n", ERR_PTR(ret));
>> +        goto err_rq;
>> +    }
>> +
>> +err_rq:
>> +    i915_request_put(rq);
>> +
>> +err_spin:
>> +    if (spinning)
>> +        igt_spinner_end(&spin);
>> +    igt_spinner_fini(&spin);
>> +
>> +err_pm:
>> +    intel_runtime_pm_put(gt->uncore->rpm, wakeref);
>> +    return ret;
>> +}
>> +
>>   int intel_guc_live_selftests(struct drm_i915_private *i915)
>>   {
>>       static const struct i915_subtest tests[] = {
>>           SUBTEST(intel_guc_scrub_ctbs),
>>           SUBTEST(intel_guc_steal_guc_ids),
>> +        SUBTEST(intel_guc_fast_request),
>>       };
>>       struct intel_gt *gt = to_gt(i915);
>