[Intel-gfx] [PATCH] drm/i915/selftests: Stop using kthread_stop()
Ville Syrjälä
ville.syrjala at linux.intel.com
Thu Oct 20 14:18:51 UTC 2022
On Thu, Oct 20, 2022 at 02:08:41PM +0100, Tvrtko Ursulin wrote:
> From: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
>
> Since a7c01fa93aeb ("signal: break out of wait loops on kthread_stop()")
> kthread_stop() started asserting a pending signal which wreaks havoc with
> a few of our selftests. Mainly because they are not fully expecting to
> handle signals, but also cutting the intended test runtimes short due
> signal_pending() now returning true (via __igt_timeout), which therefore
> breaks both the patterns of:
>
> kthread_run()
> ..sleep for igt_timeout_ms to allow test to exercise stuff..
> kthread_stop()
>
> And check for errors recorded in the thread.
>
> And also:
>
> Main thread | Test thread
> ---------------+------------------------------
> kthread_run() |
> kthread_stop() | do stuff until __igt_timeout
> | -- exits early due to signal --
>
> Where this kthread_stop() was assumed to have "join" semantics, which
> it would have had if not for the new signal assertion issue.
>
> To recap, threads are now likely to catch a previously impossible
> ERESTARTSYS or EINTR, marking the test as failed, or have a pointlessly
> short run time.
>
> To work around this, start using the kthread_work(er) API, which provides
> an explicit way of waiting for threads to exit. And for cases where
> parent controls the test duration we add explicit signaling which threads
> will now use instead of relying on kthread_should_stop().
>
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> Cc: Ville Syrjälä <ville.syrjala at linux.intel.com>
> ---
> .../drm/i915/gem/selftests/i915_gem_context.c | 118 ++++----
> drivers/gpu/drm/i915/gt/selftest_execlists.c | 48 ++--
> drivers/gpu/drm/i915/gt/selftest_hangcheck.c | 51 ++--
> drivers/gpu/drm/i915/selftests/i915_request.c | 252 +++++++++++-------
> 4 files changed, 281 insertions(+), 188 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c b/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c
> index c6ad67b90e8a..d8864444432b 100644
> --- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c
> @@ -179,97 +179,108 @@ static int live_nop_switch(void *arg)
> }
>
> struct parallel_switch {
> - struct task_struct *tsk;
> + struct kthread_worker *worker;
> + struct kthread_work work;
> struct intel_context *ce[2];
> + int result;
> };
>
> -static int __live_parallel_switch1(void *data)
> +static void __live_parallel_switch1(struct kthread_work *work)
> {
> - struct parallel_switch *arg = data;
> + struct parallel_switch *arg =
> + container_of(work, typeof(*arg), work);
> IGT_TIMEOUT(end_time);
> unsigned long count;
>
> count = 0;
> + arg->result = 0;
> do {
> struct i915_request *rq = NULL;
> - int err, n;
> + int n;
>
> - err = 0;
> - for (n = 0; !err && n < ARRAY_SIZE(arg->ce); n++) {
> + for (n = 0; !arg->result && n < ARRAY_SIZE(arg->ce); n++) {
> struct i915_request *prev = rq;
>
> rq = i915_request_create(arg->ce[n]);
> if (IS_ERR(rq)) {
> i915_request_put(prev);
> - return PTR_ERR(rq);
> + arg->result = PTR_ERR(rq);
> + break;
> }
>
> i915_request_get(rq);
> if (prev) {
> - err = i915_request_await_dma_fence(rq, &prev->fence);
> + arg->result =
> + i915_request_await_dma_fence(rq,
> + &prev->fence);
> i915_request_put(prev);
> }
>
> i915_request_add(rq);
> }
> +
> + if (IS_ERR_OR_NULL(rq))
> + break;
> +
> if (i915_request_wait(rq, 0, HZ) < 0)
> - err = -ETIME;
> + arg->result = -ETIME;
> +
> i915_request_put(rq);
> - if (err)
> - return err;
>
> count++;
> - } while (!__igt_timeout(end_time, NULL));
> + } while (!arg->result && !__igt_timeout(end_time, NULL));
>
> - pr_info("%s: %lu switches (sync)\n", arg->ce[0]->engine->name, count);
> - return 0;
> + pr_info("%s: %lu switches (sync) <%d>\n",
> + arg->ce[0]->engine->name, count, arg->result);
> }
>
> -static int __live_parallel_switchN(void *data)
> +static void __live_parallel_switchN(struct kthread_work *work)
> {
> - struct parallel_switch *arg = data;
> + struct parallel_switch *arg =
> + container_of(work, typeof(*arg), work);
> struct i915_request *rq = NULL;
> IGT_TIMEOUT(end_time);
> unsigned long count;
> int n;
>
> count = 0;
> + arg->result = 0;
> do {
> - for (n = 0; n < ARRAY_SIZE(arg->ce); n++) {
> + for (n = 0; !arg->result && n < ARRAY_SIZE(arg->ce); n++) {
> struct i915_request *prev = rq;
> - int err = 0;
>
> rq = i915_request_create(arg->ce[n]);
> if (IS_ERR(rq)) {
> i915_request_put(prev);
> - return PTR_ERR(rq);
> + arg->result = PTR_ERR(rq);
> + break;
> }
>
> i915_request_get(rq);
> if (prev) {
> - err = i915_request_await_dma_fence(rq, &prev->fence);
> + arg->result =
> + i915_request_await_dma_fence(rq,
> + &prev->fence);
> i915_request_put(prev);
> }
>
> i915_request_add(rq);
> - if (err) {
> - i915_request_put(rq);
> - return err;
> - }
> }
>
> count++;
> - } while (!__igt_timeout(end_time, NULL));
> - i915_request_put(rq);
> + } while (!arg->result && !__igt_timeout(end_time, NULL));
>
> - pr_info("%s: %lu switches (many)\n", arg->ce[0]->engine->name, count);
> - return 0;
> + if (!IS_ERR_OR_NULL(rq))
> + i915_request_put(rq);
> +
> + pr_info("%s: %lu switches (many) <%d>\n",
> + arg->ce[0]->engine->name, count, arg->result);
> }
>
> static int live_parallel_switch(void *arg)
> {
> struct drm_i915_private *i915 = arg;
> - static int (* const func[])(void *arg) = {
> + static void (* const func[])(struct kthread_work *) = {
> __live_parallel_switch1,
> __live_parallel_switchN,
> NULL,
> @@ -277,7 +288,7 @@ static int live_parallel_switch(void *arg)
> struct parallel_switch *data = NULL;
> struct i915_gem_engines *engines;
> struct i915_gem_engines_iter it;
> - int (* const *fn)(void *arg);
> + void (* const *fn)(struct kthread_work *);
> struct i915_gem_context *ctx;
> struct intel_context *ce;
> struct file *file;
> @@ -348,9 +359,22 @@ static int live_parallel_switch(void *arg)
> }
> }
>
> + for (n = 0; n < count; n++) {
> + struct kthread_worker *worker;
> +
> + if (!data[n].ce[0])
> + continue;
> +
> + worker = kthread_create_worker(0, "igt/parallel:%s",
> + data[n].ce[0]->engine->name);
> + if (IS_ERR(worker))
> + goto out;
> +
> + data[n].worker = worker;
> + }
> +
> for (fn = func; !err && *fn; fn++) {
> struct igt_live_test t;
> - int n;
>
> err = igt_live_test_begin(&t, i915, __func__, "");
> if (err)
> @@ -360,30 +384,17 @@ static int live_parallel_switch(void *arg)
> if (!data[n].ce[0])
> continue;
>
> - data[n].tsk = kthread_run(*fn, &data[n],
> - "igt/parallel:%s",
> - data[n].ce[0]->engine->name);
> - if (IS_ERR(data[n].tsk)) {
> - err = PTR_ERR(data[n].tsk);
> - break;
> - }
> - get_task_struct(data[n].tsk);
> + data[n].result = 0;
> + kthread_init_work(&data[n].work, *fn);
> + kthread_queue_work(data[n].worker, &data[n].work);
> }
>
> - yield(); /* start all threads before we kthread_stop() */
> -
> for (n = 0; n < count; n++) {
> - int status;
> -
> - if (IS_ERR_OR_NULL(data[n].tsk))
> - continue;
> -
> - status = kthread_stop(data[n].tsk);
> - if (status && !err)
> - err = status;
> -
> - put_task_struct(data[n].tsk);
> - data[n].tsk = NULL;
> + if (data[n].ce[0]) {
> + kthread_flush_work(&data[n].work);
> + if (data[n].result && !err)
> + err = data[n].result;
> + }
> }
>
> if (igt_live_test_end(&t))
> @@ -399,6 +410,9 @@ static int live_parallel_switch(void *arg)
> intel_context_unpin(data[n].ce[m]);
> intel_context_put(data[n].ce[m]);
> }
> +
> + if (data[n].worker)
> + kthread_destroy_worker(data[n].worker);
> }
> kfree(data);
> out_file:
> diff --git a/drivers/gpu/drm/i915/gt/selftest_execlists.c b/drivers/gpu/drm/i915/gt/selftest_execlists.c
> index 56b7d5b5fea0..2c7c053a8808 100644
> --- a/drivers/gpu/drm/i915/gt/selftest_execlists.c
> +++ b/drivers/gpu/drm/i915/gt/selftest_execlists.c
> @@ -3473,12 +3473,14 @@ static int random_priority(struct rnd_state *rnd)
>
> struct preempt_smoke {
> struct intel_gt *gt;
> + struct kthread_work work;
> struct i915_gem_context **contexts;
> struct intel_engine_cs *engine;
> struct drm_i915_gem_object *batch;
> unsigned int ncontext;
> struct rnd_state prng;
> unsigned long count;
> + int result;
> };
>
> static struct i915_gem_context *smoke_context(struct preempt_smoke *smoke)
> @@ -3538,34 +3540,31 @@ static int smoke_submit(struct preempt_smoke *smoke,
> return err;
> }
>
> -static int smoke_crescendo_thread(void *arg)
> +static void smoke_crescendo_work(struct kthread_work *work)
> {
> - struct preempt_smoke *smoke = arg;
> + struct preempt_smoke *smoke = container_of(work, typeof(*smoke), work);
> IGT_TIMEOUT(end_time);
> unsigned long count;
>
> count = 0;
> do {
> struct i915_gem_context *ctx = smoke_context(smoke);
> - int err;
>
> - err = smoke_submit(smoke,
> - ctx, count % I915_PRIORITY_MAX,
> - smoke->batch);
> - if (err)
> - return err;
> + smoke->result = smoke_submit(smoke, ctx,
> + count % I915_PRIORITY_MAX,
> + smoke->batch);
>
> count++;
> - } while (count < smoke->ncontext && !__igt_timeout(end_time, NULL));
> + } while (!smoke->result && count < smoke->ncontext &&
> + !__igt_timeout(end_time, NULL));
>
> smoke->count = count;
> - return 0;
> }
>
> static int smoke_crescendo(struct preempt_smoke *smoke, unsigned int flags)
> #define BATCH BIT(0)
> {
> - struct task_struct *tsk[I915_NUM_ENGINES] = {};
> + struct kthread_worker *worker[I915_NUM_ENGINES] = {};
> struct preempt_smoke *arg;
> struct intel_engine_cs *engine;
> enum intel_engine_id id;
> @@ -3576,6 +3575,8 @@ static int smoke_crescendo(struct preempt_smoke *smoke, unsigned int flags)
> if (!arg)
> return -ENOMEM;
>
> + memset(arg, 0, I915_NUM_ENGINES * sizeof(*arg));
kcalloc()?
> +
> for_each_engine(engine, smoke->gt, id) {
> arg[id] = *smoke;
> arg[id].engine = engine;
> @@ -3583,31 +3584,28 @@ static int smoke_crescendo(struct preempt_smoke *smoke, unsigned int flags)
> arg[id].batch = NULL;
> arg[id].count = 0;
>
> - tsk[id] = kthread_run(smoke_crescendo_thread, arg,
> - "igt/smoke:%d", id);
> - if (IS_ERR(tsk[id])) {
> - err = PTR_ERR(tsk[id]);
> + worker[id] = kthread_create_worker(0, "igt/smoke:%d", id);
> + if (IS_ERR(worker[id])) {
> + err = PTR_ERR(worker[id]);
> break;
> }
> - get_task_struct(tsk[id]);
> - }
>
> - yield(); /* start all threads before we kthread_stop() */
> + kthread_init_work(&arg[id].work, smoke_crescendo_work);
> + kthread_queue_work(worker[id], &arg[id].work);
> + }
>
> count = 0;
> for_each_engine(engine, smoke->gt, id) {
> - int status;
> -
> - if (IS_ERR_OR_NULL(tsk[id]))
> + if (IS_ERR_OR_NULL(worker[id]))
> continue;
>
> - status = kthread_stop(tsk[id]);
> - if (status && !err)
> - err = status;
> + kthread_flush_work(&arg[id].work);
> + if (arg[id].result && !err)
> + err = arg[id].result;
>
> count += arg[id].count;
>
> - put_task_struct(tsk[id]);
> + kthread_destroy_worker(worker[id]);
> }
>
> pr_info("Submitted %lu crescendo:%x requests across %d engines and %d contexts\n",
> diff --git a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
> index 7f3bb1d34dfb..71263058a7b0 100644
> --- a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
> +++ b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
> @@ -866,10 +866,13 @@ static int igt_reset_active_engine(void *arg)
> }
>
> struct active_engine {
> - struct task_struct *task;
> + struct kthread_worker *worker;
> + struct kthread_work work;
> struct intel_engine_cs *engine;
> unsigned long resets;
> unsigned int flags;
> + bool stop;
> + int result;
> };
>
> #define TEST_ACTIVE BIT(0)
> @@ -900,10 +903,10 @@ static int active_request_put(struct i915_request *rq)
> return err;
> }
>
> -static int active_engine(void *data)
> +static void active_engine(struct kthread_work *work)
> {
> I915_RND_STATE(prng);
> - struct active_engine *arg = data;
> + struct active_engine *arg = container_of(work, typeof(*arg), work);
> struct intel_engine_cs *engine = arg->engine;
> struct i915_request *rq[8] = {};
> struct intel_context *ce[ARRAY_SIZE(rq)];
> @@ -913,16 +916,17 @@ static int active_engine(void *data)
> for (count = 0; count < ARRAY_SIZE(ce); count++) {
> ce[count] = intel_context_create(engine);
> if (IS_ERR(ce[count])) {
> - err = PTR_ERR(ce[count]);
> - pr_err("[%s] Create context #%ld failed: %d!\n", engine->name, count, err);
> + arg->result = PTR_ERR(ce[count]);
> + pr_err("[%s] Create context #%ld failed: %d!\n",
> + engine->name, count, arg->result);
> while (--count)
> intel_context_put(ce[count]);
> - return err;
> + return;
> }
> }
>
> count = 0;
> - while (!kthread_should_stop()) {
> + while (!READ_ONCE(arg->stop)) {
> unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
> struct i915_request *old = rq[idx];
> struct i915_request *new;
> @@ -967,7 +971,7 @@ static int active_engine(void *data)
> intel_context_put(ce[count]);
> }
>
> - return err;
> + arg->result = err;
> }
>
> static int __igt_reset_engines(struct intel_gt *gt,
> @@ -1022,7 +1026,7 @@ static int __igt_reset_engines(struct intel_gt *gt,
>
> memset(threads, 0, sizeof(*threads) * I915_NUM_ENGINES);
> for_each_engine(other, gt, tmp) {
> - struct task_struct *tsk;
> + struct kthread_worker *worker;
>
> threads[tmp].resets =
> i915_reset_engine_count(global, other);
> @@ -1036,19 +1040,21 @@ static int __igt_reset_engines(struct intel_gt *gt,
> threads[tmp].engine = other;
> threads[tmp].flags = flags;
>
> - tsk = kthread_run(active_engine, &threads[tmp],
> - "igt/%s", other->name);
> - if (IS_ERR(tsk)) {
> - err = PTR_ERR(tsk);
> - pr_err("[%s] Thread spawn failed: %d!\n", engine->name, err);
> + worker = kthread_create_worker(0, "igt/%s",
> + other->name);
> + if (IS_ERR(worker)) {
> + err = PTR_ERR(worker);
> + pr_err("[%s] Worker create failed: %d!\n",
> + engine->name, err);
> goto unwind;
> }
>
> - threads[tmp].task = tsk;
> - get_task_struct(tsk);
> - }
> + threads[tmp].worker = worker;
>
> - yield(); /* start all threads before we begin */
> + kthread_init_work(&threads[tmp].work, active_engine);
> + kthread_queue_work(threads[tmp].worker,
> + &threads[tmp].work);
> + }
>
> st_engine_heartbeat_disable_no_pm(engine);
> GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id,
> @@ -1197,17 +1203,20 @@ static int __igt_reset_engines(struct intel_gt *gt,
> for_each_engine(other, gt, tmp) {
> int ret;
>
> - if (!threads[tmp].task)
> + if (!threads[tmp].worker)
> continue;
>
> - ret = kthread_stop(threads[tmp].task);
> + WRITE_ONCE(threads[tmp].stop, true);
> + kthread_flush_work(&threads[tmp].work);
Could perhaps optimize things a bit by flagging all the threads
to stop before doing any of the flush_work()s. And maybe also do all
the init stuff before queuing any works. But I don't know if that's
worth the hassle.
Looks reasonable enough to me.
Reviewed-by: Ville Syrjälä <ville.syrjala at linux.intel.com>
--
Ville Syrjälä
Intel
More information about the Intel-gfx
mailing list