[Intel-gfx] [PATCH] drm/i915/selftests: Avoid repeatedly harming the same innocent context

Sun Apr 1 02:45:31 UTC 2018

Hi Chris,

Thank you for the patch! Here is something that could perhaps be improved:

[auto build test WARNING on drm-intel/for-linux-next]
[also build test WARNING on v4.16-rc7 next-20180329]
[If your patch was applied to the wrong git tree, please drop us a note to help improve the system.]

url:    https://github.com/0day-ci/linux/commits/Chris-Wilson/drm-i915-selftests-Avoid-repeatedly-harming-the-same-innocent-context/20180401-022503
base:   git://anongit.freedesktop.org/drm-intel for-linux-next
reproduce:
        # apt-get install sparse
        make ARCH=x86_64 allmodconfig
        make C=1 CF=-D__CHECK_ENDIAN__

sparse warnings: (new ones prefixed by >>)

   drivers/gpu/drm/i915/selftests/intel_hangcheck.c:988:33: sparse: undefined identifier 'GEM_TRACE_DUMP'
>> drivers/gpu/drm/i915/selftests/intel_hangcheck.c:988:47: sparse: call with no type!
   In file included from drivers/gpu/drm/i915/intel_hangcheck.c:465:0:
   drivers/gpu/drm/i915/selftests/intel_hangcheck.c: In function 'igt_reset_queue':
   drivers/gpu/drm/i915/selftests/intel_hangcheck.c:988:5: error: implicit declaration of function 'GEM_TRACE_DUMP'; did you mean 'GEM_TRACE'? [-Werror=implicit-function-declaration]
        GEM_TRACE_DUMP();
        ^~~~~~~~~~~~~~
        GEM_TRACE
   cc1: some warnings being treated as errors

vim +988 drivers/gpu/drm/i915/selftests/intel_hangcheck.c

   922	
   923	static int igt_reset_queue(void *arg)
   924	{
   925		struct drm_i915_private *i915 = arg;
   926		struct intel_engine_cs *engine;
   927		enum intel_engine_id id;
   928		struct hang h;
   929		int err;
   930	
   931		/* Check that we replay pending requests following a hang */
   932	
   933		global_reset_lock(i915);
   934	
   935		mutex_lock(&i915->drm.struct_mutex);
   936		err = hang_init(&h, i915);
   937		if (err)
   938			goto unlock;
   939	
   940		for_each_engine(engine, i915, id) {
   941			struct i915_request *prev;
   942			IGT_TIMEOUT(end_time);
   943			unsigned int count;
   944	
   945			if (!intel_engine_can_store_dword(engine))
   946				continue;
   947	
   948			prev = hang_create_request(&h, engine);
   949			if (IS_ERR(prev)) {
   950				err = PTR_ERR(prev);
   951				goto fini;
   952			}
   953	
   954			i915_request_get(prev);
   955			__i915_request_add(prev, true);
   956	
   957			count = 0;
   958			do {
   959				struct i915_request *rq;
   960				unsigned int reset_count;
   961	
   962				rq = hang_create_request(&h, engine);
   963				if (IS_ERR(rq)) {
   964					err = PTR_ERR(rq);
   965					goto fini;
   966				}
   967	
   968				i915_request_get(rq);
   969				__i915_request_add(rq, true);
   970	
   971				/*
   972				 * XXX We don't handle resetting the kernel context
   973				 * very well. If we trigger a device reset twice in
   974				 * quick succession while the kernel context is
   975				 * executing, we may end up skipping the breadcrumb.
   976				 * This is really only a problem for the selftest as
   977				 * normally there is a large interlude between resets
   978				 * (hangcheck), or we focus on resetting just one
   979				 * engine and so avoid repeatedly resetting innocents.
   980				 */
   981				err = wait_for_others(i915, engine);
   982				if (err) {
   983					pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
   984					       __func__, engine->name);
   985					i915_request_put(rq);
   986					i915_request_put(prev);
   987	
 > 988					GEM_TRACE_DUMP();
   989					i915_gem_set_wedged(i915);
   990					goto fini;
   991				}
   992	
   993				if (!wait_for_hang(&h, prev)) {
   994					struct drm_printer p = drm_info_printer(i915->drm.dev);
   995	
   996					pr_err("%s(%s): Failed to start request %x, at %x\n",
   997					       __func__, engine->name,
   998					       prev->fence.seqno, hws_seqno(&h, prev));
   999					intel_engine_dump(engine, &p,
  1000							  "%s\n", engine->name);
  1001	
  1002					i915_request_put(rq);
  1003					i915_request_put(prev);
  1004	
  1005					i915_reset(i915, 0);
  1006					i915_gem_set_wedged(i915);
  1007	
  1008					err = -EIO;
  1009					goto fini;
  1010				}
  1011	
  1012				reset_count = fake_hangcheck(prev);
  1013	
  1014				i915_reset(i915, I915_RESET_QUIET);
  1015	
  1016				GEM_BUG_ON(test_bit(I915_RESET_HANDOFF,
  1017						    &i915->gpu_error.flags));
  1018	
  1019				if (prev->fence.error != -EIO) {
  1020					pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
  1021					       prev->fence.error);
  1022					i915_request_put(rq);
  1023					i915_request_put(prev);
  1024					err = -EINVAL;
  1025					goto fini;
  1026				}
  1027	
  1028				if (rq->fence.error) {
  1029					pr_err("Fence error status not zero [%d] after unrelated reset\n",
  1030					       rq->fence.error);
  1031					i915_request_put(rq);
  1032					i915_request_put(prev);
  1033					err = -EINVAL;
  1034					goto fini;
  1035				}
  1036	
  1037				if (i915_reset_count(&i915->gpu_error) == reset_count) {
  1038					pr_err("No GPU reset recorded!\n");
  1039					i915_request_put(rq);
  1040					i915_request_put(prev);
  1041					err = -EINVAL;
  1042					goto fini;
  1043				}
  1044	
  1045				i915_request_put(prev);
  1046				prev = rq;
  1047				count++;
  1048			} while (time_before(jiffies, end_time));
  1049			pr_info("%s: Completed %d resets\n", engine->name, count);
  1050	
  1051			*h.batch = MI_BATCH_BUFFER_END;
  1052			i915_gem_chipset_flush(i915);
  1053	
  1054			i915_request_put(prev);
  1055	
  1056			err = flush_test(i915, I915_WAIT_LOCKED);
  1057			if (err)
  1058				break;
  1059		}
  1060	
  1061	fini:
  1062		hang_fini(&h);
  1063	unlock:
  1064		mutex_unlock(&i915->drm.struct_mutex);
  1065		global_reset_unlock(i915);
  1066	
  1067		if (i915_terminally_wedged(&i915->gpu_error))
  1068			return -EIO;
  1069	
  1070		return err;
  1071	}
  1072	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation