[Intel-gfx] [PATCH v4] drm/i915/guc: Clean up locks in GuC

Mon Jan 25 08:01:34 PST 2016

On Fri, Jan 22, 2016 at 10:54:42AM +0000, Tvrtko Ursulin wrote:
> 
> On 03/12/15 00:56, yu.dai at intel.com wrote:
> > From: Alex Dai <yu.dai at intel.com>
> > 
> > For now, remove the spinlocks that protected the GuC's
> > statistics block and work queue; they are only accessed
> > by code that already holds the global struct_mutex, and
> > so are redundant (until the big struct_mutex rewrite!).
> > 
> > The specific problem that the spinlocks caused was that
> > if the work queue was full, the driver would try to
> > spinwait for one jiffy, but with interrupts disabled the
> > jiffy count would not advance, leading to a system hang.
> > The issue was found using test case igt/gem_close_race.
> > 
> > The new version will usleep() instead, still holding
> > the struct_mutex but without any spinlocks.
> > 
> > v4: Reorganize commit message (Dave Gordon)
> > v3: Remove unnecessary whitespace churn
> > v2: Clean up wq_lock too
> > v1: Clean up host2guc lock as well
> > 
> > Signed-off-by: Alex Dai <yu.dai at intel.com>
> > ---
> >   drivers/gpu/drm/i915/i915_debugfs.c        | 12 ++++++------
> >   drivers/gpu/drm/i915/i915_guc_submission.c | 31 ++++++------------------------
> >   drivers/gpu/drm/i915/intel_guc.h           |  4 ----
> >   3 files changed, 12 insertions(+), 35 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> > index a728ff1..d6b7817 100644
> > --- a/drivers/gpu/drm/i915/i915_debugfs.c
> > +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> > @@ -2473,15 +2473,15 @@ static int i915_guc_info(struct seq_file *m, void *data)
> >   	if (!HAS_GUC_SCHED(dev_priv->dev))
> >   		return 0;
> >   
> > +	if (mutex_lock_interruptible(&dev->struct_mutex))
> > +		return 0;
> > +
> >   	/* Take a local copy of the GuC data, so we can dump it at leisure */
> > -	spin_lock(&dev_priv->guc.host2guc_lock);
> >   	guc = dev_priv->guc;
> > -	if (guc.execbuf_client) {
> > -		spin_lock(&guc.execbuf_client->wq_lock);
> > +	if (guc.execbuf_client)
> >   		client = *guc.execbuf_client;
> > -		spin_unlock(&guc.execbuf_client->wq_lock);
> > -	}
> > -	spin_unlock(&dev_priv->guc.host2guc_lock);
> > +
> > +	mutex_unlock(&dev->struct_mutex);
> >   
> >   	seq_printf(m, "GuC total action count: %llu\n", guc.action_count);
> >   	seq_printf(m, "GuC action failure count: %u\n", guc.action_fail);
> > diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
> > index ed9f100..a7f9785 100644
> > --- a/drivers/gpu/drm/i915/i915_guc_submission.c
> > +++ b/drivers/gpu/drm/i915/i915_guc_submission.c
> > @@ -86,7 +86,6 @@ static int host2guc_action(struct intel_guc *guc, u32 *data, u32 len)
> >   		return -EINVAL;
> >   
> >   	intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
> > -	spin_lock(&dev_priv->guc.host2guc_lock);
> >   
> >   	dev_priv->guc.action_count += 1;
> >   	dev_priv->guc.action_cmd = data[0];
> > @@ -119,7 +118,6 @@ static int host2guc_action(struct intel_guc *guc, u32 *data, u32 len)
> >   	}
> >   	dev_priv->guc.action_status = status;
> >   
> > -	spin_unlock(&dev_priv->guc.host2guc_lock);
> >   	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
> >   
> >   	return ret;
> > @@ -292,16 +290,12 @@ static uint32_t select_doorbell_cacheline(struct intel_guc *guc)
> >   	const uint32_t cacheline_size = cache_line_size();
> >   	uint32_t offset;
> >   
> > -	spin_lock(&guc->host2guc_lock);
> > -
> >   	/* Doorbell uses a single cache line within a page */
> >   	offset = offset_in_page(guc->db_cacheline);
> >   
> >   	/* Moving to next cache line to reduce contention */
> >   	guc->db_cacheline += cacheline_size;
> >   
> > -	spin_unlock(&guc->host2guc_lock);
> > -
> >   	DRM_DEBUG_DRIVER("selected doorbell cacheline 0x%x, next 0x%x, linesize %u\n",
> >   			offset, guc->db_cacheline, cacheline_size);
> >   
> > @@ -322,13 +316,11 @@ static uint16_t assign_doorbell(struct intel_guc *guc, uint32_t priority)
> >   	const uint16_t end = start + half;
> >   	uint16_t id;
> >   
> > -	spin_lock(&guc->host2guc_lock);
> >   	id = find_next_zero_bit(guc->doorbell_bitmap, end, start);
> >   	if (id == end)
> >   		id = GUC_INVALID_DOORBELL_ID;
> >   	else
> >   		bitmap_set(guc->doorbell_bitmap, id, 1);
> > -	spin_unlock(&guc->host2guc_lock);
> >   
> >   	DRM_DEBUG_DRIVER("assigned %s priority doorbell id 0x%x\n",
> >   			hi_pri ? "high" : "normal", id);
> > @@ -338,9 +330,7 @@ static uint16_t assign_doorbell(struct intel_guc *guc, uint32_t priority)
> >   
> >   static void release_doorbell(struct intel_guc *guc, uint16_t id)
> >   {
> > -	spin_lock(&guc->host2guc_lock);
> >   	bitmap_clear(guc->doorbell_bitmap, id, 1);
> > -	spin_unlock(&guc->host2guc_lock);
> >   }
> >   
> >   /*
> > @@ -487,16 +477,13 @@ static int guc_get_workqueue_space(struct i915_guc_client *gc, u32 *offset)
> >   	struct guc_process_desc *desc;
> >   	void *base;
> >   	u32 size = sizeof(struct guc_wq_item);
> > -	int ret = 0, timeout_counter = 200;
> > +	int ret = -ETIMEDOUT, timeout_counter = 200;
> >   
> >   	base = kmap_atomic(i915_gem_object_get_page(gc->client_obj, 0));
> >   	desc = base + gc->proc_desc_offset;
> >   
> >   	while (timeout_counter-- > 0) {
> > -		ret = wait_for_atomic(CIRC_SPACE(gc->wq_tail, desc->head,
> > -				gc->wq_size) >= size, 1);
> > -
> > -		if (!ret) {
> > +		if (CIRC_SPACE(gc->wq_tail, desc->head, gc->wq_size) >= size) {
> >   			*offset = gc->wq_tail;
> >   
> >   			/* advance the tail for next workqueue item */
> > @@ -505,7 +492,11 @@ static int guc_get_workqueue_space(struct i915_guc_client *gc, u32 *offset)
> >   
> >   			/* this will break the loop */
> >   			timeout_counter = 0;
> > +			ret = 0;
> >   		}
> > +
> > +		if (timeout_counter)
> > +			usleep_range(1000, 2000);
> 
> Sleeping is illegal while holding kmap_atomic mapping:
> 
> [   34.098798] BUG: scheduling while atomic: gem_close_race/1941/0x00000002
> [   34.098822] Modules linked in: hid_generic usbhid i915 asix usbnet libphy mii i2c_algo_bit drm_kms_helper cfbfillrect syscopyarea cfbimgblt sysfillrect sysimgblt fb_sys_fops cfbcopyarea drm coretemp i2c_hid hid video pinctrl_sunrisepoint pinctrl_intel acpi_pad nls_iso8859_1 e1000e ptp psmouse pps_core ahci libahci
> [   34.098824] CPU: 0 PID: 1941 Comm: gem_close_race Tainted: G     U          4.4.0-160121+ #123
> [   34.098824] Hardware name: Intel Corporation Skylake Client platform/Skylake AIO DDR3L RVP10, BIOS SKLSE2R1.R00.X100.B01.1509220551 09/22/2015
> [   34.098825]  0000000000013e40 ffff880166c27a78 ffffffff81280d02 ffff880172c13e40
> [   34.098826]  ffff880166c27a88 ffffffff810c203a ffff880166c27ac8 ffffffff814ec808
> [   34.098827]  ffff88016b7c6000 ffff880166c28000 00000000000f4240 0000000000000001
> [   34.098827] Call Trace:
> [   34.098831]  [<ffffffff81280d02>] dump_stack+0x4b/0x79
> [   34.098833]  [<ffffffff810c203a>] __schedule_bug+0x41/0x4f
> [   34.098834]  [<ffffffff814ec808>] __schedule+0x5a8/0x690
> [   34.098835]  [<ffffffff814ec927>] schedule+0x37/0x80
> [   34.098836]  [<ffffffff814ef3fd>] schedule_hrtimeout_range_clock+0xad/0x130
> [   34.098837]  [<ffffffff81090be0>] ? hrtimer_init+0x10/0x10
> [   34.098838]  [<ffffffff814ef3f1>] ? schedule_hrtimeout_range_clock+0xa1/0x130
> [   34.098839]  [<ffffffff814ef48e>] schedule_hrtimeout_range+0xe/0x10
> [   34.098840]  [<ffffffff814eef9b>] usleep_range+0x3b/0x40
> [   34.098853]  [<ffffffffa01ec109>] i915_guc_wq_check_space+0x119/0x210 [i915]
> [   34.098861]  [<ffffffffa01da47c>] intel_logical_ring_alloc_request_extras+0x5c/0x70 [i915]
> [   34.098869]  [<ffffffffa01cdbf1>] i915_gem_request_alloc+0x91/0x170 [i915]
> [   34.098875]  [<ffffffffa01c1c07>] i915_gem_do_execbuffer.isra.25+0xbc7/0x12a0 [i915]
> [   34.098882]  [<ffffffffa01cb785>] ? i915_gem_object_get_pages_gtt+0x225/0x3c0 [i915]
> [   34.098889]  [<ffffffffa01d1fb6>] ? i915_gem_pwrite_ioctl+0xd6/0x9f0 [i915]
> [   34.098895]  [<ffffffffa01c2e68>] i915_gem_execbuffer2+0xa8/0x250 [i915]
> [   34.098900]  [<ffffffffa00f65d8>] drm_ioctl+0x258/0x4f0 [drm]
> [   34.098906]  [<ffffffffa01c2dc0>] ? i915_gem_execbuffer+0x340/0x340 [i915]
> [   34.098908]  [<ffffffff8111590d>] do_vfs_ioctl+0x2cd/0x4a0
> [   34.098909]  [<ffffffff8111eac2>] ? __fget+0x72/0xb0
> [   34.098910]  [<ffffffff81115b1c>] SyS_ioctl+0x3c/0x70
> [   34.098911]  [<ffffffff814effd7>] entry_SYSCALL_64_fastpath+0x12/0x6a
> [   34.100208] ------------[ cut here ]------------
> 
> I don't know the code well enough to suggest a suitable fix.

Code really, really, really should be run with full debug tooling (and the
above is one of the most basic ones) enabled.

I think the best approach would be to:
- install GuC firmware on all CI machines. Please coordinate with
  Daniela&Tomi on this, and cc: intel-gfx-ci at eclists.intel.com on any such
  mails.

- Every patch series that touches still disabled code needs to include a
  patch to enable GuC (or whatever it is) by default. Of course that patch
  must have a DONTMERGE or similar tag, if the feature isn't 100% ready
  for prime time yet. This way we can enlist CI to hunt for stuff like the
  above, and it's _really_ good at that.

Alex&Tvrtko, can you please take care of this?

Thanks, Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch