[Intel-gfx] [PATCH] drm/vblank: Fixup and document timestamp update/read barriers

Daniel Vetter daniel at ffwll.ch
Thu Apr 16 01:54:16 PDT 2015


On Wed, Apr 15, 2015 at 11:26:37PM +0200, Mario Kleiner wrote:
> A couple of questions to educate me and one review comment.
> 
> On 04/15/2015 07:34 PM, Daniel Vetter wrote:
> >This was a bit too much cargo-culted, so lets make it solid:
> >- vblank->count doesn't need to be an atomic, writes are always done
> >   under the protection of dev->vblank_time_lock. Switch to an unsigned
> >   long instead and update comments. Note that atomic_read is just a
> >   normal read of a volatile variable, so no need to audit all the
> >   read-side access specifically.
> >
> >- The barriers for the vblank counter seqlock weren't complete: The
> >   read-side was missing the first barrier between the counter read and
> >   the timestamp read, it only had a barrier between the ts and the
> >   counter read. We need both.
> >
> >- Barriers weren't properly documented. Since barriers only work if
> >   you have them on both sides of the transaction it's prudent to
> >   reference where the other side is. To avoid duplicating the
> >   write-side comment 3 times extract a little store_vblank() helper.
> >   In that helper also assert that we do indeed hold
> >   dev->vblank_time_lock, since in some cases the lock is acquired a
> >   few functions up in the callchain.
> >
> >Spotted while reviewing a patch from Chris Wilson to add a fastpath to
> >the vblank_wait ioctl.
> >
> >v2: Add comment to better explain how store_vblank works, suggested by
> >Chris.
> >
> >v3: Peter noticed that as-is the 2nd smp_wmb is redundant with the
> >implicit barrier in the spin_unlock. But that can only be proven by
> >auditing all callers and my point in extracting this little helper was
> >to localize all the locking into just one place. Hence I think that
> >additional optimization is too risky.
> >
> >Cc: Chris Wilson <chris at chris-wilson.co.uk>
> >Cc: Mario Kleiner <mario.kleiner.de at gmail.com>
> >Cc: Ville Syrjälä <ville.syrjala at linux.intel.com>
> >Cc: Michel Dänzer <michel at daenzer.net>
> >Cc: Peter Hurley <peter at hurleysoftware.com>
> >Signed-off-by: Daniel Vetter <daniel.vetter at intel.com>
> >---
> >  drivers/gpu/drm/drm_irq.c | 95 +++++++++++++++++++++++++----------------------
> >  include/drm/drmP.h        |  8 +++-
> >  2 files changed, 57 insertions(+), 46 deletions(-)
> >
> >diff --git a/drivers/gpu/drm/drm_irq.c b/drivers/gpu/drm/drm_irq.c
> >index c8a34476570a..8694b77d0002 100644
> >--- a/drivers/gpu/drm/drm_irq.c
> >+++ b/drivers/gpu/drm/drm_irq.c
> >@@ -74,6 +74,36 @@ module_param_named(vblankoffdelay, drm_vblank_offdelay, int, 0600);
> >  module_param_named(timestamp_precision_usec, drm_timestamp_precision, int, 0600);
> >  module_param_named(timestamp_monotonic, drm_timestamp_monotonic, int, 0600);
> >
> >+static void store_vblank(struct drm_device *dev, int crtc,
> >+			 unsigned vblank_count_inc,
> >+			 struct timeval *t_vblank)
> >+{
> >+	struct drm_vblank_crtc *vblank = &dev->vblank[crtc];
> >+	u32 tslot;
> >+
> >+	assert_spin_locked(&dev->vblank_time_lock);
> >+
> >+	if (t_vblank) {
> >+		/* All writers hold the spinlock, but readers are serialized by
> >+		 * the latching of vblank->count below.
> >+		 */
> >+		tslot = vblank->count + vblank_count_inc;
> >+		vblanktimestamp(dev, crtc, tslot) = *t_vblank;
> >+	}
> >+
> >+	/*
> >+	 * vblank timestamp updates are protected on the write side with
> >+	 * vblank_time_lock, but on the read side done locklessly using a
> >+	 * sequence-lock on the vblank counter. Ensure correct ordering using
> >+	 * memory barriers. We need the barrier both before and also after the
> >+	 * counter update to synchronize with the next timestamp write.
> >+	 * The read-side barriers for this are in drm_vblank_count_and_time.
> >+	 */
> >+	smp_wmb();
> >+	vblank->count += vblank_count_inc;
> >+	smp_wmb();
> >+}
> >+
> >  /**
> >   * drm_update_vblank_count - update the master vblank counter
> >   * @dev: DRM device
> >@@ -93,7 +123,7 @@ module_param_named(timestamp_monotonic, drm_timestamp_monotonic, int, 0600);
> >  static void drm_update_vblank_count(struct drm_device *dev, int crtc)
> >  {
> >  	struct drm_vblank_crtc *vblank = &dev->vblank[crtc];
> >-	u32 cur_vblank, diff, tslot;
> >+	u32 cur_vblank, diff;
> >  	bool rc;
> >  	struct timeval t_vblank;
> >
> >@@ -129,18 +159,12 @@ static void drm_update_vblank_count(struct drm_device *dev, int crtc)
> >  	if (diff == 0)
> >  		return;
> >
> >-	/* Reinitialize corresponding vblank timestamp if high-precision query
> >-	 * available. Skip this step if query unsupported or failed. Will
> >-	 * reinitialize delayed at next vblank interrupt in that case.
> >+	/*
> >+	 * Only reinitialize corresponding vblank timestamp if high-precision query
> >+	 * available and didn't fail. Will reinitialize delayed at next vblank
> >+	 * interrupt in that case.
> >  	 */
> >-	if (rc) {
> >-		tslot = atomic_read(&vblank->count) + diff;
> >-		vblanktimestamp(dev, crtc, tslot) = t_vblank;
> >-	}
> >-
> >-	smp_mb__before_atomic();
> >-	atomic_add(diff, &vblank->count);
> >-	smp_mb__after_atomic();
> >+	store_vblank(dev, crtc, diff, rc ? &t_vblank : NULL);
> >  }
> >
> >  /*
> >@@ -218,7 +242,7 @@ static void vblank_disable_and_save(struct drm_device *dev, int crtc)
> >  	/* Compute time difference to stored timestamp of last vblank
> >  	 * as updated by last invocation of drm_handle_vblank() in vblank irq.
> >  	 */
> >-	vblcount = atomic_read(&vblank->count);
> >+	vblcount = vblank->count;
> >  	diff_ns = timeval_to_ns(&tvblank) -
> >  		  timeval_to_ns(&vblanktimestamp(dev, crtc, vblcount));
> >
> >@@ -234,17 +258,8 @@ static void vblank_disable_and_save(struct drm_device *dev, int crtc)
> >  	 * available. In that case we can't account for this and just
> >  	 * hope for the best.
> >  	 */
> >-	if (vblrc && (abs64(diff_ns) > 1000000)) {
> >-		/* Store new timestamp in ringbuffer. */
> >-		vblanktimestamp(dev, crtc, vblcount + 1) = tvblank;
> >-
> >-		/* Increment cooked vblank count. This also atomically commits
> >-		 * the timestamp computed above.
> >-		 */
> >-		smp_mb__before_atomic();
> >-		atomic_inc(&vblank->count);
> >-		smp_mb__after_atomic();
> >-	}
> >+	if (vblrc && (abs64(diff_ns) > 1000000))
> >+		store_vblank(dev, crtc, 1, &tvblank);
> >
> >  	spin_unlock_irqrestore(&dev->vblank_time_lock, irqflags);
> >  }
> >@@ -852,7 +867,7 @@ u32 drm_vblank_count(struct drm_device *dev, int crtc)
> >
> >  	if (WARN_ON(crtc >= dev->num_crtcs))
> >  		return 0;
> >-	return atomic_read(&vblank->count);
> >+	return vblank->count;
> 
> I wrongly assumed atomic_read would guarantee more than it actually does, so
> please help me to learn something. Why don't we need some smp_rmb() here
> before returning vblank->count? What guarantees that drm_vblank_count() does
> return the latest value assigned to vblank->count in store_vblank()? In
> store_vblank() there is a smp_wmb(), but why don't we need a matching
> smp_rmb() here to benefit from it?

Because atomic_read is unordered and atomic_t is just one cpu word,
atomic_read() is actually the exact same thing as the replacement here.
Essentially drm_vblank_count just gives you a snapshot and makes no
guarantees about ordering, and from a quick look all callers are ok with
that.

> >  }
> >  EXPORT_SYMBOL(drm_vblank_count);
> >
> >@@ -897,16 +912,17 @@ u32 drm_vblank_count_and_time(struct drm_device *dev, int crtc,
> >  	if (WARN_ON(crtc >= dev->num_crtcs))
> >  		return 0;
> >
> >-	/* Read timestamp from slot of _vblank_time ringbuffer
> >-	 * that corresponds to current vblank count. Retry if
> >-	 * count has incremented during readout. This works like
> >-	 * a seqlock.
> >+	/*
> >+	 * Vblank timestamps are read lockless. To ensure consistency the vblank
> >+	 * counter is rechecked and ordering is ensured using memory barriers.
> >+	 * This works like a seqlock. The write-side barriers are in store_vblank.
> >  	 */
> >  	do {
> >-		cur_vblank = atomic_read(&vblank->count);
> >+		cur_vblank = vblank->count;
> >+		smp_rmb();
> >  		*vblanktime = vblanktimestamp(dev, crtc, cur_vblank);
> >  		smp_rmb();
> >-	} while (cur_vblank != atomic_read(&vblank->count));
> >+	} while (cur_vblank != vblank->count);
> >
> 
> Similar question as above. We have a new smp_rmb() after the cur_vblank
> assignment and then after *vblanktime assignment. My original wrong
> assumption was that the first smp_rmb() wouldn't be needed because
> atomic_read() would imply that, and that the compiler/cpu couldn't reorder
> anything here because the *vblanktime assignment depends on cur_vblank from
> the preceding atomic_read.

atomic_t is fully unordered in Linux, which is a big contrast to userspace
where atomics (afaik at least) are by default ordered according to the
load-acquire store-release model. Which means that the cpu and compiler
are free to reorder things however they want (and hence we need explicit
barriers to make sure things are ok).

The other thing to remember is that for unordered access (i.e. everything
which is not explicitly or implicitly a barrier) compiler and cpu can do
whatever they want to with how loads and stores are visible on the
coherency fabric. There is only one restriction: The compiler/cpu is not
allowed to fabricate stores which are not in the code. I.e. if you have
the following code

	b = 0;
	b = 1;
	b = 2;

The only guarantee you really have is that eventually you'll see 2, and
you see nothing else but 0, 1, 2 in between.

The important part is that all the coherency ordering is _only_ from the
cpu up to the coherency fabric. It does _not_ extend to all the other
cpus, so if you want to make an ordered transaction, you have to have
barriers on both the sending and the receiving cpu. Specifically in this
case the following is allowed without the first barrier:

1. read the timestamp, racing with an update (so inconsistent data)
2. read vblank->count
3. smp_rmb();
4. read vblank->count again

So 2&4 read the same vblank count despite the update race. Only if
you add the first smp_rmb() is step 2 guaranteed to happen before step 1.

> But why can we now do the comparison while(cur_vblank != vblank->count)
> without needing something like
> 
> 	new_vblank = vblank->count;
> 	smp_rmb();
>    } while (cur_vblank != new_vblank);
> 
> to make sure the value from the 2nd vblank->count read isn't stale wrt. to
> potential updates from store_vblank()?

You only ever need barriers between different loads/stores and not between
load/stores and computation. So adding a smp_rmb() where you suggested
does exactly nothing.

Well not quite, there is one exception: Compilers are allowed to fabricate
as many loads as they want to. So if your code flow depends upon having a
consistent value (e.g. because you have a bunch of if checks or something
else) then you need to make sure the compiler only loads the value once
using ACCESS_ONCE. But here we only care about equality and only have one
check, so if the compiler goes nuts and reloads the value we don't care
since if it's unequal any value reloaded later on will still be unequal to
cur_vblank. But if you do something more fancy you might indeed need some
additional compiler or full barriers.

> Another question is why the drm_vblank_count_and_time() code ever worked
> without triggering any of my own tests and consistency checks in my
> software, or any of your igt tests? I run my tests very often, but only on
> Intel architecture cpus. I assume the same is true for the igt tests? Is
> there anything specific about Intel cpu's that makes this still work or very
> unlikely to break? Or are the tests insufficient to catch this? Or just
> luck?
> 
> I looked through kernels back to 3.16 and most uses of the function would be
> safe from races due to the locking around it, holding of vblank refcounts,
> or the place and order of execution, e.g., from within drm_handle_vblank().
> But in some tight test loop just calling the drmWaitVblank ioctl to query
> current values i'd expect it to at least occasionally return corrupted
> timestamps, e.g., time jumping forward or backward, etc.?

Luck and the x86 cpu actually being a lot more coherent than the linux
memory model is. x86 has
- fully ordered writes against writes
- fully ordered reads against reads
- the only thing where it reorders is that the cpu can move reads ahead of
  earlier writes (but not the other way round).

So the only risk here is gcc doing something funny, and that only happens
if gcc has a need (register pressure or whatever). The code is simple
enough for that to be practically impossible.

> 
> >  	return cur_vblank;
> >  }
> >@@ -1715,7 +1731,7 @@ bool drm_handle_vblank(struct drm_device *dev, int crtc)
> >  	 */
> >
> >  	/* Get current timestamp and count. */
> >-	vblcount = atomic_read(&vblank->count);
> >+	vblcount = vblank->count;
> >  	drm_get_last_vbltimestamp(dev, crtc, &tvblank, DRM_CALLED_FROM_VBLIRQ);
> >
> >  	/* Compute time difference to timestamp of last vblank */
> >@@ -1731,20 +1747,11 @@ bool drm_handle_vblank(struct drm_device *dev, int crtc)
> >  	 * e.g., due to spurious vblank interrupts. We need to
> >  	 * ignore those for accounting.
> >  	 */
> >-	if (abs64(diff_ns) > DRM_REDUNDANT_VBLIRQ_THRESH_NS) {
> >-		/* Store new timestamp in ringbuffer. */
> >-		vblanktimestamp(dev, crtc, vblcount + 1) = tvblank;
> >-
> >-		/* Increment cooked vblank count. This also atomically commits
> >-		 * the timestamp computed above.
> >-		 */
> >-		smp_mb__before_atomic();
> >-		atomic_inc(&vblank->count);
> >-		smp_mb__after_atomic();
> >-	} else {
> >+	if (abs64(diff_ns) > DRM_REDUNDANT_VBLIRQ_THRESH_NS)
> >+		store_vblank(dev, crtc, 1, &tvblank);
> >+	else
> >  		DRM_DEBUG("crtc %d: Redundant vblirq ignored. diff_ns = %d\n",
> >  			  crtc, (int) diff_ns);
> >-	}
> >
> >  	spin_unlock(&dev->vblank_time_lock);
> >
> >diff --git a/include/drm/drmP.h b/include/drm/drmP.h
> >index 62c40777c009..4c31a2cc5a33 100644
> >--- a/include/drm/drmP.h
> >+++ b/include/drm/drmP.h
> >@@ -686,9 +686,13 @@ struct drm_pending_vblank_event {
> >  struct drm_vblank_crtc {
> >  	struct drm_device *dev;		/* pointer to the drm_device */
> >  	wait_queue_head_t queue;	/**< VBLANK wait queue */
> >-	struct timeval time[DRM_VBLANKTIME_RBSIZE];	/**< timestamp of current count */
> >  	struct timer_list disable_timer;		/* delayed disable timer */
> >-	atomic_t count;			/**< number of VBLANK interrupts */
> >+
> >+	/* vblank counter, protected by dev->vblank_time_lock for writes */
> >+	unsigned long count;
> 
> Why is count an unsigned long (= 64 bit on 64-bit kernels) instead of u32
> when all users of count are u32? Is this intentional?

Well I figured it can't hurt. I can change it to u32 for consistency,
makes sense to do so.
-Daniel

> 
> 
> >+	/* vblank timestamps, protected by dev->vblank_time_lock for writes */
> >+	struct timeval time[DRM_VBLANKTIME_RBSIZE];
> >+
> >  	atomic_t refcount;		/* number of users of vblank interruptsper crtc */
> >  	u32 last;			/* protected by dev->vbl_lock, used */
> >  					/* for wraparound handling */
> >
> 
> Thanks,
> -mario

-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch


More information about the Intel-gfx mailing list