[RFC PATCH] drm: Fix off-by-one races on vblank disable

Andy Lutomirski luto at MIT.EDU
Wed Nov 16 21:39:06 PST 2011


There are two possible races when disabling vblanks.  If the IRQ
fired but the hardware didn't update its counter yet, then we store
too low a hardware counter.  (Sensible hardware never does this.
Apparently not all hardware is sensible.)  If, on the other hand,
the counter updated but the IRQ didn't fire yet, we store too high a
counter.

We handled the former case with a heuristic based on timestamps and
we did not handle the latter case.  By saving a little more state,
we can handle both cases exactly: all we need to do is watch for
changes in the difference between the hardware and software vblank
counts.

Signed-off-by: Andy Lutomirski <luto at mit.edu>
---

Rather than tweaking more things to reduce the chance of hitting a race
while keeping the vblank disable timeout as low as possible, why not
just fix the race?

This compiles but is not very well tested, because I don't know what
tests to run.  I haven't been able to provoke either race on my SNB
laptop.

 drivers/gpu/drm/drm_irq.c |   92 ++++++++++++++++++++++++++++----------------
 include/drm/drmP.h        |    2 +-
 2 files changed, 59 insertions(+), 35 deletions(-)

diff --git a/drivers/gpu/drm/drm_irq.c b/drivers/gpu/drm/drm_irq.c
index 3830e9e..1674a33 100644
--- a/drivers/gpu/drm/drm_irq.c
+++ b/drivers/gpu/drm/drm_irq.c
@@ -56,6 +56,12 @@
  */
 #define DRM_REDUNDANT_VBLIRQ_THRESH_NS 1000000
 
+/* Saved vblank count data, used only in this file. */
+struct drm_vbl_counts {
+	u32 hwcount;	/* hw count at last state save or load */
+	u32 drmcount;	/* drm count at last state save or load */
+};
+
 /**
  * Get interrupt from bus id.
  *
@@ -101,7 +107,8 @@ static void clear_vblank_timestamps(struct drm_device *dev, int crtc)
 static void vblank_disable_and_save(struct drm_device *dev, int crtc)
 {
 	unsigned long irqflags;
-	u32 vblcount;
+	u32 drmcount, hwcount;
+	u32 drm_counts_seen, hw_counts_seen, offset;
 	s64 diff_ns;
 	int vblrc;
 	struct timeval tvblank;
@@ -121,44 +128,53 @@ static void vblank_disable_and_save(struct drm_device *dev, int crtc)
 	/* No further vblank irq's will be processed after
 	 * this point. Get current hardware vblank count and
 	 * vblank timestamp, repeat until they are consistent.
-	 *
-	 * FIXME: There is still a race condition here and in
-	 * drm_update_vblank_count() which can cause off-by-one
-	 * reinitialization of software vblank counter. If gpu
-	 * vblank counter doesn't increment exactly at the leading
-	 * edge of a vblank interval, then we can lose 1 count if
-	 * we happen to execute between start of vblank and the
-	 * delayed gpu counter increment.
 	 */
 	do {
-		dev->last_vblank[crtc] = dev->driver->get_vblank_counter(dev, crtc);
+		hwcount = dev->driver->get_vblank_counter(dev, crtc);
 		vblrc = drm_get_last_vbltimestamp(dev, crtc, &tvblank, 0);
-	} while (dev->last_vblank[crtc] != dev->driver->get_vblank_counter(dev, crtc));
+	} while (hwcount != dev->driver->get_vblank_counter(dev, crtc));
 
 	/* Compute time difference to stored timestamp of last vblank
 	 * as updated by last invocation of drm_handle_vblank() in vblank irq.
 	 */
-	vblcount = atomic_read(&dev->_vblank_count[crtc]);
+	drmcount = atomic_read(&dev->_vblank_count[crtc]);
 	diff_ns = timeval_to_ns(&tvblank) -
-		  timeval_to_ns(&vblanktimestamp(dev, crtc, vblcount));
+		  timeval_to_ns(&vblanktimestamp(dev, crtc, drmcount));
 
-	/* If there is at least 1 msec difference between the last stored
-	 * timestamp and tvblank, then we are currently executing our
-	 * disable inside a new vblank interval, the tvblank timestamp
-	 * corresponds to this new vblank interval and the irq handler
-	 * for this vblank didn't run yet and won't run due to our disable.
-	 * Therefore we need to do the job of drm_handle_vblank() and
-	 * increment the vblank counter by one to account for this vblank.
+	/* We could be off by one in either direction.  If a vblank just
+	 * happened but the IRQ hasn't been handled yet, then drmcount is
+	 * too low by one.  On the other hand, if the GPU fires its vblank
+	 * interrupts *before* updating its counter, then hwcount could
+	 * be too low by one.  (If both happen, they cancel out.)
 	 *
-	 * Skip this step if there isn't any high precision timestamp
-	 * available. In that case we can't account for this and just
-	 * hope for the best.
+	 * Fortunately, we have enough information to figure out what
+	 * happened.  Assuming the hardware counter works right, the
+	 * difference between drmcount and vblcount should be a constant
+	 * (modulo max_vblank_count).  We have both saved values from last
+	 * time we turned the interrupt on.
 	 */
-	if ((vblrc > 0) && (abs64(diff_ns) > 1000000)) {
-		atomic_inc(&dev->_vblank_count[crtc]);
-		smp_mb__after_atomic_inc();
+	drm_counts_seen = drmcount - dev->last_vblank[crtc].drmcount;
+	hw_counts_seen = hwcount - dev->last_vblank[crtc].hwcount;
+
+	offset = (drm_counts_seen - hw_counts_seen) % dev->max_vblank_count;
+	if (offset == 1) {
+		hwcount++;
+		DRM_DEBUG("last_vblank[%d]: hwcount++\n", crtc);
+	} else if (offset == dev->max_vblank_count - 1) {
+		hwcount--;
+		DRM_DEBUG("last_vblank[%d]: hwcount--\n", crtc);
+	} else if (offset != 0) {
+		DRM_DEBUG("last_vblank[%d]: drm_counts_seen = %u, hw_counts_seen = %u, max = %u is unexpected",
+			  crtc, (unsigned)drm_counts_seen,
+			  (unsigned)hw_counts_seen,
+			  (unsigned)dev->max_vblank_count);
+
+		/* Trying to fix it might just make it worse. */
 	}
 
+	dev->last_vblank[crtc].hwcount = hwcount;
+	dev->last_vblank[crtc].drmcount = drmcount;  /* not really necessary */
+
 	/* Invalidate all timestamps while vblank irq's are off. */
 	clear_vblank_timestamps(dev, crtc);
 
@@ -238,7 +254,8 @@ int drm_vblank_init(struct drm_device *dev, int num_crtcs)
 	if (!dev->vblank_enabled)
 		goto err;
 
-	dev->last_vblank = kcalloc(num_crtcs, sizeof(u32), GFP_KERNEL);
+	dev->last_vblank = kcalloc(num_crtcs, sizeof(struct drm_vbl_counts),
+				   GFP_KERNEL);
 	if (!dev->last_vblank)
 		goto err;
 
@@ -268,6 +285,9 @@ int drm_vblank_init(struct drm_device *dev, int num_crtcs)
 		init_waitqueue_head(&dev->vbl_queue[i]);
 		atomic_set(&dev->_vblank_count[i], 0);
 		atomic_set(&dev->vblank_refcount[i], 0);
+		dev->last_vblank[i].drmcount = 0;
+		dev->last_vblank[i].hwcount =
+			dev->driver->get_vblank_counter(dev, i);
 	}
 
 	dev->vblank_disable_allowed = 0;
@@ -410,7 +430,9 @@ int drm_irq_uninstall(struct drm_device *dev)
 	for (i = 0; i < dev->num_crtcs; i++) {
 		DRM_WAKEUP(&dev->vbl_queue[i]);
 		dev->vblank_enabled[i] = 0;
-		dev->last_vblank[i] = dev->driver->get_vblank_counter(dev, i);
+		dev->last_vblank[i].hwcount =
+			dev->driver->get_vblank_counter(dev, i);
+		dev->last_vblank[i].drmcount = drm_vblank_count(dev, i);
 	}
 	spin_unlock_irqrestore(&dev->vbl_lock, irqflags);
 
@@ -841,12 +863,12 @@ static void drm_update_vblank_count(struct drm_device *dev, int crtc)
 	} while (cur_vblank != dev->driver->get_vblank_counter(dev, crtc));
 
 	/* Deal with counter wrap */
-	diff = cur_vblank - dev->last_vblank[crtc];
-	if (cur_vblank < dev->last_vblank[crtc]) {
+	diff = cur_vblank - dev->last_vblank[crtc].hwcount;
+	if (cur_vblank < dev->last_vblank[crtc].hwcount) {
 		diff += dev->max_vblank_count;
 
-		DRM_DEBUG("last_vblank[%d]=0x%x, cur_vblank=0x%x => diff=0x%x\n",
-			  crtc, dev->last_vblank[crtc], cur_vblank, diff);
+		DRM_DEBUG("last_vblank[%d].hwcount=0x%x, cur_vblank=0x%x => diff=0x%x\n",
+			  crtc, dev->last_vblank[crtc].hwcount, cur_vblank, diff);
 	}
 
 	DRM_DEBUG("enabling vblank interrupts on crtc %d, missed %d\n",
@@ -861,8 +883,10 @@ static void drm_update_vblank_count(struct drm_device *dev, int crtc)
 		vblanktimestamp(dev, crtc, tslot) = t_vblank;
 	}
 
+	dev->last_vblank[crtc].hwcount = cur_vblank;
 	smp_mb__before_atomic_inc();
-	atomic_add(diff, &dev->_vblank_count[crtc]);
+	dev->last_vblank[crtc].drmcount =
+		atomic_add_return(diff, &dev->_vblank_count[crtc]);
 	smp_mb__after_atomic_inc();
 }
 
diff --git a/include/drm/drmP.h b/include/drm/drmP.h
index 9b7c2bb..4950c0f 100644
--- a/include/drm/drmP.h
+++ b/include/drm/drmP.h
@@ -1108,7 +1108,7 @@ struct drm_device {
 	spinlock_t vblank_time_lock;    /**< Protects vblank count and time updates during vblank enable/disable */
 	spinlock_t vbl_lock;
 	atomic_t *vblank_refcount;      /* number of users of vblank interruptsper crtc */
-	u32 *last_vblank;               /* protected by dev->vbl_lock, used */
+	struct drm_vbl_counts *last_vblank; /* protected by dev->vbl_lock, used */
 					/* for wraparound handling */
 	int *vblank_enabled;            /* so we don't call enable more than
 					   once per disable */
-- 
1.7.7.1



More information about the dri-devel mailing list