[PATCH 4/4] drm/vmwgfx: Rework locking around register accesses

Thomas Hellstrom thellstrom at vmware.com
Fri Oct 30 02:42:46 PDT 2015


Relax the required locking so that, instead of disabling irqs around
register accesses, we only disable bottom halves (tasklets). Also let
the caller perform the locking itself using utility functions, so that
a number of register accesses can be combined within a single critical
section. Implement this for capability reads and command buffer
submission.
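
For illustration, the caller-side pattern now looks roughly like the
following sketch (based on the capability-read path in this patch;
__vmw_write() and __vmw_read() require dev_priv->hw_lock to already
be held):

	/* One critical section covering both register accesses. */
	vmw_hw_lock(dev_priv, true); /* true: also disable bottom halves */
	__vmw_write(dev_priv, SVGA_REG_DEV_CAP, SVGA3D_DEVCAP_3D);
	result = __vmw_read(dev_priv, SVGA_REG_DEV_CAP);
	vmw_hw_unlock(dev_priv, true);

The vmw_write() / vmw_read() wrappers remain for single accesses; they
take and drop the lock internally.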

Signed-off-by: Thomas Hellstrom <thellstrom at vmware.com>
Reviewed-by: Sinclair Yeh <syeh at vmware.com>
---
 drivers/gpu/drm/vmwgfx/vmwgfx_cmdbuf.c |   8 +-
 drivers/gpu/drm/vmwgfx/vmwgfx_drv.c    |   9 +--
 drivers/gpu/drm/vmwgfx/vmwgfx_drv.h    | 130 ++++++++++++++++++++++++++++-----
 drivers/gpu/drm/vmwgfx/vmwgfx_fence.c  |   6 +-
 drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c   |  40 +++++++---
 drivers/gpu/drm/vmwgfx/vmwgfx_ioctl.c  |  20 ++---
 drivers/gpu/drm/vmwgfx/vmwgfx_irq.c    |   2 +-
 7 files changed, 163 insertions(+), 52 deletions(-)

diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_cmdbuf.c b/drivers/gpu/drm/vmwgfx/vmwgfx_cmdbuf.c
index 8a76821..b8d2436 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_cmdbuf.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_cmdbuf.c
@@ -290,18 +290,20 @@ void vmw_cmdbuf_header_free(struct vmw_cmdbuf_header *header)
  */
 static int vmw_cmdbuf_header_submit(struct vmw_cmdbuf_header *header)
 {
-	struct vmw_cmdbuf_man *man = header->man;
+	struct vmw_private *dev_priv = header->man->dev_priv;
 	u32 val;
 
 	if (sizeof(header->handle) > 4)
 		val = (header->handle >> 32);
 	else
 		val = 0;
-	vmw_write(man->dev_priv, SVGA_REG_COMMAND_HIGH, val);
+	vmw_hw_lock(dev_priv, true);
+	__vmw_write(dev_priv, SVGA_REG_COMMAND_HIGH, val);
 
 	val = (header->handle & 0xFFFFFFFFULL);
 	val |= header->cb_context & SVGA_CB_CONTEXT_MASK;
-	vmw_write(man->dev_priv, SVGA_REG_COMMAND_LOW, val);
+	__vmw_write(dev_priv, SVGA_REG_COMMAND_LOW, val);
+	vmw_hw_unlock(dev_priv, true);
 
 	return header->cb_header->status;
 }
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
index a09cf85..2a5496a 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
@@ -631,7 +631,6 @@ static int vmw_driver_load(struct drm_device *dev, unsigned long chipset)
 	ttm_lock_init(&dev_priv->reservation_sem);
 	spin_lock_init(&dev_priv->hw_lock);
 	spin_lock_init(&dev_priv->waiter_lock);
-	spin_lock_init(&dev_priv->cap_lock);
 	spin_lock_init(&dev_priv->svga_lock);
 
 	for (i = vmw_res_context; i < vmw_res_max; ++i) {
@@ -854,10 +853,10 @@ static int vmw_driver_load(struct drm_device *dev, unsigned long chipset)
 	}
 
 	if (dev_priv->has_mob) {
-		spin_lock(&dev_priv->cap_lock);
-		vmw_write(dev_priv, SVGA_REG_DEV_CAP, SVGA3D_DEVCAP_DX);
-		dev_priv->has_dx = !!vmw_read(dev_priv, SVGA_REG_DEV_CAP);
-		spin_unlock(&dev_priv->cap_lock);
+		vmw_hw_lock(dev_priv, true);
+		__vmw_write(dev_priv, SVGA_REG_DEV_CAP, SVGA3D_DEVCAP_DX);
+		dev_priv->has_dx = !!__vmw_read(dev_priv, SVGA_REG_DEV_CAP);
+		vmw_hw_unlock(dev_priv, true);
 	}
 
 
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
index a8ae9df..f7dcbcc 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
@@ -385,7 +385,6 @@ struct vmw_private {
 	bool has_gmr;
 	bool has_mob;
 	spinlock_t hw_lock;
-	spinlock_t cap_lock;
 	bool has_dx;
 
 	/*
@@ -546,34 +545,127 @@ static inline struct vmw_master *vmw_master(struct drm_master *master)
 	return (struct vmw_master *) master->driver_priv;
 }
 
-/*
- * The locking here is fine-grained, so that it is performed once
- * for every read- and write operation. This is of course costly, but we
- * don't perform much register access in the timing critical paths anyway.
- * Instead we have the extra benefit of being sure that we don't forget
- * the hw lock around register accesses.
+/**
+ * vmw_hw_lock - Perform necessary locking for register access
+ *
+ * @dev_priv: Pointer to device private structure
+ * @lock_bh: Whether to disable bottom halves. Set to true unless irqs are disabled.
  */
-static inline void vmw_write(struct vmw_private *dev_priv,
-			     unsigned int offset, uint32_t value)
+static inline void vmw_hw_lock(struct vmw_private *dev_priv,
+			       bool lock_bh)
 {
-	unsigned long irq_flags;
+	if (lock_bh)
+		spin_lock_bh(&dev_priv->hw_lock);
+	else
+		spin_lock(&dev_priv->hw_lock);
+}
 
-	spin_lock_irqsave(&dev_priv->hw_lock, irq_flags);
+/**
+ * vmw_hw_unlock - Perform necessary unlocking after register access
+ *
+ * @dev_priv: Pointer to device private structure
+ * @unlock_bh: Re-enable bottom halves. Must be set to true if @lock_bh
+ * was set to true in the corresponding vmw_hw_lock() call.
+ */
+static inline void vmw_hw_unlock(struct vmw_private *dev_priv,
+				 bool unlock_bh)
+{
+	if (unlock_bh)
+		spin_unlock_bh(&dev_priv->hw_lock);
+	else
+		spin_unlock(&dev_priv->hw_lock);
+}
+
+/**
+ * __vmw_write - Write a value to a device register with external locking.
+ *
+ * @dev_priv: Pointer to the device private structure
+ * @offset: Device register offset
+ * @value: Value to write
+ *
+ * The caller must ensure that @dev_priv::hw_lock is held when
+ * calling this function, and that we're safe from racing
+ * tasklets.
+ */
+static inline void __vmw_write(struct vmw_private *dev_priv,
+			       unsigned int offset, uint32_t value)
+{
+#ifdef CONFIG_LOCKDEP
+	lockdep_assert_held_once(&dev_priv->hw_lock.rlock);
+	WARN_ON_ONCE(!(in_softirq() || irqs_disabled()));
+#endif
 	outl(offset, dev_priv->io_start + VMWGFX_INDEX_PORT);
 	outl(value, dev_priv->io_start + VMWGFX_VALUE_PORT);
-	spin_unlock_irqrestore(&dev_priv->hw_lock, irq_flags);
 }
 
-static inline uint32_t vmw_read(struct vmw_private *dev_priv,
-				unsigned int offset)
+/**
+ * __vmw_read - Read a value from a device register with external locking.
+ *
+ * @dev_priv: Pointer to the device private structure
+ * @offset: Device register offset
+ *
+ * The caller must ensure that @dev_priv::hw_lock is held when
+ * calling this function, and that we're safe from racing
+ * tasklets.
+ * Returns: The value read from the register.
+ */
+static inline uint32_t __vmw_read(struct vmw_private *dev_priv,
+				  unsigned int offset)
 {
-	unsigned long irq_flags;
 	u32 val;
 
-	spin_lock_irqsave(&dev_priv->hw_lock, irq_flags);
+#ifdef CONFIG_LOCKDEP
+	lockdep_assert_held_once(&dev_priv->hw_lock.rlock);
+	WARN_ON_ONCE(!(in_softirq() || irqs_disabled()));
+#endif
 	outl(offset, dev_priv->io_start + VMWGFX_INDEX_PORT);
 	val = inl(dev_priv->io_start + VMWGFX_VALUE_PORT);
-	spin_unlock_irqrestore(&dev_priv->hw_lock, irq_flags);
+
+	return val;
+}
+
+/**
+ * vmw_write - Write a value to a device register
+ *
+ * @dev_priv: Pointer to the device private structure
+ * @offset: Device register offset
+ * @value: Value to write
+ *
+ * This function must not be called with irqs disabled, since its
+ * call to vmw_hw_unlock() re-enables bottom halves.
+ */
+static inline void vmw_write(struct vmw_private *dev_priv,
+			     unsigned int offset, uint32_t value)
+{
+#ifdef CONFIG_LOCKDEP
+	WARN_ON_ONCE(irqs_disabled());
+#endif
+	vmw_hw_lock(dev_priv, true);
+	__vmw_write(dev_priv, offset, value);
+	vmw_hw_unlock(dev_priv, true);
+}
+
+/**
+ * vmw_read - Read a value from a device register
+ *
+ * @dev_priv: Pointer to the device private structure
+ * @offset: Device register offset
+ *
+ * This function must not be called with irqs disabled, since its
+ * call to vmw_hw_unlock() re-enables bottom halves.
+ * Returns: The value read from the register.
+ */
+static inline uint32_t vmw_read(struct vmw_private *dev_priv,
+				unsigned int offset)
+{
+	u32 val;
+
+#ifdef CONFIG_LOCKDEP
+	WARN_ON_ONCE(irqs_disabled());
+#endif
+	vmw_hw_lock(dev_priv, true);
+	val = __vmw_read(dev_priv, offset);
+	vmw_hw_unlock(dev_priv, true);
 
 	return val;
 }
@@ -722,8 +814,8 @@ extern void vmw_fifo_commit(struct vmw_private *dev_priv, uint32_t bytes);
 extern void vmw_fifo_commit_flush(struct vmw_private *dev_priv, uint32_t bytes);
 extern int vmw_fifo_send_fence(struct vmw_private *dev_priv,
 			       uint32_t *seqno);
-extern void vmw_fifo_ping_host_locked(struct vmw_private *, uint32_t reason);
-extern void vmw_fifo_ping_host(struct vmw_private *dev_priv, uint32_t reason);
+extern void vmw_fifo_ping_host(struct vmw_private *dev_priv, uint32_t reason,
+			       bool lock_bh);
 extern bool vmw_fifo_have_3d(struct vmw_private *dev_priv);
 extern bool vmw_fifo_have_pitchlock(struct vmw_private *dev_priv);
 extern int vmw_fifo_emit_dummy_query(struct vmw_private *dev_priv,
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_fence.c b/drivers/gpu/drm/vmwgfx/vmwgfx_fence.c
index f40c36e..ac66aad 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_fence.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_fence.c
@@ -149,7 +149,7 @@ static bool vmw_fence_enable_signaling(struct fence *f)
 	if (seqno - fence->base.seqno < VMW_FENCE_WRAP)
 		return false;
 
-	vmw_fifo_ping_host(dev_priv, SVGA_SYNC_GENERIC);
+	vmw_fifo_ping_host(dev_priv, SVGA_SYNC_GENERIC, false);
 
 	return true;
 }
@@ -183,7 +183,7 @@ static long vmw_fence_wait(struct fence *f, bool intr, signed long timeout)
 	if (likely(vmw_fence_obj_signaled(fence)))
 		return timeout;
 
-	vmw_fifo_ping_host(dev_priv, SVGA_SYNC_GENERIC);
+	vmw_fifo_ping_host(dev_priv, SVGA_SYNC_GENERIC, true);
 	vmw_seqno_waiter_add(dev_priv);
 
 	spin_lock_bh(f->lock);
@@ -525,7 +525,7 @@ void vmw_fence_obj_flush(struct vmw_fence_obj *fence)
 {
 	struct vmw_private *dev_priv = fman_from_fence(fence)->dev_priv;
 
-	vmw_fifo_ping_host(dev_priv, SVGA_SYNC_GENERIC);
+	vmw_fifo_ping_host(dev_priv, SVGA_SYNC_GENERIC, true);
 }
 
 static void vmw_fence_destroy(struct vmw_fence_obj *fence)
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c b/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c
index a8baf5f..a70cc88 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c
@@ -49,10 +49,10 @@ bool vmw_fifo_have_3d(struct vmw_private *dev_priv)
 		if (!dev_priv->has_mob)
 			return false;
 
-		spin_lock(&dev_priv->cap_lock);
-		vmw_write(dev_priv, SVGA_REG_DEV_CAP, SVGA3D_DEVCAP_3D);
-		result = vmw_read(dev_priv, SVGA_REG_DEV_CAP);
-		spin_unlock(&dev_priv->cap_lock);
+		vmw_hw_lock(dev_priv, true);
+		__vmw_write(dev_priv, SVGA_REG_DEV_CAP, SVGA3D_DEVCAP_3D);
+		result = __vmw_read(dev_priv, SVGA_REG_DEV_CAP);
+		vmw_hw_unlock(dev_priv, true);
 
 		return (result != 0);
 	}
@@ -163,14 +163,32 @@ int vmw_fifo_init(struct vmw_private *dev_priv, struct vmw_fifo_state *fifo)
 	return 0;
 }
 
-void vmw_fifo_ping_host(struct vmw_private *dev_priv, uint32_t reason)
+/**
+ * vmw_fifo_ping_host - Notify the device that there are device commands
+ * to process
+ *
+ * @dev_priv: Pointer to a device private struct.
+ * @reason: A valid reason to ping the host. See device headers.
+ * @lock_bh: Set to true unless irqs are disabled, in which case it
+ * must be set to false.
+ *
+ * Aka the device "doorbell". The device implements an optimization
+ * that lets us avoid the relatively expensive register write to
+ * SVGA_REG_SYNC if the SVGA_FIFO_BUSY mmio register is set to 1,
+ * meaning that the doorbell has already been rung. The device clears
+ * the SVGA_FIFO_BUSY mmio register when a new write to SVGA_REG_SYNC
+ * is needed to notify it of pending commands.
+ */
+void vmw_fifo_ping_host(struct vmw_private *dev_priv, uint32_t reason,
+			bool lock_bh)
 {
 	u32 *fifo_mem = dev_priv->mmio_virt;
 
-	preempt_disable();
-	if (cmpxchg(fifo_mem + SVGA_FIFO_BUSY, 0, 1) == 0)
-		vmw_write(dev_priv, SVGA_REG_SYNC, reason);
-	preempt_enable();
+	if (cmpxchg(fifo_mem + SVGA_FIFO_BUSY, 0, 1) == 0) {
+		vmw_hw_lock(dev_priv, lock_bh);
+		__vmw_write(dev_priv, SVGA_REG_SYNC, reason);
+		vmw_hw_unlock(dev_priv, lock_bh);
+	}
 }
 
 void vmw_fifo_release(struct vmw_private *dev_priv, struct vmw_fifo_state *fifo)
@@ -256,7 +274,7 @@ static int vmw_fifo_wait(struct vmw_private *dev_priv,
 	if (likely(!vmw_fifo_is_full(dev_priv, bytes)))
 		return 0;
 
-	vmw_fifo_ping_host(dev_priv, SVGA_SYNC_FIFOFULL);
+	vmw_fifo_ping_host(dev_priv, SVGA_SYNC_FIFOFULL, true);
 	if (!(dev_priv->capabilities & SVGA_CAP_IRQMASK))
 		return vmw_fifo_wait_noirq(dev_priv, bytes,
 					   interruptible, timeout);
@@ -490,7 +508,7 @@ static void vmw_local_fifo_commit(struct vmw_private *dev_priv, uint32_t bytes)
 		vmw_mmio_write(0, fifo_mem + SVGA_FIFO_RESERVED);
 	mb();
 	up_write(&fifo_state->rwsem);
-	vmw_fifo_ping_host(dev_priv, SVGA_SYNC_GENERIC);
+	vmw_fifo_ping_host(dev_priv, SVGA_SYNC_GENERIC, true);
 	mutex_unlock(&fifo_state->fifo_mutex);
 }
 
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_ioctl.c b/drivers/gpu/drm/vmwgfx/vmwgfx_ioctl.c
index b8c6a03..8ce0886 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_ioctl.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_ioctl.c
@@ -159,14 +159,14 @@ static int vmw_fill_compat_cap(struct vmw_private *dev_priv, void *bounce,
 		(pair_offset + max_size * sizeof(SVGA3dCapPair)) / sizeof(u32);
 	compat_cap->header.type = SVGA3DCAPS_RECORD_DEVCAPS;
 
-	spin_lock(&dev_priv->cap_lock);
+	vmw_hw_lock(dev_priv, true);
 	for (i = 0; i < max_size; ++i) {
-		vmw_write(dev_priv, SVGA_REG_DEV_CAP, i);
+		__vmw_write(dev_priv, SVGA_REG_DEV_CAP, i);
 		compat_cap->pairs[i][0] = i;
 		compat_cap->pairs[i][1] = vmw_mask_multisample
-			(i, vmw_read(dev_priv, SVGA_REG_DEV_CAP));
+			(i, __vmw_read(dev_priv, SVGA_REG_DEV_CAP));
 	}
-	spin_unlock(&dev_priv->cap_lock);
+	vmw_hw_unlock(dev_priv, true);
 
 	return 0;
 }
@@ -216,13 +216,13 @@ int vmw_get_cap_3d_ioctl(struct drm_device *dev, void *data,
 		if (num > SVGA3D_DEVCAP_MAX)
 			num = SVGA3D_DEVCAP_MAX;
 
-		spin_lock(&dev_priv->cap_lock);
+		vmw_hw_lock(dev_priv, true);
 		for (i = 0; i < num; ++i) {
-			vmw_write(dev_priv, SVGA_REG_DEV_CAP, i);
+			__vmw_write(dev_priv, SVGA_REG_DEV_CAP, i);
 			*bounce32++ = vmw_mask_multisample
-				(i, vmw_read(dev_priv, SVGA_REG_DEV_CAP));
+				(i, __vmw_read(dev_priv, SVGA_REG_DEV_CAP));
 		}
-		spin_unlock(&dev_priv->cap_lock);
+		vmw_hw_unlock(dev_priv, true);
 	} else if (gb_objects) {
 		ret = vmw_fill_compat_cap(dev_priv, bounce, size);
 		if (unlikely(ret != 0))
@@ -420,7 +420,7 @@ unsigned int vmw_fops_poll(struct file *filp, struct poll_table_struct *wait)
 	struct vmw_private *dev_priv =
 		vmw_priv(file_priv->minor->dev);
 
-	vmw_fifo_ping_host(dev_priv, SVGA_SYNC_GENERIC);
+	vmw_fifo_ping_host(dev_priv, SVGA_SYNC_GENERIC, true);
 	return drm_poll(filp, wait);
 }
 
@@ -443,6 +443,6 @@ ssize_t vmw_fops_read(struct file *filp, char __user *buffer,
 	struct vmw_private *dev_priv =
 		vmw_priv(file_priv->minor->dev);
 
-	vmw_fifo_ping_host(dev_priv, SVGA_SYNC_GENERIC);
+	vmw_fifo_ping_host(dev_priv, SVGA_SYNC_GENERIC, true);
 	return drm_read(filp, buffer, count, offset);
 }
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c b/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c
index 12aaed7..af6970b 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c
@@ -246,7 +246,7 @@ int vmw_wait_seqno(struct vmw_private *dev_priv,
 	if (likely(vmw_seqno_passed(dev_priv, seqno)))
 		return 0;
 
-	vmw_fifo_ping_host(dev_priv, SVGA_SYNC_GENERIC);
+	vmw_fifo_ping_host(dev_priv, SVGA_SYNC_GENERIC, true);
 
 	if (!(fifo->capabilities & SVGA_FIFO_CAP_FENCE))
 		return vmw_fallback_wait(dev_priv, lazy, true, seqno,
-- 
2.4.3


