[Intel-gfx] [PATCH v3] drm/i915: Optimistically spin for the request completion

Thu Mar 19 08:16:15 PDT 2015

On Thu, Mar 12, 2015 at 11:11:17AM +0000, Chris Wilson wrote:
> This provides a nice boost to mesa in swap bound scenarios (as mesa
> throttles itself to the previous frame and given the scenario that will
> complete shortly). It will also provide a good boost to systems running
> with semaphores disabled and so frequently waiting on the GPU as it
> switches rings. In the most favourable of microbenchmarks, this can
> increase performance by around 15% - though in practice improvements
> will be marginal and rarely noticeable.
> 
> v2: Account for user timeouts
> v3: Limit the spinning to a single jiffy (~1ms at HZ=1000) at most. On an
> otherwise idle system, there is no scheduler contention and so without a
> limit we would spin until the GPU is ready.
> 
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Daniel Vetter <daniel.vetter at ffwll.ch>

Just recording an idea for the future: replace the busy-spin with
monitor/mwait. This requires a Pentium 4 or later CPU, a cooperating GPU
with working cacheline snooping, and that we use the HWS seqno.

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 85e71e0e2340..454a38d4caa3 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -37,6 +37,7 @@
 #include <linux/swap.h>
 #include <linux/pci.h>
 #include <linux/dma-buf.h>
+#include <asm/mwait.h>
 
 #define RQ_BUG_ON(expr)
 
@@ -1187,18 +1188,42 @@ static int __i915_spin_request(struct drm_i915_gem_request *req)
        unsigned long timeout;
        int ret = -EBUSY;
 
+       if (ring->irq_refcount) /* IRQ is already active, keep using it */
+               return ret;
+
        intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
        timeout = jiffies + 1;
-       while (!need_resched()) {
-               if (i915_gem_request_completed(req, true)) {
-                       ret = 0;
-                       goto out;
-               }
+       if (this_cpu_has(X86_FEATURE_MWAIT)) {
+               do {
+                       unsigned long ecx = 1; /* break on interrupt */
+                       unsigned long eax = 0; /* cstate */
 
-               if (time_after_eq(jiffies, timeout))
-                       break;
+                       __monitor((void *)&ring->status_page.page_addr[I915_GEM_HWS_INDEX], 0, 0);
+                       if (need_resched())
+                               break;
+
+                       if (i915_gem_request_completed(req, true)) {
+                               ret = 0;
+                               goto out;
+                       }
 
-               cpu_relax_lowlatency();
+                       if (time_after_eq(jiffies, timeout))
+                               break;
+
+                       __mwait(eax, ecx);
+               } while (1);
+       } else {
+               while (!need_resched()) {
+                       if (i915_gem_request_completed(req, true)) {
+                               ret = 0;
+                               goto out;
+                       }
+
+                       if (time_after_eq(jiffies, timeout))
+                               break;
+
+                       cpu_relax_lowlatency();
+               }
        }
        if (i915_gem_request_completed(req, false))
                ret = 0;


-- 
Chris Wilson, Intel Open Source Technology Centre