[Intel-gfx] [PATCH 1/2] drm/i915: Cache some registers in execlists hot paths
Chris Wilson
chris at chris-wilson.co.uk
Thu Mar 24 11:53:16 UTC 2016
On Thu, Mar 24, 2016 at 11:31:34AM +0000, Tvrtko Ursulin wrote:
>
> On 24/03/16 11:16, Chris Wilson wrote:
> >On Wed, Mar 23, 2016 at 03:15:02PM +0000, Tvrtko Ursulin wrote:
> >>From: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> >>
> >>GCC cannot optimize well calculations hidden in macros and
> >>assigned to temporary structures. We can cache the register in
> >>ELSP write, and refactor reading of the CSB a bit to enable
> >>it to do a better job.
> >>
> >>Code is still equally readable but the generated body of the
> >>CSB read loop is 30% smaller, and since that loop runs at
> >>least once per interrupt, which in turn can fire in tens or
> >>hundreds thousands times per second, must be of some value.
> >>
> >>Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> >>---
> >> drivers/gpu/drm/i915/intel_lrc.c | 26 +++++++++++++-------------
> >> drivers/gpu/drm/i915/intel_lrc.h | 11 ++++++++---
> >> 2 files changed, 21 insertions(+), 16 deletions(-)
> >>
> >>diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> >>index 3a23b9549f7b..67592f8354d6 100644
> >>--- a/drivers/gpu/drm/i915/intel_lrc.c
> >>+++ b/drivers/gpu/drm/i915/intel_lrc.c
> >>@@ -361,8 +361,8 @@ static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
> >> {
> >>
> >> struct intel_engine_cs *engine = rq0->engine;
> >>- struct drm_device *dev = engine->dev;
> >>- struct drm_i915_private *dev_priv = dev->dev_private;
> >>+ struct drm_i915_private *dev_priv = rq0->i915;
> >>+ i915_reg_t elsp_reg = RING_ELSP(engine);
> >
> >If we are going to open-code it, how about going whole hog
> >and
> >
> >/* We opencode I915_WRITE_FW(RING_ELSP(engine)) here to save gcc the
> > * trouble of doing so - gcc fails miserably!
> > */
> >u32 *elsp = rq->i915->regs + i915_mmio_reg_offset(RING_ELSP(engine));
> >then use writel(upper_32_bits(desc[1]), elsp);
> >
> >Keeping the comment around for grepping.
>
> Yeah I had that but thought it will be considered to tasteless for
> posting. :)
>
> >> uint64_t desc[2];
> >>
> >> if (rq1) {
> >>@@ -376,12 +376,12 @@ static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
> >> rq0->elsp_submitted++;
> >>
> >> /* You must always write both descriptors in the order below. */
> >>- I915_WRITE_FW(RING_ELSP(engine), upper_32_bits(desc[1]));
> >>- I915_WRITE_FW(RING_ELSP(engine), lower_32_bits(desc[1]));
> >>+ I915_WRITE_FW(elsp_reg, upper_32_bits(desc[1]));
> >>+ I915_WRITE_FW(elsp_reg, lower_32_bits(desc[1]));
> >>
> >>- I915_WRITE_FW(RING_ELSP(engine), upper_32_bits(desc[0]));
> >>+ I915_WRITE_FW(elsp_reg, upper_32_bits(desc[0]));
> >> /* The context is automatically loaded after the following */
> >>- I915_WRITE_FW(RING_ELSP(engine), lower_32_bits(desc[0]));
> >>+ I915_WRITE_FW(elsp_reg, lower_32_bits(desc[0]));
> >>
> >> /* ELSP is a wo register, use another nearby reg for posting */
> >> POSTING_READ_FW(RING_EXECLIST_STATUS_LO(engine));
> >
> >Observing the above, we can also kill the POSTING_READ_FW() which will
> >make a much bigger improvement than all of the above.
>
> It is a different register, why it can be removed?
We use uncached mmio writes. The POSTING_READs are primarily for
documentation about where we need the write barriers to flush the WC
buffer (which we don't use nor are we ever going to because HW forbids
us). However they are most painful at times.
> >>@@ -517,21 +517,19 @@ execlists_check_remove_request(struct intel_engine_cs *engine, u32 request_id)
> >> }
> >>
> >> static u32
> >>-get_context_status(struct intel_engine_cs *engine, unsigned int read_pointer,
> >>- u32 *context_id)
> >>+get_context_status(struct drm_i915_private *dev_priv, u32 csb_base,
> >>+ unsigned int read_pointer, u32 *context_id)
> >> {
> >>- struct drm_i915_private *dev_priv = engine->dev->dev_private;
> >> u32 status;
> >>
> >> read_pointer %= GEN8_CSB_ENTRIES;
> >>
> >>- status = I915_READ_FW(RING_CONTEXT_STATUS_BUF_LO(engine, read_pointer));
> >>+ status = I915_READ_FW(RING_CSB_LO(csb_base, read_pointer));
> >
> >If we forgo the "convenience" interface here, could we not also improve
> >the readability of the code by just having the csb ringbuffer and readl?
>
> The same whole-hog open coding you mean like the above? It is
> slightly more efficient, not sure that is is more readable so maybe
> I am not getting exactly what you mean.
I think it will be. Even more so we say "we're x86 only and use u32 *
__iomem csb" ;)
quick draft
t a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 40ef4ea..b7d2ae9 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -516,26 +516,6 @@ execlists_check_remove_request(struct intel_engine_cs *engine, u32 request_id)
return 1;
}
-static u32
-get_context_status(struct intel_engine_cs *engine, unsigned int read_pointer,
- u32 *context_id)
-{
- struct drm_i915_private *dev_priv = engine->dev->dev_private;
- u32 status;
-
- read_pointer %= GEN8_CSB_ENTRIES;
-
- status = I915_READ_FW(RING_CONTEXT_STATUS_BUF_LO(engine, read_pointer));
-
- if (status & GEN8_CTX_STATUS_IDLE_ACTIVE)
- return 0;
-
- *context_id = I915_READ_FW(RING_CONTEXT_STATUS_BUF_HI(engine,
- read_pointer));
-
- return status;
-}
-
/**
* intel_lrc_irq_handler() - handle Context Switch interrupts
* @ring: Engine Command Streamer to handle.
@@ -548,6 +528,7 @@ void intel_lrc_irq_handler(struct intel_engine_cs *engine)
struct drm_i915_private *dev_priv = engine->dev->dev_private;
u32 status_pointer;
unsigned int read_pointer, write_pointer;
+ u32 *csb_mmio = dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_BUF_LO(engine, 0));
u32 csb[GEN8_CSB_ENTRIES][2];
unsigned int csb_read = 0, i;
unsigned int submit_contexts = 0;
@@ -563,10 +544,16 @@ void intel_lrc_irq_handler(struct intel_engine_cs *engine)
write_pointer += GEN8_CSB_ENTRIES;
while (read_pointer < write_pointer) {
+ unsigned idx;
+
if (WARN_ON_ONCE(csb_read == GEN8_CSB_ENTRIES))
break;
- csb[csb_read][0] = get_context_status(engine, ++read_pointer,
- &csb[csb_read][1]);
+
+ idx = read_pointer++ % GEN8_CSB_ENTRIES;
+ csb[csb_read][0] = readl(&csb_mmio[2*idx+0]);
+ if ((status & GEN8_CTX_STATUS_IDLE_ACTIVE) == 0)
+ csb[csb_read][1] = readl(&csb_mmio[2*idx+1]);
+
csb_read++;
}
-Chris
--
Chris Wilson, Intel Open Source Technology Centre
More information about the Intel-gfx
mailing list