[Intel-gfx] [PATCH v2 3/5] drm/i915: context submission pvmmio optimization
Zhang, Xiaolin
xiaolin.zhang at intel.com
Wed Oct 31 09:18:50 UTC 2018
Ping for review, thanks very much.
BRs, Xiaolin
-----Original Message-----
From: Zhang, Xiaolin
Sent: Friday, October 19, 2018 3:27 PM
To: intel-gfx at lists.freedesktop.org
Cc: intel-gvt-dev at lists.freedesktop.org; Zhang, Xiaolin <xiaolin.zhang at intel.com>; Zhenyu Wang <zhenyuw at linux.intel.com>; Wang, Zhi A <zhi.a.wang at intel.com>; Chris Wilson <chris at chris-wilson.co.uk>; Joonas Lahtinen <joonas.lahtinen at linux.intel.com>; He, Min <min.he at intel.com>; Jiang, Fei <fei.jiang at intel.com>; Gong, Zhipeng <zhipeng.gong at intel.com>; Yuan, Hang <hang.yuan at intel.com>; Lv, Zhiyuan <zhiyuan.lv at intel.com>
Subject: [PATCH v2 3/5] drm/i915: context submission pvmmio optimization
This is a performance optimization to reduce the number of MMIO traps from 4 to
1 during the ELSP port writes (context submission).
On context submission, the elsp_data[4] values are cached in the shared page, and only the last port write (elsp_data[0]) is trapped to GVT, which then performs the real context submission, as sketched below.
Use PVMMIO_ELSP_SUBMIT to control this level of pvmmio optimization.
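For illustration, a simplified sketch of the guest-side flow (it mirrors execlists_submit_ports_pv() in the patch below and is not the exact code; dev_priv stands in for engine->i915 here):

	/*
	 * Cache the first three ELSP dwords in the PV shared page, then
	 * issue a single MMIO write for the final dword; only that write
	 * traps to GVT, which performs the real 4-dword ELSP submission
	 * on behalf of the guest.
	 */
	spin_lock(&dev_priv->vgpu.shared_page_lock);
	elsp_data = dev_priv->vgpu.shared_page->elsp_data;
	elsp_data[0] = descs[0];	/* cached in shared page, no trap */
	elsp_data[1] = descs[1];	/* cached in shared page, no trap */
	elsp_data[2] = descs[2];	/* cached in shared page, no trap */
	writel(descs[3], elsp);		/* the one trapped MMIO write */
	spin_unlock(&dev_priv->vgpu.shared_page_lock);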
v0: RFC
v1: rebase
v2: added pv ops for pv context submission. To maximize code reuse, introduced 2 more ops (submit_ports & preempt_context) in the engine structure instead of 1 op
(set_default_submission), and implemented pv versions of submit_ports and preempt_context (see the sketch after this changelog).
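As a usage sketch (condensed from the hunks below, not additional code): the pv paths are selected once at setup time and callers dispatch through the new ops:

	/* in intel_execlists_set_default_submission(): legacy by default, pv when enabled */
	engine->preempt_context = inject_preempt_context;
	engine->submit_ports = execlists_submit_ports;
	if (PVMMIO_LEVEL_ENABLE(engine->i915, PVMMIO_ELSP_SUBMIT)) {
		engine->preempt_context = inject_preempt_context_pv;
		engine->submit_ports = execlists_submit_ports_pv;
	}

	/* execlists_dequeue() then calls the ops instead of the functions directly */
	engine->preempt_context(engine);	/* was inject_preempt_context(engine) */
	engine->submit_ports(engine);		/* was execlists_submit_ports(engine) */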
Cc: Zhenyu Wang <zhenyuw at linux.intel.com>
Cc: Zhi Wang <zhi.a.wang at intel.com>
Cc: Chris Wilson <chris at chris-wilson.co.uk>
Cc: Joonas Lahtinen <joonas.lahtinen at linux.intel.com>
Cc: He, Min <min.he at intel.com>
Cc: Jiang, Fei <fei.jiang at intel.com>
Cc: Gong, Zhipeng <zhipeng.gong at intel.com>
Cc: Yuan, Hang <hang.yuan at intel.com>
Cc: Zhiyuan Lv <zhiyuan.lv at intel.com>
Signed-off-by: Xiaolin Zhang <xiaolin.zhang at intel.com>
---
drivers/gpu/drm/i915/i915_vgpu.c | 2 +
drivers/gpu/drm/i915/intel_lrc.c | 88 +++++++++++++++++++++++++++++++--
drivers/gpu/drm/i915/intel_ringbuffer.h | 3 ++
3 files changed, 90 insertions(+), 3 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_vgpu.c b/drivers/gpu/drm/i915/i915_vgpu.c
index cb409d5..9870ea6 100644
--- a/drivers/gpu/drm/i915/i915_vgpu.c
+++ b/drivers/gpu/drm/i915/i915_vgpu.c
@@ -66,6 +66,8 @@ void i915_check_vgpu(struct drm_i915_private *dev_priv)
BUILD_BUG_ON(sizeof(struct vgt_if) != VGT_PVINFO_SIZE);
+ dev_priv->vgpu.pv_caps = PVMMIO_ELSP_SUBMIT;
+
magic = __raw_i915_read64(dev_priv, vgtif_reg(magic));
if (magic != VGT_MAGIC)
return;
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 22b57b8..9e6ccf9 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -460,6 +460,60 @@ static void execlists_submit_ports(struct intel_engine_cs *engine)
execlists_clear_active(execlists, EXECLISTS_ACTIVE_HWACK);
}
+static void execlists_submit_ports_pv(struct intel_engine_cs *engine)
+{
+ struct intel_engine_execlists *execlists = &engine->execlists;
+ struct execlist_port *port = execlists->port;
+ u32 __iomem *elsp =
+ engine->i915->regs + i915_mmio_reg_offset(RING_ELSP(engine));
+ u32 *elsp_data;
+ unsigned int n;
+ u32 descs[4];
+ int i = 0;
+
+ /*
+ * ELSQ note: the submit queue is not cleared after being submitted
+ * to the HW so we need to make sure we always clean it up. This is
+ * currently ensured by the fact that we always write the same number
+ * of elsq entries, keep this in mind before changing the loop below.
+ */
+ for (n = execlists_num_ports(execlists); n--; ) {
+ struct i915_request *rq;
+ unsigned int count;
+ u64 desc;
+
+ rq = port_unpack(&port[n], &count);
+ if (rq) {
+ GEM_BUG_ON(count > !n);
+ if (!count++)
+ execlists_context_schedule_in(rq);
+ port_set(&port[n], port_pack(rq, count));
+ desc = execlists_update_context(rq);
+ } else {
+ GEM_BUG_ON(!n);
+ desc = 0;
+ }
+ GEM_BUG_ON(i >= 4);
+ descs[i] = upper_32_bits(desc);
+ descs[i + 1] = lower_32_bits(desc);
+ i += 2;
+ }
+
+ spin_lock(&engine->i915->vgpu.shared_page_lock);
+ elsp_data = engine->i915->vgpu.shared_page->elsp_data;
+ *elsp_data = descs[0];
+ *(elsp_data + 1) = descs[1];
+ *(elsp_data + 2) = descs[2];
+ writel(descs[3], elsp);
+ spin_unlock(&engine->i915->vgpu.shared_page_lock);
+
+ /* we need to manually load the submit queue */
+ if (execlists->ctrl_reg)
+ writel(EL_CTRL_LOAD, execlists->ctrl_reg);
+
+ execlists_clear_active(execlists, EXECLISTS_ACTIVE_HWACK);
+}
+
static bool ctx_single_port_submission(const struct intel_context *ce)
{
return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
@@ -497,7 +551,6 @@ static void inject_preempt_context(struct intel_engine_cs *engine)
GEM_BUG_ON(execlists->preempt_complete_status !=
upper_32_bits(ce->lrc_desc));
-
/*
* Switch to our empty preempt context so
* the state of the GPU is known (idle).
@@ -516,6 +569,27 @@ static void inject_preempt_context(struct intel_engine_cs *engine)
execlists_set_active(execlists, EXECLISTS_ACTIVE_PREEMPT);
}
+static void inject_preempt_context_pv(struct intel_engine_cs *engine)
+{
+ struct intel_engine_execlists *execlists = &engine->execlists;
+ struct intel_context *ce =
+ to_intel_context(engine->i915->preempt_context, engine);
+ u32 __iomem *elsp =
+ engine->i915->regs + i915_mmio_reg_offset(RING_ELSP(engine));
+ u32 *elsp_data;
+
+ GEM_BUG_ON(execlists->preempt_complete_status !=
+ upper_32_bits(ce->lrc_desc));
+
+ spin_lock(&engine->i915->vgpu.shared_page_lock);
+ elsp_data = engine->i915->vgpu.shared_page->elsp_data;
+ *elsp_data = 0;
+ *(elsp_data + 1) = 0;
+ *(elsp_data + 2) = upper_32_bits(ce->lrc_desc);
+ writel(lower_32_bits(ce->lrc_desc), elsp);
+ spin_unlock(&engine->i915->vgpu.shared_page_lock);
+}
+
static void complete_preempt_context(struct intel_engine_execlists *execlists)
{
GEM_BUG_ON(!execlists_is_active(execlists, EXECLISTS_ACTIVE_PREEMPT));
@@ -583,7 +657,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
return;
if (need_preempt(engine, last, execlists->queue_priority)) {
- inject_preempt_context(engine);
+ engine->preempt_context(engine);
return;
}
@@ -705,7 +779,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
if (submit) {
port_assign(port, last);
- execlists_submit_ports(engine);
+ engine->submit_ports(engine);
}
/* We must always keep the beast fed if we have work piled up */
@@ -2134,6 +2208,14 @@ void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
engine->reset.prepare = execlists_reset_prepare;
+ engine->preempt_context = inject_preempt_context;
+ engine->submit_ports = execlists_submit_ports;
+
+ if (PVMMIO_LEVEL_ENABLE(engine->i915, PVMMIO_ELSP_SUBMIT)) {
+ engine->preempt_context = inject_preempt_context_pv;
+ engine->submit_ports = execlists_submit_ports_pv;
+ }
+
engine->park = NULL;
engine->unpark = NULL;
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index f6ec48a..e9895bf 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -523,6 +523,9 @@ struct intel_engine_cs {
void (*irq_seqno_barrier)(struct intel_engine_cs *engine);
void (*cleanup)(struct intel_engine_cs *engine);
+ void (*preempt_context)(struct intel_engine_cs *engine);
+ void (*submit_ports)(struct intel_engine_cs *engine);
+
/* GEN8 signal/wait table - never trust comments!
* signal to signal to signal to signal to signal to
* RCS VCS BCS VECS VCS2
--
2.7.4