[Intel-gfx] [PATCH v8 06/13] drm/i915/perf: implement active wait for noa configurations
Umesh Nerlige Ramappa
umesh.nerlige.ramappa at intel.com
Wed Jul 10 23:43:21 UTC 2019
On Tue, Jul 09, 2019 at 03:33:44PM +0300, Lionel Landwerlin wrote:
>NOA configurations take some amount of time to apply. That amount of
>time depends on the size of the GT. There is no documented time for
>this. For example, past experiments with powergating configuration
>changes seem to indicate a 60~70us delay. We go with 500us as the
>default for now, which should be more than the required amount of
>time (according to HW architects).
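Just to put a rough number on this: with, say, a 12MHz CS timestamp
(a gen9-ish value, purely for illustration), 500us works out to
500000 * 12000 / 1000000 = 6000 ticks, so the default sits comfortably
within the 32bit timestamp window the wait loop below relies on.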
>
>v2: Don't forget to save/restore registers used for the wait (Chris)
>
>v3: Name used CS_GPR registers (Chris)
> Fix compile issue due to rebase (Lionel)
>
>Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin at intel.com>
>Reviewed-by: Chris Wilson <chris at chris-wilson.co.uk>
>---
> drivers/gpu/drm/i915/gt/intel_gpu_commands.h | 24 ++
> drivers/gpu/drm/i915/gt/intel_gt_types.h | 5 +
> drivers/gpu/drm/i915/i915_debugfs.c | 31 +++
> drivers/gpu/drm/i915/i915_drv.h | 8 +
> drivers/gpu/drm/i915/i915_perf.c | 226 ++++++++++++++++++-
> drivers/gpu/drm/i915/i915_reg.h | 4 +-
> 6 files changed, 295 insertions(+), 3 deletions(-)
>
>diff --git a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
>index e7eff9db343e..4a66af38c87b 100644
>--- a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
>+++ b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
>@@ -151,6 +151,7 @@
> #define MI_BATCH_GTT (2<<6) /* aliased with (1<<7) on gen4 */
> #define MI_BATCH_BUFFER_START_GEN8 MI_INSTR(0x31, 1)
> #define MI_BATCH_RESOURCE_STREAMER (1<<10)
>+#define MI_BATCH_PREDICATE (1 << 15) /* HSW+ on RCS only */
>
> /*
> * 3D instructions used by the kernel
>@@ -226,6 +227,29 @@
> #define PIPE_CONTROL_DEPTH_CACHE_FLUSH (1<<0)
> #define PIPE_CONTROL_GLOBAL_GTT (1<<2) /* in addr dword */
>
>+#define MI_MATH(x) MI_INSTR(0x1a, (x)-1)
>+#define MI_ALU_OP(op, src1, src2) (((op) << 20) | ((src1) << 10) | (src2))
>+/* operands */
>+#define MI_ALU_OP_NOOP 0
>+#define MI_ALU_OP_LOAD 128
>+#define MI_ALU_OP_LOADINV 1152
>+#define MI_ALU_OP_LOAD0 129
>+#define MI_ALU_OP_LOAD1 1153
>+#define MI_ALU_OP_ADD 256
>+#define MI_ALU_OP_SUB 257
>+#define MI_ALU_OP_AND 258
>+#define MI_ALU_OP_OR 259
>+#define MI_ALU_OP_XOR 260
>+#define MI_ALU_OP_STORE 384
>+#define MI_ALU_OP_STOREINV 1408
>+/* sources */
>+#define MI_ALU_SRC_REG(x) (x) /* 0 -> 15 */
>+#define MI_ALU_SRC_SRCA 32
>+#define MI_ALU_SRC_SRCB 33
>+#define MI_ALU_SRC_ACCU 49
>+#define MI_ALU_SRC_ZF 50
>+#define MI_ALU_SRC_CF 51
>+
> /*
> * Commands used only by the command parser
> */
>diff --git a/drivers/gpu/drm/i915/gt/intel_gt_types.h b/drivers/gpu/drm/i915/gt/intel_gt_types.h
>index 3563ce970102..a3141b79d344 100644
>--- a/drivers/gpu/drm/i915/gt/intel_gt_types.h
>+++ b/drivers/gpu/drm/i915/gt/intel_gt_types.h
>@@ -73,6 +73,11 @@ enum intel_gt_scratch_field {
> /* 8 bytes */
> INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA = 256,
>
>+ /* 6 * 8 bytes */
>+ INTEL_GT_SCRATCH_FIELD_PERF_CS_GPR = 2048,
>+
>+ /* 4 bytes */
>+ INTEL_GT_SCRATCH_FIELD_PERF_PREDICATE_RESULT_1 = 2096,
> };
>
> #endif /* __INTEL_GT_TYPES_H__ */
>diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
>index 3e4f58f19362..46fca53dfbda 100644
>--- a/drivers/gpu/drm/i915/i915_debugfs.c
>+++ b/drivers/gpu/drm/i915/i915_debugfs.c
>@@ -3653,6 +3653,36 @@ DEFINE_SIMPLE_ATTRIBUTE(i915_wedged_fops,
> i915_wedged_get, i915_wedged_set,
> "%llu\n");
>
>+static int
>+i915_perf_noa_delay_set(void *data, u64 val)
>+{
>+ struct drm_i915_private *i915 = data;
>+
>+ /* This would lead to infinite waits as we're computing the
>+ * timestamp difference on the CS with only 32 bits.
>+ */
>+ if (val > ((1ul << 32) - 1) * RUNTIME_INFO(i915)->cs_timestamp_frequency_khz)
>+ return -EINVAL;
>+
>+ atomic64_set(&i915->perf.oa.noa_programming_delay, val);
>+ return 0;
>+}
>+
>+static int
>+i915_perf_noa_delay_get(void *data, u64 *val)
>+{
>+ struct drm_i915_private *i915 = data;
>+
>+ *val = atomic64_read(&i915->perf.oa.noa_programming_delay);
>+ return 0;
>+}
>+
>+DEFINE_SIMPLE_ATTRIBUTE(i915_perf_noa_delay_fops,
>+ i915_perf_noa_delay_get,
>+ i915_perf_noa_delay_set,
>+ "%llu\n");
>+
>+
> #define DROP_UNBOUND BIT(0)
> #define DROP_BOUND BIT(1)
> #define DROP_RETIRE BIT(2)
>@@ -4418,6 +4448,7 @@ static const struct i915_debugfs_files {
> const char *name;
> const struct file_operations *fops;
> } i915_debugfs_files[] = {
>+ {"i915_perf_noa_delay", &i915_perf_noa_delay_fops},
> {"i915_wedged", &i915_wedged_fops},
> {"i915_cache_sharing", &i915_cache_sharing_fops},
> {"i915_gem_drop_caches", &i915_drop_caches_fops},
>diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
>index 0419dfd0dea3..b3c6dd72c7a1 100644
>--- a/drivers/gpu/drm/i915/i915_drv.h
>+++ b/drivers/gpu/drm/i915/i915_drv.h
>@@ -1834,6 +1834,14 @@ struct drm_i915_private {
>
> struct i915_oa_ops ops;
> const struct i915_oa_format *oa_formats;
>+
>+ /**
>+ * A batch buffer doing a wait on the GPU for the NOA
>+ * logic to be reprogrammed.
>+ */
>+ struct i915_vma *noa_wait;
>+
>+ atomic64_t noa_programming_delay;
> } oa;
> } perf;
>
>diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
>index 882d7056aec3..abfa437a95b7 100644
>--- a/drivers/gpu/drm/i915/i915_perf.c
>+++ b/drivers/gpu/drm/i915/i915_perf.c
>@@ -197,6 +197,7 @@
>
> #include "gem/i915_gem_context.h"
> #include "gem/i915_gem_pm.h"
>+#include "gt/intel_gt.h"
> #include "gt/intel_lrc_reg.h"
>
> #include "i915_drv.h"
>@@ -429,7 +430,7 @@ static int alloc_oa_config_buffer(struct drm_i915_private *i915,
> MI_LOAD_REGISTER_IMM_MAX_REGS) * 4;
> config_length += oa_config->flex_regs_len * 8;
> }
>- config_length += 4; /* MI_BATCH_BUFFER_END */
>+ config_length += 12; /* MI_BATCH_BUFFER_START into noa_wait loop */
> config_length = ALIGN(config_length, I915_GTT_PAGE_SIZE);
>
> bo = i915_gem_object_create_shmem(i915, config_length);
>@@ -446,7 +447,12 @@ static int alloc_oa_config_buffer(struct drm_i915_private *i915,
> cs = write_cs_mi_lri(cs, oa_config->b_counter_regs, oa_config->b_counter_regs_len);
> cs = write_cs_mi_lri(cs, oa_config->flex_regs, oa_config->flex_regs_len);
>
>- *cs++ = MI_BATCH_BUFFER_END;
>+
>+ /* Jump into the NOA wait busy loop. */
>+ *cs++ = (INTEL_GEN(i915) < 8 ?
>+ MI_BATCH_BUFFER_START : MI_BATCH_BUFFER_START_GEN8);
>+ *cs++ = i915_ggtt_offset(i915->perf.oa.noa_wait);
>+ *cs++ = 0;
>
> i915_gem_object_flush_map(bo);
> i915_gem_object_unpin_map(bo);
>@@ -1467,6 +1473,7 @@ static void i915_oa_stream_destroy(struct i915_perf_stream *stream)
> mutex_lock(&dev_priv->drm.struct_mutex);
> dev_priv->perf.oa.exclusive_stream = NULL;
> dev_priv->perf.oa.ops.disable_metric_set(dev_priv);
>+ i915_vma_unpin_and_release(&dev_priv->perf.oa.noa_wait, 0);
> mutex_unlock(&dev_priv->drm.struct_mutex);
>
> free_oa_buffer(dev_priv);
>@@ -1653,6 +1660,205 @@ static int alloc_oa_buffer(struct drm_i915_private *dev_priv)
> return ret;
> }
>
>+static u32 *save_register(struct drm_i915_private *i915, u32 *cs,
>+ i915_reg_t reg, u32 offset, u32 dword_count)
>+{
>+ uint32_t d;
>+
>+ for (d = 0; d < dword_count; d++) {
>+ *cs++ = INTEL_GEN(i915) >= 8 ?
>+ MI_STORE_REGISTER_MEM_GEN8 : MI_STORE_REGISTER_MEM;
>+ *cs++ = i915_mmio_reg_offset(reg) + 4 * d;
>+ *cs++ = intel_gt_scratch_offset(&i915->gt, offset) + 4 * d;
>+ *cs++ = 0;
>+ }
>+
>+ return cs;
>+}
>+
>+static u32 *restore_register(struct drm_i915_private *i915, u32 *cs,
>+ i915_reg_t reg, u32 offset, u32 dword_count)
>+{
>+ uint32_t d;
>+
>+ for (d = 0; d < dword_count; d++) {
>+ *cs++ = INTEL_GEN(i915) >= 8 ?
>+ MI_LOAD_REGISTER_MEM_GEN8 : MI_LOAD_REGISTER_MEM;
>+ *cs++ = i915_mmio_reg_offset(reg);
>+ *cs++ = intel_gt_scratch_offset(&i915->gt, offset);
are you missing + 4 * d in the above 2 lines?
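Something along these lines is what I would expect, just mirroring
save_register() above (untested):

	for (d = 0; d < dword_count; d++) {
		*cs++ = INTEL_GEN(i915) >= 8 ?
			MI_LOAD_REGISTER_MEM_GEN8 : MI_LOAD_REGISTER_MEM;
		*cs++ = i915_mmio_reg_offset(reg) + 4 * d;
		*cs++ = intel_gt_scratch_offset(&i915->gt, offset) + 4 * d;
		*cs++ = 0;
	}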
Regards,
Umesh
>+ *cs++ = 0;
>+ }
>+
>+ return cs;
>+}
>+
>+static int alloc_noa_wait(struct drm_i915_private *i915)
>+{
>+ struct drm_i915_gem_object *bo;
>+ struct i915_vma *vma;
>+ const u64 delay_ticks = 0xffffffffffffffff -
>+ DIV64_U64_ROUND_UP(
>+ atomic64_read(&i915->perf.oa.noa_programming_delay) *
>+ RUNTIME_INFO(i915)->cs_timestamp_frequency_khz,
>+ 1000000ull);
>+ u32 *batch, *ts0, *cs, *jump;
>+ int ret, i;
>+ enum { START_TS, NOW_TS, DELTA_TS, JUMP_PREDICATE, DELTA_TARGET, N_CS_GPR };
>+
>+ bo = i915_gem_object_create_internal(i915, 4096);
>+ if (IS_ERR(bo)) {
>+ DRM_ERROR("Failed to allocate NOA wait batchbuffer\n");
>+ return PTR_ERR(bo);
>+ }
>+
>+ /*
>+ * We pin in GGTT because we jump into this buffer from multiple OA
>+ * config BOs, so its address needs to stay fixed for the lifetime
>+ * of the i915/perf stream.
>+ */
>+ vma = i915_gem_object_ggtt_pin(bo, NULL, 0, 4096, 0);
>+ if (IS_ERR(vma)) {
>+ ret = PTR_ERR(vma);
>+ goto err_unref;
>+ }
>+
>+ batch = cs = i915_gem_object_pin_map(bo, I915_MAP_WB);
>+ if (IS_ERR(batch)) {
>+ ret = PTR_ERR(batch);
>+ goto err_unpin;
>+ }
>+
>+ /* Save registers. */
>+ for (i = 0; i < N_CS_GPR; i++) {
>+ cs = save_register(i915, cs, HSW_CS_GPR(i),
>+ INTEL_GT_SCRATCH_FIELD_PERF_CS_GPR + 8 * i, 2);
>+ }
>+ cs = save_register(i915, cs, MI_PREDICATE_RESULT_1,
>+ INTEL_GT_SCRATCH_FIELD_PERF_PREDICATE_RESULT_1, 1);
>+
>+ /* First timestamp snapshot location. */
>+ ts0 = cs;
>+
>+ /*
>+ * Initial snapshot of the timestamp register to implement the wait.
>+ * We work with 32bit values, so clear out the top 32 bits of the
>+ * register because the ALU operates on 64bit values.
>+ */
>+ *cs++ = MI_LOAD_REGISTER_IMM(1);
>+ *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(START_TS)) + 4;
>+ *cs++ = 0;
>+ *cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
>+ *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP(RENDER_RING_BASE));
>+ *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(START_TS));
>+
>+ /*
>+ * This is the location we're going to jump back into until the
>+ * required amount of time has passed.
>+ */
>+ jump = cs;
>+
>+ /*
>+ * Take another snapshot of the timestamp register. Take care to clear
>+ * out the top 32 bits of CS_GPR(NOW_TS) as we're using it for other
>+ * operations below.
>+ */
>+ *cs++ = MI_LOAD_REGISTER_IMM(1);
>+ *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(NOW_TS)) + 4;
>+ *cs++ = 0;
>+ *cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
>+ *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP(RENDER_RING_BASE));
>+ *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(NOW_TS));
>+
>+ /*
>+ * Do a diff between the 2 timestamps and store the result back into
>+ * CS_GPR(1).
>+ */
>+ *cs++ = MI_MATH(5);
>+ *cs++ = MI_ALU_OP(MI_ALU_OP_LOAD, MI_ALU_SRC_SRCA, MI_ALU_SRC_REG(NOW_TS));
>+ *cs++ = MI_ALU_OP(MI_ALU_OP_LOAD, MI_ALU_SRC_SRCB, MI_ALU_SRC_REG(START_TS));
>+ *cs++ = MI_ALU_OP(MI_ALU_OP_SUB, 0, 0);
>+ *cs++ = MI_ALU_OP(MI_ALU_OP_STORE, MI_ALU_SRC_REG(DELTA_TS), MI_ALU_SRC_ACCU);
>+ *cs++ = MI_ALU_OP(MI_ALU_OP_STORE, MI_ALU_SRC_REG(JUMP_PREDICATE), MI_ALU_SRC_CF);
>+
>+ /*
>+ * Transfer the carry flag (set to 1 if ts1 < ts0, meaning the
>+ * timestamp has rolled over the 32 bits) into the predicate register
>+ * to be used for the predicated jump.
>+ */
>+ *cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
>+ *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(JUMP_PREDICATE));
>+ *cs++ = i915_mmio_reg_offset(MI_PREDICATE_RESULT_1);
>+
>+ /* Restart from the beginning if we had timestamps roll over. */
>+ *cs++ = (INTEL_GEN(i915) < 8 ?
>+ MI_BATCH_BUFFER_START : MI_BATCH_BUFFER_START_GEN8) |
>+ MI_BATCH_PREDICATE;
>+ *cs++ = i915_ggtt_offset(vma) + (ts0 - batch) * 4;
>+ *cs++ = 0;
>+
>+ /*
>+ * Now add the difference between the two timestamps to:
>+ * ((1 << 64) - 1) - expected delay (in timestamp ticks)
>+ *
>+ * When the Carry Flag contains 1 this means the elapsed time is
>+ * longer than the expected delay, and we can exit the wait loop.
>+ */
>+ *cs++ = MI_LOAD_REGISTER_IMM(2);
>+ *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(DELTA_TARGET));
>+ *cs++ = lower_32_bits(delay_ticks);
>+ *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(DELTA_TARGET)) + 4;
>+ *cs++ = upper_32_bits(delay_ticks);
>+
>+ *cs++ = MI_MATH(4);
>+ *cs++ = MI_ALU_OP(MI_ALU_OP_LOAD, MI_ALU_SRC_SRCA, MI_ALU_SRC_REG(DELTA_TS));
>+ *cs++ = MI_ALU_OP(MI_ALU_OP_LOAD, MI_ALU_SRC_SRCB, MI_ALU_SRC_REG(DELTA_TARGET));
>+ *cs++ = MI_ALU_OP(MI_ALU_OP_ADD, 0, 0);
>+ *cs++ = MI_ALU_OP(MI_ALU_OP_STOREINV, MI_ALU_SRC_REG(JUMP_PREDICATE), MI_ALU_SRC_CF);
>+
>+ /*
>+ * Transfer the result into the predicate register to be used for the
>+ * predicated jump.
>+ */
>+ *cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
>+ *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(JUMP_PREDICATE));
>+ *cs++ = i915_mmio_reg_offset(MI_PREDICATE_RESULT_1);
>+
>+ /* Predicate the jump. */
>+ *cs++ = (INTEL_GEN(i915) < 8 ?
>+ MI_BATCH_BUFFER_START : MI_BATCH_BUFFER_START_GEN8) |
>+ MI_BATCH_PREDICATE;
>+ *cs++ = i915_ggtt_offset(vma) + (jump - batch) * 4;
>+ *cs++ = 0;
>+
>+ /* Restore registers. */
>+ for (i = 0; i < N_CS_GPR; i++) {
>+ cs = restore_register(i915, cs, HSW_CS_GPR(i),
>+ INTEL_GT_SCRATCH_FIELD_PERF_CS_GPR + 8 * i, 2);
>+ }
>+ cs = restore_register(i915, cs, MI_PREDICATE_RESULT_1,
>+ INTEL_GT_SCRATCH_FIELD_PERF_PREDICATE_RESULT_1, 1);
>+
>+ /* And return to the ring. */
>+ *cs++ = MI_BATCH_BUFFER_END;
>+
>+ GEM_BUG_ON((cs - batch) > (PAGE_SIZE / sizeof(*batch)));
>+
>+ i915_gem_object_flush_map(bo);
>+ i915_gem_object_unpin_map(bo);
>+
>+ i915->perf.oa.noa_wait = vma;
>+
>+ return 0;
>+
>+err_unpin:
>+ __i915_vma_unpin(vma);
>+
>+err_unref:
>+ i915_gem_object_put(bo);
>+
>+ return ret;
>+}
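FWIW, for anyone else trying to follow the MI_MATH gymnastics, my
reading of the batch is that it is roughly equivalent to the following
CPU-side model (names are mine, not from the patch):

	/*
	 * read_ts() stands in for RING_TIMESTAMP, truncated to 32 bits the
	 * same way the batch clears the top half of the CS_GPRs with LRIs.
	 */
	static void noa_wait_model(u32 (*read_ts)(void), u64 expected_ticks)
	{
		u32 start, now, delta;

		for (;;) {
			start = read_ts();		/* CS_GPR(START_TS) */
			for (;;) {
				now = read_ts();	/* CS_GPR(NOW_TS) */
				if (now < start)	/* SUB carry: timestamp rolled over */
					break;		/* predicated jump back to ts0 */
				delta = now - start;	/* CS_GPR(DELTA_TS) */
				/*
				 * The batch adds delta to ((1 << 64) - 1) - expected_ticks,
				 * so the carry flag only appears once delta > expected_ticks.
				 */
				if (delta > expected_ticks)
					return;		/* fall through to MI_BATCH_BUFFER_END */
				/* otherwise: predicated jump back to the `jump` label */
			}
		}
	}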
>+
> static void config_oa_regs(struct drm_i915_private *dev_priv,
> const struct i915_oa_reg *regs,
> u32 n_regs)
>@@ -2221,6 +2427,12 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
> goto err_config;
> }
>
>+ ret = alloc_noa_wait(dev_priv);
>+ if (ret) {
>+ DRM_DEBUG("Unable to allocate NOA wait batch buffer\n");
>+ goto err_noa_wait_alloc;
>+ }
>+
> /* PRM - observability performance counters:
> *
> * OACONTROL, performance counter enable, note:
>@@ -2273,6 +2485,13 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
> intel_uncore_forcewake_put(&dev_priv->uncore, FORCEWAKE_ALL);
> intel_runtime_pm_put(&dev_priv->runtime_pm, stream->wakeref);
>
>+ mutex_lock(&dev_priv->drm.struct_mutex);
>+ i915_vma_unpin_and_release(&dev_priv->perf.oa.noa_wait, 0);
>+ mutex_unlock(&dev_priv->drm.struct_mutex);
>+
>+err_noa_wait_alloc:
>+ put_oa_config(stream->oa_config);
>+
> err_config:
> if (stream->ctx)
> oa_put_render_ctx_id(stream);
>@@ -3657,6 +3876,9 @@ void i915_perf_init(struct drm_i915_private *dev_priv)
> mutex_init(&dev_priv->perf.metrics_lock);
> idr_init(&dev_priv->perf.metrics_idr);
>
>+ atomic64_set(&dev_priv->perf.oa.noa_programming_delay,
>+ 500 * 1000 /* 500us */);
>+
> dev_priv->perf.initialized = true;
> }
> }
>diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
>index 5898f59e3dd7..a73464dd5e91 100644
>--- a/drivers/gpu/drm/i915/i915_reg.h
>+++ b/drivers/gpu/drm/i915/i915_reg.h
>@@ -567,7 +567,9 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
> #define MI_PREDICATE_SRC0_UDW _MMIO(0x2400 + 4)
> #define MI_PREDICATE_SRC1 _MMIO(0x2408)
> #define MI_PREDICATE_SRC1_UDW _MMIO(0x2408 + 4)
>-
>+#define MI_PREDICATE_DATA _MMIO(0x2410)
>+#define MI_PREDICATE_RESULT _MMIO(0x2418)
>+#define MI_PREDICATE_RESULT_1 _MMIO(0x241c)
> #define MI_PREDICATE_RESULT_2 _MMIO(0x2214)
> #define LOWER_SLICE_ENABLED (1 << 0)
> #define LOWER_SLICE_DISABLED (0 << 0)
>--
>2.22.0
>