[Intel-gfx] [PATCH 1/1] drm/i915/perf: Map OA buffer to user space
Umesh Nerlige Ramappa
umesh.nerlige.ramappa at intel.com
Tue Jul 14 07:44:53 UTC 2020
On Tue, Jul 14, 2020 at 12:22:39AM -0700, Umesh Nerlige Ramappa wrote:
>From: Piotr Maciejewski <piotr.maciejewski at intel.com>
>
>i915 used to support time based sampling mode which is good for overall
>system monitoring, but is not enough for query mode used to measure a
>single draw call or dispatch. Gen9-Gen11 are using current i915 perf
>implementation for query, but Gen12+ requires a new approach based on
>triggered reports within oa buffer. In order to enable above feature
>two changes are required:
>
>1. Whitelist update:
>- enable triggered reports within oa buffer
>- reading oa buffer head/tail/status information
>- reading gpu ticks counter.
>
>2. Map oa buffer at umd driver level to solve below constraints related
> to time based sampling interface:
>- longer time to access reports collected by oa buffer
>- slow oa reports browsing since oa buffer size is large
>- missing oa report index, so query cannot browse report directly
>- with direct access to oa buffer, query can extract other useful
> reports like context switch information needed to calculate correct
> performance counters values.
>
>Signed-off-by: Piotr Maciejewski <piotr.maciejewski at intel.com>
>---
> drivers/gpu/drm/i915/gt/intel_workarounds.c | 54 ++++++++
> drivers/gpu/drm/i915/i915_perf.c | 130 +++++++++++++++++++-
> drivers/gpu/drm/i915/i915_perf_types.h | 13 ++
> drivers/gpu/drm/i915/i915_reg.h | 14 +++
> include/uapi/drm/i915_drm.h | 19 +++
> 5 files changed, 227 insertions(+), 3 deletions(-)
>
>diff --git a/drivers/gpu/drm/i915/gt/intel_workarounds.c b/drivers/gpu/drm/i915/gt/intel_workarounds.c
>index 5726cd0a37e0..cf89928fc3a5 100644
>--- a/drivers/gpu/drm/i915/gt/intel_workarounds.c
>+++ b/drivers/gpu/drm/i915/gt/intel_workarounds.c
>@@ -1365,6 +1365,48 @@ whitelist_reg(struct i915_wa_list *wal, i915_reg_t reg)
> whitelist_reg_ext(wal, reg, RING_FORCE_TO_NONPRIV_ACCESS_RW);
> }
>
>+static void gen9_whitelist_build_performance_counters(struct i915_wa_list *w)
>+{
>+ /* OA buffer trigger report 2/6 used by performance query */
>+ whitelist_reg(w, OAREPORTTRIG2);
>+ whitelist_reg(w, OAREPORTTRIG6);
>+
>+ /* Performance counters A18-20 used by tbs marker query */
>+ whitelist_reg_ext(w, OA_PERF_COUNTER_A18,
>+ RING_FORCE_TO_NONPRIV_ACCESS_RW |
>+ RING_FORCE_TO_NONPRIV_RANGE_16);
The whitelist entry above should be split into the following, so that only the needed registers are exposed:
whitelist_reg_ext(w, OA_PERF_COUNTER_A18,
RING_FORCE_TO_NONPRIV_ACCESS_RW |
RING_FORCE_TO_NONPRIV_RANGE_4);
whitelist_reg(w, OA_PERF_COUNTER_A20);
whitelist_reg(w, OA_PERF_COUNTER_A20_UPPER);
>+
>+ /* Read access to gpu ticks */
>+ whitelist_reg_ext(w, GEN8_GPU_TICKS,
>+ RING_FORCE_TO_NONPRIV_ACCESS_RD);
>+
>+ /* Read access to: oa status, head, tail, buffer settings */
>+ whitelist_reg_ext(w, GEN8_OASTATUS,
>+ RING_FORCE_TO_NONPRIV_ACCESS_RD |
>+ RING_FORCE_TO_NONPRIV_RANGE_4);
>+}
>+
>+static void gen12_whitelist_build_performance_counters(struct i915_wa_list *w)
>+{
>+ /* OA buffer trigger report 2/6 used by performance query */
>+ whitelist_reg(w, GEN12_OAG_OAREPORTTRIG2);
>+ whitelist_reg(w, GEN12_OAG_OAREPORTTRIG6);
>+
>+ /* Performance counters A18-20 used by tbs marker query */
>+ whitelist_reg_ext(w, GEN12_OAG_PERF_COUNTER_A18,
>+ RING_FORCE_TO_NONPRIV_ACCESS_RW |
>+ RING_FORCE_TO_NONPRIV_RANGE_16);
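Same as the comment above: use RING_FORCE_TO_NONPRIV_RANGE_4 for GEN12_OAG_PERF_COUNTER_A18 and whitelist GEN12_OAG_PERF_COUNTER_A20 (lower and upper dwords) separately.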
>+
>+ /* Read access to gpu ticks */
>+ whitelist_reg_ext(w, GEN12_OAG_GPU_TICKS,
>+ RING_FORCE_TO_NONPRIV_ACCESS_RD);
>+
>+ /* Read access to: oa status, head, tail, buffer settings */
>+ whitelist_reg_ext(w, GEN12_OAG_OASTATUS,
>+ RING_FORCE_TO_NONPRIV_ACCESS_RD |
>+ RING_FORCE_TO_NONPRIV_RANGE_4);
>+}
>+
> static void gen9_whitelist_build(struct i915_wa_list *w)
> {
> /* WaVFEStateAfterPipeControlwithMediaStateClear:skl,bxt,glk,cfl */
>@@ -1378,6 +1420,9 @@ static void gen9_whitelist_build(struct i915_wa_list *w)
>
> /* WaSendPushConstantsFromMMIO:skl,bxt */
> whitelist_reg(w, COMMON_SLICE_CHICKEN2);
>+
>+ /* Performance counters support */
>+ gen9_whitelist_build_performance_counters(w);
> }
>
> static void skl_whitelist_build(struct intel_engine_cs *engine)
>@@ -1471,6 +1516,9 @@ static void cnl_whitelist_build(struct intel_engine_cs *engine)
>
> /* WaEnablePreemptionGranularityControlByUMD:cnl */
> whitelist_reg(w, GEN8_CS_CHICKEN1);
>+
>+ /* Performance counters support */
>+ gen9_whitelist_build_performance_counters(w);
> }
>
> static void icl_whitelist_build(struct intel_engine_cs *engine)
>@@ -1500,6 +1548,9 @@ static void icl_whitelist_build(struct intel_engine_cs *engine)
> whitelist_reg_ext(w, PS_INVOCATION_COUNT,
> RING_FORCE_TO_NONPRIV_ACCESS_RD |
> RING_FORCE_TO_NONPRIV_RANGE_4);
>+
>+ /* Performance counters support */
>+ gen9_whitelist_build_performance_counters(w);
> break;
>
> case VIDEO_DECODE_CLASS:
>@@ -1550,6 +1601,9 @@ static void tgl_whitelist_build(struct intel_engine_cs *engine)
>
> /* Wa_1806527549:tgl */
> whitelist_reg(w, HIZ_CHICKEN);
>+
>+ /* Performance counters support */
>+ gen12_whitelist_build_performance_counters(w);
> break;
> default:
> whitelist_reg_ext(w,
>diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
>index c6f6370283cf..06a3fff52dfa 100644
>--- a/drivers/gpu/drm/i915/i915_perf.c
>+++ b/drivers/gpu/drm/i915/i915_perf.c
>@@ -192,6 +192,7 @@
> */
>
> #include <linux/anon_inodes.h>
>+#include <linux/mman.h>
> #include <linux/sizes.h>
> #include <linux/uuid.h>
>
>@@ -434,6 +435,30 @@ static u32 gen7_oa_hw_tail_read(struct i915_perf_stream *stream)
> return oastatus1 & GEN7_OASTATUS1_TAIL_MASK;
> }
>
>+static u32 gen12_oa_hw_head_read(struct i915_perf_stream *stream)
>+{
>+ struct intel_uncore *uncore = stream->uncore;
>+
>+ return intel_uncore_read(uncore, GEN12_OAG_OAHEADPTR) &
>+ GEN12_OAG_OAHEADPTR_MASK;
>+}
>+
>+static u32 gen8_oa_hw_head_read(struct i915_perf_stream *stream)
>+{
>+ struct intel_uncore *uncore = stream->uncore;
>+
>+ return intel_uncore_read(uncore, GEN8_OAHEADPTR) &
>+ GEN8_OAHEADPTR_MASK;
>+}
>+
>+static u32 gen7_oa_hw_head_read(struct i915_perf_stream *stream)
>+{
>+ struct intel_uncore *uncore = stream->uncore;
>+ u32 oastatus2 = intel_uncore_read(uncore, GEN7_OASTATUS2);
>+
>+ return oastatus2 & GEN7_OASTATUS2_HEAD_MASK;
>+}
>+
> /**
> * oa_buffer_check_unlocked - check for data and update tail ptr state
> * @stream: i915 stream instance
>@@ -1328,6 +1353,7 @@ free_oa_buffer(struct i915_perf_stream *stream)
> i915_vma_unpin_and_release(&stream->oa_buffer.vma,
> I915_VMA_RELEASE_MAP);
>
>+ stream->oa_buffer.cpu_address = 0;
> stream->oa_buffer.vaddr = NULL;
> }
>
>@@ -1448,7 +1474,8 @@ static void gen8_init_oa_buffer(struct i915_perf_stream *stream)
> * bit."
> */
> intel_uncore_write(uncore, GEN8_OABUFFER, gtt_offset |
>- OABUFFER_SIZE_16M | GEN8_OABUFFER_MEM_SELECT_GGTT);
>+ OABUFFER_SIZE_16M | GEN8_OABUFFER_MEM_SELECT_GGTT |
>+ GEN7_OABUFFER_EDGE_TRIGGER);
> intel_uncore_write(uncore, GEN8_OATAILPTR, gtt_offset & GEN8_OATAILPTR_MASK);
>
> /* Mark that we need updated tail pointers to read from... */
>@@ -1501,7 +1528,8 @@ static void gen12_init_oa_buffer(struct i915_perf_stream *stream)
> * bit."
> */
> intel_uncore_write(uncore, GEN12_OAG_OABUFFER, gtt_offset |
>- OABUFFER_SIZE_16M | GEN8_OABUFFER_MEM_SELECT_GGTT);
>+ OABUFFER_SIZE_16M | GEN8_OABUFFER_MEM_SELECT_GGTT |
>+ GEN7_OABUFFER_EDGE_TRIGGER);
> intel_uncore_write(uncore, GEN12_OAG_OATAILPTR,
> gtt_offset & GEN12_OAG_OATAILPTR_MASK);
>
>@@ -1562,6 +1590,7 @@ static int alloc_oa_buffer(struct i915_perf_stream *stream)
> goto err_unref;
> }
> stream->oa_buffer.vma = vma;
>+ stream->oa_buffer.cpu_address = 0;
>
> stream->oa_buffer.vaddr =
> i915_gem_object_pin_map(bo, I915_MAP_WB);
>@@ -1584,6 +1613,52 @@ static int alloc_oa_buffer(struct i915_perf_stream *stream)
> return ret;
> }
>
>+static int map_oa_buffer(struct i915_perf_stream *stream)
>+{
>+ unsigned long address = 0;
>+ const u64 size = OA_BUFFER_SIZE;
>+ struct i915_vma *oabuffer_vma = stream->oa_buffer.vma;
>+ struct drm_i915_gem_object *oabuffer_obj = oabuffer_vma->obj;
>+ struct mm_struct *mm = current->mm;
>+ struct vm_area_struct *vma = NULL;
>+
>+ if(stream->oa_buffer.cpu_address != 0)
>+ return 0;
>+
>+ if (!boot_cpu_has(X86_FEATURE_PAT))
>+ return -ENODEV;
>+
>+ if (!oabuffer_obj || !oabuffer_vma)
>+ return -ENOENT;
>+
>+ if (!oabuffer_obj->base.filp)
>+ return -ENXIO;
>+
>+ if (range_overflows_t(u64, 0, size, oabuffer_obj->base.size))
>+ return -EINVAL;
>+
>+ address = vm_mmap(oabuffer_obj->base.filp, 0, size,
>+ PROT_READ, MAP_SHARED, 0);
>+
>+ if (IS_ERR_VALUE(address))
>+ return address;
>+
>+ if (mmap_write_lock_killable(mm))
>+ return -EINTR;
>+
>+ vma = find_vma(mm, address);
>+ if (vma) {
>+ vma->vm_page_prot =
>+ pgprot_writecombine(vm_get_page_prot(vma->vm_flags));
>+
>+ stream->oa_buffer.cpu_address = address;
>+ }
>+
>+ mmap_write_unlock(mm);
>+
>+ return vma ? 0 : -ENOMEM;
>+}
>+
> static u32 *save_restore_register(struct i915_perf_stream *stream, u32 *cs,
> bool save, i915_reg_t reg, u32 offset,
> u32 dword_count)
>@@ -2493,6 +2568,13 @@ gen12_enable_metric_set(struct i915_perf_stream *stream,
> (period_exponent << GEN12_OAG_OAGLBCTXCTRL_TIMER_PERIOD_SHIFT))
> : 0);
>
>+ /*
>+ * Initialize Super Queue Internal Cnt Register
>+ * BIT(30) - PMON Enable - set in order to collect valid metrics.
>+ */
>+ intel_uncore_write(uncore, GEN12_SQCNT1,
>+ intel_uncore_read(uncore, GEN12_SQCNT1) | BIT(30));
>+
> /*
> * Update all contexts prior writing the mux configurations as we need
> * to make sure all slices/subslices are ON before writing to NOA
>@@ -3199,6 +3281,39 @@ static long i915_perf_config_locked(struct i915_perf_stream *stream,
> return ret;
> }
>
>+/**
>+ * i915_perf_get_oa_buffer_info_locked - Properties of the i915-perf OA buffer
>+ * @arg: pointer to oa buffer info populated by this function.
>+ */
>+static int i915_perf_get_oa_buffer_info_locked(struct i915_perf_stream *stream,
>+ unsigned long arg)
>+{
>+ struct drm_i915_perf_oa_buffer_info info;
>+ void __user *output = (void __user *) arg;
>+ int ret;
>+
>+ if (!output)
>+ return -EINVAL;
>+
>+ memset(&info, 0, sizeof(info));
>+
>+ info.size = stream->oa_buffer.vma->size;
>+ info.head = stream->perf->ops.oa_hw_head_read(stream);
>+ info.tail = stream->perf->ops.oa_hw_tail_read(stream);
>+ info.gpu_address = i915_ggtt_offset(stream->oa_buffer.vma);
>+
>+ ret = map_oa_buffer(stream);
>+ if (ret)
>+ return ret;
>+
>+ info.cpu_address = stream->oa_buffer.cpu_address;
>+
>+ if (copy_to_user(output, &info, sizeof(info)))
>+ return -EFAULT;
>+
>+ return 0;
>+}
>+
> /**
> * i915_perf_ioctl - support ioctl() usage with i915 perf stream FDs
> * @stream: An i915 perf stream
>@@ -3224,6 +3339,8 @@ static long i915_perf_ioctl_locked(struct i915_perf_stream *stream,
> return 0;
> case I915_PERF_IOCTL_CONFIG:
> return i915_perf_config_locked(stream, arg);
>+ case I915_PERF_IOCTL_GET_OA_BUFFER_INFO:
>+ return i915_perf_get_oa_buffer_info_locked(stream, arg);
> }
>
> return -EINVAL;
>@@ -4245,6 +4362,7 @@ void i915_perf_init(struct drm_i915_private *i915)
> perf->ops.oa_disable = gen7_oa_disable;
> perf->ops.read = gen7_oa_read;
> perf->ops.oa_hw_tail_read = gen7_oa_hw_tail_read;
>+ perf->ops.oa_hw_head_read = gen7_oa_hw_head_read;
>
> perf->oa_formats = hsw_oa_formats;
> } else if (HAS_LOGICAL_RING_CONTEXTS(i915)) {
>@@ -4276,6 +4394,7 @@ void i915_perf_init(struct drm_i915_private *i915)
> perf->ops.enable_metric_set = gen8_enable_metric_set;
> perf->ops.disable_metric_set = gen8_disable_metric_set;
> perf->ops.oa_hw_tail_read = gen8_oa_hw_tail_read;
>+ perf->ops.oa_hw_head_read = gen8_oa_hw_head_read;
>
> if (IS_GEN(i915, 8)) {
> perf->ctx_oactxctrl_offset = 0x120;
>@@ -4303,6 +4422,7 @@ void i915_perf_init(struct drm_i915_private *i915)
> perf->ops.enable_metric_set = gen8_enable_metric_set;
> perf->ops.disable_metric_set = gen10_disable_metric_set;
> perf->ops.oa_hw_tail_read = gen8_oa_hw_tail_read;
>+ perf->ops.oa_hw_head_read = gen8_oa_hw_head_read;
>
> if (IS_GEN(i915, 10)) {
> perf->ctx_oactxctrl_offset = 0x128;
>@@ -4327,6 +4447,7 @@ void i915_perf_init(struct drm_i915_private *i915)
> perf->ops.enable_metric_set = gen12_enable_metric_set;
> perf->ops.disable_metric_set = gen12_disable_metric_set;
> perf->ops.oa_hw_tail_read = gen12_oa_hw_tail_read;
>+ perf->ops.oa_hw_head_read = gen12_oa_hw_head_read;
>
> perf->ctx_flexeu0_offset = 0;
> perf->ctx_oactxctrl_offset = 0x144;
>@@ -4432,8 +4553,11 @@ int i915_perf_ioctl_version(void)
> *
> * 5: Add DRM_I915_PERF_PROP_POLL_OA_PERIOD parameter that controls the
> * interval for the hrtimer used to check for OA data.
>+ *
>+ * 6: Added an option to map oa buffer at umd driver level and trigger
>+ * oa reports within oa buffer from command buffer.
> */
>- return 5;
>+ return 6;
> }
>
> #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
>diff --git a/drivers/gpu/drm/i915/i915_perf_types.h b/drivers/gpu/drm/i915/i915_perf_types.h
>index a36a455ae336..5b40e20c2aa9 100644
>--- a/drivers/gpu/drm/i915/i915_perf_types.h
>+++ b/drivers/gpu/drm/i915/i915_perf_types.h
>@@ -251,6 +251,14 @@ struct i915_perf_stream {
> int format_size;
> int size_exponent;
>
>+ /**
>+ * @cpu_address: OA buffer cpu address.
>+ *
>+ * Needed to map OA buffer at umd driver level
>+ * to obtain cpu pointer and browse reports.
>+ */
>+ u64 cpu_address;
>+
> /**
> * @ptr_lock: Locks reads and writes to all head/tail state
> *
>@@ -377,6 +385,11 @@ struct i915_oa_ops {
> * generations.
> */
> u32 (*oa_hw_tail_read)(struct i915_perf_stream *stream);
>+
>+ /**
>+ * @oa_hw_head_read: read the OA head pointer register
>+ */
>+ u32 (*oa_hw_head_read)(struct i915_perf_stream *stream);
> };
>
> struct i915_perf {
>diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
>index 86a23ced051b..2e3d264339e0 100644
>--- a/drivers/gpu/drm/i915/i915_reg.h
>+++ b/drivers/gpu/drm/i915/i915_reg.h
>@@ -675,6 +675,7 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
> #define GEN7_OASTATUS2_HEAD_MASK 0xffffffc0
> #define GEN7_OASTATUS2_MEM_SELECT_GGTT (1 << 0) /* 0: PPGTT, 1: GGTT */
>
>+#define GEN8_GPU_TICKS _MMIO(0x2910)
> #define GEN8_OASTATUS _MMIO(0x2b08)
> #define GEN8_OASTATUS_OVERRUN_STATUS (1 << 3)
> #define GEN8_OASTATUS_COUNTER_OVERFLOW (1 << 2)
>@@ -696,6 +697,7 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
> #define OABUFFER_SIZE_16M (7 << 3)
>
> #define GEN12_OA_TLB_INV_CR _MMIO(0xceec)
>+#define GEN12_SQCNT1 _MMIO(0x8718)
>
> /* Gen12 OAR unit */
> #define GEN12_OAR_OACONTROL _MMIO(0x2960)
>@@ -731,6 +733,7 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
> #define GEN12_OAG_OA_DEBUG_DISABLE_GO_1_0_REPORTS (1 << 2)
> #define GEN12_OAG_OA_DEBUG_DISABLE_CTX_SWITCH_REPORTS (1 << 1)
>
>+#define GEN12_OAG_GPU_TICKS _MMIO(0xda90)
> #define GEN12_OAG_OASTATUS _MMIO(0xdafc)
> #define GEN12_OAG_OASTATUS_COUNTER_OVERFLOW (1 << 2)
> #define GEN12_OAG_OASTATUS_BUFFER_OVERFLOW (1 << 1)
>@@ -972,6 +975,17 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
> #define OAREPORTTRIG8_NOA_SELECT_6_SHIFT 24
> #define OAREPORTTRIG8_NOA_SELECT_7_SHIFT 28
>
>+/* Performance counters registers */
>+#define OA_PERF_COUNTER_A18 _MMIO(0x2890)
>+#define OA_PERF_COUNTER_A19 _MMIO(0x2898)
>+#define OA_PERF_COUNTER_A20 _MMIO(0x28A0)
>+
>+/* Gen12 Performance counters registers */
>+#define GEN12_OAG_PERF_COUNTER_A16 _MMIO(0xDA00)
GEN12_OAG_PERF_COUNTER_A16 (the line above) is unused; please remove it.
>+#define GEN12_OAG_PERF_COUNTER_A18 _MMIO(0xDA10)
>+#define GEN12_OAG_PERF_COUNTER_A19 _MMIO(0xDA18)
>+#define GEN12_OAG_PERF_COUNTER_A20 _MMIO(0xDA20)
>+
> /* Same layout as OASTARTTRIGX */
> #define GEN12_OAG_OASTARTTRIG1 _MMIO(0xd900)
> #define GEN12_OAG_OASTARTTRIG2 _MMIO(0xd904)
>diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
>index 14b67cd6b54b..62b88c0123c8 100644
>--- a/include/uapi/drm/i915_drm.h
>+++ b/include/uapi/drm/i915_drm.h
>@@ -2048,6 +2048,25 @@ struct drm_i915_perf_open_param {
> */
> #define I915_PERF_IOCTL_CONFIG _IO('i', 0x2)
>
>+/**
>+ * Returns OA buffer properties.
>+ *
>+ * This ioctl is available in perf revision 6.
>+ */
>+#define I915_PERF_IOCTL_GET_OA_BUFFER_INFO _IO('i', 0x3)
>+
>+/**
>+ * OA buffer information structure.
>+ */
>+struct drm_i915_perf_oa_buffer_info {
>+ __u32 size;
>+ __u32 head;
>+ __u32 tail;
>+ __u32 gpu_address;
>+ __u64 cpu_address;
>+ __u64 reserved[4];
>+};
>+
> /**
> * Common to all i915 perf records
> */
>--
>2.20.1
>
>_______________________________________________
>Intel-gfx mailing list
>Intel-gfx at lists.freedesktop.org
>https://lists.freedesktop.org/mailman/listinfo/intel-gfx
More information about the Intel-gfx
mailing list