[PATCH 08/17] drm/xe/oa: OA stream initialization (OAG)
Umesh Nerlige Ramappa
umesh.nerlige.ramappa at intel.com
Wed Dec 20 02:31:45 UTC 2023
On Thu, Dec 07, 2023 at 10:43:20PM -0800, Ashutosh Dixit wrote:
>Implement the majority of OA stream initialization (as part of the OA
>stream open ioctl). An OAG buffer is allocated for receiving perf counter
>samples from HW. The OAG unit is initialized and the selected OA metric
>configuration is programmed into the OAG unit HW using a command/batch
>buffer.
>
>Signed-off-by: Ashutosh Dixit <ashutosh.dixit at intel.com>
>---
> drivers/gpu/drm/xe/regs/xe_gt_regs.h | 3 +
> drivers/gpu/drm/xe/xe_oa.c | 397 +++++++++++++++++++++++++++
> drivers/gpu/drm/xe/xe_oa_types.h | 82 ++++++
> 3 files changed, 482 insertions(+)
>
>diff --git a/drivers/gpu/drm/xe/regs/xe_gt_regs.h b/drivers/gpu/drm/xe/regs/xe_gt_regs.h
>index d318ec0efd7db..1b98b609f7fda 100644
>--- a/drivers/gpu/drm/xe/regs/xe_gt_regs.h
>+++ b/drivers/gpu/drm/xe/regs/xe_gt_regs.h
>@@ -156,6 +156,8 @@
>
> #define SQCNT1 XE_REG_MCR(0x8718)
> #define XELPMP_SQCNT1 XE_REG(0x8718)
>+#define SQCNT1_PMON_ENABLE REG_BIT(30)
>+#define SQCNT1_OABPC REG_BIT(29)
> #define ENFORCE_RAR REG_BIT(23)
The indentation of the new SQCNT1_OABPC define (REG_BIT(29)) seems to be off.
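To illustrate what I mean (the whitespace below is only approximate and
the exact tab stops should follow the surrounding file), bit defines in
xe_gt_regs.h get two extra spaces after #define so they read as fields
of the register above them, if I'm reading the file's convention right:

#define SQCNT1					XE_REG_MCR(0x8718)
#define XELPMP_SQCNT1				XE_REG(0x8718)
#define   SQCNT1_PMON_ENABLE			REG_BIT(30)
#define   SQCNT1_OABPC				REG_BIT(29)
#define   ENFORCE_RAR				REG_BIT(23)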
>
> #define XEHP_SQCM XE_REG_MCR(0x8724)
>@@ -365,6 +367,7 @@
> #define ROW_CHICKEN XE_REG_MCR(0xe4f0, XE_REG_OPTION_MASKED)
> #define UGM_BACKUP_MODE REG_BIT(13)
> #define MDQ_ARBITRATION_MODE REG_BIT(12)
>+#define STALL_DOP_GATING_DISABLE REG_BIT(5)
> #define EARLY_EOT_DIS REG_BIT(1)
>
> #define ROW_CHICKEN2 XE_REG_MCR(0xe4f4, XE_REG_OPTION_MASKED)
>diff --git a/drivers/gpu/drm/xe/xe_oa.c b/drivers/gpu/drm/xe/xe_oa.c
>index 9b0bd58fcbc06..d898610322d50 100644
>--- a/drivers/gpu/drm/xe/xe_oa.c
>+++ b/drivers/gpu/drm/xe/xe_oa.c
>@@ -6,15 +6,26 @@
> #include <linux/nospec.h>
> #include <linux/sysctl.h>
>
>+#include <drm/drm_drv.h>
>+#include <drm/xe_drm.h>
>+
>+#include "instructions/xe_mi_commands.h"
> #include "regs/xe_gt_regs.h"
> #include "regs/xe_oa_regs.h"
> #include "xe_device.h"
> #include "xe_exec_queue.h"
>+#include "xe_bb.h"
>+#include "xe_bo.h"
> #include "xe_gt.h"
>+#include "xe_gt_mcr.h"
> #include "xe_mmio.h"
> #include "xe_oa.h"
>+#include "xe_sched_job.h"
> #include "xe_perf.h"
>
>+#define DEFAULT_POLL_FREQUENCY_HZ 200
>+#define DEFAULT_POLL_PERIOD_NS (NSEC_PER_SEC / DEFAULT_POLL_FREQUENCY_HZ)
>+
> static int xe_oa_sample_rate_hard_limit;
> static u32 xe_oa_max_sample_rate = 100000;
>
>@@ -63,6 +74,13 @@ struct xe_oa_open_param {
> struct xe_hw_engine *hwe;
> };
>
>+struct xe_oa_config_bo {
>+ struct llist_node node;
>+
>+ struct xe_oa_config *oa_config;
>+ struct xe_bb *bb;
>+};
>+
> #define DRM_FMT(x) DRM_XE_OA_FMT_TYPE_##x
>
> static const struct xe_oa_format oa_formats[] = {
>@@ -105,6 +123,381 @@ static void xe_oa_config_put(struct xe_oa_config *oa_config)
> kref_put(&oa_config->ref, xe_oa_config_release);
> }
>
>+static struct xe_oa_config *xe_oa_config_get(struct xe_oa_config *oa_config)
>+{
>+ return kref_get_unless_zero(&oa_config->ref) ? oa_config : NULL;
>+}
>+
>+static struct xe_oa_config *xe_oa_get_oa_config(struct xe_oa *oa, int metrics_set)
>+{
>+ struct xe_oa_config *oa_config;
>+
>+ rcu_read_lock();
>+ oa_config = idr_find(&oa->metrics_idr, metrics_set);
>+ if (oa_config)
>+ oa_config = xe_oa_config_get(oa_config);
>+ rcu_read_unlock();
>+
>+ return oa_config;
>+}
>+
>+static void free_oa_config_bo(struct xe_oa_config_bo *oa_bo)
>+{
>+ xe_oa_config_put(oa_bo->oa_config);
>+ xe_bb_free(oa_bo->bb, NULL);
>+ kfree(oa_bo);
>+}
>+
>+static const struct xe_oa_regs *__oa_regs(struct xe_oa_stream *stream)
>+{
>+ return &stream->hwe->oa_unit->regs;
>+}
>+
>+static int xe_oa_submit_bb(struct xe_oa_stream *stream, struct xe_bb *bb)
>+{
>+ struct xe_sched_job *job;
>+ struct dma_fence *fence;
>+ long timeout;
>+ int err = 0;
>+
>+ /* Kernel configuration is issued on stream->k_exec_q, not stream->exec_q */
>+ job = xe_bb_create_job(stream->k_exec_q, bb);
>+ if (IS_ERR(job)) {
>+ err = PTR_ERR(job);
>+ goto exit;
>+ }
>+
>+ xe_sched_job_arm(job);
>+ fence = dma_fence_get(&job->drm.s_fence->finished);
>+ xe_sched_job_push(job);
>+
>+ timeout = dma_fence_wait_timeout(fence, false, HZ);
>+ dma_fence_put(fence);
>+ if (timeout < 0)
>+ err = timeout;
>+ else if (!timeout)
>+ err = -ETIME;
>+exit:
>+ return err;
>+}
>+
>+static void xe_oa_free_oa_buffer(struct xe_oa_stream *stream)
>+{
>+ xe_bo_unpin_map_no_vm(stream->oa_buffer.bo);
>+}
>+
>+static void xe_oa_free_configs(struct xe_oa_stream *stream)
>+{
>+ struct xe_oa_config_bo *oa_bo, *tmp;
>+
>+ xe_oa_config_put(stream->oa_config);
>+ llist_for_each_entry_safe(oa_bo, tmp, stream->oa_config_bos.first, node)
>+ free_oa_config_bo(oa_bo);
>+}
>+
>+#define HAS_OA_BPC_REPORTING(xe) (GRAPHICS_VERx100(xe) >= 1255)
>+
>+static void xe_oa_disable_metric_set(struct xe_oa_stream *stream)
>+{
>+ u32 sqcnt1;
>+
>+ /*
>+ * Wa_1508761755:xehpsdv, dg2
>+ * Enable thread stall DOP gating and EU DOP gating.
>+ */
>+ if (stream->oa->xe->info.platform == XE_DG2) {
>+ xe_gt_mcr_multicast_write(stream->gt, ROW_CHICKEN,
>+ _MASKED_BIT_DISABLE(STALL_DOP_GATING_DISABLE));
>+ xe_gt_mcr_multicast_write(stream->gt, ROW_CHICKEN2,
>+ _MASKED_BIT_DISABLE(DISABLE_DOP_GATING));
>+ }
>+
>+ /* Make sure we disable noa to save power. */
>+ xe_mmio_rmw32(stream->gt, RPM_CONFIG1, GT_NOA_ENABLE, 0);
>+
>+ sqcnt1 = SQCNT1_PMON_ENABLE |
>+ (HAS_OA_BPC_REPORTING(stream->oa->xe) ? SQCNT1_OABPC : 0);
>+
>+ /* Reset PMON Enable to save power. */
>+ xe_mmio_rmw32(stream->gt, XELPMP_SQCNT1, sqcnt1, 0);
>+}
>+
>+static int xe_oa_alloc_oa_buffer(struct xe_oa_stream *stream)
>+{
>+ struct xe_bo *bo;
>+
>+ BUILD_BUG_ON_NOT_POWER_OF_2(XE_OA_BUFFER_SIZE);
>+ BUILD_BUG_ON(XE_OA_BUFFER_SIZE < SZ_128K || XE_OA_BUFFER_SIZE > SZ_16M);
>+
>+ bo = xe_bo_create_pin_map(stream->oa->xe, stream->gt->tile, NULL,
>+ XE_OA_BUFFER_SIZE, ttm_bo_type_kernel,
>+ XE_BO_CREATE_SYSTEM_BIT | XE_BO_CREATE_GGTT_BIT);
>+ if (IS_ERR(bo))
>+ return PTR_ERR(bo);
>+
>+ stream->oa_buffer.bo = bo;
>+ stream->oa_buffer.vaddr = bo->vmap.vaddr;
>+ return 0;
>+}
>+
>+static void write_cs_mi_lri(struct xe_bb *bb, const struct xe_oa_reg *reg_data, u32 n_regs)
>+{
>+ u32 i;
>+
>+#define MI_LOAD_REGISTER_IMM_MAX_REGS (126)
>+
>+ for (i = 0; i < n_regs; i++) {
>+ if ((i % MI_LOAD_REGISTER_IMM_MAX_REGS) == 0) {
>+ u32 n_lri = min_t(u32, n_regs - i,
>+ MI_LOAD_REGISTER_IMM_MAX_REGS);
>+
>+ bb->cs[bb->len++] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(n_lri);
>+ }
>+ bb->cs[bb->len++] = reg_data[i].addr.addr;
>+ bb->cs[bb->len++] = reg_data[i].value;
>+ }
>+}
>+
>+static int num_lri_dwords(int num_regs)
>+{
>+ int count = 0;
>+
>+ if (num_regs > 0) {
>+ count += DIV_ROUND_UP(num_regs, MI_LOAD_REGISTER_IMM_MAX_REGS);
>+ count += num_regs * 2;
>+ }
>+
>+ return count;
>+}
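As a sanity check of the dword accounting with a made-up register count:
for 130 registers this would give

	num_lri_dwords(130) = DIV_ROUND_UP(130, 126) + 130 * 2
			    = 2 + 260
			    = 262

i.e. two MI_LOAD_REGISTER_IMM headers (one covering 126 registers, one
covering the remaining 4) plus an addr/value pair per register, which
matches what write_cs_mi_lri() above emits.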
>+
>+static struct xe_oa_config_bo *
>+__xe_oa_alloc_config_buffer(struct xe_oa_stream *stream, struct xe_oa_config *oa_config)
>+{
>+ struct xe_oa_config_bo *oa_bo;
>+ size_t config_length;
>+ struct xe_bb *bb;
>+
>+ oa_bo = kzalloc(sizeof(*oa_bo), GFP_KERNEL);
>+ if (!oa_bo)
>+ return ERR_PTR(-ENOMEM);
>+
>+ config_length = num_lri_dwords(oa_config->regs_len);
>+ config_length = ALIGN(sizeof(u32) * config_length, XE_PAGE_SIZE) / sizeof(u32);
>+
>+ bb = xe_bb_new(stream->gt, config_length, false);
>+ if (IS_ERR(bb))
>+ goto err_free;
>+
>+ write_cs_mi_lri(bb, oa_config->regs, oa_config->regs_len);
>+
>+ oa_bo->bb = bb;
>+ oa_bo->oa_config = xe_oa_config_get(oa_config);
>+ llist_add(&oa_bo->node, &stream->oa_config_bos);
>+
>+ return oa_bo;
>+err_free:
>+ kfree(oa_bo);
>+ return ERR_CAST(bb);
>+}
>+
>+static struct xe_oa_config_bo *xe_oa_alloc_config_buffer(struct xe_oa_stream *stream)
>+{
>+ struct xe_oa_config *oa_config = stream->oa_config;
>+ struct xe_oa_config_bo *oa_bo;
>+
>+ /* Look for the buffer in the already allocated BOs attached to the stream */
>+ llist_for_each_entry(oa_bo, stream->oa_config_bos.first, node) {
>+ if (oa_bo->oa_config == oa_config &&
>+ memcmp(oa_bo->oa_config->uuid, oa_config->uuid,
>+ sizeof(oa_config->uuid)) == 0)
>+ goto out;
>+ }
>+
>+ oa_bo = __xe_oa_alloc_config_buffer(stream, oa_config);
>+out:
>+ return oa_bo;
>+}
>+
>+static int xe_oa_emit_oa_config(struct xe_oa_stream *stream)
>+{
>+#define NOA_PROGRAM_ADDITIONAL_DELAY_US 500
>+ struct xe_oa_config_bo *oa_bo;
>+ int err, us = NOA_PROGRAM_ADDITIONAL_DELAY_US;
>+
>+ oa_bo = xe_oa_alloc_config_buffer(stream);
>+ if (IS_ERR(oa_bo)) {
>+ err = PTR_ERR(oa_bo);
>+ goto exit;
>+ }
>+
>+ err = xe_oa_submit_bb(stream, oa_bo->bb);
>+
>+ /* Additional empirical delay needed for NOA programming after registers are written */
>+ usleep_range(us, 2 * us);
Are we planning to signal a user fence or something similar to indicate
completion? I haven't tracked that aspect much.
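Purely as a sketch of the kind of thing I mean (not necessarily asking
for it in this series; the user-fence plumbing itself is elided and the
xe_oa_submit_bb_fence() name is just made up for illustration), the
submission path could hand the fence back instead of waiting
synchronously, so the caller can signal completion from it:

static struct dma_fence *xe_oa_submit_bb_fence(struct xe_oa_stream *stream,
					       struct xe_bb *bb)
{
	struct xe_sched_job *job;
	struct dma_fence *fence;

	job = xe_bb_create_job(stream->k_exec_q, bb);
	if (IS_ERR(job))
		return ERR_CAST(job);

	xe_sched_job_arm(job);
	/* Hold a ref so the caller can wait on it or attach a signal */
	fence = dma_fence_get(&job->drm.s_fence->finished);
	xe_sched_job_push(job);

	return fence;
}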
The rest is familiar and lgtm,
Reviewed-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa at intel.com>
Umesh