[Intel-gfx] [PATCH 6/9] drm/i915/perf: execute OA configuration from command stream

Tue Oct 8 21:40:18 UTC 2019

From: Lionel Landwerlin <lionel.g.landwerlin at intel.com>

We haven't run into issues with programming the global OA/NOA
register configuration from the CPU so far, but HW engineers actually
recommend doing this from the command streamer. On TGL in particular,
one of the clock domains in which some of that programming goes might
not be powered when we poke things from the CPU.

Since we have a command buffer prepared for the execbuffer side of
things, we can reuse that approach here too.

This also allows us to significantly reduce the amount of time we hold
the main lock.

v2: Drop the global lock as much as possible

v3: Take global lock to pin global

v4: Create i915 request in emit_oa_config() to avoid deadlocks (Lionel)

v5: Move locking to the stream (Lionel)

v6: Move active reconfiguration request into i915_perf_stream (Lionel)

v7: Pin VMA outside request creation (Chris)
    Lock VMA before move to active (Chris)

v8: Fix double free on stream->initial_oa_config_bo (Lionel)
    Don't allow interruption when waiting on active config request
    (Lionel)

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin at intel.com>
---
 drivers/gpu/drm/i915/i915_perf.c       | 105 ++++++++++++-------------
 drivers/gpu/drm/i915/i915_perf_types.h |  14 +++-
 2 files changed, 63 insertions(+), 56 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 655980276e96..05d6e10ac9fe 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -1902,56 +1902,54 @@ static int alloc_noa_wait(struct i915_perf_stream *stream)
 	return 0;
 
 err_unpin:
-	__i915_vma_unpin(vma);
+	i915_vma_unpin_and_release(&vma, 0);
 err_unref:
 	i915_gem_object_put(bo);
 	return ret;
 }
 
-static void config_oa_regs(struct intel_uncore *uncore,
-			   const struct i915_oa_reg *regs,
-			   u32 n_regs)
+static int emit_oa_config(struct i915_perf_stream *stream)
 {
-	u32 i;
+	struct i915_vma *vma = stream->initial_oa_vma;
+	struct i915_request *rq;
+	int err;

-	for (i = 0; i < n_regs; i++) {
-		const struct i915_oa_reg *reg = regs + i;
+	err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH);
+	if (err)
+		return err;

-		intel_uncore_write(uncore, reg->addr, reg->value);
+	rq = i915_request_create(stream->engine->kernel_context);
+	if (IS_ERR(rq)) {
+		err = PTR_ERR(rq);
+		goto err_vma_unpin;
 	}
-}

-static void delay_after_mux(void)
-{
-	/*
-	 * It apparently takes a fairly long time for a new MUX
-	 * configuration to be be applied after these register writes.
-	 * This delay duration was derived empirically based on the
-	 * render_basic config but hopefully it covers the maximum
-	 * configuration latency.
-	 *
-	 * As a fallback, the checks in _append_oa_reports() to skip
-	 * invalid OA reports do also seem to work to discard reports
-	 * generated before this config has completed - albeit not
-	 * silently.
-	 *
-	 * Unfortunately this is essentially a magic number, since we
-	 * don't currently know of a reliable mechanism for predicting
-	 * how long the MUX config will take to apply and besides
-	 * seeing invalid reports we don't know of a reliable way to
-	 * explicitly check that the MUX config has landed.
-	 *
-	 * It's even possible we've miss characterized the underlying
-	 * problem - it just seems like the simplest explanation why
-	 * a delay at this location would mitigate any invalid reports.
-	 */
-	usleep_range(15000, 20000);
+	err = i915_active_fence_set(&stream->perf->active_config, rq);
+	if (err)
+		goto err_add_request;
+
+	i915_vma_lock(vma);
+	err = i915_request_await_object(rq, vma->obj, 0);
+	if (!err)
+		err = i915_vma_move_to_active(vma, rq, 0);
+	i915_vma_unlock(vma);
+	if (err)
+		goto err_add_request;
+
+	err = rq->engine->emit_bb_start(rq,
+					vma->node.start, 0,
+					I915_DISPATCH_SECURE);
+err_add_request:
+	i915_request_add(rq);
+err_vma_unpin:
+	i915_vma_unpin(vma);
+
+	return err;
 }
 
 static int hsw_enable_metric_set(struct i915_perf_stream *stream)
 {
 	struct intel_uncore *uncore = stream->gt->uncore;
-	const struct i915_oa_config *oa_config = stream->oa_config;
 
 	/*
 	 * PRM:
@@ -1968,13 +1966,7 @@ static int hsw_enable_metric_set(struct i915_perf_stream *stream)
 	intel_uncore_rmw(uncore, GEN6_UCGCTL1,
 			 0, GEN6_CSUNIT_CLOCK_GATE_DISABLE);
 
-	config_oa_regs(uncore, oa_config->mux_regs, oa_config->mux_regs_len);
-	delay_after_mux();
-
-	config_oa_regs(uncore, oa_config->b_counter_regs,
-		       oa_config->b_counter_regs_len);
-
-	return 0;
+	return emit_oa_config(stream);
 }
 
 static void hsw_disable_metric_set(struct i915_perf_stream *stream)
@@ -2338,13 +2330,7 @@ static int gen8_enable_metric_set(struct i915_perf_stream *stream)
 	if (ret)
 		return ret;
 
-	config_oa_regs(uncore, oa_config->mux_regs, oa_config->mux_regs_len);
-	delay_after_mux();
-
-	config_oa_regs(uncore, oa_config->b_counter_regs,
-		       oa_config->b_counter_regs_len);
-
-	return 0;
+	return emit_oa_config(stream);
 }
 
 static void gen8_disable_metric_set(struct i915_perf_stream *stream)
@@ -2592,10 +2578,11 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
 		goto err_noa_wait_alloc;
 	}
 
-	stream->oa_config = i915_perf_get_oa_config(perf, props->metrics_set);
-	if (!stream->oa_config) {
+	ret = i915_perf_stream_get_oa_config(stream, props->metrics_set,
+					     &stream->oa_config,
+					     &stream->initial_oa_vma);
+	if (ret) {
 		DRM_DEBUG("Invalid OA config id=%i\n", props->metrics_set);
-		ret = -EINVAL;
 		goto err_config;
 	}
 
@@ -2623,10 +2610,13 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
 	perf->exclusive_stream = stream;
 
 	ret = perf->ops.enable_metric_set(stream);
-	if (ret) {
-		DRM_DEBUG("Unable to enable metric set\n");
+	if (ret)
+		goto err_enable;
+
+	i915_vma_put(stream->initial_oa_vma);
+	stream->initial_oa_vma = NULL;
+	if (ret)
 		goto err_enable;
-	}
 
 	DRM_DEBUG("opening stream oa config uuid=%s\n",
 		  stream->oa_config->uuid);
@@ -2651,6 +2641,9 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
 
 	free_oa_configs(stream);
 
+	if (stream->initial_oa_vma)
+		i915_vma_put(stream->initial_oa_vma);
+
 err_config:
 	free_noa_wait(stream);
 
@@ -4017,6 +4010,8 @@ void i915_perf_init(struct drm_i915_private *i915)
 	if (perf->ops.enable_metric_set) {
 		mutex_init(&perf->lock);
 
+		INIT_ACTIVE_FENCE(&perf->active_config, &perf->lock);
+
 		oa_sample_rate_hard_limit = 1000 *
 			(RUNTIME_INFO(i915)->cs_timestamp_frequency_khz / 2);
 		perf->sysctl_header = register_sysctl_table(dev_root);
diff --git a/drivers/gpu/drm/i915/i915_perf_types.h b/drivers/gpu/drm/i915/i915_perf_types.h
index f126f790c68b..0fad714bc362 100644
--- a/drivers/gpu/drm/i915/i915_perf_types.h
+++ b/drivers/gpu/drm/i915/i915_perf_types.h
@@ -16,6 +16,7 @@
 #include <linux/uuid.h>
 #include <linux/wait.h>
 
+#include "i915_active_types.h"
 #include "i915_reg.h"
 #include "intel_wakeref.h"
 
@@ -183,7 +184,8 @@ struct i915_perf_stream {
 	const struct i915_perf_stream_ops *ops;
 
 	/**
-	 * @active_config_mutex: Protects access to @oa_config & @oa_config_bos.
+	 * @config_mutex: Protects access to @active_config_rq,
+	 * @oa_config & @oa_config_bos.
 	 */
 	struct mutex config_mutex;
 
@@ -198,6 +200,11 @@ struct i915_perf_stream {
 	 */
 	struct list_head oa_config_bos;
 
+	/**
+	 * @initial_oa_vma: First OA configuration BO to be run.
+	 */
+	struct i915_vma *initial_oa_vma;
+
 	/**
 	 * @pinned_ctx: The OA context specific information.
 	 * The OA context specific information.
@@ -383,6 +390,11 @@ struct i915_perf {
 	 */
 	struct i915_perf_stream *exclusive_stream;
 
+	/**
+	 * @active_config: Last request using the active configuration.
+	 */
+	struct i915_active_fence active_config;
+
 	/**
 	 * For rate limiting any notifications of spurious
 	 * invalid OA reports
-- 
2.23.0