[Intel-gfx] [PATCH 36/36] drm/i915: Support per-context user requests for GPU frequency control

Mon Mar 19 09:51:08 UTC 2018


On 3/14/2018 3:07 PM, Chris Wilson wrote:
> Often, we find ourselves facing a workload where the user knows in
> advance what GPU frequency they require for it to complete in a timely
> manner, and using past experience they can outperform the HW assisted
> RPS autotuning. An example might be kodi (HTPC) where they know that
> video decoding and compositing require a minimum frequency to avoid ever
> dropping a frame, or conversely know when they are in a powersaving mode
> and would rather have slower updates than ramp up the GPU frequency and
> power consumption. Other workloads may defeat the autotuning entirely
> and need manual control to meet their performance goals, e.g. bursty
> applications which require low latency.
>
> To accommodate the varying needs of different applications, that may be
> running concurrently, we want a more flexible system than a global limit
> supplied by sysfs. To this end, we offer the application the option to
> set their desired frequency bounds on the context itself, and apply those
> bounds when we execute commands from the application, switching between
> bounds just as easily as we switch between the clients themselves.
>
> The clients can query the range supported by the HW, or at least the
> range they are restricted to, and then freely select frequencies within
> that range that they want to run at. (They can select just a single
> frequency if they so choose.) As this is subject to the global limit
> supplied by the user in sysfs, and a client can only reduce the range of
> frequencies they allow the HW to run at, we allow all clients to adjust
> their request (and not restrict raising the minimum to privileged
> CAP_SYS_NICE clients).
>
> Testcase: igt/gem_ctx_freq
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Joonas Lahtinen <joonas.lahtinen at linux.intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> Cc: Praveen Paneri <praveen.paneri at intel.com>
> Cc: Sagar A Kamble <sagar.a.kamble at intel.com>
Change looks good to me. I have one query below.
<snip>
> diff --git a/drivers/gpu/drm/i915/intel_guc_submission.c b/drivers/gpu/drm/i915/intel_guc_submission.c
> index 8a8ad2fe158d..d8eaae683186 100644
> --- a/drivers/gpu/drm/i915/intel_guc_submission.c
> +++ b/drivers/gpu/drm/i915/intel_guc_submission.c
> @@ -26,9 +26,12 @@
>   #include <trace/events/dma_fence.h>
>   
>   #include "intel_guc_submission.h"
> -#include "intel_lrc_reg.h"
> +
>   #include "i915_drv.h"
>   
> +#include "intel_gt_pm.h"
> +#include "intel_lrc_reg.h"
> +
>   #define GUC_PREEMPT_FINISHED		0x1
>   #define GUC_PREEMPT_BREADCRUMB_DWORDS	0x8
>   #define GUC_PREEMPT_BREADCRUMB_BYTES	\
> @@ -650,6 +653,12 @@ static void guc_submit(struct intel_engine_cs *engine)
>   	}
>   }
>   
> +static void update_rps(struct intel_engine_cs *engine)
> +{
> +	intel_rps_update_engine(engine,
> +				port_request(engine->execlists.port)->ctx);
> +}
> +
>   static void port_assign(struct execlist_port *port, struct i915_request *rq)
>   {
>   	GEM_BUG_ON(port_isset(port));
> @@ -728,6 +737,7 @@ static void guc_dequeue(struct intel_engine_cs *engine)
>   	execlists->first = rb;
>   	if (submit) {
>   		port_assign(port, last);
> +		update_rps(engine);
>   		execlists_set_active(execlists, EXECLISTS_ACTIVE_USER);
>   		guc_submit(engine);
>   	}
> @@ -757,8 +767,10 @@ static void guc_submission_tasklet(unsigned long data)
>   
>   		rq = port_request(&port[0]);
>   	}
> -	if (!rq)
> +	if (!rq) {
>   		execlists_clear_active(execlists, EXECLISTS_ACTIVE_USER);
> +		intel_rps_update_engine(engine, NULL);
I think we also need to do this (update_engine(NULL)) while handling 
preemption completion for both GuC and execlists also.
Doing it as part of execlists_cancel_port_requests will cover all those 
cases including reset.
Am I right?
> +	}
>   
>   	if (execlists_is_active(execlists, EXECLISTS_ACTIVE_PREEMPT) &&
>   	    intel_read_status_page(engine, I915_GEM_HWS_PREEMPT_INDEX) ==
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 3a69b367e565..518f7b3db857 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -138,6 +138,7 @@
>   #include "i915_drv.h"
>   #include "i915_gem_render_state.h"
>   #include "intel_lrc_reg.h"
> +#include "intel_gt_pm.h"
>   #include "intel_mocs.h"
>   
>   #define RING_EXECLIST_QFULL		(1 << 0x2)
> @@ -535,6 +536,12 @@ static void inject_preempt_context(struct intel_engine_cs *engine)
>   	execlists_set_active(&engine->execlists, EXECLISTS_ACTIVE_PREEMPT);
>   }
>   
> +static void update_rps(struct intel_engine_cs *engine)
> +{
> +	intel_rps_update_engine(engine,
> +				port_request(engine->execlists.port)->ctx);
> +}
> +
>   static void execlists_dequeue(struct intel_engine_cs *engine)
>   {
>   	struct intel_engine_execlists * const execlists = &engine->execlists;
> @@ -708,6 +715,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>   	spin_unlock_irq(&engine->timeline->lock);
>   
>   	if (submit) {
> +		update_rps(engine);
>   		execlists_set_active(execlists, EXECLISTS_ACTIVE_USER);
>   		execlists_submit_ports(engine);
>   	}
> @@ -982,6 +990,11 @@ static void execlists_submission_tasklet(unsigned long data)
>   					  engine->name, port->context_id);
>   
>   				execlists_port_complete(execlists, port);
> +
> +				/* Switch to the next request/context */
> +				rq = port_request(port);
> +				intel_rps_update_engine(engine,
> +							rq ? rq->ctx : NULL);
>   			} else {
>   				port_set(port, port_pack(rq, count));
>   			}
> @@ -1717,6 +1730,8 @@ static void reset_common_ring(struct intel_engine_cs *engine,
>   	__unwind_incomplete_requests(engine);
>   	spin_unlock(&engine->timeline->lock);
>   
> +	intel_rps_update_engine(engine, NULL);
> +
>   	/* Mark all CS interrupts as complete */
>   	execlists->active = 0;
>   
> diff --git a/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h b/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h
> index 9a48aa441743..85b6e6d020b7 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h
> +++ b/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h
> @@ -14,6 +14,7 @@ selftest(fence, i915_sw_fence_mock_selftests)
>   selftest(scatterlist, scatterlist_mock_selftests)
>   selftest(syncmap, i915_syncmap_mock_selftests)
>   selftest(uncore, intel_uncore_mock_selftests)
> +selftest(gt_pm, intel_gt_pm_mock_selftests)
>   selftest(breadcrumbs, intel_breadcrumbs_mock_selftests)
>   selftest(timelines, i915_gem_timeline_mock_selftests)
>   selftest(requests, i915_request_mock_selftests)
> diff --git a/drivers/gpu/drm/i915/selftests/intel_gt_pm.c b/drivers/gpu/drm/i915/selftests/intel_gt_pm.c
> new file mode 100644
> index 000000000000..c3871eb9eabb
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/selftests/intel_gt_pm.c
> @@ -0,0 +1,130 @@
> +/*
> + * SPDX-License-Identifier: MIT
> + *
> + * Copyright © 2018 Intel Corporation
> + */
> +
> +#include "../i915_selftest.h"
> +#include "i915_random.h"
> +
> +#include "mock_gem_device.h"
> +
> +static void mock_rps_init(struct drm_i915_private *i915)
> +{
> +	struct intel_rps *rps = &i915->gt_pm.rps;
> +
> +	/* Disable the register writes */
> +	mkwrite_device_info(i915)->gen = 0;
> +	mkwrite_device_info(i915)->has_rps = true;
> +
> +	intel_rps_init(rps);
> +
> +	rps->min_freq_hw = 0;
> +	rps->max_freq_hw = 255;
> +
> +	rps->min_freq_user = rps->min_freq_hw;
> +	rps->max_freq_user = rps->max_freq_hw;
> +
> +	intel_rps_init__frequencies(rps);
> +}
> +
> +static void mock_rps_fini(struct drm_i915_private *i915)
> +{
> +	struct intel_rps *rps = &i915->gt_pm.rps;
> +
> +	cancel_work_sync(&rps->work);
> +}
> +
> +static int igt_rps_engine(void *arg)
> +{
> +	struct drm_i915_private *i915 = arg;
> +	struct intel_rps *rps = &i915->gt_pm.rps;
> +	I915_RND_STATE(prng);
> +	int err;
> +	int i;
> +
> +	intel_gt_pm_busy(i915); /* Activate RPS */
> +
> +	/*
> +	 * Minimum unit tests for intel_rps_update_engine().
> +	 *
> +	 * Whenever we call intel_rps_update_engine, it will
> +	 * replace the context min/max frequency request for a particular
> +	 * engine and then recompute the global max(min)/min(max) over all
> +	 * engines. In this mockup, we are limited to checking those
> +	 * max(min)/min(max) calculations and then seeing if the rps
> +	 * worker uses those bounds.
> +	 */
> +
> +	for (i = 0; i < 256 * 256; i++) {
> +		u8 freq = prandom_u32_state(&prng);
> +
> +		__rps_update_engine(rps, 0, freq, freq);
> +		if (rps->min_freq_context != freq ||
> +		    rps->max_freq_context != freq) {
> +			pr_err("Context min/max frequencies not restricted to %d, found [%d, %d]\n",
> +			       freq, rps->min_freq_context, rps->max_freq_context);
> +			err = -EINVAL;
> +			goto out;
> +		}
> +		flush_work(&rps->work);
> +
> +		if (rps->freq != freq) {
> +			pr_err("Tried to restrict frequency to %d, found %d\n",
> +			       freq, rps->freq);
> +			err = -EINVAL;
> +			goto out;
> +		}
> +	}
> +
> +	__rps_update_engine(rps, 0, rps->min_freq_hw, rps->max_freq_hw);
> +	if (rps->min_freq_context != rps->min_freq_hw ||
> +	    rps->max_freq_context != rps->max_freq_hw) {
> +		pr_err("Context frequency not restored to [%d, %d], found [%d, %d]\n",
> +		       rps->min_freq_hw, rps->min_freq_hw,
> +		       rps->min_freq_context, rps->max_freq_context);
> +		err = -EINVAL;
> +		goto out;
> +	}
> +
> +	for (i = 0; i < I915_NUM_ENGINES; i++)
> +		__rps_update_engine(rps, i, i, 255 - i);
> +	i--;
> +	if (rps->min_freq_context != i) {
> +		pr_err("Minimum context frequency across all engines not raised to %d, found %d\n", i, rps->min_freq_context);
> +		err = -EINVAL;
> +		goto out;
> +	}
> +	if (rps->max_freq_context != 255 - i) {
> +		pr_err("Maxmimum context frequency across all engines not lowered to %d, found %d\n", 255 - i, rps->max_freq_context);
> +		err = -EINVAL;
> +		goto out;
> +	}
> +
> +	err = 0;
> +out:
> +	intel_gt_pm_idle(i915);
> +	return err;
> +}
> +
> +int intel_gt_pm_mock_selftests(void)
> +{
> +	static const struct i915_subtest tests[] = {
> +		SUBTEST(igt_rps_engine),
> +	};
> +	struct drm_i915_private *i915;
> +	int err;
> +
> +	i915 = mock_gem_device();
> +	if (!i915)
> +		return -ENOMEM;
> +
> +	mock_rps_init(i915);
> +
> +	err = i915_subtests(tests, i915);
> +
> +	mock_rps_fini(i915);
> +	drm_dev_unref(&i915->drm);
> +
> +	return err;
> +}
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index 7f5634ce8e88..64c6377df769 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -1456,6 +1456,26 @@ struct drm_i915_gem_context_param {
>   #define   I915_CONTEXT_MAX_USER_PRIORITY	1023 /* inclusive */
>   #define   I915_CONTEXT_DEFAULT_PRIORITY		0
>   #define   I915_CONTEXT_MIN_USER_PRIORITY	-1023 /* inclusive */
> +
> +/*
> + * I915_CONTEXT_PARAM_FREQUENCY:
> + *
> + * Request that when this context runs, the GPU is restricted to run
> + * in this frequency range; but still contrained by the global user
> + * restriction specified via sysfs.
> + *
> + * The minimum / maximum frequencies are specified in MHz. Each context
> + * starts in the default unrestricted state, where the range is taken from
> + * the hardware, and so may be queried.
> + *
> + * Note the frequency is only changed on a context switch; if the
> + * context's frequency is updated whilst the context is currently executing
> + * the request will not take effect until the next time the context is run.
> + */
> +#define I915_CONTEXT_PARAM_FREQUENCY	0x7
> +#define   I915_CONTEXT_MIN_FREQUENCY(x) ((x) & 0xffffffff)
> +#define   I915_CONTEXT_MAX_FREQUENCY(x) ((x) >> 32)
> +#define   I915_CONTEXT_SET_FREQUENCY(min, max) ((__u64)(max) << 32 | (min))
>   	__u64 value;
>   };
>   

-- 
Thanks,
Sagar