[Nouveau] [PATCH 3/4] subdev/pmu/fuc: implement perf

Martin Peres martin.peres at free.fr
Sun Feb 14 22:02:22 UTC 2016


On 26/10/15 20:13, Karol Herbst wrote:
> From: Karol Herbst <git at karolherbst.de>
>
> ---
>   drm/nouveau/nvkm/subdev/pmu/fuc/gf100.fuc3.h | 788 +++++++++++++++------------
>   drm/nouveau/nvkm/subdev/pmu/fuc/gf119.fuc4.h | 740 ++++++++++++++-----------
>   drm/nouveau/nvkm/subdev/pmu/fuc/gk104.fuc4.h | 740 ++++++++++++++-----------
>   drm/nouveau/nvkm/subdev/pmu/fuc/gk208.fuc5.h | 710 ++++++++++++++----------
>   drm/nouveau/nvkm/subdev/pmu/fuc/gt215.fuc3.h | 755 ++++++++++++++-----------
>   drm/nouveau/nvkm/subdev/pmu/fuc/os.h         |   4 +
>   drm/nouveau/nvkm/subdev/pmu/fuc/perf.fuc     | 148 +++++
>   7 files changed, 2267 insertions(+), 1618 deletions(-)
>
>
> diff --git a/drm/nouveau/nvkm/subdev/pmu/fuc/os.h b/drm/nouveau/nvkm/subdev/pmu/fuc/os.h
> index c8b06cb..53508d9 100644
> --- a/drm/nouveau/nvkm/subdev/pmu/fuc/os.h
> +++ b/drm/nouveau/nvkm/subdev/pmu/fuc/os.h
> @@ -49,4 +49,8 @@
>   #define I2C__MSG_DATA0_WR08_REG 0:7
>   #define I2C__MSG_DATA1_WR08_VAL 0:7
>   
> +
> +/* PERF: message identifiers */
> +#define PERF_MSG_LOAD 1

Could you document the expected inputs and outputs of this message as a 
comment?

// IN: don't care
// OUT:
// - byte 3: PCIe load (GF100+)
// - byte 2: Memory load
// - byte 1: Video decoding engines' load
// - byte 0: GR load

> +
>   #endif
> diff --git a/drm/nouveau/nvkm/subdev/pmu/fuc/perf.fuc b/drm/nouveau/nvkm/subdev/pmu/fuc/perf.fuc
> index 38eadf7..69a8f8d 100644
> --- a/drm/nouveau/nvkm/subdev/pmu/fuc/perf.fuc
> +++ b/drm/nouveau/nvkm/subdev/pmu/fuc/perf.fuc
> @@ -30,6 +30,18 @@ process(PROC_PERF, #perf_init, #perf_recv)
>    * PERF data segment
>    *****************************************************************************/
>   #ifdef INCLUDE_DATA
> +perf_attr_start:
> +// parameters
> +perf_polling_period_us: .b32 100000
> +
> +// engine usage percentage

It is a tiny bit misleading to say percentage when you mean per-255. How 
about:

// engine usage (0-255)

> +perf_eng_gr:   .b8 0
> +perf_eng_vdec: .b8 0
> +perf_eng_mc:   .b8 0
> +#if NVKM_PPWR_CHIPSET >= GF100
> +perf_eng_pcie: .b8 0
> +#endif
> +.align 4
>   #endif
>   
>   /******************************************************************************
> @@ -46,6 +58,78 @@ process(PROC_PERF, #perf_init, #perf_recv)
>   // $r11 - data1
>   // $r0  - zero
>   perf_recv:
> +	push $r1
> +
> +	imm32($r10, PROC_HOST)
> +	cmp b32 $r14 $r10
> +	bra ne #perf_recv_not_host
> +		cmp b32 $r13 PERF_MSG_LOAD

Thanks for doing this and not assuming straight away that this is going 
to be the only type of messages you will receive from the host :)

> +		bra e #perf_load
> +		bra #perf_recv_exit
> +
> +perf_load:
> +			clear b32 $r11
> +			clear b32 $r12
> +#if NVKM_PPWR_CHIPSET >= GF100
> +			ld(b8, $r12, #perf_eng_pcie)
> +			shl b32 $r12 8
> +#endif
> +			ld(b8, $r12, #perf_eng_mc)
> +			shl b32 $r12 8
> +			ld(b8, $r12, #perf_eng_vdec)
> +			shl b32 $r12 8
> +			ld(b8, $r12, #perf_eng_gr)

Simple and effective, that's nice :)
> +			call(send)
> +			bra #perf_recv_exit
> +
> +perf_recv_not_host:
> +	call(perf_counter_readout)
> +
> +	ld(b32, $r14, #perf_polling_period_us)
> +	call #ticks_from_us
> +	call(timer)
> +
> +perf_recv_exit:
> +	pop $r1
> +	ret
> +
> +
> +// description
> +//
> +// $r15 - current (perf)
> +// $r0  - zero
> +perf_counter_readout:
> +	nv_iord($r14, NV_PPWR_COUNTER_COUNT(0))
> +	div $r14 $r14 0xff

Not a big fan of the loss in precision. Why do you do that instead of 
what I did here:
https://cgit.freedesktop.org/~mperes/nouveau/tree/nvkm/subdev/pwr/fuc/perf.fuc?h=ppwr_rework&id=27310fa8fdc39e54a3f4383fada96a3562c5a022#n134

With my solution, you get no loss in precision. And you also poll the 
counters faster so it means a greater accuracy. I also miss less cycles.

All in all, could you please reuse my polling function please? :D

> +
> +	nv_iord($r13, NV_PPWR_COUNTER_COUNT(1))
> +	div $r13 $r13 $r14
> +	st(b8, #perf_eng_gr, $r13)
> +
> +	nv_iord($r13, NV_PPWR_COUNTER_COUNT(2))
> +	div $r13 $r13 $r14
> +	st(b8, #perf_eng_vdec, $r13)
> +
> +	nv_iord($r13, NV_PPWR_COUNTER_COUNT(3))
> +	div $r13 $r13 $r14
> +	st(b8, #perf_eng_mc, $r13)
> +
> +#if NVKM_PPWR_CHIPSET >= GF100
> +	nv_iord($r13, NV_PPWR_COUNTER_COUNT(4))
> +	div $r13 $r13 $r14
> +	st(b8, #perf_eng_pcie, $r13)
> +#endif
> +
> +	// reset the counters
> +	imm32($r14, NV_PPWR_COUNTER_COUNT_RESET)
> +	nv_iowr(NV_PPWR_COUNTER_COUNT(0), $r14)
> +	nv_iowr(NV_PPWR_COUNTER_COUNT(1), $r14)
> +	nv_iowr(NV_PPWR_COUNTER_COUNT(2), $r14)
> +	nv_iowr(NV_PPWR_COUNTER_COUNT(3), $r14)
> +#if NVKM_PPWR_CHIPSET >= GF100
> +	nv_iowr(NV_PPWR_COUNTER_COUNT(4), $r14)
> +#endif
> +
>   	ret
>   
>   // description
> @@ -53,5 +137,69 @@ perf_recv:
>   // $r15 - current (perf)
>   // $r0  - zero
>   perf_init:
> +	// set up the total ticks counter first
> +	imm32($r14, NV_PPWR_COUNTER_MODE_ALWAYS)
> +	nv_iowr(NV_PPWR_COUNTER_MODE(0), $r14)
> +
> +	// set up the other counters, with fermi there are more
> +	imm32($r14, NV_PPWR_COUNTER_MODE_IF_NOT_ALL)
> +	nv_iowr(NV_PPWR_COUNTER_MODE(1), $r14)
> +	nv_iowr(NV_PPWR_COUNTER_MODE(2), $r14)
> +	nv_iowr(NV_PPWR_COUNTER_MODE(3), $r14)
> +#if NVKM_PPWR_CHIPSET >= GF100
> +	nv_iowr(NV_PPWR_COUNTER_MODE(4), $r14)
> +#endif
> +
> +	// core load counter
> +	imm32($r14,
> +		  NV_PPWR_COUNTER_SIG_GR_IDLE
> +		| NV_PPWR_COUNTER_SIG_GR_GPC_IDLE
> +		| NV_PPWR_COUNTER_SIG_GR_ROP_IDLE
> +#if NVKM_PPWR_CHIPSET >= GF100
> +		| NV_PPWR_COUNTER_SIG_GR_HUB_IDLE
> +		| NV_PPWR_COUNTER_SIG_PCOPY0_IDLE
> +		| NV_PPWR_COUNTER_SIG_PCOPY1_IDLE
> +#if NVKM_PPWR_CHIPSET >= GK104
> +		| NV_PPWR_COUNTER_SIG_PCOPY2_IDLE
> +#endif
> +#endif
> +	)
> +	nv_iowr(NV_PPWR_COUNTER_MASK(1), $r14)
> +
> +	// video load counter
> +	imm32($r14,
> +		  NV_PPWR_COUNTER_SIG_PVLD_IDLE
> +		| NV_PPWR_COUNTER_SIG_PPDEC_IDLE
> +		| NV_PPWR_COUNTER_SIG_PPPP_IDLE
> +#if NVKM_PPWR_CHIPSET >= GK104
> +		| NV_PPWR_COUNTER_SIG_PVENC
> +#endif
> +	)
> +	nv_iowr(NV_PPWR_COUNTER_MASK(2), $r14)
> +
> +	// memory load counter
> +	imm32($r14,
> +#if NVKM_PPWR_CHIPSET >= GF100
> +		  NV_PPWR_COUNTER_SIG_BFB_PART0_REQ
> +#else
> +		  NV_PPWR_COUNTER_SIG_FB_PART0_REQ
> +#endif
> +	)
> +	nv_iowr(NV_PPWR_COUNTER_MASK(3), $r14)
> +
> +	// pcie load counter
> +#if NVKM_PPWR_CHIPSET >= GF100
> +	imm32($r14, NV_PPWR_COUNTER_SIG_PCIE)
> +	nv_iowr(NV_PPWR_COUNTER_MASK(4), $r14)
> +#endif
> +
> +	// initial read out
> +	call(perf_counter_readout)
> +
> +	// schedule the next read out
> +	ld(b32, $r14, #perf_polling_period_us)
> +	call #ticks_from_us
> +	call(timer)
> +
>   	ret
>   #endif

Looks about good, except the polling function!


More information about the Nouveau mailing list