[PATCH] drm/amd: add Streaming Performance Monitor feature

Felix Kuehling felix.kuehling at amd.com
Tue Apr 28 21:39:04 UTC 2020


Hi Gang,

You've made a lot of progress on this. I'll need a day or two to review
all of this in detail. Some general observations after a quick read-through:

  * You have a function to acquire SPM. But it behaves more like "steal
    SPM". You'll need some coordination with SPM VMID usage in the
    amdgpu_cs API. That means, either acquire SPM has to fail in some
    cases, or it needs to wait for amdgpu_cs to release the SPM VMID.
    The other way around, amdgpu_cs will have to wait for KFD to release
    the SPM VMID or fail if someone tries to use SPM.
  * amdgpu_amdkfd_rlc_spm.c has a bunch of wrapper functions to
    allocated KIQ space and commit around ASIC-specific functions that
    write actual KIQ packets. But some of them don't actually use the
    KIQ. The rptr and wptr get functions just read a register.
  * I'm not sure about the amdgpu_amdkfd_rlc_spm_set_reg function. It
    allocates a lot of space on the KIQ but only writes a single
    register. Was this meant to write many registers at once?
  * Also, you're exposing MMIO or KIQ register access to user mode
    through the KFD_IOCTL_SPM_OP_CONFIG function. We should not allow
    this as it would give user mode access to privileged registers. It
    should also not be necessary because most of the SPM registers are
    unprivileged and accessible through user mode queues.

I see a lot more minor issues and typos. I'll respond with more detailed
comments in a separate email.

Regards,
  Felix

Am 2020-04-28 um 9:58 a.m. schrieb Gang Ba:
> Signed-off-by: Gang Ba <gaba at amd.com>
> Change-Id: If83ee0a14ef834699de6733eeab0f140159bbd6e
> ---
>  drivers/gpu/drm/amd/amdgpu/Makefile                |   3 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h         |  10 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_rlc_spm.c | 165 ++++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h            |  27 +
>  drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c             | 175 +++++++
>  drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c              | 176 +++++++
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c              | 168 ++++++
>  drivers/gpu/drm/amd/amdkfd/Makefile                |   3 +-
>  drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c   |  10 +-
>  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c           |  17 +
>  drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c    |  11 +-
>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h              |  20 +
>  drivers/gpu/drm/amd/amdkfd/kfd_process.c           |   1 +
>  drivers/gpu/drm/amd/amdkfd/kfd_spm.c               | 577 +++++++++++++++++++++
>  drivers/gpu/drm/amd/amdkfd/soc15_int.h             |   1 +
>  include/uapi/linux/kfd_ioctl.h                     |  55 +-
>  16 files changed, 1413 insertions(+), 6 deletions(-)
>  create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_rlc_spm.c
>  create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_spm.c
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile
> index 210d57a..1498b18 100644
> --- a/drivers/gpu/drm/amd/amdgpu/Makefile
> +++ b/drivers/gpu/drm/amd/amdgpu/Makefile
> @@ -177,7 +177,8 @@ amdgpu-y += \
>  	 amdgpu_amdkfd_gfx_v8.o \
>  	 amdgpu_amdkfd_gfx_v9.o \
>  	 amdgpu_amdkfd_arcturus.o \
> -	 amdgpu_amdkfd_gfx_v10.o
> +	 amdgpu_amdkfd_gfx_v10.o \
> +	 amdgpu_amdkfd_rlc_spm.o \
>  
>  ifneq ($(CONFIG_DRM_AMDGPU_CIK),)
>  amdgpu-y += amdgpu_amdkfd_gfx_v7.o
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index d065c50..fdc438a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -246,6 +246,16 @@ void amdgpu_amdkfd_unreserve_memory_limit(struct amdgpu_bo *bo);
>  int amdgpu_amdkfd_get_tile_config(struct kgd_dev *kgd,
>  				struct tile_config *config);
>  
> +void amdgpu_amdkfd_rlc_spm_cntl(struct kgd_dev *kgd, bool cntl);
> +int amdgpu_amdkfd_rlc_spm(struct kgd_dev *kgd, void *args);
> +void amdgpu_amdkfd_rlc_spm_acquire(struct kgd_dev *kgd, u64 gpu_addr, u32 size);
> +int amdgpu_amdkfd_rlc_spm_release(struct kgd_dev *kgd);
> +u32 amdgpu_amdkfd_rlc_spm_get_rdptr(struct kgd_dev *kgd);
> +void amdgpu_amdkfd_rlc_spm_set_rdptr(struct kgd_dev *kgd, u32 rptr);
> +u32 amdgpu_amdkfd_rlc_spm_get_wrptr(struct kgd_dev *kgd);
> +void amdgpu_amdkfd_rlc_spm_set_wrptr(struct kgd_dev *kgd, u32 wptr);
> +int amdgpu_amdkfd_rlc_spm_set_reg(struct kgd_dev *kgd, u64 uReg, u32 value);
> +
>  /* KGD2KFD callbacks */
>  int kgd2kfd_init(void);
>  void kgd2kfd_exit(void);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_rlc_spm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_rlc_spm.c
> new file mode 100644
> index 0000000..b492c1e
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_rlc_spm.c
> @@ -0,0 +1,165 @@
> +/*
> + * Copyright 2014-2020 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + */
> +
> +#include <linux/dma-buf.h>
> +#include <linux/list.h>
> +#include <linux/pagemap.h>
> +#include <linux/sched/mm.h>
> +#include <linux/sched/task.h>
> +
> +#include "amdgpu_object.h"
> +#include "amdgpu_vm.h"
> +#include "amdgpu_amdkfd.h"
> +#include <uapi/linux/kfd_ioctl.h>
> +
> +
> +
> +void amdgpu_amdkfd_rlc_spm_cntl(struct kgd_dev *kgd, bool cntl)
> +{
> +    	struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
> +    	struct amdgpu_spm *spm = &adev->gfx.spm;
> +    	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
> +
> +    	spin_lock(&adev->gfx.kiq.ring_lock);
> +    	amdgpu_ring_alloc(kiq_ring, spm->spmf->get_spm_data_size);
> +    	cntl == true ? adev->gfx.spm.spmf->start(adev):adev->gfx.spm.spmf->stop(adev);
> +    	amdgpu_ring_commit(kiq_ring);
> +    	spin_unlock(&adev->gfx.kiq.ring_lock);
> +}
> +
> +u32 amdgpu_amdkfd_rlc_spm_get_rdptr(struct kgd_dev *kgd)
> +{
> +    	struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
> +    	struct amdgpu_spm *spm = &adev->gfx.spm;
> +    	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
> +    	u32 rptr = 0;
> +
> +    	spin_lock(&adev->gfx.kiq.ring_lock);
> +    	amdgpu_ring_alloc(kiq_ring, spm->spmf->get_spm_data_size);
> +    	rptr = adev->gfx.spm.spmf->get_rdptr(adev);
> +    	amdgpu_ring_commit(kiq_ring);
> +    	spin_unlock(&adev->gfx.kiq.ring_lock);
> +
> +    	return rptr;
> +}
> +
> +void amdgpu_amdkfd_rlc_spm_set_rdptr(struct kgd_dev *kgd, u32 rptr)
> +{
> +    	struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
> +    	struct amdgpu_spm *spm = &adev->gfx.spm;
> +    	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
> +
> +    	spin_lock(&adev->gfx.kiq.ring_lock);
> +    	amdgpu_ring_alloc(kiq_ring, spm->spmf->get_spm_data_size);
> +    	adev->gfx.spm.spmf->set_rdptr(adev, rptr);
> +    	amdgpu_ring_commit(kiq_ring);
> +    	spin_unlock(&adev->gfx.kiq.ring_lock);
> +
> +}
> +
> +u32 amdgpu_amdkfd_rlc_spm_get_wrptr(struct kgd_dev *kgd)
> +{
> +    	struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
> +    	struct amdgpu_spm *spm = &adev->gfx.spm;
> +    	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
> +    	u32 wptr;
> +
> +    	spin_lock(&adev->gfx.kiq.ring_lock);
> +    	amdgpu_ring_alloc(kiq_ring, spm->spmf->get_spm_data_size);
> +    	wptr = adev->gfx.spm.spmf->get_wrptr(adev);
> +    	amdgpu_ring_commit(kiq_ring);
> +    	spin_unlock(&adev->gfx.kiq.ring_lock);
> +    	return wptr;
> +}
> +
> +void amdgpu_amdkfd_rlc_spm_set_wrptr(struct kgd_dev *kgd, u32 wptr)
> +{
> +    	struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
> +    	struct amdgpu_spm *spm = &adev->gfx.spm;
> +    	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
> +
> +    	spin_lock(&adev->gfx.kiq.ring_lock);
> +    	amdgpu_ring_alloc(kiq_ring, spm->spmf->get_spm_data_size);
> +    	adev->gfx.spm.spmf->set_wrptr(adev, wptr);
> +    	amdgpu_ring_commit(kiq_ring);
> +    	spin_unlock(&adev->gfx.kiq.ring_lock);
> +}
> +
> +void amdgpu_amdkfd_rlc_spm_acquire(struct kgd_dev *kgd, u64 gpu_addr, u32 size)
> +{
> +    	struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
> +    	struct amdgpu_spm *spm = &adev->gfx.spm;
> +    	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
> +
> +    	/* init spm vmid with 0xf */
> +    	if (adev->gfx.rlc.funcs->update_spm_vmid)
> +        	adev->gfx.rlc.funcs->update_spm_vmid(adev, 0x0);
> +
> +    	/* set spm ring registers */
> +    	spin_lock(&adev->gfx.kiq.ring_lock);
> +    	amdgpu_ring_alloc(kiq_ring, spm->spmf->get_spm_data_size);
> +    	adev->gfx.spm.spmf->set_spm_porfmon_ring_buf(adev, gpu_addr, size);
> +    	amdgpu_ring_commit(kiq_ring);
> +    	spin_unlock(&adev->gfx.kiq.ring_lock);
> +}
> +
> +
> +int amdgpu_amdkfd_rlc_spm_release(struct kgd_dev *kgd)
> +{
> +    	struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
> +    	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
> +    	struct amdgpu_spm *spm = &adev->gfx.spm;
> +
> +    	/* stop spm stream and interupt */
> +    	spin_lock(&adev->gfx.kiq.ring_lock);
> +    	amdgpu_ring_alloc(kiq_ring, spm->spmf->get_spm_data_size);
> +    	adev->gfx.spm.spmf->stop(adev);
> +    	amdgpu_ring_commit(kiq_ring);
> +    	spin_unlock(&adev->gfx.kiq.ring_lock);
> +
> +    	amdgpu_irq_put(adev, &adev->gfx.spm_irq, 0);
> +
> +    	/* revert spm vmid with 0xf */
> +    	if (adev->gfx.rlc.funcs->update_spm_vmid)
> +             adev->gfx.rlc.funcs->update_spm_vmid(adev, 0xf);
> +
> +    	return 0;
> +}
> +
> +int amdgpu_amdkfd_rlc_spm_set_reg(struct kgd_dev *kgd, u64 uReg, u32 value)
> +{
> +    	struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
> +    	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
> +    	struct amdgpu_spm *spm = &adev->gfx.spm;
> +
> +    	pr_debug("[%s]\n", __func__);
> +
> +    	/* stop spm stream and interupt */
> +    	spin_lock(&adev->gfx.kiq.ring_lock);
> +    	amdgpu_ring_alloc(kiq_ring, spm->spmf->get_spm_data_size);
> +    	adev->gfx.spm.spmf->set_reg(adev, uReg, value);
> +    	amdgpu_ring_commit(kiq_ring);
> +    	spin_unlock(&adev->gfx.kiq.ring_lock);
> +
> +    	return 0;
> +}
> +
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> index ee698f0..ba4da52 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> @@ -105,6 +105,31 @@ struct amdgpu_kiq {
>  	const struct kiq_pm4_funcs *pmf;
>  };
>  
> +struct spm_funcs {
> +	void (*start)(struct amdgpu_device *adev);
> +	void (*stop)(struct amdgpu_device *adev);
> +	void (*set_reg)(struct amdgpu_device *adev, uint64_t reg, uint32_t val);
> +	u32 (*get_rdptr)(struct amdgpu_device *adev);
> +	void (*set_rdptr)(struct amdgpu_device *adev, u32 rptr);
> +	u32 (*get_wrptr)(struct amdgpu_device *adev);
> +	void (*set_wrptr)(struct amdgpu_device *adev, u32 wptr);
> +	void (*set_spm_porfmon_ring_buf)(struct amdgpu_device *adev, u64 gpu_rptr, u32 size);
> +
> +	/* Packet sizes */
> +	int set_spm_config_size;
> +	int get_spm_data_size;
> +};
> +
> +struct amdgpu_spm {
> +	const struct spm_funcs *spmf;
> +	u64              spm_gpu_addr;
> +	u32              spm_gpu_size;
> +	u32              spm_ring_rptr;
> +	u32              spm_ring_rptrsize_to_read;
> +	struct amdgpu_bo *spm_obj;
> +	void             *spm_cpu_addr;
> +};
> +
>  /*
>   * GPU scratch registers structures, functions & helpers
>   */
> @@ -256,6 +281,7 @@ struct amdgpu_gfx {
>  	struct amdgpu_me		me;
>  	struct amdgpu_mec		mec;
>  	struct amdgpu_kiq		kiq;
> +	struct amdgpu_spm		spm;
>  	struct amdgpu_scratch		scratch;
>  	const struct firmware		*me_fw;	/* ME firmware */
>  	uint32_t			me_fw_version;
> @@ -292,6 +318,7 @@ struct amdgpu_gfx {
>  	struct amdgpu_irq_src		priv_reg_irq;
>  	struct amdgpu_irq_src		priv_inst_irq;
>  	struct amdgpu_irq_src		cp_ecc_error_irq;
> +	struct amdgpu_irq_src		spm_irq;
>  	struct amdgpu_irq_src		sq_irq;
>  	struct sq_work			sq_work;
>  
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index 63ac430..9c507d2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -4121,6 +4121,13 @@ static int gfx_v10_0_sw_init(void *handle)
>  	if (r)
>  		return r;
>  
> +	/* KIQ SPM */
> +	r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_RLC,
> +			      GFX_10_1__SRCID__RLC_STRM_PERF_MONITOR_INTERRUPT,
> +			      &adev->gfx.spm_irq);
> +	if (r)
> +		return r;
> +
>  	/* EOP Event */
>  	r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_GRBM_CP,
>  			      GFX_10_1__SRCID__CP_EOP_INTERRUPT,
> @@ -6603,6 +6610,7 @@ static int gfx_v10_0_hw_fini(void *handle)
>  
>  	amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0);
>  	amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0);
> +	amdgpu_irq_put(adev, &adev->gfx.spm_irq, 0);
>  #ifndef BRING_UP_DEBUG
>  	if (amdgpu_async_gfx_ring) {
>  		r = gfx_v10_0_kiq_disable_kgq(adev);
> @@ -6768,6 +6776,126 @@ static void gfx_v10_0_ring_emit_gds_switch(struct amdgpu_ring *ring,
>  				    (1 << (oa_size + oa_base)) - (1 << oa_base));
>  }
>  
> +static void gfx_v10_0_spm_start(struct amdgpu_device *adev)
> +{
> +	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
> +	uint32_t data = 0;
> +	u8 se;
> +
> +	for (se = 0; se < adev->gfx.config.max_shader_engines + 1; ++se)
> +	{
> +		uint32_t mux_addr_reg = SOC15_REG_OFFSET(GC, 0, mmRLC_SPM_GLOBAL_MUXSEL_ADDR);
> +		data = REG_SET_FIELD(data, GRBM_GFX_INDEX,
> +			INSTANCE_BROADCAST_WRITES, 1);
> +		data = REG_SET_FIELD(data, GRBM_GFX_INDEX,
> +			SA_BROADCAST_WRITES, 1);
> +		data = REG_SET_FIELD(data, GRBM_GFX_INDEX,
> +			SE_BROADCAST_WRITES, 1);
> +
> +		if (se < adev->gfx.config.max_shader_engines) // SE else GB
> +		{
> +			data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SE_INDEX, se);
> +			mux_addr_reg = SOC15_REG_OFFSET(GC, 0, mmRLC_SPM_SE_MUXSEL_ADDR);
> +		}
> +		gfx_v10_0_write_data_to_reg(kiq_ring, 0, false, SOC15_REG_OFFSET(GC, 0, mmGRBM_GFX_INDEX), data);
> +		gfx_v10_0_write_data_to_reg(kiq_ring, 0, false, mux_addr_reg, 0);
> +	}
> +
> +	data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SE_INDEX, 0);
> +	data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SE_BROADCAST_WRITES, 1);
> +	gfx_v10_0_write_data_to_reg(kiq_ring, 0, false, SOC15_REG_OFFSET(GC, 0, mmGRBM_GFX_INDEX), data);
> +
> +	data = 0;
> +	data |= CP_PERFMON_STATE_DISABLE_AND_RESET <<4 ;
> +	gfx_v10_0_write_data_to_reg(kiq_ring, 0, false, SOC15_REG_OFFSET(GC, 0, mmCP_PERFMON_CNTL), data);
> +
> +	data = 0;
> +	data |= STRM_PERFMON_STATE_START_COUNTING <<4 ;
> +	gfx_v10_0_write_data_to_reg(kiq_ring, 0, false, SOC15_REG_OFFSET(GC, 0, mmCP_PERFMON_CNTL), data);
> +
> +	gfx_v10_0_write_data_to_reg(kiq_ring, 0, false, SOC15_REG_OFFSET(GC, 0, mmRLC_SPM_INT_CNTL) ,1);
> +}
> +
> +static void gfx_v10_0_spm_stop(struct amdgpu_device *adev)
> +{
> +	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
> +	uint32_t data = 0;
> +
> +	data = CP_PERFMON_STATE_STOP_COUNTING;
> +	gfx_v10_0_write_data_to_reg(kiq_ring, 0, false, SOC15_REG_OFFSET(GC, 0, mmCP_PERFMON_CNTL), data);
> +
> +	data |= (1<<10);
> +	gfx_v10_0_write_data_to_reg(kiq_ring, 0, false, SOC15_REG_OFFSET(GC, 0, mmCP_PERFMON_CNTL), data);
> +
> +	gfx_v10_0_write_data_to_reg(kiq_ring, 0, false, SOC15_REG_OFFSET(GC, 0, mmRLC_SPM_INT_CNTL), 0);
> +}
> +
> +static u32 gfx_v10_0_spm_get_rdptr(struct amdgpu_device *adev)
> +{
> +	return RREG32_SOC15(GC, 0, mmRLC_SPM_RING_RDPTR);;
> +}
> +
> +static void gfx_v10_0_spm_set_rdptr(struct amdgpu_device *adev,  u32 rptr)
> +{
> +	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
> +
> +	gfx_v10_0_write_data_to_reg(kiq_ring, 0, false, SOC15_REG_OFFSET(GC, 0, mmRLC_SPM_RING_RDPTR), rptr);
> +}
> +
> +static u32 gfx_v10_0_spm_get_wrptr(struct amdgpu_device *adev)
> +{
> +	return RREG32_SOC15(GC, 0, mmRLC_SPM_RING_WRPTR);
> +}
> +
> +static void gfx_v10_0_spm_set_wrptr(struct amdgpu_device *adev,  u32 wptr)
> +{
> +	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
> +
> +	gfx_v10_0_write_data_to_reg(kiq_ring, 0, false, SOC15_REG_OFFSET(GC, 0, mmRLC_SPM_RING_WRPTR), wptr);
> +}
> +
> +static void gfx_v10_0_set_spm_porfmon_ring_buf(struct amdgpu_device *adev, u64 gpu_addr, u32 size)
> +{
> +	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
> +
> +	gfx_v10_0_write_data_to_reg(kiq_ring, 0, false, SOC15_REG_OFFSET(GC, 0,
> +			mmRLC_SPM_PERFMON_RING_BASE_LO), lower_32_bits(gpu_addr));
> +
> +	gfx_v10_0_write_data_to_reg(kiq_ring, 0, false,
> +			SOC15_REG_OFFSET(GC, 0, mmRLC_SPM_PERFMON_RING_BASE_HI), upper_32_bits (gpu_addr));
> +
> +	gfx_v10_0_write_data_to_reg(kiq_ring, 0, false,
> +			SOC15_REG_OFFSET(GC, 0, mmRLC_SPM_PERFMON_RING_SIZE), size);
> +}
> +
> +static void gfx_v10_0_spm_set_reg(struct amdgpu_device *adev, u64 reg, u32 value)
> +{
> +	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
> +	uint32_t grbm_cntl;
> +	grbm_cntl = adev->reg_offset[GC_HWIP][0][1] + reg;
> +
> +	gfx_v10_0_write_data_to_reg(kiq_ring, 0, false, grbm_cntl, value);
> +}
> +
> +static const struct spm_funcs gfx_v10_0_spm_funcs = {
> +	.start = gfx_v10_0_spm_start,
> +	.stop = gfx_v10_0_spm_stop,
> +	.get_rdptr= gfx_v10_0_spm_get_rdptr,
> +	.set_rdptr= gfx_v10_0_spm_set_rdptr,
> +	.get_wrptr= gfx_v10_0_spm_get_wrptr,
> +	.set_wrptr= gfx_v10_0_spm_set_wrptr,
> +	.set_spm_porfmon_ring_buf = gfx_v10_0_set_spm_porfmon_ring_buf,
> +	.set_reg = gfx_v10_0_spm_set_reg,
> +	.set_spm_config_size = 3,
> +	.get_spm_data_size = 128,
> +};
> +
> +static void gfx_v10_0_set_spm_funcs(struct amdgpu_device *adev)
> +{
> +	adev->gfx.spm.spmf = &gfx_v10_0_spm_funcs;
> +}
> +
> +
>  static int gfx_v10_0_early_init(void *handle)
>  {
>  	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> @@ -6776,6 +6904,7 @@ static int gfx_v10_0_early_init(void *handle)
>  
>  	adev->gfx.num_compute_rings = AMDGPU_MAX_COMPUTE_RINGS;
>  
> +	gfx_v10_0_set_spm_funcs(adev);
>  	gfx_v10_0_set_kiq_pm4_funcs(adev);
>  	gfx_v10_0_set_ring_funcs(adev);
>  	gfx_v10_0_set_irq_funcs(adev);
> @@ -6794,6 +6923,10 @@ static int gfx_v10_0_late_init(void *handle)
>  	if (r)
>  		return r;
>  
> +        r = amdgpu_irq_get(adev, &adev->gfx.spm_irq, 0);
> +        if (r)
> +		return r;
> +
>  	r = amdgpu_irq_get(adev, &adev->gfx.priv_inst_irq, 0);
>  	if (r)
>  		return r;
> @@ -6860,6 +6993,7 @@ static void gfx_v10_0_update_medium_grain_clock_gating(struct amdgpu_device *ade
>  		if (def != data)
>  			WREG32_SOC15(GC, 0, mmRLC_CGTT_MGCG_OVERRIDE, data);
>  
> +
>  		/* MGLS is a global flag to control all MGLS in GFX */
>  		if (adev->cg_flags & AMD_CG_SUPPORT_GFX_MGLS) {
>  			/* 2 - RLC memory Light sleep */
> @@ -8018,6 +8152,39 @@ static void gfx_v10_0_emit_mem_sync(struct amdgpu_ring *ring)
>  	amdgpu_ring_write(ring, gcr_cntl); /* GCR_CNTL */
>  }
>  
> +static int gfx_v10_0_spm_set_interrupt_state(struct amdgpu_device *adev,
> +					     struct amdgpu_irq_src *src,
> +					     unsigned int type,
> +					     enum amdgpu_interrupt_state state)
> +{
> +	switch (state) {
> +	case AMDGPU_IRQ_STATE_DISABLE:
> +		WREG32_SOC15(GC, 0, mmRLC_SPM_INT_CNTL, 0);
> +		break;
> +	case AMDGPU_IRQ_STATE_ENABLE:
> +		WREG32_SOC15(GC, 0, mmRLC_SPM_INT_CNTL, 1);
> +		break;
> +	default:
> +		break;
> +	}
> +	return 0;
> +}
> +
> +static int gfx_v10_0_spm_irq(struct amdgpu_device *adev,
> +			     struct amdgpu_irq_src *source,
> +			     struct amdgpu_iv_entry *entry)
> +{
> +	u8 me_id, pipe_id, queue_id;
> +
> +	me_id = (entry->ring_id & 0x0c) >> 2;
> +	pipe_id = (entry->ring_id & 0x03) >> 0;
> +	queue_id = (entry->ring_id & 0x70) >> 4;
> +	pr_debug ("IH: RLC_RPM_INT, me:%d, pipe:%d, queue:%d\n",
> +                                         me_id, pipe_id, queue_id);
> +	return  0; /* This prevents sending it to KFD */
> +}
> +
> +
>  static const struct amd_ip_funcs gfx_v10_0_ip_funcs = {
>  	.name = "gfx_v10_0",
>  	.early_init = gfx_v10_0_early_init,
> @@ -8189,6 +8356,11 @@ static const struct amdgpu_irq_src_funcs gfx_v10_0_kiq_irq_funcs = {
>  	.process = gfx_v10_0_kiq_irq,
>  };
>  
> +static const struct amdgpu_irq_src_funcs gfx_v10_0_spm_irq_funcs = {
> +    .set = gfx_v10_0_spm_set_interrupt_state,
> +    .process = gfx_v10_0_spm_irq,
> +};
> +
>  static void gfx_v10_0_set_irq_funcs(struct amdgpu_device *adev)
>  {
>  	adev->gfx.eop_irq.num_types = AMDGPU_CP_IRQ_LAST;
> @@ -8197,6 +8369,9 @@ static void gfx_v10_0_set_irq_funcs(struct amdgpu_device *adev)
>  	adev->gfx.kiq.irq.num_types = AMDGPU_CP_KIQ_IRQ_LAST;
>  	adev->gfx.kiq.irq.funcs = &gfx_v10_0_kiq_irq_funcs;
>  
> +	adev->gfx.spm_irq.num_types = 1;
> +	adev->gfx.spm_irq.funcs = &gfx_v10_0_spm_irq_funcs;
> +
>  	adev->gfx.priv_reg_irq.num_types = 1;
>  	adev->gfx.priv_reg_irq.funcs = &gfx_v10_0_priv_reg_irq_funcs;
>  
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> index 14790f8..1125b91 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> @@ -1955,6 +1955,12 @@ static int gfx_v8_0_sw_init(void *handle)
>  	adev->gfx.mec.num_pipe_per_mec = 4;
>  	adev->gfx.mec.num_queue_per_pipe = 8;
>  
> +	/* KIQ SPM */
> +	r = amdgpu_irq_add_id(adev, AMDGPU_IRQ_CLIENTID_LEGACY,	VISLANDS30_IV_SRCID_RLC_STRM_PERF_MONITOR,
> +			      &adev->gfx.spm_irq);
> +	if (r)
> +        return r;
> +
>  	/* EOP Event */
>  	r = amdgpu_irq_add_id(adev, AMDGPU_IRQ_CLIENTID_LEGACY, VISLANDS30_IV_SRCID_CP_END_OF_PIPE, &adev->gfx.eop_irq);
>  	if (r)
> @@ -4927,6 +4933,7 @@ static int gfx_v8_0_hw_fini(void *handle)
>  	amdgpu_irq_put(adev, &adev->gfx.cp_ecc_error_irq, 0);
>  
>  	amdgpu_irq_put(adev, &adev->gfx.sq_irq, 0);
> +	amdgpu_irq_put(adev, &adev->gfx.spm_irq, 0);
>  
>  	/* disable KCQ to avoid CPC touch memory not valid anymore */
>  	gfx_v8_0_kcq_disable(adev);
> @@ -5291,6 +5298,126 @@ static const struct amdgpu_gfx_funcs gfx_v8_0_gfx_funcs = {
>  	.select_me_pipe_q = &gfx_v8_0_select_me_pipe_q
>  };
>  
> +static void gfx_v8_0_write_data_to_reg(struct amdgpu_ring *ring, int eng_sel,
> +				       bool wc, uint32_t reg, uint32_t val)
> +{
> +	amdgpu_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, 3));
> +	amdgpu_ring_write(ring, WRITE_DATA_ENGINE_SEL(eng_sel) |
> +				WRITE_DATA_DST_SEL(0) |
> +				(wc ? WR_CONFIRM : 0));
> +	amdgpu_ring_write(ring, reg);
> +	amdgpu_ring_write(ring, 0);
> +	amdgpu_ring_write(ring, val);
> +}
> +
> +static void gfx_v8_0_spm_start(struct amdgpu_device *adev)
> +{
> +	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
> +	uint32_t data = 0;
> +	u8 se;
> +
> +	for (se = 0; se < adev->gfx.config.max_shader_engines + 1; ++se)
> +	{
> +		uint32_t mux_addr_reg = mmRLC_SPM_GLOBAL_MUXSEL_ADDR;
> +
> +		data = REG_SET_FIELD(data, GRBM_GFX_INDEX,
> +			INSTANCE_BROADCAST_WRITES, 1);
> +		data = REG_SET_FIELD(data, GRBM_GFX_INDEX,
> +			SH_BROADCAST_WRITES, 1);
> +		data = REG_SET_FIELD(data, GRBM_GFX_INDEX,
> +			SE_BROADCAST_WRITES, 1);
> +
> +		if (se < adev->gfx.config.max_shader_engines) // SE else GB
> +		{
> +			data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SE_INDEX, se);
> +
> +			mux_addr_reg = mmRLC_SPM_SE_MUXSEL_ADDR;
> +		}
> +		gfx_v8_0_write_data_to_reg(kiq_ring, 0, false, mmGRBM_GFX_INDEX, data);
> +		// init addr
> +		gfx_v8_0_write_data_to_reg(kiq_ring, 0, false, mux_addr_reg, data);
> +	}
> +
> +	data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SE_INDEX, 0);
> +	data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SE_BROADCAST_WRITES, 1);
> +	gfx_v8_0_write_data_to_reg(kiq_ring, 0, false, mmGRBM_GFX_INDEX, data);
> +
> +	data = 0;
> +	data |= CP_PERFMON_STATE_DISABLE_AND_RESET <<4 ;
> +	gfx_v8_0_write_data_to_reg(kiq_ring, 0, false, mmCP_PERFMON_CNTL, data);
> +
> +	data = 0;
> +	data |= STRM_PERFMON_STATE_START_COUNTING <<4 ;
> +	gfx_v8_0_write_data_to_reg(kiq_ring, 0, false, mmCP_PERFMON_CNTL,	data);
> +
> +	gfx_v8_0_write_data_to_reg(kiq_ring, 0, false, mmRLC_SPM_INT_CNTL ,1);
> +}
> +
> +static void gfx_v8_0_spm_stop(struct amdgpu_device *adev)
> +{
> +	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
> +	uint32_t data = 0;
> +
> +	data = CP_PERFMON_STATE_STOP_COUNTING;
> +	gfx_v8_0_write_data_to_reg(kiq_ring, 0, false, mmCP_PERFMON_CNTL, data);
> +	data |= (1<<10);
> +	gfx_v8_0_write_data_to_reg(kiq_ring, 0, false, mmCP_PERFMON_CNTL, data);
> +
> +	gfx_v8_0_write_data_to_reg(kiq_ring, 0, false, mmRLC_SPM_INT_CNTL, 0);
> +}
> +
> +
> +static u32 gfx_v8_0_spm_get_rdptr(struct amdgpu_device *adev)
> +{
> +	return RREG32 (mmRLC_SPM_RING_RDPTR);
> +}
> +
> +static void gfx_v8_0_spm_set_rdptr(struct amdgpu_device *adev,  u32 rptr)
> +{
> +	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
> +
> +	gfx_v8_0_write_data_to_reg(kiq_ring, 0, false, mmRLC_SPM_RING_RDPTR, rptr);
> +}
> +
> +static u32 gfx_v8_0_spm_get_wrptr(struct amdgpu_device *adev)
> +{
> +	return  -1;
> +}
> +
> +static void gfx_v8_0_set_spm_porfmon_ring_buf(struct amdgpu_device *adev, u64 gpu_addr, u32 size)
> +{
> +	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
> +
> +	gfx_v8_0_write_data_to_reg(kiq_ring, 0, false, mmRLC_SPM_PERFMON_RING_BASE_LO, lower_32_bits(gpu_addr));
> +
> +	gfx_v8_0_write_data_to_reg(kiq_ring, 0, false, mmRLC_SPM_PERFMON_RING_BASE_HI, upper_32_bits (gpu_addr));
> +
> +	gfx_v8_0_write_data_to_reg(kiq_ring, 0, false, mmRLC_SPM_PERFMON_RING_SIZE, size);
> +}
> +
> +static void gfx_v8_0_spm_set_reg(struct amdgpu_device *adev, u64 reg, u32 value)
> +{
> +	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
> +	gfx_v8_0_write_data_to_reg(kiq_ring, 0, false, reg, value);
> +}
> +
> +static const struct spm_funcs gfx_v8_0_spm_funcs = {
> +	.start = gfx_v8_0_spm_start,
> +	.stop = gfx_v8_0_spm_stop,
> +	.get_rdptr= gfx_v8_0_spm_get_rdptr,
> +	.set_rdptr= gfx_v8_0_spm_set_rdptr,
> +	.get_wrptr= gfx_v8_0_spm_get_wrptr,
> +	.set_spm_porfmon_ring_buf = gfx_v8_0_set_spm_porfmon_ring_buf,
> +	.set_reg = gfx_v8_0_spm_set_reg,
> +	.set_spm_config_size = 3,
> +	.get_spm_data_size = 128,
> +};
> +
> +static void gfx_v8_0_set_spm_funcs(struct amdgpu_device *adev)
> +{
> +	adev->gfx.spm.spmf = &gfx_v8_0_spm_funcs;
> +}
> +
>  static int gfx_v8_0_early_init(void *handle)
>  {
>  	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> @@ -5298,6 +5425,8 @@ static int gfx_v8_0_early_init(void *handle)
>  	adev->gfx.num_gfx_rings = GFX8_NUM_GFX_RINGS;
>  	adev->gfx.num_compute_rings = AMDGPU_MAX_COMPUTE_RINGS;
>  	adev->gfx.funcs = &gfx_v8_0_gfx_funcs;
> +
> +	gfx_v8_0_set_spm_funcs(adev);
>  	gfx_v8_0_set_ring_funcs(adev);
>  	gfx_v8_0_set_irq_funcs(adev);
>  	gfx_v8_0_set_gds_init(adev);
> @@ -5338,6 +5467,10 @@ static int gfx_v8_0_late_init(void *handle)
>  		return r;
>  	}
>  
> +	r = amdgpu_irq_get(adev, &adev->gfx.spm_irq, 0);
> +	if (r)
> +		return r;
> +
>  	return 0;
>  }
>  
> @@ -6845,6 +6978,41 @@ static void gfx_v8_0_emit_mem_sync_compute(struct amdgpu_ring *ring)
>  	amdgpu_ring_write(ring, 0x0000000A);	/* poll interval */
>  }
>  
> +static int gfx_v8_0_spm_set_interrupt_state(struct amdgpu_device *adev,
> +					     struct amdgpu_irq_src *src,
> +					     unsigned int type,
> +					     enum amdgpu_interrupt_state state)
> +{
> +	switch (state) {
> +	case AMDGPU_IRQ_STATE_DISABLE:
> +		WREG32(mmRLC_SPM_INT_CNTL, 0);
> +		break;
> +	case AMDGPU_IRQ_STATE_ENABLE:
> +		WREG32(mmRLC_SPM_INT_CNTL, 1);
> +		break;
> +	default:
> +		break;
> +	}
> +	return 0;
> +}
> +
> +static int gfx_v8_0_spm_irq(struct amdgpu_device *adev,
> +			     struct amdgpu_irq_src *source,
> +			     struct amdgpu_iv_entry *entry)
> +{
> +	u8 me_id, pipe_id, queue_id;
> +	struct amdgpu_ring *ring = &(adev->gfx.kiq.ring);
> +
> +	me_id = (entry->ring_id & 0x0c) >> 2;
> +	pipe_id = (entry->ring_id & 0x03) >> 0;
> +	queue_id = (entry->ring_id & 0x70) >> 4;
> +	pr_debug("IH: RLC_RPM_INT, me:%d, pipe:%d, queue:%d\n",
> +			me_id, pipe_id, queue_id);
> +
> +	amdgpu_fence_process(ring);
> +	return 0;
> +}
> +
>  static const struct amd_ip_funcs gfx_v8_0_ip_funcs = {
>  	.name = "gfx_v8_0",
>  	.early_init = gfx_v8_0_early_init,
> @@ -7005,11 +7173,19 @@ static const struct amdgpu_irq_src_funcs gfx_v8_0_sq_irq_funcs = {
>  	.process = gfx_v8_0_sq_irq,
>  };
>  
> +static const struct amdgpu_irq_src_funcs gfx_v8_0_spm_irq_funcs = {
> +	.set = gfx_v8_0_spm_set_interrupt_state,
> +	.process = gfx_v8_0_spm_irq,
> +};
> +
>  static void gfx_v8_0_set_irq_funcs(struct amdgpu_device *adev)
>  {
>  	adev->gfx.eop_irq.num_types = AMDGPU_CP_IRQ_LAST;
>  	adev->gfx.eop_irq.funcs = &gfx_v8_0_eop_irq_funcs;
>  
> +	adev->gfx.spm_irq.num_types = 1;
> +	adev->gfx.spm_irq.funcs = &gfx_v8_0_spm_irq_funcs;
> +
>  	adev->gfx.priv_reg_irq.num_types = 1;
>  	adev->gfx.priv_reg_irq.funcs = &gfx_v8_0_priv_reg_irq_funcs;
>  
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 2767c6d..bfde274 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -2249,6 +2249,13 @@ static int gfx_v9_0_sw_init(void *handle)
>  	adev->gfx.mec.num_pipe_per_mec = 4;
>  	adev->gfx.mec.num_queue_per_pipe = 8;
>  
> +	/* KIQ SPM */
> +	r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_RLC,
> +			      GFX_9_0__SRCID__RLC_STRM_PERF_MONITOR_INTERRUPT,
> +			      &adev->gfx.spm_irq);
> +	if (r)
> +        return r;
> +
>  	/* EOP Event */
>  	r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_GRBM_CP, GFX_9_0__SRCID__CP_EOP_INTERRUPT, &adev->gfx.eop_irq);
>  	if (r)
> @@ -3907,6 +3914,7 @@ static int gfx_v9_0_hw_fini(void *handle)
>  
>  	amdgpu_irq_put(adev, &adev->gfx.cp_ecc_error_irq, 0);
>  	amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0);
> +	amdgpu_irq_put(adev, &adev->gfx.spm_irq, 0);
>  	amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0);
>  
>  	/* DF freeze and kcq disable will fail */
> @@ -4617,6 +4625,121 @@ static int gfx_v9_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
>  	return r;
>  }
>  
> +static void gfx_v9_0_spm_start(struct amdgpu_device *adev)
> +{
> +	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
> +	uint32_t data = 0;
> +	u8 se;
> +
> +	for (se = 0; se < adev->gfx.config.max_shader_engines + 1; ++se)
> +	{
> +		uint32_t mux_addr_reg = SOC15_REG_OFFSET(GC, 0, mmRLC_SPM_GLOBAL_MUXSEL_ADDR);
> +
> +		data = REG_SET_FIELD(data, GRBM_GFX_INDEX,
> +			INSTANCE_BROADCAST_WRITES, 1);
> +		data = REG_SET_FIELD(data, GRBM_GFX_INDEX,
> +			SH_BROADCAST_WRITES, 1);
> +		data = REG_SET_FIELD(data, GRBM_GFX_INDEX,
> +			SE_BROADCAST_WRITES, 1);
> +
> +		if (se < adev->gfx.config.max_shader_engines) // SE else GB
> +		{
> +			data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SE_INDEX, se);
> +
> +			mux_addr_reg = SOC15_REG_OFFSET(GC, 0, mmRLC_SPM_SE_MUXSEL_ADDR);
> +		}
> +		gfx_v9_0_write_data_to_reg(kiq_ring, 0, false, SOC15_REG_OFFSET(GC, 0, mmGRBM_GFX_INDEX), data);
> +		// init addr
> +		gfx_v9_0_write_data_to_reg(kiq_ring, 0, false, mux_addr_reg, data);
> +	}
> +
> +	data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SE_INDEX, 0);
> +	data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SE_BROADCAST_WRITES, 1);
> +	gfx_v9_0_write_data_to_reg(kiq_ring, 0, false, SOC15_REG_OFFSET(GC, 0, mmGRBM_GFX_INDEX), data);
> +
> +	data = 0;
> +	data |= CP_PERFMON_STATE_DISABLE_AND_RESET <<4 ;
> +	gfx_v9_0_write_data_to_reg(kiq_ring, 0, false, SOC15_REG_OFFSET(GC, 0, mmCP_PERFMON_CNTL), data);
> +
> +	data = 0;
> +	data |= STRM_PERFMON_STATE_START_COUNTING <<4 ;
> +	gfx_v9_0_write_data_to_reg(kiq_ring, 0, false, SOC15_REG_OFFSET(GC, 0, mmCP_PERFMON_CNTL), data);
> +
> +	gfx_v9_0_write_data_to_reg(kiq_ring, 0, false, SOC15_REG_OFFSET(GC, 0, mmRLC_SPM_INT_CNTL) ,1);
> +}
> +
> +
> +static void gfx_v9_0_spm_stop(struct amdgpu_device *adev)
> +{
> +	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
> +	uint32_t data = 0;
> +
> +
> +	data = CP_PERFMON_STATE_STOP_COUNTING;
> +	gfx_v9_0_write_data_to_reg(kiq_ring, 0, false, SOC15_REG_OFFSET(GC, 0, mmCP_PERFMON_CNTL), data);
> +	data |= (1<<10);
> +	gfx_v9_0_write_data_to_reg(kiq_ring, 0, false, SOC15_REG_OFFSET(GC, 0, mmCP_PERFMON_CNTL), data);
> +
> +	gfx_v9_0_write_data_to_reg(kiq_ring, 0, false, SOC15_REG_OFFSET(GC, 0, mmRLC_SPM_INT_CNTL), 0);
> +}
> +
> +static u32 gfx_v9_0_spm_get_rdptr(struct amdgpu_device *adev)
> +{
> +	return RREG32_SOC15(GC, 0, mmRLC_SPM_RING_RDPTR);;
> +}
> +
> +static void gfx_v9_0_spm_set_rdptr(struct amdgpu_device *adev,  u32 rptr)
> +{
> +	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
> +
> +	gfx_v9_0_write_data_to_reg(kiq_ring, 0, false, SOC15_REG_OFFSET(GC, 0, mmRLC_SPM_RING_RDPTR), rptr);
> +}
> +
> +static u32 gfx_v9_0_spm_get_wrptr(struct amdgpu_device *adev)
> +{
> +	return -1;
> +}
> +
> +static void gfx_v9_0_set_spm_porfmon_ring_buf(struct amdgpu_device *adev, u64 gpu_addr, u32 size)
> +{
> +	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
> +
> +	gfx_v9_0_write_data_to_reg(kiq_ring, 0, false, SOC15_REG_OFFSET(GC, 0,
> +                               mmRLC_SPM_PERFMON_RING_BASE_LO), lower_32_bits(gpu_addr));
> +
> +	gfx_v9_0_write_data_to_reg(kiq_ring, 0, false,
> +                               SOC15_REG_OFFSET(GC, 0, mmRLC_SPM_PERFMON_RING_BASE_HI), upper_32_bits (gpu_addr));
> +
> +	gfx_v9_0_write_data_to_reg(kiq_ring, 0, false,
> +                               SOC15_REG_OFFSET(GC, 0, mmRLC_SPM_PERFMON_RING_SIZE), size);
> +}
> +
> +static void gfx_v9_0_spm_set_reg(struct amdgpu_device *adev, u64 reg, u32 value)
> +{
> +	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
> +	uint32_t grbm_cntl;
> +	grbm_cntl = adev->reg_offset[GC_HWIP][0][1] + reg;
> +
> +	gfx_v9_0_write_data_to_reg(kiq_ring, 0, false, grbm_cntl, value);
> +}
> +
> +static const struct spm_funcs gfx_v9_0_spm_funcs = {
> +	.start = gfx_v9_0_spm_start,
> +	.stop = gfx_v9_0_spm_stop,
> +	.get_rdptr= gfx_v9_0_spm_get_rdptr,
> +	.set_rdptr= gfx_v9_0_spm_set_rdptr,
> +	.get_wrptr= gfx_v9_0_spm_get_wrptr,
> +	.set_spm_porfmon_ring_buf = gfx_v9_0_set_spm_porfmon_ring_buf,
> +	.set_reg = gfx_v9_0_spm_set_reg,
> +	.set_spm_config_size = 3,
> +	.get_spm_data_size = 128,
> +};
> +
> +static void gfx_v9_0_set_spm_funcs(struct amdgpu_device *adev)
> +{
> +	adev->gfx.spm.spmf = &gfx_v9_0_spm_funcs;
> +}
> +
>  static int gfx_v9_0_early_init(void *handle)
>  {
>  	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> @@ -4626,6 +4749,7 @@ static int gfx_v9_0_early_init(void *handle)
>  	else
>  		adev->gfx.num_gfx_rings = GFX9_NUM_GFX_RINGS;
>  	adev->gfx.num_compute_rings = AMDGPU_MAX_COMPUTE_RINGS;
> +	gfx_v9_0_set_spm_funcs(adev);
>  	gfx_v9_0_set_kiq_pm4_funcs(adev);
>  	gfx_v9_0_set_ring_funcs(adev);
>  	gfx_v9_0_set_irq_funcs(adev);
> @@ -4677,6 +4801,10 @@ static int gfx_v9_0_late_init(void *handle)
>  	if (r)
>  		return r;
>  
> +	r = amdgpu_irq_get(adev, &adev->gfx.spm_irq, 0);
> +	if (r)
> +		return r;
> +
>  	r = amdgpu_irq_get(adev, &adev->gfx.priv_inst_irq, 0);
>  	if (r)
>  		return r;
> @@ -6657,6 +6785,39 @@ static void gfx_v9_0_emit_mem_sync(struct amdgpu_ring *ring)
>  	amdgpu_ring_write(ring, 0x0000000A); /* POLL_INTERVAL */
>  }
>  
> +static int gfx_v9_0_spm_set_interrupt_state(struct amdgpu_device *adev,
> +					     struct amdgpu_irq_src *src,
> +					     unsigned int type,
> +					     enum amdgpu_interrupt_state state)
> +{
> +	switch (state) {
> +	case AMDGPU_IRQ_STATE_DISABLE:
> +		WREG32(mmRLC_SPM_INT_CNTL, 0);
> +		break;
> +	case AMDGPU_IRQ_STATE_ENABLE:
> +		WREG32(mmRLC_SPM_INT_CNTL, 1);
> +		break;
> +	default:
> +		break;
> +	}
> +	return 0;
> +}
> +
> +static int gfx_v9_0_spm_irq(struct amdgpu_device *adev,
> +			     struct amdgpu_irq_src *source,
> +			     struct amdgpu_iv_entry *entry)
> +{
> +	u8 me_id, pipe_id, queue_id;
> +
> +	me_id = (entry->ring_id & 0x0c) >> 2;
> +	pipe_id = (entry->ring_id & 0x03) >> 0;
> +	queue_id = (entry->ring_id & 0x70) >> 4;
> +	pr_debug("IH: RLC_RPM_INT, me:%d, pipe:%d, queue:%d\n",
> +			me_id, pipe_id, queue_id);
> +
> +	return 0; /* This also prevents sending it to KFD */
> +}
> +
>  static const struct amd_ip_funcs gfx_v9_0_ip_funcs = {
>  	.name = "gfx_v9_0",
>  	.early_init = gfx_v9_0_early_init,
> @@ -6825,12 +6986,19 @@ static const struct amdgpu_irq_src_funcs gfx_v9_0_cp_ecc_error_irq_funcs = {
>  	.process = amdgpu_gfx_cp_ecc_error_irq,
>  };
>  
> +static const struct amdgpu_irq_src_funcs gfx_v9_0_spm_irq_funcs = {
> +	.set = gfx_v9_0_spm_set_interrupt_state,
> +	.process = gfx_v9_0_spm_irq,
> +};
>  
>  static void gfx_v9_0_set_irq_funcs(struct amdgpu_device *adev)
>  {
>  	adev->gfx.eop_irq.num_types = AMDGPU_CP_IRQ_LAST;
>  	adev->gfx.eop_irq.funcs = &gfx_v9_0_eop_irq_funcs;
>  
> +	adev->gfx.spm_irq.num_types = 1;
> +	adev->gfx.spm_irq.funcs = &gfx_v9_0_spm_irq_funcs;
> +
>  	adev->gfx.priv_reg_irq.num_types = 1;
>  	adev->gfx.priv_reg_irq.funcs = &gfx_v9_0_priv_reg_irq_funcs;
>  
> diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile
> index 6147462..43edba0 100644
> --- a/drivers/gpu/drm/amd/amdkfd/Makefile
> +++ b/drivers/gpu/drm/amd/amdkfd/Makefile
> @@ -53,7 +53,8 @@ AMDKFD_FILES	:= $(AMDKFD_PATH)/kfd_module.o \
>  		$(AMDKFD_PATH)/kfd_int_process_v9.o \
>  		$(AMDKFD_PATH)/kfd_dbgdev.o \
>  		$(AMDKFD_PATH)/kfd_dbgmgr.o \
> -		$(AMDKFD_PATH)/kfd_crat.o
> +		$(AMDKFD_PATH)/kfd_crat.o	\
> +		$(AMDKFD_PATH)/kfd_spm.o
>  
>  ifneq ($(CONFIG_AMD_IOMMU_V2),)
>  AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o
> diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
> index 9f59ba9..cd394cd 100644
> --- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
> +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
> @@ -24,6 +24,7 @@
>  #include "kfd_events.h"
>  #include "cik_int.h"
>  #include "amdgpu_amdkfd.h"
> +#include "ivsrcid/ivsrcid_vislands30.h"
>  
>  static bool cik_event_interrupt_isr(struct kfd_dev *dev,
>  					const uint32_t *ih_ring_entry,
> @@ -37,6 +38,11 @@ static bool cik_event_interrupt_isr(struct kfd_dev *dev,
>  	uint16_t pasid;
>  	bool ret;
>  
> +	vmid = f2g->read_vmid_from_vmfault_reg(dev->kgd);
> +
> +	if ((ihre->source_id == VISLANDS30_IV_SRCID_RLC_STRM_PERF_MONITOR) && (vmid == 0))
> +		 return (kfd_spm_interrupt_isr (dev, ihre->source_id, ihre->source_id));
> +
>  	/* This workaround is due to HW/FW limitation on Hawaii that
>  	 * VMID and PASID are not written into ih_ring_entry
>  	 */
> @@ -49,7 +55,6 @@ static bool cik_event_interrupt_isr(struct kfd_dev *dev,
>  		*patched_flag = true;
>  		*tmp_ihre = *ihre;
>  
> -		vmid = f2g->read_vmid_from_vmfault_reg(dev->kgd);
>  		ret = f2g->get_atc_vmid_pasid_mapping_info(dev->kgd, vmid, &pasid);
>  
>  		tmp_ihre->ring_id &= 0x000000ff;
> @@ -95,6 +100,9 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
>  	if (pasid == 0)
>  		return;
>  
> +	if ((ihre->source_id == VISLANDS30_IV_SRCID_RLC_STRM_PERF_MONITOR) && (vmid == 0))
> +		kfd_spm_interrupt_wq(dev, ihre->source_id);
> +
>  	if (ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE)
>  		kfd_signal_event_interrupt(pasid, context_id, 28);
>  	else if (ihre->source_id == CIK_INTSRC_SDMA_TRAP)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index f8fa03a..bfc83beb 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -1732,6 +1732,20 @@ static int kfd_ioctl_import_dmabuf(struct file *filep,
>  	return r;
>  }
>  
> +static int kfd_ioctl_rlc_spm(struct file *filep,
> +				   struct kfd_process *p, void *data)
> +{
> +	struct kfd_ioctl_spm_args *args = data;
> +	int err;
> +
> +	err = kfd_rlc_spm(p,
> +			(void __user *)args,
> +			 args->buf_size,
> +			 args->op);
> +
> +	return err;
> +}
> +
>  #define AMDKFD_IOCTL_DEF(ioctl, _func, _flags) \
>  	[_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, \
>  			    .cmd_drv = 0, .name = #ioctl}
> @@ -1827,6 +1841,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
>  
>  	AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS,
>  			kfd_ioctl_alloc_queue_gws, 0),
> +
> +	AMDKFD_IOCTL_DEF(AMDKFD_IOC_RLC_SPM,
> +			kfd_ioctl_rlc_spm, 0),
>  };
>  
>  #define AMDKFD_CORE_IOCTL_COUNT	ARRAY_SIZE(amdkfd_ioctls)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> index e05d75e..481f0ae 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> @@ -24,6 +24,7 @@
>  #include "kfd_events.h"
>  #include "soc15_int.h"
>  #include "kfd_device_queue_manager.h"
> +#include "ivsrcid/gfx/irqsrcs_gfx_9_0.h"
>  
>  static bool event_interrupt_isr_v9(struct kfd_dev *dev,
>  					const uint32_t *ih_ring_entry,
> @@ -35,12 +36,15 @@ static bool event_interrupt_isr_v9(struct kfd_dev *dev,
>  
>  	/* Only handle interrupts from KFD VMIDs */
>  	vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry);
> +	source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry);
> +	client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry);
> +	if ((source_id == GFX_9_0__SRCID__RLC_STRM_PERF_MONITOR_INTERRUPT) && (vmid == 0))
> +		 return (kfd_spm_interrupt_isr (dev, source_id, client_id));
> +
>  	if (vmid < dev->vm_info.first_vmid_kfd ||
>  	    vmid > dev->vm_info.last_vmid_kfd)
>  		return 0;
>  
> -	source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry);
> -	client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry);
>  	pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry);
>  
>  	/* This is a known issue for gfx9. Under non HWS, pasid is not set
> @@ -95,6 +99,9 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
>  	vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry);
>  	context_id = SOC15_CONTEXT_ID0_FROM_IH_ENTRY(ih_ring_entry);
>  
> +	if ((source_id == GFX_9_0__SRCID__RLC_STRM_PERF_MONITOR_INTERRUPT) && (vmid == 0))
> +		kfd_spm_interrupt_wq(dev, source_id);
> +
>  	if (source_id == SOC15_INTSRC_CP_END_OF_PIPE)
>  		kfd_signal_event_interrupt(pasid, context_id, 32);
>  	else if (source_id == SOC15_INTSRC_SDMA_TRAP)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 43b888b..707d672 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -309,6 +309,9 @@ struct kfd_dev {
>  
>  	/* Global GWS resource shared b/t processes*/
>  	void *gws;
> +
> +	/*spm process id */
> +	unsigned int spm_pasid;
>  };
>  
>  enum kfd_mempool {
> @@ -740,6 +743,13 @@ struct kfd_process {
>  	struct kobject *kobj;
>  	struct kobject *kobj_queues;
>  	struct attribute attr_pasid;
> +	/* spm data */
> +	struct kfd_spm_cntr *spm_cntr;
> +	bool is_spm_acquired;
> +	/* Work items for tranfer data to user */
> +	struct delayed_work copy_to_user_work;
> +	/* spm-related data */
> +	struct mutex spm_mutex;
>  };
>  
>  #define KFD_PROCESS_TABLE_SIZE 5 /* bits: 32 entries */
> @@ -1045,10 +1055,20 @@ int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p);
>  
>  bool kfd_is_locked(void);
>  
> +void kfd_spm_init_process(struct kfd_process *p);
> +int kfd_rlc_spm(struct kfd_process *p,  void __user *data,
> +		       uint32_t buf_size, __u32 op);
> +
>  /* Compute profile */
>  void kfd_inc_compute_active(struct kfd_dev *dev);
>  void kfd_dec_compute_active(struct kfd_dev *dev);
>  
> +/* spm interrupt */
> +bool kfd_spm_interrupt_isr(struct kfd_dev *dev,
> +					uint16_t source_id,	uint32_t client_id);
> +void kfd_spm_interrupt_wq(struct kfd_dev *dev,	uint16_t source_id);
> +
> +
>  /* Cgroup Support */
>  /* Check with device cgroup if @kfd device is accessible */
>  static inline int kfd_devcgroup_check_permission(struct kfd_dev *kfd)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index fe0cd49..338868d 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -746,6 +746,7 @@ static struct kfd_process *create_process(const struct task_struct *thread)
>  	INIT_DELAYED_WORK(&process->restore_work, restore_process_worker);
>  	process->last_restore_timestamp = get_jiffies_64();
>  	kfd_event_init_process(process);
> +	kfd_spm_init_process(process);
>  	process->is_32bit_user_mode = in_compat_syscall();
>  
>  	process->pasid = kfd_pasid_alloc();
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_spm.c b/drivers/gpu/drm/amd/amdkfd/kfd_spm.c
> new file mode 100644
> index 0000000..773e2ee
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_spm.c
> @@ -0,0 +1,577 @@
> +/*
> + * Copyright 2020 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + */
> +
> +#include <linux/device.h>
> +#include "kfd_priv.h"
> +#include "amdgpu_amdkfd.h"
> +#include "soc15_int.h"
> +#include "ivsrcid/gfx/irqsrcs_gfx_9_0.h"
> +#include "ivsrcid/ivsrcid_vislands30.h"
> +#include <linux/delay.h>
> +
> +#define SAMPLING_MAX 4
> +
> +struct user_buf {
> +        uint64_t __user *user_addr;
> +        u32 ubufsize;
> +
> +};
> +
> +struct kfd_spm_cntr {
> +        struct kgd_dev *kgd;
> +        u64    spm_gpu_addr;
> +        u32    spm_ring_buf_size;
> +        u32    spm_ring_buf_mark;
> +	    u32	   spm_ring_rptr;
> +        u32    spm_ring_wptr;
> +        u32    spm_ring_size_copied;
> +        void   *spm_obj;
> +        u32    *spm_cpu_addr;
> +        struct user_buf ubuf [SAMPLING_MAX];
> +        u32    buf_count;
> +        bool   is_uesr_buf_filled;
> +        bool   is_nowait;
> +        struct task_struct *thread;
> +        bool   thread_ready;
> +        char   *name;
> +        wait_queue_head_t spm_wq;
> +        int    wanted_cluster;
> +        bool   is_timeout;
> +};
> +
> +static int kfd_spm_data_cocy (struct kfd_spm_cntr * spm_cntr, u32 size_to_copy)
> +{
> +        u32 user_buf_space_left;
> +        int ret = 0;
> +        u32 bufSize;
> +        uint64_t __user * user_address;
> +        uint64_t * ring_buf;
> +        
> +        pr_debug("[%s]\n", __func__);
> +
> +        bufSize = spm_cntr->ubuf[0].ubufsize;
> +        user_address = (uint64_t*)((uint64_t)spm_cntr->ubuf[0].user_addr + spm_cntr->spm_ring_size_copied);
> +        ring_buf =  (uint64_t*)((uint64_t)spm_cntr->spm_cpu_addr + spm_cntr->spm_ring_rptr);
> +
> +        if (user_address == NULL)
> +	        return -EFAULT;
> +
> +        user_buf_space_left = bufSize - spm_cntr->spm_ring_size_copied;
> +
> +        if (size_to_copy <= user_buf_space_left) {
> +		ret = copy_to_user(user_address, ring_buf, size_to_copy);
> +	     	if (ret) {
> +	        	pr_err("copy_to_user failed, ret = %x\n", ret);
> +			return -EFAULT;
> +              	}
> +		spm_cntr->spm_ring_size_copied += size_to_copy;
> +	} else {
> +		size_to_copy = spm_cntr->spm_ring_buf_size - spm_cntr->spm_ring_size_copied;
> +		ret = copy_to_user(user_address, ring_buf, user_buf_space_left);
> +		if (ret)
> +			return -EFAULT;
> +
> +		spm_cntr->spm_ring_size_copied = bufSize;
> +		spm_cntr->is_uesr_buf_filled = true;
> +	}
> +
> +	return ret;
> +}
> +
> +static int kfd_spm_reag_ring_buf_polling (struct kfd_spm_cntr * spm_cntr, long timeout)
> +{
> +	u32 size_to_copy;
> +	int ret = 0;
> +
> +	pr_info("[%s]\n", __func__);
> +
> +	while (spm_cntr->is_uesr_buf_filled != true){
> +		spm_cntr->spm_ring_rptr = amdgpu_amdkfd_rlc_spm_get_rdptr(spm_cntr->kgd) & spm_cntr->spm_ring_buf_mark;
> +#if 1
> +		spm_cntr->spm_ring_wptr = amdgpu_amdkfd_rlc_spm_get_wrptr(spm_cntr->kgd) & spm_cntr->spm_ring_buf_mark;
> +#else
> +		spm_cntr->spm_ring_wptr = spm_cntr->spm_cpu_addr[0] & spm_cntr->spm_ring_buf_mark;
> +#endif
> +
> +		if ((spm_cntr->spm_ring_rptr >= 0) &&  (spm_cntr->spm_ring_rptr  < 0x20))
> +				spm_cntr->spm_ring_rptr = 0x20;
> +
> +		if (spm_cntr->is_uesr_buf_filled == true)
> +			goto exit;
> +
> +		if (spm_cntr->spm_ring_wptr > spm_cntr->spm_ring_rptr) {
> +			size_to_copy = spm_cntr->spm_ring_wptr - spm_cntr->spm_ring_rptr;
> +			ret = kfd_spm_data_cocy(spm_cntr, size_to_copy);
> +		} else if (spm_cntr->spm_ring_wptr < spm_cntr->spm_ring_rptr) {
> +			size_to_copy = spm_cntr->spm_ring_buf_size - spm_cntr->spm_ring_rptr;
> +			ret = kfd_spm_data_cocy(spm_cntr, size_to_copy);
> +
> +			/* correct counter start point */
> +			spm_cntr->spm_ring_rptr = 0x20;
> +			size_to_copy = spm_cntr->spm_ring_wptr;
> +			ret = kfd_spm_data_cocy(spm_cntr, size_to_copy);
> +		}
> +
> +		if (!ret) {
> +			if (spm_cntr->is_uesr_buf_filled == true) {
> +				/* stop */
> +				amdgpu_amdkfd_rlc_spm_cntl(spm_cntr->kgd, 0);
> +				amdgpu_amdkfd_rlc_spm_set_rdptr(spm_cntr->kgd, 0);
> +#if 0
> +				amdgpu_amdkfd_rlc_spm_set_wrptr(spm_cntr->kgd, 0);
> +#else
> +				spm_cntr->spm_cpu_addr[0]= 0;
> +#endif
> +				return ret;
> +			} else
> +				amdgpu_amdkfd_rlc_spm_set_rdptr(spm_cntr->kgd, spm_cntr->spm_ring_wptr);
> +
> +			if (spm_cntr->is_timeout == true) {
> +				/* stop */
> +				amdgpu_amdkfd_rlc_spm_cntl(spm_cntr->kgd, 0);
> +				amdgpu_amdkfd_rlc_spm_set_rdptr(spm_cntr->kgd, 0);
> +#if 0
> +				amdgpu_amdkfd_rlc_spm_set_wrptr(spm_cntr->kgd, 0);
> +#else
> +				spm_cntr->spm_cpu_addr[0]= 0;
> +#endif
> +				break;
> +			}
> +		}
> +	}
> +exit:
> +	return ret;
> +}
> +
> +static int kfd_spm_reag_ring_buf (struct kfd_spm_cntr * spm_cntr)
> +{
> +	u32 size_to_copy;
> +	int ret = 0;
> +
> +	pr_debug("[%s]\n", __func__);
> +
> +	spm_cntr->spm_ring_rptr = amdgpu_amdkfd_rlc_spm_get_rdptr(spm_cntr->kgd) & spm_cntr->spm_ring_buf_mark;
> +#if 1
> +	spm_cntr->spm_ring_wptr = amdgpu_amdkfd_rlc_spm_get_wrptr(spm_cntr->kgd) & spm_cntr->spm_ring_buf_mark;
> +#else
> +	spm_cntr->spm_ring_wptr = spm_cntr->spm_cpu_addr[0] & spm_cntr->spm_ring_buf_mark;
> +#endif
> +	if ((spm_cntr->spm_ring_rptr >= 0) &&  (spm_cntr->spm_ring_rptr  < 0x20)) {
> +		spm_cntr->spm_ring_rptr = 0x20;
> +	}
> +
> +	if (spm_cntr->is_uesr_buf_filled == true)
> +		goto exit;
> +
> +	if (spm_cntr->spm_ring_wptr > spm_cntr->spm_ring_rptr) {
> +		size_to_copy = spm_cntr->spm_ring_wptr - spm_cntr->spm_ring_rptr;
> +		ret = kfd_spm_data_cocy(spm_cntr, size_to_copy);
> +
> +	} else if (spm_cntr->spm_ring_wptr < spm_cntr->spm_ring_rptr) {
> +		size_to_copy = spm_cntr->spm_ring_buf_size - spm_cntr->spm_ring_rptr;
> +		ret = kfd_spm_data_cocy(spm_cntr, size_to_copy);
> +
> +		spm_cntr->spm_ring_rptr = 0x20;
> +		size_to_copy = spm_cntr->spm_ring_wptr;
> +		ret = kfd_spm_data_cocy(spm_cntr, size_to_copy);
> +	}
> +		if (!ret) {
> +			if (spm_cntr->is_uesr_buf_filled == true) {
> +				/* stop */
> +				amdgpu_amdkfd_rlc_spm_cntl(spm_cntr->kgd, 0);
> +				amdgpu_amdkfd_rlc_spm_set_rdptr(spm_cntr->kgd, 0);
> +#if 0
> +				amdgpu_amdkfd_rlc_spm_set_wrptr(spm_cntr->kgd, 0);
> +#else
> +				spm_cntr->spm_cpu_addr[0]= 0;
> +#endif
> +				return ret;
> +			} else
> +				amdgpu_amdkfd_rlc_spm_set_rdptr(spm_cntr->kgd, spm_cntr->spm_ring_wptr);
> +		}
> +exit:
> +	return ret;
> +}
> +
> +static int kfd_spm_sched_main(void *param)
> +{
> +	struct kfd_spm_cntr * spm_cntr = (struct kfd_spm_cntr *)param;
> +
> +	while (!kthread_should_stop()) {
> +		wait_event_interruptible(spm_cntr->spm_wq,
> +				spm_cntr->wanted_cluster != false ||
> +				kthread_should_stop());
> +
> +		kfd_spm_reag_ring_buf (spm_cntr);
> +
> +		spm_cntr->wanted_cluster = false;
> +	}
> +	return 0;
> +}
> +
> +static void transfer_data_process_worker(struct work_struct *work)
> +{
> +	struct kfd_process *p;
> +	struct delayed_work *dwork;
> +
> +	pr_debug("[%s]\n", __func__);
> +
> +	dwork = to_delayed_work(work);
> +
> +	/* Process termination destroys this worker thread. So during the
> +	 * lifetime of this thread, kfd_process p will be valid
> +	 */
> +	p = container_of(dwork, struct kfd_process, copy_to_user_work);
> +
> +	p->spm_cntr->is_timeout = true;
> +}
> +
> +
> +/**
> + * kfd_spm_init - init driver ring struct.
> + * Returns 0 on success, error on failure.
> + */
> +int kfd_spm_shed_init(struct kfd_process *p, struct kgd_dev *kgd)
> +{
> +	int ret = 0;
> +	init_waitqueue_head(&p->spm_cntr->spm_wq);
> +	p->spm_cntr->wanted_cluster = false;
> +	p->spm_cntr->kgd = kgd;
> +	INIT_DELAYED_WORK(&p->copy_to_user_work, transfer_data_process_worker);
> +
> +	/* Each scheduler will run on a seperate kernel thread */
> +	p->spm_cntr->thread = kthread_run(kfd_spm_sched_main, p->spm_cntr, p->spm_cntr->name);
> +	if (IS_ERR(p->spm_cntr->thread)) {
> +		ret = PTR_ERR(p->spm_cntr->thread);
> +		p->spm_cntr->thread = NULL;
> +		DRM_ERROR("Failed to create scheduler for %s.\n", p->spm_cntr->name);
> +		return ret;
> +	}
> +
> +	p->spm_cntr->thread_ready = true;
> +	return ret;
> +}
> +
> +/**
> + * amdgpu_ring_fini - tear down the driver ring struct.
> + *
> + * @adev: amdgpu_device pointer
> + * @ring: amdgpu_ring structure holding ring information
> + *
> + * Tear down the driver information for the selected ring (all asics).
> + */
> +void kfd_spm_shed_fini(struct kfd_process *p)
> +{
> +	if (p->spm_cntr->thread)
> +		kthread_stop(p->spm_cntr->thread);
> +
> +	cancel_delayed_work_sync(&p->copy_to_user_work);
> +	p->spm_cntr->thread = NULL;
> +	p->spm_cntr->thread_ready = false;
> +}
> +
> +void kfd_spm_init_process(struct kfd_process *p)
> +{
> +	mutex_init(&p->spm_mutex);
> +	p->spm_cntr = NULL;
> +	p->is_spm_acquired = false;
> +}
> +
> +static struct kfd_spm_cntr *allocate_spm_cntr_data(void)
> +{
> +	struct kfd_spm_cntr *cntr;
> +
> +	cntr = kzalloc(sizeof(*cntr), GFP_KERNEL);
> +	if (!cntr)
> +		return NULL;
> +
> +	return cntr;
> +}
> +
> +int kfd_acquire_spm(struct kfd_process *p, struct kgd_dev *kgd)
> +{
> +	int retval;
> +	int count;
> +
> +	pr_debug("[%s]\n", __func__);
> +
> +	if(p->is_spm_acquired == true)
> +		return EINVAL;
> +
> +	if (!p->spm_cntr) {
> +		p->spm_cntr = allocate_spm_cntr_data();
> +		if (!p->spm_cntr)
> +			return -ENOMEM;
> +		/* git spm ring buffer 128KB */
> +		p->spm_cntr->spm_ring_buf_size = order_base_2(128 * 1024/4);
> +		p->spm_cntr->spm_ring_buf_size = (1 << p->spm_cntr->spm_ring_buf_size) * 4;
> +		p->spm_cntr->spm_ring_buf_mark = p->spm_cntr->spm_ring_buf_size -1;
> +		for (count = 0; count < SAMPLING_MAX; ++count) {
> +			p->spm_cntr->ubuf[count].user_addr = NULL;
> +			p->spm_cntr->ubuf[count].ubufsize = 0;
> +		}
> +		p->spm_cntr->buf_count = 0;
> +		p->spm_cntr->is_uesr_buf_filled = false;
> +		p->spm_cntr->is_nowait = false;
> +		p->spm_cntr->thread_ready = false;
> +	}
> +
> +	retval = amdgpu_amdkfd_alloc_gtt_mem(kgd,
> +			p->spm_cntr->spm_ring_buf_size, &p->spm_cntr->spm_obj,
> +			&p->spm_cntr->spm_gpu_addr, (void *)&p->spm_cntr->spm_cpu_addr,
> +			false);
> +
> +	if (retval)
> +		return EINVAL;
> +
> +	memset(p->spm_cntr->spm_cpu_addr, 0, p->spm_cntr->spm_ring_buf_size);
> +
> +	amdgpu_amdkfd_rlc_spm_acquire(kgd, p->spm_cntr->spm_gpu_addr, p->spm_cntr->spm_ring_buf_size);
> +
> +	if (p->spm_cntr->thread_ready == false) {
> +		p->spm_cntr->name = "spm";
> +		retval = kfd_spm_shed_init(p, kgd);
> +		if (retval) {
> +			DRM_ERROR("Failed to create spm thread %s.\n",	p->spm_cntr->name);
> +			return retval;
> +		}
> +	}
> +	p->is_spm_acquired = true;
> +
> +	return 0;
> +}
> +
> +int kfd_release_spm(struct kfd_process *p, struct kgd_dev *kgd)
> +{
> +
> +	kfd_spm_shed_fini(p);
> +
> +	amdgpu_amdkfd_free_gtt_mem(kgd, p->spm_cntr->spm_obj);
> +
> +	kfree(p->spm_cntr);
> +	p->spm_cntr = NULL;
> +	p->is_spm_acquired = false;
> +
> +	return 0;
> +}
> +
> +int set_dest_buf_polling(struct kfd_spm_cntr *spm, struct kgd_dev *kgd, void __user *data)
> +{
> +	struct kfd_ioctl_spm_args __user *user_spm_data =
> +			(struct kfd_ioctl_spm_args __user *) data;
> +	u32 i;
> +	int ret = 0;
> +	unsigned long timeout;
> +
> +	pr_debug ("[%s]\n", __func__);
> +
> +	timeout = msecs_to_jiffies(user_spm_data->timeout) + 1;
> +
> +	/* if buf = NULL, stop spm */
> +	if (!user_spm_data->destptr) {
> +		amdgpu_amdkfd_rlc_spm_cntl(kgd, 0);
> +
> +		user_spm_data->bytes_copied = spm->spm_ring_size_copied;
> +		spm->spm_ring_size_copied = 0;
> +		user_spm_data->spmtptr = (uint64_t)spm->ubuf[0].user_addr;
> +
> +		for (i = 0; i < spm->buf_count; ++i) {
> +			spm->ubuf [i] = spm->ubuf [i+1];
> +		}
> +		spm->ubuf[spm->buf_count].user_addr = NULL;
> +		spm->ubuf[spm->buf_count].ubufsize = 0;
> +		spm->buf_count --;
> +		if (spm->buf_count < 0)
> +			spm->buf_count = 0;
> +		return ret;
> +	}
> +
> +	if (!spm->buf_count) {
> +		/* First time save user spm buffer, then start spm sampling */
> +		spm->ubuf[0].user_addr = (uint64_t*)user_spm_data->destptr;
> +		spm->ubuf[0].ubufsize = user_spm_data->buf_size;
> +		user_spm_data->bytes_copied = 0;
> +		spm->spm_ring_size_copied = 0;
> +		spm->buf_count ++;
> +		spm->is_uesr_buf_filled = false;
> +		amdgpu_amdkfd_rlc_spm_cntl(kgd, 1);
> +
> +		ret = kfd_spm_reag_ring_buf_polling (spm, timeout);
> +		user_spm_data->bytes_copied = spm->spm_ring_size_copied;
> +		user_spm_data->spmtptr = (uint64_t)spm->ubuf[0].user_addr;
> +		spm->buf_count --;
> +	} else {
> +		spm->spm_ring_size_copied = 0;
> +		spm->is_uesr_buf_filled = false;
> +		amdgpu_amdkfd_rlc_spm_cntl(kgd, 1);
> +		ret = kfd_spm_reag_ring_buf_polling (spm, timeout);
> +
> +		user_spm_data->bytes_copied = spm->spm_ring_size_copied;
> +		user_spm_data->spmtptr = (uint64_t)spm->ubuf[0].user_addr;
> +		spm->buf_count --;
> +
> +		for (i = 0; i < spm->buf_count; ++i)
> +			/* Repeated dest buf */
> +			if (spm->ubuf[i].user_addr == (uint64_t*)user_spm_data->destptr)
> +				break;
> +		if (i == spm->buf_count) {
> +			spm->ubuf[i].user_addr = (uint64_t*)user_spm_data->destptr;
> +			spm->ubuf[i].ubufsize = user_spm_data->buf_size;
> +			spm->buf_count ++;
> +		}
> +
> +		for (i = 0; i < spm->buf_count; ++i)
> +			spm->ubuf[i] = spm->ubuf[i + 1];
> +		spm->ubuf[spm->buf_count].user_addr  = NULL;
> +	}
> +
> +	user_spm_data->bytes_copied = spm->spm_ring_size_copied;
> +	return ret;
> +
> +}
> +
> +int kfd_set_dest_buffer(struct kfd_process *p, struct kgd_dev *kgd, void __user *data)
> +{
> +	struct kfd_ioctl_spm_args __user *user_spm_data =
> +							(struct kfd_ioctl_spm_args __user *) data;
> +	struct kfd_spm_cntr *spm = p->spm_cntr;
> +	unsigned long timeout;
> +
> +	pr_debug("[%s]\n", __func__);
> +
> +	if(p->is_spm_acquired == false)
> +		return -EINVAL;
> +
> +	timeout = msecs_to_jiffies(user_spm_data->timeout) + 1;
> +	spm->is_timeout = false;
> +	schedule_delayed_work(&p->copy_to_user_work, timeout);
> +	return set_dest_buf_polling(spm, kgd, data);
> +}
> +
> +int kfd_config_spm(struct kfd_process *p, struct kgd_dev *kgd,struct kfd_ioctl_spm_args *data)
> +{
> +	struct kfd_ioctl_spm_args __user *user_spm_data =
> +			(struct kfd_ioctl_spm_args __user *) data;
> +	struct kfd_spm_set_reg *spm_reg;
> +
> +	pr_debug("[%s]\n", __func__);
> +
> +	spm_reg = kvmalloc_array(1, sizeof (struct kfd_spm_set_reg), GFP_KERNEL);
> +
> +	if (copy_from_user(spm_reg, (void __user *)user_spm_data->destptr, sizeof (struct kfd_spm_set_reg))) {
> +		pr_err("copy_from_user Fail\n");
> +		goto exit;
> +	}
> +	amdgpu_amdkfd_rlc_spm_set_reg(kgd, spm_reg->reg, spm_reg->value);
> +exit:
> +	kfree(spm_reg);
> +	return 0;
> +}
> +
> +int kfd_rlc_spm(struct kfd_process *p,  void __user *data,
> +		       uint32_t buf_size, __u32 operation)
> +{
> +	struct kfd_ioctl_spm_args *args = data;
> +	struct kfd_dev *dev;
> +	int r;
> +
> +	dev = kfd_device_by_id(args->gpu_id);
> +	if (!dev)
> +		return -EINVAL;
> +
> +
> +	switch (operation) {
> +	case KFD_IOCTL_SPM_OP_ACQUIRE:
> +		dev->spm_pasid = p->pasid;
> +		r = kfd_acquire_spm(p, dev->kgd);
> +		break;
> +
> +	case KFD_IOCTL_SPM_OP_RELEASE:
> +		r = kfd_release_spm(p, dev->kgd);
> +				break;
> +
> +	case KFD_IOCTL_SPM_OP_SET_DEST_BUF:
> +		r = kfd_set_dest_buffer(p, dev->kgd, data);
> +		break;
> +
> +	case KFD_IOCTL_SPM_OP_CONFIG:
> +		r = kfd_config_spm(p, dev->kgd, args);
> +				break;
> +
> +	default:
> +		r = -EINVAL;
> +		break;
> +	}
> +	return r;
> +}
> +
> +void kfd_spm_interrupt(unsigned int pasid)
> +{
> +
> +	struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
> +
> +	if (!p) {
> +		pr_info("kfd_spm_interrupt p = %p \n", p);
> +		return; /* Presumably process exited. */
> +	}
> +
> +	mutex_lock(&p->spm_mutex);
> +
> +	p->spm_cntr->wanted_cluster = true;
> +	wake_up_interruptible(&p->spm_cntr->spm_wq);
> +
> +	mutex_unlock(&p->spm_mutex);
> +
> +	kfd_unref_process(p);
> +}
> +
> +bool kfd_spm_interrupt_isr(struct kfd_dev *dev,
> +					uint16_t source_id,	uint32_t client_id)
> +{
> +	pr_debug("[%s]\n", __func__);
> +
> +	if (source_id != GFX_9_0__SRCID__RLC_STRM_PERF_MONITOR_INTERRUPT)
> +		return 0;
> +
> +	/* Interrupt types we care about: various signals and faults.
> +	 * They will be forwarded to a work queue (see below).
> +	 */
> +
> +	return source_id == GFX_9_0__SRCID__RLC_STRM_PERF_MONITOR_INTERRUPT ||
> +		source_id == VISLANDS30_IV_SRCID_RLC_STRM_PERF_MONITOR ||
> +		client_id == AMDGPU_IRQ_CLIENTID_LEGACY ||
> +		client_id == SOC15_IH_CLIENTID_RLC;
> +}
> +
> +void kfd_spm_interrupt_wq(struct kfd_dev *dev,	uint16_t source_id)
> +{
> +	uint16_t pasid;
> +
> +	pr_debug("[%s]\n", __func__);
> +
> +	pasid = dev->spm_pasid;
> +#if 0
> +	if ((source_id == GFX_9_0__SRCID__RLC_STRM_PERF_MONITOR_INTERRUPT) ||
> +				(source_id == VISLANDS30_IV_SRCID_RLC_STRM_PERF_MONITOR))
> +		kfd_spm_interrupt(pasid);
> +#endif
> +}
> diff --git a/drivers/gpu/drm/amd/amdkfd/soc15_int.h b/drivers/gpu/drm/amd/amdkfd/soc15_int.h
> index 0bc0b25..fb4ad60 100644
> --- a/drivers/gpu/drm/amd/amdkfd/soc15_int.h
> +++ b/drivers/gpu/drm/amd/amdkfd/soc15_int.h
> @@ -30,6 +30,7 @@
>  #define SOC15_INTSRC_SQ_INTERRUPT_MSG	239
>  #define SOC15_INTSRC_VMC_FAULT		0
>  #define SOC15_INTSRC_SDMA_TRAP		224
> +#define AMDGPU_IRQ_CLIENTID_LEGACY	0
>  
>  
>  #define SOC15_CLIENT_ID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) & 0xff)
> diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
> index 4f66764..f7cb7d4 100644
> --- a/include/uapi/linux/kfd_ioctl.h
> +++ b/include/uapi/linux/kfd_ioctl.h
> @@ -442,6 +442,56 @@ struct kfd_ioctl_import_dmabuf_args {
>  	__u32 dmabuf_fd;	/* to KFD */
>  };
>  
> +/**
> + * kfd_ioctl_spm_op - SPM ioctl operations
> + *
> + * @KFD_IOCTL_SPM_OP_ACQUIRE: acquire exclusive access to SPM
> + * @KFD_IOCTL_SPM_OP_RELEASE: release exclusive access to SPM
> + * @KFD_IOCTL_SPM_OP_SET_DEST_BUF: set or unset destination buffer for SPM streaming
> + */
> +enum kfd_ioctl_spm_op {
> +	KFD_IOCTL_SPM_OP_ACQUIRE,
> +	KFD_IOCTL_SPM_OP_RELEASE,
> +	KFD_IOCTL_SPM_OP_SET_DEST_BUF,
> +	KFD_IOCTL_SPM_OP_CONFIG
> +};
> +
> +/* Don't wait for previous buffer to fill up */
> +#define KFD_IOCTL_SPM_FLAG_POLLING 1
> +
> +/**
> + * kfd_ioctl_spm_args - Arguments for SPM ioctl
> + *
> + * @op:     specifies the operation to perform
> + * @destptr:used for the address of the destination buffer in @KFD_IOCTL_SPM_SET_DEST_BUFFER
> + * @buf_size:size  of the destination buffer in @KFD_IOCTL_SPM_SET_DEST_BUFFER
> + * @timeout: timeout to wait buffer get filled
> + * @gpu_id: gpi ID
> + * @bytes_copied: byte copied from streaming performance ring buffer
> + *
> + * If @ptr is NULL, the destination buffer address is unset and copying of counters
> + * is stopped.
> + *
> + * Returns negative error code on failure. On success, @KFD_IOCTL_SPM_OP_ACQUIRE and
> + * @KFD_IOCTL_SPM_OP_RELEASE return 0, @KFD_IOCTL_SPM_OP_SET_DEST_BUF returns the fill
> + * level of the previous buffer.
> + */
> +struct kfd_ioctl_spm_args {
> +	__u64 destptr;
> +	__u64 spmtptr;
> +	__u32 buf_size;
> +	__u32 op;
> +	__u32 timeout;
> +	__u32 gpu_id;	/* to KFD */
> +	/* from KFD: Total amount of bytes copied */
> +	__u64 bytes_copied;
> +};
> +
> +struct kfd_spm_set_reg {
> +	__u64 reg;		/* to KFD */
> +	__u32 value;
> +};
> +
>  /* Register offset inside the remapped mmio page
>   */
>  enum kfd_mmio_remap {
> @@ -546,7 +596,10 @@ enum kfd_mmio_remap {
>  #define AMDKFD_IOC_ALLOC_QUEUE_GWS		\
>  		AMDKFD_IOWR(0x1E, struct kfd_ioctl_alloc_queue_gws_args)
>  
> +#define AMDKFD_IOC_RLC_SPM		\
> +		AMDKFD_IOWR(0x1F, struct kfd_ioctl_spm_args)
> +
>  #define AMDKFD_COMMAND_START		0x01
> -#define AMDKFD_COMMAND_END		0x1F
> +#define AMDKFD_COMMAND_END		0x20
>  
>  #endif
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/amd-gfx/attachments/20200428/d331298d/attachment-0001.htm>


More information about the amd-gfx mailing list