[PATCH v2] drm/xe/lnl: Apply Wa_22019338487

Mon Jun 10 20:13:58 UTC 2024

On Mon, Jun 10, 2024 at 11:52:31AM -0700, Vinay Belgaumkar wrote:
> This WA requires us to limit media GT frequency requests to a certain
> cap value during driver load. Freq limits are restored after load
> completes, so perf will not be affected during normal operations.
> 
> During normal driver operation, this WA requires dummy writes to media
> offset 0x380D8C after every ~63 GGTT writes. This will ensure completion
> of the LMEM writes originating from Gunit.
> 
> During driver unload(before FLR), the WA requires that we set requested
> frequency to the cap value again.
> 
> Signed-off-by: Vinay Belgaumkar <vinay.belgaumkar at intel.com>
> ---
>  drivers/gpu/drm/xe/Makefile          |  2 +
>  drivers/gpu/drm/xe/xe_device.c       | 10 ++++
>  drivers/gpu/drm/xe/xe_ggtt.c         | 19 ++++++
>  drivers/gpu/drm/xe/xe_ggtt_types.h   |  2 +
>  drivers/gpu/drm/xe/xe_gsc.c          |  4 ++
>  drivers/gpu/drm/xe/xe_gt.c           |  2 +
>  drivers/gpu/drm/xe/xe_guc_pc.c       | 87 +++++++++++++++++++++++++++-
>  drivers/gpu/drm/xe/xe_guc_pc.h       |  4 ++
>  drivers/gpu/drm/xe/xe_guc_pc_types.h |  4 ++
>  drivers/gpu/drm/xe/xe_wa_oob.rules   |  1 +
>  10 files changed, 134 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile
> index 478acc94a71c..ab282070fedf 100644
> --- a/drivers/gpu/drm/xe/Makefile
> +++ b/drivers/gpu/drm/xe/Makefile
> @@ -24,9 +24,11 @@ $(obj)/generated/%_wa_oob.c $(obj)/generated/%_wa_oob.h: $(obj)/xe_gen_wa_oob \
>  	$(call cmd,wa_oob)
>  
>  uses_generated_oob := \
> +	$(obj)/xe_ggtt.o \
>  	$(obj)/xe_gsc.o \
>  	$(obj)/xe_guc.o \
>  	$(obj)/xe_guc_ads.o \
> +	$(obj)/xe_guc_pc.o \
>  	$(obj)/xe_migrate.o \
>  	$(obj)/xe_ring_ops.o \
>  	$(obj)/xe_vm.o \
> diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
> index 94dbfe5cf19c..6450cccb1771 100644
> --- a/drivers/gpu/drm/xe/xe_device.c
> +++ b/drivers/gpu/drm/xe/xe_device.c
> @@ -37,6 +37,7 @@
>  #include "xe_gt_printk.h"
>  #include "xe_gt_sriov_vf.h"
>  #include "xe_guc.h"
> +#include "xe_guc_pc.h"

does it make sense to introduce this dependency at the device level?
or perhaps this should be part of the gt_idle which then, if using
guc it calls there?

>  #include "xe_hwmon.h"
>  #include "xe_irq.h"
>  #include "xe_memirq.h"
> @@ -50,6 +51,7 @@
>  #include "xe_tile.h"
>  #include "xe_ttm_stolen_mgr.h"
>  #include "xe_ttm_sys_mgr.h"
> +#include "xe_uc_fw.h"
>  #include "xe_vm.h"
>  #include "xe_vram.h"
>  #include "xe_wait_user_fence.h"
> @@ -668,6 +670,12 @@ int xe_device_probe(struct xe_device *xe)
>  
>  	xe_hwmon_register(xe);
>  
> +	/* Wa_22019338487: Restore GT/Media freq now if GSC is not being loaded. */
> +	for_each_gt(gt, xe, id) {
> +		if (!xe_uc_fw_is_available(&gt->uc.gsc.fw))
> +			xe_guc_pc_wa_22019338487_end(&gt->uc.guc.pc);
> +	}

please move this to a function inside guc_pc...
perhaps calling the function
xe_guc_pc_sanitize_freq()

But please keep the probe cleaning calling one component that makes sense.

hmm... perhaps instead of guc_pc or gt_idle we should use uc_fw component,
since we check uc_fw gsc availability to then call for the guc_pc sanitize?
But anyway, please do not call the workaround names in the function. Let's
define a generic flow in which the workaround is the underneath static call,
but not the exported one.

> +
>  	return devm_add_action_or_reset(xe->drm.dev, xe_device_sanitize, xe);
>  
>  err_fini_display:
> @@ -701,6 +709,8 @@ void xe_device_remove(struct xe_device *xe)
>  	struct xe_gt *gt;
>  	u8 id;
>  
> +	xe_guc_pc_wa_22019338487_fini(xe);

end? fini? what's the difference?
Perhaps this should be a registration with devm or drmm instead of this
direct call here?

and again, please not workaround name in the function name.
Make a generic flow in which w/a is underneath, not the ultimate
exported call.

> +
>  	xe_device_remove_display(xe);
>  
>  	xe_display_fini(xe);
> diff --git a/drivers/gpu/drm/xe/xe_ggtt.c b/drivers/gpu/drm/xe/xe_ggtt.c
> index 8ff91fd1b7c8..58d177be724a 100644
> --- a/drivers/gpu/drm/xe/xe_ggtt.c
> +++ b/drivers/gpu/drm/xe/xe_ggtt.c
> @@ -11,6 +11,7 @@
>  #include <drm/drm_drv.h>
>  #include <drm/drm_managed.h>
>  #include <drm/intel/i915_drm.h>
> +#include <generated/xe_wa_oob.h>
>  
>  #include "regs/xe_gt_regs.h"
>  #include "regs/xe_gtt_defs.h"
> @@ -23,8 +24,10 @@
>  #include "xe_gt_sriov_vf.h"
>  #include "xe_gt_tlb_invalidation.h"
>  #include "xe_map.h"
> +#include "xe_mmio.h"
>  #include "xe_pm.h"
>  #include "xe_sriov.h"
> +#include "xe_wa.h"
>  #include "xe_wopcm.h"
>  
>  static u64 xelp_ggtt_pte_encode_bo(struct xe_bo *bo, u64 bo_offset,
> @@ -69,12 +72,28 @@ static unsigned int probe_gsm_size(struct pci_dev *pdev)
>  	return ggms ? SZ_1M << ggms : 0;
>  }
>  
> +static void apply_wa_22019338487(struct xe_ggtt *ggtt)

For the record:
for the static non exported functions it is okay to have the workaround name,
But please, prefer to highlight what it is doing instead of why the fn
exists... something like ggtt_update_access_counter

> +{
> +	/*
> +	 * 22019338487: GMD_ID is a RO register, a dummy write forces gunit

Please wa_22019338487:

> +	 * to wait for completion of prior GTT writes before letting this through.
> +	 */
> +	if (XE_WA(ggtt->tile->media_gt, 22019338487)) {
> +		if (++ggtt->update_count > 63) {
> +			xe_mmio_write32(ggtt->tile->media_gt, GMD_ID, 0x0);
> +			ggtt->update_count = 0;
> +		}
> +	}
> +}
> +
>  void xe_ggtt_set_pte(struct xe_ggtt *ggtt, u64 addr, u64 pte)
>  {
>  	xe_tile_assert(ggtt->tile, !(addr & XE_PTE_MASK));
>  	xe_tile_assert(ggtt->tile, addr < ggtt->size);
> +	lockdep_assert_held(&ggtt->lock);

it looks like the assert should be in the above function, not here...

>  
>  	writeq(pte, &ggtt->gsm[addr >> XE_PTE_SHIFT]);
> +	apply_wa_22019338487(ggtt);
>  }
>  
>  static void xe_ggtt_clear(struct xe_ggtt *ggtt, u64 start, u64 size)
> diff --git a/drivers/gpu/drm/xe/xe_ggtt_types.h b/drivers/gpu/drm/xe/xe_ggtt_types.h
> index d8c584d9a8c3..89b721205d2d 100644
> --- a/drivers/gpu/drm/xe/xe_ggtt_types.h
> +++ b/drivers/gpu/drm/xe/xe_ggtt_types.h
> @@ -34,6 +34,8 @@ struct xe_ggtt {
>  	const struct xe_ggtt_pt_ops *pt_ops;
>  
>  	struct drm_mm mm;
> +
> +	unsigned int update_count;

access_count?

perhaps we should increase it regardless the w/a and then print it on debugfs?
and then for the wa itself we just use a mod % 64 == 0 to apply the w/a?
but no strong opinion on this...

>  };
>  
>  #endif
> diff --git a/drivers/gpu/drm/xe/xe_gsc.c b/drivers/gpu/drm/xe/xe_gsc.c
> index 80a61934decc..9cd154bf75b7 100644
> --- a/drivers/gpu/drm/xe/xe_gsc.c
> +++ b/drivers/gpu/drm/xe/xe_gsc.c
> @@ -22,6 +22,7 @@
>  #include "xe_gt.h"
>  #include "xe_gt_mcr.h"
>  #include "xe_gt_printk.h"
> +#include "xe_guc_pc.h"
>  #include "xe_huc.h"
>  #include "xe_map.h"
>  #include "xe_mmio.h"
> @@ -342,6 +343,7 @@ static void gsc_work(struct work_struct *work)
>  	struct xe_gsc *gsc = container_of(work, typeof(*gsc), work);
>  	struct xe_gt *gt = gsc_to_gt(gsc);
>  	struct xe_device *xe = gt_to_xe(gt);
> +	struct xe_guc_pc *pc = &gt->uc.guc.pc;
>  	u32 actions;
>  	int ret;
>  
> @@ -370,6 +372,8 @@ static void gsc_work(struct work_struct *work)
>  	if (actions & GSC_ACTION_SW_PROXY)
>  		xe_gsc_proxy_request_handler(gsc);
>  
> +	xe_guc_pc_wa_22019338487_end(pc);
> +
>  out:
>  	xe_force_wake_put(gt_to_fw(gt), XE_FW_GSC);
>  	xe_pm_runtime_put(xe);
> diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c
> index 57d84751e160..1315b184220a 100644
> --- a/drivers/gpu/drm/xe/xe_gt.c
> +++ b/drivers/gpu/drm/xe/xe_gt.c
> @@ -816,6 +816,8 @@ int xe_gt_resume(struct xe_gt *gt)
>  
>  	xe_gt_idle_enable_pg(gt);
>  
> +	xe_guc_pc_wa_22019338487_end(&gt->uc.guc.pc);
> +
>  	XE_WARN_ON(xe_force_wake_put(gt_to_fw(gt), XE_FORCEWAKE_ALL));
>  	xe_gt_dbg(gt, "resumed\n");
>  
> diff --git a/drivers/gpu/drm/xe/xe_guc_pc.c b/drivers/gpu/drm/xe/xe_guc_pc.c
> index 666a37106bc5..7d8aa68e60b7 100644
> --- a/drivers/gpu/drm/xe/xe_guc_pc.c
> +++ b/drivers/gpu/drm/xe/xe_guc_pc.c
> @@ -8,6 +8,7 @@
>  #include <linux/delay.h>
>  
>  #include <drm/drm_managed.h>
> +#include <generated/xe_wa_oob.h>
>  
>  #include "abi/guc_actions_slpc_abi.h"
>  #include "regs/xe_gt_regs.h"
> @@ -24,6 +25,7 @@
>  #include "xe_map.h"
>  #include "xe_mmio.h"
>  #include "xe_pcode.h"
> +#include "xe_wa.h"
>  
>  #define MCHBAR_MIRROR_BASE_SNB	0x140000
>  
> @@ -41,6 +43,8 @@
>  #define GT_FREQUENCY_MULTIPLIER	50
>  #define GT_FREQUENCY_SCALER	3
>  
> +#define LNL_MERT_FREQ_CAP	800
> +
>  /**
>   * DOC: GuC Power Conservation (PC)
>   *
> @@ -673,6 +677,14 @@ static void pc_init_fused_rp_values(struct xe_guc_pc *pc)
>  		tgl_init_fused_rp_values(pc);
>  }
>  
> +u32 xe_guc_pc_mert_freq_cap(struct xe_guc_pc *pc)

do we really need to export this?
if so, please add doc.

> +{
> +	if (MEDIA_VERx100(pc_to_xe(pc)) == 2000)
> +		return LNL_MERT_FREQ_CAP;
> +	else
> +		return 0;
> +}
> +
>  /**
>   * xe_guc_pc_init_early - Initialize RPx values and request a higher GT
>   * frequency to allow faster GuC load times
> @@ -684,7 +696,11 @@ void xe_guc_pc_init_early(struct xe_guc_pc *pc)
>  
>  	xe_force_wake_assert_held(gt_to_fw(gt), XE_FW_GT);
>  	pc_init_fused_rp_values(pc);
> -	pc_set_cur_freq(pc, pc->rp0_freq);
> +
> +	if (XE_WA(gt, 22019338487))
> +		pc_set_cur_freq(pc, min(xe_guc_pc_mert_freq_cap(pc), pc->rp0_freq));

with this we are setting 0 to cur freq!

> +	else
> +		pc_set_cur_freq(pc, pc->rp0_freq);
>  }
>  
>  static int pc_adjust_freq_bounds(struct xe_guc_pc *pc)
> @@ -740,6 +756,71 @@ static int pc_adjust_requested_freq(struct xe_guc_pc *pc)
>  	return ret;
>  }
>  
> +static int pc_wa_22019338487_start(struct xe_guc_pc *pc)
> +{
> +	int ret = 0;
> +
> +	if (XE_WA(pc_to_gt(pc), 22019338487)) {
> +		/*
> +		 * Get updated min/max and stash them.
> +		 */
> +		ret = xe_guc_pc_get_min_freq(pc, &pc->pre_wa_min_freq);
> +		if (!ret)
> +			ret = xe_guc_pc_get_max_freq(pc, &pc->pre_wa_max_freq);
> +		if (ret)
> +			return ret;
> +
> +		/*
> +		 * Ensure min and max are bound by MERT_FREQ_CAP until driver loads.
> +		 */
> +		mutex_lock(&pc->freq_lock);
> +
> +		ret = pc_set_min_freq(pc, min(pc->rpe_freq, xe_guc_pc_mert_freq_cap(pc)));
> +		if (!ret)
> +			ret = pc_set_max_freq(pc, min(pc->rp0_freq, xe_guc_pc_mert_freq_cap(pc)));
> +
> +		mutex_unlock(&pc->freq_lock);
> +	}
> +
> +	return ret;
> +}
> +
> +static int restore_pre_wa_freq(struct xe_guc_pc *pc)
> +{
> +	int ret = 0;
> +
> +	mutex_lock(&pc->freq_lock);
> +	ret = pc_set_max_freq(pc, pc->pre_wa_max_freq);
> +	if (!ret)
> +		pc_set_min_freq(pc, pc->pre_wa_min_freq);
> +
> +	mutex_unlock(&pc->freq_lock);
> +
> +	return ret;
> +}
> +
> +int xe_guc_pc_wa_22019338487_end(struct xe_guc_pc *pc)

doc to every exported functions please. Besides avoiding the wa in the name itself as ditto.

> +{
> +	if (XE_WA(pc_to_gt(pc), 22019338487))
> +		return restore_pre_wa_freq(pc);
> +	else
> +		return 0;
> +}
> +
> +void xe_guc_pc_wa_22019338487_fini(struct xe_device *xe)
> +{
> +	struct xe_gt *gt;
> +	struct xe_guc_pc *pc;
> +	u8 id;
> +
> +	for_each_gt(gt, xe, id) {
> +		pc = &gt->uc.guc.pc;
> +		/* Set requested freq to mert_freq_cap before unload */
> +		if (XE_WA(gt, 22019338487))
> +			pc_set_cur_freq(pc, min(xe_guc_pc_mert_freq_cap(pc), pc->rpe_freq));
> +	}
> +}
> +
>  /**
>   * xe_guc_pc_gucrc_disable - Disable GuC RC
>   * @pc: Xe_GuC_PC instance
> @@ -854,6 +935,10 @@ int xe_guc_pc_start(struct xe_guc_pc *pc)
>  	if (ret)
>  		goto out;
>  
> +	ret = pc_wa_22019338487_start(pc);
> +	if (ret)
> +		goto out;
> +
>  	if (xe->info.platform == XE_PVC) {
>  		xe_guc_pc_gucrc_disable(pc);
>  		ret = 0;
> diff --git a/drivers/gpu/drm/xe/xe_guc_pc.h b/drivers/gpu/drm/xe/xe_guc_pc.h
> index 532cac985a6d..760324ecdbf8 100644
> --- a/drivers/gpu/drm/xe/xe_guc_pc.h
> +++ b/drivers/gpu/drm/xe/xe_guc_pc.h
> @@ -8,6 +8,7 @@
>  
>  #include <linux/types.h>
>  
> +struct xe_device;
>  struct xe_guc_pc;
>  
>  int xe_guc_pc_init(struct xe_guc_pc *pc);
> @@ -29,5 +30,8 @@ enum xe_gt_idle_state xe_guc_pc_c_status(struct xe_guc_pc *pc);
>  u64 xe_guc_pc_rc6_residency(struct xe_guc_pc *pc);
>  u64 xe_guc_pc_mc6_residency(struct xe_guc_pc *pc);
>  void xe_guc_pc_init_early(struct xe_guc_pc *pc);
> +u32 xe_guc_pc_mert_freq_cap(struct xe_guc_pc *pc);
> +int xe_guc_pc_wa_22019338487_end(struct xe_guc_pc *pc);
> +void xe_guc_pc_wa_22019338487_fini(struct xe_device *xe);
>  
>  #endif /* _XE_GUC_PC_H_ */
> diff --git a/drivers/gpu/drm/xe/xe_guc_pc_types.h b/drivers/gpu/drm/xe/xe_guc_pc_types.h
> index 2afd0dbc3542..721f673b1628 100644
> --- a/drivers/gpu/drm/xe/xe_guc_pc_types.h
> +++ b/drivers/gpu/drm/xe/xe_guc_pc_types.h
> @@ -25,6 +25,10 @@ struct xe_guc_pc {
>  	u32 user_requested_min;
>  	/** @user_requested_max: Stash the maximum requested freq by user */
>  	u32 user_requested_max;
> +	/** @pre_wa_min_freq: Stash the minimum freq before applying WA */
> +	u32 pre_wa_min_freq;
> +	/** @pre_wa_max_freq: Stash the maximum freq before applying WA */
> +	u32 pre_wa_max_freq;

do we really need this?
can't we simply return to a regular point where we had put already?
if we really need this, also please avoid 'wa' in the name.
Use a generic name for stash_prev_min stash_prev_max, or something like that...

>  	/** @freq_lock: Let's protect the frequencies */
>  	struct mutex freq_lock;
>  	/** @freq_ready: Only handle freq changes, if they are really ready */
> diff --git a/drivers/gpu/drm/xe/xe_wa_oob.rules b/drivers/gpu/drm/xe/xe_wa_oob.rules
> index 12fe88796a49..a6b897030fde 100644
> --- a/drivers/gpu/drm/xe/xe_wa_oob.rules
> +++ b/drivers/gpu/drm/xe/xe_wa_oob.rules
> @@ -27,3 +27,4 @@
>  16022287689	GRAPHICS_VERSION(2001)
>  		GRAPHICS_VERSION(2004)
>  13011645652	GRAPHICS_VERSION(2004)
> +22019338487	MEDIA_VERSION(2000)
> -- 
> 2.38.1
>