[PATCH v3 1/2] drm/xe/bmg: implement Wa_16023588340

Matthew Auld matthew.auld at intel.com
Wed Jul 3 09:41:31 UTC 2024


Hi,

On 03/07/2024 10:24, Thomas Hellström wrote:
> Hi, Matt
> 
> On Tue, 2024-07-02 at 16:06 +0100, Matthew Auld wrote:
>> This involves enabling l2 caching of host side memory access to VRAM
>> through the CPU BAR. The main fallout here is with display since VRAM
>> writes from CPU can now be cached in GPU l2, and display is never
>> coherent with caches, so needs various manual flushing.  In the case
>> of
>> fbc we disable it due to complications in getting this to work
>> correctly (in a later patch).
> 
> What about user-space accesses to framebuffers?

There should be a manual flush of the entire L2 before a flip etc. For
simplicity we piggyback on xe_device_td_flush(), which should already be
called in the right places from i915-display. With td_flush we were
already flushing L2, but only the cache entries marked as transient
display; with this workaround applied, td_flush does a full L2 flush
via xe_device_l2_flush() instead.

> 
> /Thomas
> 
> 
>>
>> Signed-off-by: Matthew Auld <matthew.auld at intel.com>
>> Cc: Jonathan Cavitt <jonathan.cavitt at intel.com>
>> Cc: Matt Roper <matthew.d.roper at intel.com>
>> Cc: Lucas De Marchi <lucas.demarchi at intel.com>
>> Cc: Vinod Govindapillai <vinod.govindapillai at intel.com>
>> Reviewed-by: Jonathan Cavitt <jonathan.cavitt at intel.com>
>> ---
>>   drivers/gpu/drm/xe/Makefile                |  2 +
>>   drivers/gpu/drm/xe/display/xe_dsb_buffer.c |  8 ++++
>>   drivers/gpu/drm/xe/display/xe_fb_pin.c     |  3 ++
>>   drivers/gpu/drm/xe/regs/xe_gt_regs.h       |  8 ++++
>>   drivers/gpu/drm/xe/xe_device.c             | 30 ++++++++++++
>>   drivers/gpu/drm/xe/xe_device.h             |  1 +
>>   drivers/gpu/drm/xe/xe_gt.c                 | 54
>> ++++++++++++++++++++++
>>   drivers/gpu/drm/xe/xe_pat.c                | 11 ++++-
>>   drivers/gpu/drm/xe/xe_wa_oob.rules         |  1 +
>>   9 files changed, 117 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/xe/Makefile
>> b/drivers/gpu/drm/xe/Makefile
>> index b1e03bfe4a68..970c5c09e20a 100644
>> --- a/drivers/gpu/drm/xe/Makefile
>> +++ b/drivers/gpu/drm/xe/Makefile
>> @@ -25,12 +25,14 @@ $(obj)/generated/%_wa_oob.c
>> $(obj)/generated/%_wa_oob.h: $(obj)/xe_gen_wa_oob \
>>   
>>   uses_generated_oob := \
>>   	$(obj)/xe_ggtt.o \
>> +	$(obj)/xe_device.o \
>>   	$(obj)/xe_gsc.o \
>>   	$(obj)/xe_gt.o \
>>   	$(obj)/xe_guc.o \
>>   	$(obj)/xe_guc_ads.o \
>>   	$(obj)/xe_guc_pc.o \
>>   	$(obj)/xe_migrate.o \
>> +	$(obj)/xe_pat.o \
>>   	$(obj)/xe_ring_ops.o \
>>   	$(obj)/xe_vm.o \
>>   	$(obj)/xe_wa.o \
>> diff --git a/drivers/gpu/drm/xe/display/xe_dsb_buffer.c
>> b/drivers/gpu/drm/xe/display/xe_dsb_buffer.c
>> index 9e860c61f4b3..ccd0d87d438a 100644
>> --- a/drivers/gpu/drm/xe/display/xe_dsb_buffer.c
>> +++ b/drivers/gpu/drm/xe/display/xe_dsb_buffer.c
>> @@ -7,6 +7,8 @@
>>   #include "intel_display_types.h"
>>   #include "intel_dsb_buffer.h"
>>   #include "xe_bo.h"
>> +#include "xe_device.h"
>> +#include "xe_device_types.h"
>>   #include "xe_gt.h"
>>   
>>   u32 intel_dsb_buffer_ggtt_offset(struct intel_dsb_buffer *dsb_buf)
>> @@ -16,7 +18,10 @@ u32 intel_dsb_buffer_ggtt_offset(struct
>> intel_dsb_buffer *dsb_buf)
>>   
>>   void intel_dsb_buffer_write(struct intel_dsb_buffer *dsb_buf, u32
>> idx, u32 val)
>>   {
>> +	struct xe_device *xe = dsb_buf->vma->bo->tile->xe;
>> +
>>   	iosys_map_wr(&dsb_buf->vma->bo->vmap, idx * 4, u32, val);
>> +	xe_device_l2_flush(xe);
>>   }
>>   
>>   u32 intel_dsb_buffer_read(struct intel_dsb_buffer *dsb_buf, u32 idx)
>> @@ -26,9 +31,12 @@ u32 intel_dsb_buffer_read(struct intel_dsb_buffer
>> *dsb_buf, u32 idx)
>>   
>>   void intel_dsb_buffer_memset(struct intel_dsb_buffer *dsb_buf, u32
>> idx, u32 val, size_t size)
>>   {
>> +	struct xe_device *xe = dsb_buf->vma->bo->tile->xe;
>> +
>>   	WARN_ON(idx > (dsb_buf->buf_size - size) / sizeof(*dsb_buf-
>>> cmd_buf));
>>   
>>   	iosys_map_memset(&dsb_buf->vma->bo->vmap, idx * 4, val,
>> size);
>> +	xe_device_l2_flush(xe);
>>   }
>>   
>>   bool intel_dsb_buffer_create(struct intel_crtc *crtc, struct
>> intel_dsb_buffer *dsb_buf, size_t size)
>> diff --git a/drivers/gpu/drm/xe/display/xe_fb_pin.c
>> b/drivers/gpu/drm/xe/display/xe_fb_pin.c
>> index 423f367c7065..d7db44e79eaf 100644
>> --- a/drivers/gpu/drm/xe/display/xe_fb_pin.c
>> +++ b/drivers/gpu/drm/xe/display/xe_fb_pin.c
>> @@ -10,6 +10,7 @@
>>   #include "intel_fb.h"
>>   #include "intel_fb_pin.h"
>>   #include "xe_bo.h"
>> +#include "xe_device.h"
>>   #include "xe_ggtt.h"
>>   #include "xe_gt.h"
>>   #include "xe_pm.h"
>> @@ -304,6 +305,8 @@ static struct i915_vma *__xe_pin_fb_vma(const
>> struct intel_framebuffer *fb,
>>   	if (ret)
>>   		goto err_unpin;
>>   
>> +	/* Ensure DPT writes are flushed */
>> +	xe_device_l2_flush(xe);
>>   	return vma;
>>   
>>   err_unpin:
>> diff --git a/drivers/gpu/drm/xe/regs/xe_gt_regs.h
>> b/drivers/gpu/drm/xe/regs/xe_gt_regs.h
>> index d44564bad009..fd9d94174efb 100644
>> --- a/drivers/gpu/drm/xe/regs/xe_gt_regs.h
>> +++ b/drivers/gpu/drm/xe/regs/xe_gt_regs.h
>> @@ -80,6 +80,9 @@
>>   #define   LE_CACHEABILITY_MASK			REG_GENMASK(1, 0)
>>   #define
>> LE_CACHEABILITY(value)		REG_FIELD_PREP(LE_CACHEABILITY_MASK, value)
>>   
>> +#define XE2_GAMREQSTRM_CTRL			XE_REG(0x4194)
>> +#define   CG_DIS_CNTLBUS			REG_BIT(6)
>> +
>>   #define CCS_AUX_INV				XE_REG(0x4208)
>>   
>>   #define VD0_AUX_INV				XE_REG(0x4218)
>> @@ -372,6 +375,11 @@
>>   
>>   #define XEHPC_L3CLOS_MASK(i)			XE_REG_MCR(0xb194 +
>> (i) * 8)
>>   
>> +#define XE2_GLOBAL_INVAL			XE_REG(0xb404)
>> +
>> +#define SCRATCH1LPFC				XE_REG(0xb474)
>> +#define   EN_L3_RW_CCS_CACHE_FLUSH		REG_BIT(0)
>> +
>>   #define XE2LPM_L3SQCREG5			XE_REG_MCR(0xb658)
>>   
>>   #define XE2_TDF_CTRL				XE_REG(0xb418)
>> diff --git a/drivers/gpu/drm/xe/xe_device.c
>> b/drivers/gpu/drm/xe/xe_device.c
>> index cfda7cb5df2c..b0f79ef6bce1 100644
>> --- a/drivers/gpu/drm/xe/xe_device.c
>> +++ b/drivers/gpu/drm/xe/xe_device.c
>> @@ -54,6 +54,9 @@
>>   #include "xe_vm.h"
>>   #include "xe_vram.h"
>>   #include "xe_wait_user_fence.h"
>> +#include "xe_wa.h"
>> +
>> +#include <generated/xe_wa_oob.h>
>>   
>>   static int xe_file_open(struct drm_device *dev, struct drm_file
>> *file)
>>   {
>> @@ -779,6 +782,11 @@ void xe_device_td_flush(struct xe_device *xe)
>>   	if (!IS_DGFX(xe) || GRAPHICS_VER(xe) < 20)
>>   		return;
>>   
>> +	if (XE_WA(xe_root_mmio_gt(xe), 16023588340)) {
>> +		xe_device_l2_flush(xe);
>> +		return;
>> +	}
>> +
>>   	for_each_gt(gt, xe, id) {
>>   		if (xe_gt_is_media_type(gt))
>>   			continue;
>> @@ -802,6 +810,28 @@ void xe_device_td_flush(struct xe_device *xe)
>>   	}
>>   }
>>   
>> +void xe_device_l2_flush(struct xe_device *xe)
>> +{
>> +	struct xe_gt *gt;
>> +	int err;
>> +
>> +	gt = xe_root_mmio_gt(xe);
>> +
>> +	if (!XE_WA(gt, 16023588340))
>> +		return;
>> +
>> +	err = xe_force_wake_get(gt_to_fw(gt), XE_FW_GT);
>> +	if (err)
>> +		return;
>> +
>> +	xe_mmio_write32(gt, XE2_GLOBAL_INVAL, 0x1);
>> +
>> +	if (xe_mmio_wait32(gt, XE2_GLOBAL_INVAL, 0x1, 0x0, 150,
>> NULL, true))
>> +		xe_gt_err_once(gt, "Global invalidation timeout\n");
>> +
>> +	xe_force_wake_put(gt_to_fw(gt), XE_FW_GT);
>> +}
>> +
>>   u32 xe_device_ccs_bytes(struct xe_device *xe, u64 size)
>>   {
>>   	return xe_device_has_flat_ccs(xe) ?
>> diff --git a/drivers/gpu/drm/xe/xe_device.h
>> b/drivers/gpu/drm/xe/xe_device.h
>> index bb07f5669dbb..0a2a3e7fd402 100644
>> --- a/drivers/gpu/drm/xe/xe_device.h
>> +++ b/drivers/gpu/drm/xe/xe_device.h
>> @@ -162,6 +162,7 @@ u64 xe_device_canonicalize_addr(struct xe_device
>> *xe, u64 address);
>>   u64 xe_device_uncanonicalize_addr(struct xe_device *xe, u64
>> address);
>>   
>>   void xe_device_td_flush(struct xe_device *xe);
>> +void xe_device_l2_flush(struct xe_device *xe);
>>   
>>   static inline bool xe_device_wedged(struct xe_device *xe)
>>   {
>> diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c
>> index 29e8ea94d05e..006d3594ba55 100644
>> --- a/drivers/gpu/drm/xe/xe_gt.c
>> +++ b/drivers/gpu/drm/xe/xe_gt.c
>> @@ -11,6 +11,8 @@
>>   #include <drm/xe_drm.h>
>>   #include <generated/xe_wa_oob.h>
>>   
>> +#include <generated/xe_wa_oob.h>
>> +
>>   #include "instructions/xe_gfxpipe_commands.h"
>>   #include "instructions/xe_mi_commands.h"
>>   #include "regs/xe_gt_regs.h"
>> @@ -95,6 +97,51 @@ void xe_gt_sanitize(struct xe_gt *gt)
>>   	gt->uc.guc.submission_state.enabled = false;
>>   }
>>   
>> +static void xe_gt_enable_host_l2_vram(struct xe_gt *gt)
>> +{
>> +	u32 reg;
>> +	int err;
>> +
>> +	if (!XE_WA(gt, 16023588340))
>> +		return;
>> +
>> +	err = xe_force_wake_get(gt_to_fw(gt), XE_FW_GT);
>> +	if (WARN_ON(err))
>> +		return;
>> +
>> +	if (!xe_gt_is_media_type(gt)) {
>> +		xe_mmio_write32(gt, SCRATCH1LPFC,
>> EN_L3_RW_CCS_CACHE_FLUSH);
>> +		reg = xe_mmio_read32(gt, XE2_GAMREQSTRM_CTRL);
>> +		reg |= CG_DIS_CNTLBUS;
>> +		xe_mmio_write32(gt, XE2_GAMREQSTRM_CTRL, reg);
>> +	}
>> +
>> +	xe_gt_mcr_multicast_write(gt, XEHPC_L3CLOS_MASK(3), 0x3);
>> +	xe_force_wake_put(gt_to_fw(gt), XE_FW_GT);
>> +}
>> +
>> +static void xe_gt_disable_host_l2_vram(struct xe_gt *gt)
>> +{
>> +	u32 reg;
>> +	int err;
>> +
>> +	if (!XE_WA(gt, 16023588340))
>> +		return;
>> +
>> +	if (xe_gt_is_media_type(gt))
>> +		return;
>> +
>> +	err = xe_force_wake_get(gt_to_fw(gt), XE_FW_GT);
>> +	if (WARN_ON(err))
>> +		return;
>> +
>> +	reg = xe_mmio_read32(gt, XE2_GAMREQSTRM_CTRL);
>> +	reg &= ~CG_DIS_CNTLBUS;
>> +	xe_mmio_write32(gt, XE2_GAMREQSTRM_CTRL, reg);
>> +
>> +	xe_force_wake_put(gt_to_fw(gt), XE_FW_GT);
>> +}
>> +
>>   /**
>>    * xe_gt_remove() - Clean up the GT structures before driver removal
>>    * @gt: the GT object
>> @@ -111,6 +158,8 @@ void xe_gt_remove(struct xe_gt *gt)
>>   
>>   	for (i = 0; i < XE_ENGINE_CLASS_MAX; ++i)
>>   		xe_hw_fence_irq_finish(&gt->fence_irq[i]);
>> +
>> +	xe_gt_disable_host_l2_vram(gt);
>>   }
>>   
>>   static void gt_reset_worker(struct work_struct *w);
>> @@ -508,6 +557,7 @@ int xe_gt_init_hwconfig(struct xe_gt *gt)
>>   
>>   	xe_gt_mcr_init_early(gt);
>>   	xe_pat_init(gt);
>> +	xe_gt_enable_host_l2_vram(gt);
>>   
>>   	err = xe_uc_init(&gt->uc);
>>   	if (err)
>> @@ -643,6 +693,8 @@ static int do_gt_restart(struct xe_gt *gt)
>>   
>>   	xe_pat_init(gt);
>>   
>> +	xe_gt_enable_host_l2_vram(gt);
>> +
>>   	xe_gt_mcr_set_implicit_defaults(gt);
>>   	xe_reg_sr_apply_mmio(&gt->reg_sr, gt);
>>   
>> @@ -796,6 +848,8 @@ int xe_gt_suspend(struct xe_gt *gt)
>>   
>>   	xe_gt_idle_disable_pg(gt);
>>   
>> +	xe_gt_disable_host_l2_vram(gt);
>> +
>>   	XE_WARN_ON(xe_force_wake_put(gt_to_fw(gt),
>> XE_FORCEWAKE_ALL));
>>   	xe_gt_dbg(gt, "suspended\n");
>>   
>> diff --git a/drivers/gpu/drm/xe/xe_pat.c
>> b/drivers/gpu/drm/xe/xe_pat.c
>> index 4ee32ee1cc88..722278cc23fc 100644
>> --- a/drivers/gpu/drm/xe/xe_pat.c
>> +++ b/drivers/gpu/drm/xe/xe_pat.c
>> @@ -7,6 +7,8 @@
>>   
>>   #include <drm/xe_drm.h>
>>   
>> +#include <generated/xe_wa_oob.h>
>> +
>>   #include "regs/xe_reg_defs.h"
>>   #include "xe_assert.h"
>>   #include "xe_device.h"
>> @@ -15,6 +17,7 @@
>>   #include "xe_gt_mcr.h"
>>   #include "xe_mmio.h"
>>   #include "xe_sriov.h"
>> +#include "xe_wa.h"
>>   
>>   #define _PAT_ATS				0x47fc
>>   #define
>> _PAT_INDEX(index)			_PICK_EVEN_2RANGES(index, 8, \
>> @@ -382,7 +385,13 @@ void xe_pat_init_early(struct xe_device *xe)
>>   	if (GRAPHICS_VER(xe) == 20) {
>>   		xe->pat.ops = &xe2_pat_ops;
>>   		xe->pat.table = xe2_pat_table;
>> -		xe->pat.n_entries = ARRAY_SIZE(xe2_pat_table);
>> +
>> +		/* Wa_16023588340. XXX: Should use XE_WA */
>> +		if (GRAPHICS_VERx100(xe) == 2001)
>> +			xe->pat.n_entries = 28; /* Disable CLOS3 */
>> +		else
>> +			xe->pat.n_entries =
>> ARRAY_SIZE(xe2_pat_table);
>> +
>>   		xe->pat.idx[XE_CACHE_NONE] = 3;
>>   		xe->pat.idx[XE_CACHE_WT] = 15;
>>   		xe->pat.idx[XE_CACHE_WB] = 2;
>> diff --git a/drivers/gpu/drm/xe/xe_wa_oob.rules
>> b/drivers/gpu/drm/xe/xe_wa_oob.rules
>> index a6b897030fde..c6d8941621c6 100644
>> --- a/drivers/gpu/drm/xe/xe_wa_oob.rules
>> +++ b/drivers/gpu/drm/xe/xe_wa_oob.rules
>> @@ -28,3 +28,4 @@
>>   		GRAPHICS_VERSION(2004)
>>   13011645652	GRAPHICS_VERSION(2004)
>>   22019338487	MEDIA_VERSION(2000)
>> +16023588340	GRAPHICS_VERSION(2001)
> 


More information about the Intel-gfx mailing list