[PATCH] drm/xe: Force write completion of MI_STORE_DATA_IMM

Thomas Hellström thomas.hellstrom at linux.intel.com
Mon Dec 23 15:14:13 UTC 2024


All,

On Wed, 2024-12-18 at 09:27 -0800, Matthew Brost wrote:
> On Wed, Dec 18, 2024 at 08:38:49AM -0700, Souza, Jose wrote:
> > On Tue, 2024-12-17 at 15:39 -0800, Matthew Brost wrote:
> > > On Tue, Dec 17, 2024 at 08:07:32AM -0800, José Roberto de Souza
> > > wrote:
> > > > With Force write completion unset there is no guarantees of
> > > > when the
> > > > write will be globally visible what is not the behavior wanted.
> > > > 
> > > 
> > > Do we want this backported? If so, maybe add a fixes?
> > 
> > Not sure, I don't have an actual issue that is fixed by this but I
> > think would be good to have it backported.
> > But what do you suggest? Add a fixes tag to the patch removing
> > force probe from LNL?
> > 
> 
> Yea fixing LNL force probe removal sounds reasonable to me.


This patch appears very odd to me. What problem is it trying to solve?
I mean in most of the places where we add a command streamer stall by
this flag we *don't* want that behaviour.

emit_store_imm_ppgtt_posted() explicitly calls out that this is a
posted write so why make it a flushed write?

emit_store_imm_gtt() has a flushing equivalent in
emit_flush_imm_ggtt().

emit_pte() is only ever used posted with an explicit flush following
the pte emits and the actual payload using the emitted ptes.

For the changes in xe_migrate.c, those writes are flushed in the
migration ring_ops.

For the oa instance, I'm not sure, but it looks like the ring ops will
be flushing there as well?

So what's the background of this patch? I'm going to leave it out of
drm-xe-fixes for now.

/Thomas


> 
> Matt
> 
> > > 
> > > Matt
> > > 
> > > > Signed-off-by: José Roberto de Souza <jose.souza at intel.com>
> > > > ---
> > > >  drivers/gpu/drm/xe/instructions/xe_mi_commands.h | 13 +++++++-
> > > > -----
> > > >  drivers/gpu/drm/xe/xe_migrate.c                  | 11
> > > > ++++++++---
> > > >  drivers/gpu/drm/xe/xe_oa.c                       |  4 +++-
> > > >  drivers/gpu/drm/xe/xe_ring_ops.c                 |  6 ++++--
> > > >  4 files changed, 22 insertions(+), 12 deletions(-)
> > > > 
> > > > diff --git a/drivers/gpu/drm/xe/instructions/xe_mi_commands.h
> > > > b/drivers/gpu/drm/xe/instructions/xe_mi_commands.h
> > > > index 10ec2920d31b3..f4ee910f09432 100644
> > > > --- a/drivers/gpu/drm/xe/instructions/xe_mi_commands.h
> > > > +++ b/drivers/gpu/drm/xe/instructions/xe_mi_commands.h
> > > > @@ -33,12 +33,13 @@
> > > >  #define MI_TOPOLOGY_FILTER		__MI_INSTR(0xD)
> > > >  #define
> > > > MI_FORCE_WAKEUP			__MI_INSTR(0x1D)
> > > >  
> > > > -#define MI_STORE_DATA_IMM		__MI_INSTR(0x20)
> > > > -#define   MI_SDI_GGTT			REG_BIT(22)
> > > > -#define   MI_SDI_LEN_DW			GENMASK(9, 0)
> > > > -#define  
> > > > MI_SDI_NUM_DW(x)		REG_FIELD_PREP(MI_SDI_LEN_DW, (x) + 3 - 2)
> > > > -#define  
> > > > MI_SDI_NUM_QW(x)		(REG_FIELD_PREP(MI_SDI_LEN_DW, 2 * (x) + 3 - 2)|\
> > > > -					 REG_BIT(21))
> > > > +#define
> > > > MI_STORE_DATA_IMM			__MI_INSTR(0x20)
> > > > +#define   MI_SDI_GGTT				REG_BIT(22)
> > > > +#define   MI_FORCE_WRITE_COMPLETION_CHECK	REG_BIT(10)
> > > > +#define  
> > > > MI_SDI_LEN_DW				GENMASK(9, 0)
> > > > +#define  
> > > > MI_SDI_NUM_DW(x)			REG_FIELD_PREP(MI_SDI_LEN_DW, (x) + 3 - 2)
> > > > +#define  
> > > > MI_SDI_NUM_QW(x)			(REG_FIELD_PREP(MI_SDI_LEN_DW, 2 * (x) + 3 -2)|\
> > > > +						 REG_BIT(21))
> > > >  
> > > >  #define MI_LOAD_REGISTER_IMM		__MI_INSTR(0x22)
> > > >  #define   MI_LRI_LRM_CS_MMIO		REG_BIT(19)
> > > > diff --git a/drivers/gpu/drm/xe/xe_migrate.c
> > > > b/drivers/gpu/drm/xe/xe_migrate.c
> > > > index 1b97d90aaddaf..8b32fad678782 100644
> > > > --- a/drivers/gpu/drm/xe/xe_migrate.c
> > > > +++ b/drivers/gpu/drm/xe/xe_migrate.c
> > > > @@ -581,7 +581,9 @@ static void emit_pte(struct xe_migrate *m,
> > > >  	while (ptes) {
> > > >  		u32 chunk = min(MAX_PTE_PER_SDI, ptes);
> > > >  
> > > > -		bb->cs[bb->len++] = MI_STORE_DATA_IMM |
> > > > MI_SDI_NUM_QW(chunk);
> > > > +		bb->cs[bb->len++] = MI_STORE_DATA_IMM |
> > > > +				   
> > > > MI_FORCE_WRITE_COMPLETION_CHECK |
> > > > +				    MI_SDI_NUM_QW(chunk);
> > > >  		bb->cs[bb->len++] = ofs;
> > > >  		bb->cs[bb->len++] = 0;
> > > >  
> > > > @@ -1223,7 +1225,9 @@ static void write_pgtable(struct xe_tile
> > > > *tile, struct xe_bb *bb, u64 ppgtt_ofs,
> > > >  		if (!(bb->len & 1))
> > > >  			bb->cs[bb->len++] = MI_NOOP;
> > > >  
> > > > -		bb->cs[bb->len++] = MI_STORE_DATA_IMM |
> > > > MI_SDI_NUM_QW(chunk);
> > > > +		bb->cs[bb->len++] = MI_STORE_DATA_IMM |
> > > > +				   
> > > > MI_FORCE_WRITE_COMPLETION_CHECK |
> > > > +				    MI_SDI_NUM_QW(chunk);
> > > >  		bb->cs[bb->len++] = lower_32_bits(addr);
> > > >  		bb->cs[bb->len++] = upper_32_bits(addr);
> > > >  		if (pt_op->bind)
> > > > @@ -1388,7 +1392,8 @@ __xe_migrate_update_pgtables(struct
> > > > xe_migrate *m,
> > > >  			u32 idx = 0;
> > > >  
> > > >  			bb->cs[bb->len++] = MI_STORE_DATA_IMM
> > > > |
> > > > -				MI_SDI_NUM_QW(chunk);
> > > > +					   
> > > > MI_FORCE_WRITE_COMPLETION_CHECK |
> > > > +					   
> > > > MI_SDI_NUM_QW(chunk);
> > > >  			bb->cs[bb->len++] = ofs;
> > > >  			bb->cs[bb->len++] = 0; /*
> > > > upper_32_bits */
> > > >  
> > > > diff --git a/drivers/gpu/drm/xe/xe_oa.c
> > > > b/drivers/gpu/drm/xe/xe_oa.c
> > > > index 56bf375a9d4bc..ae94490b0eac8 100644
> > > > --- a/drivers/gpu/drm/xe/xe_oa.c
> > > > +++ b/drivers/gpu/drm/xe/xe_oa.c
> > > > @@ -690,7 +690,9 @@ static void xe_oa_store_flex(struct
> > > > xe_oa_stream *stream, struct xe_lrc *lrc,
> > > >  	u32 offset = xe_bo_ggtt_addr(lrc->bo);
> > > >  
> > > >  	do {
> > > > -		bb->cs[bb->len++] = MI_STORE_DATA_IMM |
> > > > MI_SDI_GGTT | MI_SDI_NUM_DW(1);
> > > > +		bb->cs[bb->len++] = MI_STORE_DATA_IMM |
> > > > MI_SDI_GGTT |
> > > > +				   
> > > > MI_FORCE_WRITE_COMPLETION_CHECK |
> > > > +				    MI_SDI_NUM_DW(1);
> > > >  		bb->cs[bb->len++] = offset + flex->offset *
> > > > sizeof(u32);
> > > >  		bb->cs[bb->len++] = 0;
> > > >  		bb->cs[bb->len++] = flex->value;
> > > > diff --git a/drivers/gpu/drm/xe/xe_ring_ops.c
> > > > b/drivers/gpu/drm/xe/xe_ring_ops.c
> > > > index 0be4f489d3e12..3a75a08b6be92 100644
> > > > --- a/drivers/gpu/drm/xe/xe_ring_ops.c
> > > > +++ b/drivers/gpu/drm/xe/xe_ring_ops.c
> > > > @@ -72,7 +72,8 @@ static int emit_user_interrupt(u32 *dw, int
> > > > i)
> > > >  
> > > >  static int emit_store_imm_ggtt(u32 addr, u32 value, u32 *dw,
> > > > int i)
> > > >  {
> > > > -	dw[i++] = MI_STORE_DATA_IMM | MI_SDI_GGTT |
> > > > MI_SDI_NUM_DW(1);
> > > > +	dw[i++] = MI_STORE_DATA_IMM | MI_SDI_GGTT |
> > > > +		  MI_FORCE_WRITE_COMPLETION_CHECK |
> > > > MI_SDI_NUM_DW(1);
> > > >  	dw[i++] = addr;
> > > >  	dw[i++] = 0;
> > > >  	dw[i++] = value;
> > > > @@ -162,7 +163,8 @@ static int emit_pipe_invalidate(u32
> > > > mask_flags, bool invalidate_tlb, u32 *dw,
> > > >  static int emit_store_imm_ppgtt_posted(u64 addr, u64 value,
> > > >  				       u32 *dw, int i)
> > > >  {
> > > > -	dw[i++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(1);
> > > > +	dw[i++] = MI_STORE_DATA_IMM |
> > > > MI_FORCE_WRITE_COMPLETION_CHECK |
> > > > +		  MI_SDI_NUM_QW(1);
> > > >  	dw[i++] = lower_32_bits(addr);
> > > >  	dw[i++] = upper_32_bits(addr);
> > > >  	dw[i++] = lower_32_bits(value);
> > > > -- 
> > > > 2.47.1
> > > > 
> > 



More information about the Intel-xe mailing list