[Intel-xe] [RFC 4/5] drm/xe/pat: annotate pat_index with coherency mode

Matthew Auld matthew.auld at intel.com
Wed Aug 30 09:32:30 UTC 2023


On 29/08/2023 22:08, Matt Roper wrote:
> On Tue, Aug 29, 2023 at 05:28:45PM +0100, Matthew Auld wrote:
>> Future uapi needs to give userspace the ability to select the pat_index
>> for a given vm_bind. However we need to be able to extract the coherency
>> mode from the provided pat_index to ensure it matches the coherency mode
>> set at object creation. There are various security reasons why this
>> matters. However the pat_index itself is very platform-specific, so it
>> seems reasonable to annotate each platform definition of the pat table.
>> On some older platforms there is no explicit coherency mode, so we just
>> pick whatever makes sense.
>>
>> Bspec: 45101, 44235 #xe
>> Bspec: 70552, 71582, 59400 #xe2
>> Signed-off-by: Matthew Auld <matthew.auld at intel.com>
>> Cc: Pallavi Mishra <pallavi.mishra at intel.com>
>> Cc: Thomas Hellström <thomas.hellstrom at linux.intel.com>
>> Cc: Joonas Lahtinen <joonas.lahtinen at linux.intel.com>
>> Cc: Lucas De Marchi <lucas.demarchi at intel.com>
>> Cc: Matt Roper <matthew.d.roper at intel.com>
>> Cc: José Roberto de Souza <jose.souza at intel.com>
>> Cc: Filip Hazubski <filip.hazubski at intel.com>
>> Cc: Carl Zhang <carl.zhang at intel.com>
>> Cc: Effie Yu <effie.yu at intel.com>
>> ---
>>   drivers/gpu/drm/xe/xe_device_types.h |  2 +-
>>   drivers/gpu/drm/xe/xe_pat.c          | 67 ++++++++++++++++------------
>>   drivers/gpu/drm/xe/xe_pat.h          |  6 +++
>>   3 files changed, 46 insertions(+), 29 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
>> index 06235da647bb..53520ae30b33 100644
>> --- a/drivers/gpu/drm/xe/xe_device_types.h
>> +++ b/drivers/gpu/drm/xe/xe_device_types.h
>> @@ -238,7 +238,7 @@ struct xe_device {
>>   		/** @enable_display: display enabled */
>>   		u8 enable_display:1;
>>   
>> -		const u32 *pat_table;
>> +		const struct xe_pat_table_entry *pat_table;
>>   		int pat_table_n_entries;
>>   
>>   #if IS_ENABLED(CONFIG_DRM_XE_DISPLAY)
>> diff --git a/drivers/gpu/drm/xe/xe_pat.c b/drivers/gpu/drm/xe/xe_pat.c
>> index f19f5d8dcd94..9e72c1b4b41f 100644
>> --- a/drivers/gpu/drm/xe/xe_pat.c
>> +++ b/drivers/gpu/drm/xe/xe_pat.c
>> @@ -4,6 +4,8 @@
>>    */
>>   
>>   
>> +#include <drm/xe_drm.h>
>> +
>>   #include "regs/xe_reg_defs.h"
>>   #include "xe_gt.h"
>>   #include "xe_gt_mcr.h"
>> @@ -33,34 +35,34 @@
>>   #define TGL_PAT_WC				REG_FIELD_PREP(TGL_MEM_TYPE_MASK, 1)
>>   #define TGL_PAT_UC				REG_FIELD_PREP(TGL_MEM_TYPE_MASK, 0)
>>   
>> -static const u32 tgl_pat_table[] = {
>> -	[0] = TGL_PAT_WB,
>> -	[1] = TGL_PAT_WC,
>> -	[2] = TGL_PAT_WT,
>> -	[3] = TGL_PAT_UC,
>> -	[4] = TGL_PAT_WB,
>> -	[5] = TGL_PAT_WB,
>> -	[6] = TGL_PAT_WB,
>> -	[7] = TGL_PAT_WB,
>> +static const struct xe_pat_table_entry tgl_pat_table[] = {
>> +	[0] = { TGL_PAT_WB, XE_GEM_COHERENCY_2WAY },
>> +	[1] = { TGL_PAT_WC, XE_GEM_COHERENCY_NONE },
>> +	[2] = { TGL_PAT_WT, XE_GEM_COHERENCY_NONE },
>> +	[3] = { TGL_PAT_UC, XE_GEM_COHERENCY_NONE },
>> +	[4] = { TGL_PAT_WB }, /* zero coh_mode to indicate invalid from userspace */
>> +	[5] = { TGL_PAT_WB },
>> +	[6] = { TGL_PAT_WB },
>> +	[7] = { TGL_PAT_WB },
> 
> Should we just not even include 4-7?  That's basically the approach
> we've taken with MTL (which has 32 entries in hardware, but we only
> provide a table of the first 5).
> 
>>   };
>>   
>> -static const u32 pvc_pat_table[] = {
>> -	[0] = TGL_PAT_UC,
>> -	[1] = TGL_PAT_WC,
>> -	[2] = TGL_PAT_WT,
>> -	[3] = TGL_PAT_WB,
>> -	[4] = PVC_PAT_CLOS(1) | TGL_PAT_WT,
>> -	[5] = PVC_PAT_CLOS(1) | TGL_PAT_WB,
>> -	[6] = PVC_PAT_CLOS(2) | TGL_PAT_WT,
>> -	[7] = PVC_PAT_CLOS(2) | TGL_PAT_WB,
>> +static const struct xe_pat_table_entry pvc_pat_table[] = {
>> +	[0] = { TGL_PAT_UC, XE_GEM_COHERENCY_NONE },
>> +	[1] = { TGL_PAT_WC, XE_GEM_COHERENCY_NONE },
>> +	[2] = { TGL_PAT_WT, XE_GEM_COHERENCY_NONE },
>> +	[3] = { TGL_PAT_WB, XE_GEM_COHERENCY_2WAY },
> 
> Is 2-way correct here?  Although GPU reads snoop the CPU cache and GPU
> writes invalidate the CPU cache, I don't think the reverse is true (CPU
> operations don't automatically snoop/invalidate the GPU cache).  So
> wouldn't this be just 1-way coherent?

Yeah, I wasn't too sure about this. The Bspec says something like:

Discrete GPUs do not support caching of system memory in the device. 
Therefore, the coherency mode that is supported in discrete GPUs is the 
one-way coherency mode, with no caching of system memory inside the 
discrete GPU. Setting Coherency Mode to 10 or 11 in discrete GPUs 
results in this same behaviour.

My interpretation is that 1way and 2way do the same thing on dgpu, so I 
figured I could cheat here and claim it's 2way, and then we don't need 
separate tables or special logic for dgpu vs igpu. I can change it to 
1way if you prefer though?
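
For reference, the way I'm imagining the uapi side consuming this
annotation is roughly the below. This is only a sketch: the bo->coh_mode
field and the vm_bind_check_pat() helper are made-up names for
illustration, not part of this series, and it assumes the
XE_GEM_COHERENCY_* values are ordered so that a numerically larger value
is never less coherent:

static int vm_bind_check_pat(struct xe_device *xe, struct xe_bo *bo,
			     u32 pat_index)
{
	u16 coh_mode;

	if (pat_index >= xe->info.pat_table_n_entries)
		return -EINVAL;

	/* zero coh_mode marks entries not exposed to userspace */
	coh_mode = xe_pat_index_get_coh_mode(xe, pat_index);
	if (!coh_mode)
		return -EINVAL;

	/* must give at least the coherency the object was created with */
	if (coh_mode < bo->coh_mode)
		return -EINVAL;

	return 0;
}

With dgpu WB annotated as 2way, a check like that would then work
unmodified for both igpu and dgpu.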

> 
> By the same logic, once we add in coherency the TGL table above probably
> isn't correct anymore for DG1/DG2 where we only have PCI snooping
> instead of a truly shared LLC cache.
> 
>> +	[4] = { PVC_PAT_CLOS(1) | TGL_PAT_WT, XE_GEM_COHERENCY_NONE },
>> +	[5] = { PVC_PAT_CLOS(1) | TGL_PAT_WB, XE_GEM_COHERENCY_2WAY },
>> +	[6] = { PVC_PAT_CLOS(2) | TGL_PAT_WT, XE_GEM_COHERENCY_NONE },
>> +	[7] = { PVC_PAT_CLOS(2) | TGL_PAT_WB, XE_GEM_COHERENCY_2WAY },
>>   };
>>   
>> -static const u32 mtl_pat_table[] = {
>> -	[0] = MTL_PAT_0_WB,
>> -	[1] = MTL_PAT_1_WT,
>> -	[2] = MTL_PAT_3_UC,
>> -	[3] = MTL_PAT_0_WB | MTL_2_COH_1W,
>> -	[4] = MTL_PAT_0_WB | MTL_3_COH_2W,
>> +static const struct xe_pat_table_entry mtl_pat_table[] = {
>> +	[0] = { MTL_PAT_0_WB, XE_GEM_COHERENCY_NONE },
>> +	[1] = { MTL_PAT_1_WT, XE_GEM_COHERENCY_NONE },
>> +	[2] = { MTL_PAT_3_UC, XE_GEM_COHERENCY_NONE },
>> +	[3] = { MTL_PAT_0_WB | MTL_2_COH_1W, XE_GEM_COHERENCY_1WAY },
>> +	[4] = { MTL_PAT_0_WB | MTL_3_COH_2W, XE_GEM_COHERENCY_2WAY },
> 
> Although this is labelled "2 way" in the bspec, the reality is that it's
> a bit of a lie.  From bspec 63884:
> 
>          "2-way Coherent (GPU <-> CPU Snooping) honored only for atomics,
>          else 1-way coh"
> 
> and also:
> 
>          "Except for system atomics, setting Coherency Mode to 10 or 11
>          results in this same one-way coherent behavior.  Full CPU-GPU
>          coherency is maintained for system atomics with Coherency Mode =
>          11, with no caching."

Ok, I didn't know that. So what should we put here for MTL_3_COH_2W? 
Just mark it as 1way? AFAICT 1way vs 2way doesn't really matter all that 
much from the KMD pov, so long as the coherency level ensures that 
swap-in and clearing can't ever be bypassed. So maybe all that matters 
here is that this matches the bspec, and we don't really care exactly 
what 2way means on a given platform? Or perhaps we don't even need the 
2way at all and can just mark stuff as NONE or AT_LEAST_1WAY?
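
If we went that route, the per-entry annotation could collapse into
something like the below (rough sketch only; the enum and helper names
here are made up, not from this series):

/*
 * From the KMD pov the only thing that really matters is whether a
 * given pat_index snoops the CPU caches at all, so the annotation
 * could be "none" vs "at least 1way" rather than mirroring the exact
 * uapi coherency values.
 */
enum xe_pat_coh_mode {
	XE_PAT_COH_NONE,
	XE_PAT_COH_AT_LEAST_1WAY,
};

static bool xe_pat_index_is_coherent(struct xe_device *xe, u32 pat_index)
{
	WARN_ON(pat_index >= xe->info.pat_table_n_entries);
	return xe->info.pat_table[pat_index].coh_mode ==
		XE_PAT_COH_AT_LEAST_1WAY;
}

The vm_bind check would then only need to ensure that a coherent object
can never be bound with a non-coherent pat_index.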

> 
> 
> Matt
> 
>>   };
>>   
>>   static const u32 xelp_pte_pat_table[XE_CACHE_LAST] = {
>> @@ -82,27 +84,35 @@ static const u32 xelpg_pte_pat_table[XE_CACHE_LAST] = {
>>   	[XE_CACHE_WB_1_WAY] = XELPG_PAT_WB_CACHE_1_WAY,
>>   };
>>   
>> +u16 xe_pat_index_get_coh_mode(struct xe_device *xe, u32 pat_index)
>> +{
>> +	WARN_ON(pat_index >= xe->info.pat_table_n_entries);
>> +	return xe->info.pat_table[pat_index].coh_mode;
>> +}
>> +
>>   unsigned int xe_pat_get_index(struct xe_device *xe, enum xe_cache_level cache)
>>   {
>>   	WARN_ON(cache >= XE_CACHE_LAST);
>>   	return (xe->pat_table).pte_pat_table[cache];
>>   }
>>   
>> -static void program_pat(struct xe_gt *gt, const u32 table[], int n_entries)
>> +static void program_pat(struct xe_gt *gt, const struct xe_pat_table_entry table[],
>> +			int n_entries)
>>   {
>>   	for (int i = 0; i < n_entries; i++) {
>>   		struct xe_reg reg = XE_REG(_PAT_INDEX(i));
>>   
>> -		xe_mmio_write32(gt, reg, table[i]);
>> +		xe_mmio_write32(gt, reg, table[i].value);
>>   	}
>>   }
>>   
>> -static void program_pat_mcr(struct xe_gt *gt, const u32 table[], int n_entries)
>> +static void program_pat_mcr(struct xe_gt *gt, const struct xe_pat_table_entry table[],
>> +			    int n_entries)
>>   {
>>   	for (int i = 0; i < n_entries; i++) {
>>   		struct xe_reg_mcr reg_mcr = XE_REG_MCR(_PAT_INDEX(i));
>>   
>> -		xe_gt_mcr_multicast_write(gt, reg_mcr, table[i]);
>> +		xe_gt_mcr_multicast_write(gt, reg_mcr, table[i].value);
>>   	}
>>   }
>>   
>> @@ -115,6 +125,7 @@ int xe_pat_fill_info(struct xe_device *xe)
>>   		xe->info.pat_table = pvc_pat_table;
>>   		xe->info.pat_table_n_entries = ARRAY_SIZE(pvc_pat_table);
>>   	} else if (GRAPHICS_VERx100(xe) <= 1210) {
>> +		WARN_ON_ONCE(!IS_DGFX(xe) && !xe->info.has_llc);
>>   		xe->info.pat_table = tgl_pat_table;
>>   		xe->info.pat_table_n_entries = ARRAY_SIZE(tgl_pat_table);
>>   	} else {
>> diff --git a/drivers/gpu/drm/xe/xe_pat.h b/drivers/gpu/drm/xe/xe_pat.h
>> index 9ab059758ad1..e2fabde2c730 100644
>> --- a/drivers/gpu/drm/xe/xe_pat.h
>> +++ b/drivers/gpu/drm/xe/xe_pat.h
>> @@ -28,9 +28,15 @@
>>   struct xe_gt;
>>   struct xe_device;
>>   
>> +struct xe_pat_table_entry {
>> +	u32 value;
>> +	u16 coh_mode;
>> +};
>> +
>>   int xe_pat_fill_info(struct xe_device *xe);
>>   void xe_pat_init(struct xe_gt *gt);
>>   void xe_pte_pat_init(struct xe_device *xe);
>>   unsigned int xe_pat_get_index(struct xe_device *xe, enum xe_cache_level cache);
>> +u16 xe_pat_index_get_coh_mode(struct xe_device *xe, u32 pat_index);
>>   
>>   #endif
>> -- 
>> 2.41.0
>>
> 

