[PATCH v3] drm/amdgpu: Add EXT_COHERENCE memory allocation flags

Felix Kuehling felix.kuehling at amd.com
Thu Jul 27 22:01:30 UTC 2023


In amdgpu_dma_buf_create_obj we copy the coherence-related flags to the 
SG BO that's used to attach the BO to the importer device. You need to 
add the new flag to the list.

Some more nit-picks inline.

Am 2023-07-26 um 09:34 schrieb David Francis:
> These flags (for GEM and SVM allocations) allocate
> memory that allows for system-scope atomic semantics.
>
> On GFX943 these flags cause caches to be avoided on
> non-local memory.
>
> On all other ASICs they are identical in functionality to the
> equivalent COHERENT flags.
>
> Corresponding Thunk patch is at
> https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface/pull/88
>
> Signed-off-by: David Francis <David.Francis at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c |  2 ++
>   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c           |  1 +
>   drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c           |  1 +
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c            |  5 ++++-
>   drivers/gpu/drm/amd/amdkfd/kfd_svm.c             | 10 +++++++++-
>   include/uapi/drm/amdgpu_drm.h                    |  7 +++++++
>   include/uapi/linux/kfd_ioctl.h                   |  3 +++
>   7 files changed, 27 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> index d34c3ef8f3ed..7f23bc0ee592 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> @@ -1738,6 +1738,8 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
>   
>   	if (flags & KFD_IOC_ALLOC_MEM_FLAGS_COHERENT)
>   		alloc_flags |= AMDGPU_GEM_CREATE_COHERENT;
> +	if (flags & KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENCE)
> +		alloc_flags |= AMDGPU_GEM_CREATE_EXT_COHERENCE;
>   	if (flags & KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED)
>   		alloc_flags |= AMDGPU_GEM_CREATE_UNCACHED;
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index 6b430e10d38e..8e951688668b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -632,6 +632,7 @@ static void gmc_v10_0_get_vm_pte(struct amdgpu_device *adev,
>   	}
>   
>   	if (bo && bo->flags & (AMDGPU_GEM_CREATE_COHERENT |
> +			       AMDGPU_GEM_CREATE_EXT_COHERENCE |
>   			       AMDGPU_GEM_CREATE_UNCACHED))
>   		*flags = (*flags & ~AMDGPU_PTE_MTYPE_NV10_MASK) |
>   			 AMDGPU_PTE_MTYPE_NV10(MTYPE_UC);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
> index a6ee0220db56..ff330c7c0232 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
> @@ -540,6 +540,7 @@ static void gmc_v11_0_get_vm_pte(struct amdgpu_device *adev,
>   	}
>   
>   	if (bo && bo->flags & (AMDGPU_GEM_CREATE_COHERENT |
> +			       AMDGPU_GEM_CREATE_EXT_COHERENCE |
>   			       AMDGPU_GEM_CREATE_UNCACHED))
>   		*flags = (*flags & ~AMDGPU_PTE_MTYPE_NV10_MASK) |
>   			 AMDGPU_PTE_MTYPE_NV10(MTYPE_UC);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 880460cd3239..e40fcfc1a3f3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -1183,7 +1183,8 @@ static void gmc_v9_0_get_coherence_flags(struct amdgpu_device *adev,
>   {
>   	struct amdgpu_device *bo_adev = amdgpu_ttm_adev(bo->tbo.bdev);
>   	bool is_vram = bo->tbo.resource->mem_type == TTM_PL_VRAM;
> -	bool coherent = bo->flags & AMDGPU_GEM_CREATE_COHERENT;
> +	bool coherent = bo->flags & (AMDGPU_GEM_CREATE_COHERENT | AMDGPU_GEM_CREATE_EXT_COHERENCE);
> +	bool ext_coherence = bo->flags & AMDGPU_GEM_CREATE_EXT_COHERENCE;
>   	bool uncached = bo->flags & AMDGPU_GEM_CREATE_UNCACHED;
>   	struct amdgpu_vm *vm = mapping->bo_va->base.vm;
>   	unsigned int mtype_local, mtype;
> @@ -1251,6 +1252,8 @@ static void gmc_v9_0_get_coherence_flags(struct amdgpu_device *adev,
>   		snoop = true;
>   		if (uncached) {
>   			mtype = MTYPE_UC;
> +		} else if (ext_coherence) {
> +			mtype = is_local ? MTYPE_CC : MTYPE_UC;
>   		} else if (adev->flags & AMD_IS_APU) {
>   			mtype = is_local ? mtype_local : MTYPE_NC;
>   		} else {
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> index 1b50eae051a4..28304b93a990 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> @@ -1155,7 +1155,8 @@ svm_range_get_pte_flags(struct kfd_node *node,
>   	uint32_t mapping_flags = 0;
>   	uint64_t pte_flags;
>   	bool snoop = (domain != SVM_RANGE_VRAM_DOMAIN);
> -	bool coherent = flags & KFD_IOCTL_SVM_FLAG_COHERENT;
> +	bool coherent = flags & (KFD_IOCTL_SVM_FLAG_COHERENT | KFD_IOCTL_SVM_FLAG_EXT_COHERENCE);
> +	bool ext_coherence = flags & KFD_IOCTL_SVM_FLAG_EXT_COHERENCE;
>   	bool uncached = false; /*flags & KFD_IOCTL_SVM_FLAG_UNCACHED;*/
>   	unsigned int mtype_local;
>   
> @@ -1203,6 +1204,13 @@ svm_range_get_pte_flags(struct kfd_node *node,
>   		snoop = true;
>   		if (uncached) {
>   			mapping_flags |= AMDGPU_VM_MTYPE_UC;
> +		} else if (ext_coherence) {
> +			/* local HBM region close to partition */
> +			if (bo_node->adev == node->adev &&
> +			    (!bo_node->xcp || !node->xcp || bo_node->xcp->mem_id == node->xcp->mem_id))
> +				mapping_flags |= AMDGPU_VM_MTYPE_CC;
> +			else
> +				mapping_flags |= AMDGPU_VM_MTYPE_UC;
>   		} else if (domain == SVM_RANGE_VRAM_DOMAIN) {
>   			/* local HBM region close to partition */
>   			if (bo_node->adev == node->adev &&
> diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
> index 79b14828d542..d67102dc214a 100644
> --- a/include/uapi/drm/amdgpu_drm.h
> +++ b/include/uapi/drm/amdgpu_drm.h
> @@ -158,6 +158,13 @@ extern "C" {
>    * may override the MTYPE selected in AMDGPU_VA_OP_MAP.
>    */
>   #define AMDGPU_GEM_CREATE_UNCACHED		(1 << 14)
> +/* Flag that BO should be coherent across devices when using device-level
> + * atomics. May depend on GPU instructions to flush caches explicitly.
> + *
> + * This influences the choice of MTYPE in the PTEs on GFXv9 and later GPUs and
> + * may override the MTYPE selected in AMDGPU_VA_OP_MAP.
> + */
> +#define AMDGPU_GEM_CREATE_EXT_COHERENCE		(1 << 15)

The name of this flag is a noun, which is inconsistent with the existing 
flag AMDGPU_GEM_CREATE_COHERENT (adjective). Please change the name to 
KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT

The comment should say something about how these two flags differ. I'd 
update the comment for AMDGPU_GEM_CREATE_COHERENT to say "May depend on 
GPU instructions to flush caches to system scope explicitly". And this 
one to say "May depend on GPU instructions to flush caches to device 
scope explicitly, promoting them to system scope automatically".


>   
>   struct drm_amdgpu_gem_create_in  {
>   	/** the requested memory size */
> diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
> index eeb2fdcbdcb7..dd8f0dba7631 100644
> --- a/include/uapi/linux/kfd_ioctl.h
> +++ b/include/uapi/linux/kfd_ioctl.h
> @@ -405,6 +405,7 @@ struct kfd_ioctl_acquire_vm_args {
>   #define KFD_IOC_ALLOC_MEM_FLAGS_AQL_QUEUE_MEM	(1 << 27)
>   #define KFD_IOC_ALLOC_MEM_FLAGS_COHERENT	(1 << 26)
>   #define KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED	(1 << 25)
> +#define KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENCE	(1 << 24)

..._COHERENT


>   
>   /* Allocate memory for later SVM (shared virtual memory) mapping.
>    *
> @@ -659,6 +660,8 @@ enum kfd_mmio_remap {
>   #define KFD_IOCTL_SVM_FLAG_GPU_READ_MOSTLY     0x00000020
>   /* Keep GPU memory mapping always valid as if XNACK is disable */
>   #define KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED   0x00000040
> +/* Fine grained coherency between all devices using device-scope atomics */
> +#define KFD_IOCTL_SVM_FLAG_EXT_COHERENCE       0x00000080

..._COHERENT

Regards,
   Felix


>   
>   /**
>    * kfd_ioctl_svm_op - SVM ioctl operations


More information about the amd-gfx mailing list