[RFC PATCH] drm/amdgpu: Set MTYPE in PTE based on BO flags

Thu Sep 8 06:10:58 UTC 2022

Am 07.09.22 um 02:02 schrieb Felix Kuehling:
> The same BO may need different MTYPEs and SNOOP flags in PTEs depending
> on its current location relative to the mappint GPU. Setting MTYPEs from
> clients ahead of time is not practical for coherent memory sharing.
> Instead determine the correct MTYPE for the desired coherence model and
> current BO location when updating the page tables.
>
> To maintain backwards compatibility with MTYPE-selection in
> AMDGPU_VA_OP_MAP, the coherence-model-based MTYPE selection is only
> applied if it chooses an MTYPE other than MTYPE_NC (the default).
>
> Add two AMDGPU_GEM_CREATE_... flags to indicate the coherence model. The
> default if no flag is specified is non-coherent (i.e. coarse-grained
> coherent at dispatch boundaries).
>
> Update amdgpu_amdkfd_gpuvm.c to use this new method to choose the
> correct MTYPE depending on the current memory location.

Only skimmed over this but in general the solution looks really clean now.

One thing that I've noticed is that you don't NULL check the BO pointer 
in the mapping. For PRTs there is no BO backing a mapping and this will 
just crash.

Apart from that I need to take a closer look when I've found my glasses 
and have a bit more time.

Regards,
Christian.

>
> Suggested-by: Christian König <christian.koenig at amd.com>
> Signed-off-by: Felix Kuehling <Felix.Kuehling at amd.com>
> ---
>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 59 ++------------
>   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c        |  7 ++
>   drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c        |  7 ++
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c         | 77 +++++++++++++++++--
>   include/uapi/drm/amdgpu_drm.h                 | 14 ++++
>   5 files changed, 105 insertions(+), 59 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> index cbd593f7d553..19d72d323355 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> @@ -404,63 +404,15 @@ static int vm_update_pds(struct amdgpu_vm *vm, struct amdgpu_sync *sync)
>   
>   static uint64_t get_pte_flags(struct amdgpu_device *adev, struct kgd_mem *mem)
>   {
> -	struct amdgpu_device *bo_adev = amdgpu_ttm_adev(mem->bo->tbo.bdev);
> -	bool coherent = mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_COHERENT;
> -	bool uncached = mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED;
> -	uint32_t mapping_flags;
> -	uint64_t pte_flags;
> -	bool snoop = false;
> +	uint32_t mapping_flags = AMDGPU_VM_PAGE_READABLE |
> +				 AMDGPU_VM_MTYPE_DEFAULT;
>   
> -	mapping_flags = AMDGPU_VM_PAGE_READABLE;
>   	if (mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE)
>   		mapping_flags |= AMDGPU_VM_PAGE_WRITEABLE;
>   	if (mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE)
>   		mapping_flags |= AMDGPU_VM_PAGE_EXECUTABLE;
>   
> -	switch (adev->asic_type) {
> -	case CHIP_ARCTURUS:
> -	case CHIP_ALDEBARAN:
> -		if (mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
> -			if (bo_adev == adev) {
> -				if (uncached)
> -					mapping_flags |= AMDGPU_VM_MTYPE_UC;
> -				else if (coherent)
> -					mapping_flags |= AMDGPU_VM_MTYPE_CC;
> -				else
> -					mapping_flags |= AMDGPU_VM_MTYPE_RW;
> -				if (adev->asic_type == CHIP_ALDEBARAN &&
> -				    adev->gmc.xgmi.connected_to_cpu)
> -					snoop = true;
> -			} else {
> -				if (uncached || coherent)
> -					mapping_flags |= AMDGPU_VM_MTYPE_UC;
> -				else
> -					mapping_flags |= AMDGPU_VM_MTYPE_NC;
> -				if (amdgpu_xgmi_same_hive(adev, bo_adev))
> -					snoop = true;
> -			}
> -		} else {
> -			if (uncached || coherent)
> -				mapping_flags |= AMDGPU_VM_MTYPE_UC;
> -			else
> -				mapping_flags |= AMDGPU_VM_MTYPE_NC;
> -			snoop = true;
> -		}
> -		break;
> -	default:
> -		if (uncached || coherent)
> -			mapping_flags |= AMDGPU_VM_MTYPE_UC;
> -		else
> -			mapping_flags |= AMDGPU_VM_MTYPE_NC;
> -
> -		if (!(mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM))
> -			snoop = true;
> -	}
> -
> -	pte_flags = amdgpu_gem_va_map_flags(adev, mapping_flags);
> -	pte_flags |= snoop ? AMDGPU_PTE_SNOOPED : 0;
> -
> -	return pte_flags;
> +	return amdgpu_gem_va_map_flags(adev, mapping_flags);
>   }
>   
>   /**
> @@ -1670,6 +1622,11 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
>   		}
>   	}
>   
> +	if (flags & KFD_IOC_ALLOC_MEM_FLAGS_COHERENT)
> +		alloc_flags |= AMDGPU_GEM_CREATE_COHERENT;
> +	if (flags & KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED)
> +		alloc_flags |= AMDGPU_GEM_CREATE_UNCACHED;
> +
>   	*mem = kzalloc(sizeof(struct kgd_mem), GFP_KERNEL);
>   	if (!*mem) {
>   		ret = -ENOMEM;
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index f513e2c2e964..f14db7d3c33d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -612,6 +612,8 @@ static void gmc_v10_0_get_vm_pte(struct amdgpu_device *adev,
>   				 struct amdgpu_bo_va_mapping *mapping,
>   				 uint64_t *flags)
>   {
> +	struct amdgpu_bo *bo = mapping->bo_va->base.bo;
> +
>   	*flags &= ~AMDGPU_PTE_EXECUTABLE;
>   	*flags |= mapping->flags & AMDGPU_PTE_EXECUTABLE;
>   
> @@ -628,6 +630,11 @@ static void gmc_v10_0_get_vm_pte(struct amdgpu_device *adev,
>   		*flags |= AMDGPU_PTE_SYSTEM;
>   		*flags &= ~AMDGPU_PTE_VALID;
>   	}
> +
> +	if (bo->flags & (AMDGPU_GEM_CREATE_COHERENT |
> +			 AMDGPU_GEM_CREATE_UNCACHED))
> +		*flags = (*flags & ~AMDGPU_PTE_MTYPE_NV10_MASK) |
> +			 AMDGPU_PTE_MTYPE_NV10(MTYPE_UC);
>   }
>   
>   static unsigned gmc_v10_0_get_vbios_fb_size(struct amdgpu_device *adev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
> index 846ccb6cf07d..6ff4de7d8709 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
> @@ -499,6 +499,8 @@ static void gmc_v11_0_get_vm_pte(struct amdgpu_device *adev,
>   				 struct amdgpu_bo_va_mapping *mapping,
>   				 uint64_t *flags)
>   {
> +	struct amdgpu_bo *bo = mapping->bo_va->base.bo;
> +
>   	*flags &= ~AMDGPU_PTE_EXECUTABLE;
>   	*flags |= mapping->flags & AMDGPU_PTE_EXECUTABLE;
>   
> @@ -515,6 +517,11 @@ static void gmc_v11_0_get_vm_pte(struct amdgpu_device *adev,
>   		*flags |= AMDGPU_PTE_SYSTEM;
>   		*flags &= ~AMDGPU_PTE_VALID;
>   	}
> +
> +	if (bo->flags & (AMDGPU_GEM_CREATE_COHERENT |
> +			 AMDGPU_GEM_CREATE_UNCACHED))
> +		*flags = (*flags & AMDGPU_PTE_MTYPE_NV10_MASK) |
> +			 AMDGPU_PTE_MTYPE_NV10(MTYPE_UC);
>   }
>   
>   static unsigned gmc_v11_0_get_vbios_fb_size(struct amdgpu_device *adev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 4603653916f5..58a2d1c8a188 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -1110,6 +1110,74 @@ static void gmc_v9_0_get_vm_pde(struct amdgpu_device *adev, int level,
>   	}
>   }
>   
> +static void gmc_v9_0_get_coherence_flags(struct amdgpu_device *adev,
> +					 struct amdgpu_bo_va_mapping *mapping,
> +					 uint64_t *flags)
> +{
> +	struct amdgpu_bo *bo = mapping->bo_va->base.bo;
> +	struct amdgpu_device *bo_adev = amdgpu_ttm_adev(bo->tbo.bdev);
> +	bool is_vram = bo->tbo.resource->mem_type == TTM_PL_VRAM;
> +	bool coherent = bo->flags & AMDGPU_GEM_CREATE_COHERENT;
> +	bool uncached = bo->flags & AMDGPU_GEM_CREATE_UNCACHED;
> +	unsigned int mtype;
> +	bool snoop = false;
> +
> +	switch (adev->ip_versions[GC_HWIP][0]) {
> +	case IP_VERSION(9, 4, 1):
> +	case IP_VERSION(9, 4, 2):
> +		if (is_vram) {
> +			if (bo_adev == adev) {
> +				if (uncached)
> +					mtype = MTYPE_UC;
> +				else if (coherent)
> +					mtype = MTYPE_CC;
> +				else
> +					mtype = MTYPE_RW;
> +				/* FIXME: is this still needed? Or does
> +				 * amdgpu_ttm_tt_pde_flags already handle this?
> +				 */
> +				if (adev->ip_versions[GC_HWIP][0] ==
> +					IP_VERSION(9, 4, 2) &&
> +				    adev->gmc.xgmi.connected_to_cpu)
> +					snoop = true;
> +			} else {
> +				if (uncached || coherent)
> +					mtype = MTYPE_UC;
> +				else
> +					mtype = MTYPE_NC;
> +				if (mapping->bo_va->is_xgmi)
> +					snoop = true;
> +			}
> +		} else {
> +			if (uncached || coherent)
> +				mtype = MTYPE_UC;
> +			else
> +				mtype = MTYPE_NC;
> +			/* FIXME: is this still needed? Or does
> +			 * amdgpu_ttm_tt_pde_flags already handle this?
> +			 */
> +			snoop = true;
> +		}
> +		break;
> +	default:
> +		if (uncached || coherent)
> +			mtype = MTYPE_UC;
> +		else
> +			mtype = MTYPE_NC;
> +
> +		/* FIXME: is this still needed? Or does
> +		 * amdgpu_ttm_tt_pde_flags already handle this?
> +		 */
> +		if (!is_vram)
> +			snoop = true;
> +	}
> +
> +	if (mtype != MTYPE_NC)
> +		*flags = (*flags & ~AMDGPU_PTE_MTYPE_VG10_MASK) |
> +			 AMDGPU_PTE_MTYPE_VG10(mtype);
> +	*flags |= snoop ? AMDGPU_PTE_SNOOPED : 0;
> +}
> +
>   static void gmc_v9_0_get_vm_pte(struct amdgpu_device *adev,
>   				struct amdgpu_bo_va_mapping *mapping,
>   				uint64_t *flags)
> @@ -1125,14 +1193,7 @@ static void gmc_v9_0_get_vm_pte(struct amdgpu_device *adev,
>   		*flags &= ~AMDGPU_PTE_VALID;
>   	}
>   
> -	if ((adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 1) ||
> -	     adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2)) &&
> -	    !(*flags & AMDGPU_PTE_SYSTEM) &&
> -	    mapping->bo_va->is_xgmi)
> -		*flags |= AMDGPU_PTE_SNOOPED;
> -
> -	if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2))
> -		*flags |= mapping->flags & AMDGPU_PTE_SNOOPED;
> +	gmc_v9_0_get_coherence_flags(adev, mapping, flags);
>   }
>   
>   static unsigned gmc_v9_0_get_vbios_fb_size(struct amdgpu_device *adev)
> diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
> index c2c9c674a223..f9bdc4f388ef 100644
> --- a/include/uapi/drm/amdgpu_drm.h
> +++ b/include/uapi/drm/amdgpu_drm.h
> @@ -144,6 +144,20 @@ extern "C" {
>    * content.
>    */
>   #define AMDGPU_GEM_CREATE_DISCARDABLE		(1 << 12)
> +/* Flag that BO is shared coherently between multiple devices or CPU threads.
> + * May depend on GPU instructions to flush caches explicitly
> + *
> + * This influences the choice of MTYPE in the PTEs on GFXv9 and later GPUs and
> + * may override the MTYPE selected in AMDGPU_VA_OP_MAP.
> + */
> +#define AMDGPU_GEM_CREATE_COHERENT		(1 << 13)
> +/* Flag that BO should not be cached by GPU. Coherent without having to flush
> + * GPU caches explicitly
> + *
> + * This influences the choice of MTYPE in the PTEs on GFXv9 and later GPUs and
> + * may override the MTYPE selected in AMDGPU_VA_OP_MAP.
> + */
> +#define AMDGPU_GEM_CREATE_UNCACHED		(1 << 14)
>   
>   struct drm_amdgpu_gem_create_in  {
>   	/** the requested memory size */