[PATCH v3] drm/amdgpu: Add EXT_COHERENT memory allocation flags

Felix Kuehling felix.kuehling at amd.com
Thu Sep 7 20:15:11 UTC 2023


I'd like to see checks for nonsensical flag combinations in the kernel 
mode driver as well. We can only make some of those combinations valid 
and meaningful in the future, if we implement well defined behaviour 
(i.e. return an error) now. Otherwise we risk breaking misbehaving user 
mode applications expecting a particular undefined behaviour later on.

Regards,
   Felix


On 2023-09-06 15:25, Yat Sin, David wrote:
> [AMD Official Use Only - General]
>
> Reviewed-by: David Yat Sin <David.YatSin at amd.com>
>
>> -----Original Message-----
>> From: Kuehling, Felix <Felix.Kuehling at amd.com>
>> Sent: Friday, July 28, 2023 4:00 PM
>> To: Francis, David <David.Francis at amd.com>; amd-gfx at lists.freedesktop.org;
>> Yat Sin, David <David.YatSin at amd.com>
>> Subject: Re: [PATCH v3] drm/amdgpu: Add EXT_COHERENT memory allocation
>> flags
>>
>> On 2023-07-28 15:39, David Francis wrote:
>>> These flags (for GEM and SVM allocations) allocate memory that allows
>>> for system-scope atomic semantics.
>>>
>>> On GFX943 these flags cause caches to be avoided on non-local memory.
>>>
>>> On all other ASICs they are identical in functionality to the
>>> equivalent COHERENT flags.
>>>
>>> Corresponding Thunk patch is at
>>> https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface/pull/88
>>>
>>> v3: changed name of flag
>>>
>>> Signed-off-by: David Francis <David.Francis at amd.com>
>> I made one comment on the user mode patch regarding the explicit handling
>> of invalid combinations of Uncached, Coherent, ExtCoherent flags. I'm not
>> sure what we agreed on any more. But I don't think we want to just leave it up
>> to chance. Other than that, this patch looks good to me.
>>
>> Regards,
>>     Felix
>>
>>
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c |  2 ++
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c      |  1 +
>>>    drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c           |  1 +
>>>    drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c           |  1 +
>>>    drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c            |  5 ++++-
>>>    drivers/gpu/drm/amd/amdkfd/kfd_svm.c             | 10 +++++++++-
>>>    include/uapi/drm/amdgpu_drm.h                    | 10 +++++++++-
>>>    include/uapi/linux/kfd_ioctl.h                   |  3 +++
>>>    8 files changed, 30 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>> index d34c3ef8f3ed..a1ce261f2d06 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>> @@ -1738,6 +1738,8 @@ int
>> amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
>>>      if (flags & KFD_IOC_ALLOC_MEM_FLAGS_COHERENT)
>>>              alloc_flags |= AMDGPU_GEM_CREATE_COHERENT;
>>> +   if (flags & KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT)
>>> +           alloc_flags |= AMDGPU_GEM_CREATE_EXT_COHERENT;
>>>      if (flags & KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED)
>>>              alloc_flags |= AMDGPU_GEM_CREATE_UNCACHED;
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
>>> index 12210598e5b8..76b618735dc0 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
>>> @@ -331,6 +331,7 @@ amdgpu_dma_buf_create_obj(struct drm_device
>> *dev,
>>> struct dma_buf *dma_buf)
>>>
>>>              flags |= other->flags &
>> (AMDGPU_GEM_CREATE_CPU_GTT_USWC |
>>>                                       AMDGPU_GEM_CREATE_COHERENT
>> |
>>> +
>> AMDGPU_GEM_CREATE_EXT_COHERENT |
>> AMDGPU_GEM_CREATE_UNCACHED);
>>>      }
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>> index 6b430e10d38e..301ffe30824f 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>> @@ -632,6 +632,7 @@ static void gmc_v10_0_get_vm_pte(struct
>> amdgpu_device *adev,
>>>      }
>>>
>>>      if (bo && bo->flags & (AMDGPU_GEM_CREATE_COHERENT |
>>> +                          AMDGPU_GEM_CREATE_EXT_COHERENT |
>>>                             AMDGPU_GEM_CREATE_UNCACHED))
>>>              *flags = (*flags & ~AMDGPU_PTE_MTYPE_NV10_MASK) |
>>>                       AMDGPU_PTE_MTYPE_NV10(MTYPE_UC); diff --git
>>> a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
>>> index a6ee0220db56..846894e212e7 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
>>> @@ -540,6 +540,7 @@ static void gmc_v11_0_get_vm_pte(struct
>> amdgpu_device *adev,
>>>      }
>>>
>>>      if (bo && bo->flags & (AMDGPU_GEM_CREATE_COHERENT |
>>> +                          AMDGPU_GEM_CREATE_EXT_COHERENT |
>>>                             AMDGPU_GEM_CREATE_UNCACHED))
>>>              *flags = (*flags & ~AMDGPU_PTE_MTYPE_NV10_MASK) |
>>>                       AMDGPU_PTE_MTYPE_NV10(MTYPE_UC); diff --git
>>> a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> index 880460cd3239..92a623e130d9 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> @@ -1183,7 +1183,8 @@ static void gmc_v9_0_get_coherence_flags(struct
>> amdgpu_device *adev,
>>>    {
>>>      struct amdgpu_device *bo_adev = amdgpu_ttm_adev(bo->tbo.bdev);
>>>      bool is_vram = bo->tbo.resource->mem_type == TTM_PL_VRAM;
>>> -   bool coherent = bo->flags & AMDGPU_GEM_CREATE_COHERENT;
>>> +   bool coherent = bo->flags & (AMDGPU_GEM_CREATE_COHERENT |
>> AMDGPU_GEM_CREATE_EXT_COHERENT);
>>> +   bool ext_coherent = bo->flags &
>> AMDGPU_GEM_CREATE_EXT_COHERENT;
>>>      bool uncached = bo->flags & AMDGPU_GEM_CREATE_UNCACHED;
>>>      struct amdgpu_vm *vm = mapping->bo_va->base.vm;
>>>      unsigned int mtype_local, mtype;
>>> @@ -1251,6 +1252,8 @@ static void gmc_v9_0_get_coherence_flags(struct
>> amdgpu_device *adev,
>>>              snoop = true;
>>>              if (uncached) {
>>>                      mtype = MTYPE_UC;
>>> +           } else if (ext_coherent) {
>>> +                   mtype = is_local ? MTYPE_CC : MTYPE_UC;
>>>              } else if (adev->flags & AMD_IS_APU) {
>>>                      mtype = is_local ? mtype_local : MTYPE_NC;
>>>              } else {
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
>>> index 1b50eae051a4..e9ffcfc5dedc 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
>>> @@ -1155,7 +1155,8 @@ svm_range_get_pte_flags(struct kfd_node
>> *node,
>>>      uint32_t mapping_flags = 0;
>>>      uint64_t pte_flags;
>>>      bool snoop = (domain != SVM_RANGE_VRAM_DOMAIN);
>>> -   bool coherent = flags & KFD_IOCTL_SVM_FLAG_COHERENT;
>>> +   bool coherent = flags & (KFD_IOCTL_SVM_FLAG_COHERENT |
>> KFD_IOCTL_SVM_FLAG_EXT_COHERENT);
>>> +   bool ext_coherent = flags & KFD_IOCTL_SVM_FLAG_EXT_COHERENT;
>>>      bool uncached = false; /*flags &
>> KFD_IOCTL_SVM_FLAG_UNCACHED;*/
>>>      unsigned int mtype_local;
>>>
>>> @@ -1203,6 +1204,13 @@ svm_range_get_pte_flags(struct kfd_node
>> *node,
>>>              snoop = true;
>>>              if (uncached) {
>>>                      mapping_flags |= AMDGPU_VM_MTYPE_UC;
>>> +           } else if (ext_coherent) {
>>> +                   /* local HBM region close to partition */
>>> +                   if (bo_node->adev == node->adev &&
>>> +                       (!bo_node->xcp || !node->xcp || bo_node->xcp-
>>> mem_id == node->xcp->mem_id))
>>> +                           mapping_flags |= AMDGPU_VM_MTYPE_CC;
>>> +                   else
>>> +                           mapping_flags |= AMDGPU_VM_MTYPE_UC;
>>>              } else if (domain == SVM_RANGE_VRAM_DOMAIN) {
>>>                      /* local HBM region close to partition */
>>>                      if (bo_node->adev == node->adev && diff --git
>>> a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
>> index
>>> 79b14828d542..629860dbc9ec 100644
>>> --- a/include/uapi/drm/amdgpu_drm.h
>>> +++ b/include/uapi/drm/amdgpu_drm.h
>>> @@ -145,7 +145,7 @@ extern "C" {
>>>     */
>>>    #define AMDGPU_GEM_CREATE_DISCARDABLE             (1 << 12)
>>>    /* Flag that BO is shared coherently between multiple devices or CPU
>> threads.
>>> - * May depend on GPU instructions to flush caches explicitly
>>> + * May depend on GPU instructions to flush caches to system scope
>> explicitly.
>>>     *
>>>     * This influences the choice of MTYPE in the PTEs on GFXv9 and later GPUs
>> and
>>>     * may override the MTYPE selected in AMDGPU_VA_OP_MAP.
>>> @@ -158,6 +158,14 @@ extern "C" {
>>>     * may override the MTYPE selected in AMDGPU_VA_OP_MAP.
>>>     */
>>>    #define AMDGPU_GEM_CREATE_UNCACHED                (1 << 14)
>>> +/* Flag that BO should be coherent across devices when using
>>> +device-level
>>> + * atomics. May depend on GPU instructions to flush caches to device
>>> +scope
>>> + * explicitly, promoting them to system scope automatically.
>>> + *
>>> + * This influences the choice of MTYPE in the PTEs on GFXv9 and later
>>> +GPUs and
>>> + * may override the MTYPE selected in AMDGPU_VA_OP_MAP.
>>> + */
>>> +#define AMDGPU_GEM_CREATE_EXT_COHERENT             (1 << 15)
>>>
>>>    struct drm_amdgpu_gem_create_in  {
>>>      /** the requested memory size */
>>> diff --git a/include/uapi/linux/kfd_ioctl.h
>>> b/include/uapi/linux/kfd_ioctl.h index eeb2fdcbdcb7..f0ed68974c54
>>> 100644
>>> --- a/include/uapi/linux/kfd_ioctl.h
>>> +++ b/include/uapi/linux/kfd_ioctl.h
>>> @@ -405,6 +405,7 @@ struct kfd_ioctl_acquire_vm_args {
>>>    #define KFD_IOC_ALLOC_MEM_FLAGS_AQL_QUEUE_MEM     (1 << 27)
>>>    #define KFD_IOC_ALLOC_MEM_FLAGS_COHERENT  (1 << 26)
>>>    #define KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED  (1 << 25)
>>> +#define KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT       (1 << 24)
>>>
>>>    /* Allocate memory for later SVM (shared virtual memory) mapping.
>>>     *
>>> @@ -659,6 +660,8 @@ enum kfd_mmio_remap {
>>>    #define KFD_IOCTL_SVM_FLAG_GPU_READ_MOSTLY     0x00000020
>>>    /* Keep GPU memory mapping always valid as if XNACK is disable */
>>>    #define KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED   0x00000040
>>> +/* Fine grained coherency between all devices using device-scope atomics
>> */
>>> +#define KFD_IOCTL_SVM_FLAG_EXT_COHERENT        0x00000080
>>>
>>>    /**
>>>     * kfd_ioctl_svm_op - SVM ioctl operations


More information about the amd-gfx mailing list