[PATCH v2 2/3] drm/amdgpu: Add flag to disable implicit sync for GEM operations.
Tatsuyuki Ishi
ishitatsuyuki at gmail.com
Thu Nov 2 14:04:35 UTC 2023
In Vulkan, it is the application's responsibility to perform adequate
synchronization before a sparse unmap, replace or BO destroy operation.
Until now, the kernel applied the same rule as implicitly-synchronized
APIs like OpenGL, which with per-VM BOs made page table updates stall the
queue completely. The newly added AMDGPU_VM_EXPLICIT_SYNC flag allows
drivers to opt-out of this behavior, while still ensuring adequate implicit
sync happens for kernel-initiated updates (e.g. BO moves).
We record whether to use implicit sync or not for each freed mapping. To
avoid increasing the mapping struct's size, this is union-ized with the
interval tree field which is unused after the unmap.
The reason this is done with a GEM ioctl flag, instead of being a VM /
context global setting, is that the current libdrm implementation shares
the DRM handle even between different kind of drivers (radeonsi vs radv).
Signed-off-by: Tatsuyuki Ishi <ishitatsuyuki at gmail.com>
---
.../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 2 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c | 2 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 14 ++++--
drivers/gpu/drm/amd/amdgpu/amdgpu_object.h | 7 ++-
drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h | 6 ++-
drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 47 +++++++++++--------
drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 23 +++++----
drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 18 +++----
include/uapi/drm/amdgpu_drm.h | 2 +
9 files changed, 71 insertions(+), 50 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 7d6daf8d2bfa..10e129bff977 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1196,7 +1196,7 @@ static void unmap_bo_from_gpuvm(struct kgd_mem *mem,
struct amdgpu_device *adev = entry->adev;
struct amdgpu_vm *vm = bo_va->base.vm;
- amdgpu_vm_bo_unmap(adev, bo_va, entry->va);
+ amdgpu_vm_bo_unmap(adev, bo_va, entry->va, true);
amdgpu_vm_clear_freed(adev, vm, &bo_va->last_pt_update);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c
index 720011019741..612279e65bff 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c
@@ -122,7 +122,7 @@ int amdgpu_unmap_static_csa(struct amdgpu_device *adev, struct amdgpu_vm *vm,
}
}
- r = amdgpu_vm_bo_unmap(adev, bo_va, csa_addr);
+ r = amdgpu_vm_bo_unmap(adev, bo_va, csa_addr, true);
if (r) {
DRM_ERROR("failed to do bo_unmap on static CSA, err=%d\n", r);
goto error;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
index a1b15d0d6c48..cca68b89754e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
@@ -667,9 +667,9 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data,
const uint32_t valid_flags = AMDGPU_VM_DELAY_UPDATE |
AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_WRITEABLE |
AMDGPU_VM_PAGE_EXECUTABLE | AMDGPU_VM_MTYPE_MASK |
- AMDGPU_VM_PAGE_NOALLOC;
+ AMDGPU_VM_PAGE_NOALLOC | AMDGPU_VM_EXPLICIT_SYNC;
const uint32_t prt_flags = AMDGPU_VM_DELAY_UPDATE |
- AMDGPU_VM_PAGE_PRT;
+ AMDGPU_VM_PAGE_PRT | AMDGPU_VM_EXPLICIT_SYNC;
struct drm_amdgpu_gem_va *args = data;
struct drm_gem_object *gobj;
@@ -680,6 +680,7 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data,
struct drm_exec exec;
uint64_t va_flags;
uint64_t vm_size;
+ bool sync_unmap;
int r = 0;
if (args->va_address < AMDGPU_VA_RESERVED_SIZE) {
@@ -715,6 +716,8 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data,
return -EINVAL;
}
+ sync_unmap = !(args->flags & AMDGPU_VM_EXPLICIT_SYNC);
+
switch (args->operation) {
case AMDGPU_VA_OP_MAP:
case AMDGPU_VA_OP_UNMAP:
@@ -774,19 +777,20 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data,
va_flags);
break;
case AMDGPU_VA_OP_UNMAP:
- r = amdgpu_vm_bo_unmap(adev, bo_va, args->va_address);
+ r = amdgpu_vm_bo_unmap(adev, bo_va, args->va_address,
+ sync_unmap);
break;
case AMDGPU_VA_OP_CLEAR:
r = amdgpu_vm_bo_clear_mappings(adev, &fpriv->vm,
args->va_address,
- args->map_size);
+ args->map_size, sync_unmap);
break;
case AMDGPU_VA_OP_REPLACE:
va_flags = amdgpu_gem_va_map_flags(adev, args->flags);
r = amdgpu_vm_bo_replace_map(adev, bo_va, args->va_address,
args->offset_in_bo, args->map_size,
- va_flags);
+ va_flags, sync_unmap);
break;
default:
break;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
index f3ee83cdf97e..28be03f1bbcf 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
@@ -67,7 +67,12 @@ struct amdgpu_bo_va_mapping {
struct rb_node rb;
uint64_t start;
uint64_t last;
- uint64_t __subtree_last;
+ union {
+ /* BOs in interval tree only */
+ uint64_t __subtree_last;
+ /* Freed BOs only */
+ bool sync_unmap;
+ };
uint64_t offset;
uint64_t flags;
};
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
index 2fd1bfb35916..e71443c8c59b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
@@ -276,6 +276,7 @@ TRACE_EVENT(amdgpu_vm_bo_unmap,
__field(long, last)
__field(u64, offset)
__field(u64, flags)
+ __field(bool, sync_unmap)
),
TP_fast_assign(
@@ -284,10 +285,11 @@ TRACE_EVENT(amdgpu_vm_bo_unmap,
__entry->last = mapping->last;
__entry->offset = mapping->offset;
__entry->flags = mapping->flags;
+ __entry->sync_unmap = mapping->sync_unmap;
),
- TP_printk("bo=%p, start=%lx, last=%lx, offset=%010llx, flags=%llx",
+ TP_printk("bo=%p, start=%lx, last=%lx, offset=%010llx, flags=%llx, sync_unmap=%d",
__entry->bo, __entry->start, __entry->last,
- __entry->offset, __entry->flags)
+ __entry->offset, __entry->flags, __entry->sync_unmap)
);
DECLARE_EVENT_CLASS(amdgpu_vm_mapping,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 7b9762f1cddd..a74472e16952 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -844,6 +844,7 @@ static void amdgpu_vm_tlb_seq_cb(struct dma_fence *fence,
* @immediate: immediate submission in a page fault
* @unlocked: unlocked invalidation during MM callback
* @flush_tlb: trigger tlb invalidation after update completed
+ * @sync_unmap: wait for BO users before unmapping
* @resv: fences we need to sync to
* @start: start of mapped range
* @last: last mapped entry
@@ -861,8 +862,9 @@ static void amdgpu_vm_tlb_seq_cb(struct dma_fence *fence,
*/
int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
bool immediate, bool unlocked, bool flush_tlb,
- struct dma_resv *resv, uint64_t start, uint64_t last,
- uint64_t flags, uint64_t offset, uint64_t vram_base,
+ bool sync_unmap, struct dma_resv *resv,
+ uint64_t start, uint64_t last, uint64_t flags,
+ uint64_t offset, uint64_t vram_base,
struct ttm_resource *res, dma_addr_t *pages_addr,
struct dma_fence **fence)
{
@@ -902,7 +904,7 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
/* Implicitly sync to command submissions in the same VM before
* unmapping. Sync to moving fences before mapping.
*/
- if (!(flags & (AMDGPU_PTE_VALID | AMDGPU_PTE_PRT)))
+ if (!(flags & (AMDGPU_PTE_VALID | AMDGPU_PTE_PRT)) && sync_unmap)
sync_mode = AMDGPU_SYNC_EQ_OWNER;
else
sync_mode = AMDGPU_SYNC_EXPLICIT;
@@ -1145,10 +1147,10 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev, struct amdgpu_bo_va *bo_va,
trace_amdgpu_vm_bo_update(mapping);
r = amdgpu_vm_update_range(adev, vm, false, false, flush_tlb,
- resv, mapping->start, mapping->last,
- update_flags, mapping->offset,
- vram_base, mem, pages_addr,
- last_update);
+ true, resv, mapping->start,
+ mapping->last, update_flags,
+ mapping->offset, vram_base, mem,
+ pages_addr, last_update);
if (r)
return r;
}
@@ -1340,7 +1342,8 @@ int amdgpu_vm_clear_freed(struct amdgpu_device *adev,
mapping->start < AMDGPU_GMC_HOLE_START)
init_pte_value = AMDGPU_PTE_DEFAULT_ATC;
- r = amdgpu_vm_update_range(adev, vm, false, false, true, resv,
+ r = amdgpu_vm_update_range(adev, vm, false, false, true,
+ mapping->sync_unmap, resv,
mapping->start, mapping->last,
init_pte_value, 0, 0, NULL, NULL,
&f);
@@ -1572,6 +1575,7 @@ int amdgpu_vm_bo_map(struct amdgpu_device *adev,
* @offset: requested offset in the BO
* @size: BO size in bytes
* @flags: attributes of pages (read/write/valid/etc.)
+ * @sync_unmap: wait for BO users before replacing existing mapping
*
* Add a mapping of the BO at the specefied addr into the VM. Replace existing
* mappings as we do so.
@@ -1582,9 +1586,9 @@ int amdgpu_vm_bo_map(struct amdgpu_device *adev,
* Object has to be reserved and unreserved outside!
*/
int amdgpu_vm_bo_replace_map(struct amdgpu_device *adev,
- struct amdgpu_bo_va *bo_va,
- uint64_t saddr, uint64_t offset,
- uint64_t size, uint64_t flags)
+ struct amdgpu_bo_va *bo_va, uint64_t saddr,
+ uint64_t offset, uint64_t size, uint64_t flags,
+ bool sync_unmap)
{
struct amdgpu_bo_va_mapping *mapping;
struct amdgpu_bo *bo = bo_va->base.bo;
@@ -1608,7 +1612,7 @@ int amdgpu_vm_bo_replace_map(struct amdgpu_device *adev,
if (!mapping)
return -ENOMEM;
- r = amdgpu_vm_bo_clear_mappings(adev, bo_va->base.vm, saddr, size);
+ r = amdgpu_vm_bo_clear_mappings(adev, bo_va->base.vm, saddr, size, sync_unmap);
if (r) {
kfree(mapping);
return r;
@@ -1633,6 +1637,7 @@ int amdgpu_vm_bo_replace_map(struct amdgpu_device *adev,
* @adev: amdgpu_device pointer
* @bo_va: bo_va to remove the address from
* @saddr: where to the BO is mapped
+ * @sync_unmap: wait for BO users before unmapping
*
* Remove a mapping of the BO at the specefied addr from the VM.
*
@@ -1641,9 +1646,8 @@ int amdgpu_vm_bo_replace_map(struct amdgpu_device *adev,
*
* Object has to be reserved and unreserved outside!
*/
-int amdgpu_vm_bo_unmap(struct amdgpu_device *adev,
- struct amdgpu_bo_va *bo_va,
- uint64_t saddr)
+int amdgpu_vm_bo_unmap(struct amdgpu_device *adev, struct amdgpu_bo_va *bo_va,
+ uint64_t saddr, bool sync_unmap)
{
struct amdgpu_bo_va_mapping *mapping;
struct amdgpu_vm *vm = bo_va->base.vm;
@@ -1671,6 +1675,7 @@ int amdgpu_vm_bo_unmap(struct amdgpu_device *adev,
list_del(&mapping->list);
amdgpu_vm_it_remove(mapping, &vm->va);
mapping->bo_va = NULL;
+ mapping->sync_unmap = sync_unmap;
trace_amdgpu_vm_bo_unmap(bo_va, mapping);
if (valid)
@@ -1689,6 +1694,7 @@ int amdgpu_vm_bo_unmap(struct amdgpu_device *adev,
* @vm: VM structure to use
* @saddr: start of the range
* @size: size of the range
+ * @sync_unmap: wait for BO users before unmapping
*
* Remove all mappings in a range, split them as appropriate.
*
@@ -1696,8 +1702,8 @@ int amdgpu_vm_bo_unmap(struct amdgpu_device *adev,
* 0 for success, error for failure.
*/
int amdgpu_vm_bo_clear_mappings(struct amdgpu_device *adev,
- struct amdgpu_vm *vm,
- uint64_t saddr, uint64_t size)
+ struct amdgpu_vm *vm, uint64_t saddr,
+ uint64_t size, bool sync_unmap)
{
struct amdgpu_bo_va_mapping *before, *after, *tmp, *next;
LIST_HEAD(removed);
@@ -1761,6 +1767,7 @@ int amdgpu_vm_bo_clear_mappings(struct amdgpu_device *adev,
tmp->last = eaddr;
tmp->bo_va = NULL;
+ tmp->sync_unmap = sync_unmap;
list_add(&tmp->list, &vm->freed);
trace_amdgpu_vm_bo_unmap(NULL, tmp);
}
@@ -1889,6 +1896,7 @@ void amdgpu_vm_bo_del(struct amdgpu_device *adev,
list_del(&mapping->list);
amdgpu_vm_it_remove(mapping, &vm->va);
mapping->bo_va = NULL;
+ mapping->sync_unmap = true;
trace_amdgpu_vm_bo_unmap(bo_va, mapping);
list_add(&mapping->list, &vm->freed);
}
@@ -2617,8 +2625,9 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
goto error_unlock;
}
- r = amdgpu_vm_update_range(adev, vm, true, false, false, NULL, addr,
- addr, flags, value, 0, NULL, NULL, NULL);
+ r = amdgpu_vm_update_range(adev, vm, true, false, false, true, NULL,
+ addr, addr, flags, value, 0, NULL, NULL,
+ NULL);
if (r)
goto error_unlock;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index 204ab13184ed..73b7b49fdb2e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -423,12 +423,12 @@ void amdgpu_vm_bo_base_init(struct amdgpu_vm_bo_base *base,
struct amdgpu_vm *vm, struct amdgpu_bo *bo);
int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
bool immediate, bool unlocked, bool flush_tlb,
- struct dma_resv *resv, uint64_t start, uint64_t last,
- uint64_t flags, uint64_t offset, uint64_t vram_base,
+ bool sync_unmap, struct dma_resv *resv,
+ uint64_t start, uint64_t last, uint64_t flags,
+ uint64_t offset, uint64_t vram_base,
struct ttm_resource *res, dma_addr_t *pages_addr,
struct dma_fence **fence);
-int amdgpu_vm_bo_update(struct amdgpu_device *adev,
- struct amdgpu_bo_va *bo_va,
+int amdgpu_vm_bo_update(struct amdgpu_device *adev, struct amdgpu_bo_va *bo_va,
bool clear);
bool amdgpu_vm_evictable(struct amdgpu_bo *bo);
void amdgpu_vm_bo_invalidate(struct amdgpu_device *adev,
@@ -444,15 +444,14 @@ int amdgpu_vm_bo_map(struct amdgpu_device *adev,
uint64_t addr, uint64_t offset,
uint64_t size, uint64_t flags);
int amdgpu_vm_bo_replace_map(struct amdgpu_device *adev,
- struct amdgpu_bo_va *bo_va,
- uint64_t addr, uint64_t offset,
- uint64_t size, uint64_t flags);
-int amdgpu_vm_bo_unmap(struct amdgpu_device *adev,
- struct amdgpu_bo_va *bo_va,
- uint64_t addr);
+ struct amdgpu_bo_va *bo_va, uint64_t addr,
+ uint64_t offset, uint64_t size, uint64_t flags,
+ bool sync_unmap);
+int amdgpu_vm_bo_unmap(struct amdgpu_device *adev, struct amdgpu_bo_va *bo_va,
+ uint64_t addr, bool sync_unmap);
int amdgpu_vm_bo_clear_mappings(struct amdgpu_device *adev,
- struct amdgpu_vm *vm,
- uint64_t saddr, uint64_t size);
+ struct amdgpu_vm *vm, uint64_t saddr,
+ uint64_t size, bool sync_unmap);
struct amdgpu_bo_va_mapping *amdgpu_vm_bo_lookup_mapping(struct amdgpu_vm *vm,
uint64_t addr);
void amdgpu_vm_bo_trace_cs(struct amdgpu_vm *vm, struct ww_acquire_ctx *ticket);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index bb16b795d1bc..6eb4a0a4bc84 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -1291,9 +1291,9 @@ svm_range_unmap_from_gpu(struct amdgpu_device *adev, struct amdgpu_vm *vm,
pr_debug("[0x%llx 0x%llx]\n", start, last);
- return amdgpu_vm_update_range(adev, vm, false, true, true, NULL, start,
- last, init_pte_value, 0, 0, NULL, NULL,
- fence);
+ return amdgpu_vm_update_range(adev, vm, false, true, true, true, NULL,
+ start, last, init_pte_value, 0, 0, NULL,
+ NULL, fence);
}
static int
@@ -1398,12 +1398,12 @@ svm_range_map_to_gpu(struct kfd_process_device *pdd, struct svm_range *prange,
* different memory partition based on fpfn/lpfn, we should use
* same vm_manager.vram_base_offset regardless memory partition.
*/
- r = amdgpu_vm_update_range(adev, vm, false, false, flush_tlb, NULL,
- last_start, prange->start + i,
- pte_flags,
- (last_start - prange->start) << PAGE_SHIFT,
- bo_adev ? bo_adev->vm_manager.vram_base_offset : 0,
- NULL, dma_addr, &vm->last_update);
+ r = amdgpu_vm_update_range(
+ adev, vm, false, false, flush_tlb, true, NULL,
+ last_start, prange->start + i, pte_flags,
+ (last_start - prange->start) << PAGE_SHIFT,
+ bo_adev ? bo_adev->vm_manager.vram_base_offset : 0,
+ NULL, dma_addr, &vm->last_update);
for (j = last_start - prange->start; j <= i; j++)
dma_addr[j] |= last_domain;
diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
index f477eda6a2b8..3cdcc299956e 100644
--- a/include/uapi/drm/amdgpu_drm.h
+++ b/include/uapi/drm/amdgpu_drm.h
@@ -556,6 +556,8 @@ struct drm_amdgpu_gem_op {
#define AMDGPU_VM_MTYPE_RW (5 << 5)
/* don't allocate MALL */
#define AMDGPU_VM_PAGE_NOALLOC (1 << 9)
+/* don't sync on unmap */
+#define AMDGPU_VM_EXPLICIT_SYNC (1 << 10)
struct drm_amdgpu_gem_va {
/** GEM object handle */
--
2.42.0
More information about the dri-devel
mailing list