[PATCH 1/3] amd/amdkfd: Add granularity bitmap mapped to gpu flag
Felix Kuehling
felix.kuehling at amd.com
Mon Oct 2 18:35:15 UTC 2023
On 2023-09-29 10:11, Philip Yang wrote:
> Replace prange->mapped_to_gpu with prange->bitmap_mapped[], which is
> based on prange granularity, updated when map to GPUs or unmap from
> GPUs, to optimize multiple GPU map, unmap and retry fault recover.
>
> svm_range_is_mapped is false only if no partial range mapping on any
> GPUs.
>
> Split the bitmap_mapped when unmap from cpu to split the prange.
>
> Signed-off-by: Philip Yang<Philip.Yang at amd.com>
> ---
> drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 218 ++++++++++++++++++++++-----
> drivers/gpu/drm/amd/amdkfd/kfd_svm.h | 4 +-
> 2 files changed, 184 insertions(+), 38 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> index 040dc32ad475..626e0dd4ec79 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> @@ -292,12 +292,12 @@ static void svm_range_free(struct svm_range *prange, bool do_unmap)
> KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, 0);
> }
>
> - /* free dma_addr array for each gpu */
> + /* free dma_addr array, bitmap_mapped for each gpu */
> for (gpuidx = 0; gpuidx < MAX_GPU_INSTANCE; gpuidx++) {
> - if (prange->dma_addr[gpuidx]) {
> + if (prange->dma_addr[gpuidx])
> kvfree(prange->dma_addr[gpuidx]);
> - prange->dma_addr[gpuidx] = NULL;
> - }
> + if (prange->bitmap_mapped[gpuidx])
> + bitmap_free(prange->bitmap_mapped[gpuidx]);
> }
>
> mutex_destroy(&prange->lock);
> @@ -323,19 +323,38 @@ svm_range *svm_range_new(struct svm_range_list *svms, uint64_t start,
> uint64_t size = last - start + 1;
> struct svm_range *prange;
> struct kfd_process *p;
> -
> - prange = kzalloc(sizeof(*prange), GFP_KERNEL);
> - if (!prange)
> - return NULL;
> + unsigned int nbits;
> + uint32_t gpuidx;
>
> p = container_of(svms, struct kfd_process, svms);
> if (!p->xnack_enabled && update_mem_usage &&
> amdgpu_amdkfd_reserve_mem_limit(NULL, size << PAGE_SHIFT,
> KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, 0)) {
> pr_info("SVM mapping failed, exceeds resident system memory limit\n");
> - kfree(prange);
> return NULL;
> }
> +
> + prange = kzalloc(sizeof(*prange), GFP_KERNEL);
> + if (!prange)
> + return NULL;
> +
> + svm_range_set_default_attributes(&prange->preferred_loc,
> + &prange->prefetch_loc,
> + &prange->granularity, &prange->flags);
> +
> + nbits = svm_range_mapped_nbits(size, prange->granularity);
> + pr_debug("prange 0x%p [0x%llx 0x%llx] bitmap_mapped nbits %d\n", prange,
> + start, last, nbits);
> + for_each_set_bit(gpuidx, p->svms.bitmap_supported, p->n_pdds) {
> + prange->bitmap_mapped[gpuidx] = bitmap_zalloc(nbits, GFP_KERNEL);
> + if (!prange->bitmap_mapped[gpuidx]) {
> + while (gpuidx--)
> + bitmap_free(prange->bitmap_mapped[gpuidx]);
> + kfree(prange);
> + return NULL;
> + }
> + }
> +
> prange->npages = size;
> prange->svms = svms;
> prange->start = start;
> @@ -354,10 +373,6 @@ svm_range *svm_range_new(struct svm_range_list *svms, uint64_t start,
> bitmap_copy(prange->bitmap_access, svms->bitmap_supported,
> MAX_GPU_INSTANCE);
>
> - svm_range_set_default_attributes(&prange->preferred_loc,
> - &prange->prefetch_loc,
> - &prange->granularity, &prange->flags);
> -
> pr_debug("svms 0x%p [0x%llx 0x%llx]\n", svms, start, last);
>
> return prange;
> @@ -972,6 +987,48 @@ svm_range_split_nodes(struct svm_range *new, struct svm_range *old,
> return 0;
> }
>
> +static int
> +svm_range_split_bitmap_mapped(struct svm_range *new, struct svm_range *old,
> + uint64_t start, uint64_t last)
> +{
> + struct kfd_process *p = container_of(new->svms, struct kfd_process, svms);
> + unsigned int nbits, old_nbits, old_nbits2;
> + unsigned long *bits;
> + uint32_t gpuidx;
> +
> + nbits = svm_range_mapped_nbits(new->npages, new->granularity);
> + old_nbits = svm_range_mapped_nbits(old->npages, old->granularity);
> + old_nbits2 = svm_range_mapped_nbits(last - start + 1, old->granularity);
This may be off by one if start and last are not aligned on granularity
boundaries. I think you need to calculate the index for each of start
and last and subtract the indices. E.g. granularity = 9, start = 511,
last = 512. last - start + 1 is 2 and the division tells you you need
one bit. But this range touches two different granules, so you need two
bits.
> +
> + pr_debug("old 0x%p [0x%lx 0x%lx] => [0x%llx 0x%llx] nbits %d => %d\n",
> + old, old->start, old->last, start, last, old_nbits, old_nbits2);
> + pr_debug("new 0x%p [0x%lx 0x%lx] nbits %d\n", new, new->start, new->last,
> + nbits);
> +
> + for_each_set_bit(gpuidx, p->svms.bitmap_supported, p->n_pdds) {
> + bits = bitmap_alloc(old_nbits2, GFP_KERNEL);
> + if (!bits)
> + return -ENOMEM;
> +
> + if (start == old->start) {
> + bitmap_shift_right(new->bitmap_mapped[gpuidx],
> + old->bitmap_mapped[gpuidx],
> + old_nbits2, old_nbits);
> + bitmap_shift_right(bits, old->bitmap_mapped[gpuidx], 0,
> + old_nbits2);
Isn't this (shift = 0) the same as bitmap_copy?
> + } else {
> + bitmap_copy(new->bitmap_mapped[gpuidx],
> + old->bitmap_mapped[gpuidx], nbits);
> + bitmap_shift_right(bits, old->bitmap_mapped[gpuidx],
> + nbits, old_nbits);
> + }
> + bitmap_free(old->bitmap_mapped[gpuidx]);
> + old->bitmap_mapped[gpuidx] = bits;
> + }
> +
> + return 0;
> +}
> +
> /**
> * svm_range_split_adjust - split range and adjust
> *
> @@ -1012,6 +1069,10 @@ svm_range_split_adjust(struct svm_range *new, struct svm_range *old,
> return r;
> }
>
> + r = svm_range_split_bitmap_mapped(new, old, start, last);
> + if (r)
> + return r;
> +
> old->npages = last - start + 1;
> old->start = start;
> old->last = last;
> @@ -1020,7 +1081,6 @@ svm_range_split_adjust(struct svm_range *new, struct svm_range *old,
> new->prefetch_loc = old->prefetch_loc;
> new->actual_loc = old->actual_loc;
> new->granularity = old->granularity;
> - new->mapped_to_gpu = old->mapped_to_gpu;
> bitmap_copy(new->bitmap_access, old->bitmap_access, MAX_GPU_INSTANCE);
> bitmap_copy(new->bitmap_aip, old->bitmap_aip, MAX_GPU_INSTANCE);
>
> @@ -1310,6 +1370,84 @@ svm_range_unmap_from_gpu(struct amdgpu_device *adev, struct amdgpu_vm *vm,
> fence);
> }
>
> +/**
> + * svm_range_partial_mapped_dev - check if prange mapped on the specific gpu
> + *
> + * @gpuidx: the gpu to check
> + * @prange: prange to check
> + * @start: the start address in pages
> + * @last: the last address in pages
> + *
> + * Return:
> + * true: if any partial range mapped on gpu based on granularity boundary
> + * false: if the entire range not mapped
> + */
> +static bool
> +svm_range_partial_mapped_dev(uint32_t gpuidx, struct svm_range *prange,
> + unsigned long start, unsigned long last)
> +{
> + unsigned long index, off;
> + bool mapped = false;
> +
> + start = max(start, prange->start);
> + last = min(last, prange->last);
> + if (last < start)
> + goto out;
> +
> + for (off = start; off <= last; off += (1UL << prange->granularity)) {
It would be easier to just iterate over indexes instead of offsets. And
even more efficient to avoid testing individual bits by using a higher
level bitmap function, e.g. find_next_bit. E.g.
unsigned long start_index, last_index;
start = max(start, prange->start);
last = min(last, prange->last);
if (last < start)
goto out;
start_index = (start - prange->start) >> prange->granularity;
last_index = (last - prange->start) >> prange->granularity;
return find_next_bit(prange->bitmap_mapped[gpuidx], last_index + 1, start_index) <= last_index;
> + index = (off - prange->start) >> prange->granularity;
> + if (test_bit(index, prange->bitmap_mapped[gpuidx])) {
> + mapped = true;
> + break;
> + }
> + }
> +out:
> + pr_debug("prange 0x%p [0x%lx 0x%lx] mapped %d on gpu %d\n", prange,
> + start, last, mapped, gpuidx);
> + return mapped;
> +}
> +
> +static bool
> +svm_range_partial_mapped(struct svm_range *prange, unsigned long start,
> + unsigned long last)
> +{
> + struct kfd_process *p = container_of(prange->svms, struct kfd_process, svms);
> + struct svm_range *pchild;
> + uint32_t gpuidx;
> +
> + for_each_set_bit(gpuidx, p->svms.bitmap_supported, p->n_pdds) {
> + list_for_each_entry(pchild, &prange->child_list, child_list) {
> + if (svm_range_partial_mapped_dev(gpuidx, pchild, start, last))
> + return true;
> + }
> +
> + if (svm_range_partial_mapped_dev(gpuidx, prange, start, last))
> + return true;
> + }
> + return false;
> +}
> +
> +static bool svm_range_is_mapped(struct svm_range *prange)
> +{
> + return svm_range_partial_mapped(prange, prange->start, prange->last);
> +}
> +
> +static void
> +svm_range_update_mapped(uint32_t gpuidx, struct svm_range *prange,
> + unsigned long start, unsigned long last, bool mapped)
> +{
> + unsigned long index, nbits;
> +
> + index = (start - prange->start) >> prange->granularity;
> + nbits = svm_range_mapped_nbits(last - start + 1, prange->granularity);
This may be off by one if start and last are not aligned on granularity
boundaries. I think you need to calculate the index for each of start
and last and subtract the indices.
> + if (mapped)
> + bitmap_set(prange->bitmap_mapped[gpuidx], index, nbits);
> + else
> + bitmap_clear(prange->bitmap_mapped[gpuidx], index, nbits);
> + pr_debug("prange 0x%p [0x%lx 0x%lx] update mapped %d nbits %ld gpu %d\n",
> + prange, start, last, mapped, nbits, gpuidx);
> +}
> +
> static int
> svm_range_unmap_from_gpus(struct svm_range *prange, unsigned long start,
> unsigned long last, uint32_t trigger)
> @@ -1321,29 +1459,28 @@ svm_range_unmap_from_gpus(struct svm_range *prange, unsigned long start,
> uint32_t gpuidx;
> int r = 0;
>
> - if (!prange->mapped_to_gpu) {
> - pr_debug("prange 0x%p [0x%lx 0x%lx] not mapped to GPU\n",
> - prange, prange->start, prange->last);
> - return 0;
> - }
> -
> - if (prange->start == start && prange->last == last) {
> - pr_debug("unmap svms 0x%p prange 0x%p\n", prange->svms, prange);
> - prange->mapped_to_gpu = false;
> - }
> -
> bitmap_or(bitmap, prange->bitmap_access, prange->bitmap_aip,
> MAX_GPU_INSTANCE);
> p = container_of(prange->svms, struct kfd_process, svms);
>
> for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) {
> - pr_debug("unmap from gpu idx 0x%x\n", gpuidx);
> pdd = kfd_process_device_from_gpuidx(p, gpuidx);
> if (!pdd) {
> pr_debug("failed to find device idx %d\n", gpuidx);
> - return -EINVAL;
> + continue;
> + }
> +
> + if (!svm_range_partial_mapped_dev(gpuidx, prange, start, last)) {
> + pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] not mapped on gpu %d\n",
> + prange->svms, prange, start, last, gpuidx);
> + continue;
> }
>
> + svm_range_update_mapped(gpuidx, prange, start, last, false);
> +
> + pr_debug("unmap svms 0x%p prange 0x%p [0x%lx 0x%lx] from gpu %d\n",
> + prange->svms, prange, start, last, gpuidx);
> +
> kfd_smi_event_unmap_from_gpu(pdd->dev, p->lead_thread->pid,
> start, last, trigger);
>
> @@ -1483,6 +1620,10 @@ svm_range_map_to_gpus(struct svm_range *prange, unsigned long offset,
> if (r)
> break;
>
> + if (!r)
> + svm_range_update_mapped(gpuidx, prange, prange->start + offset,
> + prange->start + offset + npages - 1, true);
> +
> if (fence) {
> r = dma_fence_wait(fence, false);
> dma_fence_put(fence);
> @@ -1648,7 +1789,7 @@ static int svm_range_validate_and_map(struct mm_struct *mm,
>
> if (bitmap_empty(ctx->bitmap, MAX_GPU_INSTANCE)) {
> bitmap_copy(ctx->bitmap, prange->bitmap_access, MAX_GPU_INSTANCE);
> - if (!prange->mapped_to_gpu ||
> + if (!svm_range_is_mapped(prange) ||
I think this gives you the wrong answer. As I understand it,
svm_range_is_mapped returns true, if any part of the range is currently
mapped on any GPU. But you'd only want to skip validate_and_map if all
of the range is currently mapped on the subset of GPUs you're interested in.
> bitmap_empty(ctx->bitmap, MAX_GPU_INSTANCE)) {
> r = 0;
> goto free_ctx;
> @@ -1729,9 +1870,6 @@ static int svm_range_validate_and_map(struct mm_struct *mm,
> r = svm_range_map_to_gpus(prange, offset, npages, readonly,
> ctx->bitmap, flush_tlb);
>
> - if (!r && next == end)
> - prange->mapped_to_gpu = true;
> -
> svm_range_unlock(prange);
>
> addr = next;
> @@ -1900,10 +2038,10 @@ svm_range_evict(struct svm_range *prange, struct mm_struct *mm,
> if (!p->xnack_enabled ||
> (prange->flags & KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED)) {
> int evicted_ranges;
> - bool mapped = prange->mapped_to_gpu;
> + bool mapped = svm_range_is_mapped(prange);
>
> list_for_each_entry(pchild, &prange->child_list, child_list) {
> - if (!pchild->mapped_to_gpu)
> + if (!svm_range_is_mapped(pchild))
> continue;
> mapped = true;
Doesn't svm_range_is_mapped already consider child ranges? So you don't
need to set mapped here.
> mutex_lock_nested(&pchild->lock, 1);
> @@ -1966,7 +2104,9 @@ svm_range_evict(struct svm_range *prange, struct mm_struct *mm,
>
> static struct svm_range *svm_range_clone(struct svm_range *old)
> {
> + struct kfd_process *p = container_of(old->svms, struct kfd_process, svms);
> struct svm_range *new;
> + uint32_t gpuidx;
>
> new = svm_range_new(old->svms, old->start, old->last, false);
> if (!new)
> @@ -1988,7 +2128,11 @@ static struct svm_range *svm_range_clone(struct svm_range *old)
> new->prefetch_loc = old->prefetch_loc;
> new->actual_loc = old->actual_loc;
> new->granularity = old->granularity;
> - new->mapped_to_gpu = old->mapped_to_gpu;
> + for_each_set_bit(gpuidx, p->svms.bitmap_supported, p->n_pdds) {
> + bitmap_copy(new->bitmap_mapped[gpuidx], old->bitmap_mapped[gpuidx],
> + svm_range_mapped_nbits(new->last - new->start + 1,
> + new->granularity));
> + };
> bitmap_copy(new->bitmap_access, old->bitmap_access, MAX_GPU_INSTANCE);
> bitmap_copy(new->bitmap_aip, old->bitmap_aip, MAX_GPU_INSTANCE);
>
> @@ -2107,7 +2251,7 @@ svm_range_add(struct kfd_process *p, uint64_t start, uint64_t size,
> next_start = min(node->last, last) + 1;
>
> if (svm_range_is_same_attrs(p, prange, nattr, attrs) &&
> - prange->mapped_to_gpu) {
> + svm_range_is_mapped(prange)) {
This is probably wrong too. I think you need a check, whether a specific
range is completely mapped on all GPUs that need access.
Regards,
Felix
> /* nothing to do */
> } else if (node->start < start || node->last > last) {
> /* node intersects the update range and its attributes
> @@ -3587,7 +3731,7 @@ svm_range_set_attr(struct kfd_process *p, struct mm_struct *mm,
>
> if (migrated && (!p->xnack_enabled ||
> (prange->flags & KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED)) &&
> - prange->mapped_to_gpu) {
> + svm_range_is_mapped(prange)) {
> pr_debug("restore_work will update mappings of GPUs\n");
> mutex_unlock(&prange->migrate_mutex);
> continue;
> @@ -3598,7 +3742,7 @@ svm_range_set_attr(struct kfd_process *p, struct mm_struct *mm,
> continue;
> }
>
> - flush_tlb = !migrated && update_mapping && prange->mapped_to_gpu;
> + flush_tlb = !migrated && update_mapping && svm_range_is_mapped(prange);
>
> r = svm_range_validate_and_map(mm, prange, MAX_GPU_INSTANCE,
> true, flush_tlb);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
> index d2e94d8fb4be..10c92c5e23a7 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
> @@ -132,7 +132,7 @@ struct svm_range {
> struct list_head child_list;
> DECLARE_BITMAP(bitmap_access, MAX_GPU_INSTANCE);
> DECLARE_BITMAP(bitmap_aip, MAX_GPU_INSTANCE);
> - bool mapped_to_gpu;
> + unsigned long *bitmap_mapped[MAX_GPU_INSTANCE];
> };
>
> static inline void svm_range_lock(struct svm_range *prange)
> @@ -163,6 +163,8 @@ static inline struct svm_range_bo *svm_range_bo_ref(struct svm_range_bo *svm_bo)
> return NULL;
> }
>
> +#define svm_range_mapped_nbits(size, granularity) DIV_ROUND_UP((size), 1UL << (granularity))
> +
> int svm_range_list_init(struct kfd_process *p);
> void svm_range_list_fini(struct kfd_process *p);
> int svm_ioctl(struct kfd_process *p, enum kfd_ioctl_svm_op op, uint64_t start,
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/amd-gfx/attachments/20231002/054e9902/attachment-0001.htm>
More information about the amd-gfx
mailing list