<html><head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
</head>
<body>
<p><br>
</p>
<div class="moz-cite-prefix">On 2023-10-02 14:35, Felix Kuehling
wrote:<br>
</div>
<blockquote type="cite" cite="mid:5c6431f8-a7ac-4646-b282-8054bd94fbe9@amd.com">
<p><br>
</p>
<div class="moz-cite-prefix">On 2023-09-29 10:11, Philip Yang
wrote:<br>
</div>
<blockquote type="cite" cite="mid:20230929141115.10016-1-Philip.Yang@amd.com">
<pre class="moz-quote-pre" wrap="">Replace prange->mapped_to_gpu with prange->bitmap_mapped[], which is
based on prange granularity, updated when map to GPUS or unmap from
GPUs, to optimize multiple GPU map, unmap and retry fault recover.
svm_range_is_mapped is false only if no parital range mapping on any
GPUs.
Split the bitmap_mapped when unmap from cpu to split the prange.
Signed-off-by: Philip Yang <a class="moz-txt-link-rfc2396E" href="mailto:Philip.Yang@amd.com" moz-do-not-send="true"><Philip.Yang@amd.com></a>
---
drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 218 ++++++++++++++++++++++-----
drivers/gpu/drm/amd/amdkfd/kfd_svm.h | 4 +-
2 files changed, 184 insertions(+), 38 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 040dc32ad475..626e0dd4ec79 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -292,12 +292,12 @@ static void svm_range_free(struct svm_range *prange, bool do_unmap)
KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, 0);
}
- /* free dma_addr array for each gpu */
+ /* free dma_addr array, bitmap_mapped for each gpu */
for (gpuidx = 0; gpuidx < MAX_GPU_INSTANCE; gpuidx++) {
- if (prange->dma_addr[gpuidx]) {
+ if (prange->dma_addr[gpuidx])
kvfree(prange->dma_addr[gpuidx]);
- prange->dma_addr[gpuidx] = NULL;
- }
+ if (prange->bitmap_mapped[gpuidx])
+ bitmap_free(prange->bitmap_mapped[gpuidx]);
}
mutex_destroy(&prange->lock);
@@ -323,19 +323,38 @@ svm_range *svm_range_new(struct svm_range_list *svms, uint64_t start,
uint64_t size = last - start + 1;
struct svm_range *prange;
struct kfd_process *p;
-
- prange = kzalloc(sizeof(*prange), GFP_KERNEL);
- if (!prange)
- return NULL;
+ unsigned int nbits;
+ uint32_t gpuidx;
p = container_of(svms, struct kfd_process, svms);
if (!p->xnack_enabled && update_mem_usage &&
amdgpu_amdkfd_reserve_mem_limit(NULL, size << PAGE_SHIFT,
KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, 0)) {
pr_info("SVM mapping failed, exceeds resident system memory limit\n");
- kfree(prange);
return NULL;
}
+
+ prange = kzalloc(sizeof(*prange), GFP_KERNEL);
+ if (!prange)
+ return NULL;
+
+ svm_range_set_default_attributes(&prange->preferred_loc,
+ &prange->prefetch_loc,
+ &prange->granularity, &prange->flags);
+
+ nbits = svm_range_mapped_nbits(size, prange->granularity);
+ pr_debug("prange 0x%p [0x%llx 0x%llx] bitmap_mapped nbits %d\n", prange,
+ start, last, nbits);
+ for_each_set_bit(gpuidx, p->svms.bitmap_supported, p->n_pdds) {
+ prange->bitmap_mapped[gpuidx] = bitmap_zalloc(nbits, GFP_KERNEL);
+ if (!prange->bitmap_mapped[gpuidx]) {
+ while (gpuidx--)
+ bitmap_free(prange->bitmap_mapped[gpuidx]);
+ kfree(prange);
+ return NULL;
+ }
+ }
+
prange->npages = size;
prange->svms = svms;
prange->start = start;
@@ -354,10 +373,6 @@ svm_range *svm_range_new(struct svm_range_list *svms, uint64_t start,
bitmap_copy(prange->bitmap_access, svms->bitmap_supported,
MAX_GPU_INSTANCE);
- svm_range_set_default_attributes(&prange->preferred_loc,
- &prange->prefetch_loc,
- &prange->granularity, &prange->flags);
-
pr_debug("svms 0x%p [0x%llx 0x%llx]\n", svms, start, last);
return prange;
@@ -972,6 +987,48 @@ svm_range_split_nodes(struct svm_range *new, struct svm_range *old,
return 0;
}
+static int
+svm_range_split_bitmap_mapped(struct svm_range *new, struct svm_range *old,
+ uint64_t start, uint64_t last)
+{
+ struct kfd_process *p = container_of(new->svms, struct kfd_process, svms);
+ unsigned int nbits, old_nbits, old_nbits2;
+ unsigned long *bits;
+ uint32_t gpuidx;
+
+ nbits = svm_range_mapped_nbits(new->npages, new->granularity);
+ old_nbits = svm_range_mapped_nbits(old->npages, old->granularity);
+ old_nbits2 = svm_range_mapped_nbits(last - start + 1, old->granularity);</pre>
</blockquote>
<p>This may be off by one if start and last are not aligned on
granularity boundaries. I think you need to calculate the index
for each of start and last and subtract the indices. E.g.
granularity = 9, start = 511, last = 512. last - start + 1 is 2
and the division tells you you need one bit. But this range
touches two different granules, so you need two bits.<br>
</p>
</blockquote>
Right, thanks. Will check the granularity boundaries when calculating nbits.<br>
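<p>For reference, a minimal sketch of the index-based calculation I
have in mind (hypothetical helper name, not necessarily the exact v2
code):</p>
<pre>/*
 * Count the granules touched by [start, last] (page addresses inside
 * prange). This handles unaligned boundaries: with granularity = 9,
 * start = 511, last = 512, the indices are 0 and 1, so 2 bits.
 */
static unsigned int
svm_range_granule_nbits(struct svm_range *prange, unsigned long start,
			unsigned long last)
{
	unsigned long start_index = (start - prange->start) >> prange->granularity;
	unsigned long last_index = (last - prange->start) >> prange->granularity;

	return last_index - start_index + 1;
}</pre>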
<blockquote type="cite" cite="mid:5c6431f8-a7ac-4646-b282-8054bd94fbe9@amd.com">
<p><br>
</p>
<blockquote type="cite" cite="mid:20230929141115.10016-1-Philip.Yang@amd.com">
<pre class="moz-quote-pre" wrap="">+
+ pr_debug("old 0x%p [0x%lx 0x%lx] => [0x%llx 0x%llx] nbits %d => %d\n",
+ old, old->start, old->last, start, last, old_nbits, old_nbits2);
+ pr_debug("new 0x%p [0x%lx 0x%lx] nbits %d\n", new, new->start, new->last,
+ nbits);
+
+ for_each_set_bit(gpuidx, p->svms.bitmap_supported, p->n_pdds) {
+ bits = bitmap_alloc(old_nbits2, GFP_KERNEL);
+ if (!bits)
+ return -ENOMEM;
+
+ if (start == old->start) {
+ bitmap_shift_right(new->bitmap_mapped[gpuidx],
+ old->bitmap_mapped[gpuidx],
+ old_nbits2, old_nbits);
+ bitmap_shift_right(bits, old->bitmap_mapped[gpuidx], 0,
+ old_nbits2);</pre>
</blockquote>
<p>Isn't this (shift = 0) the same as bitmap_copy?</p>
</blockquote>
Yes, this should use bitmap_copy.<br>
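<p>I.e. the start == old->start branch becomes (sketch):</p>
<pre>	bitmap_shift_right(new->bitmap_mapped[gpuidx],
			   old->bitmap_mapped[gpuidx],
			   old_nbits2, old_nbits);
	/* old keeps its leading old_nbits2 bits unchanged */
	bitmap_copy(bits, old->bitmap_mapped[gpuidx], old_nbits2);</pre>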
<blockquote type="cite" cite="mid:5c6431f8-a7ac-4646-b282-8054bd94fbe9@amd.com">
<p><br>
</p>
<blockquote type="cite" cite="mid:20230929141115.10016-1-Philip.Yang@amd.com">
<pre class="moz-quote-pre" wrap="">+ } else {
+ bitmap_copy(new->bitmap_mapped[gpuidx],
+ old->bitmap_mapped[gpuidx], nbits);
+ bitmap_shift_right(bits, old->bitmap_mapped[gpuidx],
+ nbits, old_nbits);
+ }
+ bitmap_free(old->bitmap_mapped[gpuidx]);
+ old->bitmap_mapped[gpuidx] = bits;
+ }
+
+ return 0;
+}
+
/**
* svm_range_split_adjust - split range and adjust
*
@@ -1012,6 +1069,10 @@ svm_range_split_adjust(struct svm_range *new, struct svm_range *old,
return r;
}
+ r = svm_range_split_bitmap_mapped(new, old, start, last);
+ if (r)
+ return r;
+
old->npages = last - start + 1;
old->start = start;
old->last = last;
@@ -1020,7 +1081,6 @@ svm_range_split_adjust(struct svm_range *new, struct svm_range *old,
new->prefetch_loc = old->prefetch_loc;
new->actual_loc = old->actual_loc;
new->granularity = old->granularity;
- new->mapped_to_gpu = old->mapped_to_gpu;
bitmap_copy(new->bitmap_access, old->bitmap_access, MAX_GPU_INSTANCE);
bitmap_copy(new->bitmap_aip, old->bitmap_aip, MAX_GPU_INSTANCE);
@@ -1310,6 +1370,84 @@ svm_range_unmap_from_gpu(struct amdgpu_device *adev, struct amdgpu_vm *vm,
fence);
}
+/**
+ * svm_range_partial_mapped_dev - check if prange mapped on the specific gpu
+ *
+ * @gpuidx: the gpu to check
+ * @prange: prange to check
+ * @start: the start address in pages
+ * @last: the last address in pages
+ *
+ * Return:
+ * true: if any partial range mapped on gpu based on granularity boundary
+ * false: if the entire range not mapped
+ */
+static bool
+svm_range_partial_mapped_dev(uint32_t gpuidx, struct svm_range *prange,
+ unsigned long start, unsigned long last)
+{
+ unsigned long index, off;
+ bool mapped = false;
+
+ start = max(start, prange->start);
+ last = min(last, prange->last);
+ if (last < start)
+ goto out;
+
+ for (off = start; off <= last; off += (1UL << prange->granularity)) {</pre>
</blockquote>
<p>It would be easier to just iterate over indices instead of
offsets. And it would be even more efficient to avoid testing
individual bits by using a higher-level bitmap function such as
find_next_bit. E.g.:</p>
<pre> unsigned long start_index, last_index;
start = max(start, prange->start);
last = min(last, prange->last);
if (last < start)
goto out;
start_index = (start - prange->start) >> prange->granularity;
last_index = (last - prange->start) >> prange->granularity;
return find_next_bit(prange->bitmap_mapped[gpuidx], last_index + 1, start_index) <= last_index;</pre>
</blockquote>
Thanks, that is better.<br>
<blockquote type="cite" cite="mid:5c6431f8-a7ac-4646-b282-8054bd94fbe9@amd.com">
<blockquote type="cite" cite="mid:20230929141115.10016-1-Philip.Yang@amd.com">
<pre class="moz-quote-pre" wrap="">+ index = (off - prange->start) >> prange->granularity;
+ if (test_bit(index, prange->bitmap_mapped[gpuidx])) {
+ mapped = true;
+ break;
+ }
+ }
+out:
+ pr_debug("prange 0x%p [0x%lx 0x%lx] mapped %d on gpu %d\n", prange,
+ start, last, mapped, gpuidx);
+ return mapped;
+}
+
+static bool
+svm_range_partial_mapped(struct svm_range *prange, unsigned long start,
+ unsigned long last)
+{
+ struct kfd_process *p = container_of(prange->svms, struct kfd_process, svms);
+ struct svm_range *pchild;
+ uint32_t gpuidx;
+
+ for_each_set_bit(gpuidx, p->svms.bitmap_supported, p->n_pdds) {
+ list_for_each_entry(pchild, &prange->child_list, child_list) {
+ if (svm_range_partial_mapped_dev(gpuidx, pchild, start, last))
+ return true;
+ }
+
+ if (svm_range_partial_mapped_dev(gpuidx, prange, start, last))
+ return true;
+ }
+ return false;
+}
+
+static bool svm_range_is_mapped(struct svm_range *prange)
+{
+ return svm_range_partial_mapped(prange, prange->start, prange->last);
+}
+
+static void
+svm_range_update_mapped(uint32_t gpuidx, struct svm_range *prange,
+ unsigned long start, unsigned long last, bool mapped)
+{
+ unsigned long index, nbits;
+
+ index = (start - prange->start) >> prange->granularity;
+ nbits = svm_range_mapped_nbits(last - start + 1, prange->granularity);</pre>
</blockquote>
<p>This may be off by one if start and last are not aligned on
granularity boundaries. I think you need to calculate the index
for each of start and last and subtract the indices.<br>
</p>
</blockquote>
Yes, will correct it in svm_range_mapped_nbits by computing the start
and last granule indices and subtracting them.
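<p>I.e. in svm_range_update_mapped, index and nbits would both be
derived from the granule indices of start and last (sketch):</p>
<pre>	index = (start - prange->start) >> prange->granularity;
	nbits = ((last - prange->start) >> prange->granularity) - index + 1;</pre>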
<blockquote type="cite" cite="mid:5c6431f8-a7ac-4646-b282-8054bd94fbe9@amd.com">
<p><br>
</p>
<blockquote type="cite" cite="mid:20230929141115.10016-1-Philip.Yang@amd.com">
<pre class="moz-quote-pre" wrap="">+ if (mapped)
+ bitmap_set(prange->bitmap_mapped[gpuidx], index, nbits);
+ else
+ bitmap_clear(prange->bitmap_mapped[gpuidx], index, nbits);
+ pr_debug("prange 0x%p [0x%lx 0x%lx] update mapped %d nbits %ld gpu %d\n",
+ prange, start, last, mapped, nbits, gpuidx);
+}
+
static int
svm_range_unmap_from_gpus(struct svm_range *prange, unsigned long start,
unsigned long last, uint32_t trigger)
@@ -1321,29 +1459,28 @@ svm_range_unmap_from_gpus(struct svm_range *prange, unsigned long start,
uint32_t gpuidx;
int r = 0;
- if (!prange->mapped_to_gpu) {
- pr_debug("prange 0x%p [0x%lx 0x%lx] not mapped to GPU\n",
- prange, prange->start, prange->last);
- return 0;
- }
-
- if (prange->start == start && prange->last == last) {
- pr_debug("unmap svms 0x%p prange 0x%p\n", prange->svms, prange);
- prange->mapped_to_gpu = false;
- }
-
bitmap_or(bitmap, prange->bitmap_access, prange->bitmap_aip,
MAX_GPU_INSTANCE);
p = container_of(prange->svms, struct kfd_process, svms);
for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) {
- pr_debug("unmap from gpu idx 0x%x\n", gpuidx);
pdd = kfd_process_device_from_gpuidx(p, gpuidx);
if (!pdd) {
pr_debug("failed to find device idx %d\n", gpuidx);
- return -EINVAL;
+ continue;
+ }
+
+ if (!svm_range_partial_mapped_dev(gpuidx, prange, start, last)) {
+ pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] not mapped on gpu %d\n",
+ prange->svms, prange, start, last, gpuidx);
+ continue;
}
+ svm_range_update_mapped(gpuidx, prange, start, last, false);
+
+ pr_debug("unmap svms 0x%p prange 0x%p [0x%lx 0x%lx] from gpu %d\n",
+ prange->svms, prange, start, last, gpuidx);
+
kfd_smi_event_unmap_from_gpu(pdd->dev, p->lead_thread->pid,
start, last, trigger);
@@ -1483,6 +1620,10 @@ svm_range_map_to_gpus(struct svm_range *prange, unsigned long offset,
if (r)
break;
+ if (!r)
+ svm_range_update_mapped(gpuidx, prange, prange->start + offset,
+ prange->start + offset + npages - 1, true);
+
if (fence) {
r = dma_fence_wait(fence, false);
dma_fence_put(fence);
@@ -1648,7 +1789,7 @@ static int svm_range_validate_and_map(struct mm_struct *mm,
if (bitmap_empty(ctx->bitmap, MAX_GPU_INSTANCE)) {
bitmap_copy(ctx->bitmap, prange->bitmap_access, MAX_GPU_INSTANCE);
- if (!prange->mapped_to_gpu ||
+ if (!svm_range_is_mapped(prange) ||</pre>
</blockquote>
<p>I think this gives you the wrong answer. As I understand it,
svm_range_is_mapped returns true if any part of the range is
currently mapped on any GPU. But you'd only want to skip
validate_and_map if all of the range is currently mapped on the
subset of GPUs you're interested in.<br>
</p>
</blockquote>
The intent here is to skip the mapping update if prange is NOT mapped,
or if prange has neither the GPU access nor the access-in-place
attribute. !svm_range_is_mapped(prange) means no part of prange is
mapped on any GPU, so this is the correct condition.<br>
<blockquote type="cite" cite="mid:5c6431f8-a7ac-4646-b282-8054bd94fbe9@amd.com">
<p> </p>
<p><br>
</p>
<blockquote type="cite" cite="mid:20230929141115.10016-1-Philip.Yang@amd.com">
<pre class="moz-quote-pre" wrap=""> bitmap_empty(ctx->bitmap, MAX_GPU_INSTANCE)) {
r = 0;
goto free_ctx;
@@ -1729,9 +1870,6 @@ static int svm_range_validate_and_map(struct mm_struct *mm,
r = svm_range_map_to_gpus(prange, offset, npages, readonly,
ctx->bitmap, flush_tlb);
- if (!r && next == end)
- prange->mapped_to_gpu = true;
-
svm_range_unlock(prange);
addr = next;
@@ -1900,10 +2038,10 @@ svm_range_evict(struct svm_range *prange, struct mm_struct *mm,
if (!p->xnack_enabled ||
(prange->flags & KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED)) {
int evicted_ranges;
- bool mapped = prange->mapped_to_gpu;
+ bool mapped = svm_range_is_mapped(prange);
list_for_each_entry(pchild, &prange->child_list, child_list) {
- if (!pchild->mapped_to_gpu)
+ if (!svm_range_is_mapped(pchild))
continue;
mapped = true;</pre>
</blockquote>
Doesn't svm_range_is_mapped already consider child ranges? So you
don't need to set mapped here.<br>
</blockquote>
Yes, this is a duplicate check; will remove it.<br>
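<p>With the duplicate check removed, the hunk simplifies to (sketch;
the rest of the loop body stays as is):</p>
<pre>	int evicted_ranges;
	bool mapped = svm_range_is_mapped(prange);

	list_for_each_entry(pchild, &prange->child_list, child_list) {
		mutex_lock_nested(&pchild->lock, 1);
		/* invalidate the child range as in the existing code */</pre>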
<blockquote type="cite" cite="mid:5c6431f8-a7ac-4646-b282-8054bd94fbe9@amd.com">
<p><br>
</p>
<blockquote type="cite" cite="mid:20230929141115.10016-1-Philip.Yang@amd.com">
<pre class="moz-quote-pre" wrap=""> mutex_lock_nested(&pchild->lock, 1);
@@ -1966,7 +2104,9 @@ svm_range_evict(struct svm_range *prange, struct mm_struct *mm,
static struct svm_range *svm_range_clone(struct svm_range *old)
{
+ struct kfd_process *p = container_of(old->svms, struct kfd_process, svms);
struct svm_range *new;
+ uint32_t gpuidx;
new = svm_range_new(old->svms, old->start, old->last, false);
if (!new)
@@ -1988,7 +2128,11 @@ static struct svm_range *svm_range_clone(struct svm_range *old)
new->prefetch_loc = old->prefetch_loc;
new->actual_loc = old->actual_loc;
new->granularity = old->granularity;
- new->mapped_to_gpu = old->mapped_to_gpu;
+ for_each_set_bit(gpuidx, p->svms.bitmap_supported, p->n_pdds) {
+ bitmap_copy(new->bitmap_mapped[gpuidx], old->bitmap_mapped[gpuidx],
+ svm_range_mapped_nbits(new->last - new->start + 1,
+ new->granularity));
+ };
bitmap_copy(new->bitmap_access, old->bitmap_access, MAX_GPU_INSTANCE);
bitmap_copy(new->bitmap_aip, old->bitmap_aip, MAX_GPU_INSTANCE);
@@ -2107,7 +2251,7 @@ svm_range_add(struct kfd_process *p, uint64_t start, uint64_t size,
next_start = min(node->last, last) + 1;
if (svm_range_is_same_attrs(p, prange, nattr, attrs) &&
- prange->mapped_to_gpu) {
+ svm_range_is_mapped(prange)) {</pre>
</blockquote>
<p>This is probably wrong too. I think you need a check whether a
specific range is completely mapped on all GPUs that need
access.<br>
</p>
</blockquote>
<p>Will correct this retry handling; there is a case where the
previous mapping may not have completed on all GPUs.</p>
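<p>One possible shape for that check (hypothetical helper, sketch
only): the range counts as fully mapped only when every granule bit
is set for every GPU that needs access.</p>
<pre>static bool
svm_range_fully_mapped(struct svm_range *prange)
{
	DECLARE_BITMAP(bitmap, MAX_GPU_INSTANCE);
	unsigned int nbits;
	uint32_t gpuidx;

	bitmap_or(bitmap, prange->bitmap_access, prange->bitmap_aip,
		  MAX_GPU_INSTANCE);
	nbits = svm_range_mapped_nbits(prange->npages, prange->granularity);
	for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) {
		/* any clear granule bit means this GPU is not fully mapped */
		if (!bitmap_full(prange->bitmap_mapped[gpuidx], nbits))
			return false;
	}
	return true;
}</pre>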
<p>Regards,</p>
<p>Philip<br>
</p>
<blockquote type="cite" cite="mid:5c6431f8-a7ac-4646-b282-8054bd94fbe9@amd.com">
<p>Regards,<br>
Felix</p>
<p><br>
</p>
<blockquote type="cite" cite="mid:20230929141115.10016-1-Philip.Yang@amd.com">
<pre class="moz-quote-pre" wrap=""> /* nothing to do */
} else if (node->start < start || node->last > last) {
/* node intersects the update range and its attributes
@@ -3587,7 +3731,7 @@ svm_range_set_attr(struct kfd_process *p, struct mm_struct *mm,
if (migrated && (!p->xnack_enabled ||
(prange->flags & KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED)) &&
- prange->mapped_to_gpu) {
+ svm_range_is_mapped(prange)) {
pr_debug("restore_work will update mappings of GPUs\n");
mutex_unlock(&prange->migrate_mutex);
continue;
@@ -3598,7 +3742,7 @@ svm_range_set_attr(struct kfd_process *p, struct mm_struct *mm,
continue;
}
- flush_tlb = !migrated && update_mapping && prange->mapped_to_gpu;
+ flush_tlb = !migrated && update_mapping && svm_range_is_mapped(prange);
r = svm_range_validate_and_map(mm, prange, MAX_GPU_INSTANCE,
true, flush_tlb);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
index d2e94d8fb4be..10c92c5e23a7 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
@@ -132,7 +132,7 @@ struct svm_range {
struct list_head child_list;
DECLARE_BITMAP(bitmap_access, MAX_GPU_INSTANCE);
DECLARE_BITMAP(bitmap_aip, MAX_GPU_INSTANCE);
- bool mapped_to_gpu;
+ unsigned long *bitmap_mapped[MAX_GPU_INSTANCE];
};
static inline void svm_range_lock(struct svm_range *prange)
@@ -163,6 +163,8 @@ static inline struct svm_range_bo *svm_range_bo_ref(struct svm_range_bo *svm_bo)
return NULL;
}
+#define svm_range_mapped_nbits(size, granularity) DIV_ROUND_UP((size), 1UL << (granularity))
+
int svm_range_list_init(struct kfd_process *p);
void svm_range_list_fini(struct kfd_process *p);
int svm_ioctl(struct kfd_process *p, enum kfd_ioctl_svm_op op, uint64_t start,
</pre>
</blockquote>
</blockquote>
</body>
</html>