[PATCH v4 1/1] drm/amdkfd: Track unified memory when switching xnack mode
Felix Kuehling
felix.kuehling at amd.com
Wed Sep 28 15:59:42 UTC 2022
Am 2022-09-28 um 11:55 schrieb Philip Yang:
>
> On 2022-09-27 14:58, Felix Kuehling wrote:
>> Am 2022-09-27 um 11:43 schrieb Philip Yang:
>>> Unified memory usage with xnack off is tracked to avoid oversubscribe
>>> system memory, with xnack on, we don't track unified memory usage to
>>> allow memory oversubscribe. When switching xnack mode from off to on,
>>> subsequent free ranges allocated with xnack off will not unreserve
>>> memory. When switching xnack mode from on to off, subsequent free
>>> ranges
>>> allocated with xnack on will unreserve memory. Both cases cause memory
>>> accounting unbalanced.
>>>
>>> When switching xnack mode from on to off, need reserve already
>>> allocated
>>> svm range memory. When switching xnack mode from off to on, need
>>> unreserve already allocated svm range memory.
>>>
>>> v4: Handle reservation memory failure
>>> v3: Handle switching xnack mode race with svm_range_deferred_list_work
>>> v2: Handle both switching xnack from on to off and from off to on cases
>>>
>>> Signed-off-by: Philip Yang <Philip.Yang at amd.com>
>>> ---
>>> drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 26 ++++++++++----
>>> drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 44
>>> +++++++++++++++++++++++-
>>> drivers/gpu/drm/amd/amdkfd/kfd_svm.h | 2 +-
>>> 3 files changed, 64 insertions(+), 8 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>> index 56f7307c21d2..5feaba6a77de 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>> @@ -1584,6 +1584,8 @@ static int kfd_ioctl_smi_events(struct file
>>> *filep,
>>> return kfd_smi_event_open(pdd->dev, &args->anon_fd);
>>> }
>>> +#if IS_ENABLED(CONFIG_HSA_AMD_SVM)
>>> +
>>> static int kfd_ioctl_set_xnack_mode(struct file *filep,
>>> struct kfd_process *p, void *data)
>>> {
>>> @@ -1594,22 +1596,29 @@ static int kfd_ioctl_set_xnack_mode(struct
>>> file *filep,
>>> if (args->xnack_enabled >= 0) {
>>> if (!list_empty(&p->pqm.queues)) {
>>> pr_debug("Process has user queues running\n");
>>> - mutex_unlock(&p->mutex);
>>> - return -EBUSY;
>>> + r = -EBUSY;
>>> + goto out_unlock;
>>> }
>>> - if (args->xnack_enabled && !kfd_process_xnack_mode(p, true))
>>> +
>>> + if (p->xnack_enabled == args->xnack_enabled)
>>> + goto out_unlock;
>>> +
>>> + if (args->xnack_enabled && !kfd_process_xnack_mode(p, true)) {
>>> r = -EPERM;
>>> - else
>>> - p->xnack_enabled = args->xnack_enabled;
>>> + goto out_unlock;
>>> + }
>>> +
>>> + r = svm_range_switch_xnack_reserve_mem(p,
>>> args->xnack_enabled);
>>> } else {
>>> args->xnack_enabled = p->xnack_enabled;
>>> }
>>> +
>>> +out_unlock:
>>> mutex_unlock(&p->mutex);
>>> return r;
>>> }
>>> -#if IS_ENABLED(CONFIG_HSA_AMD_SVM)
>>> static int kfd_ioctl_svm(struct file *filep, struct kfd_process
>>> *p, void *data)
>>> {
>>> struct kfd_ioctl_svm_args *args = data;
>>> @@ -1629,6 +1638,11 @@ static int kfd_ioctl_svm(struct file *filep,
>>> struct kfd_process *p, void *data)
>>> return r;
>>> }
>>> #else
>>> +static int kfd_ioctl_set_xnack_mode(struct file *filep,
>>> + struct kfd_process *p, void *data)
>>> +{
>>> + return -EPERM;
>>> +}
>>> static int kfd_ioctl_svm(struct file *filep, struct kfd_process
>>> *p, void *data)
>>> {
>>> return -EPERM;
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
>>> index cf5b4005534c..13d2867faa0c 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
>>> @@ -278,7 +278,7 @@ static void svm_range_free(struct svm_range
>>> *prange, bool update_mem_usage)
>>> svm_range_free_dma_mappings(prange);
>>> if (update_mem_usage && !p->xnack_enabled) {
>>> - pr_debug("unreserve mem limit: %lld\n", size);
>>> + pr_debug("unreserve prange 0x%p size: 0x%llx\n", prange,
>>> size);
>>> amdgpu_amdkfd_unreserve_mem_limit(NULL, size,
>>> KFD_IOC_ALLOC_MEM_FLAGS_USERPTR);
>>> }
>>> @@ -2956,6 +2956,48 @@ svm_range_restore_pages(struct amdgpu_device
>>> *adev, unsigned int pasid,
>>> return r;
>>> }
>>> +int
>>> +svm_range_switch_xnack_reserve_mem(struct kfd_process *p, bool
>>> xnack_enabled)
>>> +{
>>> + struct svm_range *prange;
>>> + uint64_t reserved_size = 0;
>>> + uint64_t size;
>>> + int r = 0;
>>> +
>>> + pr_debug("switching xnack from %d to %d\n", p->xnack_enabled,
>>> xnack_enabled);
>>> +
>>> + mutex_lock(&p->svms.lock);
>>> +
>>> + /* Change xnack mode must be inside svms lock, to avoid race with
>>> + * svm_range_deferred_list_work unreserve memory in parallel.
>>> + */
>>> + p->xnack_enabled = xnack_enabled;
>>
>> This should only be set on a successful return.
>>
>>
>>> +
>>> + list_for_each_entry(prange, &p->svms.list, list) {
>>> + size = (prange->last - prange->start + 1) << PAGE_SHIFT;
>>> + pr_debug("svms 0x%p %s prange 0x%p size 0x%llx\n", &p->svms,
>>> + xnack_enabled ? "unreserve" : "reserve", prange, size);
>>> +
>>> + if (xnack_enabled) {
>>> + amdgpu_amdkfd_unreserve_mem_limit(NULL, size,
>>> + KFD_IOC_ALLOC_MEM_FLAGS_USERPTR);
>>> + } else {
>>> + r = amdgpu_amdkfd_reserve_mem_limit(NULL, size,
>>> + KFD_IOC_ALLOC_MEM_FLAGS_USERPTR);
>>> + if (r)
>>> + break;
>>> + reserved_size += size;
>>> + }
>>> + }
>>> +
>>> + if (r)
>>> + amdgpu_amdkfd_unreserve_mem_limit(NULL, reserved_size,
>>> + KFD_IOC_ALLOC_MEM_FLAGS_USERPTR);
>>> +
>>> + mutex_unlock(&p->svms.lock);
>>> + return r;
>>> +}
>>> +
>>> void svm_range_list_fini(struct kfd_process *p)
>>> {
>>> struct svm_range *prange;
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
>>> index 012c53729516..e58690376e17 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
>>> @@ -203,11 +203,11 @@ void svm_range_list_lock_and_flush_work(struct
>>> svm_range_list *svms, struct mm_s
>>> void svm_range_bo_unref_async(struct svm_range_bo *svm_bo);
>>> void svm_range_set_max_pages(struct amdgpu_device *adev);
>>> +int svm_range_switch_xnack_reserve_mem(struct kfd_process *p, bool
>>> xnack_enabled);
>>> #else
>>> struct kfd_process;
>>> -
>> Unrelated whitespace change.
>>
>> With these two issues fixed, this patch is
>
> Thanks for the review. I realize we should handle prange->child_list
> when switching xnack mode and reserve memory too
Alternatively, you could flush the deferred work before doing the memory
accounting. Maybe that would be a more general solution.
Sounds like there are some more interesting changes happening to your
patch. In that case I'd like to review the final version.
Thanks,
Felix
> , as there is a small change in sys/kernel/debug/kfd/mem_limit after
> overnight test. I will fix this and the two issues you pointed out,
> then push the patch.
>
> Regards,
>
> Philip
>
>>
>> Reviewed-by: Felix Kuehling <Felix.Kuehling at amd.com>
>>
>>
>>> static inline int svm_range_list_init(struct kfd_process *p)
>>> {
>>> return 0;
>>
More information about the amd-gfx
mailing list