[PATCH v4 1/1] drm/amdkfd: Track unified memory when switching xnack mode
Philip Yang
yangp at amd.com
Wed Sep 28 15:55:43 UTC 2022
On 2022-09-27 14:58, Felix Kuehling wrote:
> Am 2022-09-27 um 11:43 schrieb Philip Yang:
>> Unified memory usage with xnack off is tracked to avoid oversubscribe
>> system memory, with xnack on, we don't track unified memory usage to
>> allow memory oversubscribe. When switching xnack mode from off to on,
>> subsequent free ranges allocated with xnack off will not unreserve
>> memory. When switching xnack mode from on to off, subsequent free ranges
>> allocated with xnack on will unreserve memory. Both cases cause memory
>> accounting unbalanced.
>>
>> When switching xnack mode from on to off, need reserve already allocated
>> svm range memory. When switching xnack mode from off to on, need
>> unreserve already allocated svm range memory.
>>
>> v4: Handle reservation memory failure
>> v3: Handle switching xnack mode race with svm_range_deferred_list_work
>> v2: Handle both switching xnack from on to off and from off to on cases
>>
>> Signed-off-by: Philip Yang <Philip.Yang at amd.com>
>> ---
>> drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 26 ++++++++++----
>> drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 44 +++++++++++++++++++++++-
>> drivers/gpu/drm/amd/amdkfd/kfd_svm.h | 2 +-
>> 3 files changed, 64 insertions(+), 8 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> index 56f7307c21d2..5feaba6a77de 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> @@ -1584,6 +1584,8 @@ static int kfd_ioctl_smi_events(struct file
>> *filep,
>> return kfd_smi_event_open(pdd->dev, &args->anon_fd);
>> }
>> +#if IS_ENABLED(CONFIG_HSA_AMD_SVM)
>> +
>> static int kfd_ioctl_set_xnack_mode(struct file *filep,
>> struct kfd_process *p, void *data)
>> {
>> @@ -1594,22 +1596,29 @@ static int kfd_ioctl_set_xnack_mode(struct
>> file *filep,
>> if (args->xnack_enabled >= 0) {
>> if (!list_empty(&p->pqm.queues)) {
>> pr_debug("Process has user queues running\n");
>> - mutex_unlock(&p->mutex);
>> - return -EBUSY;
>> + r = -EBUSY;
>> + goto out_unlock;
>> }
>> - if (args->xnack_enabled && !kfd_process_xnack_mode(p, true))
>> +
>> + if (p->xnack_enabled == args->xnack_enabled)
>> + goto out_unlock;
>> +
>> + if (args->xnack_enabled && !kfd_process_xnack_mode(p, true)) {
>> r = -EPERM;
>> - else
>> - p->xnack_enabled = args->xnack_enabled;
>> + goto out_unlock;
>> + }
>> +
>> + r = svm_range_switch_xnack_reserve_mem(p, args->xnack_enabled);
>> } else {
>> args->xnack_enabled = p->xnack_enabled;
>> }
>> +
>> +out_unlock:
>> mutex_unlock(&p->mutex);
>> return r;
>> }
>> -#if IS_ENABLED(CONFIG_HSA_AMD_SVM)
>> static int kfd_ioctl_svm(struct file *filep, struct kfd_process *p,
>> void *data)
>> {
>> struct kfd_ioctl_svm_args *args = data;
>> @@ -1629,6 +1638,11 @@ static int kfd_ioctl_svm(struct file *filep,
>> struct kfd_process *p, void *data)
>> return r;
>> }
>> #else
>> +static int kfd_ioctl_set_xnack_mode(struct file *filep,
>> + struct kfd_process *p, void *data)
>> +{
>> + return -EPERM;
>> +}
>> static int kfd_ioctl_svm(struct file *filep, struct kfd_process *p,
>> void *data)
>> {
>> return -EPERM;
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
>> index cf5b4005534c..13d2867faa0c 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
>> @@ -278,7 +278,7 @@ static void svm_range_free(struct svm_range
>> *prange, bool update_mem_usage)
>> svm_range_free_dma_mappings(prange);
>> if (update_mem_usage && !p->xnack_enabled) {
>> - pr_debug("unreserve mem limit: %lld\n", size);
>> + pr_debug("unreserve prange 0x%p size: 0x%llx\n", prange, size);
>> amdgpu_amdkfd_unreserve_mem_limit(NULL, size,
>> KFD_IOC_ALLOC_MEM_FLAGS_USERPTR);
>> }
>> @@ -2956,6 +2956,48 @@ svm_range_restore_pages(struct amdgpu_device
>> *adev, unsigned int pasid,
>> return r;
>> }
>> +int
>> +svm_range_switch_xnack_reserve_mem(struct kfd_process *p, bool
>> xnack_enabled)
>> +{
>> + struct svm_range *prange;
>> + uint64_t reserved_size = 0;
>> + uint64_t size;
>> + int r = 0;
>> +
>> + pr_debug("switching xnack from %d to %d\n", p->xnack_enabled,
>> xnack_enabled);
>> +
>> + mutex_lock(&p->svms.lock);
>> +
>> + /* Change xnack mode must be inside svms lock, to avoid race with
>> + * svm_range_deferred_list_work unreserve memory in parallel.
>> + */
>> + p->xnack_enabled = xnack_enabled;
>
> This should only be set on a successful return.
>
>
>> +
>> + list_for_each_entry(prange, &p->svms.list, list) {
>> + size = (prange->last - prange->start + 1) << PAGE_SHIFT;
>> + pr_debug("svms 0x%p %s prange 0x%p size 0x%llx\n", &p->svms,
>> + xnack_enabled ? "unreserve" : "reserve", prange, size);
>> +
>> + if (xnack_enabled) {
>> + amdgpu_amdkfd_unreserve_mem_limit(NULL, size,
>> + KFD_IOC_ALLOC_MEM_FLAGS_USERPTR);
>> + } else {
>> + r = amdgpu_amdkfd_reserve_mem_limit(NULL, size,
>> + KFD_IOC_ALLOC_MEM_FLAGS_USERPTR);
>> + if (r)
>> + break;
>> + reserved_size += size;
>> + }
>> + }
>> +
>> + if (r)
>> + amdgpu_amdkfd_unreserve_mem_limit(NULL, reserved_size,
>> + KFD_IOC_ALLOC_MEM_FLAGS_USERPTR);
>> +
>> + mutex_unlock(&p->svms.lock);
>> + return r;
>> +}
>> +
>> void svm_range_list_fini(struct kfd_process *p)
>> {
>> struct svm_range *prange;
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
>> b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
>> index 012c53729516..e58690376e17 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
>> @@ -203,11 +203,11 @@ void svm_range_list_lock_and_flush_work(struct
>> svm_range_list *svms, struct mm_s
>> void svm_range_bo_unref_async(struct svm_range_bo *svm_bo);
>> void svm_range_set_max_pages(struct amdgpu_device *adev);
>> +int svm_range_switch_xnack_reserve_mem(struct kfd_process *p, bool
>> xnack_enabled);
>> #else
>> struct kfd_process;
>> -
> Unrelated whitespace change.
>
> With these two issues fixed, this patch is
Thanks for the review. I realize we should handle prange->child_list
when switching xnack mode and reserve memory too, as there is a small
change in sys/kernel/debug/kfd/mem_limit after overnight test. I will
fix this and the two issues you pointed out, then push the patch.
Regards,
Philip
>
> Reviewed-by: Felix Kuehling <Felix.Kuehling at amd.com>
>
>
>> static inline int svm_range_list_init(struct kfd_process *p)
>> {
>> return 0;
>
More information about the amd-gfx
mailing list