[PATCH v4 1/1] drm/amdkfd: Track unified memory when switching xnack mode

Philip Yang yangp at amd.com
Wed Sep 28 15:55:43 UTC 2022


On 2022-09-27 14:58, Felix Kuehling wrote:
> Am 2022-09-27 um 11:43 schrieb Philip Yang:
>> Unified memory usage with xnack off is tracked to avoid oversubscribe
>> system memory, with xnack on, we don't track unified memory usage to
>> allow memory oversubscribe. When switching xnack mode from off to on,
>> subsequent free ranges allocated with xnack off will not unreserve
>> memory. When switching xnack mode from on to off, subsequent free ranges
>> allocated with xnack on will unreserve memory. Both cases cause memory
>> accounting unbalanced.
>>
>> When switching xnack mode from on to off, need reserve already allocated
>> svm range memory. When switching xnack mode from off to on, need
>> unreserve already allocated svm range memory.
>>
>> v4: Handle reservation memory failure
>> v3: Handle switching xnack mode race with svm_range_deferred_list_work
>> v2: Handle both switching xnack from on to off and from off to on cases
>>
>> Signed-off-by: Philip Yang <Philip.Yang at amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 26 ++++++++++----
>>   drivers/gpu/drm/amd/amdkfd/kfd_svm.c     | 44 +++++++++++++++++++++++-
>>   drivers/gpu/drm/amd/amdkfd/kfd_svm.h     |  2 +-
>>   3 files changed, 64 insertions(+), 8 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> index 56f7307c21d2..5feaba6a77de 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> @@ -1584,6 +1584,8 @@ static int kfd_ioctl_smi_events(struct file 
>> *filep,
>>       return kfd_smi_event_open(pdd->dev, &args->anon_fd);
>>   }
>>   +#if IS_ENABLED(CONFIG_HSA_AMD_SVM)
>> +
>>   static int kfd_ioctl_set_xnack_mode(struct file *filep,
>>                       struct kfd_process *p, void *data)
>>   {
>> @@ -1594,22 +1596,29 @@ static int kfd_ioctl_set_xnack_mode(struct 
>> file *filep,
>>       if (args->xnack_enabled >= 0) {
>>           if (!list_empty(&p->pqm.queues)) {
>>               pr_debug("Process has user queues running\n");
>> -            mutex_unlock(&p->mutex);
>> -            return -EBUSY;
>> +            r = -EBUSY;
>> +            goto out_unlock;
>>           }
>> -        if (args->xnack_enabled && !kfd_process_xnack_mode(p, true))
>> +
>> +        if (p->xnack_enabled == args->xnack_enabled)
>> +            goto out_unlock;
>> +
>> +        if (args->xnack_enabled && !kfd_process_xnack_mode(p, true)) {
>>               r = -EPERM;
>> -        else
>> -            p->xnack_enabled = args->xnack_enabled;
>> +            goto out_unlock;
>> +        }
>> +
>> +        r = svm_range_switch_xnack_reserve_mem(p, args->xnack_enabled);
>>       } else {
>>           args->xnack_enabled = p->xnack_enabled;
>>       }
>> +
>> +out_unlock:
>>       mutex_unlock(&p->mutex);
>>         return r;
>>   }
>>   -#if IS_ENABLED(CONFIG_HSA_AMD_SVM)
>>   static int kfd_ioctl_svm(struct file *filep, struct kfd_process *p, 
>> void *data)
>>   {
>>       struct kfd_ioctl_svm_args *args = data;
>> @@ -1629,6 +1638,11 @@ static int kfd_ioctl_svm(struct file *filep, 
>> struct kfd_process *p, void *data)
>>       return r;
>>   }
>>   #else
>> +static int kfd_ioctl_set_xnack_mode(struct file *filep,
>> +                    struct kfd_process *p, void *data)
>> +{
>> +    return -EPERM;
>> +}
>>   static int kfd_ioctl_svm(struct file *filep, struct kfd_process *p, 
>> void *data)
>>   {
>>       return -EPERM;
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
>> index cf5b4005534c..13d2867faa0c 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
>> @@ -278,7 +278,7 @@ static void svm_range_free(struct svm_range 
>> *prange, bool update_mem_usage)
>>       svm_range_free_dma_mappings(prange);
>>         if (update_mem_usage && !p->xnack_enabled) {
>> -        pr_debug("unreserve mem limit: %lld\n", size);
>> +        pr_debug("unreserve prange 0x%p size: 0x%llx\n", prange, size);
>>           amdgpu_amdkfd_unreserve_mem_limit(NULL, size,
>>                       KFD_IOC_ALLOC_MEM_FLAGS_USERPTR);
>>       }
>> @@ -2956,6 +2956,48 @@ svm_range_restore_pages(struct amdgpu_device 
>> *adev, unsigned int pasid,
>>       return r;
>>   }
>>   +int
>> +svm_range_switch_xnack_reserve_mem(struct kfd_process *p, bool 
>> xnack_enabled)
>> +{
>> +    struct svm_range *prange;
>> +    uint64_t reserved_size = 0;
>> +    uint64_t size;
>> +    int r = 0;
>> +
>> +    pr_debug("switching xnack from %d to %d\n", p->xnack_enabled, 
>> xnack_enabled);
>> +
>> +    mutex_lock(&p->svms.lock);
>> +
>> +    /* Change xnack mode must be inside svms lock, to avoid race with
>> +     * svm_range_deferred_list_work unreserve memory in parallel.
>> +     */
>> +    p->xnack_enabled = xnack_enabled;
>
> This should only be set on a successful return.
>
>
>> +
>> +    list_for_each_entry(prange, &p->svms.list, list) {
>> +        size = (prange->last - prange->start + 1) << PAGE_SHIFT;
>> +        pr_debug("svms 0x%p %s prange 0x%p size 0x%llx\n", &p->svms,
>> +             xnack_enabled ? "unreserve" : "reserve", prange, size);
>> +
>> +        if (xnack_enabled) {
>> +            amdgpu_amdkfd_unreserve_mem_limit(NULL, size,
>> +                        KFD_IOC_ALLOC_MEM_FLAGS_USERPTR);
>> +        } else {
>> +            r = amdgpu_amdkfd_reserve_mem_limit(NULL, size,
>> +                        KFD_IOC_ALLOC_MEM_FLAGS_USERPTR);
>> +            if (r)
>> +                break;
>> +            reserved_size += size;
>> +        }
>> +    }
>> +
>> +    if (r)
>> +        amdgpu_amdkfd_unreserve_mem_limit(NULL, reserved_size,
>> +                        KFD_IOC_ALLOC_MEM_FLAGS_USERPTR);
>> +
>> +    mutex_unlock(&p->svms.lock);
>> +    return r;
>> +}
>> +
>>   void svm_range_list_fini(struct kfd_process *p)
>>   {
>>       struct svm_range *prange;
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
>> index 012c53729516..e58690376e17 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
>> @@ -203,11 +203,11 @@ void svm_range_list_lock_and_flush_work(struct 
>> svm_range_list *svms, struct mm_s
>>   void svm_range_bo_unref_async(struct svm_range_bo *svm_bo);
>>     void svm_range_set_max_pages(struct amdgpu_device *adev);
>> +int svm_range_switch_xnack_reserve_mem(struct kfd_process *p, bool 
>> xnack_enabled);
>>     #else
>>     struct kfd_process;
>> -
> Unrelated whitespace change.
>
> With these two issues fixed, this patch is

Thanks for the review. I realize we should handle prange->child_list 
when switching xnack mode and reserve memory too, as there is a small 
change in sys/kernel/debug/kfd/mem_limit after overnight test. I will 
fix this and the two issues you pointed out, then push the patch.

Regards,

Philip

>
> Reviewed-by: Felix Kuehling <Felix.Kuehling at amd.com>
>
>
>>   static inline int svm_range_list_init(struct kfd_process *p)
>>   {
>>       return 0;
>


More information about the amd-gfx mailing list