[PATCH] drm/amdgpu: XGMI pstate switch initial support

Christian König ckoenig.leichtzumerken at gmail.com
Tue Mar 26 16:55:38 UTC 2019


The problem is that userspace could add it first and then change the domain.

Pretty unlikely though,
Christian.
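
To make the ordering concrete, here is a rough userspace sketch of that
sequence (not part of the patch; drm_fd and bo_handle are hypothetical and
error handling is omitted). The BO is added to the VM while its preferred
domain is still GTT, and only afterwards is it retargeted to VRAM, so a
preferred_domain check at bo_add time would miss it:

#include <stdint.h>
#include <sys/ioctl.h>
#include <drm/amdgpu_drm.h>

static void change_placement_after_map(int drm_fd, uint32_t bo_handle)
{
	/* 1) The BO was created with AMDGPU_GEM_DOMAIN_GTT and has already
	 *    been mapped into the VM, i.e. amdgpu_vm_bo_add() has run and a
	 *    preferred_domain check there saw a GTT-only BO.
	 */

	/* 2) Userspace now changes the preferred placement to VRAM: */
	struct drm_amdgpu_gem_op args = {
		.handle = bo_handle,
		.op     = AMDGPU_GEM_OP_SET_PLACEMENT,
		.value  = AMDGPU_GEM_DOMAIN_VRAM,
	};
	ioctl(drm_fd, DRM_IOCTL_AMDGPU_GEM_OP, &args);

	/* 3) The BO can migrate to VRAM and be reached over XGMI by a peer
	 *    GPU, but the pstate counter was never bumped at bo_add time.
	 */
}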

Am 26.03.19 um 14:15 schrieb Liu, Shaoyun:
> I think in real usage (TensorFlow, for example) it is rare that only
> system memory (no VRAM) will be mapped for peer access.
> Anyway, how about adding a preferred_domain check for XGMI? I think even
> if the user uses the ioctl to change the preferred_domain, bo_add should
> still be called before the real mapping.
>
> Regards
> Shaoyun.liu
> ------------------------------------------------------------------------
> *From:* amd-gfx <amd-gfx-bounces at lists.freedesktop.org> on behalf of 
> Kuehling, Felix <Felix.Kuehling at amd.com>
> *Sent:* March 25, 2019 6:28:32 PM
> *To:* Liu, Shaoyun; amd-gfx at lists.freedesktop.org
> *Subject:* Re: [PATCH] drm/amdgpu: XGMI pstate switch initial support
> I don't see any check for the memory type. As far as I can tell you'll
> power up XGMI even for system memory mappings. See inline.
>
> On 2019-03-22 3:28 p.m., Liu, Shaoyun wrote:
> > The driver votes for a low-to-high pstate switch whenever there is an
> > outstanding XGMI mapping request, and votes for a high-to-low pstate
> > switch when all outstanding XGMI mappings are terminated.
> >
> > Change-Id: I197501f853c47f844055c0e28c0ac00a1ff06607
> > Signed-off-by: shaoyunl <shaoyun.liu at amd.com>
> > ---
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  4 ++++
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_object.h |  2 ++
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c     | 21 +++++++++++++++++++++
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h     |  4 ++++
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   | 16 +++++++++++++++-
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h   | 10 ++++++++++
> >   6 files changed, 56 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > index ec9562d..c4c61e9 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > @@ -2018,6 +2018,10 @@ static void amdgpu_device_ip_late_init_func_handler(struct work_struct *work)
> >        r = amdgpu_device_enable_mgpu_fan_boost();
> >        if (r)
> >                DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
> > +
> > +     /* set to low pstate by default */
> > +     amdgpu_xgmi_set_pstate(adev, 0);
> > +
> >   }
> >
> >   static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> > index 220a6a7..c430e82 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> > @@ -72,6 +72,8 @@ struct amdgpu_bo_va {
> >
> >        /* If the mappings are cleared or filled */
> >        bool                            cleared;
> > +
> > +     bool                            is_xgmi;
> >   };
> >
> >   struct amdgpu_bo {
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> > index 729da1c..a7247d5 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> > @@ -34,6 +34,7 @@
> >   #include "amdgpu_trace.h"
> >   #include "amdgpu_amdkfd.h"
> >   #include "amdgpu_gmc.h"
> > +#include "amdgpu_xgmi.h"
> >
> >   /**
> >    * DOC: GPUVM
> > @@ -2072,6 +2073,15 @@ struct amdgpu_bo_va *amdgpu_vm_bo_add(struct amdgpu_device *adev,
> >        INIT_LIST_HEAD(&bo_va->valids);
> >        INIT_LIST_HEAD(&bo_va->invalids);
> >
> > +     if (bo && amdgpu_xgmi_same_hive(adev, amdgpu_ttm_adev(bo->tbo.bdev))) {
> > +             bo_va->is_xgmi = true;
>
> You're setting this to true even for system memory BOs that don't
> involve XGMI mappings. That means you'll power up XGMI unnecessarily in
> many cases because KFD processes always have system memory mappings that
> are mapped to all GPUs (e.g. the signal page).
>
> Regards,
>    Felix
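
For reference, a minimal sketch of the kind of memory-type check being asked
for here, assuming it lives in amdgpu_vm_bo_add() and keys off the BO's
preferred_domains (this is not the posted patch, just an illustration on top
of it):

	/* Only count the mapping as a potential XGMI access when the BO
	 * belongs to a different GPU in the same hive *and* can actually be
	 * placed in VRAM.  A GTT-only BO (e.g. the KFD signal page) is
	 * reached through the system memory path, not over XGMI.
	 */
	if (bo && amdgpu_xgmi_same_hive(adev, amdgpu_ttm_adev(bo->tbo.bdev)) &&
	    (bo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM)) {
		bo_va->is_xgmi = true;
		mutex_lock(&adev->vm_manager.lock_pstate);
		if (++adev->vm_manager.xgmi_map_counter == 1)
			amdgpu_xgmi_set_pstate(adev, 1);
		mutex_unlock(&adev->vm_manager.lock_pstate);
	}

As noted at the top of this mail, userspace can still change the preferred
domain after bo_add, so a check like this narrows the window but does not
fully close it.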
>
>
> > +             mutex_lock(&adev->vm_manager.lock_pstate);
> > +             /* Power up XGMI if it can be potentially used */
> > +             if (++adev->vm_manager.xgmi_map_counter == 1)
> > +                     amdgpu_xgmi_set_pstate(adev, 1);
> > +             mutex_unlock(&adev->vm_manager.lock_pstate);
> > +     }
> > +
> >        return bo_va;
> >   }
> >
> > @@ -2490,6 +2500,14 @@ void amdgpu_vm_bo_rmv(struct amdgpu_device *adev,
> >        }
> >
> >        dma_fence_put(bo_va->last_pt_update);
> > +
> > +     if (bo && bo_va->is_xgmi) {
> > +             mutex_lock(&adev->vm_manager.lock_pstate);
> > +             if (--adev->vm_manager.xgmi_map_counter == 0)
> > +                     amdgpu_xgmi_set_pstate(adev, 0);
> > +             mutex_unlock(&adev->vm_manager.lock_pstate);
> > +     }
> > +
> >        kfree(bo_va);
> >   }
> >
> > @@ -2997,6 +3015,9 @@ void amdgpu_vm_manager_init(struct amdgpu_device *adev)
> >
> >        idr_init(&adev->vm_manager.pasid_idr);
> >        spin_lock_init(&adev->vm_manager.pasid_lock);
> > +
> > +     adev->vm_manager.xgmi_map_counter = 0;
> > +     mutex_init(&adev->vm_manager.lock_pstate);
> >   }
> >
> >   /**
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> > index 520122b..f586b38 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> > @@ -324,6 +324,10 @@ struct amdgpu_vm_manager {
> >         */
> >        struct idr pasid_idr;
> >        spinlock_t pasid_lock;
> > +
> > +     /* counter of mapped memory through xgmi */
> > +     uint32_t xgmi_map_counter;
> > +     struct mutex lock_pstate;
> >   };
> >
> >   #define amdgpu_vm_copy_pte(adev, ib, pe, src, count) ((adev)->vm_manager.vm_pte_funcs->copy_pte((ib), (pe), (src), (count)))
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> > index fcc4b05..3368347 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> > @@ -200,12 +200,26 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lo
> >
> >        if (lock)
> >                mutex_lock(&tmp->hive_lock);
> > -
> > +     tmp->pstate = -1;
> >        mutex_unlock(&xgmi_mutex);
> >
> >        return tmp;
> >   }
> >
> > +int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate)
> > +{
> > +     int ret = 0;
> > +     struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
> > +
> > +     if (!hive)
> > +             return 0;
> > +
> > +     if (hive->pstate == pstate)
> > +             return 0;
> > +     /* TODO: send the message to SMU for pstate change */
> > +     return ret;
> > +}
> > +
> >   int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev)
> >   {
> >        int ret = -EINVAL;
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> > index 24a3b03..3e9c91e 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> > @@ -33,11 +33,21 @@ struct amdgpu_hive_info {
> >        struct kobject *kobj;
> >        struct device_attribute dev_attr;
> >        struct amdgpu_device *adev;
> > +     int pstate; /* 0 -- low, 1 -- high, -1 -- unknown */
> >   };
> >
> >   struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock);
> >   int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev);
> >   int amdgpu_xgmi_add_device(struct amdgpu_device *adev);
> >   void amdgpu_xgmi_remove_device(struct amdgpu_device *adev);
> > +int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate);
> > +
> > +static inline bool amdgpu_xgmi_same_hive(struct amdgpu_device *adev,
> > +             struct amdgpu_device *bo_adev)
> > +{
> > +     return (adev != bo_adev &&
> > +             adev->gmc.xgmi.hive_id &&
> > +             adev->gmc.xgmi.hive_id == bo_adev->gmc.xgmi.hive_id);
> > +}
> >
> >   #endif
