[PATCH] drm/amdgpu: revert "XGMI pstate switch initial support"

Christian König ckoenig.leichtzumerken at gmail.com
Tue Mar 19 12:49:11 UTC 2019


Yeah, all that is perfectly fine.

The problem is Shaoyun didn't put this into the mapping code, but rather 
into the VM state machine. So this won't work at all (the counter and 
increment/decrement unbalanced and multiple times).

The correct place to add this is amdgpu_vm_bo_add/amdgpu_vm_bo_rmv.

Additional to that the approach with the counter doesn't work because 
you don't have a lock protecting the hw update itself. E.g. while 
powering down you can add a mapping which needs to power it up again and 
so powering down and powering up race with each other.

Regards,
Christian.

Am 19.03.19 um 13:42 schrieb Kuehling, Felix:
> We discussed a few different approaches before settling on this one.
>
> Maybe it needs some more background. XGMI links are quite power hungry.
> Being able to power them down improves performance for power-limited
> workloads that don't need XGMI. In machine learning, pretty much all
> workloads are power limited on our GPUs, so this is not just a
> theoretical thing. The problem is, how do you know whether you need
> XGMI? You need to know whether there are P2P memory mappings involving
> XGMI. So the natural place to put that is in the memory mapping code.
>
> If you do spot a race condition in the code, let's talk about how to fix it.
>
> Regards,
>     Felix
>
> On 3/19/2019 8:07 AM, Christian König wrote:
>> This reverts commit c9115f8904eef0f880d3b4f8306f553b1bb1c532.
>>
>> Adding this to the mapping is complete nonsense and the whole
>> implementation looks racy. This patch wasn't thoughtfully reviewed
>> and should be reverted for now.
>>
>> Signed-off-by: Christian König <christian.koenig at amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  3 ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  3 ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_object.h |  1 -
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c     | 29 +---------------------
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   | 15 -----------
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h   |  2 --
>>    6 files changed, 1 insertion(+), 52 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index b5720c1610e1..1db192150532 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -931,9 +931,6 @@ struct amdgpu_device {
>>    	int asic_reset_res;
>>    	struct work_struct		xgmi_reset_work;
>>    
>> -	/* counter of mapped memory through xgmi */
>> -	atomic_t			xgmi_map_counter;
>> -
>>    	bool                            in_baco_reset;
>>    };
>>    
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 964a4d3f1f43..206583707124 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -2018,9 +2018,6 @@ static void amdgpu_device_ip_late_init_func_handler(struct work_struct *work)
>>    	r = amdgpu_device_enable_mgpu_fan_boost();
>>    	if (r)
>>    		DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
>> -
>> -	/*set to low pstate by default */
>> -	amdgpu_xgmi_set_pstate(adev, 0);
>>    }
>>    
>>    static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>> index 6f176bbe4cf2..220a6a7b1bc1 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>> @@ -54,7 +54,6 @@ struct amdgpu_bo_va_mapping {
>>    	uint64_t			__subtree_last;
>>    	uint64_t			offset;
>>    	uint64_t			flags;
>> -	bool				is_xgmi;
>>    };
>>    
>>    /* User space allocated BO in a VM */
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> index c5230a9fb7f6..c8f0e4ca05fb 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> @@ -34,7 +34,6 @@
>>    #include "amdgpu_trace.h"
>>    #include "amdgpu_amdkfd.h"
>>    #include "amdgpu_gmc.h"
>> -#include "amdgpu_xgmi.h"
>>    
>>    /**
>>     * DOC: GPUVM
>> @@ -2022,9 +2021,8 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev,
>>    	struct ttm_mem_reg *mem;
>>    	struct drm_mm_node *nodes;
>>    	struct dma_fence *exclusive, **last_update;
>> -	struct amdgpu_device *bo_adev = adev;
>> -	bool is_xgmi = false;
>>    	uint64_t flags;
>> +	struct amdgpu_device *bo_adev = adev;
>>    	int r;
>>    
>>    	if (clear || !bo) {
>> @@ -2046,10 +2044,6 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev,
>>    	if (bo) {
>>    		flags = amdgpu_ttm_tt_pte_flags(adev, bo->tbo.ttm, mem);
>>    		bo_adev = amdgpu_ttm_adev(bo->tbo.bdev);
>> -		if (adev != bo_adev &&
>> -		    adev->gmc.xgmi.hive_id &&
>> -		    adev->gmc.xgmi.hive_id == bo_adev->gmc.xgmi.hive_id)
>> -			is_xgmi = true;
>>    	} else {
>>    		flags = 0x0;
>>    	}
>> @@ -2068,19 +2062,6 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev,
>>    	}
>>    
>>    	list_for_each_entry(mapping, &bo_va->invalids, list) {
>> -		if (mapping->is_xgmi != is_xgmi) {
>> -			if (is_xgmi) {
>> -				/* Adding an XGMI mapping to the PT */
>> -				if (atomic_inc_return(&adev->xgmi_map_counter) == 1)
>> -					amdgpu_xgmi_set_pstate(adev, 1);
>> -			} else {
>> -				/* Removing an XGMI mapping from the PT */
>> -				if (atomic_dec_return(&adev->xgmi_map_counter) == 0)
>> -					amdgpu_xgmi_set_pstate(adev, 0);
>> -			}
>> -			mapping->is_xgmi = is_xgmi;
>> -		}
>> -
>>    		r = amdgpu_vm_bo_split_mapping(adev, exclusive, pages_addr, vm,
>>    					       mapping, flags, bo_adev, nodes,
>>    					       last_update);
>> @@ -2298,13 +2279,6 @@ int amdgpu_vm_clear_freed(struct amdgpu_device *adev,
>>    		r = amdgpu_vm_bo_update_mapping(adev, NULL, NULL, vm,
>>    						mapping->start, mapping->last,
>>    						init_pte_value, 0, &f);
>> -
>> -		if (mapping->is_xgmi) {
>> -			/* Removing an XGMI mapping from the PT */
>> -			if (atomic_dec_return(&adev->xgmi_map_counter) == 0)
>> -				amdgpu_xgmi_set_pstate(adev, 0);
>> -		}
>> -
>>    		amdgpu_vm_free_mapping(adev, vm, mapping, f);
>>    		if (r) {
>>    			dma_fence_put(f);
>> @@ -2501,7 +2475,6 @@ int amdgpu_vm_bo_map(struct amdgpu_device *adev,
>>    	mapping->last = eaddr;
>>    	mapping->offset = offset;
>>    	mapping->flags = flags;
>> -	mapping->is_xgmi = false;
>>    
>>    	amdgpu_vm_bo_insert_map(adev, bo_va, mapping);
>>    
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>> index 807440d3edff..fcc4b05c745c 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>> @@ -200,7 +200,6 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lo
>>    
>>    	if (lock)
>>    		mutex_lock(&tmp->hive_lock);
>> -	tmp->pstate = -1;
>>    
>>    	mutex_unlock(&xgmi_mutex);
>>    
>> @@ -322,17 +321,3 @@ void amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
>>    		mutex_unlock(&hive->hive_lock);
>>    	}
>>    }
>> -
>> -int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate)
>> -{
>> -	int ret = 0;
>> -	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
>> -
>> -	if (!hive)
>> -		return 0;
>> -
>> -	if (hive->pstate == pstate)
>> -		return 0;
>> -	/* Todo : sent the message to SMU for pstate change */
>> -	return ret;
>> -}
>> \ No newline at end of file
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
>> index 7e1710fcbef2..24a3b0362f98 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
>> @@ -33,13 +33,11 @@ struct amdgpu_hive_info {
>>    	struct kobject *kobj;
>>    	struct device_attribute dev_attr;
>>    	struct amdgpu_device *adev;
>> -	int pstate; /*0 -- low , 1 -- high , -1 unknown*/
>>    };
>>    
>>    struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock);
>>    int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev);
>>    int amdgpu_xgmi_add_device(struct amdgpu_device *adev);
>>    void amdgpu_xgmi_remove_device(struct amdgpu_device *adev);
>> -int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate);
>>    
>>    #endif



More information about the amd-gfx mailing list