[PATCH 2/2] drm/amdkfd: optimize memory mapping latency

Felix Kuehling felix.kuehling at amd.com
Tue Jun 1 16:16:22 UTC 2021


Am 2021-06-01 um 12:05 p.m. schrieb Felix Kuehling:
> Am 2021-05-29 um 6:51 p.m. schrieb Eric Huang:
>> 1. conditionally flush TLBs after map.
>> 2. add heavy-weight TLB flushing after unmap.
>>
>> Signed-off-by: Eric Huang <jinhuieric.huang at amd.com>
>> ---
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |  1 +
>>  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  |  2 ++
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_object.h    |  1 +
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c        |  2 +-
>>  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      | 24 +++++++++++--------
>>  .../drm/amd/amdkfd/kfd_device_queue_manager.c |  6 ++---
>>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  2 +-
>>  drivers/gpu/drm/amd/amdkfd/kfd_process.c      |  4 ++--
>>  8 files changed, 25 insertions(+), 17 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> index 2560977760b3..997258c24ef2 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> @@ -85,6 +85,7 @@ struct kgd_mem {
>>  
>>  	bool aql_queue;
>>  	bool is_imported;
>> +	bool table_freed;
>>  };
>>  
>>  /* KFD Memory Eviction */
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> index 385c33675227..8ac0d849fd3f 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> @@ -1132,6 +1132,8 @@ static int update_gpuvm_pte(struct kgd_mem *mem,
>>  		return ret;
>>  	}
>>  
>> +	mem->table_freed = bo_va->table_freed;
>> +
> I think this should be
>
>     mem->table_freed = mem->table_freed || bo_va->table_freed;
>
> That way, on a multi-GPU system, mem->table_freed gets set to true if
> any GPU freed a page table. Then somewhere, this needs to be reset to false.
>
> However, that means, if one GPU frees a page table, all GPUs need to
> flush, which may be unnecessary. A better alternative would be to do the
> TLB flushing right here, only for the affected GPU, instead of returning
> an aggregated "table_freed" all the way back to
> kfd_ioctl_map_memory_to_gpu, which flushes all GPUs.

On second thought, in this function you don't know the PASID to flush,
and you don't know whether to flush a PASID (with HWS) or a VMID
(without HWS). Also, we don't wait here for the mapping to complete, so
this is not the right place to flush.

So you probably do need to return this all the way to
kfd_ioctl_map_memory_to_gpu after all. And the flush has to be after the
amdgpu_amdkfd_gpuvm_sync_memory. So flushing only the affected GPUs is
getting more difficult. If the flush is rare enough, it doesn't matter.

So I'm OK with your proposed solution of flushing all GPUs in
kfd_ioctl_map_memory_to_gpu. Just make sure you OR together the
table_freed flags from all the GPUs, and reset it to false before you
start mapping. Also, you probably don't need to store this permanently
in the kgd_mem structure. Just add an output parameter to
amdgpu_amdkfd_gpuvm_map_memory_to_gpu.

Regards,
  Felix


>
> Finally, bo_va->table_freed is only used once, after the
> amdgpu_vm_bo_update call returns. So there is no good reason to store
> this permanently in the bo_va structure. It would be better to just add
> an output parameter to amdgpu_vm_bo_update.
>
> Regards,
>   Felix
>
>
>>  	return amdgpu_sync_fence(sync, bo_va->last_pt_update);
>>  }
>>  
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>> index 36e7f088d4ee..0e0f27f779cd 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>> @@ -87,6 +87,7 @@ struct amdgpu_bo_va {
>>  	bool				cleared;
>>  
>>  	bool				is_xgmi;
>> +	bool				table_freed;
>>  };
>>  
>>  struct amdgpu_bo {
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> index 95b94c95adac..ff3eb8395017 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> @@ -1885,7 +1885,7 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev, struct amdgpu_bo_va *bo_va,
>>  						resv, mapping->start,
>>  						mapping->last, update_flags,
>>  						mapping->offset, mem,
>> -						pages_addr, last_update, NULL,
>> +						pages_addr, last_update, &bo_va->table_freed,
>>  						vram_base_offset);
>>  		if (r)
>>  			return r;
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> index 960913a35ee4..c45ccd1d03c0 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> @@ -1658,16 +1658,18 @@ static int kfd_ioctl_map_memory_to_gpu(struct file *filep,
>>  	}
>>  
>>  	/* Flush TLBs after waiting for the page table updates to complete */
>> -	for (i = 0; i < args->n_devices; i++) {
>> -		peer = kfd_device_by_id(devices_arr[i]);
>> -		if (WARN_ON_ONCE(!peer))
>> -			continue;
>> -		peer_pdd = kfd_get_process_device_data(peer, p);
>> -		if (WARN_ON_ONCE(!peer_pdd))
>> -			continue;
>> -		if (!amdgpu_read_lock(peer->ddev, true)) {
>> -			kfd_flush_tlb(peer_pdd);
>> -			amdgpu_read_unlock(peer->ddev);
>> +	if (((struct kgd_mem *)mem)->table_freed) {
>> +		for (i = 0; i < args->n_devices; i++) {
>> +			peer = kfd_device_by_id(devices_arr[i]);
>> +			if (WARN_ON_ONCE(!peer))
>> +				continue;
>> +			peer_pdd = kfd_get_process_device_data(peer, p);
>> +			if (WARN_ON_ONCE(!peer_pdd))
>> +				continue;
>> +			if (!amdgpu_read_lock(peer->ddev, true)) {
>> +				kfd_flush_tlb(peer_pdd, TLB_FLUSH_LEGACY);
>> +				amdgpu_read_unlock(peer->ddev);
>> +			}
>>  		}
>>  	}
>>  
>> @@ -1766,6 +1768,8 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep,
>>  			amdgpu_read_unlock(peer->ddev);
>>  			goto unmap_memory_from_gpu_failed;
>>  		}
>> +		((struct kgd_mem *)mem)->table_freed = false;
>> +		kfd_flush_tlb(peer_pdd, TLB_FLUSH_HEAVYWEIGHT);
>>  		amdgpu_read_unlock(peer->ddev);
>>  		args->n_success = i+1;
>>  	}
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> index c1bea1f7627b..a4920bc5cfbc 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> @@ -278,7 +278,7 @@ static int allocate_vmid(struct device_queue_manager *dqm,
>>  			qpd->vmid,
>>  			qpd->page_table_base);
>>  	/* invalidate the VM context after pasid and vmid mapping is set up */
>> -	kfd_flush_tlb(qpd_to_pdd(qpd));
>> +	kfd_flush_tlb(qpd_to_pdd(qpd), TLB_FLUSH_LEGACY);
>>  
>>  	if (dqm->dev->kfd2kgd->set_scratch_backing_va)
>>  		dqm->dev->kfd2kgd->set_scratch_backing_va(dqm->dev->kgd,
>> @@ -314,7 +314,7 @@ static void deallocate_vmid(struct device_queue_manager *dqm,
>>  		if (flush_texture_cache_nocpsch(q->device, qpd))
>>  			pr_err("Failed to flush TC\n");
>>  
>> -	kfd_flush_tlb(qpd_to_pdd(qpd));
>> +	kfd_flush_tlb(qpd_to_pdd(qpd), TLB_FLUSH_LEGACY);
>>  
>>  	/* Release the vmid mapping */
>>  	set_pasid_vmid_mapping(dqm, 0, qpd->vmid);
>> @@ -885,7 +885,7 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
>>  				dqm->dev->kgd,
>>  				qpd->vmid,
>>  				qpd->page_table_base);
>> -		kfd_flush_tlb(pdd);
>> +		kfd_flush_tlb(pdd, TLB_FLUSH_LEGACY);
>>  	}
>>  
>>  	/* Take a safe reference to the mm_struct, which may otherwise
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> index ecdd5e782b81..edce3ecf207d 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> @@ -1338,7 +1338,7 @@ void kfd_signal_reset_event(struct kfd_dev *dev);
>>  
>>  void kfd_signal_poison_consumed_event(struct kfd_dev *dev, u32 pasid);
>>  
>> -void kfd_flush_tlb(struct kfd_process_device *pdd);
>> +void kfd_flush_tlb(struct kfd_process_device *pdd, enum TLB_FLUSH_TYPE type);
>>  
>>  int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p);
>>  
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> index 4ab9da288f90..a03373743a3d 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> @@ -2161,7 +2161,7 @@ int kfd_reserved_mem_mmap(struct kfd_dev *dev, struct kfd_process *process,
>>  			       KFD_CWSR_TBA_TMA_SIZE, vma->vm_page_prot);
>>  }
>>  
>> -void kfd_flush_tlb(struct kfd_process_device *pdd)
>> +void kfd_flush_tlb(struct kfd_process_device *pdd, enum TLB_FLUSH_TYPE type)
>>  {
>>  	struct kfd_dev *dev = pdd->dev;
>>  
>> @@ -2174,7 +2174,7 @@ void kfd_flush_tlb(struct kfd_process_device *pdd)
>>  							pdd->qpd.vmid);
>>  	} else {
>>  		amdgpu_amdkfd_flush_gpu_tlb_pasid(dev->kgd,
>> -					pdd->process->pasid, TLB_FLUSH_LEGACY);
>> +					pdd->process->pasid, type);
>>  	}
>>  }
>>  


More information about the amd-gfx mailing list