[PATCH v2] drm/amdkfd: Add thermal throttling SMI event

Amber Lin Amber.Lin at amd.com
Thu Jul 23 21:57:57 UTC 2020



On 2020-07-23 5:41 p.m., Joshi, Mukul wrote:
> [AMD Official Use Only - Internal Distribution Only]
>
>
>
> -----Original Message-----
> From: Lin, Amber <Amber.Lin at amd.com>
> Sent: Thursday, July 23, 2020 5:27 PM
> To: Joshi, Mukul <Mukul.Joshi at amd.com>; amd-gfx at lists.freedesktop.org
> Cc: Kuehling, Felix <Felix.Kuehling at amd.com>
> Subject: Re: [PATCH v2] drm/amdkfd: Add thermal throttling SMI event
>
>
>
> On 2020-07-22 12:08 p.m., Mukul Joshi wrote:
>> Add support for reporting thermal throttling events through SMI.
>> Also, add a counter to count the number of throttling interrupts
>> observed and report the count in the SMI event message.
>>
>> Signed-off-by: Mukul Joshi <mukul.joshi at amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c    |  4 ++
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |  1 +
>>    drivers/gpu/drm/amd/amdkfd/kfd_device.c       |  7 ++
>>    drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c   | 68 ++++++++++++++-----
>>    drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h   |  2 +
>>    drivers/gpu/drm/amd/powerplay/amdgpu_smu.c    |  1 +
>>    drivers/gpu/drm/amd/powerplay/arcturus_ppt.c  |  1 +
>>    .../gpu/drm/amd/powerplay/inc/amdgpu_smu.h    |  1 +
>>    drivers/gpu/drm/amd/powerplay/smu_v11_0.c     |  5 ++
>>    include/uapi/linux/kfd_ioctl.h                |  3 +-
>>    10 files changed, 75 insertions(+), 18 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> index 1b865fed74ca..19e4658756d8 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> @@ -755,4 +755,8 @@ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry)
>>    void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd)
>>    {
>>    }
>> +
>> +void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t
>> +throttle_bitmask) { }
>>    #endif
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> index 3f2b695cf19e..e8b0258aae24 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> @@ -269,5 +269,6 @@ int kgd2kfd_resume_mm(struct mm_struct *mm);
>>    int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm,
>>    					       struct dma_fence *fence);
>>    void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd);
>> +void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t
>> +throttle_bitmask);
>>    
>>    #endif /* AMDGPU_AMDKFD_H_INCLUDED */ diff --git
>> a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>> index 4bfedaab183f..d5e790f046b4 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>> @@ -29,6 +29,7 @@
>>    #include "cwsr_trap_handler.h"
>>    #include "kfd_iommu.h"
>>    #include "amdgpu_amdkfd.h"
>> +#include "kfd_smi_events.h"
>>    
>>    #define MQD_SIZE_ALIGNED 768
>>    
>> @@ -1245,6 +1246,12 @@ void kfd_dec_compute_active(struct kfd_dev *kfd)
>>    	WARN_ONCE(count < 0, "Compute profile ref. count error");
>>    }
>>    
>> +void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t
>> +throttle_bitmask) {
>> +	if (kfd)
>> +		kfd_smi_event_update_thermal_throttling(kfd, throttle_bitmask); }
>> +
>>    #if defined(CONFIG_DEBUG_FS)
>>    
>>    /* This function will send a package to HIQ to hang the HWS diff
>> --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>> index 7b348bf9df21..00c90b47155b 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>> @@ -24,6 +24,7 @@
>>    #include <linux/wait.h>
>>    #include <linux/anon_inodes.h>
>>    #include <uapi/linux/kfd_ioctl.h>
>> +#include "amdgpu.h"
>>    #include "amdgpu_vm.h"
>>    #include "kfd_priv.h"
>>    #include "kfd_smi_events.h"
>> @@ -148,6 +149,55 @@ static int kfd_smi_ev_release(struct inode *inode, struct file *filep)
>>    	return 0;
>>    }
>>    
>> +static void add_event_to_kfifo(struct kfd_dev *dev, unsigned long long smi_event,
>> +			      char *event_msg, int len)
>> +{
>> +	struct kfd_smi_client *client;
>> +
>> +	rcu_read_lock();
>> +
>> +	list_for_each_entry_rcu(client, &dev->smi_clients, list) {
>> +		if (!(READ_ONCE(client->events) & smi_event))
>> +			continue;
>> +		spin_lock(&client->lock);
>> +		if (kfifo_avail(&client->fifo) >= len) {
>> +			kfifo_in(&client->fifo, event_msg, len);
>> +			wake_up_all(&client->wait_queue);
>> +		} else {
>> +			pr_debug("smi_event(EventID: %llu): no space left\n",
>> +					smi_event);
>> +		}
>> +		spin_unlock(&client->lock);
>> +	}
>> +
>> +	rcu_read_unlock();
>> +}
>> +
>> +void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev,
>> +					     uint32_t throttle_bitmask)
>> +{
>> +	struct amdgpu_device *adev = (struct amdgpu_device *)dev->kgd;
>> +	/*
>> +	 * ThermalThrottle msg = gpu_id(4):throttle_bitmask(4):
> gpu_id is not needed. The user calls ioctl with GPU specified and KFD returns an anonymous fd. Read from this anon_fd already identify the GPU.
>
> I agree with you. But I would prefer to keep gpu_id in the SMI event message. The benefit is it explicitly identifies the GPU sending the message. This way user-space doesn't need to maintain an internal mapping of anon_fd to gpu_id.
We want to keep the message as short as possible so the 1K kfifo can 
contain more events in case multiple events rush in and the user doesn't 
consume fast enough. For the same reason vmfault event doesn't log 
gpu_id. On the user side, the user shouldn't need to do the 
anon_fd/gpu_id convert. That information should be already in their 
structure. Instead of reading/parsing the message, the user reads from 
their structure.
>
> Regards,
> Mukul
>
>> +	 * 			 thermal_interrupt_count(8):
>> +	 * 16 bytes event + 1 byte space + 4 bytes gpu_id + 1 byte : +
>> +	 * 4 byte throttle_bitmask + 1 byte : +
>> +	 * 8 byte thermal_interupt_counter + 1 byte \n = 36
>> +	 */
>> +	char fifo_in[36];
>> +	int len;
>> +
>> +	if (list_empty(&dev->smi_clients))
>> +		return;
>> +
>> +	len = snprintf(fifo_in, 36, "%x %x:%x:%llx\n",
>> +		       KFD_SMI_EVENT_THERMAL_THROTTLE,
>> +		       dev->id, throttle_bitmask,
>> +		       atomic64_read(&adev->smu.throttle_int_counter));
>> +
>> +	add_event_to_kfifo(dev, KFD_SMI_EVENT_THERMAL_THROTTLE, fifo_in,
>> +len); }
>> +
>>    void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid)
>>    {
>>    	struct amdgpu_device *adev = (struct amdgpu_device *)dev->kgd; @@
>> -156,7 +206,6 @@ void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid)
>>    	/* 16 bytes event + 1 byte space + 25 bytes msg + 1 byte \n = 43
>>    	 */
>>    	char fifo_in[43];
>> -	struct kfd_smi_client *client;
>>    	int len;
>>    
>>    	if (list_empty(&dev->smi_clients))
>> @@ -171,22 +220,7 @@ void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid)
>>    	len = snprintf(fifo_in, 43, "%x %x:%s\n", KFD_SMI_EVENT_VMFAULT,
>>    		task_info.pid, task_info.task_name);
>>    
>> -	rcu_read_lock();
>> -
>> -	list_for_each_entry_rcu(client, &dev->smi_clients, list) {
>> -		if (!(READ_ONCE(client->events) & KFD_SMI_EVENT_VMFAULT))
>> -			continue;
>> -		spin_lock(&client->lock);
>> -		if (kfifo_avail(&client->fifo) >= len) {
>> -			kfifo_in(&client->fifo, fifo_in, len);
>> -			wake_up_all(&client->wait_queue);
>> -		}
>> -		else
>> -			pr_debug("smi_event(vmfault): no space left\n");
>> -		spin_unlock(&client->lock);
>> -	}
>> -
>> -	rcu_read_unlock();
>> +	add_event_to_kfifo(dev, KFD_SMI_EVENT_VMFAULT, fifo_in, len);
>>    }
>>    
>>    int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd) diff --git
>> a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
>> b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
>> index a9cb218fef96..15537b2cccb5 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
>> @@ -25,5 +25,7 @@
>>    
>>    int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd);
>>    void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t
>> pasid);
>> +void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev,
>> +					     uint32_t throttle_bitmask);
>>    
>>    #endif
>> diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
>> b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
>> index b197dcaed064..52b52cbb90e2 100644
>> --- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
>> +++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
>> @@ -639,6 +639,7 @@ static int smu_sw_init(void *handle)
>>    	mutex_init(&smu->message_lock);
>>    
>>    	INIT_WORK(&smu->throttling_logging_work,
>> smu_throttling_logging_work_fn);
>> +	atomic64_set(&smu->throttle_int_counter, 0);
>>    	smu->watermarks_bitmap = 0;
>>    	smu->power_profile_mode = PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT;
>>    	smu->default_power_profile_mode =
>> PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT;
>> diff --git a/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c
>> b/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c
>> index 9b68760dd35b..eb3a57593f69 100644
>> --- a/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c
>> +++ b/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c
>> @@ -2251,6 +2251,7 @@ static void
>> arcturus_log_thermal_throttling_event(struct smu_context *smu)
>>    
>>    	dev_warn(adev->dev, "WARN: GPU thermal throttling temperature reached, expect performance decrease. %s.\n",
>>    			log_buf);
>> +	kgd2kfd_smi_event_throttle(smu->adev->kfd.dev, throttler_status);
>>    }
>>    
>>    static const struct pptable_funcs arcturus_ppt_funcs = { diff --git
>> a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
>> b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
>> index 896b443f1ce8..18ba421c59bd 100644
>> --- a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
>> +++ b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
>> @@ -446,6 +446,7 @@ struct smu_context
>>    	bool dc_controlled_by_gpio;
>>    
>>    	struct work_struct throttling_logging_work;
>> +	atomic64_t throttle_int_counter;
>>    };
>>    
>>    struct i2c_adapter;
>> diff --git a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
>> b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
>> index fd82402065e6..a9453ec01619 100644
>> --- a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
>> +++ b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
>> @@ -1311,6 +1311,11 @@ static int smu_v11_0_irq_process(struct amdgpu_device *adev,
>>    				smu_v11_0_ack_ac_dc_interrupt(&adev->smu);
>>    				break;
>>    			case 0x7:
>> +				/*
>> +				 * Increment the throttle interrupt counter
>> +				 */
>> +				atomic64_inc(&smu->throttle_int_counter);
>> +
>>    				if (!atomic_read(&adev->throttling_logging_enabled))
>>    					return 0;
>>    
>> diff --git a/include/uapi/linux/kfd_ioctl.h
>> b/include/uapi/linux/kfd_ioctl.h index f738c3b53f4e..df6c7a43aadc
>> 100644
>> --- a/include/uapi/linux/kfd_ioctl.h
>> +++ b/include/uapi/linux/kfd_ioctl.h
>> @@ -450,7 +450,8 @@ struct kfd_ioctl_import_dmabuf_args {
>>     * KFD SMI(System Management Interface) events
>>     */
>>    /* Event type (defined by bitmask) */
>> -#define KFD_SMI_EVENT_VMFAULT     0x0000000000000001
>> +#define KFD_SMI_EVENT_VMFAULT			0x0000000000000001
>> +#define KFD_SMI_EVENT_THERMAL_THROTTLE		0x0000000000000002
>>    
>>    struct kfd_ioctl_smi_events_args {
>>    	__u32 gpuid;	/* to KFD */



More information about the amd-gfx mailing list