[PATCH] drm/amdkfd: Add thermal throttling SMI event
Felix Kuehling
felix.kuehling at amd.com
Tue Jul 21 21:53:05 UTC 2020
Am 2020-07-21 um 5:01 p.m. schrieb Mukul Joshi:
> Add support for reporting thermal throttling events through SMI.
> Also, add a counter to count the number of throttling interrupts
> observed and report the count in the SMI event message.
>
> Signed-off-by: Mukul Joshi <mukul.joshi at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 4 ++
> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 1 +
> drivers/gpu/drm/amd/amdkfd/kfd_device.c | 7 ++
> drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 70 ++++++++++++++-----
> drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h | 2 +
> drivers/gpu/drm/amd/powerplay/amdgpu_smu.c | 1 +
> drivers/gpu/drm/amd/powerplay/arcturus_ppt.c | 1 +
> .../gpu/drm/amd/powerplay/inc/amdgpu_smu.h | 1 +
> drivers/gpu/drm/amd/powerplay/smu_v11_0.c | 6 ++
> include/uapi/linux/kfd_ioctl.h | 1 +
> 10 files changed, 77 insertions(+), 17 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> index 1b865fed74ca..19e4658756d8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> @@ -755,4 +755,8 @@ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry)
> void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd)
> {
> }
> +
> +void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t throttle_bitmask)
> +{
> +}
> #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index 3f2b695cf19e..e8b0258aae24 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -269,5 +269,6 @@ int kgd2kfd_resume_mm(struct mm_struct *mm);
> int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm,
> struct dma_fence *fence);
> void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd);
> +void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t throttle_bitmask);
>
> #endif /* AMDGPU_AMDKFD_H_INCLUDED */
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index 4bfedaab183f..d5e790f046b4 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -29,6 +29,7 @@
> #include "cwsr_trap_handler.h"
> #include "kfd_iommu.h"
> #include "amdgpu_amdkfd.h"
> +#include "kfd_smi_events.h"
>
> #define MQD_SIZE_ALIGNED 768
>
> @@ -1245,6 +1246,12 @@ void kfd_dec_compute_active(struct kfd_dev *kfd)
> WARN_ONCE(count < 0, "Compute profile ref. count error");
> }
>
> +void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t throttle_bitmask)
> +{
> + if (kfd)
> + kfd_smi_event_update_thermal_throttling(kfd, throttle_bitmask);
> +}
> +
> #if defined(CONFIG_DEBUG_FS)
>
> /* This function will send a package to HIQ to hang the HWS
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> index 7b348bf9df21..247538bccba2 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> @@ -24,6 +24,7 @@
> #include <linux/wait.h>
> #include <linux/anon_inodes.h>
> #include <uapi/linux/kfd_ioctl.h>
> +#include "amdgpu.h"
> #include "amdgpu_vm.h"
> #include "kfd_priv.h"
> #include "kfd_smi_events.h"
> @@ -148,6 +149,56 @@ static int kfd_smi_ev_release(struct inode *inode, struct file *filep)
> return 0;
> }
>
> +static int add_event_to_kfifo(struct kfd_dev *dev, long long smi_event,
> + char *event_msg, int len)
> +{
> + struct kfd_smi_client *client;
> + int ret = 0;
> +
> + rcu_read_lock();
> +
> + list_for_each_entry_rcu(client, &dev->smi_clients, list) {
> + if (!(READ_ONCE(client->events) & smi_event))
> + continue;
> + spin_lock(&client->lock);
> + if (kfifo_avail(&client->fifo) >= len) {
> + kfifo_in(&client->fifo, event_msg, len);
> + wake_up_all(&client->wait_queue);
> + }
> + else
"else" should be on the same line as the closing brace. Also, if one
if-else branch uses braces, all branches should for readability and
maintainability.
> + ret = -ENOSPC;
> + spin_unlock(&client->lock);
> + }
> +
> + rcu_read_unlock();
> +
> + return ret;
> +}
> +
> +void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev,
> + uint32_t throttle_bitmask)
> +{
> + struct amdgpu_device *adev = (struct amdgpu_device *)dev->kgd;
> + /*
> + * ThermalThrottle msg = gpu_id(4):thermal_interrupt_count(4):
> + * throttle_bitmask(4)
> + * 16 bytes event + 1 byte space + 4 bytes gpu_id + 1 byte : +
> + * 4 byte thermal_interupt_counter + 1 byte : +
The interrupt counter can be more than 4 bytes. It's a 32bit counter (or
maybe 64bit, see below).
> + * 4 byte throttle_bitmask + 1 byte \n = 32
> + */
> + char fifo_in[32];
> + int len;
> +
> + if (list_empty(&dev->smi_clients))
> + return;
> +
> + len = snprintf(fifo_in, 32, "%x %d:%d:%x\n", KFD_SMI_EVENT_THERMAL,
> + dev->id, READ_ONCE(adev->smu.throttle_int_counter), throttle_bitmask);
I'd recommend using hexadecimal for everything. It's more compact and
avoids mistakes parsing it in user mode.
> +
> + if (add_event_to_kfifo(dev, KFD_SMI_EVENT_THERMAL, fifo_in, len))
> + pr_debug("smi_event(throttle): no space left\n");
I'd move the message into add_event_to_kfifo to avoid duplicating it in
multiple functions and to avoid guessing the cause of the problem (no
space left).
> +}
> +
> void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid)
> {
> struct amdgpu_device *adev = (struct amdgpu_device *)dev->kgd;
> @@ -156,7 +207,6 @@ void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid)
> /* 16 bytes event + 1 byte space + 25 bytes msg + 1 byte \n = 43
> */
> char fifo_in[43];
> - struct kfd_smi_client *client;
> int len;
>
> if (list_empty(&dev->smi_clients))
> @@ -171,22 +221,8 @@ void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid)
> len = snprintf(fifo_in, 43, "%x %x:%s\n", KFD_SMI_EVENT_VMFAULT,
> task_info.pid, task_info.task_name);
>
> - rcu_read_lock();
> -
> - list_for_each_entry_rcu(client, &dev->smi_clients, list) {
> - if (!(READ_ONCE(client->events) & KFD_SMI_EVENT_VMFAULT))
> - continue;
> - spin_lock(&client->lock);
> - if (kfifo_avail(&client->fifo) >= len) {
> - kfifo_in(&client->fifo, fifo_in, len);
> - wake_up_all(&client->wait_queue);
> - }
> - else
> - pr_debug("smi_event(vmfault): no space left\n");
> - spin_unlock(&client->lock);
> - }
> -
> - rcu_read_unlock();
> + if (add_event_to_kfifo(dev, KFD_SMI_EVENT_VMFAULT, fifo_in, len))
> + pr_debug("smi_event(vmfault): no space left\n");
> }
>
> int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
> index a9cb218fef96..15537b2cccb5 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
> @@ -25,5 +25,7 @@
>
> int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd);
> void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid);
> +void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev,
> + uint32_t throttle_bitmask);
>
> #endif
> diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
> index b197dcaed064..26f4f28848c9 100644
> --- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
> +++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
> @@ -639,6 +639,7 @@ static int smu_sw_init(void *handle)
> mutex_init(&smu->message_lock);
>
> INIT_WORK(&smu->throttling_logging_work, smu_throttling_logging_work_fn);
> + smu->throttle_int_counter = 0;
> smu->watermarks_bitmap = 0;
> smu->power_profile_mode = PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT;
> smu->default_power_profile_mode = PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT;
> diff --git a/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c b/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c
> index 9b68760dd35b..eb3a57593f69 100644
> --- a/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c
> +++ b/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c
> @@ -2251,6 +2251,7 @@ static void arcturus_log_thermal_throttling_event(struct smu_context *smu)
>
> dev_warn(adev->dev, "WARN: GPU thermal throttling temperature reached, expect performance decrease. %s.\n",
> log_buf);
> + kgd2kfd_smi_event_throttle(smu->adev->kfd.dev, throttler_status);
> }
>
> static const struct pptable_funcs arcturus_ppt_funcs = {
> diff --git a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
> index 896b443f1ce8..0d8c70eca39d 100644
> --- a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
> +++ b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
> @@ -446,6 +446,7 @@ struct smu_context
> bool dc_controlled_by_gpio;
>
> struct work_struct throttling_logging_work;
> + uint32_t throttle_int_counter; // Throttle interrupt counter
The comment adds no useful information. The variable name is already
quite descriptive.
> };
>
> struct i2c_adapter;
> diff --git a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
> index fd82402065e6..67653dd7862d 100644
> --- a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
> +++ b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
> @@ -1311,6 +1311,12 @@ static int smu_v11_0_irq_process(struct amdgpu_device *adev,
> smu_v11_0_ack_ac_dc_interrupt(&adev->smu);
> break;
> case 0x7:
> + /*
> + * Increment the throttle interrupt counter
> + */
> + WRITE_ONCE(smu->throttle_int_counter,
> + smu->throttle_int_counter+1);
> +
This is not atomic. Use atomic_t or atomic64_t if 32-bit may overflow in
a reasonable time frame. Then you'd use atomic_set to initialize it,
atomic_read to read it and atomic_inc_return to increment it.
> if (!atomic_read(&adev->throttling_logging_enabled))
> return 0;
>
> diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
> index f738c3b53f4e..1e083e20c23d 100644
> --- a/include/uapi/linux/kfd_ioctl.h
> +++ b/include/uapi/linux/kfd_ioctl.h
> @@ -451,6 +451,7 @@ struct kfd_ioctl_import_dmabuf_args {
> */
> /* Event type (defined by bitmask) */
> #define KFD_SMI_EVENT_VMFAULT 0x0000000000000001
> +#define KFD_SMI_EVENT_THERMAL 0x0000000000000002
Maybe call it THERMAL_THROTTLE explicitly. There may be other types of
thermal events in the future (e.g. exceeding a critical temperature or
thermal shutdown).
Regards,
Felix
>
> struct kfd_ioctl_smi_events_args {
> __u32 gpuid; /* to KFD */
More information about the amd-gfx
mailing list