[PATCH 2/5] drm/amdkfd: enable per process SMI event
Felix Kuehling
felix.kuehling at amd.com
Fri Jan 14 22:07:04 UTC 2022
Am 2022-01-14 um 3:38 p.m. schrieb Philip Yang:
> Process receive event log from same process by default. Add a flag to
> be able to receive event log from all processes, this requires super
> user permission.
>
> Event log with pid 0 send to all processes.
>
> Define new event log id, migration trigger, user queue eviction
> trigger, those new event log will be added in following patches.
>
> Update kfd_ioctl.h version.
>
> Signed-off-by: Philip Yang <Philip.Yang at amd.com>
> ---
> drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 28 +++++++++++++++------
> include/uapi/linux/kfd_ioctl.h | 27 ++++++++++++++++++++
> 2 files changed, 48 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> index 7023fa21a0a9..5818ea8ad4ce 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> @@ -37,6 +37,8 @@ struct kfd_smi_client {
> uint64_t events;
> struct kfd_dev *dev;
> spinlock_t lock;
> + pid_t pid;
> + bool suser;
> };
>
> #define MAX_KFIFO_SIZE 1024
> @@ -150,16 +152,26 @@ static int kfd_smi_ev_release(struct inode *inode, struct file *filep)
> return 0;
> }
>
> -static void add_event_to_kfifo(struct kfd_dev *dev, unsigned int smi_event,
> - char *event_msg, int len)
> +static bool kfd_smi_ev_enabled(pid_t pid, struct kfd_smi_client *client,
> + unsigned int smi_event)
> +{
> + if (pid &&
> + !(client->suser && client->events & KFD_SMI_EVENT_MASK_FROM_INDEX(KFD_SMI_EVENT_ALL_PROCESSES)) &&
> + client->pid != pid)
We cannot change the default behaviour for existing SMI events.
Otherwise be break existing user mode.
I'd make the new flag KFD_SMI_EVENT_SAME_PROCESS. Its default is 0,
which preserves the existing behaviour.
The SAME_PROCESS flag can apply to all events. But for the new profiler
events, we need to check both the flag and the ADMIN capability to allow
events from all processes, because they contain more sensitive
information than the existing SMI events.
> + return false;
> +
> + return client->events & KFD_SMI_EVENT_MASK_FROM_INDEX(smi_event);
You still need to access client->events with READ_ONCE here. The reason
is, that this function is not holding the lock. So we need to read the
variable atomically.
Regards,
Felix
> +}
> +
> +static void add_event_to_kfifo(pid_t pid, struct kfd_dev *dev,
> + unsigned int smi_event, char *event_msg, int len)
> {
> struct kfd_smi_client *client;
>
> rcu_read_lock();
>
> list_for_each_entry_rcu(client, &dev->smi_clients, list) {
> - if (!(READ_ONCE(client->events) &
> - KFD_SMI_EVENT_MASK_FROM_INDEX(smi_event)))
> + if (!kfd_smi_ev_enabled(pid, client, smi_event))
> continue;
> spin_lock(&client->lock);
> if (kfifo_avail(&client->fifo) >= len) {
> @@ -202,7 +214,7 @@ void kfd_smi_event_update_gpu_reset(struct kfd_dev *dev, bool post_reset)
> len = snprintf(fifo_in, sizeof(fifo_in), "%x %x\n", event,
> dev->reset_seq_num);
>
> - add_event_to_kfifo(dev, event, fifo_in, len);
> + add_event_to_kfifo(0, dev, event, fifo_in, len);
> }
>
> void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev,
> @@ -225,7 +237,7 @@ void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev,
> KFD_SMI_EVENT_THERMAL_THROTTLE, throttle_bitmask,
> amdgpu_dpm_get_thermal_throttling_counter(dev->adev));
>
> - add_event_to_kfifo(dev, KFD_SMI_EVENT_THERMAL_THROTTLE, fifo_in, len);
> + add_event_to_kfifo(0, dev, KFD_SMI_EVENT_THERMAL_THROTTLE, fifo_in, len);
> }
>
> void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid)
> @@ -250,7 +262,7 @@ void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid)
> len = snprintf(fifo_in, sizeof(fifo_in), "%x %x:%s\n", KFD_SMI_EVENT_VMFAULT,
> task_info.pid, task_info.task_name);
>
> - add_event_to_kfifo(dev, KFD_SMI_EVENT_VMFAULT, fifo_in, len);
> + add_event_to_kfifo(task_info.pid, dev, KFD_SMI_EVENT_VMFAULT, fifo_in, len);
> }
>
> int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd)
> @@ -282,6 +294,8 @@ int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd)
> spin_lock_init(&client->lock);
> client->events = 0;
> client->dev = dev;
> + client->pid = current->pid;
> + client->suser = capable(CAP_SYS_ADMIN);
>
> spin_lock(&dev->smi_lock);
> list_add_rcu(&client->list, &dev->smi_clients);
> diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
> index af96af174dc4..bbbae8ad9721 100644
> --- a/include/uapi/linux/kfd_ioctl.h
> +++ b/include/uapi/linux/kfd_ioctl.h
> @@ -32,6 +32,7 @@
> * - 1.4 - Indicate new SRAM EDC bit in device properties
> * - 1.5 - Add SVM API
> * - 1.6 - Query clear flags in SVM get_attr API
> + * - 1.7 - Add SMI profiler event log
> */
> #define KFD_IOCTL_MAJOR_VERSION 1
> #define KFD_IOCTL_MINOR_VERSION 6
> @@ -459,10 +460,36 @@ enum kfd_smi_event {
> KFD_SMI_EVENT_THERMAL_THROTTLE = 2,
> KFD_SMI_EVENT_GPU_PRE_RESET = 3,
> KFD_SMI_EVENT_GPU_POST_RESET = 4,
> + KFD_SMI_EVENT_MIGRATION = 5,
> + KFD_SMI_EVENT_PAGE_FAULT = 6,
> + KFD_SMI_EVENT_QUEUE_EVICTION = 7,
> + KFD_SMI_EVENT_QUEUE_EVICTION_RESTORE = 8,
> +
> + /*
> + * max event number, as a flag bit to get events from all processes,
> + * this requires super user permission, otherwise will not be able to
> + * receive events from any process. Without this flag to receive events
> + * from same process.
> + */
> + KFD_SMI_EVENT_ALL_PROCESSES = 64
> };
>
> #define KFD_SMI_EVENT_MASK_FROM_INDEX(i) (1ULL << ((i) - 1))
>
> +enum KFD_MIGRATION_TRIGGER {
> + MIGRATION_TRIGGER_PREFETCH = 1,
> + MIGRATION_TRIGGER_PAGEFAULT,
> + MIGRATION_TRIGGER_PAGEFAULT_CPU,
> + MIGRATION_TRIGGER_TTM_EVICTION
> +};
> +
> +enum KFD_USER_QUEUE_EVICTION_TRIGGER {
> + SVM_RANGE_EVICTION = 1,
> + USERPTR_EVICTION,
> + TTM_EVICTION,
> + SUSPEND_EVICTION
> +};
> +
> struct kfd_ioctl_smi_events_args {
> __u32 gpuid; /* to KFD */
> __u32 anon_fd; /* from KFD */
More information about the amd-gfx
mailing list