[PATCH 20/21] drm/xe/eudebug: Dynamically toggle debugger functionality
Manszewski, Christoph
christoph.manszewski at intel.com
Wed Aug 7 10:09:48 UTC 2024
Hi Matt,
On 28.07.2024 06:50, Matthew Brost wrote:
> On Fri, Jul 26, 2024 at 05:08:17PM +0300, Mika Kuoppala wrote:
>> From: Christoph Manszewski <christoph.manszewski at intel.com>
>>
>> Make it possible to dynamically enable/disable debugger functionality,
>> including the setting and unsetting of required hw register values via a
>> sysfs entry located at '/sys/class/drm/card<X>/device/enable_eudebug'.
>>
>> This entry uses 'kstrtobool' and as such it accepts inputs as documented
>> by this function, in particular '0' and '1'.
>>
>> 1) Adjust to xe_rtp graphics ranges changes.
>> 2) Fix pile placement. Wa 14019869343 (aka. 16021232320) was
>> added later in the pile, move disablement to appropriate commit.
>> 3) flush reset (Christoph)
>> 4) don't allow exec queue enable if feature is disabled (Dominik)
>> 5) bind to drm sysfs functions (Maciej)
>>
>> Signed-off-by: Christoph Manszewski <christoph.manszewski at intel.com>
>> Signed-off-by: Dominik Grzegorzek <dominik.grzegorzek at intel.com>
>> Signed-off-by: Mika Kuoppala <mika.kuoppala at linux.intel.com>
>> Signed-off-by: Maciej Patelczyk <maciej.patelczyk at intel.com>
>> ---
>> drivers/gpu/drm/xe/xe_device.c | 2 -
>> drivers/gpu/drm/xe/xe_device_types.h | 5 +
>> drivers/gpu/drm/xe/xe_eudebug.c | 174 +++++++++++++++++++++++----
>> drivers/gpu/drm/xe/xe_eudebug.h | 2 -
>> drivers/gpu/drm/xe/xe_exec_queue.c | 3 +
>> drivers/gpu/drm/xe/xe_hw_engine.c | 1 -
>> drivers/gpu/drm/xe/xe_reg_sr.c | 21 +++-
>> drivers/gpu/drm/xe/xe_reg_sr.h | 4 +-
>> drivers/gpu/drm/xe/xe_rtp.c | 2 +-
>> drivers/gpu/drm/xe/xe_rtp_types.h | 1 +
>> 10 files changed, 178 insertions(+), 37 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
>> index 268faa1800c4..30a2bf2a7e00 100644
>> --- a/drivers/gpu/drm/xe/xe_device.c
>> +++ b/drivers/gpu/drm/xe/xe_device.c
>> @@ -785,8 +785,6 @@ int xe_device_probe(struct xe_device *xe)
>>
>> xe_debugfs_register(xe);
>>
>> - xe_eudebug_init_late(xe);
>> -
>> xe_hwmon_register(xe);
>>
>> for_each_gt(gt, xe, id)
>> diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
>> index beb1f3c8dc63..17aedbba3130 100644
>> --- a/drivers/gpu/drm/xe/xe_device_types.h
>> +++ b/drivers/gpu/drm/xe/xe_device_types.h
>> @@ -517,6 +517,11 @@ struct xe_device {
>> /** @ordered_wq: used to discovery */
>> struct workqueue_struct *ordered_wq;
>>
>> + /** @enable_lock: protects the enable toggle */
>> + struct mutex enable_lock;
>> + /** @enable: is the debugging functionality enabled */
>> + bool enable;
>> +
>> /** @attention_scan: attention scan worker */
>> struct delayed_work attention_scan;
>> } eudebug;
>> diff --git a/drivers/gpu/drm/xe/xe_eudebug.c b/drivers/gpu/drm/xe/xe_eudebug.c
>> index a80ffb64df15..80066f787ec2 100644
>> --- a/drivers/gpu/drm/xe/xe_eudebug.c
>> +++ b/drivers/gpu/drm/xe/xe_eudebug.c
>> @@ -2000,9 +2000,6 @@ xe_eudebug_connect(struct xe_device *xe,
>>
>> param->version = DRM_XE_EUDEBUG_VERSION;
>>
>> - if (!xe->eudebug.available)
>> - return -EOPNOTSUPP;
>> -
>> d = kzalloc(sizeof(*d), GFP_KERNEL);
>> if (!d)
>> return -ENOMEM;
>> @@ -2064,70 +2061,199 @@ int xe_eudebug_connect_ioctl(struct drm_device *dev,
>> struct drm_xe_eudebug_connect * const param = data;
>> int ret = 0;
>>
>> + mutex_lock(&xe->eudebug.enable_lock);
>> +
>> + if (!xe->eudebug.enable) {
>> + mutex_unlock(&xe->eudebug.enable_lock);
>> + return -ENODEV;
>> + }
>> +
>> ret = xe_eudebug_connect(xe, param);
>>
>> + mutex_unlock(&xe->eudebug.enable_lock);
>> +
>> return ret;
>> }
>>
>> #undef XE_REG_MCR
>> #define XE_REG_MCR(...) XE_REG(__VA_ARGS__, .mcr = 1)
>>
>> -void xe_eudebug_init_hw_engine(struct xe_hw_engine *hwe)
>> +static void xe_eudebug_init_hw_engine(struct xe_hw_engine *hwe)
>> {
>> const struct xe_rtp_entry_sr eudebug_was[] = {
>> - { XE_RTP_NAME("GlobalDebugEnable"),
>> - XE_RTP_RULES(GRAPHICS_VERSION_RANGE(1200, 1210),
>> - ENGINE_CLASS(RENDER)),
>> - XE_RTP_ACTIONS(SET(CS_DEBUG_MODE2(RENDER_RING_BASE),
>> - GLOBAL_DEBUG_ENABLE))
>> - },
>> { XE_RTP_NAME("TdCtlDebugEnable"),
>> XE_RTP_RULES(GRAPHICS_VERSION_RANGE(1200, 3499),
>> FUNC(xe_rtp_match_first_render_or_compute)),
>> XE_RTP_ACTIONS(SET(TD_CTL,
>> TD_CTL_BREAKPOINT_ENABLE |
>> TD_CTL_FORCE_THREAD_BREAKPOINT_ENABLE |
>> - TD_CTL_FEH_AND_FEE_ENABLE))
>> + TD_CTL_FEH_AND_FEE_ENABLE,
>> + XE_RTP_ACTION_FLAG(OVERWRITE)))
>> },
>> { XE_RTP_NAME("TdCtlGlobalDebugEnable"),
>> XE_RTP_RULES(GRAPHICS_VERSION_RANGE(1250, XE_RTP_END_VERSION_UNDEFINED),
>> FUNC(xe_rtp_match_first_render_or_compute)),
>> - XE_RTP_ACTIONS(SET(TD_CTL, TD_CTL_GLOBAL_DEBUG_ENABLE))
>> + XE_RTP_ACTIONS(SET(TD_CTL, TD_CTL_GLOBAL_DEBUG_ENABLE,
>> + XE_RTP_ACTION_FLAG(OVERWRITE)))
>> },
>> { XE_RTP_NAME("18022722726"),
>> XE_RTP_RULES(GRAPHICS_VERSION_RANGE(1250, 1274),
>> FUNC(xe_rtp_match_first_render_or_compute)),
>> - XE_RTP_ACTIONS(SET(ROW_CHICKEN, STALL_DOP_GATING_DISABLE))
>> + XE_RTP_ACTIONS(SET(ROW_CHICKEN, STALL_DOP_GATING_DISABLE,
>> + XE_RTP_ACTION_FLAG(OVERWRITE)))
>> },
>> { XE_RTP_NAME("14015527279"),
>> XE_RTP_RULES(PLATFORM(PVC),
>> FUNC(xe_rtp_match_first_render_or_compute)),
>> - XE_RTP_ACTIONS(SET(ROW_CHICKEN2, XEHPC_DISABLE_BTB))
>> + XE_RTP_ACTIONS(SET(ROW_CHICKEN2, XEHPC_DISABLE_BTB,
>> + XE_RTP_ACTION_FLAG(OVERWRITE)))
>> },
>> {}
>> };
>> struct xe_rtp_process_ctx ctx = XE_RTP_PROCESS_CTX_INITIALIZER(hwe);
>> - struct xe_device *xe = gt_to_xe(hwe->gt);
>>
>> - if (xe->eudebug.available)
>> - xe_rtp_process_to_sr(&ctx, eudebug_was, &hwe->reg_sr);
>> + xe_rtp_process_to_sr(&ctx, eudebug_was, &hwe->reg_sr);
>> +}
>> +
>> +static void xe_eudebug_fini_hw_engine(struct xe_hw_engine *hwe)
>> +{
>> + const struct xe_rtp_entry_sr eudebug_was[] = {
>> + { XE_RTP_NAME("TdCtlDebugEnable"),
>> + XE_RTP_RULES(GRAPHICS_VERSION_RANGE(1200, 3499),
>> + FUNC(xe_rtp_match_first_render_or_compute)),
>> + XE_RTP_ACTIONS(CLR(TD_CTL,
>> + TD_CTL_BREAKPOINT_ENABLE |
>> + TD_CTL_FORCE_THREAD_BREAKPOINT_ENABLE |
>> + TD_CTL_FEH_AND_FEE_ENABLE,
>> + XE_RTP_ACTION_FLAG(OVERWRITE)))
>> + },
>> + { XE_RTP_NAME("TdCtlGlobalDebugEnable"),
>> + XE_RTP_RULES(GRAPHICS_VERSION_RANGE(1250, XE_RTP_END_VERSION_UNDEFINED),
>> + FUNC(xe_rtp_match_first_render_or_compute)),
>> + XE_RTP_ACTIONS(CLR(TD_CTL, TD_CTL_GLOBAL_DEBUG_ENABLE,
>> + XE_RTP_ACTION_FLAG(OVERWRITE)))
>> + },
>> + { XE_RTP_NAME("18022722726"),
>> + XE_RTP_RULES(GRAPHICS_VERSION_RANGE(1250, 1274),
>> + FUNC(xe_rtp_match_first_render_or_compute)),
>> + XE_RTP_ACTIONS(CLR(ROW_CHICKEN, STALL_DOP_GATING_DISABLE,
>> + XE_RTP_ACTION_FLAG(OVERWRITE)))
>> + },
>> + { XE_RTP_NAME("14015527279"),
>> + XE_RTP_RULES(PLATFORM(PVC),
>> + FUNC(xe_rtp_match_first_render_or_compute)),
>> + XE_RTP_ACTIONS(CLR(ROW_CHICKEN2, XEHPC_DISABLE_BTB,
>> + XE_RTP_ACTION_FLAG(OVERWRITE)))
>> + },
>> + {}
>> + };
>> + struct xe_rtp_process_ctx ctx = XE_RTP_PROCESS_CTX_INITIALIZER(hwe);
>> +
>> + xe_rtp_process_to_sr(&ctx, eudebug_was, &hwe->reg_sr);
>> +}
>> +
>> +static int xe_eudebug_enable(struct xe_device *xe, bool enable)
>> +{
>> + struct xe_gt *gt;
>> + int i;
>> + u8 id;
>> +
>> + if (!xe->eudebug.available)
>> + return -EOPNOTSUPP;
>> +
>> + /* XXX: TODO hold list lock? */
>> + mutex_lock(&xe->eudebug.enable_lock);
>> +
>> + if (!enable && !list_empty(&xe->eudebug.list)) {
>> + mutex_unlock(&xe->eudebug.enable_lock);
>> + return -EBUSY;
>> + }
>> +
>> + if (enable == xe->eudebug.enable) {
>> + mutex_unlock(&xe->eudebug.enable_lock);
>> + return 0;
>> + }
>> +
>> + for_each_gt(gt, xe, id) {
>> + for (i = 0; i < ARRAY_SIZE(gt->hw_engines); i++) {
>> + if (!(gt->info.engine_mask & BIT(i)))
>> + continue;
>> +
>> + if (enable)
>> + xe_eudebug_init_hw_engine(>->hw_engines[i]);
>> + else
>> + xe_eudebug_fini_hw_engine(>->hw_engines[i]);
>> + }
>> +
>> + xe_gt_reset_async(gt);
>
> What are you trying to accomplish with the reset? Reinit of some HW
> settings? Doing GT reset does have a side effect of killing exec queues
> with jobs active on the GPU which is not ideal. Please explain exactly
> what needs to be done, and maybe we can brainstorm something that won't
> kill things.
>
> Also exposing a GT reset via sysfs might be potential security issue
> too.
>
>> + flush_work(>->reset.worker);
>
> Flushing the reset worker here creates a lock dependency chain with
> xe->eudebug.enable_lock and all locks taken during a GT reset which is
> not ideal either. If we need to do a GT reset (or something else with a
> worker that takes locks) it's probably best to flush the worker after
> dropping xe->eudebug.enable_lock. This relates to my other comments on
> locking, we shouldn't create lock dependency chains unless we have to
> and the chains are well defined.
Well the problem I see with doing the flush outside the lock is that
other processes may try to write to the sysfs entry and the flush/hw
reinitialization may happen for an inconsistent state (with some engines
having reg_sr entries set to enable and others to disable debugging).
Of course that would happen for a brief period since after the final
write to the sysfs node, the state would be well defined. So it's a
question whether the above scenario is actually a problem? Should the
kernel allow for such things to happen?
Thanks,
Christoph
>
> Matt
>
>> + }
>> +
>> + if (enable)
>> + attention_scan_flush(xe);
>> + else
>> + attention_scan_cancel(xe);
>> +
>> + xe->eudebug.enable = enable;
>> + mutex_unlock(&xe->eudebug.enable_lock);
>> +
>> + return 0;
>> +}
>> +
>> +static ssize_t enable_eudebug_show(struct device *dev, struct device_attribute *attr, char *buf)
>> +{
>> + struct xe_device *xe = pdev_to_xe_device(to_pci_dev(dev));
>> +
>> + return sysfs_emit(buf, "%u\n", xe->eudebug.enable);
>> +}
>> +
>> +static ssize_t enable_eudebug_store(struct device *dev, struct device_attribute *attr,
>> + const char *buf, size_t count)
>> +{
>> + struct xe_device *xe = pdev_to_xe_device(to_pci_dev(dev));
>> + bool enable;
>> + int ret;
>> +
>> + ret = kstrtobool(buf, &enable);
>> + if (ret)
>> + return ret;
>> +
>> + ret = xe_eudebug_enable(xe, enable);
>> + if (ret)
>> + return ret;
>> +
>> + return count;
>> +}
>> +
>> +static DEVICE_ATTR_RW(enable_eudebug);
>> +
>> +static void xe_eudebug_sysfs_fini(void *arg)
>> +{
>> + struct xe_device *xe = arg;
>> +
>> + sysfs_remove_file(&xe->drm.dev->kobj, &dev_attr_enable_eudebug.attr);
>> }
>>
>> void xe_eudebug_init(struct xe_device *xe)
>> {
>> + struct device *dev = xe->drm.dev;
>> + int ret;
>> +
>> spin_lock_init(&xe->eudebug.lock);
>> INIT_LIST_HEAD(&xe->eudebug.list);
>> INIT_DELAYED_WORK(&xe->eudebug.attention_scan, attention_scan_fn);
>>
>> - xe->eudebug.available = true;
>> -}
>> + drmm_mutex_init(&xe->drm, &xe->eudebug.enable_lock);
>> + xe->eudebug.enable = false;
>>
>> -void xe_eudebug_init_late(struct xe_device *xe)
>> -{
>> - if (!xe->eudebug.available)
>> - return;
>>
>> - attention_scan_flush(xe);
>> + ret = sysfs_create_file(&xe->drm.dev->kobj, &dev_attr_enable_eudebug.attr);
>> + if (ret)
>> + drm_warn(&xe->drm, "eudebug sysfs init failed: %d, debugger unavailable\n", ret);
>> + else
>> + devm_add_action_or_reset(dev, xe_eudebug_sysfs_fini, xe);
>> +
>> + xe->eudebug.available = ret == 0;
>> }
>>
>> void xe_eudebug_fini(struct xe_device *xe)
>> diff --git a/drivers/gpu/drm/xe/xe_eudebug.h b/drivers/gpu/drm/xe/xe_eudebug.h
>> index 02c9ba56e752..2f66aa87a0f6 100644
>> --- a/drivers/gpu/drm/xe/xe_eudebug.h
>> +++ b/drivers/gpu/drm/xe/xe_eudebug.h
>> @@ -24,9 +24,7 @@ int xe_eudebug_connect_ioctl(struct drm_device *dev,
>> struct drm_file *file);
>>
>> void xe_eudebug_init(struct xe_device *xe);
>> -void xe_eudebug_init_late(struct xe_device *xe);
>> void xe_eudebug_fini(struct xe_device *xe);
>> -void xe_eudebug_init_hw_engine(struct xe_hw_engine *hwe);
>>
>> void xe_eudebug_file_open(struct xe_file *xef);
>> void xe_eudebug_file_close(struct xe_file *xef);
>> diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c
>> index bc2edade5e5b..b6fc65ab8aa9 100644
>> --- a/drivers/gpu/drm/xe/xe_exec_queue.c
>> +++ b/drivers/gpu/drm/xe/xe_exec_queue.c
>> @@ -360,6 +360,9 @@ static int exec_queue_set_eudebug(struct xe_device *xe, struct xe_exec_queue *q,
>> !(value & DRM_XE_EXEC_QUEUE_EUDEBUG_FLAG_ENABLE)))
>> return -EINVAL;
>>
>> + if (XE_IOCTL_DBG(xe, !xe->eudebug.enable))
>> + return -EPERM;
>> +
>> q->eudebug_flags = EXEC_QUEUE_EUDEBUG_FLAG_ENABLE;
>>
>> return 0;
>> diff --git a/drivers/gpu/drm/xe/xe_hw_engine.c b/drivers/gpu/drm/xe/xe_hw_engine.c
>> index 74813bc20787..0f90416be0ba 100644
>> --- a/drivers/gpu/drm/xe/xe_hw_engine.c
>> +++ b/drivers/gpu/drm/xe/xe_hw_engine.c
>> @@ -504,7 +504,6 @@ static void hw_engine_init_early(struct xe_gt *gt, struct xe_hw_engine *hwe,
>> xe_tuning_process_engine(hwe);
>> xe_wa_process_engine(hwe);
>> hw_engine_setup_default_state(hwe);
>> - xe_eudebug_init_hw_engine(hwe);
>>
>> xe_reg_sr_init(&hwe->reg_whitelist, hwe->name, gt_to_xe(gt));
>> xe_reg_whitelist_process_engine(hwe);
>> diff --git a/drivers/gpu/drm/xe/xe_reg_sr.c b/drivers/gpu/drm/xe/xe_reg_sr.c
>> index 440ac572f6e5..a7671722a84e 100644
>> --- a/drivers/gpu/drm/xe/xe_reg_sr.c
>> +++ b/drivers/gpu/drm/xe/xe_reg_sr.c
>> @@ -92,22 +92,31 @@ static void reg_sr_inc_error(struct xe_reg_sr *sr)
>>
>> int xe_reg_sr_add(struct xe_reg_sr *sr,
>> const struct xe_reg_sr_entry *e,
>> - struct xe_gt *gt)
>> + struct xe_gt *gt,
>> + bool overwrite)
>> {
>> unsigned long idx = e->reg.addr;
>> struct xe_reg_sr_entry *pentry = xa_load(&sr->xa, idx);
>> int ret;
>>
>> if (pentry) {
>> - if (!compatible_entries(pentry, e)) {
>> + if (overwrite && e->set_bits) {
>> + pentry->clr_bits |= e->clr_bits;
>> + pentry->set_bits |= e->set_bits;
>> + pentry->read_mask |= e->read_mask;
>> + } else if (overwrite && !e->set_bits) {
>> + pentry->clr_bits |= e->clr_bits;
>> + pentry->set_bits &= ~e->clr_bits;
>> + pentry->read_mask |= e->read_mask;
>> + } else if (!compatible_entries(pentry, e)) {
>> ret = -EINVAL;
>> goto fail;
>> + } else {
>> + pentry->clr_bits |= e->clr_bits;
>> + pentry->set_bits |= e->set_bits;
>> + pentry->read_mask |= e->read_mask;
>> }
>>
>> - pentry->clr_bits |= e->clr_bits;
>> - pentry->set_bits |= e->set_bits;
>> - pentry->read_mask |= e->read_mask;
>> -
>> return 0;
>> }
>>
>> diff --git a/drivers/gpu/drm/xe/xe_reg_sr.h b/drivers/gpu/drm/xe/xe_reg_sr.h
>> index 51fbba423e27..d67fafdcd847 100644
>> --- a/drivers/gpu/drm/xe/xe_reg_sr.h
>> +++ b/drivers/gpu/drm/xe/xe_reg_sr.h
>> @@ -6,6 +6,8 @@
>> #ifndef _XE_REG_SR_
>> #define _XE_REG_SR_
>>
>> +#include <linux/types.h>
>> +
>> /*
>> * Reg save/restore bookkeeping
>> */
>> @@ -21,7 +23,7 @@ int xe_reg_sr_init(struct xe_reg_sr *sr, const char *name, struct xe_device *xe)
>> void xe_reg_sr_dump(struct xe_reg_sr *sr, struct drm_printer *p);
>>
>> int xe_reg_sr_add(struct xe_reg_sr *sr, const struct xe_reg_sr_entry *e,
>> - struct xe_gt *gt);
>> + struct xe_gt *gt, bool overwrite);
>> void xe_reg_sr_apply_mmio(struct xe_reg_sr *sr, struct xe_gt *gt);
>> void xe_reg_sr_apply_whitelist(struct xe_hw_engine *hwe);
>>
>> diff --git a/drivers/gpu/drm/xe/xe_rtp.c b/drivers/gpu/drm/xe/xe_rtp.c
>> index 02e28274282f..5643bcde52bd 100644
>> --- a/drivers/gpu/drm/xe/xe_rtp.c
>> +++ b/drivers/gpu/drm/xe/xe_rtp.c
>> @@ -153,7 +153,7 @@ static void rtp_add_sr_entry(const struct xe_rtp_action *action,
>> };
>>
>> sr_entry.reg.addr += mmio_base;
>> - xe_reg_sr_add(sr, &sr_entry, gt);
>> + xe_reg_sr_add(sr, &sr_entry, gt, action->flags & XE_RTP_ACTION_FLAG_OVERWRITE);
>> }
>>
>> static bool rtp_process_one_sr(const struct xe_rtp_entry_sr *entry,
>> diff --git a/drivers/gpu/drm/xe/xe_rtp_types.h b/drivers/gpu/drm/xe/xe_rtp_types.h
>> index 1b76b947c706..20d228067da3 100644
>> --- a/drivers/gpu/drm/xe/xe_rtp_types.h
>> +++ b/drivers/gpu/drm/xe/xe_rtp_types.h
>> @@ -33,6 +33,7 @@ struct xe_rtp_action {
>> /** @read_mask: mask for bits to consider when reading value back */
>> u32 read_mask;
>> #define XE_RTP_ACTION_FLAG_ENGINE_BASE BIT(0)
>> +#define XE_RTP_ACTION_FLAG_OVERWRITE BIT(1)
>> /** @flags: flags to apply on rule evaluation or action */
>> u8 flags;
>> };
>> --
>> 2.34.1
>>
More information about the Intel-xe
mailing list