[PATCH 6/6] drm/xe: Introduce the busted_mode debugfs
Rodrigo Vivi
rodrigo.vivi at intel.com
Mon Mar 18 20:14:04 UTC 2024
On Mon, Mar 18, 2024 at 02:31:31PM -0500, Lucas De Marchi wrote:
> On Fri, Mar 15, 2024 at 10:01:08AM -0400, Rodrigo Vivi wrote:
> > So, the busted mode can be selected at runtime with the device
> > granularity, rather than a module policy.
>
> did you mean to squash this in the previous commit?
Doh! Yes, that was the intention, but I forgot to mark it as a fixup.
>
> for the entire series, it seems it's going the right direction. It would
> be good to have some more testing with it before merging though. I asked
> SV folks to give it a try. I also saw some typos I forgot to comment on
> so I will have to go through the patches again.
Yep, I want their ack on that as well.
>
> Another question is about naming since some people didn't like "busted".
> Options:
>
> 1) keep busted
> 2) zombie
> 3) back to wedged
> 4) dead
> 5) blocked
> 6) disabled
> 7) unusable
> 8) unreliable
> 9) misbehaving
>
> Did I miss any suggestion? Well.... the order above is just _my_
> preference, but I'm totally fine if other people disagree and we decide
> something else.
>
> Cc'ing some people who may chime in with their preference.
Well, at this point anything works for me. Just let me know the most popular
one and I will change the patches.
>
> Lucas De Marchi
>
> >
> > Cc: Lucas De Marchi <lucas.demarchi at intel.com>
> > Cc: Alan Previn <alan.previn.teres.alexis at intel.com>
> > Signed-off-by: Rodrigo Vivi <rodrigo.vivi at intel.com>
> > ---
> > drivers/gpu/drm/xe/xe_debugfs.c | 12 +++++++++
> > drivers/gpu/drm/xe/xe_guc_ads.c | 46 +++++++++++++++++++++++++++++++++
> > drivers/gpu/drm/xe/xe_guc_ads.h | 1 +
> > 3 files changed, 59 insertions(+)
> >
> > diff --git a/drivers/gpu/drm/xe/xe_debugfs.c b/drivers/gpu/drm/xe/xe_debugfs.c
> > index 175ba306c3eb..0cd20862d32e 100644
> > --- a/drivers/gpu/drm/xe/xe_debugfs.c
> > +++ b/drivers/gpu/drm/xe/xe_debugfs.c
> > @@ -12,6 +12,7 @@
> > #include "xe_bo.h"
> > #include "xe_device.h"
> > #include "xe_gt_debugfs.h"
> > +#include "xe_guc_ads.h"
> > #include "xe_pm.h"
> > #include "xe_step.h"
> >
> > @@ -124,8 +125,10 @@ static ssize_t busted_mode_set(struct file *f, const char __user *ubuf,
> > size_t size, loff_t *pos)
> > {
> > struct xe_device *xe = file_inode(f)->i_private;
> > + struct xe_gt *gt;
> > u32 busted_mode;
> > ssize_t ret;
> > + u8 id;
> >
> > ret = kstrtouint_from_user(ubuf, size, 0, &busted_mode);
> > if (ret)
> > @@ -136,6 +139,15 @@ static ssize_t busted_mode_set(struct file *f, const char __user *ubuf,
> >
> > mutex_lock(&xe->busted.lock);
> > xe->busted.mode = busted_mode;
> > + if (busted_mode == 2) {
> > + for_each_gt(gt, xe, id) {
> > + ret = xe_guc_ads_scheduler_policy_disable_reset(&gt->uc.guc.ads);
> > + if (ret) {
> > + drm_err(&xe->drm, "Failed to update GuC ADS scheduler policy. GPU might still reset even on the busted_mode=2\n");
> > + break;
> > + }
> > + }
> > + }
> > mutex_unlock(&xe->busted.lock);
> >
> > return size;
> > diff --git a/drivers/gpu/drm/xe/xe_guc_ads.c b/drivers/gpu/drm/xe/xe_guc_ads.c
> > index 43f0a88bbe8a..5dccdbe595bf 100644
> > --- a/drivers/gpu/drm/xe/xe_guc_ads.c
> > +++ b/drivers/gpu/drm/xe/xe_guc_ads.c
> > @@ -7,6 +7,7 @@
> >
> > #include <drm/drm_managed.h>
> >
> > +#include "abi/guc_actions_abi.h"
> > #include "regs/xe_engine_regs.h"
> > #include "regs/xe_gt_regs.h"
> > #include "regs/xe_guc_regs.h"
> > @@ -14,6 +15,7 @@
> > #include "xe_gt.h"
> > #include "xe_gt_ccs_mode.h"
> > #include "xe_guc.h"
> > +#include "xe_guc_ct.h"
> > #include "xe_hw_engine.h"
> > #include "xe_lrc.h"
> > #include "xe_map.h"
> > @@ -679,3 +681,47 @@ void xe_guc_ads_populate_post_load(struct xe_guc_ads *ads)
> > {
> > guc_populate_golden_lrc(ads);
> > }
> > +
> > +static int guc_ads_action_update_policies(struct xe_guc_ads *ads, u32 policy_offset)
> > +{
> > + struct xe_guc_ct *ct = &ads_to_guc(ads)->ct;
> > + u32 action[] = {
> > + XE_GUC_ACTION_GLOBAL_SCHED_POLICY_CHANGE,
> > + policy_offset
> > + };
> > +
> > + return xe_guc_ct_send(ct, action, ARRAY_SIZE(action), 0, 0);
> > +}
> > +
> > +int xe_guc_ads_scheduler_policy_disable_reset(struct xe_guc_ads *ads)
> > +{
> > + struct xe_device *xe = ads_to_xe(ads);
> > + struct xe_gt *gt = ads_to_gt(ads);
> > + struct xe_tile *tile = gt_to_tile(gt);
> > + struct guc_policies *policies;
> > + struct xe_bo *bo;
> > + int ret = 0;
> > +
> > + policies = kmalloc(sizeof(*policies), GFP_KERNEL);
> > + if (!policies)
> > + return -ENOMEM;
> > +
> > + policies->dpc_promote_time = ads_blob_read(ads, policies.dpc_promote_time);
> > + policies->max_num_work_items = ads_blob_read(ads, policies.max_num_work_items);
> > + policies->is_valid = 1;
> > + if (xe->busted.mode == 2)
> > + policies->global_flags |= GLOBAL_POLICY_DISABLE_ENGINE_RESET;
> > +
> > + bo = xe_managed_bo_create_from_data(xe, tile, policies, sizeof(struct guc_policies),
> > + XE_BO_CREATE_VRAM_IF_DGFX(tile) |
> > + XE_BO_CREATE_GGTT_BIT);
> > + if (IS_ERR(bo)) {
> > + ret = PTR_ERR(bo);
> > + goto out;
> > + }
> > +
> > + ret = guc_ads_action_update_policies(ads, xe_bo_ggtt_addr(bo));
> > +out:
> > + kfree(policies);
> > + return ret;
> > +}
> > diff --git a/drivers/gpu/drm/xe/xe_guc_ads.h b/drivers/gpu/drm/xe/xe_guc_ads.h
> > index 138ef6267671..7c45c40fab34 100644
> > --- a/drivers/gpu/drm/xe/xe_guc_ads.h
> > +++ b/drivers/gpu/drm/xe/xe_guc_ads.h
> > @@ -13,5 +13,6 @@ int xe_guc_ads_init_post_hwconfig(struct xe_guc_ads *ads);
> > void xe_guc_ads_populate(struct xe_guc_ads *ads);
> > void xe_guc_ads_populate_minimal(struct xe_guc_ads *ads);
> > void xe_guc_ads_populate_post_load(struct xe_guc_ads *ads);
> > +int xe_guc_ads_scheduler_policy_disable_reset(struct xe_guc_ads *ads);
> >
> > #endif
> > --
> > 2.44.0
> >
More information about the Intel-xe
mailing list