[Intel-xe] [PATCH v2 6/7] drm/xe/mocs: LNCF MOCS settings only need to be restored on pre-Xe_HP
Matt Roper
matthew.d.roper at intel.com
Fri Mar 3 23:26:28 UTC 2023
Hmm. It seems that most of that patch is fine, but the regression is
caused by my attempt to tighten up the GuC's regset space allocated for
these registers. The condition in guc_mmio_regset_write() only adds the
LNCFMOCS registers to the GuC regset for RCS0 (they don't need to be
saved/restored for the other engines), so I thought increasing the
regset size by a total of just LNCFCMOCS_REG_COUNT (rather than
LNCFCMOCS_REG_COUNT*XE_NUM_HW_ENGINES) would be sufficient, but it seems
there's something I'm overlooking in the GuC regset code and we do still
need to allocate the extra space for every possible engine, even the
ones that don't need to save/restore it.
So I believe the following patch solves the problem; I just haven't
figured out the details on exactly why yet.
diff --git a/drivers/gpu/drm/xe/xe_guc_ads.c b/drivers/gpu/drm/xe/xe_guc_ads.c
index 7a892ff7aba3..0d926edd7656 100644
--- a/drivers/gpu/drm/xe/xe_guc_ads.c
+++ b/drivers/gpu/drm/xe/xe_guc_ads.c
@@ -224,10 +224,7 @@ static size_t calculate_regset_size(struct xe_gt *gt)
xa_for_each(&hwe->reg_sr.xa, sr_idx, sr_entry)
count++;
- count += ADS_REGSET_EXTRA_MAX * XE_NUM_HW_ENGINES;
-
- if (needs_wa_1607983814(gt_to_xe(gt)))
- count += LNCFCMOCS_REG_COUNT;
+ count += (ADS_REGSET_EXTRA_MAX + LNCFCMOCS_REG_COUNT) * XE_NUM_HW_ENGINES;
return count * sizeof(struct guc_mmio_reg);
}
On Fri, Mar 03, 2023 at 10:09:45AM +0100, Thomas Hellström wrote:
> Hi!
>
> After this commit I see the following on DG1, and attempting to run anything
> after that won't work.
>
> Thomas
>
>
> [   10.306993] ------------[ cut here ]------------
> [ 10.307040] WARNING: CPU: 2 PID: 447 at
> drivers/gpu/drm/xe/xe_hw_fence.c:91 xe_hw_fence_irq_finish+0x36/0x120 [xe]
> [ 10.307327] Modules linked in: xe(+) drm_ttm_helper ttm i2c_algo_bit
> drm_suballoc_helper kunit drm_buddy gpu_sched drm_display_helper
> drm_kms_helper drm crct10dif_pclmul crc32_pclmul crc32c_intel nvme e1000e
> nvme_core drm_panel_orientation_quirks ghash_clmulni_intel video wmi
> pinctrl_tigerlake usb_storage ip6_tables ip_tables fuse
> [ 10.307656] CPU: 2 PID: 447 Comm: systemd-udevd Tainted: G U
> 6.1.0+ #120
> [ 10.307719] Hardware name: ASUS System Product Name/PRIME B560M-A AC,
> BIOS 0403 01/26/2021
> [ 10.307779] RIP: 0010:xe_hw_fence_irq_finish+0x36/0x120 [xe]
> [ 10.307980] Code: 54 4c 8d 67 60 55 53 48 83 ec 10 48 8b 47 60 49 39 c4
> 75 13 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 cc cc cc cc 48 89 fd <0f>
> 0b e8 33 84 13 e2 48 89 ef 88 44 24 07 e8 c7 44 5e e2 48 8b 55
> [ 10.308103] RSP: 0018:ffffb917c12d7b90 EFLAGS: 00010216
> [ 10.308149] RAX: ffff8b189b6d85d8 RBX: 00000000ffffffc2 RCX:
> ffffb917c12d7aec
> [ 10.308201] RDX: 0000000000000000 RSI: ffffffffa356dc2b RDI:
> ffff8b189ee84c60
> [ 10.308253] RBP: ffff8b189ee84c60 R08: 0000000000000001 R09:
> ffff8b189ee82278
> [ 10.308304] R10: 0000000000000000 R11: 00000000d3ea3544 R12:
> ffff8b189ee84cc0
> [ 10.308356] R13: ffff8b189ee84cd8 R14: ffff8b189ee83e00 R15:
> ffffffffc08dda87
> [ 10.308407] FS: 00007f8a847cb0c0(0000) GS:ffff8b1af3280000(0000)
> knlGS:0000000000000000
> [ 10.308466] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [ 10.308511] CR2: 00007f69c6ba5298 CR3: 0000000114d8e003 CR4:
> 0000000000770ee0
> [ 10.308564] PKRU: 55555554
> [ 10.308614] Call Trace:
> [ 10.308640] <TASK>
> [ 10.308679] xe_gt_init+0x336/0x380 [xe]
> [ 10.308886] xe_device_probe+0x234/0x2a0 [xe]
> [ 10.309084] xe_pci_probe+0x418/0x540 [xe]
> [ 10.309294] local_pci_probe+0x3e/0x80
> [ 10.309342] pci_device_probe+0xb3/0x210
> [ 10.309392] really_probe+0xdb/0x380
> [ 10.309427] ? pm_runtime_barrier+0x50/0x90
> [ 10.309473] __driver_probe_device+0x78/0x170
> [ 10.309519] driver_probe_device+0x1f/0x90
> [ 10.309564] __driver_attach+0xd5/0x1d0
> [ 10.309632] ? __device_attach_driver+0x110/0x110
> [ 10.309675] bus_for_each_dev+0x62/0x90
> [ 10.309719] bus_add_driver+0x1b1/0x200
> [ 10.309767] driver_register+0x89/0xe0
> [ 10.309804] ? xe_hw_fence_module_init+0x35/0x35 [xe]
> [ 10.310034] do_one_initcall+0x5a/0x300
> [ 10.310080] ? rcu_read_lock_sched_held+0x3f/0x80
> [ 10.310122] ? trace_kmalloc+0x33/0xe0
> [ 10.310160] ? kmalloc_trace+0x46/0x60
> [ 10.310203] do_init_module+0x4a/0x200
> [ 10.310247] __do_sys_finit_module+0x93/0xf0
> [ 10.310331] do_syscall_64+0x37/0x90
> [ 10.310369] entry_SYSCALL_64_after_hwframe+0x63/0xcd
> [ 10.310417] RIP: 0033:0x7f8a8522892d
> [ 10.310452] Code: 5d c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48
> 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48>
> 3d 01 f0 ff ff 73 01 c3 48 8b 0d d3 e4 0c 00 f7 d8 64 89 01 48
> [ 10.310576] RSP: 002b:00007ffdafa93138 EFLAGS: 00000246 ORIG_RAX:
> 0000000000000139
> [ 10.310671] RAX: ffffffffffffffda RBX: 0000562a45dc7f50 RCX:
> 00007f8a8522892d
> [ 10.310725] RDX: 0000000000000000 RSI: 0000562a45dc6100 RDI:
> 0000000000000016
> [ 10.310777] RBP: 0000562a45dc6100 R08: 0000000000000000 R09:
> 00007ffdafa93260
> [ 10.310827] R10: 0000000000000016 R11: 0000000000000246 R12:
> 0000000000020000
> [ 10.310878] R13: 0000562a45dbf440 R14: 0000000000000000 R15:
> 0000562a45dbfbb0
> [ 10.310967] </TASK>
> [ 10.310989] irq event stamp: 141793
> [ 10.311018] hardirqs last enabled at (141801): [<ffffffffa217660e>]
> __up_console_sem+0x5e/0x70
> [ 10.311086] hardirqs last disabled at (141808): [<ffffffffa21765f3>]
> __up_console_sem+0x43/0x70
> [ 10.311149] softirqs last enabled at (141740): [<ffffffffa20f026d>]
> __irq_exit_rcu+0xed/0x160
> [ 10.311215] softirqs last disabled at (141735): [<ffffffffa20f026d>]
> __irq_exit_rcu+0xed/0x160
> [ 10.311279] ---[ end trace 0000000000000000 ]---
> [ 10.312424] xe: probe of 0000:08:00.0 failed with error -62
> [ 10.321694] xe 0000:08:00.0: [drm] *ERROR* GuC PC Shutdown failed
> [ 10.321789] ------------[ cut here ]------------
> [ 10.321825] WARNING: CPU: 2 PID: 447 at
> drivers/gpu/drm/xe/xe_guc_pc.c:879 pc_fini+0xd0/0xf0 [xe]
>
> On 2/23/23 19:57, Matt Roper wrote:
> > Reprogramming the LNCF MOCS registers on render domain reset is not
> > intended to be regular driver programming, but rather the implementation
> > of a specific workaround (Wa_1607983814). This workaround no longer
> > applies on Xe_HP and beyond, so we can expect that these registers, like
> > the rest of the LNCF/LBCF registers, will maintain their values through
> > all engine resets. We should only add these registers to the GuC's
> > save/restore list on platforms that need the workaround.
> >
> > Furthermore, xe_mocs_init_engine() appears to be another attempt to
> > satisfy this same workaround. This is unnecessary on the Xe driver
> > since even on platforms where the workaround is necessary, all
> > single-engine resets are initiated by the GuC and thus the GuC will take
> > care of saving/restoring these registers. The only host-initiated
> > resets we have in Xe are full GT resets which will already
> > (re)initialize these registers as part of the regular xe_mocs_init()
> > flow.
> >
> > v2:
> > - Add needs_wa_1607983814() so that calculate_regset_size() doesn't
> > overallocate regset space when the workaround isn't needed. (Lucas)
> > - On platforms affected by Wa_1607983814, only add the LNCF MOCS
> > registers to the render engine's GuC save/restore list; resets of
> > other engines don't need to save/restore these.
> >
> > Cc: Lucas De Marchi <lucas.demarchi at intel.com>
> > Signed-off-by: Matt Roper <matthew.d.roper at intel.com>
> > ---
> > drivers/gpu/drm/xe/xe_execlist.c | 2 +-
> > drivers/gpu/drm/xe/xe_guc_ads.c | 20 ++++++++++++++++----
> > drivers/gpu/drm/xe/xe_guc_submit.c | 1 -
> > drivers/gpu/drm/xe/xe_mocs.c | 13 -------------
> > drivers/gpu/drm/xe/xe_mocs.h | 1 -
> > 5 files changed, 17 insertions(+), 20 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/xe/xe_execlist.c b/drivers/gpu/drm/xe/xe_execlist.c
> > index d555d77cbf49..fd0ebfe7cae3 100644
> > --- a/drivers/gpu/drm/xe/xe_execlist.c
> > +++ b/drivers/gpu/drm/xe/xe_execlist.c
> > @@ -462,7 +462,7 @@ static void execlist_engine_suspend_wait(struct xe_engine *e)
> > static void execlist_engine_resume(struct xe_engine *e)
> > {
> > - xe_mocs_init_engine(e);
> > + /* NIY */
> > }
> > static const struct xe_engine_ops execlist_engine_ops = {
> > diff --git a/drivers/gpu/drm/xe/xe_guc_ads.c b/drivers/gpu/drm/xe/xe_guc_ads.c
> > index 0c08cecaca40..0a458c083db7 100644
> > --- a/drivers/gpu/drm/xe/xe_guc_ads.c
> > +++ b/drivers/gpu/drm/xe/xe_guc_ads.c
> > @@ -206,6 +206,11 @@ static void guc_ads_fini(struct drm_device *drm, void *arg)
> > xe_bo_unpin_map_no_vm(ads->bo);
> > }
> > +static bool needs_wa_1607983814(struct xe_device *xe)
> > +{
> > + return GRAPHICS_VERx100(xe) < 1250;
> > +}
> > +
> > static size_t calculate_regset_size(struct xe_gt *gt)
> > {
> > struct xe_reg_sr_entry *sr_entry;
> > @@ -218,7 +223,10 @@ static size_t calculate_regset_size(struct xe_gt *gt)
> > xa_for_each(&hwe->reg_sr.xa, sr_idx, sr_entry)
> > count++;
> > - count += (ADS_REGSET_EXTRA_MAX + LNCFCMOCS_REG_COUNT) * XE_NUM_HW_ENGINES;
> > + count += ADS_REGSET_EXTRA_MAX * XE_NUM_HW_ENGINES;
> > +
> > + if (needs_wa_1607983814(gt_to_xe(gt)))
> > + count += LNCFCMOCS_REG_COUNT;
> > return count * sizeof(struct guc_mmio_reg);
> > }
> > @@ -430,6 +438,7 @@ static unsigned int guc_mmio_regset_write(struct xe_guc_ads *ads,
> > struct iosys_map *regset_map,
> > struct xe_hw_engine *hwe)
> > {
> > + struct xe_device *xe = ads_to_xe(ads);
> > struct xe_hw_engine *hwe_rcs_reset_domain =
> > xe_gt_any_hw_engine_by_reset_domain(hwe->gt, XE_ENGINE_CLASS_RENDER);
> > struct xe_reg_sr_entry *entry;
> > @@ -464,9 +473,12 @@ static unsigned int guc_mmio_regset_write(struct xe_guc_ads *ads,
> > e->reg, e->flags, count++);
> > }
> > - for (i = 0; i < LNCFCMOCS_REG_COUNT; i++) {
> > - guc_mmio_regset_write_one(ads, regset_map,
> > - GEN9_LNCFCMOCS(i).reg, 0, count++);
> > + /* Wa_1607983814 */
> > + if (needs_wa_1607983814(xe) && hwe->class == XE_ENGINE_CLASS_RENDER) {
> > + for (i = 0; i < LNCFCMOCS_REG_COUNT; i++) {
> > + guc_mmio_regset_write_one(ads, regset_map,
> > + GEN9_LNCFCMOCS(i).reg, 0, count++);
> > + }
> > }
> > XE_BUG_ON(ads->regset_size < (count * sizeof(struct guc_mmio_reg)));
> > diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
> > index a54f7f82d04d..3766b77a0d90 100644
> > --- a/drivers/gpu/drm/xe/xe_guc_submit.c
> > +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
> > @@ -1267,7 +1267,6 @@ static void guc_engine_resume(struct xe_engine *e)
> > XE_BUG_ON(e->guc->suspend_pending);
> > - xe_mocs_init_engine(e);
> > guc_engine_add_msg(e, msg, RESUME);
> > }
> > diff --git a/drivers/gpu/drm/xe/xe_mocs.c b/drivers/gpu/drm/xe/xe_mocs.c
> > index 1ede2d46e778..c019352ba7a8 100644
> > --- a/drivers/gpu/drm/xe/xe_mocs.c
> > +++ b/drivers/gpu/drm/xe/xe_mocs.c
> > @@ -517,19 +517,6 @@ static void init_l3cc_table(struct xe_gt *gt,
> > }
> > }
> > -void xe_mocs_init_engine(const struct xe_engine *engine)
> > -{
> > - struct xe_mocs_info table;
> > - unsigned int flags;
> > -
> > - flags = get_mocs_settings(engine->gt->xe, &table);
> > - if (!flags)
> > - return;
> > -
> > - if (flags & HAS_RENDER_L3CC && engine->class == XE_ENGINE_CLASS_RENDER)
> > - init_l3cc_table(engine->gt, &table);
> > -}
> > -
> > void xe_mocs_init(struct xe_gt *gt)
> > {
> > struct xe_mocs_info table;
> > diff --git a/drivers/gpu/drm/xe/xe_mocs.h b/drivers/gpu/drm/xe/xe_mocs.h
> > index aba1abe216ab..63500a1d6660 100644
> > --- a/drivers/gpu/drm/xe/xe_mocs.h
> > +++ b/drivers/gpu/drm/xe/xe_mocs.h
> > @@ -11,7 +11,6 @@
> > struct xe_engine;
> > struct xe_gt;
> > -void xe_mocs_init_engine(const struct xe_engine *engine);
> > void xe_mocs_init(struct xe_gt *gt);
> > /**
--
Matt Roper
Graphics Software Engineer
Linux GPU Platform Enablement
Intel Corporation
More information about the Intel-xe
mailing list