[PATCH] Revert "drm/xe: Do not grab forcewakes when issuing GGTT TLB invalidation via GuC"

Lucas De Marchi lucas.demarchi at intel.com
Thu Mar 7 18:09:42 UTC 2024


On Wed, Mar 06, 2024 at 07:11:05PM -0500, Rodrigo Vivi wrote:
>This reverts commit 27ee413bbc0b04146f4ee1c7444422bf18dafd47.
>
>On DG2 after this patch:
>
>[  439.105953] general protection fault, probably for non-canonical address 0xdffffc0000000000: 0000 [#1] PREEMPT SMP KASAN NOPTI
>[  439.117349] KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007]
>[  439.124924] CPU: 8 PID: 7160 Comm: insmod Tainted: G     U     OE      6.8.0-rc7+ #5
>[  439.132669] Hardware name: iBUYPOWER INTEL/B660 DS3H AC DDR4-Y1, BIOS F5 12/17/2021
>[  439.140332] RIP: 0010:xe_ggtt_invalidate+0x150/0x260 [xe]
>[  439.145860] Code: 48 8d 7d 18 48 89 fa 48 c1 ea 03 80 3c 02 00 0f 85 b3 00 00 00 48 b8 00 00 00 00 00 fc ff df 48 8b 5d 18 48 89 da 48 c1 ea 03 <80> 3c 02 00 0f 85 87 00 00 00 48 b8 00 00 00 00 00 fc ff df 48 8b
>[  439.164609] RSP: 0018:ffffc9000201eee8 EFLAGS: 00010246
>[  439.169843] RAX: dffffc0000000000 RBX: 0000000000000000 RCX: 0000000000000000
>[  439.176980] RDX: 0000000000000000 RSI: ffffffffb6ae4040 RDI: ffff88818231a478
>[  439.184121] RBP: ffff88818231a460 R08: 0000000000000001 R09: fffffbfff7563e5c
>[  439.191259] R10: fffff52000403d92 R11: 0000000000000001 R12: ffff888185330028
>[  439.198401] R13: ffffffffffffffff R14: 0000000000001000 R15: 0000000000000000
>[  439.205543] FS:  00007f6c0fd92740(0000) GS:ffff888f76400000(0000) knlGS:0000000000000000
>[  439.213634] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>[  439.219393] CR2: 00007ffedd41de70 CR3: 00000001063ec000 CR4: 0000000000750ef0
>[  439.226534] PKRU: 55555554
>[  439.229256] Call Trace:
>[  439.231713]  <TASK>
>[  439.233832]  ? die_addr+0x3c/0xa0
>[  439.237165]  ? exc_general_protection+0x158/0x240
>[  439.241882]  ? asm_exc_general_protection+0x22/0x30
>[  439.246774]  ? xe_ggtt_invalidate+0x150/0x260 [xe]
>[  439.251679]  ? xe_ggtt_invalidate+0xa0/0x260 [xe]
>[  439.256490]  __xe_ggtt_insert_bo_at+0x24b/0x720 [xe]
>[  439.261556]  ? __pfx____xe_bo_create_locked+0x10/0x10 [xe]
>[  439.267144]  ? __pfx___xe_ggtt_insert_bo_at+0x10/0x10 [xe]
>[  439.272737]  __xe_bo_create_locked+0x4af/0x1080 [xe]
>[  439.277806]  xe_bo_create_pin_map_at+0x40/0x400 [xe]
>[  439.282873]  ? xe_uc_fw_check_version_requirements+0x496/0x900 [xe]
>[  439.289255]  xe_managed_bo_create_from_data+0x38/0x130 [xe]
>[  439.294934]  xe_uc_fw_init+0xe71/0x3270 [xe]
>[  439.299317]  ? __pfx_xe_uc_fw_init+0x10/0x10 [xe]
>[  439.304127]  ? __pfx_lock_acquired+0x10/0x10
>[  439.308406]  ? __pfx___drm_printfn_info+0x10/0x10
>[  439.313118]  ? _raw_spin_unlock_irqrestore+0x4b/0x80
>[  439.318093]  ? __pm_runtime_resume+0x7f/0x110
>[  439.322458]  xe_guc_init+0x8d/0x890 [xe]
>[  439.326497]  xe_uc_init+0x65/0x1a0 [xe]
>[  439.330440]  xe_gt_init_hwconfig+0xdc/0x180 [xe]
>[  439.335161]  xe_device_probe+0x747/0x1060 [xe]
>[  439.339712]  ? __pfx___drmm_mutex_release+0x10/0x10
>[  439.344599]  ? __drmm_add_action+0x19d/0x280
>[  439.348883]  ? __pfx___drmm_mutex_release+0x10/0x10
>[  439.353769]  xe_pci_probe+0x168b/0x2f30 [xe]
>[  439.358154]  ? __pfx_lock_acquired+0x10/0x10
>[  439.362433]  ? __pfx_xe_pci_probe+0x10/0x10 [xe]
>[  439.367158]  ? _raw_spin_unlock_irqrestore+0x62/0x80
>[  439.372131]  ? lockdep_hardirqs_on+0xc7/0x140
>[  439.376498]  ? _raw_spin_unlock_irqrestore+0x4b/0x80
>[  439.381473]  ? __pfx_xe_pci_probe+0x10/0x10 [xe]
>[  439.386196]  local_pci_probe+0xd6/0x190
>[  439.390042]  pci_device_probe+0x223/0x740
>[  439.394064]  ? __pfx_pci_device_probe+0x10/0x10
>[  439.398605]  ? kernfs_create_link+0x167/0x230
>[  439.402970]  ? do_raw_spin_unlock+0x54/0x1f0
>[  439.407252]  really_probe+0x3df/0xb80
>[  439.410926]  __driver_probe_device+0x18c/0x450
>[  439.415382]  driver_probe_device+0x4a/0x120
>[  439.419572]  __driver_attach+0x1e1/0x4a0
>[  439.423505]  ? __pfx___driver_attach+0x10/0x10
>[  439.427963]  bus_for_each_dev+0xf2/0x160
>[  439.431897]  ? __pfx_bus_for_each_dev+0x10/0x10
>[  439.436436]  bus_add_driver+0x29d/0x570
>[  439.440283]  driver_register+0x130/0x450
>[  439.444218]  ? __pfx_xe_init+0x10/0x10 [xe]
>[  439.448507]  xe_init+0x81/0x140 [xe]
>[  439.452188]  ? __pfx_xe_init+0x10/0x10 [xe]
>[  439.456478]  do_one_initcall+0xcf/0x420
>[  439.460326]  ? __pfx_do_one_initcall+0x10/0x10
>[  439.464783]  ? kasan_unpoison+0x40/0x70
>[  439.468631]  do_init_module+0x238/0x730
>[  439.472478]  load_module+0x5ff7/0x6c30
>[  439.476242]  ? __pfx_load_module+0x10/0x10
>[  439.480356]  ? ima_post_read_file+0x163/0x190
>[  439.484722]  ? __pfx_ima_post_read_file+0x10/0x10
>[  439.489435]  ? security_kernel_post_read_file+0x6d/0xb0
>[  439.494673]  ? __pfx_kernel_read_file+0x10/0x10
>[  439.499220]  ? init_module_from_file+0xc0/0x100
>[  439.503758]  init_module_from_file+0xc0/0x100
>[  439.508125]  ? __pfx_init_module_from_file+0x10/0x10
>[  439.513103]  ? do_raw_spin_unlock+0x54/0x1f0
>[  439.517382]  idempotent_init_module+0x241/0x660
>[  439.521924]  ? __pfx_idempotent_init_module+0x10/0x10
>[  439.526986]  ? security_capable+0x6d/0xb0
>[  439.531003]  __x64_sys_finit_module+0xba/0x130
>[  439.535459]  do_syscall_64+0x97/0x190
>[  439.539135]  ? lockdep_hardirqs_on_prepare+0x17b/0x420
>[  439.544282]  ? do_syscall_64+0xa7/0x190
>[  439.548125]  ? lockdep_hardirqs_on+0xc7/0x140
>[  439.552493]  ? do_syscall_64+0xa7/0x190
>[  439.556341]  ? do_syscall_64+0xa7/0x190
>[  439.560188]  ? lockdep_hardirqs_on+0xc7/0x140
>[  439.564556]  ? do_syscall_64+0xa7/0x190
>[  439.568401]  ? do_syscall_64+0xa7/0x190
>[  439.572249]  ? lockdep_hardirqs_on_prepare+0x17b/0x420
>[  439.577395]  entry_SYSCALL_64_after_hwframe+0x6e/0x76
>[  439.582460] RIP: 0033:0x7f6c0f73160d
>[  439.586047] Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d f3 47 0c 00 f7 d8 64 89 01 48
>[  439.604792] RSP: 002b:00007ffd0ecad3b8 EFLAGS: 00000246 ORIG_RAX: 0000000000000139
>[  439.612368] RAX: ffffffffffffffda RBX: 0000565382a36740 RCX: 00007f6c0f73160d
>[  439.619506] RDX: 0000000000000000 RSI: 0000565382a362a0 RDI: 0000000000000003
>[  439.626644] RBP: 00007ffd0ecad470 R08: 0000000000000000 R09: 0000565382a37760
>[  439.633782] R10: 0000000000000003 R11: 0000000000000246 R12: 0000565382a362a0
>[  439.640925] R13: 0000000000000000 R14: 0000565382a38790 R15: 0000565382a362a0
>[  439.648066]  </TASK>
>[  439.650263] Modules linked in: xe(OE+) snd_hda_codec_hdmi snd_seq_dummy snd_hrtimer rfcomm nf_conntrack_netbios_ns nf_conntrack_broadcast nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ip_set nf_tables nfnetlink qrtr bnep sunrpc binfmt_misc vfat fat intel_rapl_msr snd_sof_pci_intel_tgl iwlmvm intel_rapl_common snd_sof_intel_hda_common snd_soc_hdac_hda snd_sof_pci intel_uncore_frequency snd_sof_xtensa_dsp intel_uncore_frequency_common snd_sof_intel_hda x86_pkg_temp_thermal snd_sof intel_powerclamp mac80211 snd_sof_utils snd_soc_acpi_intel_match snd_soc_acpi snd_soc_core coretemp snd_compress snd_sof_intel_hda_mlink snd_hda_ext_core kvm_intel snd_hda_intel snd_intel_dspcfg libarc4 snd_hda_codec kvm snd_hwdep snd_hda_core btusb iwlwifi irqbypass snd_seq btrtl snd_seq_device iTCO_wdt btintel rapl pmt_telemetry intel_pmc_bxt snd_pcm intel_cstate btbcm ee1004 iTCO_vendor_support btmtk mei_hdcp
>[  439.650319]  mei_pxp pmt_class cfg80211 i2c_i801 snd_timer intel_uncore wmi_bmof gigabyte_wmi bluetooth pcspkr snd i2c_smbus mei_me soundcore mei idma64 rfkill intel_vsec intel_hid joydev acpi_tad acpi_pad sparse_keymap loop zram drm_gpuvm hid_logitech_hidpp i915 crct10dif_pclmul crc32_pclmul crc32c_intel polyval_clmulni polyval_generic ghash_clmulni_intel nvme r8169 sha512_ssse3 nvme_core pinctrl_alderlake hid_logitech_dj ip6_tables ip_tables fuse
>[  439.739896] Unloaded tainted modules: xe(OE):2 [last unloaded: xe(OE)]
>[  439.786092] ---[ end trace 0000000000000000 ]---
>[  439.790733] RIP: 0010:xe_ggtt_invalidate+0x150/0x260 [xe]
>[  439.796254] Code: 48 8d 7d 18 48 89 fa 48 c1 ea 03 80 3c 02 00 0f 85 b3 00 00 00 48 b8 00 00 00 00 00 fc ff df 48 8b 5d 18 48 89 da 48 c1 ea 03 <80> 3c 02 00 0f 85 87 00 00 00 48 b8 00 00 00 00 00 fc ff df 48 8b
>[  439.815026] RSP: 0018:ffffc9000201eee8 EFLAGS: 00010246
>[  439.820262] RAX: dffffc0000000000 RBX: 0000000000000000 RCX: 0000000000000000
>[  439.827407] RDX: 0000000000000000 RSI: ffffffffb6ae4040 RDI: ffff88818231a478
>[  439.834549] RBP: ffff88818231a460 R08: 0000000000000001 R09: fffffbfff7563e5c
>[  439.841691] R10: fffff52000403d92 R11: 0000000000000001 R12: ffff888185330028
>[  439.848835] R13: ffffffffffffffff R14: 0000000000001000 R15: 0000000000000000
>[  439.855978] FS:  00007f6c0fd92740(0000) GS:ffff888f76400000(0000) knlGS:0000000000000000
>[  439.864072] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>[  439.869824] CR2: 00007ffedd41de70 CR3: 00000001063ec000 CR4: 0000000000750ef0
>[  439.876966] PKRU: 55555554
>
>Fixes: 27ee413bbc0b ("drm/xe: Do not grab forcewakes when issuing GGTT TLB invalidation via GuC")
>Cc: Maarten Lankhorst <maarten.lankhorst at linux.intel.com>
>Cc: Matthew Brost <matthew.brost at intel.com>
>Signed-off-by: Rodrigo Vivi <rodrigo.vivi at intel.com>
>---
> drivers/gpu/drm/xe/xe_ggtt.c                | 7 +++++++
> drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c | 2 --
> 2 files changed, 7 insertions(+), 2 deletions(-)
>
>diff --git a/drivers/gpu/drm/xe/xe_ggtt.c b/drivers/gpu/drm/xe/xe_ggtt.c
>index 5e739513ab0a..717d0e76277a 100644
>--- a/drivers/gpu/drm/xe/xe_ggtt.c
>+++ b/drivers/gpu/drm/xe/xe_ggtt.c
>@@ -257,9 +257,16 @@ static void ggtt_invalidate_gt_tlb(struct xe_gt *gt)
> 	if (!gt)
> 		return;
>
>+	/*
>+	 * Invalidation can happen when there's no in-flight work keeping the
>+	 * GT awake.  We need to explicitly grab forcewake to ensure the GT
>+	 * and GuC are accessible.


aside from the null pointer deref, what exactly
from this comment above that would not be true to make the patch
correct?

Lucas De Marchi

>+	 */
>+	xe_force_wake_get(gt_to_fw(gt), XE_FW_GT);
> 	err = xe_gt_tlb_invalidation_ggtt(gt);
> 	if (err)
> 		drm_warn(&xe->drm, "xe_gt_tlb_invalidation_ggtt error=%d", err);
>+	xe_force_wake_put(gt_to_fw(gt), XE_FW_GT);
> }
>
> void xe_ggtt_invalidate(struct xe_ggtt *ggtt)
>diff --git a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
>index a3c4ffba679d..f29ee1ccfa71 100644
>--- a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
>+++ b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
>@@ -247,7 +247,6 @@ int xe_gt_tlb_invalidation_ggtt(struct xe_gt *gt)
>
> 		xe_gt_tlb_invalidation_wait(gt, seqno);
> 	} else if (xe_device_uc_enabled(xe)) {
>-		xe_force_wake_get(gt_to_fw(gt), XE_FW_GT);
> 		if (xe->info.platform == XE_PVC || GRAPHICS_VER(xe) >= 20) {
> 			xe_mmio_write32(gt, PVC_GUC_TLB_INV_DESC1,
> 					PVC_GUC_TLB_INV_DESC1_INVALIDATE);
>@@ -257,7 +256,6 @@ int xe_gt_tlb_invalidation_ggtt(struct xe_gt *gt)
> 			xe_mmio_write32(gt, GUC_TLB_INV_CR,
> 					GUC_TLB_INV_CR_INVALIDATE);
> 		}
>-		xe_force_wake_put(gt_to_fw(gt), XE_FW_GT);
> 	}
>
> 	return 0;
>-- 
>2.43.2
>


More information about the Intel-xe mailing list