<!DOCTYPE html><html><head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
</head>
<body>
<p><br>
</p>
<div class="moz-cite-prefix">On 12-04-2024 22:28, Lucas De Marchi
wrote:<br>
</div>
<blockquote type="cite" cite="mid:ub2u2jq47h5uy7zydrmoewdad644qf5eo6finfz4mfpwa4orm3@22uzbbikjtkw">On
Fri, Apr 12, 2024 at 10:01:20PM +0530, Ghimiray, Himal Prasad
wrote:
<br>
<blockquote type="cite">
<br>
On 12-04-2024 19:12, Lucas De Marchi wrote:
<br>
<blockquote type="cite">On Fri, Apr 12, 2024 at 01:32:45PM
+0530, Himal Prasad Ghimiray wrote:
<br>
<blockquote type="cite">xe_pm_init may encounter failures for
various reasons, such as a failure
<br>
in initializing drmm_mutex, or when dealing with a
d3cold-capable device
<br>
for vram_threshold sysfs creation and setting default
threshold.
<br>
Presently, all these potential failures are disregarded.
<br>
<br>
Move d3cold.lock initialization to xe_pm_init_early and
cause driver
<br>
abort if mutex initialization has failed.
<br>
<br>
Warn about failure to create and setting default threshold.
<br>
<br>
Cc: Lucas De Marchi <a class="moz-txt-link-rfc2396E" href="mailto:lucas.demarchi@intel.com"><lucas.demarchi@intel.com></a>
<br>
Cc: Rodrigo Vivi <a class="moz-txt-link-rfc2396E" href="mailto:rodrigo.vivi@intel.com"><rodrigo.vivi@intel.com></a>
<br>
Signed-off-by: Himal Prasad Ghimiray
<a class="moz-txt-link-rfc2396E" href="mailto:himal.prasad.ghimiray@intel.com"><himal.prasad.ghimiray@intel.com></a>
<br>
---
<br>
drivers/gpu/drm/xe/xe_device_sysfs.c | 12 ++++------
<br>
drivers/gpu/drm/xe/xe_device_sysfs.h | 2 +-
<br>
drivers/gpu/drm/xe/xe_pm.c | 34
+++++++++++++++++++++++-----
<br>
drivers/gpu/drm/xe/xe_pm.h | 2 +-
<br>
4 files changed, 34 insertions(+), 16 deletions(-)
<br>
<br>
diff --git a/drivers/gpu/drm/xe/xe_device_sysfs.c
b/drivers/gpu/drm/xe/xe_device_sysfs.c
<br>
index e47c8ad1bb17..21677b8cd977 100644
<br>
--- a/drivers/gpu/drm/xe/xe_device_sysfs.c
<br>
+++ b/drivers/gpu/drm/xe/xe_device_sysfs.c
<br>
@@ -76,18 +76,14 @@ static void xe_device_sysfs_fini(struct
drm_device *drm, void *arg)
<br>
sysfs_remove_file(&xe->drm.dev->kobj,
&dev_attr_vram_d3cold_threshold.attr);
<br>
}
<br>
<br>
-void xe_device_sysfs_init(struct xe_device *xe)
<br>
+int xe_device_sysfs_init(struct xe_device *xe)
<br>
{
<br>
struct device *dev = xe->drm.dev;
<br>
int ret;
<br>
<br>
ret = sysfs_create_file(&dev->kobj,
&dev_attr_vram_d3cold_threshold.attr);
<br>
- if (ret) {
<br>
- drm_warn(&xe->drm, "Failed to create sysfs
file\n");
<br>
- return;
<br>
- }
<br>
-
<br>
- ret = drmm_add_action_or_reset(&xe->drm,
xe_device_sysfs_fini, xe);
<br>
if (ret)
<br>
- drm_warn(&xe->drm, "Failed to add sysfs fini
drm action\n");
<br>
+ return ret;
<br>
+
<br>
+ return drmm_add_action_or_reset(&xe->drm,
xe_device_sysfs_fini, xe);
<br>
}
<br>
diff --git a/drivers/gpu/drm/xe/xe_device_sysfs.h
b/drivers/gpu/drm/xe/xe_device_sysfs.h
<br>
index 38b240684bee..f9e83d8bd2c7 100644
<br>
--- a/drivers/gpu/drm/xe/xe_device_sysfs.h
<br>
+++ b/drivers/gpu/drm/xe/xe_device_sysfs.h
<br>
@@ -8,6 +8,6 @@
<br>
<br>
struct xe_device;
<br>
<br>
-void xe_device_sysfs_init(struct xe_device *xe);
<br>
+int xe_device_sysfs_init(struct xe_device *xe);
<br>
<br>
#endif
<br>
diff --git a/drivers/gpu/drm/xe/xe_pm.c
b/drivers/gpu/drm/xe/xe_pm.c
<br>
index f1fc83845c01..f4d9441720b4 100644
<br>
--- a/drivers/gpu/drm/xe/xe_pm.c
<br>
+++ b/drivers/gpu/drm/xe/xe_pm.c
<br>
@@ -208,10 +208,25 @@ static void xe_pm_runtime_init(struct
xe_device *xe)
<br>
pm_runtime_put(dev);
<br>
}
<br>
<br>
-void xe_pm_init_early(struct xe_device *xe)
<br>
+int xe_pm_init_early(struct xe_device *xe)
<br>
{
<br>
+ int err;
<br>
+
<br>
INIT_LIST_HEAD(&xe->mem_access.vram_userfault.list);
<br>
- drmm_mutex_init(&xe->drm,
&xe->mem_access.vram_userfault.lock);
<br>
+
<br>
+ err = drmm_mutex_init(&xe->drm,
&xe->mem_access.vram_userfault.lock);
<br>
+ if (err)
<br>
+ return err;
<br>
+
<br>
+ /* Currently d3cold.lock will be used only with GuC */
<br>
</blockquote>
<br>
it's cheaper to just initialize it regardless so this can be
simpler
<br>
<br>
int xe_pm_init_early(struct xe_device *xe)
<br>
{
<br>
int err;
<br>
<br>
INIT_LIST_HEAD(&xe->mem_access.vram_userfault.list);
<br>
<br>
if ((err = drmm_mutex_init(&xe->drm,
&xe->mem_access.vram_userfault.lock) ||
<br>
(err = drmm_mutex_init(&xe->drm,
&xe->d3cold.lock)))
<br>
return err;
<br>
</blockquote>
Looks clean. Will prefer using this.
<br>
<blockquote type="cite">
<br>
return 0;
<br>
}
<br>
<br>
or keep the err assignment separate, doesn't matter much. But
<br>
when we mix success and failure for a return-early style it
makes
<br>
it harder to read.
<br>
<br>
<blockquote type="cite">
<br>
/**
<br>
@@ -219,20 +234,27 @@ void xe_pm_init_early(struct xe_device
*xe)
<br>
* @xe: xe device instance
<br>
*
<br>
* This component is responsible for System and Device sleep
states.
<br>
+ *
<br>
</blockquote>
<br>
wrong line
<br>
<br>
<blockquote type="cite"> */
<br>
void xe_pm_init(struct xe_device *xe)
<br>
{
<br>
+ int err;
<br>
+
<br>
/* For now suspend/resume is only allowed with GuC */
<br>
if (!xe_device_uc_enabled(xe))
<br>
return;
<br>
<br>
- drmm_mutex_init(&xe->drm,
&xe->d3cold.lock);
<br>
-
<br>
xe->d3cold.capable = xe_pm_pci_d3cold_capable(xe);
<br>
<br>
if (xe->d3cold.capable) {
<br>
- xe_device_sysfs_init(xe);
<br>
- xe_pm_set_vram_threshold(xe,
DEFAULT_VRAM_THRESHOLD);
<br>
+ err = xe_device_sysfs_init(xe);
<br>
</blockquote>
<br>
apparently not because of this patch, but why do we call a
function
<br>
named xe_device_sysfs_init() iff xe->d3cold.capable? And
from within
<br>
xe_pm. That seems totally misplaced. +Rodrigo
<br>
<br>
<blockquote type="cite">+ if (err)
<br>
+ drm_warn(&xe->drm,
<br>
+ "Sysfs create for user to set vram
threshold failed\n");
<br>
</blockquote>
<br>
just warning?
<br>
</blockquote>
<br>
If I propagate xe_pm_int errors to xe_pci.c, it exhibits
unexpected behavior. The PCI module throws errors as
anticipated, but when I remove the xe module with "module -r
xe", it doesn't properly clean up the "/sys/class/drm/card*"
directory. Subsequently, upon reloading, it complains about
existing sysfs entries and fails to load. This behavior aligns
with the issue described in
<a class="moz-txt-link-freetext" href="https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/1352">https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/1352</a>.
<br>
</blockquote>
<br>
Ok, it seems xe_pci.c is not ready yet to handle the propagated
error.
<br>
Please add a comment above and in the commit message about this,
so we
<br>
remember to fix it later when xe_pci.c is fixed.
<br>
<br>
/*
<br>
* FIXME: Just warn on error for now since caller is not
completely
<br>
* ready to handle the fallout.
<br>
*/
<br>
</blockquote>
<p>Thanks for the input. <br>
</p>
<p style="border: 0px solid rgb(227, 227, 227); box-sizing: border-box; --tw-border-spacing-x: 0; --tw-border-spacing-y: 0; --tw-translate-x: 0; --tw-translate-y: 0; --tw-rotate: 0; --tw-skew-x: 0; --tw-skew-y: 0; --tw-scale-x: 1; --tw-scale-y: 1; --tw-pan-x: ; --tw-pan-y: ; --tw-pinch-zoom: ; --tw-scroll-snap-strictness: proximity; --tw-gradient-from-position: ; --tw-gradient-via-position: ; --tw-gradient-to-position: ; --tw-ordinal: ; --tw-slashed-zero: ; --tw-numeric-figure: ; --tw-numeric-spacing: ; --tw-numeric-fraction: ; --tw-ring-inset: ; --tw-ring-offset-width: 0px; --tw-ring-offset-color: #fff; --tw-ring-color: rgba(69,89,164,.5); --tw-ring-offset-shadow: 0 0 transparent; --tw-ring-shadow: 0 0 transparent; --tw-shadow: 0 0 transparent; --tw-shadow-colored: 0 0 transparent; --tw-blur: ; --tw-brightness: ; --tw-contrast: ; --tw-grayscale: ; --tw-hue-rotate: ; --tw-invert: ; --tw-saturate: ; --tw-sepia: ; --tw-drop-shadow: ; --tw-backdrop-blur: ; --tw-backdrop-brightness: ; --tw-backdrop-contrast: ; --tw-backdrop-grayscale: ; --tw-backdrop-hue-rotate: ; --tw-backdrop-invert: ; --tw-backdrop-opacity: ; --tw-backdrop-saturate: ; --tw-backdrop-sepia: ; --tw-contain-size: ; --tw-contain-layout: ; --tw-contain-paint: ; --tw-contain-style: ; margin: 0px 0px 1.25em; color: rgb(13, 13, 13); font-family: Söhne, ui-sans-serif, system-ui, -apple-system, "Segoe UI", Roboto, Ubuntu, Cantarell, "Noto Sans", sans-serif, "Helvetica Neue", Arial, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol", "Noto Color Emoji"; font-size: 16px; font-style: normal; font-variant-ligatures: normal; font-variant-caps: normal; font-weight: 400; letter-spacing: normal; orphans: 2; text-align: start; text-indent: 0px; text-transform: none; widows: 2; word-spacing: 0px; -webkit-text-stroke-width: 0px; white-space: pre-wrap; background-color: rgb(255, 255, 255); text-decoration-thickness: initial; text-decoration-style: initial; text-decoration-color: initial;">Since errors come up after the DRM device probing, there's currently no cleanup mechanism for the driver. Adding handling via xe_pci_remove() might sort out issues popping up post driver probe. I'll give it a try, and if it works out, I'll update the patch accordingly. Otherwise, I'll resort to adding warning messages and fixme notes for now.</p>
<p></p>
<blockquote type="cite" cite="mid:ub2u2jq47h5uy7zydrmoewdad644qf5eo6finfz4mfpwa4orm3@22uzbbikjtkw">
<br>
thanks
<br>
Lucas De Marchi
<br>
</blockquote>
</body>
</html>