<!DOCTYPE html><html><head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
  </head>
  <body>
    <p><br>
    </p>
    <div class="moz-cite-prefix">On 12-04-2024 22:28, Lucas De Marchi
      wrote:<br>
    </div>
    <blockquote type="cite" cite="mid:ub2u2jq47h5uy7zydrmoewdad644qf5eo6finfz4mfpwa4orm3@22uzbbikjtkw">On
      Fri, Apr 12, 2024 at 10:01:20PM +0530, Ghimiray, Himal Prasad
      wrote:
      <br>
      <blockquote type="cite">
        <br>
        On 12-04-2024 19:12, Lucas De Marchi wrote:
        <br>
        <blockquote type="cite">On Fri, Apr 12, 2024 at 01:32:45PM
          +0530, Himal Prasad Ghimiray wrote:
          <br>
          <blockquote type="cite">xe_pm_init may encounter failures for
            various reasons, such as a failure
            <br>
            in initializing drmm_mutex, or when dealing with a
            d3cold-capable device
            <br>
            for vram_threshold sysfs creation and setting default
            threshold.
            <br>
            Presently, all these potential failures are disregarded.
            <br>
            <br>
            Move d3cold.lock initialization to xe_pm_init_early and
            cause driver
            <br>
            abort if mutex initialization has failed.
            <br>
            <br>
            Warn about failure to create and setting default threshold.
            <br>
            <br>
            Cc: Lucas De Marchi <a class="moz-txt-link-rfc2396E" href="mailto:lucas.demarchi@intel.com"><lucas.demarchi@intel.com></a>
            <br>
            Cc: Rodrigo Vivi <a class="moz-txt-link-rfc2396E" href="mailto:rodrigo.vivi@intel.com"><rodrigo.vivi@intel.com></a>
            <br>
            Signed-off-by: Himal Prasad Ghimiray
            <a class="moz-txt-link-rfc2396E" href="mailto:himal.prasad.ghimiray@intel.com"><himal.prasad.ghimiray@intel.com></a>
            <br>
            ---
            <br>
            drivers/gpu/drm/xe/xe_device_sysfs.c | 12 ++++------
            <br>
            drivers/gpu/drm/xe/xe_device_sysfs.h |  2 +-
            <br>
            drivers/gpu/drm/xe/xe_pm.c           | 34
            +++++++++++++++++++++++-----
            <br>
            drivers/gpu/drm/xe/xe_pm.h           |  2 +-
            <br>
            4 files changed, 34 insertions(+), 16 deletions(-)
            <br>
            <br>
            diff --git a/drivers/gpu/drm/xe/xe_device_sysfs.c
            b/drivers/gpu/drm/xe/xe_device_sysfs.c
            <br>
            index e47c8ad1bb17..21677b8cd977 100644
            <br>
            --- a/drivers/gpu/drm/xe/xe_device_sysfs.c
            <br>
            +++ b/drivers/gpu/drm/xe/xe_device_sysfs.c
            <br>
            @@ -76,18 +76,14 @@ static void xe_device_sysfs_fini(struct
            drm_device *drm, void *arg)
            <br>
                sysfs_remove_file(&xe->drm.dev->kobj,
            &dev_attr_vram_d3cold_threshold.attr);
            <br>
            }
            <br>
            <br>
            -void xe_device_sysfs_init(struct xe_device *xe)
            <br>
            +int xe_device_sysfs_init(struct xe_device *xe)
            <br>
            {
            <br>
                struct device *dev = xe->drm.dev;
            <br>
                int ret;
            <br>
            <br>
                ret = sysfs_create_file(&dev->kobj,
            &dev_attr_vram_d3cold_threshold.attr);
            <br>
            -    if (ret) {
            <br>
            -        drm_warn(&xe->drm, "Failed to create sysfs
            file\n");
            <br>
            -        return;
            <br>
            -    }
            <br>
            -
            <br>
            -    ret = drmm_add_action_or_reset(&xe->drm,
            xe_device_sysfs_fini, xe);
            <br>
                if (ret)
            <br>
            -        drm_warn(&xe->drm, "Failed to add sysfs fini
            drm action\n");
            <br>
            +        return ret;
            <br>
            +
            <br>
            +    return drmm_add_action_or_reset(&xe->drm,
            xe_device_sysfs_fini, xe);
            <br>
            }
            <br>
            diff --git a/drivers/gpu/drm/xe/xe_device_sysfs.h
            b/drivers/gpu/drm/xe/xe_device_sysfs.h
            <br>
            index 38b240684bee..f9e83d8bd2c7 100644
            <br>
            --- a/drivers/gpu/drm/xe/xe_device_sysfs.h
            <br>
            +++ b/drivers/gpu/drm/xe/xe_device_sysfs.h
            <br>
            @@ -8,6 +8,6 @@
            <br>
            <br>
            struct xe_device;
            <br>
            <br>
            -void xe_device_sysfs_init(struct xe_device *xe);
            <br>
            +int xe_device_sysfs_init(struct xe_device *xe);
            <br>
            <br>
            #endif
            <br>
            diff --git a/drivers/gpu/drm/xe/xe_pm.c
            b/drivers/gpu/drm/xe/xe_pm.c
            <br>
            index f1fc83845c01..f4d9441720b4 100644
            <br>
            --- a/drivers/gpu/drm/xe/xe_pm.c
            <br>
            +++ b/drivers/gpu/drm/xe/xe_pm.c
            <br>
            @@ -208,10 +208,25 @@ static void xe_pm_runtime_init(struct
            xe_device *xe)
            <br>
                pm_runtime_put(dev);
            <br>
            }
            <br>
            <br>
            -void xe_pm_init_early(struct xe_device *xe)
            <br>
            +int xe_pm_init_early(struct xe_device *xe)
            <br>
            {
            <br>
            +    int err;
            <br>
            +
            <br>
    INIT_LIST_HEAD(&xe->mem_access.vram_userfault.list);
            <br>
            -    drmm_mutex_init(&xe->drm,
            &xe->mem_access.vram_userfault.lock);
            <br>
            +
            <br>
            +    err = drmm_mutex_init(&xe->drm,
            &xe->mem_access.vram_userfault.lock);
            <br>
            +    if (err)
            <br>
            +        return err;
            <br>
            +
            <br>
            +    /* Currently d3cold.lock will be used only with GuC */
            <br>
          </blockquote>
          <br>
          it's cheaper to just initialize it regardless so this can be
          simpler
          <br>
          <br>
          int xe_pm_init_early(struct xe_device *xe)
          <br>
          {
          <br>
              int err;
          <br>
          <br>
    INIT_LIST_HEAD(&xe->mem_access.vram_userfault.list);
          <br>
          <br>
              if ((err = drmm_mutex_init(&xe->drm,
          &xe->mem_access.vram_userfault.lock) ||
          <br>
                  (err = drmm_mutex_init(&xe->drm,
          &xe->d3cold.lock)))
          <br>
                  return err;
          <br>
        </blockquote>
        Looks clean. Will prefer using this.
        <br>
        <blockquote type="cite">
          <br>
              return 0;
          <br>
          }
          <br>
          <br>
          or keep the err assignment separate, doesn't matter much. But
          <br>
          when we mix success and failure for a return-early style it
          makes
          <br>
          it harder to read.
          <br>
          <br>
          <blockquote type="cite">
            <br>
            /**
            <br>
            @@ -219,20 +234,27 @@ void xe_pm_init_early(struct xe_device
            *xe)
            <br>
             * @xe: xe device instance
            <br>
             *
            <br>
             * This component is responsible for System and Device sleep
            states.
            <br>
            + *
            <br>
          </blockquote>
          <br>
          wrong line
          <br>
          <br>
          <blockquote type="cite"> */
            <br>
            void xe_pm_init(struct xe_device *xe)
            <br>
            {
            <br>
            +    int err;
            <br>
            +
            <br>
                /* For now suspend/resume is only allowed with GuC */
            <br>
                if (!xe_device_uc_enabled(xe))
            <br>
                    return;
            <br>
            <br>
            -    drmm_mutex_init(&xe->drm,
            &xe->d3cold.lock);
            <br>
            -
            <br>
                xe->d3cold.capable = xe_pm_pci_d3cold_capable(xe);
            <br>
            <br>
                if (xe->d3cold.capable) {
            <br>
            -        xe_device_sysfs_init(xe);
            <br>
            -        xe_pm_set_vram_threshold(xe,
            DEFAULT_VRAM_THRESHOLD);
            <br>
            +        err = xe_device_sysfs_init(xe);
            <br>
          </blockquote>
          <br>
          apparently not because of this patch, but why do we call a
          function
          <br>
          named xe_device_sysfs_init() iff xe->d3cold.capable? And
          from within
          <br>
          xe_pm. That seems totally misplaced. +Rodrigo
          <br>
          <br>
          <blockquote type="cite">+        if (err)
            <br>
            +            drm_warn(&xe->drm,
            <br>
            +                 "Sysfs create for user to set vram
            threshold failed\n");
            <br>
          </blockquote>
          <br>
          just warning?
          <br>
        </blockquote>
        <br>
        If I propagate xe_pm_int errors to xe_pci.c, it exhibits
        unexpected behavior. The PCI module throws errors as
        anticipated, but when I remove the xe module with "module -r
        xe", it doesn't properly clean up the "/sys/class/drm/card*"
        directory. Subsequently, upon reloading, it complains about
        existing sysfs entries and fails to load. This behavior aligns
        with the issue described in
        <a class="moz-txt-link-freetext" href="https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/1352">https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/1352</a>.
        <br>
      </blockquote>
      <br>
      Ok, it seems xe_pci.c is not ready yet to handle the propagated
      error.
      <br>
      Please add a comment above and in the commit message about this,
      so we
      <br>
      remember to fix it later when xe_pci.c is fixed.
      <br>
      <br>
      /*
      <br>
       * FIXME: Just warn on error for now since caller is not
      completely
      <br>
       * ready to handle the fallout.
      <br>
       */
      <br>
    </blockquote>
    <p>Thanks for the input. <br>
    </p>
    <p style="border: 0px solid rgb(227, 227, 227); box-sizing: border-box; --tw-border-spacing-x: 0; --tw-border-spacing-y: 0; --tw-translate-x: 0; --tw-translate-y: 0; --tw-rotate: 0; --tw-skew-x: 0; --tw-skew-y: 0; --tw-scale-x: 1; --tw-scale-y: 1; --tw-pan-x: ; --tw-pan-y: ; --tw-pinch-zoom: ; --tw-scroll-snap-strictness: proximity; --tw-gradient-from-position: ; --tw-gradient-via-position: ; --tw-gradient-to-position: ; --tw-ordinal: ; --tw-slashed-zero: ; --tw-numeric-figure: ; --tw-numeric-spacing: ; --tw-numeric-fraction: ; --tw-ring-inset: ; --tw-ring-offset-width: 0px; --tw-ring-offset-color: #fff; --tw-ring-color: rgba(69,89,164,.5); --tw-ring-offset-shadow: 0 0 transparent; --tw-ring-shadow: 0 0 transparent; --tw-shadow: 0 0 transparent; --tw-shadow-colored: 0 0 transparent; --tw-blur: ; --tw-brightness: ; --tw-contrast: ; --tw-grayscale: ; --tw-hue-rotate: ; --tw-invert: ; --tw-saturate: ; --tw-sepia: ; --tw-drop-shadow: ; --tw-backdrop-blur: ; --tw-backdrop-brightness: ; --tw-backdrop-contrast: ; --tw-backdrop-grayscale: ; --tw-backdrop-hue-rotate: ; --tw-backdrop-invert: ; --tw-backdrop-opacity: ; --tw-backdrop-saturate: ; --tw-backdrop-sepia: ; --tw-contain-size: ; --tw-contain-layout: ; --tw-contain-paint: ; --tw-contain-style: ; margin: 0px 0px 1.25em; color: rgb(13, 13, 13); font-family: Söhne, ui-sans-serif, system-ui, -apple-system, "Segoe UI", Roboto, Ubuntu, Cantarell, "Noto Sans", sans-serif, "Helvetica Neue", Arial, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol", "Noto Color Emoji"; font-size: 16px; font-style: normal; font-variant-ligatures: normal; font-variant-caps: normal; font-weight: 400; letter-spacing: normal; orphans: 2; text-align: start; text-indent: 0px; text-transform: none; widows: 2; word-spacing: 0px; -webkit-text-stroke-width: 0px; white-space: pre-wrap; background-color: rgb(255, 255, 255); text-decoration-thickness: initial; text-decoration-style: initial; text-decoration-color: initial;">Since errors come up after the DRM device probing, there's currently no cleanup mechanism for the driver. Adding handling via xe_pci_remove() might sort out issues popping up post driver probe. I'll give it a try, and if it works out, I'll update the patch accordingly. Otherwise, I'll resort to adding warning messages and fixme notes for now.</p>
    <p></p>
    <blockquote type="cite" cite="mid:ub2u2jq47h5uy7zydrmoewdad644qf5eo6finfz4mfpwa4orm3@22uzbbikjtkw">
      <br>
      thanks
      <br>
      Lucas De Marchi
      <br>
    </blockquote>
  </body>
</html>