<html><head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
  </head>
  <body>
    <p><br>
    </p>
    <div class="moz-cite-prefix">On 2022-04-13 12:03, Shuotao Xu wrote:<br>
    </div>
    <blockquote type="cite" cite="mid:5A64FAEA-CCE8-4EB6-8E7B-852D4F384255@microsoft.com">
      
      <br class="">
      <div><br class="">
        <blockquote type="cite" class="">
          <div class="">On Apr 11, 2022, at 11:52 PM, Andrey Grodzovsky
            &lt;<a href="mailto:andrey.grodzovsky@amd.com" class="moz-txt-link-freetext" moz-do-not-send="true">andrey.grodzovsky@amd.com</a>&gt;
            wrote:</div>
          <br class="Apple-interchange-newline">
          <div class=""><span style="caret-color: rgb(0, 0, 0);
              font-family: Helvetica; font-size: 12px; font-style:
              normal; font-variant-caps: normal; font-weight: 400;
              letter-spacing: normal; text-align: start; text-indent:
              0px; text-transform: none; white-space: normal;
              word-spacing: 0px; -webkit-text-stroke-width: 0px;
              text-decoration: none; float: none; display: inline
              !important;" class="">[Some people who received this
              message don't often get email from<span class="Apple-converted-space"> </span></span><a href="mailto:andrey.grodzovsky@amd.com" style="font-family: Helvetica; font-size: 12px;
              font-style: normal; font-variant-caps: normal;
              font-weight: 400; letter-spacing: normal; orphans: auto;
              text-align: start; text-indent: 0px; text-transform: none;
              white-space: normal; widows: auto; word-spacing: 0px;
              -webkit-text-size-adjust: auto; -webkit-text-stroke-width:
              0px;" class="moz-txt-link-freetext" moz-do-not-send="true">andrey.grodzovsky@amd.com</a><span style="caret-color: rgb(0, 0, 0); font-family: Helvetica;
              font-size: 12px; font-style: normal; font-variant-caps:
              normal; font-weight: 400; letter-spacing: normal;
              text-align: start; text-indent: 0px; text-transform: none;
              white-space: normal; word-spacing: 0px;
              -webkit-text-stroke-width: 0px; text-decoration: none;
              float: none; display: inline !important;" class="">. Learn
              why this is important at<span class="Apple-converted-space"> </span></span><a href="http://aka.ms/LearnAboutSenderIdentification" style="font-family: Helvetica; font-size: 12px;
              font-style: normal; font-variant-caps: normal;
              font-weight: 400; letter-spacing: normal; orphans: auto;
              text-align: start; text-indent: 0px; text-transform: none;
              white-space: normal; widows: auto; word-spacing: 0px;
              -webkit-text-size-adjust: auto; -webkit-text-stroke-width:
              0px;" class="moz-txt-link-freetext" moz-do-not-send="true">http://aka.ms/LearnAboutSenderIdentification</a><span style="caret-color: rgb(0, 0, 0); font-family: Helvetica;
              font-size: 12px; font-style: normal; font-variant-caps:
              normal; font-weight: 400; letter-spacing: normal;
              text-align: start; text-indent: 0px; text-transform: none;
              white-space: normal; word-spacing: 0px;
              -webkit-text-stroke-width: 0px; text-decoration: none;
              float: none; display: inline !important;" class="">.]</span><br style="caret-color: rgb(0, 0, 0); font-family: Helvetica;
              font-size: 12px; font-style: normal; font-variant-caps:
              normal; font-weight: 400; letter-spacing: normal;
              text-align: start; text-indent: 0px; text-transform: none;
              white-space: normal; word-spacing: 0px;
              -webkit-text-stroke-width: 0px; text-decoration: none;" class="">
            <br style="caret-color: rgb(0, 0, 0); font-family:
              Helvetica; font-size: 12px; font-style: normal;
              font-variant-caps: normal; font-weight: 400;
              letter-spacing: normal; text-align: start; text-indent:
              0px; text-transform: none; white-space: normal;
              word-spacing: 0px; -webkit-text-stroke-width: 0px;
              text-decoration: none;" class="">
            <span style="caret-color: rgb(0, 0, 0); font-family:
              Helvetica; font-size: 12px; font-style: normal;
              font-variant-caps: normal; font-weight: 400;
              letter-spacing: normal; text-align: start; text-indent:
              0px; text-transform: none; white-space: normal;
              word-spacing: 0px; -webkit-text-stroke-width: 0px;
              text-decoration: none; float: none; display: inline
              !important;" class="">On 2022-04-08 21:28, Shuotao Xu
              wrote:</span><br style="caret-color: rgb(0, 0, 0);
              font-family: Helvetica; font-size: 12px; font-style:
              normal; font-variant-caps: normal; font-weight: 400;
              letter-spacing: normal; text-align: start; text-indent:
              0px; text-transform: none; white-space: normal;
              word-spacing: 0px; -webkit-text-stroke-width: 0px;
              text-decoration: none;" class="">
            <blockquote type="cite" style="font-family: Helvetica;
              font-size: 12px; font-style: normal; font-variant-caps:
              normal; font-weight: 400; letter-spacing: normal; orphans:
              auto; text-align: start; text-indent: 0px; text-transform:
              none; white-space: normal; widows: auto; word-spacing:
              0px; -webkit-text-size-adjust: auto;
              -webkit-text-stroke-width: 0px; text-decoration: none;" class="">
              <br class="">
              <blockquote type="cite" class="">On Apr 8, 2022, at 11:28
                PM, Andrey Grodzovsky &lt;<a href="mailto:andrey.grodzovsky@amd.com" class="moz-txt-link-freetext" moz-do-not-send="true">andrey.grodzovsky@amd.com</a>&gt;
                wrote:<br class="">
                <br class="">
                [Some people who received this message don't often get
                email from <a href="mailto:andrey.grodzovsky@amd.com" class="moz-txt-link-freetext" moz-do-not-send="true">
                  andrey.grodzovsky@amd.com</a>. Learn why this is
                important at <a href="http://aka.ms/LearnAboutSenderIdentification" class="moz-txt-link-freetext" moz-do-not-send="true">
                  http://aka.ms/LearnAboutSenderIdentification</a>.]<br class="">
                <br class="">
                On 2022-04-08 04:45, Shuotao Xu wrote:<br class="">
                <blockquote type="cite" class="">Adding PCIe Hotplug
                  Support for AMDKFD: the support of hot-plug of GPU<br class="">
                  devices can open doors for many advanced applications
                  in data center<br class="">
                  in the next few years, such as for GPU resource<br class="">
                  disaggregation. Current AMDKFD does not support
                  hotplug out b/o the<br class="">
                  following reasons:<br class="">
                  <br class="">
                  1. During PCIe removal, decrement KFD lock which was
                  incremented at<br class="">
                  the beginning of hw fini; otherwise kfd_open later is
                  going to<br class="">
                  fail.<br class="">
                </blockquote>
                I assumed you read my comment last time, yet you still
                take the same approach.<br class="">
                More details below<br class="">
              </blockquote>
              Aha, I like your fix:) I was not familiar with drm APIs so
              just only half understood your comment last time.<br class="">
              <br class="">
              BTW, I tried hot-plugging out a GPU when rocm application
              is still running.<br class="">
              From dmesg, application is still trying to access the
              removed kfd device, and are met with some errors.<br class="">
            </blockquote>
            <br style="caret-color: rgb(0, 0, 0); font-family:
              Helvetica; font-size: 12px; font-style: normal;
              font-variant-caps: normal; font-weight: 400;
              letter-spacing: normal; text-align: start; text-indent:
              0px; text-transform: none; white-space: normal;
              word-spacing: 0px; -webkit-text-stroke-width: 0px;
              text-decoration: none;" class="">
            <br style="caret-color: rgb(0, 0, 0); font-family:
              Helvetica; font-size: 12px; font-style: normal;
              font-variant-caps: normal; font-weight: 400;
              letter-spacing: normal; text-align: start; text-indent:
              0px; text-transform: none; white-space: normal;
              word-spacing: 0px; -webkit-text-stroke-width: 0px;
              text-decoration: none;" class="">
            <span style="caret-color: rgb(0, 0, 0); font-family:
              Helvetica; font-size: 12px; font-style: normal;
              font-variant-caps: normal; font-weight: 400;
              letter-spacing: normal; text-align: start; text-indent:
              0px; text-transform: none; white-space: normal;
              word-spacing: 0px; -webkit-text-stroke-width: 0px;
              text-decoration: none; float: none; display: inline
              !important;" class="">Application is supposed to keep
              running, it holds the drm_device</span><br style="caret-color: rgb(0, 0, 0); font-family: Helvetica;
              font-size: 12px; font-style: normal; font-variant-caps:
              normal; font-weight: 400; letter-spacing: normal;
              text-align: start; text-indent: 0px; text-transform: none;
              white-space: normal; word-spacing: 0px;
              -webkit-text-stroke-width: 0px; text-decoration: none;" class="">
            <span style="caret-color: rgb(0, 0, 0); font-family:
              Helvetica; font-size: 12px; font-style: normal;
              font-variant-caps: normal; font-weight: 400;
              letter-spacing: normal; text-align: start; text-indent:
              0px; text-transform: none; white-space: normal;
              word-spacing: 0px; -webkit-text-stroke-width: 0px;
              text-decoration: none; float: none; display: inline
              !important;" class="">reference as long as it has an open</span><br style="caret-color: rgb(0, 0, 0); font-family: Helvetica;
              font-size: 12px; font-style: normal; font-variant-caps:
              normal; font-weight: 400; letter-spacing: normal;
              text-align: start; text-indent: 0px; text-transform: none;
              white-space: normal; word-spacing: 0px;
              -webkit-text-stroke-width: 0px; text-decoration: none;" class="">
            <span style="caret-color: rgb(0, 0, 0); font-family:
              Helvetica; font-size: 12px; font-style: normal;
              font-variant-caps: normal; font-weight: 400;
              letter-spacing: normal; text-align: start; text-indent:
              0px; text-transform: none; white-space: normal;
              word-spacing: 0px; -webkit-text-stroke-width: 0px;
              text-decoration: none; float: none; display: inline
              !important;" class="">FD to the device and final cleanup
              will come only after the app will die</span><br style="caret-color: rgb(0, 0, 0); font-family: Helvetica;
              font-size: 12px; font-style: normal; font-variant-caps:
              normal; font-weight: 400; letter-spacing: normal;
              text-align: start; text-indent: 0px; text-transform: none;
              white-space: normal; word-spacing: 0px;
              -webkit-text-stroke-width: 0px; text-decoration: none;" class="">
            <span style="caret-color: rgb(0, 0, 0); font-family:
              Helvetica; font-size: 12px; font-style: normal;
              font-variant-caps: normal; font-weight: 400;
              letter-spacing: normal; text-align: start; text-indent:
              0px; text-transform: none; white-space: normal;
              word-spacing: 0px; -webkit-text-stroke-width: 0px;
              text-decoration: none; float: none; display: inline
              !important;" class="">thus releasing the FD and the last</span><br style="caret-color: rgb(0, 0, 0); font-family: Helvetica;
              font-size: 12px; font-style: normal; font-variant-caps:
              normal; font-weight: 400; letter-spacing: normal;
              text-align: start; text-indent: 0px; text-transform: none;
              white-space: normal; word-spacing: 0px;
              -webkit-text-stroke-width: 0px; text-decoration: none;" class="">
            <span style="caret-color: rgb(0, 0, 0); font-family:
              Helvetica; font-size: 12px; font-style: normal;
              font-variant-caps: normal; font-weight: 400;
              letter-spacing: normal; text-align: start; text-indent:
              0px; text-transform: none; white-space: normal;
              word-spacing: 0px; -webkit-text-stroke-width: 0px;
              text-decoration: none; float: none; display: inline
              !important;" class="">drm_device reference.</span><br style="caret-color: rgb(0, 0, 0); font-family: Helvetica;
              font-size: 12px; font-style: normal; font-variant-caps:
              normal; font-weight: 400; letter-spacing: normal;
              text-align: start; text-indent: 0px; text-transform: none;
              white-space: normal; word-spacing: 0px;
              -webkit-text-stroke-width: 0px; text-decoration: none;" class="">
            <br style="caret-color: rgb(0, 0, 0); font-family:
              Helvetica; font-size: 12px; font-style: normal;
              font-variant-caps: normal; font-weight: 400;
              letter-spacing: normal; text-align: start; text-indent:
              0px; text-transform: none; white-space: normal;
              word-spacing: 0px; -webkit-text-stroke-width: 0px;
              text-decoration: none;" class="">
            <blockquote type="cite" style="font-family: Helvetica;
              font-size: 12px; font-style: normal; font-variant-caps:
              normal; font-weight: 400; letter-spacing: normal; orphans:
              auto; text-align: start; text-indent: 0px; text-transform:
              none; white-space: normal; widows: auto; word-spacing:
              0px; -webkit-text-size-adjust: auto;
              -webkit-text-stroke-width: 0px; text-decoration: none;" class="">
              Application would hang and not exit in this case.<br class="">
            </blockquote>
            <br style="caret-color: rgb(0, 0, 0); font-family:
              Helvetica; font-size: 12px; font-style: normal;
              font-variant-caps: normal; font-weight: 400;
              letter-spacing: normal; text-align: start; text-indent:
              0px; text-transform: none; white-space: normal;
              word-spacing: 0px; -webkit-text-stroke-width: 0px;
              text-decoration: none;" class="">
          </div>
        </blockquote>
        <div><br class="">
        </div>
        Actually I tried kill -7 $pid, and the process exits. The dmesg
        has some warnings though.</div>
      <div><br class="">
      </div>
      <div>
        <div>[  711.769977] WARNING: CPU: 23 PID: 344 at
          .../amdgpu-rocm5.0.2/src/amd/amdgpu/amdgpu_object.c:1336
          amdgpu_bo_release_notify+0x150/0x160 [amdgpu]</div>
        <div>[  711.770528] Modules linked in: amdgpu(OE) amdttm(OE)
          amd_sched(OE) amdkcl(OE) iommu_v2 nf_conntrack_netlink
          nfnetlink xfrm_user xfrm_algo xt_addrtype br_netfilter
          xt_CHECKSUM iptable_mangle xt_MASQUERADE iptable_nat nf_nat
          xt_conntrack nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4
          ipt_REJECT nf_reject_ipv4 xt_tcpudp bridge stp llc
          ebtable_filter ebtables ip6table_filter ip6_tables
          iptable_filter overlay binfmt_misc intel_rapl_msr i40iw
          intel_rapl_common skx_edac nfit x86_pkg_temp_thermal
          intel_powerclamp coretemp kvm_intel rpcrdma kvm sunrpc
          ipmi_ssif ib_umad ib_ipoib rdma_ucm irqbypass rapl joydev
          acpi_ipmi input_leds intel_cstate ipmi_si ipmi_devintf mei_me
          mei intel_pch_thermal ipmi_msghandler ioatdma mac_hid lpc_ich
          dca acpi_power_meter acpi_pad sch_fq_codel ib_iser rdma_cm
          iw_cm ib_cm iscsi_tcp libiscsi_tcp libiscsi
          scsi_transport_iscsi pci_stub ip_tables x_tables autofs4 btrfs
          blake2b_generic zstd_compress raid10 raid456 async_raid6_recov
          async_memcpy async_pq async_xor async_tx xor</div>
        <div>[  711.779359]  raid6_pq libcrc32c raid1 raid0 multipath
          linear mlx5_ib ib_uverbs ib_core ast drm_vram_helper
          i2c_algo_bit drm_ttm_helper ttm drm_kms_helper syscopyarea
          crct10dif_pclmul crc32_pclmul ghash_clmulni_intel sysfillrect
          uas hid_generic sysimgblt aesni_intel mlx5_core fb_sys_fops
          crypto_simd usbhid cryptd drm i40e pci_hyperv_intf usb_storage
          glue_helper mlxfw hid ahci libahci wmi</div>
        <div>[  711.779752] CPU: 23 PID: 344 Comm: kworker/23:1 Tainted:
          G        W  OE     5.11.0+ #1</div>
        <div>[  711.779755] Hardware name: Supermicro
          SYS-4029GP-TRT2/X11DPG-OT-CPU, BIOS 2.1 08/14/2018</div>
        <div>[  711.779756] Workqueue: kfd_process_wq
          kfd_process_wq_release [amdgpu]</div>
        <div>[  711.779955] RIP:
          0010:amdgpu_bo_release_notify+0x150/0x160 [amdgpu]</div>
        <div>[  711.780141] Code: e8 b5 af 34 f4 e9 1f ff ff ff 48 39 c2
          74 07 0f 0b e9 69 ff ff ff 4c 89 e7 e8 3c b4 16 00 e9 5c ff ff
          ff e8 a2 ce fd f3 eb cf &lt;0f&gt; 0b eb cb e8 d7 02 34 f4 0f
          1f 80 00 00 00 00 0f 1f 44 00 00 55</div>
        <div>[  711.780143] RSP: 0018:ffffa8100dd67c30 EFLAGS: 00010282</div>
        <div>[  711.780145] RAX: 00000000ffffffea RBX: ffff89980e792058
          RCX: 0000000000000000</div>
        <div>[  711.780147] RDX: 0000000000000000 RSI: ffff89a8f9ad8870
          RDI: ffff89a8f9ad8870</div>
        <div>[  711.780148] RBP: ffffa8100dd67c50 R08: 0000000000000000
          R09: fffffffffff99b18</div>
        <div>[  711.780149] R10: ffffa8100dd67bd0 R11: ffffa8100dd67908
          R12: ffff89980e792000</div>
        <div>[  711.780151] R13: ffff89980e792058 R14: ffff89980e7921bc
          R15: dead000000000100</div>
        <div>[  711.780152] FS:  0000000000000000(0000)
          GS:ffff89a8f9ac0000(0000) knlGS:0000000000000000</div>
        <div>[  711.780154] CS:  0010 DS: 0000 ES: 0000 CR0:
          0000000080050033</div>
        <div>[  711.780156] CR2: 00007ffddac6f71f CR3: 00000030bb80a003
          CR4: 00000000007706e0</div>
        <div>[  711.780157] DR0: 0000000000000000 DR1: 0000000000000000
          DR2: 0000000000000000</div>
        <div>[  711.780159] DR3: 0000000000000000 DR6: 00000000fffe0ff0
          DR7: 0000000000000400</div>
        <div>[  711.780160] PKRU: 55555554</div>
        <div>[  711.780161] Call Trace:</div>
        <div>[  711.780163]  ttm_bo_release+0x2ae/0x320 [amdttm]</div>
        <div>[  711.780169]  amdttm_bo_put+0x30/0x40 [amdttm]</div>
        <div>[  711.780357]  amdgpu_bo_unref+0x1e/0x30 [amdgpu]</div>
        <div>[  711.780543]  amdgpu_gem_object_free+0x8b/0x160 [amdgpu]</div>
        <div>[  711.781119]  drm_gem_object_free+0x1d/0x30 [drm]</div>
        <div>[  711.781489]
           amdgpu_amdkfd_gpuvm_free_memory_of_gpu+0x34a/0x380 [amdgpu]</div>
        <div>[  711.782044]  kfd_process_device_free_bos+0xe0/0x130
          [amdgpu]</div>
        <div>[  711.782611]  kfd_process_wq_release+0x286/0x380 [amdgpu]</div>
        <div>[  711.783172]  process_one_work+0x236/0x420</div>
        <div>[  711.783543]  worker_thread+0x34/0x400</div>
        <div>[  711.783911]  ? process_one_work+0x420/0x420</div>
        <div>[  711.784279]  kthread+0x126/0x140</div>
        <div>[  711.784653]  ? kthread_park+0x90/0x90</div>
        <div>[  711.785018]  ret_from_fork+0x22/0x30</div>
        <div>[  711.785387] ---[ end trace d8f50f6594817c84 ]---</div>
        <div>[  711.798716] [drm] amdgpu: ttm finalized</div>
      </div>
    </blockquote>
    <p><br>
    </p>
    <p>So it means the process was stuck in some wait_event_killable
      (maybe in drm_sched_entity_flush) - you can try
      'cat /proc/$process_pid/stack' before<br>
      you kill it to see where it was stuck so we can go from there.<br>
    </p>
    <p><br>
    </p>
    <blockquote type="cite" cite="mid:5A64FAEA-CCE8-4EB6-8E7B-852D4F384255@microsoft.com">
      <div>
        <div><br class="">
        </div>
      </div>
      <div>
        <blockquote type="cite" class="">
          <div class=""><br style="caret-color: rgb(0, 0, 0);
              font-family: Helvetica; font-size: 12px; font-style:
              normal; font-variant-caps: normal; font-weight: 400;
              letter-spacing: normal; text-align: start; text-indent:
              0px; text-transform: none; white-space: normal;
              word-spacing: 0px; -webkit-text-stroke-width: 0px;
              text-decoration: none;" class="">
            <span style="caret-color: rgb(0, 0, 0); font-family:
              Helvetica; font-size: 12px; font-style: normal;
              font-variant-caps: normal; font-weight: 400;
              letter-spacing: normal; text-align: start; text-indent:
              0px; text-transform: none; white-space: normal;
              word-spacing: 0px; -webkit-text-stroke-width: 0px;
              text-decoration: none; float: none; display: inline
              !important;" class="">For graphics apps what I usually see
              is a crash because of SIGSEGV when</span><br style="caret-color: rgb(0, 0, 0); font-family: Helvetica;
              font-size: 12px; font-style: normal; font-variant-caps:
              normal; font-weight: 400; letter-spacing: normal;
              text-align: start; text-indent: 0px; text-transform: none;
              white-space: normal; word-spacing: 0px;
              -webkit-text-stroke-width: 0px; text-decoration: none;" class="">
            <span style="caret-color: rgb(0, 0, 0); font-family:
              Helvetica; font-size: 12px; font-style: normal;
              font-variant-caps: normal; font-weight: 400;
              letter-spacing: normal; text-align: start; text-indent:
              0px; text-transform: none; white-space: normal;
              word-spacing: 0px; -webkit-text-stroke-width: 0px;
              text-decoration: none; float: none; display: inline
              !important;" class="">the app tries to access</span><br style="caret-color: rgb(0, 0, 0); font-family: Helvetica;
              font-size: 12px; font-style: normal; font-variant-caps:
              normal; font-weight: 400; letter-spacing: normal;
              text-align: start; text-indent: 0px; text-transform: none;
              white-space: normal; word-spacing: 0px;
              -webkit-text-stroke-width: 0px; text-decoration: none;" class="">
            <span style="caret-color: rgb(0, 0, 0); font-family:
              Helvetica; font-size: 12px; font-style: normal;
              font-variant-caps: normal; font-weight: 400;
              letter-spacing: normal; text-align: start; text-indent:
              0px; text-transform: none; white-space: normal;
              word-spacing: 0px; -webkit-text-stroke-width: 0px;
              text-decoration: none; float: none; display: inline
              !important;" class="">an unmapped MMIO region on the
              device. I haven't tested for compute</span><br style="caret-color: rgb(0, 0, 0); font-family: Helvetica;
              font-size: 12px; font-style: normal; font-variant-caps:
              normal; font-weight: 400; letter-spacing: normal;
              text-align: start; text-indent: 0px; text-transform: none;
              white-space: normal; word-spacing: 0px;
              -webkit-text-stroke-width: 0px; text-decoration: none;" class="">
            <span style="caret-color: rgb(0, 0, 0); font-family:
              Helvetica; font-size: 12px; font-style: normal;
              font-variant-caps: normal; font-weight: 400;
              letter-spacing: normal; text-align: start; text-indent:
              0px; text-transform: none; white-space: normal;
              word-spacing: 0px; -webkit-text-stroke-width: 0px;
              text-decoration: none; float: none; display: inline
              !important;" class="">stack and so there might</span><br style="caret-color: rgb(0, 0, 0); font-family: Helvetica;
              font-size: 12px; font-style: normal; font-variant-caps:
              normal; font-weight: 400; letter-spacing: normal;
              text-align: start; text-indent: 0px; text-transform: none;
              white-space: normal; word-spacing: 0px;
              -webkit-text-stroke-width: 0px; text-decoration: none;" class="">
            <span style="caret-color: rgb(0, 0, 0); font-family:
              Helvetica; font-size: 12px; font-style: normal;
              font-variant-caps: normal; font-weight: 400;
              letter-spacing: normal; text-align: start; text-indent:
              0px; text-transform: none; white-space: normal;
              word-spacing: 0px; -webkit-text-stroke-width: 0px;
              text-decoration: none; float: none; display: inline
              !important;" class="">be something I haven't covered. Hang
              could mean for example waiting on a</span><br style="caret-color: rgb(0, 0, 0); font-family: Helvetica;
              font-size: 12px; font-style: normal; font-variant-caps:
              normal; font-weight: 400; letter-spacing: normal;
              text-align: start; text-indent: 0px; text-transform: none;
              white-space: normal; word-spacing: 0px;
              -webkit-text-stroke-width: 0px; text-decoration: none;" class="">
            <span style="caret-color: rgb(0, 0, 0); font-family:
              Helvetica; font-size: 12px; font-style: normal;
              font-variant-caps: normal; font-weight: 400;
              letter-spacing: normal; text-align: start; text-indent:
              0px; text-transform: none; white-space: normal;
              word-spacing: 0px; -webkit-text-stroke-width: 0px;
              text-decoration: none; float: none; display: inline
              !important;" class="">fence which is not being</span><br style="caret-color: rgb(0, 0, 0); font-family: Helvetica;
              font-size: 12px; font-style: normal; font-variant-caps:
              normal; font-weight: 400; letter-spacing: normal;
              text-align: start; text-indent: 0px; text-transform: none;
              white-space: normal; word-spacing: 0px;
              -webkit-text-stroke-width: 0px; text-decoration: none;" class="">
            <span style="caret-color: rgb(0, 0, 0); font-family:
              Helvetica; font-size: 12px; font-style: normal;
              font-variant-caps: normal; font-weight: 400;
              letter-spacing: normal; text-align: start; text-indent:
              0px; text-transform: none; white-space: normal;
              word-spacing: 0px; -webkit-text-stroke-width: 0px;
              text-decoration: none; float: none; display: inline
              !important;" class="">signaled - please provide full dmesg
              from this case.</span><br style="caret-color: rgb(0, 0,
              0); font-family: Helvetica; font-size: 12px; font-style:
              normal; font-variant-caps: normal; font-weight: 400;
              letter-spacing: normal; text-align: start; text-indent:
              0px; text-transform: none; white-space: normal;
              word-spacing: 0px; -webkit-text-stroke-width: 0px;
              text-decoration: none;" class="">
            <br style="caret-color: rgb(0, 0, 0); font-family:
              Helvetica; font-size: 12px; font-style: normal;
              font-variant-caps: normal; font-weight: 400;
              letter-spacing: normal; text-align: start; text-indent:
              0px; text-transform: none; white-space: normal;
              word-spacing: 0px; -webkit-text-stroke-width: 0px;
              text-decoration: none;" class="">
            <blockquote type="cite" style="font-family: Helvetica;
              font-size: 12px; font-style: normal; font-variant-caps:
              normal; font-weight: 400; letter-spacing: normal; orphans:
              auto; text-align: start; text-indent: 0px; text-transform:
              none; white-space: normal; widows: auto; word-spacing:
              0px; -webkit-text-size-adjust: auto;
              -webkit-text-stroke-width: 0px; text-decoration: none;" class="">
              <br class="">
              Do you have any good suggestions on how to fix it down the
              line? (HIP runtime/libhsakmt or driver)<br class="">
              <br class="">
              [64036.631333] amdgpu: amdgpu_vm_bo_update failed<br class="">
              [64036.631702] amdgpu: validate_invalid_user_pages: update
              PTE failed<br class="">
              [64036.640754] amdgpu: amdgpu_vm_bo_update failed<br class="">
              [64036.641120] amdgpu: validate_invalid_user_pages: update
              PTE failed<br class="">
              [64036.650394] amdgpu: amdgpu_vm_bo_update failed<br class="">
              [64036.650765] amdgpu: validate_invalid_user_pages: update
              PTE failed<br class="">
            </blockquote>
            <br style="caret-color: rgb(0, 0, 0); font-family:
              Helvetica; font-size: 12px; font-style: normal;
              font-variant-caps: normal; font-weight: 400;
              letter-spacing: normal; text-align: start; text-indent:
              0px; text-transform: none; white-space: normal;
              word-spacing: 0px; -webkit-text-stroke-width: 0px;
              text-decoration: none;" class="">
          </div>
        </blockquote>
        <div><br class="">
        </div>
        The full dmesg will just be the repetition of those two messages,</div>
      <div>
        <div>[186885.764079] amdgpu 0000:43:00.0: amdgpu: amdgpu:
          finishing device.</div>
        <div>[186885.766916] [drm] free PSP TMR buffer</div>
        <div>[186893.868173] amdgpu: amdgpu_vm_bo_update failed</div>
        <div>[186893.868235] amdgpu: validate_invalid_user_pages: update
          PTE failed</div>
        <div>[186893.876154] amdgpu: amdgpu_vm_bo_update failed</div>
        <div>[186893.876190] amdgpu: validate_invalid_user_pages: update
          PTE failed</div>
        <div>[186893.884150] amdgpu: amdgpu_vm_bo_update failed</div>
        <div>[186893.884185] amdgpu: validate_invalid_user_pages: update
          PTE failed</div>
      </div>
      <div><br class="">
      </div>
      <div>
        <blockquote type="cite" class="">
          <div class=""><br style="caret-color: rgb(0, 0, 0);
              font-family: Helvetica; font-size: 12px; font-style:
              normal; font-variant-caps: normal; font-weight: 400;
              letter-spacing: normal; text-align: start; text-indent:
              0px; text-transform: none; white-space: normal;
              word-spacing: 0px; -webkit-text-stroke-width: 0px;
              text-decoration: none;" class="">
            <span style="caret-color: rgb(0, 0, 0); font-family:
              Helvetica; font-size: 12px; font-style: normal;
              font-variant-caps: normal; font-weight: 400;
              letter-spacing: normal; text-align: start; text-indent:
              0px; text-transform: none; white-space: normal;
              word-spacing: 0px; -webkit-text-stroke-width: 0px;
              text-decoration: none; float: none; display: inline
              !important;" class="">This just probably means trying to
              update PTEs after the physical device</span><br style="caret-color: rgb(0, 0, 0); font-family: Helvetica;
              font-size: 12px; font-style: normal; font-variant-caps:
              normal; font-weight: 400; letter-spacing: normal;
              text-align: start; text-indent: 0px; text-transform: none;
              white-space: normal; word-spacing: 0px;
              -webkit-text-stroke-width: 0px; text-decoration: none;" class="">
            <span style="caret-color: rgb(0, 0, 0); font-family:
              Helvetica; font-size: 12px; font-style: normal;
              font-variant-caps: normal; font-weight: 400;
              letter-spacing: normal; text-align: start; text-indent:
              0px; text-transform: none; white-space: normal;
              word-spacing: 0px; -webkit-text-stroke-width: 0px;
              text-decoration: none; float: none; display: inline
              !important;" class="">is gone - we usually avoid this by</span><br style="caret-color: rgb(0, 0, 0); font-family: Helvetica;
              font-size: 12px; font-style: normal; font-variant-caps:
              normal; font-weight: 400; letter-spacing: normal;
              text-align: start; text-indent: 0px; text-transform: none;
              white-space: normal; word-spacing: 0px;
              -webkit-text-stroke-width: 0px; text-decoration: none;" class="">
            <span style="caret-color: rgb(0, 0, 0); font-family:
              Helvetica; font-size: 12px; font-style: normal;
              font-variant-caps: normal; font-weight: 400;
              letter-spacing: normal; text-align: start; text-indent:
              0px; text-transform: none; white-space: normal;
              word-spacing: 0px; -webkit-text-stroke-width: 0px;
              text-decoration: none; float: none; display: inline
              !important;" class="">first trying to do all HW shutdowns
              early before PCI remove completion</span><br style="caret-color: rgb(0, 0, 0); font-family: Helvetica;
              font-size: 12px; font-style: normal; font-variant-caps:
              normal; font-weight: 400; letter-spacing: normal;
              text-align: start; text-indent: 0px; text-transform: none;
              white-space: normal; word-spacing: 0px;
              -webkit-text-stroke-width: 0px; text-decoration: none;" class="">
            <span style="caret-color: rgb(0, 0, 0); font-family:
              Helvetica; font-size: 12px; font-style: normal;
              font-variant-caps: normal; font-weight: 400;
              letter-spacing: normal; text-align: start; text-indent:
              0px; text-transform: none; white-space: normal;
              word-spacing: 0px; -webkit-text-stroke-width: 0px;
              text-decoration: none; float: none; display: inline
              !important;" class="">but when it's really tricky by</span><br style="caret-color: rgb(0, 0, 0); font-family: Helvetica;
              font-size: 12px; font-style: normal; font-variant-caps:
              normal; font-weight: 400; letter-spacing: normal;
              text-align: start; text-indent: 0px; text-transform: none;
              white-space: normal; word-spacing: 0px;
              -webkit-text-stroke-width: 0px; text-decoration: none;" class="">
            <span style="caret-color: rgb(0, 0, 0); font-family:
              Helvetica; font-size: 12px; font-style: normal;
              font-variant-caps: normal; font-weight: 400;
              letter-spacing: normal; text-align: start; text-indent:
              0px; text-transform: none; white-space: normal;
              word-spacing: 0px; -webkit-text-stroke-width: 0px;
              text-decoration: none; float: none; display: inline
              !important;" class="">protecting HW access sections with
              drm_dev_enter/exit scope.</span><br style="caret-color:
              rgb(0, 0, 0); font-family: Helvetica; font-size: 12px;
              font-style: normal; font-variant-caps: normal;
              font-weight: 400; letter-spacing: normal; text-align:
              start; text-indent: 0px; text-transform: none;
              white-space: normal; word-spacing: 0px;
              -webkit-text-stroke-width: 0px; text-decoration: none;" class="">
            <br style="caret-color: rgb(0, 0, 0); font-family:
              Helvetica; font-size: 12px; font-style: normal;
              font-variant-caps: normal; font-weight: 400;
              letter-spacing: normal; text-align: start; text-indent:
              0px; text-transform: none; white-space: normal;
              word-spacing: 0px; -webkit-text-stroke-width: 0px;
              text-decoration: none;" class="">
            <span style="caret-color: rgb(0, 0, 0); font-family:
              Helvetica; font-size: 12px; font-style: normal;
              font-variant-caps: normal; font-weight: 400;
              letter-spacing: normal; text-align: start; text-indent:
              0px; text-transform: none; white-space: normal;
              word-spacing: 0px; -webkit-text-stroke-width: 0px;
              text-decoration: none; float: none; display: inline
              !important;" class="">For this particular error it would
              be the best to flush</span><br style="caret-color: rgb(0,
              0, 0); font-family: Helvetica; font-size: 12px;
              font-style: normal; font-variant-caps: normal;
              font-weight: 400; letter-spacing: normal; text-align:
              start; text-indent: 0px; text-transform: none;
              white-space: normal; word-spacing: 0px;
              -webkit-text-stroke-width: 0px; text-decoration: none;" class="">
            <span style="caret-color: rgb(0, 0, 0); font-family:
              Helvetica; font-size: 12px; font-style: normal;
              font-variant-caps: normal; font-weight: 400;
              letter-spacing: normal; text-align: start; text-indent:
              0px; text-transform: none; white-space: normal;
              word-spacing: 0px; -webkit-text-stroke-width: 0px;
              text-decoration: none; float: none; display: inline
              !important;" class="">info->restore_userptr_work before
              the end of</span><br style="caret-color: rgb(0, 0, 0);
              font-family: Helvetica; font-size: 12px; font-style:
              normal; font-variant-caps: normal; font-weight: 400;
              letter-spacing: normal; text-align: start; text-indent:
              0px; text-transform: none; white-space: normal;
              word-spacing: 0px; -webkit-text-stroke-width: 0px;
              text-decoration: none;" class="">
            <span style="caret-color: rgb(0, 0, 0); font-family:
              Helvetica; font-size: 12px; font-style: normal;
              font-variant-caps: normal; font-weight: 400;
              letter-spacing: normal; text-align: start; text-indent:
              0px; text-transform: none; white-space: normal;
              word-spacing: 0px; -webkit-text-stroke-width: 0px;
              text-decoration: none; float: none; display: inline
              !important;" class="">amdgpu_pci_remove (rejecting new
              process creation and calling</span><br style="caret-color:
              rgb(0, 0, 0); font-family: Helvetica; font-size: 12px;
              font-style: normal; font-variant-caps: normal;
              font-weight: 400; letter-spacing: normal; text-align:
              start; text-indent: 0px; text-transform: none;
              white-space: normal; word-spacing: 0px;
              -webkit-text-stroke-width: 0px; text-decoration: none;" class="">
            <span style="caret-color: rgb(0, 0, 0); font-family:
              Helvetica; font-size: 12px; font-style: normal;
              font-variant-caps: normal; font-weight: 400;
              letter-spacing: normal; text-align: start; text-indent:
              0px; text-transform: none; white-space: normal;
              word-spacing: 0px; -webkit-text-stroke-width: 0px;
              text-decoration: none; float: none; display: inline
              !important;" class="">cancel_delayed_work_sync(&process_info->restore_userptr_work)
              for all</span><br style="caret-color: rgb(0, 0, 0);
              font-family: Helvetica; font-size: 12px; font-style:
              normal; font-variant-caps: normal; font-weight: 400;
              letter-spacing: normal; text-align: start; text-indent:
              0px; text-transform: none; white-space: normal;
              word-spacing: 0px; -webkit-text-stroke-width: 0px;
              text-decoration: none;" class="">
            <span style="caret-color: rgb(0, 0, 0); font-family:
              Helvetica; font-size: 12px; font-style: normal;
              font-variant-caps: normal; font-weight: 400;
              letter-spacing: normal; text-align: start; text-indent:
              0px; text-transform: none; white-space: normal;
              word-spacing: 0px; -webkit-text-stroke-width: 0px;
              text-decoration: none; float: none; display: inline
              !important;" class="">running processes)</span><br style="caret-color: rgb(0, 0, 0); font-family: Helvetica;
              font-size: 12px; font-style: normal; font-variant-caps:
              normal; font-weight: 400; letter-spacing: normal;
              text-align: start; text-indent: 0px; text-transform: none;
              white-space: normal; word-spacing: 0px;
              -webkit-text-stroke-width: 0px; text-decoration: none;" class="">
            <span style="caret-color: rgb(0, 0, 0); font-family:
              Helvetica; font-size: 12px; font-style: normal;
              font-variant-caps: normal; font-weight: 400;
              letter-spacing: normal; text-align: start; text-indent:
              0px; text-transform: none; white-space: normal;
              word-spacing: 0px; -webkit-text-stroke-width: 0px;
              text-decoration: none; float: none; display: inline
              !important;" class="">somewhere in amdgpu_pci_remove.</span><br style="caret-color: rgb(0, 0, 0); font-family: Helvetica;
              font-size: 12px; font-style: normal; font-variant-caps:
              normal; font-weight: 400; letter-spacing: normal;
              text-align: start; text-indent: 0px; text-transform: none;
              white-space: normal; word-spacing: 0px;
              -webkit-text-stroke-width: 0px; text-decoration: none;" class="">
            <br class="">
          </div>
        </blockquote>
        I tried something like *kfd_process_ref_release* which I think
        did what you described, but it did not work.</div>
    </blockquote>
    <p><br>
    </p>
    <p>I don't see how kfd_process_ref_release is the same as what I
      mentioned above; what I meant is calling the code above within
      kgd2kfd_suspend (where you tried to call
      kfd_kill_all_user_processes below) <br>
    </p>
    <p><br>
    </p>
    <blockquote type="cite" cite="mid:5A64FAEA-CCE8-4EB6-8E7B-852D4F384255@microsoft.com">
      <div><br class="">
      </div>
      <div>Instead I tried to kill the process from the kernel, but the
        amdgpu could **only** be hot-plugged back in successfully if
        there was no rocm kernel running when it was plugged out. If
        not, amdgpu_probe will just hang later. (Maybe because amdgpu
        was plugged out while in a running state, it leaves a bad HW
        state which causes probe to hang).</div>
    </blockquote>
    <p><br>
    </p>
    <p>We usually do asic_reset during probe to reset all HW state
      (check if amdgpu_device_init-&gt;amdgpu_asic_reset is running
      when you plug it back in). <br>
    </p>
    <p>  <br>
    </p>
    <blockquote type="cite" cite="mid:5A64FAEA-CCE8-4EB6-8E7B-852D4F384255@microsoft.com">
      <div><br class="">
      </div>
      <div>I don’t know if this is a viable solution worth pursuing, but
        I attached the diff anyway.</div>
      <div><br class="">
      </div>
      <div>Another solution could be to let the compute stack user mode
        detect a topology change via <span class="">generation_count change,
          and abort gracefully there.</span></div>
      <div class=""><br class="">
      </div>
      <div>diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
        b/drivers/gpu/drm/amd/amdkfd/kfd_device.c</div>
      <div>index 4e7d9cb09a69..79b4c9b84cd0 100644</div>
      <div>--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c</div>
      <div>+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c</div>
      <div>@@ -697,12 +697,15 @@ void kgd2kfd_suspend(struct kfd_dev
        *kfd, bool run_pm, bool force)</div>
      <div>                return;</div>
      <div><br class="">
      </div>
      <div>        /* for runtime suspend, skip locking kfd */</div>
      <div>-       if (!run_pm) {</div>
      <div>+       if (!run_pm &&
        !drm_dev_is_unplugged(kfd->ddev)) {</div>
      <div>                /* For first KFD device suspend all the KFD
        processes */</div>
      <div>                if (atomic_inc_return(&kfd_locked) == 1)</div>
      <div>                        kfd_suspend_all_processes(force);</div>
      <div>        }</div>
      <div><br class="">
      </div>
      <div>+       if (drm_dev_is_unplugged(kfd->ddev))</div>
      <div>+               kfd_kill_all_user_processes();</div>
      <div>+</div>
      <div>        kfd->dqm->ops.stop(kfd->dqm);</div>
      <div>        kfd_iommu_suspend(kfd);</div>
      <div> }</div>
      <div>diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
        b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h</div>
      <div>index 55c9e1922714..84cbcd857856 100644</div>
      <div>--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h</div>
      <div>+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h</div>
      <div>@@ -1065,6 +1065,7 @@ void kfd_unref_process(struct
        kfd_process *p);</div>
      <div> int kfd_process_evict_queues(struct kfd_process *p, bool
        force);</div>
      <div> int kfd_process_restore_queues(struct kfd_process *p);</div>
      <div> void kfd_suspend_all_processes(bool force);</div>
      <div>+void kfd_kill_all_user_processes(void);</div>
      <div> /*</div>
      <div>  * kfd_resume_all_processes:</div>
      <div>  *     bool sync: If kfd_resume_all_processes() should wait
        for the</div>
      <div>diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
        b/drivers/gpu/drm/amd/amdkfd/kfd_process.c</div>
      <div>index 6cdc855abb6d..fb0c753b682c 100644</div>
      <div>--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c</div>
      <div>+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c</div>
      <div>@@ -2206,6 +2206,24 @@ void kfd_suspend_all_processes(bool
        force)</div>
      <div>        srcu_read_unlock(&kfd_processes_srcu, idx);</div>
      <div> }</div>
      <div><br class="">
      </div>
      <div>+</div>
      <div>+void kfd_kill_all_user_processes(void)</div>
      <div>+{</div>
      <div>+       struct kfd_process *p;</div>
      <div>+       struct amdkfd_process_info *p_info;</div>
      <div>+       unsigned int temp;</div>
      <div>+       int idx = srcu_read_lock(&kfd_processes_srcu);</div>
      <div>+</div>
      <div>+       pr_info("Killing all processes\n");</div>
      <div>+       hash_for_each_rcu(kfd_processes_table, temp, p,
        kfd_processes) {</div>
      <div>+               p_info = p->kgd_process_info;</div>
      <div>+               pr_info("Killing  processes, pid = %d",
        pid_nr(p_info->pid));</div>
      <div>+               kill_pid(p_info->pid, SIGBUS, 1);</div>
    </blockquote>
    <p><br>
    </p>
    <p>From looking into kill_pid I see it only sends a signal but
      doesn't wait for completion, it would make sense to wait for
      completion here. In any case I would actually try to put here<span style="caret-color: rgb(0, 0, 0); font-family: Helvetica;
        font-size: 12px; font-style: normal; font-variant-caps: normal;
        font-weight: 400; letter-spacing: normal; text-align: start;
        text-indent: 0px; text-transform: none; white-space: normal;
        word-spacing: 0px; -webkit-text-stroke-width: 0px;
        text-decoration: none; float: none; display: inline !important;" class=""><br>
      </span><span style="caret-color: rgb(0, 0, 0); font-family:
        Helvetica; font-size: 12px; font-style: normal;
        font-variant-caps: normal; font-weight: 400; letter-spacing:
        normal; text-align: start; text-indent: 0px; text-transform:
        none; white-space: normal; word-spacing: 0px;
        -webkit-text-stroke-width: 0px; text-decoration: none; float:
        none; display: inline !important;" class=""><font size="4"><br>
        </font></span></p>
    <p><span style="caret-color: rgb(0, 0, 0); font-family: Helvetica;
        font-size: 12px; font-style: normal; font-variant-caps: normal;
        font-weight: 400; letter-spacing: normal; text-align: start;
        text-indent: 0px; text-transform: none; white-space: normal;
        word-spacing: 0px; -webkit-text-stroke-width: 0px;
        text-decoration: none; float: none; display: inline !important;" class=""><font size="4">hash_for_each_rcu(</font></span><span style="caret-color: rgb(0, 0, 0); font-family: Helvetica;
        font-size: 12px; font-style: normal; font-variant-caps: normal;
        font-weight: 400; letter-spacing: normal; text-align: start;
        text-indent: 0px; text-transform: none; white-space: normal;
        word-spacing: 0px; -webkit-text-stroke-width: 0px;
        text-decoration: none; float: none; display: inline !important;" class=""><font size="4">p_info)    <br>
              cancel_delayed_work_sync(&</font></span><span style="caret-color: rgb(0, 0, 0); font-family: Helvetica;
        font-size: 12px; font-style: normal; font-variant-caps: normal;
        font-weight: 400; letter-spacing: normal; text-align: start;
        text-indent: 0px; text-transform: none; white-space: normal;
        word-spacing: 0px; -webkit-text-stroke-width: 0px;
        text-decoration: none; float: none; display: inline !important;" class=""><font size="4"><span style="caret-color: rgb(0, 0, 0);
            font-family: Helvetica; font-size: 12px; font-style: normal;
            font-variant-caps: normal; font-weight: 400; letter-spacing:
            normal; text-align: start; text-indent: 0px; text-transform:
            none; white-space: normal; word-spacing: 0px;
            -webkit-text-stroke-width: 0px; text-decoration: none;
            float: none; display: inline !important;" class=""></span><span style="caret-color: rgb(0, 0, 0); font-family: Helvetica;
            font-size: 12px; font-style: normal; font-variant-caps:
            normal; font-weight: 400; letter-spacing: normal;
            text-align: start; text-indent: 0px; text-transform: none;
            white-space: normal; word-spacing: 0px;
            -webkit-text-stroke-width: 0px; text-decoration: none;
            float: none; display: inline !important;" class=""><font size="4">p_info</font></span>->restore_userptr_work) <br>
        </font></span></p>
    <p><span style="caret-color: rgb(0, 0, 0); font-family: Helvetica;
        font-size: 12px; font-style: normal; font-variant-caps: normal;
        font-weight: 400; letter-spacing: normal; text-align: start;
        text-indent: 0px; text-transform: none; white-space: normal;
        word-spacing: 0px; -webkit-text-stroke-width: 0px;
        text-decoration: none; float: none; display: inline !important;" class=""><font size="4">instead; at least that is what I meant in
          the previous mail. <br>
        </font></span></p>
    <p><span style="caret-color: rgb(0, 0, 0); font-family: Helvetica;
        font-size: 12px; font-style: normal; font-variant-caps: normal;
        font-weight: 400; letter-spacing: normal; text-align: start;
        text-indent: 0px; text-transform: none; white-space: normal;
        word-spacing: 0px; -webkit-text-stroke-width: 0px;
        text-decoration: none; float: none; display: inline !important;" class=""><font size="4">Andrey</font><br>
        <br>
      </span></p>
    <blockquote type="cite" cite="mid:5A64FAEA-CCE8-4EB6-8E7B-852D4F384255@microsoft.com">
      <div>+       }</div>
      <div>+       srcu_read_unlock(&kfd_processes_srcu, idx);</div>
      <div>+}</div>
      <div>+</div>
      <div>+</div>
      <div> int kfd_resume_all_processes(bool sync)</div>
      <div> {</div>
      <div>        struct kfd_process *p;</div>
      <div><br class="">
      </div>
      <div><br class="">
        <blockquote type="cite" class="">
          <div class=""><span style="caret-color: rgb(0, 0, 0);
              font-family: Helvetica; font-size: 12px; font-style:
              normal; font-variant-caps: normal; font-weight: 400;
              letter-spacing: normal; text-align: start; text-indent:
              0px; text-transform: none; white-space: normal;
              word-spacing: 0px; -webkit-text-stroke-width: 0px;
              text-decoration: none; float: none; display: inline
              !important;" class="">Andrey</span><br style="caret-color:
              rgb(0, 0, 0); font-family: Helvetica; font-size: 12px;
              font-style: normal; font-variant-caps: normal;
              font-weight: 400; letter-spacing: normal; text-align:
              start; text-indent: 0px; text-transform: none;
              white-space: normal; word-spacing: 0px;
              -webkit-text-stroke-width: 0px; text-decoration: none;" class="">
            <br style="caret-color: rgb(0, 0, 0); font-family:
              Helvetica; font-size: 12px; font-style: normal;
              font-variant-caps: normal; font-weight: 400;
              letter-spacing: normal; text-align: start; text-indent:
              0px; text-transform: none; white-space: normal;
              word-spacing: 0px; -webkit-text-stroke-width: 0px;
              text-decoration: none;" class="">
            <br style="caret-color: rgb(0, 0, 0); font-family:
              Helvetica; font-size: 12px; font-style: normal;
              font-variant-caps: normal; font-weight: 400;
              letter-spacing: normal; text-align: start; text-indent:
              0px; text-transform: none; white-space: normal;
              word-spacing: 0px; -webkit-text-stroke-width: 0px;
              text-decoration: none;" class="">
            <blockquote type="cite" style="font-family: Helvetica;
              font-size: 12px; font-style: normal; font-variant-caps:
              normal; font-weight: 400; letter-spacing: normal; orphans:
              auto; text-align: start; text-indent: 0px; text-transform:
              none; white-space: normal; widows: auto; word-spacing:
              0px; -webkit-text-size-adjust: auto;
              -webkit-text-stroke-width: 0px; text-decoration: none;" class="">
              <br class="">
              Really appreciate your help!<br class="">
              <br class="">
              Best,<br class="">
              Shuotao<br class="">
              <br class="">
              <blockquote type="cite" class="">
                <blockquote type="cite" class="">2. Remove redundant
                  p2p/io links in sysfs when device is hotplugged<br class="">
                  out.<br class="">
                  <br class="">
                  3. New kfd node_id is not properly assigned after a
                  new device is<br class="">
                  added after a gpu is hotplugged out in a system.
                  libhsakmt will<br class="">
                  find this anomaly, (i.e. node_from != &lt;dev node
                  id&gt; in iolinks),<br class="">
                  when taking a topology_snapshot, thus returns fault to
                  the rocm<br class="">
                  stack.<br class="">
                  <br class="">
                  -- This patch fixes issue 1; another patch by Mukul
                  fixes issues 2&3.<br class="">
                  -- Tested on a 4-GPU MI100 gpu nodes with kernel
                  5.13.0-kfd; kernel<br class="">
                  5.16.0-kfd is unstable out of box for MI100.<br class="">
                  ---<br class="">
                  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 5 +++++<br class="">
                  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 7 +++++++<br class="">
                  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 1 +<br class="">
                  drivers/gpu/drm/amd/amdkfd/kfd_device.c | 13
                  +++++++++++++<br class="">
                  4 files changed, 26 insertions(+)<br class="">
                  <br class="">
                  diff --git
                  a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
                  b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c<br class="">
                  index c18c4be1e4ac..d50011bdb5c4 100644<br class="">
                  --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c<br class="">
                  +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c<br class="">
                  @@ -213,6 +213,11 @@ int amdgpu_amdkfd_resume(struct
                  amdgpu_device *adev, bool run_pm)<br class="">
                  return r;<br class="">
                  }<br class="">
                  <br class="">
                  +int amdgpu_amdkfd_resume_processes(void)<br class="">
                  +{<br class="">
                  + return kgd2kfd_resume_processes();<br class="">
                  +}<br class="">
                  +<br class="">
                  int amdgpu_amdkfd_pre_reset(struct amdgpu_device
                  *adev)<br class="">
                  {<br class="">
                  int r = 0;<br class="">
                  diff --git
                  a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
                  b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h<br class="">
                  index f8b9f27adcf5..803306e011c3 100644<br class="">
                  --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h<br class="">
                  +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h<br class="">
                  @@ -140,6 +140,7 @@ void amdgpu_amdkfd_fini(void);<br class="">
                  void amdgpu_amdkfd_suspend(struct amdgpu_device *adev,
                  bool run_pm);<br class="">
                  int amdgpu_amdkfd_resume_iommu(struct amdgpu_device
                  *adev);<br class="">
                  int amdgpu_amdkfd_resume(struct amdgpu_device *adev,
                  bool run_pm);<br class="">
                  +int amdgpu_amdkfd_resume_processes(void);<br class="">
                  void amdgpu_amdkfd_interrupt(struct amdgpu_device
                  *adev,<br class="">
                  const void *ih_ring_entry);<br class="">
                  void amdgpu_amdkfd_device_probe(struct amdgpu_device
                  *adev);<br class="">
                  @@ -347,6 +348,7 @@ void kgd2kfd_device_exit(struct
                  kfd_dev *kfd);<br class="">
                  void kgd2kfd_suspend(struct kfd_dev *kfd, bool
                  run_pm);<br class="">
                  int kgd2kfd_resume_iommu(struct kfd_dev *kfd);<br class="">
                  int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm);<br class="">
                  +int kgd2kfd_resume_processes(void);<br class="">
                  int kgd2kfd_pre_reset(struct kfd_dev *kfd);<br class="">
                  int kgd2kfd_post_reset(struct kfd_dev *kfd);<br class="">
                  void kgd2kfd_interrupt(struct kfd_dev *kfd, const void
                  *ih_ring_entry);<br class="">
                  @@ -393,6 +395,11 @@ static inline int
                  kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm)<br class="">
                  return 0;<br class="">
                  }<br class="">
                  <br class="">
                  +static inline int kgd2kfd_resume_processes(void)<br class="">
                  +{<br class="">
                  + return 0;<br class="">
                  +}<br class="">
                  +<br class="">
                  static inline int kgd2kfd_pre_reset(struct kfd_dev
                  *kfd)<br class="">
                  {<br class="">
                  return 0;<br class="">
                  diff --git
                  a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
                  b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c<br class="">
                  index fa4a9f13c922..5827b65b7489 100644<br class="">
                  --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c<br class="">
                  +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c<br class="">
                  @@ -4004,6 +4004,7 @@ void
                  amdgpu_device_fini_hw(struct amdgpu_device *adev)<br class="">
                  if (drm_dev_is_unplugged(adev_to_drm(adev)))<br class="">
                  amdgpu_device_unmap_mmio(adev);<br class="">
                  <br class="">
                  + amdgpu_amdkfd_resume_processes();<br class="">
                  }<br class="">
                  <br class="">
                  void amdgpu_device_fini_sw(struct amdgpu_device *adev)<br class="">
                  diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
                  b/drivers/gpu/drm/amd/amdkfd/kfd_device.c<br class="">
                  index 62aa6c9d5123..ef05aae9255e 100644<br class="">
                  --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c<br class="">
                  +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c<br class="">
                  @@ -714,6 +714,19 @@ int kgd2kfd_resume(struct kfd_dev
                  *kfd, bool run_pm)<br class="">
                  return ret;<br class="">
                  }<br class="">
                  <br class="">
                  +/* for non-runtime resume only */<br class="">
                  +int kgd2kfd_resume_processes(void)<br class="">
                  +{<br class="">
                  + int count;<br class="">
                  +<br class="">
                  + count = atomic_dec_return(&kfd_locked);<br class="">
                  + WARN_ONCE(count &lt; 0, "KFD suspend / resume ref.
                  error");<br class="">
                  + if (count == 0)<br class="">
                  + return kfd_resume_all_processes();<br class="">
                  +<br class="">
                  + return 0;<br class="">
                  +}<br class="">
                </blockquote>
                <br class="">
                It doesn't make sense to me to just increment kfd_locked
                in<br class="">
                kgd2kfd_suspend to only decrement it again a few
                functions down the<br class="">
                road.<br class="">
                <br class="">
                I suggest this instead - you only increment if not
                during PCI remove<br class="">
                <br class="">
                diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c<br class="">
                b/drivers/gpu/drm/amd/amdkfd/kfd_device.c<br class="">
                index 1c2cf3a33c1f..7754f77248a4 100644<br class="">
                --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c<br class="">
                +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c<br class="">
                @@ -952,11 +952,12 @@ bool kfd_is_locked(void)<br class="">
                <br class="">
                void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm)<br class="">
                {<br class="">
                +<br class="">
                if (!kfd->init_complete)<br class="">
                return;<br class="">
                <br class="">
                /* for runtime suspend, skip locking kfd */<br class="">
                - if (!run_pm) {<br class="">
                + if (!run_pm &&
                !drm_dev_is_unplugged(kfd->ddev)) {<br class="">
                /* For first KFD device suspend all the KFD processes */<br class="">
                if (atomic_inc_return(&kfd_locked) == 1)<br class="">
                kfd_suspend_all_processes();<br class="">
                <br class="">
                <br class="">
                Andrey<br class="">
                <br class="">
                <br class="">
                <br class="">
                <blockquote type="cite" class="">+<br class="">
                  int kgd2kfd_resume_iommu(struct kfd_dev *kfd)<br class="">
                  {<br class="">
                  int err = 0;</blockquote>
              </blockquote>
            </blockquote>
          </div>
        </blockquote>
      </div>
      <br class="">
    </blockquote>
  </body>
</html>