<html><head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
  </head>
  <body>
    <p><br>
    </p>
    <div class="moz-cite-prefix">On 2022-04-14 10:00, Shuotao Xu wrote:<br>
    </div>
    <blockquote type="cite" cite="mid:B7970589-ACF6-41F3-8622-1C0F705F3EE0@microsoft.com">
      
      <br class="">
      <div><br class="">
        <blockquote type="cite" class="">
          <div class="">On Apr 14, 2022, at 1:31 AM, Andrey Grodzovsky
            &lt;<a href="mailto:andrey.grodzovsky@amd.com" class="moz-txt-link-freetext" moz-do-not-send="true">andrey.grodzovsky@amd.com</a>&gt;
            wrote:</div>
          <br class="Apple-interchange-newline">
          <div class="">
            <div class="">
              <p class=""><br class="">
              </p>
              <div class="moz-cite-prefix">On 2022-04-13 12:03, Shuotao
                Xu wrote:<br class="">
              </div>
              <blockquote type="cite" cite="mid:5A64FAEA-CCE8-4EB6-8E7B-852D4F384255@microsoft.com" class="">
                <br class="">
                <div class=""><br class="">
                  <blockquote type="cite" class="">
                    <div class="">On Apr 11, 2022, at 11:52 PM, Andrey
                      Grodzovsky &lt;<a href="mailto:andrey.grodzovsky@amd.com" class="moz-txt-link-freetext" moz-do-not-send="true">andrey.grodzovsky@amd.com</a>&gt;
                      wrote:</div>
                    <br class="Apple-interchange-newline">
                    <div class=""><span style="caret-color: rgb(0, 0,
                        0); font-family: Helvetica; font-size: 12px;
                        font-style: normal; font-variant-caps: normal;
                        font-weight: 400; letter-spacing: normal;
                        text-align: start; text-indent: 0px;
                        text-transform: none; white-space: normal;
                        word-spacing: 0px; -webkit-text-stroke-width:
                        0px; text-decoration: none; float: none;
                        display: inline !important;" class="">[Some
                        people who received this message don't often get
                        email from<span class="Apple-converted-space"> </span></span><a href="mailto:andrey.grodzovsky@amd.com" style="font-family: Helvetica; font-size: 12px;
                        font-style: normal; font-variant-caps: normal;
                        font-weight: 400; letter-spacing: normal;
                        orphans: auto; text-align: start; text-indent:
                        0px; text-transform: none; white-space: normal;
                        widows: auto; word-spacing: 0px;
                        -webkit-text-size-adjust: auto;
                        -webkit-text-stroke-width: 0px;" class="moz-txt-link-freetext" moz-do-not-send="true">andrey.grodzovsky@amd.com</a><span style="caret-color: rgb(0, 0, 0); font-family:
                        Helvetica; font-size: 12px; font-style: normal;
                        font-variant-caps: normal; font-weight: 400;
                        letter-spacing: normal; text-align: start;
                        text-indent: 0px; text-transform: none;
                        white-space: normal; word-spacing: 0px;
                        -webkit-text-stroke-width: 0px; text-decoration:
                        none; float: none; display: inline !important;" class="">. Learn why this is important at<span class="Apple-converted-space"> </span></span><a href="http://aka.ms/LearnAboutSenderIdentification" style="font-family:
                        Helvetica; font-size: 12px; font-style: normal;
                        font-variant-caps: normal; font-weight: 400;
                        letter-spacing: normal; orphans: auto;
                        text-align: start; text-indent: 0px;
                        text-transform: none; white-space: normal;
                        widows: auto; word-spacing: 0px;
                        -webkit-text-size-adjust: auto;
                        -webkit-text-stroke-width: 0px;" class="moz-txt-link-freetext" moz-do-not-send="true">http://aka.ms/LearnAboutSenderIdentification</a><span style="caret-color: rgb(0, 0, 0); font-family:
                        Helvetica; font-size: 12px; font-style: normal;
                        font-variant-caps: normal; font-weight: 400;
                        letter-spacing: normal; text-align: start;
                        text-indent: 0px; text-transform: none;
                        white-space: normal; word-spacing: 0px;
                        -webkit-text-stroke-width: 0px; text-decoration:
                        none; float: none; display: inline !important;" class="">.]</span><br style="caret-color: rgb(0,
                        0, 0); font-family: Helvetica; font-size: 12px;
                        font-style: normal; font-variant-caps: normal;
                        font-weight: 400; letter-spacing: normal;
                        text-align: start; text-indent: 0px;
                        text-transform: none; white-space: normal;
                        word-spacing: 0px; -webkit-text-stroke-width:
                        0px; text-decoration: none;" class="">
                      <br style="caret-color: rgb(0, 0, 0); font-family:
                        Helvetica; font-size: 12px; font-style: normal;
                        font-variant-caps: normal; font-weight: 400;
                        letter-spacing: normal; text-align: start;
                        text-indent: 0px; text-transform: none;
                        white-space: normal; word-spacing: 0px;
                        -webkit-text-stroke-width: 0px; text-decoration:
                        none;" class="">
                      <span style="caret-color: rgb(0, 0, 0);
                        font-family: Helvetica; font-size: 12px;
                        font-style: normal; font-variant-caps: normal;
                        font-weight: 400; letter-spacing: normal;
                        text-align: start; text-indent: 0px;
                        text-transform: none; white-space: normal;
                        word-spacing: 0px; -webkit-text-stroke-width:
                        0px; text-decoration: none; float: none;
                        display: inline !important;" class="">On
                        2022-04-08 21:28, Shuotao Xu wrote:</span><br style="caret-color: rgb(0, 0, 0); font-family:
                        Helvetica; font-size: 12px; font-style: normal;
                        font-variant-caps: normal; font-weight: 400;
                        letter-spacing: normal; text-align: start;
                        text-indent: 0px; text-transform: none;
                        white-space: normal; word-spacing: 0px;
                        -webkit-text-stroke-width: 0px; text-decoration:
                        none;" class="">
                      <blockquote type="cite" style="font-family:
                        Helvetica; font-size: 12px; font-style: normal;
                        font-variant-caps: normal; font-weight: 400;
                        letter-spacing: normal; orphans: auto;
                        text-align: start; text-indent: 0px;
                        text-transform: none; white-space: normal;
                        widows: auto; word-spacing: 0px;
                        -webkit-text-size-adjust: auto;
                        -webkit-text-stroke-width: 0px; text-decoration:
                        none;" class="">
                        <br class="">
                        <blockquote type="cite" class="">On Apr 8, 2022,
                          at 11:28 PM, Andrey Grodzovsky &lt;<a href="mailto:andrey.grodzovsky@amd.com" class="moz-txt-link-freetext" moz-do-not-send="true">andrey.grodzovsky@amd.com</a>&gt;
                          wrote:<br class="">
                          <br class="">
                          [Some people who received this message don't
                          often get email from <a href="mailto:andrey.grodzovsky@amd.com" class="moz-txt-link-freetext" moz-do-not-send="true">
                            andrey.grodzovsky@amd.com</a>. Learn why
                          this is important at <a href="http://aka.ms/LearnAboutSenderIdentification" class="moz-txt-link-freetext" moz-do-not-send="true">
                            http://aka.ms/LearnAboutSenderIdentification</a>.]<br class="">
                          <br class="">
                          On 2022-04-08 04:45, Shuotao Xu wrote:<br class="">
                          <blockquote type="cite" class="">Adding PCIe
                            Hotplug Support for AMDKFD: the support of
                            hot-plug of GPU<br class="">
                            devices can open doors for many advanced
                            applications in data center<br class="">
                            in the next few years, such as for GPU
                            resource<br class="">
                            disaggregation. Current AMDKFD does not
                            support hotplug out b/o the<br class="">
                            following reasons:<br class="">
                            <br class="">
                            1. During PCIe removal, decrement KFD lock
                            which was incremented at<br class="">
                            the beginning of hw fini; otherwise kfd_open
                            later is going to<br class="">
                            fail.<br class="">
                          </blockquote>
                          I assumed you read my comment last time, yet
                          you still take the same approach.<br class="">
                          More details below.<br class="">
                        </blockquote>
                        Aha, I like your fix:) I was not familiar with
                        drm APIs so just only half understood your
                        comment last time.<br class="">
                        <br class="">
                        BTW, I tried hot-plugging out a GPU when rocm
                        application is still running.<br class="">
                        From dmesg, application is still trying to
                        access the removed kfd device, and are met with
                        some errors.<br class="">
                      </blockquote>
                      <br style="caret-color: rgb(0, 0, 0); font-family:
                        Helvetica; font-size: 12px; font-style: normal;
                        font-variant-caps: normal; font-weight: 400;
                        letter-spacing: normal; text-align: start;
                        text-indent: 0px; text-transform: none;
                        white-space: normal; word-spacing: 0px;
                        -webkit-text-stroke-width: 0px; text-decoration:
                        none;" class="">
                      <br style="caret-color: rgb(0, 0, 0); font-family:
                        Helvetica; font-size: 12px; font-style: normal;
                        font-variant-caps: normal; font-weight: 400;
                        letter-spacing: normal; text-align: start;
                        text-indent: 0px; text-transform: none;
                        white-space: normal; word-spacing: 0px;
                        -webkit-text-stroke-width: 0px; text-decoration:
                        none;" class="">
                      <span style="caret-color: rgb(0, 0, 0);
                        font-family: Helvetica; font-size: 12px;
                        font-style: normal; font-variant-caps: normal;
                        font-weight: 400; letter-spacing: normal;
                        text-align: start; text-indent: 0px;
                        text-transform: none; white-space: normal;
                        word-spacing: 0px; -webkit-text-stroke-width:
                        0px; text-decoration: none; float: none;
                        display: inline !important;" class="">Application
                        is supposed to keep running, it holds the
                        drm_device</span><br style="caret-color: rgb(0,
                        0, 0); font-family: Helvetica; font-size: 12px;
                        font-style: normal; font-variant-caps: normal;
                        font-weight: 400; letter-spacing: normal;
                        text-align: start; text-indent: 0px;
                        text-transform: none; white-space: normal;
                        word-spacing: 0px; -webkit-text-stroke-width:
                        0px; text-decoration: none;" class="">
                      <span style="caret-color: rgb(0, 0, 0);
                        font-family: Helvetica; font-size: 12px;
                        font-style: normal; font-variant-caps: normal;
                        font-weight: 400; letter-spacing: normal;
                        text-align: start; text-indent: 0px;
                        text-transform: none; white-space: normal;
                        word-spacing: 0px; -webkit-text-stroke-width:
                        0px; text-decoration: none; float: none;
                        display: inline !important;" class="">reference
                        as long as it has an open</span><br style="caret-color: rgb(0, 0, 0); font-family:
                        Helvetica; font-size: 12px; font-style: normal;
                        font-variant-caps: normal; font-weight: 400;
                        letter-spacing: normal; text-align: start;
                        text-indent: 0px; text-transform: none;
                        white-space: normal; word-spacing: 0px;
                        -webkit-text-stroke-width: 0px; text-decoration:
                        none;" class="">
                      <span style="caret-color: rgb(0, 0, 0);
                        font-family: Helvetica; font-size: 12px;
                        font-style: normal; font-variant-caps: normal;
                        font-weight: 400; letter-spacing: normal;
                        text-align: start; text-indent: 0px;
                        text-transform: none; white-space: normal;
                        word-spacing: 0px; -webkit-text-stroke-width:
                        0px; text-decoration: none; float: none;
                        display: inline !important;" class="">FD to the
                        device and final cleanup will come only after
                        the app will die</span><br style="caret-color:
                        rgb(0, 0, 0); font-family: Helvetica; font-size:
                        12px; font-style: normal; font-variant-caps:
                        normal; font-weight: 400; letter-spacing:
                        normal; text-align: start; text-indent: 0px;
                        text-transform: none; white-space: normal;
                        word-spacing: 0px; -webkit-text-stroke-width:
                        0px; text-decoration: none;" class="">
                      <span style="caret-color: rgb(0, 0, 0);
                        font-family: Helvetica; font-size: 12px;
                        font-style: normal; font-variant-caps: normal;
                        font-weight: 400; letter-spacing: normal;
                        text-align: start; text-indent: 0px;
                        text-transform: none; white-space: normal;
                        word-spacing: 0px; -webkit-text-stroke-width:
                        0px; text-decoration: none; float: none;
                        display: inline !important;" class="">thus
                        releasing the FD and the last</span><br style="caret-color: rgb(0, 0, 0); font-family:
                        Helvetica; font-size: 12px; font-style: normal;
                        font-variant-caps: normal; font-weight: 400;
                        letter-spacing: normal; text-align: start;
                        text-indent: 0px; text-transform: none;
                        white-space: normal; word-spacing: 0px;
                        -webkit-text-stroke-width: 0px; text-decoration:
                        none;" class="">
                      <span style="caret-color: rgb(0, 0, 0);
                        font-family: Helvetica; font-size: 12px;
                        font-style: normal; font-variant-caps: normal;
                        font-weight: 400; letter-spacing: normal;
                        text-align: start; text-indent: 0px;
                        text-transform: none; white-space: normal;
                        word-spacing: 0px; -webkit-text-stroke-width:
                        0px; text-decoration: none; float: none;
                        display: inline !important;" class="">drm_device
                        reference.</span><br style="caret-color: rgb(0,
                        0, 0); font-family: Helvetica; font-size: 12px;
                        font-style: normal; font-variant-caps: normal;
                        font-weight: 400; letter-spacing: normal;
                        text-align: start; text-indent: 0px;
                        text-transform: none; white-space: normal;
                        word-spacing: 0px; -webkit-text-stroke-width:
                        0px; text-decoration: none;" class="">
                      <br style="caret-color: rgb(0, 0, 0); font-family:
                        Helvetica; font-size: 12px; font-style: normal;
                        font-variant-caps: normal; font-weight: 400;
                        letter-spacing: normal; text-align: start;
                        text-indent: 0px; text-transform: none;
                        white-space: normal; word-spacing: 0px;
                        -webkit-text-stroke-width: 0px; text-decoration:
                        none;" class="">
                      <blockquote type="cite" style="font-family:
                        Helvetica; font-size: 12px; font-style: normal;
                        font-variant-caps: normal; font-weight: 400;
                        letter-spacing: normal; orphans: auto;
                        text-align: start; text-indent: 0px;
                        text-transform: none; white-space: normal;
                        widows: auto; word-spacing: 0px;
                        -webkit-text-size-adjust: auto;
                        -webkit-text-stroke-width: 0px; text-decoration:
                        none;" class="">
                        Application would hang and not exit in this
                        case.<br class="">
                      </blockquote>
                      <br style="caret-color: rgb(0, 0, 0); font-family:
                        Helvetica; font-size: 12px; font-style: normal;
                        font-variant-caps: normal; font-weight: 400;
                        letter-spacing: normal; text-align: start;
                        text-indent: 0px; text-transform: none;
                        white-space: normal; word-spacing: 0px;
                        -webkit-text-stroke-width: 0px; text-decoration:
                        none;" class="">
                    </div>
                  </blockquote>
                  <div class=""><br class="">
                  </div>
                  Actually I tried kill -7 $pid, and the process exits.
                  The dmesg has some warning though.</div>
                <div class=""><br class="">
                </div>
                <div class="">
                  <div class="">[  711.769977] WARNING: CPU: 23 PID: 344
                    at
                    .../amdgpu-rocm5.0.2/src/amd/amdgpu/amdgpu_object.c:1336
                    amdgpu_bo_release_notify+0x150/0x160 [amdgpu]</div>
                  <div class="">[  711.770528] Modules linked in:
                    amdgpu(OE) amdttm(OE) amd_sched(OE) amdkcl(OE)
                    iommu_v2 nf_conntrack_netlink nfnetlink xfrm_user
                    xfrm_algo xt_addrtype br_netfilter xt_CHECKSUM
                    iptable_mangle xt_MASQUERADE iptable_nat nf_nat
                    xt_conntrack nf_conntrack nf_defrag_ipv6
                    nf_defrag_ipv4 ipt_REJECT nf_reject_ipv4 xt_tcpudp
                    bridge stp llc ebtable_filter ebtables
                    ip6table_filter ip6_tables iptable_filter overlay
                    binfmt_misc intel_rapl_msr i40iw intel_rapl_common
                    skx_edac nfit x86_pkg_temp_thermal intel_powerclamp
                    coretemp kvm_intel rpcrdma kvm sunrpc ipmi_ssif
                    ib_umad ib_ipoib rdma_ucm irqbypass rapl joydev
                    acpi_ipmi input_leds intel_cstate ipmi_si
                    ipmi_devintf mei_me mei intel_pch_thermal
                    ipmi_msghandler ioatdma mac_hid lpc_ich dca
                    acpi_power_meter acpi_pad sch_fq_codel ib_iser
                    rdma_cm iw_cm ib_cm iscsi_tcp libiscsi_tcp libiscsi
                    scsi_transport_iscsi pci_stub ip_tables x_tables
                    autofs4 btrfs blake2b_generic zstd_compress raid10
                    raid456 async_raid6_recov async_memcpy async_pq
                    async_xor async_tx xor</div>
                  <div class="">[  711.779359]  raid6_pq libcrc32c raid1
                    raid0 multipath linear mlx5_ib ib_uverbs ib_core ast
                    drm_vram_helper i2c_algo_bit drm_ttm_helper ttm
                    drm_kms_helper syscopyarea crct10dif_pclmul
                    crc32_pclmul ghash_clmulni_intel sysfillrect uas
                    hid_generic sysimgblt aesni_intel mlx5_core
                    fb_sys_fops crypto_simd usbhid cryptd drm i40e
                    pci_hyperv_intf usb_storage glue_helper mlxfw hid
                    ahci libahci wmi</div>
                  <div class="">[  711.779752] CPU: 23 PID: 344 Comm:
                    kworker/23:1 Tainted: G        W  OE     5.11.0+ #1</div>
                  <div class="">[  711.779755] Hardware name: Supermicro
                    SYS-4029GP-TRT2/X11DPG-OT-CPU, BIOS 2.1 08/14/2018</div>
                  <div class="">[  711.779756] Workqueue: kfd_process_wq
                    kfd_process_wq_release [amdgpu]</div>
                  <div class="">[  711.779955] RIP:
                    0010:amdgpu_bo_release_notify+0x150/0x160 [amdgpu]</div>
                  <div class="">[  711.780141] Code: e8 b5 af 34 f4 e9
                    1f ff ff ff 48 39 c2 74 07 0f 0b e9 69 ff ff ff 4c
                    89 e7 e8 3c b4 16 00 e9 5c ff ff ff e8 a2 ce fd f3
                    eb cf &lt;0f&gt; 0b eb cb e8 d7 02 34 f4 0f 1f 80 00
                    00 00 00 0f 1f 44 00 00 55</div>
                  <div class="">[  711.780143] RSP:
                    0018:ffffa8100dd67c30 EFLAGS: 00010282</div>
                  <div class="">[  711.780145] RAX: 00000000ffffffea
                    RBX: ffff89980e792058 RCX: 0000000000000000</div>
                  <div class="">[  711.780147] RDX: 0000000000000000
                    RSI: ffff89a8f9ad8870 RDI: ffff89a8f9ad8870</div>
                  <div class="">[  711.780148] RBP: ffffa8100dd67c50
                    R08: 0000000000000000 R09: fffffffffff99b18</div>
                  <div class="">[  711.780149] R10: ffffa8100dd67bd0
                    R11: ffffa8100dd67908 R12: ffff89980e792000</div>
                  <div class="">[  711.780151] R13: ffff89980e792058
                    R14: ffff89980e7921bc R15: dead000000000100</div>
                  <div class="">[  711.780152] FS:
                     0000000000000000(0000) GS:ffff89a8f9ac0000(0000)
                    knlGS:0000000000000000</div>
                  <div class="">[  711.780154] CS:  0010 DS: 0000 ES:
                    0000 CR0: 0000000080050033</div>
                  <div class="">[  711.780156] CR2: 00007ffddac6f71f
                    CR3: 00000030bb80a003 CR4: 00000000007706e0</div>
                  <div class="">[  711.780157] DR0: 0000000000000000
                    DR1: 0000000000000000 DR2: 0000000000000000</div>
                  <div class="">[  711.780159] DR3: 0000000000000000
                    DR6: 00000000fffe0ff0 DR7: 0000000000000400</div>
                  <div class="">[  711.780160] PKRU: 55555554</div>
                  <div class="">[  711.780161] Call Trace:</div>
                  <div class="">[  711.780163]
                     ttm_bo_release+0x2ae/0x320 [amdttm]</div>
                  <div class="">[  711.780169]  amdttm_bo_put+0x30/0x40
                    [amdttm]</div>
                  <div class="">[  711.780357]
                     amdgpu_bo_unref+0x1e/0x30 [amdgpu]</div>
                  <div class="">[  711.780543]
                     amdgpu_gem_object_free+0x8b/0x160 [amdgpu]</div>
                  <div class="">[  711.781119]
                     drm_gem_object_free+0x1d/0x30 [drm]</div>
                  <div class="">[  711.781489]
                     amdgpu_amdkfd_gpuvm_free_memory_of_gpu+0x34a/0x380
                    [amdgpu]</div>
                  <div class="">[  711.782044]
                     kfd_process_device_free_bos+0xe0/0x130 [amdgpu]</div>
                  <div class="">[  711.782611]
                     kfd_process_wq_release+0x286/0x380 [amdgpu]</div>
                  <div class="">[  711.783172]
                     process_one_work+0x236/0x420</div>
                  <div class="">[  711.783543]  worker_thread+0x34/0x400</div>
                  <div class="">[  711.783911]  ?
                    process_one_work+0x420/0x420</div>
                  <div class="">[  711.784279]  kthread+0x126/0x140</div>
                  <div class="">[  711.784653]  ? kthread_park+0x90/0x90</div>
                  <div class="">[  711.785018]  ret_from_fork+0x22/0x30</div>
                  <div class="">[  711.785387] ---[ end trace
                    d8f50f6594817c84 ]---</div>
                  <div class="">[  711.798716] [drm] amdgpu: ttm
                    finalized</div>
                </div>
              </blockquote>
              <p class=""><br class="">
              </p>
              <p class="">So it means the process was stuck in some
                wait_event_killable (maybe here drm_sched_entity_flush)
                - you can try 'cat /proc/$process_pid/stack' maybe before<br class="">
                you kill it to see where it was stuck so we can go from
                there.<br class="">
              </p>
              <p class=""><br class="">
              </p>
              <blockquote type="cite" cite="mid:5A64FAEA-CCE8-4EB6-8E7B-852D4F384255@microsoft.com" class="">
                <div class="">
                  <div class=""><br class="">
                  </div>
                </div>
                <div class="">
                  <blockquote type="cite" class="">
                    <div class=""><br style="caret-color: rgb(0, 0, 0);
                        font-family: Helvetica; font-size: 12px;
                        font-style: normal; font-variant-caps: normal;
                        font-weight: 400; letter-spacing: normal;
                        text-align: start; text-indent: 0px;
                        text-transform: none; white-space: normal;
                        word-spacing: 0px; -webkit-text-stroke-width:
                        0px; text-decoration: none;" class="">
                      <span style="caret-color: rgb(0, 0, 0);
                        font-family: Helvetica; font-size: 12px;
                        font-style: normal; font-variant-caps: normal;
                        font-weight: 400; letter-spacing: normal;
                        text-align: start; text-indent: 0px;
                        text-transform: none; white-space: normal;
                        word-spacing: 0px; -webkit-text-stroke-width:
                        0px; text-decoration: none; float: none;
                        display: inline !important;" class="">For
                        graphic apps what I usually see is a crash
                        because of SIGSEGV when</span><br style="caret-color: rgb(0, 0, 0); font-family:
                        Helvetica; font-size: 12px; font-style: normal;
                        font-variant-caps: normal; font-weight: 400;
                        letter-spacing: normal; text-align: start;
                        text-indent: 0px; text-transform: none;
                        white-space: normal; word-spacing: 0px;
                        -webkit-text-stroke-width: 0px; text-decoration:
                        none;" class="">
                      <span style="caret-color: rgb(0, 0, 0);
                        font-family: Helvetica; font-size: 12px;
                        font-style: normal; font-variant-caps: normal;
                        font-weight: 400; letter-spacing: normal;
                        text-align: start; text-indent: 0px;
                        text-transform: none; white-space: normal;
                        word-spacing: 0px; -webkit-text-stroke-width:
                        0px; text-decoration: none; float: none;
                        display: inline !important;" class="">the app
                        tries to access</span><br style="caret-color:
                        rgb(0, 0, 0); font-family: Helvetica; font-size:
                        12px; font-style: normal; font-variant-caps:
                        normal; font-weight: 400; letter-spacing:
                        normal; text-align: start; text-indent: 0px;
                        text-transform: none; white-space: normal;
                        word-spacing: 0px; -webkit-text-stroke-width:
                        0px; text-decoration: none;" class="">
                      <span style="caret-color: rgb(0, 0, 0);
                        font-family: Helvetica; font-size: 12px;
                        font-style: normal; font-variant-caps: normal;
                        font-weight: 400; letter-spacing: normal;
                        text-align: start; text-indent: 0px;
                        text-transform: none; white-space: normal;
                        word-spacing: 0px; -webkit-text-stroke-width:
                        0px; text-decoration: none; float: none;
                        display: inline !important;" class="">an
                        unmapped MMIO region on the device. I haven't
                        tested for compute</span><br style="caret-color:
                        rgb(0, 0, 0); font-family: Helvetica; font-size:
                        12px; font-style: normal; font-variant-caps:
                        normal; font-weight: 400; letter-spacing:
                        normal; text-align: start; text-indent: 0px;
                        text-transform: none; white-space: normal;
                        word-spacing: 0px; -webkit-text-stroke-width:
                        0px; text-decoration: none;" class="">
                      <span style="caret-color: rgb(0, 0, 0);
                        font-family: Helvetica; font-size: 12px;
                        font-style: normal; font-variant-caps: normal;
                        font-weight: 400; letter-spacing: normal;
                        text-align: start; text-indent: 0px;
                        text-transform: none; white-space: normal;
                        word-spacing: 0px; -webkit-text-stroke-width:
                        0px; text-decoration: none; float: none;
                        display: inline !important;" class="">stack and
                        so there might</span><br style="caret-color:
                        rgb(0, 0, 0); font-family: Helvetica; font-size:
                        12px; font-style: normal; font-variant-caps:
                        normal; font-weight: 400; letter-spacing:
                        normal; text-align: start; text-indent: 0px;
                        text-transform: none; white-space: normal;
                        word-spacing: 0px; -webkit-text-stroke-width:
                        0px; text-decoration: none;" class="">
                      <span style="caret-color: rgb(0, 0, 0);
                        font-family: Helvetica; font-size: 12px;
                        font-style: normal; font-variant-caps: normal;
                        font-weight: 400; letter-spacing: normal;
                        text-align: start; text-indent: 0px;
                        text-transform: none; white-space: normal;
                        word-spacing: 0px; -webkit-text-stroke-width:
                        0px; text-decoration: none; float: none;
                        display: inline !important;" class="">be
                        something I haven't covered. Hang could mean for
                        example waiting on a</span><br style="caret-color: rgb(0, 0, 0); font-family:
                        Helvetica; font-size: 12px; font-style: normal;
                        font-variant-caps: normal; font-weight: 400;
                        letter-spacing: normal; text-align: start;
                        text-indent: 0px; text-transform: none;
                        white-space: normal; word-spacing: 0px;
                        -webkit-text-stroke-width: 0px; text-decoration:
                        none;" class="">
                      <span style="caret-color: rgb(0, 0, 0);
                        font-family: Helvetica; font-size: 12px;
                        font-style: normal; font-variant-caps: normal;
                        font-weight: 400; letter-spacing: normal;
                        text-align: start; text-indent: 0px;
                        text-transform: none; white-space: normal;
                        word-spacing: 0px; -webkit-text-stroke-width:
                        0px; text-decoration: none; float: none;
                        display: inline !important;" class="">fence
                        which is not being</span><br style="caret-color:
                        rgb(0, 0, 0); font-family: Helvetica; font-size:
                        12px; font-style: normal; font-variant-caps:
                        normal; font-weight: 400; letter-spacing:
                        normal; text-align: start; text-indent: 0px;
                        text-transform: none; white-space: normal;
                        word-spacing: 0px; -webkit-text-stroke-width:
                        0px; text-decoration: none;" class="">
                      <span style="caret-color: rgb(0, 0, 0);
                        font-family: Helvetica; font-size: 12px;
                        font-style: normal; font-variant-caps: normal;
                        font-weight: 400; letter-spacing: normal;
                        text-align: start; text-indent: 0px;
                        text-transform: none; white-space: normal;
                        word-spacing: 0px; -webkit-text-stroke-width:
                        0px; text-decoration: none; float: none;
                        display: inline !important;" class="">signaled -
                        please provide full dmesg from this case.</span><br style="caret-color: rgb(0, 0, 0); font-family:
                        Helvetica; font-size: 12px; font-style: normal;
                        font-variant-caps: normal; font-weight: 400;
                        letter-spacing: normal; text-align: start;
                        text-indent: 0px; text-transform: none;
                        white-space: normal; word-spacing: 0px;
                        -webkit-text-stroke-width: 0px; text-decoration:
                        none;" class="">
                      <br style="caret-color: rgb(0, 0, 0); font-family:
                        Helvetica; font-size: 12px; font-style: normal;
                        font-variant-caps: normal; font-weight: 400;
                        letter-spacing: normal; text-align: start;
                        text-indent: 0px; text-transform: none;
                        white-space: normal; word-spacing: 0px;
                        -webkit-text-stroke-width: 0px; text-decoration:
                        none;" class="">
                      <blockquote type="cite" style="font-family:
                        Helvetica; font-size: 12px; font-style: normal;
                        font-variant-caps: normal; font-weight: 400;
                        letter-spacing: normal; orphans: auto;
                        text-align: start; text-indent: 0px;
                        text-transform: none; white-space: normal;
                        widows: auto; word-spacing: 0px;
                        -webkit-text-size-adjust: auto;
                        -webkit-text-stroke-width: 0px; text-decoration:
                        none;" class="">
                        <br class="">
                        Do you have any good suggestions on how to fix
                        it down the line? (HIP runtime/libhsakmt or
                        driver)<br class="">
                        <br class="">
                        [64036.631333] amdgpu: amdgpu_vm_bo_update
                        failed<br class="">
                        [64036.631702] amdgpu:
                        validate_invalid_user_pages: update PTE failed<br class="">
                        [64036.640754] amdgpu: amdgpu_vm_bo_update
                        failed<br class="">
                        [64036.641120] amdgpu:
                        validate_invalid_user_pages: update PTE failed<br class="">
                        [64036.650394] amdgpu: amdgpu_vm_bo_update
                        failed<br class="">
                        [64036.650765] amdgpu:
                        validate_invalid_user_pages: update PTE failed<br class="">
                      </blockquote>
                      <br style="caret-color: rgb(0, 0, 0); font-family:
                        Helvetica; font-size: 12px; font-style: normal;
                        font-variant-caps: normal; font-weight: 400;
                        letter-spacing: normal; text-align: start;
                        text-indent: 0px; text-transform: none;
                        white-space: normal; word-spacing: 0px;
                        -webkit-text-stroke-width: 0px; text-decoration:
                        none;" class="">
                    </div>
                  </blockquote>
                  <div class=""><br class="">
                  </div>
                  The full dmesg will just be the repetition of those two
                  messages,</div>
                <div class="">
                  <div class="">[186885.764079] amdgpu 0000:43:00.0:
                    amdgpu: amdgpu: finishing device.</div>
                  <div class="">[186885.766916] [drm] free PSP TMR
                    buffer</div>
                  <div class="">[186893.868173] amdgpu:
                    amdgpu_vm_bo_update failed</div>
                  <div class="">[186893.868235] amdgpu:
                    validate_invalid_user_pages: update PTE failed</div>
                  <div class="">[186893.876154] amdgpu:
                    amdgpu_vm_bo_update failed</div>
                  <div class="">[186893.876190] amdgpu:
                    validate_invalid_user_pages: update PTE failed</div>
                  <div class="">[186893.884150] amdgpu:
                    amdgpu_vm_bo_update failed</div>
                  <div class="">[186893.884185] amdgpu:
                    validate_invalid_user_pages: update PTE failed</div>
                </div>
                <div class=""><br class="">
                </div>
                <div class="">
                  <blockquote type="cite" class="">
                    <div class=""><br style="caret-color: rgb(0, 0, 0);
                        font-family: Helvetica; font-size: 12px;
                        font-style: normal; font-variant-caps: normal;
                        font-weight: 400; letter-spacing: normal;
                        text-align: start; text-indent: 0px;
                        text-transform: none; white-space: normal;
                        word-spacing: 0px; -webkit-text-stroke-width:
                        0px; text-decoration: none;" class="">
                      <span style="caret-color: rgb(0, 0, 0);
                        font-family: Helvetica; font-size: 12px;
                        font-style: normal; font-variant-caps: normal;
                        font-weight: 400; letter-spacing: normal;
                        text-align: start; text-indent: 0px;
                        text-transform: none; white-space: normal;
                        word-spacing: 0px; -webkit-text-stroke-width:
                        0px; text-decoration: none; float: none;
                        display: inline !important;" class="">This just
                        probably means trying to update PTEs after the
                        physical device</span><br style="caret-color:
                        rgb(0, 0, 0); font-family: Helvetica; font-size:
                        12px; font-style: normal; font-variant-caps:
                        normal; font-weight: 400; letter-spacing:
                        normal; text-align: start; text-indent: 0px;
                        text-transform: none; white-space: normal;
                        word-spacing: 0px; -webkit-text-stroke-width:
                        0px; text-decoration: none;" class="">
                      <span style="caret-color: rgb(0, 0, 0);
                        font-family: Helvetica; font-size: 12px;
                        font-style: normal; font-variant-caps: normal;
                        font-weight: 400; letter-spacing: normal;
                        text-align: start; text-indent: 0px;
                        text-transform: none; white-space: normal;
                        word-spacing: 0px; -webkit-text-stroke-width:
                        0px; text-decoration: none; float: none;
                        display: inline !important;" class="">is gone -
                        we usually avoid this by</span><br style="caret-color: rgb(0, 0, 0); font-family:
                        Helvetica; font-size: 12px; font-style: normal;
                        font-variant-caps: normal; font-weight: 400;
                        letter-spacing: normal; text-align: start;
                        text-indent: 0px; text-transform: none;
                        white-space: normal; word-spacing: 0px;
                        -webkit-text-stroke-width: 0px; text-decoration:
                        none;" class="">
                      <span style="caret-color: rgb(0, 0, 0);
                        font-family: Helvetica; font-size: 12px;
                        font-style: normal; font-variant-caps: normal;
                        font-weight: 400; letter-spacing: normal;
                        text-align: start; text-indent: 0px;
                        text-transform: none; white-space: normal;
                        word-spacing: 0px; -webkit-text-stroke-width:
                        0px; text-decoration: none; float: none;
                        display: inline !important;" class="">first
                        trying to do all HW shutdowns early before PCI
                        remove completion</span><br style="caret-color:
                        rgb(0, 0, 0); font-family: Helvetica; font-size:
                        12px; font-style: normal; font-variant-caps:
                        normal; font-weight: 400; letter-spacing:
                        normal; text-align: start; text-indent: 0px;
                        text-transform: none; white-space: normal;
                        word-spacing: 0px; -webkit-text-stroke-width:
                        0px; text-decoration: none;" class="">
                      <span style="caret-color: rgb(0, 0, 0);
                        font-family: Helvetica; font-size: 12px;
                        font-style: normal; font-variant-caps: normal;
                        font-weight: 400; letter-spacing: normal;
                        text-align: start; text-indent: 0px;
                        text-transform: none; white-space: normal;
                        word-spacing: 0px; -webkit-text-stroke-width:
                        0px; text-decoration: none; float: none;
                        display: inline !important;" class="">but when
                        it's really tricky by</span><br style="caret-color: rgb(0, 0, 0); font-family:
                        Helvetica; font-size: 12px; font-style: normal;
                        font-variant-caps: normal; font-weight: 400;
                        letter-spacing: normal; text-align: start;
                        text-indent: 0px; text-transform: none;
                        white-space: normal; word-spacing: 0px;
                        -webkit-text-stroke-width: 0px; text-decoration:
                        none;" class="">
                      <span style="caret-color: rgb(0, 0, 0);
                        font-family: Helvetica; font-size: 12px;
                        font-style: normal; font-variant-caps: normal;
                        font-weight: 400; letter-spacing: normal;
                        text-align: start; text-indent: 0px;
                        text-transform: none; white-space: normal;
                        word-spacing: 0px; -webkit-text-stroke-width:
                        0px; text-decoration: none; float: none;
                        display: inline !important;" class="">protecting
                        HW access sections with drm_dev_enter/exit
                        scope.</span><br style="caret-color: rgb(0, 0,
                        0); font-family: Helvetica; font-size: 12px;
                        font-style: normal; font-variant-caps: normal;
                        font-weight: 400; letter-spacing: normal;
                        text-align: start; text-indent: 0px;
                        text-transform: none; white-space: normal;
                        word-spacing: 0px; -webkit-text-stroke-width:
                        0px; text-decoration: none;" class="">
                      <br style="caret-color: rgb(0, 0, 0); font-family:
                        Helvetica; font-size: 12px; font-style: normal;
                        font-variant-caps: normal; font-weight: 400;
                        letter-spacing: normal; text-align: start;
                        text-indent: 0px; text-transform: none;
                        white-space: normal; word-spacing: 0px;
                        -webkit-text-stroke-width: 0px; text-decoration:
                        none;" class="">
                      <span style="caret-color: rgb(0, 0, 0);
                        font-family: Helvetica; font-size: 12px;
                        font-style: normal; font-variant-caps: normal;
                        font-weight: 400; letter-spacing: normal;
                        text-align: start; text-indent: 0px;
                        text-transform: none; white-space: normal;
                        word-spacing: 0px; -webkit-text-stroke-width:
                        0px; text-decoration: none; float: none;
                        display: inline !important;" class="">For this
                        particular error it would be best to flush</span><br style="caret-color: rgb(0, 0, 0); font-family:
                        Helvetica; font-size: 12px; font-style: normal;
                        font-variant-caps: normal; font-weight: 400;
                        letter-spacing: normal; text-align: start;
                        text-indent: 0px; text-transform: none;
                        white-space: normal; word-spacing: 0px;
                        -webkit-text-stroke-width: 0px; text-decoration:
                        none;" class="">
                      <span style="caret-color: rgb(0, 0, 0);
                        font-family: Helvetica; font-size: 12px;
                        font-style: normal; font-variant-caps: normal;
                        font-weight: 400; letter-spacing: normal;
                        text-align: start; text-indent: 0px;
                        text-transform: none; white-space: normal;
                        word-spacing: 0px; -webkit-text-stroke-width:
                        0px; text-decoration: none; float: none;
                        display: inline !important;" class="">info->restore_userptr_work
                        before the end of</span><br style="caret-color:
                        rgb(0, 0, 0); font-family: Helvetica; font-size:
                        12px; font-style: normal; font-variant-caps:
                        normal; font-weight: 400; letter-spacing:
                        normal; text-align: start; text-indent: 0px;
                        text-transform: none; white-space: normal;
                        word-spacing: 0px; -webkit-text-stroke-width:
                        0px; text-decoration: none;" class="">
                      <span style="caret-color: rgb(0, 0, 0);
                        font-family: Helvetica; font-size: 12px;
                        font-style: normal; font-variant-caps: normal;
                        font-weight: 400; letter-spacing: normal;
                        text-align: start; text-indent: 0px;
                        text-transform: none; white-space: normal;
                        word-spacing: 0px; -webkit-text-stroke-width:
                        0px; text-decoration: none; float: none;
                        display: inline !important;" class="">amdgpu_pci_remove
                        (rejecting new process creation and calling</span><br style="caret-color: rgb(0, 0, 0); font-family:
                        Helvetica; font-size: 12px; font-style: normal;
                        font-variant-caps: normal; font-weight: 400;
                        letter-spacing: normal; text-align: start;
                        text-indent: 0px; text-transform: none;
                        white-space: normal; word-spacing: 0px;
                        -webkit-text-stroke-width: 0px; text-decoration:
                        none;" class="">
                      <span style="caret-color: rgb(0, 0, 0);
                        font-family: Helvetica; font-size: 12px;
                        font-style: normal; font-variant-caps: normal;
                        font-weight: 400; letter-spacing: normal;
                        text-align: start; text-indent: 0px;
                        text-transform: none; white-space: normal;
                        word-spacing: 0px; -webkit-text-stroke-width:
                        0px; text-decoration: none; float: none;
                        display: inline !important;" class="">cancel_delayed_work_sync(&process_info->restore_userptr_work)
                        for all</span><br style="caret-color: rgb(0, 0,
                        0); font-family: Helvetica; font-size: 12px;
                        font-style: normal; font-variant-caps: normal;
                        font-weight: 400; letter-spacing: normal;
                        text-align: start; text-indent: 0px;
                        text-transform: none; white-space: normal;
                        word-spacing: 0px; -webkit-text-stroke-width:
                        0px; text-decoration: none;" class="">
                      <span style="caret-color: rgb(0, 0, 0);
                        font-family: Helvetica; font-size: 12px;
                        font-style: normal; font-variant-caps: normal;
                        font-weight: 400; letter-spacing: normal;
                        text-align: start; text-indent: 0px;
                        text-transform: none; white-space: normal;
                        word-spacing: 0px; -webkit-text-stroke-width:
                        0px; text-decoration: none; float: none;
                        display: inline !important;" class="">running
                        processes)</span><br style="caret-color: rgb(0,
                        0, 0); font-family: Helvetica; font-size: 12px;
                        font-style: normal; font-variant-caps: normal;
                        font-weight: 400; letter-spacing: normal;
                        text-align: start; text-indent: 0px;
                        text-transform: none; white-space: normal;
                        word-spacing: 0px; -webkit-text-stroke-width:
                        0px; text-decoration: none;" class="">
                      <span style="caret-color: rgb(0, 0, 0);
                        font-family: Helvetica; font-size: 12px;
                        font-style: normal; font-variant-caps: normal;
                        font-weight: 400; letter-spacing: normal;
                        text-align: start; text-indent: 0px;
                        text-transform: none; white-space: normal;
                        word-spacing: 0px; -webkit-text-stroke-width:
                        0px; text-decoration: none; float: none;
                        display: inline !important;" class="">somewhere
                        in amdgpu_pci_remove.</span><br style="caret-color: rgb(0, 0, 0); font-family:
                        Helvetica; font-size: 12px; font-style: normal;
                        font-variant-caps: normal; font-weight: 400;
                        letter-spacing: normal; text-align: start;
                        text-indent: 0px; text-transform: none;
                        white-space: normal; word-spacing: 0px;
                        -webkit-text-stroke-width: 0px; text-decoration:
                        none;" class="">
                      <br class="">
                    </div>
                  </blockquote>
                  I tried something like *kfd_process_ref_release* which
                  I think did what you described, but it did not work.</div>
              </blockquote>
              <p class=""><br class="">
              </p>
              <p class="">I don't see how kfd_process_ref_release is the
                same as what I mentioned above; what I meant is calling the
                code above within kgd2kfd_suspend (where you tried to
                call kfd_kill_all_user_processes below)
                <br class="">
              </p>
            </div>
          </div>
        </blockquote>
        Yes, you are right. It was not called by it.  <br class="">
        <blockquote type="cite" class="">
          <div class="">
            <div class="">
              <p class=""><br class="">
              </p>
              <blockquote type="cite" cite="mid:5A64FAEA-CCE8-4EB6-8E7B-852D4F384255@microsoft.com" class="">
                <div class=""><br class="">
                </div>
                <div class="">Instead I tried to kill the process from
                  the kernel, but the amdgpu could **only** be
                  hot-plugged back in successfully if there was no
                  rocm kernel running when it was plugged out. If not,
                  amdgpu_probe will just hang later. (Maybe because
                  amdgpu was plugged out while in a running state, it leaves
                  a bad HW state which causes probe to hang).</div>
              </blockquote>
              <p class=""><br class="">
              </p>
              <p class="">We usually do asic_reset during probe to reset
                all HW state (check if
                amdgpu_device_init->amdgpu_asic_reset is running when
                you plug back).
                <br class="">
              </p>
            </div>
          </div>
        </blockquote>
        OK<br class="">
        <blockquote type="cite" class="">
          <div class="">
            <div class="">
              <p class="">  <br class="">
              </p>
              <blockquote type="cite" cite="mid:5A64FAEA-CCE8-4EB6-8E7B-852D4F384255@microsoft.com" class="">
                <div class=""><br class="">
                </div>
                <div class="">I don’t know if this is a viable solution
                  worth pursuing, but I attached the diff anyway.</div>
                <div class=""><br class="">
                </div>
                <div class="">Another solution could be to let the compute
                  stack user mode detect a topology change via a <span class="">generation_count change, and abort
                    gracefully there.</span></div>
                <div class=""><br class="">
                </div>
                <div class="">diff --git
                  a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
                  b/drivers/gpu/drm/amd/amdkfd/kfd_device.c</div>
                <div class="">index 4e7d9cb09a69..79b4c9b84cd0 100644</div>
                <div class="">---
                  a/drivers/gpu/drm/amd/amdkfd/kfd_device.c</div>
                <div class="">+++
                  b/drivers/gpu/drm/amd/amdkfd/kfd_device.c</div>
                <div class="">@@ -697,12 +697,15 @@ void
                  kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm, bool
                  force)</div>
                <div class="">                return;</div>
                <div class=""><br class="">
                </div>
                <div class="">        /* for runtime suspend, skip
                  locking kfd */</div>
                <div class="">-       if (!run_pm) {</div>
                <div class="">+       if (!run_pm &&
                  !drm_dev_is_unplugged(kfd->ddev)) {</div>
                <div class="">                /* For first KFD device
                  suspend all the KFD processes */</div>
                <div class="">                if
                  (atomic_inc_return(&kfd_locked) == 1)</div>
                <div class="">                       
                  kfd_suspend_all_processes(force);</div>
                <div class="">        }</div>
                <div class=""><br class="">
                </div>
                <div class="">+       if
                  (drm_dev_is_unplugged(kfd->ddev))</div>
                <div class="">+              
                  kfd_kill_all_user_processes();</div>
                <div class="">+</div>
                <div class="">       
                  kfd->dqm->ops.stop(kfd->dqm);</div>
                <div class="">        kfd_iommu_suspend(kfd);</div>
                <div class=""> }</div>
                <div class="">diff --git
                  a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
                  b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h</div>
                <div class="">index 55c9e1922714..84cbcd857856 100644</div>
                <div class="">---
                  a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h</div>
                <div class="">+++
                  b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h</div>
                <div class="">@@ -1065,6 +1065,7 @@ void
                  kfd_unref_process(struct kfd_process *p);</div>
                <div class=""> int kfd_process_evict_queues(struct
                  kfd_process *p, bool force);</div>
                <div class=""> int kfd_process_restore_queues(struct
                  kfd_process *p);</div>
                <div class=""> void kfd_suspend_all_processes(bool
                  force);</div>
                <div class="">+void kfd_kill_all_user_processes(void);</div>
                <div class=""> /*</div>
                <div class="">  * kfd_resume_all_processes:</div>
                <div class="">  *     bool sync: If
                  kfd_resume_all_processes() should wait for the</div>
                <div class="">diff --git
                  a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
                  b/drivers/gpu/drm/amd/amdkfd/kfd_process.c</div>
                <div class="">index 6cdc855abb6d..fb0c753b682c 100644</div>
                <div class="">---
                  a/drivers/gpu/drm/amd/amdkfd/kfd_process.c</div>
                <div class="">+++
                  b/drivers/gpu/drm/amd/amdkfd/kfd_process.c</div>
                <div class="">@@ -2206,6 +2206,24 @@ void
                  kfd_suspend_all_processes(bool force)</div>
                <div class="">       
                  srcu_read_unlock(&kfd_processes_srcu, idx);</div>
                <div class=""> }</div>
                <div class=""><br class="">
                </div>
                <div class="">+</div>
                <div class="">+void kfd_kill_all_user_processes(void)</div>
                <div class="">+{</div>
                <div class="">+       struct kfd_process *p;</div>
                <div class="">+       struct amdkfd_process_info
                  *p_info;</div>
                <div class="">+       unsigned int temp;</div>
                <div class="">+       int idx =
                  srcu_read_lock(&kfd_processes_srcu);</div>
                <div class="">+</div>
                <div class="">+       pr_info("Killing all
                  processes\n");</div>
                <div class="">+      
                  hash_for_each_rcu(kfd_processes_table, temp, p,
                  kfd_processes) {</div>
                <div class="">+               p_info =
                  p->kgd_process_info;</div>
                <div class="">+               pr_info("Killing
                   processes, pid = %d", pid_nr(p_info->pid));</div>
                <div class="">+               kill_pid(p_info->pid,
                  SIGBUS, 1);</div>
              </blockquote>
              <p class=""><br class="">
              </p>
              <p class="">From looking into kill_pid I see it only sends
                a signal but doesn't wait for completion, it would make
                sense to wait for completion here. In any case I would
                actually try to put here<span style="caret-color: rgb(0,
                  0, 0); font-family: Helvetica; font-size: 12px;
                  font-style: normal; font-variant-caps: normal;
                  font-weight: 400; letter-spacing: normal; text-align:
                  start; text-indent: 0px; text-transform: none;
                  white-space: normal; word-spacing: 0px;
                  -webkit-text-stroke-width: 0px; text-decoration: none;
                  float: none; display: inline !important;" class=""><br class="">
                </span></p>
            </div>
          </div>
        </blockquote>
        I have made a version which does that with some atomic counters.
        Please read later in the diff.<br class="">
        <blockquote type="cite" class="">
          <div class="">
            <div class="">
              <p class=""><span style="caret-color: rgb(0, 0, 0);
                  font-family: Helvetica; font-size: 12px; font-style:
                  normal; font-variant-caps: normal; font-weight: 400;
                  letter-spacing: normal; text-align: start;
                  text-indent: 0px; text-transform: none; white-space:
                  normal; word-spacing: 0px; -webkit-text-stroke-width:
                  0px; text-decoration: none; float: none; display:
                  inline !important;" class=""></span><span style="caret-color: rgb(0, 0, 0); font-family:
                  Helvetica; font-size: 12px; font-style: normal;
                  font-variant-caps: normal; font-weight: 400;
                  letter-spacing: normal; text-align: start;
                  text-indent: 0px; text-transform: none; white-space:
                  normal; word-spacing: 0px; -webkit-text-stroke-width:
                  0px; text-decoration: none; float: none; display:
                  inline !important;" class=""><font class="" size="4"><br class="">
                  </font></span></p>
              <p class=""><span style="caret-color: rgb(0, 0, 0);
                  font-family: Helvetica; font-size: 12px; font-style:
                  normal; font-variant-caps: normal; font-weight: 400;
                  letter-spacing: normal; text-align: start;
                  text-indent: 0px; text-transform: none; white-space:
                  normal; word-spacing: 0px; -webkit-text-stroke-width:
                  0px; text-decoration: none; float: none; display:
                  inline !important;" class=""><font class="" size="4">hash_for_each_rcu(</font></span><span style="caret-color: rgb(0, 0, 0); font-family:
                  Helvetica; font-size: 12px; font-style: normal;
                  font-variant-caps: normal; font-weight: 400;
                  letter-spacing: normal; text-align: start;
                  text-indent: 0px; text-transform: none; white-space:
                  normal; word-spacing: 0px; -webkit-text-stroke-width:
                  0px; text-decoration: none; float: none; display:
                  inline !important;" class=""><font class="" size="4">p_info)
                       <br class="">
                        cancel_delayed_work_sync(&</font></span><span style="caret-color: rgb(0, 0, 0); font-family:
                  Helvetica; font-size: 12px; font-style: normal;
                  font-variant-caps: normal; font-weight: 400;
                  letter-spacing: normal; text-align: start;
                  text-indent: 0px; text-transform: none; white-space:
                  normal; word-spacing: 0px; -webkit-text-stroke-width:
                  0px; text-decoration: none; float: none; display:
                  inline !important;" class=""><font class="" size="4"><span style="caret-color: rgb(0, 0, 0); font-family:
                      Helvetica; font-size: 12px; font-style: normal;
                      font-variant-caps: normal; font-weight: 400;
                      letter-spacing: normal; text-align: start;
                      text-indent: 0px; text-transform: none;
                      white-space: normal; word-spacing: 0px;
                      -webkit-text-stroke-width: 0px; text-decoration:
                      none; float: none; display: inline !important;" class=""></span><span style="caret-color: rgb(0,
                      0, 0); font-family: Helvetica; font-size: 12px;
                      font-style: normal; font-variant-caps: normal;
                      font-weight: 400; letter-spacing: normal;
                      text-align: start; text-indent: 0px;
                      text-transform: none; white-space: normal;
                      word-spacing: 0px; -webkit-text-stroke-width: 0px;
                      text-decoration: none; float: none; display:
                      inline !important;" class=""><font class="" size="4">p_info</font></span>->restore_userptr_work)
                    <br class="">
                  </font></span></p>
              <p class=""><span style="caret-color: rgb(0, 0, 0);
                  font-family: Helvetica; font-size: 12px; font-style:
                  normal; font-variant-caps: normal; font-weight: 400;
                  letter-spacing: normal; text-align: start;
                  text-indent: 0px; text-transform: none; white-space:
                  normal; word-spacing: 0px; -webkit-text-stroke-width:
                  0px; text-decoration: none; float: none; display:
                  inline !important;" class=""><font class="" size="4">instead;
                    at least that is what I meant in the previous mail. </font></span></p>
            </div>
          </div>
        </blockquote>
        <div>I actually tried that earlier, and it did not work. The
          application still keeps running, and you have to send a kill
          to the user process.</div>
        <div><br class="">
        </div>
        <div>I have made the following version. It waits for processes
          to terminate synchronously after sending SIGBUS. After that it
          does the real work of amdgpu_pci_remove.</div>
        <div>However, it hangs at amdgpu_device_ip_fini_early when it is
          trying to deinit ip_block 6 &lt;sdma_v4_0&gt; (<a href="https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgitlab.freedesktop.org%2Fagd5f%2Flinux%2F-%2Fblob%2Famd-staging-drm-next%2Fdrivers%2Fgpu%2Fdrm%2Famd%2Famdgpu%2Famdgpu_device.c%23L2818&data=04%7C01%7Candrey.grodzovsky%40amd.com%7C37a2503747384d07944608da1e1f37ee%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637855416726313174%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000&sdata=FBdaleltc3PbJRmaWr8D3gxU7zuZ7n%2Bcu7J2yUrzD1I%3D&reserved=0" originalsrc="https://gitlab.freedesktop.org/agd5f/linux/-/blob/amd-staging-drm-next/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c#L2818" shash="EB2/NmIDxiErtBtyih/irqtozZF9gQENs53MuIUA9XCdWismupBtNvUDcK8Lb5R+ZDzoiW/ZPcYbF4heDWzSRaJ9AJhvmC2723gxxYKAqxInevDmIOhdpFC17ijrfbnrM0eHrEtGgO/AlVjEPBiX+xafHURLJUhHFuQO8J2xTSk=" class="" moz-do-not-send="true">https://gitlab.freedesktop.org/agd5f/linux/-/blob/amd-staging-drm-next/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c#L2818</a>).
          I assume that there is still some in-flight DMA, and that is
          why the fini of this IP block hangs? </div>
        <div><br class="">
        </div>
        <div>The following is an excerpt of the dmesg. Please excuse me for
          adding my own pr_info calls, but I hope they show where
          it hangs.</div>
        <div><br class="">
        </div>
        <div>
          <div>[  392.344735] amdgpu: all processes has been fully
            released</div>
          <div>[  392.346557] amdgpu: amdgpu_acpi_fini done</div>
          <div>[  392.346568] amdgpu 0000:b3:00.0: amdgpu: amdgpu:
            finishing device.</div>
          <div>[  392.349238] amdgpu: amdgpu_device_ip_fini_early enter
            ip_blocks = 9</div>
          <div>[  392.349248] amdgpu: Free mem_obj = 000000007bf54275,
            range_start = 14, range_end = 14</div>
          <div>[  392.350299] amdgpu: Free mem_obj = 00000000a85bc878,
            range_start = 12, range_end = 12</div>
          <div>[  392.350304] amdgpu: Free mem_obj = 00000000b8019e32,
            range_start = 13, range_end = 13</div>
          <div>[  392.350308] amdgpu: Free mem_obj = 000000002d296168,
            range_start = 4, range_end = 11</div>
          <div>[  392.350313] amdgpu: Free mem_obj = 000000001fc4f934,
            range_start = 0, range_end = 3</div>
          <div>[  392.350322] amdgpu: amdgpu_amdkfd_suspend(adev, false)
            done</div>
          <div>[  392.350672] amdgpu: hw_fini of IP block[8]
            &lt;jpeg_v2_5&gt; done 0</div>
          <div>[  392.350679] amdgpu: hw_fini of IP block[7]
            &lt;vcn_v2_5&gt; done 0</div>
        </div>
      </div>
    </blockquote>
    <p><br>
    </p>
    <p>I just remembered that the idea to actively kill and wait for
      running user processes during unplug was rejected<br>
      as a bad idea in the first iteration of the unplug work I did (I don't
      remember why now, need to look), so this is a no go.<br>
      Our policy is to let zombie processes (zombie in the sense that the
      underlying device is gone) live as long as they want <br>
      (as long as you are able to terminate them - which you can, so that's ok)<br>
      and the system should finish PCI remove gracefully and be able to
      hot plug the device back. Hence, I suggest dropping<br>
      this direction of forcing all user processes to be killed; confirm
      you have a graceful shutdown and removal of the device<br>
      from the PCI topology, and then concentrate on why it hangs when you
      plug it back. First, confirm whether an ASIC reset happens on<br>
      the next init. Second, please confirm whether the timing at which you
      manually kill the user process has an impact on whether you get a hang<br>
      on the next plug back (does it make a difference if you kill it before
      or after plugging back?). <br>
    </p>
    <p>Andrey</p>
    <p><br>
    </p>
    <blockquote type="cite" cite="mid:B7970589-ACF6-41F3-8622-1C0F705F3EE0@microsoft.com">
      <div>
        <div>
        </div>
        <div><br class="">
        </div>
        <div><br class="">
        </div>
        <div>
          <div>diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
            b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c</div>
          <div>index 8fa9b86ac9d2..c0b27f722281 100644</div>
          <div>--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c</div>
          <div>+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c</div>
          <div>@@ -188,6 +188,12 @@ void amdgpu_amdkfd_interrupt(struct
            amdgpu_device *adev,</div>
          <div> <span class="Apple-tab-span" style="white-space:pre"> </span>kgd2kfd_interrupt(adev->kfd.dev,
            ih_ring_entry);</div>
          <div> }</div>
          <div> </div>
          <div>+void amdgpu_amdkfd_kill_all_processes(struct
            amdgpu_device *adev)</div>
          <div>+{</div>
          <div>+<span class="Apple-tab-span" style="white-space:pre"> </span>if
            (adev->kfd.dev)</div>
          <div>+<span class="Apple-tab-span" style="white-space:pre"> </span>kgd2kfd_kill_all_user_processes(adev->kfd.dev);</div>
          <div>+}</div>
          <div>+</div>
          <div> void amdgpu_amdkfd_suspend(struct amdgpu_device *adev,
            bool run_pm)</div>
          <div> {</div>
          <div> <span class="Apple-tab-span" style="white-space:pre"> </span>if
            (adev->kfd.dev)</div>
          <div>diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
            b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h</div>
          <div>index 27c74fcec455..f4e485d60442 100644</div>
          <div>--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h</div>
          <div>+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h</div>
          <div>@@ -141,6 +141,7 @@ struct amdkfd_process_info {</div>
          <div> int amdgpu_amdkfd_init(void);</div>
          <div> void amdgpu_amdkfd_fini(void);</div>
          <div> </div>
          <div>+void amdgpu_amdkfd_kill_all_processes(struct
            amdgpu_device *adev);</div>
          <div> void amdgpu_amdkfd_suspend(struct amdgpu_device *adev,
            bool run_pm);</div>
          <div> int amdgpu_amdkfd_resume_iommu(struct amdgpu_device
            *adev);</div>
          <div> int amdgpu_amdkfd_resume(struct amdgpu_device *adev,
            bool run_pm, bool sync);</div>
          <div>@@ -405,6 +406,7 @@ bool kgd2kfd_device_init(struct
            kfd_dev *kfd,</div>
          <div> <span class="Apple-tab-span" style="white-space:pre"> </span>const
            struct kgd2kfd_shared_resources *gpu_resources);</div>
          <div> void kgd2kfd_device_exit(struct kfd_dev *kfd);</div>
          <div> void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm,
            bool force);</div>
          <div>+void kgd2kfd_kill_all_user_processes(struct kfd_dev
            *kfd);</div>
          <div> int kgd2kfd_resume_iommu(struct kfd_dev *kfd);</div>
          <div> int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm,
            bool sync);</div>
          <div> int kgd2kfd_pre_reset(struct kfd_dev *kfd);</div>
          <div>@@ -443,6 +445,9 @@ static inline void
            kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm, bool
            force)</div>
          <div> {</div>
          <div> }</div>
          <div> </div>
          <div>+void kgd2kfd_kill_all_user_processes(struct kfd_dev
            *kfd){</div>
          <div>+}</div>
          <div>+</div>
          <div> static int __maybe_unused kgd2kfd_resume_iommu(struct
            kfd_dev *kfd)</div>
          <div> {</div>
          <div> <span class="Apple-tab-span" style="white-space:pre"> </span>return
            0;</div>
          <div>diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
            b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c</div>
          <div>index 3d5fc0751829..af6fe5080cfa 100644</div>
          <div>--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c</div>
          <div>+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c</div>
          <div>@@ -2101,6 +2101,9 @@ amdgpu_pci_remove(struct pci_dev
            *pdev)</div>
          <div> {</div>
          <div> <span class="Apple-tab-span" style="white-space:pre"> </span>struct
            drm_device *dev = pci_get_drvdata(pdev);</div>
          <div> </div>
          <div>+<span class="Apple-tab-span" style="white-space:pre"> </span>/*
            kill all kfd processes before drm_dev_unplug */</div>
          <div>+<span class="Apple-tab-span" style="white-space:pre"> </span>amdgpu_amdkfd_kill_all_processes(drm_to_adev(dev));</div>
          <div>+</div>
          <div> #ifdef HAVE_DRM_DEV_UNPLUG</div>
          <div> <span class="Apple-tab-span" style="white-space:pre"> </span>drm_dev_unplug(dev);</div>
          <div> #else</div>
          <div>diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
            b/drivers/gpu/drm/amd/amdkfd/kfd_device.c</div>
          <div>index 5504a18b5a45..480c23bef5e2 100644</div>
          <div>--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c</div>
          <div>+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c</div>
          <div>@@ -691,6 +691,12 @@ bool kfd_is_locked(void)</div>
          <div> <span class="Apple-tab-span" style="white-space:pre"> </span>return
             (atomic_read(&kfd_locked) > 0);</div>
          <div> }</div>
          <div> </div>
          <div>+inline void kgd2kfd_kill_all_user_processes(struct
            kfd_dev* dev)</div>
          <div>+{</div>
          <div>+<span class="Apple-tab-span" style="white-space:pre"> </span>kfd_kill_all_user_processes();</div>
          <div>+}</div>
          <div>+</div>
          <div>+</div>
          <div> void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm,
            bool force)</div>
          <div> {</div>
          <div> <span class="Apple-tab-span" style="white-space:pre"> </span>if
            (!kfd->init_complete)</div>
          <div>diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
            b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h</div>
          <div>index 55c9e1922714..a35a2cb5bb9f 100644</div>
          <div>--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h</div>
          <div>+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h</div>
          <div>@@ -1064,6 +1064,7 @@ static inline struct
            kfd_process_device *kfd_process_device_from_gpuidx(</div>
          <div> void kfd_unref_process(struct kfd_process *p);</div>
          <div> int kfd_process_evict_queues(struct kfd_process *p, bool
            force);</div>
          <div> int kfd_process_restore_queues(struct kfd_process *p);</div>
          <div>+void kfd_kill_all_user_processes(void);</div>
          <div> void kfd_suspend_all_processes(bool force);</div>
          <div> /*</div>
          <div>  * kfd_resume_all_processes:</div>
          <div>diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
            b/drivers/gpu/drm/amd/amdkfd/kfd_process.c</div>
          <div>index 6cdc855abb6d..17e769e6951d 100644</div>
          <div>--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c</div>
          <div>+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c</div>
          <div>@@ -46,6 +46,9 @@ struct mm_struct;</div>
          <div> #include "kfd_trace.h"</div>
          <div> #include "kfd_debug.h"</div>
          <div> </div>
          <div>+static atomic_t kfd_process_locked = ATOMIC_INIT(0);</div>
          <div>+static atomic_t kfd_inflight_kills = ATOMIC_INIT(0);</div>
          <div>+</div>
          <div> /*</div>
          <div>  * List of struct kfd_process (field kfd_process).</div>
          <div>  * Unique/indexed by mm_struct*</div>
          <div>@@ -802,6 +805,9 @@ struct kfd_process
            *kfd_create_process(struct task_struct *thread)</div>
          <div> <span class="Apple-tab-span" style="white-space:pre"> </span>struct
            kfd_process *process;</div>
          <div> <span class="Apple-tab-span" style="white-space:pre"> </span>int
            ret;</div>
          <div> </div>
          <div>+<span class="Apple-tab-span" style="white-space:pre"> </span>if
            ( atomic_read(&kfd_process_locked) > 0 )</div>
          <div>+<span class="Apple-tab-span" style="white-space:pre"> </span>return
            ERR_PTR(-EINVAL);</div>
          <div>+</div>
          <div> <span class="Apple-tab-span" style="white-space:pre"> </span>if
            (!(thread->mm && mmget_not_zero(thread->mm)))</div>
          <div> <span class="Apple-tab-span" style="white-space:pre"> </span>return
            ERR_PTR(-EINVAL);</div>
          <div> </div>
          <div>@@ -1126,6 +1132,10 @@ static void
            kfd_process_wq_release(struct work_struct *work)</div>
          <div> <span class="Apple-tab-span" style="white-space:pre"> </span>put_task_struct(p->lead_thread);</div>
          <div> </div>
          <div> <span class="Apple-tab-span" style="white-space:pre"> </span>kfree(p);</div>
          <div>+</div>
          <div>+<span class="Apple-tab-span" style="white-space:pre"> </span>if
            ( atomic_read(&kfd_process_locked) > 0 ){</div>
          <div>+<span class="Apple-tab-span" style="white-space:pre"> </span>atomic_dec(&kfd_inflight_kills);</div>
          <div>+<span class="Apple-tab-span" style="white-space:pre"> </span>}</div>
          <div> }</div>
          <div> </div>
          <div> static void kfd_process_ref_release(struct kref *ref)</div>
          <div>@@ -2186,6 +2196,35 @@ static void
            restore_process_worker(struct work_struct *work)</div>
          <div> <span class="Apple-tab-span" style="white-space:pre"> </span>pr_err("Failed
            to restore queues of pasid 0x%x\n", p->pasid);</div>
          <div> }</div>
          <div> </div>
          <div>+void kfd_kill_all_user_processes(void)</div>
          <div>+{</div>
          <div>+<span class="Apple-tab-span" style="white-space:pre"> </span>struct
            kfd_process *p;</div>
          <div>+<span class="Apple-tab-span" style="white-space:pre"> </span>/*
            struct amdkfd_process_info *p_info; */</div>
          <div>+<span class="Apple-tab-span" style="white-space:pre"> </span>unsigned
            int temp;</div>
          <div>+<span class="Apple-tab-span" style="white-space:pre"> </span>int
            idx;</div>
          <div>+<span class="Apple-tab-span" style="white-space:pre"> </span>atomic_inc(&kfd_process_locked);</div>
          <div>+</div>
          <div>+<span class="Apple-tab-span" style="white-space:pre"> </span>idx
            = srcu_read_lock(&kfd_processes_srcu);</div>
          <div>+<span class="Apple-tab-span" style="white-space:pre"> </span>pr_info("Killing
            all processes\n");</div>
          <div>+<span class="Apple-tab-span" style="white-space:pre"> </span>hash_for_each_rcu(kfd_processes_table,
            temp, p, kfd_processes) {</div>
          <div>+<span class="Apple-tab-span" style="white-space:pre"> </span>dev_warn(kfd_device,</div>
          <div>+<span class="Apple-tab-span" style="white-space:pre"> </span>"Sending
            SIGBUS to process %d (pasid 0x%x)",</div>
          <div>+<span class="Apple-tab-span" style="white-space:pre"> </span>p->lead_thread->pid,
            p->pasid);</div>
          <div>+<span class="Apple-tab-span" style="white-space:pre"> </span>send_sig(SIGBUS,
            p->lead_thread, 0);</div>
          <div>+<span class="Apple-tab-span" style="white-space:pre"> </span>atomic_inc(&kfd_inflight_kills);</div>
          <div>+<span class="Apple-tab-span" style="white-space:pre"> </span>}</div>
          <div>+<span class="Apple-tab-span" style="white-space:pre"> </span>srcu_read_unlock(&kfd_processes_srcu,
            idx);</div>
          <div>+</div>
          <div>+<span class="Apple-tab-span" style="white-space:pre"> </span>while
            ( atomic_read(&kfd_inflight_kills) > 0 ){</div>
          <div>+<span class="Apple-tab-span" style="white-space:pre"> </span>dev_warn(kfd_device,</div>
          <div>+<span class="Apple-tab-span" style="white-space:pre"> </span>"kfd_processes_table
            is not empty, going to sleep for 10ms\n");</div>
          <div>+<span class="Apple-tab-span" style="white-space:pre"> </span>msleep(10);</div>
          <div>+<span class="Apple-tab-span" style="white-space:pre"> </span>}</div>
          <div>+</div>
          <div>+<span class="Apple-tab-span" style="white-space:pre"> </span>atomic_dec(&kfd_process_locked);</div>
          <div>+<span class="Apple-tab-span" style="white-space:pre"> </span>pr_info("all
            processes has been fully released");</div>
          <div>+}</div>
          <div>+</div>
          <div> void kfd_suspend_all_processes(bool force)</div>
          <div> {</div>
          <div> <span class="Apple-tab-span" style="white-space:pre"> </span>struct
            kfd_process *p;</div>
          <div class=""><br class="">
          </div>
          <div class=""><br class="">
          </div>
        </div>
        <div><br class="">
        </div>
        <div>Regards,</div>
        <div>Shuotao</div>
        <br class="">
        <blockquote type="cite" class="">
          <div class="">
            <div class="">
              <p class=""><span style="caret-color: rgb(0, 0, 0);
                  font-family: Helvetica; font-size: 12px; font-style:
                  normal; font-variant-caps: normal; font-weight: 400;
                  letter-spacing: normal; text-align: start;
                  text-indent: 0px; text-transform: none; white-space:
                  normal; word-spacing: 0px; -webkit-text-stroke-width:
                  0px; text-decoration: none; float: none; display:
                  inline !important;" class=""></span></p>
              <p class=""><span style="caret-color: rgb(0, 0, 0);
                  font-family: Helvetica; font-size: 12px; font-style:
                  normal; font-variant-caps: normal; font-weight: 400;
                  letter-spacing: normal; text-align: start;
                  text-indent: 0px; text-transform: none; white-space:
                  normal; word-spacing: 0px; -webkit-text-stroke-width:
                  0px; text-decoration: none; float: none; display:
                  inline !important;" class=""><font class="" size="4">Andrey</font><br class="">
                  <br class="">
                </span></p>
              <blockquote type="cite" cite="mid:5A64FAEA-CCE8-4EB6-8E7B-852D4F384255@microsoft.com" class="">
                <div class="">+       }</div>
                <div class="">+      
                  srcu_read_unlock(&kfd_processes_srcu, idx);</div>
                <div class="">+}</div>
                <div class="">+</div>
                <div class="">+</div>
                <div class=""> int kfd_resume_all_processes(bool sync)</div>
                <div class=""> {</div>
                <div class="">        struct kfd_process *p;</div>
                <div class=""><br class="">
                </div>
                <div class=""><br class="">
                  <blockquote type="cite" class="">
                    <div class=""><span style="caret-color: rgb(0, 0,
                        0); font-family: Helvetica; font-size: 12px;
                        font-style: normal; font-variant-caps: normal;
                        font-weight: 400; letter-spacing: normal;
                        text-align: start; text-indent: 0px;
                        text-transform: none; white-space: normal;
                        word-spacing: 0px; -webkit-text-stroke-width:
                        0px; text-decoration: none; float: none;
                        display: inline !important;" class="">Andrey</span><br style="caret-color: rgb(0, 0, 0); font-family:
                        Helvetica; font-size: 12px; font-style: normal;
                        font-variant-caps: normal; font-weight: 400;
                        letter-spacing: normal; text-align: start;
                        text-indent: 0px; text-transform: none;
                        white-space: normal; word-spacing: 0px;
                        -webkit-text-stroke-width: 0px; text-decoration:
                        none;" class="">
                      <br style="caret-color: rgb(0, 0, 0); font-family:
                        Helvetica; font-size: 12px; font-style: normal;
                        font-variant-caps: normal; font-weight: 400;
                        letter-spacing: normal; text-align: start;
                        text-indent: 0px; text-transform: none;
                        white-space: normal; word-spacing: 0px;
                        -webkit-text-stroke-width: 0px; text-decoration:
                        none;" class="">
                      <br style="caret-color: rgb(0, 0, 0); font-family:
                        Helvetica; font-size: 12px; font-style: normal;
                        font-variant-caps: normal; font-weight: 400;
                        letter-spacing: normal; text-align: start;
                        text-indent: 0px; text-transform: none;
                        white-space: normal; word-spacing: 0px;
                        -webkit-text-stroke-width: 0px; text-decoration:
                        none;" class="">
                      <blockquote type="cite" style="font-family:
                        Helvetica; font-size: 12px; font-style: normal;
                        font-variant-caps: normal; font-weight: 400;
                        letter-spacing: normal; orphans: auto;
                        text-align: start; text-indent: 0px;
                        text-transform: none; white-space: normal;
                        widows: auto; word-spacing: 0px;
                        -webkit-text-size-adjust: auto;
                        -webkit-text-stroke-width: 0px; text-decoration:
                        none;" class="">
                        <br class="">
                        Really appreciate your help!<br class="">
                        <br class="">
                        Best,<br class="">
                        Shuotao<br class="">
                        <br class="">
                        <blockquote type="cite" class="">
                          <blockquote type="cite" class="">2. Remove
                            redundant p2p/io links in sysfs when device
                            is hotplugged<br class="">
                            out.<br class="">
                            <br class="">
                            3. New kfd node_id is not properly assigned
                            after a new device is<br class="">
                            added after a gpu is hotplugged out in a
                            system. libhsakmt will<br class="">
                            find this anomaly, (i.e. node_from !=
                            &lt;dev node id&gt; in iolinks),<br class="">
                            when taking a topology_snapshot, thus
                            returns fault to the rocm<br class="">
                            stack.<br class="">
                            <br class="">
                            -- This patch fixes issue 1; another patch
                            by Mukul fixes issues 2&amp;3.<br class="">
                            -- Tested on a 4-GPU MI100 node with
                            kernel 5.13.0-kfd; kernel<br class="">
                            5.16.0-kfd is unstable out of the box for MI100.<br class="">
                            ---<br class="">
                            drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c |
                            5 +++++<br class="">
                            drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h |
                            7 +++++++<br class="">
                            drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |
                            1 +<br class="">
                            drivers/gpu/drm/amd/amdkfd/kfd_device.c | 13
                            +++++++++++++<br class="">
                            4 files changed, 26 insertions(+)<br class="">
                            <br class="">
                            diff --git
                            a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
                            b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c<br class="">
                            index c18c4be1e4ac..d50011bdb5c4 100644<br class="">
                            ---
                            a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c<br class="">
                            +++
                            b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c<br class="">
                            @@ -213,6 +213,11 @@ int
                            amdgpu_amdkfd_resume(struct amdgpu_device
                            *adev, bool run_pm)<br class="">
                            return r;<br class="">
                            }<br class="">
                            <br class="">
                            +int amdgpu_amdkfd_resume_processes(void)<br class="">
                            +{<br class="">
                            + return kgd2kfd_resume_processes();<br class="">
                            +}<br class="">
                            +<br class="">
                            int amdgpu_amdkfd_pre_reset(struct
                            amdgpu_device *adev)<br class="">
                            {<br class="">
                            int r = 0;<br class="">
                            diff --git
                            a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
                            b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h<br class="">
                            index f8b9f27adcf5..803306e011c3 100644<br class="">
                            ---
                            a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h<br class="">
                            +++
                            b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h<br class="">
                            @@ -140,6 +140,7 @@ void
                            amdgpu_amdkfd_fini(void);<br class="">
                            void amdgpu_amdkfd_suspend(struct
                            amdgpu_device *adev, bool run_pm);<br class="">
                            int amdgpu_amdkfd_resume_iommu(struct
                            amdgpu_device *adev);<br class="">
                            int amdgpu_amdkfd_resume(struct
                            amdgpu_device *adev, bool run_pm);<br class="">
                            +int amdgpu_amdkfd_resume_processes(void);<br class="">
                            void amdgpu_amdkfd_interrupt(struct
                            amdgpu_device *adev,<br class="">
                            const void *ih_ring_entry);<br class="">
                            void amdgpu_amdkfd_device_probe(struct
                            amdgpu_device *adev);<br class="">
                            @@ -347,6 +348,7 @@ void
                            kgd2kfd_device_exit(struct kfd_dev *kfd);<br class="">
                            void kgd2kfd_suspend(struct kfd_dev *kfd,
                            bool run_pm);<br class="">
                            int kgd2kfd_resume_iommu(struct kfd_dev
                            *kfd);<br class="">
                            int kgd2kfd_resume(struct kfd_dev *kfd, bool
                            run_pm);<br class="">
                            +int kgd2kfd_resume_processes(void);<br class="">
                            int kgd2kfd_pre_reset(struct kfd_dev *kfd);<br class="">
                            int kgd2kfd_post_reset(struct kfd_dev *kfd);<br class="">
                            void kgd2kfd_interrupt(struct kfd_dev *kfd,
                            const void *ih_ring_entry);<br class="">
                            @@ -393,6 +395,11 @@ static inline int
                            kgd2kfd_resume(struct kfd_dev *kfd, bool
                            run_pm)<br class="">
                            return 0;<br class="">
                            }<br class="">
                            <br class="">
                            +static inline int
                            kgd2kfd_resume_processes(void)<br class="">
                            +{<br class="">
                            + return 0;<br class="">
                            +}<br class="">
                            +<br class="">
                            static inline int kgd2kfd_pre_reset(struct
                            kfd_dev *kfd)<br class="">
                            {<br class="">
                            return 0;<br class="">
                            diff --git
                            a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
                            b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c<br class="">
                            index fa4a9f13c922..5827b65b7489 100644<br class="">
                            ---
                            a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c<br class="">
                            +++
                            b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c<br class="">
                            @@ -4004,6 +4004,7 @@ void
                            amdgpu_device_fini_hw(struct amdgpu_device
                            *adev)<br class="">
                            if (drm_dev_is_unplugged(adev_to_drm(adev)))<br class="">
                            amdgpu_device_unmap_mmio(adev);<br class="">
                            <br class="">
                            + amdgpu_amdkfd_resume_processes();<br class="">
                            }<br class="">
                            <br class="">
                            void amdgpu_device_fini_sw(struct
                            amdgpu_device *adev)<br class="">
                            diff --git
                            a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
                            b/drivers/gpu/drm/amd/amdkfd/kfd_device.c<br class="">
                            index 62aa6c9d5123..ef05aae9255e 100644<br class="">
                            ---
                            a/drivers/gpu/drm/amd/amdkfd/kfd_device.c<br class="">
                            +++
                            b/drivers/gpu/drm/amd/amdkfd/kfd_device.c<br class="">
                            @@ -714,6 +714,19 @@ int
                            kgd2kfd_resume(struct kfd_dev *kfd, bool
                            run_pm)<br class="">
                            return ret;<br class="">
                            }<br class="">
                            <br class="">
                            +/* for non-runtime resume only */<br class="">
                            +int kgd2kfd_resume_processes(void)<br class="">
                            +{<br class="">
                            + int count;<br class="">
                            +<br class="">
                            + count =
                            atomic_dec_return(&amp;kfd_locked);<br class="">
                            + WARN_ONCE(count &lt; 0, "KFD suspend /
                            resume ref. error");<br class="">
                            + if (count == 0)<br class="">
                            + return kfd_resume_all_processes();<br class="">
                            +<br class="">
                            + return 0;<br class="">
                            +}<br class="">
                          </blockquote>
                          <br class="">
                          It doesn't make sense to me to just increment
                          kfd_locked in<br class="">
                          kgd2kfd_suspend to only decrement it again a
                          few functions down the<br class="">
                          road.<br class="">
                          <br class="">
                          I suggest this instead - you only increment if
                          not during PCI remove<br class="">
                          <br class="">
                          diff --git
                          a/drivers/gpu/drm/amd/amdkfd/kfd_device.c<br class="">
                          b/drivers/gpu/drm/amd/amdkfd/kfd_device.c<br class="">
                          index 1c2cf3a33c1f..7754f77248a4 100644<br class="">
                          --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c<br class="">
                          +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c<br class="">
                          @@ -952,11 +952,12 @@ bool kfd_is_locked(void)<br class="">
                          <br class="">
                          void kgd2kfd_suspend(struct kfd_dev *kfd, bool
                          run_pm)<br class="">
                          {<br class="">
                          +<br class="">
                          if (!kfd-&gt;init_complete)<br class="">
                          return;<br class="">
                          <br class="">
                          /* for runtime suspend, skip locking kfd */<br class="">
                          - if (!run_pm) {<br class="">
                          + if (!run_pm &amp;&amp;
                          !drm_dev_is_unplugged(kfd-&gt;ddev)) {<br class="">
                          /* For first KFD device suspend all the KFD
                          processes */<br class="">
                          if (atomic_inc_return(&amp;kfd_locked) == 1)<br class="">
                          kfd_suspend_all_processes();<br class="">
                          <br class="">
                          <br class="">
                          Andrey<br class="">
                          <br class="">
                          <br class="">
                          <br class="">
                          <blockquote type="cite" class="">+<br class="">
                            int kgd2kfd_resume_iommu(struct kfd_dev
                            *kfd)<br class="">
                            {<br class="">
                            int err = 0;</blockquote>
                        </blockquote>
                      </blockquote>
                    </div>
                  </blockquote>
                </div>
                <br class="">
              </blockquote>
            </div>
          </div>
        </blockquote>
      </div>
      <br class="">
    </blockquote>
  </body>
</html>