<html><head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
  </head>
  <body>
    <p><br>
    </p>
    <div class="moz-cite-prefix">On 2025-01-09 12:14, Felix Kuehling
      wrote:<br>
    </div>
    <blockquote type="cite" cite="mid:4925dfbe-9a0f-4c73-989f-d2b1d7784161@amd.com">
      <br>
      On 2025-01-08 20:11, Philip Yang wrote:
      <br>
      <blockquote type="cite">
        <br>
        <br>
        On 2025-01-07 22:08, Deng, Emily wrote:
        <br>
        <blockquote type="cite">
          <br>
          [AMD Official Use Only - AMD Internal Distribution Only]
          <br>
          <br>
          <br>
          Hi Philip,
          <br>
          <br>
          It still has the deadlock, maybe the best way is trying to
          remove the delayed free pt work.
          <br>
          <br>
          [Wed Jan  8 10:35:44 2025 <    0.000000>] INFO: task
          kfdtest:5827 blocked for more than 122 seconds.
          <br>
          <br>
          [Wed Jan  8 10:35:44 2025 <    0.000290>] Tainted:
          G           OE K   5.10.134-17.2.al8.x86_64 #1
          <br>
          <br>
          [Wed Jan  8 10:35:44 2025 <    0.000243>] "echo 0 >
          /proc/sys/kernel/hung_task_timeout_secs" disables this
          message.
          <br>
          <br>
          [Wed Jan  8 10:35:44 2025 <    0.000317>]
          task:kfdtest         state:D stack:    0 pid: 5827 ppid:  5756
          flags:0x00004080
          <br>
          <br>
          [Wed Jan  8 10:35:44 2025 <    0.000002>] Call Trace:
          <br>
          <br>
          [Wed Jan  8 10:35:44 2025 <    0.000006>]
          __schedule+0x1ba/0x490
          <br>
          <br>
          [Wed Jan  8 10:35:44 2025 <    0.000002>]  ?
          usleep_range+0x90/0x90
          <br>
          <br>
          [Wed Jan  8 10:35:44 2025 <    0.000002>]
          schedule+0x46/0xb0
          <br>
          <br>
          [Wed Jan  8 10:35:44 2025 <    0.000001>]
          schedule_timeout+0x12a/0x140
          <br>
          <br>
          [Wed Jan  8 10:35:44 2025 <    0.000003>]  ?
          __prepare_to_swait+0x4f/0x70
          <br>
          <br>
          [Wed Jan  8 10:35:44 2025 <    0.000002>]
          __wait_for_common+0xb1/0x160
          <br>
          <br>
          [Wed Jan  8 10:35:44 2025 <    0.000004>]
          flush_workqueue+0x12f/0x410
          <br>
          <br>
          [Wed Jan  8 10:35:44 2025 <    0.000126>]
          svm_range_map_to_gpu+0x1b8/0x730 [amdgpu]
          <br>
          <br>
          [Wed Jan  8 10:35:44 2025 <    0.000078>]
          svm_range_validate_and_map+0x978/0xd30 [amdgpu]
          <br>
          <br>
          [Wed Jan  8 10:35:44 2025 <    0.000065>]
          svm_range_set_attr+0x55f/0xb20 [amdgpu]
          <br>
          <br>
          [Wed Jan  8 10:35:44 2025 <    0.000060>]
          kfd_ioctl+0x208/0x540 [amdgpu]
          <br>
          <br>
          [Wed Jan  8 10:35:44 2025 <    0.000058>]  ?
          kfd_ioctl_set_xnack_mode+0xd0/0xd0 [amdgpu]
          <br>
          <br>
          [Wed Jan  8 10:35:44 2025 <    0.000004>]  ?
          vm_mmap_pgoff+0xf2/0x120
          <br>
          <br>
          [Wed Jan  8 10:35:44 2025 <    0.000002>]
          __x64_sys_ioctl+0x88/0xc0
          <br>
          <br>
          [Wed Jan  8 10:35:44 2025 <    0.000003>]
          do_syscall_64+0x2e/0x50
          <br>
          <br>
          [Wed Jan  8 10:35:44 2025 <    0.000002>]
          entry_SYSCALL_64_after_hwframe+0x62/0xc7
          <br>
          <br>
          [Wed Jan  8 10:35:44 2025 <    0.000008>] RIP:
          0033:0x7f8c472617db
          <br>
          <br>
          [Wed Jan  8 10:35:44 2025 <    0.000001>] RSP:
          002b:00007ffd2908a688 EFLAGS: 00000246 ORIG_RAX:
          0000000000000010
          <br>
          <br>
          [Wed Jan  8 10:35:44 2025 <    0.000002>] RAX:
          ffffffffffffffda RBX: 00007ffd2908a6fc RCX: 00007f8c472617db
          <br>
          <br>
          [Wed Jan  8 10:35:44 2025 <    0.000002>] RDX:
          00007ffd2908a6c0 RSI: 00000000c0384b20 RDI: 0000000000000003
          <br>
          <br>
          [Wed Jan  8 10:35:44 2025 <    0.000000>] RBP:
          00007ffd2908a6c0 R08: 0000000000000000 R09: 0000000000000000
          <br>
          <br>
          [Wed Jan  8 10:35:44 2025 <    0.000001>] R10:
          00007f70f467f000 R11: 0000000000000246 R12: 00000000c0384b20
          <br>
          <br>
          [Wed Jan  8 10:35:44 2025 <    0.000000>] R13:
          0000000000000003 R14: 0000000000200000 R15: 00007ffd2908a770
          <br>
          <br>
          [Wed Jan  8 10:35:44 2025 <    0.000003>] INFO: task
          kworker/u129:7:5942 blocked for more than 122 seconds.
          <br>
          <br>
          [Wed Jan  8 10:35:44 2025 <    0.001897>] Tainted:
          G           OE K   5.10.134-17.2.al8.x86_64 #1
          <br>
          <br>
          [Wed Jan  8 10:35:44 2025 <    0.000247>] "echo 0 >
          /proc/sys/kernel/hung_task_timeout_secs" disables this
          message.
          <br>
          <br>
          [Wed Jan  8 10:35:44 2025 <    0.000315>]
          task:kworker/u129:7  state:D stack:    0 pid: 5942 ppid:     2
          flags:0x00004080
          <br>
          <br>
          [Wed Jan  8 10:35:44 2025 <    0.000067>] Workqueue:
          amdgpu_recycle amdgpu_vm_pt_free_work [amdgpu]
          <br>
          <br>
          [Wed Jan  8 10:35:44 2025 <    0.000002>] Call Trace:
          <br>
          <br>
          [Wed Jan  8 10:35:44 2025 <    0.000003>]
          __schedule+0x1ba/0x490
          <br>
          <br>
          [Wed Jan  8 10:35:44 2025 <    0.000002>]  ?
          newidle_balance+0x16a/0x3b0
          <br>
          <br>
          [Wed Jan  8 10:35:44 2025 <    0.000001>]
          schedule+0x46/0xb0
          <br>
          <br>
          [Wed Jan  8 10:35:44 2025 <    0.000002>]
          schedule_preempt_disabled+0xa/0x10
          <br>
          <br>
          [Wed Jan  8 10:35:44 2025 <    0.000001>]
          __ww_mutex_lock.constprop.0+0x390/0x6e0
          <br>
          <br>
          [Wed Jan  8 10:35:44 2025 <    0.000045>]
          amdgpu_vm_pt_free_work+0x97/0x160 [amdgpu]
          <br>
          <br>
          [Wed Jan  8 10:35:44 2025 <    0.000003>]
          process_one_work+0x1ad/0x380
          <br>
          <br>
          [Wed Jan  8 10:35:44 2025 <    0.000001>]
          worker_thread+0x49/0x310
          <br>
          <br>
          [Wed Jan  8 10:35:44 2025 <    0.000001>]  ?
          process_one_work+0x380/0x380
          <br>
          <br>
          [Wed Jan  8 10:35:44 2025 <    0.000001>]
          kthread+0x118/0x140
          <br>
          <br>
          [Wed Jan  8 10:35:44 2025 <    0.000002>]  ?
          __kthread_bind_mask+0x60/0x60
          <br>
          <br>
          [Wed Jan  8 10:35:44 2025 <    0.000002>]
          ret_from_fork+0x1f/0x30
          <br>
          <br>
        </blockquote>
        Moving the flush_workqueue call to the beginning of
        svm_range_validate_and_map should fix the deadlock; the deadlock
        happens because the flush currently sits after
        svm_range_reserve_bos. Also, there should be no concurrent unmap
        MMU notifier callback freeing a PT BO, as the mmap read lock is
        taken outside svm_range_validate_and_map.
        <br>
        <br>
      </blockquote>
      I don't think the mmap_read_lock protects you from concurrent MMU
      notifiers. I believe we have made that assumption in the past and
      it proved to be incorrect.
      <br>
      <br>
    </blockquote>
    <p>Thanks for the reminder. Yes, if we cannot prevent concurrent
      MMU notifiers, there is a race condition, and flushing the work
      cannot fix the issue completely.</p>
    <p>We are testing another approach: on unmap, only clear the page
      table leaves and do not free the page table BOs.</p>
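    <p>Roughly, the idea is that an MMU-notifier unmap would only write
      invalid leaf PTEs and would no longer hand now-empty page tables to
      the delayed pt_free_work, so there is no deferred free for a
      concurrent map to race with. A minimal sketch of what that could
      look like in amdgpu_vm_pt_free_list (illustrative only, not a
      tested patch; it assumes the PT walk is also changed to stop
      collecting page tables on params->tlb_flush_waitlist when
      params->unlocked is set):</p>
    <pre>
/* Illustrative sketch only -- not a tested change.  Assumes the unlocked
 * (MMU notifier) unmap path is changed so it only clears leaf PTEs and
 * never collects page tables for freeing.
 */
void amdgpu_vm_pt_free_list(struct amdgpu_device *adev,
                            struct amdgpu_vm_update_params *params)
{
        struct amdgpu_vm_bo_base *entry, *next;

        if (list_empty(&params->tlb_flush_waitlist))
                return;

        /* Unlocked updates come from the MMU-notifier unmap path.  With
         * "clear leaves only", nothing is ever queued from there, so the
         * delayed pt_free_work (and any flush_workqueue on the map path)
         * is no longer needed.
         */
        if (WARN_ON(params->unlocked))
                return;

        /* Map-side frees hold the root reservation, so they can stay
         * synchronous as before.
         */
        list_for_each_entry_safe(entry, next, &params->tlb_flush_waitlist,
                                 vm_status)
                amdgpu_vm_pt_free(entry);
}
    </pre>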
    <p>Regards,</p>
    <p>Philip<br>
    </p>
    <blockquote type="cite" cite="mid:4925dfbe-9a0f-4c73-989f-d2b1d7784161@amd.com">Regards,
      <br>
        Felix
      <br>
      <br>
      <br>
      <blockquote type="cite">Ideally it is enough to flush work
        amdgpu_vm_pt_free_work (not flush queue system_wq), but
        svm_range_validate_and_map cannot get the correct vm to flush.
        <br>
        <br>
        adev->wq is shared by all processes and all xcp partitions,
        maybe better to add wq to KFD process info, but right now
        amdgpu_vm_update_range cannot access KFD process info.
        <br>
        <br>
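        For illustration, a minimal sketch of that per-VM flush, assuming
        the map path could somehow reach the right amdgpu_vm (which is
        exactly the missing piece); this helper is hypothetical and not
        part of the posted patch:
        <br>
        <pre>
/* Hypothetical helper: wait only for this VM's pending page-table frees
 * instead of flushing a shared workqueue, so the wait does not depend on
 * unrelated work items.  Would be called on the map path before updating
 * the page tables.
 */
static void svm_range_flush_pt_free(struct amdgpu_vm *vm)
{
        flush_work(&vm->pt_free_work);
}
        </pre>
        <br>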
        Regards,
        <br>
        <br>
        Philip
        <br>
        <br>
        <br>
        <blockquote type="cite">Emily Deng
          <br>
          <br>
          Best Wishes
          <br>
          <br>
          *From:*amd-gfx <a class="moz-txt-link-rfc2396E" href="mailto:amd-gfx-bounces@lists.freedesktop.org"><amd-gfx-bounces@lists.freedesktop.org></a>
          *On Behalf Of *Deng, Emily
          <br>
          *Sent:* Wednesday, January 8, 2025 8:34 AM
          <br>
          *To:* Yang, Philip <a class="moz-txt-link-rfc2396E" href="mailto:Philip.Yang@amd.com"><Philip.Yang@amd.com></a>; Kuehling,
          Felix <a class="moz-txt-link-rfc2396E" href="mailto:Felix.Kuehling@amd.com"><Felix.Kuehling@amd.com></a>;
          <a class="moz-txt-link-abbreviated" href="mailto:amd-gfx@lists.freedesktop.org">amd-gfx@lists.freedesktop.org</a>; Koenig, Christian
          <a class="moz-txt-link-rfc2396E" href="mailto:Christian.Koenig@amd.com"><Christian.Koenig@amd.com></a>
          <br>
          *Subject:* RE: [PATCH v2] drm/amdgpu: Fix the looply call
          svm_range_restore_pages issue
          <br>
          <br>
          [AMD Official Use Only - AMD Internal Distribution Only]
          <br>
          <br>
          *From:*Yang, Philip <a class="moz-txt-link-rfc2396E" href="mailto:Philip.Yang@amd.com"><Philip.Yang@amd.com></a>
          <br>
          *Sent:* Tuesday, January 7, 2025 11:19 PM
          <br>
          *To:* Deng, Emily <a class="moz-txt-link-rfc2396E" href="mailto:Emily.Deng@amd.com"><Emily.Deng@amd.com></a>; Kuehling, Felix
          <a class="moz-txt-link-rfc2396E" href="mailto:Felix.Kuehling@amd.com"><Felix.Kuehling@amd.com></a>; <a class="moz-txt-link-abbreviated" href="mailto:amd-gfx@lists.freedesktop.org">amd-gfx@lists.freedesktop.org</a>;
          Yang, Philip <a class="moz-txt-link-rfc2396E" href="mailto:Philip.Yang@amd.com"><Philip.Yang@amd.com></a>; Koenig, Christian
          <a class="moz-txt-link-rfc2396E" href="mailto:Christian.Koenig@amd.com"><Christian.Koenig@amd.com></a>
          <br>
          *Subject:* Re: [PATCH v2] drm/amdgpu: Fix the looply call
          svm_range_restore_pages issue
          <br>
          <br>
          On 2025-01-07 07:30, Deng, Emily wrote:
          <br>
          <br>
              [AMD Official Use Only - AMD Internal Distribution Only]
          <br>
          <br>
              Hi Felix,
          <br>
          <br>
               You are right, it is easy to hit the deadlock; I don't
          know why LOCKDEP doesn't catch this. We need to find another
          solution.
          <br>
          <br>
              Hi Philip,
          <br>
          <br>
                    Do you have a solution for this delayed freeing of the page tables?
          <br>
          <br>
          Thanks for debugging this case. I had a patch to not free the
          PTB BO when unmapping from the GPU, but it wastes VRAM. My
          test case also passed with the TLB flush fence fix; I don't
          see the no-retry fault generated any more.
          <br>
          <br>
          The deadlock is probably from svm_range_unmap_from_gpu ->
          flush_workqueue(adev->wq), which runs from the MMU notifier
          callback. Actually we only need to flush pt_free_work before
          mapping to the GPU, so please remove the flush_workqueue in the
          unmap-from-GPU path. If the deadlock still happens, please post
          the backtrace.
          <br>
          <br>
          [Emily] Yes, you are right, I will try removing the
          flush_workqueue in the unmap-from-GPU path. Will send a v3.
          <br>
          <br>
          I think you don't need to add a new adev->wq; use the default
          system_wq and flush_work.
          <br>
          <br>
          [Emily] No, a driver is not allowed to flush system_wq; it
          will trigger a kernel warning, as lots of other work is
          queued on system_wq. I have tried this.
          <br>
          <br>
          Regards,
          <br>
          <br>
          Philip
          <br>
          <br>
              Emily Deng
          <br>
          <br>
              Best Wishes
          <br>
          <br>
                  -----Original Message-----
          <br>
          <br>
                  From: Deng, Emily<a class="moz-txt-link-rfc2396E" href="mailto:Emily.Deng@amd.com"><Emily.Deng@amd.com></a> 
          <a class="moz-txt-link-rfc2396E" href="mailto:Emily.Deng@amd.com"><mailto:Emily.Deng@amd.com></a>
          <br>
          <br>
                  Sent: Tuesday, January 7, 2025 10:34 AM
          <br>
          <br>
                  To: Deng, Emily<a class="moz-txt-link-rfc2396E" href="mailto:Emily.Deng@amd.com"><Emily.Deng@amd.com></a> 
          <a class="moz-txt-link-rfc2396E" href="mailto:Emily.Deng@amd.com"><mailto:Emily.Deng@amd.com></a>; Kuehling, Felix
          <br>
          <br>
                  <a class="moz-txt-link-rfc2396E" href="mailto:Felix.Kuehling@amd.com"><Felix.Kuehling@amd.com></a> 
          <a class="moz-txt-link-rfc2396E" href="mailto:Felix.Kuehling@amd.com"><mailto:Felix.Kuehling@amd.com></a>;amd-gfx@lists.freedesktop.org;
          Yang, Philip
          <br>
          <br>
                  <a class="moz-txt-link-rfc2396E" href="mailto:Philip.Yang@amd.com"><Philip.Yang@amd.com></a> 
          <a class="moz-txt-link-rfc2396E" href="mailto:Philip.Yang@amd.com"><mailto:Philip.Yang@amd.com></a>; Koenig,
          Christian<a class="moz-txt-link-rfc2396E" href="mailto:Christian.Koenig@amd.com"><Christian.Koenig@amd.com></a> 
          <a class="moz-txt-link-rfc2396E" href="mailto:Christian.Koenig@amd.com"><mailto:Christian.Koenig@amd.com></a>
          <br>
          <br>
                  Subject: RE: [PATCH v2] drm/amdgpu: Fix the looply
          call svm_range_restore_pages
          <br>
          <br>
                  issue
          <br>
          <br>
                  [AMD Official Use Only - AMD Internal Distribution
          Only]
          <br>
          <br>
                  Ping....
          <br>
          <br>
                  How about this? Currently it is easy to reproduce
          the issue in our environment. We
          <br>
          <br>
                  need this change to fix it.
          <br>
          <br>
                  Emily Deng
          <br>
          <br>
                  Best Wishes
          <br>
          <br>
                      -----Original Message-----
          <br>
          <br>
                      From:
          amd-gfx<a class="moz-txt-link-rfc2396E" href="mailto:amd-gfx-bounces@lists.freedesktop.org"><amd-gfx-bounces@lists.freedesktop.org></a> 
          <a class="moz-txt-link-rfc2396E" href="mailto:amd-gfx-bounces@lists.freedesktop.org"><mailto:amd-gfx-bounces@lists.freedesktop.org></a>  On
          Behalf Of
          <br>
          <br>
                      Deng, Emily
          <br>
          <br>
                      Sent: Monday, January 6, 2025 9:52 AM
          <br>
          <br>
                      To: Kuehling, Felix<a class="moz-txt-link-rfc2396E" href="mailto:Felix.Kuehling@amd.com"><Felix.Kuehling@amd.com></a> 
          <a class="moz-txt-link-rfc2396E" href="mailto:Felix.Kuehling@amd.com"><mailto:Felix.Kuehling@amd.com></a>;
          <br>
          <br>
                      <a class="moz-txt-link-abbreviated" href="mailto:amd-gfx@lists.freedesktop.org">amd-gfx@lists.freedesktop.org</a>; Yang,
          Philip<a class="moz-txt-link-rfc2396E" href="mailto:Philip.Yang@amd.com"><Philip.Yang@amd.com></a> 
          <a class="moz-txt-link-rfc2396E" href="mailto:Philip.Yang@amd.com"><mailto:Philip.Yang@amd.com></a>;
          <br>
          <br>
                      Koenig, Christian<a class="moz-txt-link-rfc2396E" href="mailto:Christian.Koenig@amd.com"><Christian.Koenig@amd.com></a> 
          <a class="moz-txt-link-rfc2396E" href="mailto:Christian.Koenig@amd.com"><mailto:Christian.Koenig@amd.com></a>
          <br>
          <br>
                      Subject: RE: [PATCH v2] drm/amdgpu: Fix the looply
          call
          <br>
          <br>
                      svm_range_restore_pages issue
          <br>
          <br>
                      [AMD Official Use Only - AMD Internal Distribution
          Only]
          <br>
          <br>
                          -----Original Message-----
          <br>
          <br>
                          From: Kuehling,
          Felix<a class="moz-txt-link-rfc2396E" href="mailto:Felix.Kuehling@amd.com"><Felix.Kuehling@amd.com></a> 
          <a class="moz-txt-link-rfc2396E" href="mailto:Felix.Kuehling@amd.com"><mailto:Felix.Kuehling@amd.com></a>
          <br>
          <br>
                          Sent: Saturday, January 4, 2025 7:18 AM
          <br>
          <br>
                          To: Deng, Emily<a class="moz-txt-link-rfc2396E" href="mailto:Emily.Deng@amd.com"><Emily.Deng@amd.com></a> 
          <a class="moz-txt-link-rfc2396E" href="mailto:Emily.Deng@amd.com"><mailto:Emily.Deng@amd.com></a>;amd-gfx@lists.freedesktop.org;
          <br>
          <br>
                          Yang, Philip<a class="moz-txt-link-rfc2396E" href="mailto:Philip.Yang@amd.com"><Philip.Yang@amd.com></a> 
          <a class="moz-txt-link-rfc2396E" href="mailto:Philip.Yang@amd.com"><mailto:Philip.Yang@amd.com></a>; Koenig, Christian
          <br>
          <br>
                          <a class="moz-txt-link-rfc2396E" href="mailto:Christian.Koenig@amd.com"><Christian.Koenig@amd.com></a> 
          <a class="moz-txt-link-rfc2396E" href="mailto:Christian.Koenig@amd.com"><mailto:Christian.Koenig@amd.com></a>
          <br>
          <br>
                          Subject: Re: [PATCH v2] drm/amdgpu: Fix the
          looply call
          <br>
          <br>
                          svm_range_restore_pages issue
          <br>
          <br>
                          On 2025-01-02 21:26, Emily Deng wrote:
          <br>
          <br>
                              Because the page table free is delayed, the
          to-be-freed BO can be reused, which
          <br>
          <br>
                              causes an unexpected page fault and then
          calls svm_range_restore_pages.
          <br>
          <br>
                              Details below:
          <br>
          <br>
                              1. It wants to free the PT in the code
          below, but the PT is not freed
          <br>
          <br>
                              immediately; instead
          “schedule_work(&vm->pt_free_work);” is used.
          <br>
          <br>
                              [   92.276838] Call Trace:
          <br>
          <br>
                              [   92.276841]  dump_stack+0x63/0xa0
          <br>
          <br>
                              [   92.276887] 
          amdgpu_vm_pt_free_list+0xfb/0x120 [amdgpu]
          <br>
          <br>
                              [   92.276932] 
          amdgpu_vm_update_range+0x69c/0x8e0 [amdgpu]
          <br>
          <br>
                              [   92.276990] 
          svm_range_unmap_from_gpus+0x112/0x310 [amdgpu]
          <br>
          <br>
                              [   92.277046] 
          svm_range_cpu_invalidate_pagetables+0x725/0x780 [amdgpu]
          <br>
          <br>
                              [   92.277050]  ?
          __alloc_pages_nodemask+0x19f/0x3e0
          <br>
          <br>
                              [   92.277051] 
          mn_itree_invalidate+0x72/0xc0
          <br>
          <br>
                              [   92.277052] 
          __mmu_notifier_invalidate_range_start+0x48/0x60
          <br>
          <br>
                              [   92.277054] 
          migrate_vma_collect+0xf6/0x100
          <br>
          <br>
                              [   92.277055] 
          migrate_vma_setup+0xcf/0x120
          <br>
          <br>
                              [   92.277109] 
          svm_migrate_ram_to_vram+0x256/0x6b0 [amdgpu]
          <br>
          <br>
                              2. Call
          svm_range_map_to_gpu->amdgpu_vm_update_range to update the
          <br>
          <br>
                              page table; at this point it reuses the
          same entry BO that is the
          <br>
          <br>
                              to-be-freed BO from step 1.
          <br>
          <br>
                              3. Then pt_free_work executes and frees
          the BO. At this time,
          <br>
          <br>
                              a GPU memory access causes a page
          fault because the PT BO has been freed.
          <br>
          <br>
                              Then svm_range_restore_pages is called
          again.
          <br>
          <br>
                              How to fix?
          <br>
          <br>
                              Add a workqueue, and flush the workqueue
          each time before updating the page
          <br>
          <br>
                  table.
          <br>
          <br>
                          I think this is kind of a known issue in the
          GPUVM code. Philip was
          <br>
          <br>
                          looking at it before.
          <br>
          <br>
                          Just flushing a workqueue may seem like a
          simple and elegant solution,
          <br>
          <br>
                          but I'm afraid it introduces lock dependency
          issues. By flushing the
          <br>
          <br>
                          workqueue, you're effectively creating a lock
          dependency of the MMU
          <br>
          <br>
                          notifier on any locks held inside the worker
          function. You now get a
          <br>
          <br>
                          circular lock dependency with any of those
          locks and memory reclaim. I
          <br>
          <br>
                          think LOCKDEP would be able to catch that if
          you compile your kernel
          <br>
          <br>
                          with that
          <br>
          <br>
                      feature enabled.
          <br>
          <br>
                          The proper solution is to prevent delayed
          freeing of page tables if
          <br>
          <br>
                          they happened to get reused, or prevent reuse
          of page tables if they
          <br>
          <br>
                          are flagged for
          <br>
          <br>
                      delayed freeing.
          <br>
          <br>
                          Regards,
          <br>
          <br>
                             Felix
          <br>
          <br>
                      Thanks, LOCKDEP was already enabled while compiling
          the kernel. The delayed
          <br>
          <br>
                      work seems to exist for other reasons; I am not sure
          whether it can be deleted completely.
          <br>
          <br>
                      Emily Deng
          <br>
          <br>
                      Best Wishes
          <br>
          <br>
                              Signed-off-by: Emily
          Deng<a class="moz-txt-link-rfc2396E" href="mailto:Emily.Deng@amd.com"><Emily.Deng@amd.com></a> 
          <a class="moz-txt-link-rfc2396E" href="mailto:Emily.Deng@amd.com"><mailto:Emily.Deng@amd.com></a>
          <br>
          <br>
                              ---
          <br>
          <br>
                                
          drivers/gpu/drm/amd/amdgpu/amdgpu.h              | 1 +
          <br>
          <br>
                                
          drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 1 +
          <br>
          <br>
                                
          drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c           | 7 +++++--
          <br>
          <br>
                                
          drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c        | 6 +++++-
          <br>
          <br>
                                
          drivers/gpu/drm/amd/amdkfd/kfd_svm.c             | 3 +++
          <br>
          <br>
                                 5 files changed, 15 insertions(+), 3
          deletions(-)
          <br>
          <br>
                              diff --git
          a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
          <br>
          <br>
                              b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
          <br>
          <br>
                              index 93c352b08969..cbf68ad1c8d0 100644
          <br>
          <br>
                              --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
          <br>
          <br>
                              +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
          <br>
          <br>
                              @@ -1188,6 +1188,7 @@ struct amdgpu_device
          {
          <br>
          <br>
                                    struct mutex                   
          enforce_isolation_mutex;
          <br>
          <br>
                                    struct amdgpu_init_level *init_lvl;
          <br>
          <br>
                              +    struct workqueue_struct *wq;
          <br>
          <br>
                                 };
          <br>
          <br>
                                 static inline uint32_t
          amdgpu_ip_version(const struct
          <br>
          <br>
                              amdgpu_device *adev, diff --git
          <br>
          <br>
                             
          a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
          <br>
          <br>
                             
          b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
          <br>
          <br>
                              index f30548f4c3b3..5b4835bc81b3 100644
          <br>
          <br>
                              ---
          a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
          <br>
          <br>
                              +++
          b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
          <br>
          <br>
                              @@ -2069,6 +2069,7 @@ int
          amdgpu_amdkfd_gpuvm_map_memory_to_gpu(
          <br>
          <br>
                                            if (ret)
          <br>
          <br>
                                                    goto out;
          <br>
          <br>
                                    }
          <br>
          <br>
                              +    flush_workqueue(adev->wq);
          <br>
          <br>
                                    ret = reserve_bo_and_vm(mem, avm,
          &ctx);
          <br>
          <br>
                                    if (unlikely(ret))
          <br>
          <br>
                              diff --git
          a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
          <br>
          <br>
                              b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
          <br>
          <br>
                              index 9d6ffe38b48a..500d97cd9114 100644
          <br>
          <br>
                              ---
          a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
          <br>
          <br>
                              +++
          b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
          <br>
          <br>
                              @@ -2607,7 +2607,7 @@ void
          amdgpu_vm_fini(struct amdgpu_device
          <br>
          <br>
                              *adev,
          <br>
          <br>
                          struct amdgpu_vm *vm)
          <br>
          <br>
                                    amdgpu_amdkfd_gpuvm_destroy_cb(adev,
          vm);
          <br>
          <br>
                                   
          flush_work(&vm->pt_free_work);
          <br>
          <br>
                              -
          <br>
          <br>
                              +   
          cancel_work_sync(&vm->pt_free_work);
          <br>
          <br>
                                    root =
          amdgpu_bo_ref(vm->root.bo);
          <br>
          <br>
                                    amdgpu_bo_reserve(root, true);
          <br>
          <br>
                                   
          amdgpu_vm_put_task_info(vm->task_info);
          <br>
          <br>
                              @@ -2708,6 +2708,8 @@ void
          amdgpu_vm_manager_init(struct
          <br>
          <br>
                              amdgpu_device
          <br>
          <br>
                          *adev)
          <br>
          <br>
                                 #endif
          <br>
          <br>
                                   
          xa_init_flags(&adev->vm_manager.pasids,
          XA_FLAGS_LOCK_IRQ);
          <br>
          <br>
                              +    adev->wq =
          alloc_workqueue("amdgpu_recycle",
          <br>
          <br>
                              +                              
          WQ_MEM_RECLAIM | WQ_HIGHPRI |
          <br>
          <br>
                          WQ_UNBOUND, 16);
          <br>
          <br>
                                 }
          <br>
          <br>
                                 /**
          <br>
          <br>
                              @@ -2721,7 +2723,8 @@ void
          amdgpu_vm_manager_fini(struct
          <br>
          <br>
                              amdgpu_device
          <br>
          <br>
                          *adev)
          <br>
          <br>
                                 {
          <br>
          <br>
                                   
          WARN_ON(!xa_empty(&adev->vm_manager.pasids));
          <br>
          <br>
                                   
          xa_destroy(&adev->vm_manager.pasids);
          <br>
          <br>
                              -
          <br>
          <br>
                              +    flush_workqueue(adev->wq);
          <br>
          <br>
                              +    destroy_workqueue(adev->wq);
          <br>
          <br>
                                    amdgpu_vmid_mgr_fini(adev);
          <br>
          <br>
                                 }
          <br>
          <br>
                              diff --git
          a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
          <br>
          <br>
                             
          b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
          <br>
          <br>
                              index f78a0434a48f..1204406215ee 100644
          <br>
          <br>
                              ---
          a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
          <br>
          <br>
                              +++
          b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
          <br>
          <br>
                              @@ -554,15 +554,19 @@ void
          amdgpu_vm_pt_free_work(struct work_struct
          <br>
          <br>
                              *work)
          <br>
          <br>
                                    vm = container_of(work, struct
          amdgpu_vm, pt_free_work);
          <br>
          <br>
                              +    printk("Emily:%s\n", __func__);
          <br>
          <br>
                                    spin_lock(&vm->status_lock);
          <br>
          <br>
                                   
          list_splice_init(&vm->pt_freed, &pt_freed);
          <br>
          <br>
                                   
          spin_unlock(&vm->status_lock);
          <br>
          <br>
                              +    printk("Emily:%s 1\n", __func__);
          <br>
          <br>
                                    /* flush_work in amdgpu_vm_fini
          ensure vm->root.bo is valid. */
          <br>
          <br>
                                    amdgpu_bo_reserve(vm->root.bo,
          true);
          <br>
          <br>
                              +    printk("Emily:%s 2\n", __func__);
          <br>
          <br>
                                    list_for_each_entry_safe(entry,
          next, &pt_freed, vm_status)
          <br>
          <br>
                                            amdgpu_vm_pt_free(entry);
          <br>
          <br>
                              +    printk("Emily:%s 3\n", __func__);
          <br>
          <br>
                                    amdgpu_bo_unreserve(vm->root.bo);
          <br>
          <br>
                                 }
          <br>
          <br>
                              @@ -589,7 +593,7 @@ void
          amdgpu_vm_pt_free_list(struct amdgpu_device
          <br>
          <br>
                          *adev,
          <br>
          <br>
                                           
          spin_lock(&vm->status_lock);
          <br>
          <br>
                                           
          list_splice_init(&params->tlb_flush_waitlist,
          &vm->pt_freed);
          <br>
          <br>
                                           
          spin_unlock(&vm->status_lock);
          <br>
          <br>
                              -           
          schedule_work(&vm->pt_free_work);
          <br>
          <br>
                              +            queue_work(adev->wq,
          &vm->pt_free_work);
          <br>
          <br>
                                            return;
          <br>
          <br>
                                    }
          <br>
          <br>
                              diff --git
          a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
          <br>
          <br>
                              b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
          <br>
          <br>
                              index 3e2911895c74..55edf96d5a95 100644
          <br>
          <br>
                              --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
          <br>
          <br>
                              +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
          <br>
          <br>
                              @@ -1314,6 +1314,7 @@
          svm_range_unmap_from_gpu(struct amdgpu_device
          <br>
          <br>
                          *adev, struct amdgpu_vm *vm,
          <br>
          <br>
                                    uint64_t init_pte_value = 0;
          <br>
          <br>
                                    pr_debug("[0x%llx 0x%llx]\n", start,
          last);
          <br>
          <br>
                              +    flush_workqueue(adev->wq);
          <br>
          <br>
                                    return amdgpu_vm_update_range(adev,
          vm, false, true, true,
          <br>
          <br>
                              false, NULL,
          <br>
          <br>
                          start,
          <br>
          <br>
                                                                  last,
          init_pte_value, 0, 0, NULL,
          <br>
          <br>
                              NULL, @@ -1422,6
          <br>
          <br>
                          +1423,8
          <br>
          <br>
                              @@ svm_range_map_to_gpu(struct
          kfd_process_device *pdd, struct
          <br>
          <br>
                              svm_range
          <br>
          <br>
                          *prange,
          <br>
          <br>
                                             * different memory
          partition based on fpfn/lpfn, we should use
          <br>
          <br>
                                             * same
          vm_manager.vram_base_offset regardless memory partition.
          <br>
          <br>
                                             */
          <br>
          <br>
                              +            flush_workqueue(adev->wq);
          <br>
          <br>
                              +
          <br>
          <br>
                                            r =
          amdgpu_vm_update_range(adev, vm, false, false, flush_tlb,
          true,
          <br>
          <br>
                                                                      
          NULL, last_start, prange->start + i,
          <br>
          <br>
                                                                      
          pte_flags,
          <br>
          <br>
        </blockquote>
      </blockquote>
    </blockquote>
  </body>
</html>