[PATCH] drm/amdgpu: revert "fix system hang issue during GPU reset"

Christian König ckoenig.leichtzumerken at gmail.com
Thu Aug 13 10:58:45 UTC 2020


Am 12.08.20 um 17:55 schrieb Alex Deucher:
> On Wed, Aug 12, 2020 at 11:54 AM Christian König
> <ckoenig.leichtzumerken at gmail.com> wrote:
>> The whole approach wasn't thought through to the end.
>>
>> We already had a reset lock like this in the past and it caused the same problems as this one.
>>
>> Completely revert the patch for now and add individual trylock protection to the hardware access functions as necessary.
>>
>> This reverts commit edad8312cbbf9a33c86873fc4093664f150dd5c1.
>>
>> Signed-off-by: Christian König <christian.koenig at amd.com>
> This also broke GPU overclocking.
>
> Acked-by: Alex Deucher <alexander.deucher at amd.com>

Dennis, since we still want to fix the hardware access, I suggest splitting
this patch up into the structural changes and individual patches which
add the lock to the different places where a hardware access happens.

This way we can discuss and eventually fix/revert each hardware access
individually.

Thanks,
Christian.

>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h           |   9 +-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c    |  40 +-
>>   .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c    |   2 +-
>>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c |   2 +-
>>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c |   2 +-
>>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c |   2 +-
>>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  |   7 -
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c        |   4 -
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c       |   4 -
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c   |  14 +-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c    |  57 ++-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c       |   4 -
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c       |   6 +-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c       |  14 +-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c       |   4 -
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c        | 353 ++++--------------
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c       |   4 +-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c       |   4 +-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c       |   2 +-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c     |   3 +-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c      |   2 +-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h      |   4 +-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c      |  11 +-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h      |   3 +-
>>   drivers/gpu/drm/amd/amdgpu/atom.c             |   1 -
>>   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c        |  10 +-
>>   drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c         |   6 +-
>>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c         |  10 +-
>>   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c        |   4 +-
>>   drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c         |   2 +-
>>   drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c         |   2 +-
>>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c         |   7 +-
>>   drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c         |  13 +-
>>   drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c         |  13 +-
>>   .../drm/amd/amdkfd/kfd_device_queue_manager.c |  16 +-
>>   drivers/gpu/drm/amd/amdkfd/kfd_process.c      |   4 -
>>   .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c |   4 +-
>>   drivers/gpu/drm/amd/powerplay/amdgpu_smu.c    |   2 +-
>>   .../drm/amd/powerplay/hwmgr/vega20_hwmgr.c    |   2 +-
>>   39 files changed, 184 insertions(+), 469 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index 1f9d97f61aa5..9c6fb38ce59d 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -952,9 +952,9 @@ struct amdgpu_device {
>>          bool                            in_suspend;
>>          bool                            in_hibernate;
>>
>> -       atomic_t                        in_gpu_reset;
>> +       bool                            in_gpu_reset;
>>          enum pp_mp1_state               mp1_state;
>> -       struct rw_semaphore     reset_sem;
>> +       struct mutex  lock_reset;
>>          struct amdgpu_doorbell_index doorbell_index;
>>
>>          struct mutex                    notifier_lock;
>> @@ -1269,9 +1269,4 @@ static inline bool amdgpu_is_tmz(struct amdgpu_device *adev)
>>          return adev->gmc.tmz_enabled;
>>   }
>>
>> -static inline bool amdgpu_in_reset(struct amdgpu_device *adev)
>> -{
>> -       return atomic_read(&adev->in_gpu_reset) ? true : false;
>> -}
>> -
>>   #endif
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> index 9738dccb1c2c..0effc1d46824 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> @@ -244,14 +244,11 @@ int amdgpu_amdkfd_alloc_gtt_mem(struct kgd_dev *kgd, size_t size,
>>          if (cp_mqd_gfx9)
>>                  bp.flags |= AMDGPU_GEM_CREATE_CP_MQD_GFX9;
>>
>> -       if (!down_read_trylock(&adev->reset_sem))
>> -               return -EIO;
>> -
>>          r = amdgpu_bo_create(adev, &bp, &bo);
>>          if (r) {
>>                  dev_err(adev->dev,
>>                          "failed to allocate BO for amdkfd (%d)\n", r);
>> -               goto err;
>> +               return r;
>>          }
>>
>>          /* map the buffer */
>> @@ -286,7 +283,6 @@ int amdgpu_amdkfd_alloc_gtt_mem(struct kgd_dev *kgd, size_t size,
>>
>>          amdgpu_bo_unreserve(bo);
>>
>> -       up_read(&adev->reset_sem);
>>          return 0;
>>
>>   allocate_mem_kmap_bo_failed:
>> @@ -295,25 +291,19 @@ int amdgpu_amdkfd_alloc_gtt_mem(struct kgd_dev *kgd, size_t size,
>>          amdgpu_bo_unreserve(bo);
>>   allocate_mem_reserve_bo_failed:
>>          amdgpu_bo_unref(&bo);
>> -err:
>> -       up_read(&adev->reset_sem);
>> +
>>          return r;
>>   }
>>
>>   void amdgpu_amdkfd_free_gtt_mem(struct kgd_dev *kgd, void *mem_obj)
>>   {
>> -       struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
>>          struct amdgpu_bo *bo = (struct amdgpu_bo *) mem_obj;
>>
>> -       down_read(&adev->reset_sem);
>> -
>>          amdgpu_bo_reserve(bo, true);
>>          amdgpu_bo_kunmap(bo);
>>          amdgpu_bo_unpin(bo);
>>          amdgpu_bo_unreserve(bo);
>>          amdgpu_bo_unref(&(bo));
>> -
>> -       up_read(&adev->reset_sem);
>>   }
>>
>>   int amdgpu_amdkfd_alloc_gws(struct kgd_dev *kgd, size_t size,
>> @@ -345,14 +335,9 @@ int amdgpu_amdkfd_alloc_gws(struct kgd_dev *kgd, size_t size,
>>
>>   void amdgpu_amdkfd_free_gws(struct kgd_dev *kgd, void *mem_obj)
>>   {
>> -       struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
>>          struct amdgpu_bo *bo = (struct amdgpu_bo *)mem_obj;
>>
>> -       down_read(&adev->reset_sem);
>> -
>>          amdgpu_bo_unref(&bo);
>> -
>> -       up_read(&adev->reset_sem);
>>   }
>>
>>   uint32_t amdgpu_amdkfd_get_fw_version(struct kgd_dev *kgd,
>> @@ -626,15 +611,8 @@ int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type engine,
>>          /* This works for NO_HWS. TODO: need to handle without knowing VMID */
>>          job->vmid = vmid;
>>
>> -       if (!down_read_trylock(&adev->reset_sem)) {
>> -               ret = -EIO;
>> -               goto err_ib_sched;
>> -       }
>> -
>>          ret = amdgpu_ib_schedule(ring, 1, ib, job, &f);
>>
>> -       up_read(&adev->reset_sem);
>> -
>>          if (ret) {
>>                  DRM_ERROR("amdgpu: failed to schedule IB.\n");
>>                  goto err_ib_sched;
>> @@ -670,9 +648,6 @@ int amdgpu_amdkfd_flush_gpu_tlb_vmid(struct kgd_dev *kgd, uint16_t vmid)
>>   {
>>          struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
>>
>> -       if (!down_read_trylock(&adev->reset_sem))
>> -               return -EIO;
>> -
>>          if (adev->family == AMDGPU_FAMILY_AI) {
>>                  int i;
>>
>> @@ -682,8 +657,6 @@ int amdgpu_amdkfd_flush_gpu_tlb_vmid(struct kgd_dev *kgd, uint16_t vmid)
>>                  amdgpu_gmc_flush_gpu_tlb(adev, vmid, AMDGPU_GFXHUB_0, 0);
>>          }
>>
>> -       up_read(&adev->reset_sem);
>> -
>>          return 0;
>>   }
>>
>> @@ -692,18 +665,11 @@ int amdgpu_amdkfd_flush_gpu_tlb_pasid(struct kgd_dev *kgd, uint16_t pasid)
>>          struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
>>          const uint32_t flush_type = 0;
>>          bool all_hub = false;
>> -       int ret = -EIO;
>>
>>          if (adev->family == AMDGPU_FAMILY_AI)
>>                  all_hub = true;
>>
>> -       if (down_read_trylock(&adev->reset_sem)) {
>> -               ret = amdgpu_gmc_flush_gpu_tlb_pasid(adev,
>> -                                       pasid, flush_type, all_hub);
>> -               up_read(&adev->reset_sem);
>> -       }
>> -
>> -       return ret;
>> +       return amdgpu_gmc_flush_gpu_tlb_pasid(adev, pasid, flush_type, all_hub);
>>   }
>>
>>   bool amdgpu_amdkfd_have_atomics_support(struct kgd_dev *kgd)
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
>> index b872cdb0b705..691c89705bcd 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
>> @@ -543,7 +543,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
>>          uint32_t temp;
>>          struct v10_compute_mqd *m = get_mqd(mqd);
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EIO;
>>
>>   #if 0
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
>> index 832a200bb62f..0b7e78748540 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
>> @@ -425,7 +425,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
>>          unsigned long flags, end_jiffies;
>>          int retry;
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EIO;
>>
>>          acquire_queue(kgd, pipe_id, queue_id);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
>> index d0940121a6a9..ccd635b812b5 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
>> @@ -421,7 +421,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
>>          int retry;
>>          struct vi_mqd *m = get_mqd(mqd);
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EIO;
>>
>>          acquire_queue(kgd, pipe_id, queue_id);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
>> index 7e11625b419e..961424bc7a1f 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
>> @@ -541,7 +541,7 @@ int kgd_gfx_v9_hqd_destroy(struct kgd_dev *kgd, void *mqd,
>>          uint32_t temp;
>>          struct v9_mqd *m = get_mqd(mqd);
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EIO;
>>
>>          acquire_queue(kgd, pipe_id, queue_id);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> index 0d75726bd228..7e2394b50fbf 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> @@ -1194,9 +1194,6 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
>>                  return -EINVAL;
>>          }
>>
>> -       if (!down_read_trylock(&adev->reset_sem))
>> -               return -EIO;
>> -
>>          *mem = kzalloc(sizeof(struct kgd_mem), GFP_KERNEL);
>>          if (!*mem) {
>>                  ret = -ENOMEM;
>> @@ -1263,7 +1260,6 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
>>          if (offset)
>>                  *offset = amdgpu_bo_mmap_offset(bo);
>>
>> -       up_read(&adev->reset_sem);
>>          return 0;
>>
>>   allocate_init_user_pages_failed:
>> @@ -1281,9 +1277,6 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
>>                  sg_free_table(sg);
>>                  kfree(sg);
>>          }
>> -
>> -       up_read(&adev->reset_sem);
>> -
>>          return ret;
>>   }
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> index a94b3f862fc2..ffbcaf4bfb8b 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> @@ -1292,8 +1292,6 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
>>          parser.adev = adev;
>>          parser.filp = filp;
>>
>> -       down_read(&adev->reset_sem);
>> -
>>          r = amdgpu_cs_parser_init(&parser, data);
>>          if (r) {
>>                  DRM_ERROR("Failed to initialize parser %d!\n", r);
>> @@ -1333,8 +1331,6 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
>>   out:
>>          amdgpu_cs_parser_fini(&parser, r, reserved_buffers);
>>
>> -       up_read(&adev->reset_sem);
>> -
>>          return r;
>>   }
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>> index d85d13f7a043..8842c55d4490 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>> @@ -358,8 +358,6 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev,
>>          if (atomic_read(&ctx->guilty))
>>                  out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_GUILTY;
>>
>> -       down_read(&adev->reset_sem);
>> -
>>          /*query ue count*/
>>          ras_counter = amdgpu_ras_query_error_count(adev, false);
>>          /*ras counter is monotonic increasing*/
>> @@ -375,8 +373,6 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev,
>>                  ctx->ras_counter_ce = ras_counter;
>>          }
>>
>> -       up_read(&adev->reset_sem);
>> -
>>          mutex_unlock(&mgr->lock);
>>          return 0;
>>   }
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>> index 0af249a1e35b..35fed75a4397 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>> @@ -101,14 +101,14 @@ static int amdgpu_debugfs_autodump_open(struct inode *inode, struct file *file)
>>
>>          file->private_data = adev;
>>
>> -       down_read(&adev->reset_sem);
>> +       mutex_lock(&adev->lock_reset);
>>          if (adev->autodump.dumping.done) {
>>                  reinit_completion(&adev->autodump.dumping);
>>                  ret = 0;
>>          } else {
>>                  ret = -EBUSY;
>>          }
>> -       up_read(&adev->reset_sem);
>> +       mutex_unlock(&adev->lock_reset);
>>
>>          return ret;
>>   }
>> @@ -127,7 +127,7 @@ static unsigned int amdgpu_debugfs_autodump_poll(struct file *file, struct poll_
>>
>>          poll_wait(file, &adev->autodump.gpu_hang, poll_table);
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return POLLIN | POLLRDNORM | POLLWRNORM;
>>
>>          return 0;
>> @@ -1242,7 +1242,7 @@ static int amdgpu_debugfs_test_ib(struct seq_file *m, void *data)
>>          }
>>
>>          /* Avoid accidently unparking the sched thread during GPU reset */
>> -       down_read(&adev->reset_sem);
>> +       mutex_lock(&adev->lock_reset);
>>
>>          /* hold on the scheduler */
>>          for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
>> @@ -1269,7 +1269,7 @@ static int amdgpu_debugfs_test_ib(struct seq_file *m, void *data)
>>                  kthread_unpark(ring->sched.thread);
>>          }
>>
>> -       up_read(&adev->reset_sem);
>> +       mutex_unlock(&adev->lock_reset);
>>
>>          pm_runtime_mark_last_busy(dev->dev);
>>          pm_runtime_put_autosuspend(dev->dev);
>> @@ -1459,7 +1459,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
>>                  return -ENOMEM;
>>
>>          /* Avoid accidently unparking the sched thread during GPU reset */
>> -       down_read(&adev->reset_sem);
>> +       mutex_lock(&adev->lock_reset);
>>
>>          /* stop the scheduler */
>>          kthread_park(ring->sched.thread);
>> @@ -1500,7 +1500,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
>>          /* restart the scheduler */
>>          kthread_unpark(ring->sched.thread);
>>
>> -       up_read(&adev->reset_sem);
>> +       mutex_unlock(&adev->lock_reset);
>>
>>          ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched);
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index fe8878761c29..19aa0d7334c7 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -1940,7 +1940,7 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
>>                          if (adev->ip_blocks[i].status.hw == true)
>>                                  break;
>>
>> -                       if (amdgpu_in_reset(adev) || adev->in_suspend) {
>> +                       if (adev->in_gpu_reset || adev->in_suspend) {
>>                                  r = adev->ip_blocks[i].version->funcs->resume(adev);
>>                                  if (r) {
>>                                          DRM_ERROR("resume of IP block <%s> failed %d\n",
>> @@ -2117,7 +2117,7 @@ static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
>>                          AMDGPU_RESET_MAGIC_NUM))
>>                  return true;
>>
>> -       if (!amdgpu_in_reset(adev))
>> +       if (!adev->in_gpu_reset)
>>                  return false;
>>
>>          /*
>> @@ -3055,8 +3055,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>          mutex_init(&adev->mn_lock);
>>          mutex_init(&adev->virt.vf_errors.lock);
>>          hash_init(adev->mn_hash);
>> -       init_rwsem(&adev->reset_sem);
>> -       atomic_set(&adev->in_gpu_reset, 0);
>> +       mutex_init(&adev->lock_reset);
>>          mutex_init(&adev->psp.mutex);
>>          mutex_init(&adev->notifier_lock);
>>
>> @@ -4084,11 +4083,8 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>>          list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>>                  if (need_full_reset) {
>>                          /* post card */
>> -                       if (amdgpu_atom_asic_init(tmp_adev->mode_info.atom_context)) {
>> -                               dev_warn(tmp_adev->dev, "asic atom init failed!");
>> -                               r = -EAGAIN;
>> -                               goto out;
>> -                       }
>> +                       if (amdgpu_atom_asic_init(tmp_adev->mode_info.atom_context))
>> +                               DRM_WARN("asic atom init failed!");
>>
>>                          if (!r) {
>>                                  dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
>> @@ -4178,18 +4174,16 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>>          return r;
>>   }
>>
>> -static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive)
>> +static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock)
>>   {
>> -       if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
>> -               return false;
>> -
>> -       if (hive) {
>> -               down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
>> -       } else {
>> -               down_write(&adev->reset_sem);
>> -       }
>> +       if (trylock) {
>> +               if (!mutex_trylock(&adev->lock_reset))
>> +                       return false;
>> +       } else
>> +               mutex_lock(&adev->lock_reset);
>>
>>          atomic_inc(&adev->gpu_reset_counter);
>> +       adev->in_gpu_reset = true;
>>          switch (amdgpu_asic_reset_method(adev)) {
>>          case AMD_RESET_METHOD_MODE1:
>>                  adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
>> @@ -4209,8 +4203,8 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
>>   {
>>          amdgpu_vf_error_trans_all(adev);
>>          adev->mp1_state = PP_MP1_STATE_NONE;
>> -       atomic_set(&adev->in_gpu_reset, 0);
>> -       up_write(&adev->reset_sem);
>> +       adev->in_gpu_reset = false;
>> +       mutex_unlock(&adev->lock_reset);
>>   }
>>
>>   static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
>> @@ -4320,14 +4314,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>           * We always reset all schedulers for device and all devices for XGMI
>>           * hive so that should take care of them too.
>>           */
>> -       hive = amdgpu_get_xgmi_hive(adev, false);
>> -       if (hive) {
>> -               if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
>> -                       DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
>> -                               job ? job->base.id : -1, hive->hive_id);
>> -                       return 0;
>> -               }
>> -               mutex_lock(&hive->hive_lock);
>> +       hive = amdgpu_get_xgmi_hive(adev, true);
>> +       if (hive && !mutex_trylock(&hive->reset_lock)) {
>> +               DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
>> +                         job ? job->base.id : -1, hive->hive_id);
>> +               mutex_unlock(&hive->hive_lock);
>> +               return 0;
>>          }
>>
>>          /*
>> @@ -4349,11 +4341,11 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>
>>          /* block all schedulers and reset given job's ring */
>>          list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>> -               if (!amdgpu_device_lock_adev(tmp_adev, hive)) {
>> +               if (!amdgpu_device_lock_adev(tmp_adev, !hive)) {
>>                          DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress",
>>                                    job ? job->base.id : -1);
>> -                       r = 0;
>> -                       goto skip_recovery;
>> +                       mutex_unlock(&hive->hive_lock);
>> +                       return 0;
>>                  }
>>
>>                  /*
>> @@ -4486,9 +4478,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>                  amdgpu_device_unlock_adev(tmp_adev);
>>          }
>>
>> -skip_recovery:
>>          if (hive) {
>> -               atomic_set(&hive->in_reset, 0);
>> +               mutex_unlock(&hive->reset_lock);
>>                  mutex_unlock(&hive->hive_lock);
>>          }
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>> index ee1e8fff83b2..8c64d8d6cb82 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>> @@ -670,8 +670,6 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data,
>>                  bo_va = NULL;
>>          }
>>
>> -       down_read(&adev->reset_sem);
>> -
>>          switch (args->operation) {
>>          case AMDGPU_VA_OP_MAP:
>>                  va_flags = amdgpu_gem_va_map_flags(adev, args->flags);
>> @@ -701,8 +699,6 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data,
>>                  amdgpu_gem_va_update_vm(adev, &fpriv->vm, bo_va,
>>                                          args->operation);
>>
>> -       up_read(&adev->reset_sem);
>> -
>>   error_backoff:
>>          ttm_eu_backoff_reservation(&ticket, &list);
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> index 8ccd17d02cc6..a819360a4b6a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> @@ -719,7 +719,7 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>>           *
>>           * also don't wait anymore for IRQ context
>>           * */
>> -       if (r < 1 && (amdgpu_in_reset(adev) || in_interrupt()))
>> +       if (r < 1 && (adev->in_gpu_reset || in_interrupt()))
>>                  goto failed_kiq_read;
>>
>>          might_sleep();
>> @@ -777,7 +777,7 @@ void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
>>           *
>>           * also don't wait anymore for IRQ context
>>           * */
>> -       if (r < 1 && (amdgpu_in_reset(adev) || in_interrupt()))
>> +       if (r < 1 && (adev->in_gpu_reset || in_interrupt()))
>>                  goto failed_kiq_write;
>>
>>          might_sleep();
>> @@ -796,5 +796,5 @@ void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
>>          amdgpu_ring_undo(ring);
>>          spin_unlock_irqrestore(&kiq->ring_lock, flags);
>>   failed_kiq_write:
>> -       dev_warn(adev->dev, "failed to write reg:%x\n", reg);
>> +       pr_err("failed to write reg:%x\n", reg);
>>   }
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> index 75d37dfb51aa..937029ad5271 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> @@ -220,17 +220,17 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
>>
>>          trace_amdgpu_sched_run_job(job);
>>
>> -       if (down_read_trylock(&ring->adev->reset_sem)) {
>> +       if (job->vram_lost_counter != atomic_read(&ring->adev->vram_lost_counter))
>> +               dma_fence_set_error(finished, -ECANCELED);/* skip IB as well if VRAM lost */
>> +
>> +       if (finished->error < 0) {
>> +               DRM_INFO("Skip scheduling IBs!\n");
>> +       } else {
>>                  r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
>> -                                       &fence);
>> -               up_read(&ring->adev->reset_sem);
>> +                                      &fence);
>>                  if (r)
>>                          DRM_ERROR("Error scheduling IBs (%d)\n", r);
>> -       } else {
>> -               dma_fence_set_error(finished, -ECANCELED);
>> -               DRM_INFO("Skip scheduling IBs!\n");
>>          }
>> -
>>          /* if gpu reset, hw fence will be replaced here */
>>          dma_fence_put(job->fence);
>>          job->fence = dma_fence_get(fence);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>> index f8de949d2510..b4a9e0478f25 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>> @@ -1087,8 +1087,6 @@ void amdgpu_driver_postclose_kms(struct drm_device *dev,
>>          if (!fpriv)
>>                  return;
>>
>> -       down_read(&adev->reset_sem);
>> -
>>          pm_runtime_get_sync(dev->dev);
>>
>>          if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_UVD) != NULL)
>> @@ -1127,8 +1125,6 @@ void amdgpu_driver_postclose_kms(struct drm_device *dev,
>>
>>          pm_runtime_mark_last_busy(dev->dev);
>>          pm_runtime_put_autosuspend(dev->dev);
>> -
>> -       up_read(&adev->reset_sem);
>>   }
>>
>>   /*
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c
>> index 1705e328c6fc..65ad174bb976 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c
>> @@ -163,7 +163,7 @@ static ssize_t amdgpu_get_power_dpm_state(struct device *dev,
>>          enum amd_pm_state_type pm;
>>          int ret;
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EPERM;
>>
>>          ret = pm_runtime_get_sync(ddev->dev);
>> @@ -172,8 +172,6 @@ static ssize_t amdgpu_get_power_dpm_state(struct device *dev,
>>                  return ret;
>>          }
>>
>> -       down_read(&adev->reset_sem);
>> -
>>          if (is_support_sw_smu(adev)) {
>>                  if (adev->smu.ppt_funcs->get_current_power_state)
>>                          pm = smu_get_current_power_state(&adev->smu);
>> @@ -185,8 +183,6 @@ static ssize_t amdgpu_get_power_dpm_state(struct device *dev,
>>                  pm = adev->pm.dpm.user_state;
>>          }
>>
>> -       up_read(&adev->reset_sem);
>> -
>>          pm_runtime_mark_last_busy(ddev->dev);
>>          pm_runtime_put_autosuspend(ddev->dev);
>>
>> @@ -205,7 +201,7 @@ static ssize_t amdgpu_set_power_dpm_state(struct device *dev,
>>          enum amd_pm_state_type  state;
>>          int ret;
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EPERM;
>>
>>          if (strncmp("battery", buf, strlen("battery")) == 0)
>> @@ -223,8 +219,6 @@ static ssize_t amdgpu_set_power_dpm_state(struct device *dev,
>>                  return ret;
>>          }
>>
>> -       down_read(&adev->reset_sem);
>> -
>>          if (is_support_sw_smu(adev)) {
>>                  mutex_lock(&adev->pm.mutex);
>>                  adev->pm.dpm.user_state = state;
>> @@ -238,9 +232,6 @@ static ssize_t amdgpu_set_power_dpm_state(struct device *dev,
>>
>>                  amdgpu_pm_compute_clocks(adev);
>>          }
>> -
>> -       up_read(&adev->reset_sem);
>> -
>>          pm_runtime_mark_last_busy(ddev->dev);
>>          pm_runtime_put_autosuspend(ddev->dev);
>>
>> @@ -316,7 +307,7 @@ static ssize_t amdgpu_get_power_dpm_force_performance_level(struct device *dev,
>>          enum amd_dpm_forced_level level = 0xff;
>>          int ret;
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EPERM;
>>
>>          ret = pm_runtime_get_sync(ddev->dev);
>> @@ -325,8 +316,6 @@ static ssize_t amdgpu_get_power_dpm_force_performance_level(struct device *dev,
>>                  return ret;
>>          }
>>
>> -       down_read(&adev->reset_sem);
>> -
>>          if (is_support_sw_smu(adev))
>>                  level = smu_get_performance_level(&adev->smu);
>>          else if (adev->powerplay.pp_funcs->get_performance_level)
>> @@ -334,8 +323,6 @@ static ssize_t amdgpu_get_power_dpm_force_performance_level(struct device *dev,
>>          else
>>                  level = adev->pm.dpm.forced_level;
>>
>> -       up_read(&adev->reset_sem);
>> -
>>          pm_runtime_mark_last_busy(ddev->dev);
>>          pm_runtime_put_autosuspend(ddev->dev);
>>
>> @@ -362,7 +349,7 @@ static ssize_t amdgpu_set_power_dpm_force_performance_level(struct device *dev,
>>          enum amd_dpm_forced_level current_level = 0xff;
>>          int ret = 0;
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EPERM;
>>
>>          if (strncmp("low", buf, strlen("low")) == 0) {
>> @@ -393,8 +380,6 @@ static ssize_t amdgpu_set_power_dpm_force_performance_level(struct device *dev,
>>                  return ret;
>>          }
>>
>> -       down_read(&adev->reset_sem);
>> -
>>          if (is_support_sw_smu(adev))
>>                  current_level = smu_get_performance_level(&adev->smu);
>>          else if (adev->powerplay.pp_funcs->get_performance_level)
>> @@ -403,8 +388,7 @@ static ssize_t amdgpu_set_power_dpm_force_performance_level(struct device *dev,
>>          if (current_level == level) {
>>                  pm_runtime_mark_last_busy(ddev->dev);
>>                  pm_runtime_put_autosuspend(ddev->dev);
>> -               ret = count;
>> -               goto pro_end;
>> +               return count;
>>          }
>>
>>          if (adev->asic_type == CHIP_RAVEN) {
>> @@ -425,8 +409,7 @@ static ssize_t amdgpu_set_power_dpm_force_performance_level(struct device *dev,
>>                  pr_err("Currently not in any profile mode!\n");
>>                  pm_runtime_mark_last_busy(ddev->dev);
>>                  pm_runtime_put_autosuspend(ddev->dev);
>> -               ret = -EINVAL;
>> -               goto pro_end;
>> +               return -EINVAL;
>>          }
>>
>>          if (is_support_sw_smu(adev)) {
>> @@ -434,8 +417,7 @@ static ssize_t amdgpu_set_power_dpm_force_performance_level(struct device *dev,
>>                  if (ret) {
>>                          pm_runtime_mark_last_busy(ddev->dev);
>>                          pm_runtime_put_autosuspend(ddev->dev);
>> -                       ret = -EINVAL;
>> -                       goto pro_end;
>> +                       return -EINVAL;
>>                  }
>>          } else if (adev->powerplay.pp_funcs->force_performance_level) {
>>                  mutex_lock(&adev->pm.mutex);
>> @@ -443,16 +425,14 @@ static ssize_t amdgpu_set_power_dpm_force_performance_level(struct device *dev,
>>                          mutex_unlock(&adev->pm.mutex);
>>                          pm_runtime_mark_last_busy(ddev->dev);
>>                          pm_runtime_put_autosuspend(ddev->dev);
>> -                       ret = -EINVAL;
>> -                       goto pro_end;
>> +                       return -EINVAL;
>>                  }
>>                  ret = amdgpu_dpm_force_performance_level(adev, level);
>>                  if (ret) {
>>                          mutex_unlock(&adev->pm.mutex);
>>                          pm_runtime_mark_last_busy(ddev->dev);
>>                          pm_runtime_put_autosuspend(ddev->dev);
>> -                       ret = -EINVAL;
>> -                       goto pro_end;
>> +                       return -EINVAL;
>>                  } else {
>>                          adev->pm.dpm.forced_level = level;
>>                  }
>> @@ -461,9 +441,7 @@ static ssize_t amdgpu_set_power_dpm_force_performance_level(struct device *dev,
>>          pm_runtime_mark_last_busy(ddev->dev);
>>          pm_runtime_put_autosuspend(ddev->dev);
>>
>> -pro_end:
>> -       up_read(&adev->reset_sem);
>> -       return ret;
>> +       return count;
>>   }
>>
>>   static ssize_t amdgpu_get_pp_num_states(struct device *dev,
>> @@ -475,7 +453,7 @@ static ssize_t amdgpu_get_pp_num_states(struct device *dev,
>>          struct pp_states_info data;
>>          int i, buf_len, ret;
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EPERM;
>>
>>          ret = pm_runtime_get_sync(ddev->dev);
>> @@ -519,7 +497,7 @@ static ssize_t amdgpu_get_pp_cur_state(struct device *dev,
>>          enum amd_pm_state_type pm = 0;
>>          int i = 0, ret = 0;
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EPERM;
>>
>>          ret = pm_runtime_get_sync(ddev->dev);
>> @@ -560,7 +538,7 @@ static ssize_t amdgpu_get_pp_force_state(struct device *dev,
>>          struct drm_device *ddev = dev_get_drvdata(dev);
>>          struct amdgpu_device *adev = ddev->dev_private;
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EPERM;
>>
>>          if (adev->pp_force_state_enabled)
>> @@ -580,7 +558,7 @@ static ssize_t amdgpu_set_pp_force_state(struct device *dev,
>>          unsigned long idx;
>>          int ret;
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EPERM;
>>
>>          if (strlen(buf) == 1)
>> @@ -606,7 +584,6 @@ static ssize_t amdgpu_set_pp_force_state(struct device *dev,
>>                          return ret;
>>                  }
>>
>> -               down_read(&adev->reset_sem);
>>                  /* only set user selected power states */
>>                  if (state != POWER_STATE_TYPE_INTERNAL_BOOT &&
>>                      state != POWER_STATE_TYPE_DEFAULT) {
>> @@ -614,8 +591,6 @@ static ssize_t amdgpu_set_pp_force_state(struct device *dev,
>>                                          AMD_PP_TASK_ENABLE_USER_STATE, &state);
>>                          adev->pp_force_state_enabled = true;
>>                  }
>> -               up_read(&adev->reset_sem);
>> -
>>                  pm_runtime_mark_last_busy(ddev->dev);
>>                  pm_runtime_put_autosuspend(ddev->dev);
>>          }
>> @@ -643,7 +618,7 @@ static ssize_t amdgpu_get_pp_table(struct device *dev,
>>          char *table = NULL;
>>          int size, ret;
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EPERM;
>>
>>          ret = pm_runtime_get_sync(ddev->dev);
>> @@ -687,7 +662,7 @@ static ssize_t amdgpu_set_pp_table(struct device *dev,
>>          struct amdgpu_device *adev = ddev->dev_private;
>>          int ret = 0;
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EPERM;
>>
>>          ret = pm_runtime_get_sync(ddev->dev);
>> @@ -696,21 +671,16 @@ static ssize_t amdgpu_set_pp_table(struct device *dev,
>>                  return ret;
>>          }
>>
>> -       down_read(&adev->reset_sem);
>> -
>>          if (is_support_sw_smu(adev)) {
>>                  ret = smu_sys_set_pp_table(&adev->smu, (void *)buf, count);
>>                  if (ret) {
>>                          pm_runtime_mark_last_busy(ddev->dev);
>>                          pm_runtime_put_autosuspend(ddev->dev);
>> -                       up_read(&adev->reset_sem);
>>                          return ret;
>>                  }
>>          } else if (adev->powerplay.pp_funcs->set_pp_table)
>>                  amdgpu_dpm_set_pp_table(adev, buf, count);
>>
>> -       up_read(&adev->reset_sem);
>> -
>>          pm_runtime_mark_last_busy(ddev->dev);
>>          pm_runtime_put_autosuspend(ddev->dev);
>>
>> @@ -845,7 +815,7 @@ static ssize_t amdgpu_set_pp_od_clk_voltage(struct device *dev,
>>          const char delimiter[3] = {' ', '\n', '\0'};
>>          uint32_t type;
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EPERM;
>>
>>          if (count > 127)
>> @@ -889,10 +859,6 @@ static ssize_t amdgpu_set_pp_od_clk_voltage(struct device *dev,
>>                  return ret;
>>          }
>>
>> -       ret = count;
>> -
>> -       down_read(&adev->reset_sem);
>> -
>>          if (is_support_sw_smu(adev)) {
>>                  ret = smu_od_edit_dpm_table(&adev->smu, type,
>>                                              parameter, parameter_size);
>> @@ -900,8 +866,7 @@ static ssize_t amdgpu_set_pp_od_clk_voltage(struct device *dev,
>>                  if (ret) {
>>                          pm_runtime_mark_last_busy(ddev->dev);
>>                          pm_runtime_put_autosuspend(ddev->dev);
>> -                       ret = -EINVAL;
>> -                       goto pro_end;
>> +                       return -EINVAL;
>>                  }
>>          } else {
>>                  if (adev->powerplay.pp_funcs->odn_edit_dpm_table) {
>> @@ -910,8 +875,7 @@ static ssize_t amdgpu_set_pp_od_clk_voltage(struct device *dev,
>>                          if (ret) {
>>                                  pm_runtime_mark_last_busy(ddev->dev);
>>                                  pm_runtime_put_autosuspend(ddev->dev);
>> -                               ret = -EINVAL;
>> -                               goto pro_end;
>> +                               return -EINVAL;
>>                          }
>>                  }
>>
>> @@ -922,22 +886,18 @@ static ssize_t amdgpu_set_pp_od_clk_voltage(struct device *dev,
>>                                                  NULL);
>>                                  pm_runtime_mark_last_busy(ddev->dev);
>>                                  pm_runtime_put_autosuspend(ddev->dev);
>> -                               ret = count;
>> -                               goto pro_end;
>> +                               return count;
>>                          } else {
>>                                  pm_runtime_mark_last_busy(ddev->dev);
>>                                  pm_runtime_put_autosuspend(ddev->dev);
>> -                               ret = -EINVAL;
>> -                               goto pro_end;
>> +                               return -EINVAL;
>>                          }
>>                  }
>>          }
>>          pm_runtime_mark_last_busy(ddev->dev);
>>          pm_runtime_put_autosuspend(ddev->dev);
>>
>> -pro_end:
>> -       up_read(&adev->reset_sem);
>> -       return ret;
>> +       return count;
>>   }
>>
>>   static ssize_t amdgpu_get_pp_od_clk_voltage(struct device *dev,
>> @@ -949,7 +909,7 @@ static ssize_t amdgpu_get_pp_od_clk_voltage(struct device *dev,
>>          ssize_t size;
>>          int ret;
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EPERM;
>>
>>          ret = pm_runtime_get_sync(ddev->dev);
>> @@ -1003,7 +963,7 @@ static ssize_t amdgpu_set_pp_features(struct device *dev,
>>          uint64_t featuremask;
>>          int ret;
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EPERM;
>>
>>          ret = kstrtou64(buf, 0, &featuremask);
>> @@ -1018,13 +978,11 @@ static ssize_t amdgpu_set_pp_features(struct device *dev,
>>                  return ret;
>>          }
>>
>> -       down_read(&adev->reset_sem);
>>          if (is_support_sw_smu(adev)) {
>>                  ret = smu_sys_set_pp_feature_mask(&adev->smu, featuremask);
>>                  if (ret) {
>>                          pm_runtime_mark_last_busy(ddev->dev);
>>                          pm_runtime_put_autosuspend(ddev->dev);
>> -                       up_read(&adev->reset_sem);
>>                          return -EINVAL;
>>                  }
>>          } else if (adev->powerplay.pp_funcs->set_ppfeature_status) {
>> @@ -1032,12 +990,9 @@ static ssize_t amdgpu_set_pp_features(struct device *dev,
>>                  if (ret) {
>>                          pm_runtime_mark_last_busy(ddev->dev);
>>                          pm_runtime_put_autosuspend(ddev->dev);
>> -                       up_read(&adev->reset_sem);
>>                          return -EINVAL;
>>                  }
>>          }
>> -       up_read(&adev->reset_sem);
>> -
>>          pm_runtime_mark_last_busy(ddev->dev);
>>          pm_runtime_put_autosuspend(ddev->dev);
>>
>> @@ -1053,7 +1008,7 @@ static ssize_t amdgpu_get_pp_features(struct device *dev,
>>          ssize_t size;
>>          int ret;
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EPERM;
>>
>>          ret = pm_runtime_get_sync(ddev->dev);
>> @@ -1062,8 +1017,6 @@ static ssize_t amdgpu_get_pp_features(struct device *dev,
>>                  return ret;
>>          }
>>
>> -       down_read(&adev->reset_sem);
>> -
>>          if (is_support_sw_smu(adev))
>>                  size = smu_sys_get_pp_feature_mask(&adev->smu, buf);
>>          else if (adev->powerplay.pp_funcs->get_ppfeature_status)
>> @@ -1071,8 +1024,6 @@ static ssize_t amdgpu_get_pp_features(struct device *dev,
>>          else
>>                  size = snprintf(buf, PAGE_SIZE, "\n");
>>
>> -       up_read(&adev->reset_sem);
>> -
>>          pm_runtime_mark_last_busy(ddev->dev);
>>          pm_runtime_put_autosuspend(ddev->dev);
>>
>> @@ -1118,7 +1069,7 @@ static ssize_t amdgpu_get_pp_dpm_sclk(struct device *dev,
>>          ssize_t size;
>>          int ret;
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EPERM;
>>
>>          ret = pm_runtime_get_sync(ddev->dev);
>> @@ -1127,8 +1078,6 @@ static ssize_t amdgpu_get_pp_dpm_sclk(struct device *dev,
>>                  return ret;
>>          }
>>
>> -       down_read(&adev->reset_sem);
>> -
>>          if (is_support_sw_smu(adev))
>>                  size = smu_print_clk_levels(&adev->smu, SMU_SCLK, buf);
>>          else if (adev->powerplay.pp_funcs->print_clock_levels)
>> @@ -1136,8 +1085,6 @@ static ssize_t amdgpu_get_pp_dpm_sclk(struct device *dev,
>>          else
>>                  size = snprintf(buf, PAGE_SIZE, "\n");
>>
>> -       up_read(&adev->reset_sem);
>> -
>>          pm_runtime_mark_last_busy(ddev->dev);
>>          pm_runtime_put_autosuspend(ddev->dev);
>>
>> @@ -1190,7 +1137,7 @@ static ssize_t amdgpu_set_pp_dpm_sclk(struct device *dev,
>>          int ret;
>>          uint32_t mask = 0;
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EPERM;
>>
>>          ret = amdgpu_read_mask(buf, count, &mask);
>> @@ -1203,15 +1150,11 @@ static ssize_t amdgpu_set_pp_dpm_sclk(struct device *dev,
>>                  return ret;
>>          }
>>
>> -       down_read(&adev->reset_sem);
>> -
>>          if (is_support_sw_smu(adev))
>>                  ret = smu_force_clk_levels(&adev->smu, SMU_SCLK, mask);
>>          else if (adev->powerplay.pp_funcs->force_clock_level)
>>                  ret = amdgpu_dpm_force_clock_level(adev, PP_SCLK, mask);
>>
>> -       up_read(&adev->reset_sem);
>> -
>>          pm_runtime_mark_last_busy(ddev->dev);
>>          pm_runtime_put_autosuspend(ddev->dev);
>>
>> @@ -1230,7 +1173,7 @@ static ssize_t amdgpu_get_pp_dpm_mclk(struct device *dev,
>>          ssize_t size;
>>          int ret;
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EPERM;
>>
>>          ret = pm_runtime_get_sync(ddev->dev);
>> @@ -1239,8 +1182,6 @@ static ssize_t amdgpu_get_pp_dpm_mclk(struct device *dev,
>>                  return ret;
>>          }
>>
>> -       down_read(&adev->reset_sem);
>> -
>>          if (is_support_sw_smu(adev))
>>                  size = smu_print_clk_levels(&adev->smu, SMU_MCLK, buf);
>>          else if (adev->powerplay.pp_funcs->print_clock_levels)
>> @@ -1248,8 +1189,6 @@ static ssize_t amdgpu_get_pp_dpm_mclk(struct device *dev,
>>          else
>>                  size = snprintf(buf, PAGE_SIZE, "\n");
>>
>> -       up_read(&adev->reset_sem);
>> -
>>          pm_runtime_mark_last_busy(ddev->dev);
>>          pm_runtime_put_autosuspend(ddev->dev);
>>
>> @@ -1266,7 +1205,7 @@ static ssize_t amdgpu_set_pp_dpm_mclk(struct device *dev,
>>          uint32_t mask = 0;
>>          int ret;
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EPERM;
>>
>>          ret = amdgpu_read_mask(buf, count, &mask);
>> @@ -1279,15 +1218,11 @@ static ssize_t amdgpu_set_pp_dpm_mclk(struct device *dev,
>>                  return ret;
>>          }
>>
>> -       down_read(&adev->reset_sem);
>> -
>>          if (is_support_sw_smu(adev))
>>                  ret = smu_force_clk_levels(&adev->smu, SMU_MCLK, mask);
>>          else if (adev->powerplay.pp_funcs->force_clock_level)
>>                  ret = amdgpu_dpm_force_clock_level(adev, PP_MCLK, mask);
>>
>> -       up_read(&adev->reset_sem);
>> -
>>          pm_runtime_mark_last_busy(ddev->dev);
>>          pm_runtime_put_autosuspend(ddev->dev);
>>
>> @@ -1306,7 +1241,7 @@ static ssize_t amdgpu_get_pp_dpm_socclk(struct device *dev,
>>          ssize_t size;
>>          int ret;
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EPERM;
>>
>>          ret = pm_runtime_get_sync(ddev->dev);
>> @@ -1315,8 +1250,6 @@ static ssize_t amdgpu_get_pp_dpm_socclk(struct device *dev,
>>                  return ret;
>>          }
>>
>> -       down_read(&adev->reset_sem);
>> -
>>          if (is_support_sw_smu(adev))
>>                  size = smu_print_clk_levels(&adev->smu, SMU_SOCCLK, buf);
>>          else if (adev->powerplay.pp_funcs->print_clock_levels)
>> @@ -1324,8 +1257,6 @@ static ssize_t amdgpu_get_pp_dpm_socclk(struct device *dev,
>>          else
>>                  size = snprintf(buf, PAGE_SIZE, "\n");
>>
>> -       up_read(&adev->reset_sem);
>> -
>>          pm_runtime_mark_last_busy(ddev->dev);
>>          pm_runtime_put_autosuspend(ddev->dev);
>>
>> @@ -1342,7 +1273,7 @@ static ssize_t amdgpu_set_pp_dpm_socclk(struct device *dev,
>>          int ret;
>>          uint32_t mask = 0;
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EPERM;
>>
>>          ret = amdgpu_read_mask(buf, count, &mask);
>> @@ -1355,8 +1286,6 @@ static ssize_t amdgpu_set_pp_dpm_socclk(struct device *dev,
>>                  return ret;
>>          }
>>
>> -       down_read(&adev->reset_sem);
>> -
>>          if (is_support_sw_smu(adev))
>>                  ret = smu_force_clk_levels(&adev->smu, SMU_SOCCLK, mask);
>>          else if (adev->powerplay.pp_funcs->force_clock_level)
>> @@ -1364,8 +1293,6 @@ static ssize_t amdgpu_set_pp_dpm_socclk(struct device *dev,
>>          else
>>                  ret = 0;
>>
>> -       up_read(&adev->reset_sem);
>> -
>>          pm_runtime_mark_last_busy(ddev->dev);
>>          pm_runtime_put_autosuspend(ddev->dev);
>>
>> @@ -1384,7 +1311,7 @@ static ssize_t amdgpu_get_pp_dpm_fclk(struct device *dev,
>>          ssize_t size;
>>          int ret;
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EPERM;
>>
>>          ret = pm_runtime_get_sync(ddev->dev);
>> @@ -1393,8 +1320,6 @@ static ssize_t amdgpu_get_pp_dpm_fclk(struct device *dev,
>>                  return ret;
>>          }
>>
>> -       down_read(&adev->reset_sem);
>> -
>>          if (is_support_sw_smu(adev))
>>                  size = smu_print_clk_levels(&adev->smu, SMU_FCLK, buf);
>>          else if (adev->powerplay.pp_funcs->print_clock_levels)
>> @@ -1402,8 +1327,6 @@ static ssize_t amdgpu_get_pp_dpm_fclk(struct device *dev,
>>          else
>>                  size = snprintf(buf, PAGE_SIZE, "\n");
>>
>> -       up_read(&adev->reset_sem);
>> -
>>          pm_runtime_mark_last_busy(ddev->dev);
>>          pm_runtime_put_autosuspend(ddev->dev);
>>
>> @@ -1420,7 +1343,7 @@ static ssize_t amdgpu_set_pp_dpm_fclk(struct device *dev,
>>          int ret;
>>          uint32_t mask = 0;
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EPERM;
>>
>>          ret = amdgpu_read_mask(buf, count, &mask);
>> @@ -1433,8 +1356,6 @@ static ssize_t amdgpu_set_pp_dpm_fclk(struct device *dev,
>>                  return ret;
>>          }
>>
>> -       down_read(&adev->reset_sem);
>> -
>>          if (is_support_sw_smu(adev))
>>                  ret = smu_force_clk_levels(&adev->smu, SMU_FCLK, mask);
>>          else if (adev->powerplay.pp_funcs->force_clock_level)
>> @@ -1442,8 +1363,6 @@ static ssize_t amdgpu_set_pp_dpm_fclk(struct device *dev,
>>          else
>>                  ret = 0;
>>
>> -       up_read(&adev->reset_sem);
>> -
>>          pm_runtime_mark_last_busy(ddev->dev);
>>          pm_runtime_put_autosuspend(ddev->dev);
>>
>> @@ -1462,7 +1381,7 @@ static ssize_t amdgpu_get_pp_dpm_dcefclk(struct device *dev,
>>          ssize_t size;
>>          int ret;
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EPERM;
>>
>>          ret = pm_runtime_get_sync(ddev->dev);
>> @@ -1471,8 +1390,6 @@ static ssize_t amdgpu_get_pp_dpm_dcefclk(struct device *dev,
>>                  return ret;
>>          }
>>
>> -       down_read(&adev->reset_sem);
>> -
>>          if (is_support_sw_smu(adev))
>>                  size = smu_print_clk_levels(&adev->smu, SMU_DCEFCLK, buf);
>>          else if (adev->powerplay.pp_funcs->print_clock_levels)
>> @@ -1480,8 +1397,6 @@ static ssize_t amdgpu_get_pp_dpm_dcefclk(struct device *dev,
>>          else
>>                  size = snprintf(buf, PAGE_SIZE, "\n");
>>
>> -       up_read(&adev->reset_sem);
>> -
>>          pm_runtime_mark_last_busy(ddev->dev);
>>          pm_runtime_put_autosuspend(ddev->dev);
>>
>> @@ -1498,7 +1413,7 @@ static ssize_t amdgpu_set_pp_dpm_dcefclk(struct device *dev,
>>          int ret;
>>          uint32_t mask = 0;
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EPERM;
>>
>>          ret = amdgpu_read_mask(buf, count, &mask);
>> @@ -1511,8 +1426,6 @@ static ssize_t amdgpu_set_pp_dpm_dcefclk(struct device *dev,
>>                  return ret;
>>          }
>>
>> -       down_read(&adev->reset_sem);
>> -
>>          if (is_support_sw_smu(adev))
>>                  ret = smu_force_clk_levels(&adev->smu, SMU_DCEFCLK, mask);
>>          else if (adev->powerplay.pp_funcs->force_clock_level)
>> @@ -1520,8 +1433,6 @@ static ssize_t amdgpu_set_pp_dpm_dcefclk(struct device *dev,
>>          else
>>                  ret = 0;
>>
>> -       up_read(&adev->reset_sem);
>> -
>>          pm_runtime_mark_last_busy(ddev->dev);
>>          pm_runtime_put_autosuspend(ddev->dev);
>>
>> @@ -1540,7 +1451,7 @@ static ssize_t amdgpu_get_pp_dpm_pcie(struct device *dev,
>>          ssize_t size;
>>          int ret;
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EPERM;
>>
>>          ret = pm_runtime_get_sync(ddev->dev);
>> @@ -1549,8 +1460,6 @@ static ssize_t amdgpu_get_pp_dpm_pcie(struct device *dev,
>>                  return ret;
>>          }
>>
>> -       down_read(&adev->reset_sem);
>> -
>>          if (is_support_sw_smu(adev))
>>                  size = smu_print_clk_levels(&adev->smu, SMU_PCIE, buf);
>>          else if (adev->powerplay.pp_funcs->print_clock_levels)
>> @@ -1558,8 +1467,6 @@ static ssize_t amdgpu_get_pp_dpm_pcie(struct device *dev,
>>          else
>>                  size = snprintf(buf, PAGE_SIZE, "\n");
>>
>> -       up_read(&adev->reset_sem);
>> -
>>          pm_runtime_mark_last_busy(ddev->dev);
>>          pm_runtime_put_autosuspend(ddev->dev);
>>
>> @@ -1576,7 +1483,7 @@ static ssize_t amdgpu_set_pp_dpm_pcie(struct device *dev,
>>          int ret;
>>          uint32_t mask = 0;
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EPERM;
>>
>>          ret = amdgpu_read_mask(buf, count, &mask);
>> @@ -1589,8 +1496,6 @@ static ssize_t amdgpu_set_pp_dpm_pcie(struct device *dev,
>>                  return ret;
>>          }
>>
>> -       down_read(&adev->reset_sem);
>> -
>>          if (is_support_sw_smu(adev))
>>                  ret = smu_force_clk_levels(&adev->smu, SMU_PCIE, mask);
>>          else if (adev->powerplay.pp_funcs->force_clock_level)
>> @@ -1598,8 +1503,6 @@ static ssize_t amdgpu_set_pp_dpm_pcie(struct device *dev,
>>          else
>>                  ret = 0;
>>
>> -       up_read(&adev->reset_sem);
>> -
>>          pm_runtime_mark_last_busy(ddev->dev);
>>          pm_runtime_put_autosuspend(ddev->dev);
>>
>> @@ -1618,7 +1521,7 @@ static ssize_t amdgpu_get_pp_sclk_od(struct device *dev,
>>          uint32_t value = 0;
>>          int ret;
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EPERM;
>>
>>          ret = pm_runtime_get_sync(ddev->dev);
>> @@ -1627,15 +1530,11 @@ static ssize_t amdgpu_get_pp_sclk_od(struct device *dev,
>>                  return ret;
>>          }
>>
>> -       down_read(&adev->reset_sem);
>> -
>>          if (is_support_sw_smu(adev))
>>                  value = smu_get_od_percentage(&(adev->smu), SMU_OD_SCLK);
>>          else if (adev->powerplay.pp_funcs->get_sclk_od)
>>                  value = amdgpu_dpm_get_sclk_od(adev);
>>
>> -       up_read(&adev->reset_sem);
>> -
>>          pm_runtime_mark_last_busy(ddev->dev);
>>          pm_runtime_put_autosuspend(ddev->dev);
>>
>> @@ -1652,7 +1551,7 @@ static ssize_t amdgpu_set_pp_sclk_od(struct device *dev,
>>          int ret;
>>          long int value;
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EPERM;
>>
>>          ret = kstrtol(buf, 0, &value);
>> @@ -1666,8 +1565,6 @@ static ssize_t amdgpu_set_pp_sclk_od(struct device *dev,
>>                  return ret;
>>          }
>>
>> -       down_read(&adev->reset_sem);
>> -
>>          if (is_support_sw_smu(adev)) {
>>                  value = smu_set_od_percentage(&(adev->smu), SMU_OD_SCLK, (uint32_t)value);
>>          } else {
>> @@ -1682,8 +1579,6 @@ static ssize_t amdgpu_set_pp_sclk_od(struct device *dev,
>>                  }
>>          }
>>
>> -       up_read(&adev->reset_sem);
>> -
>>          pm_runtime_mark_last_busy(ddev->dev);
>>          pm_runtime_put_autosuspend(ddev->dev);
>>
>> @@ -1699,7 +1594,7 @@ static ssize_t amdgpu_get_pp_mclk_od(struct device *dev,
>>          uint32_t value = 0;
>>          int ret;
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EPERM;
>>
>>          ret = pm_runtime_get_sync(ddev->dev);
>> @@ -1708,15 +1603,11 @@ static ssize_t amdgpu_get_pp_mclk_od(struct device *dev,
>>                  return ret;
>>          }
>>
>> -       down_read(&adev->reset_sem);
>> -
>>          if (is_support_sw_smu(adev))
>>                  value = smu_get_od_percentage(&(adev->smu), SMU_OD_MCLK);
>>          else if (adev->powerplay.pp_funcs->get_mclk_od)
>>                  value = amdgpu_dpm_get_mclk_od(adev);
>>
>> -       up_read(&adev->reset_sem);
>> -
>>          pm_runtime_mark_last_busy(ddev->dev);
>>          pm_runtime_put_autosuspend(ddev->dev);
>>
>> @@ -1733,7 +1624,7 @@ static ssize_t amdgpu_set_pp_mclk_od(struct device *dev,
>>          int ret;
>>          long int value;
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EPERM;
>>
>>          ret = kstrtol(buf, 0, &value);
>> @@ -1747,8 +1638,6 @@ static ssize_t amdgpu_set_pp_mclk_od(struct device *dev,
>>                  return ret;
>>          }
>>
>> -       down_read(&adev->reset_sem);
>> -
>>          if (is_support_sw_smu(adev)) {
>>                  value = smu_set_od_percentage(&(adev->smu), SMU_OD_MCLK, (uint32_t)value);
>>          } else {
>> @@ -1763,8 +1652,6 @@ static ssize_t amdgpu_set_pp_mclk_od(struct device *dev,
>>                  }
>>          }
>>
>> -       up_read(&adev->reset_sem);
>> -
>>          pm_runtime_mark_last_busy(ddev->dev);
>>          pm_runtime_put_autosuspend(ddev->dev);
>>
>> @@ -1800,7 +1687,7 @@ static ssize_t amdgpu_get_pp_power_profile_mode(struct device *dev,
>>          ssize_t size;
>>          int ret;
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EPERM;
>>
>>          ret = pm_runtime_get_sync(ddev->dev);
>> @@ -1809,8 +1696,6 @@ static ssize_t amdgpu_get_pp_power_profile_mode(struct device *dev,
>>                  return ret;
>>          }
>>
>> -       down_read(&adev->reset_sem);
>> -
>>          if (is_support_sw_smu(adev))
>>                  size = smu_get_power_profile_mode(&adev->smu, buf);
>>          else if (adev->powerplay.pp_funcs->get_power_profile_mode)
>> @@ -1818,8 +1703,6 @@ static ssize_t amdgpu_get_pp_power_profile_mode(struct device *dev,
>>          else
>>                  size = snprintf(buf, PAGE_SIZE, "\n");
>>
>> -       up_read(&adev->reset_sem);
>> -
>>          pm_runtime_mark_last_busy(ddev->dev);
>>          pm_runtime_put_autosuspend(ddev->dev);
>>
>> @@ -1844,7 +1727,7 @@ static ssize_t amdgpu_set_pp_power_profile_mode(struct device *dev,
>>          long int profile_mode = 0;
>>          const char delimiter[3] = {' ', '\n', '\0'};
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EPERM;
>>
>>          tmp[0] = *(buf);
>> @@ -1878,15 +1761,11 @@ static ssize_t amdgpu_set_pp_power_profile_mode(struct device *dev,
>>                  return ret;
>>          }
>>
>> -       down_read(&adev->reset_sem);
>> -
>>          if (is_support_sw_smu(adev))
>>                  ret = smu_set_power_profile_mode(&adev->smu, parameter, parameter_size, true);
>>          else if (adev->powerplay.pp_funcs->set_power_profile_mode)
>>                  ret = amdgpu_dpm_set_power_profile_mode(adev, parameter, parameter_size);
>>
>> -       up_read(&adev->reset_sem);
>> -
>>          pm_runtime_mark_last_busy(ddev->dev);
>>          pm_runtime_put_autosuspend(ddev->dev);
>>
>> @@ -1912,7 +1791,7 @@ static ssize_t amdgpu_get_gpu_busy_percent(struct device *dev,
>>          struct amdgpu_device *adev = ddev->dev_private;
>>          int r, value, size = sizeof(value);
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EPERM;
>>
>>          r = pm_runtime_get_sync(ddev->dev);
>> @@ -1921,11 +1800,9 @@ static ssize_t amdgpu_get_gpu_busy_percent(struct device *dev,
>>                  return r;
>>          }
>>
>> -       down_read(&adev->reset_sem);
>>          /* read the IP busy sensor */
>>          r = amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_GPU_LOAD,
>>                                     (void *)&value, &size);
>> -       up_read(&adev->reset_sem);
>>
>>          pm_runtime_mark_last_busy(ddev->dev);
>>          pm_runtime_put_autosuspend(ddev->dev);
>> @@ -1952,7 +1829,7 @@ static ssize_t amdgpu_get_mem_busy_percent(struct device *dev,
>>          struct amdgpu_device *adev = ddev->dev_private;
>>          int r, value, size = sizeof(value);
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EPERM;
>>
>>          r = pm_runtime_get_sync(ddev->dev);
>> @@ -1961,14 +1838,10 @@ static ssize_t amdgpu_get_mem_busy_percent(struct device *dev,
>>                  return r;
>>          }
>>
>> -       down_read(&adev->reset_sem);
>> -
>>          /* read the IP busy sensor */
>>          r = amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_MEM_LOAD,
>>                                     (void *)&value, &size);
>>
>> -       up_read(&adev->reset_sem);
>> -
>>          pm_runtime_mark_last_busy(ddev->dev);
>>          pm_runtime_put_autosuspend(ddev->dev);
>>
>> @@ -1999,7 +1872,7 @@ static ssize_t amdgpu_get_pcie_bw(struct device *dev,
>>          uint64_t count0 = 0, count1 = 0;
>>          int ret;
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EPERM;
>>
>>          if (adev->flags & AMD_IS_APU)
>> @@ -2014,12 +1887,8 @@ static ssize_t amdgpu_get_pcie_bw(struct device *dev,
>>                  return ret;
>>          }
>>
>> -       down_read(&adev->reset_sem);
>> -
>>          amdgpu_asic_get_pcie_usage(adev, &count0, &count1);
>>
>> -       up_read(&adev->reset_sem);
>> -
>>          pm_runtime_mark_last_busy(ddev->dev);
>>          pm_runtime_put_autosuspend(ddev->dev);
>>
>> @@ -2044,7 +1913,7 @@ static ssize_t amdgpu_get_unique_id(struct device *dev,
>>          struct drm_device *ddev = dev_get_drvdata(dev);
>>          struct amdgpu_device *adev = ddev->dev_private;
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EPERM;
>>
>>          if (adev->unique_id)
>> @@ -2142,7 +2011,7 @@ static ssize_t amdgpu_get_gpu_metrics(struct device *dev,
>>          ssize_t size = 0;
>>          int ret;
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EPERM;
>>
>>          ret = pm_runtime_get_sync(ddev->dev);
>> @@ -2151,12 +2020,10 @@ static ssize_t amdgpu_get_gpu_metrics(struct device *dev,
>>                  return ret;
>>          }
>>
>> -       down_read(&adev->reset_sem);
>>          if (is_support_sw_smu(adev))
>>                  size = smu_sys_get_gpu_metrics(&adev->smu, &gpu_metrics);
>>          else if (adev->powerplay.pp_funcs->get_gpu_metrics)
>>                  size = amdgpu_dpm_get_gpu_metrics(adev, &gpu_metrics);
>> -       up_read(&adev->reset_sem);
>>
>>          if (size <= 0)
>>                  goto out;
>> @@ -2368,7 +2235,7 @@ static ssize_t amdgpu_hwmon_show_temp(struct device *dev,
>>          int channel = to_sensor_dev_attr(attr)->index;
>>          int r, temp = 0, size = sizeof(temp);
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EPERM;
>>
>>          if (channel >= PP_TEMP_MAX)
>> @@ -2380,8 +2247,6 @@ static ssize_t amdgpu_hwmon_show_temp(struct device *dev,
>>                  return r;
>>          }
>>
>> -       down_read(&adev->reset_sem);
>> -
>>          switch (channel) {
>>          case PP_TEMP_JUNCTION:
>>                  /* get current junction temperature */
>> @@ -2403,8 +2268,6 @@ static ssize_t amdgpu_hwmon_show_temp(struct device *dev,
>>                  break;
>>          }
>>
>> -       up_read(&adev->reset_sem);
>> -
>>          pm_runtime_mark_last_busy(adev->ddev->dev);
>>          pm_runtime_put_autosuspend(adev->ddev->dev);
>>
>> @@ -2508,7 +2371,7 @@ static ssize_t amdgpu_hwmon_get_pwm1_enable(struct device *dev,
>>          u32 pwm_mode = 0;
>>          int ret;
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EPERM;
>>
>>          ret = pm_runtime_get_sync(adev->ddev->dev);
>> @@ -2517,23 +2380,18 @@ static ssize_t amdgpu_hwmon_get_pwm1_enable(struct device *dev,
>>                  return ret;
>>          }
>>
>> -       down_read(&adev->reset_sem);
>> -
>>          if (is_support_sw_smu(adev)) {
>>                  pwm_mode = smu_get_fan_control_mode(&adev->smu);
>>          } else {
>>                  if (!adev->powerplay.pp_funcs->get_fan_control_mode) {
>>                          pm_runtime_mark_last_busy(adev->ddev->dev);
>>                          pm_runtime_put_autosuspend(adev->ddev->dev);
>> -                       up_read(&adev->reset_sem);
>>                          return -EINVAL;
>>                  }
>>
>>                  pwm_mode = amdgpu_dpm_get_fan_control_mode(adev);
>>          }
>>
>> -       up_read(&adev->reset_sem);
>> -
>>          pm_runtime_mark_last_busy(adev->ddev->dev);
>>          pm_runtime_put_autosuspend(adev->ddev->dev);
>>
>> @@ -2549,7 +2407,7 @@ static ssize_t amdgpu_hwmon_set_pwm1_enable(struct device *dev,
>>          int err, ret;
>>          int value;
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EPERM;
>>
>>          err = kstrtoint(buf, 10, &value);
>> @@ -2562,23 +2420,18 @@ static ssize_t amdgpu_hwmon_set_pwm1_enable(struct device *dev,
>>                  return ret;
>>          }
>>
>> -       down_read(&adev->reset_sem);
>> -
>>          if (is_support_sw_smu(adev)) {
>>                  smu_set_fan_control_mode(&adev->smu, value);
>>          } else {
>>                  if (!adev->powerplay.pp_funcs->set_fan_control_mode) {
>>                          pm_runtime_mark_last_busy(adev->ddev->dev);
>>                          pm_runtime_put_autosuspend(adev->ddev->dev);
>> -                       up_read(&adev->reset_sem);
>>                          return -EINVAL;
>>                  }
>>
>>                  amdgpu_dpm_set_fan_control_mode(adev, value);
>>          }
>>
>> -       up_read(&adev->reset_sem);
>> -
>>          pm_runtime_mark_last_busy(adev->ddev->dev);
>>          pm_runtime_put_autosuspend(adev->ddev->dev);
>>
>> @@ -2608,7 +2461,7 @@ static ssize_t amdgpu_hwmon_set_pwm1(struct device *dev,
>>          u32 value;
>>          u32 pwm_mode;
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EPERM;
>>
>>          err = pm_runtime_get_sync(adev->ddev->dev);
>> @@ -2617,15 +2470,11 @@ static ssize_t amdgpu_hwmon_set_pwm1(struct device *dev,
>>                  return err;
>>          }
>>
>> -       down_read(&adev->reset_sem);
>> -
>>          if (is_support_sw_smu(adev))
>>                  pwm_mode = smu_get_fan_control_mode(&adev->smu);
>>          else
>>                  pwm_mode = amdgpu_dpm_get_fan_control_mode(adev);
>>
>> -       up_read(&adev->reset_sem);
>> -
>>          if (pwm_mode != AMD_FAN_CTRL_MANUAL) {
>>                  pr_info("manual fan speed control should be enabled first\n");
>>                  pm_runtime_mark_last_busy(adev->ddev->dev);
>> @@ -2666,7 +2515,7 @@ static ssize_t amdgpu_hwmon_get_pwm1(struct device *dev,
>>          int err;
>>          u32 speed = 0;
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EPERM;
>>
>>          err = pm_runtime_get_sync(adev->ddev->dev);
>> @@ -2675,8 +2524,6 @@ static ssize_t amdgpu_hwmon_get_pwm1(struct device *dev,
>>                  return err;
>>          }
>>
>> -       down_read(&adev->reset_sem);
>> -
>>          if (is_support_sw_smu(adev))
>>                  err = smu_get_fan_speed_percent(&adev->smu, &speed);
>>          else if (adev->powerplay.pp_funcs->get_fan_speed_percent)
>> @@ -2684,8 +2531,6 @@ static ssize_t amdgpu_hwmon_get_pwm1(struct device *dev,
>>          else
>>                  err = -EINVAL;
>>
>> -       up_read(&adev->reset_sem);
>> -
>>          pm_runtime_mark_last_busy(adev->ddev->dev);
>>          pm_runtime_put_autosuspend(adev->ddev->dev);
>>
>> @@ -2705,7 +2550,7 @@ static ssize_t amdgpu_hwmon_get_fan1_input(struct device *dev,
>>          int err;
>>          u32 speed = 0;
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EPERM;
>>
>>          err = pm_runtime_get_sync(adev->ddev->dev);
>> @@ -2714,8 +2559,6 @@ static ssize_t amdgpu_hwmon_get_fan1_input(struct device *dev,
>>                  return err;
>>          }
>>
>> -       down_read(&adev->reset_sem);
>> -
>>          if (is_support_sw_smu(adev))
>>                  err = smu_get_fan_speed_rpm(&adev->smu, &speed);
>>          else if (adev->powerplay.pp_funcs->get_fan_speed_rpm)
>> @@ -2723,8 +2566,6 @@ static ssize_t amdgpu_hwmon_get_fan1_input(struct device *dev,
>>          else
>>                  err = -EINVAL;
>>
>> -       up_read(&adev->reset_sem);
>> -
>>          pm_runtime_mark_last_busy(adev->ddev->dev);
>>          pm_runtime_put_autosuspend(adev->ddev->dev);
>>
>> @@ -2743,7 +2584,7 @@ static ssize_t amdgpu_hwmon_get_fan1_min(struct device *dev,
>>          u32 size = sizeof(min_rpm);
>>          int r;
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EPERM;
>>
>>          r = pm_runtime_get_sync(adev->ddev->dev);
>> @@ -2752,13 +2593,9 @@ static ssize_t amdgpu_hwmon_get_fan1_min(struct device *dev,
>>                  return r;
>>          }
>>
>> -       down_read(&adev->reset_sem);
>> -
>>          r = amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_MIN_FAN_RPM,
>>                                     (void *)&min_rpm, &size);
>>
>> -       up_read(&adev->reset_sem);
>> -
>>          pm_runtime_mark_last_busy(adev->ddev->dev);
>>          pm_runtime_put_autosuspend(adev->ddev->dev);
>>
>> @@ -2777,7 +2614,7 @@ static ssize_t amdgpu_hwmon_get_fan1_max(struct device *dev,
>>          u32 size = sizeof(max_rpm);
>>          int r;
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EPERM;
>>
>>          r = pm_runtime_get_sync(adev->ddev->dev);
>> @@ -2786,13 +2623,9 @@ static ssize_t amdgpu_hwmon_get_fan1_max(struct device *dev,
>>                  return r;
>>          }
>>
>> -       down_read(&adev->reset_sem);
>> -
>>          r = amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_MAX_FAN_RPM,
>>                                     (void *)&max_rpm, &size);
>>
>> -       up_read(&adev->reset_sem);
>> -
>>          pm_runtime_mark_last_busy(adev->ddev->dev);
>>          pm_runtime_put_autosuspend(adev->ddev->dev);
>>
>> @@ -2810,7 +2643,7 @@ static ssize_t amdgpu_hwmon_get_fan1_target(struct device *dev,
>>          int err;
>>          u32 rpm = 0;
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EPERM;
>>
>>          err = pm_runtime_get_sync(adev->ddev->dev);
>> @@ -2819,8 +2652,6 @@ static ssize_t amdgpu_hwmon_get_fan1_target(struct device *dev,
>>                  return err;
>>          }
>>
>> -       down_read(&adev->reset_sem);
>> -
>>          if (is_support_sw_smu(adev))
>>                  err = smu_get_fan_speed_rpm(&adev->smu, &rpm);
>>          else if (adev->powerplay.pp_funcs->get_fan_speed_rpm)
>> @@ -2828,8 +2659,6 @@ static ssize_t amdgpu_hwmon_get_fan1_target(struct device *dev,
>>          else
>>                  err = -EINVAL;
>>
>> -       up_read(&adev->reset_sem);
>> -
>>          pm_runtime_mark_last_busy(adev->ddev->dev);
>>          pm_runtime_put_autosuspend(adev->ddev->dev);
>>
>> @@ -2848,7 +2677,7 @@ static ssize_t amdgpu_hwmon_set_fan1_target(struct device *dev,
>>          u32 value;
>>          u32 pwm_mode;
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EPERM;
>>
>>          err = pm_runtime_get_sync(adev->ddev->dev);
>> @@ -2857,15 +2686,11 @@ static ssize_t amdgpu_hwmon_set_fan1_target(struct device *dev,
>>                  return err;
>>          }
>>
>> -       down_read(&adev->reset_sem);
>> -
>>          if (is_support_sw_smu(adev))
>>                  pwm_mode = smu_get_fan_control_mode(&adev->smu);
>>          else
>>                  pwm_mode = amdgpu_dpm_get_fan_control_mode(adev);
>>
>> -       up_read(&adev->reset_sem);
>> -
>>          if (pwm_mode != AMD_FAN_CTRL_MANUAL) {
>>                  pm_runtime_mark_last_busy(adev->ddev->dev);
>>                  pm_runtime_put_autosuspend(adev->ddev->dev);
>> @@ -2879,8 +2704,6 @@ static ssize_t amdgpu_hwmon_set_fan1_target(struct device *dev,
>>                  return err;
>>          }
>>
>> -       down_read(&adev->reset_sem);
>> -
>>          if (is_support_sw_smu(adev))
>>                  err = smu_set_fan_speed_rpm(&adev->smu, value);
>>          else if (adev->powerplay.pp_funcs->set_fan_speed_rpm)
>> @@ -2888,8 +2711,6 @@ static ssize_t amdgpu_hwmon_set_fan1_target(struct device *dev,
>>          else
>>                  err = -EINVAL;
>>
>> -       up_read(&adev->reset_sem);
>> -
>>          pm_runtime_mark_last_busy(adev->ddev->dev);
>>          pm_runtime_put_autosuspend(adev->ddev->dev);
>>
>> @@ -2907,7 +2728,7 @@ static ssize_t amdgpu_hwmon_get_fan1_enable(struct device *dev,
>>          u32 pwm_mode = 0;
>>          int ret;
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EPERM;
>>
>>          ret = pm_runtime_get_sync(adev->ddev->dev);
>> @@ -2916,23 +2737,18 @@ static ssize_t amdgpu_hwmon_get_fan1_enable(struct device *dev,
>>                  return ret;
>>          }
>>
>> -       down_read(&adev->reset_sem);
>> -
>>          if (is_support_sw_smu(adev)) {
>>                  pwm_mode = smu_get_fan_control_mode(&adev->smu);
>>          } else {
>>                  if (!adev->powerplay.pp_funcs->get_fan_control_mode) {
>>                          pm_runtime_mark_last_busy(adev->ddev->dev);
>>                          pm_runtime_put_autosuspend(adev->ddev->dev);
>> -                       up_read(&adev->reset_sem);
>>                          return -EINVAL;
>>                  }
>>
>>                  pwm_mode = amdgpu_dpm_get_fan_control_mode(adev);
>>          }
>>
>> -       up_read(&adev->reset_sem);
>> -
>>          pm_runtime_mark_last_busy(adev->ddev->dev);
>>          pm_runtime_put_autosuspend(adev->ddev->dev);
>>
>> @@ -2949,7 +2765,7 @@ static ssize_t amdgpu_hwmon_set_fan1_enable(struct device *dev,
>>          int value;
>>          u32 pwm_mode;
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EPERM;
>>
>>          err = kstrtoint(buf, 10, &value);
>> @@ -2969,22 +2785,17 @@ static ssize_t amdgpu_hwmon_set_fan1_enable(struct device *dev,
>>                  return err;
>>          }
>>
>> -       down_read(&adev->reset_sem);
>> -
>>          if (is_support_sw_smu(adev)) {
>>                  smu_set_fan_control_mode(&adev->smu, pwm_mode);
>>          } else {
>>                  if (!adev->powerplay.pp_funcs->set_fan_control_mode) {
>>                          pm_runtime_mark_last_busy(adev->ddev->dev);
>>                          pm_runtime_put_autosuspend(adev->ddev->dev);
>> -                       up_read(&adev->reset_sem);
>>                          return -EINVAL;
>>                  }
>>                  amdgpu_dpm_set_fan_control_mode(adev, pwm_mode);
>>          }
>>
>> -       up_read(&adev->reset_sem);
>> -
>>          pm_runtime_mark_last_busy(adev->ddev->dev);
>>          pm_runtime_put_autosuspend(adev->ddev->dev);
>>
>> @@ -2999,7 +2810,7 @@ static ssize_t amdgpu_hwmon_show_vddgfx(struct device *dev,
>>          u32 vddgfx;
>>          int r, size = sizeof(vddgfx);
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EPERM;
>>
>>          r = pm_runtime_get_sync(adev->ddev->dev);
>> @@ -3008,11 +2819,9 @@ static ssize_t amdgpu_hwmon_show_vddgfx(struct device *dev,
>>                  return r;
>>          }
>>
>> -       down_read(&adev->reset_sem);
>>          /* get the voltage */
>>          r = amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_VDDGFX,
>>                                     (void *)&vddgfx, &size);
>> -       up_read(&adev->reset_sem);
>>
>>          pm_runtime_mark_last_busy(adev->ddev->dev);
>>          pm_runtime_put_autosuspend(adev->ddev->dev);
>> @@ -3038,7 +2847,7 @@ static ssize_t amdgpu_hwmon_show_vddnb(struct device *dev,
>>          u32 vddnb;
>>          int r, size = sizeof(vddnb);
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EPERM;
>>
>>          /* only APUs have vddnb */
>> @@ -3051,11 +2860,9 @@ static ssize_t amdgpu_hwmon_show_vddnb(struct device *dev,
>>                  return r;
>>          }
>>
>> -       down_read(&adev->reset_sem);
>>          /* get the voltage */
>>          r = amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_VDDNB,
>>                                     (void *)&vddnb, &size);
>> -       up_read(&adev->reset_sem);
>>
>>          pm_runtime_mark_last_busy(adev->ddev->dev);
>>          pm_runtime_put_autosuspend(adev->ddev->dev);
>> @@ -3082,7 +2889,7 @@ static ssize_t amdgpu_hwmon_show_power_avg(struct device *dev,
>>          int r, size = sizeof(u32);
>>          unsigned uw;
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EPERM;
>>
>>          r = pm_runtime_get_sync(adev->ddev->dev);
>> @@ -3091,11 +2898,9 @@ static ssize_t amdgpu_hwmon_show_power_avg(struct device *dev,
>>                  return r;
>>          }
>>
>> -       down_read(&adev->reset_sem);
>>          /* get the voltage */
>>          r = amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_GPU_POWER,
>>                                     (void *)&query, &size);
>> -       up_read(&adev->reset_sem);
>>
>>          pm_runtime_mark_last_busy(adev->ddev->dev);
>>          pm_runtime_put_autosuspend(adev->ddev->dev);
>> @@ -3125,7 +2930,7 @@ static ssize_t amdgpu_hwmon_show_power_cap_max(struct device *dev,
>>          ssize_t size;
>>          int r;
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EPERM;
>>
>>          r = pm_runtime_get_sync(adev->ddev->dev);
>> @@ -3134,8 +2939,6 @@ static ssize_t amdgpu_hwmon_show_power_cap_max(struct device *dev,
>>                  return r;
>>          }
>>
>> -       down_read(&adev->reset_sem);
>> -
>>          if (is_support_sw_smu(adev)) {
>>                  smu_get_power_limit(&adev->smu, &limit, true);
>>                  size = snprintf(buf, PAGE_SIZE, "%u\n", limit * 1000000);
>> @@ -3146,8 +2949,6 @@ static ssize_t amdgpu_hwmon_show_power_cap_max(struct device *dev,
>>                  size = snprintf(buf, PAGE_SIZE, "\n");
>>          }
>>
>> -       up_read(&adev->reset_sem);
>> -
>>          pm_runtime_mark_last_busy(adev->ddev->dev);
>>          pm_runtime_put_autosuspend(adev->ddev->dev);
>>
>> @@ -3163,7 +2964,7 @@ static ssize_t amdgpu_hwmon_show_power_cap(struct device *dev,
>>          ssize_t size;
>>          int r;
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EPERM;
>>
>>          r = pm_runtime_get_sync(adev->ddev->dev);
>> @@ -3172,8 +2973,6 @@ static ssize_t amdgpu_hwmon_show_power_cap(struct device *dev,
>>                  return r;
>>          }
>>
>> -       down_read(&adev->reset_sem);
>> -
>>          if (is_support_sw_smu(adev)) {
>>                  smu_get_power_limit(&adev->smu, &limit, false);
>>                  size = snprintf(buf, PAGE_SIZE, "%u\n", limit * 1000000);
>> @@ -3184,8 +2983,6 @@ static ssize_t amdgpu_hwmon_show_power_cap(struct device *dev,
>>                  size = snprintf(buf, PAGE_SIZE, "\n");
>>          }
>>
>> -       up_read(&adev->reset_sem);
>> -
>>          pm_runtime_mark_last_busy(adev->ddev->dev);
>>          pm_runtime_put_autosuspend(adev->ddev->dev);
>>
>> @@ -3202,7 +2999,7 @@ static ssize_t amdgpu_hwmon_set_power_cap(struct device *dev,
>>          int err;
>>          u32 value;
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EPERM;
>>
>>          if (amdgpu_sriov_vf(adev))
>> @@ -3221,8 +3018,6 @@ static ssize_t amdgpu_hwmon_set_power_cap(struct device *dev,
>>                  return err;
>>          }
>>
>> -       down_read(&adev->reset_sem);
>> -
>>          if (is_support_sw_smu(adev))
>>                  err = smu_set_power_limit(&adev->smu, value);
>>          else if (adev->powerplay.pp_funcs && adev->powerplay.pp_funcs->set_power_limit)
>> @@ -3230,8 +3025,6 @@ static ssize_t amdgpu_hwmon_set_power_cap(struct device *dev,
>>          else
>>                  err = -EINVAL;
>>
>> -       up_read(&adev->reset_sem);
>> -
>>          pm_runtime_mark_last_busy(adev->ddev->dev);
>>          pm_runtime_put_autosuspend(adev->ddev->dev);
>>
>> @@ -3249,7 +3042,7 @@ static ssize_t amdgpu_hwmon_show_sclk(struct device *dev,
>>          uint32_t sclk;
>>          int r, size = sizeof(sclk);
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EPERM;
>>
>>          r = pm_runtime_get_sync(adev->ddev->dev);
>> @@ -3258,11 +3051,9 @@ static ssize_t amdgpu_hwmon_show_sclk(struct device *dev,
>>                  return r;
>>          }
>>
>> -       down_read(&adev->reset_sem);
>>          /* get the sclk */
>>          r = amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_GFX_SCLK,
>>                                     (void *)&sclk, &size);
>> -       up_read(&adev->reset_sem);
>>
>>          pm_runtime_mark_last_busy(adev->ddev->dev);
>>          pm_runtime_put_autosuspend(adev->ddev->dev);
>> @@ -3288,7 +3079,7 @@ static ssize_t amdgpu_hwmon_show_mclk(struct device *dev,
>>          uint32_t mclk;
>>          int r, size = sizeof(mclk);
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EPERM;
>>
>>          r = pm_runtime_get_sync(adev->ddev->dev);
>> @@ -3297,11 +3088,9 @@ static ssize_t amdgpu_hwmon_show_mclk(struct device *dev,
>>                  return r;
>>          }
>>
>> -       down_read(&adev->reset_sem);
>>          /* get the sclk */
>>          r = amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_GFX_MCLK,
>>                                     (void *)&mclk, &size);
>> -       up_read(&adev->reset_sem);
>>
>>          pm_runtime_mark_last_busy(adev->ddev->dev);
>>          pm_runtime_put_autosuspend(adev->ddev->dev);
>> @@ -4188,7 +3977,7 @@ static int amdgpu_debugfs_pm_info(struct seq_file *m, void *data)
>>          u32 flags = 0;
>>          int r;
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EPERM;
>>
>>          r = pm_runtime_get_sync(dev->dev);
>> @@ -4204,7 +3993,6 @@ static int amdgpu_debugfs_pm_info(struct seq_file *m, void *data)
>>                  return 0;
>>          }
>>
>> -       down_read(&adev->reset_sem);
>>          if (!is_support_sw_smu(adev) &&
>>              adev->powerplay.pp_funcs->debugfs_print_current_performance_level) {
>>                  mutex_lock(&adev->pm.mutex);
>> @@ -4217,13 +4005,10 @@ static int amdgpu_debugfs_pm_info(struct seq_file *m, void *data)
>>          } else {
>>                  r = amdgpu_debugfs_pm_info_pp(m, adev);
>>          }
>> -       up_read(&adev->reset_sem);
>>          if (r)
>>                  goto out;
>>
>> -       down_read(&adev->reset_sem);
>>          amdgpu_device_ip_get_clockgating_state(adev, &flags);
>> -       up_read(&adev->reset_sem);
>>
>>          seq_printf(m, "Clock Gating Flags Mask: 0x%x\n", flags);
>>          amdgpu_parse_cg_state(m, flags);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
>> index 116a89990f39..aa1e77c60c0a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
>> @@ -1869,7 +1869,7 @@ static int psp_load_smu_fw(struct psp_context *psp)
>>                  return 0;
>>
>>
>> -       if (amdgpu_in_reset(adev) && ras && ras->supported) {
>> +       if (adev->in_gpu_reset && ras && ras->supported) {
>>                  ret = amdgpu_dpm_set_mp1_state(adev, PP_MP1_STATE_UNLOAD);
>>                  if (ret) {
>>                          DRM_WARN("Failed to set MP1 state prepare for reload\n");
>> @@ -1984,7 +1984,7 @@ static int psp_load_fw(struct amdgpu_device *adev)
>>          int ret;
>>          struct psp_context *psp = &adev->psp;
>>
>> -       if (amdgpu_sriov_vf(adev) && amdgpu_in_reset(adev)) {
>> +       if (amdgpu_sriov_vf(adev) && adev->in_gpu_reset) {
>>                  psp_ring_stop(psp, PSP_RING_TYPE__KM); /* should not destroy ring, only stop */
>>                  goto skip_memalloc;
>>          }
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
>> index cd1403f83dcf..f09082578865 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
>> @@ -2079,7 +2079,7 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev,
>>                          amdgpu_ras_request_reset_on_boot(adev,
>>                                          ras_block->block);
>>                          return 0;
>> -               } else if (adev->in_suspend || amdgpu_in_reset(adev)) {
>> +               } else if (adev->in_suspend || adev->in_gpu_reset) {
>>                          /* in resume phase, if fail to enable ras,
>>                           * clean up all ras fs nodes, and disable ras */
>>                          goto cleanup;
>> @@ -2088,7 +2088,7 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev,
>>          }
>>
>>          /* in resume phase, no need to create ras fs node */
>> -       if (adev->in_suspend || amdgpu_in_reset(adev))
>> +       if (adev->in_suspend || adev->in_gpu_reset)
>>                  return 0;
>>
>>          if (ih_info->cb) {
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>> index 20fa0497aaa4..1e19d130473f 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>> @@ -2103,7 +2103,7 @@ void amdgpu_ttm_set_buffer_funcs_status(struct amdgpu_device *adev, bool enable)
>>          uint64_t size;
>>          int r;
>>
>> -       if (!adev->mman.initialized || amdgpu_in_reset(adev) ||
>> +       if (!adev->mman.initialized || adev->in_gpu_reset ||
>>              adev->mman.buffer_funcs_enabled == enable)
>>                  return;
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c
>> index 039245c98ff8..183743c5fb7b 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c
>> @@ -628,8 +628,7 @@ int amdgpu_ucode_init_bo(struct amdgpu_device *adev)
>>          struct amdgpu_firmware_info *ucode = NULL;
>>
>>    /* for baremetal, the ucode is allocated in gtt, so don't need to fill the bo when reset/suspend */
>> -       if (!amdgpu_sriov_vf(adev) &&
>> -               (amdgpu_in_reset(adev) || adev->in_suspend))
>> +       if (!amdgpu_sriov_vf(adev) && (adev->in_gpu_reset || adev->in_suspend))
>>                  return 0;
>>          /*
>>           * if SMU loaded firmware, it needn't add SMC, UVD, and VCE
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> index 1e211544f2dc..ae720a6dc5a0 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> @@ -93,7 +93,7 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
>>          amdgpu_ring_undo(ring);
>>          spin_unlock_irqrestore(&kiq->ring_lock, flags);
>>   failed_kiq:
>> -       dev_warn(adev->dev, "failed to write reg %x wait reg %x\n", reg0, reg1);
>> +       pr_err("failed to write reg %x wait reg %x\n", reg0, reg1);
>>   }
>>
>>   /**
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> index b2046c3a404d..f826945989c7 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> @@ -325,9 +325,9 @@ static inline bool is_virtual_machine(void)
>>   #define amdgpu_sriov_is_pp_one_vf(adev) \
>>          ((adev)->virt.gim_feature & AMDGIM_FEATURE_PP_ONE_VF)
>>   #define amdgpu_sriov_is_debug(adev) \
>> -       ((!amdgpu_in_reset(adev)) && adev->virt.tdr_debug)
>> +       ((!adev->in_gpu_reset) && adev->virt.tdr_debug)
>>   #define amdgpu_sriov_is_normal(adev) \
>> -       ((!amdgpu_in_reset(adev)) && (!adev->virt.tdr_debug))
>> +       ((!adev->in_gpu_reset) && (!adev->virt.tdr_debug))
>>
>>   bool amdgpu_virt_mmio_blocked(struct amdgpu_device *adev);
>>   void amdgpu_virt_init_setting(struct amdgpu_device *adev);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>> index 67a756f4337b..cd6e6eb7d966 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>> @@ -372,7 +372,7 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lo
>>          tmp->hive_id = adev->gmc.xgmi.hive_id;
>>          INIT_LIST_HEAD(&tmp->device_list);
>>          mutex_init(&tmp->hive_lock);
>> -       atomic_set(&tmp->in_reset, 0);
>> +       mutex_init(&tmp->reset_lock);
>>          task_barrier_init(&tmp->tb);
>>
>>          if (lock)
>> @@ -397,7 +397,6 @@ int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate)
>>                                                  hive->hi_req_gpu : adev;
>>          bool is_hi_req = pstate == AMDGPU_XGMI_PSTATE_MAX_VEGA20;
>>          bool init_low = hive->pstate == AMDGPU_XGMI_PSTATE_UNKNOWN;
>> -       bool locked;
>>
>>          /* fw bug so temporarily disable pstate switching */
>>          return 0;
>> @@ -405,9 +404,7 @@ int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate)
>>          if (!hive || adev->asic_type != CHIP_VEGA20)
>>                  return 0;
>>
>> -       locked = atomic_read(&hive->in_reset) ? false : true;
>> -       if (locked)
>> -               mutex_lock(&hive->hive_lock);
>> +       mutex_lock(&hive->hive_lock);
>>
>>          if (is_hi_req)
>>                  hive->hi_req_count++;
>> @@ -442,8 +439,7 @@ int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate)
>>                                                          adev : NULL;
>>          }
>>   out:
>> -       if (locked)
>> -               mutex_unlock(&hive->hive_lock);
>> +       mutex_unlock(&hive->hive_lock);
>>          return ret;
>>   }
>>
>> @@ -598,6 +594,7 @@ int amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
>>          if(!(--hive->number_devices)){
>>                  amdgpu_xgmi_sysfs_destroy(adev, hive);
>>                  mutex_destroy(&hive->hive_lock);
>> +               mutex_destroy(&hive->reset_lock);
>>          }
>>
>>          return psp_xgmi_terminate(&adev->psp);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
>> index 61720cd4a1ee..6999eab16a72 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
>> @@ -30,8 +30,7 @@ struct amdgpu_hive_info {
>>          uint64_t                hive_id;
>>          struct list_head        device_list;
>>          int number_devices;
>> -       struct mutex hive_lock;
>> -       atomic_t in_reset;
>> +       struct mutex hive_lock, reset_lock;
>>          struct kobject *kobj;
>>          struct device_attribute dev_attr;
>>          struct amdgpu_device *adev;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/atom.c b/drivers/gpu/drm/amd/amdgpu/atom.c
>> index 8341bd965202..4cfc786699c7 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/atom.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/atom.c
>> @@ -755,7 +755,6 @@ static void atom_op_jump(atom_exec_context *ctx, int *ptr, int arg)
>>                                  /* jiffies wrap around we will just wait a little longer */
>>                                  ctx->last_jump_jiffies = jiffies;
>>                          }
>> -                       schedule();
>>                  } else {
>>                          ctx->last_jump = ctx->start + target;
>>                          ctx->last_jump_jiffies = jiffies;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> index de6e6de41867..e87d43537013 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> @@ -6201,7 +6201,7 @@ static int gfx_v10_0_gfx_init_queue(struct amdgpu_ring *ring)
>>          struct v10_gfx_mqd *mqd = ring->mqd_ptr;
>>          int mqd_idx = ring - &adev->gfx.gfx_ring[0];
>>
>> -       if (!amdgpu_in_reset(adev) && !adev->in_suspend) {
>> +       if (!adev->in_gpu_reset && !adev->in_suspend) {
>>                  memset((void *)mqd, 0, sizeof(*mqd));
>>                  mutex_lock(&adev->srbm_mutex);
>>                  nv_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
>> @@ -6213,7 +6213,7 @@ static int gfx_v10_0_gfx_init_queue(struct amdgpu_ring *ring)
>>                  mutex_unlock(&adev->srbm_mutex);
>>                  if (adev->gfx.me.mqd_backup[mqd_idx])
>>                          memcpy(adev->gfx.me.mqd_backup[mqd_idx], mqd, sizeof(*mqd));
>> -       } else if (amdgpu_in_reset(adev)) {
>> +       } else if (adev->in_gpu_reset) {
>>                  /* reset mqd with the backup copy */
>>                  if (adev->gfx.me.mqd_backup[mqd_idx])
>>                          memcpy(mqd, adev->gfx.me.mqd_backup[mqd_idx], sizeof(*mqd));
>> @@ -6566,7 +6566,7 @@ static int gfx_v10_0_kiq_init_queue(struct amdgpu_ring *ring)
>>
>>          gfx_v10_0_kiq_setting(ring);
>>
>> -       if (amdgpu_in_reset(adev)) { /* for GPU_RESET case */
>> +       if (adev->in_gpu_reset) { /* for GPU_RESET case */
>>                  /* reset MQD to a clean status */
>>                  if (adev->gfx.mec.mqd_backup[mqd_idx])
>>                          memcpy(mqd, adev->gfx.mec.mqd_backup[mqd_idx], sizeof(*mqd));
>> @@ -6602,7 +6602,7 @@ static int gfx_v10_0_kcq_init_queue(struct amdgpu_ring *ring)
>>          struct v10_compute_mqd *mqd = ring->mqd_ptr;
>>          int mqd_idx = ring - &adev->gfx.compute_ring[0];
>>
>> -       if (!amdgpu_in_reset(adev) && !adev->in_suspend) {
>> +       if (!adev->in_gpu_reset && !adev->in_suspend) {
>>                  memset((void *)mqd, 0, sizeof(*mqd));
>>                  mutex_lock(&adev->srbm_mutex);
>>                  nv_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
>> @@ -6612,7 +6612,7 @@ static int gfx_v10_0_kcq_init_queue(struct amdgpu_ring *ring)
>>
>>                  if (adev->gfx.mec.mqd_backup[mqd_idx])
>>                          memcpy(adev->gfx.mec.mqd_backup[mqd_idx], mqd, sizeof(*mqd));
>> -       } else if (amdgpu_in_reset(adev)) { /* for GPU_RESET case */
>> +       } else if (adev->in_gpu_reset) { /* for GPU_RESET case */
>>                  /* reset MQD to a clean status */
>>                  if (adev->gfx.mec.mqd_backup[mqd_idx])
>>                          memcpy(mqd, adev->gfx.mec.mqd_backup[mqd_idx], sizeof(*mqd));
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> index 7df567a6656d..14fd04b699da 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> @@ -4633,7 +4633,7 @@ static int gfx_v8_0_kiq_init_queue(struct amdgpu_ring *ring)
>>
>>          gfx_v8_0_kiq_setting(ring);
>>
>> -       if (amdgpu_in_reset(adev)) { /* for GPU_RESET case */
>> +       if (adev->in_gpu_reset) { /* for GPU_RESET case */
>>                  /* reset MQD to a clean status */
>>                  if (adev->gfx.mec.mqd_backup[mqd_idx])
>>                          memcpy(mqd, adev->gfx.mec.mqd_backup[mqd_idx], sizeof(struct vi_mqd_allocation));
>> @@ -4670,7 +4670,7 @@ static int gfx_v8_0_kcq_init_queue(struct amdgpu_ring *ring)
>>          struct vi_mqd *mqd = ring->mqd_ptr;
>>          int mqd_idx = ring - &adev->gfx.compute_ring[0];
>>
>> -       if (!amdgpu_in_reset(adev) && !adev->in_suspend) {
>> +       if (!adev->in_gpu_reset && !adev->in_suspend) {
>>                  memset((void *)mqd, 0, sizeof(struct vi_mqd_allocation));
>>                  ((struct vi_mqd_allocation *)mqd)->dynamic_cu_mask = 0xFFFFFFFF;
>>                  ((struct vi_mqd_allocation *)mqd)->dynamic_rb_mask = 0xFFFFFFFF;
>> @@ -4682,7 +4682,7 @@ static int gfx_v8_0_kcq_init_queue(struct amdgpu_ring *ring)
>>
>>                  if (adev->gfx.mec.mqd_backup[mqd_idx])
>>                          memcpy(adev->gfx.mec.mqd_backup[mqd_idx], mqd, sizeof(struct vi_mqd_allocation));
>> -       } else if (amdgpu_in_reset(adev)) { /* for GPU_RESET case */
>> +       } else if (adev->in_gpu_reset) { /* for GPU_RESET case */
>>                  /* reset MQD to a clean status */
>>                  if (adev->gfx.mec.mqd_backup[mqd_idx])
>>                          memcpy(mqd, adev->gfx.mec.mqd_backup[mqd_idx], sizeof(struct vi_mqd_allocation));
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> index 93c63ff3b35e..2c5bb282cc01 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> @@ -3686,7 +3686,7 @@ static int gfx_v9_0_kiq_init_queue(struct amdgpu_ring *ring)
>>
>>          gfx_v9_0_kiq_setting(ring);
>>
>> -       if (amdgpu_in_reset(adev)) { /* for GPU_RESET case */
>> +       if (adev->in_gpu_reset) { /* for GPU_RESET case */
>>                  /* reset MQD to a clean status */
>>                  if (adev->gfx.mec.mqd_backup[mqd_idx])
>>                          memcpy(mqd, adev->gfx.mec.mqd_backup[mqd_idx], sizeof(struct v9_mqd_allocation));
>> @@ -3724,7 +3724,7 @@ static int gfx_v9_0_kcq_init_queue(struct amdgpu_ring *ring)
>>          struct v9_mqd *mqd = ring->mqd_ptr;
>>          int mqd_idx = ring - &adev->gfx.compute_ring[0];
>>
>> -       if (!amdgpu_in_reset(adev) && !adev->in_suspend) {
>> +       if (!adev->in_gpu_reset && !adev->in_suspend) {
>>                  memset((void *)mqd, 0, sizeof(struct v9_mqd_allocation));
>>                  ((struct v9_mqd_allocation *)mqd)->dynamic_cu_mask = 0xFFFFFFFF;
>>                  ((struct v9_mqd_allocation *)mqd)->dynamic_rb_mask = 0xFFFFFFFF;
>> @@ -3736,7 +3736,7 @@ static int gfx_v9_0_kcq_init_queue(struct amdgpu_ring *ring)
>>
>>                  if (adev->gfx.mec.mqd_backup[mqd_idx])
>>                          memcpy(adev->gfx.mec.mqd_backup[mqd_idx], mqd, sizeof(struct v9_mqd_allocation));
>> -       } else if (amdgpu_in_reset(adev)) { /* for GPU_RESET case */
>> +       } else if (adev->in_gpu_reset) { /* for GPU_RESET case */
>>                  /* reset MQD to a clean status */
>>                  if (adev->gfx.mec.mqd_backup[mqd_idx])
>>                          memcpy(mqd, adev->gfx.mec.mqd_backup[mqd_idx], sizeof(struct v9_mqd_allocation));
>> @@ -3930,7 +3930,7 @@ static int gfx_v9_0_hw_fini(void *handle)
>>          /* Use deinitialize sequence from CAIL when unbinding device from driver,
>>           * otherwise KIQ is hanging when binding back
>>           */
>> -       if (!amdgpu_in_reset(adev) && !adev->in_suspend) {
>> +       if (!adev->in_gpu_reset && !adev->in_suspend) {
>>                  mutex_lock(&adev->srbm_mutex);
>>                  soc15_grbm_select(adev, adev->gfx.kiq.ring.me,
>>                                  adev->gfx.kiq.ring.pipe,
>> @@ -4088,7 +4088,7 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
>>           *
>>           * also don't wait anymore for IRQ context
>>           * */
>> -       if (r < 1 && (amdgpu_in_reset(adev) || in_interrupt()))
>> +       if (r < 1 && (adev->in_gpu_reset || in_interrupt()))
>>                  goto failed_kiq_read;
>>
>>          might_sleep();
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> index 9d3b1245a339..ec8c0af39553 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> @@ -287,7 +287,7 @@ static void gmc_v10_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>>           */
>>          if (adev->gfx.kiq.ring.sched.ready &&
>>              (amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) &&
>> -           !amdgpu_in_reset(adev)) {
>> +           !adev->in_gpu_reset) {
>>
>>                  struct amdgpu_vmhub *hub = &adev->vmhub[vmhub];
>>                  const unsigned eng = 17;
>> @@ -312,7 +312,7 @@ static void gmc_v10_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>>
>>          if (!adev->mman.buffer_funcs_enabled ||
>>              !adev->ib_pool_ready ||
>> -           amdgpu_in_reset(adev) ||
>> +           adev->in_gpu_reset ||
>>              ring->sched.ready == false) {
>>                  gmc_v10_0_flush_vm_hub(adev, vmid, AMDGPU_GFXHUB_0, 0);
>>                  mutex_unlock(&adev->mman.gtt_window_lock);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
>> index 80c146df338a..3ce5c1d2fdf2 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
>> @@ -434,7 +434,7 @@ static int gmc_v7_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>>          int vmid;
>>          unsigned int tmp;
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EIO;
>>
>>          for (vmid = 1; vmid < 16; vmid++) {
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>> index 9ab65ca7df77..3e6615f9d39c 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>> @@ -635,7 +635,7 @@ static int gmc_v8_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>>          int vmid;
>>          unsigned int tmp;
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EIO;
>>
>>          for (vmid = 1; vmid < 16; vmid++) {
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> index 773ee11b3d17..6a780b674018 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> @@ -501,7 +501,7 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>>           */
>>          if (adev->gfx.kiq.ring.sched.ready &&
>>                          (amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) &&
>> -                       !amdgpu_in_reset(adev)) {
>> +                       !adev->in_gpu_reset) {
>>                  uint32_t req = hub->vm_inv_eng0_req + hub->eng_distance * eng;
>>                  uint32_t ack = hub->vm_inv_eng0_ack + hub->eng_distance * eng;
>>
>> @@ -596,7 +596,7 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>>          struct amdgpu_ring *ring = &adev->gfx.kiq.ring;
>>          struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>
>> -       if (amdgpu_in_reset(adev))
>> +       if (adev->in_gpu_reset)
>>                  return -EIO;
>>
>>          if (ring->sched.ready) {
>> @@ -633,8 +633,7 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>>                  spin_unlock(&adev->gfx.kiq.ring_lock);
>>                  r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
>>                  if (r < 1) {
>> -                       dev_info(adev->dev,
>> -                               "wait for kiq fence error: %ld\n", r);
>> +                       DRM_ERROR("wait for kiq fence error: %ld.\n", r);
>>                          return -ETIME;
>>                  }
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>> index fe31cbeccfe9..5fd67e1cc2a0 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>> @@ -238,16 +238,20 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
>>          struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work);
>>          struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
>>          int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>> +       int locked;
>>
>>          /* block amdgpu_gpu_recover till msg FLR COMPLETE received,
>>           * otherwise the mailbox msg will be ruined/reseted by
>>           * the VF FLR.
>>           *
>> -        * we can unlock the reset_sem to allow "amdgpu_job_timedout"
>> +        * we can unlock the lock_reset to allow "amdgpu_job_timedout"
>>           * to run gpu_recover() after FLR_NOTIFICATION_CMPL received
>>           * which means host side had finished this VF's FLR.
>>           */
>> -       down_read(&adev->reset_sem);
>> +       locked = mutex_trylock(&adev->lock_reset);
>> +       if (locked)
>> +               adev->in_gpu_reset = true;
>> +
>>          do {
>>                  if (xgpu_ai_mailbox_peek_msg(adev) == IDH_FLR_NOTIFICATION_CMPL)
>>                          goto flr_done;
>> @@ -257,7 +261,10 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
>>          } while (timeout > 1);
>>
>>   flr_done:
>> -       up_read(&adev->reset_sem);
>> +       if (locked) {
>> +               adev->in_gpu_reset = false;
>> +               mutex_unlock(&adev->lock_reset);
>> +       }
>>
>>          /* Trigger recovery for world switch failure if no TDR */
>>          if (amdgpu_device_should_recover_gpu(adev)
>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>> index 6f55172e8337..ce2bf1fb79ed 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>> @@ -259,16 +259,20 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
>>          struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work);
>>          struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
>>          int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>> +       int locked;
>>
>>          /* block amdgpu_gpu_recover till msg FLR COMPLETE received,
>>           * otherwise the mailbox msg will be ruined/reseted by
>>           * the VF FLR.
>>           *
>> -        * we can unlock the reset_sem to allow "amdgpu_job_timedout"
>> +        * we can unlock the lock_reset to allow "amdgpu_job_timedout"
>>           * to run gpu_recover() after FLR_NOTIFICATION_CMPL received
>>           * which means host side had finished this VF's FLR.
>>           */
>> -       down_read(&adev->reset_sem);
>> +       locked = mutex_trylock(&adev->lock_reset);
>> +       if (locked)
>> +               adev->in_gpu_reset = true;
>> +
>>          do {
>>                  if (xgpu_nv_mailbox_peek_msg(adev) == IDH_FLR_NOTIFICATION_CMPL)
>>                          goto flr_done;
>> @@ -278,7 +282,10 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
>>          } while (timeout > 1);
>>
>>   flr_done:
>> -       up_read(&adev->reset_sem);
>> +       if (locked) {
>> +               adev->in_gpu_reset = false;
>> +               mutex_unlock(&adev->lock_reset);
>> +       }
>>
>>          /* Trigger recovery for world switch failure if no TDR */
>>          if (amdgpu_device_should_recover_gpu(adev)
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> index 7ad1537820b5..e0e60b0d0669 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> @@ -304,17 +304,15 @@ static void deallocate_vmid(struct device_queue_manager *dqm,
>>                                  struct qcm_process_device *qpd,
>>                                  struct queue *q)
>>   {
>> -       if (!dqm->is_resetting) {
>> -               /* On GFX v7, CP doesn't flush TC at dequeue */
>> -               if (q->device->device_info->asic_family == CHIP_HAWAII)
>> -                       if (flush_texture_cache_nocpsch(q->device, qpd))
>> -                               pr_err("Failed to flush TC\n");
>> +       /* On GFX v7, CP doesn't flush TC at dequeue */
>> +       if (q->device->device_info->asic_family == CHIP_HAWAII)
>> +               if (flush_texture_cache_nocpsch(q->device, qpd))
>> +                       pr_err("Failed to flush TC\n");
>>
>> -               kfd_flush_tlb(qpd_to_pdd(qpd));
>> +       kfd_flush_tlb(qpd_to_pdd(qpd));
>>
>> -               /* Release the vmid mapping */
>> -               set_pasid_vmid_mapping(dqm, 0, qpd->vmid);
>> -       }
>> +       /* Release the vmid mapping */
>> +       set_pasid_vmid_mapping(dqm, 0, qpd->vmid);
>>          dqm->vmid_pasid[qpd->vmid] = 0;
>>
>>          qpd->vmid = 0;
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> index 71be897d4c2a..013c2b018edc 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> @@ -1551,10 +1551,6 @@ int kfd_reserved_mem_mmap(struct kfd_dev *dev, struct kfd_process *process,
>>   void kfd_flush_tlb(struct kfd_process_device *pdd)
>>   {
>>          struct kfd_dev *dev = pdd->dev;
>> -       struct device_queue_manager *dqm = dev->dqm;
>> -
>> -       if (dqm->is_resetting)
>> -               return;
>>
>>          if (dev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) {
>>                  /* Nothing to flush until a VMID is assigned, which
>> diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
>> index ff5f7f7ceec6..c4daa22904da 100644
>> --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
>> +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
>> @@ -1658,7 +1658,7 @@ static int dm_suspend(void *handle)
>>          struct amdgpu_display_manager *dm = &adev->dm;
>>          int ret = 0;
>>
>> -       if (amdgpu_in_reset(adev)) {
>> +       if (adev->in_gpu_reset) {
>>                  mutex_lock(&dm->dc_lock);
>>                  dm->cached_dc_state = dc_copy_state(dm->dc->current_state);
>>
>> @@ -1844,7 +1844,7 @@ static int dm_resume(void *handle)
>>          struct dc_state *dc_state;
>>          int i, r, j;
>>
>> -       if (amdgpu_in_reset(adev)) {
>> +       if (adev->in_gpu_reset) {
>>                  dc_state = dm->cached_dc_state;
>>
>>                  r = dm_dmub_hw_init(adev);
>> diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
>> index 1ffacc712e53..c8e30d59e658 100644
>> --- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
>> +++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
>> @@ -1110,7 +1110,7 @@ static int smu_disable_dpms(struct smu_context *smu)
>>          struct amdgpu_device *adev = smu->adev;
>>          int ret = 0;
>>          bool use_baco = !smu->is_apu &&
>> -               ((amdgpu_in_reset(adev) &&
>> +               ((adev->in_gpu_reset &&
>>                    (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)) ||
>>                   ((adev->in_runpm || adev->in_hibernate) && amdgpu_asic_supports_baco(adev)));
>>
>> diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c b/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c
>> index da84012b7fd5..c7216362b68d 100644
>> --- a/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c
>> +++ b/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c
>> @@ -489,7 +489,7 @@ static int vega20_setup_asic_task(struct pp_hwmgr *hwmgr)
>>   {
>>          struct amdgpu_device *adev = (struct amdgpu_device *)(hwmgr->adev);
>>          int ret = 0;
>> -       bool use_baco = (amdgpu_in_reset(adev) &&
>> +       bool use_baco = (adev->in_gpu_reset &&
>>                           (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)) ||
>>                  (adev->in_runpm && amdgpu_asic_supports_baco(adev));
>>
>> --
>> 2.17.1
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx at lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx



More information about the amd-gfx mailing list