[RFC v4 08/11] drm/amdgpu: Move reset sem into reset_domain
Christian König
ckoenig.leichtzumerken at gmail.com
Wed Feb 9 07:59:11 UTC 2022
Am 09.02.22 um 01:23 schrieb Andrey Grodzovsky:
> We want single instance of reset sem across all
> reset clients because in case of XGMI we should stop
> access cross device MMIO because any of them could be
> in a reset in the moment.
>
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky at amd.com>
Reviewed-by: Christian König <christian.koenig at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 -
> drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 10 ++++----
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 23 +++++++++----------
> .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 18 ++++++++-------
> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 2 ++
> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 1 +
> drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 6 +++--
> drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 14 ++++++-----
> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 4 ++--
> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 4 ++--
> 10 files changed, 46 insertions(+), 37 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index cb9764513df8..ddfbcc8fd3d3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1058,7 +1058,6 @@ struct amdgpu_device {
>
> atomic_t in_gpu_reset;
> enum pp_mp1_state mp1_state;
> - struct rw_semaphore reset_sem;
> struct amdgpu_doorbell_index doorbell_index;
>
> struct mutex notifier_lock;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> index 25e2e5bf90eb..c3728061d65a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> @@ -37,6 +37,8 @@
> #include "amdgpu_fw_attestation.h"
> #include "amdgpu_umr.h"
>
> +#include "amdgpu_reset.h"
> +
> #if defined(CONFIG_DEBUG_FS)
>
> /**
> @@ -1279,7 +1281,7 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file *m, void *unused)
> }
>
> /* Avoid accidently unparking the sched thread during GPU reset */
> - r = down_write_killable(&adev->reset_sem);
> + r = down_write_killable(&adev->reset_domain->sem);
> if (r)
> return r;
>
> @@ -1308,7 +1310,7 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file *m, void *unused)
> kthread_unpark(ring->sched.thread);
> }
>
> - up_write(&adev->reset_sem);
> + up_write(&adev->reset_domain->sem);
>
> pm_runtime_mark_last_busy(dev->dev);
> pm_runtime_put_autosuspend(dev->dev);
> @@ -1517,7 +1519,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
> return -ENOMEM;
>
> /* Avoid accidently unparking the sched thread during GPU reset */
> - r = down_read_killable(&adev->reset_sem);
> + r = down_read_killable(&adev->reset_domain->sem);
> if (r)
> goto pro_end;
>
> @@ -1560,7 +1562,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
> /* restart the scheduler */
> kthread_unpark(ring->sched.thread);
>
> - up_read(&adev->reset_sem);
> + up_read(&adev->reset_domain->sem);
>
> ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched);
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index d61bc0a0457c..dcbb175d336f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -424,10 +424,10 @@ bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
> * the lock.
> */
> if (in_task()) {
> - if (down_read_trylock(&adev->reset_sem))
> - up_read(&adev->reset_sem);
> + if (down_read_trylock(&adev->reset_domain->sem))
> + up_read(&adev->reset_domain->sem);
> else
> - lockdep_assert_held(&adev->reset_sem);
> + lockdep_assert_held(&adev->reset_domain->sem);
> }
> #endif
> return false;
> @@ -453,9 +453,9 @@ uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
> if ((reg * 4) < adev->rmmio_size) {
> if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
> amdgpu_sriov_runtime(adev) &&
> - down_read_trylock(&adev->reset_sem)) {
> + down_read_trylock(&adev->reset_domain->sem)) {
> ret = amdgpu_kiq_rreg(adev, reg);
> - up_read(&adev->reset_sem);
> + up_read(&adev->reset_domain->sem);
> } else {
> ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
> }
> @@ -538,9 +538,9 @@ void amdgpu_device_wreg(struct amdgpu_device *adev,
> if ((reg * 4) < adev->rmmio_size) {
> if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
> amdgpu_sriov_runtime(adev) &&
> - down_read_trylock(&adev->reset_sem)) {
> + down_read_trylock(&adev->reset_domain->sem)) {
> amdgpu_kiq_wreg(adev, reg, v);
> - up_read(&adev->reset_sem);
> + up_read(&adev->reset_domain->sem);
> } else {
> writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
> }
> @@ -3555,7 +3555,6 @@ int amdgpu_device_init(struct amdgpu_device *adev,
> mutex_init(&adev->virt.vf_errors.lock);
> hash_init(adev->mn_hash);
> atomic_set(&adev->in_gpu_reset, 0);
> - init_rwsem(&adev->reset_sem);
> mutex_init(&adev->psp.mutex);
> mutex_init(&adev->notifier_lock);
>
> @@ -4833,9 +4832,9 @@ static void amdgpu_device_lock_adev(struct amdgpu_device *adev,
> atomic_set(&adev->in_gpu_reset, 1);
>
> if (hive) {
> - down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
> + down_write_nest_lock(&adev->reset_domain->sem, &hive->hive_lock);
> } else {
> - down_write(&adev->reset_sem);
> + down_write(&adev->reset_domain->sem);
> }
>
> switch (amdgpu_asic_reset_method(adev)) {
> @@ -4856,7 +4855,7 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
> amdgpu_vf_error_trans_all(adev);
> adev->mp1_state = PP_MP1_STATE_NONE;
> atomic_set(&adev->in_gpu_reset, 0);
> - up_write(&adev->reset_sem);
> + up_write(&adev->reset_domain->sem);
> }
>
> static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
> @@ -5476,7 +5475,7 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
> /* Fatal error, prepare for slot reset */
> case pci_channel_io_frozen:
> /*
> - * Locking adev->reset_sem will prevent any external access
> + * Locking adev->reset_domain->sem will prevent any external access
> * to GPU during PCI error recovery
> */
> amdgpu_device_lock_adev(adev, NULL);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> index 05117eda105b..d3e055314804 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> @@ -31,6 +31,8 @@
> #include <linux/debugfs.h>
> #include <linux/uaccess.h>
>
> +#include "amdgpu_reset.h"
> +
> #define EEPROM_I2C_MADDR_VEGA20 0x0
> #define EEPROM_I2C_MADDR_ARCTURUS 0x40000
> #define EEPROM_I2C_MADDR_ARCTURUS_D342 0x0
> @@ -193,12 +195,12 @@ static int __write_table_header(struct amdgpu_ras_eeprom_control *control)
> __encode_table_header_to_buf(&control->tbl_hdr, buf);
>
> /* i2c may be unstable in gpu reset */
> - down_read(&adev->reset_sem);
> + down_read(&adev->reset_domain->sem);
> res = amdgpu_eeprom_write(&adev->pm.smu_i2c,
> control->i2c_address +
> control->ras_header_offset,
> buf, RAS_TABLE_HEADER_SIZE);
> - up_read(&adev->reset_sem);
> + up_read(&adev->reset_domain->sem);
>
> if (res < 0) {
> DRM_ERROR("Failed to write EEPROM table header:%d", res);
> @@ -387,13 +389,13 @@ static int __amdgpu_ras_eeprom_write(struct amdgpu_ras_eeprom_control *control,
> int res;
>
> /* i2c may be unstable in gpu reset */
> - down_read(&adev->reset_sem);
> + down_read(&adev->reset_domain->sem);
> buf_size = num * RAS_TABLE_RECORD_SIZE;
> res = amdgpu_eeprom_write(&adev->pm.smu_i2c,
> control->i2c_address +
> RAS_INDEX_TO_OFFSET(control, fri),
> buf, buf_size);
> - up_read(&adev->reset_sem);
> + up_read(&adev->reset_domain->sem);
> if (res < 0) {
> DRM_ERROR("Writing %d EEPROM table records error:%d",
> num, res);
> @@ -547,12 +549,12 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
> goto Out;
> }
>
> - down_read(&adev->reset_sem);
> + down_read(&adev->reset_domain->sem);
> res = amdgpu_eeprom_read(&adev->pm.smu_i2c,
> control->i2c_address +
> control->ras_record_offset,
> buf, buf_size);
> - up_read(&adev->reset_sem);
> + up_read(&adev->reset_domain->sem);
> if (res < 0) {
> DRM_ERROR("EEPROM failed reading records:%d\n",
> res);
> @@ -642,13 +644,13 @@ static int __amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control,
> int res;
>
> /* i2c may be unstable in gpu reset */
> - down_read(&adev->reset_sem);
> + down_read(&adev->reset_domain->sem);
> buf_size = num * RAS_TABLE_RECORD_SIZE;
> res = amdgpu_eeprom_read(&adev->pm.smu_i2c,
> control->i2c_address +
> RAS_INDEX_TO_OFFSET(control, fri),
> buf, buf_size);
> - up_read(&adev->reset_sem);
> + up_read(&adev->reset_domain->sem);
> if (res < 0) {
> DRM_ERROR("Reading %d EEPROM table records error:%d",
> num, res);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> index 91864947063f..c0988c804459 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> @@ -131,6 +131,8 @@ struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d
>
> }
>
> + init_rwsem(&reset_domain->sem);
> +
> return reset_domain;
> }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> index cc625e441fa0..80f918e87d4f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> @@ -80,6 +80,7 @@ struct amdgpu_reset_domain {
> struct kref refcount;
> struct workqueue_struct *wq;
> enum amdgpu_reset_domain_type type;
> + struct rw_semaphore sem;
> };
>
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index 38bb42727715..222b1da9d601 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -48,6 +48,8 @@
> #include "athub_v2_0.h"
> #include "athub_v2_1.h"
>
> +#include "amdgpu_reset.h"
> +
> #if 0
> static const struct soc15_reg_golden golden_settings_navi10_hdp[] =
> {
> @@ -328,7 +330,7 @@ static void gmc_v10_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
> */
> if (adev->gfx.kiq.ring.sched.ready &&
> (amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) &&
> - down_read_trylock(&adev->reset_sem)) {
> + down_read_trylock(&adev->reset_domain->sem)) {
> struct amdgpu_vmhub *hub = &adev->vmhub[vmhub];
> const unsigned eng = 17;
> u32 inv_req = hub->vmhub_funcs->get_invalidate_req(vmid, flush_type);
> @@ -338,7 +340,7 @@ static void gmc_v10_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
> amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, inv_req,
> 1 << vmid);
>
> - up_read(&adev->reset_sem);
> + up_read(&adev->reset_domain->sem);
> return;
> }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 88c1eb9ad068..3a5efe969735 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -62,6 +62,8 @@
> #include "amdgpu_ras.h"
> #include "amdgpu_xgmi.h"
>
> +#include "amdgpu_reset.h"
> +
> /* add these here since we already include dce12 headers and these are for DCN */
> #define mmHUBP0_DCSURF_PRI_VIEWPORT_DIMENSION 0x055d
> #define mmHUBP0_DCSURF_PRI_VIEWPORT_DIMENSION_BASE_IDX 2
> @@ -787,13 +789,13 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
> */
> if (adev->gfx.kiq.ring.sched.ready &&
> (amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) &&
> - down_read_trylock(&adev->reset_sem)) {
> + down_read_trylock(&adev->reset_domain->sem)) {
> uint32_t req = hub->vm_inv_eng0_req + hub->eng_distance * eng;
> uint32_t ack = hub->vm_inv_eng0_ack + hub->eng_distance * eng;
>
> amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, inv_req,
> 1 << vmid);
> - up_read(&adev->reset_sem);
> + up_read(&adev->reset_domain->sem);
> return;
> }
>
> @@ -900,7 +902,7 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
> if (amdgpu_in_reset(adev))
> return -EIO;
>
> - if (ring->sched.ready && down_read_trylock(&adev->reset_sem)) {
> + if (ring->sched.ready && down_read_trylock(&adev->reset_domain->sem)) {
> /* Vega20+XGMI caches PTEs in TC and TLB. Add a
> * heavy-weight TLB flush (type 2), which flushes
> * both. Due to a race condition with concurrent
> @@ -927,7 +929,7 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
> if (r) {
> amdgpu_ring_undo(ring);
> spin_unlock(&adev->gfx.kiq.ring_lock);
> - up_read(&adev->reset_sem);
> + up_read(&adev->reset_domain->sem);
> return -ETIME;
> }
>
> @@ -936,10 +938,10 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
> r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
> if (r < 1) {
> dev_err(adev->dev, "wait for kiq fence error: %ld.\n", r);
> - up_read(&adev->reset_sem);
> + up_read(&adev->reset_domain->sem);
> return -ETIME;
> }
> - up_read(&adev->reset_sem);
> + up_read(&adev->reset_domain->sem);
> return 0;
> }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> index 6740eef84ee1..4e23c29e665c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> @@ -262,7 +262,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
> if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
> return;
>
> - down_write(&adev->reset_sem);
> + down_write(&adev->reset_domain->sem);
>
> amdgpu_virt_fini_data_exchange(adev);
>
> @@ -278,7 +278,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
>
> flr_done:
> atomic_set(&adev->in_gpu_reset, 0);
> - up_write(&adev->reset_sem);
> + up_write(&adev->reset_domain->sem);
>
> /* Trigger recovery for world switch failure if no TDR */
> if (amdgpu_device_should_recover_gpu(adev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> index e967d61c7134..f715780f7d20 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> @@ -286,7 +286,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
> if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
> return;
>
> - down_write(&adev->reset_sem);
> + down_write(&adev->reset_domain->sem);
>
> amdgpu_virt_fini_data_exchange(adev);
>
> @@ -302,7 +302,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
>
> flr_done:
> atomic_set(&adev->in_gpu_reset, 0);
> - up_write(&adev->reset_sem);
> + up_write(&adev->reset_domain->sem);
>
> /* Trigger recovery for world switch failure if no TDR */
> if (amdgpu_device_should_recover_gpu(adev)
More information about the dri-devel
mailing list