[PATCH v2] drm/amdgpu: fix return random value when multiple threads read registers via mes.

Deng, Emily Emily.Deng at amd.com
Tue Nov 5 05:05:04 UTC 2024


[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Emily Deng <Emily.Deng at amd.com>

Emily Deng
Best Wishes



>-----Original Message-----
>From: Li, Chong(Alan) <Chong.Li at amd.com>
>Sent: Tuesday, November 5, 2024 10:52 AM
>To: amd-gfx at lists.freedesktop.org
>Cc: Koenig, Christian <Christian.Koenig at amd.com>; Deng, Emily
><Emily.Deng at amd.com>; cao, lin <lin.cao at amd.com>; Andjelkovic, Dejan
><Dejan.Andjelkovic at amd.com>; Yin, ZhenGuo (Chris) <ZhenGuo.Yin at amd.com>;
>Li, Chong(Alan) <Chong.Li at amd.com>
>Subject: [PATCH v2] drm/amdgpu: fix return random value when multiple threads read
>registers via mes.
>
>The currect code use the address "adev->mes.read_val_ptr" to store the value read
>from register via mes.
>So when multiple threads read register,
>multiple threads have to share the one address, and overwrite the value each other.
>
>Assign an address by "amdgpu_device_wb_get" to store register value.
>each thread will has an address to store register value.
>
>Signed-off-by: Chong Li <chongli2 at amd.com>
>---
> drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 30 +++++++++++--------------
>drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h |  3 ---
> 2 files changed, 13 insertions(+), 20 deletions(-)
>
>diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
>b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
>index 83d0f731fb65..d74e3507e155 100644
>--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
>+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
>@@ -189,17 +189,6 @@ int amdgpu_mes_init(struct amdgpu_device *adev)
>                       (uint64_t *)&adev->wb.wb[adev-
>>mes.query_status_fence_offs[i]];
>       }
>
>-      r = amdgpu_device_wb_get(adev, &adev->mes.read_val_offs);
>-      if (r) {
>-              dev_err(adev->dev,
>-                      "(%d) read_val_offs alloc failed\n", r);
>-              goto error;
>-      }
>-      adev->mes.read_val_gpu_addr =
>-              adev->wb.gpu_addr + (adev->mes.read_val_offs * 4);
>-      adev->mes.read_val_ptr =
>-              (uint32_t *)&adev->wb.wb[adev->mes.read_val_offs];
>-
>       r = amdgpu_mes_doorbell_init(adev);
>       if (r)
>               goto error;
>@@ -220,8 +209,6 @@ int amdgpu_mes_init(struct amdgpu_device *adev)
>                       amdgpu_device_wb_free(adev,
>                                     adev->mes.query_status_fence_offs[i]);
>       }
>-      if (adev->mes.read_val_ptr)
>-              amdgpu_device_wb_free(adev, adev->mes.read_val_offs);
>
>       idr_destroy(&adev->mes.pasid_idr);
>       idr_destroy(&adev->mes.gang_id_idr);
>@@ -246,8 +233,6 @@ void amdgpu_mes_fini(struct amdgpu_device *adev)
>                       amdgpu_device_wb_free(adev,
>                                     adev->mes.query_status_fence_offs[i]);
>       }
>-      if (adev->mes.read_val_ptr)
>-              amdgpu_device_wb_free(adev, adev->mes.read_val_offs);
>
>       amdgpu_mes_doorbell_free(adev);
>
>@@ -918,10 +903,19 @@ uint32_t amdgpu_mes_rreg(struct amdgpu_device *adev,
>uint32_t reg)  {
>       struct mes_misc_op_input op_input;
>       int r, val = 0;
>+      uint32_t addr_offset = 0;
>+      uint64_t read_val_gpu_addr = 0;
>+      uint32_t *read_val_ptr = NULL;
>
>+      if (amdgpu_device_wb_get(adev, &addr_offset)) {
>+              DRM_ERROR("critical bug! too many mes readers\n");
>+              goto error;
>+      }
>+      read_val_gpu_addr = adev->wb.gpu_addr + (addr_offset * 4);
>+      read_val_ptr = (uint32_t *)&adev->wb.wb[addr_offset];
>       op_input.op = MES_MISC_OP_READ_REG;
>       op_input.read_reg.reg_offset = reg;
>-      op_input.read_reg.buffer_addr = adev->mes.read_val_gpu_addr;
>+      op_input.read_reg.buffer_addr = read_val_gpu_addr;
>
>       if (!adev->mes.funcs->misc_op) {
>               DRM_ERROR("mes rreg is not supported!\n"); @@ -932,9 +926,11
>@@ uint32_t amdgpu_mes_rreg(struct amdgpu_device *adev, uint32_t reg)
>       if (r)
>               DRM_ERROR("failed to read reg (0x%x)\n", reg);
>       else
>-              val = *(adev->mes.read_val_ptr);
>+              val = *(read_val_ptr);
>
> error:
>+      if (addr_offset)
>+              amdgpu_device_wb_free(adev, addr_offset);
>       return val;
> }
>
>diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
>b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
>index 45e3508f0f8e..83f45bb48427 100644
>--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
>+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
>@@ -119,9 +119,6 @@ struct amdgpu_mes {
>       uint32_t
>       query_status_fence_offs[AMDGPU_MAX_MES_PIPES];
>       uint64_t
>       query_status_fence_gpu_addr[AMDGPU_MAX_MES_PIPES];
>       uint64_t
>       *query_status_fence_ptr[AMDGPU_MAX_MES_PIPES];
>-      uint32_t                        read_val_offs;
>-      uint64_t                        read_val_gpu_addr;
>-      uint32_t                        *read_val_ptr;
>
>       uint32_t                        saved_flags;
>
>--
>2.34.1



More information about the amd-gfx mailing list