[PATCH] drm/amdgpu: fix return random value when multiple threads read registers via mes.

Alex Deucher alexdeucher at gmail.com
Tue Nov 5 14:30:26 UTC 2024


On Tue, Nov 5, 2024 at 4:39 AM Chong Li <chongli2 at amd.com> wrote:
>
> The currect code use the address "adev->mes.read_val_ptr" to
> store the value read from register via mes.
> So when multiple threads read register,
> multiple threads have to share the one address,
> and overwrite the value each other.
>
> Assign an address by "amdgpu_device_wb_get" to store register value.
> each thread will has an address to store register value.
>
> Signed-off-by: Chong Li <chongli2 at amd.com>

Reviewed-by: Alex Deucher <alexander.deucher at amd.com>

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 30 +++++++++++--------------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h |  3 ---
>  2 files changed, 13 insertions(+), 20 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> index 83d0f731fb65..a8f7173b2663 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> @@ -189,17 +189,6 @@ int amdgpu_mes_init(struct amdgpu_device *adev)
>                         (uint64_t *)&adev->wb.wb[adev->mes.query_status_fence_offs[i]];
>         }
>
> -       r = amdgpu_device_wb_get(adev, &adev->mes.read_val_offs);
> -       if (r) {
> -               dev_err(adev->dev,
> -                       "(%d) read_val_offs alloc failed\n", r);
> -               goto error;
> -       }
> -       adev->mes.read_val_gpu_addr =
> -               adev->wb.gpu_addr + (adev->mes.read_val_offs * 4);
> -       adev->mes.read_val_ptr =
> -               (uint32_t *)&adev->wb.wb[adev->mes.read_val_offs];
> -
>         r = amdgpu_mes_doorbell_init(adev);
>         if (r)
>                 goto error;
> @@ -220,8 +209,6 @@ int amdgpu_mes_init(struct amdgpu_device *adev)
>                         amdgpu_device_wb_free(adev,
>                                       adev->mes.query_status_fence_offs[i]);
>         }
> -       if (adev->mes.read_val_ptr)
> -               amdgpu_device_wb_free(adev, adev->mes.read_val_offs);
>
>         idr_destroy(&adev->mes.pasid_idr);
>         idr_destroy(&adev->mes.gang_id_idr);
> @@ -246,8 +233,6 @@ void amdgpu_mes_fini(struct amdgpu_device *adev)
>                         amdgpu_device_wb_free(adev,
>                                       adev->mes.query_status_fence_offs[i]);
>         }
> -       if (adev->mes.read_val_ptr)
> -               amdgpu_device_wb_free(adev, adev->mes.read_val_offs);
>
>         amdgpu_mes_doorbell_free(adev);
>
> @@ -918,10 +903,19 @@ uint32_t amdgpu_mes_rreg(struct amdgpu_device *adev, uint32_t reg)
>  {
>         struct mes_misc_op_input op_input;
>         int r, val = 0;
> +       uint32_t addr_offset = 0;
> +       uint64_t read_val_gpu_addr;
> +       uint32_t *read_val_ptr;
>
> +       if (amdgpu_device_wb_get(adev, &addr_offset)) {
> +               DRM_ERROR("critical bug! too many mes readers\n");
> +               goto error;
> +       }
> +       read_val_gpu_addr = adev->wb.gpu_addr + (addr_offset * 4);
> +       read_val_ptr = (uint32_t *)&adev->wb.wb[addr_offset];
>         op_input.op = MES_MISC_OP_READ_REG;
>         op_input.read_reg.reg_offset = reg;
> -       op_input.read_reg.buffer_addr = adev->mes.read_val_gpu_addr;
> +       op_input.read_reg.buffer_addr = read_val_gpu_addr;
>
>         if (!adev->mes.funcs->misc_op) {
>                 DRM_ERROR("mes rreg is not supported!\n");
> @@ -932,9 +926,11 @@ uint32_t amdgpu_mes_rreg(struct amdgpu_device *adev, uint32_t reg)
>         if (r)
>                 DRM_ERROR("failed to read reg (0x%x)\n", reg);
>         else
> -               val = *(adev->mes.read_val_ptr);
> +               val = *(read_val_ptr);
>
>  error:
> +       if (addr_offset)
> +               amdgpu_device_wb_free(adev, addr_offset);
>         return val;
>  }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> index 45e3508f0f8e..83f45bb48427 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> @@ -119,9 +119,6 @@ struct amdgpu_mes {
>         uint32_t                        query_status_fence_offs[AMDGPU_MAX_MES_PIPES];
>         uint64_t                        query_status_fence_gpu_addr[AMDGPU_MAX_MES_PIPES];
>         uint64_t                        *query_status_fence_ptr[AMDGPU_MAX_MES_PIPES];
> -       uint32_t                        read_val_offs;
> -       uint64_t                        read_val_gpu_addr;
> -       uint32_t                        *read_val_ptr;
>
>         uint32_t                        saved_flags;
>
> --
> 2.34.1
>


More information about the amd-gfx mailing list