[PATCH] drm/amdgpu: add mutex to protect ras shared memory
Lazar, Lijo
lijo.lazar at amd.com
Mon Apr 29 07:30:38 UTC 2024
On 4/28/2024 12:38 PM, YiPeng Chai wrote:
> Add mutex to protect ras shared memory.
>
> Signed-off-by: YiPeng Chai <YiPeng.Chai at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 121 ++++++++++++++-------
> drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h | 1 +
> drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c | 2 +
> 3 files changed, 84 insertions(+), 40 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> index 5583e2d1b12f..fa4fea00f6b4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> @@ -1564,6 +1564,66 @@ static void psp_ras_ta_check_status(struct psp_context *psp)
> }
> }
>
> +static int psp_ras_send_cmd(struct psp_context *psp,
> + enum ras_command cmd_id, void *in, void *out)
> +{
> + struct ta_ras_shared_memory *ras_cmd;
> + uint32_t cmd = cmd_id;
> + int ret = 0;
> +
> + mutex_lock(&psp->ras_context.mutex);
> + ras_cmd = (struct ta_ras_shared_memory *)psp->ras_context.context.mem_context.shared_buf;
> + memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory));
> +
> + switch (cmd) {
> + case TA_RAS_COMMAND__ENABLE_FEATURES:
> + case TA_RAS_COMMAND__DISABLE_FEATURES:
> + memcpy(&ras_cmd->ras_in_message,
> + in, sizeof(ras_cmd->ras_in_message));
> + break;
> + case TA_RAS_COMMAND__TRIGGER_ERROR:
> + memcpy(&ras_cmd->ras_in_message.trigger_error,
> + in, sizeof(ras_cmd->ras_in_message.trigger_error));
> + break;
> + case TA_RAS_COMMAND__QUERY_ADDRESS:
> + memcpy(&ras_cmd->ras_in_message.address,
> + in, sizeof(ras_cmd->ras_in_message.address));
> + break;
> + default:
> + dev_err(psp->adev->dev, "Invalid ras cmd id: %u\n", cmd);
> + ret = -EINVAL;
> + goto err_out;
> + }
> +
> + ras_cmd->cmd_id = cmd;
> + ret = psp_ras_invoke(psp, ras_cmd->cmd_id);
> +
> + switch (cmd) {
> + case TA_RAS_COMMAND__TRIGGER_ERROR:
> + if (out) {
As NULL check for 'out' is done before copying, better to do the same
for 'in' also.
> + uint32_t *ras_status = (uint32_t *)out;
> +
> + *ras_status = ras_cmd->ras_status;
> + }
> + break;
> + case TA_RAS_COMMAND__QUERY_ADDRESS:
> + if (ret || ras_cmd->ras_status || psp->cmd_buf_mem->resp.status)
> + ret = -EINVAL;
> + else if (out)
> + memcpy(out,
> + &ras_cmd->ras_out_message.address,
> + sizeof(ras_cmd->ras_out_message.address));
> + break;
> + default:
> + break;
> + }
> +
> +err_out:
> + mutex_unlock(&psp->ras_context.mutex);
> +
> + return ret;
> +}
> +
> int psp_ras_invoke(struct psp_context *psp, uint32_t ta_cmd_id)
> {
> struct ta_ras_shared_memory *ras_cmd;
> @@ -1605,23 +1665,15 @@ int psp_ras_invoke(struct psp_context *psp, uint32_t ta_cmd_id)
> int psp_ras_enable_features(struct psp_context *psp,
> union ta_ras_cmd_input *info, bool enable)
> {
> - struct ta_ras_shared_memory *ras_cmd;
> + enum ras_command cmd_id;
> int ret;
>
> - if (!psp->ras_context.context.initialized)
> + if (!psp->ras_context.context.initialized || !info)
> return -EINVAL;
>
> - ras_cmd = (struct ta_ras_shared_memory *)psp->ras_context.context.mem_context.shared_buf;
> - memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory));
> -
> - if (enable)
> - ras_cmd->cmd_id = TA_RAS_COMMAND__ENABLE_FEATURES;
> - else
> - ras_cmd->cmd_id = TA_RAS_COMMAND__DISABLE_FEATURES;
> -
> - ras_cmd->ras_in_message = *info;
> -
> - ret = psp_ras_invoke(psp, ras_cmd->cmd_id);
> + cmd_id = enable ?
> + TA_RAS_COMMAND__ENABLE_FEATURES : TA_RAS_COMMAND__DISABLE_FEATURES;
> + ret = psp_ras_send_cmd(psp, cmd_id, info, NULL);
> if (ret)
> return -EINVAL;
>
> @@ -1645,6 +1697,8 @@ int psp_ras_terminate(struct psp_context *psp)
>
> psp->ras_context.context.initialized = false;
>
> + mutex_destroy(&psp->ras_context.mutex);
> +
> return ret;
> }
>
> @@ -1729,9 +1783,10 @@ int psp_ras_initialize(struct psp_context *psp)
>
> ret = psp_ta_load(psp, &psp->ras_context.context);
>
> - if (!ret && !ras_cmd->ras_status)
> + if (!ret && !ras_cmd->ras_status) {
> psp->ras_context.context.initialized = true;
> - else {
> + mutex_init(&psp->ras_context.mutex);
> + } else {
> if (ras_cmd->ras_status)
> dev_warn(adev->dev, "RAS Init Status: 0x%X\n", ras_cmd->ras_status);
>
> @@ -1745,12 +1800,12 @@ int psp_ras_initialize(struct psp_context *psp)
> int psp_ras_trigger_error(struct psp_context *psp,
> struct ta_ras_trigger_error_input *info, uint32_t instance_mask)
> {
> - struct ta_ras_shared_memory *ras_cmd;
> struct amdgpu_device *adev = psp->adev;
> int ret;
> uint32_t dev_mask;
> + uint32_t ras_status;
>
> - if (!psp->ras_context.context.initialized)
> + if (!psp->ras_context.context.initialized || !info)
> return -EINVAL;
>
> switch (info->block_id) {
> @@ -1774,13 +1829,8 @@ int psp_ras_trigger_error(struct psp_context *psp,
> dev_mask &= AMDGPU_RAS_INST_MASK;
> info->sub_block_index |= dev_mask;
>
> - ras_cmd = (struct ta_ras_shared_memory *)psp->ras_context.context.mem_context.shared_buf;
> - memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory));
> -
> - ras_cmd->cmd_id = TA_RAS_COMMAND__TRIGGER_ERROR;
> - ras_cmd->ras_in_message.trigger_error = *info;
> -
> - ret = psp_ras_invoke(psp, ras_cmd->cmd_id);
> + ret = psp_ras_send_cmd(psp,
> + TA_RAS_COMMAND__TRIGGER_ERROR, info, &ras_status);
> if (ret)
> return -EINVAL;
>
> @@ -1790,9 +1840,9 @@ int psp_ras_trigger_error(struct psp_context *psp,
> if (amdgpu_ras_intr_triggered())
> return 0;
>
> - if (ras_cmd->ras_status == TA_RAS_STATUS__TEE_ERROR_ACCESS_DENIED)
> + if (ras_status == TA_RAS_STATUS__TEE_ERROR_ACCESS_DENIED)
> return -EACCES;
> - else if (ras_cmd->ras_status)
> + else if (ras_status)
> return -EINVAL;
>
> return 0;
> @@ -1802,25 +1852,16 @@ int psp_ras_query_address(struct psp_context *psp,
> struct ta_ras_query_address_input *addr_in,
> struct ta_ras_query_address_output *addr_out)
> {
> - struct ta_ras_shared_memory *ras_cmd;
> int ret;
>
> - if (!psp->ras_context.context.initialized)
> - return -EINVAL;
> -
> - ras_cmd = (struct ta_ras_shared_memory *)psp->ras_context.context.mem_context.shared_buf;
> - memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory));
> -
> - ras_cmd->cmd_id = TA_RAS_COMMAND__QUERY_ADDRESS;
> - ras_cmd->ras_in_message.address = *addr_in;
> -
> - ret = psp_ras_invoke(psp, ras_cmd->cmd_id);
> - if (ret || ras_cmd->ras_status || psp->cmd_buf_mem->resp.status)
> + if (!psp->ras_context.context.initialized ||
> + !addr_in || !addr_out)
> return -EINVAL;
>
> - *addr_out = ras_cmd->ras_out_message.address;
> + ret = psp_ras_send_cmd(psp,
> + TA_RAS_COMMAND__QUERY_ADDRESS, addr_in, addr_out);
return psp_ras_send_cmd() will do.
>
> - return 0;
> + return ret;
> }
> // ras end
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
> index ee16f134ae92..686023918ce3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
> @@ -197,6 +197,7 @@ struct psp_xgmi_context {
> struct psp_ras_context {
> struct ta_context context;
> struct amdgpu_ras *ras;
> + struct mutex mutex;
> };
>
> #define MEM_TRAIN_SYSTEM_SIGNATURE 0x54534942
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c
> index ca5c86e5f7cd..87f213f92d83 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c
> @@ -348,6 +348,7 @@ static ssize_t ta_if_invoke_debugfs_write(struct file *fp, const char *buf, size
>
> context->session_id = ta_id;
>
> + mutex_lock(&psp->ras_context.mutex);
> ret = prep_ta_mem_context(&context->mem_context, shared_buf, shared_buf_len);
> if (ret)
> goto err_free_shared_buf;
> @@ -366,6 +367,7 @@ static ssize_t ta_if_invoke_debugfs_write(struct file *fp, const char *buf, size
> ret = -EFAULT;
>
> err_free_shared_buf:
This error label is used at other places as well and that happens even
before acquiring the mutex.
Thanks,
Lijo
> + mutex_unlock(&psp->ras_context.mutex);
> kfree(shared_buf);
>
> return ret;
More information about the amd-gfx
mailing list