[PATCH v2 03/12] drm/xe/pxp: Add VCS inline termination support
John Harrison
john.c.harrison at intel.com
Thu Nov 14 18:46:31 UTC 2024
On 11/6/2024 15:49, Daniele Ceraolo Spurio wrote:
> On 10/4/24 15:25, John Harrison wrote:
>> On 8/16/2024 12:00, Daniele Ceraolo Spurio wrote:
>>> The key termination is done with a specific submission to the VCS
>>> engine.
>>>
>>> Note that this patch is meant to be squashed with the follow-up patches
>>> that implement the other pieces of the termination flow. It is separate
>>> for now for ease of review.
>>>
>>> Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>
>>> ---
>>> .../gpu/drm/xe/instructions/xe_instr_defs.h | 1 +
>>> .../gpu/drm/xe/instructions/xe_mfx_commands.h | 29 +++++
>>> .../gpu/drm/xe/instructions/xe_mi_commands.h | 5 +
>>> drivers/gpu/drm/xe/xe_lrc.h | 3 +-
>>> drivers/gpu/drm/xe/xe_pxp_submit.c | 108 ++++++++++++++++++
>>> drivers/gpu/drm/xe/xe_pxp_submit.h | 2 +
>>> drivers/gpu/drm/xe/xe_ring_ops.c | 4 +-
>>> 7 files changed, 149 insertions(+), 3 deletions(-)
>>> create mode 100644 drivers/gpu/drm/xe/instructions/xe_mfx_commands.h
>>>
>>> diff --git a/drivers/gpu/drm/xe/instructions/xe_instr_defs.h b/drivers/gpu/drm/xe/instructions/xe_instr_defs.h
>>> index fd2ce7ace510..e559969468c4 100644
>>> --- a/drivers/gpu/drm/xe/instructions/xe_instr_defs.h
>>> +++ b/drivers/gpu/drm/xe/instructions/xe_instr_defs.h
>>> @@ -16,6 +16,7 @@
>>> #define XE_INSTR_CMD_TYPE GENMASK(31, 29)
>>> #define XE_INSTR_MI REG_FIELD_PREP(XE_INSTR_CMD_TYPE, 0x0)
>>> #define XE_INSTR_GSC REG_FIELD_PREP(XE_INSTR_CMD_TYPE, 0x2)
>>> +#define XE_INSTR_VIDEOPIPE REG_FIELD_PREP(XE_INSTR_CMD_TYPE, 0x3)
>>> #define XE_INSTR_GFXPIPE REG_FIELD_PREP(XE_INSTR_CMD_TYPE, 0x3)
>>> #define XE_INSTR_GFX_STATE REG_FIELD_PREP(XE_INSTR_CMD_TYPE, 0x4)
>>> diff --git a/drivers/gpu/drm/xe/instructions/xe_mfx_commands.h b/drivers/gpu/drm/xe/instructions/xe_mfx_commands.h
>>> new file mode 100644
>>> index 000000000000..686ca3b1d9e8
>>> --- /dev/null
>>> +++ b/drivers/gpu/drm/xe/instructions/xe_mfx_commands.h
>>> @@ -0,0 +1,29 @@
>>> +/* SPDX-License-Identifier: MIT */
>>> +/*
>>> + * Copyright © 2024 Intel Corporation
>>> + */
>>> +
>>> +#ifndef _XE_MFX_COMMANDS_H_
>>> +#define _XE_MFX_COMMANDS_H_
>>> +
>>> +#include "instructions/xe_instr_defs.h"
>>> +
>>> +#define MFX_CMD_SUBTYPE REG_GENMASK(28, 27) /* A.K.A cmd pipe */
>>> +#define MFX_CMD_OPCODE REG_GENMASK(26, 24)
>>> +#define MFX_CMD_SUB_OPCODE REG_GENMASK(23, 16)
>>> +#define MFX_FLAGS_AND_LEN REG_GENMASK(15, 0)
>>> +
>>> +#define XE_MFX_INSTR(subtype, op, sub_op, flags) \
>>> + (XE_INSTR_VIDEOPIPE | \
>>> + REG_FIELD_PREP(MFX_CMD_SUBTYPE, subtype) | \
>>> + REG_FIELD_PREP(MFX_CMD_OPCODE, op) | \
>>> + REG_FIELD_PREP(MFX_CMD_SUB_OPCODE, sub_op) | \
>>> + REG_FIELD_PREP(MFX_FLAGS_AND_LEN, flags))
>>> +
>>> +#define MFX_WAIT XE_MFX_INSTR(1, 0, 0, 0)
>>> +#define MFX_WAIT_DW0_PXP_SYNC_CONTROL_FLAG REG_BIT(9)
>>> +#define MFX_WAIT_DW0_MFX_SYNC_CONTROL_FLAG REG_BIT(8)
>>> +
>>> +#define CRYPTO_KEY_EXCHANGE XE_MFX_INSTR(2, 6, 9, 0)
>>> +
>>> +#endif
>>> diff --git a/drivers/gpu/drm/xe/instructions/xe_mi_commands.h b/drivers/gpu/drm/xe/instructions/xe_mi_commands.h
>>> index 10ec2920d31b..167fb0f742de 100644
>>> --- a/drivers/gpu/drm/xe/instructions/xe_mi_commands.h
>>> +++ b/drivers/gpu/drm/xe/instructions/xe_mi_commands.h
>>> @@ -48,6 +48,7 @@
>>> #define MI_LRI_LEN(x) (((x) & 0xff) + 1)
>>> #define MI_FLUSH_DW __MI_INSTR(0x26)
>>> +#define MI_FLUSH_DW_PROTECTED_MEM_EN REG_BIT(22)
>>> #define MI_FLUSH_DW_STORE_INDEX REG_BIT(21)
>>> #define MI_INVALIDATE_TLB REG_BIT(18)
>>> #define MI_FLUSH_DW_CCS REG_BIT(16)
>>> @@ -66,4 +67,8 @@
>>> #define MI_BATCH_BUFFER_START __MI_INSTR(0x31)
>>> +#define MI_SET_APPID __MI_INSTR(0x0e)
>>> +#define MI_SET_APPID_SESSION_ID_MASK REG_GENMASK(6, 0)
>>> +#define MI_SET_APPID_SESSION_ID(x) REG_FIELD_PREP(MI_SET_APPID_SESSION_ID_MASK, x)
>>> +
>>> #endif
>>> diff --git a/drivers/gpu/drm/xe/xe_lrc.h b/drivers/gpu/drm/xe/xe_lrc.h
>>> index c24542e89318..d411c3fbcbc6 100644
>>> --- a/drivers/gpu/drm/xe/xe_lrc.h
>>> +++ b/drivers/gpu/drm/xe/xe_lrc.h
>>> @@ -20,7 +20,8 @@ struct xe_lrc;
>>> struct xe_lrc_snapshot;
>>> struct xe_vm;
>>> -#define LRC_PPHWSP_SCRATCH_ADDR (0x34 * 4)
>>> +#define LRC_PPHWSP_FLUSH_INVAL_SCRATCH_ADDR (0x34 * 4)
>>> +#define LRC_PPHWSP_PXP_INVAL_SCRATCH_ADDR (0x40 * 4)
>>> struct xe_lrc *xe_lrc_create(struct xe_hw_engine *hwe, struct xe_vm *vm,
>>> u32 ring_size);
>>> diff --git a/drivers/gpu/drm/xe/xe_pxp_submit.c b/drivers/gpu/drm/xe/xe_pxp_submit.c
>>> index b777b0765c8a..3b69dcc0a00f 100644
>>> --- a/drivers/gpu/drm/xe/xe_pxp_submit.c
>>> +++ b/drivers/gpu/drm/xe/xe_pxp_submit.c
>>> @@ -6,14 +6,20 @@
>>> #include "xe_pxp_submit.h"
>>> #include <drm/xe_drm.h>
>>> +#include <linux/delay.h>
>>> #include "xe_device_types.h"
>>> +#include "xe_bb.h"
>>> #include "xe_bo.h"
>>> #include "xe_exec_queue.h"
>>> #include "xe_gsc_submit.h"
>>> #include "xe_gt.h"
>>> +#include "xe_lrc.h"
>>> #include "xe_pxp_types.h"
>>> +#include "xe_sched_job.h"
>>> #include "xe_vm.h"
>>> +#include "instructions/xe_mfx_commands.h"
>>> +#include "instructions/xe_mi_commands.h"
>>> #include "regs/xe_gt_regs.h"
>>> /*
>>> @@ -199,3 +205,105 @@ void xe_pxp_destroy_execution_resources(struct xe_pxp *pxp)
>>> destroy_vcs_execution_resources(pxp);
>>> }
>>> +#define emit_cmd(xe_, map_, offset_, val_) \
>>> + xe_map_wr(xe_, map_, (offset_) * sizeof(u32), u32, val_)
>>> +
>>> +/* stall until prior PXP and MFX/HCP/HUC objects are cmopleted */
>> completed
>>
>>> +#define MFX_WAIT_PXP (MFX_WAIT | \
>>> + MFX_WAIT_DW0_PXP_SYNC_CONTROL_FLAG | \
>>> + MFX_WAIT_DW0_MFX_SYNC_CONTROL_FLAG)
>> Why define an XE_MFX_INSTR macro that takes a flags word only to
>> manually OR the flags in outside of the macro?
>
> Maybe flags is a misnomer here, but I couldn't think of anything
> clearer. Some commands have bits that are always set within that flags
> field, so for those we would add them at definition time; other
> commands, like MFX_WAIT, instead have optional flags, which we can set
> as needed in the code.
>
> Given that I haven't defined any XE_MFX_INSTR command that requires a
> value in the flags field, I'll just remove that field from the
> XE_MFX_INSTR macro for now to make things clearer.
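> Roughly something like this (just a sketch, nothing tested yet; only
> the flags field changes, all the other names are as in this patch):
>
> #define XE_MFX_INSTR(subtype, op, sub_op) \
>     (XE_INSTR_VIDEOPIPE | \
>      REG_FIELD_PREP(MFX_CMD_SUBTYPE, subtype) | \
>      REG_FIELD_PREP(MFX_CMD_OPCODE, op) | \
>      REG_FIELD_PREP(MFX_CMD_SUB_OPCODE, sub_op))
>
> #define MFX_WAIT XE_MFX_INSTR(1, 0, 0)
> #define CRYPTO_KEY_EXCHANGE XE_MFX_INSTR(2, 6, 9)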
>
>
>>
>>> +static u32 pxp_emit_wait(struct xe_device *xe, struct iosys_map *batch, u32 offset)
>>> +{
>>> + /* wait for cmds to go through */
>>> + emit_cmd(xe, batch, offset++, MFX_WAIT_PXP);
>>> + emit_cmd(xe, batch, offset++, 0);
>> This zero is just padding to ensure 64bit alignment of future
>> instructions?
>
> yes
>
>
>>
>>> +
>>> + return offset;
>>> +}
>>> +
>>> +static u32 pxp_emit_session_selection(struct xe_device *xe, struct iosys_map *batch,
>>> + u32 offset, u32 idx)
>>> +{
>>> + offset = pxp_emit_wait(xe, batch, offset);
>>> +
>>> + /* pxp off */
>>> + emit_cmd(xe, batch, offset++, MI_FLUSH_DW | MI_FLUSH_IMM_DW);
>>> + emit_cmd(xe, batch, offset++, 0);
>>> + emit_cmd(xe, batch, offset++, 0);
>>> + emit_cmd(xe, batch, offset++, 0);
>>> +
>>> + /* select session */
>>> + emit_cmd(xe, batch, offset++, MI_SET_APPID | MI_SET_APPID_SESSION_ID(idx));
>>> + emit_cmd(xe, batch, offset++, MFX_WAIT_PXP);
>> Seems odd to define a helper function to emit this instruction and
>> then only use it for some instances.
>
>
> I didn't want the extra padding here, or to make the function
> conditionally add it only when needed. I'll just add it; it's not like
> we're short on memory (the BO is one page and the termination only uses
> a few dwords).
I guess what we really want is an emit function that looks at the size
of the instruction being added and pre-pads as appropriate. Then you
wouldn't need to worry about optional padding after an instruction
depending on what may or may not come next! But it's probably not worth
the complication. As you say, a few extra dwords of padding in a
non-critical buffer is no biggie.
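
Something along these lines, perhaps (completely untested sketch, just
to illustrate the idea, re-using the emit_cmd() helper from this patch
and MI_NOOP from xe_mi_commands.h):

static u32 emit_instr(struct xe_device *xe, struct iosys_map *batch,
                      u32 offset, const u32 *dws, u32 num_dws)
{
        u32 i;

        /*
         * Pre-pad with a NOP so every instruction starts QWORD aligned;
         * this only kicks in when the previous instruction had an odd
         * dword count.
         */
        if (offset & 1)
                emit_cmd(xe, batch, offset++, MI_NOOP);

        for (i = 0; i < num_dws; i++)
                emit_cmd(xe, batch, offset++, dws[i]);

        return offset;
}

Then pxp_emit_wait() and friends would just pass their dwords through
this and the alignment handling would live in one place. But as said,
not something to hold the series up for.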
John.
>
>
>>
>>> +
>>> + /* pxp on */
>>> + emit_cmd(xe, batch, offset++, MI_FLUSH_DW |
>>> + MI_FLUSH_DW_PROTECTED_MEM_EN |
>>> + MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX |
>>> + MI_FLUSH_IMM_DW);
>>> + emit_cmd(xe, batch, offset++, LRC_PPHWSP_PXP_INVAL_SCRATCH_ADDR |
>>> + MI_FLUSH_DW_USE_GTT);
>>> + emit_cmd(xe, batch, offset++, 0);
>>> + emit_cmd(xe, batch, offset++, 0);
>>> +
>>> + offset = pxp_emit_wait(xe, batch, offset);
>>> +
>>> + return offset;
>>> +}
>>> +
>>> +static u32 pxp_emit_inline_termination(struct xe_device *xe,
>>> + struct iosys_map *batch, u32 offset)
>>> +{
>>> + /* session inline termination */
>>> + emit_cmd(xe, batch, offset++, CRYPTO_KEY_EXCHANGE);
>>> + emit_cmd(xe, batch, offset++, 0);
>>> +
>>> + return offset;
>>> +}
>>> +
>>> +static u32 pxp_emit_session_termination(struct xe_device *xe, struct iosys_map *batch,
>>> + u32 offset, u32 idx)
>>> +{
>>> + offset = pxp_emit_session_selection(xe, batch, offset, idx);
>>> + offset = pxp_emit_inline_termination(xe, batch, offset);
>>> +
>>> + return offset;
>>> +}
>>> +
>>> +/**
>>> + * xe_pxp_submit_session_termination - submits a PXP inline termination
>>> + * @pxp: the xe_pxp structure
>>> + * @id: the session to terminate
>>> + *
>>> + * Emit an inline termination via the VCS engine to terminate a session.
>>> + *
>>> + * Returns 0 if the submission is successful, an errno value otherwise.
>>> + */
>>> +int xe_pxp_submit_session_termination(struct xe_pxp *pxp, u32 id)
>>> +{
>>> + struct xe_sched_job *job;
>>> + struct dma_fence *fence;
>>> + long timeout;
>>> + u32 offset = 0;
>>> + u64 addr = xe_bo_ggtt_addr(pxp->vcs_exec.bo);
>>> +
>>> + offset = pxp_emit_session_termination(pxp->xe, &pxp->vcs_exec.bo->vmap, offset, id);
>>> + offset = pxp_emit_wait(pxp->xe, &pxp->vcs_exec.bo->vmap, offset);
>>> + emit_cmd(pxp->xe, &pxp->vcs_exec.bo->vmap, offset, MI_BATCH_BUFFER_END);
>>> +
>>> + job = xe_sched_job_create(pxp->vcs_exec.q, &addr);
>> Double space
>>
>>> + if (IS_ERR(job))
>>> + return PTR_ERR(job);
>>> +
>>> + xe_sched_job_arm(job);
>>> + fence = dma_fence_get(&job->drm.s_fence->finished);
>>> + xe_sched_job_push(job);
>>> +
>>> + timeout = dma_fence_wait_timeout(fence, false, HZ);
>>> +
>>> + dma_fence_put(fence);
>>> + if (timeout <= 0)
>>> + return -EAGAIN;
>> Does it not matter what the error was? Why/how would this fail in a
>> way that needs to be re-tried?
>>
>> Although looking at the later patches, the return value from this
>> function is just treated as a pass/fail bool anyway. So why bother
>> munging it at all?
>
>
> Timeout can be 0 on failure, so we can't return that directly. I'll
> change it to return timeout if it's negative and an error if it
> actually timed out.
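> i.e. roughly (-ETIMEDOUT below is just a placeholder for whatever
> errno we settle on for the timed-out case):
>
>     timeout = dma_fence_wait_timeout(fence, false, HZ);
>
>     dma_fence_put(fence);
>
>     if (!timeout)
>         return -ETIMEDOUT;
>     else if (timeout < 0)
>         return timeout;
>
>     return 0;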
>
> Daniele
>
>
>>
>> John.
>>
>>> +
>>> + return 0;
>>> +}
>>> diff --git a/drivers/gpu/drm/xe/xe_pxp_submit.h b/drivers/gpu/drm/xe/xe_pxp_submit.h
>>> index 1a971fadc081..4ee8c0acfed9 100644
>>> --- a/drivers/gpu/drm/xe/xe_pxp_submit.h
>>> +++ b/drivers/gpu/drm/xe/xe_pxp_submit.h
>>> @@ -13,4 +13,6 @@ struct xe_pxp;
>>> int xe_pxp_allocate_execution_resources(struct xe_pxp *pxp);
>>> void xe_pxp_destroy_execution_resources(struct xe_pxp *pxp);
>>> +int xe_pxp_submit_session_termination(struct xe_pxp *pxp, u32 id);
>>> +
>>> #endif /* __XE_PXP_SUBMIT_H__ */
>>> diff --git a/drivers/gpu/drm/xe/xe_ring_ops.c b/drivers/gpu/drm/xe/xe_ring_ops.c
>>> index 0be4f489d3e1..a4b5a0f68a32 100644
>>> --- a/drivers/gpu/drm/xe/xe_ring_ops.c
>>> +++ b/drivers/gpu/drm/xe/xe_ring_ops.c
>>> @@ -118,7 +118,7 @@ static int emit_flush_invalidate(u32 flag, u32 *dw, int i)
>>> dw[i++] |= MI_INVALIDATE_TLB | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_IMM_DW |
>>> MI_FLUSH_DW_STORE_INDEX;
>>> - dw[i++] = LRC_PPHWSP_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
>>> + dw[i++] = LRC_PPHWSP_FLUSH_INVAL_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
>>> dw[i++] = 0;
>>> dw[i++] = ~0U;
>>> @@ -156,7 +156,7 @@ static int emit_pipe_invalidate(u32 mask_flags, bool invalidate_tlb, u32 *dw,
>>> flags &= ~mask_flags;
>>> - return emit_pipe_control(dw, i, 0, flags, LRC_PPHWSP_SCRATCH_ADDR, 0);
>>> + return emit_pipe_control(dw, i, 0, flags, LRC_PPHWSP_FLUSH_INVAL_SCRATCH_ADDR, 0);
>>> }
>>> static int emit_store_imm_ppgtt_posted(u64 addr, u64 value,
>>