[Intel-xe] [PATCH 06/12] drm/xe/gsc: GSC FW load

John Harrison john.c.harrison at intel.com
Wed Nov 8 22:29:47 UTC 2023


On 11/8/2023 14:23, Daniele Ceraolo Spurio wrote:
> On 11/8/2023 2:17 PM, John Harrison wrote:
>> On 10/27/2023 15:29, Daniele Ceraolo Spurio wrote:
>>> The GSC FW must be copied in a 4MB stolen memory allocation, whose GGTT
>>> address is then passed as a parameter to a dedicated load instruction
>>> submitted via the GSC engine.
>>>
>>> Since the GSC load is relatively slow (up to 250ms), we perform it
>>> asynchronously via a worker. This requires us to make sure that the
>>> worker has stopped before suspending/unloading.
>>>
>>> Note that we can't yet use xe_migrate_copy for the copy because it
>>> doesn't work with stolen memory right now, so we do a memcpy from the
>>> CPU side instead.
>>>
>>> Bspec: 65306, 65346
>>> Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>
>>> Cc: Alan Previn <alan.previn.teres.alexis at intel.com>
>>> Cc: John Harrison <John.C.Harrison at Intel.com>
>>> ---
>>>   .../gpu/drm/xe/instructions/xe_gsc_commands.h |  34 +++
>>>   .../gpu/drm/xe/instructions/xe_instr_defs.h   |   1 +
>>>   drivers/gpu/drm/xe/regs/xe_gsc_regs.h         |  29 +++
>>>   drivers/gpu/drm/xe/xe_gsc.c                   | 240 
>>> ++++++++++++++++++
>>>   drivers/gpu/drm/xe/xe_gsc.h                   |   3 +
>>>   drivers/gpu/drm/xe/xe_gsc_types.h             |  17 ++
>>>   drivers/gpu/drm/xe/xe_uc.c                    |  12 +-
>>>   7 files changed, 335 insertions(+), 1 deletion(-)
>>>   create mode 100644 drivers/gpu/drm/xe/instructions/xe_gsc_commands.h
>>>   create mode 100644 drivers/gpu/drm/xe/regs/xe_gsc_regs.h
>>>
>>> diff --git a/drivers/gpu/drm/xe/instructions/xe_gsc_commands.h 
>>> b/drivers/gpu/drm/xe/instructions/xe_gsc_commands.h
>>> new file mode 100644
>>> index 000000000000..c7a833d7f965
>>> --- /dev/null
>>> +++ b/drivers/gpu/drm/xe/instructions/xe_gsc_commands.h
>>> @@ -0,0 +1,34 @@
>>> +/* SPDX-License-Identifier: MIT */
>>> +/*
>>> + * Copyright © 2023 Intel Corporation
>>> + */
>>> +
>>> +#ifndef _XE_GSC_COMMANDS_H_
>>> +#define _XE_GSC_COMMANDS_H_
>>> +
>>> +#include "instructions/xe_instr_defs.h"
>>> +
>>> +/*
>>> + * All GSCCS-specific commands have fixed length, so we can include 
>>> it in the
>>> + * defines. Note that the generic GSC command header structure 
>>> includes an
>>> + * optional data field in bits 9-21, but there are no commands that 
>>> actually use
>>> + * it; some of the commands are instead defined as having an 
>>> extended length
>>> + * field spanning bits 0-15, even if the extra bits are not 
>>> required because the
>>> + * longest GSCCS command is only 8 dwords. To handle this, the 
>>> defines below use
>>> + * a single field for both data and len. If we ever get a command 
>>> that does
>>> + * actually have data and this approach doesn't work for it we can 
>>> re-work it
>>> + * at that point.
>>> + */
>>> +
>>> +#define GSC_OPCODE        REG_GENMASK(28, 22)
>>> +#define GSC_CMD_DATA_AND_LEN    REG_GENMASK(21, 0)
>>> +
>>> +#define __GSC_INSTR(op, dl) \
>>> +    (XE_INSTR_GSC | \
>>> +    REG_FIELD_PREP(GSC_OPCODE, op) | \
>>> +    REG_FIELD_PREP(GSC_CMD_DATA_AND_LEN, dl))
>>> +
>>> +#define GSC_FW_LOAD __GSC_INSTR(1, 2)
>>> +#define   GSC_FW_LOAD_LIMIT_VALID REG_BIT(31)
>>> +
>>> +#endif
>>> diff --git a/drivers/gpu/drm/xe/instructions/xe_instr_defs.h 
>>> b/drivers/gpu/drm/xe/instructions/xe_instr_defs.h
>>> index e403b4fcc20a..04179b2a48e1 100644
>>> --- a/drivers/gpu/drm/xe/instructions/xe_instr_defs.h
>>> +++ b/drivers/gpu/drm/xe/instructions/xe_instr_defs.h
>>> @@ -15,6 +15,7 @@
>>>    */
>>>   #define XE_INSTR_CMD_TYPE        GENMASK(31, 29)
>>>   #define   XE_INSTR_MI REG_FIELD_PREP(XE_INSTR_CMD_TYPE, 0x0)
>>> +#define   XE_INSTR_GSC REG_FIELD_PREP(XE_INSTR_CMD_TYPE, 0x2)
>>>   #define   XE_INSTR_GFXPIPE REG_FIELD_PREP(XE_INSTR_CMD_TYPE, 0x3)
>>>     /*
>>> diff --git a/drivers/gpu/drm/xe/regs/xe_gsc_regs.h 
>>> b/drivers/gpu/drm/xe/regs/xe_gsc_regs.h
>>> new file mode 100644
>>> index 000000000000..22d2ad9cb64d
>>> --- /dev/null
>>> +++ b/drivers/gpu/drm/xe/regs/xe_gsc_regs.h
>>> @@ -0,0 +1,29 @@
>>> +/* SPDX-License-Identifier: MIT */
>>> +/*
>>> + * Copyright © 2023 Intel Corporation
>>> + */
>>> +
>>> +#ifndef _XE_GSC_REGS_H_
>>> +#define _XE_GSC_REGS_H_
>>> +
>>> +#include <linux/compiler.h>
>>> +#include <linux/types.h>
>>> +
>>> +#include "regs/xe_reg_defs.h"
>>> +
>>> +/* Definitions of GSC H/W registers, bits, etc */
>>> +
>>> +#define MTL_GSC_HECI1_BASE    0x00116000
>>> +#define MTL_GSC_HECI2_BASE    0x00117000
>>> +
>>> +/*
>>> + * The FWSTS register values are FW defined and can be different 
>>> between
>>> + * HECI1 and HECI2
>>> + */
>>> +#define HECI_FWSTS1(base)                XE_REG((base) + 0xc40)
>>> +#define   HECI1_FWSTS1_CURRENT_STATE REG_GENMASK(3, 0)
>>> +#define   HECI1_FWSTS1_CURRENT_STATE_RESET        0
>>> +#define   HECI1_FWSTS1_PROXY_STATE_NORMAL        5
>>> +#define   HECI1_FWSTS1_INIT_COMPLETE            REG_BIT(9)
>>> +
>>> +#endif
>>> diff --git a/drivers/gpu/drm/xe/xe_gsc.c b/drivers/gpu/drm/xe/xe_gsc.c
>>> index 3f709577d73b..7e5ba9039163 100644
>>> --- a/drivers/gpu/drm/xe/xe_gsc.c
>>> +++ b/drivers/gpu/drm/xe/xe_gsc.c
>>> @@ -5,10 +5,20 @@
>>>     #include "xe_gsc.h"
>>>   +#include <drm/drm_managed.h>
>>> +
>>> +#include "xe_bb.h"
>>> +#include "xe_bo.h"
>>>   #include "xe_device.h"
>>> +#include "xe_exec_queue.h"
>>>   #include "xe_gt.h"
>>>   #include "xe_gt_printk.h"
>>> +#include "xe_map.h"
>>> +#include "xe_mmio.h"
>>> +#include "xe_sched_job.h"
>>>   #include "xe_uc_fw.h"
>>> +#include "instructions/xe_gsc_commands.h"
>>> +#include "regs/xe_gsc_regs.h"
>>>     static struct xe_gt *
>>>   gsc_to_gt(struct xe_gsc *gsc)
>>> @@ -16,6 +26,134 @@ gsc_to_gt(struct xe_gsc *gsc)
>>>       return container_of(gsc, struct xe_gt, uc.gsc);
>>>   }
>>>   +static int memcpy_fw(struct xe_gsc *gsc)
>>> +{
>>> +    struct xe_gt *gt = gsc_to_gt(gsc);
>>> +    struct xe_device *xe = gt_to_xe(gt);
>>> +    u32 fw_size = gsc->fw.size;
>>> +    void *storage;
>>> +
>>> +    /*
>>> +     * FIXME: xe_migrate_copy does not work with stolen mem yet, so 
>>> we use
>>> +     * a memcpy for now.
>>> +     */
>>> +    storage = kmalloc(fw_size, GFP_KERNEL);
>>> +    if (!storage)
>>> +        return -ENOMEM;
>>> +
>>> +    xe_map_memcpy_from(xe, storage, &gsc->fw.bo->vmap, 0, fw_size);
>>> +    xe_map_memcpy_to(xe, &gsc->private->vmap, 0, storage, fw_size);
>>> +    xe_map_memset(xe, &gsc->private->vmap, fw_size, 0, 
>>> gsc->private->size - fw_size);
>>> +
>>> +    kfree(storage);
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +static int emit_gsc_upload(struct xe_gsc *gsc)
>>> +{
>>> +    struct xe_gt *gt = gsc_to_gt(gsc);
>>> +    u32 offset = xe_bo_ggtt_addr(gsc->private);
>>> +    struct xe_bb *bb;
>>> +    struct xe_sched_job *job;
>>> +    struct dma_fence *fence;
>>> +    long timeout;
>>> +
>>> +    bb = xe_bb_new(gt, 4, false);
>>> +    if (IS_ERR(bb))
>>> +        return PTR_ERR(bb);
>>> +
>>> +    bb->cs[bb->len++] = GSC_FW_LOAD;
>>> +    bb->cs[bb->len++] = lower_32_bits(offset);
>>> +    bb->cs[bb->len++] = upper_32_bits(offset);
>>> +    bb->cs[bb->len++] = (gsc->private->size / SZ_4K) | 
>>> GSC_FW_LOAD_LIMIT_VALID;
>>> +
>>> +    job = xe_bb_create_job(gsc->q, bb);
>>> +    if (IS_ERR(job)) {
>>> +        xe_bb_free(bb, NULL);
>>> +        return PTR_ERR(job);
>>> +    }
>>> +
>>> +    xe_sched_job_arm(job);
>>> +    fence = dma_fence_get(&job->drm.s_fence->finished);
>>> +    xe_sched_job_push(job);
>>> +
>>> +    timeout = dma_fence_wait_timeout(fence, false, HZ);
>>> +    dma_fence_put(fence);
>>> +    xe_bb_free(bb, NULL);
>>> +    if (timeout < 0)
>>> +        return timeout;
>>> +    else if (!timeout)
>>> +        return -ETIME;
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +static int gsc_fw_is_loaded(struct xe_gt *gt)
>>> +{
>>> +    return xe_mmio_read32(gt, HECI_FWSTS1(MTL_GSC_HECI1_BASE)) &
>>> +                  HECI1_FWSTS1_INIT_COMPLETE;
>>> +}
>>> +
>>> +static int gsc_fw_wait(struct xe_gt *gt)
>>> +{
>>> +    return xe_mmio_wait32(gt, HECI_FWSTS1(MTL_GSC_HECI1_BASE),
>>> +                  HECI1_FWSTS1_INIT_COMPLETE,
>>> +                  HECI1_FWSTS1_INIT_COMPLETE,
>>> +                  500, NULL, false);
>> The 500 at least needs a comment to explain where it comes from and 
>> why. If not a #define as well.
>>
>> I assume this is the <=250ms in the commit description?
>>
>
> Yup, I just doubled the 250 to 500 to give it a bit of wiggle room. 
> I'll add a comment.
>
>>> +}
>>> +
>>> +static int gsc_upload(struct xe_gsc *gsc)
>>> +{
>>> +    struct xe_gt *gt = gsc_to_gt(gsc);
>>> +    int err;
>>> +
>>> +    if (XE_WARN_ON(!xe_uc_fw_is_loadable(&gsc->fw) || !gsc->q || 
>>> gsc_fw_is_loaded(gt)))
>> The last term here is a register read of a firmware written value? 
>> The kernel should not barf with WARNs and such due to external data. 
>> Those are for internal bugs only. Dodgy external data should just be 
>> an xe_gt_err, shouldn't it?
>
> The idea was that we shouldn't be here if the GSC HW reports that 
> we've already loaded, we should have bailed early (in 
> xe_gsc_load_start), so somewhere in the stack we have a programming bug.
Right, but once you include external inputs, that programming bug might 
not be in the KMD. The KMD is perfect and never goes wrong, but we don't 
trust those pesky hardware designers ;).

If nothing else, you could get a PCIe drop-out between the previous 
check and this one. So all the software state is perfect but the 
hardware is toast. That should not be a WARN, it should be a xe_gt_err.

John.

>
>>
>>> +        return 0;
>> And returning zero means the state will transition to TRANSFERRED not 
>> LOAD_FAIL. That seems plausible in the case of already loaded, but in 
>> the other cases that seems wrong?
>
> You're right, will fix.
>
> Daniele
>
>>
>> John.
>>
>>> +
>>> +    err = memcpy_fw(gsc);
>>> +    if (err) {
>>> +        xe_gt_err(gt, "Failed to memcpy GSC FW\n");
>>> +        return err;
>>> +    }
>>> +
>>> +    err = emit_gsc_upload(gsc);
>>> +    if (err) {
>>> +        xe_gt_err(gt, "Failed to emit GSC FW upload (%pe)\n", 
>>> ERR_PTR(err));
>>> +        return err;
>>> +    }
>>> +
>>> +    err = gsc_fw_wait(gt);
>>> +    if (err) {
>>> +        xe_gt_err(gt, "Failed to wait for GSC load (%pe)\n", 
>>> ERR_PTR(err));
>>> +        return err;
>>> +    }
>>> +
>>> +    xe_gt_dbg(gt, "GSC FW async load completed\n");
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +static void gsc_work(struct work_struct *work)
>>> +{
>>> +    struct xe_gsc *gsc = container_of(work, typeof(*gsc), work);
>>> +    struct xe_gt *gt = gsc_to_gt(gsc);
>>> +    struct xe_device *xe = gt_to_xe(gt);
>>> +    int ret;
>>> +
>>> +    xe_device_mem_access_get(xe);
>>> +    xe_force_wake_get(gt_to_fw(gt), XE_FW_GSC);
>>> +
>>> +    ret = gsc_upload(gsc);
>>> +    if (ret)
>>> +        xe_uc_fw_change_status(&gsc->fw, XE_UC_FIRMWARE_LOAD_FAIL);
>>> +    else
>>> +        xe_uc_fw_change_status(&gsc->fw, XE_UC_FIRMWARE_TRANSFERRED);
>>> +
>>> +    xe_force_wake_put(gt_to_fw(gt), XE_FW_GSC);
>>> +    xe_device_mem_access_put(xe);
>>> +}
>>> +
>>>   int xe_gsc_init(struct xe_gsc *gsc)
>>>   {
>>>       struct xe_gt *gt = gsc_to_gt(gsc);
>>> @@ -23,6 +161,7 @@ int xe_gsc_init(struct xe_gsc *gsc)
>>>       int ret;
>>>         gsc->fw.type = XE_UC_FW_TYPE_GSC;
>>> +    INIT_WORK(&gsc->work, gsc_work);
>>>         /* The GSC uC is only available on the media GT */
>>>       if (tile->media_gt && (gt != tile->media_gt)) {
>>> @@ -48,3 +187,104 @@ int xe_gsc_init(struct xe_gsc *gsc)
>>>       return ret;
>>>   }
>>>   +static void free_resources(struct drm_device *drm, void *arg)
>>> +{
>>> +    struct xe_gsc *gsc = arg;
>>> +
>>> +    if (gsc->wq) {
>>> +        destroy_workqueue(gsc->wq);
>>> +        gsc->wq = NULL;
>>> +    }
>>> +
>>> +    if (gsc->q) {
>>> +        xe_exec_queue_put(gsc->q);
>>> +        gsc->q = NULL;
>>> +    }
>>> +
>>> +    if (gsc->private) {
>>> +        xe_bo_unpin_map_no_vm(gsc->private);
>>> +        gsc->private = NULL;
>>> +    }
>>> +}
>>> +
>>> +int xe_gsc_init_post_hwconfig(struct xe_gsc *gsc)
>>> +{
>>> +    struct xe_gt *gt = gsc_to_gt(gsc);
>>> +    struct xe_tile *tile = gt_to_tile(gt);
>>> +    struct xe_device *xe = gt_to_xe(gt);
>>> +    struct xe_hw_engine *hwe = xe_gt_hw_engine(gt, 
>>> XE_ENGINE_CLASS_OTHER, 0, true);
>>> +    struct xe_exec_queue *q;
>>> +    struct workqueue_struct *wq;
>>> +    struct xe_bo *bo;
>>> +    int err;
>>> +
>>> +    if (!xe_uc_fw_is_available(&gsc->fw))
>>> +        return 0;
>>> +
>>> +    if (!hwe)
>>> +        return -ENODEV;
>>> +
>>> +    bo = xe_bo_create_pin_map(xe, tile, NULL, SZ_4M,
>>> +                  ttm_bo_type_kernel,
>>> +                  XE_BO_CREATE_STOLEN_BIT |
>>> +                  XE_BO_CREATE_GGTT_BIT);
>>> +    if (IS_ERR(bo))
>>> +        return PTR_ERR(bo);
>>> +
>>> +    q = xe_exec_queue_create(xe, NULL,
>>> +                 BIT(hwe->logical_instance), 1, hwe,
>>> +                 EXEC_QUEUE_FLAG_KERNEL |
>>> +                 EXEC_QUEUE_FLAG_PERMANENT);
>>> +    if (IS_ERR(q)) {
>>> +        xe_gt_err(gt, "Failed to create queue for GSC submission\n");
>>> +        err = PTR_ERR(q);
>>> +        goto out_bo;
>>> +    }
>>> +
>>> +    wq = alloc_ordered_workqueue("gsc-ordered-wq", 0);
>>> +    if (!wq) {
>>> +        err = -ENOMEM;
>>> +        goto out_q;
>>> +    }
>>> +
>>> +    gsc->private = bo;
>>> +    gsc->q = q;
>>> +    gsc->wq = wq;
>>> +
>>> +    err = drmm_add_action_or_reset(&xe->drm, free_resources, gsc);
>>> +    if (err)
>>> +        return err;
>>> +
>>> +    xe_uc_fw_change_status(&gsc->fw, XE_UC_FIRMWARE_LOADABLE);
>>> +
>>> +    return 0;
>>> +
>>> +out_q:
>>> +    xe_exec_queue_put(q);
>>> +out_bo:
>>> +    xe_bo_unpin_map_no_vm(bo);
>>> +    return err;
>>> +
>>> +}
>>> +
>>> +void xe_gsc_load_start(struct xe_gsc *gsc)
>>> +{
>>> +    struct xe_gt *gt = gsc_to_gt(gsc);
>>> +
>>> +    if (!xe_uc_fw_is_loadable(&gsc->fw) || !gsc->q)
>>> +        return;
>>> +
>>> +    /* GSC FW survives GT reset and D3Hot */
>>> +    if (gsc_fw_is_loaded(gt)) {
>>> +        xe_uc_fw_change_status(&gsc->fw, XE_UC_FIRMWARE_TRANSFERRED);
>>> +        return;
>>> +    }
>>> +
>>> +    queue_work(gsc->wq, &gsc->work);
>>> +}
>>> +
>>> +void xe_gsc_wait_for_worker_completion(struct xe_gsc *gsc)
>>> +{
>>> +    if (xe_uc_fw_is_loadable(&gsc->fw) && gsc->wq)
>>> +        flush_work(&gsc->work);
>>> +}
>>> diff --git a/drivers/gpu/drm/xe/xe_gsc.h b/drivers/gpu/drm/xe/xe_gsc.h
>>> index baa7f21f4204..f870eddc77d4 100644
>>> --- a/drivers/gpu/drm/xe/xe_gsc.h
>>> +++ b/drivers/gpu/drm/xe/xe_gsc.h
>>> @@ -9,5 +9,8 @@
>>>   #include "xe_gsc_types.h"
>>>     int xe_gsc_init(struct xe_gsc *gsc);
>>> +int xe_gsc_init_post_hwconfig(struct xe_gsc *gsc);
>>> +void xe_gsc_wait_for_worker_completion(struct xe_gsc *gsc);
>>> +void xe_gsc_load_start(struct xe_gsc *gsc);
>>>     #endif
>>> diff --git a/drivers/gpu/drm/xe/xe_gsc_types.h 
>>> b/drivers/gpu/drm/xe/xe_gsc_types.h
>>> index 1bc50583fe58..57fefd66a7ea 100644
>>> --- a/drivers/gpu/drm/xe/xe_gsc_types.h
>>> +++ b/drivers/gpu/drm/xe/xe_gsc_types.h
>>> @@ -6,8 +6,13 @@
>>>   #ifndef _XE_GSC_TYPES_H_
>>>   #define _XE_GSC_TYPES_H_
>>>   +#include <linux/workqueue.h>
>>> +
>>>   #include "xe_uc_fw_types.h"
>>>   +struct xe_bo;
>>> +struct xe_exec_queue;
>>> +
>>>   /**
>>>    * struct xe_gsc - GSC
>>>    */
>>> @@ -17,6 +22,18 @@ struct xe_gsc {
>>>         /** @security_version: SVN found in the fetched blob */
>>>       u32 security_version;
>>> +
>>> +    /** @private: Private data for use by the GSC FW */
>>> +    struct xe_bo *private;
>>> +
>>> +    /** @q: Default queue used for submissions to GSC FW */
>>> +    struct xe_exec_queue *q;
>>> +
>>> +    /** @wq: workqueue to handle jobs for delayed load and proxy 
>>> handling */
>>> +    struct workqueue_struct *wq;
>>> +
>>> +    /** @work: delayed load and proxy handling work */
>>> +    struct work_struct work;
>>>   };
>>>     #endif
>>> diff --git a/drivers/gpu/drm/xe/xe_uc.c b/drivers/gpu/drm/xe/xe_uc.c
>>> index b67154c78dff..15dcd1f91e9c 100644
>>> --- a/drivers/gpu/drm/xe/xe_uc.c
>>> +++ b/drivers/gpu/drm/xe/xe_uc.c
>>> @@ -74,11 +74,17 @@ int xe_uc_init(struct xe_uc *uc)
>>>    */
>>>   int xe_uc_init_post_hwconfig(struct xe_uc *uc)
>>>   {
>>> +    int err;
>>> +
>>>       /* GuC submission not enabled, nothing to do */
>>>       if (!xe_device_uc_enabled(uc_to_xe(uc)))
>>>           return 0;
>>>   -    return xe_guc_init_post_hwconfig(&uc->guc);
>>> +    err = xe_guc_init_post_hwconfig(&uc->guc);
>>> +    if (err)
>>> +        return err;
>>> +
>>> +    return xe_gsc_init_post_hwconfig(&uc->gsc);
>>>   }
>>>     static int uc_reset(struct xe_uc *uc)
>>> @@ -173,6 +179,9 @@ int xe_uc_init_hw(struct xe_uc *uc)
>>>       ret = xe_huc_auth(&uc->huc);
>>>       xe_gt_assert(uc_to_gt(uc), !ret);
>>>   +    /* GSC load is async */
>>> +    xe_gsc_load_start(&uc->gsc);
>>> +
>>>       return 0;
>>>   }
>>>   @@ -197,6 +206,7 @@ void xe_uc_gucrc_disable(struct xe_uc *uc)
>>>     void xe_uc_stop_prepare(struct xe_uc *uc)
>>>   {
>>> +    xe_gsc_wait_for_worker_completion(&uc->gsc);
>>>       xe_guc_stop_prepare(&uc->guc);
>>>   }
>>
>



More information about the Intel-xe mailing list