[PATCH 2/2] drm/xe/guc: Add support for G2G communications

Sat Nov 16 02:16:06 UTC 2024

On 11/15/2024 13:22, Daniele Ceraolo Spurio wrote:
> On 11/8/2024 12:22 PM, John.C.Harrison at Intel.com wrote:
>> From: John Harrison <John.C.Harrison at Intel.com>
>>
>> Some features require inter-GuC communication channels on multi-tile
>> devices. So allocate and enable such.
>>
>> Signed-off-by: John Harrison <John.C.Harrison at Intel.com>
>> ---
>>   drivers/gpu/drm/xe/abi/guc_actions_abi.h |  20 ++
>>   drivers/gpu/drm/xe/xe_guc.c              | 276 ++++++++++++++++++++++-
>>   drivers/gpu/drm/xe/xe_guc_types.h        |  10 +
>>   3 files changed, 305 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/xe/abi/guc_actions_abi.h 
>> b/drivers/gpu/drm/xe/abi/guc_actions_abi.h
>> index b54fe40fc5a9..fee385532fb0 100644
>> --- a/drivers/gpu/drm/xe/abi/guc_actions_abi.h
>> +++ b/drivers/gpu/drm/xe/abi/guc_actions_abi.h
>> @@ -134,6 +134,8 @@ enum xe_guc_action {
>>       XE_GUC_ACTION_DEREGISTER_CONTEXT = 0x4503,
>>       XE_GUC_ACTION_REGISTER_COMMAND_TRANSPORT_BUFFER = 0x4505,
>>       XE_GUC_ACTION_DEREGISTER_COMMAND_TRANSPORT_BUFFER = 0x4506,
>> +    XE_GUC_ACTION_REGISTER_G2G = 0x4507,
>> +    XE_GUC_ACTION_DEREGISTER_G2G = 0x4508,
>>       XE_GUC_ACTION_DEREGISTER_CONTEXT_DONE = 0x4600,
>>       XE_GUC_ACTION_REGISTER_CONTEXT_MULTI_LRC = 0x4601,
>>       XE_GUC_ACTION_CLIENT_SOFT_RESET = 0x5507,
>> @@ -218,4 +220,22 @@ enum xe_guc_tlb_inval_mode {
>>       XE_GUC_TLB_INVAL_MODE_LITE = 0x1,
>>   };
>>   +/*
>> + * GuC to GuC communication (de-)registration fields:
>> + */
>> +enum xe_guc_g2g_type {
>> +    XE_G2G_TYPE_IN = 0x0,
>> +    XE_G2G_TYPE_OUT,
>> +    XE_G2G_TYPE_LIMIT,
>> +};
>> +
>> +#define XE_G2G_REGISTER_DEVICE    REG_GENMASK(16, 16)
>
> I'd call this DEVICE_ID like in the GuC specs, or GT_ID if you want a 
> shorter define. Just DEVICE sounds like it's going to a different card 
> instead of a different GuC device. Not a blocker.
Not seeing how device id is any different to just device in that 
respect. It is clearly a index or id of some sort given that it is an 
integer. And I would prefer to use the API's naming rather than calling 
it a GT id. That could be confused with gt->id, which it really isn't!

>
>> +#define XE_G2G_REGISTER_TILE REG_GENMASK(15, 12)
>> +#define XE_G2G_REGISTER_TYPE    REG_GENMASK(11, 8)
>> +#define XE_G2G_REGISTER_SIZE    REG_GENMASK(7, 0)
>> +
>> +#define XE_G2G_DEREGISTER_DEVICE    REG_GENMASK(16, 16)
>> +#define XE_G2G_DEREGISTER_TILE    REG_GENMASK(15, 12)
>> +#define XE_G2G_DEREGISTER_TYPE    REG_GENMASK(11, 8)
>> +
>>   #endif
>> diff --git a/drivers/gpu/drm/xe/xe_guc.c b/drivers/gpu/drm/xe/xe_guc.c
>> index 1cbad8a91798..899094423291 100644
>> --- a/drivers/gpu/drm/xe/xe_guc.c
>> +++ b/drivers/gpu/drm/xe/xe_guc.c
>> @@ -44,7 +44,15 @@ static u32 guc_bo_ggtt_addr(struct xe_guc *guc,
>>                   struct xe_bo *bo)
>>   {
>>       struct xe_device *xe = guc_to_xe(guc);
>> -    u32 addr = xe_bo_ggtt_addr(bo);
>> +    u32 addr;
>> +
>> +    /*
>> +     * For most BOs, the address on the allocating tile is fine. 
>> However for
>> +     * some, e.g. G2G CTB, the address on a specific tile is 
>> required as it
>> +     * might be different for each tile. So, just always ask for the 
>> address
>> +     * on the target GuC.
>> +     */
>> +    addr = __xe_bo_ggtt_addr(bo, gt_to_tile(guc_to_gt(guc))->id);
>>         /* GuC addresses above GUC_GGTT_TOP don't map through the GTT */
>>       xe_assert(xe, addr >= xe_wopcm_size(guc_to_xe(guc)));
>> @@ -244,6 +252,263 @@ static void guc_write_params(struct xe_guc *guc)
>>           xe_mmio_write32(&gt->mmio, SOFT_SCRATCH(1 + i), 
>> guc->params[i]);
>>   }
>>   +static int guc_action_register_g2g_buffer(struct xe_guc *guc, u32 
>> type, u32 dst_tile, u32 dst_dev,
>> +                      u32 desc_addr, u32 buff_addr, u32 size)
>> +{
>> +    struct xe_gt *gt = guc_to_gt(guc);
>> +    struct xe_device *xe = gt_to_xe(gt);
>> +    u32 action[] = {
>> +        XE_GUC_ACTION_REGISTER_G2G,
>> +        FIELD_PREP(XE_G2G_REGISTER_SIZE, size / SZ_4K - 1) |
>
> This needs a comment somewhere to explain that the GuC expects the 
> number of pages minus 1, because otherwise that -1 is confusing.
Really? A lot of size fields are specified as being +1 given that 
providing a size of zero generally makes no sense.

>
>> +        FIELD_PREP(XE_G2G_REGISTER_TYPE, type) |
>> +        FIELD_PREP(XE_G2G_REGISTER_TILE, dst_tile) |
>> +        FIELD_PREP(XE_G2G_REGISTER_DEVICE, dst_dev),
>> +        desc_addr,
>> +        buff_addr,
>> +    };
>> +
>> +    xe_assert(xe, (type == XE_G2G_TYPE_IN) || (type == 
>> XE_G2G_TYPE_OUT));
>> +    xe_assert(xe, !(size % SZ_4K));
>
> Is it worth having asserts for max size (1MB) and for dst_dev (0 or 1)?
The FIELD_PREP macro should take care of out-of-range values.

>
>> +
>> +    return xe_guc_ct_send_block(&guc->ct, action, ARRAY_SIZE(action));
>> +}
>> +
>> +static int guc_action_deregister_g2g_buffer(struct xe_guc *guc, u32 
>> type, u32 dst_tile, u32 dst_dev)
>> +{
>> +    struct xe_gt *gt = guc_to_gt(guc);
>> +    struct xe_device *xe = gt_to_xe(gt);
>> +    u32 action[] = {
>> +        XE_GUC_ACTION_DEREGISTER_G2G,
>> +        FIELD_PREP(XE_G2G_DEREGISTER_TYPE, type) |
>> +        FIELD_PREP(XE_G2G_DEREGISTER_TILE, dst_tile) |
>> +        FIELD_PREP(XE_G2G_DEREGISTER_DEVICE, dst_dev),
>> +    };
>> +
>> +    xe_assert(xe, (type == XE_G2G_TYPE_IN) || (type == 
>> XE_G2G_TYPE_OUT));
>> +
>> +    return xe_guc_ct_send_block(&guc->ct, action, ARRAY_SIZE(action));
>> +}
>> +
>> +#define G2G_DEV(gt)    (((gt)->info.type == XE_GT_TYPE_MAIN) ? 0 : 1)
>> +
>> +#define G2G_BUFFER_SIZE (SZ_4K)
>> +#define G2G_DESC_SIZE (64)
>> +#define G2G_DESC_AREA_SIZE (SZ_4K)
>> +
>> +/*
>> + * Generate a unique id for each bi-directional CTB for each pair of
>> + * near and far tiles/devices. The id can then be used as an index into
>> + * a single allocation that is sub-divided into multiple CTBs.
>> + *
>> + * For example, with two devices per tile and two tiles, the table 
>> should
>> + * look like:
>> + *           Far <tile>.<dev>
>> + *         0.0   0.1   1.0   1.1
>> + * N 0.0  --/-- 00/01 02/03 04/05
>> + * e 0.1  01/00 --/-- 06/07 08/09
>> + * a 1.0  03/02 07/06 --/-- 10/11
>> + * r 1.1  05/04 09/08 11/10 --/--
>> + *
>> + * Where each entry is Rx/Tx channel id.
>> + *
>> + * So GuC #3 (tile 1, dev 1) talking to GuC #2 (tile 1, dev 0) would
>> + * be reading from channel #11 and writing to channel #10. Whereas,
>> + * GuC #2 talking to GuC #3 would be read on #10 and write to #11.
>> + */
>> +static unsigned int g2g_slot(u32 near_tile, u32 near_dev, u32 
>> far_tile, u32 far_dev,
>> +                 u32 type, u32 max_inst, bool have_dev)
>> +{
>> +    u32 near = near_tile, far = far_tile;
>> +    u32 idx = 0;
>> +    int i;
>> +
>> +    if (have_dev) {
>> +        near = (near << 1) | near_dev;
>> +        far = (far << 1) | far_dev;
>> +    }
>> +
>> +    if (far > near) {
>> +        for (i = near; i > 0; i--)
>> +            idx += max_inst - i;
>
> The loop can be replaced by a single line with:
>
> idx += near * (2 * max_inst  - near - 1) / 2
>
> But the loop is probably easier to understand.
> (I've spent way too much time trying to find a better formula with 
> little luck :P)
>
>> +        idx += (far - 1 - near);
>> +        idx *= 2;
>> +        idx += type;
>> +        return idx;
>> +    }
>
> This needs some comments, because otherwise it's very hard to parse. 
> Maybe stick with to the table example? something like
>
> /* Count all BO pairs from the rows above the one where the pair we 
> want is */
> for (i = near; i > 0; i--)
>         idx += max_inst - i;
>
> /* Count all BO pairs on the same row and to the left of the pair we 
> want */
> idx += (far - 1 - near);
>
> /* Switch from BO pairs to individual BOs */
> idx *= 2;
>
> /* Select which of the 2 BOs in the pair we want */
> idx += type;
>     return idx;
>
> Same thing for the other one, just swap rows with columns and left 
> with above.
>
> I'm also ok with a different comments as along as there are some.
Only if you provide a comment to explain your anti-loop version first...

Not sure calling it 'BO pairs' is valid. The point is to divide a single 
BO into multiple slots. So 'slot pairs'  or 'slot rx/tx pairs' even 
seems more accurate.

>
>> +
>> +    if (far < near) {
>> +        for (i = far; i > 0; i--)
>> +            idx += max_inst - i;
>> +        idx += (near - 1 - far);
>> +        idx *= 2;
>> +        idx += (1 - type);
>> +        return idx;
>> +    }
>> +
>> +    return -1;
>> +}
>> +
>> +static int guc_g2g_register(struct xe_guc *near_guc, struct xe_gt 
>> *far_gt, u32 type, bool have_dev)
>
> you don't really need the have_dev passed in from outside, you can 
> just check for tile->media_gt being not null.
Passing it in means having a single point of calculation rather than 
evaluating it multiple times. And I prefer the gt count > tile count 
version just in case we one day have a platform where one tile has twin 
GTs and another does not or something else weird.

>
>> +{
>> +    struct xe_gt *near_gt = guc_to_gt(near_guc);
>> +    struct xe_device *xe = gt_to_xe(near_gt);
>> +    struct xe_bo *g2g_bo;
>> +    u32 near_tile = gt_to_tile(near_gt)->id;
>> +    u32 near_dev = G2G_DEV(near_gt);
>> +    u32 far_tile = gt_to_tile(far_gt)->id;
>> +    u32 far_dev = G2G_DEV(far_gt);
>> +    u32 max = xe->info.gt_count;
>> +    u32 base, desc, buf;
>> +    int slot;
>> +
>> +    xe_assert(xe, xe == gt_to_xe(far_gt));
>> +
>> +    g2g_bo = near_guc->g2g.bo;
>> +    xe_assert(xe, g2g_bo);
>> +
>> +    slot = g2g_slot(near_tile, near_dev, far_tile, far_dev, type, 
>> max, have_dev);
>> +    xe_assert(xe, slot >= 0);
>> +
>> +    base = guc_bo_ggtt_addr(near_guc, g2g_bo);
>> +    desc = base + slot * G2G_DESC_SIZE;
>> +    buf = base + slot * G2G_BUFFER_SIZE + G2G_DESC_AREA_SIZE;
>
> nit: I'd reorder this to have G2G_DESC_AREA_SIZE before the slot, so 
> it's in order to how the memory is actually filled.
>
>> +
>> +    xe_assert(xe, (desc - base + G2G_DESC_SIZE) <= G2G_DESC_AREA_SIZE);
>
> nit: We should be able to assert this only once at alloc time (see 
> below).
It's not just about ensuring the channel count does not overflow but 
that the calculations above are correct (including that slot does not 
exceed num_channels.

>
>> +    xe_assert(xe, (buf - base + G2G_BUFFER_SIZE) <= g2g_bo->size);
>> +
>> +    return guc_action_register_g2g_buffer(near_guc, type, far_tile, 
>> far_dev,
>> +                          desc, buf, G2G_BUFFER_SIZE);
>> +}
>> +
>> +static void guc_g2g_deregister(struct xe_guc *guc, u32 far_tile, u32 
>> far_dev, u32 type)
>> +{
>> +    guc_action_deregister_g2g_buffer(guc, type, far_tile, far_dev);
>> +}
>> +
>> +static u32 guc_g2g_size(struct xe_guc *guc)
>> +{
>> +    struct xe_gt *gt = guc_to_gt(guc);
>> +    struct xe_device *xe = gt_to_xe(gt);
>> +    unsigned int count = xe->info.gt_count;
>> +    u32 num_channels = (count * (count - 1)) / 2;
>> +
>> +    return num_channels * XE_G2G_TYPE_LIMIT * G2G_BUFFER_SIZE + 
>> G2G_DESC_AREA_SIZE;
>
> Here we can assert:
> num_channels * XE_G2G_TYPE_LIMIT * G2G_DESC_SIZE <= G2G_DESC_AREA_SIZE
>
> to cover all the descriptors in one go
>
>> +}
>> +
>> +static bool xe_guc_g2g_wanted(struct xe_device *xe)
>> +{
>> +    /* Can't do GuC to GuC communication if there is only one GuC */
>> +    if (xe->info.gt_count <= 1)
>> +        return false;
>> +
>> +    /* No current user */
>> +    return false;
>> +}
>> +
>> +static int guc_g2g_alloc(struct xe_guc *guc)
>> +{
>> +    struct xe_gt *gt = guc_to_gt(guc);
>> +    struct xe_device *xe = gt_to_xe(gt);
>> +    struct xe_tile *tile = gt_to_tile(gt);
>> +    struct xe_bo *bo;
>> +    u32 g2g_size;
>> +
>> +    if (guc->g2g.bo)
>> +        return 0;
>> +
>> +    if (gt->info.id != 0) {
>> +        struct xe_gt *root_gt = xe_device_get_gt(xe, 0);
>> +        struct xe_guc *root_guc = &root_gt->uc.guc;
>> +        struct xe_bo *bo;
>> +
>> +        bo = xe_bo_get(root_guc->g2g.bo);
>> +        if (IS_ERR(bo))
>> +            return PTR_ERR(bo);
>> +
>> +        guc->g2g.bo = bo;
>> +        guc->g2g.owned = false;
>
> It doesn't look like you're actually using the "owned" variable (which 
> makes sense, because the BO is refcounted so it doesn't matter which 
> GuC it was originally allocated for)
Yeah, owned is only there for the selftest that comes later. But not 
including it now means the g2g structure only has a single entry, so 
really should not be a structure. Which suddenly makes the deltas for 
adding in owned later significantly larger!

Having said that, as Matthew pointed out, the code is currently leaking 
reference counts as they are not auto-cleaned. So it does actually need 
a fini function which will use the owned field to decide whether to call 
put or not.

John.

>
> Daniele
>
>> +        return 0;
>> +    }
>> +
>> +    g2g_size = guc_g2g_size(guc);
>> +    bo = xe_managed_bo_create_pin_map(xe, tile, g2g_size,
>> +                      XE_BO_FLAG_VRAM_IF_DGFX(tile) |
>> +                      XE_BO_FLAG_GGTT |
>> +                      XE_BO_FLAG_GGTT_ALL |
>> +                      XE_BO_FLAG_GGTT_INVALIDATE);
>> +    if (IS_ERR(bo))
>> +        return PTR_ERR(bo);
>> +
>> +    xe_map_memset(xe, &bo->vmap, 0, 0, g2g_size);
>> +    guc->g2g.bo = bo;
>> +    guc->g2g.owned = true;
>> +
>> +    return 0;
>> +}
>> +
>> +static int guc_g2g_start(struct xe_guc *guc)
>> +{
>> +    struct xe_gt *far_gt, *gt = guc_to_gt(guc);
>> +    struct xe_device *xe = gt_to_xe(gt);
>> +    unsigned int i, j;
>> +    int t, err;
>> +    bool have_dev;
>> +
>> +    if (!guc->g2g.bo) {
>> +        int ret;
>> +
>> +        ret = guc_g2g_alloc(guc);
>> +        if (ret)
>> +            return ret;
>> +    }
>> +
>> +    /* GuC interface will need extending if more GT device types are 
>> ever created. */
>> +    xe_gt_assert(gt, (gt->info.type == XE_GT_TYPE_MAIN) || 
>> (gt->info.type == XE_GT_TYPE_MEDIA));
>> +
>> +    /* Channel numbering depends on whether there are multiple GTs 
>> per tile */
>> +    have_dev = xe->info.gt_count > xe->info.tile_count;
>> +
>> +    for_each_gt(far_gt, xe, i) {
>> +        u32 far_tile, far_dev;
>> +
>> +        if (far_gt->info.id == gt->info.id)
>> +            continue;
>> +
>> +        far_tile = gt_to_tile(far_gt)->id;
>> +        far_dev = G2G_DEV(far_gt);
>> +
>> +        for (t = 0; t < XE_G2G_TYPE_LIMIT; t++) {
>> +            err = guc_g2g_register(guc, far_gt, t, have_dev);
>> +            if (err) {
>> +                while (--t >= 0)
>> +                    guc_g2g_deregister(guc, far_tile, far_dev, t);
>> +                goto err_deregister;
>> +            }
>> +        }
>> +    }
>> +
>> +    return 0;
>> +
>> +err_deregister:
>> +    for_each_gt(far_gt, xe, j) {
>> +        u32 tile, dev;
>> +
>> +        if (far_gt->info.id == gt->info.id)
>> +            continue;
>> +
>> +        if (j >= i)
>> +            break;
>> +
>> +        tile = gt_to_tile(far_gt)->id;
>> +        dev = G2G_DEV(far_gt);
>> +
>> +        for (t = 0; t < XE_G2G_TYPE_LIMIT; t++)
>> +            guc_g2g_deregister(guc, tile, dev, t);
>> +    }
>> +
>> +    return err;
>> +}
>> +
>>   static void guc_fini_hw(void *arg)
>>   {
>>       struct xe_guc *guc = arg;
>> @@ -423,7 +688,16 @@ int xe_guc_init_post_hwconfig(struct xe_guc *guc)
>>     int xe_guc_post_load_init(struct xe_guc *guc)
>>   {
>> +    int ret;
>> +
>>       xe_guc_ads_populate_post_load(&guc->ads);
>> +
>> +    if (xe_guc_g2g_wanted(guc_to_xe(guc))) {
>> +        ret = guc_g2g_start(guc);
>> +        if (ret)
>> +            return ret;
>> +    }
>> +
>>       guc->submission_state.enabled = true;
>>         return 0;
>> diff --git a/drivers/gpu/drm/xe/xe_guc_types.h 
>> b/drivers/gpu/drm/xe/xe_guc_types.h
>> index fa75f57bf5da..83a41ebcdc91 100644
>> --- a/drivers/gpu/drm/xe/xe_guc_types.h
>> +++ b/drivers/gpu/drm/xe/xe_guc_types.h
>> @@ -64,6 +64,15 @@ struct xe_guc {
>>       struct xe_guc_pc pc;
>>       /** @dbm: GuC Doorbell Manager */
>>       struct xe_guc_db_mgr dbm;
>> +
>> +    /** @g2g: GuC to GuC communication state */
>> +    struct {
>> +        /** @g2g.bo: Storage for GuC to GuC communication channels */
>> +        struct xe_bo *bo;
>> +        /** @g2g.owned: Is the BO owned by this GT or just mapped in */
>> +        bool owned;
>> +    } g2g;
>> +
>>       /** @submission_state: GuC submission state */
>>       struct {
>>           /** @submission_state.idm: GuC context ID Manager */
>> @@ -79,6 +88,7 @@ struct xe_guc {
>>           /** @submission_state.fini_wq: submit fini wait queue */
>>           wait_queue_head_t fini_wq;
>>       } submission_state;
>> +
>>       /** @hwconfig: Hardware config state */
>>       struct {
>>           /** @hwconfig.bo: buffer object of the hardware config */
>