[PATCH 2/2] drm/xe/guc: Add support for G2G communications

John Harrison john.c.harrison at intel.com
Fri Nov 15 19:34:40 UTC 2024


On 11/15/2024 11:11, Matthew Brost wrote:
> On Fri, Nov 08, 2024 at 12:22:16PM -0800, John.C.Harrison at Intel.com wrote:
>> From: John Harrison <John.C.Harrison at Intel.com>
>>
>> Some features require inter-GuC communication channels on multi-tile
>> devices. So allocate and enable such.
>>
>> Signed-off-by: John Harrison <John.C.Harrison at Intel.com>
>> ---
>>   drivers/gpu/drm/xe/abi/guc_actions_abi.h |  20 ++
>>   drivers/gpu/drm/xe/xe_guc.c              | 276 ++++++++++++++++++++++-
>>   drivers/gpu/drm/xe/xe_guc_types.h        |  10 +
>>   3 files changed, 305 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/xe/abi/guc_actions_abi.h b/drivers/gpu/drm/xe/abi/guc_actions_abi.h
>> index b54fe40fc5a9..fee385532fb0 100644
>> --- a/drivers/gpu/drm/xe/abi/guc_actions_abi.h
>> +++ b/drivers/gpu/drm/xe/abi/guc_actions_abi.h
>> @@ -134,6 +134,8 @@ enum xe_guc_action {
>>   	XE_GUC_ACTION_DEREGISTER_CONTEXT = 0x4503,
>>   	XE_GUC_ACTION_REGISTER_COMMAND_TRANSPORT_BUFFER = 0x4505,
>>   	XE_GUC_ACTION_DEREGISTER_COMMAND_TRANSPORT_BUFFER = 0x4506,
>> +	XE_GUC_ACTION_REGISTER_G2G = 0x4507,
>> +	XE_GUC_ACTION_DEREGISTER_G2G = 0x4508,
>>   	XE_GUC_ACTION_DEREGISTER_CONTEXT_DONE = 0x4600,
>>   	XE_GUC_ACTION_REGISTER_CONTEXT_MULTI_LRC = 0x4601,
>>   	XE_GUC_ACTION_CLIENT_SOFT_RESET = 0x5507,
>> @@ -218,4 +220,22 @@ enum xe_guc_tlb_inval_mode {
>>   	XE_GUC_TLB_INVAL_MODE_LITE = 0x1,
>>   };
>>   
>> +/*
>> + * GuC to GuC communication (de-)registration fields:
>> + */
>> +enum xe_guc_g2g_type {
>> +	XE_G2G_TYPE_IN = 0x0,
>> +	XE_G2G_TYPE_OUT,
>> +	XE_G2G_TYPE_LIMIT,
>> +};
>> +
>> +#define XE_G2G_REGISTER_DEVICE	REG_GENMASK(16, 16)
>> +#define XE_G2G_REGISTER_TILE	REG_GENMASK(15, 12)
>> +#define XE_G2G_REGISTER_TYPE	REG_GENMASK(11, 8)
>> +#define XE_G2G_REGISTER_SIZE	REG_GENMASK(7, 0)
>> +
>> +#define XE_G2G_DEREGISTER_DEVICE	REG_GENMASK(16, 16)
>> +#define XE_G2G_DEREGISTER_TILE	REG_GENMASK(15, 12)
>> +#define XE_G2G_DEREGISTER_TYPE	REG_GENMASK(11, 8)
>> +
>>   #endif
>> diff --git a/drivers/gpu/drm/xe/xe_guc.c b/drivers/gpu/drm/xe/xe_guc.c
>> index 1cbad8a91798..899094423291 100644
>> --- a/drivers/gpu/drm/xe/xe_guc.c
>> +++ b/drivers/gpu/drm/xe/xe_guc.c
>> @@ -44,7 +44,15 @@ static u32 guc_bo_ggtt_addr(struct xe_guc *guc,
>>   			    struct xe_bo *bo)
>>   {
>>   	struct xe_device *xe = guc_to_xe(guc);
>> -	u32 addr = xe_bo_ggtt_addr(bo);
>> +	u32 addr;
>> +
>> +	/*
>> +	 * For most BOs, the address on the allocating tile is fine. However for
>> +	 * some, e.g. G2G CTB, the address on a specific tile is required as it
>> +	 * might be different for each tile. So, just always ask for the address
>> +	 * on the target GuC.
>> +	 */
>> +	addr = __xe_bo_ggtt_addr(bo, gt_to_tile(guc_to_gt(guc))->id);
>>   
>>   	/* GuC addresses above GUC_GGTT_TOP don't map through the GTT */
>>   	xe_assert(xe, addr >= xe_wopcm_size(guc_to_xe(guc)));
>> @@ -244,6 +252,263 @@ static void guc_write_params(struct xe_guc *guc)
>>   		xe_mmio_write32(&gt->mmio, SOFT_SCRATCH(1 + i), guc->params[i]);
>>   }
>>   
>> +static int guc_action_register_g2g_buffer(struct xe_guc *guc, u32 type, u32 dst_tile, u32 dst_dev,
>> +					  u32 desc_addr, u32 buff_addr, u32 size)
>> +{
>> +	struct xe_gt *gt = guc_to_gt(guc);
>> +	struct xe_device *xe = gt_to_xe(gt);
>> +	u32 action[] = {
>> +		XE_GUC_ACTION_REGISTER_G2G,
>> +		FIELD_PREP(XE_G2G_REGISTER_SIZE, size / SZ_4K - 1) |
>> +		FIELD_PREP(XE_G2G_REGISTER_TYPE, type) |
>> +		FIELD_PREP(XE_G2G_REGISTER_TILE, dst_tile) |
>> +		FIELD_PREP(XE_G2G_REGISTER_DEVICE, dst_dev),
>> +		desc_addr,
>> +		buff_addr,
>> +	};
>> +
>> +	xe_assert(xe, (type == XE_G2G_TYPE_IN) || (type == XE_G2G_TYPE_OUT));
>> +	xe_assert(xe, !(size % SZ_4K));
>> +
>> +	return xe_guc_ct_send_block(&guc->ct, action, ARRAY_SIZE(action));
>> +}
>> +
>> +static int guc_action_deregister_g2g_buffer(struct xe_guc *guc, u32 type, u32 dst_tile, u32 dst_dev)
>> +{
>> +	struct xe_gt *gt = guc_to_gt(guc);
>> +	struct xe_device *xe = gt_to_xe(gt);
>> +	u32 action[] = {
>> +		XE_GUC_ACTION_DEREGISTER_G2G,
>> +		FIELD_PREP(XE_G2G_DEREGISTER_TYPE, type) |
>> +		FIELD_PREP(XE_G2G_DEREGISTER_TILE, dst_tile) |
>> +		FIELD_PREP(XE_G2G_DEREGISTER_DEVICE, dst_dev),
>> +	};
>> +
>> +	xe_assert(xe, (type == XE_G2G_TYPE_IN) || (type == XE_G2G_TYPE_OUT));
>> +
>> +	return xe_guc_ct_send_block(&guc->ct, action, ARRAY_SIZE(action));
>> +}
>> +
>> +#define G2G_DEV(gt)	(((gt)->info.type == XE_GT_TYPE_MAIN) ? 0 : 1)
>> +
>> +#define G2G_BUFFER_SIZE (SZ_4K)
>> +#define G2G_DESC_SIZE (64)
>> +#define G2G_DESC_AREA_SIZE (SZ_4K)
>> +
>> +/*
>> + * Generate a unique id for each bi-directional CTB for each pair of
>> + * near and far tiles/devices. The id can then be used as an index into
>> + * a single allocation that is sub-divided into multiple CTBs.
>> + *
>> + * For example, with two devices per tile and two tiles, the table should
>> + * look like:
>> + *           Far <tile>.<dev>
>> + *         0.0   0.1   1.0   1.1
>> + * N 0.0  --/-- 00/01 02/03 04/05
>> + * e 0.1  01/00 --/-- 06/07 08/09
>> + * a 1.0  03/02 07/06 --/-- 10/11
>> + * r 1.1  05/04 09/08 11/10 --/--
>> + *
>> + * Where each entry is Rx/Tx channel id.
>> + *
>> + * So GuC #3 (tile 1, dev 1) talking to GuC #2 (tile 1, dev 0) would
>> + * be reading from channel #11 and writing to channel #10. Whereas,
>> + * GuC #2 talking to GuC #3 would be read on #10 and write to #11.
>> + */
>> +static unsigned int g2g_slot(u32 near_tile, u32 near_dev, u32 far_tile, u32 far_dev,
>> +			     u32 type, u32 max_inst, bool have_dev)
>> +{
> This function makes my head hurt but I don't have any other suggestions.
>
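For what it is worth, tracing the arithmetic through the example in the
comment above: with two tiles and two GTs per tile, max_inst is 4 and
have_dev is set, so GuC #3 is near = (1 << 1) | 1 = 3 and GuC #2 is
far = (1 << 1) | 0 = 2. That takes the 'far < near' branch: the loop
adds (4 - 2) + (4 - 1) = 5, (near - 1 - far) adds 0, doubling gives 10,
and '1 - type' then picks 11 for IN and 10 for OUT, i.e. the 11/10
entry in the table. Swapping near and far takes the 'far > near' branch
and lands on 10/11 for the reverse direction, so the code and the table
do agree.
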
>> +	u32 near = near_tile, far = far_tile;
>> +	u32 idx = 0;
>> +	int i;
>> +
>> +	if (have_dev) {
>> +		near = (near << 1) | near_dev;
>> +		far = (far << 1) | far_dev;
>> +	}
>> +
>> +	if (far > near) {
>> +		for (i = near; i > 0; i--)
>> +			idx += max_inst - i;
>> +		idx += (far - 1 - near);
>> +		idx *= 2;
>> +		idx += type;
>> +		return idx;
>> +	}
>> +
>> +	if (far < near) {
>> +		for (i = far; i > 0; i--)
>> +			idx += max_inst - i;
>> +		idx += (near - 1 - far);
>> +		idx *= 2;
>> +		idx += (1 - type);
>> +		return idx;
>> +	}
>> +
>> +	return -1;
>> +}
>> +
>> +static int guc_g2g_register(struct xe_guc *near_guc, struct xe_gt *far_gt, u32 type, bool have_dev)
>> +{
>> +	struct xe_gt *near_gt = guc_to_gt(near_guc);
>> +	struct xe_device *xe = gt_to_xe(near_gt);
>> +	struct xe_bo *g2g_bo;
>> +	u32 near_tile = gt_to_tile(near_gt)->id;
>> +	u32 near_dev = G2G_DEV(near_gt);
>> +	u32 far_tile = gt_to_tile(far_gt)->id;
>> +	u32 far_dev = G2G_DEV(far_gt);
>> +	u32 max = xe->info.gt_count;
>> +	u32 base, desc, buf;
>> +	int slot;
>> +
>> +	xe_assert(xe, xe == gt_to_xe(far_gt));
>> +
>> +	g2g_bo = near_guc->g2g.bo;
>> +	xe_assert(xe, g2g_bo);
>> +
>> +	slot = g2g_slot(near_tile, near_dev, far_tile, far_dev, type, max, have_dev);
>> +	xe_assert(xe, slot >= 0);
>> +
>> +	base = guc_bo_ggtt_addr(near_guc, g2g_bo);
>> +	desc = base + slot * G2G_DESC_SIZE;
>> +	buf = base + slot * G2G_BUFFER_SIZE + G2G_DESC_AREA_SIZE;
>> +
>> +	xe_assert(xe, (desc - base + G2G_DESC_SIZE) <= G2G_DESC_AREA_SIZE);
>> +	xe_assert(xe, (buf - base + G2G_BUFFER_SIZE) <= g2g_bo->size);
>> +
>> +	return guc_action_register_g2g_buffer(near_guc, type, far_tile, far_dev,
>> +					      desc, buf, G2G_BUFFER_SIZE);
>> +}
>> +
>> +static void guc_g2g_deregister(struct xe_guc *guc, u32 far_tile, u32 far_dev, u32 type)
>> +{
>> +	guc_action_deregister_g2g_buffer(guc, type, far_tile, far_dev);
>> +}
>> +
>> +static u32 guc_g2g_size(struct xe_guc *guc)
>> +{
>> +	struct xe_gt *gt = guc_to_gt(guc);
>> +	struct xe_device *xe = gt_to_xe(gt);
>> +	unsigned int count = xe->info.gt_count;
>> +	u32 num_channels = (count * (count - 1)) / 2;
>> +
>> +	return num_channels * XE_G2G_TYPE_LIMIT * G2G_BUFFER_SIZE + G2G_DESC_AREA_SIZE;
>> +}
>> +
>> +static bool xe_guc_g2g_wanted(struct xe_device *xe)
>> +{
>> +	/* Can't do GuC to GuC communication if there is only one GuC */
>> +	if (xe->info.gt_count <= 1)
>> +		return false;
>> +
>> +	/* No current user */
>> +	return false;
>> +}
>> +
>> +static int guc_g2g_alloc(struct xe_guc *guc)
>> +{
>> +	struct xe_gt *gt = guc_to_gt(guc);
>> +	struct xe_device *xe = gt_to_xe(gt);
>> +	struct xe_tile *tile = gt_to_tile(gt);
>> +	struct xe_bo *bo;
>> +	u32 g2g_size;
>> +
>> +	if (guc->g2g.bo)
>> +		return 0;
>> +
>> +	if (gt->info.id != 0) {
>> +		struct xe_gt *root_gt = xe_device_get_gt(xe, 0);
>> +		struct xe_guc *root_guc = &root_gt->uc.guc;
>> +		struct xe_bo *bo;
>> +
>> +		bo = xe_bo_get(root_guc->g2g.bo);
>> +		if (IS_ERR(bo))
>> +			return PTR_ERR(bo);
> A few things.
>
> - xe_bo_get returns BO or NULL.
Meaning just use 'if (bo)' rather than the IS_ERR() wrapper?

> - Checking the return seems overkill
It does seem wrong to not return an error if there is no allocation,
though. E.g. just change it to 'return -ENODEV' or some such if the BO
is NULL?
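I.e. something along these lines (just a rough sketch of what I mean,
untested):

		/* xe_bo_get() returns the BO or NULL, no IS_ERR() check needed */
		bo = xe_bo_get(root_guc->g2g.bo);
		if (!bo)
			return -ENODEV;

		guc->g2g.bo = bo;
		guc->g2g.owned = false;
		return 0;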

> - I don't see an associated xe_bo_put
> - Do we even need a reference? i.e. The clean up is handled by DRM managed on driver unload.
Technically, the get is not required at present, so long as the
secondary GTs can all assume that the root GT is guaranteed to be
holding its reference at all times. It seems cleaner (and more
future-proof) to not make such assumptions, though, and to just have
each GT look after its own usages and requirements.

And yes, there is currently no put because everything just magically
vanishes on driver unload (same as for a whole bunch of other
driver-internal BOs). The selftest does require dynamically releasing
and re-allocating the shared BO in various different configurations,
though (including different per-GT splits, meaning the root GT isn't
necessarily the owner of the allocation and might not even have one).
That code isn't included in this patch set at present for a bunch of
'not ready' reasons.
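If/when an explicit teardown does become necessary (e.g. for the
selftest mentioned above), I would expect it to end up looking roughly
like the below. This is purely a hypothetical sketch, not part of this
patch, and the function name is made up:

static void guc_g2g_fini(struct xe_guc *guc)
{
	if (!guc->g2g.bo)
		return;

	/*
	 * Clean up of the owned BO is handled by the managed allocation,
	 * so non-owning GTs only drop the extra reference they took on
	 * the root GT's BO.
	 */
	if (!guc->g2g.owned)
		xe_bo_put(guc->g2g.bo);

	guc->g2g.bo = NULL;
}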

>
>> +
>> +		guc->g2g.bo = bo;
>> +		guc->g2g.owned = false;
>> +		return 0;
>> +	}
>> +
>> +	g2g_size = guc_g2g_size(guc);
>> +	bo = xe_managed_bo_create_pin_map(xe, tile, g2g_size,
>> +					  XE_BO_FLAG_VRAM_IF_DGFX(tile) |
>> +					  XE_BO_FLAG_GGTT |
>> +					  XE_BO_FLAG_GGTT_ALL |
>> +					  XE_BO_FLAG_GGTT_INVALIDATE);
>> +	if (IS_ERR(bo))
>> +		return PTR_ERR(bo);
>> +
>> +	xe_map_memset(xe, &bo->vmap, 0, 0, g2g_size);
>> +	guc->g2g.bo = bo;
>> +	guc->g2g.owned = true;
>> +
>> +	return 0;
>> +}
>> +
>> +static int guc_g2g_start(struct xe_guc *guc)
>> +{
>> +	struct xe_gt *far_gt, *gt = guc_to_gt(guc);
>> +	struct xe_device *xe = gt_to_xe(gt);
>> +	unsigned int i, j;
>> +	int t, err;
>> +	bool have_dev;
>> +
>> +	if (!guc->g2g.bo) {
>> +		int ret;
>> +
>> +		ret = guc_g2g_alloc(guc);
>> +		if (ret)
>> +			return ret;
>> +	}
>> +
>> +	/* GuC interface will need extending if more GT device types are ever created. */
>> +	xe_gt_assert(gt, (gt->info.type == XE_GT_TYPE_MAIN) || (gt->info.type == XE_GT_TYPE_MEDIA));
>> +
>> +	/* Channel numbering depends on whether there are multiple GTs per tile */
>> +	have_dev = xe->info.gt_count > xe->info.tile_count;
>> +
>> +	for_each_gt(far_gt, xe, i) {
>> +		u32 far_tile, far_dev;
>> +
>> +		if (far_gt->info.id == gt->info.id)
>> +			continue;
>> +
>> +		far_tile = gt_to_tile(far_gt)->id;
>> +		far_dev = G2G_DEV(far_gt);
>> +
>> +		for (t = 0; t < XE_G2G_TYPE_LIMIT; t++) {
>> +			err = guc_g2g_register(guc, far_gt, t, have_dev);
>> +			if (err) {
>> +				while (--t >= 0)
>> +					guc_g2g_deregister(guc, far_tile, far_dev, t);
>> +				goto err_deregister;
>> +			}
>> +		}
>> +	}
>> +
>> +	return 0;
>> +
>> +err_deregister:
>> +	for_each_gt(far_gt, xe, j) {
>> +		u32 tile, dev;
>> +
>> +		if (far_gt->info.id == gt->info.id)
>> +			continue;
>> +
>> +		if (j >= i)
>> +			break;
>> +
>> +		tile = gt_to_tile(far_gt)->id;
>> +		dev = G2G_DEV(far_gt);
>> +
>> +		for (t = 0; t < XE_G2G_TYPE_LIMIT; t++)
>> +			guc_g2g_deregister(guc, tile, dev, t);
>> +	}
>> +
>> +	return err;
>> +}
>> +
>>   static void guc_fini_hw(void *arg)
>>   {
>>   	struct xe_guc *guc = arg;
>> @@ -423,7 +688,16 @@ int xe_guc_init_post_hwconfig(struct xe_guc *guc)
>>   
>>   int xe_guc_post_load_init(struct xe_guc *guc)
>>   {
>> +	int ret;
>> +
>>   	xe_guc_ads_populate_post_load(&guc->ads);
>> +
>> +	if (xe_guc_g2g_wanted(guc_to_xe(guc))) {
>> +		ret = guc_g2g_start(guc);
>> +		if (ret)
>> +			return ret;
>> +	}
>> +
>>   	guc->submission_state.enabled = true;
>>   
>>   	return 0;
>> diff --git a/drivers/gpu/drm/xe/xe_guc_types.h b/drivers/gpu/drm/xe/xe_guc_types.h
>> index fa75f57bf5da..83a41ebcdc91 100644
>> --- a/drivers/gpu/drm/xe/xe_guc_types.h
>> +++ b/drivers/gpu/drm/xe/xe_guc_types.h
>> @@ -64,6 +64,15 @@ struct xe_guc {
>>   	struct xe_guc_pc pc;
>>   	/** @dbm: GuC Doorbell Manager */
>>   	struct xe_guc_db_mgr dbm;
>> +
>> +	/** @g2g: GuC to GuC communication state */
>> +	struct {
>> +		/** @g2g.bo: Storage for GuC to GuC communication channels */
>> +		struct xe_bo *bo;
>> +		/** @g2g.owned: Is the BO owned by this GT or just mapped in */
>> +		bool owned;
>> +	} g2g;
>> +
>>   	/** @submission_state: GuC submission state */
>>   	struct {
>>   		/** @submission_state.idm: GuC context ID Manager */
>> @@ -79,6 +88,7 @@ struct xe_guc {
>>   		/** @submission_state.fini_wq: submit fini wait queue */
>>   		wait_queue_head_t fini_wq;
>>   	} submission_state;
>> +
> Unrelated.
Sort of. If the coding style is to have blank lines between sub-structs,
then adding the g2g struct ahead of submission_state gives
submission_state a blank line above but not below. That is wrong on two
counts: it is mismatched start vs end, and it does not follow the style
guide; hence the extra blank line. On the other hand, if blank lines
should not be included at all, then the lines before/after the g2g
struct should be dropped instead.

John.

>
> Matt
>
>>   	/** @hwconfig: Hardware config state */
>>   	struct {
>>   		/** @hwconfig.bo: buffer object of the hardware config */
>> -- 
>> 2.47.0
>>


