[Intel-xe] [PATCH 2/2] drm/xe: Add support for CCS engine fusing

Wed Mar 8 01:03:35 UTC 2023

On Mon, Mar 06, 2023 at 02:36:51PM -0800, Matt Roper wrote:
>For Xe_HP platforms that can have multiple CCS engines, the
>presence/absence of each CCS is inferred by the presence/absence of any
>DSS in the corresponding quadrant of the GT's DSS mask.
>
>This handling is only needed on platforms that can have more than one
>CCS.  The CCS is never fused off on platforms like MTL that can only
>have one.
>
>Signed-off-by: Matt Roper <matthew.d.roper at intel.com>
>---
> drivers/gpu/drm/xe/xe_gt_topology.c | 47 +++++++++++++++++++++++------
> drivers/gpu/drm/xe/xe_gt_topology.h |  3 ++
> drivers/gpu/drm/xe/xe_hw_engine.c   | 29 +++++++++++++++++-
> 3 files changed, 68 insertions(+), 11 deletions(-)
>
>diff --git a/drivers/gpu/drm/xe/xe_gt_topology.c b/drivers/gpu/drm/xe/xe_gt_topology.c
>index 2123f84be336..9f1930381d52 100644
>--- a/drivers/gpu/drm/xe/xe_gt_topology.c
>+++ b/drivers/gpu/drm/xe/xe_gt_topology.c
>@@ -62,6 +62,21 @@ load_eu_mask(struct xe_gt *gt, xe_eu_mask_t mask)
> 	bitmap_from_arr32(mask, &val, XE_MAX_EU_FUSE_BITS);
> }
>
>+static void
>+get_num_dss_regs(struct xe_device *xe, int *geometry_regs, int *compute_regs)
>+{
>+	if (GRAPHICS_VERx100(xe) == 1260) {
>+		*geometry_regs = 0;
>+		*compute_regs = 2;
>+	} else if (GRAPHICS_VERx100(xe) >= 1250) {
>+		*geometry_regs = 1;
>+		*compute_regs = 1;
>+	} else {
>+		*geometry_regs = 1;
>+		*compute_regs = 0;
>+	}
>+}
>+
> void
> xe_gt_topology_init(struct xe_gt *gt)
> {
>@@ -69,16 +84,7 @@ xe_gt_topology_init(struct xe_gt *gt)
> 	struct drm_printer p = drm_debug_printer("GT topology");
> 	int num_geometry_regs, num_compute_regs;
>
>-	if (GRAPHICS_VERx100(xe) == 1260) {
>-		num_geometry_regs = 0;
>-		num_compute_regs = 2;
>-	} else if (GRAPHICS_VERx100(xe) >= 1250) {
>-		num_geometry_regs = 1;
>-		num_compute_regs = 1;
>-	} else {
>-		num_geometry_regs = 1;
>-		num_compute_regs = 0;
>-	}
>+	get_num_dss_regs(xe, &num_geometry_regs, &num_compute_regs);
>
> 	load_dss_mask(gt, gt->fuse_topo.g_dss_mask, num_geometry_regs,
> 		      XELP_GT_GEOMETRY_DSS_ENABLE.reg);

particularly now that it is in a separate function, it seems
dangerous to keep it like this as we the number of registers dictates
the stack being accesed.

can we add a warn here if num_geometry_regs > 1
and in the line below if num_compute_regs > 2?

>@@ -113,3 +119,24 @@ xe_dss_mask_group_ffs(xe_dss_mask_t mask, int groupsize, int groupnum)
> {
> 	return find_next_bit(mask, XE_MAX_DSS_FUSE_BITS, groupnum * groupsize);
> }
>+
>+/*
>+ * Determine whether the given quadrant has any DSS that are not fused off.
>+ * Used to infer CCS engine availability on Xe_HP.

please turn this into a kernel doc and add a sentence explaining what a
quadrant it.

>+ */
>+bool xe_gt_topology_has_dss_in_quadrant(struct xe_gt *gt, int quad)
>+{
>+	struct xe_device *xe = gt_to_xe(gt);
>+	xe_dss_mask_t all_dss;
>+	int g_dss_regs, c_dss_regs, max_dss, quad_first;
>+
>+	bitmap_or(all_dss, gt->fuse_topo.g_dss_mask, gt->fuse_topo.c_dss_mask,
>+		  XE_MAX_DSS_FUSE_BITS);
>+
>+	get_num_dss_regs(xe, &g_dss_regs, &c_dss_regs);
>+	max_dss = 32 * max(g_dss_regs, c_dss_regs);

instead of max_dss, maybe:

max_quads =  32 * max(g_dss_regs, c_dss_regs) / 4;?

>+
>+	quad_first = xe_dss_mask_group_ffs(all_dss, max_dss / 4, quad);
>+
>+	return quad_first < (quad + 1) * max_dss / 4;
>+}
>diff --git a/drivers/gpu/drm/xe/xe_gt_topology.h b/drivers/gpu/drm/xe/xe_gt_topology.h
>index b2540dc266f2..f47ab1b1269c 100644
>--- a/drivers/gpu/drm/xe/xe_gt_topology.h
>+++ b/drivers/gpu/drm/xe/xe_gt_topology.h
>@@ -17,4 +17,7 @@ void xe_gt_topology_dump(struct xe_gt *gt, struct drm_printer *p);
> unsigned int
> xe_dss_mask_group_ffs(xe_dss_mask_t mask, int groupsize, int groupnum);
>
>+bool
>+xe_gt_topology_has_dss_in_quadrant(struct xe_gt *gt, int quad);
>+
> #endif /* _XE_GT_TOPOLOGY_H_ */
>diff --git a/drivers/gpu/drm/xe/xe_hw_engine.c b/drivers/gpu/drm/xe/xe_hw_engine.c
>index ac00693ba0b2..f72f2e3b8c9a 100644
>--- a/drivers/gpu/drm/xe/xe_hw_engine.c
>+++ b/drivers/gpu/drm/xe/xe_hw_engine.c
>@@ -436,13 +436,40 @@ static void read_copy_fuses(struct xe_gt *gt)
> 	}
> }
>
>+static void read_compute_fuses(struct xe_gt *gt)
>+{
>+	struct xe_device *xe = gt_to_xe(gt);
>+
>+	/*
>+	 * CCS fusing based on DSS masks only applies to platforms that can
>+	 * have more than one CCS.
>+	 */
>+	if (hweight32(gt->info.engine_mask &
>+		      GENMASK(XE_HW_ENGINE_CCS3, XE_HW_ENGINE_CCS0)) <= 1)

I don't think spreading the engine numbers will scale well.
I added XE_HW_ENGINE_CCS_MASK and such in

https://lore.kernel.org/intel-xe/20230307092446.1263266-7-lucas.demarchi@intel.com/T/#u

Should I split that out of that series so it can be used here?

Also info.engine_mask is u64, but here you are using
hweight32? I think the right thing would be to use hweight64 and
GENMASK_ULL() like in the patch mentioned above.

Lucas De Marchi

>+		return;
>+
>+	/*
>+	 * CCS availability on Xe_HP is inferred from the presence of DSS in
>+	 * each quadrant.
>+	 */
>+	for (int i = XE_HW_ENGINE_CCS0, j = 0; i <= XE_HW_ENGINE_CCS3; ++i, ++j) {
>+		if (!(gt->info.engine_mask & BIT(i)))
>+			continue;
>+
>+		if (!xe_gt_topology_has_dss_in_quadrant(gt, j)) {
>+			gt->info.engine_mask &= ~BIT(i);
>+			drm_info(&xe->drm, "ccs%u fused off\n", j);
>+		}
>+	}
>+}
>+
> int xe_hw_engines_init_early(struct xe_gt *gt)
> {
> 	int i;
>
> 	read_media_fuses(gt);
> 	read_copy_fuses(gt);
>-	/* TODO: compute engines */
>+	read_compute_fuses(gt);
>
> 	for (i = 0; i < ARRAY_SIZE(gt->hw_engines); i++)
> 		hw_engine_init_early(gt, &gt->hw_engines[i], i);
>-- 
>2.39.2
>