[Intel-xe] [RFC v2 1/2] drm/xe: Enable Fixed CCS mode setting
Niranjana Vishwanathapura
niranjana.vishwanathapura at intel.com
Fri Nov 3 04:59:20 UTC 2023
On Thu, Nov 02, 2023 at 02:40:02PM -0700, Matt Roper wrote:
>On Wed, Nov 01, 2023 at 04:57:48PM -0700, Niranjana Vishwanathapura wrote:
>> Disable dynamic HW load balancing of compute resource assignment
>> to engines and instead enabled fixed mode of mapping compute
>> resources to engines.
>>
>> Allow user to configure this CCS mode setting through a 'ccs_mode'
>> sysfs entry.
>>
>> Signed-off-by: Niranjana Vishwanathapura <niranjana.vishwanathapura at intel.com>
>> ---
>> drivers/gpu/drm/xe/Makefile | 1 +
>> drivers/gpu/drm/xe/regs/xe_gt_regs.h | 14 +++++
>> drivers/gpu/drm/xe/xe_gt.c | 27 ++++++++--
>> drivers/gpu/drm/xe/xe_gt.h | 2 +
>> drivers/gpu/drm/xe/xe_gt_ccs_mode.c | 79 ++++++++++++++++++++++++++++
>> drivers/gpu/drm/xe/xe_gt_ccs_mode.h | 22 ++++++++
>> drivers/gpu/drm/xe/xe_gt_types.h | 16 ++++++
>> drivers/gpu/drm/xe/xe_guc_ads.c | 3 ++
>> drivers/gpu/drm/xe/xe_hw_engine.c | 14 +++++
>> drivers/gpu/drm/xe/xe_tile_sysfs.c | 54 +++++++++++++++++++
>> 10 files changed, 227 insertions(+), 5 deletions(-)
>> create mode 100644 drivers/gpu/drm/xe/xe_gt_ccs_mode.c
>> create mode 100644 drivers/gpu/drm/xe/xe_gt_ccs_mode.h
>>
>> diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile
>> index 776c31e73f29..a335d8762741 100644
>> --- a/drivers/gpu/drm/xe/Makefile
>> +++ b/drivers/gpu/drm/xe/Makefile
>> @@ -58,6 +58,7 @@ xe-y += xe_bb.o \
>> xe_force_wake.o \
>> xe_ggtt.o \
>> xe_gt.o \
>> + xe_gt_ccs_mode.o \
>> xe_gt_clock.o \
>> xe_gt_debugfs.o \
>> xe_gt_idle_sysfs.o \
>> diff --git a/drivers/gpu/drm/xe/regs/xe_gt_regs.h b/drivers/gpu/drm/xe/regs/xe_gt_regs.h
>> index 7a6407e38265..b15aca465fc9 100644
>> --- a/drivers/gpu/drm/xe/regs/xe_gt_regs.h
>> +++ b/drivers/gpu/drm/xe/regs/xe_gt_regs.h
>> @@ -388,8 +388,22 @@
>> #define COMP_CKN_IN REG_GENMASK(30, 29)
>>
>> #define RCU_MODE XE_REG(0x14800, XE_REG_OPTION_MASKED)
>> +#define RCU_MODE_FIXED_SLICE_CCS_MODE REG_BIT(1)
>> #define RCU_MODE_CCS_ENABLE REG_BIT(0)
>>
>> +/*
>> + * Total of 4 cslices, where each cslice is in the form:
>> + * [0-3] CCS ID
>> + * [4-6] RSVD
>> + * [7] Disabled
>> + */
>> +#define CCS_MODE XE_REG(0x14804)
>> +#define CCS_MODE_CSLICE_0_3_MASK REG_GENMASK(11, 0) /* 3 bits per cslice */
>> +#define CCS_MODE_CSLICE_MASK 0x7 /* CCS0-3 + rsvd */
>> +#define CCS_MODE_CSLICE_WIDTH ilog2(CCS_MODE_CSLICE_MASK + 1)
>> +#define CCS_MODE_CSLICE(cslice, ccs) \
>> + (ccs << (cslice * CCS_MODE_CSLICE_WIDTH))
>> +
>> #define FORCEWAKE_ACK_GT XE_REG(0x130044)
>> #define FORCEWAKE_KERNEL BIT(0)
>> #define FORCEWAKE_USER BIT(1)
>> diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c
>> index d380f67b3365..5f3e2d3d63c3 100644
>> --- a/drivers/gpu/drm/xe/xe_gt.c
>> +++ b/drivers/gpu/drm/xe/xe_gt.c
>> @@ -22,6 +22,7 @@
>> #include "xe_force_wake.h"
>> #include "xe_ggtt.h"
>> #include "xe_gt_clock.h"
>> +#include "xe_gt_ccs_mode.h"
>
>Nitpick: We usually try to keep the #include's sorted, so "ccs" should
>come before "clock." Some of the other #include's farther down this
>patch also need to be moved too.
Thanks Matt for the review.
Yah, I tried to keep #include's sorted, but looks like I missed at couple
locations, will fix.
>
>> #include "xe_gt_idle_sysfs.h"
>> #include "xe_gt_mcr.h"
>> #include "xe_gt_pagefault.h"
>> @@ -78,6 +79,7 @@ static void gt_fini(struct drm_device *drm, void *arg)
>> int i;
>>
>> destroy_workqueue(gt->ordered_wq);
>> + mutex_destroy(>->ccs.mutex);
>>
>> for (i = 0; i < XE_ENGINE_CLASS_MAX; ++i)
>> xe_hw_fence_irq_finish(>->fence_irq[i]);
>> @@ -447,6 +449,12 @@ static int all_fw_domain_init(struct xe_gt *gt)
>> if (err)
>> goto err_force_wake;
>>
>> + /* Configure default CCS mode of 1 engine with all resources */
>> + if (xe_gt_needs_ccs_mode(gt)) {
>
>If we're making this the new driver default, that's a pretty big
>behavior change. That's probably fine right now since Xe is still under
>development, but you should probably get ack's on this change from the
>relevant UMD devs just to make sure everyone is on the same page here.
>It might also be good to put some justification in the commit message
>too about why we think this is the best default.
>
Yah, compute UMD is fine with it. Sure will do.
>> + gt->ccs.num_engines = 1;
>> + xe_gt_apply_ccs_mode(gt);
>> + }
>> +
>> err = xe_force_wake_put(gt_to_fw(gt), XE_FORCEWAKE_ALL);
>> XE_WARN_ON(err);
>> xe_device_mem_access_put(gt_to_xe(gt));
>> @@ -468,6 +476,7 @@ int xe_gt_init(struct xe_gt *gt)
>> int err;
>> int i;
>>
>> + mutex_init(>->ccs.mutex);
>> INIT_WORK(>->reset.worker, gt_reset_worker);
>>
>> for (i = 0; i < XE_ENGINE_CLASS_MAX; ++i) {
>> @@ -477,11 +486,11 @@ int xe_gt_init(struct xe_gt *gt)
>>
>> err = xe_gt_tlb_invalidation_init(gt);
>> if (err)
>> - return err;
>> + goto err_ccs_mode;
>>
>> err = xe_gt_pagefault_init(gt);
>> if (err)
>> - return err;
>> + goto err_ccs_mode;
>>
>> xe_mocs_init_early(gt);
>>
>> @@ -489,19 +498,24 @@ int xe_gt_init(struct xe_gt *gt)
>>
>> err = gt_fw_domain_init(gt);
>> if (err)
>> - return err;
>> + goto err_ccs_mode;
>>
>> xe_force_wake_init_engines(gt, gt_to_fw(gt));
>>
>> err = all_fw_domain_init(gt);
>> if (err)
>> - return err;
>> + goto err_ccs_mode;
>>
>> err = drmm_add_action_or_reset(>_to_xe(gt)->drm, gt_fini, gt);
>> if (err)
>> - return err;
>> + goto err_ccs_mode;
>>
>> return 0;
>> +
>> +err_ccs_mode:
>> + mutex_destroy(>->ccs.mutex);
>> +
>> + return err;
>> }
>>
>> static int do_gt_reset(struct xe_gt *gt)
>> @@ -549,6 +563,9 @@ static int do_gt_restart(struct xe_gt *gt)
>> xe_reg_sr_apply_whitelist(hwe);
>> }
>>
>> + /* Get CCS mode in sync between sw/hw */
>> + xe_gt_apply_ccs_mode(gt);
>> +
>> return 0;
>> }
>>
>> diff --git a/drivers/gpu/drm/xe/xe_gt.h b/drivers/gpu/drm/xe/xe_gt.h
>> index caded203a8a0..1c2a1d35a2c1 100644
>> --- a/drivers/gpu/drm/xe/xe_gt.h
>> +++ b/drivers/gpu/drm/xe/xe_gt.h
>> @@ -17,6 +17,8 @@
>> for_each_if(((hwe__) = (gt__)->hw_engines + (id__)) && \
>> xe_hw_engine_is_valid((hwe__)))
>>
>> +#define CCS_MASK(gt) ((gt->info.engine_mask & XE_HW_ENGINE_CCS_MASK) >> XE_HW_ENGINE_CCS0)
>> +
>> #ifdef CONFIG_FAULT_INJECTION
>> extern struct fault_attr gt_reset_failure;
>> static inline bool xe_fault_inject_gt_reset(void)
>> diff --git a/drivers/gpu/drm/xe/xe_gt_ccs_mode.c b/drivers/gpu/drm/xe/xe_gt_ccs_mode.c
>> new file mode 100644
>> index 000000000000..cec552e823b6
>> --- /dev/null
>> +++ b/drivers/gpu/drm/xe/xe_gt_ccs_mode.c
>> @@ -0,0 +1,79 @@
>> +// SPDX-License-Identifier: MIT
>> +/*
>> + * Copyright © 2023 Intel Corporation
>> + */
>> +
>> +#include "xe_assert.h"
>> +#include "xe_gt.h"
>> +#include "xe_mmio.h"
>> +#include "xe_gt_ccs_mode.h"
>> +#include "regs/xe_gt_regs.h"
>> +
>> +static void __xe_gt_apply_ccs_mode(struct xe_gt *gt, u32 num_engines)
>> +{
>> + u32 mode = CCS_MODE_CSLICE_0_3_MASK; /* disable all by default */
>> + struct xe_device *xe = gt_to_xe(gt);
>> + struct xe_hw_engine *hwe;
>> + int num_slices = hweight32(CCS_MASK(gt));
>> + int width, cslice;
>> + enum xe_hw_engine_id id;
>> + u32 config = 0;
>> +
>> + lockdep_assert_held(>->ccs.mutex);
>> + xe_assert(xe, GRAPHICS_VERx100(gt_to_xe(gt)) <= 1260);
>
>Why a <= here when the other checks elsewhere are ==? Actually, if we
>think we might extend this to other platforms in the future, it's
>probably best to just pull out the version matching to a dedicated
>function now so that when new platforms need this in the future we can
>just make a single change in one place, rather than hunting down all the
>1260 checks in the code.
>
Yah, and that one place is the function xe_gt_needs_ccs_mode().
Ok, will call this function in this assert.
>> +
>> + xe_assert(xe, num_engines && num_engines <= num_slices);
>> + xe_assert(xe, !(num_slices % num_engines));
>> +
>> + /*
>> + * Loop over all available slices and assign each a user engine.
>> + *
>> + * With 1 engine (ccs0):
>> + * slice 0, 1, 2, 3: ccs0
>> + *
>> + * With 2 engines (ccs0, ccs1):
>> + * slice 0, 2: ccs0
>> + * slice 1, 3: ccs1
>> + *
>> + * With 4 engines (ccs0, ccs1, ccs2, ccs3):
>> + * slice 0: ccs0
>> + * slice 1: ccs1
>> + * slice 2: ccs2
>> + * slice 3: ccs3
>> + */
>> + for (width = num_slices / num_engines, cslice = 0; width--;) {
>> + for_each_hw_engine(hwe, gt, id) {
>> + if (hwe->class != XE_ENGINE_CLASS_COMPUTE)
>> + continue;
>> +
>> + if (hwe->logical_instance >= num_engines)
>> + break;
>> +
>> + config |= BIT(hwe->instance) << XE_HW_ENGINE_CCS0;
>> +
>> + /* If a slice is fused off, leave disabled */
>> + while ((CCS_MASK(gt) & BIT(cslice)) == 0)
>> + cslice++;
>> +
>> + mode &= ~CCS_MODE_CSLICE(cslice, CCS_MODE_CSLICE_MASK);
>> + mode |= CCS_MODE_CSLICE(cslice, hwe->instance);
>> + cslice++;
>> + }
>> + }
>> +
>> + xe_mmio_write32(gt, CCS_MODE, mode);
>> +
>> + drm_info(&xe->drm,
>> + "tile-%d CCS_MODE=%x config:%08x, num_engines:%d, num_slices:%d\n",
>> + gt_to_tile(gt)->id, mode, config, num_engines, num_slices);
>
>You probably want to use xe_gt_info() here so that it will print out the
>GT identification automatically in a standard form. Then you can drop
>the tile ID part of the message (which could become confusing anyway if
>we ever have multiple GTs with CCS engines within the same tile).
>
Yah, makes sense, will fix.
>
>> +}
>> +
>> +void xe_gt_apply_ccs_mode(struct xe_gt *gt)
>> +{
>> + mutex_lock(>->ccs.mutex);
>
>What exactly is ccs.mutex protecting? From a quick glance it doesn't
>look like racing calls to this function without the lock would be any
>more problematic than racing calls with the lock? And this only gets
>called during init / reset so is it even possible to have a race?
>
>> +
>> + if (gt->ccs.num_engines)
>> + __xe_gt_apply_ccs_mode(gt, gt->ccs.num_engines);
>> +
>> + mutex_unlock(>->ccs.mutex);
>> +}
>> diff --git a/drivers/gpu/drm/xe/xe_gt_ccs_mode.h b/drivers/gpu/drm/xe/xe_gt_ccs_mode.h
>> new file mode 100644
>> index 000000000000..793f3bbb4937
>> --- /dev/null
>> +++ b/drivers/gpu/drm/xe/xe_gt_ccs_mode.h
>> @@ -0,0 +1,22 @@
>> +/* SPDX-License-Identifier: MIT */
>> +/*
>> + * Copyright © 2023 Intel Corporation
>> + */
>> +
>> +#ifndef _XE_GT_CCS_MODE_H_
>> +#define _XE_GT_CCS_MODE_H_
>> +
>> +#include "xe_device_types.h"
>> +#include "xe_gt.h"
>> +#include "xe_gt_types.h"
>> +#include "xe_platform_types.h"
>> +
>> +void xe_gt_apply_ccs_mode(struct xe_gt *gt);
>> +
>> +static inline bool xe_gt_needs_ccs_mode(const struct xe_gt *gt)
>> +{
>> + return GRAPHICS_VERx100(gt_to_xe(gt)) == 1260 && CCS_MASK(gt);
>> +}
>> +
>> +#endif
>> +
>> diff --git a/drivers/gpu/drm/xe/xe_gt_types.h b/drivers/gpu/drm/xe/xe_gt_types.h
>> index d3f2793684e2..5342aff9dd68 100644
>> --- a/drivers/gpu/drm/xe/xe_gt_types.h
>> +++ b/drivers/gpu/drm/xe/xe_gt_types.h
>> @@ -185,6 +185,22 @@ struct xe_gt {
>> spinlock_t lock;
>> } tlb_invalidation;
>>
>> + /**
>> + * @ccs: Fixed mapping between CCS engines and compute slices.
>> + * Through the per-tile 'ccs_mode' sysfs interface, the user can specify a
>> + * fixed number of compute hardware engines to which the available compute
>> + * slices are to be allocated. By default all compute slices are allocated
>> + * to the first available compute engine instance. This user configuration
>> + * change triggers a tile reset and it is expected that user will ensure
>> + * the tile is idle while doing so.
>> + */
>> + struct {
>> + /** @mutex: Serialize CCS mode access */
>> + struct mutex mutex;
>> + /** @num_engines: Number of CCS engines enabled */
>> + u32 num_engines;
>> + } ccs;
>> +
>> /** @usm: unified shared memory state */
>> struct {
>> /**
>> diff --git a/drivers/gpu/drm/xe/xe_guc_ads.c b/drivers/gpu/drm/xe/xe_guc_ads.c
>> index 88789826e781..74b550be0e36 100644
>> --- a/drivers/gpu/drm/xe/xe_guc_ads.c
>> +++ b/drivers/gpu/drm/xe/xe_guc_ads.c
>> @@ -8,6 +8,7 @@
>> #include <drm/drm_managed.h>
>>
>> #include "regs/xe_engine_regs.h"
>> +#include "xe_gt_ccs_mode.h"
>> #include "regs/xe_gt_regs.h"
>> #include "regs/xe_guc_regs.h"
>> #include "xe_bo.h"
>> @@ -454,6 +455,8 @@ static unsigned int guc_mmio_regset_write(struct xe_guc_ads *ads,
>> { .reg = RING_HWS_PGA(hwe->mmio_base), },
>> { .reg = RING_IMR(hwe->mmio_base), },
>> { .reg = RCU_MODE, .skip = hwe != hwe_rcs_reset_domain },
>> + { .reg = CCS_MODE,
>> + .skip = hwe != hwe_rcs_reset_domain || !xe_gt_needs_ccs_mode(hwe->gt) },
>
>I don't think CCS_MODE exists on older (pre-xehp platforms). So I think
>this will cause either an invalid register (or possibly an unrelated
>register if there's something else at that offset) to get saved/restored
>on older platforms like TGL, ADL, etc.
>
yes, that is why we have '!xe_gt_needs_ccs_mode()' call here. ie, if ccs_mode is not
needed (like in pre-xehpc platforms), this register will be skipped.
>> };
>> u32 i;
>>
>> diff --git a/drivers/gpu/drm/xe/xe_hw_engine.c b/drivers/gpu/drm/xe/xe_hw_engine.c
>> index b5b084590888..0058bbf8fd06 100644
>> --- a/drivers/gpu/drm/xe/xe_hw_engine.c
>> +++ b/drivers/gpu/drm/xe/xe_hw_engine.c
>> @@ -8,6 +8,7 @@
>> #include <drm/drm_managed.h>
>>
>> #include "regs/xe_engine_regs.h"
>> +#include "xe_gt_ccs_mode.h"
>> #include "regs/xe_gt_regs.h"
>> #include "regs/xe_regs.h"
>> #include "xe_assert.h"
>> @@ -283,6 +284,13 @@ void xe_hw_engine_enable_ring(struct xe_hw_engine *hwe)
>> hw_engine_mmio_read32(hwe, RING_MI_MODE(0));
>> }
>>
>> +static bool xe_hw_engine_match_fixed_cslice_mode(const struct xe_gt *gt,
>> + const struct xe_hw_engine *hwe)
>> +{
>> + return xe_gt_needs_ccs_mode(gt) &&
>> + xe_rtp_match_first_render_or_compute(gt, hwe);
>> +}
>> +
>> void
>> xe_hw_engine_setup_default_lrc_state(struct xe_hw_engine *hwe)
>> {
>> @@ -307,6 +315,12 @@ xe_hw_engine_setup_default_lrc_state(struct xe_hw_engine *hwe)
>> blit_cctl_val,
>> XE_RTP_ACTION_FLAG(ENGINE_BASE)))
>> },
>> + /* Use Fixed slice CCS mode */
>> + { XE_RTP_NAME("RCU_MODE_FIXED_SLICE_CCS_MODE"),
>> + XE_RTP_RULES(FUNC(xe_hw_engine_match_fixed_cslice_mode)),
>> + XE_RTP_ACTIONS(FIELD_SET(RCU_MODE, RCU_MODE_FIXED_SLICE_CCS_MODE,
>> + RCU_MODE_FIXED_SLICE_CCS_MODE))
>> + },
>> {}
>> };
>>
>> diff --git a/drivers/gpu/drm/xe/xe_tile_sysfs.c b/drivers/gpu/drm/xe/xe_tile_sysfs.c
>> index 16376607c68f..a0162744ce24 100644
>> --- a/drivers/gpu/drm/xe/xe_tile_sysfs.c
>> +++ b/drivers/gpu/drm/xe/xe_tile_sysfs.c
>
>Engines are a GT concept, not a tile concept, so you probably want this
>in the GT sysfs instead. Otherwise we're going to be stuck with a
>problematic ABI down the road if we have a platform with multiple
>compute-capable GTs within a single tile.
>
yah, makes sense, will fix.
>
>> @@ -7,8 +7,12 @@
>> #include <linux/sysfs.h>
>> #include <drm/drm_managed.h>
>>
>> +#include "xe_gt.h"
>> +#include "xe_gt_ccs_mode.h"
>> +#include "xe_mmio.h"
>> #include "xe_tile.h"
>> #include "xe_tile_sysfs.h"
>> +#include "regs/xe_gt_regs.h"
>>
>> static void xe_tile_sysfs_kobj_release(struct kobject *kobj)
>> {
>> @@ -34,6 +38,51 @@ static DEVICE_ATTR_RO(physical_vram_size_bytes);
>> static const struct attribute *physical_memsize_attr =
>> &dev_attr_physical_vram_size_bytes.attr;
>>
>> +static ssize_t
>> +ccs_mode_show(struct device *kdev,
>> + struct device_attribute *attr, char *buf)
>> +{
>> + struct xe_tile *tile = kobj_to_tile(&kdev->kobj);
>> +
>> + return sysfs_emit(buf, "%d\n", tile->primary_gt->ccs.num_engines);
>> +}
>> +
>> +static ssize_t
>> +ccs_mode_store(struct device *kdev, struct device_attribute *attr,
>> + const char *buff, size_t count)
>> +{
>> + struct xe_tile *tile = kobj_to_tile(&kdev->kobj);
>> + struct xe_device *xe = tile_to_xe(tile);
>> + struct xe_gt *gt = tile->primary_gt;
>> + u32 num_engines, num_slices;
>> + int ret;
>> +
>> + ret = kstrtou32(buff, 0, &num_engines);
>> + if (ret)
>> + return ret;
>> +
>> + /*
>> + * Ensure number of engines specified is valid and there is an
>> + * exact multiple of engines for slices.
>> + */
>> + num_slices = hweight32(CCS_MASK(gt));
>> + if (!num_engines || num_engines > num_slices || num_slices % num_engines) {
>> + drm_dbg(&xe->drm, "Invalid compute config for tile %d, %d engines %d slices\n",
>
>As earlier, we should probably use xe_gt_dbg() and drop the
>tile-specific number since the function will print the GT identification
>automatically. Same for the other drm_info (xe_gt_info) and drm_warn
>(xe_gt_warn) messages below.
>
Yah, makes sense, will fix.
>> + tile->id, num_engines, num_slices);
>> + return -EINVAL;
>> + }
>> +
>> + if (gt->ccs.num_engines != num_engines) {
>> + drm_info(&xe->drm, "tile-%d: setting compute mode to %d\n", tile->id, num_engines);
>> + gt->ccs.num_engines = num_engines;
>> + xe_gt_reset_async(gt);
>> + }
>> +
>> + return count;
>> +}
>> +
>> +static DEVICE_ATTR_RW(ccs_mode);
>> +
>> static void tile_sysfs_fini(struct drm_device *drm, void *arg)
>> {
>> struct xe_tile *tile = arg;
>> @@ -69,6 +118,11 @@ void xe_tile_sysfs_init(struct xe_tile *tile)
>
>We'll want to move this to xe_gt_sysfs_init as well.
>
Yah, will fix.
Niranjana
>
>Matt
>
>> drm_warn(&xe->drm,
>> "Sysfs creation to read addr_range per tile failed\n");
>>
>> + if (GRAPHICS_VERx100(xe) == 1260 &&
>> + sysfs_create_file(tile->sysfs, &dev_attr_ccs_mode.attr))
>> + drm_warn(&xe->drm,
>> + "Sysfs creation to set ccs_mode failed\n");
>> +
>> err = drmm_add_action_or_reset(&xe->drm, tile_sysfs_fini, tile);
>> if (err) {
>> drm_warn(&xe->drm, "%s: drmm_add_action_or_reset failed, err: %d\n",
>> --
>> 2.21.0.rc0.32.g243a4c7e27
>>
>
>--
>Matt Roper
>Graphics Software Engineer
>Linux GPU Platform Enablement
>Intel Corporation
More information about the Intel-xe
mailing list