[Intel-gfx] [PATCH 2/2] drm/i915: cleanup TLB invalidation code
Tvrtko Ursulin
tvrtko.ursulin at linux.intel.com
Fri Dec 9 12:04:28 UTC 2022
On 09/12/2022 11:33, Andrzej Hajda wrote:
>
>
> On 09.12.2022 11:16, Tvrtko Ursulin wrote:
>>
>> On 07/12/2022 17:36, Andrzej Hajda wrote:
>>> Whole register/bit selection logic has been moved to separate helper.
>>
>> Why is missing.
>
> ...to clean up mmio_invalidate_full function.
>
> Will add.
>
>>
>>> Signed-off-by: Andrzej Hajda <andrzej.hajda at intel.com>
>>> ---
>>> drivers/gpu/drm/i915/gt/intel_gt.c | 136 +++++++++++------------------
>>> 1 file changed, 51 insertions(+), 85 deletions(-)
>>
>> Diffstat suggests because more streamlined code. Any other reason?
>>
>>> diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c
>>> b/drivers/gpu/drm/i915/gt/intel_gt.c
>>> index f0224b607aa4a7..05520ec3264db8 100644
>>> --- a/drivers/gpu/drm/i915/gt/intel_gt.c
>>> +++ b/drivers/gpu/drm/i915/gt/intel_gt.c
>>> @@ -1003,32 +1003,59 @@ void intel_gt_info_print(const struct
>>> intel_gt_info *info,
>>> intel_sseu_dump(&info->sseu, p);
>>> }
>>> -struct reg_and_bit {
>>> +struct reg_and_bits {
>>> union {
>>> i915_reg_t reg;
>>> i915_mcr_reg_t mcr_reg;
>>> };
>>> - u32 bit;
>>> + u32 bits;
>>> };
>>> -static struct reg_and_bit
>>> -get_reg_and_bit(const struct intel_engine_cs *engine, const bool gen8,
>>> - const i915_reg_t *regs, const unsigned int num)
>>> +static struct reg_and_bits
>>> +get_tlb_inv_reg_and_bits(const struct intel_engine_cs *engine, bool
>>> write)
>>> {
>>> + static const i915_reg_t gen8_regs[MAX_ENGINE_CLASS + 1] = {
>>> + [RENDER_CLASS] = GEN8_RTCR,
>>> + [VIDEO_DECODE_CLASS] = GEN8_M1TCR,
>>> + [VIDEO_ENHANCEMENT_CLASS] = GEN8_VTCR,
>>> + [COPY_ENGINE_CLASS] = GEN8_BTCR,
>>> + };
>>> + static const i915_reg_t gen12_regs[MAX_ENGINE_CLASS + 1] = {
>>> + [RENDER_CLASS] = GEN12_GFX_TLB_INV_CR,
>>> + [VIDEO_DECODE_CLASS] = GEN12_VD_TLB_INV_CR,
>>> + [VIDEO_ENHANCEMENT_CLASS] = GEN12_VE_TLB_INV_CR,
>>> + [COPY_ENGINE_CLASS] = GEN12_BLT_TLB_INV_CR,
>>> + [COMPUTE_CLASS] = GEN12_COMPCTX_TLB_INV_CR,
>>> + };
>>> + static const i915_mcr_reg_t xehp_regs[MAX_ENGINE_CLASS + 1] = {
>>> + [RENDER_CLASS] = XEHP_GFX_TLB_INV_CR,
>>> + [VIDEO_DECODE_CLASS] = XEHP_VD_TLB_INV_CR,
>>> + [VIDEO_ENHANCEMENT_CLASS] = XEHP_VE_TLB_INV_CR,
>>> + [COPY_ENGINE_CLASS] = XEHP_BLT_TLB_INV_CR,
>>> + [COMPUTE_CLASS] = XEHP_COMPCTX_TLB_INV_CR,
>>> + };
>>> const unsigned int class = engine->class;
>>> - struct reg_and_bit rb = { };
>>> + struct reg_and_bits rb = { .bits = BIT(engine->instance) };
>>> - if (drm_WARN_ON_ONCE(&engine->i915->drm,
>>> - class >= num || !regs[class].reg))
>>> + if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
>>> + rb.mcr_reg = xehp_regs[class];
>>> + else if (GRAPHICS_VER(engine->i915) >= 12)
>>> + rb.reg = gen12_regs[class];
>>> + else if (GRAPHICS_VER(engine->i915) >= 8)
>>> + rb.reg = gen8_regs[class];
>>> +
>>> + if (drm_WARN_ON_ONCE(&engine->i915->drm,
>>> !i915_mmio_reg_offset(rb.reg)))
>>
>> I'd prefer user readable message was kept but not a blocker.
>
> Tried to avoid changes in refactoring, will change.
>
>>
>>> return rb;
>>> - rb.reg = regs[class];
>>> - if (gen8 && class == VIDEO_DECODE_CLASS)
>>> - rb.reg.reg += 4 * engine->instance; /* GEN8_M2TCR */
>>> - else
>>> - rb.bit = engine->instance;
>>> + if (GRAPHICS_VER(engine->i915) < 12 && class ==
>>> VIDEO_DECODE_CLASS) {
>>> + rb.bits = 1;
>>> + rb.reg.reg += 4 * engine->instance;
>>
>> No reason to drop the comment IMO. It explains things somewhat, or at
>> least provides a hint.
>
> OK
>
>>
>>> + }
>>> - rb.bit = BIT(rb.bit);
>>> + if (write && GRAPHICS_VER(engine->i915) >= 12 &&
>>> + (class == VIDEO_DECODE_CLASS || class ==
>>> VIDEO_ENHANCEMENT_CLASS ||
>>> + class == COMPUTE_CLASS))
>>> + rb.bits = _MASKED_BIT_ENABLE(rb.bits);
>>
>> This could be else if to not have < 12 followed by explicit >= 12, but
>> perhaps it is clearer like this, to signify it's two completely
>> separate quirks.
>>
>> Also, I would perhaps consider having a local i915 since there's a
>> good number of engine->i915, but it's up to you what looks nicer.
>
> OK
>
>>
>>> return rb;
>>> }
>>> @@ -1046,14 +1073,14 @@ get_reg_and_bit(const struct intel_engine_cs
>>> *engine, const bool gen8,
>>> * but are now considered MCR registers. Since they exist within a
>>> GAM range,
>>> * the primary instance of the register rolls up the status from
>>> each unit.
>>> */
>>> -static int wait_for_invalidate(struct intel_gt *gt, struct
>>> reg_and_bit rb)
>>> +static int wait_for_invalidate(struct intel_gt *gt, struct
>>> reg_and_bits rb)
>>> {
>>> if (GRAPHICS_VER_FULL(gt->i915) >= IP_VER(12, 50))
>>> - return intel_gt_mcr_wait_for_reg(gt, rb.mcr_reg, rb.bit, 0,
>>> + return intel_gt_mcr_wait_for_reg(gt, rb.mcr_reg, rb.bits, 0,
>>> TLB_INVAL_TIMEOUT_US,
>>> TLB_INVAL_TIMEOUT_MS);
>>> else
>>> - return __intel_wait_for_register_fw(gt->uncore, rb.reg,
>>> rb.bit, 0,
>>> + return __intel_wait_for_register_fw(gt->uncore, rb.reg,
>>> rb.bits, 0,
>>> TLB_INVAL_TIMEOUT_US,
>>> TLB_INVAL_TIMEOUT_MS,
>>> NULL);
>>> @@ -1061,50 +1088,14 @@ static int wait_for_invalidate(struct
>>> intel_gt *gt, struct reg_and_bit rb)
>>> static void mmio_invalidate_full(struct intel_gt *gt)
>>> {
>>> - static const i915_reg_t gen8_regs[] = {
>>> - [RENDER_CLASS] = GEN8_RTCR,
>>> - [VIDEO_DECODE_CLASS] = GEN8_M1TCR, /* , GEN8_M2TCR */
>>> - [VIDEO_ENHANCEMENT_CLASS] = GEN8_VTCR,
>>> - [COPY_ENGINE_CLASS] = GEN8_BTCR,
>>> - };
>>> - static const i915_reg_t gen12_regs[] = {
>>> - [RENDER_CLASS] = GEN12_GFX_TLB_INV_CR,
>>> - [VIDEO_DECODE_CLASS] = GEN12_VD_TLB_INV_CR,
>>> - [VIDEO_ENHANCEMENT_CLASS] = GEN12_VE_TLB_INV_CR,
>>> - [COPY_ENGINE_CLASS] = GEN12_BLT_TLB_INV_CR,
>>> - [COMPUTE_CLASS] = GEN12_COMPCTX_TLB_INV_CR,
>>> - };
>>> - static const i915_mcr_reg_t xehp_regs[] = {
>>> - [RENDER_CLASS] = XEHP_GFX_TLB_INV_CR,
>>> - [VIDEO_DECODE_CLASS] = XEHP_VD_TLB_INV_CR,
>>> - [VIDEO_ENHANCEMENT_CLASS] = XEHP_VE_TLB_INV_CR,
>>> - [COPY_ENGINE_CLASS] = XEHP_BLT_TLB_INV_CR,
>>> - [COMPUTE_CLASS] = XEHP_COMPCTX_TLB_INV_CR,
>>> - };
>>> struct drm_i915_private *i915 = gt->i915;
>>> struct intel_uncore *uncore = gt->uncore;
>>> struct intel_engine_cs *engine;
>>> intel_engine_mask_t awake, tmp;
>>> enum intel_engine_id id;
>>> - const i915_reg_t *regs;
>>> - unsigned int num = 0;
>>> unsigned long flags;
>>> - if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50)) {
>>> - regs = NULL;
>>> - num = ARRAY_SIZE(xehp_regs);
>>> - } else if (GRAPHICS_VER(i915) == 12) {
>>> - regs = gen12_regs;
>>> - num = ARRAY_SIZE(gen12_regs);
>>> - } else if (GRAPHICS_VER(i915) >= 8 && GRAPHICS_VER(i915) <= 11) {
>>> - regs = gen8_regs;
>>> - num = ARRAY_SIZE(gen8_regs);
>>> - } else if (GRAPHICS_VER(i915) < 8) {
>>> - return;
>>> - }
>>> -
>>> - if (drm_WARN_ONCE(&i915->drm, !num,
>>> - "Platform does not implement TLB invalidation!"))
>>> + if (GRAPHICS_VER(i915) < 8)
>>> return;
>>> intel_uncore_forcewake_get(uncore, FORCEWAKE_ALL);
>>> @@ -1114,33 +1105,15 @@ static void mmio_invalidate_full(struct
>>> intel_gt *gt)
>>> awake = 0;
>>> for_each_engine(engine, gt, id) {
>>> - struct reg_and_bit rb;
>>> + struct reg_and_bits rb = get_tlb_inv_reg_and_bits(engine,
>>> true);
>>
>> Ugh so actually what was a once per invalidation lookup is now
>> repeated per engine, times two. I wonder if we can do this better.
>> Lets think about it a bit.
>
> It was always twice, see below.
>
>>
>> Regards,
>>
>> Tvrtko
>>
>>> if (!intel_engine_pm_is_awake(engine))
>>> continue;
>>> - if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50)) {
>>> - u32 val = BIT(engine->instance);
>>> -
>>> - if (engine->class == VIDEO_DECODE_CLASS ||
>>> - engine->class == VIDEO_ENHANCEMENT_CLASS ||
>>> - engine->class == COMPUTE_CLASS)
>>> - val = _MASKED_BIT_ENABLE(val);
>>> - intel_gt_mcr_multicast_write_fw(gt,
>>> - xehp_regs[engine->class],
>>> - val);
>>> - } else {
>>> - rb = get_reg_and_bit(engine, regs == gen8_regs, regs, num);
>
> Here is the 2nd call, from old code.
> Since there are two separate loops there are two calls, caching call
> results would be overkill IMO.
> Or I can put back whole logic to mmio_invalidate_full, GEN12 quirk is
> needed only in 1st loop (write), the only redundancy will be with GEN8
> quirk, which could be handled with some helper.
> Is it worth trying? I guess it is no big gain.
Yes it was always twice in get_reg_and_bit but not the whole register table selection.
We have some checkes which are per platform, and some which are platform and engine. I propose to keep them split. I made a stab at it like this:
diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c b/drivers/gpu/drm/i915/gt/intel_gt.c
index 0377e1b25be9..d907b9005dd6 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt.c
@@ -988,33 +988,50 @@ void intel_gt_info_print(const struct intel_gt_info *info,
intel_sseu_dump(&info->sseu, p);
}
-struct reg_and_bit {
+struct inv_reg {
union {
i915_reg_t reg;
i915_mcr_reg_t mcr_reg;
};
+};
+
+struct reg_and_bit {
+ struct inv_reg reg;
u32 bit;
};
static struct reg_and_bit
-get_reg_and_bit(const struct intel_engine_cs *engine, const bool gen8,
- const i915_reg_t *regs, const unsigned int num)
+get_reg_and_bit(const struct intel_engine_cs *engine,
+ const i915_reg_t *regs, const unsigned int num,
+ bool write)
{
+ struct drm_i915_private *i915 = engine->i915;
const unsigned int class = engine->class;
struct reg_and_bit rb = { };
+ BUILD_BUG_ON(sizeof(rb.reg.reg) != sizeof(rb.reg.mcr_reg));
+ BUILD_BUG_ON(!__builtin_types_compatible_p(typeof(rb.reg.reg.reg),
+ typeof(rb.reg.mcr_reg.reg)));
+
if (drm_WARN_ON_ONCE(&engine->i915->drm,
class >= num || !regs[class].reg))
return rb;
- rb.reg = regs[class];
- if (gen8 && class == VIDEO_DECODE_CLASS)
- rb.reg.reg += 4 * engine->instance; /* GEN8_M2TCR */
+ rb.reg.reg = regs[class];
+
+ if (GRAPHICS_VER(i915) < 12 && class == VIDEO_DECODE_CLASS)
+ rb.reg.reg.reg += 4 * engine->instance; /* GEN8_M2TCR */
else
rb.bit = engine->instance;
rb.bit = BIT(rb.bit);
+ if (write && GRAPHICS_VER(i915) >= 12 &&
+ (engine->class == VIDEO_DECODE_CLASS ||
+ engine->class == VIDEO_ENHANCEMENT_CLASS ||
+ engine->class == COMPUTE_CLASS))
+ rb.bit = _MASKED_BIT_ENABLE(rb.bit);
+
return rb;
}
@@ -1031,14 +1048,16 @@ get_reg_and_bit(const struct intel_engine_cs *engine, const bool gen8,
* but are now considered MCR registers. Since they exist within a GAM range,
* the primary instance of the register rolls up the status from each unit.
*/
-static int wait_for_invalidate(struct intel_gt *gt, struct reg_and_bit rb)
+static int
+wait_for_invalidate(struct intel_gt *gt, struct reg_and_bit rb, bool mcr)
{
- if (GRAPHICS_VER_FULL(gt->i915) >= IP_VER(12, 50))
- return intel_gt_mcr_wait_for_reg(gt, rb.mcr_reg, rb.bit, 0,
+ if (mcr)
+ return intel_gt_mcr_wait_for_reg(gt, rb.reg.mcr_reg, rb.bit, 0,
TLB_INVAL_TIMEOUT_US,
TLB_INVAL_TIMEOUT_MS);
else
- return __intel_wait_for_register_fw(gt->uncore, rb.reg, rb.bit, 0,
+ return __intel_wait_for_register_fw(gt->uncore,
+ rb.reg.reg, rb.bit, 0,
TLB_INVAL_TIMEOUT_US,
TLB_INVAL_TIMEOUT_MS,
NULL);
@@ -1068,6 +1087,7 @@ static void mmio_invalidate_full(struct intel_gt *gt)
};
struct drm_i915_private *i915 = gt->i915;
struct intel_uncore *uncore = gt->uncore;
+ const bool mcr = GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50);
struct intel_engine_cs *engine;
intel_engine_mask_t awake, tmp;
enum intel_engine_id id;
@@ -1076,7 +1096,7 @@ static void mmio_invalidate_full(struct intel_gt *gt)
unsigned long flags;
if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50)) {
- regs = NULL;
+ regs = (i915_reg_t *)xehp_regs;
num = ARRAY_SIZE(xehp_regs);
} else if (GRAPHICS_VER(i915) == 12) {
regs = gen12_regs;
@@ -1104,28 +1124,15 @@ static void mmio_invalidate_full(struct intel_gt *gt)
if (!intel_engine_pm_is_awake(engine))
continue;
- if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50)) {
- u32 val = BIT(engine->instance);
-
- if (engine->class == VIDEO_DECODE_CLASS ||
- engine->class == VIDEO_ENHANCEMENT_CLASS ||
- engine->class == COMPUTE_CLASS)
- val = _MASKED_BIT_ENABLE(val);
- intel_gt_mcr_multicast_write_fw(gt,
- xehp_regs[engine->class],
- val);
- } else {
- rb = get_reg_and_bit(engine, regs == gen8_regs, regs, num);
- if (!i915_mmio_reg_offset(rb.reg))
- continue;
-
- if (GRAPHICS_VER(i915) == 12 && (engine->class == VIDEO_DECODE_CLASS ||
- engine->class == VIDEO_ENHANCEMENT_CLASS ||
- engine->class == COMPUTE_CLASS))
- rb.bit = _MASKED_BIT_ENABLE(rb.bit);
-
- intel_uncore_write_fw(uncore, rb.reg, rb.bit);
- }
+ rb = get_reg_and_bit(engine, regs, num, true);
+ if (!i915_mmio_reg_offset(rb.reg.reg))
+ continue;
+
+ if (mcr)
+ intel_gt_mcr_multicast_write_fw(gt, rb.reg.mcr_reg,
+ rb.bit);
+ else
+ intel_uncore_write_fw(uncore, rb.reg.reg, rb.bit);
awake |= engine->mask;
}
@@ -1144,16 +1151,10 @@ static void mmio_invalidate_full(struct intel_gt *gt)
intel_gt_mcr_unlock(gt, flags);
for_each_engine_masked(engine, gt, awake, tmp) {
- struct reg_and_bit rb;
-
- if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50)) {
- rb.mcr_reg = xehp_regs[engine->class];
- rb.bit = BIT(engine->instance);
- } else {
- rb = get_reg_and_bit(engine, regs == gen8_regs, regs, num);
- }
+ struct reg_and_bit rb =
+ get_reg_and_bit(engine, regs, num, false);
- if (wait_for_invalidate(gt, rb))
+ if (wait_for_invalidate(gt, rb, mcr))
drm_err_ratelimited(>->i915->drm,
"%s TLB invalidation did not complete in %ums!\n",
engine->name, TLB_INVAL_TIMEOUT_MS);
So only questions which vary per engine are asked in the engine loops.
A bit hacky with asserts i915_reg_t and i915_mcr_reg_t are the same underlying type really but may be passable. See what you think.
Regards,
Tvrtko
More information about the Intel-gfx
mailing list