[Intel-gfx] [PATCH 2/2] drm/i915: cleanup TLB invalidation code
Tvrtko Ursulin
tvrtko.ursulin at linux.intel.com
Fri Dec 9 12:35:25 UTC 2022
On 09/12/2022 12:04, Tvrtko Ursulin wrote:
>
> On 09/12/2022 11:33, Andrzej Hajda wrote:
>>
>>
>> On 09.12.2022 11:16, Tvrtko Ursulin wrote:
>>>
>>> On 07/12/2022 17:36, Andrzej Hajda wrote:
>>>> Whole register/bit selection logic has been moved to separate helper.
>>>
>>> Why is missing.
>>
>> ...to clean up mmio_invalidate_full function.
>>
>> Will add.
>>
>>>
>>>> Signed-off-by: Andrzej Hajda <andrzej.hajda at intel.com>
>>>> ---
>>>> drivers/gpu/drm/i915/gt/intel_gt.c | 136
>>>> +++++++++++------------------
>>>> 1 file changed, 51 insertions(+), 85 deletions(-)
>>>
>>> Diffstat suggests because more streamlined code. Any other reason?
>>>
>>>> diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c
>>>> b/drivers/gpu/drm/i915/gt/intel_gt.c
>>>> index f0224b607aa4a7..05520ec3264db8 100644
>>>> --- a/drivers/gpu/drm/i915/gt/intel_gt.c
>>>> +++ b/drivers/gpu/drm/i915/gt/intel_gt.c
>>>> @@ -1003,32 +1003,59 @@ void intel_gt_info_print(const struct
>>>> intel_gt_info *info,
>>>> intel_sseu_dump(&info->sseu, p);
>>>> }
>>>> -struct reg_and_bit {
>>>> +struct reg_and_bits {
>>>> union {
>>>> i915_reg_t reg;
>>>> i915_mcr_reg_t mcr_reg;
>>>> };
>>>> - u32 bit;
>>>> + u32 bits;
>>>> };
>>>> -static struct reg_and_bit
>>>> -get_reg_and_bit(const struct intel_engine_cs *engine, const bool gen8,
>>>> - const i915_reg_t *regs, const unsigned int num)
>>>> +static struct reg_and_bits
>>>> +get_tlb_inv_reg_and_bits(const struct intel_engine_cs *engine, bool
>>>> write)
>>>> {
>>>> + static const i915_reg_t gen8_regs[MAX_ENGINE_CLASS + 1] = {
>>>> + [RENDER_CLASS] = GEN8_RTCR,
>>>> + [VIDEO_DECODE_CLASS] = GEN8_M1TCR,
>>>> + [VIDEO_ENHANCEMENT_CLASS] = GEN8_VTCR,
>>>> + [COPY_ENGINE_CLASS] = GEN8_BTCR,
>>>> + };
>>>> + static const i915_reg_t gen12_regs[MAX_ENGINE_CLASS + 1] = {
>>>> + [RENDER_CLASS] = GEN12_GFX_TLB_INV_CR,
>>>> + [VIDEO_DECODE_CLASS] = GEN12_VD_TLB_INV_CR,
>>>> + [VIDEO_ENHANCEMENT_CLASS] = GEN12_VE_TLB_INV_CR,
>>>> + [COPY_ENGINE_CLASS] = GEN12_BLT_TLB_INV_CR,
>>>> + [COMPUTE_CLASS] = GEN12_COMPCTX_TLB_INV_CR,
>>>> + };
>>>> + static const i915_mcr_reg_t xehp_regs[MAX_ENGINE_CLASS + 1] = {
>>>> + [RENDER_CLASS] = XEHP_GFX_TLB_INV_CR,
>>>> + [VIDEO_DECODE_CLASS] = XEHP_VD_TLB_INV_CR,
>>>> + [VIDEO_ENHANCEMENT_CLASS] = XEHP_VE_TLB_INV_CR,
>>>> + [COPY_ENGINE_CLASS] = XEHP_BLT_TLB_INV_CR,
>>>> + [COMPUTE_CLASS] = XEHP_COMPCTX_TLB_INV_CR,
>>>> + };
>>>> const unsigned int class = engine->class;
>>>> - struct reg_and_bit rb = { };
>>>> + struct reg_and_bits rb = { .bits = BIT(engine->instance) };
>>>> - if (drm_WARN_ON_ONCE(&engine->i915->drm,
>>>> - class >= num || !regs[class].reg))
>>>> + if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
>>>> + rb.mcr_reg = xehp_regs[class];
>>>> + else if (GRAPHICS_VER(engine->i915) >= 12)
>>>> + rb.reg = gen12_regs[class];
>>>> + else if (GRAPHICS_VER(engine->i915) >= 8)
>>>> + rb.reg = gen8_regs[class];
>>>> +
>>>> + if (drm_WARN_ON_ONCE(&engine->i915->drm,
>>>> !i915_mmio_reg_offset(rb.reg)))
>>>
>>> I'd prefer user readable message was kept but not a blocker.
>>
>> Tried to avoid changes in refactoring, will change.
>>
>>>
>>>> return rb;
>>>> - rb.reg = regs[class];
>>>> - if (gen8 && class == VIDEO_DECODE_CLASS)
>>>> - rb.reg.reg += 4 * engine->instance; /* GEN8_M2TCR */
>>>> - else
>>>> - rb.bit = engine->instance;
>>>> + if (GRAPHICS_VER(engine->i915) < 12 && class ==
>>>> VIDEO_DECODE_CLASS) {
>>>> + rb.bits = 1;
>>>> + rb.reg.reg += 4 * engine->instance;
>>>
>>> No reason to drop the comment IMO. It explains things somewhat, or at
>>> least provides a hint.
>>
>> OK
>>
>>>
>>>> + }
>>>> - rb.bit = BIT(rb.bit);
>>>> + if (write && GRAPHICS_VER(engine->i915) >= 12 &&
>>>> + (class == VIDEO_DECODE_CLASS || class ==
>>>> VIDEO_ENHANCEMENT_CLASS ||
>>>> + class == COMPUTE_CLASS))
>>>> + rb.bits = _MASKED_BIT_ENABLE(rb.bits);
>>>
>>> This could be else if to not have < 12 followed by explicit >= 12,
>>> but perhaps it is clearer like this, to signify it's two completely
>>> separate quirks.
>>>
>>> Also, I would perhaps consider having a local i915 since there's a
>>> good number of engine->i915, but it's up to you what looks nicer.
>>
>> OK
>>
>>>
>>>> return rb;
>>>> }
>>>> @@ -1046,14 +1073,14 @@ get_reg_and_bit(const struct intel_engine_cs
>>>> *engine, const bool gen8,
>>>> * but are now considered MCR registers. Since they exist within
>>>> a GAM range,
>>>> * the primary instance of the register rolls up the status from
>>>> each unit.
>>>> */
>>>> -static int wait_for_invalidate(struct intel_gt *gt, struct
>>>> reg_and_bit rb)
>>>> +static int wait_for_invalidate(struct intel_gt *gt, struct
>>>> reg_and_bits rb)
>>>> {
>>>> if (GRAPHICS_VER_FULL(gt->i915) >= IP_VER(12, 50))
>>>> - return intel_gt_mcr_wait_for_reg(gt, rb.mcr_reg, rb.bit, 0,
>>>> + return intel_gt_mcr_wait_for_reg(gt, rb.mcr_reg, rb.bits, 0,
>>>> TLB_INVAL_TIMEOUT_US,
>>>> TLB_INVAL_TIMEOUT_MS);
>>>> else
>>>> - return __intel_wait_for_register_fw(gt->uncore, rb.reg,
>>>> rb.bit, 0,
>>>> + return __intel_wait_for_register_fw(gt->uncore, rb.reg,
>>>> rb.bits, 0,
>>>> TLB_INVAL_TIMEOUT_US,
>>>> TLB_INVAL_TIMEOUT_MS,
>>>> NULL);
>>>> @@ -1061,50 +1088,14 @@ static int wait_for_invalidate(struct
>>>> intel_gt *gt, struct reg_and_bit rb)
>>>> static void mmio_invalidate_full(struct intel_gt *gt)
>>>> {
>>>> - static const i915_reg_t gen8_regs[] = {
>>>> - [RENDER_CLASS] = GEN8_RTCR,
>>>> - [VIDEO_DECODE_CLASS] = GEN8_M1TCR, /* , GEN8_M2TCR */
>>>> - [VIDEO_ENHANCEMENT_CLASS] = GEN8_VTCR,
>>>> - [COPY_ENGINE_CLASS] = GEN8_BTCR,
>>>> - };
>>>> - static const i915_reg_t gen12_regs[] = {
>>>> - [RENDER_CLASS] = GEN12_GFX_TLB_INV_CR,
>>>> - [VIDEO_DECODE_CLASS] = GEN12_VD_TLB_INV_CR,
>>>> - [VIDEO_ENHANCEMENT_CLASS] = GEN12_VE_TLB_INV_CR,
>>>> - [COPY_ENGINE_CLASS] = GEN12_BLT_TLB_INV_CR,
>>>> - [COMPUTE_CLASS] = GEN12_COMPCTX_TLB_INV_CR,
>>>> - };
>>>> - static const i915_mcr_reg_t xehp_regs[] = {
>>>> - [RENDER_CLASS] = XEHP_GFX_TLB_INV_CR,
>>>> - [VIDEO_DECODE_CLASS] = XEHP_VD_TLB_INV_CR,
>>>> - [VIDEO_ENHANCEMENT_CLASS] = XEHP_VE_TLB_INV_CR,
>>>> - [COPY_ENGINE_CLASS] = XEHP_BLT_TLB_INV_CR,
>>>> - [COMPUTE_CLASS] = XEHP_COMPCTX_TLB_INV_CR,
>>>> - };
>>>> struct drm_i915_private *i915 = gt->i915;
>>>> struct intel_uncore *uncore = gt->uncore;
>>>> struct intel_engine_cs *engine;
>>>> intel_engine_mask_t awake, tmp;
>>>> enum intel_engine_id id;
>>>> - const i915_reg_t *regs;
>>>> - unsigned int num = 0;
>>>> unsigned long flags;
>>>> - if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50)) {
>>>> - regs = NULL;
>>>> - num = ARRAY_SIZE(xehp_regs);
>>>> - } else if (GRAPHICS_VER(i915) == 12) {
>>>> - regs = gen12_regs;
>>>> - num = ARRAY_SIZE(gen12_regs);
>>>> - } else if (GRAPHICS_VER(i915) >= 8 && GRAPHICS_VER(i915) <= 11) {
>>>> - regs = gen8_regs;
>>>> - num = ARRAY_SIZE(gen8_regs);
>>>> - } else if (GRAPHICS_VER(i915) < 8) {
>>>> - return;
>>>> - }
>>>> -
>>>> - if (drm_WARN_ONCE(&i915->drm, !num,
>>>> - "Platform does not implement TLB invalidation!"))
>>>> + if (GRAPHICS_VER(i915) < 8)
>>>> return;
>>>> intel_uncore_forcewake_get(uncore, FORCEWAKE_ALL);
>>>> @@ -1114,33 +1105,15 @@ static void mmio_invalidate_full(struct
>>>> intel_gt *gt)
>>>> awake = 0;
>>>> for_each_engine(engine, gt, id) {
>>>> - struct reg_and_bit rb;
>>>> + struct reg_and_bits rb = get_tlb_inv_reg_and_bits(engine,
>>>> true);
>>>
>>> Ugh so actually what was a once per invalidation lookup is now
>>> repeated per engine, times two. I wonder if we can do this better.
>>> Lets think about it a bit.
>>
>> It was always twice, see below.
>>
>>>
>>> Regards,
>>>
>>> Tvrtko
>>>
>>>> if (!intel_engine_pm_is_awake(engine))
>>>> continue;
>>>> - if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50)) {
>>>> - u32 val = BIT(engine->instance);
>>>> -
>>>> - if (engine->class == VIDEO_DECODE_CLASS ||
>>>> - engine->class == VIDEO_ENHANCEMENT_CLASS ||
>>>> - engine->class == COMPUTE_CLASS)
>>>> - val = _MASKED_BIT_ENABLE(val);
>>>> - intel_gt_mcr_multicast_write_fw(gt,
>>>> - xehp_regs[engine->class],
>>>> - val);
>>>> - } else {
>>>> - rb = get_reg_and_bit(engine, regs == gen8_regs, regs,
>>>> num);
>>
>> Here is the 2nd call, from old code.
>> Since there are two separate loops there are two calls, caching call
>> results would be overkill IMO.
>> Or I can put back whole logic to mmio_invalidate_full, GEN12 quirk is
>> needed only in 1st loop (write), the only redundancy will be with GEN8
>> quirk, which could be handled with some helper.
>> Is it worth trying? I guess it is no big gain.
>
> Yes it was always twice in get_reg_and_bit but not the whole register
> table selection.
>
> We have some checkes which are per platform, and some which are platform
> and engine. I propose to keep them split. I made a stab at it like this:
>
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c
> b/drivers/gpu/drm/i915/gt/intel_gt.c
> index 0377e1b25be9..d907b9005dd6 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt.c
> +++ b/drivers/gpu/drm/i915/gt/intel_gt.c
> @@ -988,33 +988,50 @@ void intel_gt_info_print(const struct
> intel_gt_info *info,
> intel_sseu_dump(&info->sseu, p);
> }
>
> -struct reg_and_bit {
> +struct inv_reg {
> union {
> i915_reg_t reg;
> i915_mcr_reg_t mcr_reg;
> };
> +};
> +
> +struct reg_and_bit {
> + struct inv_reg reg;
> u32 bit;
> };
>
> static struct reg_and_bit
> -get_reg_and_bit(const struct intel_engine_cs *engine, const bool gen8,
> - const i915_reg_t *regs, const unsigned int num)
> +get_reg_and_bit(const struct intel_engine_cs *engine,
> + const i915_reg_t *regs, const unsigned int num,
> + bool write)
> {
> + struct drm_i915_private *i915 = engine->i915;
> const unsigned int class = engine->class;
> struct reg_and_bit rb = { };
>
> + BUILD_BUG_ON(sizeof(rb.reg.reg) != sizeof(rb.reg.mcr_reg));
> + BUILD_BUG_ON(!__builtin_types_compatible_p(typeof(rb.reg.reg.reg),
> +
> typeof(rb.reg.mcr_reg.reg)));
> +
> if (drm_WARN_ON_ONCE(&engine->i915->drm,
> class >= num || !regs[class].reg))
> return rb;
>
> - rb.reg = regs[class];
> - if (gen8 && class == VIDEO_DECODE_CLASS)
> - rb.reg.reg += 4 * engine->instance; /* GEN8_M2TCR */
> + rb.reg.reg = regs[class];
> +
> + if (GRAPHICS_VER(i915) < 12 && class == VIDEO_DECODE_CLASS)
> + rb.reg.reg.reg += 4 * engine->instance; /* GEN8_M2TCR */
> else
> rb.bit = engine->instance;
>
> rb.bit = BIT(rb.bit);
>
> + if (write && GRAPHICS_VER(i915) >= 12 &&
> + (engine->class == VIDEO_DECODE_CLASS ||
> + engine->class == VIDEO_ENHANCEMENT_CLASS ||
> + engine->class == COMPUTE_CLASS))
> + rb.bit = _MASKED_BIT_ENABLE(rb.bit);
> +
> return rb;
> }
>
> @@ -1031,14 +1048,16 @@ get_reg_and_bit(const struct intel_engine_cs
> *engine, const bool gen8,
> * but are now considered MCR registers. Since they exist within a
> GAM range,
> * the primary instance of the register rolls up the status from each
> unit.
> */
> -static int wait_for_invalidate(struct intel_gt *gt, struct reg_and_bit rb)
> +static int
> +wait_for_invalidate(struct intel_gt *gt, struct reg_and_bit rb, bool mcr)
> {
> - if (GRAPHICS_VER_FULL(gt->i915) >= IP_VER(12, 50))
> - return intel_gt_mcr_wait_for_reg(gt, rb.mcr_reg, rb.bit, 0,
> + if (mcr)
> + return intel_gt_mcr_wait_for_reg(gt, rb.reg.mcr_reg,
> rb.bit, 0,
> TLB_INVAL_TIMEOUT_US,
> TLB_INVAL_TIMEOUT_MS);
> else
> - return __intel_wait_for_register_fw(gt->uncore, rb.reg,
> rb.bit, 0,
> + return __intel_wait_for_register_fw(gt->uncore,
> + rb.reg.reg, rb.bit, 0,
> TLB_INVAL_TIMEOUT_US,
> TLB_INVAL_TIMEOUT_MS,
> NULL);
> @@ -1068,6 +1087,7 @@ static void mmio_invalidate_full(struct intel_gt *gt)
> };
> struct drm_i915_private *i915 = gt->i915;
> struct intel_uncore *uncore = gt->uncore;
> + const bool mcr = GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50);
> struct intel_engine_cs *engine;
> intel_engine_mask_t awake, tmp;
> enum intel_engine_id id;
> @@ -1076,7 +1096,7 @@ static void mmio_invalidate_full(struct intel_gt *gt)
> unsigned long flags;
>
> if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50)) {
> - regs = NULL;
> + regs = (i915_reg_t *)xehp_regs;
> num = ARRAY_SIZE(xehp_regs);
> } else if (GRAPHICS_VER(i915) == 12) {
> regs = gen12_regs;
> @@ -1104,28 +1124,15 @@ static void mmio_invalidate_full(struct intel_gt
> *gt)
> if (!intel_engine_pm_is_awake(engine))
> continue;
>
> - if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50)) {
> - u32 val = BIT(engine->instance);
> -
> - if (engine->class == VIDEO_DECODE_CLASS ||
> - engine->class == VIDEO_ENHANCEMENT_CLASS ||
> - engine->class == COMPUTE_CLASS)
> - val = _MASKED_BIT_ENABLE(val);
> - intel_gt_mcr_multicast_write_fw(gt,
> -
> xehp_regs[engine->class],
> - val);
> - } else {
> - rb = get_reg_and_bit(engine, regs == gen8_regs,
> regs, num);
> - if (!i915_mmio_reg_offset(rb.reg))
> - continue;
> -
> - if (GRAPHICS_VER(i915) == 12 && (engine->class
> == VIDEO_DECODE_CLASS ||
> - engine->class == VIDEO_ENHANCEMENT_CLASS ||
> - engine->class == COMPUTE_CLASS))
> - rb.bit = _MASKED_BIT_ENABLE(rb.bit);
> -
> - intel_uncore_write_fw(uncore, rb.reg, rb.bit);
> - }
> + rb = get_reg_and_bit(engine, regs, num, true);
> + if (!i915_mmio_reg_offset(rb.reg.reg))
> + continue;
> +
> + if (mcr)
> + intel_gt_mcr_multicast_write_fw(gt, rb.reg.mcr_reg,
> + rb.bit);
> + else
> + intel_uncore_write_fw(uncore, rb.reg.reg, rb.bit);
> awake |= engine->mask;
> }
>
> @@ -1144,16 +1151,10 @@ static void mmio_invalidate_full(struct intel_gt
> *gt)
> intel_gt_mcr_unlock(gt, flags);
>
> for_each_engine_masked(engine, gt, awake, tmp) {
> - struct reg_and_bit rb;
> -
> - if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50)) {
> - rb.mcr_reg = xehp_regs[engine->class];
> - rb.bit = BIT(engine->instance);
> - } else {
> - rb = get_reg_and_bit(engine, regs == gen8_regs,
> regs, num);
> - }
> + struct reg_and_bit rb =
> + get_reg_and_bit(engine, regs, num, false);
>
> - if (wait_for_invalidate(gt, rb))
> + if (wait_for_invalidate(gt, rb, mcr))
> drm_err_ratelimited(>->i915->drm,
> "%s TLB invalidation did
> not complete in %ums!\n",
> engine->name,
> TLB_INVAL_TIMEOUT_MS);
>
> So only questions which vary per engine are asked in the engine loops.
>
> A bit hacky with asserts i915_reg_t and i915_mcr_reg_t are the same
> underlying type really but may be passable. See what you think.
Or even store register and values (write/read) in struct intel_engine_cs
at engine init time?
Regards,
Tvrtko
More information about the Intel-gfx
mailing list