[Intel-gfx] [PATCH v7 06/12] drm/i915/perf: implement active wait for noa configurations

Tue Jul 9 10:44:53 UTC 2019

On 09/07/2019 12:53, Chris Wilson wrote:
> Quoting Lionel Landwerlin (2019-07-09 10:32:02)
>> +static u32 *save_register(struct drm_i915_private *i915, u32 *cs,
>> +                         i915_reg_t reg, u32 offset, u32 dword_count)
>> +{
>> +       uint32_t d;
>> +
>> +       for (d = 0; d < dword_count; d++) {
>> +               *cs++ = INTEL_GEN(i915) >= 8 ?
>> +                       MI_STORE_REGISTER_MEM_GEN8 : MI_STORE_REGISTER_MEM;
>> +               *cs++ = i915_mmio_reg_offset(reg) + 4 * d;
>> +               *cs++ = intel_gt_scratch_offset(&i915->gt, offset) + 4 * d;
>> +               if (INTEL_GEN(i915) >= 8)
>> +                       *cs++ = 0;
> restore_register doesn't care about the extra MI_NOOP :)
>
>> +       }
>> +
>> +       return cs;
>> +}
>> +
>> +static u32 *restore_register(struct drm_i915_private *i915, u32 *cs,
>> +                            i915_reg_t reg, u32 offset, u32 dword_count)
>> +{
>> +       uint32_t d;
>> +
>> +       for (d = 0; d < dword_count; d++) {
>> +               *cs++ = INTEL_GEN(i915) >= 8 ?
>> +                       MI_LOAD_REGISTER_MEM_GEN8 : MI_LOAD_REGISTER_MEM;
>> +               *cs++ = i915_mmio_reg_offset(reg);
>> +               *cs++ = intel_gt_scratch_offset(&i915->gt, offset);
>> +               *cs++ = 0;
>> +       }
>> +
>> +       return cs;
>> +}
>> +
>> +static int alloc_noa_wait(struct drm_i915_private *i915)
>> +{
>> +       struct drm_i915_gem_object *bo;
>> +       struct i915_vma *vma;
>> +       u64 delay_ns = atomic64_read(&i915->perf.oa.noa_programming_delay), delay_ticks;
> I would const u64 delay_ticks = foo(i915);
>
> That would save the distraction later in the middle of CS.
>
>> +       u32 *batch, *ts0, *cs, *jump;
>> +       int ret, i;
>> +       enum { START_TS, NOW_TS, DELTA_TS, JUMP_PREDICATE, DELTA_TARGET, N_CS_GPR };
>> +
>> +       bo = i915_gem_object_create_internal(i915, 4096);
>> +       if (IS_ERR(bo)) {
>> +               DRM_ERROR("Failed to allocate NOA wait batchbuffer\n");
>> +               return PTR_ERR(bo);
>> +       }
>> +
>> +       /*
>> +        * We pin in GGTT because multiple OA config BOs will have a jump
>> +        * into this buffer, so its address needs to stay fixed during the
>> +        * lifetime of the i915/perf stream.
>> +        */
>> +       vma = i915_gem_object_ggtt_pin(bo, NULL, 0, 4096, 0);
>> +       if (IS_ERR(vma)) {
>> +               ret = PTR_ERR(vma);
>> +               goto err_unref;
>> +       }
>> +
>> +       batch = cs = i915_gem_object_pin_map(bo, I915_MAP_WB);
>> +       if (IS_ERR(batch)) {
>> +               ret = PTR_ERR(batch);
>> +               goto err_unpin;
>> +       }
>> +
>> +       /* Save registers. */
>> +       for (i = 0; i < N_CS_GPR; i++) {
>> +               cs = save_register(i915, cs, HSW_CS_GPR(i),
>> +                                  INTEL_GT_SCRATCH_FIELD_PERF_CS_GPR + 8 * i, 2);
>> +       }
>> +       cs = save_register(i915, cs, MI_PREDICATE_RESULT_1,
>> +                          INTEL_GT_SCRATCH_FIELD_PERF_PREDICATE_RESULT_1, 1);
>> +
>> +       /* First timestamp snapshot location. */
>> +       ts0 = cs;
>> +
>> +       /*
>> +        * Initial snapshot of the timestamp register to implement the wait.
>> +        * We work with 32b values, so clear out the top 32b bits of the
>> +        * register because the ALU works 64bits.
>> +        */
>> +       *cs++ = MI_LOAD_REGISTER_IMM(1);
>> +       *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(START_TS)) + 4;
>> +       *cs++ = 0;
>> +       *cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
>> +       *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP(RENDER_RING_BASE));
>> +       *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(START_TS));
>> +
>> +       /*
>> +        * This is the location we're going to jump back into until the
>> +        * required amount of time has passed.
>> +        */
>> +       jump = cs;
>> +
>> +       /*
>> +        * Take another snapshot of the timestamp register. Take care to clear
>> +        * up the top 32bits of CS_GPR(1) as we're using it for other
>> +        * operations below.
>> +        */
>> +       *cs++ = MI_LOAD_REGISTER_IMM(1);
>> +       *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(NOW_TS)) + 4;
>> +       *cs++ = 0;
>> +       *cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
>> +       *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP(RENDER_RING_BASE));
>> +       *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(NOW_TS));
>> +
>> +       /*
>> +        * Do a diff between the 2 timestamps and store the result into
>> +        * CS_GPR(DELTA_TS), and the carry flag into CS_GPR(JUMP_PREDICATE).
>> +        */
>> +       *cs++ = MI_MATH(5);
>> +       *cs++ = MI_ALU_OP(MI_ALU_OP_LOAD, MI_ALU_SRC_SRCA, MI_ALU_SRC_REG(NOW_TS));
>> +       *cs++ = MI_ALU_OP(MI_ALU_OP_LOAD, MI_ALU_SRC_SRCB, MI_ALU_SRC_REG(START_TS));
>> +       *cs++ = MI_ALU_OP(MI_ALU_OP_SUB, 0, 0);
>> +       *cs++ = MI_ALU_OP(MI_ALU_OP_STORE, MI_ALU_SRC_REG(DELTA_TS), MI_ALU_SRC_ACCU);
>> +       *cs++ = MI_ALU_OP(MI_ALU_OP_STORE, MI_ALU_SRC_REG(JUMP_PREDICATE), MI_ALU_SRC_CF);
>> +
>> +       /*
>> +        * Transfer the carry flag (set to 1 if ts1 < ts0, meaning the
>> +        * timestamp have rolled over the 32bits) into the predicate register
>> +        * to be used for the predicated jump.
>> +        */
>> +       *cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
>> +       *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(JUMP_PREDICATE));
>> +       *cs++ = i915_mmio_reg_offset(MI_PREDICATE_RESULT_1);
>> +
>> +       /* Restart from the beginning if we had timestamps roll over. */
>> +       *cs++ = (INTEL_GEN(i915) < 8 ?
>> +                MI_BATCH_BUFFER_START : MI_BATCH_BUFFER_START_GEN8) |
>> +               MI_BATCH_PREDICATE;
>> +       *cs++ = i915_ggtt_offset(vma) + (ts0 - batch) * 4;
>> +       *cs++ = 0;
>> +
>> +       /*
>> +        * Now add the diff between the two previous timestamps to:
>> +        *      (((1 << 64) - 1) - delay_ticks)
>> +        *
>> +        * When the Carry Flag contains 1 this means the elapsed time is
>> +        * longer than the expected delay, and we can exit the wait loop.
>> +        */
>> +       delay_ticks = 0xffffffffffffffff -
>> +               DIV64_U64_ROUND_UP(delay_ns *
>> +                                  RUNTIME_INFO(i915)->cs_timestamp_frequency_khz,
>> +                                  1000000ull);
>> +       *cs++ = MI_LOAD_REGISTER_IMM(2);
>> +       *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(DELTA_TARGET));
>> +       *cs++ = lower_32_bits(delay_ticks);
>> +       *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(DELTA_TARGET)) + 4;
>> +       *cs++ = upper_32_bits(delay_ticks);
>> +
>> +       *cs++ = MI_MATH(4);
>> +       *cs++ = MI_ALU_OP(MI_ALU_OP_LOAD, MI_ALU_SRC_SRCA, MI_ALU_SRC_REG(DELTA_TS));
>> +       *cs++ = MI_ALU_OP(MI_ALU_OP_LOAD, MI_ALU_SRC_SRCB, MI_ALU_SRC_REG(DELTA_TARGET));
>> +       *cs++ = MI_ALU_OP(MI_ALU_OP_ADD, 0, 0);
>> +       *cs++ = MI_ALU_OP(MI_ALU_OP_STOREINV, MI_ALU_SRC_REG(JUMP_PREDICATE), MI_ALU_SRC_CF);
>> +
>> +       /*
>> +        * Transfer the result into the predicate register to be used for the
>> +        * predicated jump.
>> +        */
>> +       *cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
>> +       *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(JUMP_PREDICATE));
>> +       *cs++ = i915_mmio_reg_offset(MI_PREDICATE_RESULT_1);
>> +
>> +       /* Predicate the jump.  */
>> +       *cs++ = (INTEL_GEN(i915) < 8 ?
>> +                MI_BATCH_BUFFER_START : MI_BATCH_BUFFER_START_GEN8) |
>> +               MI_BATCH_PREDICATE;
>> +       *cs++ = i915_ggtt_offset(vma) + (jump - batch) * 4;
>> +       *cs++ = 0;
>> +
>> +       /* Restore registers. */
>> +       for (i = 0; i < N_CS_GPR; i++) {
>> +               cs = restore_register(i915, cs, HSW_CS_GPR(i),
>> +                                     INTEL_GT_SCRATCH_FIELD_PERF_CS_GPR + 8 * i, 2);
>> +       }
>> +       cs = restore_register(i915, cs, MI_PREDICATE_RESULT_1,
>> +                             INTEL_GT_SCRATCH_FIELD_PERF_PREDICATE_RESULT_1, 1);
>> +
>> +       /* And return to the ring. */
>> +       *cs++ = MI_BATCH_BUFFER_END;
>> +
>> +       GEM_BUG_ON((cs - batch) > (PAGE_SIZE / sizeof(*batch)));
>> +
>> +       i915_gem_object_flush_map(bo);
>> +       i915_gem_object_unpin_map(bo);
>> +
>> +       i915->perf.oa.noa_wait = vma;
>> +
>> +       return 0;
>> +
>> +err_unpin:
>> +       __i915_vma_unpin(vma);
>> +
>> +err_unref:
>> +       i915_gem_object_put(bo);
>> +
>> +       return ret;
>> +}
> Preferably with the nit above,
> Reviewed-by: Chris Wilson <chris at chris-wilson.co.uk>
> -Chris
>
Thanks, added the 2 nits locally.


-Lionel