[Intel-gfx] [PATCH v6 05/11] drm/i915/perf: implement active wait for noa configurations
Lionel Landwerlin
lionel.g.landwerlin at intel.com
Mon Jul 1 13:10:53 UTC 2019
On 01/07/2019 15:43, Chris Wilson wrote:
> Quoting Lionel Landwerlin (2019-07-01 12:34:31)
>> NOA configurations take some amount of time to apply. That amount of
>> time depends on the size of the GT. There is no documented time for
>> this. For example, past experiments with powergating configuration
>> changes seem to indicate a 60~70us delay. We go with 500us as the
>> default for now, which should be over the required amount of time
>> (according to HW architects).
>>
>> v2: Don't forget to save/restore registers used for the wait (Chris)
>>
>> Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin at intel.com>
>> ---
>> drivers/gpu/drm/i915/gt/intel_gpu_commands.h | 24 ++
>> drivers/gpu/drm/i915/gt/intel_gt_types.h | 5 +
>> drivers/gpu/drm/i915/i915_debugfs.c | 25 +++
>> drivers/gpu/drm/i915/i915_drv.h | 8 +
>> drivers/gpu/drm/i915/i915_perf.c | 225 ++++++++++++++++++-
>> drivers/gpu/drm/i915/i915_reg.h | 4 +-
>> 6 files changed, 288 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
>> index e7eff9db343e..4a66af38c87b 100644
>> --- a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
>> +++ b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
>> @@ -151,6 +151,7 @@
>> #define MI_BATCH_GTT (2<<6) /* aliased with (1<<7) on gen4 */
>> #define MI_BATCH_BUFFER_START_GEN8 MI_INSTR(0x31, 1)
>> #define MI_BATCH_RESOURCE_STREAMER (1<<10)
>> +#define MI_BATCH_PREDICATE (1 << 15) /* HSW+ on RCS only */
>>
>> /*
>> * 3D instructions used by the kernel
>> @@ -226,6 +227,29 @@
>> #define PIPE_CONTROL_DEPTH_CACHE_FLUSH (1<<0)
>> #define PIPE_CONTROL_GLOBAL_GTT (1<<2) /* in addr dword */
>>
>> +#define MI_MATH(x) MI_INSTR(0x1a, (x)-1)
>> +#define MI_ALU_OP(op, src1, src2) (((op) << 20) | ((src1) << 10) | (src2))
>> +/* operations */
>> +#define MI_ALU_OP_NOOP 0
>> +#define MI_ALU_OP_LOAD 128
>> +#define MI_ALU_OP_LOADINV 1152
>> +#define MI_ALU_OP_LOAD0 129
>> +#define MI_ALU_OP_LOAD1 1153
>> +#define MI_ALU_OP_ADD 256
>> +#define MI_ALU_OP_SUB 257
>> +#define MI_ALU_OP_AND 258
>> +#define MI_ALU_OP_OR 259
>> +#define MI_ALU_OP_XOR 260
>> +#define MI_ALU_OP_STORE 384
>> +#define MI_ALU_OP_STOREINV 1408
>> +/* sources */
>> +#define MI_ALU_SRC_REG(x) (x) /* 0 -> 15 */
>> +#define MI_ALU_SRC_SRCA 32
>> +#define MI_ALU_SRC_SRCB 33
>> +#define MI_ALU_SRC_ACCU 49
>> +#define MI_ALU_SRC_ZF 50
>> +#define MI_ALU_SRC_CF 51
>> +
>> /*
>> * Commands used only by the command parser
>> */
>> diff --git a/drivers/gpu/drm/i915/gt/intel_gt_types.h b/drivers/gpu/drm/i915/gt/intel_gt_types.h
>> index e625a5e320d3..0750ac49a05b 100644
>> --- a/drivers/gpu/drm/i915/gt/intel_gt_types.h
>> +++ b/drivers/gpu/drm/i915/gt/intel_gt_types.h
>> @@ -70,6 +70,11 @@ enum intel_gt_scratch_field {
>> /* 8 bytes */
>> INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA = 256,
>>
>> + /* 6 * 8 bytes */
>> + INTEL_GT_SCRATCH_FIELD_PERF_CS_GPR = 2048,
>> +
>> + /* 4 bytes */
>> + INTEL_GT_SCRATCH_FIELD_PERF_PREDICATE_RESULT_1 = 2096,
>> };
>>
>> #endif /* __INTEL_GT_TYPES_H__ */
>> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
>> index eeecdad0e3ca..6b49fda145e7 100644
>> --- a/drivers/gpu/drm/i915/i915_debugfs.c
>> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
>> @@ -3646,6 +3646,30 @@ DEFINE_SIMPLE_ATTRIBUTE(i915_wedged_fops,
>> i915_wedged_get, i915_wedged_set,
>> "%llu\n");
>>
>> +static int
>> +i915_perf_noa_delay_set(void *data, u64 val)
>> +{
>> + struct drm_i915_private *i915 = data;
>> +
>> + atomic64_set(&i915->perf.oa.noa_programming_delay, val);
>> + return 0;
>> +}
>> +
>> +static int
>> +i915_perf_noa_delay_get(void *data, u64 *val)
>> +{
>> + struct drm_i915_private *i915 = data;
>> +
>> + *val = atomic64_read(&i915->perf.oa.noa_programming_delay);
>> + return 0;
>> +}
>> +
>> +DEFINE_SIMPLE_ATTRIBUTE(i915_perf_noa_delay_fops,
>> + i915_perf_noa_delay_get,
>> + i915_perf_noa_delay_set,
>> + "%llu\n");
>> +
>> +
>> #define DROP_UNBOUND BIT(0)
>> #define DROP_BOUND BIT(1)
>> #define DROP_RETIRE BIT(2)
>> @@ -4411,6 +4435,7 @@ static const struct i915_debugfs_files {
>> const char *name;
>> const struct file_operations *fops;
>> } i915_debugfs_files[] = {
>> + {"i915_perf_noa_delay", &i915_perf_noa_delay_fops},
>> {"i915_wedged", &i915_wedged_fops},
>> {"i915_cache_sharing", &i915_cache_sharing_fops},
>> {"i915_gem_drop_caches", &i915_drop_caches_fops},
>> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
>> index df39b2ee6bd9..fe93a260bd28 100644
>> --- a/drivers/gpu/drm/i915/i915_drv.h
>> +++ b/drivers/gpu/drm/i915/i915_drv.h
>> @@ -1837,6 +1837,14 @@ struct drm_i915_private {
>>
>> struct i915_oa_ops ops;
>> const struct i915_oa_format *oa_formats;
>> +
>> + /**
>> + * A batch buffer doing a wait on the GPU for the NOA
>> + * logic to be reprogrammed.
>> + */
>> + struct i915_vma *noa_wait;
>> +
>> + atomic64_t noa_programming_delay;
>> } oa;
>> } perf;
>>
>> diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
>> index 5ba771468078..03e6908282e3 100644
>> --- a/drivers/gpu/drm/i915/i915_perf.c
>> +++ b/drivers/gpu/drm/i915/i915_perf.c
>> @@ -197,6 +197,7 @@
>>
>> #include "gem/i915_gem_context.h"
>> #include "gem/i915_gem_pm.h"
>> +#include "gt/intel_gt.h"
>> #include "gt/intel_lrc_reg.h"
>>
>> #include "i915_drv.h"
>> @@ -429,7 +430,7 @@ static int alloc_oa_config_buffer(struct drm_i915_private *i915,
>> MI_LOAD_REGISTER_IMM_MAX_REGS) * 4;
>> config_length += oa_config->flex_regs_len * 8;
>> }
>> - config_length += 4; /* MI_BATCH_BUFFER_END */
>> + config_length += 12; /* MI_BATCH_BUFFER_START into noa_wait loop */
>> config_length = ALIGN(config_length, I915_GTT_PAGE_SIZE);
>>
>> bo = i915_gem_object_create_shmem(i915, config_length);
>> @@ -446,7 +447,12 @@ static int alloc_oa_config_buffer(struct drm_i915_private *i915,
>> cs = write_cs_mi_lri(cs, oa_config->b_counter_regs, oa_config->b_counter_regs_len);
>> cs = write_cs_mi_lri(cs, oa_config->flex_regs, oa_config->flex_regs_len);
>>
>> - *cs++ = MI_BATCH_BUFFER_END;
>> +
>> + /* Jump into the NOA wait busy loop. */
>> + *cs++ = (INTEL_GEN(i915) < 8 ?
>> + MI_BATCH_BUFFER_START : MI_BATCH_BUFFER_START_GEN8);
>> + *cs++ = i915_ggtt_offset(i915->perf.oa.noa_wait);
>> + *cs++ = 0;
>>
>> i915_gem_object_flush_map(bo);
>> i915_gem_object_unpin_map(bo);
>> @@ -1467,6 +1473,7 @@ static void i915_oa_stream_destroy(struct i915_perf_stream *stream)
>> mutex_lock(&dev_priv->drm.struct_mutex);
>> dev_priv->perf.oa.exclusive_stream = NULL;
>> dev_priv->perf.oa.ops.disable_metric_set(dev_priv);
>> + i915_vma_unpin_and_release(&dev_priv->perf.oa.noa_wait, 0);
>> mutex_unlock(&dev_priv->drm.struct_mutex);
>>
>> free_oa_buffer(dev_priv);
>> @@ -1653,6 +1660,204 @@ static int alloc_oa_buffer(struct drm_i915_private *dev_priv)
>> return ret;
>> }
>>
>> +static u32 *save_register(struct drm_i915_private *i915, u32 *cs,
>> + i915_reg_t reg, u32 offset, u32 dword_count)
>> +{
>> + uint32_t d;
>> +
>> + for (d = 0; d < dword_count; d++) {
>> + *cs++ = INTEL_GEN(i915) >= 8 ?
>> + MI_STORE_REGISTER_MEM_GEN8 : MI_STORE_REGISTER_MEM;
>> + *cs++ = i915_mmio_reg_offset(reg) + 4 * d;
>> + *cs++ = intel_gt_scratch_offset(&i915->gt, offset) + 4 * d;
>> + if (INTEL_GEN(i915) >= 8)
>> + *cs++ = 0;
> Will anyone care about the extra nop on hsw? :)
Alright :)
I didn't notice I'd left a NOOP below too.
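Something like this, then? (untested sketch — the unconditional trailing
dword just decodes as an MI_NOOP on HSW, and the restore side can mirror
it with MI_LOAD_REGISTER_MEM):

static u32 *save_register(struct drm_i915_private *i915, u32 *cs,
                          i915_reg_t reg, u32 offset, u32 dword_count)
{
        u32 d;

        for (d = 0; d < dword_count; d++) {
                *cs++ = INTEL_GEN(i915) >= 8 ?
                        MI_STORE_REGISTER_MEM_GEN8 : MI_STORE_REGISTER_MEM;
                *cs++ = i915_mmio_reg_offset(reg) + 4 * d;
                *cs++ = intel_gt_scratch_offset(&i915->gt, offset) + 4 * d;
                /* Upper address dword on gen8+, harmless MI_NOOP on HSW. */
                *cs++ = 0;
        }

        return cs;
}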
>
>> +static int alloc_noa_wait(struct drm_i915_private *i915)
>> +{
>> + struct drm_i915_gem_object *bo;
>> + struct i915_vma *vma;
>> + u64 delay_ns = atomic64_read(&i915->perf.oa.noa_programming_delay), delay_ticks;
>> + u32 *batch, *ts0, *cs, *jump;
>> + int ret, i;
>> +
>> + bo = i915_gem_object_create_shmem(i915, 4096);
> So swappable. At 4k, we almost consume as much in our bookkeeping as we
> allocate for the backing store. Yes, it's atrocious.
>
> Hang on. This can never be unpinned as it stores absolute addresses. So
> this can be i915_gem_object_create_internal().
>
>> + if (IS_ERR(bo)) {
>> + DRM_ERROR("Failed to allocate NOA wait batchbuffer\n");
>> + return PTR_ERR(bo);
>> + }
>> +
>> + /*
>> + * We pin in GGTT because multiple OA config BOs will jump into
>> + * this buffer by absolute address, so it needs to stay fixed
>> + * during the lifetime of the i915/perf stream.
>> + */
>> + vma = i915_gem_object_ggtt_pin(bo, NULL, 0, 4096, 0);
>> + if (IS_ERR(vma)) {
>> + ret = PTR_ERR(vma);
>> + goto err_unref;
>> + }
>> +
>> + batch = cs = i915_gem_object_pin_map(bo, I915_MAP_WB);
>> + if (IS_ERR(batch)) {
>> + ret = PTR_ERR(batch);
>> + goto err_unpin;
>> + }
>> +
>> + /* Save registers. */
>> + for (i = 0; i <= 5; i++) {
>> + cs = save_register(i915, cs, HSW_CS_GPR(i),
>> + INTEL_GT_SCRATCH_FIELD_PERF_CS_GPR + 8 * i, 2);
>> + }
>> + cs = save_register(i915, cs, MI_PREDICATE_RESULT_1,
>> + INTEL_GT_SCRATCH_FIELD_PERF_PREDICATE_RESULT_1, 1);
>> +
>> + /* First timestamp snapshot location. */
>> + ts0 = cs;
> Did this ever get used?
Oops, my mistake. This is buggy: the jump back to the beginning should
come back here.
>
>> + /*
>> + * Initial snapshot of the timestamp register to implement the wait.
>> + * We work with 32b values, so clear out the top 32 bits of the
>> + * register because the ALU works on 64 bits.
>> + */
>> + *cs++ = MI_LOAD_REGISTER_IMM(1);
>> + *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(0)) + 4;
>> + *cs++ = 0;
>> + *cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
>> + *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP(RENDER_RING_BASE));
>> + *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(0));
>> +
>> + /*
>> + * This is the location we're going to jump back into until the
>> + * required amount of time has passed.
>> + */
>> + jump = cs;
>> +
>> + /*
>> + * Take another snapshot of the timestamp register. Take care to clear
>> + * out the top 32 bits of CS_GPR(1) as we're using it for other
>> + * operations below.
>> + */
>> + *cs++ = MI_LOAD_REGISTER_IMM(1);
>> + *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(1)) + 4;
>> + *cs++ = 0;
>> + *cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
>> + *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP(RENDER_RING_BASE));
>> + *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(1));
> cs = get_timestamp(cs, 1);
>
> enum { START, NOW, DELTA, RESULT, TARGET } ?
>
> That would also help with save/restore registers, as all CS_GPR should
> then be named.
Very sensible :)
If only you had
https://gitlab.freedesktop.org/mesa/mesa/blob/master/src/intel/common/gen_mi_builder.h
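Agreed, something along these lines (rough sketch; the enum names and the
emit_timestamp_snapshot() helper are just placeholders I'm making up here):

enum { START_TS, NOW_TS, DELTA_TS, JUMP_PREDICATE, DELTA_TARGET, N_CS_GPR };

static u32 *emit_timestamp_snapshot(u32 *cs, int gpr)
{
        /* Zero the upper 32 bits, the ALU operates on 64-bit GPRs. */
        *cs++ = MI_LOAD_REGISTER_IMM(1);
        *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(gpr)) + 4;
        *cs++ = 0;

        /* Snapshot the CS timestamp into the lower 32 bits. */
        *cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
        *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP(RENDER_RING_BASE));
        *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(gpr));

        return cs;
}

The save/restore loops could then iterate up to N_CS_GPR, and the MI_MATH
blocks could use the named registers instead of bare indices.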
>
>> + /*
>> + * Do a diff between the 2 timestamps and store the result back into
>> + * CS_GPR(1).
>> + */
>> + *cs++ = MI_MATH(5);
>> + *cs++ = MI_ALU_OP(MI_ALU_OP_LOAD, MI_ALU_SRC_SRCA, MI_ALU_SRC_REG(1));
>> + *cs++ = MI_ALU_OP(MI_ALU_OP_LOAD, MI_ALU_SRC_SRCB, MI_ALU_SRC_REG(0));
>> + *cs++ = MI_ALU_OP(MI_ALU_OP_SUB, 0, 0);
>> + *cs++ = MI_ALU_OP(MI_ALU_OP_STORE, MI_ALU_SRC_REG(2), MI_ALU_SRC_ACCU);
>> + *cs++ = MI_ALU_OP(MI_ALU_OP_STORE, MI_ALU_SRC_REG(3), MI_ALU_SRC_CF);
>> +
>> + /*
>> + * Transfer the carry flag (set to 1 if ts1 < ts0, meaning the
>> + * timestamp has rolled over the 32 bits) into the predicate register
>> + * to be used for the predicated jump.
>> + */
>> + *cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
>> + *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(3));
>> + *cs++ = i915_mmio_reg_offset(MI_PREDICATE_RESULT_1);
>> +
>> + /* Restart from the beginning if we had timestamps roll over. */
>> + *cs++ = (INTEL_GEN(i915) < 8 ?
>> + MI_BATCH_BUFFER_START : MI_BATCH_BUFFER_START_GEN8) |
>> + MI_BATCH_PREDICATE;
>> + *cs++ = vma->node.start;
>> + *cs++ = 0;
>> +
>> + /*
>> + * Now add the diff between the two previous timestamps to:
>> + * ((1 << 64) - 1) - delay_ns (converted to timestamp ticks)
>> + *
>> + * When the Carry Flag contains 1 this means the elapsed time is
>> + * longer than the expected delay, and we can exit the wait loop.
>> + */
>> + delay_ticks = 0xffffffffffffffff -
>> + DIV64_U64_ROUND_UP(delay_ns *
>> + RUNTIME_INFO(i915)->cs_timestamp_frequency_khz,
>> + 1000000ull);
>> + *cs++ = MI_LOAD_REGISTER_IMM(2);
>> + *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(4));
>> + *cs++ = lower_32_bits(delay_ticks);
>> + *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(4)) + 4;
>> + *cs++ = upper_32_bits(delay_ticks);
> Now, I was expecting to compute the 32b end timestamp and compare now to
> that using the carry-flag to indicate the completion.
>
> Why the detour? (I'm sure I am missing something here.)
Probably trying to save a register... My thinking is not that deep either ;)
>
> Quick question: are requests for >32b delays rejected in the user debug api?
Nope :( Will add.
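Probably something like this in the debugfs setter (untested sketch,
reusing the same ns->ticks conversion as in alloc_noa_wait()):

static int
i915_perf_noa_delay_set(void *data, u64 val)
{
        struct drm_i915_private *i915 = data;

        /*
         * The noa_wait batch only works with a 32b timestamp delta, so
         * reject any delay whose tick count would not fit in 32 bits.
         */
        if (DIV64_U64_ROUND_UP(val * RUNTIME_INFO(i915)->cs_timestamp_frequency_khz,
                               1000000ull) > U32_MAX)
                return -EINVAL;

        atomic64_set(&i915->perf.oa.noa_programming_delay, val);
        return 0;
}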
>
>> + *cs++ = MI_MATH(4);
>> + *cs++ = MI_ALU_OP(MI_ALU_OP_LOAD, MI_ALU_SRC_SRCA, MI_ALU_SRC_REG(2));
>> + *cs++ = MI_ALU_OP(MI_ALU_OP_LOAD, MI_ALU_SRC_SRCB, MI_ALU_SRC_REG(4));
>> + *cs++ = MI_ALU_OP(MI_ALU_OP_ADD, 0, 0);
>> + *cs++ = MI_ALU_OP(MI_ALU_OP_STOREINV, MI_ALU_SRC_REG(5), MI_ALU_SRC_CF);
> Comparing delta against target delay. Inverted store to give
> (delay - target) >= 0
I don't trust myself too much with ALU changes ;)
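(Writing the arithmetic out to convince myself: GPR4 holds
(2^64 - 1) - target_ticks, so the MI_MATH ADD computes
delta + (2^64 - 1) - target_ticks, which carries out of 64 bits exactly
once delta exceeds target_ticks. Storing the inverted carry into GPR5
means the predicated jump back keeps firing while the elapsed ticks are
still below the target, and we fall through to the register restore once
the delay has elapsed.)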
>
>> + /*
>> + * Transfer the result into the predicate register to be used for the
>> + * predicated jump.
>> + */
>> + *cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
>> + *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(5));
>> + *cs++ = i915_mmio_reg_offset(MI_PREDICATE_RESULT_1);
>> +
>> + /* Predicate the jump. */
>> + *cs++ = (INTEL_GEN(i915) < 8 ?
>> + MI_BATCH_BUFFER_START : MI_BATCH_BUFFER_START_GEN8) |
>> + MI_BATCH_PREDICATE;
> The joy of being on rcs.
You'll be delighted to know about this new TGL generation where all CS
(afaict) have graduated from predication.
>
>> + *cs++ = vma->node.start + (jump - batch) * 4;
> Mutters i915_ggtt_offset(vma)
>
>> + *cs++ = 0;
>> +
>> + /* Restore registers. */
>> + for (i = 0; i <= 5; i++) {
>> + cs = restore_register(i915, cs, HSW_CS_GPR(i),
>> + INTEL_GT_SCRATCH_FIELD_PERF_CS_GPR + 8 * i, 2);
>> + }
>> + cs = restore_register(i915, cs, MI_PREDICATE_RESULT_1,
>> + INTEL_GT_SCRATCH_FIELD_PERF_PREDICATE_RESULT_1, 1);
>> +
>> + /* And return to the ring. */
>> + *cs++ = MI_BATCH_BUFFER_END;
> GEM_BUG_ON(cs - batch > PAGE_SIZE/sizeof(*batch));
Done!
>
>> +
>> + i915_gem_object_flush_map(bo);
>> + i915_gem_object_unpin_map(bo);
>> +
>> + i915->perf.oa.noa_wait = vma;
>> +
>> + return 0;
>> +
>> +err_unpin:
>> + __i915_vma_unpin(vma);
>> +
>> +err_unref:
>> + i915_gem_object_put(bo);
>> +
>> + return ret;
>> +}
> Certainly seems to do what you say on the tin.
> -Chris
>