[Intel-gfx] [PATCH v3 1/6] drm/i915/gt: Use XY_FASR_COLOR_BLT to clear obj on graphics ver 12+

Mon Mar 7 14:29:22 UTC 2022

Hi, Ram.

Typo in patch title FASR/FAST

On Mon, 2022-03-07 at 19:10 +0530, Ramalingam C wrote:
> XY_FAST_COLOR_BLT cmd is faster than the older XY_COLOR_BLT. Hence
> for
> clearing (Zero out) the pages of the newly allocated object, faster
> cmd
> is used.
> 
> Signed-off-by: Ramalingam C <ramalingam.c at intel.com>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/gt/intel_gpu_commands.h |  5 ++
>  drivers/gpu/drm/i915/gt/intel_migrate.c      | 51 +++++++++++++++++-
> --
>  2 files changed, 49 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
> b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
> index d112ffd56418..925e55b6a94f 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
> +++ b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
> @@ -205,6 +205,11 @@
>  
>  #define COLOR_BLT_CMD                  (2 << 29 | 0x40 << 22 | (5 -
> 2))
>  #define XY_COLOR_BLT_CMD               (2 << 29 | 0x50 << 22)
> +#define XY_FAST_COLOR_BLT_CMD          (2 << 29 | 0x44 << 22)
> +#define   XY_FAST_COLOR_BLT_DEPTH_32   (2 << 19)
> +#define   XY_FAST_COLOR_BLT_DW         16
> +#define   XY_FAST_COLOR_BLT_MOCS_MASK  GENMASK(27, 21)
> +#define   XY_FAST_COLOR_BLT_MEM_TYPE_SHIFT 31
>  #define SRC_COPY_BLT_CMD               (2 << 29 | 0x43 << 22)
>  #define GEN9_XY_FAST_COPY_BLT_CMD      (2 << 29 | 0x42 << 22)
>  #define XY_SRC_COPY_BLT_CMD            (2 << 29 | 0x53 << 22)
> diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c
> b/drivers/gpu/drm/i915/gt/intel_migrate.c
> index 20444d6ceb3c..cb68f7bf6b28 100644
> --- a/drivers/gpu/drm/i915/gt/intel_migrate.c
> +++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
> @@ -16,6 +16,8 @@ struct insert_pte_data {
>  };
>  
>  #define CHUNK_SZ SZ_8M /* ~1ms at 8GiB/s preemption delay */
> +#define GET_CCS_BYTES(i915, size)      (HAS_FLAT_CCS(i915) ? \
> +                                        DIV_ROUND_UP(size,
> NUM_BYTES_PER_CCS_BYTE) : 0)
>  
>  static bool engine_supports_migration(struct intel_engine_cs
> *engine)
>  {
> @@ -614,20 +616,56 @@ intel_context_migrate_copy(struct intel_context
> *ce,
>         return err;
>  }
>  
> -static int emit_clear(struct i915_request *rq, u64 offset, int size,
> u32 value)
> +static int emit_clear(struct i915_request *rq, u64 offset, int size,
> +                     u32 value, bool is_lmem)
>  {
> -       const int ver = GRAPHICS_VER(rq->engine->i915);
> -       u32 *cs;
> +       struct drm_i915_private *i915 = rq->engine->i915;
> +       int mocs = rq->engine->gt->mocs.uc_index << 1;
> +       const int ver = GRAPHICS_VER(i915);
> +       u32 *cs, mem_type = 0;
> +       int ring_sz;
>  
>         GEM_BUG_ON(size >> PAGE_SHIFT > S16_MAX);
>  
>         offset += (u64)rq->engine->instance << 32;
>  
> -       cs = intel_ring_begin(rq, ver >= 8 ? 8 : 6);
> +       if (ver >= 12)
> +               ring_sz = 16;

Noting that DG1 doesn't use more than 11 dwords? Doesn't matter much I
guess if we pad with NOP.

> +       else if (ver >= 8)
> +               ring_sz = 8;
> +       else
> +               ring_sz = 6;
> +
> +       if (!is_lmem)
> +               mem_type = 1 << XY_FAST_COLOR_BLT_MEM_TYPE_SHIFT;

Should we use the MEM_TYPE macros so it becomes clearer what we're
doing? 

Also does DG1 support the mocs and mem_type fields? If not should we
set these to 0 for relevant hardware?

> +
> +       cs = intel_ring_begin(rq, ring_sz);
>         if (IS_ERR(cs))
>                 return PTR_ERR(cs);
>  
> -       if (ver >= 8) {
> +       if (ver >= 12) {
> +               *cs++ = XY_FAST_COLOR_BLT_CMD |
> XY_FAST_COLOR_BLT_DEPTH_32 |
> +                       (XY_FAST_COLOR_BLT_DW - 2);
> +               *cs++ = FIELD_PREP(XY_FAST_COLOR_BLT_MOCS_MASK, mocs)
> |
> +                       (PAGE_SIZE - 1);
> +               *cs++ = 0;
> +               *cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
> +               *cs++ = lower_32_bits(offset);
> +               *cs++ = upper_32_bits(offset);
> +               *cs++ = mem_type;
> +               /* BG7 */
> +               *cs++ = value;
> +               *cs++ = 0;
> +               *cs++ = 0;
> +               *cs++ = 0;
> +               /* BG11 */
> +               *cs++ = 0;
> +               *cs++ = 0;
> +               /* BG13 */
> +               *cs++ = 0;
> +               *cs++ = 0;
> +               *cs++ = 0;
> +       } else if (ver >= 8) {
>                 *cs++ = XY_COLOR_BLT_CMD | BLT_WRITE_RGBA | (7 - 2);
>                 *cs++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY |
> PAGE_SIZE;
>                 *cs++ = 0;
> @@ -645,7 +683,6 @@ static int emit_clear(struct i915_request *rq,
> u64 offset, int size, u32 value)
>                 *cs++ = lower_32_bits(offset);
>                 *cs++ = value;
>         }
> -
>         intel_ring_advance(rq, cs);
>         return 0;
>  }
> @@ -711,7 +748,7 @@ intel_context_migrate_clear(struct intel_context
> *ce,
>                 if (err)
>                         goto out_rq;
>  
> -               err = emit_clear(rq, offset, len, value);
> +               err = emit_clear(rq, offset, len, value, is_lmem);
>  
>                 /* Arbitration is re-enabled between requests. */
>  out_rq:

Otherwise looks good.

/Thomas

----------------------------------------------------------------------
Intel Sweden AB
Registered Office: Isafjordsgatan 30B, 164 40 Kista, Stockholm, Sweden
Registration Number: 556189-6027

This e-mail and any attachments may contain confidential material for
the sole use of the intended recipient(s). Any review or distribution
by others is strictly prohibited. If you are not the intended
recipient, please contact the sender and delete all copies.