[Mesa-dev] [PATCH] i965: Allow 48-bit addressing on Gen8+.

Jordan Justen jordan.l.justen at intel.com
Tue Feb 27 22:06:16 UTC 2018


On 2018-02-26 16:05:46, Kenneth Graunke wrote:
> This allows most GPU objects to use the full 48-bit address space
> offered by Gen8+ platforms, rather than being stuck with 32-bit.
> This expands the available GPU memory from 4GB to 256TB or so.
> 
> A few objects - instruction, scratch, and vertex buffers - need to
> remain pinned in the low 4GB of the address space for various reasons.
> We default everything to 48-bit but disable it in those cases.
> 
> Thanks to Jason Ekstrand for blazing this trail in anv first and
> finding the nasty undocumented hardware issues.  This patch simply
> rips off all of his findings.
> ---
>  src/mesa/drivers/dri/i965/brw_bufmgr.c           | 23 +++++++++
>  src/mesa/drivers/dri/i965/brw_misc_state.c       | 13 +++--
>  src/mesa/drivers/dri/i965/brw_wm_surface_state.c | 23 ++++++---
>  src/mesa/drivers/dri/i965/genX_blorp_exec.c      |  9 ++++
>  src/mesa/drivers/dri/i965/genX_state_upload.c    | 60 ++++++++++++++++++++----
>  src/mesa/drivers/dri/i965/intel_batchbuffer.c    | 15 ++++++
>  src/mesa/drivers/dri/i965/intel_batchbuffer.h    |  2 +
>  7 files changed, 127 insertions(+), 18 deletions(-)
> 
> diff --git a/src/mesa/drivers/dri/i965/brw_bufmgr.c b/src/mesa/drivers/dri/i965/brw_bufmgr.c
> index fb180289a0c..2e54adb3ed2 100644
> --- a/src/mesa/drivers/dri/i965/brw_bufmgr.c
> +++ b/src/mesa/drivers/dri/i965/brw_bufmgr.c
> @@ -119,6 +119,7 @@ struct brw_bufmgr {
>     bool has_llc:1;
>     bool has_mmap_wc:1;
>     bool bo_reuse:1;
> +   bool supports_48b_addresses:1;
>  };
>  
>  static int bo_set_tiling_internal(struct brw_bo *bo, uint32_t tiling_mode,
> @@ -409,6 +410,8 @@ retry:
>     bo->reusable = true;
>     bo->cache_coherent = bufmgr->has_llc;
>     bo->index = -1;
> +   if (bufmgr->supports_48b_addresses)
> +      bo->kflags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
>  
>     mtx_unlock(&bufmgr->lock);
>  
> @@ -1385,6 +1388,24 @@ gem_param(int fd, int name)
>     return v;
>  }
>  
> +static bool
> +gem_supports_48b_addresses(int fd)
> +{
> +   struct drm_i915_gem_exec_object2 obj = {
> +      .flags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS,
> +   };
> +
> +   struct drm_i915_gem_execbuffer2 execbuf = {
> +      .buffers_ptr = (uintptr_t)&obj,
> +      .buffer_count = 1,
> +      .rsvd1 = 0xffffffu,
> +   };
> +
> +   int ret = drmIoctl(fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &execbuf);
> +
> +   return ret == -1 && errno == ENOENT;
> +}
> +
>  /**
>   * Initializes the GEM buffer manager, which uses the kernel to allocate, map,
>  * and manage map buffer objects.
> @@ -1418,6 +1439,8 @@ brw_bufmgr_init(struct gen_device_info *devinfo, int fd)
>  
>     bufmgr->has_llc = devinfo->has_llc;
>     bufmgr->has_mmap_wc = gem_param(fd, I915_PARAM_MMAP_VERSION) > 0;
> +   bufmgr->supports_48b_addresses =
> +      devinfo->gen >= 8 && gem_supports_48b_addresses(fd);
>  
>     init_cache_buckets(bufmgr);
>  
> diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/dri/i965/brw_misc_state.c
> index c4ef6812bff..29d74876c27 100644
> --- a/src/mesa/drivers/dri/i965/brw_misc_state.c
> +++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
> @@ -634,6 +634,12 @@ brw_upload_state_base_address(struct brw_context *brw)
>     }
>  
>     if (devinfo->gen >= 8) {
> +      /* STATE_BASE_ADDRESS has issues with 48-bit address spaces.  If the
> +       * address + size as seen by STATE_BASE_ADDRESS overflows 48 bits,
> +       * the GPU appears to treat all accesses to the buffer as being out
> +       * of bounds and returns zero.  To work around this, we pin all SBAs
> +       * to the bottom 4GB.
> +       */
>        uint32_t mocs_wb = devinfo->gen >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB;
>        int pkt_len = devinfo->gen >= 9 ? 19 : 16;
>  
> @@ -644,15 +650,14 @@ brw_upload_state_base_address(struct brw_context *brw)
>        OUT_BATCH(0);
>        OUT_BATCH(mocs_wb << 16);
>        /* Surface state base address: */
> -      OUT_RELOC64(brw->batch.state.bo, 0, mocs_wb << 4 | 1);
> +      OUT_RELOC64(brw->batch.state.bo, RELOC_32BIT, mocs_wb << 4 | 1);

Lines like this seem a little confusing with the RELOC_32BIT name.
What about something like RELOC_BELOW4G?

Reviewed-by: Jordan Justen <jordan.l.justen at intel.com>

>        /* Dynamic state base address: */
> -      OUT_RELOC64(brw->batch.state.bo, 0, mocs_wb << 4 | 1);
> +      OUT_RELOC64(brw->batch.state.bo, RELOC_32BIT, mocs_wb << 4 | 1);
>        /* Indirect object base address: MEDIA_OBJECT data */
>        OUT_BATCH(mocs_wb << 4 | 1);
>        OUT_BATCH(0);
>        /* Instruction base address: shader kernels (incl. SIP) */
> -      OUT_RELOC64(brw->cache.bo, 0, mocs_wb << 4 | 1);
> -
> +      OUT_RELOC64(brw->cache.bo, RELOC_32BIT, mocs_wb << 4 | 1);
>        /* General state buffer size */
>        OUT_BATCH(0xfffff001);
>        /* Dynamic state buffer size */
> diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
> index 0b6016427bd..55e752261a5 100644
> --- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
> +++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
> @@ -203,12 +203,23 @@ brw_emit_surface_state(struct brw_context *brw,
>         * FIXME: move to the point of assignment.
>         */
>        assert((aux_offset & 0xfff) == 0);
> -      uint32_t *aux_addr = state + brw->isl_dev.ss.aux_addr_offset;
> -      *aux_addr = brw_state_reloc(&brw->batch,
> -                                  *surf_offset +
> -                                  brw->isl_dev.ss.aux_addr_offset,
> -                                  aux_bo, *aux_addr,
> -                                  reloc_flags);
> +
> +      if (devinfo->gen >= 8) {
> +         uint64_t *aux_addr = state + brw->isl_dev.ss.aux_addr_offset;
> +         *aux_addr = brw_state_reloc(&brw->batch,
> +                                     *surf_offset +
> +                                     brw->isl_dev.ss.aux_addr_offset,
> +                                     aux_bo, *aux_addr,
> +                                     reloc_flags);
> +      } else {
> +         uint32_t *aux_addr = state + brw->isl_dev.ss.aux_addr_offset;
> +         *aux_addr = brw_state_reloc(&brw->batch,
> +                                     *surf_offset +
> +                                     brw->isl_dev.ss.aux_addr_offset,
> +                                     aux_bo, *aux_addr,
> +                                     reloc_flags);
> +
> +      }
>     }
>  }
>  
> diff --git a/src/mesa/drivers/dri/i965/genX_blorp_exec.c b/src/mesa/drivers/dri/i965/genX_blorp_exec.c
> index 062171af600..2bdd93e9bdd 100644
> --- a/src/mesa/drivers/dri/i965/genX_blorp_exec.c
> +++ b/src/mesa/drivers/dri/i965/genX_blorp_exec.c
> @@ -166,6 +166,15 @@ blorp_alloc_vertex_buffer(struct blorp_batch *batch, uint32_t size,
>        .buffer = brw->batch.state.bo,
>        .offset = offset,
>  
> +      /* The VF cache designers apparently cut corners, and made the cache
> +       * only consider the bottom 32 bits of memory addresses.  If you happen
> +       * to have two vertex buffers which get placed exactly 4 GiB apart and
> +       * use them in back-to-back draw calls, you can get collisions.  To work
> +       * around this problem, we restrict vertex buffers to the low 32 bits of
> +       * the address space.
> +       */
> +      .reloc_flags = RELOC_32BIT,
> +
>  #if GEN_GEN == 10
>        .mocs = CNL_MOCS_WB,
>  #elif GEN_GEN == 9
> diff --git a/src/mesa/drivers/dri/i965/genX_state_upload.c b/src/mesa/drivers/dri/i965/genX_state_upload.c
> index 8668abd591f..0ab1f12152a 100644
> --- a/src/mesa/drivers/dri/i965/genX_state_upload.c
> +++ b/src/mesa/drivers/dri/i965/genX_state_upload.c
> @@ -101,7 +101,7 @@ __gen_combine_address(struct brw_context *brw, void *location,
>     }
>  }
>  
> -static struct brw_address
> +UNUSED static struct brw_address
>  rw_bo(struct brw_bo *bo, uint32_t offset)
>  {
>     return (struct brw_address) {
> @@ -120,6 +120,26 @@ ro_bo(struct brw_bo *bo, uint32_t offset)
>     };
>  }
>  
> +static struct brw_address
> +rw_32_bo(struct brw_bo *bo, uint32_t offset)
> +{
> +   return (struct brw_address) {
> +            .bo = bo,
> +            .offset = offset,
> +            .reloc_flags = RELOC_WRITE | RELOC_32BIT,
> +   };
> +}
> +
> +static struct brw_address
> +ro_32_bo(struct brw_bo *bo, uint32_t offset)
> +{
> +   return (struct brw_address) {
> +            .bo = bo,
> +            .offset = offset,
> +            .reloc_flags = RELOC_32BIT,
> +   };
> +}
> +
>  UNUSED static struct brw_address
>  ggtt_bo(struct brw_bo *bo, uint32_t offset)
>  {
> @@ -317,7 +337,15 @@ genX(emit_vertex_buffer_state)(struct brw_context *brw,
>     struct GENX(VERTEX_BUFFER_STATE) buf_state = {
>        .VertexBufferIndex = buffer_nr,
>        .BufferPitch = stride,
> -      .BufferStartingAddress = ro_bo(bo, start_offset),
> +
> +      /* The VF cache designers apparently cut corners, and made the cache
> +       * only consider the bottom 32 bits of memory addresses.  If you happen
> +       * to have two vertex buffers which get placed exactly 4 GiB apart and
> +       * use them in back-to-back draw calls, you can get collisions.  To work
> +       * around this problem, we restrict vertex buffers to the low 32 bits of
> +       * the address space.
> +       */
> +      .BufferStartingAddress = ro_32_bo(bo, start_offset),
>  #if GEN_GEN >= 8
>        .BufferSize = end_offset - start_offset,
>  #endif
> @@ -858,7 +886,15 @@ genX(emit_index_buffer)(struct brw_context *brw)
>        ib.CutIndexEnable = brw->prim_restart.enable_cut_index;
>  #endif
>        ib.IndexFormat = brw_get_index_type(index_buffer->index_size);
> -      ib.BufferStartingAddress = ro_bo(brw->ib.bo, 0);
> +
> +      /* The VF cache designers apparently cut corners, and made the cache
> +       * only consider the bottom 32 bits of memory addresses.  If you happen
> +       * to have two index buffers which get placed exactly 4 GiB apart and
> +       * use them in back-to-back draw calls, you can get collisions.  To work
> +       * around this problem, we restrict index buffers to the low 32 bits of
> +       * the address space.
> +       */
> +      ib.BufferStartingAddress = ro_32_bo(brw->ib.bo, 0);
>  #if GEN_GEN >= 8
>        ib.IndexBufferMOCS = GEN_GEN >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB;
>        ib.BufferSize = brw->ib.size;
> @@ -1895,7 +1931,7 @@ genX(upload_wm)(struct brw_context *brw)
>  #endif
>  
>        if (wm_prog_data->base.total_scratch) {
> -         wm.ScratchSpaceBasePointer = rw_bo(stage_state->scratch_bo, 0);
> +         wm.ScratchSpaceBasePointer = rw_32_bo(stage_state->scratch_bo, 0);
>           wm.PerThreadScratchSpace =
>              ffs(stage_state->per_thread_scratch) - 11;
>        }
> @@ -2014,6 +2050,14 @@ static const struct brw_tracked_state genX(wm_state) = {
>  
>  /* ---------------------------------------------------------------------- */
>  
> +/* We restrict scratch buffers to the bottom 32 bits of the address space
> + * by using rw_32_bo().
> + *
> + * General State Base Address is a bit broken.  If the address + size as
> + * seen by STATE_BASE_ADDRESS overflows 48 bits, the GPU appears to treat
> + * all accesses to the buffer as being out of bounds and returns zero.
> + */
> +
>  #define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix) \
>     pkt.KernelStartPointer = KSP(brw, stage_state->prog_offset);           \
>     pkt.SamplerCount       =                                               \
> @@ -2023,7 +2067,7 @@ static const struct brw_tracked_state genX(wm_state) = {
>     pkt.FloatingPointMode  = stage_prog_data->use_alt_mode;                \
>                                                                            \
>     if (stage_prog_data->total_scratch) {                                  \
> -      pkt.ScratchSpaceBasePointer = rw_bo(stage_state->scratch_bo, 0);    \
> +      pkt.ScratchSpaceBasePointer = rw_32_bo(stage_state->scratch_bo, 0); \
>        pkt.PerThreadScratchSpace =                                         \
>           ffs(stage_state->per_thread_scratch) - 11;                       \
>     }                                                                      \
> @@ -3892,8 +3936,8 @@ genX(upload_ps)(struct brw_context *brw)
>  
>        if (prog_data->base.total_scratch) {
>           ps.ScratchSpaceBasePointer =
> -            rw_bo(stage_state->scratch_bo,
> -                  ffs(stage_state->per_thread_scratch) - 11);
> +            rw_32_bo(stage_state->scratch_bo,
> +                     ffs(stage_state->per_thread_scratch) - 11);
>        }
>     }
>  }
> @@ -4214,7 +4258,7 @@ genX(upload_cs_state)(struct brw_context *brw)
>               */
>              per_thread_scratch_value = stage_state->per_thread_scratch / 1024 - 1;
>           }
> -         vfe.ScratchSpaceBasePointer = rw_bo(stage_state->scratch_bo, 0);
> +         vfe.ScratchSpaceBasePointer = rw_32_bo(stage_state->scratch_bo, 0);
>           vfe.PerThreadScratchSpace = per_thread_scratch_value;
>        }
>  
> diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
> index 26718e0d1a2..0a8d3a80b64 100644
> --- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
> +++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
> @@ -1093,6 +1093,21 @@ emit_reloc(struct intel_batchbuffer *batch,
>     unsigned int index = add_exec_bo(batch, target);
>     struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[index];
>  
> +   if (reloc_flags & RELOC_32BIT) {
> +      /* Restrict this buffer to the low 32 bits of the address space.
> +       *
> +       * Altering the validation list flags restricts it for this batch,
> +       * but we also alter the BO's kflags to restrict it permanently
> +       * (until the BO is destroyed and put back in the cache).  Buffers
> +       * may stay bound across batches, and we want to keep it constrained.
> +       */
> +      target->kflags &= ~EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
> +      entry->flags &= ~EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
> +
> +      /* RELOC_32BIT is not an EXEC_OBJECT_* flag, so get rid of it. */
> +      reloc_flags &= ~RELOC_32BIT;
> +   }
> +
>     if (reloc_flags)
>        entry->flags |= reloc_flags & batch->valid_reloc_flags;
>  
> diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.h b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
> index a9a34600ad1..7be5b10f3ab 100644
> --- a/src/mesa/drivers/dri/i965/intel_batchbuffer.h
> +++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
> @@ -53,6 +53,8 @@ bool brw_batch_references(struct intel_batchbuffer *batch, struct brw_bo *bo);
>  
>  #define RELOC_WRITE EXEC_OBJECT_WRITE
>  #define RELOC_NEEDS_GGTT EXEC_OBJECT_NEEDS_GTT
> +/* Inverted meaning, but using the same bit...emit_reloc will flip it. */
> +#define RELOC_32BIT EXEC_OBJECT_SUPPORTS_48B_ADDRESS
>  uint64_t brw_batch_reloc(struct intel_batchbuffer *batch,
>                           uint32_t batch_offset,
>                           struct brw_bo *target,
> -- 
> 2.16.1
> 
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev


More information about the mesa-dev mailing list