[Mesa-dev] [PATCH] i965: Allow 48-bit addressing on Gen8+.
Jordan Justen
jordan.l.justen at intel.com
Tue Feb 27 22:06:16 UTC 2018
On 2018-02-26 16:05:46, Kenneth Graunke wrote:
> This allows most GPU objects to use the full 48-bit address space
> offered by Gen8+ platforms, rather than being stuck with 32-bit.
> This expands the available GPU memory from 4G to 256TB or so.
>
> A few objects - instruction, scratch, and vertex buffers - need to
> remain pinned in the low 4GB of the address space for various reasons.
> We default everything to 48-bit but disable it in those cases.
>
> Thanks to Jason Ekstrand for blazing this trail in anv first and
> finding the nasty undocumented hardware issues. This patch simply
> rips off all of his findings.
> ---
> src/mesa/drivers/dri/i965/brw_bufmgr.c | 23 +++++++++
> src/mesa/drivers/dri/i965/brw_misc_state.c | 13 +++--
> src/mesa/drivers/dri/i965/brw_wm_surface_state.c | 23 ++++++---
> src/mesa/drivers/dri/i965/genX_blorp_exec.c | 9 ++++
> src/mesa/drivers/dri/i965/genX_state_upload.c | 60 ++++++++++++++++++++----
> src/mesa/drivers/dri/i965/intel_batchbuffer.c | 15 ++++++
> src/mesa/drivers/dri/i965/intel_batchbuffer.h | 2 +
> 7 files changed, 127 insertions(+), 18 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_bufmgr.c b/src/mesa/drivers/dri/i965/brw_bufmgr.c
> index fb180289a0c..2e54adb3ed2 100644
> --- a/src/mesa/drivers/dri/i965/brw_bufmgr.c
> +++ b/src/mesa/drivers/dri/i965/brw_bufmgr.c
> @@ -119,6 +119,7 @@ struct brw_bufmgr {
> bool has_llc:1;
> bool has_mmap_wc:1;
> bool bo_reuse:1;
> + bool supports_48b_addresses:1;
> };
>
> static int bo_set_tiling_internal(struct brw_bo *bo, uint32_t tiling_mode,
> @@ -409,6 +410,8 @@ retry:
> bo->reusable = true;
> bo->cache_coherent = bufmgr->has_llc;
> bo->index = -1;
> + if (bufmgr->supports_48b_addresses)
> + bo->kflags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
>
> mtx_unlock(&bufmgr->lock);
>
> @@ -1385,6 +1388,24 @@ gem_param(int fd, int name)
> return v;
> }
>
> +static bool
> +gem_supports_48b_addresses(int fd)
> +{
> + struct drm_i915_gem_exec_object2 obj = {
> + .flags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS,
> + };
> +
> + struct drm_i915_gem_execbuffer2 execbuf = {
> + .buffers_ptr = (uintptr_t)&obj,
> + .buffer_count = 1,
> + .rsvd1 = 0xffffffu,
> + };
> +
> + int ret = drmIoctl(fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &execbuf);
> +
> + return ret == -1 && errno == ENOENT;
> +}
> +
> /**
> * Initializes the GEM buffer manager, which uses the kernel to allocate, map,
> * and manage map buffer objections.
> @@ -1418,6 +1439,8 @@ brw_bufmgr_init(struct gen_device_info *devinfo, int fd)
>
> bufmgr->has_llc = devinfo->has_llc;
> bufmgr->has_mmap_wc = gem_param(fd, I915_PARAM_MMAP_VERSION) > 0;
> + bufmgr->supports_48b_addresses =
> + devinfo->gen >= 8 && gem_supports_48b_addresses(fd);
>
> init_cache_buckets(bufmgr);
>
> diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/dri/i965/brw_misc_state.c
> index c4ef6812bff..29d74876c27 100644
> --- a/src/mesa/drivers/dri/i965/brw_misc_state.c
> +++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
> @@ -634,6 +634,12 @@ brw_upload_state_base_address(struct brw_context *brw)
> }
>
> if (devinfo->gen >= 8) {
> + /* STATE_BASE_ADDRESS has issues with 48-bit address spaces. If the
> + * address + size as seen by STATE_BASE_ADDRESS overflows 48 bits,
> + * the GPU appears to treat all accesses to the buffer as being out
> + * of bounds and returns zero. To work around this, we pin all SBAs
> + * to the bottom 4GB.
> + */
> uint32_t mocs_wb = devinfo->gen >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB;
> int pkt_len = devinfo->gen >= 9 ? 19 : 16;
>
> @@ -644,15 +650,14 @@ brw_upload_state_base_address(struct brw_context *brw)
> OUT_BATCH(0);
> OUT_BATCH(mocs_wb << 16);
> /* Surface state base address: */
> - OUT_RELOC64(brw->batch.state.bo, 0, mocs_wb << 4 | 1);
> + OUT_RELOC64(brw->batch.state.bo, RELOC_32BIT, mocs_wb << 4 | 1);
Lines like this seem a little confusing with the RELOC_32BIT name.
What about something like RELOC_BELOW4G?
Reviewed-by: Jordan Justen <jordan.l.justen at intel.com>
> /* Dynamic state base address: */
> - OUT_RELOC64(brw->batch.state.bo, 0, mocs_wb << 4 | 1);
> + OUT_RELOC64(brw->batch.state.bo, RELOC_32BIT, mocs_wb << 4 | 1);
> /* Indirect object base address: MEDIA_OBJECT data */
> OUT_BATCH(mocs_wb << 4 | 1);
> OUT_BATCH(0);
> /* Instruction base address: shader kernels (incl. SIP) */
> - OUT_RELOC64(brw->cache.bo, 0, mocs_wb << 4 | 1);
> -
> + OUT_RELOC64(brw->cache.bo, RELOC_32BIT, mocs_wb << 4 | 1);
> /* General state buffer size */
> OUT_BATCH(0xfffff001);
> /* Dynamic state buffer size */
> diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
> index 0b6016427bd..55e752261a5 100644
> --- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
> +++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
> @@ -203,12 +203,23 @@ brw_emit_surface_state(struct brw_context *brw,
> * FIXME: move to the point of assignment.
> */
> assert((aux_offset & 0xfff) == 0);
> - uint32_t *aux_addr = state + brw->isl_dev.ss.aux_addr_offset;
> - *aux_addr = brw_state_reloc(&brw->batch,
> - *surf_offset +
> - brw->isl_dev.ss.aux_addr_offset,
> - aux_bo, *aux_addr,
> - reloc_flags);
> +
> + if (devinfo->gen >= 8) {
> + uint64_t *aux_addr = state + brw->isl_dev.ss.aux_addr_offset;
> + *aux_addr = brw_state_reloc(&brw->batch,
> + *surf_offset +
> + brw->isl_dev.ss.aux_addr_offset,
> + aux_bo, *aux_addr,
> + reloc_flags);
> + } else {
> + uint32_t *aux_addr = state + brw->isl_dev.ss.aux_addr_offset;
> + *aux_addr = brw_state_reloc(&brw->batch,
> + *surf_offset +
> + brw->isl_dev.ss.aux_addr_offset,
> + aux_bo, *aux_addr,
> + reloc_flags);
> +
> + }
> }
> }
>
> diff --git a/src/mesa/drivers/dri/i965/genX_blorp_exec.c b/src/mesa/drivers/dri/i965/genX_blorp_exec.c
> index 062171af600..2bdd93e9bdd 100644
> --- a/src/mesa/drivers/dri/i965/genX_blorp_exec.c
> +++ b/src/mesa/drivers/dri/i965/genX_blorp_exec.c
> @@ -166,6 +166,15 @@ blorp_alloc_vertex_buffer(struct blorp_batch *batch, uint32_t size,
> .buffer = brw->batch.state.bo,
> .offset = offset,
>
> + /* The VF cache designers apparently cut corners, and made the cache
> + * only consider the bottom 32 bits of memory addresses. If you happen
> + * to have two vertex buffers which get placed exactly 4 GiB apart and
> + * use them in back-to-back draw calls, you can get collisions. To work
> + * around this problem, we restrict vertex buffers to the low 32 bits of
> + * the address space.
> + */
> + .reloc_flags = RELOC_32BIT,
> +
> #if GEN_GEN == 10
> .mocs = CNL_MOCS_WB,
> #elif GEN_GEN == 9
> diff --git a/src/mesa/drivers/dri/i965/genX_state_upload.c b/src/mesa/drivers/dri/i965/genX_state_upload.c
> index 8668abd591f..0ab1f12152a 100644
> --- a/src/mesa/drivers/dri/i965/genX_state_upload.c
> +++ b/src/mesa/drivers/dri/i965/genX_state_upload.c
> @@ -101,7 +101,7 @@ __gen_combine_address(struct brw_context *brw, void *location,
> }
> }
>
> -static struct brw_address
> +UNUSED static struct brw_address
> rw_bo(struct brw_bo *bo, uint32_t offset)
> {
> return (struct brw_address) {
> @@ -120,6 +120,26 @@ ro_bo(struct brw_bo *bo, uint32_t offset)
> };
> }
>
> +static struct brw_address
> +rw_32_bo(struct brw_bo *bo, uint32_t offset)
> +{
> + return (struct brw_address) {
> + .bo = bo,
> + .offset = offset,
> + .reloc_flags = RELOC_WRITE | RELOC_32BIT,
> + };
> +}
> +
> +static struct brw_address
> +ro_32_bo(struct brw_bo *bo, uint32_t offset)
> +{
> + return (struct brw_address) {
> + .bo = bo,
> + .offset = offset,
> + .reloc_flags = RELOC_32BIT,
> + };
> +}
> +
> UNUSED static struct brw_address
> ggtt_bo(struct brw_bo *bo, uint32_t offset)
> {
> @@ -317,7 +337,15 @@ genX(emit_vertex_buffer_state)(struct brw_context *brw,
> struct GENX(VERTEX_BUFFER_STATE) buf_state = {
> .VertexBufferIndex = buffer_nr,
> .BufferPitch = stride,
> - .BufferStartingAddress = ro_bo(bo, start_offset),
> +
> + /* The VF cache designers apparently cut corners, and made the cache
> + * only consider the bottom 32 bits of memory addresses. If you happen
> + * to have two vertex buffers which get placed exactly 4 GiB apart and
> + * use them in back-to-back draw calls, you can get collisions. To work
> + * around this problem, we restrict vertex buffers to the low 32 bits of
> + * the address space.
> + */
> + .BufferStartingAddress = ro_32_bo(bo, start_offset),
> #if GEN_GEN >= 8
> .BufferSize = end_offset - start_offset,
> #endif
> @@ -858,7 +886,15 @@ genX(emit_index_buffer)(struct brw_context *brw)
> ib.CutIndexEnable = brw->prim_restart.enable_cut_index;
> #endif
> ib.IndexFormat = brw_get_index_type(index_buffer->index_size);
> - ib.BufferStartingAddress = ro_bo(brw->ib.bo, 0);
> +
> + /* The VF cache designers apparently cut corners, and made the cache
> + * only consider the bottom 32 bits of memory addresses. If you happen
> + * to have two index buffers which get placed exactly 4 GiB apart and
> + * use them in back-to-back draw calls, you can get collisions. To work
> + * around this problem, we restrict index buffers to the low 32 bits of
> + * the address space.
> + */
> + ib.BufferStartingAddress = ro_32_bo(brw->ib.bo, 0);
> #if GEN_GEN >= 8
> ib.IndexBufferMOCS = GEN_GEN >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB;
> ib.BufferSize = brw->ib.size;
> @@ -1895,7 +1931,7 @@ genX(upload_wm)(struct brw_context *brw)
> #endif
>
> if (wm_prog_data->base.total_scratch) {
> - wm.ScratchSpaceBasePointer = rw_bo(stage_state->scratch_bo, 0);
> + wm.ScratchSpaceBasePointer = rw_32_bo(stage_state->scratch_bo, 0);
> wm.PerThreadScratchSpace =
> ffs(stage_state->per_thread_scratch) - 11;
> }
> @@ -2014,6 +2050,14 @@ static const struct brw_tracked_state genX(wm_state) = {
>
> /* ---------------------------------------------------------------------- */
>
> +/* We restrict scratch buffers to the bottom 32 bits of the address space
> + * by using rw_32_bo().
> + *
> + * General State Base Address is a bit broken. If the address + size as
> + * seen by STATE_BASE_ADDRESS overflows 48 bits, the GPU appears to treat
> + * all accesses to the buffer as being out of bounds and returns zero.
> + */
> +
> #define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix) \
> pkt.KernelStartPointer = KSP(brw, stage_state->prog_offset); \
> pkt.SamplerCount = \
> @@ -2023,7 +2067,7 @@ static const struct brw_tracked_state genX(wm_state) = {
> pkt.FloatingPointMode = stage_prog_data->use_alt_mode; \
> \
> if (stage_prog_data->total_scratch) { \
> - pkt.ScratchSpaceBasePointer = rw_bo(stage_state->scratch_bo, 0); \
> + pkt.ScratchSpaceBasePointer = rw_32_bo(stage_state->scratch_bo, 0); \
> pkt.PerThreadScratchSpace = \
> ffs(stage_state->per_thread_scratch) - 11; \
> } \
> @@ -3892,8 +3936,8 @@ genX(upload_ps)(struct brw_context *brw)
>
> if (prog_data->base.total_scratch) {
> ps.ScratchSpaceBasePointer =
> - rw_bo(stage_state->scratch_bo,
> - ffs(stage_state->per_thread_scratch) - 11);
> + rw_32_bo(stage_state->scratch_bo,
> + ffs(stage_state->per_thread_scratch) - 11);
> }
> }
> }
> @@ -4214,7 +4258,7 @@ genX(upload_cs_state)(struct brw_context *brw)
> */
> per_thread_scratch_value = stage_state->per_thread_scratch / 1024 - 1;
> }
> - vfe.ScratchSpaceBasePointer = rw_bo(stage_state->scratch_bo, 0);
> + vfe.ScratchSpaceBasePointer = rw_32_bo(stage_state->scratch_bo, 0);
> vfe.PerThreadScratchSpace = per_thread_scratch_value;
> }
>
> diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
> index 26718e0d1a2..0a8d3a80b64 100644
> --- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
> +++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
> @@ -1093,6 +1093,21 @@ emit_reloc(struct intel_batchbuffer *batch,
> unsigned int index = add_exec_bo(batch, target);
> struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[index];
>
> + if (reloc_flags & RELOC_32BIT) {
> + /* Restrict this buffer to the low 32 bits of the address space.
> + *
> + * Altering the validation list flags restricts it for this batch,
> + * but we also alter the BO's kflags to restrict it permanently
> + * (until the BO is destroyed and put back in the cache). Buffers
> + * may stay bound across batches, and we want keep it constrained.
> + */
> + target->kflags &= ~EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
> + entry->flags &= ~EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
> +
> + /* RELOC_32BIT is not an EXEC_OBJECT_* flag, so get rid of it. */
> + reloc_flags &= ~RELOC_32BIT;
> + }
> +
> if (reloc_flags)
> entry->flags |= reloc_flags & batch->valid_reloc_flags;
>
> diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.h b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
> index a9a34600ad1..7be5b10f3ab 100644
> --- a/src/mesa/drivers/dri/i965/intel_batchbuffer.h
> +++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
> @@ -53,6 +53,8 @@ bool brw_batch_references(struct intel_batchbuffer *batch, struct brw_bo *bo);
>
> #define RELOC_WRITE EXEC_OBJECT_WRITE
> #define RELOC_NEEDS_GGTT EXEC_OBJECT_NEEDS_GTT
> +/* Inverted meaning, but using the same bit...emit_reloc will flip it. */
> +#define RELOC_32BIT EXEC_OBJECT_SUPPORTS_48B_ADDRESS
> uint64_t brw_batch_reloc(struct intel_batchbuffer *batch,
> uint32_t batch_offset,
> struct brw_bo *target,
> --
> 2.16.1
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
More information about the mesa-dev
mailing list