[PATCH v4 1/2] intel: 48b ppgtt support (EXEC_OBJECT_SUPPORTS_48B_ADDRESS flag)

Kristian Høgsberg krh at bitplanet.net
Mon Oct 5 11:06:50 PDT 2015


On Mon, Oct 5, 2015 at 7:03 AM, Michel Thierry <michel.thierry at intel.com> wrote:
> On 9/14/2015 2:54 PM, Michał Winiarski wrote:
>>
>> On Thu, Sep 03, 2015 at 03:23:58PM +0100, Michel Thierry wrote:
>>>
>>> Gen8+ supports 48-bit virtual addresses, but some objects must always be
>>> allocated inside the 32-bit address range.
>>>
>>> In specific, any resource used with flat/heapless (0x00000000-0xfffff000)
>>> General State Heap (GSH) or Instruction State Heap (ISH) must be in a
>>> 32-bit range, because the General State Offset and Instruction State
>>> Offset
>>> are limited to 32-bits.
>>>
>>> The i915 driver has been modified to provide a flag to set when the 4GB
>>> limit is not necessary in a given bo (EXEC_OBJECT_SUPPORTS_48B_ADDRESS).
>>> 48-bit range will only be used when explicitly requested.
>>>
>>> Callers to the existing drm_intel_bo_emit_reloc function should set the
>>> use_48b_address_range flag beforehand, in order to use full ppgtt range.
>>>
>>> v2: Make set/clear functions nops on pre-gen8 platforms, and use them
>>>      internally in emit_reloc functions (Ben)
>>>      s/48BADDRESS/48B_ADDRESS/ (Dave)
>>> v3: Keep set/clear functions internal, no-one needs to use them directly.
>>> v4: Don't set 48bit-support flag in emit reloc, check for ppgtt type
>>>      before enabling set/clear function, print full offsets in debug
>>>      statements, using port of lower_32_bits and upper_32_bits from linux
>>>      kernel (Michał)
>>>
>>> References:
>>> http://lists.freedesktop.org/archives/intel-gfx/2015-July/072612.html
>>> Cc: Ben Widawsky <ben at bwidawsk.net>
>>> Cc: Michał Winiarski <michal.winiarski at intel.com>
>>
>>
>> +Kristian
>>
>> LGTM.
>> Acked-by: Michał Winiarski <michal.winiarski at intel.com>
>>
>>> Signed-off-by: Michel Thierry <michel.thierry at intel.com>
>
>
>
> Hi Kristian,
>
> More comments on this?
> I've resent the mesa patch with the last changes
> (http://lists.freedesktop.org/archives/dri-devel/2015-October/091752.html).
>
> Michał has agreed with the interface and will use it for his work.
> If mesa doesn't plan to use this for now, it's ok. The kernel changes have
> been done to prevent any regressions when the support 48-bit flag is not
> set.

I didn't get any replies to my last comments on this:

http://lists.freedesktop.org/archives/mesa-dev/2015-August/091398.html

Basically, I tried explicitly placing buffers above 8G and didn't see
the HW problem described (ie it all worked fine).  I still think
there's some confusion as to what the W/A is about.

Here's my take: the W/A is a SW-only workaround to handle the cases
where a driver uses heapless and 48-bit ppgtt. The problem is that the
heaps are limited to 4G but with 48bit ppgtt a buffer can be placed
anywhere it the 48 bit address space. If that happens it's consideredd
out-of-bounds for the heap and access fails. To prevent this we need
to make sure the bos we address in a heapless fashion are placed below
4g.

For mesa, we only configure general state base as heap-less, which
affects scratch bos. What this boils down to is that we need the 4G
restriction only for the scratch bos set up on 3DSTATE_VS, 3DSTATE_GS
and 3DSTATE_PS (and tesselation stage eventually). Look for the
OUT_RELOC64 for stage->scratch_bo in gen8_vs_state.c, gen8_gs_state.c
and gen8_ps_state.c

Kristian

> Thanks,
>
> -Michel
>
>
>>> ---
>>>   include/drm/i915_drm.h    |  3 +-
>>>   intel/intel_bufmgr.c      | 11 ++++++
>>>   intel/intel_bufmgr.h      |  1 +
>>>   intel/intel_bufmgr_gem.c  | 88
>>> +++++++++++++++++++++++++++++++++++++----------
>>>   intel/intel_bufmgr_priv.h | 14 ++++++++
>>>   5 files changed, 97 insertions(+), 20 deletions(-)
>>>
>>> diff --git a/include/drm/i915_drm.h b/include/drm/i915_drm.h
>>> index ded43b1..426b25c 100644
>>> --- a/include/drm/i915_drm.h
>>> +++ b/include/drm/i915_drm.h
>>> @@ -680,7 +680,8 @@ struct drm_i915_gem_exec_object2 {
>>>   #define EXEC_OBJECT_NEEDS_FENCE (1<<0)
>>>   #define EXEC_OBJECT_NEEDS_GTT (1<<1)
>>>   #define EXEC_OBJECT_WRITE     (1<<2)
>>> -#define __EXEC_OBJECT_UNKNOWN_FLAGS -(EXEC_OBJECT_WRITE<<1)
>>> +#define EXEC_OBJECT_SUPPORTS_48B_ADDRESS (1<<3)
>>> +#define __EXEC_OBJECT_UNKNOWN_FLAGS
>>> -(EXEC_OBJECT_SUPPORTS_48B_ADDRESS<<1)
>>>         __u64 flags;
>>>
>>>         __u64 rsvd1;
>>> diff --git a/intel/intel_bufmgr.c b/intel/intel_bufmgr.c
>>> index 14ea9f9..0856e60 100644
>>> --- a/intel/intel_bufmgr.c
>>> +++ b/intel/intel_bufmgr.c
>>> @@ -293,6 +293,17 @@ drm_intel_bo_madvise(drm_intel_bo *bo, int madv)
>>>   }
>>>
>>>   int
>>> +drm_intel_bo_use_48b_address_range(drm_intel_bo *bo, uint32_t enable)
>>> +{
>>> +       if (bo->bufmgr->bo_use_48b_address_range) {
>>> +               bo->bufmgr->bo_use_48b_address_range(bo, enable);
>>> +               return 0;
>>> +       }
>>> +
>>> +       return -ENODEV;
>>> +}
>>> +
>>> +int
>>>   drm_intel_bo_references(drm_intel_bo *bo, drm_intel_bo *target_bo)
>>>   {
>>>         return bo->bufmgr->bo_references(bo, target_bo);
>>> diff --git a/intel/intel_bufmgr.h b/intel/intel_bufmgr.h
>>> index 95eecb8..a14c78f 100644
>>> --- a/intel/intel_bufmgr.h
>>> +++ b/intel/intel_bufmgr.h
>>> @@ -164,6 +164,7 @@ int drm_intel_bo_get_tiling(drm_intel_bo *bo,
>>> uint32_t * tiling_mode,
>>>   int drm_intel_bo_flink(drm_intel_bo *bo, uint32_t * name);
>>>   int drm_intel_bo_busy(drm_intel_bo *bo);
>>>   int drm_intel_bo_madvise(drm_intel_bo *bo, int madv);
>>> +int drm_intel_bo_use_48b_address_range(drm_intel_bo *bo, uint32_t
>>> enable);
>>>
>>>   int drm_intel_bo_disable_reuse(drm_intel_bo *bo);
>>>   int drm_intel_bo_is_reusable(drm_intel_bo *bo);
>>> diff --git a/intel/intel_bufmgr_gem.c b/intel/intel_bufmgr_gem.c
>>> index 2723e21..09d82d2 100644
>>> --- a/intel/intel_bufmgr_gem.c
>>> +++ b/intel/intel_bufmgr_gem.c
>>> @@ -83,6 +83,22 @@
>>>   #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
>>>   #define MAX2(A, B) ((A) > (B) ? (A) : (B))
>>>
>>> +/**
>>> + * upper_32_bits - return bits 32-63 of a number
>>> + * @n: the number we're accessing
>>> + *
>>> + * A basic shift-right of a 64- or 32-bit quantity.  Use this to
>>> suppress
>>> + * the "right shift count >= width of type" warning when that quantity
>>> is
>>> + * 32-bits.
>>> + */
>>> +#define upper_32_bits(n) ((__u32)(((n) >> 16) >> 16))
>>> +
>>> +/**
>>> + * lower_32_bits - return bits 0-31 of a number
>>> + * @n: the number we're accessing
>>> + */
>>> +#define lower_32_bits(n) ((__u32)(n))
>>> +
>>>   typedef struct _drm_intel_bo_gem drm_intel_bo_gem;
>>>
>>>   struct drm_intel_gem_bo_bucket {
>>> @@ -237,6 +253,15 @@ struct _drm_intel_bo_gem {
>>>         bool is_userptr;
>>>
>>>         /**
>>> +        * Boolean of whether this buffer can be placed in the full
>>> 48-bit
>>> +        * address range on gen8+.
>>> +        *
>>> +        * By default, buffers will be keep in a 32-bit range, unless
>>> this
>>> +        * flag is explicitly set.
>>> +        */
>>> +       bool use_48b_address_range;
>>> +
>>> +       /**
>>>          * Size in bytes of this buffer and its relocation descendents.
>>>          *
>>>          * Used to avoid costly tree walking in
>>> @@ -395,14 +420,16 @@
>>> drm_intel_gem_dump_validation_list(drm_intel_bufmgr_gem *bufmgr_gem)
>>>                         drm_intel_bo_gem *target_gem =
>>>                             (drm_intel_bo_gem *) target_bo;
>>>
>>> -                       DBG("%2d: %d (%s)@0x%08llx -> "
>>> -                           "%d (%s)@0x%08lx + 0x%08x\n",
>>> +                       DBG("%2d: %d (%s)@0x%08x %08x -> "
>>> +                           "%d (%s)@0x%08x %08x + 0x%08x\n",
>>>                             i,
>>>                             bo_gem->gem_handle, bo_gem->name,
>>> -                           (unsigned long long)bo_gem->relocs[j].offset,
>>> +                           upper_32_bits(bo_gem->relocs[j].offset),
>>> +                           lower_32_bits(bo_gem->relocs[j].offset),
>>>                             target_gem->gem_handle,
>>>                             target_gem->name,
>>> -                           target_bo->offset64,
>>> +                           upper_32_bits(target_bo->offset64),
>>> +                           lower_32_bits(target_bo->offset64),
>>>                             bo_gem->relocs[j].delta);
>>>                 }
>>>         }
>>> @@ -468,11 +495,15 @@ drm_intel_add_validate_buffer2(drm_intel_bo *bo,
>>> int need_fence)
>>>         drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem
>>> *)bo->bufmgr;
>>>         drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *)bo;
>>>         int index;
>>> +       int flags = 0;
>>> +
>>> +       if (need_fence)
>>> +               flags |= EXEC_OBJECT_NEEDS_FENCE;
>>> +       if (bo_gem->use_48b_address_range)
>>> +               flags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
>>>
>>>         if (bo_gem->validate_index != -1) {
>>> -               if (need_fence)
>>> -
>>> bufmgr_gem->exec2_objects[bo_gem->validate_index].flags |=
>>> -                               EXEC_OBJECT_NEEDS_FENCE;
>>> +               bufmgr_gem->exec2_objects[bo_gem->validate_index].flags
>>> |= flags;
>>>                 return;
>>>         }
>>>
>>> @@ -501,13 +532,9 @@ drm_intel_add_validate_buffer2(drm_intel_bo *bo, int
>>> need_fence)
>>>         bufmgr_gem->exec2_objects[index].alignment = bo->align;
>>>         bufmgr_gem->exec2_objects[index].offset = 0;
>>>         bufmgr_gem->exec_bos[index] = bo;
>>> -       bufmgr_gem->exec2_objects[index].flags = 0;
>>> +       bufmgr_gem->exec2_objects[index].flags = flags;
>>>         bufmgr_gem->exec2_objects[index].rsvd1 = 0;
>>>         bufmgr_gem->exec2_objects[index].rsvd2 = 0;
>>> -       if (need_fence) {
>>> -               bufmgr_gem->exec2_objects[index].flags |=
>>> -                       EXEC_OBJECT_NEEDS_FENCE;
>>> -       }
>>>         bufmgr_gem->exec_count++;
>>>   }
>>>
>>> @@ -780,6 +807,7 @@ retry:
>>>         bo_gem->used_as_reloc_target = false;
>>>         bo_gem->has_error = false;
>>>         bo_gem->reusable = true;
>>> +       bo_gem->use_48b_address_range = false;
>>>
>>>         drm_intel_bo_gem_set_in_aperture_size(bufmgr_gem, bo_gem,
>>> alignment);
>>>
>>> @@ -926,6 +954,7 @@ drm_intel_gem_bo_alloc_userptr(drm_intel_bufmgr
>>> *bufmgr,
>>>         bo_gem->used_as_reloc_target = false;
>>>         bo_gem->has_error = false;
>>>         bo_gem->reusable = false;
>>> +       bo_gem->use_48b_address_range = false;
>>>
>>>         drm_intel_bo_gem_set_in_aperture_size(bufmgr_gem, bo_gem, 0);
>>>
>>> @@ -1081,6 +1110,7 @@ drm_intel_bo_gem_create_from_name(drm_intel_bufmgr
>>> *bufmgr,
>>>         bo_gem->bo.handle = open_arg.handle;
>>>         bo_gem->global_name = handle;
>>>         bo_gem->reusable = false;
>>> +       bo_gem->use_48b_address_range = false;
>>>
>>>         memclear(get_tiling);
>>>         get_tiling.handle = bo_gem->gem_handle;
>>> @@ -1930,6 +1960,13 @@ do_bo_emit_reloc(drm_intel_bo *bo, uint32_t
>>> offset,
>>>         return 0;
>>>   }
>>>
>>> +static void
>>> +drm_intel_gem_bo_use_48b_address_range(drm_intel_bo *bo, uint32_t
>>> enable)
>>> +{
>>> +       drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *) bo;
>>> +       bo_gem->use_48b_address_range = enable;
>>> +}
>>> +
>>>   static int
>>>   drm_intel_gem_bo_emit_reloc(drm_intel_bo *bo, uint32_t offset,
>>>                             drm_intel_bo *target_bo, uint32_t
>>> target_offset,
>>> @@ -2073,10 +2110,12 @@
>>> drm_intel_update_buffer_offsets(drm_intel_bufmgr_gem *bufmgr_gem)
>>>
>>>                 /* Update the buffer offset */
>>>                 if (bufmgr_gem->exec_objects[i].offset != bo->offset64) {
>>> -                       DBG("BO %d (%s) migrated: 0x%08lx -> 0x%08llx\n",
>>> -                           bo_gem->gem_handle, bo_gem->name,
>>> bo->offset64,
>>> -                           (unsigned long
>>> long)bufmgr_gem->exec_objects[i].
>>> -                           offset);
>>> +                       DBG("BO %d (%s) migrated: 0x%08x %08x -> 0x%08x
>>> %08x\n",
>>> +                           bo_gem->gem_handle, bo_gem->name,
>>> +                           upper_32_bits(bo->offset64),
>>> +                           lower_32_bits(bo->offset64),
>>> +
>>> upper_32_bits(bufmgr_gem->exec_objects[i].offset),
>>> +
>>> lower_32_bits(bufmgr_gem->exec_objects[i].offset));
>>>                         bo->offset64 =
>>> bufmgr_gem->exec_objects[i].offset;
>>>                         bo->offset = bufmgr_gem->exec_objects[i].offset;
>>>                 }
>>> @@ -2094,9 +2133,12 @@ drm_intel_update_buffer_offsets2
>>> (drm_intel_bufmgr_gem *bufmgr_gem)
>>>
>>>                 /* Update the buffer offset */
>>>                 if (bufmgr_gem->exec2_objects[i].offset != bo->offset64)
>>> {
>>> -                       DBG("BO %d (%s) migrated: 0x%08lx -> 0x%08llx\n",
>>> -                           bo_gem->gem_handle, bo_gem->name,
>>> bo->offset64,
>>> -                           (unsigned long
>>> long)bufmgr_gem->exec2_objects[i].offset);
>>> +                       DBG("BO %d (%s) migrated: 0x%08x %08x -> 0x%08x
>>> %08x\n",
>>> +                           bo_gem->gem_handle, bo_gem->name,
>>> +                           upper_32_bits(bo->offset64),
>>> +                           lower_32_bits(bo->offset64),
>>> +
>>> upper_32_bits(bufmgr_gem->exec2_objects[i].offset),
>>> +
>>> lower_32_bits(bufmgr_gem->exec2_objects[i].offset));
>>>                         bo->offset64 =
>>> bufmgr_gem->exec2_objects[i].offset;
>>>                         bo->offset = bufmgr_gem->exec2_objects[i].offset;
>>>                 }
>>> @@ -2481,6 +2523,7 @@ drm_intel_bo_gem_create_from_prime(drm_intel_bufmgr
>>> *bufmgr, int prime_fd, int s
>>>         bo_gem->used_as_reloc_target = false;
>>>         bo_gem->has_error = false;
>>>         bo_gem->reusable = false;
>>> +       bo_gem->use_48b_address_range = false;
>>>
>>>         DRMINITLISTHEAD(&bo_gem->vma_list);
>>>         DRMLISTADDTAIL(&bo_gem->name_list, &bufmgr_gem->named);
>>> @@ -3278,6 +3321,13 @@ drm_intel_bufmgr_gem_init(int fd, int batch_size)
>>>                 }
>>>         }
>>>
>>> +       if (bufmgr_gem->gen >= 8) {
>>> +               gp.param = I915_PARAM_HAS_ALIASING_PPGTT;
>>> +               ret = drmIoctl(bufmgr_gem->fd, DRM_IOCTL_I915_GETPARAM,
>>> &gp);
>>> +               if (ret == 0 && *gp.value == 3)
>>> +                       bufmgr_gem->bufmgr.bo_use_48b_address_range =
>>> drm_intel_gem_bo_use_48b_address_range;
>>> +       }
>>> +
>>>         /* Let's go with one relocation per every 2 dwords (but round
>>> down a bit
>>>          * since a power of two will mean an extra page allocation for
>>> the reloc
>>>          * buffer).
>>> diff --git a/intel/intel_bufmgr_priv.h b/intel/intel_bufmgr_priv.h
>>> index 59ebd18..5c17ffb 100644
>>> --- a/intel/intel_bufmgr_priv.h
>>> +++ b/intel/intel_bufmgr_priv.h
>>> @@ -152,6 +152,20 @@ struct _drm_intel_bufmgr {
>>>         void (*destroy) (drm_intel_bufmgr *bufmgr);
>>>
>>>         /**
>>> +        * Indicate if the buffer can be placed anywhere in the full
>>> ppgtt
>>> +        * address range (2^48).
>>> +        *
>>> +        * Any resource used with flat/heapless (0x00000000-0xfffff000)
>>> +        * General State Heap (GSH) or Intructions State Heap (ISH) must
>>> +        * be in a 32-bit range. 48-bit range will only be used when
>>> explicitly
>>> +        * requested.
>>> +        *
>>> +        * \param bo Buffer to set the use_48b_address_range flag.
>>> +        * \param enable The flag value.
>>> +        */
>>> +       void (*bo_use_48b_address_range) (drm_intel_bo *bo, uint32_t
>>> enable);
>>> +
>>> +       /**
>>>          * Add relocation entry in reloc_buf, which will be updated with
>>> the
>>>          * target buffer's real offset on on command submission.
>>>          *
>>> --
>>> 2.5.0
>>>
>


More information about the dri-devel mailing list