[Mesa-dev] [PATCH] winsys/radeon: consolidate hash table lookup
Christian König
deathsimple at vodafone.de
Sat Apr 12 04:26:32 PDT 2014
On 12.04.2014 12:52, Marek Olšák wrote:
> Do you mean having bo->cs and bo->index? That would work, but there
> would have to be mutex_lock and mutex_unlock when accessing the
> variables. I'm not sure if locking a mutex is more expensive than the
> hash table lookup. OpenGL with GLX allows sharing buffers and textures
> between contexts, so we must play it safe.
Obviously thread safety must be taken care of. Maybe we could squash the
CS identifier and index into a single 64-bit atomic? If we need to lock a
mutex for every access, it would indeed take longer than the hash table
handling.
> Note that if all handles are less than 512, the lookup is always in
> O(1), so it's very cheap. We can increase the number to 1024 or 2048
> if we find out that apps use too many buffers.
I didn't know that there are so many entries in the hash table; I thought
we would rather get a collision every 32 handles or so. If we have O(1)
most of the time anyway, then changing it would probably not make much
sense.
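For reference, a rough sketch of the fast path being described (assuming a
512-entry table, which matches the handle masking in the patch below):

    /* With a 512-entry table the hash is just handle & 511, so every
     * handle below 512 gets its own slot and the first comparison
     * already matches; the linear fallback scan is never entered. */
    unsigned hash = bo->handle & 511;
    if (csc->is_handle_added[hash] &&
        csc->relocs[csc->reloc_indices_hashlist[hash]].handle == bo->handle) {
        return csc->reloc_indices_hashlist[hash];   /* O(1) hit */
    }
    /* otherwise: hash collision, fall back to the linear search */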
Christian.
>
> Marek
>
>
>
> On Sat, Apr 12, 2014 at 10:35 AM, Christian König
> <deathsimple at vodafone.de> wrote:
>> On 11.04.2014 19:03, Marek Olšák wrote:
>>> From: Marek Olšák <marek.olsak at amd.com>
>>
>> Reviewed-by: Christian König <christian.koenig at amd.com>
>>
>> BTW:
>>
>> I've always wondered if the custom hash table is the best approach here.
>> Having a BO active in more than one command submission context at the same
>> time sounds rather unlikely to me.
>>
>> Something like storing a pointer to the last used CS and its index in the
>> BO, and falling back to the hash table to look up the index only when a BO
>> is really used in more than one CS context at the same time, sounds like
>> less overhead.
>>
>>
>>> I should have done this long ago.
>>> ---
>>> src/gallium/winsys/radeon/drm/radeon_drm_cs.c | 110 +++++++++++---------------
>>> src/gallium/winsys/radeon/drm/radeon_drm_cs.h | 7 +-
>>> 2 files changed, 49 insertions(+), 68 deletions(-)
>>>
>>> diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
>>> index db9fbfa..b55eb80 100644
>>> --- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
>>> +++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
>>> @@ -188,41 +188,40 @@ static INLINE void update_reloc(struct drm_radeon_cs_reloc *reloc,
>>> reloc->flags = MAX2(reloc->flags, priority);
>>> }
>>> -int radeon_get_reloc(struct radeon_cs_context *csc, struct radeon_bo *bo)
>>> +int radeon_get_reloc(struct radeon_cs_context *csc, struct radeon_bo *bo,
>>> + struct drm_radeon_cs_reloc **out_reloc)
>>> {
>>> - struct drm_radeon_cs_reloc *reloc;
>>> - unsigned i;
>>> + struct drm_radeon_cs_reloc *reloc = NULL;
>>> unsigned hash = bo->handle & (sizeof(csc->is_handle_added)-1);
>>> + int i = -1;
>>> if (csc->is_handle_added[hash]) {
>>> i = csc->reloc_indices_hashlist[hash];
>>> reloc = &csc->relocs[i];
>>> - if (reloc->handle == bo->handle) {
>>> - return i;
>>> - }
>>> - /* Hash collision, look for the BO in the list of relocs linearly. */
>>> - for (i = csc->crelocs; i != 0;) {
>>> - --i;
>>> - reloc = &csc->relocs[i];
>>> - if (reloc->handle == bo->handle) {
>>> - /* Put this reloc in the hash list.
>>> - * This will prevent additional hash collisions if there are
>>> - * several consecutive get_reloc calls for the same buffer.
>>> - *
>>> - * Example: Assuming buffers A,B,C collide in the hash list,
>>> - * the following sequence of relocs:
>>> - * AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
>>> - * will collide here: ^ and here: ^,
>>> - * meaning that we should get very few collisions in the end. */
>>> - csc->reloc_indices_hashlist[hash] = i;
>>> - /*printf("write_reloc collision, hash: %i, handle: %i\n", hash, bo->handle);*/
>>> - return i;
>>> + if (reloc->handle != bo->handle) {
>>> + /* Hash collision, look for the BO in the list of relocs linearly. */
>>> + for (i = csc->crelocs - 1; i >= 0; i--) {
>>> + reloc = &csc->relocs[i];
>>> + if (reloc->handle == bo->handle) {
>>> + /* Put this reloc in the hash list.
>>> + * This will prevent additional hash collisions if there are
>>> + * several consecutive get_reloc calls for the same buffer.
>>> + *
>>> + * Example: Assuming buffers A,B,C collide in the hash list,
>>> + * the following sequence of relocs:
>>> + * AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
>>> + * will collide here: ^ and here: ^,
>>> + * meaning that we should get very few collisions in the end. */
>>> + csc->reloc_indices_hashlist[hash] = i;
>>> + break;
>>> + }
>>> }
>>> }
>>> }
>>> -
>>> - return -1;
>>> + if (out_reloc)
>>> + *out_reloc = reloc;
>>> + return i;
>>> }
>>> static unsigned radeon_add_reloc(struct radeon_drm_cs *cs,
>>> @@ -237,45 +236,28 @@ static unsigned radeon_add_reloc(struct radeon_drm_cs *cs,
>>> unsigned hash = bo->handle & (sizeof(csc->is_handle_added)-1);
>>> enum radeon_bo_domain rd = usage & RADEON_USAGE_READ ? domains : 0;
>>> enum radeon_bo_domain wd = usage & RADEON_USAGE_WRITE ? domains : 0;
>>> - bool update_hash = TRUE;
>>> - int i;
>>> + int i = -1;
>>> priority = MIN2(priority, 15);
>>> *added_domains = 0;
>>> - if (csc->is_handle_added[hash]) {
>>> - i = csc->reloc_indices_hashlist[hash];
>>> - reloc = &csc->relocs[i];
>>> -
>>> - if (reloc->handle != bo->handle) {
>>> - /* Hash collision, look for the BO in the list of relocs linearly. */
>>> - for (i = csc->crelocs - 1; i >= 0; i--) {
>>> - reloc = &csc->relocs[i];
>>> - if (reloc->handle == bo->handle) {
>>> - /*printf("write_reloc collision, hash: %i, handle: %i\n", hash, bo->handle);*/
>>> - break;
>>> - }
>>> - }
>>> - }
>>> -
>>> - if (i >= 0) {
>>> - update_reloc(reloc, rd, wd, priority, added_domains);
>>> -
>>> - /* For async DMA, every add_reloc call must add a buffer to the list
>>> - * no matter how many duplicates there are. This is due to the fact
>>> - * the DMA CS checker doesn't use NOP packets for offset patching,
>>> - * but always uses the i-th buffer from the list to patch the i-th
>>> - * offset. If there are N offsets in a DMA CS, there must also be N
>>> - * buffers in the relocation list.
>>> - *
>>> - * This doesn't have to be done if virtual memory is enabled,
>>> - * because there is no offset patching with virtual memory.
>>> - */
>>> - if (cs->base.ring_type != RING_DMA || cs->ws->info.r600_virtual_address) {
>>> - csc->reloc_indices_hashlist[hash] = i;
>>> - return i;
>>> - }
>>> - update_hash = FALSE;
>>> + i = radeon_get_reloc(csc, bo, &reloc);
>>> +
>>> + if (i >= 0) {
>>> + update_reloc(reloc, rd, wd, priority, added_domains);
>>> +
>>> + /* For async DMA, every add_reloc call must add a buffer to the list
>>> + * no matter how many duplicates there are. This is due to the fact
>>> + * the DMA CS checker doesn't use NOP packets for offset patching,
>>> + * but always uses the i-th buffer from the list to patch the i-th
>>> + * offset. If there are N offsets in a DMA CS, there must also be N
>>> + * buffers in the relocation list.
>>> + *
>>> + * This doesn't have to be done if virtual memory is enabled,
>>> + * because there is no offset patching with virtual memory.
>>> + */
>>> + if (cs->base.ring_type != RING_DMA || cs->ws->info.r600_virtual_address) {
>>> + return i;
>>> }
>>> }
>>> @@ -304,9 +286,7 @@ static unsigned radeon_add_reloc(struct radeon_drm_cs *cs,
>>> reloc->flags = priority;
>>> csc->is_handle_added[hash] = TRUE;
>>> - if (update_hash) {
>>> - csc->reloc_indices_hashlist[hash] = csc->crelocs;
>>> - }
>>> + csc->reloc_indices_hashlist[hash] = csc->crelocs;
>>> csc->chunks[1].length_dw += RELOC_DWORDS;
>>> @@ -384,7 +364,7 @@ static void radeon_drm_cs_write_reloc(struct radeon_winsys_cs *rcs,
>>> {
>>> struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
>>> struct radeon_bo *bo = (struct radeon_bo*)buf;
>>> - unsigned index = radeon_get_reloc(cs->csc, bo);
>>> + unsigned index = radeon_get_reloc(cs->csc, bo, NULL);
>>> if (index == -1) {
>>> fprintf(stderr, "radeon: Cannot get a relocation in %s.\n", __func__);
>>> @@ -600,7 +580,7 @@ static boolean radeon_bo_is_referenced(struct radeon_winsys_cs *rcs,
>>> if (!bo->num_cs_references)
>>> return FALSE;
>>> - index = radeon_get_reloc(cs->csc, bo);
>>> + index = radeon_get_reloc(cs->csc, bo, NULL);
>>> if (index == -1)
>>> return FALSE;
>>> diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.h b/src/gallium/winsys/radeon/drm/radeon_drm_cs.h
>>> index ebec161..460e9fa 100644
>>> --- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.h
>>> +++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.h
>>> @@ -80,7 +80,8 @@ struct radeon_drm_cs {
>>> struct radeon_bo *trace_buf;
>>> };
>>> -int radeon_get_reloc(struct radeon_cs_context *csc, struct radeon_bo *bo);
>>> +int radeon_get_reloc(struct radeon_cs_context *csc, struct radeon_bo *bo,
>>> + struct drm_radeon_cs_reloc **out_reloc);
>>> static INLINE struct radeon_drm_cs *
>>> radeon_drm_cs(struct radeon_winsys_cs *base)
>>> @@ -94,7 +95,7 @@ radeon_bo_is_referenced_by_cs(struct radeon_drm_cs *cs,
>>> {
>>> int num_refs = bo->num_cs_references;
>>> return num_refs == bo->rws->num_cs ||
>>> - (num_refs && radeon_get_reloc(cs->csc, bo) != -1);
>>> + (num_refs && radeon_get_reloc(cs->csc, bo, NULL) != -1);
>>> }
>>> static INLINE boolean
>>> @@ -106,7 +107,7 @@ radeon_bo_is_referenced_by_cs_for_write(struct radeon_drm_cs *cs,
>>> if (!bo->num_cs_references)
>>> return FALSE;
>>> - index = radeon_get_reloc(cs->csc, bo);
>>> + index = radeon_get_reloc(cs->csc, bo, NULL);
>>> if (index == -1)
>>> return FALSE;
>>>
>>