[Mesa-dev] [PATCH] winsys/radeon: consolidate hash table lookup

Christian König deathsimple at vodafone.de
Sat Apr 12 04:26:32 PDT 2014


Am 12.04.2014 12:52, schrieb Marek Olšák:
> Do you mean having bo->cs and bo->index? That would work, but there
> would have to be mutex_lock and mutex_unlock when accessing the
> variables. I'm not sure if locking a mutex is more expensive than the
> hash table lookup. OpenGL with GLX allows sharing buffers and textures
> between contexts, so we must play safe.

Obviously thread safety must be taken care of. Maybe we could squash CS 
identifier and index into a 64bit atomic? If we need to lock a mutex for 
every access it would indeed take longer than the hashtable handling.

> Note that if all handles are less than 512, the lookup is always in
> O(1), so it's very cheap. We can increase the number to 1024 or 2048
> if we find out that apps use too many buffers.

Didn't know that there are so many entries in the hashtable; I thought 
that we would rather get a collision every 32 handles or something like this. 
If we have O(1) anyway most of the time, then changing it would probably 
not make much sense.

Christian.

>
> Marek
>
>
>
> On Sat, Apr 12, 2014 at 10:35 AM, Christian König
> <deathsimple at vodafone.de> wrote:
>> Am 11.04.2014 19:03, schrieb Marek Olšák:
>>> From: Marek Olšák <marek.olsak at amd.com>
>>
>> Reviewed-by: Christian König <christian.koenig at amd.com>
>>
>> BTW:
>>
>> I've always wondered if the custom hash table is the best approach here.
>> Having a BO active in more than one command submission context at the same
>> time sounds rather unlikely to me.
>>
>> Something like storing a pointer to the last used CS and its index in the
>> BO and only when a BO is really used in more than one CS context at the same
>> time fall back to the hashtable and look up the index from there sounds like
>> less overhead.
>>
>>
>>> I should have done this long ago.
>>> ---
>>>    src/gallium/winsys/radeon/drm/radeon_drm_cs.c | 110
>>> +++++++++++---------------
>>>    src/gallium/winsys/radeon/drm/radeon_drm_cs.h |   7 +-
>>>    2 files changed, 49 insertions(+), 68 deletions(-)
>>>
>>> diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
>>> b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
>>> index db9fbfa..b55eb80 100644
>>> --- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
>>> +++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
>>> @@ -188,41 +188,40 @@ static INLINE void update_reloc(struct
>>> drm_radeon_cs_reloc *reloc,
>>>        reloc->flags = MAX2(reloc->flags, priority);
>>>    }
>>>    -int radeon_get_reloc(struct radeon_cs_context *csc, struct radeon_bo
>>> *bo)
>>> +int radeon_get_reloc(struct radeon_cs_context *csc, struct radeon_bo *bo,
>>> +                     struct drm_radeon_cs_reloc **out_reloc)
>>>    {
>>> -    struct drm_radeon_cs_reloc *reloc;
>>> -    unsigned i;
>>> +    struct drm_radeon_cs_reloc *reloc = NULL;
>>>        unsigned hash = bo->handle & (sizeof(csc->is_handle_added)-1);
>>> +    int i = -1;
>>>          if (csc->is_handle_added[hash]) {
>>>            i = csc->reloc_indices_hashlist[hash];
>>>            reloc = &csc->relocs[i];
>>> -        if (reloc->handle == bo->handle) {
>>> -            return i;
>>> -        }
>>>    -        /* Hash collision, look for the BO in the list of relocs
>>> linearly. */
>>> -        for (i = csc->crelocs; i != 0;) {
>>> -            --i;
>>> -            reloc = &csc->relocs[i];
>>> -            if (reloc->handle == bo->handle) {
>>> -                /* Put this reloc in the hash list.
>>> -                 * This will prevent additional hash collisions if there
>>> are
>>> -                 * several consecutive get_reloc calls for the same
>>> buffer.
>>> -                 *
>>> -                 * Example: Assuming buffers A,B,C collide in the hash
>>> list,
>>> -                 * the following sequence of relocs:
>>> -                 *         AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
>>> -                 * will collide here: ^ and here:   ^,
>>> -                 * meaning that we should get very few collisions in the
>>> end. */
>>> -                csc->reloc_indices_hashlist[hash] = i;
>>> -                /*printf("write_reloc collision, hash: %i, handle: %i\n",
>>> hash, bo->handle);*/
>>> -                return i;
>>> +        if (reloc->handle != bo->handle) {
>>> +            /* Hash collision, look for the BO in the list of relocs
>>> linearly. */
>>> +            for (i = csc->crelocs - 1; i >= 0; i--) {
>>> +                reloc = &csc->relocs[i];
>>> +                if (reloc->handle == bo->handle) {
>>> +                    /* Put this reloc in the hash list.
>>> +                     * This will prevent additional hash collisions if
>>> there are
>>> +                     * several consecutive get_reloc calls for the same
>>> buffer.
>>> +                     *
>>> +                     * Example: Assuming buffers A,B,C collide in the
>>> hash list,
>>> +                     * the following sequence of relocs:
>>> +                     *         AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
>>> +                     * will collide here: ^ and here:   ^,
>>> +                     * meaning that we should get very few collisions in
>>> the end. */
>>> +                    csc->reloc_indices_hashlist[hash] = i;
>>> +                    break;
>>> +                }
>>>                }
>>>            }
>>>        }
>>> -
>>> -    return -1;
>>> +    if (out_reloc)
>>> +        *out_reloc = reloc;
>>> +    return i;
>>>    }
>>>      static unsigned radeon_add_reloc(struct radeon_drm_cs *cs,
>>> @@ -237,45 +236,28 @@ static unsigned radeon_add_reloc(struct
>>> radeon_drm_cs *cs,
>>>        unsigned hash = bo->handle & (sizeof(csc->is_handle_added)-1);
>>>        enum radeon_bo_domain rd = usage & RADEON_USAGE_READ ? domains : 0;
>>>        enum radeon_bo_domain wd = usage & RADEON_USAGE_WRITE ? domains : 0;
>>> -    bool update_hash = TRUE;
>>> -    int i;
>>> +    int i = -1;
>>>          priority = MIN2(priority, 15);
>>>        *added_domains = 0;
>>>    -    if (csc->is_handle_added[hash]) {
>>> -        i = csc->reloc_indices_hashlist[hash];
>>> -        reloc = &csc->relocs[i];
>>> -
>>> -        if (reloc->handle != bo->handle) {
>>> -            /* Hash collision, look for the BO in the list of relocs
>>> linearly. */
>>> -            for (i = csc->crelocs - 1; i >= 0; i--) {
>>> -                reloc = &csc->relocs[i];
>>> -                if (reloc->handle == bo->handle) {
>>> -                    /*printf("write_reloc collision, hash: %i, handle:
>>> %i\n", hash, bo->handle);*/
>>> -                    break;
>>> -                }
>>> -            }
>>> -        }
>>> -
>>> -        if (i >= 0) {
>>> -            update_reloc(reloc, rd, wd, priority, added_domains);
>>> -
>>> -            /* For async DMA, every add_reloc call must add a buffer to
>>> the list
>>> -             * no matter how many duplicates there are. This is due to
>>> the fact
>>> -             * the DMA CS checker doesn't use NOP packets for offset
>>> patching,
>>> -             * but always uses the i-th buffer from the list to patch the
>>> i-th
>>> -             * offset. If there are N offsets in a DMA CS, there must
>>> also be N
>>> -             * buffers in the relocation list.
>>> -             *
>>> -             * This doesn't have to be done if virtual memory is enabled,
>>> -             * because there is no offset patching with virtual memory.
>>> -             */
>>> -            if (cs->base.ring_type != RING_DMA ||
>>> cs->ws->info.r600_virtual_address) {
>>> -                csc->reloc_indices_hashlist[hash] = i;
>>> -                return i;
>>> -            }
>>> -            update_hash = FALSE;
>>> +    i = radeon_get_reloc(csc, bo, &reloc);
>>> +
>>> +    if (i >= 0) {
>>> +        update_reloc(reloc, rd, wd, priority, added_domains);
>>> +
>>> +        /* For async DMA, every add_reloc call must add a buffer to the
>>> list
>>> +         * no matter how many duplicates there are. This is due to the
>>> fact
>>> +         * the DMA CS checker doesn't use NOP packets for offset
>>> patching,
>>> +         * but always uses the i-th buffer from the list to patch the
>>> i-th
>>> +         * offset. If there are N offsets in a DMA CS, there must also be
>>> N
>>> +         * buffers in the relocation list.
>>> +         *
>>> +         * This doesn't have to be done if virtual memory is enabled,
>>> +         * because there is no offset patching with virtual memory.
>>> +         */
>>> +        if (cs->base.ring_type != RING_DMA ||
>>> cs->ws->info.r600_virtual_address) {
>>> +            return i;
>>>            }
>>>        }
>>>    @@ -304,9 +286,7 @@ static unsigned radeon_add_reloc(struct
>>> radeon_drm_cs *cs,
>>>        reloc->flags = priority;
>>>          csc->is_handle_added[hash] = TRUE;
>>> -    if (update_hash) {
>>> -        csc->reloc_indices_hashlist[hash] = csc->crelocs;
>>> -    }
>>> +    csc->reloc_indices_hashlist[hash] = csc->crelocs;
>>>          csc->chunks[1].length_dw += RELOC_DWORDS;
>>>    @@ -384,7 +364,7 @@ static void radeon_drm_cs_write_reloc(struct
>>> radeon_winsys_cs *rcs,
>>>    {
>>>        struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
>>>        struct radeon_bo *bo = (struct radeon_bo*)buf;
>>> -    unsigned index = radeon_get_reloc(cs->csc, bo);
>>> +    unsigned index = radeon_get_reloc(cs->csc, bo, NULL);
>>>          if (index == -1) {
>>>            fprintf(stderr, "radeon: Cannot get a relocation in %s.\n",
>>> __func__);
>>> @@ -600,7 +580,7 @@ static boolean radeon_bo_is_referenced(struct
>>> radeon_winsys_cs *rcs,
>>>        if (!bo->num_cs_references)
>>>            return FALSE;
>>>    -    index = radeon_get_reloc(cs->csc, bo);
>>> +    index = radeon_get_reloc(cs->csc, bo, NULL);
>>>        if (index == -1)
>>>            return FALSE;
>>>    diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.h
>>> b/src/gallium/winsys/radeon/drm/radeon_drm_cs.h
>>> index ebec161..460e9fa 100644
>>> --- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.h
>>> +++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.h
>>> @@ -80,7 +80,8 @@ struct radeon_drm_cs {
>>>        struct radeon_bo                    *trace_buf;
>>>    };
>>>    -int radeon_get_reloc(struct radeon_cs_context *csc, struct radeon_bo
>>> *bo);
>>> +int radeon_get_reloc(struct radeon_cs_context *csc, struct radeon_bo *bo,
>>> +                     struct drm_radeon_cs_reloc **out_reloc);
>>>      static INLINE struct radeon_drm_cs *
>>>    radeon_drm_cs(struct radeon_winsys_cs *base)
>>> @@ -94,7 +95,7 @@ radeon_bo_is_referenced_by_cs(struct radeon_drm_cs *cs,
>>>    {
>>>        int num_refs = bo->num_cs_references;
>>>        return num_refs == bo->rws->num_cs ||
>>> -           (num_refs && radeon_get_reloc(cs->csc, bo) != -1);
>>> +           (num_refs && radeon_get_reloc(cs->csc, bo, NULL) != -1);
>>>    }
>>>      static INLINE boolean
>>> @@ -106,7 +107,7 @@ radeon_bo_is_referenced_by_cs_for_write(struct
>>> radeon_drm_cs *cs,
>>>        if (!bo->num_cs_references)
>>>            return FALSE;
>>>    -    index = radeon_get_reloc(cs->csc, bo);
>>> +    index = radeon_get_reloc(cs->csc, bo, NULL);
>>>        if (index == -1)
>>>            return FALSE;
>>>
>>



More information about the mesa-dev mailing list