[Mesa-dev] [PATCH 3/5] mesa: Import a copy of the open-addressing hash table code I wrote.

Wed Oct 31 11:33:17 PDT 2012

I look forward to seeing this improved hash table land.

Great that it includes tests. I failed to find a test that artificially
constructed collisions and verified that they were handled correctly.
Maybe I'll submit a test for that.

Comments below. Everything looked correct, so my review mostly asks for
an occasional clarifying comment.

On 10/25/2012 09:13 AM, Eric Anholt wrote:
> Mesa's chaining hash table for object names is slow, and this should be much
> faster.  I namespaced the functions under _mesa_*, to avoid visibility
> troubles that we may have had before with hash_table_* functions. 

> The
> hash_table.c file unfortunately lives in program/ still to avoid confusion
> with automake's dependency files that would otherwise require a make distclean
> across this change.

The end result is absurd. There is a .c/.h pair in a directory that don't match
up.  It looks like this:
  program/hash_table.c implements main/hash_table.h
  program/hash_table.h is header for program/chaining_hash_table.c

> diff --git a/src/mesa/program/hash_table.c b/src/mesa/program/hash_table.c
> new file mode 100644
> index 0000000..ba49437
> --- /dev/null
> +++ b/src/mesa/program/hash_table.c
> @@ -0,0 +1,431 @@
> +/*
> + * Copyright © 2009,2012 Intel Corporation

> +
> +/**
> + * Implements an open-addressing, linear-reprobing hash table.
> + *
> + * For more information, see:
> + *
> + * http://cgit.freedesktop.org/~anholt/hash_table/tree/README
> + */
> +
> +#include <stdlib.h>
> +#include <string.h>
> +
> +#include "main/hash_table.h"
> +#include "ralloc.h"
> +
> +#define ARRAY_SIZE(array) (sizeof(array) / sizeof(array[0]))
> +

Static please.
> +uint32_t deleted_key_value;

> +/**
> + * Finds a hash table entry with the given key and hash of that key.
> + *
> + * Returns NULL if no entry is found.  Note that the data pointer may be
> + * modified by the user.
> + */
> +struct hash_entry *
> +_mesa_hash_table_search(struct hash_table *ht, uint32_t hash,
> +                        const void *key)
> +{
> +   uint32_t hash_address;
> +
> +   hash_address = hash % ht->size;
> +   do {
> +      uint32_t double_hash;
> +
> +      struct hash_entry *entry = ht->table + hash_address;
> +
> +      if (entry_is_free(entry)) {
> +         return NULL;
> +      } else if (entry_is_present(ht, entry) && entry->hash == hash) {
> +         if (ht->key_equals_function(key, entry->key)) {
> +            return entry;
> +         }
> +      }
> +
> +      double_hash = 1 + hash % ht->rehash;
> +
> +      hash_address = (hash_address + double_hash) % ht->size;

The while condition looks mystic. A comment here would be nice
explaining that we break the loop because we've cycled around to the
first probed address. Or simply a self-documenting variable name would
suffice.

> +   } while (hash_address != hash % ht->size);
> +
> +   return NULL;
> +}

> +/**
> + * Inserts the key with the given hash into the table.
> + *
> + * Note that insertion may rearrange the table on a resize or rehash,
> + * so previously found hash_entries are no longer valid after this function.
> + */
> +struct hash_entry *
> +_mesa_hash_table_insert(struct hash_table *ht, uint32_t hash,
> +                        const void *key, void *data)
> +{
> +   uint32_t hash_address;
> +
> +   if (ht->entries >= ht->max_entries) {
> +      _mesa_hash_table_rehash(ht, ht->size_index + 1);
> +   } else if (ht->deleted_entries + ht->entries >= ht->max_entries) {
> +      _mesa_hash_table_rehash(ht, ht->size_index);
> +   }
> +
> +   hash_address = hash % ht->size;
> +   do {
> +      struct hash_entry *entry = ht->table + hash_address;
> +      uint32_t double_hash;
> +
> +      if (!entry_is_present(ht, entry)) {
> +         if (entry_is_deleted(ht, entry))
> +            ht->deleted_entries--;
> +         entry->hash = hash;
> +         entry->key = key;
> +         entry->data = data;
> +         ht->entries++;
> +         return entry;
> +      }
> +
> +      /* Implement replacement when another insert happens
> +       * with a matching key.  This is a relatively common
> +       * feature of hash tables, with the alternative
> +       * generally being "insert the new value as well, and
> +       * return it first when the key is searched for".
> +       *
> +       * Note that the hash table doesn't have a delete
> +       * callback.  If freeing of old data pointers is
> +       * required to avoid memory leaks, perform a search
> +       * before inserting.
> +       */
> +      if (entry->hash == hash &&
> +          ht->key_equals_function(key, entry->key)) {
> +         entry->key = key;
> +         entry->data = data;
> +         return entry;
> +      }
> +
> +
> +      double_hash = 1 + hash % ht->rehash;
> +
> +      hash_address = (hash_address + double_hash) % ht->size;

Ditto as for _mesa_hash_table_search().

> +   } while (hash_address != hash % ht->size);
> +
> +   /* We could hit here if a required resize failed. An unchecked-malloc
> +    * application could ignore this result.
> +    */
> +   return NULL;
> +}

Please document here that 'predicate' is ignored if null.

> +struct hash_entry *
> +_mesa_hash_table_random_entry(struct hash_table *ht,
> +                              bool (*predicate)(struct hash_entry *entry))
> +{
> +   struct hash_entry *entry;
> +   uint32_t i = random() % ht->size;
> +
> +   if (ht->entries == 0)
> +      return NULL;
> +
> +   for (entry = ht->table + i; entry != ht->table + ht->size; entry++) {
> +      if (entry_is_present(ht, entry) &&
> +          (!predicate || predicate(entry))) {
> +         return entry;
> +      }
> +   }
> +
> +   for (entry = ht->table; entry != ht->table + i; entry++) {
> +      if (entry_is_present(ht, entry) &&
> +          (!predicate || predicate(entry))) {
> +         return entry;
> +      }
> +   }
> +
> +   return NULL;
> +}

> +/**
> + * Quick FNV-1 hash implementation based on:
> + * http://www.isthe.com/chongo/tech/comp/fnv/
> + *
> + * FNV-1 is not be the best hash out there -- Jenkins's lookup3 is supposed to
> + * be quite good, and it probably beats FNV.  But FNV has the advantage that
> + * it involves almost no code.  For an improvement on both, see Paul
> + * Hsieh's //www.azillionmonkeys.com/qed/hash.html
> + */
> +uint32_t
> +_mesa_hash_data(const void *key, size_t size)

This prototype differs from the one in hash_table.h:
    _mesa_hash_data(const void *data, size_t size);
Should it be 'data' or 'key'?

> +{
> +   uint32_t hash = 2166136261ul;
> +   const uint8_t *data = key;
> +
> +   while (size-- != 0) {
> +      hash ^= *data;
> +      hash = hash * 0x01000193;
> +      data++;
> +   }
> +
> +   return hash;
> +}
> +
> +/** FNV-1 string hash implementation */
> +uint32_t
> +_mesa_hash_string(const void *key)

I expect the signature here to be _mesa_hash_string(const char *key). I believe
using void* will result in a future misuse of the function that would have been
caught by the type system.

Why did you choose void*?

> +{
> +   uint32_t hash = 2166136261ul;
> +   const unsigned char *str = key;
> +
> +   while (*str != 0) {
> +      hash ^= *str;
> +      hash = hash * 0x01000193;
> +      str++;
> +   }
> +
> +   return hash;
> +}
> +
> +/** String compare function for a hash table. */

A comment here stating that _mesa_key_string_equal() is intended to be passed
to _mesa_hash_table_create() would be nice.

> +bool
> +_mesa_key_string_equal(const void *a, const void *b)
> +{
> +   return strcmp(a, b) == 0;
> +}

As for _mesa_key_string_equal(), a comment here would be nice stating
that _mesa_key_value_equal() function is intended to be passed to
_mesa_hash_table_create().

Also, I find the name of this function very confusing. "It's comparing
the values of what? The pointers? The pointed-to's? How does it compare
the pointed-to's of void pointers?" I think renaming this to
_mesa_key_pointer_equal() makes it immediately clear what this function does:
it compares pointers.

> +bool
> +_mesa_key_value_equal(const void *a, const void *b)
> +{
> +   return a == b;
> +}
> diff --git a/src/mesa/sources.mak b/src/mesa/sources.mak
> index d51adf5..13ba16a 100644
> --- a/src/mesa/sources.mak
> +++ b/src/mesa/sources.mak
> @@ -249,6 +249,7 @@ STATETRACKER_FILES = \
>  PROGRAM_FILES = \
>  	$(SRCDIR)program/arbprogparse.c \
>  	$(SRCDIR)program/chaining_hash_table.c \
> +	$(SRCDIR)program/hash_table.c \
>  	$(SRCDIR)program/program.c \
>  	$(SRCDIR)program/program_parse_extra.c \
>  	$(SRCDIR)program/prog_cache.c \
>