[Mesa-dev] [PATCH v4 04/10] anv/allocator: Simplify anv_scratch_pool

Mon Nov 7 22:27:48 UTC 2016

The previous implementation was being overly clever and using the
anv_bo::size field as its mutex.  Scratch pool allocations don't happen
often, will happen at most a fixed number of times, and never happen in the
critical path (they only happen in shader compilation).  We can make this
much simpler by just using the device mutex.  This also means that we can
start using anv_bo_init_new directly on the bo and avoid setting fields
one-at-a-time.

Signed-off-by: Jason Ekstrand <jason at jlekstrand.net>
---
 src/intel/vulkan/anv_allocator.c | 109 ++++++++++++++++++---------------------
 src/intel/vulkan/anv_private.h   |   7 ++-
 2 files changed, 55 insertions(+), 61 deletions(-)

diff --git a/src/intel/vulkan/anv_allocator.c b/src/intel/vulkan/anv_allocator.c
index b846a62..e7cad45 100644
--- a/src/intel/vulkan/anv_allocator.c
+++ b/src/intel/vulkan/anv_allocator.c
@@ -888,9 +888,9 @@ anv_scratch_pool_finish(struct anv_device *device, struct anv_scratch_pool *pool
 {
    for (unsigned s = 0; s < MESA_SHADER_STAGES; s++) {
       for (unsigned i = 0; i < 16; i++) {
-         struct anv_bo *bo = &pool->bos[i][s];
-         if (bo->size > 0)
-            anv_gem_close(device, bo->gem_handle);
+         struct anv_scratch_bo *bo = &pool->bos[i][s];
+         if (bo->exists > 0)
+            anv_gem_close(device, bo->bo.gem_handle);
       }
    }
 }
@@ -905,70 +905,59 @@ anv_scratch_pool_alloc(struct anv_device *device, struct anv_scratch_pool *pool,
    unsigned scratch_size_log2 = ffs(per_thread_scratch / 2048);
    assert(scratch_size_log2 < 16);
 
-   struct anv_bo *bo = &pool->bos[scratch_size_log2][stage];
+   struct anv_scratch_bo *bo = &pool->bos[scratch_size_log2][stage];
 
-   /* From now on, we go into a critical section.  In order to remain
-    * thread-safe, we use the bo size as a lock.  A value of 0 means we don't
-    * have a valid BO yet.  A value of 1 means locked.  A value greater than 1
-    * means we have a bo of the given size.
-    */
+   /* We can use "exists" to shortcut and ignore the critical section */
+   if (bo->exists)
+      return &bo->bo;
 
-   if (bo->size > 1)
-      return bo;
-
-   uint64_t size = __sync_val_compare_and_swap(&bo->size, 0, 1);
-   if (size == 0) {
-      /* We own the lock.  Allocate a buffer */
-
-      const struct anv_physical_device *physical_device =
-         &device->instance->physicalDevice;
-      const struct gen_device_info *devinfo = &physical_device->info;
-
-      /* WaCSScratchSize:hsw
-       *
-       * Haswell's scratch space address calculation appears to be sparse
-       * rather than tightly packed. The Thread ID has bits indicating which
-       * subslice, EU within a subslice, and thread within an EU it is.
-       * There's a maximum of two slices and two subslices, so these can be
-       * stored with a single bit. Even though there are only 10 EUs per
-       * subslice, this is stored in 4 bits, so there's an effective maximum
-       * value of 16 EUs. Similarly, although there are only 7 threads per EU,
-       * this is stored in a 3 bit number, giving an effective maximum value
-       * of 8 threads per EU.
-       *
-       * This means that we need to use 16 * 8 instead of 10 * 7 for the
-       * number of threads per subslice.
-       */
-      const unsigned subslices = MAX2(physical_device->subslice_total, 1);
-      const unsigned scratch_ids_per_subslice =
-         device->info.is_haswell ? 16 * 8 : devinfo->max_cs_threads;
+   pthread_mutex_lock(&device->mutex);
+
+   __sync_synchronize();
+   if (bo->exists)
+      return &bo->bo;
 
-      uint32_t max_threads[] = {
-         [MESA_SHADER_VERTEX]           = devinfo->max_vs_threads,
-         [MESA_SHADER_TESS_CTRL]        = devinfo->max_tcs_threads,
-         [MESA_SHADER_TESS_EVAL]        = devinfo->max_tes_threads,
-         [MESA_SHADER_GEOMETRY]         = devinfo->max_gs_threads,
-         [MESA_SHADER_FRAGMENT]         = devinfo->max_wm_threads,
-         [MESA_SHADER_COMPUTE]          = scratch_ids_per_subslice * subslices,
-      };
+   const struct anv_physical_device *physical_device =
+      &device->instance->physicalDevice;
+   const struct gen_device_info *devinfo = &physical_device->info;
+
+   /* WaCSScratchSize:hsw
+    *
+    * Haswell's scratch space address calculation appears to be sparse
+    * rather than tightly packed. The Thread ID has bits indicating which
+    * subslice, EU within a subslice, and thread within an EU it is.
+    * There's a maximum of two slices and two subslices, so these can be
+    * stored with a single bit. Even though there are only 10 EUs per
+    * subslice, this is stored in 4 bits, so there's an effective maximum
+    * value of 16 EUs. Similarly, although there are only 7 threads per EU,
+    * this is stored in a 3 bit number, giving an effective maximum value
+    * of 8 threads per EU.
+    *
+    * This means that we need to use 16 * 8 instead of 10 * 7 for the
+    * number of threads per subslice.
+    */
+   const unsigned subslices = MAX2(physical_device->subslice_total, 1);
+   const unsigned scratch_ids_per_subslice =
+      device->info.is_haswell ? 16 * 8 : devinfo->max_cs_threads;
 
-      size = per_thread_scratch * max_threads[stage];
+   uint32_t max_threads[] = {
+      [MESA_SHADER_VERTEX]           = devinfo->max_vs_threads,
+      [MESA_SHADER_TESS_CTRL]        = devinfo->max_tcs_threads,
+      [MESA_SHADER_TESS_EVAL]        = devinfo->max_tes_threads,
+      [MESA_SHADER_GEOMETRY]         = devinfo->max_gs_threads,
+      [MESA_SHADER_FRAGMENT]         = devinfo->max_wm_threads,
+      [MESA_SHADER_COMPUTE]          = scratch_ids_per_subslice * subslices,
+   };
 
-      struct anv_bo new_bo;
-      anv_bo_init_new(&new_bo, device, size);
+   uint32_t size = per_thread_scratch * max_threads[stage];
 
-      bo->gem_handle = new_bo.gem_handle;
+   anv_bo_init_new(&bo->bo, device, size);
 
-      /* Set the size last because we use it as a lock */
-      __sync_synchronize();
-      bo->size = size;
+   /* Set the exists last because it may be read by other threads */
+   __sync_synchronize();
+   bo->exists = true;
 
-      futex_wake((uint32_t *)&bo->size, INT_MAX);
-   } else {
-      /* Someone else got here first */
-      while (bo->size == 1)
-         futex_wait((uint32_t *)&bo->size, 1);
-   }
+   pthread_mutex_unlock(&device->mutex);
 
-   return bo;
+   return &bo->bo;
 }
diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
index 7eedf06..5d78d16 100644
--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@@ -448,9 +448,14 @@ VkResult anv_bo_pool_alloc(struct anv_bo_pool *pool, struct anv_bo *bo,
                            uint32_t size);
 void anv_bo_pool_free(struct anv_bo_pool *pool, const struct anv_bo *bo);
 
+struct anv_scratch_bo {
+   bool exists;
+   struct anv_bo bo;
+};
+
 struct anv_scratch_pool {
    /* Indexed by Per-Thread Scratch Space number (the hardware value) and stage */
-   struct anv_bo bos[16][MESA_SHADER_STAGES];
+   struct anv_scratch_bo bos[16][MESA_SHADER_STAGES];
 };
 
 void anv_scratch_pool_init(struct anv_device *device,
-- 
2.5.0.400.gff86faf