[Intel-gfx] [PATCH] intel: Stop using sized buckets for the BO cache.

Eric Anholt eric at anholt.net
Mon Jun 7 08:52:59 CEST 2010


The theory is that we don't keep *that* many buffers around, so the
pain of walking the list to find an exactly-sized buffer is less than
the pain of using gratuitous amounts of system memory for our BOs.

This brings my GM45 gnome desktop with 2 terminals from 39MB allocated
(13.8MB pinned) to 32MB allocated (11.0MB pinned), and the same
desktop with an openarena demo running from 69MB to 55MB.
---

This is the patch I'm currently testing on my desktop to deal with a
couple of recent reports of the BO cache's overallocation seriously
hurting some interesting workloads (HD video playback and UrbanTerror
on gen3).  I'd love to see test reports from more than just me, and
especially some performance numbers on CPU-bound apps if anyone can
come up with some, since I'm having a hard time getting stable numbers
out of my own system right now.

You'll want the 2D driver patch if you're testing this on your x201s,
though.
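
As a purely illustrative aside (not part of the patch, and the helper
names below are made up), here's a tiny standalone C program showing
the kind of waste the old power-of-two bucket rounding could cause
compared to the plain page alignment this patch switches to:

/* Standalone illustration only -- not part of the patch. */
#include <stdio.h>
#include <unistd.h>

/* Old scheme: round the request up to the next power-of-two bucket,
 * starting at one page.
 */
static unsigned long round_to_bucket(unsigned long size)
{
	unsigned long bucket = 4096;

	while (bucket < size)
		bucket *= 2;
	return bucket;
}

/* New scheme: just round the request up to a page boundary. */
static unsigned long align_to_page(unsigned long size, unsigned long page)
{
	return (size + page - 1) & ~(page - 1);
}

int main(void)
{
	unsigned long page = getpagesize();
	unsigned long size = 640 * 480 * 4;	/* ~1.2MB ARGB surface */

	printf("requested %lu, bucketed %lu, page-aligned %lu\n",
	       size, round_to_bucket(size), align_to_page(size, page));
	return 0;
}

For the ~1.2MB surface in that example, the old scheme hands back a
2MB object (roughly 850KB of slack per buffer), while page alignment
allocates only what was asked for.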

 intel/intel_bufmgr_gem.c |  296 +++++++++++++++++++++-------------------------
 1 files changed, 136 insertions(+), 160 deletions(-)

diff --git a/intel/intel_bufmgr_gem.c b/intel/intel_bufmgr_gem.c
index 398c7d0..eebcf4a 100644
--- a/intel/intel_bufmgr_gem.c
+++ b/intel/intel_bufmgr_gem.c
@@ -68,40 +68,7 @@
 
 typedef struct _drm_intel_bo_gem drm_intel_bo_gem;
 
-struct drm_intel_gem_bo_bucket {
-	drmMMListHead head;
-	unsigned long size;
-};
-
-/* Only cache objects up to 64MB.  Bigger than that, and the rounding of the
- * size makes many operations fail that wouldn't otherwise.
- */
-#define DRM_INTEL_GEM_BO_BUCKETS	14
-typedef struct _drm_intel_bufmgr_gem {
-	drm_intel_bufmgr bufmgr;
-
-	int fd;
-
-	int max_relocs;
-
-	pthread_mutex_t lock;
-
-	struct drm_i915_gem_exec_object *exec_objects;
-	struct drm_i915_gem_exec_object2 *exec2_objects;
-	drm_intel_bo **exec_bos;
-	int exec_size;
-	int exec_count;
-
-	/** Array of lists of cached gem objects of power-of-two sizes */
-	struct drm_intel_gem_bo_bucket cache_bucket[DRM_INTEL_GEM_BO_BUCKETS];
-
-	uint64_t gtt_size;
-	int available_fences;
-	int pci_device;
-	int gen;
-	char bo_reuse;
-	char fenced_relocs;
-} drm_intel_bufmgr_gem;
+#define BO_CACHE_MAX_SIZE (64 * 1024 * 1024)
 
 #define DRM_INTEL_RELOC_FENCE (1<<0)
 
@@ -150,7 +117,8 @@ struct _drm_intel_bo_gem {
 	void *gtt_virtual;
 
 	/** BO cache list */
-	drmMMListHead head;
+	drm_intel_bo_gem *next;
+	drm_intel_bo_gem *prev;
 
 	/**
 	 * Boolean of whether this BO and its children have been included in
@@ -190,6 +158,37 @@ struct _drm_intel_bo_gem {
 	int reloc_tree_fences;
 };
 
+typedef struct _drm_intel_bufmgr_gem {
+	drm_intel_bufmgr bufmgr;
+
+	int fd;
+
+	int max_relocs;
+
+	pthread_mutex_t lock;
+
+	struct drm_i915_gem_exec_object *exec_objects;
+	struct drm_i915_gem_exec_object2 *exec2_objects;
+	drm_intel_bo **exec_bos;
+	int exec_size;
+	int exec_count;
+
+	/**
+	 * List structure for the BO cache.  It would be nice to not
+	 * store this thing as a bo_gem but just a pair of pointers,
+	 * but the libdrm list macros are sadly not the linux list
+	 * macros.
+	 */
+	drm_intel_bo_gem bo_cache;
+
+	uint64_t gtt_size;
+	int available_fences;
+	int pci_device;
+	int gen;
+	char bo_reuse;
+	char fenced_relocs;
+} drm_intel_bufmgr_gem;
+
 static unsigned int
 drm_intel_gem_estimate_batch_space(drm_intel_bo ** bo_array, int count);
 
@@ -279,23 +278,6 @@ drm_intel_gem_bo_tile_pitch(drm_intel_bufmgr_gem *bufmgr_gem,
 	return i;
 }
 
-static struct drm_intel_gem_bo_bucket *
-drm_intel_gem_bo_bucket_for_size(drm_intel_bufmgr_gem *bufmgr_gem,
-				 unsigned long size)
-{
-	int i;
-
-	for (i = 0; i < DRM_INTEL_GEM_BO_BUCKETS; i++) {
-		struct drm_intel_gem_bo_bucket *bucket =
-		    &bufmgr_gem->cache_bucket[i];
-		if (bucket->size >= size) {
-			return bucket;
-		}
-	}
-
-	return NULL;
-}
-
 static void
 drm_intel_gem_dump_validation_list(drm_intel_bufmgr_gem *bufmgr_gem)
 {
@@ -529,23 +511,84 @@ drm_intel_gem_bo_madvise(drm_intel_bo *bo, int madv)
 
 /* drop the oldest entries that have been purged by the kernel */
 static void
-drm_intel_gem_bo_cache_purge_bucket(drm_intel_bufmgr_gem *bufmgr_gem,
-				    struct drm_intel_gem_bo_bucket *bucket)
+drm_intel_gem_bo_cache_purge(drm_intel_bufmgr_gem *bufmgr_gem)
 {
-	while (!DRMLISTEMPTY(&bucket->head)) {
-		drm_intel_bo_gem *bo_gem;
+	while (!DRMLISTEMPTY(&bufmgr_gem->bo_cache)) {
+		drm_intel_bo_gem *bo_gem = bufmgr_gem->bo_cache.next;
 
-		bo_gem = DRMLISTENTRY(drm_intel_bo_gem,
-				      bucket->head.next, head);
 		if (drm_intel_gem_bo_madvise_internal
 		    (bufmgr_gem, bo_gem, I915_MADV_DONTNEED))
 			break;
 
-		DRMLISTDEL(&bo_gem->head);
+		DRMLISTDEL(bo_gem);
 		drm_intel_gem_bo_free(&bo_gem->bo);
 	}
 }
 
+/* For non-render-target BOs (where we're probably going to map it
+ * first thing in order to fill it with data), check if the last
+ * appropriately-sized BO in the cache is unbusy, and only reuse in
+ * that case. Otherwise, allocating a new buffer is probably faster
+ * than waiting for the GPU to finish.
+ */
+static drm_intel_bo_gem *
+find_cached_bo_for_cpu(drm_intel_bufmgr *bufmgr, unsigned long size)
+{
+	drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bufmgr;
+	drm_intel_bo_gem *bo_gem, *temp;
+
+retry:
+	DRMLISTFOREACHSAFE(bo_gem, temp, &bufmgr_gem->bo_cache) {
+		if (bo_gem->bo.size != size)
+			continue;
+
+		if (drm_intel_gem_bo_busy(&bo_gem->bo))
+			return NULL;
+
+		DRMLISTDEL(bo_gem);
+
+		if (!drm_intel_gem_bo_madvise_internal(bufmgr_gem, bo_gem,
+						       I915_MADV_WILLNEED)) {
+			drm_intel_gem_bo_free(&bo_gem->bo);
+			drm_intel_gem_bo_cache_purge(bufmgr_gem);
+			goto retry;
+		} else {
+			return bo_gem;
+		}
+	}
+
+	return NULL;
+}
+
+/* Allocate new render-target BOs from the tail (MRU) of the list, as
+ * it will likely be hot in the GPU cache and in the aperture for us.
+ */
+static drm_intel_bo_gem *
+find_cached_bo_for_render(drm_intel_bufmgr *bufmgr, unsigned long size)
+{
+	drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bufmgr;
+	drm_intel_bo_gem *bo_gem, *temp;
+
+retry:
+	DRMLISTFOREACHSAFEREVERSE(bo_gem, temp, &bufmgr_gem->bo_cache) {
+		if (bo_gem->bo.size != size)
+			continue;
+
+		DRMLISTDEL(bo_gem);
+
+		if (!drm_intel_gem_bo_madvise_internal(bufmgr_gem, bo_gem,
+						       I915_MADV_WILLNEED)) {
+			drm_intel_gem_bo_free(&bo_gem->bo);
+			drm_intel_gem_bo_cache_purge(bufmgr_gem);
+			goto retry;
+		} else {
+			return bo_gem;
+		}
+	}
+
+	return NULL;
+}
+
 static drm_intel_bo *
 drm_intel_gem_bo_alloc_internal(drm_intel_bufmgr *bufmgr,
 				const char *name,
@@ -553,83 +596,35 @@ drm_intel_gem_bo_alloc_internal(drm_intel_bufmgr *bufmgr,
 				unsigned long flags)
 {
 	drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bufmgr;
-	drm_intel_bo_gem *bo_gem;
+	drm_intel_bo_gem *bo_gem = NULL;
 	unsigned int page_size = getpagesize();
 	int ret;
-	struct drm_intel_gem_bo_bucket *bucket;
-	int alloc_from_cache;
-	unsigned long bo_size;
 	int for_render = 0;
 
 	if (flags & BO_ALLOC_FOR_RENDER)
 		for_render = 1;
 
-	/* Round the allocated size up to a power of two number of pages. */
-	bucket = drm_intel_gem_bo_bucket_for_size(bufmgr_gem, size);
+	size = ALIGN(size, page_size);
 
-	/* If we don't have caching at this size, don't actually round the
-	 * allocation up.
-	 */
-	if (bucket == NULL) {
-		bo_size = size;
-		if (bo_size < page_size)
-			bo_size = page_size;
-	} else {
-		bo_size = bucket->size;
-	}
-
-	pthread_mutex_lock(&bufmgr_gem->lock);
-	/* Get a buffer out of the cache if available */
-retry:
-	alloc_from_cache = 0;
-	if (bucket != NULL && !DRMLISTEMPTY(&bucket->head)) {
-		if (for_render) {
-			/* Allocate new render-target BOs from the tail (MRU)
-			 * of the list, as it will likely be hot in the GPU
-			 * cache and in the aperture for us.
-			 */
-			bo_gem = DRMLISTENTRY(drm_intel_bo_gem,
-					      bucket->head.prev, head);
-			DRMLISTDEL(&bo_gem->head);
-			alloc_from_cache = 1;
-		} else {
-			/* For non-render-target BOs (where we're probably
-			 * going to map it first thing in order to fill it
-			 * with data), check if the last BO in the cache is
-			 * unbusy, and only reuse in that case. Otherwise,
-			 * allocating a new buffer is probably faster than
-			 * waiting for the GPU to finish.
-			 */
-			bo_gem = DRMLISTENTRY(drm_intel_bo_gem,
-					      bucket->head.next, head);
-			if (!drm_intel_gem_bo_busy(&bo_gem->bo)) {
-				alloc_from_cache = 1;
-				DRMLISTDEL(&bo_gem->head);
-			}
-		}
-
-		if (alloc_from_cache) {
-			if (!drm_intel_gem_bo_madvise_internal
-			    (bufmgr_gem, bo_gem, I915_MADV_WILLNEED)) {
-				drm_intel_gem_bo_free(&bo_gem->bo);
-				drm_intel_gem_bo_cache_purge_bucket(bufmgr_gem,
-								    bucket);
-				goto retry;
-			}
-		}
+	if (size < BO_CACHE_MAX_SIZE) {
+		pthread_mutex_lock(&bufmgr_gem->lock);
+		if (flags & BO_ALLOC_FOR_RENDER)
+			bo_gem = find_cached_bo_for_render(bufmgr, size);
+		else
+			bo_gem = find_cached_bo_for_cpu(bufmgr, size);
+		pthread_mutex_unlock(&bufmgr_gem->lock);
 	}
-	pthread_mutex_unlock(&bufmgr_gem->lock);
 
-	if (!alloc_from_cache) {
+	if (!bo_gem) {
 		struct drm_i915_gem_create create;
 
 		bo_gem = calloc(1, sizeof(*bo_gem));
 		if (!bo_gem)
 			return NULL;
 
-		bo_gem->bo.size = bo_size;
+		bo_gem->bo.size = size;
 		memset(&create, 0, sizeof(create));
-		create.size = bo_size;
+		create.size = size;
 
 		do {
 			ret = ioctl(bufmgr_gem->fd,
@@ -653,7 +648,8 @@ retry:
 	bo_gem->has_error = 0;
 	bo_gem->tiling_mode = I915_TILING_NONE;
 	bo_gem->swizzle_mode = I915_BIT_6_SWIZZLE_NONE;
-	bo_gem->reusable = 1;
+	if (size < BO_CACHE_MAX_SIZE)
+		bo_gem->reusable = 1;
 
 	drm_intel_bo_gem_set_in_aperture_size(bufmgr_gem, bo_gem);
 
@@ -820,24 +816,16 @@ drm_intel_gem_bo_free(drm_intel_bo *bo)
 static void
 drm_intel_gem_cleanup_bo_cache(drm_intel_bufmgr_gem *bufmgr_gem, time_t time)
 {
-	int i;
-
-	for (i = 0; i < DRM_INTEL_GEM_BO_BUCKETS; i++) {
-		struct drm_intel_gem_bo_bucket *bucket =
-		    &bufmgr_gem->cache_bucket[i];
-
-		while (!DRMLISTEMPTY(&bucket->head)) {
-			drm_intel_bo_gem *bo_gem;
+	while (!DRMLISTEMPTY(&bufmgr_gem->bo_cache)) {
+		drm_intel_bo_gem *bo_gem;
 
-			bo_gem = DRMLISTENTRY(drm_intel_bo_gem,
-					      bucket->head.next, head);
-			if (time - bo_gem->free_time <= 1)
-				break;
+		bo_gem = bufmgr_gem->bo_cache.next;
+		if (time - bo_gem->free_time <= 1)
+			break;
 
-			DRMLISTDEL(&bo_gem->head);
+		DRMLISTDEL(bo_gem);
 
-			drm_intel_gem_bo_free(&bo_gem->bo);
-		}
+		drm_intel_gem_bo_free(&bo_gem->bo);
 	}
 }
 
@@ -846,7 +834,6 @@ drm_intel_gem_bo_unreference_final(drm_intel_bo *bo, time_t time)
 {
 	drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bo->bufmgr;
 	drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *) bo;
-	struct drm_intel_gem_bo_bucket *bucket;
 	uint32_t tiling_mode;
 	int i;
 
@@ -872,10 +859,9 @@ drm_intel_gem_bo_unreference_final(drm_intel_bo *bo, time_t time)
 		bo_gem->relocs = NULL;
 	}
 
-	bucket = drm_intel_gem_bo_bucket_for_size(bufmgr_gem, bo->size);
 	/* Put the buffer into our internal cache for reuse if we can. */
 	tiling_mode = I915_TILING_NONE;
-	if (bufmgr_gem->bo_reuse && bo_gem->reusable && bucket != NULL &&
+	if (bufmgr_gem->bo_reuse && bo_gem->reusable &&
 	    drm_intel_gem_bo_set_tiling(bo, &tiling_mode, 0) == 0 &&
 	    drm_intel_gem_bo_madvise_internal(bufmgr_gem, bo_gem,
 					      I915_MADV_DONTNEED)) {
@@ -884,7 +870,7 @@ drm_intel_gem_bo_unreference_final(drm_intel_bo *bo, time_t time)
 		bo_gem->name = NULL;
 		bo_gem->validate_index = -1;
 
-		DRMLISTADDTAIL(&bo_gem->head, &bucket->head);
+		DRMLISTADDTAIL(bo_gem, &bufmgr_gem->bo_cache);
 
 		drm_intel_gem_cleanup_bo_cache(bufmgr_gem, time);
 	} else {
@@ -1241,7 +1227,7 @@ static void
 drm_intel_bufmgr_gem_destroy(drm_intel_bufmgr *bufmgr)
 {
 	drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bufmgr;
-	int i;
+	drm_intel_bo_gem *bo_gem;
 
 	free(bufmgr_gem->exec2_objects);
 	free(bufmgr_gem->exec_objects);
@@ -1250,18 +1236,12 @@ drm_intel_bufmgr_gem_destroy(drm_intel_bufmgr *bufmgr)
 	pthread_mutex_destroy(&bufmgr_gem->lock);
 
 	/* Free any cached buffer objects we were going to reuse */
-	for (i = 0; i < DRM_INTEL_GEM_BO_BUCKETS; i++) {
-		struct drm_intel_gem_bo_bucket *bucket =
-		    &bufmgr_gem->cache_bucket[i];
-		drm_intel_bo_gem *bo_gem;
 
-		while (!DRMLISTEMPTY(&bucket->head)) {
-			bo_gem = DRMLISTENTRY(drm_intel_bo_gem,
-					      bucket->head.next, head);
-			DRMLISTDEL(&bo_gem->head);
+	while (!DRMLISTEMPTY(&bufmgr_gem->bo_cache)) {
+		bo_gem = bufmgr_gem->bo_cache.next;
+		DRMLISTDEL(bo_gem);
 
-			drm_intel_gem_bo_free(&bo_gem->bo);
-		}
+		drm_intel_gem_bo_free(&bo_gem->bo);
 	}
 
 	free(bufmgr);
@@ -1723,9 +1703,10 @@ drm_intel_gem_bo_flink(drm_intel_bo *bo, uint32_t * name)
 /**
  * Enables unlimited caching of buffer objects for reuse.
  *
- * This is potentially very memory expensive, as the cache at each bucket
- * size is only bounded by how many buffers of that size we've managed to have
- * in flight at once.
+ * This is potentially very memory expensive, as the cache at each bo
+ * size is only bounded by how many buffers of that size we've managed
+ * to have in flight at once.  However, the kernel's memory pressure
+ * handler will throw out BOs if required.
  */
 void
 drm_intel_bufmgr_gem_enable_reuse(drm_intel_bufmgr *bufmgr)
@@ -1985,8 +1966,7 @@ drm_intel_bufmgr_gem_init(int fd, int batch_size)
 	drm_intel_bufmgr_gem *bufmgr_gem;
 	struct drm_i915_gem_get_aperture aperture;
 	drm_i915_getparam_t gp;
-	int ret, i;
-	unsigned long size;
+	int ret;
 	int exec2 = 0, has_bsd = 0;
 
 	bufmgr_gem = calloc(1, sizeof(*bufmgr_gem));
@@ -2112,11 +2092,7 @@ drm_intel_bufmgr_gem_init(int fd, int batch_size)
 	    drm_intel_gem_get_pipe_from_crtc_id;
 	bufmgr_gem->bufmgr.bo_references = drm_intel_gem_bo_references;
 
-	/* Initialize the linked lists for BO reuse cache. */
-	for (i = 0, size = 4096; i < DRM_INTEL_GEM_BO_BUCKETS; i++, size *= 2) {
-		DRMINITLISTHEAD(&bufmgr_gem->cache_bucket[i].head);
-		bufmgr_gem->cache_bucket[i].size = size;
-	}
+	DRMINITLISTHEAD(&bufmgr_gem->bo_cache);
 
 	return &bufmgr_gem->bufmgr;
 }
-- 
1.7.1