[igt-dev] [PATCH i-g-t v8 04/16] lib/rendercopy: remove libdrm dependency

Zbigniew Kempczyński zbigniew.kempczynski at intel.com
Mon Jul 27 11:44:00 UTC 2020


Use intel_bb as main batch implementation to remove libdrm dependency.
Rewrite all pipelines to use intel_bb and update render|vebox_copy
function prototypes.

Signed-off-by: Zbigniew Kempczyński <zbigniew.kempczynski at intel.com>
Cc: Dominik Grzegorzek <dominik.grzegorzek at intel.com>
Cc: Chris Wilson <chris at chris-wilson.co.uk>
---
 lib/intel_aux_pgtable.c |  325 +++++------
 lib/intel_aux_pgtable.h |   29 +-
 lib/intel_batchbuffer.h |   42 +-
 lib/rendercopy.h        |  102 ++--
 lib/rendercopy_gen4.c   |  568 ++++++++++---------
 lib/rendercopy_gen6.c   |  590 ++++++++++----------
 lib/rendercopy_gen7.c   |  612 ++++++++++-----------
 lib/rendercopy_gen8.c   | 1030 +++++++++++++++--------------------
 lib/rendercopy_gen9.c   | 1145 +++++++++++++++++----------------------
 lib/rendercopy_i830.c   |  278 +++++-----
 lib/rendercopy_i915.c   |  281 +++++-----
 lib/veboxcopy.h         |    8 +-
 lib/veboxcopy_gen12.c   |  117 ++--
 13 files changed, 2423 insertions(+), 2704 deletions(-)

diff --git a/lib/intel_aux_pgtable.c b/lib/intel_aux_pgtable.c
index db5055c8..b43a366b 100644
--- a/lib/intel_aux_pgtable.c
+++ b/lib/intel_aux_pgtable.c
@@ -4,7 +4,7 @@
 #include "drmtest.h"
 #include "intel_aux_pgtable.h"
 #include "intel_batchbuffer.h"
-#include "intel_bufmgr.h"
+#include "intel_bufops.h"
 #include "ioctl_wrappers.h"
 
 #include "i915/gem_mman.h"
@@ -31,8 +31,6 @@
 
 #define GFX_ADDRESS_BITS	48
 
-#define max(a, b)		((a) > (b) ? (a) : (b))
-
 #define AUX_FORMAT_YCRCB	0x03
 #define AUX_FORMAT_P010		0x07
 #define AUX_FORMAT_P016		0x08
@@ -58,10 +56,12 @@ struct pgtable {
 	struct pgtable_level_info *level_info;
 	int size;
 	int max_align;
-	drm_intel_bo *bo;
+	struct intel_bb *ibb;
+	struct intel_buf *buf;
+	void *ptr;
 };
 
-static uint64_t last_buf_surface_end(const struct igt_buf *buf)
+static uint64_t last_buf_surface_end(struct intel_buf *buf)
 {
 	uint64_t end_offset = 0;
 	int num_surfaces = buf->format_is_yuv_semiplanar ? 2 : 1;
@@ -79,7 +79,7 @@ static uint64_t last_buf_surface_end(const struct igt_buf *buf)
 }
 
 static int
-pgt_table_count(int address_bits, const struct igt_buf **bufs, int buf_count)
+pgt_table_count(int address_bits, struct intel_buf **bufs, int buf_count)
 {
 	uint64_t end;
 	int count;
@@ -88,19 +88,19 @@ pgt_table_count(int address_bits, const struct igt_buf **bufs, int buf_count)
 	count = 0;
 	end = 0;
 	for (i = 0; i < buf_count; i++) {
-		const struct igt_buf *buf = bufs[i];
+		struct intel_buf *buf = bufs[i];
 		uint64_t start;
 
 		/* We require bufs to be sorted. */
 		igt_assert(i == 0 ||
-			   buf->bo->offset64 >= bufs[i - 1]->bo->offset64 +
-						bufs[i - 1]->bo->size);
+			   buf->addr.offset >= bufs[i - 1]->addr.offset +
+					       intel_buf_bo_size(bufs[i - 1]));
 
-		start = ALIGN_DOWN(buf->bo->offset64, 1UL << address_bits);
+		start = ALIGN_DOWN(buf->addr.offset, 1UL << address_bits);
 		/* Avoid double counting for overlapping aligned bufs. */
 		start = max(start, end);
 
-		end = ALIGN(buf->bo->offset64 + last_buf_surface_end(buf),
+		end = ALIGN(buf->addr.offset + last_buf_surface_end(buf),
 			    1UL << address_bits);
 		igt_assert(end >= start);
 
@@ -111,7 +111,7 @@ pgt_table_count(int address_bits, const struct igt_buf **bufs, int buf_count)
 }
 
 static void
-pgt_calc_size(struct pgtable *pgt, const struct igt_buf **bufs, int buf_count)
+pgt_calc_size(struct pgtable *pgt, struct intel_buf **bufs, int buf_count)
 {
 	int level;
 
@@ -171,28 +171,33 @@ pgt_get_child_table(struct pgtable *pgt, uint64_t parent_table,
 	uint64_t *child_entry_ptr;
 	uint64_t child_table;
 
-	parent_table_ptr = pgt->bo->virtual + parent_table;
+	parent_table_ptr = pgt->ptr + parent_table;
 	child_entry_idx = pgt_entry_index(pgt, level, address);
 	child_entry_ptr = &parent_table_ptr[child_entry_idx];
 
 	if (!*child_entry_ptr) {
 		uint64_t pte;
+		uint32_t offset;
 
 		child_table = pgt_alloc_table(pgt, level - 1);
-		igt_assert(!((child_table + pgt->bo->offset64) &
+		igt_assert(!((child_table + pgt->buf->addr.offset) &
 			     ~ptr_mask(pgt, level)));
 
 		pte = child_table | flags;
-		*child_entry_ptr = pgt->bo->offset64 + pte;
+		*child_entry_ptr = pgt->buf->addr.offset + pte;
 
 		igt_assert(pte <= INT32_MAX);
-		drm_intel_bo_emit_reloc(pgt->bo,
-					parent_table +
-						child_entry_idx * sizeof(uint64_t),
-					pgt->bo, pte, 0, 0);
+
+		offset = parent_table + child_entry_idx * sizeof(uint64_t);
+		intel_bb_offset_reloc_to_object(pgt->ibb,
+						pgt->buf->handle,
+						pgt->buf->handle,
+						0, 0,
+						pte, offset,
+						pgt->buf->addr.offset);
 	} else {
 		child_table = (*child_entry_ptr & ptr_mask(pgt, level)) -
-			      pgt->bo->offset64;
+			      pgt->buf->addr.offset;
 	}
 
 	return child_table;
@@ -205,7 +210,7 @@ pgt_set_l1_entry(struct pgtable *pgt, uint64_t l1_table,
 	uint64_t *l1_table_ptr;
 	uint64_t *l1_entry_ptr;
 
-	l1_table_ptr = pgt->bo->virtual + l1_table;
+	l1_table_ptr = pgt->ptr + l1_table;
 	l1_entry_ptr = &l1_table_ptr[pgt_entry_index(pgt, 0, address)];
 
 	igt_assert(!(ptr & ~ptr_mask(pgt, 0)));
@@ -234,7 +239,7 @@ static int bpp_to_depth_val(int bpp)
 	}
 }
 
-static uint64_t pgt_get_l1_flags(const struct igt_buf *buf, int surface_idx)
+static uint64_t pgt_get_l1_flags(const struct intel_buf *buf, int surface_idx)
 {
 	/*
 	 * The offset of .tile_mode isn't specifed by bspec, it's what Mesa
@@ -337,15 +342,15 @@ static uint64_t pgt_get_lx_flags(void)
 
 static void
 pgt_populate_entries_for_buf(struct pgtable *pgt,
-			       const struct igt_buf *buf,
-			       uint64_t top_table,
-			       int surface_idx)
+			     struct intel_buf *buf,
+			     uint64_t top_table,
+			     int surface_idx)
 {
-	uint64_t surface_addr = buf->bo->offset64 +
+	uint64_t surface_addr = buf->addr.offset +
 				buf->surface[surface_idx].offset;
 	uint64_t surface_end = surface_addr +
 			       buf->surface[surface_idx].size;
-	uint64_t aux_addr = buf->bo->offset64 + buf->ccs[surface_idx].offset;
+	uint64_t aux_addr = buf->addr.offset + buf->ccs[surface_idx].offset;
 	uint64_t l1_flags = pgt_get_l1_flags(buf, surface_idx);
 	uint64_t lx_flags = pgt_get_lx_flags();
 
@@ -367,19 +372,24 @@ pgt_populate_entries_for_buf(struct pgtable *pgt,
 	}
 }
 
+static void pgt_map(int i915, struct pgtable *pgt)
+{
+	pgt->ptr = gem_mmap__device_coherent(i915, pgt->buf->handle, 0,
+					     pgt->size, PROT_READ | PROT_WRITE);
+}
+
+static void pgt_unmap(struct pgtable *pgt)
+{
+	munmap(pgt->ptr, pgt->size);
+}
+
 static void pgt_populate_entries(struct pgtable *pgt,
-				 const struct igt_buf **bufs,
-				 int buf_count,
-				 drm_intel_bo *pgt_bo)
+				 struct intel_buf **bufs,
+				 int buf_count)
 {
 	uint64_t top_table;
 	int i;
 
-	pgt->bo = pgt_bo;
-
-	igt_assert(pgt_bo->size >= pgt->size);
-	memset(pgt_bo->virtual, 0, pgt->size);
-
 	top_table = pgt_alloc_table(pgt, pgt->levels - 1);
 	/* Top level table must be at offset 0. */
 	igt_assert(top_table == 0);
@@ -395,7 +405,7 @@ static void pgt_populate_entries(struct pgtable *pgt,
 
 static struct pgtable *
 pgt_create(const struct pgtable_level_desc *level_descs, int levels,
-	   const struct igt_buf **bufs, int buf_count)
+	   struct intel_buf **bufs, int buf_count)
 {
 	struct pgtable *pgt;
 	int level;
@@ -427,10 +437,11 @@ static void pgt_destroy(struct pgtable *pgt)
 	free(pgt);
 }
 
-drm_intel_bo *
-intel_aux_pgtable_create(drm_intel_bufmgr *bufmgr,
-		       const struct igt_buf **bufs, int buf_count)
+struct intel_buf *
+intel_aux_pgtable_create(struct intel_bb *ibb,
+			 struct intel_buf **bufs, int buf_count)
 {
+	struct drm_i915_gem_exec_object2 *obj;
 	static const struct pgtable_level_desc level_desc[] = {
 		{
 			.idx_shift = 16,
@@ -452,99 +463,43 @@ intel_aux_pgtable_create(drm_intel_bufmgr *bufmgr,
 		},
 	};
 	struct pgtable *pgt;
-	drm_intel_bo *pgt_bo;
-
-	pgt = pgt_create(level_desc, ARRAY_SIZE(level_desc), bufs, buf_count);
+	struct buf_ops *bops;
+	struct intel_buf *buf;
+	uint64_t prev_alignment;
 
-	pgt_bo = drm_intel_bo_alloc_for_render(bufmgr, "aux pgt",
-					       pgt->size, pgt->max_align);
-	igt_assert(pgt_bo);
-
-	igt_assert(drm_intel_bo_map(pgt_bo, true) == 0);
-	pgt_populate_entries(pgt, bufs, buf_count, pgt_bo);
-	igt_assert(drm_intel_bo_unmap(pgt_bo) == 0);
+	igt_assert(buf_count);
+	bops = bufs[0]->bops;
 
+	pgt = pgt_create(level_desc, ARRAY_SIZE(level_desc), bufs, buf_count);
+	pgt->ibb = ibb;
+	pgt->buf = intel_buf_create(bops, pgt->size, 1, 8, 0, I915_TILING_NONE,
+				    I915_COMPRESSION_NONE);
+
+	/* We need to use pgt->max_align for aux table */
+	prev_alignment = intel_bb_set_default_object_alignment(ibb,
+							       pgt->max_align);
+	obj = intel_bb_add_intel_buf(ibb, pgt->buf, false);
+	intel_bb_set_default_object_alignment(ibb, prev_alignment);
+	obj->alignment = pgt->max_align;
+
+	pgt_map(ibb->i915, pgt);
+	pgt_populate_entries(pgt, bufs, buf_count);
+	pgt_unmap(pgt);
+
+	buf = pgt->buf;
 	pgt_destroy(pgt);
 
-	return pgt_bo;
-}
-
-static void
-aux_pgtable_find_max_free_range(const struct igt_buf **bufs, int buf_count,
-				uint64_t *range_start, uint64_t *range_size)
-{
-	/*
-	 * Keep the first page reserved, so we can differentiate pinned
-	 * objects based on a non-NULL offset.
-	 */
-	uint64_t start = 0x1000;
-	/* For now alloc only from the first 4GB address space. */
-	const uint64_t end = 1ULL << 32;
-	uint64_t max_range_start = 0;
-	uint64_t max_range_size = 0;
-	int i;
-
-	for (i = 0; i < buf_count; i++) {
-		if (bufs[i]->bo->offset64 >= end)
-			break;
-
-		if (bufs[i]->bo->offset64 - start > max_range_size) {
-			max_range_start = start;
-			max_range_size = bufs[i]->bo->offset64 - start;
-		}
-		start = bufs[i]->bo->offset64 + bufs[i]->bo->size;
-	}
-
-	if (start < end && end - start > max_range_size) {
-		max_range_start = start;
-		max_range_size = end - start;
-	}
-
-	*range_start = max_range_start;
-	*range_size = max_range_size;
-}
-
-static uint64_t
-aux_pgtable_find_free_range(const struct igt_buf **bufs, int buf_count,
-			    uint32_t size)
-{
-	uint64_t range_start;
-	uint64_t range_size;
-	/* A compressed surface must be 64kB aligned. */
-	const uint32_t align = 0x10000;
-	int pad;
-
-	aux_pgtable_find_max_free_range(bufs, buf_count,
-					&range_start, &range_size);
-
-	pad = ALIGN(range_start, align) - range_start;
-	range_start += pad;
-	range_size -= pad;
-	igt_assert(range_size >= size);
-
-	return range_start +
-	       ALIGN_DOWN(rand() % ((range_size - size) + 1), align);
+	return buf;
 }
 
 static void
-aux_pgtable_reserve_range(const struct igt_buf **bufs, int buf_count,
-			  const struct igt_buf *new_buf)
+aux_pgtable_reserve_buf_slot(struct intel_buf **bufs, int buf_count,
+			     struct intel_buf *new_buf)
 {
 	int i;
 
-	if (igt_buf_compressed(new_buf)) {
-		uint64_t pin_offset = new_buf->bo->offset64;
-
-		if (!pin_offset)
-			pin_offset = aux_pgtable_find_free_range(bufs,
-								 buf_count,
-								 new_buf->bo->size);
-		drm_intel_bo_set_softpin_offset(new_buf->bo, pin_offset);
-		igt_assert(new_buf->bo->offset64 == pin_offset);
-	}
-
 	for (i = 0; i < buf_count; i++)
-		if (bufs[i]->bo->offset64 > new_buf->bo->offset64)
+		if (bufs[i]->addr.offset > new_buf->addr.offset)
 			break;
 
 	memmove(&bufs[i + 1], &bufs[i], sizeof(bufs[0]) * (buf_count - i));
@@ -554,107 +509,115 @@ aux_pgtable_reserve_range(const struct igt_buf **bufs, int buf_count,
 
 void
 gen12_aux_pgtable_init(struct aux_pgtable_info *info,
-		       drm_intel_bufmgr *bufmgr,
-		       const struct igt_buf *src_buf,
-		       const struct igt_buf *dst_buf)
+		       struct intel_bb *ibb,
+		       struct intel_buf *src_buf,
+		       struct intel_buf *dst_buf)
 {
-	const struct igt_buf *bufs[2];
-	const struct igt_buf *reserved_bufs[2];
+	struct intel_buf *bufs[2];
+	struct intel_buf *reserved_bufs[2];
 	int reserved_buf_count;
 	int i;
 
-	if (!igt_buf_compressed(src_buf) && !igt_buf_compressed(dst_buf))
+	igt_assert_f(ibb->enforce_relocs == false,
+		     "We support aux pgtables for non-forced relocs yet!");
+
+	if (!intel_buf_compressed(src_buf) && !intel_buf_compressed(dst_buf))
 		return;
 
 	bufs[0] = src_buf;
 	bufs[1] = dst_buf;
 
 	/*
-	 * Ideally we'd need an IGT-wide GFX address space allocator, which
-	 * would consider all allocations and thus avoid evictions. For now use
-	 * a simpler scheme here, which only considers the buffers involved in
-	 * the blit, which should at least minimize the chance for evictions
-	 * in the case of subsequent blits:
-	 *   1. If they were already bound (bo->offset64 != 0), use this
-	 *      address.
-	 *   2. Pick a range randomly from the 4GB address space, that is not
-	 *      already occupied by a bound object, or an object we pinned.
+	 * Surface index in pgt table depend on its address so:
+	 *   1. if handle was previously executed in batch use that address
+	 *   2. add object to batch, this will generate random address
+	 *
+	 * Randomizing addresses can lead to overlapping, but we don't have
+	 * global address space generator in IGT. Currently assumption is
+	 * randomizing address is spread over 48-bit address space equally
+	 * so risk with overlapping is minimal. Of course it is growing
+	 * with number of objects (+its sizes) involved in blit.
+	 * To avoid relocation EXEC_OBJECT_PINNED flag is set for compressed
+	 * surfaces.
 	 */
+
+	intel_bb_add_intel_buf(ibb, src_buf, false);
+	if (intel_buf_compressed(src_buf))
+		intel_bb_object_set_flag(ibb, src_buf->handle, EXEC_OBJECT_PINNED);
+
+	intel_bb_add_intel_buf(ibb, dst_buf, true);
+	if (intel_buf_compressed(dst_buf))
+		intel_bb_object_set_flag(ibb, dst_buf->handle, EXEC_OBJECT_PINNED);
+
 	reserved_buf_count = 0;
 	/* First reserve space for any bufs that are bound already. */
-	for (i = 0; i < ARRAY_SIZE(bufs); i++)
-		if (bufs[i]->bo->offset64)
-			aux_pgtable_reserve_range(reserved_bufs,
-						  reserved_buf_count++,
-						  bufs[i]);
-
-	/* Next, reserve space for unbound bufs with an AUX surface. */
-	for (i = 0; i < ARRAY_SIZE(bufs); i++)
-		if (!bufs[i]->bo->offset64 && igt_buf_compressed(bufs[i]))
-			aux_pgtable_reserve_range(reserved_bufs,
-						  reserved_buf_count++,
-						  bufs[i]);
+	for (i = 0; i < ARRAY_SIZE(bufs); i++) {
+		igt_assert(bufs[i]->addr.offset != INTEL_BUF_INVALID_ADDRESS);
+		aux_pgtable_reserve_buf_slot(reserved_bufs,
+					     reserved_buf_count++,
+					     bufs[i]);
+	}
 
 	/* Create AUX pgtable entries only for bufs with an AUX surface */
 	info->buf_count = 0;
 	for (i = 0; i < reserved_buf_count; i++) {
-		if (!igt_buf_compressed(reserved_bufs[i]))
+		if (!intel_buf_compressed(reserved_bufs[i]))
 			continue;
 
 		info->bufs[info->buf_count] = reserved_bufs[i];
 		info->buf_pin_offsets[info->buf_count] =
-			reserved_bufs[i]->bo->offset64;
+			reserved_bufs[i]->addr.offset;
+
 		info->buf_count++;
 	}
 
-	info->pgtable_bo = intel_aux_pgtable_create(bufmgr,
-						    info->bufs,
-						    info->buf_count);
-	igt_assert(info->pgtable_bo);
+	info->pgtable_buf = intel_aux_pgtable_create(ibb,
+						     info->bufs,
+						     info->buf_count);
+
+	igt_assert(info->pgtable_buf);
 }
 
 void
-gen12_aux_pgtable_cleanup(struct aux_pgtable_info *info)
+gen12_aux_pgtable_cleanup(struct intel_bb *ibb, struct aux_pgtable_info *info)
 {
 	int i;
 
 	/* Check that the pinned bufs kept their offset after the exec. */
-	for (i = 0; i < info->buf_count; i++)
-		igt_assert_eq_u64(info->bufs[i]->bo->offset64,
-				  info->buf_pin_offsets[i]);
+	for (i = 0; i < info->buf_count; i++) {
+		uint64_t addr;
+
+		addr = intel_bb_get_object_offset(ibb, info->bufs[i]->handle);
+		igt_assert_eq_u64(addr, info->buf_pin_offsets[i]);
+	}
 
-	drm_intel_bo_unreference(info->pgtable_bo);
+	if (info->pgtable_buf)
+		intel_buf_destroy(info->pgtable_buf);
 }
 
 uint32_t
-gen12_create_aux_pgtable_state(struct intel_batchbuffer *batch,
-			       drm_intel_bo *aux_pgtable_bo)
+gen12_create_aux_pgtable_state(struct intel_bb *ibb,
+			       struct intel_buf *aux_pgtable_buf)
 {
 	uint64_t *pgtable_ptr;
 	uint32_t pgtable_ptr_offset;
-	int ret;
 
-	if (!aux_pgtable_bo)
+	if (!aux_pgtable_buf)
 		return 0;
 
-	pgtable_ptr = intel_batchbuffer_subdata_alloc(batch,
-						      sizeof(*pgtable_ptr),
-						      sizeof(*pgtable_ptr));
-	pgtable_ptr_offset = intel_batchbuffer_subdata_offset(batch,
-							      pgtable_ptr);
+	pgtable_ptr = intel_bb_ptr(ibb);
+	pgtable_ptr_offset = intel_bb_offset(ibb);
 
-	*pgtable_ptr = aux_pgtable_bo->offset64;
-	ret = drm_intel_bo_emit_reloc(batch->bo, pgtable_ptr_offset,
-				      aux_pgtable_bo, 0,
-				      0, 0);
-	assert(ret == 0);
+	*pgtable_ptr = intel_bb_offset_reloc(ibb, aux_pgtable_buf->handle,
+					     0, 0,
+					     pgtable_ptr_offset, -1);
+	intel_bb_ptr_add(ibb, sizeof(*pgtable_ptr));
 
 	return pgtable_ptr_offset;
 }
 
 void
-gen12_emit_aux_pgtable_state(struct intel_batchbuffer *batch, uint32_t state,
-			     bool render)
+gen12_emit_aux_pgtable_state(struct intel_bb *ibb, uint32_t state, bool render)
 {
 	uint32_t table_base_reg = render ? GEN12_GFX_AUX_TABLE_BASE_ADDR :
 					   GEN12_VEBOX_AUX_TABLE_BASE_ADDR;
@@ -662,11 +625,11 @@ gen12_emit_aux_pgtable_state(struct intel_batchbuffer *batch, uint32_t state,
 	if (!state)
 		return;
 
-	OUT_BATCH(MI_LOAD_REGISTER_MEM_GEN8 | MI_MMIO_REMAP_ENABLE_GEN12);
-	OUT_BATCH(table_base_reg);
-	OUT_RELOC(batch->bo, 0, 0, state);
+	intel_bb_out(ibb, MI_LOAD_REGISTER_MEM_GEN8 | MI_MMIO_REMAP_ENABLE_GEN12);
+	intel_bb_out(ibb, table_base_reg);
+	intel_bb_emit_reloc(ibb, ibb->handle, 0, 0, state, ibb->batch_offset);
 
-	OUT_BATCH(MI_LOAD_REGISTER_MEM_GEN8 | MI_MMIO_REMAP_ENABLE_GEN12);
-	OUT_BATCH(table_base_reg + 4);
-	OUT_RELOC(batch->bo, 0, 0, state + 4);
+	intel_bb_out(ibb, MI_LOAD_REGISTER_MEM_GEN8 | MI_MMIO_REMAP_ENABLE_GEN12);
+	intel_bb_out(ibb, table_base_reg + 4);
+	intel_bb_emit_reloc(ibb, ibb->handle, 0, 0, state + 4, ibb->batch_offset);
 }
diff --git a/lib/intel_aux_pgtable.h b/lib/intel_aux_pgtable.h
index ac82b7d2..e9976e52 100644
--- a/lib/intel_aux_pgtable.h
+++ b/lib/intel_aux_pgtable.h
@@ -1,36 +1,33 @@
 #ifndef __INTEL_AUX_PGTABLE_H__
 #define __INTEL_AUX_PGTABLE_H__
 
-#include "intel_bufmgr.h"
-
-struct igt_buf;
-struct intel_batchbuffer;
+#include "intel_bufops.h"
 
 struct aux_pgtable_info {
 	int buf_count;
-	const struct igt_buf *bufs[2];
+	struct intel_buf *bufs[2];
 	uint64_t buf_pin_offsets[2];
-	drm_intel_bo *pgtable_bo;
+	struct intel_buf *pgtable_buf;
 };
 
-drm_intel_bo *
-intel_aux_pgtable_create(drm_intel_bufmgr *bufmgr,
-			 const struct igt_buf **bufs, int buf_count);
+struct intel_buf *
+intel_aux_pgtable_create(struct intel_bb *ibb,
+			 struct intel_buf **bufs, int buf_count);
 
 void
 gen12_aux_pgtable_init(struct aux_pgtable_info *info,
-		       drm_intel_bufmgr *bufmgr,
-		       const struct igt_buf *src_buf,
-		       const struct igt_buf *dst_buf);
+		       struct intel_bb *ibb,
+		       struct intel_buf *src_buf,
+		       struct intel_buf *dst_buf);
 
 void
-gen12_aux_pgtable_cleanup(struct aux_pgtable_info *info);
+gen12_aux_pgtable_cleanup(struct intel_bb *ibb, struct aux_pgtable_info *info);
 
 uint32_t
-gen12_create_aux_pgtable_state(struct intel_batchbuffer *batch,
-			       drm_intel_bo *aux_pgtable_bo);
+gen12_create_aux_pgtable_state(struct intel_bb *batch,
+			       struct intel_buf *aux_pgtable_buf);
 void
-gen12_emit_aux_pgtable_state(struct intel_batchbuffer *batch, uint32_t state,
+gen12_emit_aux_pgtable_state(struct intel_bb *batch, uint32_t state,
 			     bool render);
 
 #endif
diff --git a/lib/intel_batchbuffer.h b/lib/intel_batchbuffer.h
index cc8c84b6..477ea423 100644
--- a/lib/intel_batchbuffer.h
+++ b/lib/intel_batchbuffer.h
@@ -318,14 +318,14 @@ void igt_blitter_fast_copy__raw(int fd,
 
 /**
  * igt_render_copyfunc_t:
- * @batch: batchbuffer object
- * @context: libdrm hardware context to use
- * @src: source i-g-t buffer object
+ * @ibb: batchbuffer
+ * @ctx: context to use
+ * @src: intel_buf source object
  * @src_x: source pixel x-coordination
  * @src_y: source pixel y-coordination
  * @width: width of the copied rectangle
  * @height: height of the copied rectangle
- * @dst: destination i-g-t buffer object
+ * @dst: intel_buf destination object
  * @dst_x: destination pixel x-coordination
  * @dst_y: destination pixel y-coordination
  *
@@ -334,25 +334,30 @@ void igt_blitter_fast_copy__raw(int fd,
  * igt_get_render_copyfunc().
  *
  * A render copy function will emit a batchbuffer to the kernel which executes
- * the specified blit copy operation using the render engine. @context is
- * optional and can be NULL.
+ * the specified blit copy operation using the render engine. @ctx is
+ * optional and can be 0.
  */
-typedef void (*igt_render_copyfunc_t)(struct intel_batchbuffer *batch,
-				      drm_intel_context *context,
-				      const struct igt_buf *src, unsigned src_x, unsigned src_y,
-				      unsigned width, unsigned height,
-				      const struct igt_buf *dst, unsigned dst_x, unsigned dst_y);
+struct intel_bb;
+struct intel_buf;
+
+typedef void (*igt_render_copyfunc_t)(struct intel_bb *ibb,
+				      uint32_t ctx,
+				      struct intel_buf *src,
+				      uint32_t src_x, uint32_t src_y,
+				      uint32_t width, uint32_t height,
+				      struct intel_buf *dst,
+				      uint32_t dst_x, uint32_t dst_y);
 
 igt_render_copyfunc_t igt_get_render_copyfunc(int devid);
 
 
 /**
  * igt_vebox_copyfunc_t:
- * @batch: batchbuffer object
- * @src: source i-g-t buffer object
+ * @ibb: batchbuffer
+ * @src: intel_buf source object
  * @width: width of the copied rectangle
  * @height: height of the copied rectangle
- * @dst: destination i-g-t buffer object
+ * @dst: intel_buf destination object
  *
  * This is the type of the per-platform vebox copy functions. The
  * platform-specific implementation can be obtained by calling
@@ -361,10 +366,10 @@ igt_render_copyfunc_t igt_get_render_copyfunc(int devid);
  * A vebox copy function will emit a batchbuffer to the kernel which executes
  * the specified blit copy operation using the vebox engine.
  */
-typedef void (*igt_vebox_copyfunc_t)(struct intel_batchbuffer *batch,
-				     const struct igt_buf *src,
-				     unsigned width, unsigned height,
-				     const struct igt_buf *dst);
+typedef void (*igt_vebox_copyfunc_t)(struct intel_bb *ibb,
+				     struct intel_buf *src,
+				     unsigned int width, unsigned int height,
+				     struct intel_buf *dst);
 
 igt_vebox_copyfunc_t igt_get_vebox_copyfunc(int devid);
 
@@ -385,7 +390,6 @@ igt_vebox_copyfunc_t igt_get_vebox_copyfunc(int devid);
  * A fill function will emit a batchbuffer to the kernel which executes
  * the specified blit fill operation using the media/gpgpu engine.
  */
-struct intel_buf;
 typedef void (*igt_fillfunc_t)(int i915,
 			       struct intel_buf *buf,
 			       unsigned x, unsigned y,
diff --git a/lib/rendercopy.h b/lib/rendercopy.h
index e0577cac..8dd867bc 100644
--- a/lib/rendercopy.h
+++ b/lib/rendercopy.h
@@ -1,70 +1,70 @@
 #include "intel_batchbuffer.h"
 
 
-static inline void emit_vertex_2s(struct intel_batchbuffer *batch,
+static inline void emit_vertex_2s(struct intel_bb *ibb,
 				  int16_t x, int16_t y)
 {
-	OUT_BATCH((uint16_t)y << 16 | (uint16_t)x);
+	intel_bb_out(ibb, ((uint16_t)y << 16 | (uint16_t)x));
 }
 
-static inline void emit_vertex(struct intel_batchbuffer *batch,
+static inline void emit_vertex(struct intel_bb *ibb,
 			       float f)
 {
 	union { float f; uint32_t ui; } u;
 	u.f = f;
-	OUT_BATCH(u.ui);
+	intel_bb_out(ibb, u.ui);
 }
 
-static inline void emit_vertex_normalized(struct intel_batchbuffer *batch,
+static inline void emit_vertex_normalized(struct intel_bb *ibb,
 					  float f, float total)
 {
 	union { float f; uint32_t ui; } u;
 	u.f = f / total;
-	OUT_BATCH(u.ui);
+	intel_bb_out(ibb, u.ui);
 }
 
-void gen12_render_copyfunc(struct intel_batchbuffer *batch,
-			   drm_intel_context *context,
-			   const struct igt_buf *src, unsigned src_x, unsigned src_y,
-			   unsigned width, unsigned height,
-			   const struct igt_buf *dst, unsigned dst_x, unsigned dst_y);
-void gen11_render_copyfunc(struct intel_batchbuffer *batch,
-			  drm_intel_context *context,
-			  const struct igt_buf *src, unsigned src_x, unsigned src_y,
-			  unsigned width, unsigned height,
-			  const struct igt_buf *dst, unsigned dst_x, unsigned dst_y);
-void gen9_render_copyfunc(struct intel_batchbuffer *batch,
-			  drm_intel_context *context,
-			  const struct igt_buf *src, unsigned src_x, unsigned src_y,
-			  unsigned width, unsigned height,
-			  const struct igt_buf *dst, unsigned dst_x, unsigned dst_y);
-void gen8_render_copyfunc(struct intel_batchbuffer *batch,
-			  drm_intel_context *context,
-			  const struct igt_buf *src, unsigned src_x, unsigned src_y,
-			  unsigned width, unsigned height,
-			  const struct igt_buf *dst, unsigned dst_x, unsigned dst_y);
-void gen7_render_copyfunc(struct intel_batchbuffer *batch,
-			  drm_intel_context *context,
-			  const struct igt_buf *src, unsigned src_x, unsigned src_y,
-			  unsigned width, unsigned height,
-			  const struct igt_buf *dst, unsigned dst_x, unsigned dst_y);
-void gen6_render_copyfunc(struct intel_batchbuffer *batch,
-			  drm_intel_context *context,
-			  const struct igt_buf *src, unsigned src_x, unsigned src_y,
-			  unsigned width, unsigned height,
-			  const struct igt_buf *dst, unsigned dst_x, unsigned dst_y);
-void gen4_render_copyfunc(struct intel_batchbuffer *batch,
-			  drm_intel_context *context,
-			  const struct igt_buf *src, unsigned src_x, unsigned src_y,
-			  unsigned width, unsigned height,
-			  const struct igt_buf *dst, unsigned dst_x, unsigned dst_y);
-void gen3_render_copyfunc(struct intel_batchbuffer *batch,
-			  drm_intel_context *context,
-			  const struct igt_buf *src, unsigned src_x, unsigned src_y,
-			  unsigned width, unsigned height,
-			  const struct igt_buf *dst, unsigned dst_x, unsigned dst_y);
-void gen2_render_copyfunc(struct intel_batchbuffer *batch,
-			  drm_intel_context *context,
-			  const struct igt_buf *src, unsigned src_x, unsigned src_y,
-			  unsigned width, unsigned height,
-			  const struct igt_buf *dst, unsigned dst_x, unsigned dst_y);
+void gen12_render_copyfunc(struct intel_bb *ibb,
+			   uint32_t ctx,
+			   struct intel_buf *src, uint32_t src_x, uint32_t src_y,
+			   uint32_t width, uint32_t height,
+			   struct intel_buf *dst, uint32_t dst_x, uint32_t dst_y);
+void gen11_render_copyfunc(struct intel_bb *ibb,
+			  uint32_t ctx,
+			  struct intel_buf *src, uint32_t src_x, uint32_t src_y,
+			  uint32_t width, uint32_t height,
+			  struct intel_buf *dst, uint32_t dst_x, uint32_t dst_y);
+void gen9_render_copyfunc(struct intel_bb *ibb,
+			  uint32_t ctx,
+			  struct intel_buf *src, uint32_t src_x, uint32_t src_y,
+			  uint32_t width, uint32_t height,
+			  struct intel_buf *dst, uint32_t dst_x, uint32_t dst_y);
+void gen8_render_copyfunc(struct intel_bb *ibb,
+			  uint32_t ctx,
+			  struct intel_buf *src, uint32_t src_x, uint32_t src_y,
+			  uint32_t width, uint32_t height,
+			  struct intel_buf *dst, uint32_t dst_x, uint32_t dst_y);
+void gen7_render_copyfunc(struct intel_bb *ibb,
+			  uint32_t ctx,
+			  struct intel_buf *src, uint32_t src_x, uint32_t src_y,
+			  uint32_t width, uint32_t height,
+			  struct intel_buf *dst, uint32_t dst_x, uint32_t dst_y);
+void gen6_render_copyfunc(struct intel_bb *ibb,
+			  uint32_t ctx,
+			  struct intel_buf *src, uint32_t src_x, uint32_t src_y,
+			  uint32_t width, uint32_t height,
+			  struct intel_buf *dst, uint32_t dst_x, uint32_t dst_y);
+void gen4_render_copyfunc(struct intel_bb *ibb,
+			  uint32_t ctx,
+			  struct intel_buf *src, uint32_t src_x, uint32_t src_y,
+			  uint32_t width, uint32_t height,
+			  struct intel_buf *dst, uint32_t dst_x, uint32_t dst_y);
+void gen3_render_copyfunc(struct intel_bb *ibb,
+			  uint32_t ctx,
+			  struct intel_buf *src, uint32_t src_x, uint32_t src_y,
+			  uint32_t width, uint32_t height,
+			  struct intel_buf *dst, uint32_t dst_x, uint32_t dst_y);
+void gen2_render_copyfunc(struct intel_bb *ibb,
+			  uint32_t ctx,
+			  struct intel_buf *src, uint32_t src_x, uint32_t src_y,
+			  uint32_t width, uint32_t height,
+			  struct intel_buf *dst, uint32_t dst_x, uint32_t dst_y);
diff --git a/lib/rendercopy_gen4.c b/lib/rendercopy_gen4.c
index 413e3357..027ab571 100644
--- a/lib/rendercopy_gen4.c
+++ b/lib/rendercopy_gen4.c
@@ -2,6 +2,7 @@
 #include "intel_chipset.h"
 #include "gen4_render.h"
 #include "surfaceformat.h"
+#include "intel_bufops.h"
 
 #include <assert.h>
 
@@ -80,18 +81,13 @@ static const uint32_t gen5_ps_kernel_nomask_affine[][4] = {
 };
 
 static uint32_t
-batch_used(struct intel_batchbuffer *batch)
+batch_round_upto(struct intel_bb *ibb, uint32_t divisor)
 {
-	return batch->ptr - batch->buffer;
-}
-
-static uint32_t
-batch_round_upto(struct intel_batchbuffer *batch, uint32_t divisor)
-{
-	uint32_t offset = batch_used(batch);
+	uint32_t offset = intel_bb_offset(ibb);
 
 	offset = (offset + divisor - 1) / divisor * divisor;
-	batch->ptr = batch->buffer + offset;
+	intel_bb_ptr_set(ibb, offset);
+
 	return offset;
 }
 
@@ -120,30 +116,16 @@ static int gen4_max_wm_threads(uint32_t devid)
 	return IS_GEN5(devid) ? 72 : IS_G4X(devid) ? 50 : 32;
 }
 
-static void
-gen4_render_flush(struct intel_batchbuffer *batch,
-		  drm_intel_context *context, uint32_t batch_end)
-{
-	igt_assert_eq(drm_intel_bo_subdata(batch->bo,
-					   0, 4096, batch->buffer),
-		      0);
-	igt_assert_eq(drm_intel_gem_bo_context_exec(batch->bo, context,
-						    batch_end, 0),
-		      0);
-}
-
 static uint32_t
-gen4_bind_buf(struct intel_batchbuffer *batch,
-	      const struct igt_buf *buf,
-	      int is_dst)
+gen4_bind_buf(struct intel_bb *ibb, const struct intel_buf *buf, int is_dst)
 {
 	struct gen4_surface_state *ss;
 	uint32_t write_domain, read_domain;
-	int ret;
+	uint64_t address;
 
 	igt_assert_lte(buf->surface[0].stride, 128*1024);
-	igt_assert_lte(igt_buf_width(buf), 8192);
-	igt_assert_lte(igt_buf_height(buf), 8192);
+	igt_assert_lte(intel_buf_width(buf), 8192);
+	igt_assert_lte(intel_buf_height(buf), 8192);
 
 	if (is_dst) {
 		write_domain = read_domain = I915_GEM_DOMAIN_RENDER;
@@ -152,7 +134,7 @@ gen4_bind_buf(struct intel_batchbuffer *batch,
 		read_domain = I915_GEM_DOMAIN_SAMPLER;
 	}
 
-	ss = intel_batchbuffer_subdata_alloc(batch, sizeof(*ss), 32);
+	ss = intel_bb_ptr_align(ibb, 32);
 
 	ss->ss0.surface_type = SURFACE_2D;
 	switch (buf->bpp) {
@@ -165,102 +147,102 @@ gen4_bind_buf(struct intel_batchbuffer *batch,
 
 	ss->ss0.data_return_format = SURFACERETURNFORMAT_FLOAT32;
 	ss->ss0.color_blend = 1;
-	ss->ss1.base_addr = buf->bo->offset;
 
-	ret = drm_intel_bo_emit_reloc(batch->bo,
-				      intel_batchbuffer_subdata_offset(batch, ss) + 4,
-				      buf->bo, 0,
-				      read_domain, write_domain);
-	assert(ret == 0);
+	address = intel_bb_offset_reloc(ibb, buf->handle,
+					read_domain, write_domain,
+					intel_bb_offset(ibb) + 4,
+					buf->addr.offset);
+	ss->ss1.base_addr = (uint32_t) address;
 
-	ss->ss2.height = igt_buf_height(buf) - 1;
-	ss->ss2.width  = igt_buf_width(buf) - 1;
+	ss->ss2.height = intel_buf_height(buf) - 1;
+	ss->ss2.width  = intel_buf_width(buf) - 1;
 	ss->ss3.pitch  = buf->surface[0].stride - 1;
 	ss->ss3.tiled_surface = buf->tiling != I915_TILING_NONE;
 	ss->ss3.tile_walk     = buf->tiling == I915_TILING_Y;
 
-	return intel_batchbuffer_subdata_offset(batch, ss);
+	return intel_bb_ptr_add_return_prev_offset(ibb, sizeof(*ss));
 }
 
 static uint32_t
-gen4_bind_surfaces(struct intel_batchbuffer *batch,
-		   const struct igt_buf *src,
-		   const struct igt_buf *dst)
+gen4_bind_surfaces(struct intel_bb *ibb,
+		   const struct intel_buf *src,
+		   const struct intel_buf *dst)
 {
-	uint32_t *binding_table;
+	uint32_t *binding_table, binding_table_offset;
 
-	binding_table = intel_batchbuffer_subdata_alloc(batch, 32, 32);
+	binding_table = intel_bb_ptr_align(ibb, 32);
+	binding_table_offset = intel_bb_ptr_add_return_prev_offset(ibb, 32);
 
-	binding_table[0] = gen4_bind_buf(batch, dst, 1);
-	binding_table[1] = gen4_bind_buf(batch, src, 0);
+	binding_table[0] = gen4_bind_buf(ibb, dst, 1);
+	binding_table[1] = gen4_bind_buf(ibb, src, 0);
 
-	return intel_batchbuffer_subdata_offset(batch, binding_table);
+	return binding_table_offset;
 }
 
 static void
-gen4_emit_sip(struct intel_batchbuffer *batch)
+gen4_emit_sip(struct intel_bb *ibb)
 {
-	OUT_BATCH(GEN4_STATE_SIP | (2 - 2));
-	OUT_BATCH(0);
+	intel_bb_out(ibb, GEN4_STATE_SIP | (2 - 2));
+	intel_bb_out(ibb, 0);
 }
 
 static void
-gen4_emit_state_base_address(struct intel_batchbuffer *batch)
+gen4_emit_state_base_address(struct intel_bb *ibb)
 {
-	if (IS_GEN5(batch->devid)) {
-		OUT_BATCH(GEN4_STATE_BASE_ADDRESS | (8 - 2));
-		OUT_RELOC(batch->bo, /* general */
-			  I915_GEM_DOMAIN_INSTRUCTION, 0,
-			  BASE_ADDRESS_MODIFY);
-		OUT_RELOC(batch->bo, /* surface */
-			  I915_GEM_DOMAIN_INSTRUCTION, 0,
-			  BASE_ADDRESS_MODIFY);
-		OUT_BATCH(0); /* media */
-		OUT_RELOC(batch->bo, /* instruction */
-			  I915_GEM_DOMAIN_INSTRUCTION, 0,
-			  BASE_ADDRESS_MODIFY);
+	if (IS_GEN5(ibb->devid)) {
+		intel_bb_out(ibb, GEN4_STATE_BASE_ADDRESS | (8 - 2));
+		intel_bb_emit_reloc(ibb, ibb->handle, /* general */
+				    I915_GEM_DOMAIN_INSTRUCTION, 0,
+				    BASE_ADDRESS_MODIFY, ibb->batch_offset);
+		intel_bb_emit_reloc(ibb, ibb->handle, /* surface */
+				    I915_GEM_DOMAIN_INSTRUCTION, 0,
+				    BASE_ADDRESS_MODIFY, ibb->batch_offset);
+		intel_bb_out(ibb, 0); /* media */
+		intel_bb_emit_reloc(ibb, ibb->handle, /* instruction */
+				    I915_GEM_DOMAIN_INSTRUCTION, 0,
+				    BASE_ADDRESS_MODIFY, ibb->batch_offset);
 
 		/* upper bounds, disable */
-		OUT_BATCH(BASE_ADDRESS_MODIFY); /* general */
-		OUT_BATCH(0); /* media */
-		OUT_BATCH(BASE_ADDRESS_MODIFY); /* instruction */
+		intel_bb_out(ibb, BASE_ADDRESS_MODIFY); /* general */
+		intel_bb_out(ibb, 0); /* media */
+		intel_bb_out(ibb, BASE_ADDRESS_MODIFY); /* instruction */
 	} else {
-		OUT_BATCH(GEN4_STATE_BASE_ADDRESS | (6 - 2));
-		OUT_RELOC(batch->bo, /* general */
-			  I915_GEM_DOMAIN_INSTRUCTION, 0,
-			  BASE_ADDRESS_MODIFY);
-		OUT_RELOC(batch->bo, /* surface */
-			  I915_GEM_DOMAIN_INSTRUCTION, 0,
-			  BASE_ADDRESS_MODIFY);
-		OUT_BATCH(0); /* media */
+		intel_bb_out(ibb, GEN4_STATE_BASE_ADDRESS | (6 - 2));
+		intel_bb_emit_reloc(ibb, ibb->handle, /* general */
+				    I915_GEM_DOMAIN_INSTRUCTION, 0,
+				    BASE_ADDRESS_MODIFY, ibb->batch_offset);
+		intel_bb_emit_reloc(ibb, ibb->handle, /* surface */
+				    I915_GEM_DOMAIN_INSTRUCTION, 0,
+				    BASE_ADDRESS_MODIFY, ibb->batch_offset);
+		intel_bb_out(ibb, 0); /* media */
 
 		/* upper bounds, disable */
-		OUT_BATCH(BASE_ADDRESS_MODIFY); /* general */
-		OUT_BATCH(0); /* media */
+		intel_bb_out(ibb, BASE_ADDRESS_MODIFY); /* general */
+		intel_bb_out(ibb, 0); /* media */
 	}
 }
 
 static void
-gen4_emit_pipelined_pointers(struct intel_batchbuffer *batch,
+gen4_emit_pipelined_pointers(struct intel_bb *ibb,
 			     uint32_t vs, uint32_t sf,
 			     uint32_t wm, uint32_t cc)
 {
-	OUT_BATCH(GEN4_3DSTATE_PIPELINED_POINTERS | (7 - 2));
-	OUT_BATCH(vs);
-	OUT_BATCH(GEN4_GS_DISABLE);
-	OUT_BATCH(GEN4_CLIP_DISABLE);
-	OUT_BATCH(sf);
-	OUT_BATCH(wm);
-	OUT_BATCH(cc);
+	intel_bb_out(ibb, GEN4_3DSTATE_PIPELINED_POINTERS | (7 - 2));
+	intel_bb_out(ibb, vs);
+	intel_bb_out(ibb, GEN4_GS_DISABLE);
+	intel_bb_out(ibb, GEN4_CLIP_DISABLE);
+	intel_bb_out(ibb, sf);
+	intel_bb_out(ibb, wm);
+	intel_bb_out(ibb, cc);
 }
 
 static void
-gen4_emit_urb(struct intel_batchbuffer *batch)
+gen4_emit_urb(struct intel_bb *ibb)
 {
-	int vs_entries = gen4_max_vs_nr_urb_entries(batch->devid);
+	int vs_entries = gen4_max_vs_nr_urb_entries(ibb->devid);
 	int gs_entries = 0;
 	int cl_entries = 0;
-	int sf_entries = gen4_max_sf_nr_urb_entries(batch->devid);
+	int sf_entries = gen4_max_sf_nr_urb_entries(ibb->devid);
 	int cs_entries = 0;
 
 	int urb_vs_end =              vs_entries * URB_VS_ENTRY_SIZE;
@@ -269,91 +251,91 @@ gen4_emit_urb(struct intel_batchbuffer *batch)
 	int urb_sf_end = urb_cl_end + sf_entries * URB_SF_ENTRY_SIZE;
 	int urb_cs_end = urb_sf_end + cs_entries * URB_CS_ENTRY_SIZE;
 
-	assert(urb_cs_end <= gen4_urb_size(batch->devid));
-
-	intel_batchbuffer_align(batch, 16);
-
-	OUT_BATCH(GEN4_URB_FENCE |
-		  UF0_CS_REALLOC |
-		  UF0_SF_REALLOC |
-		  UF0_CLIP_REALLOC |
-		  UF0_GS_REALLOC |
-		  UF0_VS_REALLOC |
-		  (3 - 2));
-	OUT_BATCH(urb_cl_end << UF1_CLIP_FENCE_SHIFT |
-		  urb_gs_end << UF1_GS_FENCE_SHIFT |
-		  urb_vs_end << UF1_VS_FENCE_SHIFT);
-	OUT_BATCH(urb_cs_end << UF2_CS_FENCE_SHIFT |
-		  urb_sf_end << UF2_SF_FENCE_SHIFT);
-
-	OUT_BATCH(GEN4_CS_URB_STATE | (2 - 2));
-	OUT_BATCH((URB_CS_ENTRY_SIZE - 1) << 4 | cs_entries << 0);
+	assert(urb_cs_end <= gen4_urb_size(ibb->devid));
+
+	intel_bb_ptr_align(ibb, 16);
+
+	intel_bb_out(ibb, GEN4_URB_FENCE |
+		     UF0_CS_REALLOC |
+		     UF0_SF_REALLOC |
+		     UF0_CLIP_REALLOC |
+		     UF0_GS_REALLOC |
+		     UF0_VS_REALLOC |
+		     (3 - 2));
+	intel_bb_out(ibb, urb_cl_end << UF1_CLIP_FENCE_SHIFT |
+		     urb_gs_end << UF1_GS_FENCE_SHIFT |
+		     urb_vs_end << UF1_VS_FENCE_SHIFT);
+	intel_bb_out(ibb, urb_cs_end << UF2_CS_FENCE_SHIFT |
+		     urb_sf_end << UF2_SF_FENCE_SHIFT);
+
+	intel_bb_out(ibb, GEN4_CS_URB_STATE | (2 - 2));
+	intel_bb_out(ibb, (URB_CS_ENTRY_SIZE - 1) << 4 | cs_entries << 0);
 }
 
 static void
-gen4_emit_null_depth_buffer(struct intel_batchbuffer *batch)
+gen4_emit_null_depth_buffer(struct intel_bb *ibb)
 {
-	if (IS_G4X(batch->devid) || IS_GEN5(batch->devid)) {
-		OUT_BATCH(GEN4_3DSTATE_DEPTH_BUFFER | (6 - 2));
-		OUT_BATCH(SURFACE_NULL << GEN4_3DSTATE_DEPTH_BUFFER_TYPE_SHIFT |
-			  GEN4_DEPTHFORMAT_D32_FLOAT << GEN4_3DSTATE_DEPTH_BUFFER_FORMAT_SHIFT);
-		OUT_BATCH(0);
-		OUT_BATCH(0);
-		OUT_BATCH(0);
-		OUT_BATCH(0);
+	if (IS_G4X(ibb->devid) || IS_GEN5(ibb->devid)) {
+		intel_bb_out(ibb, GEN4_3DSTATE_DEPTH_BUFFER | (6 - 2));
+		intel_bb_out(ibb, SURFACE_NULL << GEN4_3DSTATE_DEPTH_BUFFER_TYPE_SHIFT |
+			     GEN4_DEPTHFORMAT_D32_FLOAT << GEN4_3DSTATE_DEPTH_BUFFER_FORMAT_SHIFT);
+		intel_bb_out(ibb, 0);
+		intel_bb_out(ibb, 0);
+		intel_bb_out(ibb, 0);
+		intel_bb_out(ibb, 0);
 	} else {
-		OUT_BATCH(GEN4_3DSTATE_DEPTH_BUFFER | (5 - 2));
-		OUT_BATCH(SURFACE_NULL << GEN4_3DSTATE_DEPTH_BUFFER_TYPE_SHIFT |
-			  GEN4_DEPTHFORMAT_D32_FLOAT << GEN4_3DSTATE_DEPTH_BUFFER_FORMAT_SHIFT);
-		OUT_BATCH(0);
-		OUT_BATCH(0);
-		OUT_BATCH(0);
+		intel_bb_out(ibb, GEN4_3DSTATE_DEPTH_BUFFER | (5 - 2));
+		intel_bb_out(ibb, SURFACE_NULL << GEN4_3DSTATE_DEPTH_BUFFER_TYPE_SHIFT |
+			     GEN4_DEPTHFORMAT_D32_FLOAT << GEN4_3DSTATE_DEPTH_BUFFER_FORMAT_SHIFT);
+		intel_bb_out(ibb, 0);
+		intel_bb_out(ibb, 0);
+		intel_bb_out(ibb, 0);
 	}
 
-	if (IS_GEN5(batch->devid)) {
-		OUT_BATCH(GEN4_3DSTATE_CLEAR_PARAMS | (2 - 2));
-		OUT_BATCH(0);
+	if (IS_GEN5(ibb->devid)) {
+		intel_bb_out(ibb, GEN4_3DSTATE_CLEAR_PARAMS | (2 - 2));
+		intel_bb_out(ibb, 0);
 	}
 }
 
 static void
-gen4_emit_invariant(struct intel_batchbuffer *batch)
+gen4_emit_invariant(struct intel_bb *ibb)
 {
-	OUT_BATCH(MI_FLUSH | MI_INHIBIT_RENDER_CACHE_FLUSH);
+	intel_bb_out(ibb, MI_FLUSH | MI_INHIBIT_RENDER_CACHE_FLUSH);
 
-	if (IS_GEN5(batch->devid) || IS_G4X(batch->devid))
-		OUT_BATCH(G4X_PIPELINE_SELECT | PIPELINE_SELECT_3D);
+	if (IS_GEN5(ibb->devid) || IS_G4X(ibb->devid))
+		intel_bb_out(ibb, G4X_PIPELINE_SELECT | PIPELINE_SELECT_3D);
 	else
-		OUT_BATCH(GEN4_PIPELINE_SELECT | PIPELINE_SELECT_3D);
+		intel_bb_out(ibb, GEN4_PIPELINE_SELECT | PIPELINE_SELECT_3D);
 }
 
 static uint32_t
-gen4_create_vs_state(struct intel_batchbuffer *batch)
+gen4_create_vs_state(struct intel_bb *ibb)
 {
 	struct gen4_vs_state *vs;
 	int nr_urb_entries;
 
-	vs = intel_batchbuffer_subdata_alloc(batch, sizeof(*vs), 32);
+	vs = intel_bb_ptr_align(ibb, 32);
 
 	/* Set up the vertex shader to be disabled (passthrough) */
-	nr_urb_entries = gen4_max_vs_nr_urb_entries(batch->devid);
-	if (IS_GEN5(batch->devid))
+	nr_urb_entries = gen4_max_vs_nr_urb_entries(ibb->devid);
+	if (IS_GEN5(ibb->devid))
 		nr_urb_entries >>= 2;
 	vs->vs4.nr_urb_entries = nr_urb_entries;
 	vs->vs4.urb_entry_allocation_size = URB_VS_ENTRY_SIZE - 1;
 	vs->vs6.vs_enable = 0;
 	vs->vs6.vert_cache_disable = 1;
 
-	return intel_batchbuffer_subdata_offset(batch, vs);
+	return intel_bb_ptr_add_return_prev_offset(ibb, sizeof(*vs));
 }
 
 static uint32_t
-gen4_create_sf_state(struct intel_batchbuffer *batch,
+gen4_create_sf_state(struct intel_bb *ibb,
 		     uint32_t kernel)
 {
 	struct gen4_sf_state *sf;
 
-	sf = intel_batchbuffer_subdata_alloc(batch, sizeof(*sf), 32);
+	sf = intel_bb_ptr_align(ibb, 32);
 
 	sf->sf0.grf_reg_count = GEN4_GRF_BLOCKS(SF_KERNEL_NUM_GRF);
 	sf->sf0.kernel_start_pointer = kernel >> 6;
@@ -363,25 +345,25 @@ gen4_create_sf_state(struct intel_batchbuffer *batch,
 	sf->sf3.urb_entry_read_offset = 1;
 	sf->sf3.dispatch_grf_start_reg = 3;
 
-	sf->sf4.max_threads = gen4_max_sf_threads(batch->devid) - 1;
+	sf->sf4.max_threads = gen4_max_sf_threads(ibb->devid) - 1;
 	sf->sf4.urb_entry_allocation_size = URB_SF_ENTRY_SIZE - 1;
-	sf->sf4.nr_urb_entries = gen4_max_sf_nr_urb_entries(batch->devid);
+	sf->sf4.nr_urb_entries = gen4_max_sf_nr_urb_entries(ibb->devid);
 
 	sf->sf6.cull_mode = GEN4_CULLMODE_NONE;
 	sf->sf6.dest_org_vbias = 0x8;
 	sf->sf6.dest_org_hbias = 0x8;
 
-	return intel_batchbuffer_subdata_offset(batch, sf);
+	return intel_bb_ptr_add_return_prev_offset(ibb, sizeof(*sf));
 }
 
 static uint32_t
-gen4_create_wm_state(struct intel_batchbuffer *batch,
+gen4_create_wm_state(struct intel_bb *ibb,
 		     uint32_t kernel,
 		     uint32_t sampler)
 {
 	struct gen4_wm_state *wm;
 
-	wm = intel_batchbuffer_subdata_alloc(batch, sizeof(*wm), 32);
+	wm = intel_bb_ptr_align(ibb, 32);
 
 	assert((kernel & 63) == 0);
 	wm->wm0.kernel_start_pointer = kernel >> 6;
@@ -394,48 +376,48 @@ gen4_create_wm_state(struct intel_batchbuffer *batch,
 	wm->wm4.sampler_state_pointer = sampler >> 5;
 	wm->wm4.sampler_count = 1;
 
-	wm->wm5.max_threads = gen4_max_wm_threads(batch->devid);
+	wm->wm5.max_threads = gen4_max_wm_threads(ibb->devid);
 	wm->wm5.thread_dispatch_enable = 1;
 	wm->wm5.enable_16_pix = 1;
 	wm->wm5.early_depth_test = 1;
 
-	if (IS_GEN5(batch->devid))
+	if (IS_GEN5(ibb->devid))
 		wm->wm1.binding_table_entry_count = 0;
 	else
 		wm->wm1.binding_table_entry_count = 2;
 	wm->wm3.urb_entry_read_length = 2;
 
-	return intel_batchbuffer_subdata_offset(batch, wm);
+	return intel_bb_ptr_add_return_prev_offset(ibb, sizeof(*wm));
 }
 
 static void
-gen4_emit_binding_table(struct intel_batchbuffer *batch,
+gen4_emit_binding_table(struct intel_bb *ibb,
 			uint32_t wm_table)
 {
-	OUT_BATCH(GEN4_3DSTATE_BINDING_TABLE_POINTERS | (6 - 2));
-	OUT_BATCH(0);		/* vs */
-	OUT_BATCH(0);		/* gs */
-	OUT_BATCH(0);		/* clip */
-	OUT_BATCH(0);		/* sf */
-	OUT_BATCH(wm_table);    /* ps */
+	intel_bb_out(ibb, GEN4_3DSTATE_BINDING_TABLE_POINTERS | (6 - 2));
+	intel_bb_out(ibb, 0);		/* vs */
+	intel_bb_out(ibb, 0);		/* gs */
+	intel_bb_out(ibb, 0);		/* clip */
+	intel_bb_out(ibb, 0);		/* sf */
+	intel_bb_out(ibb, wm_table);    /* ps */
 }
 
 static void
-gen4_emit_drawing_rectangle(struct intel_batchbuffer *batch,
-			    const struct igt_buf *dst)
+gen4_emit_drawing_rectangle(struct intel_bb *ibb,
+			    const struct intel_buf *dst)
 {
-	OUT_BATCH(GEN4_3DSTATE_DRAWING_RECTANGLE | (4 - 2));
-	OUT_BATCH(0);
-	OUT_BATCH((igt_buf_height(dst) - 1) << 16 |
-		  (igt_buf_width(dst) - 1));
-	OUT_BATCH(0);
+	intel_bb_out(ibb, GEN4_3DSTATE_DRAWING_RECTANGLE | (4 - 2));
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, (intel_buf_height(dst) - 1) << 16 |
+		     (intel_buf_width(dst) - 1));
+	intel_bb_out(ibb, 0);
 }
 
 static void
-gen4_emit_vertex_elements(struct intel_batchbuffer *batch)
+gen4_emit_vertex_elements(struct intel_bb *ibb)
 {
 
-	if (IS_GEN5(batch->devid)) {
+	if (IS_GEN5(ibb->devid)) {
 		/* The VUE layout
 		 *    dword 0-3: pad (0.0, 0.0, 0.0, 0.0),
 		 *    dword 4-7: position (x, y, 1.0, 1.0),
@@ -443,34 +425,34 @@ gen4_emit_vertex_elements(struct intel_batchbuffer *batch)
 		 *
 		 * dword 4-11 are fetched from vertex buffer
 		 */
-		OUT_BATCH(GEN4_3DSTATE_VERTEX_ELEMENTS | (3 * 2 + 1 - 2));
+		intel_bb_out(ibb, GEN4_3DSTATE_VERTEX_ELEMENTS | (3 * 2 + 1 - 2));
 
 		/* pad */
-		OUT_BATCH(0 << GEN4_VE0_VERTEX_BUFFER_INDEX_SHIFT | GEN4_VE0_VALID |
-			  SURFACEFORMAT_R32G32B32A32_FLOAT << VE0_FORMAT_SHIFT |
-			  0 << VE0_OFFSET_SHIFT);
-		OUT_BATCH(GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_0_SHIFT |
-			  GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_1_SHIFT |
-			  GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_2_SHIFT |
-			  GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_3_SHIFT);
+		intel_bb_out(ibb, 0 << GEN4_VE0_VERTEX_BUFFER_INDEX_SHIFT | GEN4_VE0_VALID |
+			     SURFACEFORMAT_R32G32B32A32_FLOAT << VE0_FORMAT_SHIFT |
+			     0 << VE0_OFFSET_SHIFT);
+		intel_bb_out(ibb, GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_0_SHIFT |
+			     GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_1_SHIFT |
+			     GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_2_SHIFT |
+			     GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_3_SHIFT);
 
 		/* x,y */
-		OUT_BATCH(0 << GEN4_VE0_VERTEX_BUFFER_INDEX_SHIFT | GEN4_VE0_VALID |
-			  SURFACEFORMAT_R16G16_SSCALED << VE0_FORMAT_SHIFT |
-			  0 << VE0_OFFSET_SHIFT);
-		OUT_BATCH(GEN4_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT |
-			  GEN4_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT |
-			  GEN4_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT |
-			  GEN4_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT);
+		intel_bb_out(ibb, 0 << GEN4_VE0_VERTEX_BUFFER_INDEX_SHIFT | GEN4_VE0_VALID |
+			     SURFACEFORMAT_R16G16_SSCALED << VE0_FORMAT_SHIFT |
+			     0 << VE0_OFFSET_SHIFT);
+		intel_bb_out(ibb, GEN4_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT |
+			     GEN4_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT |
+			     GEN4_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT |
+			     GEN4_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT);
 
 		/* u0, v0 */
-		OUT_BATCH(0 << GEN4_VE0_VERTEX_BUFFER_INDEX_SHIFT | GEN4_VE0_VALID |
-			  SURFACEFORMAT_R32G32_FLOAT << VE0_FORMAT_SHIFT |
-			  4 << VE0_OFFSET_SHIFT);
-		OUT_BATCH(GEN4_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT |
-			  GEN4_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT |
-			  GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_2_SHIFT |
-			  GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_3_SHIFT);
+		intel_bb_out(ibb, 0 << GEN4_VE0_VERTEX_BUFFER_INDEX_SHIFT | GEN4_VE0_VALID |
+			     SURFACEFORMAT_R32G32_FLOAT << VE0_FORMAT_SHIFT |
+			     4 << VE0_OFFSET_SHIFT);
+		intel_bb_out(ibb, GEN4_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT |
+			     GEN4_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT |
+			     GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_2_SHIFT |
+			     GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_3_SHIFT);
 	} else {
 		/* The VUE layout
 		 *    dword 0-3: position (x, y, 1.0, 1.0),
@@ -478,90 +460,88 @@ gen4_emit_vertex_elements(struct intel_batchbuffer *batch)
 		 *
 		 * dword 0-7 are fetched from vertex buffer
 		 */
-		OUT_BATCH(GEN4_3DSTATE_VERTEX_ELEMENTS | (2 * 2 + 1 - 2));
+		intel_bb_out(ibb, GEN4_3DSTATE_VERTEX_ELEMENTS | (2 * 2 + 1 - 2));
 
 		/* x,y */
-		OUT_BATCH(0 << GEN4_VE0_VERTEX_BUFFER_INDEX_SHIFT | GEN4_VE0_VALID |
-			  SURFACEFORMAT_R16G16_SSCALED << VE0_FORMAT_SHIFT |
-			  0 << VE0_OFFSET_SHIFT);
-		OUT_BATCH(GEN4_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT |
-			  GEN4_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT |
-			  GEN4_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT |
-			  GEN4_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT |
-			  4 << VE1_DESTINATION_ELEMENT_OFFSET_SHIFT);
+		intel_bb_out(ibb, 0 << GEN4_VE0_VERTEX_BUFFER_INDEX_SHIFT | GEN4_VE0_VALID |
+			     SURFACEFORMAT_R16G16_SSCALED << VE0_FORMAT_SHIFT |
+			     0 << VE0_OFFSET_SHIFT);
+		intel_bb_out(ibb, GEN4_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT |
+			     GEN4_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT |
+			     GEN4_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT |
+			     GEN4_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT |
+			     4 << VE1_DESTINATION_ELEMENT_OFFSET_SHIFT);
 
 		/* u0, v0 */
-		OUT_BATCH(0 << GEN4_VE0_VERTEX_BUFFER_INDEX_SHIFT | GEN4_VE0_VALID |
-			  SURFACEFORMAT_R32G32_FLOAT << VE0_FORMAT_SHIFT |
-			  4 << VE0_OFFSET_SHIFT);
-		OUT_BATCH(GEN4_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT |
-			  GEN4_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT |
-			  GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_2_SHIFT |
-			  GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_3_SHIFT |
-			  8 << VE1_DESTINATION_ELEMENT_OFFSET_SHIFT);
+		intel_bb_out(ibb, 0 << GEN4_VE0_VERTEX_BUFFER_INDEX_SHIFT | GEN4_VE0_VALID |
+			     SURFACEFORMAT_R32G32_FLOAT << VE0_FORMAT_SHIFT |
+			     4 << VE0_OFFSET_SHIFT);
+		intel_bb_out(ibb, GEN4_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT |
+			     GEN4_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT |
+			     GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_2_SHIFT |
+			     GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_3_SHIFT |
+			     8 << VE1_DESTINATION_ELEMENT_OFFSET_SHIFT);
 	}
 }
 
 static uint32_t
-gen4_create_cc_viewport(struct intel_batchbuffer *batch)
+gen4_create_cc_viewport(struct intel_bb *ibb)
 {
 	struct gen4_cc_viewport *vp;
 
-	vp = intel_batchbuffer_subdata_alloc(batch, sizeof(*vp), 32);
+	vp = intel_bb_ptr_align(ibb, 32);
 
 	vp->min_depth = -1.e35;
 	vp->max_depth = 1.e35;
 
-	return intel_batchbuffer_subdata_offset(batch, vp);
+	return intel_bb_ptr_add_return_prev_offset(ibb, sizeof(*vp));
 }
 
 static uint32_t
-gen4_create_cc_state(struct intel_batchbuffer *batch,
+gen4_create_cc_state(struct intel_bb *ibb,
 		     uint32_t cc_vp)
 {
 	struct gen4_color_calc_state *cc;
 
-	cc = intel_batchbuffer_subdata_alloc(batch, sizeof(*cc), 64);
+	cc = intel_bb_ptr_align(ibb, 64);
 
 	cc->cc4.cc_viewport_state_offset = cc_vp;
 
-	return intel_batchbuffer_subdata_offset(batch, cc);
+	return intel_bb_ptr_add_return_prev_offset(ibb, sizeof(*cc));
 }
 
 static uint32_t
-gen4_create_sf_kernel(struct intel_batchbuffer *batch)
+gen4_create_sf_kernel(struct intel_bb *ibb)
 {
-	if (IS_GEN5(batch->devid))
-		return intel_batchbuffer_copy_data(batch, gen5_sf_kernel_nomask,
-						   sizeof(gen5_sf_kernel_nomask),
-						   64);
+	if (IS_GEN5(ibb->devid))
+		return intel_bb_copy_data(ibb, gen5_sf_kernel_nomask,
+					  sizeof(gen5_sf_kernel_nomask), 64);
 	else
-		return intel_batchbuffer_copy_data(batch, gen4_sf_kernel_nomask,
-						   sizeof(gen4_sf_kernel_nomask),
-						   64);
+		return intel_bb_copy_data(ibb, gen4_sf_kernel_nomask,
+					  sizeof(gen4_sf_kernel_nomask), 64);
 }
 
 static uint32_t
-gen4_create_ps_kernel(struct intel_batchbuffer *batch)
+gen4_create_ps_kernel(struct intel_bb *ibb)
 {
-	if (IS_GEN5(batch->devid))
-		return intel_batchbuffer_copy_data(batch, gen5_ps_kernel_nomask_affine,
-						   sizeof(gen5_ps_kernel_nomask_affine),
-						   64);
+	if (IS_GEN5(ibb->devid))
+		return intel_bb_copy_data(ibb, gen5_ps_kernel_nomask_affine,
+					  sizeof(gen5_ps_kernel_nomask_affine),
+					  64);
 	else
-		return intel_batchbuffer_copy_data(batch, gen4_ps_kernel_nomask_affine,
-						   sizeof(gen4_ps_kernel_nomask_affine),
-						   64);
+		return intel_bb_copy_data(ibb, gen4_ps_kernel_nomask_affine,
+					  sizeof(gen4_ps_kernel_nomask_affine),
+					  64);
 }
 
 static uint32_t
-gen4_create_sampler(struct intel_batchbuffer *batch,
+gen4_create_sampler(struct intel_bb *ibb,
 		    sampler_filter_t filter,
 		    sampler_extend_t extend)
 {
 	struct gen4_sampler_state *ss;
 
-	ss = intel_batchbuffer_subdata_alloc(batch, sizeof(*ss), 32);
+	ss = intel_bb_ptr_align(ibb, 32);
 
 	ss->ss0.lod_preclamp = GEN4_LOD_PRECLAMP_OGL;
 
@@ -606,50 +586,52 @@ gen4_create_sampler(struct intel_batchbuffer *batch,
 		break;
 	}
 
-	return intel_batchbuffer_subdata_offset(batch, ss);
+	return intel_bb_ptr_add_return_prev_offset(ibb, sizeof(*ss));
 }
 
-static void gen4_emit_vertex_buffer(struct intel_batchbuffer *batch)
+static void gen4_emit_vertex_buffer(struct intel_bb *ibb)
 {
-	OUT_BATCH(GEN4_3DSTATE_VERTEX_BUFFERS | (5 - 2));
-	OUT_BATCH(GEN4_VB0_VERTEXDATA |
-		  0 << GEN4_VB0_BUFFER_INDEX_SHIFT |
-		  VERTEX_SIZE << VB0_BUFFER_PITCH_SHIFT);
-	OUT_RELOC(batch->bo, I915_GEM_DOMAIN_VERTEX, 0, 0);
-	if (IS_GEN5(batch->devid))
-		OUT_RELOC(batch->bo, I915_GEM_DOMAIN_VERTEX, 0,
-			  batch->bo->size - 1);
+	intel_bb_out(ibb, GEN4_3DSTATE_VERTEX_BUFFERS | (5 - 2));
+	intel_bb_out(ibb, GEN4_VB0_VERTEXDATA |
+		     0 << GEN4_VB0_BUFFER_INDEX_SHIFT |
+		     VERTEX_SIZE << VB0_BUFFER_PITCH_SHIFT);
+	intel_bb_emit_reloc(ibb, ibb->handle,
+			    I915_GEM_DOMAIN_VERTEX, 0, 0, ibb->batch_offset);
+	if (IS_GEN5(ibb->devid))
+		intel_bb_emit_reloc(ibb, ibb->handle,
+				    I915_GEM_DOMAIN_VERTEX, 0,
+				    ibb->size - 1, ibb->batch_offset);
 	else
-		OUT_BATCH(batch->bo->size / VERTEX_SIZE - 1);
-	OUT_BATCH(0);
+		intel_bb_out(ibb, ibb->size / VERTEX_SIZE - 1);
+	intel_bb_out(ibb, 0);
 }
 
-static uint32_t gen4_emit_primitive(struct intel_batchbuffer *batch)
+static uint32_t gen4_emit_primitive(struct intel_bb *ibb)
 {
 	uint32_t offset;
 
-	OUT_BATCH(GEN4_3DPRIMITIVE |
-		  GEN4_3DPRIMITIVE_VERTEX_SEQUENTIAL |
-		  _3DPRIM_RECTLIST << GEN4_3DPRIMITIVE_TOPOLOGY_SHIFT |
-		  0 << 9 |
-		  (6 - 2));
-	OUT_BATCH(3);	/* vertex count */
-	offset = batch_used(batch);
-	OUT_BATCH(0);	/* vertex_index */
-	OUT_BATCH(1);	/* single instance */
-	OUT_BATCH(0);	/* start instance location */
-	OUT_BATCH(0);	/* index buffer offset, ignored */
+	intel_bb_out(ibb, GEN4_3DPRIMITIVE |
+		     GEN4_3DPRIMITIVE_VERTEX_SEQUENTIAL |
+		     _3DPRIM_RECTLIST << GEN4_3DPRIMITIVE_TOPOLOGY_SHIFT |
+		     0 << 9 |
+		     (6 - 2));
+	intel_bb_out(ibb, 3);	/* vertex count */
+	offset = intel_bb_offset(ibb);
+	intel_bb_out(ibb, 0);	/* vertex_index */
+	intel_bb_out(ibb, 1);	/* single instance */
+	intel_bb_out(ibb, 0);	/* start instance location */
+	intel_bb_out(ibb, 0);	/* index buffer offset, ignored */
 
 	return offset;
 }
 
-void gen4_render_copyfunc(struct intel_batchbuffer *batch,
-			  drm_intel_context *context,
-			  const struct igt_buf *src,
-			  unsigned src_x, unsigned src_y,
-			  unsigned width, unsigned height,
-			  const struct igt_buf *dst,
-			  unsigned dst_x, unsigned dst_y)
+void gen4_render_copyfunc(struct intel_bb *ibb,
+			  uint32_t ctx,
+			  struct intel_buf *src,
+			  uint32_t src_x, uint32_t src_y,
+			  uint32_t width, uint32_t height,
+			  struct intel_buf *dst,
+			  uint32_t dst_x, uint32_t dst_y)
 {
 	uint32_t cc, cc_vp;
 	uint32_t wm, wm_sampler, wm_kernel, wm_table;
@@ -658,60 +640,64 @@ void gen4_render_copyfunc(struct intel_batchbuffer *batch,
 	uint32_t offset, batch_end;
 
 	igt_assert(src->bpp == dst->bpp);
-	intel_batchbuffer_flush_with_context(batch, context);
 
-	batch->ptr = batch->buffer + 1024;
-	intel_batchbuffer_subdata_alloc(batch, 64, 64);
+	intel_bb_flush_render_with_context(ibb, ctx);
+
+	intel_bb_add_intel_buf(ibb, dst, true);
+	intel_bb_add_intel_buf(ibb, src, false);
+
+	intel_bb_ptr_set(ibb, 1024 + 64);
 
-	vs = gen4_create_vs_state(batch);
+	vs = gen4_create_vs_state(ibb);
 
-	sf_kernel = gen4_create_sf_kernel(batch);
-	sf = gen4_create_sf_state(batch, sf_kernel);
+	sf_kernel = gen4_create_sf_kernel(ibb);
+	sf = gen4_create_sf_state(ibb, sf_kernel);
 
-	wm_table = gen4_bind_surfaces(batch, src, dst);
-	wm_kernel = gen4_create_ps_kernel(batch);
-	wm_sampler = gen4_create_sampler(batch,
+	wm_table = gen4_bind_surfaces(ibb, src, dst);
+	wm_kernel = gen4_create_ps_kernel(ibb);
+	wm_sampler = gen4_create_sampler(ibb,
 					 SAMPLER_FILTER_NEAREST,
 					 SAMPLER_EXTEND_NONE);
-	wm = gen4_create_wm_state(batch, wm_kernel, wm_sampler);
+	wm = gen4_create_wm_state(ibb, wm_kernel, wm_sampler);
 
-	cc_vp = gen4_create_cc_viewport(batch);
-	cc = gen4_create_cc_state(batch, cc_vp);
+	cc_vp = gen4_create_cc_viewport(ibb);
+	cc = gen4_create_cc_state(ibb, cc_vp);
 
-	batch->ptr = batch->buffer;
+	intel_bb_ptr_set(ibb, 0);
 
-	gen4_emit_invariant(batch);
-	gen4_emit_state_base_address(batch);
-	gen4_emit_sip(batch);
-	gen4_emit_null_depth_buffer(batch);
+	gen4_emit_invariant(ibb);
+	gen4_emit_state_base_address(ibb);
+	gen4_emit_sip(ibb);
+	gen4_emit_null_depth_buffer(ibb);
 
-	gen4_emit_drawing_rectangle(batch, dst);
-	gen4_emit_binding_table(batch, wm_table);
-	gen4_emit_vertex_elements(batch);
-	gen4_emit_pipelined_pointers(batch, vs, sf, wm, cc);
-	gen4_emit_urb(batch);
+	gen4_emit_drawing_rectangle(ibb, dst);
+	gen4_emit_binding_table(ibb, wm_table);
+	gen4_emit_vertex_elements(ibb);
+	gen4_emit_pipelined_pointers(ibb, vs, sf, wm, cc);
+	gen4_emit_urb(ibb);
 
-	gen4_emit_vertex_buffer(batch);
-	offset = gen4_emit_primitive(batch);
+	gen4_emit_vertex_buffer(ibb);
+	offset = gen4_emit_primitive(ibb);
 
-	OUT_BATCH(MI_BATCH_BUFFER_END);
-	batch_end = intel_batchbuffer_align(batch, 8);
+	batch_end = intel_bb_emit_bbe(ibb);
 
-	*(uint32_t *)(batch->buffer + offset) =
-		batch_round_upto(batch, VERTEX_SIZE)/VERTEX_SIZE;
+	ibb->batch[offset / sizeof(uint32_t)] =
+			batch_round_upto(ibb, VERTEX_SIZE)/VERTEX_SIZE;
 
-	emit_vertex_2s(batch, dst_x + width, dst_y + height);
-	emit_vertex_normalized(batch, src_x + width, igt_buf_width(src));
-	emit_vertex_normalized(batch, src_y + height, igt_buf_height(src));
+	emit_vertex_2s(ibb, dst_x + width, dst_y + height);
+	emit_vertex_normalized(ibb, src_x + width, intel_buf_width(src));
+	emit_vertex_normalized(ibb, src_y + height, intel_buf_height(src));
 
-	emit_vertex_2s(batch, dst_x, dst_y + height);
-	emit_vertex_normalized(batch, src_x, igt_buf_width(src));
-	emit_vertex_normalized(batch, src_y + height, igt_buf_height(src));
+	emit_vertex_2s(ibb, dst_x, dst_y + height);
+	emit_vertex_normalized(ibb, src_x, intel_buf_width(src));
+	emit_vertex_normalized(ibb, src_y + height, intel_buf_height(src));
 
-	emit_vertex_2s(batch, dst_x, dst_y);
-	emit_vertex_normalized(batch, src_x, igt_buf_width(src));
-	emit_vertex_normalized(batch, src_y, igt_buf_height(src));
+	emit_vertex_2s(ibb, dst_x, dst_y);
+	emit_vertex_normalized(ibb, src_x, intel_buf_width(src));
+	emit_vertex_normalized(ibb, src_y, intel_buf_height(src));
 
-	gen4_render_flush(batch, context, batch_end);
-	intel_batchbuffer_reset(batch);
+	intel_bb_exec_with_context(ibb, batch_end, ctx,
+				   I915_EXEC_DEFAULT | I915_EXEC_NO_RELOC,
+				   false);
+	intel_bb_reset(ibb, false);
 }
diff --git a/lib/rendercopy_gen6.c b/lib/rendercopy_gen6.c
index 16cbb679..227bde05 100644
--- a/lib/rendercopy_gen6.c
+++ b/lib/rendercopy_gen6.c
@@ -12,7 +12,7 @@
 #include "drm.h"
 #include "i915_drm.h"
 #include "drmtest.h"
-#include "intel_bufmgr.h"
+#include "intel_bufops.h"
 #include "intel_batchbuffer.h"
 #include "intel_io.h"
 #include "rendercopy.h"
@@ -49,38 +49,26 @@ static const uint32_t ps_kernel_nomask_affine[][4] = {
 };
 
 static uint32_t
-batch_round_upto(struct intel_batchbuffer *batch, uint32_t divisor)
+batch_round_upto(struct intel_bb *ibb, uint32_t divisor)
 {
-	uint32_t offset = batch->ptr - batch->buffer;
+	uint32_t offset = intel_bb_offset(ibb);
 
-	offset = (offset + divisor-1) / divisor * divisor;
-	batch->ptr = batch->buffer + offset;
-	return offset;
-}
+	offset = (offset + divisor - 1) / divisor * divisor;
+	intel_bb_ptr_set(ibb, offset);
 
-static void
-gen6_render_flush(struct intel_batchbuffer *batch,
-		  drm_intel_context *context, uint32_t batch_end)
-{
-	igt_assert_eq(drm_intel_bo_subdata(batch->bo,
-					   0, 4096, batch->buffer),
-		      0);
-	igt_assert_eq(drm_intel_gem_bo_context_exec(batch->bo,
-						    context, batch_end, 0),
-		      0);
+	return offset;
 }
 
 static uint32_t
-gen6_bind_buf(struct intel_batchbuffer *batch, const struct igt_buf *buf,
-	      int is_dst)
+gen6_bind_buf(struct intel_bb *ibb, const struct intel_buf *buf, int is_dst)
 {
 	struct gen6_surface_state *ss;
 	uint32_t write_domain, read_domain;
-	int ret;
+	uint64_t address;
 
 	igt_assert_lte(buf->surface[0].stride, 128*1024);
-	igt_assert_lte(igt_buf_width(buf), 8192);
-	igt_assert_lte(igt_buf_height(buf), 8192);
+	igt_assert_lte(intel_buf_width(buf), 8192);
+	igt_assert_lte(intel_buf_height(buf), 8192);
 
 	if (is_dst) {
 		write_domain = read_domain = I915_GEM_DOMAIN_RENDER;
@@ -89,7 +77,7 @@ gen6_bind_buf(struct intel_batchbuffer *batch, const struct igt_buf *buf,
 		read_domain = I915_GEM_DOMAIN_SAMPLER;
 	}
 
-	ss = intel_batchbuffer_subdata_alloc(batch, sizeof(*ss), 32);
+	ss = intel_bb_ptr_align(ibb, 32);
 	ss->ss0.surface_type = SURFACE_2D;
 
 	switch (buf->bpp) {
@@ -102,265 +90,265 @@ gen6_bind_buf(struct intel_batchbuffer *batch, const struct igt_buf *buf,
 
 	ss->ss0.data_return_format = SURFACERETURNFORMAT_FLOAT32;
 	ss->ss0.color_blend = 1;
-	ss->ss1.base_addr = buf->bo->offset;
 
-	ret = drm_intel_bo_emit_reloc(batch->bo,
-				      intel_batchbuffer_subdata_offset(batch, &ss->ss1),
-				      buf->bo, 0,
-				      read_domain, write_domain);
-	igt_assert(ret == 0);
+	address = intel_bb_offset_reloc(ibb, buf->handle,
+					read_domain, write_domain,
+					intel_bb_offset(ibb) + 4,
+					buf->addr.offset);
+	ss->ss1.base_addr = (uint32_t) address;
 
-	ss->ss2.height = igt_buf_height(buf) - 1;
-	ss->ss2.width  = igt_buf_width(buf) - 1;
+	ss->ss2.height = intel_buf_height(buf) - 1;
+	ss->ss2.width  = intel_buf_width(buf) - 1;
 	ss->ss3.pitch  = buf->surface[0].stride - 1;
 	ss->ss3.tiled_surface = buf->tiling != I915_TILING_NONE;
 	ss->ss3.tile_walk     = buf->tiling == I915_TILING_Y;
 
 	ss->ss5.memory_object_control = GEN6_MOCS_PTE;
 
-	return intel_batchbuffer_subdata_offset(batch, ss);
+	return intel_bb_ptr_add_return_prev_offset(ibb, sizeof(*ss));
 }
 
 static uint32_t
-gen6_bind_surfaces(struct intel_batchbuffer *batch,
-		   const struct igt_buf *src,
-		   const struct igt_buf *dst)
+gen6_bind_surfaces(struct intel_bb *ibb,
+		   const struct intel_buf *src,
+		   const struct intel_buf *dst)
 {
-	uint32_t *binding_table;
+	uint32_t *binding_table, binding_table_offset;
 
-	binding_table = intel_batchbuffer_subdata_alloc(batch, 32, 32);
+	binding_table = intel_bb_ptr_align(ibb, 32);
+	binding_table_offset = intel_bb_ptr_add_return_prev_offset(ibb, 32);
 
-	binding_table[0] = gen6_bind_buf(batch, dst, 1);
-	binding_table[1] = gen6_bind_buf(batch, src, 0);
+	binding_table[0] = gen6_bind_buf(ibb, dst, 1);
+	binding_table[1] = gen6_bind_buf(ibb, src, 0);
 
-	return intel_batchbuffer_subdata_offset(batch, binding_table);
+	return binding_table_offset;
 }
 
 static void
-gen6_emit_sip(struct intel_batchbuffer *batch)
+gen6_emit_sip(struct intel_bb *ibb)
 {
-	OUT_BATCH(GEN4_STATE_SIP | 0);
-	OUT_BATCH(0);
+	intel_bb_out(ibb, GEN4_STATE_SIP | 0);
+	intel_bb_out(ibb, 0);
 }
 
 static void
-gen6_emit_urb(struct intel_batchbuffer *batch)
+gen6_emit_urb(struct intel_bb *ibb)
 {
-	OUT_BATCH(GEN6_3DSTATE_URB | (3 - 2));
-	OUT_BATCH((1 - 1) << GEN6_3DSTATE_URB_VS_SIZE_SHIFT |
-		  24 << GEN6_3DSTATE_URB_VS_ENTRIES_SHIFT); /* at least 24 on GEN6 */
-	OUT_BATCH(0 << GEN6_3DSTATE_URB_GS_SIZE_SHIFT |
-		  0 << GEN6_3DSTATE_URB_GS_ENTRIES_SHIFT); /* no GS thread */
+	intel_bb_out(ibb, GEN6_3DSTATE_URB | (3 - 2));
+	intel_bb_out(ibb, (1 - 1) << GEN6_3DSTATE_URB_VS_SIZE_SHIFT |
+		     24 << GEN6_3DSTATE_URB_VS_ENTRIES_SHIFT); /* at least 24 on GEN6 */
+	intel_bb_out(ibb, 0 << GEN6_3DSTATE_URB_GS_SIZE_SHIFT |
+		     0 << GEN6_3DSTATE_URB_GS_ENTRIES_SHIFT); /* no GS thread */
 }
 
 static void
-gen6_emit_state_base_address(struct intel_batchbuffer *batch)
+gen6_emit_state_base_address(struct intel_bb *ibb)
 {
-	OUT_BATCH(GEN4_STATE_BASE_ADDRESS | (10 - 2));
-	OUT_BATCH(0); /* general */
-	OUT_RELOC(batch->bo, /* surface */
-		  I915_GEM_DOMAIN_INSTRUCTION, 0,
-		  BASE_ADDRESS_MODIFY);
-	OUT_RELOC(batch->bo, /* instruction */
-		  I915_GEM_DOMAIN_INSTRUCTION, 0,
-		  BASE_ADDRESS_MODIFY);
-	OUT_BATCH(0); /* indirect */
-	OUT_RELOC(batch->bo, /* dynamic */
-		  I915_GEM_DOMAIN_INSTRUCTION, 0,
-		  BASE_ADDRESS_MODIFY);
+	intel_bb_out(ibb, GEN4_STATE_BASE_ADDRESS | (10 - 2));
+	intel_bb_out(ibb, 0); /* general */
+	intel_bb_emit_reloc(ibb, ibb->handle, /* surface */
+			    I915_GEM_DOMAIN_INSTRUCTION, 0,
+			    BASE_ADDRESS_MODIFY, ibb->batch_offset);
+	intel_bb_emit_reloc(ibb, ibb->handle, /* instruction */
+			    I915_GEM_DOMAIN_INSTRUCTION, 0,
+			    BASE_ADDRESS_MODIFY, ibb->batch_offset);
+	intel_bb_out(ibb, 0); /* indirect */
+	intel_bb_emit_reloc(ibb, ibb->handle, /* dynamic */
+			    I915_GEM_DOMAIN_INSTRUCTION, 0,
+			    BASE_ADDRESS_MODIFY, ibb->batch_offset);
 
 	/* upper bounds, disable */
-	OUT_BATCH(0);
-	OUT_BATCH(BASE_ADDRESS_MODIFY);
-	OUT_BATCH(0);
-	OUT_BATCH(BASE_ADDRESS_MODIFY);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, BASE_ADDRESS_MODIFY);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, BASE_ADDRESS_MODIFY);
 }
 
 static void
-gen6_emit_viewports(struct intel_batchbuffer *batch, uint32_t cc_vp)
+gen6_emit_viewports(struct intel_bb *ibb, uint32_t cc_vp)
 {
-	OUT_BATCH(GEN6_3DSTATE_VIEWPORT_STATE_POINTERS |
-		  GEN6_3DSTATE_VIEWPORT_STATE_MODIFY_CC |
-		  (4 - 2));
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(cc_vp);
+	intel_bb_out(ibb, GEN6_3DSTATE_VIEWPORT_STATE_POINTERS |
+		     GEN6_3DSTATE_VIEWPORT_STATE_MODIFY_CC |
+		     (4 - 2));
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, cc_vp);
 }
 
 static void
-gen6_emit_vs(struct intel_batchbuffer *batch)
+gen6_emit_vs(struct intel_bb *ibb)
 {
 	/* disable VS constant buffer */
-	OUT_BATCH(GEN6_3DSTATE_CONSTANT_VS | (5 - 2));
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-
-	OUT_BATCH(GEN6_3DSTATE_VS | (6 - 2));
-	OUT_BATCH(0); /* no VS kernel */
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0); /* pass-through */
+	intel_bb_out(ibb, GEN6_3DSTATE_CONSTANT_VS | (5 - 2));
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+
+	intel_bb_out(ibb, GEN6_3DSTATE_VS | (6 - 2));
+	intel_bb_out(ibb, 0); /* no VS kernel */
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0); /* pass-through */
 }
 
 static void
-gen6_emit_gs(struct intel_batchbuffer *batch)
+gen6_emit_gs(struct intel_bb *ibb)
 {
 	/* disable GS constant buffer */
-	OUT_BATCH(GEN6_3DSTATE_CONSTANT_GS | (5 - 2));
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-
-	OUT_BATCH(GEN6_3DSTATE_GS | (7 - 2));
-	OUT_BATCH(0); /* no GS kernel */
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0); /* pass-through */
+	intel_bb_out(ibb, GEN6_3DSTATE_CONSTANT_GS | (5 - 2));
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+
+	intel_bb_out(ibb, GEN6_3DSTATE_GS | (7 - 2));
+	intel_bb_out(ibb, 0); /* no GS kernel */
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0); /* pass-through */
 }
 
 static void
-gen6_emit_clip(struct intel_batchbuffer *batch)
+gen6_emit_clip(struct intel_bb *ibb)
 {
-	OUT_BATCH(GEN6_3DSTATE_CLIP | (4 - 2));
-	OUT_BATCH(0);
-	OUT_BATCH(0); /* pass-through */
-	OUT_BATCH(0);
+	intel_bb_out(ibb, GEN6_3DSTATE_CLIP | (4 - 2));
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0); /* pass-through */
+	intel_bb_out(ibb, 0);
 }
 
 static void
-gen6_emit_wm_constants(struct intel_batchbuffer *batch)
+gen6_emit_wm_constants(struct intel_bb *ibb)
 {
 	/* disable WM constant buffer */
-	OUT_BATCH(GEN6_3DSTATE_CONSTANT_PS | (5 - 2));
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
+	intel_bb_out(ibb, GEN6_3DSTATE_CONSTANT_PS | (5 - 2));
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
 }
 
 static void
-gen6_emit_null_depth_buffer(struct intel_batchbuffer *batch)
+gen6_emit_null_depth_buffer(struct intel_bb *ibb)
 {
-	OUT_BATCH(GEN4_3DSTATE_DEPTH_BUFFER | (7 - 2));
-	OUT_BATCH(SURFACE_NULL << GEN4_3DSTATE_DEPTH_BUFFER_TYPE_SHIFT |
-		  GEN4_DEPTHFORMAT_D32_FLOAT << GEN4_3DSTATE_DEPTH_BUFFER_FORMAT_SHIFT);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-
-	OUT_BATCH(GEN4_3DSTATE_CLEAR_PARAMS | (2 - 2));
-	OUT_BATCH(0);
+	intel_bb_out(ibb, GEN4_3DSTATE_DEPTH_BUFFER | (7 - 2));
+	intel_bb_out(ibb, SURFACE_NULL << GEN4_3DSTATE_DEPTH_BUFFER_TYPE_SHIFT |
+		     GEN4_DEPTHFORMAT_D32_FLOAT << GEN4_3DSTATE_DEPTH_BUFFER_FORMAT_SHIFT);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+
+	intel_bb_out(ibb, GEN4_3DSTATE_CLEAR_PARAMS | (2 - 2));
+	intel_bb_out(ibb, 0);
 }
 
 static void
-gen6_emit_invariant(struct intel_batchbuffer *batch)
+gen6_emit_invariant(struct intel_bb *ibb)
 {
-	OUT_BATCH(G4X_PIPELINE_SELECT | PIPELINE_SELECT_3D);
+	intel_bb_out(ibb, G4X_PIPELINE_SELECT | PIPELINE_SELECT_3D);
 
-	OUT_BATCH(GEN6_3DSTATE_MULTISAMPLE | (3 - 2));
-	OUT_BATCH(GEN6_3DSTATE_MULTISAMPLE_PIXEL_LOCATION_CENTER |
-		  GEN6_3DSTATE_MULTISAMPLE_NUMSAMPLES_1); /* 1 sample/pixel */
-	OUT_BATCH(0);
+	intel_bb_out(ibb, GEN6_3DSTATE_MULTISAMPLE | (3 - 2));
+	intel_bb_out(ibb, GEN6_3DSTATE_MULTISAMPLE_PIXEL_LOCATION_CENTER |
+		     GEN6_3DSTATE_MULTISAMPLE_NUMSAMPLES_1); /* 1 sample/pixel */
+	intel_bb_out(ibb, 0);
 
-	OUT_BATCH(GEN6_3DSTATE_SAMPLE_MASK | (2 - 2));
-	OUT_BATCH(1);
+	intel_bb_out(ibb, GEN6_3DSTATE_SAMPLE_MASK | (2 - 2));
+	intel_bb_out(ibb, 1);
 }
 
 static void
-gen6_emit_cc(struct intel_batchbuffer *batch, uint32_t blend)
+gen6_emit_cc(struct intel_bb *ibb, uint32_t blend)
 {
-	OUT_BATCH(GEN6_3DSTATE_CC_STATE_POINTERS | (4 - 2));
-	OUT_BATCH(blend | 1);
-	OUT_BATCH(1024 | 1);
-	OUT_BATCH(1024 | 1);
+	intel_bb_out(ibb, GEN6_3DSTATE_CC_STATE_POINTERS | (4 - 2));
+	intel_bb_out(ibb, blend | 1);
+	intel_bb_out(ibb, 1024 | 1);
+	intel_bb_out(ibb, 1024 | 1);
 }
 
 static void
-gen6_emit_sampler(struct intel_batchbuffer *batch, uint32_t state)
+gen6_emit_sampler(struct intel_bb *ibb, uint32_t state)
 {
-	OUT_BATCH(GEN6_3DSTATE_SAMPLER_STATE_POINTERS |
-		  GEN6_3DSTATE_SAMPLER_STATE_MODIFY_PS |
-		  (4 - 2));
-	OUT_BATCH(0); /* VS */
-	OUT_BATCH(0); /* GS */
-	OUT_BATCH(state);
+	intel_bb_out(ibb, GEN6_3DSTATE_SAMPLER_STATE_POINTERS |
+		     GEN6_3DSTATE_SAMPLER_STATE_MODIFY_PS |
+		     (4 - 2));
+	intel_bb_out(ibb, 0); /* VS */
+	intel_bb_out(ibb, 0); /* GS */
+	intel_bb_out(ibb, state);
 }
 
 static void
-gen6_emit_sf(struct intel_batchbuffer *batch)
+gen6_emit_sf(struct intel_bb *ibb)
 {
-	OUT_BATCH(GEN6_3DSTATE_SF | (20 - 2));
-	OUT_BATCH(1 << GEN6_3DSTATE_SF_NUM_OUTPUTS_SHIFT |
-		  1 << GEN6_3DSTATE_SF_URB_ENTRY_READ_LENGTH_SHIFT |
-		  1 << GEN6_3DSTATE_SF_URB_ENTRY_READ_OFFSET_SHIFT);
-	OUT_BATCH(0);
-	OUT_BATCH(GEN6_3DSTATE_SF_CULL_NONE);
-	OUT_BATCH(2 << GEN6_3DSTATE_SF_TRIFAN_PROVOKE_SHIFT); /* DW4 */
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0); /* DW9 */
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0); /* DW14 */
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0); /* DW19 */
+	intel_bb_out(ibb, GEN6_3DSTATE_SF | (20 - 2));
+	intel_bb_out(ibb, 1 << GEN6_3DSTATE_SF_NUM_OUTPUTS_SHIFT |
+		     1 << GEN6_3DSTATE_SF_URB_ENTRY_READ_LENGTH_SHIFT |
+		     1 << GEN6_3DSTATE_SF_URB_ENTRY_READ_OFFSET_SHIFT);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, GEN6_3DSTATE_SF_CULL_NONE);
+	intel_bb_out(ibb, 2 << GEN6_3DSTATE_SF_TRIFAN_PROVOKE_SHIFT); /* DW4 */
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0); /* DW9 */
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0); /* DW14 */
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0); /* DW19 */
 }
 
 static void
-gen6_emit_wm(struct intel_batchbuffer *batch, int kernel)
+gen6_emit_wm(struct intel_bb *ibb, int kernel)
 {
-	OUT_BATCH(GEN6_3DSTATE_WM | (9 - 2));
-	OUT_BATCH(kernel);
-	OUT_BATCH(1 << GEN6_3DSTATE_WM_SAMPLER_COUNT_SHIFT |
-		  2 << GEN6_3DSTATE_WM_BINDING_TABLE_ENTRY_COUNT_SHIFT);
-	OUT_BATCH(0);
-	OUT_BATCH(6 << GEN6_3DSTATE_WM_DISPATCH_START_GRF_0_SHIFT); /* DW4 */
-	OUT_BATCH((40 - 1) << GEN6_3DSTATE_WM_MAX_THREADS_SHIFT |
-		  GEN6_3DSTATE_WM_DISPATCH_ENABLE |
-		  GEN6_3DSTATE_WM_16_DISPATCH_ENABLE);
-	OUT_BATCH(1 << GEN6_3DSTATE_WM_NUM_SF_OUTPUTS_SHIFT |
-		  GEN6_3DSTATE_WM_PERSPECTIVE_PIXEL_BARYCENTRIC);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
+	intel_bb_out(ibb, GEN6_3DSTATE_WM | (9 - 2));
+	intel_bb_out(ibb, kernel);
+	intel_bb_out(ibb, 1 << GEN6_3DSTATE_WM_SAMPLER_COUNT_SHIFT |
+		     2 << GEN6_3DSTATE_WM_BINDING_TABLE_ENTRY_COUNT_SHIFT);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 6 << GEN6_3DSTATE_WM_DISPATCH_START_GRF_0_SHIFT); /* DW4 */
+	intel_bb_out(ibb, (40 - 1) << GEN6_3DSTATE_WM_MAX_THREADS_SHIFT |
+		     GEN6_3DSTATE_WM_DISPATCH_ENABLE |
+		     GEN6_3DSTATE_WM_16_DISPATCH_ENABLE);
+	intel_bb_out(ibb, 1 << GEN6_3DSTATE_WM_NUM_SF_OUTPUTS_SHIFT |
+		     GEN6_3DSTATE_WM_PERSPECTIVE_PIXEL_BARYCENTRIC);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
 }
 
 static void
-gen6_emit_binding_table(struct intel_batchbuffer *batch, uint32_t wm_table)
+gen6_emit_binding_table(struct intel_bb *ibb, uint32_t wm_table)
 {
-	OUT_BATCH(GEN4_3DSTATE_BINDING_TABLE_POINTERS |
-		  GEN6_3DSTATE_BINDING_TABLE_MODIFY_PS |
-		  (4 - 2));
-	OUT_BATCH(0);		/* vs */
-	OUT_BATCH(0);		/* gs */
-	OUT_BATCH(wm_table);
+	intel_bb_out(ibb, GEN4_3DSTATE_BINDING_TABLE_POINTERS |
+		     GEN6_3DSTATE_BINDING_TABLE_MODIFY_PS |
+		     (4 - 2));
+	intel_bb_out(ibb, 0);		/* vs */
+	intel_bb_out(ibb, 0);		/* gs */
+	intel_bb_out(ibb, wm_table);
 }
 
 static void
-gen6_emit_drawing_rectangle(struct intel_batchbuffer *batch, const struct igt_buf *dst)
+gen6_emit_drawing_rectangle(struct intel_bb *ibb, const struct intel_buf *dst)
 {
-	OUT_BATCH(GEN4_3DSTATE_DRAWING_RECTANGLE | (4 - 2));
-	OUT_BATCH(0);
-	OUT_BATCH((igt_buf_height(dst) - 1) << 16 | (igt_buf_width(dst) - 1));
-	OUT_BATCH(0);
+	intel_bb_out(ibb, GEN4_3DSTATE_DRAWING_RECTANGLE | (4 - 2));
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, (intel_buf_height(dst) - 1) << 16 | (intel_buf_width(dst) - 1));
+	intel_bb_out(ibb, 0);
 }
 
 static void
-gen6_emit_vertex_elements(struct intel_batchbuffer *batch)
+gen6_emit_vertex_elements(struct intel_bb *ibb)
 {
 	/* The VUE layout
 	 *    dword 0-3: pad (0.0, 0.0, 0.0. 0.0)
@@ -369,54 +357,54 @@ gen6_emit_vertex_elements(struct intel_batchbuffer *batch)
 	 *
 	 * dword 4-11 are fetched from vertex buffer
 	 */
-	OUT_BATCH(GEN4_3DSTATE_VERTEX_ELEMENTS | (2 * 3 + 1 - 2));
+	intel_bb_out(ibb, GEN4_3DSTATE_VERTEX_ELEMENTS | (2 * 3 + 1 - 2));
 
-	OUT_BATCH(0 << GEN6_VE0_VERTEX_BUFFER_INDEX_SHIFT | GEN6_VE0_VALID |
-		  SURFACEFORMAT_R32G32B32A32_FLOAT << VE0_FORMAT_SHIFT |
-		  0 << VE0_OFFSET_SHIFT);
-	OUT_BATCH(GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_0_SHIFT |
-		  GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_1_SHIFT |
-		  GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_2_SHIFT |
-		  GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_3_SHIFT);
+	intel_bb_out(ibb, 0 << GEN6_VE0_VERTEX_BUFFER_INDEX_SHIFT | GEN6_VE0_VALID |
+		     SURFACEFORMAT_R32G32B32A32_FLOAT << VE0_FORMAT_SHIFT |
+		     0 << VE0_OFFSET_SHIFT);
+	intel_bb_out(ibb, GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_0_SHIFT |
+		     GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_1_SHIFT |
+		     GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_2_SHIFT |
+		     GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_3_SHIFT);
 
 	/* x,y */
-	OUT_BATCH(0 << GEN6_VE0_VERTEX_BUFFER_INDEX_SHIFT | GEN6_VE0_VALID |
-		  SURFACEFORMAT_R16G16_SSCALED << VE0_FORMAT_SHIFT |
-		  0 << VE0_OFFSET_SHIFT); /* offsets vb in bytes */
-	OUT_BATCH(GEN4_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT |
-		  GEN4_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT |
-		  GEN4_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT |
-		  GEN4_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT);
+	intel_bb_out(ibb, 0 << GEN6_VE0_VERTEX_BUFFER_INDEX_SHIFT | GEN6_VE0_VALID |
+		     SURFACEFORMAT_R16G16_SSCALED << VE0_FORMAT_SHIFT |
+		     0 << VE0_OFFSET_SHIFT); /* offsets vb in bytes */
+	intel_bb_out(ibb, GEN4_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT |
+		     GEN4_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT |
+		     GEN4_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT |
+		     GEN4_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT);
 
 	/* u0, v0 */
-	OUT_BATCH(0 << GEN6_VE0_VERTEX_BUFFER_INDEX_SHIFT | GEN6_VE0_VALID |
-		  SURFACEFORMAT_R32G32_FLOAT << VE0_FORMAT_SHIFT |
-		  4 << VE0_OFFSET_SHIFT);	/* offset vb in bytes */
-	OUT_BATCH(GEN4_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT |
-		  GEN4_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT |
-		  GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_2_SHIFT |
-		  GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_3_SHIFT);
+	intel_bb_out(ibb, 0 << GEN6_VE0_VERTEX_BUFFER_INDEX_SHIFT | GEN6_VE0_VALID |
+		     SURFACEFORMAT_R32G32_FLOAT << VE0_FORMAT_SHIFT |
+		     4 << VE0_OFFSET_SHIFT);	/* offset vb in bytes */
+	intel_bb_out(ibb, GEN4_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT |
+		     GEN4_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT |
+		     GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_2_SHIFT |
+		     GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_3_SHIFT);
 }
 
 static uint32_t
-gen6_create_cc_viewport(struct intel_batchbuffer *batch)
+gen6_create_cc_viewport(struct intel_bb *ibb)
 {
 	struct gen4_cc_viewport *vp;
 
-	vp = intel_batchbuffer_subdata_alloc(batch, sizeof(*vp), 32);
+	vp = intel_bb_ptr_align(ibb, 32);
 
 	vp->min_depth = -1.e35;
 	vp->max_depth = 1.e35;
 
-	return intel_batchbuffer_subdata_offset(batch, vp);
+	return intel_bb_ptr_add_return_prev_offset(ibb, sizeof(*vp));
 }
 
 static uint32_t
-gen6_create_cc_blend(struct intel_batchbuffer *batch)
+gen6_create_cc_blend(struct intel_bb *ibb)
 {
 	struct gen6_blend_state *blend;
 
-	blend = intel_batchbuffer_subdata_alloc(batch, sizeof(*blend), 64);
+	blend = intel_bb_ptr_align(ibb, 64);
 
 	blend->blend0.dest_blend_factor = GEN6_BLENDFACTOR_ZERO;
 	blend->blend0.source_blend_factor = GEN6_BLENDFACTOR_ONE;
@@ -426,25 +414,24 @@ gen6_create_cc_blend(struct intel_batchbuffer *batch)
 	blend->blend1.post_blend_clamp_enable = 1;
 	blend->blend1.pre_blend_clamp_enable = 1;
 
-	return intel_batchbuffer_subdata_offset(batch, blend);
+	return intel_bb_ptr_add_return_prev_offset(ibb, sizeof(*blend));
 }
 
 static uint32_t
-gen6_create_kernel(struct intel_batchbuffer *batch)
+gen6_create_kernel(struct intel_bb *ibb)
 {
-	return intel_batchbuffer_copy_data(batch, ps_kernel_nomask_affine,
-			  sizeof(ps_kernel_nomask_affine),
-			  64);
+	return intel_bb_copy_data(ibb, ps_kernel_nomask_affine,
+				  sizeof(ps_kernel_nomask_affine), 64);
 }
 
 static uint32_t
-gen6_create_sampler(struct intel_batchbuffer *batch,
+gen6_create_sampler(struct intel_bb *ibb,
 		    sampler_filter_t filter,
-		   sampler_extend_t extend)
+		    sampler_extend_t extend)
 {
 	struct gen6_sampler_state *ss;
 
-	ss = intel_batchbuffer_subdata_alloc(batch, sizeof(*ss), 32);
+	ss = intel_bb_ptr_align(ibb, 32);
 	ss->ss0.lod_preclamp = 1;	/* GL mode */
 
 	/* We use the legacy mode to get the semantics specified by
@@ -487,107 +474,116 @@ gen6_create_sampler(struct intel_batchbuffer *batch,
 		break;
 	}
 
-	return intel_batchbuffer_subdata_offset(batch, ss);
+	return intel_bb_ptr_add_return_prev_offset(ibb, sizeof(*ss));
 }
 
-static void gen6_emit_vertex_buffer(struct intel_batchbuffer *batch)
+static void gen6_emit_vertex_buffer(struct intel_bb *ibb)
 {
-	OUT_BATCH(GEN4_3DSTATE_VERTEX_BUFFERS | 3);
-	OUT_BATCH(GEN6_VB0_VERTEXDATA |
-		  0 << GEN6_VB0_BUFFER_INDEX_SHIFT |
-		  VERTEX_SIZE << VB0_BUFFER_PITCH_SHIFT);
-	OUT_RELOC(batch->bo, I915_GEM_DOMAIN_VERTEX, 0, 0);
-	OUT_RELOC(batch->bo, I915_GEM_DOMAIN_VERTEX, 0, batch->bo->size-1);
-	OUT_BATCH(0);
+	intel_bb_out(ibb, GEN4_3DSTATE_VERTEX_BUFFERS | 3);
+	intel_bb_out(ibb, GEN6_VB0_VERTEXDATA |
+		     0 << GEN6_VB0_BUFFER_INDEX_SHIFT |
+		     VERTEX_SIZE << VB0_BUFFER_PITCH_SHIFT);
+	intel_bb_emit_reloc(ibb, ibb->handle, I915_GEM_DOMAIN_VERTEX, 0,
+			    0, ibb->batch_offset);
+	intel_bb_emit_reloc(ibb, ibb->handle, I915_GEM_DOMAIN_VERTEX, 0,
+			    ibb->size - 1, ibb->batch_offset);
+	intel_bb_out(ibb, 0);
 }
 
-static uint32_t gen6_emit_primitive(struct intel_batchbuffer *batch)
+static uint32_t gen6_emit_primitive(struct intel_bb *ibb)
 {
 	uint32_t offset;
 
-	OUT_BATCH(GEN4_3DPRIMITIVE |
-		  GEN4_3DPRIMITIVE_VERTEX_SEQUENTIAL |
-		  _3DPRIM_RECTLIST << GEN4_3DPRIMITIVE_TOPOLOGY_SHIFT |
-		  0 << 9 |
-		  4);
-	OUT_BATCH(3);	/* vertex count */
-	offset = batch->ptr - batch->buffer;
-	OUT_BATCH(0);	/* vertex_index */
-	OUT_BATCH(1);	/* single instance */
-	OUT_BATCH(0);	/* start instance location */
-	OUT_BATCH(0);	/* index buffer offset, ignored */
+	intel_bb_out(ibb, GEN4_3DPRIMITIVE |
+		     GEN4_3DPRIMITIVE_VERTEX_SEQUENTIAL |
+		     _3DPRIM_RECTLIST << GEN4_3DPRIMITIVE_TOPOLOGY_SHIFT |
+		     0 << 9 |
+		     4);
+	intel_bb_out(ibb, 3);	/* vertex count */
+	offset = intel_bb_offset(ibb);
+	intel_bb_out(ibb, 0);	/* vertex_index */
+	intel_bb_out(ibb, 1);	/* single instance */
+	intel_bb_out(ibb, 0);	/* start instance location */
+	intel_bb_out(ibb, 0);	/* index buffer offset, ignored */
 
 	return offset;
 }
 
-void gen6_render_copyfunc(struct intel_batchbuffer *batch,
-			  drm_intel_context *context,
-			  const struct igt_buf *src, unsigned src_x, unsigned src_y,
-			  unsigned width, unsigned height,
-			  const struct igt_buf *dst, unsigned dst_x, unsigned dst_y)
+void gen6_render_copyfunc(struct intel_bb *ibb,
+			  uint32_t ctx,
+			  struct intel_buf *src,
+			  uint32_t src_x, uint32_t src_y,
+			  uint32_t width, uint32_t height,
+			  struct intel_buf *dst,
+			  uint32_t dst_x, uint32_t dst_y)
 {
 	uint32_t wm_state, wm_kernel, wm_table;
 	uint32_t cc_vp, cc_blend, offset;
 	uint32_t batch_end;
 
 	igt_assert(src->bpp == dst->bpp);
-	intel_batchbuffer_flush_with_context(batch, context);
 
-	batch->ptr = batch->buffer + 1024;
-	intel_batchbuffer_subdata_alloc(batch, 64, 64);
-	wm_table  = gen6_bind_surfaces(batch, src, dst);
-	wm_kernel = gen6_create_kernel(batch);
-	wm_state  = gen6_create_sampler(batch,
+	intel_bb_flush_render_with_context(ibb, ctx);
+
+	intel_bb_add_intel_buf(ibb, dst, true);
+	intel_bb_add_intel_buf(ibb, src, false);
+
+	intel_bb_ptr_set(ibb, 1024 + 64);
+
+	wm_table  = gen6_bind_surfaces(ibb, src, dst);
+	wm_kernel = gen6_create_kernel(ibb);
+	wm_state  = gen6_create_sampler(ibb,
 					SAMPLER_FILTER_NEAREST,
 					SAMPLER_EXTEND_NONE);
 
-	cc_vp = gen6_create_cc_viewport(batch);
-	cc_blend = gen6_create_cc_blend(batch);
+	cc_vp = gen6_create_cc_viewport(ibb);
+	cc_blend = gen6_create_cc_blend(ibb);
 
-	batch->ptr = batch->buffer;
+	intel_bb_ptr_set(ibb, 0);
 
-	gen6_emit_invariant(batch);
-	gen6_emit_state_base_address(batch);
+	gen6_emit_invariant(ibb);
+	gen6_emit_state_base_address(ibb);
 
-	gen6_emit_sip(batch);
-	gen6_emit_urb(batch);
+	gen6_emit_sip(ibb);
+	gen6_emit_urb(ibb);
 
-	gen6_emit_viewports(batch, cc_vp);
-	gen6_emit_vs(batch);
-	gen6_emit_gs(batch);
-	gen6_emit_clip(batch);
-	gen6_emit_wm_constants(batch);
-	gen6_emit_null_depth_buffer(batch);
+	gen6_emit_viewports(ibb, cc_vp);
+	gen6_emit_vs(ibb);
+	gen6_emit_gs(ibb);
+	gen6_emit_clip(ibb);
+	gen6_emit_wm_constants(ibb);
+	gen6_emit_null_depth_buffer(ibb);
 
-	gen6_emit_drawing_rectangle(batch, dst);
-	gen6_emit_cc(batch, cc_blend);
-	gen6_emit_sampler(batch, wm_state);
-	gen6_emit_sf(batch);
-	gen6_emit_wm(batch, wm_kernel);
-	gen6_emit_vertex_elements(batch);
-	gen6_emit_binding_table(batch, wm_table);
+	gen6_emit_drawing_rectangle(ibb, dst);
+	gen6_emit_cc(ibb, cc_blend);
+	gen6_emit_sampler(ibb, wm_state);
+	gen6_emit_sf(ibb);
+	gen6_emit_wm(ibb, wm_kernel);
+	gen6_emit_vertex_elements(ibb);
+	gen6_emit_binding_table(ibb, wm_table);
 
-	gen6_emit_vertex_buffer(batch);
-	offset = gen6_emit_primitive(batch);
+	gen6_emit_vertex_buffer(ibb);
+	offset = gen6_emit_primitive(ibb);
 
-	OUT_BATCH(MI_BATCH_BUFFER_END);
-	batch_end = intel_batchbuffer_align(batch, 8);
+	batch_end = intel_bb_emit_bbe(ibb);
 
-	*(uint32_t*)(batch->buffer + offset) =
-		batch_round_upto(batch, VERTEX_SIZE)/VERTEX_SIZE;
+	ibb->batch[offset / sizeof(uint32_t)] =
+			batch_round_upto(ibb, VERTEX_SIZE)/VERTEX_SIZE;
 
-	emit_vertex_2s(batch, dst_x + width, dst_y + height);
-	emit_vertex_normalized(batch, src_x + width, igt_buf_width(src));
-	emit_vertex_normalized(batch, src_y + height, igt_buf_height(src));
+	emit_vertex_2s(ibb, dst_x + width, dst_y + height);
+	emit_vertex_normalized(ibb, src_x + width, intel_buf_width(src));
+	emit_vertex_normalized(ibb, src_y + height, intel_buf_height(src));
 
-	emit_vertex_2s(batch, dst_x, dst_y + height);
-	emit_vertex_normalized(batch, src_x, igt_buf_width(src));
-	emit_vertex_normalized(batch, src_y + height, igt_buf_height(src));
+	emit_vertex_2s(ibb, dst_x, dst_y + height);
+	emit_vertex_normalized(ibb, src_x, intel_buf_width(src));
+	emit_vertex_normalized(ibb, src_y + height, intel_buf_height(src));
 
-	emit_vertex_2s(batch, dst_x, dst_y);
-	emit_vertex_normalized(batch, src_x, igt_buf_width(src));
-	emit_vertex_normalized(batch, src_y, igt_buf_height(src));
+	emit_vertex_2s(ibb, dst_x, dst_y);
+	emit_vertex_normalized(ibb, src_x, intel_buf_width(src));
+	emit_vertex_normalized(ibb, src_y, intel_buf_height(src));
 
-	gen6_render_flush(batch, context, batch_end);
-	intel_batchbuffer_reset(batch);
+	intel_bb_exec_with_context(ibb, batch_end, ctx,
+				   I915_EXEC_DEFAULT | I915_EXEC_NO_RELOC,
+				   false);
+	intel_bb_reset(ibb, false);
 }
diff --git a/lib/rendercopy_gen7.c b/lib/rendercopy_gen7.c
index 93b4da72..62ef4325 100644
--- a/lib/rendercopy_gen7.c
+++ b/lib/rendercopy_gen7.c
@@ -12,7 +12,7 @@
 #include "drm.h"
 #include "i915_drm.h"
 #include "drmtest.h"
-#include "intel_bufmgr.h"
+#include "intel_bufops.h"
 #include "intel_batchbuffer.h"
 #include "intel_io.h"
 #include "intel_chipset.h"
@@ -20,6 +20,14 @@
 #include "gen7_render.h"
 #include "intel_reg.h"
 
+#if DEBUG_RENDERCPY
+static void dump_batch(struct intel_bb *ibb)
+{
+	intel_bb_dump(ibb, "/tmp/gen7-batchbuffers.dump");
+}
+#else
+#define dump_batch(x) do { } while (0)
+#endif
 
 static const uint32_t ps_kernel[][4] = {
 	{ 0x0080005a, 0x2e2077bd, 0x000000c0, 0x008d0040 },
@@ -32,17 +40,6 @@ static const uint32_t ps_kernel[][4] = {
 	{ 0x05800031, 0x20001fa8, 0x008d0e20, 0x90031000 },
 };
 
-static void
-gen7_render_flush(struct intel_batchbuffer *batch,
-		  drm_intel_context *context, uint32_t batch_end)
-{
-	igt_assert_eq(drm_intel_bo_subdata(batch->bo,
-					   0, 4096, batch->buffer),
-		      0);
-	igt_assert_eq(drm_intel_gem_bo_context_exec(batch->bo, context,
-						    batch_end, 0),
-		      0);
-}
 
 static uint32_t
 gen7_tiling_bits(uint32_t tiling)
@@ -56,17 +53,17 @@ gen7_tiling_bits(uint32_t tiling)
 }
 
 static uint32_t
-gen7_bind_buf(struct intel_batchbuffer *batch,
-	      const struct igt_buf *buf,
+gen7_bind_buf(struct intel_bb *ibb,
+	      const struct intel_buf *buf,
 	      int is_dst)
 {
 	uint32_t format, *ss;
 	uint32_t write_domain, read_domain;
-	int ret;
+	uint64_t address;
 
 	igt_assert_lte(buf->surface[0].stride, 256*1024);
-	igt_assert_lte(igt_buf_width(buf), 16384);
-	igt_assert_lte(igt_buf_height(buf), 16384);
+	igt_assert_lte(intel_buf_width(buf), 16384);
+	igt_assert_lte(intel_buf_height(buf), 16384);
 
 	switch (buf->bpp) {
 		case 8: format = SURFACEFORMAT_R8_UNORM; break;
@@ -83,77 +80,76 @@ gen7_bind_buf(struct intel_batchbuffer *batch,
 		read_domain = I915_GEM_DOMAIN_SAMPLER;
 	}
 
-	ss = intel_batchbuffer_subdata_alloc(batch, 8 * sizeof(*ss), 32);
+	ss = intel_bb_ptr_align(ibb, 32);
 
 	ss[0] = (SURFACE_2D << GEN7_SURFACE_TYPE_SHIFT |
 		 gen7_tiling_bits(buf->tiling) |
 		format << GEN7_SURFACE_FORMAT_SHIFT);
-	ss[1] = buf->bo->offset;
-	ss[2] = ((igt_buf_width(buf) - 1)  << GEN7_SURFACE_WIDTH_SHIFT |
-		 (igt_buf_height(buf) - 1) << GEN7_SURFACE_HEIGHT_SHIFT);
+
+	address = intel_bb_offset_reloc(ibb, buf->handle,
+					read_domain, write_domain,
+					intel_bb_offset(ibb) + 4,
+					buf->addr.offset);
+	ss[1] = address;
+	ss[2] = ((intel_buf_width(buf) - 1)  << GEN7_SURFACE_WIDTH_SHIFT |
+		 (intel_buf_height(buf) - 1) << GEN7_SURFACE_HEIGHT_SHIFT);
 	ss[3] = (buf->surface[0].stride - 1) << GEN7_SURFACE_PITCH_SHIFT;
 	ss[4] = 0;
-	if (IS_VALLEYVIEW(batch->devid))
+	if (IS_VALLEYVIEW(ibb->devid))
 		ss[5] = VLV_MOCS_L3 << 16;
 	else
 		ss[5] = (IVB_MOCS_L3 | IVB_MOCS_PTE) << 16;
 	ss[6] = 0;
 	ss[7] = 0;
-	if (IS_HASWELL(batch->devid))
+	if (IS_HASWELL(ibb->devid))
 		ss[7] |= HSW_SURFACE_SWIZZLE(RED, GREEN, BLUE, ALPHA);
 
-	ret = drm_intel_bo_emit_reloc(batch->bo,
-				      intel_batchbuffer_subdata_offset(batch, &ss[1]),
-				      buf->bo, 0,
-				      read_domain, write_domain);
-	igt_assert(ret == 0);
-
-	return intel_batchbuffer_subdata_offset(batch, ss);
+	return intel_bb_ptr_add_return_prev_offset(ibb, 8 * sizeof(*ss));
 }
 
 static void
-gen7_emit_vertex_elements(struct intel_batchbuffer *batch)
+gen7_emit_vertex_elements(struct intel_bb *ibb)
 {
-	OUT_BATCH(GEN4_3DSTATE_VERTEX_ELEMENTS |
-		  ((2 * (1 + 2)) + 1 - 2));
+	intel_bb_out(ibb, GEN4_3DSTATE_VERTEX_ELEMENTS |
+		     ((2 * (1 + 2)) + 1 - 2));
 
-	OUT_BATCH(0 << GEN6_VE0_VERTEX_BUFFER_INDEX_SHIFT | GEN6_VE0_VALID |
-		  SURFACEFORMAT_R32G32B32A32_FLOAT << VE0_FORMAT_SHIFT |
-		  0 << VE0_OFFSET_SHIFT);
+	intel_bb_out(ibb, 0 << GEN6_VE0_VERTEX_BUFFER_INDEX_SHIFT | GEN6_VE0_VALID |
+		     SURFACEFORMAT_R32G32B32A32_FLOAT << VE0_FORMAT_SHIFT |
+		     0 << VE0_OFFSET_SHIFT);
 
-	OUT_BATCH(GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_0_SHIFT |
-		  GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_1_SHIFT |
-		  GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_2_SHIFT |
-		  GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_3_SHIFT);
+	intel_bb_out(ibb, GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_0_SHIFT |
+		     GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_1_SHIFT |
+		     GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_2_SHIFT |
+		     GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_3_SHIFT);
 
 	/* x,y */
-	OUT_BATCH(0 << GEN6_VE0_VERTEX_BUFFER_INDEX_SHIFT | GEN6_VE0_VALID |
-		  SURFACEFORMAT_R16G16_SSCALED << VE0_FORMAT_SHIFT |
-		  0 << VE0_OFFSET_SHIFT); /* offsets vb in bytes */
-	OUT_BATCH(GEN4_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT |
-		  GEN4_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT |
-		  GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_2_SHIFT |
-		  GEN4_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT);
+	intel_bb_out(ibb, 0 << GEN6_VE0_VERTEX_BUFFER_INDEX_SHIFT | GEN6_VE0_VALID |
+		     SURFACEFORMAT_R16G16_SSCALED << VE0_FORMAT_SHIFT |
+		     0 << VE0_OFFSET_SHIFT); /* offsets vb in bytes */
+	intel_bb_out(ibb, GEN4_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT |
+		     GEN4_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT |
+		     GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_2_SHIFT |
+		     GEN4_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT);
 
 	/* s,t */
-	OUT_BATCH(0 << GEN6_VE0_VERTEX_BUFFER_INDEX_SHIFT | GEN6_VE0_VALID |
-		  SURFACEFORMAT_R16G16_SSCALED << VE0_FORMAT_SHIFT |
-		  4 << VE0_OFFSET_SHIFT);  /* offset vb in bytes */
-	OUT_BATCH(GEN4_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT |
-		  GEN4_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT |
-		  GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_2_SHIFT |
-		  GEN4_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT);
+	intel_bb_out(ibb, 0 << GEN6_VE0_VERTEX_BUFFER_INDEX_SHIFT | GEN6_VE0_VALID |
+		     SURFACEFORMAT_R16G16_SSCALED << VE0_FORMAT_SHIFT |
+		     4 << VE0_OFFSET_SHIFT);  /* offset vb in bytes */
+	intel_bb_out(ibb, GEN4_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT |
+		     GEN4_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT |
+		     GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_2_SHIFT |
+		     GEN4_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT);
 }
 
 static uint32_t
-gen7_create_vertex_buffer(struct intel_batchbuffer *batch,
+gen7_create_vertex_buffer(struct intel_bb *ibb,
 			  uint32_t src_x, uint32_t src_y,
 			  uint32_t dst_x, uint32_t dst_y,
 			  uint32_t width, uint32_t height)
 {
 	uint16_t *v;
 
-	v = intel_batchbuffer_subdata_alloc(batch, 12 * sizeof(*v), 8);
+	v = intel_bb_ptr_align(ibb, 8);
 
 	v[0] = dst_x + width;
 	v[1] = dst_y + height;
@@ -170,66 +166,68 @@ gen7_create_vertex_buffer(struct intel_batchbuffer *batch,
 	v[10] = src_x;
 	v[11] = src_y;
 
-	return intel_batchbuffer_subdata_offset(batch, v);
+	return intel_bb_ptr_add_return_prev_offset(ibb, 12 * sizeof(*v));
 }
 
-static void gen7_emit_vertex_buffer(struct intel_batchbuffer *batch,
+static void gen7_emit_vertex_buffer(struct intel_bb *ibb,
 				    int src_x, int src_y,
 				    int dst_x, int dst_y,
 				    int width, int height,
 				    uint32_t offset)
 {
-	OUT_BATCH(GEN4_3DSTATE_VERTEX_BUFFERS | (5 - 2));
-	OUT_BATCH(0 << GEN6_VB0_BUFFER_INDEX_SHIFT |
-		  GEN6_VB0_VERTEXDATA |
-		  GEN7_VB0_ADDRESS_MODIFY_ENABLE |
-		  4 * 2 << VB0_BUFFER_PITCH_SHIFT);
-
-	OUT_RELOC(batch->bo, I915_GEM_DOMAIN_VERTEX, 0, offset);
-	OUT_BATCH(~0);
-	OUT_BATCH(0);
+	intel_bb_out(ibb, GEN4_3DSTATE_VERTEX_BUFFERS | (5 - 2));
+	intel_bb_out(ibb, 0 << GEN6_VB0_BUFFER_INDEX_SHIFT |
+		     GEN6_VB0_VERTEXDATA |
+		     GEN7_VB0_ADDRESS_MODIFY_ENABLE |
+		     4 * 2 << VB0_BUFFER_PITCH_SHIFT);
+
+	intel_bb_emit_reloc(ibb, ibb->handle, I915_GEM_DOMAIN_VERTEX, 0,
+			    offset, ibb->batch_offset);
+	intel_bb_out(ibb, ~0);
+	intel_bb_out(ibb, 0);
 }
 
 static uint32_t
-gen7_bind_surfaces(struct intel_batchbuffer *batch,
-		   const struct igt_buf *src,
-		   const struct igt_buf *dst)
+gen7_bind_surfaces(struct intel_bb *ibb,
+		   const struct intel_buf *src,
+		   const struct intel_buf *dst)
 {
-	uint32_t *binding_table;
+	uint32_t *binding_table, binding_table_offset;
 
-	binding_table = intel_batchbuffer_subdata_alloc(batch, 8, 32);
+	binding_table = intel_bb_ptr_align(ibb, 32);
+	binding_table_offset = intel_bb_ptr_add_return_prev_offset(ibb, 8);
 
-	binding_table[0] = gen7_bind_buf(batch, dst, 1);
-	binding_table[1] = gen7_bind_buf(batch, src, 0);
+	binding_table[0] = gen7_bind_buf(ibb, dst, 1);
+	binding_table[1] = gen7_bind_buf(ibb, src, 0);
 
-	return intel_batchbuffer_subdata_offset(batch, binding_table);
+	return binding_table_offset;
 }
 
 static void
-gen7_emit_binding_table(struct intel_batchbuffer *batch,
-			const struct igt_buf *src,
-			const struct igt_buf *dst,
+gen7_emit_binding_table(struct intel_bb *ibb,
+			const struct intel_buf *src,
+			const struct intel_buf *dst,
 			uint32_t bind_surf_off)
 {
-	OUT_BATCH(GEN7_3DSTATE_BINDING_TABLE_POINTERS_PS | (2 - 2));
-	OUT_BATCH(bind_surf_off);
+	intel_bb_out(ibb, GEN7_3DSTATE_BINDING_TABLE_POINTERS_PS | (2 - 2));
+	intel_bb_out(ibb, bind_surf_off);
 }
 
 static void
-gen7_emit_drawing_rectangle(struct intel_batchbuffer *batch, const struct igt_buf *dst)
+gen7_emit_drawing_rectangle(struct intel_bb *ibb, const struct intel_buf *dst)
 {
-	OUT_BATCH(GEN4_3DSTATE_DRAWING_RECTANGLE | (4 - 2));
-	OUT_BATCH(0);
-	OUT_BATCH((igt_buf_height(dst) - 1) << 16 | (igt_buf_width(dst) - 1));
-	OUT_BATCH(0);
+	intel_bb_out(ibb, GEN4_3DSTATE_DRAWING_RECTANGLE | (4 - 2));
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, (intel_buf_height(dst) - 1) << 16 | (intel_buf_width(dst) - 1));
+	intel_bb_out(ibb, 0);
 }
 
 static uint32_t
-gen7_create_blend_state(struct intel_batchbuffer *batch)
+gen7_create_blend_state(struct intel_bb *ibb)
 {
 	struct gen6_blend_state *blend;
 
-	blend = intel_batchbuffer_subdata_alloc(batch, sizeof(*blend), 64);
+	blend = intel_bb_ptr_align(ibb, 64);
 
 	blend->blend0.dest_blend_factor = GEN6_BLENDFACTOR_ZERO;
 	blend->blend0.source_blend_factor = GEN6_BLENDFACTOR_ONE;
@@ -237,54 +235,61 @@ gen7_create_blend_state(struct intel_batchbuffer *batch)
 	blend->blend1.post_blend_clamp_enable = 1;
 	blend->blend1.pre_blend_clamp_enable = 1;
 
-	return intel_batchbuffer_subdata_offset(batch, blend);
+	return intel_bb_ptr_add_return_prev_offset(ibb, sizeof(*blend));
 }
 
 static void
-gen7_emit_state_base_address(struct intel_batchbuffer *batch)
+gen7_emit_state_base_address(struct intel_bb *ibb)
 {
-	OUT_BATCH(GEN4_STATE_BASE_ADDRESS | (10 - 2));
-	OUT_BATCH(0);
-	OUT_RELOC(batch->bo, I915_GEM_DOMAIN_INSTRUCTION, 0, BASE_ADDRESS_MODIFY);
-	OUT_RELOC(batch->bo, I915_GEM_DOMAIN_INSTRUCTION, 0, BASE_ADDRESS_MODIFY);
-	OUT_BATCH(0);
-	OUT_RELOC(batch->bo, I915_GEM_DOMAIN_INSTRUCTION, 0, BASE_ADDRESS_MODIFY);
-
-	OUT_BATCH(0);
-	OUT_BATCH(0 | BASE_ADDRESS_MODIFY);
-	OUT_BATCH(0);
-	OUT_BATCH(0 | BASE_ADDRESS_MODIFY);
+	intel_bb_out(ibb, GEN4_STATE_BASE_ADDRESS | (10 - 2));
+	intel_bb_out(ibb, 0);
+
+	intel_bb_emit_reloc(ibb, ibb->handle, I915_GEM_DOMAIN_INSTRUCTION, 0,
+			    BASE_ADDRESS_MODIFY,
+			    ibb->batch_offset);
+	intel_bb_emit_reloc(ibb, ibb->handle, I915_GEM_DOMAIN_INSTRUCTION, 0,
+			    BASE_ADDRESS_MODIFY,
+			    ibb->batch_offset);
+	intel_bb_out(ibb, 0);
+	intel_bb_emit_reloc(ibb, ibb->handle, I915_GEM_DOMAIN_INSTRUCTION, 0,
+			    BASE_ADDRESS_MODIFY,
+			    ibb->batch_offset);
+
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0 | BASE_ADDRESS_MODIFY);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0 | BASE_ADDRESS_MODIFY);
 }
 
 static uint32_t
-gen7_create_cc_viewport(struct intel_batchbuffer *batch)
+gen7_create_cc_viewport(struct intel_bb *ibb)
 {
 	struct gen4_cc_viewport *vp;
 
-	vp = intel_batchbuffer_subdata_alloc(batch, sizeof(*vp), 32);
+	vp = intel_bb_ptr_align(ibb, 32);
 	vp->min_depth = -1.e35;
 	vp->max_depth = 1.e35;
 
-	return intel_batchbuffer_subdata_offset(batch, vp);
+	return intel_bb_ptr_add_return_prev_offset(ibb, sizeof(*vp));
 }
 
 static void
-gen7_emit_cc(struct intel_batchbuffer *batch, uint32_t blend_state,
+gen7_emit_cc(struct intel_bb *ibb, uint32_t blend_state,
 	     uint32_t cc_viewport)
 {
-	OUT_BATCH(GEN7_3DSTATE_BLEND_STATE_POINTERS | (2 - 2));
-	OUT_BATCH(blend_state);
+	intel_bb_out(ibb, GEN7_3DSTATE_BLEND_STATE_POINTERS | (2 - 2));
+	intel_bb_out(ibb, blend_state);
 
-	OUT_BATCH(GEN7_3DSTATE_VIEWPORT_STATE_POINTERS_CC | (2 - 2));
-	OUT_BATCH(cc_viewport);
+	intel_bb_out(ibb, GEN7_3DSTATE_VIEWPORT_STATE_POINTERS_CC | (2 - 2));
+	intel_bb_out(ibb, cc_viewport);
 }
 
 static uint32_t
-gen7_create_sampler(struct intel_batchbuffer *batch)
+gen7_create_sampler(struct intel_bb *ibb)
 {
 	struct gen7_sampler_state *ss;
 
-	ss = intel_batchbuffer_subdata_alloc(batch, sizeof(*ss), 32);
+	ss = intel_bb_ptr_align(ibb, 32);
 
 	ss->ss0.min_filter = GEN4_MAPFILTER_NEAREST;
 	ss->ss0.mag_filter = GEN4_MAPFILTER_NEAREST;
@@ -295,283 +300,284 @@ gen7_create_sampler(struct intel_batchbuffer *batch)
 
 	ss->ss3.non_normalized_coord = 1;
 
-	return intel_batchbuffer_subdata_offset(batch, ss);
+	return intel_bb_ptr_add_return_prev_offset(ibb, sizeof(*ss));
 }
 
 static void
-gen7_emit_sampler(struct intel_batchbuffer *batch, uint32_t sampler_off)
+gen7_emit_sampler(struct intel_bb *ibb, uint32_t sampler_off)
 {
-	OUT_BATCH(GEN7_3DSTATE_SAMPLER_STATE_POINTERS_PS | (2 - 2));
-	OUT_BATCH(sampler_off);
+	intel_bb_out(ibb, GEN7_3DSTATE_SAMPLER_STATE_POINTERS_PS | (2 - 2));
+	intel_bb_out(ibb, sampler_off);
 }
 
 static void
-gen7_emit_multisample(struct intel_batchbuffer *batch)
+gen7_emit_multisample(struct intel_bb *ibb)
 {
-	OUT_BATCH(GEN6_3DSTATE_MULTISAMPLE | (4 - 2));
-	OUT_BATCH(GEN6_3DSTATE_MULTISAMPLE_PIXEL_LOCATION_CENTER |
-		  GEN6_3DSTATE_MULTISAMPLE_NUMSAMPLES_1); /* 1 sample/pixel */
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-
-	OUT_BATCH(GEN6_3DSTATE_SAMPLE_MASK | (2 - 2));
-	OUT_BATCH(1);
+	intel_bb_out(ibb, GEN6_3DSTATE_MULTISAMPLE | (4 - 2));
+	intel_bb_out(ibb, GEN6_3DSTATE_MULTISAMPLE_PIXEL_LOCATION_CENTER |
+		     GEN6_3DSTATE_MULTISAMPLE_NUMSAMPLES_1); /* 1 sample/pixel */
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+
+	intel_bb_out(ibb, GEN6_3DSTATE_SAMPLE_MASK | (2 - 2));
+	intel_bb_out(ibb, 1);
 }
 
 static void
-gen7_emit_urb(struct intel_batchbuffer *batch)
+gen7_emit_urb(struct intel_bb *ibb)
 {
-	OUT_BATCH(GEN7_3DSTATE_PUSH_CONSTANT_ALLOC_PS | (2 - 2));
-	OUT_BATCH(8); /* in 1KBs */
+	intel_bb_out(ibb, GEN7_3DSTATE_PUSH_CONSTANT_ALLOC_PS | (2 - 2));
+	intel_bb_out(ibb, 8); /* in 1KBs */
 
 	/* num of VS entries must be divisible by 8 if size < 9 */
-	OUT_BATCH(GEN7_3DSTATE_URB_VS | (2 - 2));
-	OUT_BATCH((64 << GEN7_URB_ENTRY_NUMBER_SHIFT) |
-		  (2 - 1) << GEN7_URB_ENTRY_SIZE_SHIFT |
-		  (1 << GEN7_URB_STARTING_ADDRESS_SHIFT));
-
-	OUT_BATCH(GEN7_3DSTATE_URB_HS | (2 - 2));
-	OUT_BATCH((0 << GEN7_URB_ENTRY_SIZE_SHIFT) |
-		  (2 << GEN7_URB_STARTING_ADDRESS_SHIFT));
-
-	OUT_BATCH(GEN7_3DSTATE_URB_DS | (2 - 2));
-	OUT_BATCH((0 << GEN7_URB_ENTRY_SIZE_SHIFT) |
-		  (2 << GEN7_URB_STARTING_ADDRESS_SHIFT));
-
-	OUT_BATCH(GEN7_3DSTATE_URB_GS | (2 - 2));
-	OUT_BATCH((0 << GEN7_URB_ENTRY_SIZE_SHIFT) |
-		  (1 << GEN7_URB_STARTING_ADDRESS_SHIFT));
+	intel_bb_out(ibb, GEN7_3DSTATE_URB_VS | (2 - 2));
+	intel_bb_out(ibb, (64 << GEN7_URB_ENTRY_NUMBER_SHIFT) |
+		     (2 - 1) << GEN7_URB_ENTRY_SIZE_SHIFT |
+		     (1 << GEN7_URB_STARTING_ADDRESS_SHIFT));
+
+	intel_bb_out(ibb, GEN7_3DSTATE_URB_HS | (2 - 2));
+	intel_bb_out(ibb, (0 << GEN7_URB_ENTRY_SIZE_SHIFT) |
+		     (2 << GEN7_URB_STARTING_ADDRESS_SHIFT));
+
+	intel_bb_out(ibb, GEN7_3DSTATE_URB_DS | (2 - 2));
+	intel_bb_out(ibb, (0 << GEN7_URB_ENTRY_SIZE_SHIFT) |
+		     (2 << GEN7_URB_STARTING_ADDRESS_SHIFT));
+
+	intel_bb_out(ibb, GEN7_3DSTATE_URB_GS | (2 - 2));
+	intel_bb_out(ibb, (0 << GEN7_URB_ENTRY_SIZE_SHIFT) |
+		     (1 << GEN7_URB_STARTING_ADDRESS_SHIFT));
 }
 
 static void
-gen7_emit_vs(struct intel_batchbuffer *batch)
+gen7_emit_vs(struct intel_bb *ibb)
 {
-	OUT_BATCH(GEN6_3DSTATE_VS | (6 - 2));
-	OUT_BATCH(0); /* no VS kernel */
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0); /* pass-through */
+	intel_bb_out(ibb, GEN6_3DSTATE_VS | (6 - 2));
+	intel_bb_out(ibb, 0); /* no VS kernel */
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0); /* pass-through */
 }
 
 static void
-gen7_emit_hs(struct intel_batchbuffer *batch)
+gen7_emit_hs(struct intel_bb *ibb)
 {
-	OUT_BATCH(GEN7_3DSTATE_HS | (7 - 2));
-	OUT_BATCH(0); /* no HS kernel */
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0); /* pass-through */
+	intel_bb_out(ibb, GEN7_3DSTATE_HS | (7 - 2));
+	intel_bb_out(ibb, 0); /* no HS kernel */
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0); /* pass-through */
 }
 
 static void
-gen7_emit_te(struct intel_batchbuffer *batch)
+gen7_emit_te(struct intel_bb *ibb)
 {
-	OUT_BATCH(GEN7_3DSTATE_TE | (4 - 2));
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
+	intel_bb_out(ibb, GEN7_3DSTATE_TE | (4 - 2));
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
 }
 
 static void
-gen7_emit_ds(struct intel_batchbuffer *batch)
+gen7_emit_ds(struct intel_bb *ibb)
 {
-	OUT_BATCH(GEN7_3DSTATE_DS | (6 - 2));
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
+	intel_bb_out(ibb, GEN7_3DSTATE_DS | (6 - 2));
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
 }
 
 static void
-gen7_emit_gs(struct intel_batchbuffer *batch)
+gen7_emit_gs(struct intel_bb *ibb)
 {
-	OUT_BATCH(GEN6_3DSTATE_GS | (7 - 2));
-	OUT_BATCH(0); /* no GS kernel */
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0); /* pass-through  */
+	intel_bb_out(ibb, GEN6_3DSTATE_GS | (7 - 2));
+	intel_bb_out(ibb, 0); /* no GS kernel */
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0); /* pass-through  */
 }
 
 static void
-gen7_emit_streamout(struct intel_batchbuffer *batch)
+gen7_emit_streamout(struct intel_bb *ibb)
 {
-	OUT_BATCH(GEN7_3DSTATE_STREAMOUT | (3 - 2));
-	OUT_BATCH(0);
-	OUT_BATCH(0);
+	intel_bb_out(ibb, GEN7_3DSTATE_STREAMOUT | (3 - 2));
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
 }
 
 static void
-gen7_emit_sf(struct intel_batchbuffer *batch)
+gen7_emit_sf(struct intel_bb *ibb)
 {
-	OUT_BATCH(GEN6_3DSTATE_SF | (7 - 2));
-	OUT_BATCH(0);
-	OUT_BATCH(GEN6_3DSTATE_SF_CULL_NONE);
-	OUT_BATCH(2 << GEN6_3DSTATE_SF_TRIFAN_PROVOKE_SHIFT);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
+	intel_bb_out(ibb, GEN6_3DSTATE_SF | (7 - 2));
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, GEN6_3DSTATE_SF_CULL_NONE);
+	intel_bb_out(ibb, 2 << GEN6_3DSTATE_SF_TRIFAN_PROVOKE_SHIFT);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
 }
 
 static void
-gen7_emit_sbe(struct intel_batchbuffer *batch)
+gen7_emit_sbe(struct intel_bb *ibb)
 {
-	OUT_BATCH(GEN7_3DSTATE_SBE | (14 - 2));
-	OUT_BATCH(1 << GEN7_SBE_NUM_OUTPUTS_SHIFT |
-		  1 << GEN7_SBE_URB_ENTRY_READ_LENGTH_SHIFT |
-		  1 << GEN7_SBE_URB_ENTRY_READ_OFFSET_SHIFT);
-	OUT_BATCH(0);
-	OUT_BATCH(0); /* dw4 */
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0); /* dw8 */
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0); /* dw12 */
-	OUT_BATCH(0);
-	OUT_BATCH(0);
+	intel_bb_out(ibb, GEN7_3DSTATE_SBE | (14 - 2));
+	intel_bb_out(ibb, 1 << GEN7_SBE_NUM_OUTPUTS_SHIFT |
+		     1 << GEN7_SBE_URB_ENTRY_READ_LENGTH_SHIFT |
+		     1 << GEN7_SBE_URB_ENTRY_READ_OFFSET_SHIFT);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0); /* dw4 */
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0); /* dw8 */
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0); /* dw12 */
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
 }
 
 static void
-gen7_emit_ps(struct intel_batchbuffer *batch, uint32_t kernel_off)
+gen7_emit_ps(struct intel_bb *ibb, uint32_t kernel_off)
 {
 	int threads;
 
-	if (IS_HASWELL(batch->devid))
+	if (IS_HASWELL(ibb->devid))
 		threads = 40 << HSW_PS_MAX_THREADS_SHIFT | 1 << HSW_PS_SAMPLE_MASK_SHIFT;
 	else
 		threads = 40 << IVB_PS_MAX_THREADS_SHIFT;
 
-	OUT_BATCH(GEN7_3DSTATE_PS | (8 - 2));
-	OUT_BATCH(kernel_off);
-	OUT_BATCH(1 << GEN7_PS_SAMPLER_COUNT_SHIFT |
-		  2 << GEN7_PS_BINDING_TABLE_ENTRY_COUNT_SHIFT);
-	OUT_BATCH(0); /* scratch address */
-	OUT_BATCH(threads |
-		  GEN7_PS_16_DISPATCH_ENABLE |
-		  GEN7_PS_ATTRIBUTE_ENABLE);
-	OUT_BATCH(6 << GEN7_PS_DISPATCH_START_GRF_SHIFT_0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
+	intel_bb_out(ibb, GEN7_3DSTATE_PS | (8 - 2));
+	intel_bb_out(ibb, kernel_off);
+	intel_bb_out(ibb, 1 << GEN7_PS_SAMPLER_COUNT_SHIFT |
+		     2 << GEN7_PS_BINDING_TABLE_ENTRY_COUNT_SHIFT);
+	intel_bb_out(ibb, 0); /* scratch address */
+	intel_bb_out(ibb, threads |
+		     GEN7_PS_16_DISPATCH_ENABLE |
+		     GEN7_PS_ATTRIBUTE_ENABLE);
+	intel_bb_out(ibb, 6 << GEN7_PS_DISPATCH_START_GRF_SHIFT_0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
 }
 
 static void
-gen7_emit_clip(struct intel_batchbuffer *batch)
+gen7_emit_clip(struct intel_bb *ibb)
 {
-	OUT_BATCH(GEN6_3DSTATE_CLIP | (4 - 2));
-	OUT_BATCH(0);
-	OUT_BATCH(0); /* pass-through */
-	OUT_BATCH(0);
+	intel_bb_out(ibb, GEN6_3DSTATE_CLIP | (4 - 2));
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0); /* pass-through */
+	intel_bb_out(ibb, 0);
 
-	OUT_BATCH(GEN7_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CL | (2 - 2));
-	OUT_BATCH(0);
+	intel_bb_out(ibb, GEN7_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CL | (2 - 2));
+	intel_bb_out(ibb, 0);
 }
 
 static void
-gen7_emit_wm(struct intel_batchbuffer *batch)
+gen7_emit_wm(struct intel_bb *ibb)
 {
-	OUT_BATCH(GEN6_3DSTATE_WM | (3 - 2));
-	OUT_BATCH(GEN7_WM_DISPATCH_ENABLE |
-		GEN7_WM_PERSPECTIVE_PIXEL_BARYCENTRIC);
-	OUT_BATCH(0);
+	intel_bb_out(ibb, GEN6_3DSTATE_WM | (3 - 2));
+	intel_bb_out(ibb, GEN7_WM_DISPATCH_ENABLE |
+		     GEN7_WM_PERSPECTIVE_PIXEL_BARYCENTRIC);
+	intel_bb_out(ibb, 0);
 }
 
 static void
-gen7_emit_null_depth_buffer(struct intel_batchbuffer *batch)
+gen7_emit_null_depth_buffer(struct intel_bb *ibb)
 {
-	OUT_BATCH(GEN7_3DSTATE_DEPTH_BUFFER | (7 - 2));
-	OUT_BATCH(SURFACE_NULL << GEN4_3DSTATE_DEPTH_BUFFER_TYPE_SHIFT |
-		  GEN4_DEPTHFORMAT_D32_FLOAT << GEN4_3DSTATE_DEPTH_BUFFER_FORMAT_SHIFT);
-	OUT_BATCH(0); /* disable depth, stencil and hiz */
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-
-	OUT_BATCH(GEN7_3DSTATE_CLEAR_PARAMS | (3 - 2));
-	OUT_BATCH(0);
-	OUT_BATCH(0);
+	intel_bb_out(ibb, GEN7_3DSTATE_DEPTH_BUFFER | (7 - 2));
+	intel_bb_out(ibb, SURFACE_NULL << GEN4_3DSTATE_DEPTH_BUFFER_TYPE_SHIFT |
+		     GEN4_DEPTHFORMAT_D32_FLOAT << GEN4_3DSTATE_DEPTH_BUFFER_FORMAT_SHIFT);
+	intel_bb_out(ibb, 0); /* disable depth, stencil and hiz */
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+
+	intel_bb_out(ibb, GEN7_3DSTATE_CLEAR_PARAMS | (3 - 2));
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
 }
 
 #define BATCH_STATE_SPLIT 2048
-void gen7_render_copyfunc(struct intel_batchbuffer *batch,
-			  drm_intel_context *context,
-			  const struct igt_buf *src, unsigned src_x, unsigned src_y,
-			  unsigned width, unsigned height,
-			  const struct igt_buf *dst, unsigned dst_x, unsigned dst_y)
+void gen7_render_copyfunc(struct intel_bb *ibb,
+			  uint32_t ctx,
+			  struct intel_buf *src,
+			  uint32_t src_x, uint32_t src_y,
+			  uint32_t width, uint32_t height,
+			  struct intel_buf *dst,
+			  uint32_t dst_x, uint32_t dst_y)
 {
 	uint32_t ps_binding_table, ps_sampler_off, ps_kernel_off;
 	uint32_t blend_state, cc_viewport;
 	uint32_t vertex_buffer;
-	uint32_t batch_end;
 
 	igt_assert(src->bpp == dst->bpp);
-	intel_batchbuffer_flush_with_context(batch, context);
 
-	batch->ptr = &batch->buffer[BATCH_STATE_SPLIT];
+	intel_bb_flush_render_with_context(ibb, ctx);
+
+	intel_bb_add_intel_buf(ibb, dst, true);
+	intel_bb_add_intel_buf(ibb, src, false);
 
+	intel_bb_ptr_set(ibb, BATCH_STATE_SPLIT);
 
-	blend_state = gen7_create_blend_state(batch);
-	cc_viewport = gen7_create_cc_viewport(batch);
-	ps_sampler_off = gen7_create_sampler(batch);
-	ps_kernel_off = intel_batchbuffer_copy_data(batch, ps_kernel,
-						    sizeof(ps_kernel), 64);
-	vertex_buffer = gen7_create_vertex_buffer(batch,
+	blend_state = gen7_create_blend_state(ibb);
+	cc_viewport = gen7_create_cc_viewport(ibb);
+	ps_sampler_off = gen7_create_sampler(ibb);
+	ps_kernel_off = intel_bb_copy_data(ibb, ps_kernel,
+					   sizeof(ps_kernel), 64);
+	vertex_buffer = gen7_create_vertex_buffer(ibb,
 						  src_x, src_y,
 						  dst_x, dst_y,
 						  width, height);
-	ps_binding_table = gen7_bind_surfaces(batch, src, dst);
-
-	igt_assert(batch->ptr < &batch->buffer[4095]);
-
-	batch->ptr = batch->buffer;
-	OUT_BATCH(G4X_PIPELINE_SELECT | PIPELINE_SELECT_3D);
-
-	gen7_emit_state_base_address(batch);
-	gen7_emit_multisample(batch);
-	gen7_emit_urb(batch);
-	gen7_emit_vs(batch);
-	gen7_emit_hs(batch);
-	gen7_emit_te(batch);
-	gen7_emit_ds(batch);
-	gen7_emit_gs(batch);
-	gen7_emit_clip(batch);
-	gen7_emit_sf(batch);
-	gen7_emit_wm(batch);
-	gen7_emit_streamout(batch);
-	gen7_emit_null_depth_buffer(batch);
-	gen7_emit_cc(batch, blend_state, cc_viewport);
-	gen7_emit_sampler(batch, ps_sampler_off);
-	gen7_emit_sbe(batch);
-	gen7_emit_ps(batch, ps_kernel_off);
-	gen7_emit_vertex_elements(batch);
-	gen7_emit_vertex_buffer(batch, src_x, src_y,
+	ps_binding_table = gen7_bind_surfaces(ibb, src, dst);
+
+	intel_bb_ptr_set(ibb, 0);
+
+	intel_bb_out(ibb, G4X_PIPELINE_SELECT | PIPELINE_SELECT_3D);
+
+	gen7_emit_state_base_address(ibb);
+	gen7_emit_multisample(ibb);
+	gen7_emit_urb(ibb);
+	gen7_emit_vs(ibb);
+	gen7_emit_hs(ibb);
+	gen7_emit_te(ibb);
+	gen7_emit_ds(ibb);
+	gen7_emit_gs(ibb);
+	gen7_emit_clip(ibb);
+	gen7_emit_sf(ibb);
+	gen7_emit_wm(ibb);
+	gen7_emit_streamout(ibb);
+	gen7_emit_null_depth_buffer(ibb);
+	gen7_emit_cc(ibb, blend_state, cc_viewport);
+	gen7_emit_sampler(ibb, ps_sampler_off);
+	gen7_emit_sbe(ibb);
+	gen7_emit_ps(ibb, ps_kernel_off);
+	gen7_emit_vertex_elements(ibb);
+	gen7_emit_vertex_buffer(ibb, src_x, src_y,
 				dst_x, dst_y, width,
 				height, vertex_buffer);
-	gen7_emit_binding_table(batch, src, dst, ps_binding_table);
-	gen7_emit_drawing_rectangle(batch, dst);
-
-	OUT_BATCH(GEN4_3DPRIMITIVE | (7 - 2));
-	OUT_BATCH(GEN4_3DPRIMITIVE_VERTEX_SEQUENTIAL | _3DPRIM_RECTLIST);
-	OUT_BATCH(3);
-	OUT_BATCH(0);
-	OUT_BATCH(1);   /* single instance */
-	OUT_BATCH(0);   /* start instance location */
-	OUT_BATCH(0);   /* index buffer offset, ignored */
-
-	OUT_BATCH(MI_BATCH_BUFFER_END);
-
-	batch_end = batch->ptr - batch->buffer;
-	batch_end = ALIGN(batch_end, 8);
-	igt_assert(batch_end < BATCH_STATE_SPLIT);
-
-	gen7_render_flush(batch, context, batch_end);
-	intel_batchbuffer_reset(batch);
+	gen7_emit_binding_table(ibb, src, dst, ps_binding_table);
+	gen7_emit_drawing_rectangle(ibb, dst);
+
+	intel_bb_out(ibb, GEN4_3DPRIMITIVE | (7 - 2));
+	intel_bb_out(ibb, GEN4_3DPRIMITIVE_VERTEX_SEQUENTIAL | _3DPRIM_RECTLIST);
+	intel_bb_out(ibb, 3);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 1);   /* single instance */
+	intel_bb_out(ibb, 0);   /* start instance location */
+	intel_bb_out(ibb, 0);   /* index buffer offset, ignored */
+
+	intel_bb_emit_bbe(ibb);
+	intel_bb_exec_with_context(ibb, intel_bb_offset(ibb), ctx,
+				   I915_EXEC_DEFAULT | I915_EXEC_NO_RELOC,
+				   false);
+	dump_batch(ibb);
+	intel_bb_reset(ibb, false);
 }
diff --git a/lib/rendercopy_gen8.c b/lib/rendercopy_gen8.c
index 75005d0b..95c3c497 100644
--- a/lib/rendercopy_gen8.c
+++ b/lib/rendercopy_gen8.c
@@ -14,7 +14,7 @@
 #include <i915_drm.h>
 
 #include "drmtest.h"
-#include "intel_bufmgr.h"
+#include "intel_bufops.h"
 #include "intel_batchbuffer.h"
 #include "intel_chipset.h"
 #include "intel_io.h"
@@ -23,17 +23,12 @@
 #include "intel_reg.h"
 #include "igt_aux.h"
 
-#include "intel_aub.h"
-
 #define VERTEX_SIZE (3*4)
 
 #if DEBUG_RENDERCPY
-static void dump_batch(struct intel_batchbuffer *batch) {
-	int fd = open("/tmp/i965-batchbuffers.dump", O_WRONLY | O_CREAT,  0666);
-	if (fd != -1) {
-		igt_assert_eq(write(fd, batch->buffer, 4096), 4096);
-		fd = close(fd);
-	}
+static void dump_batch(struct intel_bb *ibb)
+{
+	intel_bb_dump(ibb, "/tmp/gen8-batchbuffers.dump");
 }
 #else
 #define dump_batch(x) do { } while(0)
@@ -70,89 +65,18 @@ static const uint32_t ps_kernel[][4] = {
 #endif
 };
 
-/* AUB annotation support */
-#define MAX_ANNOTATIONS	33
-struct annotations_context {
-	drm_intel_aub_annotation annotations[MAX_ANNOTATIONS];
-	int index;
-	uint32_t offset;
-};
-
-static void annotation_init(struct annotations_context *aub)
-{
-	/* aub->annotations is an array keeping a list of annotations of the
-	 * batch buffer ordered by offset. aub->annotations[0] is thus left
-	 * for the command stream and will be filled just before executing
-	 * the batch buffer with annotations_add_batch() */
-	aub->index = 1;
-}
-
-static void add_annotation(drm_intel_aub_annotation *a,
-			   uint32_t type, uint32_t subtype,
-			   uint32_t ending_offset)
-{
-	a->type = type;
-	a->subtype = subtype;
-	a->ending_offset = ending_offset;
-}
-
-static void annotation_add_batch(struct annotations_context *aub, size_t size)
-{
-	add_annotation(&aub->annotations[0], AUB_TRACE_TYPE_BATCH, 0, size);
-}
-
-static void annotation_add_state(struct annotations_context *aub,
-				 uint32_t state_type,
-				 uint32_t start_offset,
-				 size_t   size)
-{
-	igt_assert(aub->index < MAX_ANNOTATIONS);
-
-	add_annotation(&aub->annotations[aub->index++],
-		       AUB_TRACE_TYPE_NOTYPE, 0,
-		       start_offset);
-	add_annotation(&aub->annotations[aub->index++],
-		       AUB_TRACE_TYPE(state_type),
-		       AUB_TRACE_SUBTYPE(state_type),
-		       start_offset + size);
-}
-
-static void annotation_flush(struct annotations_context *aub,
-			     struct intel_batchbuffer *batch)
-{
-	if (!igt_aub_dump_enabled())
-		return;
-
-	drm_intel_bufmgr_gem_set_aub_annotations(batch->bo,
-						 aub->annotations,
-						 aub->index);
-}
-
-static void
-gen6_render_flush(struct intel_batchbuffer *batch,
-		  drm_intel_context *context, uint32_t batch_end)
-{
-	igt_assert_eq(drm_intel_bo_subdata(batch->bo,
-					   0, 4096, batch->buffer),
-		      0);
-	igt_assert_eq(drm_intel_gem_bo_context_exec(batch->bo, context,
-						    batch_end, 0),
-		      0);
-}
-
 /* Mostly copy+paste from gen6, except height, width, pitch moved */
 static uint32_t
-gen8_bind_buf(struct intel_batchbuffer *batch,
-	      struct annotations_context *aub,
-	      const struct igt_buf *buf, int is_dst)
+gen8_bind_buf(struct intel_bb *ibb,
+	      const struct intel_buf *buf, int is_dst)
 {
 	struct gen8_surface_state *ss;
-	uint32_t write_domain, read_domain, offset;
-	int ret;
+	uint32_t write_domain, read_domain;
+	uint64_t address;
 
 	igt_assert_lte(buf->surface[0].stride, 256*1024);
-	igt_assert_lte(igt_buf_width(buf), 16384);
-	igt_assert_lte(igt_buf_height(buf), 16384);
+	igt_assert_lte(intel_buf_width(buf), 16384);
+	igt_assert_lte(intel_buf_height(buf), 16384);
 
 	if (is_dst) {
 		write_domain = read_domain = I915_GEM_DOMAIN_RENDER;
@@ -161,9 +85,7 @@ gen8_bind_buf(struct intel_batchbuffer *batch,
 		read_domain = I915_GEM_DOMAIN_SAMPLER;
 	}
 
-	ss = intel_batchbuffer_subdata_alloc(batch, sizeof(*ss), 64);
-	offset = intel_batchbuffer_subdata_offset(batch, ss);
-	annotation_add_state(aub, AUB_TRACE_SURFACE_STATE, offset, sizeof(*ss));
+	ss = intel_bb_ptr_align(ibb, 64);
 
 	ss->ss0.surface_type = SURFACE_2D;
 	switch (buf->bpp) {
@@ -181,23 +103,21 @@ gen8_bind_buf(struct intel_batchbuffer *batch,
 	else if (buf->tiling == I915_TILING_Y)
 		ss->ss0.tiled_mode = 3;
 
-	if (IS_CHERRYVIEW(batch->devid))
+	if (IS_CHERRYVIEW(ibb->devid))
 		ss->ss1.memory_object_control = CHV_MOCS_WB | CHV_MOCS_L3;
 	else
 		ss->ss1.memory_object_control = BDW_MOCS_PTE |
 			BDW_MOCS_TC_L3_PTE | BDW_MOCS_AGE(0);
 
-	ss->ss8.base_addr = buf->bo->offset64;
-	ss->ss9.base_addr_hi = buf->bo->offset64 >> 32;
-
-	ret = drm_intel_bo_emit_reloc(batch->bo,
-				      intel_batchbuffer_subdata_offset(batch, &ss->ss8),
-				      buf->bo, 0,
-				      read_domain, write_domain);
-	igt_assert(ret == 0);
+	address = intel_bb_offset_reloc(ibb, buf->handle,
+					read_domain, write_domain,
+					intel_bb_offset(ibb) + 4 * 8,
+					buf->addr.offset);
+	ss->ss8.base_addr = address;
+	ss->ss9.base_addr_hi = address >> 32;
 
-	ss->ss2.height = igt_buf_height(buf) - 1;
-	ss->ss2.width  = igt_buf_width(buf) - 1;
+	ss->ss2.height = intel_buf_height(buf) - 1;
+	ss->ss2.width  = intel_buf_width(buf) - 1;
 	ss->ss3.pitch  = buf->surface[0].stride - 1;
 
 	ss->ss7.shader_chanel_select_r = 4;
@@ -205,39 +125,32 @@ gen8_bind_buf(struct intel_batchbuffer *batch,
 	ss->ss7.shader_chanel_select_b = 6;
 	ss->ss7.shader_chanel_select_a = 7;
 
-	return offset;
+	return intel_bb_ptr_add_return_prev_offset(ibb, sizeof(*ss));
 }
 
 static uint32_t
-gen8_bind_surfaces(struct intel_batchbuffer *batch,
-		   struct annotations_context *aub,
-		   const struct igt_buf *src,
-		   const struct igt_buf *dst)
+gen8_bind_surfaces(struct intel_bb *ibb,
+		   const struct intel_buf *src,
+		   const struct intel_buf *dst)
 {
-	uint32_t *binding_table, offset;
+	uint32_t *binding_table, binding_table_offset;
 
-	binding_table = intel_batchbuffer_subdata_alloc(batch, 8, 32);
-	offset = intel_batchbuffer_subdata_offset(batch, binding_table);
-	annotation_add_state(aub, AUB_TRACE_BINDING_TABLE, offset, 8);
+	binding_table = intel_bb_ptr_align(ibb, 32);
+	binding_table_offset = intel_bb_ptr_add_return_prev_offset(ibb, 8);
 
-	binding_table[0] = gen8_bind_buf(batch, aub, dst, 1);
-	binding_table[1] = gen8_bind_buf(batch, aub, src, 0);
+	binding_table[0] = gen8_bind_buf(ibb, dst, 1);
+	binding_table[1] = gen8_bind_buf(ibb, src, 0);
 
-	return offset;
+	return binding_table_offset;
 }
 
 /* Mostly copy+paste from gen6, except wrap modes moved */
 static uint32_t
-gen8_create_sampler(struct intel_batchbuffer *batch,
-		    struct annotations_context *aub)
+gen8_create_sampler(struct intel_bb *ibb)
 {
 	struct gen8_sampler_state *ss;
-	uint32_t offset;
 
-	ss = intel_batchbuffer_subdata_alloc(batch, sizeof(*ss), 64);
-	offset = intel_batchbuffer_subdata_offset(batch, ss);
-	annotation_add_state(aub, AUB_TRACE_SAMPLER_STATE,
-			     offset, sizeof(*ss));
+	ss = intel_bb_ptr_align(ibb, 64);
 
 	ss->ss0.min_filter = GEN4_MAPFILTER_NEAREST;
 	ss->ss0.mag_filter = GEN4_MAPFILTER_NEAREST;
@@ -249,21 +162,15 @@ gen8_create_sampler(struct intel_batchbuffer *batch,
 	 * sampler fetch, but couldn't make it work. */
 	ss->ss3.non_normalized_coord = 0;
 
-	return offset;
+	return intel_bb_ptr_add_return_prev_offset(ibb, sizeof(*ss));
 }
 
 static uint32_t
-gen8_fill_ps(struct intel_batchbuffer *batch,
-	     struct annotations_context *aub,
+gen8_fill_ps(struct intel_bb *ibb,
 	     const uint32_t kernel[][4],
 	     size_t size)
 {
-	uint32_t offset;
-
-	offset = intel_batchbuffer_copy_data(batch, kernel, size, 64);
-	annotation_add_state(aub, AUB_TRACE_KERNEL_INSTRUCTIONS, offset, size);
-
-	return offset;
+	return intel_bb_copy_data(ibb, kernel, size, 64);
 }
 
 /*
@@ -277,34 +184,29 @@ gen8_fill_ps(struct intel_batchbuffer *batch,
  * see gen6_emit_vertex_elements
  */
 static uint32_t
-gen7_fill_vertex_buffer_data(struct intel_batchbuffer *batch,
-			     struct annotations_context *aub,
-			     const struct igt_buf *src,
+gen7_fill_vertex_buffer_data(struct intel_bb *ibb,
+			     const struct intel_buf *src,
 			     uint32_t src_x, uint32_t src_y,
 			     uint32_t dst_x, uint32_t dst_y,
 			     uint32_t width, uint32_t height)
 {
-	void *start;
 	uint32_t offset;
 
-	intel_batchbuffer_align(batch, 8);
-	start = batch->ptr;
+	intel_bb_ptr_align(ibb, 8);
+	offset = intel_bb_offset(ibb);
 
-	emit_vertex_2s(batch, dst_x + width, dst_y + height);
-	emit_vertex_normalized(batch, src_x + width, igt_buf_width(src));
-	emit_vertex_normalized(batch, src_y + height, igt_buf_height(src));
+	emit_vertex_2s(ibb, dst_x + width, dst_y + height);
+	emit_vertex_normalized(ibb, src_x + width, intel_buf_width(src));
+	emit_vertex_normalized(ibb, src_y + height, intel_buf_height(src));
 
-	emit_vertex_2s(batch, dst_x, dst_y + height);
-	emit_vertex_normalized(batch, src_x, igt_buf_width(src));
-	emit_vertex_normalized(batch, src_y + height, igt_buf_height(src));
+	emit_vertex_2s(ibb, dst_x, dst_y + height);
+	emit_vertex_normalized(ibb, src_x, intel_buf_width(src));
+	emit_vertex_normalized(ibb, src_y + height, intel_buf_height(src));
 
-	emit_vertex_2s(batch, dst_x, dst_y);
-	emit_vertex_normalized(batch, src_x, igt_buf_width(src));
-	emit_vertex_normalized(batch, src_y, igt_buf_height(src));
+	emit_vertex_2s(ibb, dst_x, dst_y);
+	emit_vertex_normalized(ibb, src_x, intel_buf_width(src));
+	emit_vertex_normalized(ibb, src_y, intel_buf_height(src));
 
-	offset = intel_batchbuffer_subdata_offset(batch, start);
-	annotation_add_state(aub, AUB_TRACE_VERTEX_BUFFER,
-			     offset, 3 * VERTEX_SIZE);
 	return offset;
 }
 
@@ -318,25 +220,25 @@ gen7_fill_vertex_buffer_data(struct intel_batchbuffer *batch,
  * packed.
  */
 static void
-gen6_emit_vertex_elements(struct intel_batchbuffer *batch) {
+gen6_emit_vertex_elements(struct intel_bb *ibb) {
 	/*
 	 * The VUE layout
 	 *    dword 0-3: pad (0, 0, 0. 0)
 	 *    dword 4-7: position (x, y, 0, 1.0),
 	 *    dword 8-11: texture coordinate 0 (u0, v0, 0, 1.0)
 	 */
-	OUT_BATCH(GEN4_3DSTATE_VERTEX_ELEMENTS | (3 * 2 + 1 - 2));
+	intel_bb_out(ibb, GEN4_3DSTATE_VERTEX_ELEMENTS | (3 * 2 + 1 - 2));
 
 	/* Element state 0. These are 4 dwords of 0 required for the VUE format.
 	 * We don't really know or care what they do.
 	 */
-	OUT_BATCH(0 << GEN6_VE0_VERTEX_BUFFER_INDEX_SHIFT | GEN6_VE0_VALID |
-		  SURFACEFORMAT_R32G32B32A32_FLOAT << VE0_FORMAT_SHIFT |
-		  0 << VE0_OFFSET_SHIFT); /* we specify 0, but it's really does not exist */
-	OUT_BATCH(GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_0_SHIFT |
-		  GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_1_SHIFT |
-		  GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_2_SHIFT |
-		  GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_3_SHIFT);
+	intel_bb_out(ibb, 0 << GEN6_VE0_VERTEX_BUFFER_INDEX_SHIFT | GEN6_VE0_VALID |
+		     SURFACEFORMAT_R32G32B32A32_FLOAT << VE0_FORMAT_SHIFT |
+		     0 << VE0_OFFSET_SHIFT); /* we specify 0, but it's really does not exist */
+	intel_bb_out(ibb, GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_0_SHIFT |
+		     GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_1_SHIFT |
+		     GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_2_SHIFT |
+		     GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_3_SHIFT);
 
 	/* Element state 1 - Our "destination" vertices. These are passed down
 	 * through the pipeline, and eventually make it to the pixel shader as
@@ -344,25 +246,25 @@ gen6_emit_vertex_elements(struct intel_batchbuffer *batch) {
 	 * signed/scaled because of gen6 rendercopy. I see no particular reason
 	 * for doing this though.
 	 */
-	OUT_BATCH(0 << GEN6_VE0_VERTEX_BUFFER_INDEX_SHIFT | GEN6_VE0_VALID |
-		  SURFACEFORMAT_R16G16_SSCALED << VE0_FORMAT_SHIFT |
-		  0 << VE0_OFFSET_SHIFT); /* offsets vb in bytes */
-	OUT_BATCH(GEN4_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT |
-		  GEN4_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT |
-		  GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_2_SHIFT |
-		  GEN4_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT);
+	intel_bb_out(ibb, 0 << GEN6_VE0_VERTEX_BUFFER_INDEX_SHIFT | GEN6_VE0_VALID |
+		     SURFACEFORMAT_R16G16_SSCALED << VE0_FORMAT_SHIFT |
+		     0 << VE0_OFFSET_SHIFT); /* offsets vb in bytes */
+	intel_bb_out(ibb, GEN4_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT |
+		     GEN4_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT |
+		     GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_2_SHIFT |
+		     GEN4_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT);
 
 	/* Element state 2. Last but not least we store the U,V components as
 	 * normalized floats. These will be used in the pixel shader to sample
 	 * from the source buffer.
 	 */
-	OUT_BATCH(0 << GEN6_VE0_VERTEX_BUFFER_INDEX_SHIFT | GEN6_VE0_VALID |
-		  SURFACEFORMAT_R32G32_FLOAT << VE0_FORMAT_SHIFT |
-		  4 << VE0_OFFSET_SHIFT);	/* offset vb in bytes */
-	OUT_BATCH(GEN4_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT |
-		  GEN4_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT |
-		  GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_2_SHIFT |
-		  GEN4_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT);
+	intel_bb_out(ibb, 0 << GEN6_VE0_VERTEX_BUFFER_INDEX_SHIFT | GEN6_VE0_VALID |
+		     SURFACEFORMAT_R32G32_FLOAT << VE0_FORMAT_SHIFT |
+		     4 << VE0_OFFSET_SHIFT);	/* offset vb in bytes */
+	intel_bb_out(ibb, GEN4_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT |
+		     GEN4_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT |
+		     GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_2_SHIFT |
+		     GEN4_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT);
 }
 
 /*
@@ -371,44 +273,35 @@ gen6_emit_vertex_elements(struct intel_batchbuffer *batch) {
  * @batch
  * @offset - bytw offset within the @batch where the vertex buffer starts.
  */
-static void gen8_emit_vertex_buffer(struct intel_batchbuffer *batch,
-				    uint32_t offset) {
-	OUT_BATCH(GEN4_3DSTATE_VERTEX_BUFFERS | (1 + (4 * 1) - 2));
-	OUT_BATCH(0 << GEN6_VB0_BUFFER_INDEX_SHIFT | /* VB 0th index */
-		  GEN8_VB0_BUFFER_ADDR_MOD_EN | /* Address Modify Enable */
-		  VERTEX_SIZE << VB0_BUFFER_PITCH_SHIFT);
-	OUT_RELOC(batch->bo, I915_GEM_DOMAIN_VERTEX, 0, offset);
-	OUT_BATCH(3 * VERTEX_SIZE);
+static void gen8_emit_vertex_buffer(struct intel_bb *ibb, uint32_t offset)
+{
+	intel_bb_out(ibb, GEN4_3DSTATE_VERTEX_BUFFERS | (1 + (4 * 1) - 2));
+	intel_bb_out(ibb, 0 << GEN6_VB0_BUFFER_INDEX_SHIFT | /* VB 0th index */
+		     GEN8_VB0_BUFFER_ADDR_MOD_EN | /* Address Modify Enable */
+		     VERTEX_SIZE << VB0_BUFFER_PITCH_SHIFT);
+	intel_bb_emit_reloc(ibb, ibb->handle,
+			    I915_GEM_DOMAIN_VERTEX, 0,
+			    offset, ibb->batch_offset);
+	intel_bb_out(ibb, 3 * VERTEX_SIZE);
 }
 
 static uint32_t
-gen6_create_cc_state(struct intel_batchbuffer *batch,
-		     struct annotations_context *aub)
+gen6_create_cc_state(struct intel_bb *ibb)
 {
 	struct gen6_color_calc_state *cc_state;
-	uint32_t offset;
 
-	cc_state = intel_batchbuffer_subdata_alloc(batch,
-						   sizeof(*cc_state), 64);
-	offset = intel_batchbuffer_subdata_offset(batch, cc_state);
-	annotation_add_state(aub, AUB_TRACE_CC_STATE,
-			     offset, sizeof(*cc_state));
+	cc_state = intel_bb_ptr_align(ibb, 64);
 
-	return offset;
+	return intel_bb_ptr_add_return_prev_offset(ibb, sizeof(*cc_state));
 }
 
 static uint32_t
-gen8_create_blend_state(struct intel_batchbuffer *batch,
-			struct annotations_context *aub)
+gen8_create_blend_state(struct intel_bb *ibb)
 {
 	struct gen8_blend_state *blend;
 	int i;
-	uint32_t offset;
 
-	blend = intel_batchbuffer_subdata_alloc(batch, sizeof(*blend), 64);
-	offset = intel_batchbuffer_subdata_offset(batch, blend);
-	annotation_add_state(aub, AUB_TRACE_BLEND_STATE,
-			     offset, sizeof(*blend));
+	blend = intel_bb_ptr_align(ibb, 64);
 
 	for (i = 0; i < 16; i++) {
 		blend->bs[i].dest_blend_factor = GEN6_BLENDFACTOR_ZERO;
@@ -418,451 +311,440 @@ gen8_create_blend_state(struct intel_batchbuffer *batch,
 		blend->bs[i].color_buffer_blend = 0;
 	}
 
-	return offset;
+	return intel_bb_ptr_add_return_prev_offset(ibb, sizeof(*blend));
 }
 
 static uint32_t
-gen6_create_cc_viewport(struct intel_batchbuffer *batch,
-			struct annotations_context *aub)
+gen6_create_cc_viewport(struct intel_bb *ibb)
 {
 	struct gen4_cc_viewport *vp;
-	uint32_t offset;
 
-	vp = intel_batchbuffer_subdata_alloc(batch, sizeof(*vp), 32);
-	offset = intel_batchbuffer_subdata_offset(batch, vp);
-	annotation_add_state(aub, AUB_TRACE_CC_VP_STATE,
-			     offset, sizeof(*vp));
+	vp = intel_bb_ptr_align(ibb, 32);
 
 	/* XXX I don't understand this */
 	vp->min_depth = -1.e35;
 	vp->max_depth = 1.e35;
 
-	return offset;
+	return intel_bb_ptr_add_return_prev_offset(ibb, sizeof(*vp));
 }
 
 static uint32_t
-gen7_create_sf_clip_viewport(struct intel_batchbuffer *batch,
-			struct annotations_context *aub)
+gen7_create_sf_clip_viewport(struct intel_bb *ibb)
 {
 	/* XXX these are likely not needed */
 	struct gen7_sf_clip_viewport *scv_state;
-	uint32_t offset;
 
-	scv_state = intel_batchbuffer_subdata_alloc(batch,
-						    sizeof(*scv_state), 64);
-	offset = intel_batchbuffer_subdata_offset(batch, scv_state);
-	annotation_add_state(aub, AUB_TRACE_CLIP_VP_STATE,
-			     offset, sizeof(*scv_state));
+	scv_state = intel_bb_ptr_align(ibb, 64);
 
 	scv_state->guardband.xmin = 0;
 	scv_state->guardband.xmax = 1.0f;
 	scv_state->guardband.ymin = 0;
 	scv_state->guardband.ymax = 1.0f;
 
-	return offset;
+	return intel_bb_ptr_add_return_prev_offset(ibb, sizeof(*scv_state));
 }
 
 static uint32_t
-gen6_create_scissor_rect(struct intel_batchbuffer *batch,
-			struct annotations_context *aub)
+gen6_create_scissor_rect(struct intel_bb *ibb)
 {
 	struct gen6_scissor_rect *scissor;
-	uint32_t offset;
 
-	scissor = intel_batchbuffer_subdata_alloc(batch, sizeof(*scissor), 64);
-	offset = intel_batchbuffer_subdata_offset(batch, scissor);
-	annotation_add_state(aub, AUB_TRACE_SCISSOR_STATE,
-			     offset, sizeof(*scissor));
+	scissor = intel_bb_ptr_align(ibb, 64);
 
-	return offset;
+	return intel_bb_ptr_add_return_prev_offset(ibb, sizeof(*scissor));
 }
 
 static void
-gen8_emit_sip(struct intel_batchbuffer *batch) {
-	OUT_BATCH(GEN4_STATE_SIP | (3 - 2));
-	OUT_BATCH(0);
-	OUT_BATCH(0);
+gen8_emit_sip(struct intel_bb *ibb) {
+	intel_bb_out(ibb, GEN4_STATE_SIP | (3 - 2));
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
 }
 
 static void
-gen7_emit_push_constants(struct intel_batchbuffer *batch) {
-	OUT_BATCH(GEN7_3DSTATE_PUSH_CONSTANT_ALLOC_VS);
-	OUT_BATCH(0);
-	OUT_BATCH(GEN8_3DSTATE_PUSH_CONSTANT_ALLOC_HS);
-	OUT_BATCH(0);
-	OUT_BATCH(GEN8_3DSTATE_PUSH_CONSTANT_ALLOC_DS);
-	OUT_BATCH(0);
-	OUT_BATCH(GEN8_3DSTATE_PUSH_CONSTANT_ALLOC_GS);
-	OUT_BATCH(0);
-	OUT_BATCH(GEN7_3DSTATE_PUSH_CONSTANT_ALLOC_PS);
-	OUT_BATCH(0);
+gen7_emit_push_constants(struct intel_bb *ibb) {
+	intel_bb_out(ibb, GEN7_3DSTATE_PUSH_CONSTANT_ALLOC_VS);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, GEN8_3DSTATE_PUSH_CONSTANT_ALLOC_HS);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, GEN8_3DSTATE_PUSH_CONSTANT_ALLOC_DS);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, GEN8_3DSTATE_PUSH_CONSTANT_ALLOC_GS);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, GEN7_3DSTATE_PUSH_CONSTANT_ALLOC_PS);
+	intel_bb_out(ibb, 0);
 }
 
 static void
-gen8_emit_state_base_address(struct intel_batchbuffer *batch) {
-	OUT_BATCH(GEN4_STATE_BASE_ADDRESS | (16 - 2));
+gen8_emit_state_base_address(struct intel_bb *ibb) {
+	intel_bb_out(ibb, GEN4_STATE_BASE_ADDRESS | (16 - 2));
 
 	/* general */
-	OUT_BATCH(0 | BASE_ADDRESS_MODIFY);
-	OUT_BATCH(0);
+	intel_bb_out(ibb, 0 | BASE_ADDRESS_MODIFY);
+	intel_bb_out(ibb, 0);
 
 	/* stateless data port */
-	OUT_BATCH(0 | BASE_ADDRESS_MODIFY);
+	intel_bb_out(ibb, 0 | BASE_ADDRESS_MODIFY);
 
 	/* surface */
-	OUT_RELOC(batch->bo, I915_GEM_DOMAIN_SAMPLER, 0, BASE_ADDRESS_MODIFY);
+	intel_bb_emit_reloc(ibb, ibb->handle,
+			    I915_GEM_DOMAIN_SAMPLER, 0,
+			    BASE_ADDRESS_MODIFY, ibb->batch_offset);
 
 	/* dynamic */
-	OUT_RELOC(batch->bo, I915_GEM_DOMAIN_RENDER | I915_GEM_DOMAIN_INSTRUCTION,
-		  0, BASE_ADDRESS_MODIFY);
+	intel_bb_emit_reloc(ibb, ibb->handle,
+			    I915_GEM_DOMAIN_RENDER | I915_GEM_DOMAIN_INSTRUCTION, 0,
+			    BASE_ADDRESS_MODIFY, ibb->batch_offset);
 
 	/* indirect */
-	OUT_BATCH(0);
-	OUT_BATCH(0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
 
 	/* instruction */
-	OUT_RELOC(batch->bo, I915_GEM_DOMAIN_INSTRUCTION, 0, BASE_ADDRESS_MODIFY);
+	intel_bb_emit_reloc(ibb, ibb->handle,
+			    I915_GEM_DOMAIN_INSTRUCTION, 0,
+			    BASE_ADDRESS_MODIFY, ibb->batch_offset);
 
 	/* general state buffer size */
-	OUT_BATCH(0xfffff000 | 1);
+	intel_bb_out(ibb, 0xfffff000 | 1);
 	/* dynamic state buffer size */
-	OUT_BATCH(1 << 12 | 1);
+	intel_bb_out(ibb, 1 << 12 | 1);
 	/* indirect object buffer size */
-	OUT_BATCH(0xfffff000 | 1);
+	intel_bb_out(ibb, 0xfffff000 | 1);
 	/* intruction buffer size */
-	OUT_BATCH(1 << 12 | 1);
+	intel_bb_out(ibb, 1 << 12 | 1);
 }
 
 static void
-gen7_emit_urb(struct intel_batchbuffer *batch) {
+gen7_emit_urb(struct intel_bb *ibb) {
 	/* XXX: Min valid values from mesa */
 	const int vs_entries = 64;
 	const int vs_size = 2;
 	const int vs_start = 2;
 
-	OUT_BATCH(GEN7_3DSTATE_URB_VS);
-	OUT_BATCH(vs_entries | ((vs_size - 1) << 16) | (vs_start << 25));
-	OUT_BATCH(GEN7_3DSTATE_URB_GS);
-	OUT_BATCH(vs_start << 25);
-	OUT_BATCH(GEN7_3DSTATE_URB_HS);
-	OUT_BATCH(vs_start << 25);
-	OUT_BATCH(GEN7_3DSTATE_URB_DS);
-	OUT_BATCH(vs_start << 25);
+	intel_bb_out(ibb, GEN7_3DSTATE_URB_VS);
+	intel_bb_out(ibb, vs_entries | ((vs_size - 1) << 16) | (vs_start << 25));
+	intel_bb_out(ibb, GEN7_3DSTATE_URB_GS);
+	intel_bb_out(ibb, vs_start << 25);
+	intel_bb_out(ibb, GEN7_3DSTATE_URB_HS);
+	intel_bb_out(ibb, vs_start << 25);
+	intel_bb_out(ibb, GEN7_3DSTATE_URB_DS);
+	intel_bb_out(ibb, vs_start << 25);
 }
 
 static void
-gen8_emit_cc(struct intel_batchbuffer *batch) {
-	OUT_BATCH(GEN7_3DSTATE_BLEND_STATE_POINTERS);
-	OUT_BATCH(cc.blend_state | 1);
+gen8_emit_cc(struct intel_bb *ibb) {
+	intel_bb_out(ibb, GEN7_3DSTATE_BLEND_STATE_POINTERS);
+	intel_bb_out(ibb, cc.blend_state | 1);
 
-	OUT_BATCH(GEN6_3DSTATE_CC_STATE_POINTERS);
-	OUT_BATCH(cc.cc_state | 1);
+	intel_bb_out(ibb, GEN6_3DSTATE_CC_STATE_POINTERS);
+	intel_bb_out(ibb, cc.cc_state | 1);
 }
 
 static void
-gen8_emit_multisample(struct intel_batchbuffer *batch) {
-	OUT_BATCH(GEN8_3DSTATE_MULTISAMPLE);
-	OUT_BATCH(0);
+gen8_emit_multisample(struct intel_bb *ibb) {
+	intel_bb_out(ibb, GEN8_3DSTATE_MULTISAMPLE);
+	intel_bb_out(ibb, 0);
 
-	OUT_BATCH(GEN6_3DSTATE_SAMPLE_MASK);
-	OUT_BATCH(1);
+	intel_bb_out(ibb, GEN6_3DSTATE_SAMPLE_MASK);
+	intel_bb_out(ibb, 1);
 }
 
 static void
-gen8_emit_vs(struct intel_batchbuffer *batch) {
-	OUT_BATCH(GEN7_3DSTATE_BINDING_TABLE_POINTERS_VS);
-	OUT_BATCH(0);
-
-	OUT_BATCH(GEN7_3DSTATE_SAMPLER_STATE_POINTERS_VS);
-	OUT_BATCH(0);
-
-	OUT_BATCH(GEN6_3DSTATE_CONSTANT_VS | (11 - 2));
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-
-	OUT_BATCH(GEN6_3DSTATE_VS | (9-2));
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
+gen8_emit_vs(struct intel_bb *ibb) {
+	intel_bb_out(ibb, GEN7_3DSTATE_BINDING_TABLE_POINTERS_VS);
+	intel_bb_out(ibb, 0);
+
+	intel_bb_out(ibb, GEN7_3DSTATE_SAMPLER_STATE_POINTERS_VS);
+	intel_bb_out(ibb, 0);
+
+	intel_bb_out(ibb, GEN6_3DSTATE_CONSTANT_VS | (11 - 2));
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+
+	intel_bb_out(ibb, GEN6_3DSTATE_VS | (9-2));
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
 }
 
 static void
-gen8_emit_hs(struct intel_batchbuffer *batch) {
-	OUT_BATCH(GEN7_3DSTATE_CONSTANT_HS | (11 - 2));
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-
-	OUT_BATCH(GEN7_3DSTATE_HS | (9-2));
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-
-	OUT_BATCH(GEN7_3DSTATE_BINDING_TABLE_POINTERS_HS);
-	OUT_BATCH(0);
-
-	OUT_BATCH(GEN8_3DSTATE_SAMPLER_STATE_POINTERS_HS);
-	OUT_BATCH(0);
+gen8_emit_hs(struct intel_bb *ibb) {
+	intel_bb_out(ibb, GEN7_3DSTATE_CONSTANT_HS | (11 - 2));
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+
+	intel_bb_out(ibb, GEN7_3DSTATE_HS | (9-2));
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+
+	intel_bb_out(ibb, GEN7_3DSTATE_BINDING_TABLE_POINTERS_HS);
+	intel_bb_out(ibb, 0);
+
+	intel_bb_out(ibb, GEN8_3DSTATE_SAMPLER_STATE_POINTERS_HS);
+	intel_bb_out(ibb, 0);
 }
 
 static void
-gen8_emit_gs(struct intel_batchbuffer *batch) {
-	OUT_BATCH(GEN6_3DSTATE_CONSTANT_GS | (11 - 2));
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-
-	OUT_BATCH(GEN6_3DSTATE_GS | (10-2));
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-
-	OUT_BATCH(GEN7_3DSTATE_BINDING_TABLE_POINTERS_GS);
-	OUT_BATCH(0);
-
-	OUT_BATCH(GEN7_3DSTATE_SAMPLER_STATE_POINTERS_GS);
-	OUT_BATCH(0);
+gen8_emit_gs(struct intel_bb *ibb) {
+	intel_bb_out(ibb, GEN6_3DSTATE_CONSTANT_GS | (11 - 2));
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+
+	intel_bb_out(ibb, GEN6_3DSTATE_GS | (10-2));
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+
+	intel_bb_out(ibb, GEN7_3DSTATE_BINDING_TABLE_POINTERS_GS);
+	intel_bb_out(ibb, 0);
+
+	intel_bb_out(ibb, GEN7_3DSTATE_SAMPLER_STATE_POINTERS_GS);
+	intel_bb_out(ibb, 0);
 }
 
 static void
-gen8_emit_ds(struct intel_batchbuffer *batch) {
-	OUT_BATCH(GEN7_3DSTATE_CONSTANT_DS | (11 - 2));
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-
-	OUT_BATCH(GEN7_3DSTATE_DS | (9-2));
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-
-	OUT_BATCH(GEN7_3DSTATE_BINDING_TABLE_POINTERS_DS);
-	OUT_BATCH(0);
-
-	OUT_BATCH(GEN8_3DSTATE_SAMPLER_STATE_POINTERS_DS);
-	OUT_BATCH(0);
+gen8_emit_ds(struct intel_bb *ibb) {
+	intel_bb_out(ibb, GEN7_3DSTATE_CONSTANT_DS | (11 - 2));
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+
+	intel_bb_out(ibb, GEN7_3DSTATE_DS | (9-2));
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+
+	intel_bb_out(ibb, GEN7_3DSTATE_BINDING_TABLE_POINTERS_DS);
+	intel_bb_out(ibb, 0);
+
+	intel_bb_out(ibb, GEN8_3DSTATE_SAMPLER_STATE_POINTERS_DS);
+	intel_bb_out(ibb, 0);
 }
 
 static void
-gen8_emit_wm_hz_op(struct intel_batchbuffer *batch) {
-	OUT_BATCH(GEN8_3DSTATE_WM_HZ_OP | (5-2));
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
+gen8_emit_wm_hz_op(struct intel_bb *ibb) {
+	intel_bb_out(ibb, GEN8_3DSTATE_WM_HZ_OP | (5-2));
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
 }
 
 static void
-gen8_emit_null_state(struct intel_batchbuffer *batch) {
-	gen8_emit_wm_hz_op(batch);
-	gen8_emit_hs(batch);
-	OUT_BATCH(GEN7_3DSTATE_TE | (4-2));
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	gen8_emit_gs(batch);
-	gen8_emit_ds(batch);
-	gen8_emit_vs(batch);
+gen8_emit_null_state(struct intel_bb *ibb) {
+	gen8_emit_wm_hz_op(ibb);
+	gen8_emit_hs(ibb);
+	intel_bb_out(ibb, GEN7_3DSTATE_TE | (4-2));
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	gen8_emit_gs(ibb);
+	gen8_emit_ds(ibb);
+	gen8_emit_vs(ibb);
 }
 
 static void
-gen7_emit_clip(struct intel_batchbuffer *batch) {
-	OUT_BATCH(GEN6_3DSTATE_CLIP | (4 - 2));
-	OUT_BATCH(0); 
-	OUT_BATCH(0); /*  pass-through */
-	OUT_BATCH(0);
+gen7_emit_clip(struct intel_bb *ibb) {
+	intel_bb_out(ibb, GEN6_3DSTATE_CLIP | (4 - 2));
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0); /*  pass-through */
+	intel_bb_out(ibb, 0);
 }
 
 static void
-gen8_emit_sf(struct intel_batchbuffer *batch)
+gen8_emit_sf(struct intel_bb *ibb)
 {
 	int i;
 
-	OUT_BATCH(GEN7_3DSTATE_SBE | (4 - 2));
-	OUT_BATCH(1 << GEN7_SBE_NUM_OUTPUTS_SHIFT |
-		  GEN8_SBE_FORCE_URB_ENTRY_READ_LENGTH |
-		  GEN8_SBE_FORCE_URB_ENTRY_READ_OFFSET |
-		  1 << GEN7_SBE_URB_ENTRY_READ_LENGTH_SHIFT |
-		  1 << GEN8_SBE_URB_ENTRY_READ_OFFSET_SHIFT);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
+	intel_bb_out(ibb, GEN7_3DSTATE_SBE | (4 - 2));
+	intel_bb_out(ibb, 1 << GEN7_SBE_NUM_OUTPUTS_SHIFT |
+		     GEN8_SBE_FORCE_URB_ENTRY_READ_LENGTH |
+		     GEN8_SBE_FORCE_URB_ENTRY_READ_OFFSET |
+		     1 << GEN7_SBE_URB_ENTRY_READ_LENGTH_SHIFT |
+		     1 << GEN8_SBE_URB_ENTRY_READ_OFFSET_SHIFT);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
 
-	OUT_BATCH(GEN8_3DSTATE_SBE_SWIZ | (11 - 2));
+	intel_bb_out(ibb, GEN8_3DSTATE_SBE_SWIZ | (11 - 2));
 	for (i = 0; i < 8; i++)
-		OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
+		intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
 
-	OUT_BATCH(GEN8_3DSTATE_RASTER | (5 - 2));
-	OUT_BATCH(GEN8_RASTER_FRONT_WINDING_CCW | GEN8_RASTER_CULL_NONE);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
+	intel_bb_out(ibb, GEN8_3DSTATE_RASTER | (5 - 2));
+	intel_bb_out(ibb, GEN8_RASTER_FRONT_WINDING_CCW | GEN8_RASTER_CULL_NONE);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
 
-	OUT_BATCH(GEN6_3DSTATE_SF | (4 - 2));
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
+	intel_bb_out(ibb, GEN6_3DSTATE_SF | (4 - 2));
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
 }
 
 static void
-gen8_emit_ps(struct intel_batchbuffer *batch, uint32_t kernel) {
+gen8_emit_ps(struct intel_bb *ibb, uint32_t kernel) {
 	const int max_threads = 63;
 
-	OUT_BATCH(GEN6_3DSTATE_WM | (2 - 2));
-	OUT_BATCH(/* XXX: I don't understand the BARYCENTRIC stuff, but it
+	intel_bb_out(ibb, GEN6_3DSTATE_WM | (2 - 2));
+	intel_bb_out(ibb, /* XXX: I don't understand the BARYCENTRIC stuff, but it
 		   * appears we need it to put our setup data in the place we
 		   * expect (g6, see below) */
-		  GEN8_3DSTATE_PS_PERSPECTIVE_PIXEL_BARYCENTRIC);
-
-	OUT_BATCH(GEN6_3DSTATE_CONSTANT_PS | (11-2));
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-
-	OUT_BATCH(GEN7_3DSTATE_PS | (12-2));
-	OUT_BATCH(kernel);
-	OUT_BATCH(0); /* kernel hi */
-	OUT_BATCH(1 << GEN6_3DSTATE_WM_SAMPLER_COUNT_SHIFT |
-		  2 << GEN6_3DSTATE_WM_BINDING_TABLE_ENTRY_COUNT_SHIFT);
-	OUT_BATCH(0); /* scratch space stuff */
-	OUT_BATCH(0); /* scratch hi */
-	OUT_BATCH((max_threads - 1) << GEN8_3DSTATE_PS_MAX_THREADS_SHIFT |
-		  GEN6_3DSTATE_WM_16_DISPATCH_ENABLE);
-	OUT_BATCH(6 << GEN6_3DSTATE_WM_DISPATCH_START_GRF_0_SHIFT);
-	OUT_BATCH(0); // kernel 1
-	OUT_BATCH(0); /* kernel 1 hi */
-	OUT_BATCH(0); // kernel 2
-	OUT_BATCH(0); /* kernel 2 hi */
-
-	OUT_BATCH(GEN8_3DSTATE_PS_BLEND | (2 - 2));
-	OUT_BATCH(GEN8_PS_BLEND_HAS_WRITEABLE_RT);
-
-	OUT_BATCH(GEN8_3DSTATE_PS_EXTRA | (2 - 2));
-	OUT_BATCH(GEN8_PSX_PIXEL_SHADER_VALID | GEN8_PSX_ATTRIBUTE_ENABLE);
+		     GEN8_3DSTATE_PS_PERSPECTIVE_PIXEL_BARYCENTRIC);
+
+	intel_bb_out(ibb, GEN6_3DSTATE_CONSTANT_PS | (11-2));
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+
+	intel_bb_out(ibb, GEN7_3DSTATE_PS | (12-2));
+	intel_bb_out(ibb, kernel);
+	intel_bb_out(ibb, 0); /* kernel hi */
+	intel_bb_out(ibb, 1 << GEN6_3DSTATE_WM_SAMPLER_COUNT_SHIFT |
+		     2 << GEN6_3DSTATE_WM_BINDING_TABLE_ENTRY_COUNT_SHIFT);
+	intel_bb_out(ibb, 0); /* scratch space stuff */
+	intel_bb_out(ibb, 0); /* scratch hi */
+	intel_bb_out(ibb, (max_threads - 1) << GEN8_3DSTATE_PS_MAX_THREADS_SHIFT |
+		     GEN6_3DSTATE_WM_16_DISPATCH_ENABLE);
+	intel_bb_out(ibb, 6 << GEN6_3DSTATE_WM_DISPATCH_START_GRF_0_SHIFT);
+	intel_bb_out(ibb, 0); // kernel 1
+	intel_bb_out(ibb, 0); /* kernel 1 hi */
+	intel_bb_out(ibb, 0); // kernel 2
+	intel_bb_out(ibb, 0); /* kernel 2 hi */
+
+	intel_bb_out(ibb, GEN8_3DSTATE_PS_BLEND | (2 - 2));
+	intel_bb_out(ibb, GEN8_PS_BLEND_HAS_WRITEABLE_RT);
+
+	intel_bb_out(ibb, GEN8_3DSTATE_PS_EXTRA | (2 - 2));
+	intel_bb_out(ibb, GEN8_PSX_PIXEL_SHADER_VALID | GEN8_PSX_ATTRIBUTE_ENABLE);
 }
 
 static void
-gen8_emit_depth(struct intel_batchbuffer *batch) {
-	OUT_BATCH(GEN8_3DSTATE_WM_DEPTH_STENCIL | (3 - 2));
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-
-	OUT_BATCH(GEN7_3DSTATE_DEPTH_BUFFER | (8-2));
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-
-	OUT_BATCH(GEN8_3DSTATE_HIER_DEPTH_BUFFER | (5 - 2));
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-
-	OUT_BATCH(GEN8_3DSTATE_STENCIL_BUFFER | (5 - 2));
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
+gen8_emit_depth(struct intel_bb *ibb) {
+	intel_bb_out(ibb, GEN8_3DSTATE_WM_DEPTH_STENCIL | (3 - 2));
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+
+	intel_bb_out(ibb, GEN7_3DSTATE_DEPTH_BUFFER | (8-2));
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+
+	intel_bb_out(ibb, GEN8_3DSTATE_HIER_DEPTH_BUFFER | (5 - 2));
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+
+	intel_bb_out(ibb, GEN8_3DSTATE_STENCIL_BUFFER | (5 - 2));
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
 }
 
 static void
-gen7_emit_clear(struct intel_batchbuffer *batch) {
-	OUT_BATCH(GEN7_3DSTATE_CLEAR_PARAMS | (3-2));
-	OUT_BATCH(0);
-	OUT_BATCH(1); // clear valid
+gen7_emit_clear(struct intel_bb *ibb) {
+	intel_bb_out(ibb, GEN7_3DSTATE_CLEAR_PARAMS | (3-2));
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 1); // clear valid
 }
 
 static void
-gen6_emit_drawing_rectangle(struct intel_batchbuffer *batch, const struct igt_buf *dst)
+gen6_emit_drawing_rectangle(struct intel_bb *ibb, const struct intel_buf *dst)
 {
-	OUT_BATCH(GEN4_3DSTATE_DRAWING_RECTANGLE | (4 - 2));
-	OUT_BATCH(0);
-	OUT_BATCH((igt_buf_height(dst) - 1) << 16 | (igt_buf_width(dst) - 1));
-	OUT_BATCH(0);
+	intel_bb_out(ibb, GEN4_3DSTATE_DRAWING_RECTANGLE | (4 - 2));
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, (intel_buf_height(dst) - 1) << 16 | (intel_buf_width(dst) - 1));
+	intel_bb_out(ibb, 0);
 }
 
-static void gen8_emit_vf_topology(struct intel_batchbuffer *batch)
+static void gen8_emit_vf_topology(struct intel_bb *ibb)
 {
-	OUT_BATCH(GEN8_3DSTATE_VF_TOPOLOGY);
-	OUT_BATCH(_3DPRIM_RECTLIST);
+	intel_bb_out(ibb, GEN8_3DSTATE_VF_TOPOLOGY);
+	intel_bb_out(ibb, _3DPRIM_RECTLIST);
 }
 
 /* Vertex elements MUST be defined before this according to spec */
-static void gen8_emit_primitive(struct intel_batchbuffer *batch, uint32_t offset)
+static void gen8_emit_primitive(struct intel_bb *ibb, uint32_t offset)
 {
-	OUT_BATCH(GEN8_3DSTATE_VF_INSTANCING | (3 - 2));
-	OUT_BATCH(0);
-	OUT_BATCH(0);
+	intel_bb_out(ibb, GEN8_3DSTATE_VF_INSTANCING | (3 - 2));
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
 
-	OUT_BATCH(GEN4_3DPRIMITIVE | (7-2));
-	OUT_BATCH(0);	/* gen8+ ignore the topology type field */
-	OUT_BATCH(3);	/* vertex count */
-	OUT_BATCH(0);	/*  We're specifying this instead with offset in GEN6_3DSTATE_VERTEX_BUFFERS */
-	OUT_BATCH(1);	/* single instance */
-	OUT_BATCH(0);	/* start instance location */
-	OUT_BATCH(0);	/* index buffer offset, ignored */
+	intel_bb_out(ibb, GEN4_3DPRIMITIVE | (7-2));
+	intel_bb_out(ibb, 0);	/* gen8+ ignore the topology type field */
+	intel_bb_out(ibb, 3);	/* vertex count */
+	intel_bb_out(ibb, 0);	/*  We're specifying this instead with offset in GEN6_3DSTATE_VERTEX_BUFFERS */
+	intel_bb_out(ibb, 1);	/* single instance */
+	intel_bb_out(ibb, 0);	/* start instance location */
+	intel_bb_out(ibb, 0);	/* index buffer offset, ignored */
 }
 
 /* The general rule is if it's named gen6 it is directly copied from
@@ -897,114 +779,104 @@ static void gen8_emit_primitive(struct intel_batchbuffer *batch, uint32_t offset
 
 #define BATCH_STATE_SPLIT 2048
 
-void gen8_render_copyfunc(struct intel_batchbuffer *batch,
-			  drm_intel_context *context,
-			  const struct igt_buf *src, unsigned src_x, unsigned src_y,
-			  unsigned width, unsigned height,
-			  const struct igt_buf *dst, unsigned dst_x, unsigned dst_y)
+void gen8_render_copyfunc(struct intel_bb *ibb,
+			  uint32_t ctx,
+			  struct intel_buf *src,
+			  unsigned int src_x, unsigned int src_y,
+			  unsigned int width, unsigned int height,
+			  struct intel_buf *dst,
+			  unsigned int dst_x, unsigned int dst_y)
 {
-	struct annotations_context aub_annotations;
 	uint32_t ps_sampler_state, ps_kernel_off, ps_binding_table;
 	uint32_t scissor_state;
 	uint32_t vertex_buffer;
-	uint32_t batch_end;
 
 	igt_assert(src->bpp == dst->bpp);
-	intel_batchbuffer_flush_with_context(batch, context);
 
-	intel_batchbuffer_align(batch, 8);
+	intel_bb_flush_render_with_context(ibb, ctx);
 
-	batch->ptr = &batch->buffer[BATCH_STATE_SPLIT];
+	intel_bb_add_intel_buf(ibb, dst, true);
+	intel_bb_add_intel_buf(ibb, src, false);
 
-	annotation_init(&aub_annotations);
+	intel_bb_ptr_set(ibb, BATCH_STATE_SPLIT);
 
-	ps_binding_table  = gen8_bind_surfaces(batch, &aub_annotations,
-					       src, dst);
-	ps_sampler_state  = gen8_create_sampler(batch, &aub_annotations);
-	ps_kernel_off = gen8_fill_ps(batch, &aub_annotations,
-				     ps_kernel, sizeof(ps_kernel));
-	vertex_buffer = gen7_fill_vertex_buffer_data(batch, &aub_annotations,
+	ps_binding_table  = gen8_bind_surfaces(ibb, src, dst);
+	ps_sampler_state  = gen8_create_sampler(ibb);
+	ps_kernel_off = gen8_fill_ps(ibb, ps_kernel, sizeof(ps_kernel));
+	vertex_buffer = gen7_fill_vertex_buffer_data(ibb,
 						     src,
 						     src_x, src_y,
 						     dst_x, dst_y,
 						     width, height);
-	cc.cc_state = gen6_create_cc_state(batch, &aub_annotations);
-	cc.blend_state = gen8_create_blend_state(batch, &aub_annotations);
-	viewport.cc_state = gen6_create_cc_viewport(batch, &aub_annotations);
-	viewport.sf_clip_state = gen7_create_sf_clip_viewport(batch, &aub_annotations);
-	scissor_state = gen6_create_scissor_rect(batch, &aub_annotations);
+	cc.cc_state = gen6_create_cc_state(ibb);
+	cc.blend_state = gen8_create_blend_state(ibb);
+	viewport.cc_state = gen6_create_cc_viewport(ibb);
+	viewport.sf_clip_state = gen7_create_sf_clip_viewport(ibb);
+	scissor_state = gen6_create_scissor_rect(ibb);
 	/* TODO: theree is other state which isn't setup */
 
-	igt_assert(batch->ptr < &batch->buffer[4095]);
-
-	batch->ptr = batch->buffer;
+	intel_bb_ptr_set(ibb, 0);
 
 	/* Start emitting the commands. The order roughly follows the mesa blorp
 	 * order */
-	OUT_BATCH(G4X_PIPELINE_SELECT | PIPELINE_SELECT_3D);
-
-	gen8_emit_sip(batch);
-
-	gen7_emit_push_constants(batch);
-
-	gen8_emit_state_base_address(batch);
-
-	OUT_BATCH(GEN7_3DSTATE_VIEWPORT_STATE_POINTERS_CC);
-	OUT_BATCH(viewport.cc_state);
-	OUT_BATCH(GEN8_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP);
-	OUT_BATCH(viewport.sf_clip_state);
+	intel_bb_out(ibb, G4X_PIPELINE_SELECT | PIPELINE_SELECT_3D);
 
-	gen7_emit_urb(batch);
+	gen8_emit_sip(ibb);
 
-	gen8_emit_cc(batch);
+	gen7_emit_push_constants(ibb);
 
-	gen8_emit_multisample(batch);
+	gen8_emit_state_base_address(ibb);
 
-	gen8_emit_null_state(batch);
+	intel_bb_out(ibb, GEN7_3DSTATE_VIEWPORT_STATE_POINTERS_CC);
+	intel_bb_out(ibb, viewport.cc_state);
+	intel_bb_out(ibb, GEN8_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP);
+	intel_bb_out(ibb, viewport.sf_clip_state);
 
-	OUT_BATCH(GEN7_3DSTATE_STREAMOUT | (5-2));
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
+	gen7_emit_urb(ibb);
 
-	gen7_emit_clip(batch);
+	gen8_emit_cc(ibb);
 
-	gen8_emit_sf(batch);
+	gen8_emit_multisample(ibb);
 
-	OUT_BATCH(GEN7_3DSTATE_BINDING_TABLE_POINTERS_PS);
-	OUT_BATCH(ps_binding_table);
+	gen8_emit_null_state(ibb);
 
-	OUT_BATCH(GEN7_3DSTATE_SAMPLER_STATE_POINTERS_PS);
-	OUT_BATCH(ps_sampler_state);
+	intel_bb_out(ibb, GEN7_3DSTATE_STREAMOUT | (5-2));
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
 
-	gen8_emit_ps(batch, ps_kernel_off);
+	gen7_emit_clip(ibb);
 
-	OUT_BATCH(GEN8_3DSTATE_SCISSOR_STATE_POINTERS);
-	OUT_BATCH(scissor_state);
+	gen8_emit_sf(ibb);
 
-	gen8_emit_depth(batch);
+	intel_bb_out(ibb, GEN7_3DSTATE_BINDING_TABLE_POINTERS_PS);
+	intel_bb_out(ibb, ps_binding_table);
 
-	gen7_emit_clear(batch);
+	intel_bb_out(ibb, GEN7_3DSTATE_SAMPLER_STATE_POINTERS_PS);
+	intel_bb_out(ibb, ps_sampler_state);
 
-	gen6_emit_drawing_rectangle(batch, dst);
+	gen8_emit_ps(ibb, ps_kernel_off);
 
-	gen8_emit_vertex_buffer(batch, vertex_buffer);
-	gen6_emit_vertex_elements(batch);
+	intel_bb_out(ibb, GEN8_3DSTATE_SCISSOR_STATE_POINTERS);
+	intel_bb_out(ibb, scissor_state);
 
-	gen8_emit_vf_topology(batch);
-	gen8_emit_primitive(batch, vertex_buffer);
+	gen8_emit_depth(ibb);
 
-	OUT_BATCH(MI_BATCH_BUFFER_END);
+	gen7_emit_clear(ibb);
 
-	batch_end = intel_batchbuffer_align(batch, 8);
-	igt_assert(batch_end < BATCH_STATE_SPLIT);
-	annotation_add_batch(&aub_annotations, batch_end);
+	gen6_emit_drawing_rectangle(ibb, dst);
 
-	dump_batch(batch);
+	gen8_emit_vertex_buffer(ibb, vertex_buffer);
+	gen6_emit_vertex_elements(ibb);
 
-	annotation_flush(&aub_annotations, batch);
+	gen8_emit_vf_topology(ibb);
+	gen8_emit_primitive(ibb, vertex_buffer);
 
-	gen6_render_flush(batch, context, batch_end);
-	intel_batchbuffer_reset(batch);
+	intel_bb_emit_bbe(ibb);
+	intel_bb_exec_with_context(ibb, intel_bb_offset(ibb), ctx,
+				   I915_EXEC_DEFAULT | I915_EXEC_NO_RELOC,
+				   false);
+	dump_batch(ibb);
+	intel_bb_reset(ibb, false);
 }
diff --git a/lib/rendercopy_gen9.c b/lib/rendercopy_gen9.c
index 85ae4cab..6bad7bb6 100644
--- a/lib/rendercopy_gen9.c
+++ b/lib/rendercopy_gen9.c
@@ -16,7 +16,7 @@
 
 #include "drmtest.h"
 #include "intel_aux_pgtable.h"
-#include "intel_bufmgr.h"
+#include "intel_bufops.h"
 #include "intel_batchbuffer.h"
 #include "intel_io.h"
 #include "rendercopy.h"
@@ -24,17 +24,12 @@
 #include "intel_reg.h"
 #include "igt_aux.h"
 
-#include "intel_aub.h"
-
 #define VERTEX_SIZE (3*4)
 
 #if DEBUG_RENDERCPY
-static void dump_batch(struct intel_batchbuffer *batch) {
-	int fd = open("/tmp/i965-batchbuffers.dump", O_WRONLY | O_CREAT,  0666);
-	if (fd != -1) {
-		igt_assert_eq(write(fd, batch->buffer, 4096), 4096);
-		fd = close(fd);
-	}
+static void dump_batch(struct intel_bb *ibb)
+{
+	intel_bb_dump(ibb, "/tmp/gen9-batchbuffers.dump");
 }
 #else
 #define dump_batch(x) do { } while(0)
@@ -120,87 +115,16 @@ static const uint32_t gen12_render_copy[][4] = {
 	{ 0x80040131, 0x00000004, 0x50007144, 0x00c40000 },
 };
 
-/* AUB annotation support */
-#define MAX_ANNOTATIONS	33
-struct annotations_context {
-	drm_intel_aub_annotation annotations[MAX_ANNOTATIONS];
-	int index;
-	uint32_t offset;
-} aub_annotations;
-
-static void annotation_init(struct annotations_context *ctx)
-{
-	/* ctx->annotations is an array keeping a list of annotations of the
-	 * batch buffer ordered by offset. ctx->annotations[0] is thus left
-	 * for the command stream and will be filled just before executing
-	 * the batch buffer with annotations_add_batch() */
-	ctx->index = 1;
-}
-
-static void add_annotation(drm_intel_aub_annotation *a,
-			   uint32_t type, uint32_t subtype,
-			   uint32_t ending_offset)
-{
-	a->type = type;
-	a->subtype = subtype;
-	a->ending_offset = ending_offset;
-}
-
-static void annotation_add_batch(struct annotations_context *ctx, size_t size)
-{
-	add_annotation(&ctx->annotations[0], AUB_TRACE_TYPE_BATCH, 0, size);
-}
-
-static void annotation_add_state(struct annotations_context *ctx,
-				 uint32_t state_type,
-				 uint32_t start_offset,
-				 size_t   size)
-{
-	assert(ctx->index < MAX_ANNOTATIONS);
-
-	add_annotation(&ctx->annotations[ctx->index++],
-		       AUB_TRACE_TYPE_NOTYPE, 0,
-		       start_offset);
-	add_annotation(&ctx->annotations[ctx->index++],
-		       AUB_TRACE_TYPE(state_type),
-		       AUB_TRACE_SUBTYPE(state_type),
-		       start_offset + size);
-}
-
-static void annotation_flush(struct annotations_context *ctx,
-			     struct intel_batchbuffer *batch)
-{
-	if (!igt_aub_dump_enabled())
-		return;
-
-	drm_intel_bufmgr_gem_set_aub_annotations(batch->bo,
-						 ctx->annotations,
-						 ctx->index);
-}
-
-static void
-gen6_render_flush(struct intel_batchbuffer *batch,
-		  drm_intel_context *context, uint32_t batch_end)
-{
-	igt_assert_eq(drm_intel_bo_subdata(batch->bo,
-					   0, 4096, batch->buffer),
-		      0);
-	igt_assert_eq(drm_intel_gem_bo_context_exec(batch->bo,
-						    context, batch_end, 0),
-		      0);
-}
-
 /* Mostly copy+paste from gen6, except height, width, pitch moved */
 static uint32_t
-gen8_bind_buf(struct intel_batchbuffer *batch, const struct igt_buf *buf,
-	      int is_dst) {
+gen8_bind_buf(struct intel_bb *ibb, const struct intel_buf *buf, int is_dst) {
 	struct gen9_surface_state *ss;
-	uint32_t write_domain, read_domain, offset;
-	int ret;
+	uint32_t write_domain, read_domain;
+	uint64_t address;
 
 	igt_assert_lte(buf->surface[0].stride, 256*1024);
-	igt_assert_lte(igt_buf_width(buf), 16384);
-	igt_assert_lte(igt_buf_height(buf), 16384);
+	igt_assert_lte(intel_buf_width(buf), 16384);
+	igt_assert_lte(intel_buf_height(buf), 16384);
 
 	if (is_dst) {
 		write_domain = read_domain = I915_GEM_DOMAIN_RENDER;
@@ -209,10 +133,7 @@ gen8_bind_buf(struct intel_batchbuffer *batch, const struct igt_buf *buf,
 		read_domain = I915_GEM_DOMAIN_SAMPLER;
 	}
 
-	ss = intel_batchbuffer_subdata_alloc(batch, sizeof(*ss), 64);
-	offset = intel_batchbuffer_subdata_offset(batch, ss);
-	annotation_add_state(&aub_annotations, AUB_TRACE_SURFACE_STATE,
-			     offset, sizeof(*ss));
+	ss = intel_bb_ptr_align(ibb, 64);
 
 	ss->ss0.surface_type = SURFACE_2D;
 	switch (buf->bpp) {
@@ -238,17 +159,15 @@ gen8_bind_buf(struct intel_batchbuffer *batch, const struct igt_buf *buf,
 		ss->ss5.trmode = 2;
 	ss->ss5.mip_tail_start_lod = 1; /* needed with trmode */
 
-	ss->ss8.base_addr = buf->bo->offset64;
-	ss->ss9.base_addr_hi = buf->bo->offset64 >> 32;
+	address = intel_bb_offset_reloc(ibb, buf->handle,
+					read_domain, write_domain,
+					intel_bb_offset(ibb) + 4 * 8,
+					buf->addr.offset);
+	ss->ss8.base_addr = address;
+	ss->ss9.base_addr_hi = address >> 32;
 
-	ret = drm_intel_bo_emit_reloc(batch->bo,
-				      intel_batchbuffer_subdata_offset(batch, &ss->ss8),
-				      buf->bo, 0,
-				      read_domain, write_domain);
-	assert(ret == 0);
-
-	ss->ss2.height = igt_buf_height(buf) - 1;
-	ss->ss2.width  = igt_buf_width(buf) - 1;
+	ss->ss2.height = intel_buf_height(buf) - 1;
+	ss->ss2.width  = intel_buf_width(buf) - 1;
 	ss->ss3.pitch  = buf->surface[0].stride - 1;
 
 	ss->ss7.skl.shader_chanel_select_r = 4;
@@ -264,60 +183,52 @@ gen8_bind_buf(struct intel_batchbuffer *batch, const struct igt_buf *buf,
 		ss->ss6.aux_mode = 0x5; /* AUX_CCS_E */
 		ss->ss6.aux_pitch = (buf->ccs[0].stride / 128) - 1;
 
-		ss->ss10.aux_base_addr = buf->bo->offset64 + buf->ccs[0].offset;
-		ss->ss11.aux_base_addr_hi = (buf->bo->offset64 + buf->ccs[0].offset) >> 32;
-
-		ret = drm_intel_bo_emit_reloc(batch->bo,
-					      intel_batchbuffer_subdata_offset(batch, &ss->ss10),
-					      buf->bo, buf->ccs[0].offset,
-					      read_domain, write_domain);
-		assert(ret == 0);
+		address = intel_bb_offset_reloc_with_delta(ibb, buf->handle,
+							   read_domain, write_domain,
+							   buf->ccs[0].offset,
+							   intel_bb_offset(ibb) + 4 * 10,
+							   buf->addr.offset);
+		ss->ss10.aux_base_addr = (address + buf->ccs[0].offset);
+		ss->ss11.aux_base_addr_hi = (address + buf->ccs[0].offset) >> 32;
 	}
 
 	if (buf->cc.offset) {
 		igt_assert(buf->compression == I915_COMPRESSION_RENDER);
 
-		ss->ss12.clear_address = buf->bo->offset64 + buf->cc.offset;
-		ss->ss13.clear_address_hi = (buf->bo->offset64 + buf->cc.offset) >> 32;
-
-		ret = drm_intel_bo_emit_reloc(batch->bo,
-					      intel_batchbuffer_subdata_offset(batch, &ss->ss12),
-					      buf->bo, buf->cc.offset,
-					      read_domain, write_domain);
-		assert(ret == 0);
+		address = intel_bb_offset_reloc_with_delta(ibb, buf->handle,
+							   read_domain, write_domain,
+							   buf->cc.offset,
+							   intel_bb_offset(ibb) + 4 * 12,
+							   buf->addr.offset);
+		ss->ss12.clear_address = address + buf->cc.offset;
+		ss->ss13.clear_address_hi = (address + buf->cc.offset) >> 32;
 	}
 
-	return offset;
+	return intel_bb_ptr_add_return_prev_offset(ibb, sizeof(*ss));
 }
 
 static uint32_t
-gen8_bind_surfaces(struct intel_batchbuffer *batch,
-		   const struct igt_buf *src,
-		   const struct igt_buf *dst)
+gen8_bind_surfaces(struct intel_bb *ibb,
+		   const struct intel_buf *src,
+		   const struct intel_buf *dst)
 {
-	uint32_t *binding_table, offset;
+	uint32_t *binding_table, binding_table_offset;
 
-	binding_table = intel_batchbuffer_subdata_alloc(batch, 8, 32);
-	offset = intel_batchbuffer_subdata_offset(batch, binding_table);
-	annotation_add_state(&aub_annotations, AUB_TRACE_BINDING_TABLE,
-			     offset, 8);
+	binding_table = intel_bb_ptr_align(ibb, 32);
+	binding_table_offset = intel_bb_ptr_add_return_prev_offset(ibb, 32);
 
-	binding_table[0] = gen8_bind_buf(batch, dst, 1);
-	binding_table[1] = gen8_bind_buf(batch, src, 0);
+	binding_table[0] = gen8_bind_buf(ibb, dst, 1);
+	binding_table[1] = gen8_bind_buf(ibb, src, 0);
 
-	return offset;
+	return binding_table_offset;
 }
 
 /* Mostly copy+paste from gen6, except wrap modes moved */
 static uint32_t
-gen8_create_sampler(struct intel_batchbuffer *batch) {
+gen8_create_sampler(struct intel_bb *ibb) {
 	struct gen8_sampler_state *ss;
-	uint32_t offset;
 
-	ss = intel_batchbuffer_subdata_alloc(batch, sizeof(*ss), 64);
-	offset = intel_batchbuffer_subdata_offset(batch, ss);
-	annotation_add_state(&aub_annotations, AUB_TRACE_SAMPLER_STATE,
-			     offset, sizeof(*ss));
+	ss = intel_bb_ptr_align(ibb, 64);
 
 	ss->ss0.min_filter = GEN4_MAPFILTER_NEAREST;
 	ss->ss0.mag_filter = GEN4_MAPFILTER_NEAREST;
@@ -329,21 +240,15 @@ gen8_create_sampler(struct intel_batchbuffer *batch) {
 	 * sampler fetch, but couldn't make it work. */
 	ss->ss3.non_normalized_coord = 0;
 
-	return offset;
+	return intel_bb_ptr_add_return_prev_offset(ibb, sizeof(*ss));
 }
 
 static uint32_t
-gen8_fill_ps(struct intel_batchbuffer *batch,
+gen8_fill_ps(struct intel_bb *ibb,
 	     const uint32_t kernel[][4],
 	     size_t size)
 {
-	uint32_t offset;
-
-	offset = intel_batchbuffer_copy_data(batch, kernel, size, 64);
-	annotation_add_state(&aub_annotations, AUB_TRACE_KERNEL_INSTRUCTIONS,
-			     offset, size);
-
-	return offset;
+	return intel_bb_copy_data(ibb, kernel, size, 64);
 }
 
 /*
@@ -357,33 +262,29 @@ gen8_fill_ps(struct intel_batchbuffer *batch,
  * see gen6_emit_vertex_elements
  */
 static uint32_t
-gen7_fill_vertex_buffer_data(struct intel_batchbuffer *batch,
-			     const struct igt_buf *src,
+gen7_fill_vertex_buffer_data(struct intel_bb *ibb,
+			     const struct intel_buf *src,
 			     uint32_t src_x, uint32_t src_y,
 			     uint32_t dst_x, uint32_t dst_y,
 			     uint32_t width, uint32_t height)
 {
-	void *start;
 	uint32_t offset;
 
-	intel_batchbuffer_align(batch, 8);
-	start = batch->ptr;
+	intel_bb_ptr_align(ibb, 8);
+	offset = intel_bb_offset(ibb);
 
-	emit_vertex_2s(batch, dst_x + width, dst_y + height);
-	emit_vertex_normalized(batch, src_x + width, igt_buf_width(src));
-	emit_vertex_normalized(batch, src_y + height, igt_buf_height(src));
+	emit_vertex_2s(ibb, dst_x + width, dst_y + height);
+	emit_vertex_normalized(ibb, src_x + width, intel_buf_width(src));
+	emit_vertex_normalized(ibb, src_y + height, intel_buf_height(src));
 
-	emit_vertex_2s(batch, dst_x, dst_y + height);
-	emit_vertex_normalized(batch, src_x, igt_buf_width(src));
-	emit_vertex_normalized(batch, src_y + height, igt_buf_height(src));
+	emit_vertex_2s(ibb, dst_x, dst_y + height);
+	emit_vertex_normalized(ibb, src_x, intel_buf_width(src));
+	emit_vertex_normalized(ibb, src_y + height, intel_buf_height(src));
 
-	emit_vertex_2s(batch, dst_x, dst_y);
-	emit_vertex_normalized(batch, src_x, igt_buf_width(src));
-	emit_vertex_normalized(batch, src_y, igt_buf_height(src));
+	emit_vertex_2s(ibb, dst_x, dst_y);
+	emit_vertex_normalized(ibb, src_x, intel_buf_width(src));
+	emit_vertex_normalized(ibb, src_y, intel_buf_height(src));
 
-	offset = intel_batchbuffer_subdata_offset(batch, start);
-	annotation_add_state(&aub_annotations, AUB_TRACE_VERTEX_BUFFER,
-			     offset, 3 * VERTEX_SIZE);
 	return offset;
 }
 
@@ -397,25 +298,25 @@ gen7_fill_vertex_buffer_data(struct intel_batchbuffer *batch,
  * packed.
  */
 static void
-gen6_emit_vertex_elements(struct intel_batchbuffer *batch) {
+gen6_emit_vertex_elements(struct intel_bb *ibb) {
 	/*
 	 * The VUE layout
 	 *    dword 0-3: pad (0, 0, 0. 0)
 	 *    dword 4-7: position (x, y, 0, 1.0),
 	 *    dword 8-11: texture coordinate 0 (u0, v0, 0, 1.0)
 	 */
-	OUT_BATCH(GEN4_3DSTATE_VERTEX_ELEMENTS | (3 * 2 + 1 - 2));
+	intel_bb_out(ibb, GEN4_3DSTATE_VERTEX_ELEMENTS | (3 * 2 + 1 - 2));
 
 	/* Element state 0. These are 4 dwords of 0 required for the VUE format.
 	 * We don't really know or care what they do.
 	 */
-	OUT_BATCH(0 << GEN6_VE0_VERTEX_BUFFER_INDEX_SHIFT | GEN6_VE0_VALID |
-		  SURFACEFORMAT_R32G32B32A32_FLOAT << VE0_FORMAT_SHIFT |
-		  0 << VE0_OFFSET_SHIFT); /* we specify 0, but it's really does not exist */
-	OUT_BATCH(GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_0_SHIFT |
-		  GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_1_SHIFT |
-		  GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_2_SHIFT |
-		  GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_3_SHIFT);
+	intel_bb_out(ibb, 0 << GEN6_VE0_VERTEX_BUFFER_INDEX_SHIFT | GEN6_VE0_VALID |
+		     SURFACEFORMAT_R32G32B32A32_FLOAT << VE0_FORMAT_SHIFT |
+		     0 << VE0_OFFSET_SHIFT); /* we specify 0, but it's really does not exist */
+	intel_bb_out(ibb, GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_0_SHIFT |
+		     GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_1_SHIFT |
+		     GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_2_SHIFT |
+		     GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_3_SHIFT);
 
 	/* Element state 1 - Our "destination" vertices. These are passed down
 	 * through the pipeline, and eventually make it to the pixel shader as
@@ -423,25 +324,25 @@ gen6_emit_vertex_elements(struct intel_batchbuffer *batch) {
 	 * signed/scaled because of gen6 rendercopy. I see no particular reason
 	 * for doing this though.
 	 */
-	OUT_BATCH(0 << GEN6_VE0_VERTEX_BUFFER_INDEX_SHIFT | GEN6_VE0_VALID |
-		  SURFACEFORMAT_R16G16_SSCALED << VE0_FORMAT_SHIFT |
-		  0 << VE0_OFFSET_SHIFT); /* offsets vb in bytes */
-	OUT_BATCH(GEN4_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT |
-		  GEN4_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT |
-		  GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_2_SHIFT |
-		  GEN4_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT);
+	intel_bb_out(ibb, 0 << GEN6_VE0_VERTEX_BUFFER_INDEX_SHIFT | GEN6_VE0_VALID |
+		     SURFACEFORMAT_R16G16_SSCALED << VE0_FORMAT_SHIFT |
+		     0 << VE0_OFFSET_SHIFT); /* offsets vb in bytes */
+	intel_bb_out(ibb, GEN4_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT |
+		     GEN4_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT |
+		     GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_2_SHIFT |
+		     GEN4_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT);
 
 	/* Element state 2. Last but not least we store the U,V components as
 	 * normalized floats. These will be used in the pixel shader to sample
 	 * from the source buffer.
 	 */
-	OUT_BATCH(0 << GEN6_VE0_VERTEX_BUFFER_INDEX_SHIFT | GEN6_VE0_VALID |
-		  SURFACEFORMAT_R32G32_FLOAT << VE0_FORMAT_SHIFT |
-		  4 << VE0_OFFSET_SHIFT);	/* offset vb in bytes */
-	OUT_BATCH(GEN4_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT |
-		  GEN4_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT |
-		  GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_2_SHIFT |
-		  GEN4_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT);
+	intel_bb_out(ibb, 0 << GEN6_VE0_VERTEX_BUFFER_INDEX_SHIFT | GEN6_VE0_VALID |
+		     SURFACEFORMAT_R32G32_FLOAT << VE0_FORMAT_SHIFT |
+		     4 << VE0_OFFSET_SHIFT);	/* offset vb in bytes */
+	intel_bb_out(ibb, GEN4_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT |
+		     GEN4_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT |
+		     GEN4_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_2_SHIFT |
+		     GEN4_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT);
 }
 
 /*
@@ -450,42 +351,35 @@ gen6_emit_vertex_elements(struct intel_batchbuffer *batch) {
  * @batch
  * @offset - bytw offset within the @batch where the vertex buffer starts.
  */
-static void gen7_emit_vertex_buffer(struct intel_batchbuffer *batch,
-				    uint32_t offset) {
-	OUT_BATCH(GEN4_3DSTATE_VERTEX_BUFFERS | (1 + (4 * 1) - 2));
-	OUT_BATCH(0 << GEN6_VB0_BUFFER_INDEX_SHIFT | /* VB 0th index */
-		  GEN8_VB0_BUFFER_ADDR_MOD_EN | /* Address Modify Enable */
-		  VERTEX_SIZE << VB0_BUFFER_PITCH_SHIFT);
-	OUT_RELOC(batch->bo, I915_GEM_DOMAIN_VERTEX, 0, offset);
-	OUT_BATCH(3 * VERTEX_SIZE);
+static void gen7_emit_vertex_buffer(struct intel_bb *ibb, uint32_t offset)
+{
+	intel_bb_out(ibb, GEN4_3DSTATE_VERTEX_BUFFERS | (1 + (4 * 1) - 2));
+	intel_bb_out(ibb, 0 << GEN6_VB0_BUFFER_INDEX_SHIFT | /* VB 0th index */
+		     GEN8_VB0_BUFFER_ADDR_MOD_EN | /* Address Modify Enable */
+		     VERTEX_SIZE << VB0_BUFFER_PITCH_SHIFT);
+	intel_bb_emit_reloc(ibb, ibb->handle,
+			    I915_GEM_DOMAIN_VERTEX, 0,
+			    offset, ibb->batch_offset);
+	intel_bb_out(ibb, 3 * VERTEX_SIZE);
 }
 
 static uint32_t
-gen6_create_cc_state(struct intel_batchbuffer *batch)
+gen6_create_cc_state(struct intel_bb *ibb)
 {
 	struct gen6_color_calc_state *cc_state;
-	uint32_t offset;
 
-	cc_state = intel_batchbuffer_subdata_alloc(batch,
-						   sizeof(*cc_state), 64);
-	offset = intel_batchbuffer_subdata_offset(batch, cc_state);
-	annotation_add_state(&aub_annotations, AUB_TRACE_CC_STATE,
-			     offset, sizeof(*cc_state));
+	cc_state = intel_bb_ptr_align(ibb, 64);
 
-	return offset;
+	return intel_bb_ptr_add_return_prev_offset(ibb, sizeof(*cc_state));
 }
 
 static uint32_t
-gen8_create_blend_state(struct intel_batchbuffer *batch)
+gen8_create_blend_state(struct intel_bb *ibb)
 {
 	struct gen8_blend_state *blend;
 	int i;
-	uint32_t offset;
 
-	blend = intel_batchbuffer_subdata_alloc(batch, sizeof(*blend), 64);
-	offset = intel_batchbuffer_subdata_offset(batch, blend);
-	annotation_add_state(&aub_annotations, AUB_TRACE_BLEND_STATE,
-			     offset, sizeof(*blend));
+	blend = intel_bb_ptr_align(ibb, 64);
 
 	for (i = 0; i < 16; i++) {
 		blend->bs[i].dest_blend_factor = GEN6_BLENDFACTOR_ZERO;
@@ -495,466 +389,458 @@ gen8_create_blend_state(struct intel_batchbuffer *batch)
 		blend->bs[i].color_buffer_blend = 0;
 	}
 
-	return offset;
+	return intel_bb_ptr_add_return_prev_offset(ibb, sizeof(*blend));
 }
 
 static uint32_t
-gen6_create_cc_viewport(struct intel_batchbuffer *batch)
+gen6_create_cc_viewport(struct intel_bb *ibb)
 {
 	struct gen4_cc_viewport *vp;
-	uint32_t offset;
 
-	vp = intel_batchbuffer_subdata_alloc(batch, sizeof(*vp), 32);
-	offset = intel_batchbuffer_subdata_offset(batch, vp);
-	annotation_add_state(&aub_annotations, AUB_TRACE_CC_VP_STATE,
-			     offset, sizeof(*vp));
+	vp = intel_bb_ptr_align(ibb, 32);
 
 	/* XXX I don't understand this */
 	vp->min_depth = -1.e35;
 	vp->max_depth = 1.e35;
 
-	return offset;
+	return intel_bb_ptr_add_return_prev_offset(ibb, sizeof(*vp));
 }
 
 static uint32_t
-gen7_create_sf_clip_viewport(struct intel_batchbuffer *batch) {
+gen7_create_sf_clip_viewport(struct intel_bb *ibb) {
 	/* XXX these are likely not needed */
 	struct gen7_sf_clip_viewport *scv_state;
-	uint32_t offset;
 
-	scv_state = intel_batchbuffer_subdata_alloc(batch,
-						    sizeof(*scv_state), 64);
-	offset = intel_batchbuffer_subdata_offset(batch, scv_state);
-	annotation_add_state(&aub_annotations, AUB_TRACE_CLIP_VP_STATE,
-			     offset, sizeof(*scv_state));
+	scv_state = intel_bb_ptr_align(ibb, 64);
 
 	scv_state->guardband.xmin = 0;
 	scv_state->guardband.xmax = 1.0f;
 	scv_state->guardband.ymin = 0;
 	scv_state->guardband.ymax = 1.0f;
 
-	return offset;
+	return intel_bb_ptr_add_return_prev_offset(ibb, sizeof(*scv_state));
 }
 
 static uint32_t
-gen6_create_scissor_rect(struct intel_batchbuffer *batch)
+gen6_create_scissor_rect(struct intel_bb *ibb)
 {
 	struct gen6_scissor_rect *scissor;
-	uint32_t offset;
 
-	scissor = intel_batchbuffer_subdata_alloc(batch, sizeof(*scissor), 64);
-	offset = intel_batchbuffer_subdata_offset(batch, scissor);
-	annotation_add_state(&aub_annotations, AUB_TRACE_SCISSOR_STATE,
-			     offset, sizeof(*scissor));
+	scissor = intel_bb_ptr_align(ibb, 64);
 
-	return offset;
+	return intel_bb_ptr_add_return_prev_offset(ibb, sizeof(*scissor));
 }
 
 static void
-gen8_emit_sip(struct intel_batchbuffer *batch) {
-	OUT_BATCH(GEN4_STATE_SIP | (3 - 2));
-	OUT_BATCH(0);
-	OUT_BATCH(0);
+gen8_emit_sip(struct intel_bb *ibb) {
+	intel_bb_out(ibb, GEN4_STATE_SIP | (3 - 2));
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
 }
 
 static void
-gen7_emit_push_constants(struct intel_batchbuffer *batch) {
-	OUT_BATCH(GEN7_3DSTATE_PUSH_CONSTANT_ALLOC_VS);
-	OUT_BATCH(0);
-	OUT_BATCH(GEN8_3DSTATE_PUSH_CONSTANT_ALLOC_HS);
-	OUT_BATCH(0);
-	OUT_BATCH(GEN8_3DSTATE_PUSH_CONSTANT_ALLOC_DS);
-	OUT_BATCH(0);
-	OUT_BATCH(GEN8_3DSTATE_PUSH_CONSTANT_ALLOC_GS);
-	OUT_BATCH(0);
-	OUT_BATCH(GEN7_3DSTATE_PUSH_CONSTANT_ALLOC_PS);
-	OUT_BATCH(0);
+gen7_emit_push_constants(struct intel_bb *ibb) {
+	intel_bb_out(ibb, GEN7_3DSTATE_PUSH_CONSTANT_ALLOC_VS);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, GEN8_3DSTATE_PUSH_CONSTANT_ALLOC_HS);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, GEN8_3DSTATE_PUSH_CONSTANT_ALLOC_DS);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, GEN8_3DSTATE_PUSH_CONSTANT_ALLOC_GS);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, GEN7_3DSTATE_PUSH_CONSTANT_ALLOC_PS);
+	intel_bb_out(ibb, 0);
 }
 
 static void
-gen9_emit_state_base_address(struct intel_batchbuffer *batch) {
+gen9_emit_state_base_address(struct intel_bb *ibb) {
 
 	/* WaBindlessSurfaceStateModifyEnable:skl,bxt */
 	/* The length has to be one less if we dont modify
 	   bindless state */
-	OUT_BATCH(GEN4_STATE_BASE_ADDRESS | (19 - 1 - 2));
+	intel_bb_out(ibb, GEN4_STATE_BASE_ADDRESS | (19 - 1 - 2));
 
 	/* general */
-	OUT_BATCH(0 | BASE_ADDRESS_MODIFY);
-	OUT_BATCH(0);
+	intel_bb_out(ibb, 0 | BASE_ADDRESS_MODIFY);
+	intel_bb_out(ibb, 0);
 
 	/* stateless data port */
-	OUT_BATCH(0 | BASE_ADDRESS_MODIFY);
+	intel_bb_out(ibb, 0 | BASE_ADDRESS_MODIFY);
 
 	/* surface */
-	OUT_RELOC(batch->bo, I915_GEM_DOMAIN_SAMPLER, 0, BASE_ADDRESS_MODIFY);
+	intel_bb_emit_reloc(ibb, ibb->handle,
+			    I915_GEM_DOMAIN_SAMPLER, 0,
+			    BASE_ADDRESS_MODIFY, ibb->batch_offset);
 
 	/* dynamic */
-	OUT_RELOC(batch->bo, I915_GEM_DOMAIN_RENDER | I915_GEM_DOMAIN_INSTRUCTION,
-		  0, BASE_ADDRESS_MODIFY);
+	intel_bb_emit_reloc(ibb, ibb->handle,
+			    I915_GEM_DOMAIN_RENDER | I915_GEM_DOMAIN_INSTRUCTION, 0,
+			    BASE_ADDRESS_MODIFY, ibb->batch_offset);
 
 	/* indirect */
-	OUT_BATCH(0);
-	OUT_BATCH(0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
 
 	/* instruction */
-	OUT_RELOC(batch->bo, I915_GEM_DOMAIN_INSTRUCTION, 0, BASE_ADDRESS_MODIFY);
+	intel_bb_emit_reloc(ibb, ibb->handle,
+			    I915_GEM_DOMAIN_INSTRUCTION, 0,
+			    BASE_ADDRESS_MODIFY, ibb->batch_offset);
 
 	/* general state buffer size */
-	OUT_BATCH(0xfffff000 | 1);
+	intel_bb_out(ibb, 0xfffff000 | 1);
 	/* dynamic state buffer size */
-	OUT_BATCH(1 << 12 | 1);
+	intel_bb_out(ibb, 1 << 12 | 1);
 	/* indirect object buffer size */
-	OUT_BATCH(0xfffff000 | 1);
+	intel_bb_out(ibb, 0xfffff000 | 1);
 	/* intruction buffer size */
-	OUT_BATCH(1 << 12 | 1);
+	intel_bb_out(ibb, 1 << 12 | 1);
 
 	/* Bindless surface state base address */
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
 }
 
 static void
-gen7_emit_urb(struct intel_batchbuffer *batch) {
+gen7_emit_urb(struct intel_bb *ibb) {
 	/* XXX: Min valid values from mesa */
 	const int vs_entries = 64;
 	const int vs_size = 2;
 	const int vs_start = 4;
 
-	OUT_BATCH(GEN7_3DSTATE_URB_VS);
-	OUT_BATCH(vs_entries | ((vs_size - 1) << 16) | (vs_start << 25));
-	OUT_BATCH(GEN7_3DSTATE_URB_GS);
-	OUT_BATCH(vs_start << 25);
-	OUT_BATCH(GEN7_3DSTATE_URB_HS);
-	OUT_BATCH(vs_start << 25);
-	OUT_BATCH(GEN7_3DSTATE_URB_DS);
-	OUT_BATCH(vs_start << 25);
+	intel_bb_out(ibb, GEN7_3DSTATE_URB_VS);
+	intel_bb_out(ibb, vs_entries | ((vs_size - 1) << 16) | (vs_start << 25));
+	intel_bb_out(ibb, GEN7_3DSTATE_URB_GS);
+	intel_bb_out(ibb, vs_start << 25);
+	intel_bb_out(ibb, GEN7_3DSTATE_URB_HS);
+	intel_bb_out(ibb, vs_start << 25);
+	intel_bb_out(ibb, GEN7_3DSTATE_URB_DS);
+	intel_bb_out(ibb, vs_start << 25);
 }
 
 static void
-gen8_emit_cc(struct intel_batchbuffer *batch) {
-	OUT_BATCH(GEN7_3DSTATE_BLEND_STATE_POINTERS);
-	OUT_BATCH(cc.blend_state | 1);
+gen8_emit_cc(struct intel_bb *ibb) {
+	intel_bb_out(ibb, GEN7_3DSTATE_BLEND_STATE_POINTERS);
+	intel_bb_out(ibb, cc.blend_state | 1);
 
-	OUT_BATCH(GEN6_3DSTATE_CC_STATE_POINTERS);
-	OUT_BATCH(cc.cc_state | 1);
+	intel_bb_out(ibb, GEN6_3DSTATE_CC_STATE_POINTERS);
+	intel_bb_out(ibb, cc.cc_state | 1);
 }
 
 static void
-gen8_emit_multisample(struct intel_batchbuffer *batch) {
-	OUT_BATCH(GEN8_3DSTATE_MULTISAMPLE | 0);
-	OUT_BATCH(0);
+gen8_emit_multisample(struct intel_bb *ibb) {
+	intel_bb_out(ibb, GEN8_3DSTATE_MULTISAMPLE | 0);
+	intel_bb_out(ibb, 0);
 
-	OUT_BATCH(GEN6_3DSTATE_SAMPLE_MASK);
-	OUT_BATCH(1);
+	intel_bb_out(ibb, GEN6_3DSTATE_SAMPLE_MASK);
+	intel_bb_out(ibb, 1);
 }
 
 static void
-gen8_emit_vs(struct intel_batchbuffer *batch) {
-	OUT_BATCH(GEN6_3DSTATE_CONSTANT_VS | (11-2));
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-
-	OUT_BATCH(GEN7_3DSTATE_BINDING_TABLE_POINTERS_VS);
-	OUT_BATCH(0);
-
-	OUT_BATCH(GEN7_3DSTATE_SAMPLER_STATE_POINTERS_VS);
-	OUT_BATCH(0);
-
-	OUT_BATCH(GEN6_3DSTATE_VS | (9-2));
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
+gen8_emit_vs(struct intel_bb *ibb) {
+	intel_bb_out(ibb, GEN6_3DSTATE_CONSTANT_VS | (11-2));
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+
+	intel_bb_out(ibb, GEN7_3DSTATE_BINDING_TABLE_POINTERS_VS);
+	intel_bb_out(ibb, 0);
+
+	intel_bb_out(ibb, GEN7_3DSTATE_SAMPLER_STATE_POINTERS_VS);
+	intel_bb_out(ibb, 0);
+
+	intel_bb_out(ibb, GEN6_3DSTATE_VS | (9-2));
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
 }
 
 static void
-gen8_emit_hs(struct intel_batchbuffer *batch) {
-	OUT_BATCH(GEN7_3DSTATE_CONSTANT_HS | (11-2));
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-
-	OUT_BATCH(GEN7_3DSTATE_HS | (9-2));
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-
-	OUT_BATCH(GEN7_3DSTATE_BINDING_TABLE_POINTERS_HS);
-	OUT_BATCH(0);
-
-	OUT_BATCH(GEN8_3DSTATE_SAMPLER_STATE_POINTERS_HS);
-	OUT_BATCH(0);
+gen8_emit_hs(struct intel_bb *ibb) {
+	intel_bb_out(ibb, GEN7_3DSTATE_CONSTANT_HS | (11-2));
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+
+	intel_bb_out(ibb, GEN7_3DSTATE_HS | (9-2));
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+
+	intel_bb_out(ibb, GEN7_3DSTATE_BINDING_TABLE_POINTERS_HS);
+	intel_bb_out(ibb, 0);
+
+	intel_bb_out(ibb, GEN8_3DSTATE_SAMPLER_STATE_POINTERS_HS);
+	intel_bb_out(ibb, 0);
 }
 
 static void
-gen8_emit_gs(struct intel_batchbuffer *batch) {
-	OUT_BATCH(GEN6_3DSTATE_CONSTANT_GS | (11-2));
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-
-	OUT_BATCH(GEN6_3DSTATE_GS | (10-2));
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-
-	OUT_BATCH(GEN7_3DSTATE_BINDING_TABLE_POINTERS_GS);
-	OUT_BATCH(0);
-
-	OUT_BATCH(GEN7_3DSTATE_SAMPLER_STATE_POINTERS_GS);
-	OUT_BATCH(0);
+gen8_emit_gs(struct intel_bb *ibb) {
+	intel_bb_out(ibb, GEN6_3DSTATE_CONSTANT_GS | (11-2));
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+
+	intel_bb_out(ibb, GEN6_3DSTATE_GS | (10-2));
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+
+	intel_bb_out(ibb, GEN7_3DSTATE_BINDING_TABLE_POINTERS_GS);
+	intel_bb_out(ibb, 0);
+
+	intel_bb_out(ibb, GEN7_3DSTATE_SAMPLER_STATE_POINTERS_GS);
+	intel_bb_out(ibb, 0);
 }
 
 static void
-gen9_emit_ds(struct intel_batchbuffer *batch) {
-	OUT_BATCH(GEN7_3DSTATE_CONSTANT_DS | (11-2));
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-
-	OUT_BATCH(GEN7_3DSTATE_DS | (11-2));
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-
-	OUT_BATCH(GEN7_3DSTATE_BINDING_TABLE_POINTERS_DS);
-	OUT_BATCH(0);
-
-	OUT_BATCH(GEN8_3DSTATE_SAMPLER_STATE_POINTERS_DS);
-	OUT_BATCH(0);
+gen9_emit_ds(struct intel_bb *ibb) {
+	intel_bb_out(ibb, GEN7_3DSTATE_CONSTANT_DS | (11-2));
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+
+	intel_bb_out(ibb, GEN7_3DSTATE_DS | (11-2));
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+
+	intel_bb_out(ibb, GEN7_3DSTATE_BINDING_TABLE_POINTERS_DS);
+	intel_bb_out(ibb, 0);
+
+	intel_bb_out(ibb, GEN8_3DSTATE_SAMPLER_STATE_POINTERS_DS);
+	intel_bb_out(ibb, 0);
 }
 
 
 static void
-gen8_emit_wm_hz_op(struct intel_batchbuffer *batch) {
-	OUT_BATCH(GEN8_3DSTATE_WM_HZ_OP | (5-2));
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
+gen8_emit_wm_hz_op(struct intel_bb *ibb) {
+	intel_bb_out(ibb, GEN8_3DSTATE_WM_HZ_OP | (5-2));
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
 }
 
 static void
-gen8_emit_null_state(struct intel_batchbuffer *batch) {
-	gen8_emit_wm_hz_op(batch);
-	gen8_emit_hs(batch);
-	OUT_BATCH(GEN7_3DSTATE_TE | (4-2));
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	gen8_emit_gs(batch);
-	gen9_emit_ds(batch);
-	gen8_emit_vs(batch);
+gen8_emit_null_state(struct intel_bb *ibb) {
+	gen8_emit_wm_hz_op(ibb);
+	gen8_emit_hs(ibb);
+	intel_bb_out(ibb, GEN7_3DSTATE_TE | (4-2));
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	gen8_emit_gs(ibb);
+	gen9_emit_ds(ibb);
+	gen8_emit_vs(ibb);
 }
 
 static void
-gen7_emit_clip(struct intel_batchbuffer *batch) {
-	OUT_BATCH(GEN6_3DSTATE_CLIP | (4 - 2));
-	OUT_BATCH(0);
-	OUT_BATCH(0); /*  pass-through */
-	OUT_BATCH(0);
+gen7_emit_clip(struct intel_bb *ibb) {
+	intel_bb_out(ibb, GEN6_3DSTATE_CLIP | (4 - 2));
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0); /*  pass-through */
+	intel_bb_out(ibb, 0);
 }
 
 static void
-gen8_emit_sf(struct intel_batchbuffer *batch)
+gen8_emit_sf(struct intel_bb *ibb)
 {
 	int i;
 
-	OUT_BATCH(GEN7_3DSTATE_SBE | (6 - 2));
-	OUT_BATCH(1 << GEN7_SBE_NUM_OUTPUTS_SHIFT |
-		  GEN8_SBE_FORCE_URB_ENTRY_READ_LENGTH |
-		  GEN8_SBE_FORCE_URB_ENTRY_READ_OFFSET |
-		  1 << GEN7_SBE_URB_ENTRY_READ_LENGTH_SHIFT |
-		  1 << GEN8_SBE_URB_ENTRY_READ_OFFSET_SHIFT);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(GEN9_SBE_ACTIVE_COMPONENT_XYZW << 0);
-	OUT_BATCH(0);
-
-	OUT_BATCH(GEN8_3DSTATE_SBE_SWIZ | (11 - 2));
+	intel_bb_out(ibb, GEN7_3DSTATE_SBE | (6 - 2));
+	intel_bb_out(ibb, 1 << GEN7_SBE_NUM_OUTPUTS_SHIFT |
+		     GEN8_SBE_FORCE_URB_ENTRY_READ_LENGTH |
+		     GEN8_SBE_FORCE_URB_ENTRY_READ_OFFSET |
+		     1 << GEN7_SBE_URB_ENTRY_READ_LENGTH_SHIFT |
+		     1 << GEN8_SBE_URB_ENTRY_READ_OFFSET_SHIFT);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, GEN9_SBE_ACTIVE_COMPONENT_XYZW << 0);
+	intel_bb_out(ibb, 0);
+
+	intel_bb_out(ibb, GEN8_3DSTATE_SBE_SWIZ | (11 - 2));
 	for (i = 0; i < 8; i++)
-		OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-
-	OUT_BATCH(GEN8_3DSTATE_RASTER | (5 - 2));
-	OUT_BATCH(GEN8_RASTER_FRONT_WINDING_CCW | GEN8_RASTER_CULL_NONE);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-
-	OUT_BATCH(GEN6_3DSTATE_SF | (4 - 2));
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
+		intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+
+	intel_bb_out(ibb, GEN8_3DSTATE_RASTER | (5 - 2));
+	intel_bb_out(ibb, GEN8_RASTER_FRONT_WINDING_CCW | GEN8_RASTER_CULL_NONE);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+
+	intel_bb_out(ibb, GEN6_3DSTATE_SF | (4 - 2));
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
 }
 
 static void
-gen8_emit_ps(struct intel_batchbuffer *batch, uint32_t kernel) {
+gen8_emit_ps(struct intel_bb *ibb, uint32_t kernel) {
 	const int max_threads = 63;
 
-	OUT_BATCH(GEN6_3DSTATE_WM | (2 - 2));
-	OUT_BATCH(/* XXX: I don't understand the BARYCENTRIC stuff, but it
+	intel_bb_out(ibb, GEN6_3DSTATE_WM | (2 - 2));
+	intel_bb_out(ibb, /* XXX: I don't understand the BARYCENTRIC stuff, but it
 		   * appears we need it to put our setup data in the place we
 		   * expect (g6, see below) */
-		  GEN8_3DSTATE_PS_PERSPECTIVE_PIXEL_BARYCENTRIC);
-
-	OUT_BATCH(GEN6_3DSTATE_CONSTANT_PS | (11-2));
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-
-	OUT_BATCH(GEN7_3DSTATE_PS | (12-2));
-	OUT_BATCH(kernel);
-	OUT_BATCH(0); /* kernel hi */
-	OUT_BATCH(1 << GEN6_3DSTATE_WM_SAMPLER_COUNT_SHIFT |
-		  2 << GEN6_3DSTATE_WM_BINDING_TABLE_ENTRY_COUNT_SHIFT);
-	OUT_BATCH(0); /* scratch space stuff */
-	OUT_BATCH(0); /* scratch hi */
-	OUT_BATCH((max_threads - 1) << GEN8_3DSTATE_PS_MAX_THREADS_SHIFT |
-		  GEN6_3DSTATE_WM_16_DISPATCH_ENABLE);
-	OUT_BATCH(6 << GEN6_3DSTATE_WM_DISPATCH_START_GRF_0_SHIFT);
-	OUT_BATCH(0); // kernel 1
-	OUT_BATCH(0); /* kernel 1 hi */
-	OUT_BATCH(0); // kernel 2
-	OUT_BATCH(0); /* kernel 2 hi */
-
-	OUT_BATCH(GEN8_3DSTATE_PS_BLEND | (2 - 2));
-	OUT_BATCH(GEN8_PS_BLEND_HAS_WRITEABLE_RT);
-
-	OUT_BATCH(GEN8_3DSTATE_PS_EXTRA | (2 - 2));
-	OUT_BATCH(GEN8_PSX_PIXEL_SHADER_VALID | GEN8_PSX_ATTRIBUTE_ENABLE);
+		     GEN8_3DSTATE_PS_PERSPECTIVE_PIXEL_BARYCENTRIC);
+
+	intel_bb_out(ibb, GEN6_3DSTATE_CONSTANT_PS | (11-2));
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+
+	intel_bb_out(ibb, GEN7_3DSTATE_PS | (12-2));
+	intel_bb_out(ibb, kernel);
+	intel_bb_out(ibb, 0); /* kernel hi */
+	intel_bb_out(ibb, 1 << GEN6_3DSTATE_WM_SAMPLER_COUNT_SHIFT |
+		     2 << GEN6_3DSTATE_WM_BINDING_TABLE_ENTRY_COUNT_SHIFT);
+	intel_bb_out(ibb, 0); /* scratch space stuff */
+	intel_bb_out(ibb, 0); /* scratch hi */
+	intel_bb_out(ibb, (max_threads - 1) << GEN8_3DSTATE_PS_MAX_THREADS_SHIFT |
+		     GEN6_3DSTATE_WM_16_DISPATCH_ENABLE);
+	intel_bb_out(ibb, 6 << GEN6_3DSTATE_WM_DISPATCH_START_GRF_0_SHIFT);
+	intel_bb_out(ibb, 0); // kernel 1
+	intel_bb_out(ibb, 0); /* kernel 1 hi */
+	intel_bb_out(ibb, 0); // kernel 2
+	intel_bb_out(ibb, 0); /* kernel 2 hi */
+
+	intel_bb_out(ibb, GEN8_3DSTATE_PS_BLEND | (2 - 2));
+	intel_bb_out(ibb, GEN8_PS_BLEND_HAS_WRITEABLE_RT);
+
+	intel_bb_out(ibb, GEN8_3DSTATE_PS_EXTRA | (2 - 2));
+	intel_bb_out(ibb, GEN8_PSX_PIXEL_SHADER_VALID | GEN8_PSX_ATTRIBUTE_ENABLE);
 }
 
 static void
-gen9_emit_depth(struct intel_batchbuffer *batch)
+gen9_emit_depth(struct intel_bb *ibb)
 {
-	OUT_BATCH(GEN8_3DSTATE_WM_DEPTH_STENCIL | (4 - 2));
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-
-	OUT_BATCH(GEN7_3DSTATE_DEPTH_BUFFER | (8-2));
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-
-	OUT_BATCH(GEN8_3DSTATE_HIER_DEPTH_BUFFER | (5-2));
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-
-	OUT_BATCH(GEN8_3DSTATE_STENCIL_BUFFER | (5-2));
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
+	intel_bb_out(ibb, GEN8_3DSTATE_WM_DEPTH_STENCIL | (4 - 2));
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+
+	intel_bb_out(ibb, GEN7_3DSTATE_DEPTH_BUFFER | (8-2));
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+
+	intel_bb_out(ibb, GEN8_3DSTATE_HIER_DEPTH_BUFFER | (5-2));
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+
+	intel_bb_out(ibb, GEN8_3DSTATE_STENCIL_BUFFER | (5-2));
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
 }
 
 static void
-gen7_emit_clear(struct intel_batchbuffer *batch) {
-	OUT_BATCH(GEN7_3DSTATE_CLEAR_PARAMS | (3-2));
-	OUT_BATCH(0);
-	OUT_BATCH(1); // clear valid
+gen7_emit_clear(struct intel_bb *ibb) {
+	intel_bb_out(ibb, GEN7_3DSTATE_CLEAR_PARAMS | (3-2));
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 1); // clear valid
 }
 
 static void
-gen6_emit_drawing_rectangle(struct intel_batchbuffer *batch, const struct igt_buf *dst)
+gen6_emit_drawing_rectangle(struct intel_bb *ibb, const struct intel_buf *dst)
 {
-	OUT_BATCH(GEN4_3DSTATE_DRAWING_RECTANGLE | (4 - 2));
-	OUT_BATCH(0);
-	OUT_BATCH((igt_buf_height(dst) - 1) << 16 | (igt_buf_width(dst) - 1));
-	OUT_BATCH(0);
+	intel_bb_out(ibb, GEN4_3DSTATE_DRAWING_RECTANGLE | (4 - 2));
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, (intel_buf_height(dst) - 1) << 16 | (intel_buf_width(dst) - 1));
+	intel_bb_out(ibb, 0);
 }
 
-static void gen8_emit_vf_topology(struct intel_batchbuffer *batch)
+static void gen8_emit_vf_topology(struct intel_bb *ibb)
 {
-	OUT_BATCH(GEN8_3DSTATE_VF_TOPOLOGY);
-	OUT_BATCH(_3DPRIM_RECTLIST);
+	intel_bb_out(ibb, GEN8_3DSTATE_VF_TOPOLOGY);
+	intel_bb_out(ibb, _3DPRIM_RECTLIST);
 }
 
 /* Vertex elements MUST be defined before this according to spec */
-static void gen8_emit_primitive(struct intel_batchbuffer *batch, uint32_t offset)
+static void gen8_emit_primitive(struct intel_bb *ibb, uint32_t offset)
 {
-	OUT_BATCH(GEN8_3DSTATE_VF | (2 - 2));
-	OUT_BATCH(0);
-
-	OUT_BATCH(GEN8_3DSTATE_VF_INSTANCING | (3 - 2));
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-
-	OUT_BATCH(GEN4_3DPRIMITIVE | (7-2));
-	OUT_BATCH(0);	/* gen8+ ignore the topology type field */
-	OUT_BATCH(3);	/* vertex count */
-	OUT_BATCH(0);	/*  We're specifying this instead with offset in GEN6_3DSTATE_VERTEX_BUFFERS */
-	OUT_BATCH(1);	/* single instance */
-	OUT_BATCH(0);	/* start instance location */
-	OUT_BATCH(0);	/* index buffer offset, ignored */
+	intel_bb_out(ibb, GEN8_3DSTATE_VF | (2 - 2));
+	intel_bb_out(ibb, 0);
+
+	intel_bb_out(ibb, GEN8_3DSTATE_VF_INSTANCING | (3 - 2));
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+
+	intel_bb_out(ibb, GEN4_3DPRIMITIVE | (7-2));
+	intel_bb_out(ibb, 0);	/* gen8+ ignore the topology type field */
+	intel_bb_out(ibb, 3);	/* vertex count */
+	intel_bb_out(ibb, 0);	/*  We're specifying this instead with offset in GEN6_3DSTATE_VERTEX_BUFFERS */
+	intel_bb_out(ibb, 1);	/* single instance */
+	intel_bb_out(ibb, 0);	/* start instance location */
+	intel_bb_out(ibb, 0);	/* index buffer offset, ignored */
 }
 
 /* The general rule is if it's named gen6 it is directly copied from
@@ -990,166 +876,159 @@ static void gen8_emit_primitive(struct intel_batchbuffer *batch, uint32_t offset
 #define BATCH_STATE_SPLIT 2048
 
 static
-void _gen9_render_copyfunc(struct intel_batchbuffer *batch,
-			  drm_intel_context *context,
-			  const struct igt_buf *src, unsigned src_x,
-			  unsigned src_y, unsigned width, unsigned height,
-			  const struct igt_buf *dst, unsigned dst_x,
-			  unsigned dst_y,
-			  drm_intel_bo *aux_pgtable_bo,
-			  const uint32_t ps_kernel[][4],
-			  uint32_t ps_kernel_size)
+void _gen9_render_copyfunc(struct intel_bb *ibb,
+			   uint32_t ctx,
+			   struct intel_buf *src,
+			   unsigned int src_x, unsigned int src_y,
+			   unsigned int width, unsigned int height,
+			   struct intel_buf *dst,
+			   unsigned int dst_x, unsigned int dst_y,
+			   struct intel_buf *aux_pgtable_buf,
+			   const uint32_t ps_kernel[][4],
+			   uint32_t ps_kernel_size)
 {
 	uint32_t ps_sampler_state, ps_kernel_off, ps_binding_table;
 	uint32_t scissor_state;
 	uint32_t vertex_buffer;
-	uint32_t batch_end;
 	uint32_t aux_pgtable_state;
 
 	igt_assert(src->bpp == dst->bpp);
-	intel_batchbuffer_flush_with_context(batch, context);
 
-	intel_batchbuffer_align(batch, 8);
+	intel_bb_flush_render_with_context(ibb, ctx);
 
-	batch->ptr = &batch->buffer[BATCH_STATE_SPLIT];
+	intel_bb_add_intel_buf(ibb, dst, true);
+	intel_bb_add_intel_buf(ibb, src, false);
 
-	annotation_init(&aub_annotations);
+	intel_bb_ptr_set(ibb, BATCH_STATE_SPLIT);
 
-	ps_binding_table  = gen8_bind_surfaces(batch, src, dst);
-	ps_sampler_state  = gen8_create_sampler(batch);
-	ps_kernel_off = gen8_fill_ps(batch, ps_kernel, ps_kernel_size);
-	vertex_buffer = gen7_fill_vertex_buffer_data(batch, src,
+	ps_binding_table  = gen8_bind_surfaces(ibb, src, dst);
+	ps_sampler_state  = gen8_create_sampler(ibb);
+	ps_kernel_off = gen8_fill_ps(ibb, ps_kernel, ps_kernel_size);
+	vertex_buffer = gen7_fill_vertex_buffer_data(ibb, src,
 						     src_x, src_y,
 						     dst_x, dst_y,
 						     width, height);
-	cc.cc_state = gen6_create_cc_state(batch);
-	cc.blend_state = gen8_create_blend_state(batch);
-	viewport.cc_state = gen6_create_cc_viewport(batch);
-	viewport.sf_clip_state = gen7_create_sf_clip_viewport(batch);
-	scissor_state = gen6_create_scissor_rect(batch);
-
-	aux_pgtable_state = gen12_create_aux_pgtable_state(batch,
-							   aux_pgtable_bo);
-
-	/* TODO: theree is other state which isn't setup */
+	cc.cc_state = gen6_create_cc_state(ibb);
+	cc.blend_state = gen8_create_blend_state(ibb);
+	viewport.cc_state = gen6_create_cc_viewport(ibb);
+	viewport.sf_clip_state = gen7_create_sf_clip_viewport(ibb);
+	scissor_state = gen6_create_scissor_rect(ibb);
+	aux_pgtable_state = gen12_create_aux_pgtable_state(ibb, aux_pgtable_buf);
 
-	assert(batch->ptr < &batch->buffer[4095]);
-
-	batch->ptr = batch->buffer;
+	/* TODO: there is other state which isn't setup */
+	intel_bb_ptr_set(ibb, 0);
 
 	/* Start emitting the commands. The order roughly follows the mesa blorp
 	 * order */
-	OUT_BATCH(G4X_PIPELINE_SELECT | PIPELINE_SELECT_3D |
-				GEN9_PIPELINE_SELECTION_MASK);
-
-	gen12_emit_aux_pgtable_state(batch, aux_pgtable_state, true);
-
-	gen8_emit_sip(batch);
-
-	gen7_emit_push_constants(batch);
+	intel_bb_out(ibb, G4X_PIPELINE_SELECT | PIPELINE_SELECT_3D |
+		     GEN9_PIPELINE_SELECTION_MASK);
 
-	gen9_emit_state_base_address(batch);
+	gen12_emit_aux_pgtable_state(ibb, aux_pgtable_state, true);
 
-	OUT_BATCH(GEN7_3DSTATE_VIEWPORT_STATE_POINTERS_CC);
-	OUT_BATCH(viewport.cc_state);
-	OUT_BATCH(GEN8_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP);
-	OUT_BATCH(viewport.sf_clip_state);
+	gen8_emit_sip(ibb);
 
-	gen7_emit_urb(batch);
+	gen7_emit_push_constants(ibb);
 
-	gen8_emit_cc(batch);
+	gen9_emit_state_base_address(ibb);
 
-	gen8_emit_multisample(batch);
+	intel_bb_out(ibb, GEN7_3DSTATE_VIEWPORT_STATE_POINTERS_CC);
+	intel_bb_out(ibb, viewport.cc_state);
+	intel_bb_out(ibb, GEN8_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP);
+	intel_bb_out(ibb, viewport.sf_clip_state);
 
-	gen8_emit_null_state(batch);
+	gen7_emit_urb(ibb);
 
-	OUT_BATCH(GEN7_3DSTATE_STREAMOUT | (5 - 2));
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
-	OUT_BATCH(0);
+	gen8_emit_cc(ibb);
 
-	gen7_emit_clip(batch);
+	gen8_emit_multisample(ibb);
 
-	gen8_emit_sf(batch);
+	gen8_emit_null_state(ibb);
 
-	gen8_emit_ps(batch, ps_kernel_off);
+	intel_bb_out(ibb, GEN7_3DSTATE_STREAMOUT | (5 - 2));
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);
 
-	OUT_BATCH(GEN7_3DSTATE_BINDING_TABLE_POINTERS_PS);
-	OUT_BATCH(ps_binding_table);
+	gen7_emit_clip(ibb);
 
-	OUT_BATCH(GEN7_3DSTATE_SAMPLER_STATE_POINTERS_PS);
-	OUT_BATCH(ps_sampler_state);
+	gen8_emit_sf(ibb);
 
-	OUT_BATCH(GEN8_3DSTATE_SCISSOR_STATE_POINTERS);
-	OUT_BATCH(scissor_state);
+	gen8_emit_ps(ibb, ps_kernel_off);
 
-	gen9_emit_depth(batch);
+	intel_bb_out(ibb, GEN7_3DSTATE_BINDING_TABLE_POINTERS_PS);
+	intel_bb_out(ibb, ps_binding_table);
 
-	gen7_emit_clear(batch);
+	intel_bb_out(ibb, GEN7_3DSTATE_SAMPLER_STATE_POINTERS_PS);
+	intel_bb_out(ibb, ps_sampler_state);
 
-	gen6_emit_drawing_rectangle(batch, dst);
+	intel_bb_out(ibb, GEN8_3DSTATE_SCISSOR_STATE_POINTERS);
+	intel_bb_out(ibb, scissor_state);
 
-	gen7_emit_vertex_buffer(batch, vertex_buffer);
-	gen6_emit_vertex_elements(batch);
+	gen9_emit_depth(ibb);
 
-	gen8_emit_vf_topology(batch);
-	gen8_emit_primitive(batch, vertex_buffer);
+	gen7_emit_clear(ibb);
 
-	OUT_BATCH(MI_BATCH_BUFFER_END);
+	gen6_emit_drawing_rectangle(ibb, dst);
 
-	batch_end = intel_batchbuffer_align(batch, 8);
-	assert(batch_end < BATCH_STATE_SPLIT);
-	annotation_add_batch(&aub_annotations, batch_end);
+	gen7_emit_vertex_buffer(ibb, vertex_buffer);
+	gen6_emit_vertex_elements(ibb);
 
-	dump_batch(batch);
+	gen8_emit_vf_topology(ibb);
+	gen8_emit_primitive(ibb, vertex_buffer);
 
-	annotation_flush(&aub_annotations, batch);
-
-	gen6_render_flush(batch, context, batch_end);
-	intel_batchbuffer_reset(batch);
+	intel_bb_emit_bbe(ibb);
+	intel_bb_exec_with_context(ibb, intel_bb_offset(ibb), ctx,
+				   I915_EXEC_RENDER | I915_EXEC_NO_RELOC,
+				   false);
+	dump_batch(ibb);
+	intel_bb_reset(ibb, false);
 }
 
-void gen9_render_copyfunc(struct intel_batchbuffer *batch,
-			  drm_intel_context *context,
-			  const struct igt_buf *src, unsigned src_x, unsigned src_y,
-			  unsigned width, unsigned height,
-			  const struct igt_buf *dst, unsigned dst_x, unsigned dst_y)
+void gen9_render_copyfunc(struct intel_bb *ibb,
+			  uint32_t ctx,
+			  struct intel_buf *src,
+			  unsigned int src_x, unsigned int src_y,
+			  unsigned int width, unsigned int height,
+			  struct intel_buf *dst,
+			  unsigned int dst_x, unsigned int dst_y)
 
 {
-	_gen9_render_copyfunc(batch, context, src, src_x, src_y,
+	_gen9_render_copyfunc(ibb, ctx, src, src_x, src_y,
 			  width, height, dst, dst_x, dst_y, NULL,
 			  ps_kernel_gen9, sizeof(ps_kernel_gen9));
 }
 
-void gen11_render_copyfunc(struct intel_batchbuffer *batch,
-			  drm_intel_context *context,
-			  const struct igt_buf *src, unsigned src_x, unsigned src_y,
-			  unsigned width, unsigned height,
-			  const struct igt_buf *dst, unsigned dst_x, unsigned dst_y)
-
+void gen11_render_copyfunc(struct intel_bb *ibb,
+			   uint32_t ctx,
+			   struct intel_buf *src,
+			   unsigned int src_x, unsigned int src_y,
+			   unsigned int width, unsigned int height,
+			   struct intel_buf *dst,
+			   unsigned int dst_x, unsigned int dst_y)
 {
-	_gen9_render_copyfunc(batch, context, src, src_x, src_y,
+	_gen9_render_copyfunc(ibb, ctx, src, src_x, src_y,
 			  width, height, dst, dst_x, dst_y, NULL,
 			  ps_kernel_gen11, sizeof(ps_kernel_gen11));
 }
 
-void gen12_render_copyfunc(struct intel_batchbuffer *batch,
-			   drm_intel_context *context,
-			   const struct igt_buf *src, unsigned src_x, unsigned src_y,
-			   unsigned width, unsigned height,
-			   const struct igt_buf *dst, unsigned dst_x, unsigned dst_y)
-
+void gen12_render_copyfunc(struct intel_bb *ibb,
+			   uint32_t ctx,
+			   struct intel_buf *src,
+			   unsigned int src_x, unsigned int src_y,
+			   unsigned int width, unsigned int height,
+			   struct intel_buf *dst,
+			   unsigned int dst_x, unsigned int dst_y)
 {
 	struct aux_pgtable_info pgtable_info = { };
 
-	gen12_aux_pgtable_init(&pgtable_info, batch->bufmgr, src, dst);
+	gen12_aux_pgtable_init(&pgtable_info, ibb, src, dst);
 
-	_gen9_render_copyfunc(batch, context, src, src_x, src_y,
+	_gen9_render_copyfunc(ibb, ctx, src, src_x, src_y,
 			  width, height, dst, dst_x, dst_y,
-			  pgtable_info.pgtable_bo,
+			  pgtable_info.pgtable_buf,
 			  gen12_render_copy,
 			  sizeof(gen12_render_copy));
 
-	gen12_aux_pgtable_cleanup(&pgtable_info);
+	gen12_aux_pgtable_cleanup(ibb, &pgtable_info);
 }
diff --git a/lib/rendercopy_i830.c b/lib/rendercopy_i830.c
index ca815122..e755706e 100644
--- a/lib/rendercopy_i830.c
+++ b/lib/rendercopy_i830.c
@@ -11,7 +11,7 @@
 #include "drm.h"
 #include "i915_drm.h"
 #include "drmtest.h"
-#include "intel_bufmgr.h"
+#include "intel_bufops.h"
 #include "intel_batchbuffer.h"
 #include "intel_io.h"
 
@@ -72,75 +72,75 @@
 #define TB0A_ARG1_SEL_TEXEL3		(9 << 6)
 
 
-static void gen2_emit_invariant(struct intel_batchbuffer *batch)
+static void gen2_emit_invariant(struct intel_bb *ibb)
 {
 	int i;
 
 	for (i = 0; i < 4; i++) {
-		OUT_BATCH(_3DSTATE_MAP_CUBE | MAP_UNIT(i));
-		OUT_BATCH(_3DSTATE_MAP_TEX_STREAM_CMD | MAP_UNIT(i) |
-			  DISABLE_TEX_STREAM_BUMP |
-			  ENABLE_TEX_STREAM_COORD_SET | TEX_STREAM_COORD_SET(i) |
-			  ENABLE_TEX_STREAM_MAP_IDX | TEX_STREAM_MAP_IDX(i));
-		OUT_BATCH(_3DSTATE_MAP_COORD_TRANSFORM);
-		OUT_BATCH(DISABLE_TEX_TRANSFORM | TEXTURE_SET(i));
+		intel_bb_out(ibb, _3DSTATE_MAP_CUBE | MAP_UNIT(i));
+		intel_bb_out(ibb, _3DSTATE_MAP_TEX_STREAM_CMD | MAP_UNIT(i) |
+			     DISABLE_TEX_STREAM_BUMP |
+			     ENABLE_TEX_STREAM_COORD_SET | TEX_STREAM_COORD_SET(i) |
+			     ENABLE_TEX_STREAM_MAP_IDX | TEX_STREAM_MAP_IDX(i));
+		intel_bb_out(ibb, _3DSTATE_MAP_COORD_TRANSFORM);
+		intel_bb_out(ibb, DISABLE_TEX_TRANSFORM | TEXTURE_SET(i));
 	}
 
-	OUT_BATCH(_3DSTATE_MAP_COORD_SETBIND_CMD);
-	OUT_BATCH(TEXBIND_SET3(TEXCOORDSRC_VTXSET_3) |
-		  TEXBIND_SET2(TEXCOORDSRC_VTXSET_2) |
-		  TEXBIND_SET1(TEXCOORDSRC_VTXSET_1) |
-		  TEXBIND_SET0(TEXCOORDSRC_VTXSET_0));
-
-	OUT_BATCH(_3DSTATE_SCISSOR_ENABLE_CMD | DISABLE_SCISSOR_RECT);
-
-	OUT_BATCH(_3DSTATE_VERTEX_TRANSFORM);
-	OUT_BATCH(DISABLE_VIEWPORT_TRANSFORM | DISABLE_PERSPECTIVE_DIVIDE);
-
-	OUT_BATCH(_3DSTATE_W_STATE_CMD);
-	OUT_BATCH(MAGIC_W_STATE_DWORD1);
-	OUT_BATCH(0x3f800000 /* 1.0 in IEEE float */ );
-
-	OUT_BATCH(_3DSTATE_INDPT_ALPHA_BLEND_CMD |
-		  DISABLE_INDPT_ALPHA_BLEND |
-		  ENABLE_ALPHA_BLENDFUNC | ABLENDFUNC_ADD);
-
-	OUT_BATCH(_3DSTATE_CONST_BLEND_COLOR_CMD);
-	OUT_BATCH(0);
-
-	OUT_BATCH(_3DSTATE_MODES_1_CMD |
-		  ENABLE_COLR_BLND_FUNC | BLENDFUNC_ADD |
-		  ENABLE_SRC_BLND_FACTOR | SRC_BLND_FACT(BLENDFACTOR_ONE) |
-		  ENABLE_DST_BLND_FACTOR | DST_BLND_FACT(BLENDFACTOR_ZERO));
-
-	OUT_BATCH(_3DSTATE_ENABLES_1_CMD |
-		  DISABLE_LOGIC_OP |
-		  DISABLE_STENCIL_TEST |
-		  DISABLE_DEPTH_BIAS |
-		  DISABLE_SPEC_ADD |
-		  DISABLE_FOG |
-		  DISABLE_ALPHA_TEST |
-		  DISABLE_DEPTH_TEST |
-		  ENABLE_COLOR_BLEND);
-
-	OUT_BATCH(_3DSTATE_ENABLES_2_CMD |
-		  DISABLE_STENCIL_WRITE |
-		  DISABLE_DITHER |
-		  DISABLE_DEPTH_WRITE |
-		  ENABLE_COLOR_MASK |
-		  ENABLE_COLOR_WRITE |
-		  ENABLE_TEX_CACHE);
+	intel_bb_out(ibb, _3DSTATE_MAP_COORD_SETBIND_CMD);
+	intel_bb_out(ibb, TEXBIND_SET3(TEXCOORDSRC_VTXSET_3) |
+		     TEXBIND_SET2(TEXCOORDSRC_VTXSET_2) |
+		     TEXBIND_SET1(TEXCOORDSRC_VTXSET_1) |
+		     TEXBIND_SET0(TEXCOORDSRC_VTXSET_0));
+
+	intel_bb_out(ibb, _3DSTATE_SCISSOR_ENABLE_CMD | DISABLE_SCISSOR_RECT);
+
+	intel_bb_out(ibb, _3DSTATE_VERTEX_TRANSFORM);
+	intel_bb_out(ibb, DISABLE_VIEWPORT_TRANSFORM | DISABLE_PERSPECTIVE_DIVIDE);
+
+	intel_bb_out(ibb, _3DSTATE_W_STATE_CMD);
+	intel_bb_out(ibb, MAGIC_W_STATE_DWORD1);
+	intel_bb_out(ibb, 0x3f800000 /* 1.0 in IEEE float */);
+
+	intel_bb_out(ibb, _3DSTATE_INDPT_ALPHA_BLEND_CMD |
+		     DISABLE_INDPT_ALPHA_BLEND |
+		     ENABLE_ALPHA_BLENDFUNC | ABLENDFUNC_ADD);
+
+	intel_bb_out(ibb, _3DSTATE_CONST_BLEND_COLOR_CMD);
+	intel_bb_out(ibb, 0);
+
+	intel_bb_out(ibb, _3DSTATE_MODES_1_CMD |
+		     ENABLE_COLR_BLND_FUNC | BLENDFUNC_ADD |
+		     ENABLE_SRC_BLND_FACTOR | SRC_BLND_FACT(BLENDFACTOR_ONE) |
+		     ENABLE_DST_BLND_FACTOR | DST_BLND_FACT(BLENDFACTOR_ZERO));
+
+	intel_bb_out(ibb, _3DSTATE_ENABLES_1_CMD |
+		     DISABLE_LOGIC_OP |
+		     DISABLE_STENCIL_TEST |
+		     DISABLE_DEPTH_BIAS |
+		     DISABLE_SPEC_ADD |
+		     DISABLE_FOG |
+		     DISABLE_ALPHA_TEST |
+		     DISABLE_DEPTH_TEST |
+		     ENABLE_COLOR_BLEND);
+
+	intel_bb_out(ibb, _3DSTATE_ENABLES_2_CMD |
+		     DISABLE_STENCIL_WRITE |
+		     DISABLE_DITHER |
+		     DISABLE_DEPTH_WRITE |
+		     ENABLE_COLOR_MASK |
+		     ENABLE_COLOR_WRITE |
+		     ENABLE_TEX_CACHE);
 }
 
-static void gen2_emit_target(struct intel_batchbuffer *batch,
-			     const struct igt_buf *dst)
+static void gen2_emit_target(struct intel_bb *ibb,
+			     const struct intel_buf *dst)
 {
 	uint32_t tiling;
 	uint32_t format;
 
 	igt_assert_lte(dst->surface[0].stride, 8192);
-	igt_assert_lte(igt_buf_width(dst), 2048);
-	igt_assert_lte(igt_buf_height(dst), 2048);
+	igt_assert_lte(intel_buf_width(dst), 2048);
+	igt_assert_lte(intel_buf_height(dst), 2048);
 
 	switch (dst->bpp) {
 		case 8: format = COLR_BUF_8BIT; break;
@@ -155,33 +155,35 @@ static void gen2_emit_target(struct intel_batchbuffer *batch,
 	if (dst->tiling == I915_TILING_Y)
 		tiling |= BUF_3D_TILE_WALK_Y;
 
-	OUT_BATCH(_3DSTATE_BUF_INFO_CMD);
-	OUT_BATCH(BUF_3D_ID_COLOR_BACK | tiling | BUF_3D_PITCH(dst->surface[0].stride));
-	OUT_RELOC(dst->bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, 0);
-
-	OUT_BATCH(_3DSTATE_DST_BUF_VARS_CMD);
-	OUT_BATCH(format |
-		  DSTORG_HORT_BIAS(0x8) |
-		  DSTORG_VERT_BIAS(0x8));
-
-	OUT_BATCH(_3DSTATE_DRAW_RECT_CMD);
-	OUT_BATCH(0);
-	OUT_BATCH(0);		/* ymin, xmin */
-	OUT_BATCH(DRAW_YMAX(igt_buf_height(dst) - 1) |
-		  DRAW_XMAX(igt_buf_width(dst) - 1));
-	OUT_BATCH(0);		/* yorig, xorig */
+	intel_bb_out(ibb, _3DSTATE_BUF_INFO_CMD);
+	intel_bb_out(ibb, BUF_3D_ID_COLOR_BACK | tiling |
+		     BUF_3D_PITCH(dst->surface[0].stride));
+	intel_bb_emit_reloc(ibb, dst->handle, I915_GEM_DOMAIN_RENDER,
+			    I915_GEM_DOMAIN_RENDER, 0, dst->addr.offset);
+
+	intel_bb_out(ibb, _3DSTATE_DST_BUF_VARS_CMD);
+	intel_bb_out(ibb, format |
+		     DSTORG_HORT_BIAS(0x8) |
+		     DSTORG_VERT_BIAS(0x8));
+
+	intel_bb_out(ibb, _3DSTATE_DRAW_RECT_CMD);
+	intel_bb_out(ibb, 0);
+	intel_bb_out(ibb, 0);		/* ymin, xmin */
+	intel_bb_out(ibb, DRAW_YMAX(intel_buf_height(dst) - 1) |
+		     DRAW_XMAX(intel_buf_width(dst) - 1));
+	intel_bb_out(ibb, 0);		/* yorig, xorig */
 }
 
-static void gen2_emit_texture(struct intel_batchbuffer *batch,
-			      const struct igt_buf *src,
+static void gen2_emit_texture(struct intel_bb *ibb,
+			      const struct intel_buf *src,
 			      int unit)
 {
 	uint32_t tiling;
 	uint32_t format;
 
 	igt_assert_lte(src->surface[0].stride, 8192);
-	igt_assert_lte(igt_buf_width(src), 2048);
-	igt_assert_lte(igt_buf_height(src), 2048);
+	igt_assert_lte(intel_buf_width(src), 2048);
+	igt_assert_lte(intel_buf_height(src), 2048);
 
 	switch (src->bpp) {
 		case 8: format = MAPSURF_8BIT | MT_8BIT_L8; break;
@@ -196,78 +198,84 @@ static void gen2_emit_texture(struct intel_batchbuffer *batch,
 	if (src->tiling == I915_TILING_Y)
 		tiling |= TM0S1_TILE_WALK;
 
-	OUT_BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_2 | LOAD_TEXTURE_MAP(unit) | 4);
-	OUT_RELOC(src->bo, I915_GEM_DOMAIN_SAMPLER, 0, 0);
-	OUT_BATCH((igt_buf_height(src) - 1) << TM0S1_HEIGHT_SHIFT |
-		  (igt_buf_width(src) - 1) << TM0S1_WIDTH_SHIFT |
-		  format | tiling);
-	OUT_BATCH((src->surface[0].stride / 4 - 1) << TM0S2_PITCH_SHIFT | TM0S2_MAP_2D);
-	OUT_BATCH(FILTER_NEAREST << TM0S3_MAG_FILTER_SHIFT |
-		  FILTER_NEAREST << TM0S3_MIN_FILTER_SHIFT |
-		  MIPFILTER_NONE << TM0S3_MIP_FILTER_SHIFT);
-	OUT_BATCH(0);	/* default color */
-
-	OUT_BATCH(_3DSTATE_MAP_COORD_SET_CMD | TEXCOORD_SET(unit) |
-		  ENABLE_TEXCOORD_PARAMS | TEXCOORDS_ARE_NORMAL |
-		  TEXCOORDTYPE_CARTESIAN |
-		  ENABLE_ADDR_V_CNTL | TEXCOORD_ADDR_V_MODE(TEXCOORDMODE_CLAMP_BORDER) |
-		  ENABLE_ADDR_U_CNTL | TEXCOORD_ADDR_U_MODE(TEXCOORDMODE_CLAMP_BORDER));
+	intel_bb_out(ibb, _3DSTATE_LOAD_STATE_IMMEDIATE_2 | LOAD_TEXTURE_MAP(unit) | 4);
+	intel_bb_emit_reloc(ibb, src->handle, I915_GEM_DOMAIN_SAMPLER, 0, 0,
+			    src->addr.offset);
+	intel_bb_out(ibb, (intel_buf_height(src) - 1) << TM0S1_HEIGHT_SHIFT |
+		     (intel_buf_width(src) - 1) << TM0S1_WIDTH_SHIFT |
+		     format | tiling);
+	intel_bb_out(ibb, (src->surface[0].stride / 4 - 1) << TM0S2_PITCH_SHIFT | TM0S2_MAP_2D);
+	intel_bb_out(ibb, FILTER_NEAREST << TM0S3_MAG_FILTER_SHIFT |
+		     FILTER_NEAREST << TM0S3_MIN_FILTER_SHIFT |
+		     MIPFILTER_NONE << TM0S3_MIP_FILTER_SHIFT);
+	intel_bb_out(ibb, 0);	/* default color */
+
+	intel_bb_out(ibb, _3DSTATE_MAP_COORD_SET_CMD | TEXCOORD_SET(unit) |
+		     ENABLE_TEXCOORD_PARAMS | TEXCOORDS_ARE_NORMAL |
+		     TEXCOORDTYPE_CARTESIAN |
+		     ENABLE_ADDR_V_CNTL | TEXCOORD_ADDR_V_MODE(TEXCOORDMODE_CLAMP_BORDER) |
+		     ENABLE_ADDR_U_CNTL | TEXCOORD_ADDR_U_MODE(TEXCOORDMODE_CLAMP_BORDER));
 }
 
-static void gen2_emit_copy_pipeline(struct intel_batchbuffer *batch)
+static void gen2_emit_copy_pipeline(struct intel_bb *ibb)
 {
-	OUT_BATCH(_3DSTATE_INDPT_ALPHA_BLEND_CMD | DISABLE_INDPT_ALPHA_BLEND);
-	OUT_BATCH(_3DSTATE_ENABLES_1_CMD | DISABLE_LOGIC_OP |
-		  DISABLE_STENCIL_TEST | DISABLE_DEPTH_BIAS |
-		  DISABLE_SPEC_ADD | DISABLE_FOG | DISABLE_ALPHA_TEST |
-		  DISABLE_COLOR_BLEND | DISABLE_DEPTH_TEST);
-
-	OUT_BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_2 |
-		  LOAD_TEXTURE_BLEND_STAGE(0) | 1);
-	OUT_BATCH(TB0C_LAST_STAGE | TB0C_RESULT_SCALE_1X |
-		  TB0C_OUTPUT_WRITE_CURRENT |
-		  TB0C_OP_ARG1 | TB0C_ARG1_SEL_TEXEL0);
-	OUT_BATCH(TB0A_RESULT_SCALE_1X | TB0A_OUTPUT_WRITE_CURRENT |
-		  TB0A_OP_ARG1 | TB0A_ARG1_SEL_TEXEL0);
+	intel_bb_out(ibb, _3DSTATE_INDPT_ALPHA_BLEND_CMD | DISABLE_INDPT_ALPHA_BLEND);
+	intel_bb_out(ibb, _3DSTATE_ENABLES_1_CMD | DISABLE_LOGIC_OP |
+		     DISABLE_STENCIL_TEST | DISABLE_DEPTH_BIAS |
+		     DISABLE_SPEC_ADD | DISABLE_FOG | DISABLE_ALPHA_TEST |
+		     DISABLE_COLOR_BLEND | DISABLE_DEPTH_TEST);
+
+	intel_bb_out(ibb, _3DSTATE_LOAD_STATE_IMMEDIATE_2 |
+		     LOAD_TEXTURE_BLEND_STAGE(0) | 1);
+	intel_bb_out(ibb, TB0C_LAST_STAGE | TB0C_RESULT_SCALE_1X |
+		     TB0C_OUTPUT_WRITE_CURRENT |
+		     TB0C_OP_ARG1 | TB0C_ARG1_SEL_TEXEL0);
+	intel_bb_out(ibb, TB0A_RESULT_SCALE_1X | TB0A_OUTPUT_WRITE_CURRENT |
+		     TB0A_OP_ARG1 | TB0A_ARG1_SEL_TEXEL0);
 }
 
-void gen2_render_copyfunc(struct intel_batchbuffer *batch,
-			  drm_intel_context *context,
-			  const struct igt_buf *src, unsigned src_x, unsigned src_y,
-			  unsigned width, unsigned height,
-			  const struct igt_buf *dst, unsigned dst_x, unsigned dst_y)
+void gen2_render_copyfunc(struct intel_bb *ibb,
+			  uint32_t ctx,
+			  struct intel_buf *src,
+			  uint32_t src_x, uint32_t src_y,
+			  uint32_t width, uint32_t height,
+			  struct intel_buf *dst,
+			  uint32_t dst_x, uint32_t dst_y)
 {
 	igt_assert(src->bpp == dst->bpp);
 
-	gen2_emit_invariant(batch);
-	gen2_emit_copy_pipeline(batch);
+	intel_bb_add_intel_buf(ibb, dst, true);
+	intel_bb_add_intel_buf(ibb, src, false);
 
-	gen2_emit_target(batch, dst);
-	gen2_emit_texture(batch, src, 0);
+	gen2_emit_invariant(ibb);
+	gen2_emit_copy_pipeline(ibb);
 
-	OUT_BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 |
-		  I1_LOAD_S(2) | I1_LOAD_S(3) | I1_LOAD_S(8) | 2);
-	OUT_BATCH(1<<12);
-	OUT_BATCH(S3_CULLMODE_NONE | S3_VERTEXHAS_XY);
-	OUT_BATCH(S8_ENABLE_COLOR_BUFFER_WRITE);
+	gen2_emit_target(ibb, dst);
+	gen2_emit_texture(ibb, src, 0);
 
-	OUT_BATCH(_3DSTATE_VERTEX_FORMAT_2_CMD | TEXCOORDFMT_2D << 0);
+	intel_bb_out(ibb, _3DSTATE_LOAD_STATE_IMMEDIATE_1 |
+		     I1_LOAD_S(2) | I1_LOAD_S(3) | I1_LOAD_S(8) | 2);
+	intel_bb_out(ibb, 1<<12);
+	intel_bb_out(ibb, S3_CULLMODE_NONE | S3_VERTEXHAS_XY);
+	intel_bb_out(ibb, S8_ENABLE_COLOR_BUFFER_WRITE);
 
-	OUT_BATCH(PRIM3D_INLINE | PRIM3D_RECTLIST | (3*4 -1));
-	emit_vertex(batch, dst_x + width);
-	emit_vertex(batch, dst_y + height);
-	emit_vertex_normalized(batch, src_x + width, igt_buf_width(src));
-	emit_vertex_normalized(batch, src_y + height, igt_buf_height(src));
+	intel_bb_out(ibb, _3DSTATE_VERTEX_FORMAT_2_CMD | TEXCOORDFMT_2D << 0);
 
-	emit_vertex(batch, dst_x);
-	emit_vertex(batch, dst_y + height);
-	emit_vertex_normalized(batch, src_x, igt_buf_width(src));
-	emit_vertex_normalized(batch, src_y + height, igt_buf_height(src));
+	intel_bb_out(ibb, PRIM3D_INLINE | PRIM3D_RECTLIST | (3*4 - 1));
+	emit_vertex(ibb, dst_x + width);
+	emit_vertex(ibb, dst_y + height);
+	emit_vertex_normalized(ibb, src_x + width, intel_buf_width(src));
+	emit_vertex_normalized(ibb, src_y + height, intel_buf_height(src));
 
-	emit_vertex(batch, dst_x);
-	emit_vertex(batch, dst_y);
-	emit_vertex_normalized(batch, src_x, igt_buf_width(src));
-	emit_vertex_normalized(batch, src_y, igt_buf_height(src));
+	emit_vertex(ibb, dst_x);
+	emit_vertex(ibb, dst_y + height);
+	emit_vertex_normalized(ibb, src_x, intel_buf_width(src));
+	emit_vertex_normalized(ibb, src_y + height, intel_buf_height(src));
 
-	intel_batchbuffer_flush(batch);
+	emit_vertex(ibb, dst_x);
+	emit_vertex(ibb, dst_y);
+	emit_vertex_normalized(ibb, src_x, intel_buf_width(src));
+	emit_vertex_normalized(ibb, src_y, intel_buf_height(src));
+
+	intel_bb_flush_blit_with_context(ibb, ctx);
 }
diff --git a/lib/rendercopy_i915.c b/lib/rendercopy_i915.c
index 56e1863e..b16d4f12 100644
--- a/lib/rendercopy_i915.c
+++ b/lib/rendercopy_i915.c
@@ -11,7 +11,7 @@
 #include "drm.h"
 #include "i915_drm.h"
 #include "drmtest.h"
-#include "intel_bufmgr.h"
+#include "intel_bufops.h"
 #include "intel_batchbuffer.h"
 #include "intel_io.h"
 
@@ -19,68 +19,73 @@
 #include "i915_3d.h"
 #include "rendercopy.h"
 
-void gen3_render_copyfunc(struct intel_batchbuffer *batch,
-			  drm_intel_context *context,
-			  const struct igt_buf *src, unsigned src_x, unsigned src_y,
-			  unsigned width, unsigned height,
-			  const struct igt_buf *dst, unsigned dst_x, unsigned dst_y)
+void gen3_render_copyfunc(struct intel_bb *ibb,
+			  uint32_t ctx,
+			  struct intel_buf *src,
+			  uint32_t src_x, uint32_t src_y,
+			  uint32_t width, uint32_t height,
+			  struct intel_buf *dst,
+			  uint32_t dst_x, uint32_t dst_y)
 {
 	igt_assert(src->bpp == dst->bpp);
 
+	intel_bb_add_intel_buf(ibb, dst, true);
+	intel_bb_add_intel_buf(ibb, src, false);
+
 	/* invariant state */
 	{
-		OUT_BATCH(_3DSTATE_AA_CMD |
-			  AA_LINE_ECAAR_WIDTH_ENABLE |
-			  AA_LINE_ECAAR_WIDTH_1_0 |
-			  AA_LINE_REGION_WIDTH_ENABLE | AA_LINE_REGION_WIDTH_1_0);
-		OUT_BATCH(_3DSTATE_INDEPENDENT_ALPHA_BLEND_CMD |
-			  IAB_MODIFY_ENABLE |
-			  IAB_MODIFY_FUNC | (BLENDFUNC_ADD << IAB_FUNC_SHIFT) |
-			  IAB_MODIFY_SRC_FACTOR | (BLENDFACT_ONE <<
-						   IAB_SRC_FACTOR_SHIFT) |
-			  IAB_MODIFY_DST_FACTOR | (BLENDFACT_ZERO <<
-						   IAB_DST_FACTOR_SHIFT));
-		OUT_BATCH(_3DSTATE_DFLT_DIFFUSE_CMD);
-		OUT_BATCH(0);
-		OUT_BATCH(_3DSTATE_DFLT_SPEC_CMD);
-		OUT_BATCH(0);
-		OUT_BATCH(_3DSTATE_DFLT_Z_CMD);
-		OUT_BATCH(0);
-		OUT_BATCH(_3DSTATE_COORD_SET_BINDINGS |
-			  CSB_TCB(0, 0) |
-			  CSB_TCB(1, 1) |
-			  CSB_TCB(2, 2) |
-			  CSB_TCB(3, 3) |
-			  CSB_TCB(4, 4) |
-			  CSB_TCB(5, 5) | CSB_TCB(6, 6) | CSB_TCB(7, 7));
-		OUT_BATCH(_3DSTATE_RASTER_RULES_CMD |
-			  ENABLE_POINT_RASTER_RULE |
-			  OGL_POINT_RASTER_RULE |
-			  ENABLE_LINE_STRIP_PROVOKE_VRTX |
-			  ENABLE_TRI_FAN_PROVOKE_VRTX |
-			  LINE_STRIP_PROVOKE_VRTX(1) |
-			  TRI_FAN_PROVOKE_VRTX(2) | ENABLE_TEXKILL_3D_4D | TEXKILL_4D);
-		OUT_BATCH(_3DSTATE_MODES_4_CMD |
-			  ENABLE_LOGIC_OP_FUNC | LOGIC_OP_FUNC(LOGICOP_COPY) |
-			  ENABLE_STENCIL_WRITE_MASK | STENCIL_WRITE_MASK(0xff) |
-			  ENABLE_STENCIL_TEST_MASK | STENCIL_TEST_MASK(0xff));
-		OUT_BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 | I1_LOAD_S(3) | I1_LOAD_S(4) | I1_LOAD_S(5) | 2);
-		OUT_BATCH(0x00000000);	/* Disable texture coordinate wrap-shortest */
-		OUT_BATCH((1 << S4_POINT_WIDTH_SHIFT) |
-			  S4_LINE_WIDTH_ONE |
-			  S4_CULLMODE_NONE |
-			  S4_VFMT_XY);
-		OUT_BATCH(0x00000000);	/* Stencil. */
-		OUT_BATCH(_3DSTATE_SCISSOR_ENABLE_CMD | DISABLE_SCISSOR_RECT);
-		OUT_BATCH(_3DSTATE_SCISSOR_RECT_0_CMD);
-		OUT_BATCH(0);
-		OUT_BATCH(0);
-		OUT_BATCH(_3DSTATE_DEPTH_SUBRECT_DISABLE);
-		OUT_BATCH(_3DSTATE_LOAD_INDIRECT | 0);	/* disable indirect state */
-		OUT_BATCH(0);
-		OUT_BATCH(_3DSTATE_STIPPLE);
-		OUT_BATCH(0x00000000);
-		OUT_BATCH(_3DSTATE_BACKFACE_STENCIL_OPS | BFO_ENABLE_STENCIL_TWO_SIDE | 0);
+		intel_bb_out(ibb, _3DSTATE_AA_CMD |
+			     AA_LINE_ECAAR_WIDTH_ENABLE |
+			     AA_LINE_ECAAR_WIDTH_1_0 |
+			     AA_LINE_REGION_WIDTH_ENABLE | AA_LINE_REGION_WIDTH_1_0);
+		intel_bb_out(ibb, _3DSTATE_INDEPENDENT_ALPHA_BLEND_CMD |
+			     IAB_MODIFY_ENABLE |
+			     IAB_MODIFY_FUNC | (BLENDFUNC_ADD << IAB_FUNC_SHIFT) |
+			     IAB_MODIFY_SRC_FACTOR | (BLENDFACT_ONE <<
+						      IAB_SRC_FACTOR_SHIFT) |
+			     IAB_MODIFY_DST_FACTOR | (BLENDFACT_ZERO <<
+						      IAB_DST_FACTOR_SHIFT));
+		intel_bb_out(ibb, _3DSTATE_DFLT_DIFFUSE_CMD);
+		intel_bb_out(ibb, 0);
+		intel_bb_out(ibb, _3DSTATE_DFLT_SPEC_CMD);
+		intel_bb_out(ibb, 0);
+		intel_bb_out(ibb, _3DSTATE_DFLT_Z_CMD);
+		intel_bb_out(ibb, 0);
+		intel_bb_out(ibb, _3DSTATE_COORD_SET_BINDINGS |
+			     CSB_TCB(0, 0) |
+			     CSB_TCB(1, 1) |
+			     CSB_TCB(2, 2) |
+			     CSB_TCB(3, 3) |
+			     CSB_TCB(4, 4) |
+			     CSB_TCB(5, 5) | CSB_TCB(6, 6) | CSB_TCB(7, 7));
+		intel_bb_out(ibb, _3DSTATE_RASTER_RULES_CMD |
+			     ENABLE_POINT_RASTER_RULE |
+			     OGL_POINT_RASTER_RULE |
+			     ENABLE_LINE_STRIP_PROVOKE_VRTX |
+			     ENABLE_TRI_FAN_PROVOKE_VRTX |
+			     LINE_STRIP_PROVOKE_VRTX(1) |
+			     TRI_FAN_PROVOKE_VRTX(2) | ENABLE_TEXKILL_3D_4D | TEXKILL_4D);
+		intel_bb_out(ibb, _3DSTATE_MODES_4_CMD |
+			     ENABLE_LOGIC_OP_FUNC | LOGIC_OP_FUNC(LOGICOP_COPY) |
+			     ENABLE_STENCIL_WRITE_MASK | STENCIL_WRITE_MASK(0xff) |
+			     ENABLE_STENCIL_TEST_MASK | STENCIL_TEST_MASK(0xff));
+		intel_bb_out(ibb, _3DSTATE_LOAD_STATE_IMMEDIATE_1 | I1_LOAD_S(3) | I1_LOAD_S(4) | I1_LOAD_S(5) | 2);
+		intel_bb_out(ibb, 0x00000000);	/* Disable texture coordinate wrap-shortest */
+		intel_bb_out(ibb, (1 << S4_POINT_WIDTH_SHIFT) |
+			     S4_LINE_WIDTH_ONE |
+			     S4_CULLMODE_NONE |
+			     S4_VFMT_XY);
+		intel_bb_out(ibb, 0x00000000);	/* Stencil. */
+		intel_bb_out(ibb, _3DSTATE_SCISSOR_ENABLE_CMD | DISABLE_SCISSOR_RECT);
+		intel_bb_out(ibb, _3DSTATE_SCISSOR_RECT_0_CMD);
+		intel_bb_out(ibb, 0);
+		intel_bb_out(ibb, 0);
+		intel_bb_out(ibb, _3DSTATE_DEPTH_SUBRECT_DISABLE);
+		intel_bb_out(ibb, _3DSTATE_LOAD_INDIRECT | 0);	/* disable indirect state */
+		intel_bb_out(ibb, 0);
+		intel_bb_out(ibb, _3DSTATE_STIPPLE);
+		intel_bb_out(ibb, 0x00000000);
+		intel_bb_out(ibb, _3DSTATE_BACKFACE_STENCIL_OPS | BFO_ENABLE_STENCIL_TWO_SIDE | 0);
 	}
 
 	/* samler state */
@@ -89,8 +94,8 @@ void gen3_render_copyfunc(struct intel_batchbuffer *batch,
 		uint32_t format_bits, tiling_bits = 0;
 
 		igt_assert_lte(src->surface[0].stride, 8192);
-		igt_assert_lte(igt_buf_width(src), 2048);
-		igt_assert_lte(igt_buf_height(src), 2048);
+		igt_assert_lte(intel_buf_width(src), 2048);
+		igt_assert_lte(intel_buf_height(src), 2048);
 
 		if (src->tiling != I915_TILING_NONE)
 			tiling_bits = MS3_TILED_SURFACE;
@@ -104,23 +109,25 @@ void gen3_render_copyfunc(struct intel_batchbuffer *batch,
 			default: igt_assert(0);
 		}
 
-		OUT_BATCH(_3DSTATE_MAP_STATE | (3 * TEX_COUNT));
-		OUT_BATCH((1 << TEX_COUNT) - 1);
-		OUT_RELOC(src->bo, I915_GEM_DOMAIN_SAMPLER, 0, 0);
-		OUT_BATCH(format_bits | tiling_bits |
-			  (igt_buf_height(src) - 1) << MS3_HEIGHT_SHIFT |
-			  (igt_buf_width(src) - 1) << MS3_WIDTH_SHIFT);
-		OUT_BATCH((src->surface[0].stride/4-1) << MS4_PITCH_SHIFT);
-
-		OUT_BATCH(_3DSTATE_SAMPLER_STATE | (3 * TEX_COUNT));
-		OUT_BATCH((1 << TEX_COUNT) - 1);
-		OUT_BATCH(MIPFILTER_NONE << SS2_MIP_FILTER_SHIFT |
-			  FILTER_NEAREST << SS2_MAG_FILTER_SHIFT |
-			  FILTER_NEAREST << SS2_MIN_FILTER_SHIFT);
-		OUT_BATCH(TEXCOORDMODE_WRAP << SS3_TCX_ADDR_MODE_SHIFT |
-			  TEXCOORDMODE_WRAP << SS3_TCY_ADDR_MODE_SHIFT |
-			  0 << SS3_TEXTUREMAP_INDEX_SHIFT);
-		OUT_BATCH(0x00000000);
+		intel_bb_out(ibb, _3DSTATE_MAP_STATE | (3 * TEX_COUNT));
+		intel_bb_out(ibb, (1 << TEX_COUNT) - 1);
+		intel_bb_emit_reloc(ibb, src->handle,
+				    I915_GEM_DOMAIN_SAMPLER, 0,
+				    0, src->addr.offset);
+		intel_bb_out(ibb, format_bits | tiling_bits |
+			     (intel_buf_height(src) - 1) << MS3_HEIGHT_SHIFT |
+			     (intel_buf_width(src) - 1) << MS3_WIDTH_SHIFT);
+		intel_bb_out(ibb, (src->surface[0].stride/4-1) << MS4_PITCH_SHIFT);
+
+		intel_bb_out(ibb, _3DSTATE_SAMPLER_STATE | (3 * TEX_COUNT));
+		intel_bb_out(ibb, (1 << TEX_COUNT) - 1);
+		intel_bb_out(ibb, MIPFILTER_NONE << SS2_MIP_FILTER_SHIFT |
+			     FILTER_NEAREST << SS2_MAG_FILTER_SHIFT |
+			     FILTER_NEAREST << SS2_MIN_FILTER_SHIFT);
+		intel_bb_out(ibb, TEXCOORDMODE_WRAP << SS3_TCX_ADDR_MODE_SHIFT |
+			     TEXCOORDMODE_WRAP << SS3_TCY_ADDR_MODE_SHIFT |
+			     0 << SS3_TEXTUREMAP_INDEX_SHIFT);
+		intel_bb_out(ibb, 0x00000000);
 	}
 
 	/* render target state */
@@ -129,8 +136,8 @@ void gen3_render_copyfunc(struct intel_batchbuffer *batch,
 		uint32_t format_bits;
 
 		igt_assert_lte(dst->surface[0].stride, 8192);
-		igt_assert_lte(igt_buf_width(dst), 2048);
-		igt_assert_lte(igt_buf_height(dst), 2048);
+		igt_assert_lte(intel_buf_width(dst), 2048);
+		igt_assert_lte(intel_buf_height(dst), 2048);
 
 		switch (dst->bpp) {
 			case 8: format_bits = COLR_BUF_8BIT; break;
@@ -144,81 +151,83 @@ void gen3_render_copyfunc(struct intel_batchbuffer *batch,
 		if (dst->tiling == I915_TILING_Y)
 			tiling_bits |= BUF_3D_TILE_WALK_Y;
 
-		OUT_BATCH(_3DSTATE_BUF_INFO_CMD);
-		OUT_BATCH(BUF_3D_ID_COLOR_BACK | tiling_bits |
-			  BUF_3D_PITCH(dst->surface[0].stride));
-		OUT_RELOC(dst->bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, 0);
+		intel_bb_out(ibb, _3DSTATE_BUF_INFO_CMD);
+		intel_bb_out(ibb, BUF_3D_ID_COLOR_BACK | tiling_bits |
+			     BUF_3D_PITCH(dst->surface[0].stride));
+		intel_bb_emit_reloc(ibb, dst->handle,
+				    I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
+				    0, dst->addr.offset);
 
-		OUT_BATCH(_3DSTATE_DST_BUF_VARS_CMD);
-		OUT_BATCH(format_bits |
-			  DSTORG_HORT_BIAS(0x8) |
-			  DSTORG_VERT_BIAS(0x8));
+		intel_bb_out(ibb, _3DSTATE_DST_BUF_VARS_CMD);
+		intel_bb_out(ibb, format_bits |
+			     DSTORG_HORT_BIAS(0x8) |
+			     DSTORG_VERT_BIAS(0x8));
 
 		/* draw rect is unconditional */
-		OUT_BATCH(_3DSTATE_DRAW_RECT_CMD);
-		OUT_BATCH(0x00000000);
-		OUT_BATCH(0x00000000);	/* ymin, xmin */
-		OUT_BATCH(DRAW_YMAX(igt_buf_height(dst) - 1) |
-			  DRAW_XMAX(igt_buf_width(dst) - 1));
+		intel_bb_out(ibb, _3DSTATE_DRAW_RECT_CMD);
+		intel_bb_out(ibb, 0x00000000);
+		intel_bb_out(ibb, 0x00000000);	/* ymin, xmin */
+		intel_bb_out(ibb, DRAW_YMAX(intel_buf_height(dst) - 1) |
+			     DRAW_XMAX(intel_buf_width(dst) - 1));
 		/* yorig, xorig (relate to color buffer?) */
-		OUT_BATCH(0x00000000);
+		intel_bb_out(ibb, 0x00000000);
 	}
 
 	/* texfmt */
 	{
-		OUT_BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 |
-			  I1_LOAD_S(1) | I1_LOAD_S(2) | I1_LOAD_S(6) | 2);
-		OUT_BATCH((4 << S1_VERTEX_WIDTH_SHIFT) |
-			  (4 << S1_VERTEX_PITCH_SHIFT));
-		OUT_BATCH(~S2_TEXCOORD_FMT(0, TEXCOORDFMT_NOT_PRESENT) | S2_TEXCOORD_FMT(0, TEXCOORDFMT_2D));
-		OUT_BATCH(S6_CBUF_BLEND_ENABLE | S6_COLOR_WRITE_ENABLE |
-			  BLENDFUNC_ADD << S6_CBUF_BLEND_FUNC_SHIFT |
-			  BLENDFACT_ONE << S6_CBUF_SRC_BLEND_FACT_SHIFT |
-			  BLENDFACT_ZERO << S6_CBUF_DST_BLEND_FACT_SHIFT);
+		intel_bb_out(ibb, _3DSTATE_LOAD_STATE_IMMEDIATE_1 |
+			     I1_LOAD_S(1) | I1_LOAD_S(2) | I1_LOAD_S(6) | 2);
+		intel_bb_out(ibb, (4 << S1_VERTEX_WIDTH_SHIFT) |
+			     (4 << S1_VERTEX_PITCH_SHIFT));
+		intel_bb_out(ibb, ~S2_TEXCOORD_FMT(0, TEXCOORDFMT_NOT_PRESENT) | S2_TEXCOORD_FMT(0, TEXCOORDFMT_2D));
+		intel_bb_out(ibb, S6_CBUF_BLEND_ENABLE | S6_COLOR_WRITE_ENABLE |
+			     BLENDFUNC_ADD << S6_CBUF_BLEND_FUNC_SHIFT |
+			     BLENDFACT_ONE << S6_CBUF_SRC_BLEND_FACT_SHIFT |
+			     BLENDFACT_ZERO << S6_CBUF_DST_BLEND_FACT_SHIFT);
 	}
 
 	/* frage shader */
 	{
-		OUT_BATCH(_3DSTATE_PIXEL_SHADER_PROGRAM | (1 + 3*3 - 2));
+		intel_bb_out(ibb, _3DSTATE_PIXEL_SHADER_PROGRAM | (1 + 3*3 - 2));
 		/* decl FS_T0 */
-		OUT_BATCH(D0_DCL |
-			  REG_TYPE(FS_T0) << D0_TYPE_SHIFT |
-			  REG_NR(FS_T0) << D0_NR_SHIFT |
-			  ((REG_TYPE(FS_T0) != REG_TYPE_S) ? D0_CHANNEL_ALL : 0));
-		OUT_BATCH(0);
-		OUT_BATCH(0);
+		intel_bb_out(ibb, D0_DCL |
+			     REG_TYPE(FS_T0) << D0_TYPE_SHIFT |
+			     REG_NR(FS_T0) << D0_NR_SHIFT |
+			     ((REG_TYPE(FS_T0) != REG_TYPE_S) ? D0_CHANNEL_ALL : 0));
+		intel_bb_out(ibb, 0);
+		intel_bb_out(ibb, 0);
 		/* decl FS_S0 */
-		OUT_BATCH(D0_DCL |
-			  (REG_TYPE(FS_S0) << D0_TYPE_SHIFT) |
-			  (REG_NR(FS_S0) << D0_NR_SHIFT) |
-			  ((REG_TYPE(FS_S0) != REG_TYPE_S) ? D0_CHANNEL_ALL : 0));
-		OUT_BATCH(0);
-		OUT_BATCH(0);
+		intel_bb_out(ibb, D0_DCL |
+			     (REG_TYPE(FS_S0) << D0_TYPE_SHIFT) |
+			     (REG_NR(FS_S0) << D0_NR_SHIFT) |
+			     ((REG_TYPE(FS_S0) != REG_TYPE_S) ? D0_CHANNEL_ALL : 0));
+		intel_bb_out(ibb, 0);
+		intel_bb_out(ibb, 0);
 		/* texld(FS_OC, FS_S0, FS_T0 */
-		OUT_BATCH(T0_TEXLD |
-			  (REG_TYPE(FS_OC) << T0_DEST_TYPE_SHIFT) |
-			  (REG_NR(FS_OC) << T0_DEST_NR_SHIFT) |
-			  (REG_NR(FS_S0) << T0_SAMPLER_NR_SHIFT));
-		OUT_BATCH((REG_TYPE(FS_T0) << T1_ADDRESS_REG_TYPE_SHIFT) |
-			  (REG_NR(FS_T0) << T1_ADDRESS_REG_NR_SHIFT));
-		OUT_BATCH(0);
+		intel_bb_out(ibb, T0_TEXLD |
+			     (REG_TYPE(FS_OC) << T0_DEST_TYPE_SHIFT) |
+			     (REG_NR(FS_OC) << T0_DEST_NR_SHIFT) |
+			     (REG_NR(FS_S0) << T0_SAMPLER_NR_SHIFT));
+		intel_bb_out(ibb, (REG_TYPE(FS_T0) << T1_ADDRESS_REG_TYPE_SHIFT) |
+			     (REG_NR(FS_T0) << T1_ADDRESS_REG_NR_SHIFT));
+		intel_bb_out(ibb, 0);
 	}
 
-	OUT_BATCH(PRIM3D_RECTLIST | (3*4 - 1));
-	emit_vertex(batch, dst_x + width);
-	emit_vertex(batch, dst_y + height);
-	emit_vertex(batch, src_x + width);
-	emit_vertex(batch, src_y + height);
+	intel_bb_out(ibb, PRIM3D_RECTLIST | (3*4 - 1));
+	emit_vertex(ibb, dst_x + width);
+	emit_vertex(ibb, dst_y + height);
+	emit_vertex(ibb, src_x + width);
+	emit_vertex(ibb, src_y + height);
 
-	emit_vertex(batch, dst_x);
-	emit_vertex(batch, dst_y + height);
-	emit_vertex(batch, src_x);
-	emit_vertex(batch, src_y + height);
+	emit_vertex(ibb, dst_x);
+	emit_vertex(ibb, dst_y + height);
+	emit_vertex(ibb, src_x);
+	emit_vertex(ibb, src_y + height);
 
-	emit_vertex(batch, dst_x);
-	emit_vertex(batch, dst_y);
-	emit_vertex(batch, src_x);
-	emit_vertex(batch, src_y);
+	emit_vertex(ibb, dst_x);
+	emit_vertex(ibb, dst_y);
+	emit_vertex(ibb, src_x);
+	emit_vertex(ibb, src_y);
 
-	intel_batchbuffer_flush(batch);
+	intel_bb_flush_blit_with_context(ibb, ctx);
 }
diff --git a/lib/veboxcopy.h b/lib/veboxcopy.h
index 949d83bf..925b8f52 100644
--- a/lib/veboxcopy.h
+++ b/lib/veboxcopy.h
@@ -1,9 +1,9 @@
 #ifndef __VEBOXCOPY_H__
 #define __VEBOXCOPY_H__
 
-void gen12_vebox_copyfunc(struct intel_batchbuffer *batch,
-			  const struct igt_buf *src,
-			  unsigned width, unsigned height,
-			  const struct igt_buf *dst);
+void gen12_vebox_copyfunc(struct intel_bb *ibb,
+			  struct intel_buf *src,
+			  unsigned int width, unsigned int height,
+			  struct intel_buf *dst);
 
 #endif
diff --git a/lib/veboxcopy_gen12.c b/lib/veboxcopy_gen12.c
index 237c43f2..a44e2bff 100644
--- a/lib/veboxcopy_gen12.c
+++ b/lib/veboxcopy_gen12.c
@@ -144,7 +144,7 @@ static bool format_is_interleaved_yuv(int format)
 	return false;
 }
 
-static void emit_surface_state_cmd(struct intel_batchbuffer *batch,
+static void emit_surface_state_cmd(struct intel_bb *ibb,
 				   int surface_id,
 				   int width, int height, int bpp,
 				   int pitch, uint32_t tiling, int format,
@@ -152,7 +152,7 @@ static void emit_surface_state_cmd(struct intel_batchbuffer *batch,
 {
 	struct vebox_surface_state *ss;
 
-	ss = intel_batchbuffer_subdata_alloc(batch, sizeof(*ss), 4);
+	ss = intel_bb_ptr_align(ibb, 4);
 
 	ss->ss0.cmd_type = 3;
 	ss->ss0.media_cmd_pipeline = 2;
@@ -175,21 +175,19 @@ static void emit_surface_state_cmd(struct intel_batchbuffer *batch,
 	ss->ss4.u_y_offset = uv_offset / pitch;
 
 	ss->ss7.derived_surface_pitch = pitch - 1;
+
+	intel_bb_ptr_add(ibb, sizeof(*ss));
 }
 
-static void emit_tiling_convert_cmd(struct intel_batchbuffer *batch,
-				    drm_intel_bo *input_bo,
-				    uint32_t input_tiling,
-				    uint32_t input_compression,
-				    drm_intel_bo *output_bo,
-				    uint32_t output_tiling,
-				    uint32_t output_compression)
+static void emit_tiling_convert_cmd(struct intel_bb *ibb,
+				    struct intel_buf *src,
+				    struct intel_buf *dst)
 {
-	uint32_t reloc_delta;
+	uint32_t reloc_delta, tc_offset, offset;
 	struct vebox_tiling_convert *tc;
-	int ret;
 
-	tc = intel_batchbuffer_subdata_alloc(batch, sizeof(*tc), 8);
+	tc = intel_bb_ptr_align(ibb, 8);
+	tc_offset = intel_bb_offset(ibb);
 
 	tc->tc0.cmd_type = 3;
 	tc->tc0.pipeline = 2;
@@ -198,71 +196,70 @@ static void emit_tiling_convert_cmd(struct intel_batchbuffer *batch,
 
 	tc->tc0.dw_count = 3;
 
-	if (input_compression != I915_COMPRESSION_NONE) {
+	if (src->compression != I915_COMPRESSION_NONE) {
 		tc->tc1_2.input_memory_compression_enable = 1;
 		tc->tc1_2.input_compression_type =
-			input_compression == I915_COMPRESSION_RENDER;
+			src->compression == I915_COMPRESSION_RENDER;
 	}
-	tc->tc1_2.input_tiled_resource_mode = input_tiling == I915_TILING_Yf;
+	tc->tc1_2.input_tiled_resource_mode = src->tiling == I915_TILING_Yf;
 	reloc_delta = tc->tc1_2_l;
 
-	igt_assert(input_bo->offset64 == ALIGN(input_bo->offset64, 0x1000));
-	tc->tc1_2.input_address = input_bo->offset64 >> 12;
+	igt_assert(src->addr.offset == ALIGN(src->addr.offset, 0x1000));
+	tc->tc1_2.input_address = src->addr.offset >> 12;
 	igt_assert(reloc_delta <= INT32_MAX);
-	ret = drm_intel_bo_emit_reloc(batch->bo,
-				      intel_batchbuffer_subdata_offset(batch, tc) +
-					offsetof(typeof(*tc), tc1_2),
-				      input_bo, reloc_delta,
-				      0, 0);
-	igt_assert(ret == 0);
-
-	if (output_compression != I915_COMPRESSION_NONE) {
+
+	offset = tc_offset + offsetof(typeof(*tc), tc1_2);
+	intel_bb_offset_reloc_with_delta(ibb, src->handle, 0, 0,
+					 reloc_delta, offset,
+					 src->addr.offset);
+
+	if (dst->compression != I915_COMPRESSION_NONE) {
 		tc->tc3_4.output_memory_compression_enable = 1;
 		tc->tc3_4.output_compression_type =
-			output_compression == I915_COMPRESSION_RENDER;
+			dst->compression == I915_COMPRESSION_RENDER;
 	}
-	tc->tc3_4.output_tiled_resource_mode = output_tiling == I915_TILING_Yf;
+	tc->tc3_4.output_tiled_resource_mode = dst->tiling == I915_TILING_Yf;
 	reloc_delta = tc->tc3_4_l;
 
-	igt_assert(output_bo->offset64 == ALIGN(output_bo->offset64, 0x1000));
-	tc->tc3_4.output_address = output_bo->offset64 >> 12;
+	igt_assert(dst->addr.offset == ALIGN(dst->addr.offset, 0x1000));
+	tc->tc3_4.output_address = dst->addr.offset >> 12;
 	igt_assert(reloc_delta <= INT32_MAX);
-	ret = drm_intel_bo_emit_reloc(batch->bo,
-				      intel_batchbuffer_subdata_offset(batch, tc) +
-					offsetof(typeof(*tc), tc3_4),
-				      output_bo, reloc_delta,
-				      0, I915_GEM_DOMAIN_RENDER);
-	igt_assert(ret == 0);
 
+	offset = tc_offset + offsetof(typeof(*tc), tc3_4);
+	intel_bb_offset_reloc_with_delta(ibb, dst->handle,
+					 0, I915_GEM_DOMAIN_RENDER,
+					 reloc_delta, offset,
+					 dst->addr.offset);
+
+	intel_bb_ptr_add(ibb, sizeof(*tc));
 }
 
 /* Borrowing the idea from the rendercopy state setup. */
 #define BATCH_STATE_SPLIT 2048
 
-void gen12_vebox_copyfunc(struct intel_batchbuffer *batch,
-			  const struct igt_buf *src,
-			  unsigned width, unsigned height,
-			  const struct igt_buf *dst)
+void gen12_vebox_copyfunc(struct intel_bb *ibb,
+			  struct intel_buf *src,
+			  unsigned int width, unsigned int height,
+			  struct intel_buf *dst)
 {
 	struct aux_pgtable_info aux_pgtable_info = { };
 	uint32_t aux_pgtable_state;
 	int format;
 
-	intel_batchbuffer_flush_on_ring(batch, I915_EXEC_VEBOX);
+	igt_assert(src->bpp == dst->bpp);
 
-	intel_batchbuffer_align(batch, 8);
+	intel_bb_flush(ibb, ibb->ctx, I915_EXEC_VEBOX);
 
-	batch->ptr = &batch->buffer[BATCH_STATE_SPLIT];
+	intel_bb_add_intel_buf(ibb, dst, true);
+	intel_bb_add_intel_buf(ibb, src, false);
 
-	gen12_aux_pgtable_init(&aux_pgtable_info, batch->bufmgr, src, dst);
+	intel_bb_ptr_set(ibb, BATCH_STATE_SPLIT);
+	gen12_aux_pgtable_init(&aux_pgtable_info, ibb, src, dst);
+	aux_pgtable_state = gen12_create_aux_pgtable_state(ibb,
+							   aux_pgtable_info.pgtable_buf);
 
-	aux_pgtable_state = gen12_create_aux_pgtable_state(batch,
-							   aux_pgtable_info.pgtable_bo);
-
-	assert(batch->ptr < &batch->buffer[4095]);
-	batch->ptr = batch->buffer;
-
-	gen12_emit_aux_pgtable_state(batch, aux_pgtable_state, false);
+	intel_bb_ptr_set(ibb, 0);
+	gen12_emit_aux_pgtable_state(ibb, aux_pgtable_state, false);
 
 	/* The tiling convert command can't convert formats. */
 	igt_assert_eq(src->format_is_yuv, dst->format_is_yuv);
@@ -292,24 +289,26 @@ void gen12_vebox_copyfunc(struct intel_batchbuffer *batch,
 
 	igt_assert(!src->format_is_yuv_semiplanar ||
 		   (src->surface[1].offset && dst->surface[1].offset));
-	emit_surface_state_cmd(batch, VEBOX_SURFACE_INPUT,
+	emit_surface_state_cmd(ibb, VEBOX_SURFACE_INPUT,
 			       width, height, src->bpp,
 			       src->surface[0].stride,
 			       src->tiling, format, src->surface[1].offset);
 
-	emit_surface_state_cmd(batch, VEBOX_SURFACE_OUTPUT,
+	emit_surface_state_cmd(ibb, VEBOX_SURFACE_OUTPUT,
 			       width, height, dst->bpp,
 			       dst->surface[0].stride,
 			       dst->tiling, format, dst->surface[1].offset);
 
-	emit_tiling_convert_cmd(batch,
-				src->bo, src->tiling, src->compression,
-				dst->bo, dst->tiling, dst->compression);
+	emit_tiling_convert_cmd(ibb, src, dst);
+
+	intel_bb_out(ibb, MI_BATCH_BUFFER_END);
+	intel_bb_ptr_align(ibb, 8);
 
-	OUT_BATCH(MI_BATCH_BUFFER_END);
+	intel_bb_exec_with_context(ibb, intel_bb_offset(ibb), 0,
+				   I915_EXEC_VEBOX | I915_EXEC_NO_RELOC,
+				   false);
 
-	intel_batchbuffer_flush_on_ring(batch, I915_EXEC_VEBOX);
+	intel_bb_reset(ibb, false);
 
-	gen12_aux_pgtable_cleanup(&aux_pgtable_info);
-	intel_batchbuffer_reset(batch);
+	gen12_aux_pgtable_cleanup(ibb, &aux_pgtable_info);
 }
-- 
2.26.0



More information about the igt-dev mailing list