[igt-dev] [PATCH i-g-t 1/4] lib: Move common gpgpu/media functions to gpu_fill library
Daniele Ceraolo Spurio
daniele.ceraolospurio at intel.com
Thu Apr 5 18:48:46 UTC 2018
On 05/04/18 11:25, Daniele Ceraolo Spurio wrote:
>
>
> On 05/04/18 06:53, Katarzyna Dec wrote:
>> The gpgpu_fill and media_fill libraries are very similar and many
>> functions can be shared. I have created a gpu_fill library with all
>> the functions needed to implement gpgpu_fill and media_fill tests
>> for all Gens. Duplicates, e.g. functions where only the name was
>> changed, were removed, keeping the earliest common implementation.
>>
>> v2: Changed the code layout. genX_fill_media_kernel was identical to
>> genX_fill_gpgpu_kernel, so the function was unified into
>> gen7_fill_kernel. There were two very similar
>> gen8_emit_state_base_address functions for media and gpgpu, where the
>> gpgpu one was configured as if it were using indirect state (while we
>> are using CURBE). I have checked that the media fill version works
>> fine in the gpgpu test on Gen8 and unified them.
>>
>> Signed-off-by: Katarzyna Dec <katarzyna.dec at intel.com>
>> Cc: Lukasz Kalamarz <lukasz.kalamarz at intel.com>
>> Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>
>> ---
>> lib/Makefile.sources | 2 +
>> lib/gpgpu_fill.c | 567 +------------------------------------
>> lib/gpu_fill.c | 729 ++++++++++++++++++++++++++++++++++++++++++++++++
>> lib/gpu_fill.h | 141 ++++++++++
>> lib/media_fill_gen7.c | 271 +-----------------
>> lib/media_fill_gen8.c | 297 +-------------------
>> lib/media_fill_gen8lp.c | 289 +------------------
>> lib/media_fill_gen9.c | 304 +-------------------
>> lib/meson.build | 1 +
>> 9 files changed, 890 insertions(+), 1711 deletions(-)
>> create mode 100644 lib/gpu_fill.c
>> create mode 100644 lib/gpu_fill.h
>>
>> diff --git a/lib/Makefile.sources b/lib/Makefile.sources
>> index 3d37ef1d..690a1d35 100644
>> --- a/lib/Makefile.sources
>> +++ b/lib/Makefile.sources
>> @@ -55,6 +55,8 @@ lib_source_list = \
>> intel_reg.h \
>> ioctl_wrappers.c \
>> ioctl_wrappers.h \
>> + gpu_fill.h \
>> + gpu_fill.c \
>> media_fill.h \
>> media_fill_gen7.c \
>> media_fill_gen8.c \
>> diff --git a/lib/gpgpu_fill.c b/lib/gpgpu_fill.c
>> index 4d98643d..68cbac5e 100644
>> --- a/lib/gpgpu_fill.c
>> +++ b/lib/gpgpu_fill.c
>> @@ -34,6 +34,7 @@
>> #include "gen7_media.h"
>> #include "gen8_media.h"
>> #include "gpgpu_fill.h"
>> +#include "gpu_fill.h"
>> /* shaders/gpgpu/gpgpu_fill.gxa */
>> static const uint32_t gen7_gpgpu_kernel[][4] = {
>> @@ -75,572 +76,6 @@ static const uint32_t gen9_gpgpu_kernel[][4] = {
>> { 0x07800031, 0x20000a40, 0x06000e00, 0x82000010 },
>> };
>> -static uint32_t
>> -batch_used(struct intel_batchbuffer *batch)
>> -{
>> - return batch->ptr - batch->buffer;
>> -}
>> -
>
> Hi,
>
> As I mentioned on the previous review, I think the batch_* functions
> shouldn't go into the gpu_fill.c file and should instead go with the
> other intel_batchbuffer methods. I don't think moving them to gpu_fill.c
> as an interim solution works, because we'll have to do another patch to
> move everything to intel_batchbuffer.c, including the other copies of
> these functions in the rendercopy files. IMO it makes more sense to move
> the code directly to its final destination.
> If Lukasz is going to look at that then please sync with him so that his
> patch goes before this one (either as part of this series or standalone).
>
> Thanks,
> Daniele
>
I've realized I was a bit unclear here, considering that you already
said in the cover letter that your plan is to move the batch_*
functions as a second step. What I wanted to express above is that
moving the batch_* functions as the first step is cleaner and results
in less work overall (things only get moved once). This isn't a
blocking suggestion.
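
Just for illustration, a minimal sketch of what I have in mind, assuming
the helpers keep the names and signatures they have in this patch (the
exact placement in the header is only a guess on my side):

    /* intel_batchbuffer.h, next to the other intel_batchbuffer helpers */
    uint32_t batch_used(struct intel_batchbuffer *batch);
    uint32_t batch_align(struct intel_batchbuffer *batch, uint32_t align);
    void *batch_alloc(struct intel_batchbuffer *batch, uint32_t size,
                      uint32_t align);
    uint32_t batch_offset(struct intel_batchbuffer *batch, void *ptr);
    uint32_t batch_copy(struct intel_batchbuffer *batch, const void *ptr,
                        uint32_t size, uint32_t align);

    /* intel_batchbuffer.c, definitions moved verbatim from the fill
     * libraries, e.g.: */
    uint32_t batch_align(struct intel_batchbuffer *batch, uint32_t align)
    {
        uint32_t offset = batch_used(batch);

        offset = ALIGN(offset, align);
        batch->ptr = batch->buffer + offset;
        return offset;
    }

gpu_fill.c (and later the rendercopy files) would then just include
intel_batchbuffer.h instead of carrying their own copies of these
helpers.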
Daniele
>> -static uint32_t
>> -batch_align(struct intel_batchbuffer *batch, uint32_t align)
>> -{
>> - uint32_t offset = batch_used(batch);
>> - offset = ALIGN(offset, align);
>> - batch->ptr = batch->buffer + offset;
>> - return offset;
>> -}
>> -
>> -static void *
>> -batch_alloc(struct intel_batchbuffer *batch, uint32_t size, uint32_t
>> align)
>> -{
>> - uint32_t offset = batch_align(batch, align);
>> - batch->ptr += size;
>> - return memset(batch->buffer + offset, 0, size);
>> -}
>> -
>> -static uint32_t
>> -batch_offset(struct intel_batchbuffer *batch, void *ptr)
>> -{
>> - return (uint8_t *)ptr - batch->buffer;
>> -}
>> -
>> -static uint32_t
>> -batch_copy(struct intel_batchbuffer *batch, const void *ptr, uint32_t
>> size,
>> - uint32_t align)
>> -{
>> - return batch_offset(batch, memcpy(batch_alloc(batch, size,
>> align), ptr, size));
>> -}
>> -
>> -static void
>> -gen7_render_flush(struct intel_batchbuffer *batch, uint32_t batch_end)
>> -{
>> - int ret;
>> -
>> - ret = drm_intel_bo_subdata(batch->bo, 0, 4096, batch->buffer);
>> - if (ret == 0)
>> - ret = drm_intel_bo_mrb_exec(batch->bo, batch_end,
>> - NULL, 0, 0, 0);
>> - igt_assert(ret == 0);
>> -}
>> -
>> -static uint32_t
>> -gen7_fill_curbe_buffer_data(struct intel_batchbuffer *batch, uint8_t
>> color)
>> -{
>> - uint8_t *curbe_buffer;
>> - uint32_t offset;
>> -
>> - curbe_buffer = batch_alloc(batch, sizeof(uint32_t) * 8, 64);
>> - offset = batch_offset(batch, curbe_buffer);
>> - *curbe_buffer = color;
>> -
>> - return offset;
>> -}
>> -
>> -static uint32_t
>> -gen7_fill_surface_state(struct intel_batchbuffer *batch,
>> - struct igt_buf *buf,
>> - uint32_t format,
>> - int is_dst)
>> -{
>> - struct gen7_surface_state *ss;
>> - uint32_t write_domain, read_domain, offset;
>> - int ret;
>> -
>> - if (is_dst) {
>> - write_domain = read_domain = I915_GEM_DOMAIN_RENDER;
>> - } else {
>> - write_domain = 0;
>> - read_domain = I915_GEM_DOMAIN_SAMPLER;
>> - }
>> -
>> - ss = batch_alloc(batch, sizeof(*ss), 64);
>> - offset = batch_offset(batch, ss);
>> -
>> - ss->ss0.surface_type = GEN7_SURFACE_2D;
>> - ss->ss0.surface_format = format;
>> - ss->ss0.render_cache_read_write = 1;
>> -
>> - if (buf->tiling == I915_TILING_X)
>> - ss->ss0.tiled_mode = 2;
>> - else if (buf->tiling == I915_TILING_Y)
>> - ss->ss0.tiled_mode = 3;
>> -
>> - ss->ss1.base_addr = buf->bo->offset;
>> - ret = drm_intel_bo_emit_reloc(batch->bo,
>> - batch_offset(batch, ss) + 4,
>> - buf->bo, 0,
>> - read_domain, write_domain);
>> - igt_assert(ret == 0);
>> -
>> - ss->ss2.height = igt_buf_height(buf) - 1;
>> - ss->ss2.width = igt_buf_width(buf) - 1;
>> -
>> - ss->ss3.pitch = buf->stride - 1;
>> -
>> - ss->ss7.shader_chanel_select_r = 4;
>> - ss->ss7.shader_chanel_select_g = 5;
>> - ss->ss7.shader_chanel_select_b = 6;
>> - ss->ss7.shader_chanel_select_a = 7;
>> -
>> - return offset;
>> -}
>> -
>> -static uint32_t
>> -gen8_fill_surface_state(struct intel_batchbuffer *batch,
>> - struct igt_buf *buf,
>> - uint32_t format,
>> - int is_dst)
>> -{
>> - struct gen8_surface_state *ss;
>> - uint32_t write_domain, read_domain, offset;
>> - int ret;
>> -
>> - if (is_dst) {
>> - write_domain = read_domain = I915_GEM_DOMAIN_RENDER;
>> - } else {
>> - write_domain = 0;
>> - read_domain = I915_GEM_DOMAIN_SAMPLER;
>> - }
>> -
>> - ss = batch_alloc(batch, sizeof(*ss), 64);
>> - offset = batch_offset(batch, ss);
>> -
>> - ss->ss0.surface_type = GEN8_SURFACE_2D;
>> - ss->ss0.surface_format = format;
>> - ss->ss0.render_cache_read_write = 1;
>> - ss->ss0.vertical_alignment = 1; /* align 4 */
>> - ss->ss0.horizontal_alignment = 1; /* align 4 */
>> -
>> - if (buf->tiling == I915_TILING_X)
>> - ss->ss0.tiled_mode = 2;
>> - else if (buf->tiling == I915_TILING_Y)
>> - ss->ss0.tiled_mode = 3;
>> -
>> - ss->ss8.base_addr = buf->bo->offset;
>> -
>> - ret = drm_intel_bo_emit_reloc(batch->bo,
>> - batch_offset(batch, ss) + 8 * 4,
>> - buf->bo, 0,
>> - read_domain, write_domain);
>> - igt_assert_eq(ret, 0);
>> -
>> - ss->ss2.height = igt_buf_height(buf) - 1;
>> - ss->ss2.width = igt_buf_width(buf) - 1;
>> - ss->ss3.pitch = buf->stride - 1;
>> -
>> - ss->ss7.shader_chanel_select_r = 4;
>> - ss->ss7.shader_chanel_select_g = 5;
>> - ss->ss7.shader_chanel_select_b = 6;
>> - ss->ss7.shader_chanel_select_a = 7;
>> -
>> - return offset;
>> -
>> -}
>> -
>> -static uint32_t
>> -gen7_fill_binding_table(struct intel_batchbuffer *batch,
>> - struct igt_buf *dst)
>> -{
>> - uint32_t *binding_table, offset;
>> -
>> - binding_table = batch_alloc(batch, 32, 64);
>> - offset = batch_offset(batch, binding_table);
>> -
>> - binding_table[0] = gen7_fill_surface_state(batch, dst,
>> GEN7_SURFACEFORMAT_R8_UNORM, 1);
>> -
>> - return offset;
>> -}
>> -
>> -static uint32_t
>> -gen8_fill_binding_table(struct intel_batchbuffer *batch,
>> - struct igt_buf *dst)
>> -{
>> - uint32_t *binding_table, offset;
>> -
>> - binding_table = batch_alloc(batch, 32, 64);
>> - offset = batch_offset(batch, binding_table);
>> -
>> - binding_table[0] = gen8_fill_surface_state(batch, dst,
>> GEN8_SURFACEFORMAT_R8_UNORM, 1);
>> -
>> - return offset;
>> -}
>> -
>> -static uint32_t
>> -gen7_fill_gpgpu_kernel(struct intel_batchbuffer *batch,
>> - const uint32_t kernel[][4],
>> - size_t size)
>> -{
>> - uint32_t offset;
>> -
>> - offset = batch_copy(batch, kernel, size, 64);
>> -
>> - return offset;
>> -}
>> -
>> -static uint32_t
>> -gen7_fill_interface_descriptor(struct intel_batchbuffer *batch,
>> struct igt_buf *dst,
>> - const uint32_t kernel[][4], size_t size)
>> -{
>> - struct gen7_interface_descriptor_data *idd;
>> - uint32_t offset;
>> - uint32_t binding_table_offset, kernel_offset;
>> -
>> - binding_table_offset = gen7_fill_binding_table(batch, dst);
>> - kernel_offset = gen7_fill_gpgpu_kernel(batch, kernel, size);
>> -
>> - idd = batch_alloc(batch, sizeof(*idd), 64);
>> - offset = batch_offset(batch, idd);
>> -
>> - idd->desc0.kernel_start_pointer = (kernel_offset >> 6);
>> -
>> - idd->desc1.single_program_flow = 1;
>> - idd->desc1.floating_point_mode = GEN7_FLOATING_POINT_IEEE_754;
>> -
>> - idd->desc2.sampler_count = 0; /* 0 samplers used */
>> - idd->desc2.sampler_state_pointer = 0;
>> -
>> - idd->desc3.binding_table_entry_count = 0;
>> - idd->desc3.binding_table_pointer = (binding_table_offset >> 5);
>> -
>> - idd->desc4.constant_urb_entry_read_offset = 0;
>> - idd->desc4.constant_urb_entry_read_length = 1; /* grf 1 */
>> -
>> - return offset;
>> -}
>> -
>> -static uint32_t
>> -gen8_fill_interface_descriptor(struct intel_batchbuffer *batch,
>> struct igt_buf *dst,
>> - const uint32_t kernel[][4], size_t size)
>> -{
>> - struct gen8_interface_descriptor_data *idd;
>> - uint32_t offset;
>> - uint32_t binding_table_offset, kernel_offset;
>> -
>> - binding_table_offset = gen8_fill_binding_table(batch, dst);
>> - kernel_offset = gen7_fill_gpgpu_kernel(batch, kernel, size);
>> -
>> - idd = batch_alloc(batch, sizeof(*idd), 64);
>> - offset = batch_offset(batch, idd);
>> -
>> - idd->desc0.kernel_start_pointer = (kernel_offset >> 6);
>> -
>> - idd->desc2.single_program_flow = 1;
>> - idd->desc2.floating_point_mode = GEN8_FLOATING_POINT_IEEE_754;
>> -
>> - idd->desc3.sampler_count = 0; /* 0 samplers used */
>> - idd->desc3.sampler_state_pointer = 0;
>> -
>> - idd->desc4.binding_table_entry_count = 0;
>> - idd->desc4.binding_table_pointer = (binding_table_offset >> 5);
>> -
>> - idd->desc5.constant_urb_entry_read_offset = 0;
>> - idd->desc5.constant_urb_entry_read_length = 1; /* grf 1 */
>> -
>> - return offset;
>> -}
>> -
>> -static void
>> -gen7_emit_state_base_address(struct intel_batchbuffer *batch)
>> -{
>> - OUT_BATCH(GEN7_STATE_BASE_ADDRESS | (10 - 2));
>> -
>> - /* general */
>> - OUT_BATCH(0);
>> -
>> - /* surface */
>> - OUT_RELOC(batch->bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
>> BASE_ADDRESS_MODIFY);
>> -
>> - /* dynamic */
>> - OUT_RELOC(batch->bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
>> BASE_ADDRESS_MODIFY);
>> -
>> - /* indirect */
>> - OUT_BATCH(0);
>> -
>> - /* instruction */
>> - OUT_RELOC(batch->bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
>> BASE_ADDRESS_MODIFY);
>> -
>> - /* general/dynamic/indirect/instruction access Bound */
>> - OUT_BATCH(0);
>> - OUT_BATCH(0 | BASE_ADDRESS_MODIFY);
>> - OUT_BATCH(0);
>> - OUT_BATCH(0 | BASE_ADDRESS_MODIFY);
>> -}
>> -
>> -static void
>> -gen8_emit_state_base_address(struct intel_batchbuffer *batch)
>> -{
>> - OUT_BATCH(GEN8_STATE_BASE_ADDRESS | (16 - 2));
>> -
>> - /* general */
>> - OUT_BATCH(0 | (0x78 << 4) | (0 << 1) | BASE_ADDRESS_MODIFY);
>> - OUT_BATCH(0);
>> -
>> - /* stateless data port */
>> - OUT_BATCH(0 | BASE_ADDRESS_MODIFY);
>> -
>> - /* surface */
>> - OUT_RELOC(batch->bo, I915_GEM_DOMAIN_SAMPLER, 0,
>> BASE_ADDRESS_MODIFY);
>> -
>> - /* dynamic */
>> - OUT_RELOC(batch->bo, I915_GEM_DOMAIN_RENDER |
>> I915_GEM_DOMAIN_INSTRUCTION,
>> - 0, BASE_ADDRESS_MODIFY);
>> -
>> - /* indirect */
>> - OUT_BATCH(0);
>> - OUT_BATCH(0 );
>> -
>> - /* instruction */
>> - OUT_RELOC(batch->bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
>> BASE_ADDRESS_MODIFY);
>> -
>> - /* general state buffer size */
>> - OUT_BATCH(0xfffff000 | 1);
>> - /* dynamic state buffer size */
>> - OUT_BATCH(1 << 12 | 1);
>> - /* indirect object buffer size */
>> - OUT_BATCH(0xfffff000 | 1);
>> - /* intruction buffer size, must set modify enable bit, otherwise
>> it may result in GPU hang */
>> - OUT_BATCH(1 << 12 | 1);
>> -}
>> -
>> -static void
>> -gen9_emit_state_base_address(struct intel_batchbuffer *batch)
>> -{
>> - OUT_BATCH(GEN8_STATE_BASE_ADDRESS | (19 - 2));
>> -
>> - /* general */
>> - OUT_BATCH(0 | BASE_ADDRESS_MODIFY);
>> - OUT_BATCH(0);
>> -
>> - /* stateless data port */
>> - OUT_BATCH(0 | BASE_ADDRESS_MODIFY);
>> -
>> - /* surface */
>> - OUT_RELOC(batch->bo, I915_GEM_DOMAIN_SAMPLER, 0,
>> BASE_ADDRESS_MODIFY);
>> -
>> - /* dynamic */
>> - OUT_RELOC(batch->bo, I915_GEM_DOMAIN_RENDER |
>> I915_GEM_DOMAIN_INSTRUCTION,
>> - 0, BASE_ADDRESS_MODIFY);
>> -
>> - /* indirect */
>> - OUT_BATCH(0);
>> - OUT_BATCH(0);
>> -
>> - /* instruction */
>> - OUT_RELOC(batch->bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
>> BASE_ADDRESS_MODIFY);
>> -
>> - /* general state buffer size */
>> - OUT_BATCH(0xfffff000 | 1);
>> - /* dynamic state buffer size */
>> - OUT_BATCH(1 << 12 | 1);
>> - /* indirect object buffer size */
>> - OUT_BATCH(0xfffff000 | 1);
>> - /* intruction buffer size, must set modify enable bit, otherwise
>> it may result in GPU hang */
>> - OUT_BATCH(1 << 12 | 1);
>> -
>> - /* Bindless surface state base address */
>> - OUT_BATCH(0 | BASE_ADDRESS_MODIFY);
>> - OUT_BATCH(0);
>> - OUT_BATCH(0xfffff000);
>> -}
>> -
>> -static void
>> -gen7_emit_vfe_state_gpgpu(struct intel_batchbuffer *batch)
>> -{
>> - OUT_BATCH(GEN7_MEDIA_VFE_STATE | (8 - 2));
>> -
>> - /* scratch buffer */
>> - OUT_BATCH(0);
>> -
>> - /* number of threads & urb entries */
>> - OUT_BATCH(1 << 16 | /* max num of threads */
>> - 0 << 8 | /* num of URB entry */
>> - 1 << 2); /* GPGPU mode */
>> -
>> - OUT_BATCH(0);
>> -
>> - /* urb entry size & curbe size */
>> - OUT_BATCH(0 << 16 | /* URB entry size in 256 bits unit */
>> - 1); /* CURBE entry size in 256 bits unit */
>> -
>> - /* scoreboard */
>> - OUT_BATCH(0);
>> - OUT_BATCH(0);
>> - OUT_BATCH(0);
>> -}
>> -
>> -static void
>> -gen8_emit_vfe_state_gpgpu(struct intel_batchbuffer *batch)
>> -{
>> - OUT_BATCH(GEN8_MEDIA_VFE_STATE | (9 - 2));
>> -
>> - /* scratch buffer */
>> - OUT_BATCH(0);
>> - OUT_BATCH(0);
>> -
>> - /* number of threads & urb entries */
>> - OUT_BATCH(1 << 16 | 1 << 8);
>> -
>> - OUT_BATCH(0);
>> -
>> - /* urb entry size & curbe size */
>> - OUT_BATCH(0 << 16 | 1);
>> -
>> - /* scoreboard */
>> - OUT_BATCH(0);
>> - OUT_BATCH(0);
>> - OUT_BATCH(0);
>> -}
>> -
>> -static void
>> -gen7_emit_curbe_load(struct intel_batchbuffer *batch, uint32_t
>> curbe_buffer)
>> -{
>> - OUT_BATCH(GEN7_MEDIA_CURBE_LOAD | (4 - 2));
>> - OUT_BATCH(0);
>> - /* curbe total data length */
>> - OUT_BATCH(64);
>> - /* curbe data start address, is relative to the dynamics base
>> address */
>> - OUT_BATCH(curbe_buffer);
>> -}
>> -
>> -static void
>> -gen7_emit_interface_descriptor_load(struct intel_batchbuffer *batch,
>> uint32_t interface_descriptor)
>> -{
>> - OUT_BATCH(GEN7_MEDIA_INTERFACE_DESCRIPTOR_LOAD | (4 - 2));
>> - OUT_BATCH(0);
>> - /* interface descriptor data length */
>> - OUT_BATCH(sizeof(struct gen7_interface_descriptor_data));
>> - /* interface descriptor address, is relative to the dynamics base
>> address */
>> - OUT_BATCH(interface_descriptor);
>> -}
>> -
>> -static void
>> -gen8_emit_interface_descriptor_load(struct intel_batchbuffer *batch,
>> uint32_t interface_descriptor)
>> -{
>> - OUT_BATCH(GEN8_MEDIA_INTERFACE_DESCRIPTOR_LOAD | (4 - 2));
>> - OUT_BATCH(0);
>> - /* interface descriptor data length */
>> - OUT_BATCH(sizeof(struct gen8_interface_descriptor_data));
>> - /* interface descriptor address, is relative to the dynamics base
>> address */
>> - OUT_BATCH(interface_descriptor);
>> -}
>> -
>> -static void
>> -gen7_emit_gpgpu_walk(struct intel_batchbuffer *batch,
>> - unsigned x, unsigned y,
>> - unsigned width, unsigned height)
>> -{
>> - uint32_t x_dim, y_dim, tmp, right_mask;
>> -
>> - /*
>> - * Simply do SIMD16 based dispatch, so every thread uses
>> - * SIMD16 channels.
>> - *
>> - * Define our own thread group size, e.g 16x1 for every group, then
>> - * will have 1 thread each group in SIMD16 dispatch. So thread
>> - * width/height/depth are all 1.
>> - *
>> - * Then thread group X = width / 16 (aligned to 16)
>> - * thread group Y = height;
>> - */
>> - x_dim = (width + 15) / 16;
>> - y_dim = height;
>> -
>> - tmp = width & 15;
>> - if (tmp == 0)
>> - right_mask = (1 << 16) - 1;
>> - else
>> - right_mask = (1 << tmp) - 1;
>> -
>> - OUT_BATCH(GEN7_GPGPU_WALKER | 9);
>> -
>> - /* interface descriptor offset */
>> - OUT_BATCH(0);
>> -
>> - /* SIMD size, thread w/h/d */
>> - OUT_BATCH(1 << 30 | /* SIMD16 */
>> - 0 << 16 | /* depth:1 */
>> - 0 << 8 | /* height:1 */
>> - 0); /* width:1 */
>> -
>> - /* thread group X */
>> - OUT_BATCH(0);
>> - OUT_BATCH(x_dim);
>> -
>> - /* thread group Y */
>> - OUT_BATCH(0);
>> - OUT_BATCH(y_dim);
>> -
>> - /* thread group Z */
>> - OUT_BATCH(0);
>> - OUT_BATCH(1);
>> -
>> - /* right mask */
>> - OUT_BATCH(right_mask);
>> -
>> - /* bottom mask, height 1, always 0xffffffff */
>> - OUT_BATCH(0xffffffff);
>> -}
>> -
>> -static void
>> -gen8_emit_gpgpu_walk(struct intel_batchbuffer *batch,
>> - unsigned x, unsigned y,
>> - unsigned width, unsigned height)
>> -{
>> - uint32_t x_dim, y_dim, tmp, right_mask;
>> -
>> - /*
>> - * Simply do SIMD16 based dispatch, so every thread uses
>> - * SIMD16 channels.
>> - *
>> - * Define our own thread group size, e.g 16x1 for every group, then
>> - * will have 1 thread each group in SIMD16 dispatch. So thread
>> - * width/height/depth are all 1.
>> - *
>> - * Then thread group X = width / 16 (aligned to 16)
>> - * thread group Y = height;
>> - */
>> - x_dim = (width + 15) / 16;
>> - y_dim = height;
>> -
>> - tmp = width & 15;
>> - if (tmp == 0)
>> - right_mask = (1 << 16) - 1;
>> - else
>> - right_mask = (1 << tmp) - 1;
>> -
>> - OUT_BATCH(GEN7_GPGPU_WALKER | 13);
>> -
>> - OUT_BATCH(0); /* kernel offset */
>> - OUT_BATCH(0); /* indirect data length */
>> - OUT_BATCH(0); /* indirect data offset */
>> -
>> - /* SIMD size, thread w/h/d */
>> - OUT_BATCH(1 << 30 | /* SIMD16 */
>> - 0 << 16 | /* depth:1 */
>> - 0 << 8 | /* height:1 */
>> - 0); /* width:1 */
>> -
>> - /* thread group X */
>> - OUT_BATCH(0);
>> - OUT_BATCH(0);
>> - OUT_BATCH(x_dim);
>> -
>> - /* thread group Y */
>> - OUT_BATCH(0);
>> - OUT_BATCH(0);
>> - OUT_BATCH(y_dim);
>> -
>> - /* thread group Z */
>> - OUT_BATCH(0);
>> - OUT_BATCH(1);
>> -
>> - /* right mask */
>> - OUT_BATCH(right_mask);
>> -
>> - /* bottom mask, height 1, always 0xffffffff */
>> - OUT_BATCH(0xffffffff);
>> -}
>> -
>> /*
>> * This sets up the gpgpu pipeline,
>> *
>> diff --git a/lib/gpu_fill.c b/lib/gpu_fill.c
>> new file mode 100644
>> index 00000000..b6da1cdc
>> --- /dev/null
>> +++ b/lib/gpu_fill.c
>> @@ -0,0 +1,729 @@
>> +#include <intel_bufmgr.h>
>> +#include <i915_drm.h>
>> +
>> +#include "intel_reg.h"
>> +#include "drmtest.h"
>> +#include "gpu_fill.h"
>> +#include <assert.h>
>> +#include "gen7_media.h"
>> +#include "gen8_media.h"
>> +
>> +
>> +uint32_t
>> +batch_used(struct intel_batchbuffer *batch)
>> +{
>> + return batch->ptr - batch->buffer;
>> +}
>> +
>> +uint32_t
>> +batch_align(struct intel_batchbuffer *batch, uint32_t align)
>> +{
>> + uint32_t offset = batch_used(batch);
>> + offset = ALIGN(offset, align);
>> + batch->ptr = batch->buffer + offset;
>> + return offset;
>> +}
>> +
>> +void *
>> +batch_alloc(struct intel_batchbuffer *batch, uint32_t size, uint32_t
>> align)
>> +{
>> + uint32_t offset = batch_align(batch, align);
>> + batch->ptr += size;
>> + return memset(batch->buffer + offset, 0, size);
>> +}
>> +
>> +uint32_t
>> +batch_offset(struct intel_batchbuffer *batch, void *ptr)
>> +{
>> + return (uint8_t *)ptr - batch->buffer;
>> +}
>> +
>> +uint32_t
>> +batch_copy(struct intel_batchbuffer *batch, const void *ptr, uint32_t
>> size, uint32_t align)
>> +{
>> + return batch_offset(batch, memcpy(batch_alloc(batch, size,
>> align), ptr, size));
>> +}
>> +
>> +void
>> +gen7_render_flush(struct intel_batchbuffer *batch, uint32_t batch_end)
>> +{
>> + int ret;
>> +
>> + ret = drm_intel_bo_subdata(batch->bo, 0, 4096, batch->buffer);
>> + if (ret == 0)
>> + ret = drm_intel_bo_mrb_exec(batch->bo, batch_end,
>> + NULL, 0, 0, 0);
>> + igt_assert(ret == 0);
>> +}
>> +
>> +uint32_t
>> +gen7_fill_curbe_buffer_data(struct intel_batchbuffer *batch,
>> + uint8_t color)
>> +{
>> + uint8_t *curbe_buffer;
>> + uint32_t offset;
>> +
>> + curbe_buffer = batch_alloc(batch, sizeof(uint32_t) * 8, 64);
>> + offset = batch_offset(batch, curbe_buffer);
>> + *curbe_buffer = color;
>> +
>> + return offset;
>> +}
>> +
>> +uint32_t
>> +gen7_fill_surface_state(struct intel_batchbuffer *batch,
>> + struct igt_buf *buf,
>> + uint32_t format,
>> + int is_dst)
>> +{
>> + struct gen7_surface_state *ss;
>> + uint32_t write_domain, read_domain, offset;
>> + int ret;
>> +
>> + if (is_dst) {
>> + write_domain = read_domain = I915_GEM_DOMAIN_RENDER;
>> + } else {
>> + write_domain = 0;
>> + read_domain = I915_GEM_DOMAIN_SAMPLER;
>> + }
>> +
>> + ss = batch_alloc(batch, sizeof(*ss), 64);
>> + offset = batch_offset(batch, ss);
>> +
>> + ss->ss0.surface_type = GEN7_SURFACE_2D;
>> + ss->ss0.surface_format = format;
>> + ss->ss0.render_cache_read_write = 1;
>> +
>> + if (buf->tiling == I915_TILING_X)
>> + ss->ss0.tiled_mode = 2;
>> + else if (buf->tiling == I915_TILING_Y)
>> + ss->ss0.tiled_mode = 3;
>> +
>> + ss->ss1.base_addr = buf->bo->offset;
>> + ret = drm_intel_bo_emit_reloc(batch->bo,
>> + batch_offset(batch, ss) + 4,
>> + buf->bo, 0,
>> + read_domain, write_domain);
>> + igt_assert(ret == 0);
>> +
>> + ss->ss2.height = igt_buf_height(buf) - 1;
>> + ss->ss2.width = igt_buf_width(buf) - 1;
>> +
>> + ss->ss3.pitch = buf->stride - 1;
>> +
>> + ss->ss7.shader_chanel_select_r = 4;
>> + ss->ss7.shader_chanel_select_g = 5;
>> + ss->ss7.shader_chanel_select_b = 6;
>> + ss->ss7.shader_chanel_select_a = 7;
>> +
>> + return offset;
>> +}
>> +
>> +uint32_t
>> +gen7_fill_binding_table(struct intel_batchbuffer *batch,
>> + struct igt_buf *dst)
>> +{
>> + uint32_t *binding_table, offset;
>> +
>> + binding_table = batch_alloc(batch, 32, 64);
>> + offset = batch_offset(batch, binding_table);
>> +
>> + binding_table[0] = gen7_fill_surface_state(batch, dst,
>> GEN7_SURFACEFORMAT_R8_UNORM, 1);
>> +
>> + return offset;
>> +}
>> +
>> +uint32_t
>> +gen7_fill_kernel(struct intel_batchbuffer *batch,
>> + const uint32_t kernel[][4],
>> + size_t size)
>> +{
>> + uint32_t offset;
>> +
>> + offset = batch_copy(batch, kernel, size, 64);
>> +
>> + return offset;
>> +}
>> +
>> +uint32_t
>> +gen7_fill_interface_descriptor(struct intel_batchbuffer *batch,
>> struct igt_buf *dst,
>> + const uint32_t kernel[][4], size_t size)
>> +{
>> + struct gen7_interface_descriptor_data *idd;
>> + uint32_t offset;
>> + uint32_t binding_table_offset, kernel_offset;
>> +
>> + binding_table_offset = gen7_fill_binding_table(batch, dst);
>> + kernel_offset = gen7_fill_kernel(batch, kernel, size);
>> +
>> + idd = batch_alloc(batch, sizeof(*idd), 64);
>> + offset = batch_offset(batch, idd);
>> +
>> + idd->desc0.kernel_start_pointer = (kernel_offset >> 6);
>> +
>> + idd->desc1.single_program_flow = 1;
>> + idd->desc1.floating_point_mode = GEN7_FLOATING_POINT_IEEE_754;
>> +
>> + idd->desc2.sampler_count = 0; /* 0 samplers used */
>> + idd->desc2.sampler_state_pointer = 0;
>> +
>> + idd->desc3.binding_table_entry_count = 0;
>> + idd->desc3.binding_table_pointer = (binding_table_offset >> 5);
>> +
>> + idd->desc4.constant_urb_entry_read_offset = 0;
>> + idd->desc4.constant_urb_entry_read_length = 1; /* grf 1 */
>> +
>> + return offset;
>> +}
>> +
>> +void
>> +gen7_emit_state_base_address(struct intel_batchbuffer *batch)
>> +{
>> + OUT_BATCH(GEN7_STATE_BASE_ADDRESS | (10 - 2));
>> +
>> + /* general */
>> + OUT_BATCH(0);
>> +
>> + /* surface */
>> + OUT_RELOC(batch->bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
>> BASE_ADDRESS_MODIFY);
>> +
>> + /* dynamic */
>> + OUT_RELOC(batch->bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
>> BASE_ADDRESS_MODIFY);
>> +
>> + /* indirect */
>> + OUT_BATCH(0);
>> +
>> + /* instruction */
>> + OUT_RELOC(batch->bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
>> BASE_ADDRESS_MODIFY);
>> +
>> + /* general/dynamic/indirect/instruction access Bound */
>> + OUT_BATCH(0);
>> + OUT_BATCH(0 | BASE_ADDRESS_MODIFY);
>> + OUT_BATCH(0);
>> + OUT_BATCH(0 | BASE_ADDRESS_MODIFY);
>> +}
>> +
>> +void
>> +gen7_emit_vfe_state(struct intel_batchbuffer *batch)
>> +{
>> + OUT_BATCH(GEN7_MEDIA_VFE_STATE | (8 - 2));
>> +
>> + /* scratch buffer */
>> + OUT_BATCH(0);
>> +
>> + /* number of threads & urb entries */
>> + OUT_BATCH(1 << 16 |
>> + 2 << 8);
>> +
>> + OUT_BATCH(0);
>> +
>> + /* urb entry size & curbe size */
>> + OUT_BATCH(2 << 16 | /* in 256 bits unit */
>> + 2); /* in 256 bits unit */
>> +
>> + /* scoreboard */
>> + OUT_BATCH(0);
>> + OUT_BATCH(0);
>> + OUT_BATCH(0);
>> +}
>> +
>> +void
>> +gen7_emit_vfe_state_gpgpu(struct intel_batchbuffer *batch)
>> +{
>> + OUT_BATCH(GEN7_MEDIA_VFE_STATE | (8 - 2));
>> +
>> + /* scratch buffer */
>> + OUT_BATCH(0);
>> +
>> + /* number of threads & urb entries */
>> + OUT_BATCH(1 << 16 | /* max num of threads */
>> + 0 << 8 | /* num of URB entry */
>> + 1 << 2); /* GPGPU mode */
>> +
>> + OUT_BATCH(0);
>> +
>> + /* urb entry size & curbe size */
>> + OUT_BATCH(0 << 16 | /* URB entry size in 256 bits unit */
>> + 1); /* CURBE entry size in 256 bits unit */
>> +
>> + /* scoreboard */
>> + OUT_BATCH(0);
>> + OUT_BATCH(0);
>> + OUT_BATCH(0);
>> +}
>> +
>> +void
>> +gen7_emit_curbe_load(struct intel_batchbuffer *batch, uint32_t
>> curbe_buffer)
>> +{
>> + OUT_BATCH(GEN7_MEDIA_CURBE_LOAD | (4 - 2));
>> + OUT_BATCH(0);
>> + /* curbe total data length */
>> + OUT_BATCH(64);
>> + /* curbe data start address, is relative to the dynamics base
>> address */
>> + OUT_BATCH(curbe_buffer);
>> +}
>> +
>> +void
>> +gen7_emit_interface_descriptor_load(struct intel_batchbuffer *batch,
>> uint32_t interface_descriptor)
>> +{
>> + OUT_BATCH(GEN7_MEDIA_INTERFACE_DESCRIPTOR_LOAD | (4 - 2));
>> + OUT_BATCH(0);
>> + /* interface descriptor data length */
>> + OUT_BATCH(sizeof(struct gen7_interface_descriptor_data));
>> + /* interface descriptor address, is relative to the dynamics base
>> address */
>> + OUT_BATCH(interface_descriptor);
>> +}
>> +
>> +void
>> +gen7_emit_media_objects(struct intel_batchbuffer *batch,
>> + unsigned x, unsigned y,
>> + unsigned width, unsigned height)
>> +{
>> + int i, j;
>> +
>> + for (i = 0; i < width / 16; i++) {
>> + for (j = 0; j < height / 16; j++) {
>> + OUT_BATCH(GEN7_MEDIA_OBJECT | (8 - 2));
>> +
>> + /* interface descriptor offset */
>> + OUT_BATCH(0);
>> +
>> + /* without indirect data */
>> + OUT_BATCH(0);
>> + OUT_BATCH(0);
>> +
>> + /* scoreboard */
>> + OUT_BATCH(0);
>> + OUT_BATCH(0);
>> +
>> + /* inline data (xoffset, yoffset) */
>> + OUT_BATCH(x + i * 16);
>> + OUT_BATCH(y + j * 16);
>> + }
>> + }
>> +}
>> +
>> +void
>> +gen7_emit_gpgpu_walk(struct intel_batchbuffer *batch,
>> + unsigned x, unsigned y,
>> + unsigned width, unsigned height)
>> +{
>> + uint32_t x_dim, y_dim, tmp, right_mask;
>> +
>> + /*
>> + * Simply do SIMD16 based dispatch, so every thread uses
>> + * SIMD16 channels.
>> + *
>> + * Define our own thread group size, e.g 16x1 for every group, then
>> + * will have 1 thread each group in SIMD16 dispatch. So thread
>> + * width/height/depth are all 1.
>> + *
>> + * Then thread group X = width / 16 (aligned to 16)
>> + * thread group Y = height;
>> + */
>> + x_dim = (width + 15) / 16;
>> + y_dim = height;
>> +
>> + tmp = width & 15;
>> + if (tmp == 0)
>> + right_mask = (1 << 16) - 1;
>> + else
>> + right_mask = (1 << tmp) - 1;
>> +
>> + OUT_BATCH(GEN7_GPGPU_WALKER | 9);
>> +
>> + /* interface descriptor offset */
>> + OUT_BATCH(0);
>> +
>> + /* SIMD size, thread w/h/d */
>> + OUT_BATCH(1 << 30 | /* SIMD16 */
>> + 0 << 16 | /* depth:1 */
>> + 0 << 8 | /* height:1 */
>> + 0); /* width:1 */
>> +
>> + /* thread group X */
>> + OUT_BATCH(0);
>> + OUT_BATCH(x_dim);
>> +
>> + /* thread group Y */
>> + OUT_BATCH(0);
>> + OUT_BATCH(y_dim);
>> +
>> + /* thread group Z */
>> + OUT_BATCH(0);
>> + OUT_BATCH(1);
>> +
>> + /* right mask */
>> + OUT_BATCH(right_mask);
>> +
>> + /* bottom mask, height 1, always 0xffffffff */
>> + OUT_BATCH(0xffffffff);
>> +}
>> +
>> +uint32_t
>> +gen8_fill_surface_state(struct intel_batchbuffer *batch,
>> + struct igt_buf *buf,
>> + uint32_t format,
>> + int is_dst)
>> +{
>> + struct gen8_surface_state *ss;
>> + uint32_t write_domain, read_domain, offset;
>> + int ret;
>> +
>> + if (is_dst) {
>> + write_domain = read_domain = I915_GEM_DOMAIN_RENDER;
>> + } else {
>> + write_domain = 0;
>> + read_domain = I915_GEM_DOMAIN_SAMPLER;
>> + }
>> +
>> + ss = batch_alloc(batch, sizeof(*ss), 64);
>> + offset = batch_offset(batch, ss);
>> +
>> + ss->ss0.surface_type = GEN8_SURFACE_2D;
>> + ss->ss0.surface_format = format;
>> + ss->ss0.render_cache_read_write = 1;
>> + ss->ss0.vertical_alignment = 1; /* align 4 */
>> + ss->ss0.horizontal_alignment = 1; /* align 4 */
>> +
>> + if (buf->tiling == I915_TILING_X)
>> + ss->ss0.tiled_mode = 2;
>> + else if (buf->tiling == I915_TILING_Y)
>> + ss->ss0.tiled_mode = 3;
>> +
>> + ss->ss8.base_addr = buf->bo->offset;
>> +
>> + ret = drm_intel_bo_emit_reloc(batch->bo,
>> + batch_offset(batch, ss) + 8 * 4,
>> + buf->bo, 0,
>> + read_domain, write_domain);
>> + igt_assert(ret == 0);
>> +
>> + ss->ss2.height = igt_buf_height(buf) - 1;
>> + ss->ss2.width = igt_buf_width(buf) - 1;
>> + ss->ss3.pitch = buf->stride - 1;
>> +
>> + ss->ss7.shader_chanel_select_r = 4;
>> + ss->ss7.shader_chanel_select_g = 5;
>> + ss->ss7.shader_chanel_select_b = 6;
>> + ss->ss7.shader_chanel_select_a = 7;
>> +
>> + return offset;
>> +}
>> +
>> +uint32_t
>> +gen8_fill_binding_table(struct intel_batchbuffer *batch,
>> + struct igt_buf *dst)
>> +{
>> + uint32_t *binding_table, offset;
>> +
>> + binding_table = batch_alloc(batch, 32, 64);
>> + offset = batch_offset(batch, binding_table);
>> +
>> + binding_table[0] = gen8_fill_surface_state(batch, dst,
>> GEN8_SURFACEFORMAT_R8_UNORM, 1);
>> +
>> + return offset;
>> +}
>> +
>> +uint32_t
>> +gen8_fill_interface_descriptor(struct intel_batchbuffer *batch,
>> struct igt_buf *dst, const uint32_t kernel[][4], size_t size)
>> +{
>> + struct gen8_interface_descriptor_data *idd;
>> + uint32_t offset;
>> + uint32_t binding_table_offset, kernel_offset;
>> +
>> + binding_table_offset = gen8_fill_binding_table(batch, dst);
>> + kernel_offset = gen7_fill_kernel(batch, kernel, sizeof(kernel));
>> +
>> + idd = batch_alloc(batch, sizeof(*idd), 64);
>> + offset = batch_offset(batch, idd);
>> +
>> + idd->desc0.kernel_start_pointer = (kernel_offset >> 6);
>> +
>> + idd->desc2.single_program_flow = 1;
>> + idd->desc2.floating_point_mode = GEN8_FLOATING_POINT_IEEE_754;
>> +
>> + idd->desc3.sampler_count = 0; /* 0 samplers used */
>> + idd->desc3.sampler_state_pointer = 0;
>> +
>> + idd->desc4.binding_table_entry_count = 0;
>> + idd->desc4.binding_table_pointer = (binding_table_offset >> 5);
>> +
>> + idd->desc5.constant_urb_entry_read_offset = 0;
>> + idd->desc5.constant_urb_entry_read_length = 1; /* grf 1 */
>> +
>> + return offset;
>> +}
>> +
>> +void
>> +gen8_emit_state_base_address(struct intel_batchbuffer *batch)
>> +{
>> + OUT_BATCH(GEN8_STATE_BASE_ADDRESS | (16 - 2));
>> +
>> + /* general */
>> + OUT_BATCH(0 | BASE_ADDRESS_MODIFY);
>> + OUT_BATCH(0);
>> +
>> + /* stateless data port */
>> + OUT_BATCH(0 | BASE_ADDRESS_MODIFY);
>> +
>> + /* surface */
>> + OUT_RELOC(batch->bo, I915_GEM_DOMAIN_SAMPLER, 0,
>> BASE_ADDRESS_MODIFY);
>> +
>> + /* dynamic */
>> + OUT_RELOC(batch->bo, I915_GEM_DOMAIN_RENDER |
>> I915_GEM_DOMAIN_INSTRUCTION,
>> + 0, BASE_ADDRESS_MODIFY);
>> +
>> + /* indirect */
>> + OUT_BATCH(0);
>> + OUT_BATCH(0);
>> +
>> + /* instruction */
>> + OUT_RELOC(batch->bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
>> BASE_ADDRESS_MODIFY);
>> +
>> + /* general state buffer size */
>> + OUT_BATCH(0xfffff000 | 1);
>> + /* dynamic state buffer size */
>> + OUT_BATCH(1 << 12 | 1);
>> + /* indirect object buffer size */
>> + OUT_BATCH(0xfffff000 | 1);
>> + /* intruction buffer size, must set modify enable bit, otherwise
>> it may result in GPU hang */
>> + OUT_BATCH(1 << 12 | 1);
>> +}
>> +
>> +void
>> +gen8_emit_vfe_state(struct intel_batchbuffer *batch)
>> +{
>> + OUT_BATCH(GEN8_MEDIA_VFE_STATE | (9 - 2));
>> +
>> + /* scratch buffer */
>> + OUT_BATCH(0);
>> + OUT_BATCH(0);
>> +
>> + /* number of threads & urb entries */
>> + OUT_BATCH(1 << 16 |
>> + 2 << 8);
>> +
>> + OUT_BATCH(0);
>> +
>> + /* urb entry size & curbe size */
>> + OUT_BATCH(2 << 16 |
>> + 2);
>> +
>> + /* scoreboard */
>> + OUT_BATCH(0);
>> + OUT_BATCH(0);
>> + OUT_BATCH(0);
>> +}
>> +
>> +void
>> +gen8_emit_vfe_state_gpgpu(struct intel_batchbuffer *batch)
>> +{
>> + OUT_BATCH(GEN8_MEDIA_VFE_STATE | (9 - 2));
>> +
>> + /* scratch buffer */
>> + OUT_BATCH(0);
>> + OUT_BATCH(0);
>> +
>> + /* number of threads & urb entries */
>> + OUT_BATCH(1 << 16 | 1 << 8);
>> +
>> + OUT_BATCH(0);
>> +
>> + /* urb entry size & curbe size */
>> + OUT_BATCH(0 << 16 | 1);
>> +
>> + /* scoreboard */
>> + OUT_BATCH(0);
>> + OUT_BATCH(0);
>> + OUT_BATCH(0);
>> +}
>> +
>> +void
>> +gen8_emit_curbe_load(struct intel_batchbuffer *batch, uint32_t
>> curbe_buffer)
>> +{
>> + OUT_BATCH(GEN8_MEDIA_CURBE_LOAD | (4 - 2));
>> + OUT_BATCH(0);
>> + /* curbe total data length */
>> + OUT_BATCH(64);
>> + /* curbe data start address, is relative to the dynamics base
>> address */
>> + OUT_BATCH(curbe_buffer);
>> +}
>> +
>> +void
>> +gen8_emit_interface_descriptor_load(struct intel_batchbuffer *batch,
>> uint32_t interface_descriptor)
>> +{
>> + OUT_BATCH(GEN8_MEDIA_INTERFACE_DESCRIPTOR_LOAD | (4 - 2));
>> + OUT_BATCH(0);
>> + /* interface descriptor data length */
>> + OUT_BATCH(sizeof(struct gen8_interface_descriptor_data));
>> + /* interface descriptor address, is relative to the dynamics base
>> address */
>> + OUT_BATCH(interface_descriptor);
>> +}
>> +
>> +void
>> +gen8_emit_media_state_flush(struct intel_batchbuffer *batch)
>> +{
>> + OUT_BATCH(GEN8_MEDIA_STATE_FLUSH | (2 - 2));
>> + OUT_BATCH(0);
>> +}
>> +
>> +void
>> +gen8_emit_media_objects(struct intel_batchbuffer *batch,
>> + unsigned x, unsigned y,
>> + unsigned width, unsigned height)
>> +{
>> + int i, j;
>> +
>> + for (i = 0; i < width / 16; i++) {
>> + for (j = 0; j < height / 16; j++) {
>> + OUT_BATCH(GEN8_MEDIA_OBJECT | (8 - 2));
>> +
>> + /* interface descriptor offset */
>> + OUT_BATCH(0);
>> +
>> + /* without indirect data */
>> + OUT_BATCH(0);
>> + OUT_BATCH(0);
>> +
>> + /* scoreboard */
>> + OUT_BATCH(0);
>> + OUT_BATCH(0);
>> +
>> + /* inline data (xoffset, yoffset) */
>> + OUT_BATCH(x + i * 16);
>> + OUT_BATCH(y + j * 16);
>> + gen8_emit_media_state_flush(batch);
>> + }
>> + }
>> +}
>> +
>> +void
>> +gen8lp_emit_media_objects(struct intel_batchbuffer *batch,
>> + unsigned x, unsigned y,
>> + unsigned width, unsigned height)
>> +{
>> + int i, j;
>> +
>> + for (i = 0; i < width / 16; i++) {
>> + for (j = 0; j < height / 16; j++) {
>> + OUT_BATCH(GEN8_MEDIA_OBJECT | (8 - 2));
>> +
>> + /* interface descriptor offset */
>> + OUT_BATCH(0);
>> +
>> + /* without indirect data */
>> + OUT_BATCH(0);
>> + OUT_BATCH(0);
>> +
>> + /* scoreboard */
>> + OUT_BATCH(0);
>> + OUT_BATCH(0);
>> +
>> + /* inline data (xoffset, yoffset) */
>> + OUT_BATCH(x + i * 16);
>> + OUT_BATCH(y + j * 16);
>> + }
>> + }
>> +}
>> +
>> +void
>> +gen8_emit_gpgpu_walk(struct intel_batchbuffer *batch,
>> + unsigned x, unsigned y,
>> + unsigned width, unsigned height)
>> +{
>> + uint32_t x_dim, y_dim, tmp, right_mask;
>> +
>> + /*
>> + * Simply do SIMD16 based dispatch, so every thread uses
>> + * SIMD16 channels.
>> + *
>> + * Define our own thread group size, e.g 16x1 for every group, then
>> + * will have 1 thread each group in SIMD16 dispatch. So thread
>> + * width/height/depth are all 1.
>> + *
>> + * Then thread group X = width / 16 (aligned to 16)
>> + * thread group Y = height;
>> + */
>> + x_dim = (width + 15) / 16;
>> + y_dim = height;
>> +
>> + tmp = width & 15;
>> + if (tmp == 0)
>> + right_mask = (1 << 16) - 1;
>> + else
>> + right_mask = (1 << tmp) - 1;
>> +
>> + OUT_BATCH(GEN7_GPGPU_WALKER | 13);
>> +
>> + OUT_BATCH(0); /* kernel offset */
>> + OUT_BATCH(0); /* indirect data length */
>> + OUT_BATCH(0); /* indirect data offset */
>> +
>> + /* SIMD size, thread w/h/d */
>> + OUT_BATCH(1 << 30 | /* SIMD16 */
>> + 0 << 16 | /* depth:1 */
>> + 0 << 8 | /* height:1 */
>> + 0); /* width:1 */
>> +
>> + /* thread group X */
>> + OUT_BATCH(0);
>> + OUT_BATCH(0);
>> + OUT_BATCH(x_dim);
>> +
>> + /* thread group Y */
>> + OUT_BATCH(0);
>> + OUT_BATCH(0);
>> + OUT_BATCH(y_dim);
>> +
>> + /* thread group Z */
>> + OUT_BATCH(0);
>> + OUT_BATCH(1);
>> +
>> + /* right mask */
>> + OUT_BATCH(right_mask);
>> +
>> + /* bottom mask, height 1, always 0xffffffff */
>> + OUT_BATCH(0xffffffff);
>> +}
>> +
>> +void
>> +gen9_emit_state_base_address(struct intel_batchbuffer *batch)
>> +{
>> + OUT_BATCH(GEN8_STATE_BASE_ADDRESS | (19 - 2));
>> +
>> + /* general */
>> + OUT_BATCH(0 | BASE_ADDRESS_MODIFY);
>> + OUT_BATCH(0);
>> +
>> + /* stateless data port */
>> + OUT_BATCH(0 | BASE_ADDRESS_MODIFY);
>> +
>> + /* surface */
>> + OUT_RELOC(batch->bo, I915_GEM_DOMAIN_SAMPLER, 0,
>> BASE_ADDRESS_MODIFY);
>> +
>> + /* dynamic */
>> + OUT_RELOC(batch->bo, I915_GEM_DOMAIN_RENDER |
>> I915_GEM_DOMAIN_INSTRUCTION,
>> + 0, BASE_ADDRESS_MODIFY);
>> +
>> + /* indirect */
>> + OUT_BATCH(0);
>> + OUT_BATCH(0);
>> +
>> + /* instruction */
>> + OUT_RELOC(batch->bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
>> BASE_ADDRESS_MODIFY);
>> +
>> + /* general state buffer size */
>> + OUT_BATCH(0xfffff000 | 1);
>> + /* dynamic state buffer size */
>> + OUT_BATCH(1 << 12 | 1);
>> + /* indirect object buffer size */
>> + OUT_BATCH(0xfffff000 | 1);
>> + /* intruction buffer size, must set modify enable bit, otherwise
>> it may result in GPU hang */
>> + OUT_BATCH(1 << 12 | 1);
>> +
>> + /* Bindless surface state base address */
>> + OUT_BATCH(0 | BASE_ADDRESS_MODIFY);
>> + OUT_BATCH(0);
>> + OUT_BATCH(0xfffff000);
>> +}
>> diff --git a/lib/gpu_fill.h b/lib/gpu_fill.h
>> new file mode 100644
>> index 00000000..a271ce6e
>> --- /dev/null
>> +++ b/lib/gpu_fill.h
>> @@ -0,0 +1,141 @@
>> +/*
>> + * Copyright © 2018 Intel Corporation
>> + *
>> + * Permission is hereby granted, free of charge, to any person
>> obtaining a
>> + * copy of this software and associated documentation files (the
>> "Software"),
>> + * to deal in the Software without restriction, including without
>> limitation
>> + * the rights to use, copy, modify, merge, publish, distribute,
>> sublicense,
>> + * and/or sell copies of the Software, and to permit persons to whom the
>> + * Software is furnished to do so, subject to the following conditions:
>> + *
>> + * The above copyright notice and this permission notice (including
>> the next
>> + * paragraph) shall be included in all copies or substantial portions
>> of the
>> + * Software.
>> + *
>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
>> EXPRESS OR
>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
>> MERCHANTABILITY,
>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
>> SHALL
>> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES
>> OR OTHER
>> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
>> ARISING
>> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
>> OTHER DEALINGS
>> + * IN THE SOFTWARE.
>> + *
>> + */
>> +
>> +#ifndef GPU_FILL_H
>> +#define GPU_FILL_H
>> +
>> +uint32_t
>> +batch_used(struct intel_batchbuffer *batch);
>> +
>> +uint32_t
>> +batch_align(struct intel_batchbuffer *batch, uint32_t align);
>> +
>> +void *
>> +batch_alloc(struct intel_batchbuffer *batch, uint32_t size, uint32_t
>> align);
>> +
>> +uint32_t
>> +batch_offset(struct intel_batchbuffer *batch, void *ptr);
>> +
>> +uint32_t
>> +batch_copy(struct intel_batchbuffer *batch, const void *ptr, uint32_t
>> size, uint32_t align);
>> +
>> +void
>> +gen7_render_flush(struct intel_batchbuffer *batch, uint32_t batch_end);
>> +
>> +uint32_t
>> +gen7_fill_curbe_buffer_data(struct intel_batchbuffer *batch,
>> + uint8_t color);
>> +
>> +uint32_t
>> +gen7_fill_surface_state(struct intel_batchbuffer *batch,
>> + struct igt_buf *buf,
>> + uint32_t format,
>> + int is_dst);
>> +
>> +uint32_t
>> +gen7_fill_binding_table(struct intel_batchbuffer *batch,
>> + struct igt_buf *dst);
>> +uint32_t
>> +gen7_fill_kernel(struct intel_batchbuffer *batch,
>> + const uint32_t kernel[][4],
>> + size_t size);
>> +
>> +uint32_t
>> +gen7_fill_interface_descriptor(struct intel_batchbuffer *batch,
>> struct igt_buf *dst,
>> + const uint32_t kernel[][4], size_t size);
>> +
>> +void
>> +gen7_emit_state_base_address(struct intel_batchbuffer *batch);
>> +
>> +void
>> +gen7_emit_vfe_state(struct intel_batchbuffer *batch);
>> +
>> +void
>> +gen7_emit_vfe_state_gpgpu(struct intel_batchbuffer *batch);
>> +
>> +void
>> +gen7_emit_curbe_load(struct intel_batchbuffer *batch, uint32_t
>> curbe_buffer);
>> +
>> +void
>> +gen7_emit_interface_descriptor_load(struct intel_batchbuffer *batch,
>> uint32_t interface_descriptor);
>> +
>> +void
>> +gen7_emit_media_objects(struct intel_batchbuffer *batch,
>> + unsigned x, unsigned y,
>> + unsigned width, unsigned height);
>> +
>> +uint32_t
>> +gen8_fill_surface_state(struct intel_batchbuffer *batch,
>> + struct igt_buf *buf,
>> + uint32_t format,
>> + int is_dst);
>> +
>> +uint32_t
>> +gen8_fill_binding_table(struct intel_batchbuffer *batch,
>> + struct igt_buf *dst);
>> +
>> +uint32_t
>> +gen8_fill_interface_descriptor(struct intel_batchbuffer *batch,
>> struct igt_buf *dst, const uint32_t kernel[][4], size_t size);
>> +
>> +void
>> +gen8_emit_state_base_address(struct intel_batchbuffer *batch);
>> +
>> +void
>> +gen8_emit_vfe_state(struct intel_batchbuffer *batch);
>> +
>> +void
>> +gen8_emit_vfe_state_gpgpu(struct intel_batchbuffer *batch);
>> +
>> +void
>> +gen8_emit_curbe_load(struct intel_batchbuffer *batch, uint32_t
>> curbe_buffer);
>> +
>> +void
>> +gen8_emit_interface_descriptor_load(struct intel_batchbuffer *batch,
>> uint32_t interface_descriptor);
>> +
>> +void
>> +gen8_emit_media_state_flush(struct intel_batchbuffer *batch);
>> +
>> +void
>> +gen8_emit_media_objects(struct intel_batchbuffer *batch,
>> + unsigned x, unsigned y,
>> + unsigned width, unsigned height);
>> +
>> +void
>> +gen8lp_emit_media_objects(struct intel_batchbuffer *batch,
>> + unsigned x, unsigned y,
>> + unsigned width, unsigned height);
>> +
>> +void
>> +gen9_emit_state_base_address(struct intel_batchbuffer *batch);
>> +
>> +void
>> +gen7_emit_gpgpu_walk(struct intel_batchbuffer *batch,
>> + unsigned x, unsigned y,
>> + unsigned width, unsigned height);
>> +void
>> +gen8_emit_gpgpu_walk(struct intel_batchbuffer *batch,
>> + unsigned x, unsigned y,
>> + unsigned width, unsigned height);
>> +
>> +#endif /* GPU_FILL_H */
>> diff --git a/lib/media_fill_gen7.c b/lib/media_fill_gen7.c
>> index 6fb44798..c97555a6 100644
>> --- a/lib/media_fill_gen7.c
>> +++ b/lib/media_fill_gen7.c
>> @@ -5,7 +5,7 @@
>> #include "gen7_media.h"
>> #include "intel_reg.h"
>> #include "drmtest.h"
>> -
>> +#include "gpu_fill.h"
>> #include <assert.h>
>> static const uint32_t media_kernel[][4] = {
>> @@ -22,275 +22,6 @@ static const uint32_t media_kernel[][4] = {
>> { 0x07800031, 0x20001ca8, 0x00000e00, 0x82000010 },
>> };
>> -static uint32_t
>> -batch_used(struct intel_batchbuffer *batch)
>> -{
>> - return batch->ptr - batch->buffer;
>> -}
>> -
>> -static uint32_t
>> -batch_align(struct intel_batchbuffer *batch, uint32_t align)
>> -{
>> - uint32_t offset = batch_used(batch);
>> - offset = ALIGN(offset, align);
>> - batch->ptr = batch->buffer + offset;
>> - return offset;
>> -}
>> -
>> -static void *
>> -batch_alloc(struct intel_batchbuffer *batch, uint32_t size, uint32_t
>> align)
>> -{
>> - uint32_t offset = batch_align(batch, align);
>> - batch->ptr += size;
>> - return memset(batch->buffer + offset, 0, size);
>> -}
>> -
>> -static uint32_t
>> -batch_offset(struct intel_batchbuffer *batch, void *ptr)
>> -{
>> - return (uint8_t *)ptr - batch->buffer;
>> -}
>> -
>> -static uint32_t
>> -batch_copy(struct intel_batchbuffer *batch, const void *ptr, uint32_t
>> size, uint32_t align)
>> -{
>> - return batch_offset(batch, memcpy(batch_alloc(batch, size,
>> align), ptr, size));
>> -}
>> -
>> -static void
>> -gen7_render_flush(struct intel_batchbuffer *batch, uint32_t batch_end)
>> -{
>> - int ret;
>> -
>> - ret = drm_intel_bo_subdata(batch->bo, 0, 4096, batch->buffer);
>> - if (ret == 0)
>> - ret = drm_intel_bo_mrb_exec(batch->bo, batch_end,
>> - NULL, 0, 0, 0);
>> - igt_assert(ret == 0);
>> -}
>> -
>> -static uint32_t
>> -gen7_fill_curbe_buffer_data(struct intel_batchbuffer *batch,
>> - uint8_t color)
>> -{
>> - uint8_t *curbe_buffer;
>> - uint32_t offset;
>> -
>> - curbe_buffer = batch_alloc(batch, sizeof(uint32_t) * 8, 64);
>> - offset = batch_offset(batch, curbe_buffer);
>> - *curbe_buffer = color;
>> -
>> - return offset;
>> -}
>> -
>> -static uint32_t
>> -gen7_fill_surface_state(struct intel_batchbuffer *batch,
>> - struct igt_buf *buf,
>> - uint32_t format,
>> - int is_dst)
>> -{
>> - struct gen7_surface_state *ss;
>> - uint32_t write_domain, read_domain, offset;
>> - int ret;
>> -
>> - if (is_dst) {
>> - write_domain = read_domain = I915_GEM_DOMAIN_RENDER;
>> - } else {
>> - write_domain = 0;
>> - read_domain = I915_GEM_DOMAIN_SAMPLER;
>> - }
>> -
>> - ss = batch_alloc(batch, sizeof(*ss), 64);
>> - offset = batch_offset(batch, ss);
>> -
>> - ss->ss0.surface_type = GEN7_SURFACE_2D;
>> - ss->ss0.surface_format = format;
>> - ss->ss0.render_cache_read_write = 1;
>> -
>> - if (buf->tiling == I915_TILING_X)
>> - ss->ss0.tiled_mode = 2;
>> - else if (buf->tiling == I915_TILING_Y)
>> - ss->ss0.tiled_mode = 3;
>> -
>> - ss->ss1.base_addr = buf->bo->offset;
>> - ret = drm_intel_bo_emit_reloc(batch->bo,
>> - batch_offset(batch, ss) + 4,
>> - buf->bo, 0,
>> - read_domain, write_domain);
>> - igt_assert(ret == 0);
>> -
>> - ss->ss2.height = igt_buf_height(buf) - 1;
>> - ss->ss2.width = igt_buf_width(buf) - 1;
>> -
>> - ss->ss3.pitch = buf->stride - 1;
>> -
>> - ss->ss7.shader_chanel_select_r = 4;
>> - ss->ss7.shader_chanel_select_g = 5;
>> - ss->ss7.shader_chanel_select_b = 6;
>> - ss->ss7.shader_chanel_select_a = 7;
>> -
>> - return offset;
>> -}
>> -
>> -static uint32_t
>> -gen7_fill_binding_table(struct intel_batchbuffer *batch,
>> - struct igt_buf *dst)
>> -{
>> - uint32_t *binding_table, offset;
>> -
>> - binding_table = batch_alloc(batch, 32, 64);
>> - offset = batch_offset(batch, binding_table);
>> -
>> - binding_table[0] = gen7_fill_surface_state(batch, dst,
>> GEN7_SURFACEFORMAT_R8_UNORM, 1);
>> -
>> - return offset;
>> -}
>> -
>> -static uint32_t
>> -gen7_fill_media_kernel(struct intel_batchbuffer *batch,
>> - const uint32_t kernel[][4],
>> - size_t size)
>> -{
>> - uint32_t offset;
>> -
>> - offset = batch_copy(batch, kernel, size, 64);
>> -
>> - return offset;
>> -}
>> -
>> -static uint32_t
>> -gen7_fill_interface_descriptor(struct intel_batchbuffer *batch,
>> struct igt_buf *dst,
>> - const uint32_t kernel[][4], size_t size)
>> -{
>> - struct gen7_interface_descriptor_data *idd;
>> - uint32_t offset;
>> - uint32_t binding_table_offset, kernel_offset;
>> -
>> - binding_table_offset = gen7_fill_binding_table(batch, dst);
>> - kernel_offset = gen7_fill_media_kernel(batch, kernel, size);
>> -
>> - idd = batch_alloc(batch, sizeof(*idd), 64);
>> - offset = batch_offset(batch, idd);
>> -
>> - idd->desc0.kernel_start_pointer = (kernel_offset >> 6);
>> -
>> - idd->desc1.single_program_flow = 1;
>> - idd->desc1.floating_point_mode = GEN7_FLOATING_POINT_IEEE_754;
>> -
>> - idd->desc2.sampler_count = 0; /* 0 samplers used */
>> - idd->desc2.sampler_state_pointer = 0;
>> -
>> - idd->desc3.binding_table_entry_count = 0;
>> - idd->desc3.binding_table_pointer = (binding_table_offset >> 5);
>> -
>> - idd->desc4.constant_urb_entry_read_offset = 0;
>> - idd->desc4.constant_urb_entry_read_length = 1; /* grf 1 */
>> -
>> - return offset;
>> -}
>> -
>> -static void
>> -gen7_emit_state_base_address(struct intel_batchbuffer *batch)
>> -{
>> - OUT_BATCH(GEN7_STATE_BASE_ADDRESS | (10 - 2));
>> -
>> - /* general */
>> - OUT_BATCH(0);
>> -
>> - /* surface */
>> - OUT_RELOC(batch->bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
>> BASE_ADDRESS_MODIFY);
>> -
>> - /* dynamic */
>> - OUT_RELOC(batch->bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
>> BASE_ADDRESS_MODIFY);
>> -
>> - /* indirect */
>> - OUT_BATCH(0);
>> -
>> - /* instruction */
>> - OUT_RELOC(batch->bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
>> BASE_ADDRESS_MODIFY);
>> -
>> - /* general/dynamic/indirect/instruction access Bound */
>> - OUT_BATCH(0);
>> - OUT_BATCH(0 | BASE_ADDRESS_MODIFY);
>> - OUT_BATCH(0);
>> - OUT_BATCH(0 | BASE_ADDRESS_MODIFY);
>> -}
>> -
>> -static void
>> -gen7_emit_vfe_state(struct intel_batchbuffer *batch)
>> -{
>> - OUT_BATCH(GEN7_MEDIA_VFE_STATE | (8 - 2));
>> -
>> - /* scratch buffer */
>> - OUT_BATCH(0);
>> -
>> - /* number of threads & urb entries */
>> - OUT_BATCH(1 << 16 |
>> - 2 << 8);
>> -
>> - OUT_BATCH(0);
>> -
>> - /* urb entry size & curbe size */
>> - OUT_BATCH(2 << 16 | /* in 256 bits unit */
>> - 2); /* in 256 bits unit */
>> -
>> - /* scoreboard */
>> - OUT_BATCH(0);
>> - OUT_BATCH(0);
>> - OUT_BATCH(0);
>> -}
>> -
>> -static void
>> -gen7_emit_curbe_load(struct intel_batchbuffer *batch, uint32_t
>> curbe_buffer)
>> -{
>> - OUT_BATCH(GEN7_MEDIA_CURBE_LOAD | (4 - 2));
>> - OUT_BATCH(0);
>> - /* curbe total data length */
>> - OUT_BATCH(64);
>> - /* curbe data start address, is relative to the dynamics base
>> address */
>> - OUT_BATCH(curbe_buffer);
>> -}
>> -
>> -static void
>> -gen7_emit_interface_descriptor_load(struct intel_batchbuffer *batch,
>> uint32_t interface_descriptor)
>> -{
>> - OUT_BATCH(GEN7_MEDIA_INTERFACE_DESCRIPTOR_LOAD | (4 - 2));
>> - OUT_BATCH(0);
>> - /* interface descriptor data length */
>> - OUT_BATCH(sizeof(struct gen7_interface_descriptor_data));
>> - /* interface descriptor address, is relative to the dynamics base
>> address */
>> - OUT_BATCH(interface_descriptor);
>> -}
>> -
>> -static void
>> -gen7_emit_media_objects(struct intel_batchbuffer *batch,
>> - unsigned x, unsigned y,
>> - unsigned width, unsigned height)
>> -{
>> - int i, j;
>> -
>> - for (i = 0; i < width / 16; i++) {
>> - for (j = 0; j < height / 16; j++) {
>> - OUT_BATCH(GEN7_MEDIA_OBJECT | (8 - 2));
>> -
>> - /* interface descriptor offset */
>> - OUT_BATCH(0);
>> -
>> - /* without indirect data */
>> - OUT_BATCH(0);
>> - OUT_BATCH(0);
>> -
>> - /* scoreboard */
>> - OUT_BATCH(0);
>> - OUT_BATCH(0);
>> -
>> - /* inline data (xoffset, yoffset) */
>> - OUT_BATCH(x + i * 16);
>> - OUT_BATCH(y + j * 16);
>> - }
>> - }
>> -}
>> -
>> /*
>> * This sets up the media pipeline,
>> *
>> diff --git a/lib/media_fill_gen8.c b/lib/media_fill_gen8.c
>> index 4a8fe5a2..88c7dbdd 100644
>> --- a/lib/media_fill_gen8.c
>> +++ b/lib/media_fill_gen8.c
>> @@ -5,7 +5,7 @@
>> #include "gen8_media.h"
>> #include "intel_reg.h"
>> #include "drmtest.h"
>> -
>> +#include "gpu_fill.h"
>> #include <assert.h>
>> @@ -23,294 +23,6 @@ static const uint32_t media_kernel[][4] = {
>> { 0x07800031, 0x20000a40, 0x0e000e00, 0x82000010 },
>> };
>> -static uint32_t
>> -batch_used(struct intel_batchbuffer *batch)
>> -{
>> - return batch->ptr - batch->buffer;
>> -}
>> -
>> -static uint32_t
>> -batch_align(struct intel_batchbuffer *batch, uint32_t align)
>> -{
>> - uint32_t offset = batch_used(batch);
>> - offset = ALIGN(offset, align);
>> - batch->ptr = batch->buffer + offset;
>> - return offset;
>> -}
>> -
>> -static void *
>> -batch_alloc(struct intel_batchbuffer *batch, uint32_t size, uint32_t
>> align)
>> -{
>> - uint32_t offset = batch_align(batch, align);
>> - batch->ptr += size;
>> - return memset(batch->buffer + offset, 0, size);
>> -}
>> -
>> -static uint32_t
>> -batch_offset(struct intel_batchbuffer *batch, void *ptr)
>> -{
>> - return (uint8_t *)ptr - batch->buffer;
>> -}
>> -
>> -static uint32_t
>> -batch_copy(struct intel_batchbuffer *batch, const void *ptr, uint32_t
>> size, uint32_t align)
>> -{
>> - return batch_offset(batch, memcpy(batch_alloc(batch, size,
>> align), ptr, size));
>> -}
>> -
>> -static void
>> -gen8_render_flush(struct intel_batchbuffer *batch, uint32_t batch_end)
>> -{
>> - int ret;
>> -
>> - ret = drm_intel_bo_subdata(batch->bo, 0, 4096, batch->buffer);
>> - if (ret == 0)
>> - ret = drm_intel_bo_mrb_exec(batch->bo, batch_end,
>> - NULL, 0, 0, 0);
>> - igt_assert(ret == 0);
>> -}
>> -
>> -static uint32_t
>> -gen8_fill_curbe_buffer_data(struct intel_batchbuffer *batch,
>> - uint8_t color)
>> -{
>> - uint8_t *curbe_buffer;
>> - uint32_t offset;
>> -
>> - curbe_buffer = batch_alloc(batch, sizeof(uint32_t) * 8, 64);
>> - offset = batch_offset(batch, curbe_buffer);
>> - *curbe_buffer = color;
>> -
>> - return offset;
>> -}
>> -
>> -static uint32_t
>> -gen8_fill_surface_state(struct intel_batchbuffer *batch,
>> - struct igt_buf *buf,
>> - uint32_t format,
>> - int is_dst)
>> -{
>> - struct gen8_surface_state *ss;
>> - uint32_t write_domain, read_domain, offset;
>> - int ret;
>> -
>> - if (is_dst) {
>> - write_domain = read_domain = I915_GEM_DOMAIN_RENDER;
>> - } else {
>> - write_domain = 0;
>> - read_domain = I915_GEM_DOMAIN_SAMPLER;
>> - }
>> -
>> - ss = batch_alloc(batch, sizeof(*ss), 64);
>> - offset = batch_offset(batch, ss);
>> -
>> - ss->ss0.surface_type = GEN8_SURFACE_2D;
>> - ss->ss0.surface_format = format;
>> - ss->ss0.render_cache_read_write = 1;
>> - ss->ss0.vertical_alignment = 1; /* align 4 */
>> - ss->ss0.horizontal_alignment = 1; /* align 4 */
>> -
>> - if (buf->tiling == I915_TILING_X)
>> - ss->ss0.tiled_mode = 2;
>> - else if (buf->tiling == I915_TILING_Y)
>> - ss->ss0.tiled_mode = 3;
>> -
>> - ss->ss8.base_addr = buf->bo->offset;
>> -
>> - ret = drm_intel_bo_emit_reloc(batch->bo,
>> - batch_offset(batch, ss) + 8 * 4,
>> - buf->bo, 0,
>> - read_domain, write_domain);
>> - igt_assert(ret == 0);
>> -
>> - ss->ss2.height = igt_buf_height(buf) - 1;
>> - ss->ss2.width = igt_buf_width(buf) - 1;
>> - ss->ss3.pitch = buf->stride - 1;
>> -
>> - ss->ss7.shader_chanel_select_r = 4;
>> - ss->ss7.shader_chanel_select_g = 5;
>> - ss->ss7.shader_chanel_select_b = 6;
>> - ss->ss7.shader_chanel_select_a = 7;
>> -
>> - return offset;
>> -}
>> -
>> -static uint32_t
>> -gen8_fill_binding_table(struct intel_batchbuffer *batch,
>> - struct igt_buf *dst)
>> -{
>> - uint32_t *binding_table, offset;
>> -
>> - binding_table = batch_alloc(batch, 32, 64);
>> - offset = batch_offset(batch, binding_table);
>> -
>> - binding_table[0] = gen8_fill_surface_state(batch, dst,
>> GEN8_SURFACEFORMAT_R8_UNORM, 1);
>> -
>> - return offset;
>> -}
>> -
>> -static uint32_t
>> -gen8_fill_media_kernel(struct intel_batchbuffer *batch,
>> - const uint32_t kernel[][4],
>> - size_t size)
>> -{
>> - uint32_t offset;
>> -
>> - offset = batch_copy(batch, kernel, size, 64);
>> -
>> - return offset;
>> -}
>> -
>> -static uint32_t
>> -gen8_fill_interface_descriptor(struct intel_batchbuffer *batch,
>> struct igt_buf *dst)
>> -{
>> - struct gen8_interface_descriptor_data *idd;
>> - uint32_t offset;
>> - uint32_t binding_table_offset, kernel_offset;
>> -
>> - binding_table_offset = gen8_fill_binding_table(batch, dst);
>> - kernel_offset = gen8_fill_media_kernel(batch, media_kernel,
>> sizeof(media_kernel));
>> -
>> - idd = batch_alloc(batch, sizeof(*idd), 64);
>> - offset = batch_offset(batch, idd);
>> -
>> - idd->desc0.kernel_start_pointer = (kernel_offset >> 6);
>> -
>> - idd->desc2.single_program_flow = 1;
>> - idd->desc2.floating_point_mode = GEN8_FLOATING_POINT_IEEE_754;
>> -
>> - idd->desc3.sampler_count = 0; /* 0 samplers used */
>> - idd->desc3.sampler_state_pointer = 0;
>> -
>> - idd->desc4.binding_table_entry_count = 0;
>> - idd->desc4.binding_table_pointer = (binding_table_offset >> 5);
>> -
>> - idd->desc5.constant_urb_entry_read_offset = 0;
>> - idd->desc5.constant_urb_entry_read_length = 1; /* grf 1 */
>> -
>> - return offset;
>> -}
>> -
>> -static void
>> -gen8_emit_state_base_address(struct intel_batchbuffer *batch)
>> -{
>> - OUT_BATCH(GEN8_STATE_BASE_ADDRESS | (16 - 2));
>> -
>> - /* general */
>> - OUT_BATCH(0 | BASE_ADDRESS_MODIFY);
>> - OUT_BATCH(0);
>> -
>> - /* stateless data port */
>> - OUT_BATCH(0 | BASE_ADDRESS_MODIFY);
>> -
>> - /* surface */
>> - OUT_RELOC(batch->bo, I915_GEM_DOMAIN_SAMPLER, 0, BASE_ADDRESS_MODIFY);
>> -
>> - /* dynamic */
>> - OUT_RELOC(batch->bo, I915_GEM_DOMAIN_RENDER | I915_GEM_DOMAIN_INSTRUCTION,
>> - 0, BASE_ADDRESS_MODIFY);
>> -
>> - /* indirect */
>> - OUT_BATCH(0);
>> - OUT_BATCH(0);
>> -
>> - /* instruction */
>> - OUT_RELOC(batch->bo, I915_GEM_DOMAIN_INSTRUCTION, 0, BASE_ADDRESS_MODIFY);
>> -
>> - /* general state buffer size */
>> - OUT_BATCH(0xfffff000 | 1);
>> - /* dynamic state buffer size */
>> - OUT_BATCH(1 << 12 | 1);
>> - /* indirect object buffer size */
>> - OUT_BATCH(0xfffff000 | 1);
>> - /* intruction buffer size, must set modify enable bit, otherwise it may result in GPU hang */
>> - OUT_BATCH(1 << 12 | 1);
>> -}
>> -
>> -static void
>> -gen8_emit_vfe_state(struct intel_batchbuffer *batch)
>> -{
>> - OUT_BATCH(GEN8_MEDIA_VFE_STATE | (9 - 2));
>> -
>> - /* scratch buffer */
>> - OUT_BATCH(0);
>> - OUT_BATCH(0);
>> -
>> - /* number of threads & urb entries */
>> - OUT_BATCH(1 << 16 |
>> - 2 << 8);
>> -
>> - OUT_BATCH(0);
>> -
>> - /* urb entry size & curbe size */
>> - OUT_BATCH(2 << 16 |
>> - 2);
>> -
>> - /* scoreboard */
>> - OUT_BATCH(0);
>> - OUT_BATCH(0);
>> - OUT_BATCH(0);
>> -}
>> -
>> -static void
>> -gen8_emit_curbe_load(struct intel_batchbuffer *batch, uint32_t curbe_buffer)
>> -{
>> - OUT_BATCH(GEN8_MEDIA_CURBE_LOAD | (4 - 2));
>> - OUT_BATCH(0);
>> - /* curbe total data length */
>> - OUT_BATCH(64);
>> - /* curbe data start address, is relative to the dynamics base address */
>> - OUT_BATCH(curbe_buffer);
>> -}
>> -
>> -static void
>> -gen8_emit_interface_descriptor_load(struct intel_batchbuffer *batch, uint32_t interface_descriptor)
>> -{
>> - OUT_BATCH(GEN8_MEDIA_INTERFACE_DESCRIPTOR_LOAD | (4 - 2));
>> - OUT_BATCH(0);
>> - /* interface descriptor data length */
>> - OUT_BATCH(sizeof(struct gen8_interface_descriptor_data));
>> - /* interface descriptor address, is relative to the dynamics base address */
>> - OUT_BATCH(interface_descriptor);
>> -}
>> -
>> -static void
>> -gen8_emit_media_state_flush(struct intel_batchbuffer *batch)
>> -{
>> - OUT_BATCH(GEN8_MEDIA_STATE_FLUSH | (2 - 2));
>> - OUT_BATCH(0);
>> -}
>> -
>> -static void
>> -gen8_emit_media_objects(struct intel_batchbuffer *batch,
>> - unsigned x, unsigned y,
>> - unsigned width, unsigned height)
>> -{
>> - int i, j;
>> -
>> - for (i = 0; i < width / 16; i++) {
>> - for (j = 0; j < height / 16; j++) {
>> - OUT_BATCH(GEN8_MEDIA_OBJECT | (8 - 2));
>> -
>> - /* interface descriptor offset */
>> - OUT_BATCH(0);
>> -
>> - /* without indirect data */
>> - OUT_BATCH(0);
>> - OUT_BATCH(0);
>> -
>> - /* scoreboard */
>> - OUT_BATCH(0);
>> - OUT_BATCH(0);
>> -
>> - /* inline data (xoffset, yoffset) */
>> - OUT_BATCH(x + i * 16);
>> - OUT_BATCH(y + j * 16);
>> - gen8_emit_media_state_flush(batch);
>> - }
>> - }
>> -}
>> -
>> /*
>> * This sets up the media pipeline,
>> *
>> @@ -348,8 +60,9 @@ gen8_media_fillfunc(struct intel_batchbuffer *batch,
>> /* setup states */
>> batch->ptr = &batch->buffer[BATCH_STATE_SPLIT];
>> - curbe_buffer = gen8_fill_curbe_buffer_data(batch, color);
>> - interface_descriptor = gen8_fill_interface_descriptor(batch, dst);
>> + curbe_buffer = gen7_fill_curbe_buffer_data(batch, color);
>> + interface_descriptor = gen8_fill_interface_descriptor(batch, dst,
>> + media_kernel, sizeof(media_kernel));
>> igt_assert(batch->ptr < &batch->buffer[4095]);
>> /* media pipeline */
>> @@ -370,6 +83,6 @@ gen8_media_fillfunc(struct intel_batchbuffer *batch,
>> batch_end = batch_align(batch, 8);
>> igt_assert(batch_end < BATCH_STATE_SPLIT);
>> - gen8_render_flush(batch, batch_end);
>> + gen7_render_flush(batch, batch_end);
>> intel_batchbuffer_reset(batch);
>> }
>> diff --git a/lib/media_fill_gen8lp.c b/lib/media_fill_gen8lp.c
>> index 1f8a4adc..b30d96a3 100644
>> --- a/lib/media_fill_gen8lp.c
>> +++ b/lib/media_fill_gen8lp.c
>> @@ -5,7 +5,7 @@
>> #include "gen8_media.h"
>> #include "intel_reg.h"
>> #include "drmtest.h"
>> -
>> +#include "gpu_fill.h"
>> #include <assert.h>
>> @@ -23,286 +23,6 @@ static const uint32_t media_kernel[][4] = {
>> { 0x07800031, 0x20000a40, 0x0e000e00, 0x82000010 },
>> };
>> -static uint32_t
>> -batch_used(struct intel_batchbuffer *batch)
>> -{
>> - return batch->ptr - batch->buffer;
>> -}
>> -
>> -static uint32_t
>> -batch_align(struct intel_batchbuffer *batch, uint32_t align)
>> -{
>> - uint32_t offset = batch_used(batch);
>> - offset = ALIGN(offset, align);
>> - batch->ptr = batch->buffer + offset;
>> - return offset;
>> -}
>> -
>> -static void *
>> -batch_alloc(struct intel_batchbuffer *batch, uint32_t size, uint32_t align)
>> -{
>> - uint32_t offset = batch_align(batch, align);
>> - batch->ptr += size;
>> - return memset(batch->buffer + offset, 0, size);
>> -}
>> -
>> -static uint32_t
>> -batch_offset(struct intel_batchbuffer *batch, void *ptr)
>> -{
>> - return (uint8_t *)ptr - batch->buffer;
>> -}
>> -
>> -static uint32_t
>> -batch_copy(struct intel_batchbuffer *batch, const void *ptr, uint32_t size, uint32_t align)
>> -{
>> - return batch_offset(batch, memcpy(batch_alloc(batch, size, align), ptr, size));
>> -}
>> -
>> -static void
>> -gen8_render_flush(struct intel_batchbuffer *batch, uint32_t batch_end)
>> -{
>> - int ret;
>> -
>> - ret = drm_intel_bo_subdata(batch->bo, 0, 4096, batch->buffer);
>> - if (ret == 0)
>> - ret = drm_intel_bo_mrb_exec(batch->bo, batch_end,
>> - NULL, 0, 0, 0);
>> - igt_assert(ret == 0);
>> -}
>> -
>> -static uint32_t
>> -gen8_fill_curbe_buffer_data(struct intel_batchbuffer *batch,
>> - uint8_t color)
>> -{
>> - uint8_t *curbe_buffer;
>> - uint32_t offset;
>> -
>> - curbe_buffer = batch_alloc(batch, sizeof(uint32_t) * 8, 64);
>> - offset = batch_offset(batch, curbe_buffer);
>> - *curbe_buffer = color;
>> -
>> - return offset;
>> -}
>> -
>> -static uint32_t
>> -gen8_fill_surface_state(struct intel_batchbuffer *batch,
>> - struct igt_buf *buf,
>> - uint32_t format,
>> - int is_dst)
>> -{
>> - struct gen8_surface_state *ss;
>> - uint32_t write_domain, read_domain, offset;
>> - int ret;
>> -
>> - if (is_dst) {
>> - write_domain = read_domain = I915_GEM_DOMAIN_RENDER;
>> - } else {
>> - write_domain = 0;
>> - read_domain = I915_GEM_DOMAIN_SAMPLER;
>> - }
>> -
>> - ss = batch_alloc(batch, sizeof(*ss), 64);
>> - offset = batch_offset(batch, ss);
>> -
>> - ss->ss0.surface_type = GEN8_SURFACE_2D;
>> - ss->ss0.surface_format = format;
>> - ss->ss0.render_cache_read_write = 1;
>> - ss->ss0.vertical_alignment = 1; /* align 4 */
>> - ss->ss0.horizontal_alignment = 1; /* align 4 */
>> -
>> - if (buf->tiling == I915_TILING_X)
>> - ss->ss0.tiled_mode = 2;
>> - else if (buf->tiling == I915_TILING_Y)
>> - ss->ss0.tiled_mode = 3;
>> -
>> - ss->ss8.base_addr = buf->bo->offset;
>> -
>> - ret = drm_intel_bo_emit_reloc(batch->bo,
>> - batch_offset(batch, ss) + 8 * 4,
>> - buf->bo, 0,
>> - read_domain, write_domain);
>> - igt_assert(ret == 0);
>> -
>> - ss->ss2.height = igt_buf_height(buf) - 1;
>> - ss->ss2.width = igt_buf_width(buf) - 1;
>> - ss->ss3.pitch = buf->stride - 1;
>> -
>> - ss->ss7.shader_chanel_select_r = 4;
>> - ss->ss7.shader_chanel_select_g = 5;
>> - ss->ss7.shader_chanel_select_b = 6;
>> - ss->ss7.shader_chanel_select_a = 7;
>> -
>> - return offset;
>> -}
>> -
>> -static uint32_t
>> -gen8_fill_binding_table(struct intel_batchbuffer *batch,
>> - struct igt_buf *dst)
>> -{
>> - uint32_t *binding_table, offset;
>> -
>> - binding_table = batch_alloc(batch, 32, 64);
>> - offset = batch_offset(batch, binding_table);
>> -
>> - binding_table[0] = gen8_fill_surface_state(batch, dst, GEN8_SURFACEFORMAT_R8_UNORM, 1);
>> -
>> - return offset;
>> -}
>> -
>> -static uint32_t
>> -gen8_fill_media_kernel(struct intel_batchbuffer *batch,
>> - const uint32_t kernel[][4],
>> - size_t size)
>> -{
>> - uint32_t offset;
>> -
>> - offset = batch_copy(batch, kernel, size, 64);
>> -
>> - return offset;
>> -}
>> -
>> -static uint32_t
>> -gen8_fill_interface_descriptor(struct intel_batchbuffer *batch, struct igt_buf *dst)
>> -{
>> - struct gen8_interface_descriptor_data *idd;
>> - uint32_t offset;
>> - uint32_t binding_table_offset, kernel_offset;
>> -
>> - binding_table_offset = gen8_fill_binding_table(batch, dst);
>> - kernel_offset = gen8_fill_media_kernel(batch, media_kernel, sizeof(media_kernel));
>> -
>> - idd = batch_alloc(batch, sizeof(*idd), 64);
>> - offset = batch_offset(batch, idd);
>> -
>> - idd->desc0.kernel_start_pointer = (kernel_offset >> 6);
>> -
>> - idd->desc2.single_program_flow = 1;
>> - idd->desc2.floating_point_mode = GEN8_FLOATING_POINT_IEEE_754;
>> -
>> - idd->desc3.sampler_count = 0; /* 0 samplers used */
>> - idd->desc3.sampler_state_pointer = 0;
>> -
>> - idd->desc4.binding_table_entry_count = 0;
>> - idd->desc4.binding_table_pointer = (binding_table_offset >> 5);
>> -
>> - idd->desc5.constant_urb_entry_read_offset = 0;
>> - idd->desc5.constant_urb_entry_read_length = 1; /* grf 1 */
>> -
>> - return offset;
>> -}
>> -
>> -static void
>> -gen8_emit_state_base_address(struct intel_batchbuffer *batch)
>> -{
>> - OUT_BATCH(GEN8_STATE_BASE_ADDRESS | (16 - 2));
>> -
>> - /* general */
>> - OUT_BATCH(0 | BASE_ADDRESS_MODIFY);
>> - OUT_BATCH(0);
>> -
>> - /* stateless data port */
>> - OUT_BATCH(0 | BASE_ADDRESS_MODIFY);
>> -
>> - /* surface */
>> - OUT_RELOC(batch->bo, I915_GEM_DOMAIN_SAMPLER, 0, BASE_ADDRESS_MODIFY);
>> -
>> - /* dynamic */
>> - OUT_RELOC(batch->bo, I915_GEM_DOMAIN_RENDER | I915_GEM_DOMAIN_INSTRUCTION,
>> - 0, BASE_ADDRESS_MODIFY);
>> -
>> - /* indirect */
>> - OUT_BATCH(0);
>> - OUT_BATCH(0);
>> -
>> - /* instruction */
>> - OUT_RELOC(batch->bo, I915_GEM_DOMAIN_INSTRUCTION, 0, BASE_ADDRESS_MODIFY);
>> -
>> - /* general state buffer size */
>> - OUT_BATCH(0xfffff000 | 1);
>> - /* dynamic state buffer size */
>> - OUT_BATCH(1 << 12 | 1);
>> - /* indirect object buffer size */
>> - OUT_BATCH(0xfffff000 | 1);
>> - /* intruction buffer size, must set modify enable bit, otherwise it may result in GPU hang */
>> - OUT_BATCH(1 << 12 | 1);
>> -}
>> -
>> -static void
>> -gen8_emit_vfe_state(struct intel_batchbuffer *batch)
>> -{
>> - OUT_BATCH(GEN8_MEDIA_VFE_STATE | (9 - 2));
>> -
>> - /* scratch buffer */
>> - OUT_BATCH(0);
>> - OUT_BATCH(0);
>> -
>> - /* number of threads & urb entries */
>> - OUT_BATCH(1 << 16 |
>> - 2 << 8);
>> -
>> - OUT_BATCH(0);
>> -
>> - /* urb entry size & curbe size */
>> - OUT_BATCH(2 << 16 |
>> - 2);
>> -
>> - /* scoreboard */
>> - OUT_BATCH(0);
>> - OUT_BATCH(0);
>> - OUT_BATCH(0);
>> -}
>> -
>> -static void
>> -gen8_emit_curbe_load(struct intel_batchbuffer *batch, uint32_t curbe_buffer)
>> -{
>> - OUT_BATCH(GEN8_MEDIA_CURBE_LOAD | (4 - 2));
>> - OUT_BATCH(0);
>> - /* curbe total data length */
>> - OUT_BATCH(64);
>> - /* curbe data start address, is relative to the dynamics base address */
>> - OUT_BATCH(curbe_buffer);
>> -}
>> -
>> -static void
>> -gen8_emit_interface_descriptor_load(struct intel_batchbuffer *batch, uint32_t interface_descriptor)
>> -{
>> - OUT_BATCH(GEN8_MEDIA_INTERFACE_DESCRIPTOR_LOAD | (4 - 2));
>> - OUT_BATCH(0);
>> - /* interface descriptor data length */
>> - OUT_BATCH(sizeof(struct gen8_interface_descriptor_data));
>> - /* interface descriptor address, is relative to the dynamics base address */
>> - OUT_BATCH(interface_descriptor);
>> -}
>> -
>> -static void
>> -gen8lp_emit_media_objects(struct intel_batchbuffer *batch,
>> - unsigned x, unsigned y,
>> - unsigned width, unsigned height)
>> -{
>> - int i, j;
>> -
>> - for (i = 0; i < width / 16; i++) {
>> - for (j = 0; j < height / 16; j++) {
>> - OUT_BATCH(GEN8_MEDIA_OBJECT | (8 - 2));
>> -
>> - /* interface descriptor offset */
>> - OUT_BATCH(0);
>> -
>> - /* without indirect data */
>> - OUT_BATCH(0);
>> - OUT_BATCH(0);
>> -
>> - /* scoreboard */
>> - OUT_BATCH(0);
>> - OUT_BATCH(0);
>> -
>> - /* inline data (xoffset, yoffset) */
>> - OUT_BATCH(x + i * 16);
>> - OUT_BATCH(y + j * 16);
>> - }
>> - }
>> -}
>> -
>> /*
>> * This sets up the media pipeline,
>> *
>> @@ -340,8 +60,9 @@ gen8lp_media_fillfunc(struct intel_batchbuffer *batch,
>> /* setup states */
>> batch->ptr = &batch->buffer[BATCH_STATE_SPLIT];
>> - curbe_buffer = gen8_fill_curbe_buffer_data(batch, color);
>> - interface_descriptor = gen8_fill_interface_descriptor(batch, dst);
>> + curbe_buffer = gen7_fill_curbe_buffer_data(batch, color);
>> + interface_descriptor = gen8_fill_interface_descriptor(batch, dst,
>> + media_kernel, sizeof(media_kernel));
>> igt_assert(batch->ptr < &batch->buffer[4095]);
>> /* media pipeline */
>> @@ -362,6 +83,6 @@ gen8lp_media_fillfunc(struct intel_batchbuffer *batch,
>> batch_end = batch_align(batch, 8);
>> igt_assert(batch_end < BATCH_STATE_SPLIT);
>> - gen8_render_flush(batch, batch_end);
>> + gen7_render_flush(batch, batch_end);
>> intel_batchbuffer_reset(batch);
>> }
>> diff --git a/lib/media_fill_gen9.c b/lib/media_fill_gen9.c
>> index 3fd21819..e5d94487 100644
>> --- a/lib/media_fill_gen9.c
>> +++ b/lib/media_fill_gen9.c
>> @@ -4,11 +4,9 @@
>> #include "media_fill.h"
>> #include "gen8_media.h"
>> #include "intel_reg.h"
>> -
>> +#include "gpu_fill.h"
>> #include <assert.h>
>> -#define ALIGN(x, y) (((x) + (y)-1) & ~((y)-1))
>> -
>> static const uint32_t media_kernel[][4] = {
>> { 0x00400001, 0x20202288, 0x00000020, 0x00000000 },
>> { 0x00600001, 0x20800208, 0x008d0000, 0x00000000 },
>> @@ -23,299 +21,6 @@ static const uint32_t media_kernel[][4] = {
>> { 0x07800031, 0x20000a40, 0x0e000e00, 0x82000010 },
>> };
>> -static uint32_t
>> -batch_used(struct intel_batchbuffer *batch)
>> -{
>> - return batch->ptr - batch->buffer;
>> -}
>> -
>> -static uint32_t
>> -batch_align(struct intel_batchbuffer *batch, uint32_t align)
>> -{
>> - uint32_t offset = batch_used(batch);
>> - offset = ALIGN(offset, align);
>> - batch->ptr = batch->buffer + offset;
>> - return offset;
>> -}
>> -
>> -static void *
>> -batch_alloc(struct intel_batchbuffer *batch, uint32_t size, uint32_t align)
>> -{
>> - uint32_t offset = batch_align(batch, align);
>> - batch->ptr += size;
>> - return memset(batch->buffer + offset, 0, size);
>> -}
>> -
>> -static uint32_t
>> -batch_offset(struct intel_batchbuffer *batch, void *ptr)
>> -{
>> - return (uint8_t *)ptr - batch->buffer;
>> -}
>> -
>> -static uint32_t
>> -batch_copy(struct intel_batchbuffer *batch, const void *ptr, uint32_t size, uint32_t align)
>> -{
>> - return batch_offset(batch, memcpy(batch_alloc(batch, size, align), ptr, size));
>> -}
>> -
>> -static void
>> -gen8_render_flush(struct intel_batchbuffer *batch, uint32_t batch_end)
>> -{
>> - int ret;
>> -
>> - ret = drm_intel_bo_subdata(batch->bo, 0, 4096, batch->buffer);
>> - if (ret == 0)
>> - ret = drm_intel_bo_mrb_exec(batch->bo, batch_end,
>> - NULL, 0, 0, 0);
>> - assert(ret == 0);
>> -}
>> -
>> -static uint32_t
>> -gen8_fill_curbe_buffer_data(struct intel_batchbuffer *batch,
>> - uint8_t color)
>> -{
>> - uint8_t *curbe_buffer;
>> - uint32_t offset;
>> -
>> - curbe_buffer = batch_alloc(batch, sizeof(uint32_t) * 8, 64);
>> - offset = batch_offset(batch, curbe_buffer);
>> - *curbe_buffer = color;
>> -
>> - return offset;
>> -}
>> -
>> -static uint32_t
>> -gen8_fill_surface_state(struct intel_batchbuffer *batch,
>> - struct igt_buf *buf,
>> - uint32_t format,
>> - int is_dst)
>> -{
>> - struct gen8_surface_state *ss;
>> - uint32_t write_domain, read_domain, offset;
>> - int ret;
>> -
>> - if (is_dst) {
>> - write_domain = read_domain = I915_GEM_DOMAIN_RENDER;
>> - } else {
>> - write_domain = 0;
>> - read_domain = I915_GEM_DOMAIN_SAMPLER;
>> - }
>> -
>> - ss = batch_alloc(batch, sizeof(*ss), 64);
>> - offset = batch_offset(batch, ss);
>> -
>> - ss->ss0.surface_type = GEN8_SURFACE_2D;
>> - ss->ss0.surface_format = format;
>> - ss->ss0.render_cache_read_write = 1;
>> - ss->ss0.vertical_alignment = 1; /* align 4 */
>> - ss->ss0.horizontal_alignment = 1; /* align 4 */
>> -
>> - if (buf->tiling == I915_TILING_X)
>> - ss->ss0.tiled_mode = 2;
>> - else if (buf->tiling == I915_TILING_Y)
>> - ss->ss0.tiled_mode = 3;
>> -
>> - ss->ss8.base_addr = buf->bo->offset;
>> -
>> - ret = drm_intel_bo_emit_reloc(batch->bo,
>> - batch_offset(batch, ss) + 8 * 4,
>> - buf->bo, 0,
>> - read_domain, write_domain);
>> - assert(ret == 0);
>> -
>> - ss->ss2.height = igt_buf_height(buf) - 1;
>> - ss->ss2.width = igt_buf_width(buf) - 1;
>> - ss->ss3.pitch = buf->stride - 1;
>> -
>> - ss->ss7.shader_chanel_select_r = 4;
>> - ss->ss7.shader_chanel_select_g = 5;
>> - ss->ss7.shader_chanel_select_b = 6;
>> - ss->ss7.shader_chanel_select_a = 7;
>> -
>> - return offset;
>> -}
>> -
>> -static uint32_t
>> -gen8_fill_binding_table(struct intel_batchbuffer *batch,
>> - struct igt_buf *dst)
>> -{
>> - uint32_t *binding_table, offset;
>> -
>> - binding_table = batch_alloc(batch, 32, 64);
>> - offset = batch_offset(batch, binding_table);
>> -
>> - binding_table[0] = gen8_fill_surface_state(batch, dst, GEN8_SURFACEFORMAT_R8_UNORM, 1);
>> -
>> - return offset;
>> -}
>> -
>> -static uint32_t
>> -gen8_fill_media_kernel(struct intel_batchbuffer *batch,
>> - const uint32_t kernel[][4],
>> - size_t size)
>> -{
>> - uint32_t offset;
>> -
>> - offset = batch_copy(batch, kernel, size, 64);
>> -
>> - return offset;
>> -}
>> -
>> -static uint32_t
>> -gen8_fill_interface_descriptor(struct intel_batchbuffer *batch, struct igt_buf *dst)
>> -{
>> - struct gen8_interface_descriptor_data *idd;
>> - uint32_t offset;
>> - uint32_t binding_table_offset, kernel_offset;
>> -
>> - binding_table_offset = gen8_fill_binding_table(batch, dst);
>> - kernel_offset = gen8_fill_media_kernel(batch, media_kernel, sizeof(media_kernel));
>> -
>> - idd = batch_alloc(batch, sizeof(*idd), 64);
>> - offset = batch_offset(batch, idd);
>> -
>> - idd->desc0.kernel_start_pointer = (kernel_offset >> 6);
>> -
>> - idd->desc2.single_program_flow = 1;
>> - idd->desc2.floating_point_mode = GEN8_FLOATING_POINT_IEEE_754;
>> -
>> - idd->desc3.sampler_count = 0; /* 0 samplers used */
>> - idd->desc3.sampler_state_pointer = 0;
>> -
>> - idd->desc4.binding_table_entry_count = 0;
>> - idd->desc4.binding_table_pointer = (binding_table_offset >> 5);
>> -
>> - idd->desc5.constant_urb_entry_read_offset = 0;
>> - idd->desc5.constant_urb_entry_read_length = 1; /* grf 1 */
>> -
>> - return offset;
>> -}
>> -
>> -static void
>> -gen9_emit_state_base_address(struct intel_batchbuffer *batch)
>> -{
>> - OUT_BATCH(GEN8_STATE_BASE_ADDRESS | (19 - 2));
>> -
>> - /* general */
>> - OUT_BATCH(0 | BASE_ADDRESS_MODIFY);
>> - OUT_BATCH(0);
>> -
>> - /* stateless data port */
>> - OUT_BATCH(0 | BASE_ADDRESS_MODIFY);
>> -
>> - /* surface */
>> - OUT_RELOC(batch->bo, I915_GEM_DOMAIN_SAMPLER, 0, BASE_ADDRESS_MODIFY);
>> -
>> - /* dynamic */
>> - OUT_RELOC(batch->bo, I915_GEM_DOMAIN_RENDER | I915_GEM_DOMAIN_INSTRUCTION,
>> - 0, BASE_ADDRESS_MODIFY);
>> -
>> - /* indirect */
>> - OUT_BATCH(0);
>> - OUT_BATCH(0);
>> -
>> - /* instruction */
>> - OUT_RELOC(batch->bo, I915_GEM_DOMAIN_INSTRUCTION, 0, BASE_ADDRESS_MODIFY);
>> -
>> - /* general state buffer size */
>> - OUT_BATCH(0xfffff000 | 1);
>> - /* dynamic state buffer size */
>> - OUT_BATCH(1 << 12 | 1);
>> - /* indirect object buffer size */
>> - OUT_BATCH(0xfffff000 | 1);
>> - /* intruction buffer size, must set modify enable bit, otherwise it may result in GPU hang */
>> - OUT_BATCH(1 << 12 | 1);
>> -
>> - /* Bindless surface state base address */
>> - OUT_BATCH(0 | BASE_ADDRESS_MODIFY);
>> - OUT_BATCH(0);
>> - OUT_BATCH(0xfffff000);
>> -}
>> -
>> -static void
>> -gen8_emit_vfe_state(struct intel_batchbuffer *batch)
>> -{
>> - OUT_BATCH(GEN8_MEDIA_VFE_STATE | (9 - 2));
>> -
>> - /* scratch buffer */
>> - OUT_BATCH(0);
>> - OUT_BATCH(0);
>> -
>> - /* number of threads & urb entries */
>> - OUT_BATCH(1 << 16 |
>> - 2 << 8);
>> -
>> - OUT_BATCH(0);
>> -
>> - /* urb entry size & curbe size */
>> - OUT_BATCH(2 << 16 |
>> - 2);
>> -
>> - /* scoreboard */
>> - OUT_BATCH(0);
>> - OUT_BATCH(0);
>> - OUT_BATCH(0);
>> -}
>> -
>> -static void
>> -gen8_emit_curbe_load(struct intel_batchbuffer *batch, uint32_t curbe_buffer)
>> -{
>> - OUT_BATCH(GEN8_MEDIA_CURBE_LOAD | (4 - 2));
>> - OUT_BATCH(0);
>> - /* curbe total data length */
>> - OUT_BATCH(64);
>> - /* curbe data start address, is relative to the dynamics base address */
>> - OUT_BATCH(curbe_buffer);
>> -}
>> -
>> -static void
>> -gen8_emit_interface_descriptor_load(struct intel_batchbuffer *batch, uint32_t interface_descriptor)
>> -{
>> - OUT_BATCH(GEN8_MEDIA_INTERFACE_DESCRIPTOR_LOAD | (4 - 2));
>> - OUT_BATCH(0);
>> - /* interface descriptor data length */
>> - OUT_BATCH(sizeof(struct gen8_interface_descriptor_data));
>> - /* interface descriptor address, is relative to the dynamics base address */
>> - OUT_BATCH(interface_descriptor);
>> -}
>> -
>> -static void
>> -gen8_emit_media_state_flush(struct intel_batchbuffer *batch)
>> -{
>> - OUT_BATCH(GEN8_MEDIA_STATE_FLUSH | (2 - 2));
>> - OUT_BATCH(0);
>> -}
>> -
>> -static void
>> -gen8_emit_media_objects(struct intel_batchbuffer *batch,
>> - unsigned x, unsigned y,
>> - unsigned width, unsigned height)
>> -{
>> - int i, j;
>> -
>> - for (i = 0; i < width / 16; i++) {
>> - for (j = 0; j < height / 16; j++) {
>> - OUT_BATCH(GEN8_MEDIA_OBJECT | (8 - 2));
>> -
>> - /* interface descriptor offset */
>> - OUT_BATCH(0);
>> -
>> - /* without indirect data */
>> - OUT_BATCH(0);
>> - OUT_BATCH(0);
>> -
>> - /* scoreboard */
>> - OUT_BATCH(0);
>> - OUT_BATCH(0);
>> -
>> - /* inline data (xoffset, yoffset) */
>> - OUT_BATCH(x + i * 16);
>> - OUT_BATCH(y + j * 16);
>> - gen8_emit_media_state_flush(batch);
>> - }
>> - }
>> -}
>> -
>> /*
>> * This sets up the media pipeline,
>> *
>> @@ -353,8 +58,9 @@ gen9_media_fillfunc(struct intel_batchbuffer *batch,
>> /* setup states */
>> batch->ptr = &batch->buffer[BATCH_STATE_SPLIT];
>> - curbe_buffer = gen8_fill_curbe_buffer_data(batch, color);
>> - interface_descriptor = gen8_fill_interface_descriptor(batch, dst);
>> + curbe_buffer = gen7_fill_curbe_buffer_data(batch, color);
>> + interface_descriptor = gen8_fill_interface_descriptor(batch, dst,
>> + media_kernel, sizeof(media_kernel));
>> assert(batch->ptr < &batch->buffer[4095]);
>> /* media pipeline */
>> @@ -387,6 +93,6 @@ gen9_media_fillfunc(struct intel_batchbuffer *batch,
>> batch_end = batch_align(batch, 8);
>> assert(batch_end < BATCH_STATE_SPLIT);
>> - gen8_render_flush(batch, batch_end);
>> + gen7_render_flush(batch, batch_end);
>> intel_batchbuffer_reset(batch);
>> }
>> diff --git a/lib/meson.build b/lib/meson.build
>> index b3b8b14a..38c3f107 100644
>> --- a/lib/meson.build
>> +++ b/lib/meson.build
>> @@ -24,6 +24,7 @@ lib_sources = [
>> 'intel_os.c',
>> 'intel_mmio.c',
>> 'ioctl_wrappers.c',
>> + 'gpu_fill.c',
>> 'media_fill_gen7.c',
>> 'media_fill_gen8.c',
>> 'media_fill_gen8lp.c',
>>
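Just to make the end state explicit: judging purely from the call sites
left in the hunks above, the helpers shared through gpu_fill.h should end
up with prototypes roughly like the sketch below. This is inferred from
the callers quoted here, not copied from the new header, so names and
details may differ in the actual patch.

/*
 * Sketch of the shared gpu_fill.h interface, as implied by the gen8/
 * gen8lp/gen9 call sites above (assumed, not taken verbatim from the
 * patch). struct intel_batchbuffer and struct igt_buf are the existing
 * IGT types.
 */
#include <stdint.h>
#include <stddef.h>
#include "intel_batchbuffer.h"

uint32_t gen7_fill_curbe_buffer_data(struct intel_batchbuffer *batch,
                                     uint8_t color);
uint32_t gen8_fill_interface_descriptor(struct intel_batchbuffer *batch,
                                        struct igt_buf *dst,
                                        const uint32_t kernel[][4],
                                        size_t size);
uint32_t batch_align(struct intel_batchbuffer *batch, uint32_t align);
void gen7_render_flush(struct intel_batchbuffer *batch, uint32_t batch_end);

With something like that in place, each media_fill_genX.c seems to keep
only its kernel blob and the top-level fillfunc, which is consistent with
the remaining context in these hunks.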