[Mesa-dev] [PATCH 2/9] intel/tools/aubinator: aubinate ppgtt aubs
Rafael Antognolli
rafael.antognolli at intel.com
Mon Jun 18 22:25:40 UTC 2018
Hi Lionel,
I've been going through the patch and I think the content so far is
mostly fine. However, it has a lot of things going on for a single patch
IMHO. I see changes for the execution list submission port, ppgtt, using
the RB tree for ggtt too, etc...
It definitely makes it more painful to read (at least for me). Are these
changes so related that they couldn't be split up?
Rafael
On Thu, Jun 14, 2018 at 06:11:38PM +0100, Lionel Landwerlin wrote:
> From: Scott D Phillips <scott.d.phillips at intel.com>
>
> v2: by Lionel
> Fix memfd_create compilation issue
> Fix pml4 address stored on 32 instead of 64bits
> Return no buffer if first ppgtt page is not mapped
>
> Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin at intel.com>
> ---
> src/intel/tools/aubinator.c | 460 ++++++++++++++++++++++++++++++++----
> 1 file changed, 410 insertions(+), 50 deletions(-)
>
> diff --git a/src/intel/tools/aubinator.c b/src/intel/tools/aubinator.c
> index 3120e82b22e..99cd010dd9d 100644
> --- a/src/intel/tools/aubinator.c
> +++ b/src/intel/tools/aubinator.c
> @@ -37,12 +37,24 @@
> #include <sys/wait.h>
> #include <sys/mman.h>
>
> +#include "util/list.h"
> #include "util/macros.h"
> +#include "util/rb_tree.h"
>
> #include "common/gen_decoder.h"
> #include "common/gen_disasm.h"
> #include "intel_aub.h"
>
> +#ifndef HAVE_MEMFD_CREATE
> +#include <sys/syscall.h>
> +
> +static inline int
> +memfd_create(const char *name, unsigned int flags)
> +{
> + return syscall(SYS_memfd_create, name, flags);
> +}
> +#endif
> +
> /* Below is the only command missing from intel_aub.h in libdrm
> * So, reuse intel_aub.h from libdrm and #define the
> * AUB_MI_BATCH_BUFFER_END as below
> @@ -70,6 +82,31 @@ struct gen_batch_decode_ctx batch_ctx;
>
> uint64_t gtt_size, gtt_end;
> void *gtt;
> +
> +struct bo_map {
> + struct list_head link;
> + struct gen_batch_decode_bo bo;
> +};
> +
> +struct ggtt_entry {
> + struct rb_node node;
> + uint64_t virt_addr;
> + uint64_t phys_addr;
> +};
> +
> +struct phys_mem {
> + struct rb_node node;
> + uint64_t fd_offset;
> + uint64_t phys_addr;
> + uint8_t *data;
> +};
> +
> +static struct list_head maps;
> +static struct rb_tree ggtt = {NULL};
> +static struct rb_tree mem = {NULL};
> +int mem_fd = -1;
> +off_t mem_fd_len = 0;
> +
> uint64_t general_state_base;
> uint64_t surface_state_base;
> uint64_t dynamic_state_base;
> @@ -99,6 +136,191 @@ valid_offset(uint32_t offset)
> #define GEN_ENGINE_RENDER 1
> #define GEN_ENGINE_BLITTER 2
>
> +static inline struct ggtt_entry *
> +ggtt_entry_next(struct ggtt_entry *entry)
> +{
> + if (!entry)
> + return NULL;
> + struct rb_node *node = rb_node_next(&entry->node);
> + if (!node)
> + return NULL;
> + return rb_node_data(struct ggtt_entry, node, node);
> +}
> +
> +static inline int
> +cmp_uint64(uint64_t a, uint64_t b)
> +{
> + if (a < b)
> + return -1;
> + if (a > b)
> + return 1;
> + return 0;
> +}
> +
> +static inline int
> +cmp_ggtt_entry(const struct rb_node *node, const void *addr)
> +{
> + struct ggtt_entry *entry = rb_node_data(struct ggtt_entry, node, node);
> + return cmp_uint64(entry->virt_addr, *(uint64_t *)addr);
> +}
> +
> +static struct ggtt_entry *
> +ensure_ggtt_entry(struct rb_tree *tree, uint64_t virt_addr)
> +{
> + struct rb_node *node = rb_tree_search_sloppy(&ggtt, &virt_addr,
> + cmp_ggtt_entry);
> + int cmp = 0;
> + if (!node || (cmp = cmp_ggtt_entry(node, &virt_addr))) {
> + struct ggtt_entry *new_entry = calloc(1, sizeof(*new_entry));
> + new_entry->virt_addr = virt_addr;
> + rb_tree_insert_at(&ggtt, node, &new_entry->node, cmp > 0);
> + node = &new_entry->node;
> + }
> +
> + return rb_node_data(struct ggtt_entry, node, node);
> +}
> +
> +static struct ggtt_entry *
> +search_ggtt_entry(uint64_t virt_addr)
> +{
> + virt_addr &= ~0xfff;
> +
> + struct rb_node *node = rb_tree_search(&ggtt, &virt_addr, cmp_ggtt_entry);
> +
> + if (!node)
> + return NULL;
> +
> + return rb_node_data(struct ggtt_entry, node, node);
> +}
> +
> +static inline int
> +cmp_phys_mem(const struct rb_node *node, const void *addr)
> +{
> + struct phys_mem *mem = rb_node_data(struct phys_mem, node, node);
> + return cmp_uint64(mem->phys_addr, *(uint64_t *)addr);
> +}
> +
> +static struct phys_mem *
> +ensure_phys_mem(uint64_t phys_addr)
> +{
> + struct rb_node *node = rb_tree_search_sloppy(&mem, &phys_addr, cmp_phys_mem);
> + int cmp = 0;
> + if (!node || (cmp = cmp_phys_mem(node, &phys_addr))) {
> + struct phys_mem *new_mem = calloc(1, sizeof(*new_mem));
> + new_mem->phys_addr = phys_addr;
> + new_mem->fd_offset = mem_fd_len;
> +
> + int ftruncate_res = ftruncate(mem_fd, mem_fd_len += 4096);
> + assert(ftruncate_res == 0);
> +
> + new_mem->data = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED,
> + mem_fd, new_mem->fd_offset);
> + assert(new_mem->data != MAP_FAILED);
> +
> + rb_tree_insert_at(&mem, node, &new_mem->node, cmp > 0);
> + node = &new_mem->node;
> + }
> +
> + return rb_node_data(struct phys_mem, node, node);
> +}
> +
> +static struct phys_mem *
> +search_phys_mem(uint64_t phys_addr)
> +{
> + phys_addr &= ~0xfff;
> +
> + struct rb_node *node = rb_tree_search(&mem, &phys_addr, cmp_phys_mem);
> +
> + if (!node)
> + return NULL;
> +
> + return rb_node_data(struct phys_mem, node, node);
> +}
> +
> +static void
> +handle_ggtt_entry_write(uint64_t address, void *_data, uint32_t _size)
> +{
> + uint64_t virt_addr = (address / sizeof(uint64_t)) << 12;
> + uint64_t *data = _data;
> + size_t size = _size / sizeof(*data);
> + for (uint64_t *entry = data;
> + entry < data + size;
> + entry++, virt_addr += 4096) {
> + struct ggtt_entry *pt = ensure_ggtt_entry(&ggtt, virt_addr);
> + pt->phys_addr = *entry;
> + }
> +}
> +
> +static void
> +handle_physical_write(uint64_t address, void *data, uint32_t size)
> +{
> + uint32_t to_write = size;
> + for (uint64_t page = address & ~0xfff; page < address + size; page += 4096) {
> + struct phys_mem *mem = ensure_phys_mem(page);
> + uint64_t offset = MAX2(page, address) - page;
> + uint32_t size_this_page = MIN2(to_write, 4096 - offset);
> + to_write -= size_this_page;
> + memcpy(mem->data + offset, data, size_this_page);
> + data = (uint8_t *)data + size_this_page;
> + }
> +}
> +
> +static void
> +handle_ggtt_write(uint64_t address, void *data, uint32_t size)
> +{
> + uint32_t to_write = size;
> + for (uint64_t page = address & ~0xfff; page < address + size; page += 4096) {
> + struct ggtt_entry *entry = search_ggtt_entry(page);
> + assert(entry && entry->phys_addr & 0x1);
> +
> + uint64_t offset = MAX2(page, address) - page;
> + uint32_t size_this_page = MIN2(to_write, 4096 - offset);
> + to_write -= size_this_page;
> +
> + handle_physical_write(page + offset, data, size_this_page);
> + data = (uint8_t *)data + size_this_page;
> + }
> +}
> +
> +static struct phys_mem *
> +ppgtt_walk(uint64_t pml4, uint64_t address)
> +{
> + uint64_t shift = 39;
> + uint64_t addr = pml4;
> + for (int level = 4; level > 0; level--) {
> + struct phys_mem *table = search_phys_mem(addr);
> + if (!table)
> + return NULL;
> + int index = (address >> shift) & 0x1ff;
> + uint64_t entry = ((uint64_t *)table->data)[index];
> + if (!(entry & 1))
> + return NULL;
> + addr = entry & ~0xfff;
> + shift -= 9;
> + }
> + return search_phys_mem(addr);
> +}
> +
> +static bool
> +ppgtt_mapped(uint64_t pml4, uint64_t address)
> +{
> + return ppgtt_walk(pml4, address) != NULL;
> +}
> +
> +static struct gen_batch_decode_bo
> +get_gen_batch_bo(void *user_data, uint64_t address)
> +{
> + if (address > gtt_end)
> + return (struct gen_batch_decode_bo) { .map = NULL };
> +
> + /* We really only have one giant address range */
> + return (struct gen_batch_decode_bo) {
> + .addr = 0,
> + .map = gtt,
> + .size = gtt_size
> + };
> +}
> +
> static void
> handle_trace_block(uint32_t *p)
> {
> @@ -140,6 +362,7 @@ handle_trace_block(uint32_t *p)
> }
>
> (void)engine; /* TODO */
> + batch_ctx.get_bo = get_gen_batch_bo;
> gen_print_batch(&batch_ctx, data, size, 0);
>
> gtt_end = 0;
> @@ -148,17 +371,103 @@ handle_trace_block(uint32_t *p)
> }
>
> static struct gen_batch_decode_bo
> -get_gen_batch_bo(void *user_data, uint64_t address)
> +get_ggtt_batch_bo(void *user_data, uint64_t address)
> {
> - if (address > gtt_end)
> - return (struct gen_batch_decode_bo) { .map = NULL };
> + struct gen_batch_decode_bo bo = {0};
> +
> + address &= ~0xfff;
> +
> + list_for_each_entry(struct bo_map, i, &maps, link)
> + if (i->bo.addr <= address && i->bo.addr + i->bo.size > address)
> + return i->bo;
> +
> + struct ggtt_entry *start =
> + (struct ggtt_entry *)rb_tree_search_sloppy(&ggtt, &address,
> + cmp_ggtt_entry);
> + if (start && start->virt_addr < address)
> + start = ggtt_entry_next(start);
> + if (!start)
> + return bo;
> +
> + struct ggtt_entry *last = start;
> + for (struct ggtt_entry *i = ggtt_entry_next(last);
> + i && last->virt_addr + 4096 == i->virt_addr;
> + last = i, i = ggtt_entry_next(last))
> + ;
> +
> + bo.addr = MIN2(address, start->virt_addr);
> + bo.size = last->virt_addr - bo.addr + 4096;
> + bo.map = mmap(NULL, bo.size, PROT_READ, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
> + assert(bo.map != MAP_FAILED);
> +
> + for (struct ggtt_entry *i = start;
> + i;
> + i = i == last ? NULL : ggtt_entry_next(i)) {
> + uint64_t phys_addr = i->phys_addr & ~0xfff;
> + struct phys_mem *phys_mem = search_phys_mem(phys_addr);
> +
> + if (!phys_mem)
> + continue;
> +
> + uint32_t map_offset = i->virt_addr - address;
> + void *res = mmap((uint8_t *)bo.map + map_offset, 4096, PROT_READ,
> + MAP_SHARED | MAP_FIXED, mem_fd, phys_mem->fd_offset);
> + assert(res != MAP_FAILED);
> + }
>
> - /* We really only have one giant address range */
> - return (struct gen_batch_decode_bo) {
> - .addr = 0,
> - .map = gtt,
> - .size = gtt_size
> - };
> + struct bo_map *m = calloc(1, sizeof(*m));
> + m->bo = bo;
> + list_add(&m->link, &maps);
> +
> + return bo;
> +}
> +
> +static struct gen_batch_decode_bo
> +get_ppgtt_batch_bo(void *user_data, uint64_t address)
> +{
> + struct gen_batch_decode_bo bo = {0};
> + uint64_t pml4 = *(uint64_t *)user_data;
> +
> + address &= ~0xfff;
> +
> + if (!ppgtt_mapped(pml4, address))
> + return bo;
> +
> + /* Map everything until the first gap since we don't know how much the
> + * decoder actually needs.
> + */
> + uint64_t end = address;
> + while (ppgtt_mapped(pml4, end))
> + end += 4096;
> +
> + bo.addr = address;
> + bo.size = end - address;
> + bo.map = mmap(NULL, bo.size, PROT_READ, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
> + assert(bo.map != MAP_FAILED);
> +
> + for (uint64_t page = address; page < end; page += 4096) {
> + struct phys_mem *phys_mem = ppgtt_walk(pml4, page);
> +
> + void *res = mmap((uint8_t *)bo.map + (page - bo.addr), 4096, PROT_READ,
> + MAP_SHARED | MAP_FIXED, mem_fd, phys_mem->fd_offset);
> + assert(res != MAP_FAILED);
> + }
> +
> + struct bo_map *m = calloc(1, sizeof(*m));
> + m->bo = bo;
> + list_add(&m->link, &maps);
> +
> + return bo;
> +}
> +
> +static void
> +clear_bo_maps(void)
> +{
> + list_for_each_entry_safe(struct bo_map, i, &maps, link) {
> + munmap((void *)i->bo.map, i->bo.size);
> + list_del(&i->link);
> + free(i);
> + }
> }
>
> static void
> @@ -179,7 +488,7 @@ aubinator_init(uint16_t aub_pci_id, const char *app_name)
> batch_flags |= GEN_BATCH_DECODE_FLOATS;
>
> gen_batch_decode_ctx_init(&batch_ctx, &devinfo, outfile, batch_flags,
> - xml_path, get_gen_batch_bo, NULL, NULL);
> + xml_path, NULL, NULL, NULL);
> batch_ctx.max_vbo_decoded_lines = max_vbo_lines;
>
> char *color = GREEN_HEADER, *reset_color = NORMAL;
> @@ -245,52 +554,98 @@ handle_memtrace_version(uint32_t *p)
> static void
> handle_memtrace_reg_write(uint32_t *p)
> {
> + static struct execlist_regs {
> + uint32_t render_elsp[4];
> + int render_elsp_index;
> + uint32_t blitter_elsp[4];
> + int blitter_elsp_index;
> + } state = {};
> +
> uint32_t offset = p[1];
> uint32_t value = p[5];
> +
> int engine;
> - static int render_elsp_writes = 0;
> - static int blitter_elsp_writes = 0;
> - static int render_elsq0 = 0;
> - static int blitter_elsq0 = 0;
> - uint8_t *pphwsp;
> -
> - if (offset == 0x2230) {
> - render_elsp_writes++;
> + uint64_t context_descriptor;
> +
> + switch (offset) {
> + case 0x2230: /* render elsp */
> + state.render_elsp[state.render_elsp_index++] = value;
> + if (state.render_elsp_index < 4)
> + return;
> +
> + state.render_elsp_index = 0;
> engine = GEN_ENGINE_RENDER;
> - } else if (offset == 0x22230) {
> - blitter_elsp_writes++;
> + context_descriptor = (uint64_t)state.render_elsp[2] << 32 |
> + state.render_elsp[3];
> + break;
> + case 0x22230: /* blitter elsp */
> + state.blitter_elsp[state.blitter_elsp_index++] = value;
> + if (state.blitter_elsp_index < 4)
> + return;
> +
> + state.blitter_elsp_index = 0;
> engine = GEN_ENGINE_BLITTER;
> - } else if (offset == 0x2510) {
> - render_elsq0 = value;
> - } else if (offset == 0x22510) {
> - blitter_elsq0 = value;
> - } else if (offset == 0x2550 || offset == 0x22550) {
> - /* nothing */;
> - } else {
> + context_descriptor = (uint64_t)state.blitter_elsp[2] << 32 |
> + state.blitter_elsp[3];
> + break;
> + case 0x2510: /* render elsq0 lo */
> + state.render_elsp[3] = value;
> return;
> - }
> -
> - if (render_elsp_writes > 3 || blitter_elsp_writes > 3) {
> - render_elsp_writes = blitter_elsp_writes = 0;
> - pphwsp = (uint8_t*)gtt + (value & 0xfffff000);
> - } else if (offset == 0x2550) {
> + break;
> + case 0x2514: /* render elsq0 hi */
> + state.render_elsp[2] = value;
> + return;
> + break;
> + case 0x22510: /* blitter elsq0 lo */
> + state.blitter_elsp[3] = value;
> + return;
> + break;
> + case 0x22514: /* blitter elsq0 hi */
> + state.blitter_elsp[2] = value;
> + return;
> + break;
> + case 0x2550: /* render elsc */
> engine = GEN_ENGINE_RENDER;
> - pphwsp = (uint8_t*)gtt + (render_elsq0 & 0xfffff000);
> - } else if (offset == 0x22550) {
> + context_descriptor = (uint64_t)state.render_elsp[2] << 32 |
> + state.render_elsp[3];
> + break;
> + case 0x22550: /* blitter elsc */
> engine = GEN_ENGINE_BLITTER;
> - pphwsp = (uint8_t*)gtt + (blitter_elsq0 & 0xfffff000);
> - } else {
> + context_descriptor = (uint64_t)state.blitter_elsp[2] << 32 |
> + state.blitter_elsp[3];
> + break;
> + default:
> return;
> }
>
> const uint32_t pphwsp_size = 4096;
> - uint32_t *context = (uint32_t*)(pphwsp + pphwsp_size);
> + uint32_t pphwsp_addr = context_descriptor & 0xfffff000;
> + struct gen_batch_decode_bo pphwsp_bo = get_ggtt_batch_bo(NULL, pphwsp_addr);
> + uint32_t *context = (uint32_t *)((uint8_t *)pphwsp_bo.map +
> + (pphwsp_bo.addr - pphwsp_addr) +
> + pphwsp_size);
> +
> uint32_t ring_buffer_head = context[5];
> uint32_t ring_buffer_tail = context[7];
> uint32_t ring_buffer_start = context[9];
> - uint32_t *commands = (uint32_t*)((uint8_t*)gtt + ring_buffer_start + ring_buffer_head);
> + uint64_t pml4 = (uint64_t)context[49] << 32 | context[51];
> +
> + struct gen_batch_decode_bo ring_bo = get_ggtt_batch_bo(NULL,
> + ring_buffer_start);
> + assert(ring_bo.size > 0);
> + void *commands = (uint8_t *)ring_bo.map + (ring_bo.addr - ring_buffer_start);
> +
> + if (context_descriptor & 0x100 /* ppgtt */) {
> + batch_ctx.get_bo = get_ppgtt_batch_bo;
> + batch_ctx.user_data = &pml4;
> + } else {
> + batch_ctx.get_bo = get_ggtt_batch_bo;
> + }
> +
> (void)engine; /* TODO */
> - gen_print_batch(&batch_ctx, commands, ring_buffer_tail - ring_buffer_head, 0);
> + gen_print_batch(&batch_ctx, commands, ring_buffer_tail - ring_buffer_head,
> + 0);
> + clear_bo_maps();
> }
>
> static void
> @@ -301,17 +656,18 @@ handle_memtrace_mem_write(uint32_t *p)
> uint32_t size = p[4];
> uint32_t *data = p + 5;
>
> - if (address_space != 1)
> - return;
> -
> - if (gtt_size < address + size) {
> - fprintf(stderr, "overflow gtt space: %s\n", strerror(errno));
> - exit(EXIT_FAILURE);
> + switch (address_space) {
> + case 0: /* GGTT */
> + handle_ggtt_write(address, data, size);
> + break;
> + case 1:
> + case 2: /* Physical */
> + handle_physical_write(address, data, size);
> + break;
> + case 4: /* GGTT Entry */
> + handle_ggtt_entry_write(address, data, size);
> + break;
> }
> -
> - memcpy((char *) gtt + address, data, size);
> - if (gtt_end < address + size)
> - gtt_end = address + size;
> }
>
> struct aub_file {
> @@ -646,6 +1002,10 @@ int main(int argc, char *argv[])
> exit(EXIT_FAILURE);
> }
>
> + mem_fd = memfd_create("phys memory", 0);
> +
> + list_inithead(&maps);
> +
> while (aub_file_more_stuff(file)) {
> switch (aub_file_decode_batch(file)) {
> case AUB_ITEM_DECODE_OK:
> --
> 2.17.1
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
More information about the mesa-dev
mailing list