[PATCH i-g-t] benchmarks/gem_wsim: Support gens without relocations
Tvrtko Ursulin
tvrtko.ursulin at linux.intel.com
Thu Dec 14 14:36:30 UTC 2023
On 14/12/2023 14:13, Bernatowicz, Marcin wrote:
> Hi,
>
> On 12/13/2023 2:47 PM, Tvrtko Ursulin wrote:
>>
>> On 12/12/2023 11:34, Marcin Bernatowicz wrote:
>>> This commit adopts the approach present in lib/igt_dummyload
>>> to support generations that do not use relocations.
>>> The intel_allocator is leveraged to compute offsets on these
>>> generations.
>>> As part of this change, the 'struct vm' is now shared by both i915 and
>>> Xe. It includes a 'vm_id' that the intel_allocator uses for address
>>> assignment.
>>> This modification enhances gem_wsim compatibility with newer platforms
>>> running i915.
>>>
>>> Signed-off-by: Marcin Bernatowicz <marcin.bernatowicz at linux.intel.com>
>>> ---
>>> benchmarks/gem_wsim.c | 106 +++++++++++++++++++++++++++++++-----------
>>> 1 file changed, 80 insertions(+), 26 deletions(-)
>>>
>>> diff --git a/benchmarks/gem_wsim.c b/benchmarks/gem_wsim.c
>>> index 30673da8f..7455cd6a1 100644
>>> --- a/benchmarks/gem_wsim.c
>>> +++ b/benchmarks/gem_wsim.c
>>> @@ -205,7 +205,7 @@ struct w_step {
>>> uint32_t bb_handle;
>>> };
>>> -struct xe_vm {
>>> +struct vm {
>>> uint32_t id;
>>> bool compute_mode;
>>> uint64_t ahnd;
>>> @@ -226,9 +226,9 @@ struct ctx {
>>> struct bond *bonds;
>>> bool load_balance;
>>> uint64_t sseu;
>>> + /* reference to vm */
>>> + struct vm *vm;
>>> struct {
>>> - /* reference to vm */
>>> - struct xe_vm *vm;
>>> /* exec queues */
>>> unsigned int nr_queues;
>>> struct xe_exec_queue *queue_list;
>>> @@ -256,10 +256,8 @@ struct workload {
>>> unsigned int nr_ctxs;
>>> struct ctx *ctx_list;
>>> - struct {
>>> - unsigned int nr_vms;
>>> - struct xe_vm *vm_list;
>>> - } xe;
>>> + unsigned int nr_vms;
>>> + struct vm *vm_list;
>>> struct working_set **working_sets; /* array indexed by set id */
>>> int max_working_set_id;
>>> @@ -1504,7 +1502,16 @@ static unsigned int create_bb(struct w_step *w, int self)
>>> gem_set_domain(fd, w->bb_handle,
>>> I915_GEM_DOMAIN_WC, I915_GEM_DOMAIN_WC);
>>> - cs = ptr = gem_mmap__wc(fd, w->bb_handle, 0, w->bb_size, PROT_WRITE);
>>> + if (__gem_set_caching(fd, w->bb_handle,
>>> + I915_CACHING_CACHED) == 0) {
>>> + cs = ptr = gem_mmap__cpu(fd, w->bb_handle,
>>> + 0, w->bb_size,
>>> + PROT_READ | PROT_WRITE);
>>> + } else
>>> + cs = ptr = gem_mmap__device_coherent(fd,
>>> + w->bb_handle,
>>> + 0, w->bb_size,
>>> + PROT_READ | PROT_WRITE);
>>
>> How is this hunk related? I did not get it. Should it be a separate patch?
>
> It appears that 'gem_mmap__wc' does not work on ATS-M (it might be
> necessary to use I915_MMAP_OFFSET_FIXED for discrete?). Without
> extensive investigation, I've adopted the mapping approach from
> lib/igt_dummyload. This change could be split out into a separate
> patch.
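
For reference, a minimal sketch of the MMAP_OFFSET path that discrete
parts require, since the legacy mmap ioctl behind gem_mmap__wc() is
rejected there. The helper name and error handling are illustrative
assumptions, not the gem_wsim code:

#include <stdint.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <drm/i915_drm.h>

/* Illustrative only: map a GEM object through MMAP_OFFSET in FIXED
 * mode, letting the kernel pick a coherent caching mode for the
 * platform. Returns NULL on failure. */
static void *map_bo_mmap_offset(int fd, uint32_t handle, uint64_t size)
{
	struct drm_i915_gem_mmap_offset arg = {
		.handle = handle,
		.flags = I915_MMAP_OFFSET_FIXED,
	};
	void *ptr;

	if (ioctl(fd, DRM_IOCTL_I915_GEM_MMAP_OFFSET, &arg))
		return NULL;

	ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED,
		   fd, arg.offset);
	return ptr == MAP_FAILED ? NULL : ptr;
}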
Yeah I would say split it since mmap and relocations are two separate
topics. The rest then looks good to me.
Regards,
Tvrtko
>
>>
>>> /* Store initial 64b timestamp: start */
>>> *cs++ = MI_LOAD_REGISTER_IMM(1) | MI_CS_MMIO_DST;
>>> @@ -1539,9 +1546,11 @@ static unsigned int create_bb(struct w_step *w, int self)
>>> /* Save delta for indirect read by COND_BBE */
>>> *cs++ = MI_STORE_REGISTER_MEM_CMD | (1 + use_64b) | MI_CS_MMIO_DST;
>>> *cs++ = CS_GPR(NOW_TS);
>>> + w->i915.reloc[r].presumed_offset = w->i915.obj[self].offset;
>>> w->i915.reloc[r].target_handle = self;
>>> w->i915.reloc[r].offset = offset_in_page(cs);
>>> - *cs++ = w->i915.reloc[r].delta = 4000;
>>> + w->i915.reloc[r].delta = 4000;
>>> + *cs++ = w->i915.reloc[r].presumed_offset + w->i915.reloc[r].delta;
>>> *cs++ = 0;
>>> r++;
>>> @@ -1556,17 +1565,21 @@ static unsigned int create_bb(struct w_step *w, int self)
>>> *cs++ = MI_COND_BATCH_BUFFER_END | MI_DO_COMPARE | (1 + use_64b);
>>> w->i915.bb_duration = cs;
>>> *cs++ = 0;
>>> + w->i915.reloc[r].presumed_offset = w->i915.obj[self].offset;
>>> w->i915.reloc[r].target_handle = self;
>>> w->i915.reloc[r].offset = offset_in_page(cs);
>>> - *cs++ = w->i915.reloc[r].delta = 4000;
>>> + w->i915.reloc[r].delta = 4000;
>>> + *cs++ = w->i915.reloc[r].presumed_offset + w->i915.reloc[r].delta;
>>> *cs++ = 0;
>>> r++;
>>> /* Otherwise back to recalculating delta */
>>> *cs++ = MI_BATCH_BUFFER_START | 1 << 8 | use_64b;
>>> + w->i915.reloc[r].presumed_offset = w->i915.obj[self].offset;
>>> w->i915.reloc[r].target_handle = self;
>>> w->i915.reloc[r].offset = offset_in_page(cs);
>>> - *cs++ = w->i915.reloc[r].delta = offset_in_page(jmp);
>>> + w->i915.reloc[r].delta = offset_in_page(jmp);
>>> + *cs++ = w->i915.reloc[r].presumed_offset + w->i915.reloc[r].delta;
>>> *cs++ = 0;
>>> r++;
>>> @@ -1648,10 +1661,10 @@ xe_get_eq(struct workload *wrk, const struct w_step *w)
>>> return eq;
>>> }
>>> -static struct xe_vm *
>>> -xe_get_vm(struct workload *wrk, const struct w_step *w)
>>> +static struct vm *
>>> +get_vm(struct workload *wrk, const struct w_step *w)
>>> {
>>> - return wrk->xe.vm_list;
>>> + return wrk->vm_list;
>>
>> Why does this take 'w' as an argument but not use it?
>>
>
> At present, we operate with a single vm. However, if additional
> vms are introduced, the 'w' parameter will be used to select the
> appropriate one (return get_ctx(wrk, w)->vm).
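
A minimal sketch of what that future lookup could look like, reusing
the get_ctx(wrk, w) helper mentioned above (hypothetical until
multi-vm support lands):

static struct vm *
get_vm(struct workload *wrk, const struct w_step *w)
{
	/* multi-vm variant: resolve the vm through the step's context */
	return get_ctx(wrk, w)->vm;
}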
>
>>> }
>>> static uint32_t alloc_bo(int i915, unsigned long *size)
>>> @@ -1673,6 +1686,16 @@ alloc_step_batch(struct workload *wrk, struct w_step *w)
>>> struct dep_entry *dep;
>>> unsigned int j = 0;
>>> unsigned int nr_obj = 2 + w->data_deps.nr;
>>> + unsigned int objflags = 0;
>>> + uint64_t addr;
>>> + struct vm *vm = get_vm(wrk, w);
>>> +
>>> + addr = gem_aperture_size(fd) / 2;
>>> + if (addr >> 32)
>>> + objflags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
>>> +
>>> + if (vm->ahnd)
>>> + objflags |= EXEC_OBJECT_PINNED;
>>> w->i915.obj = calloc(nr_obj, sizeof(*w->i915.obj));
>>> igt_assert(w->i915.obj);
>>> @@ -1680,11 +1703,18 @@ alloc_step_batch(struct workload *wrk, struct w_step *w)
>>> w->bb_size = PAGE_SIZE;
>>> w->i915.obj[j].handle = alloc_bo(fd, &w->bb_size);
>>> w->i915.obj[j].flags = EXEC_OBJECT_WRITE;
>>> + if (vm->ahnd) {
>>> + addr = get_offset(vm->ahnd, w->i915.obj[j].handle, w->bb_size, 0);
>>> + w->i915.obj[j].offset = CANONICAL(addr);
>>> + w->i915.obj[j].flags |= objflags;
>>> + }
>>> +
>>> j++;
>>> igt_assert(j < nr_obj);
>>> for_each_dep(dep, w->data_deps) {
>>> uint32_t dep_handle;
>>> + uint64_t dep_size;
>>
>> w->bb_size is unsigned long, the two should probably match.
>
> I chose dep_size's type to match the uint64_t size parameter of
> get_offset(). The end result is the same either way, but it might be
> more coherent to use unsigned long, matching the type of the values
> dep_size is assigned from, and let it widen when passed to
> get_offset().
>
>>
>>> if (dep->working_set == -1) {
>>> int dep_idx = w->idx + dep->target;
>>> @@ -1694,6 +1724,7 @@ alloc_step_batch(struct workload *wrk, struct w_step *w)
>>> igt_assert(wrk->steps[dep_idx].type == BATCH);
>>> dep_handle = wrk->steps[dep_idx].i915.obj[0].handle;
>>> + dep_size = w->bb_size;
>>> } else {
>>> struct working_set *set;
>>> @@ -1707,18 +1738,33 @@ alloc_step_batch(struct workload *wrk, struct w_step *w)
>>> igt_assert(set->sizes[dep->target].size);
>>> dep_handle = set->handles[dep->target];
>>> + dep_size = set->sizes[dep->target].size;
>>> }
>>> w->i915.obj[j].flags = dep->write ? EXEC_OBJECT_WRITE : 0;
>>> w->i915.obj[j].handle = dep_handle;
>>> + if (vm->ahnd) {
>>> + addr = get_offset(vm->ahnd, w->i915.obj[j].handle, dep_size, 0);
>>> + w->i915.obj[j].offset = CANONICAL(addr);
>>> + w->i915.obj[j].flags |= objflags;
>>> + }
>>> j++;
>>> igt_assert(j < nr_obj);
>>> }
>>> w->bb_handle = w->i915.obj[j].handle = alloc_bo(fd, &w->bb_size);
>>> + if (vm->ahnd) {
>>> + addr = get_offset(vm->ahnd, w->i915.obj[j].handle, w->bb_size, 0);
>>> + w->i915.obj[j].offset = CANONICAL(addr);
>>> + w->i915.obj[j].flags |= objflags;
>>> + }
>>> w->i915.obj[j].relocation_count = create_bb(w, j);
>>> - igt_assert(w->i915.obj[j].relocation_count <= ARRAY_SIZE(w->i915.reloc));
>>> - w->i915.obj[j].relocs_ptr = to_user_pointer(&w->i915.reloc);
>>> + if (vm->ahnd) {
>>> + w->i915.obj[j].relocation_count = 0;
>>> + } else {
>>> + igt_assert(w->i915.obj[j].relocation_count <= ARRAY_SIZE(w->i915.reloc));
>>> + w->i915.obj[j].relocs_ptr = to_user_pointer(&w->i915.reloc);
>>> + }
>>> w->i915.eb.buffers_ptr = to_user_pointer(w->i915.obj);
>>> w->i915.eb.buffer_count = j + 1;
>>> @@ -1738,7 +1784,7 @@ alloc_step_batch(struct workload *wrk, struct w_step *w)
>>> static void
>>> xe_alloc_step_batch(struct workload *wrk, struct w_step *w)
>>> {
>>> - struct xe_vm *vm = xe_get_vm(wrk, w);
>>> + struct vm *vm = get_vm(wrk, w);
>>> struct xe_exec_queue *eq = xe_get_eq(wrk, w);
>>> struct dep_entry *dep;
>>> int i;
>>> @@ -2032,7 +2078,7 @@ static void measure_active_set(struct workload *wrk)
>>> #define alloca0(sz) ({ size_t sz__ = (sz); memset(alloca(sz__), 0, sz__); })
>>> -static void xe_vm_create_(struct xe_vm *vm)
>>> +static void xe_vm_create_(struct vm *vm)
>>> {
>>> uint32_t flags = 0;
>>> @@ -2046,7 +2092,7 @@ static void xe_vm_create_(struct xe_vm *vm)
>>> static void xe_exec_queue_create_(struct ctx *ctx, struct xe_exec_queue *eq)
>>> {
>>> struct drm_xe_exec_queue_create create = {
>>> - .vm_id = ctx->xe.vm->id,
>>> + .vm_id = ctx->vm->id,
>>> .width = 1,
>>> .num_placements = eq->nr_hwes,
>>> .instances = to_user_pointer(eq->hwe_list),
>>> @@ -2159,6 +2205,12 @@ static int prepare_contexts(unsigned int id, struct workload *wrk)
>>> gem_context_get_param(fd, &param);
>>> igt_assert(param.value);
>>> share_vm = param.value;
>>> + wrk->nr_vms = 1;
>>> + wrk->vm_list = calloc(wrk->nr_vms, sizeof(struct vm));
>>> + igt_assert(wrk->vm_list);
>>> + wrk->vm_list->id = share_vm;
>>> + wrk->vm_list->ahnd = get_reloc_ahnd(fd, share_vm);
>>> + ctx2->vm = wrk->vm_list;
>>> break;
>>> }
>>> @@ -2174,6 +2226,7 @@ static int prepare_contexts(unsigned int id, struct workload *wrk)
>>> ctx_id = args.ctx_id;
>>> ctx->id = ctx_id;
>>> ctx->sseu = device_sseu.slice_mask;
>>> + ctx->vm = wrk->vm_list;
>>> __configure_context(ctx_id, wrk->prio);
>>> @@ -2267,16 +2320,17 @@ static int xe_prepare_contexts(unsigned int id, struct workload *wrk)
>>> unsigned int i;
>>> /* shortcut, create one vm */
>>> - wrk->xe.nr_vms = 1;
>>> - wrk->xe.vm_list = calloc(wrk->xe.nr_vms, sizeof(struct xe_vm));
>>> - wrk->xe.vm_list->compute_mode = false;
>>> - xe_vm_create_(wrk->xe.vm_list);
>>> - wrk->xe.vm_list->ahnd = intel_allocator_open(fd, wrk->xe.vm_list->id,
>>> + wrk->nr_vms = 1;
>>> + wrk->vm_list = calloc(wrk->nr_vms, sizeof(struct vm));
>>> + igt_assert(wrk->vm_list);
>>> + wrk->vm_list->compute_mode = false;
>>> + xe_vm_create_(wrk->vm_list);
>>> + wrk->vm_list->ahnd = intel_allocator_open(fd, wrk->vm_list->id,
>>> INTEL_ALLOCATOR_RELOC)
>>
>> Can some of this go to common code, now that struct vm is common?
>> Maybe not... Remind me, is the plan for the xe backend to support more
>> than one vm? Is that why get_vm takes w_step as an argument?
>
> Yes, that's correct. I should tackle it when adding support for more vms.
> For the time being, the primary requirement for this patch is a vm id to
> align with intel_allocator's needs.
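
For context, the no-relocation pattern the patch applies per object,
as a condensed sketch built from the IGT helpers the diff already uses
(intel_allocator_open(), get_offset(), CANONICAL()). The helper below
is hypothetical, not the literal gem_wsim code:

#include <stdint.h>
#include <drm/i915_drm.h>
#include "intel_allocator.h"	/* intel_allocator_open(), get_offset() */

/* As in gem_wsim.c: sign-extend bit 47 to form a canonical GPU address. */
#define CANONICAL(x) ((uint64_t)(((int64_t)(x) << 16) >> 16))

/* Condensed, hypothetical sketch: pin an execbuf object at an
 * allocator-assigned address so no relocation entries are needed on
 * gens without relocation support. */
static void pin_obj(int fd, uint32_t vm_id,
		    struct drm_i915_gem_exec_object2 *obj, uint64_t size)
{
	/* one allocator handle per vm; RELOC strategy hands out offsets */
	uint64_t ahnd = intel_allocator_open(fd, vm_id, INTEL_ALLOCATOR_RELOC);
	uint64_t addr = get_offset(ahnd, obj->handle, size, 0);

	obj->offset = CANONICAL(addr);	  /* canonical form of the address */
	obj->flags |= EXEC_OBJECT_PINNED; /* offset is binding, not a hint */
	if (addr >> 32)
		obj->flags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
	obj->relocation_count = 0;	  /* kernel skips relocation processing */
}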
> --
> marcin
>
>>
>> Regards,
>>
>> Tvrtko
>>
>>> __for_each_ctx(ctx, wrk, ctx_idx) {
>>> /* link with vm */
>>> - ctx->xe.vm = wrk->xe.vm_list;
>>> + ctx->vm = wrk->vm_list;
>>> for_each_w_step(w, wrk) {
>>> if (w->context != ctx_idx)
>>> continue;
>>> @@ -2846,7 +2900,7 @@ static void *run_workload(void *data)
>>> syncobj_destroy(fd, w->xe.syncs[0].handle);
>>> free(w->xe.syncs);
>>> - xe_vm_unbind_sync(fd, xe_get_vm(wrk, w)->id, 0, w->xe.exec.address,
>>> + xe_vm_unbind_sync(fd, get_vm(wrk, w)->id, 0, w->xe.exec.address,
>>> w->bb_size);
>>> gem_munmap(w->xe.data, w->bb_size);
>>> gem_close(fd, w->bb_handle);