[PATCH v8 3 3/7] drm/xe/eustall: Implement EU stall sampling APIs for Xe_HPC
Dixit, Ashutosh
ashutosh.dixit at intel.com
Sat Jan 25 03:09:38 UTC 2025
On Wed, 15 Jan 2025 12:02:09 -0800, Harish Chegondi wrote:
>
Review #3 on the same patch. I've changed the email subject to [PATCH v8 3 3/7].
Overall the priority should be on fixing code, rather then remaking the
patches (moving code around from one patch to another etc.), specially if
remaking the patches is significant effort. We can mostly live with the
patches as they are and review as we are doing now (even though it is not
ideal).
/snip/ : I hate scrolling unnecessarily and making people scroll
unnecsserarily, for future reference :)
> +static void
> +free_eu_stall_data_buf(struct xe_eu_stall_data_stream *stream)
> +{
> + if (stream->bo) {
> + xe_bo_unpin_map_no_vm(stream->bo);
> + stream->bo = NULL;
No need to set to NULL, we will free stream after this.
> +static void
> +xe_eu_stall_stream_enable(struct xe_eu_stall_data_stream *stream)
> +{
> + struct xe_gt *gt = stream->gt;
> + unsigned int fw_ref;
> + u32 reg_value;
> +
> + /* Take runtime pm ref and forcewake to disable RC6 */
> + xe_pm_runtime_get(gt_to_xe(gt));
> + fw_ref = xe_force_wake_get(gt_to_fw(gt), XE_FW_RENDER);
> + if (!xe_force_wake_ref_has_domain(fw_ref, XE_FW_RENDER))
> + xe_gt_err(gt, "Failed to get RENDER forcewake\n");
If this fails, this needs to return error. See xe_oa.c.
Also we need to only get RENDER forcewake or XE_FORCEWAKE_ALL?
> +
> + reg_value = gen_eustall_base(stream, true);
> + xe_gt_mcr_multicast_write(gt, XEHPC_EUSTALL_BASE, reg_value);
> +}
> +
> +static void
> +xe_eu_stall_stream_disable(struct xe_eu_stall_data_stream *stream)
> +{
> + struct xe_gt *gt = stream->gt;
> + u16 group, instance;
> + unsigned int xecore;
> + u32 reg_value;
> +
> + /*
> + * Before disabling EU stall sampling, check if any of the
> + * XEHPC_EUSTALL_REPORT registers have the drop bit set. If set,
> + * clear the bit. If the user space application reads all the
> + * stall data, the drop bit would be cleared during the read.
> + * But if there is any unread data and the drop bit is set for
> + * any subslice, the drop bit would continue to be set even
> + * after disabling EU stall sampling and may cause erroneous
> + * stall data in the subsequent stall data sampling run.
> + */
> + for_each_dss_steering(xecore, gt, group, instance) {
> + reg_value = xe_gt_mcr_unicast_read(gt, XEHPC_EUSTALL_REPORT,
> + group, instance);
> + if (reg_value & XEHPC_EUSTALL_REPORT_OVERFLOW_DROP)
> + clear_dropped_eviction_line_bit(gt, group, instance);
> + }
Doesn't it make more sense to do this during enable rather than disable? So
we'll start from a clean place? Anyway, if this works in disable, probably
ok too.
> + reg_value = gen_eustall_base(stream, false);
> + xe_gt_mcr_multicast_write(gt, XEHPC_EUSTALL_BASE, reg_value);
Why can't we just write 0 here? Then we can remove the enable param from
gen_eustall_base.
> +
> + xe_force_wake_put(gt_to_fw(gt), XE_FW_RENDER);
> + xe_pm_runtime_put(gt_to_xe(gt));
> +}
> +
> +static void eu_stall_buf_check_work_fn(struct work_struct *work)
> +{
> + struct xe_eu_stall_data_stream *stream =
> + container_of(work, typeof(*stream), buf_check_work);
> +
> + if (eu_stall_data_buf_check(stream)) {
> + stream->pollin = true;
> + wake_up(&stream->poll_wq);
> + }
> +}
> +
> +static enum
> +hrtimer_restart eu_stall_poll_check_timer_cb(struct hrtimer *hrtimer)
> +{
> + struct xe_eu_stall_data_stream *stream =
> + container_of(hrtimer, typeof(*stream), poll_check_timer);
> +
> + queue_work(stream->buf_check_wq, &stream->buf_check_work);
> + hrtimer_forward_now(hrtimer, ns_to_ktime(stream->poll_period));
> +
> + return HRTIMER_RESTART;
So there is an interesting point here. if we are not going to do any
processing directly in the hrtimer interrupt, and are going to a schedule a
work item, why do we need the hrtimer at all? We can just make
buf_check_work a delayed_work, just call queue_delayed_work() to requeue
the work every 5 msec from the work_fn.
There are plenty of examples of this but e.g look at
amdgpu_amdkfd_restore_userptr_worker().
So I think we can and should remove hrtimer, just make buf_check_work
delayerd work. You already have a dedicated buf_check_wq (so are not using
a system_wq).
> +}
> +
> +static int xe_eu_stall_stream_init(struct xe_eu_stall_data_stream *stream,
> + struct eu_stall_open_properties *props)
> +{
> + u32 write_ptr_reg, write_ptr, read_ptr_reg;
> + u32 vaddr_offset, base_reg_value;
> + struct xe_gt *gt = stream->gt;
> + struct per_xecore_buf *xecore_buf;
> + u16 group, instance, num_xecore;
> + xe_dss_mask_t all_xecore;
> + unsigned int fw_ref;
> + int ret, xecore;
> +
> + init_waitqueue_head(&stream->poll_wq);
> + INIT_WORK(&stream->buf_check_work, eu_stall_buf_check_work_fn);
> + stream->buf_check_wq = alloc_ordered_workqueue("xe_eustall", 0);
> + if (!stream->buf_check_wq)
> + return -ENOMEM;
> + hrtimer_init(&stream->poll_check_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
> + stream->poll_check_timer.function = eu_stall_poll_check_timer_cb;
> + stream->wait_num_reports = props->wait_num_reports;
> + stream->per_xecore_buf_size = per_xecore_buf_size;
> + stream->poll_period = POLL_PERIOD_NS;
> +
> + bitmap_or(all_xecore, gt->fuse_topo.g_dss_mask, gt->fuse_topo.c_dss_mask,
> + XE_MAX_DSS_FUSE_BITS);
> + /*
> + * Enabled subslices can be discontiguous. Find the maximum number of subslices
> + * that are enabled.
> + */
> + num_xecore = xe_gt_topology_mask_last_dss(all_xecore) + 1;
> +
> + ret = alloc_eu_stall_data_buf(stream, num_xecore);
> + if (ret)
> + return ret;
> +
> + stream->xecore_buf = kcalloc(num_xecore, sizeof(*stream->xecore_buf), GFP_KERNEL);
> + if (!stream->xecore_buf)
> + return -ENOMEM;
> +
> + bitmap_zero(stream->data_drop.mask, XE_MAX_DSS_FUSE_BITS);
> +
> + xe_pm_runtime_get(gt_to_xe(gt));
> + fw_ref = xe_force_wake_get(gt_to_fw(gt), XE_FW_RENDER);
> + if (!xe_force_wake_ref_has_domain(fw_ref, XE_FW_RENDER))
If this fails, this needs to return error.
> + xe_gt_err(gt, "Failed to get RENDER forcewake\n");
> +
> + base_reg_value = gen_eustall_base(stream, false);
> + xe_gt_mcr_multicast_write(gt, XEHPC_EUSTALL_BASE, base_reg_value);
Why don't we just write 0 here? The real value will anyway be set during enable()?
> + /* GGTT addresses can never be > 32 bits */
> + xe_gt_mcr_multicast_write(gt, XEHPC_EUSTALL_BASE_UPPER, 0);
> + base_reg_value = _MASKED_FIELD(EUSTALL_MOCS | EUSTALL_SAMPLE_RATE,
> + REG_FIELD_PREP(EUSTALL_MOCS, gt->mocs.uc_index << 1) |
Sorry I don't know this mocs business very well. Can you explain what's
going on here? Why do we program mocs.uc_index, what does it do and why are
we multiplying by 2?
Also s/base_reg_value/reg_value/ or something like that, since we are
reusing the variable to not just set XEHPC_EUSTALL_BASE.
> + REG_FIELD_PREP(EUSTALL_SAMPLE_RATE,
> + props->eu_stall_sampling_rate));
> + xe_gt_mcr_multicast_write(gt, XEHPC_EUSTALL_CTRL, base_reg_value);
> +
> + for_each_dss_steering(xecore, gt, group, instance) {
> + write_ptr_reg = xe_gt_mcr_unicast_read(gt, XEHPC_EUSTALL_REPORT,
> + group, instance);
> + write_ptr = REG_FIELD_GET(XEHPC_EUSTALL_REPORT_WRITE_PTR_MASK, write_ptr_reg);
> + write_ptr <<= 6;
> + write_ptr &= ((stream->per_xecore_buf_size << 1) - 1);
Why '<< 1'? Is it to preserve the rollover bit?
> + read_ptr_reg = write_ptr >> 6;
This line has no effect, it is being overwritten it in the next line right
below :/
> + read_ptr_reg = REG_FIELD_PREP(XEHPC_EUSTALL_REPORT1_READ_PTR_MASK, write_ptr);
Is the last arg here write_ptr, or is it read_ptr_reg set in the previous line?
> + read_ptr_reg &= XEHPC_EUSTALL_REPORT1_READ_PTR_MASK;
> + read_ptr_reg = _MASKED_FIELD(XEHPC_EUSTALL_REPORT1_READ_PTR_MASK, read_ptr_reg);
> + xe_gt_mcr_unicast_write(gt, XEHPC_EUSTALL_REPORT1,
> + read_ptr_reg, group, instance);
Why do we need to do all this, read the write ptr reg and write to read ptr
register? Can't we just set both read and write ptr to the start of the per
xecore buf address?
> + xecore_buf = &stream->xecore_buf[xecore];
> + vaddr_offset = xecore * stream->per_xecore_buf_size;
> + xecore_buf->vaddr = stream->bo->vmap.vaddr + vaddr_offset;
Something like this vaddr (with shifts) should be written to read/write ptr
register imo, to initialize read/write ptrs to the start of the per xecore
buffer?
> + xecore_buf->write = write_ptr;
> + xecore_buf->read = write_ptr;
> + mutex_init(&xecore_buf->lock);
We also need to carefully see which of this stuff needs to be in init() and
what needs to move to enable(). Because according to uapi, a stream can be
disabled/enabled multiple times and that should work.
E.g. should read/write pointers get reset during disable/enable? To me it
looks like most of the stuff here (except buffer allocation etc.) should
really be in enable() instead of init() (xe_oa.c follows a similar logic).
> + }
> + xe_force_wake_put(gt_to_fw(gt), fw_ref);
> + xe_pm_runtime_put(gt_to_xe(gt));
> + return 0;
> +}
> +
> +static void
> +xe_eu_stall_enable_locked(struct xe_eu_stall_data_stream *stream)
> +{
> + if (stream->enabled)
> + return;
> +
> + stream->enabled = true;
> +
> + xe_eu_stall_stream_enable(stream);
> + hrtimer_start(&stream->poll_check_timer,
> + ns_to_ktime(stream->poll_period),
> + HRTIMER_MODE_REL);
> +}
> +
> +static void
> +xe_eu_stall_disable_locked(struct xe_eu_stall_data_stream *stream)
> +{
> + if (!stream->enabled)
> + return;
> +
> + stream->enabled = false;
> +
> + hrtimer_cancel(&stream->poll_check_timer);
> + flush_workqueue(stream->buf_check_wq);
> + xe_eu_stall_stream_disable(stream);
> +}
> +
> +static long
> +xe_eu_stall_stream_ioctl_locked(struct xe_eu_stall_data_stream *stream,
> + unsigned int cmd, unsigned long arg)
> +{
> + switch (cmd) {
> + case DRM_XE_OBSERVATION_IOCTL_ENABLE:
> + xe_eu_stall_enable_locked(stream);
> + return 0;
> + case DRM_XE_OBSERVATION_IOCTL_DISABLE:
> + xe_eu_stall_disable_locked(stream);
> + return 0;
nit but imo it might be better to return 0 from these enable/disable
functions, but anyway this is also ok.
enable() anyway needs to return error from xe_force_wake_get.
Thanks.
--
Ashutosh
More information about the Intel-xe
mailing list