[PATCH v12 1/5] drm/xe/guc: Prepare GuC register list and update ADS size for error capture
Dong, Zhanjun
zhanjun.dong at intel.com
Mon Jul 15 15:45:22 UTC 2024
See my comments inline below.
Regards,
Zhanjun Dong
On 2024-07-12 4:44 p.m., Michal Wajdeczko wrote:
>
>
> On 12.07.2024 18:47, Zhanjun Dong wrote:
>> Add referenced registers defines and list of registers.
>> Update GuC ADS size allocation to include space for
>> the lists of error state capture register descriptors.
>>
>> Then, populate GuC ADS with the lists of registers we want
>> GuC to report back to host on engine reset events. This list
...
>> diff --git a/drivers/gpu/drm/xe/abi/guc_capture_abi.h b/drivers/gpu/drm/xe/abi/guc_capture_abi.h
>> new file mode 100644
>> index 000000000000..c71630aed49b
>> --- /dev/null
>> +++ b/drivers/gpu/drm/xe/abi/guc_capture_abi.h
>> @@ -0,0 +1,144 @@
>> +/* SPDX-License-Identifier: MIT */
>> +/*
>> + * Copyright © 2024 Intel Corporation
>> + */
>> +
>> +#ifndef _ABI_GUC_CAPTURE_ABI_H
>> +#define _ABI_GUC_CAPTURE_ABI_H
>> +
>> +#include <linux/types.h>
>> +
>> +#include "abi/guc_communication_mmio_abi.h"
>
> hmm, IIUC, "capture" uses ADS data and CTB communication, not MMIO, so
> this looks like a wrong header
to be removed
>
>> +
>> +/* Capture List Index */
>> +enum guc_capture_list_index_type {
>> + GUC_CAPTURE_LIST_INDEX_PF = 0,
>> + GUC_CAPTURE_LIST_INDEX_VF = 1,
>> + GUC_CAPTURE_LIST_INDEX_MAX = 2
>
> IMO, this MAX shouldn't be part of enum, if we want to benefit from
> using enum as index
The MAX is part of the spec:
"
Values:
GUC_CAPTURE_LIST_INDEX_CAPTURE_LIST_INDEX_PF = 0x0
Index of the PF capture list.
GUC_CAPTURE_LIST_INDEX_CAPTURE_LIST_INDEX_VF = 0x1
Index of the VF capture list.
GUC_CAPTURE_LIST_INDEX_CAPTURE_MAX_LIST_INDEX = 0x2
Max capture list index.
"
Spec put the MAX in the middle of the macro name, I put the MAX to the
end of macro name for easy to read
>
>> +};
>> +
>> +/* Register-types of GuC capture register lists */
>> +enum guc_state_capture_type {
>> + GUC_STATE_CAPTURE_TYPE_GLOBAL = 0,
>> + GUC_STATE_CAPTURE_TYPE_ENGINE_CLASS,
>> + GUC_STATE_CAPTURE_TYPE_ENGINE_INSTANCE
>> +};
>> +
...
>> +/**
>> + * struct guc_state_capture_header_t - State capture header.
>> + *
>> + * Prior to resetting engines that have hung or faulted, GuC microkernel
>> + * reports the engine error-state (register values that was read) by
>> + * logging them into the shared GuC log buffer using these hierarchy
>> + * of structures.
>> + */
>> +struct guc_state_capture_header_t {
>> + /**
>> + * @owner: VFID
>> + * BR[ 7: 0] MBZ when SRIOV is disabled. When SRIOV is enabled
>> + * VFID is an integer in range [0, 63] where 0 means the state capture
>> + * is corresponding to the PF and an integer N in range [1, 63] means
>> + * the state capture is for VF N.
>> + */
>> + u32 owner;
>> +#define GUC_STATE_CAPTURE_HEADER_VFID GENMASK(7, 0)
>> + /** @info: Engine class/instance and capture type info */
>> + u32 info;
>> +#define GUC_STATE_CAPTURE_HEADER_CAPTURE_TYPE GENMASK(3, 0) /* see guc_state_capture_type */
>> +#define GUC_STATE_CAPTURE_HEADER_ENGINE_CLASS GENMASK(7, 4) /* see guc_capture_list_class_type */
>> +#define GUC_STATE_CAPTURE_HEADER_ENGINE_INSTANCE GENMASK(11, 8)
>> + /** @lrca: logical ring context address. */
>> + u32 lrca; /* if type-instance, LRCA (address) that hung, else set to ~0 */
>
> keep comments in one block:
>
> /**
> * @lrca: logical ring context address.
> * if type-instance, LRCA (address) that hung, else set to ~0
> */
>
>> + /** @guc_id: context_index. */
>> + u32 guc_id; /* if type-instance, context index of hung context, else set to ~0 */
>
> ditto
Sure, will do
>
>> + /** @num_mmio_entries: Number of captured MMIO entries. */
>> + u32 num_mmio_entries;
>> +#define GUC_STATE_CAPTURE_HEADER_NUM_MMIO_ENTRIES GENMASK(9, 0)
>> +} __packed;
>> +
>> +/**
>> + * struct guc_state_capture_t - State capture.
>> + *
>> + * State capture
>> + */
...
>> diff --git a/drivers/gpu/drm/xe/abi/guc_communication_mmio_abi.h b/drivers/gpu/drm/xe/abi/guc_communication_mmio_abi.h
>> index ef538e34f894..219b40063f43 100644
>> --- a/drivers/gpu/drm/xe/abi/guc_communication_mmio_abi.h
>> +++ b/drivers/gpu/drm/xe/abi/guc_communication_mmio_abi.h
>> @@ -6,6 +6,29 @@
>> #ifndef _ABI_GUC_COMMUNICATION_MMIO_ABI_H
>> #define _ABI_GUC_COMMUNICATION_MMIO_ABI_H
>>
>> +#include <linux/types.h>
>> +
>> +/* GuC MMIO reg state struct */
>> +struct guc_mmio_reg {
>> + u32 offset;
>> + u32 value;
>> + u32 flags;
>> + u32 mask;
>> +#define GUC_REGSET_MASKED BIT(0)
>> +#define GUC_REGSET_STEERING_NEEDED BIT(1)
>> +#define GUC_REGSET_MASKED_WITH_VALUE BIT(2)
>> +#define GUC_REGSET_RESTORE_ONLY BIT(3)
>> +#define GUC_REGSET_STEERING_GROUP GENMASK(16, 12)
>> +#define GUC_REGSET_STEERING_INSTANCE GENMASK(23, 20)
>> +} __packed;
>> +
>> +/* GuC register sets */
>> +struct guc_mmio_reg_set {
>> + u32 address;
>> + u16 count;
>> + u16 reserved;
>> +} __packed;
>> +
>
> this ABI header is for definitions related to 'communication over MMIO'
>
> your additions do not fit here!
>
> move to guc_ads_abi.h or guc_capture_abi.h
Sure, will be moved to guc_capture_abi.h
>
>> /**
>> * DOC: GuC MMIO based communication
>> *
>> diff --git a/drivers/gpu/drm/xe/xe_guc.c b/drivers/gpu/drm/xe/xe_guc.c
>> index eb655cee19f7..5e108cedbdc5 100644
>> --- a/drivers/gpu/drm/xe/xe_guc.c
>> +++ b/drivers/gpu/drm/xe/xe_guc.c
>> @@ -22,6 +22,7 @@
>> #include "xe_gt_sriov_vf.h"
>> #include "xe_gt_throttle.h"
>> #include "xe_guc_ads.h"
>> +#include "xe_guc_capture.h"
>> #include "xe_guc_ct.h"
>> #include "xe_guc_db_mgr.h"
>> #include "xe_guc_hwconfig.h"
>> @@ -338,6 +339,10 @@ int xe_guc_init(struct xe_guc *guc)
>> if (ret)
>> goto out;
>>
>> + ret = xe_guc_capture_init(guc);
>> + if (ret)
>> + goto out;
>> +
>> ret = xe_guc_ads_init(&guc->ads);
>> if (ret)
>> goto out;
>> diff --git a/drivers/gpu/drm/xe/xe_guc.h b/drivers/gpu/drm/xe/xe_guc.h
>> index af59c9545753..a823c5b8ab22 100644
>> --- a/drivers/gpu/drm/xe/xe_guc.h
>> +++ b/drivers/gpu/drm/xe/xe_guc.h
>> @@ -59,6 +59,30 @@ static inline u16 xe_engine_class_to_guc_class(enum xe_engine_class class)
>> }
>> }
>>
>> +static inline enum guc_capture_list_class_type xe_guc_class_to_capture_class(uint class)
>
> we don't use 'uint', use 'u16' or 'unsigned int'
Will change to u16
>
>> +{
>> + switch (class) {
>> + case GUC_RENDER_CLASS:
>> + case GUC_COMPUTE_CLASS:
>> + return GUC_CAPTURE_LIST_CLASS_RENDER_COMPUTE;
>> + case GUC_GSC_OTHER_CLASS:
>> + return GUC_CAPTURE_LIST_CLASS_GSC_OTHER;
>> + case GUC_VIDEO_CLASS:
>> + case GUC_VIDEOENHANCE_CLASS:
>> + case GUC_BLITTER_CLASS:
>> + return class;
>> + default:
>> + XE_WARN_ON(class);
>> + return -1;
>
> if you want to return -1 then function should be of type 'int' not 'enum
> guc_capture_list_class_type'
To be changed to return GUC_CAPTURE_LIST_CLASS_MAX(defined in spec.).
I am thinking to add assertion of "capture_class <
GUC_CAPTURE_LIST_CLASS_MAX" after these 2 xxx to capture class helper
was called, while the XE_WARN_ON(class) already exist on hwe engine
class to guc class and guc class to capture class, make the assertion
seems not needed. On the other hand, as the converted capture class only
being used to matching class type, it is safe to handle the value of
GUC_CAPTURE_LIST_CLASS_MAX.
>
>> + }
>> +}
>> +
>> +static inline enum guc_capture_list_class_type
>> +xe_engine_class_to_guc_capture_class(enum xe_engine_class class)
>> +{
>> + return xe_guc_class_to_capture_class(xe_engine_class_to_guc_class(class));
>> +}
>> +
>
> and wouldn't above inlines be better fit in xe_guc_capture.h ?
Sure, will be moved to xe_guc_capture.h
>
>> static inline struct xe_gt *guc_to_gt(struct xe_guc *guc)
>> {
>> return container_of(guc, struct xe_gt, uc.guc);
>> @@ -69,4 +93,9 @@ static inline struct xe_device *guc_to_xe(struct xe_guc *guc)
>> return gt_to_xe(guc_to_gt(guc));
>> }
>>
...
More information about the Intel-xe
mailing list